diff --git a/.gitignore b/.gitignore
index d11a504bdc56ee98b3d5a0c33f9f75d996e45567..be75938ec401b1d72fa54773c85191aaac7d7f35 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,7 +6,7 @@ node_modules
 /bazel-*
 /bazel_pip
 /tools/python_bin_path.sh
-/tools/git/gen
+/tensorflow/tools/git/gen
 /pip_test
 /_python_build
 *.pyc
@@ -26,4 +26,11 @@ Podfile.lock
 /tensorflow/contrib/lite/gen/**
 /tensorflow/contrib/lite/examples/ios/simple/data/*.txt
 /tensorflow/contrib/lite/examples/ios/simple/data/*.tflite
-xcuserdata/**
\ No newline at end of file
+xcuserdata/**
+
+# Android
+.gradle
+.idea
+*.iml
+local.properties
+gradleBuild
diff --git a/BUILD b/BUILD
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..4bf647e47aa56cff0b3fd5af7d5df99d8b70549b 100644
--- a/BUILD
+++ b/BUILD
@@ -0,0 +1,6 @@
+exports_files(
+    [
+        "LICENSE",
+        "ACKNOWLEDGEMENTS",
+    ],
+)
diff --git a/CODEOWNERS b/CODEOWNERS
index 57a4df40e651f45dc03493af631d73332e46c182..007a304c3e706ce968576ec8979c08f1a3bcc552 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,53 +1,53 @@
 # NOTE: Disabled temporarily because it's too noisy on pushes.
 # Where component owners are known, add them here.
 
-#tensorflow/core/platform/windows/* @mrry
-#tensorflow/java/* @asimshankar
-#tensorflow/tensorboard/* @jart @dandelionmane
-#tensorflow/tools/docs/* @markdaoust
+# /tensorflow/core/platform/windows/ @mrry
+# /tensorflow/java/ @asimshankar
+# /tensorflow/tensorboard/ @jart @dandelionmane
+# /tensorflow/tools/docs/ @markdaoust
 
 # contrib
 
-# NEED OWNER: tensorflow/contrib/avro/*
-#tensorflow/contrib/batching/* @alextp @chrisolston
-#tensorflow/contrib/bayesflow/* @ebrevdo @rsepassi @jvdillon
-#tensorflow/contrib/boosted_trees/* @sshrdp @yk5 @nataliaponomareva
-#tensorflow/contrib/cmake/* @mrry @benoitsteiner
-#tensorflow/contrib/copy_graph/* @tucker @poxvoculi
-#tensorflow/contrib/crf/* @kentonl
-#tensorflow/contrib/data/* @mrry
-#tensorflow/contrib/distributions/* @jvdillon @langmore @rsepassi
-#tensorflow/contrib/factorization/* @agarwal-ashish @xavigonzalvo
-#tensorflow/contrib/ffmpeg/* @fredbertsch
-# NEED OWNER: tensorflow/contrib/framework/*
-#tensorflow/contrib/graph_editor/* @purpledog
-# NEED OWNER: tensorflow/contrib/grid_rnn/*
-#tensorflow/contrib/hvx/* @satok16
-#tensorflow/contrib/integrate/* @shoyer
-#tensorflow/contrib/kernel_methods/* @petrosmol
-#tensorflow/contrib/ios_examples/* @petewarden
-#tensorflow/contrib/labeled_tensor/* @shoyer
-#tensorflow/contrib/layers/* @fchollet @martinwicke
-#tensorflow/contrib/learn/* @martinwicke @ispirmustafa @alextp
-#tensorflow/contrib/linalg/* @langmore
-#tensorflow/contrib/linear_optimizer/* @petrosmol @andreasst @katsiapis
-#tensorflow/contrib/lookup/* @ysuematsu @andreasst
-#tensorflow/contrib/losses/* @alextp @ispirmustafa
-#tensorflow/contrib/makefile/* @petewarden @satok16 @wolffg
-#tensorflow/contrib/metrics/* @alextp @honkentuber @ispirmustafa
-#tensorflow/contrib/nccl/* @cwhipkey @zheng-xq
-#tensorflow/contrib/opt/* @strategist333
-#tensorflow/contrib/pi_examples/* @maciekcc
-#tensorflow/contrib/quantization/* @petewarden @cwhipkey @keveman
-#tensorflow/contrib/rnn/* @ebrevdo
-#tensorflow/contrib/saved_model/* @nfiedel @sukritiramesh
-#tensorflow/contrib/seq2seq/* @lukaszkaiser
-#tensorflow/contrib/session_bundle/* @nfiedel @sukritiramesh
-#tensorflow/contrib/slim/* @sguada @thenbasilmanran
-#tensorflow/contrib/stateless/* @girving
-#tensorflow/contrib/tensor_forest/* @gilberthendry @thomascolthurst
-#tensorflow/contrib/testing/* @dandelionmane
-#tensorflow/contrib/timeseries/* @allenlavoie
-#tensorflow/contrib/tpu/* @frankchn @saeta @jhseu
-#tensorflow/contrib/training/* @joel-shor @ebrevdo
-#tensorflow/contrib/util/* @sherrym
+# NEED OWNER: /tensorflow/contrib/avro/
+# /tensorflow/contrib/batching/ @alextp @chrisolston
+# /tensorflow/contrib/bayesflow/ @ebrevdo @rsepassi @jvdillon
+# /tensorflow/contrib/boosted_trees/ @sshrdp @yk5 @nataliaponomareva
+# /tensorflow/contrib/cmake/ @mrry @benoitsteiner
+# /tensorflow/contrib/copy_graph/ @tucker @poxvoculi
+# /tensorflow/contrib/crf/ @kentonl
+# /tensorflow/contrib/data/ @mrry
+# /tensorflow/contrib/distributions/ @jvdillon @langmore @rsepassi
+# /tensorflow/contrib/factorization/ @agarwal-ashish @xavigonzalvo
+# /tensorflow/contrib/ffmpeg/ @fredbertsch
+# NEED OWNER: /tensorflow/contrib/framework/
+# /tensorflow/contrib/graph_editor/ @purpledog
+# NEED OWNER: /tensorflow/contrib/grid_rnn/
+# /tensorflow/contrib/hvx/ @satok16
+# /tensorflow/contrib/integrate/ @shoyer
+# /tensorflow/contrib/kernel_methods/ @petrosmol
+# /tensorflow/contrib/ios_examples/ @petewarden
+# /tensorflow/contrib/labeled_tensor/ @shoyer
+# /tensorflow/contrib/layers/ @fchollet @martinwicke
+# /tensorflow/contrib/learn/ @martinwicke @ispirmustafa @alextp
+# /tensorflow/contrib/linalg/ @langmore
+# /tensorflow/contrib/linear_optimizer/ @petrosmol @andreasst @katsiapis
+# /tensorflow/contrib/lookup/ @ysuematsu @andreasst
+# /tensorflow/contrib/losses/ @alextp @ispirmustafa
+# /tensorflow/contrib/makefile/ @petewarden @satok16 @wolffg
+# /tensorflow/contrib/metrics/ @alextp @honkentuber @ispirmustafa
+# /tensorflow/contrib/nccl/ @cwhipkey @zheng-xq
+# /tensorflow/contrib/opt/ @strategist333
+# /tensorflow/contrib/pi_examples/ @maciekcc
+# /tensorflow/contrib/quantization/ @petewarden @cwhipkey @keveman
+# /tensorflow/contrib/rnn/ @ebrevdo
+# /tensorflow/contrib/saved_model/ @nfiedel @sukritiramesh
+# /tensorflow/contrib/seq2seq/ @lukaszkaiser
+# /tensorflow/contrib/session_bundle/ @nfiedel @sukritiramesh
+# /tensorflow/contrib/slim/ @sguada @thenbasilmanran
+# /tensorflow/contrib/stateless/ @girving
+# /tensorflow/contrib/tensor_forest/ @gilberthendry @thomascolthurst
+# /tensorflow/contrib/testing/ @dandelionmane
+# /tensorflow/contrib/timeseries/ @allenlavoie
+# /tensorflow/contrib/tpu/ @frankchn @saeta @jhseu
+# /tensorflow/contrib/training/ @joel-shor @ebrevdo
+# /tensorflow/contrib/util/ @sherrym
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
index ff11d131409b65880f16b80f9fe38dc39ac0e5fa..5fff9d05a1c589636bc9c711e6eb7cc4aba86b2f 100644
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -67,4 +67,4 @@ If the Project Stewards receive a report alleging a violation of the Code of Con
 
 ## Attribution
 
-This Code of Conduct is adapted from the Contributor Covenant, version 1.4, available at http://contributor-covenant.org/version/1/4, and includes some aspects of the Geek Feminism Code of Conduct and the Drupal Code of Conduct.
+This Code of Conduct is adapted from the Contributor Covenant, version 1.4, available at https://contributor-covenant.org/version/1/4, and includes some aspects of the Geek Feminism Code of Conduct and the Drupal Code of Conduct.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 1b537ca73cc94e992e7537fe69c8d0cc8fd13102..3dad41a88c8212b7445c32f241d887306d3c19ad 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -8,8 +8,8 @@ We'd love to accept your patches! Before we can take them, we have to jump a cou
 
 Please fill out either the individual or corporate Contributor License Agreement (CLA).
 
-  * If you are an individual writing original source code and you're sure you own the intellectual property, then you'll need to sign an [individual CLA](http://code.google.com/legal/individual-cla-v1.0.html).
-  * If you work for a company that wants to allow you to contribute your work, then you'll need to sign a [corporate CLA](http://code.google.com/legal/corporate-cla-v1.0.html).
+  * If you are an individual writing original source code and you're sure you own the intellectual property, then you'll need to sign an [individual CLA](https://code.google.com/legal/individual-cla-v1.0.html).
+  * If you work for a company that wants to allow you to contribute your work, then you'll need to sign a [corporate CLA](https://code.google.com/legal/corporate-cla-v1.0.html).
 
 Follow either of the two links above to access the appropriate CLA and instructions for how to sign and return it. Once we receive it, we'll be able to accept your pull requests.
 
@@ -20,6 +20,9 @@ Follow either of the two links above to access the appropriate CLA and instructi
 If you have improvements to TensorFlow, send us your pull requests! For those
 just getting started, Github has a [howto](https://help.github.com/articles/using-pull-requests/).
 
+TensorFlow team members will be assigned to review your pull requests. Once the pull requests are approved and pass continuous integration checks, we will merge the pull requests.
+For some pull requests, we will apply the patch for each pull request to our internal version control system first, and export the change out as a new commit later, at which point the original pull request will be closed. The commits in the pull request will be squashed into a single commit with the pull request creator as the author. These pull requests will be labeled as pending merge internally.
+
 If you want to contribute but you're not sure where to start, take a look at the
 [issues with the "contributions welcome" label](https://github.com/tensorflow/tensorflow/labels/stat%3Acontributions%20welcome).
 These are issues that we believe are particularly well suited for outside
@@ -38,7 +41,7 @@ TensorFlow coding style.
 #### General guidelines and philosophy for contribution
 
 * Include unit tests when you contribute new features, as they help to
-  a) prove that your code works correctly, b) guard against future breaking
+  a) prove that your code works correctly, and b) guard against future breaking
   changes to lower the maintenance cost.
 * Bug fixes also generally require unit tests, because the presence of bugs
   usually indicates insufficient test coverage.
@@ -48,7 +51,7 @@ TensorFlow coding style.
   non-backward-compatible API changes without a major release. Reviewers of your
   pull request will comment on any API compatibility issues.
 * When you contribute a new feature to TensorFlow, the maintenance burden is (by
-  default) transferred to the TensorFlow team. This means that benefit of
+  default) transferred to the TensorFlow team. This means that benefit of the
   contribution must be compared against the cost of maintaining the feature.
 * Full new features (e.g., a new op implementing a cutting-edge algorithm)
   typically will live in
@@ -65,8 +68,8 @@ Include a license at the top of new files.
 * [Java license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/java/src/main/java/org/tensorflow/Graph.java#L1)
 * [Go license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/go/operation.go#L1)
 * [Bash license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/ci_build/ci_sanity.sh#L2)
-* [HTML license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/dist/index.html#L2)
-* [JavaScript/TypeScript license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/components/tf_backend/backend.ts#L1)
+* [HTML license example](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/components/tf_backend/tf-backend.html#L2)
+* [JavaScript/TypeScript license example](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/components/tf_backend/backend.ts#L1)
 
 Bazel BUILD files also need to include a license section, e.g.,
 [BUILD example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/BUILD#L61).
@@ -114,7 +117,7 @@ pylint --rcfile=/tmp/pylintrc myfile.py
 * [Google Java Style Guide](https://google.github.io/styleguide/javaguide.html)
 * [Google JavaScript Style Guide](https://google.github.io/styleguide/jsguide.html)
 * [Google Shell Style Guide](https://google.github.io/styleguide/shell.xml)
-* [Google Objective-C Style Guide](http://google.github.io/styleguide/objcguide.html)
+* [Google Objective-C Style Guide](https://google.github.io/styleguide/objcguide.html)
 
 #### Running sanity check
 
@@ -160,7 +163,7 @@ There are two ways to run TensorFlow unit tests.
    bazel test ${flags} //tensorflow/python/...
    ```
 
-2. Using [Docker](www.docker.com) and TensorFlow's CI scripts.
+2. Using [Docker](https://www.docker.com) and TensorFlow's CI scripts.
 
    ```bash
    # Install Docker first, then this will build and run cpu tests
diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md
index 1a401997c649518766acb2ebb0dea1c128bd0ba4..2f3df7cda9cec29ed0c2266629022f0a22b37df9 100644
--- a/ISSUE_TEMPLATE.md
+++ b/ISSUE_TEMPLATE.md
@@ -4,7 +4,7 @@ https://stackoverflow.com/questions/tagged/tensorflow
 
 If you open a GitHub issue, here is our policy:
 
-1. It must be a bug or a feature request.
+1. It must be a bug, a feature request, or a significant problem with documentation (for small docs fixes please send a PR instead).
 2. The form below must be filled out.
 3. It shouldn't be a TensorBoard issue. Those go [here](https://github.com/tensorflow/tensorboard/issues).
 
diff --git a/LICENSE b/LICENSE
index 15ae42140452d32ccf929f59f7eca01a3c7b555f..4862420c0234f7542d4fe8f3520516b484a64aed 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,4 +1,4 @@
-Copyright 2017 The TensorFlow Authors.  All rights reserved.
+Copyright 2018 The TensorFlow Authors.  All rights reserved.
 
                                  Apache License
                            Version 2.0, January 2004
diff --git a/README.md b/README.md
index aff3427bddb307aea6d6c2466eac14c9edffcc32..916e5200b29841028652c861c49dbb3650baea3c 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 
 | **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** |
 |-----------------|---------------------|------------------|-------------------|---------------|
-| [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) |
+| [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) |
 
 **TensorFlow** is an open source software library for numerical computation using
 data flow graphs.  The graph nodes represent mathematical operations, while
@@ -27,10 +27,14 @@ guidelines](CONTRIBUTING.md). This project adheres to TensorFlow's
 uphold this code.**
 
 **We use [GitHub issues](https://github.com/tensorflow/tensorflow/issues) for
-tracking requests and bugs. So please see 
+tracking requests and bugs. So please see
 [TensorFlow Discuss](https://groups.google.com/a/tensorflow.org/forum/#!forum/discuss) for general questions
 and discussion, and please direct specific questions to [Stack Overflow](https://stackoverflow.com/questions/tagged/tensorflow).**
 
+The TensorFlow project strives to abide by generally accepted best practices in open-source software development:
+
+[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/1486/badge)](https://bestpractices.coreinfrastructure.org/projects/1486)
+
 ## Installation
 *See [Installing TensorFlow](https://www.tensorflow.org/get_started/os_setup.html) for instructions on how to install our release binaries or how to build from source.*
 
@@ -46,11 +50,11 @@ packages on Linux, Mac, and Windows.
 
 
 **Individual whl files**
-* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/))
-* Linux GPU: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/42/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
+* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/)) / [Python 3.6](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp36-cp36m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=cpu-slave/))
+* Linux GPU: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/42/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/)) / [Python 3.6](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp36-cp36m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/))
 * Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
-* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp36-cp36m-win_amd64.whl) ([build history](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/))
-* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly_gpu-1.head-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly_gpu-1.head-cp36-cp36m-win_amd64.whl) ([build history](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=36/))
+* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/))
+* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly_gpu-1.head-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly_gpu-1.head-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=36/))
 * Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/)
 ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/))
 
diff --git a/RELEASE.md b/RELEASE.md
index e04bd3fc505d51ade9e9fa12c822cb695e90b4f3..0720a8c639f8ab87214b11f6a8092b432b916853 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,306 @@
+# Release 1.6.0
+
+## Breaking Changes
+* Prebuilt binaries are now built against CUDA 9.0 and cuDNN 7.
+* Prebuilt binaries will use AVX instructions. This may break TF on older CPUs.
+
+## Major Features And Improvements
+* New Optimizer internal API for non-slot variables. Descendants of AdamOptimizer that access _beta[12]_power will need to be updated.
+* `tf.estimator.{FinalExporter,LatestExporter}` now export stripped SavedModels. This improves forward compatibility of the SavedModel.
+* FFT support added to XLA CPU/GPU.
+
+## Bug Fixes and Other Changes
+* Documentation updates:
+  * Added a second version of Getting Started, which is aimed at ML
+newcomers.
+  * Clarified documentation on `resize_images.align_corners` parameter.
+  * Additional documentation for TPUs.
+* Google Cloud Storage (GCS):
+  * Add client-side throttle.
+  * Add a `FlushCaches()` method to the FileSystem interface, with an implementation for GcsFileSystem.
+* Other:
+  * Add `tf.contrib.distributions.Kumaraswamy`.
+  * `RetryingFileSystem::FlushCaches()` calls the base FileSystem's `FlushCaches()`.
+  * Add auto_correlation to distributions.
+  * Add `tf.contrib.distributions.Autoregressive`.
+  * Add SeparableConv1D layer.
+  * Add convolutional Flipout layers.
+  * When both inputs of `tf.matmul` are bfloat16, it returns bfloat16, instead of float32.
+  * Added `tf.contrib.image.connected_components`.
+  * Add `tf.contrib.framework.CriticalSection` that allows atomic variable access.
+  * Output variance over trees predictions for classifications tasks.
+  * For `pt` and `eval` commands, allow writing tensor values to filesystem as numpy files.
+  * gRPC: Propagate truncated errors (instead of returning gRPC internal error).
+  * Augment parallel_interleave to support 2 kinds of prefetching.
+  * Improved XLA support for C64-related ops log, pow, atan2, tanh.
+  * Add probabilistic convolutional layers.
+
+## API Changes
+* Introducing prepare_variance boolean with default setting to False for backward compatibility.
+* Move `layers_dense_variational_impl.py` to `layers_dense_variational.py`.
+
+## Known Bugs
+* Using XLA:GPU with CUDA 9 and CUDA 9.1 results in garbage results and/or
+  `CUDA_ILLEGAL_ADDRESS` failures.
+
+  Google discovered in mid-December 2017 that the PTX-to-SASS compiler in CUDA 9
+  and CUDA 9.1 sometimes does not properly compute the carry bit when
+  decomposing 64-bit address calculations with large offsets (e.g. `load [x +
+  large_constant]`) into 32-bit arithmetic in SASS.
+
+  As a result, these versions of `ptxas` miscompile most XLA programs which use
+  more than 4GB of temp memory.  This results in garbage results and/or
+  `CUDA_ERROR_ILLEGAL_ADDRESS` failures.
+
+  A fix in CUDA 9.1.121 is expected in late February 2018.  We do not expect a
+  fix for CUDA 9.0.x.  Until the fix is available, the only workaround is to
+  [downgrade](https://developer.nvidia.com/cuda-toolkit-archive) to CUDA 8.0.x
+  or disable XLA:GPU.
+
+  TensorFlow will print a warning if you use XLA:GPU with a known-bad version of
+  CUDA; see e00ba24c4038e7644da417ddc639169b6ea59122.
+
+## Thanks to our Contributors
+
+This release contains contributions from many people at Google, as well as:
+
+4d55397500, Ag Ramesh, Aiden Scandella, Akimasa Kimura, Alex Rothberg, Allen Goodman,
+amilioto, Andrei Costinescu, Andrei Nigmatulin, Anjum Sayed, Anthony Platanios,
+Anush Elangovan, Armando Fandango, Ashish Kumar Ram, Ashwini Shukla, Ben, Bhavani Subramanian,
+Brett Koonce, Carl Thomé, cclauss, Cesc, Changming Sun, Christoph Boeddeker, Clayne Robison,
+Clemens Schulz, Clint (Woonhyuk Baek), codrut3, Cole Gerdemann, Colin Raffel, Daniel Trebbien,
+Daniel Ylitalo, Daniel Zhang, Daniyar, Darjan Salaj, Dave Maclachlan, David Norman, Dong--Jian,
+dongsamb, dssgsra, Edward H, eladweiss, elilienstein, Eric Lilienstein, error.d, Eunji Jeong, fanlu,
+Florian Courtial, fo40225, Fred, Gregg Helt, Guozhong Zhuang, Hanchen Li, hsm207, hyunyoung2,
+ImSheridan, Ishant Mrinal Haloi, Jacky Ko, Jay Young, Jean Flaherty, Jerome, JerrikEph, Jesse
+Kinkead, jfaath, Jian Lin, jinghuangintel, Jiongyan Zhang, Joel Hestness, Joel Shor, Johnny Chan,
+Julian Niedermeier, Julian Wolff, JxKing, K-W-W, Karl Lessard, Kasper Marstal, Keiji Ariyama,
+Koan-Sin Tan, Loki Der Quaeler, Loo Rong Jie, Luke Schaefer, Lynn Jackson, ManHyuk, Matt Basta,
+Matt Smith, Matthew Schulkind, Michael, michaelkhan3, Miguel Piedrafita, Mikalai Drabovich,
+Mike Knapp, mjwen, mktozk, Mohamed Aly, Mohammad Ashraf Bhuiyan, Myungjoo Ham, Naman Bhalla,
+Namrata-Ibm, Nathan Luehr, nathansilberman, Netzeband, Niranjan Hasabnis, Omar Aflak, Ozge
+Yalcinkaya, Parth P Panchal, patrickzzy, Patryk Chrabaszcz, Paul Van Eck, Paweł Kapica, Peng Yu,
+Philip Yang, Pierre Blondeau, Po-Hsien Chu, powderluv, Puyu Wang, Rajendra Arora, Rasmus, Renat
+Idrisov, resec, Robin Richtsfeld, Ronald Eddy Jr, Sahil Singh, Sam Matzek, Sami Kama, sandipmgiri,
+Santiago Castro, Sayed Hadi Hashemi, Scott Tseng, Sergii Khomenko, Shahid, Shengpeng Liu, Shreyash
+Sharma, Shrinidhi Kl, Simone Cirillo, simsicon, Stanislav Levental, starsblinking, Stephen Lumenta,
+Steven Hickson, Su Tang, Taehoon Lee, Takuya Wakisaka, Ted Chang, Ted Ying, Tijmen Verhulsdonck,
+Timofey Kondrashov, vade, vaibhav, Valentin Khrulkov, vchigrin, Victor Costan, Viraj Navkal,
+Vivek Rane, wagonhelm, Yan Facai (颜发才), Yanbo Liang, Yaroslav Bulatov, yegord, Yong Tang,
+Yoni Tsafir, yordun, Yuan (Terry) Tang, Yuxin Wu, zhengdi, Zhengsheng Wei, 田传武
+
+# Release 1.5.0
+
+## Breaking Changes
+* Prebuilt binaries are now built against CUDA 9.0 and cuDNN 7.
+* Starting from 1.6 release, our prebuilt binaries will use AVX instructions.
+  This may break TF on older CPUs.
+
+## Known Bugs
+* Using XLA:GPU with CUDA 9 and CUDA 9.1 results in garbage results and/or
+  `CUDA_ILLEGAL_ADDRESS` failures.
+
+  Google discovered in mid-December 2017 that the PTX-to-SASS compiler in CUDA 9
+  and CUDA 9.1 sometimes does not properly compute the carry bit when
+  decomposing 64-bit address calculations with large offsets (e.g. `load [x +
+  large_constant]`) into 32-bit arithmetic in SASS.
+
+  As a result, these versions of `ptxas` miscompile most XLA programs which use
+  more than 4GB of temp memory.  This results in garbage results and/or
+  `CUDA_ERROR_ILLEGAL_ADDRESS` failures.
+
+  A fix in CUDA 9.1.121 is expected in late February 2018.  We do not expect a
+  fix for CUDA 9.0.x.  Until the fix is available, the only workaround is to
+  [downgrade](https://developer.nvidia.com/cuda-toolkit-archive) to CUDA 8.0.x
+  or disable XLA:GPU.
+
+  TensorFlow will print a warning if you use XLA:GPU with a known-bad version of
+  CUDA; see e00ba24c4038e7644da417ddc639169b6ea59122.
+
+## Major Features And Improvements
+* [Eager execution](https://github.com/tensorflow/tensorflow/tree/r1.5/tensorflow/contrib/eager)
+  preview version is now available.
+* [TensorFlow Lite](https://github.com/tensorflow/tensorflow/tree/r1.5/tensorflow/contrib/lite)
+  dev preview is now available.
+* CUDA 9.0 and cuDNN 7 support.
+* Accelerated Linear Algebra (XLA):
+  * Add `complex64` support to XLA compiler.
+  * `bfloat` support is now added to XLA infrastructure.
+  * Make `ClusterSpec` propagation work with XLA devices.
+  * Use a determinisitic executor to generate XLA graph.
+* `tf.contrib`:
+  * `tf.contrib.distributions`:
+    * Add `tf.contrib.distributions.Autoregressive`.
+    * Make `tf.contrib.distributions` QuadratureCompound classes support batch
+    * Infer `tf.contrib.distributions.RelaxedOneHotCategorical` `dtype` from arguments.
+    * Make `tf.contrib.distributions` quadrature family parameterized by
+      `quadrature_grid_and_prob` vs `quadrature_degree`.
+    * `auto_correlation` added to `tf.contrib.distributions`
+  * Add `tf.contrib.bayesflow.layers`, a collection of probabilistic (neural) layers.
+  * Add `tf.contrib.bayesflow.halton_sequence`.
+  * Add `tf.contrib.data.make_saveable_from_iterator.`
+  * Add `tf.contrib.data.shuffle_and_repeat`.
+  * Add new custom transformation: `tf.contrib.data.scan()`.
+  * `tf.contrib.distributions.bijectors`:
+    * Add `tf.contrib.distributions.bijectors.MaskedAutoregressiveFlow`.
+    * Add `tf.contrib.distributions.bijectors.Permute`.
+    * Add `tf.contrib.distributions.bijectors.Gumbel`.
+    * Add `tf.contrib.distributions.bijectors.Reshape`.
+    * Support shape inference (i.e., shapes containing -1) in the Reshape bijector.
+* Add `streaming_precision_recall_at_equal_thresholds,` a method for computing
+  streaming precision and recall with `O(num_thresholds + size of predictions)`
+  time and space complexity.
+* Change `RunConfig` default behavior to not set a random seed, making random
+  behavior independently random on distributed workers. We expect this to
+  generally improve training performance. Models that do rely on determinism
+  should set a random seed explicitly.
+* Replaced the implementation of `tf.flags` with `absl.flags`.
+* Add support for `CUBLAS_TENSOR_OP_MATH` in fp16 GEMM
+* Add support for CUDA on NVIDIA Tegra devices
+
+## Bug Fixes and Other Changes
+* Documentation updates:
+  * Clarified that you can only install TensorFlow on 64-bit machines.
+  * Added a short doc explaining how `Estimator`s save checkpoints.
+  * Add documentation for ops supported by the `tf2xla` bridge.
+  * Fix minor typos in the doc of `SpaceToDepth` and `DepthToSpace`.
+  * Updated documentation comments in `mfcc_mel_filterbank.h` and `mfcc.h` to
+    clarify that the input domain is squared magnitude spectra and the weighting
+    is done on linear magnitude spectra (sqrt of inputs).
+  * Change `tf.contrib.distributions` docstring examples to use `tfd` alias
+    rather than `ds`, `bs`.
+  * Fix docstring typos in `tf.distributions.bijectors.Bijector`.
+  * `tf.assert_equal` no longer raises `ValueError.` It now raises
+    `InvalidArgumentError,` as documented.
+  * Update Getting Started docs and API intro.
+* Google Cloud Storage (GCS):
+  * Add userspace DNS caching for the GCS client.
+  * Customize request timeouts for the GCS filesystem.
+  * Improve GCS filesystem caching.
+* Bug Fixes:
+  * Fix bug where partitioned integer variables got their wrong shapes. Before
+  * Fix correctness bug in CPU and GPU implementations of Adadelta.
+  * Fix a bug in `import_meta_graph`'s handling of partitioned variables when
+    importing into a scope. WARNING: This may break loading checkpoints of
+    graphs with partitioned variables saved after using `import_meta_graph` with
+    a non-empty `import_scope` argument.
+  * Fix bug in offline debugger which prevented viewing events.
+  * Added the `WorkerService.DeleteWorkerSession` method to the gRPC interface,
+    to fix a memory leak. Ensure that your master and worker servers are running
+    the same version of TensorFlow to avoid compatibility issues.
+  * Fix bug in peephole implementation of BlockLSTM cell.
+  * Fix bug by casting dtype of `log_det_jacobian` to match `log_prob` in
+    `TransformedDistribution`.
+  * Fix a bug in `import_meta_graph`'s handling of partitioned variables when
+  * Ensure `tf.distributions.Multinomial` doesn't underflow in `log_prob`.
+    Before this change, all partitions of an integer variable were initialized
+    with the shape of the unpartitioned variable; after this change they are
+    initialized correctly.
+* Other:
+  * Add necessary shape util support for bfloat16.
+  * Add a way to run ops using a step function to MonitoredSession.
+  * Add `DenseFlipout` probabilistic layer.
+  * A new flag `ignore_live_threads` is available on train. If set to `True`, it
+    will ignore threads that remain running when tearing down infrastructure
+    after successfully completing training, instead of throwing a RuntimeError.
+  * Restandardize `DenseVariational` as simpler template for other probabilistic
+    layers.
+  * `tf.data` now supports `tf.SparseTensor` components in dataset elements.
+  * It is now possible to iterate over `Tensor`s.
+  * Allow `SparseSegmentReduction` ops to have missing segment IDs.
+  * Modify custom export strategy to account for multidimensional sparse float
+    splits.
+  * `Conv2D`, `Conv2DBackpropInput`, `Conv2DBackpropFilter` now supports arbitrary
+    dilations with GPU and cuDNNv6 support.
+  * `Estimator` now supports `Dataset`: `input_fn` can return a `Dataset`
+    instead of `Tensor`s.
+  * Add `RevBlock`, a memory-efficient implementation of reversible residual layers.
+  * Reduce BFCAllocator internal fragmentation.
+  * Add `cross_entropy` and `kl_divergence` to `tf.distributions.Distribution`.
+  * Add `tf.nn.softmax_cross_entropy_with_logits_v2` which enables backprop
+    w.r.t. the labels.
+  * GPU back-end now uses `ptxas` to compile generated PTX.
+  * `BufferAssignment`'s protocol buffer dump is now deterministic.
+  * Change embedding op to use parallel version of `DynamicStitch`.
+  * Add support for sparse multidimensional feature columns.
+  * Speed up the case for sparse float columns that have only 1 value.
+  * Allow sparse float splits to support multivalent feature columns.
+  * Add `quantile` to `tf.distributions.TransformedDistribution`.
+  * Add `NCHW_VECT_C` support for `tf.depth_to_space` on GPU.
+  * Add `NCHW_VECT_C` support for `tf.space_to_depth` on GPU.
+
+## API Changes
+* Rename `SqueezeDims` attribute to `Axis` in C++ API for Squeeze op.
+* `Stream::BlockHostUntilDone` now returns Status rather than bool.
+* Minor refactor: move stats files from `stochastic` to `common` and remove
+  `stochastic`.
+
+## Known Bugs
+* Using XLA:GPU with CUDA 9 and CUDA 9.1 results in garbage results and/or
+  `CUDA_ILLEGAL_ADDRESS` failures.
+
+  Google discovered in mid-December 2017 that the PTX-to-SASS compiler in CUDA 9
+  and CUDA 9.1 sometimes does not properly compute the carry bit when
+  decomposing 64-bit address calculations with large offsets (e.g. `load [x +
+  large_constant]`) into 32-bit arithmetic in SASS.
+
+  As a result, these versions of `ptxas` miscompile most XLA programs which use
+  more than 4GB of temp memory.  This results in garbage results and/or
+  `CUDA_ERROR_ILLEGAL_ADDRESS` failures.
+
+  A fix in CUDA 9.1.121 is expected in late February 2018.  We do not expect a
+  fix for CUDA 9.0.x.  Until the fix is available, the only workaround is to
+  [downgrade](https://developer.nvidia.com/cuda-toolkit-archive) to CUDA 8.0.x
+  or disable XLA:GPU.
+
+  TensorFlow will print a warning if you use XLA:GPU with a known-bad version of
+  CUDA; see e00ba24c4038e7644da417ddc639169b6ea59122.
+
+## Thanks to our Contributors
+
+This release contains contributions from many people at Google, as well as:
+
+Adam Zahran, Ag Ramesh, Alan Lee, Alan Yee, Alex Sergeev, Alexander, Amir H. Jadidinejad,
+Amy, Anastasios Doumoulakis, Andrei Costinescu, Andrei Nigmatulin, Anthony Platanios,
+Anush Elangovan, arixlin, Armen Donigian, ArtëM Sobolev, Atlas7, Ben Barsdell, Bill Prin,
+Bo Wang, Brett Koonce, Cameron Thomas, Carl Thomé, Cem Eteke, cglewis, Changming Sun,
+Charles Shenton, Chi-Hung, Chris Donahue, Chris Filo Gorgolewski, Chris Hoyean Song,
+Chris Tava, Christian Grail, Christoph Boeddeker, cinqS, Clayne Robison, codrut3, concerttttt,
+CQY, Dan Becker, Dan Jarvis, Daniel Zhang, David Norman, dmaclach, Dmitry Trifonov,
+Donggeon Lim, dongpilYu, Dr. Kashif Rasul, Edd Wilder-James, Eric Lv, fcharras, Felix Abecassis,
+FirefoxMetzger, formath, FredZhang, Gaojin Cao, Gary Deer, Guenther Schmuelling, Hanchen Li,
+Hanmin Qin, hannesa2, hyunyoung2, Ilya Edrenkin, Jackson Kontny, Jan, Javier Luraschi,
+Jay Young, Jayaram Bobba, Jeff, Jeff Carpenter, Jeremy Sharpe, Jeroen BéDorf, Jimmy Jia,
+Jinze Bai, Jiongyan Zhang, Joe Castagneri, Johan Ju, Josh Varty, Julian Niedermeier,
+JxKing, Karl Lessard, Kb Sriram, Keven Wang, Koan-Sin Tan, Kyle Mills, lanhin, LevineHuang,
+Loki Der Quaeler, Loo Rong Jie, Luke Iwanski, LáSzló Csomor, Mahdi Abavisani, Mahmoud Abuzaina,
+ManHyuk, Marek ŠUppa, MathSquared, Mats Linander, Matt Wytock, Matthew Daley, Maximilian Bachl,
+mdymczyk, melvyniandrag, Michael Case, Mike Traynor, miqlas, Namrata-Ibm, Nathan Luehr,
+Nathan Van Doorn, Noa Ezra, Nolan Liu, Oleg Zabluda, opensourcemattress, Ouwen Huang,
+Paul Van Eck, peisong, Peng Yu, PinkySan, pks, powderluv, Qiao Hai-Jun, Qiao Longfei,
+Rajendra Arora, Ralph Tang, resec, Robin Richtsfeld, Rohan Varma, Ryohei Kuroki, SaintNazaire,
+Samuel He, Sandeep Dcunha, sandipmgiri, Sang Han, scott, Scott Mudge, Se-Won Kim, Simon Perkins,
+Simone Cirillo, Steffen Schmitz, Suvojit Manna, Sylvus, Taehoon Lee, Ted Chang, Thomas Deegan,
+Till Hoffmann, Tim, Toni Kunic, Toon Verstraelen, Tristan Rice, Urs KöSter, Utkarsh Upadhyay,
+Vish (Ishaya) Abrams, Winnie Tsang, Yan Chen, Yan Facai (颜发才), Yi Yang, Yong Tang,
+Youssef Hesham, Yuan (Terry) Tang, Zhengsheng Wei, zxcqwe4906, 张志豪, 田传武 
+
+We are also grateful to all who filed issues or helped resolve them, asked and
+answered questions, and were part of inspiring discussions.
+
+# Release 1.4.1
+
+## Bug Fixes and Other Changes
+* `LinearClassifier` fix.
+
+# Release 1.4.0
+
+## Major Features And Improvements
+* `tf.keras` is now part of the core TensorFlow API.
+* [`tf.data`](http://tensorflow.org/programmers_guide/datasets) is now part of
+  the core TensorFlow API.
+  * The API is now subject to backwards compatibility guarantees.
+
 # Release 1.4.0
 
 ## Major Features And Improvements
@@ -351,7 +654,7 @@ answered questions, and were part of inspiring discussions.
 * Fixed LIBXSMM integration.
 * Make decode_jpeg/decode_png/decode_gif handle all formats, since users frequently try to decode an image as the wrong type.
 * Improve implicit broadcasting lowering.
-* Improving stability of GCS/Bigquery clients by a faster retrying of stale transmissions.
+* Improving stability of GCS/BigQuery clients by a faster retrying of stale transmissions.
 * Remove OpKernelConstruction::op_def() as part of minimizing proto dependencies.
 * VectorLaplaceDiag distribution added.
 * Android demo no longer requires libtensorflow_demo.so to run (libtensorflow_inference.so still required)
diff --git a/WORKSPACE b/WORKSPACE
index b40913801ba8e3c8ee73f7ba69540b520ad698a6..1e38a9a8cd754886fc5232531816b875de0879a3 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -2,11 +2,11 @@ workspace(name = "org_tensorflow")
 
 http_archive(
     name = "io_bazel_rules_closure",
-    sha256 = "110fe68753413777944b473c25eed6368c4a0487cee23a7bac1b13cc49d3e257",
-    strip_prefix = "rules_closure-4af89ef1db659eb41f110df189b67d4cf14073e1",
+    sha256 = "6691c58a2cd30a86776dd9bb34898b041e37136f2dc7e24cadaeaf599c95c657",
+    strip_prefix = "rules_closure-08039ba8ca59f64248bb3b6ae016460fe9c9914f",
     urls = [
-        "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/4af89ef1db659eb41f110df189b67d4cf14073e1.tar.gz",
-        "https://github.com/bazelbuild/rules_closure/archive/4af89ef1db659eb41f110df189b67d4cf14073e1.tar.gz",  # 2017-08-28
+        "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz",
+        "https://github.com/bazelbuild/rules_closure/archive/08039ba8ca59f64248bb3b6ae016460fe9c9914f.tar.gz",  # 2018-01-16
     ],
 )
 
@@ -41,12 +41,12 @@ load("//tensorflow:workspace.bzl", "tf_workspace")
 tf_workspace()
 
 new_http_archive(
-    name = "inception5h",
+    name = "inception_v1",
     build_file = "models.BUILD",
-    sha256 = "d13569f6a98159de37e92e9c8ec4dae8f674fbf475f69fe6199b514f756d4364",
+    sha256 = "7efe12a8363f09bc24d7b7a450304a15655a57a7751929b2c1593a71183bb105",
     urls = [
-        "http://storage.googleapis.com/download.tensorflow.org/models/inception5h.zip",
-        "http://download.tensorflow.org/models/inception5h.zip",
+        "http://storage.googleapis.com/download.tensorflow.org/models/inception_v1.zip",
+        "http://download.tensorflow.org/models/inception_v1.zip",
     ],
 )
 
diff --git a/configure.py b/configure.py
index cf562bdee8ef288e4c2938f50e5c6366ce05ccff..3aa1a3e956c6a559b89cdeb593a96a95188c32ae 100644
--- a/configure.py
+++ b/configure.py
@@ -34,16 +34,26 @@ except ImportError:
 
 _TF_BAZELRC = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            '.tf_configure.bazelrc')
-_DEFAULT_CUDA_VERSION = '8.0'
-_DEFAULT_CUDNN_VERSION = '6'
+_TF_WORKSPACE = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                             'WORKSPACE')
+_DEFAULT_CUDA_VERSION = '9.0'
+_DEFAULT_CUDNN_VERSION = '7'
 _DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,5.2'
 _DEFAULT_CUDA_PATH = '/usr/local/cuda'
 _DEFAULT_CUDA_PATH_LINUX = '/opt/cuda'
 _DEFAULT_CUDA_PATH_WIN = ('C:/Program Files/NVIDIA GPU Computing '
                           'Toolkit/CUDA/v%s' % _DEFAULT_CUDA_VERSION)
+_DEFAULT_TENSORRT_PATH_LINUX = '/usr/lib/x86_64-linux-gnu'
 _TF_OPENCL_VERSION = '1.2'
 _DEFAULT_COMPUTECPP_TOOLKIT_PATH = '/usr/local/computecpp'
 _DEFAULT_TRISYCL_INCLUDE_DIR = '/usr/local/triSYCL/include'
+_SUPPORTED_ANDROID_NDK_VERSIONS = [10, 11, 12, 13, 14, 15]
+
+_DEFAULT_PROMPT_ASK_ATTEMPTS = 10
+
+
+class UserInputError(Exception):
+  pass
 
 
 def is_windows():
@@ -158,7 +168,7 @@ def get_python_path(environ_cp, python_bin_path):
   try:
     library_paths = run_shell(
         [python_bin_path, '-c',
-         'import site; print("\\n".join(site.getsitepackages()))']).split("\n")
+         'import site; print("\\n".join(site.getsitepackages()))']).split('\n')
   except subprocess.CalledProcessError:
     library_paths = [run_shell(
         [python_bin_path, '-c',
@@ -256,19 +266,6 @@ def reset_tf_configure_bazelrc():
     f.write('import %workspace%/.tf_configure.bazelrc\n')
 
 
-def run_gen_git_source(environ_cp):
-  """Run the gen_git_source to create links.
-
-  The links are for bazel to track dependencies for git hash propagation.
-
-  Args:
-    environ_cp: copy of the os.environ.
-  """
-  cmd = '"%s" tensorflow/tools/git/gen_git_source.py --configure %s' % (
-      environ_cp.get('PYTHON_BIN_PATH'), os.getcwd())
-  os.system(cmd)
-
-
 def cleanup_makefile():
   """Delete any leftover BUILD files from the Makefile build.
 
@@ -301,11 +298,17 @@ def get_var(environ_cp,
       System".
     enabled_by_default: boolean for default behavior.
     question: optional string for how to ask for user input.
-    yes_reply: optionanl string for reply when feature is enabled.
+    yes_reply: optional string for reply when feature is enabled.
     no_reply: optional string for reply when feature is disabled.
 
   Returns:
     boolean value of the variable.
+
+  Raises:
+    UserInputError: if an environment variable is set, but it cannot be
+      interpreted as a boolean indicator, assume that the user has made a
+      scripting error, and will continue to provide invalid input.
+      Raise the error to avoid infinitely looping.
   """
   if not question:
     question = 'Do you wish to build TensorFlow with %s support?' % query_item
@@ -323,6 +326,23 @@ def get_var(environ_cp,
     question += ' [y/N]: '
 
   var = environ_cp.get(var_name)
+  if var is not None:
+    var_content = var.strip().lower()
+    true_strings = ('1', 't', 'true', 'y', 'yes')
+    false_strings = ('0', 'f', 'false', 'n', 'no')
+    if var_content in true_strings:
+      var = True
+    elif var_content in false_strings:
+      var = False
+    else:
+      raise UserInputError(
+          'Environment variable %s must be set as a boolean indicator.\n'
+          'The following are accepted as TRUE : %s.\n'
+          'The following are accepted as FALSE: %s.\n'
+          'Current value is %s.' % (
+              var_name, ', '.join(true_strings), ', '.join(false_strings),
+              var))
+
   while var is None:
     user_input_origin = get_input(question)
     user_input = user_input_origin.strip().lower()
@@ -391,7 +411,7 @@ def set_action_env_var(environ_cp,
       System".
     enabled_by_default: boolean for default behavior.
     question: optional string for how to ask for user input.
-    yes_reply: optionanl string for reply when feature is enabled.
+    yes_reply: optional string for reply when feature is enabled.
     no_reply: optional string for reply when feature is disabled.
   """
   var = int(
@@ -425,7 +445,7 @@ def convert_version_to_int(version):
 
 
 def check_bazel_version(min_version):
-  """Check installed bezel version is at least min_version.
+  """Check installed bazel version is at least min_version.
 
   Args:
     min_version: string for minimum bazel version.
@@ -509,6 +529,21 @@ def set_tf_cuda_clang(environ_cp):
       no_reply=no_reply)
 
 
+def set_tf_download_clang(environ_cp):
+  """Set TF_DOWNLOAD_CLANG action_env."""
+  question = 'Do you want to download a fresh release of clang? (Experimental)'
+  yes_reply = 'Clang will be downloaded and used to compile tensorflow.'
+  no_reply = 'Clang will not be downloaded.'
+  set_action_env_var(
+      environ_cp,
+      'TF_DOWNLOAD_CLANG',
+      None,
+      False,
+      question=question,
+      yes_reply=yes_reply,
+      no_reply=no_reply)
+
+
 def get_from_env_or_user_or_default(environ_cp, var_name, ask_for_var,
                                     var_default):
   """Get var_name either from env, or user or default.
@@ -557,6 +592,219 @@ def set_clang_cuda_compiler_path(environ_cp):
                               clang_cuda_compiler_path)
 
 
+def prompt_loop_or_load_from_env(
+    environ_cp,
+    var_name,
+    var_default,
+    ask_for_var,
+    check_success,
+    error_msg,
+    suppress_default_error=False,
+    n_ask_attempts=_DEFAULT_PROMPT_ASK_ATTEMPTS
+):
+  """Loop over user prompts for an ENV param until receiving a valid response.
+
+  For the env param var_name, read from the environment or verify user input
+  until receiving valid input. When done, set var_name in the environ_cp to its
+  new value.
+
+  Args:
+    environ_cp: (Dict) copy of the os.environ.
+    var_name: (String) string for name of environment variable, e.g. "TF_MYVAR".
+    var_default: (String) default value string.
+    ask_for_var: (String) string for how to ask for user input.
+    check_success: (Function) function that takes one argument and returns a
+      boolean. Should return True if the value provided is considered valid. May
+      contain a complex error message if error_msg does not provide enough
+      information. In that case, set suppress_default_error to True.
+    error_msg: (String) String with one and only one '%s'. Formatted with each
+      invalid response upon check_success(input) failure.
+    suppress_default_error: (Bool) Suppress the above error message in favor of
+      one from the check_success function.
+    n_ask_attempts: (Integer) Number of times to query for valid input before
+      raising an error and quitting.
+
+  Returns:
+    [String] The value of var_name after querying for input.
+
+  Raises:
+    UserInputError: if a query has been attempted n_ask_attempts times without
+      success, assume that the user has made a scripting error, and will
+      continue to provide invalid input. Raise the error to avoid infinitely
+      looping.
+  """
+  default = environ_cp.get(var_name) or var_default
+  full_query = '%s [Default is %s]: ' % (
+      ask_for_var,
+      default,
+  )
+
+  for _ in range(n_ask_attempts):
+    val = get_from_env_or_user_or_default(environ_cp,
+                                          var_name,
+                                          full_query,
+                                          default)
+    if check_success(val):
+      break
+    if not suppress_default_error:
+      print(error_msg % val)
+    environ_cp[var_name] = ''
+  else:
+    raise UserInputError('Invalid %s setting was provided %d times in a row. '
+                         'Assuming to be a scripting mistake.' %
+                         (var_name, n_ask_attempts))
+
+  environ_cp[var_name] = val
+  return val
+
+
+def create_android_ndk_rule(environ_cp):
+  """Set ANDROID_NDK_HOME and write Android NDK WORKSPACE rule."""
+  if is_windows() or is_cygwin():
+    default_ndk_path = cygpath('%s/Android/Sdk/ndk-bundle' %
+                               environ_cp['APPDATA'])
+  elif is_macos():
+    default_ndk_path = '%s/library/Android/Sdk/ndk-bundle' % environ_cp['HOME']
+  else:
+    default_ndk_path = '%s/Android/Sdk/ndk-bundle' % environ_cp['HOME']
+
+  def valid_ndk_path(path):
+    return (os.path.exists(path) and
+            os.path.exists(os.path.join(path, 'source.properties')))
+
+  android_ndk_home_path = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='ANDROID_NDK_HOME',
+      var_default=default_ndk_path,
+      ask_for_var='Please specify the home path of the Android NDK to use.',
+      check_success=valid_ndk_path,
+      error_msg=('The path %s or its child file "source.properties" '
+                 'does not exist.')
+  )
+
+  write_android_ndk_workspace_rule(android_ndk_home_path)
+
+
+def create_android_sdk_rule(environ_cp):
+  """Set Android variables and write Android SDK WORKSPACE rule."""
+  if is_windows() or is_cygwin():
+    default_sdk_path = cygpath('%s/Android/Sdk' % environ_cp['APPDATA'])
+  elif is_macos():
+    default_sdk_path = '%s/library/Android/Sdk/ndk-bundle' % environ_cp['HOME']
+  else:
+    default_sdk_path = '%s/Android/Sdk' % environ_cp['HOME']
+
+  def valid_sdk_path(path):
+    return (os.path.exists(path) and
+            os.path.exists(os.path.join(path, 'platforms')) and
+            os.path.exists(os.path.join(path, 'build-tools')))
+
+  android_sdk_home_path = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='ANDROID_SDK_HOME',
+      var_default=default_sdk_path,
+      ask_for_var='Please specify the home path of the Android SDK to use.',
+      check_success=valid_sdk_path,
+      error_msg=('Either %s does not exist, or it does not contain the '
+                 'subdirectories "platforms" and "build-tools".'))
+
+  platforms = os.path.join(android_sdk_home_path, 'platforms')
+  api_levels = sorted(os.listdir(platforms))
+  api_levels = [x.replace('android-', '') for x in api_levels]
+
+  def valid_api_level(api_level):
+    return os.path.exists(os.path.join(android_sdk_home_path,
+                                       'platforms',
+                                       'android-' + api_level))
+
+  android_api_level = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='ANDROID_API_LEVEL',
+      var_default=api_levels[-1],
+      ask_for_var=('Please specify the Android SDK API level to use. '
+                   '[Available levels: %s]') % api_levels,
+      check_success=valid_api_level,
+      error_msg='Android-%s is not present in the SDK path.')
+
+  build_tools = os.path.join(android_sdk_home_path, 'build-tools')
+  versions = sorted(os.listdir(build_tools))
+
+  def valid_build_tools(version):
+    return os.path.exists(os.path.join(android_sdk_home_path,
+                                       'build-tools',
+                                       version))
+
+  android_build_tools_version = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='ANDROID_BUILD_TOOLS_VERSION',
+      var_default=versions[-1],
+      ask_for_var=('Please specify an Android build tools version to use. '
+                   '[Available versions: %s]') % versions,
+      check_success=valid_build_tools,
+      error_msg=('The selected SDK does not have build-tools version %s '
+                 'available.'))
+
+  write_android_sdk_workspace_rule(android_sdk_home_path,
+                                   android_build_tools_version,
+                                   android_api_level)
+
+
+def write_android_sdk_workspace_rule(android_sdk_home_path,
+                                     android_build_tools_version,
+                                     android_api_level):
+  print('Writing android_sdk_workspace rule.\n')
+  with open(_TF_WORKSPACE, 'a') as f:
+    f.write("""
+android_sdk_repository(
+  name="androidsdk",
+  api_level=%s,
+  path="%s",
+  build_tools_version="%s")\n
+""" % (android_api_level, android_sdk_home_path, android_build_tools_version))
+
+
+def write_android_ndk_workspace_rule(android_ndk_home_path):
+  print('Writing android_ndk_workspace rule.')
+  ndk_api_level = check_ndk_level(android_ndk_home_path)
+  if int(ndk_api_level) not in _SUPPORTED_ANDROID_NDK_VERSIONS:
+    print('WARNING: The API level of the NDK in %s is %s, which is not '
+          'supported by Bazel (officially supported versions: %s). Please use '
+          'another version. Compiling Android targets may result in confusing '
+          'errors.\n' % (android_ndk_home_path, ndk_api_level,
+                         _SUPPORTED_ANDROID_NDK_VERSIONS))
+  with open(_TF_WORKSPACE, 'a') as f:
+    f.write("""
+android_ndk_repository(
+  name="androidndk",
+  path="%s",
+  api_level=%s)\n
+""" % (android_ndk_home_path, ndk_api_level))
+
+
+def check_ndk_level(android_ndk_home_path):
+  """Check the revision number of an Android NDK path."""
+  properties_path = '%s/source.properties' % android_ndk_home_path
+  if is_windows() or is_cygwin():
+    properties_path = cygpath(properties_path)
+  with open(properties_path, 'r') as f:
+    filedata = f.read()
+
+  revision = re.search(r'Pkg.Revision = (\d+)', filedata)
+  if revision:
+    return revision.group(1)
+  return None
+
+
+def workspace_has_any_android_rule():
+  """Check the WORKSPACE for existing android_*_repository rules."""
+  with open(_TF_WORKSPACE, 'r') as f:
+    workspace = f.read()
+  has_any_rule = re.search(r'^android_[ns]dk_repository',
+                           workspace,
+                           re.MULTILINE)
+  return has_any_rule
+
+
 def set_gcc_host_compiler_path(environ_cp):
   """Set GCC_HOST_COMPILER_PATH."""
   default_gcc_host_compiler_path = which('gcc') or ''
@@ -566,24 +814,39 @@ def set_gcc_host_compiler_path(environ_cp):
     # os.readlink is only available in linux
     default_gcc_host_compiler_path = os.path.realpath(cuda_bin_symlink)
 
-  ask_gcc_path = (
-      'Please specify which gcc should be used by nvcc as the '
-      'host compiler. [Default is %s]: ') % default_gcc_host_compiler_path
-  while True:
-    gcc_host_compiler_path = get_from_env_or_user_or_default(
-        environ_cp, 'GCC_HOST_COMPILER_PATH', ask_gcc_path,
-        default_gcc_host_compiler_path)
+  gcc_host_compiler_path = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='GCC_HOST_COMPILER_PATH',
+      var_default=default_gcc_host_compiler_path,
+      ask_for_var=
+      'Please specify which gcc should be used by nvcc as the host compiler.',
+      check_success=os.path.exists,
+      error_msg='Invalid gcc path. %s cannot be found.',
+  )
 
-    if os.path.exists(gcc_host_compiler_path):
-      break
+  write_action_env_to_bazelrc('GCC_HOST_COMPILER_PATH', gcc_host_compiler_path)
 
-    # Reset and retry
-    print('Invalid gcc path. %s cannot be found' % gcc_host_compiler_path)
-    environ_cp['GCC_HOST_COMPILER_PATH'] = ''
 
-  # Set GCC_HOST_COMPILER_PATH
-  environ_cp['GCC_HOST_COMPILER_PATH'] = gcc_host_compiler_path
-  write_action_env_to_bazelrc('GCC_HOST_COMPILER_PATH', gcc_host_compiler_path)
+def reformat_version_sequence(version_str, sequence_count):
+  """Reformat the version string to have the given number of sequences.
+
+  For example:
+  Given (7, 2) -> 7.0
+        (7.0.1, 2) -> 7.0
+        (5, 1) -> 5
+        (5.0.3.2, 1) -> 5
+
+  Args:
+      version_str: String, the version string.
+      sequence_count: int, an integer.
+  Returns:
+      string, reformatted version string.
+  """
+  v = version_str.split('.')
+  if len(v) < sequence_count:
+    v = v + (['0'] * (sequence_count - len(v)))
+
+  return '.'.join(v[:sequence_count])
 
 
 def set_tf_cuda_version(environ_cp):
@@ -592,10 +855,11 @@ def set_tf_cuda_version(environ_cp):
       'Please specify the CUDA SDK version you want to use, '
       'e.g. 7.0. [Leave empty to default to CUDA %s]: ') % _DEFAULT_CUDA_VERSION
 
-  while True:
+  for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS):
     # Configure the Cuda SDK version to use.
     tf_cuda_version = get_from_env_or_user_or_default(
         environ_cp, 'TF_CUDA_VERSION', ask_cuda_version, _DEFAULT_CUDA_VERSION)
+    tf_cuda_version = reformat_version_sequence(str(tf_cuda_version), 2)
 
     # Find out where the CUDA toolkit is installed
     default_cuda_path = _DEFAULT_CUDA_PATH
@@ -630,6 +894,11 @@ def set_tf_cuda_version(environ_cp):
     environ_cp['TF_CUDA_VERSION'] = ''
     environ_cp['CUDA_TOOLKIT_PATH'] = ''
 
+  else:
+    raise UserInputError('Invalid TF_CUDA_SETTING setting was provided %d '
+                         'times in a row. Assuming to be a scripting mistake.' %
+                         _DEFAULT_PROMPT_ASK_ATTEMPTS)
+
   # Set CUDA_TOOLKIT_PATH and TF_CUDA_VERSION
   environ_cp['CUDA_TOOLKIT_PATH'] = cuda_toolkit_path
   write_action_env_to_bazelrc('CUDA_TOOLKIT_PATH', cuda_toolkit_path)
@@ -643,10 +912,11 @@ def set_tf_cudnn_version(environ_cp):
       'Please specify the cuDNN version you want to use. '
       '[Leave empty to default to cuDNN %s.0]: ') % _DEFAULT_CUDNN_VERSION
 
-  while True:
+  for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS):
     tf_cudnn_version = get_from_env_or_user_or_default(
         environ_cp, 'TF_CUDNN_VERSION', ask_cudnn_version,
         _DEFAULT_CUDNN_VERSION)
+    tf_cudnn_version = reformat_version_sequence(str(tf_cudnn_version) ,1)
 
     default_cudnn_path = environ_cp.get('CUDA_TOOLKIT_PATH')
     ask_cudnn_path = (r'Please specify the location where cuDNN %s library is '
@@ -702,6 +972,10 @@ def set_tf_cudnn_version(environ_cp):
       print('%s.%s' % (cudnn_path_from_ldconfig, tf_cudnn_version))
 
     environ_cp['TF_CUDNN_VERSION'] = ''
+  else:
+    raise UserInputError('Invalid TF_CUDNN setting was provided %d '
+                         'times in a row. Assuming to be a scripting mistake.' %
+                         _DEFAULT_PROMPT_ASK_ATTEMPTS)
 
   # Set CUDNN_INSTALL_PATH and TF_CUDNN_VERSION
   environ_cp['CUDNN_INSTALL_PATH'] = cudnn_install_path
@@ -710,6 +984,128 @@ def set_tf_cudnn_version(environ_cp):
   write_action_env_to_bazelrc('TF_CUDNN_VERSION', tf_cudnn_version)
 
 
+def set_tf_tensorrt_install_path(environ_cp):
+  """Set TENSORRT_INSTALL_PATH and TF_TENSORRT_VERSION.
+
+  Adapted from code contributed by Sami Kama (https://github.com/samikama).
+
+  Args:
+    environ_cp: copy of the os.environ.
+
+  Raises:
+    ValueError: if this method was called under non-Linux platform.
+    UserInputError: if user has provided invalid input multiple times.
+  """
+  if not is_linux():
+    raise ValueError('Currently TensorRT is only supported on Linux platform.')
+
+  # Ask user whether to add TensorRT support.
+  if str(int(get_var(
+      environ_cp, 'TF_NEED_TENSORRT', 'TensorRT', False))) != '1':
+    return
+
+  for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS):
+    ask_tensorrt_path = (r'Please specify the location where TensorRT is '
+                         'installed. [Default is %s]:') % (
+                             _DEFAULT_TENSORRT_PATH_LINUX)
+    trt_install_path = get_from_env_or_user_or_default(
+        environ_cp, 'TENSORRT_INSTALL_PATH', ask_tensorrt_path,
+        _DEFAULT_TENSORRT_PATH_LINUX)
+
+    # Result returned from "read" will be used unexpanded. That make "~"
+    # unusable. Going through one more level of expansion to handle that.
+    trt_install_path = os.path.realpath(
+        os.path.expanduser(trt_install_path))
+
+    def find_libs(search_path):
+      """Search for libnvinfer.so in "search_path"."""
+      fl = set()
+      if os.path.exists(search_path) and os.path.isdir(search_path):
+        fl.update([os.path.realpath(os.path.join(search_path, x))
+                   for x in os.listdir(search_path) if 'libnvinfer.so' in x])
+      return fl
+
+    possible_files = find_libs(trt_install_path)
+    possible_files.update(find_libs(os.path.join(trt_install_path, 'lib')))
+    possible_files.update(find_libs(os.path.join(trt_install_path, 'lib64')))
+
+    def is_compatible(tensorrt_lib, cuda_ver, cudnn_ver):
+      """Check the compatibility between tensorrt and cudnn/cudart libraries."""
+      ldd_bin = which('ldd') or '/usr/bin/ldd'
+      ldd_out = run_shell([ldd_bin, tensorrt_lib]).split(os.linesep)
+      cudnn_pattern = re.compile('.*libcudnn.so\\.?(.*) =>.*$')
+      cuda_pattern = re.compile('.*libcudart.so\\.?(.*) =>.*$')
+      cudnn = None
+      cudart = None
+      for line in ldd_out:
+        if 'libcudnn.so' in line:
+          cudnn = cudnn_pattern.search(line)
+        elif 'libcudart.so' in line:
+          cudart = cuda_pattern.search(line)
+      if cudnn and len(cudnn.group(1)):
+        cudnn = convert_version_to_int(cudnn.group(1))
+      if cudart and len(cudart.group(1)):
+        cudart = convert_version_to_int(cudart.group(1))
+      return (cudnn == cudnn_ver) and (cudart == cuda_ver)
+
+    cuda_ver = convert_version_to_int(environ_cp['TF_CUDA_VERSION'])
+    cudnn_ver = convert_version_to_int(environ_cp['TF_CUDNN_VERSION'])
+    nvinfer_pattern = re.compile('.*libnvinfer.so.?(.*)$')
+    highest_ver = [0, None, None]
+
+    for lib_file in possible_files:
+      if is_compatible(lib_file, cuda_ver, cudnn_ver):
+        ver_str = nvinfer_pattern.search(lib_file).group(1)
+        ver = convert_version_to_int(ver_str) if len(ver_str) else 0
+        if ver > highest_ver[0]:
+          highest_ver = [ver, ver_str, lib_file]
+    if highest_ver[1] is not None:
+      trt_install_path = os.path.dirname(highest_ver[2])
+      tf_tensorrt_version = highest_ver[1]
+      break
+
+    # Try another alternative from ldconfig.
+    ldconfig_bin = which('ldconfig') or '/sbin/ldconfig'
+    ldconfig_output = run_shell([ldconfig_bin, '-p'])
+    search_result = re.search(
+        '.*libnvinfer.so\\.?([0-9.]*).* => (.*)', ldconfig_output)
+    if search_result:
+      libnvinfer_path_from_ldconfig = search_result.group(2)
+      if os.path.exists(libnvinfer_path_from_ldconfig):
+        if is_compatible(libnvinfer_path_from_ldconfig, cuda_ver, cudnn_ver):
+          trt_install_path = os.path.dirname(libnvinfer_path_from_ldconfig)
+          tf_tensorrt_version = search_result.group(1)
+          break
+
+    # Reset and Retry
+    if len(possible_files):
+      print('TensorRT libraries found in one the following directories',
+            'are not compatible with selected cuda and cudnn installations')
+      print(trt_install_path)
+      print(os.path.join(trt_install_path, 'lib'))
+      print(os.path.join(trt_install_path, 'lib64'))
+      if search_result:
+        print(libnvinfer_path_from_ldconfig)
+    else:
+      print('Invalid path to TensorRT. None of the following files can be found:')
+      print(trt_install_path)
+      print(os.path.join(trt_install_path, 'lib'))
+      print(os.path.join(trt_install_path, 'lib64'))
+      if search_result:
+        print(libnvinfer_path_from_ldconfig)
+
+  else:
+    raise UserInputError('Invalid TF_TENSORRT setting was provided %d '
+                         'times in a row. Assuming to be a scripting mistake.' %
+                         _DEFAULT_PROMPT_ASK_ATTEMPTS)
+
+  # Set TENSORRT_INSTALL_PATH and TF_TENSORRT_VERSION
+  environ_cp['TENSORRT_INSTALL_PATH'] = trt_install_path
+  write_action_env_to_bazelrc('TENSORRT_INSTALL_PATH', trt_install_path)
+  environ_cp['TF_TENSORRT_VERSION'] = tf_tensorrt_version
+  write_action_env_to_bazelrc('TF_TENSORRT_VERSION', tf_tensorrt_version)
+
+
 def get_native_cuda_compute_capabilities(environ_cp):
   """Get native cuda compute capabilities.
 
@@ -810,90 +1206,83 @@ def set_other_cuda_vars(environ_cp):
 def set_host_cxx_compiler(environ_cp):
   """Set HOST_CXX_COMPILER."""
   default_cxx_host_compiler = which('g++') or ''
-  ask_cxx_host_compiler = (
-      'Please specify which C++ compiler should be used as'
-      ' the host C++ compiler. [Default is %s]: ') % default_cxx_host_compiler
-
-  while True:
-    host_cxx_compiler = get_from_env_or_user_or_default(
-        environ_cp, 'HOST_CXX_COMPILER', ask_cxx_host_compiler,
-        default_cxx_host_compiler)
-    if os.path.exists(host_cxx_compiler):
-      break
 
-    # Reset and retry
-    print('Invalid C++ compiler path. %s cannot be found' % host_cxx_compiler)
-    environ_cp['HOST_CXX_COMPILER'] = ''
+  host_cxx_compiler = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='HOST_CXX_COMPILER',
+      var_default=default_cxx_host_compiler,
+      ask_for_var=('Please specify which C++ compiler should be used as the '
+                   'host C++ compiler.'),
+      check_success=os.path.exists,
+      error_msg='Invalid C++ compiler path. %s cannot be found.',
+  )
 
-  # Set HOST_CXX_COMPILER
-  environ_cp['HOST_CXX_COMPILER'] = host_cxx_compiler
   write_action_env_to_bazelrc('HOST_CXX_COMPILER', host_cxx_compiler)
 
 
 def set_host_c_compiler(environ_cp):
   """Set HOST_C_COMPILER."""
   default_c_host_compiler = which('gcc') or ''
-  ask_c_host_compiler = (
-      'Please specify which C compiler should be used as the'
-      ' host C compiler. [Default is %s]: ') % default_c_host_compiler
-
-  while True:
-    host_c_compiler = get_from_env_or_user_or_default(
-        environ_cp, 'HOST_C_COMPILER', ask_c_host_compiler,
-        default_c_host_compiler)
-    if os.path.exists(host_c_compiler):
-      break
 
-    # Reset and retry
-    print('Invalid C compiler path. %s cannot be found' % host_c_compiler)
-    environ_cp['HOST_C_COMPILER'] = ''
+  host_c_compiler = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='HOST_C_COMPILER',
+      var_default=default_c_host_compiler,
+      ask_for_var=('Please specify which C compiler should be used as the host'
+                   'C compiler.'),
+      check_success=os.path.exists,
+      error_msg='Invalid C compiler path. %s cannot be found.',
+  )
 
-  # Set HOST_C_COMPILER
-  environ_cp['HOST_C_COMPILER'] = host_c_compiler
   write_action_env_to_bazelrc('HOST_C_COMPILER', host_c_compiler)
 
 
 def set_computecpp_toolkit_path(environ_cp):
   """Set COMPUTECPP_TOOLKIT_PATH."""
-  ask_computecpp_toolkit_path = ('Please specify the location where ComputeCpp '
-                                 'for SYCL %s is installed. [Default is %s]: '
-                                ) % (_TF_OPENCL_VERSION,
-                                     _DEFAULT_COMPUTECPP_TOOLKIT_PATH)
 
-  while True:
-    computecpp_toolkit_path = get_from_env_or_user_or_default(
-        environ_cp, 'COMPUTECPP_TOOLKIT_PATH', ask_computecpp_toolkit_path,
-        _DEFAULT_COMPUTECPP_TOOLKIT_PATH)
+  def toolkit_exists(toolkit_path):
+    """Check if a computecpp toolkit path is valid."""
     if is_linux():
       sycl_rt_lib_path = 'lib/libComputeCpp.so'
     else:
       sycl_rt_lib_path = ''
 
-    sycl_rt_lib_path_full = os.path.join(computecpp_toolkit_path,
+    sycl_rt_lib_path_full = os.path.join(toolkit_path,
                                          sycl_rt_lib_path)
-    if os.path.exists(sycl_rt_lib_path_full):
-      break
+    exists = os.path.exists(sycl_rt_lib_path_full)
+    if not exists:
+      print('Invalid SYCL %s library path. %s cannot be found' %
+            (_TF_OPENCL_VERSION, sycl_rt_lib_path_full))
+    return exists
 
-    print('Invalid SYCL %s library path. %s cannot be found' %
-          (_TF_OPENCL_VERSION, sycl_rt_lib_path_full))
-    environ_cp['COMPUTECPP_TOOLKIT_PATH'] = ''
+  computecpp_toolkit_path = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='COMPUTECPP_TOOLKIT_PATH',
+      var_default=_DEFAULT_COMPUTECPP_TOOLKIT_PATH,
+      ask_for_var=(
+          'Please specify the location where ComputeCpp for SYCL %s is '
+          'installed.' % _TF_OPENCL_VERSION),
+      check_success=toolkit_exists,
+      error_msg='Invalid SYCL compiler path. %s cannot be found.',
+      suppress_default_error=True)
 
-  # Set COMPUTECPP_TOOLKIT_PATH
-  environ_cp['COMPUTECPP_TOOLKIT_PATH'] = computecpp_toolkit_path
   write_action_env_to_bazelrc('COMPUTECPP_TOOLKIT_PATH',
                               computecpp_toolkit_path)
 
+
 def set_trisycl_include_dir(environ_cp):
-  """Set TRISYCL_INCLUDE_DIR"""
+  """Set TRISYCL_INCLUDE_DIR."""
+
   ask_trisycl_include_dir = ('Please specify the location of the triSYCL '
                              'include directory. (Use --config=sycl_trisycl '
                              'when building with Bazel) '
                              '[Default is %s]: '
-                             ) % (_DEFAULT_TRISYCL_INCLUDE_DIR)
+                            ) % (_DEFAULT_TRISYCL_INCLUDE_DIR)
+
   while True:
     trisycl_include_dir = get_from_env_or_user_or_default(
-      environ_cp, 'TRISYCL_INCLUDE_DIR', ask_trisycl_include_dir,
-      _DEFAULT_TRISYCL_INCLUDE_DIR)
+        environ_cp, 'TRISYCL_INCLUDE_DIR', ask_trisycl_include_dir,
+        _DEFAULT_TRISYCL_INCLUDE_DIR)
     if os.path.exists(trisycl_include_dir):
       break
 
@@ -905,50 +1294,30 @@ def set_trisycl_include_dir(environ_cp):
   write_action_env_to_bazelrc('TRISYCL_INCLUDE_DIR',
                               trisycl_include_dir)
 
-def set_trisycl_include_dir(environ_cp):
-  """Set TRISYCL_INCLUDE_DIR."""
-  ask_trisycl_include_dir = ('Please specify the location of the triSYCL '
-                             'include directory. (Use --config=sycl_trisycl '
-                             'when building with Bazel) '
-                             '[Default is %s]: ') % (
-                                 _DEFAULT_TRISYCL_INCLUDE_DIR)
-  while True:
-    trisycl_include_dir = get_from_env_or_user_or_default(
-        environ_cp, 'TRISYCL_INCLUDE_DIR', ask_trisycl_include_dir,
-        _DEFAULT_TRISYCL_INCLUDE_DIR)
-    if os.path.exists(trisycl_include_dir):
-      break
-
-    print('Invalid triSYCL include directory, %s cannot be found' %
-          (trisycl_include_dir))
-
-  # Set TRISYCL_INCLUDE_DIR
-  environ_cp['TRISYCL_INCLUDE_DIR'] = trisycl_include_dir
-  write_action_env_to_bazelrc('TRISYCL_INCLUDE_DIR', trisycl_include_dir)
-
 
 def set_mpi_home(environ_cp):
   """Set MPI_HOME."""
+
   default_mpi_home = which('mpirun') or which('mpiexec') or ''
   default_mpi_home = os.path.dirname(os.path.dirname(default_mpi_home))
 
-  ask_mpi_home = ('Please specify the MPI toolkit folder. [Default is %s]: '
-                 ) % default_mpi_home
-  while True:
-    mpi_home = get_from_env_or_user_or_default(environ_cp, 'MPI_HOME',
-                                               ask_mpi_home, default_mpi_home)
-
-    if os.path.exists(os.path.join(mpi_home, 'include')) and os.path.exists(
-        os.path.join(mpi_home, 'lib')):
-      break
-
-    print('Invalid path to the MPI Toolkit. %s or %s cannot be found' %
-          (os.path.join(mpi_home, 'include'),
-           os.path.exists(os.path.join(mpi_home, 'lib'))))
-    environ_cp['MPI_HOME'] = ''
+  def valid_mpi_path(mpi_home):
+    exists = (os.path.exists(os.path.join(mpi_home, 'include')) and
+              os.path.exists(os.path.join(mpi_home, 'lib')))
+    if not exists:
+      print('Invalid path to the MPI Toolkit. %s or %s cannot be found' %
+            (os.path.join(mpi_home, 'include'),
+             os.path.exists(os.path.join(mpi_home, 'lib'))))
+    return exists
 
-  # Set MPI_HOME
-  environ_cp['MPI_HOME'] = str(mpi_home)
+  _ = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='MPI_HOME',
+      var_default=default_mpi_home,
+      ask_for_var='Please specify the MPI toolkit folder.',
+      check_success=valid_mpi_path,
+      error_msg='',
+      suppress_default_error=True)
 
 
 def set_other_mpi_vars(environ_cp):
@@ -983,47 +1352,25 @@ def set_other_mpi_vars(environ_cp):
     raise ValueError('Cannot find the MPI library file in %s/lib' % mpi_home)
 
 
-def set_mkl():
-  write_to_bazelrc('build:mkl --define using_mkl=true')
-  write_to_bazelrc('build:mkl -c opt')
-  print(
-      'Add "--config=mkl" to your bazel command to build with MKL '
-      'support.\nPlease note that MKL on MacOS or windows is still not '
-      'supported.\nIf you would like to use a local MKL instead of '
-      'downloading, please set the environment variable \"TF_MKL_ROOT\" every '
-      'time before build.')
-
-
-def set_monolithic():
-  # Add --config=monolithic to your bazel command to use a mostly-static
-  # build and disable modular op registration support (this will revert to
-  # loading TensorFlow with RTLD_GLOBAL in Python). By default (without
-  # --config=monolithic), TensorFlow will build with a dependence on
-  # //tensorflow:libtensorflow_framework.so.
-  write_to_bazelrc('build:monolithic --define framework_shared_object=false')
-  # For projects which use TensorFlow as part of a Bazel build process, putting
-  # nothing in a bazelrc will default to a monolithic build. The following line
-  # opts in to modular op registration support by default:
-  write_to_bazelrc('build --define framework_shared_object=true')
-
-
-def create_android_bazelrc_configs():
-  # Flags for --config=android
-  write_to_bazelrc('build:android --crosstool_top=//external:android/crosstool')
-  write_to_bazelrc(
-      'build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain')
-  # Flags for --config=android_arm
-  write_to_bazelrc('build:android_arm --config=android')
-  write_to_bazelrc('build:android_arm --cpu=armeabi-v7a')
-  # Flags for --config=android_arm64
-  write_to_bazelrc('build:android_arm64 --config=android')
-  write_to_bazelrc('build:android_arm64 --cpu=arm64-v8a')
-
-
 def set_grpc_build_flags():
   write_to_bazelrc('build --define grpc_no_ares=true')
 
 
+def set_windows_build_flags():
+  if is_windows():
+    # The non-monolithic build is not supported yet
+    write_to_bazelrc('build --config monolithic')
+    # Suppress warning messages
+    write_to_bazelrc('build --copt=-w --host_copt=-w')
+    # Output more verbose information when something goes wrong
+    write_to_bazelrc('build --verbose_failures')
+
+
+def config_info_line(name, help_text):
+  """Helper function to print formatted help text for Bazel config options."""
+  print('\t--config=%-12s\t# %s' % (name, help_text))
+
+
 def main():
   # Make a copy of os.environ to be clear when functions and getting and setting
   # environment variables.
@@ -1034,20 +1381,22 @@ def main():
   reset_tf_configure_bazelrc()
   cleanup_makefile()
   setup_python(environ_cp)
-  run_gen_git_source(environ_cp)
 
   if is_windows():
     environ_cp['TF_NEED_S3'] = '0'
     environ_cp['TF_NEED_GCP'] = '0'
     environ_cp['TF_NEED_HDFS'] = '0'
     environ_cp['TF_NEED_JEMALLOC'] = '0'
+    environ_cp['TF_NEED_KAFKA'] = '0'
     environ_cp['TF_NEED_OPENCL_SYCL'] = '0'
     environ_cp['TF_NEED_COMPUTECPP'] = '0'
     environ_cp['TF_NEED_OPENCL'] = '0'
     environ_cp['TF_CUDA_CLANG'] = '0'
+    environ_cp['TF_NEED_TENSORRT'] = '0'
 
   if is_macos():
     environ_cp['TF_NEED_JEMALLOC'] = '0'
+    environ_cp['TF_NEED_TENSORRT'] = '0'
 
   set_build_var(environ_cp, 'TF_NEED_JEMALLOC', 'jemalloc as malloc',
                 'with_jemalloc', True)
@@ -1057,6 +1406,8 @@ def main():
                 'with_hdfs_support', True, 'hdfs')
   set_build_var(environ_cp, 'TF_NEED_S3', 'Amazon S3 File System',
                 'with_s3_support', True, 's3')
+  set_build_var(environ_cp, 'TF_NEED_KAFKA', 'Apache Kafka Platform',
+                'with_kafka_support', False, 'kafka')
   set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support',
                 False, 'xla')
   set_build_var(environ_cp, 'TF_NEED_GDR', 'GDR', 'with_gdr_support',
@@ -1079,12 +1430,27 @@ def main():
       'TF_CUDA_CONFIG_REPO' not in environ_cp):
     set_tf_cuda_version(environ_cp)
     set_tf_cudnn_version(environ_cp)
+    if is_linux():
+      set_tf_tensorrt_install_path(environ_cp)
     set_tf_cuda_compute_capabilities(environ_cp)
+    if 'LD_LIBRARY_PATH' in environ_cp and environ_cp.get('LD_LIBRARY_PATH') != '1':
+      write_action_env_to_bazelrc('LD_LIBRARY_PATH', environ_cp.get('LD_LIBRARY_PATH'))
 
     set_tf_cuda_clang(environ_cp)
     if environ_cp.get('TF_CUDA_CLANG') == '1':
-      # Set up which clang we should use as the cuda / host compiler.
-      set_clang_cuda_compiler_path(environ_cp)
+      if not is_windows():
+        # Ask if we want to download clang release while building.
+        set_tf_download_clang(environ_cp)
+      else:
+        # We use bazel's generated crosstool on Windows and there is no
+        # way to provide downloaded toolchain for that yet.
+        # TODO(ibiryukov): Investigate using clang as a cuda compiler on
+        # Windows.
+        environ_cp['TF_DOWNLOAD_CLANG'] = '0'
+
+      if environ_cp.get('TF_DOWNLOAD_CLANG') != '1':
+        # Set up which clang we should use as the cuda / host compiler.
+        set_clang_cuda_compiler_path(environ_cp)
     else:
       # Set up which gcc nvcc should use as the host compiler
       # No need to set this on Windows
@@ -1099,9 +1465,29 @@ def main():
 
   set_grpc_build_flags()
   set_cc_opt_flags(environ_cp)
-  set_mkl()
-  set_monolithic()
-  create_android_bazelrc_configs()
+  set_windows_build_flags()
+
+  if workspace_has_any_android_rule():
+    print('The WORKSPACE file has at least one of ["android_sdk_repository", '
+          '"android_ndk_repository"] already set. Will not ask to help '
+          'configure the WORKSPACE. Please delete the existing rules to '
+          'activate the helper.\n')
+  else:
+    if get_var(
+        environ_cp, 'TF_SET_ANDROID_WORKSPACE', 'android workspace',
+        False,
+        ('Would you like to interactively configure ./WORKSPACE for '
+         'Android builds?'),
+        'Searching for NDK and SDK installations.',
+        'Not configuring the WORKSPACE for Android builds.'):
+      create_android_ndk_rule(environ_cp)
+      create_android_sdk_rule(environ_cp)
+
+  print('Preconfigured Bazel build configs. You can use any of the below by '
+        'adding "--config=<>" to your build command. See tools/bazel.rc for '
+        'more details.')
+  config_info_line('mkl', 'Build with MKL support.')
+  config_info_line('monolithic', 'Config for mostly static monolithic build.')
 
 if __name__ == '__main__':
   main()
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index bfebe8a5678a2c0508b31f5dd898eac22186a072..dc995d231d3e591771f801e28024a76610cdba26 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -211,6 +211,12 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "with_kafka_support",
+    define_values = {"with_kafka_support": "true"},
+    visibility = ["//visibility:public"],
+)
+
 # Crosses between platforms and file system libraries not supported on those
 # platforms due to limitations in nested select() statements.
 config_setting(
@@ -364,11 +370,17 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
-# Make a dummy rule that we can change "default" in select statements to.
-# to disable dependencies in copybara.
 config_setting(
-    name = "dummy_disabled_internal",
-    values = {"define": "with_dummy_disabled_internal=true"},
+    name = "override_eigen_strong_inline",
+    values = {"define": "override_eigen_strong_inline=true"},
+    visibility = ["//visibility:public"],
+)
+
+# TODO(laigd): consider removing this option and make TensorRT enabled
+# automatically when CUDA is enabled.
+config_setting(
+    name = "with_tensorrt_support",
+    values = {"define": "with_tensorrt_support=true"},
     visibility = ["//visibility:public"],
 )
 
@@ -378,6 +390,7 @@ package_group(
         "//learning/meta_rank/...",
         "//tensorflow/...",
         "//tensorflow_fold/llgtm/...",
+        "//third_party/py/tensor2tensor/...",
     ],
 )
 
@@ -409,6 +422,8 @@ filegroup(
         "//tensorflow/c:all_files",
         "//tensorflow/cc:all_files",
         "//tensorflow/cc/saved_model:all_files",
+        "//tensorflow/cc/saved_model/python:all_files",
+        "//tensorflow/cc/tools:all_files",
         "//tensorflow/compiler/aot:all_files",
         "//tensorflow/compiler/aot/tests:all_files",
         "//tensorflow/compiler/jit:all_files",
@@ -427,6 +442,7 @@ filegroup(
         "//tensorflow/compiler/xla/client:all_files",
         "//tensorflow/compiler/xla/client/lib:all_files",
         "//tensorflow/compiler/xla/legacy_flags:all_files",
+        "//tensorflow/compiler/xla/python:all_files",
         "//tensorflow/compiler/xla/service:all_files",
         "//tensorflow/compiler/xla/service/cpu:all_files",
         "//tensorflow/compiler/xla/service/gpu:all_files",
@@ -440,9 +456,6 @@ filegroup(
         "//tensorflow/contrib/all_reduce:all_files",
         "//tensorflow/contrib/android:all_files",
         "//tensorflow/contrib/batching:all_files",
-        "//tensorflow/contrib/batching/kernels:all_files",
-        "//tensorflow/contrib/batching/test_util:all_files",
-        "//tensorflow/contrib/batching/util:all_files",
         "//tensorflow/contrib/bayesflow:all_files",
         "//tensorflow/contrib/boosted_trees:all_files",
         "//tensorflow/contrib/boosted_trees/estimator_batch:all_files",
@@ -452,6 +465,7 @@ filegroup(
         "//tensorflow/contrib/cloud:all_files",
         "//tensorflow/contrib/cloud/kernels:all_files",
         "//tensorflow/contrib/cluster_resolver:all_files",
+        "//tensorflow/contrib/coder:all_files",
         "//tensorflow/contrib/compiler:all_files",
         "//tensorflow/contrib/copy_graph:all_files",
         "//tensorflow/contrib/crf:all_files",
@@ -461,11 +475,15 @@ filegroup(
         "//tensorflow/contrib/data/python/kernel_tests:all_files",
         "//tensorflow/contrib/data/python/ops:all_files",
         "//tensorflow/contrib/decision_trees/proto:all_files",
+        "//tensorflow/contrib/deprecated:all_files",
         "//tensorflow/contrib/distributions:all_files",
+        "//tensorflow/contrib/eager/proto:all_files",
         "//tensorflow/contrib/eager/python:all_files",
         "//tensorflow/contrib/estimator:all_files",
         "//tensorflow/contrib/factorization:all_files",
+        "//tensorflow/contrib/factorization/examples:all_files",
         "//tensorflow/contrib/factorization/kernels:all_files",
+        "//tensorflow/contrib/feature_column:all_files",
         "//tensorflow/contrib/ffmpeg:all_files",
         "//tensorflow/contrib/ffmpeg/default:all_files",
         "//tensorflow/contrib/framework:all_files",
@@ -475,6 +493,7 @@ filegroup(
         "//tensorflow/contrib/graph_editor:all_files",
         "//tensorflow/contrib/grid_rnn:all_files",
         "//tensorflow/contrib/hooks:all_files",
+        "//tensorflow/contrib/hvx/clock_cycle_profiling:all_files",
         "//tensorflow/contrib/hvx/hvx_ops_support_checker:all_files",
         "//tensorflow/contrib/image:all_files",
         "//tensorflow/contrib/input_pipeline:all_files",
@@ -492,6 +511,8 @@ filegroup(
         "//tensorflow/contrib/layers/kernels:all_files",
         "//tensorflow/contrib/learn:all_files",
         "//tensorflow/contrib/learn/python/learn/datasets:all_files",
+        "//tensorflow/contrib/legacy_seq2seq:all_files",
+        "//tensorflow/contrib/libsvm:all_files",
         "//tensorflow/contrib/linalg:all_files",
         "//tensorflow/contrib/linear_optimizer:all_files",
         "//tensorflow/contrib/lite:all_files",
@@ -516,15 +537,23 @@ filegroup(
         "//tensorflow/contrib/lookup:all_files",
         "//tensorflow/contrib/losses:all_files",
         "//tensorflow/contrib/makefile:all_files",
+        "//tensorflow/contrib/memory_stats:all_files",
         "//tensorflow/contrib/meta_graph_transform:all_files",
         "//tensorflow/contrib/metrics:all_files",
         "//tensorflow/contrib/model_pruning:all_files",
-        "//tensorflow/contrib/mpi_collectives:all_files",
-        "//tensorflow/contrib/ndlstm:all_files",
+        "//tensorflow/contrib/model_pruning/examples/cifar10:all_files",
+        "//tensorflow/contrib/nccl:all_files",
         "//tensorflow/contrib/nearest_neighbor:all_files",
         "//tensorflow/contrib/nn:all_files",
         "//tensorflow/contrib/opt:all_files",
+        "//tensorflow/contrib/periodic_resample:all_files",
         "//tensorflow/contrib/predictor:all_files",
+        "//tensorflow/contrib/py2tf:all_files",
+        "//tensorflow/contrib/py2tf/converters:all_files",
+        "//tensorflow/contrib/py2tf/impl:all_files",
+        "//tensorflow/contrib/py2tf/pyct:all_files",
+        "//tensorflow/contrib/py2tf/pyct/static_analysis:all_files",
+        "//tensorflow/contrib/py2tf/utils:all_files",
         "//tensorflow/contrib/quantize:all_files",
         "//tensorflow/contrib/receptive_field:all_files",
         "//tensorflow/contrib/reduce_slice_ops:all_files",
@@ -553,6 +582,7 @@ filegroup(
         "//tensorflow/contrib/tensor_forest/proto:all_files",
         "//tensorflow/contrib/tensorboard:all_files",
         "//tensorflow/contrib/tensorboard/db:all_files",
+        "//tensorflow/contrib/tensorrt:all_files",
         "//tensorflow/contrib/testing:all_files",
         "//tensorflow/contrib/text:all_files",
         "//tensorflow/contrib/tfprof:all_files",
@@ -567,6 +597,7 @@ filegroup(
         "//tensorflow/contrib/util:all_files",
         "//tensorflow/contrib/verbs:all_files",
         "//tensorflow/core:all_files",
+        "//tensorflow/core/api_def:all_files",
         "//tensorflow/core/debug:all_files",
         "//tensorflow/core/distributed_runtime:all_files",
         "//tensorflow/core/distributed_runtime/rpc:all_files",
@@ -577,6 +608,9 @@ filegroup(
         "//tensorflow/core/grappler/optimizers:all_files",
         "//tensorflow/core/grappler/utils:all_files",
         "//tensorflow/core/kernels:all_files",
+        "//tensorflow/core/kernels/batching_util:all_files",
+        "//tensorflow/core/kernels/data:all_files",
+        "//tensorflow/core/kernels/data/sql:all_files",
         "//tensorflow/core/kernels/fuzzing:all_files",
         "//tensorflow/core/kernels/hexagon:all_files",
         "//tensorflow/core/kernels/neon:all_files",
@@ -591,6 +625,7 @@ filegroup(
         "//tensorflow/core/profiler/internal/advisor:all_files",
         "//tensorflow/core/util/ctc:all_files",
         "//tensorflow/core/util/tensor_bundle:all_files",
+        "//tensorflow/examples/adding_an_op:all_files",
         "//tensorflow/examples/android:all_files",
         "//tensorflow/examples/benchmark:all_files",
         "//tensorflow/examples/get_started/regression:all_files",
@@ -598,10 +633,13 @@ filegroup(
         "//tensorflow/examples/image_retraining:all_files",
         "//tensorflow/examples/label_image:all_files",
         "//tensorflow/examples/learn:all_files",
+        "//tensorflow/examples/multibox_detector:all_files",
         "//tensorflow/examples/saved_model:all_files",
         "//tensorflow/examples/speech_commands:all_files",
         "//tensorflow/examples/tutorials/estimators:all_files",
+        "//tensorflow/examples/tutorials/layers:all_files",
         "//tensorflow/examples/tutorials/mnist:all_files",
+        "//tensorflow/examples/tutorials/monitors:all_files",
         "//tensorflow/examples/tutorials/word2vec:all_files",
         "//tensorflow/examples/wav_to_spectrogram:all_files",
         "//tensorflow/go:all_files",
@@ -610,6 +648,7 @@ filegroup(
         "//tensorflow/java/src/main/native:all_files",
         "//tensorflow/python:all_files",
         "//tensorflow/python/data:all_files",
+        "//tensorflow/python/data/kernel_tests:all_files",
         "//tensorflow/python/data/ops:all_files",
         "//tensorflow/python/data/util:all_files",
         "//tensorflow/python/debug:all_files",
@@ -623,6 +662,7 @@ filegroup(
         "//tensorflow/python/kernel_tests/random:all_files",
         "//tensorflow/python/ops/distributions:all_files",
         "//tensorflow/python/ops/linalg:all_files",
+        "//tensorflow/python/ops/losses:all_files",
         "//tensorflow/python/profiler:all_files",
         "//tensorflow/python/profiler/internal:all_files",
         "//tensorflow/python/saved_model:all_files",
@@ -633,6 +673,7 @@ filegroup(
         "//tensorflow/tools/api/tests:all_files",
         "//tensorflow/tools/benchmark:all_files",
         "//tensorflow/tools/build_info:all_files",
+        "//tensorflow/tools/ci_build/gpu_build:all_files",
         "//tensorflow/tools/common:all_files",
         "//tensorflow/tools/compatibility:all_files",
         "//tensorflow/tools/dist_test/server:all_files",
@@ -640,17 +681,20 @@ filegroup(
         "//tensorflow/tools/docker/notebooks:all_files",
         "//tensorflow/tools/docs:all_files",
         "//tensorflow/tools/git:all_files",
+        "//tensorflow/tools/graph_transforms:all_files",
         "//tensorflow/tools/mlpbtxt:all_files",
         "//tensorflow/tools/proto_text:all_files",
         "//tensorflow/tools/quantization:all_files",
         "//tensorflow/tools/test:all_files",
         "//tensorflow/user_ops:all_files",
+        "//third_party/eigen3:all_files",
+        "//third_party/fft2d:all_files",
+        "//third_party/flatbuffers:all_files",
         "//third_party/hadoop:all_files",
-        "//third_party/mpi:all_files",
         "//third_party/sycl:all_files",
         "//third_party/sycl/sycl:all_files",
     ],
-    visibility = [":__subpackages__"],
+    visibility = ["//visibility:public"],
 )
 
 load(
@@ -774,6 +818,7 @@ tf_cc_shared_object(
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:client_session",
         "//tensorflow/cc:scope",
+        "//tensorflow/cc/profiler",
         "//tensorflow/core:tensorflow",
     ],
 )
diff --git a/tensorflow/SECURITY.md b/tensorflow/SECURITY.md
new file mode 100644
index 0000000000000000000000000000000000000000..6ddac1f964dfba3afd240441e2a036bc24ee6d91
--- /dev/null
+++ b/tensorflow/SECURITY.md
@@ -0,0 +1,239 @@
+# Using TensorFlow Securely
+
+This document discusses how to safely deal with untrusted programs (models or
+model parameters), and input data. Below, we also provide guidelines on how to
+report vulnerabilities in TensorFlow.
+
+## TensorFlow models are programs
+
+TensorFlow's runtime system interprets and executes programs. What machine 
+learning practitioners term
+[**models**](https://developers.google.com/machine-learning/glossary/#model) are
+expressed as programs that TensorFlow executes.  TensorFlow programs are encoded
+as computation
+[**graphs**](https://developers.google.com/machine-learning/glossary/#graph).
+The model's parameters are often stored separately in **checkpoints**.
+
+At runtime, TensorFlow executes the computation graph using the parameters
+provided. Note that the behavior of the computation graph may change
+depending on the parameters provided. TensorFlow itself is not a sandbox. When
+executing the computation graph, TensorFlow may read and write files, send and
+receive data over the network, and even spawn additional processes. All these
+tasks are performed with the permissions of the TensorFlow process. Allowing
+for this flexibility makes for a powerful machine learning platform,
+but it has implications for security.
+
+The computation graph may also accept **inputs**. Those inputs are the
+data you supply to TensorFlow to train a model, or to use a model to run
+inference on the data.
+
+**TensorFlow models are programs, and need to be treated as such from a security
+perspective.** 
+
+## Running untrusted models
+
+As a general rule: **Always** execute untrusted models inside a sandbox (e.g.,
+[nsjail](https://github.com/google/nsjail)). 
+
+There are several ways in which a model could become untrusted. Obviously, if an
+untrusted party supplies TensorFlow kernels, arbitrary code may be executed.
+The same is true if the untrusted party provides Python code, such as the
+Python code that generates TensorFlow graphs.
+
+Even if the untrusted party only supplies the serialized computation
+graph (in form of a `GraphDef`, `SavedModel`, or equivalent on-disk format), the
+set of computation primitives available to TensorFlow is powerful enough that
+you should assume that the TensorFlow process effectively executes arbitrary
+code. One common solution is to whitelist only a few safe Ops. While this is
+possible in theory, we still recommend you sandbox the execution.
+
+It depends on the computation graph whether a user provided checkpoint is safe.
+It is easily possible to create computation graphs in which malicious
+checkpoints can trigger unsafe behavior. For example, consider a graph that
+contains a `tf.cond` depending on the value of a `tf.Variable`. One branch of
+the `tf.cond` is harmless, but the other is unsafe. Since the `tf.Variable` is
+stored in the checkpoint, whoever provides the checkpoint now has the ability to
+trigger unsafe behavior, even though the graph is not under their control.
+
+In other words, graphs can contain vulnerabilities of their own. To allow users
+to provide checkpoints to a model you run on their behalf (e.g., in order to
+compare model quality for a fixed model architecture), you must carefully audit
+your model, and we recommend you run the TensorFlow process in a sandbox.
+
+## Accepting untrusted Inputs
+
+It is possible to write models that are secure in a sense that they can safely
+process untrusted inputs assuming there are no bugs. There are two main reasons
+to not rely on this: first, it is easy to write models which must not be exposed
+to untrusted inputs, and second, there are bugs in any software system of
+sufficient complexity. Letting users control inputs could allow them to trigger
+bugs either in TensorFlow or in dependent libraries.
+
+In general, it is good practice to isolate parts of any system which is exposed
+to untrusted (e.g., user-provided) inputs in a sandbox.
+
+A useful analogy to how any TensorFlow graph is executed is any interpreted
+programming language, such as Python. While it is possible to write secure
+Python code which can be exposed to user supplied inputs (by, e.g., carefully
+quoting and sanitizing input strings, size-checking input blobs, etc.), it is
+very easy to write Python programs which are insecure. Even secure Python code
+could be rendered insecure by a bug in the Python interpreter, or in a bug in a
+Python library used (e.g.,
+[this one](https://www.cvedetails.com/cve/CVE-2017-12852/)).
+
+## Running a TensorFlow server
+
+TensorFlow is a platform for distributed computing, and as such there is a
+TensorFlow server (`tf.train.Server`). **The TensorFlow server is meant for
+internal communication only. It is not built for use in an untrusted network.**
+
+For performance reasons, the default TensorFlow server does not include any
+authorization protocol and sends messages unencrypted. It accepts connections
+from anywhere, and executes the graphs it is sent without performing any checks.
+Therefore, if you run a `tf.train.Server` in your network, anybody with
+access to the network can execute what you should consider arbitrary code with
+the privileges of the process running the `tf.train.Server`.
+
+When running distributed TensorFlow, you must isolate the network in which the
+cluster lives. Cloud providers provide instructions for setting up isolated
+networks, which are sometimes branded as "virtual private cloud." Refer to the
+instructions for
+[GCP](https://cloud.google.com/compute/docs/networks-and-firewalls) and
+[AWS](https://aws.amazon.com/vpc/)) for details.
+
+Note that `tf.train.Server` is different from the server created by
+`tensorflow/serving` (the default binary for which is called `ModelServer`).
+By default, `ModelServer` also has no built-in mechanism for authentication.
+Connecting it to an untrusted network allows anyone on this network to run the
+graphs known to the `ModelServer`. This means that an attacker may run
+graphs using untrusted inputs as described above, but they would not be able to
+execute arbitrary graphs. It is possible to safely expose a `ModelServer`
+directly to an untrusted network, **but only if the graphs it is configured to
+use have been carefully audited to be safe**. 
+
+Similar to best practices for other servers, we recommend running any
+`ModelServer` with appropriate privileges (i.e., using a separate user with
+reduced permisisons). In the spirit of defense in depth, we recommend
+authenticating requests to any TensorFlow server connected to an untrusted
+network, as well as sandboxing the server to minimize the adverse effects of
+any breach.
+
+## Vulnerabilities in TensorFlow
+
+TensorFlow is a large and complex system. It also depends on a large set of
+third party libraries (e.g., `numpy`, `libjpeg-turbo`, PNG parsers, `protobuf`).
+It is possible that TensorFlow or its dependent libraries contain
+vulnerabilities that would allow triggering unexpected or dangerous behavior
+with specially crafted inputs.
+
+### What is a vulnerability?
+
+Given TensorFlow's flexibility, it is possible to specify computation graphs
+which exhibit unexpected or unwanted behaviors. The fact that TensorFlow models
+can perform arbitrary computations means that they may read and write files,
+communicate via the network, produce deadlocks and infinite loops, or run out
+of memory. It is only when these behaviors are outside the specifications of the
+operations involved that such behavior is a vulnerability. 
+
+A `FileWriter` writing a file is not unexpected behavior and therefore is not a
+vulnerability in TensorFlow. A `MatMul` allowing arbitrary binary code execution
+**is** a vulnerability.
+
+This is more subtle from a system perspective. For example, it is easy to cause
+a TensorFlow process to try to allocate more memory than available by specifying
+a computation graph containing an ill-considered `tf.tile` operation. TensorFlow
+should exit cleanly in this case (it would raise an exception in Python, or
+return an error `Status` in C++). However, if the surrounding system is not
+expecting the possibility, such behavior could be used in a denial of service
+attack (or worse). Because TensorFlow behaves correctly, this is not a
+vulnerability in TensorFlow (although it would be a vulnerability of this
+hypothetical system).
+
+As a general rule, it is incorrect behavior for Tensorflow to access memory it
+does not own, or to terminate in an unclean way. Bugs in TensorFlow that lead to
+such behaviors constitute a vulnerability.
+
+One of the most critical parts of any system is input handling. If malicious
+input can trigger side effects or incorrect behavior, this is a bug, and likely
+a vulnerability.
+
+### Reporting vulnerabilities
+
+Please email reports about any security related issues you find to
+`security@tensorflow.org`. This mail is delivered to a small security team. Your
+email will be acknowledged within one business day, and you'll receive a more
+detailed response to your email within 7 days indicating the next steps in
+handling your report. For critical problems, you may encrypt your report (see
+below).
+
+Please use a descriptive subject line for your report email. After the initial
+reply to your report, the security team will endeavor to keep you informed of
+the progress being made towards a fix and announcement. 
+
+If you believe that an existing (public) issue is security-related, please send
+an email to `security@tensorflow.org`. The email should include the issue ID and
+a short description of why it should be handled according to this security
+policy.
+
+Once an issue is reported, TensorFlow uses the following disclosure process:
+
+* When a report is received, we confirm the issue and determine its severity.
+* If we know of specific third-party services or software based on TensorFlow
+  that require mitigation before publication, those projects will be notified.
+* An advisory is prepared (but not published) which details the problem and
+  steps for mitigation.
+* Wherever possible, fixes are prepared for the last minor release of the two
+  latest major releases, as well as the master branch. We will attempt to
+  commit these fixes as soon as possible, and as close together as
+  possible.
+* Patch releases are published for all fixed released versions, a
+  notification is sent to discuss@tensorflow.org, and the advisory is published.
+
+Past security advisories are listed below. We credit reporters for identifying
+security issues, although we keep your name confidential if you request it.
+
+#### Encryption key for `security@tensorflow.org`
+
+If your disclosure is extremely sensitive, you may choose to encrypt your
+report using the key below. Please only use this for critical security
+reports.
+
+```
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+
+mQENBFpqdzwBCADTeAHLNEe9Vm77AxhmGP+CdjlY84O6DouOCDSq00zFYdIU/7aI
+LjYwhEmDEvLnRCYeFGdIHVtW9YrVktqYE9HXVQC7nULU6U6cvkQbwHCdrjaDaylP
+aJUXkNrrxibhx9YYdy465CfusAaZ0aM+T9DpcZg98SmsSml/HAiiY4mbg/yNVdPs
+SEp/Ui4zdIBNNs6at2gGZrd4qWhdM0MqGJlehqdeUKRICE/mdedXwsWLM8AfEA0e
+OeTVhZ+EtYCypiF4fVl/NsqJ/zhBJpCx/1FBI1Uf/lu2TE4eOS1FgmIqb2j4T+jY
+e+4C8kGB405PAC0n50YpOrOs6k7fiQDjYmbNABEBAAG0LVRlbnNvckZsb3cgU2Vj
+dXJpdHkgPHNlY3VyaXR5QHRlbnNvcmZsb3cub3JnPokBTgQTAQgAOBYhBEkvXzHm
+gOJBnwP4Wxnef3wVoM2yBQJaanc8AhsDBQsJCAcCBhUKCQgLAgQWAgMBAh4BAheA
+AAoJEBnef3wVoM2yNlkIAICqetv33MD9W6mPAXH3eon+KJoeHQHYOuwWfYkUF6CC
+o+X2dlPqBSqMG3bFuTrrcwjr9w1V8HkNuzzOJvCm1CJVKaxMzPuXhBq5+DeT67+a
+T/wK1L2R1bF0gs7Pp40W3np8iAFEh8sgqtxXvLGJLGDZ1Lnfdprg3HciqaVAiTum
+HBFwszszZZ1wAnKJs5KVteFN7GSSng3qBcj0E0ql2nPGEqCVh+6RG/TU5C8gEsEf
+3DX768M4okmFDKTzLNBm+l08kkBFt+P43rNK8dyC4PXk7yJa93SmS/dlK6DZ16Yw
+2FS1StiZSVqygTW59rM5XNwdhKVXy2mf/RtNSr84gSi5AQ0EWmp3PAEIALInfBLR
+N6fAUGPFj+K3za3PeD0fWDijlC9f4Ety/icwWPkOBdYVBn0atzI21thPRbfuUxfe
+zr76xNNrtRRlbDSAChA1J5T86EflowcQor8dNC6fS+oHFCGeUjfEAm16P6mGTo0p
+osdG2XnnTHOOEFbEUeWOwR/zT0QRaGGknoy2pc4doWcJptqJIdTl1K8xyBieik/b
+nSoClqQdZJa4XA3H9G+F4NmoZGEguC5GGb2P9NHYAJ3MLHBHywZip8g9oojIwda+
+OCLL4UPEZ89cl0EyhXM0nIAmGn3Chdjfu3ebF0SeuToGN8E1goUs3qSE77ZdzIsR
+BzZSDFrgmZH+uP0AEQEAAYkBNgQYAQgAIBYhBEkvXzHmgOJBnwP4Wxnef3wVoM2y
+BQJaanc8AhsMAAoJEBnef3wVoM2yX4wIALcYZbQhSEzCsTl56UHofze6C3QuFQIH
+J4MIKrkTfwiHlCujv7GASGU2Vtis5YEyOoMidUVLlwnebE388MmaJYRm0fhYq6lP
+A3vnOCcczy1tbo846bRdv012zdUA+wY+mOITdOoUjAhYulUR0kiA2UdLSfYzbWwy
+7Obq96Jb/cPRxk8jKUu2rqC/KDrkFDtAtjdIHh6nbbQhFuaRuWntISZgpIJxd8Bt
+Gwi0imUVd9m9wZGuTbDGi6YTNk0GPpX5OMF5hjtM/objzTihSw9UN+65Y/oSQM81
+v//Fw6ZeY+HmRDFdirjD7wXtIuER4vqCryIqR6Xe9X8oJXz9L/Jhslc=
+=CDME
+-----END PGP PUBLIC KEY BLOCK-----
+```
+
+### Known vulnerabilities
+
+| Type | Versions affected | Reported by | Additional Information |
+|------|:-----------------:|---------------------------------------|
+| out of bounds read| <=1.4 | TenCent Blade Team | [issue report](https://github.com/tensorflow/tensorflow/issues/14959) |
+
diff --git a/tensorflow/__init__.py b/tensorflow/__init__.py
index 083634bd7964b0c12e10a1f3c71be5eab597a6c4..78ad6aec19f3bbbfcb389012ac1577573b3e4901 100644
--- a/tensorflow/__init__.py
+++ b/tensorflow/__init__.py
@@ -21,7 +21,7 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=wildcard-import
-from tensorflow.python import *
+from tensorflow.python import *  # pylint: disable=redefined-builtin
 # pylint: enable=wildcard-import
 
 from tensorflow.python.util.lazy_loader import LazyLoader
diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index ef7eb5a4d16b29aecc34f33cb41dd7cf9450c5f2..9060c58c1395f07eff0ccef7bd430b3402f8c826 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -6,6 +6,7 @@ licenses(["notice"])  # Apache 2.0
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
+    "tf_cuda_cc_test",
     "tf_copts",
     "tf_cuda_library",
     "tf_custom_op_library",
@@ -26,6 +27,18 @@ filegroup(
     visibility = ["//tensorflow:__subpackages__"],
 )
 
+filegroup(
+    name = "srcs",
+    srcs = glob(
+        [
+            "*.cc",
+            "*.h",
+        ],
+        exclude = ["*test*"],
+    ),
+    visibility = ["//visibility:public"],
+)
+
 tf_cuda_library(
     name = "c_api_internal",
     srcs = ["c_api.h"],
@@ -42,6 +55,7 @@ tf_cuda_library(
             "//tensorflow/core:core_cpu",
             "//tensorflow/core:framework",
             "//tensorflow/core:lib",
+            "//tensorflow/core:op_gen_lib",
         ],
     }),
 )
@@ -73,10 +87,17 @@ tf_cuda_library(
             "//tensorflow/core:core_cpu",
             "//tensorflow/core:core_cpu_internal",
             "//tensorflow/core:framework",
+            "//tensorflow/core:op_gen_lib",
             "//tensorflow/core:protos_all_cc",
             "//tensorflow/core:lib",
             "//tensorflow/core:lib_internal",
         ],
+    }) + select({
+        "//tensorflow:with_xla_support": [
+            "//tensorflow/compiler/tf2xla:xla_compiler",
+            "//tensorflow/compiler/jit",
+        ],
+        "//conditions:default": [],
     }),
 )
 
@@ -121,15 +142,21 @@ tf_cuda_library(
     testonly = 1,
     srcs = ["c_test_util.cc"],
     hdrs = ["c_test_util.h"],
+    visibility = [
+        "//learning/brain:__subpackages__",
+        "//tensorflow:__subpackages__",
+    ],
     deps = [
         ":c_api",
+        "//tensorflow/compiler/jit/legacy_flags:mark_for_compilation_pass_flags",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:session_options",
         "//tensorflow/core:test",
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "c_api_test",
     size = "small",
     srcs = ["c_api_test.cc"],
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index bb41f92306b413d610bf115d144b15faa568ee14..85f1d1639b4d09f2de77d326481a86ec246270d0 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/cc/framework/scope_internal.h"
 #include "tensorflow/cc/ops/while_loop.h"
 #include "tensorflow/cc/saved_model/loader.h"
+#include "tensorflow/core/framework/op_gen_lib.h"
 #endif
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
@@ -63,6 +64,7 @@ using tensorflow::AllocationDescription;
 using tensorflow::DataType;
 using tensorflow::Graph;
 using tensorflow::GraphDef;
+using tensorflow::mutex_lock;
 using tensorflow::NameRangeMap;
 using tensorflow::NameRangesForNode;
 using tensorflow::NewSession;
@@ -76,6 +78,7 @@ using tensorflow::RunMetadata;
 using tensorflow::RunOptions;
 using tensorflow::Session;
 using tensorflow::Status;
+using tensorflow::string;
 using tensorflow::Tensor;
 using tensorflow::TensorBuffer;
 using tensorflow::TensorId;
@@ -86,8 +89,6 @@ using tensorflow::error::Code;
 using tensorflow::errors::FailedPrecondition;
 using tensorflow::errors::InvalidArgument;
 using tensorflow::gtl::ArraySlice;
-using tensorflow::mutex_lock;
-using tensorflow::string;
 using tensorflow::strings::StrCat;
 
 extern "C" {
@@ -108,6 +109,10 @@ TF_Status* TF_NewStatus() { return new TF_Status; }
 void TF_DeleteStatus(TF_Status* s) { delete s; }
 
 void TF_SetStatus(TF_Status* s, TF_Code code, const char* msg) {
+  if (code == TF_OK) {
+    s->status = Status::OK();
+    return;
+  }
   s->status = Status(static_cast<Code>(code), tensorflow::StringPiece(msg));
 }
 
@@ -194,11 +199,11 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims,
       reinterpret_cast<intptr_t>(data) % EIGEN_MAX_ALIGN_BYTES != 0) {
     // TF_STRING and TF_RESOURCE tensors have a different representation in
     // TF_Tensor than they do in tensorflow::Tensor. So a copy here is a waste
-    // (any alignement requirements will be taken care of by TF_TensorToTensor
+    // (any alignment requirements will be taken care of by TF_TensorToTensor
     // and TF_TensorFromTensor).
     //
-    // Other types have the same represntation, so copy only if it is safe to do
-    // so.
+    // Other types have the same representation, so copy only if it is safe to
+    // do so.
     buf->data_ = allocate_tensor("TF_NewTensor", len);
     std::memcpy(buf->data_, data, len);
     buf->deallocator_ = deallocate_buffer;
@@ -210,7 +215,13 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims,
     buf->deallocator_ = deallocator;
     buf->deallocator_arg_ = deallocator_arg;
   }
-  return new TF_Tensor{dtype, TensorShape(dimvec), buf};
+  TF_Tensor* ret = new TF_Tensor{dtype, TensorShape(dimvec), buf};
+  size_t elem_size = TF_DataTypeSize(dtype);
+  if (elem_size > 0 && len < (elem_size * ret->shape.num_elements())) {
+    delete ret;
+    return nullptr;
+  }
+  return ret;
 }
 
 TF_Tensor* TF_TensorMaybeMove(TF_Tensor* tensor) {
@@ -383,12 +394,11 @@ void TF_Reset_Helper(const TF_SessionOptions* opt, const char** containers,
 // be less than the total node count.
 Status ValidateNoCycles(const Graph& g) {
   // TODO(nolivia): check this on a subset of the graph instead of all of it.
-  int total_num_nodes = g.num_node_ids();
   // A node is ready when all of its inputs have been visited.
   std::vector<const Node*> ready;
-  std::vector<int> pending_count(total_num_nodes, 0);
+  std::vector<int> pending_count(g.num_node_ids(), 0);
 
-  for (int i = 0; i < total_num_nodes; ++i) {
+  for (int i = 0; i < g.num_node_ids(); ++i) {
     const Node* n = g.FindNodeId(i);
     if (n == nullptr) continue;
     pending_count[i] = n->in_edges().size();
@@ -421,7 +431,7 @@ Status ValidateNoCycles(const Graph& g) {
     }
   }
 
-  if (processed < total_num_nodes) {
+  if (processed < g.num_nodes()) {
     std::vector<string> nodes_in_cycle;
     for (int i = 0; i < pending_count.size() && nodes_in_cycle.size() < 3;
          ++i) {
@@ -430,7 +440,7 @@ Status ValidateNoCycles(const Graph& g) {
       }
     }
     return errors::InvalidArgument(
-        "Graph is invalid, contains a cycle with ", total_num_nodes - processed,
+        "Graph is invalid, contains a cycle with ", g.num_nodes() - processed,
         " nodes, including: ", str_util::Join(nodes_in_cycle, ", "));
   }
   return Status::OK();
@@ -580,6 +590,7 @@ TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src,
       status->status = InvalidArgument(
           "invalid string tensor encoding (string #", i, " of ",
           srcarray.size(), "): ", status->status.error_message());
+      delete[] base;
       return nullptr;
     }
     dst += consumed;
@@ -589,6 +600,7 @@ TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src,
     status->status = InvalidArgument(
         "invalid string tensor encoding (decoded ", (dst - base),
         " bytes, but the tensor is encoded in ", size, " bytes");
+    delete[] base;
     return nullptr;
   }
 
@@ -625,6 +637,73 @@ Status MessageToBuffer(const tensorflow::protobuf::Message& in,
   return Status::OK();
 }
 
+void RecordMutation(TF_Graph* graph, const TF_Operation& op,
+                    const char* mutation_type)
+    EXCLUSIVE_LOCKS_REQUIRED(graph->mu) {
+  // If any session has already run this node_id, mark this session as
+  // unrunnable.
+  for (auto it : graph->sessions) {
+    if (it.first->last_num_graph_nodes > op.node.id()) {
+      it.second = FailedPrecondition(
+          "Operation '", op.node.DebugString(), "' was changed by ",
+          mutation_type,
+          " after it was run by a session. Nodes can be mutated "
+          "only before they are executed by a session. Either don't modify "
+          "nodes after running them or create a new session.");
+    }
+  }
+}
+
+namespace {
+
+// Helper method that creates a shape handle for a shape described by dims.
+tensorflow::shape_inference::ShapeHandle ShapeHandleFromDims(
+    tensorflow::shape_inference::InferenceContext* ic, int num_dims,
+    const int64_t* dims) {
+  if (num_dims != -1) {
+    std::vector<tensorflow::shape_inference::DimensionHandle> dim_vec;
+    dim_vec.reserve(num_dims);
+    for (int i = 0; i < num_dims; ++i) {
+      dim_vec.push_back(ic->MakeDim(dims[i]));
+    }
+    return ic->MakeShape(dim_vec);
+  } else {
+    return ic->UnknownShape();
+  }
+}
+
+}  // namespace
+
+void TF_GraphSetOutputHandleShapesAndTypes(TF_Graph* graph, TF_Output output,
+                                           int num_shapes_and_types,
+                                           const int64_t** shapes,
+                                           const int* ranks,
+                                           const TF_DataType* types,
+                                           TF_Status* status) {
+  Node* node = &output.oper->node;
+
+  mutex_lock l(graph->mu);
+  tensorflow::shape_inference::InferenceContext* ic =
+      graph->refiner.GetContext(node);
+  if (ic == nullptr) {
+    status->status =
+        InvalidArgument("Node ", node->name(), " was not found in the graph");
+    return;
+  }
+
+  auto shape_and_type_vec =
+      std::vector<tensorflow::shape_inference::ShapeAndType>(
+          num_shapes_and_types);
+  for (int i = 0; i < num_shapes_and_types; ++i) {
+    tensorflow::shape_inference::ShapeHandle shape_handle =
+        ShapeHandleFromDims(ic, ranks[i], shapes[i]);
+    shape_and_type_vec[i] = tensorflow::shape_inference::ShapeAndType(
+        shape_handle, static_cast<DataType>(types[i]));
+  }
+
+  ic->set_output_handle_shapes_and_types(output.index, shape_and_type_vec);
+}
+
 // Helpers for loading a TensorFlow plugin (a .so file).
 Status LoadLibrary(const char* library_filename, void** result,
                    const void** buf, size_t* len);
@@ -858,6 +937,7 @@ int TF_DeviceListCount(const TF_DeviceList* list) {
       status->status = InvalidArgument("index out of bounds");            \
       return err_val;                                                     \
     }                                                                     \
+    status->status = Status::OK();                                        \
     return list->response[index].accessor;                                \
   }
 
@@ -930,7 +1010,6 @@ void TF_GraphSetTensorShape(TF_Graph* graph, TF_Output output,
   Node* node = &output.oper->node;
 
   mutex_lock l(graph->mu);
-  // Set the shape.
   tensorflow::shape_inference::InferenceContext* ic =
       graph->refiner.GetContext(node);
   if (ic == nullptr) {
@@ -938,18 +1017,8 @@ void TF_GraphSetTensorShape(TF_Graph* graph, TF_Output output,
         InvalidArgument("Node ", node->name(), " was not found in the graph");
     return;
   }
-
-  tensorflow::shape_inference::ShapeHandle new_shape;
-  if (num_dims != -1) {
-    std::vector<tensorflow::shape_inference::DimensionHandle> dim_vec;
-    dim_vec.reserve(num_dims);
-    for (int i = 0; i < num_dims; ++i) {
-      dim_vec.push_back(ic->MakeDim(dims[i]));
-    }
-    new_shape = ic->MakeShape(dim_vec);
-  } else {
-    new_shape = ic->UnknownShape();
-  }
+  tensorflow::shape_inference::ShapeHandle new_shape =
+      tensorflow::ShapeHandleFromDims(ic, num_dims, dims);
   status->status = graph->refiner.SetShape(node, output.index, new_shape);
 }
 
@@ -1143,6 +1212,13 @@ void TF_SetAttrTypeList(TF_OperationDescription* desc, const char* attr_name,
                      reinterpret_cast<const DataType*>(values), num_values));
 }
 
+void TF_SetAttrFuncName(TF_OperationDescription* desc, const char* attr_name,
+                        const char* value, size_t length) {
+  tensorflow::NameAttrList func_name;
+  func_name.set_name(std::string(value, value + length));
+  desc->node_builder.Attr(attr_name, func_name);
+}
+
 void TF_SetAttrShape(TF_OperationDescription* desc, const char* attr_name,
                      const int64_t* dims, int num_dims) {
   PartialTensorShape shape;
@@ -1404,7 +1480,13 @@ int TF_OperationOutputConsumers(TF_Output oper_out, TF_Input* consumers,
 }
 
 int TF_OperationNumControlInputs(TF_Operation* oper) {
-  return oper->node.in_edges().size() - oper->node.num_inputs();
+  int count = 0;
+  for (const auto* edge : oper->node.in_edges()) {
+    if (edge->IsControlEdge() && !edge->src()->IsSource()) {
+      ++count;
+    }
+  }
+  return count;
 }
 
 int TF_OperationGetControlInputs(TF_Operation* oper,
@@ -1412,7 +1494,7 @@ int TF_OperationGetControlInputs(TF_Operation* oper,
                                  int max_control_inputs) {
   int count = 0;
   for (const auto* edge : oper->node.in_edges()) {
-    if (edge->IsControlEdge()) {
+    if (edge->IsControlEdge() && !edge->src()->IsSource()) {
       if (count < max_control_inputs) {
         control_inputs[count] = ToOperation(edge->src());
       }
@@ -1425,7 +1507,7 @@ int TF_OperationGetControlInputs(TF_Operation* oper,
 int TF_OperationNumControlOutputs(TF_Operation* oper) {
   int count = 0;
   for (const auto* edge : oper->node.out_edges()) {
-    if (edge->IsControlEdge()) {
+    if (edge->IsControlEdge() && !edge->dst()->IsSink()) {
       ++count;
     }
   }
@@ -1437,7 +1519,7 @@ int TF_OperationGetControlOutputs(TF_Operation* oper,
                                   int max_control_outputs) {
   int count = 0;
   for (const auto* edge : oper->node.out_edges()) {
-    if (edge->IsControlEdge()) {
+    if (edge->IsControlEdge() && !edge->dst()->IsSink()) {
       if (count < max_control_outputs) {
         control_outputs[count] = ToOperation(edge->dst());
       }
@@ -1745,7 +1827,6 @@ void TF_OperationToNodeDef(TF_Operation* oper, TF_Buffer* output_node_def,
 TF_Graph::TF_Graph()
     : graph(tensorflow::OpRegistry::Global()),
       refiner(graph.versions().producer(), graph.op_registry()),
-      num_sessions(0),
       delete_requested(false),
       parent(nullptr),
       parent_inputs(nullptr) {}
@@ -1755,7 +1836,7 @@ TF_Graph* TF_NewGraph() { return new TF_Graph; }
 void TF_DeleteGraph(TF_Graph* g) {
   g->mu.lock();
   g->delete_requested = true;
-  const bool del = g->num_sessions == 0;
+  const bool del = g->sessions.empty();
   g->mu.unlock();
   if (del) delete g;
 }
@@ -1835,6 +1916,16 @@ void TF_ImportGraphDefOptionsSetPrefix(TF_ImportGraphDefOptions* opts,
   opts->opts.prefix = prefix;
 }
 
+void TF_ImportGraphDefOptionsSetUniquifyNames(TF_ImportGraphDefOptions* opts,
+                                              unsigned char uniquify_names) {
+  opts->opts.uniquify_names = uniquify_names;
+}
+
+void TF_ImportGraphDefOptionsSetUniquifyPrefix(TF_ImportGraphDefOptions* opts,
+                                               unsigned char uniquify_prefix) {
+  opts->opts.uniquify_prefix = uniquify_prefix;
+}
+
 void TF_ImportGraphDefOptionsAddInputMapping(TF_ImportGraphDefOptions* opts,
                                              const char* src_name,
                                              int src_index, TF_Output dst) {
@@ -1892,12 +1983,12 @@ void TF_ImportGraphDefResultsReturnOperations(TF_ImportGraphDefResults* results,
   *opers = results->return_nodes.data();
 }
 
-void TF_ImportGraphDefResultsUnusedInputMappings(
-    TF_ImportGraphDefResults* results, int* num_unused_input_mappings,
+void TF_ImportGraphDefResultsMissingUnusedInputMappings(
+    TF_ImportGraphDefResults* results, int* num_missing_unused_input_mappings,
     const char*** src_names, int** src_indexes) {
-  *num_unused_input_mappings = results->unused_key_names.size();
-  *src_names = results->unused_key_names.data();
-  *src_indexes = results->unused_key_indexes.data();
+  *num_missing_unused_input_mappings = results->missing_unused_key_names.size();
+  *src_names = results->missing_unused_key_names.data();
+  *src_indexes = results->missing_unused_key_indexes.data();
 }
 
 void TF_DeleteImportGraphDefResults(TF_ImportGraphDefResults* results) {
@@ -1937,18 +2028,21 @@ static void GraphImportGraphDefLocked(TF_Graph* graph, const GraphDef& def,
     tf_results->return_nodes[i] = ToOperation(results.return_nodes[i]);
   }
 
-  // Populate unused map keys
-  DCHECK(tf_results->unused_key_names.empty());
-  DCHECK(tf_results->unused_key_indexes.empty());
-  DCHECK(tf_results->unused_key_names_data.empty());
-  tf_results->unused_key_names.resize(results.unused_input_map_keys.size());
-  tf_results->unused_key_indexes.resize(results.unused_input_map_keys.size());
-  for (int i = 0; i < results.unused_input_map_keys.size(); ++i) {
-    TensorId id = results.unused_input_map_keys[i];
-    tf_results->unused_key_names_data.push_back(id.first.ToString());
-    tf_results->unused_key_names[i] =
-        tf_results->unused_key_names_data.back().c_str();
-    tf_results->unused_key_indexes[i] = id.second;
+  // Populate missing unused map keys
+  DCHECK(tf_results->missing_unused_key_names.empty());
+  DCHECK(tf_results->missing_unused_key_indexes.empty());
+  DCHECK(tf_results->missing_unused_key_names_data.empty());
+
+  size_t size = results.missing_unused_input_map_keys.size();
+  tf_results->missing_unused_key_names.resize(size);
+  tf_results->missing_unused_key_indexes.resize(size);
+
+  for (int i = 0; i < size; ++i) {
+    TensorId id = results.missing_unused_input_map_keys[i];
+    tf_results->missing_unused_key_names_data.push_back(id.first.ToString());
+    tf_results->missing_unused_key_names[i] =
+        tf_results->missing_unused_key_names_data.back().c_str();
+    tf_results->missing_unused_key_indexes[i] = id.second;
   }
 }
 
@@ -2060,7 +2154,7 @@ Status CopyGraph(Graph* src_graph, Graph* dst_graph,
     opts.return_tensors.push_back(ToTensorId(nodes_to_return[i]));
   }
 
-  // TOOD(skyewm): change to OutputTensor
+  // TODO(skyewm): change to OutputTensor
   tensorflow::ImportGraphDefResults results;
   TF_RETURN_IF_ERROR(
       ImportGraphDef(opts, gdef, dst_graph, dst_refiner, &results));
@@ -2325,11 +2419,12 @@ TF_Session* TF_NewSession(TF_Graph* graph, const TF_SessionOptions* opt,
   Session* session;
   status->status = NewSession(opt->options, &session);
   if (status->status.ok()) {
+    TF_Session* new_session = new TF_Session(session, graph);
     if (graph != nullptr) {
       mutex_lock l(graph->mu);
-      graph->num_sessions += 1;
+      graph->sessions[new_session] = Status::OK();
     }
-    return new TF_Session(session, graph);
+    return new_session;
   } else {
     DCHECK_EQ(nullptr, session);
     return nullptr;
@@ -2393,7 +2488,7 @@ TF_Session* TF_LoadSessionFromSavedModel(
 
   TF_Session* session = new TF_Session(bundle.session.release(), graph);
 
-  graph->num_sessions += 1;
+  graph->sessions[session] = Status::OK();
   session->last_num_graph_nodes = graph->graph.num_node_ids();
   return session;
 #endif  // __ANDROID__
@@ -2408,8 +2503,8 @@ void TF_DeleteSession(TF_Session* s, TF_Status* status) {
   TF_Graph* const graph = s->graph;
   if (graph != nullptr) {
     graph->mu.lock();
-    graph->num_sessions -= 1;
-    const bool del = graph->delete_requested && graph->num_sessions == 0;
+    graph->sessions.erase(s);
+    const bool del = graph->delete_requested && graph->sessions.empty();
     graph->mu.unlock();
     if (del) delete graph;
   }
@@ -2425,6 +2520,13 @@ static bool ExtendSessionGraphHelper(TF_Session* session, TF_Status* status) {
     mutex_lock session_lock(session->mu);
     session->graph->mu.lock();
     const Graph& graph = session->graph->graph;
+
+    status->status = session->graph->sessions[session];
+    if (!status->status.ok()) {
+      session->graph->mu.unlock();
+      return false;
+    }
+
     const auto num_nodes = graph.num_node_ids();
     if (session->last_num_graph_nodes < num_nodes) {
       status->status = tensorflow::ValidateNoCycles(session->graph->graph);
@@ -2580,4 +2682,54 @@ void TF_SessionPRun(TF_Session* session, const char* handle,
                 output_values, target_names, nullptr, status);
 }
 
+TF_ApiDefMap* TF_NewApiDefMap(TF_Buffer* op_list_buffer, TF_Status* status) {
+  tensorflow::OpList op_list;
+  if (!op_list.ParseFromArray(op_list_buffer->data, op_list_buffer->length)) {
+    status->status = InvalidArgument("Unparseable OpList");
+    return nullptr;
+  }
+  status->status = Status::OK();
+  return new TF_ApiDefMap(op_list);
+}
+
+void TF_DeleteApiDefMap(TF_ApiDefMap* apimap) { delete apimap; }
+
+void TF_ApiDefMapPut(TF_ApiDefMap* api_def_map, const char* text,
+                     size_t text_len, TF_Status* status) {
+#ifdef __ANDROID__
+  status->status = tensorflow::errors::Unimplemented(
+      "ApiDefMap is not supported in Android.");
+#else
+  mutex_lock l(api_def_map->lock);
+  if (api_def_map->update_docs_called) {
+    status->status = FailedPrecondition(
+        "TF_ApiDefMapPut cannot be called after TF_ApiDefMapGet has been "
+        "called.");
+    return;
+  }
+  string api_def_text(text, text_len);
+  status->status = api_def_map->api_def_map.LoadApiDef(api_def_text);
+#endif  // __ANDROID__
+}
+
+TF_Buffer* TF_ApiDefMapGet(TF_ApiDefMap* api_def_map, const char* name,
+                           size_t name_len, TF_Status* status) {
+#ifdef __ANDROID__
+  status->status = tensorflow::errors::Unimplemented(
+      "ApiDefMap is not supported in Android.");
+  return nullptr;
+#else
+  mutex_lock l(api_def_map->lock);
+  if (!api_def_map->update_docs_called) {
+    api_def_map->api_def_map.UpdateDocs();
+    api_def_map->update_docs_called = true;
+  }
+  string name_str(name, name_len);
+  const auto* api_def = api_def_map->api_def_map.GetApiDef(name_str);
+
+  TF_Buffer* ret = TF_NewBuffer();
+  status->status = MessageToBuffer(*api_def, ret);
+  return ret;
+#endif  // __ANDROID__
+}
 }  // end extern "C"
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index bb569d67fcbcec29e9494236abd79b3e40db91cd..ad592ef70961ef427bfe9fd322a82bd64df7f9f1 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -226,6 +226,10 @@ typedef struct TF_Tensor TF_Tensor;
 //      (*deallocator)(data, len, deallocator_arg)
 // Clients must provide a custom deallocator function so they can pass in
 // memory managed by something like numpy.
+//
+// May return NULL (and invoke the deallocator) if the provided data buffer
+// (data, len) is inconsistent with a tensor of the given TF_DataType
+// and the shape specified by (dima, num_dims).
 TF_CAPI_EXPORT extern TF_Tensor* TF_NewTensor(
     TF_DataType, const int64_t* dims, int num_dims, void* data, size_t len,
     void (*deallocator)(void* data, size_t len, void* arg),
@@ -511,6 +515,11 @@ TF_CAPI_EXPORT extern void TF_SetAttrTypeList(TF_OperationDescription* desc,
                                               const char* attr_name,
                                               const TF_DataType* values,
                                               int num_values);
+// Set a 'func' attribute to the specified name.
+// `value` must point to a string of length `length` bytes.
+TF_CAPI_EXPORT extern void TF_SetAttrFuncName(TF_OperationDescription* desc,
+                                              const char* attr_name,
+                                              const char* value, size_t length);
 
 // Set `num_dims` to -1 to represent "unknown rank".  Otherwise,
 // `dims` points to an array of length `num_dims`.  `dims[i]` must be
@@ -889,6 +898,20 @@ TF_CAPI_EXPORT extern void TF_DeleteImportGraphDefOptions(
 TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsSetPrefix(
     TF_ImportGraphDefOptions* opts, const char* prefix);
 
+// Set whether to uniquify imported operation names. If true, imported operation
+// names will be modified if their name already exists in the graph. If false,
+// conflicting names will be treated as an error. Note that this option has no
+// effect if a prefix is set, since the prefix will guarantee all names are
+// unique. Defaults to false.
+TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsSetUniquifyNames(
+    TF_ImportGraphDefOptions* opts, unsigned char uniquify_names);
+
+// If true, the specified prefix will be modified if it already exists as an
+// operation name or prefix in the graph. If false, a conflicting prefix will be
+// treated as an error. This option has no effect if no prefix is specified.
+TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsSetUniquifyPrefix(
+    TF_ImportGraphDefOptions* opts, unsigned char uniquify_prefix);
+
 // Set any imported nodes with input `src_name:src_index` to have that input
 // replaced with `dst`. `src_name` refers to a node in the graph to be imported,
 // `dst` references a node already existing in the graph being imported into.
@@ -948,16 +971,16 @@ TF_CAPI_EXPORT extern void TF_ImportGraphDefResultsReturnOperations(
     TF_ImportGraphDefResults* results, int* num_opers, TF_Operation*** opers);
 
 // Fetches any input mappings requested via
-// TF_ImportGraphDefOptionsAddInputMapping() that weren't used as input to any
-// node in the imported graph def. The number of fetched mappings is returned in
-// `num_unused_input_mappings`. The array of each mapping's source node name is
-// returned in `src_names`, and the array of each mapping's source index is
-// returned in `src_indexes`.
+// TF_ImportGraphDefOptionsAddInputMapping() that didn't appear in the GraphDef
+// and weren't used as input to any node in the imported graph def. The number
+// of fetched mappings is returned in `num_missing_unused_input_mappings`. The
+// array of each mapping's source node name is returned in `src_names`, and the
+// array of each mapping's source index is returned in `src_indexes`.
 //
 // `*src_names`, `*src_indexes`, and the memory backing each string in
 // `src_names` are owned by and have the lifetime of `results`.
-TF_CAPI_EXPORT extern void TF_ImportGraphDefResultsUnusedInputMappings(
-    TF_ImportGraphDefResults* results, int* num_unused_input_mappings,
+TF_CAPI_EXPORT extern void TF_ImportGraphDefResultsMissingUnusedInputMappings(
+    TF_ImportGraphDefResults* results, int* num_missing_unused_input_mappings,
     const char*** src_names, int** src_indexes);
 
 // Deletes a results object returned by TF_GraphImportGraphDefWithResults().
@@ -1015,6 +1038,23 @@ TF_CAPI_EXPORT extern void TF_GraphCopyFunction(TF_Graph* g,
                                                 const TF_Function* grad,
                                                 TF_Status* status);
 
+// Returns the number of TF_Functions registered in `g`.
+TF_CAPI_EXPORT extern int TF_GraphNumFunctions(TF_Graph* g);
+
+// Fills in `funcs` with the TF_Function* registered in `g`.
+// `funcs` must point to an array of TF_Function* of length at least
+// `max_func`. In usual usage, max_func should be set to the result of
+// TF_GraphNumFunctions(g). In this case, all the functions registered in
+// `g` will be returned. Else, an unspecified subset.
+//
+// If successful, returns the number of TF_Function* successfully set in
+// `funcs` and sets status to OK. The caller takes ownership of
+// all the returned TF_Functions. They must be deleted with TF_DeleteFunction.
+// On error, returns 0, sets status to the encountered error, and the contents
+// of funcs will be undefined.
+TF_CAPI_EXPORT extern int TF_GraphGetFunctions(TF_Graph* g, TF_Function** funcs,
+                                               int max_func, TF_Status* status);
+
 // Note: The following function may fail on very large protos in the future.
 
 TF_CAPI_EXPORT extern void TF_OperationToNodeDef(TF_Operation* oper,
@@ -1247,11 +1287,12 @@ TF_CAPI_EXPORT extern void TF_DeleteFunction(TF_Function* func);
 
 typedef struct TF_Session TF_Session;
 
-// Return a new execution session with the associated graph, or NULL on error.
+// Return a new execution session with the associated graph, or NULL on
+// error. Does not take ownership of any input parameters.
 //
-// *graph must be a valid graph (not deleted or nullptr).  This function will
-// prevent the graph from being deleted until TF_DeleteSession() is called.
-// Does not take ownership of opts.
+// *`graph` must be a valid graph (not deleted or nullptr). `graph` will be be
+// kept alive for the lifetime of the returned TF_Session. New nodes can still
+// be added to `graph` after this call.
 TF_CAPI_EXPORT extern TF_Session* TF_NewSession(TF_Graph* graph,
                                                 const TF_SessionOptions* opts,
                                                 TF_Status* status);
@@ -1504,6 +1545,49 @@ TF_CAPI_EXPORT extern void TF_DeleteLibraryHandle(TF_Library* lib_handle);
 // in this address space.
 TF_CAPI_EXPORT extern TF_Buffer* TF_GetAllOpList();
 
+// TF_ApiDefMap encapsulates a collection of API definitions for an operation.
+//
+// This object maps the name of a TensorFlow operation to a description of the
+// API to generate for it, as defined by the ApiDef protocol buffer (
+// https://www.tensorflow.org/code/tensorflow/core/framework/api_def.proto)
+//
+// The ApiDef messages are typically used to generate convenience wrapper
+// functions for TensorFlow operations in various language bindings.
+typedef struct TF_ApiDefMap TF_ApiDefMap;
+
+// Creates a new TF_ApiDefMap instance.
+//
+// Params:
+//  op_list_buffer - TF_Buffer instance containing serialized OpList
+//    protocol buffer. (See
+//    https://www.tensorflow.org/code/tensorflow/core/framework/op_def.proto
+//    for the OpList proto definition).
+//  status - Set to OK on success and an appropriate error on failure.
+TF_CAPI_EXPORT extern TF_ApiDefMap* TF_NewApiDefMap(TF_Buffer* op_list_buffer,
+                                                    TF_Status* status);
+
+// Deallocates a TF_ApiDefMap.
+TF_CAPI_EXPORT extern void TF_DeleteApiDefMap(TF_ApiDefMap* apimap);
+
+// Add ApiDefs to the map.
+//
+// `text` corresponds to a text representation of an ApiDefs protocol message.
+// (https://www.tensorflow.org/code/tensorflow/core/framework/api_def.proto).
+//
+// The provided ApiDefs will be merged with existing ones in the map, with
+// precedence given to the newly added version in case of conflicts with
+// previous calls to TF_ApiDefMapPut.
+TF_CAPI_EXPORT extern void TF_ApiDefMapPut(TF_ApiDefMap* api_def_map,
+                                           const char* text, size_t text_len,
+                                           TF_Status* status);
+
+// Returns a serialized ApiDef protocol buffer for the TensorFlow operation
+// named `name`.
+TF_CAPI_EXPORT extern TF_Buffer* TF_ApiDefMapGet(TF_ApiDefMap* api_def_map,
+                                                 const char* name,
+                                                 size_t name_len,
+                                                 TF_Status* status);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/c_api_function.cc b/tensorflow/c/c_api_function.cc
index dcb818b88b6fca460852beb6e948d2eb6964f663..384e6c8cb97022264c5327da5ca5861057608fbe 100644
--- a/tensorflow/c/c_api_function.cc
+++ b/tensorflow/c/c_api_function.cc
@@ -44,8 +44,12 @@ class NodeNameMapping {
  public:
   NodeNameMapping() = default;
 
-  // Normalize the input/output name and make it unique.
-  string GetIOName(const string& name);
+  // Normalize the input name and make it unique. This is the same as the
+  // function for output, expect that it adds a name mapping for the name.
+  string GetInputName(const string& name);
+
+  // Normalize the output name and make it unique.
+  string GetOutputName(const string& name);
 
   // Make the node name unique.
   string Uniquify(const string& name);
@@ -68,7 +72,7 @@ class NodeNameMapping {
   // This is a superset of values in name_mapping_.
   std::unordered_set<string> used_names_;
   // Mapping from original node name from the graph to the normalized
-  // and uniqified version of it.
+  // and uniquified version of it.
   std::unordered_map<string, string> name_mapping_;
 };
 
@@ -107,7 +111,13 @@ string NodeNameMapping::UniquifyHelper(const string& name) const {
   }
 }
 
-string NodeNameMapping::GetIOName(const string& name) {
+string NodeNameMapping::GetInputName(const string& name) {
+  const string& input_name = GetOutputName(name);
+  name_mapping_[name] = input_name;
+  return input_name;
+}
+
+string NodeNameMapping::GetOutputName(const string& name) {
   const string& input_name = UniquifyHelper(Normalize(name));
   // Record that we used this name, but don't add it to name_mapping_
   // since this name is not for a node.
@@ -214,10 +224,11 @@ Status FillFunctionBody(
 
     // Add control inputs.
     for (const Edge* edge : control_edges) {
-      // Add this control input only if the src node is in the body.
+      // Add this control input only if the src node is in the body or a part of
+      // the inputs.
       const string normalized = node_names.Lookup(edge->src()->name());
       // If we did not find a name for the source of control edge, this
-      // source must be outside of the body. Raise an error.
+      // source must be outside of the body, and not an input. Raise an error.
       if (normalized.empty()) {
         return InvalidArgument(
             "The source of control edge ", edge->DebugString(),
@@ -226,12 +237,17 @@ Status FillFunctionBody(
       }
       node_def->add_input(strings::StrCat("^", normalized));
     }
+
+    // A function is stateful if any of its nodes are stateful.
+    if (node->op_def().is_stateful()) {
+      fdef->mutable_signature()->set_is_stateful(true);
+    }
   }
   return Status::OK();
 }
 
 // Graph to FunctionDef conversion. This code is closely modeled on the Python
-// code in third_party/tensorflow/python/framework/function.py.
+// code in tensorflow/python/framework/function.py.
 Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name,
                           bool append_hash_to_fn_name,
                           const std::vector<const Node*>& body_nodes,
@@ -274,7 +290,7 @@ Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name,
       TF_RETURN_IF_ERROR(node_names.UseOutputName(output_names[i]));
       argdef->set_name(output_names[i]);
     } else {
-      argdef->set_name(node_names.GetIOName(node->name()));
+      argdef->set_name(node_names.GetOutputName(node->name()));
     }
   }
 
@@ -284,7 +300,7 @@ Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name,
     int idx = inputs[i].index;
     OpDef::ArgDef* argdef = fdef->mutable_signature()->add_input_arg();
     argdef->set_type(node->output_type(idx));
-    const string& input_name = node_names.GetIOName(node->name());
+    const string& input_name = node_names.GetInputName(node->name());
     argdef->set_name(input_name);
     tensor_renaming[strings::StrCat(node->name(), ":", idx)] = input_name;
   }
@@ -307,7 +323,7 @@ Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name,
     TF_RETURN_IF_ERROR(
         NameRangesForNode(*node, node->op_def(), nullptr, &output_ranges));
     for (const auto& output : output_ranges) {
-      const string& output_name = output.first;
+      const StringPiece& output_name = output.first;
       int index_start = output.second.first;
       int index_end = output.second.second;
       for (int i = index_start; i < index_end; ++i) {
@@ -462,7 +478,7 @@ Status ComputeBodyNodes(
   return Status::OK();
 }
 
-}  // anonymous namespace
+}  // namespace
 }  // namespace tensorflow
 
 using tensorflow::Node;
@@ -543,6 +559,28 @@ void TF_GraphCopyFunction(TF_Graph* g, const TF_Function* func,
   status->status = g->graph.AddFunctionLibrary(fdef_lib);
 }
 
+int TF_GraphNumFunctions(TF_Graph* g) {
+  tensorflow::mutex_lock l(g->mu);
+  return g->graph.flib_def().num_functions();
+}
+
+int TF_GraphGetFunctions(TF_Graph* g, TF_Function** funcs, int max_func,
+                         TF_Status* status) {
+  tensorflow::FunctionDefLibrary lib;
+  {
+    tensorflow::mutex_lock l(g->mu);
+    lib = g->graph.flib_def().ToProto();
+  }
+  const auto len = std::min(max_func, static_cast<int>(lib.function_size()));
+  for (int i = 0; i < len; ++i) {
+    TF_Function* func = new TF_Function();
+    func->fdef = lib.function(i);
+    funcs[i] = func;
+  }
+  status->status = tensorflow::Status::OK();
+  return len;
+}
+
 void TF_FunctionToFunctionDef(TF_Function* func, TF_Buffer* output_func_def,
                               TF_Status* status) {
   status->status = MessageToBuffer(func->fdef, output_func_def);
diff --git a/tensorflow/c/c_api_function_test.cc b/tensorflow/c/c_api_function_test.cc
index d5580b658992413ae6f9cb79ef88751ee28ce465..7ca50119eafe299b307f06c555aec1388e7e82e2 100644
--- a/tensorflow/c/c_api_function_test.cc
+++ b/tensorflow/c/c_api_function_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
@@ -330,6 +331,11 @@ class CApiFunctionTest : public ::testing::Test {
           << "Failed to find expected edge " << e.ToString()
           << " in fdef: " << fdef.DebugString();
     }
+    for (const EdgeSpec& e : c_edges) {
+      ASSERT_TRUE(a_edges.find(e) != a_edges.end())
+          << "Failed to find expected control edge " << e.ToString()
+          << " in fdef: " << fdef.DebugString();
+    }
 
     // If caller specified all edges, check that we have seen all
     if (is_exact_edges) {
@@ -979,7 +985,7 @@ TEST_F(CApiFunctionTest, ControlDependency) {
   VerifyFDef(
       {"add_0", "scalar"}, M({{"feed1"}, {"feed2"}}), M({{"add"}}),
       {{"feed1", "add_0:0"}, {"feed2", "add_0:1"}, {"add_0:sum:0", "add"}},
-      {{"scalar", "add_0"}});
+      {{"^scalar", "add_0:2"}});
 }
 
 TEST_F(CApiFunctionTest, ControlDependencyOutsideOfBody) {
@@ -1022,12 +1028,17 @@ TEST_F(CApiFunctionTest, ControlDependencyOutsideOfBody_FromInputNode) {
   TF_Operation* add =
       AddWithCtrlDependency(feed1, feed2, func_graph_, feed1, s_);
   EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
-  Define(-1, {}, {feed1, feed2}, {add}, {}, true);
-  EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_));
-  EXPECT_EQ(string("The source of control edge [id=3 feed1:-1 -> add:-1] "
-                   "is not in the body. Encountered while creating "
-                   "function 'MyFunc'"),
-            string(TF_Message(s_)));
+  Define(-1, {}, {feed1, feed2}, {add}, {});
+
+  // Use, run, and verify
+  TF_Operation* two = ScalarConst(2, host_graph_, s_);
+  TF_Operation* func_feed = Placeholder(host_graph_, s_);
+  TF_Operation* func_op = Use({two, func_feed});
+  Run({{func_feed, Int32Tensor(3)}}, func_op, 2 + 3);
+  VerifyFDef(
+      {"add_0"}, M({{"feed1"}, {"feed2"}}), M({{"add"}}),
+      {{"feed1", "add_0:0"}, {"feed2", "add_0:1"}, {"add_0:sum:0", "add"}},
+      {{"^feed1", "add_0:2"}});
 }
 
 TEST_F(CApiFunctionTest, DuplicateInputsAreNotAllowed) {
@@ -1462,7 +1473,11 @@ TEST_F(CApiFunctionTest, AppendHash) {
                  /*append_hash=*/true);
   tensorflow::FunctionDef fdef;
   ASSERT_TRUE(GetFunctionDef(func_, &fdef));
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+  ASSERT_EQ(string("func_name_base_ZpgUD4x8oqk"), fdef.signature().name());
+#else
   ASSERT_EQ(string("func_name_base_qaJ8jA8UmGY"), fdef.signature().name());
+#endif
 }
 
 TEST_F(CApiFunctionTest, GetOpDef) {
@@ -1482,9 +1497,124 @@ TEST_F(CApiFunctionTest, GetOpDef) {
   EXPECT_EQ(op_def.name(), func_name_);
   EXPECT_EQ(op_def.input_arg_size(), 1);
   EXPECT_EQ(op_def.output_arg_size(), 1);
+  EXPECT_FALSE(op_def.is_stateful());
+
+  TF_DeleteBuffer(buffer);
+}
+
+void DefineStatefulFunction(const char* name, TF_Function** func) {
+  std::unique_ptr<TF_Graph, decltype(&TF_DeleteGraph)> func_graph(
+      TF_NewGraph(), TF_DeleteGraph);
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> s(TF_NewStatus(),
+                                                           TF_DeleteStatus);
+
+  TF_Tensor* tensor_shape = Int32Tensor({37, 1});
+  TF_Operation* shape = Const(tensor_shape, func_graph.get(), s.get(), "shape");
+  TF_Operation* random =
+      RandomUniform(shape, TF_FLOAT, func_graph.get(), s.get());
+
+  TF_Output inputs[] = {};
+  TF_Output outputs[] = {{random, 0}};
+  *func = TF_GraphToFunction(func_graph.get(), name, /*append_hash=*/false, -1,
+                             /*opers=*/nullptr, 0, inputs, 1, outputs,
+                             /*output_names=*/nullptr,
+                             /*opts=*/nullptr, "", s.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(s.get())) << TF_Message(s.get());
+  ASSERT_NE(*func, nullptr);
+  TF_DeleteTensor(tensor_shape);
+}
+
+TEST_F(CApiFunctionTest, StatefulOpDef) {
+  DefineStatefulFunction(func_name_, &func_);
+  TF_GraphCopyFunction(host_graph_, func_, nullptr, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+  // Test we can retrieve function OpDef from graph
+  TF_Buffer* buffer = TF_NewBuffer();
+  TF_GraphGetOpDef(host_graph_, func_name_, buffer, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+  // Sanity check returned OpDef
+  string data(static_cast<const char*>(buffer->data), buffer->length);
+  OpDef op_def;
+  op_def.ParseFromString(data);
+  EXPECT_EQ(op_def.name(), func_name_);
+  EXPECT_EQ(op_def.input_arg_size(), 0);
+  EXPECT_EQ(op_def.output_arg_size(), 1);
+  EXPECT_TRUE(op_def.is_stateful());
 
   TF_DeleteBuffer(buffer);
 }
 
+void AssertEqual(TF_Function* f1, TF_Function* f2) {
+  string s1, s2;
+  tensorflow::FunctionDef fdef1, fdef2;
+  ASSERT_TRUE(GetFunctionDef(f1, &fdef1));
+  ASSERT_TRUE(GetFunctionDef(f2, &fdef2));
+  SerializeToStringDeterministic(fdef1, &s1);
+  SerializeToStringDeterministic(fdef2, &s2);
+  ASSERT_EQ(s1, s2);
+}
+
+string GetName(TF_Function* func) {
+  tensorflow::FunctionDef fdef;
+  GetFunctionDef(func, &fdef);
+  return fdef.signature().name();
+}
+
+TEST_F(CApiFunctionTest, GetFunctionsFromGraph) {
+  TF_Function* funcs[2];
+
+  // Get functions from empty graph
+  EXPECT_EQ(TF_GraphNumFunctions(host_graph_), 0);
+  TF_GraphGetFunctions(host_graph_, nullptr, 0, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+  // Define a function and add it to host_graph_
+  TF_Function* func0;
+  DefineFunction("FooFunc0", &func0);
+  TF_GraphCopyFunction(host_graph_, func0, nullptr, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+  // Get this function from host_graph_
+  EXPECT_EQ(TF_GraphNumFunctions(host_graph_), 1);
+  EXPECT_EQ(TF_GraphGetFunctions(host_graph_, funcs, 0, s_), 0);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  EXPECT_EQ(TF_GraphGetFunctions(host_graph_, funcs, 1, s_), 1);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  AssertEqual(func0, funcs[0]);
+  TF_DeleteFunction(funcs[0]);
+  EXPECT_EQ(TF_GraphGetFunctions(host_graph_, funcs, 2, s_), 1);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  AssertEqual(func0, funcs[0]);
+  TF_DeleteFunction(funcs[0]);
+
+  // Define a second function
+  TF_Function* func1;
+  DefineFunction("FooFunc1", &func1);
+  TF_GraphCopyFunction(host_graph_, func1, nullptr, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+  // Get both function from host_graph_
+  EXPECT_EQ(TF_GraphNumFunctions(host_graph_), 2);
+  EXPECT_EQ(TF_GraphGetFunctions(host_graph_, funcs, 0, s_), 0);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  EXPECT_EQ(TF_GraphGetFunctions(host_graph_, funcs, 2, s_), 2);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  if (GetName(funcs[0]) == GetName(func0)) {
+    AssertEqual(func0, funcs[0]);
+    AssertEqual(func1, funcs[1]);
+  } else {
+    AssertEqual(func0, funcs[1]);
+    AssertEqual(func1, funcs[0]);
+  }
+
+  TF_DeleteFunction(funcs[0]);
+  TF_DeleteFunction(funcs[1]);
+
+  TF_DeleteFunction(func0);
+  TF_DeleteFunction(func1);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h
index bb04e01beec931a8ea66d0855eec9625d3a6a5ab..91667056e0eeb224b4b8a034766f11a123cd1a03 100644
--- a/tensorflow/c/c_api_internal.h
+++ b/tensorflow/c/c_api_internal.h
@@ -24,6 +24,9 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#ifndef __ANDROID__
+#include "tensorflow/core/framework/op_gen_lib.h"
+#endif
 #include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -81,12 +84,20 @@ struct TF_Graph {
   std::unordered_map<tensorflow::string, tensorflow::Node*> name_map
       GUARDED_BY(mu);
 
-  // TF_Graph may only / must be deleted when
-  //   num_sessions == 0 && delete_requested == true
-
-  // num_sessions incremented by TF_NewSession, and decremented by
+  // The keys of this map are all the active sessions using this graph.
+  // Each value is the current "runnability" status of the corresponding
+  // session. Under normal conditions all statuses are Status::OK(), but
+  // if some operation is mutated after it was run by a session (this
+  // is detected in RecordMutation function), that session is no longer
+  // safe to run. Its status will contain the error that will be returned
+  // to the user, should she try running this session.
+  //
+  // Sessions are added to this map in TF_NewSession, and removed in
   // TF_DeleteSession.
-  int num_sessions GUARDED_BY(mu);
+  // TF_Graph may only / must be deleted when
+  //   sessions.size() == 0 && delete_requested == true
+  tensorflow::gtl::FlatMap<TF_Session*, tensorflow::Status> sessions
+      GUARDED_BY(mu);
   bool delete_requested GUARDED_BY(mu);  // set true by TF_DeleteGraph
 
   // Used to link graphs contained in TF_WhileParams to the parent graph that
@@ -135,11 +146,11 @@ struct TF_ImportGraphDefOptions {
 struct TF_ImportGraphDefResults {
   std::vector<TF_Output> return_tensors;
   std::vector<TF_Operation*> return_nodes;
-  std::vector<const char*> unused_key_names;
-  std::vector<int> unused_key_indexes;
+  std::vector<const char*> missing_unused_key_names;
+  std::vector<int> missing_unused_key_indexes;
 
-  // Backing memory for unused_key_names values.
-  std::list<tensorflow::string> unused_key_names_data;
+  // Backing memory for missing_unused_key_names values.
+  std::list<tensorflow::string> missing_unused_key_names_data;
 };
 
 struct TF_DeviceList {
@@ -150,6 +161,22 @@ struct TF_Function {
   tensorflow::FunctionDef fdef;
 };
 
+struct TF_ApiDefMap {
+  explicit TF_ApiDefMap(const tensorflow::OpList& op_list)
+      :
+#ifndef __ANDROID__
+        api_def_map(op_list),
+#endif
+        update_docs_called(false) {
+  }
+
+#ifndef __ANDROID__
+  tensorflow::ApiDefMap api_def_map GUARDED_BY(lock);
+#endif
+  bool update_docs_called GUARDED_BY(lock);
+  tensorflow::mutex lock;
+};
+
 namespace tensorflow {
 
 class TensorCApi {
@@ -167,6 +194,24 @@ TF_Tensor* TF_TensorFromTensor(const Tensor& src, TF_Status* status);
 
 Status MessageToBuffer(const tensorflow::protobuf::Message& in, TF_Buffer* out);
 
+// Set the shapes and types of the output's handle.
+//
+// The lengths of the arrays pointed to by `shapes`, `ranks`, and `types` must
+// all be equal to `num_shapes_and_types`. If `ranks[i] != -1`, (i.e., if the
+// rank is known), then it must be equal to the length of `shapes[i]`; if
+// `ranks[i] == 1`, then `shapes[i]` may be nullptr.
+//
+// TODO(akshayka): Implement a corresponding getter method.
+void TF_GraphSetOutputHandleShapesAndTypes(TF_Graph* graph, TF_Output output,
+                                           int num_shapes_and_types,
+                                           const int64_t** shapes,
+                                           const int* ranks,
+                                           const TF_DataType* types,
+                                           TF_Status* status);
+
+void RecordMutation(TF_Graph* graph, const TF_Operation& op,
+                    const char* mutation_type);
+
 }  // end namespace tensorflow
 
 #endif  // TENSORFLOW_C_C_API_INTERNAL_H_
diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index 6ec1db8ccfdb713f330b708e604bd4b502ff7202..028f146be31790b211e546978302e81afe26b231 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/cc/saved_model/tag_constants.h"
 #include "tensorflow/core/example/example.pb.h"
 #include "tensorflow/core/example/feature.pb.h"
+#include "tensorflow/core/framework/api_def.pb.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/graph.pb_text.h"
 #include "tensorflow/core/framework/node_def.pb_text.h"
@@ -56,6 +57,52 @@ static void ExpectHasSubstr(StringPiece s, StringPiece expected) {
       << "'" << s << "' does not contain '" << expected << "'";
 }
 
+// Returns the GPU device name if there is one (with arbitrary tie breaking if
+// there are more than one), or "" otherwise.
+string GPUDeviceName(TF_Session* session) {
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
+      TF_NewStatus(), TF_DeleteStatus);
+  TF_Status* s = status.get();
+  std::unique_ptr<TF_DeviceList, decltype(&TF_DeleteDeviceList)> list(
+      TF_SessionListDevices(session, s), TF_DeleteDeviceList);
+  TF_DeviceList* device_list = list.get();
+
+  CHECK_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  const int num_devices = TF_DeviceListCount(device_list);
+  LOG(INFO) << "There are " << num_devices << " devices.";
+  for (int i = 0; i < num_devices; ++i) {
+    const char* device_name = TF_DeviceListName(device_list, i, s);
+    CHECK_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+    const char* device_type = TF_DeviceListType(device_list, i, s);
+    CHECK_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+    LOG(INFO) << "Device " << i << " has name " << device_name << ", type "
+              << device_type;
+    if (string(device_type) == DEVICE_GPU) {
+      return device_name;
+    }
+  }
+  // No GPU device found.
+  return "";
+}
+
+string GPUDeviceName() {
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
+      TF_NewStatus(), TF_DeleteStatus);
+  TF_Status* s = status.get();
+  std::unique_ptr<TF_Graph, decltype(&TF_DeleteGraph)> graph(TF_NewGraph(),
+                                                             TF_DeleteGraph);
+
+  TF_SessionOptions* opts = TF_NewSessionOptions();
+  TF_Session* sess = TF_NewSession(graph.get(), opts, s);
+  TF_DeleteSessionOptions(opts);
+
+  const string gpu_device_name = GPUDeviceName(sess);
+  TF_DeleteSession(sess, s);
+  CHECK_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  return gpu_device_name;
+}
+
 TEST(CAPI, Version) { EXPECT_STRNE("", TF_Version()); }
 
 TEST(CAPI, Status) {
@@ -93,6 +140,17 @@ TEST(CAPI, Tensor) {
   EXPECT_TRUE(deallocator_called);
 }
 
+void NoOpDeallocator(void* data, size_t, void*) {}
+
+TEST(CAPI, MalformedTensor) {
+  // See https://github.com/tensorflow/tensorflow/issues/7394
+  // num_dims = 0 implies a scalar, so should be backed by at least 4 bytes of
+  // data.
+  TF_Tensor* t =
+      TF_NewTensor(TF_FLOAT, nullptr, 0, nullptr, 0, &NoOpDeallocator, nullptr);
+  ASSERT_TRUE(t == nullptr);
+}
+
 TEST(CAPI, AllocateTensor) {
   const int num_bytes = 6 * sizeof(float);
   int64_t dims[] = {2, 3};
@@ -122,6 +180,10 @@ TEST(CAPI, MaybeMove) {
 }
 
 TEST(CAPI, LibraryLoadFunctions) {
+  // TODO(b/73318067): Fix linking for the GPU test generated by the
+  // tf_cuda_cc_test() bazel rule and remove the next line.
+  if (!GPUDeviceName().empty()) return;
+
   // Load the library.
   TF_Status* status = TF_NewStatus();
   TF_Library* lib =
@@ -574,7 +636,7 @@ TEST(CAPI, ImportGraphDef) {
   TF_Status* s = TF_NewStatus();
   TF_Graph* graph = TF_NewGraph();
 
-  // Create a graph with two nodes: x and 3
+  // Create a simple graph.
   Placeholder(graph, s);
   ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
   ASSERT_TRUE(TF_GraphOperationByName(graph, "feed") != nullptr);
@@ -585,7 +647,7 @@ TEST(CAPI, ImportGraphDef) {
   ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
   ASSERT_TRUE(TF_GraphOperationByName(graph, "neg") != nullptr);
 
-  // Export to a GraphDef
+  // Export to a GraphDef.
   TF_Buffer* graph_def = TF_NewBuffer();
   TF_GraphToGraphDef(graph, graph_def, s);
   ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
@@ -605,6 +667,31 @@ TEST(CAPI, ImportGraphDef) {
   ASSERT_TRUE(feed != nullptr);
   ASSERT_TRUE(neg != nullptr);
 
+  // Test basic structure of the imported graph.
+  EXPECT_EQ(0, TF_OperationNumInputs(scalar));
+  EXPECT_EQ(0, TF_OperationNumInputs(feed));
+  ASSERT_EQ(1, TF_OperationNumInputs(neg));
+  TF_Output neg_input = TF_OperationInput({neg, 0});
+  EXPECT_EQ(scalar, neg_input.oper);
+  EXPECT_EQ(0, neg_input.index);
+
+  // Test that we can't see control edges involving the source and sink nodes.
+  TF_Operation* control_ops[100];
+  EXPECT_EQ(0, TF_OperationNumControlInputs(scalar));
+  EXPECT_EQ(0, TF_OperationGetControlInputs(scalar, control_ops, 100));
+  EXPECT_EQ(0, TF_OperationNumControlOutputs(scalar));
+  EXPECT_EQ(0, TF_OperationGetControlOutputs(scalar, control_ops, 100));
+
+  EXPECT_EQ(0, TF_OperationNumControlInputs(feed));
+  EXPECT_EQ(0, TF_OperationGetControlInputs(feed, control_ops, 100));
+  EXPECT_EQ(0, TF_OperationNumControlOutputs(feed));
+  EXPECT_EQ(0, TF_OperationGetControlOutputs(feed, control_ops, 100));
+
+  EXPECT_EQ(0, TF_OperationNumControlInputs(neg));
+  EXPECT_EQ(0, TF_OperationGetControlInputs(neg, control_ops, 100));
+  EXPECT_EQ(0, TF_OperationNumControlOutputs(neg));
+  EXPECT_EQ(0, TF_OperationGetControlOutputs(neg, control_ops, 100));
+
   // Import it again, with an input mapping, return outputs, and a return
   // operation, into the same graph.
   TF_DeleteImportGraphDefOptions(opts);
@@ -628,7 +715,7 @@ TEST(CAPI, ImportGraphDef) {
   ASSERT_TRUE(neg2 != nullptr);
 
   // Check input mapping
-  TF_Output neg_input = TF_OperationInput({neg, 0});
+  neg_input = TF_OperationInput({neg, 0});
   EXPECT_EQ(scalar, neg_input.oper);
   EXPECT_EQ(0, neg_input.index);
 
@@ -773,7 +860,7 @@ TEST(CAPI, ImportGraphDef_WithReturnOutputs) {
   TF_DeleteStatus(s);
 }
 
-TEST(CAPI, ImportGraphDef_UnusedInputMappings) {
+TEST(CAPI, ImportGraphDef_MissingUnusedInputMappings) {
   TF_Status* s = TF_NewStatus();
   TF_Graph* graph = TF_NewGraph();
 
@@ -816,7 +903,7 @@ TEST(CAPI, ImportGraphDef_UnusedInputMappings) {
   int num_unused_input_mappings;
   const char** src_names;
   int* src_indexes;
-  TF_ImportGraphDefResultsUnusedInputMappings(
+  TF_ImportGraphDefResultsMissingUnusedInputMappings(
       results, &num_unused_input_mappings, &src_names, &src_indexes);
   ASSERT_EQ(1, num_unused_input_mappings);
   EXPECT_EQ(string("fake"), string(src_names[0]));
@@ -886,6 +973,70 @@ TEST(CAPI, Session) {
   TF_DeleteStatus(s);
 }
 
+// If `device` is non-empty, run Min op on that device.
+// Otherwise run it on the default device (CPU).
+void RunMinTest(const string& device, bool use_XLA) {
+  TF_Status* s = TF_NewStatus();
+  TF_Graph* graph = TF_NewGraph();
+
+  // Make a placeholder operation.
+  TF_Operation* feed = Placeholder(graph, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  // Make a constant operation with the scalar "0", for axis.
+  TF_Operation* one = ScalarConst(0, graph, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  // Create a session for this graph.
+  CSession csession(graph, s, use_XLA);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  if (!device.empty()) {
+    LOG(INFO) << "Setting op Min on device " << device;
+  }
+  TF_Operation* min = MinWithDevice(feed, one, graph, device, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  // Run the graph.
+  csession.SetInputs({{feed, Int32Tensor({3, 2, 5})}});
+  csession.SetOutputs({min});
+  csession.Run(s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_Tensor* out = csession.output_tensor(0);
+  ASSERT_TRUE(out != nullptr);
+  EXPECT_EQ(TF_INT32, TF_TensorType(out));
+  EXPECT_EQ(0, TF_NumDims(out));  // scalar
+  ASSERT_EQ(sizeof(int32), TF_TensorByteSize(out));
+  int32* output_contents = static_cast<int32*>(TF_TensorData(out));
+  EXPECT_EQ(2, *output_contents);
+
+  // Clean up
+  csession.CloseAndDelete(s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_DeleteGraph(graph);
+  TF_DeleteStatus(s);
+}
+
+TEST(CAPI, Session_Min_CPU) { RunMinTest(/*device=*/"", /*use_XLA=*/false); }
+
+TEST(CAPI, Session_Min_XLA_CPU) { RunMinTest(/*device=*/"", /*use_XLA=*/true); }
+
+TEST(CAPI, Session_Min_GPU) {
+  const string gpu_device = GPUDeviceName();
+  // Skip this test if no GPU is available.
+  if (gpu_device.empty()) return;
+
+  RunMinTest(gpu_device, /*use_XLA=*/false);
+}
+
+TEST(CAPI, Session_Min_XLA_GPU) {
+  const string gpu_device = GPUDeviceName();
+  // Skip this test if no GPU is available.
+  if (gpu_device.empty()) return;
+
+  RunMinTest(gpu_device, /*use_XLA=*/true);
+}
+
 TEST(CAPI, SessionPRun) {
   TF_Status* s = TF_NewStatus();
   TF_Graph* graph = TF_NewGraph();
@@ -1930,7 +2081,7 @@ TEST_F(CApiAttributesTest, Tensor) {
 }
 
 TEST_F(CApiAttributesTest, StringTensor) {
-  // Create the string-Tensor "atttribute" value.
+  // Create the string-Tensor "attribute" value.
   char encoded[] = {
       0,   0, 0, 0, 0, 0, 0, 0,  // array[uint64] offsets
       1,                         // varint encoded string length
@@ -2027,6 +2178,85 @@ TEST_F(CApiAttributesTest, Errors) {
   EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)) << TF_Message(s_);
 }
 
+TEST(TestApiDef, TestCreateApiDef) {
+  // TODO(b/73318067): Fix linking for the GPU test generated by the
+  // tf_cuda_cc_test() bazel rule and remove the next line.
+  if (!GPUDeviceName().empty()) return;
+
+  TF_Status* status = TF_NewStatus();
+  TF_Library* lib =
+      TF_LoadLibrary("tensorflow/c/test_op.so", status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeleteStatus(status);
+
+  TF_Buffer op_list_buf = TF_GetOpList(lib);
+  status = TF_NewStatus();
+  auto* api_def_map = TF_NewApiDefMap(&op_list_buf, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeleteStatus(status);
+
+  string op_name = "TestCApi";
+  status = TF_NewStatus();
+  auto* api_def_buf =
+      TF_ApiDefMapGet(api_def_map, op_name.c_str(), op_name.size(), status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeleteStatus(status);
+
+  tensorflow::ApiDef api_def;
+  EXPECT_TRUE(api_def.ParseFromArray(api_def_buf->data, api_def_buf->length));
+  EXPECT_EQ(op_name, api_def.graph_op_name());
+  EXPECT_EQ(R"doc(Used to test C API)doc", api_def.summary());
+
+  TF_DeleteBuffer(api_def_buf);
+  TF_DeleteApiDefMap(api_def_map);
+  TF_DeleteLibraryHandle(lib);
+}
+
+TEST(TestApiDef, TestCreateApiDefWithOverwrites) {
+  // TODO(b/73318067): Fix linking for the GPU test generated by the
+  // tf_cuda_cc_test() bazel rule and remove the next line.
+  if (!GPUDeviceName().empty()) return;
+
+  TF_Status* status = TF_NewStatus();
+  TF_Library* lib =
+      TF_LoadLibrary("tensorflow/c/test_op.so", status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeleteStatus(status);
+
+  TF_Buffer op_list_buf = TF_GetOpList(lib);
+  status = TF_NewStatus();
+  auto* api_def_map = TF_NewApiDefMap(&op_list_buf, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeleteStatus(status);
+
+  string api_def_overwrites = R"(op: <
+  graph_op_name: "TestCApi"
+  summary: "New summary"
+>
+)";
+  status = TF_NewStatus();
+  TF_ApiDefMapPut(api_def_map, api_def_overwrites.c_str(),
+                  api_def_overwrites.size(), status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeleteStatus(status);
+
+  string op_name = "TestCApi";
+  status = TF_NewStatus();
+  auto* api_def_buf =
+      TF_ApiDefMapGet(api_def_map, op_name.c_str(), op_name.size(), status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeleteStatus(status);
+
+  tensorflow::ApiDef api_def;
+  EXPECT_TRUE(api_def.ParseFromArray(api_def_buf->data, api_def_buf->length));
+  EXPECT_EQ(op_name, api_def.graph_op_name());
+  EXPECT_EQ("New summary", api_def.summary());
+
+  TF_DeleteBuffer(api_def_buf);
+  TF_DeleteApiDefMap(api_def_map);
+  TF_DeleteLibraryHandle(lib);
+}
+
 #undef EXPECT_TF_META
 
 }  // namespace
diff --git a/tensorflow/c/c_test_util.cc b/tensorflow/c/c_test_util.cc
index c291a2e440a8515e968b0ce0395b289080f04e8b..a55af46ae2baef1cd4f55f478ec234551f370503 100644
--- a/tensorflow/c/c_test_util.cc
+++ b/tensorflow/c/c_test_util.cc
@@ -15,11 +15,13 @@ limitations under the License.
 
 #include "tensorflow/c/c_test_util.h"
 
+#include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/session_options.h"
 
 using tensorflow::GraphDef;
 using tensorflow::NodeDef;
@@ -124,8 +126,9 @@ TF_Operation* ScalarConst(double v, TF_Graph* graph, TF_Status* s,
   return Const(tensor.get(), graph, s, name);
 }
 
-void AddHelper(TF_Operation* l, TF_Operation* r, TF_Graph* graph, TF_Status* s,
-               const char* name, TF_Operation** op, bool check) {
+void AddOpHelper(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
+                 TF_Status* s, const char* name, TF_Operation** op,
+                 bool check) {
   TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name);
   TF_Output add_inputs[2] = {{l, 0}, {r, 0}};
   TF_AddInputList(desc, add_inputs, 2);
@@ -139,14 +142,14 @@ void AddHelper(TF_Operation* l, TF_Operation* r, TF_Graph* graph, TF_Status* s,
 TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
                   TF_Status* s, const char* name) {
   TF_Operation* op;
-  AddHelper(l, r, graph, s, name, &op, true);
+  AddOpHelper(l, r, graph, s, name, &op, true);
   return op;
 }
 
 TF_Operation* AddNoCheck(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
                          TF_Status* s, const char* name) {
   TF_Operation* op;
-  AddHelper(l, r, graph, s, name, &op, false);
+  AddOpHelper(l, r, graph, s, name, &op, false);
   return op;
 }
 
@@ -160,6 +163,36 @@ TF_Operation* AddWithCtrlDependency(TF_Operation* l, TF_Operation* r,
   return TF_FinishOperation(desc, s);
 }
 
+// If `op_device` is non-empty, set the created op on that device.
+void BinaryOpHelper(const char* op_name, TF_Operation* l, TF_Operation* r,
+                    TF_Graph* graph, TF_Status* s, const char* name,
+                    TF_Operation** op, const string& op_device, bool check) {
+  TF_OperationDescription* desc = TF_NewOperation(graph, op_name, name);
+  if (!op_device.empty()) {
+    TF_SetDevice(desc, op_device.c_str());
+  }
+  TF_AddInput(desc, {l, 0});
+  TF_AddInput(desc, {r, 0});
+  *op = TF_FinishOperation(desc, s);
+  if (check) {
+    ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+    ASSERT_NE(*op, nullptr);
+  }
+}
+
+TF_Operation* MinWithDevice(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
+                            const string& op_device, TF_Status* s,
+                            const char* name) {
+  TF_Operation* op;
+  BinaryOpHelper("Min", l, r, graph, s, name, &op, op_device, true);
+  return op;
+}
+
+TF_Operation* Min(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
+                  TF_Status* s, const char* name) {
+  return MinWithDevice(l, r, graph, /*op_device=*/"", s, name);
+}
+
 TF_Operation* Add(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s,
                   const char* name) {
   TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name);
@@ -193,6 +226,15 @@ TF_Operation* LessThan(TF_Output l, TF_Output r, TF_Graph* graph,
   return TF_FinishOperation(desc, s);
 }
 
+TF_Operation* RandomUniform(TF_Operation* shape, TF_DataType dtype,
+                            TF_Graph* graph, TF_Status* s) {
+  TF_OperationDescription* desc =
+      TF_NewOperation(graph, "RandomUniform", "random_uniform");
+  TF_AddInput(desc, {shape, 0});
+  TF_SetAttrType(desc, "dtype", dtype);
+  return TF_FinishOperation(desc, s);
+}
+
 void Split3Helper(TF_Operation* input, TF_Graph* graph, TF_Status* s,
                   const char* name, TF_Operation** op) {
   TF_Operation* zero = ScalarConst(
@@ -360,8 +402,21 @@ std::vector<string> GetFuncNames(const tensorflow::GraphDef& graph_def) {
   return names;
 }
 
-CSession::CSession(TF_Graph* graph, TF_Status* s) {
+CSession::CSession(TF_Graph* graph, TF_Status* s, bool use_XLA) {
   TF_SessionOptions* opts = TF_NewSessionOptions();
+  tensorflow::legacy_flags::MarkForCompilationPassFlags* flags =
+      tensorflow::legacy_flags::GetMarkForCompilationPassFlags();
+  flags->tf_xla_cpu_global_jit = use_XLA;
+  if (use_XLA) {
+    tensorflow::ConfigProto config;
+    config.mutable_graph_options()
+        ->mutable_optimizer_options()
+        ->set_global_jit_level(tensorflow::OptimizerOptions::ON_1);
+    std::string contents;
+    contents.resize(config.ByteSizeLong());
+    config.SerializeToArray(&contents[0], contents.size());
+    TF_SetConfig(opts, contents.data(), contents.size(), s);
+  }
   session_ = TF_NewSession(graph, opts, s);
   TF_DeleteSessionOptions(opts);
 }
diff --git a/tensorflow/c/c_test_util.h b/tensorflow/c/c_test_util.h
index d54733749248fa32c39d88bb0281d329dd50c7bd..2a70177c724c569844a5d8ad42b99bed20209946 100644
--- a/tensorflow/c/c_test_util.h
+++ b/tensorflow/c/c_test_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_C_C_TEST_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_C_C_TEST_UTIL_H_
+#ifndef TENSORFLOW_C_C_TEST_UTIL_H_
+#define TENSORFLOW_C_C_TEST_UTIL_H_
 
 #include "tensorflow/c/c_api.h"
 
@@ -69,12 +69,23 @@ TF_Operation* AddWithCtrlDependency(TF_Operation* l, TF_Operation* r,
 TF_Operation* Add(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s,
                   const char* name = "add");
 
+TF_Operation* Min(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
+                  TF_Status* s, const char* name = "min");
+
+// If `op_device` is non-empty, set the created op on that device.
+TF_Operation* MinWithDevice(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
+                            const string& op_device, TF_Status* s,
+                            const char* name = "min");
+
 TF_Operation* Neg(TF_Operation* n, TF_Graph* graph, TF_Status* s,
                   const char* name = "neg");
 
 TF_Operation* LessThan(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s);
 
-// Split `input` along the first dimention into 3 tensors
+TF_Operation* RandomUniform(TF_Operation* shape, TF_DataType dtype,
+                            TF_Graph* graph, TF_Status* s);
+
+// Split `input` along the first dimension into 3 tensors
 TF_Operation* Split3(TF_Operation* input, TF_Graph* graph, TF_Status* s,
                      const char* name = "split3");
 
@@ -105,7 +116,7 @@ std::vector<string> GetFuncNames(const tensorflow::GraphDef& graph_def);
 
 class CSession {
  public:
-  CSession(TF_Graph* graph, TF_Status* s);
+  CSession(TF_Graph* graph, TF_Status* s, bool use_XLA = false);
   explicit CSession(TF_Session* session);
 
   ~CSession();
@@ -121,6 +132,8 @@ class CSession {
 
   TF_Tensor* output_tensor(int i) { return output_values_[i]; }
 
+  TF_Session* mutable_session() { return session_; }
+
  private:
   void DeleteInputValues();
   void ResetOutputValues();
@@ -133,4 +146,4 @@ class CSession {
   std::vector<TF_Operation*> targets_;
 };
 
-#endif  // THIRD_PARTY_TENSORFLOW_C_C_TEST_UTIL_H_
+#endif  // TENSORFLOW_C_C_TEST_UTIL_H_
diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index d533758e360bc44a6f52f57eaae5b222e0482860..e55cb672e97e1403a3dd864c91c176426eb3f067 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -6,6 +6,7 @@ load(
     "tf_cuda_cc_test",
     "tf_cc_test",
     "tf_copts",
+    "tfe_xla_copts",
     "tf_cuda_library",
 )
 
@@ -16,7 +17,7 @@ tf_cuda_library(
         "c_api_internal.h",
     ],
     hdrs = ["c_api.h"],
-    copts = tf_copts(),
+    copts = tf_copts() + tfe_xla_copts(),
     visibility = ["//visibility:public"],
     deps = select({
         "//tensorflow:android": [
@@ -33,7 +34,15 @@ tf_cuda_library(
             "//tensorflow/core:lib_internal",
             "//tensorflow/core:protos_all_cc",
         ],
-    }),
+    }) + select({
+        "//tensorflow:with_xla_support": [
+            "//tensorflow/compiler/tf2xla:xla_compiler",
+            "//tensorflow/compiler/jit",
+        ],
+        "//conditions:default": [],
+    }) + [
+        "//tensorflow/core:gpu_runtime",
+    ],
 )
 
 tf_cuda_library(
@@ -46,6 +55,7 @@ tf_cuda_library(
         "//tensorflow/c:c_api",
         "//tensorflow/c:c_api_internal",
         "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:framework_lite",
         "//tensorflow/core:lib_internal",
@@ -55,8 +65,14 @@ tf_cuda_library(
 tf_cuda_cc_test(
     name = "c_api_test",
     srcs = ["c_api_test.cc"],
+    extra_copts = tfe_xla_copts(),
+    tags = [
+        "guitar",
+        "multi_gpu",
+    ],
     deps = [
         ":c_api",
+        "//tensorflow/c:c_test_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
@@ -113,3 +129,9 @@ cc_library(
         "//tensorflow/core:lib",
     ],
 )
+
+filegroup(
+    name = "headers",
+    srcs = ["c_api.h"],
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 706c89536db019c7f7389af576815746b2425520..8e834eb99c13d1f26da9f0860897267efc2fd01c 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -25,6 +25,10 @@ limitations under the License.
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/eager/c_api_internal.h"
 #include "tensorflow/c/eager/runtime.h"
+#ifdef TENSORFLOW_EAGER_USE_XLA
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#endif  // TENSORFLOW_EAGER_USE_XLA
+#include "tensorflow/core/common_runtime/copy_tensor.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/function.h"
@@ -33,6 +37,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -43,13 +48,23 @@ using tensorflow::int64;
 using tensorflow::string;
 
 namespace {
-bool IsCPU(tensorflow::Device* d) {
+bool IsCPU(const tensorflow::Device* d) {
   return d == nullptr || d->tensorflow_gpu_device_info() == nullptr;
 }
 
-string DeviceName(tensorflow::Device* d) {
+bool IsXLA(const tensorflow::Device* d) {
+  if (d == nullptr) return false;
+  const auto& device_type = d->attributes().device_type();
+  return device_type.find("XLA") != std::string::npos;
+}
+
+string DeviceName(const tensorflow::Device* d) {
   return (d == nullptr) ? "cpu:0" : d->name();
 }
+
+#ifdef TENSORFLOW_EAGER_USE_XLA
+std::atomic_int_fast64_t func_id_generator(0);
+#endif  // TENSORFLOW_EAGER_USE_XLA
 }  // namespace
 
 extern "C" {
@@ -84,20 +99,15 @@ TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) {
     return nullptr;
   }
 
-  TFE_Context* ret = new TFE_Context(session);
-  ret->policy = opts->policy;
-  ret->pflr.reset(new tensorflow::ProcessFunctionLibraryRuntime(
-      ret->session->device_mgr, opts->session_options.options.env,
-      TF_GRAPH_DEF_VERSION, &ret->func_lib_def, {}));
-  ret->rendezvous =
-      new tensorflow::IntraProcessRendezvous(ret->session->device_mgr);
-
-  return ret;
+  return new TFE_Context(*opts, session);
 }
 
 void TFE_DeleteContext(TFE_Context* ctx, TF_Status* status) {
   status->status = tensorflow::Status::OK();
-  tensorflow::gtl::STLDeleteValues(&ctx->kernel_cache);
+  {
+    tensorflow::mutex_lock ml(ctx->cache_mu);
+    tensorflow::gtl::STLDeleteValues(&ctx->kernel_cache);
+  }
   TF_Graph* graph = ctx->session->graph;
   TF_DeleteSession(ctx->session, status);
   TF_DeleteGraph(graph);
@@ -109,6 +119,28 @@ TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx, TF_Status* status) {
   return TF_SessionListDevices(ctx->session, status);
 }
 
+void TFE_ContextClearCaches(TFE_Context* ctx) {
+  tensorflow::mutex_lock ml(ctx->cache_mu);
+  tensorflow::gtl::STLDeleteValues(&ctx->kernel_cache);
+}
+
+void TFE_ContextSetThreadLocalDevicePlacementPolicy(
+    TFE_Context* ctx, TFE_ContextDevicePlacementPolicy policy) {
+  tensorflow::mutex_lock ml(ctx->policy_map_mu);
+  ctx->thread_local_policies[std::this_thread::get_id()] = policy;
+}
+
+extern TFE_ContextDevicePlacementPolicy TFE_ContextGetDevicePlacementPolicy(
+    TFE_Context* ctx) {
+  tensorflow::mutex_lock ml(ctx->policy_map_mu);
+  auto policy_map_it =
+      ctx->thread_local_policies.find(std::this_thread::get_id());
+  if (policy_map_it != ctx->thread_local_policies.end()) {
+    return policy_map_it->second;
+  }
+  return ctx->policy;
+}
+
 TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, TF_Status* status) {
   tensorflow::Tensor tensor;
   status->status = tensorflow::TF_TensorToTensor(t, &tensor);
@@ -164,23 +196,16 @@ TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h,
   bool is_same_device =
       (srcd == dstd) || (DeviceName(srcd) == DeviceName(dstd));
   const bool dst_cpu = IsCPU(dstd);
-  if (is_same_device) {
-    return new TFE_TensorHandle(h->t, dst_cpu ? nullptr : dstd);
-  }
   const bool src_cpu = IsCPU(srcd);
-  if (src_cpu == dst_cpu) {
-    TF_SetStatus(
-        status, TF_INVALID_ARGUMENT,
-        tensorflow::strings::StrCat(
-            "TFE_TensorHandleCopyToDevice requires either the source "
-            "TFE_TensorHandle be on or the destination device be on CPU "
-            "or be the same (they are ",
-            DeviceName(srcd), " and ", DeviceName(dstd), " in this call)")
-            .c_str());
-    return nullptr;
+  // both_on_cpu can be true and yet is_same_device is false, if one of src/dst
+  // has device type XLA_CPU, and the other CPU.
+  const bool both_on_cpu = src_cpu && dst_cpu;
+  if (is_same_device || both_on_cpu) {
+    return new TFE_TensorHandle(h->t, dst_cpu ? nullptr : dstd);
   }
   tensorflow::Tensor* src = &(h->t);
-  if (!dst_cpu && !tensorflow::DataTypeCanUseMemcpy(src->dtype())) {
+  if (!dst_cpu && (src->dtype() != tensorflow::DT_VARIANT &&
+                   !tensorflow::DataTypeCanUseMemcpy(src->dtype()))) {
     TF_SetStatus(
         status, TF_INVALID_ARGUMENT,
         tensorflow::strings::StrCat("Can't copy Tensor with type ",
@@ -189,26 +214,22 @@ TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h,
             .c_str());
     return nullptr;
   }
-  if (src_cpu) {
-    tensorflow::Tensor dst(
-        dstd->GetAllocator(tensorflow::AllocatorAttributes()), src->dtype(),
-        src->shape());
-    if (src->shape().num_elements() == 0) {
-      return new TFE_TensorHandle(dst, dstd);
-    }
-    tensorflow::Notification n;
-    dstd->tensorflow_gpu_device_info()->default_context->CopyCPUTensorToDevice(
-        src, dstd, &dst, [status, &n](const tensorflow::Status& s) {
-          status->status = s;
-          n.Notify();
-        });
-    n.WaitForNotification();
-    return (TF_GetCode(status) == TF_OK) ? new TFE_TensorHandle(dst, dstd)
-                                         : nullptr;
-  }
-  CHECK(dst_cpu);
-  tensorflow::Tensor dst(src->dtype(), src->shape());
-  tensorflow::Notification n;
+  tensorflow::AllocatorAttributes attr;
+  if (src->dtype() == tensorflow::DT_VARIANT) {
+    attr.set_on_host(true);
+  }
+  tensorflow::Tensor dst(dstd->GetAllocator(attr), src->dtype(), src->shape());
+  if (src->shape().num_elements() == 0) {
+    return new TFE_TensorHandle(dst, dst_cpu ? nullptr : dstd);
+  }
+  tensorflow::DeviceContext* src_device_context = nullptr;
+  if (!src_cpu) {
+    src_device_context = srcd->tensorflow_gpu_device_info()->default_context;
+  }
+  tensorflow::DeviceContext* dst_device_context = nullptr;
+  if (!dst_cpu) {
+    dst_device_context = dstd->tensorflow_gpu_device_info()->default_context;
+  }
   // TODO(ashankar): The Sync() call below may be more aggressive than
   // necessary. It is based on knowledge of implementation details - that
   // GPU devices are implemented using 3 streams - one for host->device copies,
@@ -217,16 +238,18 @@ TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h,
   // but more than necessary (since it waits for operations that might have
   // nothing to do with this tensor to complete).
   status->status = srcd->Sync();
-  if (!status->status.ok()) return nullptr;
-  srcd->tensorflow_gpu_device_info()->default_context->CopyDeviceTensorToCPU(
-      src, "IGNORE_MY_TENSOR_NAME", srcd, &dst,
-      [status, &n](const tensorflow::Status& s) {
-        status->status = s;
-        n.Notify();
-      });
+  tensorflow::Notification n;
+  tensorflow::CopyTensor::ViaDMA("copy", src_device_context, dst_device_context,
+                                 srcd, dstd, tensorflow::AllocatorAttributes(),
+                                 tensorflow::AllocatorAttributes(), src, &dst,
+                                 [status, &n](const tensorflow::Status& s) {
+                                   status->status = s;
+                                   n.Notify();
+                                 });
   n.WaitForNotification();
-  return (TF_GetCode(status) == TF_OK) ? new TFE_TensorHandle(dst, nullptr)
-                                       : nullptr;
+  return (TF_GetCode(status) == TF_OK)
+             ? new TFE_TensorHandle(dst, dst_cpu ? nullptr : dstd)
+             : nullptr;
 }
 
 TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name,
@@ -247,15 +270,6 @@ TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name,
 
 void TFE_DeleteOp(TFE_Op* op) { delete op; }
 
-static void TFE_OpSetDeviceHelper(TFE_Op* op, tensorflow::Device* device,
-                                  TF_Status* status) {
-  // Questionable heuristic: Place the op on the same device as the first input
-  // placed outside of host memory?
-  if (IsCPU(op->device) && !IsCPU(device)) {
-    op->device = device;
-  }
-}
-
 void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status) {
   tensorflow::Device* d = nullptr;
   if (device_name != nullptr && strlen(device_name) > 0) {
@@ -263,11 +277,32 @@ void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status) {
         op->ctx->session->device_mgr->LookupDevice(device_name, &d);
     if (!status->status.ok()) return;
   }
-  TFE_OpSetDeviceHelper(op, d, status);
+  op->device = d;
+}
+
+const char* TFE_OpGetDevice(TFE_Op* op, TF_Status* status) {
+  tensorflow::Device* device =
+      (op->device == nullptr) ? op->ctx->devices()[0] : op->device;
+  return device->name().c_str();
+}
+
+void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) {
+  op->use_xla = enable;
+#ifndef TENSORFLOW_EAGER_USE_XLA
+  LOG(WARNING) << "This call is a no-op, as the TensorFlow library is not "
+                  "built with XLA support.";
+#endif  // TENSORFLOW_EAGER_USE_XLA
 }
 
 void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) {
-  TFE_OpSetDeviceHelper(op, h->d, status);
+  // Questionable heuristic ...
+  //
+  // Motivation: After an 'op' is placed on GPU because some of its earlier
+  // inputs are on GPU, we want to keep the 'op' there, even if some later
+  // inputs of it are not on GPU.
+  if (IsCPU(op->device) && !IsCPU(h->d)) {
+    op->device = h->d;
+  }
   if (!status->status.ok()) return;
   op->inputs.push_back(h->t);
   op->input_devices.push_back(h->d);
@@ -284,7 +319,7 @@ TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name,
     return TF_ATTR_INT;  // The compiler requires that we return something.
   }
   status->status =
-      tensorflow::AttrTypeByName(op->attr_types, attr_name, &ret, is_list);
+      tensorflow::AttrTypeByName(*op->attr_types, attr_name, &ret, is_list);
   return ret;
 }
 
@@ -420,6 +455,19 @@ void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name,
                     proto.get(), num_values));
 }
 
+void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name,
+                               const TFE_Op** value, int num_values) {
+  std::unique_ptr<tensorflow::NameAttrList[]> funcs(
+      new tensorflow::NameAttrList[num_values]);
+  for (int i = 0; i < num_values; i++) {
+    funcs[i].set_name(value[i]->name);
+    value[i]->attrs.FillAttrValueMap(funcs[i].mutable_attr());
+  }
+  op->attrs.Set(attr_name,
+                tensorflow::gtl::ArraySlice<const tensorflow::NameAttrList>(
+                    funcs.get(), num_values));
+}
+
 namespace {
 
 tensorflow::Status ValidateInputTypeAndPlacement(
@@ -438,10 +486,17 @@ tensorflow::Status ValidateInputTypeAndPlacement(
     const tensorflow::Device* actual_device =
         op->input_devices[i] == nullptr ? host_device : op->input_devices[i];
     if (expected_device != actual_device) {
-      switch (ctx->policy) {
-        case TFE_DEVICE_PLACEMENT_EXPLICIT:
+      switch (TFE_ContextGetDevicePlacementPolicy(ctx)) {
+        case TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32:
           // TODO(xpan): See if we could bubble python related error up
           // to python level.
+          if (op->inputs[i].dtype() == tensorflow::DT_INT32) {
+            // Note: enabling silent copies of int32 tensors to match behavior
+            // of graph mode.
+            break;
+          }
+          TF_FALLTHROUGH_INTENDED;
+        case TFE_DEVICE_PLACEMENT_EXPLICIT:
           return tensorflow::errors::InvalidArgument(
               "Tensors on conflicting devices:"
               " cannot compute ",
@@ -494,6 +549,228 @@ tensorflow::Status ValidateInputTypeAndPlacement(
   }
   return tensorflow::Status::OK();
 }
+
+#ifdef TENSORFLOW_EAGER_USE_XLA
+// Synthesizes and returns a wrapper function over `op`, which must be a
+// primitive op (e.g. matmul).
+//
+// The wrapper function conforms to the function signature expected by
+// _XlaLaunchOp, with input params ordered by <constants, (variable) args and
+// resources>. For example, if the op has input params <Const1, Arg2, Const3,
+// Resource4, Arg5>, they will be reordered to <Const1, Const3, Arg2, Arg5,
+// Resource4> as the input params to the synthesized function.
+//
+// It populates `const_input_types`, `arg_input_types` and
+// `op_input_to_func_input` based on the reordering results, that the caller can
+// use them to build an _XlaLaunchOp. On error, it returns NULL, and sets
+// `status` accordingly.
+const tensorflow::FunctionDef* OpToFunction(
+    TFE_Op* op, std::vector<TF_DataType>* const_input_types,
+    std::vector<TF_DataType>* arg_input_types,
+    tensorflow::gtl::FlatMap<int, int>* op_input_to_func_input,
+    TF_Status* status) {
+  DCHECK(!op->is_function());
+
+  tensorflow::FunctionDef fdef;
+
+  // Get the OpDef of the op we are trying to encapsulate.
+  TFE_Context* ctx = op->ctx;
+  const tensorflow::OpRegistrationData* op_data;
+  {
+    tensorflow::tf_shared_lock l(ctx->functions_mu);
+    status->status = ctx->func_lib_def.LookUp(op->name, &op_data);
+    if (!status->status.ok()) {
+      return nullptr;
+    }
+  }
+  const tensorflow::OpDef& op_def = op_data->op_def;
+
+  tensorflow::OpDef* signature = fdef.mutable_signature();
+
+  // Handle constant inputs.
+  const std::unordered_set<string> const_inputs(
+      *tensorflow::XlaOpRegistry::CompileTimeConstantInputs(op->name));
+
+  // First add place holders for the input args, so that we can refer to them by
+  // position in the next loop. Also tally up the resource inputs.
+  int num_resource_inputs = 0;
+  for (int i = 0; i < op_def.input_arg_size(); ++i) {
+    if (op_def.input_arg(i).type() == tensorflow::DT_RESOURCE) {
+      ++num_resource_inputs;
+    }
+    signature->add_input_arg();
+  }
+
+  // Now we map the input params from `op_def` to `signature`, where the param
+  // ordering for `signature` is: <constants, args, resources>.
+  int const_index = 0;
+  int arg_index = const_inputs.size();
+  int resource_index = op_def.input_arg_size() - num_resource_inputs;
+  for (int i = 0; i < op_def.input_arg_size(); ++i) {
+    const tensorflow::OpDef::ArgDef& op_input_arg = op_def.input_arg(i);
+    tensorflow::OpDef::ArgDef* func_input_arg = nullptr;
+    if (const_inputs.find(op_input_arg.name()) != const_inputs.end()) {
+      VLOG(1) << "For const input, mapping op input " << i << " to func input "
+              << const_index;
+      (*op_input_to_func_input)[i] = const_index;
+      func_input_arg = signature->mutable_input_arg(const_index++);
+      const_input_types->push_back(
+          static_cast<TF_DataType>(op->inputs[i].dtype()));
+    } else if (op_input_arg.type() == tensorflow::DT_RESOURCE) {
+      VLOG(1) << "For resource input, mapping op input " << i
+              << " to func input " << resource_index;
+      (*op_input_to_func_input)[i] = resource_index;
+      func_input_arg = signature->mutable_input_arg(resource_index++);
+    } else {
+      VLOG(1) << "For arg input, mapping op input " << i << " to func input "
+              << arg_index;
+      (*op_input_to_func_input)[i] = arg_index;
+      func_input_arg = signature->mutable_input_arg(arg_index++);
+      arg_input_types->push_back(
+          static_cast<TF_DataType>(op->inputs[i].dtype()));
+    }
+
+    func_input_arg->set_name(op_input_arg.name());
+    func_input_arg->set_type(op->inputs[i].dtype());
+  }
+  VLOG(1) << "Added OpDef Inputs: " << fdef.DebugString();
+
+  // Resources args are at the end of the function input params, and we should
+  // have iterated over all of them.
+  DCHECK_EQ(signature->input_arg_size(), resource_index);
+
+  // Make the synthesized function's name unique.
+  signature->set_name(tensorflow::strings::StrCat(
+      op_def.name(), func_id_generator.fetch_add(1)));
+
+  // Add the node def and set its input names to match op_def's names.
+  const tensorflow::NodeDef& ndef = op->attrs.BuildNodeDef();
+  DCHECK_EQ(signature->input_arg_size(), ndef.input_size());
+  *fdef.add_node_def() = ndef;
+  for (int i = 0; i < op_def.input_arg_size(); ++i) {
+    fdef.mutable_node_def(0)->set_input(i, op_def.input_arg(i).name());
+  }
+  VLOG(1) << "Added NodeDef: " << fdef.DebugString();
+
+  // Fix the output names and set output types.
+  for (int i = 0; i < op_def.output_arg_size(); ++i) {
+    tensorflow::OpDef::ArgDef* arg = signature->add_output_arg();
+    const tensorflow::OpDef::ArgDef& op_def_arg = op_def.output_arg(i);
+    const string& out_tensor_name = tensorflow::strings::StrCat(
+        ndef.name(), ":", op_def_arg.name(), ":", 0);
+    arg->set_name(op_def_arg.name());
+    (*fdef.mutable_ret())[op_def_arg.name()] = out_tensor_name;
+    const string& type_attr = op_def_arg.type_attr();
+    if (!type_attr.empty()) {
+      auto i = ndef.attr().find(type_attr);
+      if (i == ndef.attr().end()) {
+        status->status = tensorflow::errors::InvalidArgument(
+            tensorflow::strings::StrCat("Could not find attr ", type_attr,
+                                        " in NodeDef ", ndef.DebugString()));
+        return nullptr;
+      }
+      arg->set_type(i->second.type());
+    }
+  }
+  VLOG(1) << "Fixed Output names and all types: " << fdef.DebugString();
+
+  tensorflow::mutex_lock l(ctx->functions_mu);
+  status->status = ctx->func_lib_def.AddFunctionDef(fdef);
+  if (!status->status.ok()) return nullptr;
+  const auto ret = ctx->func_lib_def.Find(signature->name());
+  DCHECK(ret != nullptr);
+  return ret;
+}
+
+// Builds an _XLALaunchOp as a wrapper over 'op', so that 'op' can be executed
+// via XLA.
+std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
+  VLOG(1) << "Creating _XlaLaunchOp for TFE_Op " << op->name;
+  auto launch_op =
+      std::unique_ptr<TFE_Op>(TFE_NewOp(op->ctx, "_XlaLaunch", status));
+  if (TF_GetCode(status) != TF_OK) return nullptr;
+  if (op->device) {
+    TFE_OpSetDevice(launch_op.get(), op->device->name().c_str(), status);
+    if (TF_GetCode(status) != TF_OK) return nullptr;
+  }
+
+  const tensorflow::FunctionDef* fdef;
+  {
+    tensorflow::tf_shared_lock l(op->ctx->functions_mu);
+    fdef = op->ctx->func_lib_def.Find(op->name);
+  }
+  std::vector<TF_DataType> const_input_types;
+  std::vector<TF_DataType> arg_input_types;
+  tensorflow::gtl::FlatMap<int, int> op_input_to_func_input;
+  if (fdef == nullptr) {
+    // See if this is a primitive op, and if so create a function for it, so
+    // that _XlaLaunchOp can access it.
+    fdef = OpToFunction(op, &const_input_types, &arg_input_types,
+                        &op_input_to_func_input, status);
+    if (!status->status.ok()) return nullptr;
+  } else {
+    // TODO(hongm): XlaOpRegistry::CompileTimeConstantInputs() does not work for
+    // functions, so we need to find another way to handle constant inputs.
+    for (int i = const_input_types.size();
+         i < fdef->signature().input_arg_size(); ++i) {
+      VLOG(1) << "Adding Targs from input arg " << i;
+      const tensorflow::OpDef::ArgDef& arg = fdef->signature().input_arg(i);
+      arg_input_types.push_back(static_cast<TF_DataType>(arg.type()));
+    }
+  }
+  DCHECK(fdef != nullptr);
+
+  // Copy inputs and their devices.
+  // Since input param reordering may have occurred between `op` and `launch_op`
+  // via `op_input_to_func_input`, adjust the actual inputs accordingly.
+  launch_op->inputs = op->inputs;
+  launch_op->input_devices = op->input_devices;
+  if (!op_input_to_func_input.empty()) {
+    DCHECK_EQ(op->inputs.size(), op_input_to_func_input.size());
+    if (!op->input_devices.empty()) {
+      DCHECK_EQ(op->input_devices.size(), op_input_to_func_input.size());
+    }
+    for (int i = 0; i < op_input_to_func_input.size(); ++i) {
+      VLOG(1) << "mapping op input " << i << " to func input "
+              << op_input_to_func_input[i];
+
+      launch_op->inputs[op_input_to_func_input[i]] = op->inputs[i];
+      if (!op->input_devices.empty()) {
+        launch_op->input_devices[op_input_to_func_input[i]] =
+            op->input_devices[i];
+      }
+    }
+  }
+  launch_op->attrs.NumInputs(op->inputs.size());
+
+  TFE_OpSetAttrTypeList(launch_op.get(), "Tconstants", const_input_types.data(),
+                        const_input_types.size());
+
+  // Set Targs and Nresources attrs.
+  TFE_OpSetAttrTypeList(launch_op.get(), "Targs", arg_input_types.data(),
+                        arg_input_types.size());
+  const int num_resource_inputs = fdef->signature().input_arg_size() -
+                                  const_input_types.size() -
+                                  arg_input_types.size();
+  TFE_OpSetAttrInt(launch_op.get(), "Nresources", num_resource_inputs);
+
+  // Set Tresults attr.
+  std::vector<TF_DataType> tresults;
+  for (const tensorflow::OpDef::ArgDef& arg : fdef->signature().output_arg()) {
+    tresults.push_back(static_cast<TF_DataType>(arg.type()));
+  }
+  TFE_OpSetAttrTypeList(launch_op.get(), "Tresults", tresults.data(),
+                        tresults.size());
+
+  // Set function attr.
+  tensorflow::AttrValue attr_value;
+  tensorflow::NameAttrList* func = attr_value.mutable_func();
+  func->set_name(fdef->signature().name());
+  launch_op->attrs.Set("function", attr_value);
+
+  return launch_op;
+}
+#endif  // TENSORFLOW_EAGER_USE_XLA
 }  // namespace
 
 void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
@@ -502,11 +779,26 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
   // TODO(ashankar): ASSUMPTION: ctx->devices()[0] is always CPU
   tensorflow::Device* device =
       (op->device == nullptr) ? ctx->devices()[0] : op->device;
+
+#ifdef TENSORFLOW_EAGER_USE_XLA
+  std::unique_ptr<TFE_Op> xla_launch_op;
+  if (op->use_xla && op->name != "_XlaLaunch") {
+    xla_launch_op = BuildXlaLaunch(op, status);
+    if (!status->status.ok()) {
+      return;
+    }
+    op = xla_launch_op.get();
+  }
+#endif  // TENSORFLOW_EAGER_USE_XLA
+
   std::vector<tensorflow::Tensor> outputs(1);
   const tensorflow::MemoryTypeVector* output_memory_types = nullptr;
   tensorflow::Fprint128 cache_key = op->attrs.CacheKey(device->name());
-  tensorflow::KernelAndDevice* kernel =
-      tensorflow::gtl::FindPtrOrNull(ctx->kernel_cache, cache_key);
+  tensorflow::KernelAndDevice* kernel;
+  {
+    tensorflow::tf_shared_lock l(ctx->cache_mu);
+    kernel = tensorflow::gtl::FindPtrOrNull(ctx->kernel_cache, cache_key);
+  }
   if (kernel == nullptr) {
     const tensorflow::NodeDef& ndef = op->attrs.BuildNodeDef();
     kernel = new tensorflow::KernelAndDevice(ctx->rendezvous);
@@ -522,6 +814,7 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
       delete kernel;
       return;
     }
+    tensorflow::mutex_lock ml(ctx->cache_mu);
     tensorflow::gtl::InsertOrUpdate(&(ctx->kernel_cache), cache_key, kernel);
   }
   std::vector<TFE_TensorHandle*> copied_tensors;
@@ -534,19 +827,54 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
     }
     return;
   }
+  std::unique_ptr<tensorflow::NodeExecStats> maybe_stats;
+  if (ctx->should_store_metadata.load()) {
+    maybe_stats.reset(new tensorflow::NodeExecStats);
+    maybe_stats->set_node_name(op->name);
+    maybe_stats->set_all_start_micros(tensorflow::Env::Default()->NowMicros());
+    maybe_stats->set_op_start_rel_micros(0);
+    maybe_stats->set_scheduled_micros(tensorflow::Env::Default()->NowMicros());
+    // TODO(apassos) track referenced tensors
+  }
   // WARNING: kernel->Run utilizes the FunctionLibraryRuntime
   // (ctx->func_lib(device)), which in turn holds a pointer to func_lib_def,
   // which is GUARDED_BY(ctx->functions_mu). But knowledge of the implementation
-  // of FunctionLibraryRuntime tells use that func_lib_def is not accessed by
+  // of FunctionLibraryRuntime tells us that func_lib_def is not accessed by
   // FunctionLibraryRuntime::Run(), so there is no thread-safety concern here.
   // This is quite subtle. Re-work things to make this better?  (Would it make
   // sense for FunctionLibraryRuntime to ensure thread-safe access to
-  // FunctionLibraryDefinition?).
-  status->status = kernel->Run(&op->inputs, &outputs);
+  // FunctionLibraryDefinition?).  TODO(apassos) figure out how to record stats
+  // for ops which are a part of functions.
+  status->status = kernel->Run(&op->inputs, &outputs, maybe_stats.get());
   for (auto* t : copied_tensors) {
     TFE_DeleteTensorHandle(t);
   }
   if (!status->status.ok()) return;
+  if (maybe_stats != nullptr) {
+    maybe_stats->set_op_end_rel_micros(tensorflow::Env::Default()->NowMicros() -
+                                       maybe_stats->all_start_micros());
+    tensorflow::mutex_lock ml(ctx->metadata_mu);
+    if (ctx->should_store_metadata.load()) {
+      auto* step_stats = ctx->run_metadata.mutable_step_stats();
+      // Lazily initialize the RunMetadata with information about all devices if
+      // this is the first call.
+      while (step_stats->dev_stats_size() < ctx->devices().size()) {
+        step_stats->add_dev_stats();
+      }
+      // Find the current device's index.
+      int device_idx = 0;
+      for (int i = 0; i < ctx->devices().size(); ++i) {
+        if (ctx->devices()[i] == device) {
+          device_idx = i;
+          break;
+        }
+      }
+      // Populate the device stats for this device.
+      auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
+      dev_stats->set_device(device->name());
+      *dev_stats->add_node_stats() = *maybe_stats;
+    }
+  }
   *num_retvals = std::min<int>(*num_retvals, outputs.size());
   for (int i = 0; i < *num_retvals; ++i) {
     tensorflow::Device* d = IsCPU(device) ? nullptr : device;
@@ -593,3 +921,20 @@ const tensorflow::Tensor* TFE_TensorHandleUnderlyingTensorInHostMemory(
   }
   return &h->t;
 }
+
+void TFE_ContextEnableRunMetadata(TFE_Context* ctx) {
+  ctx->should_store_metadata.store(true);
+}
+
+void TFE_ContextDisableRunMetadata(TFE_Context* ctx) {
+  tensorflow::mutex_lock ml(ctx->metadata_mu);
+  ctx->should_store_metadata.store(false);
+  ctx->run_metadata.Clear();
+}
+
+void TFE_ContextExportRunMetadata(TFE_Context* ctx, TF_Buffer* buf,
+                                  TF_Status* status) {
+  tensorflow::mutex_lock ml(ctx->metadata_mu);
+  status->status = MessageToBuffer(ctx->run_metadata, buf);
+  ctx->run_metadata.Clear();
+}
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index ca105962df0d6655946304159937621022e7fcba..7a321b54da343fd2b8912187bc620c1e7456db0c 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -17,6 +17,8 @@ limitations under the License.
 #define TENSORFLOW_C_EAGER_C_API_H_
 
 // C API extensions to experiment with eager execution of kernels.
+// WARNING: Unlike tensorflow/c/c_api.h, the API here is not guaranteed to be
+// stable and can change without notice.
 
 #include "tensorflow/c/c_api.h"
 
@@ -59,14 +61,16 @@ TF_CAPI_EXPORT extern void TFE_ContextOptionsSetConfig(
 // Controls how to act when we try to run an operation on a given device but
 // some input tensors are not on that device.
 typedef enum TFE_ContextDevicePlacementPolicy {
-  // The default: running operations with input tensors on the wrong device will
-  // fail.
+  // Running operations with input tensors on the wrong device will fail.
   TFE_DEVICE_PLACEMENT_EXPLICIT = 0,
   // Copy the tensor to the right device but log a warning.
   TFE_DEVICE_PLACEMENT_WARN = 1,
   // Silently copy the tensor, which has a performance cost since the
   // operation will be blocked till the copy completes.
   TFE_DEVICE_PLACEMENT_SILENT = 2,
+  // Default placement policy which silently copies int32 tensors but not other
+  // dtypes.
+  TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32 = 3,
 } TFE_ContextDevicePlacementPolicy;
 
 TF_CAPI_EXPORT extern void TFE_ContextOptionsSetDevicePlacementPolicy(
@@ -83,10 +87,27 @@ typedef struct TFE_Context TFE_Context;
 
 TF_CAPI_EXPORT extern TFE_Context* TFE_NewContext(
     const TFE_ContextOptions* opts, TF_Status* status);
-TF_CAPI_EXPORT extern void TFE_DeleteContext(TFE_Context* ctx, TF_Status* status);
+TF_CAPI_EXPORT extern void TFE_DeleteContext(TFE_Context* ctx,
+                                             TF_Status* status);
 TF_CAPI_EXPORT extern TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx,
                                                             TF_Status* status);
 
+// Clears the internal caches in the TFE context. Useful when reseeding random
+// ops.
+TF_CAPI_EXPORT extern void TFE_ContextClearCaches(TFE_Context* ctx);
+
+// Sets a thread-local device placement policy. After this call, other calls to
+// TFE_Execute in the same thread will use the device policy specified here
+// instead of the device policy used to construct the context. This has no
+// effect on the device policy used by other program threads.
+TF_CAPI_EXPORT extern void TFE_ContextSetThreadLocalDevicePlacementPolicy(
+    TFE_Context*, TFE_ContextDevicePlacementPolicy);
+
+// Returns the device placement policy to be used by this context in the current
+// thread.
+TF_CAPI_EXPORT extern TFE_ContextDevicePlacementPolicy
+TFE_ContextGetDevicePlacementPolicy(TFE_Context*);
+
 // A handle to a tensor on a device.
 //
 // Like a TF_Tensor, a TFE_TensorHandle refers to a tensor with a value, shape,
@@ -99,8 +120,10 @@ TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t,
 TF_CAPI_EXPORT extern void TFE_DeleteTensorHandle(TFE_TensorHandle* h);
 TF_CAPI_EXPORT extern TF_DataType TFE_TensorHandleDataType(TFE_TensorHandle* h);
 TF_CAPI_EXPORT extern int TFE_TensorHandleNumDims(TFE_TensorHandle* h);
-TF_CAPI_EXPORT extern int64_t TFE_TensorHandleDim(TFE_TensorHandle* h, int dim_index);
-TF_CAPI_EXPORT extern const char* TFE_TensorHandleDeviceName(TFE_TensorHandle* h);
+TF_CAPI_EXPORT extern int64_t TFE_TensorHandleDim(TFE_TensorHandle* h,
+                                                  int dim_index);
+TF_CAPI_EXPORT extern const char* TFE_TensorHandleDeviceName(
+    TFE_TensorHandle* h);
 TF_CAPI_EXPORT extern TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h,
                                                          TF_Status* status);
 
@@ -110,10 +133,9 @@ TF_CAPI_EXPORT extern TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h,
 // that shares the underlying buffer. Otherwise, it currently requires at least
 // one of the source or destination devices to be CPU (i.e., for the source or
 // destination tensor to be placed in host memory).
-TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h,
-                                                                     TFE_Context* ctx,
-                                                                     const char* device_name,
-                                                                     TF_Status* status);
+TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_TensorHandleCopyToDevice(
+    TFE_TensorHandle* h, TFE_Context* ctx, const char* device_name,
+    TF_Status* status);
 
 // Description of the TensorFlow op to execute.
 //
@@ -128,17 +150,31 @@ TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorH
 //     the additional sanity checks there seem unnecessary;
 typedef struct TFE_Op TFE_Op;
 
-TF_CAPI_EXPORT extern TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name,
+TF_CAPI_EXPORT extern TFE_Op* TFE_NewOp(TFE_Context* ctx,
+                                        const char* op_or_function_name,
                                         TF_Status* status);
 TF_CAPI_EXPORT extern void TFE_DeleteOp(TFE_Op* op);
 
 TF_CAPI_EXPORT extern void TFE_OpSetDevice(TFE_Op* op, const char* device_name,
                                            TF_Status* status);
+// The returned string remains valid throughout the lifetime of 'op'.
+TF_CAPI_EXPORT extern const char* TFE_OpGetDevice(TFE_Op* op,
+                                                  TF_Status* status);
+
+// When 'enable' is set to 1, and if TensorFlow library is built with XLA
+// support, a subsequent TFE_Execute() call on `op` will run the op via XLA.
+//
+// If the library is not built with XLA support, this call would be a no-op.
+TF_CAPI_EXPORT extern void TFE_OpSetXLACompilation(TFE_Op* op,
+                                                   unsigned char enable);
 
-TF_CAPI_EXPORT extern void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status);
+TF_CAPI_EXPORT extern void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h,
+                                          TF_Status* status);
 
-TF_CAPI_EXPORT extern TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name,
-                                                    unsigned char* is_list, TF_Status* status);
+TF_CAPI_EXPORT extern TF_AttrType TFE_OpGetAttrType(TFE_Op* op,
+                                                    const char* attr_name,
+                                                    unsigned char* is_list,
+                                                    TF_Status* status);
 // Get an attribute type given an op name; a fusion of TFE_NewOp and
 // TFE_OpGetAttrType for use from Python without the overhead of the individual
 // calls and memory management of TFE_Op.
@@ -146,10 +182,13 @@ TF_CAPI_EXPORT extern TF_AttrType TFE_OpNameGetAttrType(
     TFE_Context* ctx, const char* op_or_function_name, const char* attr_name,
     unsigned char* is_list, TF_Status* status);
 
-TF_CAPI_EXPORT extern void TFE_OpSetAttrString(TFE_Op* op, const char* attr_name,
+TF_CAPI_EXPORT extern void TFE_OpSetAttrString(TFE_Op* op,
+                                               const char* attr_name,
                                                const char* value);
-TF_CAPI_EXPORT extern void TFE_OpSetAttrInt(TFE_Op* op, const char* attr_name, int64_t value);
-TF_CAPI_EXPORT extern void TFE_OpSetAttrFloat(TFE_Op* op, const char* attr_name, float value);
+TF_CAPI_EXPORT extern void TFE_OpSetAttrInt(TFE_Op* op, const char* attr_name,
+                                            int64_t value);
+TF_CAPI_EXPORT extern void TFE_OpSetAttrFloat(TFE_Op* op, const char* attr_name,
+                                              float value);
 TF_CAPI_EXPORT extern void TFE_OpSetAttrBool(TFE_Op* op, const char* attr_name,
                                              unsigned char value);
 TF_CAPI_EXPORT extern void TFE_OpSetAttrType(TFE_Op* op, const char* attr_name,
@@ -158,7 +197,8 @@ TF_CAPI_EXPORT extern void TFE_OpSetAttrType(TFE_Op* op, const char* attr_name,
 // -1 and `dims` can be null.  If a dimension is unknown, the
 // corresponding entry in the `dims` array must be -1.
 TF_CAPI_EXPORT extern void TFE_OpSetAttrShape(TFE_Op* op, const char* attr_name,
-                                              const int64_t* dims, const int num_dims,
+                                              const int64_t* dims,
+                                              const int num_dims,
                                               TF_Status* out_status);
 
 // Sets the attribute attr_name to be a function specified by 'function'.
@@ -169,19 +209,33 @@ TF_CAPI_EXPORT extern void TFE_OpSetAttrFunction(TFE_Op* op,
                                                  const char* attr_name,
                                                  const TFE_Op* value);
 
-TF_CAPI_EXPORT extern void TFE_OpSetAttrStringList(TFE_Op* op, const char* attr_name,
-                                                   const char** value, int num_values);
-TF_CAPI_EXPORT extern void TFE_OpSetAttrIntList(TFE_Op* op, const char* attr_name,
-                                                const int64_t* values, int num_values);
-TF_CAPI_EXPORT extern void TFE_OpSetAttrFloatList(TFE_Op* op, const char* attr_name,
-                                                  const float* values, int num_values);
-TF_CAPI_EXPORT extern void TFE_OpSetAttrBoolList(TFE_Op* op, const char* attr_name,
-                                                 const unsigned char* values, int num_values);
-TF_CAPI_EXPORT extern void TFE_OpSetAttrTypeList(TFE_Op* op, const char* attr_name,
-                                                 const TF_DataType* values, int num_values);
-TF_CAPI_EXPORT extern void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name,
-                                                  const int64_t** dims, const int* num_dims,
-                                                  int num_values, TF_Status* out_status);
+TF_CAPI_EXPORT extern void TFE_OpSetAttrStringList(TFE_Op* op,
+                                                   const char* attr_name,
+                                                   const char** value,
+                                                   int num_values);
+TF_CAPI_EXPORT extern void TFE_OpSetAttrIntList(TFE_Op* op,
+                                                const char* attr_name,
+                                                const int64_t* values,
+                                                int num_values);
+TF_CAPI_EXPORT extern void TFE_OpSetAttrFloatList(TFE_Op* op,
+                                                  const char* attr_name,
+                                                  const float* values,
+                                                  int num_values);
+TF_CAPI_EXPORT extern void TFE_OpSetAttrBoolList(TFE_Op* op,
+                                                 const char* attr_name,
+                                                 const unsigned char* values,
+                                                 int num_values);
+TF_CAPI_EXPORT extern void TFE_OpSetAttrTypeList(TFE_Op* op,
+                                                 const char* attr_name,
+                                                 const TF_DataType* values,
+                                                 int num_values);
+TF_CAPI_EXPORT extern void TFE_OpSetAttrShapeList(
+    TFE_Op* op, const char* attr_name, const int64_t** dims,
+    const int* num_dims, int num_values, TF_Status* out_status);
+TF_CAPI_EXPORT extern void TFE_OpSetAttrFunctionList(TFE_Op* op,
+                                                     const char* attr_name,
+                                                     const TFE_Op** value,
+                                                     int num_values);
 
 // Execute the operation defined by 'op' and return handles to computed
 // tensors in 'retvals'.
@@ -196,9 +250,9 @@ TF_CAPI_EXPORT extern void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals,
 
 // Add a function (serialized FunctionDef protocol buffer) to ctx so
 // that it can be invoked using TFE_Execute.
-TF_CAPI_EXPORT extern void TFE_ContextAddFunctionDef(TFE_Context* ctx,
-                                                     const char* serialized_function_def,
-                                                     size_t size, TF_Status* status);
+TF_CAPI_EXPORT extern void TFE_ContextAddFunctionDef(
+    TFE_Context* ctx, const char* serialized_function_def, size_t size,
+    TF_Status* status);
 
 // Adds a function (created from TF_GraphToFunction or
 // TF_FunctionImportFunctionDef) to the context, allowing it to be executed with
@@ -207,6 +261,19 @@ TF_CAPI_EXPORT extern void TFE_ContextAddFunction(TFE_Context* ctx,
                                                   TF_Function* function,
                                                   TF_Status* status);
 
+// Enables tracing of RunMetadata on the ops executed from this context.
+TF_CAPI_EXPORT extern void TFE_ContextEnableRunMetadata(TFE_Context* ctx);
+
+// Disables tracing of RunMetadata on the ops executed from this context.
+TF_CAPI_EXPORT extern void TFE_ContextDisableRunMetadata(TFE_Context* ctx);
+
+// Populates the passed-in buffer with a serialized RunMetadata protocol buffer
+// containing any run metadata information accumulated so far and clears this
+// information.
+TF_CAPI_EXPORT extern void TFE_ContextExportRunMetadata(TFE_Context* ctx,
+                                                        TF_Buffer* buf,
+                                                        TF_Status* status);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index 0971e2ab2fe98cc8bf6f631f41d5adce90ee7051..7b9f1db02ed9c53a280c7bd1284165cac4fb6353 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <cstddef>
 #include <memory>
 #include <string>
+#include <thread>
 #include <vector>
 
 #include "tensorflow/c/c_api.h"
@@ -34,20 +35,34 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/public/version.h"
 
 struct TFE_ContextOptions {
   TF_SessionOptions session_options;
-  TFE_ContextDevicePlacementPolicy policy{TFE_DEVICE_PLACEMENT_EXPLICIT};
+  TFE_ContextDevicePlacementPolicy policy{
+      TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32};
 };
 
 struct TFE_Context {
-  explicit TFE_Context(TF_Session* s) : session(s) {}
-
-  TFE_ContextDevicePlacementPolicy policy;
+  explicit TFE_Context(const TFE_ContextOptions& opts, TF_Session* s)
+      : policy(opts.policy),
+        session(s),
+        rendezvous(new tensorflow::IntraProcessRendezvous(s->device_mgr)),
+        pflr(new tensorflow::ProcessFunctionLibraryRuntime(
+            session->device_mgr, opts.session_options.options.env,
+            TF_GRAPH_DEF_VERSION, &func_lib_def, {})) {}
+
+  const TFE_ContextDevicePlacementPolicy policy;
+
+  // Note: we cannot use C++11 thread_local here as there is no concept of a
+  // thread-local-object-local variable in C++11.
+  tensorflow::mutex policy_map_mu;
+  std::unordered_map<std::thread::id, TFE_ContextDevicePlacementPolicy>
+      thread_local_policies GUARDED_BY(policy_map_mu);
 
   // TFE_Context is an extension of TF_Session. And TF_Session needs a TF_Graph.
-  TF_Session* session;
-  tensorflow::Rendezvous* rendezvous;
+  TF_Session* const session;
+  tensorflow::Rendezvous* const rendezvous;
 
   tensorflow::mutex functions_mu;
   tensorflow::FunctionLibraryDefinition func_lib_def GUARDED_BY(functions_mu){
@@ -56,17 +71,23 @@ struct TFE_Context {
   // One FunctionLibraryRuntime per device.
   // func_libs[i] is the FunctionLibraryRuntime corresponding to
   // session->devices[i].
-  std::unique_ptr<tensorflow::ProcessFunctionLibraryRuntime> pflr;
+  const std::unique_ptr<tensorflow::ProcessFunctionLibraryRuntime> pflr;
 
+  tensorflow::mutex cache_mu;
   std::unordered_map<tensorflow::Fprint128, tensorflow::KernelAndDevice*,
                      tensorflow::Fprint128Hasher>
-      kernel_cache;
+      kernel_cache GUARDED_BY(cache_mu);
 
-  tensorflow::FunctionLibraryRuntime* func_lib(tensorflow::Device* d) {
+  tensorflow::FunctionLibraryRuntime* func_lib(tensorflow::Device* d) const {
     return pflr->GetFLR(d->name());
   }
 
   const std::vector<tensorflow::Device*>& devices() { return session->devices; }
+
+  // Whether we should compute RunMetadata.
+  std::atomic<bool> should_store_metadata{false};
+  tensorflow::mutex metadata_mu;
+  tensorflow::RunMetadata run_metadata GUARDED_BY(metadata_mu);
 };
 
 struct TFE_TensorHandle {
@@ -86,6 +107,8 @@ struct TFE_TensorHandle {
 };
 
 struct TFE_Op {
+  // t is NULL iff the TFE_Op corresponds to a TensorFlow function instead of a
+  // primitive operation.
   TFE_Op(TFE_Context* ctx, const char* op, const tensorflow::AttrTypeMap* t)
       : ctx(ctx), name(op), attrs(op), attr_types(t), device(nullptr) {}
 
@@ -98,6 +121,7 @@ struct TFE_Op {
   std::vector<tensorflow::Tensor> inputs;
   std::vector<tensorflow::Device*> input_devices;
   tensorflow::Device* device;
+  bool use_xla = false;
 };
 
 #endif  // TENSORFLOW_C_EAGER_C_API_INTERNAL_H_
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index 3fe0b7efa11bc619ed98bf9a1634ade5b6ed0a7c..4a3ecbc0abb16296a84c0d2184dc3fc9f7f3ebb4 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 
 using tensorflow::string;
 
@@ -59,6 +60,63 @@ TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b) {
   return op;
 }
 
+TFE_TensorHandle* TestAxisTensorHandle() {
+  int64_t dims[] = {1};
+  int data[] = {1};
+  TF_Tensor* t = TF_AllocateTensor(
+      TF_INT32, &dims[0], sizeof(dims) / sizeof(int64_t), sizeof(data));
+  memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t));
+  TF_Status* status = TF_NewStatus();
+  TFE_TensorHandle* th = TFE_NewTensorHandle(t, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeleteTensor(t);
+  TF_DeleteStatus(status);
+  return th;
+}
+
+TFE_Op* MinOp(TFE_Context* ctx, TFE_TensorHandle* input,
+              TFE_TensorHandle* axis) {
+  TF_Status* status = TF_NewStatus();
+
+  TFE_Op* op = TFE_NewOp(ctx, "Min", status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_OpAddInput(op, input, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_OpAddInput(op, axis, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_OpSetAttrBool(op, "keep_dims", 1);
+  TFE_OpSetAttrType(op, "Tidx", TF_INT32);
+  TF_DeleteStatus(status);
+  TFE_OpSetAttrType(op, "T", TFE_TensorHandleDataType(input));
+
+  return op;
+}
+
+// If there is a GPU device, returns true and sets 'gpu_device_name'
+// accordingly.
+bool GetGPUDeviceName(TFE_Context* ctx, string* gpu_device_name) {
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
+      TF_NewStatus(), TF_DeleteStatus);
+  TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get());
+  CHECK_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+
+  const int num_devices = TF_DeviceListCount(devices);
+  for (int i = 0; i < num_devices; ++i) {
+    const string device_type(TF_DeviceListType(devices, i, status.get()));
+    CHECK_EQ(TF_GetCode(status.get()), TF_OK) << TF_Message(status.get());
+    const string device_name(TF_DeviceListName(devices, i, status.get()));
+    CHECK_EQ(TF_GetCode(status.get()), TF_OK) << TF_Message(status.get());
+    if (device_type == "GPU") {
+      *gpu_device_name = device_name;
+      LOG(INFO) << "Found GPU device " << device_name;
+      TF_DeleteDeviceList(devices);
+      return true;
+    }
+  }
+  TF_DeleteDeviceList(devices);
+  return false;
+}
+
 void BM_InitOp(int iters) {
   tensorflow::testing::StopTiming();
   TF_Status* status = TF_NewStatus();
@@ -216,11 +274,10 @@ TEST(CAPI, TensorHandleCopyBetweenDevices) {
   EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
 }
 
-TEST(CAPI, TensorHandleSilentCopy) {
+TEST(CAPI, TensorHandleCopyBetweenTwoGPUDevices) {
   std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
       TF_NewStatus(), TF_DeleteStatus);
   TFE_ContextOptions* opts = TFE_NewContextOptions();
-  TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT);
   TFE_Context* ctx = TFE_NewContext(opts, status.get());
   TFE_DeleteContextOptions(opts);
   ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
@@ -233,18 +290,111 @@ TEST(CAPI, TensorHandleSilentCopy) {
   ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
   const int num_devices = TF_DeviceListCount(devices);
 
+  const char* kCPUDevice = "CPU:0";
+  if (num_devices < 3) {
+    TF_DeleteDeviceList(devices);
+    TF_DeleteTensor(t);
+    TFE_DeleteTensorHandle(hcpu);
+    TFE_DeleteContext(ctx, status.get());
+    return;
+  }
+  const string gpu_1_name(TF_DeviceListName(devices, 1, status.get()));
+  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK);
+  const string gpu_2_name(TF_DeviceListName(devices, 2, status.get()));
+  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK);
+  TFE_TensorHandle* hdevice =
+      TFE_TensorHandleCopyToDevice(hcpu, ctx, gpu_1_name.c_str(), status.get());
+  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK);
+
+  TFE_TensorHandle* hdevice2 = TFE_TensorHandleCopyToDevice(
+      hdevice, ctx, gpu_2_name.c_str(), status.get());
+  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK);
+  TFE_DeleteTensorHandle(hdevice);
+  // Copy back to CPU
+  TFE_TensorHandle* hcopy =
+      TFE_TensorHandleCopyToDevice(hdevice2, ctx, kCPUDevice, status.get());
+  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK);
+  TFE_DeleteTensorHandle(hdevice2);
+
+  // Ensure that the contents are the same!
+  TF_Tensor* tcopy = TFE_TensorHandleResolve(hcopy, status.get());
+  TFE_DeleteTensorHandle(hcopy);
+  ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK);
+  EXPECT_EQ(TF_TensorByteSize(t), TF_TensorByteSize(tcopy));
+  EXPECT_EQ(
+      0, memcmp(TF_TensorData(t), TF_TensorData(tcopy), TF_TensorByteSize(t)));
+  TF_DeleteTensor(tcopy);
+
+  TF_DeleteDeviceList(devices);
+  TF_DeleteTensor(t);
+  TFE_DeleteTensorHandle(hcpu);
+  TFE_DeleteContext(ctx, status.get());
+  EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+}
+
+TEST(CAPI, TensorHandleSilentCopy) {
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
+      TF_NewStatus(), TF_DeleteStatus);
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT);
+  TFE_Context* ctx = TFE_NewContext(opts, status.get());
+  TFE_DeleteContextOptions(opts);
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+
+  TFE_TensorHandle* hcpu = TestMatrixTensorHandle();
+  TF_Tensor* t = TFE_TensorHandleResolve(hcpu, status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+
   // Disable the test if no GPU is present.
-  if (num_devices > 1) {
-    const int device_to_use = 1;
-    const string name(TF_DeviceListName(devices, device_to_use, status.get()));
+  string gpu_device_name;
+  if (GetGPUDeviceName(ctx, &gpu_device_name)) {
+    TFE_TensorHandle* hgpu = TFE_TensorHandleCopyToDevice(
+        hcpu, ctx, gpu_device_name.c_str(), status.get());
     ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
 
-    TFE_TensorHandle* hgpu =
-        TFE_TensorHandleCopyToDevice(hcpu, ctx, name.c_str(), status.get());
+    TFE_Op* matmul = MatMulOp(ctx, hcpu, hgpu);
+    TFE_OpSetDevice(matmul, gpu_device_name.c_str(), status.get());
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+    TFE_TensorHandle* retvals[1];
+    int num_retvals = 1;
+    TFE_Execute(matmul, &retvals[0], &num_retvals, status.get());
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+    TFE_DeleteOp(matmul);
+    TFE_DeleteTensorHandle(retvals[0]);
+    TFE_DeleteTensorHandle(hgpu);
+  }
+
+  TF_DeleteTensor(t);
+  TFE_DeleteTensorHandle(hcpu);
+  TFE_DeleteContext(ctx, status.get());
+  EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+}
+
+TEST(CAPI, TensorHandleSilentCopyLocal) {
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
+      TF_NewStatus(), TF_DeleteStatus);
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetDevicePlacementPolicy(opts,
+                                             TFE_DEVICE_PLACEMENT_EXPLICIT);
+  TFE_Context* ctx = TFE_NewContext(opts, status.get());
+  TFE_ContextSetThreadLocalDevicePlacementPolicy(ctx,
+                                                 TFE_DEVICE_PLACEMENT_SILENT);
+  TFE_DeleteContextOptions(opts);
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+
+  TFE_TensorHandle* hcpu = TestMatrixTensorHandle();
+  TF_Tensor* t = TFE_TensorHandleResolve(hcpu, status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+
+  // Disable the test if no GPU is present.
+  string gpu_device_name;
+  if (GetGPUDeviceName(ctx, &gpu_device_name)) {
+    TFE_TensorHandle* hgpu = TFE_TensorHandleCopyToDevice(
+        hcpu, ctx, gpu_device_name.c_str(), status.get());
     ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
 
     TFE_Op* matmul = MatMulOp(ctx, hcpu, hgpu);
-    TFE_OpSetDevice(matmul, name.c_str(), status.get());
+    TFE_OpSetDevice(matmul, gpu_device_name.c_str(), status.get());
     ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
     TFE_TensorHandle* retvals[1];
     int num_retvals = 1;
@@ -255,20 +405,195 @@ TEST(CAPI, TensorHandleSilentCopy) {
     TFE_DeleteTensorHandle(hgpu);
   }
 
-  TF_DeleteDeviceList(devices);
   TF_DeleteTensor(t);
   TFE_DeleteTensorHandle(hcpu);
   TFE_DeleteContext(ctx, status.get());
   EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
 }
 
-TEST(CAPI, Execute) {
+TEST(CAPI, SetAndGetOpDevices) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_TensorHandle* m = TestMatrixTensorHandle();
+  TFE_Op* matmul = MatMulOp(ctx, m, m);
+
+  // Disable the test if no GPU is present.
+  string gpu_device_name;
+  if (GetGPUDeviceName(ctx, &gpu_device_name)) {
+    TFE_OpSetDevice(matmul, "GPU:0", status);
+    ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+    const char* device_name = TFE_OpGetDevice(matmul, status);
+    ASSERT_TRUE(strstr(device_name, "GPU:0") != nullptr);
+
+    TFE_OpSetDevice(matmul, "CPU:0", status);
+    ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+    device_name = TFE_OpGetDevice(matmul, status);
+    ASSERT_TRUE(strstr(device_name, "CPU:0") != nullptr);
+  }
+
+  TFE_DeleteOp(matmul);
+  TFE_DeleteTensorHandle(m);
+  TFE_DeleteContext(ctx, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeleteStatus(status);
+}
+
+TEST(CAPI, Execute_MatMul_CPU) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_TensorHandle* m = TestMatrixTensorHandle();
+  TFE_Op* matmul = MatMulOp(ctx, m, m);
+  TFE_TensorHandle* retvals[2] = {nullptr};
+  int num_retvals = 2;  // Should be reduced to 1 by the TFE_Execute call.
+  TFE_Execute(matmul, &retvals[0], &num_retvals, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteOp(matmul);
+  TFE_DeleteTensorHandle(m);
+  TFE_DeleteContext(ctx, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  ASSERT_EQ(1, num_retvals);
+
+  TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
+  TFE_DeleteTensorHandle(retvals[0]);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  float product[4] = {0};
+  EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
+  memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t));
+  TF_DeleteTensor(t);
+  EXPECT_EQ(7, product[0]);
+  EXPECT_EQ(10, product[1]);
+  EXPECT_EQ(15, product[2]);
+  EXPECT_EQ(22, product[3]);
+  TF_DeleteStatus(status);
+}
+
+TEST(CAPI, Execute_Min_CPU) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_TensorHandle* input = TestMatrixTensorHandle();
+  TFE_TensorHandle* axis = TestAxisTensorHandle();
+  TFE_Op* minOp = MinOp(ctx, input, axis);
+  TFE_TensorHandle* retvals[2] = {nullptr};
+  int num_retvals = 2;  // Should be reduced to 1 by the TFE_Execute call.
+  TFE_Execute(minOp, &retvals[0], &num_retvals, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteOp(minOp);
+  TFE_DeleteTensorHandle(input);
+  TFE_DeleteTensorHandle(axis);
+  TFE_DeleteContext(ctx, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  ASSERT_EQ(1, num_retvals);
+
+  TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
+  TFE_DeleteTensorHandle(retvals[0]);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  float output[2] = {0};
+  EXPECT_EQ(sizeof(output), TF_TensorByteSize(t));
+  memcpy(&output[0], TF_TensorData(t), TF_TensorByteSize(t));
+  TF_DeleteTensor(t);
+  EXPECT_EQ(1, output[0]);
+  EXPECT_EQ(3, output[1]);
+  TF_DeleteStatus(status);
+}
+
+#ifdef TENSORFLOW_EAGER_USE_XLA
+TEST(CAPI, Execute_MatMul_XLA_CPU) {
   TF_Status* status = TF_NewStatus();
   TFE_ContextOptions* opts = TFE_NewContextOptions();
   TFE_Context* ctx = TFE_NewContext(opts, status);
   CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_DeleteContextOptions(opts);
 
+  TFE_TensorHandle* m = TestMatrixTensorHandle();
+  TFE_Op* matmul = MatMulOp(ctx, m, m);
+
+  TFE_OpSetXLACompilation(matmul, true);
+
+  TFE_TensorHandle* retvals[2] = {nullptr};
+  int num_retvals = 2;  // Should be reduced to 1 by the TFE_Execute call.
+  TFE_Execute(matmul, &retvals[0], &num_retvals, status);
+  // Running a primitive TF operator via XLA is not yet supported.
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TFE_DeleteOp(matmul);
+  TFE_DeleteTensorHandle(m);
+  TFE_DeleteContext(ctx, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  EXPECT_EQ(1, num_retvals);
+
+  TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
+  TFE_DeleteTensorHandle(retvals[0]);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  float product[4] = {0};
+  EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
+  memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t));
+  TF_DeleteTensor(t);
+  EXPECT_EQ(7, product[0]);
+  EXPECT_EQ(10, product[1]);
+  EXPECT_EQ(15, product[2]);
+  EXPECT_EQ(22, product[3]);
+
+  TF_DeleteStatus(status);
+}
+
+TEST(CAPI, Execute_Min_XLA_CPU) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_TensorHandle* input = TestMatrixTensorHandle();
+  TFE_TensorHandle* axis = TestAxisTensorHandle();
+  TFE_Op* minOp = MinOp(ctx, input, axis);
+
+  TFE_OpSetXLACompilation(minOp, true);
+
+  TFE_TensorHandle* retvals[2] = {nullptr};
+  int num_retvals = 2;  // Should be reduced to 1 by the TFE_Execute call.
+  TFE_Execute(minOp, &retvals[0], &num_retvals, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteOp(minOp);
+  TFE_DeleteTensorHandle(input);
+  TFE_DeleteTensorHandle(axis);
+  TFE_DeleteContext(ctx, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  ASSERT_EQ(1, num_retvals);
+
+  TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
+  TFE_DeleteTensorHandle(retvals[0]);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  float output[2] = {0};
+  EXPECT_EQ(sizeof(output), TF_TensorByteSize(t));
+  memcpy(&output[0], TF_TensorData(t), TF_TensorByteSize(t));
+  TF_DeleteTensor(t);
+  EXPECT_EQ(1, output[0]);
+  EXPECT_EQ(3, output[1]);
+  TF_DeleteStatus(status);
+}
+#endif  // TENSORFLOW_EAGER_USE_XLA
+
+TEST(CAPI, ExecuteWithTracing) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  TFE_ContextEnableRunMetadata(ctx);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
   TFE_TensorHandle* m = TestMatrixTensorHandle();
   TFE_Op* matmul = MatMulOp(ctx, m, m);
   TFE_TensorHandle* retvals[2] = {nullptr};
@@ -277,6 +602,13 @@ TEST(CAPI, Execute) {
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_DeleteOp(matmul);
   TFE_DeleteTensorHandle(m);
+  TF_Buffer* b = TF_NewBuffer();
+  TFE_ContextExportRunMetadata(ctx, b, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  tensorflow::RunMetadata rm;
+  EXPECT_TRUE(
+      rm.ParseFromString({reinterpret_cast<const char*>(b->data), b->length}));
+  TF_DeleteBuffer(b);
   TFE_DeleteContext(ctx, status);
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   ASSERT_EQ(1, num_retvals);
@@ -295,7 +627,7 @@ TEST(CAPI, Execute) {
   TF_DeleteStatus(status);
 }
 
-TEST(CAPI, Function) {
+TEST(CAPI, Function_ident_CPU) {
   // First create a simple identity function.
   TF_Graph* function_graph = TF_NewGraph();
   TF_OperationDescription* arg_descr =
@@ -356,6 +688,72 @@ TEST(CAPI, Function) {
   TF_DeleteStatus(status);
 }
 
+#ifdef TENSORFLOW_EAGER_USE_XLA
+TEST(CAPI, Function_ident_XLA_CPU) {
+  // First create a simple identity function.
+  TF_Graph* function_graph = TF_NewGraph();
+  TF_OperationDescription* arg_descr =
+      TF_NewOperation(function_graph, "Placeholder", "arg");
+  TF_SetAttrType(arg_descr, "dtype", TF_INT32);
+  TF_Status* status = TF_NewStatus();
+  TF_Operation* arg = TF_FinishOperation(arg_descr, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_OperationDescription* id_descr =
+      TF_NewOperation(function_graph, "Identity", "id");
+  TF_SetAttrType(id_descr, "T", TF_INT32);
+  TF_AddInput(id_descr, {arg, 0});
+  TF_Operation* id = TF_FinishOperation(id_descr, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_Output input{arg, 0};
+  TF_Output output{id, 0};
+  TF_Function* fn =
+      TF_GraphToFunction(function_graph, "ident", 0, 1, &id, 1, &input, 1,
+                         &output, nullptr, nullptr, "test", status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_DeleteGraph(function_graph);
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+  TFE_ContextAddFunction(ctx, fn, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_DeleteFunction(fn);
+
+  TF_Tensor* t =
+      TF_AllocateTensor(TF_INT32, nullptr, 0, 1 * sizeof(tensorflow::int32));
+  *reinterpret_cast<tensorflow::int32*>(TF_TensorData(t)) = 42;
+  TFE_TensorHandle* h = TFE_NewTensorHandle(t, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_DeleteTensor(t);
+
+  TFE_Op* op = TFE_NewOp(ctx, "ident", status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TFE_OpAddInput(op, h, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+
+  // Now run it via XLA.
+  TFE_OpSetXLACompilation(op, true);
+
+  std::vector<TFE_TensorHandle*> result;
+  result.push_back(nullptr);
+  int num_retvals = 1;
+  TFE_Execute(op, result.data(), &num_retvals, status);
+  TFE_DeleteOp(op);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  ASSERT_EQ(num_retvals, 1);
+
+  TF_Tensor* r = TFE_TensorHandleResolve(result[0], status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  EXPECT_EQ(*reinterpret_cast<tensorflow::int32*>(TF_TensorData(r)), 42);
+  TFE_DeleteTensorHandle(h);
+  TF_DeleteTensor(r);
+  TFE_DeleteTensorHandle(result[0]);
+  TFE_DeleteContext(ctx, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_DeleteStatus(status);
+}
+#endif  // TENSORFLOW_EAGER_USE_XLA
+
 string MatMulFunction() {
   tensorflow::FunctionDef def;
   CHECK(tensorflow::protobuf::TextFormat::ParseFromString(
diff --git a/tensorflow/c/eager/runtime.cc b/tensorflow/c/eager/runtime.cc
index 38066682a9fc5038c34a4ac3b20a67ceb08ab951..f77a937f1ffc2d146224cb3191a5ca127daefc22 100644
--- a/tensorflow/c/eager/runtime.cc
+++ b/tensorflow/c/eager/runtime.cc
@@ -86,10 +86,9 @@ Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out) {
   return Status::OK();
 }
 
-Status AttrTypeByName(const AttrTypeMap* m, const string& attr_name,
+Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
                       TF_AttrType* out, unsigned char* is_list) {
-  CHECK(m);
-  auto* t = gtl::FindOrNull(*m, attr_name);
+  auto* t = gtl::FindOrNull(m, attr_name);
   if (t == nullptr) {
     return errors::InvalidArgument("Attribute '", attr_name,
                                    "' does not exist for this operation");
@@ -173,14 +172,14 @@ void CombineUnordered(const tensorflow::Fprint128& a,
   b->high64 += a.high64;
 }
 
-inline tensorflow::Fprint128 CacheKeyHelper(const StringPiece& s,
+inline tensorflow::Fprint128 CacheKeyHelper(StringPiece s,
                                             const tensorflow::Fprint128& b) {
   // TODO(agarwal): avoid ToString().
   tensorflow::Fprint128 a = tensorflow::Fingerprint128(s.ToString());
   return FingerprintCat128(a, b);
 }
 
-inline tensorflow::Fprint128 CacheKeyHelper(const StringPiece& s, uint64 b) {
+inline tensorflow::Fprint128 CacheKeyHelper(StringPiece s, uint64 b) {
   return CacheKeyHelper(s, {b, b});
 }
 
@@ -262,7 +261,8 @@ Status KernelAndDevice::Init(const NodeDef& ndef, FunctionLibraryRuntime* flib,
 }
 
 Status KernelAndDevice::Run(std::vector<Tensor>* input_tensors,
-                            std::vector<Tensor>* output_tensors) {
+                            std::vector<Tensor>* output_tensors,
+                            NodeExecStats* stats) {
   gtl::InlinedVector<TensorValue, 4> inputs;
   for (Tensor& t : *input_tensors) {
     inputs.push_back(TensorValue(&t));
@@ -284,6 +284,9 @@ Status KernelAndDevice::Run(std::vector<Tensor>* input_tensors,
   params.function_library = flib_;
   params.slice_reader_cache = &slice_reader_cache_;
   params.rendezvous = rendez_;
+  if (stats != nullptr) {
+    params.track_allocations = true;
+  }
   // TODO(apassos): use a thread pool.
   std::function<void(std::function<void()>)> runner =
       [](std::function<void()> f) { f(); };
@@ -297,6 +300,28 @@ Status KernelAndDevice::Run(std::vector<Tensor>* input_tensors,
   for (int i = 0; i < context.num_outputs(); ++i) {
     output_tensors->push_back(Tensor(*context.mutable_output(i)));
   }
+  if (stats != nullptr) {
+    for (const auto& allocator_pair : context.wrapped_allocators()) {
+      AllocatorMemoryUsed* memory = stats->add_memory();
+      memory->set_allocator_name(allocator_pair.first->Name());
+      auto sizes = allocator_pair.second->GetSizes();
+      memory->set_total_bytes(std::get<0>(sizes));
+      memory->set_peak_bytes(std::get<1>(sizes));
+      memory->set_live_bytes(std::get<2>(sizes));
+
+      AllocatorStats allocator_stats;
+      allocator_pair.first->GetStats(&allocator_stats);
+      memory->set_allocator_bytes_in_use(allocator_stats.bytes_in_use);
+      allocator_pair.second->GetRecordsAndUnRef();
+    }
+    auto* ms = stats->mutable_memory_stats();
+    ms->set_temp_memory_size(context.temp_memory_allocated());
+    for (const auto& alloc_id : context.persistent_alloc_ids()) {
+      ms->mutable_persistent_tensor_alloc_ids()->Add(alloc_id);
+    }
+
+    ms->set_persistent_memory_size(context.persistent_memory_allocated());
+  }
   return Status::OK();
 }
 
diff --git a/tensorflow/c/eager/runtime.h b/tensorflow/c/eager/runtime.h
index fb97e94a94103d17164cb30f6c6e0ed3e07dc103..4d20b5244a46fcde2eed0a429dced2a77b86aedd 100644
--- a/tensorflow/c/eager/runtime.h
+++ b/tensorflow/c/eager/runtime.h
@@ -43,7 +43,7 @@ typedef std::unordered_map<string, uint32> AttrTypeMap;
 Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out);
 
 // Looks for 'attr_name' in 'm' and sets 'out' and 'is_list'.
-Status AttrTypeByName(const AttrTypeMap* m, const string& attr_name,
+Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
                       TF_AttrType* out, unsigned char* is_list);
 
 // KernelAndDevice::Init needs a NodeDef only to pass the attribute map through.
@@ -175,7 +175,8 @@ class KernelAndDevice {
       : device_(nullptr), flib_(nullptr), rendez_(rendez) {}
 
   // TODO(ashankar): Handle list-valued inputs.
-  Status Run(std::vector<Tensor>* inputs, std::vector<Tensor>* outputs);
+  Status Run(std::vector<Tensor>* inputs, std::vector<Tensor>* outputs,
+             NodeExecStats* stats);
 
   const OpKernel* kernel() const { return kernel_.get(); }
 
diff --git a/tensorflow/c/eager/runtime_test.cc b/tensorflow/c/eager/runtime_test.cc
index 3236c6be0ec5281e8099219968dd5f5c6c2048c3..643153058ce3d6f0c88dd23a0dec4c6eff060319 100644
--- a/tensorflow/c/eager/runtime_test.cc
+++ b/tensorflow/c/eager/runtime_test.cc
@@ -63,17 +63,17 @@ TEST(AttrTypeMap, Lookup) {
 
   TF_AttrType t;
   unsigned char is_list = 1;
-  s = AttrTypeByName(m, "ThisAttribyteCannotPossiblyExist", &t, &is_list);
+  s = AttrTypeByName(*m, "ThisAttribyteCannotPossiblyExist", &t, &is_list);
   EXPECT_FALSE(s.ok());
   EXPECT_NE(is_list, 0);
-  s = AttrTypeByName(m, "transpose_a", &t, &is_list);
+  s = AttrTypeByName(*m, "transpose_a", &t, &is_list);
   ASSERT_TRUE(s.ok()) << s;
   EXPECT_EQ(TF_ATTR_BOOL, t);
   EXPECT_EQ(is_list, 0);
 
   s = AttrTypeMapForOp("Squeeze", &m);
   ASSERT_TRUE(s.ok()) << s;
-  s = AttrTypeByName(m, "squeeze_dims", &t, &is_list);
+  s = AttrTypeByName(*m, "squeeze_dims", &t, &is_list);
   ASSERT_TRUE(s.ok()) << s;
   EXPECT_EQ(TF_ATTR_INT, t);
   EXPECT_NE(is_list, 0);
@@ -96,7 +96,7 @@ TEST(KernelAndDevice, Run) {
       KernelAndDevice::Init(ndef, env.function_library_runtime(), &kernel);
   ASSERT_TRUE(s.ok()) << s;
   std::vector<Tensor> outputs;
-  s = kernel.Run(&inputs, &outputs);
+  s = kernel.Run(&inputs, &outputs, nullptr);
   ASSERT_TRUE(s.ok()) << s;
   ASSERT_EQ(1, outputs.size());
   const Tensor& out = outputs[0];
@@ -183,7 +183,7 @@ void BM_KernelAndDeviceRun(int iters) {
       KernelAndDevice::Init(ndef, env.function_library_runtime(), &kernel));
   tensorflow::testing::StartTiming();
   for (int i = 0; i < iters; ++i) {
-    TF_CHECK_OK(kernel.Run(&inputs, &outputs));
+    TF_CHECK_OK(kernel.Run(&inputs, &outputs, nullptr));
   }
 }
 BENCHMARK(BM_KernelAndDeviceRun);
diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h
index f52248e7d567b8edd911c6dba1786ceb5d5c721c..bdb0815d6b68444ec1c89b835d563db20ce4d8a1 100644
--- a/tensorflow/c/eager/tape.h
+++ b/tensorflow/c/eager/tape.h
@@ -18,12 +18,12 @@ limitations under the License.
 // Language-agnostic gradient tape. Does not perform backpropagation, just
 // maintains the data structures required to do so.
 
-#include <unordered_map>
-#include <unordered_set>
 #include <vector>
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -54,11 +54,11 @@ struct OpTapeEntry {
 // Map from tensor_id to internally-defined operation-id of the operation which
 // produced this tensor. A value of -1 means that the tensor was directly
 // watched and not the result of any operation in the tape.
-using TensorTape = std::unordered_map<int64, int64>;
+using TensorTape = gtl::FlatMap<int64, int64>;
 
 // Map from operation-id to tape entry.
 template <typename BackwardFunction>
-using OpTape = std::unordered_map<int64, OpTapeEntry<BackwardFunction>>;
+using OpTape = gtl::FlatMap<int64, OpTapeEntry<BackwardFunction>>;
 
 // Operations the tape needs to perform on tensors to do backpropagation. Named
 // "vspace" because a subset of these are related to a vector space, such as
@@ -159,9 +159,9 @@ class GradientTape {
 
   // Map from tensor id to number of remaining usages (i.e. how many entries in
   // the tape refer to it); to aid in tape garbage collection.
-  std::unordered_map<int64, int64> tensor_usage_;
+  gtl::FlatMap<int64, int64> tensor_usage_;
 
-  // If true, all activations are deleted in the first call to ComputeGradient.
+  // If false, all activations are deleted in the first call to ComputeGradient.
   // Else, only when this is destructed.
   bool persistent_;
 };
@@ -286,11 +286,11 @@ struct BackpropInitialState {
 
   // Map from tensor ID to how many references still exist for this tensor in
   // the tape.
-  std::unordered_map<int64, int64> tensor_usage_counts;
+  gtl::FlatMap<int64, int64> tensor_usage_counts;
 
   // Maps from op ID to how many output tensors of this op still need to have
   // their gradients computed.
-  std::unordered_map<int64, int64> op_missing_tensor;
+  gtl::FlatMap<int64, int64> op_missing_tensor;
 };
 
 // If `persistent_tape` is true, op_tape is not changed and none of the
@@ -301,8 +301,8 @@ struct BackpropInitialState {
 template <typename BackwardFunction>
 BackpropInitialState<BackwardFunction> PrepareBackprop(
     gtl::ArraySlice<int64> target, const TensorTape& tensor_tape,
-    OpTape<BackwardFunction>* op_tape,
-    const std::unordered_set<int64>& sources_set, bool persistent_tape) {
+    OpTape<BackwardFunction>* op_tape, const gtl::FlatSet<int64>& sources_set,
+    bool persistent_tape) {
   std::vector<int64> tensor_stack;
   tensor_stack.reserve(target.size());
   for (auto t : target) {
@@ -350,7 +350,7 @@ BackpropInitialState<BackwardFunction> PrepareBackprop(
     // Call destructors for all unneeded gradient functions and
     // clear the op_tape. We can clear the tape because ownership of
     // backward functions that will be used for gradient computation
-    // has been transfered to `result`.
+    // has been transferred to `result`.
     for (const auto& op_pair : *op_tape) {
       op_pair.second.backward_function_deleter();
     }
@@ -362,7 +362,7 @@ BackpropInitialState<BackwardFunction> PrepareBackprop(
 template <typename BackwardFunction>
 std::vector<int64> InitialStack(
     const OpTape<BackwardFunction>& op_tape,
-    const std::unordered_map<int64, int64>& op_missing_tensor) {
+    const gtl::FlatMap<int64, int64>& op_missing_tensor) {
   std::vector<int64> result;
   for (auto& op_entry : op_tape) {
     if (op_missing_tensor.find(op_entry.first) == op_missing_tensor.end()) {
@@ -373,13 +373,13 @@ std::vector<int64> InitialStack(
 }
 
 template <typename Gradient, typename BackwardFunction>
-Status InitialGradients(
-    const VSpace<Gradient, BackwardFunction>& vspace,
-    gtl::ArraySlice<int64> target_tensor_ids,
-    gtl::ArraySlice<Gradient*> output_gradients, const TensorTape& tensor_tape,
-    const OpTape<BackwardFunction>& op_tape,
-    const std::unordered_map<int64, int64>& tensor_usage_counts,
-    std::unordered_map<int64, std::vector<Gradient*>>* result) {
+Status InitialGradients(const VSpace<Gradient, BackwardFunction>& vspace,
+                        gtl::ArraySlice<int64> target_tensor_ids,
+                        gtl::ArraySlice<Gradient*> output_gradients,
+                        const TensorTape& tensor_tape,
+                        const OpTape<BackwardFunction>& op_tape,
+                        const gtl::FlatMap<int64, int64>& tensor_usage_counts,
+                        gtl::FlatMap<int64, std::vector<Gradient*>>* result) {
   for (int i = 0; i < target_tensor_ids.size(); ++i) {
     const int64 id = target_tensor_ids[i];
     if (tensor_usage_counts.find(id) != tensor_usage_counts.end()) {
@@ -441,13 +441,13 @@ Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
     gtl::ArraySlice<int64> source_tensor_ids,
     gtl::ArraySlice<Gradient*> output_gradients,
     std::vector<Gradient*>* result) {
-  std::unordered_set<int64> sources_set(source_tensor_ids.begin(),
-                                        source_tensor_ids.end());
+  gtl::FlatSet<int64> sources_set(source_tensor_ids.begin(),
+                                  source_tensor_ids.end());
   BackpropInitialState<BackwardFunction> state = PrepareBackprop(
       target_tensor_ids, tensor_tape_, &op_tape_, sources_set, persistent_);
   std::vector<int64> op_stack =
       InitialStack(state.op_tape, state.op_missing_tensor);
-  std::unordered_map<int64, std::vector<Gradient*>> gradients;
+  gtl::FlatMap<int64, std::vector<Gradient*>> gradients;
   Status s = InitialGradients(vspace, target_tensor_ids, output_gradients,
                               tensor_tape_, state.op_tape,
                               state.tensor_usage_counts, &gradients);
@@ -463,7 +463,7 @@ Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
     cleanup();
     return s;
   }
-  std::unordered_map<int64, int64> gradients_size;
+  gtl::FlatMap<int64, int64> gradients_size;
   // TODO(apassos) multiple threads could be dequeuing from op_stack at the same
   // time, for better CPU backprop performance.
   VLOG(1) << "Initial stack:";
@@ -472,11 +472,10 @@ Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
       VLOG(1) << "  " << t;
     }
   }
-  std::unordered_map<string, std::unordered_set<int>>
-      functions_accept_none_for_indices({
-          {"SoftmaxCrossEntropyWithLogits", {1}},
-          {"FusedBatchNorm", {1, 2, 3, 4}},
-      });
+  gtl::FlatMap<string, gtl::FlatSet<int>> functions_accept_none_for_indices({
+      {"SoftmaxCrossEntropyWithLogits", {1}},
+      {"FusedBatchNorm", {1, 2, 3, 4}},
+  });
   while (!op_stack.empty()) {
     const int64 op = op_stack.back();
     VLOG(1) << "Popped " << op;
@@ -491,6 +490,7 @@ Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
     state.op_tape.erase(op_it);
     std::vector<Gradient*> out_gradients;
     out_gradients.reserve(trace.output_tensor_info.size());
+    bool any_gradient_nonzero = false;
     for (int i = 0; i < trace.output_tensor_info.size(); ++i) {
       const int64 id = trace.output_tensor_info[i].id;
       auto grad_it = gradients.find(id);
@@ -506,6 +506,7 @@ Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
                            trace.output_tensor_info[i].dtype));
         }
       } else {
+        any_gradient_nonzero = true;
         out_gradients.push_back(vspace.AggregateGradients(grad_it->second));
         if (sources_set.find(grad_it->first) == sources_set.end()) {
           gradients.erase(grad_it);
@@ -513,14 +514,26 @@ Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
       }
     }
     std::vector<Gradient*> in_gradients;
-    Status s = vspace.CallBackwardFunction(trace.backward_function,
-                                           out_gradients, &in_gradients);
-    if (!persistent_) {
-      vspace.ReleaseBackwardFunction(trace.backward_function);
-    }
-    if (!s.ok()) {
-      cleanup();
-      return s;
+    if (any_gradient_nonzero) {
+      Status s = vspace.CallBackwardFunction(trace.backward_function,
+                                             out_gradients, &in_gradients);
+      if (!persistent_) {
+        vspace.ReleaseBackwardFunction(trace.backward_function);
+      }
+      if (!s.ok()) {
+        cleanup();
+        return s;
+      }
+    } else {
+      in_gradients.resize(trace.input_tensor_id.size());
+      if (!persistent_) {
+        vspace.ReleaseBackwardFunction(trace.backward_function);
+      }
+      for (Gradient* grad : out_gradients) {
+        if (grad != nullptr) {
+          vspace.DeleteGradient(grad);
+        }
+      }
     }
     VLOG(1) << "Got " << in_gradients.size() << " in_gradients for "
             << trace.input_tensor_id.size() << " sources";
diff --git a/tensorflow/c/python_api.cc b/tensorflow/c/python_api.cc
index ba5a9268b4f671499590d66fb41060dd18e1ce47..6e37cdb5f4beea53d4a2ded0705ae482d0bc2d68 100644
--- a/tensorflow/c/python_api.cc
+++ b/tensorflow/c/python_api.cc
@@ -22,6 +22,7 @@ namespace tensorflow {
 void AddControlInput(TF_Graph* graph, TF_Operation* op, TF_Operation* input) {
   mutex_lock l(graph->mu);
   graph->graph.AddControlEdge(&input->node, &op->node);
+  RecordMutation(graph, *op, "adding control input");
 }
 
 void SetAttr(TF_Graph* graph, TF_Operation* op, const char* attr_name,
@@ -36,11 +37,13 @@ void SetAttr(TF_Graph* graph, TF_Operation* op, const char* attr_name,
 
   mutex_lock l(graph->mu);
   op->node.AddAttr(attr_name, attr_val);
+  RecordMutation(graph, *op, "setting attribute");
 }
 
 void SetRequestedDevice(TF_Graph* graph, TF_Operation* op, const char* device) {
   mutex_lock l(graph->mu);
   op->node.set_requested_device(device);
+  RecordMutation(graph, *op, "setting device");
 }
 
 void UpdateEdge(TF_Graph* graph, TF_Output new_src, TF_Input dst,
@@ -75,6 +78,25 @@ void UpdateEdge(TF_Graph* graph, TF_Output new_src, TF_Input dst,
   }
   status->status = graph->graph.UpdateEdge(&new_src.oper->node, new_src.index,
                                            &dst.oper->node, dst.index);
+
+  if (status->status.ok()) {
+    // This modification only updates the destination node for
+    // the purposes of running this graph in a session. Thus, we don't
+    // record the source node as being modified.
+    RecordMutation(graph, *dst.oper, "updating input tensor");
+  }
+}
+
+void RemoveAllControlInputs(TF_Graph* graph, TF_Operation* op) {
+  mutex_lock l(graph->mu);
+  std::vector<const Edge*> control_edges;
+  for (const Edge* edge : op->node.in_edges()) {
+    if (!edge->IsControlEdge()) continue;
+    control_edges.push_back(edge);
+  }
+  for (const Edge* edge : control_edges) {
+    graph->graph.RemoveControlEdge(edge);
+  }
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/c/python_api.h b/tensorflow/c/python_api.h
index f54585b0a1034ff108202272a11416e34985959e..aa9d9e06b28c54cb8869eb547d36ee3cb0d4e6b8 100644
--- a/tensorflow/c/python_api.h
+++ b/tensorflow/c/python_api.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_C_PYTHON_API_H_
-#define THIRD_PARTY_TENSORFLOW_C_PYTHON_API_H_
+#ifndef TENSORFLOW_C_PYTHON_API_H_
+#define TENSORFLOW_C_PYTHON_API_H_
 
 #include "tensorflow/c/c_api.h"
 
@@ -35,6 +35,8 @@ void SetRequestedDevice(TF_Graph* graph, TF_Operation* op, const char* device);
 void UpdateEdge(TF_Graph* graph, TF_Output new_src, TF_Input dst,
                 TF_Status* status);
 
+void RemoveAllControlInputs(TF_Graph* graph, TF_Operation* op);
+
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_C_PYTHON_API_H_
+#endif  // TENSORFLOW_C_PYTHON_API_H_
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index e354831d7d25af83c068a68a4f844056263a598c..9060c19e9d2cf965c2b9be07be07c42017da45a8 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -421,7 +421,7 @@ tf_cc_test(
 
 tf_gen_op_wrappers_cc(
     name = "cc_ops",
-    api_def_srcs = ["//tensorflow/core:base_api_def"],
+    api_def_srcs = ["//tensorflow/core/api_def:base_api_def"],
     op_lib_names = [
         "array_ops",
         "audio_ops",
@@ -433,6 +433,7 @@ tf_gen_op_wrappers_cc(
         "linalg_ops",
         "logging_ops",
         "lookup_ops",
+        "manip_ops",
         "math_ops",
         "nn_ops",
         "no_op",
@@ -448,7 +449,6 @@ tf_gen_op_wrappers_cc(
         "ops/const_op.h",
         "ops/standard_ops.h",
     ],
-    override_file = "ops/op_gen_overrides.pbtxt",
     pkg = "//tensorflow/core",
 )
 
@@ -527,14 +527,13 @@ cc_library_with_android_deps(
     ],
     copts = tf_copts(),
     data = [
-        "//tensorflow/core:base_api_def",
+        "//tensorflow/core/api_def:base_api_def",
     ],
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:op_gen_lib",
-        "//tensorflow/core:op_gen_overrides_proto_cc",
         "//tensorflow/core:proto_text",
         "//tensorflow/core:protos_all_cc",
     ],
@@ -547,15 +546,11 @@ tf_cc_test(
         "framework/cc_op_gen.h",
         "framework/cc_op_gen_test.cc",
     ],
-    data = [
-        "//tensorflow/cc:ops/op_gen_overrides.pbtxt",
-    ],
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:op_gen_lib",
-        "//tensorflow/core:op_gen_overrides_proto_cc",
         "//tensorflow/core:proto_text",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
@@ -679,7 +674,6 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:tensorflow",
     ],
 )
 
diff --git a/tensorflow/cc/client/client_session_test.cc b/tensorflow/cc/client/client_session_test.cc
index dfbac9788e16e9c7c65abcd1ea213b51d5d5d060..ea5cf5a1f12be316cc6e0d0a02cd3caf4d177400 100644
--- a/tensorflow/cc/client/client_session_test.cc
+++ b/tensorflow/cc/client/client_session_test.cc
@@ -23,7 +23,13 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
-using namespace ops;  // NOLINT(build/namespaces)
+namespace {
+
+using ops::Add;
+using ops::Const;
+using ops::Mul;
+using ops::Placeholder;
+using ops::Sub;
 
 TEST(ClientSessionTest, Basic) {
   Scope root = Scope::NewRootScope();
@@ -89,4 +95,5 @@ TEST(ClientSessionTest, MultiThreaded) {
   test::ExpectTensorEqual<int>(outputs[0], test::AsTensor<int>({-1, 2}, {2}));
 }
 
-}  // end namespace tensorflow
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc
index d889c518f9c38a9f070970b37a2ad4b1fc26671b..a40ad1ffc3b262840e6ca0043139b1b61e04510d 100644
--- a/tensorflow/cc/framework/cc_op_gen.cc
+++ b/tensorflow/cc/framework/cc_op_gen.cc
@@ -1057,16 +1057,9 @@ string MakeInternal(const string& fname) {
 }  // namespace
 
 void WriteCCOps(const OpList& ops, const ApiDefMap& api_def_map,
-                const string& dot_h_fname, const string& dot_cc_fname,
-                const string& overrides_fnames) {
+                const string& dot_h_fname, const string& dot_cc_fname) {
   Env* env = Env::Default();
 
-  // Load the override map.
-  OpGenOverrideMap override_map;
-  if (!overrides_fnames.empty()) {
-    TF_CHECK_OK(override_map.LoadFileList(env, overrides_fnames));
-  }
-
   // Write the initial boilerplate to the .h and .cc files.
   std::unique_ptr<WritableFile> h = nullptr;
   std::unique_ptr<WritableFile> cc = nullptr;
diff --git a/tensorflow/cc/framework/cc_op_gen.h b/tensorflow/cc/framework/cc_op_gen.h
index cea28990144b9371e8009ce13f912b44044f9aac..c7256a7dc384e652fa1bddfe3aa9893491c2b14c 100644
--- a/tensorflow/cc/framework/cc_op_gen.h
+++ b/tensorflow/cc/framework/cc_op_gen.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_H_
-#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_H_
+#ifndef TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_H_
+#define TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_H_
 
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/op_gen_lib.h"
@@ -24,9 +24,8 @@ namespace tensorflow {
 
 /// Result is written to files dot_h and dot_cc.
 void WriteCCOps(const OpList& ops, const ApiDefMap& api_def_map,
-                const string& dot_h_fname, const string& dot_cc_fname,
-                const string& overrides_fnames);
+                const string& dot_h_fname, const string& dot_cc_fname);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_H_
+#endif  // TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_H_
diff --git a/tensorflow/cc/framework/cc_op_gen_main.cc b/tensorflow/cc/framework/cc_op_gen_main.cc
index 326d5668b8803ee39ffe24900c92e1db87b93601..3157792e15a006555e4924eea3c72ea643e79c1c 100644
--- a/tensorflow/cc/framework/cc_op_gen_main.cc
+++ b/tensorflow/cc/framework/cc_op_gen_main.cc
@@ -28,7 +28,7 @@ namespace tensorflow {
 namespace {
 
 void PrintAllCCOps(const std::string& dot_h, const std::string& dot_cc,
-                   const std::string& overrides_fnames, bool include_internal,
+                   bool include_internal,
                    const std::vector<string>& api_def_dirs) {
   OpList ops;
   OpRegistry::Global()->Export(include_internal, &ops);
@@ -49,7 +49,7 @@ void PrintAllCCOps(const std::string& dot_h, const std::string& dot_cc,
 
   api_def_map.UpdateDocs();
 
-  WriteCCOps(ops, api_def_map, dot_h, dot_cc, overrides_fnames);
+  WriteCCOps(ops, api_def_map, dot_h, dot_cc);
 }
 
 }  // namespace
@@ -57,24 +57,21 @@ void PrintAllCCOps(const std::string& dot_h, const std::string& dot_cc,
 
 int main(int argc, char* argv[]) {
   tensorflow::port::InitMain(argv[0], &argc, &argv);
-  // TODO(annarev): Update this file to no longer take op_gen_overrides.pbtxt
-  // as an argument.
-  if (argc != 6) {
+  if (argc != 5) {
     for (int i = 1; i < argc; ++i) {
       fprintf(stderr, "Arg %d = %s\n", i, argv[i]);
     }
     fprintf(stderr,
-            "Usage: %s out.h out.cc overrides1.pbtxt,2.pbtxt include_internal "
+            "Usage: %s out.h out.cc include_internal "
             "api_def_dirs1,api_def_dir2 ...\n"
             "  include_internal: 1 means include internal ops\n",
             argv[0]);
     exit(1);
   }
 
-  bool include_internal = tensorflow::StringPiece("1") == argv[4];
+  bool include_internal = tensorflow::StringPiece("1") == argv[3];
   std::vector<tensorflow::string> api_def_dirs = tensorflow::str_util::Split(
-      argv[5], ",", tensorflow::str_util::SkipEmpty());
-  tensorflow::PrintAllCCOps(argv[1], argv[2], argv[3], include_internal,
-                            api_def_dirs);
+      argv[4], ",", tensorflow::str_util::SkipEmpty());
+  tensorflow::PrintAllCCOps(argv[1], argv[2], include_internal, api_def_dirs);
   return 0;
 }
diff --git a/tensorflow/cc/framework/cc_op_gen_test.cc b/tensorflow/cc/framework/cc_op_gen_test.cc
index 0b7e720a5c7b343415eee1aa157b8de755a1e1a5..1e0f2d241bb350897a840dda90d6d0c009b1daad 100644
--- a/tensorflow/cc/framework/cc_op_gen_test.cc
+++ b/tensorflow/cc/framework/cc_op_gen_test.cc
@@ -24,10 +24,6 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-// TODO(annarev): Remove this op_gen_overrides.pbtxt reference.
-// It is needed only because WriteCCOps takes it as an argument.
-constexpr char kOverridesFnames[] =
-    "tensorflow/cc/ops/op_gen_overrides.pbtxt";
 constexpr char kBaseOpDef[] = R"(
 op {
   name: "Foo"
@@ -96,7 +92,7 @@ void GenerateCcOpFiles(Env* env, const OpList& ops,
   const auto internal_h_file_path = io::JoinPath(tmpdir, "test_internal.h");
   const auto internal_cc_file_path = io::JoinPath(tmpdir, "test_internal.cc");
 
-  WriteCCOps(ops, api_def_map, h_file_path, cc_file_path, kOverridesFnames);
+  WriteCCOps(ops, api_def_map, h_file_path, cc_file_path);
 
   TF_ASSERT_OK(ReadFileToString(env, h_file_path, h_file_text));
   TF_ASSERT_OK(
diff --git a/tensorflow/cc/framework/cc_ops_test.cc b/tensorflow/cc/framework/cc_ops_test.cc
index 5da23036eaadbef270ba839357dc4613bf3bf490..ac05e3cf95b1ce4009ee1424713baf2d34902a94 100644
--- a/tensorflow/cc/framework/cc_ops_test.cc
+++ b/tensorflow/cc/framework/cc_ops_test.cc
@@ -22,8 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace tensorflow {
-using namespace ops;  // NOLINT(build/namespaces)
-
+namespace ops {
 namespace {
 
 Output Linear(const Scope& scope, Input x, Input w, Input b) {
@@ -39,8 +38,6 @@ void GetColocationConstraints(const Output& tensor,
                            constraints));
 }
 
-}  // namespace
-
 TEST(CCOpTest, Basic) {
   Scope root = Scope::NewRootScope();
   auto c = Const(root, {{1, 1}});
@@ -249,4 +246,6 @@ TEST(CCOpTest, InvalidFinalize) {
             string::npos);
 }
 
+}  // namespace
+}  // namespace ops
 }  // namespace tensorflow
diff --git a/tensorflow/cc/framework/grad_op_registry.h b/tensorflow/cc/framework/grad_op_registry.h
index 190b96f68506c6b5252d6c0184f1712310477a8a..0fc5abb20c884a66539682099497e2c8511a620f 100644
--- a/tensorflow/cc/framework/grad_op_registry.h
+++ b/tensorflow/cc/framework/grad_op_registry.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_
-#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_
+#ifndef TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_
+#define TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_
 
 #include <unordered_map>
 
@@ -72,4 +72,4 @@ class GradOpRegistry {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_
+#endif  // TENSORFLOW_CC_FRAMEWORK_GRAD_OP_REGISTRY_H_
diff --git a/tensorflow/cc/framework/gradient_checker.h b/tensorflow/cc/framework/gradient_checker.h
index d055c60d09c2f33fb1f61165f75b2d04618620b7..1aa215a9088335580667e0c23c7244e6e5047f1a 100644
--- a/tensorflow/cc/framework/gradient_checker.h
+++ b/tensorflow/cc/framework/gradient_checker.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRADIENT_CHECKER_H_
-#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRADIENT_CHECKER_H_
+#ifndef TENSORFLOW_CC_FRAMEWORK_GRADIENT_CHECKER_H_
+#define TENSORFLOW_CC_FRAMEWORK_GRADIENT_CHECKER_H_
 
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
@@ -60,4 +60,4 @@ Status ComputeGradientError(const Scope& scope, const Output& x,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRADIENT_CHECKER_H_
+#endif  // TENSORFLOW_CC_FRAMEWORK_GRADIENT_CHECKER_H_
diff --git a/tensorflow/cc/framework/gradient_checker_test.cc b/tensorflow/cc/framework/gradient_checker_test.cc
index fdc457f40af875d7c0c243246755d0cb87c44a62..d4f0a7f5ab3716be41e22c02a21aca028f76fb88 100644
--- a/tensorflow/cc/framework/gradient_checker_test.cc
+++ b/tensorflow/cc/framework/gradient_checker_test.cc
@@ -24,10 +24,18 @@ limitations under the License.
 #include "tensorflow/core/util/equal_graph_def.h"
 
 namespace tensorflow {
-using namespace ops;  // NOLINT(build/namespaces)
-
 namespace {
 
+using ops::Complex;
+using ops::Const;
+using ops::MatMul;
+using ops::Placeholder;
+using ops::Real;
+using ops::Split;
+using ops::Square;
+using ops::Stack;
+using ops::Unstack;
+
 TEST(GradientCheckerTest, BasicFloat) {
   Scope scope = Scope::NewRootScope();
   TensorShape shape({2, 4, 3});
diff --git a/tensorflow/cc/framework/gradients.h b/tensorflow/cc/framework/gradients.h
index 717f6f0636d3dd1a546ef7477b100bbfc86ba13d..0a377ad56d139a6ec26ea97b4e1e43495d0b3165 100644
--- a/tensorflow/cc/framework/gradients.h
+++ b/tensorflow/cc/framework/gradients.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRADIENTS_H_
-#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRADIENTS_H_
+#ifndef TENSORFLOW_CC_FRAMEWORK_GRADIENTS_H_
+#define TENSORFLOW_CC_FRAMEWORK_GRADIENTS_H_
 
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
@@ -49,4 +49,4 @@ Output NoGradient();
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_GRADIENTS_H_
+#endif  // TENSORFLOW_CC_FRAMEWORK_GRADIENTS_H_
diff --git a/tensorflow/cc/framework/gradients_test.cc b/tensorflow/cc/framework/gradients_test.cc
index 07a062e704ed6ffc6389b5897309957a1bfcd1c2..26e3170ad8e4f4fba1c2dc014086acf24d949f72 100644
--- a/tensorflow/cc/framework/gradients_test.cc
+++ b/tensorflow/cc/framework/gradients_test.cc
@@ -26,10 +26,20 @@ limitations under the License.
 #include "tensorflow/core/util/equal_graph_def.h"
 
 namespace tensorflow {
-using namespace ops;  // NOLINT(build/namespaces)
-
 namespace {
 
+using ops::Assign;
+using ops::Const;
+using ops::Identity;
+using ops::MatMul;
+using ops::OnesLike;
+using ops::Placeholder;
+using ops::Square;
+using ops::Stack;
+using ops::StopGradient;
+using ops::Unstack;
+using ops::Variable;
+
 // TODO(andydavis) Add more unit tests once more gradient functions are ported.
 class GradientsTest : public ::testing::Test {
  protected:
diff --git a/tensorflow/cc/framework/ops.h b/tensorflow/cc/framework/ops.h
index 8d4154220c4b18f9286094b10c1b1e96eb4e31e7..a085e1d6e2de5ad63d11eb8979ae64c26b91366f 100644
--- a/tensorflow/cc/framework/ops.h
+++ b/tensorflow/cc/framework/ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_OPS_H_
-#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_OPS_H_
+#ifndef TENSORFLOW_CC_FRAMEWORK_OPS_H_
+#define TENSORFLOW_CC_FRAMEWORK_OPS_H_
 
 #include <type_traits>
 
@@ -296,4 +296,4 @@ class InputList {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_OPS_H_
+#endif  // TENSORFLOW_CC_FRAMEWORK_OPS_H_
diff --git a/tensorflow/cc/framework/scope.h b/tensorflow/cc/framework/scope.h
index 0225ac047291d6297af558fddad6e5315389ff40..30c32bd44b0f22d6b29dd3836d431807d0216818 100644
--- a/tensorflow/cc/framework/scope.h
+++ b/tensorflow/cc/framework/scope.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_SCOPE_H_
-#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_SCOPE_H_
+#ifndef TENSORFLOW_CC_FRAMEWORK_SCOPE_H_
+#define TENSORFLOW_CC_FRAMEWORK_SCOPE_H_
 
 #include <memory>
 #include <string>
@@ -242,4 +242,4 @@ struct CompositeOpScopes {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_SCOPE_H_
+#endif  // TENSORFLOW_CC_FRAMEWORK_SCOPE_H_
diff --git a/tensorflow/cc/framework/scope_internal.h b/tensorflow/cc/framework/scope_internal.h
index 968c366550ef6f46557cd9b5662d9d0719b31531..8efcfed20d0b86d86d8c20a3d8630c7c6bc909c3 100644
--- a/tensorflow/cc/framework/scope_internal.h
+++ b/tensorflow/cc/framework/scope_internal.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_SCOPE_INTERNAL_H_
-#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_SCOPE_INTERNAL_H_
+#ifndef TENSORFLOW_CC_FRAMEWORK_SCOPE_INTERNAL_H_
+#define TENSORFLOW_CC_FRAMEWORK_SCOPE_INTERNAL_H_
 
 #include "tensorflow/cc/framework/scope.h"
 
@@ -117,4 +117,4 @@ class Scope::Impl {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_SCOPE_INTERNAL_H_
+#endif  // TENSORFLOW_CC_FRAMEWORK_SCOPE_INTERNAL_H_
diff --git a/tensorflow/cc/framework/testutil.h b/tensorflow/cc/framework/testutil.h
index a3e19870ec847bcd4f0e0bf0e71dda724024d5d2..7ad6fb4a676639f5d6d3da6a7c08de1894162f0c 100644
--- a/tensorflow/cc/framework/testutil.h
+++ b/tensorflow/cc/framework/testutil.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_TESTUTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_TESTUTIL_H_
+#ifndef TENSORFLOW_CC_FRAMEWORK_TESTUTIL_H_
+#define TENSORFLOW_CC_FRAMEWORK_TESTUTIL_H_
 
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
@@ -44,4 +44,4 @@ void GetTensor(const Scope& scope, const std::vector<Output>& assign_vars,
 }  // namespace test
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_TESTUTIL_H_
+#endif  // TENSORFLOW_CC_FRAMEWORK_TESTUTIL_H_
diff --git a/tensorflow/cc/framework/while_gradients.h b/tensorflow/cc/framework/while_gradients.h
index 8f592accc93573cb8953a5ab25c04881ca0c2333..cb4e579c8548294ec45b0c3f42cb844e0b87c390 100644
--- a/tensorflow/cc/framework/while_gradients.h
+++ b/tensorflow/cc/framework/while_gradients.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_WHILE_GRADIENTS_H_
-#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_WHILE_GRADIENTS_H_
+#ifndef TENSORFLOW_CC_FRAMEWORK_WHILE_GRADIENTS_H_
+#define TENSORFLOW_CC_FRAMEWORK_WHILE_GRADIENTS_H_
 
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
@@ -37,4 +37,4 @@ Status AddWhileLoopGradient(WhileContext* while_ctx, const Scope& scope,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_WHILE_GRADIENTS_H_
+#endif  // TENSORFLOW_CC_FRAMEWORK_WHILE_GRADIENTS_H_
diff --git a/tensorflow/cc/gradients/array_grad_test.cc b/tensorflow/cc/gradients/array_grad_test.cc
index 455d7330c10cf230462869475f25a1f1b9bf9e9e..4a215fcc9299cf8b8da04cbf151640631ed0d449 100644
--- a/tensorflow/cc/gradients/array_grad_test.cc
+++ b/tensorflow/cc/gradients/array_grad_test.cc
@@ -23,11 +23,11 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace tensorflow {
+namespace {
+
 using namespace ops;  // NOLINT(build/namespaces)
 using ops::internal::MirrorPadGrad;
 
-namespace {
-
 class ArrayGradTest : public ::testing::Test {
  protected:
   ArrayGradTest() : scope_(Scope::NewRootScope()) {}
diff --git a/tensorflow/cc/gradients/data_flow_grad_test.cc b/tensorflow/cc/gradients/data_flow_grad_test.cc
index 734dfd3af97b856a7c8c4894c4a6d1a3ade10992..0ba3c0e27b1e545a30925ea3ef9e2c54dc9d0ae9 100644
--- a/tensorflow/cc/gradients/data_flow_grad_test.cc
+++ b/tensorflow/cc/gradients/data_flow_grad_test.cc
@@ -23,10 +23,13 @@ limitations under the License.
 #include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
-using namespace ops;  // NOLINT(build/namespaces)
-
 namespace {
 
+using ops::Const;
+using ops::DynamicPartition;
+using ops::DynamicStitch;
+using ops::Placeholder;
+
 class DataFlowGradTest : public ::testing::Test {
  protected:
   DataFlowGradTest() : scope_(Scope::NewRootScope()) {}
diff --git a/tensorflow/cc/gradients/grad_testutil.cc b/tensorflow/cc/gradients/grad_testutil.cc
index 04b29d4e8b21eeee200d9e7390868d701eda3c22..304117d3719346202d3a8a18637f7c915d4a47f9 100644
--- a/tensorflow/cc/gradients/grad_testutil.cc
+++ b/tensorflow/cc/gradients/grad_testutil.cc
@@ -18,16 +18,14 @@ limitations under the License.
 #include "tensorflow/cc/framework/grad_op_registry.h"
 
 namespace tensorflow {
-using namespace ops;  // NOLINT(build/namespaces)
-
 namespace test {
 
 Status CallGradFunction(const Scope& scope, const Operation& op,
                         const std::vector<Output>& grad_inputs,
                         std::vector<Output>* grad_outputs) {
-  GradFunc grad_fn;
-  TF_RETURN_IF_ERROR(
-      GradOpRegistry::Global()->Lookup(op.node()->type_string(), &grad_fn));
+  ops::GradFunc grad_fn;
+  TF_RETURN_IF_ERROR(ops::GradOpRegistry::Global()->Lookup(
+      op.node()->type_string(), &grad_fn));
   TF_RETURN_IF_ERROR(grad_fn(scope, op, grad_inputs, grad_outputs));
   TF_RETURN_IF_ERROR(scope.status());
   return Status::OK();
diff --git a/tensorflow/cc/gradients/grad_testutil.h b/tensorflow/cc/gradients/grad_testutil.h
index d31f412754ff59cc7782b14e285071a8d4218d08..70c81f1a73a394322c602a5c51e3c2a40aca2397 100644
--- a/tensorflow/cc/gradients/grad_testutil.h
+++ b/tensorflow/cc/gradients/grad_testutil.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_GRADIENTS_GRAD_TESTUTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CC_GRADIENTS_GRAD_TESTUTIL_H_
+#ifndef TENSORFLOW_CC_GRADIENTS_GRAD_TESTUTIL_H_
+#define TENSORFLOW_CC_GRADIENTS_GRAD_TESTUTIL_H_
 
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
@@ -32,4 +32,4 @@ Status CallGradFunction(const Scope& scope, const Operation& op,
 }  // namespace test
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_GRADIENTS_GRAD_TESTUTIL_H_
+#endif  // TENSORFLOW_CC_GRADIENTS_GRAD_TESTUTIL_H_
diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc
index d7446b9560fd7dc8377ea3710641906b274313a9..52c177212a8c88f1857defcc38de4a01ac47dab0 100644
--- a/tensorflow/cc/gradients/math_grad.cc
+++ b/tensorflow/cc/gradients/math_grad.cc
@@ -473,6 +473,41 @@ Status AddNGrad(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("AddN", AddNGrad);
 
+Status PowGrad(const Scope& scope, const Operation& op,
+               const std::vector<Output>& grad_inputs,
+               std::vector<Output>* grad_outputs) {
+  auto x = ConjugateHelper(scope, op.input(0));
+  auto y = ConjugateHelper(scope, op.input(1));
+  auto z = ConjugateHelper(scope, op.output(0));
+  auto grad = grad_inputs[0];
+  // grad * y * pow(x, y - 1)
+  auto one = Cast(scope, Const(scope, 1.0), y.type());
+  auto gx_1 = Mul(scope,
+                  Mul(scope, grad, y),
+                  Pow(scope, x, Sub(scope, y, one)));
+  // Avoid false singularity at x = 0
+  DataType x_dtype = x.type();
+  auto zero = Cast(scope, Const(scope, 0.0), x_dtype);
+  if (x_dtype == DT_COMPLEX64 || x_dtype == DT_COMPLEX128) {
+    // real(x) < 0 is fine for the complex case
+    auto log_x = Where3(scope,
+                        NotEqual(scope, x, zero),
+                        Log(scope, x),
+                        ZerosLike(scope, x));
+    auto gy_1 = Mul(scope, Mul(scope, grad, z), log_x);
+    return BinaryGradCommon(scope, op, grad_outputs, gx_1, gy_1);
+  } else {
+    // There's no sensible real value to return if x < 0, so return 0
+    auto log_x = Where3(scope,
+                        Greater(scope, x, zero),
+                        Log(scope, x),
+                        ZerosLike(scope, x));
+    auto gy_1 = Mul(scope, Mul(scope, grad, z), log_x);
+    return BinaryGradCommon(scope, op, grad_outputs, gx_1, gy_1);
+  }
+}
+REGISTER_GRADIENT_OP("Pow", PowGrad);
+
 // MaximumMinimumGradCommon adds shared ops to calculate gradients for
 // the binary Maximum and Minimum ops.
 Status MaximumMinimumGradCommon(const Scope& scope, const Operation& op,
@@ -794,6 +829,183 @@ Status MinOrMaxGrad(const Scope& scope, const Operation& op,
 REGISTER_GRADIENT_OP("Min", MinOrMaxGrad);
 REGISTER_GRADIENT_OP("Max", MinOrMaxGrad);
 
+Status ProdGrad(const Scope& scope, const Operation& op,
+                const std::vector<Output>& grad_inputs,
+                std::vector<Output>* grad_outputs) {
+  auto zero = Const(scope, 0);
+  auto one = Const(scope, 1);
+
+  // The gradient can be expressed by dividing the product by each entry of
+  // the input tensor. If our input is
+  // [
+  //  [3, 4],
+  //  [5, 6],
+  //  [7, 8]
+  // ]
+  // and we do a Prod operation on the axis 1, we will obtain [[105, 192]].
+  // The gradient will have the same shape as the input
+  //     [
+  //       [105/3, 192/4],
+  // dz *  [105/5, 192/6],
+  //       [105/7, 192/6]
+  //     ]
+  // If the input contains a zero, the division is impossible but
+  // if we take the calculation that gave the first gradient
+  // (3 * 5 * 6)/3 is equal to 5 * 6
+  // the trick will be to cumprod the elements on the axis without
+  // the element at the current position (3 in the example above).
+  // We will take as example:
+  // [
+  //   [
+  //     [3.0, 4.0],
+  //     [5.0, 6.0],
+  //     [7.0, 8.0]
+  //   ],
+  //   [
+  //     [3.0, 5.0],
+  //     [0.0, 6.0],
+  //     [5.0, 6.0]
+  //   ]
+  // ]
+
+  // [2, 3, 2]
+  auto input_shape = Shape(scope, op.input(0));
+
+  // The Reshape with -1 flattens the reduction indices.
+  // [1]
+  auto reduction_indices = Reshape(scope, op.input(1), {-1});
+
+  // [2, 1, 2]
+  auto output_shape_kept_dims =
+      ReducedShapeHelper(scope, input_shape, reduction_indices);
+
+  // [1, 3, 1]
+  auto tile_scaling = SafeDivHelper(scope, input_shape, output_shape_kept_dims);
+
+  // [[[105, 192]], [[0, 180]]]
+  auto grad = Reshape(scope, grad_inputs[0], output_shape_kept_dims);
+
+  // [[[105, 192], [105, 192], [105, 192]], [[0, 180], [0, 180], [0, 180]]]
+  auto grad_tiled = Tile(scope, grad, tile_scaling);
+
+  Scope cpu_scope = scope.WithDevice("/cpu:0");
+
+  // [3]
+  auto rank = Rank(cpu_scope, op.input(0));
+
+
+  // Normalize any negative indices in the reduction_axes to positive values.
+  auto reduction_indices_pos = Mod(cpu_scope, Add(cpu_scope, reduction_indices, rank), rank);
+
+  // [1]
+  auto reduced = Cast(cpu_scope, reduction_indices_pos, DataType::DT_INT32);
+
+  // [0, 1, 2]
+  auto idx = Range(cpu_scope, zero, rank, one);
+
+  // [0, 2]
+  auto other = SetDiff1D(cpu_scope, idx, reduced).out;
+
+  // [1, 0, 2]
+  auto perm =
+      Concat(cpu_scope, std::initializer_list<Input>{reduced, other}, 0);
+
+  // 3 => [3]
+  auto reduced_num = Prod(cpu_scope, Gather(scope, input_shape, reduced), 0);
+
+  // 2 * 2 => [2]
+  auto other_num = Prod(cpu_scope, Gather(scope, input_shape, other), 0);
+
+  // [
+  //    [
+  //       [ 3.,  4.],
+  //       [ 3.,  5.]
+  //   ],
+  //   [
+  //       [ 5.,  6.],
+  //       [ 0.,  6.]
+  //   ],
+  //   [
+  //       [ 7.,  8.],
+  //       [ 5.,  6.]
+  //   ]
+  // ]
+  auto permuted = Transpose(scope, op.input(0), perm);
+
+  // [3, 2, 2]
+  auto permuted_shape = Shape(scope, permuted);
+
+  // [
+  //   [ 3.,  4.,  3.,  5.],
+  //   [ 5.,  6.,  0.,  6.],
+  //   [ 7.,  8.,  5.,  6.]
+  // ]
+  auto reshaped = Reshape(
+      scope, permuted,
+      Stack(scope, std::initializer_list<Input>{reduced_num, other_num}));
+
+  // [
+  //   [ 1.,  1.,  1.,  1.],
+  //   [ 3.,  4.,  3.,  5.],
+  //   [ 15.,  24.,  0.,  30.]
+  // ]
+  auto left = Cumprod(scope, reshaped, zero, Cumprod::Exclusive(true));
+
+  // [
+  //   [ 35.,  48.,  0.,  36.],
+  //   [  7.,   8.,   5.,   6.],
+  //   [  1.,   1.,   1.,   1.]
+  // ]
+  auto right =
+      Cumprod(scope, reshaped, zero, Cumprod::Exclusive(true).Reverse(true));
+
+  // left * right =
+  // [
+  //   [ 35.,  48.,  0.,  36.],
+  //   [ 21.,  32.,  15.,  30.],
+  //   [ 15.,  24.,  0.,  30.]
+  // ]
+  // y =
+  // [
+  //   [
+  //     [ 35.,  48.],
+  //     [ 0.,  36.]
+  //   ],
+  //   [
+  //     [ 21.,  32.],
+  //     [ 15.,  30.]
+  //   ],
+  //   [
+  //     [ 15.,  24.],
+  //     [ 0.,  30.]
+  //   ]
+  // ]
+  auto y = Reshape(scope, Mul(scope, left, right), permuted_shape);
+
+  // out = 
+  // [
+  //   [
+  //     [ 35.,  48.],
+  //     [ 21.,  32.],
+  //     [ 15.,  24.]
+  //   ],
+  //   [
+  //     [ 0.,   36.],
+  //     [ 15.,  30.],
+  //     [ 0.,  30.]
+  //   ]
+  // ]
+  auto out =
+      Mul(scope, grad_tiled, Transpose(scope, y, InvertPermutation(scope, perm)));
+
+  grad_outputs->push_back(Reshape(scope, out, input_shape));
+
+  // stop propagation along reduction_indices
+  grad_outputs->push_back(NoGradient());
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("Prod", ProdGrad);
+
 // MatMulGrad helper function used to compute two MatMul operations
 // based on input matrix transposition combinations.
 Status MatMulGradHelper(const Scope& scope, const bool is_batch,
diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc
index 6313f41da5e5f9cf88be4c8a84408a8df77f0e25..1b4c7c2688083e74433da3dce2849b8c37443684 100644
--- a/tensorflow/cc/gradients/math_grad_test.cc
+++ b/tensorflow/cc/gradients/math_grad_test.cc
@@ -23,10 +23,31 @@ limitations under the License.
 #include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
-using namespace ops;  // NOLINT(build/namespaces)
-
 namespace {
 
+using ops::Abs;
+using ops::Add;
+using ops::AddN;
+using ops::BatchMatMul;
+using ops::Const;
+using ops::Div;
+using ops::Greater;
+using ops::MatMul;
+using ops::Max;
+using ops::Maximum;
+using ops::Mean;
+using ops::Min;
+using ops::Minimum;
+using ops::Mul;
+using ops::Placeholder;
+using ops::Pow;
+using ops::Prod;
+using ops::RealDiv;
+using ops::SquaredDifference;
+using ops::Sub;
+using ops::Sum;
+using ops::Where3;
+
 // TODO(andydavis) Test gradient function against numeric gradients output.
 // TODO(andydavis) As more gradients are added move common test functions
 // to a testutil library.
@@ -83,6 +104,7 @@ class CWiseUnaryGradTest : public ::testing::Test {
 
     Output y;
     switch (op_type) {
+      using namespace ops;  // NOLINT(build/namespaces)
       case ABS:
         y = Abs(scope_, x);
         break;
@@ -843,6 +865,14 @@ TEST_F(NaryGradTest, SquaredDifference) {
   RunTest({x1, x2}, {x1_shape, x2_shape}, {y}, {x1_shape});
 }
 
+TEST_F(NaryGradTest, Pow) {
+  TensorShape shape({3});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
+  // fix exponent to avoid overflow
+  auto y = Pow(scope_, x, Const(scope_, {1.f, 2.f, 3.f}));
+  RunTest({x}, {shape}, {y}, {shape});
+}
+
 TEST_F(NaryGradTest, Maximum) {
   TensorShape shape({3, 2});
   auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
@@ -865,5 +895,14 @@ TEST_F(NaryGradTest, Minimum) {
   RunTest(x, x_init_value, y, shape);
 }
 
+TEST_F(NaryGradTest, Prod) {
+  TensorShape x_shape({2, 3, 2});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
+  auto y = Prod(scope_, x, {1});
+  // y's shape is the result of reducing x along axes 1
+  TensorShape y_shape({2, 1, 2});
+  RunTest({x}, {x_shape}, {y}, {y_shape});
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc
index f9063e836509669d81d03b1d2f0d32d1166b6eca..0cfe5f6e3c49f7c4a3cafbf48ff4e54a0ffd0d47 100644
--- a/tensorflow/cc/gradients/nn_grad_test.cc
+++ b/tensorflow/cc/gradients/nn_grad_test.cc
@@ -23,10 +23,22 @@ limitations under the License.
 #include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
-using namespace ops;  // NOLINT(build/namespaces)
-
 namespace {
 
+using ops::BiasAdd;
+using ops::Conv2D;
+using ops::Elu;
+using ops::L2Loss;
+using ops::LogSoftmax;
+using ops::LRN;
+using ops::MaxPool;
+using ops::MaxPoolV2;
+using ops::Placeholder;
+using ops::Relu;
+using ops::Relu6;
+using ops::Selu;
+using ops::Softmax;
+
 class NNGradTest : public ::testing::Test {
  protected:
   NNGradTest() : scope_(Scope::NewRootScope()) {}
diff --git a/tensorflow/cc/ops/const_op.h b/tensorflow/cc/ops/const_op.h
index d11fda475b3db58bf83cdb94079c8fde8d1170f7..424a683665f31b5e25eeceeb40477fc31640ce90 100644
--- a/tensorflow/cc/ops/const_op.h
+++ b/tensorflow/cc/ops/const_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_OPS_CONST_OP_H_
-#define THIRD_PARTY_TENSORFLOW_CC_OPS_CONST_OP_H_
+#ifndef TENSORFLOW_CC_OPS_CONST_OP_H_
+#define TENSORFLOW_CC_OPS_CONST_OP_H_
 
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
@@ -82,4 +82,4 @@ std::vector<NodeBuilder::NodeOut> AsNodeOutList(const Scope& scope,
 }  // namespace ops
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_OPS_CONST_OP_H_
+#endif  // TENSORFLOW_CC_OPS_CONST_OP_H_
diff --git a/tensorflow/cc/ops/op_gen_overrides.pbtxt b/tensorflow/cc/ops/op_gen_overrides.pbtxt
deleted file mode 100644
index 4aac990e748b0a79cbc3b353b4121a582b0883b0..0000000000000000000000000000000000000000
--- a/tensorflow/cc/ops/op_gen_overrides.pbtxt
+++ /dev/null
@@ -1,238 +0,0 @@
-# array_ops
-op { name: "BroadcastArgs" rename_to: "BroadcastDynamicShape" }
-op { name: "BroadcastGradientArgs" hide: true }
-op { name: "ConcatOffset" skip: true }  # Maybe should just be hidden?
-op { name: "Concat" skip: true }
-op { name: "ConcatV2" rename_to: "Concat" }
-op { name: "ExpandDims" input_rename: { from: "dim" to: "axis" } }
-op { name: "ListDiff" rename_to: "SetDiff1D" }
-op { name: "MirrorPadGrad" hide: true }
-op { name: "Reverse" skip: true }
-op { name: "ReverseV2" rename_to: "Reverse" }
-op { name: "Split" input_rename: { from: "split_dim" to: "axis" } }
-op { name: "SplitV" input_rename: { from: "split_dim" to: "axis" } }
-op { name: "Squeeze" attr_rename: { from: "squeeze_dims" to: "axis" } }
-op { name: "Pack" rename_to: "Stack" }
-op { name: "Unpack" rename_to: "Unstack" }
-op { name: "Select" rename_to: "Where3" input_rename: { from: "t" to: "x" } input_rename: { from: "e" to: "y" } }
-op { name: "Where" input_rename: { from: "input" to: "condition" } }
-
-
-# candidate_sampling_ops
-op { name: "ThreadUnsafeUnigramCandidateSampler", skip: true }
-
-# control_flow_ops
-# TODO(joshl): Hide Switch and Merge once we write and migrate users to
-# a Cond() API.
-#op { name: "Switch" hide: true }
-#op { name: "Merge" hide: true }
-op { name: "RefMerge" hide: true }
-op { name: "Exit" hide: true }
-op { name: "RefExit" hide: true }
-op { name: "Enter" hide: true }
-op { name: "RefEnter" hide: true }
-op { name: "RefIdentity" hide: true }
-
-# ctc_ops
-
-# data_flow_ops
-op { name: "FakeQueue" skip: true }
-op { name: "FIFOQueue" skip: true}
-op { name: "FIFOQueueV2" rename_to: "FIFOQueue" }
-op { name: "PaddingFIFOQueue" skip: true }
-op { name: "PaddingFIFOQueueV2" rename_to: "PaddingFIFOQueue" }
-op { name: "PriorityQueue" skip: true }
-op { name: "PriorityQueueV2" rename_to: "PriorityQueue" }
-op { name: "QueueClose" skip: true }
-op { name: "QueueCloseV2" rename_to: "QueueClose" }
-op { name: "QueueDequeue" skip: true }
-op { name: "QueueDequeueV2" rename_to: "QueueDequeue" }
-op { name: "QueueDequeueMany" skip: true }
-op { name: "QueueDequeueManyV2" rename_to: "QueueDequeueMany" }
-op { name: "QueueDequeueUpTo" skip: true }
-op { name: "QueueDequeueUpToV2" rename_to: "QueueDequeueUpTo" }
-op { name: "QueueEnqueue" skip: true }
-op { name: "QueueEnqueueV2" rename_to: "QueueEnqueue" }
-op { name: "QueueEnqueueMany" skip: true }
-op { name: "QueueEnqueueManyV2" rename_to: "QueueEnqueueMany" }
-op { name: "QueueSize" skip: true }
-op { name: "QueueSizeV2" rename_to: "QueueSize" }
-op { name: "RandomShuffleQueue" skip: true }
-op { name: "RandomShuffleQueueV2" rename_to: "RandomShuffleQueue" }
-op { name: "ReaderNumRecordsProduced" skip: true }
-op { name: "ReaderNumRecordsProducedV2" rename_to: "ReaderNumRecordsProduced" }
-op { name: "ReaderNumWorkUnitsCompleted" skip: true }
-op { name: "ReaderNumWorkUnitsCompletedV2" rename_to: "ReaderNumWorkUnitsCompleted" }
-op { name: "ReaderRead" skip: true }
-op { name: "ReaderReadUpTo" skip: true }
-op { name: "ReaderReadUpToV2" rename_to: "ReaderReadUpTo" }
-op { name: "ReaderReadV2" rename_to: "ReaderRead" }
-op { name: "ReaderReset" skip: true }
-op { name: "ReaderResetV2" rename_to: "ReaderReset" }
-op { name: "ReaderRestoreState" skip: true }
-op { name: "ReaderRestoreStateV2" rename_to: "ReaderRestoreState" }
-op { name: "ReaderSerializeState" skip: true }
-op { name: "ReaderSerializeStateV2" rename_to: "ReaderSerializeState" }
-op { name: "FixedLengthRecordReader" skip: true }
-op { name: "FixedLengthRecordReaderV2" rename_to: "FixedLengthRecordReader" }
-op { name: "IdentityReader" skip: true }
-op { name: "IdentityReaderV2" rename_to: "IdentityReader" }
-op { name: "TFRecordReader" skip: true }
-op { name: "TFRecordReaderV2" rename_to: "TFRecordReader" }
-op { name: "TextLineReader" skip: true }
-op { name: "TextLineReaderV2" rename_to: "TextLineReader" }
-
-# Skip hash table ops until we have better support in C++ (ops are currently
-# only used in contrib)
-op { name: "HashTable" skip: true }
-op { name: "InitializeTable" skip: true }
-op { name: "InitializeTableFromTextFile" skip: true }
-op { name: "LookupTableFind" skip: true }
-op { name: "LookupTableImport" skip: true }
-op { name: "LookupTableInsert" skip: true }
-op { name: "LookupTableSize" skip: true }
-op { name: "MutableDenseHashTable" skip: true }
-op { name: "MutableHashTable" skip: true }
-op { name: "MutableHashTableOfTensors" skip: true }
-
-# Stack ops are internal to control flow gradients (not yet implemented in C++)
-op { name: "Stack" skip: true }
-op { name: "StackClose" skip: true }
-op { name: "StackPop" skip: true }
-op { name: "StackPush" skip: true }
-op { name: "StackV2" skip: true }
-op { name: "StackCloseV2" skip: true }
-op { name: "StackPopV2" skip: true }
-op { name: "StackPushV2" skip: true }
-
-op { name: "TensorArrayCloseV2" skip: true }
-op { name: "TensorArrayCloseV3" rename_to: "TensorArrayClose" }
-op { name: "TensorArrayConcatV2" skip: true }
-op { name: "TensorArrayConcatV3" rename_to: "TensorArrayConcat" }
-op { name: "TensorArrayGatherV2" skip: true }
-op { name: "TensorArrayGatherV3" rename_to: "TensorArrayGather" }
-op { name: "TensorArrayGradV2" skip: true }
-op { name: "TensorArrayGradV3" rename_to: "TensorArrayGrad" }
-op { name: "TensorArrayReadV2" skip: true }
-op { name: "TensorArrayReadV3" rename_to: "TensorArrayRead" }
-op { name: "TensorArrayScatterV2" skip: true }
-op { name: "TensorArrayScatterV3" rename_to: "TensorArrayScatter" }
-op { name: "TensorArraySizeV2" skip: true }
-op { name: "TensorArraySizeV3" rename_to: "TensorArraySize" }
-op { name: "TensorArraySplitV2" skip: true }
-op { name: "TensorArraySplitV3" rename_to: "TensorArraySplit" }
-op { name: "TensorArrayV2" skip: true }
-op { name: "TensorArrayV3" rename_to: "TensorArray" }
-op { name: "TensorArrayWriteV2" skip: true }
-op { name: "TensorArrayWriteV3" rename_to: "TensorArrayWrite" }
-
-op { name: "WholeFileReader" skip: true }
-op { name: "WholeFileReaderV2" rename_to: "WholeFileReader" }
-
-# functional_ops
-
-# image_ops
-op { name: "AdjustContrastv2" rename_to: "AdjustContrast" }
-op { name: "ResizeBilinearGrad" hide: true }
-op { name: "ResizeBicubicGrad" hide: true }
-op { name: "ResizeNearestNeighborGrad" hide: true }
-
-# io_ops
-
-# linalg_ops
-op { name: "SelfAdjointEigV2" rename_to: "SelfAdjointEig" }
-
-# logging_ops
-op { name: "AudioSummaryV2" rename_to: "AudioSummary" }
-
-# lookup_ops
-op { name: "LookupTableFind" skip: true }
-op { name: "LookupTableFindV2" rename_to: "LookupTableFind" }
-op { name: "LookupTableInsert" skip: true }
-op { name: "LookupTableInsertV2" rename_to: "LookupTableInsert" }
-op { name: "LookupTableSize" skip: true }
-op { name: "LookupTableSizeV2" rename_to: "LookupTableSize" }
-op { name: "LookupTableExport" skip: true }
-op { name: "LookupTableExportV2" rename_to: "LookupTableExport" }
-op { name: "LookupTableImport" skip: true }
-op { name: "LookupTableImportV2" rename_to: "LookupTableImport" }
-op { name: "HashTable" skip: true }
-op { name: "HashTableV2" rename_to: "HashTable" }
-op { name: "MutableHashTable" skip: true }
-op { name: "MutableHashTableV2" rename_to: "MutableHashTable" }
-op { name: "MutableHashTableOfTensors" skip: true }
-op { name: "MutableHashTableOfTensorsV2" rename_to: "MutableHashTableOfTensors" }
-op { name: "MutableDenseHashTable" skip: true }
-op { name: "MutableDenseHashTableV2" rename_to: "MutableDenseHashTable" }
-op { name: "InitializeTable" skip: true }
-op { name: "InitializeTableV2" rename_to: "InitializeTable" }
-op { name: "InitializeTableFromTextFile" skip: true }
-op { name: "InitializeTableFromTextFileV2" rename_to: "InitializeTableFromTextFile" }
-
-# math_ops
-op { name: "All" alias: "ReduceAll" input_rename: { from: "reduction_indices" to: "axis" } }
-op { name: "Any" alias: "ReduceAny" input_rename: { from: "reduction_indices" to: "axis" } }
-op { name: "Max" alias: "ReduceMax" input_rename: { from: "reduction_indices" to: "axis" } }
-op { name: "Mean" alias: "ReduceMean" input_rename: { from: "reduction_indices" to: "axis" } }
-op { name: "Min" alias: "ReduceMin" input_rename: { from: "reduction_indices" to: "axis" } }
-op { name: "Mul" rename_to: "Multiply" alias: "Mul" }
-op { name: "Neg" rename_to: "Negate" alias: "Neg" }
-op { name: "Prod" alias: "ReduceProd" input_rename: { from: "reduction_indices" to: "axis" } }
-op { name: "Sub" rename_to: "Subtract" alias: "Sub" }
-op { name: "Sum" alias: "ReduceSum" input_rename: { from: "reduction_indices" to: "axis" } }
-op { name: "SigmoidGrad" hide: true }
-op { name: "TanhGrad" hide: true }
-op { name: "InvGrad" hide: true }
-op { name: "ReciprocalGrad" hide: true }
-op { name: "SqrtGrad" hide: true }
-op { name: "RsqrtGrad" hide: true }
-
-# *Grad ops get hidden, only for use by the gradient code.
-op { name: "SigmoidGrad" hide: true }
-op { name: "TanhGrad" hide: true }
-op { name: "InvGrad" hide: true }
-op { name: "ReciprocalGrad" hide: true }
-op { name: "SqrtGrad" hide: true }
-op { name: "RsqrtGrad" hide: true }
-
-# nn_ops
-op { name: "AvgPoolGrad" hide: true }
-op { name: "LRNGrad" hide: true }
-op { name: "MaxPoolGrad" hide: true }
-op { name: "MaxPoolGradWithArgmax" hide: true }
-op { name: "ReluGrad" hide: true }
-op { name: "Relu6Grad" hide: true }
-op { name: "EluGrad" hide: true }
-op { name: "SeluGrad" hide: true }
-op { name: "SoftplusGrad" hide: true }
-op { name: "SoftsignGrad" hide: true }
-op { name: "FractionalAvgPoolGrad" hide: true }
-op { name: "FractionalMaxPoolGrad" hide: true }
-op { name: "TopKV2" rename_to: "TopK" }
-op { name: "BiasAddV1" skip: true }  # Use BiasAdd instead
-
-# parsing_ops
-
-# random_ops
-
-op { name: "RandomStandardNormal" rename_to: "RandomNormal" }
-# script_ops
-# Calling Python functions from a C++ program isn't supported
-op { name: "PyFunc" skip: true }
-op { name: "PyFuncStateless" skip: true}
-
-# sdca_ops
-
-# state_ops
-
-op { name: "Variable" skip: true }
-op { name: "VariableV2" rename_to: "Variable" }
-
-# sparse_ops
-
-# string_ops
-
-# user_ops
-
-# training_ops
-
diff --git a/tensorflow/cc/ops/standard_ops.h b/tensorflow/cc/ops/standard_ops.h
index 0c021f0b3ac02c596e0511e650a3caa0002c25d1..98f53010ecf78f769c7d89d6aafc48fdb772f42e 100644
--- a/tensorflow/cc/ops/standard_ops.h
+++ b/tensorflow/cc/ops/standard_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_OPS_STANDARD_OPS_H_
-#define THIRD_PARTY_TENSORFLOW_CC_OPS_STANDARD_OPS_H_
+#ifndef TENSORFLOW_CC_OPS_STANDARD_OPS_H_
+#define TENSORFLOW_CC_OPS_STANDARD_OPS_H_
 
 #include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/candidate_sampling_ops.h"
@@ -37,4 +37,4 @@ limitations under the License.
 #include "tensorflow/cc/ops/training_ops.h"
 #include "tensorflow/cc/ops/user_ops.h"
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_OPS_STANDARD_OPS_H_
+#endif  // TENSORFLOW_CC_OPS_STANDARD_OPS_H_
diff --git a/tensorflow/cc/ops/while_loop.cc b/tensorflow/cc/ops/while_loop.cc
index e0251efb2a424f86bd5a4885ef22d1928e04bd3e..d1c918d464bc9684b0db6dade2fb80cb2bd6691a 100644
--- a/tensorflow/cc/ops/while_loop.cc
+++ b/tensorflow/cc/ops/while_loop.cc
@@ -116,7 +116,7 @@ Status CreateCond(const Scope& scope, const CondGraphBuilderFn& cond,
   return Status::OK();
 }
 
-// Create the bdoy subgraph defined by `body`. `outputs` must be non-null and
+// Create the body subgraph defined by `body`. `outputs` must be non-null and
 // empty.
 Status CreateBody(const Scope& scope, const BodyGraphBuilderFn& body,
                   const std::vector<Output>& inputs,
diff --git a/tensorflow/cc/ops/while_loop.h b/tensorflow/cc/ops/while_loop.h
index a04476056a058ff0951a6347e8ffc05bc5ff5023..727237b5c7ad4d31dba1aaaf6d5600773d69223e 100644
--- a/tensorflow/cc/ops/while_loop.h
+++ b/tensorflow/cc/ops/while_loop.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_OPS_WHILE_LOOP_H_
-#define THIRD_PARTY_TENSORFLOW_CC_OPS_WHILE_LOOP_H_
+#ifndef TENSORFLOW_CC_OPS_WHILE_LOOP_H_
+#define TENSORFLOW_CC_OPS_WHILE_LOOP_H_
 
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
@@ -71,4 +71,4 @@ Status BuildWhileLoop(const Scope& scope, const std::vector<Output>& inputs,
 }  // namespace ops
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_OPS_WHILE_LOOP_H_
+#endif  // TENSORFLOW_CC_OPS_WHILE_LOOP_H_
diff --git a/tensorflow/cc/profiler/BUILD b/tensorflow/cc/profiler/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..00799526fce572e7bb80199ccb8ce1cc89874031
--- /dev/null
+++ b/tensorflow/cc/profiler/BUILD
@@ -0,0 +1,36 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
+
+tf_cuda_cc_test(
+    name = "profiler_test",
+    srcs = ["profiler_test.cc"],
+    deps = [
+        ":profiler",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "profiler",
+    srcs = ["profiler.cc"],
+    hdrs = ["profiler.h"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/profiler:protos_all_cc",
+        "//tensorflow/core/profiler:tfprof_options",
+        "//tensorflow/core/profiler/internal:tfprof_stats",
+    ],
+)
diff --git a/tensorflow/cc/profiler/profiler.cc b/tensorflow/cc/profiler/profiler.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3e55bac73e6d32a1fa5ddcc1937744e2cf56657d
--- /dev/null
+++ b/tensorflow/cc/profiler/profiler.cc
@@ -0,0 +1,57 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/cc/profiler/profiler.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+Profiler::Profiler(const GraphDef& graph) {
+  std::unique_ptr<GraphDef> graph_ptr(new GraphDef());
+  *graph_ptr = graph;
+  stats_.reset(new TFStats(std::move(graph_ptr), nullptr, nullptr, nullptr));
+}
+
+void Profiler::AddStep(int64 step, const RunMetadata& run_meta) {
+  std::unique_ptr<RunMetadata> run_meta_ptr(new RunMetadata());
+  *run_meta_ptr = run_meta;
+  stats_->AddRunMeta(step, std::move(run_meta_ptr));
+}
+
+GraphNodeProto Profiler::ProfileGraph(const Options& options) {
+  stats_->BuildView(kCmds[1]);
+  return stats_->ShowGraphNode(kCmds[1], options);
+}
+
+GraphNodeProto Profiler::ProfileNameScope(const Options& options) {
+  stats_->BuildView(kCmds[0]);
+  return stats_->ShowGraphNode(kCmds[0], options);
+}
+
+MultiGraphNodeProto Profiler::ProfileOperations(const Options& options) {
+  stats_->BuildView(kCmds[3]);
+  return stats_->ShowMultiGraphNode(kCmds[3], options);
+}
+
+Status Profiler::SerializeToString(string* content) {
+  if (!content) {
+    return Status(error::Code::INVALID_ARGUMENT,
+                  "Cannot use null string pointer for SerializeToString.");
+  }
+  stats_->SerializeToString(content);
+  return Status::OK();
+}
+
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/cc/profiler/profiler.h b/tensorflow/cc/profiler/profiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..6077c45c5854fd5812ccb7c91522f93ed4e54883
--- /dev/null
+++ b/tensorflow/cc/profiler/profiler.h
@@ -0,0 +1,97 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CC_PROFILER_PROFILER_H_
+#define TENSORFLOW_CC_PROFILER_PROFILER_H_
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/profiler/internal/tfprof_stats.h"
+#include "tensorflow/core/profiler/tfprof_options.h"
+#include "tensorflow/core/profiler/tfprof_output.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+/// @addtogroup core
+/// @{
+
+/// A `Profiler` object lets the caller profile the execution of a graph.
+///
+/// Example:
+///     // First build a graph and run tracing.
+///     Scope root = Scope::NewRootScope();
+///     auto a = Placeholder(root, DT_INT32);
+///     auto c = Add(root, a, {41});
+///
+///     ClientSession session(root);
+///     std::vector<Tensor> outputs;
+///     RunOptions run_options;
+///     run_options.set_trace_level(RunOptions::FULL_TRACE);
+///     RunMetadata run_meta;
+///     Status s = session.Run(run_options, { {a, {1}} }, {c}, &outputs,
+///                            &run_meta);
+///     if (!s.ok()) { ... }
+///
+///     // Then create profiler to do profiling.
+///     GraphDef graph;
+///     root.ToGraphDef(&graph);
+///     Profiler profiler(graph);
+///     profiler.AddStep(0, run_meta);
+///     Options opts = ...  // TODO(xpan): Support option building API.
+///     MultiGraphNodeProto r = profiler.ProfileOperations(opts);
+///
+class Profiler {
+ public:
+  /// `graph` is the model's GraphDef.
+  Profiler(const GraphDef& graph);
+
+  /// Adds tracing information `run_meta` to profiler. A `run_meta` is
+  /// generated by a TensorFlow session run call. `step` is the key
+  /// to the `run_meta`. When calling ProfileXXX methods, caller can specify
+  /// `step` in `options` to seletively profile the corresponding `run_meta`.
+  /// Multiple different `run_meta` can be keyed by the same `step` in order
+  /// to group them together.
+  void AddStep(int64 step, const RunMetadata& run_meta);
+
+  /// Profiles the model by organizing nodes in graph structure.
+  /// Each node is an op and the nodes are contected by the op inputs/outputs.
+  GraphNodeProto ProfileGraph(const Options& options);
+
+  /// Profiles the model by organizing nodes in name scope structure.
+  /// Each node is an op, and nodes are organized by the ops' name
+  /// scope, similar to a filesystem tree.
+  /// E.g. /foo is the root of operation /foo/matmul_1 and foo/conv_2.
+  GraphNodeProto ProfileNameScope(const Options& options);
+
+  /// Profiles the model by organizing nodes by operation types.
+  /// Each node is an operation type (e.g. Conv2D or MatMul), containing all
+  /// ops belonging to that type in the model.
+  MultiGraphNodeProto ProfileOperations(const Options& options);
+
+  /// Serialize the profile content (ProfileProto) into a binary string,
+  /// User can write the string to file for offline analysis by
+  /// tfprof command-line tools or graphical user interface.
+  Status SerializeToString(string* content);
+
+ private:
+  std::unique_ptr<TFStats> stats_;
+};
+/// @}
+
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CC_PROFILER_PROFILER_H_
diff --git a/tensorflow/cc/profiler/profiler_test.cc b/tensorflow/cc/profiler/profiler_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..280cd74827fc8ae80737eaf61286535fec959aa8
--- /dev/null
+++ b/tensorflow/cc/profiler/profiler_test.cc
@@ -0,0 +1,177 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/test.h"
+
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/cc/profiler/profiler.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/default_device.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+class ProfilerTest : public ::testing::Test {
+ protected:
+  ProfilerTest() {}
+};
+
+GraphDef CreateGraphDef() {
+  Scope root = Scope::NewRootScope();
+
+  auto a = ops::Const<float>(root, {{3, 2}, {-1, 0}});
+
+  auto x = ops::Const(root.WithOpName("x"), {{1.f}, {1.f}});
+
+  auto y = ops::MatMul(root.WithOpName("y"), a, x);
+
+  auto y2 = ops::Square(root, y);
+
+  auto y2_sum = ops::Sum(root, y2, 0);
+
+  auto y_norm = ops::Sqrt(root, y2_sum);
+
+  auto y_div = ops::Div(root.WithOpName("y_normalized"), y, y_norm);
+
+  GraphDef def;
+  TF_CHECK_OK(root.ToGraphDef(&def));
+
+  return def;
+}
+
+Options Default() {
+  Options opts(1000,       /* max_depth */
+               0,          /* min_bytes */
+               0,          /* min_peak_bytes */
+               0,          /* min_residual_bytes */
+               0,          /* min_output_bytes */
+               0,          /* min_micros */
+               0,          /* min_accelerator_micros */
+               0,          /* min_cpu_micros */
+               0,          /* min_params */
+               0,          /* min_float_ops */
+               0,          /* min_occurrence */
+               0,          /* step */
+               "name",     /* order_by */
+               {".*"},     /* account_type_regexes */
+               {".*"},     /* start_name_regexes */
+               {},         /* trim_name_regexes */
+               {".*"}, {}, /* hide_name_regexes */
+               false,      /* account_displayed_op_only */
+               {"micros"}, /* select */
+               {"none"},   /* output_type */
+               {});
+  return opts;
+}
+
+template <typename T>
+const T* ExtractNode(const T& pb, const string& name) {
+  if (pb.name() == name) {
+    return &pb;
+  }
+  for (const T& c : pb.children()) {
+    const T* ret = ExtractNode(c, name);
+    if (ret) return ret;
+  }
+  return nullptr;
+}
+
+TEST_F(ProfilerTest, Basics) {
+  SessionOptions options;
+  options.config.set_allow_soft_placement(true);
+  std::unique_ptr<Session> session(NewSession(options));
+  GraphDef def = CreateGraphDef();
+  if (options.target.empty()) {
+    graph::SetDefaultDevice("/gpu:0", &def);
+  }
+
+  TF_CHECK_OK(session->Create(def));
+
+  Tensor x(DT_FLOAT, TensorShape({2, 1}));
+  auto x_flat = x.flat<float>();
+  x_flat.setRandom();
+  Eigen::Tensor<float, 0, Eigen::RowMajor> inv_norm =
+      x_flat.square().sum().sqrt().inverse();
+  x_flat = x_flat * inv_norm();
+
+  std::vector<Tensor> outputs;
+  RunOptions run_options;
+  run_options.set_trace_level(RunOptions::FULL_TRACE);
+  RunMetadata run_metadata;
+  outputs.clear();
+
+  Profiler profiler(def);
+  for (int i = 0; i < 2; ++i) {
+    TF_CHECK_OK(session->Run(run_options, {{"x", x}}, {"y:0", "y_normalized:0"},
+                             {}, &outputs, &run_metadata));
+    profiler.AddStep(i, run_metadata);
+    CHECK_EQ(size_t{2}, outputs.size());
+  }
+
+  std::vector<DeviceAttributes> resp;
+  TF_CHECK_OK(session->ListDevices(&resp));
+  bool has_gpu = false;
+  for (const auto& dev : resp) {
+    if (dev.device_type() == "GPU") {
+      has_gpu = true;
+    }
+  }
+
+  GraphNodeProto ret = profiler.ProfileNameScope(Default());
+  const GraphNodeProto* matmul = ExtractNode(ret, "y");
+  EXPECT_TRUE(matmul);
+  EXPECT_GT(matmul->exec_micros(), 0);
+  if (has_gpu) {
+    EXPECT_GT(matmul->accelerator_exec_micros(), 0);
+  } else {
+    EXPECT_EQ(matmul->accelerator_exec_micros(), 0);
+  }
+  const GraphNodeProto* square = ExtractNode(ret, "Square");
+  EXPECT_TRUE(square);
+  EXPECT_GT(square->exec_micros(), 0);
+  if (has_gpu) {
+    EXPECT_GT(square->accelerator_exec_micros(), 0);
+  } else {
+    EXPECT_EQ(square->accelerator_exec_micros(), 0);
+  }
+
+  Options opts2 = Default();
+  opts2.output_type = "timeline";
+  string timeline_file = io::JoinPath(testing::TmpDir(), "timeline");
+  opts2.output_options["outfile"] = timeline_file;
+  GraphNodeProto ret2 = profiler.ProfileGraph(opts2);
+  string s;
+  TF_CHECK_OK(ReadFileToString(Env::Default(), timeline_file + "_0", &s));
+  EXPECT_TRUE(s.find("Square") != s.npos);
+
+  MultiGraphNodeProto ret3 = profiler.ProfileOperations(Default());
+  const MultiGraphNodeProto* matmul2 = ExtractNode(ret3, "MatMul");
+  EXPECT_TRUE(matmul2);
+  EXPECT_GT(matmul2->exec_micros(), 0);
+  if (has_gpu) {
+    EXPECT_GT(matmul2->accelerator_exec_micros(), 0);
+  } else {
+    EXPECT_EQ(matmul2->accelerator_exec_micros(), 0);
+  }
+
+  TF_CHECK_OK(session->Close());
+}
+
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/cc/saved_model/constants.h b/tensorflow/cc/saved_model/constants.h
index c940df8a8761d97a859be3af30980ff79ca3577a..645a3f101d1ae7dda88ec4ca622c694dc5a7a919 100644
--- a/tensorflow/cc/saved_model/constants.h
+++ b/tensorflow/cc/saved_model/constants.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_CONSTANTS_H_
-#define THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_CONSTANTS_H_
+#ifndef TENSORFLOW_CC_SAVED_MODEL_CONSTANTS_H_
+#define TENSORFLOW_CC_SAVED_MODEL_CONSTANTS_H_
 
 namespace tensorflow {
 
@@ -47,4 +47,4 @@ constexpr char kSavedModelVariablesFilename[] = "variables";
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_CONSTANTS_H_
+#endif  // TENSORFLOW_CC_SAVED_MODEL_CONSTANTS_H_
diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc
index f98abc8a817eca7bc129bb03a2ad31b97d957065..faa1e378d07ea94ad08ee084d18bf6a113f054af 100644
--- a/tensorflow/cc/saved_model/loader.cc
+++ b/tensorflow/cc/saved_model/loader.cc
@@ -62,6 +62,15 @@ Status ReadSavedModel(const string& export_dir, SavedModel* saved_model_proto) {
                     export_dir);
 }
 
+string GetTagsAsString(const std::unordered_set<string>& tags) {
+  string tags_as_string = "{ ";
+  for (const string& tag : tags) {
+    tags_as_string = strings::StrCat(tags_as_string, tag, " ");
+  }
+  tags_as_string = strings::StrCat(tags_as_string, "}");
+  return tags_as_string;
+}
+
 Status FindMetaGraphDefToLoad(const SavedModel& saved_model_proto,
                               const std::unordered_set<string>& tags,
                               MetaGraphDef* meta_graph_def_to_load) {
@@ -77,14 +86,9 @@ Status FindMetaGraphDefToLoad(const SavedModel& saved_model_proto,
       return Status::OK();
     }
   }
-  string tags_as_string = "{ ";
-  for (const string& tag : tags) {
-    tags_as_string = strings::StrCat(tags_as_string, tag, " ");
-  }
-  tags_as_string = strings::StrCat(tags_as_string, "}");
   return Status(error::Code::NOT_FOUND,
                 "Could not find meta graph def matching supplied tags: " +
-                    tags_as_string +
+                    GetTagsAsString(tags) +
                     ". To inspect available tag-sets in the SavedModel, please "
                     "use the SavedModel CLI: `saved_model_cli`");
 }
@@ -92,7 +96,9 @@ Status FindMetaGraphDefToLoad(const SavedModel& saved_model_proto,
 Status LoadMetaGraphIntoSession(const MetaGraphDef& meta_graph_def,
                                 const SessionOptions& session_options,
                                 std::unique_ptr<Session>* session) {
-  session->reset(NewSession(session_options));
+  Session* session_p = nullptr;
+  TF_RETURN_IF_ERROR(NewSession(session_options, &session_p));
+  session->reset(session_p);
   return (*session)->Create(meta_graph_def.graph_def());
 }
 
@@ -233,7 +239,8 @@ Status LoadSavedModelInternal(const SessionOptions& session_options,
     return Status(error::Code::NOT_FOUND,
                   "SavedModel not found in export directory: " + export_dir);
   }
-  LOG(INFO) << "Loading SavedModel from: " << export_dir;
+  LOG(INFO) << "Loading SavedModel with tags: " << GetTagsAsString(tags)
+            << "; from: " << export_dir;
 
   SavedModel saved_model_proto;
   TF_RETURN_IF_ERROR(ReadSavedModel(export_dir, &saved_model_proto));
@@ -281,7 +288,8 @@ Status LoadSavedModel(const SessionOptions& session_options,
     return end_microseconds - start_microseconds;
   }();
   auto log_and_count = [&](const string& status_str) {
-    LOG(INFO) << "Loading SavedModel: " << status_str << ". Took "
+    LOG(INFO) << "SavedModel load for tags " << GetTagsAsString(tags)
+              << "; Status: " << status_str << ". Took "
               << load_latency_microsecs << " microseconds.";
     load_attempt_count->GetCell(export_dir, status_str)->IncrementBy(1);
   };
diff --git a/tensorflow/cc/saved_model/loader.h b/tensorflow/cc/saved_model/loader.h
index 3d634dd51543bed8d3c074bdc56c251f97d56976..a8e098fa5440e7a8f72fd0b52737dcb06435b908 100644
--- a/tensorflow/cc/saved_model/loader.h
+++ b/tensorflow/cc/saved_model/loader.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 /// SavedModel loading functions and SavedModelBundle struct.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_LOADER_H_
-#define THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_LOADER_H_
+#ifndef TENSORFLOW_CC_SAVED_MODEL_LOADER_H_
+#define TENSORFLOW_CC_SAVED_MODEL_LOADER_H_
 
 #include <string>
 #include <unordered_set>
@@ -61,4 +61,4 @@ bool MaybeSavedModelDirectory(const string& export_dir);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_LOADER_H_
+#endif  // TENSORFLOW_CC_SAVED_MODEL_LOADER_H_
diff --git a/tensorflow/cc/saved_model/loader_test.cc b/tensorflow/cc/saved_model/loader_test.cc
index 0ad6b33bba5fcceaca68e2f179cef2232c689a80..4c64d2cfe3c10e6c7ed82a2d72460a0b34283bb2 100644
--- a/tensorflow/cc/saved_model/loader_test.cc
+++ b/tensorflow/cc/saved_model/loader_test.cc
@@ -155,6 +155,24 @@ TEST_F(LoaderTest, NoTagMatchMultiple) {
       << st.error_message();
 }
 
+TEST_F(LoaderTest, SessionCreationFailure) {
+  SavedModelBundle bundle;
+  // Use invalid SessionOptions to cause session creation to fail.  Default
+  // options work, so provide an invalid value for the target field.
+  SessionOptions session_options;
+  constexpr char kInvalidTarget[] = "invalid target";
+  session_options.target = kInvalidTarget;
+  RunOptions run_options;
+
+  const string export_dir =
+      io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded);
+  Status st = LoadSavedModel(session_options, run_options, export_dir,
+                             {kSavedModelTagServe}, &bundle);
+  EXPECT_FALSE(st.ok());
+  EXPECT_TRUE(StringPiece(st.error_message()).contains(kInvalidTarget))
+      << st.error_message();
+}
+
 TEST_F(LoaderTest, PbtxtFormat) {
   SavedModelBundle bundle;
   SessionOptions session_options;
diff --git a/tensorflow/cc/saved_model/python/BUILD b/tensorflow/cc/saved_model/python/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f5fbc75edcba9d5ae9ef7432de224df766bcab9e
--- /dev/null
+++ b/tensorflow/cc/saved_model/python/BUILD
@@ -0,0 +1,30 @@
+# Description:
+# CLIF wrappers for TensorFlow SavedModels.
+
+licenses(["notice"])  # Apache 2.0
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_py_clif_cc")
+
+tf_py_clif_cc(
+    name = "loader",
+    srcs = ["loader.clif"],
+    deps = [
+        "//tensorflow/cc/saved_model:loader",
+    ],
+)
diff --git a/tensorflow/cc/saved_model/python/loader.clif b/tensorflow/cc/saved_model/python/loader.clif
new file mode 100644
index 0000000000000000000000000000000000000000..b102757d2eeb46ee713d8ed0d0c3d66b58740ee0
--- /dev/null
+++ b/tensorflow/cc/saved_model/python/loader.clif
@@ -0,0 +1,4 @@
+from "third_party/tensorflow/cc/saved_model/loader.h":
+  namespace `tensorflow`:
+    class SavedModelBundle:
+      def __init__(self)
diff --git a/tensorflow/cc/saved_model/signature_constants.h b/tensorflow/cc/saved_model/signature_constants.h
index b2d39bd55beb48a05489236395a208e41deb9c8f..7d8c07f5cf0a310c20193469cb6d18664f738d96 100644
--- a/tensorflow/cc/saved_model/signature_constants.h
+++ b/tensorflow/cc/saved_model/signature_constants.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_SIGNATURE_CONSTANTS_H_
-#define THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_SIGNATURE_CONSTANTS_H_
+#ifndef TENSORFLOW_CC_SAVED_MODEL_SIGNATURE_CONSTANTS_H_
+#define TENSORFLOW_CC_SAVED_MODEL_SIGNATURE_CONSTANTS_H_
 
 namespace tensorflow {
 
@@ -66,4 +66,4 @@ static constexpr char kRegressOutputs[] = "outputs";
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_SIGNATURE_CONSTANTS_H_
+#endif  // TENSORFLOW_CC_SAVED_MODEL_SIGNATURE_CONSTANTS_H_
diff --git a/tensorflow/cc/saved_model/tag_constants.h b/tensorflow/cc/saved_model/tag_constants.h
index b71cb263ca42dab7e830c1880ec4b311bc272f82..68a090e0c4cf79cfa87771a80447b8112fc37fb9 100644
--- a/tensorflow/cc/saved_model/tag_constants.h
+++ b/tensorflow/cc/saved_model/tag_constants.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_TAG_CONSTANTS_H_
-#define THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_TAG_CONSTANTS_H_
+#ifndef TENSORFLOW_CC_SAVED_MODEL_TAG_CONSTANTS_H_
+#define TENSORFLOW_CC_SAVED_MODEL_TAG_CONSTANTS_H_
 
 namespace tensorflow {
 
@@ -32,4 +32,4 @@ constexpr char kSavedModelTagTrain[] = "train";
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_SAVED_MODEL_TAG_CONSTANTS_H_
+#endif  // TENSORFLOW_CC_SAVED_MODEL_TAG_CONSTANTS_H_
diff --git a/tensorflow/cc/tools/BUILD b/tensorflow/cc/tools/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..97f66e79b8ad9f383b22f56e9385fc6d2080e1f8
--- /dev/null
+++ b/tensorflow/cc/tools/BUILD
@@ -0,0 +1,57 @@
+# Description:
+# TensorFlow cc tools.
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+)
+
+cc_library(
+    name = "freeze_saved_model",
+    srcs = ["freeze_saved_model.cc"],
+    hdrs = ["freeze_saved_model.h"],
+    deps = [
+        "//tensorflow/cc/saved_model:loader",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_cc_test(
+    name = "freeze_saved_model_test",
+    srcs = ["freeze_saved_model_test.cc"],
+    deps = [
+        ":freeze_saved_model",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
+# -----------------------------------------------------------------------------
+# Google-internal targets.
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/cc/tools/freeze_saved_model.cc b/tensorflow/cc/tools/freeze_saved_model.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ddf372cdef21e1b3892c9a03714478d5a5785517
--- /dev/null
+++ b/tensorflow/cc/tools/freeze_saved_model.cc
@@ -0,0 +1,194 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/tools/freeze_saved_model.h"
+
+#include <queue>
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/protobuf/meta_graph.pb.h"
+
+namespace tensorflow {
+
+namespace {
+
+// Gets tensor names from tensor_info and inserts them into the set of tensor
+// names.
+void GetTensorNamesFromTensorInfo(const TensorInfo& tensor_info,
+                                  std::unordered_set<string>* tensor_names) {
+  if (tensor_info.has_coo_sparse()) {
+    // If the tensor is sparse we have to add all three tensors of the sparse
+    // representations.
+    const TensorInfo_CooSparse& coo_sparse = tensor_info.coo_sparse();
+    tensor_names->insert(coo_sparse.values_tensor_name());
+    tensor_names->insert(coo_sparse.indices_tensor_name());
+    tensor_names->insert(coo_sparse.dense_shape_tensor_name());
+  } else {
+    tensor_names->insert(tensor_info.name());
+  }
+}
+
+// Gets the union of all inputs and outputs of all SignatureDefs in the bundle
+void GetSignatureDefsInputsAndOutputs(
+    const SavedModelBundle& saved_model_bundle,
+    std::unordered_set<string>* inputs, std::unordered_set<string>* outputs) {
+  for (auto& sigdef_elem : saved_model_bundle.meta_graph_def.signature_def()) {
+    const SignatureDef& signature_def = sigdef_elem.second;
+    for (auto& input_elem : signature_def.inputs()) {
+      GetTensorNamesFromTensorInfo(input_elem.second, inputs);
+    }
+    for (auto& output_elem : signature_def.outputs()) {
+      GetTensorNamesFromTensorInfo(output_elem.second, outputs);
+    }
+  }
+}
+
+// Gets a map from string node name to NodeDef.
+void GetNodeNameToNodeDefMap(
+    GraphDef* graph_def,
+    std::unordered_map<string, NodeDef*>* name_to_node_map) {
+  for (size_t i = 0; i < graph_def->node_size(); i++) {
+    NodeDef* node = graph_def->mutable_node(i);
+    (*name_to_node_map)[node->name()] = node;
+  }
+}
+
+// Gets the set of node names needed by `outputs` and the corresponding set of
+// variable nodes to convert.
+void GetReachableNodesAndVariables(
+    GraphDef* graph_def, const std::unordered_set<string>& outputs,
+    std::unordered_set<string>* reachable_node_names,
+    std::unordered_set<string>* variable_node_names) {
+  // TODO(suharshs): Add support for ResourceVariables.
+  static const std::unordered_set<string>* kVariableTypes =
+      new std::unordered_set<string>({"Variable", "VariableV2"});
+  // name_to_node_map is needed to get the inputs from the NodeDef corresponding
+  // the a string node name. These inputs are used when doing our backwards
+  // traversal.
+  std::unordered_map<string, NodeDef*> name_to_node_map;
+  GetNodeNameToNodeDefMap(graph_def, &name_to_node_map);
+  std::queue<string> nodes_to_visit;
+  for (const string& tensor_name : outputs) {
+    // We need to strip off the tensor part to get the node name.
+    std::vector<string> tensor_name_parts = str_util::Split(tensor_name, ':');
+    nodes_to_visit.push(tensor_name_parts[0]);
+  }
+  // We do a traversal backwards from the outputs specified in the MetaGraphDef.
+  while (!nodes_to_visit.empty()) {
+    const string node_name = nodes_to_visit.front();
+    nodes_to_visit.pop();
+    if (reachable_node_names->find(node_name) != reachable_node_names->end()) {
+      continue;
+    }
+    reachable_node_names->insert(node_name);
+    NodeDef* node = name_to_node_map[node_name];
+    if (kVariableTypes->find(node->op()) != kVariableTypes->end()) {
+      variable_node_names->insert(node->name());
+    }
+    for (const string& input : node->input()) {
+      nodes_to_visit.push(input);
+    }
+  }
+}
+
+// Gets a map from variable name to variable value.
+Status GetVariableNameToTensorMap(
+    Session* session, std::unordered_set<string> variable_names_set,
+    std::unordered_map<string, Tensor>* variable_name_to_value_map) {
+  if (variable_names_set.empty()) {
+    return Status::OK();
+  }
+  std::vector<string> variable_names;
+  std::vector<string> tensor_names;
+  for (const string& node_name : variable_names_set) {
+    variable_names.push_back(node_name);
+    // We need to run tensors, so append ":0".
+    tensor_names.push_back(node_name + ":0");
+  }
+  std::vector<Tensor> outputs;
+  TF_RETURN_IF_ERROR(
+      session->Run(/* inputs */ {}, tensor_names, /* targets */ {}, &outputs));
+  for (size_t i = 0; i < variable_names.size(); i++) {
+    (*variable_name_to_value_map)[variable_names[i]] = outputs[i];
+  }
+  return Status::OK();
+}
+
+// Converts a Variable NodeDef into a Constant NodeDef.
+void ConvertVariableToConstant(const NodeDef& variable_node,
+                               const Tensor& variable_value,
+                               NodeDef* const_node) {
+  const_node->set_name(variable_node.name());
+  const_node->set_op("Const");
+  (*const_node->mutable_attr())["dtype"] = variable_node.attr().at("dtype");
+  variable_value.AsProtoTensorContent(
+      (*const_node->mutable_attr())["value"].mutable_tensor());
+}
+
+// Freezes the subgraph of all nodes needed by `outputs`.
+Status FreezeGraphDef(const SavedModelBundle& saved_model_bundle,
+                      const std::unordered_set<string>& outputs,
+                      GraphDef* frozen_graph_def) {
+  GraphDef graph_def = saved_model_bundle.meta_graph_def.graph_def();
+  // Copy versions and library as-is from original graph.
+  *frozen_graph_def->mutable_versions() = graph_def.versions();
+  *frozen_graph_def->mutable_library() = graph_def.library();
+  // If the graph is empty there is nothing left to do.
+  if (graph_def.node_size() == 0) {
+    return Status::OK();
+  }
+  std::unordered_set<string> reachable_node_names;
+  std::unordered_set<string> variable_node_names;
+  GetReachableNodesAndVariables(&graph_def, outputs, &reachable_node_names,
+                                &variable_node_names);
+  std::unordered_map<string, Tensor> variable_to_value_map;
+  TF_RETURN_IF_ERROR(
+      GetVariableNameToTensorMap(saved_model_bundle.session.get(),
+                                 variable_node_names, &variable_to_value_map));
+  // We copy the nodes in the same order they were in the original graph_def.
+  for (const NodeDef& node : graph_def.node()) {
+    if (reachable_node_names.find(node.name()) == reachable_node_names.end()) {
+      continue;
+    }
+    if (variable_node_names.find(node.name()) != variable_node_names.end()) {
+      ConvertVariableToConstant(node, variable_to_value_map[node.name()],
+                                frozen_graph_def->add_node());
+    } else {
+      // If the node isn't a variable, just copy the node as-is.
+      *frozen_graph_def->add_node() = node;
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+Status FreezeSavedModel(const SavedModelBundle& saved_model_bundle,
+                        GraphDef* frozen_graph_def,
+                        std::unordered_set<string>* inputs,
+                        std::unordered_set<string>* outputs) {
+  GetSignatureDefsInputsAndOutputs(saved_model_bundle, inputs, outputs);
+  TF_RETURN_IF_ERROR(
+      FreezeGraphDef(saved_model_bundle, *outputs, frozen_graph_def));
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/cc/tools/freeze_saved_model.h b/tensorflow/cc/tools/freeze_saved_model.h
new file mode 100644
index 0000000000000000000000000000000000000000..b10f29805a4515f9d49426cc41e0d375cd32b072
--- /dev/null
+++ b/tensorflow/cc/tools/freeze_saved_model.h
@@ -0,0 +1,43 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CC_TOOLS_FREEZE_SAVED_MODEL_H_
+#define TENSORFLOW_CC_TOOLS_FREEZE_SAVED_MODEL_H_
+
+#include <unordered_set>
+
+#include "tensorflow/cc/saved_model/loader.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// Returns a frozen GraphDef, input tensors, and output tensors from the loaded
+// SavedModelBundle.
+// `inputs` and `outputs` consist of the union of all inputs and outputs in the
+// SignatureDefs in the SavedModelBundle.
+// FreezeSavedModel sets `frozen_graph_def` to a GraphDef of all nodes needed by
+// `outputs`. All variables in the supplied SavedModelBundle are converted to
+// constants, set to the value of the variables, by running the restored Session
+// in the SavedModelBundle.
+// WARNING: Only the variable checkpoints will be reflected in the frozen
+// graph_def. All saved_model assets will be ignored.
+Status FreezeSavedModel(const SavedModelBundle& saved_model_bundle,
+                        GraphDef* frozen_graph_def,
+                        std::unordered_set<string>* inputs,
+                        std::unordered_set<string>* outputs);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CC_TOOLS_FREEZE_SAVED_MODEL_H_
diff --git a/tensorflow/cc/tools/freeze_saved_model_test.cc b/tensorflow/cc/tools/freeze_saved_model_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..52a81a50284aec36bba4e56a0232c886cb0cb6cf
--- /dev/null
+++ b/tensorflow/cc/tools/freeze_saved_model_test.cc
@@ -0,0 +1,307 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/tools/freeze_saved_model.h"
+
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+class FreezeTest : public ::testing::Test {
+ protected:
+  void GraphDefEqual(const GraphDef& actual, const GraphDef& expected) {
+    EXPECT_EQ(actual.ShortDebugString(), expected.ShortDebugString());
+  }
+
+  // Builds a SignatureDef with the provided `inputs` and `outputs`.
+  SignatureDef BuildSignatureDef(const std::unordered_set<string>& inputs,
+                                 const std::unordered_set<string>& outputs) {
+    SignatureDef signature_def;
+    for (const string& input : inputs) {
+      (*signature_def.mutable_inputs())[input].set_name(input);
+    }
+    for (const string& output : outputs) {
+      (*signature_def.mutable_outputs())[output].set_name(output);
+    }
+    return signature_def;
+  }
+
+  // Adds `signature_def` to `saved_model_bundle` under `key`.
+  void AddSignatureDefToSavedModelBundle(const SignatureDef& signature_def,
+                                         const string& key,
+                                         SavedModelBundle* saved_model_bundle) {
+    MetaGraphDef* meta_graph_def = &saved_model_bundle->meta_graph_def;
+    (*meta_graph_def->mutable_signature_def())[key] = signature_def;
+  }
+
+  // Adds an initialized session to `saved_model_bundle` using `graph_def` and
+  // initializing with `init_node`.
+  Status InitializeSavedModelBundleSession(
+      const GraphDef& graph_def, const string& init_node,
+      SavedModelBundle* saved_model_bundle) {
+    SessionOptions session_options;
+    saved_model_bundle->session.reset(NewSession(session_options));
+    TF_RETURN_IF_ERROR(saved_model_bundle->session->Create(graph_def));
+    if (!init_node.empty()) {
+      std::vector<Tensor> outputs;
+      return saved_model_bundle->session->Run(
+          /* inputs */ {}, /* output_tensors */ {}, {init_node}, &outputs);
+    }
+    return Status::OK();
+  }
+
+  // Adds `graph_def` to `saved_model_bundle` and initializes a session with
+  // `init_node`.
+  Status AddGraphDefToSavedModelBundle(const GraphDef& graph_def,
+                                       const string& init_node,
+                                       SavedModelBundle* saved_model_bundle) {
+    MetaGraphDef* meta_graph_def = &saved_model_bundle->meta_graph_def;
+    *meta_graph_def->mutable_graph_def() = graph_def;
+    return InitializeSavedModelBundleSession(graph_def, init_node,
+                                             saved_model_bundle);
+  }
+
+  // Adds `graph_def` and `outputs` as the GraphDef and SignatureDef in
+  // `saved_model_bundle` and initializes a session with `init_node`.
+  Status AddGraphDefWithOutputsToSavedModelBundle(
+      const GraphDef& graph_def, const std::unordered_set<string>& outputs,
+      const string& init_node, SavedModelBundle* saved_model_bundle) {
+    SignatureDef signature_def =
+        BuildSignatureDef(std::unordered_set<string>(), outputs);
+    AddSignatureDefToSavedModelBundle(signature_def, "signature_def",
+                                      saved_model_bundle);
+    return AddGraphDefToSavedModelBundle(graph_def, init_node,
+                                         saved_model_bundle);
+  }
+
+  // Runs and compares the outputs of `tensor_name` on both the
+  // `unfrozen_session` and the `frozen_graph_def.
+  void RunAndCompareFrozenAndUnfrozenGraphs(Session* unfrozen_session,
+                                            const GraphDef& frozen_graph_def,
+                                            const string& tensor_name) {
+    std::vector<Tensor> unfrozen_outputs;
+    TF_ASSERT_OK(unfrozen_session->Run(/* inputs */ {}, {tensor_name},
+                                       /* targets */ {}, &unfrozen_outputs));
+
+    SessionOptions session_options;
+    std::unique_ptr<Session> frozen_session(NewSession(session_options));
+    TF_ASSERT_OK(frozen_session->Create(frozen_graph_def));
+    std::vector<Tensor> frozen_outputs;
+    TF_ASSERT_OK(frozen_session->Run(/* inputs */ {}, {tensor_name},
+                                     /* targets */ {}, &frozen_outputs));
+
+    test::ExpectTensorEqual<float>(unfrozen_outputs[0], frozen_outputs[0]);
+  }
+};
+
+TEST_F(FreezeTest, InputsAndOutputsSingleSignatureDef) {
+  // Test that inputs and outputs get correctly populated for a single
+  // SignatureDef.
+  SavedModelBundle saved_model_bundle;
+  std::unordered_set<string> expected_inputs = {"input0:0", "input1:0"};
+  std::unordered_set<string> expected_outputs = {"output0:0", "output1:0"};
+  SignatureDef signature_def =
+      BuildSignatureDef(expected_inputs, expected_outputs);
+  AddSignatureDefToSavedModelBundle(signature_def, "signature_def",
+                                    &saved_model_bundle);
+  GraphDef frozen_graph_def;
+  std::unordered_set<string> inputs;
+  std::unordered_set<string> outputs;
+  TF_ASSERT_OK(FreezeSavedModel(saved_model_bundle, &frozen_graph_def, &inputs,
+                                &outputs));
+  EXPECT_EQ(expected_inputs, inputs);
+  EXPECT_EQ(expected_outputs, outputs);
+}
+
+TEST_F(FreezeTest, InputsAndOutputsMultipleSignatureDefs) {
+  // Test that inputs and outputs get correctly merged and populated when
+  // multiple SignatureDefs are provided.
+  SavedModelBundle saved_model_bundle;
+  SignatureDef signature_def_0 = BuildSignatureDef({"input0:0"}, {"output0:0"});
+  SignatureDef signature_def_1 = BuildSignatureDef({"input1:0"}, {"output1:0"});
+  AddSignatureDefToSavedModelBundle(signature_def_0, "signature_def_0",
+                                    &saved_model_bundle);
+  AddSignatureDefToSavedModelBundle(signature_def_1, "signature_def_1",
+                                    &saved_model_bundle);
+  GraphDef frozen_graph_def;
+  std::unordered_set<string> inputs;
+  std::unordered_set<string> outputs;
+  TF_ASSERT_OK(FreezeSavedModel(saved_model_bundle, &frozen_graph_def, &inputs,
+                                &outputs));
+  std::unordered_set<string> expected_inputs = {"input0:0", "input1:0"};
+  std::unordered_set<string> expected_outputs = {"output0:0", "output1:0"};
+  EXPECT_EQ(expected_inputs, inputs);
+  EXPECT_EQ(expected_outputs, outputs);
+}
+
+TEST_F(FreezeTest, GraphDefVersionsAndLibrary) {
+  // Test that GraphDef versions and library are copied correctly into the
+  // frozen graph.
+  SavedModelBundle saved_model_bundle;
+  GraphDef graph_def;
+  graph_def.mutable_versions()->set_producer(1234);
+  graph_def.mutable_versions()->set_min_consumer(1234);
+  *graph_def.mutable_library()->add_function() = test::function::NonZero();
+  TF_ASSERT_OK(
+      AddGraphDefToSavedModelBundle(graph_def, "", &saved_model_bundle));
+
+  GraphDef frozen_graph_def;
+  std::unordered_set<string> inputs;
+  std::unordered_set<string> outputs;
+  TF_ASSERT_OK(FreezeSavedModel(saved_model_bundle, &frozen_graph_def, &inputs,
+                                &outputs));
+
+  GraphDefEqual(frozen_graph_def, graph_def);
+}
+
+TEST_F(FreezeTest, GraphDefWithNoVariables) {
+  // Test freezing a graph with no variables.
+  SavedModelBundle saved_model_bundle;
+  GraphDef graph_def;
+  Scope scope = Scope::NewRootScope();
+  Output a = ops::Const(scope.WithOpName("a"), 10.0f, {});
+  Output b = ops::Const(scope.WithOpName("b"), 10.0f, {});
+  Output c = ops::Mul(scope.WithOpName("c"), a, b);
+  TF_ASSERT_OK(scope.ToGraphDef(&graph_def));
+  TF_ASSERT_OK(AddGraphDefWithOutputsToSavedModelBundle(graph_def, {"c:0"}, "",
+                                                        &saved_model_bundle));
+
+  GraphDef frozen_graph_def;
+  std::unordered_set<string> inputs;
+  std::unordered_set<string> outputs;
+  TF_ASSERT_OK(FreezeSavedModel(saved_model_bundle, &frozen_graph_def, &inputs,
+                                &outputs));
+
+  GraphDefEqual(frozen_graph_def, graph_def);
+}
+
+TEST_F(FreezeTest, GraphDefWithVariablesNotNeededByOutputs) {
+  // Test freezing a graph with variables that are not needed by the outputs in
+  // the SignatureDef. The resulting graph shouldn't be frozen, but
+  // non-dependent nodes should be pruned.
+  SavedModelBundle saved_model_bundle;
+  GraphDef graph_def;
+  Scope scope = Scope::NewRootScope();
+  Output a = ops::Const(scope.WithOpName("a"), 10.0f, {});
+  Output b = ops::Const(scope.WithOpName("b"), 10.0f, {});
+  Output c = ops::Mul(scope.WithOpName("c"), a, b);
+  Output var = ops::Variable(scope.WithOpName("var"), {}, DataType::DT_FLOAT);
+  Output assign = ops::Assign(scope.WithOpName("assign"), var, a);
+  TF_ASSERT_OK(scope.ToGraphDef(&graph_def));
+  // "c" isnt dependent on the variable, so nothing should be frozen.
+  TF_ASSERT_OK(AddGraphDefWithOutputsToSavedModelBundle(
+      graph_def, {"c:0"}, assign.name(), &saved_model_bundle));
+
+  GraphDef frozen_graph_def;
+  std::unordered_set<string> inputs;
+  std::unordered_set<string> outputs;
+  TF_ASSERT_OK(FreezeSavedModel(saved_model_bundle, &frozen_graph_def, &inputs,
+                                &outputs));
+
+  GraphDef expected_graph_def;
+  Scope expected_scope = Scope::NewRootScope();
+  Output expected_a = ops::Const(expected_scope.WithOpName("a"), 10.0f, {});
+  Output expected_b = ops::Const(expected_scope.WithOpName("b"), 10.0f, {});
+  Output expected_c =
+      ops::Mul(expected_scope.WithOpName("c"), expected_a, expected_b);
+  TF_ASSERT_OK(expected_scope.ToGraphDef(&expected_graph_def));
+
+  GraphDefEqual(frozen_graph_def, expected_graph_def);
+
+  RunAndCompareFrozenAndUnfrozenGraphs(saved_model_bundle.session.get(),
+                                       frozen_graph_def, "c:0");
+}
+
+TEST_F(FreezeTest, GraphDefWithVariablesNeededByOutputs) {
+  // Test freezing a graph with variables that are needed by outputs in the
+  // SignatureDef. The variables should be frozen.
+  SavedModelBundle saved_model_bundle;
+  GraphDef graph_def;
+  Scope scope = Scope::NewRootScope();
+  Output a = ops::Const(scope.WithOpName("a"), 10.0f, {});
+  Output var = ops::Variable(scope.WithOpName("var"), {}, DataType::DT_FLOAT);
+  Output c = ops::Mul(scope.WithOpName("c"), a, var);
+  Output assign = ops::Assign(scope.WithOpName("assign"), var, a);
+  TF_ASSERT_OK(scope.ToGraphDef(&graph_def));
+  // "c" isnt dependent on the variable, so nothing should be frozen.
+  TF_ASSERT_OK(AddGraphDefWithOutputsToSavedModelBundle(
+      graph_def, {"c:0"}, assign.name(), &saved_model_bundle));
+
+  GraphDef frozen_graph_def;
+  std::unordered_set<string> inputs;
+  std::unordered_set<string> outputs;
+  TF_ASSERT_OK(FreezeSavedModel(saved_model_bundle, &frozen_graph_def, &inputs,
+                                &outputs));
+
+  // There should be 3 nodes in the resulting graph_def, and none should be
+  // variables.
+  EXPECT_EQ(frozen_graph_def.node_size(), 3);
+  for (const NodeDef& node : frozen_graph_def.node()) {
+    EXPECT_NE(node.op(), "Variable") << node.name();
+    EXPECT_NE(node.op(), "VariableV2") << node.name();
+  }
+
+  RunAndCompareFrozenAndUnfrozenGraphs(saved_model_bundle.session.get(),
+                                       frozen_graph_def, "c:0");
+}
+
+TEST_F(FreezeTest, GraphDefWithVariablesNeededAndNotNeededByOutputs) {
+  // Test freezing a graph with some variables that are needed and not needed by
+  // the outputs in the SignatureDef. The resulting graph should only freeze
+  // dependent variables.
+  SavedModelBundle saved_model_bundle;
+  GraphDef graph_def;
+  Scope scope = Scope::NewRootScope();
+  Output a = ops::Const(scope.WithOpName("a"), 10.0f, {});
+  Output var = ops::Variable(scope.WithOpName("var"), {}, DataType::DT_FLOAT);
+  Output c = ops::Mul(scope.WithOpName("c"), a, var);
+  Output assign = ops::Assign(scope.WithOpName("assign"), var, a);
+  Output var_1 =
+      ops::Variable(scope.WithOpName("var_1"), {}, DataType::DT_FLOAT);
+  Output assign_1 = ops::Assign(scope.WithOpName("assign_1"), var, a);
+  TF_ASSERT_OK(scope.ToGraphDef(&graph_def));
+  // "c" isnt dependent on the variable, so nothing should be frozen.
+  TF_ASSERT_OK(AddGraphDefWithOutputsToSavedModelBundle(
+      graph_def, {"c:0"}, assign.name(), &saved_model_bundle));
+
+  GraphDef frozen_graph_def;
+  std::unordered_set<string> inputs;
+  std::unordered_set<string> outputs;
+  TF_ASSERT_OK(FreezeSavedModel(saved_model_bundle, &frozen_graph_def, &inputs,
+                                &outputs));
+
+  // There should be 3 nodes in the resulting graph_def, and none should be
+  // variables.
+  EXPECT_EQ(frozen_graph_def.node_size(), 3);
+  for (const NodeDef& node : frozen_graph_def.node()) {
+    EXPECT_NE(node.op(), "Variable") << node.name();
+    EXPECT_NE(node.op(), "VariableV2") << node.name();
+  }
+
+  RunAndCompareFrozenAndUnfrozenGraphs(saved_model_bundle.session.get(),
+                                       frozen_graph_def, "c:0");
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/cc/training/coordinator.h b/tensorflow/cc/training/coordinator.h
index 0e01b19cd98bc797b7bb25da55c05d96f3eb93c7..7168b775251d38687d604b5294405389a8c1b04f 100644
--- a/tensorflow/cc/training/coordinator.h
+++ b/tensorflow/cc/training/coordinator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_TRAINING_COORDINATOR_H_
-#define THIRD_PARTY_TENSORFLOW_CC_TRAINING_COORDINATOR_H_
+#ifndef TENSORFLOW_CC_TRAINING_COORDINATOR_H_
+#define TENSORFLOW_CC_TRAINING_COORDINATOR_H_
 
 #include <atomic>
 #include <memory>
@@ -128,4 +128,4 @@ class Coordinator {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_TRAINING_COORDINATOR_H_
+#endif  // TENSORFLOW_CC_TRAINING_COORDINATOR_H_
diff --git a/tensorflow/cc/training/queue_runner.h b/tensorflow/cc/training/queue_runner.h
index 2d3450032388bfee96055f23cf621af0fa4731ae..21189b4b046b87b8609483109096fda6144681b8 100644
--- a/tensorflow/cc/training/queue_runner.h
+++ b/tensorflow/cc/training/queue_runner.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CC_TRAINING_QUEUE_RUNNER_H_
-#define THIRD_PARTY_TENSORFLOW_CC_TRAINING_QUEUE_RUNNER_H_
+#ifndef TENSORFLOW_CC_TRAINING_QUEUE_RUNNER_H_
+#define TENSORFLOW_CC_TRAINING_QUEUE_RUNNER_H_
 
 #include <memory>
 #include <string>
@@ -137,4 +137,4 @@ class QueueRunner : public RunnerInterface {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CC_TRAINING_QUEUE_RUNNER_H_
+#endif  // TENSORFLOW_CC_TRAINING_QUEUE_RUNNER_H_
diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index a9a6ea84319a18a8fbce648391bf5918ff6d9a08..0900e87ebabd378e6237b77ca0ef01677c07c244 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -24,7 +24,6 @@ tf_cc_test(
     srcs = ["runtime_test.cc"],
     deps = [
         ":runtime",
-        "//tensorflow/compiler/tf2xla:xla_local_runtime_context",
         "//tensorflow/core:framework",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -53,6 +52,7 @@ cc_library(
         "flags.h",
     ],
     deps = [
+        ":embedded_protocol_buffers",
         ":runtime",  # needed by codegen to print aligned_buffer_bytes
         "//tensorflow/compiler/tf2xla",
         "//tensorflow/compiler/tf2xla:common",
@@ -69,9 +69,7 @@ cc_library(
         "//tensorflow/compiler/xla/client:compile_only_client",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
-        "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -81,13 +79,18 @@ cc_library(
 tf_cc_test(
     name = "codegen_test",
     srcs = ["codegen_test.cc"],
-    data = ["codegen_test_h.golden"],
+    data = [
+        "codegen_test_h.golden",
+        "codegen_test_o.golden",
+    ],
     deps = [
         ":tfcompile_lib",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@llvm//:support",  # fixdeps: keep
+        "@llvm//:x86_code_gen",  # fixdeps: keep
     ],
 )
 
@@ -111,6 +114,7 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
     ],
@@ -128,7 +132,9 @@ tf_library(
     config = "test_graph_tfadd.config.pbtxt",
     cpp_class = "AddComp",
     graph = "test_graph_tfadd.pbtxt",
-    tags = ["manual"],
+    tags = [
+        "manual",
+    ],
 )
 
 # A test of tf_library that includes a graph with an unknown op, but where
@@ -139,7 +145,9 @@ tf_library(
     config = "test_graph_tfunknownop.config.pbtxt",
     cpp_class = "UnknownOpAddComp",
     graph = "test_graph_tfunknownop.pbtxt",
-    tags = ["manual"],
+    tags = [
+        "manual",
+    ],
 )
 
 # A test of tf_library that includes a graph with an unknown op, but where
@@ -151,7 +159,9 @@ tf_library(
     config = "test_graph_tfunknownop2.config.pbtxt",
     cpp_class = "UnknownOpAddComp",
     graph = "test_graph_tfunknownop.pbtxt",
-    tags = ["manual"],
+    tags = [
+        "manual",
+    ],
 )
 
 # A test of tf_library that includes a graph with an unknown op, but where
@@ -162,7 +172,9 @@ tf_library(
     config = "test_graph_tfunknownop3.config.pbtxt",
     cpp_class = "UnknownOpAddComp",
     graph = "test_graph_tfunknownop.pbtxt",
-    tags = ["manual"],
+    tags = [
+        "manual",
+    ],
 )
 
 # Utility library for benchmark binaries, used by the *_benchmark rules that are
@@ -185,11 +197,27 @@ cc_library(
     name = "benchmark_extra_android",
     tags = [
         "manual",
-        "notap",
     ],
     visibility = ["//visibility:public"],
 )
 
+cc_library(
+    name = "embedded_protocol_buffers",
+    srcs = ["embedded_protocol_buffers.cc"],
+    hdrs = ["embedded_protocol_buffers.h"],
+    deps = [
+        "//tensorflow/compiler/tf2xla:common",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+        "//tensorflow/core:lib",
+        "@llvm//:core",
+        "@llvm//:execution_engine",
+        "@llvm//:support",
+        "@llvm//:target",
+    ],
+)
+
 tf_cc_test(
     name = "benchmark_test",
     srcs = ["benchmark_test.cc"],
diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc
index ae22f7edc423247b34895411d19d7a3c21f86d4f..2cae85e8965216eaaee4d3032015d0016258a5c1 100644
--- a/tensorflow/compiler/aot/codegen.cc
+++ b/tensorflow/compiler/aot/codegen.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "tensorflow/compiler/aot/embedded_protocol_buffers.h"
 #include "tensorflow/compiler/aot/runtime.h"
 #include "tensorflow/compiler/tf2xla/str_util.h"
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
@@ -101,21 +102,8 @@ Status ComputeArgSizes(const CompileResult& compile_result,
                        std::vector<int64>* arg_sizes) {
   const xla::ProgramShape& ps = compile_result.program_shape;
   for (int i = 0; i < ps.parameters_size(); ++i) {
-    if (i == ps.parameters_size() - 1 && compile_result.has_context_arg) {
-      // If the compiled function needs a XlaLocalRuntimeContext* arg, it's
-      // always last, and must be represented as an opaque type.
-      const xla::PrimitiveType type = ps.parameters(i).element_type();
-      if (type != xla::OPAQUE) {
-        return errors::InvalidArgument(
-            "expected final context arg to be opaque, but got type: ",
-            xla::PrimitiveType_Name(type), ", from program shape: ",
-            xla::ShapeUtil::HumanString(ps));
-      }
-      arg_sizes->push_back(-1);
-    } else {
-      arg_sizes->push_back(xla::ShapeUtil::ByteSizeOf(
-          ps.parameters(i), compile_result.pointer_size));
-    }
+    arg_sizes->push_back(xla::ShapeUtil::ByteSizeOf(
+        ps.parameters(i), compile_result.pointer_size));
   }
   return Status::OK();
 }
@@ -165,11 +153,6 @@ string RewriteWithName(const string& name, string code,
 Status GenArgMethods(const tf2xla::Config& config, const xla::ProgramShape& ps,
                      const CompileResult& compile_result, string* methods) {
   size_t num_args = ps.parameters_size();
-  if (compile_result.has_context_arg) {
-    // If the compiled function needs a XlaLocalRuntimeContext* arg, it's
-    // always last, and is set in the class constructor.
-    num_args--;
-  }
   if (config.feed_size() != num_args) {
     return errors::InvalidArgument("mismatch between feed_size(",
                                    config.feed_size(), ") and num_args(",
@@ -281,49 +264,6 @@ string GenNameToIndexCode(const T& entries, bool generate) {
   return code;
 }
 
-// Converts the given `str` into a comma-separated list of per-character values.
-string StringToCharList(const string& str) {
-  string list;
-  for (const char c : str) {
-    if (!list.empty()) {
-      list += ",";
-    }
-    list += strings::StrCat(static_cast<int>(c));
-  }
-  return list;
-}
-
-string GenProgramShapeCode(xla::ProgramShape program_shape, bool generate) {
-  // No need for any static magic if we're not supposed to generate the data.
-  if (!generate) {
-    return "{\n    return nullptr;\n  }";
-  }
-  // The parameter names are currently meaningless, and redundant with the rest
-  // of our metadata, so clear them out to avoid confusion and save space.
-  program_shape.clear_parameter_names();
-  const string proto_str = program_shape.SerializeAsString();
-  // Embed the program shape as a serialized protobuf in the header file.
-  //
-  // TODO(toddw): This strategy will likely fail for larger protobufs, depending
-  // on the C++ compiler that is used. Figure out another solution if necessary.
-  string code = R"({
-    static const xla::ProgramShape* kShape = []() {
-      static const char kProto[] = {{{PROTO_LIST}}};
-      static constexpr int kProtoSize = {{PROTO_SIZE}};
-      xla::ProgramShape* shape = new xla::ProgramShape;
-      shape->ParseFromArray(kProto, kProtoSize);
-      return shape;
-    }();
-    return kShape;
-  })";
-  str_util::ReplaceAllPairs(
-      &code, {
-                 {"{{PROTO_LIST}}", StringToCharList(proto_str)},
-                 {"{{PROTO_SIZE}}", strings::StrCat(proto_str.size())},
-             });
-  return code;
-}
-
 Status ValidateFeedFetchCppNames(const tf2xla::Config& config) {
   for (const tf2xla::Feed& feed : config.feed()) {
     if (!feed.name().empty()) {
@@ -340,8 +280,9 @@ Status ValidateFeedFetchCppNames(const tf2xla::Config& config) {
 
 }  // namespace
 
-Status GenerateHeader(const HeaderOpts& opts, const tf2xla::Config& config,
-                      const CompileResult& compile_result, string* header) {
+Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config,
+                      const CompileResult& compile_result,
+                      const MetadataResult& metadata_result, string* header) {
   TF_RETURN_IF_ERROR(ValidateConfig(config));
   TF_RETURN_IF_ERROR(ValidateFeedFetchCppNames(config));
   const int64 result_index = compile_result.aot->result_buffer_index();
@@ -391,8 +332,6 @@ Status GenerateHeader(const HeaderOpts& opts, const tf2xla::Config& config,
           ?
           R"(#include "tensorflow/compiler/xla/xla_data.pb.h")"
           : "";
-  const string program_shape_code =
-      GenProgramShapeCode(ps, opts.gen_program_shape);
 
   // Use a poor-man's text templating mechanism; first populate the full header
   // with placeholder tokens, and then rewrite the tokens with real values.
@@ -418,7 +357,9 @@ namespace xla { class ExecutableRunOptions; }
 // (Implementation detail) Entry point to the function in the object file.
 extern "C" void {{ENTRY}}(
     void* result, const xla::ExecutableRunOptions* run_options,
-    const void** args, void** temps);
+    const void** args, void** temps, tensorflow::int64* profile_counters);
+
+{{DECLS_FROM_OBJ_FILE}}
 
 {{NS_START}}
 // {{CLASS}} represents a computation previously specified in a
@@ -474,7 +415,6 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
       data->temp_sizes = TempSizes();
       data->num_temps = kNumTemps;
       data->result_index = kResultIndex;
-      data->requires_runtime_context = {{HAS_CONTEXT_ARG}};
       data->arg_names = StaticArgNames();
       data->result_names = StaticResultNames();
       data->program_shape = StaticProgramShape();
@@ -483,7 +423,7 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
     return *kStaticData;
   }
 
-  {{CLASS}}(AllocMode alloc_mode = AllocMode::ARGS_RESULTS_AND_TEMPS)
+  {{CLASS}}(AllocMode alloc_mode = AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS)
       : XlaCompiledCpuFunction(StaticData(), alloc_mode) {}
 
   {{CLASS}}(const {{CLASS}}&) = delete;
@@ -496,8 +436,8 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
   // void set_argN_data(void* data)
   //   Sets the buffer of type T for positional argument N. May be called in
   //   any AllocMode. Must be called before Run to have an affect. Must be
-  //   called in AllocMode::RESULTS_AND_TEMPS_ONLY for each positional argument,
-  //   to set the argument buffers.
+  //   called in AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY for each positional
+  //   argument, to set the argument buffers.
   //
   // T* argN_data()
   //   Returns the buffer of type T for positional argument N.
@@ -543,7 +483,10 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
   static const char** StaticResultNames() {{RESULT_NAMES_CODE}}
 
   // Shape of the args and results.
-  static const xla::ProgramShape* StaticProgramShape() {{PROGRAM_SHAPE_CODE}}
+  static const xla::ProgramShape* StaticProgramShape() {
+    static const xla::ProgramShape* kShape = {{PROGRAM_SHAPE_SHIM_EXPRESSION}};
+    return kShape;
+  }
 };
 {{NS_END}}
 
@@ -560,26 +503,68 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
       {"{{ARG_SIZES}}", str_util::Join(arg_sizes, ", ")},
       {"{{CLASS}}", opts.class_name},
       {"{{ENTRY}}", compile_result.entry_point},
-      {"{{HAS_CONTEXT_ARG}}",
-       compile_result.has_context_arg ? "true" : "false"},
       {"{{INCLUDE_XLA_DATA_PROTO}}", include_xla_data_proto},
       {"{{METHODS_ARG}}\n", methods_arg},
       {"{{METHODS_RESULT}}\n", methods_result},
       {"{{NS_END}}\n", ns_end},
       {"{{NS_START}}\n", ns_start},
       {"{{PROGRAM_SHAPE}}", xla::ShapeUtil::HumanString(ps)},
-      {"{{PROGRAM_SHAPE_CODE}}", program_shape_code},
       {"{{RESULT_INDEX}}", strings::StrCat(result_index)},
       {"{{RESULT_NAMES_CODE}}", result_names_code},
       {"{{TEMP_BYTES_ALIGNED}}", strings::StrCat(temp_bytes_aligned)},
       {"{{TEMP_BYTES_TOTAL}}", strings::StrCat(temp_bytes_total)},
       {"{{TEMP_NUM}}", strings::StrCat(temp_sizes.size())},
       {"{{TEMP_SIZES}}", str_util::Join(temp_sizes, ", ")},
-  };
+      {"{{DECLS_FROM_OBJ_FILE}}",
+       str_util::Join(metadata_result.header_variable_decls, "\n")},
+      {"{{PROGRAM_SHAPE_SHIM_EXPRESSION}}",
+       metadata_result.program_shape_access_shim}};
   str_util::ReplaceAllPairs(header, rewrites);
   return Status::OK();
 }
 
+static string CreateUniqueIdentifierForProgramShape(const CodegenOpts& opts) {
+  string result = "__tfcompile";
+  for (const string& n : opts.namespaces) {
+    strings::StrAppend(&result, "_", n);
+  }
+
+  strings::StrAppend(&result, "_", opts.class_name, "_ProgramShape");
+  return result;
+}
+
+Status GenerateMetadata(const CodegenOpts& opts,
+                        const CompileResult& compile_result,
+                        MetadataResult* metadata_result) {
+  std::unique_ptr<xla::ProgramShape> program_shape;
+
+  if (opts.gen_program_shape) {
+    program_shape =
+        tensorflow::MakeUnique<xla::ProgramShape>(compile_result.program_shape);
+    // The parameter names are currently meaningless, and redundant with the
+    // rest of our metadata, so clear them out to avoid confusion and save
+    // space.
+    program_shape->clear_parameter_names();
+  }
+
+  // When asked to serialize a null protobuf, CreateEmbeddedProtocolBuffer gives
+  // a shim that evaluates to nullptr, which is what we want.
+
+  TF_ASSIGN_OR_RETURN(
+      EmbeddedProtocolBuffer embedded_program_shape,
+      CreateEmbeddedProtocolBuffer(opts.target_triple,
+                                   CreateUniqueIdentifierForProgramShape(opts),
+                                   "xla::ProgramShape", program_shape.get()));
+
+  metadata_result->program_shape_access_shim =
+      std::move(embedded_program_shape.cpp_shim_expression);
+  metadata_result->header_variable_decls.emplace_back(
+      std::move(embedded_program_shape.cpp_variable_decl));
+  metadata_result->object_file_data =
+      std::move(embedded_program_shape.object_file_data);
+  return Status::OK();
+}
+
 Status ParseCppClass(const string& cpp_class, string* class_name,
                      std::vector<string>* namespaces) {
   class_name->clear();
diff --git a/tensorflow/compiler/aot/codegen.h b/tensorflow/compiler/aot/codegen.h
index 76dd0cc3cf9470a1beb2a4725724f640aecfec7f..3430b1f96cf4d3c035b76c77ccf124c5d164751e 100644
--- a/tensorflow/compiler/aot/codegen.h
+++ b/tensorflow/compiler/aot/codegen.h
@@ -26,11 +26,15 @@ limitations under the License.
 namespace tensorflow {
 namespace tfcompile {
 
-// HeaderOpts specifies options for header-file generation.
-struct HeaderOpts {
+// CodegenOpts specifies code generation options for the generated header file
+// and the generated metadata object file.
+struct CodegenOpts {
   // The name of the generated C++ class, wrapping the generated function.
   string class_name;
 
+  // Target triple for the architecture we're targeting.
+  string target_triple;
+
   // Namespaces specifies a list of C++ namespaces to add to the generated
   // header.  If empty, all symbols will be in the global namespace.
   std::vector<string> namespaces;
@@ -42,11 +46,36 @@ struct HeaderOpts {
   bool gen_program_shape = false;
 };
 
+// Describes a generated metadata object file.
+struct MetadataResult {
+  // These are top level "extern C" declarations that are expected to be visible
+  // wherever program_shape_access_shim is emitted.
+  std::vector<string> header_variable_decls;
+
+  // program_shape_access_shim is a C++ expression that constructs the
+  // xla::ProgramShape instance for the CompileResult passed to
+  // GenerateMetadata.
+  string program_shape_access_shim;
+
+  // The contents of the object (".o") file.
+  string object_file_data;
+};
+
+// Generates a metadata object file according to `opts` and `compile_result`.
+// The generated object file is returned via `metadata_result`.
+Status GenerateMetadata(const CodegenOpts& opts,
+                        const CompileResult& compile_result,
+                        MetadataResult* metadata_result);
+
 // GenerateHeader uses the meta-information from compile_result to generate a
 // C++ header giving access to the function in the generated object file.  The
 // header includes API usage documentation.
-Status GenerateHeader(const HeaderOpts& opts, const tf2xla::Config& config,
-                      const CompileResult& compile_result, string* header);
+//
+// metadata_result is an instance of MetadataResult obtained by a previous
+// invocation to GenerateMetadata.
+Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config,
+                      const CompileResult& compile_result,
+                      const MetadataResult& metadata_result, string* header);
 
 // ParseCppClass parses `cpp_class` into its `class_name` and `namespaces`
 // components.  The syntax is [[<optional_namespace>::],...]<class_name>.  This
diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc
index 0f6114666fcc89c631434527d2ae8c92c039ffea..972b7d51ecb3798e61757ac55e973075a23b433a 100644
--- a/tensorflow/compiler/aot/codegen_test.cc
+++ b/tensorflow/compiler/aot/codegen_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "llvm/Support/TargetSelect.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -123,9 +124,39 @@ TEST_F(ParseCppClassTest, ParseFail) {
   ExpectFail("good::0bad");
 }
 
-TEST(GenerateHeader, Golden) {
-  HeaderOpts opts;
+static void CompareWithGoldenFile(
+    const string& tensorflow_relative_golden_file_name,
+    const string& expected_contents) {
+  // To update the golden file, flip update_golden to true and run the
+  // following:
+  // bazel test --test_strategy=local \
+  //   third_party/tensorflow/compiler/aot:codegen_test
+  const bool update_golden = false;
+  const string golden_file_name = io::JoinPath(
+      testing::TensorFlowSrcRoot(), tensorflow_relative_golden_file_name);
+
+  if (update_golden) {
+    TF_EXPECT_OK(
+        WriteStringToFile(Env::Default(), golden_file_name, expected_contents));
+  }
+
+  string golden_file_contents;
+  TF_ASSERT_OK(ReadFileToString(Env::Default(), golden_file_name,
+                                &golden_file_contents));
+  EXPECT_EQ(golden_file_contents, expected_contents);
+}
+
+TEST(CodegenTest, Golden) {
+  // Normally CpuCompiler::CpuCompiler does this, but in this test we've
+  // bypassed the Cpu compiler so we have to do this manually.
+  llvm::InitializeNativeTarget();
+  llvm::InitializeNativeTargetAsmPrinter();
+  LLVMInitializeX86Target();
+  LLVMInitializeX86TargetMC();
+
+  CodegenOpts opts;
   opts.class_name = "MyClass";
+  opts.target_triple = "x86_64-pc-linux";
   opts.namespaces = {"foo", "bar"};
   opts.gen_name_to_index = true;
   opts.gen_program_shape = true;
@@ -145,32 +176,27 @@ TEST(GenerateHeader, Golden) {
       {
           xla::ShapeUtil::MakeShape(xla::F32, {1, 2}),
           xla::ShapeUtil::MakeShape(xla::S64, {3, 4}),
-          xla::ShapeUtil::MakeOpaqueShape(),
       },
       xla::ShapeUtil::MakeTupleShape(
           {xla::ShapeUtil::MakeShape(xla::U32, {5, 6})}));
-  compile_result.has_context_arg = true;
   compile_result.entry_point = "entry_point";
   compile_result.pointer_size = 8;
+
+  MetadataResult metadata_result;
+  TF_ASSERT_OK(GenerateMetadata(opts, compile_result, &metadata_result));
+
+  // The other fields in metadata_result are tested as part of the generated
+  // header test.
+
+  CompareWithGoldenFile("compiler/aot/codegen_test_o.golden",
+                        metadata_result.object_file_data);
+
   string header;
-  TF_EXPECT_OK(GenerateHeader(opts, config, compile_result, &header));
+  TF_ASSERT_OK(
+      GenerateHeader(opts, config, compile_result, metadata_result, &header));
 
-  // Compare against the golden file.
-  const string golden_name = io::JoinPath(testing::TensorFlowSrcRoot(),
-                                          "compiler/aot/codegen_test_h.golden");
-  // To update the golden file, flip update_golden to true and run the
-  // following:
-  // bazel test --test_strategy=local \
-  //   third_party/tensorflow/compiler/aot:codegen_test
-  const bool update_golden = false;
-  if (update_golden) {
-    TF_EXPECT_OK(WriteStringToFile(Env::Default(), golden_name, header));
-  }
-  string golden_data;
-  TF_EXPECT_OK(ReadFileToString(Env::Default(), golden_name, &golden_data));
-  EXPECT_EQ(header, golden_data);
+  CompareWithGoldenFile("compiler/aot/codegen_test_h.golden", header);
 }
-
 }  // namespace
 }  // namespace tfcompile
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden
index 65f342ce27ef09092f252f791973f245a8cdd6f3..ac3b5873318873b5fdf41bd556a0b2abddc2b30b 100644
--- a/tensorflow/compiler/aot/codegen_test_h.golden
+++ b/tensorflow/compiler/aot/codegen_test_h.golden
@@ -19,7 +19,9 @@ namespace xla { class ExecutableRunOptions; }
 // (Implementation detail) Entry point to the function in the object file.
 extern "C" void entry_point(
     void* result, const xla::ExecutableRunOptions* run_options,
-    const void** args, void** temps);
+    const void** args, void** temps, tensorflow::int64* profile_counters);
+
+extern "C" char __tfcompile_foo_bar_MyClass_ProgramShape_protobuf_array_contents[];
 
 namespace foo {
 namespace bar {
@@ -48,7 +50,7 @@ namespace bar {
 //   is guaranteed that no thread may call a non-const method.
 //
 // The logical function signature is:
-//   ((unknown): f32[1,2], (unknown): s64[3,4], (unknown): opaque[]) -> (u32[5,6])
+//   ((unknown): f32[1,2], (unknown): s64[3,4]) -> (u32[5,6])
 //
 // Memory stats:
 //   arg bytes total:    104
@@ -58,11 +60,11 @@ namespace bar {
 class MyClass : public tensorflow::XlaCompiledCpuFunction {
  public:
   // Number of input arguments for the compiled computation.
-  static constexpr size_t kNumArgs = 3;
+  static constexpr size_t kNumArgs = 2;
 
   // Byte size of each argument buffer. There are kNumArgs entries.
   static const intptr_t* ArgSizes() {
-    static constexpr intptr_t kArgSizes[kNumArgs] = {8, 96, -1};
+    static constexpr intptr_t kArgSizes[kNumArgs] = {8, 96};
     return kArgSizes;
   }
 
@@ -77,7 +79,6 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
       data->temp_sizes = TempSizes();
       data->num_temps = kNumTemps;
       data->result_index = kResultIndex;
-      data->requires_runtime_context = true;
       data->arg_names = StaticArgNames();
       data->result_names = StaticResultNames();
       data->program_shape = StaticProgramShape();
@@ -86,7 +87,7 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
     return *kStaticData;
   }
 
-  MyClass(AllocMode alloc_mode = AllocMode::ARGS_RESULTS_AND_TEMPS)
+  MyClass(AllocMode alloc_mode = AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS)
       : XlaCompiledCpuFunction(StaticData(), alloc_mode) {}
 
   MyClass(const MyClass&) = delete;
@@ -99,8 +100,8 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
   // void set_argN_data(void* data)
   //   Sets the buffer of type T for positional argument N. May be called in
   //   any AllocMode. Must be called before Run to have an affect. Must be
-  //   called in AllocMode::RESULTS_AND_TEMPS_ONLY for each positional argument,
-  //   to set the argument buffers.
+  //   called in AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY for each positional
+  //   argument, to set the argument buffers.
   //
   // T* argN_data()
   //   Returns the buffer of type T for positional argument N.
@@ -236,12 +237,10 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
   // Shape of the args and results.
   static const xla::ProgramShape* StaticProgramShape() {
     static const xla::ProgramShape* kShape = []() {
-      static const char kProto[] = {10,12,16,11,26,2,1,2,42,4,10,2,1,0,10,12,16,5,26,2,3,4,42,4,10,2,1,0,10,2,16,14,18,16,16,13,34,12,16,8,26,2,5,6,42,4,10,2,1,0};
-      static constexpr int kProtoSize = 50;
-      xla::ProgramShape* shape = new xla::ProgramShape;
-      shape->ParseFromArray(kProto, kProtoSize);
-      return shape;
-    }();
+    xla::ProgramShape* proto = new xla::ProgramShape;
+    proto->ParseFromArray(&__tfcompile_foo_bar_MyClass_ProgramShape_protobuf_array_contents[0], 52);
+    return proto;
+  }();
     return kShape;
   }
 };
diff --git a/tensorflow/compiler/aot/codegen_test_o.golden b/tensorflow/compiler/aot/codegen_test_o.golden
new file mode 100644
index 0000000000000000000000000000000000000000..eb001c5d45bdfefc76629d7303d89f5480432235
Binary files /dev/null and b/tensorflow/compiler/aot/codegen_test_o.golden differ
diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc
index 2b8cc6024cb85e4f6269313927ff66d1d9a1cf79..c87f2b75dfa18ad5c3eda4bd6fcbcb3083ef73fd 100644
--- a/tensorflow/compiler/aot/compile.cc
+++ b/tensorflow/compiler/aot/compile.cc
@@ -94,9 +94,8 @@ Status CompileGraph(const GraphDef& graph_def, const tf2xla::Config& config,
       xla::ClientLibrary::GetOrCreateCompileOnlyClient(cpu_platform)
           .ValueOrDie();
   xla::Computation computation;
-  TF_RETURN_IF_ERROR(ConvertGraphDefToXla(graph_def, config, client,
-                                          &computation,
-                                          &compile_result->has_context_arg));
+  TF_RETURN_IF_ERROR(
+      ConvertGraphDefToXla(graph_def, config, client, &computation));
   if (!flags.out_session_module.empty()) {
     TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::SessionModule> module,
                         computation.Snapshot());
diff --git a/tensorflow/compiler/aot/compile.h b/tensorflow/compiler/aot/compile.h
index 965c2960816b3acc8d2209e6824d88647de0ce14..e03c5b1aa77c1262ed903aae3072ef65f34d80a2 100644
--- a/tensorflow/compiler/aot/compile.h
+++ b/tensorflow/compiler/aot/compile.h
@@ -34,7 +34,6 @@ struct CompileResult {
   // Contains object file and meta-info.
   std::unique_ptr<xla::cpu::CpuAotCompilationResult> aot;
   xla::ProgramShape program_shape;  // Static shape of args and results.
-  bool has_context_arg = false;     // Is last arg XlaLocalRuntimeContext?
   string entry_point;               // Name of generated function.
   int pointer_size = 0;             // Size of a pointer in bytes.
 };
diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.cc b/tensorflow/compiler/aot/embedded_protocol_buffers.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6489929a576d6469c4ff1358ca5ee9d27fb578bb
--- /dev/null
+++ b/tensorflow/compiler/aot/embedded_protocol_buffers.cc
@@ -0,0 +1,158 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/aot/embedded_protocol_buffers.h"
+
+#include <memory>
+#include <string>
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/ExecutionEngine/ObjectMemoryBuffer.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "tensorflow/compiler/tf2xla/str_util.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/util.h"
+
+namespace tensorflow {
+namespace tfcompile {
+
+using xla::llvm_ir::AsStringRef;
+
+static std::unique_ptr<llvm::Module> CreateModuleWithEmbeddedProtocolBuffer(
+    llvm::LLVMContext* llvm_context, llvm::TargetMachine* target_machine,
+    const ::tensorflow::protobuf::MessageLite& proto,
+    StringPiece unique_identifier, string* protobuf_array_symbol_name,
+    int64* protobuf_array_size) {
+  string protobuf_array_contents = proto.SerializeAsString();
+  *protobuf_array_symbol_name =
+      strings::StrCat(unique_identifier, "_protobuf_array_contents");
+  *protobuf_array_size = protobuf_array_contents.size();
+
+  std::unique_ptr<llvm::Module> module =
+      MakeUnique<llvm::Module>("embedded_data_module", *llvm_context);
+
+  llvm::Constant* protobuf_array_initializer =
+      llvm::ConstantDataArray::getString(*llvm_context,
+                                         AsStringRef(protobuf_array_contents),
+                                         /*AddNull=*/false);
+  new llvm::GlobalVariable(
+      *module, protobuf_array_initializer->getType(),
+      /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage,
+      protobuf_array_initializer, AsStringRef(*protobuf_array_symbol_name));
+
+  return module;
+}
+
+static string CreateCPPShimExpression(StringPiece qualified_cpp_protobuf_name,
+                                      StringPiece protobuf_array_symbol_name,
+                                      int64 protobuf_array_size) {
+  string code =
+      "[]() {\n"
+      "    {{PROTOBUF_NAME}}* proto = new {{PROTOBUF_NAME}};\n"
+      "    proto->ParseFromArray(&{{ARRAY_SYMBOL}}[0], {{ARRAY_SIZE}});\n"
+      "    return proto;\n"
+      "  }()";
+
+  str_util::ReplaceAllPairs(
+      &code,
+      {
+          {"{{ARRAY_SYMBOL}}", strings::StrCat(protobuf_array_symbol_name)},
+          {"{{ARRAY_SIZE}}", strings::StrCat(protobuf_array_size)},
+          {"{{PROTOBUF_NAME}}", strings::StrCat(qualified_cpp_protobuf_name)},
+      });
+  return code;
+}
+
+static StatusOr<string> CodegenModule(llvm::TargetMachine* target_machine,
+                                      std::unique_ptr<llvm::Module> module) {
+  llvm::SmallVector<char, 0> stream_buffer;
+  llvm::raw_svector_ostream ostream(stream_buffer);
+  llvm::legacy::PassManager codegen_passes;
+
+  if (target_machine->addPassesToEmitFile(
+          codegen_passes, ostream, llvm::TargetMachine::CGFT_ObjectFile)) {
+    return xla::InternalError(
+        "Could not create pass pipeline to generate object file");
+  }
+
+  codegen_passes.run(*module);
+
+  return string(stream_buffer.begin(), stream_buffer.end());
+}
+
+static StatusOr<std::unique_ptr<llvm::TargetMachine>>
+GetTargetMachineFromTriple(StringPiece target_triple) {
+  std::string error;
+  std::string normalized_triple =
+      llvm::Triple::normalize(AsStringRef(target_triple));
+  const llvm::Target* target =
+      llvm::TargetRegistry::lookupTarget(normalized_triple, error);
+  if (target == nullptr) {
+    return xla::InternalError("TargetRegistry::lookupTarget failed: %s",
+                              error.c_str());
+  }
+
+  return WrapUnique(target->createTargetMachine(
+      normalized_triple, /*CPU=*/"",
+      /*Features=*/"", llvm::TargetOptions(), llvm::None));
+}
+
+StatusOr<EmbeddedProtocolBuffer> CreateEmbeddedProtocolBuffer(
+    StringPiece target_triple, StringPiece symbol_prefix,
+    StringPiece qualified_cpp_protobuf_name,
+    const ::tensorflow::protobuf::MessageLite* proto) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<llvm::TargetMachine> target_machine,
+                      GetTargetMachineFromTriple(target_triple));
+
+  llvm::LLVMContext llvm_context;
+  string object_file, cpp_shim, cpp_variable_decl;
+
+  if (proto) {
+    string protobuf_array_symbol_name;
+    int64 protobuf_array_size;
+
+    std::unique_ptr<llvm::Module> module_with_serialized_proto =
+        CreateModuleWithEmbeddedProtocolBuffer(
+            &llvm_context, target_machine.get(), *proto, symbol_prefix,
+            &protobuf_array_symbol_name, &protobuf_array_size);
+    TF_ASSIGN_OR_RETURN(object_file,
+                        CodegenModule(target_machine.get(),
+                                      std::move(module_with_serialized_proto)));
+    cpp_shim = CreateCPPShimExpression(qualified_cpp_protobuf_name,
+                                       protobuf_array_symbol_name,
+                                       protobuf_array_size);
+
+    cpp_variable_decl = strings::StrCat("extern \"C\" char ",
+                                        protobuf_array_symbol_name, "[];");
+  } else {
+    TF_ASSIGN_OR_RETURN(
+        object_file,
+        CodegenModule(target_machine.get(),
+                      MakeUnique<llvm::Module>("empty_module", llvm_context)));
+    cpp_shim = "nullptr";
+  }
+
+  return {{cpp_shim, cpp_variable_decl, object_file}};
+}
+
+}  // namespace tfcompile
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.h b/tensorflow/compiler/aot/embedded_protocol_buffers.h
new file mode 100644
index 0000000000000000000000000000000000000000..8436e0ff67f352a24e3d16b46f16c1ad2f3a5957
--- /dev/null
+++ b/tensorflow/compiler/aot/embedded_protocol_buffers.h
@@ -0,0 +1,73 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file defines utilities to help "embed" protocol buffers into object
+// (".o") files.  These C++ binaries and shared objects can link in these .o to
+// get access to said protocol buffers at runtime.
+
+#ifndef TENSORFLOW_COMPILER_AOT_EMBEDDED_PROTOCOL_BUFFERS_H_
+#define TENSORFLOW_COMPILER_AOT_EMBEDDED_PROTOCOL_BUFFERS_H_
+
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+namespace tfcompile {
+using xla::StatusOr;
+
+// Represents a protocol buffer embedded into an object file and describes a way
+// to access it at runtime.
+struct EmbeddedProtocolBuffer {
+  // cpp_shim_expression is a C++ expression that creates an instance of said
+  // protocol buffer when executed.
+  string cpp_shim_expression;
+
+  // cpp_variable_decl is an "extern C" array declaration that is used in
+  // cpp_shim_expression.  It must be visible wherever cpp_shim_expression is
+  // emitted.
+  string cpp_variable_decl;
+
+  // The contents of the object (".o") file the protocol buffer is embbed in.
+  // This needs to be linked in to any program that wants to execute
+  // cpp_variable_decl .
+  string object_file_data;
+};
+
+// Creates an object file that contains `proto`.
+//
+// `proto` is allowed to be nullptr, in which case the generated C++ shim
+// expression is just `nullptr`, and the generated object file does not define
+// any symbols.
+//
+// `target_triple` is the target triple for the target architecture for the
+// generated object file.
+//
+// `symbol_prefix` is prefix that is guaranteed to be unique across the binary
+// or DSO the generated object file will be linked into.
+//
+// `qualified_cpp_protobuf_name` is a qualified ("qualified" as in C++
+// namespace qualified) protocol buffer name.  This needs is only used in
+// EmbeddedProtocolBuffer::cpp_shim_expression so relatively qualified
+// names are fine as long as they're valid wherever cpp_shim_expression
+// is emitted.
+StatusOr<EmbeddedProtocolBuffer> CreateEmbeddedProtocolBuffer(
+    StringPiece target_triple, StringPiece symbol_prefix,
+    StringPiece qualified_cpp_protobuf_name,
+    const ::tensorflow::protobuf::MessageLite* proto);
+
+}  // namespace tfcompile
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_AOT_EMBEDDED_PROTOCOL_BUFFERS_H_
diff --git a/tensorflow/compiler/aot/flags.cc b/tensorflow/compiler/aot/flags.cc
index 7c2f27e550d44c2487f91acf1029c962ac3f5d01..8c95cb8f90ee031fdbb97fabd9d86f848b42e4c5 100644
--- a/tensorflow/compiler/aot/flags.cc
+++ b/tensorflow/compiler/aot/flags.cc
@@ -59,8 +59,13 @@ void AppendMainFlags(std::vector<Flag>* flag_list, MainFlags* flags) {
        "namespaces may precede the class name, separated by double-colons.  "
        "The class will be generated in the given namespace(s), or if no "
        "namespaces are given, within the global namespace."},
-      {"out_object", &flags->out_object, "Output object file name."},
+      {"out_function_object", &flags->out_function_object,
+       "Output object file containing the generated function for the "
+       "TensorFlow model."},
       {"out_header", &flags->out_header, "Output header file name."},
+      {"out_metadata_object", &flags->out_metadata_object,
+       "Output object file name containing optional metadata for the generated "
+       "function."},
       {"out_session_module", &flags->out_session_module,
        "Output session module proto."},
       {"gen_name_to_index", &flags->gen_name_to_index,
diff --git a/tensorflow/compiler/aot/flags.h b/tensorflow/compiler/aot/flags.h
index 3519659e3af7cd345f30080a07ce91fb858623fb..d266fbead61f7eb43863d1c67c0f86926ae9452d 100644
--- a/tensorflow/compiler/aot/flags.h
+++ b/tensorflow/compiler/aot/flags.h
@@ -34,7 +34,8 @@ struct MainFlags {
   string target_features;
   string entry_point;
   string cpp_class;
-  string out_object;
+  string out_function_object;
+  string out_metadata_object;
   string out_header;
   string out_session_module;
 
diff --git a/tensorflow/compiler/aot/runtime_test.cc b/tensorflow/compiler/aot/runtime_test.cc
index ac79c278c1fdf8b6aedcb52121c767b8ba0ad358..6d603a02eb4ceade6832ba67b2981814ee25327a 100644
--- a/tensorflow/compiler/aot/runtime_test.cc
+++ b/tensorflow/compiler/aot/runtime_test.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/aot/runtime.h"
 
-#include "tensorflow/compiler/tf2xla/xla_local_runtime_context.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/platform/test.h"
 
diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD
index 7dfd49cc3b92f83fd64ca62bd2230938ce2d0a65..28aab6eb614ca7123d9e00f7f5cc3661b62e23f7 100644
--- a/tensorflow/compiler/aot/tests/BUILD
+++ b/tensorflow/compiler/aot/tests/BUILD
@@ -74,7 +74,9 @@ tf_library(
     # compile but the others in this directory succeed, you may need to
     # expand the "required by all tf_library targets" list in tfcompile.bzl.
     include_standard_runtime_deps = False,
-    tags = ["manual"],
+    tags = [
+        "manual",
+    ],
 )
 
 tf_library(
@@ -84,7 +86,9 @@ tf_library(
     cpp_class = "AddWithCkptComp",
     freeze_checkpoint = "test_graph_tfadd_with_ckpt.ckpt",
     graph = "test_graph_tfadd_with_ckpt.pb",
-    tags = ["manual"],
+    tags = [
+        "manual",
+    ],
 )
 
 tf_library(
@@ -95,7 +99,9 @@ tf_library(
     freeze_checkpoint = "test_graph_tfadd_with_ckpt_saver.ckpt",
     freeze_saver = "test_graph_tfadd_with_ckpt_saver.saver",
     graph = "test_graph_tfadd_with_ckpt_saver.pb",
-    tags = ["manual"],
+    tags = [
+        "manual",
+    ],
 )
 
 tf_library(
@@ -104,7 +110,9 @@ tf_library(
     config = "test_graph_tffunction.config.pbtxt",
     cpp_class = "FunctionComp",
     graph = "test_graph_tffunction.pb",
-    tags = ["manual"],
+    tags = [
+        "manual",
+    ],
 )
 
 tf_library(
@@ -113,7 +121,9 @@ tf_library(
     config = "test_graph_tfgather.config.pbtxt",
     cpp_class = "GatherComp",
     graph = "test_graph_tfgather.pb",
-    tags = ["manual"],
+    tags = [
+        "manual",
+    ],
 )
 
 tf_library(
@@ -122,7 +132,9 @@ tf_library(
     config = "test_graph_tfmatmul.config.pbtxt",
     cpp_class = "foo::bar::MatMulComp",
     graph = "test_graph_tfmatmul.pb",
-    tags = ["manual"],
+    tags = [
+        "manual",
+    ],
 )
 
 tf_library(
@@ -131,7 +143,9 @@ tf_library(
     config = "test_graph_tfmatmulandadd.config.pbtxt",
     cpp_class = "MatMulAndAddComp",
     graph = "test_graph_tfmatmulandadd.pb",
-    tags = ["manual"],
+    tags = [
+        "manual",
+    ],
     tfcompile_flags = "--gen_name_to_index --gen_program_shape",
 )
 
@@ -141,13 +155,17 @@ tf_library(
     config = "test_graph_tfsplits.config.pbtxt",
     cpp_class = "SplitsComp",
     graph = "test_graph_tfsplits.pb",
-    tags = ["manual"],
+    tags = [
+        "manual",
+    ],
 )
 
 tf_cc_test(
     name = "tfcompile_test",
     srcs = ["tfcompile_test.cc"],
-    tags = ["manual"],
+    tags = [
+        "manual",
+    ],
     deps = [
         ":test_graph_tfadd",
         ":test_graph_tfadd_with_ckpt",
diff --git a/tensorflow/compiler/aot/tests/make_test_graphs.py b/tensorflow/compiler/aot/tests/make_test_graphs.py
index a898eab1d1ab0eb5d55983bf366753c968887296..89c7cd4507cbd476104a039d6083d8f89de11278 100644
--- a/tensorflow/compiler/aot/tests/make_test_graphs.py
+++ b/tensorflow/compiler/aot/tests/make_test_graphs.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
+import os
 import sys
 
 from tensorflow.core.protobuf import saver_pb2
@@ -53,7 +54,7 @@ def tfadd_with_ckpt(out_dir):
     sess.run(init_op)
     sess.run(y.assign(y + 42))
     # Without the checkpoint, the variable won't be set to 42.
-    ckpt = '%s/test_graph_tfadd_with_ckpt.ckpt' % out_dir
+    ckpt = os.path.join(out_dir, 'test_graph_tfadd_with_ckpt.ckpt')
     saver.save(sess, ckpt)
 
 
@@ -68,10 +69,10 @@ def tfadd_with_ckpt_saver(out_dir):
     sess.run(init_op)
     sess.run(y.assign(y + 42))
     # Without the checkpoint, the variable won't be set to 42.
-    ckpt_file = '%s/test_graph_tfadd_with_ckpt_saver.ckpt' % out_dir
+    ckpt_file = os.path.join(out_dir, 'test_graph_tfadd_with_ckpt_saver.ckpt')
     saver.save(sess, ckpt_file)
     # Without the SaverDef, the restore op won't be named correctly.
-    saver_file = '%s/test_graph_tfadd_with_ckpt_saver.saver' % out_dir
+    saver_file = os.path.join(out_dir, 'test_graph_tfadd_with_ckpt_saver.saver')
     with open(saver_file, 'wb') as f:
       f.write(saver.as_saver_def().SerializeToString())
 
@@ -129,7 +130,7 @@ def write_graph(build_graph, out_dir):
   g = ops.Graph()
   with g.as_default():
     build_graph(out_dir)
-    filename = '%s/test_graph_%s.pb' % (out_dir, build_graph.__name__)
+    filename = os.path.join(out_dir, 'test_graph_%s.pb' % build_graph.__name__)
     with open(filename, 'wb') as f:
       f.write(g.as_graph_def().SerializeToString())
 
diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc
index 6b037f276ad1d6771b904bb970f45f32ae9531b8..413efd9cea3b6f71574615ad9ca92471ff925781 100644
--- a/tensorflow/compiler/aot/tests/tfcompile_test.cc
+++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc
@@ -70,7 +70,7 @@ TEST(TFCompileTest, Add) {
 // Run tests that use set_argN_data separately, to avoid accidentally re-using
 // non-existent buffers.
 TEST(TFCompileTest, Add_SetArg) {
-  AddComp add(AddComp::AllocMode::RESULTS_AND_TEMPS_ONLY);
+  AddComp add(AddComp::AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY);
 
   int32 arg_x = 10;
   int32 arg_y = 32;
@@ -258,7 +258,7 @@ TEST(TFCompileTest, MatMul2_SetArg) {
   Eigen::ThreadPoolDevice device(&tp, tp.NumThreads());
 
   foo::bar::MatMulComp matmul(
-      foo::bar::MatMulComp::AllocMode::RESULTS_AND_TEMPS_ONLY);
+      foo::bar::MatMulComp::AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY);
   matmul.set_thread_pool(&device);
 
   // Test using the set_argN_data() methods.
diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl
index 6c385af3b36df78b3f674b3464d68d904ca92907..9dff1be09fede6f65f82c2f36d94be07e781949f 100644
--- a/tensorflow/compiler/aot/tfcompile.bzl
+++ b/tensorflow/compiler/aot/tfcompile.bzl
@@ -4,7 +4,7 @@
 
 To use from your BUILD file, add the following line to load the macro:
 
-load("@org_tensorflow//tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
+load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
 
 Then call the macro like this:
 
@@ -16,14 +16,15 @@ tf_library(
 )
 """
 
-load("@org_tensorflow//tensorflow:tensorflow.bzl", "if_android", "tf_copts")
+load("//tensorflow:tensorflow.bzl",
+     "if_android", "tf_cc_test", "tf_copts")
 
 def tf_library(name, graph, config,
                freeze_checkpoint=None, freeze_saver=None,
                cpp_class=None, gen_test=True, gen_benchmark=True,
                visibility=None, testonly=None,
                tfcompile_flags=None,
-               tfcompile_tool="@org_tensorflow//tensorflow/compiler/aot:tfcompile",
+               tfcompile_tool="//tensorflow/compiler/aot:tfcompile",
                include_standard_runtime_deps=True, deps=None, tags=None):
   """Runs tfcompile to compile a TensorFlow graph into executable code.
 
@@ -102,6 +103,7 @@ def tf_library(name, graph, config,
 
     # Now run freeze_graph to convert variables into constants.
     freeze_args = (" --input_graph=$(location " + graph + ")" +
+                   " --checkpoint_version=1" +
                    " --input_binary=" + str(not graph.endswith(".pbtxt")) +
                    " --input_checkpoint=$(location " + freeze_checkpoint + ")" +
                    " --output_graph=$(location " + freeze_file + ")" +
@@ -119,16 +121,17 @@ def tf_library(name, graph, config,
             out_nodes_file,
         ] + freeze_saver_srcs,
         outs=[freeze_file],
-        cmd=("$(location @org_tensorflow//tensorflow/python/tools:freeze_graph)" +
+        cmd=("$(location //tensorflow/python/tools:freeze_graph)" +
              freeze_args),
-        tools=["@org_tensorflow//tensorflow/python/tools:freeze_graph"],
+        tools=["//tensorflow/python/tools:freeze_graph"],
         tags=tags,
     )
     tfcompile_graph = freeze_file
 
   # Rule that runs tfcompile to produce the header and object file.
   header_file = name + ".h"
-  object_file = name + ".o"
+  metadata_object_file = name + "_tfcompile_metadata.o"
+  function_object_file = name + "_tfcompile_function.o"
   ep = ("__" + PACKAGE_NAME + "__" + name).replace("/", "_")
   if type(tfcompile_flags) == type(""):
     flags = tfcompile_flags
@@ -142,7 +145,8 @@ def tf_library(name, graph, config,
       ],
       outs=[
           header_file,
-          object_file,
+          metadata_object_file,
+          function_object_file,
       ],
       cmd=("$(location " + tfcompile_tool + ")" +
            " --graph=$(location " + tfcompile_graph + ")" +
@@ -151,7 +155,8 @@ def tf_library(name, graph, config,
            " --cpp_class=" + cpp_class +
            " --target_triple=" + target_llvm_triple() +
            " --out_header=$(@D)/" + header_file +
-           " --out_object=$(@D)/" + object_file +
+           " --out_metadata_object=$(@D)/" + metadata_object_file +
+           " --out_function_object=$(@D)/" + function_object_file +
            " " + flags),
       tools=[tfcompile_tool],
       visibility=visibility,
@@ -202,7 +207,7 @@ def tf_library(name, graph, config,
   need_xla_data_proto = (flags and flags.find("--gen_program_shape") != -1)
   native.cc_library(
       name=name,
-      srcs=[object_file],
+      srcs=[function_object_file, metadata_object_file],
       hdrs=[header_file],
       visibility=visibility,
       testonly=testonly,
@@ -210,22 +215,19 @@ def tf_library(name, graph, config,
           # These deps are required by all tf_library targets even if
           # include_standard_runtime_deps is False.  Without them, the
           # generated code will fail to compile.
-          "@org_tensorflow//tensorflow/compiler/tf2xla:xla_compiled_cpu_function",
-          "@org_tensorflow//tensorflow/core:framework_lite",
+          "//tensorflow/compiler/tf2xla:xla_compiled_cpu_function",
+          "//tensorflow/core:framework_lite",
       ] + (need_xla_data_proto and [
           # If we're generating the program shape, we must depend on the proto.
-          "@org_tensorflow//tensorflow/compiler/xla:xla_data_proto",
+          "//tensorflow/compiler/xla:xla_data_proto",
       ] or []) + (include_standard_runtime_deps and [
           # TODO(cwhipkey): only depend on kernel code that the model actually needed.
-          "@org_tensorflow//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_1d",
-          "@org_tensorflow//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_2d",
-          "@org_tensorflow//tensorflow/compiler/xla/service/cpu:cpu_runtime_avx",
-          "@org_tensorflow//tensorflow/compiler/xla/service/cpu:cpu_runtime_neon",
-          "@org_tensorflow//tensorflow/compiler/xla/service/cpu:cpu_runtime_sse4_1",
-          "@org_tensorflow//tensorflow/compiler/xla/service/cpu:runtime_conv2d",
-          "@org_tensorflow//tensorflow/compiler/xla/service/cpu:runtime_matmul",
-          "@org_tensorflow//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_conv2d",
-          "@org_tensorflow//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul",
+          "//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_1d",
+          "//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_2d",
+          "//tensorflow/compiler/xla/service/cpu:runtime_conv2d",
+          "//tensorflow/compiler/xla/service/cpu:runtime_matmul",
+          "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_conv2d",
+          "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul",
           "//third_party/eigen3",
       ] or []) + (deps or []),
       tags=tags,
@@ -251,29 +253,32 @@ def tf_library(name, graph, config,
         name=("gen_" + test_name),
         testonly=1,
         srcs=[
-            "@org_tensorflow//tensorflow/compiler/aot:test.cc",
+            "//tensorflow/compiler/aot:test.cc",
             header_file,
         ],
         outs=[test_file],
         cmd=("sed " + sed_replace +
-             " $(location @org_tensorflow//tensorflow/compiler/aot:test.cc) " +
+             " $(location //tensorflow/compiler/aot:test.cc) " +
              "> $(OUTS)"),
         tags=tags,
     )
 
-    # The cc_test rule for the generated code.
-    native.cc_test(
+    # The cc_test rule for the generated code.  To ensure that this works
+    # reliably across build configurations, we must use tf_cc_test instead of
+    # native.cc_test.  This is related to how we build
+    # //tensorflow/core:lib -- see the note in tensorflow/core/BUILD
+    # for more details.
+    tf_cc_test(
         name=test_name,
         srcs=[test_file],
         deps=[
             ":" + name,
-            "@org_tensorflow//tensorflow/compiler/tf2xla:xla_local_runtime_context",
-            "@org_tensorflow//tensorflow/compiler/aot:runtime",
-            "@org_tensorflow//tensorflow/compiler/aot:tf_library_test_main",
-            "@org_tensorflow//tensorflow/compiler/xla:executable_run_options",
+            "//tensorflow/compiler/aot:runtime",
+            "//tensorflow/compiler/aot:tf_library_test_main",
+            "//tensorflow/compiler/xla:executable_run_options",
             "//third_party/eigen3",
-            "@org_tensorflow//tensorflow/core:lib",
-            "@org_tensorflow//tensorflow/core:test",
+            "//tensorflow/core:lib",
+            "//tensorflow/core:test",
             ],
         tags=tags,
     )
@@ -281,7 +286,7 @@ def tf_library(name, graph, config,
   if gen_benchmark:
     benchmark_name = name + "_benchmark"
     benchmark_file = benchmark_name + ".cc"
-    benchmark_main = ("@org_tensorflow//tensorflow/compiler/aot:" +
+    benchmark_main = ("//tensorflow/compiler/aot:" +
                       "benchmark_main.template")
 
     # Rule to rewrite benchmark.cc to produce the benchmark_file.
@@ -299,7 +304,9 @@ def tf_library(name, graph, config,
         tags=tags,
     )
 
-    # The cc_benchmark rule for the generated code.
+    # The cc_benchmark rule for the generated code.  This does not need the
+    # tf_cc_binary since we (by deliberate design) do not depend on
+    # //tensorflow/core:lib.
     #
     # Note: to get smaller size on android for comparison, compile with:
     #    --copt=-fvisibility=hidden
@@ -313,13 +320,12 @@ def tf_library(name, graph, config,
         linkopts = if_android(["-pie", "-s"]),
         deps=[
             ":" + name,
-            "@org_tensorflow//tensorflow/compiler/tf2xla:xla_local_runtime_context",
-            "@org_tensorflow//tensorflow/compiler/aot:benchmark",
-            "@org_tensorflow//tensorflow/compiler/aot:runtime",
-            "@org_tensorflow//tensorflow/compiler/xla:executable_run_options",
+            "//tensorflow/compiler/aot:benchmark",
+            "//tensorflow/compiler/aot:runtime",
+            "//tensorflow/compiler/xla:executable_run_options",
             "//third_party/eigen3",
         ] + if_android([
-            "@org_tensorflow//tensorflow/compiler/aot:benchmark_extra_android",
+            "//tensorflow/compiler/aot:benchmark_extra_android",
         ]),
         tags=tags,
     )
@@ -329,11 +335,11 @@ def target_llvm_triple():
   # TODO(toddw): Add target_triple for other targets.  For details see:
   # http://llvm.org/docs/doxygen/html/Triple_8h_source.html
   return select({
-      "@org_tensorflow//tensorflow:android_armeabi": "armv5-none-android",
-      "@org_tensorflow//tensorflow:android_arm": "armv7-none-android",
-      "@org_tensorflow//tensorflow:android_arm64": "aarch64-none-android",
-      "@org_tensorflow//tensorflow:android_x86": "i686-none-android",
-      "@org_tensorflow//tensorflow:linux_ppc64le": "ppc64le-ibm-linux-gnu",
-      "@org_tensorflow//tensorflow:darwin": "x86_64-none-darwin",
+      "//tensorflow:android_armeabi": "armv5-none-android",
+      "//tensorflow:android_arm": "armv7-none-android",
+      "//tensorflow:android_arm64": "aarch64-none-android",
+      "//tensorflow:android_x86": "i686-none-android",
+      "//tensorflow:linux_ppc64le": "ppc64le-ibm-linux-gnu",
+      "//tensorflow:darwin": "x86_64-none-darwin",
       "//conditions:default": "x86_64-pc-linux",
   })
diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc
index 6ab3d474187c7df2131f94c9f42f0d0f2f9d99d7..e2f01179d4e2e4f6ef72b2761d06e130ffa3a94f 100644
--- a/tensorflow/compiler/aot/tfcompile_main.cc
+++ b/tensorflow/compiler/aot/tfcompile_main.cc
@@ -91,19 +91,26 @@ Status Main(const MainFlags& flags) {
   // Write output files.
   Env* env = Env::Default();
   const std::vector<char>& obj = compile_result.aot->object_file_data();
-  TF_RETURN_IF_ERROR(WriteStringToFile(env, flags.out_object,
+  TF_RETURN_IF_ERROR(WriteStringToFile(env, flags.out_function_object,
                                        StringPiece(obj.data(), obj.size())));
-  HeaderOpts header_opts;
-  header_opts.gen_name_to_index = flags.gen_name_to_index;
-  header_opts.gen_program_shape = flags.gen_program_shape;
+  CodegenOpts codegen_opts;
+  codegen_opts.gen_name_to_index = flags.gen_name_to_index;
+  codegen_opts.gen_program_shape = flags.gen_program_shape;
+  codegen_opts.target_triple = flags.target_triple;
   if (flags.cpp_class.empty()) {
     return errors::InvalidArgument("Must specify --cpp_class");
   }
-  TF_RETURN_IF_ERROR(ParseCppClass(flags.cpp_class, &header_opts.class_name,
-                                   &header_opts.namespaces));
-  string header;
+  TF_RETURN_IF_ERROR(ParseCppClass(flags.cpp_class, &codegen_opts.class_name,
+                                   &codegen_opts.namespaces));
+
+  MetadataResult metadata_result;
   TF_RETURN_IF_ERROR(
-      GenerateHeader(header_opts, config, compile_result, &header));
+      GenerateMetadata(codegen_opts, compile_result, &metadata_result));
+  TF_RETURN_IF_ERROR(WriteStringToFile(env, flags.out_metadata_object,
+                                       metadata_result.object_file_data));
+  string header;
+  TF_RETURN_IF_ERROR(GenerateHeader(codegen_opts, config, compile_result,
+                                    metadata_result, &header));
   TF_RETURN_IF_ERROR(WriteStringToFile(env, flags.out_header, header));
   return Status::OK();
 }
@@ -114,7 +121,8 @@ Status Main(const MainFlags& flags) {
 int main(int argc, char** argv) {
   tensorflow::tfcompile::MainFlags flags;
   flags.target_triple = "x86_64-pc-linux";
-  flags.out_object = "out.o";
+  flags.out_function_object = "out_model.o";
+  flags.out_metadata_object = "out_helper.o";
   flags.out_header = "out.h";
   flags.entry_point = "entry";
 
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index bf7d9cf14d10f41aa48ea594a8d63db97b9973e1..a711319607f4ff2b83aa0ebe50e215b3d0e2258e 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -110,19 +110,6 @@ cc_library(
     alwayslink = True,
 )
 
-# Internal targets below this point.
-
-cc_library(
-    name = "common",
-    srcs = [
-        "defs.cc",
-    ],
-    hdrs = [
-        "defs.h",
-    ],
-    visibility = [":friends"],
-)
-
 cc_library(
     name = "xla_device",
     srcs = [
@@ -135,6 +122,8 @@ cc_library(
         "xla_device_context.h",
         "xla_device_ops.h",
     ],
+    # Public visibility is needed for external TF/XLA backends.
+    visibility = ["//visibility:public"],
     deps = [
         ":common",
         ":jit_compilation_passes",
@@ -164,6 +153,19 @@ cc_library(
     ],
 )
 
+# Internal targets below this point.
+
+cc_library(
+    name = "common",
+    srcs = [
+        "defs.cc",
+    ],
+    hdrs = [
+        "defs.h",
+    ],
+    visibility = [":friends"],
+)
+
 cc_library(
     name = "xla_compilation_cache",
     srcs = ["xla_compilation_cache.cc"],
@@ -215,7 +217,6 @@ cc_library(
         ":common",
         ":compilation_passes",
         "//tensorflow/compiler/jit/kernels:xla_launch_op",
-        "//tensorflow/compiler/tf2xla:const_analysis",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
@@ -245,12 +246,13 @@ cc_library(
         "//tensorflow/compiler/jit/legacy_flags:mark_for_compilation_pass_flags",
         "//tensorflow/compiler/jit/ops:parallel_check_op",
         "//tensorflow/compiler/jit/ops:xla_ops",
-        "//tensorflow/compiler/tf2xla:const_analysis",
         "//tensorflow/compiler/tf2xla:dump_graph",
         "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index 22899ebeebc929055518893b358f7950d380d6f6..9c372a012789fc25ca0a711349c09ca62edc6754 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -16,22 +16,30 @@ limitations under the License.
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
 
 #include <functional>
+#include <memory>
 #include <numeric>
+#include <string>
+#include <unordered_map>
+#include <vector>
 
 #include "tensorflow/compiler/jit/graph_to_functiondef.h"
 #include "tensorflow/compiler/jit/legacy_flags/encapsulate_subgraphs_pass_flags.h"
 #include "tensorflow/compiler/jit/mark_for_compilation_pass.h"
 #include "tensorflow/compiler/tf2xla/const_analysis.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/tensor_id.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -48,19 +56,75 @@ const char* const kXlaNumResourceArgsAttr = "_XlaNumResourceArgs";
 
 namespace {
 
+bool AreAllParentsConst(const Node& n,
+                        const gtl::FlatSet<const Node*>& runtime_const_nodes) {
+  if (n.type_string() == "GuaranteeConst" || n.type_string() == "Const") {
+    // If the current node is itself a cast-to-const, no need
+    // to look at the incoming edges.
+    return true;
+  }
+
+  bool all_parents_const = true;
+  bool atleast_one_non_control_edge = false;
+  for (const Edge* in : n.in_edges()) {
+    atleast_one_non_control_edge =
+        atleast_one_non_control_edge || !in->IsControlEdge();
+    if (!in->IsControlEdge() && runtime_const_nodes.count(in->src()) == 0) {
+      all_parents_const = false;
+      break;
+    }
+  }
+  return all_parents_const && atleast_one_non_control_edge;
+}
+
+void MarkGuaranteedConstants(
+    const Graph& graph,
+    const std::vector<std::pair<const Node*, Node*>>& src_arg_pairs) {
+  gtl::FlatSet<const Node*> guaranteed_const_nodes;
+  std::vector<const Node*> srcs;
+  srcs.reserve(src_arg_pairs.size());
+  for (const auto& src_arg : src_arg_pairs) {
+    srcs.push_back(src_arg.first);
+  }
+  ReverseDFSFrom(graph, srcs, /*enter=*/nullptr,
+                 /*leave=*/[&guaranteed_const_nodes](const Node* n) {
+                   // TODO(vinuraja): Doesn't work in the presence of loops.
+                   if (AreAllParentsConst(*n, guaranteed_const_nodes)) {
+                     guaranteed_const_nodes.insert(n);
+                   }
+                 });
+
+  for (auto& src_arg : src_arg_pairs) {
+    if (guaranteed_const_nodes.count(src_arg.first) != 0) {
+      VLOG(1) << "Guaranteed const found: " << src_arg.first->DebugString();
+      src_arg.second->AddAttr("_is_guaranteed_constant", true);
+    }
+  }
+}
+
 // A node/slot pair.
 // TODO(phawkins): is there a common definition of this?
 struct NodeSlot {
-  NodeSlot() : node(nullptr), slot(-1) {}
-  NodeSlot(const Node* node, int slot) : node(node), slot(slot) {}
+  NodeSlot() : node(nullptr), slot(-1), dtype(DT_INVALID) {}
+  NodeSlot(const Node* node, int slot)
+      : node(node), slot(slot), dtype(DT_INVALID) {}
+  NodeSlot(const Node* node, int slot, DataType dtype)
+      : node(node), slot(slot), dtype(dtype) {}
 
   const Node* node;
   int slot;
 
+  // Optional: used to record the destination type of a source NodeSlot in case
+  // the source output is a Ref type that is cast to a Tensor at the
+  // destination.
+  DataType dtype;
+
   bool operator==(const NodeSlot& other) const {
-    return node == other.node && slot == other.slot;
+    return node == other.node && slot == other.slot && dtype == other.dtype;
   }
 
+  // Leave dtype out of the hash since there are never two NodeSlots with the
+  // same node and slot and different dtypes.
   struct Hasher {
     uint64 operator()(NodeSlot const& s) const {
       return Hash64Combine(std::hash<const Node*>()(s.node),
@@ -75,10 +139,22 @@ struct NodeSlot {
   };
 };
 
+// TODO(phawkins) add a canonical copy of these operator names and refactor
+// everything to use it.
+static const char* const kArgOp = "_Arg";
+static const char* const kRetValOp = "_Retval";
+static const char* const kHostComputeOp = "_XlaHostCompute";
+static const char* const kSendFromHostOp = "_XlaSendFromHost";
+static const char* const kRecvAtHostOp = "_XlaRecvAtHost";
+
 class Encapsulator {
  public:
-  Encapsulator(string group_attribute, Graph const* graph_in)
-      : group_attribute_(std::move(group_attribute)), graph_in_(graph_in) {}
+  Encapsulator(string group_attribute, string outside_compilation_attribute,
+               Graph const* graph_in)
+      : group_attribute_(std::move(group_attribute)),
+        outside_compilation_attribute_(
+            std::move(outside_compilation_attribute)),
+        graph_in_(graph_in) {}
 
   // Find subgraphs marked with 'group_attribute', and build a new
   // subgraph, one for each value of 'group_attribute'.
@@ -96,57 +172,419 @@ class Encapsulator {
 
   // Write a copy of the input graph to 'graph_out', where the subgraphs are
   // replaced with calls to the new functions.
-  Status BuildOutputGraph(bool parallel_checking, Graph* graph_out);
+  Status BuildOutputGraph(bool parallel_checking, Graph* graph_out,
+                          FunctionLibraryDefinition* library);
 
  private:
-  // Returns the key attribute associated with a node. Returns the empty string
-  // if no key attribute is found.
-  string GetFunctionNameAttr(const Node* node) const;
-
   // A subgraph of the input, all marked with a common 'group_attribute'
-  // value.
-  struct Subgraph {
+  // value. A subgraph may contain multiple `outside_compilation' clusters.
+  //
+  // In the following simple example, A, B, ..., E are nodes in the original
+  // graph. The group attributes and outside_compilation attributes g and oc are
+  // each shown as either 0 or empty.
+  //
+  //  A  -->  B  -->  C  -->  D  -->  E
+  //  g:      g:0     g:0     g:0     g:
+  //  oc:     oc:     oc:0    oc:     oc:
+  //
+  // The example is rewritten to two graphs; one on the host and one to be
+  // compiled. The host graph is as follows. RAH is a RecvAtHost node receiving
+  // input from the compiled cluster, and SFH is a SendFromHost node sending
+  // input back to the compiled cluster. Dotted edges are control edges. A
+  // 'sequencing' node S is inserted, and both RAH and SFH are connected via S
+  // to E (and in general all nodes that depend on nodes in the compiled
+  // cluster) to ensure that they are not pruned.
+  //
+  //  A  -->  Call  -->  E
+  //                     ^
+  //                     .
+  //           ........> S
+  //       ....          ^
+  //     ..             .
+  //  RAH -->  C  --> SFH
+  //
+  // The compiled cluster is as follows. HC is a HostCompute node which is the
+  // source of a channel to the RAH node above and the destination of a channel
+  // from the SFH node above.
+  //
+  //  Arg  --> B  --> HC  --> D --> Retval
+  //
+  // The channels HC/RAH and SFH/HC each transmit multiple tensors, so there is
+  // at most one RAH and SFH in each outside_compilation cluster. This design is
+  // preferred over adding separate Arg/Retval nodes for each transmitted value
+  // because it allows optimizations to the host code that would like to limit
+  // communication between host and device and, e.g., raise only one interrupt
+  // per channel rather than one per transmitted value.
+  //
+  // The shapes of the outputs from the HC node in general cannot be determined
+  // until the shapes of its inputs are known at compile time, since e.g.,
+  // above, the shape of C's outputs aren't known until the shape of its inputs
+  // are known. If the shapes of the HC's outputs can be determined during the
+  // rewrite, they are stored in the node's 'shapes' attr. Otherwise a minimal
+  // graph is stored in the shape_inference_graph attr. This graph can be used
+  // when compiling the HC Op to determined the shape of the SFH inputs given
+  // the shapes of any ancestor RAH outputs. If it can be determined that the
+  // shape of the SFH inputs will not be inferrable even once the shapes of the
+  // RAH outputs are known, an error is returned by the rewriter.
+  class Subgraph {
+   public:
+    // Creates a graph to build the subgraph in, if it doesn't already exist,
+    // using the same op registry and versions as graph_in.
+    Node* MakeNodeImage(const Graph* graph_in, Node* node);
+
+    // Returns the graph the subgraph is being built in.
+    Graph* GetGraph() const;
+
+    // Builds a FunctionDef, and adds it to 'library'. The value of the
+    // 'group_attribute' annotations becomes the function name.  If
+    // 'reuse_existing_functions' is set, use an existing function with the same
+    // name, if any.  If 'rewrite_subgraph_fn' is set, it is applied to the
+    // subgraph before function conversion.
+    Status BuildFunctionDef(const string& name_in,
+                            const RewriteSubgraphFn& rewrite_subgraph_fn,
+                            bool reuse_existing_functions,
+                            FunctionLibraryDefinition* library);
+
+    // Adds the function call node to graph_out.
+    Status AddFunctionCallNode(
+        const std::unordered_map<const Node*, Node*>& node_images,
+        bool parallel_checking, Graph* graph_out);
+
+    // Adds _RecvAtHost and _SendFromHost nodes, where needed, to graph_out.
+    Status AddOutsideCompilationHostIONodes(
+        const string& subgraph_name,
+        const std::unordered_map<const Node*, Node*>& node_images,
+        Graph* graph_out);
+
+    // Returns the names of all the outside_compilation subgraphs in this
+    // Subgraph.
+    void GetOutsideCompilationSubgraphNames(std::vector<string>* names) const;
+
+    // Returns the Node that inputs to the function should be wired up to.
+    Node* GetCallNodeForInputs() const;
+
+    // Returns the Node that outputs to the function should be wired up to.
+    Node* GetCallNodeForOutputs() const;
+
+    // Returns the index of the arg that the dst of edge should connect to.
+    int GetArgIndexForEdge(const Edge* edge) const;
+
+    // Returns the index of the result that the src of edge should connect to.
+    int GetResultIndexForEdge(const Edge* edge) const;
+
+    // Returns the RecvAtHost node for an outside_compilation subgraph.
+    Node* GetRecvAtHostNode(
+        const string& outside_compilation_subgraph_name) const;
+
+    // Returns the output slot for the RecvAtHost node that corresponds to the
+    // source of edge in an outside_compilation subgraph.
+    int GetRecvAtHostSlot(const string& outside_compilation_subgraph_name,
+                          const Edge* edge) const;
+
+    // Returns the SendFromHost node for an outside_compilation subgraph.
+    Node* GetSendFromHostNode(
+        const string& outside_compilation_subgraph_name) const;
+
+    // Returns the input slot for the SendFromHost node that corresponds to the
+    // destination of edge in an outside_compilation subgraph.
+    int GetSendFromHostSlot(const string& outside_compilation_subgraph_name,
+                            const Edge* edge) const;
+
+    // Creates an _Arg node for the src node of edge, and add its index to
+    // args_by_src_, if none exists yet. Also adds its index to args_by_dst_,
+    // and adds the edge within the subgraph from the _Arg node to the image of
+    // the dst node.
+    Status RecordArg(const Edge* edge,
+                     const std::unordered_map<const Node*, Node*>& node_images,
+                     std::vector<std::pair<const Node*, Node*>>* src_arg_pairs);
+
+    // Creates a _Retval node for the src node of edge, and add it to results_,
+    // if none exists yet. If a new _Retval node is created, also adds the edge
+    // within the subgraph from the src to the _Retval node.
+    Status RecordResult(
+        const Edge* edge,
+        const std::unordered_map<const Node*, Node*>& node_images);
+
+    // Creates an outside_compilation subgraph for outside_compilation_id if
+    // none exists yet. Creates an entry for the src node of edge in the list of
+    // inputs for the outside_compilation subgraph, if none exists yet.
+    void RecordOutsideCompilationInputOrControl(
+        const string& outside_compilation_id, const Edge* edge);
+
+    // Creates an outside_compilation subgraph for outside_compilation_id if
+    // none exists yet. Creates an entry for the src node of edge in the list of
+    // outputs by src for the outside_compilation subgraph, if none exists
+    // yet. Creates an entry for the dst node of edge in the list of outputs by
+    // dst for the outside_compilation subgraph.
+    void RecordOutsideCompilationOutputOrControl(
+        const string& outside_compilation_id, const Edge* edge);
+
+    // Adds the HostCompute nodes for each outside_compilation subgraph.
+    Status AddHostComputes(
+        const string& subgraph_name,
+        const std::unordered_map<const Node*, Node*>& node_images);
+
+    // Creates the sequencer node if it doesn't exist, adding it to graph_out.
+    Status MakeSequencingNode(const string& subgraph_name, Graph* graph_out);
+
+    // If there is a sequencer node, adds a control edge from the sequencer to
+    // all the downstream nodes of call_node_outputs.
+    void ConnectSequencerToOutputs(Graph* graph_out);
+
+    Status AddShapeInferenceInfo(
+        const string& outside_compilation_subgraph_name,
+        const std::vector<TensorShapeProto>& shapes, GraphDef* inference_graph);
+
+    Status ReplaceFunctionDef(FunctionLibraryDefinition* library);
+
+   private:
+    struct OutsideCompilationSubgraph {
+      // Map from source (producer node/slot) tensors in the original graph to
+      // input index (slot number in the HostCompute/RecvAtHost nodes that will
+      // be created) for the outside_compilation subgraph.
+      std::unordered_map<NodeSlot, int, NodeSlot::Hasher> inputs;
+
+      // Set of nodes in the original graph that are the source of control edges
+      // that cross from the containing compiled subgraph into the
+      // outside_compilation subgraph. These are recorded by
+      // RecordOutsideCompilationInputOrControl while walking all the subgraph
+      // edges, and lifted control edges within the subgraph are added by
+      // AddSendsToOutsideCompilation once the _HostCompute node has been
+      // created. The matching control edge from _RecvAtHost to the
+      // destination is added by CopyEdgeToOutputGraph.
+      std::unordered_set<const Node*> control_inputs;
+
+      // Maps from source (producer node/slot) and destination (consumer
+      // node/slot) tensors in the original graph to output index (slot number
+      // in the SendFromHost/HostCompute nodes that will be created) for the
+      // outside_compilation subgraph.
+      std::unordered_map<NodeSlot, int, NodeSlot::Hasher> outputs_by_src;
+      std::unordered_map<NodeSlot, int, NodeSlot::Hasher> outputs_by_dst;
+
+      // Set of nodes in the original graph that are the destination of control
+      // edges that cross from the outside_compilation subgraph into the
+      // containing compiled subgraph. These are recorded by
+      // RecordOutsideCompilationOutputOrControl while walking all the subgraph
+      // edges, and lifted control edges within the subgraph are added by
+      // AddRecvsFromToOutsideCompilation once the _HostCompute node has been
+      // created. The matching control edge from the source to _SendFromHost to
+      // the destination is added by CopyEdgeToOutputGraph.
+      std::unordered_set<const Node*> control_outputs;
+
+      // Name of the _HostCompute node in the subgraph.
+      string host_compute_name;
+
+      // _RecvAtHost node in the output graph. Not owned.
+      Node* recv_at_host = nullptr;
+
+      // _SendFromHost node in the output graph. Not owned.
+      Node* send_from_host = nullptr;
+    };
+
+    // Builds a ParallelCheck op that compares the output of the original
+    // subgraph with the encapsulated subgraph.
+    Status BuildParallelCheckOp(
+        const std::unordered_map<const Node*, Node*>& node_images,
+        Graph* graph_out);
+
+    // Builds a _RecvAtHost node producing all the inputs of an
+    // outside_compilation subgraph and stores it in oc_subgraph.recv_at_host.
+    Status AddRecvAtHostNode(const string& subgraph_name,
+                             const string& oc_subgraph_name,
+                             OutsideCompilationSubgraph* oc_subgraph,
+                             Graph* graph_out);
+
+    // Builds a _SendFromHost node consuming all the outputs of an
+    // outside_compilation subgraph and stores it in oc_subgraph.send_from_host.
+    Status AddSendFromHostNode(
+        const std::unordered_map<const Node*, Node*>& node_images,
+        const string& subgraph_name, const string& oc_subgraph_name,
+        OutsideCompilationSubgraph* oc_subgraph, Graph* graph_out);
+
     // The subgraph extracted from the input graph, suitable for being turned
     // into a FunctionDef. Inputs are fed by _Arg nodes, and outputs are
     // returned by _Retval nodes.
-    std::unique_ptr<Graph> graph;
+    std::unique_ptr<Graph> graph_;
 
     // Which device are these nodes on? Used to assign a device to the call
     // node.
-    string device;
+    string device_;
 
     // NodeDef for the function call node.
-    NodeDef call_node_def;
+    NodeDef call_node_def_;
 
     // Function call node(s) in the output graph. Not owned.
     // If parallel_checking is enabled, 'call_node_inputs' is the function call
     // node to which inputs should be fed, and 'call_node_outputs' is the
     // parallel check op from which outputs should be read. If parallel checking
     // is disabled, both point to the function call node.
-    Node* call_node_inputs;
-    Node* call_node_outputs;
+    Node* call_node_inputs_;
+    Node* call_node_outputs_;
 
     // Maps from source (producer node/slot) and destination
     // (consumer node/slot) tensors in the input graph to _Arg numbers in
     // the subgraph. The source map is one-to-one, whereas the dest map may be
     // many-to-one.
-    std::unordered_map<NodeSlot, int, NodeSlot::Hasher> args_by_src;
-    std::unordered_map<NodeSlot, int, NodeSlot::Hasher> args_by_dst;
+    std::unordered_map<NodeSlot, int, NodeSlot::Hasher> args_by_src_;
+    std::unordered_map<NodeSlot, int, NodeSlot::Hasher> args_by_dst_;
 
     // The _Arg nodes in the subgraph, in order by argument number.
-    std::vector<Node*> args;
+    std::vector<Node*> args_;
 
     // Map from source tensor in the input graph to result #.
-    std::unordered_map<NodeSlot, int, NodeSlot::Hasher> results;
+    std::unordered_map<NodeSlot, int, NodeSlot::Hasher> results_;
+
+    // The outside_compilation clusters in this subgraph.
+    std::unordered_map<string, OutsideCompilationSubgraph>
+        outside_compilation_subgraphs_;
+
+    // NoOp node in the output graph that is sequenced after the call node and
+    // used to prevent host-side outside_compilation sends and recvs from being
+    // pruned.
+    Node* sequencer_ = nullptr;
   };
 
-  // Builds a ParallelCheck op that compares the output of the original subgraph
-  // with the encapsulated subgraph.
-  Status BuildParallelCheckOp(
+  // Returns the key attribute and outside_compilation attribute associated
+  // with a node in attr, and outside_compilation_attr, respectively. Sets
+  // either result to the empty string if the respective attribute is not
+  // found. Returns error status if there is an outside_compilation attribute
+  // and no key attribute,
+  Status GetFunctionNameAttr(Node const* node, string* attr,
+                             string* outside_compilation_attr) const;
+
+  // Copies edges local to a subgraph. Adds _Arg and _Retval nodes to
+  // subgraphs for data edges that cross subgraph boundaries.
+  Status CopySubgraphEdges(
+      const std::unordered_map<const Node*, Node*>& node_images,
+      std::vector<std::pair<const Node*, Node*>>* src_arg_pairs);
+
+  // Copies all marked nodes to a subgraph. Does nothing for unmarked nodes,
+  // or nodes marked outside_compilation.
+  Status CopySubgraphNodes(std::unordered_map<const Node*, Node*>* node_images);
+
+  // Copies all nodes that aren't in a compiled subgraph to the output graph.
+  Status CopyNodesToOutputGraph(
+      bool parallel_checking, Graph* graph_out,
+      std::unordered_map<const Node*, Node*>* node_images);
+
+  // Adds function call nodes for each compiled subgraph.
+  Status AddFunctionCallNodes(
+      const std::unordered_map<const Node*, Node*>& node_images,
+      bool parallel_checking, Graph* graph_out);
+
+  // Adds _RecvAtHost and _SendFromHost nodes, where needed, for all
+  // outside_compilation subgraphs.
+  Status AddOutsideCompilationHostIONodes(
+      const std::unordered_map<const Node*, Node*>& node_images,
+      Graph* graph_out);
+
+  // Finds the image of an edge source in the output graph. If the edge crosses
+  // a subgraph boundary it is the output of a call node, otherwise it is a node
+  // in the output graph.
+  Status FindOutputImageOfEdgeSrc(
+      const string& src_func_id, const string& src_outside_compilation_id,
+      const string& dst_func_id, const string& dst_outside_compilation_id,
+      const std::unordered_map<const Node*, Node*>& node_images,
+      const Node* original_src_node, Node** src_image);
+
+  // Finds an edge source slot in the output graph. If the edge crosses a
+  // subgraph boundary it is a slot on the output of a call node or a
+  // _RecvAtHost node, otherwise it is a slot on a node in the output graph.
+  int FindOutputSlotOfEdgeSrc(const string& src_func_id,
+                              const string& src_outside_compilation_id,
+                              const string& dst_func_id,
+                              const string& dst_outside_compilation_id,
+                              const Edge* edge);
+
+  // Finds the image of an edge destination in the output graph. If the edge
+  // crosses a subgraph boundary it is the input of a call node or a
+  // _SendFromHost node, otherwise it is a node in the output graph.
+  Status FindOutputImageOfEdgeDst(
+      const string& src_func_id, const string& src_outside_compilation_id,
+      const string& dst_func_id, const string& dst_outside_compilation_id,
+      const std::unordered_map<const Node*, Node*>& node_images,
+      const Node* original_dst_node, Node** dst_image);
+
+  // Finds an edge destination slot in the output graph. If the edge crosses a
+  // subgraph boundary it is a slot on the input of a call node or a
+  // _SendFromHost node, otherwise it is a slot on a node in the output graph.
+  int FindOutputSlotOfEdgeDst(const string& src_func_id,
+                              const string& src_outside_compilation_id,
+                              const string& dst_func_id,
+                              const string& dst_outside_compilation_id,
+                              const Edge* edge);
+
+  // Copies a single edge to the output graph. The edge is either entirely
+  // within the output graph, or crosses into or out of a compiled subgraph.
+  Status CopyEdgeToOutputGraph(
+      const Edge* edge, const string& src_func_id,
+      const string& src_outside_compilation_id, const string& dst_func_id,
+      const string& dst_outside_compilation_id,
       const std::unordered_map<const Node*, Node*>& node_images,
-      const Subgraph& subgraph, Graph* graph_out, Node** parallel_check_op);
+      bool parallel_checking, Graph* graph_out,
+      std::unordered_set<std::pair<NodeSlot, NodeSlot>, NodeSlot::PairHasher>*
+          edges_added);
+
+  // Adds all edges to the output graph.
+  Status AddEdgesToOutputGraph(
+      const std::unordered_map<const Node*, Node*>& node_images,
+      bool parallel_checking, Graph* graph_out);
+
+  // Constructs a minimal shape inference graph that can be used to determine
+  // the shape of send_node at the time that the subgraph is compiled.
+  // recv_at_host_nodes contains the names of all the recv_at_host nodes that
+  // send_node might depend on. These recv_at_host nodes have shapes that are
+  // not known during the rewrite pass, but will be known at compile time.
+  //
+  // If the shapes of all the inputs to send_node can be determined during the
+  // rewrite pass, on exit graphdef_out is empty and the shapes are returned in
+  // static_shape_out. Otherwise graphdef_out contains a graph that can be used
+  // for shape inference at compile time, where all the source nodes of the
+  // graph are either constants with known shapes, or nodes named in
+  // recv_at_host_nodes.
+  //
+  // A non-OK status is returned if neither of the above conditions can be
+  // satisfied, e.g., because send_node depends on a node that doesn't have a
+  // registered shape inference function.
+  Status DoStaticShapeInferenceForOutsideCompilationSend(
+      const Graph& graph_in, const ShapeRefiner& shape_refiner,
+      const std::unordered_set<string>& recv_at_host_nodes, Node* send_node,
+      FunctionLibraryDefinition* library,
+      std::vector<TensorShapeProto>* static_shape_out,
+      std::unique_ptr<GraphDef>* graphdef_out);
+
+  // Makes a copy of graph containing only nodes that are ancestors of at least
+  // one node in send_from_host_nodes and store it in pruned_graph. On exit
+  // nodes_images contains a mapping from nodes in graph to nodes in
+  // pruned_graph. All functions in the copied graph are inlined.
+  Status MakePrunedGraphCopyAndInline(
+      const Graph& graph, const std::vector<Node*>& sink_nodes,
+      std::unique_ptr<Graph>* pruned_graph,
+      std::unordered_map<const Node*, Node*>* node_images,
+      FunctionLibraryDefinition* library);
+
+  // Makes a copy of graph containing only nodes that are ancestors of a
+  // send_from_host node in an outside_compilation subgraph, and store it in
+  // pruned_graph. Also perform shape inference on the pruned graph, using
+  // shape_refiner. On exit node_images contains a mapping from nodes in graph
+  // to nodes in pruned_graph.
+  Status MakeGraphForOutsideCompilationSends(
+      const Graph& graph, std::unique_ptr<Graph>* pruned_graph,
+      ShapeRefiner* shape_refiner,
+      std::unordered_map<const Node*, Node*>* node_images,
+      FunctionLibraryDefinition* library);
+
+  // Performs static shape inference, as far as possible, for the send_from_host
+  // nodes in each outside_compilation subgraph. Where it is not possible to
+  // determine the shape statically, stores a serialized GraphDef in the
+  // HostCompute 'shape_inference_graph' attr, to be used at compile time for
+  // final inference. If the shapes are known statically they are stored in the
+  // HostCompute 'shapes' attr.
+  Status GetShapeInfoForOutsideCompilationSends(
+      Graph* graph_out, FunctionLibraryDefinition* library);
 
   const string group_attribute_;
+  const string outside_compilation_attribute_;
   const Graph* graph_in_;
 
   std::unordered_map<string, Subgraph> subgraphs_;
@@ -154,224 +592,401 @@ class Encapsulator {
   TF_DISALLOW_COPY_AND_ASSIGN(Encapsulator);
 };
 
-// TODO(phawkins) add a canonical copy of these operator names and refactor
-// everything to use it.
-static const char* const kArgOp = "_Arg";
-static const char* const kRetValOp = "_Retval";
+Node* Encapsulator::Subgraph::GetCallNodeForInputs() const {
+  return call_node_inputs_;
+}
 
-// Returns the function name attached to 'node', or the empty string if there is
-// none.
-string Encapsulator::GetFunctionNameAttr(Node const* node) const {
-  string attr;
-  if (!GetNodeAttr(node->attrs(), group_attribute_, &attr).ok()) {
-    attr.clear();
-  }
-  return attr;
+Node* Encapsulator::Subgraph::GetCallNodeForOutputs() const {
+  return call_node_outputs_;
 }
 
-Status Encapsulator::SplitIntoSubgraphs() {
-  Status s;
+int Encapsulator::Subgraph::GetArgIndexForEdge(const Edge* edge) const {
+  return args_by_dst_.at(NodeSlot(edge->dst(), edge->dst_input()));
+}
 
-  // Map from input graph nodes to subgraph nodes.
-  std::unordered_map<Node*, Node*> node_images;
+int Encapsulator::Subgraph::GetResultIndexForEdge(const Edge* edge) const {
+  return results_.at(NodeSlot(edge->src(), edge->src_output()));
+}
 
-  // Copy all marked nodes to a subgraph. Do nothing for unmarked nodes.
-  for (Node* node : graph_in_->op_nodes()) {
-    string func_id = GetFunctionNameAttr(node);
-    if (func_id.empty()) continue;
+Node* Encapsulator::Subgraph::GetRecvAtHostNode(
+    const string& outside_compilation_subgraph_name) const {
+  return outside_compilation_subgraphs_.at(outside_compilation_subgraph_name)
+      .recv_at_host;
+}
 
-    Subgraph& subgraph = subgraphs_[func_id];
-    if (!subgraph.graph) {
-      subgraph.graph.reset(new Graph(graph_in_->op_registry()));
-      subgraph.graph->set_versions(graph_in_->versions());
-    }
+int Encapsulator::Subgraph::GetRecvAtHostSlot(
+    const string& outside_compilation_subgraph_name, const Edge* edge) const {
+  return outside_compilation_subgraphs_.at(outside_compilation_subgraph_name)
+      .inputs.at(NodeSlot(edge->src(), edge->src_output()));
+}
 
-    Node* image = subgraph.graph->CopyNode(node);
-    image->ClearAttr(group_attribute_);
-    node_images[node] = image;
+Node* Encapsulator::Subgraph::GetSendFromHostNode(
+    const string& outside_compilation_subgraph_name) const {
+  return outside_compilation_subgraphs_.at(outside_compilation_subgraph_name)
+      .send_from_host;
+}
 
-    if (subgraph.device.empty()) {
-      subgraph.device = node->assigned_device_name().empty()
-                            ? node->requested_device()
-                            : node->assigned_device_name();
-    }
+int Encapsulator::Subgraph::GetSendFromHostSlot(
+    const string& outside_compilation_subgraph_name, const Edge* edge) const {
+  return outside_compilation_subgraphs_.at(outside_compilation_subgraph_name)
+      .outputs_by_dst.at(NodeSlot(edge->dst(), edge->dst_input()));
+}
+
+Node* Encapsulator::Subgraph::MakeNodeImage(const Graph* graph_in, Node* node) {
+  if (!graph_) {
+    graph_.reset(new Graph(graph_in->op_registry()));
+    graph_->set_versions(graph_in->versions());
   }
 
-  // Copy edges local to a subgraph. Add _Arg and _Retval nodes to subgraphs for
-  // data edges that cross subgraph boundaries.
-  for (const Edge* edge : graph_in_->edges()) {
-    string src_func_id = GetFunctionNameAttr(edge->src());
-    string dst_func_id = GetFunctionNameAttr(edge->dst());
-    Node* src_image = gtl::FindWithDefault(node_images, edge->src(), nullptr);
-    Node* dst_image = gtl::FindWithDefault(node_images, edge->dst(), nullptr);
+  if (device_.empty()) {
+    device_ = node->assigned_device_name().empty()
+                  ? node->requested_device()
+                  : node->assigned_device_name();
+  }
 
-    // Copy edges that are local to a subgraph.
-    if (!src_func_id.empty() && src_func_id == dst_func_id) {
-      Graph* g = subgraphs_[src_func_id].graph.get();
-      if (edge->IsControlEdge()) {
-        g->AddControlEdge(src_image, dst_image);
-      } else {
-        g->AddEdge(src_image, edge->src_output(), dst_image, edge->dst_input());
-      }
-      continue;
-    }
+  return graph_->CopyNode(node);
+}
 
-    // Ignore cross-boundary control edges for right now. We will lift them
-    // onto the enclosing call operators in BuildOutputGraph().
-    if (edge->IsControlEdge()) continue;
+Graph* Encapsulator::Subgraph::GetGraph() const { return graph_.get(); }
+
+Status Encapsulator::Subgraph::RecordArg(
+    const Edge* edge, const std::unordered_map<const Node*, Node*>& node_images,
+    std::vector<std::pair<const Node*, Node*>>* src_arg_pairs) {
+  Node* src_node = edge->src();
+  int src_slot = edge->src_output();
+  std::unordered_map<NodeSlot, int, NodeSlot::Hasher>::iterator iter;
+  bool inserted;
+  std::tie(iter, inserted) =
+      args_by_src_.emplace(NodeSlot(src_node, src_slot), args_by_src_.size());
+  int arg_index = iter->second;
+  if (inserted) {
+    NodeDef arg_def;
+    NodeDefBuilder builder(
+        strings::StrCat(src_node->name(), "_", src_slot, "_arg"), kArgOp);
+    DataType dtype = edge->dst()->input_type(edge->dst_input());
+    builder.Attr("T", dtype);
+    builder.Attr("index", arg_index);
+    Status s = builder.Finalize(&arg_def);
+    if (!s.ok()) return s;
 
-    // Add 'src' as an output of its subgraph, if applicable.
-    if (!src_func_id.empty()) {
-      Subgraph& src_subgraph = subgraphs_[src_func_id];
-      int ret_index = src_subgraph.results.size();
-      if (src_subgraph.results
-              .emplace(NodeSlot(edge->src(), edge->src_output()), ret_index)
-              .second) {
-        // Create a new _Retval node
-        DataType dtype = edge->src()->output_type(edge->src_output());
+    Node* arg = graph_->AddNode(arg_def, &s);
+    if (!s.ok()) return s;
 
-        if (IsRefType(dtype)) {
-          return errors::InvalidArgument(
-              "Ref Tensors (e.g., Variables) are not supported: tensor ",
-              edge->src()->name(), ":", edge->src_output());
-        }
+    src_arg_pairs->push_back({src_node, arg});
+    args_.push_back(arg);
+  }
+  Node* dst_node = edge->dst();
+  Node* dst_image = node_images.at(dst_node);
+  int dst_slot = edge->dst_input();
+  args_by_dst_[NodeSlot(dst_node, dst_slot)] = arg_index;
+  graph_->AddEdge(args_[arg_index], 0, dst_image, dst_slot);
+  return Status::OK();
+}
+
+Status Encapsulator::Subgraph::RecordResult(
+    const Edge* edge,
+    const std::unordered_map<const Node*, Node*>& node_images) {
+  Node* src_node = edge->src();
+  Node* src_image = node_images.at(src_node);
+  int src_slot = edge->src_output();
+  std::unordered_map<NodeSlot, int, NodeSlot::Hasher>::iterator iter;
+  bool inserted;
+  std::tie(iter, inserted) =
+      results_.emplace(NodeSlot(src_node, src_slot), results_.size());
+  int ret_index = iter->second;
+  if (inserted) {
+    NodeDef ret_def;
+    NodeDefBuilder builder(
+        strings::StrCat(src_node->name(), "_", src_slot, "_retval"), kRetValOp);
+    DataType dtype = src_node->output_type(src_slot);
+    builder.Attr("T", dtype);
+    builder.Attr("index", ret_index);
+    builder.Input(src_image->name(), src_slot, dtype);
+    Status s = builder.Finalize(&ret_def);
+    if (!s.ok()) return s;
+    Node* ret = graph_->AddNode(ret_def, &s);
+    if (!s.ok()) return s;
 
-        NodeDef ret_def;
-        ret_def.set_op(kRetValOp);
-        ret_def.set_name(strings::StrCat(edge->src()->name(), "_",
-                                         edge->src_output(), "_retval"));
-        AddNodeAttr("T", dtype, &ret_def);
-        AddNodeAttr("index", ret_index, &ret_def);
-        Node* ret = src_subgraph.graph->AddNode(ret_def, &s);
-        if (!s.ok()) return s;
-
-        // Add an edge from 'src' to _Retval.
-        src_subgraph.graph->AddEdge(src_image, edge->src_output(), ret, 0);
+    graph_->AddEdge(src_image, src_slot, ret, 0);
+  }
+  return Status::OK();
+}
+
+void Encapsulator::Subgraph::RecordOutsideCompilationInputOrControl(
+    const string& outside_compilation_id, const Edge* edge) {
+  auto iter = outside_compilation_subgraphs_
+                  .emplace(outside_compilation_id, OutsideCompilationSubgraph())
+                  .first;
+  OutsideCompilationSubgraph& outside_subgraph = iter->second;
+  if (edge->IsControlEdge()) {
+    outside_subgraph.control_inputs.insert(edge->src());
+  } else {
+    int input_index = outside_subgraph.inputs.size();
+    outside_subgraph.inputs.emplace(NodeSlot(edge->src(), edge->src_output()),
+                                    input_index);
+  }
+}
+
+void Encapsulator::Subgraph::RecordOutsideCompilationOutputOrControl(
+    const string& outside_compilation_id, const Edge* edge) {
+  auto subgraph_iter =
+      outside_compilation_subgraphs_
+          .emplace(outside_compilation_id, OutsideCompilationSubgraph())
+          .first;
+  OutsideCompilationSubgraph& outside_subgraph = subgraph_iter->second;
+  if (edge->IsControlEdge()) {
+    outside_subgraph.control_outputs.insert(edge->dst());
+  } else {
+    DataType dtype = edge->dst()->input_type(edge->dst_input());
+    auto output_iter =
+        outside_subgraph.outputs_by_src
+            .emplace(NodeSlot(edge->src(), edge->src_output(), dtype),
+                     outside_subgraph.outputs_by_src.size())
+            .first;
+    int output_index = output_iter->second;
+    outside_subgraph.outputs_by_dst[NodeSlot(edge->dst(), edge->dst_input())] =
+        output_index;
+  }
+}
+
+Status Encapsulator::Subgraph::AddHostComputes(
+    const string& subgraph_name,
+    const std::unordered_map<const Node*, Node*>& node_images) {
+  for (auto& oc_subgraph_iter : outside_compilation_subgraphs_) {
+    const string& oc_subgraph_name = oc_subgraph_iter.first;
+    OutsideCompilationSubgraph& oc_subgraph = oc_subgraph_iter.second;
+    if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty() ||
+        !oc_subgraph.outputs_by_src.empty() ||
+        !oc_subgraph.control_outputs.empty()) {
+      // Build a _HostCompute node.
+      std::vector<NodeDefBuilder::NodeOut> inputs(oc_subgraph.inputs.size());
+      std::vector<DataType> input_dtypes(oc_subgraph.inputs.size(), DT_INVALID);
+      std::vector<DataType> output_dtypes(oc_subgraph.outputs_by_src.size(),
+                                          DT_INVALID);
+
+      for (const auto& input_src : oc_subgraph.inputs) {
+        const Node* src_node = input_src.first.node;
+        Node* src_image = node_images.at(src_node);
+        int src_slot = input_src.first.slot;
+        int input_index = input_src.second;
+
+        DataType dtype = src_node->output_type(src_slot);
+        inputs[input_index].Reset(src_image->name(), src_slot, dtype);
+        input_dtypes[input_index] = dtype;
       }
-    }
 
-    // Add 'dst' as an input of its subgraph, if applicable.
-    if (!dst_func_id.empty()) {
-      Subgraph& dst_subgraph = subgraphs_[dst_func_id];
+      for (const auto& output : oc_subgraph.outputs_by_src) {
+        DataType dtype = output.first.dtype;
+        int output_index = output.second;
+        output_dtypes[output_index] = dtype;
+      }
 
-      // Create an _Arg node for this tensor, if none exists yet.
-      std::unordered_map<NodeSlot, int, NodeSlot::Hasher>::iterator iter;
-      bool inserted;
-      std::tie(iter, inserted) = dst_subgraph.args_by_src.emplace(
-          NodeSlot(edge->src(), edge->src_output()), dst_subgraph.args.size());
-      int arg_index = iter->second;
-      if (inserted) {
-        // This is the first time we have seen this tensor. Create an _Arg node.
-        DataType dtype = edge->dst()->input_type(edge->dst_input());
+      NodeDef host_compute_def;
+      NodeDefBuilder builder(strings::StrCat("outside_compilation_",
+                                             oc_subgraph_name, "_host_compute"),
+                             kHostComputeOp);
+      builder.Input(inputs);
+      builder.Attr("Tinputs", input_dtypes);
+      builder.Attr("Toutputs", output_dtypes);
+      builder.Attr("key",
+                   strings::StrCat("host_compute_channel_", subgraph_name, "_",
+                                   oc_subgraph_name));
+      Status s = builder.Finalize(&host_compute_def);
+      if (!s.ok()) return s;
+
+      Node* host_compute = graph_->AddNode(host_compute_def, &s);
+      if (!s.ok()) return s;
+      oc_subgraph.host_compute_name = host_compute->name();
+
+      // Connect the _HostCompute node to its producers in the subgraph.
+      for (auto& input_src : oc_subgraph.inputs) {
+        const Node* src_node = input_src.first.node;
+        Node* src_image = node_images.at(src_node);
+        int src_slot = input_src.first.slot;
+        int input_index = input_src.second;
+        graph_->AddEdge(src_image, src_slot, host_compute, input_index);
+      }
 
-        if (IsRefType(dtype)) {
-          return errors::InvalidArgument(
-              "Ref Tensors (e.g., Variables) are not supported: tensor ",
-              edge->src()->name(), ":", edge->src_output());
-        }
+      // Connect the _HostCompute node to its control edge producers in the
+      // subgraph.
+      for (const auto& src_node : oc_subgraph.control_inputs) {
+        Node* src_image = node_images.at(src_node);
+        graph_->AddControlEdge(src_image, host_compute);
+      }
 
-        NodeDef arg_def;
-        NodeDefBuilder builder(strings::StrCat(edge->src()->name(), "_",
-                                               edge->src_output(), "_arg"),
-                               kArgOp);
-        builder.Attr("T", dtype);
-        builder.Attr("index", arg_index);
-        s = builder.Finalize(&arg_def);
-        if (!s.ok()) return s;
+      // Connect the consumers in the subgraph to the _HostCompute node.
+      for (const auto& output : oc_subgraph.outputs_by_dst) {
+        const Node* dst_node = output.first.node;
+        Node* dst_image = node_images.at(dst_node);
+        int dst_slot = output.first.slot;
+        int output_index = output.second;
 
-        Node* arg = dst_subgraph.graph->AddNode(arg_def, &s);
-        if (!s.ok()) return s;
+        graph_->AddEdge(host_compute, output_index, dst_image, dst_slot);
+      }
 
-        dst_subgraph.args.push_back(arg);
+      // Connect the control edge consumers in the subgraph to the _HostCompute
+      // node.
+      for (const auto& dst_node : oc_subgraph.control_outputs) {
+        Node* dst_image = node_images.at(dst_node);
+        graph_->AddControlEdge(host_compute, dst_image);
       }
-      // Add an edge from the _Arg node to 'dst' in the subgraph.
-      dst_subgraph.args_by_dst[NodeSlot(edge->dst(), edge->dst_input())] =
-          arg_index;
-      dst_subgraph.graph->AddEdge(dst_subgraph.args[arg_index], 0, dst_image,
-                                  edge->dst_input());
     }
   }
 
-  for (auto& entry : subgraphs_) {
-    FixupSourceAndSinkEdges(entry.second.graph.get());
-  }
+  return Status::OK();
+}
 
-  return s;
+Status Encapsulator::Subgraph::MakeSequencingNode(const string& subgraph_name,
+                                                  Graph* graph_out) {
+  if (sequencer_ == nullptr) {
+    NodeDef seq_def;
+    NodeDefBuilder builder(strings::StrCat(subgraph_name, "_sequencer"),
+                           "NoOp");
+    Status s = builder.Finalize(&seq_def);
+    if (!s.ok()) return s;
+
+    sequencer_ = graph_out->AddNode(seq_def, &s);
+    if (!s.ok()) return s;
+    sequencer_->set_assigned_device_name(device_);
+  }
+  return Status::OK();
 }
 
-Status Encapsulator::BuildFunctionDefs(
-    const RewriteSubgraphFn& rewrite_subgraph_fn, bool reuse_existing_functions,
-    FunctionLibraryDefinition* library) {
-  // For each subgraph, build a FunctionDef.
-  for (auto& subgraph_entry : subgraphs_) {
-    string name = subgraph_entry.first;
-    Subgraph& subgraph = subgraph_entry.second;
+void Encapsulator::Subgraph::ConnectSequencerToOutputs(Graph* graph_out) {
+  if (sequencer_ != nullptr) {
+    std::unordered_set<Node*> output_dependencies;
+    for (Node* node : call_node_outputs_->out_nodes()) {
+      output_dependencies.insert(node);
+    }
+    for (Node* node : output_dependencies) {
+      graph_out->AddControlEdge(sequencer_, node);
+    }
+  }
+}
 
-    subgraph.call_node_def.set_op(name);
-    subgraph.call_node_def.set_name(name);
-    subgraph.call_node_def.set_device(subgraph.device);
+Status Encapsulator::Subgraph::BuildFunctionDef(
+    const string& name_in, const RewriteSubgraphFn& rewrite_subgraph_fn,
+    bool reuse_existing_functions, FunctionLibraryDefinition* library) {
+  // name_in is copied here because name may be modified below if
+  // rewrite_subgraph_fn is true.
+  string name = name_in;
+  call_node_def_.set_op(name);
+  call_node_def_.set_name(name);
+  call_node_def_.set_device(device_);
+
+  if (rewrite_subgraph_fn) {
+    // Initialize the input and output permutations to the identity.
+    std::vector<int> input_permutation(args_by_src_.size());
+    std::iota(input_permutation.begin(), input_permutation.end(), 0);
+    std::vector<int> output_permutation(results_.size());
+    std::iota(output_permutation.begin(), output_permutation.end(), 0);
+
+    TF_RETURN_IF_ERROR(rewrite_subgraph_fn(
+        &graph_, &input_permutation, &output_permutation, &call_node_def_));
+
+    // Apply the input/output permutations to the 'args_by_...' and 'results_'
+    // mappings, so when we build edges in BuildOutputGraph() we
+    // connect them to the right input/output positions.
+    if (input_permutation.size() != args_by_src_.size()) {
+      return errors::InvalidArgument("Input permutation has incorrect size.");
+    }
+    if (output_permutation.size() != results_.size()) {
+      return errors::InvalidArgument("Output permutation has incorrect size.");
+    }
+    for (auto& arg : args_by_src_) {
+      arg.second = input_permutation[arg.second];
+    }
+    for (auto& arg : args_by_dst_) {
+      arg.second = input_permutation[arg.second];
+    }
+    for (auto& result : results_) {
+      result.second = output_permutation[result.second];
+    }
 
-    if (rewrite_subgraph_fn) {
-      // Initialize the input and output permutations to the identity.
-      std::vector<int> input_permutation(subgraph.args_by_src.size());
-      std::iota(input_permutation.begin(), input_permutation.end(), 0);
-      std::vector<int> output_permutation(subgraph.results.size());
-      std::iota(output_permutation.begin(), output_permutation.end(), 0);
+    name = call_node_def_.op();
+  }
 
-      TF_RETURN_IF_ERROR(
-          rewrite_subgraph_fn(&subgraph.graph, &input_permutation,
-                              &output_permutation, &subgraph.call_node_def));
-
-      // Apply the input/output permutations to the 'args_by_...' and 'results'
-      // mappings in 'subgraph', so when we build edges in BuildOutputGraph() we
-      // connect them to the right input/output positions.
-      if (input_permutation.size() != subgraph.args_by_src.size()) {
-        return errors::InvalidArgument("Input permutation has incorrect size.");
-      }
-      if (output_permutation.size() != subgraph.results.size()) {
-        return errors::InvalidArgument(
-            "Output permutation has incorrect size.");
-      }
-      for (auto& arg : subgraph.args_by_src) {
-        arg.second = input_permutation[arg.second];
-      }
-      for (auto& arg : subgraph.args_by_dst) {
-        arg.second = input_permutation[arg.second];
-      }
-      for (auto& result : subgraph.results) {
-        result.second = output_permutation[result.second];
-      }
+  FunctionDef fdef;
+  TF_RETURN_IF_ERROR(GraphToFunctionDef(*graph_, name, &fdef));
 
-      name = subgraph.call_node_def.op();
-    }
+  if (VLOG_IS_ON(1)) {
+    VLOG(2) << "Build function def " << name;
+    dump_graph::DumpGraphToFile(
+        strings::StrCat("encapsulate_fdef_graph_", name), *graph_, library);
+    dump_graph::DumpFunctionDefToFile(
+        strings::StrCat("encapsulate_fdef_", name), fdef);
+  }
 
-    FunctionDef fdef;
-    TF_RETURN_IF_ERROR(GraphToFunctionDef(*subgraph.graph, name, &fdef));
+  if (!reuse_existing_functions || library->Find(name) == nullptr) {
+    TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef));
+  }
+  return Status::OK();
+}
 
-    if (VLOG_IS_ON(1)) {
-      VLOG(2) << "Build function def " << name;
-      dump_graph::DumpGraphToFile(
-          strings::StrCat("encapsulate_fdef_graph_", name), *subgraph.graph,
-          library);
-      dump_graph::DumpFunctionDefToFile(
-          strings::StrCat("encapsulate_fdef_", name), fdef);
+Status Encapsulator::Subgraph::AddShapeInferenceInfo(
+    const string& outside_compilation_subgraph_name,
+    const std::vector<TensorShapeProto>& shapes, GraphDef* inference_graph) {
+  OutsideCompilationSubgraph& oc_subgraph =
+      outside_compilation_subgraphs_.at(outside_compilation_subgraph_name);
+
+  Node* host_compute = nullptr;
+  for (Node* n : graph_->nodes()) {
+    if (n->name() == oc_subgraph.host_compute_name) {
+      host_compute = n;
+      break;
     }
+  }
+  if (host_compute == nullptr) {
+    return errors::InvalidArgument(
+        "After rewriting subgraph ", outside_compilation_subgraph_name,
+        " there is no HostCompute Op for outside compilation subgraph ",
+        oc_subgraph.host_compute_name);
+  }
 
-    if (!reuse_existing_functions || library->Find(name) == nullptr) {
-      TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef));
+  if (inference_graph == nullptr) {
+    host_compute->AddAttr("shape_inference_graph", "");
+    host_compute->AddAttr("shapes", shapes);
+  } else {
+    string serialized_graph;
+    if (!inference_graph->SerializeToString(&serialized_graph)) {
+      return errors::Internal(
+          "Failed to serialize graph for outside compilation subgraph ",
+          oc_subgraph.host_compute_name);
     }
+    host_compute->AddAttr("shape_inference_graph", serialized_graph);
+    host_compute->AddAttr("shapes", std::vector<TensorShapeProto>());
+  }
+  return Status::OK();
+}
+
+Status Encapsulator::Subgraph::ReplaceFunctionDef(
+    FunctionLibraryDefinition* library) {
+  const string& name = call_node_def_.name();
+
+  FunctionDef fdef;
+  TF_RETURN_IF_ERROR(GraphToFunctionDef(*graph_, name, &fdef));
+
+  if (VLOG_IS_ON(1)) {
+    VLOG(2) << "Replace function def " << name;
+    dump_graph::DumpGraphToFile(
+        strings::StrCat("replace_encapsulate_fdef_graph_", name), *graph_,
+        library);
+    dump_graph::DumpFunctionDefToFile(
+        strings::StrCat("replace_encapsulate_fdef_", name), fdef);
   }
+
+  TF_RETURN_IF_ERROR(library->RemoveFunction(name));
+  TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef));
   return Status::OK();
 }
 
-Status Encapsulator::BuildParallelCheckOp(
+Status Encapsulator::Subgraph::BuildParallelCheckOp(
     const std::unordered_map<const Node*, Node*>& node_images,
-    const Encapsulator::Subgraph& subgraph, Graph* graph_out,
-    Node** parallel_check_op) {
+    Graph* graph_out) {
   // Build an index mapping output positions to node/slot pairs in the
   // original graph.
-  std::vector<NodeSlot> results_by_num(subgraph.results.size());
-  for (const auto& entry : subgraph.results) {
+  std::vector<NodeSlot> results_by_num(results_.size());
+  for (const auto& entry : results_) {
     results_by_num[entry.second] = entry.first;
   }
 
@@ -386,22 +1001,22 @@ Status Encapsulator::BuildParallelCheckOp(
     expected_outputs[i] =
         NodeDefBuilder::NodeOut(node_images.at(node_slot.node)->name(),
                                 node_slot.slot, result_dtypes[i]);
-    actual_outputs[i] = NodeDefBuilder::NodeOut(subgraph.call_node_def.name(),
-                                                i, result_dtypes[i]);
+    actual_outputs[i] =
+        NodeDefBuilder::NodeOut(call_node_def_.name(), i, result_dtypes[i]);
   }
   // Assign the parallel check op to a CPU on the same task as the cluster it is
   // checking.
   string device, dummy;
   if (!DeviceNameUtils::SplitDeviceName(
-          subgraph.call_node_inputs->assigned_device_name(), &device, &dummy)) {
+          call_node_inputs_->assigned_device_name(), &device, &dummy)) {
     return errors::InvalidArgument("Could not parse device name");
   }
   strings::StrAppend(&device, "/cpu:0");
 
   NodeDef check_def;
   TF_RETURN_IF_ERROR(
-      NodeDefBuilder(graph_out->NewName(strings::StrCat(
-                         subgraph.call_node_def.name(), "_parallel_check")),
+      NodeDefBuilder(graph_out->NewName(strings::StrCat(call_node_def_.name(),
+                                                        "_parallel_check")),
                      "ParallelCheck")
           .Device(device)
           .Attr("T", result_dtypes)
@@ -421,65 +1036,558 @@ Status Encapsulator::BuildParallelCheckOp(
     const NodeSlot& node_slot = results_by_num[i];
     graph_out->AddEdge(node_images.at(node_slot.node), node_slot.slot, check_op,
                        i);
-    graph_out->AddEdge(subgraph.call_node_inputs, i, check_op, num_results + i);
+    graph_out->AddEdge(call_node_inputs_, i, check_op, num_results + i);
   }
 
-  *parallel_check_op = check_op;
+  call_node_outputs_ = check_op;
   return Status::OK();
 }
 
-Status Encapsulator::BuildOutputGraph(bool parallel_checking,
-                                      Graph* graph_out) {
+Status Encapsulator::Subgraph::AddFunctionCallNode(
+    const std::unordered_map<const Node*, Node*>& node_images,
+    bool parallel_checking, Graph* graph_out) {
   Status s;
+  call_node_inputs_ = graph_out->AddNode(call_node_def_, &s);
+  if (!s.ok()) return s;
 
-  // Map from nodes in the input graph to nodes in the output graph.
+  // Copy the assigned device and the key_annotation over.
+  call_node_inputs_->set_assigned_device_name(device_);
+  call_node_outputs_ = call_node_inputs_;
+
+  if (parallel_checking) {
+    TF_RETURN_IF_ERROR(BuildParallelCheckOp(node_images, graph_out));
+  }
+  return Status::OK();
+}
+
+Status Encapsulator::Subgraph::AddRecvAtHostNode(
+    const string& subgraph_name, const string& oc_subgraph_name,
+    OutsideCompilationSubgraph* oc_subgraph, Graph* graph_out) {
+  std::vector<DataType> dtypes(oc_subgraph->inputs.size(), DT_INVALID);
+
+  for (const auto& input : oc_subgraph->inputs) {
+    const Node* src_node = input.first.node;
+    int src_slot = input.first.slot;
+    int input_index = input.second;
+
+    DataType dtype = src_node->output_type(src_slot);
+    dtypes[input_index] = dtype;
+  }
+
+  NodeDef recv_def;
+  NodeDefBuilder builder(strings::StrCat("outside_compilation_", subgraph_name,
+                                         "_", oc_subgraph_name, "_recv"),
+                         kRecvAtHostOp);
+  builder.Attr("Toutputs", dtypes);
+  builder.Attr("key", strings::StrCat("host_compute_channel_", subgraph_name,
+                                      "_", oc_subgraph_name));
+  Status s = builder.Finalize(&recv_def);
+  if (!s.ok()) return s;
+
+  oc_subgraph->recv_at_host = graph_out->AddNode(recv_def, &s);
+  if (!s.ok()) return s;
+  oc_subgraph->recv_at_host->set_assigned_device_name(device_);
+
+  // Add a control dependency forcing the RecvAtHost to run before the subgraph
+  // completes. This has no effect on execution order but prevents the
+  // RecvAtHost being pruned.
+  TF_RETURN_IF_ERROR(MakeSequencingNode(subgraph_name, graph_out));
+  graph_out->AddControlEdge(oc_subgraph->recv_at_host, sequencer_);
+
+  return Status::OK();
+}
+
+Status Encapsulator::Subgraph::AddSendFromHostNode(
+    const std::unordered_map<const Node*, Node*>& node_images,
+    const string& subgraph_name, const string& oc_subgraph_name,
+    OutsideCompilationSubgraph* oc_subgraph, Graph* graph_out) {
+  std::vector<DataType> dtypes(oc_subgraph->outputs_by_src.size(), DT_INVALID);
+  std::vector<NodeDefBuilder::NodeOut> inputs(
+      oc_subgraph->outputs_by_src.size());
+
+  for (const auto& output : oc_subgraph->outputs_by_src) {
+    const Node* src_node = output.first.node;
+    Node* src_image = node_images.at(src_node);
+    int src_slot = output.first.slot;
+    int output_index = output.second;
+
+    DataType dtype = src_node->output_type(src_slot);
+    dtypes[output_index] = dtype;
+    inputs[output_index].Reset(src_image->name(), src_slot, dtype);
+  }
+
+  NodeDef send_def;
+  NodeDefBuilder builder(strings::StrCat("outside_compilation_", subgraph_name,
+                                         "_", oc_subgraph_name, "_send"),
+                         kSendFromHostOp);
+  builder.Attr("Tinputs", dtypes);
+  builder.Attr("key", strings::StrCat("host_compute_channel_", subgraph_name,
+                                      "_", oc_subgraph_name));
+  builder.Input(inputs);
+  Status s = builder.Finalize(&send_def);
+  if (!s.ok()) return s;
+
+  oc_subgraph->send_from_host = graph_out->AddNode(send_def, &s);
+  if (!s.ok()) return s;
+  oc_subgraph->send_from_host->set_assigned_device_name(device_);
+
+  // Add a control dependency forcing the SendFromHost to run before the
+  // subgraph completes. This has no effect on execution order but prevents the
+  // RecvAtHost being pruned.
+  TF_RETURN_IF_ERROR(MakeSequencingNode(subgraph_name, graph_out));
+  graph_out->AddControlEdge(oc_subgraph->send_from_host, sequencer_);
+
+  return Status::OK();
+}
+
+Status Encapsulator::Subgraph::AddOutsideCompilationHostIONodes(
+    const string& subgraph_name,
+    const std::unordered_map<const Node*, Node*>& node_images,
+    Graph* graph_out) {
+  for (auto& outside_compilation_subgraph_entry :
+       outside_compilation_subgraphs_) {
+    const string& oc_name = outside_compilation_subgraph_entry.first;
+    OutsideCompilationSubgraph& oc_subgraph =
+        outside_compilation_subgraph_entry.second;
+
+    if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty()) {
+      TF_RETURN_IF_ERROR(
+          AddRecvAtHostNode(subgraph_name, oc_name, &oc_subgraph, graph_out));
+    }
+
+    if (!oc_subgraph.outputs_by_src.empty() ||
+        !oc_subgraph.control_outputs.empty()) {
+      TF_RETURN_IF_ERROR(AddSendFromHostNode(node_images, subgraph_name,
+                                             oc_name, &oc_subgraph, graph_out));
+    }
+  }
+  return Status::OK();
+}
+
+void Encapsulator::Subgraph::GetOutsideCompilationSubgraphNames(
+    std::vector<string>* names) const {
+  for (auto& entry : outside_compilation_subgraphs_) {
+    names->push_back(entry.first);
+  }
+}
+
+Status Encapsulator::GetFunctionNameAttr(
+    Node const* node, string* attr, string* outside_compilation_attr) const {
+  Status s = GetNodeAttr(node->attrs(), group_attribute_, attr);
+  if (s.code() == error::Code::NOT_FOUND) {
+    // Return empty attr if there's no group_attribute.
+    attr->clear();
+  } else {
+    TF_RETURN_IF_ERROR(s);
+  }
+  bool has_group_attr = s.ok();
+  s = GetNodeAttr(node->attrs(), outside_compilation_attribute_,
+                  outside_compilation_attr);
+  if (s.code() == error::Code::NOT_FOUND) {
+    // Return empty attr if there's no outside_compilation attribute.
+    outside_compilation_attr->clear();
+  } else {
+    TF_RETURN_IF_ERROR(s);
+    if (!has_group_attr) {
+      return errors::InvalidArgument(
+          "Node ", node->name(), " has ", outside_compilation_attribute_,
+          " attribute but no ", group_attribute_, " attribute.");
+    }
+  }
+  return Status::OK();
+}
+
+bool IsInSubgraph(const string& func_id, const string& outside_compilation_id) {
+  return !func_id.empty() && outside_compilation_id.empty();
+}
+
+Status Encapsulator::CopySubgraphNodes(
+    std::unordered_map<const Node*, Node*>* node_images) {
+  for (Node* node : graph_in_->op_nodes()) {
+    string func_id;
+    string outside_compilation_id;
+    TF_RETURN_IF_ERROR(
+        GetFunctionNameAttr(node, &func_id, &outside_compilation_id));
+    if (!IsInSubgraph(func_id, outside_compilation_id)) continue;
+
+    Subgraph& subgraph = subgraphs_[func_id];
+    Node* image = subgraph.MakeNodeImage(graph_in_, node);
+    image->ClearAttr(group_attribute_);
+    (*node_images)[node] = image;
+  }
+  return Status::OK();
+}
+
+Status Encapsulator::CopySubgraphEdges(
+    const std::unordered_map<const Node*, Node*>& node_images,
+    std::vector<std::pair<const Node*, Node*>>* src_arg_pairs) {
+  for (const Edge* edge : graph_in_->edges()) {
+    string src_func_id;
+    string src_outside_compilation_id;
+    TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->src(), &src_func_id,
+                                           &src_outside_compilation_id));
+    string dst_func_id;
+    string dst_outside_compilation_id;
+    TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->dst(), &dst_func_id,
+                                           &dst_outside_compilation_id));
+    Node* src_image = gtl::FindWithDefault(node_images, edge->src(), nullptr);
+    Node* dst_image = gtl::FindWithDefault(node_images, edge->dst(), nullptr);
+
+    // Copy edges that are local to a subgraph.
+    if (IsInSubgraph(src_func_id, src_outside_compilation_id) &&
+        IsInSubgraph(dst_func_id, dst_outside_compilation_id) &&
+        src_func_id == dst_func_id) {
+      Graph* g = subgraphs_[src_func_id].GetGraph();
+      if (edge->IsControlEdge()) {
+        g->AddControlEdge(src_image, dst_image);
+      } else {
+        g->AddEdge(src_image, edge->src_output(), dst_image, edge->dst_input());
+      }
+      continue;
+    }
+
+    // Record 'src' as an output of its subgraph, if applicable.
+    if (IsInSubgraph(src_func_id, src_outside_compilation_id)) {
+      if (!edge->IsControlEdge()) {
+        DataType dtype = edge->src()->output_type(edge->src_output());
+        if (IsRefType(dtype)) {
+          return errors::InvalidArgument(
+              "Ref Tensors (e.g., Variables) are not supported as results: "
+              "tensor ",
+              edge->src()->name(), ":", edge->src_output());
+        }
+      }
+
+      Subgraph& src_subgraph = subgraphs_[src_func_id];
+      if (src_func_id == dst_func_id) {
+        // src is in the subgraph and dst is outside_compilation in the same
+        // subgraph.
+        src_subgraph.RecordOutsideCompilationInputOrControl(
+            dst_outside_compilation_id, edge);
+      } else {
+        // Ignore control edges leaving the subgraph. We will lift them onto the
+        // enclosing call operators in BuildOutputGraph().
+        if (!edge->IsControlEdge()) {
+          TF_RETURN_IF_ERROR(src_subgraph.RecordResult(edge, node_images));
+        }
+      }
+    }
+
+    // Record 'dst' as an input of its subgraph, if applicable.
+    if (IsInSubgraph(dst_func_id, dst_outside_compilation_id)) {
+      // Look at the type of the destination not the source, since Ref output
+      // Tensors can be automatically cast to non-Ref Tensors at the
+      // destination.
+      if (!edge->IsControlEdge()) {
+        DataType dtype = edge->dst()->input_type(edge->dst_input());
+        if (IsRefType(dtype)) {
+          return errors::InvalidArgument(
+              "Ref Tensors (e.g., Variables) are not supported as args: "
+              "tensor ",
+              edge->src()->name(), ":", edge->src_output());
+        }
+      }
+
+      Subgraph& dst_subgraph = subgraphs_[dst_func_id];
+      if (src_func_id == dst_func_id) {
+        // dst is in the subgraph and src is outside_compilation in the same
+        // subgraph.
+        dst_subgraph.RecordOutsideCompilationOutputOrControl(
+            src_outside_compilation_id, edge);
+      } else {
+        // Ignore control edges entering the subgraph. We will lift them onto
+        // the enclosing call operators in BuildOutputGraph().
+        if (!edge->IsControlEdge()) {
+          TF_RETURN_IF_ERROR(
+              dst_subgraph.RecordArg(edge, node_images, src_arg_pairs));
+        }
+      }
+    }
+  }
+  return Status::OK();
+}
+
+Status Encapsulator::SplitIntoSubgraphs() {
+  Status s;
+
+  // Map from input graph nodes to subgraph nodes.
   std::unordered_map<const Node*, Node*> node_images;
 
-  // Copy all unmarked nodes to the output graph.
+  // Each entry of src_arg_pairs is a pair whose first element is a node in the
+  // original graph that has an output edge in the subgraph, and whose second
+  // element is the arg node in the subgraph that it sends to. The vector will
+  // be filled in below in AddArgs.
+  std::vector<std::pair<const Node*, Node*>> src_arg_pairs;
+
+  TF_RETURN_IF_ERROR(CopySubgraphNodes(&node_images));
+  TF_RETURN_IF_ERROR(CopySubgraphEdges(node_images, &src_arg_pairs));
+
+  // For each subgraph, add the nodes that deal with inputs and outputs its
+  // nested outside_compilation subgraphs. These could not be added earlier
+  // during CopySubgraphEdges since we need to discover all the types of the
+  // inputs and outputs for an outside_compilation subgraph before creating a
+  // single input and output node for it.
+  for (auto& entry : subgraphs_) {
+    Subgraph& subgraph = entry.second;
+    TF_RETURN_IF_ERROR(subgraph.AddHostComputes(entry.first, node_images));
+  }
+
+  MarkGuaranteedConstants(*graph_in_, src_arg_pairs);
+
+  for (auto& entry : subgraphs_) {
+    Subgraph& subgraph = entry.second;
+    FixupSourceAndSinkEdges(subgraph.GetGraph());
+  }
+
+  return s;
+}
+
+Status Encapsulator::BuildFunctionDefs(
+    const RewriteSubgraphFn& rewrite_subgraph_fn, bool reuse_existing_functions,
+    FunctionLibraryDefinition* library) {
+  for (auto& subgraph_entry : subgraphs_) {
+    string name = subgraph_entry.first;
+    Subgraph& subgraph = subgraph_entry.second;
+    TF_RETURN_IF_ERROR(subgraph.BuildFunctionDef(
+        name, rewrite_subgraph_fn, reuse_existing_functions, library));
+  }
+  return Status::OK();
+}
+
+Status Encapsulator::CopyNodesToOutputGraph(
+    bool parallel_checking, Graph* graph_out,
+    std::unordered_map<const Node*, Node*>* node_images) {
   for (Node* node : graph_in_->op_nodes()) {
-    string func_id = GetFunctionNameAttr(node);
+    string func_id;
+    string outside_compilation_id;
+    TF_RETURN_IF_ERROR(
+        GetFunctionNameAttr(node, &func_id, &outside_compilation_id));
 
     // Don't copy nodes that going to be encapsulated, unless parallel checking
     // is enabled.
-    if (!func_id.empty() && !parallel_checking) continue;
+    if (IsInSubgraph(func_id, outside_compilation_id) && !parallel_checking)
+      continue;
 
     Node* image = graph_out->CopyNode(node);
-    node_images[node] = image;
+    if (!outside_compilation_id.empty()) {
+      if (parallel_checking) {
+        return errors::InvalidArgument(
+            "Parallel checking is not supported when outside_compilation "
+            "clusters are present.");
+      }
+      image->ClearAttr(group_attribute_);
+      image->ClearAttr(outside_compilation_attribute_);
+    }
+    (*node_images)[node] = image;
+  }
+  (*node_images)[graph_in_->source_node()] = graph_out->source_node();
+  (*node_images)[graph_in_->sink_node()] = graph_out->sink_node();
+  return Status::OK();
+}
+
+Status Encapsulator::AddFunctionCallNodes(
+    const std::unordered_map<const Node*, Node*>& node_images,
+    bool parallel_checking, Graph* graph_out) {
+  for (auto& subgraph_entry : subgraphs_) {
+    TF_RETURN_IF_ERROR(subgraph_entry.second.AddFunctionCallNode(
+        node_images, parallel_checking, graph_out));
   }
-  node_images[graph_in_->source_node()] = graph_out->source_node();
-  node_images[graph_in_->sink_node()] = graph_out->sink_node();
+  return Status::OK();
+}
 
-  // Add function call nodes for each subgraph.
+Status Encapsulator::AddOutsideCompilationHostIONodes(
+    const std::unordered_map<const Node*, Node*>& node_images,
+    Graph* graph_out) {
   for (auto& subgraph_entry : subgraphs_) {
+    const string& subgraph_name = subgraph_entry.first;
     Subgraph& subgraph = subgraph_entry.second;
+    TF_RETURN_IF_ERROR(subgraph.AddOutsideCompilationHostIONodes(
+        subgraph_name, node_images, graph_out));
+  }
+  return Status::OK();
+}
 
-    subgraph.call_node_inputs = graph_out->AddNode(subgraph.call_node_def, &s);
-    if (!s.ok()) return s;
+Status Encapsulator::FindOutputImageOfEdgeSrc(
+    const string& src_func_id, const string& src_outside_compilation_id,
+    const string& dst_func_id, const string& dst_outside_compilation_id,
+    const std::unordered_map<const Node*, Node*>& node_images,
+    const Node* original_src_node, Node** src_image) {
+  if (IsInSubgraph(src_func_id, src_outside_compilation_id)) {
+    if (dst_func_id == src_func_id) {
+      // The edge is from a subgraph to an outside_compilation cluster in the
+      // same subgraph so use the appropriate _RecvAtHost node in the output
+      // graph.
+      TF_RET_CHECK(!dst_outside_compilation_id.empty());
+      *src_image = subgraphs_.at(src_func_id)
+                       .GetRecvAtHostNode(dst_outside_compilation_id);
+    } else {
+      // The edge is from a subgraph to a regular node in the output graph so
+      // use the subgraph's call node output.
+      *src_image = subgraphs_.at(src_func_id).GetCallNodeForOutputs();
+    }
+  } else {
+    // The source of the edge is in the output graph so use the node image in
+    // the output graph.
+    *src_image = node_images.at(original_src_node);
+  }
+  return Status::OK();
+}
 
-    // Copy the assigned device and the key_annotation over.
-    subgraph.call_node_inputs->set_assigned_device_name(subgraph.device);
-    subgraph.call_node_outputs = subgraph.call_node_inputs;
+int Encapsulator::FindOutputSlotOfEdgeSrc(
+    const string& src_func_id, const string& src_outside_compilation_id,
+    const string& dst_func_id, const string& dst_outside_compilation_id,
+    const Edge* edge) {
+  if (IsInSubgraph(src_func_id, src_outside_compilation_id)) {
+    const Subgraph& src_subgraph = subgraphs_.at(src_func_id);
+    if (src_func_id == dst_func_id) {
+      // 'src' is in a subgraph and 'dst' is outside_compilation in the same
+      // subgraph. Use the corresponding _RecvAtHost output instead.
+      return src_subgraph.GetRecvAtHostSlot(dst_outside_compilation_id, edge);
+    } else {
+      // 'src' is in a subgraph and 'dst' is a regular node in the output
+      // graph. Use the corresponding call output instead.
+      return src_subgraph.GetResultIndexForEdge(edge);
+    }
+  } else {
+    // The source of the edge is in the output graph so use the regular edge
+    // slot.
+    return edge->src_output();
+  }
+}
 
+Status Encapsulator::FindOutputImageOfEdgeDst(
+    const string& src_func_id, const string& src_outside_compilation_id,
+    const string& dst_func_id, const string& dst_outside_compilation_id,
+    const std::unordered_map<const Node*, Node*>& node_images,
+    const Node* original_dst_node, Node** dst_image) {
+  if (IsInSubgraph(dst_func_id, dst_outside_compilation_id)) {
+    if (src_func_id == dst_func_id) {
+      // The edge is to a subgraph from an outside_compilation cluster in the
+      // same subgraph so use the appropriate _SendFromHost node in the output
+      // graph.
+      TF_RET_CHECK(!src_outside_compilation_id.empty());
+      *dst_image = subgraphs_.at(dst_func_id)
+                       .GetSendFromHostNode(src_outside_compilation_id);
+    } else {
+      // The edge is to a subgraph from a regular node in the output graph so
+      // use the subgraph's call node input.
+      *dst_image = subgraphs_.at(dst_func_id).GetCallNodeForInputs();
+    }
+  } else {
+    // The destination of the edge is in the output graph so use the node image
+    // in the output graph.
+    *dst_image = node_images.at(original_dst_node);
+  }
+  return Status::OK();
+}
+
+int Encapsulator::FindOutputSlotOfEdgeDst(
+    const string& src_func_id, const string& src_outside_compilation_id,
+    const string& dst_func_id, const string& dst_outside_compilation_id,
+    const Edge* edge) {
+  if (IsInSubgraph(dst_func_id, dst_outside_compilation_id)) {
+    const Subgraph& dst_subgraph = subgraphs_.at(dst_func_id);
+    if (dst_func_id == src_func_id) {
+      // 'dst' is in a subgraph and 'src' is outside_compilation in the same
+      // subgraph. Use the corresponding _SendFromHost input instead.
+      return dst_subgraph.GetSendFromHostSlot(src_outside_compilation_id, edge);
+    } else {
+      // 'dst' is in a subgraph and 'src' is a regular node in the output
+      // graph. Use the corresponding call input instead.
+      return dst_subgraph.GetArgIndexForEdge(edge);
+    }
+  } else {
+    // The destination of the edge is in the output graph so use the regular
+    // edge slot.
+    return edge->dst_input();
+  }
+}
+
+Status Encapsulator::CopyEdgeToOutputGraph(
+    const Edge* edge, const string& src_func_id,
+    const string& src_outside_compilation_id, const string& dst_func_id,
+    const string& dst_outside_compilation_id,
+    const std::unordered_map<const Node*, Node*>& node_images,
+    bool parallel_checking, Graph* graph_out,
+    std::unordered_set<std::pair<NodeSlot, NodeSlot>, NodeSlot::PairHasher>*
+        edges_added) {
+  Node* src_image;
+  TF_RETURN_IF_ERROR(FindOutputImageOfEdgeSrc(
+      src_func_id, src_outside_compilation_id, dst_func_id,
+      dst_outside_compilation_id, node_images, edge->src(), &src_image));
+  Node* dst_image;
+  TF_RETURN_IF_ERROR(FindOutputImageOfEdgeDst(
+      src_func_id, src_outside_compilation_id, dst_func_id,
+      dst_outside_compilation_id, node_images, edge->dst(), &dst_image));
+
+  // If this is a control edge then copy it and return. Lift control edges onto
+  // the enclosing call operator.
+  if (edge->IsControlEdge()) {
+    // Add the control edge, if we have not already added it, using the images
+    // determined above (potentially call operators or RecvAtHost/SendFromHost).
+    if (edges_added->emplace(NodeSlot(src_image, -1), NodeSlot(dst_image, -1))
+            .second) {
+      graph_out->AddControlEdge(src_image, dst_image);
+    }
+
+    // If parallel checking is enabled, also add a control edge to the
+    // corresponding parallel check op.
     if (parallel_checking) {
-      TF_RETURN_IF_ERROR(BuildParallelCheckOp(node_images, subgraph, graph_out,
-                                              &subgraph.call_node_outputs));
+      graph_out->AddControlEdge(src_image, node_images.at(edge->dst()));
     }
+    return Status::OK();
+  }
+
+  int src_output =
+      FindOutputSlotOfEdgeSrc(src_func_id, src_outside_compilation_id,
+                              dst_func_id, dst_outside_compilation_id, edge);
+
+  int dst_input =
+      FindOutputSlotOfEdgeDst(src_func_id, src_outside_compilation_id,
+                              dst_func_id, dst_outside_compilation_id, edge);
+
+  if (IsInSubgraph(dst_func_id, dst_outside_compilation_id) &&
+      parallel_checking) {
+    // If we are parallel checking, also feed the tensor as an input to the
+    // corresponding parallel check subgraph.
+    graph_out->AddEdge(src_image, src_output, node_images.at(edge->dst()),
+                       edge->dst_input());
+  }
+
+  // Add the edge, if we have not already added it.
+  if (edges_added
+          ->emplace(NodeSlot(src_image, src_output),
+                    NodeSlot(dst_image, dst_input))
+          .second) {
+    graph_out->AddEdge(src_image, src_output, dst_image, dst_input);
   }
+  return Status::OK();
+}
 
+Status Encapsulator::AddEdgesToOutputGraph(
+    const std::unordered_map<const Node*, Node*>& node_images,
+    bool parallel_checking, Graph* graph_out) {
   // Set of edges already added to the output graph, represented as (src, dst)
   // pairs. We use the set to deduplicate edges; multiple edges in the input
   // graph may map to one edge in the output graph.
   std::unordered_set<std::pair<NodeSlot, NodeSlot>, NodeSlot::PairHasher>
       edges_added;
 
-  // Add edges to the graph_out graph.
   for (const Edge* edge : graph_in_->edges()) {
-    string src_func_id = GetFunctionNameAttr(edge->src());
-    string dst_func_id = GetFunctionNameAttr(edge->dst());
+    string src_func_id;
+    string src_outside_compilation_id;
+    TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->src(), &src_func_id,
+                                           &src_outside_compilation_id));
+    string dst_func_id;
+    string dst_outside_compilation_id;
+    TF_RETURN_IF_ERROR(GetFunctionNameAttr(edge->dst(), &dst_func_id,
+                                           &dst_outside_compilation_id));
 
     // Ignore edges that are strictly contained within one subgraph, unless
     // we are constructing parallel check graphs.
-    if (!src_func_id.empty() && src_func_id == dst_func_id) {
+    if (IsInSubgraph(src_func_id, src_outside_compilation_id) &&
+        IsInSubgraph(dst_func_id, dst_outside_compilation_id) &&
+        src_func_id == dst_func_id) {
       if (parallel_checking) {
         Node* src_image = node_images.at(edge->src());
         Node* dst_image = node_images.at(edge->dst());
@@ -493,89 +1601,403 @@ Status Encapsulator::BuildOutputGraph(bool parallel_checking,
       continue;
     }
 
-    // We have an edge that crosses a cluster boundary.
-    Node* src_image = src_func_id.empty()
-                          ? node_images.at(edge->src())
-                          : subgraphs_.at(src_func_id).call_node_outputs;
-    Node* dst_image = dst_func_id.empty()
-                          ? node_images.at(edge->dst())
-                          : subgraphs_.at(dst_func_id).call_node_inputs;
-
-    // Copy control edges. Lift control edges onto the enclosing call operator.
-    if (edge->IsControlEdge()) {
-      // Add the control edge, if we have not already added it.
-      if (edges_added.emplace(NodeSlot(src_image, -1), NodeSlot(dst_image, -1))
-              .second) {
-        graph_out->AddControlEdge(src_image, dst_image);
+    // We have an edge that crosses a cluster boundary or is entirely within the
+    // unclustered graph.
+    TF_RETURN_IF_ERROR(CopyEdgeToOutputGraph(
+        edge, src_func_id, src_outside_compilation_id, dst_func_id,
+        dst_outside_compilation_id, node_images, parallel_checking, graph_out,
+        &edges_added));
+  }
+
+  for (auto& subgraph_entry : subgraphs_) {
+    Subgraph& subgraph = subgraph_entry.second;
+    subgraph.ConnectSequencerToOutputs(graph_out);
+  }
+
+  return Status::OK();
+}
+
+namespace {
+
+// Adds a dummy Const node to graph_out. The "constant" has the type of
+// data_type and the shape indicated in 'shape'. The dummy node is not a valid
+// Const node because it does not have any value defined, but this doesn't
+// matter because it will only be used subsequently for shape inference. (It
+// would be possible to add a switch statement over data_type to create a value
+// for the constant, but that would entail maintaining the logic as new types
+// are added, and is not necessary.)
+Node* AddDummyShapedNode(DataType data_type, const TensorShapeProto& shape,
+                         Graph* graph_out) {
+  TensorProto dummy_proto;
+  dummy_proto.set_dtype(data_type);
+  *dummy_proto.mutable_tensor_shape() = shape;
+  // Don't set any value field in the proto, since it is only going to be used
+  // for shape inference.
+
+  GraphDefBuilder::Options options(graph_out, /*status=*/nullptr);
+  NodeBuilder node_builder(options.GetNameForOp("KnownShape"), "Const",
+                           options.op_registry());
+  node_builder.Attr("dtype", data_type).Attr("value", dummy_proto);
+  return options.FinalizeBuilder(&node_builder);
+}
+
+// Adds a copy of node_in to graph_out and adds the mapping to
+// copied_node_images.
+Status CopyShapeInferenceNodeToGraph(
+    Node* node_in, const Node* send_node,
+    const std::unordered_map<Node*, Node*>& dummy_node_images,
+    FunctionLibraryDefinition* library,
+    std::unordered_map<Node*, Node*>* copied_node_images, Graph* graph_out) {
+  // Once all the ancestor nodes have been added to graph_out, add this node
+  // and connect it to its ancestors.
+  Node* node_out = graph_out->CopyNode(node_in);
+  (*copied_node_images)[node_in] = node_out;
+  // Don't bother to build the shape inference graph if there's a node with no
+  // shape inference function, since it would just result in an error later at
+  // compile time.
+  const OpRegistrationData* op_reg_data;
+  TF_RETURN_IF_ERROR(library->LookUp(node_in->type_string(), &op_reg_data));
+  if (op_reg_data->shape_inference_fn == nullptr) {
+    return errors::InvalidArgument(
+        "Shape inference is not possible for outside_compilation "
+        "SendFromHost node ",
+        send_node->name(), " because it depends on node ", node_in->name(),
+        " which does not have a shape inference function registered.");
+  }
+  // Add all the edges to the newly copied node.
+  for (const Edge* in_edge : node_in->in_edges()) {
+    if (!in_edge->IsControlEdge()) {
+      Node* src = in_edge->src();
+      const auto iter = dummy_node_images.find(src);
+      if (iter == dummy_node_images.end()) {
+        // The src is a copied node so use the original output port.
+        graph_out->AddEdge((*copied_node_images)[in_edge->src()],
+                           in_edge->src_output(), node_out,
+                           in_edge->dst_input());
+      } else {
+        // The src is a dummy node so use output port 0.
+        graph_out->AddEdge(iter->second, 0, node_out, in_edge->dst_input());
       }
+    }
+  }
+  return Status::OK();
+}
 
-      // If parallel checking is enabled, also add a control edge to the
-      // corresponding parallel check op.
-      if (parallel_checking) {
-        graph_out->AddControlEdge(src_image, node_images.at(edge->dst()));
+}  // namespace
+
+Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
+    const Graph& graph_in, const ShapeRefiner& shape_refiner,
+    const std::unordered_set<string>& recv_at_host_nodes, Node* send_node,
+    FunctionLibraryDefinition* library,
+    std::vector<TensorShapeProto>* static_shape_out,
+    std::unique_ptr<GraphDef>* graphdef_out) {
+  // Maps from nodes in graph_in to nodes in graph_out.
+  //
+  // When an edge has fully defined shape the source node in graph_in is
+  // replaced in graph_out by a dummy constant node. The mapping from nodes
+  // in graph_in to dummy nodes is stored in dummy_node_images.
+  //
+  // When a node in graph_in has at least one ancestor that doesn't have fully
+  // defined shape, it is copied into graph_out. The mapping from nodes in
+  // graph_in to copied nodes is stored in copied_node_images.
+  //
+  // The two types of node are treated differently because, when adding edges to
+  // graph_out, an output from a dummy node always uses port 0, whereas an
+  // output from a copied node uses the same port that was used in graph_in.
+  std::unordered_map<Node*, Node*> dummy_node_images;
+  std::unordered_map<Node*, Node*> copied_node_images;
+
+  std::unique_ptr<Graph> graph_out(new Graph(graph_in.op_registry()));
+  graph_out->set_versions(graph_in.versions());
+  static_shape_out->resize(send_node->num_inputs());
+
+  // We don't use the standard ReverseDFS because we want to cut off traversal
+  // whenever we find an output with fully defined shape.
+  // TODO(misard) make this work properly in the presence of control flow.
+  struct Work {
+    Node* node;
+    bool leave;  // Are we entering or leaving node?
+  };
+  std::vector<Work> stack({{send_node, false}});
+  std::vector<bool> visited(graph_in.num_node_ids(), false);
+  while (!stack.empty()) {
+    Work w = stack.back();
+    stack.pop_back();
+    Node* n = w.node;
+
+    if (w.leave) {
+      TF_RETURN_IF_ERROR(CopyShapeInferenceNodeToGraph(
+          n, send_node, dummy_node_images, library, &copied_node_images,
+          graph_out.get()));
+    } else {
+      if (visited[n->id()]) continue;
+      visited[n->id()] = true;
+
+      // Arrange to revisit when all done with all inputs.
+      stack.push_back(Work{n, true});
+
+      bool has_parent_with_unknown_shape = false;
+      for (const Edge* in_edge : n->in_edges()) {
+        if (!in_edge->IsControlEdge()) {
+          Node* src_node = in_edge->src();
+          int src_port = in_edge->src_output();
+          shape_inference::InferenceContext* context =
+              shape_refiner.GetContext(src_node);
+          shape_inference::ShapeHandle shape = context->output(src_port);
+          if (context->FullyDefined(shape)) {
+            // This ancestor has known shape, so instead of adding it to the
+            // stack, add a dummy node with that shape to graph_out and
+            // continue.
+            TensorShapeProto proto;
+            context->ShapeHandleToProto(shape, &proto);
+            dummy_node_images[src_node] = AddDummyShapedNode(
+                src_node->output_type(src_port), proto, graph_out.get());
+            if (n == send_node) {
+              (*static_shape_out)[in_edge->dst_input()] = proto;
+            }
+          } else {
+            if (!visited[src_node->id()]) {
+              has_parent_with_unknown_shape = true;
+              stack.push_back({src_node, false});
+            }
+          }
+        }
+      }
+      if (!has_parent_with_unknown_shape) {
+        if (n == send_node) {
+          // The shapes of all the inputs to send_node are statically known. We
+          // won't have to do any inference at compile time so return now: the
+          // shapes were stored in static_shape_out above.
+          graphdef_out->reset();
+          return Status::OK();
+        } else {
+          // Any shape that is being processed is either the original send node
+          // or has at least one output with statically-unknown shape. If the
+          // latter and it doesn't have any inputs with statically-unknown
+          // shape, then check that it is of the recv nodes that we can fill in
+          // the shape of at run-time later. If it isn't one of those, then we
+          // won't have any additional knowledge at compile time, so we already
+          // know we won't be able to do shape inference and we can return an
+          // error now.
+          if (recv_at_host_nodes.find(n->name()) == recv_at_host_nodes.end()) {
+            return errors::InvalidArgument(
+                "Shape inference is not possible for outside_compilation "
+                "SendFromHost node ",
+                send_node->name(), " because shape of node ", n->name(),
+                " will not be known at compilation time.");
+          }
+        }
       }
-      continue;
     }
+  }
+
+  graphdef_out->reset(new GraphDef());
+  graph_out->ToGraphDef(graphdef_out->get());
 
-    int src_output = edge->src_output();
-    if (!src_func_id.empty()) {
-      // 'src' is in a subgraph. Use the corresponding call output instead.
-      const Subgraph& src_subgraph = subgraphs_.at(src_func_id);
-      src_output =
-          src_subgraph.results.at(NodeSlot(edge->src(), edge->src_output()));
+  return Status::OK();
+}
+
+Status Encapsulator::MakePrunedGraphCopyAndInline(
+    const Graph& graph, const std::vector<Node*>& sink_nodes,
+    std::unique_ptr<Graph>* pruned_graph,
+    std::unordered_map<const Node*, Node*>* node_images,
+    FunctionLibraryDefinition* library) {
+  // First copy all ancestor nodes of sink_nodes into a new graph.
+  pruned_graph->reset(new Graph(library));
+  (*pruned_graph)->set_versions(graph.versions());
+  ReverseDFSFrom(graph, sink_nodes,
+                 /*enter=*/nullptr,
+                 /*leave=*/[&](Node* n) {
+                   if (!n->IsSource()) {
+                     Node* copied = (*pruned_graph)->CopyNode(n);
+                     node_images->emplace(n, copied);
+                   }
+                 });
+
+  // Add all the edges between copied nodes.
+  for (auto entry : *node_images) {
+    const Node* orig = entry.first;
+    Node* image = entry.second;
+    for (const Edge* out_edge : orig->out_edges()) {
+      auto iter = node_images->find(out_edge->dst());
+      if (iter != node_images->end()) {
+        // The source and destination are both in the copied graph.
+        (*pruned_graph)
+            ->AddEdge(image, out_edge->src_output(), iter->second,
+                      out_edge->dst_input());
+      }
     }
+  }
 
-    int dst_input = edge->dst_input();
+  // Find all the function call nodes, and inline them.
+  std::vector<Node*> function_nodes;
+  for (auto node : (*pruned_graph)->nodes()) {
+    const OpRegistrationData* op_reg_data;
+    TF_RETURN_IF_ERROR(library->LookUp(node->type_string(), &op_reg_data));
+    if (op_reg_data->is_function_op) {
+      function_nodes.push_back(node);
+    }
+  }
+  for (auto node : function_nodes) {
+    VLOG(2) << "Inlining function " << node->name();
+    const FunctionDef* fdef = library->Find(node->type_string());
+    if (fdef == nullptr) {
+      return errors::Internal("Failed to find function ", node->type_string(),
+                              " in function library.");
+    }
+    FunctionBody* fbody = nullptr;
+    TF_RETURN_IF_ERROR(
+        FunctionDefToBodyHelper(*fdef, node->attrs(), library,
+                                [library](const string& op, const OpDef** sig) {
+                                  return library->LookUpOpDef(op, sig);
+                                },
+                                &fbody));
+    InlineFunctionBody(*library, pruned_graph->get(), node, fbody);
+    delete fbody;
+  }
 
-    if (!dst_func_id.empty()) {
-      // 'dst' is in a subgraph. Use the corresponding call input instead.
-      const Subgraph& dst_subgraph = subgraphs_.at(dst_func_id);
-      dst_input =
-          dst_subgraph.args_by_dst.at(NodeSlot(edge->dst(), edge->dst_input()));
+  return Status::OK();
+}
 
-      // If we are parallel checking, also feed the tensor as an input to the
-      // corresponding parallel check subgraph.
-      if (parallel_checking) {
-        graph_out->AddEdge(src_image, src_output, node_images.at(edge->dst()),
-                           edge->dst_input());
+Status Encapsulator::MakeGraphForOutsideCompilationSends(
+    const Graph& graph, std::unique_ptr<Graph>* pruned_graph,
+    ShapeRefiner* shape_refiner,
+    std::unordered_map<const Node*, Node*>* node_images,
+    FunctionLibraryDefinition* library) {
+  // Find all the send_from_host nodes in all subgraphs, to use as roots for the
+  // pruning.
+  std::vector<Node*> send_from_host_nodes;
+  for (auto& subgraph_entry : subgraphs_) {
+    Subgraph& subgraph = subgraph_entry.second;
+    std::vector<string> outside_compilation_names;
+    subgraph.GetOutsideCompilationSubgraphNames(&outside_compilation_names);
+    for (const auto& name : outside_compilation_names) {
+      Node* send_node = subgraph.GetSendFromHostNode(name);
+      if (send_node != nullptr) {
+        send_from_host_nodes.push_back(send_node);
       }
     }
-    // Add the edge, if we have not already added it.
-    if (edges_added
-            .emplace(NodeSlot(src_image, src_output),
-                     NodeSlot(dst_image, dst_input))
-            .second) {
-      graph_out->AddEdge(src_image, src_output, dst_image, dst_input);
+  }
+
+  // Make a copy of all the graph nodes needed to evaluate the send_from_host
+  // nodes, inlining any functions as needed.
+  TF_RETURN_IF_ERROR(MakePrunedGraphCopyAndInline(
+      graph, send_from_host_nodes, pruned_graph, node_images, library));
+
+  // Perform shape inference on the pruned graph.
+  shape_refiner->set_require_shape_inference_fns(false);
+  FixupSourceAndSinkEdges(pruned_graph->get());
+  std::vector<Node*> post_order;
+  GetReversePostOrder(*(*pruned_graph), &post_order);
+  for (auto node : post_order) {
+    // Ignore the status returned by the shape_refiner. At this point we want
+    // the best effort shapes, even if no shape function is registered for a
+    // node.
+    Status status = shape_refiner->AddNode(node);
+    if (!status.ok()) {
+      VLOG(1) << "Shape inference failed for node: " << status;
     }
   }
 
-  return s;
+  return Status::OK();
+}
+
+Status Encapsulator::GetShapeInfoForOutsideCompilationSends(
+    Graph* graph_out, FunctionLibraryDefinition* library) {
+  std::unique_ptr<Graph> pruned_graph;
+  ShapeRefiner shape_refiner(graph_out->versions(), graph_out->op_registry());
+  std::unordered_map<const Node*, Node*> node_images;
+  TF_RETURN_IF_ERROR(MakeGraphForOutsideCompilationSends(
+      *graph_out, &pruned_graph, &shape_refiner, &node_images, library));
+
+  for (auto& subgraph_entry : subgraphs_) {
+    Subgraph& subgraph = subgraph_entry.second;
+    // Find all the recv_at_host nodes in this subgraph.
+    std::vector<string> outside_compilation_names;
+    subgraph.GetOutsideCompilationSubgraphNames(&outside_compilation_names);
+    std::unordered_set<string> recv_at_host_names;
+    for (const auto& name : outside_compilation_names) {
+      Node* recv_node = subgraph.GetRecvAtHostNode(name);
+      if (recv_node != nullptr) {
+        recv_at_host_names.insert(recv_node->name());
+      }
+    }
+    // For each send_from_host node, do as much shape inference as possible
+    // without knowing the shape of the recv_at_host nodes, and store the
+    // result, along with enough information to complete the job at compile time
+    // once the recv_at_host shapes are known.
+    for (const auto& name : outside_compilation_names) {
+      Node* send_node = subgraph.GetSendFromHostNode(name);
+      std::vector<TensorShapeProto> static_shape;
+      std::unique_ptr<GraphDef> graphdef;
+      if (send_node != nullptr) {
+        TF_RETURN_IF_ERROR(DoStaticShapeInferenceForOutsideCompilationSend(
+            *pruned_graph, shape_refiner, recv_at_host_names,
+            node_images[send_node], library, &static_shape, &graphdef));
+        if (graphdef == nullptr) {
+          VLOG(2) << "Send node  " << send_node->name() << " shapes";
+          for (int i = 0; i < static_shape.size(); ++i) {
+            VLOG(2) << static_shape[i].DebugString();
+          }
+        } else {
+          VLOG(2) << "Send node " << send_node->name() << " graph\n"
+                  << graphdef->DebugString();
+        }
+      }
+      TF_RETURN_IF_ERROR(
+          subgraph.AddShapeInferenceInfo(name, static_shape, graphdef.get()));
+    }
+    if (!outside_compilation_names.empty()) {
+      TF_RETURN_IF_ERROR(subgraph.ReplaceFunctionDef(library));
+    }
+  }
+
+  return Status::OK();
+}
+
+Status Encapsulator::BuildOutputGraph(bool parallel_checking, Graph* graph_out,
+                                      FunctionLibraryDefinition* library) {
+  // Map from nodes in the input graph to nodes in the output graph.
+  std::unordered_map<const Node*, Node*> node_images;
+
+  TF_RETURN_IF_ERROR(
+      CopyNodesToOutputGraph(parallel_checking, graph_out, &node_images));
+  TF_RETURN_IF_ERROR(
+      AddFunctionCallNodes(node_images, parallel_checking, graph_out));
+  TF_RETURN_IF_ERROR(AddOutsideCompilationHostIONodes(node_images, graph_out));
+  TF_RETURN_IF_ERROR(
+      AddEdgesToOutputGraph(node_images, parallel_checking, graph_out));
+
+  TF_RETURN_IF_ERROR(
+      GetShapeInfoForOutsideCompilationSends(graph_out, library));
+
+  return Status::OK();
 }
 
 }  // anonymous namespace
 
 Status EncapsulateSubgraphsInFunctions(
-    string group_attribute, const Graph& graph_in,
-    const RewriteSubgraphFn& rewrite_subgraph_fn, bool parallel_checking,
-    bool reuse_existing_functions, std::unique_ptr<Graph>* graph_out,
-    FunctionLibraryDefinition* library) {
+    string group_attribute, string outside_compilation_attribute,
+    const Graph& graph_in, const RewriteSubgraphFn& rewrite_subgraph_fn,
+    bool parallel_checking, bool reuse_existing_functions,
+    std::unique_ptr<Graph>* graph_out, FunctionLibraryDefinition* library) {
   Status s;
 
-  Encapsulator encapsulator(std::move(group_attribute), &graph_in);
-  s = encapsulator.SplitIntoSubgraphs();
-  if (!s.ok()) return s;
+  Encapsulator encapsulator(std::move(group_attribute),
+                            std::move(outside_compilation_attribute),
+                            &graph_in);
+  TF_RETURN_IF_ERROR(encapsulator.SplitIntoSubgraphs());
 
-  s = encapsulator.BuildFunctionDefs(rewrite_subgraph_fn,
-                                     reuse_existing_functions, library);
-  if (!s.ok()) return s;
+  TF_RETURN_IF_ERROR(encapsulator.BuildFunctionDefs(
+      rewrite_subgraph_fn, reuse_existing_functions, library));
 
   std::unique_ptr<Graph> out(new Graph(library));
   out->set_versions(graph_in.versions());
-  s = encapsulator.BuildOutputGraph(parallel_checking, out.get());
-  if (!s.ok()) return s;
+  TF_RETURN_IF_ERROR(
+      encapsulator.BuildOutputGraph(parallel_checking, out.get(), library));
 
   *graph_out = std::move(out);
-  return s;
+  return Status::OK();
 }
 
 // Finds the types of the _Arg nodes, indexed by position.
@@ -690,9 +2112,9 @@ Status EncapsulateSubgraphsPass::Run(
   };
 
   TF_RETURN_IF_ERROR(EncapsulateSubgraphsInFunctions(
-      kXlaClusterAttr, **options.graph, rewrite_subgraph,
-      flags->tf_xla_parallel_checking, /*reuse_existing_functions=*/false,
-      &graph_out, library));
+      kXlaClusterAttr, kXlaOutsideCompilationAttr, **options.graph,
+      rewrite_subgraph, flags->tf_xla_parallel_checking,
+      /*reuse_existing_functions=*/false, &graph_out, library));
 
   if (VLOG_IS_ON(1)) {
     dump_graph::DumpGraphToFile("after_encapsulate_subgraphs", *graph_out,
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h
index b0987f76c91ed48df52fab303ea6052ebd8fd336..34be4409a381197d2191e083727aa8d48ab8cd63 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h
@@ -48,6 +48,16 @@ typedef std::function<Status(
 // 'group_attribute' must be a string valued-attribute that names the new
 // functions to introduce.
 //
+// 'outside_compilation_attribute' must be a string-valued attribute that is
+// used to tag nodes within a subgraph to be part of an 'outside_compilation'
+// cluster within the subgraph. A cluster is formed from the set of nodes with
+// the same value of outside_compilation_subgraph and group_attribute. The nodes
+// in an outside_compilation cluster are left in the original graph. Edges
+// crossing from the subgraph to an outside_compilation cluster nested in the
+// subgraph are lifted into a SendToHost/RecvAtHost pair of nodes, and edges
+// crossing from an outside_compilation cluster into its enclosing subgraph are
+// lifted into a SendFromHost/RecvFromHost pair of nodes.
+//
 // If 'rewrite_subgraph_fn' is set, it is applied to each subgraph before
 // function conversion.
 //
@@ -64,10 +74,10 @@ typedef std::function<Status(
 // dep from B. Originally D must run after C, post-transformation this
 // dependency is lost.
 Status EncapsulateSubgraphsInFunctions(
-    string group_attribute, const Graph& graph_in,
-    const RewriteSubgraphFn& rewrite_subgraph_fn, bool parallel_checking,
-    bool reuse_existing_functions, std::unique_ptr<Graph>* graph_out,
-    FunctionLibraryDefinition* library);
+    string group_attribute, string outside_compilation_attribute,
+    const Graph& graph_in, const RewriteSubgraphFn& rewrite_subgraph_fn,
+    bool parallel_checking, bool reuse_existing_functions,
+    std::unique_ptr<Graph>* graph_out, FunctionLibraryDefinition* library);
 
 // The attribute that marks function calls produced by the encapsulate
 // subgraphs pass and that should in turn be compiled via _XlaLaunch operators.
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index 4a1dbaf05dc7824835f3567c6abcf48222720230..aed9cae0f1799c4524da8ee309344849798755d5 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -29,17 +29,181 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+template <class Tkey, class Tvalue>
+bool EqualProtoMap(const ::tensorflow::protobuf::Map<Tkey, Tvalue>& a,
+                   const ::tensorflow::protobuf::Map<Tkey, Tvalue>& b,
+                   const std::function<string(const Tkey&)>& key_to_string,
+                   const std::function<string(const Tvalue&)>& value_to_string,
+                   const std::function<bool(const Tkey&, const Tvalue&,
+                                            const Tvalue&)>& compare,
+                   const string& map_name, string* diff) {
+  for (const auto& elt_a : a) {
+    const auto iter = b.find(elt_a.first);
+    if (iter == b.end()) {
+      if (diff) {
+        *diff = strings::StrCat(
+            map_name, " expected: contains element with key '",
+            key_to_string(elt_a.first), "' got: map has no such element");
+      }
+      return false;
+    }
+    if (!compare(elt_a.first, elt_a.second, iter->second)) {
+      if (diff) {
+        *diff = strings::StrCat(map_name, " expected: element with key '",
+                                key_to_string(elt_a.first), " has value '",
+                                value_to_string(elt_a.second), "' got: '",
+                                value_to_string(iter->second), "'");
+      }
+      return false;
+    }
+  }
+  for (const auto& elt_b : b) {
+    const auto iter = a.find(elt_b.first);
+    if (iter == a.end()) {
+      if (diff) {
+        *diff = strings::StrCat(map_name, " got: contains element with key '",
+                                key_to_string(elt_b.first),
+                                "' expected: map has no such element");
+      }
+      return false;
+    }
+  }
+  return true;
+}
+
+bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b,
+                          const string& diff_preamble, string* diff) {
+  if (a.op() != b.op()) {
+    if (diff) {
+      *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(),
+                              ", expected op '", a.op(), "' got '", b.op());
+    }
+    return false;
+  }
+  if (a.device() != b.device()) {
+    if (diff) {
+      *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(),
+                              ", expected device '", a.device(), "' got '",
+                              b.device());
+    }
+    return false;
+  }
+  if (a.input_size() != b.input_size()) {
+    if (diff) {
+      *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(),
+                              ", expected ", a.input_size(), " inputs got ",
+                              b.input_size(), " expected:\n", a.DebugString(),
+                              "\ngot:\n", b.DebugString());
+    }
+    return false;
+  }
+  for (int i = 0; i < a.input_size(); ++i) {
+    if (a.input(i) != b.input(i)) {
+      if (diff) {
+        *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(),
+                                " input ", i, ", expected ", a.input(i),
+                                " got ", b.input(i), " expected:\n",
+                                a.DebugString(), "\ngot:\n", b.DebugString());
+      }
+      return false;
+    }
+  }
+  return EqualProtoMap<string, AttrValue>(
+      a.attr(), b.attr(), [](const string& s) { return s; },
+      [](const AttrValue& v) { return v.DebugString(); },
+      [](const string& key, const AttrValue& av, const AttrValue& bv) {
+        if (key == "shape_inference_graph") {
+          // Default serialization of GraphDef is unstable because maps don't
+          // serialize deterministically. Rather than go through the hoops to
+          // turn on deterministic serialization of this attr just for this
+          // test, add logic here to compare determinstically.
+          GraphDef ga;
+          if (!ga.ParseFromString(av.s())) {
+            return false;
+          }
+          GraphDef gb;
+          if (!gb.ParseFromString(bv.s())) {
+            return false;
+          }
+          return EqualGraphDef(ga, gb, nullptr);
+        } else {
+          return av.DebugString() == bv.DebugString();
+        }
+      },
+      strings::StrCat(diff_preamble, " attr mismatch for node ", a.name()),
+      diff);
+}
+
 bool EqualFunctionDef(const FunctionDef& a, const FunctionDef& b,
                       string* diff) {
-  // TODO(phawkins) use a more sophisticated equality test.
-  if (a.DebugString() != b.DebugString()) {
+  if (a.signature().DebugString() != b.signature().DebugString()) {
     if (diff) {
-      *diff = strings::StrCat("Definition mismatch for function ",
+      *diff = strings::StrCat("Signature mismatch for function ",
                               a.signature().name(), ", expected:\n",
-                              a.DebugString());
+                              a.signature().DebugString(), "\ngot:\n",
+                              b.signature().DebugString());
     }
     return false;
   }
+  if (!EqualProtoMap<string, AttrValue>(
+          a.attr(), b.attr(), [](const string& s) { return s; },
+          [](const AttrValue& v) { return v.DebugString(); },
+          [](const string& key, const AttrValue& av, const AttrValue& bv) {
+            return av.DebugString() == bv.DebugString();
+          },
+          strings::StrCat("attr mismatch for function ", a.signature().name()),
+          diff)) {
+    return false;
+  }
+  if (!EqualProtoMap<string, string>(
+          a.ret(), b.ret(), [](const string& s) { return s; },
+          [](const string& s) { return s; },
+          [](const string& key, const string& av, const string& bv) {
+            return av == bv;
+          },
+          strings::StrCat("ret mismatch for function ", a.signature().name()),
+          diff)) {
+    return false;
+  }
+  for (int i = 0; i < a.node_def_size(); ++i) {
+    bool found = false;
+    for (int j = 0; j < b.node_def_size(); ++j) {
+      if (a.node_def(i).name() == b.node_def(j).name()) {
+        if (!EqualFunctionNodeDef(
+                a.node_def(i), b.node_def(j),
+                strings::StrCat("Function ", a.signature().name()), diff)) {
+          return false;
+        }
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      if (diff) {
+        *diff = strings::StrCat("Function ", a.signature().name(),
+                                ", expected: has node '", a.node_def(i).name(),
+                                "' got: no node of that name");
+      }
+      return false;
+    }
+  }
+  for (int i = 0; i < b.node_def_size(); ++i) {
+    bool found = false;
+    for (int j = 0; j < a.node_def_size(); ++j) {
+      if (b.node_def(i).name() == a.node_def(j).name()) {
+        found = true;
+        break;
+      }
+    }
+    if (!found) {
+      if (diff) {
+        *diff = strings::StrCat("Function ", a.signature().name(),
+                                ", got: has node '", b.node_def(i).name(),
+                                "' expected: no node of that name");
+      }
+      return false;
+    }
+  }
   return true;
 }
 
@@ -82,13 +246,66 @@ bool EqualFunctionDefLibrary(const FunctionDefLibrary& expected,
         << diff << "\nActual: " << actual.DebugString();          \
   } while (false)
 
-REGISTER_OP("InputTest").Output("o: float");
-
-REGISTER_OP("UnaryTest").Input("a: float").Output("o: float");
+// TODO(misard): remove these fake registrations once there are real Ops to be
+// compiled.
+REGISTER_OP("_XlaHostCompute")
+    .Input("inputs: Tinputs")
+    .Output("outputs: Toutputs")
+    .Attr("Tinputs: list(type) >= 0")
+    .Attr("Toutputs: list(type) >= 0")
+    .Attr("key: string")
+    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
+
+REGISTER_OP("_XlaSendFromHost")
+    .Input("input: Tinputs")
+    .Attr("Tinputs: list(type) >= 0")
+    .Attr("key: string")
+    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
+
+REGISTER_OP("_XlaRecvAtHost")
+    .Output("output: Toutputs")
+    .Attr("Toutputs: list(type) >= 0")
+    .Attr("key: string")
+    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
+
+REGISTER_OP("InputTest")
+    .Output("o: float")
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+      c->set_output(0, c->UnknownShape());
+      return Status::OK();
+    });
+
+REGISTER_OP("InputTestShaped")
+    .Output("o: float")
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Vector(2));
+      return Status::OK();
+    });
+
+REGISTER_OP("UnaryTest")
+    .Input("a: float")
+    .Output("o: float")
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+      ::tensorflow::shape_inference::ShapeHandle o;
+      TF_RETURN_IF_ERROR(c->Merge(c->UnknownShape(), c->input(0), &o));
+      c->set_output(0, o);
+      return Status::OK();
+    });
 REGISTER_OP("BinaryTest")
     .Input("a: float")
     .Input("b: float")
-    .Output("o: float");
+    .Output("o: float")
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+      ::tensorflow::shape_inference::ShapeHandle o;
+      TF_RETURN_IF_ERROR(c->Merge(c->UnknownShape(), c->input(0), &o));
+      c->set_output(0, o);
+      return Status::OK();
+    });
+REGISTER_OP("BinaryTest2")
+    .Input("a: float")
+    .Input("b: float")
+    .Output("o: float")
+    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
 
 REGISTER_OP("AddNLikeTest")
     .Input("inputs: N * T")
@@ -98,10 +315,58 @@ REGISTER_OP("AddNLikeTest")
     .SetIsCommutative()
     .SetIsAggregate();
 
+Node* NoOp(const GraphDefBuilder::Options& opts) {
+  return ops::SourceOp("NoOp", opts);
+}
+
 Node* Input(const GraphDefBuilder::Options& opts) {
   return ops::SourceOp("InputTest", opts);
 }
 
+Node* InputShaped(const GraphDefBuilder::Options& opts) {
+  return ops::SourceOp("InputTestShaped", opts);
+}
+
+Node* KnownShape(const gtl::ArraySlice<int>& shape,
+                 const GraphDefBuilder::Options& opts) {
+  if (opts.HaveError()) return nullptr;
+  NodeBuilder node_builder(opts.GetNameForOp("Const"), "Const",
+                           opts.op_registry());
+  TensorProto value;
+  value.set_dtype(DT_FLOAT);
+  for (int dim : shape) {
+    value.mutable_tensor_shape()->add_dim()->set_size(dim);
+  }
+  return opts.WithAttr("value", value)
+      .WithAttr("dtype", DT_FLOAT)
+      .FinalizeBuilder(&node_builder);
+}
+
+Node* RecvAtHost(const string& key, const gtl::ArraySlice<DataType>& dtypes,
+                 const GraphDefBuilder::Options& opts) {
+  if (opts.HaveError()) return nullptr;
+  NodeBuilder node_builder(opts.GetNameForOp("_XlaRecvAtHost"),
+                           "_XlaRecvAtHost", opts.op_registry());
+  return opts.WithAttr("Toutputs", dtypes)
+      .WithAttr("key", key)
+      .FinalizeBuilder(&node_builder);
+}
+
+Node* SendFromHost(const string& key, const std::vector<ops::NodeOut>& inputs,
+                   const GraphDefBuilder::Options& opts) {
+  if (opts.HaveError()) return nullptr;
+  NodeBuilder node_builder(opts.GetNameForOp("_XlaSendFromHost"),
+                           "_XlaSendFromHost", opts.op_registry());
+  node_builder.Input(inputs);
+  std::vector<DataType> dtypes;
+  for (const auto& node : inputs) {
+    dtypes.push_back(node.dt);
+  }
+  return opts.WithAttr("key", key)
+      .WithAttr("Tinputs", dtypes)
+      .FinalizeBuilder(&node_builder);
+}
+
 Node* Unary(ops::NodeOut a, const GraphDefBuilder::Options& opts) {
   return ops::UnaryOp("UnaryTest", std::move(a), opts);
 }
@@ -111,6 +376,11 @@ Node* Binary(ops::NodeOut a, ops::NodeOut b,
   return ops::BinaryOp("BinaryTest", std::move(a), std::move(b), opts);
 }
 
+Node* BinaryUnknownShape(ops::NodeOut a, ops::NodeOut b,
+                         const GraphDefBuilder::Options& opts) {
+  return ops::BinaryOp("BinaryTest2", std::move(a), std::move(b), opts);
+}
+
 Node* AddNLike(const std::vector<ops::NodeOut>& inputs,
                const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
@@ -145,7 +415,7 @@ Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library) {
   if (!s.ok()) return s;
 
   std::unique_ptr<Graph> graph_out;
-  s = EncapsulateSubgraphsInFunctions("_encapsulate", *graph,
+  s = EncapsulateSubgraphsInFunctions("_encapsulate", "_outside", *graph,
                                       /*rewrite_subgraph_fn=*/{},
                                       /*parallel_checking=*/false,
                                       /*reuse_existing_functions=*/false,
@@ -178,6 +448,7 @@ TEST(EncapsulateSubgraphsTest, NoFunctions) {
   FunctionDefLibrary library_out = library_in;
   TF_EXPECT_OK(Encapsulate(&graphdef_out, &library_out));
 
+  // If there are no marked nodes, funcification should be a no-op.
   TF_EXPECT_GRAPH_EQ(graphdef_in, graphdef_out);
   TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_in, library_out);
 }
@@ -230,7 +501,6 @@ TEST(EncapsulateSubgraphsTest, OneFunction) {
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
-  // If there are no marked nodes, funcification should be a no-op.
   TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
   TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
 }
@@ -342,9 +612,9 @@ TEST(EncapsulateSubgraphsTest, InputDeduplication) {
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
   std::unique_ptr<Graph> graph;
   TF_ASSERT_OK(EncapsulateSubgraphsInFunctions(
-      "_cluster", graph_before_encapsulation, /*rewrite_subgraph_fn=*/{},
-      /*parallel_checking=*/false, /*reuse_existing_functions=*/false, &graph,
-      &library));
+      "_cluster", "_outside", graph_before_encapsulation,
+      /*rewrite_subgraph_fn=*/{}, /*parallel_checking=*/false,
+      /*reuse_existing_functions=*/false, &graph, &library));
 
   std::vector<string> expected_nodes = {"cluster1", "cluster2", "mul", "x"};
   EXPECT_EQ(expected_nodes, GraphNodes(*graph));
@@ -374,9 +644,9 @@ TEST(EncapsulateSubgraphsTest, ParallelChecking) {
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
   std::unique_ptr<Graph> graph;
   TF_ASSERT_OK(EncapsulateSubgraphsInFunctions(
-      "_cluster", graph_before_encapsulation, /*rewrite_subgraph_fn=*/{},
-      /*parallel_checking=*/true, /*reuse_existing_functions=*/false, &graph,
-      &library));
+      "_cluster", "_outside", graph_before_encapsulation,
+      /*rewrite_subgraph_fn=*/{}, /*parallel_checking=*/true,
+      /*reuse_existing_functions=*/false, &graph, &library));
 
   std::vector<string> expected_nodes = {
       "add1", "add2", "cluster1", "cluster1_parallel_check/_0",
@@ -398,5 +668,978 @@ TEST(EncapsulateSubgraphsTest, ParallelChecking) {
   EXPECT_EQ(expected_edges, GraphEdges(*graph));
 }
 
+const Node* FindNodeByName(const Graph& graph, const string& name) {
+  for (const Node* node : graph.nodes()) {
+    if (node->name() == name) return node;
+  }
+  return nullptr;
+}
+
+bool HasGuaranteeConstAttr(const Node& n) {
+  bool is_guaranteed_constant = false;
+  if (!GetNodeAttr(n.attrs(), "_is_guaranteed_constant",
+                   &is_guaranteed_constant)
+           .ok()) {
+    return false;
+  }
+  return is_guaranteed_constant;
+}
+
+TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Simple) {
+  Scope root = Scope::NewRootScope().ExitOnError().WithDevice(
+      "/job:localhost/replica:0/task:0/cpu:0");
+  auto x1 = ops::Placeholder(root.WithOpName("x1"), DT_FLOAT);
+  auto const_x2 = ops::Const(root.WithOpName("const_x2"), 10.0f);
+  auto const_guarantee_x1 =
+      ops::GuaranteeConst(root.WithOpName("const_guarantee_x1"), x1);
+  auto add1 = ops::Add(root.WithOpName("add1"), const_guarantee_x1, const_x2);
+  add1.node()->AddAttr("_encapsulate", "encapsulate1");
+
+  Graph graph_before(OpRegistry::Global());
+  TF_ASSERT_OK(root.ToGraph(&graph_before));
+
+  std::unique_ptr<Graph> graph_after;
+  FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  int guaranteed_consts = 0;
+  TF_ASSERT_OK(EncapsulateSubgraphsInFunctions(
+      "_encapsulate", "_outside", graph_before,
+      /*rewrite_subgraph_fn=*/
+      [&guaranteed_consts](std::unique_ptr<Graph>* graph_ptr,
+                           std::vector<int>* input_permutation,
+                           std::vector<int>* output_permutation,
+                           NodeDef* call_def) {
+        Graph* graph = graph_ptr->get();
+        for (const Node* n : graph->nodes()) {
+          if (n->type_string() == "_Arg" &&
+              StringPiece(n->name()).starts_with("const")) {
+            ++guaranteed_consts;
+            EXPECT_TRUE(HasGuaranteeConstAttr(*n));
+          } else {
+            EXPECT_FALSE(HasGuaranteeConstAttr(*n));
+          }
+        }
+        return Status::OK();
+      },
+      /*parallel_checking=*/false,
+      /*reuse_existing_functions=*/false, &graph_after, &library));
+  EXPECT_EQ(2, guaranteed_consts);
+}
+
+TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Add) {
+  Scope root = Scope::NewRootScope().ExitOnError().WithDevice(
+      "/job:localhost/replica:0/task:0/cpu:0");
+  auto x1 = ops::Placeholder(root.WithOpName("x1"), DT_FLOAT);
+  auto x2 = ops::Placeholder(root.WithOpName("x2"), DT_FLOAT);
+  auto const_guarantee_x1 =
+      ops::GuaranteeConst(root.WithOpName("const_guarantee_x1"), x1);
+  auto const_guarantee_x2 =
+      ops::GuaranteeConst(root.WithOpName("const_guarantee_x2"), x2);
+  auto const_guarantee_add1 = ops::Add(root.WithOpName("const_guarantee_add1"),
+                                       const_guarantee_x1, const_guarantee_x2);
+  auto add2 = ops::Add(root.WithOpName("add2"), const_guarantee_x1, x2);
+  auto mul1 = ops::Mul(root.WithOpName("mul1"), const_guarantee_add1, add2);
+  mul1.node()->AddAttr("_encapsulate", "encapsulate1");
+
+  Graph graph_before(OpRegistry::Global());
+  TF_ASSERT_OK(root.ToGraph(&graph_before));
+
+  std::unique_ptr<Graph> graph_after;
+  FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  int guaranteed_consts = 0;
+  TF_ASSERT_OK(EncapsulateSubgraphsInFunctions(
+      "_encapsulate", "_outside", graph_before,
+      /*rewrite_subgraph_fn=*/
+      [&guaranteed_consts](std::unique_ptr<Graph>* graph_ptr,
+                           std::vector<int>* input_permutation,
+                           std::vector<int>* output_permutation,
+                           NodeDef* call_def) {
+        Graph* graph = graph_ptr->get();
+        for (const Node* n : graph->nodes()) {
+          if (n->type_string() == "_Arg" &&
+              StringPiece(n->name()).starts_with("const")) {
+            ++guaranteed_consts;
+            EXPECT_TRUE(HasGuaranteeConstAttr(*n));
+          } else {
+            EXPECT_FALSE(HasGuaranteeConstAttr(*n));
+          }
+        }
+        return Status::OK();
+      },
+      /*parallel_checking=*/false,
+      /*reuse_existing_functions=*/false, &graph_after, &library));
+  // Only 1 runtime const, which is const_guarantee_add1. Add2 has one const
+  // and another non-const, so overall non-const.
+  EXPECT_EQ(1, guaranteed_consts);
+}
+
+// Test with one function to transform and one outside_compilation cluster.
+TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    *library.add_function() = test::function::XTimesTwo();
+
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = Input(b1.opts().WithName("A"));
+    Node* b = Input(b1.opts().WithName("B"));
+    // Give nodes 'c' and 'd' names that collide after lowercasing.
+    Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
+    Node* d = Binary(b, c,
+                     b1.opts().WithName("c").WithControlInput(c).WithAttr(
+                         "_encapsulate", "F1"));
+    Node* e = Binary(c, d,
+                     b1.opts()
+                         .WithName("E")
+                         .WithControlInputs({b, d})
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    Node* f = Binary(c, e,
+                     b1.opts().WithName("F").WithControlInput(e).WithAttr(
+                         "_encapsulate", "F1"));
+    Binary(a, f, b1.opts().WithName("G").WithControlInput(e));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  string shape_string_expected;
+  {
+    GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
+    Node* recv =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
+                   shape.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
+                     shape.opts().WithName("E"));
+    SendFromHost("host_compute_channel_F1_O1", {e},
+                 shape.opts().WithName("outside_compilation_F1_O1_send"));
+    GraphDef shape_graph;
+    TF_EXPECT_OK(shape.ToGraphDef(&shape_graph));
+    EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected));
+  }
+
+  *library_expected.add_function() = test::function::XTimesTwo();
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
+      {
+          {{"C"}, "UnaryTest", {"a_0_arg"}},
+          {{"c"}, "BinaryTest", {"b_0_arg", "C:o:0"}, {}, {"C"}},
+          {{"F"},
+           "BinaryTest",
+           {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"},
+           {},
+           {"outside_compilation_O1_host_compute"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
+           {"C:o:0", "c:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", shape_string_expected},
+            {"shapes", gtl::ArraySlice<DataType>({})}},
+           {"c"}},
+      },
+      {{"f_0_retval", "F:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = Input(b2.opts().WithName("A"));
+    Node* b = Input(b2.opts().WithName("B"));
+
+    NodeBuilder node_builder("F1", "F1", lib_def.get());
+    node_builder.Input(a).Input(b);
+    Node* call = b2.opts().FinalizeBuilder(&node_builder);
+
+    Node* recv =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
+                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
+                     b2.opts().WithName("E").WithControlInputs({recv, b}));
+    Node* send = SendFromHost("host_compute_channel_F1_O1", {e},
+                              b2.opts()
+                                  .WithName("outside_compilation_F1_O1_send")
+                                  .WithControlInput(e));
+
+    Node* s = NoOp(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}));
+
+    Binary(a, call, b2.opts().WithName("G").WithControlInputs({s, e}));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
+// Test with one function to transform and two outside_compilation clusters.
+TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = Input(b1.opts().WithName("A"));
+    Node* b = Input(b1.opts().WithName("B"));
+    Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
+    Node* d =
+        Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1"));
+    Node* e = Binary(c, d,
+                     b1.opts()
+                         .WithName("E")
+                         .WithControlInputs({b, d})
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    Node* f = Binary(c, e,
+                     b1.opts().WithName("F").WithControlInput(e).WithAttr(
+                         "_encapsulate", "F1"));
+    Node* g = Binary(e, f,
+                     b1.opts()
+                         .WithName("G")
+                         .WithControlInputs({e, f})
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O2"));
+    Node* h = Binary(d, e,
+                     b1.opts()
+                         .WithName("H")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O2"));
+    Node* i = Unary(h, b1.opts().WithName("I").WithAttr("_encapsulate", "F1"));
+    Binary(g, i, b1.opts().WithName("J"));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  string shape_string_expected_1;
+  {
+    GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
+    Node* recv =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
+                   shape1.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
+                     shape1.opts().WithName("E"));
+    SendFromHost("host_compute_channel_F1_O1", {e},
+                 shape1.opts().WithName("outside_compilation_F1_O1_send"));
+    GraphDef shape1_graph;
+    TF_EXPECT_OK(shape1.ToGraphDef(&shape1_graph));
+    EXPECT_TRUE(shape1_graph.SerializeToString(&shape_string_expected_1));
+  }
+
+  string shape_string_expected_2;
+  {
+    GraphDefBuilder shape2(GraphDefBuilder::kFailImmediately);
+    Node* recv1 =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
+                   shape2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
+                     shape2.opts().WithName("E"));
+    Node* recv2 =
+        RecvAtHost("host_compute_channel_F1_O2", {DT_FLOAT, DT_FLOAT},
+                   shape2.opts().WithName("outside_compilation_F1_O2_recv"));
+    Node* h = Binary(ops::NodeOut(recv2, 0), e, shape2.opts().WithName("H"));
+    SendFromHost("host_compute_channel_F1_O2", {h},
+                 shape2.opts().WithName("outside_compilation_F1_O2_send"));
+    GraphDef shape2_graph;
+    TF_EXPECT_OK(shape2.ToGraphDef(&shape2_graph));
+    EXPECT_TRUE(shape2_graph.SerializeToString(&shape_string_expected_2));
+  }
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"i_0_retval:float"}, {},
+      {
+          {{"C"}, "UnaryTest", {"a_0_arg"}},
+          {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}, {}},
+          {{"I"},
+           "UnaryTest",
+           {"outside_compilation_O2_host_compute:outputs:0"}},
+          {{"F"},
+           "BinaryTest",
+           {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"},
+           {},
+           {"outside_compilation_O1_host_compute"}},
+          {{"outside_compilation_O2_host_compute"},
+           "_XlaHostCompute",
+           {"D:o:0", "F:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F1_O2"},
+            {"shape_inference_graph", shape_string_expected_2},
+            {"shapes", gtl::ArraySlice<DataType>({})}},
+           {"F"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
+           {"C:o:0", "D:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", shape_string_expected_1},
+            {"shapes", gtl::ArraySlice<DataType>({})}},
+           {"D"}},
+      },
+      {{"i_0_retval", "I:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = Input(b2.opts().WithName("A"));
+    Node* b = Input(b2.opts().WithName("B"));
+
+    NodeBuilder node_builder("F1", "F1", lib_def.get());
+    node_builder.Input(a).Input(b);
+    Node* call = b2.opts().FinalizeBuilder(&node_builder);
+
+    Node* recv1 =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
+                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
+                     b2.opts().WithName("E").WithControlInputs({recv1, b}));
+    Node* send1 = SendFromHost("host_compute_channel_F1_O1", {e},
+                               b2.opts()
+                                   .WithName("outside_compilation_F1_O1_send")
+                                   .WithControlInput(e));
+
+    Node* recv2 =
+        RecvAtHost("host_compute_channel_F1_O2", {DT_FLOAT, DT_FLOAT},
+                   b2.opts().WithName("outside_compilation_F1_O2_recv"));
+    Node* g = Binary(e, ops::NodeOut(recv2, 1),
+                     b2.opts().WithName("G").WithControlInputs({recv2, e}));
+    Node* h = Binary(ops::NodeOut(recv2, 0), e, b2.opts().WithName("H"));
+    Node* send2 =
+        SendFromHost("host_compute_channel_F1_O2", {h},
+                     b2.opts().WithName("outside_compilation_F1_O2_send"));
+
+    Node* s = NoOp(b2.opts()
+                       .WithName("F1_sequencer")
+                       .WithControlInputs({recv1, send1, recv2, send2}));
+
+    Binary(g, call, b2.opts().WithName("J").WithControlInput(s));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
+// Test with two functions to transform, each with one outside_compilation
+// cluster.
+TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = InputShaped(b1.opts().WithName("A"));
+    Node* b = InputShaped(b1.opts().WithName("B"));
+    Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
+    Node* d =
+        Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1"));
+    Node* e = Binary(c, d,
+                     b1.opts()
+                         .WithName("E")
+                         .WithControlInputs({b, d})
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    Node* f = Binary(c, e,
+                     b1.opts().WithName("F").WithControlInput(e).WithAttr(
+                         "_encapsulate", "F1"));
+    Node* g = Binary(e, f,
+                     b1.opts().WithName("G").WithControlInputs({e, f}).WithAttr(
+                         "_encapsulate", "F2"));
+    Node* h = Binary(d, g,
+                     b1.opts()
+                         .WithName("H")
+                         .WithAttr("_encapsulate", "F2")
+                         .WithAttr("_outside", "O1"));
+    Node* i =
+        Binary(f, h, b1.opts().WithName("I").WithAttr("_encapsulate", "F2"));
+    Binary(g, i, b1.opts().WithName("J"));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  string shape_string_expected;
+  {
+    GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
+    Node* recv =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
+                   shape.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
+                     shape.opts().WithName("E"));
+    SendFromHost("host_compute_channel_F1_O1", {e},
+                 shape.opts().WithName("outside_compilation_F1_O1_send"));
+    GraphDef shape_graph;
+    TF_EXPECT_OK(shape.ToGraphDef(&shape_graph));
+    EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected));
+  }
+
+  TensorShapeProto shape_proto_expected;
+  shape_proto_expected.add_dim()->set_size(2);
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"a_0_arg:float", "b_0_arg:float"},
+      {"f_0_retval:float", "d_0_retval:float"}, {},
+      {
+          {{"C"}, "UnaryTest", {"a_0_arg"}},
+          {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
+          {{"F"},
+           "BinaryTest",
+           {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"},
+           {},
+           {"outside_compilation_O1_host_compute"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
+           {"C:o:0", "D:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", shape_string_expected},
+            {"shapes", gtl::ArraySlice<DataType>({})}},
+           {"D"}},
+      },
+      {{"d_0_retval", "D:o:0"}, {"f_0_retval", "F:o:0"}});
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F2", {"e_0_arg:float", "f_0_arg:float"},
+      {"g_0_retval:float", "i_0_retval:float"}, {},
+      {
+          {{"G"}, "BinaryTest", {"e_0_arg", "f_0_arg"}},
+          {{"I"},
+           "BinaryTest",
+           {"f_0_arg", "outside_compilation_O1_host_compute:outputs:0"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
+           {"G:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F2_O1"},
+            {"shape_inference_graph", ""},
+            {"shapes",
+             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}}},
+      },
+      {{"g_0_retval", "G:o:0"}, {"i_0_retval", "I:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = InputShaped(b2.opts().WithName("A"));
+    Node* b = InputShaped(b2.opts().WithName("B"));
+
+    Node* recv1 =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT, DT_FLOAT},
+                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
+                     b2.opts().WithName("E").WithControlInputs({recv1, b}));
+    Node* send1 = SendFromHost("host_compute_channel_F1_O1", {e},
+                               b2.opts()
+                                   .WithName("outside_compilation_F1_O1_send")
+                                   .WithControlInput(e));
+    NodeBuilder node_builder1("F1", "F1", lib_def.get());
+    node_builder1.Input(a).Input(b);
+    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
+    Node* s1 = NoOp(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}));
+
+    Node* recv2 =
+        RecvAtHost("host_compute_channel_F2_O1", {DT_FLOAT},
+                   b2.opts().WithName("outside_compilation_F2_O1_recv"));
+    Node* h = Binary(ops::NodeOut(call1, 1), recv2,
+                     b2.opts().WithName("H").WithControlInput(s1));
+    Node* send2 =
+        SendFromHost("host_compute_channel_F2_O1", {h},
+                     b2.opts().WithName("outside_compilation_F2_O1_send"));
+
+    NodeBuilder node_builder2("F2", "F2", lib_def.get());
+    node_builder2.Input(e).Input(call1);
+    Node* call2 = b2.opts()
+                      .WithControlInputs({s1, e, call1})
+                      .FinalizeBuilder(&node_builder2);
+    Node* s2 = NoOp(
+        b2.opts().WithName("F2_sequencer").WithControlInputs({recv2, send2}));
+    Binary(call2, ops::NodeOut(call2, 1),
+           b2.opts().WithName("J").WithControlInput(s2));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
+// Test with one outside_compilation cluster that has no inputs from the
+// compiled subgraph.
+TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = InputShaped(b1.opts().WithName("A"));
+    Node* b = Input(b1.opts().WithName("B"));
+    Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
+    Node* d =
+        Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1"));
+    Node* e = Unary(a, b1.opts()
+                           .WithName("E")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
+    Node* f =
+        Binary(d, e, b1.opts().WithName("F").WithAttr("_encapsulate", "F1"));
+    Unary(f, b1.opts().WithName("G"));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  TensorShapeProto shape_proto_expected;
+  shape_proto_expected.add_dim()->set_size(2);
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
+      {
+          {{"C"}, "UnaryTest", {"a_0_arg"}},
+          {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
+          {{"F"},
+           "BinaryTest",
+           {"D:o:0", "outside_compilation_O1_host_compute:outputs:0"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
+           {},
+           {{"Tinputs", gtl::ArraySlice<DataType>({})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", ""},
+            {"shapes",
+             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}}},
+      },
+      {{"f_0_retval", "F:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = InputShaped(b2.opts().WithName("A"));
+    Node* b = Input(b2.opts().WithName("B"));
+
+    Node* e = Unary(a, b2.opts().WithName("E"));
+    Node* send1 =
+        SendFromHost("host_compute_channel_F1_O1", {e},
+                     b2.opts().WithName("outside_compilation_F1_O1_send"));
+    NodeBuilder node_builder1("F1", "F1", lib_def.get());
+    node_builder1.Input(a).Input(b);
+    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
+    Node* s1 = NoOp(b2.opts().WithName("F1_sequencer").WithControlInput(send1));
+
+    Unary(call1, b2.opts().WithName("G").WithControlInput(s1));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
+// Test with one outside_compilation cluster that has no data inputs but has a
+// control input from the compiled subgraph.
+TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = InputShaped(b1.opts().WithName("A"));
+    Node* b = Input(b1.opts().WithName("B"));
+    Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
+    Node* d =
+        Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1"));
+    Node* e = Unary(a, b1.opts()
+                           .WithName("E")
+                           .WithControlInput(d)
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
+    Node* f =
+        Binary(d, e, b1.opts().WithName("F").WithAttr("_encapsulate", "F1"));
+    Unary(f, b1.opts().WithName("G"));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  TensorShapeProto shape_proto_expected;
+  shape_proto_expected.add_dim()->set_size(2);
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
+      {
+          {{"C"}, "UnaryTest", {"a_0_arg"}},
+          {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
+          {{"F"},
+           "BinaryTest",
+           {"D:o:0", "outside_compilation_O1_host_compute:outputs:0"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
+           {},
+           {{"Tinputs", gtl::ArraySlice<DataType>({})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", ""},
+            {"shapes",
+             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}},
+           {"D"}},
+      },
+      {{"f_0_retval", "F:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = InputShaped(b2.opts().WithName("A"));
+    Node* b = Input(b2.opts().WithName("B"));
+
+    Node* recv1 =
+        RecvAtHost("host_compute_channel_F1_O1", {},
+                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = Unary(a, b2.opts().WithName("E").WithControlInput(recv1));
+    Node* send1 =
+        SendFromHost("host_compute_channel_F1_O1", {e},
+                     b2.opts().WithName("outside_compilation_F1_O1_send"));
+    NodeBuilder node_builder1("F1", "F1", lib_def.get());
+    node_builder1.Input(a).Input(b);
+    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
+    Node* s1 = NoOp(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}));
+
+    Unary(call1, b2.opts().WithName("G").WithControlInput(s1));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
+// Test with one outside_compilation cluster that has no outputs from the
+// compiled subgraph.
+TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = Input(b1.opts().WithName("A"));
+    Node* b = Input(b1.opts().WithName("B"));
+    Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
+    Node* d =
+        Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1"));
+    Node* e = Unary(d, b1.opts()
+                           .WithName("E")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
+    Node* f = Unary(d, b1.opts().WithName("F").WithAttr("_encapsulate", "F1"));
+    Binary(e, f, b1.opts().WithName("G"));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
+      {
+          {{"C"}, "UnaryTest", {"a_0_arg"}},
+          {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
+          {{"F"}, "UnaryTest", {"D:o:0"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
+           {"D:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", ""},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})}}},
+      },
+      {{"f_0_retval", "F:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = Input(b2.opts().WithName("A"));
+    Node* b = Input(b2.opts().WithName("B"));
+
+    Node* recv1 =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT},
+                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = Unary(recv1, b2.opts().WithName("E"));
+    NodeBuilder node_builder1("F1", "F1", lib_def.get());
+    node_builder1.Input(a).Input(b);
+    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
+    Node* s1 = NoOp(b2.opts().WithName("F1_sequencer").WithControlInput(recv1));
+
+    Binary(e, call1, b2.opts().WithName("G").WithControlInput(s1));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
+// Test with one outside_compilation cluster that has no data outputs but has a
+// control output to the compiled subgraph.
+TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = Input(b1.opts().WithName("A"));
+    Node* b = Input(b1.opts().WithName("B"));
+    Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
+    Node* d =
+        Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1"));
+    Node* e = Unary(d, b1.opts()
+                           .WithName("E")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
+    Node* f = Unary(d, b1.opts().WithName("F").WithControlInput(e).WithAttr(
+                           "_encapsulate", "F1"));
+    Binary(e, f, b1.opts().WithName("G"));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
+      {
+          {{"C"}, "UnaryTest", {"a_0_arg"}},
+          {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
+          {{"F"},
+           "UnaryTest",
+           {"D:o:0"},
+           {},
+           {"outside_compilation_O1_host_compute"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
+           {"D:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", ""},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})}}},
+      },
+      {{"f_0_retval", "F:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = Input(b2.opts().WithName("A"));
+    Node* b = Input(b2.opts().WithName("B"));
+
+    Node* recv1 =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT},
+                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = Unary(recv1, b2.opts().WithName("E"));
+    Node* send1 = SendFromHost("host_compute_channel_F1_O1", {},
+                               b2.opts()
+                                   .WithName("outside_compilation_F1_O1_send")
+                                   .WithControlInput(e));
+    NodeBuilder node_builder1("F1", "F1", lib_def.get());
+    node_builder1.Input(a).Input(b);
+    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
+    Node* s1 = NoOp(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}));
+
+    Binary(e, call1, b2.opts().WithName("G").WithControlInput(s1));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
+// Test with one outside_compilation cluster that has no outputs from the
+// compiled subgraph.
+TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = Input(b1.opts().WithName("A"));
+    Node* b = Input(b1.opts().WithName("B"));
+    Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1"));
+    Node* d =
+        Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1"));
+    Node* e = Unary(a, b1.opts()
+                           .WithName("E")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
+    Node* f = Unary(d, b1.opts().WithName("F").WithAttr("_encapsulate", "F1"));
+    Binary(e, f, b1.opts().WithName("G"));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
+      {
+          {{"C"}, "UnaryTest", {"a_0_arg"}},
+          {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
+          {{"F"}, "UnaryTest", {"D:o:0"}},
+      },
+      {{"f_0_retval", "F:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = Input(b2.opts().WithName("A"));
+    Node* b = Input(b2.opts().WithName("B"));
+
+    Node* e = Unary(a, b2.opts().WithName("E"));
+    NodeBuilder node_builder1("F1", "F1", lib_def.get());
+    node_builder1.Input(a).Input(b);
+    Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
+
+    Binary(e, call1, b2.opts().WithName("G"));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
+// Test for shape inference of outside compilation.
+TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
+  FunctionDefLibrary library;
+  GraphDef graphdef;
+
+  {
+    *library.add_function() = test::function::XTimesTwo();
+
+    GraphDefBuilder b1(GraphDefBuilder::kFailImmediately);
+    Node* a = InputShaped(b1.opts().WithName("A"));
+    Node* b = Input(b1.opts().WithName("B"));
+    // Give nodes 'c' and 'd' names that collide after lowercasing.
+    Node* c = Unary(a, b1.opts().WithName("C"));
+    Node* d = Unary(b, b1.opts().WithName("c").WithControlInput(c).WithAttr(
+                           "_encapsulate", "F1"));
+    Node* e = BinaryUnknownShape(c, d,
+                                 b1.opts()
+                                     .WithName("E")
+                                     .WithControlInputs({b, d})
+                                     .WithAttr("_encapsulate", "F1")
+                                     .WithAttr("_outside", "O1"));
+    Node* f = Binary(c, e,
+                     b1.opts().WithName("F").WithControlInput(e).WithAttr(
+                         "_encapsulate", "F1"));
+    Binary(a, f, b1.opts().WithName("G").WithControlInput(e));
+    TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
+  }
+
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+
+  FunctionDefLibrary library_expected;
+  GraphDef graphdef_expected;
+
+  string shape_string_expected;
+  {
+    GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
+    Node* known = KnownShape({2}, shape.opts().WithName("KnownShape/_0"));
+    Node* recv =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT},
+                   shape.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = BinaryUnknownShape(known, recv, shape.opts().WithName("E"));
+    SendFromHost("host_compute_channel_F1_O1", {e},
+                 shape.opts().WithName("outside_compilation_F1_O1_send"));
+    GraphDef shape_graph;
+    TF_EXPECT_OK(shape.ToGraphDef(&shape_graph));
+    EXPECT_TRUE(shape_graph.SerializeToString(&shape_string_expected));
+  }
+
+  *library_expected.add_function() = test::function::XTimesTwo();
+  *library_expected.add_function() = FunctionDefHelper::Create(
+      "F1", {"b_0_arg:float", "c_0_arg:float"}, {"f_0_retval:float"}, {},
+      {
+          {{"c"}, "UnaryTest", {"b_0_arg"}, {}, {}},
+          {{"F"},
+           "BinaryTest",
+           {"c_0_arg", "outside_compilation_O1_host_compute:outputs:0"},
+           {},
+           {"outside_compilation_O1_host_compute"}},
+          {{"outside_compilation_O1_host_compute"},
+           "_XlaHostCompute",
+           {"c:o:0"},
+           {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", shape_string_expected},
+            {"shapes", gtl::ArraySlice<DataType>({})}},
+           {"c"}},
+      },
+      {{"f_0_retval", "F:o:0"}});
+
+  {
+    std::unique_ptr<FunctionLibraryDefinition> lib_def(
+        new FunctionLibraryDefinition(OpRegistry::Global(), library_expected));
+    GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
+    Node* a = InputShaped(b2.opts().WithName("A"));
+    Node* b = Input(b2.opts().WithName("B"));
+    Node* c = Unary(a, b2.opts().WithName("C"));
+
+    NodeBuilder node_builder("F1", "F1", lib_def.get());
+    node_builder.Input(b).Input(c);
+    Node* call =
+        b2.opts().WithControlInputs({c}).FinalizeBuilder(&node_builder);
+
+    Node* recv =
+        RecvAtHost("host_compute_channel_F1_O1", {DT_FLOAT},
+                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* e = BinaryUnknownShape(
+        c, ops::NodeOut(recv, 0),
+        b2.opts().WithName("E").WithControlInputs({recv, b}));
+    Node* send = SendFromHost("host_compute_channel_F1_O1", {e},
+                              b2.opts()
+                                  .WithName("outside_compilation_F1_O1_send")
+                                  .WithControlInput(e));
+
+    Node* s = NoOp(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}));
+
+    Binary(a, call, b2.opts().WithName("G").WithControlInputs({s, e}));
+    TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
+  }
+
+  TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
+  TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD
index 459a582e157f5ddc63997ca93e7c0294293517d3..9bea5663319c8a25249fdc265cee0191556a7c04 100644
--- a/tensorflow/compiler/jit/kernels/BUILD
+++ b/tensorflow/compiler/jit/kernels/BUILD
@@ -16,7 +16,6 @@ cc_library(
         "//tensorflow/compiler/jit:xla_device",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/tf2xla:xla_local_runtime_context",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
index e481796d9e626fc8cdf36687ad110b0a8a788be0..6353149e4afdf739fe44dd5c76502ef5d98b8477 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/compiler/jit/xla_device.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
-#include "tensorflow/compiler/tf2xla/xla_local_runtime_context.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
@@ -46,7 +45,7 @@ namespace tensorflow {
 // see comment on `AllowsAsynchronousDeallocation()`.
 class XlaAllocator : public xla::DeviceMemoryAllocator {
  public:
-  XlaAllocator(gpu::Platform* platform, OpKernelContext* op_context);
+  XlaAllocator(const gpu::Platform* platform, OpKernelContext* op_context);
   ~XlaAllocator() override;
   xla::StatusOr<gpu::DeviceMemoryBase> Allocate(int device_ordinal, uint64 size,
                                                 bool retry_on_failure) override;
@@ -80,7 +79,8 @@ class XlaAllocator : public xla::DeviceMemoryAllocator {
   std::unordered_map<void*, Tensor> tensors_;
 };
 
-XlaAllocator::XlaAllocator(gpu::Platform* platform, OpKernelContext* op_context)
+XlaAllocator::XlaAllocator(const gpu::Platform* platform,
+                           OpKernelContext* op_context)
     : xla::DeviceMemoryAllocator(platform), op_context_(op_context) {}
 
 XlaAllocator::~XlaAllocator() = default;
@@ -103,7 +103,6 @@ xla::StatusOr<gpu::DeviceMemoryBase> XlaAllocator::Allocate(
   }
   void* data =
       reinterpret_cast<void*>(const_cast<char*>(t.tensor_data().data()));
-  TF_RET_CHECK(data != nullptr);
   tensors_[data] = t;
   return gpu::DeviceMemoryBase(data, size);
 }
@@ -111,7 +110,6 @@ xla::StatusOr<gpu::DeviceMemoryBase> XlaAllocator::Allocate(
 Status XlaAllocator::RegisterArgument(const Tensor* t) {
   void* data =
       reinterpret_cast<void*>(const_cast<char*>(t->tensor_data().data()));
-  TF_RET_CHECK(data != nullptr);
   tensors_[data] = *t;
   return Status::OK();
 }
@@ -251,24 +249,26 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
 
   xla::LocalClient* client = static_cast<xla::LocalClient*>(cache->client());
 
+  // Builds an XLA allocator for the device.
+  XlaAllocator xla_allocator(client->platform(), ctx);
+
   XlaCompiler::Options options;
   options.client = client;
   options.device_type = &cache->device_type();
   options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
   options.graph_def_version = ctx->function_library()->graph_def_version();
   options.allow_cpu_custom_calls = (platform_id_ == gpu::host::kHostPlatformId);
+  options.device_allocator = &xla_allocator;
 
   const XlaCompiler::CompilationResult* kernel;
   xla::LocalExecutable* executable;
+
   OP_REQUIRES_OK(ctx, cache->Compile(options, function_, num_constant_args_,
-                                     variables, ctx, &kernel, &executable));
+                                     variables, ctx, &kernel, &executable,
+                                     /*compile_options=*/nullptr));
 
   VLOG(1) << "Executing XLA Computation...";
 
-  // Builds an XLA allocator for the device.
-  XlaAllocator xla_allocator(client->platform(), ctx);
-  XlaLocalRuntimeContext local_runtime_context;
-
   std::unique_ptr<xla::ShapedBuffer> output;
   // Build xla::ShapedBuffers that point directly to the Tensor buffers.
   std::vector<std::unique_ptr<xla::ShapedBuffer>> arg_buffers;
@@ -291,27 +291,22 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
     gpu::DeviceMemoryBase dmem = gpu::DeviceMemoryBase(
         const_cast<char*>(t->tensor_data().data()), t->tensor_data().size());
 
-    arg_buffers[i] =
-        xla::ShapedBuffer::MakeArrayShapedBuffer(
-            shape, client->platform(), client->default_device_ordinal(), dmem)
-            .ConsumeValueOrDie();
+    const xla::Shape on_device_shape =
+        client->backend().transfer_manager()->HostShapeToDeviceShape(shape);
+    CHECK(xla::ShapeUtil::Equal(shape, on_device_shape))
+        << "On-device shape "
+        << xla::ShapeUtil::HumanStringWithLayout(on_device_shape)
+        << " not the same as on-host shape "
+        << xla::ShapeUtil::HumanStringWithLayout(shape);
+    arg_buffers[i] = xla::MakeUnique<xla::ShapedBuffer>(
+        /*on_host_shape=*/shape, /*on_device_shape=*/shape, client->platform(),
+        client->default_device_ordinal());
+    arg_buffers[i]->set_buffer(dmem, /*index=*/{});
     arg_ptrs[i] = arg_buffers[i].get();
 
     OP_REQUIRES_OK(ctx, xla_allocator.RegisterArgument(t));
   }
 
-  // Make the final parameter point at local_runtime_context.
-  if (kernel->requires_runtime_context) {
-    gpu::DeviceMemoryBase local_runtime_context_dmem(
-        &local_runtime_context, sizeof(local_runtime_context));
-    arg_buffers.push_back(
-        xla::ShapedBuffer::MakeArrayShapedBuffer(
-            xla::ShapeUtil::MakeOpaqueShape(), client->platform(),
-            client->default_device_ordinal(), local_runtime_context_dmem)
-            .ConsumeValueOrDie());
-    arg_ptrs.push_back(arg_buffers.back().get());
-  }
-
   // Execute the computation.
   VLOG(2) << "Executing computation.";
   xla::ExecutableRunOptions run_options;
@@ -323,19 +318,13 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   auto run_result = executable->Run(arg_ptrs, run_options);
   OP_REQUIRES(ctx, run_result.ok(), run_result.status());
 
-  if (local_runtime_context.error) {
-    ctx->CtxFailure(errors::InvalidArgument("Compiled kernel returned error: ",
-                                            local_runtime_context.error_msg));
-    return;
-  }
-
   output = run_result.ConsumeValueOrDie()->release();
   auto elapsed = env->NowMicros() - start_time;
   VLOG(2) << "Elapsed time: " << elapsed << "us";
 
   // Computation output should always be a tuple.
   if (VLOG_IS_ON(2)) {
-    VLOG(2) << "Result tuple shape: " << output->shape().DebugString();
+    VLOG(2) << "Result tuple shape: " << output->on_host_shape().DebugString();
   }
   CHECK_EQ(ctx->num_outputs(), kernel->outputs.size());
 
@@ -387,8 +376,6 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
     OP_REQUIRES(ctx,
                 write.input_index >= 0 && write.input_index < ctx->num_inputs(),
                 errors::Internal("Invalid input index for variable write."));
-    TensorShape write_shape;
-    OP_REQUIRES_OK(ctx, XLAShapeToTensorShape(write.shape, &write_shape));
 
     gpu::DeviceMemoryBase buffer = output->buffer({output_num});
 
@@ -410,7 +397,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
 
     // Looks up the owning Tensor by buffer address.
     OP_REQUIRES_OK(
-        ctx, xla_allocator.MakeTensorFromBuffer(buffer, write.type, write_shape,
+        ctx, xla_allocator.MakeTensorFromBuffer(buffer, write.type, write.shape,
                                                 variable->tensor()));
     ++output_num;
   }
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 74c9791f5eaf1fbc43b152520df496a3b552af18..a0211acbbe9eec77d30c7d14293650de8826f41c 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -41,6 +41,7 @@ limitations under the License.
 namespace tensorflow {
 
 const char* const kXlaClusterAttr = "_XlaCluster";
+const char* const kXlaOutsideCompilationAttr = "_XlaOutsideCompilation";
 
 namespace {
 
@@ -172,10 +173,15 @@ bool HasResourceInputOrOutput(const Node& node) {
                    DT_RESOURCE) != node.output_types().end();
 }
 
+struct NodeCompare {
+  bool operator()(const Node* a, const Node* b) { return a->id() < b->id(); }
+};
+using OrderedNodeSet = std::set<Node*, NodeCompare>;
+
 Status FindCompilationCandidates(
     const Graph& graph, FunctionLibraryDefinition* flib_def, Env* env,
     const std::function<bool(const Node*, const DeviceType&)>& is_compilable_fn,
-    std::unordered_set<Node*>* candidates) {
+    OrderedNodeSet* candidates) {
   OptimizerOptions opts;
   std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(
       new ProcessFunctionLibraryRuntime(nullptr, env, TF_GRAPH_DEF_VERSION,
@@ -184,6 +190,9 @@ Status FindCompilationCandidates(
       pflr->GetFLR(ProcessFunctionLibraryRuntime::kDefaultFLRDevice);
 
   for (Node* node : graph.op_nodes()) {
+    VLOG(2) << "FindCompilationCandidates(): Processing "
+            << node->DebugString();
+
     DeviceType device_type("");
     TF_RETURN_IF_ERROR(
         DeviceTypeOfDevice(node->assigned_device_name(), &device_type));
@@ -210,6 +219,20 @@ Status FindCompilationCandidates(
         !IsCompilableWhile(*node, jit_device_type, 0, lib_runtime)) {
       continue;
     }
+    // _Arg nodes in a top-level function represent feeds.
+    // Do not compile them.
+    if (node->type_string() == "_Arg") {
+      VLOG(2) << "Skipping jit compilation for '_Arg'-typed node "
+              << node->DebugString();
+      continue;
+    }
+    // _Retval nodes in a top-level function represent fetches.
+    // Do not compile them.
+    if (node->type_string() == "_Retval") {
+      VLOG(2) << "Compilation rejected node: return value " << node->name()
+              << ": " << node->type_string();
+      continue;
+    }
     candidates->insert(node);
   }
   return Status::OK();
@@ -291,6 +314,7 @@ Status MarkForCompilationPass::Run(
         static_cast<OptimizerOptions::GlobalJitLevel>(flags->tf_xla_auto_jit);
   }
   bool cpu_global_jit = flags->tf_xla_cpu_global_jit;
+  VLOG(1) << "flags->tf_xla_cpu_global_jit = " << flags->tf_xla_cpu_global_jit;
   const FunctionLibraryDefinition* fld = options.flib_def;
 
   auto is_compilable = [global_jit_level, cpu_global_jit, fld](
@@ -347,7 +371,7 @@ Status MarkForCompilationPass::RunImpl(
 
   Graph* graph = options.graph->get();
 
-  std::unordered_set<Node*> compilation_candidates;
+  OrderedNodeSet compilation_candidates;
   TF_RETURN_IF_ERROR(FindCompilationCandidates(
       *graph, options.flib_def,
       (options.session_options != nullptr) ? options.session_options->env
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.h b/tensorflow/compiler/jit/mark_for_compilation_pass.h
index f91695800f585f37b72173d5e582c38b1154b69b..e9acbfb19e42cb43cb0b986c438a569de29b2ebc 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.h
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.h
@@ -28,6 +28,10 @@ namespace tensorflow {
 // encapsulate subgraphs pass.
 extern const char* const kXlaClusterAttr;
 
+// The attribute that marks nodes in a cluster to be placed outside the xla
+// compilation by the encapsulate subgraphs pass.
+extern const char* const kXlaOutsideCompilationAttr;
+
 // Pass that marks a subset of operators in the graph with attribute
 // _XlaCluster so they are compiled by the EncapsulateSubgraphsPass.
 class MarkForCompilationPass : public GraphOptimizationPass {
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index b3d258aea177fbefa4bae51d8156da2ff86c9032..1a8858cccef623185709ab5dc2187a313dd130f7 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/graph_def_builder_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -80,7 +81,7 @@ TEST(XlaCompilationTest, Chains) {
         ops::UnaryOp("UncompilableUnary", c, builder.opts().WithName("D"));
     Node* e = ops::UnaryOp("Relu", d, builder.opts().WithName("E"));
     ops::UnaryOp("Relu", e, builder.opts().WithName("F"));
-    TF_EXPECT_OK(builder.ToGraph(graph.get()));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilation(&graph));
@@ -105,7 +106,7 @@ TEST(XlaCompilationTest, UncompilableCycles) {
     Node* b =
         ops::UnaryOp("UncompilableUnary", a, builder.opts().WithName("B"));
     ops::BinaryOp("MatMul", a, b, builder.opts().WithName("C"));
-    TF_EXPECT_OK(builder.ToGraph(graph.get()));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilation(&graph));
@@ -125,7 +126,7 @@ TEST(XlaCompilationTest, CompilableCycles) {
                                          .WithAttr("value", Tensor()));
     Node* b = ops::UnaryOp("Relu", a, builder.opts().WithName("B"));
     ops::BinaryOp("MatMul", a, b, builder.opts().WithName("C"));
-    TF_EXPECT_OK(builder.ToGraph(graph.get()));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilation(&graph));
@@ -148,7 +149,7 @@ TEST(XlaCompilationTest, UnsupportedTypes) {
                      .WithAttr("value", Tensor(DT_COMPLEX128, TensorShape())));
     Node* b = ops::UnaryOp("Neg", a, builder.opts().WithName("B"));
     ops::BinaryOp("MatMul", a, b, builder.opts().WithName("C"));
-    TF_EXPECT_OK(builder.ToGraph(graph.get()));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilation(&graph));
@@ -177,7 +178,7 @@ TEST(XlaCompilationTest, ConcatWithConstArg) {
     concat_builder.Input(dim).Input({a, a}).Attr("N", 2);
     builder.opts().FinalizeBuilder(&concat_builder);
 
-    TF_EXPECT_OK(builder.ToGraph(graph.get()));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilation(&graph));
@@ -212,7 +213,7 @@ TEST(XlaCompilationTest, FunctionCalls) {
     Node* c = ops::UnaryOp("Relu", b, builder.opts().WithName("C"));
     ops::UnaryOp("UncompilableFn", c, builder.opts().WithName("D"));
     ops::BinaryOp("NoInlineFn", c, c, builder.opts().WithName("E"));
-    TF_EXPECT_OK(builder.ToGraph(graph.get()));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilation(&graph, &flib_def));
@@ -244,7 +245,7 @@ TEST(XlaCompilationTest, MetadataOpsDontStartClusters) {
     Node* c = ops::UnaryOp("Rank", b, builder.opts().WithName("C"));
     Node* d = ops::UnaryOp("Size", c, builder.opts().WithName("D"));
     ops::UnaryOp("Shape", d, builder.opts().WithName("E"));
-    TF_EXPECT_OK(builder.ToGraph(graph.get()));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
   TF_ASSERT_OK(MarkForCompilation(&graph));
   auto clusters = GetClusters(*graph);
@@ -330,7 +331,7 @@ TEST(XlaCompilationTest, SymbolicGradients) {
     d_builder.Input({c, c});
     builder.opts().FinalizeBuilder(&d_builder);
 
-    TF_EXPECT_OK(builder.ToGraph(graph.get()));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilation(&graph));
@@ -382,7 +383,7 @@ TEST(XlaCompilationTest, CyclesWithAllDifferentScopes) {
     ops::BinaryOp(
         "MatMul", a, b,
         builder.opts().WithName("C").WithAttr(kXlaScopeAttr, "ScopeC"));
-    TF_CHECK_OK(builder.ToGraph(graph.get()));
+    TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilation(&graph));
@@ -413,7 +414,7 @@ TEST(XlaCompilationTest, CyclesWithSplittingScopes) {
     ops::BinaryOp(
         "Add", b, c,
         builder.opts().WithName("D").WithAttr(kXlaScopeAttr, "Scope2"));
-    TF_CHECK_OK(builder.ToGraph(graph.get()));
+    TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilation(&graph));
@@ -443,7 +444,7 @@ TEST(XlaCompilationTest, CyclesWithDifferentScopesAndBridge) {
         "Relu", a,
         builder.opts().WithName("B").WithAttr(kXlaScopeAttr, "ScopeB"));
     ops::BinaryOp("MatMul", a, b, builder.opts().WithName("C"));
-    TF_CHECK_OK(builder.ToGraph(graph.get()));
+    TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilation(&graph));
@@ -484,7 +485,7 @@ TEST(XlaCompilationTest, Resources) {
     Node* c = ops::UnaryOp("ResourceOutput", b, builder.opts().WithName("C"));
     Node* d = ops::UnaryOp("ResourceInput", c, builder.opts().WithName("D"));
     ops::UnaryOp("Relu", d, builder.opts().WithName("E"));
-    TF_EXPECT_OK(builder.ToGraph(graph.get()));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
   TF_ASSERT_OK(MarkForCompilation(&graph));
   auto clusters = GetClusters(*graph);
@@ -525,5 +526,32 @@ TEST(XlaCompilationTest, IllegalCycle_UsefulErrorMessage) {
                             "+-- c\n"));
 }
 
+TEST(XlaCompilationTest, Retval) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  GraphDef graphdef;
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* a = ops::SourceOp("Const", builder.opts()
+                                         .WithName("A")
+                                         .WithAttr("dtype", DT_FLOAT)
+                                         .WithAttr("value", Tensor()));
+    Node* b = ops::UnaryOp("Relu", a, builder.opts().WithName("B"));
+    ops::UnaryOp("_Retval", b,
+                 builder.opts()
+                     .WithName("R")
+                     .WithAttr("T", DT_FLOAT)
+                     .WithAttr("index", 0));
+
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
+  }
+
+  TF_ASSERT_OK(MarkForCompilation(&graph));
+  auto clusters = GetClusters(*graph);
+
+  EXPECT_EQ(2, clusters.size());
+  EXPECT_TRUE(clusters.find("R") == clusters.cend());
+  EXPECT_EQ(clusters["A"], clusters["B"]);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index bc2eccd2779b9ff68ae2121f7bc53d6f74aec3e3..6d854a920eb0b4c01b09024ceaef5035e847d392 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -148,8 +148,7 @@ Status BuildArguments(int num_constant_args,
     XlaCompiler::Argument& arg = (*args)[input_num];
     arg.kind = XlaCompiler::Argument::kConstant;
     arg.type = input.dtype();
-    TF_RETURN_IF_ERROR(
-        TensorShapeToXLAShape(input.dtype(), input.shape(), &arg.shape));
+    arg.shape = input.shape();
     arg.constant_value = input;
     ++input_num;
   }
@@ -170,8 +169,7 @@ Status BuildArguments(int num_constant_args,
       arg.constant_value = input;
     }
     arg.type = input.dtype();
-    TF_RETURN_IF_ERROR(
-        TensorShapeToXLAShape(input.dtype(), input.shape(), &arg.shape));
+    arg.shape = input.shape();
     ++input_num;
   }
 
@@ -189,8 +187,7 @@ Status BuildArguments(int num_constant_args,
     if (variable_args[variable_id].present) {
       const Tensor& value = variable_args[variable_id].value;
       arg.type = value.dtype();
-      TF_RETURN_IF_ERROR(
-          TensorShapeToXLAShape(value.dtype(), value.shape(), &arg.shape));
+      arg.shape = value.shape();
       arg.initialized = true;
     } else {
       // The values of uninitialized variables are not passed as inputs, since
@@ -199,7 +196,7 @@ Status BuildArguments(int num_constant_args,
       // uninitialized variables.
       arg.initialized = false;
       arg.type = DT_INVALID;
-      arg.shape = xla::Shape();
+      arg.shape = TensorShape();
     }
     ++input_num;
   }
@@ -214,20 +211,16 @@ Status XlaCompilationCache::BuildExecutable(
     const XlaCompiler::CompilationResult& result,
     std::unique_ptr<xla::LocalExecutable>* executable) {
   VLOG(2) << "Compiling to local executable";
-  xla::Shape opaque_shape = xla::ShapeUtil::MakeOpaqueShape();
 
   std::vector<const xla::Shape*> argument_layouts(
       result.xla_input_shapes.size());
   for (int i = 0; i < result.xla_input_shapes.size(); ++i) {
     argument_layouts[i] = &result.xla_input_shapes[i];
   }
-  if (result.requires_runtime_context) {
-    // The final arg is the XlaLocalRuntimeContext*.
-    argument_layouts.push_back(&opaque_shape);
-  }
   xla::ExecutableBuildOptions build_options;
   build_options.set_device_ordinal(client_->default_device_ordinal());
   build_options.set_result_layout(result.xla_output_shape);
+  build_options.set_device_allocator(options.device_allocator);
 
   auto compile_result =
       client_->Compile(*result.computation, argument_layouts, build_options);
@@ -243,7 +236,8 @@ Status XlaCompilationCache::Compile(
     int num_constant_args, const std::vector<OptionalTensor>& variable_args,
     OpKernelContext* ctx,
     const XlaCompiler::CompilationResult** compilation_result,
-    xla::LocalExecutable** executable) {
+    xla::LocalExecutable** executable,
+    const XlaCompiler::CompileOptions* compile_options) {
   VLOG(1) << "XlaCompilationCache::Compile " << DebugString();
 
   if (VLOG_IS_ON(2)) {
@@ -302,9 +296,9 @@ Status XlaCompilationCache::Compile(
 
     XlaCompiler compiler(options);
     entry->compiled = true;
-    entry->compilation_status =
-        compiler.CompileFunction(XlaCompiler::CompileOptions(), function, args,
-                                 &entry->compilation_result);
+    entry->compilation_status = compiler.CompileFunction(
+        compile_options ? *compile_options : XlaCompiler::CompileOptions(),
+        function, args, &entry->compilation_result);
   }
   *compilation_result = &entry->compilation_result;
   if (entry->compilation_status.ok() && executable) {
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h
index c3a8f68a157a2d34d4a6716c9951b2b698aead79..0858020716fcf4763e42dc0699ad22cfda756942 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.h
+++ b/tensorflow/compiler/jit/xla_compilation_cache.h
@@ -66,7 +66,8 @@ class XlaCompilationCache : public ResourceBase {
                  const std::vector<OptionalTensor>& variable_args,
                  OpKernelContext* ctx,
                  const XlaCompiler::CompilationResult** compilation_result,
-                 xla::LocalExecutable** executable);
+                 xla::LocalExecutable** executable,
+                 const XlaCompiler::CompileOptions* compile_options);
 
   xla::LocalClient* client() const { return client_; }
   const DeviceType& device_type() const { return device_type_; }
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index fed2c92d763c33aad3c5b3f07c1f33364c797793..c936222f32056e92efced82d5adb3a96c8041a17 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -71,12 +71,14 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
     void* dst_ptr = DMAHelper::base(device_tensor);
     se::DeviceMemoryBase dev_dst_ptr(dst_ptr, total_bytes);
 
-    Status status = Status::OK();
+    Status status;
     stream_->ThenMemcpy(&dev_dst_ptr, src_ptr, total_bytes);
     // TODO(hpucha): Make this asynchronous.
-    if (!stream_->BlockHostUntilDone()) {
+    Status block_status = stream_->BlockHostUntilDone();
+    if (!block_status.ok()) {
       status = xla::InternalError(
-          "Failed to complete data transfer on stream %p", stream_);
+          "Failed to complete data transfer on stream %p: %s", stream_,
+          block_status.error_message().c_str());
     }
 
     done(status);
@@ -105,12 +107,14 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
     se::DeviceMemoryBase dev_src_ptr(src_ptr, total_bytes);
     void* dst_ptr = DMAHelper::base(cpu_tensor);
 
-    Status status = Status::OK();
+    Status status;
     stream_->ThenMemcpy(dst_ptr, dev_src_ptr, total_bytes);
     // TODO(hpucha): Make this asynchronous.
-    if (!stream_->BlockHostUntilDone()) {
+    Status block_status = stream_->BlockHostUntilDone();
+    if (!block_status.ok()) {
       status = xla::InternalError(
-          "Failed to complete data transfer on stream %p", stream_);
+          "Failed to complete data transfer on stream %p: %s", stream_,
+          block_status.error_message().c_str());
     }
 
     done(status);
diff --git a/tensorflow/compiler/plugin/BUILD b/tensorflow/compiler/plugin/BUILD
index c1edf2448c54ffddd7b70dcdfb1609080ca81b65..da4bc44c7a75c9f8faf16c537a17a1f2d16d5d61 100644
--- a/tensorflow/compiler/plugin/BUILD
+++ b/tensorflow/compiler/plugin/BUILD
@@ -41,6 +41,15 @@ cc_library(
     ],
 )
 
+# This target is added purely for the purpose of ensuring that `:xla_device` is
+# always publicly visible to external XLA backend/plugin developers.
+cc_library(
+    name = "plugin_device",
+    deps = [
+        "//tensorflow/compiler/jit:xla_device",
+    ],
+)
+
 #-----------------------------------------------------------------------------
 
 filegroup(
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 6cad2b0824d86a9549cb77518448a7e4eb781bef..782bf82d4149968d5e5fbfb93bbd4ff1dcd75494 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -144,6 +144,21 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "matrix_triangular_solve_op_test",
+    size = "small",
+    srcs = ["matrix_triangular_solve_op_test.py"],
+    tags = ["optonly"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training",
+    ],
+)
+
 tf_xla_py_test(
     name = "clustering_test",
     size = "small",
@@ -240,6 +255,35 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "extract_image_patches_op_test",
+    size = "small",
+    srcs = ["extract_image_patches_op_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+tf_xla_py_test(
+    name = "fft_test",
+    size = "medium",
+    srcs = ["fft_test.py"],
+    shard_count = 3,
+    tags = ["optonly"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/contrib/signal:signal_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:extra_py_tests_deps",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:spectral_ops",
+    ],
+)
+
 tf_xla_py_test(
     name = "slice_ops_test",
     size = "small",
@@ -279,6 +323,22 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "image_ops_test",
+    size = "small",
+    srcs = ["image_ops_test.py"],
+    tags = [
+        "optonly",  # Times out frequently in fastbuild mode.
+    ],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:image_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "lrn_ops_test",
     size = "medium",
@@ -293,6 +353,19 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "matrix_band_part_test",
+    size = "medium",
+    srcs = ["matrix_band_part_test.py"],
+    tags = ["optonly"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "momentum_test",
     size = "small",
@@ -367,7 +440,9 @@ tf_xla_py_test(
     size = "small",
     srcs = ["random_ops_test.py"],
     # TODO(b/31361304): enable RNG ops on GPU when parallelized.
-    disabled_backends = ["gpu"],
+    disabled_backends = [
+        "gpu",
+    ],
     deps = [
         ":xla_test",
         "//tensorflow/python:framework_for_generated_wrappers",
@@ -402,6 +477,19 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "reverse_sequence_op_test",
+    size = "medium",
+    srcs = ["reverse_sequence_op_test.py"],
+    tags = ["optonly"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "rmsprop_test",
     size = "small",
@@ -416,6 +504,20 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "scan_ops_test",
+    size = "small",
+    srcs = ["scan_ops_test.py"],
+    tags = ["optonly"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "segment_reduction_ops_test",
     size = "medium",
@@ -538,6 +640,7 @@ tf_xla_py_test(
     name = "variable_ops_test",
     size = "small",
     srcs = ["variable_ops_test.py"],
+    tags = ["optonly"],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -564,6 +667,31 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "gather_nd_op_test",
+    size = "medium",
+    srcs = ["gather_nd_op_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+tf_xla_py_test(
+    name = "scatter_nd_op_test",
+    size = "medium",
+    srcs = ["scatter_nd_op_test.py"],
+    tags = ["optonly"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "xla_device_test",
     size = "small",
@@ -688,6 +816,17 @@ tf_library(
     tfcompile_flags = ["--xla_cpu_multi_thread_eigen=false"],
 )
 
+tf_xla_py_test(
+    name = "fake_quant_ops_test",
+    size = "medium",
+    srcs = ["fake_quant_ops_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 
 filegroup(
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index 654dc15e86b21c7742d49281d53c1a75e6a45d3b..30a6d3a74d64f90ad33062df6d1e16e3a575bd63 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -43,7 +43,7 @@ class BinaryOpsTest(XLATestCase):
         output = op(pa, pb)
       result = session.run(output, {pa: a, pb: b})
       if equality_test is None:
-        equality_test = self.assertAllClose
+        equality_test = self.assertAllCloseAccordingToType
       equality_test(result, expected, rtol=1e-3)
 
   def _testSymmetricBinary(self, op, a, b, expected, equality_test=None):
@@ -54,14 +54,20 @@ class BinaryOpsTest(XLATestCase):
     """Tests closeness of two lists of floats."""
     self.assertEqual(len(result), len(expected))
     for i in range(len(result)):
-      self.assertAllClose(result[i], expected[i], rtol)
+      self.assertAllCloseAccordingToType(result[i], expected[i], rtol)
 
   def testFloatOps(self):
     for dtype in self.float_types:
+      if dtype == dtypes.bfloat16.as_numpy_dtype:
+        a = -1.01
+        b = 4.1
+      else:
+        a = -1.001
+        b = 4.01
       self._testBinary(
           lambda x, y: math_ops.approximate_equal(x, y, tolerance=0.0001),
-          np.array([[[[-1, 2.00009999], [-3, 4.01]]]], dtype=dtype),
-          np.array([[[[-1.001, 2], [-3.00009, 4]]]], dtype=dtype),
+          np.array([[[[-1, 2.00009999], [-3, b]]]], dtype=dtype),
+          np.array([[[[a, 2], [-3.00009, 4]]]], dtype=dtype),
           expected=np.array([[[[False, True], [True, False]]]], dtype=dtype))
 
       self._testBinary(
@@ -94,14 +100,12 @@ class BinaryOpsTest(XLATestCase):
           dtype(4),
           expected=np.array([[16], [81]], dtype=dtype))
 
-      atan2_supported = self.device == "XLA_GPU"
-      if atan2_supported:
-        self._testBinary(
-            math_ops.atan2,
-            np.array([0, np.sqrt(2), 1, np.sqrt(2), 0], dtype),
-            np.array([1, np.sqrt(2), 0, -np.sqrt(2), -1], dtype),
-            expected=np.array(
-                [0, np.pi / 4, np.pi / 2, np.pi * 3 / 4, np.pi], dtype=dtype))
+      self._testBinary(
+          math_ops.atan2,
+          np.array([0, np.sqrt(2), 1, np.sqrt(2), 0], dtype),
+          np.array([1, np.sqrt(2), 0, -np.sqrt(2), -1], dtype),
+          expected=np.array(
+              [0, np.pi / 4, np.pi / 2, np.pi * 3 / 4, np.pi], dtype=dtype))
 
       self._testBinary(
           gen_math_ops._reciprocal_grad,
@@ -388,30 +392,28 @@ class BinaryOpsTest(XLATestCase):
               ],
               dtype=dtype))
 
-      atan2_supported = self.device == "XLA_GPU"
-      if atan2_supported:
-        self._testBinary(
-            math_ops.pow,
-            dtype(3 + 2j),
-            dtype(4 - 5j),
-            expected=np.power(dtype(3 + 2j), dtype(4 - 5j)))
-        self._testBinary(  # empty rhs
-            math_ops.pow,
-            np.array([1 + 2j, 2 - 3j], dtype=dtype),
-            np.zeros(shape=[0, 2], dtype=dtype),
-            expected=np.zeros(shape=[0, 2], dtype=dtype))
-        self._testBinary(  # to zero power
-            math_ops.pow,
-            np.array([1 + 2j, 2 - 3j], dtype=dtype),
-            np.zeros(shape=[1, 2], dtype=dtype),
-            expected=np.ones(shape=[1, 2], dtype=dtype))
-        lhs = np.array([1 - 2j, 4 + 3j, 2 - 3j, 3, 2j, 1, 4], dtype=dtype)
-        rhs = np.array([2, 3j, 3 + 4j, 2 + 3j, 3 - 2j, 2, 3 + 3j], dtype=dtype)
-        scalar = dtype(2 + 2j)
-        self._testBinary(math_ops.pow, lhs, rhs, expected=np.power(lhs, rhs))
-        self._testBinary(
-            math_ops.pow, scalar, rhs, expected=np.power(scalar, rhs))
-        self._testBinary(math_ops.pow, lhs, scalar, np.power(lhs, scalar))
+      self._testBinary(
+          math_ops.pow,
+          dtype(3 + 2j),
+          dtype(4 - 5j),
+          expected=np.power(dtype(3 + 2j), dtype(4 - 5j)))
+      self._testBinary(  # empty rhs
+          math_ops.pow,
+          np.array([1 + 2j, 2 - 3j], dtype=dtype),
+          np.zeros(shape=[0, 2], dtype=dtype),
+          expected=np.zeros(shape=[0, 2], dtype=dtype))
+      self._testBinary(  # to zero power
+          math_ops.pow,
+          np.array([1 + 2j, 2 - 3j], dtype=dtype),
+          np.zeros(shape=[1, 2], dtype=dtype),
+          expected=np.ones(shape=[1, 2], dtype=dtype))
+      lhs = np.array([1 - 2j, 4 + 3j, 2 - 3j, 3, 2j, 1, 4], dtype=dtype)
+      rhs = np.array([2, 3j, 3 + 4j, 2 + 3j, 3 - 2j, 2, 3 + 3j], dtype=dtype)
+      scalar = dtype(2 + 2j)
+      self._testBinary(math_ops.pow, lhs, rhs, expected=np.power(lhs, rhs))
+      self._testBinary(
+          math_ops.pow, scalar, rhs, expected=np.power(scalar, rhs))
+      self._testBinary(math_ops.pow, lhs, scalar, np.power(lhs, scalar))
 
       lhs = np.array([4 + 2j, -3 - 1j, 2j, 1], dtype=dtype)
       rhs = np.array([5, -6j, 7 - 3j, -8j], dtype=dtype)
@@ -421,9 +423,8 @@ class BinaryOpsTest(XLATestCase):
       self._testBinary(
           gen_math_ops._sigmoid_grad, lhs, rhs, expected=rhs * lhs * (1 - lhs))
 
-      if atan2_supported:
-        self._testBinary(
-            gen_math_ops._rsqrt_grad, lhs, rhs, expected=lhs**3 * rhs / -2)
+      self._testBinary(
+          gen_math_ops._rsqrt_grad, lhs, rhs, expected=lhs**3 * rhs / -2)
 
       self._testBinary(
           gen_math_ops._sqrt_grad, lhs, rhs, expected=rhs / (2 * lhs))
@@ -547,7 +548,7 @@ class BinaryOpsTest(XLATestCase):
       self._testDivision(dtype)
 
   def testFloatDivision(self):
-    for dtype in self.float_types + self.complex_types:
+    for dtype in self.float_types | self.complex_types:
       self._testDivision(dtype)
 
   def _testRemainder(self, dtype):
@@ -773,15 +774,15 @@ class BinaryOpsTest(XLATestCase):
   def DISABLED_testSparseMatMul(self):
     # Binary wrappers for sparse_matmul with different hints
     def SparseMatmulWrapperTF(a, b):
-      return tf.sparse_matmul(a, b, a_is_sparse=True)
+      return math_ops.sparse_matmul(a, b, a_is_sparse=True)
 
     def SparseMatmulWrapperFT(a, b):
-      return tf.sparse_matmul(a, b, b_is_sparse=True)
+      return math_ops.sparse_matmul(a, b, b_is_sparse=True)
 
     def SparseMatmulWrapperTT(a, b):
-      return tf.sparse_matmul(a, b, a_is_sparse=True, b_is_sparse=True)
+      return math_ops.sparse_matmul(a, b, a_is_sparse=True, b_is_sparse=True)
 
-    self._testMatMul(tf.sparse_matmul)
+    self._testMatMul(math_ops.sparse_matmul)
     self._testMatMul(SparseMatmulWrapperTF)
     self._testMatMul(SparseMatmulWrapperFT)
     self._testMatMul(SparseMatmulWrapperTT)
@@ -1180,6 +1181,50 @@ class BinaryOpsTest(XLATestCase):
                        np.array([4, 5, 6], dtype=np.int32),
                        expected=None)
 
+  def testMatrixSetDiag(self):
+    for dtype in self.numeric_types:
+      # Square
+      self._testBinary(
+          array_ops.matrix_set_diag,
+          np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0]],
+                   dtype=dtype),
+          np.array([1.0, 2.0, 3.0], dtype=dtype),
+          expected=np.array([[1.0, 1.0, 0.0], [1.0, 2.0, 1.0], [1.0, 1.0, 3.0]],
+                            dtype=dtype))
+
+      self._testBinary(
+          array_ops.matrix_set_diag,
+          np.array([[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0], [1.0, 0.0, 3.0]],
+                    [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0], [2.0, 0.0, 6.0]]],
+                   dtype=dtype),
+          np.array([[-1.0, 0.0, -3.0], [-4.0, -5.0, -6.0]], dtype=dtype),
+          expected=np.array(
+              [[[-1.0, 0.0, 3.0], [0.0, 0.0, 0.0], [1.0, 0.0, -3.0]],
+               [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0], [2.0, 0.0, -6.0]]],
+              dtype=dtype))
+
+      # Rectangular
+      self._testBinary(
+          array_ops.matrix_set_diag,
+          np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0]], dtype=dtype),
+          np.array([3.0, 4.0], dtype=dtype),
+          expected=np.array([[3.0, 1.0, 0.0], [1.0, 4.0, 1.0]], dtype=dtype))
+
+      self._testBinary(
+          array_ops.matrix_set_diag,
+          np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0]], dtype=dtype),
+          np.array([3.0, 4.0], dtype=dtype),
+          expected=np.array([[3.0, 1.0], [1.0, 4.0], [1.0, 1.0]], dtype=dtype))
+
+      self._testBinary(
+          array_ops.matrix_set_diag,
+          np.array([[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0]],
+                    [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0]]], dtype=dtype),
+          np.array([[-1.0, -2.0], [-4.0, -5.0]],
+                   dtype=dtype),
+          expected=np.array([[[-1.0, 0.0, 3.0], [0.0, -2.0, 0.0]],
+                             [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0]]],
+                            dtype=dtype))
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/compiler/tests/categorical_op_test.py b/tensorflow/compiler/tests/categorical_op_test.py
index 5e06f9a72401935b9681c35a164b51f50a8538ae..035cdea1786d39f3d21bb63be5c8ccffe1608bdf 100644
--- a/tensorflow/compiler/tests/categorical_op_test.py
+++ b/tensorflow/compiler/tests/categorical_op_test.py
@@ -35,6 +35,9 @@ from tensorflow.python.platform import googletest
 class CategoricalTest(XLATestCase):
   """Test cases for random-number generating operators."""
 
+  def output_dtypes(self):
+    return set(self.int_types).intersection([np.int32, np.int64])
+
   def _chi2(self, expected, actual):
     """Returns Chi2 GOF statistic."""
     actual = np.asarray(actual)
@@ -55,7 +58,8 @@ class CategoricalTest(XLATestCase):
     """
     with self.test_session() as sess, self.test_scope():
       random_seed.set_random_seed(1618)
-      op = random_ops.multinomial(logits, num_samples)
+      op = random_ops.multinomial(logits, num_samples,
+                                  output_dtype=dtypes.int32)
       d = sess.run(op)
 
     batch_size, num_classes = logits.shape
@@ -73,11 +77,11 @@ class CategoricalTest(XLATestCase):
 
     return freqs_mat
 
-  def _testRngIsNotConstant(self, rng, dtype):
+  def _testRngIsNotConstant(self, rng, dtype, output_dtype):
     # Tests that 'rng' does not always return the same value.
     with self.test_session() as sess:
       with self.test_scope():
-        x = rng(dtype)
+        x = rng(dtype, output_dtype)
 
       # The random-number generator, if working correctly, should produce the
       # same output multiple times with low probability.
@@ -92,21 +96,25 @@ class CategoricalTest(XLATestCase):
                       (not np.array_equal(y, w)))
 
   def testCategoricalIsNotConstant(self):
-    def rng(unused_dtype):
-      return random_ops.multinomial([[1., 1., 1.]], 10)
+    def rng(dtype, output_dtype):
+      return random_ops.multinomial(np.array([[1., 1., 1.]], dtype=dtype), 10,
+                                    output_dtype=output_dtype)
 
-    dtype = dtypes.float32
-    self._testRngIsNotConstant(rng, dtype)
+    dtype = np.float32
+    for output_dtype in self.output_dtypes():
+      self._testRngIsNotConstant(rng, dtype, output_dtype)
 
   def testCategoricalIsInRange(self):
-    for dtype in [dtypes.float32, dtypes.float64]:
-      with self.test_session() as sess:
-        with self.test_scope():
-          x = random_ops.multinomial(
-              array_ops.ones(shape=[1, 20], dtype=dtype), 1000)
-        y = sess.run(x)
-        self.assertTrue((y >= 0).sum() == 1000)
-        self.assertTrue((y < 20).sum() == 1000)
+    for dtype in self.float_types:
+      for output_dtype in self.output_dtypes():
+        with self.test_session() as sess:
+          with self.test_scope():
+            x = random_ops.multinomial(
+                array_ops.ones(shape=[1, 20], dtype=dtype), 1000,
+                output_dtype=output_dtype)
+          y = sess.run(x)
+          self.assertTrue((y >= 0).sum() == 1000)
+          self.assertTrue((y < 20).sum() == 1000)
 
   def testSamplingCorrectness(self):
     np.random.seed(1618)  # Make it reproducible.
diff --git a/tensorflow/compiler/tests/conv2d_test.py b/tensorflow/compiler/tests/conv2d_test.py
index 0d617eb37c5d92c87abb0f996b731112257a2b80..62577b70ce96e220d79978f01614b2d9a3647680 100644
--- a/tensorflow/compiler/tests/conv2d_test.py
+++ b/tensorflow/compiler/tests/conv2d_test.py
@@ -34,7 +34,13 @@ from tensorflow.python.platform import googletest
 
 class Conv2DTest(XLATestCase):
 
-  def _VerifyValues(self, input_sizes, filter_sizes, stride, padding, expected):
+  def _VerifyValues(self,
+                    input_sizes=None,
+                    filter_sizes=None,
+                    strides=None,
+                    dilations=None,
+                    padding=None,
+                    expected=None):
     """Tests that tf.nn.conv2d produces the expected value.
 
     Args:
@@ -42,7 +48,8 @@ class Conv2DTest(XLATestCase):
         [batch, input_rows, input_cols, input_depth].
       filter_sizes: Filter tensor dimensions in
         [kernel_rows, kernel_cols, input_depth, output_depth].
-      stride: Stride.
+      strides: Strides.
+      dilations: RHS dilations.
       padding: Padding type.
       expected: Expected output.
     """
@@ -50,73 +57,136 @@ class Conv2DTest(XLATestCase):
     total_size_2 = np.prod(filter_sizes)
     x1 = np.arange(1, total_size_1 + 1, dtype=np.float32).reshape(input_sizes)
     x2 = np.arange(1, total_size_2 + 1, dtype=np.float32).reshape(filter_sizes)
-    strides = [1, stride, stride, 1]
+    strides = [1] + strides + [1]
+    if dilations is None:
+      dilations = [1, 1]
+    dilations = [1] + dilations + [1]
 
     with self.test_session() as sess:
+      t1 = array_ops.placeholder(dtypes.float32, shape=input_sizes)
+      t2 = array_ops.placeholder(dtypes.float32, shape=filter_sizes)
       with self.test_scope():
-        t1 = array_ops.placeholder(dtypes.float32, shape=input_sizes)
-        t2 = array_ops.placeholder(dtypes.float32, shape=filter_sizes)
         out = nn_ops.conv2d(
-            t1, t2, strides=strides, padding=padding, data_format="NHWC")
+            t1,
+            t2,
+            strides=strides,
+            padding=padding,
+            data_format="NHWC",
+            dilations=dilations)
       value = sess.run(out, {t1: x1, t2: x2})
-      self.assertArrayNear(expected, np.ravel(value), 1e-3)
+      self.assertAllClose(expected, value, 1e-3)
 
   def testConv2D1x1Filter(self):
-    expected_output = [
+    expected_output = np.reshape([
         30.0, 36.0, 42.0, 66.0, 81.0, 96.0, 102.0, 126.0, 150.0, 138.0, 171.0,
         204.0, 174.0, 216.0, 258.0, 210.0, 261.0, 312.0
-    ]
+    ], [1, 2, 3, 3])
     self._VerifyValues(
         input_sizes=[1, 2, 3, 3],
         filter_sizes=[1, 1, 3, 3],
-        stride=1,
+        strides=[1, 1],
         padding="VALID",
         expected=expected_output)
 
   def testConv2D2x2Filter(self):
-    expected_output = [2271.0, 2367.0, 2463.0, 2901.0, 3033.0, 3165.0]
+    expected_output = np.reshape(
+        [2271.0, 2367.0, 2463.0, 2901.0, 3033.0, 3165.0], [1, 1, 2, 3])
     self._VerifyValues(
         input_sizes=[1, 2, 3, 3],
         filter_sizes=[2, 2, 3, 3],
-        stride=1,
+        strides=[1, 1],
+        padding="VALID",
+        expected=expected_output)
+
+  def testConv2D2x2Filter2x1Dilation(self):
+    expected_output = np.array([[[[72], [82], [92]], [[112], [122], [132]]]])
+    self._VerifyValues(
+        input_sizes=[1, 4, 4, 1],
+        filter_sizes=[2, 2, 1, 1],
+        strides=[1, 1],
+        dilations=[2, 1],
         padding="VALID",
         expected=expected_output)
 
   def testConv2D1x2Filter(self):
-    expected_output = [
+    expected_output = np.reshape([
         231.0, 252.0, 273.0, 384.0, 423.0, 462.0, 690.0, 765.0, 840.0, 843.0,
         936.0, 1029.0
-    ]
+    ], [1, 2, 2, 3])
     self._VerifyValues(
         input_sizes=[1, 2, 3, 3],
         filter_sizes=[1, 2, 3, 3],
-        stride=1,
+        strides=[1, 1],
         padding="VALID",
         expected=expected_output)
 
   def testConv2D2x2FilterStride2(self):
-    expected_output = [2271.0, 2367.0, 2463.0]
+    expected_output = np.reshape([2271.0, 2367.0, 2463.0], [1, 1, 1, 3])
     self._VerifyValues(
         input_sizes=[1, 2, 3, 3],
         filter_sizes=[2, 2, 3, 3],
-        stride=2,
+        strides=[2, 2],
         padding="VALID",
         expected=expected_output)
 
   def testConv2D2x2FilterStride2Same(self):
-    expected_output = [2271.0, 2367.0, 2463.0, 1230.0, 1305.0, 1380.0]
+    expected_output = np.reshape(
+        [2271.0, 2367.0, 2463.0, 1230.0, 1305.0, 1380.0], [1, 1, 2, 3])
     self._VerifyValues(
         input_sizes=[1, 2, 3, 3],
         filter_sizes=[2, 2, 3, 3],
-        stride=2,
+        strides=[2, 2],
         padding="SAME",
         expected=expected_output)
 
+  def testConv2DEmptyDilation(self):
+    self._VerifyValues(
+        input_sizes=[0, 2, 3, 3],
+        filter_sizes=[1, 1, 3, 3],
+        strides=[1, 1],
+        dilations=[2, 1],
+        padding="VALID",
+        expected=np.zeros([0, 2, 3, 3]))
+
+  def testConv2D2x2FilterDilation(self):
+    self._VerifyValues(
+        input_sizes=[1, 2, 3, 3],
+        filter_sizes=[2, 2, 3, 3],
+        strides=[1, 1],
+        dilations=[1, 2],
+        padding="VALID",
+        expected=np.reshape([2667, 2781, 2895], [1, 1, 1, 3]))
+
+  def testConv2D1x2FilterDilation(self):
+    self._VerifyValues(
+        input_sizes=[1, 2, 3, 3],
+        filter_sizes=[1, 2, 3, 3],
+        strides=[1, 1],
+        dilations=[2, 1],
+        padding="VALID",
+        expected=np.array([[[[231, 252, 273], [384, 423, 462]],
+                            [[690, 765, 840], [843, 936, 1029]]]]))
+
+  def testConv2DKernelSizeMatchesInputSizeDilation(self):
+    self._VerifyValues(
+        input_sizes=[1, 3, 3, 1],
+        filter_sizes=[2, 2, 1, 2],
+        strides=[1, 1],
+        dilations=[2, 2],
+        padding="VALID",
+        expected=np.reshape([108, 128], [1, 1, 1, 2]))
+
 
 class Conv2DBackpropInputTest(XLATestCase):
 
-  def _VerifyValues(self, input_sizes, filter_sizes, out_backprop_sizes, stride,
-                    padding, expected):
+  def _VerifyValues(self,
+                    input_sizes=None,
+                    filter_sizes=None,
+                    out_backprop_sizes=None,
+                    strides=None,
+                    dilations=None,
+                    padding=None,
+                    expected=None):
     """Tests that gen_nn_ops.conv2d_backprop_input produces the expected output.
 
     Args:
@@ -125,7 +195,8 @@ class Conv2DBackpropInputTest(XLATestCase):
       filter_sizes: Filter tensor dimensions in
         [kernel_rows, kernel_cols, input_depth, output_depth].
       out_backprop_sizes: Output gradients tensor dimensions.
-      stride: Stride.
+      strides: Strides.
+      dilations: Dilations.
       padding: Padding type.
       expected: Expected output.
     """
@@ -134,21 +205,25 @@ class Conv2DBackpropInputTest(XLATestCase):
     x1 = np.arange(1, total_size_1 + 1, dtype=np.float32).reshape(filter_sizes)
     x2 = np.arange(
         1, total_size_2 + 1, dtype=np.float32).reshape(out_backprop_sizes)
-    strides = [1, stride, stride, 1]
+    strides = [1] + strides + [1]
+    if dilations is not None:
+      dilations = [1] + dilations + [1]
 
     with self.test_session() as sess:
+      t1 = array_ops.placeholder(dtypes.float32, shape=filter_sizes)
+      t2 = array_ops.placeholder(dtypes.float32, shape=out_backprop_sizes)
       with self.test_scope():
-        t1 = array_ops.placeholder(dtypes.float32, shape=filter_sizes)
-        t2 = array_ops.placeholder(dtypes.float32, shape=out_backprop_sizes)
         out = gen_nn_ops.conv2d_backprop_input(
             input_sizes=input_sizes,
             filter=t1,
             out_backprop=t2,
             strides=strides,
+            dilations=dilations,
             padding=padding,
             data_format="NHWC")
       value = sess.run(out, {t1: x1, t2: x2})
-      self.assertArrayNear(expected, np.ravel(value), 1e-3)
+      self.assertAllEqual(input_sizes, value.shape)
+      self.assertAllClose(expected, np.ravel(value), 1e-3)
 
   def testConv2D1x1Filter(self):
     expected_output = [
@@ -160,7 +235,7 @@ class Conv2DBackpropInputTest(XLATestCase):
         input_sizes=[1, 4, 4, 3],
         filter_sizes=[1, 1, 3, 2],
         out_backprop_sizes=[1, 4, 4, 2],
-        stride=1,
+        strides=[1, 1],
         padding="VALID",
         expected=expected_output)
 
@@ -170,7 +245,7 @@ class Conv2DBackpropInputTest(XLATestCase):
         input_sizes=[1, 1, 5, 1],
         filter_sizes=[1, 2, 1, 1],
         out_backprop_sizes=[1, 1, 2, 1],
-        stride=3,
+        strides=[3, 3],
         padding="VALID",
         expected=expected_output)
 
@@ -180,7 +255,7 @@ class Conv2DBackpropInputTest(XLATestCase):
         input_sizes=[1, 1, 6, 1],
         filter_sizes=[1, 2, 1, 1],
         out_backprop_sizes=[1, 1, 2, 1],
-        stride=3,
+        strides=[3, 3],
         padding="VALID",
         expected=expected_output)
 
@@ -190,7 +265,7 @@ class Conv2DBackpropInputTest(XLATestCase):
         input_sizes=[1, 1, 7, 1],
         filter_sizes=[1, 2, 1, 1],
         out_backprop_sizes=[1, 1, 2, 1],
-        stride=3,
+        strides=[3, 3],
         padding="VALID",
         expected=expected_output)
 
@@ -200,7 +275,7 @@ class Conv2DBackpropInputTest(XLATestCase):
         input_sizes=[1, 2, 3, 1],
         filter_sizes=[2, 2, 1, 1],
         out_backprop_sizes=[1, 2, 3, 1],
-        stride=1,
+        strides=[1, 1],
         padding="SAME",
         expected=expected_output)
 
@@ -213,7 +288,7 @@ class Conv2DBackpropInputTest(XLATestCase):
         input_sizes=[1, 2, 3, 3],
         filter_sizes=[2, 2, 3, 3],
         out_backprop_sizes=[1, 1, 2, 3],
-        stride=1,
+        strides=[1, 1],
         padding="VALID",
         expected=expected_output)
 
@@ -226,7 +301,7 @@ class Conv2DBackpropInputTest(XLATestCase):
         input_sizes=[1, 2, 3, 3],
         filter_sizes=[2, 2, 3, 3],
         out_backprop_sizes=[1, 2, 3, 3],
-        stride=1,
+        strides=[1, 1],
         padding="SAME",
         expected=expected_output)
 
@@ -236,7 +311,7 @@ class Conv2DBackpropInputTest(XLATestCase):
         input_sizes=[1, 3, 3, 1],
         filter_sizes=[1, 2, 1, 1],
         out_backprop_sizes=[1, 3, 2, 1],
-        stride=1,
+        strides=[1, 1],
         padding="VALID",
         expected=expected_output)
 
@@ -246,7 +321,7 @@ class Conv2DBackpropInputTest(XLATestCase):
         input_sizes=[1, 3, 3, 1],
         filter_sizes=[1, 2, 1, 1],
         out_backprop_sizes=[1, 3, 3, 1],
-        stride=1,
+        strides=[1, 1],
         padding="SAME",
         expected=expected_output)
 
@@ -256,7 +331,7 @@ class Conv2DBackpropInputTest(XLATestCase):
         input_sizes=[1, 3, 5, 1],
         filter_sizes=[1, 3, 1, 1],
         out_backprop_sizes=[1, 2, 2, 1],
-        stride=2,
+        strides=[2, 2],
         padding="VALID",
         expected=expected_output)
 
@@ -266,15 +341,76 @@ class Conv2DBackpropInputTest(XLATestCase):
         input_sizes=[1, 2, 3, 1],
         filter_sizes=[2, 2, 1, 1],
         out_backprop_sizes=[1, 1, 2, 1],
-        stride=2,
+        strides=[2, 2],
         padding="SAME",
         expected=expected_output)
 
+  def testConv2D2x2Depth3ValidBackpropInputStride1x1Dilation2x1(self):
+    self._VerifyValues(
+        input_sizes=[1, 3, 6, 1],
+        filter_sizes=[2, 2, 1, 1],
+        out_backprop_sizes=[1, 1, 5, 1],
+        strides=[1, 1],
+        dilations=[2, 1],
+        padding="VALID",
+        expected=[1, 4, 7, 10, 13, 10, 0, 0, 0, 0, 0, 0, 3, 10, 17, 24, 31, 20])
+
+  def testConv2D2x2Depth1ValidBackpropInputDilation1x2(self):
+    self._VerifyValues(
+        input_sizes=[1, 2, 3, 1],
+        filter_sizes=[2, 2, 1, 1],
+        out_backprop_sizes=[1, 1, 1, 1],
+        strides=[1, 1],
+        dilations=[1, 2],
+        padding="VALID",
+        expected=[1, 0, 2, 3, 0, 4])
+
+  def testConv2DEmptyBackpropInputDilation1x2(self):
+    self._VerifyValues(
+        input_sizes=[0, 2, 3, 1],
+        filter_sizes=[2, 2, 1, 1],
+        out_backprop_sizes=[0, 1, 1, 1],
+        strides=[1, 1],
+        dilations=[1, 2],
+        padding="VALID",
+        expected=np.zeros([0]))
+
+  def testConv2D2x2Depth3ValidBackpropInputDilation2x1(self):
+    # The GPU version of this test is not very stable. So adjusting the
+    # error threshold to 1e-4.
+    self._VerifyValues(
+        input_sizes=[1, 3, 2, 3],
+        filter_sizes=[2, 2, 3, 3],
+        out_backprop_sizes=[1, 1, 1, 3],
+        strides=[1, 1],
+        dilations=[2, 1],
+        padding="VALID",
+        expected=[
+            14, 32, 50, 68, 86, 104, 0, 0, 0, 0, 0, 0, 122, 140, 158, 176, 194,
+            212
+        ])
+
+  def testConv2DKernelSizeMatchesInputSizeBackpropInputDilation2x2(self):
+    self._VerifyValues(
+        input_sizes=[1, 3, 3, 1],
+        filter_sizes=[2, 2, 1, 2],
+        out_backprop_sizes=[1, 1, 1, 2],
+        strides=[1, 1],
+        dilations=[2, 2],
+        padding="VALID",
+        expected=[5, 0, 11, 0, 0, 0, 17, 0, 23])
+
 
 class Conv2DBackpropFilterTest(XLATestCase):
 
-  def _VerifyValues(self, input_sizes, filter_sizes, out_backprop_sizes, stride,
-                    padding, expected):
+  def _VerifyValues(self,
+                    input_sizes=None,
+                    filter_sizes=None,
+                    out_backprop_sizes=None,
+                    strides=None,
+                    dilations=None,
+                    padding=None,
+                    expected=None):
     """Tests that gen_nn_ops.conv2d_backprop_filter produces the right output.
 
     Args:
@@ -283,7 +419,8 @@ class Conv2DBackpropFilterTest(XLATestCase):
       filter_sizes: Filter tensor dimensions in
         [kernel_rows, kernel_cols, input_depth, output_depth].
       out_backprop_sizes: Output gradients tensor dimensions.
-      stride: Stride.
+      strides: Stride.
+      dilations: Dilations.
       padding: Padding type.
       expected: Expected output.
     """
@@ -293,22 +430,26 @@ class Conv2DBackpropFilterTest(XLATestCase):
     x1 = np.arange(1, total_size_1 + 1, dtype=np.float32).reshape(input_sizes)
     x2 = np.arange(
         1, total_size_2 + 1, dtype=np.float32).reshape(out_backprop_sizes)
-    strides = [1, stride, stride, 1]
+    strides = [1] + strides + [1]
+    if dilations is not None:
+      dilations = [1] + dilations + [1]
 
     with self.test_session() as sess:
+      t1 = array_ops.placeholder(dtypes.float32, shape=input_sizes)
+      t2 = array_ops.placeholder(dtypes.float32, shape=out_backprop_sizes)
       with self.test_scope():
-        t1 = array_ops.placeholder(dtypes.float32, shape=input_sizes)
-        t2 = array_ops.placeholder(dtypes.float32, shape=out_backprop_sizes)
         tensor = gen_nn_ops.conv2d_backprop_filter(
             input=t1,
             filter_sizes=filter_sizes,
             out_backprop=t2,
             strides=strides,
+            dilations=dilations,
             padding=padding,
             data_format="NHWC")
 
       value = sess.run(tensor, {t1: x1, t2: x2})
-      self.assertArrayNear(expected, np.ravel(value), 1e-3)
+      self.assertAllEqual(filter_sizes, value.shape)
+      self.assertAllClose(expected, np.ravel(value), 1e-3)
 
   def testConv2D1x1Filter(self):
     expected_output = [8056, 8432, 8312, 8704, 8568, 8976]
@@ -316,7 +457,7 @@ class Conv2DBackpropFilterTest(XLATestCase):
         input_sizes=[1, 4, 4, 3],
         filter_sizes=[1, 1, 3, 2],
         out_backprop_sizes=[1, 4, 4, 2],
-        stride=1,
+        strides=[1, 1],
         padding="VALID",
         expected=expected_output)
 
@@ -326,7 +467,7 @@ class Conv2DBackpropFilterTest(XLATestCase):
         input_sizes=[1, 3, 3, 1],
         filter_sizes=[1, 2, 1, 1],
         out_backprop_sizes=[1, 3, 2, 1],
-        stride=1,
+        strides=[1, 1],
         padding="VALID",
         expected=expected_output)
 
@@ -336,7 +477,7 @@ class Conv2DBackpropFilterTest(XLATestCase):
         input_sizes=[1, 2, 3, 1],
         filter_sizes=[2, 2, 1, 1],
         out_backprop_sizes=[1, 1, 2, 1],
-        stride=1,
+        strides=[1, 1],
         padding="VALID",
         expected=expected_output)
 
@@ -350,7 +491,7 @@ class Conv2DBackpropFilterTest(XLATestCase):
         input_sizes=[1, 2, 3, 3],
         filter_sizes=[2, 2, 3, 3],
         out_backprop_sizes=[1, 1, 2, 3],
-        stride=1,
+        strides=[1, 1],
         padding="VALID",
         expected=expected_output)
 
@@ -360,7 +501,7 @@ class Conv2DBackpropFilterTest(XLATestCase):
         input_sizes=[1, 1, 5, 1],
         filter_sizes=[1, 2, 1, 1],
         out_backprop_sizes=[1, 1, 2, 1],
-        stride=3,
+        strides=[3, 3],
         padding="VALID",
         expected=expected_output)
 
@@ -370,7 +511,7 @@ class Conv2DBackpropFilterTest(XLATestCase):
         input_sizes=[1, 1, 6, 1],
         filter_sizes=[1, 2, 1, 1],
         out_backprop_sizes=[1, 1, 2, 1],
-        stride=3,
+        strides=[3, 3],
         padding="VALID",
         expected=expected_output)
 
@@ -380,7 +521,7 @@ class Conv2DBackpropFilterTest(XLATestCase):
         input_sizes=[1, 1, 7, 1],
         filter_sizes=[1, 2, 1, 1],
         out_backprop_sizes=[1, 1, 2, 1],
-        stride=3,
+        strides=[3, 3],
         padding="VALID",
         expected=expected_output)
 
@@ -390,7 +531,7 @@ class Conv2DBackpropFilterTest(XLATestCase):
         input_sizes=[1, 1, 4, 1],
         filter_sizes=[1, 3, 1, 1],
         out_backprop_sizes=[1, 1, 2, 1],
-        stride=1,
+        strides=[1, 1],
         padding="VALID",
         expected=expected_output)
 
@@ -400,7 +541,7 @@ class Conv2DBackpropFilterTest(XLATestCase):
         input_sizes=[1, 1, 4, 1],
         filter_sizes=[1, 3, 1, 1],
         out_backprop_sizes=[1, 1, 4, 1],
-        stride=1,
+        strides=[1, 1],
         padding="SAME",
         expected=expected_output)
 
@@ -410,7 +551,7 @@ class Conv2DBackpropFilterTest(XLATestCase):
         input_sizes=[1, 1, 4, 1],
         filter_sizes=[1, 3, 1, 1],
         out_backprop_sizes=[1, 1, 2, 1],
-        stride=2,
+        strides=[2, 2],
         padding="SAME",
         expected=expected_output)
 
@@ -420,7 +561,7 @@ class Conv2DBackpropFilterTest(XLATestCase):
         input_sizes=[1, 2, 3, 1],
         filter_sizes=[2, 2, 1, 1],
         out_backprop_sizes=[1, 2, 3, 1],
-        stride=1,
+        strides=[1, 1],
         padding="SAME",
         expected=expected_output)
 
@@ -430,7 +571,7 @@ class Conv2DBackpropFilterTest(XLATestCase):
         input_sizes=[1, 3, 5, 1],
         filter_sizes=[1, 3, 1, 1],
         out_backprop_sizes=[1, 2, 2, 1],
-        stride=2,
+        strides=[2, 2],
         padding="VALID",
         expected=expected_output)
 
@@ -440,10 +581,64 @@ class Conv2DBackpropFilterTest(XLATestCase):
         input_sizes=[1, 2, 3, 1],
         filter_sizes=[2, 2, 1, 1],
         out_backprop_sizes=[1, 1, 2, 1],
-        stride=2,
+        strides=[2, 2],
         padding="SAME",
         expected=expected_output)
 
+  def testConv2D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self):
+    self._VerifyValues(
+        input_sizes=[1, 3, 6, 1],
+        filter_sizes=[2, 2, 1, 1],
+        out_backprop_sizes=[1, 1, 5, 1],
+        strides=[1, 1],
+        dilations=[2, 1],
+        padding="VALID",
+        expected=[55, 70, 235, 250])
+
+  def testConv2D2x2Depth1ValidBackpropFilterDilation1x2(self):
+    self._VerifyValues(
+        input_sizes=[1, 2, 3, 1],
+        filter_sizes=[2, 2, 1, 1],
+        out_backprop_sizes=[1, 1, 1, 1],
+        strides=[1, 1],
+        dilations=[1, 2],
+        padding="VALID",
+        expected=[1, 3, 4, 6])
+
+  def testConv2DEmptyBackpropFilterDilation1x2(self):
+    self._VerifyValues(
+        input_sizes=[1, 2, 3, 1],
+        filter_sizes=[2, 2, 1, 0],
+        out_backprop_sizes=[1, 1, 1, 0],
+        strides=[1, 1],
+        dilations=[1, 2],
+        padding="VALID",
+        expected=np.zeros([0]))
+
+  def testConv2D2x2Depth3ValidBackpropFilterDilation2x2(self):
+    self._VerifyValues(
+        input_sizes=[1, 3, 4, 3],
+        filter_sizes=[2, 2, 3, 3],
+        out_backprop_sizes=[1, 1, 2, 3],
+        strides=[1, 1],
+        dilations=[2, 2],
+        padding="VALID",
+        expected=[
+            17, 22, 27, 22, 29, 36, 27, 36, 45, 47, 64, 81, 52, 71, 90, 57, 78,
+            99, 137, 190, 243, 142, 197, 252, 147, 204, 261, 167, 232, 297, 172,
+            239, 306, 177, 246, 315
+        ])
+
+  def testConv2DKernelSizeMatchesInputSizeBackpropFilterDilation2x2(self):
+    self._VerifyValues(
+        input_sizes=[1, 3, 3, 1],
+        filter_sizes=[2, 2, 1, 2],
+        out_backprop_sizes=[1, 1, 1, 2],
+        strides=[1, 1],
+        dilations=[2, 2],
+        padding="VALID",
+        expected=[1, 2, 3, 6, 7, 14, 9, 18])
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/compiler/tests/extract_image_patches_op_test.py b/tensorflow/compiler/tests/extract_image_patches_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0361702e7af778176daed941d64e61198090daf2
--- /dev/null
+++ b/tensorflow/compiler/tests/extract_image_patches_op_test.py
@@ -0,0 +1,134 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for ExtractImagePatches op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class ExtractImagePatches(XLATestCase):
+  """Functional tests for ExtractImagePatches op."""
+
+  def _VerifyValues(self, image, ksizes, strides, rates, padding, patches):
+    """Tests input-output pairs for the ExtractImagePatches op.
+
+    Args:
+      image: Input tensor with shape: [batch, in_rows, in_cols, depth].
+      ksizes: Patch size specified as: [ksize_rows, ksize_cols].
+      strides: Output strides, specified as [stride_rows, stride_cols].
+      rates: Atrous rates, specified as [rate_rows, rate_cols].
+      padding: Padding type.
+      patches: Expected output.
+    """
+    ksizes = [1] + ksizes + [1]
+    strides = [1] + strides + [1]
+    rates = [1] + rates + [1]
+
+    with self.test_session():
+      image_placeholder = array_ops.placeholder(dtypes.float32)
+      with self.test_scope():
+        out_tensor = array_ops.extract_image_patches(
+            image_placeholder,
+            ksizes=ksizes,
+            strides=strides,
+            rates=rates,
+            padding=padding,
+            name="im2col")
+      feed_dict = {image_placeholder: image}
+      self.assertAllClose(patches, out_tensor.eval(feed_dict=feed_dict))
+
+  def testKsize1x1Stride1x1Rate1x1(self):
+    """Verifies that for 1x1 kernel the output equals the input."""
+    # [2, 3, 4, 5]
+    image = np.reshape(range(120), [2, 3, 4, 5])
+    # [2, 3, 4, 5]
+    patches = np.reshape(range(120), [2, 3, 4, 5])
+    for padding in ["VALID", "SAME"]:
+      self._VerifyValues(
+          image,
+          ksizes=[1, 1],
+          strides=[1, 1],
+          rates=[1, 1],
+          padding=padding,
+          patches=patches)
+
+  def testKsize1x1Stride2x3Rate1x1(self):
+    """Test for 1x1 kernel and strides."""
+    # [2, 4, 5, 3]
+    image = np.reshape(range(120), [2, 4, 5, 3])
+    # [2, 2, 2, 3]
+    patches = image[:, ::2, ::3, :]
+    for padding in ["VALID", "SAME"]:
+      self._VerifyValues(
+          image,
+          ksizes=[1, 1],
+          strides=[2, 3],
+          rates=[1, 1],
+          padding=padding,
+          patches=patches)
+
+  def testKsize2x2Stride1x1Rate1x1Valid(self):
+    """Test for 2x2 kernel with VALID padding."""
+    # [1, 2, 2, 1]
+    image = [[[[1], [2]], [[3], [4]]]]
+    # [1, 1, 1, 4]
+    patches = [[[[1, 2, 3, 4]]]]
+    self._VerifyValues(
+        image,
+        ksizes=[2, 2],
+        strides=[1, 1],
+        rates=[1, 1],
+        padding="VALID",
+        patches=patches)
+
+  def testKsize2x2Stride1x1Rate1x1Same(self):
+    """Test for 2x2 kernel with SAME padding."""
+    # [1, 2, 2, 1]
+    image = [[[[1], [2]], [[3], [4]]]]
+    # [1, 2, 2, 4]
+    patches = [[[[1, 2, 3, 4], [2, 0, 4, 0]], [[3, 4, 0, 0], [4, 0, 0, 0]]]]
+    self._VerifyValues(
+        image,
+        ksizes=[2, 2],
+        strides=[1, 1],
+        rates=[1, 1],
+        padding="SAME",
+        patches=patches)
+
+  def testKsize2x2Stride1x1Rate2x2Valid(self):
+    """Test for 2x2 kernel with 2x2 dilation."""
+    # [1, 2, 2, 1]
+    image = np.arange(16).reshape(1, 4, 4, 1).astype(np.float32)
+    # [1, 2, 2, 4]
+    patches = [[[[0, 2, 8, 10], [1, 3, 9, 11]],
+                [[4, 6, 12, 14], [5, 7, 13, 15]]]]
+    self._VerifyValues(
+        image,
+        ksizes=[2, 2],
+        strides=[1, 1],
+        rates=[2, 2],
+        padding="VALID",
+        patches=patches)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/fake_quant_ops_test.py b/tensorflow/compiler/tests/fake_quant_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfe9400ef0f55ca011d4e23ba5d735899ca2e054
--- /dev/null
+++ b/tensorflow/compiler/tests/fake_quant_ops_test.py
@@ -0,0 +1,452 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.platform import googletest
+
+
+class FakeQuantWithMinMaxArgsTest(XLATestCase):
+  """Test cases for FakeQuantWithMinMaxArgs operation."""
+
+  # 8 bits, wide range.
+  def testOp_with8BitsNoScalingNoNudging(self):
+    self._TestOp(0.0, 255.0, 8, False, 0.0, 255.0, 1.0)
+
+  def testOp_with8BitsScalingAndNudgingDown(self):
+    self._TestOp(0.5, 128.0, 8, False, 0.0, 127.5, 0.5)
+
+  def testOp_with8BitsScalingAndNudgingUp(self):
+    self._TestOp(-128.0, -0.5, 8, False, -127.5, 0.0, 0.5)
+
+  def testOp_with8BitsScalingAndNudgingBetween(self):
+    self._TestOp(-0.1, 127.4, 8, False, 0.0, 127.5, 0.5)
+
+  # 8 bits, narrow range.
+  def testOp_with8BitsNarrowRangeNoScalingNoNudging(self):
+    self._TestOp(0.0, 254.0, 8, True, 0.0, 254.0, 1.0)
+
+  def testOp_with8BitsNarrowRangeScalingAndNudgingDown(self):
+    self._TestOp(0.1, 127.1, 8, True, 0.0, 127.0, 0.5)
+
+  def testOp_with8BitsNarrowRangeScalingAndNudgingUp(self):
+    self._TestOp(-127.1, -0.1, 8, True, -127.0, 0.0, 0.5)
+
+  def testOp_with8BitsNarrowRangeScalingAndNudgingBetween(self):
+    self._TestOp(-0.1, 126.9, 8, True, 0.0, 127.0, 0.5)
+
+  # 7 bits, wide range.
+  def testOp_with7BitsNoScalingNoNudging(self):
+    self._TestOp(0.0, 127.0, 7, False, 0.0, 127.0, 1.0)
+
+  def testOp_with7BitsScalingAndNudgingDown(self):
+    self._TestOp(0.5, 64.0, 7, False, 0.0, 63.5, 0.5)
+
+  def testOp_with7BitsScalingAndNudgingUp(self):
+    self._TestOp(-64.0, -0.5, 7, False, -63.5, 0.0, 0.5)
+
+  def testOp_with7BitsScalingAndNudgingBetween(self):
+    self._TestOp(-0.1, 63.4, 7, False, 0.0, 63.5, 0.5)
+
+  # 7 bits, narrow range.
+  def testOp_with7BitsNarrowRangeNoScalingNoNudging(self):
+    self._TestOp(0.0, 126.0, 7, True, 0.0, 126.0, 1.0)
+
+  def testOp_with7BitsNarrowRangeScalingAndNudgingDown(self):
+    self._TestOp(0.1, 63.1, 7, True, 0.0, 63.0, 0.5)
+
+  def testOp_with7BitsNarrowRangeScalingAndNudgingUp(self):
+    self._TestOp(-63.1, -0.1, 7, True, -63.0, 0.0, 0.5)
+
+  def testOp_with7BitsNarrowRangeScalingAndNudgingBetween(self):
+    self._TestOp(-0.1, 62.9, 7, True, 0.0, 63.0, 0.5)
+
+  def _TestOp(self, input_min, input_max, num_bits, narrow_range,
+              expected_nudged_input_min, expected_nudged_input_max,
+              expected_step):
+    inputs = np.array(
+        [
+            expected_nudged_input_min - expected_step,
+            expected_nudged_input_min - 0.01, expected_nudged_input_min,
+            expected_nudged_input_min + 0.01,
+            expected_nudged_input_min + expected_step - 0.01,
+            expected_nudged_input_min + expected_step,
+            expected_nudged_input_min + expected_step + 0.01,
+            expected_nudged_input_max - 0.01, expected_nudged_input_max,
+            expected_nudged_input_max + 0.01,
+            expected_nudged_input_max + expected_step
+        ],
+        dtype=np.float32)
+    expected = np.array(
+        [
+            expected_nudged_input_min, expected_nudged_input_min,
+            expected_nudged_input_min, expected_nudged_input_min,
+            expected_nudged_input_min + expected_step,
+            expected_nudged_input_min + expected_step,
+            expected_nudged_input_min + expected_step,
+            expected_nudged_input_max, expected_nudged_input_max,
+            expected_nudged_input_max, expected_nudged_input_max
+        ],
+        dtype=np.float32)
+
+    with self.test_session() as session:
+      with self.test_scope():
+        input_placeholder = array_ops.placeholder(
+            dtypes.float32, inputs.shape, name="inputs")
+        outputs = array_ops.fake_quant_with_min_max_args(
+            input_placeholder,
+            min=input_min,
+            max=input_max,
+            num_bits=num_bits,
+            narrow_range=narrow_range)
+      result = session.run(outputs, {input_placeholder: inputs})
+      self.assertAllCloseAccordingToType(
+          result, expected, rtol=1e-3, atol=1e-5, bfloat16_rtol=0.03)
+
+
+class FakeQuantWithMinMaxArgsGradientTest(XLATestCase):
+  """Test cases for FakeQuantWithMinMaxArgsGradient operation."""
+
+  # 8 bits, wide range.
+  def testOp_with8BitsNoScalingNoNudging(self):
+    self._TestOp(0.0, 255.0, 8, False, 0.0, 255.0, 1.0)
+
+  def testOp_with8BitsScalingAndNudgingDown(self):
+    self._TestOp(0.5, 128.0, 8, False, 0.0, 127.5, 0.5)
+
+  def testOp_with8BitsScalingAndNudgingUp(self):
+    self._TestOp(-128.0, -0.5, 8, False, -127.5, 0.0, 0.5)
+
+  def testOp_with8BitsScalingAndNudgingBetween(self):
+    self._TestOp(-0.1, 127.4, 8, False, 0.0, 127.5, 0.5)
+
+  # 8 bits, narrow range.
+  def testOp_with8BitsNarrowRangeNoScalingNoNudging(self):
+    self._TestOp(0.0, 254.0, 8, True, 0.0, 254.0, 1.0)
+
+  def testOp_with8BitsNarrowRangeScalingAndNudgingDown(self):
+    self._TestOp(0.1, 127.1, 8, True, 0.0, 127.0, 0.5)
+
+  def testOp_with8BitsNarrowRangeScalingAndNudgingUp(self):
+    self._TestOp(-127.1, -0.1, 8, True, -127.0, 0.0, 0.5)
+
+  def testOp_with8BitsNarrowRangeScalingAndNudgingBetween(self):
+    self._TestOp(-0.1, 126.9, 8, True, 0.0, 127.0, 0.5)
+
+  # 7 bits, wide range.
+  def testOp_with7BitsNoScalingNoNudging(self):
+    self._TestOp(0.0, 127.0, 7, False, 0.0, 127.0, 1.0)
+
+  def testOp_with7BitsScalingAndNudgingDown(self):
+    self._TestOp(0.5, 64.0, 7, False, 0.0, 63.5, 0.5)
+
+  def testOp_with7BitsScalingAndNudgingUp(self):
+    self._TestOp(-64.0, -0.5, 7, False, -63.5, 0.0, 0.5)
+
+  def testOp_with7BitsScalingAndNudgingBetween(self):
+    self._TestOp(-0.1, 63.4, 7, False, 0.0, 63.5, 0.5)
+
+  # 7 bits, narrow range.
+  def testOp_with7BitsNarrowRangeNoScalingNoNudging(self):
+    self._TestOp(0.0, 126.0, 7, True, 0.0, 126.0, 1.0)
+
+  def testOp_with7BitsNarrowRangeScalingAndNudgingDown(self):
+    self._TestOp(0.1, 63.1, 7, True, 0.0, 63.0, 0.5)
+
+  def testOp_with7BitsNarrowRangeScalingAndNudgingUp(self):
+    self._TestOp(-63.1, -0.1, 7, True, -63.0, 0.0, 0.5)
+
+  def testOp_with7BitsNarrowRangeScalingAndNudgingBetween(self):
+    self._TestOp(-0.1, 62.9, 7, True, 0.0, 63.0, 0.5)
+
+  def _TestOp(self, input_min, input_max, num_bits, narrow_range,
+              expected_nudged_input_min, expected_nudged_input_max,
+              expected_step):
+    inputs = np.array(
+        [
+            expected_nudged_input_min - expected_step,
+            expected_nudged_input_min - 0.01, expected_nudged_input_min,
+            expected_nudged_input_min + 0.01,
+            expected_nudged_input_min + expected_step - 0.01,
+            expected_nudged_input_min + expected_step,
+            expected_nudged_input_min + expected_step + 0.01,
+            expected_nudged_input_max - 0.01, expected_nudged_input_max,
+            expected_nudged_input_max + 0.01,
+            expected_nudged_input_max + expected_step
+        ],
+        dtype=np.float32)
+    gradients = np.arange(1, len(inputs) + 1, dtype=np.float32)
+    expected_backprops = np.array(
+        [0.0, 0.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 0.0, 0.0],
+        dtype=np.float32)
+
+    with self.test_session() as session:
+      with self.test_scope():
+        gradient_placeholder = array_ops.placeholder(
+            dtypes.float32, gradients.shape, name="gradients")
+        input_placeholder = array_ops.placeholder(
+            dtypes.float32, inputs.shape, name="inputs")
+        outputs = gen_array_ops.fake_quant_with_min_max_args_gradient(
+            gradient_placeholder,
+            input_placeholder,
+            min=input_min,
+            max=input_max,
+            num_bits=num_bits,
+            narrow_range=narrow_range)
+      backprops = session.run(outputs, {
+          gradient_placeholder: gradients,
+          input_placeholder: inputs
+      })
+      self.assertAllCloseAccordingToType(
+          backprops,
+          expected_backprops,
+          rtol=1e-3,
+          atol=1e-5,
+          bfloat16_rtol=0.03)
+
+
+class FakeQuantWithMinMaxVarsTest(XLATestCase):
+  """Test cases for FakeQuantWithMinMaxVars operation."""
+
+  # 8 bits, wide range.
+  def testOp_with8BitsNoScalingNoNudging(self):
+    self._TestOp(0.0, 255.0, 8, False, 0.0, 255.0, 1.0)
+
+  def testOp_with8BitsScalingAndNudgingDown(self):
+    self._TestOp(0.5, 128.0, 8, False, 0.0, 127.5, 0.5)
+
+  def testOp_with8BitsScalingAndNudgingUp(self):
+    self._TestOp(-128.0, -0.5, 8, False, -127.5, 0.0, 0.5)
+
+  def testOp_with8BitsScalingAndNudgingBetween(self):
+    self._TestOp(-0.1, 127.4, 8, False, 0.0, 127.5, 0.5)
+
+  # 8 bits, narrow range.
+  def testOp_with8BitsNarrowRangeNoScalingNoNudging(self):
+    self._TestOp(0.0, 254.0, 8, True, 0.0, 254.0, 1.0)
+
+  def testOp_with8BitsNarrowRangeScalingAndNudgingDown(self):
+    self._TestOp(0.1, 127.1, 8, True, 0.0, 127.0, 0.5)
+
+  def testOp_with8BitsNarrowRangeScalingAndNudgingUp(self):
+    self._TestOp(-127.1, -0.1, 8, True, -127.0, 0.0, 0.5)
+
+  def testOp_with8BitsNarrowRangeScalingAndNudgingBetween(self):
+    self._TestOp(-0.1, 126.9, 8, True, 0.0, 127.0, 0.5)
+
+  # 7 bits, wide range.
+  def testOp_with7BitsNoScalingNoNudging(self):
+    self._TestOp(0.0, 127.0, 7, False, 0.0, 127.0, 1.0)
+
+  def testOp_with7BitsScalingAndNudgingDown(self):
+    self._TestOp(0.5, 64.0, 7, False, 0.0, 63.5, 0.5)
+
+  def testOp_with7BitsScalingAndNudgingUp(self):
+    self._TestOp(-64.0, -0.5, 7, False, -63.5, 0.0, 0.5)
+
+  def testOp_with7BitsScalingAndNudgingBetween(self):
+    self._TestOp(-0.1, 63.4, 7, False, 0.0, 63.5, 0.5)
+
+  # 7 bits, narrow range.
+  def testOp_with7BitsNarrowRangeNoScalingNoNudging(self):
+    self._TestOp(0.0, 126.0, 7, True, 0.0, 126.0, 1.0)
+
+  def testOp_with7BitsNarrowRangeScalingAndNudgingDown(self):
+    self._TestOp(0.1, 63.1, 7, True, 0.0, 63.0, 0.5)
+
+  def testOp_with7BitsNarrowRangeScalingAndNudgingUp(self):
+    self._TestOp(-63.1, -0.1, 7, True, -63.0, 0.0, 0.5)
+
+  def testOp_with7BitsNarrowRangeScalingAndNudgingBetween(self):
+    self._TestOp(-0.1, 62.9, 7, True, 0.0, 63.0, 0.5)
+
+  def _TestOp(self, input_min, input_max, num_bits, narrow_range,
+              expected_nudged_input_min, expected_nudged_input_max,
+              expected_step):
+    inputs = np.array(
+        [
+            expected_nudged_input_min - expected_step,
+            expected_nudged_input_min - 0.01, expected_nudged_input_min,
+            expected_nudged_input_min + 0.01,
+            expected_nudged_input_min + expected_step - 0.01,
+            expected_nudged_input_min + expected_step,
+            expected_nudged_input_min + expected_step + 0.01,
+            expected_nudged_input_max - 0.01, expected_nudged_input_max,
+            expected_nudged_input_max + 0.01,
+            expected_nudged_input_max + expected_step
+        ],
+        dtype=np.float32)
+    expected = np.array(
+        [
+            expected_nudged_input_min, expected_nudged_input_min,
+            expected_nudged_input_min, expected_nudged_input_min,
+            expected_nudged_input_min + expected_step,
+            expected_nudged_input_min + expected_step,
+            expected_nudged_input_min + expected_step,
+            expected_nudged_input_max, expected_nudged_input_max,
+            expected_nudged_input_max, expected_nudged_input_max
+        ],
+        dtype=np.float32)
+
+    with self.test_session() as session:
+      with self.test_scope():
+        input_placeholder = array_ops.placeholder(
+            dtypes.float32, inputs.shape, name="inputs")
+        min_placeholder = array_ops.placeholder(dtypes.float32, (), name="min")
+        max_placeholder = array_ops.placeholder(dtypes.float32, (), name="max")
+        outputs = array_ops.fake_quant_with_min_max_vars(
+            input_placeholder,
+            min_placeholder,
+            max_placeholder,
+            num_bits=num_bits,
+            narrow_range=narrow_range)
+      result = session.run(
+          outputs, {
+              input_placeholder: inputs,
+              min_placeholder: input_min,
+              max_placeholder: input_max
+          })
+      self.assertAllCloseAccordingToType(
+          result, expected, rtol=1e-3, atol=1e-5, bfloat16_rtol=0.03)
+
+
+class FakeQuantWithMinMaxVarsGradientTest(XLATestCase):
+  """Test cases for FakeQuantWithMinMaxVarsGradient operation."""
+
+  # 8 bits, wide range.
+  def testOp_with8BitsNoScalingNoNudging(self):
+    self._TestOp(0.0, 255.0, 8, False, 0.0, 255.0, 1.0)
+
+  def testOp_with8BitsScalingAndNudgingDown(self):
+    self._TestOp(0.5, 128.0, 8, False, 0.0, 127.5, 0.5)
+
+  def testOp_with8BitsScalingAndNudgingUp(self):
+    self._TestOp(-128.0, -0.5, 8, False, -127.5, 0.0, 0.5)
+
+  def testOp_with8BitsScalingAndNudgingBetween(self):
+    self._TestOp(-0.1, 127.4, 8, False, 0.0, 127.5, 0.5)
+
+  # 8 bits, narrow range.
+  def testOp_with8BitsNarrowRangeNoScalingNoNudging(self):
+    self._TestOp(0.0, 254.0, 8, True, 0.0, 254.0, 1.0)
+
+  def testOp_with8BitsNarrowRangeScalingAndNudgingDown(self):
+    self._TestOp(0.1, 127.1, 8, True, 0.0, 127.0, 0.5)
+
+  def testOp_with8BitsNarrowRangeScalingAndNudgingUp(self):
+    self._TestOp(-127.1, -0.1, 8, True, -127.0, 0.0, 0.5)
+
+  def testOp_with8BitsNarrowRangeScalingAndNudgingBetween(self):
+    self._TestOp(-0.1, 126.9, 8, True, 0.0, 127.0, 0.5)
+
+  # 7 bits, wide range.
+  def testOp_with7BitsNoScalingNoNudging(self):
+    self._TestOp(0.0, 127.0, 7, False, 0.0, 127.0, 1.0)
+
+  def testOp_with7BitsScalingAndNudgingDown(self):
+    self._TestOp(0.5, 64.0, 7, False, 0.0, 63.5, 0.5)
+
+  def testOp_with7BitsScalingAndNudgingUp(self):
+    self._TestOp(-64.0, -0.5, 7, False, -63.5, 0.0, 0.5)
+
+  def testOp_with7BitsScalingAndNudgingBetween(self):
+    self._TestOp(-0.1, 63.4, 7, False, 0.0, 63.5, 0.5)
+
+  # 7 bits, narrow range.
+  def testOp_with7BitsNarrowRangeNoScalingNoNudging(self):
+    self._TestOp(0.0, 126.0, 7, True, 0.0, 126.0, 1.0)
+
+  def testOp_with7BitsNarrowRangeScalingAndNudgingDown(self):
+    self._TestOp(0.1, 63.1, 7, True, 0.0, 63.0, 0.5)
+
+  def testOp_with7BitsNarrowRangeScalingAndNudgingUp(self):
+    self._TestOp(-63.1, -0.1, 7, True, -63.0, 0.0, 0.5)
+
+  def testOp_with7BitsNarrowRangeScalingAndNudgingBetween(self):
+    self._TestOp(-0.1, 62.9, 7, True, 0.0, 63.0, 0.5)
+
+  def _TestOp(self, input_min, input_max, num_bits, narrow_range,
+              expected_nudged_input_min, expected_nudged_input_max,
+              expected_step):
+    inputs = np.array(
+        [
+            expected_nudged_input_min - expected_step,
+            expected_nudged_input_min - 0.01, expected_nudged_input_min,
+            expected_nudged_input_min + 0.01,
+            expected_nudged_input_min + expected_step - 0.01,
+            expected_nudged_input_min + expected_step,
+            expected_nudged_input_min + expected_step + 0.01,
+            expected_nudged_input_max - 0.01, expected_nudged_input_max,
+            expected_nudged_input_max + 0.01,
+            expected_nudged_input_max + expected_step
+        ],
+        dtype=np.float32)
+    gradients = np.arange(1, len(inputs) + 1, dtype=np.float32)
+    expected_backprops_wrt_input = np.array(
+        [0.0, 0.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 0.0, 0.0],
+        dtype=np.float32)
+    expected_backprops_wrt_min = 1.0 + 2.0
+    expected_backprops_wrt_max = 10.0 + 11.0
+
+    with self.test_session() as session:
+      with self.test_scope():
+        gradient_placeholder = array_ops.placeholder(
+            dtypes.float32, gradients.shape, name="gradients")
+        input_placeholder = array_ops.placeholder(
+            dtypes.float32, inputs.shape, name="inputs")
+        min_placeholder = array_ops.placeholder(dtypes.float32, (), name="min")
+        max_placeholder = array_ops.placeholder(dtypes.float32, (), name="max")
+        outputs = array_ops.fake_quant_with_min_max_vars_gradient(
+            gradient_placeholder,
+            input_placeholder,
+            min_placeholder,
+            max_placeholder,
+            num_bits=num_bits,
+            narrow_range=narrow_range)
+      backprops_wrt_input, backprops_wrt_min, backprops_wrt_max = session.run(
+          outputs, {
+              gradient_placeholder: gradients,
+              input_placeholder: inputs,
+              min_placeholder: input_min,
+              max_placeholder: input_max
+          })
+      self.assertAllCloseAccordingToType(
+          backprops_wrt_input,
+          expected_backprops_wrt_input,
+          rtol=1e-3,
+          atol=1e-5,
+          bfloat16_rtol=0.03)
+      self.assertAllCloseAccordingToType(
+          backprops_wrt_min,
+          expected_backprops_wrt_min,
+          rtol=1e-3,
+          atol=1e-5,
+          bfloat16_rtol=0.03)
+      self.assertAllCloseAccordingToType(
+          backprops_wrt_max,
+          expected_backprops_wrt_max,
+          rtol=1e-3,
+          atol=1e-5,
+          bfloat16_rtol=0.03)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/compiler/tests/fft_test.py b/tensorflow/compiler/tests/fft_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..afb5fa4bb4fefe5bc2ecded826143ffc83c2b559
--- /dev/null
+++ b/tensorflow/compiler/tests/fft_test.py
@@ -0,0 +1,204 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for FFT via the XLA JIT."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+import numpy as np
+import scipy.signal as sps
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.contrib.signal.python.ops import spectral_ops as signal
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import spectral_ops
+from tensorflow.python.platform import googletest
+
+BATCH_DIMS = (3, 5)
+RTOL = 0.02  # Eigen/cuFFT differ widely from np, especially for FFT3D
+ATOL = 1e-3
+
+
+def pick_10(x):
+  x = list(x)
+  np.random.seed(123)
+  np.random.shuffle(x)
+  return x[:10]
+
+
+def to_32bit(x):
+  if x.dtype == np.complex128:
+    return x.astype(np.complex64)
+  if x.dtype == np.float64:
+    return x.astype(np.float32)
+  return x
+
+
+POWS_OF_2 = 2**np.arange(3, 12)
+INNER_DIMS_1D = list((x,) for x in POWS_OF_2)
+POWS_OF_2 = 2**np.arange(3, 8)  # To avoid OOM on GPU.
+INNER_DIMS_2D = pick_10(itertools.product(POWS_OF_2, POWS_OF_2))
+INNER_DIMS_3D = pick_10(itertools.product(POWS_OF_2, POWS_OF_2, POWS_OF_2))
+
+
+class FFTTest(XLATestCase):
+
+  def _VerifyFftMethod(self, inner_dims, complex_to_input, input_to_expected,
+                       tf_method):
+    for indims in inner_dims:
+      print("nfft =", indims)
+      shape = BATCH_DIMS + indims
+      data = np.arange(np.prod(shape) * 2) / np.prod(indims)
+      np.random.seed(123)
+      np.random.shuffle(data)
+      data = np.reshape(data.astype(np.float32).view(np.complex64), shape)
+      data = to_32bit(complex_to_input(data))
+      expected = to_32bit(input_to_expected(data))
+      with self.test_session() as sess:
+        with self.test_scope():
+          ph = array_ops.placeholder(
+              dtypes.as_dtype(data.dtype), shape=data.shape)
+          out = tf_method(ph)
+        value = sess.run(out, {ph: data})
+        self.assertAllClose(expected, value, rtol=RTOL, atol=ATOL)
+
+  def testContribSignalSTFT(self):
+    ws = 512
+    hs = 128
+    dims = (ws * 20,)
+    shape = BATCH_DIMS + dims
+    data = np.arange(np.prod(shape)) / np.prod(dims)
+    np.random.seed(123)
+    np.random.shuffle(data)
+    data = np.reshape(data.astype(np.float32), shape)
+    window = sps.get_window("hann", ws)
+    expected = sps.stft(
+        data, nperseg=ws, noverlap=ws - hs, boundary=None, window=window)[2]
+    expected = np.swapaxes(expected, -1, -2)
+    expected *= window.sum()  # scipy divides by window sum
+    with self.test_session() as sess:
+      with self.test_scope():
+        ph = array_ops.placeholder(
+            dtypes.as_dtype(data.dtype), shape=data.shape)
+        out = signal.stft(ph, ws, hs)
+
+      value = sess.run(out, {ph: data})
+      self.assertAllClose(expected, value, rtol=RTOL, atol=ATOL)
+
+  def testFFT(self):
+    self._VerifyFftMethod(INNER_DIMS_1D, lambda x: x, np.fft.fft,
+                          spectral_ops.fft)
+
+  def testFFT2D(self):
+    self._VerifyFftMethod(INNER_DIMS_2D, lambda x: x, np.fft.fft2,
+                          spectral_ops.fft2d)
+
+  def testFFT3D(self):
+    self._VerifyFftMethod(INNER_DIMS_3D, lambda x: x,
+                          lambda x: np.fft.fftn(x, axes=(-3, -2, -1)),
+                          spectral_ops.fft3d)
+
+  def testIFFT(self):
+    self._VerifyFftMethod(INNER_DIMS_1D, lambda x: x, np.fft.ifft,
+                          spectral_ops.ifft)
+
+  def testIFFT2D(self):
+    self._VerifyFftMethod(INNER_DIMS_2D, lambda x: x, np.fft.ifft2,
+                          spectral_ops.ifft2d)
+
+  def testIFFT3D(self):
+    self._VerifyFftMethod(INNER_DIMS_3D, lambda x: x,
+                          lambda x: np.fft.ifftn(x, axes=(-3, -2, -1)),
+                          spectral_ops.ifft3d)
+
+  def testRFFT(self):
+    self._VerifyFftMethod(
+        INNER_DIMS_1D, np.real, lambda x: np.fft.rfft(x, n=x.shape[-1]),
+        lambda x: spectral_ops.rfft(x, fft_length=[x.shape[-1].value]))
+
+  def testRFFT2D(self):
+
+    def _tf_fn(x):
+      return spectral_ops.rfft2d(
+          x, fft_length=[x.shape[-2].value, x.shape[-1].value])
+
+    self._VerifyFftMethod(
+        INNER_DIMS_2D, np.real,
+        lambda x: np.fft.rfft2(x, s=[x.shape[-2], x.shape[-1]]), _tf_fn)
+
+  def testRFFT3D(self):
+
+    def _to_expected(x):
+      return np.fft.rfftn(
+          x, axes=(-3, -2, -1), s=[x.shape[-3], x.shape[-2], x.shape[-1]])
+
+    def _tf_fn(x):
+      return spectral_ops.rfft3d(
+          x,
+          fft_length=[x.shape[-3].value, x.shape[-2].value, x.shape[-1].value])
+
+    self._VerifyFftMethod(INNER_DIMS_3D, np.real, _to_expected, _tf_fn)
+
+  def testIRFFT(self):
+
+    def _tf_fn(x):
+      return spectral_ops.irfft(x, fft_length=[2 * (x.shape[-1].value - 1)])
+
+    self._VerifyFftMethod(
+        INNER_DIMS_1D, lambda x: np.fft.rfft(np.real(x), n=x.shape[-1]),
+        lambda x: np.fft.irfft(x, n=2 * (x.shape[-1] - 1)), _tf_fn)
+
+  def testIRFFT2D(self):
+
+    def _tf_fn(x):
+      return spectral_ops.irfft2d(
+          x, fft_length=[x.shape[-2].value, 2 * (x.shape[-1].value - 1)])
+
+    self._VerifyFftMethod(
+        INNER_DIMS_2D,
+        lambda x: np.fft.rfft2(np.real(x), s=[x.shape[-2], x.shape[-1]]),
+        lambda x: np.fft.irfft2(x, s=[x.shape[-2], 2 * (x.shape[-1] - 1)]),
+        _tf_fn)
+
+  def testIRFFT3D(self):
+
+    def _to_input(x):
+      return np.fft.rfftn(
+          np.real(x),
+          axes=(-3, -2, -1),
+          s=[x.shape[-3], x.shape[-2], x.shape[-1]])
+
+    def _to_expected(x):
+      return np.fft.irfftn(
+          x,
+          axes=(-3, -2, -1),
+          s=[x.shape[-3], x.shape[-2], 2 * (x.shape[-1] - 1)])
+
+    def _tf_fn(x):
+      return spectral_ops.irfft3d(
+          x,
+          fft_length=[
+              x.shape[-3].value, x.shape[-2].value, 2 * (x.shape[-1].value - 1)
+          ])
+
+    self._VerifyFftMethod(INNER_DIMS_3D, _to_input, _to_expected, _tf_fn)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/compiler/tests/ftrl_test.py b/tensorflow/compiler/tests/ftrl_test.py
index 7e3871312c86530b6d3cb0bbacc16c25d3469832..f9db4cf2017c0b4b6dc0cfeeda6dca7bb9d14f19 100644
--- a/tensorflow/compiler/tests/ftrl_test.py
+++ b/tensorflow/compiler/tests/ftrl_test.py
@@ -161,9 +161,9 @@ class FtrlOptimizerTest(XLATestCase):
           ftrl_update.run()
 
         # Validate updated params
-        self.assertAllClose(
+        self.assertAllCloseAccordingToType(
             np.array([-2.55607247, -3.98729396]), var0.eval(), 1e-5, 1e-5)
-        self.assertAllClose(
+        self.assertAllCloseAccordingToType(
             np.array([-0.28232238, -0.56096673]), var1.eval(), 1e-5, 1e-5)
 
   def testFtrlWithL1(self):
@@ -189,10 +189,10 @@ class FtrlOptimizerTest(XLATestCase):
           ftrl_update.run()
 
         # Validate updated params
-        self.assertAllClose(np.array([-7.66718769, -10.91273689]), var0.eval(),
-                            rtol=1e-4)
-        self.assertAllClose(np.array([-0.93460727, -1.86147261]), var1.eval(),
-                            rtol=1e-4)
+        self.assertAllCloseAccordingToType(
+            np.array([-7.66718769, -10.91273689]), var0.eval(), rtol=1e-4)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.93460727, -1.86147261]), var1.eval(), rtol=1e-4)
 
   def testFtrlWithL1_L2(self):
     for dtype in self.float_types:
@@ -217,10 +217,10 @@ class FtrlOptimizerTest(XLATestCase):
           ftrl_update.run()
 
         # Validate updated params
-        self.assertAllClose(np.array([-0.24059935, -0.46829352]), var0.eval(),
-                            rtol=1e-5)
-        self.assertAllClose(np.array([-0.02406147, -0.04830509]), var1.eval(),
-                            rtol=1e-5)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.24059935, -0.46829352]), var0.eval(), rtol=1e-5)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.02406147, -0.04830509]), var1.eval(), rtol=1e-5)
 
   def testFtrlWithL1_L2_L2Shrinkage(self):
     """Test the new FTRL op with support for l2 shrinkage.
@@ -244,18 +244,18 @@ class FtrlOptimizerTest(XLATestCase):
         ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([4.0, 3.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval())
 
         # Run 10 steps FTRL
         for _ in range(10):
           ftrl_update.run()
 
         # Validate updated params
-        self.assertAllClose(np.array([-0.21931979, -0.40642974]), var0.eval(),
-                            rtol=1e-4)
-        self.assertAllClose(np.array([-0.0282721, -0.07188385]), var1.eval(),
-                            rtol=1e-4)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.21931979, -0.40642974]), var0.eval(), rtol=1e-4)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.0282721, -0.07188385]), var1.eval(), rtol=1e-4)
 
   # When variables are initialized with Zero, FTRL-Proximal has two properties:
   # 1. Without L1&L2 but with fixed learning rate, FTRL-Proximal is identical
@@ -272,8 +272,8 @@ class FtrlOptimizerTest(XLATestCase):
       with self.test_session(), self.test_scope():
         val2, val3 = self.equivAdagradTest_AdagradPart(steps, dtype)
 
-    self.assertAllClose(val0, val2, rtol=1e-4)
-    self.assertAllClose(val1, val3, rtol=1e-4)
+    self.assertAllCloseAccordingToType(val0, val2, rtol=1e-4)
+    self.assertAllCloseAccordingToType(val1, val3, rtol=1e-4)
 
   def testEquivGradientDescentwithoutRegularization(self):
     steps = 5
@@ -284,8 +284,8 @@ class FtrlOptimizerTest(XLATestCase):
         val2, val3 = self.equivGradientDescentTest_GradientDescentPart(
             steps, dtype)
 
-    self.assertAllClose(val0, val2, rtol=1e-5)
-    self.assertAllClose(val1, val3, rtol=1e-5)
+    self.assertAllCloseAccordingToType(val0, val2, rtol=1e-5)
+    self.assertAllCloseAccordingToType(val1, val3, rtol=1e-5)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tests/fused_batchnorm_test.py b/tensorflow/compiler/tests/fused_batchnorm_test.py
index 00a9c9a65ba03d099581a3ee0dbe32c33e111231..a80d69fa5f5099b8a8b67df0da9c92b957e9d194 100644
--- a/tensorflow/compiler/tests/fused_batchnorm_test.py
+++ b/tensorflow/compiler/tests/fused_batchnorm_test.py
@@ -155,7 +155,7 @@ class FusedBatchNormTest(XLATestCase):
   def testLearningWithGradientChecker(self):
     self._testLearning(True)
 
-  def testGradient(self):
+  def testGradientTraining(self):
     # TODO(b/64270657): Use gradient_checker here in addition to comparing with
     # this reference implementation.
     channel = 3
@@ -175,7 +175,7 @@ class FusedBatchNormTest(XLATestCase):
       var = array_ops.placeholder(np.float32, shape=scale_shape, name="var")
       scale = array_ops.placeholder(np.float32, shape=scale_shape, name="scale")
       grad_x, grad_scale, grad_offset, _, _ = gen_nn_ops.fused_batch_norm_grad(
-          grad, x, scale, mean, var, data_format="NHWC")
+          grad, x, scale, mean, var, data_format="NHWC", is_training=True)
 
       grad_x_val, grad_scale_val, grad_offset_val = sess.run(
           [grad_x, grad_scale, grad_offset], {
@@ -193,6 +193,53 @@ class FusedBatchNormTest(XLATestCase):
       self.assertAllClose(grad_scale_val, grad_scale_ref, atol=1e-2)
       self.assertAllClose(grad_offset_val, grad_offset_ref, atol=1e-3)
 
+  def testGradientInference(self):
+    # TODO(b/64270657): Use gradient_checker here in addition to comparing with
+    # this reference implementation.
+    channel = 3
+    x_shape = [2, 2, 6, channel]
+    scale_shape = [channel]
+    grad_val = np.random.random_sample(x_shape).astype(np.float32)
+    x_val = np.random.random_sample(x_shape).astype(np.float32)
+    scale_val = np.random.random_sample(scale_shape).astype(np.float32)
+    mean_val = np.random.random_sample(scale_shape).astype(np.float32)
+    var_val = np.random.random_sample(scale_shape).astype(np.float32)
+
+    with self.test_session() as sess, self.test_scope():
+      grad = array_ops.placeholder(np.float32, shape=x_shape, name="grad")
+      x = array_ops.placeholder(np.float32, shape=x_shape, name="x")
+      mean = array_ops.placeholder(np.float32, shape=scale_shape, name="mean")
+      var = array_ops.placeholder(np.float32, shape=scale_shape, name="var")
+      scale = array_ops.placeholder(np.float32, shape=scale_shape, name="scale")
+      with self.test_scope():
+        out = gen_nn_ops.fused_batch_norm_grad(
+            grad, x, scale, mean, var, data_format="NHWC", is_training=False)
+        grad_x, grad_scale, grad_offset, _, _ = out
+
+      ref_x, ref_scale, ref_offset, _, _ = gen_nn_ops.fused_batch_norm_grad(
+          grad, x, scale, mean, var, data_format="NHWC", is_training=False)
+
+      grad_x_val, grad_scale_val, grad_offset_val, = sess.run(
+          [grad_x, grad_scale, grad_offset], {
+              grad: grad_val,
+              x: x_val,
+              mean: mean_val,
+              var: var_val,
+              scale: scale_val
+          })
+      grad_x_ref, grad_scale_ref, grad_offset_ref, = sess.run(
+          [ref_x, ref_scale, ref_offset], {
+              grad: grad_val,
+              x: x_val,
+              mean: mean_val,
+              var: var_val,
+              scale: scale_val
+          })
+
+      self.assertAllClose(grad_x_val, grad_x_ref, atol=1e-2)
+      self.assertAllClose(grad_scale_val, grad_scale_ref, atol=1e-2)
+      self.assertAllClose(grad_offset_val, grad_offset_ref, atol=1e-3)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/compiler/tests/gather_nd_op_test.py b/tensorflow/compiler/tests/gather_nd_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9378b1db7245c0da3e8298e7dcd972491616b0cd
--- /dev/null
+++ b/tensorflow/compiler/tests/gather_nd_op_test.py
@@ -0,0 +1,147 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.tf.gather_nd."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class GatherNdTest(XLATestCase):
+
+  def _runGather(self, params, indices):
+    with self.test_session():
+      paramsp = array_ops.placeholder(params.dtype)
+      indicesp = array_ops.placeholder(indices.dtype)
+      with self.test_scope():
+        gather_nd_t = array_ops.gather_nd(paramsp, indicesp)
+      feed_dict = {paramsp: params, indicesp: indices}
+      return gather_nd_t.eval(feed_dict=feed_dict)
+
+  def testSimpleDtype(self):
+    for dtype in self.numeric_types:
+      self.assertAllEqual(
+          np.array([7, 7, 8], dtype=dtype),
+          self._runGather(
+              np.array([8, 1, 2, 3, 7, 5], dtype=dtype),
+              np.array([[4], [4], [0]], np.int32)))
+
+  def testEmptyIndicesAndParamsOKButJustEmptyParamsFails(self):
+    with self.test_session():
+      params = np.ones((3, 3), dtype=np.float32)
+
+      indices_empty = np.empty((0, 2), dtype=np.int32)
+      gather_nd_ok_val = self._runGather(params, indices_empty)
+      self.assertAllClose(np.empty((0,), dtype=np.float32), gather_nd_ok_val)
+
+      indices_empty = np.empty((0, 1), dtype=np.int32)
+      gather_nd_ok_val = self._runGather(params, indices_empty)
+      self.assertAllClose(np.empty((0, 3), dtype=np.float32), gather_nd_ok_val)
+
+      params_empty = np.empty((0, 3), dtype=np.float32)
+      indices_empty = np.empty((0, 2), dtype=np.int32)
+      gather_nd_ok_val = self._runGather(params_empty, indices_empty)
+      self.assertAllClose(np.empty((0,), dtype=np.float32), gather_nd_ok_val)
+
+      params_empty = np.empty((0, 3), dtype=np.float32)
+      indices_nonempty = np.zeros((1, 2), dtype=np.int32)
+      with self.assertRaisesWithPredicateMatch(
+          errors.InvalidArgumentError, r"Gather dimension 0 is of size zero"):
+        self._runGather(params_empty, indices_nonempty)
+
+  def testIndexScalar(self):
+    params = np.array(
+        [[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]], dtype=np.float32).T
+    indices = np.array([4, 1], dtype=np.int32)
+    gather_nd_val = self._runGather(params, indices)
+    self.assertAllEqual(np.array(7), gather_nd_val)
+
+  def testParamsRankLargerThanIndexIndexScalarSlices(self):
+    params = np.array(
+        [[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]], dtype=np.float32).T
+    indices = np.array(
+        [
+            4,
+        ], dtype=np.int32)
+    gather_nd_val = self._runGather(params, indices)
+    self.assertAllEqual(np.array([-7, 7]), gather_nd_val)
+
+  def testParamsRankLargerThanIndexSlices(self):
+    params = np.array(
+        [[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]], dtype=np.float32).T
+    indices = np.array([[4], [4], [0]], np.int32)
+    gather_nd_val = self._runGather(params, indices)
+    self.assertAllEqual(np.array([[-7, 7], [-7, 7], [-8, 8]]), gather_nd_val)
+
+  def testHigherRankParamsLargerThanIndexSlices(self):
+    params = np.array(
+        [[[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]],
+         [[-80, -10, -20, -30, -70, -50], [80, 10, 20, 30, 70, 50]]],
+        dtype=np.float32).T
+    indices = np.array([[4], [4], [0]], np.int32)
+    gather_nd_val = self._runGather(params, indices)
+    self.assertAllEqual(params[[4, 4, 0]], gather_nd_val)
+
+  def testEmptyIndicesLastRankMeansCopyEntireTensor(self):
+    params = np.array(
+        [[[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]],
+         [[-80, -10, -20, -30, -70, -50], [80, 10, 20, 30, 70, 50]]],
+        dtype=np.float32).T
+    indices = np.array([[], []], dtype=np.int32)  # Size (2, 0)
+    gather_nd_val = self._runGather(params, indices)
+    self.assertAllEqual(
+        np.vstack((params[np.newaxis, :], params[np.newaxis, :])),
+        gather_nd_val)
+
+  def testHigherRankParamsAndIndicesLargerThanIndexSlices(self):
+    params = np.array(
+        [[[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]],
+         [[-80, -10, -20, -30, -70, -50], [80, 10, 20, 30, 70, 50]]],
+        dtype=np.float32).T
+    indices = np.array([[[3], [2], [1]], [[4], [4], [0]]], np.int32)
+    gather_nd_val = self._runGather(params, indices)
+    self.assertAllEqual(params[[3, 2, 1, 4, 4, 0]].reshape(2, 3, 2, 2),
+                        gather_nd_val)
+
+  def testHigherRankParams(self):
+    shape = (10, 20, 5, 1, 17)
+    params = np.random.rand(*shape).astype(np.float32)
+    indices = np.vstack(
+        [np.random.randint(0, s, size=2000, dtype=np.int32) for s in shape]).T
+    gather_nd_val = self._runGather(params, indices)
+
+    expected = params[tuple(indices.T)]
+    self.assertAllEqual(expected, gather_nd_val)
+
+  def testHigherRankParamsAndIndices(self):
+    shape = (10, 20, 5, 1, 17)
+    params = np.random.rand(*shape).astype(np.float32)
+    indices = np.vstack(
+        [np.random.randint(0, s, size=2000, dtype=np.int32) for s in shape]).T
+    indices_reshaped = indices.reshape([10, 10, 20, 5])
+    gather_nd_val = self._runGather(params, indices_reshaped)
+    expected = params[tuple(indices.T)]
+    self.assertAllEqual(expected.reshape([10, 10, 20]), gather_nd_val)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/gather_test.py b/tensorflow/compiler/tests/gather_test.py
index 13cbe6f312f5175edaec28fa7a8f28064194b0e9..1a8c4519118f69ce51ca9a5eb95a9d706c7766cc 100644
--- a/tensorflow/compiler/tests/gather_test.py
+++ b/tensorflow/compiler/tests/gather_test.py
@@ -122,6 +122,20 @@ class GatherTest(xla_test.XLATestCase):
             gather_np = np.take(params, indices, axis=axis)
             self.assertAllEqual(gather_np, gather_value)
 
+  def testIndicesWithDifferentDimensions(self):
+    with self.test_session():
+      for dtype in self.numeric_tf_types:
+        params = array_ops.placeholder(dtype=dtype)
+        indices = array_ops.placeholder(dtype=np.int32)
+        with self.test_scope():
+          gather = array_ops.gather(params, indices)
+        self.assertAllEqual(
+            7, gather.eval(feed_dict={params: [4, 7, 2], indices: 1}))
+        self.assertAllEqual(
+            [7], gather.eval(feed_dict={params: [4, 7, 2], indices: [1]}))
+        self.assertAllEqual(
+            [[7]], gather.eval(feed_dict={params: [4, 7, 2], indices: [[1]]}))
+
 
 class GatherBenchmark(test.Benchmark):
   """Microbenchmarks for the gather op."""
diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..538fa8e8e570b83ed681ecc0501285520cabdecb
--- /dev/null
+++ b/tensorflow/compiler/tests/image_ops_test.py
@@ -0,0 +1,552 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for image ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import colorsys
+import math
+
+import numpy as np
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_image_ops
+from tensorflow.python.ops import image_ops
+from tensorflow.python.platform import test
+
+
+class RGBToHSVTest(XLATestCase):
+
+  def testBatch(self):
+    # Build an arbitrary RGB image
+    np.random.seed(7)
+    batch_size = 5
+    shape = (batch_size, 2, 7, 3)
+
+    for nptype in self.float_types:
+      inp = np.random.rand(*shape).astype(nptype)
+
+      # Convert to HSV and back, as a batch and individually
+      with self.test_session() as sess:
+        batch0 = array_ops.placeholder(nptype, shape=shape)
+        with self.test_scope():
+          batch1 = image_ops.rgb_to_hsv(batch0)
+          batch2 = image_ops.hsv_to_rgb(batch1)
+        split0 = array_ops.unstack(batch0)
+        with self.test_scope():
+          split1 = list(map(image_ops.rgb_to_hsv, split0))
+          split2 = list(map(image_ops.hsv_to_rgb, split1))
+        join1 = array_ops.stack(split1)
+        join2 = array_ops.stack(split2)
+        batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2],
+                                                {
+                                                    batch0: inp
+                                                })
+
+      # Verify that processing batch elements together is the same as separate
+      self.assertAllClose(batch1, join1)
+      self.assertAllClose(batch2, join2)
+      self.assertAllCloseAccordingToType(batch2, inp, bfloat16_atol=0.03)
+
+  def testRGBToHSVRoundTrip(self):
+    data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+    for nptype in self.float_types:
+      rgb_np = np.array(data, dtype=nptype).reshape([2, 2, 3]) / 255.
+      with self.test_session():
+        placeholder = array_ops.placeholder(nptype)
+        with self.test_scope():
+          hsv = image_ops.rgb_to_hsv(placeholder)
+          rgb = image_ops.hsv_to_rgb(hsv)
+        rgb_tf = rgb.eval(feed_dict={placeholder: rgb_np})
+      self.assertAllCloseAccordingToType(rgb_tf, rgb_np, bfloat16_atol=0.03)
+
+  def testRGBToHSVNumpy(self):
+    """Tests the RGB to HSV conversion matches a reference implementation."""
+    for nptype in self.float_types:
+      rgb_flat = np.random.random(64 * 3).reshape((64, 3)).astype(nptype)
+      rgb_np = rgb_flat.reshape(4, 4, 4, 3)
+      hsv_np = np.array([
+          colorsys.rgb_to_hsv(
+              r.astype(np.float64), g.astype(np.float64), b.astype(np.float64))
+          for r, g, b in rgb_flat
+      ])
+      hsv_np = hsv_np.reshape(4, 4, 4, 3)
+      with self.test_session():
+        placeholder = array_ops.placeholder(nptype)
+        with self.test_scope():
+          hsv_op = image_ops.rgb_to_hsv(placeholder)
+        hsv_tf = hsv_op.eval(feed_dict={placeholder: rgb_np})
+      self.assertAllCloseAccordingToType(hsv_tf, hsv_np)
+
+
+class AdjustContrastTest(XLATestCase):
+
+  def _testContrast(self, x_np, y_np, contrast_factor):
+    with self.test_session():
+      x = array_ops.placeholder(x_np.dtype, shape=x_np.shape)
+      flt_x = image_ops.convert_image_dtype(x, dtypes.float32)
+      with self.test_scope():
+        y = image_ops.adjust_contrast(flt_x, contrast_factor)
+      y = image_ops.convert_image_dtype(y, x.dtype, saturate=True)
+      y_tf = y.eval({x: x_np})
+      self.assertAllClose(y_tf, y_np, 1e-6)
+
+  def testFloatContrast(self):
+    x_shape = [1, 2, 2, 3]
+    x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+    x_np = np.array(x_data, dtype=np.float32).reshape(x_shape) / 255.
+
+    y_data = [
+        -45.25, -90.75, -92.5, 62.75, 169.25, 333.5, 28.75, -84.75, 349.5,
+        134.75, 409.25, -116.5
+    ]
+    y_np = np.array(y_data, dtype=np.float32).reshape(x_shape) / 255.
+
+    self._testContrast(x_np, y_np, contrast_factor=2.0)
+
+  def testBatchContrast(self):
+    x_shape = [2, 1, 2, 3]
+    x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+    x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+
+    y_data = [0, 0, 0, 81, 200, 255, 10, 0, 255, 116, 255, 0]
+    y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
+
+    self._testContrast(x_np, y_np, contrast_factor=2.0)
+
+  def _adjustContrastNp(self, x_np, contrast_factor):
+    mean = np.mean(x_np, (1, 2), keepdims=True)
+    y_np = mean + contrast_factor * (x_np - mean)
+    return y_np
+
+  def _adjustContrastTf(self, x_np, contrast_factor):
+    with self.test_session():
+      x = array_ops.placeholder(np.float32)
+      with self.test_scope():
+        y = image_ops.adjust_contrast(x, contrast_factor)
+      y_tf = y.eval({x: x_np})
+    return y_tf
+
+  def testRandomContrast(self):
+    x_shapes = [
+        [1, 2, 2, 3],
+        [2, 1, 2, 3],
+        [1, 2, 2, 3],
+        [2, 5, 5, 3],
+        [2, 1, 1, 3],
+    ]
+    for x_shape in x_shapes:
+      x_np = np.random.rand(*x_shape) * 255.
+      contrast_factor = np.random.rand() * 2.0 + 0.1
+      y_np = self._adjustContrastNp(x_np, contrast_factor)
+      y_tf = self._adjustContrastTf(x_np, contrast_factor)
+      self.assertAllClose(y_tf, y_np, rtol=1e-5, atol=1e-5)
+
+
+class AdjustHueTest(XLATestCase):
+
+  def testAdjustNegativeHue(self):
+    x_shape = [2, 2, 3]
+    x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+    x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+
+    delta = -0.25
+    y_data = [0, 13, 1, 54, 226, 59, 8, 234, 150, 255, 39, 1]
+    y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
+
+    with self.test_session():
+      x = array_ops.placeholder(x_np.dtype, shape=x_shape)
+      flt_x = image_ops.convert_image_dtype(x, dtypes.float32)
+      with self.test_scope():
+        y = gen_image_ops.adjust_hue(flt_x, delta)
+      y = image_ops.convert_image_dtype(y, x.dtype, saturate=True)
+      y_tf = y.eval({x: x_np})
+      self.assertAllEqual(y_tf, y_np)
+
+  def testAdjustPositiveHue(self):
+    x_shape = [2, 2, 3]
+    x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+    x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+
+    delta = 0.25
+    y_data = [13, 0, 11, 226, 54, 221, 234, 8, 92, 1, 217, 255]
+    y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
+
+    with self.test_session():
+      x = array_ops.placeholder(x_np.dtype, shape=x_shape)
+      flt_x = image_ops.convert_image_dtype(x, dtypes.float32)
+      with self.test_scope():
+        y = gen_image_ops.adjust_hue(flt_x, delta)
+      y = image_ops.convert_image_dtype(y, x.dtype, saturate=True)
+      y_tf = y.eval({x: x_np})
+      self.assertAllEqual(y_tf, y_np)
+
+  def testBatchAdjustHue(self):
+    x_shape = [2, 1, 2, 3]
+    x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+    x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+
+    delta = 0.25
+    y_data = [13, 0, 11, 226, 54, 221, 234, 8, 92, 1, 217, 255]
+    y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
+
+    with self.test_session():
+      x = array_ops.placeholder(x_np.dtype, shape=x_shape)
+      flt_x = image_ops.convert_image_dtype(x, dtypes.float32)
+      with self.test_scope():
+        y = gen_image_ops.adjust_hue(flt_x, delta)
+      y = image_ops.convert_image_dtype(y, x.dtype, saturate=True)
+      y_tf = y.eval({x: x_np})
+      self.assertAllEqual(y_tf, y_np)
+
+  def _adjustHueNp(self, x_np, delta_h):
+    self.assertEqual(x_np.shape[-1], 3)
+    x_v = x_np.reshape([-1, 3])
+    y_v = np.ndarray(x_v.shape, dtype=x_v.dtype)
+    channel_count = x_v.shape[0]
+    for i in xrange(channel_count):
+      r = x_v[i][0]
+      g = x_v[i][1]
+      b = x_v[i][2]
+      h, s, v = colorsys.rgb_to_hsv(r, g, b)
+      h += delta_h
+      h = math.fmod(h + 10.0, 1.0)
+      r, g, b = colorsys.hsv_to_rgb(h, s, v)
+      y_v[i][0] = r
+      y_v[i][1] = g
+      y_v[i][2] = b
+    return y_v.reshape(x_np.shape)
+
+  def _adjustHueTf(self, x_np, delta_h):
+    with self.test_session():
+      x = array_ops.placeholder(dtypes.float32)
+      with self.test_scope():
+        y = gen_image_ops.adjust_hue(x, delta_h)
+      y_tf = y.eval({x: x_np})
+    return y_tf
+
+  def testAdjustRandomHue(self):
+    x_shapes = [
+        [2, 2, 3],
+        [4, 2, 3],
+        [2, 4, 3],
+        [2, 5, 3],
+        [1000, 1, 3],
+    ]
+    test_styles = [
+        "all_random",
+        "rg_same",
+        "rb_same",
+        "gb_same",
+        "rgb_same",
+    ]
+    for x_shape in x_shapes:
+      for test_style in test_styles:
+        x_np = np.random.rand(*x_shape) * 255.
+        delta_h = np.random.rand() * 2.0 - 1.0
+        if test_style == "all_random":
+          pass
+        elif test_style == "rg_same":
+          x_np[..., 1] = x_np[..., 0]
+        elif test_style == "rb_same":
+          x_np[..., 2] = x_np[..., 0]
+        elif test_style == "gb_same":
+          x_np[..., 2] = x_np[..., 1]
+        elif test_style == "rgb_same":
+          x_np[..., 1] = x_np[..., 0]
+          x_np[..., 2] = x_np[..., 0]
+        else:
+          raise AssertionError("Invalid test style: %s" % (test_style))
+        y_np = self._adjustHueNp(x_np, delta_h)
+        y_tf = self._adjustHueTf(x_np, delta_h)
+        self.assertAllClose(y_tf, y_np, rtol=2e-5, atol=1e-4)
+
+  def testInvalidShapes(self):
+    fused = False
+    if not fused:
+      # The tests are known to pass with the fused adjust_hue. We will enable
+      # them when the fused implementation is the default.
+      return
+    x_np = np.random.rand(2, 3) * 255.
+    delta_h = np.random.rand() * 2.0 - 1.0
+    fused = False
+    with self.assertRaisesRegexp(ValueError, "Shape must be at least rank 3"):
+      self._adjustHueTf(x_np, delta_h)
+    x_np = np.random.rand(4, 2, 4) * 255.
+    delta_h = np.random.rand() * 2.0 - 1.0
+    with self.assertRaisesOpError("input must have 3 channels"):
+      self._adjustHueTf(x_np, delta_h)
+
+
+class AdjustSaturationTest(XLATestCase):
+
+  def _adjust_saturation(self, image, saturation_factor):
+    image = ops.convert_to_tensor(image, name="image")
+    orig_dtype = image.dtype
+    flt_image = image_ops.convert_image_dtype(image, dtypes.float32)
+    with self.test_scope():
+      saturation_adjusted_image = gen_image_ops.adjust_saturation(
+          flt_image, saturation_factor)
+    return image_ops.convert_image_dtype(saturation_adjusted_image, orig_dtype)
+
+  def testHalfSaturation(self):
+    x_shape = [2, 2, 3]
+    x_rgb_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+    x_np = np.array(x_rgb_data, dtype=np.uint8).reshape(x_shape)
+
+    saturation_factor = 0.5
+    y_rgb_data = [6, 9, 13, 140, 180, 226, 135, 121, 234, 172, 255, 128]
+    y_np = np.array(y_rgb_data, dtype=np.uint8).reshape(x_shape)
+
+    with self.test_session():
+      x = array_ops.placeholder(x_np.dtype, shape=x_shape)
+      y = self._adjust_saturation(x, saturation_factor)
+      y_tf = y.eval({x: x_np})
+      self.assertAllEqual(y_tf, y_np)
+
+  def testTwiceSaturation(self):
+    x_shape = [2, 2, 3]
+    x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+    x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+
+    saturation_factor = 2.0
+    y_data = [0, 5, 13, 0, 106, 226, 30, 0, 234, 89, 255, 0]
+    y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
+
+    with self.test_session():
+      x = array_ops.placeholder(x_np.dtype, shape=x_shape)
+      y = self._adjust_saturation(x, saturation_factor)
+      y_tf = y.eval({x: x_np})
+      self.assertAllEqual(y_tf, y_np)
+
+  def _adjustSaturationNp(self, x_np, scale):
+    self.assertEqual(x_np.shape[-1], 3)
+    x_v = x_np.reshape([-1, 3])
+    y_v = np.ndarray(x_v.shape, dtype=x_v.dtype)
+    channel_count = x_v.shape[0]
+    for i in xrange(channel_count):
+      r = x_v[i][0]
+      g = x_v[i][1]
+      b = x_v[i][2]
+      h, s, v = colorsys.rgb_to_hsv(r, g, b)
+      s *= scale
+      s = min(1.0, max(0.0, s))
+      r, g, b = colorsys.hsv_to_rgb(h, s, v)
+      y_v[i][0] = r
+      y_v[i][1] = g
+      y_v[i][2] = b
+    return y_v.reshape(x_np.shape)
+
+  def testAdjustRandomSaturation(self):
+    x_shapes = [
+        [2, 2, 3],
+        [4, 2, 3],
+        [2, 4, 3],
+        [2, 5, 3],
+        [1000, 1, 3],
+    ]
+    test_styles = [
+        "all_random",
+        "rg_same",
+        "rb_same",
+        "gb_same",
+        "rgb_same",
+    ]
+    with self.test_session():
+      for x_shape in x_shapes:
+        for test_style in test_styles:
+          x_np = np.random.rand(*x_shape) * 255.
+          scale = np.random.rand()
+          if test_style == "all_random":
+            pass
+          elif test_style == "rg_same":
+            x_np[..., 1] = x_np[..., 0]
+          elif test_style == "rb_same":
+            x_np[..., 2] = x_np[..., 0]
+          elif test_style == "gb_same":
+            x_np[..., 2] = x_np[..., 1]
+          elif test_style == "rgb_same":
+            x_np[..., 1] = x_np[..., 0]
+            x_np[..., 2] = x_np[..., 0]
+          else:
+            raise AssertionError("Invalid test style: %s" % (test_style))
+          y_baseline = self._adjustSaturationNp(x_np, scale)
+          x = array_ops.placeholder(dtypes.float32, shape=x_shape)
+          with self.test_scope():
+            y_fused = self._adjust_saturation(x,
+                                              scale).eval(feed_dict={
+                                                  x: x_np
+                                              })
+          self.assertAllClose(y_fused, y_baseline, rtol=2e-5, atol=1e-5)
+
+
+class ResizeBilinearTest(XLATestCase):
+
+  def _assertForwardOpMatchesExpected(self,
+                                      image_np,
+                                      target_shape,
+                                      expected=None):
+    if expected is None:
+      self.fail("expected must be specified")
+    with self.test_session() as sess, self.test_scope():
+      image = array_ops.placeholder(image_np.dtype)
+      resized = gen_image_ops.resize_bilinear(
+          image, target_shape, align_corners=True)
+      out = sess.run(resized, {image: image_np[np.newaxis, :, :, np.newaxis]})
+      self.assertAllClose(expected[np.newaxis, :, :, np.newaxis], out)
+
+  def _assertBackwardOpMatchesExpected(self,
+                                       grads_np,
+                                       input_shape=None,
+                                       dtype=None,
+                                       expected=None):
+    if input_shape is None:
+      self.fail("input_shape must be specified")
+    if expected is None:
+      self.fail("expected must be specified")
+    with self.test_session() as sess, self.test_scope():
+      dtype = dtype or np.float32
+      grads = array_ops.placeholder(np.float32)
+      resized = gen_image_ops._resize_bilinear_grad(
+          grads,
+          np.zeros([1, input_shape[0], input_shape[1], 1], dtype=dtype),
+          align_corners=True)
+      out = sess.run(resized, {grads: grads_np[np.newaxis, :, :, np.newaxis]})
+      self.assertAllCloseAccordingToType(expected[np.newaxis, :, :, np.newaxis],
+                                         out)
+
+  def testAlignCorners1x2To3x2(self):
+    for dtype in self.float_types:
+      self._assertForwardOpMatchesExpected(
+          np.array([[1, 2]], dtype=dtype), [3, 3],
+          expected=np.array(
+              [[1, 1.5, 2], [1, 1.5, 2], [1, 1.5, 2]], dtype=np.float32))
+
+  def testAlignCorners1x2To3x2Grad(self):
+    for dtype in self.float_types:
+      self._assertBackwardOpMatchesExpected(
+          np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32),
+          input_shape=[1, 2],
+          dtype=dtype,
+          expected=np.array([[9, 12]], dtype=np.float32))
+
+  def testAlignCorners2x2To1x1(self):
+    for dtype in self.float_types:
+      self._assertForwardOpMatchesExpected(
+          np.array([[1, 2], [3, 4]], dtype=dtype), [1, 1],
+          expected=np.array([[1]], dtype=np.float32))
+
+  def testAlignCorners2x2To1x1Grad(self):
+    for dtype in self.float_types:
+      self._assertBackwardOpMatchesExpected(
+          np.array([[7]], dtype=np.float32),
+          input_shape=[2, 2],
+          dtype=dtype,
+          expected=np.array([[7, 0], [0, 0]], dtype=np.float32))
+
+  def testAlignCorners2x2To3x3(self):
+    for dtype in self.float_types:
+      self._assertForwardOpMatchesExpected(
+          np.array([[1, 2], [3, 4]], dtype=dtype), [3, 3],
+          expected=np.array(
+              [[1, 1.5, 2], [2, 2.5, 3], [3, 3.5, 4]], dtype=np.float32))
+
+  def testAlignCorners2x2To3x3Grad(self):
+    self._assertBackwardOpMatchesExpected(
+        np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32),
+        input_shape=[2, 2],
+        expected=np.array([[5.25, 8.25], [14.25, 17.25]], dtype=np.float32))
+
+  def testAlignCorners3x3To2x2(self):
+    for dtype in self.float_types:
+      self._assertForwardOpMatchesExpected(
+          np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=dtype), [2, 2],
+          expected=np.array([[1, 3], [7, 9]], dtype=np.float32))
+
+  def testAlignCorners3x3To2x2Grad(self):
+    for dtype in self.float_types:
+      self._assertBackwardOpMatchesExpected(
+          np.array([[7, 13], [22, 4]], dtype=np.float32),
+          input_shape=[3, 3],
+          dtype=dtype,
+          expected=np.array(
+              [[7, 0, 13], [0, 0, 0], [22, 0, 4]], dtype=np.float32))
+
+  def testAlignCorners4x4To3x3(self):
+    for dtype in self.float_types:
+      self._assertForwardOpMatchesExpected(
+          np.array(
+              [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]],
+              dtype=dtype), [3, 3],
+          expected=np.array(
+              [[1, 2.5, 4], [7, 8.5, 10], [13, 14.5, 16]], dtype=np.float32))
+
+  def testAlignCorners4x4To3x3Grad(self):
+    for dtype in self.float_types:
+      self._assertBackwardOpMatchesExpected(
+          np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32),
+          input_shape=[4, 4],
+          dtype=dtype,
+          expected=np.array(
+              [[1, 1, 1, 3], [2, 1.25, 1.25, 3], [2, 1.25, 1.25, 3],
+               [7, 4, 4, 9]],
+              dtype=np.float32))
+
+  def testAlignCorners3x3To9x9(self):
+    for dtype in self.float_types:
+      self._assertForwardOpMatchesExpected(
+          np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=dtype), [9, 9],
+          expected=np.array(
+              [[1.0, 1.25, 1.50, 1.75, 2.00, 2.25, 2.50, 2.75, 3.00], [
+                  1.75, 2.00, 2.25, 2.50, 2.75, 3.00, 3.25, 3.50, 3.75
+              ], [2.50, 2.75, 3.00, 3.25, 3.50, 3.75, 4.00, 4.25, 4.50], [
+                  3.25, 3.50, 3.75, 4.00, 4.25, 4.50, 4.75, 5.00, 5.25
+              ], [4.00, 4.25, 4.50, 4.75, 5.00, 5.25, 5.50, 5.75, 6.00], [
+                  4.75, 5.00, 5.25, 5.50, 5.75, 6.00, 6.25, 6.50, 6.75
+              ], [5.50, 5.75, 6.00, 6.25, 6.50, 6.75, 7.00, 7.25, 7.50], [
+                  6.25, 6.50, 6.75, 7.00, 7.25, 7.50, 7.75, 8.00, 8.25
+              ], [7.00, 7.25, 7.50, 7.75, 8.00, 8.25, 8.50, 8.75, 9.00]],
+              dtype=np.float32))
+
+  def testAlignCorners3x3To9x9Grad(self):
+    for dtype in self.float_types:
+      self._assertBackwardOpMatchesExpected(
+          np.array(
+              [[1.00, 1.25, 1.50, 1.75, 2.00, 2.25, 2.50, 2.75, 3.00], [
+                  1.75, 2.00, 2.25, 2.50, 2.75, 3.00, 3.25, 3.50, 3.75
+              ], [2.50, 2.75, 3.00, 3.25, 3.50, 3.75, 4.00, 4.25, 4.50], [
+                  3.25, 3.50, 3.75, 4.00, 4.25, 4.50, 4.75, 5.00, 5.25
+              ], [4.00, 4.25, 4.50, 4.75, 5.00, 5.25, 5.50, 5.75, 6.00], [
+                  4.75, 5.00, 5.25, 5.50, 5.75, 6.00, 6.25, 6.50, 6.75
+              ], [5.50, 5.75, 6.00, 6.25, 6.50, 6.75, 7.00, 7.25, 7.50], [
+                  6.25, 6.50, 6.75, 7.00, 7.25, 7.50, 7.75, 8.00, 8.25
+              ], [7.00, 7.25, 7.50, 7.75, 8.00, 8.25, 8.50, 8.75, 9.00]],
+              dtype=np.float32),
+          input_shape=[3, 3],
+          dtype=dtype,
+          expected=np.array(
+              [[12.5, 27.5, 21.875], [42.5, 80.0, 57.5], [40.625, 72.5, 50]],
+              dtype=np.float32))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/matrix_band_part_test.py b/tensorflow/compiler/tests/matrix_band_part_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..29394f9ea5139b30f88f53de0469b27e37d79195
--- /dev/null
+++ b/tensorflow/compiler/tests/matrix_band_part_test.py
@@ -0,0 +1,64 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class MatrixBandPartTest(XLATestCase):
+
+  def _testMatrixBandPart(self, dtype, shape):
+    with self.test_session():
+      batch_shape = shape[:-2]
+      mat = np.ones(shape).astype(dtype)
+      batch_mat = np.tile(mat, batch_shape + [1, 1])
+      for lower in -1, 0, 1, shape[-2] - 1:
+        for upper in -1, 0, 1, shape[-1] - 1:
+          band_np = mat
+          if lower >= 0:
+            band_np = np.triu(band_np, -lower)
+          if upper >= 0:
+            band_np = np.tril(band_np, upper)
+          if batch_shape:
+            band_np = np.tile(band_np, batch_shape + [1, 1])
+
+          placeholder = array_ops.placeholder(dtype)
+          with self.test_scope():
+            band = array_ops.matrix_band_part(
+                placeholder,
+                constant_op.constant(lower, dtype=dtypes.int32),
+                constant_op.constant(upper, dtype=dtypes.int32))
+            feed_dict = {placeholder: batch_mat}
+            self.assertAllEqual(band_np, band.eval(feed_dict=feed_dict))
+
+  def testMatrixBandPart(self):
+    for dtype in self.float_types:
+      for batch_shape in [[], [2,], [1, 3, 2]]:
+        for rows in 1, 2, 7:
+          for cols in 1, 2, 7:
+            self._testMatrixBandPart(dtype, batch_shape + [rows, cols])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..cccb7f5789dce39ef8c3d4b3a7573aaa983b3fbd
--- /dev/null
+++ b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
@@ -0,0 +1,130 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.tf.MatrixTriangularSolve."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+def MakePlaceholder(x):
+  return array_ops.placeholder(dtypes.as_dtype(x.dtype), shape=x.shape)
+
+
+class MatrixTriangularSolveOpTest(XLATestCase):
+
+  def _VerifyTriangularSolveBase(self, sess, placeholder_a, placeholder_ca,
+                                 placeholder_b, a, clean_a, b, verification,
+                                 atol):
+    feed_dict = {placeholder_a: a, placeholder_ca: clean_a, placeholder_b: b}
+    verification_np = sess.run(verification, feed_dict)
+    self.assertAllClose(b, verification_np, atol=atol)
+
+  def _VerifyTriangularSolve(self, a, b, lower, adjoint, atol):
+    clean_a = np.tril(a) if lower else np.triu(a)
+    with self.test_session() as sess:
+      placeholder_a = MakePlaceholder(a)
+      placeholder_ca = MakePlaceholder(clean_a)
+      placeholder_b = MakePlaceholder(b)
+      with self.test_scope():
+        x = linalg_ops.matrix_triangular_solve(
+            placeholder_a, placeholder_b, lower=lower, adjoint=adjoint)
+      verification = math_ops.matmul(placeholder_ca, x, adjoint_a=adjoint)
+      self._VerifyTriangularSolveBase(sess, placeholder_a, placeholder_ca,
+                                      placeholder_b, a, clean_a, b,
+                                      verification, atol)
+
+  def _VerifyTriangularSolveCombo(self, a, b, atol=1e-4):
+    transp = lambda x: np.swapaxes(x, -1, -2)
+    for lower, adjoint in itertools.product([True, False], repeat=2):
+      self._VerifyTriangularSolve(
+          a if lower else transp(a), b, lower, adjoint, atol)
+
+  def testBasic(self):
+    rng = np.random.RandomState(0)
+    a = np.tril(rng.randn(5, 5))
+    b = rng.randn(5, 7)
+    for dtype in self.float_types:
+      self._VerifyTriangularSolveCombo(a.astype(dtype), b.astype(dtype))
+
+  def testBasicNotActuallyTriangular(self):
+    rng = np.random.RandomState(0)
+    a = rng.randn(5, 5)  # the `a` matrix is not lower-triangular
+    b = rng.randn(5, 7)
+    for dtype in self.float_types:
+      self._VerifyTriangularSolveCombo(a.astype(dtype), b.astype(dtype))
+
+  def testBasicComplexDtypes(self):
+    rng = np.random.RandomState(0)
+    a = np.tril(rng.randn(5, 5) + rng.randn(5, 5) * 1j)
+    b = rng.randn(5, 7) + rng.randn(5, 7) * 1j
+    for dtype in self.complex_types:
+      self._VerifyTriangularSolveCombo(a.astype(dtype), b.astype(dtype))
+
+  def testBatch(self):
+    rng = np.random.RandomState(0)
+    shapes = [((4, 3, 3), (4, 3, 5)), ((1, 2, 2), (1, 2, 1)),
+              ((1, 1, 1), (1, 1, 2)), ((2, 3, 4, 4), (2, 3, 4, 1))]
+    tuples = itertools.product(self.float_types, shapes)
+    for dtype, (a_shape, b_shape) in tuples:
+      n = a_shape[-1]
+      a = np.tril(rng.rand(*a_shape) - 0.5) / (2.0 * n) + np.eye(n)
+      b = rng.randn(*b_shape)
+      self._VerifyTriangularSolveCombo(
+          a.astype(dtype), b.astype(dtype), atol=1e-3)
+
+  def testLarge(self):
+    n = 1024
+    rng = np.random.RandomState(0)
+    a = np.tril(rng.rand(n, n) - 0.5) / (2.0 * n) + np.eye(n)
+    b = rng.randn(n, n)
+    self._VerifyTriangularSolve(
+        a.astype(np.float32), b.astype(np.float32), True, False, 1e-4)
+
+  def testNonSquareCoefficientMatrix(self):
+    rng = np.random.RandomState(0)
+    for dtype in self.float_types:
+      a = rng.randn(3, 4).astype(dtype)
+      b = rng.randn(4, 4).astype(dtype)
+      with self.assertRaises(ValueError):
+        linalg_ops.matrix_triangular_solve(a, b)
+      with self.assertRaises(ValueError):
+        linalg_ops.matrix_triangular_solve(a, b)
+
+  def testWrongDimensions(self):
+    randn = np.random.RandomState(0).randn
+    for dtype in self.float_types:
+      lhs = constant_op.constant(randn(3, 3), dtype=dtype)
+      rhs = constant_op.constant(randn(4, 3), dtype=dtype)
+      with self.assertRaises(ValueError):
+        linalg_ops.matrix_triangular_solve(lhs, rhs)
+      with self.assertRaises(ValueError):
+        linalg_ops.matrix_triangular_solve(lhs, rhs)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/momentum_test.py b/tensorflow/compiler/tests/momentum_test.py
index c00e3035a0982b2b2e59eb6f53499918515ae71d..af9394e7d7dc9cf7dd009420ff9c845aec8785bd 100644
--- a/tensorflow/compiler/tests/momentum_test.py
+++ b/tensorflow/compiler/tests/momentum_test.py
@@ -96,28 +96,27 @@ class MomentumOptimizerTest(XLATestCase):
   def testNesterovMomentum(self):
     for dtype in self.float_types:
       with self.test_session(), self.test_scope():
-        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
-        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
-        var0_np = np.array([1.0, 2.0], dtype=dtype)
-        var1_np = np.array([3.0, 4.0], dtype=dtype)
+        var0 = resource_variable_ops.ResourceVariable([0.1, 0.2], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([0.3, 0.4], dtype=dtype)
+        var0_np = np.array([0.1, 0.2], dtype=dtype)
+        var1_np = np.array([0.3, 0.4], dtype=dtype)
         accum0_np = np.array([0.0, 0.0], dtype=dtype)
         accum1_np = np.array([0.0, 0.0], dtype=dtype)
-        cost = 5 * var0 * var0 + 3 * var1
+        cost = 0.4 * var0 * var0 + 0.9 * var1
         global_step = resource_variable_ops.ResourceVariable(
             array_ops.zeros([], dtypes.int32), name="global_step")
         mom_op = momentum_lib.MomentumOptimizer(
-            learning_rate=2.0, momentum=0.9, use_nesterov=True)
+            learning_rate=0.1, momentum=0.9, use_nesterov=True)
         opt_op = mom_op.minimize(cost, global_step, [var0, var1])
         variables.global_variables_initializer().run()
         for _ in range(1, 5):
           opt_op.run()
           var0_np, accum0_np = self._update_nesterov_momentum_numpy(
-              var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
-          var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
-                                                                    accum1_np,
-                                                                    3, 2.0, 0.9)
-          self.assertAllClose(var0_np, var0.eval())
-          self.assertAllClose(var1_np, var1.eval())
+              var0_np, accum0_np, var0_np * 0.8, 0.1, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(
+              var1_np, accum1_np, 0.9, 0.1, 0.9)
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
 
   def testTensorLearningRateAndMomentum(self):
     for dtype in self.float_types:
diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc
index 6a8c3bcd55a6e454a19b6249cf4eb48739c8657f..e72dd4eea9f127e1df96ab166103c4c16372adb6 100644
--- a/tensorflow/compiler/tests/randomized_tests.cc
+++ b/tensorflow/compiler/tests/randomized_tests.cc
@@ -93,11 +93,11 @@ class OpTestBuilder {
  public:
   explicit OpTestBuilder(const string& op_name);
 
-  // Adds an input 'tensor'.
+  // Adds an input 'tensor' as a Placeholder node.
   OpTestBuilder& Input(const Tensor& tensor);
 
-  // Adds a random input tensor with 'type'. If 'dims' is not provided,
-  // RandomDims() is used.
+  // Adds a random input tensor with 'type' as a Placeholder node.
+  // If 'dims' is not provided, RandomDims() is used.
   OpTestBuilder& RandomInput(DataType type);
   OpTestBuilder& RandomInput(DataType type, std::vector<int64> dims);
 
@@ -998,6 +998,13 @@ TEST_F(OpTest, Atanh) {
   });
 }
 
+TEST_F(OpTest, Atan) {
+  Repeatedly([this]() {
+    return ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("Atan").RandomInput(DT_FLOAT).Attr("T", DT_FLOAT));
+  });
+}
+
 TEST_F(OpTest, Atan2) {
   Repeatedly([this]() {
     auto dims = BroadcastableDims();
@@ -1368,6 +1375,121 @@ TEST_F(OpTest, Conj) {
   });
 }
 
+TEST_F(OpTest, FFT) {
+  Repeatedly([this]() {
+    std::vector<int64> dims = RandomDims(1, kDefaultMaxRank);
+    return ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("FFT").RandomInput(DT_COMPLEX64, dims));
+  });
+}
+
+TEST_F(OpTest, FFT2D) {
+  Repeatedly([this]() {
+    std::vector<int64> dims = RandomDims(2, kDefaultMaxRank);
+    return ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("FFT2D").RandomInput(DT_COMPLEX64, dims));
+  });
+}
+
+TEST_F(OpTest, FFT3D) {
+  Repeatedly([this]() {
+    std::vector<int64> dims = RandomDims(3, kDefaultMaxRank);
+    return ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("FFT3D").RandomInput(DT_COMPLEX64, dims));
+  });
+}
+
+TEST_F(OpTest, IFFT) {
+  Repeatedly([this]() {
+    std::vector<int64> dims = RandomDims(1, kDefaultMaxRank);
+    return ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("IFFT").RandomInput(DT_COMPLEX64, dims));
+  });
+}
+
+TEST_F(OpTest, IFFT2D) {
+  Repeatedly([this]() {
+    std::vector<int64> dims = RandomDims(2, kDefaultMaxRank);
+    return ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("IFFT2D").RandomInput(DT_COMPLEX64, dims));
+  });
+}
+
+TEST_F(OpTest, IFFT3D) {
+  Repeatedly([this]() {
+    std::vector<int64> dims = RandomDims(3, kDefaultMaxRank);
+    return ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("IFFT3D").RandomInput(DT_COMPLEX64, dims));
+  });
+}
+
+TEST_F(OpTest, RFFT) {
+  Repeatedly([this]() {
+    std::vector<int64> dims = RandomDims(1, kDefaultMaxRank, 3);
+    Tensor fft_shape = test::AsTensor<int32>(AsInt32s({dims[dims.size() - 1]}));
+    return ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("RFFT").RandomInput(DT_FLOAT, dims).Input(fft_shape));
+  });
+}
+
+TEST_F(OpTest, RFFT2D) {
+  Repeatedly([this]() {
+    std::vector<int64> dims = RandomDims(2, kDefaultMaxRank, 3);
+    Tensor fft_shape = test::AsTensor<int32>(
+        AsInt32s({dims[dims.size() - 2], dims[dims.size() - 1]}));
+    return ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("RFFT2D").RandomInput(DT_FLOAT, dims).Input(fft_shape));
+  });
+}
+
+TEST_F(OpTest, RFFT3D) {
+  Repeatedly([this]() {
+    std::vector<int64> dims = RandomDims(3, kDefaultMaxRank, 3);
+    Tensor fft_shape = test::AsTensor<int32>(AsInt32s(
+        {dims[dims.size() - 3], dims[dims.size() - 2], dims[dims.size() - 1]}));
+    return ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("RFFT3D").RandomInput(DT_FLOAT, dims).Input(fft_shape));
+  });
+}
+
+TEST_F(OpTest, IRFFT) {
+  Repeatedly([this]() {
+    std::vector<int64> dims = RandomDims(1, kDefaultMaxRank, 3);
+    int64 orig_size = dims[dims.size() - 1];
+    dims[dims.size() - 1] = dims[dims.size() - 1] / 2 + 1;
+    Tensor fft_shape = test::AsTensor<int32>(AsInt32s({orig_size}));
+    return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("IRFFT")
+                                             .RandomInput(DT_COMPLEX64, dims)
+                                             .Input(fft_shape));
+  });
+}
+
+TEST_F(OpTest, IRFFT2D) {
+  Repeatedly([this]() {
+    std::vector<int64> dims = RandomDims(2, kDefaultMaxRank, 3);
+    std::vector<int64> orig_size = {dims[dims.size() - 2],
+                                    dims[dims.size() - 1]};
+    dims[dims.size() - 1] = dims[dims.size() - 1] / 2 + 1;
+    Tensor fft_shape = test::AsTensor<int32>(AsInt32s({orig_size}));
+    return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("IRFFT2D")
+                                             .RandomInput(DT_COMPLEX64, dims)
+                                             .Input(fft_shape));
+  });
+}
+
+TEST_F(OpTest, IRFFT3D) {
+  Repeatedly([this]() {
+    std::vector<int64> dims = RandomDims(3, kDefaultMaxRank, 3);
+    std::vector<int64> orig_size = {
+        dims[dims.size() - 3], dims[dims.size() - 2], dims[dims.size() - 1]};
+    dims[dims.size() - 1] = dims[dims.size() - 1] / 2 + 1;
+    Tensor fft_shape = test::AsTensor<int32>(AsInt32s({orig_size}));
+    return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("IRFFT3D")
+                                             .RandomInput(DT_COMPLEX64, dims)
+                                             .Input(fft_shape));
+  });
+}
+
 TEST_F(OpTest, Conv2D) {
   Repeatedly([this]() {
     WindowedSpatialDims d = ChooseWindowedSpatialDims(2);
@@ -1382,7 +1504,7 @@ TEST_F(OpTest, Conv2D) {
 
     std::vector<int64> kernel_dims = {d.kernel_dims[0], d.kernel_dims[1],
                                       features_in, features_out};
-    DataType type = DT_FLOAT;  // TODO(b/65408531): COMPLEX_64 support
+    DataType type = DT_FLOAT;
     return ExpectTfAndXlaOutputsAreClose(
         OpTestBuilder("Conv2D")
             .RandomInput(type, data_dims)
@@ -1407,7 +1529,7 @@ TEST_F(OpTest, Conv2DBackpropFilter) {
         ImageDims(FORMAT_NHWC, batch, features_out, d.output_dims);
     Tensor kernel_shape = test::AsTensor<int32>(AsInt32s(
         {d.kernel_dims[0], d.kernel_dims[1], features_in, features_out}));
-    DataType type = DT_FLOAT;  // TODO(b/65408531): COMPLEX_64 support
+    DataType type = DT_FLOAT;
     return ExpectTfAndXlaOutputsAreClose(
         OpTestBuilder("Conv2DBackpropFilter")
             .RandomInput(type, activations)
@@ -1433,7 +1555,7 @@ TEST_F(OpTest, Conv2DBackpropInput) {
         ImageDims(FORMAT_NHWC, batch, features_out, d.output_dims);
     std::vector<int64> kernel = {d.kernel_dims[0], d.kernel_dims[1],
                                  features_in, features_out};
-    DataType type = DT_FLOAT;  // TODO(b/65408531): COMPLEX_64 support
+    DataType type = DT_FLOAT;
     return ExpectTfAndXlaOutputsAreClose(
         OpTestBuilder("Conv2DBackpropInput")
             .Input(in_shape)
@@ -1457,7 +1579,7 @@ TEST_F(OpTest, Conv3D) {
 
     std::vector<int64> kernel = {d.kernel_dims[0], d.kernel_dims[1],
                                  d.kernel_dims[2], features_in, features_out};
-    DataType type = DT_FLOAT;  // TODO(b/65408531): COMPLEX_64 support
+    DataType type = DT_FLOAT;
     return ExpectTfAndXlaOutputsAreClose(
         OpTestBuilder("Conv3D")
             .RandomInput(type, data)
@@ -1482,7 +1604,7 @@ TEST_F(OpTest, Conv3DBackpropFilter) {
     Tensor kernel_shape = test::AsTensor<int32>(
         AsInt32s({d.kernel_dims[0], d.kernel_dims[1], d.kernel_dims[2],
                   features_in, features_out}));
-    DataType type = DT_FLOAT;  // TODO(b/65408531): COMPLEX_64 support
+    DataType type = DT_FLOAT;
     return ExpectTfAndXlaOutputsAreClose(
         OpTestBuilder("Conv3DBackpropFilterV2")
             .RandomInput(type, activations)
@@ -2460,6 +2582,36 @@ TEST_F(OpTest, Reshape) {
   });
 }
 
+TEST_F(OpTest, ResizeBilinear) {
+  Repeatedly([this]() {
+    std::vector<int64> in_dims = RandomDims(4, 4);
+    std::vector<int64> out_dims = RandomDims(2, 2);
+
+    return ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("ResizeBilinear")
+            .RandomInput(DT_FLOAT, in_dims)
+            .Input(test::AsTensor<int32>(
+                std::vector<int32>(out_dims.begin(), out_dims.end())))
+            .Attr("T", DT_FLOAT)
+            .Attr("align_corners", true));
+  });
+}
+
+TEST_F(OpTest, ResizeBilinearGrad) {
+  Repeatedly([this]() {
+    std::vector<int64> in_dims = RandomDims(4, 4);
+    std::vector<int64> out_dims = RandomDims(2, 2);
+
+    return ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("ResizeBilinearGrad")
+            .RandomInput(DT_FLOAT, in_dims)
+            .RandomInput(DT_FLOAT,
+                         {in_dims[0], out_dims[0], out_dims[1], in_dims[3]})
+            .Attr("T", DT_FLOAT)
+            .Attr("align_corners", true));
+  });
+}
+
 TEST_F(OpTest, Reverse) {
   Repeatedly([this]() {
     std::vector<int64> dims = RandomDims(1);
diff --git a/tensorflow/compiler/tests/reverse_sequence_op_test.py b/tensorflow/compiler/tests/reverse_sequence_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a5d05094e53cfecd9476d7d87f023e8a02d7458
--- /dev/null
+++ b/tensorflow/compiler/tests/reverse_sequence_op_test.py
@@ -0,0 +1,93 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.reverse_sequence_op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class ReverseSequenceTest(XLATestCase):
+
+  def _testReverseSequence(self,
+                           x,
+                           batch_axis,
+                           seq_axis,
+                           seq_lengths,
+                           truth,
+                           expected_err_re=None):
+    with self.test_session():
+      p = array_ops.placeholder(dtypes.as_dtype(x.dtype))
+      lengths = array_ops.placeholder(dtypes.as_dtype(seq_lengths.dtype))
+      with self.test_scope():
+        ans = array_ops.reverse_sequence(
+            p, batch_axis=batch_axis, seq_axis=seq_axis, seq_lengths=lengths)
+      if expected_err_re is None:
+        tf_ans = ans.eval(feed_dict={p: x, lengths: seq_lengths})
+        self.assertAllClose(tf_ans, truth, atol=1e-10)
+      else:
+        with self.assertRaisesOpError(expected_err_re):
+          ans.eval(feed_dict={p: x, lengths: seq_lengths})
+
+  def testSimple(self):
+    x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int32)
+    expected = np.array([[1, 2, 3], [6, 5, 4], [8, 7, 9]], dtype=np.int32)
+    self._testReverseSequence(
+        x,
+        batch_axis=0,
+        seq_axis=1,
+        seq_lengths=np.array([1, 3, 2], np.int32),
+        truth=expected)
+
+  def _testBasic(self, dtype, len_dtype):
+    x = np.asarray(
+        [[[1, 2, 3, 4], [5, 6, 7, 8]], [[9, 10, 11, 12], [13, 14, 15, 16]],
+         [[17, 18, 19, 20], [21, 22, 23, 24]]],
+        dtype=dtype)
+    x = x.reshape(3, 2, 4, 1, 1)
+    x = x.transpose([2, 1, 0, 3, 4])  # permute axes 0 <=> 2
+
+    # reverse dim 2 up to (0:3, none, 0:4) along dim=0
+    seq_lengths = np.asarray([3, 0, 4], dtype=len_dtype)
+
+    truth_orig = np.asarray(
+        [
+            [[3, 2, 1, 4], [7, 6, 5, 8]],  # reverse 0:3
+            [[9, 10, 11, 12], [13, 14, 15, 16]],  # reverse none
+            [[20, 19, 18, 17], [24, 23, 22, 21]]
+        ],  # reverse 0:4 (all)
+        dtype=dtype)
+    truth_orig = truth_orig.reshape(3, 2, 4, 1, 1)
+    truth = truth_orig.transpose([2, 1, 0, 3, 4])  # permute axes 0 <=> 2
+
+    seq_axis = 0  # permute seq_axis and batch_axis (originally 2 and 0, resp.)
+    batch_axis = 2
+    self._testReverseSequence(x, batch_axis, seq_axis, seq_lengths, truth)
+
+  def testSeqLength(self):
+    for dtype in self.all_types:
+      for seq_dtype in self.int_types:
+        self._testBasic(dtype, seq_dtype)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/scan_ops_test.py b/tensorflow/compiler/tests/scan_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3260e63b23226d736a7ddc0f21a94a8c791e0442
--- /dev/null
+++ b/tensorflow/compiler/tests/scan_ops_test.py
@@ -0,0 +1,229 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for scan ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+def numpy_reverse(x, axis):
+  length = len(x.shape)
+  if axis < 0:
+    axis = length + axis
+
+  ix = [
+      slice(None, None, -1) if i == axis else slice(None) for i in range(length)
+  ]
+  return x[ix]
+
+
+def handle_options(func, x, axis, exclusive, reverse):
+  """Adds tf options to numpy scan ops."""
+  length = len(x.shape)
+  if axis < 0:
+    axis = length + axis
+
+  if reverse:
+    x = numpy_reverse(x, axis)
+
+  if exclusive:
+    ix_head = [slice(0, 1) if i == axis else slice(None) for i in range(length)]
+    ix_init = [
+        slice(0, -1) if i == axis else slice(None) for i in range(length)
+    ]
+    if func == np.cumsum:
+      init = np.zeros_like(x[ix_head])
+    elif func == np.cumprod:
+      init = np.ones_like(x[ix_head])
+    else:
+      raise ValueError("Unknown scan function.")
+    x = np.concatenate([init, func(x[ix_init], axis)], axis=axis)
+  else:
+    x = func(x, axis=axis)
+
+  if reverse:
+    x = numpy_reverse(x, axis)
+  return x
+
+
+class CumsumTest(XLATestCase):
+
+  valid_dtypes = [np.float32]
+
+  def axis_dtypes(self):
+    return set(self.int_types).intersection([np.int32, np.int64])
+
+  def _compare(self, x, axis, exclusive, reverse):
+    np_out = handle_options(np.cumsum, x, axis, exclusive, reverse)
+    with self.test_session(), self.test_scope():
+      p = array_ops.placeholder(x.dtype)
+      tf_out = math_ops.cumsum(p, axis, exclusive, reverse).eval(
+          feed_dict={p: x})
+
+    self.assertAllClose(np_out, tf_out)
+
+  def _compareAll(self, x, axis):
+    for exclusive in [True, False]:
+      for reverse in [True, False]:
+        self._compare(x, axis, exclusive, reverse)
+
+  def testEmpty(self):
+    for dtype in self.valid_dtypes:
+      x = np.zeros([0]).astype(dtype)
+      for axis in (-1, 0):
+        self._compareAll(x, axis)
+
+  def testAxisType(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 6).reshape([5]).astype(dtype)
+      for axis_dtype in self.axis_dtypes():
+        with self.test_session(), self.test_scope():
+          p = array_ops.placeholder(x.dtype)
+          axis = constant_op.constant(0, axis_dtype)
+          math_ops.cumsum(p, axis).eval(feed_dict={p: x})
+
+  def test1D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 6).reshape([5]).astype(dtype)
+      for axis in (-1, 0):
+        self._compareAll(x, axis)
+
+  def test2D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(0, 10).reshape([2, 5]).astype(dtype)
+      for axis in (-2, -1, 0, 1):
+        self._compareAll(x, axis)
+
+  def test3D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(0, 20).reshape([2, 2, 5]).astype(dtype)
+      for axis in (-3, -2, -1, 0, 1, 2):
+        self._compareAll(x, axis)
+
+  def test6D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 145).reshape([2, 2, 3, 3, 2, 2]).astype(dtype)
+      for axis in range(-6, 6, 3):
+        self._compareAll(x, axis)
+
+  def testInvalidAxis(self):
+    x = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
+    with self.test_session(), self.test_scope():
+      input_tensor = ops.convert_to_tensor(x)
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          lambda e: "Expected scan axis in the range [-2, 2)" in str(e)):
+        math_ops.cumsum(input_tensor, -3).eval()
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          lambda e: "Expected scan axis in the range [-2, 2)" in str(e)):
+        math_ops.cumsum(input_tensor, 2).eval()
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          lambda e: "axis must be a scalar" in str(e)):
+        math_ops.cumsum(input_tensor, [0]).eval()
+
+
+class CumprodTest(XLATestCase):
+
+  valid_dtypes = [np.float32]
+
+  def axis_dtypes(self):
+    return set(self.int_types).intersection([np.int32, np.int64])
+
+  def _compare(self, x, axis, exclusive, reverse):
+    np_out = handle_options(np.cumprod, x, axis, exclusive, reverse)
+    with self.test_session(), self.test_scope():
+      p = array_ops.placeholder(x.dtype)
+      prod = math_ops.cumprod(p, axis, exclusive, reverse)
+      tf_out = prod.eval(feed_dict={p: x})
+
+    self.assertAllClose(np_out, tf_out)
+
+  def _compareAll(self, x, axis):
+    for exclusive in [True, False]:
+      for reverse in [True, False]:
+        self._compare(x, axis, exclusive, reverse)
+
+  def testEmpty(self):
+    for dtype in self.valid_dtypes:
+      x = np.zeros([0]).astype(dtype)
+      for axis in (-1, 0):
+        self._compareAll(x, axis)
+
+  def testAxisType(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 6).reshape([5]).astype(dtype)
+      for axis_dtype in self.axis_dtypes():
+        with self.test_session(), self.test_scope():
+          p = array_ops.placeholder(x.dtype)
+          axis = constant_op.constant(0, axis_dtype)
+          math_ops.cumprod(x, axis).eval(feed_dict={p: x})
+
+  def test1D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 6).reshape([5]).astype(dtype)
+      for axis in (-1, 0):
+        self._compareAll(x, axis)
+
+  def test2D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 11).reshape([2, 5]).astype(dtype)
+      for axis in (-2, -1, 0, 1):
+        self._compareAll(x, axis)
+
+  def test3D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 21).reshape([2, 2, 5]).astype(dtype)
+      for axis in (-3, -2, -1, 0, 1, 2):
+        self._compareAll(x, axis)
+
+  def test6D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 145).reshape([2, 2, 3, 3, 2, 2]).astype(dtype)
+      for axis in range(-6, 6, 3):
+        self._compareAll(x, axis)
+
+  def testInvalidAxis(self):
+    x = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
+    with self.test_session(), self.test_scope():
+      input_tensor = ops.convert_to_tensor(x)
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          lambda e: "Expected scan axis in the range [-2, 2)" in str(e)):
+        math_ops.cumprod(input_tensor, -3).eval()
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          lambda e: "Expected scan axis in the range [-2, 2)" in str(e)):
+        math_ops.cumprod(input_tensor, 2).eval()
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          lambda e: "axis must be a scalar" in str(e)):
+        math_ops.cumprod(input_tensor, [0]).eval()
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/scatter_nd_op_test.py b/tensorflow/compiler/tests/scatter_nd_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..638946e234daf28dc4a34e6c33fc0f78b8e8699b
--- /dev/null
+++ b/tensorflow/compiler/tests/scatter_nd_op_test.py
@@ -0,0 +1,188 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.tf.scatter_nd."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+def _AsType(v, vtype):
+  return v.astype(vtype) if isinstance(v, np.ndarray) else vtype(v)
+
+
+def _FlatInnerDims(tensor, ndims=2):
+  shape = list(tensor.shape)
+  return tensor.reshape(
+      [functools.reduce(lambda x, y: x * y, shape[:-ndims + 1], 1)] +
+      shape[-ndims + 1:])
+
+
+def _FlatOuterDims(tensor, ndims=2):
+  shape = list(tensor.shape)
+  return tensor.reshape(
+      shape[:ndims - 1] +
+      [functools.reduce(lambda x, y: x * y, shape[ndims - 1:], 1)])
+
+
+def _NumpyScatterNd(ref, indices, updates, op):
+  ixdim = indices.shape[-1]
+  num_updates = indices.size // ixdim
+  total_nd = len(ref.shape)
+  slice_size = 1
+  for i in range(ixdim, total_nd):
+    slice_size *= ref.shape[i]
+  flat_indices = _FlatInnerDims(indices)
+  flat_updates = updates.reshape((num_updates, slice_size))
+  output_flat = _FlatOuterDims(ref, ixdim + 1)
+  for ix_updates, ix_output in enumerate(flat_indices):
+    ix_output = tuple(ix_output)
+    output_flat[ix_output] = op(output_flat[ix_output],
+                                flat_updates[ix_updates])
+  return output_flat.reshape(ref.shape)
+
+
+def _NumpyUpdate(indices, updates, shape):
+  ref = np.zeros(shape, dtype=updates.dtype)
+  return _NumpyScatterNd(ref, indices, updates, lambda p, u: u)
+
+
+class ScatterNdTest(XLATestCase):
+
+  def _VariableRankTest(self,
+                        np_scatter,
+                        tf_scatter,
+                        vtype,
+                        itype,
+                        repeat_indices=False):
+    np.random.seed(8)
+    ref_shapes = [(3, 6), (3, 6), (3, 6, 9), (3, 6, 9), (3, 6, 9), (3, 6, 9)]
+    indices_shapes = [(2,), (2, 2), (2,), (2, 2), (2, 3), (2, 3, 3)]
+    for ref_shape, indices_shape in zip(ref_shapes, indices_shapes):
+      num_updates = indices_shape[0]
+      ixdim = indices_shape[-1]
+
+      indexable_area_shape = ()
+      for i in range(ixdim):
+        indexable_area_shape += (ref_shape[i],)
+      all_indices = [
+          list(coord)
+          for coord, _ in np.ndenumerate(np.empty(indexable_area_shape, vtype))
+      ]
+      np.random.shuffle(all_indices)
+      indices = np.array(all_indices[:num_updates])
+
+      if num_updates > 1 and repeat_indices:
+        indices = indices[:num_updates // 2]
+        for _ in range(num_updates - num_updates // 2):
+          indices = np.append(
+              indices, [indices[np.random.randint(num_updates // 2)]], axis=0)
+        np.random.shuffle(indices)
+      indices = _AsType(indices[:num_updates], itype)
+
+      updates_shape = (num_updates,)
+      for i in range(ixdim, len(ref_shape)):
+        updates_shape += (ref_shape[i],)
+      updates = _AsType(np.random.randn(*(updates_shape)), vtype)
+
+      # Scatter via numpy
+      np_out = np_scatter(indices, updates, ref_shape)
+      # Scatter via tensorflow
+      tf_out = tf_scatter(indices, updates, ref_shape)
+
+      self.assertAllClose(np_out, tf_out)
+
+  def _VariableRankTests(self, np_scatter, tf_scatter):
+    for vtype in self.numeric_types:
+      for itype in set([np.int32, np.int64]).intersection(set(self.int_types)):
+        self._VariableRankTest(np_scatter, tf_scatter, vtype, itype)
+
+  def _runScatterNd(self, indices, updates, shape):
+    with self.test_session():
+      updates_placeholder = array_ops.placeholder(updates.dtype)
+      indices_placeholder = array_ops.placeholder(indices.dtype)
+      with self.test_scope():
+        output = array_ops.scatter_nd(indices_placeholder, updates_placeholder,
+                                      shape)
+      feed_dict = {updates_placeholder: updates, indices_placeholder: indices}
+      return output.eval(feed_dict=feed_dict)
+
+  def testSimple(self):
+    indices = np.array([[4], [3], [1], [7]], dtype=np.int32)
+    updates = np.array([9, 10, 11, 12], dtype=np.float32)
+    expected = np.array([0, 11, 0, 10, 9, 0, 0, 12], dtype=np.int32)
+    self.assertAllEqual(expected, self._runScatterNd(indices, updates, [8]))
+
+  def testSimple2(self):
+    indices = np.array([[1, 0], [1, 1]], dtype=np.int32)
+    updates = np.array([11., 12.], dtype=np.float32)
+    expected = np.array([[0., 0.], [11., 12.], [0., 0.]], dtype=np.float32)
+    self.assertAllEqual(expected, self._runScatterNd(indices, updates, [3, 2]))
+
+  def testSimple3(self):
+    indices = np.array([[1]], dtype=np.int32)
+    updates = np.array([[11., 12.]], dtype=np.float32)
+    expected = np.array([[0., 0.], [11., 12.], [0., 0.]])
+    self.assertAllEqual(expected, self._runScatterNd(indices, updates, [3, 2]))
+
+  def testVariableRankUpdate(self):
+    self._VariableRankTests(_NumpyUpdate, self._runScatterNd)
+
+  def testExtraIndicesDimensions(self):
+    indices = np.zeros([1, 1, 2], np.int32)
+    updates = np.zeros([1, 1], np.int32)
+    expected = np.zeros([2, 2], dtype=np.int32)
+    self.assertAllEqual(expected, self._runScatterNd(indices, updates, [2, 2]))
+
+  def testRank3InvalidShape1(self):
+    indices = np.zeros([3, 2, 2], np.int32)
+    updates = np.zeros([2, 2, 2], np.int32)
+    with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
+                                             "Must have updates.shape"):
+      self._runScatterNd(indices, updates, [2, 2, 2])
+
+  def testRank3InvalidShape2(self):
+    indices = np.zeros([2, 2, 1], np.int32)
+    updates = np.zeros([2, 2], np.int32)
+    with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
+                                             "Must have updates.shape"):
+      self._runScatterNd(indices, updates, [2, 2, 2])
+
+  def testScatterOutOfRange(self):
+    updates = np.array([-3, -4, -5]).astype(np.float32)
+
+    # Indices all in range, no problem.
+    indices = np.array([[2], [0], [5]], dtype=np.int32)
+    self._runScatterNd(indices, updates, [6])
+
+    # Indices out of range should not fail. It produces implementation-defined
+    # output.
+    indices = np.array([[-1], [0], [5]], dtype=np.int32)
+    self._runScatterNd(indices, updates, [6])
+    indices = np.array([[2], [0], [6]], dtype=np.int32)
+    self._runScatterNd(indices, updates, [6])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/segment_reduction_ops_test.py b/tensorflow/compiler/tests/segment_reduction_ops_test.py
index 260a04421b62310c109d8f0ea72875a50c234bb0..23bc39cf3f7087424719edfb8b6ee35d87295534 100644
--- a/tensorflow/compiler/tests/segment_reduction_ops_test.py
+++ b/tensorflow/compiler/tests/segment_reduction_ops_test.py
@@ -60,6 +60,14 @@ class SegmentReductionOpsTest(XLATestCase):
               np.array([0, 1, 2, 3, 4, 5], dtype=dtype),
               np.array([3, 0, 2, 1, 3, 3], dtype=np.int32), 4))
 
+  def testUnsortedSegmentSum1DIndices1DDataNegativeIndices(self):
+    for dtype in self.numeric_types:
+      self.assertAllClose(
+          np.array([0, 3, 2, 5], dtype=dtype),
+          self.UnsortedSegmentSum(
+              np.array([0, 1, 2, 3, 4, 5], dtype=dtype),
+              np.array([3, -1, 2, 1, -1, 3], dtype=np.int32), 4))
+
   def testUnsortedSegmentSum1DIndices2DDataDisjoint(self):
     for dtype in self.numeric_types:
       data = np.array(
diff --git a/tensorflow/compiler/tests/tensor_array_ops_test.py b/tensorflow/compiler/tests/tensor_array_ops_test.py
index ac039e01623b954e291760fb9b50ef8eae3da7c1..a62925a1818da00cb0a9e82e1281db20fb38b208 100644
--- a/tensorflow/compiler/tests/tensor_array_ops_test.py
+++ b/tensorflow/compiler/tests/tensor_array_ops_test.py
@@ -330,8 +330,7 @@ class TensorArrayTest(xla_test.XLATestCase):
     # Find two different floating point types, create an array of
     # the first type, but try to read the other type.
     if len(self.float_types) > 1:
-      dtype1 = self.float_types[0]
-      dtype2 = self.float_types[1]
+      dtype1, dtype2 = list(self.float_types)[:2]
       with self.test_session(), self.test_scope():
         ta = tensor_array_ops.TensorArray(
             dtype=dtype1, tensor_array_name="foo", size=3)
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index a9a3f4f97f649260e9863fff8ff05d046bd91947..3d3e112f4821ea8e57ea9589a5b4433647ad294b 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -33,6 +33,17 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import googletest
 
 
+def nhwc_to_format(x, data_format):
+  """Converts a numpy array from NHWC format to `data_format`."""
+  rank = len(x.shape)
+  if data_format == "NCHW":
+    return np.transpose(x, [0, rank - 1] + list(range(1, rank - 1)))
+  elif data_format == "NHWC":
+    return x
+  else:
+    raise ValueError("Unknown format {}".format(data_format))
+
+
 class UnaryOpsTest(XLATestCase):
   """Test cases for unary operators."""
 
@@ -56,8 +67,10 @@ class UnaryOpsTest(XLATestCase):
         output = op(pinp)
       result = session.run(output, {pinp: inp})
       if equality_test is None:
-        equality_test = self.assertAllClose
-      equality_test(result, expected, rtol=rtol, atol=atol)
+        self.assertAllCloseAccordingToType(
+            result, expected, rtol=rtol, atol=atol, bfloat16_rtol=0.03)
+      else:
+        equality_test(result, expected, rtol=rtol, atol=atol)
 
   def ListsAreClose(self, result, expected, rtol, atol):
     """Tests closeness of two lists of floats."""
@@ -76,6 +89,12 @@ class UnaryOpsTest(XLATestCase):
           array_ops.diag_part,
           np.arange(36).reshape([2, 3, 2, 3]).astype(dtype),
           np.array([[0, 7, 14], [21, 28, 35]], dtype=dtype))
+      self._assertOpOutputMatchesExpected(
+          array_ops.diag, np.array([[1, 2], [3, 4]], dtype=dtype),
+          np.array(
+              [[[[1, 0], [0, 0]], [[0, 2], [0, 0]]], [[[0, 0], [3, 0]],
+                                                      [[0, 0], [0, 4]]]],
+              dtype=dtype))
 
       self._assertOpOutputMatchesExpected(
           array_ops.identity,
@@ -86,6 +105,21 @@ class UnaryOpsTest(XLATestCase):
           array_ops.matrix_diag,
           np.array([[1, 2], [3, 4]], dtype=dtype),
           np.array([[[1, 0], [0, 2]], [[3, 0], [0, 4]]], dtype=dtype))
+      self._assertOpOutputMatchesExpected(
+          array_ops.matrix_diag, np.array([1, 2, 3, 4], dtype=dtype),
+          np.array(
+              [[1, 0, 0, 0], [0, 2, 0, 0], [0, 0, 3, 0], [0, 0, 0, 4]],
+              dtype=dtype))
+      self._assertOpOutputMatchesExpected(
+          array_ops.matrix_diag,
+          np.array(
+              [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=dtype),
+          np.array(
+              [[[[1, 0, 0], [0, 2, 0], [0, 0, 3]],
+                [[4, 0, 0], [0, 5, 0], [0, 0, 6]]],
+               [[[7, 0, 0], [0, 8, 0], [0, 0, 9]],
+                [[10, 0, 0], [0, 11, 0], [0, 0, 12]]]],
+              dtype=dtype))
       self._assertOpOutputMatchesExpected(
           array_ops.matrix_diag_part,
           np.arange(3 * 2 * 4).reshape([3, 2, 4]).astype(dtype),
@@ -120,6 +154,21 @@ class UnaryOpsTest(XLATestCase):
 
   def testFloatOps(self):
     for dtype in self.float_types:
+      x = np.arange(-0.90, 0.90, 0.25)
+      self._assertOpOutputMatchesExpected(
+          math_ops.acos,
+          x.astype(dtype),
+          expected=np.arccos(x).astype(dtype))
+      self._assertOpOutputMatchesExpected(
+          math_ops.asin,
+          x.astype(dtype),
+          expected=np.arcsin(x).astype(dtype))
+      x = np.arange(-3, 3).reshape(1, 3, 2)
+      self._assertOpOutputMatchesExpected(
+          math_ops.atan,
+          x.astype(dtype),
+          expected=np.arctan(x).astype(dtype))
+
       self._assertOpOutputMatchesExpected(
           math_ops.acosh,
           np.array([1, 2, 3, 4], dtype=dtype),
@@ -331,26 +380,23 @@ class UnaryOpsTest(XLATestCase):
   def testComplexOps(self):
     for dtype in self.complex_types:
 
-      # TODO(b/65408531): Wider support for log (needs atan2).
-      atan2_supported = self.device == "XLA_GPU"
-      if atan2_supported:
-        self._assertOpOutputMatchesExpected(
-            math_ops.acosh,
-            np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype),
-            expected=np.arccosh(
-                np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype)))
+      self._assertOpOutputMatchesExpected(
+          math_ops.acosh,
+          np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype),
+          expected=np.arccosh(
+              np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype)))
 
-        self._assertOpOutputMatchesExpected(
-            math_ops.asinh,
-            np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype),
-            expected=np.arcsinh(
-                np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype)))
+      self._assertOpOutputMatchesExpected(
+          math_ops.asinh,
+          np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype),
+          expected=np.arcsinh(
+              np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype)))
 
-        self._assertOpOutputMatchesExpected(
-            math_ops.atanh,
-            np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype),
-            expected=np.arctanh(
-                np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype)))
+      self._assertOpOutputMatchesExpected(
+          math_ops.atanh,
+          np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype),
+          expected=np.arctanh(
+              np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype)))
 
       self._assertOpOutputMatchesExpected(
           math_ops.cosh,
@@ -377,11 +423,10 @@ class UnaryOpsTest(XLATestCase):
           np.array([[1, 2j, 2 + 3j]], dtype=dtype),
           expected=1.0 / np.array([[1, 2j, 2 + 3j]], dtype=dtype))
 
-      if atan2_supported:
-        self._assertOpOutputMatchesExpected(
-            math_ops.log,
-            np.array([[5j, 3 - 2j]], dtype=dtype),
-            expected=np.log(np.array([[5j, 3 - 2j]], dtype=dtype)))
+      self._assertOpOutputMatchesExpected(
+          math_ops.log,
+          np.array([[5j, 3 - 2j]], dtype=dtype),
+          expected=np.log(np.array([[5j, 3 - 2j]], dtype=dtype)))
 
       self._assertOpOutputMatchesExpected(
           math_ops.sin,
@@ -395,27 +440,26 @@ class UnaryOpsTest(XLATestCase):
 
       # TODO(b/34703906): improve log1p implementation and make tolerance
       # tighter.
-      if atan2_supported:  # TODO(b/34703906): log support
-        self._assertOpOutputMatchesExpected(
-            math_ops.log1p,
-            np.array([[1e-14, 1e-15j, 0.6 - 0.3j]], dtype=dtype),
-            expected=np.log1p(
-                np.array([[1e-14, 1e-15j, 0.6 - 0.3j]], dtype=dtype)))
+      self._assertOpOutputMatchesExpected(
+          math_ops.log1p,
+          np.array([[1e-14, 1e-15j, 0.6 - 0.3j]], dtype=dtype),
+          expected=np.log1p(
+              np.array([[1e-14, 1e-15j, 0.6 - 0.3j]], dtype=dtype)))
 
-        val = np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype)
-        self._assertOpOutputMatchesExpected(
-            math_ops.rsqrt, val, expected=1 / np.sqrt(val))
+      val = np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype)
+      self._assertOpOutputMatchesExpected(
+          math_ops.rsqrt, val, expected=1 / np.sqrt(val))
 
-        self._assertOpOutputMatchesExpected(
-            math_ops.sigmoid, val, expected=1 / (1 + np.exp(-val)))
+      self._assertOpOutputMatchesExpected(
+          math_ops.sigmoid, val, expected=1 / (1 + np.exp(-val)))
 
-        self._assertOpOutputMatchesExpected(
-            math_ops.sqrt, val, expected=np.sqrt(val))
+      self._assertOpOutputMatchesExpected(
+          math_ops.sqrt, val, expected=np.sqrt(val))
 
-        self._assertOpOutputMatchesExpected(
-            math_ops.tanh,
-            np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype),
-            expected=np.tanh(np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype)))
+      self._assertOpOutputMatchesExpected(
+          math_ops.tanh,
+          np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype),
+          expected=np.tanh(np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype)))
 
       self._assertOpOutputMatchesExpected(
           math_ops.tan,
@@ -448,12 +492,10 @@ class UnaryOpsTest(XLATestCase):
           np.array([[-4j, 3 + 2j], [2, -1j]], dtype=dtype),
           expected=np.array([[1, 1], [1, 1]], dtype=dtype))
 
-      if atan2_supported:  # TODO(b/34703906): atan2 support
-        self._assertOpOutputMatchesExpected(
-            math_ops.angle,
-            np.array([1 + 3j, -4 + 7j, 2.7, -3j], dtype=dtype),
-            expected=np.angle(
-                np.array([1 + 3j, -4 + 7j, 2.7, -3j], dtype=dtype)))
+      self._assertOpOutputMatchesExpected(
+          math_ops.angle,
+          np.array([1 + 3j, -4 + 7j, 2.7, -3j], dtype=dtype),
+          expected=np.angle(np.array([1 + 3j, -4 + 7j, 2.7, -3j], dtype=dtype)))
 
       self._assertOpOutputMatchesExpected(
           math_ops.conj,
@@ -541,7 +583,8 @@ class UnaryOpsTest(XLATestCase):
 
   def testCast(self):
     shapes = [[], [4], [2, 3], [2, 0, 4]]
-    types = [dtypes.bool, dtypes.int32, dtypes.float32] + self.complex_tf_types
+    types = (set([dtypes.bool, dtypes.int32, dtypes.float32]) |
+             self.complex_tf_types)
     for shape in shapes:
       for src_type in types:
         for dst_type in types:
@@ -641,55 +684,88 @@ class UnaryOpsTest(XLATestCase):
         equality_test=self.ListsAreClose)
 
   def testDepthToSpace(self):
+    def make_op(data_format):
+      def op(x):
+        return array_ops.depth_to_space(x, block_size=2,
+                                        data_format=data_format)
+      return op
+
     for dtype in self.numeric_types:
-      self._assertOpOutputMatchesExpected(
-          lambda x: array_ops.depth_to_space(x, block_size=2),
-          np.array([[[[1, 2, 3, 4]]]], dtype=dtype),
-          expected=np.array([[[[1], [2]],
-                              [[3], [4]]]], dtype=dtype))
+      for data_format in ["NCHW", "NHWC"]:
+        self._assertOpOutputMatchesExpected(
+            make_op(data_format),
+            nhwc_to_format(np.array([[[[1, 2, 3, 4]]]], dtype=dtype),
+                           data_format),
+            expected=nhwc_to_format(np.array([[[[1], [2]],
+                                               [[3], [4]]]], dtype=dtype),
+                                    data_format))
 
-      self._assertOpOutputMatchesExpected(
-          lambda x: array_ops.depth_to_space(x, block_size=2),
-          np.array([[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]], dtype=dtype),
-          expected=np.array([[[[1, 2, 3], [4, 5, 6]],
-                              [[7, 8, 9], [10, 11, 12]]]], dtype=dtype))
+        self._assertOpOutputMatchesExpected(
+            make_op(data_format),
+            nhwc_to_format(
+                np.array([[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]],
+                         dtype=dtype),
+                data_format),
+            expected=nhwc_to_format(
+                np.array([[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]],
+                         dtype=dtype),
+                data_format))
 
-      self._assertOpOutputMatchesExpected(
-          lambda x: array_ops.depth_to_space(x, block_size=2),
-          np.array([[[[1, 2, 3, 4],
-                      [5, 6, 7, 8]],
-                     [[9, 10, 11, 12],
-                      [13, 14, 15, 16]]]], dtype=dtype),
-          expected=np.array([[[[1], [2], [5], [6]],
-                              [[3], [4], [7], [8]],
-                              [[9], [10], [13], [14]],
-                              [[11], [12], [15], [16]]]], dtype=dtype))
+        self._assertOpOutputMatchesExpected(
+            make_op(data_format),
+            nhwc_to_format(
+                np.array([[[[1, 2, 3, 4],
+                            [5, 6, 7, 8]],
+                           [[9, 10, 11, 12],
+                            [13, 14, 15, 16]]]], dtype=dtype),
+                data_format),
+            expected=nhwc_to_format(
+                np.array([[[[1], [2], [5], [6]],
+                           [[3], [4], [7], [8]],
+                           [[9], [10], [13], [14]],
+                           [[11], [12], [15], [16]]]], dtype=dtype),
+                data_format))
 
   def testSpaceToDepth(self):
+    def make_op(data_format):
+      def op(x):
+        return array_ops.space_to_depth(x, block_size=2,
+                                        data_format=data_format)
+      return op
+
     for dtype in self.numeric_types:
-      self._assertOpOutputMatchesExpected(
-          lambda x: array_ops.space_to_depth(x, block_size=2),
-          np.array([[[[1], [2]],
-                     [[3], [4]]]], dtype=dtype),
-          expected=np.array([[[[1, 2, 3, 4]]]], dtype=dtype))
+      for data_format in ["NCHW", "NHWC"]:
+        self._assertOpOutputMatchesExpected(
+            make_op(data_format),
+            nhwc_to_format(np.array([[[[1], [2]],
+                                      [[3], [4]]]], dtype=dtype),
+                           data_format),
+            expected=nhwc_to_format(np.array([[[[1, 2, 3, 4]]]], dtype=dtype),
+                                    data_format))
 
-      self._assertOpOutputMatchesExpected(
-          lambda x: array_ops.space_to_depth(x, block_size=2),
-          np.array([[[[1, 2, 3], [4, 5, 6]],
-                     [[7, 8, 9], [10, 11, 12]]]], dtype=dtype),
-          expected=np.array([[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]],
-                            dtype=dtype))
+        self._assertOpOutputMatchesExpected(
+            make_op(data_format),
+            nhwc_to_format(np.array([[[[1, 2, 3], [4, 5, 6]],
+                                      [[7, 8, 9], [10, 11, 12]]]], dtype=dtype),
+                           data_format),
+            expected=nhwc_to_format(
+                np.array([[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]],
+                         dtype=dtype),
+                data_format))
 
-      self._assertOpOutputMatchesExpected(
-          lambda x: array_ops.space_to_depth(x, block_size=2),
-          np.array([[[[1], [2], [5], [6]],
-                     [[3], [4], [7], [8]],
-                     [[9], [10], [13], [14]],
-                     [[11], [12], [15], [16]]]], dtype=dtype),
-          expected=np.array([[[[1, 2, 3, 4],
-                               [5, 6, 7, 8]],
-                              [[9, 10, 11, 12],
-                               [13, 14, 15, 16]]]], dtype=dtype))
+        self._assertOpOutputMatchesExpected(
+            make_op(data_format),
+            nhwc_to_format(np.array([[[[1], [2], [5], [6]],
+                                      [[3], [4], [7], [8]],
+                                      [[9], [10], [13], [14]],
+                                      [[11], [12], [15], [16]]]], dtype=dtype),
+                           data_format),
+            expected=nhwc_to_format(
+                np.array([[[[1, 2, 3, 4],
+                            [5, 6, 7, 8]],
+                           [[9, 10, 11, 12],
+                            [13, 14, 15, 16]]]], dtype=dtype),
+                data_format))
 
   def _assertSoftplusMatchesExpected(self, features, dtype):
     features = np.array(features, dtype=dtype)
diff --git a/tensorflow/compiler/tests/variable_ops_test.py b/tensorflow/compiler/tests/variable_ops_test.py
index c50342dee45eba6ae54f01653ecc81ef096b547b..b08d6ab21e0746558cb3d4818d4c822c45d2e9ee 100644
--- a/tensorflow/compiler/tests/variable_ops_test.py
+++ b/tensorflow/compiler/tests/variable_ops_test.py
@@ -107,11 +107,26 @@ class VariableOpsTest(XLATestCase):
                  [[[30, 31, 32], [33, 34, 35]], [[0, 1, 2], [3, 4, 5]]]],
             ).astype(dtype), sess.run(x))
 
+  def testShape(self):
+    for dtype in self.numeric_types:
+      init = np.ones([2, 3]).astype(dtype)
+      with self.test_session() as session, self.test_scope():
+        v = resource_variable_ops.ResourceVariable(init)
+        session.run(variables.variables_initializer([v]))
+        h = v.handle
+        s32, s64 = session.run([
+            resource_variable_ops.variable_shape(h),
+            resource_variable_ops.variable_shape(h, out_type=dtypes.int64)
+        ])
+        self.assertEqual(s32.dtype, np.int32)
+        self.assertEqual(s64.dtype, np.int64)
+        self.assertAllEqual(s32, [2, 3])
+        self.assertAllEqual(s64, [2, 3])
+
   def testReadWrite(self):
     """Tests initialization, reading, and writing a resource variable."""
     for dtype in self.numeric_types:
       with self.test_session() as session:
-        print(ops.get_default_graph())
         with self.test_scope():
           with variable_scope.variable_scope("ascope", use_resource=True):
             x = variable_scope.get_variable(
diff --git a/tensorflow/compiler/tests/xla_test.py b/tensorflow/compiler/tests/xla_test.py
index 0be127997e5211f810ca791187486760881fe172..7e1f5c76ed65946363cc3c113ab1a9862f87b289 100644
--- a/tensorflow/compiler/tests/xla_test.py
+++ b/tensorflow/compiler/tests/xla_test.py
@@ -53,41 +53,100 @@ class XLATestCase(test.TestCase):
     super(XLATestCase, self).__init__(method_name)
     self.device = FLAGS.test_device
     self.has_custom_call = (self.device == 'XLA_CPU')
-    self.all_tf_types = [
+    self._all_tf_types = set([
         dtypes.as_dtype(types_pb2.DataType.Value(name))
         for name in FLAGS.types.split(',')
-    ]
-    self.int_tf_types = [
-        dtype for dtype in self.all_tf_types if dtype.is_integer
-    ]
-    self.float_tf_types = [
-        dtype for dtype in self.all_tf_types if dtype.is_floating
-    ]
-    self.complex_tf_types = [
-        dtype for dtype in self.all_tf_types if dtype.is_complex
-    ]
-    self.numeric_tf_types = (
-        self.int_tf_types + self.float_tf_types + self.complex_tf_types)
-
-    self.all_types = [dtype.as_numpy_dtype for dtype in self.all_tf_types]
-    self.int_types = [dtype.as_numpy_dtype for dtype in self.int_tf_types]
-    self.float_types = [dtype.as_numpy_dtype for dtype in self.float_tf_types]
-    self.complex_types = [
+    ])
+    self.int_tf_types = set([
+        dtype for dtype in self._all_tf_types if dtype.is_integer
+    ])
+    self._float_tf_types = set([
+        dtype for dtype in self._all_tf_types if dtype.is_floating
+    ])
+    self.complex_tf_types = set([
+        dtype for dtype in self._all_tf_types if dtype.is_complex
+    ])
+    self._numeric_tf_types = set(
+        self.int_tf_types | self._float_tf_types | self.complex_tf_types)
+
+    self._all_types = set(
+        [dtype.as_numpy_dtype for dtype in self._all_tf_types])
+    self.int_types = set([dtype.as_numpy_dtype for dtype in self.int_tf_types])
+    self._float_types = set(
+        [dtype.as_numpy_dtype for dtype in self._float_tf_types])
+    self.complex_types = set([
         dtype.as_numpy_dtype for dtype in self.complex_tf_types
-    ]
-    self.numeric_types = self.int_types + self.float_types + self.complex_types
+    ])
+    self._numeric_types = set(
+        self.int_types | self._float_types | self.complex_types)
 
     # Parse the manifest file, if any, into a regex identifying tests to
     # disable
     self.disabled_regex = None
+    self._method_types_filter = dict()
+    # TODO(xpan): Make it text proto if it doesn't scale.
+    # Each line of the manifest file specifies an entry. The entry can be
+    # 1) TestNameRegex  // E.g. CumprodTest.* Or
+    # 2) TestName TypeName  // E.g. AdamOptimizerTest.testSharing DT_BFLOAT16
+    # The 1) disables the entire test. While 2) only filter some numeric types
+    # so that they are not used in those tests.
+
     if FLAGS.disabled_manifest is not None:
       comments_re = re.compile('#.*$')
       manifest_file = open(FLAGS.disabled_manifest, 'r')
-      lines = manifest_file.read().splitlines()
-      lines = [comments_re.sub('', l).strip() for l in lines]
-      self.disabled_regex = re.compile('|'.join(lines))
+      disabled_tests = []
+      disabled_method_types = []
+      for l in manifest_file.read().splitlines():
+        entry = comments_re.sub('', l).strip().split(' ')
+        if len(entry) == 1:
+          disabled_tests.append(entry[0])
+        elif len(entry) == 2:
+          disabled_method_types.append(
+              (entry[0], entry[1].strip().split(',')))
+        else:
+          raise ValueError('Bad entry in manifest file.')
+
+      self.disabled_regex = re.compile('|'.join(disabled_tests))
+      for method, types in disabled_method_types:
+        self._method_types_filter[method] = set([
+            dtypes.as_dtype(types_pb2.DataType.Value(name)).as_numpy_dtype
+            for name in types])
       manifest_file.close()
 
+  @property
+  def all_tf_types(self):
+    name = '{}.{}'.format(type(self).__name__, self._testMethodName)
+    tf_types = set([dtypes.as_dtype(t)
+                    for t in self._method_types_filter.get(name, set())])
+    return self._all_tf_types - tf_types
+
+  @property
+  def float_types(self):
+    name = '{}.{}'.format(type(self).__name__, self._testMethodName)
+    return self._float_types - self._method_types_filter.get(name, set())
+
+  @property
+  def float_tf_types(self):
+    name = '{}.{}'.format(type(self).__name__, self._testMethodName)
+    return self._float_tf_types - self._method_types_filter.get(name, set())
+
+  @property
+  def numeric_tf_types(self):
+    name = '{}.{}'.format(type(self).__name__, self._testMethodName)
+    tf_types = set([dtypes.as_dtype(t)
+                    for t in self._method_types_filter.get(name, set())])
+    return self._numeric_tf_types - tf_types
+
+  @property
+  def numeric_types(self):
+    name = '{}.{}'.format(type(self).__name__, self._testMethodName)
+    return self._numeric_types - self._method_types_filter.get(name, set())
+
+  @property
+  def all_types(self):
+    name = '{}.{}'.format(type(self).__name__, self._testMethodName)
+    return self._all_types - self._method_types_filter.get(name, set())
+
   def setUp(self):
     super(XLATestCase, self).setUp()
     name = '{}.{}'.format(type(self).__name__, self._testMethodName)
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index 5a81438b1c48e7f0ef66dae072092974db24c621..3c7dfef03dfb5d86dd63fd4aa84ad56081833035 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -1,6 +1,6 @@
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
 
 package_group(
     name = "internal",
@@ -25,6 +25,30 @@ package(
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
 load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
 
+cc_library(
+    name = "tf2xla_supported_ops_lib",
+    srcs = ["tf2xla_supported_ops.cc"],
+    hdrs = ["tf2xla_supported_ops.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":xla_compiler",
+        "//tensorflow/compiler/tf2xla/kernels:xla_cpu_only_ops",
+        "//tensorflow/compiler/tf2xla/kernels:xla_ops",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:ops",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_cc_binary(
+    name = "tf2xla_supported_ops",
+    srcs = ["tf2xla_supported_ops_main.cc"],
+    visibility = ["//visibility:public"],
+    deps = [":tf2xla_supported_ops_lib"],
+)
+
 xla_proto_library(
     name = "tf2xla_proto",
     srcs = ["tf2xla.proto"],
@@ -67,7 +91,6 @@ cc_library(
         # Keep dependencies to a minimum here; this library is used in every AOT
         # binary produced by tfcompile.
         "//tensorflow/compiler/aot:runtime",
-        "//tensorflow/compiler/tf2xla:xla_local_runtime_context",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/core:framework_lite",
     ],
@@ -97,18 +120,21 @@ cc_library(
 cc_library(
     name = "xla_compiler",
     srcs = [
+        "const_analysis.cc",
+        "graph_compiler.cc",
         "xla_compilation_device.cc",
         "xla_compiler.cc",
         "xla_context.cc",
         "xla_helpers.cc",
         "xla_op_kernel.cc",
         "xla_op_registry.cc",
-        "graph_compiler.cc",
+        "xla_resource.cc",
         "xla_cpu_backend.cc",
     ] + if_cuda_is_configured([
         "xla_gpu_backend.cc",
     ]),
     hdrs = [
+        "const_analysis.h",
         "graph_compiler.h",
         "xla_compilation_device.h",
         "xla_compiler.h",
@@ -116,11 +142,11 @@ cc_library(
         "xla_helpers.h",
         "xla_op_kernel.h",
         "xla_op_registry.h",
+        "xla_resource.h",
     ],
     visibility = [":friends"],
     deps = [
         ":common",
-        ":const_analysis",
         ":dump_graph",
         ":functionalize_control_flow",
         ":sharding_util",
@@ -180,6 +206,7 @@ cc_library(
     deps = [
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:sharding_builder",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
@@ -215,6 +242,7 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
     ],
@@ -328,28 +356,16 @@ tf_cc_test(
     ],
 )
 
-cc_library(
-    name = "const_analysis",
-    srcs = ["const_analysis.cc"],
-    hdrs = ["const_analysis.h"],
-    deps = [
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 tf_cc_test(
     name = "const_analysis_test",
     size = "small",
     srcs = ["const_analysis_test.cc"],
     deps = [
-        ":const_analysis",
+        ":xla_compiler",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:function_ops",
         "//tensorflow/cc:ops",
+        "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:ops",
         "//tensorflow/core:test",
@@ -357,13 +373,6 @@ tf_cc_test(
     ],
 )
 
-cc_library(
-    name = "xla_local_runtime_context",
-    hdrs = ["xla_local_runtime_context.h"],
-    visibility = ["//visibility:public"],
-    deps = ["//tensorflow/core:framework_lite"],
-)
-
 cc_library(
     name = "dump_graph",
     srcs = [
@@ -400,6 +409,7 @@ cc_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
     ],
 )
diff --git a/tensorflow/compiler/tf2xla/const_analysis.cc b/tensorflow/compiler/tf2xla/const_analysis.cc
index d57273d84442c17565a6ace1c29170a0f3ba583b..82923722c54d235716b9138d95a75a441df924ca 100644
--- a/tensorflow/compiler/tf2xla/const_analysis.cc
+++ b/tensorflow/compiler/tf2xla/const_analysis.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <unordered_map>
 #include <unordered_set>
 
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/graph/algorithm.h"
 
@@ -27,93 +28,18 @@ namespace tensorflow {
 // compile-time constants.
 Status BackwardsConstAnalysis(const Graph& g,
                               std::vector<bool>* compile_time_const_args) {
-  // TODO(phawkins): annotate these on the kernel registrations, rather than
-  // using a hard-coded list.
-  // (operator, argument) pairs that must be compile-time constants.
-  const std::unordered_multimap<string, string> compile_time_const_inputs = {
-      {"All", "reduction_indices"},
-      {"Any", "reduction_indices"},
-      {"ArgMin", "dimension"},
-      {"ArgMax", "dimension"},
-      {"AvgPoolGrad", "orig_input_shape"},
-      {"AvgPool3DGrad", "orig_input_shape"},
-      {"BatchToSpace", "crops"},
-      {"BatchToSpaceND", "block_shape"},
-      {"BatchToSpaceND", "crops"},
-      {"BroadcastArgs", "s0"},
-      {"BroadcastArgs", "s1"},
-      {"BroadcastGradientArgs", "s0"},
-      {"BroadcastGradientArgs", "s1"},
-      {"Concat", "concat_dim"},
-      {"ConcatV2", "axis"},
-      {"ConcatOffset", "concat_dim"},
-      {"ConcatOffset", "shape"},
-      {"Conv2DBackpropFilter", "filter_sizes"},
-      {"Conv2DBackpropInput", "input_sizes"},
-      {"Conv3DBackpropFilterV2", "filter_sizes"},
-      {"Conv3DBackpropInputV2", "input_sizes"},
-      {"DepthwiseConv2dNativeBackpropFilter", "filter_sizes"},
-      {"DepthwiseConv2dNativeBackpropInput", "input_sizes"},
-      {"DynamicStitch", "indices"},
-      {"ExpandDims", "dim"},
-      {"Fill", "dims"},
-      {"GatherV2", "axis"},
-      {"InvertPermutation", "x"},
-      {"LinSpace", "start"},
-      {"LinSpace", "stop"},
-      {"LinSpace", "num"},
-      {"Max", "reduction_indices"},
-      {"Mean", "reduction_indices"},
-      {"Min", "reduction_indices"},
-      {"OneHot", "depth"},
-      {"Pad", "paddings"},
-      {"PadV2", "paddings"},
-      {"MirrorPad", "paddings"},
-      {"Multinomial", "num_samples"},
-      {"Prod", "reduction_indices"},
-      {"RandomStandardNormal", "shape"},
-      {"RandomUniform", "shape"},
-      {"RandomUniformInt", "shape"},
-      {"Range", "start"},
-      {"Range", "limit"},
-      {"Range", "delta"},
-      {"Reshape", "shape"},
-      {"ResourceStridedSliceAssign", "begin"},
-      {"ResourceStridedSliceAssign", "end"},
-      {"ResourceStridedSliceAssign", "strides"},
-      {"Reverse", "dims"},
-      {"ReverseV2", "axis"},
-      {"Slice", "begin"},
-      {"Slice", "size"},
-      {"SpaceToBatch", "paddings"},
-      {"SpaceToBatchND", "block_shape"},
-      {"SpaceToBatchND", "paddings"},
-      {"Split", "split_dim"},
-      {"SplitV", "split_dim"},
-      {"SplitV", "size_splits"},
-      {"StackV2", "max_size"},
-      {"StridedSlice", "begin"},
-      {"StridedSlice", "end"},
-      {"StridedSlice", "strides"},
-      {"StridedSliceGrad", "shape"},
-      {"StridedSliceGrad", "begin"},
-      {"StridedSliceGrad", "end"},
-      {"StridedSliceGrad", "strides"},
-      {"Sum", "reduction_indices"},
-      {"TensorArrayV3", "size"},
-      {"TensorArraySplitV3", "lengths"},
-      {"Tile", "multiples"},
-      {"Transpose", "perm"}};
-
   // Operators that don't look at the data of their inputs, just the shapes.
   const std::unordered_set<string> metadata_ops = {
-      "Rank", "Shape", "ShapeN", "Size",
+      "Rank",
+      "Shape",
+      "ShapeN",
+      "Size",
   };
 
   Status status;
   std::unordered_set<Node*> must_be_const;
-  auto visit = [&status, &metadata_ops, &compile_time_const_inputs,
-                &must_be_const, compile_time_const_args](Node* node) {
+  auto visit = [&status, &metadata_ops, &must_be_const,
+                compile_time_const_args](Node* node) {
     if (!status.ok()) return;
 
     // If this is a metadata-only op, don't propagate the const requirement.
@@ -136,16 +62,17 @@ Status BackwardsConstAnalysis(const Graph& g,
     }
 
     // Mark any compile-time constant operator arguments as const.
-    auto range = compile_time_const_inputs.equal_range(node->type_string());
-    if (range.first == range.second) return;
+    const std::unordered_set<string>* const_inputs =
+        XlaOpRegistry::CompileTimeConstantInputs(node->type_string());
+    if (!const_inputs || const_inputs->empty()) return;
 
     NameRangeMap input_name_ranges;
     status =
         NameRangesForNode(*node, node->op_def(), &input_name_ranges, nullptr);
     if (!status.ok()) return;
 
-    for (auto it = range.first; it != range.second; ++it) {
-      auto name_range = input_name_ranges.find(it->second);
+    for (const string& input : *const_inputs) {
+      auto name_range = input_name_ranges.find(input);
       if (name_range == input_name_ranges.end()) continue;
 
       for (Edge const* edge : node->in_edges()) {
diff --git a/tensorflow/compiler/tf2xla/dump_graph.cc b/tensorflow/compiler/tf2xla/dump_graph.cc
index ddd912b87315f7943915153b5bf73531107af54d..03603ee9baefd1d20d220faf63c9c1c427ebdf31 100644
--- a/tensorflow/compiler/tf2xla/dump_graph.cc
+++ b/tensorflow/compiler/tf2xla/dump_graph.cc
@@ -63,7 +63,12 @@ string MakeUniquePath(string name) {
 
 string DumpGraphDefToFile(const string& name, GraphDef const& graph_def) {
   string path = MakeUniquePath(name);
-  TF_CHECK_OK(WriteTextProto(Env::Default(), path, graph_def));
+  Status status = WriteTextProto(Env::Default(), path, graph_def);
+  if (!status.ok()) {
+    VLOG(1) << "Failed to dump GraphDef to file: " << path << " : " << status;
+    path.clear();
+    path = "(unavailable)";
+  }
   return path;
 }
 
@@ -79,7 +84,13 @@ string DumpGraphToFile(const string& name, Graph const& graph,
 
 string DumpFunctionDefToFile(const string& name, FunctionDef const& fdef) {
   string path = MakeUniquePath(name);
-  TF_CHECK_OK(WriteTextProto(Env::Default(), path, fdef));
+  Status status = WriteTextProto(Env::Default(), path, fdef);
+  if (!status.ok()) {
+    VLOG(1) << "Failed to dump FunctionDef to file: " << path << " : "
+            << status;
+    path.clear();
+    path = "(unavailable)";
+  }
   return path;
 }
 
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 5726d8294a7c7fe81d7f6b803af89ca305aa2deb..f8169795ddfb7fd4e93d3f136c51623385868951 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/lib/gtl/optional.h"
 
@@ -36,6 +37,8 @@ namespace tensorflow {
 
 namespace {
 
+using xla::StatusOr;
+
 const char* const kArgOp = "_Arg";
 const char* const kRetValOp = "_Retval";
 
@@ -75,6 +78,20 @@ struct Frame {
   std::unordered_set<Node*> nodes;
 };
 
+// Comparison function used for sorting nodes consistently.
+// a) resource variables are last, and
+// b) sort lexicographically by name (for deterministic output).
+struct NodeCmp {
+  bool operator()(const Node* lhs, const Node* rhs) const {
+    bool lhs_is_resource =
+        lhs->num_inputs() > 0 ? (lhs->input_type(0) == DT_RESOURCE) : false;
+    bool rhs_is_resource =
+        rhs->num_inputs() > 0 ? (rhs->input_type(0) == DT_RESOURCE) : false;
+    return std::tie(lhs_is_resource, lhs->name()) <
+           std::tie(rhs_is_resource, rhs->name());
+  }
+};
+
 // Returns a textual representation of the names of the nodes in the input.
 template <typename T>
 string NodesToString(const T& nodes) {
@@ -140,7 +157,7 @@ Status CopySubgraph(const Graph& graph, const Frame* frame,
   return Status::OK();
 }
 
-xla::StatusOr<Node*> AddNode(const NodeDef& node_def, Graph* graph) {
+StatusOr<Node*> AddNode(const NodeDef& node_def, Graph* graph) {
   Status status;
   Node* inserted_node = graph->AddNode(node_def, &status);
   if (!status.ok()) {
@@ -149,7 +166,7 @@ xla::StatusOr<Node*> AddNode(const NodeDef& node_def, Graph* graph) {
   return inserted_node;
 }
 
-xla::StatusOr<Node*> BuildArgNode(Graph* graph, DataType type, int index) {
+StatusOr<Node*> BuildArgNode(Graph* graph, DataType type, int index) {
   NodeDef arg_def;
   NodeDefBuilder builder(strings::StrCat(kArgOp, index), kArgOp);
   builder.Attr("T", type);
@@ -158,7 +175,7 @@ xla::StatusOr<Node*> BuildArgNode(Graph* graph, DataType type, int index) {
   return AddNode(arg_def, graph);
 }
 
-xla::StatusOr<Node*> BuildRetvalNode(Graph* graph, DataType type, int index) {
+StatusOr<Node*> BuildRetvalNode(Graph* graph, DataType type, int index) {
   NodeDef ret_def;
   ret_def.set_op(kRetValOp);
   ret_def.set_name(strings::StrCat(kRetValOp, index));
@@ -268,7 +285,8 @@ Status BuildLoopBody(const Graph& graph, Frame* frame,
 Status FunctionalizeLoop(Graph* graph, Frame* frame,
                          FunctionLibraryDefinition* library) {
   VLOG(2) << "Frame " << frame->name << " before: "
-          << dump_graph::DumpGraphToFile("functionalize_before", *graph);
+          << dump_graph::DumpGraphToFile("functionalize_before", *graph,
+                                         library);
 
   // Split loop-varying Enter nodes with multiple successors. If the same
   // Tensor is fed as input to multiple loop arguments, we may end up with a
@@ -309,16 +327,9 @@ Status FunctionalizeLoop(Graph* graph, Frame* frame,
   }
   frame->args = std::move(args);
 
-  // Order the arguments so that:
-  // a) resource variables are last, and
-  // b) sort lexicographically by name (for deterministic output).
-  std::sort(frame->args.begin(), frame->args.end(),
-            [](const Arg& a, const Arg& b) {
-              bool a_is_resource = (a.enter->input_type(0) == DT_RESOURCE);
-              bool b_is_resource = (b.enter->input_type(0) == DT_RESOURCE);
-              return std::tie(a_is_resource, a.enter->name()) <
-                     std::tie(b_is_resource, b.enter->name());
-            });
+  std::sort(
+      frame->args.begin(), frame->args.end(),
+      [](const Arg& a, const Arg& b) { return NodeCmp()(a.enter, b.enter); });
 
   if (frame->loop_cond == nullptr) {
     return errors::InvalidArgument("Loop ", frame->name,
@@ -417,16 +428,36 @@ Status FunctionalizeLoop(Graph* graph, Frame* frame,
       //   identity nodes are values used by the loop body or condition.
       //   The Identity node may have the wrong device so copy the device from
       //   one of its outputs instead.
+      std::deque<const Edge*> possible_exit;
       for (const Edge* edge : arg.switch_node->out_edges()) {
-        if (edge->src_output() == 0 && IsExit(edge->dst())) {
+        if (edge->src_output() == 0) {
+          possible_exit.push_back(edge);
+        }
+        if (IsIdentity(edge->dst())) {
+          TF_RETURN_IF_ERROR(
+              SetNodeShardingFromNeighbors(edge->dst(), /*out_edges=*/true));
+        }
+      }
+      // TODO(b/67425339): Allow general graph between switch and exit.
+      while (!possible_exit.empty()) {
+        const Edge* edge = possible_exit.front();
+        possible_exit.pop_front();
+        if (IsExit(edge->dst())) {
           if (arg.exit != nullptr) {
             return errors::InvalidArgument("Duplicate Exit successors to ",
                                            arg.switch_node->name());
           }
           arg.exit = edge->dst();
-        } else if (StringPiece(edge->dst()->type_string()) == "Identity") {
-          TF_RETURN_IF_ERROR(
-              SetNodeShardingFromNeighbors(edge->dst(), /*out_edges=*/true));
+        } else {
+          if (!IsIdentity(edge->dst())) {
+            return errors::Unimplemented("General graph between switch (",
+                                         arg.switch_node->name(),
+                                         ") and exit node of frame ",
+                                         frame->name, " not supported yet.");
+          }
+          for (const Edge* out : edge->dst()->out_edges()) {
+            possible_exit.push_back(out);
+          }
         }
       }
     }
@@ -440,7 +471,7 @@ Status FunctionalizeLoop(Graph* graph, Frame* frame,
   TF_RETURN_IF_ERROR(BuildLoopBody(*graph, frame, &arg_types, &body_graph));
 
   VLOG(2) << "Frame " << frame->name << " condition: "
-          << dump_graph::DumpGraphToFile("loop_condition", *cond_graph)
+          << dump_graph::DumpGraphToFile("loop_condition", *cond_graph, library)
           << " body: " << dump_graph::DumpGraphToFile("loop_body", *body_graph);
 
   static std::atomic<int64> sequence_num(0LL);
@@ -521,266 +552,141 @@ Status FunctionalizeLoop(Graph* graph, Frame* frame,
   frame->parent->nodes.insert(while_node);
 
   VLOG(2) << "Frame " << frame->name << " after: "
-          << dump_graph::DumpGraphToFile("functionalize_after", *graph);
+          << dump_graph::DumpGraphToFile("functionalize_after", *graph,
+                                         library);
 
   return Status::OK();
 }
 
 class FunctionalizeCond {
  public:
-  // Identifies the connected parts of the tf.Cond.
-  struct ClusterHandle {
-    explicit ClusterHandle(int representative = -1)
-        : representative(representative) {}
+  // All nodes are assumed to be either in no branch, then branch, else branch,
+  // or both branches (such as merge nodes).
+  enum Branch {
+    kElseBranch = 0,
+    kThenBranch = 1,
+    kBoth = 2,
+    kNeither = 3,
+    kNumBranchTypes = 4
+  };
 
-    bool operator==(const ClusterHandle& other) const {
-      return representative == other.representative;
-    }
+  // Returns a textual representation of the Branch b.
+  static string Branch_Name(FunctionalizeCond::Branch b);
 
-    bool operator!=(const ClusterHandle& other) const {
-      return !(*this == other);
-    }
+  // Functionalize all the switch-merge nodes of a loop-free graph into XlaIf
+  // nodes. That is, attempt to transform every remaining switch and merge nodes
+  // in the graph into XlaIf nodes.
+  // Precondition: All while loops have been removed from graph.
+  static Status Functionalize(Graph* graph, FunctionLibraryDefinition* library);
 
-    bool operator<(const ClusterHandle& other) const {
-      return representative < other.representative;
+ private:
+  // CondArgNode represents a input to the conditional and its corresponding
+  // switch nodes.
+  struct CondArgNode {
+    explicit CondArgNode(Node* input) : input(input) {}
+    string ToString() const {
+      return strings::StrCat("input=", input->name(),
+                             " switches=", NodesToString(switches));
     }
 
-    bool operator>(const ClusterHandle& other) const {
-      return representative > other.representative;
-    }
+    Node* input;
+    std::vector<Node*> switches;
+  };
+  using CondArgNodes = std::vector<CondArgNode>;
 
+  struct ForwardFlowNode {
+    explicit ForwardFlowNode(Branch branch = Branch::kNeither)
+        : branch(branch), count(0) {}
     string ToString() const {
-      return strings::StrCat("Cluster_", representative);
+      return strings::StrCat("branch=", Branch_Name(branch), " count=", count);
     }
-
-    // Vector of UnionFind<ClusterHandle> indexable by ClusterHandle and Node*.
-    struct Vector {
-      explicit Vector(size_t size) : clusters(size) {}
-
-      UnionFind<ClusterHandle>& at(const ClusterHandle& cluster) {
-        return clusters.at(cluster.representative);
-      }
-
-      UnionFind<ClusterHandle>& at(const Node* node) {
-        return clusters.at(node->id());
-      }
-
-      UnionFind<ClusterHandle>& operator[](const Node* node) {
-        return clusters.at(node->id());
-      }
-
-      size_t size() const { return clusters.size(); }
-
-      void resize(size_t count) { return clusters.resize(count); }
-
-     private:
-      std::vector<UnionFind<ClusterHandle>> clusters;
-    };
-
-   private:
-    int representative;
+    Branch branch;
+    int count;
   };
 
-  // Represents a node in the clustered graph consisting of switch_nodes,
-  // merge_nodes as well as the edges into and out of this node to other
-  // Clusters. Each Cluster corresponds to a ClusterHandle and has a
-  // corresponding representative.
-  struct Cluster {
-    std::unordered_set<Node*> switch_nodes;
-    std::unordered_set<Node*> merge_nodes;
-    std::unordered_set<Cluster*> in_nodes;
-    std::unordered_set<Cluster*> out_nodes;
-
-    // A member of the ClusterHandle corresponding to this Cluster.
-    ClusterHandle representative;
-    bool visited = false;
-  };
+  // Group of switch nodes that will be part of the same XlaIf.
+  struct SwitchCluster {
+    explicit SwitchCluster(Node* predicate) : predicate(predicate) {}
+    string ToString() const {
+      return strings::StrCat(name, " predicate=", predicate->name(),
+                             " switches=", NodesToString(switches));
+    }
 
-  // Represent the clustered graph as map from cluster representative to
-  // Cluster.
-  using ClusteredGraph = std::map<ClusterHandle, Cluster>;
-
-  // The arguments and condition of a XlaIf. The arguments are ordered by node
-  // id in the original graph.
-  struct CondArgs {
-    struct CondCmp {
-      bool operator()(const Node* lhs, const Node* rhs) const {
-        bool lhs_is_resource =
-            lhs->num_inputs() > 0 ? (lhs->input_type(0) == DT_RESOURCE) : false;
-        bool rhs_is_resource =
-            rhs->num_inputs() > 0 ? (rhs->input_type(0) == DT_RESOURCE) : false;
-        return std::tie(lhs_is_resource, lhs->name()) <
-               std::tie(rhs_is_resource, rhs->name());
-      }
-    };
-    Node* conditional = nullptr;
-    std::set<Node*, CondCmp> args;
+    string name;
+    Node* predicate;
+    std::vector<Node*> switches;
   };
 
-  static Status Functionalize(Graph* graph, FunctionLibraryDefinition* library);
-
- private:
-  FunctionalizeCond(Graph* graph, FunctionLibraryDefinition* library)
-      : clusters_(graph->num_node_ids()), library_(library), graph_(graph) {}
-
-  // Returns a vector of Switch nodes from the clustered graph where the nodes
-  // are sorted by the number of switch nodes minus number of merge nodes
-  // from a root of the clustered graph to the given Merge node, with ties
-  // broken by the representative of the Cluster. This corresponds to sorting by
-  // nesting depth, from deepest nested to outermost.
-  std::vector<std::pair<int, Cluster*>> SortedSwitchNodes();
-
-  // Returns whether the graph has no conditionals.
-  bool NoConditionals() const { return merge_nodes_.empty(); }
-
-  // Construct the clustered graph by creating nodes for each cluster and the
-  // connections between the clusters. Switch and Merge nodes partition
-  // clusters, so iterate over those. Note: a Cluster may have neither a
-  // Merge or Switch but will have an in/out edge from a Cluster that has.
-  void CreateClusters();
-
-  // Creates the clustered graph by identifying all the edges between different
-  // clusters and collecting all switch and merge nodes that correspond to a
-  // cluster.
-  void CreateClusteredGraph();
-
-  // If `from` and `to` correspond to different clusters, then merge the nodes
-  // in the clustered graph corresponding to `from` and `to`.
-  //
-  // If `remove_from_graph` is specified then the `from` node is also removed
-  // from the clustered graph post contracting the edge.
-  void ContractEdge(Cluster* from, Cluster* to, bool remove_from_graph = false);
-
-  // Converts a Merge node to a XlaIf. This encapsulates the process of
-  // extracting the bodies needed for the then and else branch, creates a XlaIf
-  // node, removing the nodes of the branches from the graph and replacing the
-  // merge node with a XlaIf.
-  Status ConvertCorrespondingMergeToXlaIf(Cluster* switch_cluster);
-
-  // Removes a Switch cluster feeding directly into a Merge cluster by removing
-  // the Switch and Merge nodes and collapsing into a single cluster.
-  Status RemoveTrivialSwitch(Cluster* switch_cluster);
-
-  // Returns the merge cluster corresponding to the switch node. This function
-  // only returns the merge cluster in the case where we have a switch node that
-  // is the single entry point for all paths to a common merge cluster, this
-  // merge cluster may be created by combining multiple merge clusters, that
-  // share the switch cluster as common ancestor, together.
-  //
-  //           Switch
-  //          /      \
-  //     Branch      Branch
-  //          \      /
-  //        merge_cluster
-  //
-  // Note: either of the branches may be empty. The case where both branches are
-  // empty is handled by RemoveTrivialSwitch.
-  gtl::optional<Cluster*> CreateCorrespondingMergeCluster(
-      Cluster* switch_cluster);
-
-  // Determines the arguments needed as input to the Merge cluster originating
-  // from the Switch cluster.
-  xla::StatusOr<CondArgs> DetermineCondArgs(const Cluster& merge_cluster,
-                                            const Cluster& switch_cluster);
-
-  // Builds a XlaIfOp to replace the Merge node with.
-  xla::StatusOr<Node*> BuildAndAddXlaIfOp(const CondArgs& cond_args,
-                                          const Cluster& merge_cluster,
-                                          const std::vector<Node*>& outputs);
+  FunctionalizeCond(Graph* graph, FunctionLibraryDefinition* library,
+                    bool dump_graphs)
+      : library_(library), graph_(graph), dump_graphs_(dump_graphs) {}
+
+  // Perform the actual cond functionalization. Iterate over groups of switch
+  // nodes (linked by common predicate), from innermost to outermost, and
+  // extract into XlaIf nodes.
+  Status FunctionalizeInternal();
+
+  // Determines the branch_map (mapping from node to branch of cond) and
+  // frontier (the nodes where the cond ends).
+  StatusOr<std::pair<std::unordered_map<Node*, ForwardFlowNode>,
+                     std::unordered_set<Node*>>>
+  DetermineBranchMapAndFrontier(const SwitchCluster& switch_cluster);
+
+  // Returns XlaIf node created from subgraph of merge and switch nodes. This
+  // encapsulates the process of extracting the bodies needed for the then and
+  // else branch, creates a XlaIf node, removing the nodes of the branches from
+  // the graph and replacing the merge node with a XlaIf.
+  StatusOr<Node*> ConvertToXlaIf(const CondArgNodes& cond_arg_nodes,
+                                 const SwitchCluster& switch_cluster,
+                                 const std::vector<Node*>& switches);
+
+  // Builds a XlaIfOp to replace the Switch-Graph-Merge cluster with.
+  StatusOr<Node*> BuildAndAddXlaIfOp(const CondArgNodes& cond_arg_nodes,
+                                     const SwitchCluster& switch_cluster,
+                                     const std::vector<Node*>& merge_nodes);
 
   // Extracts a function body corresponding to the given input edge of the merge
   // node.
-  Status ExtractBody(const CondArgs& cond_args, const Cluster& merge_cluster,
-                     const std::vector<Node*>& outputs, int input_edge,
+  Status ExtractBody(const CondArgNodes& cond_arg_nodes,
+                     const std::vector<Node*>& switches,
+                     const std::vector<Node*>& merge_nodes, int input_edge,
                      Graph* body);
 
   // Adds all the input edges to `if_node` corresponding to the arguments.
-  Status AddInputEdges(const CondArgs& cond_args, Node* if_node);
+  Status AddInputEdges(const CondArgNodes& cond_arg_nodes, Node* predicate,
+                       Node* if_node);
 
   // Adds all output edges from the `if_node`.
   Status AddOutputEdges(const std::vector<Node*>& outputs, Node* if_node);
 
-  // Removes all nodes from the graph that are part of cluster.
-  void RemoveClusterNodes(Cluster* cluster);
-
-  // Removes all argument nodes that are unused.
-  template <class T>
-  void RemoveUnusedArgs(const T& args);
-
-  // Removes all Merge nodes in merge_cluster.
-  void RemoveMergeNodes(Cluster* merge_cluster);
+  // Returns the switch clusters of graph_ in postorder. Dead switch nodes are
+  // skipped and removed from the graph.
+  StatusOr<std::vector<SwitchCluster>> DeterminePredicateSwitchOrder();
+
+  // Update the state for destination based on the state of source and the node
+  // being updated.
+  Status Join(const ForwardFlowNode& src_state, const Node* dst,
+              ForwardFlowNode* dst_state);
+
+  // Ensure that all nodes in the branch_map are dominated by the switch
+  // nodes. Returns nodes that are not dominated by the switches but are a
+  // control dependency of a node in the cond, and remove such control
+  // dependencies.
+  StatusOr<std::vector<Node*>> EnsureDominanceAndReturnNonDominatedControlNodes(
+      const std::unordered_map<Node*, ForwardFlowNode>& branch_map,
+      const std::vector<Node*>& switches);
+
+  // Validates that the frontier of nodes for the conditional
+  // section are as expected.
+  Status ValidateFrontier(
+      const std::unordered_map<Node*, ForwardFlowNode>& branch_map,
+      const std::unordered_set<Node*>& frontier);
 
-  // Returns the representative member of the corresponding cluster.
-  ClusterHandle Representative(const Node* node) {
-    return clusters_.at(node).Get();
-  }
-
-  ClusteredGraph clustered_graph_;
-  ClusterHandle::Vector clusters_;
-  std::unordered_set<Node*> merge_nodes_;
-  std::unordered_set<Node*> switch_nodes_;
   FunctionLibraryDefinition* library_;
   Graph* graph_;
+  bool dump_graphs_;
 };
 
-std::ostream& operator<<(std::ostream& os,
-                         const FunctionalizeCond::ClusterHandle& c) {
-  os << c.ToString();
-  return os;
-}
-
-// Returns a dot representation of the clustered graph showing the connections
-// between the nodes and the nodes in each cluster.
-string DebugString(const Graph& graph,
-                   FunctionalizeCond::ClusterHandle::Vector* clusters) {
-  string ret = "digraph {\ncompound=true;labeljust=\"r\";ranksep=0.24\n";
-  std::map<FunctionalizeCond::ClusterHandle, string> subgraphs;
-  auto name = [](const Node* n) {
-    return strings::StrCat(n->type_string(), "_", n->id());
-  };
-  for (Node* n : graph.nodes()) {
-    strings::StrAppend(&subgraphs[clusters->at(n).Get()], n->id(), " [label=\"",
-                       name(n), "\"];\n");
-  }
-  for (auto kv : subgraphs) {
-    strings::StrAppend(&ret, "subgraph cluster_", kv.first.ToString(), " {\n",
-                       "style=filled; color=lightgrey;", "label = \"",
-                       kv.first.ToString(), "\";\n", kv.second, "}\n");
-  }
-  for (Node* n : graph.nodes()) {
-    for (Node* in : n->in_nodes()) {
-      strings::StrAppend(&ret, in->id(), " -> ", n->id(), ";\n");
-    }
-  }
-  return strings::StrCat(ret, "} // end");
-}
-
-string DebugString(const FunctionalizeCond::ClusteredGraph& clustered_graph) {
-  string ret = "digraph {\ncompound=true;labeljust=\"r\";\n";
-  auto name = [](const FunctionalizeCond::Cluster& cluster) {
-    return cluster.representative.ToString();
-  };
-  for (auto kv : clustered_graph) {
-    if (!kv.second.switch_nodes.empty() || !kv.second.merge_nodes.empty()) {
-      strings::StrAppend(
-          &ret, kv.first.ToString(), " [label=\"", name(kv.second),
-          kv.second.switch_nodes.empty()
-              ? ""
-              : strings::StrCat(" switches=", kv.second.switch_nodes.size()),
-          kv.second.merge_nodes.empty()
-              ? ""
-              : strings::StrCat(" merges=", kv.second.merge_nodes.size()),
-          "\"];\n");
-    }
-  }
-  for (auto kv : clustered_graph) {
-    for (auto in : kv.second.in_nodes) {
-      strings::StrAppend(&ret, name(*in), " -> ", name(kv.second), ";\n");
-    }
-  }
-  return strings::StrCat(ret, "} // end");
-}
-
 bool IsDeadSwitch(const Node* node) {
   for (const Edge* e : node->out_edges()) {
     const Node* dst = e->dst();
@@ -796,337 +702,454 @@ bool IsDeadSwitch(const Node* node) {
   return true;
 }
 
-void FunctionalizeCond::CreateClusters() {
-  ClusterHandle source_cluster = ClusterHandle(Graph::kSourceId);
-  auto& source = clusters_.at(source_cluster);
-  std::deque<std::pair<ClusterHandle, std::deque<Node*>>> workqueue;
-  workqueue.push_back({source_cluster, {}});
-  for (Node* node : graph_->nodes()) {
-    if (IsSwitch(node)) {
-      switch_nodes_.insert(node);
-    } else if (IsMerge(node)) {
-      merge_nodes_.insert(node);
-    }
-    ClusterHandle& cluster = clusters_.at(node).Get();
-    cluster = ClusterHandle(node->id());
-    // Group all source clusters together.
-    if (node->IsSource() || node->in_edges().empty()) {
-      clusters_.at(node).Merge(&source);
-      source.Merge(&clusters_.at(node));
-      workqueue.front().second.push_back(node);
+string FunctionalizeCond::Branch_Name(FunctionalizeCond::Branch b) {
+  const string branch_name[FunctionalizeCond::kNumBranchTypes + 1] = {
+      "else", "then", "both", "neither", "count"};
+  return branch_name[b];
+}
+
+Status FunctionalizeCond::ValidateFrontier(
+    const std::unordered_map<Node*, FunctionalizeCond::ForwardFlowNode>&
+        branch_map,
+    const std::unordered_set<Node*>& frontier) {
+  std::unordered_set<const Node*> pending[kNumBranchTypes];
+  for (Node* n : frontier) {
+    pending[branch_map.at(n).branch].insert(n);
+  }
+  TF_RET_CHECK(pending[kNeither].empty()) << NodesToString(pending[kNeither]);
+  for (const Node* n : pending[kBoth]) {
+    TF_RET_CHECK(IsMerge(n)) << n->DebugString();
+    // Merge nodes may be in then or else branch too
+  }
+  int index = (pending[kThenBranch].size() <= pending[kElseBranch].size())
+                  ? kThenBranch
+                  : kElseBranch;
+  int other = 1 - index;
+  for (const Node* n : pending[index]) {
+    if (pending[other].find(n) != pending[other].end()) {
+      return errors::Internal(
+          "Node (", n->DebugString().c_str(),
+          ") in both Else and Then branch should be in Both.");
     }
   }
+  // An empty frontier indicates a dead switch. Above we attempt to remove dead
+  // switch nodes, but not all are removed so don't treat it as an error yet.
+  // TODO(jpienaar): Find out why dead switch nodes remain.
+  // if (pending[kBoth].empty() && pending[kThenBranch].empty() &&
+  //     pending[kElseBranch].empty()) {
+  //   return errors::Internal("Unexpected empty frontier for switch nodes");
+  // }
+  return Status::OK();
+}
 
-  // If there are no Merge nodes, then terminate.
-  if (merge_nodes_.empty()) {
-    return;
+Status FunctionalizeCond::Join(const ForwardFlowNode& src_state,
+                               const Node* dst, ForwardFlowNode* dst_state) {
+  TF_RET_CHECK(dst_state->branch != Branch::kBoth &&
+               dst_state->branch != Branch::kNumBranchTypes)
+      << "Unexpected/Invalid branch type: Merging "
+      << Branch_Name(src_state.branch) << " with "
+      << Branch_Name(dst_state->branch);
+  if (dst_state->branch == Branch::kNeither) {
+    dst_state->branch = src_state.branch;
+  } else if (src_state.branch != dst_state->branch &&
+             src_state.branch != Branch::kNeither) {
+    if (IsMerge(dst)) {
+      dst_state->branch = Branch::kBoth;
+    } else {
+      return errors::Internal("Illegal merge: ", src_state.ToString(), " with ",
+                              dst_state->ToString(), " for ",
+                              dst->DebugString());
+    }
   }
+  ++dst_state->count;
+  return Status::OK();
+}
 
-  // Remove all dead Switch nodes.
-  RemoveUnusedArgs(switch_nodes_);
-
-  // All parent_'s are still nullptr so clusters_ may still be resized. Resize
-  // conservatively assuming all merge nodes become XlaIf nodes.
-  clusters_.resize(clusters_.size() + merge_nodes_.size());
-
-  std::unordered_set<Node*> marked;
-  while (!workqueue.empty()) {
-    auto cluster_queue = workqueue.front();
-    VLOG(4) << "Cluster: " << cluster_queue.first << " Queue: {"
-            << str_util::Join(cluster_queue.second, ",",
-                              [](string* output, const Node* node) {
-                                strings::StrAppend(output, node->id());
-                              })
-            << "}";
-
-    UnionFind<ClusterHandle>& repr = clusters_.at(cluster_queue.first);
-    workqueue.pop_front();
-    std::deque<Node*> switch_nodes;
-    std::deque<Node*> merge_nodes;
-    std::unordered_set<Node*> cluster_member;
-    while (!cluster_queue.second.empty()) {
-      // Iterate node workqueue and flow forward merging all nodes reachable
-      // that are neither a Switch or a Merge and whose inputs are all part of
-      // the same cluster.
-      Node* cur = cluster_queue.second.front();
-      cluster_queue.second.pop_front();
-      if (marked.find(cur) != marked.end()) {
-        continue;
-      }
-      if (IsMerge(cur)) {
-        merge_nodes.push_back(cur);
-        marked.insert(cur);
-        continue;
-      }
-      if (IsSwitch(cur)) {
-        switch_nodes.push_back(cur);
-        marked.insert(cur);
-        continue;
-      }
-      clusters_.at(cur).Merge(&repr);
-      cluster_member.insert(cur);
-      for (Node* out : cur->out_nodes()) {
-        bool all_ancestors_in_cluster = true;
-        for (Node* in : out->in_nodes()) {
-          if (IsMerge(out)) {
-            merge_nodes.push_back(out);
-          }
-          if (IsSwitch(out)) {
-            switch_nodes.push_back(out);
-          }
-          if (cluster_member.find(in) == cluster_member.end()) {
-            all_ancestors_in_cluster = false;
-            break;
-          }
-        }
-        if (all_ancestors_in_cluster && out->IsOp()) {
-          cluster_queue.second.push_back(out);
-          marked.insert(cur);
-        }
+StatusOr<std::vector<FunctionalizeCond::SwitchCluster>>
+FunctionalizeCond::DeterminePredicateSwitchOrder() {
+  struct Cluster {
+    bool operator==(const Cluster& other) const {
+      return representative == other.representative;
+    }
+    int representative = -1;
+  };
+
+  // Perform a DFS over the graph and
+  // * Determine the reverse topological order of the nodes (there should be no
+  //   cycles at this point so the post-order numbering corresponds to the
+  //   reverse topological sorting);
+  // * Identify dead switches;
+  // * Initialize the cluster's representative;
+  std::vector<UnionFind<Cluster>> clusters(graph_->num_node_ids());
+  std::vector<Node*> dead_switches;
+  std::vector<Node*> switch_order;
+  std::vector<Node*> rev_topo_sorted_nodes;
+  DFS(*graph_, nullptr, [&](Node* n) {
+    clusters[n->id()].Get().representative = n->id();
+    if (IsSwitch(n)) {
+      if (IsDeadSwitch(n)) {
+        dead_switches.push_back(n);
+      } else {
+        rev_topo_sorted_nodes.push_back(n);
+        switch_order.push_back(n);
       }
+    } else if (n->IsOp()) {
+      // Exclude src and sink nodes from further consideration.
+      rev_topo_sorted_nodes.push_back(n);
     }
+  });
+
+  std::vector<SwitchCluster> switch_clusters;
+  // Return early if there are no switches in the graph.
+  if (switch_order.empty()) {
+    return switch_clusters;
+  }
+
+  // Remove all dead switch nodes.
+  for (Node* n : dead_switches) {
+    VLOG(2) << "Removing dead switch: " << n->DebugString();
+    graph_->RemoveNode(n);
+  }
+
+  // Identify switch nodes that are part of the same control flow context by
+  // considering the operands of operations: an operation is part of the same
+  // control context as its operands unless the operation is a switch. Control
+  // dependencies are considered part of the same control flow context if the
+  // switch depth is the same (see comment below).
+
+  // entry_cluster records the input cluster to a switch node. This is used when
+  // merging with a merge node where the dst's cluster is merged with the entry
+  // cluster of the merge node's cluster (which corresponds to a switch cluster
+  // and so has an entry cluster).
+  std::unordered_map<int, UnionFind<Cluster>*> entry_cluster;
+
+  // Returns the output cluster of a node. Where the output cluster is cluster
+  // where the output of the node is used. For non-merge nodes this is simply
+  // the cluster they are part of, while for merge nodes it is the entry cluster
+  // of the cluster they are part of (this will correspond to the entry node of
+  // a switch node that dominates the merge).
+  auto find_output_cluster = [&](Node* n) {
+    UnionFind<Cluster>* cluster = &clusters[n->id()];
+    if (!IsMerge(n)) return cluster;
+    auto it = entry_cluster.find(clusters[n->id()].Get().representative);
+    // If the cluster is not found in the entry_cluster map then an
+    // instruction not dominated by a switch node has been merged into the
+    // cluster of the merge. This indicates a failure of the clustering.
+    CHECK(it != entry_cluster.end())
+        << "Unable to find entry for n=" << n->id() << " ("
+        << cluster->Get().representative << ")";
+    return it->second;
+  };
+
+  // TODO(jpienaar): This could be combined with DetermineBranchMapAndFrontier.
+  std::vector<int> switch_depth(graph_->num_node_ids());
+  for (auto it = rev_topo_sorted_nodes.rbegin();
+       it != rev_topo_sorted_nodes.rend(); ++it) {
+    Node* n = *it;
 
-    VLOG(4) << "Switches: {"
-            << str_util::Join(switch_nodes, ",",
-                              [](string* output, const Node* node) {
-                                strings::StrAppend(output, node->id());
-                              })
-            << "}";
-
-    // Merge Switch nodes with common predicate.
-    std::unordered_map<Node*, std::vector<Node*>> predicate_to_switch;
-    for (Node* node : switch_nodes) {
-      Node* tmp;
-      TF_CHECK_OK(node->input_node(1, &tmp));
-      predicate_to_switch[tmp].push_back(node);
+    // Compute switch depth.
+    int new_switch_depth = 0;
+    for (const Edge* e : n->in_edges()) {
+      Node* src = e->src();
+      new_switch_depth = std::max(
+          new_switch_depth, switch_depth[src->id()] - (IsMerge(src) ? 1 : 0));
     }
-    for (auto kv : predicate_to_switch) {
-      Node* first = kv.second.front();
-      for (Node* switch_node : kv.second) {
-        clusters_.at(first).Merge(&clusters_.at(switch_node));
+    switch_depth[n->id()] = new_switch_depth + (IsSwitch(n) ? 1 : 0);
+
+    // Only merge the input operands of a switch. The switch's clustering itself
+    // is determined by the interaction of the switch's outputs.
+    if (IsSwitch(n)) {
+      Node* input;
+      TF_CHECK_OK(n->input_node(0, &input));
+      entry_cluster[n->id()] = &clusters[input->id()];
+      UnionFind<Cluster>* cluster = find_output_cluster(input);
+      int cluster_depth = switch_depth[cluster->Get().representative];
+      // Merge the inputs of the switch node with one another. This results in
+      // predicates and control input residing in the same cluster.
+      for (const Edge* e : n->in_edges()) {
+        Node* src = e->src();
+        UnionFind<Cluster>* src_cluster = find_output_cluster(src);
+        int src_cluster_depth = switch_depth[src_cluster->Get().representative];
+        if (cluster_depth != src_cluster_depth) {
+          return errors::InvalidArgument(
+              "Unable to functionalize control flow in graph: Switch ('",
+              n->name(), "') has operands ('", input->name(), "' and '",
+              src->name(), "') that have different switch depths (",
+              cluster_depth, " != ", src_cluster_depth, ")");
+        }
+        cluster->Merge(src_cluster);
       }
+      continue;
     }
 
-    // Enqueue each edge of the switch node separately. That is, group all the
-    // nodes that are due to the true/false edge of the switch together and
-    // consider all nodes that only have a control dependency on the switch node
-    // separately. We want to group together all nodes that are part of the same
-    // branch, as these will be extracted into the `then` and `else` functions
-    // of the functional if. The ops due to control edges are different as they
-    // could be involved with either branch and merging them here could result
-    // in invalid graphs.
-    for (auto kv : predicate_to_switch) {
-      ClusterHandle none = ClusterHandle(-1);
-      ClusterHandle first[2] = {none, none};
-      std::deque<Node*>* queue[2];
-      for (auto switch_node : kv.second) {
-        for (const auto e : switch_node->out_edges()) {
-          if (IsSwitch(e->dst()) || IsMerge(e->dst())) {
-            continue;
-          }
-          // Control edges are enqueued on their own.
-          if (e->IsControlEdge()) {
-            workqueue.push_back({Representative(e->dst()), {e->dst()}});
-            continue;
-          }
-          // Combine all outputs of the same output port of a switch cluster
-          // into the same workqueue entry.
-          if (first[e->src_output()] == none) {
-            ClusterHandle repr = Representative(e->dst());
-            first[e->src_output()] = repr;
-            workqueue.push_back({repr, {}});
-            queue[e->src_output()] = &workqueue.back().second;
-          }
-          clusters_.at(first[e->src_output()]).Merge(&clusters_.at(e->dst()));
-          queue[e->src_output()]->push_back(e->dst());
+    for (const Edge* e : n->in_edges()) {
+      Node* src = e->src();
+      if (!src->IsOp()) continue;
+      UnionFind<Cluster>* cluster = find_output_cluster(src);
+      // Merge a node with its data operands and with its control operands if
+      // the src and dst are in the same ControlContext. The ControlContext is
+      // not explicitly available here, and instead the switch depth is used as
+      // a proxy here. Due to the invariant that control edges can only be from
+      // a containing scope to an inner scope or from the inner scope to its
+      // containing scope (for exit nodes), the switch depth will only match if
+      // the src and dst are in the same ControlContext. Control edges between
+      // ControlContexts are handled during the extraction.
+      int src_id = cluster->Get().representative;
+      int src_depth = switch_depth[src_id];
+      if (!e->IsControlEdge() || new_switch_depth == src_depth) {
+        if (src_depth != new_switch_depth) {
+          return errors::InvalidArgument(
+              "Unable to functionalize control flow in graph: Operand ('",
+              src->name(), "') and operator ('", n->name(),
+              "') have different switch depths (", src_depth,
+              " != ", new_switch_depth, ")");
         }
+        cluster->Merge(&clusters[n->id()]);
       }
     }
   }
-}
 
-void FunctionalizeCond::ContractEdge(Cluster* from, Cluster* to,
-                                     bool remove_from_graph) {
-  VLOG(3) << "ContractEdge from = " << from->representative
-          << " to = " << to->representative;
-  if (from->representative == to->representative) {
-    return;
-  }
-  to->merge_nodes.insert(from->merge_nodes.begin(), from->merge_nodes.end());
-  from->merge_nodes.clear();
-  to->switch_nodes.insert(from->switch_nodes.begin(), from->switch_nodes.end());
-  from->switch_nodes.clear();
-
-  for (Cluster* from_out : from->out_nodes) {
-    from_out->in_nodes.erase(from);
-    if (from_out->representative != to->representative) {
-      from_out->in_nodes.insert(to);
-      to->out_nodes.insert(from_out);
+  if (dump_graphs_) {
+    // Mark the switch cluster each node is part of.
+    for (Node* n : graph_->nodes()) {
+      n->ClearAttr("_XlaFunctionalizeSwitchGroup");
+      n->AddAttr("_XlaFunctionalizeSwitchGroup",
+                 clusters[n->id()].Get().representative);
     }
+    LOG(INFO) << "FunctionalizeControlFlow (with_clusters): "
+              << dump_graph::DumpGraphToFile("functionalize_clustered", *graph_,
+                                             library_);
   }
-  from->out_nodes.clear();
 
-  for (Cluster* from_in : from->in_nodes) {
-    from_in->out_nodes.erase(from);
-    if (from_in->representative != to->representative) {
-      from_in->out_nodes.insert(to);
-      to->in_nodes.insert(from_in);
+  // Verify all the nodes of a cluster are at the same depth.
+  std::unordered_map<int, std::pair<int, Node*>> cluster_to_depth_node;
+  for (Node* n : graph_->nodes()) {
+    int depth = switch_depth[n->id()];
+    int cluster_rep = clusters[n->id()].Get().representative;
+    auto it = cluster_to_depth_node.find(cluster_rep);
+    if (it == cluster_to_depth_node.end()) {
+      cluster_to_depth_node[cluster_rep] = std::make_pair(depth, n);
+    } else {
+      if (it->second.first != depth) {
+        return errors::Internal(
+            "Illegal clustering created, mismatch in depths:", "\n\t",
+            n->DebugString(), "(", clusters[n->id()].Get().representative,
+            ") at depth=", depth, " vs\n\t", it->second.second->DebugString(),
+            "(", clusters[n->id()].Get().representative, ") at depth ",
+            it->second.first);
+      }
     }
   }
-  from->in_nodes.clear();
 
-  to->in_nodes.erase(from);
-  to->out_nodes.erase(from);
-  clusters_.at(to->representative).Merge(&clusters_.at(from->representative));
-  from->visited = true;
+  struct Hash {
+    size_t operator()(const std::pair<Node*, Cluster>& item) const {
+      return Hash64Combine(hash<Node*>()(item.first),
+                           std::hash<int>()(item.second.representative));
+    }
+  };
 
-  if (remove_from_graph) {
-    clustered_graph_.erase(from->representative);
+  // Merge Switch nodes with common predicate.
+  std::unordered_map<std::pair<Node*, Cluster>, int, Hash> predicate_index;
+  // The nodes in switch_order are in reverse topological order, but the
+  // clustered switches need not be (i.e., when considered as a cluster one
+  // element of a cluster may be later in the topological order than another
+  // node whose cluster is later in the topological order of clustered
+  // switches).
+  for (auto it = switch_order.rbegin(); it != switch_order.rend(); ++it) {
+    Node* pred;
+    TF_CHECK_OK((*it)->input_node(1, &pred));
+    auto repr = std::make_pair(pred, clusters[(*it)->id()].Get());
+    if (predicate_index.find(repr) == predicate_index.end()) {
+      predicate_index[repr] = switch_clusters.size();
+      switch_clusters.emplace_back(pred);
+      // Generate a name by concatenating with the cluster representative as
+      // there could be multiple switch clusters with the same predicate.
+      switch_clusters[predicate_index[repr]].name =
+          strings::StrCat(pred->name(), "_", repr.second.representative, "_If");
+    }
+    switch_clusters[predicate_index[repr]].switches.push_back(*it);
   }
+
+  return switch_clusters;
 }
 
-void FunctionalizeCond::CreateClusteredGraph() {
-  auto update_cluster_for_node = [this](Node* node) -> Cluster& {
-    ClusterHandle repr = Representative(node);
-    Cluster& cluster_node = clustered_graph_[repr];
-    cluster_node.representative = repr;
-    for (const Node* in : node->in_nodes()) {
-      ClusterHandle other_repr = Representative(in);
-      // Skip source, sink and internal edges.
-      if (other_repr == repr) {
-        continue;
+StatusOr<std::vector<Node*>>
+FunctionalizeCond::EnsureDominanceAndReturnNonDominatedControlNodes(
+    const std::unordered_map<Node*, ForwardFlowNode>& branch_map,
+    const std::vector<Node*>& switches) {
+  std::vector<Node*> old_control_nodes;
+  for (const auto& kv : branch_map) {
+    if (kv.second.count != kv.first->in_edges().size()) {
+      std::vector<const Edge*> delete_edges;
+      for (const Edge* in : kv.first->in_edges()) {
+        auto it = branch_map.find(in->src());
+        if (it == branch_map.end()) {
+          if (in->IsControlEdge()) {
+            old_control_nodes.push_back(in->src());
+            delete_edges.push_back(in);
+          } else {
+            if (IsSwitch(in->src())) {
+              if (std::find(switches.begin(), switches.end(), in->src()) ==
+                  switches.end()) {
+                return errors::Internal(
+                    "Unexpected switch node found during flow forward: ",
+                    in->src()->DebugString());
+              }
+              continue;
+            }
+            return errors::InvalidArgument(
+                "Value ", kv.first->name(), "'s input, ", in->src()->name(),
+                ", is not dominated by switch nodes ", NodesToString(switches));
+          }
+        }
       }
-      Cluster& cluster_node_in = clustered_graph_[other_repr];
-      cluster_node.in_nodes.insert(&cluster_node_in);
-      cluster_node_in.out_nodes.insert(&cluster_node);
-      cluster_node_in.representative = other_repr;
-    }
-    for (const Node* out : node->out_nodes()) {
-      ClusterHandle other_repr = Representative(out);
-      // Skip source, sink and internal edges.
-      if (other_repr == repr) {
-        continue;
+      // Remove control edges from nodes that are not dominated by the switch
+      // nodes. New control dependencies will be added between these nodes and
+      // the XlaIf node inserted.
+      for (const Edge* e : delete_edges) {
+        graph_->RemoveEdge(e);
       }
-      Cluster& cluster_node_out = clustered_graph_[other_repr];
-      cluster_node.out_nodes.insert(&cluster_node_out);
-      cluster_node_out.in_nodes.insert(&cluster_node);
-      cluster_node_out.representative = other_repr;
     }
-    return cluster_node;
-  };
-  update_cluster_for_node(graph_->source_node());
-  for (Node* node : switch_nodes_) {
-    update_cluster_for_node(node).switch_nodes.insert(node);
-  }
-  for (Node* node : merge_nodes_) {
-    update_cluster_for_node(node).merge_nodes.insert(node);
   }
-
-  VLOG(3) << "Graph with clusters: " << DebugString(*graph_, &clusters_);
-  VLOG(3) << "ClusteredGraph: " << DebugString(clustered_graph_);
+  return old_control_nodes;
 }
 
-gtl::optional<FunctionalizeCond::Cluster*>
-FunctionalizeCond::CreateCorrespondingMergeCluster(Cluster* switch_cluster) {
-  VLOG(3) << "CreateCorrespondingMergeCluster for "
-          << switch_cluster->representative;
-  std::unordered_set<Cluster*> merges;
-  std::unordered_set<Cluster*> dominated;
-  dominated.insert(switch_cluster);
-  std::deque<Cluster*> queue;
-  auto enqueue_or_update_merge = [this, &queue, &merges](Cluster* c) {
-    if (c->merge_nodes.empty()) {
-      queue.push_back(c);
-    } else {
-      merges.insert(c);
-    }
-  };
-  // Enqueue all the outputs of the switch cluster in the workqueue.
-  for (auto* out : switch_cluster->out_nodes) {
-    enqueue_or_update_merge(out);
-  }
-  std::unordered_set<Cluster*> visited;
-  while (!queue.empty()) {
-    Cluster* cur = queue.front();
-    queue.pop_front();
-    if (visited.find(cur) != visited.end()) {
+StatusOr<
+    std::pair<std::unordered_map<Node*, FunctionalizeCond::ForwardFlowNode>,
+              std::unordered_set<Node*>>>
+FunctionalizeCond::DetermineBranchMapAndFrontier(
+    const SwitchCluster& switch_cluster) {
+  std::unordered_map<Node*, ForwardFlowNode> branch_map;
+  std::unordered_set<Node*> frontier;
+  std::vector<Node*> stack = switch_cluster.switches;
+  std::vector<bool> visited(graph_->num_node_ids(), false);
+  while (!stack.empty()) {
+    Node* n = stack.back();
+    stack.pop_back();
+
+    if (visited[n->id()]) {
       continue;
     }
-    visited.insert(cur);
-    // Ensure all inputs to the current node are in the dominated set.
-    for (Cluster* in : cur->in_nodes) {
-      if (dominated.find(in) == dominated.end()) {
-        return gtl::nullopt;
+    visited[n->id()] = true;
+
+    // Propagate branch state along each edge of a switch node.
+    bool sink_only = true;
+    for (const Edge* e : n->out_edges()) {
+      Node* out = e->dst();
+      if (!out->IsOp()) {
+        continue;
+      }
+      sink_only = false;
+      // Propagate branch information.
+      ForwardFlowNode& ffn = branch_map[out];
+      if (IsSwitch(n)) {
+        int index = e->IsControlEdge() ? Branch::kNeither : e->src_output();
+        TF_RETURN_IF_ERROR(Join(ForwardFlowNode(Branch(index)), out, &ffn));
+      } else {
+        TF_RETURN_IF_ERROR(Join(branch_map[n], out, &ffn));
+      }
+      if (IsMerge(out)) {
+        if (out->in_edges().size() == ffn.count) {
+          frontier.insert(out);
+        }
+      } else if (!visited[out->id()]) {
+        stack.push_back(out);
       }
     }
-    for (Cluster* out : cur->out_nodes) {
-      // No switch nodes beyond the entry one is expected.
-      if (!out->switch_nodes.empty()) {
-        return gtl::nullopt;
+    if (sink_only) {
+      if (!IsIdentity(n)) {
+        VLOG(1) << "Feeding into sink: " << n->DebugString();
       }
-      enqueue_or_update_merge(out);
     }
   }
-  auto it = merges.begin();
-  Cluster* merge_cluster = *it;
-  for (++it; it != merges.end(); ++it) {
-    ContractEdge(*it, merge_cluster);
-  }
-
-  // TODO(jpienaar): Clean up graph, merging nodes.
 
-  return merge_cluster;
+  if (dump_graphs_) {
+    for (const auto& kv : branch_map) {
+      // Append attribute to the graph if running with logging to make the
+      // changes clearer in the visualization.
+      kv.first->AddAttr("_XlaFunctionalizeBranch",
+                        Branch_Name(kv.second.branch));
+    }
+  }
+  return std::make_pair(std::move(branch_map), std::move(frontier));
 }
 
-xla::StatusOr<FunctionalizeCond::CondArgs> FunctionalizeCond::DetermineCondArgs(
-    const Cluster& merge_cluster, const Cluster& switch_cluster) {
-  VLOG(2) << "DetermineCondArgs for " << merge_cluster.representative
-          << " with switch cluster " << switch_cluster.representative;
-  CondArgs ret;
-  auto feeds_into_branch_cluster = [&](Node* switch_cluster) {
-    for (Node* out : switch_cluster->out_nodes()) {
-      ClusterHandle repr = Representative(out);
-      if (repr == merge_cluster.representative) {
-        return true;
-      }
-      for (Cluster* in : merge_cluster.in_nodes) {
-        if (repr == in->representative) {
-          return true;
-        }
+Status FunctionalizeCond::FunctionalizeInternal() {
+  TF_ASSIGN_OR_RETURN(std::vector<SwitchCluster> predicate_switch_order,
+                      DeterminePredicateSwitchOrder());
+
+  // Iterate from innermost set of clustered switches to outermost, replacing
+  // matching switch->merge subgraphs with single XlaIf nodes.
+  for (auto it = predicate_switch_order.rbegin();
+       it != predicate_switch_order.rend(); ++it) {
+    auto& ps = *it;
+    VLOG(3) << "Flow down from: " << NodesToString(ps.switches) << " ("
+            << ps.predicate->name() << ")";
+
+    std::unordered_map<Node*, ForwardFlowNode> branch_map;
+    std::unordered_set<Node*> frontier;
+    TF_ASSIGN_OR_RETURN(std::tie(branch_map, frontier),
+                        DetermineBranchMapAndFrontier(ps));
+
+    if (dump_graphs_)
+      LOG(INFO) << "FunctionalizeControlFlow (before XlaIf conversion): "
+                << dump_graph::DumpGraphToFile("functionalize_bc", *graph_,
+                                               library_);
+    TF_RETURN_IF_ERROR(ValidateFrontier(branch_map, frontier));
+
+    // Sort the merge and switch nodes using NodeCmp. The switch-nodes are
+    // further grouped (post sorting) by input to the switch node as in the
+    // functionalized form each input will be passed in only once. This grouping
+    // should retain the sorted order.
+    CondArgNodes cond_arg_nodes;
+    std::unordered_map<Node*, int> input_index;
+    std::sort(ps.switches.begin(), ps.switches.end(), NodeCmp());
+    for (Node* switch_node : ps.switches) {
+      Node* in;
+      TF_RETURN_IF_ERROR(switch_node->input_node(0, &in));
+      if (input_index.find(in) == input_index.end()) {
+        input_index[in] = cond_arg_nodes.size();
+        cond_arg_nodes.emplace_back(in);
       }
+      cond_arg_nodes.at(input_index.at(in)).switches.push_back(switch_node);
     }
-    return false;
-  };
-  for (Node* switch_cluster_node : switch_cluster.switch_nodes) {
-    if (!feeds_into_branch_cluster(switch_cluster_node)) {
-      continue;
+    std::vector<Node*> merge_nodes(frontier.begin(), frontier.end());
+    std::sort(merge_nodes.begin(), merge_nodes.end(), NodeCmp());
+
+    TF_ASSIGN_OR_RETURN(std::vector<Node*> old_control_nodes,
+                        EnsureDominanceAndReturnNonDominatedControlNodes(
+                            branch_map, ps.switches));
+
+    TF_ASSIGN_OR_RETURN(Node * if_node,
+                        ConvertToXlaIf(cond_arg_nodes, ps, merge_nodes));
+    for (Node* old : old_control_nodes) {
+      graph_->AddControlEdge(old, if_node);
     }
 
-    Node* tmp;
-    TF_RETURN_IF_ERROR(switch_cluster_node->input_node(1, &tmp));
-    if (ret.conditional == nullptr) {
-      ret.conditional = tmp;
-    } else if (ret.conditional != tmp) {
-      return errors::Unimplemented(
-          "Switch statements with different conditionals cannot be "
-          "converted into functional conditional.");
+    for (auto& del_kv : branch_map) {
+      graph_->RemoveNode(del_kv.first);
+    }
+    for (auto& kv : cond_arg_nodes) {
+      for (Node* node : kv.switches) {
+        graph_->RemoveNode(node);
+      }
     }
-    ret.args.insert(switch_cluster_node);
+    if (dump_graphs_)
+      LOG(INFO) << "FunctionalizeControlFlow (after XlaIf conversion): "
+                << dump_graph::DumpGraphToFile("functionalize_ac", *graph_,
+                                               library_);
   }
-  return ret;
+  return Status::OK();
 }
 
-xla::StatusOr<Node*> FunctionalizeCond::BuildAndAddXlaIfOp(
-    const CondArgs& cond_args, const Cluster& merge_cluster,
-    const std::vector<Node*>& outputs) {
-  VLOG(2) << "Build if op for " << NodesToString(merge_cluster.merge_nodes)
-          << " with input " << NodesToString(cond_args.args);
+StatusOr<Node*> FunctionalizeCond::BuildAndAddXlaIfOp(
+    const CondArgNodes& cond_arg_nodes, const SwitchCluster& switch_cluster,
+    const std::vector<Node*>& merge_nodes) {
+  VLOG(2) << "Build if op for " << switch_cluster.name;
 
   NodeDef if_def;
   // Create a new If node using the name of the merge node.
-  NodeDefBuilder builder(
-      strings::StrCat((*merge_cluster.merge_nodes.begin())->name(), "_If"),
-      "XlaIf");
+  NodeDefBuilder builder(switch_cluster.name, "XlaIf");
   string branch[] = {"else_branch", "then_branch"};
   for (int i = 0; i < 2; ++i) {
     static std::atomic<int64> sequence_num(0LL);
@@ -1136,8 +1159,8 @@ xla::StatusOr<Node*> FunctionalizeCond::BuildAndAddXlaIfOp(
     body_name.set_name(
         strings::StrCat("_functionalize_if_", branch[i], "_", id));
     auto body = xla::MakeUnique<Graph>(graph_->op_registry());
-    TF_RETURN_IF_ERROR(
-        ExtractBody(cond_args, merge_cluster, outputs, i, body.get()));
+    TF_RETURN_IF_ERROR(ExtractBody(cond_arg_nodes, switch_cluster.switches,
+                                   merge_nodes, i, body.get()));
     VLOG(3) << "Body " << branch[i] << ": " << DebugString(body.get());
     FunctionDef body_fdef;
     TF_RETURN_IF_ERROR(GraphToFunctionDef(*body, body_name.name(), &body_fdef));
@@ -1148,33 +1171,40 @@ xla::StatusOr<Node*> FunctionalizeCond::BuildAndAddXlaIfOp(
   // Build input type.
   std::vector<NodeDefBuilder::NodeOut> inputs;
   DataTypeVector in_arg_types;
-  for (const Node* arg : cond_args.args) {
-    const Edge* in_edge;
-    TF_RETURN_IF_ERROR(arg->input_edge(0, &in_edge));
-    if (in_edge->IsControlEdge()) {
-      builder.ControlInput(in_edge->src()->name());
-    } else {
-      DataType dtype = arg->input_type(0);
-      inputs.emplace_back(NodeDefBuilder::NodeOut(
-          in_edge->src()->name(), in_edge->src_output(), dtype));
-      in_arg_types.push_back(dtype);
+  for (auto& kv : cond_arg_nodes) {
+    bool inserted = false;
+    for (const Node* arg : kv.switches) {
+      const Edge* in_edge;
+      TF_RETURN_IF_ERROR(arg->input_edge(0, &in_edge));
+      if (in_edge->IsControlEdge()) {
+        builder.ControlInput(in_edge->src()->name());
+      } else {
+        if (!inserted) {
+          DataType dtype = arg->input_type(0);
+          inputs.emplace_back(NodeDefBuilder::NodeOut(
+              in_edge->src()->name(), in_edge->src_output(), dtype));
+          in_arg_types.push_back(dtype);
+          inserted = true;
+        }
+      }
     }
   }
   builder.Attr("Tin", in_arg_types);
 
   // Build output type.
   DataTypeVector out_type;
-  for (const Node* merge : merge_cluster.merge_nodes) {
+  for (const Node* merge : merge_nodes) {
     DataType dtype = merge->output_type(0);
     out_type.push_back(dtype);
   }
   builder.Attr("Tout", out_type);
 
   builder.Attr("Tcond", DT_BOOL);
-  builder.Device(cond_args.conditional->assigned_device_name());
+  builder.Device(switch_cluster.predicate->assigned_device_name());
   // Conditional should be the first input ...
-  builder.Input(NodeDefBuilder::NodeOut(cond_args.conditional->name(), 0,
-                                        cond_args.conditional->output_type(0)));
+  builder.Input(
+      NodeDefBuilder::NodeOut(switch_cluster.predicate->name(), 0,
+                              switch_cluster.predicate->output_type(0)));
   // ... followed by the other inputs.
   builder.Input(inputs);
 
@@ -1183,64 +1213,31 @@ xla::StatusOr<Node*> FunctionalizeCond::BuildAndAddXlaIfOp(
   return if_node;
 }
 
-void FunctionalizeCond::RemoveClusterNodes(Cluster* cluster) {
-  VLOG(3) << "RemoveClusterNodes for " << cluster->representative;
-  ClusterHandle repr = cluster->representative;
-  std::deque<Node*> to_delete;
-  for (Node* node : graph_->nodes()) {
-    if (Representative(node) == repr) {
-      to_delete.push_back(node);
-    }
-  }
-  for (Node* n : to_delete) {
-    graph_->RemoveNode(n);
-  }
-}
-
-template <class T>
-void FunctionalizeCond::RemoveUnusedArgs(const T& args) {
-  VLOG(2) << "RemoveUnusedArgs among: " << NodesToString(args);
-
-  std::deque<Node*> to_delete;
-  for (Node* arg : args) {
-    if (IsDeadSwitch(arg)) {
-      to_delete.push_back(arg);
-      for (Node* n : arg->out_nodes()) {
-        to_delete.push_back(n);
-      }
-    }
-  }
-  for (Node* n : to_delete) {
-    switch_nodes_.erase(n);
-    auto it = clustered_graph_.find(Representative(n));
-    if (it != clustered_graph_.end()) {
-      it->second.switch_nodes.erase(n);
-    }
-    graph_->RemoveNode(n);
-  }
-}
-
-Status FunctionalizeCond::ExtractBody(const CondArgs& cond_args,
-                                      const Cluster& merge_cluster,
-                                      const std::vector<Node*>& outputs,
+Status FunctionalizeCond::ExtractBody(const CondArgNodes& cond_arg_nodes,
+                                      const std::vector<Node*>& switches,
+                                      const std::vector<Node*>& merge_nodes,
                                       int input_edge, Graph* body) {
-  VLOG(2) << "ExtractBody for " << merge_cluster.representative
-          << " along edge " << input_edge;
+  VLOG(2) << "ExtractBody for " << NodesToString(merge_nodes) << " along edge "
+          << input_edge;
   std::vector<bool> squash_src_outputs(graph_->num_node_ids(), false);
   std::vector<Node*> node_map(graph_->num_node_ids(), nullptr);
   int arg_count = 0;
-  for (const auto* arg : cond_args.args) {
-    DataType dtype = arg->input_type(0);
-    TF_ASSIGN_OR_RETURN(Node * arg_node,
-                        BuildArgNode(body, dtype, arg_count++));
-    node_map.at(arg->id()) = arg_node;
-    squash_src_outputs.at(arg->id()) = true;
+  for (auto& kv : cond_arg_nodes) {
+    Node* arg_node = nullptr;
+    for (const auto* arg : kv.switches) {
+      DataType dtype = arg->input_type(0);
+      if (arg_node == nullptr) {
+        TF_ASSIGN_OR_RETURN(arg_node, BuildArgNode(body, dtype, arg_count++));
+      }
+      node_map.at(arg->id()) = arg_node;
+      squash_src_outputs.at(arg->id()) = true;
+    }
   }
 
   std::vector<Node*> stack;
-  stack.reserve(outputs.size());
-  for (int j = 0; j < outputs.size(); ++j) {
-    Node* node = outputs[j];
+  stack.reserve(merge_nodes.size());
+  for (int j = 0; j < merge_nodes.size(); ++j) {
+    Node* node = merge_nodes[j];
     TF_ASSIGN_OR_RETURN(node_map.at(node->id()),
                         BuildRetvalNode(body, node->output_type(0),
                                         /*index=*/j));
@@ -1251,7 +1248,7 @@ Status FunctionalizeCond::ExtractBody(const CondArgs& cond_args,
       node_map.at(in->id()) = body->CopyNode(in);
     }
 
-    if (cond_args.args.find(in) == cond_args.args.end()) {
+    if (std::find(switches.begin(), switches.end(), in) == switches.end()) {
       body->AddEdge(node_map.at(in->id()), in_edge->src_output(),
                     node_map.at(node->id()), 0);
     } else {
@@ -1266,18 +1263,25 @@ Status FunctionalizeCond::ExtractBody(const CondArgs& cond_args,
                       body);
 }
 
-Status FunctionalizeCond::AddInputEdges(const CondArgs& cond_args,
-                                        Node* if_node) {
+Status FunctionalizeCond::AddInputEdges(const CondArgNodes& cond_arg_nodes,
+                                        Node* predicate, Node* if_node) {
   VLOG(3) << "AddInputEdges for " << if_node->name();
-  int i = 0;
-  graph_->AddEdge(cond_args.conditional, 0, if_node, i++);
-  for (const Node* arg : cond_args.args) {
-    const Edge* in_edge;
-    TF_RETURN_IF_ERROR(arg->input_edge(0, &in_edge));
-    if (in_edge->IsControlEdge()) {
-      graph_->AddControlEdge(in_edge->src(), if_node);
-    } else {
-      graph_->AddEdge(in_edge->src(), in_edge->src_output(), if_node, i++);
+  int index = 0;
+  graph_->AddEdge(predicate, 0, if_node, index++);
+  for (auto& kv : cond_arg_nodes) {
+    bool inserted = false;
+    for (const Node* arg : kv.switches) {
+      const Edge* in_edge;
+      TF_RETURN_IF_ERROR(arg->input_edge(0, &in_edge));
+      if (in_edge->IsControlEdge()) {
+        graph_->AddControlEdge(in_edge->src(), if_node);
+      } else {
+        if (!inserted) {
+          graph_->AddEdge(in_edge->src(), in_edge->src_output(), if_node,
+                          index++);
+          inserted = true;
+        }
+      }
     }
   }
   return Status::OK();
@@ -1308,196 +1312,39 @@ Status FunctionalizeCond::AddOutputEdges(const std::vector<Node*>& outputs,
   return Status::OK();
 }
 
-void FunctionalizeCond::RemoveMergeNodes(Cluster* merge_cluster) {
-  VLOG(3) << "RemoveMergeNodes for " << merge_cluster->representative;
-  // Remove all merge nodes now dead post extraction of If.
-  for (auto it = merge_cluster->merge_nodes.begin();
-       it != merge_cluster->merge_nodes.end();) {
-    Node* node = *it;
-    graph_->RemoveNode(node);
-    merge_cluster->merge_nodes.erase(*it++);
-  }
-}
-
-Status FunctionalizeCond::RemoveTrivialSwitch(Cluster* switch_cluster) {
-  Cluster* merge_cluster = *switch_cluster->out_nodes.begin();
-  if (merge_cluster->merge_nodes.empty()) {
-    return errors::FailedPrecondition(
-        "Not a trivial switch: no Merge node feeding into Switch node");
-  }
-
-  for (auto it = merge_cluster->merge_nodes.begin();
-       it != merge_cluster->merge_nodes.end();) {
-    // We have the following structure:
-    //   Op -> Switch -> Merge -> Consumer
-    // and we want to transform it to:
-    //   Op -> Consumer
-    Node* merge_node = *it;
-    Node* switch_node;
-    const Edge* in = nullptr;
-    TF_RETURN_IF_ERROR(merge_node->input_node(0, &switch_node));
-    TF_RETURN_IF_ERROR(switch_node->input_edge(0, &in));
-    for (auto out : merge_node->out_edges()) {
-      int src_output = out->dst_input() == Graph::kControlSlot
-                           ? Graph::kControlSlot
-                           : in->src_output();
-      graph_->AddEdge(in->src(), src_output, out->dst(), out->dst_input());
-    }
-    graph_->RemoveNode(*it++);
-  }
-  RemoveUnusedArgs(switch_cluster->switch_nodes);
-
-  return Status::OK();
-}
-
-Status FunctionalizeCond::ConvertCorrespondingMergeToXlaIf(
-    Cluster* switch_cluster) {
-  VLOG(1) << "ConvertMergeToXlaIf for " << switch_cluster->representative;
-  gtl::optional<Cluster*> maybe_merge =
-      CreateCorrespondingMergeCluster(switch_cluster);
-  if (!maybe_merge.has_value()) {
-    return errors::FailedPrecondition(
-        "Switch cluster was not part of a simple conditional in the clustered "
-        "graph. Graph nodes in switch cluster ",
-        NodesToString(switch_cluster->switch_nodes));
-  }
-  Cluster* merge_cluster = *maybe_merge;
-  if (merge_cluster->merge_nodes.empty()) {
-    return errors::Internal(
-        "Merge node in clustered graph contains no merge nodes: ",
-        merge_cluster->representative.ToString());
-  }
-  TF_ASSIGN_OR_RETURN(auto cond_args,
-                      DetermineCondArgs(*merge_cluster, *switch_cluster));
-
-  // Sort the outputs by ID to produce more stable output.
-  std::vector<Node*> outputs(merge_cluster->merge_nodes.begin(),
-                             merge_cluster->merge_nodes.end());
-  std::sort(outputs.begin(), outputs.end(), CondArgs::CondCmp());
+StatusOr<Node*> FunctionalizeCond::ConvertToXlaIf(
+    const CondArgNodes& cond_arg_nodes, const SwitchCluster& switch_cluster,
+    const std::vector<Node*>& merge_nodes) {
+  VLOG(1) << "ConvertToXlaIf for " << switch_cluster.ToString() << " -> "
+          << NodesToString(merge_nodes);
 
   // Extract bodies and builds a If operator.
-  TF_ASSIGN_OR_RETURN(Node * if_node,
-                      BuildAndAddXlaIfOp(cond_args, *merge_cluster, outputs));
-  TF_RETURN_IF_ERROR(AddInputEdges(cond_args, if_node));
-  TF_RETURN_IF_ERROR(AddOutputEdges(outputs, if_node));
-
-  // Remove the old nodes from the graph_ and contract the edges of the
-  // clustered graph.
-  for (auto in : merge_cluster->in_nodes) {
-    if (in != switch_cluster) {
-      RemoveClusterNodes(in);
-    }
-  }
-  RemoveMergeNodes(merge_cluster);
-  RemoveUnusedArgs(cond_args.args);
-  auto in_nodes = merge_cluster->in_nodes;
-  for (auto it = in_nodes.begin(); it != in_nodes.end();) {
-    ContractEdge(*it++, switch_cluster);
-  }
-  ContractEdge(merge_cluster, switch_cluster);
-  clusters_[if_node].Get() = ClusterHandle(switch_cluster->representative);
-
-  return Status::OK();
-}
-
-std::vector<std::pair<int, FunctionalizeCond::Cluster*>>
-FunctionalizeCond::SortedSwitchNodes() {
-  VLOG(2) << "ProcessClusteredGraph";
-  std::stack<std::pair<int, Cluster*>> stack;
-  // Initialize with the source node.
-  stack.push({0, &clustered_graph_[Representative(graph_->source_node())]});
-
-  // Perform a depth-first traversal of the clustered graph computing the
-  // switch-merge depth.
-  std::vector<std::pair<int, Cluster*>> queue;
-  std::unordered_set<Cluster*> visited;
-  while (!stack.empty()) {
-    Cluster* n = stack.top().second;
-    size_t depth = stack.top().first;
-    stack.pop();
-
-    auto inserted = visited.insert(n);
-    if (!inserted.second) {
-      continue;
-    }
-
-    size_t new_depth = depth;
-    if (!n->merge_nodes.empty()) {
-      --new_depth;
-    }
-    if (!n->switch_nodes.empty()) {
-      queue.emplace_back(depth, n);
-      ++new_depth;
-    }
-    for (Cluster* e : n->out_nodes) {
-      stack.emplace(new_depth, e);
-    }
-  }
-
-  // Sort in reverse order of switch-merge depth with ties broken by the
-  // ClusterHandle.
-  std::sort(queue.begin(), queue.end(),
-            [](const std::pair<int, Cluster*>& lhs,
-               const std::pair<int, Cluster*>& rhs) {
-              return std::tie(lhs.first, lhs.second->representative) >
-                     std::tie(rhs.first, rhs.second->representative);
-            });
+  TF_ASSIGN_OR_RETURN(
+      Node * if_node,
+      BuildAndAddXlaIfOp(cond_arg_nodes, switch_cluster, merge_nodes));
+  TF_RETURN_IF_ERROR(
+      AddInputEdges(cond_arg_nodes, switch_cluster.predicate, if_node));
+  TF_RETURN_IF_ERROR(AddOutputEdges(merge_nodes, if_node));
 
-  return queue;
+  return if_node;
 }
 
 Status FunctionalizeCond::Functionalize(Graph* graph,
                                         FunctionLibraryDefinition* library) {
   VLOG(1) << "FunctionalizeCond::Functionalize";
-  FunctionalizeCond fc(graph, library);
-  fc.CreateClusters();
-  if (fc.NoConditionals()) {
-    return Status::OK();
-  }
-  fc.CreateClusteredGraph();
-
-  auto queue = fc.SortedSwitchNodes();
-  for (auto it = queue.begin(); it != queue.end();) {
-    Cluster* switch_cluster = (*it).second;
-    ++it;
-    if (switch_cluster->out_nodes.size() == 1) {
-      TF_RETURN_IF_ERROR(fc.RemoveTrivialSwitch(switch_cluster));
-    } else {
-      TF_RETURN_IF_ERROR(fc.ConvertCorrespondingMergeToXlaIf(switch_cluster));
-    }
-
-    // Contract newly Switch free switch_cluster with outgoing nodes without
-    // Switch or Merge nodes.
-    for (auto& nodes : {switch_cluster->out_nodes, switch_cluster->in_nodes}) {
-      std::vector<Cluster*> copy_nodes(nodes.begin(), nodes.end());
-      for (auto* node : copy_nodes) {
-        if (node->merge_nodes.empty() && node->switch_nodes.empty()) {
-          fc.ContractEdge(node, switch_cluster);
-        }
-      }
-    }
-
-    VLOG(3) << "Graph with clusters: "
-            << DebugString(*fc.graph_, &fc.clusters_);
-    VLOG(3) << "ClusteredGraph: " << DebugString(fc.clustered_graph_);
-  }
-
-  if (!fc.switch_nodes_.empty()) {
-    return errors::Internal(
-        "Failed to functionalize control flow with Switch nodes remaining: ",
-        NodesToString(fc.switch_nodes_));
-  }
-  return Status::OK();
+  FunctionalizeCond fc(graph, library, /*dump_graphs=*/VLOG_IS_ON(2));
+  return fc.FunctionalizeInternal();
 }
 
 }  // namespace
 
-// Transformation that converts Tensorflow's graph control flow constructs into
+// Transformation that converts TensorFlow's graph control flow constructs into
 // functional equivalents.
 Status FunctionalizeControlFlow(Graph* graph,
                                 FunctionLibraryDefinition* library) {
   VLOG(2) << "FunctionalizeControlFlow (initial): "
-          << dump_graph::DumpGraphToFile("functionalize_initial", *graph);
+          << dump_graph::DumpGraphToFile("functionalize_initial", *graph,
+                                         library);
   // Note: BuildControlFlowInfo() requires that the graph's source node is
   // connected to all source nodes in the graph. Many graphs violate this
   // invariant.
@@ -1509,7 +1356,8 @@ Status FunctionalizeControlFlow(Graph* graph,
   for (Node* node : graph->op_nodes()) {
     const ControlFlowInfo& cf = cf_info[node->id()];
 
-    VLOG(2) << "node: " << node->name() << " frame_name: " << cf.frame_name
+    VLOG(2) << "node: " << node->name() << " (" << node->id()
+            << ") frame_name: " << cf.frame_name
             << " frame: " << (cf.frame ? cf.frame->name() : "---")
             << " parent_frame: "
             << (cf.parent_frame ? cf.parent_frame->name() : "---");
@@ -1577,7 +1425,8 @@ Status FunctionalizeControlFlow(Graph* graph,
   TF_RETURN_IF_ERROR(FunctionalizeCond::Functionalize(graph, library));
 
   VLOG(2) << "FunctionalizeControlFlow (final): "
-          << dump_graph::DumpGraphToFile("functionalize_final", *graph);
+          << dump_graph::DumpGraphToFile("functionalize_final", *graph,
+                                         library);
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
index 01d2b282751f387cfa9c8887cdeb48090c96bff4..bc7276c3afd5060d6faeceb4d479416299ecc5da 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
@@ -38,10 +38,11 @@ namespace {
 
 // Returns the names of the "then" and "else" functions for the XlaIf node in a
 // graph.
-Status FindIfThenAndElse(const GraphDef& graph, NameAttrList* then_fn,
-                         NameAttrList* else_fn) {
+Status FindIfThenAndElse(const GraphDef& graph, string* op_name,
+                         NameAttrList* then_fn, NameAttrList* else_fn) {
   for (const NodeDef& node : graph.node()) {
     if (node.op() == "XlaIf") {
+      *op_name = node.name();
       const NameAttrList* result;
       TF_RETURN_IF_ERROR(GetNodeAttr(node, "then_branch", &result));
       *then_fn = *result;
@@ -96,9 +97,10 @@ TEST(FunctionalizeControlFlow, Conditional) {
 
   GraphDef graph_def;
   graph.ToGraphDef(&graph_def);
+  string op_name;
   NameAttrList then_fn;
   NameAttrList else_fn;
-  TF_EXPECT_OK(FindIfThenAndElse(graph_def, &then_fn, &else_fn));
+  TF_EXPECT_OK(FindIfThenAndElse(graph_def, &op_name, &then_fn, &else_fn));
   InstantiationResultForTest else_result;
   TF_EXPECT_OK(
       InstantiateFunctionForTest(else_fn.name(), library, &else_result));
@@ -109,7 +111,7 @@ TEST(FunctionalizeControlFlow, Conditional) {
     auto y = ops::Placeholder(scope.WithOpName("y"), DT_INT32);
     auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32);
     auto less = ops::Less(scope.WithOpName("cond/Less"), y, x);
-    auto if_op = ops::XlaIf(scope.WithOpName("cond/Merge_If"), less,
+    auto if_op = ops::XlaIf(scope.WithOpName(op_name), less,
                             std::initializer_list<Input>{less, y, x}, then_fn,
                             else_fn, {DT_INT32});
     GraphDef expected;
diff --git a/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md b/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md
new file mode 100644
index 0000000000000000000000000000000000000000..91351421bcacd26c41b5c9f98ea833730e4aef30
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md
@@ -0,0 +1,266 @@
+**Supported operators for device: XLA_CPU_JIT**
+
+Operator                              | Type Constraint
+------------------------------------- | ---------------
+`Abs`                                 | `T={double,float,int32,int64}`
+`Acosh`                               | `T={complex64,double,float}`
+`Add`                                 | `T={complex64,double,float,int32,int64}`
+`AddN`                                | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`AdjustContrastv2`                    |
+`AdjustHue`                           |
+`AdjustSaturation`                    |
+`All`                                 | `Tidx={int32,int64}`
+`Angle`                               | `Tout={double,float}`<br>`T={complex64}`
+`Any`                                 | `Tidx={int32,int64}`
+`ApproximateEqual`                    | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`ArgMax`                              | `Tidx={int32,int64}`<br>`output_type={int32,int64}`<br>`T={float}`
+`ArgMin`                              | `Tidx={int32,int64}`<br>`output_type={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`Asinh`                               | `T={complex64,double,float}`
+`AssignAddVariableOp`                 | `dtype={complex64,double,float,int32,int64,uint32,uint64}`
+`AssignSubVariableOp`                 | `dtype={complex64,double,float,int32,int64,uint32,uint64}`
+`AssignVariableOp`                    | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Atan2`                               | `T={double,float}`
+`Atanh`                               | `T={complex64,double,float}`
+`AvgPool`                             | `T={double,float}`
+`AvgPool3D`                           | `T={double,float}`
+`AvgPool3DGrad`                       | `T={double,float}`
+`AvgPoolGrad`                         | `T={double,float}`
+`BatchMatMul`                         | `T={complex64,double,float,int32}`
+`BatchToSpace`                        | `Tidx={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`BatchToSpaceND`                      | `Tcrops={int32,int64}`<br>`Tblock_shape={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`BiasAdd`                             | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`BiasAddGrad`                         | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`BiasAddV1`                           | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`BitwiseAnd`                          | `T={int32,int64,uint32,uint64}`
+`BitwiseOr`                           | `T={int32,int64,uint32,uint64}`
+`BroadcastArgs`                       | `T={int32,int64}`
+`BroadcastGradientArgs`               | `T={int32,int64}`
+`Cast`                                | `DstT={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`SrcT={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Ceil`                                | `T={double,float}`
+`Cholesky`                            | `T={double,float}`
+`Complex`                             | `Tout={complex64}`<br>`T={double,float}`
+`ComplexAbs`                          | `Tout={double,float}`<br>`T={complex64}`
+`Concat`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ConcatOffset`                        |
+`ConcatV2`                            | `Tidx={int32}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Conj`                                | `T={complex64}`
+`Const`                               | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ControlTrigger`                      |
+`Conv2D`                              | `T={float}`
+`Conv2DBackpropFilter`                | `T={float}`
+`Conv2DBackpropInput`                 | `T={float}`
+`Conv3D`                              | `T={double,float}`
+`Conv3DBackpropFilterV2`              | `T={double,float}`
+`Conv3DBackpropInputV2`               | `T={double,float}`
+`Cos`                                 | `T={complex64,double,float}`
+`Cosh`                                | `T={complex64,double,float}`
+`Cross`                               | `T={double,float,int32,int64,uint32,uint64}`
+`Cumprod`                             | `Tidx={int32,int64}`<br>`T={float}`
+`Cumsum`                              | `Tidx={int32,int64}`<br>`T={float}`
+`DepthToSpace`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`DepthwiseConv2dNative`               | `T={double,float}`
+`DepthwiseConv2dNativeBackpropFilter` | `T={double,float}`
+`DepthwiseConv2dNativeBackpropInput`  | `T={double,float}`
+`Diag`                                | `T={complex64,double,float,int32,int64}`
+`DiagPart`                            | `T={complex64,double,float,int32,int64}`
+`Div`                                 | `T={complex64,double,float,int32,int64}`
+`DynamicStitch`                       | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Elu`                                 | `T={double,float}`
+`EluGrad`                             | `T={double,float}`
+`Equal`                               | `T={bool,complex64,double,float,int32,int64}`
+`Exp`                                 | `T={complex64,double,float}`
+`ExpandDims`                          | `Tdim={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Expm1`                               | `T={complex64,double,float}`
+`ExtractImagePatches`                 | `T={double,float,int32,int64,uint32,uint64}`
+`FFT`                                 |
+`FFT2D`                               |
+`FFT3D`                               |
+`Fill`                                | `index_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Floor`                               | `T={double,float}`
+`FloorDiv`                            | `T={complex64,double,float,int32,int64}`
+`FloorMod`                            | `T={double,float,int32,int64}`
+`FusedBatchNorm`                      | `T={float}`
+`FusedBatchNormGrad`                  | `T={float}`
+`FusedBatchNormGradV2`                | `U={float}`<br>`T={float}`
+`FusedBatchNormV2`                    | `U={float}`<br>`T={float}`
+`Gather`                              | `Tindices={int32,int64}`<br>`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`GatherV2`                            | `Taxis={int32,int64}`<br>`Tindices={int32,int64}`<br>`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Greater`                             | `T={double,float,int32,int64,uint32,uint64}`
+`GreaterEqual`                        | `T={double,float,int32,int64,uint32,uint64}`
+`HSVToRGB`                            | `T={double,float}`
+`IFFT`                                |
+`IFFT2D`                              |
+`IFFT3D`                              |
+`IRFFT`                               |
+`IRFFT2D`                             |
+`IRFFT3D`                             |
+`Identity`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`IdentityN`                           | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Imag`                                | `Tout={double,float}`<br>`T={complex64}`
+`Inv`                                 | `T={complex64,double,float,int32,int64}`
+`Invert`                              | `T={int32,int64,uint32,uint64}`
+`InvertPermutation`                   | `T={int32}`
+`IsFinite`                            | `T={double,float}`
+`IsInf`                               | `T={double,float}`
+`IsNan`                               | `T={double,float}`
+`L2Loss`                              | `T={double,float}`
+`LRN`                                 | `T={float}`
+`LRNGrad`                             | `T={float}`
+`LeftShift`                           | `T={int32,int64,uint32,uint64}`
+`Less`                                | `T={double,float,int32,int64,uint32,uint64}`
+`LessEqual`                           | `T={double,float,int32,int64,uint32,uint64}`
+`LinSpace`                            | `Tidx={int32,int64}`<br>`T={double,float}`
+`Log`                                 | `T={complex64,double,float}`
+`Log1p`                               | `T={complex64,double,float}`
+`LogSoftmax`                          | `T={double,float}`
+`LogicalAnd`                          |
+`LogicalNot`                          |
+`LogicalOr`                           |
+`MatMul`                              | `T={complex64,double,float}`
+`MatrixDiag`                          | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`MatrixDiagPart`                      | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`MatrixTriangularSolve`               | `T={complex64,double,float}`
+`Max`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`MaxPool`                             | `T={double,float,int32,int64}`
+`MaxPool3D`                           | `T={float}`
+`MaxPool3DGrad`                       | `TInput={float}`<br>`T={float}`
+`MaxPoolGrad`                         | `T={double,float,int32,int64,uint32,uint64}`
+`MaxPoolGradV2`                       | `T={double,float,int32,int64,uint32,uint64}`
+`MaxPoolV2`                           | `T={double,float,int32,int64}`
+`Maximum`                             | `T={double,float,int32,int64}`
+`Mean`                                | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`Min`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`Minimum`                             | `T={double,float,int32,int64}`
+`MirrorPad`                           | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Mod`                                 | `T={double,float,int32,int64}`
+`Mul`                                 | `T={complex64,double,float,int32,int64}`
+`Multinomial`                         | `output_dtype={int32,int64}`<br>`T={double,float,int32,int64,uint32,uint64}`
+`Neg`                                 | `T={complex64,double,float,int32,int64}`
+`NoOp`                                |
+`NotEqual`                            | `T={bool,complex64,double,float,int32,int64}`
+`OneHot`                              | `TI={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`OnesLike`                            | `T={bool,complex64,double,float,int32,int64}`
+`Pack`                                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Pad`                                 | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`PadV2`                               | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ParallelDynamicStitch`               | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Pow`                                 | `T={complex64,double,float,int32,int64}`
+`PreventGradient`                     | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Prod`                                | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`QuantizeAndDequantizeV2`             | `T={double,float}`
+`RFFT`                                |
+`RFFT2D`                              |
+`RFFT3D`                              |
+`RGBToHSV`                            | `T={double,float}`
+`RandomStandardNormal`                | `dtype={float}`
+`RandomUniform`                       | `T={int32,int64}`<br>`dtype={double,float}`
+`RandomUniformInt`                    | `T={int32,int64}`<br>`Tout={int32,int64}`
+`Range`                               | `Tidx={double,float,int32,int64}`
+`Rank`                                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ReadVariableOp`                      | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Real`                                | `Tout={double,float}`<br>`T={complex64}`
+`RealDiv`                             | `T={complex64,double,float,int32,int64}`
+`Reciprocal`                          | `T={complex64,double,float,int32,int64}`
+`ReciprocalGrad`                      | `T={complex64,double,float}`
+`Relu`                                | `T={double,float,int32,int64,uint32,uint64}`
+`Relu6`                               | `T={double,float,int32,int64,uint32,uint64}`
+`Relu6Grad`                           | `T={double,float,int32,int64,uint32,uint64}`
+`ReluGrad`                            | `T={double,float,int32,int64,uint32,uint64}`
+`Reshape`                             | `Tshape={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ResizeBilinear`                      | `T={double,float,int32,int64}`
+`ResizeBilinearGrad`                  | `T={double,float}`
+`ResourceApplyAdagrad`                | `T={double,float}`
+`ResourceApplyAdam`                   | `T={double,float}`
+`ResourceApplyFtrl`                   | `T={double,float}`
+`ResourceApplyFtrlV2`                 | `T={double,float}`
+`ResourceApplyGradientDescent`        | `T={double,float}`
+`ResourceApplyMomentum`               | `T={double,float}`
+`ResourceApplyRMSProp`                | `T={double,float}`
+`ResourceGather`                      | `Tindices={int32,int64}`<br>`dtype={complex64,double,float,int32,int64,uint32,uint64}`
+`ResourceStridedSliceAssign`          | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Reverse`                             | `T={bool,complex64,double,float,int32,int64}`
+`ReverseSequence`                     | `Tlen={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ReverseV2`                           | `T={bool,complex64,double,float,int32,int64}`<br>`Tidx={int32,int64}`
+`RightShift`                          | `T={int32,int64,uint32,uint64}`
+`Rint`                                | `T={double,float}`
+`Round`                               | `T={complex64,double,float,int32,int64}`
+`Rsqrt`                               | `T={complex64,double,float}`
+`RsqrtGrad`                           | `T={complex64,double,float}`
+`Select`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Selu`                                | `T={double,float}`
+`SeluGrad`                            | `T={double,float}`
+`Shape`                               | `out_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ShapeN`                              | `out_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Sigmoid`                             | `T={complex64,double,float}`
+`SigmoidGrad`                         | `T={complex64,double,float}`
+`Sign`                                | `T={complex64,double,float,int32,int64}`
+`Sin`                                 | `T={complex64,double,float}`
+`Sinh`                                | `T={complex64,double,float}`
+`Size`                                | `out_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Slice`                               | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Softmax`                             | `T={double,float}`
+`SoftmaxCrossEntropyWithLogits`       | `T={double,float}`
+`Softplus`                            | `T={double,float,int32,int64,uint32,uint64}`
+`SoftplusGrad`                        | `T={double,float,int32,int64,uint32,uint64}`
+`Softsign`                            | `T={double,float,int32,int64,uint32,uint64}`
+`SoftsignGrad`                        | `T={double,float,int32,int64,uint32,uint64}`
+`SpaceToBatch`                        | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SpaceToBatchND`                      | `Tblock_shape={int32,int64}`<br>`Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SpaceToDepth`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SparseMatMul`                        | `Tb={float}`<br>`Ta={float}`
+`SparseSoftmaxCrossEntropyWithLogits` | `Tlabels={int32,int64}`<br>`T={double,float}`
+`Split`                               | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SplitV`                              | `Tlen={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Sqrt`                                | `T={complex64,double,float}`
+`SqrtGrad`                            | `T={complex64,double,float}`
+`Square`                              | `T={complex64,double,float,int32,int64}`
+`SquaredDifference`                   | `T={complex64,double,float,int32,int64}`
+`Squeeze`                             | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StackCloseV2`                        |
+`StackPopV2`                          | `elem_type={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StackPushV2`                         | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StackV2`                             | `elem_type={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StatelessRandomNormal`               | `Tseed={int32}`<br>`T={int32,int64}`<br>`dtype={float}`
+`StatelessRandomUniform`              | `Tseed={int32}`<br>`T={int32,int64}`<br>`dtype={float}`
+`StopGradient`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StridedSlice`                        | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StridedSliceGrad`                    | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Sub`                                 | `T={complex64,double,float,int32,int64}`
+`Sum`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`SymbolicGradient`                    | `Tout={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`Tin={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Tan`                                 | `T={complex64,double,float,int32,int64}`
+`Tanh`                                | `T={complex64,double,float}`
+`TanhGrad`                            | `T={complex64,double,float}`
+`TensorArrayCloseV3`                  |
+`TensorArrayConcatV3`                 | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayGatherV3`                 | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayGradV3`                   |
+`TensorArrayReadV3`                   | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayScatterV3`                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArraySizeV3`                   |
+`TensorArraySplitV3`                  | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayV3`                       | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayWriteV3`                  | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Tile`                                | `Tmultiples={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Transpose`                           | `Tperm={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TruncateDiv`                         | `T={complex64,double,float,int32,int64}`
+`TruncateMod`                         | `T={double,float,int32,int64}`
+`TruncatedNormal`                     | `T={int32,int64}`<br>`dtype={double,float}`
+`Unpack`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`UnsortedSegmentSum`                  | `Tnumsegments={int32,int64}`<br>`Tindices={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`VarIsInitializedOp`                  |
+`VariableShape`                       | `out_type={int32,int64}`
+`XlaWhile`                            | `T={bool,complex64,double,float,int32,int64,resource,uint32,uint64}`
+`ZerosLike`                           | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_Arg`                                | `T={bool,complex64,double,float,int32,int64,resource,uint32,uint64}`
+`_ArrayToList`                        | `out_types={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_ListToArray`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`Tin={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_Retval`                             | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_XLARecv`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_XLASend`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+
+To regenerate this table, run:
+
+```shell
+bazel run -c opt -- tensorflow/compiler/tf2xla:tf2xla_supported_ops --device=XLA_CPU_JIT
+```
diff --git a/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md b/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md
new file mode 100644
index 0000000000000000000000000000000000000000..b9bdb829d773825005a8921f48d28b6892d8f0cd
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md
@@ -0,0 +1,262 @@
+**Supported operators for device: XLA_GPU_JIT**
+
+Operator                              | Type Constraint
+------------------------------------- | ---------------
+`Abs`                                 | `T={double,float,int32,int64}`
+`Acosh`                               | `T={complex64,double,float}`
+`Add`                                 | `T={complex64,double,float,int32,int64}`
+`AddN`                                | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`AdjustContrastv2`                    |
+`AdjustHue`                           |
+`AdjustSaturation`                    |
+`All`                                 | `Tidx={int32,int64}`
+`Angle`                               | `Tout={double,float}`<br>`T={complex64}`
+`Any`                                 | `Tidx={int32,int64}`
+`ApproximateEqual`                    | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`ArgMax`                              | `Tidx={int32,int64}`<br>`output_type={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`ArgMin`                              | `Tidx={int32,int64}`<br>`output_type={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`Asinh`                               | `T={complex64,double,float}`
+`AssignAddVariableOp`                 | `dtype={complex64,double,float,int32,int64,uint32,uint64}`
+`AssignSubVariableOp`                 | `dtype={complex64,double,float,int32,int64,uint32,uint64}`
+`AssignVariableOp`                    | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Atan2`                               | `T={double,float}`
+`Atanh`                               | `T={complex64,double,float}`
+`AvgPool`                             | `T={double,float}`
+`AvgPool3D`                           | `T={double,float}`
+`AvgPool3DGrad`                       | `T={double,float}`
+`AvgPoolGrad`                         | `T={double,float}`
+`BatchMatMul`                         | `T={complex64,double,float,int32}`
+`BatchToSpace`                        | `Tidx={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`BatchToSpaceND`                      | `Tcrops={int32,int64}`<br>`Tblock_shape={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`BiasAdd`                             | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`BiasAddGrad`                         | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`BiasAddV1`                           | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`BitwiseAnd`                          | `T={int32,int64,uint32,uint64}`
+`BitwiseOr`                           | `T={int32,int64,uint32,uint64}`
+`BroadcastArgs`                       | `T={int32,int64}`
+`BroadcastGradientArgs`               | `T={int32,int64}`
+`Cast`                                | `DstT={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`SrcT={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Ceil`                                | `T={double,float}`
+`Cholesky`                            | `T={double,float}`
+`Complex`                             | `Tout={complex64}`<br>`T={double,float}`
+`ComplexAbs`                          | `Tout={double,float}`<br>`T={complex64}`
+`Concat`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ConcatOffset`                        |
+`ConcatV2`                            | `Tidx={int32}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Conj`                                | `T={complex64}`
+`Const`                               | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ControlTrigger`                      |
+`Conv2D`                              | `T={float}`
+`Conv2DBackpropFilter`                | `T={float}`
+`Conv2DBackpropInput`                 | `T={float}`
+`Conv3D`                              | `T={double,float}`
+`Conv3DBackpropFilterV2`              | `T={double,float}`
+`Conv3DBackpropInputV2`               | `T={double,float}`
+`Cos`                                 | `T={complex64,double,float}`
+`Cosh`                                | `T={complex64,double,float}`
+`Cross`                               | `T={double,float,int32,int64,uint32,uint64}`
+`Cumprod`                             | `Tidx={int32,int64}`<br>`T={float}`
+`Cumsum`                              | `Tidx={int32,int64}`<br>`T={float}`
+`DepthToSpace`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`DepthwiseConv2dNative`               | `T={double,float}`
+`DepthwiseConv2dNativeBackpropFilter` | `T={double,float}`
+`DepthwiseConv2dNativeBackpropInput`  | `T={double,float}`
+`Diag`                                | `T={complex64,double,float,int32,int64}`
+`DiagPart`                            | `T={complex64,double,float,int32,int64}`
+`Div`                                 | `T={complex64,double,float,int32,int64}`
+`DynamicStitch`                       | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Elu`                                 | `T={double,float}`
+`EluGrad`                             | `T={double,float}`
+`Equal`                               | `T={bool,complex64,double,float,int32,int64}`
+`Exp`                                 | `T={complex64,double,float}`
+`ExpandDims`                          | `Tdim={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Expm1`                               | `T={complex64,double,float}`
+`ExtractImagePatches`                 | `T={double,float,int32,int64,uint32,uint64}`
+`FFT`                                 |
+`FFT2D`                               |
+`FFT3D`                               |
+`Fill`                                | `index_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Floor`                               | `T={double,float}`
+`FloorDiv`                            | `T={complex64,double,float,int32,int64}`
+`FloorMod`                            | `T={double,float,int32,int64}`
+`FusedBatchNorm`                      | `T={float}`
+`FusedBatchNormGrad`                  | `T={float}`
+`FusedBatchNormGradV2`                | `U={float}`<br>`T={float}`
+`FusedBatchNormV2`                    | `U={float}`<br>`T={float}`
+`Gather`                              | `Tindices={int32,int64}`<br>`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`GatherV2`                            | `Taxis={int32,int64}`<br>`Tindices={int32,int64}`<br>`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Greater`                             | `T={double,float,int32,int64,uint32,uint64}`
+`GreaterEqual`                        | `T={double,float,int32,int64,uint32,uint64}`
+`HSVToRGB`                            | `T={double,float}`
+`IFFT`                                |
+`IFFT2D`                              |
+`IFFT3D`                              |
+`IRFFT`                               |
+`IRFFT2D`                             |
+`IRFFT3D`                             |
+`Identity`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`IdentityN`                           | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Imag`                                | `Tout={double,float}`<br>`T={complex64}`
+`Inv`                                 | `T={complex64,double,float,int32,int64}`
+`Invert`                              | `T={int32,int64,uint32,uint64}`
+`InvertPermutation`                   | `T={int32}`
+`IsFinite`                            | `T={double,float}`
+`IsInf`                               | `T={double,float}`
+`IsNan`                               | `T={double,float}`
+`L2Loss`                              | `T={double,float}`
+`LRN`                                 | `T={float}`
+`LRNGrad`                             | `T={float}`
+`LeftShift`                           | `T={int32,int64,uint32,uint64}`
+`Less`                                | `T={double,float,int32,int64,uint32,uint64}`
+`LessEqual`                           | `T={double,float,int32,int64,uint32,uint64}`
+`LinSpace`                            | `Tidx={int32,int64}`<br>`T={double,float}`
+`Log`                                 | `T={complex64,double,float}`
+`Log1p`                               | `T={complex64,double,float}`
+`LogSoftmax`                          | `T={double,float}`
+`LogicalAnd`                          |
+`LogicalNot`                          |
+`LogicalOr`                           |
+`MatMul`                              | `T={complex64,double,float}`
+`MatrixDiag`                          | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`MatrixDiagPart`                      | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`MatrixTriangularSolve`               | `T={complex64,double,float}`
+`Max`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`MaxPool`                             | `T={double,float,int32,int64}`
+`MaxPool3D`                           | `T={float}`
+`MaxPool3DGrad`                       | `TInput={float}`<br>`T={float}`
+`MaxPoolGrad`                         | `T={double,float,int32,int64,uint32,uint64}`
+`MaxPoolGradV2`                       | `T={double,float,int32,int64,uint32,uint64}`
+`MaxPoolV2`                           | `T={double,float,int32,int64}`
+`Maximum`                             | `T={double,float,int32,int64}`
+`Mean`                                | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`Min`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`Minimum`                             | `T={double,float,int32,int64}`
+`MirrorPad`                           | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Mod`                                 | `T={double,float,int32,int64}`
+`Mul`                                 | `T={complex64,double,float,int32,int64}`
+`Multinomial`                         | `output_dtype={int32,int64}`<br>`T={double,float,int32,int64,uint32,uint64}`
+`Neg`                                 | `T={complex64,double,float,int32,int64}`
+`NoOp`                                |
+`NotEqual`                            | `T={bool,complex64,double,float,int32,int64}`
+`OneHot`                              | `TI={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`OnesLike`                            | `T={bool,complex64,double,float,int32,int64}`
+`Pack`                                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Pad`                                 | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`PadV2`                               | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ParallelDynamicStitch`               | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Pow`                                 | `T={complex64,double,float,int32,int64}`
+`PreventGradient`                     | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Prod`                                | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`QuantizeAndDequantizeV2`             | `T={double,float}`
+`RFFT`                                |
+`RFFT2D`                              |
+`RFFT3D`                              |
+`RGBToHSV`                            | `T={double,float}`
+`Range`                               | `Tidx={double,float,int32,int64}`
+`Rank`                                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ReadVariableOp`                      | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Real`                                | `Tout={double,float}`<br>`T={complex64}`
+`RealDiv`                             | `T={complex64,double,float,int32,int64}`
+`Reciprocal`                          | `T={complex64,double,float,int32,int64}`
+`ReciprocalGrad`                      | `T={complex64,double,float}`
+`Relu`                                | `T={double,float,int32,int64,uint32,uint64}`
+`Relu6`                               | `T={double,float,int32,int64,uint32,uint64}`
+`Relu6Grad`                           | `T={double,float,int32,int64,uint32,uint64}`
+`ReluGrad`                            | `T={double,float,int32,int64,uint32,uint64}`
+`Reshape`                             | `Tshape={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ResizeBilinear`                      | `T={double,float,int32,int64}`
+`ResizeBilinearGrad`                  | `T={double,float}`
+`ResourceApplyAdagrad`                | `T={double,float}`
+`ResourceApplyAdam`                   | `T={double,float}`
+`ResourceApplyFtrl`                   | `T={double,float}`
+`ResourceApplyFtrlV2`                 | `T={double,float}`
+`ResourceApplyGradientDescent`        | `T={double,float}`
+`ResourceApplyMomentum`               | `T={double,float}`
+`ResourceApplyRMSProp`                | `T={double,float}`
+`ResourceGather`                      | `Tindices={int32,int64}`<br>`dtype={complex64,double,float,int32,int64,uint32,uint64}`
+`ResourceStridedSliceAssign`          | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Reverse`                             | `T={bool,complex64,double,float,int32,int64}`
+`ReverseSequence`                     | `Tlen={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ReverseV2`                           | `T={bool,complex64,double,float,int32,int64}`<br>`Tidx={int32,int64}`
+`RightShift`                          | `T={int32,int64,uint32,uint64}`
+`Rint`                                | `T={double,float}`
+`Round`                               | `T={complex64,double,float,int32,int64}`
+`Rsqrt`                               | `T={complex64,double,float}`
+`RsqrtGrad`                           | `T={complex64,double,float}`
+`Select`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Selu`                                | `T={double,float}`
+`SeluGrad`                            | `T={double,float}`
+`Shape`                               | `out_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ShapeN`                              | `out_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Sigmoid`                             | `T={complex64,double,float}`
+`SigmoidGrad`                         | `T={complex64,double,float}`
+`Sign`                                | `T={complex64,double,float,int32,int64}`
+`Sin`                                 | `T={complex64,double,float}`
+`Sinh`                                | `T={complex64,double,float}`
+`Size`                                | `out_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Slice`                               | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Softmax`                             | `T={double,float}`
+`SoftmaxCrossEntropyWithLogits`       | `T={double,float}`
+`Softplus`                            | `T={double,float,int32,int64,uint32,uint64}`
+`SoftplusGrad`                        | `T={double,float,int32,int64,uint32,uint64}`
+`Softsign`                            | `T={double,float,int32,int64,uint32,uint64}`
+`SoftsignGrad`                        | `T={double,float,int32,int64,uint32,uint64}`
+`SpaceToBatch`                        | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SpaceToBatchND`                      | `Tblock_shape={int32,int64}`<br>`Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SpaceToDepth`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SparseMatMul`                        | `Tb={float}`<br>`Ta={float}`
+`SparseSoftmaxCrossEntropyWithLogits` | `Tlabels={int32,int64}`<br>`T={double,float}`
+`Split`                               | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SplitV`                              | `Tlen={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Sqrt`                                | `T={complex64,double,float}`
+`SqrtGrad`                            | `T={complex64,double,float}`
+`Square`                              | `T={complex64,double,float,int32,int64}`
+`SquaredDifference`                   | `T={complex64,double,float,int32,int64}`
+`Squeeze`                             | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StackCloseV2`                        |
+`StackPopV2`                          | `elem_type={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StackPushV2`                         | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StackV2`                             | `elem_type={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StatelessRandomNormal`               | `Tseed={int32}`<br>`T={int32,int64}`<br>`dtype={float}`
+`StatelessRandomUniform`              | `Tseed={int32}`<br>`T={int32,int64}`<br>`dtype={float}`
+`StopGradient`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StridedSlice`                        | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StridedSliceGrad`                    | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Sub`                                 | `T={complex64,double,float,int32,int64}`
+`Sum`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`SymbolicGradient`                    | `Tout={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`Tin={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Tan`                                 | `T={complex64,double,float,int32,int64}`
+`Tanh`                                | `T={complex64,double,float}`
+`TanhGrad`                            | `T={complex64,double,float}`
+`TensorArrayCloseV3`                  |
+`TensorArrayConcatV3`                 | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayGatherV3`                 | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayGradV3`                   |
+`TensorArrayReadV3`                   | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayScatterV3`                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArraySizeV3`                   |
+`TensorArraySplitV3`                  | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayV3`                       | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayWriteV3`                  | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Tile`                                | `Tmultiples={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Transpose`                           | `Tperm={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TruncateDiv`                         | `T={complex64,double,float,int32,int64}`
+`TruncateMod`                         | `T={double,float,int32,int64}`
+`Unpack`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`UnsortedSegmentSum`                  | `Tnumsegments={int32,int64}`<br>`Tindices={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`VarIsInitializedOp`                  |
+`VariableShape`                       | `out_type={int32,int64}`
+`XlaWhile`                            | `T={bool,complex64,double,float,int32,int64,resource,uint32,uint64}`
+`ZerosLike`                           | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_Arg`                                | `T={bool,complex64,double,float,int32,int64,resource,uint32,uint64}`
+`_ArrayToList`                        | `out_types={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_ListToArray`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`Tin={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_Retval`                             | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_XLARecv`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_XLASend`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+
+To regenerate this table, run:
+
+```shell
+bazel run -c opt -- tensorflow/compiler/tf2xla:tf2xla_supported_ops --device=XLA_GPU_JIT
+```
diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc
index 8062f0c03ca60e88bd5c021092dceb105232219f..058a1f2621c64a735bd9d9c9d0ae007f93aa4dea 100644
--- a/tensorflow/compiler/tf2xla/graph_compiler.cc
+++ b/tensorflow/compiler/tf2xla/graph_compiler.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/graph_optimizer.h"
 #include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -59,9 +60,7 @@ Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph,
   for (int i = 0; i < args->size(); ++i) {
     XlaCompiler::Argument& arg = (*args)[i];
     arg.type = ctx->input_type(i);
-
-    TF_RETURN_IF_ERROR(
-        TensorShapeToXLAShape(arg.type, ctx->InputShape(i), &arg.shape));
+    arg.shape = ctx->InputShape(i);
 
     if (arg.type == DT_RESOURCE) {
       return errors::InvalidArgument(
@@ -135,7 +134,7 @@ Status GraphCompiler::Compile() {
       TF_RET_CHECK(src->id() < output_registry.size());
       const NodeOutputs& src_outputs = output_registry[src->id()];
 
-      tensor_inputs_[e->dst_input()] = src_outputs[e->src_output()];
+      tensor_inputs_.at(e->dst_input()) = src_outputs.at(e->src_output());
     }
 
     OpKernelContext op_context(&params, n->num_outputs());
@@ -144,7 +143,9 @@ Status GraphCompiler::Compile() {
     } else {
       device_->Compute(CHECK_NOTNULL(params.op_kernel), &op_context);
       Status s = op_context.status();
-      TF_RETURN_IF_ERROR(s);
+      if (!s.ok()) {
+        return AttachDef(s, n->def());
+      }
     }
 
     // Set up outputs. Also check if outputs from the previous computation is
diff --git a/tensorflow/compiler/tf2xla/graph_compiler.h b/tensorflow/compiler/tf2xla/graph_compiler.h
index ba00160b6d78c1e55cc2e053cd5285344e0179fb..127562eb23d775f17179cc9ee968ec2255cf3a14 100644
--- a/tensorflow/compiler/tf2xla/graph_compiler.h
+++ b/tensorflow/compiler/tf2xla/graph_compiler.h
@@ -70,7 +70,7 @@ class GraphCompiler {
 
  private:
   // Partially sets params. This partially set params can be reused
-  // across multple nodes visit.
+  // across multiple nodes visit.
   void PartiallySetupParams(OpKernelContext::Params* params);
 
   // Tests if a node is a functional node. A functional node represents a
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 6302fece1ffb27b6c7170fcfb90f5985f5b50659..d2fa933cf9c085f92b2f442827a94d72938e4bb2 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -4,6 +4,7 @@ package(
     default_visibility = ["//tensorflow/compiler/tf2xla:internal"],
 )
 
+load("//tensorflow:tensorflow.bzl", "tf_copts")
 load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
 
 tf_kernel_library(
@@ -30,15 +31,23 @@ tf_kernel_library(
         "diag_op.cc",
         "dynamic_stitch_op.cc",
         "elu_op.cc",
+        "extract_image_patches_op.cc",
+        "fake_quantize_ops.cc",
+        "fft_ops.cc",
         "fill_op.cc",
         "function_ops.cc",
         "gather_op.cc",
         "gather_op_helpers.h",
         "identity_op.cc",
+        "image_ops.cc",
+        "image_resize_ops.cc",
         "index_ops.cc",
         "l2loss_op.cc",
         "lrn_ops.cc",
         "matmul_op.cc",
+        "matrix_band_part_op.cc",
+        "matrix_set_diag_op.cc",
+        "matrix_triangular_solve_op.cc",
         "mirror_pad_op.cc",
         "no_op.cc",
         "one_hot_op.cc",
@@ -54,11 +63,15 @@ tf_kernel_library(
         "reshape_op.cc",
         "retval_op.cc",
         "reverse_op.cc",
+        "reverse_sequence_op.cc",
+        "scan_ops.cc",
+        "scatter_nd_op.cc",
         "segment_reduction_ops.cc",
         "select_op.cc",
         "sendrecv_ops.cc",
         "sequence_ops.cc",
         "shape_op.cc",
+        "shape_util.cc",
         "slice_op.cc",
         "softmax_op.cc",
         "spacetobatch_op.cc",
@@ -76,8 +89,8 @@ tf_kernel_library(
         "variable_ops.cc",
     ],
     hdrs = [
-        "gather_op.h",
         "index_ops.h",
+        "shape_util.h",
     ],
     deps = [
         ":while_op",
@@ -85,18 +98,26 @@ tf_kernel_library(
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/lib:batch_dot",
         "//tensorflow/compiler/tf2xla/lib:cholesky",
+        "//tensorflow/compiler/tf2xla/lib:scatter",
+        "//tensorflow/compiler/tf2xla/lib:triangular_solve",
+        "//tensorflow/compiler/tf2xla/lib:util",
+        "//tensorflow/compiler/tf2xla/lib:while_loop",
         "//tensorflow/compiler/tf2xla/ops:sendrecv_ops",
+        "//tensorflow/compiler/xla:array4d",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/core:framework",
+        "//tensorflow/core:image_ops_op_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:linalg_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:spectral_ops_op_lib",
         "//tensorflow/core:stateless_random_ops_op_lib",
         "//tensorflow/core/kernels:bounds_check",
         "//tensorflow/core/kernels:concat_lib",
@@ -157,6 +178,7 @@ tf_kernel_library(
 cc_library(
     name = "index_ops_kernel_argmax_float_1d",
     srcs = ["index_ops_kernel_argmax_float_1d.cc"],
+    copts = tf_copts(),
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
@@ -169,6 +191,7 @@ cc_library(
 cc_library(
     name = "index_ops_kernel_argmax_float_2d",
     srcs = ["index_ops_kernel_argmax_float_2d.cc"],
+    copts = tf_copts(),
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
index a015b8e0e8949f8aaa03a78b0f88b7ea8d6aaa1c..b0ba25b9983c3a9af26728ce4b1c263c844327db 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
@@ -28,8 +28,9 @@ class BatchMatMulOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    auto result =
-        BatchDot(ctx->builder(), ctx->Input(0), ctx->Input(1), adj_x_, adj_y_);
+    auto result = BatchDot(ctx->builder(), ctx->Input(0), ctx->Input(1),
+                           /*transpose_x=*/adj_x_, /*transpose_y=*/adj_y_,
+                           /*conjugate_x=*/adj_x_, /*conjugate_y=*/adj_y_);
     OP_REQUIRES_OK(ctx, result.status());
     ctx->SetOutput(0, result.ValueOrDie());
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
index 248e9d111e556dcdd75581aa6562a66fc8b57063..a249b1869f547f8e5aa725f9f5cf391b10429928 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 // XLA implementation of BatchNorm operations.
-#include "tensorflow/compiler/tf2xla/literal_util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
@@ -26,43 +26,63 @@ namespace {
 class FusedBatchNormOp : public XlaOpKernel {
  public:
   explicit FusedBatchNormOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-    string data_format;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("epsilon", &epsilon_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("is_training", &is_training_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format));
-    TensorFormat tensor_format;
-    if (ctx->GetAttr("data_format", &data_format).ok()) {
-      OP_REQUIRES(ctx, FormatFromString(data_format, &tensor_format),
-                  errors::InvalidArgument("Invalid data format"));
-      OP_REQUIRES(
-          ctx, (tensor_format == FORMAT_NHWC || tensor_format == FORMAT_NCHW),
-          errors::InvalidArgument("Not supported format"));
-      feature_index_ = GetTensorFeatureDimIndex(/*num_dims=*/4, tensor_format);
-    }
+    string data_format_str;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES(
+        ctx, FormatFromString(data_format_str, &data_format_),
+        errors::InvalidArgument("Invalid data format: ", data_format_str));
+    OP_REQUIRES(ctx,
+                (data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW),
+                errors::InvalidArgument(
+                    "Unsupported data format ", ToString(data_format_),
+                    "; supported formats are NHWC and NCHW"));
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
+    xla::PrimitiveType input_type;
+    OP_REQUIRES_OK(ctx,
+                   DataTypeToPrimitiveType(ctx->input_type(0), &input_type));
+    xla::PrimitiveType scale_type;
+    OP_REQUIRES_OK(ctx,
+                   DataTypeToPrimitiveType(ctx->input_type(1), &scale_type));
+
+    xla::ComputationBuilder* builder = ctx->builder();
+
+    xla::ComputationDataHandle input = ctx->Input(0);
+    TensorShape input_shape = ctx->InputShape(0);
+
+    int feature_index =
+        GetTensorFeatureDimIndex(input_shape.dims(), data_format_);
+
+    // TODO(b/69928690): support mixed precision in the XLA batch normalization
+    // operators. As a workaround, cast everything to the statistics type (which
+    // may be more precise than the input type).
+    input = builder->ConvertElementType(input, scale_type);
+
     if (is_training_) {
-      xla::ComputationDataHandle output = ctx->builder()->BatchNormTraining(
-          ctx->Input(0), ctx->Input(1), ctx->Input(2), epsilon_,
-          feature_index_);
+      xla::ComputationDataHandle output = builder->BatchNormTraining(
+          input, ctx->Input(1), ctx->Input(2), epsilon_, feature_index);
 
       // In training mode, outputs the normalized value as well as the
       // calculated mean and variance.
-      for (int i = 0; i < 3; i++) {
-        ctx->SetOutput(i, ctx->builder()->GetTupleElement(output, i));
-      }
+      ctx->SetOutput(0, builder->ConvertElementType(
+                            builder->GetTupleElement(output, 0), input_type));
+      ctx->SetOutput(1, builder->GetTupleElement(output, 1));
+      ctx->SetOutput(2, builder->GetTupleElement(output, 2));
+
       // Output 3 and 4 for "FusedBatchNorm" are currently marked as "reserved
       // space 1 & 2". They are used to pass the per-batch mean and
       // variance to the gradient. Here we maintain the same behavior by setting
       // them to the mean and variance calculated by BatchNormTraining.
-      ctx->SetOutput(3, ctx->builder()->GetTupleElement(output, 1));
-      ctx->SetOutput(4, ctx->builder()->GetTupleElement(output, 2));
+      ctx->SetOutput(3, builder->GetTupleElement(output, 1));
+      ctx->SetOutput(4, builder->GetTupleElement(output, 2));
     } else {
-      xla::ComputationDataHandle output = ctx->builder()->BatchNormInference(
-          ctx->Input(0), ctx->Input(1), ctx->Input(2), ctx->Input(3),
-          ctx->Input(4), epsilon_, feature_index_);
-      ctx->SetOutput(0, output);
+      xla::ComputationDataHandle output = builder->BatchNormInference(
+          input, ctx->Input(1), ctx->Input(2), ctx->Input(3), ctx->Input(4),
+          epsilon_, feature_index);
+      ctx->SetOutput(0, builder->ConvertElementType(output, input_type));
       // Directly send input to output as mean and variance in inference mode.
       ctx->SetOutput(1, ctx->Input(3));
       ctx->SetOutput(2, ctx->Input(4));
@@ -73,55 +93,113 @@ class FusedBatchNormOp : public XlaOpKernel {
 
  private:
   float epsilon_;
-  int64 feature_index_;
+  TensorFormat data_format_;
   bool is_training_;
 };
 
 REGISTER_XLA_OP(Name("FusedBatchNorm"), FusedBatchNormOp);
+REGISTER_XLA_OP(Name("FusedBatchNormV2"), FusedBatchNormOp);
 
 class FusedBatchNormGradOp : public XlaOpKernel {
  public:
   explicit FusedBatchNormGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-    string data_format;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("epsilon", &epsilon_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format));
-    bool is_training;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("is_training", &is_training));
-    CHECK(is_training) << "FusedBatchNormGradOp with is_training=False cannot "
-                          "be used with XLA for now!";
-    TensorFormat tensor_format;
-    if (ctx->GetAttr("data_format", &data_format).ok()) {
-      OP_REQUIRES(ctx, FormatFromString(data_format, &tensor_format),
-                  errors::InvalidArgument("Invalid data format"));
-      OP_REQUIRES(
-          ctx, (tensor_format == FORMAT_NHWC || tensor_format == FORMAT_NCHW),
-          errors::InvalidArgument("Not supported format"));
-      feature_index_ = GetTensorFeatureDimIndex(4, tensor_format);
-    }
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("is_training", &is_training_));
+    string data_format_str;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES(
+        ctx, FormatFromString(data_format_str, &data_format_),
+        errors::InvalidArgument("Invalid data format: ", data_format_str));
+    OP_REQUIRES(ctx,
+                (data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW),
+                errors::InvalidArgument(
+                    "Unsupported data format ", ToString(data_format_),
+                    "; supported formats are NHWC and NCHW"));
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    auto grad_output = ctx->Input(0);
-    auto activation = ctx->Input(1);
+    xla::ComputationBuilder* b = ctx->builder();
+
+    auto grad_backprop = ctx->Input(0);
+    auto activations = ctx->Input(1);
     auto scale = ctx->Input(2);
     auto mean = ctx->Input(3);
     auto var = ctx->Input(4);
-    xla::ComputationDataHandle output = ctx->builder()->BatchNormGrad(
-        activation, scale, mean, var, grad_output, epsilon_, feature_index_);
 
-    for (int i = 0; i < 3; i++) {
-      ctx->SetOutput(i, ctx->builder()->GetTupleElement(output, i));
+    TensorShape input_shape = ctx->InputShape(0);
+    int feature_index =
+        GetTensorFeatureDimIndex(input_shape.dims(), data_format_);
+
+    DataType input_dtype = ctx->input_type(0);
+    DataType scale_dtype = ctx->input_type(2);
+    xla::PrimitiveType input_type;
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(input_dtype, &input_type));
+    xla::PrimitiveType scale_type;
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(scale_dtype, &scale_type));
+
+    // TODO(b/69928690): support mixed precision in the XLA batch normalization
+    // operators. For now, cast everything to the statistics type (which
+    // may be more precise than the input type).
+    grad_backprop = b->ConvertElementType(grad_backprop, scale_type);
+    activations = b->ConvertElementType(activations, scale_type);
+
+    xla::ComputationDataHandle x_backprop;
+    xla::ComputationDataHandle scale_backprop;
+    xla::ComputationDataHandle offset_backprop;
+    if (is_training_) {
+      xla::ComputationDataHandle output =
+          b->BatchNormGrad(activations, scale, mean, var, grad_backprop,
+                           epsilon_, feature_index);
+
+      x_backprop = b->GetTupleElement(output, 0);
+      scale_backprop = b->GetTupleElement(output, 1);
+      offset_backprop = b->GetTupleElement(output, 2);
+    } else {
+      // Reduce over all dimensions except the feature dim.
+      std::vector<int64> reduction_dims(input_shape.dims() - 1);
+      std::iota(reduction_dims.begin(), reduction_dims.begin() + feature_index,
+                0);
+      std::iota(reduction_dims.begin() + feature_index, reduction_dims.end(),
+                feature_index + 1);
+      // offset_backprop  = sum(y_backprop)
+      // scale_backprop = y_backprop * ((x - pop_mean) * rsqrt(pop_var +
+      // epsilon))
+      // x_backprop = y_backprop * (scale * rsqrt(pop_var + epsilon))
+      offset_backprop =
+          b->Reduce(grad_backprop, XlaHelpers::Zero(b, scale_dtype),
+                    *ctx->GetOrCreateAdd(scale_dtype), reduction_dims);
+
+      // scratch1 = rsqrt(pop_var + epsilon)
+      auto neg_half = XlaHelpers::FloatLiteral(b, scale_dtype, -0.5);
+      auto scratch1 =
+          b->Pow(b->Add(var, b->ConstantR0<float>(epsilon_)), neg_half);
+
+      // scratch2 = sum(y_backprop * (x - mean))
+      auto scratch2 = b->Reduce(
+          b->Mul(grad_backprop, b->Sub(activations, mean, {feature_index})),
+          XlaHelpers::Zero(b, scale_dtype), *ctx->GetOrCreateAdd(scale_dtype),
+          reduction_dims);
+
+      x_backprop =
+          b->Mul(grad_backprop, b->Mul(scratch1, scale), {feature_index});
+      scale_backprop = b->Mul(scratch1, scratch2);
     }
-    ctx->SetOutput(3, ctx->builder()->GetTupleElement(output, 1));
-    ctx->SetOutput(4, ctx->builder()->GetTupleElement(output, 2));
+
+    ctx->SetOutput(0, b->ConvertElementType(x_backprop, input_type));
+    ctx->SetOutput(1, scale_backprop);
+    ctx->SetOutput(2, offset_backprop);
+    ctx->SetConstantOutput(3, Tensor(scale_dtype, {}));
+    ctx->SetConstantOutput(4, Tensor(scale_dtype, {}));
   }
 
  private:
+  TensorFormat data_format_;
   float epsilon_;
-  int64 feature_index_;
+  bool is_training_;
 };
 
 REGISTER_XLA_OP(Name("FusedBatchNormGrad"), FusedBatchNormGradOp);
+REGISTER_XLA_OP(Name("FusedBatchNormGradV2"), FusedBatchNormGradOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
index 21d3e64872e19109852297838043975cea6d7921..344a2ab2b6835c518c41de6f7a30fb2a34d130d2 100644
--- a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
@@ -159,7 +159,8 @@ class BatchToSpaceNDOp : public XlaOpKernel {
                  block_shape, crops);
   }
 };
-REGISTER_XLA_OP(Name("BatchToSpaceND"), BatchToSpaceNDOp);
+REGISTER_XLA_OP(Name("BatchToSpaceND").CompileTimeConstInput("crops"),
+                BatchToSpaceNDOp);
 
 class BatchToSpaceOp : public XlaOpKernel {
  public:
@@ -181,7 +182,10 @@ class BatchToSpaceOp : public XlaOpKernel {
  private:
   int block_size_;
 };
-REGISTER_XLA_OP(Name("BatchToSpace"), BatchToSpaceOp);
+REGISTER_XLA_OP(Name("BatchToSpace")
+                    .CompileTimeConstInput("crops")
+                    .CompileTimeConstInput("block_shape"),
+                BatchToSpaceOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
index bb031b8c471e08ba90c554e309b850a26c3edae0..ee2c920453c3bbaef2c145df743fddf999167c39 100644
--- a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
@@ -65,7 +65,10 @@ class BCastArgsOp : public XlaOpKernel {
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(BCastArgsOp);
 };
-REGISTER_XLA_OP(Name("BroadcastArgs"), BCastArgsOp);
+REGISTER_XLA_OP(Name("BroadcastArgs")
+                    .CompileTimeConstInput("s0")
+                    .CompileTimeConstInput("s1"),
+                BCastArgsOp);
 
 // Given shapes of two tensors, computes the reduction indices for the
 // gradient computation.
@@ -121,7 +124,10 @@ class BCastGradArgsOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(BCastGradArgsOp);
 };
 
-REGISTER_XLA_OP(Name("BroadcastGradientArgs"), BCastGradArgsOp);
+REGISTER_XLA_OP(Name("BroadcastGradientArgs")
+                    .CompileTimeConstInput("s0")
+                    .CompileTimeConstInput("s1"),
+                BCastGradArgsOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
index 1de91924326464338352b1ac9edf77141f25ad35..2436a6074a11ad66387b232dd1c5aa135875bfc3 100644
--- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
 
 namespace tensorflow {
 namespace {
@@ -75,7 +76,7 @@ static xla::ComputationDataHandle FloorDivImpl(xla::ComputationBuilder* b,
   auto abs_y = b->Abs(y);
   auto t = b->Neg(b->Sub(b->Add(abs_x, abs_y), one));
   auto result = b->Select(different_sign, b->Div(t, abs_y), b->Div(x, y));
-  if (dtype == DT_FLOAT || dtype == DT_DOUBLE) {
+  if (DataTypeIsFloating(dtype)) {
     result = b->Floor(result);
   }
   return result;
diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
index 592f3ecc3ce2abf33ddffe8b0e59c4e12e73e956..545aa364f937b2dc972dbe7b8c18b5897aa8e5c3 100644
--- a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
@@ -92,7 +92,8 @@ class CategoricalOp : public XlaOpKernel {
 };
 
 // TODO(b/68769717): Rename this sampler to Categorical.
-REGISTER_XLA_OP(Name("Multinomial"), CategoricalOp);
+REGISTER_XLA_OP(Name("Multinomial").CompileTimeConstInput("num_samples"),
+                CategoricalOp);
 
 }  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc b/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc
index 87d858f763560be454c162e0cf40307c68217663..fe6651793dc763d13f4a4b0ac294ec3ecf64af8f 100644
--- a/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc
@@ -33,7 +33,7 @@ class CholeskyOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("Cholesky"), CholeskyOp);
+REGISTER_XLA_OP(Name("Cholesky").TypeConstraint("T", kFloatTypes), CholeskyOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/concat_op.cc b/tensorflow/compiler/tf2xla/kernels/concat_op.cc
index 73a4740e29af7fa57e71ef42a342f46b0e24231d..1a246e8df9b2cd83147b50d960744332f8582a51 100644
--- a/tensorflow/compiler/tf2xla/kernels/concat_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/concat_op.cc
@@ -84,8 +84,8 @@ class ConcatBaseOp : public XlaOpKernel {
           in_shape.dims() == input_dims || (input_is_scalar && in_is_scalar),
           errors::InvalidArgument(
               "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
-              input_shape.DebugString(), " vs. shape[", i, "] = ",
-              in_shape.DebugString()));
+              input_shape.DebugString(), " vs. shape[", i,
+              "] = ", in_shape.DebugString()));
       if (in_shape.dims() == 0) {
         // Inputs that come in as scalars must be reshaped to 1-vectors.
         input_data.push_back(ctx->builder()->Reshape(handle, {1}));
@@ -117,8 +117,11 @@ class ConcatV2Op : public ConcatBaseOp {
       : ConcatBaseOp(c, /* axis_index */ c->num_inputs() - 1) {}
 };
 
-REGISTER_XLA_OP(Name("Concat"), ConcatOp);
-REGISTER_XLA_OP(Name("ConcatV2").TypeConstraint("Tidx", DT_INT32), ConcatV2Op);
+REGISTER_XLA_OP(Name("Concat").CompileTimeConstInput("concat_dim"), ConcatOp);
+REGISTER_XLA_OP(Name("ConcatV2")
+                    .TypeConstraint("Tidx", DT_INT32)
+                    .CompileTimeConstInput("axis"),
+                ConcatV2Op);
 
 class ConcatOffsetOp : public XlaOpKernel {
  public:
@@ -189,10 +192,10 @@ class ConcatOffsetOp : public XlaOpKernel {
         } else {
           const int32 inp0_element = inp0_literal.Get<int>({j});
           const int32 inp_element = inp_literal.Get<int>({j});
-          OP_REQUIRES(
-              ctx, (inp0_element == inp_element),
-              errors::InvalidArgument("input[", i, ",", j, "] mismatch: ",
-                                      inp0_element, " vs. ", inp_element));
+          OP_REQUIRES(ctx, (inp0_element == inp_element),
+                      errors::InvalidArgument("input[", i, ",", j,
+                                              "] mismatch: ", inp0_element,
+                                              " vs. ", inp_element));
           out_vec(j) = 0;
         }
       }
@@ -202,7 +205,10 @@ class ConcatOffsetOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("ConcatOffset"), ConcatOffsetOp);
+REGISTER_XLA_OP(Name("ConcatOffset")
+                    .CompileTimeConstInput("concat_dim")
+                    .CompileTimeConstInput("shape"),
+                ConcatOffsetOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
index c5017704e2a45b0bd740f7a8fdcf3a0be1d445a4..81cea6d376d02c956a5257c5475fe5c10b83deb9 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
@@ -46,72 +46,130 @@ TensorShape ExpandedFilterShapeForDepthwiseConvolution(
   return expanded_shape;
 }
 
+// Broadcast zeros to ExpandedFilterShapeForDepthwiseConvolution.
+xla::ComputationDataHandle CreateExpandedZero(
+    const TensorShape& filter_shape, DataType dtype,
+    xla::ComputationBuilder* builder) {
+  TensorShape expanded_filter_shape =
+      ExpandedFilterShapeForDepthwiseConvolution(filter_shape);
+  return builder->Broadcast(XlaHelpers::Zero(builder, dtype),
+                            expanded_filter_shape.dim_sizes());
+}
+
+// Create a mask for depthwise convolution that will make a normal convolution
+// produce the same results as a depthwise convolution. For a [2, 2, 3, 2]
+// depthwise filter this returns a [2, 2, 3, 6] tesnsor
+//   1 1 0 0 0 0   1 1 0 0 0 0
+//   0 0 1 1 0 0   0 0 1 1 0 0
+//   0 0 0 0 1 1   0 0 0 0 1 1
+//
+//   1 1 0 0 0 0   1 1 0 0 0 0
+//   0 0 1 1 0 0   0 0 1 1 0 0
+//   0 0 0 0 1 1   0 0 0 0 1 1
+//
+// The first step is to create a one tensor, A, that is [3]
+//   0 1 2
+//
+// and another tensor, B,  that is [3 * 2]
+//   0 1 2 3 4 5
+//
+// and divide B it by 2 to get
+//   0 0 1 1 2 2
+//
+// then we broadcast the B to [2, 2, 3, 3 * 2]
+//   0 0 1 1 2 2   0 0 1 1 2 2
+//   0 0 1 1 2 2   0 0 1 1 2 2
+//   0 0 1 1 2 2   0 0 1 1 2 2
+//
+//   0 0 1 1 2 2   0 0 1 1 2 2
+//   0 0 1 1 2 2   0 0 1 1 2 2
+//   0 0 1 1 2 2   0 0 1 1 2 2
+//
+// Finally compare A and broadcasted B in dimension 2 amd return the result at
+// the beginning of the comment.
+xla::ComputationDataHandle CreateExpandedFilterMask(
+    const TensorShape& filter_shape, xla::ComputationBuilder* builder) {
+  TensorShape expanded_filter_shape =
+      ExpandedFilterShapeForDepthwiseConvolution(filter_shape);
+  int64 depthwise_multiplier = filter_shape.dim_size(filter_shape.dims() - 1);
+  int64 input_feature = filter_shape.dim_size(filter_shape.dims() - 2);
+
+  // Create a M sized linspace and an M*N sized linspace that will be
+  // broadcasted into perpendicular dimensions and compared.
+  xla::ComputationDataHandle input_feature_iota;
+  // DT_INT32 Iota will always return status::OK().
+  TF_CHECK_OK(XlaHelpers::Iota(builder, DataType::DT_INT32, input_feature,
+                               &input_feature_iota));
+  xla::ComputationDataHandle expanded_feature_iota;
+  TF_CHECK_OK(XlaHelpers::Iota(builder, DataType::DT_INT32,
+                               input_feature * depthwise_multiplier,
+                               &expanded_feature_iota));
+
+  // Divide the M*N sized linspace by the depthwise_multiplier to create
+  // [0 0 1 1 2 2] in the example in the function comment.
+  expanded_feature_iota =
+      builder->Div(expanded_feature_iota,
+                   XlaHelpers::IntegerLiteral(builder, DataType::DT_INT32,
+                                              depthwise_multiplier));
+
+  // Broadcast the N*M linspace to [H, W, ..., M, M*N].
+  auto expanded_feature_broadcast_dims = expanded_filter_shape.dim_sizes();
+  expanded_feature_broadcast_dims.pop_back();
+  auto broadcasted_expanded_feature_iota = builder->Broadcast(
+      expanded_feature_iota, expanded_feature_broadcast_dims);
+
+  // Compare the broadcasted linspace to the input feature linspace in the
+  // input feature dimension to create a diagonal predicate.
+  return builder->Eq(broadcasted_expanded_feature_iota, input_feature_iota,
+                     {expanded_filter_shape.dims() - 2});
+}
+
 // Expands a filter of shape [H, W, ..., M, N] to [H, W, ..., M, M*N] by adding
 // zeros for the cross-depth filters. Used to build a depthwise convolution.
 xla::ComputationDataHandle ExpandFilterForDepthwiseConvolution(
     const TensorShape& filter_shape, DataType dtype,
     const xla::ComputationDataHandle& filter,
     xla::ComputationBuilder* builder) {
-  // Filter has shape [H, W, ..., M, N]
-  // Dilate to [H, W, ..., M*M, N] using M inter-element padding, and then
-  // reshape to [H, W, ..., M, M*N].
-  int num_spatial_dims = filter_shape.dims() - 2;
-  const int64 in_depth = filter_shape.dim_size(num_spatial_dims);
-  xla::PaddingConfig padding = xla::MakeNoPaddingConfig(filter_shape.dims());
-  padding.mutable_dimensions(num_spatial_dims)->set_interior_padding(in_depth);
-  auto dilated_filter =
-      builder->Pad(filter, XlaHelpers::Zero(builder, dtype), padding);
-
+  int64 depthwise_multiplier = filter_shape.dim_size(filter_shape.dims() - 1);
+  int64 input_feature = filter_shape.dim_size(filter_shape.dims() - 2);
   TensorShape expanded_filter_shape =
       ExpandedFilterShapeForDepthwiseConvolution(filter_shape);
-  return builder->Reshape(dilated_filter, expanded_filter_shape.dim_sizes());
+
+  // Create a [H, W, ..., 1, N*M] reshape of the filter.
+  TensorShape implicit_broadcast_filter_shape = expanded_filter_shape;
+  implicit_broadcast_filter_shape.set_dim(
+      implicit_broadcast_filter_shape.dims() - 2, 1);
+  implicit_broadcast_filter_shape.set_dim(
+      implicit_broadcast_filter_shape.dims() - 1,
+      depthwise_multiplier * input_feature);
+  auto implicit_broadcast_filter =
+      builder->Reshape(filter, implicit_broadcast_filter_shape.dim_sizes());
+
+  // Broadcast the filter to  [H, W, ..., M, M*N].
+  auto expanded_zero = CreateExpandedZero(filter_shape, dtype, builder);
+  auto expanded_filter = builder->Add(implicit_broadcast_filter, expanded_zero);
+
+  // If the filter mask is set, choose the broadcasted filter, othwerwise,
+  // choose zero.
+  return builder->Select(CreateExpandedFilterMask(filter_shape, builder),
+                         expanded_filter, expanded_zero);
 }
 
 // Inverse of ExpandFilterForDepthwiseConvolution.
 xla::ComputationDataHandle ContractFilterForDepthwiseBackprop(
-    const TensorShape& filter_shape, DataType dtype,
+    XlaOpKernelContext* ctx, const TensorShape& filter_shape, DataType dtype,
     const xla::ComputationDataHandle& filter_backprop,
     xla::ComputationBuilder* builder) {
-  int num_spatial_dims = filter_shape.dims() - 2;
-
-  // Reshape to [H, W, ..., M*M, N]
-  TensorShape shape = filter_shape;
-  int64 in_depth = filter_shape.dim_size(num_spatial_dims);
-  shape.set_dim(num_spatial_dims, in_depth * in_depth);
-  auto reshaped = builder->Reshape(filter_backprop, shape.dim_sizes());
-
-  std::vector<int64> zeros(filter_shape.dims());
-  std::vector<int64> strides(filter_shape.dims(), 1LL);
-  strides[num_spatial_dims] = in_depth + 1;
-  return builder->Slice(reshaped, zeros, shape.dim_sizes(), strides);
-
-  // Alternate implementation for backends without strided Slice() support.
-  // TODO(phawkins): Remove when all backends support strided slice.
-  //   // Pad [..., M * (M + 1), N]
-  //   xla::PaddingConfig config =
-  //   xla::MakeNoPaddingConfig(filter_shape.dims());
-  //   config.mutable_dimensions(num_spatial_dims)
-  //     ->set_edge_padding_high(in_depth);
-  //   auto zero = XlaHelpers::Zero(builder, dtype);
-  //   auto padded = builder->Pad(reshaped, zero, config);
-  //
-  //   // Reshape to [..., M, M + 1, N]
-  //   shape = filter_shape;
-  //   shape.set_dim(num_spatial_dims, in_depth);
-  //   shape.set_dim(num_spatial_dims + 1, in_depth + 1);
-  //   int64 out_depth = filter_shape.dim_size(num_spatial_dims + 1);
-  //   shape.AddDim(out_depth);
-  //   reshaped = builder->Reshape(padded, shape.dim_sizes());
-  //
-  //   // Slice to [..., M, 1, N]
-  //   std::vector<int64> zeros(shape.dims());
-  //   std::vector<int64> strides(shape.dims(), 1LL);
-  //   shape.set_dim(num_spatial_dims + 1, 1);
-  //   auto sliced = builder->Slice(reshaped, zeros, shape.dim_sizes(),
-  //   strides);
-  //
-  //   // Reshape to [..., M, N]
-  //   return builder->Reshape(sliced, filter_shape.dim_sizes());
+  TensorShape expanded_filter_shape =
+      ExpandedFilterShapeForDepthwiseConvolution(filter_shape);
+  auto masked_expanded_filter = builder->Select(
+      CreateExpandedFilterMask(filter_shape, builder), filter_backprop,
+      CreateExpandedZero(filter_shape, dtype, builder));
+  return builder->Reshape(
+      builder->Reduce(masked_expanded_filter, XlaHelpers::Zero(builder, dtype),
+                      *ctx->GetOrCreateAdd(dtype),
+                      {expanded_filter_shape.dims() - 2}),
+      filter_shape.dim_sizes());
 }
 
 class ConvOp : public XlaOpKernel {
@@ -121,6 +179,7 @@ class ConvOp : public XlaOpKernel {
       : XlaOpKernel(ctx),
         num_spatial_dims_(num_spatial_dims),
         depthwise_(depthwise) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dilations", &dilations_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &strides_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_));
 
@@ -144,6 +203,22 @@ class ConvOp : public XlaOpKernel {
         errors::Unimplemented("Current implementation does not yet support "
                               "strides in the batch and depth dimensions."));
 
+    OP_REQUIRES(ctx, dilations_.size() == num_dims(),
+                errors::InvalidArgument("Dilations field must "
+                                        "specify ",
+                                        num_dims(), " dimensions"));
+    OP_REQUIRES(
+        ctx, dilations_[batch_dim] == 1 && dilations_[feature_dim] == 1,
+        errors::Unimplemented("Current implementation does not support "
+                              "dilations in the batch and depth dimensions."));
+    for (int i = 0; i < num_spatial_dims_; ++i) {
+      int input_dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
+      OP_REQUIRES(ctx, dilations_[input_dim] >= 1,
+                  errors::Unimplemented("Dilation values must be positive; ", i,
+                                        "th spatial dimension had dilation ",
+                                        dilations_[input_dim]));
+    }
+
     const TensorShape input_shape = ctx->InputShape(0);
     // Input filter is of the following dimensions:
     // [ filter_rows, filter_cols, ..., in_depth, out_depth]
@@ -172,38 +247,53 @@ class ConvOp : public XlaOpKernel {
     xla::ComputationBuilder* b = ctx->builder();
 
     xla::ComputationDataHandle filter = ctx->Input(1);
+    TensorShape expanded_filter_shape = filter_shape;
     if (depthwise_) {
       filter = ExpandFilterForDepthwiseConvolution(
           filter_shape, ctx->input_type(0), filter, b);
+      expanded_filter_shape =
+          ExpandedFilterShapeForDepthwiseConvolution(filter_shape);
     }
 
     xla::ConvolutionDimensionNumbers dims;
-    std::vector<int64> window_strides;
+    std::vector<int64> window_strides(num_spatial_dims_);
+    std::vector<int64> lhs_dilation(num_spatial_dims_, 1);
+    std::vector<int64> rhs_dilation(num_spatial_dims_);
+    std::vector<std::pair<int64, int64>> padding(num_spatial_dims_);
+
     dims.set_input_batch_dimension(batch_dim);
     dims.set_output_batch_dimension(batch_dim);
     dims.set_input_feature_dimension(feature_dim);
     dims.set_output_feature_dimension(feature_dim);
+    dims.set_kernel_input_feature_dimension(num_spatial_dims_);
+    dims.set_kernel_output_feature_dimension(num_spatial_dims_ + 1);
+
     for (int i = 0; i < num_spatial_dims_; ++i) {
-      int64 dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
+      const int64 dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
       dims.add_input_spatial_dimensions(dim);
       dims.add_kernel_spatial_dimensions(i);
       dims.add_output_spatial_dimensions(dim);
-      window_strides.push_back(strides_.at(dim));
+      window_strides[i] = strides_.at(dim);
+      rhs_dilation[i] = dilations_.at(dim);
+
+      int64 unused_output_size;
+      OP_REQUIRES_OK(
+          ctx, GetWindowedOutputSizeVerboseV2(
+                   input_shape.dim_size(dim), expanded_filter_shape.dim_size(i),
+                   rhs_dilation[i], window_strides[i], padding_,
+                   &unused_output_size, &padding[i].first, &padding[i].second));
     }
-    dims.set_kernel_input_feature_dimension(num_spatial_dims_);
-    dims.set_kernel_output_feature_dimension(num_spatial_dims_ + 1);
 
-    xla::Padding xla_padding =
-        (padding_ == VALID) ? xla::Padding::kValid : xla::Padding::kSame;
-
-    xla::ComputationDataHandle conv = b->ConvWithGeneralDimensions(
-        ctx->Input(0), filter, window_strides, xla_padding, dims);
+    xla::ComputationDataHandle conv =
+        b->ConvGeneralDilated(ctx->Input(0), filter, window_strides, padding,
+                              lhs_dilation, rhs_dilation, dims);
     ctx->SetOutput(0, conv);
   }
 
  protected:
   const int num_spatial_dims_;
   const bool depthwise_;
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_ = FORMAT_NHWC;
@@ -241,6 +331,7 @@ class ConvBackpropInputOp : public XlaOpKernel {
       : XlaOpKernel(ctx),
         num_spatial_dims_(num_spatial_dims),
         depthwise_(depthwise) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dilations", &dilations_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &strides_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_));
     string data_format;
@@ -263,6 +354,22 @@ class ConvBackpropInputOp : public XlaOpKernel {
         errors::Unimplemented("Current implementation does not yet support "
                               "strides in the batch and depth dimensions."));
 
+    OP_REQUIRES(ctx, dilations_.size() == num_dims(),
+                errors::InvalidArgument("Dilations field must "
+                                        "specify ",
+                                        num_dims(), " dimensions"));
+    OP_REQUIRES(
+        ctx, dilations_[batch_dim] == 1 && dilations_[feature_dim] == 1,
+        errors::Unimplemented("Current implementation does not support "
+                              "dilations in the batch and depth dimensions."));
+    for (int i = 0; i < num_spatial_dims_; ++i) {
+      int input_dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
+      OP_REQUIRES(ctx, dilations_[input_dim] >= 1,
+                  errors::Unimplemented("Dilation values must be positive; ", i,
+                                        "th spatial dimension had dilation ",
+                                        dilations_[input_dim]));
+    }
+
     TensorShape input_shape;
     OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &input_shape));
 
@@ -274,10 +381,11 @@ class ConvBackpropInputOp : public XlaOpKernel {
                    : filter_shape;
     // Reuse dimension computation logic from conv_grad_ops.cc.
     ConvBackpropDimensions dims;
-    OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensions(
-                            type_string(), num_spatial_dims_, input_shape,
-                            expanded_filter_shape, out_backprop_shape, strides_,
-                            padding_, data_format_, &dims));
+    OP_REQUIRES_OK(ctx,
+                   ConvBackpropComputeDimensionsV2(
+                       type_string(), num_spatial_dims_, input_shape,
+                       expanded_filter_shape, out_backprop_shape, dilations_,
+                       strides_, padding_, data_format_, &dims));
 
     xla::ComputationBuilder* b = ctx->builder();
     auto filter = ctx->Input(1);
@@ -301,6 +409,7 @@ class ConvBackpropInputOp : public XlaOpKernel {
     std::vector<int64> kernel_spatial_dims(num_spatial_dims_);
     std::vector<std::pair<int64, int64>> padding(num_spatial_dims_);
     std::vector<int64> lhs_dilation(num_spatial_dims_);
+    std::vector<int64> rhs_dilation(num_spatial_dims_);
     std::vector<int64> ones(num_spatial_dims_, 1);
     for (int i = 0; i < num_spatial_dims_; ++i) {
       int64 dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
@@ -312,6 +421,7 @@ class ConvBackpropInputOp : public XlaOpKernel {
       padding[i] = {dims.spatial_dims[i].pad_before,
                     dims.spatial_dims[i].pad_after};
       lhs_dilation[i] = dims.spatial_dims[i].stride;
+      rhs_dilation[i] = dilations_[dim];
     }
 
     // If this is a depthwise convolution, expand the filter.
@@ -328,7 +438,7 @@ class ConvBackpropInputOp : public XlaOpKernel {
     //   = gradients (with padding and dilation) <conv> mirrored_weights
     xla::ComputationDataHandle in_backprop = b->ConvGeneralDilated(
         out_backprop, mirrored_weights, /*window_strides=*/ones, padding,
-        lhs_dilation, /*rhs_dilation=*/ones, dnums);
+        lhs_dilation, rhs_dilation, dnums);
 
     ctx->SetOutput(0, in_backprop);
   }
@@ -336,6 +446,7 @@ class ConvBackpropInputOp : public XlaOpKernel {
  protected:
   const int num_spatial_dims_;
   const bool depthwise_;
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_ = FORMAT_NHWC;
@@ -349,21 +460,26 @@ class Conv2DBackpropInputOp : public ConvBackpropInputOp {
   explicit Conv2DBackpropInputOp(OpKernelConstruction* ctx)
       : ConvBackpropInputOp(ctx, /*num_spatial_dims=*/2, /*depthwise=*/false) {}
 };
-REGISTER_XLA_OP(Name("Conv2DBackpropInput"), Conv2DBackpropInputOp);
+REGISTER_XLA_OP(
+    Name("Conv2DBackpropInput").CompileTimeConstInput("input_sizes"),
+    Conv2DBackpropInputOp);
 
 class Conv3DBackpropInputOp : public ConvBackpropInputOp {
  public:
   explicit Conv3DBackpropInputOp(OpKernelConstruction* ctx)
       : ConvBackpropInputOp(ctx, /*num_spatial_dims=*/3, /*depthwise=*/false) {}
 };
-REGISTER_XLA_OP(Name("Conv3DBackpropInputV2"), Conv3DBackpropInputOp);
+REGISTER_XLA_OP(
+    Name("Conv3DBackpropInputV2").CompileTimeConstInput("input_sizes"),
+    Conv3DBackpropInputOp);
 
 class DepthwiseConv2DBackpropInputOp : public ConvBackpropInputOp {
  public:
   explicit DepthwiseConv2DBackpropInputOp(OpKernelConstruction* ctx)
       : ConvBackpropInputOp(ctx, /*num_spatial_dims=*/2, /*depthwise=*/true) {}
 };
-REGISTER_XLA_OP(Name("DepthwiseConv2dNativeBackpropInput"),
+REGISTER_XLA_OP(Name("DepthwiseConv2dNativeBackpropInput")
+                    .CompileTimeConstInput("input_sizes"),
                 DepthwiseConv2DBackpropInputOp);
 
 class ConvBackpropFilterOp : public XlaOpKernel {
@@ -373,6 +489,7 @@ class ConvBackpropFilterOp : public XlaOpKernel {
       : XlaOpKernel(ctx),
         num_spatial_dims_(num_spatial_dims),
         depthwise_(depthwise) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dilations", &dilations_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &strides_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_));
     string data_format;
@@ -392,6 +509,22 @@ class ConvBackpropFilterOp : public XlaOpKernel {
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
 
+    OP_REQUIRES(ctx, dilations_.size() == num_dims(),
+                errors::InvalidArgument("Dilations field must "
+                                        "specify ",
+                                        num_dims(), " dimensions"));
+    OP_REQUIRES(
+        ctx, dilations_[n_dim] == 1 && dilations_[c_dim] == 1,
+        errors::Unimplemented("Current implementation does not support "
+                              "dilations in the batch and depth dimensions."));
+    for (int i = 0; i < num_spatial_dims_; ++i) {
+      int input_dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
+      OP_REQUIRES(ctx, dilations_[input_dim] >= 1,
+                  errors::Unimplemented("Dilation values must be positive; ", i,
+                                        "th spatial dimension had dilation ",
+                                        dilations_[input_dim]));
+    }
+
     const TensorShape activations_shape = ctx->InputShape(0);
     TensorShape filter_shape;
     OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(1, &filter_shape));
@@ -403,10 +536,11 @@ class ConvBackpropFilterOp : public XlaOpKernel {
 
     // Reuse dimension computation logic from conv_grad_ops.cc.
     ConvBackpropDimensions dims;
-    OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensions(
-                            type_string(), num_spatial_dims_, activations_shape,
-                            expanded_filter_shape, out_backprop_shape, strides_,
-                            padding_, data_format_, &dims));
+    OP_REQUIRES_OK(ctx,
+                   ConvBackpropComputeDimensionsV2(
+                       type_string(), num_spatial_dims_, activations_shape,
+                       expanded_filter_shape, out_backprop_shape, dilations_,
+                       strides_, padding_, data_format_, &dims));
 
     xla::ComputationBuilder* b = ctx->builder();
     xla::ComputationDataHandle activations = ctx->Input(0);
@@ -426,9 +560,7 @@ class ConvBackpropFilterOp : public XlaOpKernel {
 
     // Swap n_dim and c_dim in the activations.
     dnums.set_input_batch_dimension(c_dim);
-    dnums.set_output_batch_dimension(c_dim);
     dnums.set_input_feature_dimension(n_dim);
-    dnums.set_output_feature_dimension(n_dim);
 
     // The gradients become the RHS of the convolution.
     // The gradients have shape [batch, out_rows, out_cols, ..., out_depth]
@@ -438,21 +570,29 @@ class ConvBackpropFilterOp : public XlaOpKernel {
 
     std::vector<std::pair<int64, int64>> padding(num_spatial_dims_);
     std::vector<int64> rhs_dilation(num_spatial_dims_);
+    std::vector<int64> window_strides(num_spatial_dims_);
     std::vector<int64> ones(num_spatial_dims_, 1);
 
+    // Tensorflow filter shape is [ H, W, ..., inC, outC ].
+    for (int i = 0; i < num_spatial_dims_; ++i) {
+      dnums.add_output_spatial_dimensions(i);
+    }
+    dnums.set_output_batch_dimension(num_spatial_dims_);
+    dnums.set_output_feature_dimension(num_spatial_dims_ + 1);
+
     for (int i = 0; i < num_spatial_dims_; ++i) {
       int64 dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
       dnums.add_input_spatial_dimensions(dim);
       dnums.add_kernel_spatial_dimensions(dim);
-      dnums.add_output_spatial_dimensions(dim);
 
       // We will also need to pad the input with zeros such that after the
       // convolution, we get the right size for the filter.
       // The padded_in_rows should be such that when we convolve this with the
       // expanded_out_rows as a filter, we should get filter_rows back.
       //
-      const int padded_in_size = dims.spatial_dims[i].expanded_output_size +
-                                 dims.spatial_dims[i].filter_size - 1;
+      const int64 padded_in_size =
+          dims.spatial_dims[i].expanded_output_size +
+          (dims.spatial_dims[i].filter_size - 1) * dilations_[dim];
 
       // However it can be smaller than input_rows: in this
       // case it means some of the inputs are not used.
@@ -468,8 +608,7 @@ class ConvBackpropFilterOp : public XlaOpKernel {
       // and input "C" is not used at all.
       //
       // We apply negative padding in this case.
-      const int total_pad_in_size =
-          padded_in_size - dims.spatial_dims[i].input_size;
+      const int64 pad_total = padded_in_size - dims.spatial_dims[i].input_size;
 
       // + For the VALID padding, we don't pad anything on the top/left side
       //   and pad the bottom/right side with the remaining space.
@@ -479,13 +618,12 @@ class ConvBackpropFilterOp : public XlaOpKernel {
       // In addition, if the padded input size is smaller than the input size,
       // we need to ignore some training elements of the input. We do this by
       // applying negative padding on the right/bottom.
-      const int before_pad_in_size =
-          (total_pad_in_size > 0 && padding_ == Padding::SAME)
-              ? total_pad_in_size / 2
-              : 0;
+      const int64 pad_before =
+          padding_ == Padding::SAME ? std::max<int64>(pad_total / 2, 0) : 0;
 
-      padding[i] = {before_pad_in_size, total_pad_in_size - before_pad_in_size};
+      padding[i] = {pad_before, pad_total - pad_before};
       rhs_dilation[i] = dims.spatial_dims[i].stride;
+      window_strides[i] = dilations_[dim];
     }
 
     // Besides padding the input, we will also expand output_rows to
@@ -497,35 +635,20 @@ class ConvBackpropFilterOp : public XlaOpKernel {
     // This is done by specifying the window dilation factors in the
     // convolution HLO below.
     auto filter_backprop =
-        b->ConvGeneralDilated(activations, gradients,
-                              /*window_strides=*/ones, padding,
+        b->ConvGeneralDilated(activations, gradients, window_strides, padding,
                               /*lhs_dilation=*/ones, rhs_dilation, dnums);
 
-    // The layout of filter_backprop will match the layout of
-    // padded_activations
-    // and so will have layout: [out_feature, h, w, ..., in_feature]
-    // Tensorflow filter shape is [ H, W, ..., inC, outC ], so we transpose the
-    // output.
-    std::vector<int64> transpose_dims;
-    transpose_dims.reserve(num_dims());
-    for (int i = 0; i < num_spatial_dims_; ++i) {
-      transpose_dims.push_back(dnums.output_spatial_dimensions(i));
-    }
-    transpose_dims.push_back(c_dim);
-    transpose_dims.push_back(n_dim);
-    xla::ComputationDataHandle filter_backprop_reshaped =
-        b->Transpose(filter_backprop, transpose_dims);
-
     if (depthwise_) {
-      filter_backprop_reshaped = ContractFilterForDepthwiseBackprop(
-          filter_shape, ctx->input_type(0), filter_backprop_reshaped, b);
+      filter_backprop = ContractFilterForDepthwiseBackprop(
+          ctx, filter_shape, ctx->input_type(0), filter_backprop, b);
     }
-    ctx->SetOutput(0, filter_backprop_reshaped);
+    ctx->SetOutput(0, filter_backprop);
   }
 
  protected:
   const int num_spatial_dims_;
   const bool depthwise_;
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_ = FORMAT_NHWC;
@@ -540,7 +663,9 @@ class Conv2DBackpropFilterOp : public ConvBackpropFilterOp {
       : ConvBackpropFilterOp(ctx, /*num_spatial_dims=*/2, /*depthwise=*/false) {
   }
 };
-REGISTER_XLA_OP(Name("Conv2DBackpropFilter"), Conv2DBackpropFilterOp);
+REGISTER_XLA_OP(
+    Name("Conv2DBackpropFilter").CompileTimeConstInput("filter_sizes"),
+    Conv2DBackpropFilterOp);
 
 class Conv3DBackpropFilterOp : public ConvBackpropFilterOp {
  public:
@@ -548,14 +673,17 @@ class Conv3DBackpropFilterOp : public ConvBackpropFilterOp {
       : ConvBackpropFilterOp(ctx, /*num_spatial_dims=*/3, /*depthwise=*/false) {
   }
 };
-REGISTER_XLA_OP(Name("Conv3DBackpropFilterV2"), Conv3DBackpropFilterOp);
+REGISTER_XLA_OP(
+    Name("Conv3DBackpropFilterV2").CompileTimeConstInput("filter_sizes"),
+    Conv3DBackpropFilterOp);
 
 class DepthwiseConv2DBackpropFilterOp : public ConvBackpropFilterOp {
  public:
   explicit DepthwiseConv2DBackpropFilterOp(OpKernelConstruction* ctx)
       : ConvBackpropFilterOp(ctx, /*num_spatial_dims=*/2, /*depthwise=*/true) {}
 };
-REGISTER_XLA_OP(Name("DepthwiseConv2dNativeBackpropFilter"),
+REGISTER_XLA_OP(Name("DepthwiseConv2dNativeBackpropFilter")
+                    .CompileTimeConstInput("filter_sizes"),
                 DepthwiseConv2DBackpropFilterOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc
index a4ea65ea89e348cb77412efb0c5c0fcb1a9f33f3..96d7809f7995634b6bc31ab801b93526d9da7e6f 100644
--- a/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/util/tensor_format.h"
 
 namespace tensorflow {
 namespace {
@@ -23,6 +24,16 @@ namespace {
 class DepthToSpaceOp : public XlaOpKernel {
  public:
   explicit DepthToSpaceOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    string data_format_str;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+
+    OP_REQUIRES(ctx, data_format_ == FORMAT_NCHW || data_format_ == FORMAT_NHWC,
+                errors::InvalidArgument("Unsupported data format ",
+                                        ToString(data_format_),
+                                        "; expected formats NHWC or NCHW"));
+
     OP_REQUIRES_OK(ctx, ctx->GetAttr("block_size", &block_size_));
     OP_REQUIRES(
         ctx, block_size_ > 1,
@@ -31,18 +42,79 @@ class DepthToSpaceOp : public XlaOpKernel {
 
   void Compile(XlaOpKernelContext* ctx) override {
     const TensorShape input_tensor_shape = ctx->InputShape(0);
-    // The input is presumed to be [batch, height, width, depth]
     int input_rank = input_tensor_shape.dims();
     static const int kRequiredDims = 4;
     OP_REQUIRES(ctx, kRequiredDims == input_rank,
-                errors::InvalidArgument("Input rank should be: ", kRequiredDims,
-                                        " instead of: ", input_rank));
+                errors::InvalidArgument("Input rank should be ", kRequiredDims,
+                                        "; got: ", input_rank));
     const gtl::InlinedVector<int64, 4> input_shape =
         input_tensor_shape.dim_sizes();
 
     xla::ComputationBuilder* b = ctx->builder();
     xla::ComputationDataHandle input = ctx->Input(0);
 
+    int feature_dim = GetTensorFeatureDimIndex(input_rank, data_format_);
+    int num_spatial_dims = GetTensorSpatialDims(input_rank, data_format_);
+
+    std::vector<int64> reshaped_shape;
+    std::vector<int64> transpose_order;
+    std::vector<int64> output_shape;
+    reshaped_shape.reserve(input_rank);
+    transpose_order.reserve(input_rank);
+    output_shape.reserve(input_rank);
+    if (data_format_ == FORMAT_NHWC) {
+      reshaped_shape.push_back(input_shape[0]);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        reshaped_shape.push_back(input_shape[1 + i]);
+      }
+      int64 block_elems = 1;
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        reshaped_shape.push_back(block_size_);
+        block_elems *= block_size_;
+      }
+      reshaped_shape.push_back(input_shape[feature_dim] / block_elems);
+
+      transpose_order.push_back(0);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        transpose_order.push_back(i + 1);
+        transpose_order.push_back(i + 1 + num_spatial_dims);
+      }
+      transpose_order.push_back(feature_dim + num_spatial_dims);
+
+      output_shape.push_back(input_shape[0]);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        output_shape.push_back(input_shape[1 + i] * block_size_);
+      }
+      output_shape.push_back(input_shape[feature_dim] / block_elems);
+    } else {
+      // NCHW format.
+      reshaped_shape.push_back(input_shape[0]);
+      int64 block_elems = 1;
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        reshaped_shape.push_back(block_size_);
+        block_elems *= block_size_;
+      }
+      reshaped_shape.push_back(input_shape[feature_dim] / block_elems);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        reshaped_shape.push_back(input_shape[2 + i]);
+      }
+
+      transpose_order.push_back(0);
+      transpose_order.push_back(1 + num_spatial_dims);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        transpose_order.push_back(2 + num_spatial_dims + i);
+        transpose_order.push_back(1 + i);
+      }
+
+      output_shape.push_back(input_shape[0]);
+      output_shape.push_back(input_shape[feature_dim] / block_elems);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        output_shape.push_back(input_shape[2 + i] * block_size_);
+      }
+    }
+
+    // Note: comments are given in NHWC format; NCHW is similar with a different
+    // dimension order.
     // 1. Reshape `input` to `reshaped` of shape:
     //
     //      [batch,
@@ -51,14 +123,14 @@ class DepthToSpaceOp : public XlaOpKernel {
     //       block_size_,
     //       block_size_,
     //       depth / (block_size_ * block_size_)]
-    OP_REQUIRES(ctx, input_shape[3] % (block_size_ * block_size_) == 0,
+    OP_REQUIRES(ctx,
+                input_shape[feature_dim] % (block_size_ * block_size_) == 0,
                 errors::InvalidArgument(
                     "Input depth dimension (", input_shape[3],
                     ") is not divisible by square of the block size (",
                     block_size_, ")"));
-    xla::ComputationDataHandle reshaped = b->Reshape(
-        input, {input_shape[0], input_shape[1], input_shape[2], block_size_,
-                block_size_, input_shape[3] / (block_size_ * block_size_)});
+
+    xla::ComputationDataHandle reshaped = b->Reshape(input, reshaped_shape);
 
     // 2. Permute dimensions of `reshaped` to produce
     //    `permuted_reshaped` of shape:
@@ -70,7 +142,7 @@ class DepthToSpaceOp : public XlaOpKernel {
     //       block_size_,
     //       depth / (block_size_ * block_size_)]
     xla::ComputationDataHandle permuted_reshaped =
-        b->Transpose(reshaped, {0, 1, 3, 2, 4, 5});
+        b->Transpose(reshaped, transpose_order);
 
     // 3. Reshape `permuted_reshaped` to flatten `block_shape` into the
     //    batch dimension, producing an output tensor of shape:
@@ -80,15 +152,14 @@ class DepthToSpaceOp : public XlaOpKernel {
     //       input_shape[2] * block_size_,
     //       depth / (block_size_ * block_size_)]
     //
-    xla::ComputationDataHandle output = b->Reshape(
-        permuted_reshaped, {input_shape[0], input_shape[1] * block_size_,
-                            input_shape[2] * block_size_,
-                            input_shape[3] / (block_size_ * block_size_)});
+    xla::ComputationDataHandle output =
+        b->Reshape(permuted_reshaped, output_shape);
 
     ctx->SetOutput(0, output);
   }
 
  private:
+  TensorFormat data_format_;
   int block_size_;
 };
 REGISTER_XLA_OP(Name("DepthToSpace"), DepthToSpaceOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/diag_op.cc b/tensorflow/compiler/tf2xla/kernels/diag_op.cc
index ec5017f6ab96bd3fc273a746b77fbb7e74fd9f35..765ea922a532a085a552192348ab360c4c30ff0a 100644
--- a/tensorflow/compiler/tf2xla/kernels/diag_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/diag_op.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/compiler/tf2xla/lib/util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
@@ -22,6 +24,62 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+// Create a diagonal / batch diagonal matrix with 'input' on the diagonal.
+xla::StatusOr<xla::ComputationDataHandle> CreateDiagonal(
+    const xla::ComputationDataHandle& input, int64 last_dim_size,
+    tensorflow::gtl::ArraySlice<int64> other_dims, XlaOpKernelContext* ctx,
+    xla::ComputationBuilder* builder) {
+  // Create two matrices that have the following forms, and compare them:
+  //
+  // [[0, 0, 0, 0]            [[0, 1, 2, 3]
+  //  [1, 1, 1, 1]             [0, 1, 2, 3]
+  //  [2, 2, 2, 2]             [0, 1, 2, 3]
+  //  [3, 3, 3, 3]]            [0, 1, 2, 3]]
+  //
+  // This produces a predicate matrix of the right size, with "true" on the
+  // diagonal.
+  xla::ComputationDataHandle iota;
+  TF_RETURN_IF_ERROR(
+      XlaHelpers::Iota(builder, DataType::DT_INT32, last_dim_size, &iota));
+  xla::ComputationDataHandle iota_broadcast =
+      builder->Broadcast(iota, {last_dim_size});
+  xla::ComputationDataHandle mask = builder->Eq(iota_broadcast, iota, {0});
+
+  // If this is a batched diagonal, broadcast the mask across the other
+  // dimensions.
+  if (!other_dims.empty()) {
+    mask = builder->Broadcast(mask, other_dims);
+  }
+
+  // Broadcast the input, and then use the mask computed above to select the
+  // diagonal:
+  // e.g, in 2D:
+  //         [[t, f, f]    [[1, 1, 1]    [[0, 0, 0]      [[1, 0, 0]
+  // select(  [f, t, f]  ,  [4, 4, 4]  ,  [0, 0, 0]  ) =  [0, 4, 0]
+  //          [f, f, t]]    [9, 9, 9]]    [0, 0, 0]]      [0, 0, 9]]
+  //
+  // Broadcasting the input is less-than-trivial, since we need to broadcast
+  // into a "middle" dimension. We can do this with a reshape + implicit
+  // broadcast.
+  // TODO(b/30112114): Replace with in-dim broadcast when those are supported.
+  std::vector<int64> broadcast_dims(other_dims.begin(), other_dims.end());
+  broadcast_dims.push_back(1LL);
+  broadcast_dims.push_back(last_dim_size);
+  xla::ComputationDataHandle input_broadcast =
+      builder->Reshape(input, broadcast_dims);
+
+  broadcast_dims[broadcast_dims.size() - 2] = last_dim_size;
+  xla::PrimitiveType element_type;
+  TF_RETURN_IF_ERROR(
+      DataTypeToPrimitiveType(ctx->input_type(0), &element_type));
+  auto broadcast_shape =
+      xla::ShapeUtil::MakeShape(element_type, broadcast_dims);
+  xla::ComputationDataHandle zeros = Zeros(builder, broadcast_shape);
+
+  input_broadcast = builder->Add(input_broadcast, zeros);
+  return builder->Select(mask, input_broadcast, zeros);
+}
+
 class DiagOp : public XlaOpKernel {
  public:
   explicit DiagOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
@@ -29,6 +87,8 @@ class DiagOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     xla::ComputationBuilder* builder = ctx->builder();
 
+    OP_REQUIRES(ctx, ctx->num_inputs() >= 1,
+                errors::InvalidArgument("Diag op must have at an input"));
     const TensorShape input_shape = ctx->InputShape(0);
 
     auto dims = input_shape.dim_sizes();
@@ -36,7 +96,7 @@ class DiagOp : public XlaOpKernel {
                 errors::InvalidArgument("Expected 1 <= dims, got shape ",
                                         input_shape.DebugString()));
 
-    xla::ComputationDataHandle diag = ctx->Input(0);
+    xla::ComputationDataHandle input = ctx->Input(0);
 
     // Picture:
     // tf.diag([1, 2, 3, 4]) ==> [[1, 0, 0, 0]
@@ -46,13 +106,13 @@ class DiagOp : public XlaOpKernel {
 
     // Flattens the input to 1D.
     int64 size = input_shape.num_elements();
-    diag = builder->Reshape(diag, {size});
+    input = builder->Reshape(input, {size});
 
-    // Adds inter-element padding of 'size'.
-    xla::PaddingConfig config;
-    auto* dim = config.add_dimensions();
-    dim->set_interior_padding(size);
-    diag = builder->Pad(diag, XlaHelpers::Zero(builder, input_type(0)), config);
+    // Create an R2 with the R1 diagonal.
+    auto diag_or_status =
+        CreateDiagonal(input, size, /*other_dims=*/{}, ctx, builder);
+    OP_REQUIRES_OK(ctx, diag_or_status.status());
+    xla::ComputationDataHandle diag = diag_or_status.ValueOrDie();
 
     // Reshapes to the final shape.
     std::vector<int64> new_dims(dims.size() * 2);
@@ -141,6 +201,8 @@ class MatrixDiagOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     xla::ComputationBuilder* builder = ctx->builder();
 
+    OP_REQUIRES(ctx, ctx->num_inputs() >= 1,
+                errors::InvalidArgument("MatrixDiag op must have at an input"));
     const TensorShape input_shape = ctx->InputShape(0);
 
     auto dims = input_shape.dim_sizes();
@@ -152,17 +214,13 @@ class MatrixDiagOp : public XlaOpKernel {
 
     int last_dim = dims.size() - 1;
     int64 last_dim_size = input_shape.dim_size(last_dim);
+    tensorflow::gtl::ArraySlice<int64> other_dims(dims);
+    other_dims.pop_back();
 
-    // Adds inter-element padding of 'last_dim_size' to the last dimension.
-    xla::PaddingConfig config = xla::MakeNoPaddingConfig(dims.size());
-    auto* dim = config.mutable_dimensions(last_dim);
-    dim->set_interior_padding(last_dim_size);
-    diag = builder->Pad(diag, XlaHelpers::Zero(builder, input_type(0)), config);
-
-    // Reshapes to the final shape.
-    dims.push_back(last_dim_size);
-    diag = builder->Reshape(diag, dims);
-
+    auto diag_or_status =
+        CreateDiagonal(diag, last_dim_size, other_dims, ctx, builder);
+    OP_REQUIRES_OK(ctx, diag_or_status.status());
+    diag = diag_or_status.ValueOrDie();
     ctx->SetOutput(0, diag);
   }
 };
diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
index 7349dcb987cd88c423570889c0502d1a0bd12c52..f2cd21ffb9ce88747c04f3c71e66dadeb1faf0f9 100644
--- a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
@@ -72,22 +72,24 @@ class DynamicStitchOp : public XlaOpKernel {
                      XLAShapeToTensorShape(indices_input[input_num].shape(),
                                            &indices_shape));
       const TensorShape& data_shape = data_shapes[input_num];
-      OP_REQUIRES(ctx, TensorShapeUtils::StartsWith(data_shape, indices_shape),
-                  errors::InvalidArgument(
-                      "data[", input_num, "].shape = ",
-                      data_shape.DebugString(), " does not start with indices[",
-                      input_num, "].shape = ", indices_shape.DebugString()));
-      OP_REQUIRES(ctx,
-                  input_num == 0 || SameExtraShape(data0_shape, indices0_shape,
-                                                   data_shape, indices_shape),
-                  errors::InvalidArgument(
-                      "Need data[0].shape[", indices0_shape.dims(),
-                      ":] = data[", input_num, "].shape[", indices_shape.dims(),
-                      ":], got data[0].shape = ", data0_shape.DebugString(),
-                      ", data[", input_num, "].shape = ",
-                      data_shape.DebugString(), ", indices[0].shape = ",
-                      indices0_shape.DebugString(), ", indices[", input_num,
-                      "].shape = ", indices_shape.DebugString()));
+      OP_REQUIRES(
+          ctx, TensorShapeUtils::StartsWith(data_shape, indices_shape),
+          errors::InvalidArgument("data[", input_num,
+                                  "].shape = ", data_shape.DebugString(),
+                                  " does not start with indices[", input_num,
+                                  "].shape = ", indices_shape.DebugString()));
+      OP_REQUIRES(
+          ctx,
+          input_num == 0 || SameExtraShape(data0_shape, indices0_shape,
+                                           data_shape, indices_shape),
+          errors::InvalidArgument(
+              "Need data[0].shape[", indices0_shape.dims(), ":] = data[",
+              input_num, "].shape[", indices_shape.dims(),
+              ":], got data[0].shape = ", data0_shape.DebugString(), ", data[",
+              input_num, "].shape = ", data_shape.DebugString(),
+              ", indices[0].shape = ", indices0_shape.DebugString(),
+              ", indices[", input_num,
+              "].shape = ", indices_shape.DebugString()));
 
       OP_REQUIRES_OK(ctx,
                      XlaHelpers::ReshapeLiteral(indices_input[input_num],
@@ -159,8 +161,8 @@ class DynamicStitchOp : public XlaOpKernel {
                                    indices0_shape.dims());
     std::vector<int64> slice_limit(1 + data0_shape.dims() -
                                    indices0_shape.dims());
-    std::vector<int64> stride(1 + data0_shape.dims() -
-                              indices0_shape.dims(), 1);
+    std::vector<int64> stride(1 + data0_shape.dims() - indices0_shape.dims(),
+                              1);
     for (int d = indices0_shape.dims(); d < data0_shape.dims(); d++) {
       slice_limit[1 + d - indices0_shape.dims()] = data0_shape.dim_size(d);
     }
@@ -198,8 +200,10 @@ class DynamicStitchOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("DynamicStitch"), DynamicStitchOp);
-REGISTER_XLA_OP(Name("ParallelDynamicStitch"), DynamicStitchOp);
+REGISTER_XLA_OP(Name("DynamicStitch").CompileTimeConstInput("indices"),
+                DynamicStitchOp);
+REGISTER_XLA_OP(Name("ParallelDynamicStitch").CompileTimeConstInput("indices"),
+                DynamicStitchOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b2970eae20a3fb71f06619f476a49d41b22bca56
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
@@ -0,0 +1,169 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+namespace tensorflow {
+
+namespace {
+
+class ExtractImagePatchesOp : public XlaOpKernel {
+ public:
+  explicit ExtractImagePatchesOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("ksizes", &ksizes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &strides_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("rates", &dilations_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    const TensorFormat data_format = FORMAT_NHWC;
+    const int num_dims = ksizes_.size();
+
+    OP_REQUIRES(
+        ctx, num_dims >= 3,
+        errors::InvalidArgument("Kernel size must have at least 3 dimensions"));
+    const int num_spatial_dims = num_dims - 2;
+
+    OP_REQUIRES(ctx, strides_.size() == num_dims,
+                errors::InvalidArgument("Sliding window strides field must "
+                                        "specify ",
+                                        num_dims, " dimensions"));
+    OP_REQUIRES(ctx, dilations_.size() == num_dims,
+                errors::InvalidArgument("Dilations field must "
+                                        "specify ",
+                                        num_dims, " dimensions"));
+
+    int batch_dim = GetTensorBatchDimIndex(num_dims, data_format);
+    int feature_dim = GetTensorFeatureDimIndex(num_dims, data_format);
+    OP_REQUIRES(
+        ctx, ksizes_[batch_dim] == 1 && ksizes_[feature_dim] == 1,
+        errors::Unimplemented("Current implementation does not yet support "
+                              "kernel sizes > 1 in the batch and depth "
+                              "dimensions."));
+    OP_REQUIRES(
+        ctx, strides_[batch_dim] == 1 && strides_[feature_dim] == 1,
+        errors::Unimplemented("Current implementation does not yet support "
+                              "strides in the batch and depth dimensions."));
+    OP_REQUIRES(
+        ctx, dilations_[batch_dim] == 1 && dilations_[feature_dim] == 1,
+        errors::Unimplemented("Current implementation does not support "
+                              "dilations in the batch and depth dimensions."));
+
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      int input_dim = GetTensorSpatialDimIndex(num_dims, data_format, i);
+      OP_REQUIRES(
+          ctx, ksizes_[input_dim] >= 0,
+          errors::Unimplemented("Kernel size values must be non-negative; ", i,
+                                "th spatial dimension had dilation ",
+                                dilations_[input_dim]));
+      OP_REQUIRES(ctx, strides_[input_dim] >= 1,
+                  errors::Unimplemented("Stride values must be positive; ", i,
+                                        "th spatial dimension had dilation ",
+                                        dilations_[input_dim]));
+      OP_REQUIRES(ctx, dilations_[input_dim] >= 1,
+                  errors::Unimplemented("Dilation values must be positive; ", i,
+                                        "th spatial dimension had dilation ",
+                                        dilations_[input_dim]));
+    }
+
+    xla::PrimitiveType type;
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(ctx->input_type(0), &type));
+
+    const TensorShape input_shape = ctx->InputShape(0);
+    OP_REQUIRES(
+        ctx, input_shape.dims() == num_dims,
+        errors::InvalidArgument("input must be ", num_dims, "-dimensional",
+                                input_shape.DebugString()));
+    const int64 depth = input_shape.dim_size(feature_dim);
+
+    xla::ComputationBuilder* builder = ctx->builder();
+
+    // The following code is equivalent to:
+    // eye = np.eye(kH * kW * D).reshape([kH, kW, D, kH * kW * kD])
+    int64 kernel_size = 1;
+    std::vector<int64> lhs_shape(num_dims, 1);
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      int input_dim = GetTensorSpatialDimIndex(num_dims, data_format, i);
+      lhs_shape[i] = ksizes_[input_dim];
+      kernel_size *= ksizes_[input_dim];
+    }
+    lhs_shape[num_spatial_dims] = depth;
+    lhs_shape[num_spatial_dims + 1] = 1;
+
+    // Builds an identity matrix as a broadcast equality of iotas.
+    // iota = np.arange(np.prod(ksize), depth)
+    // filter = np.equal(np.reshape(iota, [-1, 1]), iota).astype(np.float32)
+    xla::ComputationDataHandle iota;
+    TF_CHECK_OK(XlaHelpers::Iota(builder, DataType::DT_INT32,
+                                 kernel_size * depth, &iota));
+
+    auto lhs = builder->Reshape(iota, lhs_shape);
+    auto filter = builder->ConvertElementType(
+        builder->Eq(lhs, iota, {num_spatial_dims + 1}), type);
+
+    xla::ConvolutionDimensionNumbers dims;
+    std::vector<int64> window_strides(num_spatial_dims);
+    std::vector<int64> lhs_dilation(num_spatial_dims, 1);
+    std::vector<int64> rhs_dilation(num_spatial_dims);
+    std::vector<std::pair<int64, int64>> padding(num_spatial_dims);
+
+    dims.set_input_batch_dimension(batch_dim);
+    dims.set_output_batch_dimension(batch_dim);
+    dims.set_input_feature_dimension(feature_dim);
+    dims.set_output_feature_dimension(feature_dim);
+    dims.set_kernel_input_feature_dimension(num_spatial_dims);
+    dims.set_kernel_output_feature_dimension(num_spatial_dims + 1);
+
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      const int64 dim = GetTensorSpatialDimIndex(num_dims, data_format, i);
+      dims.add_input_spatial_dimensions(dim);
+      dims.add_kernel_spatial_dimensions(i);
+      dims.add_output_spatial_dimensions(dim);
+      window_strides[i] = strides_.at(dim);
+      rhs_dilation[i] = dilations_.at(dim);
+
+      int64 unused_output_size;
+      OP_REQUIRES_OK(
+          ctx, GetWindowedOutputSizeVerboseV2(
+                   input_shape.dim_size(dim), ksizes_[dim], rhs_dilation[i],
+                   window_strides[i], padding_, &unused_output_size,
+                   &padding[i].first, &padding[i].second));
+    }
+
+    xla::ComputationDataHandle conv =
+        builder->ConvGeneralDilated(ctx->Input(0), filter, window_strides,
+                                    padding, lhs_dilation, rhs_dilation, dims);
+    ctx->SetOutput(0, conv);
+  }
+
+ protected:
+  std::vector<int32> ksizes_;
+  std::vector<int32> dilations_;
+  std::vector<int32> strides_;
+  Padding padding_;
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(ExtractImagePatchesOp);
+};
+
+REGISTER_XLA_OP(Name("ExtractImagePatches"), ExtractImagePatchesOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..453a32c494b42e9922bc35fc526f3306530054fd
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc
@@ -0,0 +1,289 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+namespace {
+
+// Gymnastics with nudged zero point is to ensure that the real zero maps to
+// an integer, which is required for e.g. zero-padding in convolutional layers.
+void CpuNudge(const float min, const float max, const float quant_min,
+              const float quant_max, float* nudged_min, float* nudged_max,
+              float* scale) {
+  *scale = (max - min) / (quant_max - quant_min);
+
+  const float zero_point_from_min = quant_min - min / *scale;
+  float nudged_zero_point;
+  if (zero_point_from_min <= quant_min) {
+    nudged_zero_point = quant_min;
+  } else if (zero_point_from_min >= quant_max) {
+    nudged_zero_point = quant_max;
+  } else {
+    nudged_zero_point = std::round(zero_point_from_min);
+  }
+
+  *nudged_min = (quant_min - nudged_zero_point) * (*scale);
+  *nudged_max = (quant_max - nudged_zero_point) * (*scale);
+}
+
+// An XLA version of CpuNudge().
+void XlaNudge(xla::ComputationBuilder* b, const DataType data_type,
+              const xla::ComputationDataHandle& min,
+              const xla::ComputationDataHandle& max,
+              const float quant_min_value, const float quant_max_value,
+              xla::ComputationDataHandle* nudged_min,
+              xla::ComputationDataHandle* nudged_max,
+              xla::ComputationDataHandle* scale) {
+  *scale = b->Div(b->Sub(max, min),
+                  XlaHelpers::FloatLiteral(b, data_type,
+                                           quant_max_value - quant_min_value));
+  xla::ComputationDataHandle quant_min =
+      XlaHelpers::FloatLiteral(b, data_type, quant_min_value);
+  xla::ComputationDataHandle zero_point_from_min =
+      b->Sub(quant_min, b->Div(min, *scale));
+  xla::ComputationDataHandle quant_max =
+      XlaHelpers::FloatLiteral(b, data_type, quant_max_value);
+  xla::ComputationDataHandle nudged_zero_point =
+      b->Select(b->Le(zero_point_from_min, quant_min), quant_min,
+                b->Select(b->Ge(zero_point_from_min, quant_max), quant_max,
+                          b->Round(zero_point_from_min)));
+  *nudged_min = b->Mul(b->Sub(quant_min, nudged_zero_point), *scale);
+  *nudged_max = b->Mul(b->Sub(quant_max, nudged_zero_point), *scale);
+}
+
+xla::ComputationDataHandle Quantize(
+    xla::ComputationBuilder* b, const xla::ComputationDataHandle& input,
+    const DataType data_type,
+    const xla::ComputationDataHandle& nudged_input_min,
+    const xla::ComputationDataHandle& nudged_input_max,
+    const xla::ComputationDataHandle& input_scale) {
+  xla::ComputationDataHandle one = XlaHelpers::FloatLiteral(b, data_type, 1.0f);
+  xla::ComputationDataHandle inv_scale = b->Div(one, input_scale);
+  xla::ComputationDataHandle half =
+      XlaHelpers::FloatLiteral(b, data_type, 0.5f);
+
+  xla::ComputationDataHandle clamped =
+      b->Clamp(nudged_input_min, input, nudged_input_max);
+  xla::ComputationDataHandle clamped_shifted =
+      b->Sub(clamped, nudged_input_min);
+  xla::ComputationDataHandle rounded =
+      b->Floor(b->Add(b->Mul(clamped_shifted, inv_scale), half));
+  return b->Add(b->Mul(rounded, input_scale), nudged_input_min);
+}
+
+class FakeQuantWithMinMaxArgsOp : public XlaOpKernel {
+ public:
+  explicit FakeQuantWithMinMaxArgsOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    int num_bits;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("num_bits", &num_bits));
+    OP_REQUIRES(ctx, num_bits >= 2 && num_bits <= 16,
+                errors::InvalidArgument("num_bits is out of range, expected "
+                                        "between 2 and 16, was: ",
+                                        num_bits));
+    bool narrow_range;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("narrow_range", &narrow_range));
+    quant_min_ = narrow_range ? 1 : 0;
+    quant_max_ = (1 << num_bits) - 1;
+
+    float input_min, input_max;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("min", &input_min));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("max", &input_max));
+    CpuNudge(input_min, input_max, quant_min_, quant_max_, &nudged_input_min_,
+             &nudged_input_max_, &input_scale_);
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::ComputationDataHandle input = ctx->Input(0);
+    const DataType data_type = ctx->input_type(0);
+
+    xla::ComputationBuilder* b = ctx->builder();
+    xla::ComputationDataHandle nudged_input_min =
+        XlaHelpers::FloatLiteral(b, data_type, nudged_input_min_);
+    xla::ComputationDataHandle nudged_input_max =
+        XlaHelpers::FloatLiteral(b, data_type, nudged_input_max_);
+    xla::ComputationDataHandle input_scale =
+        XlaHelpers::FloatLiteral(b, data_type, input_scale_);
+    xla::ComputationDataHandle output = Quantize(
+        b, input, data_type, nudged_input_min, nudged_input_max, input_scale);
+    ctx->SetOutput(0, output);
+  }
+
+ private:
+  float quant_min_;
+  float quant_max_;
+  float nudged_input_min_;
+  float nudged_input_max_;
+  float input_scale_;
+};
+
+REGISTER_XLA_OP(Name("FakeQuantWithMinMaxArgs"), FakeQuantWithMinMaxArgsOp);
+
+class FakeQuantWithMinMaxArgsGradOp : public XlaOpKernel {
+ public:
+  explicit FakeQuantWithMinMaxArgsGradOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    int num_bits;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("num_bits", &num_bits));
+    OP_REQUIRES(ctx, num_bits >= 2 && num_bits <= 16,
+                errors::InvalidArgument("num_bits is out of range, expected "
+                                        "between 2 and 16, was: ",
+                                        num_bits));
+    bool narrow_range;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("narrow_range", &narrow_range));
+    const float quant_min = narrow_range ? 1 : 0;
+    const float quant_max = (1 << num_bits) - 1;
+
+    float input_min, input_max, scale;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("min", &input_min));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("max", &input_max));
+    CpuNudge(input_min, input_max, quant_min, quant_max, &nudged_input_min_,
+             &nudged_input_max_, &scale);
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::ComputationDataHandle gradient = ctx->Input(0);
+    const TensorShape gradient_shape = ctx->InputShape(0);
+    xla::ComputationDataHandle input = ctx->Input(1);
+    const DataType data_type = ctx->input_type(1);
+
+    xla::ComputationBuilder* b = ctx->builder();
+    xla::ComputationDataHandle nudged_input_min =
+        XlaHelpers::FloatLiteral(b, data_type, nudged_input_min_);
+    xla::ComputationDataHandle nudged_input_max =
+        XlaHelpers::FloatLiteral(b, data_type, nudged_input_max_);
+
+    xla::ComputationDataHandle between_nudged_min_max =
+        b->And(b->Le(nudged_input_min, input), b->Le(input, nudged_input_max));
+    xla::ComputationDataHandle zeroes = b->Broadcast(
+        XlaHelpers::Zero(b, data_type), gradient_shape.dim_sizes());
+    xla::ComputationDataHandle output =
+        b->Select(between_nudged_min_max, gradient, zeroes);
+    ctx->SetOutput(0, output);
+  }
+
+ private:
+  float nudged_input_min_;
+  float nudged_input_max_;
+};
+
+REGISTER_XLA_OP(Name("FakeQuantWithMinMaxArgsGradient"),
+                FakeQuantWithMinMaxArgsGradOp);
+
+class FakeQuantWithMinMaxVarsOp : public XlaOpKernel {
+ public:
+  explicit FakeQuantWithMinMaxVarsOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    int num_bits;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("num_bits", &num_bits));
+    OP_REQUIRES(ctx, num_bits >= 2 && num_bits <= 16,
+                errors::InvalidArgument("num_bits is out of range, expected "
+                                        "between 2 and 16, was: ",
+                                        num_bits));
+    bool narrow_range;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("narrow_range", &narrow_range));
+    quant_min_ = narrow_range ? 1 : 0;
+    quant_max_ = (1 << num_bits) - 1;
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::ComputationDataHandle input = ctx->Input(0);
+    const DataType data_type = ctx->input_type(0);
+    xla::ComputationDataHandle input_min = ctx->Input(1);
+    xla::ComputationDataHandle input_max = ctx->Input(2);
+
+    xla::ComputationBuilder* b = ctx->builder();
+    xla::ComputationDataHandle nudged_input_min, nudged_input_max, input_scale;
+    XlaNudge(b, data_type, input_min, input_max, quant_min_, quant_max_,
+             &nudged_input_min, &nudged_input_max, &input_scale);
+
+    xla::ComputationDataHandle output = Quantize(
+        b, input, data_type, nudged_input_min, nudged_input_max, input_scale);
+    ctx->SetOutput(0, output);
+  }
+
+ private:
+  float quant_min_;
+  float quant_max_;
+};
+
+REGISTER_XLA_OP(Name("FakeQuantWithMinMaxVars"), FakeQuantWithMinMaxVarsOp);
+
+class FakeQuantWithMinMaxVarsGradOp : public XlaOpKernel {
+ public:
+  explicit FakeQuantWithMinMaxVarsGradOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    int num_bits;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("num_bits", &num_bits));
+    OP_REQUIRES(ctx, num_bits >= 2 && num_bits <= 16,
+                errors::InvalidArgument("num_bits is out of range, expected "
+                                        "between 2 and 16, was: ",
+                                        num_bits));
+    bool narrow_range;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("narrow_range", &narrow_range));
+    quant_min_ = narrow_range ? 1 : 0;
+    quant_max_ = (1 << num_bits) - 1;
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::ComputationDataHandle gradient = ctx->Input(0);
+    const TensorShape gradient_shape = ctx->InputShape(0);
+    xla::ComputationDataHandle input = ctx->Input(1);
+    const DataType data_type = ctx->input_type(1);
+    xla::ComputationDataHandle input_min = ctx->Input(2);
+    xla::ComputationDataHandle input_max = ctx->Input(3);
+
+    xla::ComputationBuilder* b = ctx->builder();
+    xla::ComputationDataHandle nudged_input_min, nudged_input_max, input_scale;
+    XlaNudge(b, data_type, input_min, input_max, quant_min_, quant_max_,
+             &nudged_input_min, &nudged_input_max, &input_scale);
+
+    xla::ComputationDataHandle between_nudged_min_max =
+        b->And(b->Le(nudged_input_min, input), b->Le(input, nudged_input_max));
+    xla::ComputationDataHandle zero = XlaHelpers::Zero(b, data_type);
+    xla::ComputationDataHandle zeroes =
+        b->Broadcast(zero, gradient_shape.dim_sizes());
+    xla::ComputationDataHandle output0 =
+        b->Select(between_nudged_min_max, gradient, zeroes);
+    ctx->SetOutput(0, output0);
+
+    xla::ComputationDataHandle below_min = b->Lt(input, nudged_input_min);
+    xla::ComputationDataHandle output1 =
+        b->ReduceAll(b->Select(below_min, gradient, zeroes), zero,
+                     *ctx->GetOrCreateAdd(data_type));
+    ctx->SetOutput(1, output1);
+
+    xla::ComputationDataHandle above_max = b->Gt(input, nudged_input_max);
+    xla::ComputationDataHandle output2 =
+        b->ReduceAll(b->Select(above_max, gradient, zeroes), zero,
+                     *ctx->GetOrCreateAdd(data_type));
+    ctx->SetOutput(2, output2);
+  }
+
+ private:
+  float quant_min_;
+  float quant_max_;
+};
+
+REGISTER_XLA_OP(Name("FakeQuantWithMinMaxVarsGradient"),
+                FakeQuantWithMinMaxVarsGradOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a4f3c1c3ad9a928e0552c388a25ed9fcb08edabb
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
@@ -0,0 +1,122 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// XLA-specific Ops for FFT.
+
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/conv_grad_ops.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+namespace tensorflow {
+
+namespace {
+
+using xla::FftType;
+
+class GenericFftOp : public XlaOpKernel {
+ public:
+  explicit GenericFftOp(OpKernelConstruction* ctx, FftType fft_type,
+                        int fft_rank)
+      : XlaOpKernel(ctx), fft_type_(fft_type), fft_rank_(fft_rank) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    const TensorShape input_shape = ctx->InputShape(0);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsVectorOrHigher(input_shape),
+        errors::InvalidArgument("input must be at least 1 dimensional"));
+
+    std::vector<int64> fft_length;
+    if (fft_type_ == FftType::RFFT || fft_type_ == FftType::IRFFT) {
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &fft_length));
+      OP_REQUIRES(ctx, fft_length.size() == fft_rank_,
+                  errors::InvalidArgument("fft_length must be length ",
+                                          fft_rank_, " vector"));
+    } else {
+      // Innermost axis provides the FFT length.
+      for (int i = 0; i < fft_rank_; i++) {
+        fft_length.push_back(
+            input_shape.dim_size(input_shape.dims() - fft_rank_ + i));
+      }
+    }
+
+    xla::ComputationBuilder* b = ctx->builder();
+    xla::ComputationDataHandle fft =
+        b->Fft(ctx->Input(0), fft_type_, fft_length);
+    ctx->SetOutput(0, fft);
+  }
+
+ protected:
+  const FftType fft_type_;
+  const int fft_rank_;
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(GenericFftOp);
+};
+
+template <int FFTRank>
+class FFTOp : public GenericFftOp {
+ public:
+  explicit FFTOp(OpKernelConstruction* ctx)
+      : GenericFftOp(ctx, /*fft_type=*/FftType::FFT, /*fft_rank=*/FFTRank) {}
+};
+REGISTER_XLA_OP(Name("FFT"), FFTOp<1>);
+REGISTER_XLA_OP(Name("FFT2D"), FFTOp<2>);
+REGISTER_XLA_OP(Name("FFT3D"), FFTOp<3>);
+
+template <int FFTRank>
+class IFFTOp : public GenericFftOp {
+ public:
+  explicit IFFTOp(OpKernelConstruction* ctx)
+      : GenericFftOp(ctx, /*fft_type=*/FftType::IFFT, /*fft_rank=*/FFTRank) {}
+};
+REGISTER_XLA_OP(Name("IFFT"), IFFTOp<1>);
+REGISTER_XLA_OP(Name("IFFT2D"), IFFTOp<2>);
+REGISTER_XLA_OP(Name("IFFT3D"), IFFTOp<3>);
+
+template <int FFTRank>
+class RFFTOp : public GenericFftOp {
+ public:
+  explicit RFFTOp(OpKernelConstruction* ctx)
+      : GenericFftOp(ctx, /*fft_type=*/FftType::RFFT, /*fft_rank=*/FFTRank) {}
+};
+REGISTER_XLA_OP(Name("RFFT").CompileTimeConstInput("fft_length"), RFFTOp<1>);
+REGISTER_XLA_OP(Name("RFFT2D").CompileTimeConstInput("fft_length"), RFFTOp<2>);
+REGISTER_XLA_OP(Name("RFFT3D").CompileTimeConstInput("fft_length"), RFFTOp<3>);
+
+template <int FFTRank>
+class IRFFTOp : public GenericFftOp {
+ public:
+  explicit IRFFTOp(OpKernelConstruction* ctx)
+      : GenericFftOp(ctx, /*fft_type=*/FftType::IRFFT, /*fft_rank=*/FFTRank) {}
+};
+REGISTER_XLA_OP(Name("IRFFT").CompileTimeConstInput("fft_length"), IRFFTOp<1>);
+REGISTER_XLA_OP(Name("IRFFT2D").CompileTimeConstInput("fft_length"),
+                IRFFTOp<2>);
+REGISTER_XLA_OP(Name("IRFFT3D").CompileTimeConstInput("fft_length"),
+                IRFFTOp<3>);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/fill_op.cc b/tensorflow/compiler/tf2xla/kernels/fill_op.cc
index 9e090fe01cbfd4dab81b0de21e3a44e42c2ef18e..eaa13b8dfacce9aaca42ce5fcdfa467ce7fa7b7f 100644
--- a/tensorflow/compiler/tf2xla/kernels/fill_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fill_op.cc
@@ -69,7 +69,7 @@ class FillOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("Fill"), FillOp);
+REGISTER_XLA_OP(Name("Fill").CompileTimeConstInput("dims"), FillOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
index e420f21ca33fe7de9b33f404ce04eae62d9c041e..7945c05af40df21a798a2cff51fe7f8e935793f6 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/kernels/gather_op.h"
 #include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h"
+#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
@@ -26,46 +26,48 @@ limitations under the License.
 
 namespace tensorflow {
 
-xla::ComputationDataHandle XlaComputeGatherDynamicSlice(
-    XlaOpKernelContext* context, const xla::ComputationDataHandle& input,
-    const TensorShape& input_shape, const xla::ComputationDataHandle& indices,
-    const TensorShape& indices_shape, int64 axis, DataType dtype,
-    DataType index_type, xla::ComputationBuilder* builder) {
+Status XlaGather(const xla::ComputationDataHandle& input,
+                 const TensorShape& input_shape,
+                 const xla::ComputationDataHandle& indices,
+                 TensorShape indices_shape, int64 axis, bool indices_are_nd,
+                 DataType dtype, DataType index_type,
+                 xla::ComputationBuilder* builder,
+                 xla::ComputationDataHandle* gather_output) {
+  // If the indices are N-dimensional, then the minor dimension of indices
+  // should be of size N and correspond to the N indices.
+  int64 num_index_dims = 1;
+  if (indices_are_nd) {
+    CHECK_GE(indices_shape.dims(), 1);
+    num_index_dims = indices_shape.dim_size(indices_shape.dims() - 1);
+    indices_shape.RemoveLastDims(1);
+  }
+
   // Although the indices Tensor is flattened into rank 1 during the lookup,
   // and each scalar entry is used as an index into the first dimension of the
   // input, the output is returned with shape:
   // input.shape[:axis] + indices.shape + input.shape[axis+1:]
-  const int num_indices = indices_shape.num_elements();
+
+  const int64 num_indices = indices_shape.num_elements();
   TensorShape input_shape_pre_axis(input_shape);
   input_shape_pre_axis.RemoveDimRange(axis, input_shape.dims());
   TensorShape input_shape_post_axis(input_shape);
-  input_shape_post_axis.RemoveDimRange(0, axis + 1);
-
+  input_shape_post_axis.RemoveDimRange(0, axis + num_index_dims);
   // Each slice of the input tensor has shape:
-  // [<input_shape_pre_axis>, 1, <input shape_post_axis>]
+  // [<input_shape_pre_axis>, 1, ..., 1, <input shape_post_axis>]
   TensorShape slice_shape(input_shape);
-  slice_shape.set_dim(axis, 1);
-
-  // TODO(b/37575001) The tensor in which we construct the output during
-  // the loop must have rank >= 3 as a workaround for lowering issues.
-  int64 extra_dims = 0;
-  if (input_shape.dims() < 3) extra_dims = 3 - input_shape.dims();
+  for (int64 i = 0; i < num_index_dims; ++i) {
+    slice_shape.set_dim(axis + i, 1);
+  }
 
   TensorShape loop_out_shape;
-  for (int64 k = 0; k < extra_dims; ++k) loop_out_shape.AddDim(1);
   loop_out_shape.AppendShape(input_shape_pre_axis);
   loop_out_shape.AddDim(num_indices);
   loop_out_shape.AppendShape(input_shape_post_axis);
-
-  // Slices are reshaped into the rank >= 3 shape of the loop carried output.
   TensorShape loop_out_slice_shape;
-  for (int64 k = 0; k < extra_dims; ++k) loop_out_slice_shape.AddDim(1);
   loop_out_slice_shape.AppendShape(input_shape_pre_axis);
   loop_out_slice_shape.AddDim(1);
   loop_out_slice_shape.AppendShape(input_shape_post_axis);
 
-  // Finally, the loop-carried rank >= 3 output is reshaped to the op's
-  // specified result shape.
   TensorShape out_shape;
   out_shape.AppendShape(input_shape_pre_axis);
   out_shape.AppendShape(indices_shape);
@@ -73,131 +75,176 @@ xla::ComputationDataHandle XlaComputeGatherDynamicSlice(
 
   // Degenerate case: empty indices.
   if (num_indices == 0) {
-    return builder->Broadcast(XlaHelpers::Zero(builder, dtype),
-                              out_shape.dim_sizes());
+    *gather_output = builder->Broadcast(XlaHelpers::Zero(builder, dtype),
+                                        out_shape.dim_sizes());
+    return Status::OK();
+  }
+
+  for (int64 i = 0; i < num_index_dims; ++i) {
+    if (input_shape.dim_size(axis + i) == 0) {
+      return errors::InvalidArgument("Gather dimension ", axis + i,
+                                     " is of size zero in tensor with shape ",
+                                     input_shape.DebugString());
+    }
+  }
+
+  // Flatten the major dimensions of indices into a single dimension for ease of
+  // iteration. If there is an axis dimension, we must leave it alone.
+  std::vector<int64> flat_indices_shape = {num_indices};
+  if (indices_are_nd) {
+    flat_indices_shape.push_back(num_index_dims);
   }
 
   // Specify the shape of the loop-carried Tensor tuple.
-  xla::PrimitiveType ptype;
-  TF_CHECK_OK(DataTypeToPrimitiveType(dtype, &ptype));
-  xla::PrimitiveType idxtype;
-  TF_CHECK_OK(DataTypeToPrimitiveType(index_type, &idxtype));
-  std::vector<xla::Shape> tuple_shapes(
-      {// The iteration counter i is a scalar, incremented each iteration.
-       xla::ShapeUtil::MakeShape(idxtype, {}),
-       // The input array has shape input_shape. Loop invariant.
-       xla::ShapeUtil::MakeShape(ptype, input_shape.dim_sizes()),
-       // The gather indices are reshaped to rank 1. Loop invariant.
-       xla::ShapeUtil::MakeShape(idxtype, {num_indices}),
-       // The output array is rank >= 3, and is updated on each loop iteration.
-       xla::ShapeUtil::MakeShape(ptype, loop_out_shape.dim_sizes())});
-  xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes);
 
   // Construct the initial values of the loop-carried Tensors.
-  auto init_i = XlaHelpers::Zero(builder, index_type);
+  auto flat_indices = builder->Reshape(indices, flat_indices_shape);
   auto init_out = builder->Broadcast(XlaHelpers::Zero(builder, dtype),
                                      loop_out_shape.dim_sizes());
-  // Flatten the indices into 1-D for ease of iteration.
-  auto indices_1d = builder->Reshape(indices, {num_indices});
-  auto init = builder->Tuple({init_i, input, indices_1d, init_out});
-
-  // Construct the while loop condition (i < num_indices)
-  xla::ComputationBuilder condb(context->builder()->client(),
-                                "GatherWhileCond");
-  condb.Lt(condb.GetTupleElement(
-               condb.Parameter(0, tuple_shape, "GatherWhileTuple"), 0),
-           XlaHelpers::IntegerLiteral(&condb, index_type, num_indices));
-  auto cond_status = condb.Build();
-  auto cond = cond_status.ConsumeValueOrDie();
+  auto init = {input, flat_indices, init_out};
 
   // Construct the while loop body's function. The implementation of gather is:
   // for i in range(num_indices):
   //   index = dynamic-slice(indices, i)
   //   xi = dynamic-slice(input, index)
   //   output = dynamic-update-slice(output, xi, i)
-  xla::ComputationBuilder bodyb(context->builder()->client(),
-                                "GatherWhileBody");
-  {
-    // The four loop carried values.
-    auto loop_tuple = bodyb.Parameter(0, tuple_shape, "GatherWhileTuple");
-    auto i = bodyb.GetTupleElement(loop_tuple, 0);
-    auto input = bodyb.GetTupleElement(loop_tuple, 1);
-    auto indices = bodyb.GetTupleElement(loop_tuple, 2);
-    auto output = bodyb.GetTupleElement(loop_tuple, 3);
-
-    // Slice from the input array.
-    auto index = bodyb.DynamicSlice(indices, bodyb.Reshape(i, {1}), {1});
-    auto start_indices = bodyb.Pad(
-        bodyb.Reshape(index, {1}), XlaHelpers::Zero(&bodyb, index_type),
+  auto body_fn = [&](xla::ComputationDataHandle i,
+                     gtl::ArraySlice<xla::ComputationDataHandle> loop_vars,
+                     xla::ComputationBuilder* bodyb) {
+    auto input = loop_vars[0];
+    auto indices = loop_vars[1];
+    auto output = loop_vars[2];
+
+    auto zero_index = XlaHelpers::Zero(bodyb, index_type);
+
+    // Slice the i-th index from the indices array.
+    xla::ComputationDataHandle index;
+    auto indices_offset = bodyb->Reshape(i, {1});
+    if (indices_are_nd) {
+      // Slice out the entire nd index, if applicable.
+      indices_offset = bodyb->Pad(indices_offset, zero_index,
+                                  xla::MakeEdgePaddingConfig({{0, 1}}));
+      index = bodyb->DynamicSlice(indices, indices_offset, {1, num_index_dims});
+      index = bodyb->Collapse(index, {0, 1});
+    } else {
+      index = bodyb->DynamicSlice(indices, indices_offset, {1});
+    }
+
+    // Slice the corresponding data from the input array.
+    auto start_indices = bodyb->Pad(
+        index, zero_index,
         xla::MakeEdgePaddingConfig(
             {{input_shape_pre_axis.dims(), input_shape_post_axis.dims()}}));
-    auto slice_i = bodyb.Reshape(
-        bodyb.DynamicSlice(input, start_indices, slice_shape.dim_sizes()),
+    auto slice_i = bodyb->Reshape(
+        bodyb->DynamicSlice(input, start_indices, slice_shape.dim_sizes()),
         loop_out_slice_shape.dim_sizes());
 
-    // Construct the index into the R3+ output Tensor 0, ..., <index>, 0, ...
+    // Construct the index into the output Tensor 0, ..., <index>, 0, ...
     std::vector<xla::ComputationDataHandle> out_index_vals(
-        loop_out_shape.dims(),
-        bodyb.Reshape(XlaHelpers::Zero(&bodyb, index_type), {1}));
-    out_index_vals[input_shape_pre_axis.dims() + extra_dims] =
-        bodyb.Reshape(i, {1});
-    auto out_index = bodyb.ConcatInDim(out_index_vals, 0);
+        loop_out_shape.dims(), bodyb->Reshape(zero_index, {1}));
+    out_index_vals[input_shape_pre_axis.dims()] = bodyb->Reshape(i, {1});
+    auto out_index = bodyb->ConcatInDim(out_index_vals, 0);
 
     // Update the output Tensor
-    auto updated_output = bodyb.DynamicUpdateSlice(output, slice_i, out_index);
+    auto updated_output = bodyb->DynamicUpdateSlice(output, slice_i, out_index);
 
-    bodyb.Tuple({bodyb.Add(i, XlaHelpers::One(&bodyb, index_type)), input,
-                 indices, updated_output});
-  }
-  auto body_status = bodyb.Build();
-  auto body = body_status.ConsumeValueOrDie();
+    return std::vector<xla::ComputationDataHandle>{input, indices,
+                                                   updated_output};
+  };
 
   // Construct the While loop, extract and reshape the output.
-  auto gather_while = builder->While(cond, body, init);
-  auto gather_output = builder->GetTupleElement(gather_while, 3);
-  return builder->Reshape(gather_output, out_shape.dim_sizes());
+  xla::PrimitiveType ptype;
+  TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(index_type, &ptype));
+  TF_ASSIGN_OR_RETURN(auto outputs, XlaForEachIndex(num_indices, ptype, body_fn,
+                                                    init, "gather", builder));
+  *gather_output = builder->Reshape(outputs[2], out_shape.dim_sizes());
+  return Status::OK();
 }
 
-GatherOpDynamicSlice::GatherOpDynamicSlice(OpKernelConstruction* context)
-    : XlaOpKernel(context) {}
-
-void GatherOpDynamicSlice::Compile(XlaOpKernelContext* context) {
-  xla::ComputationBuilder* builder = context->builder();
-  auto input = context->Input(0);
-  auto input_shape = context->InputShape(0);
-  auto indices = context->Input(1);
-  auto indices_shape = context->InputShape(1);
-  int64 axis = 0;
-  if (context->num_inputs() == 3) {
-    const TensorShape axis_shape = context->InputShape(2);
-    OP_REQUIRES(context, TensorShapeUtils::IsScalar(axis_shape),
-                errors::InvalidArgument("axis must be scalar"));
-    DataType axis_type = input_type(2);
-    OP_REQUIRES(context, axis_type == DT_INT32 || axis_type == DT_INT64,
-                errors::InvalidArgument("axis must be int32 or int64"));
-
-    OP_REQUIRES_OK(context, context->ConstantInputAsIntScalar(2, &axis));
-    const auto params_dims = input_shape.dims();
-    if (axis < 0) {
-      axis += params_dims;
+class GatherOp : public XlaOpKernel {
+ public:
+  explicit GatherOp(OpKernelConstruction* context) : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* context) override {
+    xla::ComputationBuilder* builder = context->builder();
+    auto input = context->Input(0);
+    auto input_shape = context->InputShape(0);
+    auto indices = context->Input(1);
+    auto indices_shape = context->InputShape(1);
+    int64 axis = 0;
+    if (context->num_inputs() == 3) {
+      const TensorShape axis_shape = context->InputShape(2);
+      OP_REQUIRES(context, TensorShapeUtils::IsScalar(axis_shape),
+                  errors::InvalidArgument("axis must be scalar"));
+      DataType axis_type = input_type(2);
+      OP_REQUIRES(context, axis_type == DT_INT32 || axis_type == DT_INT64,
+                  errors::InvalidArgument("axis must be int32 or int64"));
+
+      OP_REQUIRES_OK(context, context->ConstantInputAsIntScalar(2, &axis));
+      const auto params_dims = input_shape.dims();
+      if (axis < 0) {
+        axis += params_dims;
+      }
+      OP_REQUIRES(
+          context, 0 <= axis && axis < params_dims,
+          errors::InvalidArgument("Expected axis in the range [", -params_dims,
+                                  ", ", params_dims, "), but got ", axis));
     }
-    OP_REQUIRES(
-        context, 0 <= axis && axis < params_dims,
-        errors::InvalidArgument("Expected axis in the range [", -params_dims,
-                                ", ", params_dims, "), but got ", axis));
-  }
 
-  DataType index_type = input_type(1);
-  OP_REQUIRES(context, index_type == DT_INT32 || index_type == DT_INT64,
-              errors::InvalidArgument("indices must be int32 or int64"));
+    DataType index_type = input_type(1);
+    OP_REQUIRES(context, index_type == DT_INT32 || index_type == DT_INT64,
+                errors::InvalidArgument("indices must be int32 or int64"));
 
-  xla::ComputationDataHandle gather = XlaComputeGatherDynamicSlice(
-      context, input, input_shape, indices, indices_shape, axis, input_type(0),
-      index_type, builder);
-  context->SetOutput(0, gather);
-}
+    xla::ComputationDataHandle gather;
+    OP_REQUIRES_OK(
+        context, XlaGather(input, input_shape, indices, indices_shape, axis,
+                           /*indices_are_nd=*/false, input_type(0), index_type,
+                           builder, &gather));
+    context->SetOutput(0, gather);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(GatherOp);
+};
+
+REGISTER_XLA_OP(Name("Gather"), GatherOp);
+REGISTER_XLA_OP(Name("GatherV2").CompileTimeConstInput("axis"), GatherOp);
+
+class GatherNdOp : public XlaOpKernel {
+ public:
+  explicit GatherNdOp(OpKernelConstruction* context) : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* context) override {
+    DataType params_type = context->input_type(0);
+    DataType indices_type = context->input_type(1);
+
+    TensorShape params_shape = context->InputShape(0);
+    TensorShape indices_shape = context->InputShape(1);
+    OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(params_shape),
+                errors::InvalidArgument("params must be at least a vector"));
+    OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(indices_shape),
+                errors::InvalidArgument("indices must be at least a vector"));
+    const int64 num_index_dims =
+        indices_shape.dim_size(indices_shape.dims() - 1);
+    OP_REQUIRES(
+        context, num_index_dims <= params_shape.dims(),
+        errors::InvalidArgument(
+            "index innermost dimension length must be <= params rank; saw: ",
+            indices_shape.dim_size(indices_shape.dims() - 1), " vs. ",
+            params_shape.dims()));
+
+    xla::ComputationBuilder* builder = context->builder();
+    auto params = context->Input(0);
+    auto indices = context->Input(1);
+    xla::ComputationDataHandle gather;
+    OP_REQUIRES_OK(context, XlaGather(params, params_shape, indices,
+                                      indices_shape, /*axis=*/0,
+                                      /*indices_are_nd=*/true, params_type,
+                                      indices_type, builder, &gather));
+    context->SetOutput(0, gather);
+  }
+};
 
-REGISTER_XLA_OP(Name("Gather"), GatherOpDynamicSlice);
-REGISTER_XLA_OP(Name("GatherV2"), GatherOpDynamicSlice);
+REGISTER_XLA_OP(Name("GatherNd"), GatherNdOp);
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h
index 2c80395c56d73adad7dc1679ba6423fbe103605a..bd8b92c22d71fe89ab8951ec79f411feef6505e3 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h
@@ -30,11 +30,16 @@ namespace tensorflow {
 // shape input_shape) keyed on indices (of shape indices_shape).
 //
 // index_type must be must be DT_INT32 or DT_INT64.
-xla::ComputationDataHandle XlaComputeGatherDynamicSlice(
-    XlaOpKernelContext* ctx, const xla::ComputationDataHandle& input,
-    const TensorShape& input_shape, const xla::ComputationDataHandle& indices,
-    const TensorShape& indices_shape, int64 axis, DataType dtype,
-    DataType index_type, xla::ComputationBuilder* builder);
+// If `indices_are_nd` is true, the last dimension of `indices` are treated as
+// a multidimensional index values. Otherwise, `indices` is treated as a tensor
+// of scalar indices.
+Status XlaGather(const xla::ComputationDataHandle& input,
+                 const TensorShape& input_shape,
+                 const xla::ComputationDataHandle& indices,
+                 TensorShape indices_shape, int64 axis, bool indices_are_nd,
+                 DataType dtype, DataType index_type,
+                 xla::ComputationBuilder* builder,
+                 xla::ComputationDataHandle* gather_output);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/tf2xla/kernels/image_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f22f384256a8ddd8c05de4a1322aba741dc4d7fd
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
@@ -0,0 +1,305 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/lib/util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+
+namespace tensorflow {
+namespace {
+
+// Converts 'input' from RGB format to HSV format.
+// 'shape' is the shape of the red/green/blue tensors.
+std::array<xla::ComputationDataHandle, 3> RGBToHSV(
+    XlaOpKernelContext* ctx, xla::ComputationBuilder* b,
+    const std::array<xla::ComputationDataHandle, 3>& rgb, DataType dtype,
+    const TensorShape& shape) {
+  auto zero = XlaHelpers::Zero(b, dtype);
+  auto one = XlaHelpers::One(b, dtype);
+
+  auto red = rgb[0];
+  auto green = rgb[1];
+  auto blue = rgb[2];
+  auto value = b->Max(b->Max(red, green), blue);
+  auto minimum = b->Min(b->Min(red, green), blue);
+  auto range = b->Sub(value, minimum);
+
+  auto zeros = b->Broadcast(zero, shape.dim_sizes());
+  auto saturation = b->Select(b->Gt(value, zero), b->Div(range, value), zeros);
+
+  auto norm = b->Div(XlaHelpers::FloatLiteral(b, dtype, 1.0 / 6.0), range);
+
+  auto hue = b->Select(b->Eq(green, value),
+                       b->Add(b->Mul(norm, b->Sub(blue, red)),
+                              XlaHelpers::FloatLiteral(b, dtype, 2.0 / 6.0)),
+                       b->Add(b->Mul(norm, b->Sub(red, green)),
+                              XlaHelpers::FloatLiteral(b, dtype, 4.0 / 6.0)));
+  hue = b->Select(b->Eq(red, value), b->Mul(norm, b->Sub(green, blue)), hue);
+  hue = b->Select(b->Gt(range, zero), hue, zeros);
+  hue = b->Select(b->Lt(hue, zero), b->Add(hue, one), hue);
+  return {hue, saturation, value};
+}
+
+// Converts 'input' from HSV format to RGB format.
+std::array<xla::ComputationDataHandle, 3> HSVToRGB(
+    xla::ComputationBuilder* b,
+    const std::array<xla::ComputationDataHandle, 3>& hsv, DataType dtype) {
+  xla::ComputationDataHandle hue = hsv[0];
+  xla::ComputationDataHandle saturation = hsv[1];
+  xla::ComputationDataHandle value = hsv[2];
+  auto zero = XlaHelpers::Zero(b, dtype);
+  auto one = XlaHelpers::FloatLiteral(b, dtype, 1.0);
+  auto two = XlaHelpers::FloatLiteral(b, dtype, 2.0);
+  auto three = XlaHelpers::FloatLiteral(b, dtype, 3.0);
+  auto four = XlaHelpers::FloatLiteral(b, dtype, 4.0);
+  auto six = XlaHelpers::FloatLiteral(b, dtype, 6.0);
+
+  auto dh = b->Mul(hue, six);
+  auto dr = b->Clamp(zero, b->Sub(b->Abs(b->Sub(dh, three)), one), one);
+  auto dg = b->Clamp(zero, b->Sub(two, b->Abs(b->Sub(dh, two))), one);
+  auto db = b->Clamp(zero, b->Sub(two, b->Abs(b->Sub(dh, four))), one);
+  auto one_minus_s = b->Sub(one, saturation);
+
+  auto red = b->Mul(b->Add(one_minus_s, b->Mul(saturation, dr)), value);
+  auto green = b->Mul(b->Add(one_minus_s, b->Mul(saturation, dg)), value);
+  auto blue = b->Mul(b->Add(one_minus_s, b->Mul(saturation, db)), value);
+  return {red, green, blue};
+}
+
+class RGBToHSVOp : public XlaOpKernel {
+ public:
+  explicit RGBToHSVOp(OpKernelConstruction* context) : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape input_shape = context->InputShape(0);
+    OP_REQUIRES(context, input_shape.dims() >= 1,
+                errors::InvalidArgument("input must be at least 1D",
+                                        input_shape.DebugString()));
+    int channel_dim = input_shape.dims() - 1;
+    int64 channels = input_shape.dim_size(channel_dim);
+    OP_REQUIRES(
+        context, channels == 3,
+        errors::FailedPrecondition("input must have 3 channels but input has ",
+                                   channels, " channels."));
+
+    xla::ComputationBuilder* b = context->builder();
+    xla::ComputationDataHandle input = context->Input(0);
+
+    xla::ComputationDataHandle red =
+        b->SliceInDim(input, /*start_index=*/0, /*limit_index=*/1, /*stride=*/1,
+                      /*dimno=*/channel_dim);
+    xla::ComputationDataHandle green =
+        b->SliceInDim(input, /*start_index=*/1, /*limit_index=*/2, /*stride=*/1,
+                      /*dimno=*/channel_dim);
+    xla::ComputationDataHandle blue =
+        b->SliceInDim(input, /*start_index=*/2, /*limit_index=*/3, /*stride=*/1,
+                      /*dimno=*/channel_dim);
+    TensorShape channel_shape = input_shape;
+    channel_shape.set_dim(channel_dim, 1);
+    auto hsv = RGBToHSV(context, b, {red, green, blue}, context->input_type(0),
+                        channel_shape);
+
+    context->SetOutput(0, b->ConcatInDim(hsv, channel_dim));
+  }
+};
+REGISTER_XLA_OP(Name("RGBToHSV"), RGBToHSVOp);
+
+class HSVToRGBOp : public XlaOpKernel {
+ public:
+  explicit HSVToRGBOp(OpKernelConstruction* context) : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape input_shape = context->InputShape(0);
+    OP_REQUIRES(context, input_shape.dims() >= 1,
+                errors::InvalidArgument("input must be at least 1D",
+                                        input_shape.DebugString()));
+    int channel_dim = input_shape.dims() - 1;
+    int64 channels = input_shape.dim_size(channel_dim);
+    OP_REQUIRES(
+        context, channels == 3,
+        errors::FailedPrecondition("input must have 3 channels but input has ",
+                                   channels, " channels."));
+
+    xla::ComputationBuilder* b = context->builder();
+    xla::ComputationDataHandle input = context->Input(0);
+    xla::ComputationDataHandle hue =
+        b->SliceInDim(input, /*start_index=*/0, /*limit_index=*/1, /*stride=*/1,
+                      /*dimno=*/channel_dim);
+    xla::ComputationDataHandle saturation =
+        b->SliceInDim(input, /*start_index=*/1, /*limit_index=*/2, /*stride=*/1,
+                      /*dimno=*/channel_dim);
+    xla::ComputationDataHandle value =
+        b->SliceInDim(input, /*start_index=*/2, /*limit_index=*/3, /*stride=*/1,
+                      /*dimno=*/channel_dim);
+
+    auto rgb = HSVToRGB(context->builder(), {hue, saturation, value},
+                        context->input_type(0));
+
+    context->SetOutput(0, b->ConcatInDim(rgb, channel_dim));
+  }
+};
+REGISTER_XLA_OP(Name("HSVToRGB"), HSVToRGBOp);
+
+class AdjustContrastOpV2 : public XlaOpKernel {
+ public:
+  explicit AdjustContrastOpV2(OpKernelConstruction* context)
+      : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape& input_shape = context->InputShape(0);
+    const TensorShape& factor_shape = context->InputShape(1);
+    OP_REQUIRES(context, input_shape.dims() >= 3,
+                errors::InvalidArgument("input must be at least 3-D, got shape",
+                                        input_shape.DebugString()));
+    int height_dim = input_shape.dims() - 3;
+    int width_dim = input_shape.dims() - 2;
+    int channel_dim = input_shape.dims() - 1;
+    const int64 height = input_shape.dim_size(height_dim);
+    const int64 width = input_shape.dim_size(width_dim);
+
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(factor_shape),
+                errors::InvalidArgument("contrast_factor must be scalar: ",
+                                        factor_shape.DebugString()));
+
+    xla::ComputationBuilder* b = context->builder();
+    xla::ComputationDataHandle input = context->Input(0);
+    xla::ComputationDataHandle factor = context->Input(1);
+
+    DataType type = context->input_type(0);
+
+    auto output = b->Reduce(input, /*init_value=*/XlaHelpers::Zero(b, type),
+                            /*computation=*/*context->GetOrCreateAdd(type),
+                            {height_dim, width_dim});
+    output = b->Div(output, XlaHelpers::FloatLiteral(b, type, height * width));
+
+    std::vector<int64> broadcast_dims(input_shape.dims() - 2);
+    std::iota(broadcast_dims.begin(), broadcast_dims.end(), 0);
+    broadcast_dims.back() = channel_dim;
+    output = b->Add(b->Mul(input, factor),
+                    b->Mul(output, b->Sub(XlaHelpers::One(b, type), factor)),
+                    broadcast_dims);
+    context->SetOutput(0, output);
+  }
+};
+REGISTER_XLA_OP(Name("AdjustContrastv2"), AdjustContrastOpV2);
+
+class AdjustSaturationOp : public XlaOpKernel {
+ public:
+  explicit AdjustSaturationOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape& input_shape = context->InputShape(0);
+    const TensorShape& scale_shape = context->InputShape(1);
+    OP_REQUIRES(context, input_shape.dims() >= 3,
+                errors::InvalidArgument("input must be at least 3-D, got shape",
+                                        input_shape.DebugString()));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(scale_shape),
+                errors::InvalidArgument("scale must be scalar: ",
+                                        scale_shape.DebugString()));
+    const int channel_dim = input_shape.dims() - 1;
+    const int64 channels = input_shape.dim_size(channel_dim);
+    OP_REQUIRES(
+        context, channels == 3,
+        errors::InvalidArgument("input must have 3 channels but instead has ",
+                                channels, " channels."));
+
+    xla::ComputationBuilder* b = context->builder();
+    xla::ComputationDataHandle input = context->Input(0);
+    xla::ComputationDataHandle scale = context->Input(1);
+
+    DataType type = context->input_type(0);
+
+    xla::ComputationDataHandle red =
+        b->SliceInDim(input, /*start_index=*/0, /*limit_index=*/1, /*stride=*/1,
+                      /*dimno=*/channel_dim);
+    xla::ComputationDataHandle green =
+        b->SliceInDim(input, /*start_index=*/1, /*limit_index=*/2, /*stride=*/1,
+                      /*dimno=*/channel_dim);
+    xla::ComputationDataHandle blue =
+        b->SliceInDim(input, /*start_index=*/2, /*limit_index=*/3, /*stride=*/1,
+                      /*dimno=*/channel_dim);
+    TensorShape channel_shape = input_shape;
+    channel_shape.set_dim(channel_dim, 1);
+    auto hsv = RGBToHSV(context, b, {red, green, blue}, context->input_type(0),
+                        channel_shape);
+
+    hsv[1] = b->Clamp(XlaHelpers::Zero(b, type), b->Mul(hsv[1], scale),
+                      XlaHelpers::One(b, type));
+
+    auto rgb = HSVToRGB(context->builder(), hsv, context->input_type(0));
+
+    context->SetOutput(0, b->ConcatInDim(rgb, channel_dim));
+  }
+};
+REGISTER_XLA_OP(Name("AdjustSaturation"), AdjustSaturationOp);
+
+class AdjustHueOp : public XlaOpKernel {
+ public:
+  explicit AdjustHueOp(OpKernelConstruction* context) : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape& input_shape = context->InputShape(0);
+    const TensorShape& delta_shape = context->InputShape(1);
+    OP_REQUIRES(context, input_shape.dims() >= 3,
+                errors::InvalidArgument("input must be at least 3-D, got shape",
+                                        input_shape.DebugString()));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(delta_shape),
+                errors::InvalidArgument("delta must be scalar: ",
+                                        delta_shape.DebugString()));
+    const int channel_dim = input_shape.dims() - 1;
+    const int64 channels = input_shape.dim_size(channel_dim);
+    OP_REQUIRES(
+        context, channels == 3,
+        errors::InvalidArgument("input must have 3 channels but instead has ",
+                                channels, " channels."));
+
+    xla::ComputationBuilder* b = context->builder();
+    xla::ComputationDataHandle input = context->Input(0);
+    xla::ComputationDataHandle delta = context->Input(1);
+
+    DataType type = context->input_type(0);
+
+    xla::ComputationDataHandle red =
+        b->SliceInDim(input, /*start_index=*/0, /*limit_index=*/1, /*stride=*/1,
+                      /*dimno=*/channel_dim);
+    xla::ComputationDataHandle green =
+        b->SliceInDim(input, /*start_index=*/1, /*limit_index=*/2, /*stride=*/1,
+                      /*dimno=*/channel_dim);
+    xla::ComputationDataHandle blue =
+        b->SliceInDim(input, /*start_index=*/2, /*limit_index=*/3, /*stride=*/1,
+                      /*dimno=*/channel_dim);
+    TensorShape channel_shape = input_shape;
+    channel_shape.set_dim(channel_dim, 1);
+    auto hsv = RGBToHSV(context, b, {red, green, blue}, context->input_type(0),
+                        channel_shape);
+
+    auto zero = XlaHelpers::Zero(b, type);
+    auto one = XlaHelpers::One(b, type);
+
+    auto& hue = hsv[0];
+    hue = b->Rem(b->Add(hsv[0], delta), one);
+    hue = b->Select(b->Lt(hue, zero), b->Rem(b->Add(one, hue), one), hue);
+
+    auto rgb = HSVToRGB(context->builder(), hsv, context->input_type(0));
+
+    context->SetOutput(0, b->ConcatInDim(rgb, channel_dim));
+  }
+};
+REGISTER_XLA_OP(Name("AdjustHue"), AdjustHueOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f36b3f594826c27b7866d956c855aa3638db9cb4
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
@@ -0,0 +1,449 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/array4d.h"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/lib/math/math_util.h"
+
+namespace tensorflow {
+namespace {
+
+// We implement bilinear interpolation by upsampling followed by convolution.
+// The basic idea is as follows. To scale from NxN to RxR:
+//
+//    1. S := (N - 1) /  gcd(N-1, R-1)
+//    2. k := (R - 1) /  gcd(N-1, R-1)
+//    3. Convolution(kxk, stride=S, lhs_dilation=k, padding=k-1)
+//
+// For example, to Scale from 7x7 -> 15x15:
+//
+//    1. S := (7-1) / gcd(7-1, 15-1) = 6 / gcd(6, 14) = 6 / 2 = 3
+//    2. k := (15 - 1) / gcd(7-1, 15-1) = 14 / gcd(6, 14) = 14 / 2 = 7
+//    3. Convolution(7x7, stride=3, lhs_dilation=3, padding=2)
+//
+//
+// The 7x7 -> 15x15 case is much too large to write out in full as an
+// example. The smallest interesting example is 3x3 -> 4x4.
+//
+// S := 2
+// k := 3
+//
+// 00 03 06    00 00 00 00 00 00 00 00 00 00 00      00 02 04 06
+// 09 12 15 -> 00 00 00 00 00 00 00 00 00 00 00   -> 06 08 10 12
+// 18 21 24    00 00 00 00 00 03 00 00 06 00 00      12 14 16 18
+//             00 00 00 00 00 00 00 00 00 00 00      18 20 22 24
+//             00 00 00 00 00 00 00 00 00 00 00
+//             00 00 09 00 00 12 00 00 15 00 00
+//             00 00 00 00 00 00 00 00 00 00 00
+//             00 00 00 00 00 00 00 00 00 00 00
+//             00 00 18 00 00 21 00 00 24 00 00
+//             00 00 00 00 00 00 00 00 00 00 00
+//             00 00 00 00 00 00 00 00 00 00 00
+//
+// with the following convolutional kernel, with stride [2, 2]:
+//       1 2 3 2 1
+//       2 4 6 4 2
+// 1/9 * 3 6 9 6 3
+//       2 4 6 4 2
+//       1 2 3 2 1
+
+// Computes the size of the convolutional kernel and stride to use when resizing
+// from in_size to out_size.
+struct ResizeConvolutionDims {
+  // Size of the kernel to use.
+  std::vector<int64> kernel_size;
+
+  // Stride of the convolution to use.
+  std::vector<int64> stride;
+};
+ResizeConvolutionDims ComputeResizeConvolutionParameters(
+    gtl::ArraySlice<int64> in_size, gtl::ArraySlice<int64> out_size) {
+  CHECK_EQ(in_size.size(), out_size.size());
+  int num_spatial_dims = in_size.size();
+  ResizeConvolutionDims dims;
+  dims.kernel_size.resize(num_spatial_dims);
+  dims.stride.resize(num_spatial_dims);
+  for (int i = 0; i < num_spatial_dims; ++i) {
+    if (in_size[i] == 1) {
+      // We must handle input size 1 specially because XLA convolution does
+      // not allow stride 0.
+      dims.stride[i] = dims.kernel_size[i] = 1;
+    } else if (out_size[i] == 1) {
+      // If in_size[i] > 1 but out_size[i] == 1, then we slice out the first
+      // entry before resizing.
+      dims.stride[i] = dims.kernel_size[i] = 1;
+    } else {
+      int64 gcd = MathUtil::GCD(static_cast<uint64>(in_size[i] - 1),
+                                static_cast<uint64>(out_size[i] - 1));
+      dims.stride[i] = (in_size[i] - 1) / gcd;
+      dims.kernel_size[i] = (out_size[i] - 1) / gcd;
+    }
+  }
+  return dims;
+}
+
+xla::ComputationDataHandle MakeBilinearResizeKernel(
+    xla::ComputationBuilder* builder, gtl::ArraySlice<int64> kernel_size,
+    int64 channels) {
+  // Form a 2D convolution kernel like:
+  //       1 2 3 2 1
+  //       2 4 6 4 2
+  // 1/9 * 3 6 9 6 3
+  //       2 4 6 4 2
+  //       1 2 3 2 1
+  // by multiplying two 1D kernels of the form:
+  // 1/3 * [1 2 3 2 1]
+  auto make_1d_kernel = [](int64 n) {
+    std::vector<float> kernel(n * 2 - 1);
+    for (int64 i = 0; i < n; ++i) {
+      float v = (i + 1.0f) / n;
+      kernel[i] = v;
+      kernel[n * 2 - 2 - i] = v;
+    }
+    return kernel;
+  };
+
+  xla::ComputationDataHandle channels_iota;
+  // DT_INT32 Iota will always return status::OK().
+  TF_CHECK_OK(
+      XlaHelpers::Iota(builder, DataType::DT_INT32, channels, &channels_iota));
+
+  auto diag = builder->ConvertElementType(
+      builder->Eq(
+          builder->Broadcast(channels_iota, {2 * kernel_size[0] - 1,
+                                             2 * kernel_size[1] - 1, channels}),
+          channels_iota, /*broadcast_dimensions=*/{2}),
+      xla::PrimitiveType::F32);
+  return builder->Mul(
+      builder->Mul(diag,
+                   builder->ConstantR1<float>(make_1d_kernel(kernel_size[1])),
+                   /*broadcast_dimensions=*/{1}),
+      builder->ConstantR1<float>(make_1d_kernel(kernel_size[0])),
+      /*broadcast_dimensions=*/{0});
+}
+
+xla::ComputationDataHandle ResizeUsingDilationAndConvolution(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& input,
+    const int num_spatial_dims, std::vector<int64> in_size,
+    std::vector<int64> out_size, const int64 channels) {
+  // Picture for a 1x3 to 1x4 resize:
+  // stride = 2, kernel size = 3
+  // Input:
+  // 3 6 9
+  // Input with dilation and padding:
+  // 0 0 3 0 0 6 0 0 9 0 0
+  // Convolution kernel:
+  // 1/3 * [1 2 3 2 1]
+  // Output:
+  // 3 5 7 9
+  xla::ConvolutionDimensionNumbers dimension_numbers;
+  dimension_numbers.set_input_batch_dimension(0);
+  dimension_numbers.set_output_batch_dimension(0);
+  dimension_numbers.set_input_feature_dimension(3);
+  dimension_numbers.set_output_feature_dimension(3);
+  for (int i = 0; i < num_spatial_dims; ++i) {
+    dimension_numbers.add_input_spatial_dimensions(1 + i);
+    dimension_numbers.add_output_spatial_dimensions(1 + i);
+    dimension_numbers.add_kernel_spatial_dimensions(i);
+  }
+  dimension_numbers.set_kernel_input_feature_dimension(num_spatial_dims);
+  dimension_numbers.set_kernel_output_feature_dimension(num_spatial_dims + 1);
+
+  ResizeConvolutionDims dims =
+      ComputeResizeConvolutionParameters(in_size, out_size);
+  xla::ComputationDataHandle kernel =
+      MakeBilinearResizeKernel(builder, dims.kernel_size, channels);
+  xla::ComputationDataHandle output = builder->ConvGeneralDilated(
+      input, kernel, dims.stride,
+      /*padding=*/
+      {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1},
+       {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}},
+      /*lhs_dilation=*/dims.kernel_size,
+      /*rhs_dilation=*/{1, 1}, dimension_numbers);
+
+  // Add broadcasts to handle expanding from a size == 1 dimension to a
+  // size > 1 dimension.
+  for (int i = 0; i < num_spatial_dims; ++i) {
+    if (in_size[i] == 1 && out_size[i] > 1) {
+      output = builder->Add(output, builder->ConstantR1<float>(out_size[i], 0),
+                            /*broadcast_dimensions=*/{1 + i});
+    }
+  }
+  return output;
+}
+
+xla::ComputationDataHandle ResizeUsingDilationAndConvolutionGradOp(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& grad,
+    const int num_spatial_dims, std::vector<int64> in_size,
+    std::vector<int64> grad_size, const int64 channels) {
+  ResizeConvolutionDims dims =
+      ComputeResizeConvolutionParameters(in_size, grad_size);
+
+  // To form the backward convolution, we keep the kernel unchanged (it is
+  // already symmetric) and swap the roles of strides and LHS dilation.
+  xla::ConvolutionDimensionNumbers dimension_numbers;
+  dimension_numbers.set_input_batch_dimension(0);
+  dimension_numbers.set_output_batch_dimension(0);
+  dimension_numbers.set_input_feature_dimension(3);
+  dimension_numbers.set_output_feature_dimension(3);
+  for (int i = 0; i < num_spatial_dims; ++i) {
+    dimension_numbers.add_input_spatial_dimensions(1 + i);
+    dimension_numbers.add_output_spatial_dimensions(1 + i);
+    dimension_numbers.add_kernel_spatial_dimensions(i);
+  }
+  dimension_numbers.set_kernel_input_feature_dimension(num_spatial_dims);
+  dimension_numbers.set_kernel_output_feature_dimension(num_spatial_dims + 1);
+  xla::ComputationDataHandle kernel =
+      MakeBilinearResizeKernel(builder, dims.kernel_size, channels);
+
+  // Broadcast the input kernel where the forward op expanded from a size == 1
+  // dimension to a size > 1 dimension. This has the effect of summing the
+  // gradient contributions in that dimension.
+  for (int i = 0; i < num_spatial_dims; ++i) {
+    if (in_size[i] == 1 && grad_size[i] > 1) {
+      kernel = builder->Add(kernel, builder->ConstantR1<float>(grad_size[i], 0),
+                            /*broadcast_dimensions=*/{i});
+    }
+  }
+
+  xla::ComputationDataHandle output = builder->ConvGeneralDilated(
+      grad, kernel, /*window_strides=*/dims.kernel_size,
+      /*padding=*/
+      {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1},
+       {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}},
+      /*lhs_dilation=*/dims.stride,
+      /*rhs_dilation=*/{1, 1}, dimension_numbers);
+
+  // If in_size[i] > 1 and grad_size[i] == 1, pad the output in dimension i.
+  // Opposite of the slice performed by the forward op.
+  xla::PaddingConfig padding = xla::MakeNoPaddingConfig(4);
+  bool pad_output = false;
+  for (int i = 0; i < num_spatial_dims; ++i) {
+    if (in_size[i] > 1 && grad_size[i] == 1) {
+      pad_output = true;
+      padding.mutable_dimensions(1 + i)->set_edge_padding_high(in_size[i] - 1);
+    }
+  }
+  if (pad_output) {
+    output = builder->Pad(output, builder->ConstantR0<float>(0.0f), padding);
+  }
+  return output;
+}
+
+class ResizeBilinearOp : public XlaOpKernel {
+ public:
+  explicit ResizeBilinearOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("align_corners", &align_corners_));
+    OP_REQUIRES(
+        ctx, align_corners_ == true,
+        errors::Unimplemented(
+            "ResizeBilinear with align_corners=False is not yet implemented"));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::ComputationBuilder* b = ctx->builder();
+
+    TensorShape input_shape = ctx->InputShape(0);
+    OP_REQUIRES(ctx, input_shape.dims() == 4,
+                errors::InvalidArgument("input must be 4-dimensional",
+                                        input_shape.DebugString()));
+    const int64 batch = input_shape.dim_size(0);
+    std::vector<int64> in_size = {input_shape.dim_size(1),
+                                  input_shape.dim_size(2)};
+    const int64 channels = input_shape.dim_size(3);
+    OP_REQUIRES(ctx, in_size[0] > 0 && in_size[1] > 0,
+                errors::InvalidArgument("input size must be positive, got [",
+                                        in_size[0], ",", in_size[1], "]"));
+
+    std::vector<int64> out_size;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &out_size));
+    OP_REQUIRES(ctx, out_size.size() == 2,
+                errors::InvalidArgument("output size must be length 2, got ",
+                                        out_size.size()));
+    OP_REQUIRES(ctx, out_size[0] > 0 && out_size[1] > 0,
+                errors::InvalidArgument("output size must be positive, got [",
+                                        out_size[0], ",", out_size[1], "]"));
+
+    const int num_spatial_dims = 2;
+
+    xla::ComputationDataHandle input = ctx->Input(0);
+
+    // If in_size[i] > 1 and out_size[i] == 1, slice out the first input in
+    // dimension i.
+    std::vector<int64> slice_size = in_size;
+    bool slice_input = false;
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      if (in_size[i] > 1 && out_size[i] == 1) {
+        // If in_size[i] > 1 but out_size[i] == 1, then we slice out the first
+        // entry before resizing.
+        slice_input = true;
+        slice_size[i] = 1;
+      }
+    }
+    if (slice_input) {
+      input = b->Slice(input, {0, 0, 0, 0},
+                       {batch, slice_size[0], slice_size[1], channels},
+                       {1, 1, 1, 1});
+    }
+
+    // Output is always type float.
+    input = b->ConvertElementType(input, xla::F32);
+
+    // Special Case:
+    // Instead of doing a ResizeUsingDilationAndConvolution directly,
+    // while (out_size[0]-1) = c * 2^x * (in_size[0]-1) for x>1 c>1, resize the
+    // image to 2*(in_size[0]-1)+1 x-times and then resize by scale c(int here).
+    // Instead of resizing directly we resize it iteratively.
+    //
+    // Since bilinear resize can be broken down as 2 sequential linear
+    // operations along different dimensions.
+    // Given sufficient numerical stability and a<e<c and b<f<d, bilinear resize
+    // from image of size axb -> cxd is same as resizing axb -> exf -> cxd.
+    //
+    // This makes the convolutions kernels smaller and the operation faster.
+    xla::ComputationDataHandle output = input;
+    while (in_size != out_size) {
+      if (in_size[0] != 1 && in_size[1] != 1) {
+        std::vector<float> k = {
+            (static_cast<float>(out_size[0]) - 1) / ((in_size[0] - 1) * 2),
+            (static_cast<float>(out_size[1]) - 1) / ((in_size[1] - 1) * 2)};
+        if ((k[0] == std::floor(k[0])) && (k[1] == std::floor(k[1])) &&
+            k[0] > 1 && k[1] > 1) {
+          std::vector<int64> next_out_size = {(in_size[0] - 1) * 2 + 1,
+                                              (in_size[1] - 1) * 2 + 1};
+          output = ResizeUsingDilationAndConvolution(
+              b, input, num_spatial_dims, in_size, next_out_size, channels);
+          input = output;
+          in_size = next_out_size;
+        } else {
+          output = ResizeUsingDilationAndConvolution(
+              b, input, num_spatial_dims, in_size, out_size, channels);
+          in_size = out_size;
+        }
+      } else {
+        output = ResizeUsingDilationAndConvolution(b, input, num_spatial_dims,
+                                                   in_size, out_size, channels);
+        in_size = out_size;
+      }
+    }
+
+    ctx->SetOutput(0, output);
+  }
+
+ private:
+  bool align_corners_;
+};
+
+REGISTER_XLA_OP(Name("ResizeBilinear").CompileTimeConstInput("size"),
+                ResizeBilinearOp);
+
+class ResizeBilinearGradOp : public XlaOpKernel {
+ public:
+  explicit ResizeBilinearGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("align_corners", &align_corners_));
+    OP_REQUIRES(
+        ctx, align_corners_ == true,
+        errors::Unimplemented("ResizeBilinearGrad with align_corners=False is "
+                              "not yet implemented"));
+
+    DataType output_dtype;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &output_dtype));
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(output_dtype, &output_type_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::ComputationBuilder* b = ctx->builder();
+
+    TensorShape input_shape = ctx->InputShape(1);
+    OP_REQUIRES(ctx, input_shape.dims() == 4,
+                errors::InvalidArgument("input must be 4-dimensional",
+                                        input_shape.DebugString()));
+    const int64 batch = input_shape.dim_size(0);
+    std::vector<int64> in_size = {input_shape.dim_size(1),
+                                  input_shape.dim_size(2)};
+    const int64 channels = input_shape.dim_size(3);
+    OP_REQUIRES(ctx, in_size[0] > 0 && in_size[1] > 0,
+                errors::InvalidArgument("input size must be positive, got [",
+                                        in_size[0], ",", in_size[1], "]"));
+
+    TensorShape grad_shape = ctx->InputShape(0);
+    OP_REQUIRES(ctx, grad_shape.dims() == 4,
+                errors::InvalidArgument("gradient must be 4-dimensional",
+                                        grad_shape.DebugString()));
+    const int64 grad_batch = grad_shape.dim_size(0);
+    const std::vector<int64> grad_size = {grad_shape.dim_size(1),
+                                          grad_shape.dim_size(2)};
+    const int64 grad_channels = grad_shape.dim_size(3);
+    OP_REQUIRES(ctx, batch == grad_batch,
+                errors::InvalidArgument(
+                    "activations and gradients must have the same batch size (",
+                    batch, " vs. ", grad_batch, ")"));
+    OP_REQUIRES(ctx, grad_size[0] > 0 && grad_size[1] > 0,
+                errors::InvalidArgument("gradient size must be positive, got [",
+                                        grad_size[0], ",", grad_size[1], "]"));
+    OP_REQUIRES(
+        ctx, channels == grad_channels,
+        errors::InvalidArgument(
+            "activations and gradients must have the same number of channels (",
+            channels, " vs. ", grad_channels, ")"));
+
+    const int num_spatial_dims = 2;
+
+    xla::ComputationDataHandle grad = ctx->Input(0);
+
+    xla::ComputationDataHandle output = grad;
+    while (in_size != grad_size) {
+      if (in_size[0] != 1 && in_size[1] != 1) {
+        std::vector<float> k = {
+            (static_cast<float>(grad_size[0]) - 1) / ((in_size[0] - 1) * 2),
+            (static_cast<float>(grad_size[1]) - 1) / ((in_size[1] - 1) * 2)};
+        if ((k[0] == std::floor(k[0])) && (k[1] == std::floor(k[1])) &&
+            k[0] > 1 && k[1] > 1) {
+          std::vector<int64> next_grad_size = {(in_size[0] - 1) * 2 + 1,
+                                               (in_size[1] - 1) * 2 + 1};
+          output = ResizeUsingDilationAndConvolutionGradOp(
+              b, grad, num_spatial_dims, in_size, next_grad_size, channels);
+          grad = output;
+          in_size = next_grad_size;
+        } else {
+          output = ResizeUsingDilationAndConvolutionGradOp(
+              b, grad, num_spatial_dims, in_size, grad_size, channels);
+          in_size = grad_size;
+        }
+      } else {
+        output = ResizeUsingDilationAndConvolutionGradOp(
+            b, grad, num_spatial_dims, in_size, grad_size, channels);
+        in_size = grad_size;
+      }
+    }
+
+    output = b->ConvertElementType(output, output_type_);
+    ctx->SetOutput(0, output);
+  }
+
+ private:
+  bool align_corners_;
+  xla::PrimitiveType output_type_;
+};
+
+REGISTER_XLA_OP(Name("ResizeBilinearGrad"), ResizeBilinearGradOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops.cc b/tensorflow/compiler/tf2xla/kernels/index_ops.cc
index e0dc1870f2a4934c35163f0cc10196e8fcbed9be..7bf4b435f526afa93d8a218b191928acb932cd6b 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops.cc
@@ -80,7 +80,10 @@ void XlaArgMinMaxOp::Compile(XlaOpKernelContext* ctx) {
 
 XlaArgMaxOp::XlaArgMaxOp(OpKernelConstruction* ctx)
     : XlaArgMinMaxOp(ctx, /*is_min=*/false) {}
-REGISTER_XLA_OP(Name("ArgMax").Device(DEVICE_GPU_XLA_JIT), XlaArgMaxOp);
+REGISTER_XLA_OP(Name("ArgMax")
+                    .Device(DEVICE_GPU_XLA_JIT)
+                    .CompileTimeConstInput("dimension"),
+                XlaArgMaxOp);
 
 namespace {
 
@@ -90,7 +93,7 @@ class XlaArgMinOp : public XlaArgMinMaxOp {
 };
 XlaArgMinOp::XlaArgMinOp(OpKernelConstruction* ctx)
     : XlaArgMinMaxOp(ctx, /*is_min=*/true) {}
-REGISTER_XLA_OP(Name("ArgMin"), XlaArgMinOp);
+REGISTER_XLA_OP(Name("ArgMin").CompileTimeConstInput("dimension"), XlaArgMinOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
index 20946e247a9459d7c8a0d8a666fef24bd32838f2..b1f3c3c298ce0cadf38b9bda715761fe7e2896d7 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
@@ -56,10 +56,10 @@ class ArgMaxCustomCallOp : public XlaOpKernel {
         errors::InvalidArgument("dim must be < input rank (",
                                 input_shape.dims(), "), but got: ", dim));
     const int64 dim_size = input_shape.dim_size(dim);
-    OP_REQUIRES(
-        ctx, dim_size > 0,
-        errors::InvalidArgument("Reduction axis ", dim, " is empty in shape: ",
-                                input_shape.DebugString()));
+    OP_REQUIRES(ctx, dim_size > 0,
+                errors::InvalidArgument(
+                    "Reduction axis ", dim,
+                    " is empty in shape: ", input_shape.DebugString()));
 
     // The output shape is the input shape contracted along dim.
     TensorShape output_shape;
@@ -113,9 +113,11 @@ class ArgMaxCustomCallOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(ArgMaxCustomCallOp);
 };
 
-REGISTER_XLA_OP(
-    Name("ArgMax").TypeConstraint("T", DT_FLOAT).Device(DEVICE_CPU_XLA_JIT),
-    ArgMaxCustomCallOp);
+REGISTER_XLA_OP(Name("ArgMax")
+                    .TypeConstraint("T", DT_FLOAT)
+                    .Device(DEVICE_CPU_XLA_JIT)
+                    .CompileTimeConstInput("dimension"),
+                ArgMaxCustomCallOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
index fcef497e5845d9080bc83b54e92dcf2fdecf5f12..886baf8115243a22b7255a3961c914d4cf6c2ed5 100644
--- a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
@@ -23,16 +23,18 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-constexpr std::array<DataType, 4> kMatmulTypes = {
-    {DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64}};
+constexpr std::array<DataType, 5> kMatmulTypes = {
+    {DT_HALF, DT_BFLOAT16, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64}};
 
 class MatMulOp : public XlaOpKernel {
  public:
   explicit MatMulOp(OpKernelConstruction* ctx, bool is_sparse = false)
-      : XlaOpKernel(ctx) {
+      : XlaOpKernel(ctx), is_sparse_(is_sparse) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_a", &transpose_a_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_b", &transpose_b_));
     if (is_sparse) {
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("Ta", &a_type_));
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("Tb", &b_type_));
       // SparseMatMul is actually dense matmul with a hint that one or
       // both of the inputs may contain a lot of zeroes. On CPU these
       // inputs are dynamically converted to sparse representation
@@ -66,14 +68,25 @@ class MatMulOp : public XlaOpKernel {
 
     xla::ComputationDataHandle a = ctx->Input(0);
     xla::ComputationDataHandle b = ctx->Input(1);
+    if (is_sparse_) {
+      if (a_type_ == DT_BFLOAT16) {
+        a = ctx->builder()->ConvertElementType(a, xla::F32);
+      }
+      if (b_type_ == DT_BFLOAT16) {
+        b = ctx->builder()->ConvertElementType(b, xla::F32);
+      }
+    }
     auto lhs = (transpose_a_) ? ctx->builder()->Transpose(a, {1, 0}) : a;
     auto rhs = (transpose_b_) ? ctx->builder()->Transpose(b, {1, 0}) : b;
     ctx->SetOutput(0, ctx->builder()->Dot(lhs, rhs));
   }
 
  private:
+  bool is_sparse_;
   bool transpose_a_;
   bool transpose_b_;
+  DataType a_type_;
+  DataType b_type_;
 };
 
 REGISTER_XLA_OP(Name("MatMul").TypeConstraint("T", kMatmulTypes), MatMulOp);
@@ -85,10 +98,7 @@ class SparseMatMulOp : public MatMulOp {
   ~SparseMatMulOp() override = default;
 };
 
-REGISTER_XLA_OP(Name("SparseMatMul")
-                    .TypeConstraint("Ta", kFloatTypes)
-                    .TypeConstraint("Tb", kFloatTypes),
-                SparseMatMulOp);
+REGISTER_XLA_OP(Name("SparseMatMul"), SparseMatMulOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..faa415a97b053b4b11d015fefcd430210b98118a
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc
@@ -0,0 +1,98 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+namespace {
+
+class MatrixBandPartOp : public XlaOpKernel {
+ public:
+  explicit MatrixBandPartOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape input_shape = context->InputShape(0);
+    // Preliminary validation of sizes.
+    OP_REQUIRES(context, TensorShapeUtils::IsMatrixOrHigher(input_shape),
+                errors::InvalidArgument(
+                    "input must be at least 2-dim, received shape: ",
+                    input_shape.DebugString()));
+
+    const TensorShape num_lower_in_shape = context->InputShape(1);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_lower_in_shape),
+                errors::InvalidArgument("num_lower must be scalar, got shape ",
+                                        num_lower_in_shape.DebugString()));
+
+    const TensorShape num_upper_in_shape = context->InputShape(2);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_upper_in_shape),
+                errors::InvalidArgument("num_upper must be scalar, got shape ",
+                                        num_upper_in_shape.DebugString()));
+
+    xla::ComputationBuilder* builder = context->builder();
+    xla::ComputationDataHandle input = context->Input(0);
+    xla::ComputationDataHandle num_lower = context->Input(1);
+    xla::ComputationDataHandle num_upper = context->Input(2);
+    DataType input_type = context->input_type(0);
+    DataType index_type = context->input_type(1);
+
+    TensorShape batch_shape = input_shape;
+    batch_shape.RemoveLastDims(2);
+    const int64 m = input_shape.dim_size(input_shape.dims() - 2);
+    const int64 n = input_shape.dim_size(input_shape.dims() - 1);
+
+    // Compute 'offset', which is how many diagonals we are above/below the
+    // diagonal.
+    xla::ComputationDataHandle iota_m;
+    OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, index_type, m, &iota_m));
+
+    xla::ComputationDataHandle iota_n;
+    OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, index_type, n, &iota_n));
+
+    auto offset = builder->Sub(builder->Broadcast(iota_n, {m}), iota_m,
+                               /*broadcast_dimensions=*/{0});
+
+    // If num_lower or num_upper are negative, include all lower/upper
+    // diagonals.
+    auto zero_index = XlaHelpers::Zero(builder, index_type);
+    num_lower = builder->Select(
+        builder->Lt(num_lower, zero_index),
+        XlaHelpers::IntegerLiteral(builder, index_type, m), num_lower);
+    num_upper = builder->Select(
+        builder->Lt(num_upper, zero_index),
+        XlaHelpers::IntegerLiteral(builder, index_type, n), num_upper);
+
+    auto indicator = builder->And(builder->Le(builder->Neg(num_lower), offset),
+                                  builder->Le(offset, num_upper));
+    indicator = builder->Broadcast(indicator, batch_shape.dim_sizes());
+
+    auto zero_input = XlaHelpers::Zero(builder, input_type);
+    auto output = builder->Select(
+        indicator, input,
+        builder->Broadcast(zero_input, input_shape.dim_sizes()));
+
+    context->SetOutput(0, output);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(MatrixBandPartOp);
+};
+REGISTER_XLA_OP(Name("MatrixBandPart"), MatrixBandPartOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b2940bdcff75a087c914fdad0cb2426276e41aff
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc
@@ -0,0 +1,93 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+
+namespace tensorflow {
+
+class MatrixSetDiagOp : public XlaOpKernel {
+ public:
+  explicit MatrixSetDiagOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape input_shape = context->InputShape(0);
+    const TensorShape diag_shape = context->InputShape(1);
+
+    const int rank = input_shape.dims();
+
+    // Preliminary validation of sizes.
+    OP_REQUIRES(context, TensorShapeUtils::IsMatrixOrHigher(input_shape),
+                errors::InvalidArgument(
+                    "input must be at least 2-dim, received shape: ",
+                    input_shape.DebugString()));
+
+    // Check to make sure the last dimension of diag is equal to the smaller of
+    // the last two dimensions of input.
+    const int64 m = input_shape.dim_size(rank - 2);
+    const int64 n = input_shape.dim_size(rank - 1);
+    const int64 min_dim = std::min(m, n);
+
+    TensorShape batch_shape = input_shape;
+    batch_shape.RemoveLastDims(2);
+
+    TensorShape expected_diag_shape = batch_shape;
+    expected_diag_shape.AddDim(min_dim);
+    OP_REQUIRES(context, expected_diag_shape == diag_shape,
+                errors::InvalidArgument(
+                    "must have diagonal.shape == input.shape[:-2] + "
+                    "min(input.shape[-2:]), but received input shape: ",
+                    input_shape.DebugString(),
+                    " and diagonal shape: ", diag_shape.DebugString()));
+
+    xla::ComputationBuilder* builder = context->builder();
+    xla::ComputationDataHandle input = context->Input(0);
+    xla::ComputationDataHandle diag = context->Input(1);
+
+    auto zero = XlaHelpers::Zero(builder, context->input_type(0));
+
+    // Create an indicator tensor that is true only on the diagonal.
+    xla::ComputationDataHandle iota_m;
+    OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, DT_INT32, m, &iota_m));
+    xla::ComputationDataHandle iota_n;
+    OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, DT_INT32, n, &iota_n));
+    auto indicator = builder->Eq(iota_m,
+                                 builder->Broadcast(iota_n, {m}),
+                                 /*broadcast_dimensions=*/{0});
+    indicator = builder->Broadcast(indicator, batch_shape.dim_sizes());
+
+    // Broadcast diag up to the input shape. Use an implicit broadcast (Add)
+    // because we need to broadcast on the right.
+    std::vector<int64> diag_broadcast_dims(rank - 1);
+    std::iota(diag_broadcast_dims.begin(), diag_broadcast_dims.end(), 0);
+    if (min_dim != m) {
+      diag_broadcast_dims.back() = rank - 1;
+    }
+    diag = builder->Add(diag, builder->Broadcast(zero, input_shape.dim_sizes()),
+                        /*broadcast_dimensions=*/diag_broadcast_dims);
+
+    auto output = builder->Select(indicator, diag, input);
+    context->SetOutput(0, output);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(MatrixSetDiagOp);
+};
+
+REGISTER_XLA_OP(Name("MatrixSetDiag"), MatrixSetDiagOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eaed93146460de5a6e8328432302cc75bf36a534
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/lib/triangular_solve.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+
+namespace tensorflow {
+namespace {
+
+class MatrixTriangularSolveOp : public XlaOpKernel {
+ public:
+  explicit MatrixTriangularSolveOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("lower", &lower_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("adjoint", &adjoint_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    auto result = TriangularSolve(
+        ctx->builder(), ctx->Input(0), ctx->Input(1), /*left_side=*/true,
+        /*lower=*/lower_, /*transpose_a=*/adjoint_, /*conjugate_a=*/adjoint_);
+    if (!result.ok()) {
+      ctx->SetStatus(result.status());
+      return;
+    }
+    ctx->SetOutput(0, result.ValueOrDie());
+  }
+
+ private:
+  bool lower_;
+  bool adjoint_;
+};
+
+REGISTER_XLA_OP(Name("MatrixTriangularSolve"), MatrixTriangularSolveOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
index bea1d1600b5b5fc0c44f0208d394f25061ecbb68..05a36a031ad73be289604da1b7e56203ff12fbf5 100644
--- a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
@@ -92,7 +92,8 @@ class MirrorPadOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(MirrorPadOp);
 };
 
-REGISTER_XLA_OP(Name("MirrorPad"), MirrorPadOp);
+REGISTER_XLA_OP(Name("MirrorPad").CompileTimeConstInput("paddings"),
+                MirrorPadOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc b/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc
index 2a9cfcb2eb86399bd446db8d591012a7a2f3d667..9f7c9913802d311895479b914b66553e135aa426 100644
--- a/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc
@@ -76,7 +76,7 @@ class OneHotOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(OneHotOp);
 };
 
-REGISTER_XLA_OP(Name("OneHot"), OneHotOp);
+REGISTER_XLA_OP(Name("OneHot").CompileTimeConstInput("depth"), OneHotOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/pad_op.cc b/tensorflow/compiler/tf2xla/kernels/pad_op.cc
index d841bd37b33c31dbc156fa824ff62a58169a99cb..791351637aee61c5fdd911dd8a48959990514395 100644
--- a/tensorflow/compiler/tf2xla/kernels/pad_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pad_op.cc
@@ -83,8 +83,8 @@ class PadOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("Pad"), PadOp);
-REGISTER_XLA_OP(Name("PadV2"), PadOp);
+REGISTER_XLA_OP(Name("Pad").CompileTimeConstInput("paddings"), PadOp);
+REGISTER_XLA_OP(Name("PadV2").CompileTimeConstInput("paddings"), PadOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
index 2b6053d19dd64a0c893b3613133c8f4691f9cd27..d4fb5dd4e06c7c70591262c0d63a91c383a2a6e0 100644
--- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
@@ -37,21 +37,23 @@ class PoolingOp : public XlaOpKernel {
  public:
   PoolingOp(OpKernelConstruction* ctx, int num_spatial_dims)
       : XlaOpKernel(ctx), num_spatial_dims_(num_spatial_dims) {
-    std::vector<int32> ksize_int;
-    std::vector<int32> stride_int;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_int));
-    OP_REQUIRES(ctx, ksize_int.size() == num_dims(),
-                errors::InvalidArgument("Sliding window ksize field must "
-                                        "specify ",
-                                        num_dims(), " dimensions"));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_int));
-    OP_REQUIRES(ctx, stride_int.size() == num_dims(),
-                errors::InvalidArgument("Sliding window stride field must "
-                                        "specify ",
-                                        num_dims(), " dimensions"));
-    for (int i = 0; i < num_dims(); ++i) {
-      ksize_.push_back(ksize_int[i]);
-      stride_.push_back(stride_int[i]);
+    if (ctx->num_inputs() == 1) {
+      std::vector<int32> ksize_int;
+      std::vector<int32> stride_int;
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_int));
+      OP_REQUIRES(ctx, ksize_int.size() == num_dims(),
+                  errors::InvalidArgument("Sliding window ksize field must "
+                                          "specify ",
+                                          num_dims(), " dimensions"));
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_int));
+      OP_REQUIRES(ctx, stride_int.size() == num_dims(),
+                  errors::InvalidArgument("Sliding window stride field must "
+                                          "specify ",
+                                          num_dims(), " dimensions"));
+      for (int i = 0; i < num_dims(); ++i) {
+        ksize_.push_back(ksize_int[i]);
+        stride_.push_back(stride_int[i]);
+      }
     }
     Padding padding;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding));
@@ -77,6 +79,33 @@ class PoolingOp : public XlaOpKernel {
     xla::ComputationDataHandle input = ctx->Input(0);
     const TensorShape input_shape = ctx->InputShape(0);
 
+    std::vector<int64> ksize = ksize_;
+    std::vector<int64> stride = stride_;
+    if (ctx->num_inputs() != 1) {
+      const TensorShape ksize_shape = ctx->InputShape(1);
+      // Validate input sizes.
+      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(ksize_shape),
+                  errors::InvalidArgument("ksize must be a vector, not shape ",
+                                          ksize_shape.DebugString()));
+      OP_REQUIRES(ctx, ksize_shape.num_elements() == num_dims(),
+                  errors::InvalidArgument("Sliding window ksize field must "
+                                          "specify ",
+                                          num_dims(), " dimensions"));
+      ksize.clear();
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &ksize));
+
+      const TensorShape stride_shape = ctx->InputShape(2);
+      // Validate input sizes.
+      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(stride_shape),
+                  errors::InvalidArgument("stride must be a vector, not shape ",
+                                          stride_shape.DebugString()));
+      OP_REQUIRES(ctx, stride_shape.num_elements() == num_dims(),
+                  errors::InvalidArgument("Sliding window stride field must "
+                                          "specify ",
+                                          num_dims(), " dimensions"));
+      stride.clear();
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(2, &stride));
+    }
     OP_REQUIRES(ctx, input_shape.dims() == num_dims(),
                 errors::InvalidArgument("Input to ", type_string(),
                                         " operator must have ", num_dims(),
@@ -84,8 +113,8 @@ class PoolingOp : public XlaOpKernel {
 
     const DataType type = input_type(0);
     xla::ComputationDataHandle pooled = ctx->builder()->ReduceWindow(
-        input, InitValue(ctx->builder(), type), *Reduction(ctx, type), ksize_,
-        stride_, padding_);
+        input, InitValue(ctx->builder(), type), *Reduction(ctx, type), ksize,
+        stride, padding_);
     ctx->SetOutput(0, PostProcessOutput(ctx, pooled, type, input_shape));
   }
 
@@ -130,6 +159,10 @@ class MaxPool2DOp : public MaxPoolOp {
   }
 };
 REGISTER_XLA_OP(Name("MaxPool"), MaxPool2DOp);
+REGISTER_XLA_OP(Name("MaxPoolV2")
+                    .CompileTimeConstInput("ksize")
+                    .CompileTimeConstInput("strides"),
+                MaxPool2DOp);
 
 class MaxPool3DOp : public MaxPoolOp {
  public:
@@ -243,22 +276,44 @@ class MaxPoolGradOp : public XlaOpKernel {
  public:
   MaxPoolGradOp(OpKernelConstruction* ctx, int num_spatial_dims)
       : XlaOpKernel(ctx), num_spatial_dims_(num_spatial_dims) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_));
+    if (ctx->num_inputs() == 3) {
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("ksize", &ksize_));
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_));
+    }
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_));
+  }
+
+  int num_dims() const { return num_spatial_dims_ + 2; }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    if (ctx->num_inputs() != 3) {
+      OP_REQUIRES(
+          ctx, ctx->num_inputs() == 5,
+          errors::InvalidArgument("Must supply ksize and stride arguments."));
+      const TensorShape ksize_shape = ctx->InputShape(3);
+      // Validate input sizes.
+      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(ksize_shape),
+                  errors::InvalidArgument("ksize must be a vector, not shape ",
+                                          ksize_shape.DebugString()));
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(3, &ksize_));
+
+      const TensorShape stride_shape = ctx->InputShape(4);
+      // Validate input sizes.
+      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(stride_shape),
+                  errors::InvalidArgument("stride must be a vector, not shape ",
+                                          stride_shape.DebugString()));
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(4, &stride_));
+    }
+
     OP_REQUIRES(ctx, ksize_.size() == num_dims(),
                 errors::InvalidArgument("Sliding window ksize field must "
                                         "specify ",
                                         num_dims(), " dimensions"));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &stride_));
     OP_REQUIRES(ctx, stride_.size() == num_dims(),
                 errors::InvalidArgument("Sliding window strides field must "
                                         "specify ",
                                         num_dims(), " dimensions"));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_));
-  }
 
-  int num_dims() const { return num_spatial_dims_ + 2; }
-
-  void Compile(XlaOpKernelContext* ctx) override {
     const TensorShape tensor_in_shape = ctx->InputShape(0);
     const TensorShape tensor_out_shape = ctx->InputShape(1);
     const TensorShape out_backprop_shape = ctx->InputShape(2);
@@ -315,6 +370,10 @@ class MaxPool2DGradOp : public MaxPoolGradOp {
   }
 };
 REGISTER_XLA_OP(Name("MaxPoolGrad"), MaxPool2DGradOp);
+REGISTER_XLA_OP(Name("MaxPoolGradV2")
+                    .CompileTimeConstInput("ksize")
+                    .CompileTimeConstInput("strides"),
+                MaxPool2DGradOp);
 
 class MaxPool3DGradOp : public MaxPoolGradOp {
  public:
@@ -455,14 +514,16 @@ class AvgPool2DGradOp : public AvgPoolGradOp {
                 errors::InvalidArgument("Invalid data format"));
   }
 };
-REGISTER_XLA_OP(Name("AvgPoolGrad"), AvgPool2DGradOp);
+REGISTER_XLA_OP(Name("AvgPoolGrad").CompileTimeConstInput("orig_input_shape"),
+                AvgPool2DGradOp);
 
 class AvgPool3DGradOp : public AvgPoolGradOp {
  public:
   explicit AvgPool3DGradOp(OpKernelConstruction* ctx)
       : AvgPoolGradOp(ctx, /*num_spatial_dims=*/3) {}
 };
-REGISTER_XLA_OP(Name("AvgPool3DGrad"), AvgPool3DGradOp);
+REGISTER_XLA_OP(Name("AvgPool3DGrad").CompileTimeConstInput("orig_input_shape"),
+                AvgPool3DGradOp);
 
 }  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
index 2421825ead17a3acee9f145f00904d382fb656f4..c0994c434bca5174eaee7b9e63e10432d9c2ed8d 100644
--- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
@@ -52,7 +52,8 @@ class RandomUniformOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(RandomUniformOp);
 };
 
-REGISTER_XLA_OP(Name("RandomUniform"), RandomUniformOp);
+REGISTER_XLA_OP(Name("RandomUniform").CompileTimeConstInput("shape"),
+                RandomUniformOp);
 
 class RandomUniformIntOp : public XlaOpKernel {
  public:
@@ -83,7 +84,8 @@ class RandomUniformIntOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(RandomUniformIntOp);
 };
 
-REGISTER_XLA_OP(Name("RandomUniformInt"), RandomUniformIntOp);
+REGISTER_XLA_OP(Name("RandomUniformInt").CompileTimeConstInput("shape"),
+                RandomUniformIntOp);
 
 class RandomStandardNormalOp : public XlaOpKernel {
  public:
@@ -111,7 +113,8 @@ class RandomStandardNormalOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(RandomStandardNormalOp);
 };
 
-REGISTER_XLA_OP(Name("RandomStandardNormal"), RandomStandardNormalOp);
+REGISTER_XLA_OP(Name("RandomStandardNormal").CompileTimeConstInput("shape"),
+                RandomStandardNormalOp);
 
 class TruncatedNormalOp : public XlaOpKernel {
  public:
@@ -183,7 +186,8 @@ class TruncatedNormalOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("TruncatedNormal"), TruncatedNormalOp);
+REGISTER_XLA_OP(Name("TruncatedNormal").CompileTimeConstInput("shape"),
+                TruncatedNormalOp);
 
 }  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
index 647b6274083cf8886af6c451b746416445a4a2b2..03b13b2924f4b81c1017804c91d5ffb81c44ea0b 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
@@ -35,7 +35,7 @@ class SumOp : public XlaReductionOp {
   }
 };
 
-REGISTER_XLA_OP(Name("Sum"), SumOp);
+REGISTER_XLA_OP(Name("Sum").CompileTimeConstInput("reduction_indices"), SumOp);
 
 class ProdOp : public XlaReductionOp {
  public:
@@ -53,7 +53,8 @@ class ProdOp : public XlaReductionOp {
   }
 };
 
-REGISTER_XLA_OP(Name("Prod"), ProdOp);
+REGISTER_XLA_OP(Name("Prod").CompileTimeConstInput("reduction_indices"),
+                ProdOp);
 
 class MinOp : public XlaReductionOp {
  public:
@@ -73,7 +74,7 @@ class MinOp : public XlaReductionOp {
   }
 };
 
-REGISTER_XLA_OP(Name("Min"), MinOp);
+REGISTER_XLA_OP(Name("Min").CompileTimeConstInput("reduction_indices"), MinOp);
 
 class MaxOp : public XlaReductionOp {
  public:
@@ -93,7 +94,7 @@ class MaxOp : public XlaReductionOp {
   }
 };
 
-REGISTER_XLA_OP(Name("Max"), MaxOp);
+REGISTER_XLA_OP(Name("Max").CompileTimeConstInput("reduction_indices"), MaxOp);
 
 class MeanOp : public XlaReductionOp {
  public:
@@ -115,7 +116,8 @@ class MeanOp : public XlaReductionOp {
   }
 };
 
-REGISTER_XLA_OP(Name("Mean"), MeanOp);
+REGISTER_XLA_OP(Name("Mean").CompileTimeConstInput("reduction_indices"),
+                MeanOp);
 
 class AllOp : public XlaReductionOp {
  public:
@@ -133,7 +135,7 @@ class AllOp : public XlaReductionOp {
   }
 };
 
-REGISTER_XLA_OP(Name("All"), AllOp);
+REGISTER_XLA_OP(Name("All").CompileTimeConstInput("reduction_indices"), AllOp);
 
 class AnyOp : public XlaReductionOp {
  public:
@@ -151,7 +153,7 @@ class AnyOp : public XlaReductionOp {
   }
 };
 
-REGISTER_XLA_OP(Name("Any"), AnyOp);
+REGISTER_XLA_OP(Name("Any").CompileTimeConstInput("reduction_indices"), AnyOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc
index 5952e752724d1e6953dd4dbb6a8099b847c64d08..af4d64b159c09ed7e01017f25a2b23e58542dc3c 100644
--- a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc
@@ -95,7 +95,7 @@ class ReshapeOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("Reshape"), ReshapeOp);
+REGISTER_XLA_OP(Name("Reshape").CompileTimeConstInput("shape"), ReshapeOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc
index 7489321f72f50c8f55f8da9dabb9f4b5c7797195..e51d386926763ecbb5a943dfb6f872e78901dc69 100644
--- a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc
@@ -16,7 +16,6 @@ limitations under the License.
 // XLA-specific reverse Op.
 
 #include "tensorflow/compiler/tf2xla/type_util.h"
-#include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
@@ -53,7 +52,8 @@ class ReverseOp : public XlaOpKernel {
     xla::Literal lax;
     OP_REQUIRES_OK(ctx, ctx->ConstantInputReshaped(1, {x_shape.dims()}, &lax));
     std::vector<bool> revdims(x_shape.dims());
-    std::copy(lax.preds().begin(), lax.preds().end(), revdims.begin());
+    std::copy(lax.data<bool>().begin(), lax.data<bool>().end(),
+              revdims.begin());
     std::vector<int64> dimensions;
 
     for (int d = 0; d < x_shape.dims(); ++d) {
@@ -66,7 +66,7 @@ class ReverseOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("Reverse"), ReverseOp);
+REGISTER_XLA_OP(Name("Reverse").CompileTimeConstInput("dims"), ReverseOp);
 
 class ReverseV2Op : public XlaOpKernel {
  public:
@@ -104,7 +104,7 @@ class ReverseV2Op : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("ReverseV2"), ReverseV2Op);
+REGISTER_XLA_OP(Name("ReverseV2").CompileTimeConstInput("axis"), ReverseV2Op);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6bc5d3adb091cd238974c5b69b7a2f8fe639cc68
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
@@ -0,0 +1,182 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+namespace {
+
+class ReverseSequenceOp : public XlaOpKernel {
+ public:
+  explicit ReverseSequenceOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("batch_dim", &batch_dim_));
+    OP_REQUIRES_OK(context, context->GetAttr("seq_dim", &seq_dim_));
+  }
+
+  void Compile(XlaOpKernelContext* context) override {
+    const TensorShape input_shape = context->InputShape(0);
+    const TensorShape seq_lens_shape = context->InputShape(1);
+
+    OP_REQUIRES(context, TensorShapeUtils::IsVector(seq_lens_shape),
+                errors::InvalidArgument("seq_lens input must be 1-dim, not ",
+                                        seq_lens_shape.dims()));
+    OP_REQUIRES(context, batch_dim_ != seq_dim_,
+                errors::InvalidArgument("batch_dim == seq_dim == ", seq_dim_));
+    OP_REQUIRES(
+        context, seq_dim_ < input_shape.dims(),
+        errors::InvalidArgument("seq_dim must be < input.dims()", "( ",
+                                seq_dim_, " vs. ", input_shape.dims(), ")"));
+    OP_REQUIRES(
+        context, batch_dim_ < input_shape.dims(),
+        errors::InvalidArgument("batch_dim must be < input.dims()", "( ",
+                                batch_dim_, " vs. ", input_shape.dims(), ")"));
+    OP_REQUIRES(
+        context,
+        seq_lens_shape.num_elements() == input_shape.dim_size(batch_dim_),
+        errors::InvalidArgument("len(seq_lens) != input.dims(", batch_dim_,
+                                "), ", "(", seq_lens_shape.num_elements(),
+                                " vs. ", input_shape.dim_size(batch_dim_)));
+
+    xla::ComputationBuilder* builder = context->builder();
+    const auto input = context->Input(0);
+    const auto seq_lens = context->Input(1);
+
+    const int64 batch_size = input_shape.dim_size(batch_dim_);
+
+    const DataType input_type = context->input_type(0);
+    const DataType seq_lens_type = context->input_type(1);
+    const int64 max_seq_len = input_shape.dim_size(seq_dim_);
+
+    xla::Shape input_xla_shape;
+    OP_REQUIRES_OK(context, TensorShapeToXLAShape(input_type, input_shape,
+                                                  &input_xla_shape));
+    xla::Shape seq_lens_xla_shape;
+    OP_REQUIRES_OK(context, TensorShapeToXLAShape(seq_lens_type, seq_lens_shape,
+                                                  &seq_lens_xla_shape));
+
+    const auto tuple_shape = xla::ShapeUtil::MakeTupleShape({
+        xla::ShapeUtil::MakeShape(seq_lens_xla_shape.element_type(), {}),
+        seq_lens_xla_shape,
+        input_xla_shape,
+    });
+
+    // For each entry in the batch, reverse the sequence.
+    // TODO(b/65689298): generalize the Map() operator to non-scalar cases and
+    // use it here, instead of a While loop.
+
+    // Condition: lambda (i, _, _): i < batch_size
+    auto condition_builder =
+        builder->CreateSubBuilder("reverse_sequence_condition");
+    {
+      auto param = condition_builder->Parameter(0, tuple_shape, "param");
+      auto i = condition_builder->GetTupleElement(param, 0);
+      condition_builder->Lt(
+          i, XlaHelpers::IntegerLiteral(condition_builder.get(), seq_lens_type,
+                                        batch_size));
+    }
+    auto condition = condition_builder->Build();
+    OP_REQUIRES_OK(context, condition.status());
+
+    auto body_builder = builder->CreateSubBuilder("reverse_sequence_body");
+    {
+      auto param = body_builder->Parameter(0, tuple_shape, "param");
+      auto i = body_builder->GetTupleElement(param, 0);
+      auto seq_lens = body_builder->GetTupleElement(param, 1);
+      auto output = body_builder->GetTupleElement(param, 2);
+
+      // seq_len is the sequence length of the current batch element (rank 1)
+      auto seq_len = body_builder->DynamicSlice(
+          seq_lens, body_builder->Reshape(i, {1}), {1});
+
+      // Indices is the offset of the batch element in the input.
+      auto indices = body_builder->Broadcast(
+          XlaHelpers::Zero(body_builder.get(), seq_lens_type),
+          {input_shape.dims()});
+      indices = body_builder->DynamicUpdateSlice(
+          indices, body_builder->Reshape(i, {1}),
+          body_builder->Reshape(
+              XlaHelpers::IntegerLiteral(body_builder.get(), seq_lens_type,
+                                         batch_dim_),
+              {1}));
+
+      // slice_indices is the offset of the start of the reversed sequence in
+      // the input.
+      auto slice_indices = body_builder->DynamicUpdateSlice(
+          indices,
+          body_builder->Sub(XlaHelpers::IntegerLiteral(
+                                body_builder.get(), seq_lens_type, max_seq_len),
+                            seq_len),
+          body_builder->Reshape(
+              XlaHelpers::IntegerLiteral(body_builder.get(), seq_lens_type,
+                                         seq_dim_),
+              {1}));
+
+      // Slice out the reversed sequence. The slice will overflow the end of the
+      // sequence, and the contents of the overflow are implementation-defined.
+      // However, we will mask off these elements and replace them with elements
+      // from the original input so their values do not matter.
+      TensorShape slice_shape = input_shape;
+      slice_shape.set_dim(batch_dim_, 1);
+      auto slice = body_builder->DynamicSlice(output, slice_indices,
+                                              slice_shape.dim_sizes());
+
+      // Shift the reversed sequence to the left.
+      output = body_builder->DynamicUpdateSlice(output, slice, indices);
+
+      body_builder->Tuple(
+          {body_builder->Add(
+               i, XlaHelpers::One(body_builder.get(), seq_lens_type)),
+           seq_lens, output});
+    }
+    auto body = body_builder->Build();
+    OP_REQUIRES_OK(context, body.status());
+
+    auto loop_output = builder->While(
+        condition.ValueOrDie(), body.ValueOrDie(),
+        builder->Tuple({XlaHelpers::Zero(builder, seq_lens_type), seq_lens,
+                        builder->Rev(input, {seq_dim_})}));
+    auto output = builder->GetTupleElement(loop_output, 2);
+
+    // Mask out elements after the sequence length.
+    xla::ComputationDataHandle iota;
+    OP_REQUIRES_OK(
+        context, XlaHelpers::Iota(builder, seq_lens_type, max_seq_len, &iota));
+    std::vector<int64> dims(input_shape.dims(), 1);
+    dims[batch_dim_] = batch_size;
+    auto mask = builder->Lt(iota, builder->Reshape(seq_lens, dims), {seq_dim_});
+
+    // Broadcast the mask up to the input shape.
+    mask =
+        builder->Or(mask, builder->Broadcast(builder->ConstantR0<bool>(false),
+                                             input_shape.dim_sizes()));
+
+    output = builder->Select(mask, output, input);
+    context->SetOutput(0, output);
+  }
+
+ private:
+  int32 batch_dim_;
+  int32 seq_dim_;
+};
+
+REGISTER_XLA_OP(Name("ReverseSequence"), ReverseSequenceOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ee4a94164c4a43828eb4feedbfa9d1a9e231ef8f
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
@@ -0,0 +1,147 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/concat_lib.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace {
+
+// TODO(phawkins): implement double-sized windowed reductions in XLA and remove
+// the type constraint.
+constexpr std::array<DataType, 3> kScanOpTypes = {
+    {DT_HALF, DT_BFLOAT16, DT_FLOAT}};
+
+class ScanOp : public XlaOpKernel {
+ public:
+  ScanOp(OpKernelConstruction* ctx, bool sum) : XlaOpKernel(ctx), sum_(sum) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("reverse", &reverse_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("exclusive", &exclusive_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    const TensorShape input_shape = ctx->InputShape(0);
+    const TensorShape tensor_axis_shape = ctx->InputShape(1);
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(tensor_axis_shape),
+                errors::InvalidArgument("ScanOp: axis must be a scalar, not ",
+                                        tensor_axis_shape.DebugString()));
+
+    int64 axis;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &axis));
+    if (axis < 0) {
+      axis += input_shape.dims();
+    }
+    OP_REQUIRES(
+        ctx, FastBoundsCheck(axis, input_shape.dims()),
+        errors::InvalidArgument("ScanOp: Expected scan axis in the range [",
+                                -input_shape.dims(), ", ", input_shape.dims(),
+                                "), but got ", axis));
+
+    DataType dtype = ctx->input_type(0);
+
+    if (input_shape.num_elements() == 0) {
+      // Exit early if there is nothing to compute.
+      ctx->SetOutput(0, ctx->Input(0));
+      return;
+    }
+
+    xla::ComputationBuilder* builder = ctx->builder();
+
+    std::vector<int64> window_strides(input_shape.dims(), 1);
+    std::vector<int64> window_dims(input_shape.dims(), 1);
+    window_dims[axis] = input_shape.dim_size(axis);
+
+    std::vector<std::pair<int64, int64>> padding(input_shape.dims(), {0, 0});
+    padding[axis].first = input_shape.dim_size(axis) - 1;
+    // In exclusive mode, add an extra padding element so there is a complete
+    // window of padding before the data starts.
+    if (exclusive_) {
+      ++padding[axis].first;
+    }
+    if (reverse_) {
+      std::swap(padding[axis].first, padding[axis].second);
+    }
+
+    xla::ComputationDataHandle input = ctx->Input(0);
+    xla::ComputationDataHandle init;
+    const xla::Computation* reducer;
+    if (sum_) {
+      init = XlaHelpers::Zero(builder, dtype);
+      reducer = ctx->GetOrCreateAdd(dtype);
+    } else {
+      init = XlaHelpers::One(builder, dtype);
+      reducer = ctx->GetOrCreateMul(dtype);
+    }
+    auto output = builder->ReduceWindowWithGeneralPadding(
+        ctx->Input(0), init, *reducer, window_dims, window_strides, padding);
+
+    // In exclusive mode, we have computed an extra element containing the sum
+    // of all the input elements. Slice off this extra "last" element.
+    if (exclusive_) {
+      if (reverse_) {
+        output = builder->SliceInDim(output, 1, input_shape.dim_size(axis) + 1,
+                                     1, axis);
+
+      } else {
+        output =
+            builder->SliceInDim(output, 0, input_shape.dim_size(axis), 1, axis);
+      }
+    }
+    ctx->SetOutput(0, output);
+  }
+
+ private:
+  const bool sum_;  // True=cumulative sum. False=cumulative product.
+  bool reverse_;
+  bool exclusive_;
+};
+
+class CumsumOp : public ScanOp {
+ public:
+  explicit CumsumOp(OpKernelConstruction* ctx) : ScanOp(ctx, /*sum=*/true) {}
+};
+REGISTER_XLA_OP(Name("Cumsum")
+                    .TypeConstraint("T", kScanOpTypes)
+                    .CompileTimeConstInput("axis"),
+                CumsumOp);
+
+class CumprodOp : public ScanOp {
+ public:
+  explicit CumprodOp(OpKernelConstruction* ctx) : ScanOp(ctx, /*sum=*/false) {}
+};
+REGISTER_XLA_OP(Name("Cumprod")
+                    .TypeConstraint("T", kScanOpTypes)
+                    .CompileTimeConstInput("axis"),
+                CumprodOp);
+
+}  // anonymous namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8433a29c4e203cac726ee6bf7f67a863447326ed
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc
@@ -0,0 +1,121 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/lib/scatter.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+namespace {
+
+// Check whether updates.shape = indices.shape[:batch_dim] +
+// buffer_shape[num_index_dims:]
+Status ValidateUpdateShape(const TensorShape& buffer_shape,
+                           const TensorShape& indices_shape,
+                           const TensorShape& updates_shape) {
+  if (indices_shape.dims() < 1) {
+    return errors::InvalidArgument(
+        "indices shape must have >= 1 dimension; got ",
+        indices_shape.DebugString());
+  }
+
+  const int64 num_index_dims = indices_shape.dim_size(indices_shape.dims() - 1);
+  const int64 batch_dim = indices_shape.dims() - 1;
+
+  auto shape_err = [&]() {
+    return errors::InvalidArgument(
+        "Must have updates.shape = indices.shape[:batch_dim] + ",
+        "buffer_shape[num_index_dims:], got updates.shape: ",
+        updates_shape.DebugString(),
+        ", indices.shape: ", indices_shape.DebugString(),
+        ", buffer_shape: ", buffer_shape.DebugString(),
+        ", num_index_dims: ", num_index_dims, ", and batch_dim: ", batch_dim);
+  };
+
+  if (updates_shape.dims() < batch_dim) return shape_err();
+  if (buffer_shape.dims() <
+      num_index_dims + (updates_shape.dims() - batch_dim)) {
+    return shape_err();
+  }
+  if (updates_shape.dims() !=
+      batch_dim + buffer_shape.dims() - num_index_dims) {
+    return shape_err();
+  }
+  for (int d = 0; d < batch_dim; ++d) {
+    if (updates_shape.dim_size(d) != indices_shape.dim_size(d)) {
+      return shape_err();
+    }
+  }
+  for (int d = 0; d < updates_shape.dims() - batch_dim; ++d) {
+    if (updates_shape.dim_size(d + batch_dim) !=
+        buffer_shape.dim_size(d + num_index_dims)) {
+      return shape_err();
+    }
+  }
+  return Status::OK();
+}
+
+class ScatterNdOp : public XlaOpKernel {
+ public:
+  explicit ScatterNdOp(OpKernelConstruction* context) : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* context) override {
+    DataType dtype = context->input_type(1);
+
+    TensorShape indices_shape = context->InputShape(0);
+    TensorShape updates_shape = context->InputShape(1);
+
+    TensorShape buffer_shape;
+    OP_REQUIRES_OK(context, context->ConstantInputAsShape(2, &buffer_shape));
+
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsVectorOrHigher(buffer_shape),
+        errors::InvalidArgument("Output must be at least 1-D, ",
+                                "got shape: ", buffer_shape.DebugString()));
+
+    OP_REQUIRES(
+        context,
+        buffer_shape.num_elements() > 0 || (indices_shape.num_elements() == 0 &&
+                                            updates_shape.num_elements() == 0),
+        errors::InvalidArgument(
+            "Indices and updates specified for empty output. indices shape: ",
+            indices_shape.DebugString()));
+
+    OP_REQUIRES_OK(context, ValidateUpdateShape(buffer_shape, indices_shape,
+                                                updates_shape));
+
+    xla::ComputationBuilder* builder = context->builder();
+    auto buffer = builder->Broadcast(XlaHelpers::Zero(builder, dtype),
+                                     buffer_shape.dim_sizes());
+    auto indices = context->Input(0);
+    auto updates = context->Input(1);
+    auto result =
+        XlaScatter(buffer, updates, indices,
+                   /*indices_are_vectors=*/true, /*combiner=*/{}, builder);
+    OP_REQUIRES_OK(context, result.status());
+    context->SetOutput(0, result.ValueOrDie());
+  }
+};
+
+REGISTER_XLA_OP(Name("ScatterNd").CompileTimeConstInput("shape"), ScatterNdOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/scatter_op_helpers.h b/tensorflow/compiler/tf2xla/kernels/scatter_op_helpers.h
deleted file mode 100644
index a5ab7de17adb734014fe2dcbd60ae5c219c8e486..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2xla/kernels/scatter_op_helpers.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Helper methods for XLA Scatter Ops.
-#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_SCATTER_OP_HELPERS_H_
-#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_SCATTER_OP_HELPERS_H_
-
-#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
-#include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/util/bcast.h"
-
-namespace tensorflow {
-
-// Adds to builder an XLA computation that performs a scatter-add of input (of
-// shape input_shape) keyed on indices (of shape indices_shape). The shape
-// of the Tensor returned by this is num_segments input_shape[indices.dims():]
-//
-static xla::ComputationDataHandle XlaComputeScatterAddDynamicSlice(
-    XlaOpKernelContext* ctx, const xla::ComputationDataHandle& input,
-    const TensorShape& input_shape, const xla::ComputationDataHandle& indices,
-    const TensorShape& indices_shape, int64 num_segments, DataType dtype,
-    xla::ComputationBuilder* builder);
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_TF2XLA_KERNELS_SCATTER_OP_HELPERS_H_
diff --git a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc
index 8a67c0b67fcd95f4841c5e011a4e51638eea5b0f..80d6df6c48b0141734dcee1c2a3c413926931feb 100644
--- a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,143 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <sstream>
-#include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h"
-#include "tensorflow/compiler/tf2xla/shape_util.h"
-#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/lib/scatter.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
-#include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/core/framework/kernel_def_builder.h"
-#include "tensorflow/core/framework/types.h"
 
 namespace tensorflow {
-
-xla::ComputationDataHandle XlaComputeScatterAddDynamicSlice(
-    XlaOpKernelContext* ctx, const xla::ComputationDataHandle& input,
-    const TensorShape& input_shape, const xla::ComputationDataHandle& indices,
-    const TensorShape& indices_shape, int64 num_segments, DataType dtype,
-    xla::ComputationBuilder* builder) {
-  // Flatten data for dynamic indexing via indices_1d.
-  TensorShape input_shape_i(input_shape);
-  for (int64 d = 0; d < indices_shape.dims(); ++d) {
-    input_shape_i.RemoveDim(0);
-  }
-  TensorShape flat_shape({indices_shape.num_elements()});
-  flat_shape.AppendShape(input_shape_i);
-
-  // output is same as flattened input shape with dim_size(0) = num_segments.
-  TensorShape out_shape(flat_shape);
-  out_shape.set_dim(0, num_segments);
-
-  // TODO(b/37575001) The tensor in which we construct the output during
-  // the loop must have rank >= 3 as a workaround for lowering issues.
-  int64 extra_dims = 0;
-  if (out_shape.dims() < 3) {
-    extra_dims = 3 - out_shape.dims();
-  }
-  TensorShape loop_out_shape;
-  for (int64 k = 0; k < extra_dims; ++k) {
-    loop_out_shape.AddDim(1);
-  }
-  loop_out_shape.AppendShape(out_shape);
-
-  // Slices from the input data are same shape as the input data, except dim 0.
-  TensorShape slice_shape(flat_shape);
-  slice_shape.set_dim(0, 1);
-  // slices are reshaped into the rank >= 3 shape of the loop-carried output
-  TensorShape loop_out_slice_shape(loop_out_shape);
-  loop_out_slice_shape.set_dim(extra_dims, 1);
-
-  // Construct the initial values of the loop-carried variables
-  // Flatten the indices into 1-D for ease of iteration.
-  auto indices_1d = builder->Reshape(indices, {indices_shape.num_elements()});
-  // Flatten the data for ease of indexing via values in indices_1d.
-  auto data_flat = builder->Reshape(input, flat_shape.dim_sizes());
-
-  auto init_i = builder->ConstantR0<int32>(0);
-  auto init_out = builder->Broadcast(XlaHelpers::Zero(builder, dtype),
-                                     loop_out_shape.dim_sizes());
-
-  xla::PrimitiveType ptype;
-  TF_CHECK_OK(DataTypeToPrimitiveType(dtype, &ptype));
-
-  std::vector<xla::Shape> tuple_shapes(
-      {// The loop iteration counter is a scalar, incremented each iteration.
-       xla::ShapeUtil::MakeShape(xla::S32, {}),
-       // The flattened input data is loop invariant.
-       xla::ShapeUtil::MakeShape(ptype, flat_shape.dim_sizes()),
-       // The scatter indices tensor is loop invariant.
-       xla::ShapeUtil::MakeShape(xla::S32, {indices_shape.num_elements()}),
-       // The output data array is updated each loop iteration.
-       xla::ShapeUtil::MakeShape(ptype, loop_out_shape.dim_sizes())});
-  xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes);
-
-  auto init = builder->Tuple({init_i, data_flat, indices_1d, init_out});
-
-  // Construct the while loop condition (i < num_indices)
-  xla::ComputationBuilder condb(ctx->builder()->client(),
-                                "ScatterAddWhileCond");
-  condb.Lt(condb.GetTupleElement(
-               condb.Parameter(0, tuple_shape, "ScatterAddWhileTuple"), 0),
-           condb.ConstantR0<int32>(indices_shape.num_elements()));
-  auto cond_status = condb.Build();
-  // TF_CHECK_OK(cond_status);
-  auto cond = cond_status.ConsumeValueOrDie();
-
-  // Construct the while loop body's function. The implementation of scatter is:
-  // for i in range(num_indices):
-  //   index = dynamic-slice(indices, i)
-  //   xi = dynamic-slice(input, i)
-  //   output = dynamic-update-slice(output, xi, index)
-  xla::ComputationBuilder bodyb(ctx->builder()->client(),
-                                "ScatterAddWhileBody");
-  {
-    auto input_tuple = bodyb.Parameter(0, tuple_shape, "ScatterAddWhileTuple");
-    auto i = bodyb.GetTupleElement(input_tuple, 0);
-    auto data = bodyb.GetTupleElement(input_tuple, 1);
-    auto idcs = bodyb.GetTupleElement(input_tuple, 2);
-    auto output = bodyb.GetTupleElement(input_tuple, 3);
-
-    // Index into the data array at i.
-    auto zero = bodyb.ConstantR1<int32>({0});
-    std::vector<xla::ComputationDataHandle> index_vals(flat_shape.dims(), zero);
-    index_vals[0] = bodyb.Reshape(i, {1});
-    auto index = bodyb.ConcatInDim(index_vals, 0);
-
-    auto data_slice =
-        bodyb.Reshape(bodyb.DynamicSlice(data, index, slice_shape.dim_sizes()),
-                      loop_out_slice_shape.dim_sizes());
-
-    // Index into the output array.
-    // Construct the index into the R3+ output array 0, ..., <index>, 0, ...
-    std::vector<xla::ComputationDataHandle> out_index_vals(
-        loop_out_shape.dims(), zero);
-    out_index_vals[extra_dims] =
-        bodyb.DynamicSlice(idcs, bodyb.Reshape(i, {1}), {1});
-    auto out_index = bodyb.ConcatInDim(out_index_vals, 0);
-
-    // Slice the output array, update value, and update the output slice.
-    auto updated_output = bodyb.DynamicUpdateSlice(
-        output,
-        bodyb.Add(data_slice,
-                  bodyb.DynamicSlice(output, out_index,
-                                     loop_out_slice_shape.dim_sizes())),
-        out_index);
-
-    auto ip1 = bodyb.Add(i, bodyb.ConstantR0<int32>(1));
-    bodyb.Tuple({ip1, data, idcs, updated_output});
-  }
-  auto body_status = bodyb.Build();
-  // TF_CHECK_OK(body_status);
-  auto body = body_status.ConsumeValueOrDie();
-
-  auto gather_while = builder->While(cond, body, init);
-  auto updated_output = builder->GetTupleElement(gather_while, 3);
-  return builder->Reshape(updated_output, out_shape.dim_sizes());
-}
-
 namespace {
 
 class UnsortedSegmentSum : public XlaOpKernel {
@@ -171,10 +41,10 @@ class UnsortedSegmentSum : public XlaOpKernel {
     // as data with the first indices.rank dimensions are replaced
     // by a single dimension with size num_segments.
     auto data = ctx->Input(0);
-    auto data_shape = ctx->InputShape(0);
+    TensorShape data_shape = ctx->InputShape(0);
 
     auto indices = ctx->Input(1);
-    auto indices_shape = ctx->InputShape(1);
+    TensorShape indices_shape = ctx->InputShape(1);
 
     int64 num_segments;
     OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(2, &num_segments));
@@ -192,10 +62,21 @@ class UnsortedSegmentSum : public XlaOpKernel {
                       d, " differs ", data_shape.dim_size(d), " vs. ",
                       indices_shape.dim_size(d)));
     }
-    auto result = XlaComputeScatterAddDynamicSlice(
-        ctx, data, data_shape, indices, indices_shape, num_segments, dtype_,
-        ctx->builder());
-    ctx->SetOutput(0, result);
+    xla::ComputationBuilder* builder = ctx->builder();
+    TensorShape buffer_shape = data_shape;
+    buffer_shape.RemoveDimRange(0, indices_shape.dims());
+    buffer_shape.InsertDim(0, num_segments);
+    auto buffer = builder->Broadcast(XlaHelpers::Zero(builder, dtype_),
+                                     buffer_shape.dim_sizes());
+
+    auto combiner =
+        [](xla::ComputationDataHandle a, xla::ComputationDataHandle b,
+           xla::ComputationBuilder* builder) { return builder->Add(a, b); };
+
+    auto result = XlaScatter(buffer, /*updates=*/data, indices,
+                             /*indices_are_vectors=*/false, combiner, builder);
+    OP_REQUIRES_OK(ctx, result.status());
+    ctx->SetOutput(0, result.ValueOrDie());
   }
 
  private:
diff --git a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
index c2b0e1bb4c1a141d0ab3f5b3ff5397d9da620bd8..2c31f8d90891924f6f86a54ccf548de4df87f3bd 100644
--- a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
@@ -138,7 +138,11 @@ class RangeOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("Range"), RangeOp);
+REGISTER_XLA_OP(Name("Range")
+                    .CompileTimeConstInput("start")
+                    .CompileTimeConstInput("limit")
+                    .CompileTimeConstInput("delta"),
+                RangeOp);
 
 class LinSpaceOp : public XlaOpKernel {
  public:
@@ -207,7 +211,11 @@ class LinSpaceOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("LinSpace"), LinSpaceOp);
+REGISTER_XLA_OP(Name("LinSpace")
+                    .CompileTimeConstInput("start")
+                    .CompileTimeConstInput("stop")
+                    .CompileTimeConstInput("num"),
+                LinSpaceOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
index 24a99f253d6dc8bb699fff587c363b12c227e821..05354bca5bb089703fdcceb6f44648bbb98d004b 100644
--- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 // XLA-specific Shape Ops.
 
+#include "tensorflow/compiler/tf2xla/kernels/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
@@ -27,56 +28,42 @@ namespace {
 
 class ShapeOp : public XlaOpKernel {
  public:
-  explicit ShapeOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  explicit ShapeOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("out_type", &out_dtype_));
+  }
 
   void Compile(XlaOpKernelContext* ctx) override {
     const TensorShape input_shape = ctx->InputShape(0);
-    const int rank = input_shape.dims();
-    Tensor shape_constant(DT_INT32, TensorShape({rank}));
-    auto vec = shape_constant.vec<int32>();
-    // TODO(dga): support int64.  b/28119922.
-    for (int i = 0; i < rank; ++i) {
-      int64 dim_size = input_shape.dim_size(i);
-      OP_REQUIRES(
-          ctx, FastBoundsCheck(dim_size, std::numeric_limits<int32>::max()),
-          errors::InvalidArgument("Shape does not support tensors > int32max",
-                                  " but dim ", i, " is ", dim_size));
-      vec(i) = static_cast<int32>(dim_size);
-    }
-
+    Tensor shape_constant(out_dtype_, TensorShape({input_shape.dims()}));
+    OP_REQUIRES_OK(ctx, TensorShapeToConstant(input_shape, &shape_constant));
     ctx->SetConstantOutput(0, shape_constant);
   }
+
+ private:
+  DataType out_dtype_;
 };
 
 REGISTER_XLA_OP(Name("Shape"), ShapeOp);
 
 class ShapeNOp : public XlaOpKernel {
  public:
-  explicit ShapeNOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  explicit ShapeNOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("out_type", &out_dtype_));
+  }
 
   void Compile(XlaOpKernelContext* ctx) override {
     for (int i = 0; i < ctx->num_inputs(); ++i) {
-      const TensorShape shape = ctx->InputShape(i);
-      const int dims = shape.dims();
-      Tensor shape_constant(DT_INT32, TensorShape({dims}));
-      auto vec = shape_constant.vec<int32>();
-
-      // TODO(dga): support int64.  b/28119922.
-      for (int j = 0; j < dims; ++j) {
-        int64 dim_size = shape.dim_size(j);
-        OP_REQUIRES(
-            ctx, FastBoundsCheck(dim_size, std::numeric_limits<int32>::max()),
-            errors::InvalidArgument("Shape does not support tensors > int32max",
-                                    " but shape ", i, " dim ", j, " is ",
-                                    dim_size));
-        vec(j) = static_cast<int32>(dim_size);
-      }
-
+      const TensorShape input_shape = ctx->InputShape(i);
+      Tensor shape_constant(out_dtype_, TensorShape({input_shape.dims()}));
+      OP_REQUIRES_OK(ctx, TensorShapeToConstant(input_shape, &shape_constant));
       ctx->SetConstantOutput(i, shape_constant);
     }
   }
 
   bool IsExpensive() override { return false; }
+
+ private:
+  DataType out_dtype_;
 };
 REGISTER_XLA_OP(Name("ShapeN"), ShapeNOp);
 
@@ -134,7 +121,7 @@ class ExpandDimsOp : public XlaOpKernel {
     xla::Literal literal;
     OP_REQUIRES_OK(ctx, ctx->ConstantInputReshaped(1, {1}, &literal));
 
-    int dim = literal.s32s(0);
+    int dim = literal.data<int32>()[0];
 
     OP_REQUIRES(ctx,
                 (dim >= -1 - input_shape.dims() && dim <= input_shape.dims()),
@@ -163,7 +150,7 @@ class ExpandDimsOp : public XlaOpKernel {
     ctx->SetOutput(0, ctx->builder()->Reshape(ctx->Input(0), new_shape));
   }
 };
-REGISTER_XLA_OP(Name("ExpandDims"), ExpandDimsOp);
+REGISTER_XLA_OP(Name("ExpandDims").CompileTimeConstInput("dim"), ExpandDimsOp);
 
 class SqueezeOp : public XlaOpKernel {
  public:
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_util.cc b/tensorflow/compiler/tf2xla/kernels/shape_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..76ea5f525598f511f295eb5a30f3cf603fbf57aa
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/shape_util.cc
@@ -0,0 +1,48 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/kernels/shape_util.h"
+
+#include <limits>
+
+#include "tensorflow/core/kernels/bounds_check.h"
+
+namespace tensorflow {
+
+Status TensorShapeToConstant(const TensorShape& input_shape,
+                             Tensor* shape_constant) {
+  const int dims = input_shape.dims();
+  if (shape_constant->dtype() == DT_INT32) {
+    auto vec = shape_constant->vec<int32>();
+    for (int i = 0; i < dims; ++i) {
+      int64 dim_size = input_shape.dim_size(i);
+      if (!FastBoundsCheck(dim_size, std::numeric_limits<int32>::max())) {
+        return errors::InvalidArgument(
+            "Shape with out_type=int32 does not support tensors > int32max",
+            " but dim ", i, " is ", dim_size);
+      }
+      vec(i) = static_cast<int32>(dim_size);
+    }
+  } else {
+    auto vec = shape_constant->vec<int64>();
+    for (int i = 0; i < dims; ++i) {
+      int64 dim_size = input_shape.dim_size(i);
+      vec(i) = dim_size;
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_util.h b/tensorflow/compiler/tf2xla/kernels/shape_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..ca57be3d47b95d71b07746e50256070e0a4f4c09
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/shape_util.h
@@ -0,0 +1,34 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_SHAPE_UTIL_H_
+#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_SHAPE_UTIL_H_
+
+#include <limits>
+
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+// Converts a TensorShape to a constant Tensor.
+//
+// The input TensorShape input_shape is used to populate the elements of
+// shape_constant, which is modified in place.
+Status TensorShapeToConstant(const TensorShape& input_shape,
+                             Tensor* shape_constant);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_KERNELS_SHAPE_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/kernels/slice_op.cc b/tensorflow/compiler/tf2xla/kernels/slice_op.cc
index fbe8c78d8fb5f800967942555531a50937cad0ca..be1e97bf26fa4cde1b741c8d0b843a85ce33a59c 100644
--- a/tensorflow/compiler/tf2xla/kernels/slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/slice_op.cc
@@ -112,7 +112,9 @@ class SliceOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("Slice"), SliceOp);
+REGISTER_XLA_OP(
+    Name("Slice").CompileTimeConstInput("begin").CompileTimeConstInput("size"),
+    SliceOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
index 83a87f19a718ce86a105e3c33ab9eaf0faff3a76..01b46e160d1f1f10a43faf7ca35afb42dfde6e33 100644
--- a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
@@ -162,7 +162,10 @@ class SpaceToBatchNDOp : public XlaOpKernel {
                  block_shape, paddings);
   }
 };
-REGISTER_XLA_OP(Name("SpaceToBatchND"), SpaceToBatchNDOp);
+REGISTER_XLA_OP(Name("SpaceToBatchND")
+                    .CompileTimeConstInput("paddings")
+                    .CompileTimeConstInput("block_shape"),
+                SpaceToBatchNDOp);
 
 class SpaceToBatchOp : public XlaOpKernel {
  public:
@@ -184,7 +187,8 @@ class SpaceToBatchOp : public XlaOpKernel {
  private:
   int block_size_;
 };
-REGISTER_XLA_OP(Name("SpaceToBatch"), SpaceToBatchOp);
+REGISTER_XLA_OP(Name("SpaceToBatch").CompileTimeConstInput("paddings"),
+                SpaceToBatchOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc
index 89befda346ec06fec23ab1d1c9d910ded8cd806d..806fda632cde64c1b37ae3b9199028d6b6b0a215 100644
--- a/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/util/tensor_format.h"
 
 namespace tensorflow {
 namespace {
@@ -23,6 +24,16 @@ namespace {
 class SpaceToDepthOp : public XlaOpKernel {
  public:
   explicit SpaceToDepthOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    string data_format_str;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+
+    OP_REQUIRES(ctx, data_format_ == FORMAT_NCHW || data_format_ == FORMAT_NHWC,
+                errors::InvalidArgument("Unsupported data format ",
+                                        ToString(data_format_),
+                                        "; expected formats NHWC or NCHW"));
+
     OP_REQUIRES_OK(ctx, ctx->GetAttr("block_size", &block_size_));
     OP_REQUIRES(
         ctx, block_size_ > 1,
@@ -31,34 +42,100 @@ class SpaceToDepthOp : public XlaOpKernel {
 
   void Compile(XlaOpKernelContext* ctx) override {
     const TensorShape input_tensor_shape = ctx->InputShape(0);
-    // The input is presumed to be [batch, height, width, depth]
     int input_rank = input_tensor_shape.dims();
     static const int kRequiredDims = 4;
     OP_REQUIRES(ctx, kRequiredDims == input_rank,
-                errors::InvalidArgument("Input rank should be: ", kRequiredDims,
-                                        " instead of: ", input_rank));
+                errors::InvalidArgument("Input rank should be ", kRequiredDims,
+                                        "; got ", input_rank));
     const gtl::InlinedVector<int64, 4> input_shape =
         input_tensor_shape.dim_sizes();
 
     xla::ComputationBuilder* b = ctx->builder();
     xla::ComputationDataHandle input = ctx->Input(0);
 
+    int feature_dim = GetTensorFeatureDimIndex(input_rank, data_format_);
+    int num_spatial_dims = GetTensorSpatialDims(input_rank, data_format_);
+
+    std::vector<int64> reshaped_shape;
+    std::vector<int64> transpose_order;
+    std::vector<int64> output_shape;
+    reshaped_shape.reserve(input_rank);
+    transpose_order.reserve(input_rank);
+    output_shape.reserve(input_rank);
+    if (data_format_ == FORMAT_NHWC) {
+      int64 block_elems = 1;
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        OP_REQUIRES(ctx, input_shape[1 + i] % block_size_ == 0,
+                    errors::InvalidArgument(
+                        "input shape[", 1 + i, "]=", input_shape[1 + i],
+                        " is not divisible by block_size=", block_size_));
+        block_elems *= block_size_;
+      }
+
+      reshaped_shape.push_back(input_shape[0]);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        reshaped_shape.push_back(input_shape[1 + i] / block_size_);
+        reshaped_shape.push_back(block_size_);
+      }
+      reshaped_shape.push_back(input_shape[feature_dim]);
+
+      transpose_order.push_back(0);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        transpose_order.push_back(i * 2 + 1);
+      }
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        transpose_order.push_back(i * 2 + 2);
+      }
+      transpose_order.push_back(feature_dim + num_spatial_dims);
+
+      output_shape.push_back(input_shape[0]);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        output_shape.push_back(input_shape[1 + i] / block_size_);
+      }
+      output_shape.push_back(input_shape[feature_dim] * block_elems);
+    } else {
+      // FORMAT_NCHW
+      int64 block_elems = 1;
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        OP_REQUIRES(ctx, input_shape[2 + i] % block_size_ == 0,
+                    errors::InvalidArgument(
+                        "input shape[", 2 + i, "]=", input_shape[2 + i],
+                        " is not divisible by block_size=", block_size_));
+        block_elems *= block_size_;
+      }
+
+      reshaped_shape.push_back(input_shape[0]);
+      reshaped_shape.push_back(input_shape[feature_dim]);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        reshaped_shape.push_back(input_shape[2 + i] / block_size_);
+        reshaped_shape.push_back(block_size_);
+      }
+
+      transpose_order.push_back(0);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        transpose_order.push_back(i * 2 + 3);
+      }
+      transpose_order.push_back(feature_dim);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        transpose_order.push_back(i * 2 + 2);
+      }
+
+      output_shape.push_back(input_shape[0]);
+      output_shape.push_back(input_shape[feature_dim] * block_elems);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        output_shape.push_back(input_shape[2 + i] / block_size_);
+      }
+    }
+
+    // Note: comments are given in NHWC format; NCHW is similar with a different
+    // dimension order.
     // 1. Reshape `input` to `reshaped` of shape:
     //
     //      [batch,
     //       input_shape[1] / block_size_, block_size_,
     //       input_shape[2] / block_size_, block_size_,
     //       depth]
-    const int block_rank = 2;
-    for (int i = 0; i < block_rank; ++i) {
-      OP_REQUIRES(ctx, input_shape[1 + i] % block_size_ == 0,
-                  errors::InvalidArgument(
-                      "input shape[", 1 + i, "]=", input_shape[1 + i],
-                      " is not divisible by block_size=", block_size_));
-    }
-    xla::ComputationDataHandle reshaped = b->Reshape(
-        input, {input_shape[0], input_shape[1] / block_size_, block_size_,
-                input_shape[2] / block_size_, block_size_, input_shape[3]});
+    xla::ComputationDataHandle reshaped = b->Reshape(input, reshaped_shape);
 
     // 2. Permute dimensions of `reshaped` to produce
     //    `permuted_reshaped` of shape:
@@ -69,7 +146,7 @@ class SpaceToDepthOp : public XlaOpKernel {
     //       block_size_, block_size_,
     //       depth]
     xla::ComputationDataHandle permuted_reshaped =
-        b->Transpose(reshaped, {0, 1, 3, 2, 4, 5});
+        b->Transpose(reshaped, transpose_order);
 
     // 3. Reshape `permuted_reshaped` to flatten `block_shape` into the
     //    batch dimension, producing an output tensor of shape:
@@ -79,15 +156,14 @@ class SpaceToDepthOp : public XlaOpKernel {
     //       input_shape[2] / block_size_,
     //       block_size_ * block_size_ * depth]
     //
-    xla::ComputationDataHandle output = b->Reshape(
-        permuted_reshaped, {input_shape[0], input_shape[1] / block_size_,
-                            input_shape[2] / block_size_,
-                            block_size_ * block_size_ * input_shape[3]});
+    xla::ComputationDataHandle output =
+        b->Reshape(permuted_reshaped, output_shape);
 
     ctx->SetOutput(0, output);
   }
 
  private:
+  TensorFormat data_format_;
   int block_size_;
 };
 REGISTER_XLA_OP(Name("SpaceToDepth"), SpaceToDepthOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/split_op.cc b/tensorflow/compiler/tf2xla/kernels/split_op.cc
index 795eb1794f577e0f7fd2a2068878e540ff0c1a1d..79c435c90a1f57250be90c2c2523bf3d7d231461 100644
--- a/tensorflow/compiler/tf2xla/kernels/split_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/split_op.cc
@@ -103,7 +103,7 @@ class SplitOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("Split"), SplitOp);
+REGISTER_XLA_OP(Name("Split").CompileTimeConstInput("split_dim"), SplitOp);
 
 class SplitVOp : public XlaOpKernel {
  public:
@@ -142,8 +142,9 @@ class SplitVOp : public XlaOpKernel {
     int neg_one_dim = -1;
     std::vector<int64> split_sizes_vec(num_split, -1);
     const TensorShape split_size_shape = ctx->InputShape(1);
-    OP_REQUIRES(ctx, split_size_shape.dims() == 1 &&
-                         split_size_shape.num_elements() == num_split,
+    OP_REQUIRES(ctx,
+                split_size_shape.dims() == 1 &&
+                    split_size_shape.num_elements() == num_split,
                 errors::InvalidArgument(
                     "shape of tensor describing "
                     " the output must have dimension 1 and the same "
@@ -171,10 +172,11 @@ class SplitVOp : public XlaOpKernel {
     }
 
     OP_REQUIRES(
-        ctx, (neg_one_dim == -1 &&
-              total_split_size == input_shape.dim_size(split_dim)) ||
-                 (neg_one_dim >= 0 &&
-                  total_split_size <= input_shape.dim_size(split_dim)),
+        ctx,
+        (neg_one_dim == -1 &&
+         total_split_size == input_shape.dim_size(split_dim)) ||
+            (neg_one_dim >= 0 &&
+             total_split_size <= input_shape.dim_size(split_dim)),
         errors::InvalidArgument("Determined shape must either match "
                                 "input shape along split_dim exactly if "
                                 "fully specified, or be less than the size of "
@@ -206,7 +208,10 @@ class SplitVOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("SplitV"), SplitVOp);
+REGISTER_XLA_OP(Name("SplitV")
+                    .CompileTimeConstInput("split_dim")
+                    .CompileTimeConstInput("size_splits"),
+                SplitVOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
index bb7891b31f6d52fd84cf72579c343f50473e1632..1a78c7ab9be701d3d02285ed21604f0f856b3f1f 100644
--- a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
@@ -40,7 +40,7 @@ namespace {
 
 Status GetStackShape(xla::ComputationBuilder* builder, XlaResource* resource,
                      TensorShape* stack_shape) {
-  auto shape_or_status = builder->GetShape(resource->value);
+  auto shape_or_status = builder->GetShape(resource->value());
   if (!shape_or_status.ok()) {
     return shape_or_status.status();
   }
@@ -63,22 +63,22 @@ Status GetStackShape(xla::ComputationBuilder* builder, XlaResource* resource,
 Status MaybeInitializeStack(xla::ComputationBuilder* builder,
                             XlaResource* resource, DataType dtype,
                             const TensorShape& elem_shape) {
-  if (resource->type != dtype) {
+  if (resource->type() != dtype) {
     return errors::InvalidArgument(
-        "Stack dtype is ", DataTypeString(resource->type), " but op has dtype ",
-        DataTypeString(dtype), ".");
+        "Stack dtype is ", DataTypeString(resource->type()),
+        " but op has dtype ", DataTypeString(dtype), ".");
   }
 
   TensorShape stack_shape;
-  stack_shape.AddDim(resource->tensor_array_size);
+  stack_shape.AddDim(resource->tensor_array_size());
   stack_shape.AppendShape(elem_shape);
 
-  if (resource->value.handle() == 0) {
+  if (!resource->initialized()) {
     // Stack has not been initialized.
-    xla::ComputationDataHandle zero = XlaHelpers::Zero(builder, resource->type);
-    resource->value =
-        builder->Tuple({builder->Broadcast(zero, stack_shape.dim_sizes()),
-                        builder->ConstantR0<int32>(0)});
+    xla::ComputationDataHandle zero =
+        XlaHelpers::Zero(builder, resource->type());
+    TF_RETURN_IF_ERROR(resource->SetTypeAndShape(dtype, elem_shape));
+    TF_RETURN_IF_ERROR(resource->SetZeroValue(builder));
   } else {
     // Checks the expected shape matches the actual shape.
     TensorShape actual_shape;
@@ -105,7 +105,9 @@ class StackOp : public XlaOpKernel {
     OP_REQUIRES(
         ctx, size >= 0,
         errors::InvalidArgument(
-            "XLA compilation requires a fixed stack size upper bound."));
+            "XLA compilation requires a fixed stack size upper bound. If "
+            "you are using tf.while_loop, set the maximum_iterations parameter "
+            "to fix this issue."));
 
     // We defer initializing the Stack resource until we see the first push.
     // Otherwise we do not know the shape of the stack elements.
@@ -115,8 +117,8 @@ class StackOp : public XlaOpKernel {
     string name = strings::StrCat("Stack: ", stack_name_);
     OP_REQUIRES_OK(
         ctx, xc.CreateResource(XlaResource::kStack, -1, std::move(name), dtype_,
-                               value, &resource));
-    resource->tensor_array_size = size;
+                               TensorShape(), value, /*tensor_array_size=*/size,
+                               /*tensor_array_gradients=*/{}, &resource));
     ctx->SetResourceOutput(0, resource);
   }
 
@@ -127,7 +129,7 @@ class StackOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(StackOp);
 };
 
-REGISTER_XLA_OP(Name("StackV2"), StackOp);
+REGISTER_XLA_OP(Name("StackV2").CompileTimeConstInput("max_size"), StackOp);
 
 class StackPushOp : public XlaOpKernel {
  public:
@@ -145,8 +147,8 @@ class StackPushOp : public XlaOpKernel {
     // Initializes the Stack, if the element shape was not already known.
     OP_REQUIRES_OK(ctx, MaybeInitializeStack(b, resource, dtype_, elem_shape));
 
-    xla::ComputationDataHandle ta = b->GetTupleElement(resource->value, 0);
-    xla::ComputationDataHandle index = b->GetTupleElement(resource->value, 1);
+    xla::ComputationDataHandle ta = b->GetTupleElement(resource->value(), 0);
+    xla::ComputationDataHandle index = b->GetTupleElement(resource->value(), 1);
     xla::ComputationDataHandle value = ctx->Input(1);
 
     // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
@@ -160,9 +162,9 @@ class StackPushOp : public XlaOpKernel {
 
     // TODO(phawkins): We don't check the index is in bounds --- there is no
     // error mechanism in XLA.
-    resource->value =
-        b->Tuple({b->DynamicUpdateSlice(ta, update, start_indices),
-                  b->Add(index, b->ConstantR0<int32>(1))});
+    OP_REQUIRES_OK(ctx, resource->SetValue(b->Tuple(
+                            {b->DynamicUpdateSlice(ta, update, start_indices),
+                             b->Add(index, b->ConstantR0<int32>(1))})));
 
     ctx->SetOutput(0, value);
   }
@@ -187,27 +189,22 @@ class StackPopOp : public XlaOpKernel {
     XlaResource* resource;
     OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource));
 
-    OP_REQUIRES(ctx, resource->type == dtype_,
-                errors::InvalidArgument(
-                    "Stack dtype is ", DataTypeString(resource->type),
-                    " but Op requested dtype ", DataTypeString(dtype_), "."));
-
     // There is a somewhat subtle issue here: here "uninitialized" means we have
     // not yet seen a pop in the order that we compile operators, not the order
     // that we run them. However, in practice the two orders should be the same
     // for the sole user of the stack operators (loop gradients).
-    OP_REQUIRES(ctx, resource->value.handle() != 0,
+    OP_REQUIRES(ctx, resource->initialized(),
                 errors::InvalidArgument("Stack pop on uninitialized stack"));
 
     TensorShape stack_shape;
     OP_REQUIRES_OK(ctx, GetStackShape(b, resource, &stack_shape));
 
-    xla::ComputationDataHandle state = resource->value;
+    xla::ComputationDataHandle state = resource->value();
     xla::ComputationDataHandle ta = b->GetTupleElement(state, 0);
     xla::ComputationDataHandle index = b->GetTupleElement(state, 1);
 
     index = b->Sub(index, b->ConstantR0<int32>(1));
-    resource->value = b->Tuple({ta, index});
+    OP_REQUIRES_OK(ctx, resource->SetValue(b->Tuple({ta, index})));
 
     // start_indices of the DynamicSlice are [index, 0, 0, ..., 0].
     auto start_indices =
diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
index 6af4bd0496e0da926726e3f74376281f539e925a..91c169428c7a88a8d107a97445aeea999946e3e9 100644
--- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
@@ -106,7 +106,11 @@ class StridedSliceOp : public XlaOpKernel {
   DataType index_type_;
 };
 
-REGISTER_XLA_OP(Name("StridedSlice"), StridedSliceOp);
+REGISTER_XLA_OP(Name("StridedSlice")
+                    .CompileTimeConstInput("begin")
+                    .CompileTimeConstInput("end")
+                    .CompileTimeConstInput("strides"),
+                StridedSliceOp);
 
 class StridedSliceGradOp : public XlaOpKernel {
  public:
@@ -211,7 +215,12 @@ class StridedSliceGradOp : public XlaOpKernel {
   DataType index_type_;
 };
 
-REGISTER_XLA_OP(Name("StridedSliceGrad"), StridedSliceGradOp);
+REGISTER_XLA_OP(Name("StridedSliceGrad")
+                    .CompileTimeConstInput("shape")
+                    .CompileTimeConstInput("begin")
+                    .CompileTimeConstInput("end")
+                    .CompileTimeConstInput("strides"),
+                StridedSliceGradOp);
 
 class StridedSliceAssignOp : public XlaOpKernel {
  public:
@@ -222,6 +231,7 @@ class StridedSliceAssignOp : public XlaOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("new_axis_mask", &new_axis_mask_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("shrink_axis_mask", &shrink_axis_mask_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("Index", &index_type_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_));
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
@@ -243,9 +253,9 @@ class StridedSliceAssignOp : public XlaOpKernel {
     OP_REQUIRES_OK(ctx, LiteralToHostTensor(strides_literal, index_type_,
                                             &strides_tensor));
 
-    DataType lhs_type;
     TensorShape lhs_shape;
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &lhs_type, &lhs_shape));
+    xla::ComputationDataHandle lhs;
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype_, &lhs_shape, &lhs));
 
     const TensorShape rhs_shape = ctx->InputShape(4);
 
@@ -273,9 +283,6 @@ class StridedSliceAssignOp : public XlaOpKernel {
                     " does not match r-value shape ", rhs_shape.DebugString(),
                     ". Automatic broadcasting not yet implemented."));
 
-    xla::ComputationDataHandle lhs;
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &lhs));
-
     xla::ComputationDataHandle rhs = ctx->Input(4);
 
     gtl::InlinedVector<int64, 4> dimensions_to_reverse;
@@ -311,16 +318,21 @@ class StridedSliceAssignOp : public XlaOpKernel {
           lhs, rhs, ctx->builder()->ConstantR1<int64>(slice_begin));
     }
 
-    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, lhs_type, lhs));
+    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, lhs));
   }
 
  private:
   int32 begin_mask_, end_mask_;
   int32 ellipsis_mask_, new_axis_mask_, shrink_axis_mask_;
   DataType index_type_;
+  DataType dtype_;
 };
 
-REGISTER_XLA_OP(Name("ResourceStridedSliceAssign"), StridedSliceAssignOp);
+REGISTER_XLA_OP(Name("ResourceStridedSliceAssign")
+                    .CompileTimeConstInput("begin")
+                    .CompileTimeConstInput("end")
+                    .CompileTimeConstInput("strides"),
+                StridedSliceAssignOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
index 351fda251798e43b607fb445f2c98abd57b3d86b..000b50af6bd86b7268c016865fb0856c16053ece 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
@@ -21,10 +21,10 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
-#include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/tf2xla/xla_resource.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
@@ -50,35 +50,38 @@ namespace {
 Status MaybeInitializeTensorArray(xla::ComputationBuilder* builder,
                                   XlaResource* resource, DataType dtype,
                                   const TensorShape& elem_shape) {
-  if (resource->kind != XlaResource::kTensorArray) {
+  if (resource->kind() != XlaResource::kTensorArray) {
     return errors::InvalidArgument("Unexpected non-TensorArray resource");
   }
 
-  if (resource->type != dtype) {
+  if (resource->type() != dtype) {
     return errors::InvalidArgument(
-        "TensorArray dtype is ", DataTypeString(resource->type),
+        "TensorArray dtype is ", DataTypeString(resource->type()),
         " but op has dtype ", DataTypeString(dtype), ".");
   }
 
-  TF_RET_CHECK(resource->tensor_array_size >= 0)
-      << resource->name << " size " << resource->tensor_array_size;
-  TensorShape ta_shape;
-  ta_shape.AddDim(resource->tensor_array_size);
-  ta_shape.AppendShape(elem_shape);
+  TF_RET_CHECK(resource->tensor_array_size() >= 0)
+      << resource->name() << " size " << resource->tensor_array_size();
 
-  if (resource->value.handle() == 0) {
-    // TensorArray has not been initialized.
-    xla::ComputationDataHandle zero = XlaHelpers::Zero(builder, resource->type);
-    resource->value = builder->Broadcast(zero, ta_shape.dim_sizes());
+  if (!resource->initialized()) {
+    xla::ComputationDataHandle zero =
+        XlaHelpers::Zero(builder, resource->type());
+
+    TF_RETURN_IF_ERROR(resource->SetTypeAndShape(dtype, elem_shape));
+    TF_RETURN_IF_ERROR(resource->SetZeroValue(builder));
   } else {
     // Checks the elem_shape matches the TensorArray shape.
-    auto shape_or_status = builder->GetShape(resource->value);
+    auto shape_or_status = builder->GetShape(resource->value());
     if (!shape_or_status.ok()) {
       return shape_or_status.status();
     }
     TensorShape shape;
     TF_RETURN_IF_ERROR(
         XLAShapeToTensorShape(*shape_or_status.ValueOrDie(), &shape));
+
+    TensorShape ta_shape;
+    ta_shape.AddDim(resource->tensor_array_size());
+    ta_shape.AppendShape(elem_shape);
     if (ta_shape != shape) {
       return errors::InvalidArgument(
           "Mismatched TensorArray sizes: ", ta_shape.DebugString(), " vs ",
@@ -93,19 +96,17 @@ Status MaybeInitializeTensorArray(xla::ComputationBuilder* builder,
 Status CheckTensorArrayIsInitialized(const string& op_name,
                                      const XlaResource* resource,
                                      DataType dtype) {
-  if (resource->kind != XlaResource::kTensorArray) {
+  if (resource->kind() != XlaResource::kTensorArray) {
     return errors::InvalidArgument(
-        "Unexpected non-TensorArray resource passed "
-        "to ",
-        op_name);
+        "Unexpected non-TensorArray resource passed to ", op_name);
   }
-  if (resource->value.handle() == 0) {
+  if (!resource->initialized()) {
     return errors::InvalidArgument("Uninitialized TensorArray passed to ",
                                    op_name);
   }
-  if (resource->type != dtype) {
+  if (resource->type() != dtype) {
     return errors::InvalidArgument(
-        "TensorArray dtype is ", DataTypeString(resource->type),
+        "TensorArray dtype is ", DataTypeString(resource->type()),
         " but op has dtype ", DataTypeString(dtype), ".");
   }
 
@@ -115,10 +116,8 @@ Status CheckTensorArrayIsInitialized(const string& op_name,
 Status GetTensorArrayShape(const XlaResource* resource,
                            xla::ComputationBuilder* builder,
                            TensorShape* shape) {
-  TF_RETURN_IF_ERROR(resource->GetShape(builder, shape));
-  if (shape->dims() < 1) {
-    return errors::InvalidArgument("TensorArray rank must be >= 1");
-  }
+  *shape = resource->shape();
+  shape->InsertDim(0, resource->tensor_array_size());
   return Status::OK();
 }
 
@@ -161,8 +160,8 @@ class TensorArrayOp : public XlaOpKernel {
     // Initializes the TensorArray value if we know the element shape.
     // Otherwise, defer initialization to the first write.
     xla::ComputationDataHandle value;
+    TensorShape shape;
     if (element_shape_.IsFullyDefined()) {
-      TensorShape shape;
       CHECK(element_shape_.AsTensorShape(&shape));
       TensorShape ta_shape;
       ta_shape.AddDim(size);
@@ -176,8 +175,8 @@ class TensorArrayOp : public XlaOpKernel {
     string name = strings::StrCat("TensorArray: ", tensor_array_name_);
     OP_REQUIRES_OK(
         ctx, xc.CreateResource(XlaResource::kTensorArray, -1, std::move(name),
-                               dtype_, value, &var));
-    var->tensor_array_size = size;
+                               dtype_, shape, value, /*tensor_array_size=*/size,
+                               /*tensor_array_gradients=*/{}, &var));
     ctx->SetResourceOutput(0, var);
 
     Tensor flow(DT_FLOAT, TensorShape({}));
@@ -193,7 +192,8 @@ class TensorArrayOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayOp);
 };
 
-REGISTER_XLA_OP(Name("TensorArrayV3"), TensorArrayOp);
+REGISTER_XLA_OP(Name("TensorArrayV3").CompileTimeConstInput("size"),
+                TensorArrayOp);
 
 class TensorArrayWriteOp : public XlaOpKernel {
  public:
@@ -213,7 +213,7 @@ class TensorArrayWriteOp : public XlaOpKernel {
     OP_REQUIRES_OK(ctx,
                    MaybeInitializeTensorArray(b, resource, dtype_, elem_shape));
 
-    xla::ComputationDataHandle ta = resource->value;
+    xla::ComputationDataHandle ta = resource->value();
     xla::ComputationDataHandle index = ctx->Input(1);
     xla::ComputationDataHandle value = ctx->Input(2);
     xla::ComputationDataHandle flow = ctx->Input(3);
@@ -230,7 +230,7 @@ class TensorArrayWriteOp : public XlaOpKernel {
     xla::ComputationDataHandle written =
         DynamicAddSlice(b, ta, update, slice_shape.dim_sizes(), start_indices);
 
-    resource->value = written;
+    OP_REQUIRES_OK(ctx, resource->SetValue(written));
     ctx->SetOutput(0, flow);
   }
 
@@ -259,7 +259,7 @@ class TensorArrayReadOp : public XlaOpKernel {
     TensorShape ta_shape;
     OP_REQUIRES_OK(ctx, GetTensorArrayShape(resource, b, &ta_shape));
 
-    xla::ComputationDataHandle ta = resource->value;
+    xla::ComputationDataHandle ta = resource->value();
     xla::ComputationDataHandle index = ctx->Input(1);
 
     // start_indices of the DynamicSlice are [index, 0, 0, ..., 0].
@@ -309,10 +309,39 @@ class TensorArrayGatherOp : public XlaOpKernel {
     auto indices = ctx->Input(1);
     DataType index_type = ctx->input_type(1);
 
-    xla::ComputationDataHandle ta = resource->value;
+    xla::ComputationDataHandle ta = resource->value();
+
+    // Look for the case where the gather takes a simple slice from the
+    // tensor array (0, 1, 2, 3, 4, ..., N)
+    std::vector<int64> const_indices;
+    Status status = ctx->ConstantInputAsIntVector(1, &const_indices);
+    if (status.ok()) {
+      bool gather_is_dense_slice = true;
+      for (auto i = 0; i < const_indices.size(); i++) {
+        if (const_indices[i] != i) {
+          gather_is_dense_slice = false;
+          break;
+        }
+      }
+
+      if (gather_is_dense_slice) {
+        std::vector<int64> begin(ta_shape.dims(), 0);
+        std::vector<int64> strides(ta_shape.dims(), 1);
+        std::vector<int64> end(ta_shape.dims(), 1);
+        end[0] = const_indices.size();
+        for (auto i = 1; i < ta_shape.dims(); i++) {
+          end[i] = ta_shape.dim_size(i);
+        }
+        ctx->SetOutput(0, b->Slice(ta, begin, end, strides));
+        return;
+      }
+    }
 
-    xla::ComputationDataHandle gather = XlaComputeGatherDynamicSlice(
-        ctx, ta, ta_shape, indices, indices_shape, 0, dtype_, index_type, b);
+    xla::ComputationDataHandle gather;
+    OP_REQUIRES_OK(
+        ctx,
+        XlaGather(ta, ta_shape, indices, indices_shape, /*axis=*/0,
+                  /*indices_are_nd=*/false, dtype_, index_type, b, &gather));
     ctx->SetOutput(0, gather);
   }
 
@@ -348,35 +377,54 @@ class TensorArrayScatterOp : public XlaOpKernel {
     const int num_indices = indices_shape.dim_size(0);
     const xla::ComputationDataHandle indices = ctx->Input(1);
 
-    xla::ComputationDataHandle ta = resource->value;
+    xla::ComputationDataHandle ta = resource->value();
     const xla::ComputationDataHandle value = ctx->Input(2);
     const xla::ComputationDataHandle flow = ctx->Input(3);
 
-    auto slice_dims = value_shape.dim_sizes();
-    slice_dims[0] = 1LL;
-
-    std::vector<int64> value_starts(value_shape.dims(), 0);
-    auto value_ends = value_shape.dim_sizes();
-
-    std::vector<int64> value_strides(value_shape.dims(), 1);
-
-    // For every (index, value) pair, update the corresponding TensorArray
-    // storage.
-    for (int i = 0; i < num_indices; ++i) {
-      // Slice out part of the value.
-      value_starts[0] = i;
-      value_ends[0] = i + 1;
-      auto slice = b->Slice(value, value_starts, value_ends, value_strides);
+    // Look for the case where the scatter is for each sub-tensor in order. The
+    // tensor array implementation allows for this to be a straight addition.
+    bool scatter_all_elements_in_order = false;
+    std::vector<int64> const_indices;
+    Status status = ctx->ConstantInputAsIntVector(1, &const_indices);
+    if (status.ok() && num_indices == value_shape.dim_size(0)) {
+      scatter_all_elements_in_order = true;
+      for (auto i = 0; i < num_indices; i++) {
+        if (const_indices[i] != i) {
+          scatter_all_elements_in_order = false;
+          break;
+        }
+      }
+    }
 
-      // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
-      auto index = b->Slice(indices, {i}, {i + 1}, {1});
-      auto start_indices =
-          b->Pad(b->Reshape(index, {1}), b->ConstantR0<int32>(0),
-                 xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}}));
-      ta = DynamicAddSlice(b, ta, slice, slice_dims, start_indices);
+    if (scatter_all_elements_in_order) {
+      ta = b->Add(ta, value);
+    } else {
+      auto slice_dims = value_shape.dim_sizes();
+      slice_dims[0] = 1LL;
+
+      std::vector<int64> value_starts(value_shape.dims(), 0);
+      auto value_ends = value_shape.dim_sizes();
+
+      std::vector<int64> value_strides(value_shape.dims(), 1);
+
+      // For every (index, value) pair, update the corresponding TensorArray
+      // storage.
+      for (int i = 0; i < num_indices; ++i) {
+        // Slice out part of the value.
+        value_starts[0] = i;
+        value_ends[0] = i + 1;
+        auto slice = b->Slice(value, value_starts, value_ends, value_strides);
+
+        // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
+        auto index = b->Slice(indices, {i}, {i + 1}, {1});
+        auto start_indices =
+            b->Pad(b->Reshape(index, {1}), b->ConstantR0<int32>(0),
+                   xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}}));
+        ta = DynamicAddSlice(b, ta, slice, slice_dims, start_indices);
+      }
     }
 
-    resource->value = ta;
+    OP_REQUIRES_OK(ctx, resource->SetValue(ta));
     ctx->SetOutput(0, flow);
   }
 
@@ -405,7 +453,7 @@ class TensorArrayConcatOp : public XlaOpKernel {
     TensorShape ta_shape;
     OP_REQUIRES_OK(ctx, GetTensorArrayShape(resource, b, &ta_shape));
 
-    xla::ComputationDataHandle ta = resource->value;
+    xla::ComputationDataHandle ta = resource->value();
 
     auto ta_dims = ta_shape.dim_sizes();
     std::vector<int64> shape(ta_dims.begin() + 1, ta_dims.end());
@@ -460,16 +508,17 @@ class TensorArraySplitOp : public XlaOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource));
     OP_REQUIRES_OK(ctx,
                    MaybeInitializeTensorArray(b, resource, dtype_, elem_shape));
-    xla::ComputationDataHandle ta = resource->value;
+    xla::ComputationDataHandle ta = resource->value();
 
     TensorShape ta_shape;
-    ta_shape.AddDim(resource->tensor_array_size);
+    ta_shape.AddDim(resource->tensor_array_size());
     ta_shape.AppendShape(elem_shape);
 
-    OP_REQUIRES(ctx, lengths.size() == resource->tensor_array_size,
-                errors::InvalidArgument(
-                    "TensorArray's size is not equal to the size of lengths (",
-                    lengths.size(), " vs. ", resource->tensor_array_size, ")"));
+    OP_REQUIRES(
+        ctx, lengths.size() == resource->tensor_array_size(),
+        errors::InvalidArgument(
+            "TensorArray's size is not equal to the size of lengths (",
+            lengths.size(), " vs. ", resource->tensor_array_size(), ")"));
 
     const xla::ComputationDataHandle value = ctx->Input(1);
     const xla::ComputationDataHandle flow = ctx->Input(3);
@@ -479,7 +528,8 @@ class TensorArraySplitOp : public XlaOpKernel {
                                         value_shape.DebugString(), " vs. ",
                                         ta_shape.DebugString()));
 
-    resource->value = b->Add(ta, b->Reshape(value, ta_shape.dim_sizes()));
+    OP_REQUIRES_OK(ctx, resource->SetValue(b->Add(
+                            ta, b->Reshape(value, ta_shape.dim_sizes()))));
 
     ctx->SetOutput(0, flow);
   }
@@ -490,7 +540,8 @@ class TensorArraySplitOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(TensorArraySplitOp);
 };
 
-REGISTER_XLA_OP(Name("TensorArraySplitV3"), TensorArraySplitOp);
+REGISTER_XLA_OP(Name("TensorArraySplitV3").CompileTimeConstInput("lengths"),
+                TensorArraySplitOp);
 
 class TensorArraySizeOp : public XlaOpKernel {
  public:
@@ -500,7 +551,8 @@ class TensorArraySizeOp : public XlaOpKernel {
     XlaResource* var;
     OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &var));
     Tensor size_tensor(DT_INT32, {});
-    size_tensor.scalar<int32>()() = static_cast<int32>(var->tensor_array_size);
+    size_tensor.scalar<int32>()() =
+        static_cast<int32>(var->tensor_array_size());
     ctx->SetConstantOutput(0, size_tensor);
   }
 
@@ -523,7 +575,7 @@ class TensorArrayGradOp : public XlaOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource));
 
     OP_REQUIRES_OK(
-        ctx, CheckTensorArrayIsInitialized(name(), resource, resource->type));
+        ctx, CheckTensorArrayIsInitialized(name(), resource, resource->type()));
     TensorShape ta_shape;
     OP_REQUIRES_OK(ctx, GetTensorArrayShape(resource, b, &ta_shape));
 
diff --git a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
index 9ee6bd892504e683a191484fb09259619759f36d..9aefcd4fc7f94a1dba1c56273c55d0b98fbbfaf2 100644
--- a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
@@ -122,7 +122,7 @@ class TileOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(TileOp);
 };
 
-REGISTER_XLA_OP(Name("Tile"), TileOp);
+REGISTER_XLA_OP(Name("Tile").CompileTimeConstInput("multiples"), TileOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/training_ops.cc b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
index 5534d1bfa1338c7fe3647cd6aa281c4907dfdf8c..f750f7003be288461f5f10455e58932d1b4e4524 100644
--- a/tensorflow/compiler/tf2xla/kernels/training_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
@@ -32,9 +32,24 @@ class ResourceApplyGradientDescent : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     xla::ComputationDataHandle handle;
     xla::ComputationBuilder* b = ctx->builder();
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &handle));
+    DataType type = ctx->input_type(1);
+    TensorShape var_shape;
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &handle));
+
+    TensorShape alpha_shape = ctx->InputShape(1);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(alpha_shape),
+                errors::InvalidArgument("alpha is not a scalar: ",
+                                        alpha_shape.DebugString()));
+
+    TensorShape delta_shape = ctx->InputShape(2);
+    OP_REQUIRES(
+        ctx, var_shape.IsSameSize(delta_shape),
+        errors::InvalidArgument("var and delta do not have the same shape: ",
+                                var_shape.DebugString(), " vs ",
+                                delta_shape.DebugString()));
+
     handle = b->Sub(handle, b->Mul(ctx->Input(1), ctx->Input(2)));
-    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, ctx->input_type(1), handle));
+    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, handle));
   }
 };
 REGISTER_XLA_OP(
@@ -52,18 +67,10 @@ class ResourceApplyMomentum : public XlaOpKernel {
 
     DataType type = ctx->input_type(2);
 
-    DataType var_type, accum_type;
     TensorShape var_shape, accum_shape;
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape));
-    OP_REQUIRES_OK(ctx,
-                   ctx->GetVariableTypeAndShape(1, &accum_type, &accum_shape));
-
-    OP_REQUIRES(
-        ctx, type == var_type && type == accum_type,
-        errors::InvalidArgument(
-            "Types of variable arguments to ResourceApplyMomentum must match: ",
-            DataTypeString(type), " vs. ", DataTypeString(var_type), " and ",
-            DataTypeString(accum_type)));
+    xla::ComputationDataHandle var, accum;
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &var));
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, type, &accum_shape, &accum));
 
     OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape),
                 errors::InvalidArgument(
@@ -86,10 +93,6 @@ class ResourceApplyMomentum : public XlaOpKernel {
                 errors::InvalidArgument("momentum is not a scalar: ",
                                         momentum_shape.DebugString()));
 
-    xla::ComputationDataHandle var, accum;
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var));
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &accum));
-
     xla::ComputationDataHandle lr = ctx->Input(2);
     xla::ComputationDataHandle grad = ctx->Input(3);
     xla::ComputationDataHandle momentum = ctx->Input(4);
@@ -122,18 +125,10 @@ class ResourceApplyAdagrad : public XlaOpKernel {
 
     DataType type = ctx->input_type(2);
 
-    DataType var_type, accum_type;
     TensorShape var_shape, accum_shape;
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape));
-    OP_REQUIRES_OK(ctx,
-                   ctx->GetVariableTypeAndShape(1, &accum_type, &accum_shape));
-
-    OP_REQUIRES(
-        ctx, type == var_type && type == accum_type,
-        errors::InvalidArgument(
-            "Types of variable arguments to ResourceApplyAdagrad must match: ",
-            DataTypeString(type), " vs. ", DataTypeString(var_type), " and ",
-            DataTypeString(accum_type)));
+    xla::ComputationDataHandle var, accum;
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &var));
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, type, &accum_shape, &accum));
 
     OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape),
                 errors::InvalidArgument(
@@ -151,9 +146,6 @@ class ResourceApplyAdagrad : public XlaOpKernel {
                     "var and grad do not have the same shape",
                     var_shape.DebugString(), " ", grad_shape.DebugString()));
 
-    xla::ComputationDataHandle var, accum;
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var));
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &accum));
     xla::ComputationDataHandle lr = ctx->Input(2);
     xla::ComputationDataHandle grad = ctx->Input(3);
 
@@ -175,18 +167,11 @@ class ResourceApplyAdam : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    DataType var_type, m_type, v_type;
     TensorShape var_shape, m_shape, v_shape;
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape));
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(1, &m_type, &m_shape));
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(2, &v_type, &v_shape));
-
-    OP_REQUIRES(
-        ctx, dtype_ == var_type && dtype_ == m_type && dtype_ == v_type,
-        errors::InvalidArgument(
-            "Types of variable arguments to ResourceApplyRMSProp must match: ",
-            DataTypeString(dtype_), " vs. ", DataTypeString(var_type), " vs. ",
-            DataTypeString(m_type), " vs. ", DataTypeString(v_type)));
+    xla::ComputationDataHandle var, m, v;
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype_, &var_shape, &var));
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, dtype_, &m_shape, &m));
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, dtype_, &v_shape, &v));
 
     TensorShape beta1_power_shape = ctx->InputShape(3);
     TensorShape beta2_power_shape = ctx->InputShape(4);
@@ -228,10 +213,6 @@ class ResourceApplyAdam : public XlaOpKernel {
                     "var and grad do not have the same shape",
                     var_shape.DebugString(), " ", grad_shape.DebugString()));
 
-    xla::ComputationDataHandle var, m, v;
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var));
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &m));
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, &v));
     xla::ComputationDataHandle beta1_power = ctx->Input(3);
     xla::ComputationDataHandle beta2_power = ctx->Input(4);
     xla::ComputationDataHandle lr = ctx->Input(5);
@@ -278,18 +259,11 @@ class ResourceApplyRMSProp : public XlaOpKernel {
 
     DataType type = ctx->input_type(3);
 
-    DataType var_type, ms_type, mom_type;
     TensorShape var_shape, ms_shape, mom_shape;
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape));
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(1, &ms_type, &ms_shape));
-    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(2, &mom_type, &mom_shape));
-
-    OP_REQUIRES(
-        ctx, type == var_type && type == ms_type && type == mom_type,
-        errors::InvalidArgument(
-            "Types of variable arguments to ResourceApplyRMSProp must match: ",
-            DataTypeString(type), " vs. ", DataTypeString(var_type), " vs. ",
-            DataTypeString(ms_type), " vs. ", DataTypeString(mom_type)));
+    xla::ComputationDataHandle var, ms, mom;
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &var));
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, type, &ms_shape, &ms));
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, type, &mom_shape, &mom));
 
     TensorShape lr_shape = ctx->InputShape(3);
     OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_shape),
@@ -323,10 +297,6 @@ class ResourceApplyRMSProp : public XlaOpKernel {
                     "var and grad do not have the same shape",
                     var_shape.DebugString(), " ", grad_shape.DebugString()));
 
-    xla::ComputationDataHandle var, ms, mom;
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var));
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &ms));
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, &mom));
     xla::ComputationDataHandle lr = ctx->Input(3);
     xla::ComputationDataHandle rho = ctx->Input(4);
     xla::ComputationDataHandle momentum = ctx->Input(5);
@@ -373,20 +343,11 @@ void CompileFtrl(XlaOpKernelContext* ctx, DataType dtype,
                  bool has_l2_shrinkage) {
   xla::ComputationBuilder* b = ctx->builder();
 
-  DataType var_type, accum_type, linear_type;
   TensorShape var_shape, accum_shape, linear_shape;
-  OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape));
-  OP_REQUIRES_OK(ctx,
-                 ctx->GetVariableTypeAndShape(1, &accum_type, &accum_shape));
-  OP_REQUIRES_OK(ctx,
-                 ctx->GetVariableTypeAndShape(2, &linear_type, &linear_shape));
-
-  OP_REQUIRES(
-      ctx, dtype == var_type && dtype == accum_type && dtype == linear_type,
-      errors::InvalidArgument(
-          "Types of variable arguments to ResourceApplyFtrlV2 must match: ",
-          DataTypeString(dtype), " vs. ", DataTypeString(var_type), " and ",
-          DataTypeString(accum_type), " and ", DataTypeString(linear_type)));
+  xla::ComputationDataHandle var, accum, linear;
+  OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype, &var_shape, &var));
+  OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, dtype, &accum_shape, &accum));
+  OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, dtype, &linear_shape, &linear));
 
   OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape),
               errors::InvalidArgument(
@@ -438,10 +399,6 @@ void CompileFtrl(XlaOpKernelContext* ctx, DataType dtype,
               errors::InvalidArgument("lr_power is not a scalar: ",
                                       lr_power_shape.DebugString()));
 
-  xla::ComputationDataHandle var, accum, linear;
-  OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var));
-  OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &accum));
-  OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, &linear));
   xla::ComputationDataHandle grad = ctx->Input(3);
   xla::ComputationDataHandle lr = ctx->Input(4);
   xla::ComputationDataHandle l1 = ctx->Input(5);
diff --git a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
index 2fc5d40d1059b868eef0a632071e7cccdecaf9f4..c167642174b328a968d7f7ce1f0ad6e0ab8a7a68 100644
--- a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
@@ -54,7 +54,8 @@ class TransposeOp : public XlaOpKernel {
     OP_REQUIRES_OK(ctx, ctx->ConstantInputReshaped(1, {dims}, &literal));
 
     std::vector<int32> perm(dims);
-    std::copy(literal.s32s().begin(), literal.s32s().end(), perm.begin());
+    std::copy(literal.data<int32>().begin(), literal.data<int32>().end(),
+              perm.begin());
 
     std::vector<int64> transposed_order;
     // Check whether permutation is a permutation of integers of [0 .. dims).
@@ -72,8 +73,9 @@ class TransposeOp : public XlaOpKernel {
       }
     }
     for (int i = 0; i < dims; ++i) {
-      OP_REQUIRES(ctx, bits[i], errors::InvalidArgument(
-                                    i, " is missing from 'perm' argument."));
+      OP_REQUIRES(
+          ctx, bits[i],
+          errors::InvalidArgument(i, " is missing from 'perm' argument."));
     }
 
     // 0-D, 1-D, and identity transposes do nothing.
@@ -87,7 +89,7 @@ class TransposeOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("Transpose"), TransposeOp);
+REGISTER_XLA_OP(Name("Transpose").CompileTimeConstInput("perm"), TransposeOp);
 
 // InvertPermutation frequently forms part of the gradient of Transpose.
 //
@@ -103,8 +105,9 @@ class InvertPermutationOp : public XlaOpKernel {
   explicit InvertPermutationOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
 
   void Compile(XlaOpKernelContext* ctx) override {
-    OP_REQUIRES(ctx, FastBoundsCheck(ctx->InputShape(0).num_elements(),
-                                     std::numeric_limits<int32>::max()),
+    OP_REQUIRES(ctx,
+                FastBoundsCheck(ctx->InputShape(0).num_elements(),
+                                std::numeric_limits<int32>::max()),
                 errors::InvalidArgument("permutation of nonnegative int32s "
                                         "must have <= int32 max elements"));
 
@@ -128,7 +131,9 @@ class InvertPermutationOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("InvertPermutation").TypeConstraint("T", DT_INT32),
+REGISTER_XLA_OP(Name("InvertPermutation")
+                    .TypeConstraint("T", DT_INT32)
+                    .CompileTimeConstInput("x"),
                 InvertPermutationOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
index a266e9013c41b88788dbc99849f01c09f3d61348..0c5ad9e5255ffc3dfcfb83335060ae833937b3ce 100644
--- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
@@ -50,18 +50,41 @@ XLAJIT_MAKE_UNARY(Conj, b->Conj(x));
 // Return x if x>0, otherwise -x.
 XLAJIT_MAKE_UNARY(Abs, b->Abs(x));
 
+// acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x))
+XLAJIT_MAKE_UNARY(
+    Acos,
+    b->Mul(XlaHelpers::FloatLiteral(b, input_type(0), 2.0),
+           b->Atan2(b->Pow(b->Sub(XlaHelpers::One(b, input_type(0)),
+                                  b->Mul(x, x)),
+                           XlaHelpers::FloatLiteral(b, input_type(0), 0.5)),
+                    b->Add(XlaHelpers::One(b, input_type(0)), x))));
+
 // acosh(x) = log(x + sqrt(x^2 - 1))
 XLAJIT_MAKE_UNARY(
     Acosh,
     b->Log(b->Add(x, b->Pow(b->Sub(b->Mul(x, x),
                                    XlaHelpers::One(b, input_type(0))),
                             XlaHelpers::FloatLiteral(b, input_type(0), 0.5)))));
+
+// asin(x) = 2 * atan(x / (1 + sqrt(1 - x^2)))
+XLAJIT_MAKE_UNARY(
+    Asin,
+    b->Mul(XlaHelpers::FloatLiteral(b, input_type(0), 2.0),
+           b->Atan2(x, b->Add(XlaHelpers::One(b, input_type(0)),
+                              b->Pow(b->Sub(XlaHelpers::One(b, input_type(0)),
+                                            b->Mul(x, x)),
+                                     XlaHelpers::FloatLiteral(b, input_type(0),
+                                                              0.5))))));
+
 // asinh(x) = log(x + sqrt(x^2 + 1))
 XLAJIT_MAKE_UNARY(
     Asinh,
     b->Log(b->Add(x, b->Pow(b->Add(b->Mul(x, x),
                                    XlaHelpers::One(b, input_type(0))),
                             XlaHelpers::FloatLiteral(b, input_type(0), 0.5)))));
+
+XLAJIT_MAKE_UNARY(Atan, b->Atan2(x, XlaHelpers::One(b, input_type(0))));
+
 // atanh(x) = 0.5 * log((1 + x) / (1 - x))
 XLAJIT_MAKE_UNARY(
     Atanh, b->Mul(b->Log(b->Div(b->Add(XlaHelpers::One(b, input_type(0)), x),
diff --git a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
index b19ea22f50d2dd44e8d1d81f5930263f364030e1..71173f5aead47702f0ed9e95b827a6fefd9b7efd 100644
--- a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h"
 #include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h"
+#include "tensorflow/compiler/tf2xla/kernels/shape_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
@@ -22,6 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/no_op.h"
 
 namespace tensorflow {
@@ -31,21 +33,29 @@ class VarIsInitializedOp : public XlaOpKernel {
  public:
   explicit VarIsInitializedOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
   void Compile(XlaOpKernelContext* ctx) override {
-    xla::ComputationDataHandle handle;
-    bool initialized = ctx->ReadVariableInput(0, &handle).ok();
-    ctx->SetOutput(0, ctx->builder()->ConstantR0<bool>(initialized));
+    XlaResource* variable;
+    OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &variable));
+    ctx->SetOutput(0,
+                   ctx->builder()->ConstantR0<bool>(variable->initialized()));
   }
 };
 REGISTER_XLA_OP(Name("VarIsInitializedOp"), VarIsInitializedOp);
 
 class ReadVariableOp : public XlaOpKernel {
  public:
-  explicit ReadVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  explicit ReadVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
+  }
+
   void Compile(XlaOpKernelContext* ctx) override {
     xla::ComputationDataHandle handle;
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &handle));
+    OP_REQUIRES_OK(
+        ctx, ctx->ReadVariableInput(0, dtype_, /*shape=*/nullptr, &handle));
     ctx->SetOutput(0, handle);
   }
+
+ private:
+  DataType dtype_;
 };
 REGISTER_XLA_OP(Name("ReadVariableOp"), ReadVariableOp);
 
@@ -63,10 +73,12 @@ class AssignAddVariableOp : public XlaOpKernel {
  public:
   explicit AssignAddVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
   void Compile(XlaOpKernelContext* ctx) override {
+    DataType type = ctx->input_type(1);
     xla::ComputationDataHandle handle;
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &handle));
+    OP_REQUIRES_OK(ctx,
+                   ctx->ReadVariableInput(0, type, /*shape=*/nullptr, &handle));
     handle = ctx->builder()->Add(handle, ctx->Input(1));
-    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, ctx->input_type(1), handle));
+    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, handle));
   }
 };
 REGISTER_XLA_OP(
@@ -77,10 +89,12 @@ class AssignSubVariableOp : public XlaOpKernel {
  public:
   explicit AssignSubVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
   void Compile(XlaOpKernelContext* ctx) override {
+    DataType type = ctx->input_type(1);
     xla::ComputationDataHandle handle;
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &handle));
+    OP_REQUIRES_OK(ctx,
+                   ctx->ReadVariableInput(0, type, /*shape=*/nullptr, &handle));
     handle = ctx->builder()->Sub(handle, ctx->Input(1));
-    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, ctx->input_type(1), handle));
+    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, handle));
   }
 };
 REGISTER_XLA_OP(
@@ -93,33 +107,47 @@ class ResourceGatherOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     xla::ComputationBuilder* builder = ctx->builder();
 
-    // Get the shape of the resource tensor.
-    TensorShape resource_shape;
-    DataType resource_dtype;
-    OP_REQUIRES_OK(
-        ctx, ctx->GetVariableTypeAndShape(0, &resource_dtype, &resource_shape));
-
-    DataType expected_output_dtype = ctx->expected_output_dtype(0);
-    OP_REQUIRES(ctx, resource_dtype == expected_output_dtype,
-                errors::InvalidArgument(
-                    "Variable dtype is ", DataTypeString(resource_dtype),
-                    " but expected output dtype is ",
-                    DataTypeString(expected_output_dtype), "."));
+    DataType type = ctx->expected_output_dtype(0);
 
+    TensorShape resource_shape;
     xla::ComputationDataHandle resource_handle;
-    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &resource_handle));
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &resource_shape,
+                                               &resource_handle));
 
     auto indices = ctx->Input(1);
     auto indices_shape = ctx->InputShape(1);
     DataType index_type = ctx->input_type(1);
-    xla::ComputationDataHandle gather = XlaComputeGatherDynamicSlice(
-        ctx, resource_handle, resource_shape, indices, indices_shape, 0,
-        resource_dtype, index_type, builder);
+    xla::ComputationDataHandle gather;
+    OP_REQUIRES_OK(
+        ctx, XlaGather(resource_handle, resource_shape, indices, indices_shape,
+                       /*axis=*/0, /*indices_are_nd=*/false, type, index_type,
+                       builder, &gather));
     ctx->SetOutput(0, gather);
   }
 };
 REGISTER_XLA_OP(Name("ResourceGather").TypeConstraint("dtype", kNumericTypes),
                 ResourceGatherOp);
 
+class VariableShapeOp : public XlaOpKernel {
+ public:
+  explicit VariableShapeOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("out_type", &out_dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    DataType variable_dtype;
+    TensorShape shape;
+    OP_REQUIRES_OK(ctx,
+                   ctx->GetVariableTypeAndShape(0, &variable_dtype, &shape));
+    Tensor shape_constant(out_dtype_, TensorShape({shape.dims()}));
+    OP_REQUIRES_OK(ctx, TensorShapeToConstant(shape, &shape_constant));
+    ctx->SetConstantOutput(0, shape_constant);
+  }
+
+ private:
+  DataType out_dtype_;
+};
+
+REGISTER_XLA_OP(Name("VariableShape"), VariableShapeOp);
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc
index ead26478ff2a3a1302e95e4ee5dbbf366b04efc6..0ff1b65ae9179d506e453f98097cd88083eb2be7 100644
--- a/tensorflow/compiler/tf2xla/kernels/while_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc
@@ -39,7 +39,7 @@ Status MakeXlaCompilerArgumentsFromInputs(
   *has_uninitialized_vars = false;
   *has_tensor_arrays = false;
   for (int i = 0; i < ctx->num_inputs(); ++i) {
-    VLOG(2) << "  Input " << i
+    VLOG(2) << " Input " << i
             << " type: " << DataTypeString(ctx->input_type(i))
             << " shape: " << ctx->InputShape(i).DebugString();
     XlaCompiler::Argument& arg = (*args)[i];
@@ -50,34 +50,32 @@ Status MakeXlaCompilerArgumentsFromInputs(
       XlaResource* resource;
       TF_RETURN_IF_ERROR(ctx->GetResourceInput(i, &resource));
 
-      arg.initialized = resource->value.handle() > 0;
+      arg.initialized = resource->initialized();
       arg.kind = XlaCompiler::Argument::kResource;
-      arg.resource_kind = resource->kind;
+      arg.resource_kind = resource->kind();
       if (arg.resource_kind == XlaResource::kTensorArray) {
         *has_tensor_arrays = true;
       }
 
-      arg.type = resource->type;
-      if (arg.initialized) {
-        TF_RETURN_IF_ERROR(resource->PackedShape(ctx->builder(), &arg.shape));
-      } else {
+      arg.type = resource->type();
+      arg.shape = resource->shape();
+      if (!arg.initialized) {
         *has_uninitialized_vars = true;
       }
-      arg.tensor_array_size = resource->tensor_array_size;
-      for (const auto& gradient : resource->tensor_array_gradients) {
+      arg.tensor_array_size = resource->tensor_array_size();
+      for (const auto& gradient : resource->tensor_array_gradients()) {
         arg.tensor_array_gradients.insert(gradient.first);
       }
-      arg.name = resource->name;
-      VLOG(2) << "    resource " << resource->name
+      arg.name = resource->name();
+      VLOG(2) << "    resource " << resource->name()
               << " type: " << DataTypeString(arg.type)
-              << " shape: " << xla::ShapeUtil::HumanString(arg.shape)
+              << " shape: " << arg.shape.DebugString()
               << " initialized: " << arg.initialized;
 
     } else {
       arg.kind = XlaCompiler::Argument::kParameter;
       arg.type = ctx->input_type(i);
-      TF_RETURN_IF_ERROR(
-          TensorShapeToXLAShape(arg.type, ctx->InputShape(i), &arg.shape));
+      arg.shape = ctx->InputShape(i);
     }
   }
   return Status::OK();
@@ -120,6 +118,7 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
   body_options.use_tuple_arg = true;
   body_options.return_updated_values_for_all_resources = true;
   body_options.resolve_compile_time_constants = false;
+  body_options.is_entry_computation = false;
   XlaCompiler::CompilationResult body;
   OP_REQUIRES_OK(ctx, compiler->CompileFunction(body_options, body_name_attr_,
                                                 arguments, &body));
@@ -153,22 +152,20 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
       XlaCompiler::Argument& arg = arguments[update.input_index];
       if (!arg.initialized) {
         VLOG(2) << "Update shape for argument " << update.input_index << " "
-                << xla::ShapeUtil::HumanString(update.shape);
+                << update.shape.DebugString();
         arg.initialized = true;
 
-        xla::Shape shape = update.shape;
-        if (!update.tensor_array_gradients_accessed.empty()) {
-          shape = xla::ShapeUtil::GetTupleElementShape(shape, 0);
-        }
-        std::unique_ptr<xla::Literal> zero =
-            xla::Literal::CreateFromShape(shape);
-        resource->value = builder->ConstantLiteral(*zero);
+        arg.shape = update.shape;
+        OP_REQUIRES_OK(ctx,
+                       resource->SetTypeAndShape(update.type, update.shape));
+
+        OP_REQUIRES_OK(ctx, resource->SetZeroValue(builder));
       }
 
       // Add any TensorArray gradients touched by the body to the enclosing
       // graph.
       for (const string& grad_source : update.tensor_array_gradients_accessed) {
-        VLOG(4) << "TensorArray " << resource->name << " accessed gradient "
+        VLOG(4) << "TensorArray " << resource->name() << " accessed gradient "
                 << grad_source;
         XlaResource* gradient;
         OP_REQUIRES_OK(ctx, resource->GetOrCreateTensorArrayGradient(
@@ -177,12 +174,9 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
 
       // Add all of the TensorArray gradients to the argument. For simplicity,
       // we always pass all known gradients.
-      for (const auto& gradient : resource->tensor_array_gradients) {
+      for (const auto& gradient : resource->tensor_array_gradients()) {
         arg.tensor_array_gradients.insert(gradient.first);
       }
-
-      // Recompute the argument shape.
-      OP_REQUIRES_OK(ctx, resource->PackedShape(ctx->builder(), &arg.shape));
     }
     // Recompile the body with the "correct" resource shapes.
     VLOG(1) << "Recompiling body with corrected resource shapes";
@@ -196,14 +190,21 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
   XlaCompiler::CompileOptions cond_options;
   cond_options.use_tuple_arg = true;
   cond_options.resolve_compile_time_constants = false;
+  cond_options.is_entry_computation = false;
   XlaCompiler::CompilationResult cond;
   OP_REQUIRES_OK(ctx, compiler->CompileFunction(cond_options, cond_name_attr_,
                                                 arguments, &cond));
 
-  xla::Shape body_input_shape =
-      xla::ShapeUtil::MakeTupleShape(body.xla_input_shapes);
-  xla::Shape cond_input_shape =
-      xla::ShapeUtil::MakeTupleShape(cond.xla_input_shapes);
+  OP_REQUIRES(ctx, body.xla_input_shapes.size() == 1,
+              errors::FailedPrecondition("Expected one input shape"));
+  xla::Shape body_input_shape = body.xla_input_shapes[0];
+  OP_REQUIRES(ctx, xla::ShapeUtil::IsTuple(body_input_shape),
+              errors::FailedPrecondition("Expected tuple shape"));
+  OP_REQUIRES(ctx, cond.xla_input_shapes.size() == 1,
+              errors::FailedPrecondition("Expected one input shape"));
+  xla::Shape cond_input_shape = cond.xla_input_shapes[0];
+  OP_REQUIRES(ctx, xla::ShapeUtil::IsTuple(cond_input_shape),
+              errors::FailedPrecondition("Expected tuple shape"));
 
   VLOG(2) << "Body shape: " << xla::ShapeUtil::HumanString(body_input_shape)
           << " -> " << xla::ShapeUtil::HumanString(body.xla_output_shape);
@@ -286,9 +287,9 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
                          builder->GetTupleElement(while_result, pos), builder));
     }
     VLOG(2) << "Loop-carried variable: pos: " << update.input_index
-            << " name: " << resource->name << " modified: " << update.modified
+            << " name: " << resource->name() << " modified: " << update.modified
             << " type: " << DataTypeString(update.type)
-            << " shape: " << xla::ShapeUtil::HumanString(update.shape);
+            << " shape: " << update.shape.DebugString();
     // Copies the identity of the resource variable from input to output
     // unchanged, even if the variable was not modified.
     ctx->op_kernel_context()->set_output(
diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD
index 21ad21f73737a289390ed1ea767db1078d05b466..488fda74bf7b5c1d66f8d706a1be3cc1fc29a492 100644
--- a/tensorflow/compiler/tf2xla/lib/BUILD
+++ b/tensorflow/compiler/tf2xla/lib/BUILD
@@ -49,6 +49,25 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "scatter",
+    srcs = ["scatter.cc"],
+    hdrs = ["scatter.h"],
+    deps = [
+        ":util",
+        ":while_loop",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/client:computation",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "triangular_solve",
     srcs = ["triangular_solve.cc"],
@@ -60,6 +79,8 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/core:lib",
@@ -105,6 +126,21 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "while_loop",
+    srcs = ["while_loop.cc"],
+    hdrs = ["while_loop.h"],
+    deps = [
+        ":util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla/client:computation",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/core:lib",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 
 filegroup(
diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.cc b/tensorflow/compiler/tf2xla/lib/batch_dot.cc
index 28a5e6a58bb312f4c4821bcce484a08160009d56..798f0fa78055e800038e8bf41b4f410b670be7dd 100644
--- a/tensorflow/compiler/tf2xla/lib/batch_dot.cc
+++ b/tensorflow/compiler/tf2xla/lib/batch_dot.cc
@@ -25,12 +25,10 @@ limitations under the License.
 
 namespace tensorflow {
 
-// The current implementation simply unrolls the computation along the batch
-// dimension.
-// TODO(andydavis): add batching support to XLA's Dot operator.
 xla::StatusOr<xla::ComputationDataHandle> BatchDot(
     xla::ComputationBuilder* builder, xla::ComputationDataHandle x,
-    xla::ComputationDataHandle y, bool transpose_x, bool transpose_y) {
+    xla::ComputationDataHandle y, bool transpose_x, bool transpose_y,
+    bool conjugate_x, bool conjugate_y) {
   TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> x_shape,
                       builder->GetShape(x));
   TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> y_shape,
@@ -52,26 +50,20 @@ xla::StatusOr<xla::ComputationDataHandle> BatchDot(
 
   // The batch dimensions must be equal and the matrix dimensions must be
   // valid.
-  std::vector<int64> dimensions;
-  int64 batch_count = 1;
+  std::vector<int64> batch_dimension_numbers;
   for (int i = 0; i < ndims - 2; ++i) {
-    int64 x_size = x_shape->dimensions(i);
-    int64 y_size = y_shape->dimensions(i);
-    if (x_size != y_size) {
+    if (x_shape->dimensions(i) != y_shape->dimensions(i)) {
       return errors::InvalidArgument(
           "Dimension ", i, " of inputs to BatchedDot must be equal: ",
           xla::ShapeUtil::HumanString(*x_shape), " vs ",
           xla::ShapeUtil::HumanString(*y_shape));
     }
-    dimensions.push_back(x_size);
-    batch_count *= x_size;
+    batch_dimension_numbers.push_back(i);
   }
 
   int x_inner_dim = transpose_x ? (ndims - 2) : (ndims - 1);
   int y_inner_dim = transpose_y ? (ndims - 1) : (ndims - 2);
-  int64 x_inner_dim_size = x_shape->dimensions(x_inner_dim);
-  int64 y_inner_dim_size = y_shape->dimensions(y_inner_dim);
-  if (x_inner_dim_size != y_inner_dim_size) {
+  if (x_shape->dimensions(x_inner_dim) != y_shape->dimensions(y_inner_dim)) {
     return errors::InvalidArgument(
         "Dimensions ", x_inner_dim, " and ", y_inner_dim,
         " of arguments to BatchedDot must be equal: ",
@@ -80,75 +72,46 @@ xla::StatusOr<xla::ComputationDataHandle> BatchDot(
         " transpose: ", transpose_y);
   }
 
-  // If there are no batch dimensions, use a regular Dot. This case exists
-  // to improve the readability of the emitted graphs.
-  if (dimensions.empty()) {
-    auto lhs = transpose_x ? builder->Transpose(x, {1, 0}) : x;
-    auto rhs = transpose_y ? builder->Transpose(y, {1, 0}) : y;
-    return builder->Dot(lhs, rhs);
+  // Check for zero lhs/rhs dim size.
+  if (xla::ShapeUtil::HasZeroElements(*x_shape) ||
+      xla::ShapeUtil::HasZeroElements(*y_shape)) {
+    std::vector<int64> dimensions(batch_dimension_numbers.size());
+    for (int i = 0; i < batch_dimension_numbers.size(); ++i) {
+      dimensions[i] = x_shape->dimensions(batch_dimension_numbers[i]);
+    }
+    int x_outer_dim = transpose_x ? (ndims - 1) : (ndims - 2);
+    int y_outer_dim = transpose_y ? (ndims - 2) : (ndims - 1);
+    dimensions.push_back(x_shape->dimensions(x_outer_dim));
+    dimensions.push_back(y_shape->dimensions(y_outer_dim));
+    return builder->Broadcast(
+        builder->ConstantLiteral(xla::Literal::Zero(x_shape->element_type())),
+        dimensions);
   }
 
-  int x_outer_dim = transpose_x ? (ndims - 1) : (ndims - 2);
-  int y_outer_dim = transpose_y ? (ndims - 2) : (ndims - 1);
-  dimensions.push_back(x_shape->dimensions(x_outer_dim));
-  dimensions.push_back(y_shape->dimensions(y_outer_dim));
-
-  if (x_shape->element_type() == xla::C64 && transpose_x) {
+  if (x_shape->element_type() == xla::C64 && conjugate_x) {
     x = builder->Conj(x);
   }
-  if (y_shape->element_type() == xla::C64 && transpose_y) {
+  if (y_shape->element_type() == xla::C64 && conjugate_y) {
     y = builder->Conj(y);
   }
 
-  // Reshape input tensors into 3D tensors by flattening the batch
-  // dimensions. This makes it easier to unroll the batch dimension.
-  auto x_flat =
-      builder->Reshape(x, {batch_count, x_shape->dimensions(ndims - 2),
-                           x_shape->dimensions(ndims - 1)});
-  auto y_flat =
-      builder->Reshape(y, {batch_count, y_shape->dimensions(ndims - 2),
-                           y_shape->dimensions(ndims - 1)});
-
-  // Slice batches into individual matrices and multiply them.
-  std::vector<xla::ComputationDataHandle> out_slices;
-  for (int64 i = 0; i < batch_count; ++i) {
-    // Slice off individual matrices and reshape to 2D tensors.
-    auto x_slice = builder->Slice(
-        x_flat, {i, 0, 0},
-        {i + 1, x_shape->dimensions(ndims - 2), x_shape->dimensions(ndims - 1)},
-        {1, 1, 1});
-    x_slice = builder->Reshape(x_slice, {x_shape->dimensions(ndims - 2),
-                                         x_shape->dimensions(ndims - 1)});
-    auto y_slice = builder->Slice(
-        y_flat, {i, 0, 0},
-        {i + 1, y_shape->dimensions(ndims - 2), y_shape->dimensions(ndims - 1)},
-        {1, 1, 1});
-    y_slice = builder->Reshape(y_slice, {y_shape->dimensions(ndims - 2),
-                                         y_shape->dimensions(ndims - 1)});
-
-    // Transpose if needed.
-    auto lhs = transpose_x ? builder->Transpose(x_slice, {1, 0}) : x_slice;
-    auto rhs = transpose_y ? builder->Transpose(y_slice, {1, 0}) : y_slice;
-
-    // Multiply matrices and add an outer singleton dimension to the output
-    // so we can concatenate along the flattened batch dimension later.
-    auto out = builder->Dot(lhs, rhs);
-    out = builder->Reshape(out,
-                           {1, dimensions[ndims - 2], dimensions[ndims - 1]});
-    out_slices.push_back(out);
+  // If there are no batch dimensions, use a regular Dot.
+  // TODO(b/69062148) Remove this code when Dot emitters can be passed
+  // dimensions to transpose directly (i.e. without requiring a Transpose HLO).
+  if (batch_dimension_numbers.empty()) {
+    auto lhs = transpose_x ? builder->Transpose(x, {1, 0}) : x;
+    auto rhs = transpose_y ? builder->Transpose(y, {1, 0}) : y;
+    return builder->Dot(lhs, rhs);
   }
 
-  // Concatenate output slices and reshape to original number of dimensions.
-  xla::ComputationDataHandle data;
-  if (out_slices.empty()) {
-    // It is illegal to pass an empty list to ConcatInDim.
-    // The batch count is empty, so both inputs must have zero elements.
-    // Arbitrarily use the left input as the argument to Reshape().
-    data = x;
-  } else {
-    data = builder->ConcatInDim(out_slices, 0);
+  xla::DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(x_inner_dim);
+  dot_dnums.add_rhs_contracting_dimensions(y_inner_dim);
+  for (auto batch_dimension_number : batch_dimension_numbers) {
+    dot_dnums.add_lhs_batch_dimensions(batch_dimension_number);
+    dot_dnums.add_rhs_batch_dimensions(batch_dimension_number);
   }
-  return builder->Reshape(data, dimensions);
+  return builder->DotGeneral(x, y, dot_dnums);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.h b/tensorflow/compiler/tf2xla/lib/batch_dot.h
index b46bc7417d29dc5b7e9649ac28cc78b57d4b619c..b230e885f10f45a78cdd6e455da3ba55ce589b96 100644
--- a/tensorflow/compiler/tf2xla/lib/batch_dot.h
+++ b/tensorflow/compiler/tf2xla/lib/batch_dot.h
@@ -27,7 +27,10 @@ namespace tensorflow {
 // viewed as an element of a batch), and arranges the individual results
 // in a single output tensor of the same batch size. Each of the
 // individual slices can optionally be transposed before multiplication by
-// setting the `transpose_x` or `transpose_y` flag to `true`.
+// setting the `transpose_x` or `transpose_y` flag to `true`. Similarly, each
+// can be elementwise-complex-conjugated by setting the `conjugate_x` or
+// `conjugate_y` flag to `true`. To apply a Hermitian adjoint to `x`, set both
+// `transpose_x` and `conjugate_x` to `true`, and analogously for `y`.
 //
 // The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
 // and `[..., r_y, c_y]`.
@@ -40,11 +43,10 @@ namespace tensorflow {
 // It is computed as:
 //
 //     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
-// TODO(phawkins): add an option to take the complex conjugate of the LHS or
-// RHS.
 xla::StatusOr<xla::ComputationDataHandle> BatchDot(
     xla::ComputationBuilder* builder, xla::ComputationDataHandle x,
-    xla::ComputationDataHandle y, bool transpose_x, bool transpose_y);
+    xla::ComputationDataHandle y, bool transpose_x, bool transpose_y,
+    bool conjugate_x = false, bool conjugate_y = false);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.cc b/tensorflow/compiler/tf2xla/lib/cholesky.cc
index b3cc489adf6042acb3f56b3a0a6c8fbe43bde629..e795701181dd80a2ff544743d513bffd52fd2399 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.cc
+++ b/tensorflow/compiler/tf2xla/lib/cholesky.cc
@@ -71,11 +71,14 @@ xla::StatusOr<xla::ComputationDataHandle> CholeskyUnblocked(
                           SliceInMinorDims(builder, l, {j + 1, 0}, {n, j}));
       TF_ASSIGN_OR_RETURN(auto r_squared,
                           BatchDot(builder, r, r, /*transpose_x=*/false,
-                                   /*transpose_y=*/true));
+                                   /*transpose_y=*/true, /*conjugate_x=*/false,
+                                   /*conjugate_y=*/false));
       new_d_squared = builder->Sub(new_d_squared, r_squared);
 
       TF_ASSIGN_OR_RETURN(br, BatchDot(builder, b, r, /*transpose_x=*/false,
-                                       /*transpose_y=*/true));
+                                       /*transpose_y=*/true,
+                                       /*conjugate_x=*/false,
+                                       /*conjugate_y=*/false));
     }
     auto new_d_inv = builder->Pow(
         new_d_squared, FloatLiteral(builder, shape->element_type(), -0.5));
@@ -134,7 +137,8 @@ xla::StatusOr<xla::ComputationDataHandle> Cholesky(
                           SliceInMinorDims(builder, l, {i, 0}, {i + k, i}));
       TF_ASSIGN_OR_RETURN(auto delta,
                           BatchDot(builder, lhs, rhs, /*transpose_x=*/false,
-                                   /*transpose_y=*/true));
+                                   /*transpose_y=*/true, /*conjugate_x=*/false,
+                                   /*conjugate_y=*/false));
       TF_ASSIGN_OR_RETURN(auto before,
                           SliceInMinorDims(builder, a, {i, i}, {n, i + k}));
       TF_ASSIGN_OR_RETURN(
@@ -155,6 +159,10 @@ xla::StatusOr<xla::ComputationDataHandle> Cholesky(
                           SliceInMinorDims(builder, a, {i + k, i}, {n, i + k}));
       TF_ASSIGN_OR_RETURN(auto update,
                           TriangularSolve(builder, factorized, panel,
+                                          /*left_side=*/false,
+                                          /*lower=*/true,
+                                          /*transpose_a=*/true,
+                                          /*conjugate_a=*/false,
                                           /*block_size=*/8));
       TF_ASSIGN_OR_RETURN(
           l, UpdateSliceInMinorDims(builder, l, update, {i + k, i}));
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.h b/tensorflow/compiler/tf2xla/lib/cholesky.h
index 2bead7359baaf3582c1230adf0cd4a90046859d2..e083a383be4be0d1b556b63214fe5f70323b4149 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.h
+++ b/tensorflow/compiler/tf2xla/lib/cholesky.h
@@ -29,6 +29,7 @@ namespace tensorflow {
 // the block size to use.
 // TODO(phawkins): check for negative values on the diagonal and return an
 // error, instead of silently yielding NaNs.
+// TODO(mattjj): handle the complex Hermitian case
 xla::StatusOr<xla::ComputationDataHandle> Cholesky(
     xla::ComputationBuilder* builder, xla::ComputationDataHandle a,
     int64 block_size = 256);
diff --git a/tensorflow/compiler/tf2xla/lib/scatter.cc b/tensorflow/compiler/tf2xla/lib/scatter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6009243f9774eea24e8049e2bd50fe32f291132f
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/lib/scatter.cc
@@ -0,0 +1,189 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/lib/scatter.h"
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/tf2xla/lib/util.h"
+#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+
+namespace tensorflow {
+
+xla::StatusOr<xla::ComputationDataHandle> XlaScatter(
+    const xla::ComputationDataHandle& buffer,
+    const xla::ComputationDataHandle& updates,
+    const xla::ComputationDataHandle& indices, bool indices_are_vectors,
+    const std::function<xla::ComputationDataHandle(
+        xla::ComputationDataHandle, xla::ComputationDataHandle,
+        xla::ComputationBuilder*)>& combiner,
+    xla::ComputationBuilder* builder) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> buffer_shape,
+                      builder->GetShape(buffer));
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> updates_shape,
+                      builder->GetShape(updates));
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> indices_shape,
+                      builder->GetShape(indices));
+  gtl::ArraySlice<int64> indices_dims =
+      xla::AsInt64Slice(indices_shape->dimensions());
+  gtl::ArraySlice<int64> buffer_dims =
+      xla::AsInt64Slice(buffer_shape->dimensions());
+
+  // If the indices are N-dimensional, the minor dimension of indices contains
+  // the indices to update. Otherwise the indices are all scalars.
+  int64 num_index_dims = 1;
+  if (indices_are_vectors) {
+    TF_RET_CHECK(!indices_dims.empty());
+    num_index_dims = indices_dims.back();
+    if (num_index_dims > xla::ShapeUtil::Rank(*buffer_shape)) {
+      return errors::InvalidArgument(
+          "The size of the minor dimension of the indices (shape: ",
+          xla::ShapeUtil::HumanString(*indices_shape),
+          ") must be <= the rank of the buffer (shape: ",
+          xla::ShapeUtil::HumanString(*buffer_shape), ")");
+    }
+    indices_dims.pop_back();
+  }
+
+  int64 num_indices = 1;
+  for (int64 dim : indices_dims) {
+    num_indices *= dim;
+  }
+
+  // Degenerate case: nothing to update. Return the buffer unchanged.
+  if (num_indices == 0) {
+    return buffer;
+  }
+
+  // If any of the indexed dimensions are zero in the buffer, the update cannot
+  // succeed since it updates a slice of size 1.
+  for (int64 i = 0; i < num_index_dims; ++i) {
+    if (xla::ShapeUtil::GetDimension(*buffer_shape, i) == 0) {
+      return errors::InvalidArgument(
+          "Scatter dimension ", i, " is of size zero in tensor with shape ",
+          xla::ShapeUtil::HumanString(*buffer_shape));
+    }
+  }
+
+  // Shape of the non-indexed dimensions of the buffer.
+  std::vector<int64> buffer_shape_post_axes(
+      buffer_dims.begin() + num_index_dims, buffer_dims.end());
+
+  // Flatten the major dimensions of indices and updates into a single dimension
+  // for ease of iteration.
+  std::vector<int64> flat_indices_shape({num_indices});
+  if (indices_are_vectors) {
+    flat_indices_shape.push_back(num_index_dims);
+  }
+
+  std::vector<int64> flat_updates_shape({num_indices});
+  flat_updates_shape.insert(flat_updates_shape.end(),
+                            buffer_shape_post_axes.begin(),
+                            buffer_shape_post_axes.end());
+
+  // Construct the initial values of the loop-carried Tensors.
+  auto flat_indices = builder->Reshape(indices, flat_indices_shape);
+  auto flat_updates = builder->Reshape(updates, flat_updates_shape);
+  auto init = {flat_indices, flat_updates, buffer};
+
+  // Constructs the loop body. The implementation of scatter is essentially:
+  // for i in range(num_indices):
+  //   index = dynamic-slice(indices, i)
+  //   update = dynamic-slice(updates, i)
+  //   buffer = dynamic-update-slice(buffer, update, index)
+  auto body_fn = [&](xla::ComputationDataHandle i,
+                     gtl::ArraySlice<xla::ComputationDataHandle> loop_vars,
+                     xla::ComputationBuilder* body_builder) {
+    auto indices = loop_vars[0];
+    auto updates = loop_vars[1];
+    auto buffer = loop_vars[2];
+
+    auto zero_index = body_builder->ConstantLiteral(
+        xla::Literal::Zero(indices_shape->element_type()));
+
+    // Slice the i-th index from the indices array.
+    xla::ComputationDataHandle index;
+    auto indices_offset = body_builder->Reshape(i, {1});
+    if (indices_are_vectors) {
+      indices_offset = body_builder->Pad(indices_offset, zero_index,
+                                         xla::MakeEdgePaddingConfig({{0, 1}}));
+
+      index = body_builder->DynamicSlice(indices, indices_offset,
+                                         {1, num_index_dims});
+      index = body_builder->Collapse(index, {0, 1});
+    } else {
+      index = body_builder->DynamicSlice(indices, indices_offset, {1});
+    }
+
+    // Discard updates with negative indices, since some users expect this.
+    auto index_in_range =
+        body_builder->ReduceAll(body_builder->Le(zero_index, index),
+                                body_builder->ConstantR0<bool>(true),
+                                xla::CreateScalarAndComputation(body_builder));
+
+    index = body_builder->Pad(
+        index, zero_index,
+        xla::MakeEdgePaddingConfig({{0, buffer_shape_post_axes.size()}}));
+
+    // Slice the i-th index from the updates array.
+    auto updates_offset = body_builder->Reshape(i, {1});
+    updates_offset = body_builder->Pad(
+        updates_offset, zero_index,
+        xla::MakeEdgePaddingConfig({{0, buffer_shape_post_axes.size()}}));
+    std::vector<int64> flat_updates_slice_shape({1});
+    flat_updates_slice_shape.insert(flat_updates_slice_shape.end(),
+                                    buffer_shape_post_axes.begin(),
+                                    buffer_shape_post_axes.end());
+    auto update = body_builder->DynamicSlice(updates, updates_offset,
+                                             flat_updates_slice_shape);
+
+    // Unflatten the major (iteration) dimensions of the slice to their original
+    // shape.
+    std::vector<int64> updates_slice_shape(num_index_dims, 1);
+    updates_slice_shape.insert(updates_slice_shape.end(),
+                               buffer_shape_post_axes.begin(),
+                               buffer_shape_post_axes.end());
+    update = body_builder->Reshape(update, updates_slice_shape);
+
+    // Apply the update to the buffer. If there is a combiner, use it to merge
+    // the current values with the update.
+    if (combiner) {
+      auto current_value =
+          body_builder->DynamicSlice(buffer, index, updates_slice_shape);
+      update = combiner(current_value, update, body_builder);
+    }
+    // Apply the update if it is in range.
+    buffer = body_builder->Select(
+        index_in_range, body_builder->DynamicUpdateSlice(buffer, update, index),
+        buffer);
+
+    return std::vector<xla::ComputationDataHandle>{indices, updates, buffer};
+  };
+
+  TF_ASSIGN_OR_RETURN(
+      auto outputs, XlaForEachIndex(num_indices, indices_shape->element_type(),
+                                    body_fn, init, "scatter", builder));
+  return outputs[2];
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/scatter.h b/tensorflow/compiler/tf2xla/lib/scatter.h
new file mode 100644
index 0000000000000000000000000000000000000000..41e6d3b195ebf90662c7b9b42c53fcb0133ab29e
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/lib/scatter.h
@@ -0,0 +1,53 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_SCATTER_H_
+#define TENSORFLOW_COMPILER_TF2XLA_LIB_SCATTER_H_
+
+#include <functional>
+
+#include "tensorflow/compiler/xla/client/computation.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace tensorflow {
+
+// Builds an XLA computation that performs a scatter operation on `buffer`,
+// returning an updated buffer.
+// For each i0, i1, ..., sets
+// buffer[indices[i0, i1, ...], ...] := updates[i0, i1, ...]
+//
+// If `indices_are_vectors` is false, then each index in indices is a scalar,
+// and the shape of `indices` must be a prefix of the shape of updates.
+// Otherwise, `indices_are_vectors`, then indices are multidimensional and the
+// minor dimension of `indices` represents a vector of indices.
+//
+// If any indices are negative, the corresponding update is discarded.
+//
+// If a `combiner` is provided, updates are combined with the existing values in
+// the buffer using the combiner function. Otherwise, the updates replace the
+// existing values. The order of updates is implementation-defined.
+xla::StatusOr<xla::ComputationDataHandle> XlaScatter(
+    const xla::ComputationDataHandle& buffer,
+    const xla::ComputationDataHandle& updates,
+    const xla::ComputationDataHandle& indices, bool indices_are_vectors,
+    const std::function<xla::ComputationDataHandle(
+        xla::ComputationDataHandle, xla::ComputationDataHandle,
+        xla::ComputationBuilder*)>& combiner,
+    xla::ComputationBuilder* builder);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_SCATTER_H_
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
index 579944c3a381e7018b7fee5013d0509158ce21cc..7f72a6073df218b9e2bd4cc0c0b5bb10b5cd4b84 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
@@ -24,13 +24,15 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
 
 xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a,
-    xla::ComputationDataHandle b, int64 block_size) {
+    xla::ComputationDataHandle b, bool left_side, bool lower, bool transpose_a,
+    bool conjugate_a, int64 block_size) {
   TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> a_shape,
                       builder->GetShape(a));
   TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> b_shape,
@@ -60,14 +62,15 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
     batch_dimensions.push_back(a_size);
   }
 
-  const int64 n = xla::ShapeUtil::GetDimension(*a_shape, -1);
-  const int64 m = xla::ShapeUtil::GetDimension(*b_shape, -2);
-  if (n != xla::ShapeUtil::GetDimension(*a_shape, -2)) {
+  if (xla::ShapeUtil::GetDimension(*a_shape, -1) !=
+      xla::ShapeUtil::GetDimension(*a_shape, -2)) {
     return errors::InvalidArgument(
         "The 'a' arguments to TriangularSolve must be square matrices: ",
         xla::ShapeUtil::HumanString(*a_shape));
   }
-  if (n != xla::ShapeUtil::GetDimension(*b_shape, -1)) {
+  const int64 m = xla::ShapeUtil::GetDimension(*b_shape, -2);
+  const int64 n = xla::ShapeUtil::GetDimension(*b_shape, -1);
+  if ((left_side ? m : n) != xla::ShapeUtil::GetDimension(*a_shape, -1)) {
     return errors::InvalidArgument(
         "Arguments to TriangularSolve have incompatible matrix shapes: ",
         xla::ShapeUtil::HumanString(*a_shape), " vs ",
@@ -89,6 +92,14 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
     return output;
   };
 
+  // Applies a complex conjugation operation if `a` is complex and `conjugate_a`
+  // is true, otherwise returns its argument.
+  auto maybe_conj = [&](xla::ComputationBuilder* builder,
+                        xla::ComputationDataHandle x) {
+    auto perform_conj = a_shape->element_type() == xla::C64 && conjugate_a;
+    return perform_conj ? builder->Conj(x) : x;
+  };
+
   std::map<int, xla::Computation> base_computations;
   auto get_base_triangular_solve =
       [&](int k) -> xla::StatusOr<xla::Computation*> {
@@ -103,19 +114,35 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
                                                    prepend_batch_dims({k, k})),
                          "a");
 
+      std::array<int64, 2> b_lastd;
+      if (left_side) {
+        b_lastd = {k, n};
+      } else {
+        b_lastd = {m, k};
+      }
       auto b_param =
           sub->Parameter(1,
                          xla::ShapeUtil::MakeShape(b_shape->element_type(),
-                                                   prepend_batch_dims({m, k})),
+                                                   prepend_batch_dims(b_lastd)),
                          "b");
 
-      // TODO(phawkins): it might make sense to use a while loop here, rather
-      // than unrolling.
-      // TODO(phawkins): the left-looking variant of the algorithm might be more
-      // efficient at block size 1.
-      TF_RETURN_IF_ERROR(TriangularSolve(sub.get(), a_param, b_param,
-                                         /*block_size=*/1)
-                             .status());
+      // We use a left-looking subroutine on the block diagonal in some common
+      // cases, while falling back to a recursive call in unsupported cases. The
+      // left-looking subroutine is written with a While loop and so yields much
+      // faster compile times. Moreover, the left-looking variant can give
+      // higher performance on smaller (sub)problems.
+      if (left_side && lower) {
+        TF_RETURN_IF_ERROR(TriangularSolveLeftLooking(sub.get(), a_param,
+                                                      b_param, transpose_a,
+                                                      conjugate_a)
+                               .status());
+      } else {
+        TF_RETURN_IF_ERROR(TriangularSolve(sub.get(), a_param, b_param,
+                                           left_side, lower, transpose_a,
+                                           conjugate_a,
+                                           /*block_size=*/1)
+                               .status());
+      }
 
       TF_ASSIGN_OR_RETURN(computation, sub->Build());
     }
@@ -129,47 +156,396 @@ xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
   // Goto, Kazushige, and Robert Van De Geijn. "High-performance implementation
   // of the level-3 BLAS." ACM Transactions on Mathematical Software (TOMS) 35.1
   // (2008): 4.
-  for (int64 i = 0; i < n; i += block_size) {
-    int64 k = std::min(block_size, n - i);
 
-    // if k > 1:
-    //   output[..., :, i:i+k] = triangular_solve(
-    //       a[..., i:i+k, ..., i:i+k], b[..., :, i:i+k], side='Right',
-    //       kind='Lower', transpose=True, block_size=1)
-    // else:
-    //   output[..., :, i] = b[..., :, i] / a[..., i, i]
+  // In the code comments below, T = lambda x: np.swapaxes(x, -1, -2) if
+  // conjugate_a is False, or T = lambda x: np.conj(np.swapaxes(x, -1, -2)) if
+  // conjugate_a is True.
+
+  if (!left_side && lower == transpose_a) {
+    // for i in range(0, a.shape[-1], block_size):
+    for (int64 i = 0; i < n; i += block_size) {
+      int64 k = std::min(block_size, n - i);
+
+      // output[..., :, i:i+k] = triangular_solve(
+      //     a[..., i:i+k, i:i+k], b[..., :, i:i+k], ..., block_size=1)
+      TF_ASSIGN_OR_RETURN(auto a_slice,
+                          SliceInMinorDims(builder, a, {i, i}, {i + k, i + k}));
+      TF_ASSIGN_OR_RETURN(auto b_slice,
+                          SliceInMinorDims(builder, b, {0, i}, {m, i + k}));
+      xla::ComputationDataHandle update;
+      if (k > 1) {
+        TF_ASSIGN_OR_RETURN(xla::Computation * solve,
+                            get_base_triangular_solve(k));
+        update = builder->Call(*solve, {a_slice, b_slice});
+      } else {
+        update = builder->Div(b_slice, maybe_conj(builder, a_slice));
+      }
+      TF_ASSIGN_OR_RETURN(
+          output, UpdateSliceInMinorDims(builder, output, update, {0, i}));
+
+      // if i + k < a.shape[-1]:
+      //   a_slice_2 = a[..., i+k:, i:i+k] if lower else a[..., i:i+k, i+k:]
+      //   a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2
+      //   b[..., :, i+k:] -= np.matmul(output[..., :, i:i+k], a_slice_2)
+      if (i + k < n) {
+        xla::ComputationDataHandle a_slice_2;
+        if (lower) {
+          TF_ASSIGN_OR_RETURN(
+              a_slice_2, SliceInMinorDims(builder, a, {i + k, i}, {n, i + k}));
+        } else {
+          TF_ASSIGN_OR_RETURN(
+              a_slice_2, SliceInMinorDims(builder, a, {i, i + k}, {i + k, n}));
+        }
+
+        TF_ASSIGN_OR_RETURN(auto b_update,
+                            BatchDot(builder, update, a_slice_2,
+                                     /*transpose_x=*/false,
+                                     /*transpose_y=*/transpose_a,
+                                     /*conjugate_x=*/false,
+                                     /*conjugate_y=*/conjugate_a));
+        TF_ASSIGN_OR_RETURN(auto b_slice_2,
+                            SliceInMinorDims(builder, b, {0, i + k}, {m, n}));
+        b_update = builder->Sub(b_slice_2, b_update);
+        TF_ASSIGN_OR_RETURN(
+            b, UpdateSliceInMinorDims(builder, b, b_update, {0, i + k}));
+      }
+    }
+
+  } else if (left_side && lower != transpose_a) {
+    // for i in range(0, a.shape[-1], block_size):
+    for (int64 i = 0; i < m; i += block_size) {
+      int64 k = std::min(block_size, m - i);
+
+      // output[..., i:i+k, :] = triangular_solve(
+      //     a[..., i:i+k, i:i+k], b[..., i:i+k, :], ..., block_size=1)
+      TF_ASSIGN_OR_RETURN(auto a_slice,
+                          SliceInMinorDims(builder, a, {i, i}, {i + k, i + k}));
+      TF_ASSIGN_OR_RETURN(auto b_slice,
+                          SliceInMinorDims(builder, b, {i, 0}, {i + k, n}));
+      xla::ComputationDataHandle update;
+      if (k > 1) {
+        TF_ASSIGN_OR_RETURN(xla::Computation * solve,
+                            get_base_triangular_solve(k));
+        update = builder->Call(*solve, {a_slice, b_slice});
+      } else {
+        update = builder->Div(b_slice, maybe_conj(builder, a_slice));
+      }
+      TF_ASSIGN_OR_RETURN(
+          output, UpdateSliceInMinorDims(builder, output, update, {i, 0}));
+
+      // if i + k < a.shape[-1]:
+      //   a_slice_2 = a[..., i+k:, i:i+k] if lower else a[..., i:i+k, i+k:]
+      //   a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2
+      //   b[..., i+k:, :] -= np.matmul(a_slice_2, output[..., i:i+k, :])
+      if (i + k < m) {
+        xla::ComputationDataHandle a_slice_2;
+        if (lower) {
+          TF_ASSIGN_OR_RETURN(
+              a_slice_2, SliceInMinorDims(builder, a, {i + k, i}, {m, i + k}));
+        } else {
+          TF_ASSIGN_OR_RETURN(
+              a_slice_2, SliceInMinorDims(builder, a, {i, i + k}, {i + k, m}));
+        }
+
+        TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(builder, a_slice_2, update,
+                                                    /*transpose_x=*/transpose_a,
+                                                    /*transpose_y=*/false,
+                                                    /*conjugate_x=*/conjugate_a,
+                                                    /*conjugate_y=*/false));
+        TF_ASSIGN_OR_RETURN(auto b_slice_2,
+                            SliceInMinorDims(builder, b, {i + k, 0}, {m, n}));
+        b_update = builder->Sub(b_slice_2, b_update);
+        TF_ASSIGN_OR_RETURN(
+            b, UpdateSliceInMinorDims(builder, b, b_update, {i + k, 0}));
+      }
+    }
+  } else if (!left_side && lower != transpose_a) {
+    // for i in reversed(range(0, a.shape[-1], block_size)):
+    const int64 last_blk_ix = xla::RoundUpToNearest(n, block_size) - block_size;
+    for (int64 i = last_blk_ix; i >= 0; i -= block_size) {
+      int64 k = std::min(block_size, n - i);
+
+      // output[..., :, i:i+k] triangular_solve(
+      //     a[..., i:i+k, i:i+k], b[..., :, i:i+k], ..., block_size=1)
+      TF_ASSIGN_OR_RETURN(auto a_slice,
+                          SliceInMinorDims(builder, a, {i, i}, {i + k, i + k}));
+      TF_ASSIGN_OR_RETURN(auto b_slice,
+                          SliceInMinorDims(builder, b, {0, i}, {m, i + k}));
+      xla::ComputationDataHandle update;
+      if (k > 1) {
+        TF_ASSIGN_OR_RETURN(xla::Computation * solve,
+                            get_base_triangular_solve(k));
+        update = builder->Call(*solve, {a_slice, b_slice});
+      } else {
+        update = builder->Div(b_slice, maybe_conj(builder, a_slice));
+      }
+      TF_ASSIGN_OR_RETURN(
+          output, UpdateSliceInMinorDims(builder, output, update, {0, i}));
+
+      // if i - k >= 0:
+      //   a_slice_2 = a[..., i:i+k, :i] if lower else a[..., :i, i:i+k]
+      //   a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2
+      //   b[..., :, :i] -= np.matmul(out[..., :, i:i+k], a_slice_2)
+      if (i - k >= 0) {
+        xla::ComputationDataHandle a_slice_2;
+        if (lower) {
+          TF_ASSIGN_OR_RETURN(a_slice_2,
+                              SliceInMinorDims(builder, a, {i, 0}, {i + k, i}));
+        } else {
+          TF_ASSIGN_OR_RETURN(a_slice_2,
+                              SliceInMinorDims(builder, a, {0, i}, {i, i + k}));
+        }
+
+        TF_ASSIGN_OR_RETURN(auto b_update,
+                            BatchDot(builder, update, a_slice_2,
+                                     /*transpose_x=*/false,
+                                     /*transpose_y=*/transpose_a,
+                                     /*conjugate_x=*/false,
+                                     /*conjugate_y=*/conjugate_a));
+        TF_ASSIGN_OR_RETURN(auto b_slice_2,
+                            SliceInMinorDims(builder, b, {0, 0}, {m, i}));
+        b_update = builder->Sub(b_slice_2, b_update);
+        TF_ASSIGN_OR_RETURN(
+            b, UpdateSliceInMinorDims(builder, b, b_update, {0, 0}));
+      }
+    }
+  } else {  // left_side && lower == transpose_a
+    // for i in reversed(range(0, a.shape[-1], block_size)):
+    const int64 last_blk_ix = xla::RoundUpToNearest(m, block_size) - block_size;
+    for (int64 i = last_blk_ix; i >= 0; i -= block_size) {
+      int64 k = std::min(block_size, m - i);
+
+      // output[..., i:i+k, :] triangular_solve(
+      //     a[..., i:i+k, i:i+k], b[..., i:i+k, :], ..., block_size=1)
+      TF_ASSIGN_OR_RETURN(auto a_slice,
+                          SliceInMinorDims(builder, a, {i, i}, {i + k, i + k}));
+      TF_ASSIGN_OR_RETURN(auto b_slice,
+                          SliceInMinorDims(builder, b, {i, 0}, {i + k, n}));
+      xla::ComputationDataHandle update;
+      if (k > 1) {
+        TF_ASSIGN_OR_RETURN(xla::Computation * solve,
+                            get_base_triangular_solve(k));
+        update = builder->Call(*solve, {a_slice, b_slice});
+      } else {
+        update = builder->Div(b_slice, maybe_conj(builder, a_slice));
+      }
+      TF_ASSIGN_OR_RETURN(
+          output, UpdateSliceInMinorDims(builder, output, update, {i, 0}));
+
+      // if i - k >= 0:
+      //   a_slice_2 = a[..., i:i+k, :i] if lower else a[..., :i, i:i+k]
+      //   a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2
+      //   b[..., :i, :] -= np.matmul(a_slice_2, out[..., i:i+k, :])
+      if (i - k >= 0) {
+        xla::ComputationDataHandle a_slice_2;
+        if (lower) {
+          TF_ASSIGN_OR_RETURN(a_slice_2,
+                              SliceInMinorDims(builder, a, {i, 0}, {i + k, i}));
+        } else {
+          TF_ASSIGN_OR_RETURN(a_slice_2,
+                              SliceInMinorDims(builder, a, {0, i}, {i, i + k}));
+        }
+
+        TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(builder, a_slice_2, update,
+                                                    /*transpose_x=*/transpose_a,
+                                                    /*transpose_y=*/false,
+                                                    /*conjugate_x=*/conjugate_a,
+                                                    /*conjugate_y=*/false));
+        TF_ASSIGN_OR_RETURN(auto b_slice_2,
+                            SliceInMinorDims(builder, b, {0, 0}, {i, n}));
+        b_update = builder->Sub(b_slice_2, b_update);
+        TF_ASSIGN_OR_RETURN(
+            b, UpdateSliceInMinorDims(builder, b, b_update, {0, 0}));
+      }
+    }
+  }
+
+  return output;
+}
+
+xla::StatusOr<xla::ComputationDataHandle> TriangularSolveLeftLooking(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a,
+    const xla::ComputationDataHandle& b, bool transpose_a, bool conjugate_a) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> a_shape,
+                      builder->GetShape(a));
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> b_shape,
+                      builder->GetShape(b));
+  const int64 m = xla::ShapeUtil::GetDimension(*b_shape, -2);
+  const int64 n = xla::ShapeUtil::GetDimension(*b_shape, -1);
+  const int64 ndims = xla::ShapeUtil::Rank(*a_shape);
+
+  std::vector<int64> batch_dimensions;
+  for (int i = 0; i < ndims - 2; ++i) {
+    int64 a_size = a_shape->dimensions(i);
+    batch_dimensions.push_back(a_size);
+  }
+
+  auto prepend_batch_dims = [&](std::array<int64, 2> indices) {
+    std::vector<int64> output(ndims);
+    std::copy(batch_dimensions.begin(), batch_dimensions.end(), output.begin());
+    std::copy(indices.begin(), indices.end(),
+              output.begin() + batch_dimensions.size());
+    return output;
+  };
+
+  auto maybe_conj = [&](xla::ComputationBuilder* builder,
+                        xla::ComputationDataHandle x) {
+    auto perform_conj = a_shape->element_type() == xla::C64 && conjugate_a;
+    return perform_conj ? builder->Conj(x) : x;
+  };
+
+  // The main computation is performed in a While loop.
+
+  // Allocate the output and set its first or last row,
+  // output = np.zeros_like(b)
+  // if transpose_a:
+  //   output[..., m-1:, :] = b[..., m-1:, :] / a[..., m-1:, m-1:]
+  // else:
+  //   output[..., :1, :] = b[..., :1, :] / a[..., :1, :1]
+  xla::ComputationDataHandle output = Zeros(builder, *b_shape);
+  {
+    auto i = transpose_a ? m - 1 : 0;
     TF_ASSIGN_OR_RETURN(auto a_slice,
-                        SliceInMinorDims(builder, a, {i, i}, {i + k, i + k}));
+                        SliceInMinorDims(builder, a, {i, i}, {i + 1, i + 1}));
     TF_ASSIGN_OR_RETURN(auto b_slice,
-                        SliceInMinorDims(builder, b, {0, i}, {m, i + k}));
-    xla::ComputationDataHandle update;
-    if (k > 1) {
-      TF_ASSIGN_OR_RETURN(xla::Computation * solve,
-                          get_base_triangular_solve(k));
-      update = builder->Call(*solve, {a_slice, b_slice});
+                        SliceInMinorDims(builder, b, {i, 0}, {i + 1, n}));
+    auto update = builder->Div(b_slice, maybe_conj(builder, a_slice));
+    TF_ASSIGN_OR_RETURN(
+        output, UpdateSliceInMinorDims(builder, output, update, {i, 0}));
+  }
+
+  // Construct the initial loop carry tuple,
+  // if transpose_a:
+  //   init = (m-2, output, a, b)
+  // else:
+  //   init = (1, output, a, b)
+  std::vector<xla::Shape> tuple_shapes = {
+      // The loop iteration counter is a scalar, incremented each iteration.
+      xla::ShapeUtil::MakeShape(xla::S32, {}),
+      // The output has the shape of b, with one row updated each iteration.
+      *b_shape,
+      // The coefficient matrix a is a loop invariant.
+      *a_shape,
+      // The right-hand-side matrix b is a loop invariant.
+      *b_shape};
+  xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes);
+  auto init_i = builder->ConstantR0<int32>(transpose_a ? m - 2 : 1);
+  auto init = builder->Tuple({init_i, output, a, b});
+
+  // Construct the loop condition function,
+  // def cond_fun(loop_carry):
+  //   i, output, a, b = loop_carry
+  //   return i >= 0 if transpose_a else i < m
+  std::unique_ptr<xla::ComputationBuilder> condb =
+      builder->CreateSubBuilder("TriangularSolveLeftLookingWhileCond");
+  {
+    auto i = condb->GetTupleElement(
+        condb->Parameter(0, tuple_shape,
+                         "TriangularSolveLeftLookingWhileTuple"),
+        0);
+    if (transpose_a) {
+      condb->Ge(i, condb->ConstantR0<int32>(0));
     } else {
-      update = builder->Div(b_slice, a_slice);
+      condb->Lt(i, condb->ConstantR0<int32>(m));
     }
+  }
+  TF_ASSIGN_OR_RETURN(auto cond, condb->Build());
 
-    TF_ASSIGN_OR_RETURN(
-        output, UpdateSliceInMinorDims(builder, output, update, {0, i}));
-    // b[..., :, i+k:] -= np.dot(output[..., :, i:i+k],
-    //                           np.transpose(..., a[i+k:, i:i+k]))
-    if (i + k < n) {
-      TF_ASSIGN_OR_RETURN(auto a_slice_2,
-                          SliceInMinorDims(builder, a, {i + k, i}, {n, i + k}));
-      TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(builder, update, a_slice_2,
-                                                  /*transpose_x=*/false,
-                                                  /*transpose_y=*/true));
-
-      TF_ASSIGN_OR_RETURN(auto b_slice_2,
-                          SliceInMinorDims(builder, b, {0, i + k}, {m, n}));
-      b_update = builder->Sub(b_slice_2, b_update);
-      TF_ASSIGN_OR_RETURN(
-          b, UpdateSliceInMinorDims(builder, b, b_update, {0, i + k}));
+  // Construct the loop body function,
+  // def body_fun(loop_carry):
+  //   i, output, a, b = loop_carry
+  //   if transpose_a:
+  //     a_row = np.swapaxes(a[..., i+1:, i:i+1], -1 -2)
+  //   else:
+  //     a_row = a[..., i:i+1, :i]
+  //   result_row = b[..., i:i+1, :] - np.matmul(a_row, output[..., :, :])
+  //   output[..., i:i+1, :] = result_row / a[..., i:i+1, i:i+1]
+  //   if transpose_a:
+  //     return (i - 1, output, a, b)
+  //   else:
+  //     return (i + 1, output, a, b)
+  // We have to do some extra FLOPs propagating zeros in the matrix multiply
+  // because we can't have the size of its arguments depend on the loop counter.
+  std::unique_ptr<xla::ComputationBuilder> bodyb =
+      builder->CreateSubBuilder("TriangularSolveLeftLookingWhileBody");
+  {
+    auto input_tuple = bodyb->Parameter(0, tuple_shape,
+                                        "TriangularSolveLeftLookingWhileTuple");
+
+    // i, output, a, b = loop_carry
+    auto i = bodyb->GetTupleElement(input_tuple, 0);
+    auto body_out = bodyb->GetTupleElement(input_tuple, 1);
+    auto body_a = bodyb->GetTupleElement(input_tuple, 2);
+    auto body_b = bodyb->GetTupleElement(input_tuple, 3);
+    auto zero = bodyb->ConstantR0<int32>(0);
+
+    // Set up some helper functions.
+    auto prepend_zeros = [&](std::array<xla::ComputationDataHandle, 2> starts) {
+      auto zero = bodyb->Reshape(bodyb->ConstantR0<int32>(0), {1});
+      std::vector<xla::ComputationDataHandle> padded_starts(ndims, zero);
+      padded_starts[ndims - 2] = bodyb->Reshape(starts[0], {1});
+      padded_starts[ndims - 1] = bodyb->Reshape(starts[1], {1});
+      return bodyb->ConcatInDim(padded_starts, 0);
+    };
+
+    auto dynamic_slice = [&](xla::ComputationDataHandle x,
+                             std::array<xla::ComputationDataHandle, 2> starts,
+                             std::array<int64, 2> sizes) {
+      auto padded_starts = prepend_zeros(starts);
+      auto padded_sizes = prepend_batch_dims(sizes);
+      return bodyb->DynamicSlice(x, padded_starts, padded_sizes);
+    };
+
+    auto update = [&](xla::ComputationDataHandle x,
+                      xla::ComputationDataHandle update,
+                      std::array<xla::ComputationDataHandle, 2> starts) {
+      auto padded_starts = prepend_zeros(starts);
+      return bodyb->DynamicUpdateSlice(x, update, padded_starts);
+    };
+
+    // We'd like to implement this:
+    //   if transpose_a:
+    //     a_row = T(a[..., i+1:, i:i+1])
+    //     result_row = (b[..., i:i+1, :]
+    //                   - np.matmul(a_row, body_out[..., i+1:, :]))
+    //   else:
+    //     result_row = (b[..., i:i+1, :]
+    //                   - np.matmul(a[..., i:i+1, :i], body_out[..., :i, :]))
+    // But since we can't have intermediate array sizes depend on the loop
+    // counter, we instead exploit the fact that we initialized the output to
+    // all zeros and use that as zero-padding (doing unnecessary FLOPs).
+    xla::ComputationDataHandle a_row;
+    if (transpose_a) {
+      a_row = dynamic_slice(body_a, {zero, i}, {m, 1});
+    } else {
+      a_row = dynamic_slice(body_a, {i, zero}, {1, m});
     }
+    TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(bodyb.get(), a_row, body_out,
+                                                /*transpose_x=*/transpose_a,
+                                                /*transpose_y=*/false,
+                                                /*conjugate_x=*/conjugate_a,
+                                                /*conjugate_y=*/false));
+    auto result_row =
+        bodyb->Sub(dynamic_slice(body_b, {i, zero}, {1, n}), b_update);
+
+    // body_out[..., i:i+1, :] = result_row / a[..., i:i+1, i:i+1]
+    auto a_elt = dynamic_slice(body_a, {i, i}, {1, 1});
+    auto div_result = bodyb->Div(result_row, maybe_conj(bodyb.get(), a_elt));
+    body_out = update(body_out, div_result, {i, zero});
+
+    // if transpose_a:
+    //   return (i - 1, body_out, a, b)
+    // else:
+    //   return (i + 1, body_out, a, b)
+    auto next_i = bodyb->Add(i, bodyb->ConstantR0<int32>(transpose_a ? -1 : 1));
+    bodyb->Tuple({next_i, body_out, body_a, body_b});
   }
-  return output;
+  TF_ASSIGN_OR_RETURN(auto body, bodyb->Build());
+
+  // Construct the While loop and return the result,
+  // return while_loop(cond_fun, body_fun, init)[1]
+  auto triangular_solve_left_looking_while = builder->While(cond, body, init);
+  return builder->GetTupleElement(triangular_solve_left_looking_while, 1);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.h b/tensorflow/compiler/tf2xla/lib/triangular_solve.h
index 501d026411c80359c7efa406ece5929a2e46ac1f..e32223bfdddda800b1fd4de3e4f0c8061e0f81d8 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.h
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.h
@@ -21,25 +21,50 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Solves systems of linear equations with upper or lower triangular matrices by
-// backsubstitution.
+// Solves systems of linear equations with lower or upper triangular coefficient
+// matrices by forward- or back-substitution. Broadcasting along leading
+// dimensions, this routine solves one of the matrix systems
+//   `op(a) * x = b`,  or `x * op(a) = b`,
+// for the variable `x` given `a` and `b`, where `op(a)` is either
+//   `op(a) = a`,  or `op(a) = transpose(a)`,  or `op(a) = conj(transpose(a))`.
+// That is, the innermost matrices in the output satisfy a scalar system
+// depending on the value of the value of (left_side, transpose_a, conjugate_a)
+// according to:
+//   (F, F, F) => `output[..., i, k]  a[..., k, j] = b[..., i, j]`,
+//   (F, F, T) => `output[..., i, k] a*[..., k, j] = b[..., i, j]`,
+//   (F, T, F) => `output[..., i, k]  a[..., j, k] = b[..., i, j]`,
+//   (F, T, T) => `output[..., i, k] a*[..., j, k] = b[..., i, j]`,
+//   (T, F, F) => ` a[..., i, k] output[..., k, j] = b[..., i, j]`,
+//   (T, F, T) => `a*[..., i, k] output[..., k, j] = b[..., i, j]`,
+//   (T, T, F) => ` a[..., i, k] output[..., j, k] = b[..., i, j]`,
+//   (T, T, T) => `a*[..., i, k] output[..., j, k] = b[..., i, j]`,
+// where * denotes complex conjugation and where the index `k` is summed over.
 //
-// `a` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
-// square matrices. The strictly upper triangular part of each inner-most matrix
-// is assumed to be zero and not accessed.
-// `b` is a tensor of shape `[..., M, K]`.
-//
-// The innermost matrices in the output satisfy matrix equations
-// `output[..., i, j] * adjoint(a[..., k, j]) = b[..., i, k]`.
+// `a` is a tensor of shape `[..., M, M]` whose innermost 2 dimensions form
+// square matrices. If lower is true (false), then the strictly upper (lower)
+// triangular part of each innermost matrix in `a` is assumed to be zero and is
+// not accessed.
+// `b` is a tensor of shape `[..., M, K]` if left_side is true, otherwise a
+// tensor of shape `[..., K, M]`.
+// `left_side` is a boolean, indicating whether to solve a system of the form
+// op(a) * x = b (true) or x * op(a) = b (false).
+// `lower` is a boolean, indicating whether the argument `a` is lower-triangular
+// (true) or upper-triangular (false).
+// `transpose_a` is a boolean indicating whether the matrix `a` is transposed.
+// `conjugate_a` is a boolean indicating whether the entries of `a` are complex
+// conjugated (independently of whether they are transposed), so that when both
+// transpose_a and conjugate_a are true the effect is a Hermitian adjoint.
 //
 // Uses a blocked algorithm if `block_size` is > 1; if block_size == 1 then no
 // blocking is used.
-// TODO(phawkins): equivalent to the BLAS TRSM routine with side=right,
-// kind=lower, and transposed_a=true. Implement the other possible combinations
-// of side, kind and transposed_a.
 xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a,
-    xla::ComputationDataHandle b, int64 block_size = 256);
+    xla::ComputationDataHandle b, bool left_side, bool lower, bool transpose_a,
+    bool conjugate_a, int64 block_size = 256);
+
+xla::StatusOr<xla::ComputationDataHandle> TriangularSolveLeftLooking(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a,
+    const xla::ComputationDataHandle& b, bool transpose_a, bool conjugate_a);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
index 671d9aa4fe0c042a3cc44468074653d51c2be75d..661707062916263fd0d5d935ce41698a7655df02 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
@@ -27,32 +27,134 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace tensorflow {
 namespace {
 
 using TriangularSolveTest = xla::ClientLibraryTestBase;
+using TriangularSolveLeftLookingTest = xla::ClientLibraryTestBase;
+using complex64 = xla::complex64;
 
-XLA_TEST_F(TriangularSolveTest, Simple) {
+xla::Array2D<float> AValsLower() {
+  return {{2, 0, 0, 0}, {3, 6, 0, 0}, {4, 7, 9, 0}, {5, 8, 10, 11}};
+}
+
+xla::Array2D<float> AValsUpper() {
+  return {{2, 3, 4, 5}, {0, 6, 7, 8}, {0, 0, 9, 10}, {0, 0, 0, 11}};
+}
+
+xla::Array2D<float> BValsRight() {
+  return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
+}
+
+xla::Array2D<float> BValsLeft() {
+  return {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}};
+}
+
+xla::Array2D<complex64> AValsLowerComplex() {
+  return {{2, 0, 0, 0},
+          {complex64(3, 1), 6, 0, 0},
+          {4, complex64(7, 2), 9, 0},
+          {5, 8, complex64(10, 3), 11}};
+}
+
+xla::Array2D<complex64> AValsUpperComplex() {
+  return {{2, 3, complex64(4, 3), 5},
+          {0, 6, complex64(7, 2), 8},
+          {0, 0, complex64(9, 1), 10},
+          {0, 0, 0, 11}};
+}
+
+xla::Array2D<complex64> BValsRightComplex() {
+  return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
+}
+
+xla::Array2D<complex64> BValsLeftComplex() {
+  return {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}};
+}
+
+xla::Array2D<float> AValsFull() {
+  return {{2, 0, 1, 2}, {3, 6, 0, 1}, {4, 7, 9, 0}, {5, 8, 10, 11}};
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTranspose) {
   xla::ComputationBuilder builder(client_, TestName());
 
-  xla::Array2D<float> a_vals({
-      {2, 0, 0, 0},
-      {3, 6, 0, 0},
-      {4, 7, 9, 0},
-      {5, 8, 10, 11},
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/false, /*lower=*/true,
+                                /*transpose_a=*/true, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {0.5, 0.08333334, 0.04629629, 0.03367003},
+      {2.5, -0.25, -0.1388889, -0.1010101},
+      {4.5, -0.58333331, -0.32407406, -0.23569024},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleRightLowerNotranspose) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/false, /*lower=*/true,
+                                /*transpose_a=*/false, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {-0.16414141, -0.06902357, -0.07070707, 0.36363636},
+      {0.64393939, 0.06565657, -0.03030303, 0.72727273},
+      {1.4520202, 0.2003367, 0.01010101, 1.09090909},
   });
-  xla::Array2D<float> b_vals({
-      {1, 2, 3, 4},
-      {5, 6, 7, 8},
-      {9, 10, 11, 12},
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleRightUpperTranspose) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/false, /*lower=*/false,
+                                /*transpose_a=*/true, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {-0.16414141, -0.06902357, -0.07070707, 0.36363636},
+      {0.64393939, 0.06565657, -0.03030303, 0.72727273},
+      {1.4520202, 0.2003367, 0.01010101, 1.09090909},
   });
 
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleRightUpperNotranspose) {
+  xla::ComputationBuilder builder(client_, TestName());
+
   xla::ComputationDataHandle a, b;
-  auto a_data = CreateR2Parameter<float>(a_vals, 0, "a", &builder, &a);
-  auto b_data = CreateR2Parameter<float>(b_vals, 1, "b", &builder, &b);
-  auto result = TriangularSolve(&builder, a, b, /*block_size=*/2);
+  auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/false, /*lower=*/false,
+                                /*transpose_a=*/false, /*conjugate_a=*/false,
+                                /*block_size=*/2);
   TF_ASSERT_OK(result.status());
 
   xla::Array2D<float> expected({
@@ -62,7 +164,201 @@ XLA_TEST_F(TriangularSolveTest, Simple) {
   });
 
   ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(2e-3, 2e-3));
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerTranspose) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/true, /*lower=*/true,
+                                /*transpose_a=*/true, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {-0.89646465, -0.69444444, -0.49242424},
+      {-0.27441077, -0.24074074, -0.20707071},
+      {-0.23232323, -0.22222222, -0.21212121},
+      {0.90909091, 1., 1.09090909},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotranspose) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/true, /*lower=*/true,
+                                /*transpose_a=*/false, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {0.5, 1.0, 1.5},
+      {0.41666667, 0.33333333, 0.25},
+      {0.23148148, 0.18518519, 0.13888889},
+      {0.16835017, 0.13468013, 0.1010101},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTranspose) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/true, /*lower=*/false,
+                                /*transpose_a=*/true, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {0.5, 1.0, 1.5},
+      {0.41666667, 0.33333333, 0.25},
+      {0.23148148, 0.18518519, 0.13888889},
+      {0.16835017, 0.13468013, 0.1010101},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperNotranspose) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/true, /*lower=*/false,
+                                /*transpose_a=*/false, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {-0.89646465, -0.69444444, -0.49242424},
+      {-0.27441077, -0.24074074, -0.20707071},
+      {-0.23232323, -0.22222222, -0.21212121},
+      {0.90909091, 1., 1.09090909},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTransposeConjugate) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data =
+      CreateR2Parameter<complex64>(AValsLowerComplex(), 0, "a", &builder, &a);
+  auto b_data =
+      CreateR2Parameter<complex64>(BValsRightComplex(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/false, /*lower=*/true,
+                                /*transpose_a=*/true, /*conjugate_a=*/true,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<complex64> expected({
+      {0.5, complex64(0.08333333, 0.08333333),
+       complex64(0.02777778, -0.0462963), complex64(0.06313131, -0.01094276)},
+      {2.5, complex64(-0.25, 0.41666667), complex64(-0.23148148, -0.37962963),
+       complex64(0.08670034, -0.02104377)},
+      {4.5, complex64(-0.58333333, 0.75), complex64(-0.49074074, -0.71296296),
+       complex64(0.11026936, -0.03114478)},
+  });
+
+  ComputeAndCompareR2<complex64>(&builder, expected,
+                                 {a_data.get(), b_data.get()},
+                                 xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTransposeNoconjugate) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data =
+      CreateR2Parameter<complex64>(AValsUpperComplex(), 0, "a", &builder, &a);
+  auto b_data =
+      CreateR2Parameter<complex64>(BValsLeftComplex(), 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b,
+                                /*left_side=*/true, /*lower=*/false,
+                                /*transpose_a=*/true, /*conjugate_a=*/false,
+                                /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<complex64> expected({
+      {0.5, 1., 1.5},
+      {0.41666667, 0.33333333, 0.25},
+      {complex64(0.20020325, -2.81504065e-01),
+       complex64(0.13821138, -4.22764228e-01),
+       complex64(0.07621951, -5.64024390e-01)},
+      {complex64(0.19678492, 2.55912786e-01),
+       complex64(0.17738359, 3.84331116e-01),
+       complex64(0.15798226, 5.12749446e-01)},
+  });
+
+  ComputeAndCompareR2<complex64>(&builder, expected,
+                                 {a_data.get(), b_data.get()},
+                                 xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveLeftLookingTest, Simple) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  auto result = TriangularSolveLeftLooking(&builder, a, b,
+                                           /*transpose_a=*/false,
+                                           /*conjugate_a=*/false);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {0.5, 1.0, 1.5},
+      {0.41666667, 0.33333333, 0.25},
+      {0.23148148, 0.18518519, 0.13888889},
+      {0.16835017, 0.13468013, 0.1010101},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveLeftLookingTest, NonzeroUpperTriangle) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(AValsFull(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  auto result = TriangularSolveLeftLooking(&builder, a, b,
+                                           /*transpose_a=*/false,
+                                           /*conjugate_a=*/false);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {0.5, 1.0, 1.5},
+      {0.41666667, 0.33333333, 0.25},
+      {0.23148148, 0.18518519, 0.13888889},
+      {0.16835017, 0.13468013, 0.1010101},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc
index 7ffe0aa6df9b21c4311eb6c8d311fba1e115b3f4..f579669bbd852b514e021ce71d635f8ce5e4fe4d 100644
--- a/tensorflow/compiler/tf2xla/lib/util.cc
+++ b/tensorflow/compiler/tf2xla/lib/util.cc
@@ -28,7 +28,7 @@ limitations under the License.
 namespace tensorflow {
 
 xla::ComputationDataHandle Zeros(xla::ComputationBuilder* builder,
-                                 xla::Shape& shape) {
+                                 const xla::Shape& shape) {
   return builder->Broadcast(
       builder->ConstantLiteral(xla::Literal::Zero(shape.element_type())),
       xla::AsInt64Slice(shape.dimensions()));
@@ -40,6 +40,9 @@ xla::ComputationDataHandle FloatLiteral(xla::ComputationBuilder* builder,
     case xla::F16:
       return builder->ConstantR0<xla::half>(static_cast<xla::half>(value));
       break;
+    case xla::BF16:
+      return builder->ConstantR0<bfloat16>(static_cast<bfloat16>(value));
+      break;
     case xla::F32:
       return builder->ConstantR0<float>(static_cast<float>(value));
       break;
@@ -54,6 +57,61 @@ xla::ComputationDataHandle FloatLiteral(xla::ComputationBuilder* builder,
   }
 }
 
+xla::ComputationDataHandle IntegerLiteral(xla::ComputationBuilder* builder,
+                                          xla::PrimitiveType type,
+                                          int64 value) {
+  xla::Literal literal;
+  switch (type) {
+    case xla::U8:
+      literal = std::move(*xla::Literal::CreateR0<uint8>(value));
+      break;
+    case xla::U32:
+      literal = std::move(*xla::Literal::CreateR0<uint32>(value));
+      break;
+    case xla::U64:
+      literal = std::move(*xla::Literal::CreateR0<uint64>(value));
+      break;
+    case xla::S8:
+      literal = std::move(*xla::Literal::CreateR0<int8>(value));
+      break;
+    case xla::S32:
+      literal = std::move(*xla::Literal::CreateR0<int32>(value));
+      break;
+    case xla::S64:
+      literal = std::move(*xla::Literal::CreateR0<int64>(value));
+      break;
+    case xla::F32:
+      literal = std::move(*xla::Literal::CreateR0<float>(value));
+      break;
+    case xla::F64:
+      literal = std::move(*xla::Literal::CreateR0<double>(value));
+      break;
+    case xla::C64:
+      literal = std::move(*xla::Literal::CreateR0<complex64>(value));
+      break;
+    case xla::PRED:
+      LOG(FATAL) << "pred element type is not integral";
+    case xla::S16:
+    case xla::U16:
+      LOG(FATAL) << "u16/s16 literals not yet implemented";
+    case xla::BF16:
+      literal = std::move(
+          *xla::Literal::CreateR0<bfloat16>(static_cast<bfloat16>(value)));
+      break;
+    case xla::F16:
+      literal = std::move(
+          *xla::Literal::CreateR0<xla::half>(static_cast<xla::half>(value)));
+      break;
+    case xla::TUPLE:
+      LOG(FATAL) << "tuple element type is not integral";
+    case xla::OPAQUE:
+      LOG(FATAL) << "opaque element type is not integral";
+    default:
+      LOG(FATAL) << "unhandled element type " << type;
+  }
+  return builder->ConstantLiteral(literal);
+}
+
 xla::StatusOr<xla::ComputationDataHandle> SliceInMinorDims(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
     gtl::ArraySlice<int64> start, gtl::ArraySlice<int64> end) {
@@ -104,4 +162,15 @@ xla::StatusOr<xla::ComputationDataHandle> UpdateSliceInMinorDims(
   return UpdateSlice(builder, x, update, padded_start);
 }
 
+xla::StatusOr<xla::ComputationDataHandle> TransposeInMinorDims(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> shape, builder->GetShape(x));
+  const int64 n_dims = xla::ShapeUtil::Rank(*shape);
+  TF_RET_CHECK(n_dims >= 2);
+  std::vector<int64> permutation(n_dims);
+  std::iota(permutation.begin(), permutation.end(), 0);
+  std::swap(permutation[n_dims - 1], permutation[n_dims - 2]);
+  return builder->Transpose(x, permutation);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/util.h b/tensorflow/compiler/tf2xla/lib/util.h
index 8fba6b5cf247e9b2c26533c53ece8b0d7d4f4c36..51f8baaf00bd8fd25baa1a87be8cb0089dfb22b5 100644
--- a/tensorflow/compiler/tf2xla/lib/util.h
+++ b/tensorflow/compiler/tf2xla/lib/util.h
@@ -25,13 +25,18 @@ namespace tensorflow {
 
 // Returns a zero-filled tensor with shape `shape`.
 xla::ComputationDataHandle Zeros(xla::ComputationBuilder* builder,
-                                 xla::Shape& shape);
+                                 const xla::Shape& shape);
 
 // Returns a floating point scalar constant of 'type' with 'value'.
 // If 'type' is complex, returns a real value with zero imaginary component.
 xla::ComputationDataHandle FloatLiteral(xla::ComputationBuilder* builder,
                                         xla::PrimitiveType type, double value);
 
+// Returns a integer scalar constant of 'type' with 'value'.
+// If 'type' is complex, returns a real value with zero imaginary component.
+xla::ComputationDataHandle IntegerLiteral(xla::ComputationBuilder* builder,
+                                          xla::PrimitiveType type, int64 value);
+
 // Performs a slice in the minor dimensions of a Tensor.
 xla::StatusOr<xla::ComputationDataHandle> SliceInMinorDims(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
@@ -49,6 +54,10 @@ xla::StatusOr<xla::ComputationDataHandle> UpdateSliceInMinorDims(
     xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
     const xla::ComputationDataHandle& update, gtl::ArraySlice<int64> start);
 
+// Transposes a stack of matrices `x` by swapping the last two dimensions.
+xla::StatusOr<xla::ComputationDataHandle> TransposeInMinorDims(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/lib/while_loop.cc b/tensorflow/compiler/tf2xla/lib/while_loop.cc
new file mode 100644
index 0000000000000000000000000000000000000000..86c02ac2e65c12d3527c4022df0cc603e522ef7a
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/lib/while_loop.cc
@@ -0,0 +1,125 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
+#include "tensorflow/compiler/tf2xla/lib/util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+
+namespace tensorflow {
+
+xla::StatusOr<std::vector<xla::ComputationDataHandle>> XlaWhileLoop(
+    const LoopConditionFunction& condition_function,
+    const LoopBodyFunction& body_function,
+    gtl::ArraySlice<xla::ComputationDataHandle> initial_values,
+    StringPiece name, xla::ComputationBuilder* builder) {
+  int arity = initial_values.size();
+  std::vector<xla::Shape> var_shapes;
+  var_shapes.reserve(arity);
+  for (const xla::ComputationDataHandle& input : initial_values) {
+    TF_ASSIGN_OR_RETURN(auto shape, builder->GetShape(input));
+    var_shapes.push_back(std::move(*shape));
+  }
+  xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(var_shapes);
+
+  // Unpacks a tuple into its component parts.
+  auto unpack_tuple = [](xla::ComputationDataHandle tuple, int arity,
+                         xla::ComputationBuilder* builder) {
+    std::vector<xla::ComputationDataHandle> elements(arity);
+    for (int i = 0; i < arity; ++i) {
+      elements[i] = builder->GetTupleElement(tuple, i);
+    }
+    return elements;
+  };
+
+  // Build the condition.
+  std::unique_ptr<xla::ComputationBuilder> cond_builder =
+      builder->CreateSubBuilder(strings::StrCat(name, "_condition"));
+  {
+    auto parameter = cond_builder->Parameter(0, tuple_shape, "parameter");
+
+    TF_ASSIGN_OR_RETURN(
+        auto result,
+        condition_function(unpack_tuple(parameter, arity, cond_builder.get()),
+                           cond_builder.get()));
+    TF_RETURN_IF_ERROR(cond_builder->SetReturnValue(result));
+  }
+  TF_ASSIGN_OR_RETURN(auto cond, cond_builder->Build());
+
+  // Build the body.
+  std::unique_ptr<xla::ComputationBuilder> body_builder =
+      builder->CreateSubBuilder(strings::StrCat(name, "_body"));
+  {
+    auto parameter = body_builder->Parameter(0, tuple_shape, "parameter");
+
+    TF_ASSIGN_OR_RETURN(
+        auto result,
+        body_function(unpack_tuple(parameter, arity, body_builder.get()),
+                      body_builder.get()));
+
+    TF_RET_CHECK(result.size() == initial_values.size());
+    body_builder->Tuple(result);
+  }
+  TF_ASSIGN_OR_RETURN(auto body, body_builder->Build());
+
+  auto outputs = builder->While(cond, body, builder->Tuple(initial_values));
+
+  return unpack_tuple(outputs, arity, builder);
+}
+
+xla::StatusOr<std::vector<xla::ComputationDataHandle>> XlaForEachIndex(
+    int64 num_iterations, xla::PrimitiveType num_iterations_type,
+    const ForEachIndexBodyFunction& body_function,
+    gtl::ArraySlice<xla::ComputationDataHandle> initial_values,
+    StringPiece name, xla::ComputationBuilder* builder) {
+  auto while_cond_fn = [&](gtl::ArraySlice<xla::ComputationDataHandle> values,
+                           xla::ComputationBuilder* cond_builder)
+      -> xla::StatusOr<xla::ComputationDataHandle> {
+    return cond_builder->Lt(
+        values[0],
+        IntegerLiteral(cond_builder, num_iterations_type, num_iterations));
+  };
+  auto while_body_fn = [&](gtl::ArraySlice<xla::ComputationDataHandle> values,
+                           xla::ComputationBuilder* body_builder)
+      -> xla::StatusOr<std::vector<xla::ComputationDataHandle>> {
+    xla::ComputationDataHandle iteration = values[0];
+
+    std::vector<xla::ComputationDataHandle> updated_values;
+    updated_values.reserve(values.size());
+    updated_values.push_back(body_builder->Add(
+        iteration,
+        body_builder->ConstantLiteral(xla::Literal::One(num_iterations_type))));
+
+    values.remove_prefix(1);
+    TF_ASSIGN_OR_RETURN(std::vector<xla::ComputationDataHandle> body_outputs,
+                        body_function(iteration, values, body_builder));
+    updated_values.insert(updated_values.end(), body_outputs.begin(),
+                          body_outputs.end());
+    return updated_values;
+  };
+
+  std::vector<xla::ComputationDataHandle> values;
+  values.reserve(initial_values.size() + 1);
+  values.push_back(
+      builder->ConstantLiteral(xla::Literal::Zero(num_iterations_type)));
+  values.insert(values.end(), initial_values.begin(), initial_values.end());
+
+  TF_ASSIGN_OR_RETURN(values, XlaWhileLoop(while_cond_fn, while_body_fn, values,
+                                           name, builder));
+  values.erase(values.begin(), values.begin() + 1);
+  return values;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/while_loop.h b/tensorflow/compiler/tf2xla/lib/while_loop.h
new file mode 100644
index 0000000000000000000000000000000000000000..2e67a0c99b6deb65fa16ab2dec1727f5cb5fcb92
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/lib/while_loop.h
@@ -0,0 +1,74 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_WHILE_LOOP_H_
+#define TENSORFLOW_COMPILER_TF2XLA_LIB_WHILE_LOOP_H_
+
+#include <functional>
+#include <vector>
+
+#include "tensorflow/compiler/xla/client/computation.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+
+namespace tensorflow {
+
+// Function that builds a loop condition. Takes as input a sequence of input
+// values, and returns a boolean value representing if the condition succeeds.
+typedef std::function<xla::StatusOr<xla::ComputationDataHandle>(
+    gtl::ArraySlice<xla::ComputationDataHandle>, xla::ComputationBuilder*)>
+    LoopConditionFunction;
+
+// Function that builds a loop body. Takes as input a sequence of input values
+// and returns a sequence of output values.
+typedef std::function<xla::StatusOr<std::vector<xla::ComputationDataHandle>>(
+    gtl::ArraySlice<xla::ComputationDataHandle>, xla::ComputationBuilder*)>
+    LoopBodyFunction;
+
+// Helper function for building an XLA while loop, where the values carried by
+// the loop are a tuple of values, e.g., (a, b, c):
+// while(
+//   condition: (a, b, c) -> bool,
+//   body: (a, b, c) -> (a, b, c)
+//   init: (a, b, c)
+// )
+// 'name' is a descriptive name for the loop.
+xla::StatusOr<std::vector<xla::ComputationDataHandle>> XlaWhileLoop(
+    const LoopConditionFunction& condition_function,
+    const LoopBodyFunction& body_function,
+    gtl::ArraySlice<xla::ComputationDataHandle> initial_values,
+    StringPiece name, xla::ComputationBuilder* builder);
+
+// Builds an XLA loop that repeats a computation `num_iterations` times.
+//
+// The body function (ForEachIndexBodyFunction) takes as input a pair of
+// (current iteration number, loop-carried values), and returns an updated
+// vector of the loop-carried values.
+typedef std::function<xla::StatusOr<std::vector<xla::ComputationDataHandle>>(
+    xla::ComputationDataHandle, gtl::ArraySlice<xla::ComputationDataHandle>,
+    xla::ComputationBuilder*)>
+    ForEachIndexBodyFunction;
+
+xla::StatusOr<std::vector<xla::ComputationDataHandle>> XlaForEachIndex(
+    int64 num_iterations, xla::PrimitiveType num_iterations_type,
+    const ForEachIndexBodyFunction& body_function,
+    gtl::ArraySlice<xla::ComputationDataHandle> initial_values,
+    StringPiece name, xla::ComputationBuilder* builder);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_WHILE_LOOP_H_
diff --git a/tensorflow/compiler/tf2xla/literal_util.cc b/tensorflow/compiler/tf2xla/literal_util.cc
index 576cd9bf9abb43e29d9eb8f706e0f42ac2d038e9..fcbd157c6191655865d5e250fdf71338780bc2a6 100644
--- a/tensorflow/compiler/tf2xla/literal_util.cc
+++ b/tensorflow/compiler/tf2xla/literal_util.cc
@@ -23,17 +23,17 @@ limitations under the License.
 namespace tensorflow {
 
 Status HostTensorToLiteral(const Tensor& host_tensor, xla::Literal* literal) {
-  literal->Clear();
+  xla::Shape literal_shape;
   TF_RETURN_IF_ERROR(TensorShapeToXLAShape(
-      host_tensor.dtype(), host_tensor.shape(), literal->mutable_shape()));
+      host_tensor.dtype(), host_tensor.shape(), &literal_shape));
 
-  literal->Reserve(host_tensor.NumElements());
+  *literal = xla::Literal(literal_shape);
 
   // memcpy over the payload ...
   // TODO(phawkins): handle string types.
   size_t total_bytes = host_tensor.TotalBytes();
   if (total_bytes > 0) {
-    void* dst_ptr = literal->MutableInternalData();
+    void* dst_ptr = literal->untyped_data();
     const void* src_ptr = DMAHelper::base(&host_tensor);
     memcpy(dst_ptr, src_ptr, total_bytes);
   }
@@ -56,7 +56,7 @@ Status LiteralToHostTensor(const xla::Literal& literal, DataType target_type,
   *host_tensor = Tensor(target_type, shape);
   size_t total_bytes = host_tensor->TotalBytes();
   if (total_bytes > 0) {
-    const void* src_ptr = literal.InternalData();
+    const void* src_ptr = literal.untyped_data();
     void* dst_ptr = DMAHelper::base(host_tensor);
     memcpy(dst_ptr, src_ptr, total_bytes);
   }
diff --git a/tensorflow/compiler/tf2xla/sharding_util.cc b/tensorflow/compiler/tf2xla/sharding_util.cc
index d9c839b61019b92b6de3a77a7bec610ae848a9a4..1a0e09758f7cc6714793300c6ece14093a8ad246 100644
--- a/tensorflow/compiler/tf2xla/sharding_util.cc
+++ b/tensorflow/compiler/tf2xla/sharding_util.cc
@@ -14,34 +14,59 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/tf2xla/sharding_util.h"
 
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
 namespace tensorflow {
+namespace {
+const char kDeviceSuffixReplicatedCore[] = "REPLICATED_CORE";
+const char kShardingAttribute[] = "_XlaSharding";
+}  // namespace
 
-static const char DEVICE_SUFFIX_REPLICATED_CORE[] = "REPLICATED_CORE";
+namespace {
+xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
+GetShardingFromNodeDef(const NodeDef& node_def) {
+  if (!HasNodeAttr(node_def, kShardingAttribute)) {
+    return tensorflow::gtl::optional<xla::OpSharding>();
+  }
+  string value;
+  xla::OpSharding sharding;
+  TF_RETURN_IF_ERROR(GetNodeAttr(node_def, kShardingAttribute, &value));
+  if (!sharding.ParseFromString(value)) {
+    return xla::InvalidArgument(
+        "Experimental _XlaSharding attribute was not a valid encoded "
+        "xla::OpSharding proto.");
+  }
+  return tensorflow::gtl::optional<xla::OpSharding>(sharding);
+}
 
-static Status CoreOutOfRangeError(int core, int num_cores_per_replica) {
+Status CoreOutOfRangeError(int core, int num_cores_per_replica) {
   return errors::InvalidArgument(
       "Invalid replicated core id: ", core,
       "; num_cores_per_replica=", num_cores_per_replica);
 }
+}  // namespace
 
 xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
-ParseShardingFromDevice(const string& device_name, int num_cores_per_replica) {
+ParseShardingFromDevice(
+    const string& device_name, int num_cores_per_replica,
+    tensorflow::gtl::optional<xla::OpSharding> explicit_sharding) {
   if (device_name.empty()) {
     return tensorflow::gtl::optional<xla::OpSharding>();
   }
-
   DeviceNameUtils::ParsedName parsed_device;
   if (!DeviceNameUtils::ParseFullName(device_name, &parsed_device)) {
     return errors::InvalidArgument("Malformed assigned device '", device_name,
                                    "'");
   }
-  if (!parsed_device.has_type ||
-      !StringPiece(parsed_device.type)
-           .ends_with(DEVICE_SUFFIX_REPLICATED_CORE)) {
+
+  if (explicit_sharding.has_value()) {
+    return explicit_sharding;
+  } else if (!parsed_device.has_type || !parsed_device.has_id ||
+             !StringPiece(parsed_device.type)
+                  .contains(kDeviceSuffixReplicatedCore)) {
     return tensorflow::gtl::optional<xla::OpSharding>();
   } else {
     const int core = parsed_device.id;
@@ -49,24 +74,38 @@ ParseShardingFromDevice(const string& device_name, int num_cores_per_replica) {
       return CoreOutOfRangeError(core, num_cores_per_replica);
     }
     return tensorflow::gtl::optional<xla::OpSharding>(
-        xla::ShardingBuilder::AssignDevice(core));
+        xla::sharding_builder::AssignDevice(core));
   }
 }
 
+xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
+ParseShardingFromDevice(const NodeDef& node_def, int num_cores_per_replica) {
+  const string& device_name = node_def.device();
+  TF_ASSIGN_OR_RETURN(tensorflow::gtl::optional<xla::OpSharding> sharding,
+                      GetShardingFromNodeDef(node_def));
+  return ParseShardingFromDevice(device_name, num_cores_per_replica, sharding);
+}
+
 xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
 ParseShardingFromDevice(const Node& node, int num_cores_per_replica) {
   string device_name = node.assigned_device_name();
   if (device_name.empty()) {
     device_name = node.requested_device();
   }
-  return ParseShardingFromDevice(device_name, num_cores_per_replica);
+  TF_ASSIGN_OR_RETURN(tensorflow::gtl::optional<xla::OpSharding> sharding,
+                      GetShardingFromNodeDef(node.def()));
+  return ParseShardingFromDevice(device_name, num_cores_per_replica, sharding);
 }
+
 void SetShardingDeviceAssignmentFromNode(const Node& src, Node* dst) {
   string device_name = src.assigned_device_name();
   if (device_name.empty()) {
     device_name = src.requested_device();
   }
   dst->set_assigned_device_name(device_name);
+  if (const AttrValue* attr = src.attrs().Find(kShardingAttribute)) {
+    dst->AddAttr(kShardingAttribute, *attr);
+  }
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/sharding_util.h b/tensorflow/compiler/tf2xla/sharding_util.h
index f6468bba9f950fec88dcc6b3ec760f014d3a0ef3..b1c817bdcc211648b16e395313ca171d1acb9ea9 100644
--- a/tensorflow/compiler/tf2xla/sharding_util.h
+++ b/tensorflow/compiler/tf2xla/sharding_util.h
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include <string>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/sharding_builder.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -29,14 +29,21 @@ namespace tensorflow {
 // - if the device name is invalid.
 // - the core is parsed and is out of the range [0, num_cores_per_replica).
 //
-// Otherwise, returns either a non-value or a sharding set as per
-// xla:ShardingBuilder::AssignDevice.
+// Otherwise, returns either:
+// - explicit_sharding if explicit_sharding.has_value()
+// - a non-value if there is no assigned core or
+// - a sharding set as per xla::sharding_builder::AssignDevice.
 xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
-ParseShardingFromDevice(const string& device_name, int num_cores_per_replica);
+ParseShardingFromDevice(const string& device_name, int num_cores_per_replica,
+                        tensorflow::gtl::optional<xla::OpSharding>
+                            explicit_sharding = tensorflow::gtl::nullopt);
 
 xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
 ParseShardingFromDevice(const Node& node, int num_cores_per_replica);
 
+xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
+ParseShardingFromDevice(const NodeDef& node_def, int num_cores_per_replica);
+
 void SetShardingDeviceAssignmentFromNode(const Node& src, Node* dst);
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc
index a14c93a2b9494b89f579bc20ee0510c136f8f01b..6051d7dffd7493d8cffb07c1b5d10500e7e75522 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla.cc
@@ -241,9 +241,7 @@ Status CreateXlaArgs(const Graph& graph,
     XlaCompiler::Argument arg;
     arg.kind = XlaCompiler::Argument::kParameter;
     TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "T", &arg.type));
-    TensorShape shape;
-    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kShapeAttr, &shape));
-    TF_RETURN_IF_ERROR(TensorShapeToXLAShape(arg.type, shape, &arg.shape));
+    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kShapeAttr, &arg.shape));
     TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kDebugNameAttr, &arg.name));
     xla_args->push_back(arg);
   }
@@ -253,8 +251,7 @@ Status CreateXlaArgs(const Graph& graph,
 // Converts the TensorFlow graph into an XLA computation, by executing the
 // graph symbolically, with each op building up the XLA HLO.
 Status ConvertGraphToXla(std::unique_ptr<Graph> graph, xla::Client* client,
-                         xla::Computation* computation,
-                         bool* requires_runtime_context) {
+                         xla::Computation* computation) {
   XlaOpRegistry::RegisterCompilationKernels();
   for (Node* node : graph->nodes()) {
     node->set_assigned_device_name(
@@ -277,7 +274,6 @@ Status ConvertGraphToXla(std::unique_ptr<Graph> graph, xla::Client* client,
   TF_RETURN_IF_ERROR(compiler.CompileGraph(XlaCompiler::CompileOptions(),
                                            "tfcompile", std::move(graph),
                                            xla_args, &result));
-  *requires_runtime_context = result.requires_runtime_context;
   *computation = std::move(*result.computation);
 
   int num_const_results = 0;
@@ -352,12 +348,10 @@ Status InitGraph(const GraphDef& graph_def, const tf2xla::Config& config,
 
 Status ConvertGraphDefToXla(const GraphDef& graph_def,
                             const tf2xla::Config& config, xla::Client* client,
-                            xla::Computation* computation,
-                            bool* requires_runtime_context) {
+                            xla::Computation* computation) {
   std::unique_ptr<Graph> graph;
   TF_RETURN_IF_ERROR(InitGraph(graph_def, config, &graph));
-  TF_RETURN_IF_ERROR(ConvertGraphToXla(std::move(graph), client, computation,
-                                       requires_runtime_context));
+  TF_RETURN_IF_ERROR(ConvertGraphToXla(std::move(graph), client, computation));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/tf2xla.h b/tensorflow/compiler/tf2xla/tf2xla.h
index ab99beebf7946237425d4d304a858ac6817177b8..473c431b12d441c652f1d0d6c11c5e87836ab36d 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.h
+++ b/tensorflow/compiler/tf2xla/tf2xla.h
@@ -30,13 +30,9 @@ namespace tensorflow {
 //
 // The computation is built in the context of the given `client`, which may
 // subsequently be used to compile or execute the computation.
-//
-// If `requires_runtime_context` is filled with true, this indicates the last
-// argument of the computation is XlaLocalRuntimeContext*.
 Status ConvertGraphDefToXla(const GraphDef& graph_def,
                             const tf2xla::Config& config, xla::Client* client,
-                            xla::Computation* computation,
-                            bool* requires_runtime_context);
+                            xla::Computation* computation);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc b/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7aca889a266439538c4cd1c153460e6cc871b246
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc
@@ -0,0 +1,97 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/tf2xla_supported_ops.h"
+
+#include <algorithm>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/framework/kernel_def.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace tensorflow {
+namespace tf2xla {
+namespace {
+
+void PrintSupportedOps(const string& device, const string& regen_run) {
+  XlaOpRegistry::RegisterCompilationKernels();
+
+  std::vector<const KernelDef*> kdefs =
+      XlaOpRegistry::DeviceKernels(device,
+                                   /*include_compilation_only_kernels=*/true);
+  std::sort(
+      kdefs.begin(), kdefs.end(),
+      [](const KernelDef* a, const KernelDef* b) { return a->op() < b->op(); });
+
+  std::cout << "**Supported operators for device: " << device << "**\n\n"
+            << "Operator | Type Constraint\n"
+            << "-------- | ---------------" << std::endl;
+  for (const KernelDef* kdef : kdefs) {
+    std::vector<string> constraints;
+    for (const KernelDef::AttrConstraint& constraint : kdef->constraint()) {
+      std::vector<string> types;
+      for (int type : constraint.allowed_values().list().type()) {
+        types.push_back(DataTypeString(static_cast<DataType>(type)));
+      }
+      std::sort(types.begin(), types.end());
+      constraints.push_back("`" + constraint.name() + "={" +
+                            str_util::Join(types, ",") + "}`");
+    }
+    std::cout << "`" << kdef->op() << "` | "
+              << str_util::Join(constraints, "<br>") << std::endl;
+  }
+
+  std::cout << "\nTo regenerate this table, run:\n\n```shell\n"
+            << regen_run << " --device=" << device << "\n```" << std::endl;
+}
+
+}  // namespace
+
+void SupportedOpsMain(int argc, char** argv, const char* regen_run) {
+  std::vector<string> device_names = XlaOpRegistry::BackendNames();
+  std::sort(device_names.begin(), device_names.end());
+
+  // Set up and parse flags.
+  string device;
+  std::vector<Flag> flag_list = {
+      {"device", &device,
+       "Name of the compilation device for which to print supported ops, "
+       "one of: " +
+           str_util::Join(device_names, ",")},
+  };
+  string usage = Flags::Usage(argv[0], flag_list);
+  bool parsed_flags_ok = Flags::Parse(&argc, argv, flag_list);
+  QCHECK(parsed_flags_ok) << "\n" << usage;
+  QCHECK(XlaOpRegistry::IsBackendRegistered(device))
+      << "\nUnknown device: " << device << "\n"
+      << usage;
+
+  // Run the program.
+  port::InitMain(usage.c_str(), &argc, &argv);
+  QCHECK(argc == 1) << "\nERROR: This command does not take any arguments "
+                       "other than flags\n\n"
+                    << usage;
+  PrintSupportedOps(device, regen_run);
+}
+
+}  // namespace tf2xla
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/tf2xla_supported_ops.h b/tensorflow/compiler/tf2xla/tf2xla_supported_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..1b45fb4cdd3b0173b04e130b7416874a9a406dc5
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/tf2xla_supported_ops.h
@@ -0,0 +1,33 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_TF2XLA_SUPPORTED_OPS_H_
+#define TENSORFLOW_COMPILER_TF2XLA_TF2XLA_SUPPORTED_OPS_H_
+
+namespace tensorflow {
+namespace tf2xla {
+
+// The implementation of a main function for a binary that prints a table of
+// supported tf2xla operators for a given device, along with their type
+// constraints, to stdout.
+//
+// Pass the argc and argv from main, unmodified.  Use regen_run to specify the
+// command used to regenerate the table.
+void SupportedOpsMain(int argc, char** argv, const char* regen_run);
+
+}  // namespace tf2xla
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_TF2XLA_SUPPORTED_OPS_H_
diff --git a/tensorflow/compiler/tf2xla/tf2xla_supported_ops_main.cc b/tensorflow/compiler/tf2xla/tf2xla_supported_ops_main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..690666c2400d45e33c1a5d1818b68a86a70a5be3
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/tf2xla_supported_ops_main.cc
@@ -0,0 +1,22 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/tf2xla_supported_ops.h"
+
+int main(int argc, char** argv) {
+  const char* regen_run =
+      "bazel run -c opt -- tensorflow/compiler/tf2xla:tf2xla_supported_ops";
+  tensorflow::tf2xla::SupportedOpsMain(argc, argv, regen_run);
+}
diff --git a/tensorflow/compiler/tf2xla/tf2xla_test.cc b/tensorflow/compiler/tf2xla/tf2xla_test.cc
index ecd15652fe84b0c19d2f7fc18f877236547f9be9..a9978e697b091715ce120f0d18fdddd259e08b32 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_test.cc
@@ -70,10 +70,7 @@ TEST(ConvertGraphDefToXla, Sum) {
 
   xla::LocalClient* client = xla::ClientLibrary::LocalClientOrDie();
   xla::Computation computation;
-  bool requires_runtime_context;
-  TF_EXPECT_OK(ConvertGraphDefToXla(graph_def, config, client, &computation,
-                                    &requires_runtime_context));
-  ASSERT_FALSE(requires_runtime_context);
+  TF_EXPECT_OK(ConvertGraphDefToXla(graph_def, config, client, &computation));
 
   // Set up arguments.
   auto x_literal = xla::Literal::CreateR0<int32>(10);
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc
index 55f2f3149c6ba7bfa18608f961c8a76103a50756..f428a194328935fec1210ea96245344de859e611 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc
@@ -88,8 +88,8 @@ Status ValidateConfig(const tf2xla::Config& config) {
     TF_RETURN_IF_ERROR(CheckNameDuplicates("fetch", fetch.name(), &names));
   }
   TF_RETURN_IF_ERROR(CheckFeedFetchNameConflicts("fetch", names));
-  if (config.feed().empty() || config.fetch().empty()) {
-    return errors::InvalidArgument("feeds and fetches must be specified");
+  if (config.fetch().empty()) {
+    return errors::InvalidArgument("fetches must be specified");
   }
   return Status::OK();
 }
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
index 436039e154842443f779aba276bc571fc2ab7537..ed10d80609641b090cf78bf2e17364fe2fa89c31 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
@@ -58,24 +58,14 @@ TEST(ValidateConfig, Good) {
 
 TEST(ValidateConfig, BadEmpty) {
   tf2xla::Config config;
-  ExpectErrorContains(ValidateConfig(config),
-                      "feeds and fetches must be specified");
-}
-
-TEST(ValidateConfig, BadNoFeed) {
-  tf2xla::Config config;
-  tf2xla::Fetch* fetch = config.add_fetch();
-  fetch->mutable_id()->set_node_name("foo");
-  ExpectErrorContains(ValidateConfig(config),
-                      "feeds and fetches must be specified");
+  ExpectErrorContains(ValidateConfig(config), "fetches must be specified");
 }
 
 TEST(ValidateConfig, BadNoFetch) {
   tf2xla::Config config;
   tf2xla::Feed* feed = config.add_feed();
   feed->mutable_id()->set_node_name("foo");
-  ExpectErrorContains(ValidateConfig(config),
-                      "feeds and fetches must be specified");
+  ExpectErrorContains(ValidateConfig(config), "fetches must be specified");
 }
 
 TEST(ValidateConfig, BadFeedNodeName) {
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
index 4f32c29954b2d809d31ef8c584b6a6c3dcdf5cef..fcb0a4e63814b4afc114bdaea312a92dd8396a2e 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
@@ -100,7 +100,7 @@ void XlaCompilationDevice::Compute(OpKernel* op_kernel,
   b->SetOpMetadata(metadata);
 
   auto sharding_parse_result = ParseShardingFromDevice(
-      op_kernel->requested_device(), std::numeric_limits<int>::max());
+      op_kernel->def(), std::numeric_limits<int>::max());
   OP_REQUIRES_OK(context, sharding_parse_result.status());
   tensorflow::gtl::optional<xla::OpSharding> op_sharding =
       sharding_parse_result.ValueOrDie();
@@ -135,98 +135,4 @@ void XlaExpression::set_constant_value(Tensor value) {
   constant_value_ = std::move(value);
 }
 
-Status XlaResource::GetXlaShape(xla::ComputationBuilder* builder,
-                                xla::Shape* shape) const {
-  auto shape_or_status = builder->GetShape(value);
-  if (!shape_or_status.ok()) {
-    return shape_or_status.status();
-  }
-  *shape = *shape_or_status.ValueOrDie();
-  return Status::OK();
-}
-
-Status XlaResource::GetShape(xla::ComputationBuilder* builder,
-                             TensorShape* shape) const {
-  xla::Shape xla_shape;
-  TF_RETURN_IF_ERROR(GetXlaShape(builder, &xla_shape));
-  TF_RETURN_IF_ERROR(XLAShapeToTensorShape(xla_shape, shape));
-  return Status::OK();
-}
-
-Status XlaResource::GetOrCreateTensorArrayGradient(
-    const string& source, xla::ComputationBuilder* builder,
-    XlaResource** gradient_out) {
-  VLOG(2) << "Gradient lookup for resource: " << name
-          << " gradient: " << source;
-  TF_RET_CHECK(kind == kTensorArray);
-  std::unique_ptr<XlaResource>& gradient = tensor_array_gradients[source];
-  if (!gradient) {
-    gradient.reset(new XlaResource);
-    gradient->kind = XlaResource::kTensorArray;
-    gradient->name = strings::StrCat("TensorArrayGrad: ", name);
-    gradient->type = type;
-    gradient->tensor_array_size = tensor_array_size;
-
-    TensorShape ta_shape;
-    TF_RETURN_IF_ERROR(GetShape(builder, &ta_shape));
-    gradient->value = builder->Broadcast(XlaHelpers::Zero(builder, type),
-                                         ta_shape.dim_sizes());
-    gradient->initial_value = gradient->value;
-  }
-  *gradient_out = gradient.get();
-  return Status::OK();
-}
-
-Status XlaResource::PackedShape(xla::ComputationBuilder* builder,
-                                xla::Shape* packed_shape) const {
-  if (tensor_array_gradients.empty()) {
-    return GetXlaShape(builder, packed_shape);
-  }
-  TF_RET_CHECK(kind == kTensorArray);
-  std::vector<xla::Shape> elem_shapes(1 + tensor_array_gradients.size());
-  int pos = 0;
-  TF_RETURN_IF_ERROR(GetXlaShape(builder, &elem_shapes[pos++]));
-  for (const auto& gradient : tensor_array_gradients) {
-    TF_RETURN_IF_ERROR(
-        gradient.second->GetXlaShape(builder, &elem_shapes[pos++]));
-  }
-  *packed_shape = xla::ShapeUtil::MakeTupleShape(elem_shapes);
-  return Status::OK();
-}
-
-Status XlaResource::Pack(xla::ComputationDataHandle* pack,
-                         xla::ComputationBuilder* builder) const {
-  if (tensor_array_gradients.empty()) {
-    *pack = value;
-  } else {
-    TF_RET_CHECK(kind == kTensorArray);
-    std::vector<xla::ComputationDataHandle> elems;
-    elems.push_back(value);
-    for (const auto& gradient : tensor_array_gradients) {
-      elems.push_back(gradient.second->value);
-    }
-    *pack = builder->Tuple(elems);
-  }
-  return Status::OK();
-}
-
-Status XlaResource::SetFromPack(const std::set<string>& gradient_sources,
-                                const xla::ComputationDataHandle& pack,
-                                xla::ComputationBuilder* builder) {
-  if (gradient_sources.empty()) {
-    value = pack;
-  } else {
-    TF_RET_CHECK(kind == kTensorArray);
-    int pos = 0;
-    value = builder->GetTupleElement(pack, pos++);
-    for (const auto& source : gradient_sources) {
-      XlaResource* gradient;
-      TF_RETURN_IF_ERROR(
-          GetOrCreateTensorArrayGradient(source, builder, &gradient));
-      gradient->value = builder->GetTupleElement(pack, pos++);
-    }
-  }
-  return Status::OK();
-}
-
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.h b/tensorflow/compiler/tf2xla/xla_compilation_device.h
index 6230acd718bc330f178007b575b5119de5b3d4f4..0243ee332fbdca0fe5e28b1a7d9530df4417f807 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.h
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <memory>
 
+#include "tensorflow/compiler/tf2xla/xla_resource.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/common_runtime/local_device.h"
@@ -66,87 +67,6 @@ class XlaCompilationDevice : public LocalDevice {
   std::unique_ptr<XlaCompilationAllocator> allocator_;
 };
 
-// Represents a resource, such as a Variable or TensorArray.
-// TODO(phawkins): make this into a properly abstracted class.
-struct XlaResource {
-  enum Kind {
-    kInvalid,
-    kVariable,
-    kTensorArray,
-    kStack,
-  };
-
-  Kind kind = kInvalid;
-
-  // If this resource is visible externally, what was its argument number?
-  int arg_num = -1;
-
-  // A descriptive name for the resource, used in error messages.
-  string name;
-
-  // Current type and value of the resource. Uninitialized resources are
-  // represented by a default (zero) handle and type DT_INVALID.
-  // While the type of a resource is notionally fixed during execution, when
-  // a resource is first initialized we do not yet know its type, so we keep
-  // track of its type dynamically.
-  DataType type = DT_INVALID;
-  xla::ComputationDataHandle value;
-
-  // Value of the resource at computation entry. Used to detect which
-  // variables have new values that need to be written back.
-  xla::ComputationDataHandle initial_value;
-
-  // TensorArray-specific fields
-
-  // 'tensor_array_size' stores the expected size of the TensorArray. We need
-  // to store this since sometimes TensorArrays must be initialized lazily since
-  // we do not know the element shape at construction time.
-  int64 tensor_array_size = -1;
-
-  // 'tensor_array_gradient' is a map from TensorArrayGradV3 'source' attributes
-  // to an XlaResource containing the gradient TensorArrays. We store a pointer
-  // here since there should only be one gradient TensorArray per 'source'
-  // string, irrespective of the number of calls to TensorArrayGrad. The map
-  // is ordered since values are packed into tuples by Pack() sorted by name
-  // order.
-  std::map<string, std::unique_ptr<XlaResource>> tensor_array_gradients;
-
-  // Returns the shape of the resource as an xla::Shape.
-  Status GetXlaShape(xla::ComputationBuilder* builder, xla::Shape* shape) const;
-
-  // Returns the shape of the resource as an TensorShape. Fails if the shape is
-  // not representable as a TensorShape.
-  Status GetShape(xla::ComputationBuilder* builder, TensorShape* shape) const;
-
-  // Looks up the gradient for `source`, or creates it if it does not already
-  // exist. The call target must be an initialized TensorArray resource. A
-  // TensorArray can have multiple named gradients; see the operator
-  // documentation for TensorArrayGradV3 for details.
-  Status GetOrCreateTensorArrayGradient(const string& source,
-                                        xla::ComputationBuilder* builder,
-                                        XlaResource** gradient_out);
-
-  // Packs a resource into a single XLA value `pack`, suitable for use as
-  // an XlaCompiler::Argument. For non-TensorArrays or TensorArrays without
-  // gradients, sets `*pack` to `value`.
-  // For TensorArrays with gradients, packs the value and its gradient values in
-  // a tuple; the gradients values are packed in order by source name.
-  Status Pack(xla::ComputationDataHandle* pack,
-              xla::ComputationBuilder* builder) const;
-
-  // Returns the shape of the `pack` value computed by `Pack()`.
-  Status PackedShape(xla::ComputationBuilder* builder,
-                     xla::Shape* packed_shape) const;
-
-  // Updates the resource with values from `pack`. If `gradient_sources` is
-  // non-empty, treats `pack` as a tuple that represents a TensorArray and
-  // its gradients, and unpacks and updates the gradient resources. Opposite
-  // of Pack().
-  Status SetFromPack(const std::set<string>& gradient_sources,
-                     const xla::ComputationDataHandle& pack,
-                     xla::ComputationBuilder* builder);
-};
-
 // A XlaExpression wraps an XLA computation. Each Tensor on an
 // XlaCompilationDevice contains an XlaExpression, and the shape of the Tensor
 // matches the shape of the subcomputation in the ComputationDataHandle. Each
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
index b5c17c5273bb15e20184b2fefd93880d4828105e..672e19bd93449ccc31f4af5ded23257b197a3c39 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
@@ -28,9 +28,10 @@ XlaCompiledCpuFunction::XlaCompiledCpuFunction(const StaticData& static_data,
       temps_(new void*[static_data.num_temps]),
       arg_names_(static_data.arg_names),
       result_names_(static_data.result_names),
-      program_shape_(static_data.program_shape) {
+      program_shape_(static_data.program_shape),
+      hlo_profile_printer_data_(static_data.hlo_profile_printer_data) {
   // Allocate arg and temp buffers.
-  if (alloc_mode == AllocMode::ARGS_RESULTS_AND_TEMPS) {
+  if (alloc_mode == AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS) {
     alloc_args_ = tensorflow::tfcompile::runtime::MallocContiguousBuffers(
         static_data.arg_sizes, static_data.num_args, args_,
         /*annotate_initialized=*/false);
@@ -39,9 +40,13 @@ XlaCompiledCpuFunction::XlaCompiledCpuFunction(const StaticData& static_data,
       static_data.temp_sizes, static_data.num_temps, temps_,
       /*annotate_initialized=*/true);
 
-  // The runtime context is always the last arg, if it is required.
-  if (static_data.requires_runtime_context) {
-    args_[static_data.num_args - 1] = &context_;
+  // If Hlo profiling is enabled the generated code expects an appropriately
+  // sized buffer to be passed in as the last argument.  If Hlo profiling is
+  // disabled the last function argument is still present in the function
+  // signature, but it is ignored by the generated code and we pass in null for
+  // it.
+  if (hlo_profiling_enabled()) {
+    profile_counters_ = new int64[static_data.profile_counters_size]();
   }
 }
 
@@ -50,6 +55,7 @@ XlaCompiledCpuFunction::~XlaCompiledCpuFunction() {
   tensorflow::tfcompile::runtime::FreeContiguous(alloc_temps_);
   delete[] args_;
   delete[] temps_;
+  delete[] profile_counters_;
 }
 
 namespace {
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
index f49a7889222ff989144217ab10b27595f89e4311..48a8c083cacf2f6ecf9dc1817b6174c01385d035 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
@@ -16,10 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILED_CPU_FUNCTION_H_
 #define TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILED_CPU_FUNCTION_H_
 
-#include <functional>
+#include <cassert>
 #include <string>
 
-#include "tensorflow/compiler/tf2xla/xla_local_runtime_context.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -27,6 +26,7 @@ limitations under the License.
 // never use this functionality.
 namespace xla {
 class ProgramShape;
+class HloProfilePrinterData;
 }
 
 namespace tensorflow {
@@ -48,12 +48,10 @@ namespace tensorflow {
 class XlaCompiledCpuFunction {
  public:
   // Type of the raw function, produced by either JIT or AOT.
-  //
-  // TODO(toddw): Add support for hlo profiling, and replace std::function with
-  // a raw function pointer, for some codesize savings.
-  using RawFunction = std::function<void(
-      void* result, const xla::ExecutableRunOptions* run_options,
-      const void** args, void** temps)>;
+  using RawFunction = void (*)(void* result,
+                               const xla::ExecutableRunOptions* run_options,
+                               const void** args, void** temps,
+                               int64* profile_counters);
 
   // StaticData represents the state necessary to run an XLA-compiled
   // function. For JIT this is backed by data in XlaJitCompiledCpuFunction; for
@@ -71,9 +69,6 @@ class XlaCompiledCpuFunction {
     // The 0-based index of the result tuple, in the temp buffers.
     size_t result_index = 0;
 
-    // Is the final arg XlaLocalRuntimeContext?
-    bool requires_runtime_context = false;
-
     // [Optional] Arrays of arg and result names. These are arrays of C-style
     // strings, where the array is terminated by nullptr.
     const char** arg_names = nullptr;
@@ -81,21 +76,31 @@ class XlaCompiledCpuFunction {
 
     // [Optional] Arg and result shapes.
     const xla::ProgramShape* program_shape = nullptr;
+
+    // [Optional] Profile printer data.  Null if profiling is disabled.
+    const xla::HloProfilePrinterData* hlo_profile_printer_data = nullptr;
+
+    // [Optional] The number of profile counters expected in the profile counter
+    // buffer by the generated code and hlo_profile_printer.  0 if profiling is
+    // disabled.  This information is already present in
+    // hlo_profile_printer_data but xla::HloProfilePrinterData is forward
+    // declared so we don't have access to that information here.
+    int64 profile_counters_size = 0;
   };
 
   // AllocMode controls the buffer allocation mode.
   enum class AllocMode {
-    // Allocate all buffers - args, results and temps.
-    ARGS_RESULTS_AND_TEMPS,
+    // Allocate all buffers - args, results, profile and temps.
+    ARGS_RESULTS_PROFILES_AND_TEMPS,
 
-    // Only allocate result and temp buffers.
+    // Only allocate result, profile and temp buffers.
     // Use set_arg_data to set argument buffers before Run is called.
-    RESULTS_AND_TEMPS_ONLY,
+    RESULTS_PROFILES_AND_TEMPS_ONLY,
   };
 
   XlaCompiledCpuFunction(
       const StaticData& static_data,
-      AllocMode alloc_mode = AllocMode::ARGS_RESULTS_AND_TEMPS);
+      AllocMode alloc_mode = AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS);
   virtual ~XlaCompiledCpuFunction();
 
   XlaCompiledCpuFunction(const XlaCompiledCpuFunction&) = delete;
@@ -104,21 +109,22 @@ class XlaCompiledCpuFunction {
   // Sets the intra-op thread pool used to run individual ops concurrently.
   void set_thread_pool(const Eigen::ThreadPoolDevice* pool) {
     run_options_.set_intra_op_thread_pool(pool);
-    context_.thread_pool = pool;
   }
 
   // Runs the computation, with inputs read from arg buffers, and outputs
   // written to result buffers. Returns true on success and false on failure.
   bool Run() {
-    context_.error = false;
-    context_.error_msg.clear();
     raw_function_(temps_[result_index_], &run_options_,
-                  const_cast<const void**>(args_), temps_);
-    return !context_.error;
+                  const_cast<const void**>(args_), temps_, profile_counters_);
+    return true;
   }
 
   // Returns the error message from the previous failed Run call.
-  const string& error_msg() const { return context_.error_msg; }
+  //
+  // TODO(fschneider): For now this always returns an empty string because there
+  // is no support for error reporting in XLA. Remove this once all callers are
+  // updated.
+  string error_msg() const { return {}; }
 
   // ------------------------------
   // Arg methods for managing input buffers. Buffers are in row-major order.
@@ -141,10 +147,6 @@ class XlaCompiledCpuFunction {
   // tensorflow::tfcompile::runtime::kAlign. If possible, use the functions in
   // tensorflow/compiler/aot/runtime.h to ensure correct alignment.
   //
-  // If StaticData.requires_runtime_context==true, the final argument is an
-  // XlaLocalRuntimeContext, which is managed internally by this class, and
-  // should not be changed.
-  //
   // Aliasing of argument and result buffers is not allowed, and results in
   // undefined behavior.
   void set_arg_data(size_t index, void* data) { args_[index] = data; }
@@ -162,6 +164,16 @@ class XlaCompiledCpuFunction {
     return static_cast<const void* const*>(temps_[result_index_]);
   }
 
+  // Profile counters for this XLA computation.
+  //
+  // When Hlo profiling is enabled (`hlo_profiling_enabled()` return true in
+  // this case) these counters are non-null and are automatically populated by
+  // `Run`.  The counters can then be pretty-printed using
+  // `hlo_profile_printer()`.
+  //
+  // When Hlo profiling is disabled, this accessor returns null.
+  const int64* profile_counters() const { return profile_counters_; }
+
   // Returns the buffer for the positional result at the given `index`.
   void* result_data(size_t index) { return results()[index]; }
   const void* result_data(size_t index) const { return results()[index]; }
@@ -195,6 +207,14 @@ class XlaCompiledCpuFunction {
   // program shape isn't available.
   const xla::ProgramShape* ProgramShape() const { return program_shape_; }
 
+  bool hlo_profiling_enabled() const {
+    return hlo_profile_printer_data_ != nullptr;
+  }
+  const xla::HloProfilePrinterData& hlo_profile_printer_data() const {
+    assert(hlo_profiling_enabled());
+    return *hlo_profile_printer_data_;
+  }
+
  private:
   const RawFunction raw_function_;
   const size_t result_index_;
@@ -208,14 +228,17 @@ class XlaCompiledCpuFunction {
   void* alloc_args_ = nullptr;
   void* alloc_temps_ = nullptr;
 
+  // Backing memory for profiling counters.
+  int64* profile_counters_ = nullptr;
+
   // Options and context passed to the compiled function.
   xla::ExecutableRunOptions run_options_;
-  tensorflow::XlaLocalRuntimeContext context_;
 
   // Optional metadata.
   const char** arg_names_ = nullptr;
   const char** result_names_ = nullptr;
   const xla::ProgramShape* program_shape_ = nullptr;
+  const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index 48cebdf74c71f974bf075e0255626ec57eb9a149..59e88304422eaeaaf3f63cc4d476a8ec7ce95623 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -66,13 +66,14 @@ Status CheckSignature(const DataTypeVector& types,
 
 bool XlaCompiler::Argument::operator==(
     const XlaCompiler::Argument& other) const {
-  if (std::tie(kind, resource_kind, type, name, tensor_array_size,
+  if (std::tie(kind, resource_kind, type, name, initialized, tensor_array_size,
                tensor_array_gradients) !=
       std::tie(other.kind, other.resource_kind, other.type, other.name,
-               other.tensor_array_size, other.tensor_array_gradients)) {
+               other.initialized, other.tensor_array_size,
+               other.tensor_array_gradients)) {
     return false;
   }
-  if (!xla::ShapeUtil::Equal(shape, other.shape)) {
+  if (shape != other.shape) {
     return false;
   }
   if (constant_value.shape() != other.constant_value.shape()) {
@@ -152,7 +153,8 @@ std::unique_ptr<Graph> XlaCompiler::GetGraph(const FunctionBody* fbody) {
   std::unique_ptr<Graph> graph(new Graph(options_.flib_def));
   CopyGraph(*fbody->graph, graph.get());
   OptimizerOptions opts;
-  opts.set_do_common_subexpression_elimination(true);
+  opts.set_opt_level(OptimizerOptions::L0);
+  opts.set_do_common_subexpression_elimination(false);
   opts.set_do_function_inlining(true);
   opts.set_do_constant_folding(true);
   GraphOptimizer optimizer(opts);
@@ -183,8 +185,7 @@ Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options,
       CheckSignature(fbody->arg_types, args),
       "Signature check failure while compiling: ", function.name());
 
-  std::unique_ptr<Graph> graph(new Graph(options_.flib_def));
-  CopyGraph(*fbody->graph, graph.get());
+  std::unique_ptr<Graph> graph = GetGraph(fbody);
 
   // _Arg and _Retval nodes don't exist in the stored subgraph for the function;
   // they are added by the function body looked up.  Therefore, they don't have
@@ -212,15 +213,6 @@ Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options,
                    *graph);
   }
 
-  // Optimize the graph before running the compiler.
-  OptimizerOptions opts;
-  opts.set_do_common_subexpression_elimination(true);
-  opts.set_do_function_inlining(true);
-  opts.set_do_constant_folding(true);
-  GraphOptimizer optimizer(opts);
-  optimizer.Optimize(flib_runtime_, flib_runtime_->env(),
-                     /*device=*/nullptr, &graph, /*shape_map=*/nullptr);
-
   VLOG(1) << "====================================================";
   TF_RETURN_IF_ERROR(
       CompileGraph(options, function_id, std::move(graph), args, result));
@@ -230,6 +222,64 @@ Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options,
   return Status::OK();
 }
 
+// Computes the XLA shape for argument 'arg'.
+/*static*/ Status XlaCompiler::XLAShapeForArgument(
+    const XlaCompiler::Argument& arg, xla::Shape* xla_shape) {
+  switch (arg.kind) {
+    case XlaCompiler::Argument::kConstant:
+      return TensorShapeToXLAShape(arg.type, arg.constant_value.shape(),
+                                   xla_shape);
+    case XlaCompiler::Argument::kParameter:
+      return TensorShapeToXLAShape(arg.type, arg.shape, xla_shape);
+    case XlaCompiler::Argument::kResource: {
+      TF_RET_CHECK(arg.initialized);
+
+      switch (arg.resource_kind) {
+        case XlaResource::kVariable:
+          return TensorShapeToXLAShape(arg.type, arg.shape, xla_shape);
+        case XlaResource::kTensorArray: {
+          if (arg.tensor_array_size < 0) {
+            return errors::InvalidArgument(
+                "Negative tensor_array_size in XLAShapeForArgument");
+          }
+          TensorShape shape;
+          shape.AddDim(arg.tensor_array_size);
+          shape.AppendShape(arg.shape);
+          TF_RETURN_IF_ERROR(TensorShapeToXLAShape(arg.type, shape, xla_shape));
+
+          if (!arg.tensor_array_gradients.empty()) {
+            std::vector<xla::Shape> tuple_shape(
+                arg.tensor_array_gradients.size() + 1, *xla_shape);
+            *xla_shape = xla::ShapeUtil::MakeTupleShape(tuple_shape);
+          }
+          return Status::OK();
+        }
+        case XlaResource::kStack: {
+          if (arg.tensor_array_size < 0) {
+            return errors::InvalidArgument(
+                "Negative tensor_array_size in XLAShapeForArgument");
+          }
+          TensorShape shape;
+          shape.AddDim(arg.tensor_array_size);
+          shape.AppendShape(arg.shape);
+          xla::Shape buffer_shape;
+          TF_RETURN_IF_ERROR(
+              TensorShapeToXLAShape(arg.type, shape, &buffer_shape));
+          *xla_shape = xla::ShapeUtil::MakeTupleShape(
+              {buffer_shape, xla::ShapeUtil::MakeShape(xla::S32, {})});
+          return Status::OK();
+        }
+
+        case XlaResource::kInvalid:
+          return errors::Internal(
+              "Invalid resource type in XLAShapeForArgument()");
+      }
+    }
+    case XlaCompiler::Argument::kInvalid:
+      return errors::Internal("Invalid argument type in XLAShapeForArgument()");
+  }
+}
+
 namespace {
 
 Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr<Graph> graph,
@@ -268,14 +318,16 @@ Status BuildArguments(const Graph& graph,
                       XlaContext* context, std::vector<int>* arg_cores,
                       std::vector<XlaExpression>* arg_expressions,
                       std::vector<int>* input_mapping,
-                      std::vector<xla::Shape>* input_shapes) {
+                      std::vector<xla::Shape>* input_shapes,
+                      bool is_entry_computation) {
   arg_expressions->resize(args.size());
   *arg_cores = std::vector<int>(args.size(), -1);
 
   // Argument numbers of arguments and resources that are to be passed to the
   // XLA computation as runtime parameters.
-  std::vector<int> parameters, resources;
-  parameters.reserve(args.size());
+  input_mapping->clear();
+  input_mapping->reserve(args.size());
+  std::vector<int> resources;
   resources.reserve(args.size());
 
   // Fills in constant arguments, and computes non-constant argument order.
@@ -289,18 +341,20 @@ Status BuildArguments(const Graph& graph,
         // TODO(phawkins): this code assumes that resource arguments do not
         // alias.
         XlaResource* resource;
-        TF_RETURN_IF_ERROR(
-            context->CreateResource(arg.resource_kind, i, arg.name, arg.type,
-                                    xla::ComputationDataHandle(), &resource));
-        resource->tensor_array_size = arg.tensor_array_size;
+        TF_RETURN_IF_ERROR(context->CreateResource(
+            arg.resource_kind, i, arg.name, arg.type, arg.shape,
+            xla::ComputationDataHandle(),
+            /*tensor_array_size=*/arg.tensor_array_size,
+            /*tensor_array_gradients=*/arg.tensor_array_gradients, &resource));
         arg_expression.set_resource(resource);
         if (arg.initialized) {
           resources.push_back(i);
         }
         break;
-      case XlaCompiler::Argument::kParameter:
-        parameters.push_back(i);
+      case XlaCompiler::Argument::kParameter: {
+        input_mapping->push_back(i);
         break;
+      }
       case XlaCompiler::Argument::kConstant:
         arg_expression.set_constant_value(arg.constant_value);
         break;
@@ -311,18 +365,23 @@ Status BuildArguments(const Graph& graph,
 
   // Append parameters containing variable values after the other runtime
   // parameters.
-  parameters.insert(parameters.end(), resources.begin(), resources.end());
-  if (parameters.empty()) {
+  input_mapping->insert(input_mapping->end(), resources.begin(),
+                        resources.end());
+  if (input_mapping->empty()) {
     return Status::OK();
   }
 
-  input_shapes->resize(parameters.size());
-  input_mapping->resize(parameters.size());
-  for (std::vector<int>::size_type i = 0; i < parameters.size(); ++i) {
-    const XlaCompiler::Argument& arg = args[parameters[i]];
+  std::vector<xla::Shape> arg_shapes(input_mapping->size());
+  for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
     // Computes the shapes of non-constant arguments.
-    (*input_shapes)[i] = arg.shape;
-    (*input_mapping)[i] = parameters[i];
+    TF_RETURN_IF_ERROR(XlaCompiler::XLAShapeForArgument(
+        args[(*input_mapping)[i]], &arg_shapes[i]));
+  }
+
+  if (use_tuple_arg) {
+    input_shapes->push_back(xla::ShapeUtil::MakeTupleShape(arg_shapes));
+  } else {
+    *input_shapes = arg_shapes;
   }
 
   // Use the _Arg nodes in the graph to resolve core assignments.
@@ -346,24 +405,38 @@ Status BuildArguments(const Graph& graph,
   }
 
   // Build parameter handles for non-constant arguments.
-  std::vector<xla::ComputationDataHandle> arg_handles(parameters.size());
+  std::vector<xla::ComputationDataHandle> arg_handles(input_mapping->size());
   if (use_tuple_arg) {
-    xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(*input_shapes);
-    xla::ComputationDataHandle tuple =
-        builder->Parameter(0, tuple_shape, "arg_tuple");
-    for (std::vector<int>::size_type i = 0; i < parameters.size(); ++i) {
-      const int core = (*arg_cores)[parameters[i]];
+    xla::ComputationDataHandle tuple;
+    if (is_entry_computation) {
+      xla::OpSharding tuple_sharding;
+      tuple_sharding.set_type(xla::OpSharding::Type::OpSharding_Type_TUPLE);
+      for (int64 parameter : *input_mapping) {
+        const int core = (*arg_cores)[parameter];
+        const int root_device = 0;
+        *tuple_sharding.add_tuple_shardings() =
+            core == -1 ? xla::sharding_builder::AssignDevice(root_device)
+                       : xla::sharding_builder::AssignDevice(core);
+      }
+      xla::ScopedShardingAssignment assign_tuple_sharding(builder,
+                                                          tuple_sharding);
+      tuple = builder->Parameter(0, (*input_shapes)[0], "arg_tuple");
+    } else {
+      tuple = builder->Parameter(0, (*input_shapes)[0], "arg_tuple");
+    }
+    for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
+      const int core = (*arg_cores)[input_mapping->at(i)];
       xla::ScopedShardingAssignment assign_sharding(
           builder, core == -1 ? tensorflow::gtl::optional<xla::OpSharding>()
-                              : xla::ShardingBuilder::AssignDevice(core));
+                              : xla::sharding_builder::AssignDevice(core));
       arg_handles[i] = builder->GetTupleElement(tuple, i);
     }
   } else {
-    for (std::vector<int>::size_type i = 0; i < parameters.size(); ++i) {
-      const int core = (*arg_cores)[parameters[i]];
+    for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
+      const int core = (*arg_cores)[input_mapping->at(i)];
       xla::ScopedShardingAssignment assign_sharding(
           builder, core == -1 ? tensorflow::gtl::optional<xla::OpSharding>()
-                              : xla::ShardingBuilder::AssignDevice(core));
+                              : xla::sharding_builder::AssignDevice(core));
       arg_handles[i] =
           builder->Parameter(i, (*input_shapes)[i], strings::StrCat("arg", i));
     }
@@ -371,12 +444,12 @@ Status BuildArguments(const Graph& graph,
 
   // Fill in the handles in non-constant arguments.
   VLOG(2) << "XLA computation inputs:";
-  for (std::vector<int>::size_type i = 0; i < parameters.size(); ++i) {
-    const XlaCompiler::Argument& arg = args[parameters[i]];
+  for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
+    const XlaCompiler::Argument& arg = args[input_mapping->at(i)];
     VLOG(2) << "  XLA arg " << i
-            << " shape: " << xla::ShapeUtil::HumanString((*input_shapes)[i])
-            << " name: " << arg.name << " TF arg " << parameters[i];
-    XlaExpression& arg_expression = (*arg_expressions)[parameters[i]];
+            << " shape: " << xla::ShapeUtil::HumanString(arg_shapes[i])
+            << " name: " << arg.name << " TF arg " << input_mapping->at(i);
+    XlaExpression& arg_expression = (*arg_expressions)[input_mapping->at(i)];
     switch (arg.kind) {
       case XlaCompiler::Argument::kResource: {
         TF_RET_CHECK(arg.initialized);
@@ -385,10 +458,6 @@ Status BuildArguments(const Graph& graph,
                                                  arg_handles[i], builder));
         VLOG(2) << "    resource: num_gradients: "
                 << arg.tensor_array_gradients.size();
-        resource->initial_value = resource->value;
-        for (const auto& gradient : resource->tensor_array_gradients) {
-          gradient.second->initial_value = gradient.second->value;
-        }
         break;
       }
       case XlaCompiler::Argument::kParameter:
@@ -439,43 +508,44 @@ Status BuildComputation(
   std::vector<const XlaResource*> arg_resources;
   arg_resources.reserve(resources.size());
   for (const auto& resource : resources) {
-    if (resource->arg_num >= 0) {
+    if (resource->arg_num() >= 0) {
       arg_resources.push_back(resource.get());
     }
   }
   std::sort(arg_resources.begin(), arg_resources.end(),
             [](const XlaResource* a, const XlaResource* b) {
-              return a->arg_num < b->arg_num;
+              return a->arg_num() < b->arg_num();
             });
 
   for (const XlaResource* resource : arg_resources) {
-    const XlaCompiler::Argument& arg = args[resource->arg_num];
-    const int core = arg_cores[resource->arg_num];
-    DCHECK_LT(resource->arg_num, arg_cores.size());
+    const XlaCompiler::Argument& arg = args[resource->arg_num()];
+    const int core = arg_cores[resource->arg_num()];
+    DCHECK_LT(resource->arg_num(), arg_cores.size());
     bool modified =
-        resource->value.handle() != resource->initial_value.handle();
+        resource->value().handle() != resource->initial_value().handle();
     // TensorArray gradients were modified if their values changed or there are
     // any newly created gradients.
-    for (const auto& grad : resource->tensor_array_gradients) {
-      modified =
-          modified ||
-          grad.second->value.handle() != grad.second->initial_value.handle() ||
-          arg.tensor_array_gradients.count(grad.first) == 0;
+    for (const auto& grad : resource->tensor_array_gradients()) {
+      modified = modified ||
+                 grad.second->value().handle() !=
+                     grad.second->initial_value().handle() ||
+                 arg.tensor_array_gradients.count(grad.first) == 0;
     }
     if (return_updated_values_for_all_resources || modified) {
       resource_updates->emplace_back();
       XlaCompiler::ResourceUpdate& update = resource_updates->back();
-      update.input_index = resource->arg_num;
-      update.type = resource->type;
+      update.input_index = resource->arg_num();
+      update.type = resource->type();
+      update.shape = resource->shape();
       update.modified = modified;
-      for (const auto& grad : resource->tensor_array_gradients) {
+      for (const auto& grad : resource->tensor_array_gradients()) {
         update.tensor_array_gradients_accessed.insert(grad.first);
       }
 
       // Request that the value be returned on a specific core.
       xla::ScopedShardingAssignment assign_sharding(
           builder, core == -1 ? tensorflow::gtl::optional<xla::OpSharding>()
-                              : xla::ShardingBuilder::AssignDevice(core));
+                              : xla::sharding_builder::AssignDevice(core));
 
       xla::ComputationDataHandle handle;
       TF_RETURN_IF_ERROR(resource->Pack(&handle, builder));
@@ -502,18 +572,6 @@ Status BuildComputation(
   return Status::OK();
 }
 
-void AssignMajorToMinorLayout(xla::Shape* shape) {
-  if (xla::ShapeUtil::IsTuple(*shape)) {
-    for (xla::Shape& elem_shape : *shape->mutable_tuple_shapes()) {
-      AssignMajorToMinorLayout(&elem_shape);
-    }
-  } else {
-    auto& minor_to_major = *shape->mutable_layout()->mutable_minor_to_major();
-    minor_to_major.Resize(xla::ShapeUtil::Rank(*shape), 0);
-    std::iota(minor_to_major.rbegin(), minor_to_major.rend(), 0);
-  }
-}
-
 }  // namespace
 
 Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
@@ -543,13 +601,12 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
                      options.resolve_compile_time_constants);
   core::ScopedUnref context_unref(context);
 
-  result->tuple_arg = options.use_tuple_arg;
-
   std::vector<XlaExpression> arg_expressions;
   std::vector<int> arg_cores;
-  TF_RETURN_IF_ERROR(BuildArguments(
-      *graph, args, options.use_tuple_arg, &builder, context, &arg_cores,
-      &arg_expressions, &result->input_mapping, &result->xla_input_shapes));
+  TF_RETURN_IF_ERROR(
+      BuildArguments(*graph, args, options.use_tuple_arg, &builder, context,
+                     &arg_cores, &arg_expressions, &result->input_mapping,
+                     &result->xla_input_shapes, options.is_entry_computation));
   context->set_args(std::move(arg_expressions));
 
   TF_RETURN_IF_ERROR(ExecuteGraph(context, std::move(graph), device_,
@@ -564,11 +621,6 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
       result->computation.get(), &num_computation_outputs,
       &num_nonconst_outputs, &result->resource_updates));
 
-  result->requires_runtime_context = context->has_context_parameter();
-
-  // Tuple arguments and runtime context parameters are incompatible.
-  TF_RET_CHECK(!(options.use_tuple_arg && result->requires_runtime_context));
-
   VLOG(2) << "Outputs: total: " << context->retvals().size()
           << " nonconstant: " << num_nonconst_outputs;
   result->outputs.resize(context->retvals().size());
@@ -596,7 +648,7 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
           << xla::ShapeUtil::HumanString(result->xla_output_shape);
 
   // Tensorflow expects a major-to-minor order of results.
-  AssignMajorToMinorLayout(&result->xla_output_shape);
+  xla::LayoutUtil::SetToDefaultLayout(&result->xla_output_shape);
 
   // Converts the output shapes to TensorShapes.
   int computation_output = 0;
@@ -615,13 +667,6 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
       ++computation_output;
     }
   }
-
-  for (std::vector<ResourceUpdate>::size_type i = 0;
-       i < result->resource_updates.size(); ++i) {
-    result->resource_updates[i].shape = xla::ShapeUtil::GetTupleElementShape(
-        result->xla_output_shape, computation_output);
-    ++computation_output;
-  }
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index ac7d4cfb127d1de8c92f3a855191c45af77888ad..b86c82c0ab5ce379d35a13043857f459199e2ad2 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -54,8 +54,6 @@ namespace tensorflow {
 //   +---------------------+-----------------------------------------+
 // Within each block, the arguments are arranged by the _Arg index from which
 // they were derived.
-// If `Options::requires_runtime_context` is true, then an additional runtime
-// context argument is passed as a final argument.
 //
 // The run-time outputs of the XLA computation are arranged in the following
 // order:
@@ -106,9 +104,17 @@ class XlaCompiler {
     // is the type of the variable's value, not DT_RESOURCE.
     DataType type;
 
-    // The shape of the argument. If the argument is a resource, this is the
-    // shape of the resource's value.
-    xla::Shape shape;
+    // The shape of the argument. For:
+    // * a parameter: the shape of the parameter.
+    // * a constant: ignored; the shape given by constant_value is used
+    //     instead.
+    // * an uninitialized resource: ignored. We don't yet know the shape of an
+    //     uninitialized resource (otherwise we would have initialized it!)
+    // * an initialized variable: the shape of the variable's value.
+    // * an initialized TensorArray or Stack resource: the shape of an entry in
+    //   the TensorArray/Stack. Note this is the size of a single entry, not the
+    //   XLA data structure that represents the complete stack/array.
+    TensorShape shape;
 
     // The value of the argument, if it is a compile-time constant. Must be a
     // host-memory tensor.
@@ -154,6 +160,10 @@ class XlaCompiler {
     // as Tensors at compile-time, rather than as run-time outputs of the
     // computation.
     bool resolve_compile_time_constants = true;
+
+    // True when compiling the entry computation, false for subcomputations
+    // (while, call, etc.)
+    bool is_entry_computation = true;
   };
 
   struct OutputDescription {
@@ -173,8 +183,9 @@ class XlaCompiler {
     int input_index;
 
     // Type and shape of the tensor to be written back.
+    // The `shape` field has the same meaning as the Argument::shape field.
     DataType type;
-    xla::Shape shape;
+    TensorShape shape;
 
     // Was the value of the variable modified by the computation?
     // (Always true, unless `return_updated_values_for_all_resources` is true.)
@@ -191,16 +202,9 @@ class XlaCompiler {
     // original arguments, and are not necessarily in the same order.)
     std::vector<int> input_mapping;
 
-    // Does the computation require the local runtime context to be passed as
-    // the last argument?
-    bool requires_runtime_context = false;
-
     // Input shapes of the computation.
     std::vector<xla::Shape> xla_input_shapes;
 
-    // Should the arguments be packed into a single tuple?
-    bool tuple_arg;
-
     // Output shape in XLA format. The output shape is always a tuple.
     xla::Shape xla_output_shape;
 
@@ -232,8 +236,7 @@ class XlaCompiler {
     int graph_def_version = TF_GRAPH_DEF_VERSION;
 
     // If 'allow_cpu_custom_calls' is true, kernels may make use of CustomCall()
-    // for CPU; additionally, an optional XlaLocalRuntimeContext* may be passed
-    // to the computation.
+    // for CPU.
     bool allow_cpu_custom_calls = false;
 
     // If not nullptr, populate_resource_manager is called with the
@@ -241,6 +244,19 @@ class XlaCompiler {
     // device is created, and can be used to create metadata objects
     // that can be accessed by XLA op kernels.
     std::function<Status(ResourceMgr*)>* populate_resource_manager = nullptr;
+
+    // If not nullptr, this memory allocator can be used by the compiler for
+    // temporary allocations it might want to make during compilation.
+    //
+    // For example, the compiler may want to try out different algorithms and
+    // choose the fastest one, and it might run those algorithms over buffers
+    // created using this allocator.
+    //
+    // The compiler can function correctly without an explicit allocator given
+    // here, but on some devices (notably, GPUs), TensorFlow tends to eagerly
+    // allocate most or all available memory on the device, leaving none for the
+    // compiler to access, unless it can use TensorFlow's allocator.
+    xla::DeviceMemoryAllocator* device_allocator = nullptr;
   };
 
   explicit XlaCompiler(Options options);
@@ -259,11 +275,10 @@ class XlaCompiler {
                       const std::vector<Argument>& args,
                       CompilationResult* result);
 
-  Status PrepareArguments(xla::ComputationBuilder* builder, NameAttrList func,
-                          const std::vector<DataType>& types,
-                          const std::vector<TensorShape>& shapes,
-                          const std::vector<const XlaExpression*>& expressions,
-                          std::vector<Argument>* args);
+  // Returns the shape of the XLA parameter for an argument 'arg'.
+  // See the class comment for more details about the argument passing
+  // convention.
+  static Status XLAShapeForArgument(const Argument& arg, xla::Shape* xla_shape);
 
   // Retrieves the channel handle associated with `key`. Allocates
   // a new channel handle if none exists.
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index 93aae8485d157cd4afbf804d695d5c0ab8d7946c..65de4dbad75b7fb76a041bc799fc31dc5cb80d74 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -191,10 +191,10 @@ TEST_F(XlaCompilerTest, Simple) {
   std::vector<XlaCompiler::Argument> args(2);
   args[0].kind = XlaCompiler::Argument::kParameter;
   args[0].type = DT_INT32;
-  args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2});
+  args[0].shape = TensorShape({2});
   args[1].kind = XlaCompiler::Argument::kParameter;
   args[1].type = DT_INT32;
-  args[1].shape = xla::ShapeUtil::MakeShape(xla::S32, {2});
+  args[1].shape = TensorShape({2});
 
   // Compiles the graph.
   XlaCompiler compiler(DefaultOptions());
@@ -227,6 +227,42 @@ TEST_F(XlaCompilerTest, Simple) {
   xla::LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal);
 }
 
+TEST_F(XlaCompilerTest, HasSaneErrorOnNonCompileTimeConstantInputToReshape) {
+  // Builds a graph that adds reshapes a tensor, but with the shape not
+  // statically known.
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0);
+  auto b = ops::_Arg(scope.WithOpName("B"), DT_INT32, 1);
+  auto c = ops::Reshape(scope.WithOpName("C"), a, b);
+  auto d = ops::_Retval(scope.WithOpName("D"), c, 0);
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(scope.ToGraph(graph.get()));
+
+  // Builds a description of the arguments.
+  std::vector<XlaCompiler::Argument> args(2);
+  args[0].kind = XlaCompiler::Argument::kParameter;
+  args[0].type = DT_INT32;
+  args[0].shape = TensorShape({2});
+  args[1].kind = XlaCompiler::Argument::kParameter;
+  args[1].type = DT_INT32;
+  args[1].shape = TensorShape({2});
+
+  // Compiles the graph.
+  XlaCompiler compiler(DefaultOptions());
+
+  XlaCompiler::CompilationResult result;
+  Status status =
+      compiler.CompileGraph(XlaCompiler::CompileOptions(), "reshape",
+                            std::move(graph), args, &result);
+  EXPECT_FALSE(status.ok());
+  EXPECT_TRUE(
+      StringPiece(status.error_message()).contains("depends on a parameter"))
+      << status.error_message();
+  EXPECT_TRUE(
+      StringPiece(status.error_message()).contains("[[Node: C = Reshape"))
+      << status.error_message();
+}
+
 // Tests handling of compile-time constant outputs.
 TEST_F(XlaCompilerTest, ConstantOutputs) {
   // Builds a graph with one compile-time constant output and one data-dependent
@@ -245,7 +281,7 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
   std::vector<XlaCompiler::Argument> args(1);
   args[0].kind = XlaCompiler::Argument::kParameter;
   args[0].type = DT_INT32;
-  args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2});
+  args[0].shape = TensorShape({2});
 
   XlaCompiler::Options options = DefaultOptions();
   XlaCompiler compiler(options);
@@ -337,7 +373,7 @@ TEST_F(XlaCompilerTest, ResourceManager) {
   std::vector<XlaCompiler::Argument> args(1);
   args[0].kind = XlaCompiler::Argument::kParameter;
   args[0].type = DT_INT32;
-  args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2});
+  args[0].shape = TensorShape({2});
 
   DummyResourceForTest* resource = new DummyResourceForTest();
 
@@ -384,7 +420,7 @@ TEST_F(XlaCompilerTest, DeterministicCompilation) {
     std::vector<XlaCompiler::Argument> args(1);
     args[0].kind = XlaCompiler::Argument::kParameter;
     args[0].type = DT_INT32;
-    args[0].shape = xla::ShapeUtil::MakeShape(xla::S32, {2});
+    args[0].shape = TensorShape({2});
 
     // Compiles the graph.
     auto options = DefaultOptions();
@@ -436,9 +472,7 @@ TEST_F(XlaCompilerTest, CanPassTensorArraysToAndFromComputation) {
   args[0].resource_kind = XlaResource::kTensorArray;
   args[0].initialized = true;
   args[0].type = DT_INT32;
-  args[0].shape = xla::ShapeUtil::MakeTupleShape(
-      {xla::ShapeUtil::MakeShape(xla::S32, {2}),
-       xla::ShapeUtil::MakeShape(xla::S32, {2})});
+  args[0].shape = TensorShape({});
   args[0].tensor_array_size = 2;
   args[0].tensor_array_gradients = {"grad2"};
 
@@ -504,9 +538,7 @@ TEST_F(XlaCompilerTest, UnwrittenTensorArrayGradientsAreNotComputationOutputs) {
   args[0].resource_kind = XlaResource::kTensorArray;
   args[0].initialized = true;
   args[0].type = DT_INT32;
-  args[0].shape = xla::ShapeUtil::MakeTupleShape(
-      {xla::ShapeUtil::MakeShape(xla::S32, {2}),
-       xla::ShapeUtil::MakeShape(xla::S32, {2})});
+  args[0].shape = TensorShape({});
   args[0].tensor_array_size = 2;
   args[0].tensor_array_gradients = {"grad1"};
 
@@ -538,9 +570,7 @@ TEST_F(XlaCompilerTest, NewTensorArrayGradientsAreComputationOutputs) {
   args[0].resource_kind = XlaResource::kTensorArray;
   args[0].initialized = true;
   args[0].type = DT_INT32;
-  args[0].shape = xla::ShapeUtil::MakeTupleShape(
-      {xla::ShapeUtil::MakeShape(xla::S32, {2}),
-       xla::ShapeUtil::MakeShape(xla::S32, {2})});
+  args[0].shape = TensorShape({});
   args[0].tensor_array_size = 2;
   args[0].tensor_array_gradients = {"grad1"};
 
diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc
index 651bafd6c5d946adfedd63ebbe93e4ea016f0b37..73878955e3fd54c103c0b07faf7f5ee5bcd84de0 100644
--- a/tensorflow/compiler/tf2xla/xla_context.cc
+++ b/tensorflow/compiler/tf2xla/xla_context.cc
@@ -70,24 +70,6 @@ XlaContext::XlaContext(XlaCompiler* compiler, xla::ComputationBuilder* builder,
       allow_cpu_custom_calls_(allow_cpu_custom_calls),
       resolve_compile_time_constants_(resolve_compile_time_constants) {}
 
-const xla::ComputationDataHandle&
-XlaContext::GetOrCreateRuntimeContextParameter() {
-  CHECK(allow_cpu_custom_calls_);
-  if (has_context_parameter_) return context_parameter_;
-  has_context_parameter_ = true;
-
-  // Allocate the next available parameter for the context parameter.
-  int num_parameters = 0;
-  for (const XlaExpression& arg : args_) {
-    if (!arg.has_constant_value()) {
-      ++num_parameters;
-    }
-  }
-  context_parameter_ = builder_->Parameter(
-      num_parameters, xla::ShapeUtil::MakeOpaqueShape(), "tf_context");
-  return context_parameter_;
-}
-
 string XlaContext::DebugString() { return "TLA JIT context"; }
 
 // This is called by the Retval Op to associate a computed value
@@ -121,18 +103,15 @@ Status XlaContext::AddConstRetval(int retval_index, DataType dtype,
 
 xla::ComputationBuilder* XlaContext::builder() { return builder_; }
 
-Status XlaContext::CreateResource(XlaResource::Kind kind, int arg_num,
-                                  string name, DataType type,
-                                  const xla::ComputationDataHandle& handle,
-                                  XlaResource** resource) {
-  resources_.emplace_back(new XlaResource);
+Status XlaContext::CreateResource(
+    XlaResource::Kind kind, int arg_num, string name, DataType type,
+    TensorShape shape, const xla::ComputationDataHandle& handle,
+    int64 tensor_array_size, const std::set<string>& tensor_array_gradients,
+    XlaResource** resource) {
+  resources_.emplace_back(
+      new XlaResource(kind, arg_num, std::move(name), type, std::move(shape),
+                      handle, tensor_array_size, tensor_array_gradients));
   *resource = resources_.back().get();
-  XlaResource& r = **resource;
-  r.kind = kind;
-  r.arg_num = arg_num;
-  r.name = std::move(name);
-  r.type = type;
-  r.initial_value = r.value = handle;
   return Status::OK();
 }
 
@@ -178,6 +157,20 @@ const xla::Computation* XlaContext::GetOrCreateAdd(const DataType type) {
   });
 }
 
+const xla::Computation* XlaContext::GetOrCreateMul(const DataType type) {
+  return LookupOrCreate(type, &mul_func_, [this, type] {
+    const string type_string = DataTypeString(type);
+    VLOG(1) << "Building Mul() for " << type_string;
+    xla::ComputationBuilder b(builder()->client(), "mul<" + type_string + ">");
+    xla::PrimitiveType xla_type;
+    TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type));
+    auto x = b.Parameter(0, xla::ShapeUtil::MakeShape(xla_type, {}), "x");
+    auto y = b.Parameter(1, xla::ShapeUtil::MakeShape(xla_type, {}), "y");
+    b.Mul(x, y);
+    return b.Build().ConsumeValueOrDie();
+  });
+}
+
 const xla::Computation* XlaContext::LookupOrCreate(
     DataType type, ComputationMap* out,
     const std::function<xla::Computation()>& create) {
diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h
index de8aafa3628e6eebdabbc508cd95a2ac86e3472f..fac0352ae81e24597e1045981ac47a7cd09481da 100644
--- a/tensorflow/compiler/tf2xla/xla_context.h
+++ b/tensorflow/compiler/tf2xla/xla_context.h
@@ -56,15 +56,10 @@ class XlaContext : public ResourceBase {
   xla::ComputationBuilder* builder();
 
   bool allow_cpu_custom_calls() const { return allow_cpu_custom_calls_; }
-  bool has_context_parameter() const { return has_context_parameter_; }
 
   const std::vector<XlaExpression>& args() const { return args_; }
   void set_args(std::vector<XlaExpression> args);
 
-  // Get the runtime context parameter, adding one if it does not already exist.
-  // Dies if not compiling a local executable.
-  const xla::ComputationDataHandle& GetOrCreateRuntimeContextParameter();
-
   const std::vector<XlaExpression>& retvals() { return retvals_; }
 
   // This is called by the Retval Op to associate a computed value
@@ -76,11 +71,15 @@ class XlaContext : public ResourceBase {
   Status AddConstRetval(int retval_index, DataType dtype,
                         const xla::Literal& literal);
 
-  // Creates a resource with resource `kind` and initial type `type` and
-  // value `handle`. `name` is a descriptive name for use in error messages.
+  // Creates a resource with resource `kind` and initial value `handle`. `name`
+  // is a descriptive name for use in error messages. See the `XlaResource`
+  // constructor for a description of the remaining arguments.
   // Fails if the resource already exists.
   Status CreateResource(XlaResource::Kind kind, int arg_num, string name,
-                        DataType type, const xla::ComputationDataHandle& handle,
+                        DataType type, TensorShape shape,
+                        const xla::ComputationDataHandle& handle,
+                        int64 tensor_array_size,
+                        const std::set<string>& tensor_array_gradients,
                         XlaResource** resource);
 
   const std::vector<std::unique_ptr<XlaResource>>& resources() {
@@ -102,6 +101,11 @@ class XlaContext : public ResourceBase {
   // separate specialization of the computation for each DataType.
   const xla::Computation* GetOrCreateAdd(const DataType type);
 
+  // Get an XLA lambda to compute Mul. This is cached in the
+  // XlaContext since it may be used by multiple Ops. There is a
+  // separate specialization of the computation for each DataType.
+  const xla::Computation* GetOrCreateMul(const DataType type);
+
   // The name of the XlaContext resource during symbolic graph execution.
   static const char kXlaContextResourceName[];
 
@@ -116,16 +120,9 @@ class XlaContext : public ResourceBase {
   const bool allow_cpu_custom_calls_;
 
   // If true, constant return values are returned as Tensors instead of
-  // run-time computation outptus.
+  // run-time computation outputs.
   const bool resolve_compile_time_constants_;
 
-  // When 'has_context_parameter_' is true, this is the computation handle
-  // for an additional final parameter to the computation, through which will be
-  // passed a XlaLocalRuntimeContext* at runtime. Created on demand by
-  // GetOrCreateRuntimeContextParameter().
-  bool has_context_parameter_ = false;
-  xla::ComputationDataHandle context_parameter_;
-
   // Arguments to the Tensorflow graph, indexed by _Arg index.
   // Includes both compile-time constant arguments and runtime parameters.
   std::vector<XlaExpression> args_;
@@ -155,6 +152,9 @@ class XlaContext : public ResourceBase {
   // Cached computation to compute Sum of two elements, specialized by type.
   ComputationMap add_func_;
 
+  // Cached computation to compute Mul of two elements, specialized by type.
+  ComputationMap mul_func_;
+
   // Cached computation to compute Sigmoid of an element, specialized by type.
   ComputationMap sigmoid_func_;
 
diff --git a/tensorflow/compiler/tf2xla/xla_gpu_backend.cc b/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
index d504613d232c779e47a506657d2825d052e726dc..8ca757e72355d890c13b8b448d35c327d3986696 100644
--- a/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
+++ b/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
@@ -21,8 +21,6 @@ namespace tensorflow {
 bool GpuOpFilter(KernelDef* kdef) {
   // TODO(b/31361304): The GPU backend does not parallelize PRNG ops, leading to
   // slow code.
-  // TODO(b/34969189) The implementation of TruncatedNormal generates illegal
-  // code on GPU.
   if (kdef->op() == "RandomStandardNormal" || kdef->op() == "RandomUniform" ||
       kdef->op() == "RandomUniformInt" || kdef->op() == "TruncatedNormal") {
     return false;
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index 9c3e15d2fa4c84af94d137f2e03107bcc980f4cd..f048662953e20b2a612271e2daeef6e370c4822a 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// This file defines helper routines for Tla JIT compilation.
+// This file defines helper routines for XLA compilation.
 
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/lib/util.h"
@@ -121,6 +121,8 @@ xla::ComputationDataHandle XlaHelpers::One(xla::ComputationBuilder* b,
 xla::ComputationDataHandle XlaHelpers::Epsilon(xla::ComputationBuilder* b,
                                                DataType data_type) {
   switch (data_type) {
+    case DT_BFLOAT16:
+      return b->ConstantR0<bfloat16>(bfloat16::epsilon());
     case DT_FLOAT:
       return b->ConstantR0<float>(std::numeric_limits<float>::epsilon());
     case DT_DOUBLE:
@@ -133,54 +135,9 @@ xla::ComputationDataHandle XlaHelpers::Epsilon(xla::ComputationBuilder* b,
 
 xla::ComputationDataHandle XlaHelpers::IntegerLiteral(
     xla::ComputationBuilder* b, DataType data_type, int64 value) {
-  xla::Literal literal;
   xla::PrimitiveType type;
   TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type));
-  switch (type) {
-    case xla::U8:
-      literal = *xla::Literal::CreateR0<uint8>(value);
-      break;
-    case xla::U32:
-      literal = *xla::Literal::CreateR0<uint32>(value);
-      break;
-    case xla::U64:
-      literal = *xla::Literal::CreateR0<uint64>(value);
-      break;
-    case xla::S8:
-      literal = *xla::Literal::CreateR0<int8>(value);
-      break;
-    case xla::S32:
-      literal = *xla::Literal::CreateR0<int32>(value);
-      break;
-    case xla::S64:
-      literal = *xla::Literal::CreateR0<int64>(value);
-      break;
-    case xla::F32:
-      literal = *xla::Literal::CreateR0<float>(value);
-      break;
-    case xla::F64:
-      literal = *xla::Literal::CreateR0<double>(value);
-      break;
-    case xla::C64:
-      literal = *xla::Literal::CreateR0<complex64>(value);
-      break;
-    case xla::PRED:
-      LOG(FATAL) << "pred element type is not integral";
-    case xla::S16:
-    case xla::U16:
-      LOG(FATAL) << "u16/s16 literals not yet implemented";
-    case xla::F16:
-      literal =
-          *xla::Literal::CreateR0<xla::half>(static_cast<xla::half>(value));
-      break;
-    case xla::TUPLE:
-      LOG(FATAL) << "tuple element type is not integral";
-    case xla::OPAQUE:
-      LOG(FATAL) << "opaque element type is not integral";
-    default:
-      LOG(FATAL) << "unhandled element type " << type;
-  }
-  return b->ConstantLiteral(literal);
+  return ::tensorflow::IntegerLiteral(b, type, value);
 }
 
 xla::ComputationDataHandle XlaHelpers::FloatLiteral(xla::ComputationBuilder* b,
@@ -207,8 +164,8 @@ xla::ComputationDataHandle XlaHelpers::FloatLiteral(xla::ComputationBuilder* b,
         "elements.");
   }
 
-  *output = input;
-  output->mutable_shape()->Swap(&shape);
+  *output = input.Clone();
+  output->mutable_shape_do_not_use()->Swap(&shape);
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
index 1dd454ea8d57e21526e5bcde0c8efc5514983b93..1fe6e69ff2dc838152032ac3d7b21de41684c6f6 100644
--- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
+++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
@@ -37,27 +37,14 @@ namespace {
 
 // Returns a vector of positional argument buffer sizes.
 xla::StatusOr<std::vector<intptr_t>> ComputeArgSizes(
-    const xla::ProgramShape& program_shape, bool requires_runtime_context) {
+    const xla::ProgramShape& program_shape) {
   std::vector<intptr_t> arg_sizes;
   const size_t num_args = program_shape.parameters_size();
   arg_sizes.reserve(num_args);
   for (int i = 0; i < num_args; ++i) {
     const xla::Shape& arg_shape = program_shape.parameters(i);
-    if (i == num_args - 1 && requires_runtime_context) {
-      // If the compiled function needs an XlaLocalRuntimeContext* arg, it's
-      // always last, and must be represented as an opaque type.
-      const xla::PrimitiveType type = arg_shape.element_type();
-      if (type != xla::OPAQUE) {
-        return errors::InvalidArgument(
-            "expected final context arg to be opaque, but got type: ",
-            xla::PrimitiveType_Name(type), ", from program shape: ",
-            xla::ShapeUtil::HumanString(program_shape));
-      }
-      arg_sizes.push_back(-1);
-    } else {
-      constexpr size_t kPointerSize = sizeof(void*);
-      arg_sizes.push_back(xla::ShapeUtil::ByteSizeOf(arg_shape, kPointerSize));
-    }
+    constexpr size_t kPointerSize = sizeof(void*);
+    arg_sizes.push_back(xla::ShapeUtil::ByteSizeOf(arg_shape, kPointerSize));
   }
   return std::move(arg_sizes);
 }
@@ -90,21 +77,6 @@ xla::StatusOr<size_t> ComputeResultIndex(
   return result_slice.index();
 }
 
-// Adapt ComputeFunctionType, which includes a final profile_counters arg, to
-// RawFunction, which doesn't include that final arg.
-//
-// TODO(toddw): Change RawFunction and AOT to also pass the final
-// profile_counters arg, and remove this adapter.
-XlaCompiledCpuFunction::RawFunction RawFunctionAdapter(
-    xla::cpu::CpuExecutable::ComputeFunctionType compute_function) {
-  return [compute_function](void* result,
-                            const xla::ExecutableRunOptions* run_options,
-                            const void** args, void** temps) {
-    return compute_function(result, run_options, args, temps,
-                            /*profile_counters=*/nullptr);
-  };
-}
-
 // Collect names from `entries`, where T is one of tf2xla::{Feed,Fetch}. We hold
 // the actual strings in nonempty_names, and hold arrays of pointers in
 // name_ptrs, terminated by a nullptr entry.
@@ -144,9 +116,8 @@ XlaJitCompiledCpuFunction::Compile(
   TF_ASSIGN_OR_RETURN(xla::LocalClient * client,
                       xla::ClientLibrary::GetOrCreateLocalClient());
   xla::Computation computation;
-  bool requires_runtime_context;
-  TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToXla(
-      graph_def, config, client, &computation, &requires_runtime_context));
+  TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToXla(graph_def, config, client,
+                                                      &computation));
 
   // Get and verify the program shape.
   TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::ProgramShape> program_shape,
@@ -177,14 +148,13 @@ XlaJitCompiledCpuFunction::Compile(
   const xla::cpu::CpuExecutable* cpu_executable =
       static_cast<xla::cpu::CpuExecutable*>(executable->executable());
   XlaCompiledCpuFunction::RawFunction raw_function =
-      RawFunctionAdapter(cpu_executable->compute_function());
+      cpu_executable->compute_function();
   const xla::BufferAssignment& buffer_assignment =
       cpu_executable->buffer_assignment();
 
   // Compute buffer sizes and the result index, needed to run the raw function.
-  TF_ASSIGN_OR_RETURN(
-      std::vector<intptr_t> arg_sizes,
-      ComputeArgSizes(*program_shape, requires_runtime_context));
+  TF_ASSIGN_OR_RETURN(std::vector<intptr_t> arg_sizes,
+                      ComputeArgSizes(*program_shape));
   TF_ASSIGN_OR_RETURN(std::vector<intptr_t> temp_sizes,
                       ComputeTempSizes(buffer_assignment));
   TF_ASSIGN_OR_RETURN(size_t result_index,
@@ -203,7 +173,6 @@ XlaJitCompiledCpuFunction::Compile(
   jit->static_data_.temp_sizes = jit->temp_sizes_.data();
   jit->static_data_.num_temps = jit->temp_sizes_.size();
   jit->static_data_.result_index = result_index;
-  jit->static_data_.requires_runtime_context = requires_runtime_context;
   // Optional metadata is collected and set below.
   CollectNames(config.feed(), &jit->nonempty_arg_names_, &jit->arg_names_);
   CollectNames(config.fetch(), &jit->nonempty_result_names_,
@@ -211,6 +180,14 @@ XlaJitCompiledCpuFunction::Compile(
   jit->static_data_.arg_names = jit->arg_names_.data();
   jit->static_data_.result_names = jit->result_names_.data();
   jit->static_data_.program_shape = jit->program_shape_.get();
+
+  if (cpu_executable->hlo_profiling_enabled()) {
+    jit->static_data_.hlo_profile_printer_data =
+        &cpu_executable->hlo_profile_printer_data();
+    jit->static_data_.profile_counters_size =
+        cpu_executable->hlo_profile_printer_data().profile_counters_size();
+  }
+
   return std::move(jit_unique_ptr);
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_local_runtime_context.h b/tensorflow/compiler/tf2xla/xla_local_runtime_context.h
deleted file mode 100644
index dca420d6ee3fec45f88ac3b450ab0cb4fb83d38a..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2xla/xla_local_runtime_context.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_LOCAL_RUNTIME_CONTEXT_H_
-#define TENSORFLOW_COMPILER_TF2XLA_XLA_LOCAL_RUNTIME_CONTEXT_H_
-
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/types.h"
-
-// Forward-declare the ThreadPoolDevice so that it can be ignored unless it's
-// actually used.  E.g. some ahead-of-time compiled computations don't need a
-// thread pool.
-namespace Eigen {
-struct ThreadPoolDevice;
-}
-
-namespace tensorflow {
-
-// An instance of this class is passed to each call from tensorflow into a
-// compiled XLA computation. See xla_launch_ops.cc.
-struct XlaLocalRuntimeContext {
- public:
-  XlaLocalRuntimeContext() {}
-
-  // Kernels implemented using custom call ops set this if they encounter an
-  // error. The error is checked after the entire XLA computation is
-  // complete.
-  //
-  // error+error_msg are used instead of Status to reduce the binary size
-  // overhead for ahead-of-time compiled binaries.
-  bool error = false;
-  string error_msg;
-
-  // Kernels that need a thread pool can get it from here.
-  const Eigen::ThreadPoolDevice* thread_pool = nullptr;
-
- private:
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaLocalRuntimeContext);
-};
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_TF2XLA_XLA_LOCAL_RUNTIME_CONTEXT_H_
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index 2b4cc9ba2d62b0e559e1456e6bfe6ab1e094e1df..ee29158646fa96fe554d089e11d50afb47e3e300 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -118,13 +118,36 @@ Status XlaOpKernelContext::ConstantInputReshaped(
   std::iota(layout_indices.rbegin(), layout_indices.rend(), 0);
   xla::Layout layout = xla::LayoutUtil::MakeLayout(layout_indices);
 
+  xla::StatusOr<bool> is_constant = builder()->IsConstant(handle);
+  if (!is_constant.ok()) {
+    Status status = is_constant.status();
+    errors::AppendToMessage(&status, "while evaluating input ", index, " of ",
+                            context_->op_kernel().type_string(),
+                            " operator as a compile-time constant.");
+    return status;
+  }
+
+  if (!is_constant.ValueOrDie()) {
+    return errors::InvalidArgument(
+        "Input ", index, " to ", context_->op_kernel().type_string(),
+        " operator must be a compile-time constant.\n"
+        "\n"
+        "XLA compilation requires that operator arguments that represent "
+        "shapes or dimensions be evaluated to concrete values at compile time. "
+        "This error means that a shape or dimension argument could not be "
+        "evaluated at compile time, usually because the value of the argument "
+        "depends on a parameter to the computation, on a variable, or on a "
+        "stateful operation such as a random number generator.");
+  }
+
   // Ask the XLA compiler to evaluate the data handle to a literal.
   xla::StatusOr<std::unique_ptr<xla::Literal>> computed =
       builder()->ComputeConstant(handle, &layout);
   if (!computed.ok()) {
-    return errors::InvalidArgument(
-        "Error evaluating ", context_->op_kernel().name(), " input ", index,
-        ": ", computed.status().error_message());
+    return errors::Internal("Error evaluating ", context_->op_kernel().name(),
+                            " input ", index,
+                            "as a compile-time constant.\nError: ",
+                            computed.status().error_message());
   }
   *constant_literal = std::move(*computed.ValueOrDie());
 
@@ -206,15 +229,15 @@ Status XlaOpKernelContext::ConstantInputAsInt64Literal(int index,
   xla::Literal literal;
   TF_RETURN_IF_ERROR(ConstantInput(index, &literal));
   switch (literal.shape().element_type()) {
-    case xla::S32:
-      out->Clear();
-      *out->mutable_shape() = literal.shape();
-      out->mutable_shape()->set_element_type(xla::S64);
-      for (int32 x : literal.s32s()) {
-        out->add_s64s(x);
+    case xla::S32: {
+      *out = xla::Literal(
+          xla::ShapeUtil::ChangeElementType(literal.shape(), xla::S64));
+      auto src_data = literal.data<int32>();
+      for (int64 i = 0; i < src_data.size(); ++i) {
+        out->data<int64>()[i] = src_data[i];
       }
       return Status::OK();
-
+    }
     case xla::S64:
       *out = std::move(literal);
       return Status::OK();
@@ -263,17 +286,26 @@ Status XlaOpKernelContext::ConstantInputList(
 }
 
 Status XlaOpKernelContext::ReadVariableInput(
-    int index, xla::ComputationDataHandle* value) {
+    int index, DataType type, TensorShape* shape,
+    xla::ComputationDataHandle* value) {
   const Tensor& tensor = context_->input(index);
   const XlaExpression* expression = CastExpressionFromTensor(tensor);
   XlaResource* variable = expression->resource();
   TF_RET_CHECK(variable != nullptr);
-  TF_RET_CHECK(variable->kind == XlaResource::kVariable);
-  if (variable->value.handle() == 0) {
+  TF_RET_CHECK(variable->kind() == XlaResource::kVariable);
+  if (!variable->initialized()) {
     return errors::InvalidArgument("Read of uninitialized variable ",
-                                   variable->name);
+                                   variable->name());
+  }
+  if (variable->type() != type) {
+    return errors::InvalidArgument(
+        "Type mismatch for read of variable ", variable->name(), ". Expected ",
+        DataTypeString(type), "; got ", DataTypeString(variable->type()));
+  }
+  *value = variable->value();
+  if (shape) {
+    *shape = variable->shape();
   }
-  *value = variable->value;
   return Status::OK();
 }
 
@@ -283,18 +315,13 @@ Status XlaOpKernelContext::GetVariableTypeAndShape(int index, DataType* type,
   const XlaExpression* expression = CastExpressionFromTensor(tensor);
   XlaResource* variable = expression->resource();
   TF_RET_CHECK(variable != nullptr);
-  TF_RET_CHECK(variable->kind == XlaResource::kVariable);
-  if (variable->value.handle() == 0) {
+  TF_RET_CHECK(variable->kind() == XlaResource::kVariable);
+  if (!variable->initialized()) {
     return errors::InvalidArgument("Read of uninitialized variable ",
-                                   variable->name);
-  }
-  *type = variable->type;
-  auto shape_or_status = builder()->GetShape(variable->value);
-  if (!shape_or_status.ok()) {
-    return shape_or_status.status();
+                                   variable->name());
   }
-  TF_RETURN_IF_ERROR(
-      XLAShapeToTensorShape(*shape_or_status.ValueOrDie(), shape));
+  *type = variable->type();
+  *shape = variable->shape();
   return Status::OK();
 }
 
@@ -381,26 +408,38 @@ Status XlaOpKernelContext::AssignVariable(
       CastExpressionFromTensor(context_->input(input_index));
   XlaResource* variable = expression->resource();
   TF_RET_CHECK(variable != nullptr);
-  TF_RET_CHECK(variable->kind == XlaResource::kVariable);
-  if (!((variable->type == DT_INVALID && type != DT_INVALID) ||
-        (variable->type == type))) {
-    return errors::InvalidArgument(
-        "Types of variables cannot change after initialization: old type was ",
-        DataTypeString(variable->type), ", new type is ", DataTypeString(type));
+  TF_RET_CHECK(variable->kind() == XlaResource::kVariable);
+
+  auto shape_or_status = builder()->GetShape(handle);
+  if (!shape_or_status.ok()) {
+    return shape_or_status.status();
   }
-  variable->type = type;
-  variable->value = handle;
-  return Status::OK();
+  TensorShape shape;
+  TF_RETURN_IF_ERROR(
+      XLAShapeToTensorShape(*shape_or_status.ValueOrDie(), &shape));
+
+  TF_RETURN_IF_ERROR(variable->SetTypeAndShape(type, shape));
+  return variable->SetValue(handle);
 }
 
 XlaCompiler* XlaOpKernelContext::compiler() const {
   return XlaContext::Get(context_).compiler();
 }
 
-void XlaOpKernelContext::CtxFailure(Status s) { context_->CtxFailure(s); }
-void XlaOpKernelContext::CtxFailureWithWarning(Status s) {
+void XlaOpKernelContext::CtxFailure(const Status& s) {
+  context_->CtxFailure(s);
+}
+void XlaOpKernelContext::CtxFailureWithWarning(const Status& s) {
   context_->CtxFailureWithWarning(s);
 }
+void XlaOpKernelContext::CtxFailure(const char* file, int line,
+                                    const Status& s) {
+  context_->CtxFailure(file, line, s);
+}
+void XlaOpKernelContext::CtxFailureWithWarning(const char* file, int line,
+                                               const Status& s) {
+  context_->CtxFailureWithWarning(file, line, s);
+}
 
 const xla::Computation* XlaOpKernelContext::GetOrCreateMax(
     const DataType type) {
@@ -417,6 +456,11 @@ const xla::Computation* XlaOpKernelContext::GetOrCreateAdd(
   return XlaContext::Get(context_).GetOrCreateAdd(type);
 }
 
+const xla::Computation* XlaOpKernelContext::GetOrCreateMul(
+    const DataType type) {
+  return XlaContext::Get(context_).GetOrCreateMul(type);
+}
+
 XlaOpKernel::XlaOpKernel(OpKernelConstruction* context) : OpKernel(context) {}
 
 void XlaOpKernel::Compute(OpKernelContext* context) {
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h
index 76bcf594e6a0601763844847583c18ee26d8adf3..e1fd0f55c6d2501b4813c90171630a8df567f78a 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.h
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h
@@ -164,21 +164,28 @@ class XlaOpKernelContext {
                                  TensorShape* shape) const;
 
   // Reads the current value of the resouce variable referred to by input
-  // 'index'.
-  Status ReadVariableInput(int index, xla::ComputationDataHandle* value);
+  // 'index'. If `shape` is not nullptr, sets `*shape` to the shape of the
+  // variable. Returns an error if the variable has not been initialized, or if
+  // its type does not match `type`.
+  Status ReadVariableInput(int index, DataType type, TensorShape* shape,
+                           xla::ComputationDataHandle* value);
 
   // Assigns the value `handle` to the variable referenced by input
-  // `input_index`. Marks the operator as having side effects.
+  // `input_index`. The variable must be of `type`. Returns an error if the
+  // variable has been initialized with a different type or with a
+  // different shape.
   Status AssignVariable(int input_index, DataType type,
                         const xla::ComputationDataHandle& handle);
 
   // Helper routines for the OP_REQUIRES macros
-  void CtxFailure(Status s);
-  void CtxFailureWithWarning(Status s);
+  void CtxFailure(const Status& s);
+  void CtxFailureWithWarning(const Status& s);
+  void CtxFailure(const char* file, int line, const Status& s);
+  void CtxFailureWithWarning(const char* file, int line, const Status& s);
 
   // If this kernel invocation is within a function execution,
   // call_frame() returns the call frame for the function call.
-  FunctionCallFrame* call_frame() const { return context_->call_frame(); }
+  CallFrameInterface* call_frame() const { return context_->call_frame(); }
 
   FunctionLibraryRuntime* function_library() const {
     return context_->function_library();
@@ -210,6 +217,11 @@ class XlaOpKernelContext {
   // separate specialization of the computation for each DataType.
   const xla::Computation* GetOrCreateAdd(const DataType type);
 
+  // Gets an XLA lambda to compute Mul. This is cached in the
+  // XlaContext since it may be used by multiple Ops. There is a
+  // separate specialization of the computation for each DataType.
+  const xla::Computation* GetOrCreateMul(const DataType type);
+
  private:
   OpKernelContext* const context_;
 };
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc
index 02318cf7fa1d4edc12507f6b4d66a8e897cbe100..0dde6a986c61bdd5b0b2e6d7a16b29ab95be98ab 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/kernel_def.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
@@ -82,6 +83,11 @@ XlaOpRegistry::~XlaOpRegistry() = default;
       return false;
     }
   }
+  if (x.compile_time_constant_inputs != y.compile_time_constant_inputs) {
+    LOG(WARNING) << "Registrations of " << x.name
+                 << " have incompatible compile time constant inputs.";
+    return false;
+  }
   return true;
 }
 
@@ -155,7 +161,14 @@ void XlaOpRegistry::RegisterCompilationKernels() {
     const string& op_name = op.first;
     const std::unique_ptr<OpRegistration>& op_registration = op.second;
     const OpDef* op_def;
-    TF_CHECK_OK(op_registry->LookUpOpDef(op_name, &op_def));
+    Status lookup_status = op_registry->LookUpOpDef(op_name, &op_def);
+    if (!lookup_status.ok()) {
+      LOG(ERROR) << lookup_status.error_message();
+      XLA_LOG_LINES(
+          ERROR, "Ops registered: \n" +
+                     dynamic_cast<OpRegistry*>(op_registry)->DebugString(true));
+    }
+    TF_CHECK_OK(lookup_status);
 
     std::unordered_set<string> type_attrs;
     for (const OpDef::AttrDef& attr_def : op_def->attr()) {
@@ -187,22 +200,39 @@ void XlaOpRegistry::RegisterCompilationKernels() {
 
       // Constrain each type attribute to the intersection of:
       // a) the types supported by the backend, and
-      // b) the attribute's type constraints.
-      // TODO(phawkins): it may be necessary to also take the intersection with
-      // the set of types supported by the OpDef.
+      // b) the types allowed by the OpDef, and
+      // c) the type constraints.
       for (const string& type_attr : type_attrs) {
         KernelDef::AttrConstraint* attr_constraint = kdef->add_constraint();
         attr_constraint->set_name(type_attr);
         auto* allowed_values =
             attr_constraint->mutable_allowed_values()->mutable_list();
 
-        auto it = op_registration->type_constraints.find(type_attr);
+        const OpDef::AttrDef& op_def_attr = *FindAttr(type_attr, *op_def);
+        const auto* op_def_allowed_types =
+            op_def_attr.has_allowed_values()
+                ? &op_def_attr.allowed_values().list().type()
+                : nullptr;
+        auto constraint_it = op_registration->type_constraints.find(type_attr);
+        const std::set<DataType>* type_constraints =
+            constraint_it != op_registration->type_constraints.end()
+                ? &constraint_it->second
+                : nullptr;
         for (DataType dtype : backend.second.supported_types) {
-          if (it == op_registration->type_constraints.end() ||
-              (it != op_registration->type_constraints.end() &&
-               it->second.find(dtype) != it->second.end())) {
-            allowed_values->add_type(dtype);
+          // Filter out types that aren't allowed by the OpDef.
+          if (op_def_allowed_types != nullptr &&
+              std::find(op_def_allowed_types->begin(),
+                        op_def_allowed_types->end(),
+                        dtype) == op_def_allowed_types->end()) {
+            continue;
+          }
+          // Filter out types based on the type constraints.
+          if (type_constraints != nullptr &&
+              type_constraints->find(dtype) == type_constraints->end()) {
+            continue;
           }
+          // Passed all the filters, this type is allowed.
+          allowed_values->add_type(dtype);
         }
         if (op_registration->allow_resource_types) {
           allowed_values->add_type(DT_RESOURCE);
@@ -245,6 +275,33 @@ std::vector<const KernelDef*> XlaOpRegistry::DeviceKernels(
   return kernels;
 }
 
+/* static */ const std::unordered_set<string>*
+XlaOpRegistry::CompileTimeConstantInputs(const string& op) {
+  XlaOpRegistry& registry = Instance();
+  mutex_lock lock(registry.mutex_);
+  auto it = registry.ops_.find(op);
+  if (it == registry.ops_.end()) {
+    return nullptr;
+  }
+  return &it->second->compile_time_constant_inputs;
+}
+
+std::vector<string> XlaOpRegistry::BackendNames() {
+  std::vector<string> names;
+  XlaOpRegistry& registry = Instance();
+  mutex_lock lock(registry.mutex_);
+  for (const auto& backend_pair : registry.backends_) {
+    names.push_back(backend_pair.first);
+  }
+  return names;
+}
+
+bool XlaOpRegistry::IsBackendRegistered(const string& name) {
+  XlaOpRegistry& registry = Instance();
+  mutex_lock lock(registry.mutex_);
+  return registry.backends_.find(name) != registry.backends_.end();
+}
+
 XlaOpRegistry& XlaOpRegistry::Instance() {
   static XlaOpRegistry* r = new XlaOpRegistry;
   return *r;
@@ -303,6 +360,12 @@ XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::TypeConstraint(
   return *this;
 }
 
+XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::CompileTimeConstInput(
+    StringPiece input_name) {
+  registration_->compile_time_constant_inputs.insert(input_name.ToString());
+  return *this;
+}
+
 std::unique_ptr<XlaOpRegistry::OpRegistration> XlaOpRegistrationBuilder::Build(
     XlaOpRegistry::Factory factory) {
   registration_->factory = factory;
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h
index 6aee8c91cc01b4382ef867fa8e438eede008ac73..ff7453194af3a85bded86a5ce298f8779422dccb 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.h
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.h
@@ -45,11 +45,11 @@ extern const char* const DEVICE_GPU_XLA_JIT;  // "GPU_XLA_JIT"
 extern const char* const DEVICE_XLA_CPU;
 extern const char* const DEVICE_XLA_GPU;
 
-constexpr std::array<DataType, 3> kFloatTypes = {
-    {DT_HALF, DT_FLOAT, DT_DOUBLE}};
-constexpr std::array<DataType, 8> kNumericTypes = {
+constexpr std::array<DataType, 4> kFloatTypes = {
+    {DT_HALF, DT_FLOAT, DT_DOUBLE, DT_BFLOAT16}};
+constexpr std::array<DataType, 9> kNumericTypes = {
     {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE,
-     DT_COMPLEX64}};
+     DT_COMPLEX64, DT_BFLOAT16}};
 
 constexpr std::array<DataType, 8> kCpuAllTypes = {
     {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE,
@@ -97,6 +97,12 @@ class XlaOpRegistry {
                               gtl::ArraySlice<DataType> supported_types,
                               BackendOpFilter op_filter);
 
+  // Returns the names of the registered backends.
+  static std::vector<string> BackendNames();
+
+  // Returns true iff a backend with the given name is registered.
+  static bool IsBackendRegistered(const string& name);
+
   // Registers `device_name` for XLA compilation, using information from
   // `registration`.
   static void RegisterCompilationDevice(const string& device_name,
@@ -116,12 +122,17 @@ class XlaOpRegistry {
   static void RegisterCompilationKernels();
 
   // Returns KernelDefs for compilation ops registered on
-  // 'compilation_device_name'.
-  // Does not include kernels registered as CompilationOnly.
+  // 'compilation_device_name'.  Does not include kernels registered as
+  // CompilationOnly, iff include_compilation_only_kernels=false.
   static std::vector<const KernelDef*> DeviceKernels(
       const string& compilation_device_name,
       bool include_compilation_only_kernels);
 
+  // Returns the set of compile-time constant inputs to 'op'. Returns nullptr
+  // if the op is not registered.
+  static const std::unordered_set<string>* CompileTimeConstantInputs(
+      const string& op);
+
  private:
   friend class XlaBackendRegistrar;
   friend class XlaOpRegistrar;
@@ -175,6 +186,9 @@ class XlaOpRegistry {
     bool has_device_whitelist = false;
     std::unordered_set<string> device_whitelist;
 
+    // Names of arguments that must be compile-time constants.
+    std::unordered_set<string> compile_time_constant_inputs;
+
     // Factory used to build OpKernels that perform symbolic execution.
     Factory factory;
   };
@@ -236,6 +250,9 @@ class XlaOpRegistrationBuilder {
   // Allow DT_RESOURCE types for type parameters.
   XlaOpRegistrationBuilder& AllowResourceTypes();
 
+  // Mark 'input_name' as an argument whose value must be known at compile-time.
+  XlaOpRegistrationBuilder& CompileTimeConstInput(StringPiece input_name);
+
   std::unique_ptr<XlaOpRegistry::OpRegistration> Build(
       XlaOpRegistry::Factory factory);
 
diff --git a/tensorflow/compiler/tf2xla/xla_resource.cc b/tensorflow/compiler/tf2xla/xla_resource.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c2075b44b82ba279d1246ec6bfcf305d12c418a6
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/xla_resource.cc
@@ -0,0 +1,194 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_resource.h"
+
+#include <functional>
+#include <memory>
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/sharding_util.h"
+#include "tensorflow/compiler/tf2xla/xla_context.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+
+namespace tensorflow {
+
+XlaResource::XlaResource(Kind kind, int arg_num, string name, DataType type,
+                         TensorShape shape,
+                         const xla::ComputationDataHandle& initial_value,
+                         int64 tensor_array_size,
+                         const std::set<string>& tensor_array_gradients)
+    : kind_(kind),
+      arg_num_(arg_num),
+      name_(std::move(name)),
+      type_(type),
+      shape_(std::move(shape)),
+      value_(initial_value),
+      initial_value_(initial_value),
+      tensor_array_size_(tensor_array_size) {
+  CHECK(kind_ != kInvalid);
+
+  for (const string& gradient : tensor_array_gradients) {
+    tensor_array_gradients_[gradient].reset(
+        new XlaResource(/*kind=*/kTensorArray, /*arg_num=*/-1,
+                        /*name=*/strings::StrCat("TensorArrayGrad: ", name_),
+                        type_, shape_, xla::ComputationDataHandle(),
+                        tensor_array_size_, /*tensor_array_gradients=*/{}));
+  }
+}
+
+Status XlaResource::SetTypeAndShape(DataType type, const TensorShape& shape) {
+  if (type == DT_INVALID) {
+    return errors::InvalidArgument("Attempted to set type of resource '", name_,
+                                   "'' to an invalid type");
+  }
+  if (initialized() && type_ != type) {
+    return errors::InvalidArgument("Type of resource ", name_,
+                                   " cannot be changed after initialization: "
+                                   "old type was ",
+                                   DataTypeString(type_), ", new type is ",
+                                   DataTypeString(type));
+  }
+  if (initialized() && shape_ != shape) {
+    return errors::InvalidArgument("Shape of resource ", name_,
+                                   " cannot be changed after initialization: "
+                                   "old shape was ",
+                                   shape_.DebugString(), ", new shape is ",
+                                   shape.DebugString());
+  }
+  type_ = type;
+  shape_ = shape;
+  return Status::OK();
+}
+
+Status XlaResource::SetValue(const xla::ComputationDataHandle& value) {
+  if (type_ == DT_INVALID) {
+    return errors::InvalidArgument(
+        "Resource '", name_,
+        "' must be initialized with a valid type before use.");
+  }
+  value_ = value;
+  return Status::OK();
+}
+
+Status XlaResource::SetZeroValue(xla::ComputationBuilder* builder) {
+  if (type_ == DT_INVALID) {
+    return errors::InvalidArgument(
+        "Resource '", name_,
+        "' must be initialized with a valid type before use.");
+  }
+  switch (kind_) {
+    case kVariable: {
+      value_ = builder->Broadcast(XlaHelpers::Zero(builder, type_),
+                                  shape_.dim_sizes());
+      break;
+    }
+    case kTensorArray: {
+      TensorShape ta_shape;
+      ta_shape.AddDim(tensor_array_size_);
+      ta_shape.AppendShape(shape_);
+      value_ = builder->Broadcast(XlaHelpers::Zero(builder, type_),
+                                  ta_shape.dim_sizes());
+      break;
+    }
+    case kStack: {
+      TensorShape ta_shape;
+      ta_shape.AddDim(tensor_array_size_);
+      ta_shape.AppendShape(shape_);
+      value_ =
+          builder->Tuple({builder->Broadcast(XlaHelpers::Zero(builder, type_),
+                                             ta_shape.dim_sizes()),
+                          builder->ConstantR0<int32>(0)});
+      break;
+    }
+
+    case kInvalid:
+    default:
+      LOG(FATAL) << "Invalid resource type";
+  }
+  return Status::OK();
+}
+
+Status XlaResource::GetOrCreateTensorArrayGradient(
+    const string& source, xla::ComputationBuilder* builder,
+    XlaResource** gradient_out) {
+  VLOG(2) << "Gradient lookup for resource: " << name_
+          << " gradient: " << source;
+  TF_RET_CHECK(kind_ == kTensorArray);
+  std::unique_ptr<XlaResource>& gradient = tensor_array_gradients_[source];
+  if (!gradient) {
+    TensorShape ta_shape;
+    ta_shape.AddDim(tensor_array_size_);
+    ta_shape.AppendShape(shape_);
+    xla::ComputationDataHandle gradient_value = builder->Broadcast(
+        XlaHelpers::Zero(builder, type_), ta_shape.dim_sizes());
+    gradient.reset(
+        new XlaResource(/*kind=*/kTensorArray, /*arg_num=*/-1,
+                        /*name=*/strings::StrCat("TensorArrayGrad: ", name_),
+                        type_, shape_, gradient_value, tensor_array_size_,
+                        /*tensor_array_gradients=*/{}));
+  }
+  *gradient_out = gradient.get();
+  return Status::OK();
+}
+
+Status XlaResource::Pack(xla::ComputationDataHandle* pack,
+                         xla::ComputationBuilder* builder) const {
+  if (tensor_array_gradients_.empty()) {
+    *pack = value_;
+  } else {
+    TF_RET_CHECK(kind_ == kTensorArray);
+    std::vector<xla::ComputationDataHandle> elems;
+    elems.push_back(value_);
+    for (const auto& gradient : tensor_array_gradients_) {
+      elems.push_back(gradient.second->value_);
+    }
+    *pack = builder->Tuple(elems);
+  }
+  return Status::OK();
+}
+
+Status XlaResource::SetFromPack(const std::set<string>& gradient_sources,
+                                const xla::ComputationDataHandle& pack,
+                                xla::ComputationBuilder* builder) {
+  if (gradient_sources.empty()) {
+    if (!initialized()) {
+      initial_value_ = pack;
+    }
+    value_ = pack;
+  } else {
+    TF_RET_CHECK(kind_ == kTensorArray);
+    int pos = 0;
+    auto v = builder->GetTupleElement(pack, pos++);
+    if (!initialized()) {
+      initial_value_ = v;
+    }
+    value_ = v;
+
+    for (const auto& source : gradient_sources) {
+      XlaResource* gradient;
+      TF_RETURN_IF_ERROR(
+          GetOrCreateTensorArrayGradient(source, builder, &gradient));
+      auto v = builder->GetTupleElement(pack, pos++);
+      if (!gradient->initialized()) {
+        gradient->initial_value_ = v;
+      }
+      gradient->value_ = v;
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_resource.h b/tensorflow/compiler/tf2xla/xla_resource.h
new file mode 100644
index 0000000000000000000000000000000000000000..1bb2c7274ecdf0954768fd96def51194e52deee8
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/xla_resource.h
@@ -0,0 +1,157 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_RESOURCE_H_
+#define TENSORFLOW_COMPILER_TF2XLA_XLA_RESOURCE_H_
+
+#include <memory>
+
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// Represents a resource, such as a Variable or TensorArray.
+class XlaResource {
+ public:
+  enum Kind {
+    kInvalid,
+    kVariable,
+    kTensorArray,
+    kStack,
+  };
+
+  XlaResource(Kind kind, int arg_num, string name, DataType type,
+              TensorShape shape,
+              const xla::ComputationDataHandle& initial_value,
+              int64 tensor_array_size,
+              const std::set<string>& tensor_array_gradients);
+
+  XlaResource(const XlaResource&) = delete;
+  XlaResource(XlaResource&&) = delete;
+  XlaResource& operator=(const XlaResource&) = delete;
+  XlaResource& operator=(XlaResource&&) = delete;
+
+  Kind kind() const { return kind_; }
+
+  // If this resource is visible externally to the computation, what was its
+  // argument number?
+  // < 0 means "not visible externally".
+  int arg_num() const { return arg_num_; }
+
+  // A descriptive name for the resource, used in error messages.
+  const string& name() const { return name_; }
+
+  // Current type and value of the resource. Uninitialized resources are
+  // represented by a default (zero) handle and type DT_INVALID.
+  // While the type of a resource is notionally fixed during execution, when
+  // a resource is first initialized we do not yet know its type, so we keep
+  // track of its type dynamically.
+  DataType type() const { return type_; }
+
+  // Shape of the resource. For an uninitialized resource, this is ignored.
+  // For a Variable, this is the shape of the value. For a TensorArray or Stack
+  // this is the shape of each entry in the TensorArray/Stack.
+  const TensorShape& shape() const { return shape_; }
+
+  const xla::ComputationDataHandle& value() const { return value_; }
+
+  // Value of the resource at computation entry. Used to detect which
+  // variables have new values that need to be written back.
+  const xla::ComputationDataHandle& initial_value() const {
+    return initial_value_;
+  }
+
+  // A variable is initialized if it has a value.
+  bool initialized() const { return value_.handle() > 0; }
+
+  // Sets the type and shape of the resource. The type and shape of a resource
+  // must not change once the variable has been initialized.
+  Status SetTypeAndShape(DataType type, const TensorShape& shape);
+
+  // Sets the current value of the resource. Returns an error if the type is not
+  // set to a valid value.
+  Status SetValue(const xla::ComputationDataHandle& value);
+
+  // Sets the current value of the resource to an all-zero value.
+  Status SetZeroValue(xla::ComputationBuilder* builder);
+
+  // Looks up the gradient for `source`, or creates it if it does not already
+  // exist. The call target must be an initialized TensorArray resource. A
+  // TensorArray can have multiple named gradients; see the operator
+  // documentation for TensorArrayGradV3 for details.
+  Status GetOrCreateTensorArrayGradient(const string& source,
+                                        xla::ComputationBuilder* builder,
+                                        XlaResource** gradient_out);
+
+  // Packs a resource into a single XLA value `pack`, suitable for use as
+  // an XlaCompiler::Argument. For non-TensorArrays or TensorArrays without
+  // gradients, sets `*pack` to `value`.
+  // For TensorArrays with gradients, packs the value and its gradient values in
+  // a tuple; the gradients values are packed in order by source name.
+  Status Pack(xla::ComputationDataHandle* pack,
+              xla::ComputationBuilder* builder) const;
+
+  // Updates the resource with values from `pack`. If `gradient_sources` is
+  // non-empty, treats `pack` as a tuple that represents a TensorArray and
+  // its gradients, and unpacks and updates the gradient resources.
+  // If `reset_initial_values` is true, sets the initial_values as well as the
+  // values.
+  // Opposite of Pack().
+  Status SetFromPack(const std::set<string>& gradient_sources,
+                     const xla::ComputationDataHandle& pack,
+                     xla::ComputationBuilder* builder);
+
+  // TensorArray and Stack specific fields
+
+  // 'tensor_array_size' stores the expected size of the TensorArray or Stack.
+  // We need to store this since sometimes TensorArrays must be initialized
+  // lazily since we do not know the element shape at construction time.
+  // Used by both TensorArrays and Stacks.
+  int64 tensor_array_size() const { return tensor_array_size_; }
+  void set_tensor_array_size(int64 size) { tensor_array_size_ = size; }
+
+  // 'tensor_array_gradient' is a map from TensorArrayGradV3 'source' attributes
+  // to an XlaResource containing the gradient TensorArrays. We store a pointer
+  // here since there should only be one gradient TensorArray per 'source'
+  // string, irrespective of the number of calls to TensorArrayGrad. The map
+  // is ordered since values are packed into tuples by Pack() sorted by name
+  // order.
+  const std::map<string, std::unique_ptr<XlaResource>>& tensor_array_gradients()
+      const {
+    return tensor_array_gradients_;
+  }
+
+ private:
+  const Kind kind_;
+  const int arg_num_;
+  const string name_;
+
+  DataType type_;
+  TensorShape shape_;
+  xla::ComputationDataHandle value_;
+  xla::ComputationDataHandle initial_value_;
+
+  int64 tensor_array_size_ = -1;
+
+  std::map<string, std::unique_ptr<XlaResource>> tensor_array_gradients_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_XLA_RESOURCE_H_
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index d3f292207fee396fb4248dede5c0eeb5cd2b87c9..34e733bc8d80b364cec1783006eba0a5468b55ea 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -20,6 +20,10 @@ package_group(
 load("//tensorflow:tensorflow.bzl", "cc_header_only_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_proto_library_py",
+)
 
 # Filegroup used to collect source files for dependency checking.
 filegroup(
@@ -36,6 +40,12 @@ xla_proto_library(
     visibility = ["//visibility:public"],
 )
 
+tf_proto_library_py(
+    name = "xla_data_proto",  # bzl adds a _py suffix
+    srcs = ["xla_data.proto"],
+    visibility = ["//visibility:public"],
+)
+
 xla_proto_library(
     name = "xla_proto",
     srcs = ["xla.proto"],
@@ -78,7 +88,6 @@ cc_library(
     visibility = [":friends"],
     deps = [
         "//tensorflow/core:framework_lite",
-        "//tensorflow/core:lib",
         "//third_party/eigen3",
     ],
 )
@@ -172,6 +181,7 @@ cc_library(
     deps = [
         ":status",
         ":status_macros",
+        ":statusor",
         ":types",
         ":xla_data_proto",
         "//tensorflow/core:lib",
@@ -250,6 +260,7 @@ tf_cc_test(
     srcs = ["shape_util_test.cc"],
     deps = [
         ":shape_util",
+        ":status_macros",
         ":test",
         ":test_helpers",
         ":types",
@@ -290,7 +301,9 @@ cc_library(
         ":array2d",
         ":array3d",
         ":array4d",
+        ":shape_tree",
         ":shape_util",
+        ":sparse_index_array",
         ":status_macros",
         ":types",
         ":util",
@@ -617,6 +630,28 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "sparse_index_array",
+    srcs = ["sparse_index_array.cc"],
+    hdrs = ["sparse_index_array.h"],
+    deps = [
+        ":array2d",
+        ":shape_util",
+        ":xla_data_proto",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "sparse_index_array_test",
+    srcs = ["sparse_index_array_test.cc"],
+    deps = [
+        ":sparse_index_array",
+        ":test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 
 filegroup(
diff --git a/tensorflow/compiler/xla/array.h b/tensorflow/compiler/xla/array.h
index 213e0bac6c77e9972de8d4dd7dfc8c7cf3a1b865..71aa057cd3a1c273c0e851497a78f94ba37c778e 100644
--- a/tensorflow/compiler/xla/array.h
+++ b/tensorflow/compiler/xla/array.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <initializer_list>
 #include <iterator>
 #include <memory>
+#include <numeric>
 #include <random>
 #include <type_traits>
 #include <vector>
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index f953407a567b91fdf6ae727d6982a2a778c5873e..02356699a25e47be50eb15872df4c9c302fc289b 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -80,6 +80,18 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "executable_build_options",
+    srcs = ["executable_build_options.cc"],
+    hdrs = ["executable_build_options.h"],
+    deps = [
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:device_memory_allocator",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "local_client",
     srcs = ["local_client.cc"],
@@ -87,6 +99,7 @@ cc_library(
     deps = [
         ":client",
         ":computation",
+        ":executable_build_options",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
@@ -98,6 +111,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:shaped_buffer",
+        "//tensorflow/compiler/xla/service:source_map_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "@llvm//:support",
@@ -186,6 +200,20 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "sharding_builder",
+    srcs = ["sharding_builder.cc"],
+    hdrs = ["sharding_builder.h"],
+    deps = [
+        "//tensorflow/compiler/xla:array",
+        "//tensorflow/compiler/xla:shape_tree",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 
 filegroup(
diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc
index 66937d64aff18817bbd5310e0c24e19556e9d727..d15ccb0c28522c647617153aaa8e738d029dfaba 100644
--- a/tensorflow/compiler/xla/client/client.cc
+++ b/tensorflow/compiler/xla/client/client.cc
@@ -60,7 +60,7 @@ StatusOr<std::unique_ptr<Literal>> Client::Transfer(
         "server provided response without a literal in "
         "TransferToClient request");
   }
-  return MakeUnique<Literal>(response.literal());
+  return Literal::CreateFromProto(*response.mutable_literal());
 }
 
 StatusOr<std::unique_ptr<GlobalData>> Client::TransferToServer(
@@ -142,7 +142,7 @@ StatusOr<std::unique_ptr<Literal>> Client::TransferFromOutfeed(
         "TransferToClient request");
   }
 
-  return MakeUnique<Literal>(response.literal());
+  return Literal::CreateFromProto(response.literal());
 }
 
 Status Client::ResetDevice() {
diff --git a/tensorflow/compiler/xla/client/computation_builder.cc b/tensorflow/compiler/xla/client/computation_builder.cc
index cce931000331e98b00f57025cb13a5d3982c2845..b1dcad6a49a270935b07e26de2d3945b912359d1 100644
--- a/tensorflow/compiler/xla/client/computation_builder.cc
+++ b/tensorflow/compiler/xla/client/computation_builder.cc
@@ -34,25 +34,9 @@ limitations under the License.
 
 namespace xla {
 
-ComputationDataHandle ComputationBuilder::ParseOpResponse(
-    const Status& status, OpResponse* response) {
-  VLOG(2) << "done with op request";
-
-  if (!status.ok()) {
-    NoteError(status);
-    return ComputationDataHandle();
-  }
-
-  if (response->output().handle() == 0) {
-    NoteError(InternalError("No output handle"));
-    return ComputationDataHandle();
-  }
-  return response->output();
-}
-
 ComputationBuilder::ComputationBuilder(Client* client,
                                        const string& computation_name)
-    : name_(computation_name), first_error_(Status::OK()), client_(client) {}
+    : name_(computation_name), client_(client) {}
 
 ComputationBuilder::~ComputationBuilder() {}
 
@@ -76,9 +60,8 @@ std::unique_ptr<ComputationBuilder> ComputationBuilder::CreateSubBuilder(
 }
 
 Status ComputationBuilder::PrepareComputation() {
-  if (!first_error_.ok()) {
-    return first_error_;
-  }
+  TF_RETURN_IF_ERROR(first_error_);
+
   if (!computation_.IsNull()) {
     return Status::OK();
   }
@@ -100,6 +83,49 @@ Status ComputationBuilder::PrepareComputation() {
   return Status::OK();
 }
 
+Status ComputationBuilder::RunOp(OpRequest* op_request,
+                                 OpResponse* op_response) {
+  TF_RETURN_IF_ERROR(first_error_);
+  TF_RETURN_IF_ERROR(PrepareComputation());
+
+  // Fill in fields that are set on every OpRequest.
+  *op_request->mutable_computation() = computation_.handle();
+  *op_request->mutable_metadata() = metadata_;
+  if (sharding_) {
+    *op_request->mutable_sharding() = *sharding_;
+  }
+
+  const string& op_name =
+      OpRequest::descriptor()->FindFieldByNumber(op_request->op_case())->name();
+  VLOG(2) << "running op request: " << op_name;
+  Status status = client_->stub()->Op(op_request, op_response);
+  VLOG(2) << "done with op request: " << op_name;
+  return status;
+}
+
+void ComputationBuilder::RunOpAndNoteError(OpRequest* op_request) {
+  OpResponse op_response;
+  Status status = RunOp(op_request, &op_response);
+  if (!status.ok()) {
+    NoteError(status);
+  }
+}
+
+ComputationDataHandle ComputationBuilder::RunOpAndParseResponse(
+    OpRequest* op_request) {
+  OpResponse op_response;
+  Status status = RunOp(op_request, &op_response);
+  if (!status.ok()) {
+    NoteError(status);
+    return ComputationDataHandle();
+  }
+  if (op_response.output().handle() == 0) {
+    NoteError(InternalError("No output handle"));
+    return ComputationDataHandle();
+  }
+  return op_response.output();
+}
+
 bool ComputationBuilder::MakeWindow(
     tensorflow::gtl::ArraySlice<int64> window_dimensions,
     tensorflow::gtl::ArraySlice<int64> window_strides,
@@ -158,81 +184,75 @@ bool ComputationBuilder::MakeWindow(
   return true;
 }
 
-ComputationDataHandle ComputationBuilder::ConstantOp(
-    const PopulateLiteral& populate) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  ConstantRequest request;
-  Literal literal;
-  populate(&literal);
-  *request.mutable_literal() = literal.ToProto();
-  VLOG(3) << "created constant: " << request.literal().ShortDebugString();
-  OpRequest op_request;
-  *op_request.mutable_constant_request() = request;
-  *op_request.mutable_computation() = computation_.handle();
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making constant request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
-}
-
 ComputationDataHandle ComputationBuilder::ConstantLiteral(
     const Literal& literal) {
-  return ConstantOp(
-      [literal](Literal* mutable_literal) { *mutable_literal = literal; });
+  OpRequest op_request;
+  ConstantRequest* request = op_request.mutable_constant_request();
+  *request->mutable_literal() = literal.ToProto();
+  VLOG(3) << "created constant: " << request->literal().ShortDebugString();
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::Parameter(int64 parameter_number,
                                                     const Shape& shape,
                                                     const string& name) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  ParameterRequest request;
-  *request.mutable_shape() = shape;
-  request.set_parameter(parameter_number);
-  request.set_name(name);
   OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_parameter_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making parameter request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
+  ParameterRequest* request = op_request.mutable_parameter_request();
+  *request->mutable_shape() = shape;
+  request->set_parameter(parameter_number);
+  request->set_name(name);
+  return RunOpAndParseResponse(&op_request);
 }
 
-StatusOr<std::unique_ptr<Shape>> ComputationBuilder::GetShape(
+StatusOr<std::unique_ptr<Shape>> ComputationBuilder::GetShapeWithoutNoteError(
     const ComputationDataHandle& operand) {
-  if (!first_error_.ok()) {
-    return first_error_;
-  }
-
   GetLocalShapeRequest request;
   *request.mutable_computation() = computation_.handle();
   *request.mutable_operand() = operand;
   GetLocalShapeResponse response;
 
   VLOG(2) << "making get-shape request";
-  Status s = client_->stub()->GetLocalShape(&request, &response);
+  TF_RETURN_IF_ERROR(client_->stub()->GetLocalShape(&request, &response));
   VLOG(2) << "done with request";
 
-  if (!s.ok()) {
-    NoteError(s);
-    return first_error_;
-  }
   TF_RET_CHECK(response.has_shape());
   std::unique_ptr<Shape> shape = WrapUnique(response.release_shape());
   TF_RET_CHECK(shape != nullptr);
   return std::move(shape);
 }
 
+StatusOr<std::unique_ptr<Shape>> ComputationBuilder::GetShape(
+    const ComputationDataHandle& operand) {
+  TF_RETURN_IF_ERROR(first_error_);
+
+  auto status_or_shape = GetShapeWithoutNoteError(operand);
+  if (!status_or_shape.ok()) {
+    NoteError(status_or_shape.status());
+    return first_error_;
+  }
+  return status_or_shape;
+}
+
+StatusOr<ProgramShape> ComputationBuilder::GetProgramShape() {
+  TF_RETURN_IF_ERROR(first_error_);
+
+  GetComputationShapeRequest request;
+  *request.mutable_computation() = computation_.handle();
+  GetComputationShapeResponse response;
+
+  VLOG(2) << "making get-program-shape-request";
+  Status status = client_->stub()->GetComputationShape(&request, &response);
+  VLOG(2) << "done with get-program-shape-request";
+
+  if (!status.ok()) {
+    first_error_ = status;
+    return status;
+  }
+
+  TF_RET_CHECK(response.has_program_shape());
+  return std::move(*response.mutable_program_shape());
+}
+
 ComputationDataHandle ComputationBuilder::CheckShape(
     const ComputationDataHandle& operand, const Shape& expected_shape) {
   std::unique_ptr<Shape> actual_shape = GetShape(operand).ConsumeValueOrDie();
@@ -258,30 +278,19 @@ ComputationDataHandle ComputationBuilder::Slice(
     tensorflow::gtl::ArraySlice<int64> start_indices,
     tensorflow::gtl::ArraySlice<int64> limit_indices,
     tensorflow::gtl::ArraySlice<int64> strides) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  SliceRequest request;
-  *request.mutable_operand() = operand;
+  OpRequest op_request;
+  SliceRequest* request = op_request.mutable_slice_request();
+  *request->mutable_operand() = operand;
   for (int64 index : start_indices) {
-    request.add_start_indices(index);
+    request->add_start_indices(index);
   }
   for (int64 index : limit_indices) {
-    request.add_limit_indices(index);
+    request->add_limit_indices(index);
   }
   for (int64 index : strides) {
-    request.add_strides(index);
+    request->add_strides(index);
   }
-  OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_slice_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making slice request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::SliceInDim(
@@ -307,143 +316,78 @@ ComputationDataHandle ComputationBuilder::DynamicSlice(
     const ComputationDataHandle& operand,
     const ComputationDataHandle& start_indices,
     tensorflow::gtl::ArraySlice<int64> slice_sizes) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  DynamicSliceRequest request;
-  *request.mutable_operand() = operand;
-  *request.mutable_start_indices() = start_indices;
+  OpRequest op_request;
+  DynamicSliceRequest* request = op_request.mutable_dynamic_slice_request();
+  *request->mutable_operand() = operand;
+  *request->mutable_start_indices() = start_indices;
   for (int64 index : slice_sizes) {
-    request.add_slice_sizes(index);
+    request->add_slice_sizes(index);
   }
-  OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_dynamic_slice_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making dynamic slice request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::DynamicUpdateSlice(
     const ComputationDataHandle& operand, const ComputationDataHandle& update,
     const ComputationDataHandle& start_indices) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  DynamicUpdateSliceRequest request;
-  *request.mutable_operand() = operand;
-  *request.mutable_update() = update;
-  *request.mutable_start_indices() = start_indices;
   OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_dynamic_update_slice_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making dynamic update slice request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
+  DynamicUpdateSliceRequest* request =
+      op_request.mutable_dynamic_update_slice_request();
+  *request->mutable_operand() = operand;
+  *request->mutable_update() = update;
+  *request->mutable_start_indices() = start_indices;
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::ConcatInDim(
     tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
     int64 dimension) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  ConcatenateRequest request;
+  OpRequest op_request;
+  ConcatenateRequest* request = op_request.mutable_concatenate_request();
   for (const ComputationDataHandle& operand : operands) {
-    *request.add_operands() = operand;
+    *request->add_operands() = operand;
   }
-  request.set_dimension(dimension);
-  OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_concatenate_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making concatenate request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
+  request->set_dimension(dimension);
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::Broadcast(
     const ComputationDataHandle& operand,
     tensorflow::gtl::ArraySlice<int64> broadcast_sizes) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  BroadcastRequest request;
-  *request.mutable_operand() = operand;
+  OpRequest op_request;
+  BroadcastRequest* request = op_request.mutable_broadcast_request();
+  *request->mutable_operand() = operand;
   for (int64 size : broadcast_sizes) {
-    request.add_broadcast_sizes(size);
+    request->add_broadcast_sizes(size);
   }
-  OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_broadcast_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making broadcast request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::Pad(
     const ComputationDataHandle& operand,
     const ComputationDataHandle& padding_value,
     const PaddingConfig& padding_config) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  PadRequest request;
-  *request.mutable_operand() = operand;
-  *request.mutable_padding_value() = padding_value;
-  *request.mutable_padding_config() = padding_config;
   OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_pad_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making pad request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
+  PadRequest* request = op_request.mutable_pad_request();
+  *request->mutable_operand() = operand;
+  *request->mutable_padding_value() = padding_value;
+  *request->mutable_padding_config() = padding_config;
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::Reshape(
     const ComputationDataHandle& operand,
     tensorflow::gtl::ArraySlice<int64> dimensions,
     tensorflow::gtl::ArraySlice<int64> new_sizes) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  ReshapeRequest request;
-  *request.mutable_operand() = operand;
+  OpRequest op_request;
+  ReshapeRequest* request = op_request.mutable_reshape_request();
+  *request->mutable_operand() = operand;
   for (int64 dimension : dimensions) {
-    request.add_dimensions(dimension);
+    request->add_dimensions(dimension);
   }
   for (int64 new_size : new_sizes) {
-    request.add_new_sizes(new_size);
+    request->add_new_sizes(new_size);
   }
-  OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_reshape_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making reshape request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::Reshape(
@@ -455,7 +399,6 @@ ComputationDataHandle ComputationBuilder::Reshape(
 
   StatusOr<std::unique_ptr<Shape>> shape = GetShape(operand);
   if (!shape.ok()) {
-    first_error_ = shape.status();
     return ComputationDataHandle();
   }
   std::vector<int64> dimensions(shape.ValueOrDie()->dimensions().size());
@@ -485,7 +428,6 @@ ComputationDataHandle ComputationBuilder::Collapse(
   // dimensions by the product of their sizes.
   StatusOr<std::unique_ptr<Shape>> shape_or_status = GetShape(operand);
   if (!shape_or_status.ok()) {
-    first_error_ = shape_or_status.status();
     return ComputationDataHandle();
   }
   std::unique_ptr<Shape> original_shape = shape_or_status.ConsumeValueOrDie();
@@ -517,26 +459,11 @@ ComputationDataHandle ComputationBuilder::Collapse(
 
 void ComputationBuilder::Trace(const string& tag,
                                const ComputationDataHandle& operand) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return;
-  }
-
-  TraceRequest request;
-  request.set_tag(tag);
-  *request.mutable_operand() = operand;
   OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_trace_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making trace request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  VLOG(2) << "done with request";
-
-  if (!s.ok()) {
-    NoteError(s);
-  }
+  TraceRequest* request = op_request.mutable_trace_request();
+  request->set_tag(tag);
+  *request->mutable_operand() = operand;
+  RunOpAndNoteError(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::Select(
@@ -547,44 +474,23 @@ ComputationDataHandle ComputationBuilder::Select(
 
 ComputationDataHandle ComputationBuilder::Tuple(
     tensorflow::gtl::ArraySlice<ComputationDataHandle> elements) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  VariadicOpRequest request;
-  request.set_varop(VAROP_TUPLE);
+  OpRequest op_request;
+  VariadicOpRequest* request = op_request.mutable_variadic_op_request();
+  request->set_varop(VAROP_TUPLE);
   for (const ComputationDataHandle& operand : elements) {
-    *request.add_operands() = operand;
+    *request->add_operands() = operand;
   }
-  OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_variadic_op_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making variadic op request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::GetTupleElement(
     const ComputationDataHandle& tuple_data, int64 index) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  GetTupleElementRequest request;
-  *request.mutable_operand() = tuple_data;
-  request.set_index(index);
   OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_get_tuple_element_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making get tuple element op request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
+  GetTupleElementRequest* request =
+      op_request.mutable_get_tuple_element_request();
+  *request->mutable_operand() = tuple_data;
+  request->set_index(index);
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::Eq(
@@ -625,16 +531,33 @@ ComputationDataHandle ComputationBuilder::Lt(
 
 ComputationDataHandle ComputationBuilder::Dot(
     const ComputationDataHandle& lhs, const ComputationDataHandle& rhs) {
-  return BinaryOp(BINOP_DOT, lhs, rhs, /*broadcast_dimensions=*/{});
+  StatusOr<std::unique_ptr<Shape>> lhs_shape_or_status = GetShape(lhs);
+  if (!lhs_shape_or_status.ok()) {
+    return ComputationDataHandle();
+  }
+  std::unique_ptr<Shape> lhs_shape = lhs_shape_or_status.ConsumeValueOrDie();
+
+  DotDimensionNumbers dimension_numbers;
+  dimension_numbers.add_lhs_contracting_dimensions(
+      lhs_shape->dimensions_size() == 1 ? 0 : 1);
+  dimension_numbers.add_rhs_contracting_dimensions(0);
+  return DotGeneral(lhs, rhs, dimension_numbers);
+}
+
+ComputationDataHandle ComputationBuilder::DotGeneral(
+    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+    const DotDimensionNumbers& dimension_numbers) {
+  OpRequest op_request;
+  DotRequest* request = op_request.mutable_dot_request();
+  *request->mutable_lhs() = lhs;
+  *request->mutable_rhs() = rhs;
+  *request->mutable_dimension_numbers() = dimension_numbers;
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::Conv(
     const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
   return ConvWithGeneralDimensions(
       lhs, rhs, window_strides, padding,
       CreateDefaultConvDimensionNumbers(window_strides.size()));
@@ -644,10 +567,6 @@ ComputationDataHandle ComputationBuilder::ConvWithGeneralPadding(
     const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides,
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
   return ConvGeneral(lhs, rhs, window_strides, padding,
                      CreateDefaultConvDimensionNumbers(window_strides.size()));
 }
@@ -715,13 +634,11 @@ ComputationDataHandle ComputationBuilder::ConvWithGeneralDimensions(
 
   StatusOr<std::unique_ptr<Shape>> lhs_shape_or_status = GetShape(lhs);
   if (!lhs_shape_or_status.ok()) {
-    first_error_ = lhs_shape_or_status.status();
     return ComputationDataHandle();
   }
 
   StatusOr<std::unique_ptr<Shape>> rhs_shape_or_status = GetShape(rhs);
   if (!rhs_shape_or_status.ok()) {
-    first_error_ = rhs_shape_or_status.status();
     return ComputationDataHandle();
   }
 
@@ -776,13 +693,11 @@ ComputationDataHandle ComputationBuilder::ConvGeneralDilated(
 
   StatusOr<std::unique_ptr<Shape>> lhs_shape_or_status = GetShape(lhs);
   if (!lhs_shape_or_status.ok()) {
-    first_error_ = lhs_shape_or_status.status();
     return ComputationDataHandle();
   }
 
   StatusOr<std::unique_ptr<Shape>> rhs_shape_or_status = GetShape(rhs);
   if (!rhs_shape_or_status.ok()) {
-    first_error_ = rhs_shape_or_status.status();
     return ComputationDataHandle();
   }
 
@@ -800,122 +715,78 @@ ComputationDataHandle ComputationBuilder::ConvGeneralDilated(
         rhs_shape->dimensions(dimension_numbers.kernel_spatial_dimensions(i));
   }
 
-  ConvolveRequest request;
-  *request.mutable_lhs() = lhs;
-  *request.mutable_rhs() = rhs;
-  *request.mutable_dimension_numbers() = dimension_numbers;
+  OpRequest op_request;
+  ConvolveRequest* request = op_request.mutable_convolve_request();
+  *request->mutable_lhs() = lhs;
+  *request->mutable_rhs() = rhs;
+  *request->mutable_dimension_numbers() = dimension_numbers;
 
   if (!MakeWindow(window_dimensions, window_strides, padding, lhs_dilation,
-                  rhs_dilation, request.mutable_window())) {
+                  rhs_dilation, request->mutable_window())) {
     // Error is recorded in MakeWindow.
     return ComputationDataHandle();
   }
-  OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_convolve_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
 
-  VLOG(2) << "making convolve request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
+  return RunOpAndParseResponse(&op_request);
 }
 
-ComputationDataHandle ComputationBuilder::Infeed(const Shape& shape,
-                                                 const string& config) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
+ComputationDataHandle ComputationBuilder::Fft(
+    const ComputationDataHandle& operand, const FftType fft_type,
+    const tensorflow::gtl::ArraySlice<int64> fft_length) {
+  OpRequest op_request;
+  FftRequest* request = op_request.mutable_fft_request();
+  *request->mutable_operand() = operand;
+  request->set_fft_type(fft_type);
+  for (int64 dim_len : fft_length) {
+    request->add_fft_length(dim_len);
   }
+  return RunOpAndParseResponse(&op_request);
+}
 
-  InfeedRequest request;
-  *request.mutable_shape() = shape;
-  *request.mutable_config() = config;
+ComputationDataHandle ComputationBuilder::Infeed(const Shape& shape,
+                                                 const string& config) {
   OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_infeed_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making infeed op request";
-  Status s = client_->stub()->Op(&op_request, &response);
-
-  return ParseOpResponse(s, &response);
+  InfeedRequest* request = op_request.mutable_infeed_request();
+  *request->mutable_shape() = shape;
+  *request->mutable_config() = config;
+  return RunOpAndParseResponse(&op_request);
 }
 
 void ComputationBuilder::Outfeed(const ComputationDataHandle& operand,
                                  const Shape& shape,
                                  const string& outfeed_config) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return;
-  }
-
-  OutfeedRequest request;
-  request.set_outfeed_config(outfeed_config);
-  *request.mutable_operand() = operand;
-  *request.mutable_shape() = shape;
   OpRequest op_request;
-  *op_request.mutable_outfeed_request() = request;
-  *op_request.mutable_computation() = computation_.handle();
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making outfeed op request";
-  tensorflow::Status s = client_->stub()->Op(&op_request, &response);
-
-  if (!s.ok()) {
-    NoteError(s);
-    return;
-  }
+  OutfeedRequest* request = op_request.mutable_outfeed_request();
+  request->set_outfeed_config(outfeed_config);
+  *request->mutable_operand() = operand;
+  *request->mutable_shape() = shape;
+  RunOpAndNoteError(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::Call(
     const Computation& computation,
     tensorflow::gtl::ArraySlice<ComputationDataHandle> operands) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  CallRequest request;
-  *request.mutable_to_apply() = computation.handle();
+  OpRequest op_request;
+  CallRequest* request = op_request.mutable_call_request();
+  *request->mutable_to_apply() = computation.handle();
   for (const ComputationDataHandle& operand : operands) {
-    *request.add_operands() = operand;
+    *request->add_operands() = operand;
   }
-  OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_call_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making call op request";
-  Status s = client_->stub()->Op(&op_request, &response);
-
-  return ParseOpResponse(s, &response);
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::CustomCall(
     const string& call_target_name,
     tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
     const Shape& shape) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  CustomCallRequest request;
-  request.set_call_target_name(call_target_name);
+  OpRequest op_request;
+  CustomCallRequest* request = op_request.mutable_custom_call_request();
+  request->set_call_target_name(call_target_name);
   for (const ComputationDataHandle& operand : operands) {
-    *request.add_operands() = operand;
+    *request->add_operands() = operand;
   }
-  *request.mutable_shape() = shape;
-  OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_custom_call_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making custom call op request";
-  Status s = client_->stub()->Op(&op_request, &response);
-
-  return ParseOpResponse(s, &response);
+  *request->mutable_shape() = shape;
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::Complex(
@@ -1080,47 +951,25 @@ ComputationDataHandle ComputationBuilder::IsFinite(
 ComputationDataHandle ComputationBuilder::Transpose(
     const ComputationDataHandle& operand,
     tensorflow::gtl::ArraySlice<int64> permutation) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
   OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
   TransposeRequest* request = op_request.mutable_transpose_request();
   *request->mutable_operand() = operand;
   for (int64 dimension : permutation) {
     request->add_dimensions(dimension);
   }
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making transpose request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::Rev(
     const ComputationDataHandle& operand,
     tensorflow::gtl::ArraySlice<int64> dimensions) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  ReverseRequest request;
-  *request.mutable_operand() = operand;
+  OpRequest op_request;
+  ReverseRequest* request = op_request.mutable_reverse_request();
+  *request->mutable_operand() = operand;
   for (int64 dimension : dimensions) {
-    request.add_dimensions(dimension);
+    request->add_dimensions(dimension);
   }
-  OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_reverse_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making reverse op request";
-  Status s = client_->stub()->Op(&op_request, &response);
-
-  return ParseOpResponse(s, &response);
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::Sort(
@@ -1148,24 +997,15 @@ ComputationDataHandle ComputationBuilder::ConvertElementType(
 
   StatusOr<std::unique_ptr<Shape>> shape_status = GetShape(operand);
   if (!shape_status.ok()) {
-    first_error_ = shape_status.status();
     return ComputationDataHandle();
   }
   std::unique_ptr<Shape> original = shape_status.ConsumeValueOrDie();
 
-  ConvertRequest request;
-  *request.mutable_operand() = operand;
-  request.set_new_element_type(new_element_type);
   OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_convert_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making convert request";
-  Status s = client_->stub()->Op(&op_request, &response);
-
-  return ParseOpResponse(s, &response);
+  ConvertRequest* request = op_request.mutable_convert_request();
+  *request->mutable_operand() = operand;
+  request->set_new_element_type(new_element_type);
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::BitcastConvertType(
@@ -1176,24 +1016,15 @@ ComputationDataHandle ComputationBuilder::BitcastConvertType(
 
   StatusOr<std::unique_ptr<Shape>> shape_status = GetShape(operand);
   if (!shape_status.ok()) {
-    first_error_ = shape_status.status();
     return ComputationDataHandle();
   }
   std::unique_ptr<Shape> original = shape_status.ConsumeValueOrDie();
 
-  ConvertRequest request;
-  *request.mutable_operand() = operand;
-  request.set_new_element_type(new_element_type);
   OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_bitcast_convert_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making bitcast convert request";
-  Status s = client_->stub()->Op(&op_request, &response);
-
-  return ParseOpResponse(s, &response);
+  ConvertRequest* request = op_request.mutable_bitcast_convert_request();
+  *request->mutable_operand() = operand;
+  request->set_new_element_type(new_element_type);
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::SquareF32(
@@ -1221,107 +1052,57 @@ ComputationDataHandle ComputationBuilder::Clamp(
 
 ComputationDataHandle ComputationBuilder::UnaryOp(
     UnaryOperation unop, const ComputationDataHandle& operand) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  UnaryOpRequest request;
-  request.set_unop(unop);
-  *request.mutable_operand() = operand;
   OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_unary_op_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making unop request";
-  Status s = client_->stub()->Op(&op_request, &response);
-
-  return ParseOpResponse(s, &response);
+  UnaryOpRequest* request = op_request.mutable_unary_op_request();
+  request->set_unop(unop);
+  *request->mutable_operand() = operand;
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::BinaryOp(
     BinaryOperation binop, const ComputationDataHandle& lhs,
     const ComputationDataHandle& rhs,
     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  BinaryOpRequest request;
-  request.set_binop(binop);
-  *request.mutable_lhs() = lhs;
-  *request.mutable_rhs() = rhs;
+  OpRequest op_request;
+  BinaryOpRequest* request = op_request.mutable_binary_op_request();
+  request->set_binop(binop);
+  *request->mutable_lhs() = lhs;
+  *request->mutable_rhs() = rhs;
   for (int64 dimension : broadcast_dimensions) {
-    request.add_broadcast_dimensions(dimension);
+    request->add_broadcast_dimensions(dimension);
   }
-  OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_binary_op_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making binop request";
-  Status s = client_->stub()->Op(&op_request, &response);
-
-  return ParseOpResponse(s, &response);
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::RngOp(
     RandomDistribution distribution,
     tensorflow::gtl::ArraySlice<ComputationDataHandle> parameters,
     const Shape& shape) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  RngRequest request;
-  request.set_distribution(distribution);
+  OpRequest op_request;
+  RngRequest* request = op_request.mutable_rng_request();
+  request->set_distribution(distribution);
   for (const ComputationDataHandle& param : parameters) {
-    *request.add_parameter() = param;
+    *request->add_parameter() = param;
   }
-  *request.mutable_shape() = shape;
-  OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_rng_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making rngop request";
-  Status s = client_->stub()->Op(&op_request, &response);
-
-  return ParseOpResponse(s, &response);
+  *request->mutable_shape() = shape;
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::TernaryOp(
     TernaryOperation triop, const ComputationDataHandle& lhs,
     const ComputationDataHandle& rhs, const ComputationDataHandle& ehs) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  TernaryOpRequest request;
-  request.set_triop(triop);
-  *request.mutable_lhs() = lhs;
-  *request.mutable_rhs() = rhs;
-  *request.mutable_ehs() = ehs;
   OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_ternary_op_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making triop request";
-  Status s = client_->stub()->Op(&op_request, &response);
-
-  return ParseOpResponse(s, &response);
+  TernaryOpRequest* request = op_request.mutable_ternary_op_request();
+  request->set_triop(triop);
+  *request->mutable_lhs() = lhs;
+  *request->mutable_rhs() = rhs;
+  *request->mutable_ehs() = ehs;
+  return RunOpAndParseResponse(&op_request);
 }
 
 Status ComputationBuilder::SetReturnValue(
     const ComputationDataHandle& operand) {
-  if (!first_error_.ok()) {
-    return first_error_;
-  }
+  TF_RETURN_IF_ERROR(first_error_);
 
   SetReturnValueRequest request;
   *request.mutable_computation() = computation_.handle();
@@ -1343,9 +1124,7 @@ Status ComputationBuilder::SetReturnValue(
 
 StatusOr<bool> ComputationBuilder::IsConstant(
     const ComputationDataHandle& operand, int64 num_parameters) {
-  if (!first_error_.ok()) {
-    return first_error_;
-  }
+  TF_RETURN_IF_ERROR(first_error_);
 
   IsConstantRequest request;
   *request.mutable_computation() = computation_.handle();
@@ -1366,9 +1145,7 @@ StatusOr<bool> ComputationBuilder::IsConstant(
 StatusOr<std::unique_ptr<Literal>> ComputationBuilder::ComputeConstant(
     const ComputationDataHandle& operand, const Layout* output_layout,
     tensorflow::gtl::ArraySlice<Literal> parameters) {
-  if (!first_error_.ok()) {
-    return first_error_;
-  }
+  TF_RETURN_IF_ERROR(first_error_);
 
   ComputeConstantRequest request;
   *request.mutable_computation() = computation_.handle();
@@ -1397,7 +1174,7 @@ StatusOr<std::unique_ptr<Literal>> ComputationBuilder::ComputeConstant(
         "no computed literal in the provided response in ComputeConstant "
         "request");
   }
-  return MakeUnique<Literal>(response.literal());
+  return Literal::CreateFromProto(response.literal());
 }
 
 ComputationDataHandle ComputationBuilder::Map(
@@ -1405,30 +1182,19 @@ ComputationDataHandle ComputationBuilder::Map(
     const Computation& computation,
     tensorflow::gtl::ArraySlice<int64> dimensions,
     tensorflow::gtl::ArraySlice<ComputationDataHandle> static_operands) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  MapRequest request;
+  OpRequest op_request;
+  MapRequest* request = op_request.mutable_map_request();
   for (const ComputationDataHandle& operand : operands) {
-    *request.add_operands() = operand;
+    *request->add_operands() = operand;
   }
-  *request.mutable_to_apply() = computation.handle();
+  *request->mutable_to_apply() = computation.handle();
   for (int64 dimension : dimensions) {
-    request.add_dimensions(dimension);
+    request->add_dimensions(dimension);
   }
   for (const ComputationDataHandle& sop : static_operands) {
-    *request.add_static_operands() = sop;
+    *request->add_static_operands() = sop;
   }
-  OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_map_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making Map request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::RngNormal(
@@ -1443,57 +1209,46 @@ ComputationDataHandle ComputationBuilder::RngUniform(
   return RngOp(RandomDistribution::RNG_UNIFORM, {a, b}, shape);
 }
 
-ComputationDataHandle ComputationBuilder::RngBernoulli(
-    const ComputationDataHandle& mean, const Shape& shape) {
-  return RngOp(RandomDistribution::RNG_BERNOULLI, {mean}, shape);
-}
-
 ComputationDataHandle ComputationBuilder::While(
     const Computation& condition, const Computation& body,
     const ComputationDataHandle& init) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  WhileRequest request;
-  *request.mutable_condition() = condition.handle();
-  *request.mutable_body() = body.handle();
-  *request.mutable_init() = init;
   OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_while_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making while request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
+  WhileRequest* request = op_request.mutable_while_request();
+  *request->mutable_condition() = condition.handle();
+  *request->mutable_body() = body.handle();
+  *request->mutable_init() = init;
+  return RunOpAndParseResponse(&op_request);
+}
+
+ComputationDataHandle ComputationBuilder::Conditional(
+    const ComputationDataHandle& predicate,
+    const ComputationDataHandle& true_operand,
+    const Computation& true_computation,
+    const ComputationDataHandle& false_operand,
+    const Computation& false_computation) {
+  OpRequest op_request;
+  ConditionalRequest* request = op_request.mutable_conditional_request();
+  *request->mutable_predicate() = predicate;
+  *request->mutable_true_operand() = true_operand;
+  *request->mutable_true_computation() = true_computation.handle();
+  *request->mutable_false_operand() = false_operand;
+  *request->mutable_false_computation() = false_computation.handle();
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::Reduce(
     const ComputationDataHandle& operand,
     const ComputationDataHandle& init_value, const Computation& computation,
     tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  ReduceRequest request;
-  *request.mutable_operand() = operand;
-  *request.mutable_init_value() = init_value;
+  OpRequest op_request;
+  ReduceRequest* request = op_request.mutable_reduce_request();
+  *request->mutable_operand() = operand;
+  *request->mutable_init_value() = init_value;
   for (int64 dimension : dimensions_to_reduce) {
-    request.add_dimensions(dimension);
+    request->add_dimensions(dimension);
   }
-  *request.mutable_to_apply() = computation.handle();
-  OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_reduce_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making reduce request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
+  *request->mutable_to_apply() = computation.handle();
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::ReduceAll(
@@ -1505,7 +1260,6 @@ ComputationDataHandle ComputationBuilder::ReduceAll(
 
   StatusOr<std::unique_ptr<Shape>> shape = GetShape(operand);
   if (!shape.ok()) {
-    first_error_ = shape.status();
     return ComputationDataHandle();
   }
 
@@ -1525,7 +1279,6 @@ ComputationDataHandle ComputationBuilder::ReduceWindow(
 
   StatusOr<std::unique_ptr<Shape>> shape = GetShape(operand);
   if (!shape.ok()) {
-    first_error_ = shape.status();
     return ComputationDataHandle();
   }
 
@@ -1551,84 +1304,50 @@ ComputationDataHandle ComputationBuilder::ReduceWindowWithGeneralPadding(
     tensorflow::gtl::ArraySlice<int64> window_dimensions,
     tensorflow::gtl::ArraySlice<int64> window_strides,
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  ReduceWindowRequest request;
-  *request.mutable_operand() = operand;
-  *request.mutable_to_apply() = computation.handle();
-  *request.mutable_init_value() = init_value;
+  OpRequest op_request;
+  ReduceWindowRequest* request = op_request.mutable_reduce_window_request();
+  *request->mutable_operand() = operand;
+  *request->mutable_to_apply() = computation.handle();
+  *request->mutable_init_value() = init_value;
 
   if (!MakeWindow(window_dimensions, window_strides, padding, {}, {},
-                  request.mutable_window())) {
+                  request->mutable_window())) {
     NoteError(InternalError("failed to make window"));
     return ComputationDataHandle();
   }
-  OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_reduce_window_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
 
-  VLOG(2) << "making reduce-window request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::BatchNormTraining(
     const ComputationDataHandle& operand, const ComputationDataHandle& scale,
     const ComputationDataHandle& offset, float epsilon, int64 feature_index) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-  BatchNormTrainingRequest request;
-  *request.mutable_operand() = operand;
-  *request.mutable_scale() = scale;
-  *request.mutable_offset() = offset;
-  request.set_epsilon(epsilon);
-  request.set_feature_index(feature_index);
-
   OpRequest op_request;
-  *op_request.mutable_batch_norm_training_request() = request;
-  *op_request.mutable_computation() = computation_.handle();
-  AddCommonFieldsToOpRequest(&op_request);
-
-  OpResponse response;
-
-  VLOG(2) << "making BatchNormTraining request";
-
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
+  BatchNormTrainingRequest* request =
+      op_request.mutable_batch_norm_training_request();
+  *request->mutable_operand() = operand;
+  *request->mutable_scale() = scale;
+  *request->mutable_offset() = offset;
+  request->set_epsilon(epsilon);
+  request->set_feature_index(feature_index);
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::BatchNormInference(
     const ComputationDataHandle& operand, const ComputationDataHandle& scale,
     const ComputationDataHandle& offset, const ComputationDataHandle& mean,
     const ComputationDataHandle& variance, float epsilon, int64 feature_index) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-  BatchNormInferenceRequest request;
-  *request.mutable_operand() = operand;
-  *request.mutable_scale() = scale;
-  *request.mutable_offset() = offset;
-  *request.mutable_mean() = mean;
-  *request.mutable_variance() = variance;
-  request.set_epsilon(epsilon);
-  request.set_feature_index(feature_index);
-
   OpRequest op_request;
-  *op_request.mutable_batch_norm_inference_request() = request;
-  *op_request.mutable_computation() = computation_.handle();
-  AddCommonFieldsToOpRequest(&op_request);
-
-  OpResponse response;
-
-  VLOG(2) << "making BatchNormInference request";
-
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
+  BatchNormInferenceRequest* request =
+      op_request.mutable_batch_norm_inference_request();
+  *request->mutable_operand() = operand;
+  *request->mutable_scale() = scale;
+  *request->mutable_offset() = offset;
+  *request->mutable_mean() = mean;
+  *request->mutable_variance() = variance;
+  request->set_epsilon(epsilon);
+  request->set_feature_index(feature_index);
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::BatchNormGrad(
@@ -1636,49 +1355,25 @@ ComputationDataHandle ComputationBuilder::BatchNormGrad(
     const ComputationDataHandle& mean, const ComputationDataHandle& var,
     const ComputationDataHandle& grad_output, float epsilon,
     int64 feature_index) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-  BatchNormGradRequest request;
-  *request.mutable_operand() = operand;
-  *request.mutable_scale() = scale;
-  *request.mutable_mean() = mean;
-  *request.mutable_variance() = var;
-  *request.mutable_grad_output() = grad_output;
-  request.set_epsilon(epsilon);
-  request.set_feature_index(feature_index);
-
   OpRequest op_request;
-  *op_request.mutable_batch_norm_grad_request() = request;
-  *op_request.mutable_computation() = computation_.handle();
-  AddCommonFieldsToOpRequest(&op_request);
-
-  OpResponse response;
-
-  VLOG(2) << "making BatchNormGrad request";
-
-  Status s = client_->stub()->Op(&op_request, &response);
-
-  return ParseOpResponse(s, &response);
+  BatchNormGradRequest* request = op_request.mutable_batch_norm_grad_request();
+  *request->mutable_operand() = operand;
+  *request->mutable_scale() = scale;
+  *request->mutable_mean() = mean;
+  *request->mutable_variance() = var;
+  *request->mutable_grad_output() = grad_output;
+  request->set_epsilon(epsilon);
+  request->set_feature_index(feature_index);
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::CrossReplicaSum(
     const ComputationDataHandle& operand) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  CrossReplicaSumRequest request;
-  *request.mutable_operand() = operand;
   OpRequest op_request;
-  *op_request.mutable_cross_replica_sum_request() = request;
-  *op_request.mutable_computation() = computation_.handle();
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making cross-replica-sum request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
+  CrossReplicaSumRequest* request =
+      op_request.mutable_cross_replica_sum_request();
+  *request->mutable_operand() = operand;
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::SelectAndScatter(
@@ -1693,7 +1388,6 @@ ComputationDataHandle ComputationBuilder::SelectAndScatter(
 
   StatusOr<std::unique_ptr<Shape>> shape = GetShape(operand);
   if (!shape.ok()) {
-    first_error_ = shape.status();
     return ComputationDataHandle();
   }
   return SelectAndScatterWithGeneralPadding(
@@ -1710,98 +1404,53 @@ ComputationDataHandle ComputationBuilder::SelectAndScatterWithGeneralPadding(
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
     const ComputationDataHandle& source,
     const ComputationDataHandle& init_value, const Computation& scatter) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  SelectAndScatterRequest request;
-  *request.mutable_operand() = operand;
-  *request.mutable_select() = select.handle();
-  *request.mutable_source() = source;
-  *request.mutable_init_value() = init_value;
-  *request.mutable_scatter() = scatter.handle();
+  OpRequest op_request;
+  SelectAndScatterRequest* request =
+      op_request.mutable_select_and_scatter_request();
+  *request->mutable_operand() = operand;
+  *request->mutable_select() = select.handle();
+  *request->mutable_source() = source;
+  *request->mutable_init_value() = init_value;
+  *request->mutable_scatter() = scatter.handle();
 
   if (!MakeWindow(window_dimensions, window_strides, padding, {}, {},
-                  request.mutable_window())) {
+                  request->mutable_window())) {
     NoteError(InternalError("failed to make window"));
     return ComputationDataHandle();
   }
-  OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_select_and_scatter_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
 
-  VLOG(2) << "making select-and-scatter request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
+  return RunOpAndParseResponse(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::ReducePrecision(
     const ComputationDataHandle& operand, const int exponent_bits,
     const int mantissa_bits) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  ReducePrecisionRequest request;
-  *request.mutable_operand() = operand;
-  request.set_exponent_bits(exponent_bits);
-  request.set_mantissa_bits(mantissa_bits);
   OpRequest op_request;
-  *op_request.mutable_computation() = computation_.handle();
-  *op_request.mutable_reduce_precision_request() = request;
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making reduce-precision request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
+  ReducePrecisionRequest* request =
+      op_request.mutable_reduce_precision_request();
+  *request->mutable_operand() = operand;
+  request->set_exponent_bits(exponent_bits);
+  request->set_mantissa_bits(mantissa_bits);
+  return RunOpAndParseResponse(&op_request);
 }
 
 void ComputationBuilder::Send(const ComputationDataHandle& operand,
                               const ChannelHandle& handle) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return;
-  }
-
-  SendRequest request;
-  *request.mutable_operand() = operand;
-  *request.mutable_channel_handle() = handle;
   OpRequest op_request;
-  *op_request.mutable_send_request() = request;
+  SendRequest* request = op_request.mutable_send_request();
+  *request->mutable_operand() = operand;
+  *request->mutable_channel_handle() = handle;
   *op_request.mutable_computation() = computation_.handle();
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making send request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  VLOG(2) << "done with op request";
-
-  if (!s.ok()) {
-    NoteError(s);
-    return;
-  }
+  RunOpAndNoteError(&op_request);
 }
 
 ComputationDataHandle ComputationBuilder::Recv(const Shape& shape,
                                                const ChannelHandle& handle) {
-  if (!first_error_.ok() || !PrepareComputation().ok()) {
-    return ComputationDataHandle();
-  }
-
-  RecvRequest request;
-  *request.mutable_shape() = shape;
-  *request.mutable_channel_handle() = handle;
   OpRequest op_request;
-  *op_request.mutable_recv_request() = request;
-  *op_request.mutable_computation() = computation_.handle();
-  AddCommonFieldsToOpRequest(&op_request);
-  OpResponse response;
-
-  VLOG(2) << "making recv request";
-  Status s = client_->stub()->Op(&op_request, &response);
-  return ParseOpResponse(s, &response);
+  RecvRequest* request = op_request.mutable_recv_request();
+  *request->mutable_shape() = shape;
+  *request->mutable_channel_handle() = handle;
+  return RunOpAndParseResponse(&op_request);
 }
 
 Computation ComputationBuilder::BuildAndNoteError() {
@@ -1830,13 +1479,6 @@ StatusOr<Computation> ComputationBuilder::Build() {
   return {std::move(computation_)};
 }
 
-void ComputationBuilder::AddCommonFieldsToOpRequest(OpRequest* request) const {
-  *request->mutable_metadata() = metadata_;
-  if (sharding_) {
-    *request->mutable_sharding() = *sharding_;
-  }
-}
-
 /* static */ ConvolutionDimensionNumbers
 ComputationBuilder::CreateDefaultConvDimensionNumbers(int num_spatial_dims) {
   ConvolutionDimensionNumbers dimension_numbers;
diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h
index d2dbbbbebbd5a9386f8841576de33a1fdb767000..7cae91e9e04bba8f28f2348c552a941e4f7a36b4 100644
--- a/tensorflow/compiler/xla/client/computation_builder.h
+++ b/tensorflow/compiler/xla/client/computation_builder.h
@@ -43,59 +43,6 @@ limitations under the License.
 
 namespace xla {
 
-class ShardingBuilder {
- public:
-  // A shaped array used to describe the assignment of tiles to devices.
-  using TileAssignment = Array<int64>;
-
-  // Creates a replicated sharding - replicate a tensor on every device.
-  static OpSharding Replicate() {
-    OpSharding result;
-    result.set_type(OpSharding::Type::OpSharding_Type_REPLICATED);
-    return result;
-  }
-  // Creates a sharding that assigns a tensor to just one device.
-  static OpSharding AssignDevice(int device) {
-    OpSharding result;
-    result.set_type(OpSharding::Type::OpSharding_Type_MAXIMAL);
-    result.add_tile_assignment_dimensions(1);
-    result.add_tile_assignment_devices(device);
-    return result;
-  }
-  // Creates a tiled sharding with the given tile shape and assignment of tiles
-  // to devices.
-  static OpSharding Tile(Shape tile_shape,
-                         const TileAssignment& tile_assignment) {
-    OpSharding result;
-    result.set_type(OpSharding::Type::OpSharding_Type_OTHER);
-    *result.mutable_tile_shape() = tile_shape;
-    for (int64 dim : tile_assignment.dimensions()) {
-      result.add_tile_assignment_dimensions(dim);
-    }
-    for (uint32 device : tile_assignment) {
-      result.add_tile_assignment_devices(device);
-    }
-    return result;
-  }
-  // Creates a sharding in one dimension, with the given tile shape which must
-  // be rank 1 and using devices 0..num_tiles.
-  static OpSharding Tile1D(Shape tile_shape, int64 num_tiles) {
-    OpSharding result;
-    result.set_type(OpSharding::Type::OpSharding_Type_OTHER);
-
-    CHECK_EQ(ShapeUtil::Rank(tile_shape), 1);
-    std::vector<int64> dimensions(1, num_tiles);
-    auto& tile_dimension = (*tile_shape.mutable_dimensions())[0];
-    tile_dimension = CeilOfRatio(static_cast<int64>(tile_dimension), num_tiles);
-    *result.mutable_tile_shape() = tile_shape;
-    result.add_tile_assignment_dimensions(num_tiles);
-    for (int64 i = 0; i < num_tiles; ++i) {
-      result.add_tile_assignment_devices(i);
-    }
-    return result;
-  }
-};
-
 // Wraps an XLA client with a convenient interface for building up
 // computations. Any errors encountered in building up the computation are
 // deferred from being handled until Build() is called.
@@ -120,7 +67,7 @@ class ComputationBuilder {
   // OpMetadata is often applied to a series of XLA HLO instructions. As a
   // result, OpMetadata is set on the Computation Builder. All subsequent
   // instructions generated via this Computation Builder will have the same
-  // OpMetadata attached until a call to ClearOpMetdata.
+  // OpMetadata attached until a call to ClearOpMetadata.
   void SetOpMetadata(const OpMetadata& metadata) { metadata_ = metadata; }
 
   // Clears the HloMetadata state.
@@ -154,6 +101,9 @@ class ComputationBuilder {
   StatusOr<std::unique_ptr<Shape>> GetShape(
       const ComputationDataHandle& operand);
 
+  // Retrieves the (inferred) result for the current computation's shape.
+  StatusOr<ProgramShape> GetProgramShape();
+
   // Checks that the operand has the given expected shape. Returns the operand
   // if yes, fails with a CHECK error if no.
   ComputationDataHandle CheckShape(const ComputationDataHandle& operand,
@@ -393,6 +343,11 @@ class ComputationBuilder {
   ComputationDataHandle Dot(const ComputationDataHandle& lhs,
                             const ComputationDataHandle& rhs);
 
+  // Enqueues a general dot instruction onto the computation.
+  ComputationDataHandle DotGeneral(
+      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+      const DotDimensionNumbers& dimension_numbers);
+
   // Default dimension numbers used for a 2D convolution.
   static constexpr int64 kConvBatchDimension = 0;
   static constexpr int64 kConvFeatureDimension = 1;
@@ -458,14 +413,24 @@ class ComputationBuilder {
       tensorflow::gtl::ArraySlice<int64> rhs_dilation,
       const ConvolutionDimensionNumbers& dimension_numbers);
 
+  // Enqueues an FFT instruction onto the computation, of the given type and
+  // with the given FFT length.
+  ComputationDataHandle Fft(const ComputationDataHandle& operand,
+                            FftType fft_type,
+                            tensorflow::gtl::ArraySlice<int64> fft_length);
+
   // Enqueues an infeed instruction onto the computation, which writes data of
   // the given shape to the infeed buffer of the device.
   ComputationDataHandle Infeed(const Shape& shape, const string& config = "");
 
   // Enqueues an outfeed instruction onto the computation. This instruction
   // generates outgoing data transfers for the given data.
-  void Outfeed(const ComputationDataHandle& operand, const Shape& shape,
-               const string& outfeed_config);
+  //
+  // shape_with_layout communicates the laid out shape that we want to outfeed
+  // -- if !ShapeUtil::Compatible(GetShape(operand), shape_with_layout) an error
+  // will occur.
+  void Outfeed(const ComputationDataHandle& operand,
+               const Shape& shape_with_layout, const string& outfeed_config);
 
   // Enqueues a call instruction onto the computation.
   ComputationDataHandle Call(
@@ -726,16 +691,18 @@ class ComputationBuilder {
                                    const ComputationDataHandle& b,
                                    const Shape& shape);
 
-  // Enqueues a B(1, p) random number generation instruction onto the
-  // computation.
-  ComputationDataHandle RngBernoulli(const ComputationDataHandle& mean,
-                                     const Shape& shape);
-
   // Enqueues a while node onto the computation.
   ComputationDataHandle While(const Computation& condition,
                               const Computation& body,
                               const ComputationDataHandle& init);
 
+  // Enqueues a conditional node onto the computation.
+  ComputationDataHandle Conditional(const ComputationDataHandle& predicate,
+                                    const ComputationDataHandle& true_operand,
+                                    const Computation& true_computation,
+                                    const ComputationDataHandle& false_operand,
+                                    const Computation& false_computation);
+
   // Enqueues a ReducePrecision node onto the computation.
   ComputationDataHandle ReducePrecision(const ComputationDataHandle& operand,
                                         const int exponent_bits,
@@ -751,7 +718,7 @@ class ComputationBuilder {
   ComputationDataHandle Recv(const Shape& shape, const ChannelHandle& handle);
 
   // Returns true if 'operand' is a compile-time constant. A compile-time
-  // constant does not depend on parameters with higher index then
+  // constant does not depend on parameters with index greater than or equal to
   // `num_parameters`, or on stateful operators such as `RngNormal` or `Infeed`.
   // Unlike `ComputeConstant`, `IsConstant` tests whether a computation is a
   // compile-time constant without evaluating the computation.
@@ -811,7 +778,7 @@ class ComputationBuilder {
   // The operand must represent a constant value, which in this case
   // means that it must not statically depend on any parameter of the
   // computation that is being built other then the ones specified on the
-  // paramtere list. The parameters in the list will be indexed by their
+  // parameter list. The parameters in the list will be indexed by their
   // parameter id property so the number of parameters specified should be at
   // least as many as the largest used parameter index.
   //
@@ -870,8 +837,6 @@ class ComputationBuilder {
   Status first_error() const { return first_error_; }
 
  private:
-  using PopulateLiteral = std::function<void(Literal*)>;
-
   // Limited checking of convolution parameters. Returns false on
   // error.
   bool VerifyConvolution(const Shape& lhs_shape, const Shape& rhs_shape,
@@ -890,11 +855,6 @@ class ComputationBuilder {
                   tensorflow::gtl::ArraySlice<int64> rhs_dilation,
                   Window* window);
 
-  // Internal helper method that makes a request for a constant operation -- the
-  // provided function is used to populate the literal before sending the
-  // request.
-  ComputationDataHandle ConstantOp(const PopulateLiteral& populate);
-
   // Internal helper method that does the building for an arbitrary unary op.
   ComputationDataHandle UnaryOp(UnaryOperation binop,
                                 const ComputationDataHandle& operand);
@@ -924,19 +884,28 @@ class ComputationBuilder {
   // This is used before any given operation is enqueued.
   Status PrepareComputation();
 
-  // Helper function for parsing a method response and either returning the
-  // output computation data handle (on success) or a vacuous computation data
-  // handle (on failure).
-  ComputationDataHandle ParseOpResponse(const Status& status,
-                                        OpResponse* response);
-
   // Notes that the error occurred by:
   // * storing it internally and capturing a backtrace if it's the first error
   //   (this deferred value will be produced on the call to Build())
   // * dying if die_immediately_on_error_ is true
   void NoteError(const Status& error);
 
-  void AddCommonFieldsToOpRequest(OpRequest* request) const;
+  // Helper function that runs the given op_request, filling in op_response.
+  // Before the op is run, PrepareComputation is called, and common fields in
+  // the op_request are filled in.
+  Status RunOp(OpRequest* op_request, OpResponse* op_response);
+
+  // Helper function that calls RunOp and calls NoteError on failures.
+  void RunOpAndNoteError(OpRequest* op_request);
+
+  // Helper function that calls RunOp and either returns the output computation
+  // data handle (on success) or a vacuous computation data handle (on failure).
+  ComputationDataHandle RunOpAndParseResponse(OpRequest* op_request);
+
+  // Helper function that implements GetShape without noting errors. This makes
+  // it easier to ensure the real GetShape will note errors on every error path.
+  StatusOr<std::unique_ptr<Shape>> GetShapeWithoutNoteError(
+      const ComputationDataHandle& operand);
 
   string name_;  // Name to use for the built computation.
 
@@ -970,68 +939,66 @@ class ComputationBuilder {
 
 template <typename NativeT>
 ComputationDataHandle ComputationBuilder::ConstantR0(NativeT value) {
-  return ConstantOp([value](Literal* literal) { literal->PopulateR0(value); });
+  return ConstantLiteral(*Literal::CreateR0<NativeT>(value));
 }
 
 template <typename NativeT>
 ComputationDataHandle ComputationBuilder::ConstantR1(
     tensorflow::gtl::ArraySlice<NativeT> values) {
-  return ConstantOp(
-      [&values](Literal* literal) { literal->PopulateR1(values); });
+  return ConstantLiteral(*Literal::CreateR1<NativeT>(values));
 }
 
 template <typename NativeT>
 ComputationDataHandle ComputationBuilder::ConstantR1(int64 length,
                                                      NativeT value) {
-  return ConstantOp([length, value](Literal* literal) {
-    literal->PopulateWithValue(value, {length});
-  });
+  Literal literal(ShapeUtil::MakeShape(
+      primitive_util::NativeToPrimitiveType<NativeT>(), {length}));
+  literal.PopulateWithValue(value);
+  return ConstantLiteral(literal);
 }
 
 inline ComputationDataHandle ComputationBuilder::ConstantR1(
     const tensorflow::core::Bitmap& values) {
-  return ConstantOp(
-      [&values](Literal* literal) { literal->PopulateR1(values); });
+  return ConstantLiteral(*Literal::CreateR1(values));
 }
 
 template <typename NativeT>
 ComputationDataHandle ComputationBuilder::ConstantR2(
     std::initializer_list<std::initializer_list<NativeT>> values) {
-  return ConstantOp(
-      [&values](Literal* literal) { literal->PopulateR2(values); });
+  return ConstantLiteral(*Literal::CreateR2<NativeT>(values));
 }
 
 template <typename NativeT>
 ComputationDataHandle ComputationBuilder::ConstantFromArrayWithLayout(
     const Array<NativeT>& values, const Layout& layout) {
-  return ConstantOp([&values, &layout](Literal* literal) {
-    literal->PopulateFromArrayWithLayout(values, layout);
-  });
+  return ConstantLiteral(
+      *Literal::CreateFromArrayWithLayout<NativeT>(values, layout));
 }
 
 template <typename NativeT>
 ComputationDataHandle ComputationBuilder::ConstantFromArray(
     const Array<NativeT>& values) {
-  return ConstantOp(
-      [&values](Literal* literal) { literal->PopulateFromArray(values); });
+  return ConstantLiteral(*Literal::CreateFromArray<NativeT>(values));
 }
 
 template <typename NativeT>
 ComputationDataHandle ComputationBuilder::ConstantR2FromArray2DWithLayout(
     const Array2D<NativeT>& values, const Layout& layout) {
-  return ConstantFromArrayWithLayout(values, layout);
+  return ConstantLiteral(
+      *Literal::CreateFromArrayWithLayout<NativeT>(values, layout));
 }
 
 template <typename NativeT>
 ComputationDataHandle ComputationBuilder::ConstantR2FromArray2D(
     const Array2D<NativeT>& values) {
-  return ConstantFromArray(values);
+  return ConstantLiteral(*Literal::CreateR2FromArray2D<NativeT>(values));
 }
 
 template <typename NativeT>
 ComputationDataHandle ComputationBuilder::ConstantR3FromArray3DWithLayout(
     const Array3D<NativeT>& values, const Layout& layout) {
-  return ConstantFromArrayWithLayout(values, layout);
+  return ConstantLiteral(
+      *Literal::CreateR3FromArray3DWithLayout<NativeT>(values, layout));
 }
 
 template <typename NativeT>
diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc
new file mode 100644
index 0000000000000000000000000000000000000000..804e34f5e75ce2d153ac7627b94a543fda88e810
--- /dev/null
+++ b/tensorflow/compiler/xla/client/executable_build_options.cc
@@ -0,0 +1,79 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/executable_build_options.h"
+
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+
+namespace xla {
+
+ExecutableBuildOptions& ExecutableBuildOptions::set_device_allocator(
+    DeviceMemoryAllocator* allocator) {
+  device_allocator_ = allocator;
+  return *this;
+}
+
+DeviceMemoryAllocator* ExecutableBuildOptions::device_allocator() const {
+  return device_allocator_;
+}
+
+ExecutableBuildOptions& ExecutableBuildOptions::set_device_ordinal(
+    int device_ordinal) {
+  CHECK_GE(device_ordinal, 0);
+  device_ordinal_ = device_ordinal;
+  return *this;
+}
+
+int ExecutableBuildOptions::device_ordinal() const { return device_ordinal_; }
+
+ExecutableBuildOptions& ExecutableBuildOptions::set_result_layout(
+    const Shape& shape_with_layout) {
+  result_layout_set_ = true;
+  result_layout_ = shape_with_layout;
+  return *this;
+}
+
+const Shape* ExecutableBuildOptions::result_layout() const {
+  return result_layout_set_ ? &result_layout_ : nullptr;
+}
+
+string ExecutableBuildOptions::ToString() const {
+  string result_layout = "nullopt";
+  if (result_layout_set_) {
+    result_layout = ShapeUtil::HumanStringWithLayout(result_layout_);
+  }
+  string generate_hlo_graph = "nullopt";
+  if (generate_hlo_graph_.has_value()) {
+    generate_hlo_graph = generate_hlo_graph_.value();
+  }
+  return tensorflow::strings::Printf(
+      "ExecutableBuildOptions{device_ordinal=%d, result_layout=%s, "
+      "generate_hlo_graph=%s}",
+      device_ordinal_, result_layout.c_str(), generate_hlo_graph.c_str());
+}
+
+ExecutableBuildOptions& ExecutableBuildOptions::set_generate_hlo_graph(
+    string regex) {
+  generate_hlo_graph_ = std::move(regex);
+  return *this;
+}
+
+const tensorflow::gtl::optional<string>&
+ExecutableBuildOptions::generate_hlo_graph() const {
+  return generate_hlo_graph_;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a52dbac9adb155ad9a7d91a8102707f70fe2fbf
--- /dev/null
+++ b/tensorflow/compiler/xla/client/executable_build_options.h
@@ -0,0 +1,74 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_EXECUTABLE_BUILD_OPTIONS_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_EXECUTABLE_BUILD_OPTIONS_H_
+
+#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/gtl/optional.h"
+
+namespace xla {
+
+// Class containing options for building an LocalExecutable with
+// LocalClient::Compile.
+class ExecutableBuildOptions {
+ public:
+  // If set, this is the device to build the computation for. Valid
+  // device_ordinal values are: 0 to # of devices - 1. These values are
+  // identical to the device ordinal values used by StreamExecutor. The built
+  // executable will be executable on any device equivalent to the specified
+  // device as determined by Backend::devices_equivalent(). A value of -1
+  // indicates this option has not been set.
+  ExecutableBuildOptions& set_device_ordinal(int device_ordinal);
+  int device_ordinal() const;
+
+  // If set, this specifies the layout of the result of the computation. If not
+  // set, the service will chose the layout of the result. A Shape is used to
+  // store the layout to accommodate tuple result shapes. A value of nullptr
+  // indicates the option has not been set.
+  ExecutableBuildOptions& set_result_layout(const Shape& shape_with_layout);
+  const Shape* result_layout() const;
+
+  // If set, this specifies an allocator that can be used to allocate temporary
+  // space on the device during compilation.  For example, the compiler might
+  // want to run various algorithms on the device and pick the fastest one -- it
+  // might allocate buffers for use by these algorithms using this allocator.
+  //
+  // This does not need to be the same as the DeviceMemoryAllocator passed when
+  // running the executable.
+  ExecutableBuildOptions& set_device_allocator(
+      DeviceMemoryAllocator* allocator);
+  DeviceMemoryAllocator* device_allocator() const;
+
+  // If set, specifies a regexp of HLO graphs to dump (as in DebugOptions).
+  ExecutableBuildOptions& set_generate_hlo_graph(string regex);
+  const tensorflow::gtl::optional<string>& generate_hlo_graph() const;
+
+  // Returns a string representation of the build options, suitable for
+  // debugging.
+  string ToString() const;
+
+ private:
+  int device_ordinal_ = -1;
+  Shape result_layout_;
+  bool result_layout_set_ = false;
+  tensorflow::gtl::optional<string> generate_hlo_graph_;
+  DeviceMemoryAllocator* device_allocator_ = nullptr;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_EXECUTABLE_BUILD_OPTIONS_H_
diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc
index 5f2b55713e342aa3d0251386d57cb52481fe748d..b63a1465ea755b906853860d47768ecbeaa0dcdd 100644
--- a/tensorflow/compiler/xla/client/lib/testing.cc
+++ b/tensorflow/compiler/xla/client/lib/testing.cc
@@ -31,14 +31,43 @@ limitations under the License.
 namespace xla {
 namespace {
 
+// Calculates the number of bytes required to store the data within the
+// specified shape. In case of a (nested) tuple shape this is the total byte
+// size of all sub-shapes within the tuple.
+int64 DataSizeOfShape(const Shape& shape) {
+  if (ShapeUtil::IsArray(shape)) {
+    return ShapeUtil::ByteSizeOf(shape);
+  }
+
+  int64 total_size = 0;
+  for (const Shape& s : shape.tuple_shapes()) {
+    total_size += DataSizeOfShape(s);
+  }
+  return total_size;
+}
+
+// Create a ComputationDataHandle for an op what generates fake data with the
+// given shape.
+ComputationDataHandle BuildFakeDataOpOnDevice(const Shape& shape,
+                                              ComputationBuilder* builder) {
+  if (ShapeUtil::IsArray(shape)) {
+    return builder->Broadcast(
+        builder->ConstantLiteral(Literal::One(shape.element_type())),
+        AsInt64Slice(shape.dimensions()));
+  }
+  std::vector<ComputationDataHandle> parts;
+  for (const Shape& s : shape.tuple_shapes()) {
+    parts.push_back(BuildFakeDataOpOnDevice(s, builder));
+  }
+  return builder->Tuple(parts);
+}
+
 std::unique_ptr<GlobalData> MakeFakeDataViaDeviceOrDie(const Shape& shape,
                                                        Client* client) {
   ComputationBuilder b(
       client,
       tensorflow::strings::StrCat("make_fake_", ShapeUtil::HumanString(shape)));
-  // TODO(b/26811613): Replace this when RNG is supported on all backends.
-  b.Broadcast(b.ConstantLiteral(Literal::One(shape.element_type())),
-              AsInt64Slice(shape.dimensions()));
+  BuildFakeDataOpOnDevice(shape, &b);
   Computation computation = b.Build().ConsumeValueOrDie();
 
   auto execution_options = CreateDefaultExecutionOptions();
@@ -51,7 +80,7 @@ std::unique_ptr<GlobalData> MakeFakeDataViaDeviceOrDie(const Shape& shape,
 
 std::unique_ptr<GlobalData> MakeFakeDataOrDie(const Shape& shape,
                                               Client* client) {
-  if (ShapeUtil::ByteSizeOf(shape) < (1LL << 20)) {
+  if (DataSizeOfShape(shape) < (1LL << 20)) {
     StatusOr<std::unique_ptr<Literal>> literal_status = MakeFakeLiteral(shape);
     if (!literal_status.ok()) {
       // If we got an Unimplemented error, fall back to making the fake data via
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index b051955f0fd85b7ca886bc0238068aeb94427209..ef98dbb6403beedb0c08ab9a0fc9e7d4ee31ab3b 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -21,30 +21,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/service_executable_run_options.h"
+#include "tensorflow/compiler/xla/service/source_map_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 
 namespace se = ::perftools::gputools;
 
-namespace xla {
-
-ExecutableBuildOptions& ExecutableBuildOptions::set_device_ordinal(
-    int device_ordinal) {
-  device_ordinal_ = device_ordinal;
-  return *this;
-}
+using xla::source_map_util::InvalidParameterArgument;
 
-int ExecutableBuildOptions::device_ordinal() const { return device_ordinal_; }
-
-ExecutableBuildOptions& ExecutableBuildOptions::set_result_layout(
-    const Shape& shape_with_layout) {
-  result_layout_set_ = true;
-  result_layout_ = shape_with_layout;
-  return *this;
-}
-
-const Shape* ExecutableBuildOptions::result_layout() const {
-  return result_layout_set_ ? &result_layout_ : nullptr;
-}
+namespace xla {
 
 namespace {
 StatusOr<Backend::StreamPtr> BorrowStreamForDevice(int device_ordinal,
@@ -57,16 +41,18 @@ StatusOr<Backend::StreamPtr> BorrowStreamForDevice(int device_ordinal,
 }  // namespace
 
 LocalExecutable::LocalExecutable(std::unique_ptr<Executable> executable,
-                                 Backend* backend, int device_ordinal,
-                                 const ExecutableBuildOptions& build_options)
+                                 Backend* backend,
+                                 ExecutableBuildOptions build_options)
     : executable_(std::move(executable)),
       backend_(backend),
-      build_device_ordinal_(device_ordinal),
-      build_options_(build_options) {}
+      build_options_(std::move(build_options)) {
+  CHECK_GE(build_options_.device_ordinal(), 0)
+      << "Must have a valid device ordinal that the executable was built for.";
+}
 
 tensorflow::Status LocalExecutable::ValidateExecutionOptions(
     const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    const ExecutableRunOptions& options, const Backend& backend) {
+    const ExecutableRunOptions& run_options, const Backend& backend) {
   const ComputationLayout& computation_layout =
       executable_->module_config().entry_computation_layout();
 
@@ -78,25 +64,26 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions(
   }
   for (int i = 0; i < arguments.size(); ++i) {
     if (!computation_layout.parameter_layout(i).MatchesLayoutInShape(
-            arguments[i]->shape())) {
-      return InvalidArgument(
-          "argument does not match shape or layout of computation parameter "
-          "%d: expected %s, got %s",
+            arguments[i]->on_host_shape())) {
+      return InvalidParameterArgument(
+          executable_.get(), i,
+          "Argument does not match shape or layout of computation parameter "
+          "%d: want %s, got %s",
           i,
           ShapeUtil::HumanString(computation_layout.parameter_layout(i).shape())
               .c_str(),
-          ShapeUtil::HumanString(arguments[i]->shape()).c_str());
+          ShapeUtil::HumanString(arguments[i]->on_host_shape()).c_str());
     }
   }
 
-  if (options.stream() != nullptr) {
-    if (!options.stream()->ok()) {
+  if (run_options.stream() != nullptr) {
+    if (!run_options.stream()->ok()) {
       return InvalidArgument("stream is uninitialized or in an error state");
     }
 
     // Check stream matches service platform.
     const se::Platform* stream_platform =
-        options.stream()->parent()->platform();
+        run_options.stream()->parent()->platform();
     if (stream_platform != backend_->platform()) {
       return InvalidArgument(
           "stream is for platform %s, but service targets platform %s",
@@ -106,7 +93,7 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions(
 
     // Cannot specify device_ordinal with a stream. The stream determines these
     // values.
-    if (options.device_ordinal() != -1) {
+    if (run_options.device_ordinal() != -1) {
       return InvalidArgument(
           "cannot set both device ordinal and stream options in "
           "ExecutableRunOptions; the stream determines the device ordinal");
@@ -115,34 +102,34 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions(
 
   // Verify that the device the executable was built for is equivalent to the
   // device it will run on.
-  int run_device_ordinal = options.device_ordinal() == -1
+  int run_device_ordinal = run_options.device_ordinal() == -1
                                ? backend_->default_device_ordinal()
-                               : options.device_ordinal();
-  TF_ASSIGN_OR_RETURN(
-      bool devices_equivalent,
-      backend_->devices_equivalent(run_device_ordinal, build_device_ordinal_));
+                               : run_options.device_ordinal();
+  TF_ASSIGN_OR_RETURN(bool devices_equivalent,
+                      backend_->devices_equivalent(
+                          run_device_ordinal, build_options_.device_ordinal()));
   if (!devices_equivalent) {
     TF_ASSIGN_OR_RETURN(se::StreamExecutor * run_executor,
                         backend_->stream_executor(run_device_ordinal));
     TF_ASSIGN_OR_RETURN(se::StreamExecutor * build_executor,
-                        backend_->stream_executor(build_device_ordinal_));
+                        backend_->stream_executor(build_device_ordinal()));
     return InvalidArgument(
         "executable is built for device %s of type \"%s\"; cannot run it on "
         "device %s of type \"%s\"",
-        backend_->device_name(build_device_ordinal_).c_str(),
+        backend_->device_name(build_device_ordinal()).c_str(),
         build_executor->GetDeviceDescription().name().c_str(),
         backend_->device_name(run_device_ordinal).c_str(),
         run_executor->GetDeviceDescription().name().c_str());
   }
 
-  if (!options.allocator()) {
+  if (!run_options.allocator()) {
     return InvalidArgument("an allocator must be provided to ExecuteLocally");
   }
 
-  if (options.allocator()->platform() != backend.platform()) {
+  if (run_options.allocator()->platform() != backend.platform()) {
     return InvalidArgument(
         "allocator platform (%s) does not match service platform (%s)",
-        options.allocator()->platform()->Name().c_str(),
+        run_options.allocator()->platform()->Name().c_str(),
         backend.platform()->Name().c_str());
   }
 
@@ -151,23 +138,22 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions(
 
 StatusOr<std::unique_ptr<ScopedShapedBuffer>> LocalExecutable::Run(
     const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    const ExecutableRunOptions& options) {
-  TF_RETURN_IF_ERROR(ValidateExecutionOptions(arguments, options, *backend_));
-
-  ExecutableRunOptions actual_options = options;
+    ExecutableRunOptions run_options) {
+  TF_RETURN_IF_ERROR(
+      ValidateExecutionOptions(arguments, run_options, *backend_));
 
   Backend::StreamPtr stream;
-  if (options.stream() == nullptr) {
+  if (run_options.stream() == nullptr) {
     // NB!  The lifetime of `stream` needs to match the lifetime of
     // `actual_options` (otherwise we will end up using a returned stream in
     // ExecuteOnStreamWrapper), which is why it isn't declared in the inner "if"
     // scope.
     TF_ASSIGN_OR_RETURN(
-        stream, BorrowStreamForDevice(options.device_ordinal(), backend_));
-    actual_options.set_stream(stream.get());
+        stream, BorrowStreamForDevice(run_options.device_ordinal(), backend_));
+    run_options.set_stream(stream.get());
   }
-  if (options.allocator() == nullptr) {
-    actual_options.set_allocator(backend_->memory_allocator());
+  if (run_options.allocator() == nullptr) {
+    run_options.set_allocator(backend_->memory_allocator());
   }
 
   // For local client execution on CPU backends:
@@ -176,7 +162,7 @@ StatusOr<std::unique_ptr<ScopedShapedBuffer>> LocalExecutable::Run(
   // *) The thread pool used for XLA CPU ops is from
   //    backend_->eigen_intra_op_thread_pool().
   ServiceExecutableRunOptions service_options(
-      actual_options, backend_->StreamBorrower(),
+      run_options, backend_->StreamBorrower(),
       backend_->eigen_intra_op_thread_pool());
 
   if (executable_->dumping()) {
@@ -184,10 +170,9 @@ StatusOr<std::unique_ptr<ScopedShapedBuffer>> LocalExecutable::Run(
   }
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<ShapedBuffer> result,
-      executable_->ExecuteOnStreamWrapper<std::unique_ptr<ShapedBuffer>>(
-          &service_options, options.execution_profile(), arguments));
-  return ScopedShapedBuffer::MakeScoped(result.get(),
-                                        actual_options.allocator());
+      executable_->ExecuteOnStreamWrapper(
+          &service_options, run_options.execution_profile(), arguments));
+  return ScopedShapedBuffer::MakeScoped(result.get(), run_options.allocator());
 }
 
 StatusOr<std::unique_ptr<ScopedShapedBuffer>> LocalExecutable::ExecuteAndDump(
@@ -263,16 +248,19 @@ StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Compile(
     const Computation& computation,
     const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
     const ExecutableBuildOptions& options) {
-  int device_ordinal = options.device_ordinal() == -1
-                           ? default_device_ordinal()
-                           : options.device_ordinal();
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
-                      local_service_->CompileExecutable(
-                          computation.handle(), argument_layouts,
-                          options.result_layout(), device_ordinal));
+  ExecutableBuildOptions updated_options = options;
+  if (options.device_ordinal() == -1) {
+    updated_options.set_device_ordinal(default_device_ordinal());
+    VLOG(3) << "Set device ordinal to default value of: "
+            << updated_options.device_ordinal();
+  }
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<Executable> executable,
+      local_service_->CompileExecutable(computation.handle(), argument_layouts,
+                                        updated_options));
   return WrapUnique(new LocalExecutable(std::move(executable),
                                         local_service_->mutable_backend(),
-                                        device_ordinal, options));
+                                        updated_options));
 }
 
 StatusOr<std::unique_ptr<ScopedShapedBuffer>>
@@ -281,13 +269,9 @@ LocalClient::LiteralToShapedBuffer(const Literal& literal, int device_ordinal,
   if (allocator == nullptr) {
     allocator = backend().memory_allocator();
   }
-  TF_ASSIGN_OR_RETURN(
-      auto scoped_buffer,
-      ScopedShapedBuffer::Allocate(
-          literal.shape(), allocator, device_ordinal,
-          [this](const Shape& shape) {
-            return backend().transfer_manager()->GetByteSizeRequirement(shape);
-          }));
+  TF_ASSIGN_OR_RETURN(auto scoped_buffer,
+                      backend().transfer_manager()->AllocateScopedShapedBuffer(
+                          literal.shape(), allocator, device_ordinal));
   TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
                       backend().stream_executor(device_ordinal));
   TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
@@ -322,4 +306,8 @@ StatusOr<std::unique_ptr<Literal>> LocalClient::TransferFromOutfeedLocal(
   return std::move(literal);
 }
 
+StatusOr<int> LocalClient::ReplicaNumberToDeviceOrdinal(int replica_number) {
+  return local_service_->ReplicaNumberToDeviceOrdinal(replica_number);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index 3ca0d2ef5513cfb6b0dbfbc63b311f81a318356e..b52a30f5a0b92e0094e6b0de3241c10a5a909cad 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/computation.h"
+#include "tensorflow/compiler/xla/client/executable_build_options.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
@@ -33,39 +34,13 @@ limitations under the License.
 
 namespace xla {
 
-// Class containing options for building an LocalExecutable with
-// LocalClient::Compile.
-class ExecutableBuildOptions {
- public:
-  // If set, this is the device to build the computation for. Valid
-  // device_ordinal values are: 0 to # of devices - 1. These values are
-  // identical to the device ordinal values used by StreamExecutor. The built
-  // executable will be executable on any device equivalent to the specified
-  // device as determined by Backend::devices_equivalent(). A value of -1
-  // indicates this option has not been set.
-  ExecutableBuildOptions& set_device_ordinal(int device_ordinal);
-  int device_ordinal() const;
-
-  // If set, this specifies the layout of the result of the computation. If not
-  // set, the service will chose the layout of the result. A Shape is used to
-  // store the layout to accommodate tuple result shapes. A value of nullptr
-  // indicates the option has not been set.
-  ExecutableBuildOptions& set_result_layout(const Shape& shape_with_layout);
-  const Shape* result_layout() const;
-
- private:
-  int device_ordinal_ = -1;
-  Shape result_layout_;
-  bool result_layout_set_ = false;
-};
-
 class LocalExecutable {
  public:
   // Run the compiled computation with the given arguments and options and
   // return the result.
   StatusOr<std::unique_ptr<ScopedShapedBuffer>> Run(
       const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      const ExecutableRunOptions& options);
+      ExecutableRunOptions run_options);
 
   // Return the layout (contained in a shape) of the result produced by the
   // computation.
@@ -88,8 +63,7 @@ class LocalExecutable {
 
   // Constructor invoked by LocalClient.
   LocalExecutable(std::unique_ptr<Executable> executable, Backend* backend,
-                  int device_ordinal,
-                  const ExecutableBuildOptions& build_options);
+                  ExecutableBuildOptions build_options);
 
   // Validates that the given arguments and options satisfy various constraints
   // of the computation.
@@ -117,19 +91,19 @@ class LocalExecutable {
   StatusOr<std::unique_ptr<Literal>> LiteralFromShapedBuffer(
       const ShapedBuffer& shaped_buffer);
 
+  // The ordinal of the device which this executable was compiled for. The
+  // executable can run on all equivalent devices (as determined by
+  // Backend::devices_equivalent).
+  int build_device_ordinal() const { return build_options_.device_ordinal(); }
+
   // Compiled computation.
   std::unique_ptr<Executable> executable_;
 
   // Execution backend.
-  Backend* backend_;
-
-  // The ordinal of the device which this executable was compiled for. The
-  // executable can run on all equivalent devices (as determined by
-  // Backend::devices_equivalent).
-  int build_device_ordinal_;
+  Backend* backend_ = nullptr;
 
   // Options used to build the executable.
-  const ExecutableBuildOptions& build_options_;
+  const ExecutableBuildOptions build_options_;
 };
 
 // An XLA Client specialization for use when the client and service run in
@@ -176,6 +150,13 @@ class LocalClient : public Client {
   StatusOr<std::unique_ptr<Literal>> TransferFromOutfeedLocal(
       const Shape& shape, int device_ordinal);
 
+  // Returns the device ordinal that corresponds to the given replica number.
+  //
+  // This returns an error if there is not a one-to-one correspondence of
+  // replicas to device ordinals, but is useful as a short term mechanism for
+  // the "easy" case where a single replica is a single device.
+  StatusOr<int> ReplicaNumberToDeviceOrdinal(int replica_number);
+
   // Returns the platform that the underlying service targets.
   perftools::gputools::Platform* platform() const;
 
diff --git a/tensorflow/compiler/xla/client/sharding_builder.cc b/tensorflow/compiler/xla/client/sharding_builder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..176802b33ef824a1f898255a19e44def3c1fc982
--- /dev/null
+++ b/tensorflow/compiler/xla/client/sharding_builder.cc
@@ -0,0 +1,76 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/sharding_builder.h"
+
+namespace xla {
+namespace sharding_builder {
+
+OpSharding Replicate() {
+  OpSharding result;
+  result.set_type(OpSharding::Type::OpSharding_Type_REPLICATED);
+  return result;
+}
+
+OpSharding AssignDevice(int device) {
+  OpSharding result;
+  result.set_type(OpSharding::Type::OpSharding_Type_MAXIMAL);
+  result.add_tile_assignment_dimensions(1);
+  result.add_tile_assignment_devices(device);
+  return result;
+}
+
+OpSharding Tile(const Shape& tile_shape,
+                const TileAssignment& tile_assignment) {
+  OpSharding result;
+  result.set_type(OpSharding::Type::OpSharding_Type_OTHER);
+  *result.mutable_tile_shape() = tile_shape;
+  for (int64 dim : tile_assignment.dimensions()) {
+    result.add_tile_assignment_dimensions(dim);
+  }
+  for (uint32 device : tile_assignment) {
+    result.add_tile_assignment_devices(device);
+  }
+  return result;
+}
+
+OpSharding Tile1D(const Shape& tile_shape, int64 num_tiles) {
+  OpSharding result;
+  result.set_type(OpSharding::Type::OpSharding_Type_OTHER);
+
+  CHECK_EQ(ShapeUtil::Rank(tile_shape), 1);
+  std::vector<int64> dimensions(1, num_tiles);
+  *result.mutable_tile_shape() = tile_shape;
+  auto& tile_dimension =
+      (*result.mutable_tile_shape()->mutable_dimensions())[0];
+  tile_dimension = CeilOfRatio(static_cast<int64>(tile_dimension), num_tiles);
+  result.add_tile_assignment_dimensions(num_tiles);
+  for (int64 i = 0; i < num_tiles; ++i) {
+    result.add_tile_assignment_devices(i);
+  }
+  return result;
+}
+
+OpSharding Tuple(const ShapeTree<OpSharding>& shardings) {
+  OpSharding result;
+  result.set_type(OpSharding::Type::OpSharding_Type_TUPLE);
+  for (const auto& index_to_sharding : shardings.leaves()) {
+    *result.add_tuple_shardings() = index_to_sharding.second;
+  }
+  return result;
+}
+
+}  // namespace sharding_builder
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/sharding_builder.h b/tensorflow/compiler/xla/client/sharding_builder.h
new file mode 100644
index 0000000000000000000000000000000000000000..34763e54d946690289ff42a7712b980168933eee
--- /dev/null
+++ b/tensorflow/compiler/xla/client/sharding_builder.h
@@ -0,0 +1,59 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_SHARDING_BUILDER_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_SHARDING_BUILDER_H_
+
+#include <vector>
+
+#include "tensorflow/compiler/xla/array.h"
+#include "tensorflow/compiler/xla/shape_tree.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+namespace sharding_builder {
+// A shaped array used to describe the assignment of tiles to devices.
+using TileAssignment = Array<int64>;
+
+// Creates a replicated sharding - replicate a tensor on every device.
+OpSharding Replicate();
+
+// Creates a sharding that assigns a tensor to just one device.
+OpSharding AssignDevice(int device);
+
+// Creates a tiled sharding with the given tile shape and assignment of tiles
+// to devices.
+//
+// If tile_shape is not evenly divisible by the number of devices in
+// tile_assignment, operations behave as if implicit padding had been inserted.
+// The value of this padding is undefined.
+OpSharding Tile(const Shape& tile_shape, const TileAssignment& tile_assignment);
+
+// Creates a sharding in one dimension, with the given tile shape which must
+// be rank 1 and using devices [0..num_tiles).
+//
+// This is simply a convenience wrapper for Tile().
+OpSharding Tile1D(const Shape& tile_shape, int64 num_tiles);
+
+// Creates a tuple sharding from the given ShapeTree of element shardings.
+OpSharding Tuple(const ShapeTree<OpSharding>& shardings);
+
+}  // namespace sharding_builder
+}  // namespace xla
+
+#endif
diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc
index 33d5b6f1d4d15d5143a3421c87eab9b7a7d11345..392ad9010ab81923a089c7b00a79ddc281af92bb 100644
--- a/tensorflow/compiler/xla/executable_run_options.cc
+++ b/tensorflow/compiler/xla/executable_run_options.cc
@@ -83,7 +83,7 @@ ExecutableRunOptions& ExecutableRunOptions::set_device_assignment(
   return *this;
 }
 
-DeviceAssignment* ExecutableRunOptions::device_assignment() const {
+const DeviceAssignment* ExecutableRunOptions::device_assignment() const {
   return device_assignment_;
 }
 
diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h
index deb3ddb203d263d25bef0499a8a53a6098d0de0c..d4fcbf0493c936ebcd0639a432e56b62ee15672c 100644
--- a/tensorflow/compiler/xla/executable_run_options.h
+++ b/tensorflow/compiler/xla/executable_run_options.h
@@ -82,7 +82,7 @@ class ExecutableRunOptions {
 
   ExecutableRunOptions& set_device_assignment(
       DeviceAssignment* device_assignment);
-  DeviceAssignment* device_assignment() const;
+  const DeviceAssignment* device_assignment() const;
 
  private:
   DeviceMemoryAllocator* allocator_ = nullptr;
diff --git a/tensorflow/compiler/xla/execution_options_util.h b/tensorflow/compiler/xla/execution_options_util.h
index 562da78e837ea6c4a01f0d1170797340fd421ad8..a8ca27ec8dfdc01267ccc9efa6c39093c43d4e2d 100644
--- a/tensorflow/compiler/xla/execution_options_util.h
+++ b/tensorflow/compiler/xla/execution_options_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_EXECUTION_OPTIONS_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_EXECUTION_OPTIONS_UTIL_H_
+#ifndef TENSORFLOW_COMPILER_XLA_EXECUTION_OPTIONS_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_EXECUTION_OPTIONS_UTIL_H_
 
 #include "tensorflow/compiler/xla/xla.pb.h"
 
@@ -26,4 +26,4 @@ ExecutionOptions CreateDefaultExecutionOptions();
 
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_EXECUTION_OPTIONS_UTIL_H_
+#endif  // TENSORFLOW_COMPILER_XLA_EXECUTION_OPTIONS_UTIL_H_
diff --git a/tensorflow/compiler/xla/index_util.cc b/tensorflow/compiler/xla/index_util.cc
index 76c0168f370ff1f0749759705b7ecff359a80341..ffd1fb79e986f82e1c2721f0eefbf3b4c0838e41 100644
--- a/tensorflow/compiler/xla/index_util.cc
+++ b/tensorflow/compiler/xla/index_util.cc
@@ -78,7 +78,7 @@ namespace xla {
   int64 scale = 1;
   int64 linear_index = 0;
   bool first = true;
-  for (auto dimension : shape.layout().minor_to_major()) {
+  for (auto dimension : LayoutUtil::MinorToMajor(shape)) {
     if (first) {
       // Avoid two multiplies on the first loop iteration
       linear_index = multi_index[dimension];
@@ -110,7 +110,7 @@ namespace xla {
 
   // Accumulated product D{L(0)} * D{L(1)} * ...
   int64 divisor = 1;
-  for (auto dimension : shape.layout().minor_to_major()) {
+  for (auto dimension : LayoutUtil::MinorToMajor(shape)) {
     multi_index[dimension] =
         (linear_index / divisor) % shape.dimensions(dimension);
     divisor *= shape.dimensions(dimension);
@@ -133,21 +133,49 @@ namespace xla {
 
 /* static */ int64 IndexUtil::GetDimensionStride(const Shape& shape,
                                                  int64 dimension) {
-  const Layout& layout = shape.layout();
-  int64 pdim_size = layout.padded_dimensions_size();
+  int64 pdim_size = LayoutUtil::PaddedDimensions(shape).size();
   int64 stride = 1;
   DCHECK(pdim_size == 0 || pdim_size == shape.dimensions_size());
-  for (auto dim : layout.minor_to_major()) {
+  for (auto dim : LayoutUtil::MinorToMajor(shape)) {
     if (dim == dimension) {
       break;
     }
     if (pdim_size == 0) {
       stride *= shape.dimensions(dim);
     } else {
-      stride *= layout.padded_dimensions(dim);
+      stride *= LayoutUtil::PaddedDimension(shape, dim);
     }
   }
   return stride;
 }
 
+/* static */ bool IndexUtil::IndexInBounds(
+    const Shape& shape, tensorflow::gtl::ArraySlice<int64> index) {
+  int64 rank = ShapeUtil::Rank(shape);
+  if (rank != index.size()) {
+    return false;
+  }
+  for (int64 d = 0; d < rank; ++d) {
+    if (index[d] >= shape.dimensions(d)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+/* static */ int IndexUtil::CompareIndices(
+    tensorflow::gtl::ArraySlice<int64> lhs,
+    tensorflow::gtl::ArraySlice<int64> rhs) {
+  int64 rank = lhs.size();
+  CHECK_EQ(rhs.size(), rank);
+  for (int64 dim = 0; dim < rank; ++dim) {
+    if (lhs[dim] < rhs[dim]) {
+      return -1;
+    } else if (lhs[dim] > rhs[dim]) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/index_util.h b/tensorflow/compiler/xla/index_util.h
index c9838966a5b67397eb5fc4afe3ab9d98e82eb2b1..142006f2626e83d3254f2de65fc28fd5d6694e53 100644
--- a/tensorflow/compiler/xla/index_util.h
+++ b/tensorflow/compiler/xla/index_util.h
@@ -37,7 +37,7 @@ class IndexUtil {
   static int64 MultidimensionalIndexToLinearIndex(
       const Shape& shape, tensorflow::gtl::ArraySlice<int64> multi_index);
 
-  // Coverts a linear index into multidimensional index (eg {x, y, z}) based on
+  // Converts a linear index into multidimensional index (eg {x, y, z}) based on
   // the shape and its layout. The first index in the returned multidimensional
   // index is dimension 0.
   static std::vector<int64> LinearIndexToMultidimensionalIndex(
@@ -69,6 +69,18 @@ class IndexUtil {
   //    sizeof(dimension(3)) * sizeof(dimension(2)) == 4 * 10
   static int64 GetDimensionStride(const Shape& shape, int64 dimension);
 
+  // Returns true iff the given multi-index is contained in the bounds for the
+  // shape.
+  static bool IndexInBounds(const Shape& shape,
+                            tensorflow::gtl::ArraySlice<int64> index);
+
+  // Compares the given indices in lexicographic order.  lhs[0] and rhs[0] are
+  // compared first, and lhs[rank-1] and rhs[rank-1] last.  If lhs is larger,
+  // then -1 is returned. If rhs is larger, then 1 is returned.  Otherwise, 0 is
+  // returned.
+  static int CompareIndices(tensorflow::gtl::ArraySlice<int64> lhs,
+                            tensorflow::gtl::ArraySlice<int64> rhs);
+
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(IndexUtil);
 };
diff --git a/tensorflow/compiler/xla/iterator_util.h b/tensorflow/compiler/xla/iterator_util.h
index a39999705eddc5728dce028dab64b7358395757e..a8bb8c7a7e6784e555f4e9dad73ecc78c668ac42 100644
--- a/tensorflow/compiler/xla/iterator_util.h
+++ b/tensorflow/compiler/xla/iterator_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_ITERATOR_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_ITERATOR_UTIL_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_ITERATOR_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_ITERATOR_UTIL_H_
 
 #include <iterator>
 #include <utility>
@@ -95,4 +95,4 @@ UnwrappingIterator<NestedIter> MakeUnwrappingIterator(NestedIter iter) {
 
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_ITERATOR_UTIL_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_ITERATOR_UTIL_H_
diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc
index 5c2cc2a7a99cc51ded3d98c9dd5903e4b3078548..fdc4bbdd8b162b7115788e267c2a53e73c186123 100644
--- a/tensorflow/compiler/xla/layout_util.cc
+++ b/tensorflow/compiler/xla/layout_util.cc
@@ -57,17 +57,26 @@ void SetDefaultLayoutToContainer(
 /* static */ Layout LayoutUtil::MakeLayout(
     tensorflow::gtl::ArraySlice<int64> minor_to_major) {
   Layout layout;
+  layout.set_format(DENSE);
   for (int64 dimension_number : minor_to_major) {
     layout.add_minor_to_major(dimension_number);
   }
   return layout;
 }
 
+/* static */ Layout LayoutUtil::MakeSparseLayout(int64 max_sparse_elements) {
+  Layout layout;
+  layout.set_format(SPARSE);
+  layout.set_max_sparse_elements(max_sparse_elements);
+  return layout;
+}
+
 namespace {
 
 // Internal helper that creates a default layout for an array of the given rank.
 Layout CreateDefaultLayoutForRank(int64 rank) {
   Layout layout;
+  layout.set_format(DENSE);
   tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>*
       minor_to_major = layout.mutable_minor_to_major();
   minor_to_major->Resize(rank, 0);
@@ -105,7 +114,11 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
     for (auto& element_shape : *shape->mutable_tuple_shapes()) {
       SetToDefaultLayout(&element_shape);
     }
+    shape->clear_layout();
+  } else if (ShapeUtil::IsOpaque(*shape)) {
+    shape->clear_layout();
   } else {
+    shape->mutable_layout()->set_format(DENSE);
     tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>*
         minor_to_major = shape->mutable_layout()->mutable_minor_to_major();
     minor_to_major->Resize(shape->dimensions_size(), 0);
@@ -137,8 +150,10 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
       TF_RETURN_IF_ERROR(ValidateLayoutInShape(element_shape));
     }
     return tensorflow::Status::OK();
-  } else if (ShapeUtil::Rank(shape) == 0 && !shape.has_layout()) {
-    // A scalar without a layout is ok.
+  } else if (ShapeUtil::IsOpaque(shape)) {
+    if (shape.has_layout()) {
+      return InvalidArgument("opaque should not have a layout field");
+    }
     return tensorflow::Status::OK();
   } else {
     // Array shape.
@@ -156,46 +171,59 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
     return InvalidArgument("a single Layout is not valid for tuple shapes");
   }
 
-  if (layout.minor_to_major_size() != ShapeUtil::Rank(shape)) {
+  if (ShapeUtil::IsOpaque(shape)) {
+    return tensorflow::Status::OK();
+  }
+
+  if (layout.format() == INVALID_FORMAT) {
     return InvalidArgument(
-        "layout minor_to_major field contains %d elements, "
-        "but shape is rank %lld: {%s}; shape: %s",
-        layout.minor_to_major_size(), ShapeUtil::Rank(shape),
-        tensorflow::str_util::Join(layout.minor_to_major(), ", ").c_str(),
-        shape.ShortDebugString().c_str());
+        "Layout does not have a valid format: layout {%s}, shape {%s}",
+        layout.ShortDebugString().c_str(), shape.ShortDebugString().c_str());
   }
 
-  std::vector<bool> dimensions_in_layout(ShapeUtil::Rank(shape), false);
-  for (int64 i = 0; i < ShapeUtil::Rank(shape); ++i) {
-    int64 dim = layout.minor_to_major(i);
-    if (dim < 0 || dim >= ShapeUtil::Rank(shape)) {
+  if (layout.format() == DENSE) {
+    if (layout.minor_to_major_size() != ShapeUtil::Rank(shape)) {
       return InvalidArgument(
-          "layout minor_to_major field has out-of-bounds value: %s",
-          HumanString(layout).c_str());
+          "layout minor_to_major field contains %d elements, "
+          "but shape is rank %lld: {%s}; shape: %s",
+          layout.minor_to_major_size(), ShapeUtil::Rank(shape),
+          tensorflow::str_util::Join(layout.minor_to_major(), ", ").c_str(),
+          shape.ShortDebugString().c_str());
     }
-    if (dimensions_in_layout[dim]) {
-      return InvalidArgument(
-          "layout minor_to_major field has duplicate values: {%s}",
-          HumanString(layout).c_str());
-    }
-    dimensions_in_layout[dim] = true;
-  }
 
-  if (layout.padded_dimensions_size() > 0) {
-    if (layout.padded_dimensions_size() != ShapeUtil::Rank(shape)) {
-      return InvalidArgument(
-          "layout has %d padded dimensions, but shape is rank %lld",
-          layout.padded_dimensions_size(), ShapeUtil::Rank(shape));
+    std::vector<bool> dimensions_in_layout(ShapeUtil::Rank(shape), false);
+    for (int64 i = 0; i < ShapeUtil::Rank(shape); ++i) {
+      int64 dim = layout.minor_to_major(i);
+      if (dim < 0 || dim >= ShapeUtil::Rank(shape)) {
+        return InvalidArgument(
+            "layout minor_to_major field has out-of-bounds value: %s",
+            HumanString(layout).c_str());
+      }
+      if (dimensions_in_layout[dim]) {
+        return InvalidArgument(
+            "layout minor_to_major field has duplicate values: {%s}",
+            HumanString(layout).c_str());
+      }
+      dimensions_in_layout[dim] = true;
     }
-    for (int i = 0; i < layout.padded_dimensions_size(); ++i) {
-      if (layout.padded_dimensions(i) < shape.dimensions(i)) {
+
+    if (layout.padded_dimensions_size() > 0) {
+      if (layout.padded_dimensions_size() != ShapeUtil::Rank(shape)) {
         return InvalidArgument(
-            "for dimension %d, dimension padding (%lld) is smaller than "
-            "the dimension size (%lld) of the shape",
-            i, layout.padded_dimensions(i), shape.dimensions(i));
+            "layout has %d padded dimensions, but shape is rank %lld",
+            layout.padded_dimensions_size(), ShapeUtil::Rank(shape));
+      }
+      for (int i = 0; i < layout.padded_dimensions_size(); ++i) {
+        if (layout.padded_dimensions(i) < shape.dimensions(i)) {
+          return InvalidArgument(
+              "for dimension %d, dimension padding (%lld) is smaller than "
+              "the dimension size (%lld) of the shape",
+              i, layout.padded_dimensions(i), shape.dimensions(i));
+        }
       }
     }
   }
+
   return tensorflow::Status::OK();
 }
 
@@ -213,12 +241,23 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
   LayoutUtil::ClearLayout(program_shape->mutable_result());
 }
 
+/* static */ bool LayoutUtil::IsDenseArray(const Shape& shape) {
+  return ShapeUtil::IsArray(shape) && shape.has_layout() &&
+         IsDense(shape.layout());
+}
+
+/* static */ bool LayoutUtil::IsDense(const Layout& layout) {
+  return layout.format() == DENSE;
+}
+
 /* static */ bool LayoutUtil::IsMonotonicWithDim0Minor(const Layout& layout) {
+  CHECK(layout.format() == DENSE);
   return std::is_sorted(layout.minor_to_major().begin(),
                         layout.minor_to_major().end());
 }
 
 /* static */ bool LayoutUtil::IsMonotonicWithDim0Major(const Layout& layout) {
+  CHECK(layout.format() == DENSE);
   return std::is_sorted(layout.minor_to_major().begin(),
                         layout.minor_to_major().end(), std::greater<int64>());
 }
@@ -228,6 +267,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
       shape.layout().padded_dimensions_size() == 0) {
     return false;
   }
+  CHECK(IsDenseArray(shape));
   CHECK_EQ(shape.dimensions_size(), shape.layout().padded_dimensions_size());
   for (int64 i = 0; i < shape.dimensions_size(); ++i) {
     if (shape.layout().padded_dimensions(i) > shape.dimensions(i)) {
@@ -237,15 +277,46 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
   return false;
 }
 
+/* static */ tensorflow::gtl::ArraySlice<int64> LayoutUtil::PaddedDimensions(
+    const Shape& shape) {
+  CHECK(IsDenseArray(shape));
+  return AsInt64Slice(shape.layout().padded_dimensions());
+}
+
+/* static */ int64 LayoutUtil::PaddedDimension(const Shape& shape,
+                                               int64 index) {
+  CHECK(IsDenseArray(shape));
+  return shape.layout().padded_dimensions(index);
+}
+
+/* static */ PaddingValue LayoutUtil::GetPaddingValue(const Shape& shape) {
+  CHECK(IsDenseArray(shape));
+  return shape.layout().padding_value();
+}
+
+/* static */ bool LayoutUtil::IsSparseArray(const Shape& shape) {
+  return ShapeUtil::IsArray(shape) && shape.has_layout() &&
+         IsSparse(shape.layout());
+}
+
+/* static */ bool LayoutUtil::IsSparse(const Layout& layout) {
+  return layout.format() == SPARSE;
+}
+
+/* static */ int64 LayoutUtil::MaxSparseElements(const Layout& layout) {
+  CHECK(IsSparse(layout));
+  return layout.max_sparse_elements();
+}
+
 /* static */ bool LayoutUtil::HasLayout(const Shape& shape) {
   if (ShapeUtil::IsTuple(shape)) {
     // Tuple shape: all subshapes must have a layout.
     return std::all_of(shape.tuple_shapes().begin(), shape.tuple_shapes().end(),
                        [](const Shape& s) { return HasLayout(s); });
+  } else if (ShapeUtil::IsOpaque(shape)) {
+    return true;
   }
-  // A scalar trivially always has a layout.
-  return (ShapeUtil::Rank(shape) == 0 ||
-          (shape.has_layout() && (shape.layout().minor_to_major_size() > 0)));
+  return shape.has_layout() && shape.layout().format() != INVALID_FORMAT;
 }
 
 /* static */ bool LayoutUtil::HasLayout(const ProgramShape& program_shape) {
@@ -261,6 +332,18 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
   return protobuf_util::ProtobufEquals(lhs, rhs);
 }
 
+/* static */ tensorflow::gtl::ArraySlice<int64> LayoutUtil::MinorToMajor(
+    const Shape& shape) {
+  CHECK(IsDenseArray(shape));
+  return AsInt64Slice(shape.layout().minor_to_major());
+}
+
+/* static */ tensorflow::gtl::ArraySlice<int64> LayoutUtil::MinorToMajor(
+    const Layout& layout) {
+  CHECK(layout.format() == DENSE);
+  return AsInt64Slice(layout.minor_to_major());
+}
+
 /* static */ int64 LayoutUtil::Major(const Layout& layout,
                                      int64 physical_dimension_number) {
   CHECK_LE(0, physical_dimension_number);
@@ -271,6 +354,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 
 /* static */ int64 LayoutUtil::Minor(const Layout& layout,
                                      int64 physical_dimension_number) {
+  CHECK_EQ(layout.format(), DENSE);
   CHECK_LE(0, physical_dimension_number);
   CHECK_LT(physical_dimension_number, layout.minor_to_major_size());
   return layout.minor_to_major(physical_dimension_number);
@@ -287,6 +371,11 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 }
 
 /* static */ string LayoutUtil::HumanString(const Layout& layout) {
+  if (IsSparse(layout)) {
+    return tensorflow::strings::StrCat("sparse{", layout.max_sparse_elements(),
+                                       "}");
+  }
+  CHECK(IsDense(layout));
   return tensorflow::strings::StrCat(
       "{", tensorflow::str_util::Join(layout.minor_to_major(), ","), "}");
 }
@@ -356,6 +445,7 @@ tensorflow::Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src,
 
 /* static */ bool LayoutUtil::AreDimensionsConsecutive(
     const Layout& layout, tensorflow::gtl::ArraySlice<int64> dims) {
+  CHECK(IsDense(layout));
   std::vector<int64> positions_in_layout;
   for (int64 dim : dims) {
     positions_in_layout.push_back(
@@ -370,4 +460,9 @@ tensorflow::Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src,
   return true;
 }
 
+std::ostream& operator<<(std::ostream& out, const Layout& layout) {
+  out << LayoutUtil::HumanString(layout);
+  return out;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/layout_util.h b/tensorflow/compiler/xla/layout_util.h
index bc42e222292933be35e82d1fe50802e8830d16b3..6c54eb2201b66a4a0c5695bceb14bb2367133935 100644
--- a/tensorflow/compiler/xla/layout_util.h
+++ b/tensorflow/compiler/xla/layout_util.h
@@ -36,6 +36,10 @@ class LayoutUtil {
   // convenience function for protobuf construction.)
   static Layout MakeLayout(tensorflow::gtl::ArraySlice<int64> minor_to_major);
 
+  // Creates a sparse layout with the given maximum number of elements. (This is
+  // a convenience function for protobuf construction.)
+  static Layout MakeSparseLayout(int64 max_sparse_elements);
+
   // Returns default layout for the given shape.
   static Layout GetDefaultLayoutForShape(const Shape& shape);
 
@@ -71,6 +75,12 @@ class LayoutUtil {
   // Clears the layout on all Shapes within the given ProgramShape.
   static void ClearLayout(ProgramShape* program_shape);
 
+  // Returns whether the given Shape is an array and has a dense format layout.
+  static bool IsDenseArray(const Shape& shape);
+
+  // Returns whether the given Layout has a dense format.
+  static bool IsDense(const Layout& layout);
+
   // Returns whether the layout is monotonic and dim 0 is minor in the layout.
   // * R0 and R1: this is always trivially true.
   // * R2+: equivalent to column-major. Dimension 0 is the minor, dimension 1 is
@@ -88,6 +98,30 @@ class LayoutUtil {
   // dimension size).
   static bool IsPadded(const Shape& shape);
 
+  // Returns the padded_dimensions array for the given Shape.  Requires that the
+  // shape is an array and has a dense layout.
+  static tensorflow::gtl::ArraySlice<int64> PaddedDimensions(
+      const Shape& shape);
+
+  // Returns the given index of the padded_dimensions array for the given Shape.
+  // Requires that the shape is an array and has a dense layout.
+  static int64 PaddedDimension(const Shape& shape, int64 index);
+
+  // Returns the padding_value for the given Shape.  Requires that the shape is
+  // an array and has a dense layout.
+  static PaddingValue GetPaddingValue(const Shape& shape);
+
+  // Returns whether the given Shape is an array (i.e. not a tuple) and has a
+  // sparse format layout.
+  static bool IsSparseArray(const Shape& shape);
+
+  // Returns whether the given Layout has a sparse format.
+  static bool IsSparse(const Layout& layout);
+
+  // Returns the maximum number of elements that can be stored in a sparse
+  // layout.
+  static int64 MaxSparseElements(const Layout& layout);
+
   // Returns whether the given shape has a layout. For tuple shapes, true is
   // returned only if all elements have layouts.
   static bool HasLayout(const Shape& shape);
@@ -98,7 +132,12 @@ class LayoutUtil {
   // Returns whether lhs and rhs are identical.
   static bool Equal(const Layout& lhs, const Layout& rhs);
 
-  // Major(0) is the most major logical dimension number, major(1) is the
+  // Returns the minor_to_major array for the given Shape.  Requires that the
+  // shape is an array and has a dense layout.
+  static tensorflow::gtl::ArraySlice<int64> MinorToMajor(const Shape& shape);
+  static tensorflow::gtl::ArraySlice<int64> MinorToMajor(const Layout& layout);
+
+  // Major(0) is the most major logical dimension number, Major(1) is the
   // second-most-major logical dimension number and so on.
   //
   // This can be used to translate physical dimension numbers to logical
@@ -160,6 +199,8 @@ class LayoutUtil {
   TF_DISALLOW_COPY_AND_ASSIGN(LayoutUtil);
 };
 
+std::ostream& operator<<(std::ostream& out, const Layout& layout);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_LAYOUT_UTIL_H_
diff --git a/tensorflow/compiler/xla/layout_util_test.cc b/tensorflow/compiler/xla/layout_util_test.cc
index 331bb9afa94e9e7c97d9c880dbac31c60ac0da18..4fd1d818e3e3b417eee9f6b14bb598bfb9480c6e 100644
--- a/tensorflow/compiler/xla/layout_util_test.cc
+++ b/tensorflow/compiler/xla/layout_util_test.cc
@@ -14,6 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/layout_util.h"
+
+#include <sstream>
+
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
@@ -30,6 +33,14 @@ class LayoutUtilTest : public ::testing::Test {
     *shape.mutable_layout() = LayoutUtil::MakeLayout(minor_to_major);
     return shape;
   }
+
+  Shape MakeShapeWithSparseLayout(PrimitiveType element_type,
+                                  tensorflow::gtl::ArraySlice<int64> dimensions,
+                                  int64 max_sparse_elements) {
+    Shape shape = ShapeUtil::MakeShape(element_type, dimensions);
+    *shape.mutable_layout() = LayoutUtil::MakeSparseLayout(max_sparse_elements);
+    return shape;
+  }
 };
 
 TEST_F(LayoutUtilTest, TupleLayoutComparison) {
@@ -81,6 +92,29 @@ TEST_F(LayoutUtilTest, CopyLayoutArray) {
   EXPECT_FALSE(dst.has_layout());
 }
 
+TEST_F(LayoutUtilTest, CopyLayoutSparse) {
+  Shape src = MakeShapeWithSparseLayout(F32, {2, 3}, 2);
+  Shape dst = MakeShapeWithLayout(F32, {2, 3}, {1, 0});
+
+  EXPECT_FALSE(LayoutUtil::LayoutsInShapesEqual(src, dst));
+  EXPECT_IS_OK(LayoutUtil::CopyLayoutBetweenShapes(src, &dst));
+  EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(src, dst));
+
+  // Should work if destination has no layout.
+  dst.clear_layout();
+  EXPECT_FALSE(LayoutUtil::LayoutsInShapesEqual(src, dst));
+  EXPECT_IS_OK(LayoutUtil::CopyLayoutBetweenShapes(src, &dst));
+  EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(src, dst));
+
+  // If source is cleared, then destination should be cleared.
+  src.clear_layout();
+  EXPECT_FALSE(LayoutUtil::LayoutsInShapesEqual(src, dst));
+  EXPECT_TRUE(dst.has_layout());
+  EXPECT_IS_OK(LayoutUtil::CopyLayoutBetweenShapes(src, &dst));
+  EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(src, dst));
+  EXPECT_FALSE(dst.has_layout());
+}
+
 TEST_F(LayoutUtilTest, CopyLayoutTuple) {
   Shape src = ShapeUtil::MakeTupleShape(
       {MakeShapeWithLayout(F32, {2, 3}, {0, 1}),
@@ -100,6 +134,25 @@ TEST_F(LayoutUtilTest, CopyLayoutTuple) {
   EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(src, dst));
 }
 
+TEST_F(LayoutUtilTest, CopyLayoutTupleSparse) {
+  Shape src = ShapeUtil::MakeTupleShape(
+      {MakeShapeWithSparseLayout(F32, {2, 3}, 4),
+       MakeShapeWithSparseLayout(F32, {42, 123}, 4),
+       ShapeUtil::MakeTupleShape(
+           {MakeShapeWithLayout(F32, {}, {}),
+            MakeShapeWithSparseLayout(F32, {1, 2, 3}, 6)})});
+  Shape dst = ShapeUtil::MakeTupleShape(
+      {MakeShapeWithLayout(F32, {2, 3}, {1, 0}),
+       MakeShapeWithLayout(F32, {42, 123}, {1, 0}),
+       ShapeUtil::MakeTupleShape(
+           {MakeShapeWithLayout(F32, {}, {}),
+            MakeShapeWithLayout(F32, {1, 2, 3}, {1, 2, 0})})});
+
+  EXPECT_FALSE(LayoutUtil::LayoutsInShapesEqual(src, dst));
+  EXPECT_IS_OK(LayoutUtil::CopyLayoutBetweenShapes(src, &dst));
+  EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(src, dst));
+}
+
 TEST_F(LayoutUtilTest, CopyLayoutNotCompatibleSameRank) {
   Shape src = MakeShapeWithLayout(F32, {123, 42, 7}, {2, 0, 1});
   Shape dst = MakeShapeWithLayout(F32, {2, 3, 5}, {1, 0});
@@ -107,6 +160,13 @@ TEST_F(LayoutUtilTest, CopyLayoutNotCompatibleSameRank) {
   EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(src, dst));
 }
 
+TEST_F(LayoutUtilTest, CopyLayoutSparseNotCompatibleSameRank) {
+  Shape src = MakeShapeWithSparseLayout(F32, {123, 42, 7}, 6);
+  Shape dst = MakeShapeWithLayout(F32, {2, 3, 5}, {1, 0});
+  ASSERT_IS_OK(LayoutUtil::CopyLayoutBetweenShapes(src, &dst));
+  EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(src, dst));
+}
+
 TEST_F(LayoutUtilTest, CopyLayoutNotCompatibleDifferentRank) {
   Shape src = MakeShapeWithLayout(F32, {123, 42, 7}, {2, 0, 1});
   Shape dst = MakeShapeWithLayout(F32, {2, 3}, {1, 0});
@@ -116,6 +176,15 @@ TEST_F(LayoutUtilTest, CopyLayoutNotCompatibleDifferentRank) {
               ::testing::ContainsRegex("cannot copy layout from shape"));
 }
 
+TEST_F(LayoutUtilTest, CopyLayoutSparseNotCompatibleDifferentRank) {
+  Shape src = MakeShapeWithLayout(F32, {123, 42, 7}, {2, 0, 1});
+  Shape dst = MakeShapeWithSparseLayout(F32, {2, 3}, 4);
+  auto status = LayoutUtil::CopyLayoutBetweenShapes(src, &dst);
+  EXPECT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              ::testing::ContainsRegex("cannot copy layout from shape"));
+}
+
 TEST_F(LayoutUtilTest, CopyLayoutNotCompatibleTuple) {
   Shape src =
       ShapeUtil::MakeTupleShape({MakeShapeWithLayout(F32, {2, 3}, {0, 1}),
@@ -221,5 +290,16 @@ TEST_F(LayoutUtilTest, DefaultLayoutGettersMajorToMinor) {
                             ShapeUtil::MakeShape(F32, {10, 20, 30, 15, 25}))));
 }
 
+TEST_F(LayoutUtilTest, SparseLayoutMaxElements) {
+  EXPECT_EQ(LayoutUtil::MaxSparseElements(LayoutUtil::MakeSparseLayout(101)),
+            101);
+}
+
+TEST_F(LayoutUtilTest, StreamOut) {
+  std::ostringstream oss;
+  oss << LayoutUtil::MakeLayout({0, 1, 2});
+  EXPECT_EQ(oss.str(), "{0,1,2}");
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
index bfafef0a40f55e13ac94b2d1750df25146081784..c8ed3e3a2b009ddffdfb79a9a6ced8d5e736bee6 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
@@ -40,6 +40,10 @@ void SetDebugOptionsDefaults(DebugOptions* flags) {
   flags->set_xla_cpu_multi_thread_eigen(true);
   flags->set_xla_gpu_cuda_data_dir("./cuda_sdk_lib");
   flags->set_xla_eliminate_hlo_implicit_broadcast(true);
+
+  // Set cudnn batchnorm off by default; it does not provide a performance win
+  // on average.
+  flags->set_xla_gpu_use_cudnn_batchnorm(false);
 }
 
 // Allocates flag_values and flag_objects; this function must not be called more
@@ -96,179 +100,195 @@ void AllocateFlags() {
             option_proto, reduce_precision_option_value);
       };
 
-  flag_objects = new std::vector<tensorflow::Flag>(
-      {tensorflow::Flag(
-           "xla_generate_hlo_graph",
-           flag_values->mutable_xla_generate_hlo_graph(),
-           "HLO modules matching this regex will be dumped to a .dot file "
-           "throughout various stages in compilation."),
-       tensorflow::Flag(
-           "xla_hlo_graph_addresses",
-           bool_setter_for(&DebugOptions::set_xla_hlo_graph_addresses),
-           flag_values->xla_hlo_graph_addresses(),
-           "With xla_generate_hlo_graph, show addresses of HLO ops in "
-           "graph dump."),
-       tensorflow::Flag(
-           "xla_hlo_graph_path", flag_values->mutable_xla_hlo_graph_path(),
-           "With xla_generate_hlo_graph, dump the graphs into this path."),
-       tensorflow::Flag(
-           "xla_hlo_dump_as_graphdef",
-           bool_setter_for(&DebugOptions::set_xla_hlo_dump_as_graphdef),
-           flag_values->xla_hlo_dump_as_graphdef(),
-           "Dump HLO graphs as TensorFlow GraphDefs."),
-       tensorflow::Flag(
-           "xla_hlo_graph_sharding_color",
-           bool_setter_for(&DebugOptions::set_xla_hlo_graph_sharding_color),
-           flag_values->xla_hlo_graph_sharding_color(),
-           "Assign colors based on sharding assignments when generating the "
-           "HLO graphs."),
-       tensorflow::Flag(
-           "xla_hlo_tfgraph_device_scopes",
-           bool_setter_for(&DebugOptions::set_xla_hlo_tfgraph_device_scopes),
-           flag_values->xla_hlo_tfgraph_device_scopes(),
-           "When generating TensorFlow HLO graphs, if the HLO instructions "
-           "are assigned to a specific device, prefix the name scope with "
-           "\"devX\" with X being the device ordinal."),
-       tensorflow::Flag(
-           "xla_log_hlo_text", flag_values->mutable_xla_log_hlo_text(),
-           "HLO modules matching this regex will be dumped to LOG(INFO)."),
-       tensorflow::Flag(
-           "xla_generate_hlo_text_to",
-           flag_values->mutable_xla_generate_hlo_text_to(),
-           "Dump all HLO modules as text into the provided directory path."),
-       tensorflow::Flag(
-           "xla_enable_fast_math",
-           bool_setter_for(&DebugOptions::set_xla_enable_fast_math),
-           flag_values->xla_enable_fast_math(),
-           "Enable unsafe fast-math optimizations in the compiler; "
-           "this may produce faster code at the expense of some accuracy."),
-       tensorflow::Flag(
-           "xla_llvm_enable_alias_scope_metadata",
-           bool_setter_for(
-               &DebugOptions::set_xla_llvm_enable_alias_scope_metadata),
-           flag_values->xla_llvm_enable_alias_scope_metadata(),
-           "In LLVM-based backends, enable the emission of "
-           "!alias.scope metadata in the generated IR."),
-       tensorflow::Flag(
-           "xla_llvm_enable_noalias_metadata",
-           bool_setter_for(&DebugOptions::set_xla_llvm_enable_noalias_metadata),
-           flag_values->xla_llvm_enable_noalias_metadata(),
-           "In LLVM-based backends, enable the emission of "
-           "!noalias metadata in the generated IR."),
-       tensorflow::Flag(
-           "xla_llvm_enable_invariant_load_metadata",
-           bool_setter_for(
-               &DebugOptions::set_xla_llvm_enable_invariant_load_metadata),
-           flag_values->xla_llvm_enable_invariant_load_metadata(),
-           "In LLVM-based backends, enable the emission of "
-           "!invariant.load metadata in "
-           "the generated IR."),
-       tensorflow::Flag(
-           "xla_llvm_disable_expensive_passes",
-           bool_setter_for(
-               &DebugOptions::set_xla_llvm_disable_expensive_passes),
-           flag_values->xla_llvm_disable_expensive_passes(),
-           "In LLVM-based backends, disable a custom set of "
-           "expensive optimization passes."),
-       tensorflow::Flag(
-           "xla_backend_optimization_level",
-           int32_setter_for(&DebugOptions::set_xla_backend_optimization_level),
-           flag_values->xla_backend_optimization_level(),
-           "Numerical optimization level for the XLA compiler backend."),
-       tensorflow::Flag(
-           "xla_disable_hlo_passes", setter_for_xla_disable_hlo_passes, "",
-           "Comma-separated list of hlo passes to be disabled. These names "
-           "must exactly match the passes' names; no whitespace around "
-           "commas."),
-       tensorflow::Flag(
-           "xla_embed_ir_in_executable",
-           bool_setter_for(&DebugOptions::set_xla_embed_ir_in_executable),
-           flag_values->xla_embed_ir_in_executable(),
-           "Embed the compiler IR as a string in the executable."),
-       tensorflow::Flag(
-           "xla_dump_ir_to", flag_values->mutable_xla_dump_ir_to(),
-           "Dump the compiler IR into this directory as individual files."),
-       tensorflow::Flag(
-           "xla_eliminate_hlo_implicit_broadcast",
-           bool_setter_for(
-               &DebugOptions::set_xla_eliminate_hlo_implicit_broadcast),
-           flag_values->xla_eliminate_hlo_implicit_broadcast(),
-           "Eliminate implicit broadcasts when lowering user "
-           "computations to HLO instructions; use explicit "
-           "broadcast instead."),
-       tensorflow::Flag(
-           "xla_cpu_multi_thread_eigen",
-           bool_setter_for(&DebugOptions::set_xla_cpu_multi_thread_eigen),
-           flag_values->xla_cpu_multi_thread_eigen(),
-           "When generating calls to Eigen in the CPU backend, "
-           "use multi-threaded Eigen mode."),
-       tensorflow::Flag("xla_gpu_cuda_data_dir",
-                        flag_values->mutable_xla_gpu_cuda_data_dir(),
-                        "If non-empty, speficies a local directory containing "
-                        "ptxas and nvvm libdevice files; otherwise we use "
-                        "those from runfile directories."),
-       tensorflow::Flag("xla_gpu_ftz",
-                        bool_setter_for(&DebugOptions::set_xla_gpu_ftz),
-                        flag_values->xla_gpu_ftz(),
-                        "If true, flush-to-zero semantics are enabled in the "
-                        "code generated for GPUs."),
-       tensorflow::Flag(
-           "xla_gpu_disable_multi_streaming",
-           bool_setter_for(&DebugOptions::set_xla_gpu_disable_multi_streaming),
-           flag_values->xla_gpu_disable_multi_streaming(),
-           "If true, multi-streaming in the GPU backend is disabled."),
-       tensorflow::Flag(
-           "xla_dump_hlo_proto_to",
-           flag_values->mutable_xla_dump_hlo_proto_to(),
-           "Dump compilation artifacts as proto binary into this directory."),
-       tensorflow::Flag(
-           "xla_test_all_output_layouts",
-           bool_setter_for(&DebugOptions::set_xla_test_all_output_layouts),
-           flag_values->xla_test_all_output_layouts(),
-           "Let ClientLibraryTestBase::ComputeAndCompare* test "
-           "all permutations of output layouts. For example, with "
-           "a 3D shape, all permutations of the set {0, 1, 2} are "
-           "tried."),
-       tensorflow::Flag(
-           "xla_test_all_input_layouts",
-           bool_setter_for(&DebugOptions::set_xla_test_all_input_layouts),
-           flag_values->xla_test_all_input_layouts(),
-           "Let ClientLibraryTestBase::ComputeAndCompare* test "
-           "all permutations of *input* layouts. For example, for "
-           "2 input arguments with 2D shape and 4D shape, the "
-           "computation will run 2! * 4! times for every possible "
-           "layouts"),
-       tensorflow::Flag(
-           "xla_hlo_profile",
-           bool_setter_for(&DebugOptions::set_xla_hlo_profile),
-           flag_values->xla_hlo_profile(),
-           "Instrument the computation to collect per-HLO cycle counts"),
-       tensorflow::Flag("xla_dump_computations_to",
-                        flag_values->mutable_xla_dump_computations_to(),
-                        "Dump computations that XLA executes into the provided "
-                        "directory path"),
-       tensorflow::Flag("xla_dump_executions_to",
-                        flag_values->mutable_xla_dump_executions_to(),
-                        "Dump parameters and results of computations that XLA "
-                        "executes into the provided directory path"),
-       tensorflow::Flag("xla_backend_extra_options",
-                        setter_for_xla_backend_extra_options, "",
-                        "Extra options to pass to a backend; "
-                        "comma-separated list of 'key=val' strings (=val "
-                        "may be omitted); no whitespace around commas."),
-       tensorflow::Flag("xla_reduce_precision", setter_for_xla_reduce_precision,
-                        "",
-                        "Directions for adding reduce-precision operations. "
-                        "Format is 'LOCATION=E,M:OPS;NAMES' where LOCATION is "
-                        "the class of locations in which to insert the "
-                        "operations (e.g., 'OP_OUTPUTS'), E and M are the "
-                        "exponent and matissa bit counts respectively, and "
-                        "OPS and NAMES are comma-separated (no spaces) lists "
-                        "of the operation types and names to which to attach "
-                        "the reduce-precision operations.  The NAMES string "
-                        "and its preceding ';' may be omitted.  This option "
-                        "may be repeated to define multiple sets of added "
-                        "reduce-precision operations.")});
+  flag_objects = new std::vector<tensorflow::Flag>({
+      tensorflow::Flag(
+          "xla_generate_hlo_graph",
+          flag_values->mutable_xla_generate_hlo_graph(),
+          "HLO modules matching this regex will be dumped to a .dot file "
+          "throughout various stages in compilation."),
+      tensorflow::Flag(
+          "xla_hlo_graph_addresses",
+          bool_setter_for(&DebugOptions::set_xla_hlo_graph_addresses),
+          flag_values->xla_hlo_graph_addresses(),
+          "With xla_generate_hlo_graph, show addresses of HLO ops in "
+          "graph dump."),
+      tensorflow::Flag(
+          "xla_hlo_graph_path", flag_values->mutable_xla_hlo_graph_path(),
+          "With xla_generate_hlo_graph, dump the graphs into this path."),
+      tensorflow::Flag(
+          "xla_hlo_dump_as_graphdef",
+          bool_setter_for(&DebugOptions::set_xla_hlo_dump_as_graphdef),
+          flag_values->xla_hlo_dump_as_graphdef(),
+          "Dump HLO graphs as TensorFlow GraphDefs."),
+      tensorflow::Flag(
+          "xla_hlo_graph_sharding_color",
+          bool_setter_for(&DebugOptions::set_xla_hlo_graph_sharding_color),
+          flag_values->xla_hlo_graph_sharding_color(),
+          "Assign colors based on sharding assignments when generating the "
+          "HLO graphs."),
+      tensorflow::Flag(
+          "xla_hlo_tfgraph_device_scopes",
+          bool_setter_for(&DebugOptions::set_xla_hlo_tfgraph_device_scopes),
+          flag_values->xla_hlo_tfgraph_device_scopes(),
+          "When generating TensorFlow HLO graphs, if the HLO instructions "
+          "are assigned to a specific device, prefix the name scope with "
+          "\"devX\" with X being the device ordinal."),
+      tensorflow::Flag(
+          "xla_log_hlo_text", flag_values->mutable_xla_log_hlo_text(),
+          "HLO modules matching this regex will be dumped to LOG(INFO)."),
+      tensorflow::Flag(
+          "xla_generate_hlo_text_to",
+          flag_values->mutable_xla_generate_hlo_text_to(),
+          "Dump all HLO modules as text into the provided directory path."),
+      tensorflow::Flag(
+          "xla_enable_fast_math",
+          bool_setter_for(&DebugOptions::set_xla_enable_fast_math),
+          flag_values->xla_enable_fast_math(),
+          "Enable unsafe fast-math optimizations in the compiler; "
+          "this may produce faster code at the expense of some accuracy."),
+      tensorflow::Flag(
+          "xla_llvm_enable_alias_scope_metadata",
+          bool_setter_for(
+              &DebugOptions::set_xla_llvm_enable_alias_scope_metadata),
+          flag_values->xla_llvm_enable_alias_scope_metadata(),
+          "In LLVM-based backends, enable the emission of "
+          "!alias.scope metadata in the generated IR."),
+      tensorflow::Flag(
+          "xla_llvm_enable_noalias_metadata",
+          bool_setter_for(&DebugOptions::set_xla_llvm_enable_noalias_metadata),
+          flag_values->xla_llvm_enable_noalias_metadata(),
+          "In LLVM-based backends, enable the emission of "
+          "!noalias metadata in the generated IR."),
+      tensorflow::Flag(
+          "xla_llvm_enable_invariant_load_metadata",
+          bool_setter_for(
+              &DebugOptions::set_xla_llvm_enable_invariant_load_metadata),
+          flag_values->xla_llvm_enable_invariant_load_metadata(),
+          "In LLVM-based backends, enable the emission of "
+          "!invariant.load metadata in "
+          "the generated IR."),
+      tensorflow::Flag(
+          "xla_llvm_disable_expensive_passes",
+          bool_setter_for(&DebugOptions::set_xla_llvm_disable_expensive_passes),
+          flag_values->xla_llvm_disable_expensive_passes(),
+          "In LLVM-based backends, disable a custom set of "
+          "expensive optimization passes."),
+      tensorflow::Flag(
+          "xla_backend_optimization_level",
+          int32_setter_for(&DebugOptions::set_xla_backend_optimization_level),
+          flag_values->xla_backend_optimization_level(),
+          "Numerical optimization level for the XLA compiler backend."),
+      tensorflow::Flag(
+          "xla_disable_hlo_passes", setter_for_xla_disable_hlo_passes, "",
+          "Comma-separated list of hlo passes to be disabled. These names "
+          "must exactly match the passes' names; no whitespace around "
+          "commas."),
+      tensorflow::Flag(
+          "xla_embed_ir_in_executable",
+          bool_setter_for(&DebugOptions::set_xla_embed_ir_in_executable),
+          flag_values->xla_embed_ir_in_executable(),
+          "Embed the compiler IR as a string in the executable."),
+      tensorflow::Flag(
+          "xla_dump_ir_to", flag_values->mutable_xla_dump_ir_to(),
+          "Dump the compiler IR into this directory as individual files."),
+      tensorflow::Flag(
+          "xla_eliminate_hlo_implicit_broadcast",
+          bool_setter_for(
+              &DebugOptions::set_xla_eliminate_hlo_implicit_broadcast),
+          flag_values->xla_eliminate_hlo_implicit_broadcast(),
+          "Eliminate implicit broadcasts when lowering user "
+          "computations to HLO instructions; use explicit "
+          "broadcast instead."),
+      tensorflow::Flag(
+          "xla_cpu_multi_thread_eigen",
+          bool_setter_for(&DebugOptions::set_xla_cpu_multi_thread_eigen),
+          flag_values->xla_cpu_multi_thread_eigen(),
+          "When generating calls to Eigen in the CPU backend, "
+          "use multi-threaded Eigen mode."),
+      tensorflow::Flag("xla_gpu_cuda_data_dir",
+                       flag_values->mutable_xla_gpu_cuda_data_dir(),
+                       "If non-empty, speficies a local directory containing "
+                       "ptxas and nvvm libdevice files; otherwise we use "
+                       "those from runfile directories."),
+      tensorflow::Flag("xla_gpu_ftz",
+                       bool_setter_for(&DebugOptions::set_xla_gpu_ftz),
+                       flag_values->xla_gpu_ftz(),
+                       "If true, flush-to-zero semantics are enabled in the "
+                       "code generated for GPUs."),
+      tensorflow::Flag(
+          "xla_gpu_disable_multi_streaming",
+          bool_setter_for(&DebugOptions::set_xla_gpu_disable_multi_streaming),
+          flag_values->xla_gpu_disable_multi_streaming(),
+          "If true, multi-streaming in the GPU backend is disabled."),
+      tensorflow::Flag(
+          "xla_dump_optimized_hlo_proto_to",
+          flag_values->mutable_xla_dump_optimized_hlo_proto_to(),
+          "Dump Hlo after all hlo passes are executed as proto binary into "
+          "this directory."),
+      tensorflow::Flag(
+          "xla_dump_unoptimized_hlo_proto_to",
+          flag_values->mutable_xla_dump_unoptimized_hlo_proto_to(),
+          "Dump HLO before any hlo passes are executed as proto binary into "
+          "this directory."),
+      tensorflow::Flag("xla_dump_per_pass_hlo_proto_to",
+                       flag_values->mutable_xla_dump_per_pass_hlo_proto_to(),
+                       "Dump HLO after each pass as an HloProto in binary file "
+                       "format into this directory."),
+      tensorflow::Flag(
+          "xla_test_all_output_layouts",
+          bool_setter_for(&DebugOptions::set_xla_test_all_output_layouts),
+          flag_values->xla_test_all_output_layouts(),
+          "Let ClientLibraryTestBase::ComputeAndCompare* test "
+          "all permutations of output layouts. For example, with "
+          "a 3D shape, all permutations of the set {0, 1, 2} are "
+          "tried."),
+      tensorflow::Flag(
+          "xla_test_all_input_layouts",
+          bool_setter_for(&DebugOptions::set_xla_test_all_input_layouts),
+          flag_values->xla_test_all_input_layouts(),
+          "Let ClientLibraryTestBase::ComputeAndCompare* test "
+          "all permutations of *input* layouts. For example, for "
+          "2 input arguments with 2D shape and 4D shape, the "
+          "computation will run 2! * 4! times for every possible "
+          "layouts"),
+      tensorflow::Flag(
+          "xla_hlo_profile",
+          bool_setter_for(&DebugOptions::set_xla_hlo_profile),
+          flag_values->xla_hlo_profile(),
+          "Instrument the computation to collect per-HLO cycle counts"),
+      tensorflow::Flag("xla_dump_computations_to",
+                       flag_values->mutable_xla_dump_computations_to(),
+                       "Dump computations that XLA executes into the provided "
+                       "directory path"),
+      tensorflow::Flag("xla_dump_executions_to",
+                       flag_values->mutable_xla_dump_executions_to(),
+                       "Dump parameters and results of computations that XLA "
+                       "executes into the provided directory path"),
+      tensorflow::Flag("xla_backend_extra_options",
+                       setter_for_xla_backend_extra_options, "",
+                       "Extra options to pass to a backend; "
+                       "comma-separated list of 'key=val' strings (=val "
+                       "may be omitted); no whitespace around commas."),
+      tensorflow::Flag("xla_reduce_precision", setter_for_xla_reduce_precision,
+                       "",
+                       "Directions for adding reduce-precision operations. "
+                       "Format is 'LOCATION=E,M:OPS;NAMES' where LOCATION is "
+                       "the class of locations in which to insert the "
+                       "operations (e.g., 'OP_OUTPUTS'), E and M are the "
+                       "exponent and matissa bit counts respectively, and "
+                       "OPS and NAMES are comma-separated (no spaces) lists "
+                       "of the operation types and names to which to attach "
+                       "the reduce-precision operations.  The NAMES string "
+                       "and its preceding ';' may be omitted.  This option "
+                       "may be repeated to define multiple sets of added "
+                       "reduce-precision operations."),
+      tensorflow::Flag(
+          "xla_gpu_use_cudnn_batchnorm",
+          bool_setter_for(&DebugOptions::set_xla_gpu_use_cudnn_batchnorm),
+          flag_values->xla_gpu_use_cudnn_batchnorm(),
+          "Allows the GPU backend to implement batchnorm HLOs using cudnn, "
+          "rather than expanding them to a soup of HLOs."),
+  });
   ParseFlagsFromEnv(*flag_objects);
 }
 
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.h b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.h
index d0ef8e66ab0bcbf88035ae31fe32eb161e32e998..b53157f59c61cf4e0850e006ad3656f4be63a936 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.h
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_
+#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_
+#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_
 
 #include <vector>
 
@@ -35,4 +35,4 @@ xla::DebugOptions GetDebugOptionsFromFlags();
 }  // namespace legacy_flags
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_
+#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h b/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h
index 0c238e6a5decffb0339f428e4ea676944479cf1b..e9cf435d83d8345e974d83f8e5340dafeba8e3b2 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_
+#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_
+#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_
 
 #include <vector>
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -148,4 +148,4 @@ inline bool parse_xla_reduce_precision_option(
 }  // namespace legacy_flags
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_
+#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_
diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index 93d3cd425f0a868b51677058796e9c40c2d3dff8..e0a9b148b443e90a0c4f3e19660b6234d49eef84 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -27,14 +27,20 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
+
+using tensorflow::strings::Printf;
+using tensorflow::strings::StrCat;
+
+namespace xla {
+
 namespace {
-using tensorflow::int64;
 
 constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
 
@@ -46,9 +52,8 @@ void ConvertEndianShort(char* bytes, int64 size) {
     std::swap(bytes[i], bytes[i + 1]);
   }
 }
-}  // namespace
 
-namespace xla {
+}  // namespace
 
 std::ostream& operator<<(std::ostream& out, const Literal& literal) {
   out << literal.ToString();
@@ -64,12 +69,12 @@ Literal::StrideConfig::StrideConfig(
   if (!dimensions.empty()) {
     // Selects the shape with the largest minor dimension as the one upon
     // which to run the tight stride loop.
-    if (dimensions[source_shape.layout().minor_to_major()[0]] >=
-        dimensions[dest_shape.layout().minor_to_major()[0]]) {
-      minor_dimension = source_shape.layout().minor_to_major()[0];
+    if (dimensions[LayoutUtil::Minor(source_shape.layout(), 0)] >=
+        dimensions[LayoutUtil::Minor(dest_shape.layout(), 0)]) {
+      minor_dimension = LayoutUtil::Minor(source_shape.layout(), 0);
       dest_stride = IndexUtil::GetDimensionStride(dest_shape, minor_dimension);
     } else {
-      minor_dimension = dest_shape.layout().minor_to_major()[0];
+      minor_dimension = LayoutUtil::Minor(dest_shape.layout(), 0);
       source_stride =
           IndexUtil::GetDimensionStride(source_shape, minor_dimension);
     }
@@ -78,52 +83,134 @@ Literal::StrideConfig::StrideConfig(
   }
 }
 
+Literal::Literal(const Shape& shape)
+    : Literal(shape, /*allocate_arrays=*/true) {}
+
+Literal::Literal(const Shape& shape, bool allocate_arrays)
+    : shape_(shape), pieces_(shape), owns_buffers_(true) {
+  CHECK(LayoutUtil::HasLayout(shape));
+  for (auto& pair : pieces_) {
+    const ShapeIndex& index = pair.first;
+    Piece& piece = pair.second;
+
+    piece.set_subshape(&ShapeUtil::GetSubshape(shape_, index));
+    const Shape& subshape = piece.subshape();
+    if (ShapeUtil::IsArray(subshape)) {
+      if (allocate_arrays) {
+        piece.set_buffer(new char[piece.size_bytes()]);
+        if (LayoutUtil::IsSparseArray(subshape)) {
+          piece.set_sparse_indices(new SparseIndexArray(
+              LayoutUtil::MaxSparseElements(subshape.layout()),
+              ShapeUtil::Rank(subshape)));
+        }
+      } else {
+        piece.set_buffer(nullptr);
+      }
+    }
+  }
+}
+
+Literal::~Literal() { DeallocateBuffers(); }
+
+void Literal::DeallocateBuffers() {
+  if (owns_buffers_) {
+    for (auto& pair : pieces_) {
+      Piece& piece = pair.second;
+      if (piece.buffer() != nullptr) {
+        delete[] piece.buffer();
+        delete piece.sparse_indices();
+      }
+    }
+  }
+}
+
+Literal::Literal(Literal&& other) {
+  shape_ = std::move(other.shape_);
+  pieces_ = std::move(other.pieces_);
+  // We need to iterate through the pieces to set the subshape pointer
+  // properly. It must refer to subshapes within shape_.
+  for (auto& pair : pieces_) {
+    const ShapeIndex& index = pair.first;
+    Piece& piece = pair.second;
+    piece.set_subshape(&ShapeUtil::GetSubshape(shape_, index));
+  }
+  owns_buffers_ = other.owns_buffers_;
+
+  other.shape_ = ShapeUtil::MakeNil();
+  other.pieces_ = ShapeTree<Piece>(other.shape_);
+  other.piece({}).set_subshape(&other.shape_);
+}
+
+Literal& Literal::operator=(Literal&& other) {
+  DeallocateBuffers();
+  shape_ = std::move(other.shape_);
+  pieces_ = std::move(other.pieces_);
+  // We need to iterate through the pieces to set the subshape pointer
+  // properly. It must refer to subshapes within shape_.
+  for (auto& pair : pieces_) {
+    const ShapeIndex& index = pair.first;
+    Piece& piece = pair.second;
+    piece.set_subshape(&ShapeUtil::GetSubshape(shape_, index));
+  }
+  owns_buffers_ = other.owns_buffers_;
+
+  other.shape_ = ShapeUtil::MakeNil();
+  other.pieces_ = ShapeTree<Piece>(other.shape_);
+  other.piece({}).set_subshape(&other.shape_);
+  return *this;
+}
+
 std::unique_ptr<Literal> Literal::CreateFromShape(const Shape& shape) {
-  auto literal = MakeUnique<Literal>();
-  *literal->mutable_shape() = shape;
-  if (ShapeUtil::IsTuple(shape)) {
-    int64 num_elements = ShapeUtil::TupleElementCount(shape);
-    literal->tuple_literals_.resize(num_elements);
-    for (int i = 0; i < num_elements; ++i) {
-      std::unique_ptr<Literal> elem =
-          CreateFromShape(ShapeUtil::GetTupleElementShape(shape, i));
-      literal->tuple_literals_[i] = std::move(*elem);
+  auto literal = MakeUnique<Literal>(shape);
+  for (auto& pair : literal->pieces_) {
+    Piece& piece = pair.second;
+    if (ShapeUtil::IsArray(piece.subshape())) {
+      memset(piece.untyped_data(), 0, piece.size_bytes());
     }
-  } else {
-    literal->Reserve(ShapeUtil::ElementsIn(literal->shape()));
   }
   return literal;
 }
 
+const SparseIndexArray* Literal::sparse_indices(
+    const ShapeIndex& shape_index) const {
+  return piece(shape_index).sparse_indices();
+}
+
+SparseIndexArray* Literal::sparse_indices(const ShapeIndex& shape_index) {
+  return piece(shape_index).sparse_indices();
+}
+
 /* static */ std::unique_ptr<Literal> Literal::CreateFromDimensions(
     PrimitiveType primitive_type,
     tensorflow::gtl::ArraySlice<int64> dimensions) {
   return CreateFromShape(ShapeUtil::MakeShape(primitive_type, dimensions));
 }
 
-template <typename T>
-Status Literal::CopyRange(const Literal& src_literal,
-                          tensorflow::gtl::ArraySlice<int64> src_base,
-                          tensorflow::gtl::ArraySlice<int64> dest_base,
-                          tensorflow::gtl::ArraySlice<int64> copy_size) {
-  const Shape& src_shape = src_literal.shape();
-  const Shape& dest_shape = shape();
-  tensorflow::gtl::ArraySlice<T> src_data = src_literal.GetArraySlice<T>();
-  tensorflow::gtl::MutableArraySlice<T> dest_data = GetMutableArraySlice<T>();
-
-  TF_RET_CHECK(ShapeUtil::Rank(src_shape) == src_base.size());
-  TF_RET_CHECK(ShapeUtil::Rank(dest_shape) == dest_base.size());
+template <typename NativeT>
+Status Literal::CopySliceFromInternal(
+    const Literal& src_literal, tensorflow::gtl::ArraySlice<int64> src_base,
+    tensorflow::gtl::ArraySlice<int64> dest_base,
+    tensorflow::gtl::ArraySlice<int64> copy_size) {
+  TF_RET_CHECK(ShapeUtil::Rank(src_literal.shape()) == src_base.size());
+  TF_RET_CHECK(ShapeUtil::Rank(shape()) == dest_base.size());
+
+  auto linear_index = [](const Shape& shape,
+                         tensorflow::gtl::ArraySlice<int64> multi_index) {
+    return IndexUtil::MultidimensionalIndexToLinearIndex(shape, multi_index);
+  };
 
-  if (ShapeUtil::Rank(src_shape) == 0 || ShapeUtil::Rank(dest_shape) == 0) {
+  if (ShapeUtil::Rank(src_literal.shape()) == 0 ||
+      ShapeUtil::Rank(shape()) == 0) {
     // If any of the two shapes are scalars, we can just call the StridedCopy()
     // directly, and we know we will be copying only one value.
     TF_RET_CHECK(copy_size.empty());
-    StridedCopy(dest_data, LinearIndex(dest_base), 0, src_data,
-                src_literal.LinearIndex(src_base), 0, 1);
-  } else if (!ShapeUtil::HasZeroElements(dest_shape) &&
-             !ShapeUtil::HasZeroElements(src_shape)) {
-    // Perform copy if neither src literal nor dest literal has dimensions with
-    // zero element, otherwise it's a no-op.
+    StridedCopy(data<NativeT>(), linear_index(shape(), dest_base), 0,
+                src_literal.data<NativeT>(),
+                linear_index(src_literal.shape(), src_base), 0, 1);
+  } else if (!ShapeUtil::HasZeroElements(shape()) &&
+             !ShapeUtil::HasZeroElements(src_literal.shape())) {
+    // Perform copy if neither src nor dest has dimensions with zero element,
+    // otherwise it's a no-op.
     TF_RET_CHECK(src_base.size() == dest_base.size());
     TF_RET_CHECK(src_base.size() == copy_size.size());
 
@@ -133,7 +220,8 @@ Status Literal::CopyRange(const Literal& src_literal,
     // proper stride size at the matching dimension.
     DimensionVector src_indexes(src_base.size(), 0);
     DimensionVector dest_indexes(dest_base.size(), 0);
-    StrideConfig stride_config(src_shape, dest_shape, copy_size);
+    Literal::StrideConfig stride_config(src_literal.shape(), shape(),
+                                        copy_size);
 
     auto copy_proc = [&](const std::vector<int64>& indexes) {
       // Map from multi-dimensional index, to source index.
@@ -143,89 +231,296 @@ Status Literal::CopyRange(const Literal& src_literal,
       std::transform(indexes.begin(), indexes.end(), dest_base.begin(),
                      dest_indexes.begin(), std::plus<int64>());
 
-      int64 src_index = src_literal.LinearIndex(src_indexes);
-      int64 dest_index = LinearIndex(dest_indexes);
+      int64 src_index = linear_index(src_literal.shape(), src_indexes);
+      int64 dest_index = linear_index(shape(), dest_indexes);
 
-      StridedCopy(dest_data, dest_index, stride_config.dest_stride, src_data,
-                  src_index, stride_config.source_stride,
-                  stride_config.minor_loop_size);
+      // `this->` is needed to workaround MSVC bug: #16882
+      StridedCopy(this->data<NativeT>(), dest_index, stride_config.dest_stride,
+                  src_literal.data<NativeT>(), src_index,
+                  stride_config.source_stride, stride_config.minor_loop_size);
       return true;
     };
 
-    ShapeUtil::ForEachIndex(src_shape, stride_config.base,
+    ShapeUtil::ForEachIndex(src_literal.shape(), stride_config.base,
                             stride_config.dimensions, stride_config.step,
                             copy_proc);
   }
   return Status::OK();
 }
 
-Status Literal::Copy(const Literal& src_literal,
-                     tensorflow::gtl::ArraySlice<int64> src_base,
-                     tensorflow::gtl::ArraySlice<int64> dest_base,
-                     tensorflow::gtl::ArraySlice<int64> copy_size) {
+std::vector<Literal> Literal::DecomposeTuple() {
+  CHECK(ShapeUtil::IsTuple(shape()));
+  std::vector<Literal> elements;
+  for (int i = 0; i < ShapeUtil::TupleElementCount(shape()); ++i) {
+    elements.push_back(Literal(ShapeUtil::GetSubshape(shape(), {i}),
+                               /*allocate_arrays=*/false));
+    Literal& element = elements.back();
+    for (auto& pair : element.pieces_) {
+      const ShapeIndex& index = pair.first;
+      Piece& dest_piece = pair.second;
+      ShapeIndex src_index = {i};
+      for (int64 j : index) {
+        src_index.push_back(j);
+      }
+      Piece& src_piece = piece(src_index);
+
+      // Move the respective buffer and sparse indices over to the element
+      // Literal.
+      dest_piece.set_buffer(src_piece.buffer());
+      src_piece.set_buffer(nullptr);
+      dest_piece.set_sparse_indices(src_piece.sparse_indices());
+      src_piece.set_sparse_indices(nullptr);
+    }
+  }
+  // Set this literal to be nil-shaped.
+  *this = Literal();
+  return elements;
+}
+
+/* static */ Literal Literal::MoveIntoTuple(
+    tensorflow::gtl::MutableArraySlice<Literal> elements) {
+  std::vector<Shape> element_shapes;
+  for (const Literal& element : elements) {
+    element_shapes.push_back(element.shape());
+  }
+  Literal literal(ShapeUtil::MakeTupleShape(element_shapes),
+                  /*allocate_arrays=*/false);
+  for (int i = 0; i < elements.size(); ++i) {
+    TF_CHECK_OK(
+        literal.MoveFrom(std::move(elements[i]), /*dest_shape_index=*/{i}));
+  }
+  return literal;
+}
+
+namespace {
+
+// Copies the elements in 'src' to 'dest'. The shape and layout of the data in
+// the array slices are indicated by dest_shape and src_shape respectively.
+template <typename NativeT>
+void CopyElementsBetween(tensorflow::gtl::MutableArraySlice<NativeT> dest,
+                         tensorflow::gtl::ArraySlice<NativeT> src,
+                         const Shape& dest_shape, const Shape& src_shape) {
+  CHECK(ShapeUtil::Compatible(dest_shape, src_shape));
+  if (ShapeUtil::HasZeroElements(dest_shape)) {
+    return;
+  }
+  std::vector<int64> index(ShapeUtil::Rank(dest_shape));
+  do {
+    dest[IndexUtil::MultidimensionalIndexToLinearIndex(dest_shape, index)] =
+        src[IndexUtil::MultidimensionalIndexToLinearIndex(src_shape, index)];
+  } while (IndexUtil::BumpIndices(dest_shape, &index));
+}
+
+}  // namespace
+
+Status Literal::Piece::CopyFrom(const Literal::Piece& src) {
+  if (ShapeUtil::Equal(subshape(), src.subshape())) {
+    // If the layouts are equal it's faster just to memcpy.
+    memcpy(buffer(), src.buffer(), src.size_bytes());
+  } else {
+    TF_RET_CHECK(ShapeUtil::Compatible(src.subshape(), subshape()));
+    std::vector<int64> origin(ShapeUtil::Rank(subshape()), 0);
+    switch (subshape().element_type()) {
+#define COPY_ELEMENTS(XLA_T, NATIVE_T)                                    \
+  case (XLA_T):                                                           \
+    CopyElementsBetween<NATIVE_T>(data<NATIVE_T>(), src.data<NATIVE_T>(), \
+                                  subshape(), src.subshape());            \
+    break;
+      COPY_ELEMENTS(U8, uint8);
+      COPY_ELEMENTS(U16, uint16);
+      COPY_ELEMENTS(U32, uint32);
+      COPY_ELEMENTS(U64, uint64);
+      COPY_ELEMENTS(S8, int8);
+      COPY_ELEMENTS(S16, int16);
+      COPY_ELEMENTS(S32, int32);
+      COPY_ELEMENTS(S64, int64);
+      COPY_ELEMENTS(F16, half);
+      COPY_ELEMENTS(BF16, bfloat16);
+      COPY_ELEMENTS(F32, float);
+      COPY_ELEMENTS(F64, double);
+      COPY_ELEMENTS(C64, complex64);
+      COPY_ELEMENTS(PRED, bool);
+#undef COPY_ELEMENTS
+      default:
+        return Unimplemented(
+            "Unhandled primitive type %s",
+            PrimitiveType_Name(subshape().element_type()).c_str());
+    }
+  }
+  return Status::OK();
+}
+
+Status Literal::CopyFrom(const Literal& src_literal,
+                         const ShapeIndex& dest_shape_index,
+                         const ShapeIndex& src_shape_index) {
+  const Shape& dest_subshape =
+      ShapeUtil::GetSubshape(shape(), dest_shape_index);
+  const Shape& src_subshape =
+      ShapeUtil::GetSubshape(src_literal.shape(), src_shape_index);
+  if (!ShapeUtil::Compatible(dest_subshape, src_subshape)) {
+    return InvalidArgument(
+        "Destination subshape incompatible with source subshape: %s vs %s",
+        ShapeUtil::HumanString(dest_subshape).c_str(),
+        ShapeUtil::HumanString(src_subshape).c_str());
+  }
+
+  for (auto& pair : pieces_) {
+    const ShapeIndex& index = pair.first;
+    Piece& piece = pair.second;
+    if (!ShapeUtil::IsArray(piece.subshape())) {
+      continue;
+    }
+
+    // Determine if this index is in the part of this literal that we want to
+    // copy over from src_literal.
+    bool in_subtree_to_copy = true;
+    for (int i = 0; i < dest_shape_index.size(); ++i) {
+      if (index[i] != dest_shape_index[i]) {
+        in_subtree_to_copy = false;
+        break;
+      }
+    }
+    if (!in_subtree_to_copy) {
+      continue;
+    }
+
+    // Construct the index of the corresponding piece in the source literal.
+    ShapeIndex src_piece_index = src_shape_index;
+    for (int64 i = dest_shape_index.size(); i < index.size(); ++i) {
+      src_piece_index.push_back(index[i]);
+    }
+
+    TF_RETURN_IF_ERROR(piece.CopyFrom(src_literal.piece(src_piece_index)));
+  }
+  return Status::OK();
+}
+
+Status Literal::MoveFrom(Literal&& src_literal,
+                         const ShapeIndex& dest_shape_index) {
+  const Shape& dest_subshape =
+      ShapeUtil::GetSubshape(shape(), dest_shape_index);
+  if (!ShapeUtil::Equal(dest_subshape, src_literal.shape())) {
+    return InvalidArgument(
+        "Destination subshape not equal to source shape: %s vs %s",
+        ShapeUtil::HumanString(dest_subshape).c_str(),
+        ShapeUtil::HumanString(src_literal.shape()).c_str());
+  }
+
+  if (!(owns_buffers_ && src_literal.owns_buffers_)) {
+    return InvalidArgument(
+        "Source and destination literals must both own their buffers (ie, not "
+        "be views)");
+  }
+
+  for (auto& pair : src_literal.pieces_) {
+    const ShapeIndex& src_index = pair.first;
+    Piece& src_piece = pair.second;
+    if (!ShapeUtil::IsArray(src_piece.subshape())) {
+      continue;
+    }
+
+    ShapeIndex dest_index = dest_shape_index;
+    for (int64 i : src_index) {
+      dest_index.push_back(i);
+    }
+    Piece& dest_piece = piece(dest_index);
+    delete[] dest_piece.buffer();
+    dest_piece.set_buffer(src_piece.buffer());
+    delete dest_piece.sparse_indices();
+    dest_piece.set_sparse_indices(src_piece.sparse_indices());
+  }
+
+  src_literal.shape_ = ShapeUtil::MakeNil();
+  src_literal.pieces_ = ShapeTree<Piece>(src_literal.shape_);
+  src_literal.piece({}).set_subshape(&src_literal.shape_);
+  return Status::OK();
+}
+
+Status Literal::CopySliceFrom(const Literal& src_literal,
+                              tensorflow::gtl::ArraySlice<int64> src_base,
+                              tensorflow::gtl::ArraySlice<int64> dest_base,
+                              tensorflow::gtl::ArraySlice<int64> copy_size) {
+  TF_RET_CHECK(ShapeUtil::IsArray(shape())) << ShapeUtil::HumanString(shape());
+  TF_RET_CHECK(ShapeUtil::IsArray(src_literal.shape()))
+      << ShapeUtil::HumanString(src_literal.shape());
   TF_RET_CHECK(ShapeUtil::SameElementType(src_literal.shape(), shape()));
-  switch (src_literal.shape().element_type()) {
+
+  switch (shape().element_type()) {
     case U8:
-      return CopyRange<uint8>(src_literal, src_base, dest_base, copy_size);
+      return CopySliceFromInternal<uint8>(src_literal, src_base, dest_base,
+                                          copy_size);
     case U16:
-      return CopyRange<uint16>(src_literal, src_base, dest_base, copy_size);
+      return CopySliceFromInternal<uint16>(src_literal, src_base, dest_base,
+                                           copy_size);
     case U32:
-      return CopyRange<uint32>(src_literal, src_base, dest_base, copy_size);
+      return CopySliceFromInternal<uint32>(src_literal, src_base, dest_base,
+                                           copy_size);
     case U64:
-      return CopyRange<uint64>(src_literal, src_base, dest_base, copy_size);
+      return CopySliceFromInternal<uint64>(src_literal, src_base, dest_base,
+                                           copy_size);
     case S8:
-      return CopyRange<int8>(src_literal, src_base, dest_base, copy_size);
+      return CopySliceFromInternal<int8>(src_literal, src_base, dest_base,
+                                         copy_size);
     case S16:
-      return CopyRange<int16>(src_literal, src_base, dest_base, copy_size);
+      return CopySliceFromInternal<int16>(src_literal, src_base, dest_base,
+                                          copy_size);
     case S32:
-      return CopyRange<int32>(src_literal, src_base, dest_base, copy_size);
+      return CopySliceFromInternal<int32>(src_literal, src_base, dest_base,
+                                          copy_size);
     case S64:
-      return CopyRange<int64>(src_literal, src_base, dest_base, copy_size);
+      return CopySliceFromInternal<int64>(src_literal, src_base, dest_base,
+                                          copy_size);
     case F16:
-      return CopyRange<half>(src_literal, src_base, dest_base, copy_size);
+      return CopySliceFromInternal<half>(src_literal, src_base, dest_base,
+                                         copy_size);
     case BF16:
-      return CopyRange<bfloat16>(src_literal, src_base, dest_base, copy_size);
+      return CopySliceFromInternal<bfloat16>(src_literal, src_base, dest_base,
+                                             copy_size);
     case F32:
-      return CopyRange<float>(src_literal, src_base, dest_base, copy_size);
+      return CopySliceFromInternal<float>(src_literal, src_base, dest_base,
+                                          copy_size);
     case F64:
-      return CopyRange<double>(src_literal, src_base, dest_base, copy_size);
+      return CopySliceFromInternal<double>(src_literal, src_base, dest_base,
+                                           copy_size);
     case C64:
-      return CopyRange<complex64>(src_literal, src_base, dest_base, copy_size);
+      return CopySliceFromInternal<complex64>(src_literal, src_base, dest_base,
+                                              copy_size);
     case PRED:
-      return CopyRange<bool>(src_literal, src_base, dest_base, copy_size);
+      return CopySliceFromInternal<bool>(src_literal, src_base, dest_base,
+                                         copy_size);
     default:
       break;
   }
-  return Unimplemented("Unhandled primitive type %d",
-                       src_literal.shape().element_type());
+  return Unimplemented("Unhandled primitive type %d", shape().element_type());
 }
 
 /* static */ Literal Literal::Zero(PrimitiveType primitive_type) {
   switch (primitive_type) {
     case U8:
-      return *Literal::CreateR0<uint8>(0);
+      return std::move(*Literal::CreateR0<uint8>(0));
     case U32:
-      return *Literal::CreateR0<uint32>(0);
+      return std::move(*Literal::CreateR0<uint32>(0));
     case U64:
-      return *Literal::CreateR0<uint64>(0);
+      return std::move(*Literal::CreateR0<uint64>(0));
     case S8:
-      return *Literal::CreateR0<int8>(0);
+      return std::move(*Literal::CreateR0<int8>(0));
     case S32:
-      return *Literal::CreateR0<int32>(0);
+      return std::move(*Literal::CreateR0<int32>(0));
     case S64:
-      return *Literal::CreateR0<int64>(0);
+      return std::move(*Literal::CreateR0<int64>(0));
     case F16:
-      return *Literal::CreateR0<half>(static_cast<half>(0.0f));
+      return std::move(*Literal::CreateR0<half>(static_cast<half>(0.0f)));
     case BF16:
-      return *Literal::CreateR0<bfloat16>(static_cast<bfloat16>(0.0f));
+      return std::move(
+          *Literal::CreateR0<bfloat16>(static_cast<bfloat16>(0.0f)));
     case F32:
-      return *Literal::CreateR0<float>(0);
+      return std::move(*Literal::CreateR0<float>(0));
     case F64:
-      return *Literal::CreateR0<double>(0);
+      return std::move(*Literal::CreateR0<double>(0));
     case C64:
-      return *Literal::CreateR0<complex64>(0);
+      return std::move(*Literal::CreateR0<complex64>(0));
     case PRED:
-      return *Literal::CreateR0<bool>(false);
+      return std::move(*Literal::CreateR0<bool>(false));
     case S16:
     case U16:
       LOG(FATAL) << "u16/s16 literals not yet implemented";
@@ -241,30 +536,33 @@ Status Literal::Copy(const Literal& src_literal,
 /* static */ Literal Literal::One(PrimitiveType primitive_type) {
   switch (primitive_type) {
     case U8:
-      return *Literal::CreateR0<uint8>(1);
+      return std::move(*Literal::CreateR0<uint8>(1));
     case U32:
-      return *Literal::CreateR0<uint32>(1);
+      return std::move(*Literal::CreateR0<uint32>(1));
     case U64:
-      return *Literal::CreateR0<uint64>(1);
+      return std::move(*Literal::CreateR0<uint64>(1));
     case S8:
-      return *Literal::CreateR0<int8>(1);
+      return std::move(*Literal::CreateR0<int8>(1));
     case S32:
-      return *Literal::CreateR0<int32>(1);
+      return std::move(*Literal::CreateR0<int32>(1));
     case S64:
-      return *Literal::CreateR0<int64>(1);
+      return std::move(*Literal::CreateR0<int64>(1));
+    case F16:
+      return std::move(*Literal::CreateR0<half>(static_cast<half>(1.0f)));
+    case BF16:
+      return std::move(
+          *Literal::CreateR0<bfloat16>(static_cast<bfloat16>(1.0f)));
     case F32:
-      return *Literal::CreateR0<float>(1);
+      return std::move(*Literal::CreateR0<float>(1));
     case F64:
-      return *Literal::CreateR0<double>(1);
+      return std::move(*Literal::CreateR0<double>(1));
     case C64:
-      return *Literal::CreateR0<complex64>(1);
+      return std::move(*Literal::CreateR0<complex64>(1));
     case PRED:
-      return *Literal::CreateR0<bool>(true);
+      return std::move(*Literal::CreateR0<bool>(true));
     case S16:
     case U16:
       LOG(FATAL) << "u16/s16 literals not yet implemented";
-    case F16:
-      return *Literal::CreateR0<half>(static_cast<half>(1.0f));
     case TUPLE:
       LOG(FATAL) << "tuple element type cannot take on value of 1";
     case OPAQUE:
@@ -277,35 +575,42 @@ Status Literal::Copy(const Literal& src_literal,
 /* static */ Literal Literal::MinValue(PrimitiveType primitive_type) {
   switch (primitive_type) {
     case U8:
-      return *Literal::CreateR0<uint8>(std::numeric_limits<uint8>::min());
+      return std::move(
+          *Literal::CreateR0<uint8>(std::numeric_limits<uint8>::min()));
     case U32:
-      return *Literal::CreateR0<uint32>(std::numeric_limits<uint32>::min());
+      return std::move(
+          *Literal::CreateR0<uint32>(std::numeric_limits<uint32>::min()));
     case U64:
-      return *Literal::CreateR0<uint64>(std::numeric_limits<uint64>::min());
+      return std::move(
+          *Literal::CreateR0<uint64>(std::numeric_limits<uint64>::min()));
     case S8:
-      return *Literal::CreateR0<int8>(std::numeric_limits<int8>::min());
+      return std::move(
+          *Literal::CreateR0<int8>(std::numeric_limits<int8>::min()));
     case S32:
-      return *Literal::CreateR0<int32>(std::numeric_limits<int32>::min());
+      return std::move(
+          *Literal::CreateR0<int32>(std::numeric_limits<int32>::min()));
     case S64:
-      return *Literal::CreateR0<int64>(std::numeric_limits<int64>::min());
+      return std::move(
+          *Literal::CreateR0<int64>(std::numeric_limits<int64>::min()));
     case F32:
-      return *Literal::CreateR0<float>(-std::numeric_limits<float>::infinity());
+      return std::move(
+          *Literal::CreateR0<float>(-std::numeric_limits<float>::infinity()));
     case F64:
-      return *Literal::CreateR0<double>(
-          -std::numeric_limits<double>::infinity());
+      return std::move(
+          *Literal::CreateR0<double>(-std::numeric_limits<double>::infinity()));
     case C64:
       LOG(FATAL) << "C64 element type has no minimum value";
     case PRED:
-      return *Literal::CreateR0<bool>(false);
+      return std::move(*Literal::CreateR0<bool>(false));
     case S16:
     case U16:
       LOG(FATAL) << "u16/s16 literals not yet implemented";
     case F16:
-      return *Literal::CreateR0<half>(
-          static_cast<half>(-std::numeric_limits<float>::infinity()));
+      return std::move(*Literal::CreateR0<half>(
+          static_cast<half>(-std::numeric_limits<float>::infinity())));
     case BF16:
-      return *Literal::CreateR0<bfloat16>(
-          static_cast<bfloat16>(-std::numeric_limits<float>::infinity()));
+      return std::move(*Literal::CreateR0<bfloat16>(
+          static_cast<bfloat16>(-std::numeric_limits<float>::infinity())));
     case TUPLE:
       LOG(FATAL) << "tuple element type has no minimum value";
     case OPAQUE:
@@ -318,33 +623,40 @@ Status Literal::Copy(const Literal& src_literal,
 /* static */ Literal Literal::MaxValue(PrimitiveType primitive_type) {
   switch (primitive_type) {
     case U8:
-      return *Literal::CreateR0<uint8>(std::numeric_limits<uint8>::max());
+      return std::move(
+          *Literal::CreateR0<uint8>(std::numeric_limits<uint8>::max()));
     case U32:
-      return *Literal::CreateR0<uint32>(std::numeric_limits<uint32>::max());
+      return std::move(
+          *Literal::CreateR0<uint32>(std::numeric_limits<uint32>::max()));
     case U64:
-      return *Literal::CreateR0<uint64>(std::numeric_limits<uint64>::max());
+      return std::move(
+          *Literal::CreateR0<uint64>(std::numeric_limits<uint64>::max()));
     case S8:
-      return *Literal::CreateR0<int8>(std::numeric_limits<int8>::max());
+      return std::move(
+          *Literal::CreateR0<int8>(std::numeric_limits<int8>::max()));
     case S32:
-      return *Literal::CreateR0<int32>(std::numeric_limits<int32>::max());
+      return std::move(
+          *Literal::CreateR0<int32>(std::numeric_limits<int32>::max()));
     case S64:
-      return *Literal::CreateR0<int64>(std::numeric_limits<int64>::max());
+      return std::move(
+          *Literal::CreateR0<int64>(std::numeric_limits<int64>::max()));
     case F32:
-      return *Literal::CreateR0<float>(std::numeric_limits<float>::infinity());
+      return std::move(
+          *Literal::CreateR0<float>(std::numeric_limits<float>::infinity()));
     case F64:
-      return *Literal::CreateR0<double>(
-          std::numeric_limits<double>::infinity());
+      return std::move(
+          *Literal::CreateR0<double>(std::numeric_limits<double>::infinity()));
     case PRED:
-      return *Literal::CreateR0<bool>(true);
+      return std::move(*Literal::CreateR0<bool>(true));
     case S16:
     case U16:
       LOG(FATAL) << "u16/s16 literals not yet implemented";
     case F16:
-      return *Literal::CreateR0<half>(
-          static_cast<half>(std::numeric_limits<float>::infinity()));
+      return std::move(*Literal::CreateR0<half>(
+          static_cast<half>(std::numeric_limits<float>::infinity())));
     case BF16:
-      return *Literal::CreateR0<bfloat16>(
-          static_cast<bfloat16>(std::numeric_limits<float>::infinity()));
+      return std::move(*Literal::CreateR0<bfloat16>(
+          static_cast<bfloat16>(std::numeric_limits<float>::infinity())));
     case TUPLE:
       LOG(FATAL) << "tuple element type has no maximum value";
     case OPAQUE:
@@ -356,17 +668,29 @@ Status Literal::Copy(const Literal& src_literal,
 
 /* static */ std::unique_ptr<Literal> Literal::CreateR1(
     const tensorflow::core::Bitmap& values) {
-  auto literal = MakeUnique<Literal>();
+  auto literal = MakeUnique<Literal>(
+      ShapeUtil::MakeShape(PRED, {static_cast<int64>(values.bits())}));
   literal->PopulateR1(values);
   return literal;
 }
 
+void Literal::PopulateR1(const tensorflow::core::Bitmap& values) {
+  CHECK(ShapeUtil::IsArray(shape()));
+  CHECK_EQ(ShapeUtil::Rank(shape()), 1);
+  CHECK_EQ(element_count(), values.bits());
+  CHECK_EQ(shape().element_type(), PRED);
+  for (int64 i = 0; i < static_cast<int64>(values.bits()); ++i) {
+    Set({i}, values.get(i));
+  }
+}
+
 /* static */ std::unique_ptr<Literal> Literal::CreateR1U8(
     tensorflow::StringPiece value) {
-  auto literal = MakeUnique<Literal>();
-  *literal->mutable_shape() =
-      ShapeUtil::MakeShape(U8, {static_cast<int64>(value.size())});
-  literal->set_u8s(tensorflow::StringPiece(value.ToString()));
+  auto literal = MakeUnique<Literal>(
+      ShapeUtil::MakeShape(U8, {static_cast<int64>(value.size())}));
+  for (int i = 0; i < value.size(); ++i) {
+    literal->Set<uint8>({i}, value[i]);
+  }
   return literal;
 }
 
@@ -380,46 +704,50 @@ Status Literal::Copy(const Literal& src_literal,
 
 std::unique_ptr<Literal> Literal::Relayout(
     const Layout& new_layout, const ShapeIndex& shape_index) const {
-  std::unique_ptr<Literal> outer_result = CloneToUnique();
-
-  const Literal* copy_from = this;
-  Literal* copy_to = outer_result.get();
-  for (int64 i = 0; i < shape_index.size(); i++) {
-    *ShapeUtil::GetMutableSubshape(copy_to->mutable_shape(), {shape_index, i})
-         ->mutable_layout() = new_layout;
-    copy_from = &copy_from->tuple_literals_[shape_index[i]];
-    copy_to = &copy_to->tuple_literals_[shape_index[i]];
-  }
-
-  DimensionVector base(ShapeUtil::Rank(copy_from->shape()), 0);
-  DimensionVector copy_size(copy_from->shape().dimensions().begin(),
-                            copy_from->shape().dimensions().end());
+  // Create new shape with 'new_layout' set at the given shape index.
+  Shape new_shape = shape();
+  Shape* subshape = ShapeUtil::GetMutableSubshape(&new_shape, shape_index);
+  TF_CHECK_OK(LayoutUtil::ValidateLayoutForShape(new_layout, *subshape));
+  *subshape->mutable_layout() = new_layout;
+  auto result = MakeUnique<Literal>(new_shape);
+  TF_CHECK_OK(result->CopyFrom(*this));
+  return result;
+}
 
-  CHECK(ShapeUtil::IsArray(copy_from->shape()));
-  CHECK(ShapeUtil::IsArray(copy_to->shape()));
-  *copy_to->mutable_shape()->mutable_layout() = new_layout;
-  TF_CHECK_OK(copy_to->Copy(*copy_from, base, base, copy_size));
-  return outer_result;
+std::unique_ptr<Literal> Literal::Relayout(
+    const Shape& shape_with_layout) const {
+  CHECK(ShapeUtil::Compatible(shape_with_layout, shape()))
+      << "Given shape_with_layout " << ShapeUtil::HumanString(shape_with_layout)
+      << " not compatible with literal shape "
+      << ShapeUtil::HumanString(shape());
+  std::unique_ptr<Literal> result = CreateFromShape(shape_with_layout);
+  ShapeUtil::ForEachSubshape(
+      result->shape(),
+      [this, &result](const Shape& subshape, const ShapeIndex& index) {
+        if (ShapeUtil::IsArray(subshape)) {
+          TF_CHECK_OK(result->CopyFrom(*this,
+                                       /*dest_shape_index=*/index,
+                                       /*src_shape_index=*/index));
+        }
+      });
+  return result;
 }
 
 StatusOr<std::unique_ptr<Literal>> Literal::Reshape(
     tensorflow::gtl::ArraySlice<int64> dimensions) const {
-  if (ShapeUtil::IsTuple(shape())) {
+  if (!ShapeUtil::IsArray(shape())) {
     return InvalidArgument("Reshape does not support tuples.");
   }
   std::unique_ptr<Literal> output;
   if (!LayoutUtil::IsMonotonicWithDim0Major(shape().layout())) {
-    std::vector<int64> minor_to_major(ShapeUtil::Rank(shape()));
-    std::iota(minor_to_major.rbegin(), minor_to_major.rend(),
-              static_cast<int64>(0));
-    output = Relayout(LayoutUtil::MakeLayout(minor_to_major));
+    output =
+        Relayout(LayoutUtil::GetDefaultLayoutForRank(ShapeUtil::Rank(shape())));
   } else {
     output = CloneToUnique();
   }
   // Because the layout is monotonic, we can simply reuse the same sequence of
   // values without changing their order.
-  *output->mutable_shape() =
-      ShapeUtil::MakeShape(shape().element_type(), dimensions);
+  output->shape_ = ShapeUtil::MakeShape(shape().element_type(), dimensions);
 
   int64 elements_before = ShapeUtil::ElementsIn(shape());
   int64 elements_after = ShapeUtil::ElementsIn(output->shape());
@@ -435,7 +763,7 @@ StatusOr<std::unique_ptr<Literal>> Literal::Reshape(
 
 std::unique_ptr<Literal> Literal::Transpose(
     tensorflow::gtl::ArraySlice<int64> permutation) const {
-  CHECK(!ShapeUtil::IsTuple(shape())) << "Tuple is not supported for transpose";
+  CHECK(ShapeUtil::IsArray(shape())) << "Tuple is not supported for transpose";
   CHECK(IsPermutation(permutation, ShapeUtil::Rank(shape())))
       << "Given permutation is not a permutation of dimension numbers";
   // To transpose the array, we just permute the dimensions and layout, and
@@ -458,23 +786,24 @@ std::unique_ptr<Literal> Literal::Transpose(
   // dimension has within the transposed array, a layout is affine if
   // MinMaj(Di) == TMinMaj(T(Di)), with TMinMaj() being the minor to major
   // vector of the affine layout.
+  CHECK(LayoutUtil::IsDenseArray(permuted_shape));
   Layout* layout = permuted_shape.mutable_layout();
   layout->clear_minor_to_major();
-  for (auto index : shape().layout().minor_to_major()) {
+  for (auto index : LayoutUtil::MinorToMajor(shape())) {
     layout->add_minor_to_major(inverse_permutation[index]);
   }
   std::unique_ptr<Literal> new_literal = CreateFromShape(permuted_shape);
   DCHECK_GE(ShapeUtil::ByteSizeOf(new_literal->shape()),
             ShapeUtil::ByteSizeOf(shape()));
-  std::memcpy(new_literal->MutableInternalData(), InternalData(),
-              ShapeUtil::ByteSizeOf(shape()));
+  std::memcpy(new_literal->root_piece().buffer(), root_piece().buffer(),
+              root_piece().size_bytes());
   return new_literal;
 }
 
 std::unique_ptr<Literal> Literal::Slice(
     tensorflow::gtl::ArraySlice<int64> start_indices,
     tensorflow::gtl::ArraySlice<int64> limit_indices) const {
-  CHECK(!ShapeUtil::IsTuple(shape())) << "tuple is not supported for reshape";
+  CHECK(ShapeUtil::IsArray(shape())) << "tuple is not supported for slice";
 
   DimensionVector result_dimensions;
   for (int64 dnum = 0; dnum < ShapeUtil::Rank(shape()); ++dnum) {
@@ -484,13 +813,11 @@ std::unique_ptr<Literal> Literal::Slice(
     CHECK_GT(dimension, 0);
     result_dimensions.push_back(dimension);
   }
-  const auto result_shape = ShapeUtil::MakeShapeWithLayout(
-      shape().element_type(), result_dimensions,
-      AsInt64Slice(shape().layout().minor_to_major()));
+  const auto result_shape =
+      ShapeUtil::MakeShapeWithLayout(shape().element_type(), result_dimensions,
+                                     LayoutUtil::MinorToMajor(shape()));
 
-  auto result_literal = MakeUnique<Literal>();
-  *result_literal->mutable_shape() = result_shape;
-  result_literal->Reserve(ShapeUtil::ElementsIn(result_shape));
+  auto result_literal = MakeUnique<Literal>(result_shape);
 
   DimensionVector new_indices(ShapeUtil::Rank(result_shape));
   switch (result_shape.element_type()) {
@@ -504,6 +831,16 @@ std::unique_ptr<Literal> Literal::Slice(
             result_literal->Set<float>(indices, value);
           });
       return result_literal;
+    case C64:
+      result_literal->EachCell<complex64>(
+          [&](tensorflow::gtl::ArraySlice<int64> indices, complex64 /*value*/) {
+            for (int64 i = 0; i < ShapeUtil::Rank(result_shape); ++i) {
+              new_indices[i] = indices[i] + start_indices[i];
+            }
+            complex64 value = Get<complex64>(new_indices);
+            result_literal->Set<complex64>(indices, value);
+          });
+      return result_literal;
     case S32:
       result_literal->EachCell<int32>(
           [&](tensorflow::gtl::ArraySlice<int64> indices, int32 /*value*/) {
@@ -530,48 +867,116 @@ std::unique_ptr<Literal> Literal::Slice(
   }
 }
 
+Literal Literal::Clone() const {
+  Literal result(shape());
+  TF_CHECK_OK(result.CopyFrom(*this));
+  return result;
+}
+
 std::unique_ptr<Literal> Literal::CloneToUnique() const {
-  auto unique = MakeUnique<Literal>();
-  *unique = *this;
-  return unique;
+  auto result = MakeUnique<Literal>(shape());
+  TF_CHECK_OK(result->CopyFrom(*this));
+  return result;
 }
 
-string Literal::GetAsString(
-    tensorflow::gtl::ArraySlice<int64> multi_index) const {
-  switch (shape().element_type()) {
+string Literal::GetAsString(tensorflow::gtl::ArraySlice<int64> multi_index,
+                            const ShapeIndex& shape_index) const {
+  const Shape& subshape = ShapeUtil::GetSubshape(shape(), shape_index);
+  CHECK(LayoutUtil::IsDenseArray(subshape));
+  switch (subshape.element_type()) {
     case PRED:
-      return Get<bool>(multi_index) ? "true" : "false";
-    case U8:
-      return tensorflow::strings::StrCat(Get<uint8>(multi_index));
+      return Get<bool>(multi_index, shape_index) ? "true" : "false";
+    case S8:
+      return StrCat(Get<int8>(multi_index, shape_index));
+    case S16:
+      return StrCat(Get<int16>(multi_index, shape_index));
     case S32:
-      return tensorflow::strings::StrCat(Get<int32>(multi_index));
+      return StrCat(Get<int32>(multi_index, shape_index));
     case S64:
-      return tensorflow::strings::StrCat(Get<int64>(multi_index));
+      return StrCat(Get<int64>(multi_index, shape_index));
+    case U8:
+      return StrCat(Get<uint8>(multi_index, shape_index));
+    case U16:
+      return StrCat(Get<uint16>(multi_index, shape_index));
     case U32:
-      return tensorflow::strings::StrCat(Get<uint32>(multi_index));
+      return StrCat(Get<uint32>(multi_index, shape_index));
     case U64:
-      return tensorflow::strings::StrCat(Get<uint64>(multi_index));
+      return StrCat(Get<uint64>(multi_index, shape_index));
+    case F16:
+      return StrCat(Get<half>(multi_index, shape_index));
     case F32:
-      return tensorflow::strings::StrCat(Get<float>(multi_index));
+      return StrCat(Get<float>(multi_index, shape_index));
+    case BF16:
+      return StrCat(
+          static_cast<float>(Get<bfloat16>(multi_index, shape_index)));
     case F64:
-      return tensorflow::strings::StrCat(Get<double>(multi_index));
+      return StrCat(Get<double>(multi_index, shape_index));
     case C64: {
-      complex64 c = Get<complex64>(multi_index);
-      return tensorflow::strings::StrCat("(", c.real(), ", ", c.imag(), ")");
+      complex64 c = Get<complex64>(multi_index, shape_index);
+      return StrCat("(", c.real(), ", ", c.imag(), ")");
     }
+    default:
+      LOG(FATAL) << PrimitiveType_Name(subshape.element_type());
+  }
+}
+
+string Literal::GetSparseElementAsString(int64 sparse_element_number,
+                                         const ShapeIndex& shape_index) const {
+  const Shape& subshape = ShapeUtil::GetSubshape(shape(), shape_index);
+  CHECK(LayoutUtil::IsSparseArray(subshape));
+  switch (subshape.element_type()) {
+    case PRED:
+      return GetSparseElement<bool>(sparse_element_number, shape_index)
+                 ? "true"
+                 : "false";
+    case S8:
+      return StrCat(GetSparseElement<int8>(sparse_element_number, shape_index));
+    case S16:
+      return StrCat(
+          GetSparseElement<int16>(sparse_element_number, shape_index));
+    case S32:
+      return StrCat(
+          GetSparseElement<int32>(sparse_element_number, shape_index));
+    case S64:
+      return StrCat(
+          GetSparseElement<int64>(sparse_element_number, shape_index));
+    case U8:
+      return StrCat(
+          GetSparseElement<uint8>(sparse_element_number, shape_index));
+    case U16:
+      return StrCat(
+          GetSparseElement<uint16>(sparse_element_number, shape_index));
+    case U32:
+      return StrCat(
+          GetSparseElement<uint32>(sparse_element_number, shape_index));
+    case U64:
+      return StrCat(
+          GetSparseElement<uint64>(sparse_element_number, shape_index));
     case F16:
-      return tensorflow::strings::StrCat(Get<half>(multi_index));
+      return StrCat(GetSparseElement<half>(sparse_element_number, shape_index));
+    case F32:
+      return StrCat(
+          GetSparseElement<float>(sparse_element_number, shape_index));
     case BF16:
-      return tensorflow::strings::StrCat(
-          static_cast<float>(Get<bfloat16>(multi_index)));
+      return StrCat(static_cast<float>(
+          GetSparseElement<bfloat16>(sparse_element_number, shape_index)));
+    case F64:
+      return StrCat(
+          GetSparseElement<double>(sparse_element_number, shape_index));
+    case C64: {
+      complex64 c =
+          GetSparseElement<complex64>(sparse_element_number, shape_index);
+      return StrCat("(", c.real(), ", ", c.imag(), ")");
+    }
     default:
-      return tensorflow::strings::StrCat(
-          "[", PrimitiveType_Name(shape().element_type()), "]");
+      LOG(FATAL) << "Invalid element type for sparse arrays: "
+                 << PrimitiveType_Name(subshape.element_type());
   }
 }
 
 StatusOr<int64> Literal::GetIntegralAsS64(
     tensorflow::gtl::ArraySlice<int64> multi_index) const {
+  CHECK(LayoutUtil::IsDenseArray(shape()));
   switch (shape().element_type()) {
     case PRED:
       return Get<bool>(multi_index);
@@ -592,13 +997,83 @@ StatusOr<int64> Literal::GetIntegralAsS64(
   }
 }
 
-int64 Literal::LinearIndex(
-    tensorflow::gtl::ArraySlice<int64> multi_index) const {
-  return IndexUtil::MultidimensionalIndexToLinearIndex(shape(), multi_index);
+tensorflow::gtl::ArraySlice<int64> Literal::GetSparseIndex(
+    int64 sparse_element_number, const ShapeIndex& shape_index) const {
+  const Piece& p = piece(shape_index);
+  CHECK_GE(sparse_element_number, 0);
+  CHECK_LT(sparse_element_number, p.sparse_indices()->index_count());
+  return p.sparse_indices()->At(sparse_element_number);
 }
 
-string Literal::ToString(bool print_layout) const {
-  std::vector<string> pieces;
+void Literal::SortSparseElements(const ShapeIndex& shape_index) {
+  piece(shape_index).SortSparseElements();
+}
+
+void Literal::Piece::SortSparseElements() {
+  switch (subshape().element_type()) {
+    case PRED:
+      SortSparseElementsInternal<bool>();
+      break;
+    case S8:
+      SortSparseElementsInternal<int8>();
+      break;
+    case U8:
+      SortSparseElementsInternal<uint8>();
+      break;
+    case S16:
+      SortSparseElementsInternal<int16>();
+      break;
+    case U16:
+      SortSparseElementsInternal<uint16>();
+      break;
+    case S32:
+      SortSparseElementsInternal<int32>();
+      break;
+    case U32:
+      SortSparseElementsInternal<uint32>();
+      break;
+    case S64:
+      SortSparseElementsInternal<int64>();
+      break;
+    case U64:
+      SortSparseElementsInternal<uint64>();
+      break;
+    case F32:
+      SortSparseElementsInternal<float>();
+      break;
+    case F64:
+      SortSparseElementsInternal<double>();
+      break;
+    case C64:
+      SortSparseElementsInternal<complex64>();
+      break;
+    case F16:
+      SortSparseElementsInternal<half>();
+      break;
+    case BF16:
+      SortSparseElementsInternal<bfloat16>();
+      break;
+    default:
+      LOG(FATAL) << "Element type not valid for sparse array: "
+                 << PrimitiveType_Name(subshape().element_type());
+  }
+}
+
+template <typename NativeT>
+void Literal::Piece::SortSparseElementsInternal() {
+  CHECK(LayoutUtil::IsSparseArray(subshape()));
+  int64 num_elements = sparse_indices()->index_count();
+  auto values = data<NativeT>();
+  CHECK_LE(num_elements, values.size());
+  sparse_indices()->SortWithValues(
+      tensorflow::gtl::MutableArraySlice<NativeT>(values.data(), num_elements));
+}
+
+namespace {
+
+void ToStringHelper(const Literal& literal, const ShapeIndex& shape_index,
+                    bool print_layout, std::vector<string>* pieces) {
+  const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index);
 
   auto shape_to_string = [print_layout](const Shape& shape) {
     if (print_layout) {
@@ -608,322 +1083,236 @@ string Literal::ToString(bool print_layout) const {
     }
   };
 
+  // TODO(b/32894291): refactor this code to reduce code duplication.
+  if (ShapeUtil::IsTuple(subshape)) {
+    pieces->push_back(shape_to_string(subshape));
+    pieces->push_back(" (\n");
+    std::vector<string> tuple_pieces;
+    for (int i = 0; i < ShapeUtil::TupleElementCount(subshape); ++i) {
+      ShapeIndex element_index = shape_index;
+      element_index.push_back(i);
+      std::vector<string> element_pieces;
+      ToStringHelper(literal, element_index, print_layout, &element_pieces);
+      tuple_pieces.push_back(tensorflow::str_util::Join(element_pieces, ""));
+    }
+    pieces->push_back(tensorflow::str_util::Join(tuple_pieces, ",\n"));
+    pieces->push_back("\n)");
+    return;
+  }
+
+  if (LayoutUtil::IsSparseArray(subshape)) {
+    pieces->push_back(shape_to_string(subshape));
+    pieces->push_back("{");
+    int64 rank = ShapeUtil::Rank(subshape);
+    int64 num_elements = literal.sparse_element_count();
+    for (int64 i = 0; i < num_elements; ++i) {
+      if (i > 0) {
+        pieces->push_back(", ");
+      }
+      if (rank == 1) {
+        pieces->push_back(StrCat(literal.GetSparseIndex(i)[0]));
+        pieces->push_back(": ");
+      } else {
+        pieces->push_back("[");
+        pieces->push_back(
+            tensorflow::str_util::Join(literal.GetSparseIndex(i), ", "));
+        pieces->push_back("]: ");
+      }
+      pieces->push_back(literal.GetSparseElementAsString(i));
+    }
+    pieces->push_back("}");
+    return;
+  }
+
+  CHECK(LayoutUtil::IsDenseArray(subshape));
+
   auto element_to_string =
-      [this](tensorflow::gtl::ArraySlice<int64> indices) -> string {
-    PrimitiveType element_type = shape().element_type();
+      [&](tensorflow::gtl::ArraySlice<int64> indices) -> string {
+    PrimitiveType element_type = subshape.element_type();
     if (element_type == PRED) {
       // We display predicates in a densely packed form.
-      return Get<bool>(indices) ? "1" : "0";
+      return literal.Get<bool>(indices, shape_index) ? "1" : "0";
     }
     return ((!indices.empty() && indices.back() > 0) ? ", " : "") +
-           GetAsString(indices);
+           literal.GetAsString(indices, shape_index);
   };
 
-  // TODO(b/32894291): refactor this code to reduce code duplication.
-  if (ShapeUtil::IsTuple(shape())) {
-    pieces.push_back(shape_to_string(shape()));
-    pieces.push_back(" (\n");
-    pieces.push_back(tensorflow::str_util::Join(
-        tuple_literals(), ",\n", [](string* out, const Literal& element) {
-          tensorflow::strings::StrAppend(out, element.ToString());
-        }));
-    pieces.push_back("\n)");
-  } else if (ShapeUtil::Rank(shape()) == 0) {
-    pieces.push_back(GetAsString({}));
-  } else if (ShapeUtil::Rank(shape()) == 1) {
-    pieces.push_back("{");
-    for (int64 i0 = 0; i0 < shape().dimensions(0); ++i0) {
-      pieces.push_back(element_to_string({i0}));
+  if (ShapeUtil::Rank(subshape) == 0) {
+    pieces->push_back(literal.GetAsString({}, shape_index));
+  } else if (ShapeUtil::Rank(subshape) == 1) {
+    pieces->push_back("{");
+    for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) {
+      pieces->push_back(element_to_string({i0}));
     }
-    pieces.push_back("}");
-  } else if (ShapeUtil::Rank(shape()) == 2) {
-    pieces.push_back(shape_to_string(shape()));
-    pieces.push_back(" {\n");
-    for (int64 i0 = 0; i0 < shape().dimensions(0); ++i0) {
-      pieces.push_back("  { ");
-      for (int64 i1 = 0; i1 < shape().dimensions(1); ++i1) {
-        pieces.push_back(element_to_string({i0, i1}));
+    pieces->push_back("}");
+  } else if (ShapeUtil::Rank(subshape) == 2) {
+    pieces->push_back(shape_to_string(subshape));
+    pieces->push_back(" {\n");
+    for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) {
+      pieces->push_back("  { ");
+      for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) {
+        pieces->push_back(element_to_string({i0, i1}));
       }
-      pieces.push_back(" ");
-      pieces.push_back(i0 == shape().dimensions(0) - 1 ? "}\n" : "},\n");
+      pieces->push_back(" ");
+      pieces->push_back(i0 == subshape.dimensions(0) - 1 ? "}\n" : "},\n");
     }
-    pieces.push_back("}");
-  } else if (ShapeUtil::Rank(shape()) == 3) {
-    pieces.push_back(shape_to_string(shape()));
-    pieces.push_back(" {\n");
-    for (int64 i0 = 0; i0 < shape().dimensions(0); ++i0) {
-      pieces.push_back(i0 > 0 ? ",\n{" : "{");
-      for (int64 i1 = 0; i1 < shape().dimensions(1); ++i1) {
-        pieces.push_back(i1 > 0 ? ",\n  { " : " { ");
-        for (int64 i2 = 0; i2 < shape().dimensions(2); ++i2) {
-          pieces.push_back(element_to_string({i0, i1, i2}));
+    pieces->push_back("}");
+  } else if (ShapeUtil::Rank(subshape) == 3) {
+    pieces->push_back(shape_to_string(subshape));
+    pieces->push_back(" {\n");
+    for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) {
+      pieces->push_back(i0 > 0 ? ",\n{" : "{");
+      for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) {
+        pieces->push_back(i1 > 0 ? ",\n  { " : " { ");
+        for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) {
+          pieces->push_back(element_to_string({i0, i1, i2}));
         }
-        pieces.push_back(" }");
+        pieces->push_back(" }");
       }
-      pieces.push_back(" }");
+      pieces->push_back(" }");
     }
-    pieces.push_back("\n}");
-  } else if (ShapeUtil::Rank(shape()) == 4) {
-    pieces.push_back(shape_to_string(shape()));
-    pieces.push_back(" {\n");
-    for (int64 i0 = 0; i0 < shape().dimensions(0); ++i0) {
-      pieces.push_back(tensorflow::strings::Printf("  {  /*i0=%lld*/\n", i0));
-      for (int64 i1 = 0; i1 < shape().dimensions(1); ++i1) {
-        pieces.push_back(
-            tensorflow::strings::Printf("    {  /*i1=%lld*/\n", i1));
-        for (int64 i2 = 0; i2 < shape().dimensions(2); ++i2) {
-          pieces.push_back("      {");
-          for (int64 i3 = 0; i3 < shape().dimensions(3); ++i3) {
-            pieces.push_back(element_to_string({i0, i1, i2, i3}));
+    pieces->push_back("\n}");
+  } else if (ShapeUtil::Rank(subshape) == 4) {
+    pieces->push_back(shape_to_string(subshape));
+    pieces->push_back(" {\n");
+    for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) {
+      pieces->push_back(Printf("  {  /*i0=%lld*/\n", i0));
+      for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) {
+        pieces->push_back(Printf("    {  /*i1=%lld*/\n", i1));
+        for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) {
+          pieces->push_back("      {");
+          for (int64 i3 = 0; i3 < subshape.dimensions(3); ++i3) {
+            pieces->push_back(element_to_string({i0, i1, i2, i3}));
           }
-          pieces.push_back(i2 == shape().dimensions(2) - 1 ? "}\n" : "},\n");
+          pieces->push_back(i2 == subshape.dimensions(2) - 1 ? "}\n" : "},\n");
         }
-        pieces.push_back(i1 == shape().dimensions(1) - 1 ? "    }\n"
-                                                         : "    },\n");
+        pieces->push_back(i1 == subshape.dimensions(1) - 1 ? "    }\n"
+                                                           : "    },\n");
       }
-      pieces.push_back(i0 == shape().dimensions(0) - 1 ? "  }\n" : "  },\n");
+      pieces->push_back(i0 == subshape.dimensions(0) - 1 ? "  }\n" : "  },\n");
     }
-    pieces.push_back("}");
-  } else if (ShapeUtil::Rank(shape()) == 5) {
-    pieces.push_back(shape_to_string(shape()));
-    pieces.push_back(" {\n");
-    for (int64 i0 = 0; i0 < shape().dimensions(0); ++i0) {
-      pieces.push_back(tensorflow::strings::Printf("  {  /*i0=%lld*/\n", i0));
-      for (int64 i1 = 0; i1 < shape().dimensions(1); ++i1) {
-        pieces.push_back(
-            tensorflow::strings::Printf("    {  /*i1=%lld*/\n", i1));
-        for (int64 i2 = 0; i2 < shape().dimensions(2); ++i2) {
-          pieces.push_back(
-              tensorflow::strings::Printf("      {  /*i2=%lld*/\n", i2));
-          for (int64 i3 = 0; i3 < shape().dimensions(3); ++i3) {
-            pieces.push_back("        {");
-            for (int64 i4 = 0; i4 < shape().dimensions(4); ++i4) {
-              pieces.push_back(element_to_string({i0, i1, i2, i3, i4}));
+    pieces->push_back("}");
+  } else if (ShapeUtil::Rank(subshape) == 5) {
+    pieces->push_back(shape_to_string(subshape));
+    pieces->push_back(" {\n");
+    for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) {
+      pieces->push_back(Printf("  {  /*i0=%lld*/\n", i0));
+      for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) {
+        pieces->push_back(Printf("    {  /*i1=%lld*/\n", i1));
+        for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) {
+          pieces->push_back(Printf("      {  /*i2=%lld*/\n", i2));
+          for (int64 i3 = 0; i3 < subshape.dimensions(3); ++i3) {
+            pieces->push_back("        {");
+            for (int64 i4 = 0; i4 < subshape.dimensions(4); ++i4) {
+              pieces->push_back(element_to_string({i0, i1, i2, i3, i4}));
             }
-            pieces.push_back(i3 == shape().dimensions(3) - 1 ? "}\n" : "},\n");
+            pieces->push_back(i3 == subshape.dimensions(3) - 1 ? "}\n"
+                                                               : "},\n");
           }
-          pieces.push_back(i2 == shape().dimensions(2) - 1 ? "      }\n"
-                                                           : "      },\n");
+          pieces->push_back(i2 == subshape.dimensions(2) - 1 ? "      }\n"
+                                                             : "      },\n");
         }
-        pieces.push_back(i1 == shape().dimensions(1) - 1 ? "    }\n"
-                                                         : "    },\n");
+        pieces->push_back(i1 == subshape.dimensions(1) - 1 ? "    }\n"
+                                                           : "    },\n");
       }
-      pieces.push_back(i0 == shape().dimensions(0) - 1 ? "  }\n" : "  },\n");
+      pieces->push_back(i0 == subshape.dimensions(0) - 1 ? "  }\n" : "  },\n");
     }
-    pieces.push_back("}");
+    pieces->push_back("}");
   } else {
-    pieces.push_back(shape_to_string(shape()));
-    pieces.push_back(" {...}");
+    pieces->push_back(shape_to_string(subshape));
+    pieces->push_back(" {");
+    literal.EachCellAsString(
+        [&](tensorflow::gtl::ArraySlice<int64> indices, const string& value) {
+          pieces->push_back(" ");
+          pieces->push_back(value);
+        });
+    pieces->push_back("}");
   }
+}
+
+}  // namespace
 
+int64 Literal::sparse_element_count() const {
+  CHECK(LayoutUtil::IsSparseArray(shape()));
+  return sparse_indices()->index_count();
+}
+
+string Literal::ToString(bool print_layout) const {
+  std::vector<string> pieces;
+  ToStringHelper(*this, {}, print_layout, &pieces);
   return tensorflow::str_util::Join(pieces, "");
 }
 
 /* static */ std::unique_ptr<Literal> Literal::MakeTuple(
     tensorflow::gtl::ArraySlice<const Literal*> elements) {
-  auto literal = MakeUnique<Literal>();
-  std::vector<Shape> shape;
-  for (const Literal* tuple_element : elements) {
-    *literal->add_tuple_literals() = *tuple_element;
-    shape.push_back(tuple_element->shape());
+  std::vector<Shape> element_shapes;
+  for (const Literal* element : elements) {
+    element_shapes.push_back(element->shape());
+  }
+  auto literal = MakeUnique<Literal>(ShapeUtil::MakeTupleShape(element_shapes));
+  for (int i = 0; i < elements.size(); ++i) {
+    TF_CHECK_OK(literal->CopyFrom(*elements[i], /*dest_shape_index=*/{i}));
   }
-  *literal->mutable_shape() = ShapeUtil::MakeTupleShape(shape);
   return literal;
 }
 
 /* static */ std::unique_ptr<Literal> Literal::MakeTupleOwned(
     std::vector<std::unique_ptr<Literal>> elements) {
-  auto literal = MakeUnique<Literal>();
-  std::vector<Shape> shape;
-  for (auto& tuple_element : elements) {
-    shape.push_back(tuple_element->shape());
-    *literal->add_tuple_literals() = std::move(*tuple_element);
+  std::vector<Shape> element_shapes;
+  element_shapes.reserve(elements.size());
+  for (const auto& element : elements) {
+    element_shapes.push_back(element->shape());
+  }
+  auto literal = MakeUnique<Literal>(ShapeUtil::MakeTupleShape(element_shapes));
+  for (int64 i = 0; i < elements.size(); ++i) {
+    TF_CHECK_OK(
+        literal->MoveFrom(std::move(*elements[i]), /*dest_shape_index=*/{i}));
   }
-  *literal->mutable_shape() = ShapeUtil::MakeTupleShape(shape);
   return literal;
 }
 
-const void* Literal::InternalData() const {
-  return const_cast<const void*>(
-      const_cast<Literal*>(this)->MutableInternalData());
+void Literal::EachCellAsString(
+    const std::function<void(tensorflow::gtl::ArraySlice<int64> indices,
+                             const string& value)>& per_cell) const {
+  if (ShapeUtil::HasZeroElements(shape())) {
+    return;
+  }
+  std::vector<int64> indices = IndexUtil::LinearIndexToMultidimensionalIndex(
+      shape(), /*linear_index=*/0);
+  do {
+    per_cell(indices, GetAsString(indices));
+  } while (IndexUtil::BumpIndices(shape(), &indices));
 }
 
-void* Literal::MutableInternalData() {
-  // NOTE: We access the vectors directly to avoid the const reference
-  // created by the accessor functions.
-  switch (shape().element_type()) {
-    case PRED:
-    case U8:
-      return reinterpret_cast<void*>(u8s_.data());
-    case S32:
-      return reinterpret_cast<void*>(s32s_.data());
-    case S64:
-      return reinterpret_cast<void*>(s64s_.data());
-    case U32:
-      return reinterpret_cast<void*>(u32s_.data());
-    case U64:
-      return reinterpret_cast<void*>(u64s_.data());
-    case F32:
-      return reinterpret_cast<void*>(f32s_.data());
-    case F64:
-      return reinterpret_cast<void*>(f64s_.data());
-    case C64:
-      return reinterpret_cast<void*>(c64s_.data());
-    case F16:
-      return reinterpret_cast<void*>(f16s_.data());
-    case BF16:
-      return reinterpret_cast<void*>(bf16s_.data());
-    default:
-      LOG(FATAL) << "primitive type not supported in literals: "
-                 << PrimitiveType_Name(shape().element_type());
+namespace {
+template <typename NativeSrcT, typename NativeDestT>
+std::unique_ptr<Literal> ConvertBetweenNativeTypes(const Literal& src_literal) {
+  CHECK(ShapeUtil::IsArray(src_literal.shape()));
+  auto result_literal = MakeUnique<Literal>(ShapeUtil::ChangeElementType(
+      src_literal.shape(),
+      primitive_util::NativeToPrimitiveType<NativeDestT>()));
+  auto src_data = src_literal.data<NativeSrcT>();
+  auto dest_data = result_literal->template data<NativeDestT>();
+  int64 num_elements = src_literal.element_count();
+
+  for (int64 i = 0; i < num_elements; ++i) {
+    dest_data[i] = static_cast<NativeDestT>(src_data[i]);
   }
-}
-
-void Literal::Reserve(int64 num_elements) {
-  CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements);
-  switch (shape().element_type()) {
-    case PRED:
-      Resize<bool>(num_elements, false);
-      break;
-    case S8:
-      Resize<int8>(num_elements, 0);
-      break;
-    case U8:
-      Resize<uint8>(num_elements, 0);
-      break;
-    case S32:
-      Resize<int32>(num_elements, 0);
-      break;
-    case S64:
-      Resize<int64>(num_elements, 0);
-      break;
-    case U32:
-      Resize<uint32>(num_elements, 0);
-      break;
-    case U64:
-      Resize<uint64>(num_elements, 0);
-      break;
-    case F32:
-      Resize<float>(num_elements, 0);
-      break;
-    case F64:
-      Resize<double>(num_elements, 0);
-      break;
-    case C64:
-      Resize<complex64>(num_elements, 0);
-      break;
-    case F16:
-      Resize<half>(num_elements, static_cast<half>(0.0f));
-      break;
-    case BF16:
-      Resize<bfloat16>(num_elements, static_cast<bfloat16>(0.0f));
-      break;
-    default:
-      LOG(FATAL) << "primitive type not supported in literals: "
-                 << PrimitiveType_Name(shape().element_type());
-  }
-}
-
-tensorflow::Status Literal::ValidateLiteral() const {
-  TF_CHECK_OK(ShapeUtil::ValidateShape(shape()));
-  int64 expected = ShapeUtil::ElementsIn(shape());
-  int64 actual = -1;
-  switch (shape().element_type()) {
-    case PRED:
-    case U8:
-      actual = u8s_size();
-      break;
-    case S32:
-      actual = s32s_size();
-      break;
-    case U32:
-      actual = u32s_size();
-      break;
-    case S64:
-      actual = s64s_size();
-      break;
-    case U64:
-      actual = u64s_size();
-      break;
-    case F32:
-      actual = f32s_size();
-      break;
-    case F64:
-      actual = f64s_size();
-      break;
-    case C64:
-      actual = c64s_size();
-      break;
-    case F16:
-      actual = f16s().size() / sizeof(half);
-      break;
-    case BF16:
-      actual = bf16s().size();
-      break;
-    default:
-      return tensorflow::errors::Unimplemented(
-          "unhandled element type for literal validation: " +
-          PrimitiveType_Name(shape().element_type()));
-  }
-
-  if (expected != actual) {
-    return tensorflow::errors::InvalidArgument(tensorflow::strings::Printf(
-        "literal has bad number of elements for its shape %s: want %lld "
-        "got %lld",
-        ShapeUtil::HumanString(shape()).c_str(), expected, actual));
-  }
-
-  return tensorflow::Status::OK();
-}
-
-void Literal::EachCellAsString(
-    const std::function<void(tensorflow::gtl::ArraySlice<int64> indices,
-                             const string& value)>& per_cell) const {
-  if (ShapeUtil::HasZeroElements(shape())) {
-    return;
-  }
-  std::vector<int64> indices = IndexUtil::LinearIndexToMultidimensionalIndex(
-      shape(), /*linear_index=*/0);
-  do {
-    per_cell(indices, GetAsString(indices));
-  } while (IndexUtil::BumpIndices(shape(), &indices));
-}
-
-namespace {
-template <typename NativeSrcT, typename NativeDestT>
-std::unique_ptr<Literal> ConvertBetweenNativeTypes(const Literal& src_literal) {
-  auto result_literal = MakeUnique<Literal>();
-  Shape* result_shape = result_literal->mutable_shape();
-  *result_shape = src_literal.shape();
-  result_shape->set_element_type(
-      primitive_util::NativeToPrimitiveType<NativeDestT>());
-  result_literal->Reserve(ShapeUtil::ElementsIn(*result_shape));
-  tensorflow::gtl::ArraySlice<NativeSrcT> src_data =
-      src_literal.GetArraySlice<NativeSrcT>();
-  tensorflow::gtl::MutableArraySlice<NativeDestT> dest_data =
-      result_literal->GetMutableArraySlice<NativeDestT>();
-  int64 num_elements = ShapeUtil::ElementsIn(src_literal.shape());
-
-  for (int64 i = 0; i < num_elements; ++i) {
-    dest_data[i] = static_cast<NativeDestT>(src_data[i]);
-  }
-  return result_literal;
+  return result_literal;
 }
 
 template <PrimitiveType primitive_src_type>
 std::unique_ptr<Literal> ConvertToC64(const Literal& src_literal) {
-  auto result_literal = MakeUnique<Literal>();
-  Shape* result_shape = result_literal->mutable_shape();
-  *result_shape = src_literal.shape();
-  result_shape->set_element_type(C64);
-  result_literal->Reserve(ShapeUtil::ElementsIn(*result_shape));
+  CHECK(ShapeUtil::IsArray(src_literal.shape()));
+  auto result_literal = MakeUnique<Literal>(
+      ShapeUtil::ChangeElementType(src_literal.shape(), C64));
   using NativeSrcT =
       typename primitive_util::PrimitiveTypeToNative<primitive_src_type>::type;
   tensorflow::gtl::ArraySlice<NativeSrcT> src_data =
-      src_literal.GetArraySlice<NativeSrcT>();
+      src_literal.data<NativeSrcT>();
   tensorflow::gtl::MutableArraySlice<complex64> dest_data =
-      result_literal->GetMutableArraySlice<complex64>();
-  int64 num_elements = ShapeUtil::ElementsIn(src_literal.shape());
+      result_literal->data<complex64>();
+  int64 num_elements = src_literal.element_count();
   for (int64 i = 0; i < num_elements; ++i) {
     dest_data[i] = complex64(static_cast<float>(src_data[i]), 0);
   }
@@ -968,10 +1357,12 @@ StatusOr<std::unique_ptr<Literal>> ConvertIfDestTypeMatches(
           PrimitiveType_Name(primitive_dest_type).c_str());
   }
 }
+
 }  // namespace
 
 StatusOr<std::unique_ptr<Literal>> Literal::Convert(
     PrimitiveType primitive_dest_type) const {
+  TF_RET_CHECK(ShapeUtil::IsArray(shape()));
   switch (shape().element_type()) {
 #define CONVERT_IF_DEST_TYPE_MATCHES(type) \
   case (type):                             \
@@ -996,356 +1387,192 @@ StatusOr<std::unique_ptr<Literal>> Literal::Convert(
   }
 }
 
-namespace {
-
-// Helper function which compares whether the elements of literal1 are equal to
-// the elements of literal2. Recursively iterates through the entire
-// multidimensional index space and compares the literal elements
-// one-by-one. literal1 and literal2 must be compatible (same dimensions and
-// type).
 template <typename NativeT>
-bool EqualElements(const Literal& literal1, const Literal& literal2,
-                   int dimension, std::vector<int64>* multi_index) {
-  if (dimension == ShapeUtil::Rank(literal1.shape())) {
-    return (literal1.Get<NativeT>(*multi_index) ==
-            literal2.Get<NativeT>(*multi_index));
-  }
-  for (int64 i = 0; i < literal1.shape().dimensions(dimension); ++i) {
-    (*multi_index)[dimension] = i;
-    if (!EqualElements<NativeT>(literal1, literal2, dimension + 1,
-                                multi_index)) {
+bool Literal::Piece::EqualElementsInternal(
+    const Literal::Piece& other, std::vector<int64>* multi_index) const {
+  if (multi_index->size() == ShapeUtil::Rank(subshape())) {
+    return (Get<NativeT>(*multi_index) == other.Get<NativeT>(*multi_index));
+  }
+  for (int64 i = 0; i < subshape().dimensions(multi_index->size()); ++i) {
+    multi_index->push_back(i);
+    if (!EqualElementsInternal<NativeT>(other, multi_index)) {
       return false;
     }
+    multi_index->pop_back();
   }
   return true;
 }
 
-}  // namespace
+bool Literal::Piece::EqualElements(const Literal::Piece& other) const {
+  DCHECK(ShapeUtil::Compatible(subshape(), other.subshape()));
+
+  std::vector<int64> multi_index;
+  switch (subshape().element_type()) {
+    case PRED:
+      return EqualElementsInternal<bool>(other, &multi_index);
+    case U8:
+      return EqualElementsInternal<uint8>(other, &multi_index);
+    case S32:
+      return EqualElementsInternal<int32>(other, &multi_index);
+    case S64:
+      return EqualElementsInternal<int64>(other, &multi_index);
+    case U32:
+      return EqualElementsInternal<uint32>(other, &multi_index);
+    case U64:
+      return EqualElementsInternal<uint64>(other, &multi_index);
+    case F32:
+      return EqualElementsInternal<float>(other, &multi_index);
+    case F64:
+      return EqualElementsInternal<double>(other, &multi_index);
+    case F16:
+      return EqualElementsInternal<half>(other, &multi_index);
+    case BF16:
+      return EqualElementsInternal<bfloat16>(other, &multi_index);
+    case C64:
+      return EqualElementsInternal<complex64>(other, &multi_index);
+    default:
+      LOG(FATAL) << "Unimplemented: Literal::Piece::EqualElements for type "
+                 << PrimitiveType_Name(subshape().element_type());
+  }
+}
 
 bool Literal::operator==(const Literal& other) const {
   if (!ShapeUtil::Compatible(shape(), other.shape())) {
     return false;
   }
-  if (ShapeUtil::IsTuple(shape())) {
-    // Because the shapes are compatible, they must have the same number of
-    // tuple elements.
-    CHECK_EQ(tuple_literals_size(), other.tuple_literals_size());
-    for (int i = 0; i < tuple_literals_size(); ++i) {
-      if (tuple_literals(i) != other.tuple_literals(i)) {
-        return false;
-      }
+  for (const auto& pair : pieces_) {
+    const ShapeIndex& index = pair.first;
+    const Piece& piece = pair.second;
+    if (!ShapeUtil::IsArray(piece.subshape())) {
+      continue;
     }
-    return true;
-  } else {
-    std::vector<int64> multi_index(ShapeUtil::Rank(shape()), 0);
-    switch (shape().element_type()) {
-      case PRED:
-        return EqualElements<bool>(*this, other, 0, &multi_index);
-      case U8:
-        return EqualElements<uint8>(*this, other, 0, &multi_index);
-      case S32:
-        return EqualElements<int32>(*this, other, 0, &multi_index);
-      case S64:
-        return EqualElements<int64>(*this, other, 0, &multi_index);
-      case U32:
-        return EqualElements<uint32>(*this, other, 0, &multi_index);
-      case U64:
-        return EqualElements<uint64>(*this, other, 0, &multi_index);
-      case F32:
-        return EqualElements<float>(*this, other, 0, &multi_index);
-      case F64:
-        return EqualElements<double>(*this, other, 0, &multi_index);
-      case F16:
-        return EqualElements<half>(*this, other, 0, &multi_index);
-      case BF16:
-        return EqualElements<bfloat16>(*this, other, 0, &multi_index);
-      case C64:
-        return EqualElements<complex64>(*this, other, 0, &multi_index);
-      default:
-        LOG(FATAL) << "Unimplemented: Literal::Equal for type "
-                   << PrimitiveType_Name(shape().element_type());
+
+    const Piece& other_piece = other.piece(index);
+    if (!piece.EqualElements(other_piece)) {
+      return false;
     }
   }
+  return true;
 }
 
-template <>
-tensorflow::gtl::MutableArraySlice<bool> Literal::GetMutableArraySlice() {
-  auto values = mutable_preds();
-  return tensorflow::gtl::MutableArraySlice<bool>(
-      reinterpret_cast<bool*>(values->data()), values->size());
-}
-
-template <>
-tensorflow::gtl::MutableArraySlice<int8> Literal::GetMutableArraySlice() {
-  auto values = mutable_u8s();
-  return tensorflow::gtl::MutableArraySlice<int8>(
-      reinterpret_cast<int8*>(values->data()), values->size());
-}
-
-template <>
-tensorflow::gtl::MutableArraySlice<uint8> Literal::GetMutableArraySlice() {
-  auto values = mutable_u8s();
-  return tensorflow::gtl::MutableArraySlice<uint8>(values->data(),
-                                                   values->size());
-}
-
-template <>
-tensorflow::gtl::MutableArraySlice<int16> Literal::GetMutableArraySlice() {
-  auto values = mutable_s16s();
-  return tensorflow::gtl::MutableArraySlice<int16>(values->data(),
-                                                   values->size());
-}
-
-template <>
-tensorflow::gtl::MutableArraySlice<uint16> Literal::GetMutableArraySlice() {
-  auto values = mutable_u16s();
-  return tensorflow::gtl::MutableArraySlice<uint16>(values->data(),
-                                                    values->size());
-}
-
-template <>
-tensorflow::gtl::MutableArraySlice<int32> Literal::GetMutableArraySlice() {
-  auto values = mutable_s32s();
-  return tensorflow::gtl::MutableArraySlice<int32>(values->data(),
-                                                   values->size());
-}
-
-template <>
-tensorflow::gtl::MutableArraySlice<uint32> Literal::GetMutableArraySlice() {
-  auto values = mutable_u32s();
-  return tensorflow::gtl::MutableArraySlice<uint32>(values->data(),
-                                                    values->size());
-}
-
-template <>
-tensorflow::gtl::MutableArraySlice<int64> Literal::GetMutableArraySlice() {
-  static_assert(sizeof(int64) == sizeof(tensorflow::protobuf_int64) &&
-                    alignof(int64) == alignof(tensorflow::protobuf_int64),
-                "The int64 and tensorflow::protobuf_int64 types are not "
-                "compatible");
-  auto values = mutable_s64s();
-  // Because of the fact that tensorflow::protobuf_int64 is defined as int64_t
-  // while tensorflow::int64 is defined as long long, a reinterpret_cast<> is
-  // necessary from the raw data pointer returned by the mutable_data() API.
-  return tensorflow::gtl::MutableArraySlice<int64>(
-      reinterpret_cast<int64*>(values->data()), values->size());
-}
-
-template <>
-tensorflow::gtl::MutableArraySlice<uint64> Literal::GetMutableArraySlice() {
-  static_assert(sizeof(uint64) == sizeof(tensorflow::protobuf_uint64) &&
-                    alignof(uint64) == alignof(tensorflow::protobuf_uint64),
-                "The uint64 and tensorflow::protobuf_uint64 types are not "
-                "compatible");
-  auto values = mutable_u64s();
-  // Because of the fact that tensorflow::protobuf_uint64 is defined as uint64_t
-  // while tensorflow::uint64 is defined as unsigned long long, a
-  // reinterpret_cast<> is necessary from the raw data pointer returned by the
-  // mutable_data() API.
-  return tensorflow::gtl::MutableArraySlice<uint64>(
-      reinterpret_cast<uint64*>(values->data()), values->size());
-}
-
-template <>
-tensorflow::gtl::MutableArraySlice<float> Literal::GetMutableArraySlice() {
-  auto values = mutable_f32s();
-  return tensorflow::gtl::MutableArraySlice<float>(values->data(),
-                                                   values->size());
-}
-
-template <>
-tensorflow::gtl::MutableArraySlice<double> Literal::GetMutableArraySlice() {
-  auto values = mutable_f64s();
-  return tensorflow::gtl::MutableArraySlice<double>(values->data(),
-                                                    values->size());
-}
-
-template <>
-tensorflow::gtl::MutableArraySlice<complex64> Literal::GetMutableArraySlice() {
-  auto values = mutable_c64s();
-  return {values->data(), values->size()};
-}
-
-template <>
-tensorflow::gtl::MutableArraySlice<half> Literal::GetMutableArraySlice<half>() {
-  auto values = mutable_f16s();
-  return tensorflow::gtl::MutableArraySlice<half>(values->data(),
-                                                  values->size());
-}
-
-template <>
-tensorflow::gtl::MutableArraySlice<bfloat16>
-Literal::GetMutableArraySlice<bfloat16>() {
-  auto values = mutable_bf16s();
-  return {values->data(), values->size()};
-}
-
-template <>
-tensorflow::gtl::ArraySlice<bool> Literal::GetArraySlice<bool>() const {
-  CHECK_EQ(shape().element_type(), PRED);
-  return tensorflow::gtl::ArraySlice<bool>(
-      reinterpret_cast<const bool*>(preds().data()), preds().size());
-}
-
-template <>
-tensorflow::gtl::ArraySlice<uint8> Literal::GetArraySlice<uint8>() const {
-  CHECK_EQ(shape().element_type(), U8);
-  return tensorflow::gtl::ArraySlice<uint8>(
-      reinterpret_cast<const uint8*>(u8s().data()), u8s().size());
-}
-
-template <>
-tensorflow::gtl::ArraySlice<int8> Literal::GetArraySlice<int8>() const {
-  CHECK_EQ(shape().element_type(), S8);
-  return tensorflow::gtl::ArraySlice<int8>(
-      reinterpret_cast<const int8*>(u8s().data()), u8s().size());
-}
-
-template <>
-tensorflow::gtl::ArraySlice<uint16> Literal::GetArraySlice<uint16>() const {
-  CHECK_EQ(shape().element_type(), U16);
-  return tensorflow::gtl::ArraySlice<uint16>(u16s().data(), u16s().size());
-}
-
-template <>
-tensorflow::gtl::ArraySlice<int16> Literal::GetArraySlice<int16>() const {
-  CHECK_EQ(shape().element_type(), S16);
-  return tensorflow::gtl::ArraySlice<int16>(s16s().data(), s16s().size());
-}
-
-template <>
-tensorflow::gtl::ArraySlice<uint32> Literal::GetArraySlice<uint32>() const {
-  CHECK_EQ(shape().element_type(), U32);
-  return u32s();
-}
-
-template <>
-tensorflow::gtl::ArraySlice<uint64> Literal::GetArraySlice<uint64>() const {
-  CHECK_EQ(shape().element_type(), U64);
-  return u64s();
-}
-
-template <>
-tensorflow::gtl::ArraySlice<int32> Literal::GetArraySlice<int32>() const {
-  CHECK_EQ(shape().element_type(), S32);
-  return s32s();
-}
-
-template <>
-tensorflow::gtl::ArraySlice<int64> Literal::GetArraySlice<int64>() const {
-  CHECK_EQ(shape().element_type(), S64);
-  return s64s();
-}
-
-template <>
-tensorflow::gtl::ArraySlice<double> Literal::GetArraySlice<double>() const {
-  CHECK_EQ(shape().element_type(), F64);
-  return f64s();
-}
-
-template <>
-tensorflow::gtl::ArraySlice<half> Literal::GetArraySlice<half>() const {
-  CHECK_EQ(shape().element_type(), F16);
-  return tensorflow::gtl::ArraySlice<half>(f16s().data(),
-                                           f16s().size() / sizeof(half));
-}
-
-template <>
-tensorflow::gtl::ArraySlice<bfloat16> Literal::GetArraySlice<bfloat16>() const {
-  CHECK_EQ(shape().element_type(), BF16);
-  return {bf16s().data(), bf16s().size()};
-}
-
-template <>
-tensorflow::gtl::ArraySlice<complex64> Literal::GetArraySlice<complex64>()
-    const {
-  CHECK_EQ(shape().element_type(), C64);
-  return c64s();
-}
+namespace {
 
 template <typename NativeT>
-static bool AllElementsEqualValue(const Literal& literal, NativeT value) {
-  for (int64 i = 0; i < ShapeUtil::ElementsIn(literal.shape()); ++i) {
-    auto multi_index =
-        IndexUtil::LinearIndexToMultidimensionalIndex(literal.shape(), i);
-    if (literal.Get<NativeT>(multi_index) != value) {
+static bool AllElementsEqualValue(tensorflow::gtl::ArraySlice<NativeT> data,
+                                  NativeT value) {
+  for (int64 i = 0; i < data.size(); ++i) {
+    if (data[i] != value) {
       return false;
     }
   }
   return true;
 }
 
+}  // namespace
+
 bool Literal::IsAll(int8 value) const {
-  switch (shape().element_type()) {
-    case U8:
-      if (value >= 0) {
-        return AllElementsEqualValue<uint8>(*this, value);
-      }
-      return false;
-    case U32:
-      if (value >= 0) {
-        return AllElementsEqualValue<uint32>(*this, value);
-      }
-      return false;
-    case U64:
-      if (value >= 0) {
-        return AllElementsEqualValue<uint64>(*this, value);
-      }
-      return false;
-    case S8:
-      return AllElementsEqualValue<int8>(*this, value);
-    case S32:
-      return AllElementsEqualValue<int32>(*this, value);
-    case S64:
-      return AllElementsEqualValue<int64>(*this, value);
-    case F32:
-      return AllElementsEqualValue<float>(*this, value);
-    case F64:
-      return AllElementsEqualValue<double>(*this, value);
-    case F16:
-      return AllElementsEqualValue<half>(*this, static_cast<half>(value));
-    case BF16:
-      return AllElementsEqualValue<bfloat16>(*this,
-                                             static_cast<bfloat16>(value));
-    case PRED:
-      if (value == 0) {
-        return AllElementsEqualValue<bool>(*this, false);
-      }
-      if (value == 1) {
-        return AllElementsEqualValue<bool>(*this, true);
+  for (const auto& pair : pieces_) {
+    const Piece& piece = pair.second;
+    if (!ShapeUtil::IsArray(piece.subshape())) {
+      continue;
+    }
+
+    auto piece_is_all = [&]() {
+      switch (shape().element_type()) {
+        case U8:
+          if (value >= 0) {
+            return AllElementsEqualValue<uint8>(piece.data<uint8>(), value);
+          }
+          return false;
+        case U32:
+          if (value >= 0) {
+            return AllElementsEqualValue<uint32>(piece.data<uint32>(), value);
+          }
+          return false;
+        case U64:
+          if (value >= 0) {
+            return AllElementsEqualValue<uint64>(piece.data<uint64>(), value);
+          }
+          return false;
+        case S8:
+          return AllElementsEqualValue<int8>(piece.data<int8>(), value);
+        case S32:
+          return AllElementsEqualValue<int32>(piece.data<int32>(), value);
+        case S64:
+          return AllElementsEqualValue<int64>(piece.data<int64>(), value);
+        case F32:
+          return AllElementsEqualValue<float>(piece.data<float>(), value);
+        case F64:
+          return AllElementsEqualValue<double>(piece.data<double>(), value);
+        case F16:
+          return AllElementsEqualValue<half>(piece.data<half>(),
+                                             static_cast<half>(value));
+        case BF16:
+          return AllElementsEqualValue<bfloat16>(piece.data<bfloat16>(),
+                                                 static_cast<bfloat16>(value));
+        case PRED:
+          if (value == 0) {
+            return AllElementsEqualValue<bool>(piece.data<bool>(), false);
+          }
+          if (value == 1) {
+            return AllElementsEqualValue<bool>(piece.data<bool>(), true);
+          }
+          return false;
+        default:
+          return false;
       }
       return false;
-    default:
+    };
+
+    if (!piece_is_all()) {
       return false;
+    }
   }
+  return true;
 }
 
 bool Literal::IsAllFloat(float value) const {
-  switch (shape().element_type()) {
-    case F32:
-      return AllElementsEqualValue<float>(*this, value);
-    case F64:
-      return AllElementsEqualValue<double>(*this, value);
-    case F16:
-      return AllElementsEqualValue<half>(*this, static_cast<half>(value));
-    case BF16:
-      return AllElementsEqualValue<bfloat16>(*this,
-                                             static_cast<bfloat16>(value));
-    default:
+  for (const auto& pair : pieces_) {
+    const Piece& piece = pair.second;
+    if (!ShapeUtil::IsArray(piece.subshape())) {
+      continue;
+    }
+
+    auto piece_is_all = [&]() {
+      switch (shape().element_type()) {
+        case F32:
+          return AllElementsEqualValue<float>(piece.data<float>(), value);
+        case F64:
+          return AllElementsEqualValue<double>(piece.data<double>(), value);
+        case F16:
+          return AllElementsEqualValue<half>(piece.data<half>(),
+                                             static_cast<half>(value));
+        case BF16:
+          return AllElementsEqualValue<bfloat16>(piece.data<bfloat16>(),
+                                                 static_cast<bfloat16>(value));
+        default:
+          return false;
+      }
+    };
+    if (!piece_is_all()) {
       return false;
+    }
   }
+  return true;
 }
 
 bool Literal::IsAllComplex(complex64 value) const {
   switch (shape().element_type()) {
     case C64:
-      return AllElementsEqualValue<complex64>(*this, value);
+      return AllElementsEqualValue<complex64>(root_piece().data<complex64>(),
+                                              value);
     default:
       return false;
   }
 }
 
 bool Literal::IsZero(tensorflow::gtl::ArraySlice<int64> indices) const {
+  CHECK(ShapeUtil::IsArray(shape()));
   switch (shape().element_type()) {
     case U8:
       return Get<uint8>(indices) == 0;
@@ -1376,247 +1603,294 @@ bool Literal::IsZero(tensorflow::gtl::ArraySlice<int64> indices) const {
   }
 }
 
-template <>
-/* static */ void Literal::Resize<bool>(int64 num_elements, bool value) {
-  CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements);
-  mutable_preds()->resize(num_elements, value);
-}
-
-template <>
-void Literal::Resize<int8>(int64 num_elements, int8 value) {
-  CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements);
-  mutable_u8s()->resize(num_elements, value);
-}
-
-template <>
-void Literal::Resize<uint8>(int64 num_elements, uint8 value) {
-  CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements);
-  mutable_u8s()->resize(num_elements, value);
-}
-
-template <>
-void Literal::Resize<int32>(int64 num_elements, int32 value) {
-  CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements);
-  mutable_s32s()->resize(num_elements, value);
-}
-
-template <>
-void Literal::Resize<uint32>(int64 num_elements, uint32 value) {
-  CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements);
-  mutable_u32s()->resize(num_elements, value);
-}
-
-template <>
-void Literal::Resize<int64>(int64 num_elements, int64 value) {
-  CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements);
-  mutable_s64s()->resize(num_elements, value);
-}
-
-template <>
-void Literal::Resize<uint64>(int64 num_elements, uint64 value) {
-  CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements);
-  mutable_u64s()->resize(num_elements, value);
-}
-
-template <>
-void Literal::Resize<float>(int64 num_elements, float value) {
-  CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements);
-  mutable_f32s()->resize(num_elements, value);
-}
-
-template <>
-void Literal::Resize<double>(int64 num_elements, double value) {
-  CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements);
-  mutable_f64s()->resize(num_elements, value);
-}
-
-template <>
-void Literal::Resize<half>(int64 num_elements, half value) {
-  CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements);
-  mutable_f16s()->resize(num_elements, value);
-}
-
-template <>
-void Literal::Resize<bfloat16>(int64 num_elements, bfloat16 value) {
-  CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements);
-  mutable_bf16s()->resize(num_elements, value);
-}
-
-template <>
-void Literal::Resize<complex64>(int64 num_elements, complex64 value) {
-  CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements);
-  mutable_c64s()->resize(num_elements, value);
-}
+namespace {
 
 template <typename RepeatedFieldT, typename NativeT>
 void CopyToRepeatedField(RepeatedFieldT* dest,
-                         const std::vector<NativeT>& src) {
+                         const tensorflow::gtl::ArraySlice<NativeT> src) {
   *dest = RepeatedFieldT(src.begin(), src.end());
 }
 
-template <>
-void CopyToRepeatedField<tensorflow::protobuf::RepeatedField<float>, complex64>(
-    tensorflow::protobuf::RepeatedField<float>* dest,
-    const std::vector<complex64>& src) {
-  *dest = tensorflow::protobuf::RepeatedField<float>(
-      reinterpret_cast<const float*>(src.data()),
-      reinterpret_cast<const float*>(src.data()) + src.size() * 2);
-}
+}  // namespace
 
-LiteralProto Literal::ToProto() const {
-  LiteralProto proto;
-  proto.Clear();
-  *proto.mutable_shape() = shape();
-  switch (shape().element_type()) {
+void Literal::Piece::WriteToProto(LiteralProto* proto) const {
+  *proto->mutable_shape() = subshape();
+  switch (subshape().element_type()) {
     case PRED:
-      CopyToRepeatedField(proto.mutable_preds(), preds());
+      CopyToRepeatedField(proto->mutable_preds(), data<bool>());
       break;
     case U8:
-      *proto.mutable_u8s() = u8s_string();
-      break;
-    case S32:
-      CopyToRepeatedField(proto.mutable_s32s(), s32s());
-      break;
-    case S64:
-      CopyToRepeatedField(proto.mutable_s64s(), s64s());
+      proto->set_u8s(static_cast<const unsigned char*>(data<uint8>().data()),
+                     element_count());
       break;
     case U32:
-      CopyToRepeatedField(proto.mutable_u32s(), u32s());
+      CopyToRepeatedField(proto->mutable_u32s(), data<uint32>());
       break;
     case U64:
-      CopyToRepeatedField(proto.mutable_u64s(), u64s());
+      CopyToRepeatedField(proto->mutable_u64s(), data<uint64>());
+      break;
+    case S32:
+      CopyToRepeatedField(proto->mutable_s32s(), data<int32>());
+      break;
+    case S64:
+      CopyToRepeatedField(proto->mutable_s64s(), data<int64>());
       break;
     case F16:
-      *proto.mutable_f16s() =
-          string(reinterpret_cast<const char*>(f16s_.data()),
-                 f16s_.size() * sizeof(half));
+      *proto->mutable_f16s() = string(
+          reinterpret_cast<const char*>(data<half>().data()), size_bytes());
       if (!kLittleEndian) {
-        ConvertEndianShort(const_cast<char*>(proto.mutable_f16s()->data()),
-                           proto.f16s().size());
+        ConvertEndianShort(const_cast<char*>(proto->mutable_f16s()->data()),
+                           proto->f16s().size());
       }
       break;
     case BF16:
-      *proto.mutable_bf16s() =
-          string(reinterpret_cast<const char*>(bf16s_.data()),
-                 bf16s_.size() * sizeof(bfloat16));
+      *proto->mutable_bf16s() = string(
+          reinterpret_cast<const char*>(data<bfloat16>().data()), size_bytes());
       if (!kLittleEndian) {
-        ConvertEndianShort(const_cast<char*>(proto.mutable_bf16s()->data()),
-                           proto.bf16s().size());
+        ConvertEndianShort(const_cast<char*>(proto->mutable_bf16s()->data()),
+                           proto->bf16s().size());
       }
       break;
     case F32:
-      CopyToRepeatedField(proto.mutable_f32s(), f32s());
+      CopyToRepeatedField(proto->mutable_f32s(), data<float>());
       break;
     case F64:
-      CopyToRepeatedField(proto.mutable_f64s(), f64s());
+      CopyToRepeatedField(proto->mutable_f64s(), data<double>());
       break;
     case C64:
-      CopyToRepeatedField(proto.mutable_c64s(), c64s());
-      break;
-    case TUPLE:
-      for (const auto& tuple : tuple_literals()) {
-        *proto.add_tuple_literals() = tuple.ToProto();
+      for (complex64 value : data<complex64>()) {
+        proto->add_c64s(value.real());
+        proto->add_c64s(value.imag());
       }
       break;
+    case TUPLE:
+      // Nothing to do but assign the shape which is done above.
+      return;
     default:
-      LOG(FATAL) << "Unhandled primitive type " << shape().element_type();
+      LOG(FATAL) << "Unhandled primitive type " << subshape().element_type();
   }
-
-  return proto;
 }
 
-template <typename RepeatedFieldT, typename NativeT>
-void CopyFromRepeatedField(std::vector<NativeT>* dest,
-                           const RepeatedFieldT& src) {
-  *dest = std::vector<NativeT>(src.begin(), src.end());
+const void* Literal::Piece::untyped_data() const {
+  CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape());
+  return buffer();
 }
 
-template <>
-void CopyFromRepeatedField<tensorflow::protobuf::RepeatedField<float>,
-                           complex64>(
-    std::vector<complex64>* dest,
-    const tensorflow::protobuf::RepeatedField<float>& src) {
-  *dest = std::vector<complex64>(
-      reinterpret_cast<const complex64*>(src.data()),
-      reinterpret_cast<const complex64*>(src.data()) + src.size() / 2);
+void* Literal::Piece::untyped_data() {
+  CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape());
+  return buffer();
 }
 
-void Literal::CopyFromProto(const LiteralProto& literal_proto) {
-  if (!literal_proto.has_shape()) {
-    return;
+namespace {
+
+template <typename RepeatedFieldT, typename NativeT>
+Status CopyFromRepeatedField(tensorflow::gtl::MutableArraySlice<NativeT> dest,
+                             const RepeatedFieldT& src) {
+  if (dest.size() != src.size()) {
+    return InvalidArgument(
+        "Expected %lu elements in LiteralProto repeated field, has %d",
+        dest.size(), src.size());
   }
+  std::copy(src.begin(), src.end(), dest.begin());
+  return Status::OK();
+}
 
-  *mutable_shape() = literal_proto.shape();
-  switch (shape().element_type()) {
+}  // namespace
+
+Status Literal::Piece::CopyFromProto(const LiteralProto& proto) {
+  // These conditions should have been checked in Literal::CreateFromProto.
+  TF_RET_CHECK(proto.has_shape());
+  TF_RET_CHECK(LayoutUtil::HasLayout(proto.shape()));
+  TF_RET_CHECK(ShapeUtil::Equal(proto.shape(), subshape()));
+
+  switch (subshape().element_type()) {
     case PRED:
-      CopyFromRepeatedField(mutable_preds(), literal_proto.preds());
-      break;
-    case U8:
-      set_u8s(literal_proto.u8s());
+      TF_RETURN_IF_ERROR(CopyFromRepeatedField(data<bool>(), proto.preds()));
       break;
+    case U8: {
+      auto u8_data = data<uint8>();
+      TF_RET_CHECK(proto.u8s().size() == u8_data.size());
+      std::copy(proto.u8s().begin(), proto.u8s().end(), u8_data.begin());
+    } break;
     case S32:
-      CopyFromRepeatedField(mutable_s32s(), literal_proto.s32s());
+      TF_RETURN_IF_ERROR(CopyFromRepeatedField(data<int32>(), proto.s32s()));
       break;
     case S64:
-      CopyFromRepeatedField(mutable_s64s(), literal_proto.s64s());
+      TF_RETURN_IF_ERROR(CopyFromRepeatedField(data<int64>(), proto.s64s()));
       break;
     case U32:
-      CopyFromRepeatedField(mutable_u32s(), literal_proto.u32s());
+      TF_RETURN_IF_ERROR(CopyFromRepeatedField(data<uint32>(), proto.u32s()));
       break;
     case U64:
-      CopyFromRepeatedField(mutable_u64s(), literal_proto.u64s());
+      TF_RETURN_IF_ERROR(CopyFromRepeatedField(data<uint64>(), proto.u64s()));
       break;
     case F16: {
-      const string& s(literal_proto.f16s());
-      CHECK_EQ(0, s.size() % sizeof(half));
-      f16s_ = std::vector<half>(s.size() / sizeof(half));
-      memcpy(f16s_.data(), s.data(), s.size());
-
+      const string& s(proto.f16s());
+      TF_RET_CHECK(data<half>().size() * sizeof(half) == s.size());
+      memcpy(untyped_data(), s.data(), s.size());
       if (!kLittleEndian) {
-        ConvertEndianShort(reinterpret_cast<char*>(f16s_.data()), s.size());
+        ConvertEndianShort(reinterpret_cast<char*>(untyped_data()), s.size());
       }
-      break;
-    }
-    case BF16: {
-      const string& s(literal_proto.bf16s());
-      CHECK_EQ(0, s.size() % sizeof(bfloat16));
-      bf16s_ = std::vector<bfloat16>(s.size() / sizeof(bfloat16));
-      memcpy(bf16s_.data(), s.data(), s.size());
+    } break;
 
+    case BF16: {
+      const string& s(proto.bf16s());
+      TF_RET_CHECK(data<bfloat16>().size() * sizeof(bfloat16) == s.size());
+      memcpy(untyped_data(), s.data(), s.size());
       if (!kLittleEndian) {
-        ConvertEndianShort(reinterpret_cast<char*>(bf16s_.data()), s.size());
+        ConvertEndianShort(reinterpret_cast<char*>(untyped_data()), s.size());
       }
-      break;
-    }
+    } break;
     case F32:
-      CopyFromRepeatedField(mutable_f32s(), literal_proto.f32s());
+      TF_RETURN_IF_ERROR(CopyFromRepeatedField(data<float>(), proto.f32s()));
       break;
     case F64:
-      CopyFromRepeatedField(mutable_f64s(), literal_proto.f64s());
-      break;
-    case C64:
-      CopyFromRepeatedField(mutable_c64s(), literal_proto.c64s());
+      TF_RETURN_IF_ERROR(CopyFromRepeatedField(data<double>(), proto.f64s()));
       break;
-    case TUPLE:
-      for (const auto& proto : literal_proto.tuple_literals()) {
-        mutable_tuple_literals()->push_back(Literal(proto));
+    case C64: {
+      auto complex_data = data<complex64>();
+      TF_RET_CHECK(proto.c64s_size() == complex_data.size() * 2);
+      for (int64 i = 0; i < complex_data.size(); ++i) {
+        complex_data[i] = complex64{proto.c64s(i * 2), proto.c64s(i * 2 + 1)};
       }
+    } break;
+    case TUPLE:
+      LOG(FATAL) << "Should not be called on tuple shapes: "
+                 << ShapeUtil::HumanString(subshape());
       break;
     default:
-      LOG(FATAL) << "Unhandled primitive type " << shape().element_type();
+      LOG(FATAL) << "Unhandled primitive type " << subshape().element_type();
+  }
+  return Status::OK();
+}
+
+LiteralProto Literal::ToProto() const {
+  LiteralProto proto;
+  for (const auto& pair : pieces_) {
+    const ShapeIndex& index = pair.first;
+    const Piece& piece = pair.second;
+
+    LiteralProto* proto_piece = &proto;
+    for (int64 i : index) {
+      while (proto_piece->tuple_literals_size() <= i) {
+        proto_piece->add_tuple_literals();
+      }
+      proto_piece = proto_piece->mutable_tuple_literals(i);
+    }
+    piece.WriteToProto(proto_piece);
+  }
+
+  if (LayoutUtil::IsSparseArray(shape())) {
+    CopyToRepeatedField(proto.mutable_sparse_indices(),
+                        sparse_indices()->data());
+  }
+
+  return proto;
+}
+
+/* static */
+StatusOr<std::unique_ptr<Literal>> Literal::CreateFromProto(
+    const LiteralProto& proto) {
+  if (!proto.has_shape()) {
+    return InvalidArgument("LiteralProto has no shape");
+  }
+  if (!LayoutUtil::HasLayout(proto.shape())) {
+    return InvalidArgument("LiteralProto has no layout");
+  }
+
+  auto literal = MakeUnique<Literal>(proto.shape());
+
+  for (auto& pair : literal->pieces_) {
+    const ShapeIndex& index = pair.first;
+    Piece& piece = pair.second;
+    const LiteralProto* proto_element = &proto;
+    for (int64 i : index) {
+      TF_RET_CHECK(i < proto_element->tuple_literals_size());
+      proto_element = &proto_element->tuple_literals(i);
+    }
+
+    if (ShapeUtil::IsTuple(piece.subshape())) {
+      if (proto_element->tuple_literals_size() !=
+          ShapeUtil::TupleElementCount(piece.subshape())) {
+        return InvalidArgument(
+            "Expected %lld tuple elements in LiteralProto, has %d",
+            ShapeUtil::TupleElementCount(piece.subshape()),
+            proto_element->tuple_literals_size());
+      }
+      continue;
+    }
+
+    TF_RET_CHECK(ShapeUtil::IsArray(piece.subshape()));
+    TF_RETURN_IF_ERROR(piece.CopyFromProto(*proto_element));
   }
+  return std::move(literal);
 }
 
-const Literal& Literal::GetSubliteral(const ShapeIndex& index) const {
-  return const_cast<Literal*>(this)->GetSubliteral(index);
+const void* Literal::untyped_data(const ShapeIndex& shape_index) const {
+  return piece(shape_index).untyped_data();
+}
+
+void* Literal::untyped_data(const ShapeIndex& shape_index) {
+  return piece(shape_index).untyped_data();
+}
+
+int64 Literal::size_bytes(const ShapeIndex& shape_index) const {
+  return piece(shape_index).size_bytes();
+}
+
+string Literal::GetR1U8AsString() const {
+  CHECK(ShapeUtil::IsArray(shape()));
+  CHECK_EQ(ShapeUtil::Rank(shape()), 1);
+  CHECK_EQ(shape().element_type(), U8);
+  return string(tensorflow::bit_cast<const char*>(data<uint8>().data()),
+                ShapeUtil::ElementsIn(shape()));
+}
+
+/* static */ const LiteralView LiteralView::Create(
+    const Literal& literal, const ShapeIndex& view_root) {
+  return LiteralView(literal, view_root);
+}
+
+LiteralView::LiteralView(const Literal& literal, const ShapeIndex& view_root) {
+  shape_ = ShapeUtil::GetSubshape(literal.shape(), view_root);
+  pieces_ = ShapeTree<Piece>(shape_);
+  owns_buffers_ = false;
+  for (auto& pair : pieces_) {
+    const ShapeIndex& index = pair.first;
+    Piece& piece = pair.second;
+
+    ShapeIndex src_index = view_root;
+    for (int64 i : index) {
+      src_index.push_back(i);
+    }
+    const Piece& src_piece = literal.piece(src_index);
+    piece.set_buffer(src_piece.buffer());
+    piece.set_sparse_indices(src_piece.sparse_indices());
+    piece.set_subshape(&ShapeUtil::GetSubshape(shape_, index));
+  }
+}
+
+LiteralView::~LiteralView() {}
+
+LiteralView::LiteralView(const LiteralView& other) { CopyFrom(other); }
+
+LiteralView& LiteralView::operator=(const LiteralView& other) {
+  CopyFrom(other);
+  return *this;
 }
 
-Literal& Literal::GetSubliteral(const ShapeIndex& index) {
-  Literal* subliteral = this;
-  for (int64 i : index) {
-    subliteral = &subliteral->tuple_literals_.at(i);
+void LiteralView::CopyFrom(const LiteralView& other) {
+  // We can't use the default copy-constructor/copy-assignment because
+  // Piece::subshape_ points to subshapes within the Shape of the owning
+  // Literal/LiteralView.
+  shape_ = other.shape();
+  pieces_ = other.pieces_;
+  for (auto& pair : pieces_) {
+    const ShapeIndex& index = pair.first;
+    Piece& piece = pair.second;
+    piece.set_subshape(&ShapeUtil::GetSubshape(shape_, index));
   }
-  return *subliteral;
+  owns_buffers_ = false;
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h
index f37e529caf54e3aded1a418d1f01c1440cd0f284..d996004888ab521790b4c5a10da2a93f0d98d12f 100644
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@@ -34,7 +34,9 @@ limitations under the License.
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/shape_tree.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/sparse_index_array.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -50,152 +52,70 @@ limitations under the License.
 
 namespace xla {
 
-// Utility class for dealing with XLA literal values.  Most methods are
-// templated by native (host) type which corresponds to a unique XLA
-// PrimitiveType. See ComputationBuilder for details.  Not all primitive types
-// defined in xla_data.proto have a corresponding native type or even have a
-// storage location in the Literal proto yet (for example, primitive type F16).
+// Class representing literal values in XLA.
+//
+// TODO(b/67651157): The methods in this class should be reduced to a minimal
+// set of methods which construct Literals and accessors methods. Other methods
+// which perform computation on Literals (Reshape, Slice, etc) should be moved
+// elsewhere, and perhaps combined with evaluator code which operates on
+// Literals.
 class Literal {
  public:
-  Literal() {}
+  Literal() : Literal(ShapeUtil::MakeNil()) {}
 
-  Literal(const Literal& other) = default;
-  Literal(Literal&&) = default;
+  // Create a literal of the given shape. The literal is allocated sufficient
+  // memory to hold the shape. Memory is uninitialized.
+  explicit Literal(const Shape& shape);
+  virtual ~Literal();
 
-  explicit Literal(const LiteralProto& other) { CopyFromProto(other); }
-
-  Literal& operator=(const Literal& other) = default;
-  Literal& operator=(Literal&&) = default;
+  // Literals are moveable, but not copyable. To copy a literal use
+  // Literal::Clone or Literal::CloneToUnique. This prevents inadvertent copies
+  // of literals which can be expensive.
+  Literal(const Literal& other) = delete;
+  Literal& operator=(const Literal& other) = delete;
+  Literal(Literal&& other);
+  Literal& operator=(Literal&& other);
 
   // Literals are equal if they have compatible shapes and the same data
-  // values. Layout is not checked.
+  // values. Layout is not compared.
   bool operator==(const Literal& other) const;
   bool operator!=(const Literal& other) const { return !(*this == other); }
 
+  // Serialize to and from a proto.
+  static StatusOr<std::unique_ptr<Literal>> CreateFromProto(
+      const LiteralProto& proto);
   LiteralProto ToProto() const;
 
-  bool has_shape() const {
-    return shape_.element_type() != PRIMITIVE_TYPE_INVALID;
-  }
-
-  // Basic accessor functions.  Names mirror the original protobuf
-  // functions for convenience.
-  string DebugString() const { return ToProto().DebugString(); }
-  string ShortDebugString() const { return ToProto().ShortDebugString(); }
-
-  // Return the nested literal at the given shape index.
-  const Literal& GetSubliteral(const ShapeIndex& index) const;
-  Literal& GetSubliteral(const ShapeIndex& index);
-
-  void Clear() {
-    shape_.Clear();
-    u8s_.clear();
-    s16s_.clear();
-    s32s_.clear();
-    s64s_.clear();
-    u16s_.clear();
-    u32s_.clear();
-    u64s_.clear();
-    f16s_.clear();
-    f32s_.clear();
-    f64s_.clear();
-    tuple_literals_.clear();
-  }
-
-  int preds_size() const { return u8s().size(); }
-  const std::vector<uint8>& preds() const {
-    static_assert(sizeof(uint8) == sizeof(bool),
-                  "The uint8 and bool types should be the same size");
-    return u8s_;
-  }
-  std::vector<uint8>* mutable_preds() {
-    static_assert(sizeof(uint8) == sizeof(bool),
-                  "The uint8 and bool types should be the same size");
-    return &u8s_;
-  }
-
-  int s16s_size() const { return s16s().size(); }
-  int32 s16s(int i) const { return s16s_[i]; }
-  const std::vector<int16>& s16s() const { return s16s_; }
-  std::vector<int16>* mutable_s16s() { return &s16s_; }
-
-  int s32s_size() const { return s32s().size(); }
-  int32 s32s(int i) const { return s32s_[i]; }
-  const std::vector<int32>& s32s() const { return s32s_; }
-  std::vector<int32>* mutable_s32s() { return &s32s_; }
-
-  int s64s_size() const { return s64s().size(); }
-  void add_s64s(int64 value) { s64s_.push_back(value); }
-  const std::vector<int64>& s64s() const { return s64s_; }
-  std::vector<int64>* mutable_s64s() { return &s64s_; }
-
-  int u16s_size() const { return u16s().size(); }
-  uint32 u16s(int i) const { return u16s_[i]; }
-  const std::vector<uint16>& u16s() const { return u16s_; }
-  std::vector<uint16>* mutable_u16s() { return &u16s_; }
-
-  int u32s_size() const { return u32s().size(); }
-  uint32 u32s(int i) const { return u32s_[i]; }
-  const std::vector<uint32>& u32s() const { return u32s_; }
-  std::vector<uint32>* mutable_u32s() { return &u32s_; }
-
-  int u64s_size() const { return u64s().size(); }
-  const std::vector<uint64>& u64s() const { return u64s_; }
-  std::vector<uint64>* mutable_u64s() { return &u64s_; }
-
-  int f16s_size() const { return f16s().size(); }
-  half f16s(int i) const { return f16s_[i]; }
-  const std::vector<half>& f16s() const { return f16s_; }
-  std::vector<half>* mutable_f16s() { return &f16s_; }
-
-  int f32s_size() const { return f32s().size(); }
-  float f32s(int i) const { return f32s_[i]; }
-  void add_f32s(float value) { f32s_.push_back(value); }
-  const std::vector<float>& f32s() const { return f32s_; }
-  std::vector<float>& f32s() { return f32s_; }
-  std::vector<float>* mutable_f32s() { return &f32s_; }
-
-  int f64s_size() const { return f64s().size(); }
-  const std::vector<double>& f64s() const { return f64s_; }
-  std::vector<double>* mutable_f64s() { return &f64s_; }
-
-  int c64s_size() const { return c64s().size(); }
-  const std::vector<complex64>& c64s() const { return c64s_; }
-  std::vector<complex64>* mutable_c64s() { return &c64s_; }
-
-  int bf16s_size() const { return bf16s().size(); }
-  bfloat16 bf16s(int i) const { return bf16s_[i]; }
-  const std::vector<bfloat16>& bf16s() const { return bf16s_; }
-  std::vector<bfloat16>* mutable_bf16s() { return &bf16s_; }
-
-  int tuple_literals_size() const { return tuple_literals().size(); }
-  const Literal& tuple_literals(int i) const { return tuple_literals_[i]; }
-  Literal* add_tuple_literals() {
-    tuple_literals_.push_back(Literal());
-    return &tuple_literals_.back();
-  }
-  std::vector<Literal>* mutable_tuple_literals() { return &tuple_literals_; }
-  const std::vector<Literal>& tuple_literals() const { return tuple_literals_; }
-
-  int u8s_size() const { return u8s().size(); }
-  const std::vector<uint8>& u8s() const { return u8s_; }
-  void set_u8s(const std::vector<uint8>& value) { u8s_ = value; }
-  void set_u8s(tensorflow::StringPiece value) {
-    u8s_ = std::vector<uint8>(value.size());
-    u8s_.clear();
-    append_u8s(value);
-  }
-
-  void append_u8s(tensorflow::StringPiece value) {
-    u8s_.insert(u8s_.end(), value.begin(), value.end());
-  }
-
-  string u8s_string() const { return string(u8s().begin(), u8s().end()); }
+  // Return the shape of the literal.
+  const Shape& shape() const { return shape_; }
 
-  std::vector<uint8>* mutable_u8s() { return &u8s_; }
+  // TODO(b/67651157): Remove this accessor. Literal users should not be able to
+  // mutate the shape as this can produce malformed Literals.
+  Shape* mutable_shape_do_not_use() { return &shape_; }
 
-  const Shape& shape() const { return shape_; }
-  Shape* mutable_shape() { return &shape_; }
+  // Returns a (Mutable)ArraySlice view of the array for this literal for the
+  // given NativeT (e.g., float). CHECKs if the subshape of the literal at the
+  // given ShapeIndex is not array. See primitive_util.h for the mapping from
+  // XLA type to native type.
+  template <typename NativeT>
+  tensorflow::gtl::ArraySlice<NativeT> data(
+      const ShapeIndex& shape_index = {}) const;
+  template <typename NativeT>
+  tensorflow::gtl::MutableArraySlice<NativeT> data(
+      const ShapeIndex& shape_index = {});
+
+  // Returns a pointer to the sparse index array. Returns nullptr if the literal
+  // is not a sparse array.
+  const SparseIndexArray* sparse_indices(
+      const ShapeIndex& shape_index = {}) const;
+  SparseIndexArray* sparse_indices(const ShapeIndex& shape_index = {});
+
+  // Returns a pointer to (or size of) the underlying buffer holding the array
+  // at the given shape index. CHECKs if the subshape of the literal at the
+  // given ShapeIndex is not array.
+  const void* untyped_data(const ShapeIndex& shape_index = {}) const;
+  void* untyped_data(const ShapeIndex& shape_index = {});
+  int64 size_bytes(const ShapeIndex& shape_index = {}) const;
 
   // Creates a new literal of a given rank. To minimize ambiguity (for users
   // and the compiler) these CreateR[0-2] methods should explicitly specify the
@@ -243,6 +163,60 @@ class Literal {
           values,
       const Layout& layout);
 
+  // Returns this literal's data as a string. This literal must be a rank-1 U8
+  // array.
+  string GetR1U8AsString() const;
+
+  // Creates a literal with a sparse layout and the given indices and values.
+  // The shape is initialized from the given dimensions.  The minor dimension of
+  // the indices array must equal the rank of the shape (i.e. size of the
+  // dimensions array). The major dimension of the indices array must equal the
+  // number of elements in the values array. The maximum number of elements in
+  // the array is taken from the max_indices() value of the index array.
+  //
+  // XLA assumes that sparse literals are in sorted order for all operations. If
+  // the `sort` argument is true, then the indices and values will be sorted
+  // while copying them into the literal. If you have ensured that the indices
+  // and values are already sorted, then you may set the `sort` argument to
+  // false to skip the sorting step.
+  //
+  // For example:
+  //
+  //   CreateSparse(
+  //     {12, 12, 12},
+  //     SparseIndexArray(10, 3,
+  //                      Array2D{
+  //                        {0, 1, 2},
+  //                        {3, 4, 5},
+  //                        {6, 7, 8},
+  //                        {9, 10, 11},
+  //                      }),
+  //     {1.0, 2.0 3.0, 4.0})
+  //
+  // This creates an array with shape F64[12,12,12]sparse{10}, that has the
+  // following non-zero values:
+  //
+  //     [0,  1,  2]: 1.0
+  //     [3,  4,  5]: 2.0
+  //     [6,  7,  8]: 3.0
+  //     [9, 10, 11]: 4.0
+  //
+  template <typename NativeT>
+  static std::unique_ptr<Literal> CreateSparse(
+      tensorflow::gtl::ArraySlice<int64> dimensions, SparseIndexArray indices,
+      tensorflow::gtl::ArraySlice<NativeT> values, bool sort = true);
+
+  // Populates a literal with a sparse layout with the given indices and values.
+  // Each index in the indices array is CHECKed against the dimensions in the
+  // literal's shape.  If sort is true, then the indices and values will be
+  // sorted.  If sort is false, then the indices and values are assumed to
+  // already be in sorted order.  See CreateSparse for an example of how data
+  // are populated.
+  template <typename NativeT>
+  void PopulateSparse(SparseIndexArray indices,
+                      tensorflow::gtl::ArraySlice<NativeT> values,
+                      bool sort = true);
+
   // Creates a new Literal object with the shape specified as parameter.
   // The content of the literal values is the default value of the primitive
   // type of literal itself (0 for numeric types, and false for predicates).
@@ -256,6 +230,23 @@ class Literal {
       PrimitiveType primitive_type,
       tensorflow::gtl::ArraySlice<int64> dimensions);
 
+  // Copy values from 'src_literal' rooted at 'src_shape_index' into this
+  // literal rooted at 'dest_shape_index'. The subshape of this literal rooted
+  // at 'dest_shape_index' must be compatible with the subshape of 'src_literal'
+  // rooted at 'src_shape_index', but need not be arrays.
+  Status CopyFrom(const Literal& src_literal,
+                  const ShapeIndex& dest_shape_index = {},
+                  const ShapeIndex& src_shape_index = {});
+
+  // Similar to CopyFrom, but with move semantincs. The subshape of this literal
+  // rooted at 'dest_shape_index' must be *equal* to the shape 'src_literal'
+  // (layouts and shapes must match), but need not be arrays. The memory
+  // allocated in this literal for the subshape at dest_shape_index is
+  // deallocated, and the respective buffers are replaced with those in
+  // src_literal. Upon return, src_literal is set to a nil shape (empty tuple).
+  Status MoveFrom(Literal&& src_literal,
+                  const ShapeIndex& dest_shape_index = {});
+
   // Copies the values from src_literal, starting at src_base shape indexes,
   // to this literal, starting at dest_base, where the copy size in each
   // dimension is specified by copy_size.
@@ -265,10 +256,24 @@ class Literal {
   // Note: if either src_literal or this literal contains dimensions with zero
   // element, then copy_size must be 0 in these dimensions while the
   // corresponding base indices being 0.
-  Status Copy(const Literal& src_literal,
-              tensorflow::gtl::ArraySlice<int64> src_base,
-              tensorflow::gtl::ArraySlice<int64> dest_base,
-              tensorflow::gtl::ArraySlice<int64> copy_size);
+  // This literal and 'src_literal' must be arrays.
+  Status CopySliceFrom(const Literal& src_literal,
+                       tensorflow::gtl::ArraySlice<int64> src_base,
+                       tensorflow::gtl::ArraySlice<int64> dest_base,
+                       tensorflow::gtl::ArraySlice<int64> copy_size);
+
+  // Returns a vector containing the tuple elements of this Literal as separate
+  // Literals. This Literal must be tuple-shaped and can be a nested tuple. The
+  // elements are moved into the new Literals; no data is copied. Upon return
+  // this Literal is set to a nil shape (empty tuple)
+  std::vector<Literal> DecomposeTuple();
+
+  // This operation is the inverse of DecomposeTuple. The given elements are
+  // moved into the tuple elements of a new tuple-shaped Literal which is
+  // returned. Upon return, each of the Literals in 'elements' is set to a nil
+  // shape (empty tuple).
+  static Literal MoveIntoTuple(
+      tensorflow::gtl::MutableArraySlice<Literal> elements);
 
   // Creates a new value that has the equivalent value as this literal, but
   // conforms to new_layout; e.g. a literal matrix that was in {0, 1}
@@ -285,11 +290,16 @@ class Literal {
   std::unique_ptr<Literal> Relayout(const Layout& new_layout,
                                     const ShapeIndex& shape_index = {}) const;
 
-  // Creates a new literal by reshaping this literal to have 'shape'. Both the
-  // original shape and 'shape' must contain the same number of elements. The
+  // An overload of Relayout which changes the layout of the entire shape rather
+  // than being limited to a single array within the shape.
+  std::unique_ptr<Literal> Relayout(const Shape& shape_with_layout) const;
+
+  // Creates a new literal by reshaping this literal to have the given
+  // dimensions. The total number of elements must not change; The
   // implementation currently only supports monotonic dim0-major layouts.
+  // This literal must be an array.
   StatusOr<std::unique_ptr<Literal>> Reshape(
-      tensorflow::gtl::ArraySlice<int64> shape) const;
+      tensorflow::gtl::ArraySlice<int64> dimensions) const;
 
   // Creates a new literal by reordering the dimensions of this literal.
   // The given `permutation` must be a permutation of the dimension numbers
@@ -297,6 +307,7 @@ class Literal {
   // in the result literal (i.e., new_order[i] = old_order[permutation[i]]).
   // For example, a transpose call on a literal of shape [3 x 8 x 4] and
   // `permutation` = {2, 0, 1} returns a new literal of shape [4 x 3 x 8].
+  // This literal must be an array.
   std::unique_ptr<Literal> Transpose(
       tensorflow::gtl::ArraySlice<int64> permutation) const;
 
@@ -305,6 +316,7 @@ class Literal {
   // same rank and layout as for the given literal. The number of indices in
   // start_indices and limit_indices must be the rank of the literal, and the
   // indices follow the order of the dimensions.
+  // This literal must be an array.
   std::unique_ptr<Literal> Slice(
       tensorflow::gtl::ArraySlice<int64> start_indices,
       tensorflow::gtl::ArraySlice<int64> limit_indices) const;
@@ -312,34 +324,35 @@ class Literal {
   // Creates a literal with a prepended dimension with bound "times"; e.g. a
   // f32[3x2] with times=4 will produce a f32[4x3x2] with the 3x2 from this
   // literal replicated four times.
+  // This literal must be an array.
   template <typename NativeT>
   std::unique_ptr<Literal> Replicate(int64 times) const;
 
   // Converts this literal to another primitive type. Returns an error if the
-  // conversion is not possible.
+  // conversion is not possible. This literal must be array-shaped.
   StatusOr<std::unique_ptr<Literal>> Convert(
       PrimitiveType primitive_dest_type) const;
 
-  // Creates a literal value zero of the given primitive type.
+  // Creates a scalar literal value zero of the given primitive type.
   static Literal Zero(PrimitiveType primitive_type);
 
-  // Creates a literal value one of the given primitive type.
+  // Creates a scalar literal value one of the given primitive type.
   static Literal One(PrimitiveType primitive_type);
 
-  // Creates a literal value containing the minimum value of the given
+  // Creates a scalar literal value containing the minimum value of the given
   // primitive type. For floating-point types, returns -inf.
   static Literal MinValue(PrimitiveType primitive_type);
 
-  // Creates a literal value containing the maximum value of the given
+  // Creates a scalar literal value containing the maximum value of the given
   // primitive type. For floating-point types, returns inf.
   static Literal MaxValue(PrimitiveType primitive_type);
 
   // Creates a literal of the given shape where each element is `value`.
   template <typename NativeT>
-  static std::unique_ptr<Literal> CreateFullWithMonotonicDim0MajorLayout(
+  static std::unique_ptr<Literal> CreateFullWithDescendingLayout(
       tensorflow::gtl::ArraySlice<int64> dimensions, NativeT value);
 
-  // Creates a new literal from an array. The variants not ending with
+  // Creates a new literal from an Array type. The variants not ending with
   // WithLayout use the default XLA layout for the literal's linear
   // representation in memory.
   template <typename NativeT>
@@ -388,35 +401,50 @@ class Literal {
       std::initializer_list<std::initializer_list<NativeT>> values,
       int64 projection_p, int64 projection_z);
 
-  // Clones this literal into an owned unique_ptr version.
+  // Clones this literal into a new Literal, or new std::unique_ptr<Literal>.
+  Literal Clone() const;
   std::unique_ptr<Literal> CloneToUnique() const;
 
-  // Returns the linear index of the given index within this literal's
-  // element_type repeated field.
-  int64 LinearIndex(tensorflow::gtl::ArraySlice<int64> multi_index) const;
+  // Gets or sets an element in the literal at the given index. The multi_index
+  // is CHECKed against the dimension sizes.
+  template <typename NativeT>
+  NativeT Get(tensorflow::gtl::ArraySlice<int64> multi_index,
+              const ShapeIndex& shape_index) const;
+  template <typename NativeT>
+  void Set(tensorflow::gtl::ArraySlice<int64> multi_index,
+           const ShapeIndex& shape_index, NativeT value);
 
-  // Gets or sets an element in the literal at the given index. The index is
-  // CHECKed against the dimension sizes.
+  // Overloads of Get and Set for array literals. CHECKs if the literal is not
+  // array-shaped and dense.
   template <typename NativeT>
   NativeT Get(tensorflow::gtl::ArraySlice<int64> multi_index) const;
   template <typename NativeT>
   void Set(tensorflow::gtl::ArraySlice<int64> multi_index, NativeT value);
 
-  // Returns a (Mutable)ArraySlice view of the array for this literal for the
-  // given NativeT (e.g., float). These functions map native type to XLA
-  // PrimitiveType via template specialization. The unspecialized forms below
-  // aborts to handle the error case where the given native type does not map to
-  // an XLA primitive type.
+  // Returns the multi-index of the element in a sparse literal at the given
+  // sparse element number.  The sparse element number is the position with in
+  // the sparse array's list of (index, value) pairs, and is checked against the
+  // total number of (index, value) pairs in the sparse array.
+  tensorflow::gtl::ArraySlice<int64> GetSparseIndex(
+      int64 sparse_element_number, const ShapeIndex& shape_index = {}) const;
+
+  // Returns the value of the element in a sparse literal at the given sparse
+  // element number.  The sparse element number is the position with in the
+  // sparse array's list of (index, value) pairs, and is checked against the
+  // total number of (index, value) pairs in the sparse array.
   template <typename NativeT>
-  tensorflow::gtl::ArraySlice<NativeT> GetArraySlice() const {
-    static_assert(!std::is_same<NativeT, NativeT>::value,
-                  "Cannot map native type to primitive type.");
-  }
+  NativeT GetSparseElement(int64 sparse_element_number,
+                           const ShapeIndex& shape_index = {}) const;
+
+  // Appends the given element to the literal.  If the elements are not appended
+  // in sorted order, then SortSparseElements should be called before calling
+  // other methods.  This literal must have a sparse layout.
   template <typename NativeT>
-  tensorflow::gtl::MutableArraySlice<NativeT> GetMutableArraySlice() {
-    static_assert(!std::is_same<NativeT, NativeT>::value,
-                  "Cannot map native type to primitive type.");
-  }
+  void AppendSparseElement(tensorflow::gtl::ArraySlice<int64> multi_index,
+                           NativeT value, const ShapeIndex& shape_index = {});
+
+  // Sorts the elements in a sparse array.
+  void SortSparseElements(const ShapeIndex& shape_index = {});
 
   // Returns the element value at index (0, ..., 0), however many zeroes are
   // required for that index.
@@ -425,10 +453,16 @@ class Literal {
 
   // As Get(), but determines the correct type and converts the value
   // into text.
-  string GetAsString(tensorflow::gtl::ArraySlice<int64> multi_index) const;
+  string GetAsString(tensorflow::gtl::ArraySlice<int64> multi_index,
+                     const ShapeIndex& shape_index = {}) const;
+
+  // As GetSparseElement(), but determines the correct type and converts the
+  // value into text.
+  string GetSparseElementAsString(int64 sparse_element_number,
+                                  const ShapeIndex& shape_index = {}) const;
 
   // As Get(), but determines the correct type and converts the value into
-  // int64.
+  // int64.  This literal must be an array.
   StatusOr<int64> GetIntegralAsS64(
       tensorflow::gtl::ArraySlice<int64> multi_index) const;
 
@@ -436,7 +470,8 @@ class Literal {
   template <typename NativeT>
   static std::unique_ptr<Literal> MakeIdentityR2(int64 size);
 
-  // Returns a tuple literal composed of given literals.
+  // Returns a tuple literal composed of given literals. Data is copied from the
+  // given elements into the returned literal.
   static std::unique_ptr<Literal> MakeTuple(
       tensorflow::gtl::ArraySlice<const Literal*> elements);
 
@@ -450,11 +485,29 @@ class Literal {
   static std::unique_ptr<Literal> MakeTupleOwned(
       std::vector<std::unique_ptr<Literal>> elements);
 
-  // Validates that the data payload of the literal matches the literal shape;
-  // if it does not, an appropriate status is returned.
-  tensorflow::Status ValidateLiteral() const;
+  // This overload lets you pass a braced list of unique_ptr<Literal>s to
+  // MakeTupleOwned:
+  //
+  //   Literal::MakeTupleOwned(Literal::CreateR1(...), ...).
+  //
+  // Simply relying on the MakeTupleOwned(std::vector<unique_ptr<Literal>>)
+  // overload doesn't work because std::initializer_list's elements are always
+  // const.
+  //
+  // The arguments to this function must all be unique_ptr<Literal>.
+  template <typename... Ts>
+  static std::unique_ptr<Literal> MakeTupleOwned(
+      std::unique_ptr<Ts>... elements) {
+    std::array<std::unique_ptr<Literal>, sizeof...(Ts)> arr{
+        std::move(elements)...};
+    std::vector<std::unique_ptr<Literal>> v;
+    v.insert(v.begin(), std::make_move_iterator(arr.begin()),
+             std::make_move_iterator(arr.end()));
+    return MakeTupleOwned(std::move(v));
+  }
 
   // Returns a string representation of the literal value.
+  // Warning: this function can take minutes for multi-million element Literals.
   string ToString(bool print_layout = false) const;
 
   // Invokes the "per cell" callback for each element in the provided
@@ -464,6 +517,8 @@ class Literal {
   // This function is useful if you want a polymorphic representation
   // of the tensor's elements (turning it to a string for something
   // like representation in a protobuf).
+  //
+  // This literal must have a dense layout.
   void EachCellAsString(
       const std::function<void(tensorflow::gtl::ArraySlice<int64> indices,
                                const string& value)>& per_cell) const;
@@ -472,80 +527,45 @@ class Literal {
                                    NativeT value)>
                     per_cell) const;
 
-  // Templated methods which populate the given repeated field in this literal
-  // with the given value(s). The Shape field of this literal is set
-  // to match the array dimensions and type. Examples:
+  // Populate this literal with the given values. Examples:
   //
   //   // Populate with floats.
   //   Array2D<float> float_values = ...
   //   literal.PopulateR2FromArray2D(values);
   //
   //   // Populate with int32s.
-  //   literal.PopulateR2({{1, 2}, {3, 4}});
+  //   literal.PopulateR2<int32>({{1, 2}, {3, 4}});
   //
-  template <typename NativeT>
-  void PopulateR0(NativeT values);
+  // The shape and element type of this literal must match given values. For
+  // example, in the call above to literal.PopulateR2(), 'literal' must be a 2x2
+  // array of S32.
   template <typename NativeT>
   void PopulateR1(tensorflow::gtl::ArraySlice<NativeT> values);
   void PopulateR1(const tensorflow::core::Bitmap& values);
   template <typename NativeT>
   void PopulateR2(std::initializer_list<std::initializer_list<NativeT>> values);
   template <typename NativeT>
-  void PopulateR2WithLayout(
-      std::initializer_list<std::initializer_list<NativeT>> values,
-      const Layout& layout);
-  template <typename NativeT>
   void PopulateFromArray(const Array<NativeT>& values);
   template <typename NativeT>
-  void PopulateFromArrayWithLayout(const Array<NativeT>& values,
-                                   const Layout& layout);
-  template <typename NativeT>
   void PopulateR2FromArray2D(const Array2D<NativeT>& values);
   template <typename NativeT>
-  void PopulateR2FromArray2DWithLayout(const Array2D<NativeT>& values,
-                                       const Layout& layout);
-  template <typename NativeT>
   void PopulateR3FromArray3D(const Array3D<NativeT>& values);
   template <typename NativeT>
-  void PopulateR3FromArray3DWithLayout(const Array3D<NativeT>& values,
-                                       const Layout& layout);
-  template <typename NativeT>
   void PopulateR4FromArray4D(const Array4D<NativeT>& values);
-  template <typename NativeT>
-  void PopulateR4FromArray4DWithLayout(const Array4D<NativeT>& values,
-                                       const Layout& layout);
 
   // Populates literal values by calling the generator function for every cell
   // in this literal object.
   //
   // generator must be a callable of the type
   // NativeT(tensorflow::gtl::ArraySlice<int64> indexes) or compatible.
+  //
+  // This literal must have a dense layout.
   template <typename NativeT, typename FnType>
   Status Populate(const FnType& generator);
 
-  // Creates a Literal of the given dimensions with all elements set to the
-  // given value.
-  template <typename NativeT>
-  void PopulateWithValue(NativeT value,
-                         tensorflow::gtl::ArraySlice<int64> dimensions);
-
-  // Returns a pointer to the underlying vector corresponding to the Literal's
-  // shape.
-  const void* InternalData() const;
-  void* MutableInternalData();
-
-  // Allocates space in the underlying vector of this literal sufficient to hold
-  // num_elements of this literal's primitive type. Values in the vector are set
-  // to zero. num_elements must equal the number of elements in the literal's
-  // shape.
-  void Reserve(int64 num_elements);
-
-  // Allocates space in the underlying vector of this literal sufficient to hold
-  // num_elements of this literal's primitive type and sets each element in this
-  // literal to the given value. num_elements must equal the number of elements
-  // in this literal's shape.
+  // Fills this literal with the given value.
   template <typename NativeT>
-  void Resize(int64 num_elements, NativeT value);
+  void PopulateWithValue(NativeT value);
 
   // Returns whether every element in this literal is equal to value.
   //
@@ -555,7 +575,7 @@ class Literal {
   //
   // If value doesn't fit in this literal's type, returns false.  Values of 1/0
   // are considered equal to true/false; other values are not considered equal
-  // to true.
+  // to true. Also if this literal is not array-shaped false is returned.
   bool IsAll(int8 value) const;
 
   // Like IsAll(const Literal&, int8), except we check whether the literal is
@@ -566,7 +586,7 @@ class Literal {
   // This casts value to the type of literal, then compares using ==.  The usual
   // admonishments about floating-point equality checks apply.  We expect you to
   // use this to check for values that can be expressed precisely as a float,
-  // e.g. -0.5.
+  // e.g. -0.5.  Also if this literal is not array-shaped false is returned.
   bool IsAllFloat(float value) const;
 
   // Like IsAll(const Literal&, int8), except we check whether the literal is
@@ -578,23 +598,38 @@ class Literal {
   // admonishments about floating-point equality checks apply.  We expect you to
   // use this to check for complex values that can be expressed precisely as
   // float pairs e.g. (-0.5, 1.0).
+  //
+  // This literal must have a dense layout.
   bool IsAllComplex(complex64 value) const;
 
   // Returns whether this literal is zero at the specified index. This literal
-  // must be an array.
+  // must be an array with a dense layout.
   bool IsZero(tensorflow::gtl::ArraySlice<int64> indices) const;
 
- private:
-  // Copy from a LiteralProto instance.
-  void CopyFromProto(const LiteralProto& literal_proto);
+  // Return the count of the elements in the array at the given shape index in
+  // this literal.
+  int64 element_count(const ShapeIndex& index = {}) const {
+    return ShapeUtil::ElementsIn(ShapeUtil::GetSubshape(shape(), index));
+  }
+
+  // Return the count of the elements in the sparse array at the given shape
+  // index in this literal, which will be no larger than
+  // LayoutUtil::MaxSparseElements(SetSubshape(shape(), index).layout()).
+  int64 sparse_element_count() const;
+
+ protected:
+  // 'allocate_arrays' indicates whether to allocate memory for the arrays in
+  // the shape. If false, buffer pointers inside of the Literal::Pieces are set
+  // to nullptr.
+  Literal(const Shape& shape, bool allocate_arrays);
 
-  // Internal template helper for the Copy() API, matching its arguments one by
-  // one.
-  template <typename T>
-  Status CopyRange(const Literal& src_literal,
-                   tensorflow::gtl::ArraySlice<int64> src_base,
-                   tensorflow::gtl::ArraySlice<int64> dest_base,
-                   tensorflow::gtl::ArraySlice<int64> copy_size);
+  // Internal template helper for the Literal::CopySliceFrom(), matching its
+  // arguments one by one.
+  template <typename NativeT>
+  Status CopySliceFromInternal(const Literal& src_literal,
+                               tensorflow::gtl::ArraySlice<int64> src_base,
+                               tensorflow::gtl::ArraySlice<int64> dest_base,
+                               tensorflow::gtl::ArraySlice<int64> copy_size);
 
   // Utility structure which is used to create the optimal configuration for
   // a ShapeUtil::ForEachIndex() scan across two literals.
@@ -619,163 +654,243 @@ class Literal {
     int64 minor_loop_size = 1;
   };
 
-  Shape shape_;
-  std::vector<uint8> u8s_;
-  std::vector<int16> s16s_;
-  std::vector<int32> s32s_;
-  std::vector<int64> s64s_;
-  std::vector<uint16> u16s_;
-  std::vector<uint32> u32s_;
-  std::vector<uint64> u64s_;
-  std::vector<bfloat16> bf16s_;
-  std::vector<half> f16s_;
-  std::vector<float> f32s_;
-  std::vector<double> f64s_;
-  std::vector<complex64> c64s_;
-  std::vector<Literal> tuple_literals_;
-};
-
-std::ostream& operator<<(std::ostream& out, const Literal& literal);
-
-// Declarations of template specializations for GetArraySlice and
-// GetMutableArraySlice. The specializations map native type to XLA primitive
-// type.
-template <>
-tensorflow::gtl::ArraySlice<bool> Literal::GetArraySlice<bool>() const;
-
-template <>
-tensorflow::gtl::ArraySlice<uint8> Literal::GetArraySlice<uint8>() const;
-
-template <>
-tensorflow::gtl::ArraySlice<int8> Literal::GetArraySlice<int8>() const;
-
-template <>
-tensorflow::gtl::ArraySlice<uint16> Literal::GetArraySlice<uint16>() const;
-
-template <>
-tensorflow::gtl::ArraySlice<int16> Literal::GetArraySlice<int16>() const;
+  // A data structure representing a subshape at a particular ShapeIndex within
+  // the literal. For array-shaped ShapeIndexes, this data structure holds the
+  // pointer to the memory allocated for the array data.
+  class Piece {
+   public:
+    // Return the buffer holding the array data for this piece as an array
+    // slice. This piece must be array-shaped.
+    template <typename NativeT>
+    tensorflow::gtl::ArraySlice<NativeT> data() const;
+    template <typename NativeT>
+    tensorflow::gtl::MutableArraySlice<NativeT> data();
+
+    // Return the buffer holding the array data for this piece as a void*. This
+    // piece must be array-shaped.
+    void* untyped_data();
+    const void* untyped_data() const;
+
+    // Gets or sets an element in the array at the given index. The multi_index
+    // is CHECKed against the dimension sizes of the array.  This piece must be
+    // array-shaped.
+    template <typename NativeT>
+    NativeT Get(tensorflow::gtl::ArraySlice<int64> index) const;
+    template <typename NativeT>
+    void Set(tensorflow::gtl::ArraySlice<int64> index, NativeT value);
+
+    // Gets/sets the buffer holding the array data.
+    char* buffer() const { return buffer_; }
+    void set_buffer(char* buffer) { buffer_ = buffer; }
+
+    // The array of multi-indices that provide the locations of non-zero
+    // elements in a sparse array.  Only used if
+    // LayoutUtil::IsSparseArray(shape()) is true.
+    SparseIndexArray* sparse_indices() const { return sparse_indices_; }
+    void set_sparse_indices(SparseIndexArray* sparse_indices) {
+      sparse_indices_ = sparse_indices;
+    }
 
-template <>
-tensorflow::gtl::ArraySlice<uint32> Literal::GetArraySlice<uint32>() const;
+    // Gets or sets the subshape of this piece. This reference points to a
+    // subshape within the shape in the containing Literal (Literal::shape_).
+    const Shape& subshape() const { return *subshape_; }
+    void set_subshape(const Shape* subshape) { subshape_ = subshape; }
 
-template <>
-tensorflow::gtl::ArraySlice<uint64> Literal::GetArraySlice<uint64>() const;
+    // Returns the size in bytes of the buffer holding the array data.
+    int64 size_bytes() const { return ShapeUtil::ByteSizeOf(subshape()); }
 
-template <>
-tensorflow::gtl::ArraySlice<int32> Literal::GetArraySlice<int32>() const;
+    // Returns the number of elements in this piece's array.
+    int64 element_count() const { return ShapeUtil::ElementsIn(subshape()); }
 
-template <>
-tensorflow::gtl::ArraySlice<int64> Literal::GetArraySlice<int64>() const;
+    // Copy the data from 'src' into this piece's buffer. Shapes of this piece
+    // and src must be compatible.
+    Status CopyFrom(const Piece& src);
 
-template <>
-inline tensorflow::gtl::ArraySlice<float> Literal::GetArraySlice<float>()
-    const {
-  DCHECK(shape().element_type() == F32);
-  return f32s();
-}
+    // Returns true if this piece and 'other' contain the same data. This piece
+    // and 'other' must be array-shaped and compatible.
+    bool EqualElements(const Piece& other) const;
 
-template <>
-tensorflow::gtl::ArraySlice<double> Literal::GetArraySlice<double>() const;
+    // Writes the shape and data (if array-shaped) into the given proto.
+    void WriteToProto(LiteralProto* proto) const;
 
-template <>
-tensorflow::gtl::ArraySlice<half> Literal::GetArraySlice<half>() const;
+    // Copies the data from the given proto into this piece. The shape of this
+    // piece must be equal (not just compatible) to the shape of the proto.
+    Status CopyFromProto(const LiteralProto& proto);
 
-template <>
-tensorflow::gtl::ArraySlice<bfloat16> Literal::GetArraySlice<bfloat16>() const;
+    // Sorts the elements in a sparse array.
+    void SortSparseElements();
 
-template <>
-tensorflow::gtl::ArraySlice<complex64> Literal::GetArraySlice<complex64>()
-    const;
+   private:
+    // Recursive helper for EqualElements.
+    template <typename NativeT>
+    bool EqualElementsInternal(const Piece& other,
+                               std::vector<int64>* multi_index) const;
 
-template <>
-tensorflow::gtl::MutableArraySlice<bool> Literal::GetMutableArraySlice();
+    // Helper for SortSparseElements that has the element type as a template
+    // parameter.
+    template <typename NativeT>
+    void SortSparseElementsInternal();
 
-template <>
-tensorflow::gtl::MutableArraySlice<int8> Literal::GetMutableArraySlice();
+    // For array-shaped pieces, this is the buffer holding the literal data.
+    char* buffer_ = nullptr;
 
-template <>
-tensorflow::gtl::MutableArraySlice<uint8> Literal::GetMutableArraySlice();
+    // For sparse arrays, this is the array of indices.
+    SparseIndexArray* sparse_indices_ = nullptr;
 
-template <>
-tensorflow::gtl::MutableArraySlice<int16> Literal::GetMutableArraySlice();
+    // The shape of piece. This points into the shape of the containing Literal
+    // (Literal::shape_).
+    const Shape* subshape_ = nullptr;
+  };
 
-template <>
-tensorflow::gtl::MutableArraySlice<uint16> Literal::GetMutableArraySlice();
+  // Returns the piece at the given ShapeIndex.
+  Piece& piece(const ShapeIndex& shape_index) {
+    return *pieces_.mutable_element(shape_index);
+  }
+  const Piece& piece(const ShapeIndex& shape_index) const {
+    return pieces_.element(shape_index);
+  }
 
-template <>
-tensorflow::gtl::MutableArraySlice<int32> Literal::GetMutableArraySlice();
+  // Returns the piece at the root of the shape (empty ShapeIndex).
+  Piece& root_piece() { return piece({}); }
+  const Piece& root_piece() const { return piece({}); }
 
-template <>
-tensorflow::gtl::MutableArraySlice<uint32> Literal::GetMutableArraySlice();
+  // Deallocate the buffers held by this literal (if the literal owns the
+  // buffer).
+  void DeallocateBuffers();
 
-template <>
-tensorflow::gtl::MutableArraySlice<int64> Literal::GetMutableArraySlice();
+  Shape shape_;
+  ShapeTree<Piece> pieces_;
 
-template <>
-tensorflow::gtl::MutableArraySlice<uint64> Literal::GetMutableArraySlice();
+  // Whether the buffers held in pieces_ are owned by this Literal.
+  bool owns_buffers_;
 
-template <>
-tensorflow::gtl::MutableArraySlice<float> Literal::GetMutableArraySlice();
+  // LiteralView must access and manipulate Pieces of other Literals.
+  friend class LiteralView;
+};  // namespace xla
 
-template <>
-tensorflow::gtl::MutableArraySlice<double> Literal::GetMutableArraySlice();
+std::ostream& operator<<(std::ostream& out, const Literal& literal);
 
-template <>
-tensorflow::gtl::MutableArraySlice<half> Literal::GetMutableArraySlice();
+// A read-only view of a Literal. A LiteralView contains pointers to buffers
+// owned by the viewed Literal.
+//
+// TODO(b/71550060): Replace LiteralView with Literal slice classes (immutable
+// and mutable) similar to (Mutable)ArraySlice.
+class LiteralView : public Literal {
+ public:
+  // Create and return a view of the given literal rooted at the given shape
+  // index within the given literal. A factory is used rather than a public
+  // constructor because only const LiteralViews are supported. It's still
+  // possible to create non-const LiteralViews via the copy constructors, but
+  // the factory method makes it a bit less likely. Implementing literal slices
+  // will fix this undesirable situation (b/71550060).
+  static const LiteralView Create(const Literal& literal,
+                                  const ShapeIndex& view_root = {});
 
-template <>
-tensorflow::gtl::MutableArraySlice<bfloat16> Literal::GetMutableArraySlice();
+  LiteralView(const LiteralView& other);
+  LiteralView& operator=(const LiteralView& other);
 
-template <>
-tensorflow::gtl::MutableArraySlice<complex64> Literal::GetMutableArraySlice();
+  virtual ~LiteralView();
 
-template <>
-void Literal::Resize<bool>(int64 num_elements, bool value);
+ private:
+  LiteralView(const Literal& literal, const ShapeIndex& view_root);
 
-template <>
-void Literal::Resize<int8>(int64 num_elements, int8 value);
+  // Helper for the copy constructor and copy assignment operator.
+  void CopyFrom(const LiteralView& other);
+};
 
-template <>
-void Literal::Resize<uint8>(int64 num_elements, uint8 value);
+template <typename NativeT>
+tensorflow::gtl::ArraySlice<NativeT> Literal::Piece::data() const {
+  CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape());
+  CHECK_EQ(subshape().element_type(),
+           primitive_util::NativeToPrimitiveType<NativeT>())
+      << "Attempting to access "
+      << PrimitiveType_Name(primitive_util::NativeToPrimitiveType<NativeT>())
+      << " type, but literal element type is "
+      << PrimitiveType_Name(subshape().element_type());
+  return tensorflow::gtl::ArraySlice<NativeT>(
+      reinterpret_cast<const NativeT*>(buffer()),
+      ShapeUtil::ElementsIn(subshape()));
+}
 
-template <>
-void Literal::Resize<int32>(int64 num_elements, int32 value);
+template <typename NativeT>
+tensorflow::gtl::MutableArraySlice<NativeT> Literal::Piece::data() {
+  CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape());
+  CHECK_EQ(subshape().element_type(),
+           primitive_util::NativeToPrimitiveType<NativeT>())
+      << "Attempting to access "
+      << PrimitiveType_Name(primitive_util::NativeToPrimitiveType<NativeT>())
+      << " type, but literal element type is "
+      << PrimitiveType_Name(subshape().element_type());
+  return tensorflow::gtl::MutableArraySlice<NativeT>(
+      reinterpret_cast<NativeT*>(buffer()), ShapeUtil::ElementsIn(subshape()));
+}
 
-template <>
-void Literal::Resize<uint32>(int64 num_elements, uint32 value);
+template <typename NativeT>
+NativeT Literal::Piece::Get(
+    tensorflow::gtl::ArraySlice<int64> multi_index) const {
+  CHECK(LayoutUtil::IsDenseArray(subshape()));
+  return data<NativeT>()[IndexUtil::MultidimensionalIndexToLinearIndex(
+      subshape(), multi_index)];
+}
 
-template <>
-void Literal::Resize<int64>(int64 num_elements, int64 value);
+template <typename NativeT>
+void Literal::Piece::Set(tensorflow::gtl::ArraySlice<int64> multi_index,
+                         NativeT value) {
+  CHECK(LayoutUtil::IsDenseArray(subshape()));
+  data<NativeT>()[IndexUtil::MultidimensionalIndexToLinearIndex(
+      subshape(), multi_index)] = value;
+}
 
-template <>
-void Literal::Resize<uint64>(int64 num_elements, uint64 value);
+template <typename NativeT>
+tensorflow::gtl::ArraySlice<NativeT> Literal::data(
+    const ShapeIndex& shape_index) const {
+  return piece(shape_index).data<NativeT>();
+}
 
-template <>
-void Literal::Resize<float>(int64 num_elements, float value);
+template <typename NativeT>
+tensorflow::gtl::MutableArraySlice<NativeT> Literal::data(
+    const ShapeIndex& shape_index) {
+  return piece(shape_index).data<NativeT>();
+}
 
-template <>
-void Literal::Resize<double>(int64 num_elements, double value);
+template <typename NativeT>
+inline NativeT Literal::Get(tensorflow::gtl::ArraySlice<int64> multi_index,
+                            const ShapeIndex& shape_index) const {
+  return piece(shape_index).Get<NativeT>(multi_index);
+}
 
-template <>
-void Literal::Resize<half>(int64 num_elements, half value);
+template <typename NativeT>
+inline NativeT Literal::Get(
+    tensorflow::gtl::ArraySlice<int64> multi_index) const {
+  return root_piece().Get<NativeT>(multi_index);
+}
 
-template <>
-void Literal::Resize<bfloat16>(int64 num_elements, bfloat16 value);
+template <typename NativeT>
+inline void Literal::Set(tensorflow::gtl::ArraySlice<int64> multi_index,
+                         const ShapeIndex& shape_index, NativeT value) {
+  return piece(shape_index).Set<NativeT>(multi_index, value);
+}
 
-template <>
-void Literal::Resize<complex64>(int64 num_elements, complex64 value);
+template <typename NativeT>
+inline void Literal::Set(tensorflow::gtl::ArraySlice<int64> multi_index,
+                         NativeT value) {
+  return root_piece().Set<NativeT>(multi_index, value);
+}
 
 template <typename NativeT>
 /* static */ std::unique_ptr<Literal> Literal::CreateR0(NativeT value) {
-  auto literal = MakeUnique<Literal>();
-  literal->PopulateR0<NativeT>(value);
+  auto literal = MakeUnique<Literal>(ShapeUtil::MakeShape(
+      primitive_util::NativeToPrimitiveType<NativeT>(), {}));
+  literal->Set({}, value);
   return literal;
 }
 
 template <typename NativeT>
 /* static */ std::unique_ptr<Literal> Literal::CreateR1(
     tensorflow::gtl::ArraySlice<NativeT> values) {
-  auto literal = MakeUnique<Literal>();
+  auto literal = MakeUnique<Literal>(
+      ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<NativeT>(),
+                           {static_cast<int64>(values.size())}));
   literal->PopulateR1(values);
   return literal;
 }
@@ -784,8 +899,12 @@ template <typename NativeT>
 /* static */ std::unique_ptr<Literal> Literal::CreateR2WithLayout(
     std::initializer_list<std::initializer_list<NativeT>> values,
     const Layout& layout) {
-  auto literal = MakeUnique<Literal>();
-  literal->PopulateR2WithLayout(values, layout);
+  auto literal = MakeUnique<Literal>(ShapeUtil::MakeShapeWithLayout(
+      primitive_util::NativeToPrimitiveType<NativeT>(),
+      {static_cast<int64>(values.size()),
+       static_cast<int64>(values.begin()->size())},
+      AsInt64Slice(layout.minor_to_major())));
+  literal->PopulateR2(values);
   return literal;
 }
 
@@ -858,6 +977,21 @@ template <typename NativeT>
   return CreateR4FromArray4DWithLayout(tmp, layout);
 }
 
+template <typename NativeT>
+/* static */ std::unique_ptr<Literal> Literal::CreateSparse(
+    tensorflow::gtl::ArraySlice<int64> dimensions, SparseIndexArray indices,
+    tensorflow::gtl::ArraySlice<NativeT> values, bool sort) {
+  int64 num_elements = values.size();
+  int64 rank = dimensions.size();
+  CHECK_EQ(num_elements, indices.index_count());
+  CHECK_EQ(rank, indices.rank());
+  auto literal = MakeUnique<Literal>(ShapeUtil::MakeShapeWithSparseLayout(
+      primitive_util::NativeToPrimitiveType<NativeT>(), dimensions,
+      indices.max_indices()));
+  literal->PopulateSparse(indices, values, sort);
+  return literal;
+}
+
 template <typename NativeT>
 /* static */ std::unique_ptr<Literal> Literal::CreateR4(
     std::initializer_list<std::initializer_list<
@@ -869,8 +1003,10 @@ template <typename NativeT>
 template <typename NativeT>
 /* static */ std::unique_ptr<Literal> Literal::CreateFromArrayWithLayout(
     const Array<NativeT>& values, const Layout& layout) {
-  auto literal = MakeUnique<Literal>();
-  literal->PopulateFromArrayWithLayout(values, layout);
+  auto literal = MakeUnique<Literal>(ShapeUtil::MakeShapeWithLayout(
+      primitive_util::NativeToPrimitiveType<NativeT>(), values.dimensions(),
+      AsInt64Slice(layout.minor_to_major())));
+  literal->PopulateFromArray(values);
   return literal;
 }
 
@@ -970,81 +1106,33 @@ template <typename NativeT>
   return CreateFromArrayWithLayout(values, layout);
 }
 
-template <typename NativeT>
-NativeT Literal::Get(tensorflow::gtl::ArraySlice<int64> multi_index) const {
-  int64 linear_index = LinearIndex(multi_index);
-  return GetArraySlice<NativeT>().at(linear_index);
-}
-
 template <typename NativeT>
 NativeT Literal::GetFirstElement() const {
-  return GetArraySlice<NativeT>().at(0);
-}
-
-template <>
-inline uint8 Literal::Get<uint8>(
-    tensorflow::gtl::ArraySlice<int64> multi_index) const {
-  CHECK(shape().element_type() == U8);
-  int64 linear_index = LinearIndex(multi_index);
-  return u8s()[linear_index];
-}
-
-template <>
-inline int8 Literal::Get<int8>(
-    tensorflow::gtl::ArraySlice<int64> multi_index) const {
-  CHECK(shape().element_type() == S8);
-  int64 linear_index = LinearIndex(multi_index);
-  return u8s()[linear_index];
-}
-
-template <>
-inline half Literal::Get<half>(
-    tensorflow::gtl::ArraySlice<int64> multi_index) const {
-  CHECK(shape().element_type() == F16);
-  int64 linear_index = LinearIndex(multi_index);
-  return GetArraySlice<half>()[linear_index];
-}
-
-template <>
-inline bfloat16 Literal::Get<bfloat16>(
-    tensorflow::gtl::ArraySlice<int64> multi_index) const {
-  CHECK(shape().element_type() == BF16);
-  int64 linear_index = LinearIndex(multi_index);
-  return GetArraySlice<bfloat16>()[linear_index];
+  return data<NativeT>().at(0);
 }
 
 template <typename NativeT>
-void Literal::Set(tensorflow::gtl::ArraySlice<int64> multi_index,
-                  NativeT value) {
-  int64 linear_index = LinearIndex(multi_index);
-  GetMutableArraySlice<NativeT>().at(linear_index) = value;
+NativeT Literal::GetSparseElement(int64 sparse_element_number,
+                                  const ShapeIndex& shape_index) const {
+  CHECK(
+      LayoutUtil::IsSparseArray(ShapeUtil::GetSubshape(shape(), shape_index)));
+  return data<NativeT>(shape_index)[sparse_element_number];
 }
 
-template <>
-inline void Literal::Set(tensorflow::gtl::ArraySlice<int64> multi_index,
-                         uint8 value) {
-  int64 linear_index = LinearIndex(multi_index);
-  (*mutable_u8s())[linear_index] = value;
-}
-
-template <>
-inline void Literal::Set(tensorflow::gtl::ArraySlice<int64> multi_index,
-                         int8 value) {
-  return Set<uint8>(multi_index, value);
-}
-
-template <>
-inline void Literal::Set(tensorflow::gtl::ArraySlice<int64> multi_index,
-                         int64 value) {
-  int64 linear_index = LinearIndex(multi_index);
-  (*mutable_s64s())[linear_index] = value;
-}
-
-template <>
-/* static */ inline void Literal::Set(
-    tensorflow::gtl::ArraySlice<int64> multi_index, uint64 value) {
-  int64 linear_index = LinearIndex(multi_index);
-  (*mutable_u64s())[linear_index] = value;
+template <typename NativeT>
+void Literal::AppendSparseElement(
+    tensorflow::gtl::ArraySlice<int64> multi_index, NativeT value,
+    const ShapeIndex& shape_index) {
+  Piece& p = piece(shape_index);
+  const Shape& subshape = p.subshape();
+  CHECK(LayoutUtil::IsSparseArray(subshape));
+  int64 rank = ShapeUtil::Rank(subshape);
+  CHECK_EQ(multi_index.size(), rank);
+  int64 last_element = p.sparse_indices()->index_count();
+  CHECK_LT(last_element, LayoutUtil::MaxSparseElements(subshape.layout()));
+  p.sparse_indices()->Append(multi_index);
+  CHECK_LT(last_element, p.data<NativeT>().size());
+  p.data<NativeT>()[last_element] = value;
 }
 
 // Returns an identity matrix (rank 2) with the given row and column count.
@@ -1071,51 +1159,31 @@ void Literal::EachCell(
   } while (IndexUtil::BumpIndices(shape(), &indices));
 }
 
-template <typename NativeT>
-inline void Literal::PopulateR0(NativeT value) {
-  *mutable_shape() = ShapeUtil::MakeShape(
-      primitive_util::NativeToPrimitiveType<NativeT>(), {});
-  Resize<NativeT>(1, value);
-}
-
 template <typename NativeT>
 inline void Literal::PopulateR1(tensorflow::gtl::ArraySlice<NativeT> values) {
-  *mutable_shape() =
-      ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<NativeT>(),
-                           {static_cast<int64>(values.size())});
-  Reserve(values.size());
+  CHECK(ShapeUtil::IsArray(shape()));
+  CHECK_EQ(ShapeUtil::Rank(shape()), 1);
+  CHECK_EQ(ShapeUtil::ElementsIn(shape()), values.size());
+  CHECK_EQ(shape().element_type(),
+           primitive_util::NativeToPrimitiveType<NativeT>());
   for (int64 i = 0; i < values.size(); ++i) {
     Set({i}, values[i]);
   }
 }
 
-inline void Literal::PopulateR1(const tensorflow::core::Bitmap& values) {
-  *mutable_shape() =
-      ShapeUtil::MakeShape(PRED, {static_cast<int64>(values.bits())});
-  Reserve(values.bits());
-  for (int64 i = 0; i < static_cast<int64>(values.bits()); ++i) {
-    Set({i}, values.get(i));
-  }
-}
-
 template <typename NativeT>
-void Literal::PopulateR2WithLayout(
-    std::initializer_list<std::initializer_list<NativeT>> values,
-    const Layout& layout) {
-  *mutable_shape() = ShapeUtil::MakeShapeWithLayout(
-      primitive_util::NativeToPrimitiveType<NativeT>(),
-      {static_cast<int64>(values.size()),
-       static_cast<int64>(values.begin()->size())},
-      AsInt64Slice(layout.minor_to_major()));
+void Literal::PopulateR2(
+    std::initializer_list<std::initializer_list<NativeT>> values) {
+  CHECK(ShapeUtil::IsArray(shape()));
+  CHECK_EQ(ShapeUtil::Rank(shape()), 2);
+  CHECK_EQ(shape().element_type(),
+           primitive_util::NativeToPrimitiveType<NativeT>());
 
   const int64 dim0_size = values.size();
   const int64 dim1_size = values.begin()->size();
   CHECK_EQ(dim0_size, shape().dimensions(0));
   CHECK_EQ(dim1_size, shape().dimensions(1));
 
-  const int64 num_elements = dim1_size * dim0_size;
-  Reserve(num_elements);
-
   int64 dim0 = 0;
   for (auto inner_list : values) {
     int64 dim1 = 0;
@@ -1129,69 +1197,65 @@ void Literal::PopulateR2WithLayout(
 }
 
 template <typename NativeT>
-void Literal::PopulateR2(
-    std::initializer_list<std::initializer_list<NativeT>> values) {
-  PopulateR2WithLayout(values, LayoutUtil::GetDefaultLayoutForR2());
-}
-
-template <typename NativeT>
-void Literal::PopulateFromArrayWithLayout(const Array<NativeT>& values,
-                                          const Layout& layout) {
-  *mutable_shape() = ShapeUtil::MakeShapeWithLayout(
-      primitive_util::NativeToPrimitiveType<NativeT>(), values.dimensions(),
-      AsInt64Slice(layout.minor_to_major()));
-  Reserve(values.num_elements());
+void Literal::PopulateFromArray(const Array<NativeT>& values) {
+  CHECK(ShapeUtil::IsArray(shape()));
+  CHECK_EQ(shape().element_type(),
+           primitive_util::NativeToPrimitiveType<NativeT>());
+  CHECK_EQ(ShapeUtil::Rank(shape()), values.num_dimensions());
+  for (int dim = 0; dim < values.num_dimensions(); ++dim) {
+    CHECK_EQ(values.dim(dim), shape().dimensions(dim));
+  }
   values.Each([this](tensorflow::gtl::ArraySlice<int64> indices,
                      NativeT value) { this->Set(indices, value); });
 }
 
-template <typename NativeT>
-void Literal::PopulateFromArray(const Array<NativeT>& values) {
-  PopulateFromArrayWithLayout(
-      values, LayoutUtil::GetDefaultLayoutForRank(values.num_dimensions()));
-}
-
-template <typename NativeT>
-void Literal::PopulateR2FromArray2DWithLayout(const Array2D<NativeT>& values,
-                                              const Layout& layout) {
-  PopulateFromArrayWithLayout(values, layout);
-}
-
 template <typename NativeT>
 void Literal::PopulateR2FromArray2D(const Array2D<NativeT>& values) {
   PopulateFromArray(values);
 }
 
-template <typename NativeT>
-void Literal::PopulateR3FromArray3DWithLayout(const Array3D<NativeT>& values,
-                                              const Layout& layout) {
-  PopulateFromArrayWithLayout(values, layout);
-}
-
 template <typename NativeT>
 void Literal::PopulateR3FromArray3D(const Array3D<NativeT>& values) {
   PopulateFromArray(values);
 }
 
 template <typename NativeT>
-void Literal::PopulateR4FromArray4DWithLayout(const Array4D<NativeT>& values,
-                                              const Layout& layout) {
-  PopulateFromArrayWithLayout(values, layout);
+void Literal::PopulateR4FromArray4D(const Array4D<NativeT>& values) {
+  PopulateFromArray(values);
 }
 
 template <typename NativeT>
-void Literal::PopulateR4FromArray4D(const Array4D<NativeT>& values) {
-  PopulateFromArray(values);
+void Literal::PopulateSparse(SparseIndexArray indices,
+                             tensorflow::gtl::ArraySlice<NativeT> values,
+                             bool sort) {
+  CHECK(LayoutUtil::IsSparseArray(shape()));
+  int rank = ShapeUtil::Rank(shape());
+  CHECK_EQ(indices.rank(), rank);
+  int64 max_elements = LayoutUtil::MaxSparseElements(shape().layout());
+  CHECK_LE(indices.max_indices(), max_elements);
+  int64 num_elements = values.size();
+  CHECK_LE(num_elements, max_elements);
+  CHECK_EQ(num_elements, indices.index_count());
+  auto root_data = root_piece().data<NativeT>();
+  root_data.remove_suffix(max_elements - values.size());
+  std::copy(values.begin(), values.end(), root_data.begin());
+  *this->root_piece().sparse_indices() = std::move(indices);
+  if (sort) {
+    auto root_data = this->root_piece().data<NativeT>();
+    root_data.remove_suffix(root_data.size() - num_elements);
+    this->root_piece().sparse_indices()->SortWithValues(root_data);
+  }
+  DCHECK(this->root_piece().sparse_indices()->Validate(shape()));
 }
 
 template <typename NativeT, typename FnType>
 Status Literal::Populate(const FnType& generator) {
   const Shape& this_shape = shape();
   const int64 rank = ShapeUtil::Rank(this_shape);
+  TF_RET_CHECK(LayoutUtil::IsDenseArray(this_shape));
   TF_RET_CHECK(this_shape.element_type() ==
                primitive_util::NativeToPrimitiveType<NativeT>());
-  tensorflow::gtl::MutableArraySlice<NativeT> data =
-      GetMutableArraySlice<NativeT>();
+  tensorflow::gtl::MutableArraySlice<NativeT> literal_data = data<NativeT>();
   if (rank > 0) {
     StrideConfig stride_config(this_shape, this_shape,
                                AsInt64Slice(this_shape.dimensions()));
@@ -1200,11 +1264,12 @@ Status Literal::Populate(const FnType& generator) {
         ShapeUtil::GetDimension(this_shape, stride_config.minor_dimension);
 
     auto init_function = [&](const std::vector<int64>& indexes) {
-      const int64 index = LinearIndex(indexes);
+      const int64 index =
+          IndexUtil::MultidimensionalIndexToLinearIndex(shape(), indexes);
       std::copy(indexes.begin(), indexes.end(), minor_scan_indexes.begin());
       for (int64 i = 0; i < minor_dimension_size; ++i) {
         minor_scan_indexes[stride_config.minor_dimension] = i;
-        data.at(index + i) = generator(minor_scan_indexes);
+        literal_data.at(index + i) = generator(minor_scan_indexes);
       }
       return true;
     };
@@ -1213,32 +1278,27 @@ Status Literal::Populate(const FnType& generator) {
                             init_function);
   } else {
     // For scalars.
-    data.at(0) = generator({});
+    literal_data.at(0) = generator({});
   }
   return Status::OK();
 }
 
 template <typename NativeT>
-void Literal::PopulateWithValue(NativeT value,
-                                tensorflow::gtl::ArraySlice<int64> dimensions) {
-  *mutable_shape() = ShapeUtil::MakeShape(
-      primitive_util::NativeToPrimitiveType<NativeT>(), dimensions);
-  Resize<NativeT>(ShapeUtil::ElementsIn(shape()), value);
+void Literal::PopulateWithValue(NativeT value) {
+  CHECK(ShapeUtil::IsArray(shape()));
+  CHECK_EQ(shape().element_type(),
+           primitive_util::NativeToPrimitiveType<NativeT>());
+  for (NativeT& element : data<NativeT>()) {
+    element = value;
+  }
 }
 
 template <typename NativeT>
-/* static */ std::unique_ptr<Literal>
-Literal::CreateFullWithMonotonicDim0MajorLayout(
+/* static */ std::unique_ptr<Literal> Literal::CreateFullWithDescendingLayout(
     tensorflow::gtl::ArraySlice<int64> dimensions, NativeT value) {
-  Shape this_shape = ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
-      primitive_util::NativeToPrimitiveType<NativeT>(), dimensions);
-  auto literal = MakeUnique<Literal>();
-  *literal->mutable_shape() = this_shape;
-  literal->Reserve(ShapeUtil::ElementsIn(this_shape));
-  std::vector<int64> index(dimensions.size(), 0);
-  do {
-    literal->Set(index, value);
-  } while (IndexUtil::BumpIndices(this_shape, &index));
+  auto literal = MakeUnique<Literal>(ShapeUtil::MakeShapeWithDescendingLayout(
+      primitive_util::NativeToPrimitiveType<NativeT>(), dimensions));
+  literal->PopulateWithValue(value);
   return literal;
 }
 
@@ -1249,14 +1309,12 @@ std::unique_ptr<Literal> Literal::Replicate(int64 times) const {
   for (int64 bound : shape().dimensions()) {
     bounds.push_back(bound);
   }
-  auto literal = MakeUnique<Literal>();
-  *literal->mutable_shape() =
-      ShapeUtil::MakeShape(shape().element_type(), bounds);
+  auto literal =
+      MakeUnique<Literal>(ShapeUtil::MakeShape(shape().element_type(), bounds));
   int64 elements = ShapeUtil::ElementsIn(literal->shape());
   if (elements == 0) {
     return literal;
   }
-  literal->Reserve(elements);
 
   DimensionVector output_indices(bounds.size(), 0);
   tensorflow::gtl::ArraySlice<int64> input_indices = output_indices;
diff --git a/tensorflow/compiler/xla/literal_util_test.cc b/tensorflow/compiler/xla/literal_util_test.cc
index 816bb3c549eaae4e8fc2b7d438627266603272f9..b3583c2eb75de8297d5e7507430491f119bd4462 100644
--- a/tensorflow/compiler/xla/literal_util_test.cc
+++ b/tensorflow/compiler/xla/literal_util_test.cc
@@ -31,6 +31,7 @@ namespace xla {
 namespace {
 
 using ::testing::ElementsAre;
+using ::testing::HasSubstr;
 
 class LiteralUtilTest : public ::testing::Test {
  protected:
@@ -192,6 +193,34 @@ TEST_F(LiteralUtilTest, CreateR3FromArray3d) {
   ASSERT_EQ(expected, result);
 }
 
+TEST_F(LiteralUtilTest, CreateSparse) {
+  std::vector<int64> dimensions = {8, 8, 8};
+  Array2D<int64> indices = {
+      {3, 4, 5},
+      {1, 2, 3},
+      {2, 3, 4},
+      {3, 5, 6},
+  };
+  std::vector<int64> values = {7, 8, 9, 10};
+  auto literal = Literal::CreateSparse<int64>(
+      dimensions, SparseIndexArray(indices.n1() + 3, indices), values);
+
+  Array2D<int64> expected_indices = {
+      {1, 2, 3},
+      {2, 3, 4},
+      {3, 4, 5},
+      {3, 5, 6},
+  };
+  std::vector<int64> expected_values = {8, 9, 7, 10};
+
+  EXPECT_EQ(literal->sparse_indices()->data(),
+            tensorflow::gtl::ArraySlice<int64>(
+                expected_indices.data(), expected_indices.num_elements()));
+  EXPECT_EQ(tensorflow::gtl::ArraySlice<int64>(literal->data<int64>().data(),
+                                               expected_values.size()),
+            tensorflow::gtl::ArraySlice<int64>(expected_values));
+}
+
 TEST_F(LiteralUtilTest, LiteralR4F32ProjectedStringifies) {
   // clang-format off
   auto literal = Literal::CreateR4Projected<float>({
@@ -293,29 +322,28 @@ TEST_F(LiteralUtilTest, NonScalarEquality) {
   auto matrix_different = Literal::CreateR2<float>({{4.0, 3.0}, {1.0, 2.0}});
   auto vector_literal = Literal::CreateR1<float>({1.0, 2.0, 3.0, 4.0});
   auto scalar = Literal::CreateR0<float>(1.0);
+  Literal nil(ShapeUtil::MakeNil());
 
   EXPECT_EQ(*matrix, *matrix);
   EXPECT_EQ(*matrix, *matrix_clone);
   EXPECT_NE(*matrix, *matrix_different);
   EXPECT_NE(*matrix, *vector_literal);
   EXPECT_NE(*matrix, *scalar);
+  EXPECT_NE(*matrix, nil);
+  EXPECT_EQ(nil, nil);
 }
 
 TEST_F(LiteralUtilTest, DifferentLayoutEquality) {
   // Test equality with literals which have different layouts.
-  auto colmajor = MakeUnique<Literal>();
-  *colmajor->mutable_shape() = ShapeUtil::MakeShape(F32, {2, 2});
-  *colmajor->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
-  colmajor->Reserve(4);
+  auto colmajor =
+      MakeUnique<Literal>(ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {0, 1}));
   colmajor->Set<float>({0, 0}, 1.0);
   colmajor->Set<float>({0, 1}, 2.0);
   colmajor->Set<float>({1, 0}, 3.0);
   colmajor->Set<float>({1, 1}, 4.0);
 
-  auto rowmajor = MakeUnique<Literal>();
-  *rowmajor->mutable_shape() = ShapeUtil::MakeShape(F32, {2, 2});
-  *rowmajor->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({1, 0});
-  rowmajor->Reserve(4);
+  auto rowmajor =
+      MakeUnique<Literal>(ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {1, 0}));
   rowmajor->Set<float>({0, 0}, 1.0);
   rowmajor->Set<float>({0, 1}, 2.0);
   rowmajor->Set<float>({1, 0}, 3.0);
@@ -515,7 +543,7 @@ TYPED_TEST(LiteralUtilTestTemplated, Relayout2x2) {
 
 TEST_F(LiteralUtilTest, ReshapeR0) {
   auto original = Literal::CreateR0<float>(1.7f);
-  auto reshape = original->Reshape(/*shape=*/{}).ConsumeValueOrDie();
+  auto reshape = original->Reshape(/*dimensions=*/{}).ConsumeValueOrDie();
   EXPECT_EQ(*original, *reshape);
 }
 
@@ -597,24 +625,26 @@ TEST_F(LiteralUtilTest, TestR4RelayoutEquivalence) {
 
 TEST_F(LiteralUtilTest, TestR2LinearLayout) {
   // Test expected memory layout of R2 dim0-minor (column-major) literal.
-  auto mat_dim0minor = Literal::CreateR2WithLayout<int>({{1, 2, 3}, {4, 5, 6}},
-                                                        layout_r2_dim0minor_);
-  EXPECT_EQ(mat_dim0minor->s32s_size(), 6);
-  EXPECT_THAT(mat_dim0minor->s32s(), ElementsAre(1, 4, 2, 5, 3, 6));
+  auto mat_dim0minor = Literal::CreateR2WithLayout<int32>(
+      {{1, 2, 3}, {4, 5, 6}}, layout_r2_dim0minor_);
+  EXPECT_EQ(mat_dim0minor->element_count(), 6);
+  EXPECT_THAT(mat_dim0minor->data<int32>(), ElementsAre(1, 4, 2, 5, 3, 6));
 
   // Test expected memory layout when using Relayout to row major.
   auto relaid_mat_to_dim0major = mat_dim0minor->Relayout(layout_r2_dim0major_);
-  EXPECT_THAT(relaid_mat_to_dim0major->s32s(), ElementsAre(1, 2, 3, 4, 5, 6));
+  EXPECT_THAT(relaid_mat_to_dim0major->data<int32>(),
+              ElementsAre(1, 2, 3, 4, 5, 6));
 
   // Test expected memory layout of R2 created with dim0-major (row-major).
-  auto mat_dim0major = Literal::CreateR2WithLayout<int>({{1, 2, 3}, {4, 5, 6}},
-                                                        layout_r2_dim0major_);
-  EXPECT_EQ(mat_dim0major->s32s_size(), 6);
-  EXPECT_THAT(mat_dim0major->s32s(), ElementsAre(1, 2, 3, 4, 5, 6));
+  auto mat_dim0major = Literal::CreateR2WithLayout<int32>(
+      {{1, 2, 3}, {4, 5, 6}}, layout_r2_dim0major_);
+  EXPECT_EQ(mat_dim0major->element_count(), 6);
+  EXPECT_THAT(mat_dim0major->data<int32>(), ElementsAre(1, 2, 3, 4, 5, 6));
 
   // Test expected memory layout when using Relayout to column major.
   auto relaid_mat_to_dim0minor = mat_dim0major->Relayout(layout_r2_dim0minor_);
-  EXPECT_THAT(relaid_mat_to_dim0minor->s32s(), ElementsAre(1, 4, 2, 5, 3, 6));
+  EXPECT_THAT(relaid_mat_to_dim0minor->data<int32>(),
+              ElementsAre(1, 4, 2, 5, 3, 6));
 }
 
 TEST_F(LiteralUtilTest, TestR3LinearLayout) {
@@ -634,27 +664,27 @@ TEST_F(LiteralUtilTest, TestR3LinearLayout) {
   auto lit_dim0minor =
       Literal::CreateR3FromArray3DWithLayout<int>(arr3d, layout_r3_dim0minor_);
 
-  EXPECT_EQ(lit_dim0minor->s32s_size(), 12);
+  EXPECT_EQ(lit_dim0minor->element_count(), 12);
   std::vector<int> expected_dim0minor{1, 7, 4, 10, 2, 8, 5, 11, 3, 9, 6, 12};
-  EXPECT_THAT(lit_dim0minor->s32s(),
+  EXPECT_THAT(lit_dim0minor->data<int32>(),
               testing::ElementsAreArray(expected_dim0minor));
 
   // Test expected memory layout when using Relayout to row major.
   auto relaid_lit_to_dim0major = lit_dim0minor->Relayout(layout_r3_dim0major_);
   std::vector<int> expected_dim0major{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
-  EXPECT_THAT(relaid_lit_to_dim0major->s32s(),
+  EXPECT_THAT(relaid_lit_to_dim0major->data<int32>(),
               testing::ElementsAreArray(expected_dim0major));
 
   // Test expected memory layout of R3 created with dim0-major (row-major).
   auto lit_dim0major =
       Literal::CreateR3FromArray3DWithLayout<int>(arr3d, layout_r3_dim0major_);
-  EXPECT_EQ(lit_dim0major->s32s_size(), 12);
-  EXPECT_THAT(lit_dim0major->s32s(),
+  EXPECT_EQ(lit_dim0major->element_count(), 12);
+  EXPECT_THAT(lit_dim0major->data<int32>(),
               testing::ElementsAreArray(expected_dim0major));
 
   // Test expected memory layout when using Relayout to column major.
   auto relaid_lit_to_dim0minor = lit_dim0major->Relayout(layout_r3_dim0minor_);
-  EXPECT_THAT(relaid_lit_to_dim0minor->s32s(),
+  EXPECT_THAT(relaid_lit_to_dim0minor->data<int32>(),
               testing::ElementsAreArray(expected_dim0minor));
 }
 
@@ -687,28 +717,28 @@ TEST_F(LiteralUtilTest, SliceR3U32Full) {
 }
 
 TEST_F(LiteralUtilTest, PopulateR1S64) {
-  Literal output;
+  Literal output(ShapeUtil::MakeShape(S64, {1}));
   output.PopulateR1<int64>({77});
   auto expected = Literal::CreateR1<int64>({77});
   EXPECT_EQ(output, *expected);
 }
 
 TEST_F(LiteralUtilTest, PopulateR1U64) {
-  Literal output;
+  Literal output(ShapeUtil::MakeShape(U64, {2}));
   output.PopulateR1<uint64>({{77, 88}});
   auto expected = Literal::CreateR1<uint64>({{77, 88}});
   EXPECT_EQ(output, *expected);
 }
 
 TEST_F(LiteralUtilTest, PopulateR1C64) {
-  Literal output;
+  Literal output(ShapeUtil::MakeShape(C64, {1}));
   output.PopulateR1<complex64>({{77, 88}});
   auto expected = Literal::CreateR1<complex64>({{77, 88}});
   EXPECT_EQ(output, *expected);
 }
 
 TEST_F(LiteralUtilTest, PopulateR2C64) {
-  Literal output;
+  Literal output(ShapeUtil::MakeShape(C64, {2, 2}));
   output.PopulateR2<complex64>({{{7, 8}, {9, 10}}, {{1, 2}, {3, 4}}});
   auto expected =
       Literal::CreateR2<complex64>({{{7, 8}, {9, 10}}, {{1, 2}, {3, 4}}});
@@ -716,78 +746,78 @@ TEST_F(LiteralUtilTest, PopulateR2C64) {
 }
 
 TEST_F(LiteralUtilTest, PopulateWithValueR0BF16) {
-  Literal output;
+  Literal output(ShapeUtil::MakeShape(BF16, {}));
   bfloat16 h(0.25f);
-  output.PopulateWithValue<bfloat16>(h, {});
+  output.PopulateWithValue<bfloat16>(h);
   auto expected = Literal::CreateR0<bfloat16>(h);
   EXPECT_EQ(output, *expected);
 }
 
 TEST_F(LiteralUtilTest, PopulateWithValueR1BF16) {
-  Literal output;
+  Literal output(ShapeUtil::MakeShape(BF16, {3}));
   bfloat16 h(0.5f);
-  output.PopulateWithValue<bfloat16>(h, {3});
+  output.PopulateWithValue<bfloat16>(h);
   auto expected = Literal::CreateR1<bfloat16>({h, h, h});
   EXPECT_EQ(output, *expected);
 }
 
 TEST_F(LiteralUtilTest, PopulateWithValueR2BF16) {
-  Literal output;
+  Literal output(ShapeUtil::MakeShape(BF16, {2, 2}));
   bfloat16 h(2.0f);
-  output.PopulateWithValue<bfloat16>(h, {2, 2});
+  output.PopulateWithValue<bfloat16>(h);
   auto expected = Literal::CreateR2<bfloat16>({{h, h}, {h, h}});
   EXPECT_EQ(output, *expected);
 }
 
 TEST_F(LiteralUtilTest, PopulateWithValueR0F32) {
-  Literal output;
-  output.PopulateWithValue<float>(2.5f, {});
+  Literal output(ShapeUtil::MakeShape(F32, {}));
+  output.PopulateWithValue<float>(2.5f);
   auto expected = Literal::CreateR0<float>(2.5f);
   EXPECT_EQ(output, *expected);
 }
 
 TEST_F(LiteralUtilTest, PopulateWithValueR1S64) {
-  Literal output;
-  output.PopulateWithValue<int64>(-7, {3});
+  Literal output(ShapeUtil::MakeShape(S64, {3}));
+  output.PopulateWithValue<int64>(-7);
   auto expected = Literal::CreateR1<int64>({-7, -7, -7});
   EXPECT_EQ(output, *expected);
 }
 
 TEST_F(LiteralUtilTest, PopulateWithValueR2U64) {
-  Literal output;
-  output.PopulateWithValue<uint64>(42, {2, 2});
+  Literal output(ShapeUtil::MakeShape(U64, {2, 2}));
+  output.PopulateWithValue<uint64>(42);
   auto expected = Literal::CreateR2<uint64>({{42, 42}, {42, 42}});
   EXPECT_EQ(output, *expected);
 }
 
 TEST_F(LiteralUtilTest, PopulateWithValueR2C64) {
-  Literal output;
-  output.PopulateWithValue<complex64>({4, 2}, {2, 2});
+  Literal output(ShapeUtil::MakeShape(C64, {2, 2}));
+  output.PopulateWithValue<complex64>({4, 2});
   auto expected =
       Literal::CreateR2<complex64>({{{4, 2}, {4, 2}}, {{4, 2}, {4, 2}}});
   EXPECT_EQ(output, *expected);
 }
 
 TEST_F(LiteralUtilTest, PopulateWithValueR0F16) {
-  Literal output;
+  Literal output(ShapeUtil::MakeShape(F16, {}));
   half h(0.25f);
-  output.PopulateWithValue<half>(h, {});
+  output.PopulateWithValue<half>(h);
   auto expected = Literal::CreateR0<half>(h);
   EXPECT_EQ(output, *expected);
 }
 
 TEST_F(LiteralUtilTest, PopulateWithValueR1F16) {
-  Literal output;
+  Literal output(ShapeUtil::MakeShape(F16, {3}));
   half h(0.5f);
-  output.PopulateWithValue<half>(h, {3});
+  output.PopulateWithValue<half>(h);
   auto expected = Literal::CreateR1<half>({h, h, h});
   EXPECT_EQ(output, *expected);
 }
 
 TEST_F(LiteralUtilTest, PopulateWithValueR2F16) {
-  Literal output;
+  Literal output(ShapeUtil::MakeShape(F16, {2, 2}));
   half h(2.0f);
-  output.PopulateWithValue<half>(h, {2, 2});
+  output.PopulateWithValue<half>(h);
   auto expected = Literal::CreateR2<half>({{h, h}, {h, h}});
   EXPECT_EQ(output, *expected);
 }
@@ -803,7 +833,7 @@ TEST_F(LiteralUtilTest, ReplicateR2U32) {
   EXPECT_EQ(*output, *expected);
 }
 
-TEST_F(LiteralUtilTest, Copy) {
+TEST_F(LiteralUtilTest, CopySliceFrom) {
   const int64 dimensions[] = {17, 15, 34, 21};
   const int64 layouts[][4] = {
       {3, 2, 1, 0}, {0, 2, 1, 3}, {0, 1, 2, 3}, {2, 0, 3, 1}, {1, 3, 0, 2}};
@@ -826,7 +856,7 @@ TEST_F(LiteralUtilTest, Copy) {
     const int64 src_base[] = {3, 1, 5, 7};
     const int64 dest_base[] = {6, 4, 12, 2};
     const int64 copy_size[] = {7, 8, 11, 9};
-    TF_EXPECT_OK(blank->Copy(*source, src_base, dest_base, copy_size));
+    TF_EXPECT_OK(blank->CopySliceFrom(*source, src_base, dest_base, copy_size));
 
     std::vector<int64> source_indexes(TF_ARRAYSIZE(dimensions), 0);
     std::vector<int64> blank_indexes(TF_ARRAYSIZE(dimensions), 0);
@@ -849,16 +879,16 @@ TEST_F(LiteralUtilTest, Copy) {
   }
 }
 
-TEST_F(LiteralUtilTest, CopyScalars) {
+TEST_F(LiteralUtilTest, CopyFromScalars) {
   auto zero = Literal::CreateR0<uint32>(0);
   auto nine = Literal::CreateR0<uint32>(9);
-  TF_EXPECT_OK(zero->Copy(*nine, {}, {}, {}));
+  TF_EXPECT_OK(zero->CopyFrom(*nine));
   EXPECT_EQ(*zero, *nine);
 
   auto vect = Literal::CreateR1<uint32>({3, 4, 9, 12, 5, 17, 21});
-  TF_EXPECT_OK(zero->Copy(*vect, {5}, {}, {}));
+  TF_EXPECT_OK(zero->CopySliceFrom(*vect, {5}, {}, {}));
   EXPECT_EQ(zero->Get<uint32>({}), 17);
-  TF_EXPECT_OK(vect->Copy(*zero, {}, {4}, {}));
+  TF_EXPECT_OK(vect->CopySliceFrom(*zero, {}, {4}, {}));
   EXPECT_EQ(vect->Get<uint32>({4}), 17);
 }
 
@@ -872,7 +902,7 @@ TEST_F(LiteralUtilTest, CopyFromAndToZeroElement) {
     const auto empty = Literal::CreateFromShape(empty_r1_shape);
     auto nine = Literal::CreateR1<float>({9});
 
-    TF_EXPECT_OK(nine->Copy(*empty, {0}, {0}, {0}));
+    TF_EXPECT_OK(nine->CopySliceFrom(*empty, {0}, {0}, {0}));
     EXPECT_EQ(*nine, *const_nine);
   }
 
@@ -881,18 +911,101 @@ TEST_F(LiteralUtilTest, CopyFromAndToZeroElement) {
     const auto empty = Literal::CreateFromShape(empty_r1_shape);
     auto nine = Literal::CreateR1<float>({9});
 
-    TF_EXPECT_OK(empty->Copy(*nine, {0}, {0}, {0}));
+    TF_EXPECT_OK(empty->CopySliceFrom(*nine, {0}, {0}, {0}));
     EXPECT_EQ(*empty, *const_empty);
   }
 }
 
+TEST_F(LiteralUtilTest, CopyFromNilShape) {
+  Literal nil_literal0(ShapeUtil::MakeNil());
+  Literal nil_literal1(ShapeUtil::MakeNil());
+  // This doesn't actually do any copying, but it should succeed.
+  TF_ASSERT_OK(nil_literal0.CopyFrom(nil_literal1));
+}
+
+TEST_F(LiteralUtilTest, CopyFromArrays) {
+  auto scalar_42 = Literal::CreateR0<float>(42.0);
+  auto scalar_123 = Literal::CreateR0<float>(123.0);
+  EXPECT_NE(*scalar_42, *scalar_123);
+  TF_ASSERT_OK(scalar_42->CopyFrom(*scalar_123, /*dest_shape_index=*/{},
+                                   /*src_shape_index=*/{}));
+  EXPECT_EQ(*scalar_42, *scalar_123);
+  EXPECT_EQ(scalar_42->Get<float>({}), 123.0f);
+
+  auto matrix_1234 = Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
+  auto matrix_5678 = Literal::CreateR2<float>({{5.0, 6.0}, {7.0, 8.0}});
+  EXPECT_NE(*matrix_1234, *matrix_5678);
+  EXPECT_EQ(matrix_1234->Get<float>({0, 0}), 1.0f);
+  TF_ASSERT_OK(matrix_1234->CopyFrom(*matrix_5678, /*dest_shape_index=*/{},
+                                     /*src_shape_index=*/{}));
+  EXPECT_EQ(*matrix_1234, *matrix_5678);
+  EXPECT_EQ(matrix_1234->Get<float>({0, 0}), 5.0f);
+}
+
+TEST_F(LiteralUtilTest, CopyFromTuples) {
+  auto matrix = Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
+  Literal nil_literal(ShapeUtil::MakeNil());
+  auto nested_tuple = Literal::MakeTuple(
+      {matrix.get(),
+       Literal::MakeTuple({Literal::CreateR0<int32>(42).get(),
+                           Literal::CreateR1<double>({23.0, 44.0}).get(),
+                           &nil_literal})
+           .get()});
+  // Create a tuple the same shape as the inner tuple of nested_tuple but with
+  // different values..
+  auto tuple = Literal::MakeTuple({Literal::CreateR0<int32>(-5).get(),
+                                   Literal::CreateR1<double>({2.0, 4.0}).get(),
+                                   &nil_literal});
+
+  EXPECT_EQ(*matrix, LiteralView::Create(*nested_tuple, {0}));
+  EXPECT_EQ(nested_tuple->Get<int32>({}, {1, 0}), 42);
+  EXPECT_EQ(nested_tuple->Get<double>({0}, {1, 1}), 23.0);
+  EXPECT_EQ(nested_tuple->Get<double>({1}, {1, 1}), 44.0);
+
+  // Overwrite the inner tuple element of nested_tuple with the contents of
+  // 'tuple'.
+  TF_ASSERT_OK(nested_tuple->CopyFrom(*tuple, /*dest_shape_index=*/{1},
+                                      /*src_shape_index=*/{}));
+
+  // The matrix element should be unchanged.
+  EXPECT_EQ(*matrix, LiteralView::Create(*nested_tuple, {0}));
+
+  // The tuple element should have been copied from 'tuple'.
+  EXPECT_EQ(nested_tuple->Get<int32>({}, {1, 0}), -5);
+  EXPECT_EQ(nested_tuple->Get<double>({0}, {1, 1}), 2.0);
+  EXPECT_EQ(nested_tuple->Get<double>({1}, {1, 1}), 4.0);
+}
+TEST_F(LiteralUtilTest, CopyBetweenSameTuple) {
+  auto tuple = Literal::MakeTuple(
+      {Literal::CreateR0<int32>(-2).get(), Literal::CreateR0<int32>(4).get()});
+
+  EXPECT_EQ(tuple->Get<int32>({}, {0}), -2);
+  EXPECT_EQ(tuple->Get<int32>({}, {1}), 4);
+
+  // Copy from one element to the other.
+  TF_ASSERT_OK(tuple->CopyFrom(*tuple, /*dest_shape_index=*/{1},
+                               /*src_shape_index=*/{0}));
+
+  EXPECT_EQ(tuple->Get<int32>({}, {0}), -2);
+  EXPECT_EQ(tuple->Get<int32>({}, {1}), -2);
+}
+
+TEST_F(LiteralUtilTest, CopyFromDifferentShapes) {
+  auto matrix = Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
+  auto vector = Literal::CreateR1<float>({5.0, 7.0});
+  Status status = matrix->CopyFrom(*vector);
+  ASSERT_FALSE(status.ok());
+  ASSERT_THAT(status.error_message(),
+              HasSubstr("Destination subshape incompatible"));
+}
+
 TEST_F(LiteralUtilTest, F16) {
   // Verify that the internal data views are consistent and that they
   // are in little endian format
   // TODO - modify if we make the data format machine endianess dependent
   auto m1 = Literal::CreateFromShape(ShapeUtil::MakeShape(F16, {2, 2}));
   Literal* l1 = m1.get();
-  const char* d1 = static_cast<const char*>(l1->InternalData());
+  const char* d1 = reinterpret_cast<const char*>(l1->data<half>().data());
   EXPECT_EQ(d1[0], 0);
   EXPECT_EQ(d1[1], 0);
   EXPECT_EQ(d1[2], 0);
@@ -901,13 +1014,12 @@ TEST_F(LiteralUtilTest, F16) {
   EXPECT_EQ(d1[5], 0);
   EXPECT_EQ(d1[6], 0);
   EXPECT_EQ(d1[7], 0);
-  EXPECT_EQ(l1->InternalData(), l1->MutableInternalData());
 
   half h1(1.0f);
   half h2(2.0f);
   auto m2 = Literal::CreateR2<half>({{h1, h2}, {h2, h1}});
   Literal* l2 = m2.get();
-  const char* d2 = static_cast<const char*>(l2->InternalData());
+  const char* d2 = reinterpret_cast<const char*>(l2->data<half>().data());
   EXPECT_EQ(d2[0], 0);
   EXPECT_EQ(d2[1], 0x3C);
   EXPECT_EQ(d2[2], 0);
@@ -916,7 +1028,6 @@ TEST_F(LiteralUtilTest, F16) {
   EXPECT_EQ(d2[5], 0x40);
   EXPECT_EQ(d2[6], 0);
   EXPECT_EQ(d2[7], 0x3C);
-  EXPECT_EQ(l2->InternalData(), l2->MutableInternalData());
 }
 
 TEST_F(LiteralUtilTest, Populate) {
@@ -941,7 +1052,9 @@ TEST_F(LiteralUtilTest, Populate) {
     auto generator = [&](tensorflow::gtl::ArraySlice<int64> indexes) -> uint32 {
       // Offsets from linear index just to avoid R0 literals to be initialized
       // with zero.
-      return literal->LinearIndex(indexes) + 17;
+      return IndexUtil::MultidimensionalIndexToLinearIndex(literal->shape(),
+                                                           indexes) +
+             17;
     };
     TF_EXPECT_OK(literal->Populate<uint32>(generator));
 
@@ -1118,16 +1231,18 @@ TEST_F(LiteralUtilTest, CopyFromProto_Bool) {
   for (int len = 0; len < 25; ++len) {
     p.mutable_shape()->clear_dimensions();
     p.mutable_shape()->add_dimensions(len);
+    LayoutUtil::SetToDefaultLayout(p.mutable_shape());
     p.clear_preds();
     for (int i = 0; i < len; ++i) {
       p.add_preds((i % 2) == (len % 2));
     }
 
-    Literal literal(p);
-    ASSERT_EQ(len, literal.preds_size());
+    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> literal,
+                            Literal::CreateFromProto(p));
+    ASSERT_EQ(len, literal->data<bool>().size());
     int i = 0;
-    for (auto it = literal.preds().begin(); it < literal.preds().end(); ++it) {
-      EXPECT_EQ((i % 2) == (len % 2), *it);
+    for (bool value : literal->data<bool>()) {
+      EXPECT_EQ((i % 2) == (len % 2), value);
       ++i;
     }
   }
@@ -1141,8 +1256,7 @@ TEST_F(LiteralUtilTest, ToProto_f16) {
   auto m = Literal::CreateR2<half>({{h1, h2}, {h2, h1}});
   Literal* l = m.get();
   EXPECT_EQ(4, ShapeUtil::ElementsIn(l->shape()));
-  EXPECT_EQ(4, l->f16s().size());
-  EXPECT_EQ(4, l->f16s_size());
+  EXPECT_EQ(4, l->data<half>().size());
 
   LiteralProto p = l->ToProto();
   EXPECT_EQ(4, ShapeUtil::ElementsIn(p.shape()));
@@ -1168,17 +1282,12 @@ TEST_F(LiteralUtilTest, CopyFromProto_f16) {
   p.mutable_shape()->set_element_type(F16);
   p.mutable_shape()->clear_dimensions();
   p.mutable_shape()->add_dimensions(4);
+  LayoutUtil::SetToDefaultLayout(p.mutable_shape());
   p.clear_f16s();
   p.set_f16s(half_vals, 8);
-
-  Literal literal(p);
-  ASSERT_EQ(4, literal.f16s_size());
-  ASSERT_EQ(h1, literal.f16s(0));
-  ASSERT_EQ(h2, literal.f16s(1));
-  ASSERT_EQ(h2, literal.f16s(2));
-  ASSERT_EQ(h1, literal.f16s(3));
-
-  const std::vector<half>& r = literal.f16s();
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> literal,
+                          Literal::CreateFromProto(p));
+  auto r = literal->data<half>();
   ASSERT_EQ(4, r.size());
   ASSERT_EQ(h1, r[0]);
   ASSERT_EQ(h2, r[1]);
@@ -1186,24 +1295,402 @@ TEST_F(LiteralUtilTest, CopyFromProto_f16) {
   ASSERT_EQ(h1, r[3]);
 }
 
-TEST_F(LiteralUtilTest, Subliterals) {
+TEST_F(LiteralUtilTest, LiteralViewTest) {
+  auto scalar = Literal::CreateR0<float>(1.0);
+  auto matrix = Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
+  auto tuple = Literal::MakeTuple({scalar.get(), matrix.get()});
+  auto nested_tuple = Literal::MakeTuple({tuple.get(), scalar.get()});
+  Literal nil(ShapeUtil::MakeNil());
+
+  EXPECT_EQ(LiteralView::Create(*scalar, {}), *scalar);
+  EXPECT_EQ(LiteralView::Create(*matrix, {}), *matrix);
+  EXPECT_EQ(LiteralView::Create(*tuple, {}), *tuple);
+  EXPECT_EQ(LiteralView::Create(*nested_tuple, {}), *nested_tuple);
+  EXPECT_EQ(LiteralView::Create(nil, {}), nil);
+
+  EXPECT_EQ(LiteralView::Create(*tuple, {0}), *scalar);
+  EXPECT_EQ(LiteralView::Create(*tuple, {1}), *matrix);
+
+  EXPECT_EQ(LiteralView::Create(*nested_tuple, {0}), *tuple);
+  EXPECT_EQ(LiteralView::Create(*nested_tuple, {0, 0}), *scalar);
+  EXPECT_EQ(LiteralView::Create(*nested_tuple, {0, 1}), *matrix);
+  EXPECT_EQ(LiteralView::Create(*nested_tuple, {1}), *scalar);
+}
+
+TEST_F(LiteralUtilTest, MutatingLiteralView) {
+  auto scalar = Literal::CreateR0<float>(1.0);
+  auto matrix = Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
+  auto tuple = Literal::MakeTuple({scalar.get(), matrix.get()});
+  auto nested_tuple = Literal::MakeTuple({tuple.get(), scalar.get()});
+  // Verify that changing the underlying data beneath the view changes the
+  // data of the view itself.
+  const auto nested_tuple_view = LiteralView::Create(*nested_tuple);
+  EXPECT_EQ(
+      nested_tuple->Get<float>(/*multi_index=*/{}, /*shape_index=*/{0, 0}),
+      1.0f);
+  EXPECT_EQ(nested_tuple_view.Get<float>(/*multi_index=*/{},
+                                         /*shape_index=*/{0, 0}),
+            1.0f);
+  nested_tuple->Set<float>(/*multi_index=*/{}, /*shape_index=*/{0, 0}, 555.0f);
+  EXPECT_EQ(
+      nested_tuple->Get<float>(/*multi_index=*/{}, /*shape_index=*/{0, 0}),
+      555.0f);
+  EXPECT_EQ(nested_tuple_view.Get<float>(/*multi_index=*/{},
+                                         /*shape_index=*/{0, 0}),
+            555.0f);
+}
+
+TEST_F(LiteralUtilTest, LiteralViewOfALiteralView) {
   auto scalar = Literal::CreateR0<float>(1.0);
   auto matrix = Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
   auto tuple = Literal::MakeTuple({scalar.get(), matrix.get()});
   auto nested_tuple = Literal::MakeTuple({tuple.get(), scalar.get()});
 
-  EXPECT_EQ(&scalar->GetSubliteral(/*index=*/{}), scalar.get());
-  EXPECT_EQ(&matrix->GetSubliteral(/*index=*/{}), matrix.get());
-  EXPECT_EQ(&tuple->GetSubliteral(/*index=*/{}), tuple.get());
-  EXPECT_EQ(&nested_tuple->GetSubliteral(/*index=*/{}), nested_tuple.get());
+  const auto nested_tuple_view = LiteralView::Create(*nested_tuple);
+  const auto tuple_view =
+      LiteralView::Create(nested_tuple_view, /*view_root=*/{0});
+  const auto matrix_view = LiteralView::Create(tuple_view, /*view_root=*/{1});
+  EXPECT_EQ(matrix_view, *Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}}));
+}
+
+TEST_F(LiteralUtilTest, LiteralMove) {
+  std::unique_ptr<Literal> matrix =
+      Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
+  Literal literal(std::move(*matrix));
+
+  EXPECT_TRUE(
+      ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {2, 2}), literal.shape()));
+  EXPECT_EQ(literal.Get<float>({0, 0}), 1.0);
+  EXPECT_EQ(literal.Get<float>({0, 1}), 2.0);
+  EXPECT_EQ(literal.Get<float>({1, 0}), 3.0);
+  EXPECT_EQ(literal.Get<float>({1, 1}), 4.0);
+}
 
-  EXPECT_EQ(tuple->GetSubliteral(/*index=*/{0}), *scalar);
-  EXPECT_EQ(tuple->GetSubliteral(/*index=*/{1}), *matrix);
+TEST_F(LiteralUtilTest, DecomposeTuple) {
+  Literal nil_literal(ShapeUtil::MakeNil());
+  auto nested_tuple = Literal::MakeTuple(
+      {Literal::CreateR2<int32>({{1, 2}, {3, 4}}).get(),
+       Literal::MakeTuple({Literal::CreateR0<int32>(42).get(),
+                           Literal::CreateR1<double>({23.0, 44.0}).get(),
+                           &nil_literal})
+           .get(),
+       &nil_literal});
+
+  EXPECT_FALSE(ShapeUtil::IsNil(nested_tuple->shape()));
+  std::vector<Literal> elements = nested_tuple->DecomposeTuple();
+  EXPECT_TRUE(ShapeUtil::IsNil(nested_tuple->shape()));
+
+  ASSERT_EQ(elements.size(), 3);
+
+  EXPECT_TRUE(ShapeUtil::Compatible(elements[0].shape(),
+                                    ShapeUtil::MakeShape(S32, {2, 2})));
+  EXPECT_EQ(elements[0].Get<int32>({0, 0}), 1);
+  EXPECT_EQ(elements[0].Get<int32>({0, 1}), 2);
+  EXPECT_EQ(elements[0].Get<int32>({1, 0}), 3);
+  EXPECT_EQ(elements[0].Get<int32>({1, 1}), 4);
+
+  EXPECT_TRUE(ShapeUtil::Compatible(
+      elements[1].shape(),
+      ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(S32, {}),
+                                 ShapeUtil::MakeShape(F64, {2}),
+                                 ShapeUtil::MakeNil()})));
+  EXPECT_EQ(elements[1].Get<int32>({}, /*shape_index=*/{0}), 42);
+  EXPECT_EQ(elements[1].Get<double>({0}, /*shape_index=*/{1}), 23.0);
+  EXPECT_EQ(elements[1].Get<double>({1}, /*shape_index=*/{1}), 44.0);
+
+  EXPECT_TRUE(ShapeUtil::Compatible(elements[2].shape(), ShapeUtil::MakeNil()));
+}
+
+TEST_F(LiteralUtilTest, DecomposeEmptyTuple) {
+  Literal nil_literal(ShapeUtil::MakeNil());
+  std::vector<Literal> elements = nil_literal.DecomposeTuple();
+  EXPECT_EQ(elements.size(), 0);
+}
+
+TEST_F(LiteralUtilTest, MoveIntoTuple) {
+  std::vector<Literal> elements;
+  elements.push_back(std::move(*Literal::CreateR0<float>(1.0)));
+  elements.push_back(std::move(*Literal::CreateR1<int32>({4, 8})));
+  elements.push_back(std::move(
+      *Literal::MakeTuple({Literal::CreateR0<int32>(42).get(),
+                           Literal::CreateR1<double>({23.0, 44.0}).get()})
+
+          ));
+
+  Literal literal = Literal::MoveIntoTuple(&elements);
+  ASSERT_TRUE(ShapeUtil::IsTuple(literal.shape()));
+  ASSERT_EQ(ShapeUtil::TupleElementCount(literal.shape()), 3);
+
+  EXPECT_EQ(literal.Get<float>({}, /*shape_index=*/{0}), 1.0);
+  EXPECT_EQ(literal.Get<int32>({0}, /*shape_index=*/{1}), 4);
+  EXPECT_EQ(literal.Get<int32>({1}, /*shape_index=*/{1}), 8);
+  EXPECT_EQ(literal.Get<int32>({}, /*shape_index=*/{2, 0}), 42);
+  EXPECT_EQ(literal.Get<double>({0}, /*shape_index=*/{2, 1}), 23.0);
+  EXPECT_EQ(literal.Get<double>({1}, /*shape_index=*/{2, 1}), 44.0);
+
+  for (const Literal& element : elements) {
+    EXPECT_TRUE(ShapeUtil::IsNil(element.shape()));
+  }
+}
+
+TEST_F(LiteralUtilTest, MoveIntoEmptyTuple) {
+  Literal literal = Literal::MoveIntoTuple({});
+  ASSERT_TRUE(ShapeUtil::IsTuple(literal.shape()));
+  ASSERT_EQ(ShapeUtil::TupleElementCount(literal.shape()), 0);
+}
+
+TEST_F(LiteralUtilTest, LiteralMoveAssignment) {
+  Literal literal;
+  EXPECT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeNil(), literal.shape()));
+
+  std::unique_ptr<Literal> matrix =
+      Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
+  literal = std::move(*matrix);
+
+  EXPECT_TRUE(
+      ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {2, 2}), literal.shape()));
+  EXPECT_EQ(literal.Get<float>({0, 0}), 1.0);
+  EXPECT_EQ(literal.Get<float>({0, 1}), 2.0);
+  EXPECT_EQ(literal.Get<float>({1, 0}), 3.0);
+  EXPECT_EQ(literal.Get<float>({1, 1}), 4.0);
+}
+
+TEST_F(LiteralUtilTest, LiteralViewCopy) {
+  std::unique_ptr<Literal> matrix =
+      Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
+  const auto matrix_view = LiteralView::Create(*matrix);
+  LiteralView matrix_view_copy(matrix_view);
+
+  EXPECT_EQ(matrix_view_copy.Get<float>({0, 0}), 1.0);
+  EXPECT_EQ(matrix_view_copy.Get<float>({0, 1}), 2.0);
+  EXPECT_EQ(matrix_view_copy.Get<float>({1, 0}), 3.0);
+  EXPECT_EQ(matrix_view_copy.Get<float>({1, 1}), 4.0);
+}
+
+TEST_F(LiteralUtilTest, GetSetTuple) {
+  auto tuple = Literal::MakeTuple(
+      {Literal::CreateR0<float>(42.0).get(),
+       Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}}).get()});
+  EXPECT_EQ(tuple->Get<float>(/*multi_index=*/{}, /*shape_index=*/{0}), 42.0);
+  tuple->Set<float>(/*multi_index=*/{}, /*shape_index=*/{0}, -5.0);
+  EXPECT_EQ(tuple->Get<float>(/*multi_index=*/{}, /*shape_index=*/{0}), -5.0);
+
+  EXPECT_EQ(tuple->Get<float>(/*multi_index=*/{1, 0}, /*shape_index=*/{1}),
+            3.0);
+  tuple->Set<float>(/*multi_index=*/{1, 0}, /*shape_index=*/{1}, -4.0);
+  EXPECT_EQ(tuple->Get<float>(/*multi_index=*/{1, 0}, /*shape_index=*/{1}),
+            -4.0);
+}
+
+TEST_F(LiteralUtilTest, CreateFromShapeZeroInitialized) {
+  // Literals constructed using CreateFromShape should be zero initialized.
+  std::unique_ptr<Literal> scalar_f32 =
+      Literal::CreateFromShape(ShapeUtil::MakeShape(F32, {}));
+  EXPECT_EQ(scalar_f32->Get<float>({}), 0.0);
+  EXPECT_TRUE(scalar_f32->IsAll(0));
+
+  std::unique_ptr<Literal> vector_s32 =
+      Literal::CreateFromShape(ShapeUtil::MakeShape(S32, {3}));
+  EXPECT_EQ(vector_s32->Get<int32>({0}), 0);
+  EXPECT_EQ(vector_s32->Get<int32>({1}), 0);
+  EXPECT_EQ(vector_s32->Get<int32>({2}), 0);
+  EXPECT_TRUE(vector_s32->IsAll(0));
+
+  std::unique_ptr<Literal> tuple =
+      Literal::CreateFromShape(ShapeUtil::MakeTupleShape(
+          {ShapeUtil::MakeShape(F64, {}), ShapeUtil::MakeShape(PRED, {2}),
+           ShapeUtil::MakeShape(U64, {2, 1}), ShapeUtil::MakeShape(C64, {})}));
+
+  EXPECT_EQ(tuple->Get<double>({}, {0}), 0.0);
+  EXPECT_EQ(tuple->Get<bool>({0}, {1}), false);
+  EXPECT_EQ(tuple->Get<bool>({1}, {1}), false);
+  EXPECT_EQ(tuple->Get<uint64>({0, 0}, {2}), 0);
+  EXPECT_EQ(tuple->Get<uint64>({1, 0}, {2}), 0);
+  EXPECT_EQ(tuple->Get<complex64>({}, {3}), complex64(0.0f, 0.0f));
+}
+
+TEST_F(LiteralUtilTest, ProtoRoundTrip) {
+  // Test serializing then deserializing a Literal through a proto.
+  auto one_f32 = Literal::CreateR0<float>(1.0);
+  auto two_f32 = Literal::CreateR0<float>(2.0);
+  auto vector_int8 = Literal::CreateR1<int8>({-128, 0, 2, 4, 7, 56, 127});
+  auto vector_c64 = Literal::CreateR1<complex64>({{1.0, 2.0}, {3.0, 4.0}});
+  auto vector_bfloat16 = Literal::CreateR1<bfloat16>(
+      {bfloat16{-1.0}, bfloat16{2.0}, bfloat16{-3.0}});
+  auto vector_half =
+      Literal::CreateR1<half>({half{10.0}, half{20.0}, half{-30.0}});
+  auto matrix_pred =
+      Literal::CreateR2<bool>({{true, false, true}, {false, false, true}});
+  auto tuple = Literal::MakeTuple(
+      {one_f32.get(), vector_half.get(), matrix_pred.get(), matrix_pred.get()});
+  Literal nil_literal(ShapeUtil::MakeNil());
+  auto nested_tuple = Literal::MakeTuple(
+      {tuple.get(), vector_bfloat16.get(), tuple.get(), &nil_literal});
+
+  auto to_from_proto = [](const Literal& literal) -> Literal {
+    return std::move(*Literal::CreateFromProto(literal.ToProto()).ValueOrDie());
+  };
+
+  EXPECT_EQ(*one_f32, to_from_proto(*one_f32));
+  EXPECT_EQ(*vector_c64, to_from_proto(*vector_c64));
+  EXPECT_EQ(*vector_bfloat16, to_from_proto(*vector_bfloat16));
+  EXPECT_EQ(*matrix_pred, to_from_proto(*matrix_pred));
+  EXPECT_EQ(*tuple, to_from_proto(*tuple));
+  EXPECT_EQ(*nested_tuple, to_from_proto(*nested_tuple));
+  EXPECT_EQ(nil_literal, to_from_proto(nil_literal));
+
+  EXPECT_NE(*one_f32, *two_f32);
+  EXPECT_NE(*one_f32, to_from_proto(*two_f32));
+}
+
+TEST_F(LiteralUtilTest, InvalidProtoNoValues) {
+  // Proto contains a shape, but no values.
+  LiteralProto proto;
+  *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {3});
+  Status status = Literal::CreateFromProto(proto).status();
+  ASSERT_FALSE(status.ok());
+  ASSERT_THAT(status.error_message(),
+              HasSubstr("Expected 3 elements in LiteralProto"));
+}
+
+TEST_F(LiteralUtilTest, InvalidProtoNoShape) {
+  // Proto contains values, but no shape.
+  LiteralProto proto;
+  proto.add_preds(false);
+  proto.add_preds(true);
+  proto.add_preds(false);
+  Status status = Literal::CreateFromProto(proto).status();
+  ASSERT_FALSE(status.ok());
+  ASSERT_THAT(status.error_message(), HasSubstr("LiteralProto has no shape"));
+}
+
+TEST_F(LiteralUtilTest, InvalidProtoWrongContainer) {
+  // Proto contains values in wrong container.
+  LiteralProto proto;
+  *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {3});
+  proto.add_preds(false);
+  proto.add_preds(true);
+  proto.add_preds(false);
+  Status status = Literal::CreateFromProto(proto).status();
+  ASSERT_FALSE(status.ok());
+  ASSERT_THAT(status.error_message(),
+              HasSubstr("Expected 3 elements in LiteralProto"));
+}
+
+TEST_F(LiteralUtilTest, InvalidProtoTooFewValues) {
+  // Proto contains too few values.
+  LiteralProto proto;
+  *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {42, 2});
+  proto.add_f32s(1.0);
+  proto.add_f32s(2.0);
+  proto.add_f32s(3.0);
+  Status status = Literal::CreateFromProto(proto).status();
+  ASSERT_FALSE(status.ok());
+  ASSERT_THAT(status.error_message(),
+              HasSubstr("Expected 84 elements in LiteralProto"));
+}
+
+TEST_F(LiteralUtilTest, InvalidProtoTooManyValues) {
+  // Proto contains too many values.
+  LiteralProto proto;
+  *proto.mutable_shape() = ShapeUtil::MakeShape(S32, {2});
+  proto.add_s32s(42);
+  proto.add_s32s(-10);
+  proto.add_s32s(100);
+  Status status = Literal::CreateFromProto(proto).status();
+  ASSERT_FALSE(status.ok());
+  ASSERT_THAT(status.error_message(),
+              HasSubstr("Expected 2 elements in LiteralProto"));
+}
+
+TEST_F(LiteralUtilTest, InvalidProtoMissingLayout) {
+  // Proto shape missing layout.
+  LiteralProto proto;
+  *proto.mutable_shape() = ShapeUtil::MakeShape(PRED, {2, 2});
+  LayoutUtil::ClearLayout(proto.mutable_shape());
+  proto.add_preds(true);
+  proto.add_preds(false);
+  proto.add_preds(true);
+  proto.add_preds(false);
+  Status status = Literal::CreateFromProto(proto).status();
+  ASSERT_FALSE(status.ok());
+  ASSERT_THAT(status.error_message(), HasSubstr("LiteralProto has no layout"));
+}
+
+TEST_F(LiteralUtilTest, InvalidProtoTooFewTupleElements) {
+  // Proto has the too few tuple elements.
+  LiteralProto proto;
+  *proto.mutable_shape() = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(PRED, {2}), ShapeUtil::MakeShape(F32, {})});
+  LiteralProto* element0 = proto.add_tuple_literals();
+  *element0->mutable_shape() =
+      ShapeUtil::GetTupleElementShape(proto.shape(), 0);
+  element0->add_preds(false);
+  element0->add_preds(true);
+
+  Status status = Literal::CreateFromProto(proto).status();
+  ASSERT_FALSE(status.ok());
+  ASSERT_THAT(status.error_message(), HasSubstr("Expected 2 tuple elements"));
+}
+
+TEST_F(LiteralUtilTest, InvalidProtoTooManyTupleElements) {
+  // Proto has the too many tuple elements.
+  LiteralProto proto;
+  *proto.mutable_shape() = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(PRED, {2}), ShapeUtil::MakeShape(F32, {})});
+  LiteralProto* element0 = proto.add_tuple_literals();
+  *element0->mutable_shape() =
+      ShapeUtil::GetTupleElementShape(proto.shape(), 0);
+  element0->add_preds(false);
+  element0->add_preds(true);
+  LiteralProto* element1 = proto.add_tuple_literals();
+  *element1->mutable_shape() =
+      ShapeUtil::GetTupleElementShape(proto.shape(), 1);
+  element1->add_f32s(42.0);
+  LiteralProto* element2 = proto.add_tuple_literals();
+  *element2->mutable_shape() = ShapeUtil::MakeShape(F32, {});
+  element2->add_f32s(123.0);
+
+  Status status = Literal::CreateFromProto(proto).status();
+  ASSERT_FALSE(status.ok());
+  ASSERT_THAT(status.error_message(), HasSubstr("Expected 2 tuple elements"));
+}
+
+TEST_F(LiteralUtilTest, SortSparseElements) {
+  auto literal =
+      Literal::CreateSparse<float>({10, 10, 10}, SparseIndexArray(10, 3), {});
+  literal->AppendSparseElement<float>({2, 3, 4}, 2.0);
+  literal->AppendSparseElement<float>({3, 4, 5}, 3.0);
+  literal->AppendSparseElement<float>({1, 2, 3}, 1.0);
+  literal->SortSparseElements();
+  ASSERT_EQ(literal->ToString(false),
+            "f32[10,10,10]{[1, 2, 3]: 1, [2, 3, 4]: 2, [3, 4, 5]: 3}");
+}
 
-  EXPECT_EQ(nested_tuple->GetSubliteral(/*index=*/{0}), *tuple);
-  EXPECT_EQ(nested_tuple->GetSubliteral(/*index=*/{0, 0}), *scalar);
-  EXPECT_EQ(nested_tuple->GetSubliteral(/*index=*/{0, 1}), *matrix);
-  EXPECT_EQ(nested_tuple->GetSubliteral(/*index=*/{1}), *scalar);
+TEST_F(LiteralUtilTest, GetSparseElementAsString) {
+  std::vector<int64> dimensions = {10, 10, 10};
+  SparseIndexArray indices(10, {{1, 2, 3}, {2, 3, 4}, {3, 4, 5}});
+
+  ASSERT_EQ(
+      Literal::CreateSparse<bool>(dimensions, indices, {true, false, true})
+          ->GetSparseElementAsString(1),
+      "false");
+  ASSERT_EQ(Literal::CreateSparse<int64>(dimensions, indices, {1, 2, 3})
+                ->GetSparseElementAsString(1),
+            tensorflow::strings::StrCat(int64{2}));
+  ASSERT_EQ(Literal::CreateSparse<double>(dimensions, indices, {1.0, 2.0, 3.0})
+                ->GetSparseElementAsString(1),
+            tensorflow::strings::StrCat(double{2.0}));
+  ASSERT_EQ(Literal::CreateSparse<half>(dimensions, indices,
+                                        {half{1.0}, half{2.0}, half{3.0}})
+                ->GetSparseElementAsString(1),
+            tensorflow::strings::StrCat(half{2.0}));
+  ASSERT_EQ(
+      Literal::CreateSparse<complex64>(
+          dimensions, indices,
+          std::vector<complex64>{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}})
+          ->GetSparseElementAsString(1),
+      tensorflow::strings::StrCat("(", float{3.0}, ", ", float{4.0}, ")"));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/map_util.h b/tensorflow/compiler/xla/map_util.h
index 51d0d5f86f00c539951e8e2baa6296337a5a21e9..8db8c6f3de84a6c46625eadbb6b0f83d2262e5f7 100644
--- a/tensorflow/compiler/xla/map_util.h
+++ b/tensorflow/compiler/xla/map_util.h
@@ -16,6 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_MAP_UTIL_H_
 #define TENSORFLOW_COMPILER_XLA_MAP_UTIL_H_
 
+#include <functional>
+#include <sstream>
+
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -44,6 +49,41 @@ typename Collection::value_type::second_type& FindOrDie(
   return it->second;
 }
 
+// Like FindOrDie but returns an error instead of dying if `key` is not in
+// `container`.
+template <class Collection>
+StatusOr<
+    std::reference_wrapper<const typename Collection::value_type::second_type>>
+MaybeFind(const Collection& collection,
+          const typename Collection::value_type::first_type& key) {
+  typename Collection::const_iterator it = collection.find(key);
+  if (it == collection.end()) {
+    std::ostringstream os;
+    os << key;
+    return NotFound("key not found: %s", os.str().c_str());
+  }
+  return {it->second};
+}
+
+// Returns a const reference to the value associated with the given key if it
+// exists, otherwise returns a const reference to the provided default value.
+//
+// WARNING: If a temporary object is passed as the default "value,"
+// this function will return a reference to that temporary object,
+// which will be destroyed at the end of the statement. A common
+// example: if you have a map with string values, and you pass a char*
+// as the default "value," either use the returned value immediately
+// or store it in a string (not string&).
+template <class Collection>
+const typename Collection::value_type::second_type& FindOrDefault(
+    const Collection& collection,
+    const typename Collection::value_type::first_type& key,
+    const typename Collection::value_type::second_type& value) {
+  auto it = collection.find(key);
+  if (it != collection.end()) return it->second;
+  return value;
+}
+
 // Inserts the key-value pair into the collection. Dies if key was already
 // present.
 template <class Collection>
@@ -60,6 +100,12 @@ bool ContainsKey(const Collection& collection, const Key& key) {
   return collection.find(key) != collection.end();
 }
 
+// Inserts `value` into `set`. Dies if it was already present.
+template <class Set>
+void InsertOrDie(Set* const set, const typename Set::value_type& value) {
+  CHECK(set->insert(value).second) << "duplicate value: " << value;
+}
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_MAP_UTIL_H_
diff --git a/tensorflow/compiler/xla/packed_literal_reader.cc b/tensorflow/compiler/xla/packed_literal_reader.cc
index 70e0f5a74711c8ceef1b6d4225141aa1cc9c6219..857aae0a7982a57bb3057a6f267f5f033a0fdde4 100644
--- a/tensorflow/compiler/xla/packed_literal_reader.cc
+++ b/tensorflow/compiler/xla/packed_literal_reader.cc
@@ -44,11 +44,11 @@ StatusOr<std::unique_ptr<Literal>> PackedLiteralReader::Read(
   VLOG(3) << "reading shape from file: " << ShapeUtil::HumanString(shape)
           << " layout: "
           << (layout == nullptr ? "<none>" : layout->ShortDebugString());
-  auto result = MakeUnique<Literal>();
-  *result->mutable_shape() = shape;
+  Shape literal_shape = shape;
   if (layout != nullptr) {
-    TF_RETURN_IF_ERROR(LayoutUtil::ValidateLayoutForShape(*layout, shape));
-    *result->mutable_shape()->mutable_layout() = *layout;
+    TF_RETURN_IF_ERROR(
+        LayoutUtil::ValidateLayoutForShape(*layout, literal_shape));
+    *literal_shape.mutable_layout() = *layout;
   }
 
   if (shape.element_type() != F32) {
@@ -57,10 +57,12 @@ StatusOr<std::unique_ptr<Literal>> PackedLiteralReader::Read(
         PrimitiveType_Name(shape.element_type()).c_str());
   }
 
+  auto result = MakeUnique<Literal>(literal_shape);
+  result->PopulateWithValue(std::numeric_limits<float>::quiet_NaN());
+
   int64 elements = ShapeUtil::ElementsIn(shape);
-  result->Resize(elements, std::numeric_limits<float>::quiet_NaN());
-  std::vector<float>* field = result->mutable_f32s();
-  char* data = tensorflow::bit_cast<char*>(field->data());
+  tensorflow::gtl::ArraySlice<float> field = result->data<float>();
+  char* data = tensorflow::bit_cast<char*>(field.data());
   uint64 bytes = elements * sizeof(float);
   tensorflow::StringPiece sp;
   auto s = file_->Read(offset_, bytes, &sp, data);
diff --git a/tensorflow/compiler/xla/primitive_util.cc b/tensorflow/compiler/xla/primitive_util.cc
index 2bce56b7bd2f91f20ea670d0e7ccaa432c2b5f9f..143c9a2366be5786b7ef2148580caeb97d67d2d8 100644
--- a/tensorflow/compiler/xla/primitive_util.cc
+++ b/tensorflow/compiler/xla/primitive_util.cc
@@ -20,79 +20,6 @@ limitations under the License.
 namespace xla {
 namespace primitive_util {
 
-template <>
-PrimitiveType NativeToPrimitiveType<bool>() {
-  return PRED;
-}
-
-// Unsigned integer
-template <>
-PrimitiveType NativeToPrimitiveType<uint8>() {
-  return U8;
-}
-
-template <>
-PrimitiveType NativeToPrimitiveType<uint16>() {
-  return U16;
-}
-
-template <>
-PrimitiveType NativeToPrimitiveType<uint32>() {
-  return U32;
-}
-
-template <>
-PrimitiveType NativeToPrimitiveType<uint64>() {
-  return U64;
-}
-
-// Signed integer
-template <>
-PrimitiveType NativeToPrimitiveType<int8>() {
-  return S8;
-}
-
-template <>
-PrimitiveType NativeToPrimitiveType<int16>() {
-  return S16;
-}
-
-template <>
-PrimitiveType NativeToPrimitiveType<int32>() {
-  return S32;
-}
-
-template <>
-PrimitiveType NativeToPrimitiveType<int64>() {
-  return S64;
-}
-
-// Floating point
-template <>
-PrimitiveType NativeToPrimitiveType<float>() {
-  return F32;
-}
-
-template <>
-PrimitiveType NativeToPrimitiveType<double>() {
-  return F64;
-}
-
-template <>
-PrimitiveType NativeToPrimitiveType<bfloat16>() {
-  return BF16;
-}
-
-template <>
-PrimitiveType NativeToPrimitiveType<half>() {
-  return F16;
-}
-
-template <>
-PrimitiveType NativeToPrimitiveType<complex64>() {
-  return C64;
-}
-
 bool IsFloatingPointType(PrimitiveType type) {
   return type == F16 || type == F32 || type == F64 || type == BF16;
 }
diff --git a/tensorflow/compiler/xla/primitive_util.h b/tensorflow/compiler/xla/primitive_util.h
index 19c6a138885c61f1304bfae3d8bb5d958a1bb5bc..b26a10ade63a5dad3bf8f9f3a2a33c3c5e67bdb2 100644
--- a/tensorflow/compiler/xla/primitive_util.h
+++ b/tensorflow/compiler/xla/primitive_util.h
@@ -26,6 +26,13 @@ limitations under the License.
 namespace xla {
 namespace primitive_util {
 
+// The number of exponent bits in a BF16 value.
+const int kBFloat16ExponentBits = 8;
+
+// The number of mantissa bits in a BF16 value. There is an implicit leading
+// 1, so there is an implicit additional bit of precision.
+const int kBFloat16MantissaBits = 7;
+
 // Returns the XLA primitive type (eg, F32) corresponding to the given
 // template parameter native type (eg, float).
 template <typename NativeT>
@@ -40,49 +47,81 @@ PrimitiveType NativeToPrimitiveType() {
 }
 
 // Declarations of specializations for each native type which correspond to a
-// XLA primitive type.
+// XLA primitive type.  As an optimization, these are declared inline in the
+// header.
 template <>
-PrimitiveType NativeToPrimitiveType<bool>();
+inline PrimitiveType NativeToPrimitiveType<bool>() {
+  return PRED;
+}
 
 // Unsigned integer
 template <>
-PrimitiveType NativeToPrimitiveType<uint8>();
+inline PrimitiveType NativeToPrimitiveType<uint8>() {
+  return U8;
+}
 
 template <>
-PrimitiveType NativeToPrimitiveType<uint16>();
+inline PrimitiveType NativeToPrimitiveType<uint16>() {
+  return U16;
+}
 
 template <>
-PrimitiveType NativeToPrimitiveType<uint32>();
+inline PrimitiveType NativeToPrimitiveType<uint32>() {
+  return U32;
+}
 
 template <>
-PrimitiveType NativeToPrimitiveType<uint64>();
+inline PrimitiveType NativeToPrimitiveType<uint64>() {
+  return U64;
+}
 
 // Signed integer
 template <>
-PrimitiveType NativeToPrimitiveType<int8>();
+inline PrimitiveType NativeToPrimitiveType<int8>() {
+  return S8;
+}
 
 template <>
-PrimitiveType NativeToPrimitiveType<int16>();
+inline PrimitiveType NativeToPrimitiveType<int16>() {
+  return S16;
+}
 
 template <>
-PrimitiveType NativeToPrimitiveType<int32>();
+inline PrimitiveType NativeToPrimitiveType<int32>() {
+  return S32;
+}
 
 template <>
-PrimitiveType NativeToPrimitiveType<int64>();
+inline PrimitiveType NativeToPrimitiveType<int64>() {
+  return S64;
+}
 
 // Floating point
 template <>
-PrimitiveType NativeToPrimitiveType<float>();
+inline PrimitiveType NativeToPrimitiveType<float>() {
+  return F32;
+}
+
 template <>
-PrimitiveType NativeToPrimitiveType<double>();
+inline PrimitiveType NativeToPrimitiveType<double>() {
+  return F64;
+}
+
 template <>
-PrimitiveType NativeToPrimitiveType<half>();
+inline PrimitiveType NativeToPrimitiveType<half>() {
+  return F16;
+}
+
 template <>
-PrimitiveType NativeToPrimitiveType<bfloat16>();
+inline PrimitiveType NativeToPrimitiveType<bfloat16>() {
+  return BF16;
+}
 
 // Complex
 template <>
-PrimitiveType NativeToPrimitiveType<complex64>();
+inline PrimitiveType NativeToPrimitiveType<complex64>() {
+  return C64;
+}
 
 bool IsFloatingPointType(PrimitiveType type);
 
diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e2972f06016ab3555c4fc0cc4616993fe6764b1e
--- /dev/null
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -0,0 +1,86 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
+
+py_library(
+    name = "xla_client",
+    srcs = ["xla_client.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":pywrap_xla",
+        "//tensorflow/compiler/xla:xla_data_proto_py",
+    ],
+)
+
+py_test(
+    name = "xla_client_test",
+    srcs = ["xla_client_test.py"],
+    main = "xla_client_test.py",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":xla_client",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cc_library(
+    name = "numpy_bridge",
+    srcs = ["numpy_bridge.cc"],
+    hdrs = ["numpy_bridge.h"],
+    deps = [
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+        "//tensorflow/python:numpy_lib",
+    ],
+)
+
+cc_library(
+    name = "local_computation_builder",
+    srcs = ["local_computation_builder.cc"],
+    hdrs = ["local_computation_builder.h"],
+    deps = [
+        "//tensorflow/compiler/xla:executable_run_options",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:client_library",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:executable_build_options",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/service:shaped_buffer",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_py_wrap_cc(
+    name = "pywrap_xla",
+    srcs = ["xla.i"],
+    swig_includes = [
+        "local_computation_builder.i",
+    ],
+    deps = [
+        ":local_computation_builder",
+        ":numpy_bridge",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:cpu_plugin",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/ndlstm/__init__.py b/tensorflow/compiler/xla/python/__init__.py
similarity index 100%
rename from tensorflow/contrib/ndlstm/__init__.py
rename to tensorflow/compiler/xla/python/__init__.py
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a89146d4484a90fc4d89ced0b0240ae9585e1f28
--- /dev/null
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -0,0 +1,590 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/python/local_computation_builder.h"
+#include "tensorflow/compiler/xla/executable_run_options.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/platform/default/thread_annotations.h"
+
+namespace xla {
+
+namespace swig {
+
+// TODO(b/34473877) Ideally XLA would support AllReduce among arbitrary sets of
+// device handles instead of needing to set the number of replicas at XLA
+// service initialization time.
+tensorflow::mutex g_local_client_mutex(tensorflow::LINKER_INITIALIZED);
+int g_replica_count GUARDED_BY(g_local_client_mutex) = 1;
+LocalClient* g_local_client GUARDED_BY(g_local_client_mutex) = nullptr;
+
+Status InitializeReplicaCount(int replica_count) {
+  if (replica_count < 1) {
+    return InvalidArgument("Replica count must be >= 1; got %d.",
+                           replica_count);
+  }
+  tensorflow::mutex_lock lock(g_local_client_mutex);
+  if (g_local_client != nullptr) {
+    return FailedPrecondition(
+        "Attempted to set the replica count to %d, but a local XLA service was "
+        "previously created with a replica count of %d.",
+        replica_count, g_replica_count);
+  }
+  g_replica_count = replica_count;
+  return Status::OK();
+}
+
+int GetReplicaCount() {
+  tensorflow::mutex_lock lock(g_local_client_mutex);
+  return g_replica_count;
+}
+
+LocalClient* GetOrCreateLocalClient() {
+  tensorflow::mutex_lock lock(g_local_client_mutex);
+  if (g_local_client != nullptr) {
+    return g_local_client;
+  }
+  LocalClientOptions options;
+  options.set_number_of_replicas(g_replica_count);
+  g_local_client = ClientLibrary::GetOrCreateLocalClient(options).ValueOrDie();
+  CHECK(g_local_client != nullptr);
+  return g_local_client;
+}
+
+Status TransferToInfeedLocal(const Literal& literal) {
+  VLOG(1) << "Infeeding literal without replica number; shape: "
+          << literal.shape();
+  LocalClient* client = GetOrCreateLocalClient();
+  return client->TransferToInfeedLocal(literal, /*device_ordinal=*/0);
+}
+
+Status TransferToInfeedLocalReplica(const Literal& literal,
+                                    int replica_number) {
+  VLOG(1) << "Infeeding shape " << literal.shape()
+          << " to replica number: " << replica_number;
+  LocalClient* client = GetOrCreateLocalClient();
+  TF_ASSIGN_OR_RETURN(int device_ordinal,
+                      client->ReplicaNumberToDeviceOrdinal(replica_number));
+  return client->TransferToInfeedLocal(literal, device_ordinal);
+}
+
+StatusOr<std::unique_ptr<Literal>> TransferFromOutfeedLocalReplica(
+    const Shape& shape, int replica_number) {
+  VLOG(1) << "Outfeeding literal from replica number: " << replica_number
+          << " shape: " << shape;
+  LocalClient* client = GetOrCreateLocalClient();
+  TF_ASSIGN_OR_RETURN(int device_ordinal,
+                      client->ReplicaNumberToDeviceOrdinal(replica_number));
+  return client->TransferFromOutfeedLocal(shape, device_ordinal);
+}
+
+LocalShapedBuffer::LocalShapedBuffer(
+    std::unique_ptr<ScopedShapedBuffer> shaped_buffer)
+    : shaped_buffer_(std::move(shaped_buffer)) {}
+
+const std::unique_ptr<ScopedShapedBuffer>& LocalShapedBuffer::shaped_buffer()
+    const {
+  return shaped_buffer_;
+}
+
+static StatusOr<std::unique_ptr<ScopedShapedBuffer>> ToBuffer(
+    LocalClient* client, int device_ordinal, const Literal& arg) {
+  return client->LiteralToShapedBuffer(arg, device_ordinal,
+                                       client->backend().memory_allocator());
+}
+
+/* static */
+LocalShapedBuffer* LocalShapedBuffer::FromLiteral(
+    const Literal& argument,
+    const tensorflow::gtl::optional<Shape>& shape_with_layout) {
+  LocalClient* client = GetOrCreateLocalClient();
+  std::unique_ptr<ScopedShapedBuffer> buf;
+  if (shape_with_layout) {
+    std::unique_ptr<Literal> relaid =
+        argument.Relayout(shape_with_layout.value());
+    buf = ToBuffer(client, /*device_ordinal=*/0, *relaid).ConsumeValueOrDie();
+  } else {
+    buf = ToBuffer(client, /*device_ordinal=*/0, argument).ConsumeValueOrDie();
+  }
+  return new LocalShapedBuffer(std::move(buf));
+}
+
+std::unique_ptr<Literal> LocalShapedBuffer::ToLiteral() const {
+  LocalClient* client = GetOrCreateLocalClient();
+  return client->ShapedBufferToLiteral(*shaped_buffer()).ConsumeValueOrDie();
+}
+
+CompiledLocalComputation::CompiledLocalComputation(
+    std::unique_ptr<LocalExecutable> executable)
+    : executable_(std::move(executable)) {}
+
+StatusOr<std::unique_ptr<Literal>> CompiledLocalComputation::Execute(
+    const std::vector<Literal>& arguments,
+    const std::vector<tensorflow::gtl::optional<Shape>>& shapes_with_layout) {
+  LocalClient* client = GetOrCreateLocalClient();
+
+  VLOG(1) << "Execution requested with " << GetReplicaCount() << " replicas.";
+
+  // Each replica populates a StatusOr result, but only replica zero actually
+  // retrieves its literal value.
+  std::vector<StatusOr<std::unique_ptr<Literal>>> results(GetReplicaCount());
+  {
+    tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(), "xlarun",
+                                        GetReplicaCount());
+
+    for (int replica = 0; replica < GetReplicaCount(); ++replica) {
+      pool.Schedule([this, client, replica, &arguments, &shapes_with_layout,
+                     &results] {
+        StatusOr<int> device_ordinal_status =
+            client->ReplicaNumberToDeviceOrdinal(replica);
+        if (!device_ordinal_status.ok()) {
+          results[replica] = device_ordinal_status.status();
+          return;
+        }
+        const int device_ordinal = device_ordinal_status.ValueOrDie();
+        VLOG(3) << "Replica " << replica
+                << " mapped to device ordinal for execution: "
+                << device_ordinal;
+
+        // Transfer arguments in
+        std::vector<std::unique_ptr<ScopedShapedBuffer>> scoped_buffers;
+        scoped_buffers.reserve(arguments.size());
+        for (int i = 0; i < arguments.size(); ++i) {
+          const Literal& argument = arguments[i];
+          const tensorflow::gtl::optional<Shape>& shape_with_layout =
+              shapes_with_layout[i];
+
+          StatusOr<std::unique_ptr<ScopedShapedBuffer>> pushed;
+          if (shape_with_layout) {
+            std::unique_ptr<Literal> relaid =
+                argument.Relayout(shape_with_layout.value());
+            pushed = ToBuffer(client, device_ordinal, *relaid);
+          } else {
+            pushed = ToBuffer(client, device_ordinal, argument);
+          }
+          if (!pushed.ok()) {
+            results[replica] = pushed.status();
+            return;
+          }
+
+          scoped_buffers.push_back(std::move(pushed).ValueOrDie());
+        }
+
+        // Execute
+        std::vector<const ShapedBuffer*> argument_buffers;
+        argument_buffers.reserve(scoped_buffers.size());
+        for (auto& buffer : scoped_buffers) {
+          argument_buffers.push_back(buffer.get());
+        }
+
+        DeviceAssignment device_assignment =
+            client->backend()
+                .computation_placer()
+                ->AssignDevices(GetReplicaCount(), /*computation_count=*/1)
+                .ConsumeValueOrDie();
+
+        ExecutableRunOptions options;
+        options.set_device_ordinal(device_ordinal);
+        options.set_allocator(client->backend().memory_allocator());
+        options.set_inter_op_thread_pool(
+            client->backend().inter_op_thread_pool());
+        options.set_intra_op_thread_pool(
+            client->backend().eigen_intra_op_thread_pool_device());
+        options.set_device_assignment(&device_assignment);
+        StatusOr<std::unique_ptr<ScopedShapedBuffer>> result_buffer_status =
+            executable_->Run(argument_buffers, options);
+        if (!result_buffer_status.ok()) {
+          results[replica] = result_buffer_status.status();
+          return;
+        }
+
+        // Transfer result out
+        results[replica] =
+            client->ShapedBufferToLiteral(*result_buffer_status.ValueOrDie());
+      });
+    }
+  }
+
+  for (int replica = 0; replica < GetReplicaCount(); ++replica) {
+    const auto& statusor = results[replica];
+    if (!statusor.ok()) {
+      return InternalError(
+          "Failed running replica %d (other replicas may have failed as well): "
+          "%s.",
+          replica, statusor.status().ToString().c_str());
+    }
+  }
+
+  return std::move(results[0]);
+}
+
+LocalShapedBuffer* CompiledLocalComputation::ExecuteWithShapedBuffers(
+    tensorflow::gtl::ArraySlice<LocalShapedBuffer*> argument_handles) {
+  LocalClient* client = GetOrCreateLocalClient();
+
+  std::vector<const ShapedBuffer*> argument_buffers;
+  argument_buffers.reserve(argument_handles.size());
+  for (auto& handle : argument_handles) {
+    argument_buffers.push_back(handle->shaped_buffer().get());
+  }
+
+  // Execute
+  ExecutableRunOptions options;
+  options.set_allocator(client->backend().memory_allocator());
+  options.set_inter_op_thread_pool(client->backend().inter_op_thread_pool());
+  options.set_intra_op_thread_pool(
+      client->backend().eigen_intra_op_thread_pool_device());
+  std::unique_ptr<ScopedShapedBuffer> result_buffer =
+      executable_->Run(argument_buffers, options).ConsumeValueOrDie();
+
+  return new LocalShapedBuffer(std::move(result_buffer));
+}
+
+LocalComputation::LocalComputation(Computation computation)
+    : computation_(std::move(computation)) {}
+
+StatusOr<CompiledLocalComputation*> LocalComputation::Compile(
+    const std::vector<Shape>& argument_shapes,
+    const ExecutableBuildOptions* build_options) {
+  std::vector<const Shape*> argument_shape_pointers;
+  argument_shape_pointers.reserve(argument_shapes.size());
+  for (auto& argument_shape : argument_shapes) {
+    argument_shape_pointers.push_back(&argument_shape);
+  }
+
+  LocalClient* client = GetOrCreateLocalClient();
+  ExecutableBuildOptions options;
+  if (build_options != nullptr) {
+    options = *build_options;
+  }
+  TF_ASSIGN_OR_RETURN(
+      auto local_executable,
+      client->Compile(computation_, argument_shape_pointers, options));
+  return new CompiledLocalComputation(std::move(local_executable));
+}
+
+const Computation& LocalComputation::computation() const {
+  return computation_;
+}
+
+LocalComputationBuilder::LocalComputationBuilder(const string& computation_name)
+    : builder_(GetOrCreateLocalClient(), computation_name) {}
+
+void LocalComputationBuilder::SetOpMetadata(const OpMetadata& metadata) {
+  builder_.SetOpMetadata(metadata);
+}
+
+void LocalComputationBuilder::ClearOpMetadata() { builder_.ClearOpMetadata(); }
+
+StatusOr<LocalComputation*> LocalComputationBuilder::Build() {
+  TF_ASSIGN_OR_RETURN(Computation computation, builder_.Build());
+  return new LocalComputation(std::move(computation));
+}
+
+ComputationDataHandle LocalComputationBuilder::Parameter(int64 parameter_number,
+                                                         const Shape& shape,
+                                                         const string& name) {
+  return builder_.Parameter(parameter_number, shape, name);
+}
+
+std::unique_ptr<Shape> LocalComputationBuilder::GetShape(
+    const ComputationDataHandle& operand) {
+  return builder_.GetShape(operand).ConsumeValueOrDie();
+}
+
+StatusOr<Shape> LocalComputationBuilder::GetReturnValueShape() {
+  TF_ASSIGN_OR_RETURN(ProgramShape program_shape, builder_.GetProgramShape());
+  return program_shape.result();
+}
+
+ComputationDataHandle LocalComputationBuilder::Infeed(const Shape& shape) {
+  return builder_.Infeed(shape);
+}
+
+void LocalComputationBuilder::Outfeed(const ComputationDataHandle& operand,
+                                      const Shape& shape,
+                                      const string& outfeed_config) {
+  builder_.Outfeed(operand, shape, outfeed_config);
+}
+
+ComputationDataHandle LocalComputationBuilder::ConstantLiteral(
+    const Literal& literal) {
+  return builder_.ConstantLiteral(literal);
+}
+
+ComputationDataHandle LocalComputationBuilder::Broadcast(
+    const ComputationDataHandle& operand,
+    tensorflow::gtl::ArraySlice<int64> broadcast_sizes) {
+  return builder_.Broadcast(operand, broadcast_sizes);
+}
+
+ComputationDataHandle LocalComputationBuilder::Pad(
+    const ComputationDataHandle& operand,
+    const ComputationDataHandle& padding_value,
+    const PaddingConfig& padding_config) {
+  return builder_.Pad(operand, padding_value, padding_config);
+}
+
+ComputationDataHandle LocalComputationBuilder::Reshape(
+    const ComputationDataHandle& operand,
+    tensorflow::gtl::ArraySlice<int64> dimensions,
+    tensorflow::gtl::ArraySlice<int64> new_sizes) {
+  return builder_.Reshape(operand, dimensions, new_sizes);
+}
+
+ComputationDataHandle LocalComputationBuilder::Collapse(
+    const ComputationDataHandle& operand,
+    tensorflow::gtl::ArraySlice<int64> dimensions) {
+  return builder_.Collapse(operand, dimensions);
+}
+
+ComputationDataHandle LocalComputationBuilder::CrossReplicaSum(
+    const ComputationDataHandle& operand) {
+  return builder_.CrossReplicaSum(operand);
+}
+
+ComputationDataHandle LocalComputationBuilder::Slice(
+    const ComputationDataHandle& operand,
+    tensorflow::gtl::ArraySlice<int64> start_indices,
+    tensorflow::gtl::ArraySlice<int64> limit_indices,
+    tensorflow::gtl::ArraySlice<int64> strides) {
+  return builder_.Slice(operand, start_indices, limit_indices, strides);
+}
+
+ComputationDataHandle LocalComputationBuilder::DynamicSlice(
+    const ComputationDataHandle& operand,
+    const ComputationDataHandle& start_indices,
+    tensorflow::gtl::ArraySlice<int64> slice_sizes) {
+  return builder_.DynamicSlice(operand, start_indices, slice_sizes);
+}
+
+ComputationDataHandle LocalComputationBuilder::DynamicUpdateSlice(
+    const ComputationDataHandle& operand, const ComputationDataHandle& update,
+    const ComputationDataHandle& start_indices) {
+  return builder_.DynamicUpdateSlice(operand, update, start_indices);
+}
+
+ComputationDataHandle LocalComputationBuilder::ConcatInDim(
+    tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
+    int64 dimension) {
+  return builder_.ConcatInDim(operands, dimension);
+}
+
+ComputationDataHandle
+LocalComputationBuilder::SelectAndScatterWithGeneralPadding(
+    const ComputationDataHandle& operand, const LocalComputation& select,
+    tensorflow::gtl::ArraySlice<int64> window_dimensions,
+    tensorflow::gtl::ArraySlice<int64> window_strides,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+    const ComputationDataHandle& source,
+    const ComputationDataHandle& init_value, const LocalComputation& scatter) {
+  return builder_.SelectAndScatterWithGeneralPadding(
+      operand, select.computation(), window_dimensions, window_strides, padding,
+      source, init_value, scatter.computation());
+}
+
+ComputationDataHandle LocalComputationBuilder::Tuple(
+    tensorflow::gtl::ArraySlice<ComputationDataHandle> elements) {
+  return builder_.Tuple(elements);
+}
+
+ComputationDataHandle LocalComputationBuilder::GetTupleElement(
+    const ComputationDataHandle& tuple_data, int64 index) {
+  return builder_.GetTupleElement(tuple_data, index);
+}
+
+ComputationDataHandle LocalComputationBuilder::Dot(
+    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs) {
+  return builder_.Dot(lhs, rhs);
+}
+
+ComputationDataHandle LocalComputationBuilder::DotGeneral(
+    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+    const DotDimensionNumbers& dimension_numbers) {
+  return builder_.DotGeneral(lhs, rhs, dimension_numbers);
+}
+
+ComputationDataHandle LocalComputationBuilder::ConvGeneralDilated(
+    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+    tensorflow::gtl::ArraySlice<int64> window_strides,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+    tensorflow::gtl::ArraySlice<int64> lhs_dilation,
+    tensorflow::gtl::ArraySlice<int64> rhs_dilation,
+    const ConvolutionDimensionNumbers& dimension_numbers) {
+  return builder_.ConvGeneralDilated(lhs, rhs, window_strides, padding,
+                                     lhs_dilation, rhs_dilation,
+                                     dimension_numbers);
+}
+
+ComputationDataHandle LocalComputationBuilder::ConvertElementType(
+    const ComputationDataHandle& operand, PrimitiveType new_element_type) {
+  return builder_.ConvertElementType(operand, new_element_type);
+}
+
+ComputationDataHandle LocalComputationBuilder::Call(
+    const LocalComputation& local_computation,
+    tensorflow::gtl::ArraySlice<ComputationDataHandle> operands) {
+  return builder_.Call(local_computation.computation(), operands);
+}
+
+ComputationDataHandle LocalComputationBuilder::Transpose(
+    const ComputationDataHandle& operand,
+    tensorflow::gtl::ArraySlice<int64> permutation) {
+  return builder_.Transpose(operand, permutation);
+}
+
+ComputationDataHandle LocalComputationBuilder::Rev(
+    const ComputationDataHandle& operand,
+    tensorflow::gtl::ArraySlice<int64> dimensions) {
+  return builder_.Rev(operand, dimensions);
+}
+
+ComputationDataHandle LocalComputationBuilder::Map(
+    tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
+    const LocalComputation& local_computation,
+    tensorflow::gtl::ArraySlice<int64> dimensions,
+    tensorflow::gtl::ArraySlice<ComputationDataHandle> static_operands) {
+  return builder_.Map(operands, local_computation.computation(), dimensions,
+                      static_operands);
+}
+
+ComputationDataHandle LocalComputationBuilder::Reduce(
+    const ComputationDataHandle& operand,
+    const ComputationDataHandle& init_value,
+    const LocalComputation& local_computation,
+    tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce) {
+  return builder_.Reduce(operand, init_value, local_computation.computation(),
+                         dimensions_to_reduce);
+}
+
+ComputationDataHandle LocalComputationBuilder::ReduceWindowWithGeneralPadding(
+    const ComputationDataHandle& operand,
+    const ComputationDataHandle& init_value,
+    const LocalComputation& local_computation,
+    tensorflow::gtl::ArraySlice<int64> window_dimensions,
+    tensorflow::gtl::ArraySlice<int64> window_strides,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding) {
+  return builder_.ReduceWindowWithGeneralPadding(
+      operand, init_value, local_computation.computation(), window_dimensions,
+      window_strides, padding);
+}
+
+ComputationDataHandle LocalComputationBuilder::RngNormal(
+    const ComputationDataHandle& mu, const ComputationDataHandle& sigma,
+    const Shape& shape) {
+  return builder_.RngNormal(mu, sigma, shape);
+}
+
+ComputationDataHandle LocalComputationBuilder::RngUniform(
+    const ComputationDataHandle& a, const ComputationDataHandle& b,
+    const Shape& shape) {
+  return builder_.RngUniform(a, b, shape);
+}
+
+ComputationDataHandle LocalComputationBuilder::While(
+    const LocalComputation& condition, const LocalComputation& body,
+    const ComputationDataHandle& init) {
+  return builder_.While(condition.computation(), body.computation(), init);
+}
+
+ComputationDataHandle LocalComputationBuilder::Conditional(
+    const ComputationDataHandle& predicate,
+    const ComputationDataHandle& true_operand,
+    const LocalComputation& true_computation,
+    const ComputationDataHandle& false_operand,
+    const LocalComputation& false_computation) {
+  return builder_.Conditional(predicate, true_operand,
+                              true_computation.computation(), false_operand,
+                              false_computation.computation());
+}
+
+#define _FORWARD(method_name, return_sig, args_sig, args)    \
+  return_sig LocalComputationBuilder::method_name args_sig { \
+    return builder_.method_name args;                        \
+  }
+
+#define _FORWARD_UNOP(method_name)             \
+  _FORWARD(method_name, ComputationDataHandle, \
+           (const ComputationDataHandle& operand), (operand))
+
+#define _FORWARD_BINOP(method_name)                                        \
+  _FORWARD(                                                                \
+      method_name, ComputationDataHandle,                                  \
+      (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \
+       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions),           \
+      (lhs, rhs, broadcast_dimensions))
+
+#define _FORWARD_TRIOP(method_name)                                        \
+  _FORWARD(                                                                \
+      method_name, ComputationDataHandle,                                  \
+      (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \
+       const ComputationDataHandle& ehs),                                  \
+      (lhs, rhs, ehs))
+
+_FORWARD_TRIOP(Select)
+_FORWARD_TRIOP(Clamp)
+_FORWARD_BINOP(Eq)
+_FORWARD_BINOP(Ne)
+_FORWARD_BINOP(Ge)
+_FORWARD_BINOP(Gt)
+_FORWARD_BINOP(Lt)
+_FORWARD_BINOP(Le)
+_FORWARD_BINOP(Add)
+_FORWARD_BINOP(Sub)
+_FORWARD_BINOP(Mul)
+_FORWARD_BINOP(Div)
+_FORWARD_BINOP(Rem)
+_FORWARD_BINOP(Max)
+_FORWARD_BINOP(Min)
+_FORWARD_BINOP(And)
+_FORWARD_BINOP(Or)
+_FORWARD_UNOP(Not)
+_FORWARD_UNOP(Abs)
+_FORWARD_UNOP(Exp)
+_FORWARD_UNOP(Floor)
+_FORWARD_UNOP(Ceil)
+_FORWARD_UNOP(Round)
+_FORWARD_UNOP(Log)
+_FORWARD_UNOP(Sign)
+_FORWARD_UNOP(Cos)
+_FORWARD_UNOP(Sin)
+_FORWARD_UNOP(Tanh)
+_FORWARD_UNOP(SqrtF32)
+_FORWARD_UNOP(SquareF32)
+_FORWARD_BINOP(Pow)
+_FORWARD_UNOP(IsFinite)
+_FORWARD_UNOP(ReciprocalF32)
+_FORWARD_UNOP(Neg)
+_FORWARD_UNOP(Sort)
+
+#undef _FORWARD
+#undef _FORWARD_UNOP
+#undef _FORWARD_BINOP
+#undef _FORWARD_TRIOP
+
+void DeleteLocalShapedBuffer(LocalShapedBuffer* local_shaped_buffer) {
+  delete local_shaped_buffer;
+}
+
+void DeleteCompiledLocalComputation(CompiledLocalComputation* computation) {
+  delete computation;
+}
+
+void DeleteLocalComputation(LocalComputation* computation) {
+  delete computation;
+}
+
+}  // namespace swig
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
new file mode 100644
index 0000000000000000000000000000000000000000..d682204d26a819556db6f960ee639e763b6f4988
--- /dev/null
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -0,0 +1,335 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_LOCAL_COMPUTATION_BUILDER_H_
+#define TENSORFLOW_COMPILER_XLA_PYTHON_LOCAL_COMPUTATION_BUILDER_H_
+
+#include "tensorflow/compiler/xla/client/client_library.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/executable_build_options.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/service/shaped_buffer.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+
+namespace xla {
+
+namespace swig {
+
+// Initializes the number of replicas that XLA will be initialized with (when
+// first obtaining a handle to the local XLA service). If this is called after
+// the handle to the local XLA service has been established, then an error is
+// returned.
+Status InitializeReplicaCount(int replica_count);
+
+// Returns the replica count that is currently set, regardless of whether the
+// local XLA service has been instantiated yet or not.
+int GetReplicaCount();
+
+// Wraps the local client's infeed-transfer function.
+//
+// The default device ordinal (0) is used.
+Status TransferToInfeedLocal(const Literal& literal);
+
+// Transfers the given literal to the infeed of the given replica.
+//
+// The replica number is resolved to an appropriate device ordinal.
+Status TransferToInfeedLocalReplica(const Literal& literal, int replica_number);
+
+// Transfers a literal of the given shape from the outfeed of the given replica.
+//
+// The replica number is resolved to an appropriate device ordinal.
+StatusOr<std::unique_ptr<Literal> > TransferFromOutfeedLocalReplica(
+    const Shape& shape, int replica_number);
+
+// Wraps a ScopedShapedBuffer produced by copying a literal "to
+// device," i.e. copying a literal to a scoped buffer via the local
+// client.
+class LocalShapedBuffer {
+ public:
+  static LocalShapedBuffer* FromLiteral(
+      const Literal& argument,
+      const tensorflow::gtl::optional<Shape>& shape_with_layout);
+  LocalShapedBuffer(std::unique_ptr<ScopedShapedBuffer> shaped_buffer);
+  const std::unique_ptr<ScopedShapedBuffer>& shaped_buffer() const;
+  std::unique_ptr<Literal> ToLiteral() const;
+
+ private:
+  std::unique_ptr<ScopedShapedBuffer> shaped_buffer_;
+};
+
+// Wraps a LocalExecutable produced by compiling a
+// LocalComputation. The Execute method forwards to that of the
+// underlying LocalExecutable, and additionally handles tranferring
+// arguments and return values in and back out of the client library's
+// local client. This class is intended to be made available to Python
+// via SWIG.
+class CompiledLocalComputation {
+ public:
+  CompiledLocalComputation(std::unique_ptr<LocalExecutable> executable);
+
+  // Execute the computation with the given argument literals, and
+  // with optionally-specified argument layouts. The literals will be
+  // re-laid out according to the corresponding elements of
+  // shapes_with_layout.
+  StatusOr<std::unique_ptr<Literal> > Execute(
+      const std::vector<Literal>& arguments,
+      const std::vector<tensorflow::gtl::optional<Shape> >& shapes_with_layout);
+
+  LocalShapedBuffer* ExecuteWithShapedBuffers(
+      tensorflow::gtl::ArraySlice<LocalShapedBuffer*> argument_handles);
+
+ private:
+  std::unique_ptr<LocalExecutable> executable_;
+};
+
+// Wraps a Computation produced by a LocalComputationBuilder. The
+// Compile method compiles the computation to a (local) executable via
+// the client library's local client. This class is intended to be
+// made available to Python via SWIG.
+class LocalComputation {
+ public:
+  LocalComputation(Computation computation);
+  StatusOr<CompiledLocalComputation*> Compile(
+      const std::vector<Shape>& argument_shapes,
+      const ExecutableBuildOptions* build_options);
+  const Computation& computation() const;
+
+ private:
+  Computation computation_;
+};
+
+// Wraps the ComputationBuilder API in order to:
+// - Support consumption by SWIG in order to be made available to
+//   Python.
+// - Set up the underlying builder to use the client library's
+//   LocalClient.
+// - Wrap Computations in LocalComputations for Python access.
+// - Correspondingly unwrap incoming LocalComputations.
+class LocalComputationBuilder {
+ public:
+  LocalComputationBuilder(const string& computation_name);
+
+  void SetOpMetadata(const OpMetadata& metadata);
+  void ClearOpMetadata();
+
+  // Returns an owned LocalComputation to the caller on success.
+  StatusOr<LocalComputation*> Build();
+
+  ComputationDataHandle Parameter(int64 parameter_number, const Shape& shape,
+                                  const string& name);
+
+  std::unique_ptr<Shape> GetShape(const ComputationDataHandle& operand);
+
+  // Returns the shape of the current return value for the computation.
+  StatusOr<Shape> GetReturnValueShape();
+
+  ComputationDataHandle Infeed(const Shape& shape);
+
+  void Outfeed(const ComputationDataHandle& operand, const Shape& shape,
+               const string& outfeed_config);
+
+  ComputationDataHandle ConstantLiteral(const Literal& literal);
+
+  ComputationDataHandle Broadcast(
+      const ComputationDataHandle& operand,
+      tensorflow::gtl::ArraySlice<int64> broadcast_sizes);
+
+  ComputationDataHandle Pad(const ComputationDataHandle& operand,
+                            const ComputationDataHandle& padding_value,
+                            const PaddingConfig& padding_config);
+
+  ComputationDataHandle Reshape(const ComputationDataHandle& operand,
+                                tensorflow::gtl::ArraySlice<int64> dimensions,
+                                tensorflow::gtl::ArraySlice<int64> new_sizes);
+
+  ComputationDataHandle Collapse(const ComputationDataHandle& operand,
+                                 tensorflow::gtl::ArraySlice<int64> dimensions);
+
+  ComputationDataHandle CrossReplicaSum(const ComputationDataHandle& operand);
+
+  ComputationDataHandle Slice(const ComputationDataHandle& operand,
+                              tensorflow::gtl::ArraySlice<int64> start_indices,
+                              tensorflow::gtl::ArraySlice<int64> limit_indices,
+                              tensorflow::gtl::ArraySlice<int64> strides);
+
+  ComputationDataHandle DynamicSlice(
+      const ComputationDataHandle& operand,
+      const ComputationDataHandle& start_indices,
+      tensorflow::gtl::ArraySlice<int64> slice_sizes);
+
+  ComputationDataHandle DynamicUpdateSlice(
+      const ComputationDataHandle& operand, const ComputationDataHandle& update,
+      const ComputationDataHandle& start_indices);
+
+  ComputationDataHandle ConcatInDim(
+      tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
+      int64 dimension);
+
+  ComputationDataHandle SelectAndScatterWithGeneralPadding(
+      const ComputationDataHandle& operand, const LocalComputation& select,
+      tensorflow::gtl::ArraySlice<int64> window_dimensions,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64> > padding,
+      const ComputationDataHandle& source,
+      const ComputationDataHandle& init_value, const LocalComputation& scatter);
+
+  ComputationDataHandle Tuple(
+      tensorflow::gtl::ArraySlice<ComputationDataHandle> elements);
+
+  ComputationDataHandle GetTupleElement(const ComputationDataHandle& tuple_data,
+                                        int64 index);
+
+  ComputationDataHandle Dot(const ComputationDataHandle& lhs,
+                            const ComputationDataHandle& rhs);
+
+  ComputationDataHandle DotGeneral(
+      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+      const DotDimensionNumbers& dimension_numbers);
+
+  ComputationDataHandle ConvGeneralDilated(
+      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64> > padding,
+      tensorflow::gtl::ArraySlice<int64> lhs_dilation,
+      tensorflow::gtl::ArraySlice<int64> rhs_dilation,
+      const ConvolutionDimensionNumbers& dimension_numbers);
+
+  ComputationDataHandle ConvertElementType(const ComputationDataHandle& operand,
+                                           PrimitiveType new_element_type);
+
+  ComputationDataHandle Call(
+      const LocalComputation& local_computation,
+      tensorflow::gtl::ArraySlice<ComputationDataHandle> operands);
+
+  ComputationDataHandle Transpose(
+      const ComputationDataHandle& operand,
+      tensorflow::gtl::ArraySlice<int64> permutation);
+
+  ComputationDataHandle Rev(const ComputationDataHandle& operand,
+                            tensorflow::gtl::ArraySlice<int64> dimensions);
+
+  ComputationDataHandle Map(
+      tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
+      const LocalComputation& local_computation,
+      tensorflow::gtl::ArraySlice<int64> dimensions,
+      tensorflow::gtl::ArraySlice<ComputationDataHandle> static_operands);
+
+  ComputationDataHandle Reduce(
+      const ComputationDataHandle& operand,
+      const ComputationDataHandle& init_value,
+      const LocalComputation& local_computation,
+      tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce);
+
+  ComputationDataHandle ReduceWindowWithGeneralPadding(
+      const ComputationDataHandle& operand,
+      const ComputationDataHandle& init_value,
+      const LocalComputation& local_computation,
+      tensorflow::gtl::ArraySlice<int64> window_dimensions,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64> > padding);
+
+  ComputationDataHandle RngNormal(const ComputationDataHandle& mu,
+                                  const ComputationDataHandle& sigma,
+                                  const Shape& shape);
+
+  ComputationDataHandle RngUniform(const ComputationDataHandle& a,
+                                   const ComputationDataHandle& b,
+                                   const Shape& shape);
+
+  ComputationDataHandle While(const LocalComputation& condition,
+                              const LocalComputation& body,
+                              const ComputationDataHandle& init);
+
+  ComputationDataHandle Conditional(const ComputationDataHandle& predicate,
+                                    const ComputationDataHandle& true_operand,
+                                    const LocalComputation& true_computation,
+                                    const ComputationDataHandle& false_operand,
+                                    const LocalComputation& false_computation);
+
+#define _FORWARD(method_name, return_sig, args_sig) \
+  return_sig method_name args_sig;
+
+#define _FORWARD_UNOP(method_name)             \
+  _FORWARD(method_name, ComputationDataHandle, \
+           (const ComputationDataHandle& operand))
+
+#define _FORWARD_BINOP(method_name)                                        \
+  _FORWARD(                                                                \
+      method_name, ComputationDataHandle,                                  \
+      (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \
+       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions))
+
+#define _FORWARD_TRIOP(method_name)                                        \
+  _FORWARD(                                                                \
+      method_name, ComputationDataHandle,                                  \
+      (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \
+       const ComputationDataHandle& ehs))
+
+  _FORWARD_TRIOP(Select)
+  _FORWARD_TRIOP(Clamp)
+  _FORWARD_BINOP(Eq)
+  _FORWARD_BINOP(Ne)
+  _FORWARD_BINOP(Ge)
+  _FORWARD_BINOP(Gt)
+  _FORWARD_BINOP(Lt)
+  _FORWARD_BINOP(Le)
+  _FORWARD_BINOP(Add)
+  _FORWARD_BINOP(Sub)
+  _FORWARD_BINOP(Mul)
+  _FORWARD_BINOP(Div)
+  _FORWARD_BINOP(Rem)
+  _FORWARD_BINOP(Max)
+  _FORWARD_BINOP(Min)
+  _FORWARD_BINOP(And)
+  _FORWARD_BINOP(Or)
+  _FORWARD_UNOP(Not)
+  _FORWARD_UNOP(Abs)
+  _FORWARD_UNOP(Exp)
+  _FORWARD_UNOP(Floor)
+  _FORWARD_UNOP(Ceil)
+  _FORWARD_UNOP(Round)
+  _FORWARD_UNOP(Log)
+  _FORWARD_UNOP(Sign)
+  _FORWARD_UNOP(Cos)
+  _FORWARD_UNOP(Sin)
+  _FORWARD_UNOP(Tanh)
+  _FORWARD_UNOP(SqrtF32)
+  _FORWARD_UNOP(SquareF32)
+  _FORWARD_BINOP(Pow)
+  _FORWARD_UNOP(IsFinite)
+  _FORWARD_UNOP(ReciprocalF32)
+  _FORWARD_UNOP(Neg)
+  _FORWARD_UNOP(Sort)
+
+#undef _FORWARD
+#undef _FORWARD_UNOP
+#undef _FORWARD_BINOP
+#undef _FORWARD_TRIOP
+
+ private:
+  ComputationBuilder builder_;
+};
+
+// Functions for freeing resources from the Python side.
+void DeleteLocalShapedBuffer(LocalShapedBuffer* local_shaped_buffer);
+void DeleteCompiledLocalComputation(CompiledLocalComputation* computation);
+void DeleteLocalComputation(LocalComputation* computation);
+
+}  // namespace swig
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_PYTHON_LOCAL_COMPUTATION_BUILDER_H_
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
new file mode 100644
index 0000000000000000000000000000000000000000..fa6c8bfa296c7f80e95e4afc4a6062d133643c53
--- /dev/null
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -0,0 +1,937 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// SWIG typemaps and declarations for building, compiling, and
+// executing XLA computations, wrapping most of what is declared in
+// local_computation_builder.h.
+//
+// The typemaps below implement/assert the following correspondences
+// (with elaborations below):
+//
+//    C++                                  Python
+// -------------------------------------+---------------------------------------
+//  ComputationDataHandle              <-> int
+//  ArraySlice<int64>                  <-  sequence of int
+//  ArraySlice<ComputationDataHandle>  <-  sequence of int
+//  Literal                            <-> (nested tuple of) numpy ndarray
+//  std::vector<Literal>               <-  sequence of (nested tuple of) ndarray
+//  Shape                               -> pair holding (dtype, dimensions)
+//                                     <-  object duck-typed as xla_client.Shape
+//  std::vector<Shape>                 <-  sequence of xla_client.Shape objects
+//  PrimitiveType                      <-  int
+//  ArraySlice<pair<int64, in64>>      <-  sequence of int pairs
+//  PaddingConfig proto                <-  corresponding Python proto
+//  ConvolutionDimensionNumbers proto  <-  corresponding Python proto
+//  DotDimensionNumbers proto          <-  corresponding Python proto
+//
+// Arrows indicate whether a conversion only ever occurs in one
+// direction, or whether it is maintained bidirectionally.
+//
+// The Python objects corresponding to C++ Literals have the type:
+//
+//   T = ndarray | (T, ...)
+//
+// where a terminal numpy ndarray translates to a Literal with a
+// non-tuple Shape, an XLA primitive element type corresponding to the
+// ndarray's dtype. Meanwhile, a non-terminal "tuple of T" translates
+// to a tuple-shaped Literal whose tuple components are translated
+// recursively. For example, if x is a numpy ndarray in Python, with
+// shape (2, 3) and dtype of dtype('float32'), then x translates to a
+// Literal with rank 2, dimension 2 and 3, and XLA primitive type
+// F32. Meanwhile,
+//
+//   (x, (x, x), (x,)),
+//
+// translates to a tuple-shaped XLA Literal, whose component subshapes
+// are a 2x3 F32-shaped literal followed by two tuple-shaped literals.
+//
+// Shapes output by C++ become Python objects with the type:
+//
+//   T            = (dtype, S)
+//   S            = DIMENSIONS | TUPLE_SHAPES
+//   DIMENSIONS   = (int, ...)
+//   TUPLE_SHAPES = (T, ...)
+//
+// In the pair described by the T rule, the terminal dtype determines
+// whether S expands as DIMENSIONS or TUPLE_SHAPES. Namely if it is
+// dtype('O'), numpy's object dtype, the structure represents a tuple
+// shape and the expansion of the non-terminal S is
+// TUPLE_SHAPES. Otherwise, dtype describes a primitive element type
+// and S expands into DIMENSIONS giving dimension sizes. For example:
+//
+//   (dtype('float32'), (3, 5, 7))
+//
+// describes a 3x5x7 array of F32s, and
+//
+//   (dtype('O'), ((dtype('float32'), (2, 3)),
+//                 (dtype('float64'), (4, 5))))
+//
+// describes a tuple shape with two subshapes: the first a 2x3 F32,
+// and the other a 4x5 F64.
+//
+// The Python int corresponding to a PrimitiveType enum must be valid
+// per xla_data.proto (e.g. xla_data.PRED, xla_data.F32).
+//
+// The SWIG object wrappers generated by this file are not intended
+// for end use, but rather for internal use in the Python XLA client,
+// xla_client.py.
+//
+// One central reason for the Python-side indirection is that the
+// Python-side objects produced by the typemaps in this file are
+// further packaged up by xla_client before being passed on. For
+// instance, xla_client wraps the long produced for a C++
+// ComputationDataHandle in a Python ComputationDataHandle proto,
+// rather than exposing a raw long outside of the client. Similarly,
+// the Python pair produced for a C++ Shape is further wrapped in a
+// Python class (xla_client.Shape) so as not to expose the raw pair
+// externally.
+//
+// Other SWIG object wrappers (e.g. of LocalComputation) are further
+// wrapped by xla_client in order to set up a custom destructor that
+// triggers memory deallocation on the C++ side.
+
+%module(threads="1") local_computation_builder
+
+// Keep the GIL except where explicitly specified.
+%nothread;
+
+%include "tensorflow/python/platform/base.i"
+
+%{
+// Must be included first
+#include "tensorflow/python/lib/core/numpy.h"
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/compiler/xla/python/numpy_bridge.h"
+#include "tensorflow/compiler/xla/python/local_computation_builder.h"
+
+using namespace xla;
+using namespace xla::swig;
+
+namespace xla {
+namespace swig {
+
+bool GetIntAttr(PyObject* o, const char* field, int64* result) {
+  PyObject* fo = PyObject_GetAttrString(o, field);
+  if (!fo) {
+    return false;
+  }
+  const int64 value = numpy::PyIntOrPyLongToLong(fo);
+  if (value == -1 && PyErr_Occurred()) {
+    Py_DECREF(fo);
+    return false;
+  }
+  Py_DECREF(fo);
+  *result = value;
+  return true;
+}
+
+}
+}
+%}
+
+// Required to use PyArray_* functions.
+%init %{
+tensorflow::ImportNumpy();
+%}
+
+// ComputationDataHandle
+
+%typemap(in) const ComputationDataHandle& (ComputationDataHandle temp) {
+  const int64 handle = numpy::PyIntOrPyLongToLong($input);
+  if (handle == -1 && PyErr_Occurred()) {
+    return NULL;
+  }
+  temp.set_handle(handle);
+  $1 = &temp;
+}
+
+%typemap(out) ComputationDataHandle {
+  $result = numpy::LongToPyIntOrPyLong($1.handle());
+}
+
+%typemap(out) StatusOr<xla::swig::CompiledLocalComputation*> {
+  if ($1.ok()) {
+    auto* value = $1.ValueOrDie();
+    {
+      auto* $1 = value;
+      $typemap(out, xla::swig::CompiledLocalComputation*)
+    }
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    return NULL;
+  }
+}
+
+%typemap(out) StatusOr< std::unique_ptr<Literal> > {
+  if ($1.ok()) {
+    std::unique_ptr<Literal> value = $1.ConsumeValueOrDie();
+    $result = numpy::PyObjectFromXlaLiteral(*value);
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    return NULL;
+  }
+}
+
+%typemap(out) StatusOr<xla::swig::LocalComputation*> {
+  if ($1.ok()) {
+    auto* value = $1.ValueOrDie();
+    {
+      auto* $1 = value;
+      $typemap(out, xla::swig::LocalComputation*)
+    }
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    return NULL;
+  }
+}
+
+%typemap(out) StatusOr<Shape> {
+  if ($1.ok()) {
+    $result = numpy::PyShapeInfoFromXlaShape($1.ConsumeValueOrDie());
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    return NULL;
+  }
+}
+
+%typemap(out) Status {
+  if (!$1.ok()) {
+    PyErr_SetString(
+        PyExc_RuntimeError, $1.ToString().c_str());
+    return NULL;
+  }
+  $result = Py_None;
+}
+
+// ArraySlice<int64>
+
+%typemap(in) tensorflow::gtl::ArraySlice<int64>
+    (std::vector<int64> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    return NULL;
+  }
+  const int size = PySequence_Size($input);
+  temps.resize(size);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    PyObject* py_int = numpy::PyNumberToPyInt(o);
+    if (!py_int) {
+      PyErr_SetString(
+          PyExc_TypeError,
+          "Argument sequence element cannot be converted to int");
+      Py_DECREF(o);
+      return NULL;
+    }
+    temps[i] = numpy::PyIntOrPyLongToLong(py_int);
+    if (temps[i] == -1 && PyErr_Occurred()) {
+      Py_DECREF(py_int);
+      Py_DECREF(o);
+      return NULL;
+    }
+    Py_DECREF(py_int);
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
+
+// ComputationDataHandle
+
+%typemap(in) tensorflow::gtl::ArraySlice<ComputationDataHandle>
+    (std::vector<ComputationDataHandle> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    return NULL;
+  }
+  const int size = PySequence_Size($input);
+  temps.resize(size);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    PyObject* py_int = numpy::PyNumberToPyInt(o);
+    if (!py_int) {
+      PyErr_SetString(
+          PyExc_TypeError,
+          "Argument sequence element cannot be converted to int");
+      return NULL;
+    }
+    const int64 handle = numpy::PyIntOrPyLongToLong(py_int);
+    if (handle == -1 && PyErr_Occurred()) {
+      Py_DECREF(py_int);
+      Py_DECREF(o);
+      return NULL;
+    }
+    temps[i].set_handle(handle);
+    Py_DECREF(py_int);
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
+
+// LocalShapedBuffer*
+
+%typemap(in) tensorflow::gtl::ArraySlice<xla::swig::LocalShapedBuffer*>
+    (std::vector<LocalShapedBuffer*> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    return NULL;
+  }
+  const int size = PySequence_Size($input);
+  temps.reserve(size);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    LocalShapedBuffer* lsbp;
+    if ((SWIG_ConvertPtr(o, (void**) &lsbp, $descriptor(xla::swig::LocalShapedBuffer*),
+                         SWIG_POINTER_EXCEPTION)) == -1) {
+      return NULL;
+    }
+    temps.push_back(lsbp);
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
+
+// Literal
+
+%typemap(in) const Literal& (StatusOr< std::unique_ptr<Literal> > literal_status) {
+  literal_status = numpy::XlaLiteralFromPyObject($input);
+  if (!literal_status.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, literal_status.status().ToString().c_str());
+    return NULL;
+  }
+  $1 = literal_status.ValueOrDie().get();
+}
+
+%typemap(out) std::unique_ptr<Literal> {
+  $result = numpy::PyObjectFromXlaLiteral(*$1);
+}
+
+%typemap(out) StatusOr< std::unique_ptr<Literal> > {
+  if (!$1.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    return NULL;
+  }
+  $result = numpy::PyObjectFromXlaLiteral(*$1.ValueOrDie());
+}
+
+%typemap(in) const std::vector<Literal>& (std::vector<Literal> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    return NULL;
+  }
+  const int size = PySequence_Size($input);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    StatusOr< std::unique_ptr<Literal> > literal_status = numpy::XlaLiteralFromPyObject(o);
+    if (!literal_status.ok()) {
+      PyErr_SetString(PyExc_RuntimeError, literal_status.status().ToString().c_str());
+      Py_DECREF(o);
+      return NULL;
+    }
+    temps.push_back(std::move(*literal_status.ConsumeValueOrDie()));
+    Py_DECREF(o);
+  }
+  $1 = &temps;
+}
+
+// OpMetadata
+
+%typemap(in) const OpMetadata& (OpMetadata temp) {
+  StatusOr<OpMetadata> statusor = numpy::OpMetadataFromPyObject($input);
+  if (!statusor.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+    return NULL;
+  }
+  temp = std::move(statusor).ValueOrDie();
+  $1 = &temp;
+}
+
+// Shape
+
+%typemap(in) const Shape& (Shape temp) {
+  StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
+  if (!statusor.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+    return NULL;
+  }
+  temp = std::move(statusor).ValueOrDie();
+  $1 = &temp;
+}
+
+%typemap(in) const tensorflow::gtl::optional<Shape>& (
+    tensorflow::gtl::optional<Shape> temp) {
+  if ($input == Py_None) {
+    temp = tensorflow::gtl::nullopt;
+    $1 = &temp;
+  } else {
+    StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
+    if (!statusor.ok()) {
+      PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+      return NULL;
+    }
+    temp = std::move(statusor).ValueOrDie();
+    $1 = &temp;
+  }
+}
+
+%typemap(out) std::unique_ptr<Shape> {
+  $result = numpy::PyShapeInfoFromXlaShape(*$1);
+}
+
+%typemap(in) const std::vector<Shape>& (std::vector<Shape> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    return NULL;
+  }
+  const int size = PySequence_Size($input);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape(o);
+    Py_DECREF(o);
+    if (!statusor.ok()) {
+      PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+      return NULL;
+    }
+    temps.push_back(statusor.ConsumeValueOrDie());
+  }
+  $1 = &temps;
+}
+
+%typemap(in) const std::vector<tensorflow::gtl::optional<Shape> >& (
+    std::vector<tensorflow::gtl::optional<Shape> > temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    return NULL;
+  }
+  const int size = PySequence_Size($input);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    if (o == Py_None) {
+      temps.push_back(tensorflow::gtl::nullopt);
+    } else {
+      StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape(o);
+      Py_DECREF(o);
+      if (!statusor.ok()) {
+        PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+        return NULL;
+      }
+      temps.push_back(statusor.ConsumeValueOrDie());
+    }
+  }
+  $1 = &temps;
+}
+
+// PrimitiveType
+
+%typemap(in) PrimitiveType {
+  PyObject* py_int = numpy::PyNumberToPyInt($input);
+  if (!py_int) {
+    PyErr_SetString(PyExc_TypeError, "Argument cannot be converted to int");
+    return NULL;
+  }
+  const long value = numpy::PyIntOrPyLongToLong(py_int);
+  if (value == -1 && PyErr_Occurred()) {
+    Py_DECREF(py_int);
+    return NULL;
+  }
+  if (!PrimitiveType_IsValid(value)) {
+    PyErr_SetString(
+        PyExc_TypeError, "Argument not valid for PrimitiveType enum");
+    Py_DECREF(py_int);
+    return NULL;
+  }
+  $1 = static_cast<PrimitiveType>(value);
+}
+
+// ArraySlice<pair<int64, in64>>
+
+%typemap(in) tensorflow::gtl::ArraySlice<std::pair<int64, int64> >
+    (std::vector<std::pair<int64, int64> > temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    return NULL;
+  }
+  const int size = PySequence_Size($input);
+  temps.reserve(size);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    if (!o) {
+      return NULL;
+    }
+    PyObject* first = PyTuple_GetItem(o, 0);
+    if (!first) {
+      Py_DECREF(o);
+      return NULL;
+    }
+    PyObject* first_pyint = numpy::PyNumberToPyInt(first);
+    if (!first_pyint) {
+      PyErr_SetString(
+          PyExc_TypeError,
+          "First pair item cannot be converted to int");
+      Py_DECREF(o);
+      return NULL;
+    }
+    PyObject* second = PyTuple_GetItem(o, 1);
+    if (!second) {
+      Py_DECREF(o);
+      Py_DECREF(first_pyint);
+      return NULL;
+    }
+    PyObject* second_pyint = numpy::PyNumberToPyInt(second);
+    if (!second_pyint) {
+      PyErr_SetString(
+          PyExc_TypeError,
+          "Second pair item cannot be converted to int");
+      Py_DECREF(o);
+      Py_DECREF(first_pyint);
+      return NULL;
+    }
+    const int64 first_value = numpy::PyIntOrPyLongToLong(first_pyint);
+    if (first_value == -1 && PyErr_Occurred()) {
+      Py_DECREF(o);
+      Py_DECREF(first_pyint);
+      Py_DECREF(second_pyint);
+      return NULL;
+    }
+    const int64 second_value = numpy::PyIntOrPyLongToLong(second_pyint);
+    if (second_value == -1 && PyErr_Occurred()) {
+      Py_DECREF(o);
+      Py_DECREF(first_pyint);
+      Py_DECREF(second_pyint);
+      return NULL;
+    }
+    temps.push_back(std::make_pair(first_value, second_value));
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
+
+// DotDimensionNumbers
+
+%typemap(in) const DotDimensionNumbers&
+    (DotDimensionNumbers dimension_numbers) {
+  int length;
+
+  /* lhs_contracting_dimensions */
+  PyObject* lhs_contracting_dimensions = PyObject_GetAttrString(
+      $input, "lhs_contracting_dimensions");
+  if (!lhs_contracting_dimensions) {
+    return NULL;
+  }
+
+  length = PySequence_Size(lhs_contracting_dimensions);
+  if (length == -1) {
+    Py_DECREF(lhs_contracting_dimensions);
+    return NULL;
+  }
+
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(lhs_contracting_dimensions, i);
+    if (!item) {
+      Py_DECREF(lhs_contracting_dimensions);
+      return NULL;
+    }
+    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
+    if (dimension == -1 && PyErr_Occurred()) {
+      Py_DECREF(item);
+      Py_DECREF(lhs_contracting_dimensions);
+      return NULL;
+    }
+    dimension_numbers.add_lhs_contracting_dimensions(dimension);
+    Py_DECREF(item);
+  }
+  Py_DECREF(lhs_contracting_dimensions);
+
+  /* rhs_contracting_dimensions */
+  PyObject* rhs_contracting_dimensions = PyObject_GetAttrString(
+      $input, "rhs_contracting_dimensions");
+  if (!lhs_contracting_dimensions) {
+    return NULL;
+  }
+
+  length = PySequence_Size(rhs_contracting_dimensions);
+  if (length == -1) {
+    Py_DECREF(rhs_contracting_dimensions);
+    return NULL;
+  }
+
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(rhs_contracting_dimensions, i);
+    if (!item) {
+      Py_DECREF(rhs_contracting_dimensions);
+      return NULL;
+    }
+    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
+    if (dimension == -1 && PyErr_Occurred()) {
+      Py_DECREF(item);
+      Py_DECREF(rhs_contracting_dimensions);
+      return NULL;
+    }
+    dimension_numbers.add_rhs_contracting_dimensions(dimension);
+    Py_DECREF(item);
+  }
+  Py_DECREF(rhs_contracting_dimensions);
+
+  /* lhs_batch_dimensions */
+  PyObject* lhs_batch_dimensions = PyObject_GetAttrString(
+      $input, "lhs_batch_dimensions");
+  if (!lhs_batch_dimensions) {
+    return NULL;
+  }
+
+  length = PySequence_Size(lhs_batch_dimensions);
+  if (length == -1) {
+    Py_DECREF(lhs_batch_dimensions);
+    return NULL;
+  }
+
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(lhs_batch_dimensions, i);
+    if (!item) {
+      Py_DECREF(lhs_batch_dimensions);
+      return NULL;
+    }
+    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
+    if (dimension == -1 && PyErr_Occurred()) {
+      Py_DECREF(item);
+      Py_DECREF(lhs_batch_dimensions);
+      return NULL;
+    }
+    dimension_numbers.add_lhs_batch_dimensions(dimension);
+    Py_DECREF(item);
+  }
+  Py_DECREF(lhs_batch_dimensions);
+
+  /* rhs_batch_dimensions */
+  PyObject* rhs_batch_dimensions = PyObject_GetAttrString(
+      $input, "rhs_batch_dimensions");
+  if (!rhs_batch_dimensions) {
+    return NULL;
+  }
+
+  length = PySequence_Size(rhs_batch_dimensions);
+  if (length == -1) {
+    Py_DECREF(rhs_batch_dimensions);
+    return NULL;
+  }
+
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(rhs_batch_dimensions, i);
+    if (!item) {
+      Py_DECREF(rhs_batch_dimensions);
+      return NULL;
+    }
+    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
+    if (dimension == -1 && PyErr_Occurred()) {
+      Py_DECREF(item);
+      Py_DECREF(rhs_batch_dimensions);
+      return NULL;
+    }
+    dimension_numbers.add_rhs_batch_dimensions(dimension);
+    Py_DECREF(item);
+  }
+  Py_DECREF(rhs_batch_dimensions);
+
+  $1 = &dimension_numbers;
+}
+
+// PaddingConfig
+
+%typemap(in) const PaddingConfig&
+    (PaddingConfig padding_config) {
+  PyObject* dimensions = PyObject_GetAttrString($input, "dimensions");
+  if (!dimensions) {
+    return NULL;
+  }
+
+  int length = PySequence_Size(dimensions);
+  if (length == -1) {
+    Py_DECREF(dimensions);
+    return NULL;
+  }
+
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(dimensions, i);
+    if (!item) {
+      Py_DECREF(dimensions);
+      return NULL;
+    }
+    int64 edge_padding_low, edge_padding_high, interior_padding;
+    if (!GetIntAttr(item, "edge_padding_low", &edge_padding_low)
+        || !GetIntAttr(item, "edge_padding_high", &edge_padding_high)
+        || !GetIntAttr(item, "interior_padding", &interior_padding)) {
+      Py_DECREF(item);
+      Py_DECREF(dimensions);
+      return NULL;
+    }
+    Py_DECREF(item);
+
+    PaddingConfig::PaddingConfigDimension* dimension =
+        padding_config.add_dimensions();
+    dimension->set_edge_padding_low(edge_padding_low);
+    dimension->set_edge_padding_high(edge_padding_high);
+    dimension->set_interior_padding(interior_padding);
+  }
+  Py_DECREF(dimensions);
+
+  $1 = &padding_config;
+}
+
+// ConvolutionDimensionNumbers
+
+%typemap(in) const ConvolutionDimensionNumbers&
+    (ConvolutionDimensionNumbers dimension_numbers) {
+  int64 value;
+
+  if (!GetIntAttr($input, "input_batch_dimension", &value)) {
+    return NULL;
+  }
+  dimension_numbers.set_input_batch_dimension(value);
+
+  if (!GetIntAttr($input, "input_feature_dimension", &value)) {
+    return NULL;
+  }
+  dimension_numbers.set_input_feature_dimension(value);
+
+  if (!GetIntAttr($input, "output_batch_dimension", &value)) {
+    return NULL;
+  }
+  dimension_numbers.set_output_batch_dimension(value);
+
+  if (!GetIntAttr($input, "output_feature_dimension", &value)) {
+    return NULL;
+  }
+  dimension_numbers.set_output_feature_dimension(value);
+
+  if (!GetIntAttr($input, "kernel_output_feature_dimension", &value)) {
+    return NULL;
+  }
+  dimension_numbers.set_kernel_output_feature_dimension(value);
+
+  if (!GetIntAttr($input, "kernel_input_feature_dimension", &value)) {
+    return NULL;
+  }
+  dimension_numbers.set_kernel_input_feature_dimension(value);
+
+  PyObject* o;
+  int length;
+
+  o = PyObject_GetAttrString($input, "input_spatial_dimensions");
+  if (!o) {
+    return NULL;
+  }
+  length = PySequence_Size(o);
+  if (length == -1) {
+    Py_DECREF(o);
+    return NULL;
+  }
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(o, i);
+    if (!item) {
+      Py_DECREF(o);
+      return NULL;
+    }
+    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
+    if (dimension == -1 && PyErr_Occurred()) {
+      Py_DECREF(item);
+      Py_DECREF(o);
+      return NULL;
+    }
+    dimension_numbers.add_input_spatial_dimensions(dimension);
+    Py_DECREF(item);
+  }
+  Py_DECREF(o);
+
+  o = PyObject_GetAttrString($input, "kernel_spatial_dimensions");
+  if (!o) {
+    return NULL;
+  }
+  length = PySequence_Size(o);
+  if (length == -1) {
+    Py_DECREF(o);
+    return NULL;
+  }
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(o, i);
+    if (!item) {
+      Py_DECREF(o);
+      return NULL;
+    }
+    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
+    if (dimension == -1 && PyErr_Occurred()) {
+      Py_DECREF(item);
+      Py_DECREF(o);
+      return NULL;
+    }
+    dimension_numbers.add_kernel_spatial_dimensions(dimension);
+    Py_DECREF(item);
+  }
+  Py_DECREF(o);
+
+  o = PyObject_GetAttrString($input, "output_spatial_dimensions");
+  if (!o) {
+    return NULL;
+  }
+  length = PySequence_Size(o);
+  if (length == -1) {
+    Py_DECREF(o);
+    return NULL;
+  }
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(o, i);
+    if (!item) {
+      Py_DECREF(o);
+      return NULL;
+    }
+    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
+    if (dimension == -1 && PyErr_Occurred()) {
+      Py_DECREF(item);
+      Py_DECREF(o);
+      return NULL;
+    }
+    dimension_numbers.add_output_spatial_dimensions(dimension);
+    Py_DECREF(item);
+  }
+  Py_DECREF(o);
+
+  $1 = &dimension_numbers;
+}
+
+// ExecutableBuildOptions
+
+%typemap(in) const ExecutableBuildOptions*
+    (ExecutableBuildOptions build_options) {
+  if ($input == Py_None) {
+    $1 = NULL;
+  } else {
+    PyObject* o = PyObject_GetAttrString($input, "generate_hlo_graph");
+    if (!o) {
+      return NULL;
+    }
+    if (o != Py_None) {
+      if (!PyString_Check(o)) {
+        PyErr_SetString(PyExc_TypeError, "ExecutableBuildOptions.generate_hlo_graph must be a string or None.");
+        return NULL;
+      }
+      build_options.set_generate_hlo_graph(PyString_AsString(o));
+    }
+    Py_DECREF(o);
+
+    $1 = &build_options;
+  }
+}
+
+%ignoreall
+%unignore xla;
+%unignore xla::swig;
+%unignore xla::swig::InitializeReplicaCount;
+%unignore xla::swig::GetReplicaCount;
+%unignore xla::swig::TransferToInfeedLocal;
+%unignore xla::swig::TransferToInfeedLocalReplica;
+%unignore xla::swig::TransferFromOutfeedLocalReplica;
+%unignore xla::swig::LocalShapedBuffer;
+%unignore xla::swig::LocalShapedBuffer::FromLiteral;
+%unignore xla::swig::LocalShapedBuffer::ToLiteral;
+%unignore xla::swig::CompiledLocalComputation;
+%unignore xla::swig::CompiledLocalComputation::Execute;
+%unignore xla::swig::CompiledLocalComputation::ExecuteWithShapedBuffers;
+%unignore xla::swig::LocalComputation;
+%unignore xla::swig::LocalComputation::Compile;
+%unignore xla::swig::LocalComputationBuilder;
+%unignore xla::swig::LocalComputationBuilder::LocalComputationBuilder;
+%unignore xla::swig::LocalComputationBuilder::Build;
+%unignore xla::swig::LocalComputationBuilder::SetOpMetadata;
+%unignore xla::swig::LocalComputationBuilder::ClearOpMetadata;
+%unignore xla::swig::LocalComputationBuilder::Parameter;
+%unignore xla::swig::LocalComputationBuilder::GetShape;
+%unignore xla::swig::LocalComputationBuilder::GetReturnValueShape;
+%unignore xla::swig::LocalComputationBuilder::Infeed;
+%unignore xla::swig::LocalComputationBuilder::Outfeed;
+%unignore xla::swig::LocalComputationBuilder::ConstantLiteral;
+%unignore xla::swig::LocalComputationBuilder::ConstantR0;
+%unignore xla::swig::LocalComputationBuilder::Broadcast;
+%unignore xla::swig::LocalComputationBuilder::Pad;
+%unignore xla::swig::LocalComputationBuilder::Reshape;
+%unignore xla::swig::LocalComputationBuilder::Collapse;
+%unignore xla::swig::LocalComputationBuilder::CrossReplicaSum;
+%unignore xla::swig::LocalComputationBuilder::Slice;
+%unignore xla::swig::LocalComputationBuilder::DynamicSlice;
+%unignore xla::swig::LocalComputationBuilder::DynamicUpdateSlice;
+%unignore xla::swig::LocalComputationBuilder::ConcatInDim;
+%unignore xla::swig::LocalComputationBuilder::SelectAndScatterWithGeneralPadding;
+%unignore xla::swig::LocalComputationBuilder::Select;
+%unignore xla::swig::LocalComputationBuilder::Tuple;
+%unignore xla::swig::LocalComputationBuilder::GetTupleElement;
+%unignore xla::swig::LocalComputationBuilder::ConvertElementType;
+%unignore xla::swig::LocalComputationBuilder::Call;
+%unignore xla::swig::LocalComputationBuilder::Transpose;
+%unignore xla::swig::LocalComputationBuilder::Rev;
+%unignore xla::swig::LocalComputationBuilder::Clamp;
+%unignore xla::swig::LocalComputationBuilder::Map;
+%unignore xla::swig::LocalComputationBuilder::Reduce;
+%unignore xla::swig::LocalComputationBuilder::ReduceWindowWithGeneralPadding;
+%unignore xla::swig::LocalComputationBuilder::RngNormal;
+%unignore xla::swig::LocalComputationBuilder::RngUniform;
+%unignore xla::swig::LocalComputationBuilder::RngBernoulli;
+%unignore xla::swig::LocalComputationBuilder::While;
+%unignore xla::swig::LocalComputationBuilder::Conditional;
+%unignore xla::swig::LocalComputationBuilder::Eq;
+%unignore xla::swig::LocalComputationBuilder::Ne;
+%unignore xla::swig::LocalComputationBuilder::Ge;
+%unignore xla::swig::LocalComputationBuilder::Gt;
+%unignore xla::swig::LocalComputationBuilder::Lt;
+%unignore xla::swig::LocalComputationBuilder::Le;
+%unignore xla::swig::LocalComputationBuilder::Dot;
+%unignore xla::swig::LocalComputationBuilder::DotGeneral;
+%unignore xla::swig::LocalComputationBuilder::ConvGeneralDilated;
+%unignore xla::swig::LocalComputationBuilder::Add;
+%unignore xla::swig::LocalComputationBuilder::Sub;
+%unignore xla::swig::LocalComputationBuilder::Mul;
+%unignore xla::swig::LocalComputationBuilder::Div;
+%unignore xla::swig::LocalComputationBuilder::Rem;
+%unignore xla::swig::LocalComputationBuilder::Max;
+%unignore xla::swig::LocalComputationBuilder::Min;
+%unignore xla::swig::LocalComputationBuilder::And;
+%unignore xla::swig::LocalComputationBuilder::Or;
+%unignore xla::swig::LocalComputationBuilder::Not;
+%unignore xla::swig::LocalComputationBuilder::Abs;
+%unignore xla::swig::LocalComputationBuilder::Exp;
+%unignore xla::swig::LocalComputationBuilder::Floor;
+%unignore xla::swig::LocalComputationBuilder::Ceil;
+%unignore xla::swig::LocalComputationBuilder::Round;
+%unignore xla::swig::LocalComputationBuilder::Log;
+%unignore xla::swig::LocalComputationBuilder::Sign;
+%unignore xla::swig::LocalComputationBuilder::Cos;
+%unignore xla::swig::LocalComputationBuilder::Sin;
+%unignore xla::swig::LocalComputationBuilder::Tanh;
+%unignore xla::swig::LocalComputationBuilder::SqrtF32;
+%unignore xla::swig::LocalComputationBuilder::SquareF32;
+%unignore xla::swig::LocalComputationBuilder::Pow;
+%unignore xla::swig::LocalComputationBuilder::IsFinite;
+%unignore xla::swig::LocalComputationBuilder::ReciprocalF32;
+%unignore xla::swig::LocalComputationBuilder::Neg;
+%unignore xla::swig::LocalComputationBuilder::Sort;
+%unignore xla::swig::DeleteLocalShapedBuffer;
+%unignore xla::swig::DeleteLocalComputation;
+%unignore xla::swig::DeleteCompiledLocalComputation;
+
+%thread;
+%include "tensorflow/compiler/xla/python/local_computation_builder.h"
+%nothread;
+
+%unignoreall
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3d87480728aab1d4ebbc71c6c7504d37cae5edaf
--- /dev/null
+++ b/tensorflow/compiler/xla/python/numpy_bridge.cc
@@ -0,0 +1,517 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/python/numpy_bridge.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+namespace swig {
+
+namespace numpy {
+
+int PrimitiveTypeToNumpyType(PrimitiveType primitive_type) {
+  switch (primitive_type) {
+    case PRED:
+      return NPY_BOOL;
+    case S8:
+      return NPY_INT8;
+    case S16:
+      return NPY_INT16;
+    case S32:
+      return NPY_INT32;
+    case S64:
+      return NPY_INT64;
+    case U8:
+      return NPY_UINT8;
+    case U16:
+      return NPY_UINT16;
+    case U32:
+      return NPY_UINT32;
+    case U64:
+      return NPY_UINT64;
+    case F16:
+      return NPY_FLOAT16;
+    case F32:
+      return NPY_FLOAT32;
+    case F64:
+      return NPY_FLOAT64;
+    case TUPLE:
+      return NPY_OBJECT;
+    default:
+      LOG(FATAL) << "No Numpy type for XLA primitive type " << primitive_type;
+  }
+}
+
+PrimitiveType NumpyTypeToPrimitiveType(int np_type) {
+  switch (np_type) {
+    case NPY_BOOL:
+      return PRED;
+    case NPY_INT8:
+      return S8;
+    case NPY_INT16:
+      return S16;
+    case NPY_INT32:
+      return S32;
+    case NPY_INT64:
+      return S64;
+    case NPY_UINT8:
+      return U8;
+    case NPY_UINT16:
+      return U16;
+    case NPY_UINT32:
+      return U32;
+    case NPY_UINT64:
+      return U64;
+    case NPY_FLOAT16:
+      return F16;
+    case NPY_FLOAT32:
+      return F32;
+    case NPY_FLOAT64:
+      return F64;
+    case NPY_OBJECT:
+      return TUPLE;
+    default:
+      LOG(FATAL) << "No XLA primitive type for Numpy type " << np_type;
+  }
+}
+
+bool NumpyTypeIsValid(int np_type) {
+  switch (np_type) {
+    case NPY_BOOL:
+    case NPY_INT8:
+    case NPY_INT16:
+    case NPY_INT32:
+    case NPY_INT64:
+    case NPY_UINT8:
+    case NPY_UINT16:
+    case NPY_UINT32:
+    case NPY_UINT64:
+    case NPY_FLOAT16:
+    case NPY_FLOAT32:
+    case NPY_FLOAT64:
+    case NPY_OBJECT:
+      return true;
+    default:
+      return false;
+  }
+}
+
+PyObject* PyShapeInfoFromXlaShape(const Shape& shape) {
+  int np_typenum = PrimitiveTypeToNumpyType(shape.element_type());
+  PyArray_Descr* np_dtype = PyArray_DescrFromType(np_typenum);
+
+  PyObject* dimensions;
+  if (ShapeUtil::IsTuple(shape)) {
+    int num_elements = ShapeUtil::TupleElementCount(shape);
+    dimensions = PyTuple_New(ShapeUtil::TupleElementCount(shape));
+    for (int i = 0; i < num_elements; ++i) {
+      PyTuple_SET_ITEM(
+          dimensions, i,
+          PyShapeInfoFromXlaShape(ShapeUtil::GetTupleElementShape(shape, i)));
+    }
+  } else {
+    int rank = ShapeUtil::Rank(shape);
+    dimensions = PyTuple_New(rank);
+    for (int i = 0; i < rank; ++i) {
+      PyTuple_SET_ITEM(dimensions, i,
+                       LongToPyIntOrPyLong(ShapeUtil::GetDimension(shape, i)));
+    }
+  }
+  return PyTuple_Pack(2, np_dtype, dimensions);
+}
+
+// Precondition: o->ob_type == &PyArrayDescr_Type
+static int NumpyTypenum(PyObject* o) {
+  return reinterpret_cast<PyArray_Descr*>(o)->type_num;
+}
+
+// Extracts the string held inside r and returns it as a C++ string.
+//
+// NOTE: this is an internal helper for conversion to a C++, and so decrefs r.
+static string ExtractStringAndDecref(PyObject* r) {
+  auto error = [r] {
+    return tensorflow::strings::Printf("<failed conversion of %p>", r);
+  };
+  if (r == nullptr) {
+    return error();
+  }
+#if PY_MAJOR_VERSION < 3
+  string result = PyString_AsString(r);
+#else
+  PyObject* bytes = PyUnicode_AsEncodedString(r, 0, 0);
+  if (bytes == nullptr) {
+    return error();
+  }
+  CHECK(PyBytes_Check(bytes));
+  string result = PyBytes_AsString(bytes);
+  Py_DECREF(bytes);
+#endif
+  Py_DECREF(r);
+  return result;
+}
+
+// Safely returns a str of the given Python object o as a C++ string.
+static string PyObjectCppStr(PyObject* o) {
+  PyObject* s = PyObject_Str(o);
+  return ExtractStringAndDecref(s);
+}
+
+// Safely returns a repr of the given Python object o as a C++ string.
+static string PyObjectCppRepr(PyObject* o) {
+  PyObject* r = PyObject_Repr(o);
+  return ExtractStringAndDecref(r);
+}
+
+StatusOr<Shape> XlaShapeFromPyShape(PyObject* o) {
+  auto error = [o](const string& prefix) {
+    return InvalidArgument("%s; got %s", prefix.c_str(),
+                           PyObjectCppRepr(o).c_str());
+  };
+
+  auto get_attr = [o, &error](const string& field) -> StatusOr<PyObject*> {
+    PyObject* result =
+        PyObject_GetAttrString(o, const_cast<char*>(field.c_str()));
+    if (result == nullptr) {
+      return error(tensorflow::strings::StrCat(
+          "Failed to get attribute of Shape object:", field));
+    }
+    return result;
+  };
+
+  auto call_method = [o, &error](const string& method) -> StatusOr<PyObject*> {
+    PyObject* result =
+        PyObject_CallMethod(o, const_cast<char*>(method.c_str()), nullptr);
+    if (result == nullptr) {
+      return error(tensorflow::strings::StrCat(
+          "Failed to call method of shape object:", method));
+    }
+    return result;
+  };
+
+  PyObject* np_type;
+  TF_ASSIGN_OR_RETURN(np_type, get_attr("np_dtype"));
+  if (np_type->ob_type != &PyArrayDescr_Type) {
+    return error("Shape attribute np_dtype is not an integer numpy dtype");
+  }
+  if (!NumpyTypeIsValid(NumpyTypenum(np_type))) {
+    return error("Shape attribute np_dtype is not a valid integer numpy dtype");
+  }
+  const PrimitiveType element_type =
+      NumpyTypeToPrimitiveType(NumpyTypenum(np_type));
+  Py_DECREF(np_type);
+
+  if (element_type == TUPLE) {
+    PyObject* py_subshapes;
+    TF_ASSIGN_OR_RETURN(py_subshapes, call_method("tuple_shapes"));
+    if (!PyTuple_Check(py_subshapes)) {
+      return error(
+          "Return value of Shape method tuple_shapes() is not a tuple");
+    }
+    const int length = PyTuple_Size(py_subshapes);
+    std::vector<Shape> subshapes;
+    subshapes.reserve(length);
+    for (int i = 0; i < length; i++) {
+      TF_ASSIGN_OR_RETURN(
+          const Shape& subshape,
+          XlaShapeFromPyShape(PyTuple_GetItem(py_subshapes, i)));
+      subshapes.push_back(subshape);
+    }
+    Py_DECREF(py_subshapes);
+    return ShapeUtil::MakeTupleShape(subshapes);
+  } else {
+    PyObject* py_dimensions;
+    PyObject* py_minor_to_major;
+    TF_ASSIGN_OR_RETURN(py_dimensions, call_method("dimensions"));
+    TF_ASSIGN_OR_RETURN(py_minor_to_major, call_method("minor_to_major"));
+    if (!PyTuple_Check(py_dimensions)) {
+      return error("Return value of Shape method dimensions() is not a tuple");
+    }
+    if (py_minor_to_major != Py_None && !PyTuple_Check(py_minor_to_major)) {
+      return error(
+          "Return value of Shape method minor_to_major() is neither a tuple "
+          "nor None");
+    }
+    const int length = PyTuple_Size(py_dimensions);
+    if (py_minor_to_major != Py_None &&
+        length != PyTuple_Size(py_minor_to_major)) {
+      return error(
+          "Shape methods dimensions() and minor_to_major() return "
+          "different-length tuples");
+    }
+    std::vector<int64> dimensions(length);
+    std::vector<int64> minor_to_major(length);
+    for (int i = 0; i < length; i++) {
+      dimensions[i] = PyIntOrPyLongToLong(PyTuple_GetItem(py_dimensions, i));
+      if (dimensions[i] == -1 && PyErr_Occurred()) {
+        return error("Dimension is not an int");
+      }
+
+      if (py_minor_to_major != Py_None) {
+        minor_to_major[i] =
+            PyIntOrPyLongToLong(PyTuple_GetItem(py_minor_to_major, i));
+        if (minor_to_major[i] == -1 && PyErr_Occurred()) {
+          return error("Minor-to-major value is not an int");
+        }
+      }
+    }
+    bool with_layout = py_minor_to_major != Py_None;
+    Py_DECREF(py_dimensions);
+    Py_DECREF(py_minor_to_major);
+    if (with_layout) {
+      return ShapeUtil::MakeShapeWithLayout(element_type, dimensions,
+                                            minor_to_major);
+    } else {
+      return ShapeUtil::MakeShape(element_type, dimensions);
+    }
+  }
+}
+
+// Helper that retrieves the member with attr_name, stringifies it if is not
+// None, and returns it as a C++ string.
+static tensorflow::gtl::optional<string> GetAttrAsString(
+    PyObject* o, const string& attr_name) {
+  if (!PyObject_HasAttrString(o, attr_name.c_str())) {
+    return tensorflow::gtl::nullopt;
+  }
+  PyObject* attr = PyObject_GetAttrString(o, attr_name.c_str());
+  if (attr == Py_None) {
+    Py_DECREF(attr);
+    return tensorflow::gtl::nullopt;
+  }
+  string result = PyObjectCppStr(attr);
+  Py_DECREF(attr);
+  return result;
+}
+
+// Helper that retrieves the member with attr_name, checks that it is an integer
+// if it is not None, and returns it as an int32 value.
+static tensorflow::gtl::optional<int32> GetAttrAsInt32(
+    PyObject* o, const string& attr_name) {
+  if (!PyObject_HasAttrString(o, attr_name.c_str())) {
+    return tensorflow::gtl::nullopt;
+  }
+  PyObject* attr = PyObject_GetAttrString(o, attr_name.c_str());
+  if (attr == Py_None) {
+    Py_DECREF(attr);
+    return tensorflow::gtl::nullopt;
+  }
+  if (!CheckPyIntOrLong(attr)) {
+    Py_DECREF(attr);
+    return tensorflow::gtl::nullopt;
+  }
+  long value = PyIntOrPyLongToLong(attr);  // NOLINT
+  Py_DECREF(attr);
+  if (value == -1 && PyErr_Occurred() != nullptr) {
+    return tensorflow::gtl::nullopt;
+  }
+  if (static_cast<int32>(value) != value) {
+    return tensorflow::gtl::nullopt;
+  }
+  return value;
+}
+
+StatusOr<OpMetadata> OpMetadataFromPyObject(PyObject* o) {
+  OpMetadata result;
+  tensorflow::gtl::optional<string> op_type = GetAttrAsString(o, "op_type");
+  if (op_type.has_value()) {
+    result.set_op_type(op_type.value());
+  }
+  tensorflow::gtl::optional<string> op_name = GetAttrAsString(o, "op_name");
+  if (op_name.has_value()) {
+    result.set_op_name(op_name.value());
+  }
+  tensorflow::gtl::optional<string> source_file =
+      GetAttrAsString(o, "source_file");
+  if (source_file.has_value()) {
+    result.set_source_file(source_file.value());
+  }
+  tensorflow::gtl::optional<int32> source_line =
+      GetAttrAsInt32(o, "source_line");
+  if (source_line.has_value()) {
+    result.set_source_line(source_line.value());
+  }
+  return result;
+}
+
+PyObject* PyObjectFromXlaLiteral(const Literal& literal) {
+  if (ShapeUtil::IsTuple(literal.shape())) {
+    int num_elements = ShapeUtil::TupleElementCount(literal.shape());
+    PyObject* tuple = PyTuple_New(num_elements);
+    for (int i = 0; i < num_elements; i++) {
+      PyTuple_SET_ITEM(
+          tuple, i, PyObjectFromXlaLiteral(LiteralView::Create(literal, {i})));
+    }
+    return tuple;
+  } else {
+    int rank = ShapeUtil::Rank(literal.shape());
+    std::vector<long> dimensions(rank);  // NOLINT - PyArray requires a long*
+    for (int i = 0; i < rank; i++) {
+      dimensions[i] = ShapeUtil::GetDimension(literal.shape(), i);
+    }
+    int np_type = PrimitiveTypeToNumpyType(literal.shape().element_type());
+    PyObject* array =
+        PyArray_EMPTY(rank, dimensions.data(), np_type, /*fortran=*/0);
+    CopyLiteralToNumpyArray(np_type, literal,
+                            reinterpret_cast<PyArrayObject*>(array));
+    return array;
+  }
+}
+
+StatusOr<std::unique_ptr<Literal>> XlaLiteralFromPyObject(PyObject* o) {
+  if (PyTuple_Check(o)) {
+    int num_elements = PyTuple_Size(o);
+    std::vector<std::unique_ptr<Literal>> elements;
+    elements.reserve(num_elements);
+    for (int i = 0; i < num_elements; i++) {
+      PyObject* element = PyTuple_GetItem(o, i);
+      TF_ASSIGN_OR_RETURN(auto literal, XlaLiteralFromPyObject(element));
+      elements.push_back(std::move(literal));
+    }
+    return Literal::MakeTupleOwned(std::move(elements));
+  } else if (PyArray_Check(o)) {
+    PyArrayObject* py_array = reinterpret_cast<PyArrayObject*>(o);
+    int rank = PyArray_NDIM(py_array);
+    std::vector<int64> dimensions(rank);
+    for (int i = 0; i < rank; i++) {
+      dimensions[i] = PyArray_DIM(py_array, i);
+    }
+    int np_type = PyArray_TYPE(py_array);
+    auto literal = Literal::CreateFromDimensions(
+        NumpyTypeToPrimitiveType(np_type), dimensions);
+    TF_RETURN_IF_ERROR(
+        CopyNumpyArrayToLiteral(np_type, py_array, literal.get()));
+    return std::move(literal);
+  } else {
+    return InvalidArgument(
+        "Non-tuple or Numpy array encountered in conversion to XLA literal.");
+  }
+}
+
+Status CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array,
+                               Literal* literal) {
+  switch (np_type) {
+    case NPY_BOOL:
+      CopyNumpyArrayToLiteral<bool>(py_array, literal);
+      break;
+    case NPY_INT32:
+      CopyNumpyArrayToLiteral<int32>(py_array, literal);
+      break;
+    case NPY_INT64:
+      CopyNumpyArrayToLiteral<int64>(py_array, literal);
+      break;
+    case NPY_UINT8:
+      CopyNumpyArrayToLiteral<uint8>(py_array, literal);
+      break;
+    case NPY_UINT32:
+      CopyNumpyArrayToLiteral<uint32>(py_array, literal);
+      break;
+    case NPY_UINT64:
+      CopyNumpyArrayToLiteral<uint64>(py_array, literal);
+      break;
+    case NPY_FLOAT16:
+      CopyNumpyArrayToLiteral<half>(py_array, literal);
+      break;
+    case NPY_FLOAT32:
+      CopyNumpyArrayToLiteral<float>(py_array, literal);
+      break;
+    case NPY_FLOAT64:
+      CopyNumpyArrayToLiteral<double>(py_array, literal);
+      break;
+    default:
+      return InvalidArgument(
+          "No XLA literal container for Numpy type number: %d", np_type);
+  }
+  return Status::OK();
+}
+
+void CopyLiteralToNumpyArray(int np_type, const Literal& literal,
+                             PyArrayObject* py_array) {
+  switch (np_type) {
+    case NPY_BOOL:
+      CopyLiteralToNumpyArray<bool>(literal, py_array);
+      break;
+    case NPY_INT32:
+      CopyLiteralToNumpyArray<int32>(literal, py_array);
+      break;
+    case NPY_INT64:
+      CopyLiteralToNumpyArray<int64>(literal, py_array);
+      break;
+    case NPY_UINT8:
+      CopyLiteralToNumpyArray<uint8>(literal, py_array);
+      break;
+    case NPY_UINT32:
+      CopyLiteralToNumpyArray<uint32>(literal, py_array);
+      break;
+    case NPY_UINT64:
+      CopyLiteralToNumpyArray<uint64>(literal, py_array);
+      break;
+    case NPY_FLOAT16:
+      CopyLiteralToNumpyArray<half>(literal, py_array);
+      break;
+    case NPY_FLOAT32:
+      CopyLiteralToNumpyArray<float>(literal, py_array);
+      break;
+    case NPY_FLOAT64:
+      CopyLiteralToNumpyArray<double>(literal, py_array);
+      break;
+    default:
+      LOG(FATAL) << "No XLA literal container for Numpy type" << np_type;
+  }
+}
+
+PyObject* LongToPyIntOrPyLong(long x) {  // NOLINT
+#if PY_MAJOR_VERSION < 3
+  return PyInt_FromLong(x);
+#else
+  return PyLong_FromLong(x);
+#endif
+}
+
+long PyIntOrPyLongToLong(PyObject* o) {  // NOLINT
+#if PY_MAJOR_VERSION < 3
+  return PyInt_AsLong(o);
+#else
+  return PyLong_AsLong(o);
+#endif
+}
+
+bool CheckPyIntOrLong(PyObject* o) {
+#if PY_MAJOR_VERSION < 3
+  return PyInt_Check(o);
+#else
+  if (!PyLong_Check(o)) {
+    return false;
+  }
+  int overflow = 0;
+  PyLong_AsLongAndOverflow(o, &overflow);
+  return (overflow == 0);
+#endif
+}
+
+PyObject* PyNumberToPyInt(PyObject* o) {
+#if PY_MAJOR_VERSION < 3
+  return PyNumber_Int(o);
+#else
+  return PyNumber_Long(o);
+#endif
+}
+
+}  // namespace numpy
+
+}  // namespace swig
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.h b/tensorflow/compiler/xla/python/numpy_bridge.h
new file mode 100644
index 0000000000000000000000000000000000000000..adfcc3b8588dce01718bb19dea936bace483be4d
--- /dev/null
+++ b/tensorflow/compiler/xla/python/numpy_bridge.h
@@ -0,0 +1,123 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// These functions transform Python/Numpy data structures to XLA data
+// structures and vice versa, performing copies where
+// appropriate. Python tuples and Numpy ndarrays translate to XLA
+// tuples and XLA literals, respectively, and Numpy shape/dtype
+// information is translated to XLA shape information.
+
+#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_NUMPY_BRIDGE_H_
+#define TENSORFLOW_COMPILER_XLA_PYTHON_NUMPY_BRIDGE_H_
+
+#include <algorithm>
+#include <memory>
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/python/lib/core/numpy.h"
+
+namespace xla {
+
+namespace swig {
+
+namespace numpy {
+
+// Maps XLA primitive types (PRED, S8, F32, ..., and TUPLE) to numpy
+// dtypes (NPY_BOOL, NPY_INT8, NPY_FLOAT32, ..., and NPY_OBJECT), and
+// vice versa.
+int PrimitiveTypeToNumpyType(PrimitiveType primitive_type);
+PrimitiveType NumpyTypeToPrimitiveType(int np_type);
+
+// Determines whether an integer-encoded Numpy dtype is valid,
+// i.e. has a supported conversion to an XLA PrimitiveType.
+bool NumpyTypeIsValid(int np_type);
+
+// Converts XLA shape information into a Python pair of the form
+// (numpy dtype, dimensions). If the XLA shape represents a tuple,
+// then the numpy dtype is NPY_OBJECT ('O') and `dimensions` is a
+// Python tuple of shape-description pairs, created
+// recursively. Otherwise, `dimensions` is a Python tuple-of-integers
+// providing the array dimensions.
+//
+// The return value is a new reference.
+PyObject* PyShapeInfoFromXlaShape(const Shape& shape);
+
+// Converts a Python object with a method interface mathing that of
+// xla_client.Shape into an XLA Shape object.
+//
+// The return value is a new reference.
+StatusOr<Shape> XlaShapeFromPyShape(PyObject* o);
+
+// Converts a PyObject that represents operation metadata into protocol buffer
+// form.
+StatusOr<OpMetadata> OpMetadataFromPyObject(PyObject* o);
+
+// Converts an XLA literal to a Python object, either a Numpy ndarray
+// or a nested Python tuple thereof.
+//
+// To avoid transferring ownership of the data buffers that underlie
+// PyArrays and XLA literals, this function makes deep copies of all
+// array data.
+//
+// The return value is a new reference.
+PyObject* PyObjectFromXlaLiteral(const Literal& literal);
+
+// Converts a Numpy ndarray or a nested Python tuple thereof to a
+// corresponding XLA literal.
+//
+// To avoid transferring ownership of the data buffers that underlie
+// PyArrays and XLA literals, this function makes deep copies of all
+// array data.
+StatusOr<std::unique_ptr<Literal> > XlaLiteralFromPyObject(PyObject* o);
+
+// The following functions copy array data from the buffers underlying Numpy
+// ndarrays into those underlying XLA literals, and vice versa.
+
+Status CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array,
+                               Literal* literal);
+
+void CopyLiteralToNumpyArray(int np_type, const Literal& literal,
+                             PyArrayObject* py_array);
+
+template <typename NativeT>
+void CopyNumpyArrayToLiteral(PyArrayObject* py_array, Literal* literal) {
+  NativeT* source = static_cast<NativeT*>(PyArray_DATA(py_array));
+  auto dest = literal->data<NativeT>();
+  std::copy(source, source + PyArray_SIZE(py_array), dest.data());
+}
+
+template <typename NativeT>
+void CopyLiteralToNumpyArray(const Literal& literal, PyArrayObject* py_array) {
+  NativeT* dest = static_cast<NativeT*>(PyArray_DATA(py_array));
+  auto source = literal.data<NativeT>();
+  std::copy(source.begin(), source.end(), dest);
+}
+
+// Workarounds for Python 2 and 3 interop
+
+PyObject* LongToPyIntOrPyLong(long x);  // NOLINT
+long PyIntOrPyLongToLong(PyObject* o);  // NOLINT
+bool CheckPyIntOrLong(PyObject* o);
+PyObject* PyNumberToPyInt(PyObject* o);
+
+}  // namespace numpy
+
+}  // namespace swig
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_PYTHON_NUMPY_BRIDGE_H_
diff --git a/tensorflow/compiler/xla/python/xla.i b/tensorflow/compiler/xla/python/xla.i
new file mode 100644
index 0000000000000000000000000000000000000000..1c4021a558d3fcff2abfdbdbad7f3928e86ed3b8
--- /dev/null
+++ b/tensorflow/compiler/xla/python/xla.i
@@ -0,0 +1,18 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/* XLA-wide SWIG wrapper */
+
+%include "tensorflow/compiler/xla/python/local_computation_builder.i"
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b4bdb1d6a8fc53adec8f2dcb37335d3f52cf21
--- /dev/null
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -0,0 +1,1139 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""An in-process, local XLA client in Python, supporting AOT compilation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import enum  # pylint: disable=g-bad-import-order
+import inspect
+import itertools
+import os
+
+import numpy as np
+
+from tensorflow.compiler.xla import xla_data_pb2
+from tensorflow.compiler.xla.python import pywrap_xla as c_api
+
+
+# Most functions are snake_case for consistency with other modules,
+# whereas method names of ComputationBuilder and LocalComputation are
+# CamelCase for consistency with XLA.
+# pylint: disable=invalid-name
+
+
+_OP_METADATA_FIELDS = [
+    'op_type',
+    'op_name',
+    'source_file',
+    'source_line',
+]
+OpMetadata = collections.namedtuple('OpMetadata', _OP_METADATA_FIELDS)
+
+
+def OpMetadataToProto(pyobj):
+  proto = xla_data_pb2.OpMetadata()
+  for field in _OP_METADATA_FIELDS:
+    attr = getattr(pyobj, field)
+    if attr is not None:
+      setattr(proto, field, attr)
+  return proto
+
+
+def CurrentSourceInfoMetadata(op_type=None, op_name=None, skip_frames=1):
+  """Helper for use in source mapping that returns an OpMetadata object."""
+  full_filename, lineno = inspect.stack()[skip_frames][1:3]
+  filename = os.path.basename(full_filename)
+  return OpMetadata(
+      op_type=op_type,
+      op_name=op_name,
+      source_file=filename,
+      source_line=lineno)
+
+
+class PaddingType(enum.Enum):
+  VALID = 1
+  SAME = 2
+
+
+def _convert_padding_type_to_pad_values(padding_type, lhs_dims, rhs_dims,
+                                        window_strides):
+  """Maps PaddingType (VALID or SAME) to pad values (list of pairs of ints)."""
+  if padding_type == PaddingType.VALID:
+    return [(0, 0)] * len(window_strides)
+
+  out_shape = np.ceil(np.true_divide(lhs_dims, window_strides)).astype(int)
+  pad_sizes = [max((out_size - 1) * stride + filter_size - in_size, 0)
+               for out_size, stride, filter_size, in_size
+               in zip(out_shape, window_strides, rhs_dims, lhs_dims)]
+  return [(pad_size // 2, pad_size - pad_size // 2)
+          for pad_size in pad_sizes]
+
+
+_UNARY_OPS = [
+    'Not',
+    'Abs',
+    'Exp',
+    'Floor',
+    'Round',
+    'Ceil',
+    'Log',
+    'Sign',
+    'Cos',
+    'Sin',
+    'Tanh',
+    'SqrtF32',
+    'SquareF32',
+    'IsFinite',
+    'ReciprocalF32',
+    'Neg',
+    'Sort',
+]
+
+_BINARY_OPS = [
+    'Eq',
+    'Ne',
+    'Ge',
+    'Gt',
+    'Lt',
+    'Le',
+    'Add',
+    'Sub',
+    'Mul',
+    'Div',
+    'Rem',
+    'Max',
+    'Min',
+    'And',
+    'Or',
+    'Pow',
+]
+
+XLA_ELEMENT_TYPE_TO_DTYPE = {
+    xla_data_pb2.F32: np.dtype(np.float32),
+    xla_data_pb2.F64: np.dtype(np.float64),
+    xla_data_pb2.S32: np.dtype(np.int32),
+    xla_data_pb2.S64: np.dtype(np.int64),
+    xla_data_pb2.U32: np.dtype(np.uint32),
+    xla_data_pb2.U64: np.dtype(np.uint64),
+    xla_data_pb2.PRED: np.dtype(np.bool),
+    xla_data_pb2.TUPLE: np.dtype(np.object),
+}
+
+# Note the conversion on the key. Numpy has a known issue wherein dtype hashing
+# doesn't work as expected (https://github.com/numpy/numpy/issues/7242). Thus,
+# when keying by dtype in this dict, we use the string form of dtypes.
+DTYPE_TO_XLA_ELEMENT_TYPE = {
+    str(v): k
+    for k, v in XLA_ELEMENT_TYPE_TO_DTYPE.items()
+}
+
+
+class LocalBuffer(object):
+  """Represents a handle to data owned by XLA.
+
+  The referent is ready for use in executing a local, compiled
+  Computation. On XLA platforms involving a device (e.g. GPU), this
+  means the referent is in device memory.
+  """
+
+  def __init__(self, c_local_shaped_buffer):
+    self.c_local_shaped_buffer = c_local_shaped_buffer
+    self._delete = c_api.DeleteLocalShapedBuffer
+
+  @staticmethod
+  def from_py(npval, layout_fn=None):
+    npval = require_numpy_array_layout(npval)
+    if layout_fn:
+      shape = Shape.from_numpy(npval)
+      shape = shape.map_leaves(layout_fn)
+    else:
+      shape = None
+    return LocalBuffer(c_api.LocalShapedBuffer.FromLiteral(npval, shape))
+
+  def to_py(self):
+    return self.c_local_shaped_buffer.ToLiteral()
+
+  def delete(self):
+    if self.c_local_shaped_buffer is not None:
+      self._delete(self.c_local_shaped_buffer)
+      self.c_local_shaped_buffer = None
+
+  def is_deleted(self):
+    return self.c_local_shaped_buffer is None
+
+  def __del__(self):
+    self.delete()
+
+
+class Shape(object):
+  """XLA shape.
+
+  Represents an XLA shape by a corresponding Python/Numpy type and a
+  list of dimensions, which are themselves Shapes in case this one
+  represents an XLA tuple.
+  """
+
+  def __init__(self, np_dtype, dimensions, minor_to_major=None):
+    assert isinstance(dimensions, tuple)
+    self.np_dtype = np_dtype
+    self._dimensions = dimensions
+    self._minor_to_major = minor_to_major
+    self._check_minor_to_major()
+
+  def __eq__(self, other):
+    # pylint: disable=protected-access
+    return (self.np_dtype == other.np_dtype and
+            self._dimensions == other._dimensions and
+            self._minor_to_major == other._minor_to_major)
+
+  def __repr__(self):
+    return ('xla_client.Shape(np_dtype={!r}, dimensions={!r}, '
+            'minor_to_major={!r})').format(self.np_dtype, self._dimensions,
+                                           self._minor_to_major)
+
+  def element_type(self):
+    return DTYPE_TO_XLA_ELEMENT_TYPE[str(self.np_dtype)]
+
+  def is_tuple(self):
+    return self.element_type() == xla_data_pb2.TUPLE
+
+  def dimensions(self):
+    if self.is_tuple():
+      raise ValueError('Tuple shape has no dimensions')
+    return self._dimensions
+
+  def minor_to_major(self):
+    return self._minor_to_major
+
+  def tuple_shapes(self):
+    if not self.is_tuple():
+      raise ValueError('Shape is not a tuple shape')
+    return self._dimensions
+
+  def rank(self):
+    return len(self.dimensions())
+
+  def map_leaves(self, f):
+    """Map f over each leaf-level array subshape.
+
+    Args:
+      f: The function to apply. Whenever f returns None, the identity is
+        applied instead.
+
+    Returns:
+      A new Shape with the mapped leaves.
+    """
+    if self.is_tuple():
+      children = tuple(child.map_leaves(f) for child in self.tuple_shapes())
+      return Shape(np.dtype('O'), children)
+    else:
+      mapped = f(self)
+      return self if mapped is None else mapped
+
+  def _check_minor_to_major(self):
+    mtm = self._minor_to_major
+    if self.is_tuple():
+      assert mtm is None, self
+    if mtm is not None:
+      assert self.rank() == len(mtm), self
+      assert sorted(mtm) == range(len(mtm)), self
+
+  def update_minor_to_major(self, minor_to_major):
+    if not isinstance(minor_to_major, tuple):
+      raise TypeError('minor_to_major must be a tuple')
+    updated = Shape(self.np_dtype, tuple(self.dimensions()), minor_to_major)
+    updated._check_minor_to_major()  # pylint: disable=protected-access
+    return updated
+
+  @staticmethod
+  def from_numpy(npval):
+
+    def convert(npval):
+      if isinstance(npval, tuple):
+        return Shape(np.dtype('O'), tuple(convert(elt) for elt in npval))
+      else:
+        return Shape(npval.dtype, np.shape(npval))
+
+    return convert(require_numpy_array_layout(npval))
+
+
+def _wrap_shape(shape_info):
+  dtype, dims = shape_info
+  element_type = DTYPE_TO_XLA_ELEMENT_TYPE[str(dtype)]
+  if element_type == xla_data_pb2.TUPLE:
+    dims = tuple(_wrap_shape(subshape_info) for subshape_info in dims)
+  return Shape(dtype, dims)
+
+
+def _wrap_data_handle(handle):
+  cdh = xla_data_pb2.ComputationDataHandle()
+  cdh.handle = handle
+  return cdh
+
+
+def _unwrap_data_handle(handle_proto):
+  return handle_proto.handle
+
+
+def _unwrap_data_handles(handle_protos):
+  return [_unwrap_data_handle(cdh) for cdh in handle_protos]
+
+
+def require_numpy_array_layout(value):
+  if isinstance(value, tuple):
+    return tuple(require_numpy_array_layout(x) for x in value)
+  else:
+    return np.require(value, requirements=['C', 'A'])
+
+
+class CompileOptions(object):
+  """Python object for XLA compile options.
+
+  These options can be passed to the 'compile' step when using a local XLA
+  client.
+  """
+
+  def __init__(self):
+    self.generate_hlo_graph = None
+
+
+def transfer_to_infeed(value, replica_number=None):
+  """Transfers the given value into the XLA infeed queue.
+
+  XLA's infeed queue is a single queue that feeds the "XLA virtual machine" with
+  a totally ordered stream of values. This is dequeued from XLA computations via
+  the Infeed() operation.
+
+  Args:
+    value: the value that the caller would like to enqueue into the XLA infeed
+      queue
+    replica_number: the replica number to infeed the value to -- if not
+      provided, then the default replica (trivially replica 0) is used.
+  """
+  if replica_number is None:
+    c_api.TransferToInfeedLocal(require_numpy_array_layout(value))
+  else:
+    c_api.TransferToInfeedLocalReplica(
+        require_numpy_array_layout(value), replica_number)
+
+
+def transfer_from_outfeed(shape, replica_number=None):
+  """Transfers a literal of the given shape from replica_number's outfeed.
+
+  Args:
+    shape: The shape of the value to transfer from outfeed.
+    replica_number: The replica number ordinal to transfer the outfeed value
+      from. (Each replica has a distinct outfeed queue.)
+
+  Returns:
+    The literal value that is produced from the outfeed queue.
+  """
+  return c_api.TransferFromOutfeedLocalReplica(shape, replica_number or 0)
+
+
+class LocalComputation(object):
+  """Python wrapper for a local XLA Computation.
+
+  A LocalComputation can be executed if it is compiled. Otherwise, it
+  can still be used as a Computation where required by the
+  ComputationBuilder methods.
+  """
+
+  def __init__(self, c_local_computation, is_compiled):
+    self.c_local_computation = c_local_computation
+    self.is_compiled = is_compiled
+
+    # Ensure a reference to C-based destructor for use in __del__.
+    if is_compiled:
+      self._delete = c_api.DeleteCompiledLocalComputation
+    else:
+      self._delete = c_api.DeleteLocalComputation
+
+  def Compile(self, argument_shapes=(), compile_options=None, layout_fn=None):
+    if self.is_compiled:
+      raise ValueError('Attempt to compile a compiled local XLA computation.')
+    if layout_fn:
+      argument_shapes = [
+          shape.map_leaves(layout_fn) for shape in argument_shapes
+      ]
+    return LocalComputation(
+        self.c_local_computation.Compile(argument_shapes, compile_options),
+        is_compiled=True)
+
+  def CompileWithExampleArguments(self,
+                                  arguments=(),
+                                  compile_options=None,
+                                  layout_fn=None):
+    return self.Compile(
+        argument_shapes=[Shape.from_numpy(arg) for arg in arguments],
+        compile_options=compile_options,
+        layout_fn=layout_fn)
+
+  def Execute(self, arguments=(), layout_fn=None):
+    """Execute with Python values as arguments and return value."""
+    if not self.is_compiled:
+      raise ValueError('Cannot execute an uncompiled local XLA computation.')
+    argument_shapes = [Shape.from_numpy(arg) for arg in arguments]
+    if layout_fn:
+      argument_shapes = [
+          shape.map_leaves(layout_fn) for shape in argument_shapes
+      ]
+    else:
+      argument_shapes = [None for shape in argument_shapes]
+    arguments = tuple(map(require_numpy_array_layout, arguments))
+    return self.c_local_computation.Execute(arguments, argument_shapes)
+
+  def ExecuteWithLocalBuffers(self, arguments=()):
+    """Execute with LocalBuffer arguments and return value."""
+    if not self.is_compiled:
+      raise ValueError('Cannot execute an uncompiled local XLA computation.')
+    arguments = tuple(arguments)
+    if any(arg.is_deleted() for arg in arguments):
+      raise ValueError('Executing with deleted local buffer argument')
+    return LocalBuffer(
+        self.c_local_computation.ExecuteWithShapedBuffers(
+            [arg.c_local_shaped_buffer for arg in arguments]))
+
+  def __del__(self):
+    self._delete(self.c_local_computation)
+
+
+class ComputationBuilder(object):
+  """XLA computation builder.
+
+  Enqueues XLA ops in sequence and in order to build a
+  LocalComputation, which in turn can be compiled into a
+  CompiledLocalComputation, which in turn can be locally executed.
+  """
+
+  # The methods of this class map 1-to-1 onto the XLA C++
+  # computation builder API. Therefore, there's no need to laboriously list
+  # arguments and return values for every method, especially where it's obvious.
+  #
+  # pylint: disable=g-doc-return-or-yield
+  # pylint: disable=g-doc-args
+
+  def __init__(self, name):
+    self._client = c_api.LocalComputationBuilder(name.encode('utf8'))
+    self._parameter_numbering = itertools.count()
+
+  def Build(self):
+    return LocalComputation(self._client.Build(), is_compiled=False)
+
+  def SetOpMetadata(self, op_metadata):
+    """Set metadata for operations that are about to be enqueued."""
+    self._client.SetOpMetadata(op_metadata)
+
+  def ClearOpMetadata(self):
+    """Clear metadata for operations that are about to be enqueued."""
+    self._client.ClearOpMetadata()
+
+  def Infeed(self, shape):
+    """Enqueues an infeed op onto the computation.
+
+    Infeed operations dequeue data of the given shape from the device's infeed
+    queue for subsequent use in the computation.
+
+    Returns:
+      A  ComputationDataHandle message.
+    """
+    return _wrap_data_handle(self._client.Infeed(shape))
+
+  def Outfeed(self, operand):
+    """Enqueues an outfeed op onto the computation.
+
+    Outfeed operations enqueue data, using the given operand, onto the XLA
+    outfeed queue for subsequent dequeue via the client API.
+    """
+    self._client.Outfeed(
+        _unwrap_data_handle(operand), self.GetShape(operand),
+        ''.encode('utf-8'))
+
+  def Constant(self, value):
+    """Enqueues a constant op onto the computation.
+
+    Args:
+      value: value for the constant, as a np.array with an explicit dtype set
+             to one of the supported types.
+
+    Returns:
+      A ComputationDataHandle message.
+    """
+    value = require_numpy_array_layout(value)
+    return _wrap_data_handle(self._client.ConstantLiteral(value))
+
+  def ConstantF32Scalar(self, value):
+    """Convenience method to enqueue a scalar F32 constant op.
+
+    Args:
+      value: a floating-point number.
+
+    Returns:
+      A ComputationDataHandle message.
+    """
+    return self.Constant(np.array(value, dtype=np.float32))
+
+  def ConstantF64Scalar(self, value):
+    """Convenience method to enqueue a scalar F32 constant op.
+
+    Args:
+      value: a floating-point number.
+
+    Returns:
+      A ComputationDataHandle message.
+    """
+    return self.Constant(np.array(value, dtype=np.float64))
+
+  def ConstantS32Scalar(self, value):
+    """Convenience method to enqueue a scalar S32 constant op.
+
+    Args:
+      value: a floating-point number.
+
+    Returns:
+      A ComputationDataHandle message.
+    """
+    return self.Constant(np.array(value, dtype=np.int32))
+
+  def ConstantS64Scalar(self, value):
+    """Convenience method to enqueue a scalar S64 constant op.
+
+    Args:
+      value: a floating-point number.
+
+    Returns:
+      A ComputationDataHandle message.
+    """
+    return self.Constant(np.array(value, dtype=np.int64))
+
+  def ConstantPredScalar(self, value):
+    """Convenience method to enqueue a scalar PRED constant op.
+
+    Args:
+      value: a boolean value.
+
+    Returns:
+      A ComputationDataHandle message.
+    """
+    return self.Constant(np.array(value, dtype=np.bool))
+
+  def ParameterWithShape(self, shape, name=None, parameter_num=None):
+    """Enqueues a Parameter op onto the computation, given a shape.
+
+    Args:
+      shape: the parameter's shape as a Shape object.
+      name: optional string name for the parameter.
+      parameter_num: parameter number in the computation function. If None,
+        the next linear parameter number is used. The default value capability
+        can be used for auto-numbering. If you're using auto-numbering for some
+        parameters, use it for *all* parameters to avoid clashes.
+
+    Returns:
+      A ComputationDataHandle message.
+    """
+    if name is None:
+      name = ''
+    if parameter_num is None:
+      parameter_num = next(self._parameter_numbering)
+
+    return _wrap_data_handle(
+        self._client.Parameter(parameter_num, shape, name.encode('utf8')))
+
+  def ParameterFromNumpy(self, value, name=None, parameter_num=None):
+    """Enqueues a Parameter op onto the computation.
+
+    Args:
+      value: a Numpy array, or a nested tuple thereof, from which the
+        shape is inferred.
+      name: as in ParameterWithShape.
+      parameter_num: as in ParameterWithShape.
+
+    Returns:
+      A ComputationDataHandle message.
+    """
+    return self.ParameterWithShape(
+        Shape.from_numpy(value), name=name, parameter_num=parameter_num)
+
+  def Broadcast(self, operand, sizes):
+    """Enqueues a broadcast operation onto the computation.
+
+    Args:
+      operand: the operand ComputationDataHandle to broadcast.
+      sizes: an iterable of broadcast sizes.
+
+    Returns:
+      A ComputationDataHandle representing the added broadcast op.
+    """
+    return _wrap_data_handle(
+        self._client.Broadcast(_unwrap_data_handle(operand), sizes))
+
+  def Concatenate(self, operands, dimension):
+    """Enqueues a concatenate operation onto the computation.
+
+    Args:
+      operands: the operands to concatenate.
+      dimension: the dimension in which to perform the concatenation.
+
+    Returns:
+      A ComputationDataHandle representing the added concatenate op.
+    """
+    return _wrap_data_handle(
+        self._client.ConcatInDim(_unwrap_data_handles(operands), dimension))
+
+  def ConvertElementType(self, operand, new_element_type):
+    """Enqueues an element type conversion operation onto the computation.
+
+    Args:
+      operand: the operand to convert.
+      new_element_type: the target primitive type.
+
+    Returns:
+      A ComputationDataHandle representing the added conversion op.
+    """
+    return _wrap_data_handle(
+        self._client.ConvertElementType(
+            _unwrap_data_handle(operand), new_element_type))
+
+  def GetShape(self, operand):
+    return _wrap_shape(self._client.GetShape(_unwrap_data_handle(operand)))
+
+  def GetReturnValueShape(self):
+    return _wrap_shape(self._client.GetReturnValueShape())
+
+  def GetComputationStats(self):
+    raise NotImplementedError()
+
+  def Pad(self, operand, padding_value, padding_config):
+    """Enqueues a Pad operation onto the computation.
+
+    Args:
+      operand: ComputationDataHandle representing the array to pad.
+      padding_value: ComputationDataHandle representing the scalar pad value.
+      padding_config: either an xla_data_pb2.PaddingConfig or a list of integer
+        triples (edge_padding_low, edge_padding_high, interior_padding)
+        representing the configuration of the padding operation.
+
+    Returns:
+      A ComputationDataHandle representing the added pad op.
+    """
+    if not isinstance(padding_config, xla_data_pb2.PaddingConfig):
+      padding_config = GetPaddingConfigFromTriples(padding_config)
+    return _wrap_data_handle(
+        self._client.Pad(_unwrap_data_handle(operand),
+                         _unwrap_data_handle(padding_value),
+                         padding_config))
+
+  def Reshape(self, operand, dimensions, new_sizes):
+    """Reshape op."""
+    return _wrap_data_handle(
+        self._client.Reshape(
+            _unwrap_data_handle(operand), dimensions, new_sizes))
+
+  def CrossReplicaSum(self, operand):
+    """CrossReplicaSum op.
+
+    Args:
+      operand: the operand to sum across replica instances.
+
+    Returns:
+      A ComputationDataHandle that has the sum of the value among all replicas.
+    """
+    return _wrap_data_handle(
+        self._client.CrossReplicaSum(_unwrap_data_handle(operand)))
+
+  def Collapse(self, operand, dimensions):
+    """Collapse op."""
+    return _wrap_data_handle(
+        self._client.Collapse(_unwrap_data_handle(operand), dimensions))
+
+  def Trans(self, operand):
+    """Specialized matrix transpose op."""
+    return _wrap_data_handle(
+        self._client.Transpose(_unwrap_data_handle(operand), [1, 0]))
+
+  def Transpose(self, operand, permutation):
+    """Transpose op."""
+    return _wrap_data_handle(
+        self._client.Transpose(_unwrap_data_handle(operand), permutation))
+
+  def Rev(self, operand, dimensions):
+    """Rev op."""
+    return _wrap_data_handle(
+        self._client.Rev(_unwrap_data_handle(operand), dimensions))
+
+  def Clamp(self, min, operand, max):  # pylint: disable=redefined-builtin
+    """Clamp op."""
+    return _wrap_data_handle(
+        self._client.Clamp(_unwrap_data_handle(min),
+                           _unwrap_data_handle(operand),
+                           _unwrap_data_handle(max)))
+
+  def SelectAndScatter(self, operand, select, window_dimensions, window_strides,
+                       padding, source, init_value, scatter):
+    """Select and scatter op, used by the gradient of ReduceWindow.
+
+    Args:
+      operand: ComputationDataHandle for array of dimension N and type T over
+        which the windows slide.
+      select: Computation of type (T, T) -> Pred to apply to the elements of
+        each window to indicate which element is selected.
+      window_dimensions: sequence of N integers for dimensions of the window.
+      window_strides: sequence of N integers for the strides of the window.
+      padding: PaddingType representing either 'SAME' or 'VALID ' padding.
+      source: ComputationDataHandle for array of type T with values to scatter.
+      init_value: ComputationDataHandle of scalar type T for initial out value.
+      scatter: Computation of type (T, T) -> T to apply to each scatter source
+        element with its destination element.
+
+    Returns:
+      A ComputationDataHandle representing the added SelectAndScatter op.
+    """
+    pads = _convert_padding_type_to_pad_values(
+        padding, self.GetShape(operand).dimensions(),
+        window_dimensions, window_strides)
+    return _wrap_data_handle(
+        self._client.SelectAndScatterWithGeneralPadding(
+            _unwrap_data_handle(operand), select.c_local_computation,
+            window_dimensions, window_strides, pads,
+            _unwrap_data_handle(source), _unwrap_data_handle(init_value),
+            scatter.c_local_computation))
+
+  def Select(self, pred, on_true, on_false):
+    """Element-wise selection op.
+
+    Constructs an output array from elements of two input arrays, based on the
+    values of a predicate array.
+    """
+    return _wrap_data_handle(
+        self._client.Select(
+            _unwrap_data_handle(pred),
+            _unwrap_data_handle(on_true),
+            _unwrap_data_handle(on_false)))
+
+  def Slice(self, operand, start_indices, limit_indices, strides=None):
+    """Enqueues a slice operation onto the computation.
+
+    Args:
+      operand: ComputationDataHandle for the N dimensional array to be sliced.
+      start_indices: iterable of N integers containing the starting indices of
+        the slice for each dimension.
+      limit_indices: iterable of N integers containing the ending indices
+        (exclusive) of the slice for each dimension.
+      strides: optional iterable of N integers containing the stride sizes for
+        each dimension.
+
+    Returns:
+      A ComputationDataHandle representing the added Slice op.
+    """
+    if strides is None:
+      start_indices = list(start_indices)
+      strides = [1] * len(start_indices)
+    return _wrap_data_handle(
+        self._client.Slice(
+            _unwrap_data_handle(operand),
+            start_indices,
+            limit_indices,
+            strides))
+
+  def DynamicSlice(self, operand, start_indices, slice_sizes):
+    """Enqueues a slice op with dynamic start indices onto the computation.
+
+    Args:
+      operand: ComputationDataHandle for the N dimensional array to be sliced.
+      start_indices: ComputationDataHandle for the 1D array of N integers
+        containing the starting indices of the slice.
+      slice_sizes: iterable of N integers containing the slice sizes in each
+        dimension.
+
+    Returns:
+      A ComputationDataHandle representing the added DynamicSlice op.
+    """
+    return _wrap_data_handle(
+        self._client.DynamicSlice(
+            _unwrap_data_handle(operand),
+            _unwrap_data_handle(start_indices),
+            slice_sizes))
+
+  def DynamicUpdateSlice(self, operand, update, start_indices):
+    """Enqueues a dynamic update slice operation onto the computation.
+
+    Args:
+      operand: ComputationDataHandle for the N dimensional array to be updated.
+      update: N dimensional array comprising the slice update.
+      start_indices: Rank-1 array of N integers comprising the starting indices
+        of the slice along each dimension.
+    Returns:
+      A ComputationDataHandle representing the added DynamicUpdateSlice op.
+    """
+    return _wrap_data_handle(
+        self._client.DynamicUpdateSlice(
+            _unwrap_data_handle(operand),
+            _unwrap_data_handle(update),
+            _unwrap_data_handle(start_indices)))
+
+  def Tuple(self, *ops):
+    """Enqueues a tuple operation onto the computation.
+
+    Args:
+      ops: a sequence of tuple operands (each a ComputationDataHandle).
+
+    Returns:
+      A ComputationDataHandle representing the added Tuple op.
+    """
+    return _wrap_data_handle(self._client.Tuple(_unwrap_data_handles(ops)))
+
+  def GetTupleElement(self, tup, index):
+    """Enqueues a 'get tuple element' operation onto the computation.
+
+    Args:
+      tup: the tuple operand (a ComputationDataHandle).
+      index: numeric index to select from the tuple.
+
+    Returns:
+      A ComputationDataHandle representing the added GetTupleElement op.
+    """
+    return _wrap_data_handle(
+        self._client.GetTupleElement(_unwrap_data_handle(tup), index))
+
+  def Call(self, computation_to_apply, operands):
+    """Enqueues a call operation onto the computation.
+
+    Args:
+      computation_to_apply: a Computation object.
+      operands: an iterable of ComputationDataHandle. The number and types of
+        operands must match the arity of computation_to_apply.
+
+    Returns:
+      A ComputationDataHandle representing the added call op.
+    """
+    return _wrap_data_handle(
+        self._client.Call(computation_to_apply.c_local_computation,
+                          _unwrap_data_handles(operands)))
+
+  def Map(self, operands, computation_to_apply, dimensions, static_operands=()):
+    """Enqueues a map operation onto the computation.
+
+    Args:
+      operands: an iterable of ComputationDataHandle.
+      computation_to_apply: a Computation object.
+      dimensions: dimensions over which to apply map the function.
+      static_operands: auxiliary arguments passed to the applied computation.
+
+    Returns:
+      A ComputationDataHandle representing the added Map op.
+    """
+    return _wrap_data_handle(
+        self._client.Map(
+            _unwrap_data_handles(operands),
+            computation_to_apply.c_local_computation,
+            dimensions,
+            _unwrap_data_handles(static_operands)))
+
+  def Reduce(self, operand, init_value, computation_to_apply, dimensions):
+    """Enqueues a reduction operation onto the computation.
+
+    Args:
+      operand: reduction operand (ComputationDataHandle).
+      init_value: reduction initial value (ComputationDataHandle).
+      computation_to_apply: a Computation object - binary reduction function.
+      dimensions: sequence of dimensions (integers) to reduce on.
+
+    Returns:
+      A ComputationDataHandle representing the added Reduce op.
+    """
+    return _wrap_data_handle(
+        self._client.Reduce(
+            _unwrap_data_handle(operand),
+            _unwrap_data_handle(init_value),
+            computation_to_apply.c_local_computation,
+            dimensions))
+
+  def ReduceWindow(self, operand, init_value, computation_to_apply,
+                   window_dimensions, window_strides, padding):
+    """Enqueues a windowed reduction operation onto the computation.
+
+    Args:
+      operand: reduction operand (ComputationDataHandle).
+      init_value: reduction initial value (ComputationDataHandle).
+      computation_to_apply: a binary reduction function (Computation).
+      window_dimensions: dimensions of window (sequence of integers).
+      window_strides: strides for window (sequence of integers).
+      padding: PaddingType representing either 'SAME' or 'VALID' padding.
+
+    Returns:
+      A ComputationDataHandle representing the added ReduceWindow op.
+    """
+    pads = _convert_padding_type_to_pad_values(
+        padding, self.GetShape(operand).dimensions(), window_dimensions,
+        window_strides)
+    return _wrap_data_handle(
+        self._client.ReduceWindowWithGeneralPadding(
+            _unwrap_data_handle(operand),
+            _unwrap_data_handle(init_value),
+            computation_to_apply.c_local_computation,
+            window_dimensions, window_strides, pads))
+
+  def RngNormal(self, mu, sigma, dims):
+    """Enqueues an RngNormal operation onto the computation.
+
+    Args:
+      mu: A ComputationDataHandle to an F32 scalar specifying the mean.
+      sigma: A ComputationDataHandle to an F32 scalar specifying the standard
+        deviation.
+      dims: A 1D array-like of nonnegative integers specifying the dimensions.
+
+    Returns: a ComputationDataHandle to the generated array of F32 values.
+    """
+    shape = Shape(self.GetShape(mu).np_dtype, dims)
+    return _wrap_data_handle(
+        self._client.RngNormal(
+            _unwrap_data_handle(mu), _unwrap_data_handle(sigma), shape))
+
+  def RngUniform(self, a, b, dims):
+    """Enqueues an RngUniform operation onto the computation.
+
+    Args:
+      a: a ComputationDataHandle to an F32, S32, or U32 scalar (consistent with
+        the type of b) specifying the low end of the interval [a, b) over which
+        values are generated.
+      b: a ComputationDataHandle to an F32, S32, or U32 scalar (consistent with
+        the type of a) specifying the high end of the interval [a, b) over which
+        values are generated.
+      dims: A 1D array-like of nonnegative integers specifying the dimensions.
+
+    Returns: a ComputationDataHandle to the generated array of values with the
+      same numeric type (F32, S32, or U32) as the arguments a and b.
+    """
+    shape = Shape(self.GetShape(a).np_dtype, dims)
+    return _wrap_data_handle(
+        self._client.RngUniform(
+            _unwrap_data_handle(a), _unwrap_data_handle(b), shape))
+
+  def While(self, cond, body, init):
+    """Enqueues a While operation onto the computation.
+
+    Args:
+      cond: a Computation for the loop condition, which has type T -> PRED
+      body: a Computation for the loop body, which has type T -> T
+      init: a ComputationDataHandle for the initial parameter, which has type T
+
+    Returns: a ComputationDataHandle representing the While operation.
+    """
+    return _wrap_data_handle(
+        self._client.While(cond.c_local_computation,
+                           body.c_local_computation,
+                           _unwrap_data_handle(init)))
+
+  def Conditional(self, pred, true_operand, true_computation, false_operand,
+                  false_computation):
+    """Enqueues a Conditional operation onto the computation.
+
+    Args:
+      predicate: a ComputationDataHandle to test, which has scalar type PRED
+      true_operand: a ComputationDataHandle of type T_0
+      true_computation: a Computation to apply to true_operand, type T_0 -> S
+      false_operand: a ComputationDatahandle of type T_1
+      false_computation: a Computation to apply to false_operand, type T_1 -> S
+
+    Returns: a ComputationDataHandle representing the Conditional operation.
+    """
+    return _wrap_data_handle(
+        self._client.Conditional(
+            _unwrap_data_handle(pred), _unwrap_data_handle(true_operand),
+            true_computation.c_local_computation,
+            _unwrap_data_handle(false_operand),
+            false_computation.c_local_computation))
+
+  def Dot(self, lhs, rhs):
+    """Enqueues a dot operation onto the computation.
+
+    Args:
+      lhs: ComputationDataHandle for the rank 1 or rank 2 left-hand-side array.
+      rhs: ComputationDataHandle for the rank 1 or rank 2 right-hand-side array.
+
+    Returns: a ComputationDataHandle representing the Dot operation.
+    """
+    return _wrap_data_handle(
+        self._client.Dot(_unwrap_data_handle(lhs), _unwrap_data_handle(rhs)))
+
+  def DotGeneral(self, lhs, rhs, dimension_numbers):
+    """Enqueues a general dot operation onto the computation.
+
+    Args:
+      lhs: ComputationDataHandle for the left-hand-side array.
+      rhs: ComputationDataHandle for the right-hand-side array.
+      dimension_numbers: either an xla_data_pb2.DotDimensionNumbers or a nested
+        tuple ((lhs_contract, rhs_contract), (lhs_batch, rhs_batch)) of lists of
+        integers representing the dimensions to treat as contracting dimensions
+        and batch dimensions on each input operand.
+
+    Returns: a ComputationDataHandle representing the DotGeneral operation.
+    """
+    if not isinstance(dimension_numbers, xla_data_pb2.DotDimensionNumbers):
+      dimension_numbers = GetDotDimensionsFromLists(dimension_numbers)
+    return _wrap_data_handle(
+        self._client.DotGeneral(
+            _unwrap_data_handle(lhs), _unwrap_data_handle(rhs),
+            dimension_numbers))
+
+  def Conv(self, lhs, rhs, window_strides, padding):
+    """Enqueues a Conv operation onto the computation.
+
+    Args:
+      lhs: ComputationDataHandle for the rank N+2 array of inputs.
+      rhs: ComputationDataHandle for the rank N+2 array of kernel weights.
+      window_strides: length-N array-like of integer kernel strides.
+      padding: PaddingType representing either 'SAME' or 'VALID' padding.
+
+    Returns: a ComputationDataHandle representing the Conv operation.
+    """
+    pads = _convert_padding_type_to_pad_values(
+        padding, self.GetShape(lhs).dimensions()[2:],
+        self.GetShape(rhs).dimensions()[2:], window_strides)
+    dimension_numbers = self._GetConvDimensionNumbers(len(window_strides))
+    return _wrap_data_handle(
+        self._client.ConvGeneralDilated(_unwrap_data_handle(lhs),
+                                        _unwrap_data_handle(rhs),
+                                        window_strides,
+                                        pads,
+                                        (),
+                                        (),
+                                        dimension_numbers))
+
+  def ConvWithGeneralPadding(self, lhs, rhs, window_strides, padding,
+                             lhs_dilation, rhs_dilation):
+    """Enqueues a ConvWithGeneralPadding operation onto the computation.
+
+    Args:
+      lhs: ComputationDataHandle for the rank N+2 array of inputs.
+      rhs: ComputationDataHandle for the rank N+2 array of kernel weights.
+      window_strides: length-N array-like of kernel strides.
+      padding: length-N array-like of pairs of integers of (low, high) padding.
+      lhs_dilation: length-N array-like of dilation factors.
+      rhs_dilation: length-N array-like of dilation factors.
+
+    Returns:
+      A ComputationdataHandle representing the added ConvWithGeneralPadding op.
+    """
+    dimension_numbers = self._GetConvDimensionNumbers(len(window_strides))
+    return _wrap_data_handle(
+        self._client.ConvGeneralDilated(_unwrap_data_handle(lhs),
+                                        _unwrap_data_handle(rhs),
+                                        window_strides,
+                                        padding,
+                                        lhs_dilation,
+                                        rhs_dilation,
+                                        dimension_numbers))
+
+  def _GetConvDimensionNumbers(self, num_spatial_dims):
+    """Create ConvolutionDimensionNumbers proto for convolutions."""
+    nd = num_spatial_dims
+    dimension_numbers = xla_data_pb2.ConvolutionDimensionNumbers()
+    dimension_numbers.input_batch_dimension = 0
+    dimension_numbers.input_feature_dimension = 1
+    dimension_numbers.output_batch_dimension = 0
+    dimension_numbers.output_feature_dimension = 1
+    dimension_numbers.kernel_output_feature_dimension = 0
+    dimension_numbers.kernel_input_feature_dimension = 1
+    dimension_numbers.input_spatial_dimensions.extend(range(2, 2 + nd))
+    dimension_numbers.kernel_spatial_dimensions.extend(range(2, 2 + nd))
+    dimension_numbers.output_spatial_dimensions.extend(range(2, 2 + nd))
+    return dimension_numbers
+
+
+def _forward_methods_to_local_builder():
+  """Forward remaining ComputationBuilder methods to the C API.
+
+  Set up methods, corresponding to unary and binary XLA operations,
+  whose calls are forwarded in a boilerplate manner to the underlying
+  LocalComputationBuilder C-extension API.
+  """
+
+  def forward_to_local_builder_with_handles(target_method, is_binop=False):
+    """Generate a forwarding method that wraps/unwraps data handles."""
+
+    def forward(self, *args, **kwargs):
+      unwrapped_args = [_unwrap_data_handle(arg) for arg in args]
+
+      if is_binop and len(unwrapped_args) < 3:
+        unwrapped_args.append(kwargs.get('broadcast_dimensions', ()))
+
+      return _wrap_data_handle(
+          target_method(
+              self._client,  # pylint: disable=protected-access
+              *unwrapped_args))
+
+    return forward
+
+  for method_name in _UNARY_OPS:
+    forward = forward_to_local_builder_with_handles(
+        getattr(c_api.LocalComputationBuilder, method_name))
+    forward.__name__ = method_name
+    setattr(ComputationBuilder, method_name, forward)
+
+  for method_name in _BINARY_OPS:
+    forward = forward_to_local_builder_with_handles(
+        getattr(c_api.LocalComputationBuilder, method_name), is_binop=True)
+    forward.__name__ = method_name
+    setattr(ComputationBuilder, method_name, forward)
+
+
+_forward_methods_to_local_builder()
+
+
+def initialize_replica_count(replica_count):
+  """Initializes the desired replica count to use on XLA service init.
+
+  Args:
+    replica_count: number of replicas that are desired for set up during XLA
+      initialization.
+
+  Raises:
+    A runtime exception if the XLA service has already been initialized.
+  """
+  c_api.InitializeReplicaCount(replica_count)
+
+
+def get_replica_count():
+  """Returns the current replica count used for the XLA service.
+
+  Note: this will return a value whether the XLA service has been initialized
+  yet or not.
+  """
+  return c_api.GetReplicaCount()
+
+
+def GetPaddingConfigFromTriples(triples):
+  """Create PaddingConfig proto from list of triples of integers."""
+  padding_config = xla_data_pb2.PaddingConfig()
+  for lo, hi, interior in triples:
+    dimension = padding_config.dimensions.add()
+    dimension.edge_padding_low = lo
+    dimension.edge_padding_high = hi
+    dimension.interior_padding = interior
+  return padding_config
+
+
+def GetDotDimensionsFromLists(dimension_numbers):
+  (lhs_contract, rhs_contract), (lhs_batch, rhs_batch) = dimension_numbers
+  dot_dims_proto = xla_data_pb2.DotDimensionNumbers()
+  dot_dims_proto.lhs_contracting_dimensions.extend(lhs_contract)
+  dot_dims_proto.rhs_contracting_dimensions.extend(rhs_contract)
+  dot_dims_proto.lhs_batch_dimensions.extend(lhs_batch)
+  dot_dims_proto.rhs_batch_dimensions.extend(rhs_batch)
+  return dot_dims_proto
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9d09cd5d57e001fd48d2dba9f2b0ee18374231b
--- /dev/null
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -0,0 +1,1308 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the Python extension-based XLA client."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+import threading
+
+import numpy as np
+
+from tensorflow.compiler.xla.python import xla_client
+import unittest
+
+
+class LocalComputationTest(unittest.TestCase):
+  """Base class for running an XLA Computation through the local client."""
+
+  def _NewComputation(self, name=None):
+    if name is None:
+      name = self.id()
+    return xla_client.ComputationBuilder(name)
+
+  def _Execute(self, c, arguments):
+    compiled_c = c.Build().CompileWithExampleArguments(arguments)
+    return compiled_c.Execute(arguments)
+
+  def _ExecuteAndAssertWith(self, assert_func, c, arguments, expected):
+    assert expected is not None
+    result = self._Execute(c, arguments)
+    # Numpy's comparison methods are a bit too lenient by treating inputs as
+    # "array-like", meaning that scalar 4 will be happily compared equal to
+    # [[4]]. We'd like to be more strict so assert shapes as well.
+    self.assertEqual(np.asanyarray(result).shape, np.asanyarray(expected).shape)
+    assert_func(result, expected)
+
+  def _ExecuteAndCompareExact(self, c, arguments=(), expected=None):
+    self._ExecuteAndAssertWith(np.testing.assert_equal, c, arguments, expected)
+
+  def _ExecuteAndCompareClose(self, c, arguments=(), expected=None):
+    self._ExecuteAndAssertWith(np.testing.assert_allclose, c, arguments,
+                               expected)
+
+
+def NumpyArrayF32(*args, **kwargs):
+  """Convenience wrapper to create Numpy arrays with a np.float32 dtype."""
+  return np.array(*args, dtype=np.float32, **kwargs)
+
+
+def NumpyArrayF64(*args, **kwargs):
+  """Convenience wrapper to create Numpy arrays with a np.float64 dtype."""
+  return np.array(*args, dtype=np.float64, **kwargs)
+
+
+def NumpyArrayS32(*args, **kwargs):
+  """Convenience wrapper to create Numpy arrays with a np.int32 dtype."""
+  return np.array(*args, dtype=np.int32, **kwargs)
+
+
+def NumpyArrayS64(*args, **kwargs):
+  """Convenience wrapper to create Numpy arrays with a np.int64 dtype."""
+  return np.array(*args, dtype=np.int64, **kwargs)
+
+
+def NumpyArrayBool(*args, **kwargs):
+  """Convenience wrapper to create Numpy arrays with a np.bool dtype."""
+  return np.array(*args, dtype=np.bool, **kwargs)
+
+
+class ComputationsWithConstantsTest(LocalComputationTest):
+  """Tests focusing on Constant ops."""
+
+  def testConstantScalarSumF32(self):
+    c = self._NewComputation()
+    root = c.Add(c.ConstantF32Scalar(1.11), c.ConstantF32Scalar(3.14))
+    self.assertEqual(c.GetShape(root), c.GetReturnValueShape())
+    self._ExecuteAndCompareClose(c, expected=4.25)
+
+  def testConstantScalarSumF64(self):
+    c = self._NewComputation()
+    c.Add(c.ConstantF64Scalar(1.11), c.ConstantF64Scalar(3.14))
+    self._ExecuteAndCompareClose(c, expected=4.25)
+
+  def testConstantScalarSumS32(self):
+    c = self._NewComputation()
+    c.Add(c.ConstantS32Scalar(1), c.ConstantS32Scalar(2))
+    self._ExecuteAndCompareClose(c, expected=3)
+
+  def testConstantScalarSumS64(self):
+    c = self._NewComputation()
+    c.Add(c.ConstantS64Scalar(1), c.ConstantS64Scalar(2))
+    self._ExecuteAndCompareClose(c, expected=3)
+
+  def testConstantVectorMulF32(self):
+    c = self._NewComputation()
+    c.Mul(
+        c.Constant(NumpyArrayF32([2.5, 3.3, -1.2, 0.7])),
+        c.Constant(NumpyArrayF32([-1.2, 2, -2, -3])))
+    self._ExecuteAndCompareClose(c, expected=[-3, 6.6, 2.4, -2.1])
+
+  def testConstantVectorMulF64(self):
+    c = self._NewComputation()
+    c.Mul(
+        c.Constant(NumpyArrayF64([2.5, 3.3, -1.2, 0.7])),
+        c.Constant(NumpyArrayF64([-1.2, 2, -2, -3])))
+    self._ExecuteAndCompareClose(c, expected=[-3, 6.6, 2.4, -2.1])
+
+  def testConstantVectorScalarDivF32(self):
+    c = self._NewComputation()
+    c.Div(
+        c.Constant(NumpyArrayF32([1.5, 2.5, 3.0, -10.8])),
+        c.ConstantF32Scalar(2.0))
+    self._ExecuteAndCompareClose(c, expected=[0.75, 1.25, 1.5, -5.4])
+
+  def testConstantVectorScalarDivF64(self):
+    c = self._NewComputation()
+    c.Div(
+        c.Constant(NumpyArrayF64([1.5, 2.5, 3.0, -10.8])),
+        c.ConstantF64Scalar(2.0))
+    self._ExecuteAndCompareClose(c, expected=[0.75, 1.25, 1.5, -5.4])
+
+  def testConstantVectorScalarPowF32(self):
+    c = self._NewComputation()
+    c.Pow(c.Constant(NumpyArrayF32([1.5, 2.5, 3.0])), c.ConstantF32Scalar(2.))
+    self._ExecuteAndCompareClose(c, expected=[2.25, 6.25, 9.])
+
+  def testConstantVectorScalarPowF64(self):
+    c = self._NewComputation()
+    c.Pow(c.Constant(NumpyArrayF64([1.5, 2.5, 3.0])), c.ConstantF64Scalar(2.))
+    self._ExecuteAndCompareClose(c, expected=[2.25, 6.25, 9.])
+
+  def testBooleanAnd(self):
+    c = self._NewComputation()
+    c.And(
+        c.Constant(NumpyArrayBool([True, False, True, False])),
+        c.Constant(NumpyArrayBool([True, True, False, False])))
+    self._ExecuteAndCompareExact(c, expected=[True, False, False, False])
+
+  def testBooleanOr(self):
+    c = self._NewComputation()
+    c.Or(
+        c.Constant(NumpyArrayBool([True, False, True, False])),
+        c.Constant(NumpyArrayBool([True, True, False, False])))
+    self._ExecuteAndCompareExact(c, expected=[True, True, True, False])
+
+  def testSum2DF32(self):
+    c = self._NewComputation()
+    c.Add(
+        c.Constant(NumpyArrayF32([[1, 2, 3], [4, 5, 6]])),
+        c.Constant(NumpyArrayF32([[1, -1, 1], [-1, 1, -1]])))
+    self._ExecuteAndCompareClose(c, expected=[[2, 1, 4], [3, 6, 5]])
+
+  def testSum2DF64(self):
+    c = self._NewComputation()
+    c.Add(
+        c.Constant(NumpyArrayF64([[1, 2, 3], [4, 5, 6]])),
+        c.Constant(NumpyArrayF64([[1, -1, 1], [-1, 1, -1]])))
+    self._ExecuteAndCompareClose(c, expected=[[2, 1, 4], [3, 6, 5]])
+
+  def testSum2DWith1DBroadcastDim0F32(self):
+    # sum of a 2D array with a 1D array where the latter is replicated across
+    # dimension 0 to match the former's shape.
+    c = self._NewComputation()
+    c.Add(
+        c.Constant(NumpyArrayF32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])),
+        c.Constant(NumpyArrayF32([10, 20, 30])),
+        broadcast_dimensions=(0,))
+    self._ExecuteAndCompareClose(
+        c, expected=[[11, 12, 13], [24, 25, 26], [37, 38, 39]])
+
+  def testSum2DWith1DBroadcastDim0F64(self):
+    # sum of a 2D array with a 1D array where the latter is replicated across
+    # dimension 0 to match the former's shape.
+    c = self._NewComputation()
+    c.Add(
+        c.Constant(NumpyArrayF64([[1, 2, 3], [4, 5, 6], [7, 8, 9]])),
+        c.Constant(NumpyArrayF64([10, 20, 30])),
+        broadcast_dimensions=(0,))
+    self._ExecuteAndCompareClose(
+        c, expected=[[11, 12, 13], [24, 25, 26], [37, 38, 39]])
+
+  def testSum2DWith1DBroadcastDim1F32(self):
+    # sum of a 2D array with a 1D array where the latter is replicated across
+    # dimension 1 to match the former's shape.
+    c = self._NewComputation()
+    c.Add(
+        c.Constant(NumpyArrayF32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])),
+        c.Constant(NumpyArrayF32([10, 20, 30])),
+        broadcast_dimensions=(1,))
+    self._ExecuteAndCompareClose(
+        c, expected=[[11, 22, 33], [14, 25, 36], [17, 28, 39]])
+
+  def testSum2DWith1DBroadcastDim1F64(self):
+    # sum of a 2D array with a 1D array where the latter is replicated across
+    # dimension 1 to match the former's shape.
+    c = self._NewComputation()
+    c.Add(
+        c.Constant(NumpyArrayF64([[1, 2, 3], [4, 5, 6], [7, 8, 9]])),
+        c.Constant(NumpyArrayF64([10, 20, 30])),
+        broadcast_dimensions=(1,))
+    self._ExecuteAndCompareClose(
+        c, expected=[[11, 22, 33], [14, 25, 36], [17, 28, 39]])
+
+  def testConstantAxpyF32(self):
+    c = self._NewComputation()
+    c.Add(
+        c.Mul(
+            c.ConstantF32Scalar(2),
+            c.Constant(NumpyArrayF32([2.2, 3.3, 4.4, 5.5]))),
+        c.Constant(NumpyArrayF32([100, -100, 200, -200])))
+    self._ExecuteAndCompareClose(c, expected=[104.4, -93.4, 208.8, -189])
+
+  def testConstantAxpyF64(self):
+    c = self._NewComputation()
+    c.Add(
+        c.Mul(
+            c.ConstantF64Scalar(2),
+            c.Constant(NumpyArrayF64([2.2, 3.3, 4.4, 5.5]))),
+        c.Constant(NumpyArrayF64([100, -100, 200, -200])))
+    self._ExecuteAndCompareClose(c, expected=[104.4, -93.4, 208.8, -189])
+
+
+class ParametersTest(LocalComputationTest):
+  """Tests focusing on Parameter ops and argument-passing."""
+
+  def setUp(self):
+    self.f32_scalar_2 = NumpyArrayF32(2.0)
+    self.f32_4vector = NumpyArrayF32([-2.3, 3.3, -4.3, 5.3])
+    self.f64_scalar_2 = NumpyArrayF64(2.0)
+    self.f64_4vector = NumpyArrayF64([-2.3, 3.3, -4.3, 5.3])
+    self.s32_scalar_3 = NumpyArrayS32(3)
+    self.s32_4vector = NumpyArrayS32([10, 15, -2, 7])
+    self.s64_scalar_3 = NumpyArrayS64(3)
+    self.s64_4vector = NumpyArrayS64([10, 15, -2, 7])
+
+  def testScalarTimesVectorAutonumberF32(self):
+    c = self._NewComputation()
+    p0 = c.ParameterFromNumpy(self.f32_scalar_2)
+    p1 = c.ParameterFromNumpy(self.f32_4vector)
+    c.Mul(p0, p1)
+    self._ExecuteAndCompareClose(
+        c,
+        arguments=[self.f32_scalar_2, self.f32_4vector],
+        expected=[-4.6, 6.6, -8.6, 10.6])
+
+  def testScalarTimesVectorAutonumberF64(self):
+    c = self._NewComputation()
+    p0 = c.ParameterFromNumpy(self.f64_scalar_2)
+    p1 = c.ParameterFromNumpy(self.f64_4vector)
+    c.Mul(p0, p1)
+    self._ExecuteAndCompareClose(
+        c,
+        arguments=[self.f64_scalar_2, self.f64_4vector],
+        expected=[-4.6, 6.6, -8.6, 10.6])
+
+  def testScalarTimesVectorS32(self):
+    c = self._NewComputation()
+    p0 = c.ParameterFromNumpy(self.s32_scalar_3)
+    p1 = c.ParameterFromNumpy(self.s32_4vector)
+    c.Mul(p0, p1)
+    self._ExecuteAndCompareExact(
+        c,
+        arguments=[self.s32_scalar_3, self.s32_4vector],
+        expected=[30, 45, -6, 21])
+
+  def testScalarTimesVectorS64(self):
+    c = self._NewComputation()
+    p0 = c.ParameterFromNumpy(self.s64_scalar_3)
+    p1 = c.ParameterFromNumpy(self.s64_4vector)
+    c.Mul(p0, p1)
+    self._ExecuteAndCompareExact(
+        c,
+        arguments=[self.s64_scalar_3, self.s64_4vector],
+        expected=[30, 45, -6, 21])
+
+  def testScalarMinusVectorExplicitNumberingF32(self):
+    # Use explicit numbering and pass parameter_num first. Sub is used since
+    # it's not commutative and can help catch parameter reversal within the
+    # computation.
+    c = self._NewComputation()
+    p1 = c.ParameterFromNumpy(self.f32_4vector, parameter_num=1)
+    p0 = c.ParameterFromNumpy(self.f32_scalar_2, parameter_num=0)
+    c.Sub(p1, p0)
+    self._ExecuteAndCompareClose(
+        c,
+        arguments=[self.f32_scalar_2, self.f32_4vector],
+        expected=[-4.3, 1.3, -6.3, 3.3])
+
+  def testScalarMinusVectorExplicitNumberingF64(self):
+    # Use explicit numbering and pass parameter_num first. Sub is used since
+    # it's not commutative and can help catch parameter reversal within the
+    # computation.
+    c = self._NewComputation()
+    p1 = c.ParameterFromNumpy(self.f64_4vector, parameter_num=1)
+    p0 = c.ParameterFromNumpy(self.f64_scalar_2, parameter_num=0)
+    c.Sub(p1, p0)
+    self._ExecuteAndCompareClose(
+        c,
+        arguments=[self.f64_scalar_2, self.f64_4vector],
+        expected=[-4.3, 1.3, -6.3, 3.3])
+
+
+class LocalBufferTest(LocalComputationTest):
+  """Tests focusing on execution with LocalBuffers."""
+
+  def _Execute(self, c, arguments):
+    compiled_c = c.Build().CompileWithExampleArguments(arguments)
+    arg_buffers = [xla_client.LocalBuffer.from_py(arg) for arg in arguments]
+    result_buffer = compiled_c.ExecuteWithLocalBuffers(arg_buffers)
+    return result_buffer.to_py()
+
+  def testConstantSum(self):
+    c = self._NewComputation()
+    c.Add(c.ConstantF32Scalar(1.11), c.ConstantF32Scalar(3.14))
+    self._ExecuteAndCompareClose(c, expected=4.25)
+
+  def testOneParameterSum(self):
+    c = self._NewComputation()
+    c.Add(c.ParameterFromNumpy(NumpyArrayF32(0.)), c.ConstantF32Scalar(3.14))
+    self._ExecuteAndCompareClose(
+        c,
+        arguments=[NumpyArrayF32(1.11)],
+        expected=4.25)
+
+  def testTwoParameterSum(self):
+    c = self._NewComputation()
+    c.Add(c.ParameterFromNumpy(NumpyArrayF32(0.)),
+          c.ParameterFromNumpy(NumpyArrayF32(0.)))
+    self._ExecuteAndCompareClose(
+        c,
+        arguments=[NumpyArrayF32(1.11), NumpyArrayF32(3.14)],
+        expected=4.25)
+
+  def testCannotCallWithDeletedBuffers(self):
+    c = self._NewComputation()
+    c.Add(c.ParameterFromNumpy(NumpyArrayF32(0.)), c.ConstantF32Scalar(3.14))
+    arg = NumpyArrayF32(1.11)
+    compiled_c = c.Build().CompileWithExampleArguments([arg])
+    arg_buffer = xla_client.LocalBuffer.from_py(arg)
+    arg_buffer.delete()
+    with self.assertRaises(ValueError):
+      compiled_c.ExecuteWithLocalBuffers([arg_buffer])
+
+
+class SingleOpTest(LocalComputationTest):
+  """Tests for single ops.
+
+  The goal here is smoke testing - to exercise the most basic functionality of
+  single XLA ops. As minimal as possible number of additional ops are added
+  around the op being tested.
+  """
+
+  def testConcatenateF32(self):
+    c = self._NewComputation()
+    c.Concatenate(
+        (c.Constant(NumpyArrayF32([1.0, 2.0, 3.0])),
+         c.Constant(NumpyArrayF32([4.0, 5.0, 6.0]))),
+        dimension=0)
+    self._ExecuteAndCompareClose(c, expected=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
+
+  def testConcatenateF64(self):
+    c = self._NewComputation()
+    c.Concatenate(
+        (c.Constant(NumpyArrayF64([1.0, 2.0, 3.0])),
+         c.Constant(NumpyArrayF64([4.0, 5.0, 6.0]))),
+        dimension=0)
+    self._ExecuteAndCompareClose(c, expected=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
+
+  def testConvertElementType(self):
+    xla_types = {
+        np.bool: xla_client.xla_data_pb2.PRED,
+        np.int32: xla_client.xla_data_pb2.S32,
+        np.int64: xla_client.xla_data_pb2.S64,
+        np.float32: xla_client.xla_data_pb2.F32,
+        np.float64: xla_client.xla_data_pb2.F64,
+    }
+
+    def _ConvertAndTest(template, src_dtype, dst_dtype):
+      c = self._NewComputation()
+      x = c.Constant(np.array(template, dtype=src_dtype))
+      c.ConvertElementType(x, xla_types[dst_dtype])
+
+      result = c.Build().Compile().Execute()
+      expected = np.array(template, dtype=dst_dtype)
+
+      self.assertEqual(result.shape, expected.shape)
+      self.assertEqual(result.dtype, expected.dtype)
+      np.testing.assert_equal(result, expected)
+
+    x = [0, 1, 0, 0, 1]
+    for src_dtype, dst_dtype in itertools.product(xla_types, xla_types):
+      _ConvertAndTest(x, src_dtype, dst_dtype)
+
+  def testCrossReplicaSumOneReplica(self):
+    samples = [
+        NumpyArrayF32(42.0),
+        NumpyArrayF32([97.0]),
+        NumpyArrayF32([64.0, 117.0]),
+        NumpyArrayF32([[2.0, 3.0], [4.0, 5.0]]),
+    ]
+    for lhs in samples:
+      c = self._NewComputation()
+      c.CrossReplicaSum(c.Constant(lhs))
+      self._ExecuteAndCompareExact(c, expected=lhs)
+
+  def testDotMatrixVectorF32(self):
+    c = self._NewComputation()
+    lhs = NumpyArrayF32([[2.0, 3.0], [4.0, 5.0]])
+    rhs = NumpyArrayF32([[10.0], [20.0]])
+    c.Dot(c.Constant(lhs), c.Constant(rhs))
+    self._ExecuteAndCompareClose(c, expected=np.dot(lhs, rhs))
+
+  def testDotMatrixVectorF64(self):
+    c = self._NewComputation()
+    lhs = NumpyArrayF64([[2.0, 3.0], [4.0, 5.0]])
+    rhs = NumpyArrayF64([[10.0], [20.0]])
+    c.Dot(c.Constant(lhs), c.Constant(rhs))
+    self._ExecuteAndCompareClose(c, expected=np.dot(lhs, rhs))
+
+  def testDotMatrixMatrixF32(self):
+    c = self._NewComputation()
+    lhs = NumpyArrayF32([[2.0, 3.0], [4.0, 5.0]])
+    rhs = NumpyArrayF32([[10.0, 20.0], [100.0, 200.0]])
+    c.Dot(c.Constant(lhs), c.Constant(rhs))
+    self._ExecuteAndCompareClose(c, expected=np.dot(lhs, rhs))
+
+  def testDotMatrixMatrixF64(self):
+    c = self._NewComputation()
+    lhs = NumpyArrayF64([[2.0, 3.0], [4.0, 5.0]])
+    rhs = NumpyArrayF64([[10.0, 20.0], [100.0, 200.0]])
+    c.Dot(c.Constant(lhs), c.Constant(rhs))
+    self._ExecuteAndCompareClose(c, expected=np.dot(lhs, rhs))
+
+  def testDotGeneral(self):
+    c = self._NewComputation()
+    rng = np.random.RandomState(0)
+    lhs = NumpyArrayF32(rng.randn(10, 3, 4))
+    rhs = NumpyArrayF32(rng.randn(10, 4, 5))
+    dimension_numbers = (([2], [1]), ([0], [0]))
+    c.DotGeneral(c.Constant(lhs), c.Constant(rhs), dimension_numbers)
+    self._ExecuteAndCompareClose(c, expected=np.matmul(lhs, rhs))
+
+  def testDotGeneralWithDotDimensionNumbersProto(self):
+    c = self._NewComputation()
+    rng = np.random.RandomState(0)
+    lhs = NumpyArrayF32(rng.randn(10, 3, 4))
+    rhs = NumpyArrayF32(rng.randn(10, 4, 5))
+
+    dimension_numbers = xla_client.xla_data_pb2.DotDimensionNumbers()
+    dimension_numbers.lhs_contracting_dimensions.append(2)
+    dimension_numbers.rhs_contracting_dimensions.append(1)
+    dimension_numbers.lhs_batch_dimensions.append(0)
+    dimension_numbers.rhs_batch_dimensions.append(0)
+
+    c.DotGeneral(c.Constant(lhs), c.Constant(rhs), dimension_numbers)
+    self._ExecuteAndCompareClose(c, expected=np.matmul(lhs, rhs))
+
+  def testConvF32Same(self):
+    c = self._NewComputation()
+    a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32")
+    lhs = a(1, 2, 3, 4)
+    rhs = a(1, 2, 1, 2) * 10
+    c.Conv(c.Constant(lhs), c.Constant(rhs),
+           [1, 1], xla_client.PaddingType.SAME)
+    result = np.array([[[[640., 700., 760., 300.],
+                         [880., 940., 1000., 380.],
+                         [1120., 1180., 1240., 460.]]]])
+    self._ExecuteAndCompareClose(c, expected=result)
+
+  def testConvF32Valid(self):
+    c = self._NewComputation()
+    a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32")
+    lhs = a(1, 2, 3, 4)
+    rhs = a(1, 2, 1, 2) * 10
+    c.Conv(c.Constant(lhs), c.Constant(rhs),
+           [2, 1], xla_client.PaddingType.VALID)
+    result = np.array([[[[640., 700., 760.],
+                         [1120., 1180., 1240.]]]])
+    self._ExecuteAndCompareClose(c, expected=result)
+
+  def testConvWithGeneralPaddingF32(self):
+    c = self._NewComputation()
+    a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32")
+    lhs = a(1, 1, 2, 3)
+    rhs = a(1, 1, 1, 2) * 10
+    strides = [1, 1]
+    pads = [(1, 0), (0, 1)]
+    lhs_dilation = (2, 1)
+    rhs_dilation = (1, 1)
+    c.ConvWithGeneralPadding(c.Constant(lhs), c.Constant(rhs),
+                             strides, pads, lhs_dilation, rhs_dilation)
+    result = np.array([[[[0., 0., 0.],
+                         [10., 20., 0.],
+                         [0., 0., 0.],
+                         [40., 50., 0.]]]])
+    self._ExecuteAndCompareClose(c, expected=result)
+
+  def testBooleanNot(self):
+    c = self._NewComputation()
+    arr = NumpyArrayBool([True, False, True])
+    c.Not(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=~arr)
+
+  def testExp(self):
+    c = self._NewComputation()
+    arr = NumpyArrayF32([3.3, 12.1])
+    c.Exp(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=np.exp(arr))
+
+  def testRound(self):
+    c = self._NewComputation()
+    arr = NumpyArrayF32([3.3, 12.1])
+    c.Round(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=np.round(arr))
+
+  def testLog(self):
+    c = self._NewComputation()
+    arr = NumpyArrayF32([3.3, 12.1])
+    c.Log(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=np.log(arr))
+
+  def testNeg(self):
+    c = self._NewComputation()
+    arr = NumpyArrayF32([3.3, 12.1])
+    c.Neg(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=-arr)
+
+  def testFloor(self):
+    c = self._NewComputation()
+    arr = NumpyArrayF32([3.3, 12.1])
+    c.Floor(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=np.floor(arr))
+
+  def testCeil(self):
+    c = self._NewComputation()
+    arr = NumpyArrayF32([3.3, 12.1])
+    c.Ceil(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=np.ceil(arr))
+
+  def testAbs(self):
+    c = self._NewComputation()
+    arr = NumpyArrayF32([3.3, -12.1, 2.4, -1.])
+    c.Abs(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=np.abs(arr))
+
+  def testTanh(self):
+    c = self._NewComputation()
+    arr = NumpyArrayF32([3.3, 12.1])
+    c.Tanh(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=np.tanh(arr))
+
+  def testTrans(self):
+
+    def _TransposeAndTest(array):
+      c = self._NewComputation()
+      c.Trans(c.Constant(array))
+      self._ExecuteAndCompareClose(c, expected=array.T)
+
+    # Test square and non-square matrices in both default (C) and F orders.
+    for array_fun in [NumpyArrayF32, NumpyArrayF64]:
+      _TransposeAndTest(array_fun([[1, 2, 3], [4, 5, 6]]))
+      _TransposeAndTest(array_fun([[1, 2, 3], [4, 5, 6]], order="F"))
+      _TransposeAndTest(array_fun([[1, 2], [4, 5]]))
+      _TransposeAndTest(array_fun([[1, 2], [4, 5]], order="F"))
+
+  def testTranspose(self):
+
+    def _TransposeAndTest(array, permutation):
+      c = self._NewComputation()
+      c.Transpose(c.Constant(array), permutation)
+      expected = np.transpose(array, permutation)
+      self._ExecuteAndCompareClose(c, expected=expected)
+
+    _TransposeAndTest(NumpyArrayF32([[1, 2, 3], [4, 5, 6]]), [0, 1])
+    _TransposeAndTest(NumpyArrayF32([[1, 2, 3], [4, 5, 6]]), [1, 0])
+    _TransposeAndTest(NumpyArrayF32([[1, 2], [4, 5]]), [0, 1])
+    _TransposeAndTest(NumpyArrayF32([[1, 2], [4, 5]]), [1, 0])
+
+    arr = np.random.RandomState(0).randn(2, 3, 4).astype(np.float32)
+    for permutation in itertools.permutations(range(arr.ndim)):
+      _TransposeAndTest(arr, permutation)
+      _TransposeAndTest(np.asfortranarray(arr), permutation)
+
+  def testEq(self):
+    c = self._NewComputation()
+    c.Eq(
+        c.Constant(NumpyArrayS32([1, 2, 3, 4])),
+        c.Constant(NumpyArrayS32([4, 2, 3, 1])))
+    self._ExecuteAndCompareExact(c, expected=[False, True, True, False])
+
+  def testNe(self):
+    c = self._NewComputation()
+    c.Ne(
+        c.Constant(NumpyArrayS32([1, 2, 3, 4])),
+        c.Constant(NumpyArrayS32([4, 2, 3, 1])))
+    self._ExecuteAndCompareExact(c, expected=[True, False, False, True])
+
+    c.Ne(
+        c.Constant(NumpyArrayF32([-2.0, 0.0,
+                                  float("nan"),
+                                  float("nan")])),
+        c.Constant(NumpyArrayF32([2.0, -0.0, 1.0, float("nan")])))
+    self._ExecuteAndAssertWith(
+        np.testing.assert_allclose, c, (), expected=[True, False, True, True])
+
+  def testGt(self):
+    c = self._NewComputation()
+    c.Gt(
+        c.Constant(NumpyArrayS32([1, 2, 3, 4, 9])),
+        c.Constant(NumpyArrayS32([1, 0, 2, 7, 12])))
+    self._ExecuteAndCompareExact(c, expected=[False, True, True, False, False])
+
+  def testGe(self):
+    c = self._NewComputation()
+    c.Ge(
+        c.Constant(NumpyArrayS32([1, 2, 3, 4, 9])),
+        c.Constant(NumpyArrayS32([1, 0, 2, 7, 12])))
+    self._ExecuteAndCompareExact(c, expected=[True, True, True, False, False])
+
+  def testLt(self):
+    c = self._NewComputation()
+    c.Lt(
+        c.Constant(NumpyArrayS32([1, 2, 3, 4, 9])),
+        c.Constant(NumpyArrayS32([1, 0, 2, 7, 12])))
+    self._ExecuteAndCompareExact(c, expected=[False, False, False, True, True])
+
+  def testLe(self):
+    c = self._NewComputation()
+    c.Le(
+        c.Constant(NumpyArrayS32([1, 2, 3, 4, 9])),
+        c.Constant(NumpyArrayS32([1, 0, 2, 7, 12])))
+    self._ExecuteAndCompareExact(c, expected=[True, False, False, True, True])
+
+  def testMax(self):
+    c = self._NewComputation()
+    c.Max(
+        c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0, 9.0])),
+        c.Constant(NumpyArrayF32([1.0, 0.0, 2.0, 7.0, 12.0])))
+    self._ExecuteAndCompareExact(c, expected=[1.0, 2.0, 3.0, 7.0, 12.0])
+
+  def testMaxExplicitBroadcastDim0(self):
+    c = self._NewComputation()
+    c.Max(
+        c.Constant(NumpyArrayF32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])),
+        c.Constant(NumpyArrayF32([3, 4, 5])),
+        broadcast_dimensions=(0,))
+    self._ExecuteAndCompareExact(c, expected=[[3, 3, 3], [4, 5, 6], [7, 8, 9]])
+
+  def testMaxExplicitBroadcastDim1(self):
+    c = self._NewComputation()
+    c.Max(
+        c.Constant(NumpyArrayF32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])),
+        c.Constant(NumpyArrayF32([3, 4, 5])),
+        broadcast_dimensions=(1,))
+    self._ExecuteAndCompareExact(c, expected=[[3, 4, 5], [4, 5, 6], [7, 8, 9]])
+
+  def testMin(self):
+    c = self._NewComputation()
+    c.Min(
+        c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0, 9.0])),
+        c.Constant(NumpyArrayF32([1.0, 0.0, 2.0, 7.0, 12.0])))
+    self._ExecuteAndCompareExact(c, expected=[1.0, 0.0, 2.0, 4.0, 9.0])
+
+  def testPad(self):
+    c = self._NewComputation()
+    c.Pad(
+        c.Constant(NumpyArrayF32([[1.0, 2.0], [3.0, 4.0]])),
+        c.Constant(NumpyArrayF32(0.0)),
+        [(1, 2, 1), (0, 1, 0)])
+    self._ExecuteAndCompareClose(c, expected=[[0.0, 0.0, 0.0],
+                                              [1.0, 2.0, 0.0],
+                                              [0.0, 0.0, 0.0],
+                                              [3.0, 4.0, 0.0],
+                                              [0.0, 0.0, 0.0],
+                                              [0.0, 0.0, 0.0]])
+
+  def testPadWithPaddingConfig(self):
+    c = self._NewComputation()
+    padding_config = xla_client.xla_data_pb2.PaddingConfig()
+    for lo, hi, interior in [(1, 2, 1), (0, 1, 0)]:
+      dimension = padding_config.dimensions.add()
+      dimension.edge_padding_low = lo
+      dimension.edge_padding_high = hi
+      dimension.interior_padding = interior
+    c.Pad(
+        c.Constant(NumpyArrayF32([[1.0, 2.0], [3.0, 4.0]])),
+        c.Constant(NumpyArrayF32(0.0)),
+        padding_config)
+    self._ExecuteAndCompareClose(c, expected=[[0.0, 0.0, 0.0],
+                                              [1.0, 2.0, 0.0],
+                                              [0.0, 0.0, 0.0],
+                                              [3.0, 4.0, 0.0],
+                                              [0.0, 0.0, 0.0],
+                                              [0.0, 0.0, 0.0]])
+
+  def testReshape(self):
+    c = self._NewComputation()
+    c.Reshape(
+        c.Constant(NumpyArrayS32([[1, 2], [3, 4], [5, 6]])),
+        dimensions=[0, 1],
+        new_sizes=[2, 3])
+    self._ExecuteAndCompareExact(c, expected=[[1, 2, 3], [4, 5, 6]])
+
+  def testCollapse(self):
+    c = self._NewComputation()
+    c.Collapse(
+        c.Constant(NumpyArrayS32([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])),
+        dimensions=[1, 2])
+    self._ExecuteAndCompareExact(c, expected=[[1, 2, 3, 4], [5, 6, 7, 8]])
+
+  def testRev(self):
+    c = self._NewComputation()
+    c.Rev(
+        c.Constant(NumpyArrayS32([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])),
+        dimensions=[0, 2])
+    self._ExecuteAndCompareExact(
+        c, expected=[[[6, 5], [8, 7]], [[2, 1], [4, 3]]])
+
+  def testClampF32(self):
+    c = self._NewComputation()
+    c.Clamp(
+        c.Constant(NumpyArrayF32(-1)),
+        c.Constant(NumpyArrayF32([-2, -1, 0, 1, 2, 3])),
+        c.Constant(NumpyArrayF32(2)))
+    self._ExecuteAndCompareExact(c, expected=[-1, -1, 0, 1, 2, 2])
+
+  # TODO(b/72689392): re-enable when bug S32 resolved
+  def DISABLED_testClampS32(self):
+    c = self._NewComputation()
+    c.Clamp(
+        c.Constant(NumpyArrayS32(-1)),
+        c.Constant(NumpyArrayS32([-2, -1, 0, 1, 2, 3])),
+        c.Constant(NumpyArrayS32(2)))
+    self._ExecuteAndCompareExact(c, expected=[-1, 0, 1, 2, 2])
+
+  def testSelect(self):
+    c = self._NewComputation()
+    c.Select(
+        c.Constant(NumpyArrayBool([True, False, False, True, False])),
+        c.Constant(NumpyArrayS32([1, 2, 3, 4, 5])),
+        c.Constant(NumpyArrayS32([-1, -2, -3, -4, -5])))
+    self._ExecuteAndCompareExact(c, expected=[1, -2, -3, 4, -5])
+
+  def testSlice(self):
+    c = self._NewComputation()
+    c.Slice(
+        c.Constant(NumpyArrayS32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])), [1, 0],
+        [3, 2])
+    self._ExecuteAndCompareExact(c, expected=[[4, 5], [7, 8]])
+
+  def testDynamicSlice(self):
+    c = self._NewComputation()
+    c.DynamicSlice(
+        c.Constant(NumpyArrayS32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])),
+        c.Constant(NumpyArrayS32([1, 0])), [2, 2])
+    self._ExecuteAndCompareExact(c, expected=[[4, 5], [7, 8]])
+
+  def testDynamicUpdateSlice(self):
+    c = self._NewComputation()
+    c.DynamicUpdateSlice(
+        c.Constant(NumpyArrayS32([[1, 2, 3], [4, 5, 6], [7, 8, 9]])),
+        c.Constant(NumpyArrayS32([[1, 2], [3, 4]])),
+        c.Constant(NumpyArrayS32([1, 1])))
+    self._ExecuteAndCompareExact(c, expected=[[1, 2, 3], [4, 1, 2], [7, 3, 4]])
+
+  def testTuple(self):
+    c = self._NewComputation()
+    c.Tuple(
+        c.ConstantS32Scalar(42), c.Constant(NumpyArrayF32([1.0, 2.0])),
+        c.Constant(NumpyArrayBool([True, False, False, True])))
+    result = c.Build().Compile().Execute()
+    self.assertIsInstance(result, tuple)
+    np.testing.assert_equal(result[0], 42)
+    np.testing.assert_allclose(result[1], [1.0, 2.0])
+    np.testing.assert_equal(result[2], [True, False, False, True])
+
+  def testGetTupleElement(self):
+    c = self._NewComputation()
+    c.GetTupleElement(
+        c.Tuple(
+            c.ConstantS32Scalar(42), c.Constant(NumpyArrayF32([1.0, 2.0])),
+            c.Constant(NumpyArrayBool([True, False, False, True]))), 1)
+    self._ExecuteAndCompareClose(c, expected=[1.0, 2.0])
+
+  def testBroadcast(self):
+    c = self._NewComputation()
+    c.Broadcast(c.Constant(NumpyArrayS32([10, 20, 30, 40])), sizes=(3,))
+    self._ExecuteAndCompareExact(
+        c, expected=[[10, 20, 30, 40], [10, 20, 30, 40], [10, 20, 30, 40]])
+
+  def testRngNormal(self):
+    shape = (2, 3)
+    c = self._NewComputation()
+    c.RngNormal(c.Constant(NumpyArrayF32(0.)), c.Constant(NumpyArrayF32(1.)),
+                dims=shape)
+    result = c.Build().Compile().Execute()
+    # since the result is random, we just check shape and uniqueness
+    self.assertEqual(result.shape, shape)
+    self.assertEqual(len(np.unique(result)), np.prod(shape))
+
+  def testRngUniformF32(self):
+    lo, hi = 2., 4.
+    shape = (2, 3)
+    c = self._NewComputation()
+    c.RngUniform(c.Constant(NumpyArrayF32(lo)), c.Constant(NumpyArrayF32(hi)),
+                 dims=shape)
+    result = c.Build().Compile().Execute()
+    # since the result is random, we just check shape, uniqueness, and range
+    self.assertEqual(result.shape, shape)
+    self.assertEqual(len(np.unique(result)), np.prod(shape))
+    self.assertTrue(np.all(lo <= result))
+    self.assertTrue(np.all(result < hi))
+
+  def testRngUniformS32(self):
+    lo, hi = 2, 4
+    shape = (2, 3)
+    c = self._NewComputation()
+    c.RngUniform(c.Constant(NumpyArrayS32(lo)), c.Constant(NumpyArrayS32(hi)),
+                 dims=shape)
+    result = c.Build().Compile().Execute()
+    # since the result is random, we just check shape, integrality, and range
+    self.assertEqual(result.shape, shape)
+    self.assertEqual(result.dtype, np.int32)
+    self.assertTrue(np.all(lo <= result))
+    self.assertTrue(np.all(result < hi))
+
+
+class EmbeddedComputationsTest(LocalComputationTest):
+  """Tests for XLA graphs with embedded computations (such as maps)."""
+
+  def _CreateConstantS32Computation(self):
+    """Computation (f32) -> s32 that returns a constant 1 for any input."""
+    c = self._NewComputation("constant_s32_one")
+    # TODO(eliben): consider adding a nicer way to create new parameters without
+    # having to create dummy Numpy arrays or populating Shape messages. Perhaps
+    # we need our own (Python-client-own) way to represent Shapes conveniently.
+    c.ParameterFromNumpy(NumpyArrayF32(0))
+    c.ConstantS32Scalar(1)
+    return c.Build()
+
+  def _CreateConstantS64Computation(self):
+    """Computation (f64) -> s64 that returns a constant 1 for any input."""
+    c = self._NewComputation("constant_s64_one")
+    # TODO(eliben): consider adding a nicer way to create new parameters without
+    # having to create dummy Numpy arrays or populating Shape messages. Perhaps
+    # we need our own (Python-client-own) way to represent Shapes conveniently.
+    c.ParameterFromNumpy(NumpyArrayF64(0))
+    c.ConstantS64Scalar(1)
+    return c.Build()
+
+  def _CreateConstantF32Computation(self):
+    """Computation (f32) -> f32 that returns a constant 1.0 for any input."""
+    c = self._NewComputation("constant_f32_one")
+    c.ParameterFromNumpy(NumpyArrayF32(0))
+    c.ConstantF32Scalar(1.0)
+    return c.Build()
+
+  def _CreateConstantF64Computation(self):
+    """Computation (f64) -> f64 that returns a constant 1.0 for any input."""
+    c = self._NewComputation("constant_f64_one")
+    c.ParameterFromNumpy(NumpyArrayF64(0))
+    c.ConstantF64Scalar(1.0)
+    return c.Build()
+
+  def _CreateMulF32By2Computation(self):
+    """Computation (f32) -> f32 that multiplies its parameter by 2."""
+    c = self._NewComputation("mul_f32_by2")
+    c.Mul(c.ParameterFromNumpy(NumpyArrayF32(0)), c.ConstantF32Scalar(2.0))
+    return c.Build()
+
+  def _CreateMulF32ByParamComputation(self):
+    """Computation (f32) -> f32 that multiplies one parameter by the other."""
+    c = self._NewComputation("mul_f32_by_param")
+    c.Mul(c.ParameterFromNumpy(NumpyArrayF32(0)),
+          c.ParameterFromNumpy(NumpyArrayF32(0)))
+    return c.Build()
+
+  def _CreateMulF64By2Computation(self):
+    """Computation (f64) -> f64 that multiplies its parameter by 2."""
+    c = self._NewComputation("mul_f64_by2")
+    c.Mul(c.ParameterFromNumpy(NumpyArrayF64(0)), c.ConstantF64Scalar(2.0))
+    return c.Build()
+
+  def _CreateBinaryAddF32Computation(self):
+    """Computation (f32, f32) -> f32 that adds its two parameters."""
+    c = self._NewComputation("add_param0_by_param1")
+    c.Add(
+        c.ParameterFromNumpy(NumpyArrayF32(0)),
+        c.ParameterFromNumpy(NumpyArrayF32(0)))
+    return c.Build()
+
+  def _CreateBinaryAddF64Computation(self):
+    """Computation (f64, f64) -> f64 that adds its two parameters."""
+    c = self._NewComputation("add_param0_by_param1")
+    c.Add(
+        c.ParameterFromNumpy(NumpyArrayF64(0)),
+        c.ParameterFromNumpy(NumpyArrayF64(0)))
+    return c.Build()
+
+  def _CreateBinaryDivF32Computation(self):
+    """Computation (f32, f32) -> f32 that divides its two parameters."""
+    c = self._NewComputation("div_param0_by_param1")
+    c.Div(
+        c.ParameterFromNumpy(NumpyArrayF32(0)),
+        c.ParameterFromNumpy(NumpyArrayF32(0)))
+    return c.Build()
+
+  def _CreateBinaryDivF64Computation(self):
+    """Computation (f64, f64) -> f64 that divides its two parameters."""
+    c = self._NewComputation("div_param0_by_param1")
+    c.Div(
+        c.ParameterFromNumpy(NumpyArrayF64(0)),
+        c.ParameterFromNumpy(NumpyArrayF64(0)))
+    return c.Build()
+
+  def _CreateTestF32Lt10Computation(self):
+    """Computation (f32) -> bool that tests if its parameter is less than 10."""
+    c = self._NewComputation("test_f32_lt_10")
+    c.Lt(c.ParameterFromNumpy(NumpyArrayF32(0)), c.ConstantF32Scalar(10.))
+    return c.Build()
+
+  def _CreateTestF64Lt10Computation(self):
+    """Computation (f64) -> bool that tests if its parameter is less than 10."""
+    c = self._NewComputation("test_f64_lt_10")
+    c.Lt(c.ParameterFromNumpy(NumpyArrayF64(0)), c.ConstantF64Scalar(10.))
+    return c.Build()
+
+  def _CreateBinaryGeF32Computation(self):
+    """Computation (f32, f32) -> bool that tests first_param >= second_param."""
+    c = self._NewComputation("param0_lt_param1")
+    c.Ge(c.ParameterFromNumpy(NumpyArrayF32(0)),
+         c.ParameterFromNumpy(NumpyArrayF32(0)))
+    return c.Build()
+
+  def _CreateBinaryGeF64Computation(self):
+    """Computation (f64, f64) -> bool that tests first_param >= second_param."""
+    c = self._NewComputation("param0_lt_param1")
+    c.Ge(c.ParameterFromNumpy(NumpyArrayF64(0)),
+         c.ParameterFromNumpy(NumpyArrayF64(0)))
+    return c.Build()
+
+  def _MakeSample3DArrayF32(self):
+    return NumpyArrayF32([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]],
+                          [[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]])
+
+  def _MakeSample3DArrayF64(self):
+    return NumpyArrayF64([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]],
+                          [[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]])
+
+  def testCallF32(self):
+    c = self._NewComputation()
+    c.Call(
+        self._CreateMulF32By2Computation(),
+        operands=(c.ConstantF32Scalar(5.0),))
+    self._ExecuteAndCompareClose(c, expected=10.0)
+
+  def testCallF64(self):
+    c = self._NewComputation()
+    c.Call(
+        self._CreateMulF64By2Computation(),
+        operands=(c.ConstantF64Scalar(5.0),))
+    self._ExecuteAndCompareClose(c, expected=10.0)
+
+  def testMapEachElementToS32Constant(self):
+    c = self._NewComputation()
+    c.Map([c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0]))],
+          self._CreateConstantS32Computation(), [0])
+    self._ExecuteAndCompareExact(c, expected=[1, 1, 1, 1])
+
+  def testMapEachElementToS64Constant(self):
+    c = self._NewComputation()
+    c.Map([c.Constant(NumpyArrayF64([1.0, 2.0, 3.0, 4.0]))],
+          self._CreateConstantS64Computation(), [0])
+    self._ExecuteAndCompareExact(c, expected=[1, 1, 1, 1])
+
+  def testMapMulBy2F32(self):
+    c = self._NewComputation()
+    c.Map([c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0]))],
+          self._CreateMulF32By2Computation(), [0])
+    self._ExecuteAndCompareClose(c, expected=[2.0, 4.0, 6.0, 8.0])
+
+  def testMapMulBy2F64(self):
+    c = self._NewComputation()
+    c.Map([c.Constant(NumpyArrayF64([1.0, 2.0, 3.0, 4.0]))],
+          self._CreateMulF64By2Computation(), [0])
+    self._ExecuteAndCompareClose(c, expected=[2.0, 4.0, 6.0, 8.0])
+
+  def testSimpleMapChainF32(self):
+    # Chains a map of constant-f32 with a map of mul-by-2
+    c = self._NewComputation()
+    const_f32 = c.Map([c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0]))],
+                      self._CreateConstantF32Computation(), [0])
+    c.Map([const_f32], self._CreateMulF32By2Computation(), [0])
+    self._ExecuteAndCompareClose(c, expected=[2.0, 2.0, 2.0, 2.0])
+
+  def testSimpleMapChainF64(self):
+    # Chains a map of constant-f64 with a map of mul-by-2
+    c = self._NewComputation()
+    const_f64 = c.Map([c.Constant(NumpyArrayF64([1.0, 2.0, 3.0, 4.0]))],
+                      self._CreateConstantF64Computation(), [0])
+    c.Map([const_f64], self._CreateMulF64By2Computation(), [0])
+    self._ExecuteAndCompareClose(c, expected=[2.0, 2.0, 2.0, 2.0])
+
+  def testDivVectorsWithMapF32(self):
+    c = self._NewComputation()
+    c.Map((c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0])),
+           c.Constant(NumpyArrayF32([5.0, 5.0, 4.0, 4.0]))),
+          self._CreateBinaryDivF32Computation(), [0])
+    self._ExecuteAndCompareClose(c, expected=[0.2, 0.4, 0.75, 1.0])
+
+  def testDivVectorsWithMapF64(self):
+    c = self._NewComputation()
+    c.Map((c.Constant(NumpyArrayF64([1.0, 2.0, 3.0, 4.0])),
+           c.Constant(NumpyArrayF64([5.0, 5.0, 4.0, 4.0]))),
+          self._CreateBinaryDivF64Computation(), [0])
+    self._ExecuteAndCompareClose(c, expected=[0.2, 0.4, 0.75, 1.0])
+
+  def DISABLED_testMapWithStaticOperands(self):
+    c = self._NewComputation()
+    factor = c.ConstantF32Scalar(3.0)
+    c.Map([c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0]))],
+          self._CreateMulF32ByParamComputation(), [0],
+          static_operands=[factor])
+    self._ExecuteAndCompareClose(c, expected=[3.0, 6.0, 9.0, 12.0])
+
+  def testSelectAndScatterF32(self):
+    c = self._NewComputation()
+    c.SelectAndScatter(c.Constant(NumpyArrayF32([[1., 2., 6.], [4., 5., 3.]])),
+                       select=self._CreateBinaryGeF32Computation(),
+                       window_dimensions=(2, 1),
+                       window_strides=(1, 2),
+                       padding=xla_client.PaddingType.VALID,
+                       source=c.Constant(NumpyArrayF32([[0.1, 0.2]])),
+                       init_value=c.Constant(NumpyArrayF32(1)),
+                       scatter=self._CreateBinaryAddF32Computation())
+    self._ExecuteAndCompareClose(c, expected=[[1., 1., 1.2], [1.1, 1., 1.]])
+
+  def testSelectAndScatterF64(self):
+    c = self._NewComputation()
+    c.SelectAndScatter(c.Constant(NumpyArrayF64([[1., 2., 6.], [4., 5., 3.]])),
+                       select=self._CreateBinaryGeF64Computation(),
+                       window_dimensions=(2, 1),
+                       window_strides=(1, 2),
+                       padding=xla_client.PaddingType.VALID,
+                       source=c.Constant(NumpyArrayF64([[0.1, 0.2]])),
+                       init_value=c.Constant(NumpyArrayF64(1)),
+                       scatter=self._CreateBinaryAddF64Computation())
+    self._ExecuteAndCompareClose(c, expected=[[1., 1., 1.2], [1.1, 1., 1.]])
+
+  def testReduce1DtoScalarF32(self):
+    c = self._NewComputation()
+    c.Reduce(
+        operand=c.Constant(NumpyArrayF32([1.0, 2.0, 3.0, 4.0])),
+        init_value=c.ConstantF32Scalar(0),
+        computation_to_apply=self._CreateBinaryAddF32Computation(),
+        dimensions=[0])
+    self._ExecuteAndCompareClose(c, expected=10)
+
+  def testReduce1DtoScalarF64(self):
+    c = self._NewComputation()
+    c.Reduce(
+        operand=c.Constant(NumpyArrayF64([1.0, 2.0, 3.0, 4.0])),
+        init_value=c.ConstantF64Scalar(0),
+        computation_to_apply=self._CreateBinaryAddF64Computation(),
+        dimensions=[0])
+    self._ExecuteAndCompareClose(c, expected=10)
+
+  def testReduce2DTo1DDim0F32(self):
+    input_array = NumpyArrayF32([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
+    c = self._NewComputation()
+    c.Reduce(
+        operand=c.Constant(input_array),
+        init_value=c.ConstantF32Scalar(0),
+        computation_to_apply=self._CreateBinaryAddF32Computation(),
+        dimensions=[0])
+    self._ExecuteAndCompareClose(c, expected=[5, 7, 9])
+
+  def testReduce2DTo1DDim0F64(self):
+    input_array = NumpyArrayF64([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
+    c = self._NewComputation()
+    c.Reduce(
+        operand=c.Constant(input_array),
+        init_value=c.ConstantF64Scalar(0),
+        computation_to_apply=self._CreateBinaryAddF64Computation(),
+        dimensions=[0])
+    self._ExecuteAndCompareClose(c, expected=[5, 7, 9])
+
+  def testReduce2DTo1DDim1F32(self):
+    input_array = NumpyArrayF32([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
+    c = self._NewComputation()
+    c.Reduce(
+        operand=c.Constant(input_array),
+        init_value=c.ConstantF32Scalar(0),
+        computation_to_apply=self._CreateBinaryAddF32Computation(),
+        dimensions=[1])
+    self._ExecuteAndCompareClose(c, expected=[6, 15])
+
+  def testReduce2DTo1DDim1F64(self):
+    input_array = NumpyArrayF64([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
+    c = self._NewComputation()
+    c.Reduce(
+        operand=c.Constant(input_array),
+        init_value=c.ConstantF64Scalar(0),
+        computation_to_apply=self._CreateBinaryAddF64Computation(),
+        dimensions=[1])
+    self._ExecuteAndCompareClose(c, expected=[6, 15])
+
+  def testReduce3DAllPossibleWaysF32(self):
+    input_array = self._MakeSample3DArrayF32()
+
+    def _ReduceAndTest(*dims):
+      c = self._NewComputation()
+      c.Reduce(
+          operand=c.Constant(input_array),
+          init_value=c.ConstantF32Scalar(0),
+          computation_to_apply=self._CreateBinaryAddF32Computation(),
+          dimensions=dims)
+      self._ExecuteAndCompareClose(
+          c, expected=np.sum(input_array, axis=tuple(dims)))
+
+    _ReduceAndTest(0)
+    _ReduceAndTest(0)
+    _ReduceAndTest(0, 1)
+    _ReduceAndTest(0, 2)
+    _ReduceAndTest(1, 2)
+    _ReduceAndTest(0, 1, 2)
+
+  def testReduce3DAllPossibleWaysF64(self):
+    input_array = self._MakeSample3DArrayF64()
+
+    def _ReduceAndTest(*dims):
+      c = self._NewComputation()
+      c.Reduce(
+          operand=c.Constant(input_array),
+          init_value=c.ConstantF64Scalar(0),
+          computation_to_apply=self._CreateBinaryAddF64Computation(),
+          dimensions=dims)
+      self._ExecuteAndCompareClose(
+          c, expected=np.sum(input_array, axis=tuple(dims)))
+
+    _ReduceAndTest(0)
+    _ReduceAndTest(0)
+    _ReduceAndTest(0, 1)
+    _ReduceAndTest(0, 2)
+    _ReduceAndTest(1, 2)
+    _ReduceAndTest(0, 1, 2)
+
+  def testReduceWindowValidUnitStridesF32(self):
+    input_array = NumpyArrayF32([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
+    c = self._NewComputation()
+    c.ReduceWindow(operand=c.Constant(input_array),
+                   init_value=c.ConstantF32Scalar(0),
+                   computation_to_apply=self._CreateBinaryAddF32Computation(),
+                   window_dimensions=(2, 1), window_strides=(1, 1),
+                   padding=xla_client.PaddingType.VALID)
+    self._ExecuteAndCompareClose(c, expected=[[5., 7., 9.]])
+
+  def testReduceWindowSameUnitStridesF32(self):
+    input_array = NumpyArrayF32([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
+    c = self._NewComputation()
+    c.ReduceWindow(operand=c.Constant(input_array),
+                   init_value=c.ConstantF32Scalar(0),
+                   computation_to_apply=self._CreateBinaryAddF32Computation(),
+                   window_dimensions=(2, 1), window_strides=(1, 1),
+                   padding=xla_client.PaddingType.SAME)
+    self._ExecuteAndCompareClose(c, expected=[[5., 7., 9.], [4., 5., 6.]])
+
+  def testReduceWindowValidGeneralStridesF32(self):
+    input_array = NumpyArrayF32([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
+    c = self._NewComputation()
+    c.ReduceWindow(operand=c.Constant(input_array),
+                   init_value=c.ConstantF32Scalar(0),
+                   computation_to_apply=self._CreateBinaryAddF32Computation(),
+                   window_dimensions=(2, 1), window_strides=(1, 2),
+                   padding=xla_client.PaddingType.VALID)
+    self._ExecuteAndCompareClose(c, expected=[[5., 9.]])
+
+  def testReduceWindowValidUnitStridesF64(self):
+    input_array = NumpyArrayF64([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
+    c = self._NewComputation()
+    c.ReduceWindow(operand=c.Constant(input_array),
+                   init_value=c.ConstantF64Scalar(0),
+                   computation_to_apply=self._CreateBinaryAddF64Computation(),
+                   window_dimensions=(2, 1), window_strides=(1, 1),
+                   padding=xla_client.PaddingType.VALID)
+    self._ExecuteAndCompareClose(c, expected=[[5., 7., 9.]])
+
+  def testReduceWindowSameUnitStridesF64(self):
+    input_array = NumpyArrayF64([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
+    c = self._NewComputation()
+    c.ReduceWindow(operand=c.Constant(input_array),
+                   init_value=c.ConstantF64Scalar(0),
+                   computation_to_apply=self._CreateBinaryAddF64Computation(),
+                   window_dimensions=(2, 1), window_strides=(1, 1),
+                   padding=xla_client.PaddingType.SAME)
+    self._ExecuteAndCompareClose(c, expected=[[5., 7., 9.], [4., 5., 6.]])
+
+  def testReduceWindowValidGeneralStridesF64(self):
+    input_array = NumpyArrayF64([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
+    c = self._NewComputation()
+    c.ReduceWindow(operand=c.Constant(input_array),
+                   init_value=c.ConstantF64Scalar(0),
+                   computation_to_apply=self._CreateBinaryAddF64Computation(),
+                   window_dimensions=(2, 1), window_strides=(1, 2),
+                   padding=xla_client.PaddingType.VALID)
+    self._ExecuteAndCompareClose(c, expected=[[5., 9.]])
+
+  def testWhileF32(self):
+    cond = self._CreateTestF32Lt10Computation()
+    body = self._CreateMulF32By2Computation()
+    c = self._NewComputation()
+    init = c.ConstantF32Scalar(1.)
+    c.While(cond, body, init)
+    self._ExecuteAndCompareClose(c, expected=16.)
+
+  def testWhileF64(self):
+    cond = self._CreateTestF64Lt10Computation()
+    body = self._CreateMulF64By2Computation()
+    c = self._NewComputation()
+    init = c.ConstantF64Scalar(1.)
+    c.While(cond, body, init)
+    self._ExecuteAndCompareClose(c, expected=16.)
+
+  def testConditionalTrue(self):
+    c = self._NewComputation()
+    pred = c.ConstantPredScalar(True)
+    true_operand = c.ConstantF32Scalar(3.)
+    true_computation = self._CreateMulF32By2Computation()
+    false_operand = c.ConstantF32Scalar(2.)
+    false_computation = self._CreateConstantF32Computation()
+    c.Conditional(pred, true_operand, true_computation, false_operand,
+                  false_computation)
+    self._ExecuteAndCompareClose(c, expected=6.)
+
+  def testConditionalFalse(self):
+    c = self._NewComputation()
+    pred = c.ConstantPredScalar(False)
+    true_operand = c.ConstantF32Scalar(3.)
+    true_computation = self._CreateMulF32By2Computation()
+    false_operand = c.ConstantF32Scalar(2.)
+    false_computation = self._CreateConstantF32Computation()
+    c.Conditional(pred, true_operand, true_computation, false_operand,
+                  false_computation)
+    self._ExecuteAndCompareClose(c, expected=1.)
+
+  def testInfeedS32Values(self):
+    to_infeed = NumpyArrayS32([1, 2, 3, 4])
+    c = self._NewComputation()
+    c.Infeed(xla_client.Shape.from_numpy(to_infeed[0]))
+    compiled_c = c.Build().CompileWithExampleArguments()
+    for item in to_infeed:
+      xla_client.transfer_to_infeed(item)
+
+    for item in to_infeed:
+      result = compiled_c.Execute()
+      self.assertEqual(result, item)
+
+  def testInfeedThenOutfeedS32(self):
+    to_round_trip = NumpyArrayS32([1, 2, 3, 4])
+    c = self._NewComputation()
+    x = c.Infeed(xla_client.Shape.from_numpy(to_round_trip[0]))
+    c.Outfeed(x)
+
+    compiled_c = c.Build().CompileWithExampleArguments()
+
+    for want in to_round_trip:
+      execution = threading.Thread(target=compiled_c.Execute)
+      execution.start()
+      xla_client.transfer_to_infeed(want)
+      got = xla_client.transfer_from_outfeed(
+          xla_client.Shape.from_numpy(to_round_trip[0]))
+      execution.join()
+      self.assertEqual(want, got)
+
+
+class ErrorTest(LocalComputationTest):
+
+  def setUp(self):
+    self.f32_scalar_2 = NumpyArrayF32(2.0)
+    self.s32_scalar_2 = NumpyArrayS32(2)
+
+  def testInvokeWithWrongElementType(self):
+    c = self._NewComputation()
+    c.SetOpMetadata(xla_client.CurrentSourceInfoMetadata())
+    c.ParameterFromNumpy(self.s32_scalar_2)
+    c.ClearOpMetadata()
+    self.assertRaisesRegexp(
+        RuntimeError, r"Invalid argument shape.*xla_client_test.py.*"
+        r"expected s32\[\], got f32\[\]",
+        lambda: c.Build().CompileWithExampleArguments([self.f32_scalar_2]))
+
+
+if __name__ == "__main__":
+  unittest.main()
diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc
index 5bb81b80dde4c6d9324d33ddd5d6b6d6ad3cc1ac..a9acdae380af5b7f9efb3d08302fc717108f5e40 100644
--- a/tensorflow/compiler/xla/reference_util.cc
+++ b/tensorflow/compiler/xla/reference_util.cc
@@ -195,14 +195,26 @@ ReferenceUtil::ReduceWindow1DGeneric(
     const tensorflow::gtl::ArraySlice<int64>& window,
     const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding) {
   std::vector<int64> dim_lengths{static_cast<int64>(operand.size())};
-  auto padding_both = xla::MakePadding(dim_lengths, window, stride, padding);
+  return ReduceWindow1DGeneric(
+      operand, init, reduce_func, window, stride,
+      xla::MakePadding(dim_lengths, window, stride, padding));
+}
 
+/* static  */ std::unique_ptr<std::vector<float>>
+ReferenceUtil::ReduceWindow1DGeneric(
+    const tensorflow::gtl::ArraySlice<float>& operand, float init,
+    const std::function<float(float, float)>& reduce_func,
+    const tensorflow::gtl::ArraySlice<int64>& window,
+    const tensorflow::gtl::ArraySlice<int64>& stride,
+    const tensorflow::gtl::ArraySlice<std::pair<int64, int64>>& padding) {
+  std::vector<int64> dim_lengths{static_cast<int64>(operand.size())};
   std::vector<int64> window_counts(window.size(), 0);
   std::vector<int64> pad_low(window.size(), 0);
   for (int64 i = 0; i < window.size(); ++i) {
+    int64 padded_width = padding[i].first + dim_lengths[i] + padding[i].second;
     window_counts[i] =
-        WindowCount(dim_lengths[i], window[i], stride[i], padding);
-    pad_low[i] = padding_both[i].first;
+        window_util::StridedBound(padded_width, window[i], stride[i]);
+    pad_low[i] = padding[i].first;
   }
   auto result = MakeUnique<std::vector<float>>(window_counts[0]);
 
@@ -269,6 +281,51 @@ ReferenceUtil::ReduceWindow1DAdd(
   return result;
 }
 
+/* static  */ std::unique_ptr<Array3D<float>> ReferenceUtil::ReduceWindow3DAdd(
+    const Array3D<float>& operand, float init,
+    const tensorflow::gtl::ArraySlice<int64>& window,
+    const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding) {
+  std::vector<int64> dim_lengths{operand.n1(), operand.n2(), operand.n3()};
+  auto padding_both = xla::MakePadding(dim_lengths, window, stride, padding);
+
+  std::vector<int64> window_counts(window.size(), 0);
+  std::vector<int64> pad_low(window.size(), 0);
+  for (int64 i = 0; i < window.size(); ++i) {
+    window_counts[i] =
+        WindowCount(dim_lengths[i], window[i], stride[i], padding);
+    pad_low[i] = padding_both[i].first;
+  }
+  auto result = MakeUnique<Array3D<float>>(window_counts[0], window_counts[1],
+                                           window_counts[2]);
+
+  for (int64 i0 = 0; i0 < window_counts[0]; ++i0) {
+    for (int64 i1 = 0; i1 < window_counts[1]; ++i1) {
+      for (int64 i2 = 0; i2 < window_counts[2]; ++i2) {
+        int64 i0_base = i0 * stride[0] - pad_low[0];
+        int64 i1_base = i1 * stride[1] - pad_low[1];
+        int64 i2_base = i2 * stride[2] - pad_low[2];
+
+        float val = init;
+        for (int64 i0_win = 0; i0_win < window[0]; ++i0_win) {
+          for (int64 i1_win = 0; i1_win < window[1]; ++i1_win) {
+            for (int64 i2_win = 0; i2_win < window[2]; ++i2_win) {
+              if (i0_base + i0_win >= 0 && i1_base + i1_win >= 0 &&
+                  i2_base + i2_win >= 0 && i0_base + i0_win < operand.n1() &&
+                  i1_base + i1_win < operand.n2() &&
+                  i2_base + i2_win < operand.n3()) {
+                val += operand(i0_base + i0_win, i1_base + i1_win,
+                               i2_base + i2_win);
+              }
+            }
+          }
+        }
+        (*result)(i0, i1, i2) = val;
+      }
+    }
+  }
+  return result;
+}
+
 /* static */ std::unique_ptr<Array4D<float>>
 ReferenceUtil::ReduceWindow4DGeneric(
     const Array4D<float>& operand, float init,
@@ -520,7 +577,7 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated(
 
   HloEvaluator evaluator;
   std::unique_ptr<Literal> result_literal =
-      evaluator.Evaluate(*computation, {}).ConsumeValueOrDie();
+      evaluator.Evaluate<const Literal*>(*computation, {}).ConsumeValueOrDie();
 
   CHECK_EQ(ShapeUtil::Rank(result_literal->shape()), 4);
   auto result =
@@ -594,8 +651,12 @@ ReferenceUtil::ReduceToRowArray2D(
                    i2 == 0 || (dim_set.count(2) && i2 < array.n3()); ++i2) {
                 for (int64 i3 = 0;
                      i3 == 0 || (dim_set.count(3) && i3 < array.n4()); ++i3) {
-                  accumulator = reduce_function(
-                      accumulator, array(a0 + i0, a1 + i1, a2 + i2, a3 + i3));
+                  // Handle zero-sized arrays.
+                  if (array.n1() > 0 && array.n2() > 0 && array.n3() > 0 &&
+                      array.n4() > 0) {
+                    accumulator = reduce_function(
+                        accumulator, array(a0 + i0, a1 + i1, a2 + i2, a3 + i3));
+                  }
                 }
               }
             }
diff --git a/tensorflow/compiler/xla/reference_util.h b/tensorflow/compiler/xla/reference_util.h
index 62d455d71a70407e903a1e0be42a7e9f1898e523..3ec96f2f38b8f91e1549419b60481327fa9bbd5f 100644
--- a/tensorflow/compiler/xla/reference_util.h
+++ b/tensorflow/compiler/xla/reference_util.h
@@ -70,7 +70,7 @@ class ReferenceUtil {
   // dilation factors.
   static std::unique_ptr<Array4D<float>> ConvArray4DGeneralDimensionsDilated(
       const Array4D<float>& lhs, const Array4D<float>& rhs,
-      std::pair<int64, int64> stride, Padding padding,
+      std::pair<int64, int64> kernel_stride, Padding padding,
       std::pair<int64, int64> lhs_dilation,
       std::pair<int64, int64> rhs_dilation, ConvolutionDimensionNumbers dnums);
 
@@ -173,6 +173,10 @@ class ReferenceUtil {
       const Array2D<float>& operand, float init,
       const tensorflow::gtl::ArraySlice<int64>& window,
       const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding);
+  static std::unique_ptr<Array3D<float>> ReduceWindow3DAdd(
+      const Array3D<float>& operand, float init,
+      const tensorflow::gtl::ArraySlice<int64>& window,
+      const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding);
   static std::unique_ptr<Array4D<float>> ReduceWindow4DAdd(
       const Array4D<float>& operand, float init,
       const tensorflow::gtl::ArraySlice<int64>& window,
@@ -184,11 +188,18 @@ class ReferenceUtil {
       const std::function<float(float, float)>& reduce_func,
       const tensorflow::gtl::ArraySlice<int64>& window,
       const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding);
+  static std::unique_ptr<std::vector<float>> ReduceWindow1DGeneric(
+      const tensorflow::gtl::ArraySlice<float>& operand, float init,
+      const std::function<float(float, float)>& reduce_func,
+      const tensorflow::gtl::ArraySlice<int64>& window,
+      const tensorflow::gtl::ArraySlice<int64>& stride,
+      const tensorflow::gtl::ArraySlice<std::pair<int64, int64>>& padding);
   static std::unique_ptr<Array4D<float>> ReduceWindow4DGeneric(
       const Array4D<float>& operand, float init,
       const std::function<float(float, float)>& reduce_func,
       const tensorflow::gtl::ArraySlice<int64>& window,
       const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding);
+  // With arbitrary padding.
   static std::unique_ptr<Array4D<float>> ReduceWindow4DGeneric(
       const Array4D<float>& operand, float init,
       const std::function<float(float, float)>& reduce_func,
diff --git a/tensorflow/compiler/xla/reference_util_test.cc b/tensorflow/compiler/xla/reference_util_test.cc
index 846ccdc83df900e3afedb6ababe07ebb1bd68f41..9da9bc60a2025e63b57a3be9ed360d150f88d73c 100644
--- a/tensorflow/compiler/xla/reference_util_test.cc
+++ b/tensorflow/compiler/xla/reference_util_test.cc
@@ -86,6 +86,13 @@ TEST_F(ReferenceUtilTest, ReduceToRowArray2D) {
                                        ErrorSpec(0.0001));
 }
 
+TEST_F(ReferenceUtilTest, Reduce4Dto1DZeroSizedArray) {
+  auto result = Literal::CreateR1<float>(ReferenceUtil::Reduce4DTo1D(
+      Array4D<float>(1, 0, 1, 1), /*init=*/0, /*dims=*/{0, 1, 2},
+      [](float a, float b) { return a + b; }));
+  LiteralTestUtil::ExpectR1Equal<float>({0}, *result);
+}
+
 TEST_F(ReferenceUtilTest, MapArray2D) {
   auto identity = [](float value) { return log(exp(value)); };
   auto result = ReferenceUtil::MapArray2D(*matrix_, identity);
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index d3175c1e49974b060cc495d463d4995c925abcf7..83c67ed9368bc617a90c528f200b566ee8754edd 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -29,6 +29,11 @@ xla_proto_library(
     deps = ["//tensorflow/compiler/xla:xla_data_proto"],
 )
 
+xla_proto_library(
+    name = "hlo_profile_printer_data",
+    srcs = ["hlo_profile_printer_data.proto"],
+)
+
 # Filegroup used to collect source files for dependency checking.
 filegroup(
     name = "c_srcs",
@@ -38,6 +43,81 @@ filegroup(
     ]),
 )
 
+cc_library(
+    name = "bfloat16_support",
+    srcs = ["bfloat16_support.cc"],
+    hdrs = ["bfloat16_support.h"],
+    deps = [
+        ":hlo",
+    ],
+)
+
+cc_library(
+    name = "bfloat16_conversion_folding",
+    srcs = ["bfloat16_conversion_folding.cc"],
+    hdrs = ["bfloat16_conversion_folding.h"],
+    deps = [
+        ":bfloat16_support",
+        ":hlo",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "bfloat16_conversion_folding_test",
+    srcs = ["bfloat16_conversion_folding_test.cc"],
+    deps = [
+        ":bfloat16_conversion_folding",
+        ":bfloat16_support",
+        ":hlo",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "bfloat16_normalization",
+    srcs = ["bfloat16_normalization.cc"],
+    hdrs = ["bfloat16_normalization.h"],
+    deps = [
+        ":bfloat16_support",
+        ":hlo",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "bfloat16_normalization_test",
+    srcs = ["bfloat16_normalization_test.cc"],
+    deps = [
+        ":bfloat16_normalization",
+        ":bfloat16_support",
+        ":hlo",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "shape_inference",
     srcs = ["shape_inference.cc"],
@@ -108,6 +188,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
@@ -115,6 +196,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/service:hlo_element_type_converter",
         "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
@@ -450,8 +532,10 @@ cc_library(
         ":hlo_evaluator",
         ":hlo_execution_profile",
         ":hlo_module_config",
+        ":hlo_proto_util",
         ":platform_util",
         ":session_proto",
+        ":source_map_util",
         ":transfer_manager",
         ":user_computation",
         ":versioned_computation_handle",
@@ -500,6 +584,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:executable_build_options",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
     ],
@@ -903,6 +988,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
 
@@ -1009,9 +1095,9 @@ tf_cc_test(
 )
 
 cc_library(
-    name = "batchnorm_rewriter",
-    srcs = ["batchnorm_rewriter.cc"],
-    hdrs = ["batchnorm_rewriter.h"],
+    name = "batchnorm_expander",
+    srcs = ["batchnorm_expander.cc"],
+    hdrs = ["batchnorm_expander.h"],
     deps = [
         ":hlo",
         ":hlo_pass",
@@ -1029,11 +1115,11 @@ cc_library(
 )
 
 tf_cc_test(
-    name = "batchnorm_rewriter_test",
+    name = "batchnorm_expander_test",
     size = "small",
-    srcs = ["batchnorm_rewriter_test.cc"],
+    srcs = ["batchnorm_expander_test.cc"],
     deps = [
-        ":batchnorm_rewriter",
+        ":batchnorm_expander",
         ":hlo",
         ":hlo_matchers",
         ":hlo_pass",
@@ -1082,6 +1168,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
@@ -1143,6 +1230,49 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "implicit_broadcast_remover",
+    srcs = ["implicit_broadcast_remover.cc"],
+    hdrs = ["implicit_broadcast_remover.h"],
+    deps = [
+        ":hlo",
+        ":hlo_dce",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "implicit_broadcast_remover_test",
+    srcs = ["implicit_broadcast_remover_test.cc"],
+    deps = [
+        ":hlo_matchers",
+        ":implicit_broadcast_remover",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+    ],
+)
+
+cc_library(
+    name = "dot_decomposer",
+    srcs = ["dot_decomposer.cc"],
+    hdrs = ["dot_decomposer.h"],
+    deps = [
+        ":hlo",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "tuple_simplifier",
     srcs = ["tuple_simplifier.cc"],
@@ -1663,6 +1793,7 @@ tf_cc_test(
         ":hlo",
         ":hlo_graph_dumper",
         ":hlo_matchers",
+        ":hlo_runner",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
@@ -1670,7 +1801,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
 )
@@ -1703,6 +1833,22 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "hlo_verifier_test",
+    srcs = ["hlo_verifier_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_verifier",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "hlo_rematerialization",
     srcs = ["hlo_rematerialization.cc"],
@@ -1781,7 +1927,9 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
+        "//tensorflow/core:test",
     ],
 )
 
@@ -1812,6 +1960,7 @@ cc_library(
         ":hlo",
         ":hlo_graph_dumper",
         ":hlo_pass",
+        ":hlo_proto_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
@@ -1889,6 +2038,32 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "hlo_element_type_converter",
+    srcs = ["hlo_element_type_converter.cc"],
+    hdrs = ["hlo_element_type_converter.h"],
+    deps = [
+        ":hlo",
+        ":hlo_evaluator",
+        ":hlo_pass",
+        ":hlo_query",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "hlo_element_type_converter_test",
+    srcs = ["hlo_element_type_converter_test.cc"],
+    deps = [
+        ":hlo_element_type_converter",
+        ":hlo_matchers",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+    ],
+)
+
 cc_library(
     name = "device_memory_allocator",
     srcs = ["device_memory_allocator.cc"],
@@ -2021,6 +2196,7 @@ cc_library(
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:regexp_internal",
     ],
     alwayslink = 1,
@@ -2074,6 +2250,41 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "zero_sized_hlo_elimination",
+    srcs = ["zero_sized_hlo_elimination.cc"],
+    hdrs = ["zero_sized_hlo_elimination.h"],
+    deps = [
+        ":hlo",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "zero_sized_hlo_elimination_test",
+    srcs = ["zero_sized_hlo_elimination_test.cc"],
+    deps = [
+        ":hlo",
+        ":shape_inference",
+        ":zero_sized_hlo_elimination",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "pool",
     hdrs = ["pool.h"],
@@ -2165,11 +2376,96 @@ cc_library(
     srcs = ["hlo_profile_printer.cc"],
     hdrs = ["hlo_profile_printer.h"],
     deps = [
+        ":hlo_profile_printer_data",
         ":human_readable_profile_builder",
         "//tensorflow/compiler/xla:types",
     ],
 )
 
+cc_library(
+    name = "tuple_util",
+    srcs = ["tuple_util.cc"],
+    hdrs = ["tuple_util.h"],
+    deps = [
+        ":hlo",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "tuple_util_test",
+    srcs = ["tuple_util_test.cc"],
+    deps = [
+        ":tuple_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
+    ],
+)
+
+cc_library(
+    name = "while_util",
+    srcs = ["while_util.cc"],
+    hdrs = ["while_util.h"],
+    deps = [
+        ":call_inliner",
+        ":hlo",
+        ":tuple_util",
+    ],
+)
+
+tf_cc_test(
+    name = "while_util_test",
+    srcs = ["while_util_test.cc"],
+    deps = [
+        ":while_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
+    ],
+)
+
+cc_library(
+    name = "while_loop_invariant_code_motion",
+    srcs = ["while_loop_invariant_code_motion.cc"],
+    hdrs = ["while_loop_invariant_code_motion.h"],
+    deps = [
+        ":hlo",
+        ":hlo_pass",
+        ":tuple_util",
+        ":while_util",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "while_loop_invariant_code_motion_test",
+    srcs = ["while_loop_invariant_code_motion_test.cc"],
+    deps = [
+        ":hlo_matchers",
+        ":while_loop_invariant_code_motion",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/core:test",
+    ],
+)
+
+cc_library(
+    name = "source_map_util",
+    srcs = ["source_map_util.cc"],
+    hdrs = ["source_map_util.h"],
+    deps = [
+        ":executable",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 
 filegroup(
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 71491218aa221cb26ea45f288ddc47173a15df3f..fb857559f972a220a19b108baa4c441e09b90e1f 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -193,6 +193,33 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
         enable_dot_strength_reduction_(enable_dot_strength_reduction),
         enable_conv_simplification_(enable_conv_simplification) {}
 
+  // Transforms Dots where at least one input is a vector or has a degenerate
+  // dimension and converts it into a multiply and reduce. This should enable
+  // more fusion than leaving the nodes as Dot operations.
+  StatusOr<bool> HandleDotStrengthReduction(HloInstruction* dot);
+
+  // Reshapes an instruction to rank 1 if it is not already rank 1.
+  HloInstruction* Flatten(HloInstruction* hlo) {
+    if (ShapeUtil::Rank(hlo->shape()) == 1) {
+      return hlo;
+    }
+    return computation_->AddInstruction(HloInstruction::CreateReshape(
+        ShapeUtil::MakeShape(hlo->shape().element_type(),
+                             {ShapeUtil::ElementsIn(hlo->shape())}),
+        hlo));
+  }
+
+  // Helper method to perform and add reduction in a single dimension.
+  HloInstruction* AddReduce(HloInstruction* hlo, int64 dim) {
+    HloInstruction* zero = computation_->AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
+    HloComputation* AddReduce_computation = CreateScalarBinaryComputation(
+        computation_->parent(), F32, HloOpcode::kAdd);
+    Shape shape = ShapeUtil::DeleteDimension(dim, hlo->shape());
+    return computation_->AddInstruction(HloInstruction::CreateReduce(
+        shape, hlo, zero, {dim}, AddReduce_computation));
+  }
+
   // Convenience method for replacing an instruction with a bitcast.
   void ReplaceWithBitcast(HloInstruction* instruction);
 
@@ -252,6 +279,11 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  StatusOr<HloInstruction*> OptimizeDotOfConcat(HloInstruction* dot);
+  StatusOr<HloInstruction*> OptimizeDotOfConcatHelper(
+      const Shape& dot_shape, HloInstruction* lhs, int64 lhs_contracting_dim,
+      HloInstruction* rhs, int64 rhs_contracting_dim, bool swapped);
+
   // Current HloComputation instance the AlgebraicSimplifierVisitor is
   // traversing.
   HloComputation* computation_;
@@ -329,6 +361,39 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) {
     return Status::OK();
   }
 
+  // Canonicalization: Put constants on the right.  This makes the reassociation
+  // rules below simpler.
+  VLOG(10) << "trying transform [Const + A => A + Const]";
+  if (lhs->IsConstant() && !rhs->IsConstant()) {
+    return ReplaceWithNewInstruction(
+        add,
+        HloInstruction::CreateBinary(add->shape(), HloOpcode::kAdd, rhs, lhs));
+  }
+
+  // Reassociate to allow constant folding.
+  //
+  // Note: This is not general.  For example, we won't reassociate
+  //
+  //   (A + C1) + (B + C2) =>  A + B + (C1 + C2).
+  //
+  VLOG(10) << "trying transform [(A + C1) + C2 => A + (C1 + C2)]";
+  if (rhs->IsConstant() && lhs->opcode() == HloOpcode::kAdd &&
+      !lhs->operand(0)->IsConstant() && lhs->operand(1)->IsConstant()) {
+    auto* c1 = lhs->mutable_operand(1);
+    auto* c2 = rhs;
+    TF_ASSIGN_OR_RETURN(
+        Shape sum_of_constants_shape,
+        ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, c1, c2));
+
+    auto* sum_of_constants =
+        computation_->AddInstruction(HloInstruction::CreateBinary(
+            sum_of_constants_shape, HloOpcode::kAdd, c1, c2));
+    return ReplaceWithNewInstruction(
+        add, HloInstruction::CreateBinary(add->shape(), HloOpcode::kAdd,
+                                          lhs->mutable_operand(0),
+                                          sum_of_constants));
+  }
+
   return Status::OK();
 }
 
@@ -433,13 +498,14 @@ static HloInstruction* BuildTupleConstant(HloComputation* computation,
   if (ShapeUtil::IsTuple(literal.shape())) {
     std::vector<HloInstruction*> elems;
     elems.reserve(ShapeUtil::TupleElementCount(literal.shape()));
-    for (const Literal& child : literal.tuple_literals()) {
-      elems.push_back(BuildTupleConstant(computation, child));
+    for (int i = 0; i < ShapeUtil::TupleElementCount(literal.shape()); ++i) {
+      elems.push_back(
+          BuildTupleConstant(computation, LiteralView::Create(literal, {i})));
     }
     return computation->AddInstruction(HloInstruction::CreateTuple(elems));
   } else {
     return computation->AddInstruction(
-        HloInstruction::CreateConstant(MakeUnique<Literal>(literal)));
+        HloInstruction::CreateConstant(literal.CloneToUnique()));
   }
 }
 
@@ -462,6 +528,16 @@ Status AlgebraicSimplifierVisitor::HandleSubtract(HloInstruction* sub) {
     return Status::OK();
   }
 
+  // Canonicalize subtraction of a constant to addition.
+  VLOG(10) << "trying transform [A - Const => A + (-Const)]";
+  if (rhs->IsConstant() && !lhs->IsConstant()) {
+    HloInstruction* negative_const = computation_->AddInstruction(
+        HloInstruction::CreateUnary(rhs->shape(), HloOpcode::kNegate, rhs));
+    return ReplaceWithNewInstruction(
+        sub, HloInstruction::CreateBinary(sub->shape(), HloOpcode::kAdd, lhs,
+                                          negative_const));
+  }
+
   return Status::OK();
 }
 
@@ -523,6 +599,23 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
     return Status::OK();
   }
 
+  // A / Const => A * (1 / Const)
+  //
+  // (Backends can do this transformation, but generally only if the constant is
+  // a scalar.)
+  if (lhs->opcode() != HloOpcode::kConstant &&
+      rhs->opcode() == HloOpcode::kConstant) {
+    HloInstruction* one =
+        computation_->AddInstruction(HloInstruction::CreateConstant(
+            Literal::One(lhs->shape().element_type()).CloneToUnique()));
+    HloInstruction* inverse =
+        computation_->AddInstruction(HloInstruction::CreateBinary(
+            rhs->shape(), HloOpcode::kDivide, one, rhs));
+    return ReplaceWithNewInstruction(
+        divide, HloInstruction::CreateBinary(
+                    divide->shape(), HloOpcode::kMultiply, lhs, inverse));
+  }
+
   // (A / B) / (C / D)  =>  (A / B)*(D / C) => (A * D) / (B * C)
   if (lhs->opcode() == HloOpcode::kDivide &&
       rhs->opcode() == HloOpcode::kDivide) {
@@ -574,70 +667,72 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
   return Status::OK();
 }
 
-Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
-  auto lhs = dot->mutable_operand(0);
-  auto rhs = dot->mutable_operand(1);
+StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
+    HloInstruction* dot) {
+  HloInstruction* lhs = dot->mutable_operand(0);
+  HloInstruction* rhs = dot->mutable_operand(1);
+  int64 lhs_collapsing_dim =
+      dot->dot_dimension_numbers().lhs_contracting_dimensions(0);
+  if (lhs->IsRank2Transpose()) {
+    lhs = lhs->mutable_operand(0);
+    lhs_collapsing_dim = 1 - lhs_collapsing_dim;
+  }
+  const int64 lhs_kept_dim = 1 - lhs_collapsing_dim;
+
+  int64 rhs_collapsing_dim =
+      dot->dot_dimension_numbers().rhs_contracting_dimensions(0);
+  if (rhs->IsRank2Transpose()) {
+    rhs = rhs->mutable_operand(0);
+    rhs_collapsing_dim = 1 - rhs_collapsing_dim;
+  }
+  const int64 rhs_kept_dim = 1 - rhs_collapsing_dim;
+
+  auto reshape_if_necessary = [&](HloInstruction* hlo) {
+    if (ShapeUtil::SameDimensions(hlo->shape(), dot->shape())) {
+      return hlo;
+    }
+    return computation_->AddInstruction(
+        HloInstruction::CreateReshape(dot->shape(), hlo));
+  };
 
-  // Only optimize F32 dot operations where the dot, rhs and lhs are rank 2 or
-  // below.
-  if (dot->shape().element_type() != F32 || ShapeUtil::Rank(lhs->shape()) > 2 ||
-      ShapeUtil::Rank(rhs->shape()) > 2 || ShapeUtil::Rank(dot->shape()) > 2) {
-    return Status::OK();
-  }
+  auto broadcast_to_dim = [&](HloInstruction* hlo, const Shape& shape,
+                              int64 dim) {
+    return computation_->AddInstruction(
+        HloInstruction::CreateBroadcast(shape, hlo, {dim}));
+  };
 
-  // Replace a zero element dot with a broadcast of the constant 0.
-  if (ShapeUtil::HasZeroElements(dot->shape()) ||
-      ShapeUtil::HasZeroElements(lhs->shape()) ||
-      ShapeUtil::HasZeroElements(rhs->shape())) {
-    auto zero = computation_->AddInstruction(
-        HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
-    return ReplaceWithNewInstruction(
-        dot, HloInstruction::CreateBroadcast(dot->shape(), zero, {}));
-  }
+  auto multiply = [&](HloInstruction* local_lhs, HloInstruction* local_rhs) {
+    return computation_->AddInstruction(HloInstruction::CreateBinary(
+        local_lhs->shape(), HloOpcode::kMultiply, local_lhs, local_rhs));
+  };
 
-  // Simplify dot(transpose(a), transpose(b)) to transpose(dot(b,a)).
-  if (lhs->IsRank2Transpose() && rhs->IsRank2Transpose()) {
-    auto new_dot = computation_->AddInstruction(HloInstruction::CreateBinary(
-        ShapeUtil::PermuteDimensions({1, 0}, dot->shape()), HloOpcode::kDot,
-        rhs->mutable_operand(0), lhs->mutable_operand(0)));
-    return ReplaceWithNewInstruction(
-        dot, HloInstruction::CreateTranspose(dot->shape(), new_dot, {1, 0}));
+  // Strength reduce dot(a[K] , b[K]) =
+  //  reshape(result.shape,
+  //          reduce_sum(multiply(a, b), {0}))
+  if (ShapeUtil::Rank(rhs->shape()) == 1 &&
+      ShapeUtil::Rank(lhs->shape()) == 1) {
+    TF_RETURN_IF_ERROR(
+        ReplaceInstruction(dot, reshape_if_necessary(AddReduce(
+                                    multiply(Flatten(lhs), Flatten(rhs)), 0))));
+    return true;
   }
 
-  if (!enable_dot_strength_reduction_) {
-    return Status::OK();
+  if (ShapeUtil::IsEffectiveScalar(rhs->shape()) &&
+      ShapeUtil::IsEffectiveScalar(lhs->shape())) {
+    TF_RETURN_IF_ERROR(ReplaceInstruction(
+        dot, reshape_if_necessary(multiply(Flatten(lhs), Flatten(rhs)))));
+    return true;
   }
 
   // Simplify outer product into multiply with implicit broadcasting.
   //
   // A dot(a[M, 1], b[1, N]) = multiply(a [M,1], b [1, N])
-  if (ShapeUtil::Rank(rhs->shape()) == 2 && rhs->shape().dimensions(0) == 1) {
-    return ReplaceWithNewInstruction(
-        dot, HloInstruction::CreateBinary(dot->shape(), HloOpcode::kMultiply,
-                                          lhs, rhs));
-  }
-
-  // The following graph transformations take Dots where at least one input is a
-  // vector or has a degenerate dimension and converts it into a multiply and
-  // reduce. This should enable more fusion than leaving the nodes as Dot
-  // operations.
-
-  // Strength reduce dot(a[K] , b[K]) =
-  //  reshape(result.shape,
-  //          reduce_sum(multiply(a, b), {0}))
-  if (ShapeUtil::Rank(rhs->shape()) == 1 &&
-      ShapeUtil::Rank(lhs->shape()) == 1) {
-    auto multiply = computation_->AddInstruction(HloInstruction::CreateBinary(
-        rhs->shape(), HloOpcode::kMultiply, lhs, rhs));
-    HloComputation* add_reduce_computation = CreateScalarBinaryComputation(
-        computation_->parent(), F32, HloOpcode::kAdd);
-    auto zero = computation_->AddInstruction(
-        HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
-    auto reduce = computation_->AddInstruction(HloInstruction::CreateReduce(
-        ShapeUtil::MakeShape(dot->shape().element_type(), {}), multiply, zero,
-        {0}, add_reduce_computation));
-    return ReplaceWithNewInstruction(
-        dot, HloInstruction::CreateReshape(dot->shape(), reduce));
+  if (ShapeUtil::Rank(rhs->shape()) == 2 &&
+      rhs->shape().dimensions(rhs_collapsing_dim) == 1) {
+    TF_RETURN_IF_ERROR(ReplaceInstruction(
+        dot, multiply(broadcast_to_dim(Flatten(lhs), dot->shape(), 0),
+                      broadcast_to_dim(Flatten(rhs), dot->shape(), 1))));
+    return true;
   }
 
   // Strength reduce dot(a[1, K], b) =
@@ -648,35 +743,21 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
   //      )
   //    )
   if (ShapeUtil::Rank(lhs->shape()) == 1 ||
-      (ShapeUtil::Rank(lhs->shape()) == 2 && lhs->shape().dimensions(0) == 1)) {
-    auto new_lhs = computation_->AddInstruction(HloInstruction::CreateReshape(
-        ShapeUtil::MakeShape(lhs->shape().element_type(),
-                             {ShapeUtil::ElementsIn(lhs->shape())}),
-        lhs));
-    HloComputation* add_reduce_computation = CreateScalarBinaryComputation(
-        computation_->parent(), F32, HloOpcode::kAdd);
-    auto zero = computation_->AddInstruction(
-        HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
-    HloInstruction* reduce;
+      (ShapeUtil::Rank(lhs->shape()) == 2 &&
+       lhs->shape().dimensions(lhs_kept_dim) == 1)) {
     if (ShapeUtil::Rank(rhs->shape()) == 1) {
-      auto multiply = computation_->AddInstruction(HloInstruction::CreateBinary(
-          rhs->shape(), HloOpcode::kMultiply, new_lhs, rhs));
-      reduce = computation_->AddInstruction(HloInstruction::CreateReduce(
-          ShapeUtil::MakeShape(dot->shape().element_type(), {}), multiply, zero,
-          {0}, add_reduce_computation));
-    } else {
-      new_lhs = computation_->AddInstruction(
-          HloInstruction::CreateBroadcast(rhs->shape(), new_lhs, {0}));
-      auto multiply = computation_->AddInstruction(HloInstruction::CreateBinary(
-          rhs->shape(), HloOpcode::kMultiply, new_lhs, rhs));
-
-      reduce = computation_->AddInstruction(HloInstruction::CreateReduce(
-          ShapeUtil::MakeShape(dot->shape().element_type(),
-                               {rhs->shape().dimensions(1)}),
-          multiply, zero, {0}, add_reduce_computation));
+      TF_RETURN_IF_ERROR(ReplaceInstruction(
+          dot,
+          reshape_if_necessary(AddReduce(multiply(Flatten(lhs), rhs), 0))));
+      return true;
     }
-    return ReplaceWithNewInstruction(
-        dot, HloInstruction::CreateReshape(dot->shape(), reduce));
+    TF_RETURN_IF_ERROR(ReplaceInstruction(
+        dot, reshape_if_necessary(
+                 AddReduce(multiply(broadcast_to_dim(Flatten(lhs), rhs->shape(),
+                                                     rhs_collapsing_dim),
+                                    rhs),
+                           rhs_collapsing_dim))));
+    return true;
   }
 
   // Strength reduce dot(a, b[K, 1]) =
@@ -684,26 +765,208 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
   //    reduce_sum(multiply(a, broadcast(reshape([K],b), {1})), {0})
   //  )
   if (ShapeUtil::Rank(rhs->shape()) == 1 ||
-      (ShapeUtil::Rank(rhs->shape()) == 2 && rhs->shape().dimensions(1) == 1)) {
-    auto new_rhs = computation_->AddInstruction(HloInstruction::CreateReshape(
-        ShapeUtil::MakeShape(rhs->shape().element_type(),
-                             {ShapeUtil::ElementsIn(rhs->shape())}),
-        rhs));
-    new_rhs = computation_->AddInstruction(
-        HloInstruction::CreateBroadcast(lhs->shape(), new_rhs, {1}));
-    auto multiply = computation_->AddInstruction(HloInstruction::CreateBinary(
-        lhs->shape(), HloOpcode::kMultiply, lhs, new_rhs));
-    HloComputation* add_reduce_computation = CreateScalarBinaryComputation(
-        computation_->parent(), F32, HloOpcode::kAdd);
+      (ShapeUtil::Rank(rhs->shape()) == 2 &&
+       rhs->shape().dimensions(rhs_kept_dim) == 1)) {
+    TF_RETURN_IF_ERROR(ReplaceInstruction(
+        dot, reshape_if_necessary(AddReduce(
+                 multiply(lhs, broadcast_to_dim(Flatten(rhs), lhs->shape(),
+                                                lhs_collapsing_dim)),
+                 lhs_collapsing_dim))));
+    return true;
+  }
+  return false;
+}
+
+StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfConcat(
+    HloInstruction* dot) {
+  const DotDimensionNumbers& dnums = dot->dot_dimension_numbers();
+  if (dnums.lhs_contracting_dimensions_size() != 1 ||
+      dnums.lhs_batch_dimensions_size() != 0) {
+    return nullptr;
+  }
+
+  const int64 lhs_contracting_dim = dnums.lhs_contracting_dimensions(0);
+  const int64 rhs_contracting_dim = dnums.rhs_contracting_dimensions(0);
+  HloInstruction* lhs = dot->mutable_operand(0);
+  HloInstruction* rhs = dot->mutable_operand(1);
+
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * optimized_lhs_concat,
+      OptimizeDotOfConcatHelper(dot->shape(), lhs, lhs_contracting_dim, rhs,
+                                rhs_contracting_dim, /*swapped=*/false));
+  if (optimized_lhs_concat) {
+    return optimized_lhs_concat;
+  }
+
+  return OptimizeDotOfConcatHelper(dot->shape(), rhs, rhs_contracting_dim, lhs,
+                                   lhs_contracting_dim, /*swapped=*/true);
+}
+
+StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfConcatHelper(
+    const Shape& dot_shape, HloInstruction* lhs, int64 lhs_contracting_dim,
+    HloInstruction* rhs, int64 rhs_contracting_dim, bool swapped) {
+  bool can_optimize = lhs->opcode() == HloOpcode::kConcatenate &&
+                      lhs->concatenate_dimension() == lhs_contracting_dim &&
+                      rhs->opcode() == HloOpcode::kConstant;
+  if (!can_optimize) {
+    return nullptr;
+  }
+
+  // We're replacing this:
+  //
+  //   +-----+-----+-----+      +-------------------+
+  //   |     |     |     |      |                   |
+  //   |     |     |     |      |        R_0        |
+  //   |     |     |     |      |                   |
+  //   |     |     |     |      +-------------------+
+  //   |     |     |     |      |                   |
+  //   | L_0 | L_1 | L_2 |   *  |        R_1        |
+  //   |     |     |     |      |                   |
+  //   |     |     |     |      +-------------------+
+  //   |     |     |     |      |                   |
+  //   |     |     |     |      |        R_2        |
+  //   |     |     |     |      |                   |
+  //   +-----+-----+-----+      +-------------------+
+  //
+  // with this:
+  //
+  // [Sum over i]
+  //
+  //   +-----+     +-------------------+
+  //   |     |     |                   |
+  //   |     |  *  |        R_i        |
+  //   |     |     |                   |
+  //   |     |     +-------------------+
+  //   |     |
+  //   | L_i |
+  //   |     |
+  //   |     |
+  //   |     |
+  //   |     |
+  //   |     |
+  //   +-----+
+  //
+  // where the LHS is a concatenate operation (so we can "split" the LHS tensor
+  // for free) and the RHS is a constant tensor (and thus can be split at
+  // compile time).  In the future, we may also want to do this when both the
+  // LHS and the RHS are concatenate operations that line up along the dimension
+  // being contracted over.
+  //
+  // We should be able to generalize this transform to work on a non-constant
+  // RHS when/if we have in-place slices or support input-fusing slices into
+  // Dots.
+
+  // Dimension numbers for the new dot instructions we'll create (L_i * R_i in
+  // the diagram above).
+  DotDimensionNumbers new_dot_dnums;
+  new_dot_dnums.add_lhs_contracting_dimensions(swapped ? rhs_contracting_dim
+                                                       : lhs_contracting_dim);
+  new_dot_dnums.add_rhs_contracting_dimensions(swapped ? lhs_contracting_dim
+                                                       : rhs_contracting_dim);
+
+  // Here we use the MKN notation, where the contracted dimension has K
+  // elements and the two non-contracted dimensions have M and N elements.
+  HloInstruction* add_result = nullptr;
+  int64 rhs_contracting_dim_offset = 0;
+  int64 n = rhs->shape().dimensions(1 - rhs_contracting_dim);
+  for (HloInstruction* concat_op : lhs->operands()) {
+    int64 sub_k = concat_op->shape().dimensions(lhs_contracting_dim);
+    Shape rhs_slice_shape(rhs->shape());
+    rhs_slice_shape.set_dimensions(rhs_contracting_dim, sub_k);
+
+    std::array<int64, 2> start_indices;
+    start_indices[rhs_contracting_dim] = rhs_contracting_dim_offset;
+    start_indices[1 - rhs_contracting_dim] = 0;
+
+    std::array<int64, 2> limit_indices;
+    limit_indices[rhs_contracting_dim] = rhs_contracting_dim_offset + sub_k;
+    limit_indices[1 - rhs_contracting_dim] = n;
+
+    HloInstruction* rhs_slice =
+        computation_->AddInstruction(HloInstruction::CreateSlice(
+            rhs_slice_shape, rhs, /*start_indices=*/start_indices,
+            /*limit_indices=*/limit_indices, /*strides=*/{1, 1}));
+
+    // TODO(b/69062148): We can get rid of `swapped` once all backends support
+    // "non-canonical" contraction dimensions (that contracts dimension 1 of the
+    // LHS with dimension 0 of the RHS).  But for now we keep the same
+    // contraction dimensions as the incoming dot operation to ensure the new
+    // dot operations can be lowered.
+    HloInstruction *new_dot_lhs, *new_dot_rhs;
+    if (swapped) {
+      new_dot_lhs = rhs_slice;
+      new_dot_rhs = concat_op;
+    } else {
+      new_dot_lhs = concat_op;
+      new_dot_rhs = rhs_slice;
+    }
+
+    auto* new_dot = computation_->AddInstruction(HloInstruction::CreateDot(
+        dot_shape, new_dot_lhs, new_dot_rhs, new_dot_dnums));
+
+    if (add_result) {
+      add_result = computation_->AddInstruction(HloInstruction::CreateBinary(
+          dot_shape, HloOpcode::kAdd, add_result, new_dot));
+    } else {
+      add_result = new_dot;
+    }
+
+    rhs_contracting_dim_offset += sub_k;
+  }
+
+  return add_result;
+}
+
+Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
+  auto lhs = dot->mutable_operand(0);
+  auto rhs = dot->mutable_operand(1);
+
+  // Only optimize F32 dot operations where the dot, rhs and lhs are rank 2 or
+  // below.
+  if (dot->shape().element_type() != F32 || ShapeUtil::Rank(lhs->shape()) > 2 ||
+      ShapeUtil::Rank(rhs->shape()) > 2 || ShapeUtil::Rank(dot->shape()) > 2) {
+    return Status::OK();
+  }
+
+  // Replace a zero element dot with a broadcast of the constant 0.
+  if (ShapeUtil::HasZeroElements(dot->shape()) ||
+      ShapeUtil::HasZeroElements(lhs->shape()) ||
+      ShapeUtil::HasZeroElements(rhs->shape())) {
     auto zero = computation_->AddInstruction(
         HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
-    auto reduce = computation_->AddInstruction(HloInstruction::CreateReduce(
-        ShapeUtil::MakeShape(dot->shape().element_type(),
-                             {lhs->shape().dimensions(0)}),
-        multiply, zero, {1}, add_reduce_computation));
     return ReplaceWithNewInstruction(
-        dot, HloInstruction::CreateReshape(dot->shape(), reduce));
+        dot, HloInstruction::CreateBroadcast(dot->shape(), zero, {}));
+  }
+
+  TF_ASSIGN_OR_RETURN(HloInstruction * dot_of_concat_optimized,
+                      OptimizeDotOfConcat(dot));
+  if (dot_of_concat_optimized) {
+    VLOG(10) << "Replaced dot(concat(...), constant) with add(dot(..., "
+                "constant)...)";
+    return ReplaceInstruction(dot, dot_of_concat_optimized);
+  }
+
+  if (enable_dot_strength_reduction_ && !is_layout_sensitive_) {
+    TF_ASSIGN_OR_RETURN(bool did_strength_reduction,
+                        HandleDotStrengthReduction(dot));
+    if (did_strength_reduction) {
+      return Status::OK();
+    }
+  }
+
+  // Simplify dot(transpose(a), transpose(b)) to transpose(dot(b,a)).
+  if (lhs->IsRank2Transpose() && rhs->IsRank2Transpose()) {
+    DotDimensionNumbers dot_dimension_numbers;
+    dot_dimension_numbers.add_lhs_contracting_dimensions(1);
+    dot_dimension_numbers.add_rhs_contracting_dimensions(0);
+    auto new_dot = computation_->AddInstruction(HloInstruction::CreateDot(
+        ShapeUtil::PermuteDimensions({1, 0}, dot->shape()),
+        rhs->mutable_operand(0), lhs->mutable_operand(0),
+        dot_dimension_numbers));
+    return ReplaceWithNewInstruction(
+        dot, HloInstruction::CreateTranspose(dot->shape(), new_dot, {1, 0}));
   }
+
   return Status::OK();
 }
 
@@ -980,6 +1243,11 @@ Status AlgebraicSimplifierVisitor::HandleImag(HloInstruction* imag) {
 }
 
 Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) {
+  if (ShapeUtil::HasZeroElements(pad->operand(0)->shape())) {
+    return ReplaceWithNewInstruction(
+        pad, HloInstruction::CreateBroadcast(pad->shape(),
+                                             pad->mutable_operand(1), {}));
+  }
   // Eliminate nop pads (padding all zero), and replace a pad with negative
   // padding with a pad with non-negative padding followed by a slice.
   bool all_zero = true;
@@ -1120,6 +1388,27 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power) {
         power, HloInstruction::CreateBinary(power->shape(), HloOpcode::kDivide,
                                             broadcast_one, lhs));
   }
+
+  VLOG(10) << "trying transform [pow(pow(A, X), Y) => pow(A, X*Y)]: "
+           << power->ToString();
+
+  // Don't perform this optimization if either of the exponents is complex; this
+  // identity is true only for real-valued exponents.  In addition, we cowardly
+  // refuse to do this transformation if the two expontents have different
+  // element types.
+  if (lhs->opcode() == HloOpcode::kPower &&
+      !ShapeUtil::ElementIsComplex(lhs->operand(1)->shape()) &&
+      !ShapeUtil::ElementIsComplex(rhs->shape()) &&
+      ShapeUtil::SameElementType(lhs->operand(1)->shape(), rhs->shape())) {
+    auto exponent_product =
+        computation_->AddInstruction(HloInstruction::CreateBinary(
+            rhs->shape(), HloOpcode::kMultiply, lhs->mutable_operand(1), rhs));
+    return ReplaceWithNewInstruction(
+        power, HloInstruction::CreateBinary(power->shape(), HloOpcode::kPower,
+                                            lhs->mutable_operand(0),
+                                            exponent_product));
+  }
+
   return Status::OK();
 }
 
@@ -1173,7 +1462,7 @@ StatusOr<bool> AlgebraicSimplifierVisitor::
         ShapeUtil::MakeShapeWithLayout(
             user->shape().element_type(),
             AsInt64Slice(operand->shape().dimensions()),
-            AsInt64Slice(operand->shape().layout().minor_to_major())),
+            LayoutUtil::MinorToMajor(operand->shape())),
         new_user_operands));
     VLOG(4) << "  new user: " << new_user->ToString();
     HloInstruction* new_reshape_or_broadcast = nullptr;
@@ -1183,8 +1472,7 @@ StatusOr<bool> AlgebraicSimplifierVisitor::
               ShapeUtil::MakeShapeWithLayout(
                   user->shape().element_type(),
                   AsInt64Slice(reshape_or_broadcast->shape().dimensions()),
-                  AsInt64Slice(
-                      reshape_or_broadcast->shape().layout().minor_to_major())),
+                  LayoutUtil::MinorToMajor(reshape_or_broadcast->shape())),
               new_user));
     } else {
       TF_RET_CHECK(reshape_or_broadcast->opcode() == HloOpcode::kBroadcast);
@@ -1193,8 +1481,7 @@ StatusOr<bool> AlgebraicSimplifierVisitor::
               ShapeUtil::MakeShapeWithLayout(
                   user->shape().element_type(),
                   AsInt64Slice(reshape_or_broadcast->shape().dimensions()),
-                  AsInt64Slice(
-                      reshape_or_broadcast->shape().layout().minor_to_major())),
+                  LayoutUtil::MinorToMajor(reshape_or_broadcast->shape())),
               new_user, reshape_or_broadcast->dimensions()));
     }
     VLOG(4) << "  new reshape/broadcast: "
@@ -1331,9 +1618,12 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
         reduce,
         HloInstruction::CreateBroadcast(reduce->shape(), init_value, {}));
   }
+
   // A Transpose feeding a reduce can simply permute the reduction dimensions
-  // field.
-  if (arg->opcode() == HloOpcode::kTranspose) {
+  // field if the output of the reduce is a vector or scalar. Higher ranked
+  // result may require a transpose of the output.
+  if (ShapeUtil::Rank(reduce->shape()) <= 1 &&
+      arg->opcode() == HloOpcode::kTranspose) {
     auto transpose_dimensions = arg->dimensions();
     std::vector<int64> new_reduce_dimensions;
     for (auto dim : dimensions) {
@@ -1403,6 +1693,12 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
 
 Status AlgebraicSimplifierVisitor::HandleReduceWindow(
     HloInstruction* reduce_window) {
+  if (ShapeUtil::HasZeroElements(reduce_window->operand(0)->shape())) {
+    return ReplaceWithNewInstruction(
+        reduce_window,
+        HloInstruction::CreateBroadcast(reduce_window->shape(),
+                                        reduce_window->mutable_operand(1), {}));
+  }
   auto operand = reduce_window->mutable_operand(0);
   const Window& window = reduce_window->window();
   auto function = reduce_window->to_apply();
@@ -1448,6 +1744,63 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
     }
   }
 
+  // If the pad puts a single non-identity value in each window that we're
+  // reducing, then this is a broadcast.
+  HloInstruction* pad_operand = operand->mutable_operand(0);
+  auto is_effective_broadcast = [&] {
+    if (window_util::HasStride(window)) {
+      VLOG(10) << "Window has stride.";
+      return false;
+    }
+    if (!window_util::HasSymmetricPadding(pad_config)) {
+      VLOG(10) << "Window has uneven padding.";
+      return false;
+    }
+    for (int64 i = 0; i < pad_config.dimensions_size(); ++i) {
+      const auto& pad_dimension = pad_config.dimensions(i);
+      if ((pad_dimension.edge_padding_low() != 0 ||
+           pad_dimension.edge_padding_high() != 0) &&
+          pad_operand->shape().dimensions(i) != 1) {
+        VLOG(10) << "Found non-trivial dimension being padded: " << i;
+        return false;
+      }
+    }
+    VLOG(10) << "Found to be padding trivial dimensions only.";
+
+    for (int64 i = 0; i < window.dimensions_size(); ++i) {
+      const auto& pad_dimension = pad_config.dimensions(i);
+      const WindowDimension& window_dimension = window.dimensions(i);
+      bool dimension_has_padding = (pad_dimension.edge_padding_low() != 0 ||
+                                    pad_dimension.edge_padding_high() != 0);
+      if (dimension_has_padding &&
+          window_dimension.size() < pad_dimension.edge_padding_low() + 1) {
+        VLOG(10) << "Found window did not cover single unpadded element in "
+                    "dimension: "
+                 << i;
+        return false;
+      }
+      if (pad_operand->shape().dimensions(i) != 1 &&
+          window_dimension.size() != 1) {
+        VLOG(10) << "Found window covers more than one element in non-trivial "
+                    "dimension: "
+                 << i;
+        return false;
+      }
+    }
+    VLOG(10) << "Found window covers a single unpadded element.";
+    return true;
+  };
+  if (is_effective_broadcast()) {
+    VLOG(10) << "Replacing pad/reduce-window with (implicit) broadcast.";
+    auto fadd = [this](std::unique_ptr<HloInstruction> x) {
+      return computation_->AddInstruction(std::move(x));
+    };
+    return ReplaceWithNewInstruction(
+        reduce_window, HloInstruction::CreateBroadcastSequence(
+                           /*output_shape=*/reduce_window->shape(),
+                           /*operand=*/pad_operand, fadd));
+  }
+
   // Carry out the folding of the pad into reduce_window.
   VLOG(10) << "Folding pad into reduce-window.";
   Window new_window = window;
@@ -1465,7 +1818,7 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
   return ReplaceWithNewInstruction(
       reduce_window, HloInstruction::CreateReduceWindow(
                          /*shape=*/reduce_window->shape(),
-                         /*operand=*/operand->mutable_operand(0),
+                         /*operand=*/pad_operand,
                          /*init_value=*/reduce_window->mutable_operand(1),
                          /*window=*/new_window,
                          /*reduce_computation=*/function));
@@ -1473,7 +1826,6 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
 
 Status AlgebraicSimplifierVisitor::HandleTranspose(HloInstruction* transpose) {
   auto operand = transpose->mutable_operand(0);
-
   if (std::is_sorted(transpose->dimensions().begin(),
                      transpose->dimensions().end())) {
     VLOG(10) << "deleting no-op transpose";
@@ -1500,6 +1852,18 @@ Status AlgebraicSimplifierVisitor::HandleConvolution(
     HloInstruction* convolution) {
   auto lhs = convolution->mutable_operand(0);
   auto rhs = convolution->mutable_operand(1);
+  if (ShapeUtil::HasZeroElements(lhs->shape()) ||
+      ShapeUtil::HasZeroElements(rhs->shape())) {
+    return ReplaceWithNewInstruction(
+        convolution,
+        HloInstruction::CreateBroadcast(
+            convolution->shape(),
+            computation_->AddInstruction(HloInstruction::CreateConvert(
+                ShapeUtil::MakeShape(convolution->shape().element_type(), {}),
+                computation_->AddInstruction(
+                    HloInstruction::CreateConstant(Literal::CreateR0(0.0f))))),
+            {}));
+  }
   const auto& window = convolution->window();
   if (!enable_conv_simplification_) {
     return Status::OK();
@@ -1556,15 +1920,15 @@ Status AlgebraicSimplifierVisitor::HandleConvolution(
   // still convert Conv into more efficient Matmul with operand transposition
   // (such as the transposition flags in cuBLAS SGEMM).
   if (!LayoutUtil::Equal(input_shape.layout(), convolution_shape.layout()) ||
-      input_shape.layout().minor_to_major(0) !=
+      LayoutUtil::Minor(input_shape.layout(), 0) !=
           dnums.input_feature_dimension() ||
-      convolution_shape.layout().minor_to_major(0) !=
+      LayoutUtil::Minor(convolution_shape.layout(), 0) !=
           dnums.output_feature_dimension() ||
       // The input feature dimension should come later in the minor-to-major
       // order.
-      (PositionInContainer(filter_shape.layout().minor_to_major(),
+      (PositionInContainer(LayoutUtil::MinorToMajor(filter_shape),
                            dnums.kernel_input_feature_dimension()) <
-       PositionInContainer(filter_shape.layout().minor_to_major(),
+       PositionInContainer(LayoutUtil::MinorToMajor(filter_shape),
                            dnums.kernel_output_feature_dimension()))) {
     return Status::OK();
   }
@@ -1592,18 +1956,15 @@ Status AlgebraicSimplifierVisitor::HandleConvolution(
 
   // We already checked feature_dimension is most minor, so data in input_shape
   // and row-major {conv_width,input_channels} are bitwise identical.
-  const Shape new_input_shape =
-      ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
-          input_shape.element_type(), {conv_width, input_channels});
+  const Shape new_input_shape = ShapeUtil::MakeShapeWithDescendingLayout(
+      input_shape.element_type(), {conv_width, input_channels});
   // We already checked input_feature_dimension is more major than
   // output_feature_dimension, so data in filter_shape and row-major
   // {input_channels,output_channels} are bitwise identical.
-  const Shape new_filter_shape =
-      ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
-          filter_shape.element_type(), {input_channels, output_channels});
-  const Shape dot_output_shape =
-      ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
-          convolution_shape.element_type(), {conv_width, output_channels});
+  const Shape new_filter_shape = ShapeUtil::MakeShapeWithDescendingLayout(
+      filter_shape.element_type(), {input_channels, output_channels});
+  const Shape dot_output_shape = ShapeUtil::MakeShapeWithDescendingLayout(
+      convolution_shape.element_type(), {conv_width, output_channels});
 
   // We cannot insert bitcasts if the layouts will not be compatible.
   // TODO(b/33178038): Consider inserting a transpose if a bitcast would be
@@ -1616,8 +1977,11 @@ Status AlgebraicSimplifierVisitor::HandleConvolution(
 
   auto new_lhs = add_bitcast(new_input_shape, lhs);
   auto new_rhs = add_bitcast(new_filter_shape, rhs);
-  auto dot = computation_->AddInstruction(HloInstruction::CreateBinary(
-      dot_output_shape, HloOpcode::kDot, new_lhs, new_rhs));
+  DotDimensionNumbers dot_dimension_numbers;
+  dot_dimension_numbers.add_lhs_contracting_dimensions(1);
+  dot_dimension_numbers.add_rhs_contracting_dimensions(0);
+  auto dot = computation_->AddInstruction(HloInstruction::CreateDot(
+      dot_output_shape, new_lhs, new_rhs, dot_dimension_numbers));
   return ReplaceInstruction(convolution, add_bitcast(convolution_shape, dot));
 }
 
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 56dfb1cf0bc22ed62653d1f0772fdcae58498c27..0f08eb3a3267c4b7b04958270a5788fc48d3fa04 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -60,17 +61,63 @@ TEST_F(AlgebraicSimplifierTest, AddZero) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, param0, zero));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
 
+// Test that Const + A is canonicalized to A + Const.
+TEST_F(AlgebraicSimplifierTest, AddConstOnLHS) {
+  Shape r0f32 = ShapeUtil::MakeShape(F32, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32, "param0"));
+  HloInstruction* constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0(42.0f)));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, constant, param0));
+
+  auto computation = module().AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_THAT(root, op::Add(param0, op::Constant()));
+}
+
+// Test that [(A + C1) + C2] => [A + (C1 + C2)] for constants C1 and C2.
+TEST_F(AlgebraicSimplifierTest, AddReassociateMergeConstants) {
+  Shape r0f32 = ShapeUtil::MakeShape(F32, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32, "param0"));
+  HloInstruction* constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0(42.0f)));
+  HloInstruction* constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0(3.14159f)));
+
+  HloInstruction* add1 = builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, param0, constant1));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, add1, constant2));
+
+  auto computation = module().AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_THAT(root, op::Add(param0, op::Add(constant1, constant2)));
+}
+
 TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR0Operand) {
   Shape r2f32 = ShapeUtil::MakeShape(F32, {3, 2});
   HloComputation::Builder builder(TestName());
@@ -83,13 +130,12 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR0Operand) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, bcast, param0));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
@@ -106,13 +152,12 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, bcast, param0));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
@@ -128,17 +173,37 @@ TEST_F(AlgebraicSimplifierTest, SubZero) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kSubtract, param0, zero));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kSubtract);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
 
+// Test that A - Const is canonicalized to A + (-Const).
+TEST_F(AlgebraicSimplifierTest, SubConstCanonicalization) {
+  Shape r0f32 = ShapeUtil::MakeShape(F32, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32, "param0"));
+  HloInstruction* constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
+  builder.AddInstruction(HloInstruction::CreateBinary(
+      r0f32, HloOpcode::kSubtract, param0, constant));
+
+  auto computation = module().AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kSubtract);
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_THAT(root, op::Add(param0, op::Negate(constant)));
+}
+
 // Test that (A/B)/C is simplified to A/(B*C).
 TEST_F(AlgebraicSimplifierTest, LhsDivOfDiv) {
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
@@ -154,15 +219,14 @@ TEST_F(AlgebraicSimplifierTest, LhsDivOfDiv) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, div, param2));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(op::Divide(param0, param1), param2));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(param0, op::Multiply(param1, param2)));
@@ -183,15 +247,14 @@ TEST_F(AlgebraicSimplifierTest, RhsDivOfDiv) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, div));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(param0, op::Divide(param1, param2)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(op::Multiply(param0, param2), param1));
@@ -217,8 +280,7 @@ TEST_F(AlgebraicSimplifierTest, DivOfDivAndDiv) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r2f32, HloOpcode::kDivide, div0, div1));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(
       computation->root_instruction(),
@@ -226,7 +288,7 @@ TEST_F(AlgebraicSimplifierTest, DivOfDivAndDiv) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(
       computation->root_instruction(),
@@ -248,15 +310,14 @@ TEST_F(AlgebraicSimplifierTest, DivOfExp) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, exp));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(param0, op::Exp(param1)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Multiply(param0, op::Exp(op::Negate(param1))));
@@ -277,15 +338,14 @@ TEST_F(AlgebraicSimplifierTest, DivOfPower) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, power));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(param0, op::Power(param1, param2)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Multiply(param0, op::Power(param1, op::Negate(param2))));
@@ -308,15 +368,14 @@ TEST_F(AlgebraicSimplifierTest, DivOfBroadcastingPower) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r1f32, HloOpcode::kDivide, param0, power));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(param0, op::Power(param1, param2)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   ASSERT_THAT(computation->root_instruction(),
               op::Multiply(param0, op::Power(param1, op::Negate(param2))));
@@ -327,6 +386,75 @@ TEST_F(AlgebraicSimplifierTest, DivOfBroadcastingPower) {
   EXPECT_EQ(0, negate_shape.dimensions_size());
 }
 
+// A / Const => A * (1 / Const)
+TEST_F(AlgebraicSimplifierTest, DivideByConstant) {
+  Shape r1f32 = ShapeUtil::MakeShape(F32, {3});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r1f32, "param0"));
+  HloInstruction* constant =
+      builder.AddInstruction(HloInstruction::CreateConstant(
+          Literal::CreateR1<float>({0.f, 1.f, 2.f})));
+  builder.AddInstruction(HloInstruction::CreateBinary(r1f32, HloOpcode::kDivide,
+                                                      param0, constant));
+
+  auto computation = module().AddEntryComputation(builder.Build());
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Multiply(param0, op::Divide(op::Constant(), constant)));
+}
+
+// pow(pow(A, X), Y) => pow(A, X*Y)
+TEST_F(AlgebraicSimplifierTest, PowerOfPower) {
+  Shape r0f32 = ShapeUtil::MakeShape(F32, {});
+  Shape r1f32 = ShapeUtil::MakeShape(F32, {7});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* base = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r1f32, "param0"));
+  HloInstruction* exp1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0f32, "param1"));
+  HloInstruction* exp2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, r0f32, "param2"));
+  HloInstruction* inner_power = builder.AddInstruction(
+      HloInstruction::CreateBinary(r1f32, HloOpcode::kPower, base, exp1));
+  builder.AddInstruction(HloInstruction::CreateBinary(r1f32, HloOpcode::kPower,
+                                                      inner_power, exp2));
+
+  auto computation = module().AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  EXPECT_THAT(computation->root_instruction(),
+              op::Power(base, op::Multiply(exp1, exp2)));
+}
+
+// Don't simplify pow(pow(A, X), Y) => pow(A, X*Y) if X and Y are complex
+// numbers.
+TEST_F(AlgebraicSimplifierTest, PowerOfPowerComplex) {
+  Shape r0c64 = ShapeUtil::MakeShape(C64, {});
+  Shape r1c64 = ShapeUtil::MakeShape(C64, {7});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* base = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r1c64, "param0"));
+  HloInstruction* exp1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0c64, "param1"));
+  HloInstruction* exp2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, r0c64, "param2"));
+  HloInstruction* inner_power = builder.AddInstruction(
+      HloInstruction::CreateBinary(r1c64, HloOpcode::kPower, base, exp1));
+  builder.AddInstruction(HloInstruction::CreateBinary(r1c64, HloOpcode::kPower,
+                                                      inner_power, exp2));
+
+  module().AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_FALSE(simplifier.Run(&module()).ValueOrDie());
+}
+
 // Test that A/1 is simplified to A for a scalar.
 TEST_F(AlgebraicSimplifierTest, DivOneScalar) {
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
@@ -338,13 +466,12 @@ TEST_F(AlgebraicSimplifierTest, DivOneScalar) {
   HloInstruction* div = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, one));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, div);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
@@ -360,13 +487,12 @@ TEST_F(AlgebraicSimplifierTest, DivOneArray) {
   HloInstruction* div = builder.AddInstruction(
       HloInstruction::CreateBinary(r2f32, HloOpcode::kDivide, param0, one));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, div);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
@@ -385,13 +511,12 @@ TEST_F(AlgebraicSimplifierTest, ComplexOfRealImagC) {
   HloInstruction* cplx = builder.AddInstruction(
       HloInstruction::CreateBinary(r2c64, HloOpcode::kComplex, real, imag));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, cplx);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
@@ -410,13 +535,12 @@ TEST_F(AlgebraicSimplifierTest, RealOfComplex) {
   HloInstruction* real = builder.AddInstruction(
       HloInstruction::CreateUnary(r2f32, HloOpcode::kReal, cplx));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, real);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
@@ -435,13 +559,12 @@ TEST_F(AlgebraicSimplifierTest, ImagOfComplex) {
   HloInstruction* imag = builder.AddInstruction(
       HloInstruction::CreateUnary(r2f32, HloOpcode::kImag, cplx));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, imag);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param1);
 }
@@ -463,13 +586,12 @@ TEST_F(AlgebraicSimplifierTest, SelectMakeTuple) {
   HloInstruction* add = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, get, param2));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, add);
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Add(param1, param2));
 }
@@ -489,15 +611,14 @@ TEST_F(AlgebraicSimplifierTest, ExpDiv) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, exp0, exp1));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Divide(op::Exp(param0), op::Exp(param1)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Exp(op::Subtract(param0, param1)));
@@ -518,15 +639,14 @@ TEST_F(AlgebraicSimplifierTest, ExpMul) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kMultiply, exp0, exp1));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Multiply(op::Exp(param0), op::Exp(param1)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Exp(op::Add(param0, param1)));
@@ -545,15 +665,14 @@ TEST_F(AlgebraicSimplifierTest, PowExp) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, exp0, param1));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Power(op::Exp(param0), param1));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Exp(op::Multiply(param0, param1)));
@@ -572,15 +691,14 @@ TEST_F(AlgebraicSimplifierTest, LnPow) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32, HloOpcode::kLog, pow));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Log(op::Power(param0, param1)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Multiply(op::Log(param0), param1));
@@ -597,14 +715,13 @@ TEST_F(AlgebraicSimplifierTest, LnExp) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32, HloOpcode::kLog, exp0));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Log(op::Exp(param0)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_EQ(computation->root_instruction(), param0);
 }
@@ -626,15 +743,14 @@ TEST_F(AlgebraicSimplifierTest, LnExpDiv) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32, HloOpcode::kLog, div));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Log(op::Divide(op::Exp(param0), op::Exp(param1))));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Subtract(param0, param1));
 }
@@ -651,14 +767,13 @@ TEST_F(AlgebraicSimplifierTest, Pow0Scalar) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, zero));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Power(param0, zero));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root, op::Constant());
@@ -676,14 +791,13 @@ TEST_F(AlgebraicSimplifierTest, Pow0Vector) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r1f32, HloOpcode::kPower, param0, zero));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Power(param0, zero));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root, op::Broadcast());
@@ -705,14 +819,13 @@ TEST_F(AlgebraicSimplifierTest, Pow1) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, one));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Power(param0, one));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_EQ(computation->root_instruction(), param0);
 }
@@ -728,14 +841,13 @@ TEST_F(AlgebraicSimplifierTest, Pow2) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, two));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Power(param0, two));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Multiply(param0, param0));
 }
@@ -751,14 +863,13 @@ TEST_F(AlgebraicSimplifierTest, PowNegative1) {
   builder.AddInstruction(HloInstruction::CreateBinary(r0f32, HloOpcode::kPower,
                                                       param0, negative_one));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Power(param0, negative_one));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root, op::Divide(op::Broadcast(), param0));
@@ -767,6 +878,117 @@ TEST_F(AlgebraicSimplifierTest, PowNegative1) {
             1);
 }
 
+TEST_F(AlgebraicSimplifierTest, ZeroSizedConvolution) {
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* lhs = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {3, 3, 0}), "lhs"));
+
+  HloInstruction* rhs = builder.AddInstruction(HloInstruction::CreateParameter(
+      1, ShapeUtil::MakeShape(F32, {3, 0, 3}), "rhs"));
+
+  ConvolutionDimensionNumbers dnums;
+  dnums.set_input_batch_dimension(0);
+  dnums.add_input_spatial_dimensions(1);
+  dnums.set_input_feature_dimension(2);
+
+  dnums.set_output_batch_dimension(0);
+  dnums.add_output_spatial_dimensions(1);
+  dnums.set_output_feature_dimension(2);
+
+  dnums.add_kernel_spatial_dimensions(0);
+  dnums.set_kernel_input_feature_dimension(1);
+  dnums.set_kernel_output_feature_dimension(2);
+  Window window;
+  WindowDimension* dim = window.add_dimensions();
+  dim->set_size(3);
+  dim->set_padding_low(0);
+  dim->set_padding_high(0);
+  dim->set_stride(1);
+  dim->set_window_dilation(1);
+  dim->set_base_dilation(1);
+  dim->set_window_reversal(false);
+  // Create add computation.
+  builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeUtil::MakeShape(F32, {3, 3, 3}), lhs, rhs, window, dnums));
+  module().AddEntryComputation(builder.Build());
+  HloPassFix<AlgebraicSimplifier> simplifier(/*is_layout_sensitive=*/false,
+                                             non_bitcasting_callback());
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
+              op::Convolution(lhs, rhs));
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
+              op::Broadcast(op::Constant()));
+}
+
+TEST_F(AlgebraicSimplifierTest, ZeroSizedReduceWindow) {
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {3, 0}), "op"));
+  Window window;
+  for (int64 i = 0; i < 2; ++i) {
+    WindowDimension* dim = window.add_dimensions();
+    dim->set_size(1);
+    dim->set_padding_low(1);
+    dim->set_padding_high(1);
+    dim->set_window_dilation(1);
+    dim->set_base_dilation(1);
+  }
+  // Create add computation.
+  HloComputation* add_computation = nullptr;
+  {
+    HloComputation::Builder builder(TestName() + ".add");
+    const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+    HloInstruction* p0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, scalar_shape, "p0"));
+    HloInstruction* p1 = builder.AddInstruction(
+        HloInstruction::CreateParameter(1, scalar_shape, "p1"));
+    builder.AddInstruction(
+        HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
+    add_computation = module().AddEmbeddedComputation(builder.Build());
+  }
+  builder.AddInstruction(HloInstruction::CreateReduceWindow(
+      ShapeUtil::MakeShape(F32, {5, 2}), param,
+      builder.AddInstruction(
+          HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f))),
+      window, add_computation));
+  module().AddEntryComputation(builder.Build());
+  HloPassFix<AlgebraicSimplifier> simplifier(/*is_layout_sensitive=*/false,
+                                             non_bitcasting_callback());
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
+              op::ReduceWindow(param, op::Constant()));
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
+              op::Broadcast(op::Constant()));
+}
+
+TEST_F(AlgebraicSimplifierTest, ZeroSizedPad) {
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {3, 0}), "op"));
+  PaddingConfig padding;
+  for (int i = 0; i < 2; ++i) {
+    PaddingConfig::PaddingConfigDimension* dimension = padding.add_dimensions();
+    dimension->set_edge_padding_low(1);
+    dimension->set_edge_padding_high(1);
+    dimension->set_interior_padding(0);
+  }
+  builder.AddInstruction(HloInstruction::CreatePad(
+      ShapeUtil::MakeShape(F32, {5, 2}), param,
+      builder.AddInstruction(
+          HloInstruction::CreateConstant(Literal::CreateR0(0.0f))),
+      padding));
+  module().AddEntryComputation(builder.Build());
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
+              op::Pad(param, op::Constant()));
+  HloPassFix<AlgebraicSimplifier> simplifier(/*is_layout_sensitive=*/false,
+                                             non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
+              op::Broadcast(op::Constant()));
+}
+
 TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) {
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
 
@@ -781,17 +1003,16 @@ TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) {
       ShapeUtil::MakeShape(F32, {3, 2}), broadcast));
 
   auto computation = builder.Build();
-  auto module = CreateNewModule();
-  module->AddEntryComputation(std::move(computation));
+  module().AddEntryComputation(std::move(computation));
 
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
               op::Reshape(op::Broadcast(op::Reshape(op))));
 
   HloPassFix<AlgebraicSimplifier> simplifier(/*is_layout_sensitive=*/false,
                                              non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
-  EXPECT_THAT(module->entry_computation()->root_instruction(), op);
+  EXPECT_THAT(module().entry_computation()->root_instruction(), op);
 }
 
 // Test that convert(A, $TYPE) is simplified to A if A is of type $TYPE.
@@ -802,14 +1023,13 @@ TEST_F(AlgebraicSimplifierTest, ConvertBetweenSameType) {
   builder.AddInstruction(
       HloInstruction::CreateConvert(ShapeUtil::MakeShape(F32, {}), input));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Convert(input));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), input);
 }
@@ -823,14 +1043,13 @@ TEST_F(AlgebraicSimplifierTest, RemoveCopy) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(param0->shape(), HloOpcode::kCopy, param0));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param0);
 }
@@ -844,14 +1063,13 @@ TEST_F(AlgebraicSimplifierTest, RemoveUnaryConcatenate) {
   builder.AddInstruction(
       HloInstruction::CreateConcatenate(param0->shape(), {param0}, 0));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Concatenate(param0));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param0);
 }
@@ -874,8 +1092,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) {
   builder.AddInstruction(HloInstruction::CreateConcatenate(
       result_shape, {empty_literal, param0, param0, empty_slice, param1}, 0));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(
       computation->root_instruction(),
@@ -883,7 +1100,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Concatenate(param0, param0, param1));
@@ -905,15 +1122,14 @@ TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) {
   builder.AddInstruction(HloInstruction::CreateConcatenate(
       result_shape, {empty_literal, empty_slice}, 0));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Concatenate(empty_literal, empty_slice));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_EQ(computation->root_instruction(), empty_literal);
 }
@@ -930,14 +1146,13 @@ TEST_F(AlgebraicSimplifierTest, ConcatenateOfBroadcastBecomesPad) {
   HloInstruction* broadcast = builder.AddInstruction(
       HloInstruction::CreateBroadcast(r1f32, param1, {}));
   builder.AddInstruction(HloInstruction::CreateConcatenate(
-      param0->shape(), {broadcast, param0}, 0));
+      ShapeUtil::MakeShape(F32, {200}), {broadcast, param0}, 0));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(), op::Pad(param0, param1));
 }
 
@@ -951,8 +1166,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) {
   HloInstruction* copy = builder.AddInstruction(
       HloInstruction::CreateUnary(param0->shape(), HloOpcode::kCopy, param0));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   // Set to different layouts.
   *param0->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
@@ -962,7 +1176,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie());
 
   // Copy has not been removed.
   EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
@@ -978,8 +1192,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) {
   HloInstruction* copy = builder.AddInstruction(
       HloInstruction::CreateUnary(param0->shape(), HloOpcode::kCopy, param0));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   // Set to same layouts.
   *param0->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
@@ -989,7 +1202,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   // Copy has been removed.
   EXPECT_THAT(computation->root_instruction(), param0);
@@ -1010,14 +1223,13 @@ TEST_F(AlgebraicSimplifierTest, NoBitcastAdded) {
   *reshape->mutable_shape()->mutable_layout() =
       LayoutUtil::MakeLayout({0, 1, 2, 3, 4, 5});
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie());
 
   // Reshape is not replaced with a bitcast.
   EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
@@ -1056,8 +1268,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) {
   builder.AddInstruction(HloInstruction::CreateTuple(
       {transformable_reshape, dimensions_wrong_reshape, layout_wrong_reshape}));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Tuple(transformable_reshape, dimensions_wrong_reshape,
@@ -1065,7 +1276,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  bitcasting_callback());
-  simplifier.Run(module.get()).ValueOrDie();
+  simplifier.Run(&module()).ValueOrDie();
 
   // Verify that only the first reshape is replaced.
   EXPECT_THAT(
@@ -1086,8 +1297,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeAfterEffectiveUnary) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(ShapeUtil::MakeShape(F32, {1, 2, 3, 4, 5}),
                                    HloOpcode::kMaximum, movable_reshape, zero));
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Maximum(op::Reshape(param), zero));
@@ -1095,7 +1305,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeAfterEffectiveUnary) {
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  bitcasting_callback());
 
-  simplifier.Run(module.get()).ValueOrDie();
+  simplifier.Run(&module()).ValueOrDie();
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Maximum(param, zero)));
 }
@@ -1113,8 +1323,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeToScalarNotHoistedAfterEffectiveUnary) {
       HloInstruction::CreateConstant(Literal::CreateR1<float>({1., 2., 3.})));
   builder.AddInstruction(HloInstruction::CreateBinary(
       ShapeUtil::MakeShape(F32, {3}), HloOpcode::kMaximum, reshape, zero));
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Maximum(op::Reshape(param), zero));
@@ -1122,7 +1331,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeToScalarNotHoistedAfterEffectiveUnary) {
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  bitcasting_callback());
 
-  simplifier.Run(module.get()).ValueOrDie();
+  simplifier.Run(&module()).ValueOrDie();
 
   EXPECT_THAT(computation->root_instruction(),
               op::Maximum(op::Reshape(param), zero));
@@ -1147,9 +1356,8 @@ TEST_F(AlgebraicSimplifierTest, FailureToSinkReshapeDoesntAffectChangedBit) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  bitcasting_callback());
-  auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  module().AddEntryComputation(builder.Build());
+  EXPECT_TRUE(simplifier.Run(&module()).ValueOrDie());
 }
 
 // Regression test for a bug where if we failed to sink a reshape, we'd set the
@@ -1166,14 +1374,14 @@ TEST_F(AlgebraicSimplifierTest, FailureToSinkBroadcastDoesntAffectChangedBit) {
       builder.AddInstruction(HloInstruction::CreateConstant(
           Literal::CreateR2<float>({{0, 0}, {0, 0}})))));
 
-  builder.AddInstruction(HloInstruction::CreateBroadcast(
-      ShapeUtil::MakeShape(F32, {2, 2, 2}), add, /*broadcast_dimensions=*/{0}));
+  builder.AddInstruction(
+      HloInstruction::CreateBroadcast(ShapeUtil::MakeShape(F32, {2, 2, 2}), add,
+                                      /*broadcast_dimensions=*/{0, 1}));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  bitcasting_callback());
-  auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  module().AddEntryComputation(builder.Build());
+  EXPECT_TRUE(simplifier.Run(&module()).ValueOrDie());
 }
 
 TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast1) {
@@ -1190,14 +1398,13 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast1) {
   *transpose->mutable_shape()->mutable_layout() =
       LayoutUtil::MakeLayout({0, 1, 2, 3});
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Transpose(param));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   // Verify that the reshape is replaced.
   EXPECT_THAT(computation->root_instruction(), op::Bitcast(param));
@@ -1217,14 +1424,13 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast2) {
   *transpose->mutable_shape()->mutable_layout() =
       LayoutUtil::MakeLayout({3, 1, 2, 0});
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Transpose(param));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   // Verify that the reshape is replaced.
   EXPECT_THAT(computation->root_instruction(), op::Bitcast(param));
@@ -1243,15 +1449,14 @@ TEST_F(AlgebraicSimplifierTest, ReshapesMerged) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {1, 2, 1, 1, 2, 1}), reshape1));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Reshape(param0)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
 }
@@ -1260,7 +1465,7 @@ TEST_F(AlgebraicSimplifierTest, CopiesMerged) {
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(
-          0, ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(F32, {2, 2, 2}),
+          0, ShapeUtil::MakeShapeWithDescendingLayout(F32, {2, 2, 2}),
           "param0"));
 
   HloInstruction* copy1 = builder.AddInstruction(HloInstruction::CreateUnary(
@@ -1271,14 +1476,13 @@ TEST_F(AlgebraicSimplifierTest, CopiesMerged) {
       ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {0, 2, 1}),
       HloOpcode::kCopy, copy1));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Copy(op::Copy(param0)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
 }
@@ -1296,14 +1500,13 @@ TEST_F(AlgebraicSimplifierTest, TransposesMerged) {
   builder.AddInstruction(HloInstruction::CreateTranspose(
       ShapeUtil::MakeShape(F32, {4, 3, 2}), transpose1, {1, 0, 2}));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), op::Transpose(transpose1));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Transpose(param0));
   EXPECT_EQ(std::vector<int64>({2, 1, 0}),
@@ -1318,17 +1521,16 @@ TEST_F(AlgebraicSimplifierTest, ReshapeAndBroadcastMerged) {
   auto reshape1 = builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {1, 5, 1}), param0));
   builder.AddInstruction(HloInstruction::CreateBroadcast(
-      ShapeUtil::MakeShape(F32, {1, 2, 3, 5, 1}), reshape1, {0, 2, 3}));
+      ShapeUtil::MakeShape(F32, {1, 2, 3, 5, 1}), reshape1, {0, 3, 2}));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Broadcast(op::Reshape(param0)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Broadcast(param0));
 }
@@ -1343,15 +1545,14 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshapeMerged) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {2, 3, 7, 2, 1, 3, 2}), broadcast1));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Broadcast(param0)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Broadcast(param0));
 }
@@ -1365,15 +1566,14 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x1_3) {
   builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {3}), broadcast));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Broadcast(param)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Broadcast(param)));
@@ -1388,15 +1588,14 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4_6x1x1x4) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {6, 1, 1, 4}), broadcast));
 
-  auto module = CreateNewModule();
-  HloComputation* computation = module->AddEntryComputation(builder.Build());
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Broadcast(param)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Broadcast(param));
   EXPECT_THAT(computation->root_instruction()->dimensions(),
@@ -1412,15 +1611,14 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x2x1_6x1x1x1) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {6, 1, 1, 1}), broadcast));
 
-  auto module = CreateNewModule();
-  HloComputation* computation = module->AddEntryComputation(builder.Build());
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Broadcast(param)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Broadcast(param));
   const std::vector<int64> broadcast_dims =
@@ -1438,15 +1636,14 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4x2_6x8) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {6, 8}), broadcast));
 
-  auto module = CreateNewModule();
-  HloComputation* computation = module->AddEntryComputation(builder.Build());
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Broadcast(param)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Broadcast(param)));
@@ -2138,8 +2335,10 @@ TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) {
       builder.AddInstruction(HloInstruction::CreateParameter(0, r1f32, "x"));
   HloInstruction* y =
       builder.AddInstruction(HloInstruction::CreateParameter(1, r1f32, "y"));
-  builder.AddInstruction(
-      HloInstruction::CreateBinary(r1f32, HloOpcode::kDot, x, y));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  builder.AddInstruction(HloInstruction::CreateDot(r1f32, x, y, dot_dnums));
   std::unique_ptr<HloComputation> dot_computation(builder.Build());
 
   HloComputation::Builder call_builder(TestName() + ".Call");
@@ -2150,12 +2349,11 @@ TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) {
   call_builder.AddInstruction(
       HloInstruction::CreateCall(r1f32, {zero, one}, dot_computation.get()));
 
-  auto module = CreateNewModule();
-  module->AddEmbeddedComputation(std::move(dot_computation));
-  module->AddEntryComputation(call_builder.Build());
+  module().AddEmbeddedComputation(std::move(dot_computation));
+  module().AddEntryComputation(call_builder.Build());
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
 }
 
 // Test that a constant with tuple shape becomes a tuple of constants.
@@ -2168,12 +2366,11 @@ TEST_F(AlgebraicSimplifierTest, ConstantTupleBecomesTupleOfConstants) {
                           Literal::CreateR1<float>(constant_vector).get()});
   builder.AddInstruction(HloInstruction::CreateConstant(std::move(value)));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(),
               op::Tuple(op::Constant(), op::Constant()));
 }
@@ -2193,11 +2390,10 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicSlice) {
           HloInstruction::CreateConstant(Literal::CreateR1<int>({0, 0, 0}))),
       /*slice_sizes=*/{10, 100, 1000}));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(), op::Parameter());
 }
 
@@ -2227,14 +2423,354 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicUpdateSlice) {
       builder.AddInstruction(
           HloInstruction::CreateConstant(Literal::CreateR1<int>({0, 0, 0})))));
 
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build());
+  auto computation = module().AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(),
               op::DynamicSlice(op::Parameter(), op::Parameter()));
 }
 
+struct PadReduceWindowEffectiveBroadcastCase {
+  std::vector<int64> input_spatials;
+  std::vector<int64> symmetric_pad_spatials;
+  std::vector<int64> reduce_window_spatials;
+  // Whether to use `B F S0 S1` form vs `B S0 S1 F` form.
+  //
+  // This doesn't test any different functionality but is useful for making sure
+  // kBroadcast nodes are well formed.
+  bool prepend_a;
+  bool should_become_broadcast;
+
+  string ToTestCaseName() const {
+    return tensorflow::strings::StrCat(
+        tensorflow::str_util::Join(input_spatials, ","), ";",
+        tensorflow::str_util::Join(symmetric_pad_spatials, ","), ";",
+        tensorflow::str_util::Join(reduce_window_spatials, ","), ";", prepend_a,
+        ";", should_become_broadcast);
+  }
+};
+
+void PrintTo(const PadReduceWindowEffectiveBroadcastCase& c, std::ostream* os) {
+  *os << c.ToTestCaseName();
+}
+
+class PadReduceWindowEffectiveBroadcastTest
+    : public AlgebraicSimplifierTest,
+      public ::testing::WithParamInterface<
+          PadReduceWindowEffectiveBroadcastCase> {};
+
+TEST_P(PadReduceWindowEffectiveBroadcastTest, DoIt) {
+  const auto& param = GetParam();
+
+  // a and b are parallel bounds we can either turn into a B F S0 S1 or
+  // `B S0 S1 F` kind of pattern.
+  auto decorate_spatials = [&param](tensorflow::gtl::ArraySlice<int64> spatials,
+                                    int64 a, int64 b) {
+    std::vector<int64> result;
+    if (param.prepend_a) {
+      result.push_back(a);
+    }
+    for (int64 s : spatials) {
+      result.push_back(s);
+    }
+    if (!param.prepend_a) {
+      result.push_back(a);
+    }
+    result.push_back(b);
+    return result;
+  };
+
+  HloComputation::Builder builder(TestName());
+  const Shape input_shape = ShapeUtil::MakeShape(
+      F32, decorate_spatials(param.input_spatials, 128, 2048));
+  HloInstruction* input = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_shape, "input"));
+
+  PaddingConfig padding = window_util::MakeSymmetricPadding(
+      decorate_spatials(param.symmetric_pad_spatials, 0, 0));
+  TF_ASSERT_OK_AND_ASSIGN(
+      const Shape pad_shape,
+      ShapeInference::InferPadShape(input->shape(),
+                                    ShapeUtil::MakeShape(F32, {}), padding));
+  HloInstruction* pad = builder.AddInstruction(HloInstruction::CreatePad(
+      pad_shape, input,
+      builder.AddInstruction(
+          HloInstruction::CreateConstant(Literal::CreateR0(0.0f))),
+      padding));
+
+  HloComputation* add_computation = nullptr;
+  {
+    HloComputation::Builder builder(TestName() + ".add");
+    const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+    HloInstruction* p0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, scalar_shape, "p0"));
+    HloInstruction* p1 = builder.AddInstruction(
+        HloInstruction::CreateParameter(1, scalar_shape, "p1"));
+    builder.AddInstruction(
+        HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
+    add_computation = module().AddEmbeddedComputation(builder.Build());
+  }
+
+  Window window = window_util::MakeWindow(
+      decorate_spatials(param.reduce_window_spatials, 1, 1));
+  auto zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape output_shape,
+                          ShapeInference::InferReduceWindowShape(
+                              pad->shape(), zero->shape(), window,
+                              add_computation->ComputeProgramShape()));
+  builder.AddInstruction(HloInstruction::CreateReduceWindow(
+      output_shape, pad, zero, window, add_computation));
+
+  auto computation = module().AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module()));
+  ASSERT_TRUE(run_successful);
+
+  EXPECT_TRUE(
+      ShapeUtil::Equal(computation->root_instruction()->shape(), output_shape));
+
+  if (param.should_become_broadcast) {
+    EXPECT_THAT(computation->root_instruction(), op::Broadcast(::testing::_));
+  } else {
+    EXPECT_THAT(computation->root_instruction(),
+                op::ReduceWindow(::testing::_, zero));
+  }
+}
+
+const std::vector<PadReduceWindowEffectiveBroadcastCase>&
+PadReduceWindowEffectiveBroadcastCases() {
+  static auto* cases = new std::vector<PadReduceWindowEffectiveBroadcastCase>{
+      {/*input_spatials=*/{1, 1}, /*symmetric_pad_amount=*/{6, 6},
+       /*reduce_window_spatials=*/{7, 7}, /*prepend_a=*/true,
+       /*should_become_broadcast=*/true},  //
+      {/*input_spatials=*/{1, 1}, /*symmetric_pad_amount=*/{6, 6},
+       /*reduce_window_spatials=*/{7, 7}, /*prepend_a=*/false,
+       /*should_become_broadcast=*/true},  //
+      {/*input_spatials=*/{2, 2}, /*symmetric_pad_amount=*/{6, 6},
+       /*reduce_window_spatials=*/{7, 7}, /*prepend_a=*/true,
+       /*should_become_broadcast=*/false},  //
+      {/*input_spatials=*/{1, 1}, /*symmetric_pad_amount=*/{2, 2},
+       /*reduce_window_spatials=*/{5, 5}, /*prepend_a=*/true,
+       /*should_become_broadcast=*/true},  //
+      {/*input_spatials=*/{1, 1}, /*symmetric_pad_amount=*/{2, 2},
+       /*reduce_window_spatials=*/{1, 1}, /*prepend_a=*/true,
+       /*should_become_broadcast=*/false},  //
+      {/*input_spatials=*/{5, 1}, /*symmetric_pad_amount=*/{0, 2},
+       /*reduce_window_spatials=*/{2, 5}, /*prepend_a=*/true,
+       /*should_become_broadcast=*/false},  //
+  };
+  return *cases;
+}
+
+INSTANTIATE_TEST_CASE_P(
+    PadReduceWindowEffectiveBroadcastInstantiation,
+    PadReduceWindowEffectiveBroadcastTest,
+    ::testing::ValuesIn(PadReduceWindowEffectiveBroadcastCases()));
+
+class DotStrengthReductionTest
+    : public AlgebraicSimplifierTest,
+      public ::testing::WithParamInterface<
+          ::testing::tuple<int, int, int, bool, bool>> {};
+TEST_P(DotStrengthReductionTest, DotStrengthReduction) {
+  int m, k, n;
+  bool transpose_lhs, transpose_rhs;
+  std::tie(m, k, n, transpose_lhs, transpose_rhs) = GetParam();
+
+  Shape dot_shape = ShapeUtil::MakeShape(F32, {m, n});
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {m, k});
+  Shape transposed_lhs_shape = ShapeUtil::MakeShape(F32, {k, m});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {k, n});
+  Shape transposed_rhs_shape = ShapeUtil::MakeShape(F32, {n, k});
+  HloComputation::Builder builder(TestName());
+
+  auto lhs = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, transpose_lhs ? transposed_lhs_shape : lhs_shape, "lhs"));
+  if (transpose_lhs) {
+    lhs = builder.AddInstruction(
+        HloInstruction::CreateTranspose(lhs_shape, lhs, {1, 0}));
+  }
+  auto rhs = builder.AddInstruction(HloInstruction::CreateParameter(
+      1, transpose_rhs ? transposed_rhs_shape : rhs_shape, "rhs"));
+  if (transpose_rhs) {
+    rhs = builder.AddInstruction(
+        HloInstruction::CreateTranspose(rhs_shape, rhs, {1, 0}));
+  }
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  builder.AddInstruction(
+      HloInstruction::CreateDot(dot_shape, lhs, rhs, dot_dnums));
+  auto computation = module().AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, simplifier.Run(&module()));
+  const bool dot_should_be_transformed = m == 1 || k == 1 || n == 1;
+  const bool computation_should_be_modified =
+      dot_should_be_transformed || (transpose_lhs && transpose_rhs);
+  EXPECT_EQ(changed, computation_should_be_modified);
+  bool has_no_dot = true;
+  for (const auto& hlo : computation->instructions()) {
+    if (hlo->opcode() == HloOpcode::kDot) {
+      has_no_dot = false;
+      break;
+    }
+  }
+  EXPECT_EQ(has_no_dot, dot_should_be_transformed);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    DotStrengthReductionTestInstantiation, DotStrengthReductionTest,
+    ::testing::Combine(::testing::Values(1, 2), ::testing::Values(1, 2),
+                       ::testing::Values(1, 2), ::testing::Bool(),
+                       ::testing::Bool()));
+
+struct DotOfConcatTestSpec {
+  int64 m;
+  int64 k;
+  int64 n;
+};
+
+class DotOfConcatSimplificationTest
+    : public HloVerifiedTestBase,
+      public ::testing::WithParamInterface<DotOfConcatTestSpec> {};
+
+// Test that we transform
+//  dot(const, concat(A, B, C))
+// to
+//  add(dot(const_0, A), dot(const_1, B),  dot(const_2, C))
+TEST_P(DotOfConcatSimplificationTest, ConstantLHS) {
+  HloComputation::Builder builder(TestName());
+
+  DotOfConcatTestSpec spec = GetParam();
+
+  ASSERT_GE(spec.k, 3);
+
+  int64 k0 = spec.k / 3;
+  int64 k1 = spec.k / 3;
+  int64 k2 = spec.k - k0 - k1;
+
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {spec.m, spec.k});
+  auto* lhs = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR2F32Linspace(
+          /*from=*/10.0, /*to=*/10000.0, /*rows=*/spec.m, /*cols=*/spec.k)));
+
+  Shape rhs0_shape = ShapeUtil::MakeShape(F32, {k0, spec.n});
+  Shape rhs1_shape = ShapeUtil::MakeShape(F32, {k1, spec.n});
+  Shape rhs2_shape = ShapeUtil::MakeShape(F32, {k2, spec.n});
+
+  HloInstruction* rhs0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, rhs0_shape, "rhs0"));
+  HloInstruction* rhs1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, rhs1_shape, "rhs1"));
+  HloInstruction* rhs2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, rhs2_shape, "rhs2"));
+
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {spec.k, spec.n});
+  HloInstruction* rhs = builder.AddInstruction(
+      HloInstruction::CreateConcatenate(rhs_shape, {rhs0, rhs1, rhs2}, 0));
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+
+  Shape dot_shape = ShapeUtil::MakeShape(F32, {spec.m, spec.n});
+  builder.AddInstruction(
+      HloInstruction::CreateDot(dot_shape, lhs, rhs, dot_dnums));
+
+  auto computation = module().AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module()));
+  ASSERT_TRUE(run_successful);
+
+  EXPECT_TRUE(
+      ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape));
+
+  auto match_dot_0 = op::Dot(op::Slice(op::Constant()), op::Parameter(0));
+  auto match_dot_1 = op::Dot(op::Slice(op::Constant()), op::Parameter(1));
+  auto match_dot_2 = op::Dot(op::Slice(op::Constant()), op::Parameter(2));
+  EXPECT_THAT(computation->root_instruction(),
+              op::Add(op::Add(match_dot_0, match_dot_1), match_dot_2));
+}
+
+// Test that we transform
+//  dot(concat(A, B, C), const)
+// to
+//  add(dot(A, const_0), dot(B, const_1),  dot(C, const_2))
+TEST_P(DotOfConcatSimplificationTest, ConstantRHS) {
+  HloComputation::Builder builder(TestName());
+
+  DotOfConcatTestSpec spec = GetParam();
+
+  ASSERT_GE(spec.k, 4);
+
+  int64 k0 = spec.k / 4;
+  int64 k1 = spec.k / 4;
+  int64 k2 = spec.k / 4;
+  int64 k3 = spec.k - k0 - k1 - k2;
+
+  Shape lhs0_shape = ShapeUtil::MakeShape(F32, {spec.m, k0});
+  Shape lhs1_shape = ShapeUtil::MakeShape(F32, {spec.m, k1});
+  Shape lhs2_shape = ShapeUtil::MakeShape(F32, {spec.m, k2});
+  Shape lhs3_shape = ShapeUtil::MakeShape(F32, {spec.m, k3});
+
+  HloInstruction* lhs0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, lhs0_shape, "lhs0"));
+  HloInstruction* lhs1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, lhs1_shape, "lhs1"));
+  HloInstruction* lhs2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, lhs2_shape, "lhs2"));
+  HloInstruction* lhs3 = builder.AddInstruction(
+      HloInstruction::CreateParameter(3, lhs3_shape, "lhs3"));
+
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {spec.m, spec.k});
+  HloInstruction* lhs =
+      builder.AddInstruction(HloInstruction::CreateConcatenate(
+          lhs_shape, {lhs0, lhs1, lhs2, lhs3}, 1));
+
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {spec.k, spec.n});
+  auto* rhs = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR2F32Linspace(
+          /*from=*/10.0, /*to=*/10000.0, /*rows=*/spec.k, /*cols=*/spec.n)));
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+
+  Shape dot_shape = ShapeUtil::MakeShape(F32, {spec.m, spec.n});
+  builder.AddInstruction(
+      HloInstruction::CreateDot(dot_shape, lhs, rhs, dot_dnums));
+
+  auto computation = module().AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module()));
+  ASSERT_TRUE(run_successful);
+  EXPECT_TRUE(
+      ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape));
+
+  auto match_dot_0 = op::Dot(op::Parameter(0), op::Slice(op::Constant()));
+  auto match_dot_1 = op::Dot(op::Parameter(1), op::Slice(op::Constant()));
+  auto match_dot_2 = op::Dot(op::Parameter(2), op::Slice(op::Constant()));
+  auto match_dot_3 = op::Dot(op::Parameter(3), op::Slice(op::Constant()));
+  EXPECT_THAT(computation->root_instruction(),
+              op::Add(op::Add(op::Add(match_dot_0, match_dot_1), match_dot_2),
+                      match_dot_3));
+}
+
+DotOfConcatTestSpec kDotOfConcatTestSpecs[] = {
+    {/*m=*/3, /*k=*/9, /*n=*/3},    //
+    {/*m=*/3, /*k=*/20, /*n=*/3},   //
+    {/*m=*/1, /*k=*/18, /*n=*/5},   //
+    {/*m=*/20, /*k=*/20, /*n=*/1},  //
+    {/*m=*/1, /*k=*/16, /*n=*/1},   //
+};
+
+INSTANTIATE_TEST_CASE_P(DotOfConcatSimplificationTestInstantiation,
+                        DotOfConcatSimplificationTest,
+                        ::testing::ValuesIn(kDotOfConcatTestSpecs));
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc
index ad2fee2d39a8ca183b87212bdeea22c351aaa88a..4e80679c11dfdf7fdf8077a9f354139a4cab6803 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.cc
+++ b/tensorflow/compiler/xla/service/allocation_tracker.cc
@@ -27,191 +27,161 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/stream_executor_no_cuda.h"
-
-namespace se = ::perftools::gputools;
 
 namespace xla {
 
-AllocationTracker::AllocationTracker() : next_handle_(1) {}
-
-GlobalDataHandle AllocationTracker::Register(Backend* backend,
-                                             int device_ordinal,
-                                             se::DeviceMemoryBase device_memory,
-                                             const Shape& shape,
-                                             const string& tag) {
-  tensorflow::mutex_lock lock(allocation_mutex_);
+StatusOr<GlobalDataHandle> AllocationTracker::Register(
+    std::unique_ptr<ShapedBuffer> shaped_buffer, const string& tag) {
+  tensorflow::mutex_lock lock(mutex_);
   VLOG(2) << "Register";
-  return RegisterInternal(backend, device_ordinal, device_memory, shape, tag,
-                          /*initial_ref_count=*/1);
+  return RegisterInternal(std::move(shaped_buffer), tag);
 }
 
-GlobalDataHandle AllocationTracker::RegisterInternal(
-    Backend* backend, int device_ordinal, se::DeviceMemoryBase device_memory,
-    const Shape& shape, const string& tag, int initial_ref_count) {
+StatusOr<GlobalDataHandle> AllocationTracker::RegisterInternal(
+    std::unique_ptr<ShapedBuffer> shaped_buffer, const string& tag) {
   VLOG(2) << "RegisterInternal("
           << "tag: \"" << tag << "\" "
-          << "device_ordinal: " << device_ordinal << " "
-          << "device_memory: " << device_memory.opaque() << " "
-          << "shape: " << shape.ShortDebugString() << ")";
-  TF_CHECK_OK(ShapeUtil::ValidateShape(shape));
-
-  int64 handle;
-  HandleMap& handle_map = GetOrCreateOpaqueToHandleMap(device_ordinal);
-  auto handle_it = handle_map.find(device_memory.opaque());
-  if (handle_it != handle_map.end()) {
-    handle = handle_it->second;
-    auto& allocation = FindOrDie(handle_to_allocation_, handle);
-    int ref_count = allocation->ref_count();
-    CHECK_GT(ref_count, 0);
-    VLOG(2) << "ref_count: " << ref_count << " -> " <<
-            (ref_count + initial_ref_count);
-    allocation->increment_ref_count(initial_ref_count);
-  } else {
-    handle = next_handle_++;
-    VLOG(2) << "ref_count: " << initial_ref_count;
-    InsertOrDie(&handle_map, device_memory.opaque(), handle);
-    auto inserted = handle_to_allocation_.emplace(
-        handle, MakeUnique<Allocation>(backend, device_ordinal, device_memory,
-                                       shape, tag, initial_ref_count));
-    CHECK(inserted.second);
+          << "shaped_buffer: " << *shaped_buffer;
+  if (shaped_buffer->platform() != backend_->platform()) {
+    return InvalidArgument(
+        "AllocationTracker for platform %s cannot register buffer from "
+        "platform %s",
+        backend_->platform()->Name().c_str(),
+        shaped_buffer->platform()->Name().c_str());
   }
 
+  int64 handle = next_handle_++;
+  std::vector<ShapeIndex> shape_indices;
+  ShapeUtil::ForEachSubshape(shaped_buffer->on_device_shape(),
+                             [this, &shape_indices](const Shape& /*subshape*/,
+                                                    const ShapeIndex& index) {
+                               shape_indices.push_back(index);
+                             });
+  for (const ShapeIndex& index : shape_indices) {
+    AddAllocationOrIncrementRefCount(shaped_buffer->buffer(index),
+                                     shaped_buffer->device_ordinal());
+  }
   GlobalDataHandle result;
   result.set_handle(handle);
+
+  handle_to_shaped_buffer_[handle] = std::move(shaped_buffer);
+
   VLOG(2) << "handle: " << handle;
 
   return result;
 }
 
 tensorflow::Status AllocationTracker::Unregister(const GlobalDataHandle& data) {
-  tensorflow::mutex_lock lock(allocation_mutex_);
-  TF_ASSIGN_OR_RETURN(Allocation * allocation, ResolveInternal(data));
-  std::set<void*> deallocated_buffers;
-  TF_RETURN_IF_ERROR(
-      DeallocateShape(allocation->backend(), allocation->device_ordinal(),
-                      allocation->mutable_device_memory(), allocation->shape(),
-                      &deallocated_buffers));
-  return tensorflow::Status::OK();
-}
-
-tensorflow::Status AllocationTracker::DeallocateShape(
-    Backend* backend, int device_ordinal, se::DeviceMemoryBase* device_memory,
-    const Shape& shape, std::set<void*>* deallocated_buffers) {
-  VLOG(2) << "DeallocateShape("
-          << "shape: \"" << shape.ShortDebugString() << "\" "
-          << "device_memory: " << device_memory->opaque() << ")";
-  if (ContainsKey(*deallocated_buffers, device_memory->opaque())) {
-    // Buffer has already been deallocated. Nothing to do.
-    VLOG(2) << "already deallocated";
-    return tensorflow::Status::OK();
-  }
-
-  // Add buffer to deallocated set so we do not try to deallocate it again
-  // if it is encountered again while traversing a tuple.
-  deallocated_buffers->insert(device_memory->opaque());
-
-  HandleMap& handle_map = GetOrCreateOpaqueToHandleMap(device_ordinal);
-  auto handle_it = handle_map.find(device_memory->opaque());
-  if (handle_it != handle_map.end()) {
-    int64 handle = handle_it->second;
-    auto& allocation = FindOrDie(handle_to_allocation_, handle);
-    int ref_count = allocation->ref_count();
-    VLOG(2) << "ref_count: " << ref_count << " -> " << ref_count - 1;
-    allocation->decrement_ref_count();
-    if (allocation->ref_count() > 0) {
-      // Buffer is referred to by another allocation. Don't deallocate it.
-      return tensorflow::Status::OK();
-    }
-    handle_map.erase(device_memory->opaque());
+  tensorflow::mutex_lock lock(mutex_);
+  VLOG(2) << "Unregister("
+          << "handle: " << data.handle() << ")";
+  TF_ASSIGN_OR_RETURN(ShapedBuffer * shaped_buffer, ResolveInternal(data));
+  std::vector<ShapeIndex> shape_indices;
+  ShapeUtil::ForEachSubshape(shaped_buffer->on_device_shape(),
+                             [this, &shape_indices](const Shape& /*subshape*/,
+                                                    const ShapeIndex& index) {
+                               shape_indices.push_back(index);
+                             });
+  for (const ShapeIndex& index : shape_indices) {
+    TF_RETURN_IF_ERROR(DecrementRefCount(shaped_buffer->buffer(index),
+                                         shaped_buffer->device_ordinal()));
   }
 
-  if (ShapeUtil::IsTuple(shape)) {
-    // Traverse into tuple recursively deallocating buffers.
-    TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
-                        backend->stream_executor(device_ordinal));
-    TF_ASSIGN_OR_RETURN(std::vector<se::DeviceMemoryBase> elements,
-                        backend->transfer_manager()->ShallowCopyTupleFromDevice(
-                            executor, *device_memory, shape));
-
-    TF_RET_CHECK(ShapeUtil::TupleElementCount(shape) == elements.size())
-        << "tuple has unexpected number of elements: " << elements.size()
-        << " != " << ShapeUtil::TupleElementCount(shape);
-    for (size_t i = 0; i < elements.size(); ++i) {
-      VLOG(2) << "recursing onto the tuple elements";
-      TF_RETURN_IF_ERROR(DeallocateShape(backend, device_ordinal, &elements[i],
-                                         shape.tuple_shapes(i),
-                                         deallocated_buffers));
-    }
-  }
+  // Keep a nullptr as a tombstone for unregistered handles. This enables better
+  // error messages. That is, "handle has been deallocated" versus "handle does
+  // not exist".
+  handle_to_shaped_buffer_.at(data.handle()).reset();
 
-  return backend->memory_allocator()->Deallocate(device_ordinal, device_memory);
+  return tensorflow::Status::OK();
 }
 
 StatusOr<std::vector<GlobalDataHandle>> AllocationTracker::DeconstructTuple(
     const GlobalDataHandle& data) {
-  tensorflow::mutex_lock lock(allocation_mutex_);
-  TF_ASSIGN_OR_RETURN(Allocation * allocation, ResolveInternal(data));
+  tensorflow::mutex_lock lock(mutex_);
 
-  if (!ShapeUtil::IsTuple(allocation->shape())) {
+  TF_ASSIGN_OR_RETURN(ShapedBuffer * shaped_buffer, ResolveInternal(data));
+  if (!ShapeUtil::IsTuple(shaped_buffer->on_host_shape())) {
     return InvalidArgument("global data handle %lld is not a tuple",
                            data.handle());
   }
+  // If the on-host representation is a tuple, then the on-device one should be
+  // as well.
+  TF_RET_CHECK(ShapeUtil::IsTuple(shaped_buffer->on_device_shape()));
 
-  if (ShapeUtil::IsNestedTuple(allocation->shape())) {
+  if (ShapeUtil::IsNestedTuple(shaped_buffer->on_device_shape())) {
     return Unimplemented("deconstructing nested tuples not yet supported");
   }
 
-  TF_ASSIGN_OR_RETURN(
-      se::StreamExecutor * executor,
-      allocation->backend()->stream_executor(allocation->device_ordinal()));
-  TF_ASSIGN_OR_RETURN(
-      std::vector<se::DeviceMemoryBase> element_bases,
-      allocation->backend()->transfer_manager()->ShallowCopyTupleFromDevice(
-          executor, allocation->device_memory(), allocation->shape()));
-
   std::vector<GlobalDataHandle> element_handles;
-  element_handles.reserve(element_bases.size());
-  for (int i = 0; i < element_bases.size(); ++i) {
-    element_handles.push_back(RegisterInternal(
-        allocation->backend(), allocation->device_ordinal(), element_bases[i],
-        ShapeUtil::GetSubshape(allocation->shape(), {i}),
-        tensorflow::strings::StrCat(allocation->tag(), ".element_", i),
-        /*initial_ref_count=*/2));
+  for (int i = 0;
+       i < ShapeUtil::TupleElementCount(shaped_buffer->on_device_shape());
+       ++i) {
+    auto element_buffer = MakeUnique<ShapedBuffer>(
+        ShapeUtil::GetTupleElementShape(shaped_buffer->on_host_shape(), i),
+        ShapeUtil::GetTupleElementShape(shaped_buffer->on_device_shape(), i),
+        shaped_buffer->platform(), shaped_buffer->device_ordinal());
+    element_buffer->set_buffer(shaped_buffer->buffer(/*index=*/{i}),
+                               /*index=*/{});
+    TF_ASSIGN_OR_RETURN(
+        GlobalDataHandle element_handle,
+        RegisterInternal(std::move(element_buffer), "deconstructed tuple"));
+
+    element_handles.push_back(element_handle);
   }
   return std::move(element_handles);
 }
 
-StatusOr<const Allocation*> AllocationTracker::Resolve(
+StatusOr<const ShapedBuffer*> AllocationTracker::Resolve(
     const GlobalDataHandle& data) {
-  tensorflow::mutex_lock lock(allocation_mutex_);
+  tensorflow::mutex_lock lock(mutex_);
   return AllocationTracker::ResolveInternal(data);
 }
 
-StatusOr<Allocation*> AllocationTracker::ResolveInternal(
+StatusOr<ShapedBuffer*> AllocationTracker::ResolveInternal(
     const GlobalDataHandle& data) {
   VLOG(2) << "resolve:" << data.handle();
-  auto it = handle_to_allocation_.find(data.handle());
-  if (it == handle_to_allocation_.end()) {
+  auto it = handle_to_shaped_buffer_.find(data.handle());
+  if (it == handle_to_shaped_buffer_.end()) {
     return NotFound("no allocation record for global data handle: %lld",
                     data.handle());
   }
-  Allocation* allocation = it->second.get();
+  ShapedBuffer* shaped_buffer = it->second.get();
 
-  if (allocation->is_deallocated()) {
+  if (shaped_buffer == nullptr) {
     return InvalidArgument("global data handle %lld was previously deallocated",
                            data.handle());
   }
 
-  return allocation;
+  return shaped_buffer;
+}
+
+void AllocationTracker::AddAllocationOrIncrementRefCount(
+    perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal) {
+  AllocationMap& allocation_map = opaque_to_allocation_map_[device_ordinal];
+  auto it = allocation_map.find(device_memory.opaque());
+  if (it == allocation_map.end()) {
+    allocation_map[device_memory.opaque()] = {device_memory, device_ordinal,
+                                              /*ref_count=*/1};
+  } else {
+    it->second.ref_count++;
+  }
 }
 
-AllocationTracker::HandleMap& AllocationTracker::GetOrCreateOpaqueToHandleMap(
-    int device_ordinal) {
-  if (opaque_to_handle_.size() <= device_ordinal) {
-    opaque_to_handle_.resize(device_ordinal + 1);
+Status AllocationTracker::DecrementRefCount(
+    perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal) {
+  AllocationMap& allocation_map = opaque_to_allocation_map_[device_ordinal];
+  auto it = allocation_map.find(device_memory.opaque());
+  TF_RET_CHECK(it != allocation_map.end());
+  Allocation& allocation = it->second;
+  TF_RET_CHECK(allocation.ref_count >= 1);
+  if (allocation.ref_count == 1) {
+    TF_RETURN_IF_ERROR(backend_->memory_allocator()->Deallocate(
+        device_ordinal, &device_memory));
+    allocation_map.erase(it);
+  } else {
+    allocation.ref_count--;
   }
-  return opaque_to_handle_[device_ordinal];
+  return tensorflow::Status::OK();
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h
index ebbf35b6fe87bc7322ccb99cfe8f8eed56de06b3..807af8694972083d097604a67ee46d2f73d9545a 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.h
+++ b/tensorflow/compiler/xla/service/allocation_tracker.h
@@ -28,147 +28,92 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 
-// A global allocation in device space, tracked by the XLA service.
-class Allocation {
- public:
-  Allocation(Backend* backend, int device_ordinal,
-             perftools::gputools::DeviceMemoryBase device_memory,
-             const Shape& shape, const string& tag, int initial_ref_count)
-      : backend_(backend),
-        device_ordinal_(device_ordinal),
-        device_memory_(device_memory),
-        shape_(shape),
-        tag_(tag),
-        ref_count_(initial_ref_count) {}
-
-  Backend* backend() const { return backend_; }
-  int device_ordinal() const { return device_ordinal_; }
-  perftools::gputools::DeviceMemoryBase device_memory() const {
-    return device_memory_;
-  }
-  const Shape& shape() const { return shape_; }
-  const string& tag() const { return tag_; }
-
-  bool is_deallocated() const {
-    CHECK_GE(ref_count_, 0);
-    return ref_count_ == 0;
-  }
-  int ref_count() const {
-    CHECK_GE(ref_count_, 0);
-    return ref_count_;
-  }
-  void increment_ref_count(int inc) {
-    CHECK_GT(ref_count_, 0);
-    CHECK_LE(ref_count_, INT_MAX - inc);
-    ref_count_ += inc;
-  }
-  void decrement_ref_count() {
-    CHECK_GT(ref_count_, 0);
-    --ref_count_;
-  }
-  perftools::gputools::DeviceMemoryBase* mutable_device_memory() {
-    return &device_memory_;
-  }
-
- private:
-  // The backend that the memory is allocated on.
-  Backend* backend_;
-
-  // The device that the memory is allocated on.
-  int device_ordinal_;
-
-  // The pointer to this allocation.
-  perftools::gputools::DeviceMemoryBase device_memory_;
-
-  // The shape of this allocation.
-  Shape shape_;
-
-  // An informal description of this allocation shown in tools.
-  string tag_;
-
-  // This is the number of Allocation objects which refer to this memory
-  // allocation.
-  int ref_count_;
-
-  // Return a string representation of this allocation for debugging or logging
-  // purposes.
-  string ToString() const;
-};
-
 // Tracks allocations for the XLA service; allocations can be registered
 // with shape/device/tag and resolved from a handle for later use.
 class AllocationTracker {
  public:
-  AllocationTracker();
+  // The allocator is used for deallocating memory when allocations are
+  // deregistered. All registered allocations must have the same platform as the
+  // allocator.
+  AllocationTracker(Backend* backend) : backend_(backend), next_handle_(1) {}
 
-  // Registers device memory with a given shape, device identifier, and tag, and
-  // returns a corresponding handle that can be used for talking to XLA
-  // clients.
-  GlobalDataHandle Register(Backend* backend, int device_ordinal,
-                            perftools::gputools::DeviceMemoryBase device_memory,
-                            const Shape& shape, const string& tag);
+  // Registers a shaped buffer of device memory, and returns a corresponding
+  // handle that can be used for talking to XLA clients.
+  StatusOr<GlobalDataHandle> Register(
+      std::unique_ptr<ShapedBuffer> shaped_buffer, const string& tag);
 
   // Unregister the allocation for the given data handle.
-  tensorflow::Status Unregister(const GlobalDataHandle& data);
+  Status Unregister(const GlobalDataHandle& data);
 
   // Returns a vector of global data handles that point to the tuple elements.
   StatusOr<std::vector<GlobalDataHandle>> DeconstructTuple(
       const GlobalDataHandle& Data);
 
-  // Resolve a handle from an XLA client to an allocation, or provide an
-  // error status to say whether it was not found (or found, but found
-  // deallocated).
-  StatusOr<const Allocation*> Resolve(const GlobalDataHandle& data);
+  // Resolve a handle from an XLA client to a shaped buffer, or provide an error
+  // status to say whether it was not found (or found, but found deallocated).
+  StatusOr<const ShapedBuffer*> Resolve(const GlobalDataHandle& data);
 
  private:
-  // Internal helper which resolves the given GlobalDataHandle to an Allocation.
-  StatusOr<Allocation*> ResolveInternal(const GlobalDataHandle& data)
-      EXCLUSIVE_LOCKS_REQUIRED(allocation_mutex_);
-
-  GlobalDataHandle RegisterInternal(
-      Backend* backend, int device_ordinal,
-      perftools::gputools::DeviceMemoryBase device_memory, const Shape& shape,
-      const string& tag, int initial_ref_count)
-      EXCLUSIVE_LOCKS_REQUIRED(allocation_mutex_);
-
-  // Helper function which deallocates the memory buffer containing the given
-  // shape referred to by device_memory. Tuples are traversed recursively
-  // deallocating all nested buffers. The parameter deallocated_buffers contains
-  // the set of buffers deallocated so far stored as opaque values (void *) from
-  // DeviceMemoryBase. Keeping track of deallocated buffers prevents
-  // double-freeing of buffers which may be referred to more than once in a
-  // nested tuple.
-  tensorflow::Status DeallocateShape(
-      Backend* backend, int device_ordinal,
-      perftools::gputools::DeviceMemoryBase* device_memory, const Shape& shape,
-      std::set<void*>* deallocated_buffers)
-      EXCLUSIVE_LOCKS_REQUIRED(allocation_mutex_);
-
-  // Returns the opaque_to_handle_ map for the given device_ordinal, creating
-  // a new map if there is not one for the device_ordinal.
-  using HandleMap = std::map<void*, int64>;
-  HandleMap& GetOrCreateOpaqueToHandleMap(int device_ordinal)
-      EXCLUSIVE_LOCKS_REQUIRED(allocation_mutex_);
-
-  tensorflow::mutex allocation_mutex_;  // Guards the allocation mapping.
+  // Data structure encapsulating single memory allocation on the device.
+  struct Allocation {
+    // The pointer to this allocation.
+    perftools::gputools::DeviceMemoryBase device_memory;
+
+    // The device that the memory is allocated on.
+    int device_ordinal;
+
+    // This is the number of times this memory allocation is referred to by
+    // registered data handles.
+    int ref_count;
+  };
+
+  // Internal helper which resolves the given GlobalDataHandle to a
+  // ShapedBuffer.
+  StatusOr<ShapedBuffer*> ResolveInternal(const GlobalDataHandle& data)
+      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Internal helper which registers a shaped buffer.
+  StatusOr<GlobalDataHandle> RegisterInternal(
+      std::unique_ptr<ShapedBuffer> shaped_buffer, const string& tag)
+      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Adds the given device address to the allocation tracker, or if it already
+  // exists, then increment it's reference count.
+  void AddAllocationOrIncrementRefCount(
+      perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal)
+      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Decrements the reference count of the given device memory. Then, if it is
+  // zero, deallocate the memory.
+  Status DecrementRefCount(perftools::gputools::DeviceMemoryBase device_memory,
+                           int device_ordinal) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // A map from device memory opaque value to allocation. One such map is
+  // maintained per device ordinal.
+  using AllocationMap = tensorflow::gtl::FlatMap<const void*, Allocation>;
+
+  tensorflow::mutex mutex_;
+
+  // Backend to use with this tracker. The backend supplies the memory allocator
+  // to use when deallocating memory.
+  Backend* backend_;
 
   // The next handle to assign to an allocation, guarded by the same mutex as
   // the mapping as they'll be mutated at the same time.
-  int64 next_handle_ GUARDED_BY(allocation_mutex_);
+  int64 next_handle_ GUARDED_BY(mutex_);
 
-  // A map from DeviceMemoryBase to handle for each device_ordinal.
-  std::vector<HandleMap> opaque_to_handle_ GUARDED_BY(allocation_mutex_);
+  // A map from device ordinal to AllocationMap.
+  tensorflow::gtl::FlatMap<int, AllocationMap> opaque_to_allocation_map_
+      GUARDED_BY(mutex_);
 
-  // Mapping from GlobalDataHandle handle to the corresponding registered
-  // Allocation object.
-  std::map<int64, std::unique_ptr<Allocation>> handle_to_allocation_
-      GUARDED_BY(allocation_mutex_);
+  // A map from data handle to ShapedBuffer.
+  tensorflow::gtl::FlatMap<int64, std::unique_ptr<ShapedBuffer>>
+      handle_to_shaped_buffer_ GUARDED_BY(mutex_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(AllocationTracker);
 };
diff --git a/tensorflow/compiler/xla/service/batchnorm_rewriter.cc b/tensorflow/compiler/xla/service/batchnorm_expander.cc
similarity index 58%
rename from tensorflow/compiler/xla/service/batchnorm_rewriter.cc
rename to tensorflow/compiler/xla/service/batchnorm_expander.cc
index c6193b3fbbd651088a823605af3ba84bca4a77ee..27ddfd47aa3096afd3e245af1ac3cedd9b48ce4a 100644
--- a/tensorflow/compiler/xla/service/batchnorm_rewriter.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/batchnorm_rewriter.h"
+#include "tensorflow/compiler/xla/service/batchnorm_expander.h"
 
 #include <algorithm>
 #include <memory>
@@ -45,9 +45,9 @@ limitations under the License.
 
 namespace xla {
 
-// BatchNormRewriterVisitor traverses the HLO computation and rewrites BatchNorm
+// BatchNormExpanderVisitor traverses the HLO computation and rewrites BatchNorm
 // operations into smaller operations.
-class BatchNormRewriterVisitor : public DfsHloVisitorWithDefault {
+class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
  public:
   // Default visitor action is to do nothing and return OK.
   Status DefaultAction(HloInstruction* /*hlo_instruction*/) override {
@@ -68,10 +68,10 @@ class BatchNormRewriterVisitor : public DfsHloVisitorWithDefault {
   // Returns whether any batch norm ops were rewritten.
   const bool changed() const { return changed_; }
 
-  ~BatchNormRewriterVisitor() override = default;
+  ~BatchNormExpanderVisitor() override = default;
 
  private:
-  explicit BatchNormRewriterVisitor(HloComputation* computation,
+  explicit BatchNormExpanderVisitor(HloComputation* computation,
                                     bool rewrite_training_op,
                                     bool rewrite_inference_op,
                                     bool rewrite_grad_op, bool use_fusion)
@@ -94,7 +94,7 @@ class BatchNormRewriterVisitor : public DfsHloVisitorWithDefault {
     return computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op));
   }
 
-  // Current HloComputation instance the BatchNormRewriter is
+  // Current HloComputation instance the BatchNormExpander is
   // traversing.
   HloComputation* computation_;
 
@@ -130,11 +130,11 @@ class BatchNormRewriterVisitor : public DfsHloVisitorWithDefault {
   }
 };
 
-bool BatchNormRewriterVisitor::Run(HloComputation* computation,
+bool BatchNormExpanderVisitor::Run(HloComputation* computation,
                                    bool rewrite_training_op,
                                    bool rewrite_inference_op,
                                    bool rewrite_grad_op, bool use_fusion) {
-  BatchNormRewriterVisitor visitor(
+  BatchNormExpanderVisitor visitor(
       computation,
       /*rewrite_training_op=*/rewrite_training_op,
       /*rewrite_inference_op=*/rewrite_inference_op,
@@ -144,11 +144,20 @@ bool BatchNormRewriterVisitor::Run(HloComputation* computation,
   return visitor.changed_;
 }
 
-Status BatchNormRewriterVisitor::HandleBatchNormTraining(
+Status BatchNormExpanderVisitor::HandleBatchNormTraining(
     HloInstruction* batch_norm) {
   if (!rewrite_training_op_) {
     return Status::OK();
   }
+
+  std::vector<HloInstruction*> added_instructions;
+  auto add = [&](std::unique_ptr<HloInstruction> inst) {
+    HloInstruction* added_inst = computation_->AddInstruction(std::move(inst));
+    added_instructions.push_back(added_inst);
+    return added_inst;
+  };
+  int64 instruction_count_before = computation_->instruction_count();
+
   // Expand batch norm training into smaller HLO ops.
   HloInstruction* operand = batch_norm->mutable_operand(0);
   const Shape operand_shape = operand->shape();
@@ -160,7 +169,7 @@ Status BatchNormRewriterVisitor::HandleBatchNormTraining(
       Literal::CreateR0<float>(size_in_elements / feature_count);
   TF_ASSIGN_OR_RETURN(elements_per_feature_literal,
                       elements_per_feature_literal->Convert(ptype));
-  auto elements_per_feature = computation_->AddInstruction(
+  auto elements_per_feature = add(
       HloInstruction::CreateConstant(std::move(elements_per_feature_literal)));
 
   HloInstruction* scale = batch_norm->mutable_operand(1);
@@ -169,14 +178,12 @@ Status BatchNormRewriterVisitor::HandleBatchNormTraining(
 
   auto zero_literal = Literal::CreateR0(0.0f);
   TF_ASSIGN_OR_RETURN(zero_literal, zero_literal->Convert(ptype));
-  auto zero = computation_->AddInstruction(
-      HloInstruction::CreateConstant(std::move(zero_literal)));
+  auto zero = add(HloInstruction::CreateConstant(std::move(zero_literal)));
 
   auto epsilon_literal = Literal::CreateR0(batch_norm->epsilon());
   TF_ASSIGN_OR_RETURN(epsilon_literal, epsilon_literal->Convert(ptype));
-  auto epsilon = computation_->AddInstruction(
-      HloInstruction::CreateConstant(std::move(epsilon_literal)));
-
+  auto epsilon =
+      add(HloInstruction::CreateConstant(std::move(epsilon_literal)));
   std::vector<int64> dimensions_without_feature;
 
   for (int64 i = 0; i < ShapeUtil::Rank(operand_shape); ++i) {
@@ -185,109 +192,116 @@ Status BatchNormRewriterVisitor::HandleBatchNormTraining(
     }
   }
 
-  auto scale_broadcasted = computation_->AddInstruction(
+  auto scale_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, scale, {feature_index}));
 
-  auto offset_broadcasted = computation_->AddInstruction(
+  auto offset_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, offset, {feature_index}));
 
   HloComputation* add_reduce_computation =
       GetScalarBinaryComputation(ptype, HloOpcode::kAdd);
 
   // X^2.
-  auto operand_squared =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kMultiply, operand, operand));
+  auto operand_squared = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kMultiply, operand, operand));
   // Sum[X].
-  auto sum = computation_->AddInstruction(HloInstruction::CreateReduce(
-      feature_shape, operand, zero, dimensions_without_feature,
-      add_reduce_computation));
+  auto sum = add(HloInstruction::CreateReduce(feature_shape, operand, zero,
+                                              dimensions_without_feature,
+                                              add_reduce_computation));
 
   // Sum[X^2].
-  auto squared_sum = computation_->AddInstruction(HloInstruction::CreateReduce(
+  auto squared_sum = add(HloInstruction::CreateReduce(
       feature_shape, operand_squared, zero, dimensions_without_feature,
       add_reduce_computation));
 
   // Fuse two parallel reduces together to improve performance.
-  if (use_fusion_) {
-    auto tuple = computation_->AddInstruction(
-        HloInstruction::CreateTuple({sum, squared_sum}));
+  if (use_fusion_ && !batch_norm->has_sharding()) {
+    auto tuple = add(HloInstruction::CreateTuple({sum, squared_sum}));
 
     auto fused = computation_->CreateFusionInstruction(
         {tuple, sum, squared_sum, operand_squared},
         HloInstruction::FusionKind::kInput);
 
-    sum = computation_->AddInstruction(
-        HloInstruction::CreateGetTupleElement(feature_shape, fused, 0));
+    sum = add(HloInstruction::CreateGetTupleElement(feature_shape, fused, 0));
 
-    squared_sum = computation_->AddInstruction(
-        HloInstruction::CreateGetTupleElement(feature_shape, fused, 1));
+    squared_sum =
+        add(HloInstruction::CreateGetTupleElement(feature_shape, fused, 1));
   }
 
   // E[X].
-  auto mean = computation_->AddInstruction(HloInstruction::CreateBinary(
+  auto mean = add(HloInstruction::CreateBinary(
       feature_shape, HloOpcode::kDivide, sum, elements_per_feature));
 
-  auto mean_broadcasted = computation_->AddInstruction(
+  auto mean_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, mean, {feature_index}));
 
   // E[X^2].
-  auto square_mean = computation_->AddInstruction(HloInstruction::CreateBinary(
+  auto square_mean = add(HloInstruction::CreateBinary(
       feature_shape, HloOpcode::kDivide, squared_sum, elements_per_feature));
 
   // E^2[X].
-  auto mean_square = computation_->AddInstruction(HloInstruction::CreateBinary(
+  auto mean_square = add(HloInstruction::CreateBinary(
       feature_shape, HloOpcode::kMultiply, mean, mean));
 
   // Var[X].
-  auto var = computation_->AddInstruction(HloInstruction::CreateBinary(
+  auto var = add(HloInstruction::CreateBinary(
       feature_shape, HloOpcode::kSubtract, square_mean, mean_square));
 
-  auto var_broadcasted = computation_->AddInstruction(
-      HloInstruction::CreateBroadcast(operand_shape, var, {feature_index}));
+  auto var_broadcasted =
+      add(HloInstruction::CreateBroadcast(operand_shape, var, {feature_index}));
 
   // Var[X] + epsilon.
-  auto var_add_epsilon =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon));
+  auto var_add_epsilon = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon));
 
   auto neg_half_literal = Literal::CreateR0(-0.5f);
   TF_ASSIGN_OR_RETURN(neg_half_literal, neg_half_literal->Convert(ptype));
-  auto neg_half = computation_->AddInstruction(
-      HloInstruction::CreateConstant(std::move(neg_half_literal)));
+  auto neg_half =
+      add(HloInstruction::CreateConstant(std::move(neg_half_literal)));
 
   // 1 / Sqrt[Var[X] + epsilon].
-  auto rsqrt_var_add_epsilon =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kPower, var_add_epsilon, neg_half));
+  auto rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kPower, var_add_epsilon, neg_half));
 
   // X - E[X].
-  auto operand_minus_mean =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kSubtract, operand, mean_broadcasted));
+  auto operand_minus_mean = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kSubtract, operand, mean_broadcasted));
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon].
-  auto normalized = computation_->AddInstruction(
+  auto normalized = add(
       HloInstruction::CreateBinary(operand_shape, HloOpcode::kMultiply,
                                    operand_minus_mean, rsqrt_var_add_epsilon));
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale.
-  auto scaled_normalized =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kMultiply, normalized, scale_broadcasted));
+  auto scaled_normalized = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kMultiply, normalized, scale_broadcasted));
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale + offset.
-  auto shifted_normalized = computation_->AddInstruction(
-      HloInstruction::CreateBinary(operand_shape, HloOpcode::kAdd,
-                                   scaled_normalized, offset_broadcasted));
-
-  TF_CHECK_OK(ReplaceWithNewInstruction(
-      batch_norm,
-      HloInstruction::CreateTuple({shifted_normalized, mean, var})));
+  auto shifted_normalized = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kAdd, scaled_normalized, offset_broadcasted));
+
+  auto tuple = HloInstruction::CreateTuple({shifted_normalized, mean, var});
+
+  if (batch_norm->has_sharding()) {
+    int64 instruction_count_after = computation_->instruction_count();
+    CHECK_EQ(instruction_count_after,
+             instruction_count_before + added_instructions.size());
+    HloSharding operand_sharding =
+        batch_norm->sharding().GetAsShapeTree(batch_norm->shape()).element({0});
+    for (HloInstruction* inst : added_instructions) {
+      if (ShapeUtil::Equal(inst->shape(), operand_shape)) {
+        inst->set_sharding(operand_sharding);
+      } else {
+        inst->set_sharding(HloSharding::Replicate());
+      }
+    }
+    tuple->set_sharding(batch_norm->sharding());
+  }
+  TF_CHECK_OK(ReplaceWithNewInstruction(batch_norm, std::move(tuple)));
   return Status::OK();
 }
 
-Status BatchNormRewriterVisitor::HandleBatchNormInference(
+Status BatchNormExpanderVisitor::HandleBatchNormInference(
     HloInstruction* batch_norm) {
   if (!rewrite_inference_op_) {
     return Status::OK();
@@ -317,58 +331,75 @@ Status BatchNormRewriterVisitor::HandleBatchNormInference(
     }
   }
 
-  auto scale_broadcasted = computation_->AddInstruction(
+  std::vector<HloInstruction*> added_instructions;
+  auto add = [&](std::unique_ptr<HloInstruction> inst) {
+    HloInstruction* added_inst = computation_->AddInstruction(std::move(inst));
+    added_instructions.push_back(added_inst);
+    return added_inst;
+  };
+  int64 instruction_count_before = computation_->instruction_count();
+
+  auto scale_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, scale, {feature_index}));
 
-  auto offset_broadcasted = computation_->AddInstruction(
+  auto offset_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, offset, {feature_index}));
 
-  auto mean_broadcasted = computation_->AddInstruction(
+  auto mean_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, mean, {feature_index}));
 
-  auto var_broadcasted = computation_->AddInstruction(
-      HloInstruction::CreateBroadcast(operand_shape, var, {feature_index}));
+  auto var_broadcasted =
+      add(HloInstruction::CreateBroadcast(operand_shape, var, {feature_index}));
 
   // Var[X] + epsilon.
-  auto var_add_epsilon =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon));
+  auto var_add_epsilon = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon));
 
   auto neg_half_literal = Literal::CreateR0(-0.5f);
   TF_ASSIGN_OR_RETURN(neg_half_literal, neg_half_literal->Convert(ptype));
-  auto neg_half = computation_->AddInstruction(
-      HloInstruction::CreateConstant(std::move(neg_half_literal)));
+  auto neg_half =
+      add(HloInstruction::CreateConstant(std::move(neg_half_literal)));
 
   // 1 / Sqrt[Var[X] + epsilon].
-  auto rsqrt_var_add_epsilon =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kPower, var_add_epsilon, neg_half));
+  auto rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kPower, var_add_epsilon, neg_half));
 
   // X - E[X].
-  auto operand_minus_mean =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kSubtract, operand, mean_broadcasted));
+  auto operand_minus_mean = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kSubtract, operand, mean_broadcasted));
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon].
-  auto normalized = computation_->AddInstruction(
+  auto normalized = add(
       HloInstruction::CreateBinary(operand_shape, HloOpcode::kMultiply,
                                    operand_minus_mean, rsqrt_var_add_epsilon));
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale.
-  auto scaled_normalized =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kMultiply, normalized, scale_broadcasted));
+  auto scaled_normalized = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kMultiply, normalized, scale_broadcasted));
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale + offset.
   auto shifted_normalized = HloInstruction::CreateBinary(
       operand_shape, HloOpcode::kAdd, scaled_normalized, offset_broadcasted);
 
+  int64 instruction_count_after = computation_->instruction_count();
+  CHECK_EQ(instruction_count_after,
+           instruction_count_before + added_instructions.size());
+  if (batch_norm->has_sharding()) {
+    for (HloInstruction* inst : added_instructions) {
+      if (ShapeUtil::Equal(inst->shape(), operand_shape)) {
+        inst->set_sharding(batch_norm->sharding());
+      } else {
+        inst->set_sharding(HloSharding::Replicate());
+      }
+    }
+    shifted_normalized->set_sharding(batch_norm->sharding());
+  }
   TF_CHECK_OK(
       ReplaceWithNewInstruction(batch_norm, std::move(shifted_normalized)));
   return Status::OK();
 }
 
-Status BatchNormRewriterVisitor::HandleBatchNormGrad(
+Status BatchNormExpanderVisitor::HandleBatchNormGrad(
     HloInstruction* batch_norm) {
   // Use the following formulas to calculate gradients:
   // scale_grad =
@@ -385,6 +416,13 @@ Status BatchNormRewriterVisitor::HandleBatchNormGrad(
   if (!rewrite_grad_op_) {
     return Status::OK();
   }
+  std::vector<HloInstruction*> added_instructions;
+  auto add = [&](std::unique_ptr<HloInstruction> inst) {
+    HloInstruction* added_inst = computation_->AddInstruction(std::move(inst));
+    added_instructions.push_back(added_inst);
+    return added_inst;
+  };
+  int64 instruction_count_before = computation_->instruction_count();
 
   HloInstruction* activation = batch_norm->mutable_operand(0);
   const Shape activation_shape = activation->shape();
@@ -403,23 +441,22 @@ Status BatchNormRewriterVisitor::HandleBatchNormGrad(
       Literal::CreateR0<float>(size_in_elements / feature_count);
   TF_ASSIGN_OR_RETURN(elements_per_feature_literal,
                       elements_per_feature_literal->Convert(ptype));
-  auto elements_per_feature = computation_->AddInstruction(
+  auto elements_per_feature = add(
       HloInstruction::CreateConstant(std::move(elements_per_feature_literal)));
 
   auto zero_literal = Literal::CreateR0(0.0f);
   TF_ASSIGN_OR_RETURN(zero_literal, zero_literal->Convert(ptype));
-  auto zero = computation_->AddInstruction(
-      HloInstruction::CreateConstant(std::move(zero_literal)));
+  auto zero = add(HloInstruction::CreateConstant(std::move(zero_literal)));
 
   auto neg_half_literal = Literal::CreateR0(-0.5f);
   TF_ASSIGN_OR_RETURN(neg_half_literal, neg_half_literal->Convert(ptype));
-  auto neg_half = computation_->AddInstruction(
-      HloInstruction::CreateConstant(std::move(neg_half_literal)));
+  auto neg_half =
+      add(HloInstruction::CreateConstant(std::move(neg_half_literal)));
 
   auto epsilon_literal = Literal::CreateR0(batch_norm->epsilon());
   TF_ASSIGN_OR_RETURN(epsilon_literal, epsilon_literal->Convert(ptype));
-  auto epsilon = computation_->AddInstruction(
-      HloInstruction::CreateConstant(std::move(epsilon_literal)));
+  auto epsilon =
+      add(HloInstruction::CreateConstant(std::move(epsilon_literal)));
 
   std::vector<int64> dimensions_without_feature;
 
@@ -429,141 +466,148 @@ Status BatchNormRewriterVisitor::HandleBatchNormGrad(
     }
   }
 
-  auto scale_broadcasted =
-      computation_->AddInstruction(HloInstruction::CreateBroadcast(
-          activation_shape, scale, {feature_index}));
-  auto variance_broadcasted =
-      computation_->AddInstruction(HloInstruction::CreateBroadcast(
-          activation_shape, variance, {feature_index}));
+  auto scale_broadcasted = add(HloInstruction::CreateBroadcast(
+      activation_shape, scale, {feature_index}));
+  auto variance_broadcasted = add(HloInstruction::CreateBroadcast(
+      activation_shape, variance, {feature_index}));
 
   // E[X].
-  auto mean_broadcasted = computation_->AddInstruction(
+  auto mean_broadcasted = add(
       HloInstruction::CreateBroadcast(activation_shape, mean, {feature_index}));
 
   // rsqrt[Var[X] + epsilon].
-  auto rsqrt_var_add_epsilon_broadcasted =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          activation_shape, HloOpcode::kPower,
-          computation_->AddInstruction(
-              HloInstruction::CreateBinary(activation_shape, HloOpcode::kAdd,
-                                           variance_broadcasted, epsilon)),
-          neg_half));
-
-  auto rsqrt_var_add_epsilon =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          feature_shape, HloOpcode::kPower,
-          computation_->AddInstruction(HloInstruction::CreateBinary(
-              feature_shape, HloOpcode::kAdd, variance, epsilon)),
-          neg_half));
+  auto rsqrt_var_add_epsilon_broadcasted = add(HloInstruction::CreateBinary(
+      activation_shape, HloOpcode::kPower,
+      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kAdd,
+                                       variance_broadcasted, epsilon)),
+      neg_half));
+
+  auto rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary(
+      feature_shape, HloOpcode::kPower,
+      add(HloInstruction::CreateBinary(feature_shape, HloOpcode::kAdd, variance,
+                                       epsilon)),
+      neg_half));
 
   // X - E[X].
-  auto activation_minus_mean = computation_->AddInstruction(
-      HloInstruction::CreateBinary(activation_shape, HloOpcode::kSubtract,
-                                   activation, mean_broadcasted));
+  auto activation_minus_mean = add(HloInstruction::CreateBinary(
+      activation_shape, HloOpcode::kSubtract, activation, mean_broadcasted));
 
   // Grad[Y] * (X - E[X]).
-  auto grad_output_times_activiation_minus_mean = computation_->AddInstruction(
-      HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply,
-                                   grad_output, activation_minus_mean));
+  auto grad_output_times_activiation_minus_mean =
+      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply,
+                                       grad_output, activation_minus_mean));
 
   HloComputation* add_reduce_computation =
       GetScalarBinaryComputation(ptype, HloOpcode::kAdd);
 
   // sum(Grad[Y] * (X - E[X])).
   auto sum_grad_output_times_activiation_minus_mean =
-      computation_->AddInstruction(HloInstruction::CreateReduce(
+      add(HloInstruction::CreateReduce(
           feature_shape, grad_output_times_activiation_minus_mean, zero,
           dimensions_without_feature, add_reduce_computation));
 
   // Grad[beta] = Sum(Grad[Y]).
-  auto grad_beta = computation_->AddInstruction(HloInstruction::CreateReduce(
+  auto grad_beta = add(HloInstruction::CreateReduce(
       feature_shape, grad_output, zero, dimensions_without_feature,
       add_reduce_computation));
 
-  if (use_fusion_) {
-    auto tuple = computation_->AddInstruction(HloInstruction::CreateTuple(
+  if (use_fusion_ && !batch_norm->has_sharding()) {
+    auto tuple = add(HloInstruction::CreateTuple(
         {sum_grad_output_times_activiation_minus_mean, grad_beta}));
 
     auto fused = computation_->CreateFusionInstruction(
         {tuple, sum_grad_output_times_activiation_minus_mean, grad_beta},
         HloInstruction::FusionKind::kInput);
 
-    sum_grad_output_times_activiation_minus_mean = computation_->AddInstruction(
-        HloInstruction::CreateGetTupleElement(feature_shape, fused, 0));
+    sum_grad_output_times_activiation_minus_mean =
+        add(HloInstruction::CreateGetTupleElement(feature_shape, fused, 0));
 
-    grad_beta = computation_->AddInstruction(
-        HloInstruction::CreateGetTupleElement(feature_shape, fused, 1));
+    grad_beta =
+        add(HloInstruction::CreateGetTupleElement(feature_shape, fused, 1));
   }
 
   // Grad[scale] = Sum(Grad[Y] * (X - E[X]) * rsqrt[Var[X] + epsilon]).
-  auto grad_scale = computation_->AddInstruction(HloInstruction::CreateBinary(
+  auto grad_scale = add(HloInstruction::CreateBinary(
       feature_shape, HloOpcode::kMultiply,
       sum_grad_output_times_activiation_minus_mean, rsqrt_var_add_epsilon));
 
   // I2 = Sum(Grad[Y])
-  auto I2 = computation_->AddInstruction(HloInstruction::CreateBroadcast(
-      activation_shape, grad_beta, {feature_index}));
+  auto i2 = add(HloInstruction::CreateBroadcast(activation_shape, grad_beta,
+                                                {feature_index}));
 
   // I3 = Sum(Grad[Y] * (X - E[X]))
-  auto I3 = computation_->AddInstruction(HloInstruction::CreateBroadcast(
+  auto i3 = add(HloInstruction::CreateBroadcast(
       activation_shape, sum_grad_output_times_activiation_minus_mean,
       {feature_index}));
 
   // I4 = (X - E[X]) * I3
-  auto I4 = computation_->AddInstruction(HloInstruction::CreateBinary(
-      activation_shape, HloOpcode::kMultiply, I3, activation_minus_mean));
+  auto i4 = add(HloInstruction::CreateBinary(
+      activation_shape, HloOpcode::kMultiply, i3, activation_minus_mean));
 
   // I5 = I4 / (Var[X] + epsilon)
-  auto I5 = computation_->AddInstruction(HloInstruction::CreateBinary(
-      activation_shape, HloOpcode::kDivide, I4,
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          activation_shape, HloOpcode::kAdd, variance_broadcasted, epsilon))));
+  auto i5 = add(HloInstruction::CreateBinary(
+      activation_shape, HloOpcode::kDivide, i4,
+      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kAdd,
+                                       variance_broadcasted, epsilon))));
 
   // scale * rsqrt[Var[X] + epsilon] * 1/N
-  auto scale_times_rsqrt_var_add_epsilon =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          activation_shape, HloOpcode::kMultiply, scale_broadcasted,
-          rsqrt_var_add_epsilon_broadcasted));
+  auto scale_times_rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary(
+      activation_shape, HloOpcode::kMultiply, scale_broadcasted,
+      rsqrt_var_add_epsilon_broadcasted));
 
-  scale_times_rsqrt_var_add_epsilon =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          activation_shape, HloOpcode::kDivide,
-          scale_times_rsqrt_var_add_epsilon, elements_per_feature));
+  scale_times_rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary(
+      activation_shape, HloOpcode::kDivide, scale_times_rsqrt_var_add_epsilon,
+      elements_per_feature));
 
-  auto I1 = computation_->AddInstruction(
-      HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply,
-                                   grad_output, elements_per_feature));
+  auto i1 =
+      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply,
+                                       grad_output, elements_per_feature));
 
   // I6 = I1 - I2 - I5
-  auto I6 = computation_->AddInstruction(HloInstruction::CreateBinary(
+  auto i6 = add(HloInstruction::CreateBinary(
       activation_shape, HloOpcode::kSubtract,
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          activation_shape, HloOpcode::kSubtract, I1, I2)),
-      I5));
+      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kSubtract,
+                                       i1, i2)),
+      i5));
 
   // Grad[X] = scale * rsqrt[Var[X] + epsilon] * 1/N * I6.
-  auto grad_activation = computation_->AddInstruction(
-      HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply,
-                                   scale_times_rsqrt_var_add_epsilon, I6));
+  auto grad_activation =
+      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply,
+                                       scale_times_rsqrt_var_add_epsilon, i6));
+  auto tuple =
+      HloInstruction::CreateTuple({grad_activation, grad_scale, grad_beta});
+  if (batch_norm->has_sharding()) {
+    int64 instruction_count_after = computation_->instruction_count();
+    CHECK_EQ(instruction_count_after,
+             instruction_count_before + added_instructions.size());
+    HloSharding activation_sharding =
+        batch_norm->sharding().GetAsShapeTree(batch_norm->shape()).element({0});
+    for (HloInstruction* inst : added_instructions) {
+      if (ShapeUtil::Equal(inst->shape(), activation_shape)) {
+        inst->set_sharding(activation_sharding);
+      } else {
+        inst->set_sharding(HloSharding::Replicate());
+      }
+    }
+    tuple->set_sharding(batch_norm->sharding());
+  }
 
-  TF_CHECK_OK(ReplaceWithNewInstruction(
-      batch_norm,
-      HloInstruction::CreateTuple({grad_activation, grad_scale, grad_beta})));
+  TF_CHECK_OK(ReplaceWithNewInstruction(batch_norm, std::move(tuple)));
 
   return Status::OK();
 }
 
-StatusOr<bool> BatchNormRewriter::Run(HloModule* module) {
-  XLA_VLOG_LINES(2, "BatchNormRewriter::Run(), before:\n" + module->ToString());
+StatusOr<bool> BatchNormExpander::Run(HloModule* module) {
+  XLA_VLOG_LINES(2, "BatchNormExpander::Run(), before:\n" + module->ToString());
   bool changed = false;
   for (auto* comp : module->MakeNonfusionComputations()) {
-    if (BatchNormRewriterVisitor::Run(comp, rewrite_training_op_,
+    if (BatchNormExpanderVisitor::Run(comp, rewrite_training_op_,
                                       rewrite_inference_op_, rewrite_grad_op_,
                                       use_fusion_)) {
       changed = true;
     }
   }
-  XLA_VLOG_LINES(2, "BatchNormRewriter::Run(), after:\n" + module->ToString());
+  XLA_VLOG_LINES(2, "BatchNormExpander::Run(), after:\n" + module->ToString());
   return changed;
 }
 
diff --git a/tensorflow/compiler/xla/service/batchnorm_rewriter.h b/tensorflow/compiler/xla/service/batchnorm_expander.h
similarity index 83%
rename from tensorflow/compiler/xla/service/batchnorm_rewriter.h
rename to tensorflow/compiler/xla/service/batchnorm_expander.h
index f601741d964376058a2bafade311ede4c8567fd2..4ad987085da91684bb7891070afeefd19be4138f 100644
--- a/tensorflow/compiler/xla/service/batchnorm_rewriter.h
+++ b/tensorflow/compiler/xla/service/batchnorm_expander.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_BATCHNORM_REWRITER_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_BATCHNORM_REWRITER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_BATCHNORM_EXPANDER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_BATCHNORM_EXPANDER_H_
 
 #include <utility>
 
@@ -26,18 +26,18 @@ namespace xla {
 // A pass which rewrites batch norm operations into more operations. Breaking a
 // big operation into smaller operations helps leverage our generic fusion
 // logic.
-class BatchNormRewriter : public HloPassInterface {
+class BatchNormExpander : public HloPassInterface {
  public:
   // When use_fusion is set, a multi-output fusion node is created.
-  BatchNormRewriter(bool rewrite_training_op = false,
+  BatchNormExpander(bool rewrite_training_op = false,
                     bool rewrite_inference_op = false,
                     bool rewrite_grad_op = false, bool use_fusion = true)
       : rewrite_training_op_(rewrite_training_op),
         rewrite_inference_op_(rewrite_inference_op),
         rewrite_grad_op_(rewrite_grad_op),
         use_fusion_(use_fusion) {}
-  ~BatchNormRewriter() = default;
-  tensorflow::StringPiece name() const override { return "batchnorm_rewriter"; }
+  ~BatchNormExpander() = default;
+  tensorflow::StringPiece name() const override { return "batchnorm_expander"; }
 
   // Run operation expander on the given computation. Returns whether the
   // computation was changed.
@@ -52,4 +52,4 @@ class BatchNormRewriter : public HloPassInterface {
 
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_BATCHNORM_REWRITER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_BATCHNORM_EXPANDER_H_
diff --git a/tensorflow/compiler/xla/service/batchnorm_rewriter_test.cc b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc
similarity index 93%
rename from tensorflow/compiler/xla/service/batchnorm_rewriter_test.cc
rename to tensorflow/compiler/xla/service/batchnorm_expander_test.cc
index 590f79aee51ccf410823b91fd8ad09fc7c429c7d..aa36e64b07099a372dab67babc7a18a2d39596bc 100644
--- a/tensorflow/compiler/xla/service/batchnorm_rewriter_test.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/batchnorm_rewriter.h"
+#include "tensorflow/compiler/xla/service/batchnorm_expander.h"
 
 #include <memory>
 #include <utility>
@@ -36,10 +36,10 @@ limitations under the License.
 namespace xla {
 namespace {
 
-using BatchNormRewriterTest = HloTestBase;
+using BatchNormExpanderTest = HloTestBase;
 
 // Test that we expand BatchNormTraining.
-TEST_F(BatchNormRewriterTest, BatchNormTraining) {
+TEST_F(BatchNormExpanderTest, BatchNormTraining) {
   Shape input_shape = ShapeUtil::MakeShape(F32, {2, 2, 2, 2});
   Shape scale_shape = ShapeUtil::MakeShape(F32, {2});
   Shape offset_shape = ShapeUtil::MakeShape(F32, {2});
@@ -63,7 +63,7 @@ TEST_F(BatchNormRewriterTest, BatchNormTraining) {
   auto computation = module->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kBatchNormTraining);
-  BatchNormRewriter rewriter(/*rewrite_training_op=*/true,
+  BatchNormExpander rewriter(/*rewrite_training_op=*/true,
                              /*rewrite_inference_op=*/true,
                              /*rewrite_grad_op=*/true);
   ASSERT_TRUE(rewriter.Run(module.get()).ValueOrDie());
@@ -73,7 +73,7 @@ TEST_F(BatchNormRewriterTest, BatchNormTraining) {
 }
 
 // Test that we expand BatchNormGrad.
-TEST_F(BatchNormRewriterTest, BatchNormGrad) {
+TEST_F(BatchNormExpanderTest, BatchNormGrad) {
   Shape input_shape = ShapeUtil::MakeShape(F32, {2, 2, 2, 2});
   Shape scale_shape = ShapeUtil::MakeShape(F32, {2});
   Shape mean_shape = ShapeUtil::MakeShape(F32, {2});
@@ -105,7 +105,7 @@ TEST_F(BatchNormRewriterTest, BatchNormGrad) {
   auto computation = module->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kBatchNormGrad);
-  BatchNormRewriter rewriter(/*rewrite_training_op=*/true,
+  BatchNormExpander rewriter(/*rewrite_training_op=*/true,
                              /*rewrite_inference_op=*/true,
                              /*rewrite_grad_op=*/true);
   ASSERT_TRUE(rewriter.Run(module.get()).ValueOrDie());
diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cde990e176ddb57a8e93ecc3c60260b2dbae32a8
--- /dev/null
+++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc
@@ -0,0 +1,184 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/bfloat16_conversion_folding.h"
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+class BFloat16ConversionFoldingVisitor : public DfsHloVisitorWithDefault {
+ public:
+  explicit BFloat16ConversionFoldingVisitor(
+      HloComputation* computation, const BFloat16Support* bfloat16_support)
+      : computation_(computation), bfloat16_support_(bfloat16_support) {}
+
+  Status DefaultAction(HloInstruction* hlo) override;
+
+  static bool Run(HloComputation* computation,
+                  const BFloat16Support* bfloat16_support) {
+    BFloat16ConversionFoldingVisitor visitor(computation, bfloat16_support);
+    TF_CHECK_OK(computation->Accept(&visitor));
+    return visitor.changed_;
+  }
+
+ private:
+  // Checks if the HLO has a BF16 -> F32 conversion as input, or a F32 -> BF16
+  // conversion as output, and folds them to the HLO itself if feasible.
+  Status TryFoldBF16Conversions(HloInstruction* hlo);
+
+  // Folds the F32 -> BF16 conversions from the HLO's output.
+  //
+  // Precondition: all of the HLO's users are F32 -> BF16 conversions.
+  Status FoldOutputConversions(HloInstruction* hlo);
+
+  // Folds the BF16 -> F32 conversion operand to the HLO.
+  //
+  // Precondition: the operand is a F32 -> BF16 conversion.
+  Status FoldOperandConversion(HloInstruction* hlo, int64 operand_index);
+
+  HloComputation* computation_;
+  const BFloat16Support* bfloat16_support_;
+  bool changed_ = false;
+};
+
+Status BFloat16ConversionFoldingVisitor::FoldOutputConversions(
+    HloInstruction* hlo) {
+  std::vector<HloInstruction*> materialized_users = hlo->users();
+  hlo->mutable_shape()->set_element_type(BF16);
+  for (auto user : materialized_users) {
+    CHECK_EQ(user->opcode(), HloOpcode::kConvert);
+    TF_RETURN_IF_ERROR(user->ReplaceAllUsesWith(hlo));
+    changed_ = true;
+  }
+  return Status::OK();
+}
+
+Status BFloat16ConversionFoldingVisitor::FoldOperandConversion(
+    HloInstruction* hlo, int64 operand_index) {
+  // The operand is a convert from BF16 to F32.
+  auto operand = hlo->mutable_operand(operand_index);
+  CHECK_EQ(operand->opcode(), HloOpcode::kConvert);
+  TF_RETURN_IF_ERROR(
+      hlo->ReplaceOperandWith(operand_index, operand->mutable_operand(0)));
+  changed_ = true;
+  return Status::OK();
+}
+
+Status BFloat16ConversionFoldingVisitor::TryFoldBF16Conversions(
+    HloInstruction* hlo) {
+  std::vector<int64> bf16_to_f32_operands;
+  bool has_other_f32_operands = false;
+  for (int64 i = 0; i < hlo->operands().size(); ++i) {
+    auto operand = hlo->operand(i);
+    if (operand->shape().element_type() == F32) {
+      if (operand->opcode() == HloOpcode::kConvert &&
+          operand->operand(0)->shape().element_type() == BF16 &&
+          bfloat16_support_->SupportsBF16Operand(*hlo, i)) {
+        // Operand is a convert from BF16 to F32 and we support BF16 input
+        // directly in the current HLO at the operand index.
+        bf16_to_f32_operands.push_back(i);
+      } else {
+        has_other_f32_operands = true;
+      }
+      continue;
+    }
+  }
+
+  bool fold_output_conversion = hlo->user_count() > 0 &&
+                                hlo->shape().element_type() == F32 &&
+                                bfloat16_support_->SupportsBF16Output(*hlo) &&
+                                hlo != computation_->root_instruction();
+  if (fold_output_conversion) {
+    for (auto user : hlo->users()) {
+      if (user->opcode() == HloOpcode::kConvert &&
+          user->shape().element_type() == BF16) {
+        continue;
+      }
+      // We should not change the output type if any user is not a conversion
+      // from F32 to BF16.
+      fold_output_conversion = false;
+      break;
+    }
+  }
+
+  if (!bfloat16_support_->SupportsMixedPrecisions(*hlo)) {
+    if (has_other_f32_operands ||
+        (!fold_output_conversion && hlo->shape().element_type() == F32)) {
+      // Some of the operands/output will remain F32, but we cannot use mixed
+      // precisions, so we cannot do anything here.
+      return Status::OK();
+    }
+  }
+
+  if (fold_output_conversion) {
+    TF_RETURN_IF_ERROR(FoldOutputConversions(hlo));
+  }
+
+  for (int64 i : bf16_to_f32_operands) {
+    TF_RETURN_IF_ERROR(FoldOperandConversion(hlo, i));
+  }
+  return Status::OK();
+}
+
+Status BFloat16ConversionFoldingVisitor::DefaultAction(HloInstruction* hlo) {
+  // Do not fold BF16 conversions for instructions related to tuples, entry and
+  // exit of a computation, fusion, convert, and control flow.
+  if (hlo->opcode() == HloOpcode::kTuple ||            //
+      hlo->opcode() == HloOpcode::kGetTupleElement ||  //
+      hlo->opcode() == HloOpcode::kInfeed ||           //
+      hlo->opcode() == HloOpcode::kOutfeed ||          //
+      hlo->opcode() == HloOpcode::kConstant ||         //
+      hlo->opcode() == HloOpcode::kParameter ||        //
+      hlo->opcode() == HloOpcode::kFusion ||           //
+      hlo->opcode() == HloOpcode::kConvert ||          //
+      hlo->opcode() == HloOpcode::kCall ||             //
+      hlo->opcode() == HloOpcode::kCustomCall ||       //
+      hlo->opcode() == HloOpcode::kWhile ||            //
+      hlo->opcode() == HloOpcode::kConditional) {
+    return Status::OK();
+  }
+  if (hlo == computation_->root_instruction() &&
+      !bfloat16_support_->SupportsMixedPrecisions(*hlo)) {
+    // If hlo is the root instruction, we cannot change its output, so folding
+    // can only happen when it supports mixed precision so that we can change
+    // its operands.
+    return Status::OK();
+  }
+  return TryFoldBF16Conversions(hlo);
+}
+
+StatusOr<bool> BFloat16ConversionFolding::Run(HloModule* module) {
+  XLA_VLOG_LINES(
+      2, "BFloat16ConversionFolding::Run(), before:\n" + module->ToString());
+  bool changed = false;
+  for (auto* comp : module->MakeNonfusionComputations()) {
+    if (BFloat16ConversionFoldingVisitor::Run(comp, bfloat16_support_)) {
+      changed = true;
+    }
+  }
+  XLA_VLOG_LINES(
+      2, "BFloat16ConversionFolding::Run(), after:\n" + module->ToString());
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.h b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.h
new file mode 100644
index 0000000000000000000000000000000000000000..c9398387098fad84ba28735c30e426fedd9b0cb0
--- /dev/null
+++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.h
@@ -0,0 +1,52 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_BFLOAT16_CONVERSION_FOLDING_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_BFLOAT16_CONVERSION_FOLDING_H_
+
+#include "tensorflow/compiler/xla/service/bfloat16_support.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// A pass which folds F32 <-> BF16 conversions to their operands or users, when
+// it is supported by the backend.
+//
+// This pass follows the passed-in backend-specific BF16 support rules, but can
+// introduce mixed precision in individual HLOs which breaks the assumption of
+// some other HLO passes. So it should be used at the end of the HLO
+// optimization pipeline followed by a DCE pass. If other passes are needed
+// after this pass, run BFloat16MixedPrecisionRemoval first to undo some of the
+// changed made by this pass.
+class BFloat16ConversionFolding : public HloPassInterface {
+ public:
+  explicit BFloat16ConversionFolding(const BFloat16Support* bfloat16_support)
+      : bfloat16_support_(bfloat16_support) {}
+
+  ~BFloat16ConversionFolding() override = default;
+  tensorflow::StringPiece name() const override { return "bfloat16-fold"; }
+
+  // Run BF16 conversion folding on the given computation. Returns whether the
+  // computation was changed.
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  const BFloat16Support* bfloat16_support_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_BFLOAT16_CONVERSION_FOLDING_H_
diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cb37759439debf41a305ec7dccaa548e1bf234cd
--- /dev/null
+++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
@@ -0,0 +1,209 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/bfloat16_conversion_folding.h"
+#include "tensorflow/compiler/xla/service/bfloat16_support.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+
+class TestBFloat16Support : public BFloat16Support {
+ public:
+  TestBFloat16Support() {}
+  ~TestBFloat16Support() override {}
+
+  bool SupportsBF16Operand(const HloInstruction& hlo,
+                           int64 operand_index) const override {
+    if (hlo.opcode() == HloOpcode::kAdd ||
+        hlo.opcode() == HloOpcode::kSubtract ||
+        hlo.opcode() == HloOpcode::kTuple ||
+        hlo.opcode() == HloOpcode::kGetTupleElement) {
+      return true;
+    }
+    return false;
+  }
+
+  bool SupportsBF16Output(const HloInstruction& hlo) const override {
+    if (hlo.opcode() == HloOpcode::kAdd ||
+        hlo.opcode() == HloOpcode::kSubtract ||
+        hlo.opcode() == HloOpcode::kTuple ||
+        hlo.opcode() == HloOpcode::kGetTupleElement) {
+      return true;
+    }
+    return false;
+  }
+
+  bool SupportsMixedPrecisions(const HloInstruction& hlo) const override {
+    if (hlo.opcode() == HloOpcode::kAdd || hlo.opcode() == HloOpcode::kTuple ||
+        hlo.opcode() == HloOpcode::kGetTupleElement) {
+      return true;
+    }
+    return false;
+  }
+};
+
+class BFloat16ConversionFoldingTest : public HloTestBase {
+ protected:
+  bool FoldConversions(HloModule* module) {
+    TestBFloat16Support bfloat16_support_;
+    BFloat16ConversionFolding fold(&bfloat16_support_);
+    StatusOr<bool> result = fold.Run(module);
+    EXPECT_IS_OK(result.status());
+    return result.ValueOrDie();
+  }
+};
+
+TEST_F(BFloat16ConversionFoldingTest, FoldIfSupported) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape f32_shape = ShapeUtil::MakeShape(F32, {2, 4});
+  Shape bf16_shape = ShapeUtil::MakeShape(BF16, {2, 4});
+
+  HloInstruction* a = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32_shape, "a"));
+  HloInstruction* b = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, f32_shape, "b"));
+  HloInstruction* c = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, f32_shape, "c"));
+
+  HloInstruction* add0 = builder.AddInstruction(
+      HloInstruction::CreateBinary(f32_shape, HloOpcode::kAdd, a, b));
+  HloInstruction* convert0 =
+      builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, add0));
+  HloInstruction* convert1 = builder.AddInstruction(
+      HloInstruction::CreateConvert(f32_shape, convert0));
+
+  HloInstruction* add1 = builder.AddInstruction(
+      HloInstruction::CreateBinary(f32_shape, HloOpcode::kAdd, convert1, c));
+  builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, add1));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(FoldConversions(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), add1);
+  EXPECT_EQ(add0->shape().element_type(), BF16);
+  EXPECT_EQ(add1->shape().element_type(), BF16);
+  EXPECT_EQ(add1->operand(0), add0);
+}
+
+TEST_F(BFloat16ConversionFoldingTest, DoNotFoldIfUnsupported) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape f32_shape = ShapeUtil::MakeShape(F32, {2, 4});
+  Shape bf16_shape = ShapeUtil::MakeShape(BF16, {2, 4});
+
+  HloInstruction* a = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32_shape, "a"));
+  HloInstruction* b = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, f32_shape, "b"));
+  HloInstruction* c = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, f32_shape, "c"));
+
+  HloInstruction* mul0 = builder.AddInstruction(
+      HloInstruction::CreateBinary(f32_shape, HloOpcode::kMultiply, a, b));
+  HloInstruction* convert0 =
+      builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, mul0));
+  HloInstruction* convert1 = builder.AddInstruction(
+      HloInstruction::CreateConvert(f32_shape, convert0));
+
+  HloInstruction* mul1 = builder.AddInstruction(HloInstruction::CreateBinary(
+      f32_shape, HloOpcode::kMultiply, convert1, c));
+  HloInstruction* convert2 =
+      builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, mul1));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_FALSE(FoldConversions(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), convert2);
+  EXPECT_EQ(mul0->shape().element_type(), F32);
+  EXPECT_EQ(mul1->shape().element_type(), F32);
+  EXPECT_EQ(mul1->operand(0), convert1);
+}
+
+TEST_F(BFloat16ConversionFoldingTest, DoNotFoldUnsupportedMixedPrecision) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape f32_shape = ShapeUtil::MakeShape(F32, {2, 4});
+  Shape bf16_shape = ShapeUtil::MakeShape(BF16, {2, 4});
+
+  HloInstruction* a = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32_shape, "a"));
+  HloInstruction* b = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, f32_shape, "b"));
+  HloInstruction* c = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, f32_shape, "c"));
+
+  HloInstruction* sub0 = builder.AddInstruction(
+      HloInstruction::CreateBinary(f32_shape, HloOpcode::kSubtract, a, b));
+  HloInstruction* convert0 =
+      builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, sub0));
+  HloInstruction* convert1 = builder.AddInstruction(
+      HloInstruction::CreateConvert(f32_shape, convert0));
+
+  HloInstruction* sub1 = builder.AddInstruction(HloInstruction::CreateBinary(
+      f32_shape, HloOpcode::kSubtract, convert1, c));
+  HloInstruction* convert2 =
+      builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, sub1));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_FALSE(FoldConversions(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), convert2);
+  EXPECT_EQ(sub0->shape().element_type(), F32);
+  EXPECT_EQ(sub1->shape().element_type(), F32);
+  EXPECT_EQ(sub1->operand(0), convert1);
+}
+
+TEST_F(BFloat16ConversionFoldingTest, DoNotFoldTuple) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape f32_shape = ShapeUtil::MakeShape(F32, {2, 4});
+  Shape bf16_shape = ShapeUtil::MakeShape(BF16, {2, 4});
+
+  HloInstruction* a = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32_shape, "a"));
+  HloInstruction* b = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, bf16_shape, "b"));
+  HloInstruction* convert0 =
+      builder.AddInstruction(HloInstruction::CreateConvert(f32_shape, b));
+
+  HloInstruction* tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({a, convert0}));
+  HloInstruction* gte = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(f32_shape, tuple, 0));
+  HloInstruction* convert1 =
+      builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, gte));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_FALSE(FoldConversions(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), convert1);
+  EXPECT_EQ(gte->shape().element_type(), F32);
+  EXPECT_EQ(tuple->operand(1), convert0);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization.cc b/tensorflow/compiler/xla/service/bfloat16_normalization.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b032c040e8aff49f9e0fc1ff9a1c1e79ea4bb77f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/bfloat16_normalization.cc
@@ -0,0 +1,351 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/bfloat16_normalization.h"
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+class BFloat16NormalizationVisitor : public DfsHloVisitorWithDefault {
+ public:
+  explicit BFloat16NormalizationVisitor(HloComputation* computation,
+                                        const BFloat16Support* bfloat16_support)
+      : computation_(computation), bfloat16_support_(bfloat16_support) {}
+
+  Status DefaultAction(HloInstruction* hlo) override;
+
+  // Special handling for cross-replica-sum which can have a tuple output.
+  Status HandleCrossReplicaSum(HloInstruction* crs) override;
+
+  static bool Run(HloComputation* computation,
+                  const BFloat16Support* bfloat16_support) {
+    BFloat16NormalizationVisitor visitor(computation, bfloat16_support);
+    TF_CHECK_OK(computation->Accept(&visitor));
+    return visitor.changed_;
+  }
+
+ private:
+  // Checks if the HLO uses BF16 in an unsupported way, and if so, inserts
+  // conversions between F32 and BF16 to make it supported.
+  Status HandleInstruction(HloInstruction* hlo);
+
+  // Inserts a conversion HLO that changes the given HLO's output type.
+  Status InsertConvertAfterOutput(HloInstruction* hlo, PrimitiveType to,
+                                  HloComputation* computation);
+
+  // Changes the output type to the specified type, then inserts a conversion
+  // to the original type.
+  Status ChangeOutputTypeThenInsertConvertBack(HloInstruction* hlo,
+                                               PrimitiveType to,
+                                               HloComputation* computation);
+
+  // Inserts a conversion HLO that changes the given HLO's operand type.
+  Status InsertConvertBeforeOperand(HloInstruction* hlo, int64 operand_idx,
+                                    PrimitiveType to,
+                                    HloComputation* computation);
+
+  // Inserts conversion HLOs to replace the called computations' BF16
+  // operands/outputs to F32.
+  Status ConvertCalledComputations(
+      HloInstruction* hlo,
+      tensorflow::gtl::ArraySlice<HloComputation*> bf16_called_comps);
+
+  HloComputation* computation_;
+  const BFloat16Support* bfloat16_support_;
+  bool changed_ = false;
+};
+
+Status BFloat16NormalizationVisitor::InsertConvertAfterOutput(
+    HloInstruction* hlo, PrimitiveType to, HloComputation* computation) {
+  bool is_root = computation->root_instruction() == hlo;
+  std::vector<HloInstruction*> materialized_users = hlo->users();
+  // Use inst's shape temporarily, in order to pass checks in ReplaceUseWith.
+  auto convert = computation->AddInstruction(
+      HloInstruction::CreateConvert(hlo->shape(), hlo));
+  for (auto* user : materialized_users) {
+    TF_RETURN_IF_ERROR(hlo->ReplaceUseWith(user, convert));
+  }
+  if (is_root) {
+    computation->set_root_instruction(convert);
+  }
+  convert->mutable_shape()->set_element_type(to);
+  changed_ = true;
+  return Status::OK();
+}
+
+Status BFloat16NormalizationVisitor::ChangeOutputTypeThenInsertConvertBack(
+    HloInstruction* hlo, PrimitiveType to, HloComputation* computation) {
+  auto original_type = hlo->shape().element_type();
+  hlo->mutable_shape()->set_element_type(to);
+  return InsertConvertAfterOutput(hlo, original_type, computation);
+}
+
+Status BFloat16NormalizationVisitor::InsertConvertBeforeOperand(
+    HloInstruction* hlo, int64 operand_idx, PrimitiveType to,
+    HloComputation* computation) {
+  auto operand = hlo->mutable_operand(operand_idx);
+  auto convert = computation->AddInstruction(HloInstruction::CreateConvert(
+      ShapeUtil::ChangeElementType(operand->shape(), to), operand));
+  TF_RETURN_IF_ERROR(hlo->ReplaceOperandWith(operand_idx, convert));
+  changed_ = true;
+  return Status::OK();
+}
+
+Status BFloat16NormalizationVisitor::ConvertCalledComputations(
+    HloInstruction* hlo,
+    tensorflow::gtl::ArraySlice<HloComputation*> bf16_called_comps) {
+  std::map<HloComputation*, HloComputation*> cloned_computations;
+  for (auto& comp : bf16_called_comps) {
+    auto cloned = comp->parent()->AddEmbeddedComputation(comp->Clone());
+    cloned_computations[comp] = cloned;
+    changed_ = true;
+  }
+  hlo->ReplaceCalledComputations([&](HloComputation* comp) {
+    auto it = cloned_computations.find(comp);
+    if (it != cloned_computations.end()) {
+      return it->second;
+    }
+    return comp;
+  });
+  for (auto& comp_pair : cloned_computations) {
+    auto comp = comp_pair.second;
+    if (comp->root_instruction()->shape().element_type() == BF16) {
+      TF_RETURN_IF_ERROR(
+          InsertConvertAfterOutput(comp->root_instruction(), F32, comp));
+    }
+    for (auto* param : comp->parameter_instructions()) {
+      if (param->shape().element_type() == BF16) {
+        // This changes the parameter to F32 then inserts a convert after it.
+        TF_RETURN_IF_ERROR(
+            ChangeOutputTypeThenInsertConvertBack(param, F32, comp));
+      }
+    }
+  }
+  return Status::OK();
+}
+
+Status BFloat16NormalizationVisitor::HandleCrossReplicaSum(
+    HloInstruction* crs) {
+  if (!ShapeUtil::IsTuple(crs->shape())) {
+    return HandleInstruction(crs);
+  }
+
+  std::vector<PrimitiveType> operand_types(crs->operand_count());
+  std::vector<PrimitiveType> output_types(crs->operand_count());
+  bool has_f32 = false;
+  bool has_bf16 = false;
+  bool has_bf16_output = false;
+  for (int64 i = 0; i < crs->operand_count(); ++i) {
+    operand_types[i] = crs->operand(i)->shape().element_type();
+    output_types[i] = ShapeUtil::GetSubshape(crs->shape(), {i}).element_type();
+    if (operand_types[i] == F32 || output_types[i] == F32) {
+      has_f32 = true;
+    } else if (operand_types[i] == BF16) {
+      has_bf16 = true;
+    }
+    if (output_types[i] == BF16) {
+      has_bf16 = true;
+      has_bf16_output = true;
+    }
+  }
+
+  for (int64 i = 0; i < crs->operand_count(); ++i) {
+    if (operand_types[i] != BF16) {
+      continue;
+    }
+    if (bfloat16_support_->SupportsBF16Operand(*crs, i) &&
+        (bfloat16_support_->SupportsMixedPrecisions(*crs) || !has_f32)) {
+      continue;
+    }
+    TF_RETURN_IF_ERROR(InsertConvertBeforeOperand(crs, i, F32, computation_));
+    has_f32 = true;
+  }
+
+  if (!has_bf16_output) {
+    return Status::OK();
+  }
+
+  if (bfloat16_support_->SupportsBF16Output(*crs) &&
+      (bfloat16_support_->SupportsMixedPrecisions(*crs) || !has_f32)) {
+    return Status::OK();
+  }
+
+  std::vector<HloInstruction*> output_elements(crs->operand_count());
+  auto original_shape = crs->shape();
+  for (int64 i = 0; i < crs->operand_count(); ++i) {
+    auto subshape = ShapeUtil::GetMutableSubshape(crs->mutable_shape(), {i});
+    if (output_types[i] != BF16) {
+      output_elements[i] = computation_->AddInstruction(
+          HloInstruction::CreateGetTupleElement(*subshape, crs, i));
+      continue;
+    }
+    subshape->set_element_type(F32);
+    auto gte = computation_->AddInstruction(
+        HloInstruction::CreateGetTupleElement(*subshape, crs, i));
+    output_elements[i] =
+        computation_->AddInstruction(HloInstruction::CreateConvert(
+            ShapeUtil::ChangeElementType(*subshape, BF16), gte));
+  }
+  auto tuple = computation_->AddInstruction(
+      HloInstruction::CreateTuple(output_elements));
+
+  std::vector<HloInstruction*> materialized_users = crs->users();
+  // Use the crs' shape temporarily, in order to pass checks in
+  // ReplaceUseWith.
+  *tuple->mutable_shape() = crs->shape();
+  for (auto* user : materialized_users) {
+    TF_RETURN_IF_ERROR(crs->ReplaceUseWith(user, tuple));
+  }
+  *tuple->mutable_shape() = original_shape;
+  return Status::OK();
+}
+
+Status BFloat16NormalizationVisitor::HandleInstruction(HloInstruction* hlo) {
+  std::vector<int64> bf16_operands;
+  std::vector<int64> f32_operands;
+  bool has_f32 = false;
+  bool has_bf16 = false;
+
+  for (int64 i = 0; i < hlo->operand_count(); ++i) {
+    if (hlo->operand(i)->shape().element_type() == F32) {
+      f32_operands.push_back(i);
+      has_f32 = true;
+    } else if (hlo->operand(i)->shape().element_type() == BF16) {
+      bf16_operands.push_back(i);
+      has_bf16 = true;
+    }
+  }
+
+  if (hlo->shape().element_type() == F32) {
+    has_f32 = true;
+  } else if (hlo->shape().element_type() == BF16) {
+    has_bf16 = true;
+  }
+
+  std::vector<HloComputation*> bf16_called_comps;
+  for (auto* comp : hlo->called_computations()) {
+    bool comp_has_bf16 = false;
+    if (comp->root_instruction()->shape().element_type() == F32) {
+      has_f32 = true;
+    } else if (comp->root_instruction()->shape().element_type() == BF16) {
+      has_bf16 = true;
+      comp_has_bf16 = true;
+    }
+    for (auto* param : comp->parameter_instructions()) {
+      if (param->shape().element_type() == F32) {
+        has_f32 = true;
+      } else if (param->shape().element_type() == BF16) {
+        has_bf16 = true;
+        comp_has_bf16 = true;
+      }
+    }
+    if (comp_has_bf16) {
+      bf16_called_comps.push_back(comp);
+    }
+  }
+
+  if (!bfloat16_support_->SupportsMixedPrecisions(*hlo) && has_bf16 &&
+      has_f32) {
+    // Resolve unsupported mixed precision.
+    //
+    // See if we can change everything to BF16.
+    if (hlo->called_computations().empty() &&
+        hlo->shape().element_type() == BF16) {
+      bool can_use_bf16 = true;
+      for (int i : f32_operands) {
+        if (bfloat16_support_->EffectiveOperandPrecisionIsOutputPrecision(*hlo,
+                                                                          i) &&
+            bfloat16_support_->SupportsBF16Operand(*hlo, i)) {
+          continue;
+        }
+        can_use_bf16 = false;
+        break;
+      }
+      if (can_use_bf16) {
+        for (int i : f32_operands) {
+          TF_RETURN_IF_ERROR(
+              InsertConvertBeforeOperand(hlo, i, BF16, computation_));
+        }
+        return Status::OK();
+      }
+    }
+    if (hlo->shape().element_type() == BF16) {
+      TF_RETURN_IF_ERROR(
+          ChangeOutputTypeThenInsertConvertBack(hlo, F32, computation_));
+    }
+    for (int i : bf16_operands) {
+      TF_RETURN_IF_ERROR(InsertConvertBeforeOperand(hlo, i, F32, computation_));
+    }
+    return ConvertCalledComputations(hlo, bf16_called_comps);
+  }
+
+  for (int i : bf16_operands) {
+    if (!bfloat16_support_->SupportsBF16Operand(*hlo, i)) {
+      TF_RETURN_IF_ERROR(InsertConvertBeforeOperand(hlo, i, F32, computation_));
+    }
+  }
+
+  if (hlo->shape().element_type() == BF16 &&
+      !bfloat16_support_->SupportsBF16Output(*hlo)) {
+    TF_RETURN_IF_ERROR(
+        ChangeOutputTypeThenInsertConvertBack(hlo, F32, computation_));
+  }
+
+  return Status::OK();
+}
+
+Status BFloat16NormalizationVisitor::DefaultAction(HloInstruction* hlo) {
+  // Do not change instructions related to entry and exit of a computation,
+  // tuples, fusion, convert, and control flow.
+  if (hlo->opcode() == HloOpcode::kTuple ||            //
+      hlo->opcode() == HloOpcode::kGetTupleElement ||  //
+      hlo->opcode() == HloOpcode::kInfeed ||           //
+      hlo->opcode() == HloOpcode::kOutfeed ||          //
+      hlo->opcode() == HloOpcode::kConstant ||         //
+      hlo->opcode() == HloOpcode::kParameter ||        //
+      hlo->opcode() == HloOpcode::kFusion ||           //
+      hlo->opcode() == HloOpcode::kConvert ||          //
+      hlo->opcode() == HloOpcode::kCall ||             //
+      hlo->opcode() == HloOpcode::kCustomCall ||       //
+      hlo->opcode() == HloOpcode::kWhile ||            //
+      hlo->opcode() == HloOpcode::kConditional) {
+    return Status::OK();
+  }
+  return HandleInstruction(hlo);
+}
+
+StatusOr<bool> BFloat16Normalization::Run(HloModule* module) {
+  XLA_VLOG_LINES(
+      2, "BFloat16Normalization::Run(), before:\n" + module->ToString());
+  bool changed = false;
+  for (auto* comp : module->MakeComputationPostOrder()) {
+    if (BFloat16NormalizationVisitor::Run(comp, bfloat16_support_)) {
+      changed = true;
+    }
+  }
+  XLA_VLOG_LINES(2,
+                 "BFloat16Normalization::Run(), after:\n" + module->ToString());
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization.h b/tensorflow/compiler/xla/service/bfloat16_normalization.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a60fe0af3218484acb95e6c69815d551350764c
--- /dev/null
+++ b/tensorflow/compiler/xla/service/bfloat16_normalization.h
@@ -0,0 +1,92 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_BFLOAT16_NORMALIZATION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_BFLOAT16_NORMALIZATION_H_
+
+#include "tensorflow/compiler/xla/service/bfloat16_support.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// A pass which adds F32 <-> BF16 conversions for HLO instructions that do not
+// support BF16 input/output or mixed precision, according to the passed-in
+// backend-specific BF16 support rules.
+class BFloat16Normalization : public HloPassInterface {
+ public:
+  explicit BFloat16Normalization(const BFloat16Support* bfloat16_support)
+      : bfloat16_support_(bfloat16_support) {}
+
+  ~BFloat16Normalization() override = default;
+  tensorflow::StringPiece name() const override { return "bf16-normalization"; }
+
+  // Run BF16 normalization on the given computation. Returns whether the
+  // computation was changed.
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  const BFloat16Support* bfloat16_support_;
+};
+
+// A pass that unconditionally removes the mixed F32/BF16 uses in HLO
+// instructions (excluding convert) by adding F32 <-> BF16 conversions. Unlike
+// BFloat16Normalization, this pass does not use a backend-specific
+// BFloat16Support, and does not change HLOs that have BF16 data if they do not
+// use mixed precision; it removes mixed precision even if the backend supports
+// it. This pass is used to make the HLO module valid for other HLO passes which
+// do not support mixed precision.
+class BFloat16MixedPrecisionRemoval : public HloPassInterface {
+ public:
+  BFloat16MixedPrecisionRemoval() {}
+
+  ~BFloat16MixedPrecisionRemoval() override = default;
+
+  tensorflow::StringPiece name() const override {
+    return "bf16-mixed-precision-removal";
+  }
+
+  // Run mixed precision removal on the given computation. Returns whether the
+  // computation was changed.
+  StatusOr<bool> Run(HloModule* module) override {
+    BFloat16Normalization normalization(&no_mixed_precision_support_);
+    return normalization.Run(module);
+  }
+
+ private:
+  class BFloat16SupportForMixedPrecisionRemoval : public BFloat16Support {
+   public:
+    BFloat16SupportForMixedPrecisionRemoval() {}
+
+    ~BFloat16SupportForMixedPrecisionRemoval() override = default;
+
+    bool SupportsBF16Operand(const HloInstruction& hlo,
+                             int64 operand_index) const override {
+      return true;
+    }
+
+    bool SupportsBF16Output(const HloInstruction& hlo) const override {
+      return true;
+    }
+
+    bool SupportsMixedPrecisions(const HloInstruction& hlo) const override {
+      return false;
+    }
+  } no_mixed_precision_support_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_BFLOAT16_NORMALIZATION_H_
diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..66c3085842c4afe7ffc4d5891883e4cce9389d45
--- /dev/null
+++ b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
@@ -0,0 +1,248 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/bfloat16_normalization.h"
+#include "tensorflow/compiler/xla/service/bfloat16_support.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+
+class TestBFloat16Support : public BFloat16Support {
+ public:
+  TestBFloat16Support() {}
+  ~TestBFloat16Support() override {}
+
+  bool SupportsBF16Operand(const HloInstruction& hlo,
+                           int64 operand_index) const override {
+    if (hlo.opcode() == HloOpcode::kAdd ||
+        hlo.opcode() == HloOpcode::kSubtract ||
+        hlo.opcode() == HloOpcode::kReduce ||
+        hlo.opcode() == HloOpcode::kTuple ||
+        hlo.opcode() == HloOpcode::kGetTupleElement) {
+      return true;
+    }
+    return false;
+  }
+
+  bool SupportsBF16Output(const HloInstruction& hlo) const override {
+    if (hlo.opcode() == HloOpcode::kAdd || hlo.opcode() == HloOpcode::kReduce ||
+        hlo.opcode() == HloOpcode::kSubtract ||
+        hlo.opcode() == HloOpcode::kTuple ||
+        hlo.opcode() == HloOpcode::kGetTupleElement) {
+      return true;
+    }
+    return false;
+  }
+
+  bool SupportsMixedPrecisions(const HloInstruction& hlo) const override {
+    if (hlo.opcode() == HloOpcode::kAdd || hlo.opcode() == HloOpcode::kTuple ||
+        hlo.opcode() == HloOpcode::kGetTupleElement) {
+      return true;
+    }
+    return false;
+  }
+};
+
+class BFloat16NormalizationTest : public HloTestBase {
+ protected:
+  bool Normalize(HloModule* module) {
+    TestBFloat16Support bfloat16_support_;
+    BFloat16Normalization normalization(&bfloat16_support_);
+    StatusOr<bool> result = normalization.Run(module);
+    EXPECT_IS_OK(result.status());
+    return result.ValueOrDie();
+  }
+};
+
+TEST_F(BFloat16NormalizationTest, NoopIfSupported) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape f32_shape = ShapeUtil::MakeShape(F32, {2, 4});
+  Shape bf16_shape = ShapeUtil::MakeShape(BF16, {2, 4});
+
+  HloInstruction* a = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32_shape, "a"));
+  HloInstruction* b = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, bf16_shape, "b"));
+  HloInstruction* c = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, f32_shape, "c"));
+
+  HloInstruction* add0 = builder.AddInstruction(
+      HloInstruction::CreateBinary(bf16_shape, HloOpcode::kAdd, a, b));
+
+  HloInstruction* add1 = builder.AddInstruction(
+      HloInstruction::CreateBinary(f32_shape, HloOpcode::kAdd, add0, c));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_FALSE(Normalize(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), add1);
+  EXPECT_EQ(add0->shape().element_type(), BF16);
+  EXPECT_EQ(add1->shape().element_type(), F32);
+}
+
+TEST_F(BFloat16NormalizationTest, ResolveIfUnsupportedBF16) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape f32_shape = ShapeUtil::MakeShape(F32, {2, 4});
+  Shape bf16_shape = ShapeUtil::MakeShape(BF16, {2, 4});
+
+  HloInstruction* a = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32_shape, "a"));
+  HloInstruction* b = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, bf16_shape, "b"));
+  HloInstruction* c = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, f32_shape, "c"));
+
+  HloInstruction* mul0 = builder.AddInstruction(
+      HloInstruction::CreateBinary(bf16_shape, HloOpcode::kMultiply, a, b));
+
+  HloInstruction* mul1 = builder.AddInstruction(
+      HloInstruction::CreateBinary(bf16_shape, HloOpcode::kMultiply, mul0, c));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(Normalize(module.get()));
+
+  EXPECT_EQ(computation->root_instruction()->opcode(), HloOpcode::kConvert);
+  EXPECT_EQ(computation->root_instruction()->operand(0), mul1);
+  EXPECT_EQ(mul0->shape().element_type(), F32);
+  EXPECT_EQ(mul1->shape().element_type(), F32);
+  EXPECT_EQ(mul1->operand(0)->opcode(), HloOpcode::kConvert);
+}
+
+TEST_F(BFloat16NormalizationTest, ResolveUnsupportedMixedPrecisionSubtraction) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape f32_shape = ShapeUtil::MakeShape(F32, {2, 4});
+  Shape bf16_shape = ShapeUtil::MakeShape(BF16, {2, 4});
+
+  HloInstruction* a = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32_shape, "a"));
+  HloInstruction* b = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, bf16_shape, "b"));
+  HloInstruction* c = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, f32_shape, "c"));
+
+  HloInstruction* sub0 = builder.AddInstruction(
+      HloInstruction::CreateBinary(bf16_shape, HloOpcode::kSubtract, a, b));
+
+  HloInstruction* sub1 = builder.AddInstruction(
+      HloInstruction::CreateBinary(bf16_shape, HloOpcode::kSubtract, sub0, c));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(Normalize(module.get()));
+
+  EXPECT_EQ(computation->root_instruction()->opcode(), HloOpcode::kConvert);
+  EXPECT_EQ(computation->root_instruction()->operand(0), sub1);
+  EXPECT_EQ(sub0->shape().element_type(), F32);
+  EXPECT_EQ(sub1->shape().element_type(), F32);
+  EXPECT_EQ(sub1->operand(0)->opcode(), HloOpcode::kConvert);
+}
+
+TEST_F(BFloat16NormalizationTest, ResolveUnsupportedMixedPrecisionReduce) {
+  Shape f32_input_shape = ShapeUtil::MakeShape(F32, {2, 4});
+  Shape f32_output_shape = ShapeUtil::MakeShape(F32, {4});
+
+  Shape bf16_scalar_shape = ShapeUtil::MakeShape(BF16, {2, 4});
+
+  auto reduce_comp_builder = HloComputation::Builder("reduce_comp");
+  auto reduce_comp_param0 = reduce_comp_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, bf16_scalar_shape, "param0"));
+  auto reduce_comp_param1 = reduce_comp_builder.AddInstruction(
+      HloInstruction::CreateParameter(1, bf16_scalar_shape, "param1"));
+  reduce_comp_builder.AddInstruction(
+      HloInstruction::CreateBinary(bf16_scalar_shape, HloOpcode::kAdd,
+                                   reduce_comp_param0, reduce_comp_param1));
+
+  auto module = CreateNewModule();
+  auto reduce_computation =
+      module->AddEmbeddedComputation(reduce_comp_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* input = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32_input_shape, "a"));
+  HloInstruction* init = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, bf16_scalar_shape, "init"));
+  HloInstruction* reduce = builder.AddInstruction(HloInstruction::CreateReduce(
+      f32_output_shape, input, init, {0}, reduce_computation));
+
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(Normalize(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), reduce);
+  EXPECT_EQ(reduce->called_computations().size(), 1);
+  EXPECT_EQ(reduce->called_computations()[0]->num_parameters(), 2);
+  EXPECT_EQ(reduce->called_computations()[0]
+                ->parameter_instruction(0)
+                ->shape()
+                .element_type(),
+            F32);
+  EXPECT_EQ(reduce->called_computations()[0]
+                ->parameter_instruction(1)
+                ->shape()
+                .element_type(),
+            F32);
+  EXPECT_EQ(reduce->called_computations()[0]
+                ->root_instruction()
+                ->shape()
+                .element_type(),
+            F32);
+  EXPECT_EQ(reduce->shape().element_type(), F32);
+  EXPECT_EQ(reduce->operand(0), input);
+  EXPECT_EQ(input->shape().element_type(), F32);
+  EXPECT_EQ(reduce->operand(1)->opcode(), HloOpcode::kConvert);
+  EXPECT_EQ(reduce->operand(1)->shape().element_type(), F32);
+}
+
+TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleCrossReplicaSum) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape f32_shape = ShapeUtil::MakeShape(F32, {2, 4});
+  Shape bf16_shape = ShapeUtil::MakeShape(BF16, {2, 4});
+
+  HloInstruction* a = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32_shape, "a"));
+  HloInstruction* b = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, bf16_shape, "b"));
+
+  HloInstruction* crs =
+      builder.AddInstruction(HloInstruction::CreateCrossReplicaSum(
+          ShapeUtil::MakeTupleShape({f32_shape, bf16_shape}), {a, b}));
+  HloInstruction* gte = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(bf16_shape, crs, 1));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(Normalize(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), gte);
+  EXPECT_EQ(gte->shape().element_type(), BF16);
+  EXPECT_EQ(crs->operand(1)->shape().element_type(), F32);
+  EXPECT_EQ(ShapeUtil::GetSubshape(crs->shape(), {1}).element_type(), F32);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/bfloat16_support.cc b/tensorflow/compiler/xla/service/bfloat16_support.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3fd9e24601f27633c8063e4574c7c4f91f30dcff
--- /dev/null
+++ b/tensorflow/compiler/xla/service/bfloat16_support.cc
@@ -0,0 +1,111 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/bfloat16_support.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+
+namespace xla {
+
+bool BFloat16Support::SupportsBF16Operand(const HloInstruction& hlo,
+                                          int64 operand_index) const {
+  switch (hlo.opcode()) {
+    case HloOpcode::kCall:
+    case HloOpcode::kConditional:
+    case HloOpcode::kCustomCall:
+    case HloOpcode::kGetTupleElement:
+    case HloOpcode::kTuple:
+    case HloOpcode::kWhile:
+      return true;
+    case HloOpcode::kConvert:
+      CHECK_EQ(operand_index, 0);
+      return hlo.operand(0)->shape().element_type() == BF16;
+    default:
+      break;
+  }
+  return false;
+}
+
+bool BFloat16Support::SupportsBF16Output(const HloInstruction& hlo) const {
+  switch (hlo.opcode()) {
+    case HloOpcode::kCall:
+    case HloOpcode::kConditional:
+    case HloOpcode::kCustomCall:
+    case HloOpcode::kGetTupleElement:
+    case HloOpcode::kTuple:
+    case HloOpcode::kWhile:
+      return true;
+    case HloOpcode::kConvert:
+      return hlo.shape().element_type() == BF16;
+    default:
+      break;
+  }
+  return false;
+}
+
+bool BFloat16Support::SupportsMixedPrecisions(const HloInstruction& hlo) const {
+  switch (hlo.opcode()) {
+    case HloOpcode::kCall:
+    case HloOpcode::kConditional:
+    case HloOpcode::kConvert:
+    case HloOpcode::kCustomCall:
+    case HloOpcode::kGetTupleElement:
+    case HloOpcode::kTuple:
+    case HloOpcode::kWhile:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/* static */
+bool BFloat16Support::EffectiveOperandPrecisionIsOutputPrecision(
+    const HloInstruction& hlo, int64 operand_index) {
+  switch (hlo.opcode()) {
+    case HloOpcode::kAbs:
+    case HloOpcode::kBroadcast:
+    case HloOpcode::kClamp:
+    case HloOpcode::kConcatenate:
+    case HloOpcode::kCopy:
+    case HloOpcode::kGetTupleElement:
+    case HloOpcode::kMaximum:
+    case HloOpcode::kMinimum:
+    case HloOpcode::kPad:
+    case HloOpcode::kReshape:
+    case HloOpcode::kReverse:
+    case HloOpcode::kSlice:
+    case HloOpcode::kSort:
+    case HloOpcode::kTranspose:
+    case HloOpcode::kTuple:
+      return true;
+    case HloOpcode::kDynamicSlice:
+      return operand_index == 0;
+    case HloOpcode::kDynamicUpdateSlice:
+      return operand_index == 0 || operand_index == 1;
+    case HloOpcode::kSelect:
+      return operand_index == 1 || operand_index == 2;
+    default:
+      break;
+  }
+  return false;
+}
+
+bool BFloat16Support::EffectiveOperandPrecisionIsBF16(
+    const HloInstruction& hlo, int64 operand_index) const {
+  return false;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/bfloat16_support.h b/tensorflow/compiler/xla/service/bfloat16_support.h
new file mode 100644
index 0000000000000000000000000000000000000000..29f662d22b4e5486662a1387407d41e0fd2ed1b3
--- /dev/null
+++ b/tensorflow/compiler/xla/service/bfloat16_support.h
@@ -0,0 +1,60 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_BFLOAT16_SUPPORT_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_BFLOAT16_SUPPORT_H_
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+
+namespace xla {
+
+class BFloat16Support {
+ public:
+  BFloat16Support() {}
+  virtual ~BFloat16Support() {}
+
+  // Returns whether the backend supports BF16 operand for the HLO instruction
+  // at the given index.
+  virtual bool SupportsBF16Operand(const HloInstruction& hlo,
+                                   int64 operand_index) const;
+
+  // Returns whether the backend supports BF16 output for the HLO instruction.
+  virtual bool SupportsBF16Output(const HloInstruction& hlo) const;
+
+  // Returns whether the backend support mixed precision: the operands, output,
+  // and parameters/output of the called computations can have different
+  // precisions (BF16 and F32).
+  virtual bool SupportsMixedPrecisions(const HloInstruction& hlo) const;
+
+  // Returns whether the given HLO inherits its BF16 operand precision at the
+  // given index, so even if the output is F32, elements in the output that
+  // depend on the BF16 operand will still have BF16 effective precision even if
+  // they have F32 format. Similarly, this also means if the output is BF16 then
+  // increasing the operand precision from BF16 to F32 will not change the
+  // output. This typically includes HLOs that pass elements from the operand to
+  // the output without arithmetic operations.
+  static bool EffectiveOperandPrecisionIsOutputPrecision(
+      const HloInstruction& hlo, int64 operand_index);
+
+  // Returns if the backend only uses BF16 precision for the operand at the
+  // specified index, even if the operand is F32.
+  virtual bool EffectiveOperandPrecisionIsBF16(const HloInstruction& hlo,
+                                               int64 operand_index) const;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_BFLOAT16_SUPPORT_H_
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index 19a9ff04def5fc3d0b3739bbcf546a74114759a6..b1e693da9d5af4babe619b8796007f2da318f6a8 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -45,6 +45,8 @@ using ::tensorflow::gtl::FlatMap;
 using ::tensorflow::gtl::FlatSet;
 using ::tensorflow::strings::Appendf;
 using ::tensorflow::strings::HumanReadableNumBytes;
+using ::tensorflow::strings::Printf;
+using ::tensorflow::strings::StrAppend;
 
 size_t BufferAllocation::Slice::Hasher::operator()(Slice s) const {
   uint64 h = std::hash<int64>()(s.index());
@@ -73,9 +75,10 @@ void BufferAllocation::AddAssignment(const LogicalBuffer& buffer, int64 offset,
   CHECK_LE(offset, size_) << "LogicalBuffer " << buffer
                           << " offset out of range";
   CHECK_LE(offset + size, size_)
-      << "LogicalBuffer " << buffer << " size out of range";
+      << "LogicalBuffer " << buffer
+      << " size out of range at offset: " << offset << " with size: " << size;
   CHECK_EQ(buffer.color(), color())
-      << "Buffer color " << buffer.color()
+      << "Buffer color " << buffer.color() << " for buffer " << buffer
       << " does not match allocation color " << color() << ".";
   OffsetSize offset_size;
   offset_size.offset = offset;
@@ -92,6 +95,9 @@ BufferAllocationProto BufferAllocation::ToProto() const {
   proto.set_color(color_.value());
   if (is_entry_computation_parameter_) {
     proto.set_is_entry_computation_parameter(true);
+    for (int64 idx : param_shape_index()) {
+      proto.add_parameter_shape_index(idx);
+    }
     proto.set_parameter_number(parameter_number_);
   }
   proto.set_maybe_live_out(maybe_live_out_);
@@ -111,25 +117,24 @@ BufferAllocationProto BufferAllocation::ToProto() const {
 
 string BufferAllocation::ToString() const {
   string output;
-  tensorflow::strings::StrAppend(
-      &output, tensorflow::strings::Printf("allocation %lld: %p, size %lld",
-                                           index_, this, size()));
+  Appendf(&output, "allocation %lld: %p, size %lld", index_, this, size());
   if (color().value() != 0) {
-    tensorflow::strings::StrAppend(&output, ", color ", color().value());
+    StrAppend(&output, ", color ", color().value());
   }
   if (is_entry_computation_parameter()) {
-    tensorflow::strings::StrAppend(&output, ", parameter ", parameter_number());
+    StrAppend(&output, ", parameter ", parameter_number(), " at ShapeIndex ",
+              param_shape_index().ToString());
   }
   if (is_thread_local()) {
-    tensorflow::strings::StrAppend(&output, ", thread-local");
+    StrAppend(&output, ", thread-local");
   }
   if (maybe_live_out()) {
-    tensorflow::strings::StrAppend(&output, ", maybe-live-out");
+    StrAppend(&output, ", maybe-live-out");
   }
   if (IsPreallocatedTempBuffer()) {
-    tensorflow::strings::StrAppend(&output, ", preallocated-temp");
+    StrAppend(&output, ", preallocated-temp");
   }
-  tensorflow::strings::StrAppend(&output, ":\n");
+  StrAppend(&output, ":\n");
   // Dump the assigned buffers ordered by id.
   std::vector<const LogicalBuffer*> sorted_buffers;
   for (const auto& buffer_offset_size : assigned_buffers_) {
@@ -141,12 +146,11 @@ string BufferAllocation::ToString() const {
             });
   for (const LogicalBuffer* buffer : sorted_buffers) {
     const OffsetSize& offset_size = FindOrDie(assigned_buffers_, buffer);
-    tensorflow::strings::StrAppend(
-        &output,
-        tensorflow::strings::Printf(
-            "  %s [%lld,%lld]: %s\n", buffer->ToString().c_str(),
-            offset_size.offset, offset_size.size,
-            ShapeUtil::HumanStringWithLayout(buffer->shape()).c_str()));
+    StrAppend(&output,
+              tensorflow::strings::Printf(
+                  "  %s [%lld,%lld]: %s\n", buffer->ToString().c_str(),
+                  offset_size.offset, offset_size.size,
+                  ShapeUtil::HumanStringWithLayout(buffer->shape()).c_str()));
   }
   return output;
 }
@@ -581,6 +585,7 @@ Status GatherComputationsByAllocationType(
            instruction->called_computations()) {
         switch (instruction->opcode()) {
           case HloOpcode::kCall:
+          case HloOpcode::kConditional:
           case HloOpcode::kWhile:
             // Call and while must be called from a computation with global
             // allocations as they may return references to buffers inside the
@@ -838,20 +843,19 @@ Status BufferAssigner::AssignBuffersForComputation(
                                     /*is_thread_local=*/false,
                                     /*is_reusable=*/false);
       allocation->set_entry_computation_parameter(
-          instruction->parameter_number());
+          instruction->parameter_number(), buffer->index());
       VLOG(3) << "New allocation #" << allocation->index()
               << " for entry computation parameter: " << *buffer;
       continue;
     }
 
-    if (is_thread_local || instruction->opcode() == HloOpcode::kCustomCall) {
-      // Custom call operations never have reusable buffers. Also we do not
-      // reuse thread-local buffers for now, because they are dynamically
-      // allocated and their lifetimes are hard to compute.
+    if (is_thread_local) {
+      // We do not reuse thread-local buffers for now, because they are
+      // dynamically allocated and their lifetimes are hard to compute.
       BufferAllocation* allocation = assignment->NewAllocation(
           *buffer, buffer_size, is_thread_local, /*is_reusable=*/false);
       VLOG(3) << "New allocation #" << allocation->index()
-              << " for thread-local/CustomCall: " << *buffer;
+              << " for thread-local: " << *buffer;
       continue;
     }
 
@@ -976,8 +980,8 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
   const HloOrdering& hlo_ordering = assignment->liveness().hlo_ordering();
   if (run_whole_module_heap_simulation) {
     // Run the heap simulation over the whole module. This reduces memory usage,
-    // since buffers for kCall and kWhile sub-computations are only live for the
-    // duration of their calling instructions.
+    // since buffers for kCall, kWhile, and kConditional sub-computations are
+    // only live for the duration of their calling instructions.
     VLOG(1) << "Running whole-module heap simulation";
     SequentialHloOrdering::HloModuleSequence module_sequence;
     FlatSet<const LogicalBuffer*> all_buffers_to_assign;
@@ -996,14 +1000,15 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
       auto color = single_colored_set.first;
       VLOG(2) << "Simulating heap for color " << color;
       int64 alignment = assignment->color_alignment_(color);
+      HeapSimulator::Options options;
+      options.buffers_to_assign = &single_colored_set.second;
       TF_ASSIGN_OR_RETURN(
           const HeapSimulator::Result result,
           HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
                                  MakeUnique<LazyBestFitHeap>(alignment)),
                              assignment->module(), module_sequence,
                              assignment->points_to_analysis(),
-                             assignment->buffer_size_,
-                             &single_colored_set.second));
+                             assignment->buffer_size_, options));
       AssignBuffersFromHeapSimulator(result, assignment,
                                      single_colored_set.first);
     }
@@ -1023,14 +1028,15 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
         auto color = single_colored_set.first;
         VLOG(2) << "Simulating heap for color " << color;
         int64 alignment = assignment->color_alignment_(color);
+        HeapSimulator::Options options;
+        options.buffers_to_assign = &single_colored_set.second;
         TF_ASSIGN_OR_RETURN(
             const HeapSimulator::Result result,
             HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
                                    MakeUnique<LazyBestFitHeap>(alignment)),
                                *computation, *instruction_sequence,
                                assignment->points_to_analysis(),
-                               assignment->buffer_size_,
-                               &single_colored_set.second));
+                               assignment->buffer_size_, options));
         AssignBuffersFromHeapSimulator(result, assignment,
                                        single_colored_set.first);
       }
@@ -1119,140 +1125,6 @@ void BufferAssigner::AddSetToColocatedBufferSets(
   }
 }
 
-// Conceptually the same as AddSetToColocatedBufferSets, but specific to the
-// colocated buffers for while instructions. 'colocated_set' contains the
-// buffers for a single while instruction that must be colocated. The idea here
-// is to apply a memory-saving heuristic for separate while instructions whose
-// buffers are disjoint in liveness, by using the colocation mechanism to force
-// buffer sharing. This often reduces memory for multi-layer RNNs.
-//
-// TODO(b/32491382): We should be able to remove this heuristic after we
-// implement module-level liveness analysis, which would let us directly detect
-// buffer sharing opportunities between the while instruction buffer and the
-// buffers from the predicate and body computation, as well as sharing across
-// different while instructions.
-void BufferAssigner::AddWhileSetToColocatedBufferSets(
-    const std::vector<const LogicalBuffer*>& colocated_set,
-    const LogicalBuffer* while_init_buffer,
-    const LogicalBuffer* while_result_buffer, const HloInstruction* while_hlo,
-    const HloComputation& computation, const BufferLiveness& buffer_liveness,
-    const LogicalBuffer::SizeFunction& buffer_size,
-    std::vector<ColocatedBufferSet>* colocated_buffer_sets) {
-  CHECK(!colocated_set.empty());
-  const TuplePointsToAnalysis& points_to_analysis =
-      buffer_liveness.points_to_analysis();
-
-  // Parallel while loops cannot safely share colocated buffer sets.
-  if (buffer_liveness.hlo_ordering().SequentialOrder(computation) == nullptr) {
-    AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
-    return;
-  }
-
-  // Scan 'colocated_buffer_sets' in reverse order for locality; colocated sets
-  // are added in postorder over computations and instructions.
-  const int64 init_buffer_size = buffer_size(*while_init_buffer);
-  const bool is_live_out = buffer_liveness.MaybeLiveOut(*while_result_buffer);
-  for (int i = colocated_buffer_sets->size() - 1; i >= 0; --i) {
-    const ColocatedBufferSet& predecessor_set = (*colocated_buffer_sets)[i];
-
-    // Skip predecessor sets not associated with while loops.
-    if (std::all_of(predecessor_set.begin(), predecessor_set.end(),
-                    [](const LogicalBuffer* buffer) {
-                      return buffer->instruction()->opcode() !=
-                             HloOpcode::kWhile;
-                    })) {
-      continue;
-    }
-
-    // Skip predecessor sets already associated with 'while_hlo'.
-    if (std::any_of(predecessor_set.begin(), predecessor_set.end(),
-                    [&while_hlo](const LogicalBuffer* buffer) {
-                      return buffer->instruction() == while_hlo;
-                    })) {
-      continue;
-    }
-
-    // Skip predecessor sets with entry parameter if the while result is live
-    // out.
-    if (is_live_out &&
-        std::any_of(predecessor_set.begin(), predecessor_set.end(),
-                    [](const LogicalBuffer* buffer) {
-                      auto* instruction = buffer->instruction();
-                      auto* computation = instruction->parent();
-                      auto* module = computation->parent();
-                      return instruction->opcode() == HloOpcode::kParameter &&
-                             computation == module->entry_computation();
-                    })) {
-      continue;
-    }
-
-    // Build vector of predecessor while result and init buffers, which are
-    // checked for liveness interference below. We must check both the result
-    // and init buffers because they're aliased together, but
-    // TuplePointsToAnalysis is unaware of this aliasing.
-    std::vector<const LogicalBuffer*> predecessor_while_buffers;
-    for (const LogicalBuffer* buffer : predecessor_set) {
-      const HloInstruction* instruction = buffer->instruction();
-      if (instruction->opcode() == HloOpcode::kWhile &&
-          buffer_size(*buffer) == init_buffer_size &&
-          instruction->parent() == &computation) {
-        predecessor_while_buffers.push_back(buffer);
-        // Add the init buffer at the same index, which must also exist in the
-        // predecessor set, and must be unambiguous.
-        const PointsToSet& init_points_to =
-            points_to_analysis.GetPointsToSet(instruction->operand(0));
-        const auto& init_buffers = init_points_to.element(buffer->index());
-        CHECK_EQ(init_buffers.size(), 1);
-        CHECK_GT(predecessor_set.count(init_buffers[0]), 0);
-        predecessor_while_buffers.push_back(init_buffers[0]);
-      }
-    }
-    if (predecessor_while_buffers.empty()) {
-      continue;
-    }
-
-    // Skip predecessor set if the live range of any predecessor
-    // buffers overlaps with 'while_init_buffer' or
-    // 'while_result_buffer' (we need to check both since they're
-    // aliased together, but the points-to analysis is unaware of this
-    // aliasing). Note that tuple element buffer forwarding can cause
-    // the same buffer to appear on both sides of the interference
-    // comparison below.
-    auto may_interfere_with_init_or_result = [&](const LogicalBuffer* buffer) {
-      if (while_init_buffer->id() != buffer->id() &&
-          buffer_liveness.MayInterfere(*while_init_buffer, *buffer)) {
-        return true;
-      }
-
-      if (while_result_buffer->id() != buffer->id() &&
-          buffer_liveness.MayInterfere(*while_result_buffer, *buffer)) {
-        return true;
-      }
-
-      return false;
-    };
-
-    if (std::any_of(predecessor_while_buffers.begin(),
-                    predecessor_while_buffers.end(),
-                    may_interfere_with_init_or_result)) {
-      continue;
-    }
-
-    // All our checks have passed; merge 'predecessor_set' with 'colocated_set',
-    // and add the merged set to 'colocated_buffer_sets'. This forces the
-    // colocation of buffers across different while instructions.
-    FlatSet<const LogicalBuffer*> unique;
-    unique.insert(predecessor_set.begin(), predecessor_set.end());
-    unique.insert(colocated_set.begin(), colocated_set.end());
-    std::vector<const LogicalBuffer*> merged_set(unique.begin(), unique.end());
-    AddSetToColocatedBufferSets(merged_set, colocated_buffer_sets);
-    return;
-  }
-
-  // Failed to merge into predecessor set; add 'colocated_set' as-is.
-  AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
-}
-
 namespace {
 
 // Checks that points-to set of 'instruction' is unambiguous and distinct
@@ -1269,10 +1141,133 @@ const LogicalBuffer* AddBufferToColocatedSet(
   return colocated_set->back();
 }
 
+// Given the interference map of a graph (the list of interfering node indices
+// for each node), perform graph coloring such that interfering nodes are
+// assigned to different colors. Returns the assigned color of the nodes, where
+// the colors are represented as integer values [0, color_count).
+std::vector<int64> ColorInterferenceGraph(
+    const std::vector<std::vector<int64>>& interference_map) {
+  const int64 node_count = interference_map.size();
+
+  // Sort the nodes such that we assign nodes with more interference first. This
+  // relies on the common heuristic of assigning the most constrained node
+  // first, but it would be good to investigate other ordering heuristics too.
+  std::vector<int64> nodes(node_count);
+  std::iota(nodes.begin(), nodes.end(), 0);
+  std::sort(nodes.begin(), nodes.end(),
+            [&interference_map](const int64 i, const int64 j) {
+              return interference_map[i].size() > interference_map[j].size();
+            });
+
+  const int64 kColorUnassigned = -1;
+  std::vector<int64> assigned_colors(node_count, kColorUnassigned);
+  for (int64 node : nodes) {
+    // Mark the colors that are already assigned to the neighbors.
+    std::vector<bool> available_colors(node_count, true);
+    for (int64 neighbor : interference_map[node]) {
+      int64 color = assigned_colors[neighbor];
+      if (color != kColorUnassigned) {
+        available_colors[color] = false;
+      }
+    }
+
+    // Find the color that is not yet assigned to the neighbors.
+    int64 color = kColorUnassigned;
+    for (color = 0; color < available_colors.size(); ++color) {
+      if (available_colors[color]) {
+        break;
+      }
+    }
+    CHECK_NE(color, kColorUnassigned);
+    assigned_colors[node] = color;
+  }
+  return assigned_colors;
+}
+
 }  // namespace
 
+std::vector<BufferAssigner::ColocatedBufferSet>
+BufferAssigner::MergeColocatedBufferSets(
+    const std::vector<ColocatedBufferSet>& colocated_buffer_sets,
+    const BufferLiveness& buffer_liveness,
+    const LogicalBuffer::SizeFunction& buffer_size) {
+  VLOG(1) << "colocation sets count before coalescing:"
+          << colocated_buffer_sets.size();
+
+  // Returns true if the given buffer is for the entry parameter.
+  auto is_entry_parameter = [](const LogicalBuffer& buffer) {
+    auto* instruction = buffer.instruction();
+    auto* computation = instruction->parent();
+    auto* module = computation->parent();
+    return instruction->opcode() == HloOpcode::kParameter &&
+           computation == module->entry_computation();
+  };
+
+  // Returns true if the two colocated buffer sets (specified by their indices
+  // into the colocated_buffer_sets) can be merged into a single set.
+  auto cannot_merge_buffer_sets = [&colocated_buffer_sets, &buffer_liveness,
+                                   &buffer_size,
+                                   &is_entry_parameter](int64 i, int64 j) {
+    for (auto& buffer_a : colocated_buffer_sets[i]) {
+      for (auto& buffer_b : colocated_buffer_sets[j]) {
+        // Do not merge if the set includes live outs or entry parameters.
+        if ((buffer_liveness.MaybeLiveOut(*buffer_a) &&
+             is_entry_parameter(*buffer_b)) ||
+            (buffer_liveness.MaybeLiveOut(*buffer_b) &&
+             is_entry_parameter(*buffer_a))) {
+          return true;
+        }
+        // Do not merge if the buffers interfere with each other.
+        if (buffer_a->id() != buffer_b->id() &&
+            buffer_liveness.MayInterfere(*buffer_a, *buffer_b)) {
+          return true;
+        }
+        // Do not merge if the buffer sizes are different.
+        if (buffer_size(*buffer_a) != buffer_size(*buffer_b)) {
+          return true;
+        }
+      }
+    }
+    return false;
+  };
+
+  // Build the interference map among the colocated buffer sets (nodes), by
+  // adding an edge between any two nodes that cannot be merged into a single
+  // colocated buffer set.
+  std::vector<std::vector<int64>> interference_map(
+      colocated_buffer_sets.size());
+  for (int64 i = 0; i < colocated_buffer_sets.size(); ++i) {
+    for (int64 j = i + 1; j < colocated_buffer_sets.size(); ++j) {
+      if (cannot_merge_buffer_sets(i, j)) {
+        interference_map[i].push_back(j);
+        interference_map[j].push_back(i);
+      }
+    }
+  }
+
+  // Assign a color to each colocation set in colocated_buffer_sets, such that
+  // the sets that can be merged are assigned with the same color.
+  auto assigned_colors = ColorInterferenceGraph(interference_map);
+
+  // Merge the buffer sets with the same color.
+  CHECK(!assigned_colors.empty());
+  int64 num_sets =
+      *std::max_element(assigned_colors.begin(), assigned_colors.end()) + 1;
+  std::vector<ColocatedBufferSet> new_colocated_buffer_sets(num_sets);
+  for (int64 i = 0; i < colocated_buffer_sets.size(); ++i) {
+    const auto& buffer_set = colocated_buffer_sets[i];
+    new_colocated_buffer_sets[assigned_colors[i]].insert(buffer_set.begin(),
+                                                         buffer_set.end());
+  }
+
+  VLOG(1) << "colocation sets count after coalescing:"
+          << colocated_buffer_sets.size();
+  return new_colocated_buffer_sets;
+}
+
 // Builds sets of buffers in 'colocated_buffer_sets' which should be colocated
-// in the same allocation (currently just supports kWhile and kCall).
+// in the same allocation (currently just supports kWhile, kCall, and
+// kConditional).
 void BufferAssigner::BuildColocatedBufferSets(
     const HloModule* module, const BufferLiveness& buffer_liveness,
     const LogicalBuffer::SizeFunction& buffer_size,
@@ -1295,12 +1290,11 @@ void BufferAssigner::BuildColocatedBufferSets(
                 const Shape& /*subshape*/, const ShapeIndex& index) {
               std::vector<const LogicalBuffer*> colocated_set;
               // Add while.init.
-              auto* init_buffer =
-                  AddBufferToColocatedSet(while_hlo->operand(0), index,
-                                          points_to_analysis, &colocated_set);
+              AddBufferToColocatedSet(while_hlo->operand(0), index,
+                                      points_to_analysis, &colocated_set);
               // Add while.result.
-              auto* result_buffer = AddBufferToColocatedSet(
-                  while_hlo, index, points_to_analysis, &colocated_set);
+              AddBufferToColocatedSet(while_hlo, index, points_to_analysis,
+                                      &colocated_set);
               // Add while.cond.parameter.
               AddBufferToColocatedSet(
                   while_hlo->while_condition()->parameter_instruction(0), index,
@@ -1313,10 +1307,7 @@ void BufferAssigner::BuildColocatedBufferSets(
               AddBufferToColocatedSet(
                   while_hlo->while_body()->root_instruction(), index,
                   points_to_analysis, &colocated_set);
-              AddWhileSetToColocatedBufferSets(
-                  colocated_set, init_buffer, result_buffer, while_hlo,
-                  *computation, buffer_liveness, buffer_size,
-                  colocated_buffer_sets);
+              AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
             });
       } else if (opcode == HloOpcode::kCall) {
         const HloInstruction* call_hlo = instruction;
@@ -1336,9 +1327,82 @@ void BufferAssigner::BuildColocatedBufferSets(
                                       &colocated_set);
               AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
             });
+      } else if (opcode == HloOpcode::kConditional) {
+        const HloInstruction* conditional_hlo = instruction;
+        ShapeUtil::ForEachSubshape(
+            conditional_hlo->shape(),
+            [this, conditional_hlo, &points_to_analysis, colocated_buffer_sets](
+                const Shape& /*subshape*/, const ShapeIndex& index) {
+              std::vector<const LogicalBuffer*> colocated_set;
+              // Add conditional.result.
+              AddBufferToColocatedSet(conditional_hlo, index,
+                                      points_to_analysis, &colocated_set);
+              // Add conditional.true_computation.root.
+              AddBufferToColocatedSet(
+                  conditional_hlo->true_computation()->root_instruction(),
+                  index, points_to_analysis, &colocated_set);
+              // Add conditional.false_computation.root.
+              AddBufferToColocatedSet(
+                  conditional_hlo->false_computation()->root_instruction(),
+                  index, points_to_analysis, &colocated_set);
+              AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
+            });
+
+        // Add true_operand and conditional.true_computation.parameter(0) as a
+        // colocated buffer set. Note that this has to be done for each subshape
+        // in the true_operand of the conditional.
+        ShapeUtil::ForEachSubshape(
+            conditional_hlo->operand(1)->shape(),
+            [this, conditional_hlo, &points_to_analysis, colocated_buffer_sets](
+                const Shape& /*subshape*/, const ShapeIndex& index) {
+              std::vector<const LogicalBuffer*> true_set;
+              // Add conditional.true_operand.
+              AddBufferToColocatedSet(conditional_hlo->operand(1), index,
+                                      points_to_analysis, &true_set);
+              // Add conditional.true_computation.parameter_instruction(0).
+              AddBufferToColocatedSet(
+                  conditional_hlo->true_computation()->parameter_instruction(0),
+                  index, points_to_analysis, &true_set);
+              AddSetToColocatedBufferSets(true_set, colocated_buffer_sets);
+            });
+
+        // Add false_operand and conditional.false_computation.parameter(0) as a
+        // colocated buffer set. Note that this has to be done for each subshape
+        // in the false_operand of the conditional.
+        ShapeUtil::ForEachSubshape(
+            conditional_hlo->operand(2)->shape(),
+            [this, conditional_hlo, &points_to_analysis, colocated_buffer_sets](
+                const Shape& /*subshape*/, const ShapeIndex& index) {
+              std::vector<const LogicalBuffer*> false_set;
+              // Add conditional.false_operand.
+              AddBufferToColocatedSet(conditional_hlo->operand(2), index,
+                                      points_to_analysis, &false_set);
+              // Add conditional.false_computation.parameter_instruction(0).
+              AddBufferToColocatedSet(
+                  conditional_hlo->false_computation()->parameter_instruction(
+                      0),
+                  index, points_to_analysis, &false_set);
+              AddSetToColocatedBufferSets(false_set, colocated_buffer_sets);
+            });
       }
     }
   }
+
+  if (colocated_buffer_sets->empty()) {
+    return;
+  }
+
+  // Try to find more coalescing opportunities among the colocated buffer sets.
+  //
+  // TODO(b/32491382): We should be able to remove this by using the
+  // module-level liveness analysis, which would let us directly detect buffer
+  // sharing opportunities between the while instruction buffer and the buffers
+  // from the predicate and body computation, as well as sharing across
+  // different while instructions.
+  std::vector<ColocatedBufferSet> new_colocated_buffer_sets =
+      MergeColocatedBufferSets(*colocated_buffer_sets, buffer_liveness,
+                               buffer_size);
+  std::swap(*colocated_buffer_sets, new_colocated_buffer_sets);
 }
 
 // Assigns all colocated buffer sets in 'colocated_buffer_sets' to the same
@@ -1350,39 +1414,47 @@ void BufferAssigner::AssignColocatedBufferSets(
     FlatSet<BufferAllocation::Index>* colocated_allocations) {
   for (const ColocatedBufferSet& colocated_buffer_set : colocated_buffer_sets) {
     BufferAllocation* allocation = nullptr;
-    // Set 'entry_parameter_number' if entry param in 'colocated_buffer_set'.
+    // Set 'entry_parameter_number' and 'entry_parameter_shape_idx' if entry
+    // param in 'colocated_buffer_set'.
     int64 entry_parameter_number = -1;
+    const ShapeIndex* entry_parameter_shape_idx = nullptr;
     for (const LogicalBuffer* buffer : colocated_buffer_set) {
       const HloInstruction* instruction = buffer->instruction();
       const HloComputation* computation = instruction->parent();
       if (instruction->opcode() == HloOpcode::kParameter &&
           computation == computation->parent()->entry_computation()) {
         entry_parameter_number = instruction->parameter_number();
+        entry_parameter_shape_idx = &buffer->index();
         break;
       }
     }
 
     for (const LogicalBuffer* buffer : colocated_buffer_set) {
+      const int64 buffer_size = assignment->buffer_size_(*buffer);
       if (allocation == nullptr) {
         // TODO(b/32491382) Avoid current trivial solution of using new
         // allocations for each colocated buffer set. When liveness has
         // module-level scope, we can allow buffers to be shared across
         // computations (in some cases).
-        allocation = assignment->NewAllocation(
-            *buffer, assignment->buffer_size_(*buffer),
-            /*is_thread_local=*/false, /*is_reusable=*/true);
+        allocation = assignment->NewAllocation(*buffer, buffer_size,
+                                               /*is_thread_local=*/false,
+                                               /*is_reusable=*/true);
         if (entry_parameter_number >= 0) {
           // This colocated buffer set contains an entry parameter and other
           // logical buffers which use the parameter as read-only in a while
           // body computation (which updates in place).
           // Set 'entry_computation_parameter' to indicate that it contains
           // an entry parameter, and to prevent reuse in MaybeAssignBuffer.
-          allocation->set_entry_computation_parameter(entry_parameter_number);
+          allocation->set_entry_computation_parameter(
+              entry_parameter_number, *entry_parameter_shape_idx);
         }
         colocated_allocations->insert(allocation->index());
       } else {
+        CHECK_EQ(buffer_size, allocation->size())
+            << "Buffer: " << *buffer << " size mismatch in colocated buffer "
+            << "allocation: " << *allocation;
         assignment->AddAssignment(allocation, *buffer, /*offset=*/0,
-                                  assignment->buffer_size_(*buffer));
+                                  buffer_size);
       }
       colocated_buffers->insert(buffer);
     }
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index 08a40bfeb2a2a78c25805308e73154c6cc667f21..6b7fd0014d103ef0617afcc5cb3f663554a01aa4 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -91,6 +91,13 @@ class BufferAllocation {
     return parameter_number_;
   }
 
+  // If this allocation is for a parameter of the entry computation, this
+  // function returns which subshape of the parameter the allocation is for.
+  const ShapeIndex& param_shape_index() const {
+    CHECK(is_entry_computation_parameter_);
+    return param_shape_index_;
+  }
+
   // Returns whether this allocation is assigned a LogicalBuffer which may
   // be live out of the entry computation.
   bool maybe_live_out() const { return maybe_live_out_; }
@@ -203,9 +210,11 @@ class BufferAllocation {
   // Adds a LogicalBuffer to the set assigned to this buffer.
   void AddAssignment(const LogicalBuffer& buffer, int64 offset, int64 size);
 
-  void set_entry_computation_parameter(int64 parameter_number) {
+  void set_entry_computation_parameter(int64 parameter_number,
+                                       ShapeIndex param_shape_index) {
     is_entry_computation_parameter_ = true;
     parameter_number_ = parameter_number;
+    param_shape_index_ = std::move(param_shape_index);
   }
   void set_maybe_live_out(bool value) { maybe_live_out_ = value; }
   void set_index(Index index) { index_ = index; }
@@ -235,6 +244,10 @@ class BufferAllocation {
   // indicates the index (starting from 0) of the parameter.
   int64 parameter_number_ = 0;
 
+  // If this buffer is for an entry computation parameter, which subshape of the
+  // parameter is it for?
+  ShapeIndex param_shape_index_;
+
   // Whether the allocation contains a LogicalBuffer which may be live-out of
   // the entry computation. Note that this flag is conservatively computed by
   // TuplePointsToAnalysis.  That is, an allocation marked `maybe_live_out_`
@@ -528,15 +541,13 @@ class BufferAssigner {
       const std::vector<const LogicalBuffer*>& colocated_set,
       std::vector<ColocatedBufferSet>* colocated_buffer_sets);
 
-  // Conceptually the same as AddSetToColocatedBufferSets, but specific to the
-  // colocated buffers for while instructions.
-  void AddWhileSetToColocatedBufferSets(
-      const std::vector<const LogicalBuffer*>& colocated_set,
-      const LogicalBuffer* while_init_buffer,
-      const LogicalBuffer* while_result_buffer, const HloInstruction* while_hlo,
-      const HloComputation& computation, const BufferLiveness& buffer_liveness,
-      const LogicalBuffer::SizeFunction& buffer_size,
-      std::vector<ColocatedBufferSet>* colocated_buffer_sets);
+  // Given a list of colocated buffer sets (each colocated buffer set represents
+  // the logical buffers that would be assigned to the same physical buffer),
+  // try to merge the sets if the buffers can be shared. Returns the merged set.
+  std::vector<ColocatedBufferSet> MergeColocatedBufferSets(
+      const std::vector<ColocatedBufferSet>& colocated_buffer_sets,
+      const BufferLiveness& buffer_liveness,
+      const LogicalBuffer::SizeFunction& buffer_size);
 
   // Split a set of buffers into several sets, each of which contains buffers
   // colored with the same color.
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index 8fba8ef5e5c799eaac429017f4a0ff6a0315ba7c..cd73654b8f666c4b96c000235cc3ad2cd0a46c17 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -166,6 +166,15 @@ class BufferAssignmentTest : public HloTestBase {
     return builder.Build();
   }
 
+  std::unique_ptr<HloComputation> BuildR0F32UnaryOpComputation(
+      HloOpcode opcode, const string& name) {
+    auto builder = HloComputation::Builder(name);
+    auto param =
+        builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "x"));
+    builder.AddInstruction(HloInstruction::CreateUnary(r0f32_, opcode, param));
+    return builder.Build();
+  }
+
   // Verifies that the given instruction hlo has a valid input buffer assigned,
   // i.e., the parameter number matches the op's.
   const BufferAllocation& GetAssignedInputAllocation(
@@ -605,7 +614,7 @@ TEST_F(BufferAssignmentTest, TrivialMap) {
   BufferAllocation map_buffer = GetAssignedOutputAllocation(*buffers, map);
   EXPECT_NE(param0_buffer.index(), map_buffer.index());
 
-  // The final computation node of the map is an add of an f32 parm and a
+  // The final computation node of the map is an add of an f32 param and a
   // constant.
   EXPECT_EQ(HloOpcode::kAdd, inner_last->opcode());
   const BufferAllocation& inner_add_buffer =
@@ -740,6 +749,56 @@ TEST_F(BufferAssignmentTest, ExampleWhile) {
             << " instructions; total buffer size " << size0 + sizec + sizeb;
 }
 
+TEST_F(BufferAssignmentTest, ExampleConditional) {
+  auto module = CreateNewModule();
+  auto true_computation = module->AddEmbeddedComputation(
+      BuildR0F32UnaryOpComputation(HloOpcode::kCeil, "Ceil"));
+  auto false_computation = module->AddEmbeddedComputation(
+      BuildR0F32UnaryOpComputation(HloOpcode::kFloor, "Floor"));
+
+  auto builder = HloComputation::Builder(TestName());
+  auto pred = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  auto const1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(56.4f)));
+  auto const2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(12.4f)));
+  auto conditional = builder.AddInstruction(HloInstruction::CreateConditional(
+      r0f32_, pred, const1, true_computation, const2, false_computation));
+  module->AddEntryComputation(builder.Build());
+
+  const std::vector<const HloInstruction*> conditional_instrs =
+      GetInstructions(conditional);
+  const std::vector<const HloInstruction*> true_instrs =
+      GetInstructions(true_computation->root_instruction());
+  const std::vector<const HloInstruction*> false_instrs =
+      GetInstructions(false_computation->root_instruction());
+  EXPECT_EQ(4, conditional_instrs.size());
+  EXPECT_EQ(2, true_instrs.size());
+  EXPECT_EQ(2, false_instrs.size());
+
+  auto buffers = RunBufferAssignment(module.get());
+  ValidateBuffers(conditional_instrs, *buffers);
+  ValidateBuffers(true_instrs, *buffers);
+  ValidateBuffers(false_instrs, *buffers);
+
+  EXPECT_FALSE(BuffersDistinct(conditional_instrs, true_instrs, *buffers))
+      << "Should be reuse between conditional and true computation.";
+  EXPECT_FALSE(BuffersDistinct(conditional_instrs, false_instrs, *buffers))
+      << "Should be reuse between conditional and false computation.";
+  EXPECT_FALSE(BuffersDistinct(true_instrs, false_instrs, *buffers))
+      << "Should be reuse between true and false computations.";
+
+  const BufferAllocation& conditional_buffer =
+      GetTopLevelAllocation(*buffers, conditional);
+  const BufferAllocation& true_buffer =
+      GetTopLevelAllocation(*buffers, true_computation->root_instruction());
+  const BufferAllocation& false_buffer =
+      GetTopLevelAllocation(*buffers, false_computation->root_instruction());
+  EXPECT_EQ(conditional_buffer.size(), true_buffer.size());
+  EXPECT_EQ(conditional_buffer.size(), false_buffer.size());
+}
+
 TEST_F(BufferAssignmentTest, UnaryOpReuseChain) {
   // param0[100] ---> (exp) ---> (tanh) ---> (exp) ---> (neg)
   auto builder = HloComputation::Builder(TestName());
@@ -1360,10 +1419,13 @@ TEST_F(BufferAssignmentTest, OneTempAllocation) {
       HloInstruction::CreateParameter(1, shape_3x4, "param_b"));
   auto param_c = builder.AddInstruction(
       HloInstruction::CreateParameter(2, shape_4x4, "param_c"));
-  auto dot_ab = builder.AddInstruction(HloInstruction::CreateBinary(
-      shape_2x4, HloOpcode::kDot, param_a, param_b));
-  auto dot_bc = builder.AddInstruction(HloInstruction::CreateBinary(
-      shape_3x4, HloOpcode::kDot, param_b, param_c));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto dot_ab = builder.AddInstruction(
+      HloInstruction::CreateDot(shape_2x4, param_a, param_b, dot_dnums));
+  auto dot_bc = builder.AddInstruction(
+      HloInstruction::CreateDot(shape_3x4, param_b, param_c, dot_dnums));
   builder.AddInstruction(
       HloInstruction::CreateConcatenate(shape_5x4, {dot_ab, dot_bc}, 1));
 
@@ -1525,6 +1587,117 @@ TEST_F(WhileBufferAssignmentTest, TwoForwardWhileLoops) {
             assignment->GetUniqueSlice(while1, {1}).ConsumeValueOrDie());
 }
 
+// Tests that the colocated buffers for while instructions are properly assigned
+// during buffer assignment such that the result tuple elements are not assigned
+// to the same buffer.
+//
+// %infeed --> %while.0 --> %while.1 --+
+//                                     +-- %tuple
+//   %zero -->   %add   --> %while.2 --+
+//
+// Execution Order:
+// %infeed -> %while.0 -> %while.1 -> %zero -> %add -> %while.2 -> %tuple
+//
+// The HLO computation used in this test requires specific ordering to expose
+// the bug (b/72496031). During buffer assignment, the visitation order of
+// colocated buffers is %while.2 -> while.0 -> while.1, and the buffer
+// assignment was coalescing the colocated buffers for all 3 while instructions,
+// therefore assigning the same buffer to the two result tuple elements.
+TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) {
+  const Shape r0s32 = ShapeUtil::MakeShape(S32, {});
+
+  // Builds a condition computation: x -> x < 4
+  auto build_cond = [&]() {
+    auto builder = HloComputation::Builder("cond");
+    auto const4 = builder.AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR0<int>(4)));
+    auto param =
+        builder.AddInstruction(HloInstruction::CreateParameter(0, r0s32, "x"));
+    builder.AddInstruction(HloInstruction::CreateBinary(
+        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kLt, param, const4));
+    return builder.Build();
+  };
+
+  // Builds a body computation: x -> x + 9
+  auto build_body = [&]() {
+    auto builder = HloComputation::Builder("body");
+    auto const9 = builder.AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR0<int>(9)));
+    auto param =
+        builder.AddInstruction(HloInstruction::CreateParameter(0, r0s32, "x"));
+    builder.AddInstruction(
+        HloInstruction::CreateBinary(r0s32, HloOpcode::kAdd, param, const9));
+    return builder.Build();
+  };
+
+  // Build the entry computation as described in the comment above.
+  auto module = xla::MakeUnique<HloModule>(TestName());
+  auto builder = HloComputation::Builder("entry");
+
+  auto infeed = builder.AddInstruction(HloInstruction::CreateInfeed(r0s32, ""));
+  auto cond0 = module->AddEmbeddedComputation(build_cond());
+  auto body0 = module->AddEmbeddedComputation(build_body());
+  auto while0 = builder.AddInstruction(
+      HloInstruction::CreateWhile(r0s32, cond0, body0, infeed));
+
+  auto cond1 = module->AddEmbeddedComputation(build_cond());
+  auto body1 = module->AddEmbeddedComputation(build_body());
+  auto while1 = builder.AddInstruction(
+      HloInstruction::CreateWhile(r0s32, cond1, body1, while0));
+
+  auto zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(0)));
+  auto add = builder.AddInstruction(
+      HloInstruction::CreateBinary(r0s32, HloOpcode::kAdd, zero, zero));
+  auto cond2 = module->AddEmbeddedComputation(build_cond());
+  auto body2 = module->AddEmbeddedComputation(build_body());
+  auto while2 = builder.AddInstruction(
+      HloInstruction::CreateWhile(r0s32, cond2, body2, add));
+
+  auto tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({while2, while1}));
+  module->AddEntryComputation(builder.Build());
+
+  // Run CopyInsertion and check if the graph constructed above doesn't need
+  // any copies inserted for BufferAssignment to run.
+  int64 instruction_count = module->instruction_count();
+  CopyInsertion copy_insertion;
+  ASSERT_IS_OK(copy_insertion.Run(module.get()).status());
+  ASSERT_EQ(instruction_count, module->instruction_count());
+
+  // Create a sequential order among all the instructions in the entry
+  // computation, since the issue this test stresses depends on the order the
+  // nodes are traversed during BufferAssignment.
+  SequentialHloOrdering::HloModuleSequence sequence;
+  sequence[module->entry_computation()] = {infeed, while0, while1, zero,
+                                           add,    while2, tuple};
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto assignment,
+      BufferAssigner::Run(
+          module.get(),
+          xla::MakeUnique<SequentialHloOrdering>(module.get(), sequence),
+          backend().compiler()->BufferSizeBytesFunction(),
+          [](LogicalBuffer::Color) { return 1; }));
+
+  // The result tuple elements must be assigned with different buffers.
+  TF_ASSERT_OK_AND_ASSIGN(auto slice0, assignment->GetUniqueSlice(tuple, {0}));
+  TF_ASSERT_OK_AND_ASSIGN(auto slice1, assignment->GetUniqueSlice(tuple, {1}));
+  EXPECT_NE(slice0, slice1);
+
+  // while0 and while1 result buffers must be equal to slice1.
+  TF_ASSERT_OK_AND_ASSIGN(auto slice_while0,
+                          assignment->GetUniqueSlice(while0, {}));
+  TF_ASSERT_OK_AND_ASSIGN(auto slice_while1,
+                          assignment->GetUniqueSlice(while1, {}));
+  EXPECT_EQ(slice1, slice_while0);
+  EXPECT_EQ(slice1, slice_while1);
+
+  // while2 result buffer must be equal to slice0.
+  TF_ASSERT_OK_AND_ASSIGN(auto slice_while2,
+                          assignment->GetUniqueSlice(while2, {}));
+  EXPECT_EQ(slice0, slice_while2);
+}
+
 TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) {
   auto module = xla::MakeUnique<HloModule>(TestName());
   auto builder = HloComputation::Builder("entry");
@@ -1708,9 +1881,8 @@ TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) {
       BufferAssigner::Run(
           module.get(),
           xla::MakeUnique<SequentialHloOrdering>(module.get(), sequence),
-          ByteSizeOf,
-          [](LogicalBuffer::Color) { return 1; })
-      .ConsumeValueOrDie();
+          ByteSizeOf, [](LogicalBuffer::Color) { return 1; })
+          .ConsumeValueOrDie();
 
   EXPECT_TRUE(BuffersDistinct({while0}, {while1}, *assignment));
 }
diff --git a/tensorflow/compiler/xla/service/buffer_liveness.cc b/tensorflow/compiler/xla/service/buffer_liveness.cc
index 513bfa3b7f7b45696093d03c1dd8250c548d260a..37982aaef9eddd64ef6b57ad5a9cf8dd6a565097 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness.cc
@@ -102,8 +102,8 @@ bool BufferLiveness::live_range_strictly_before(const LogicalBuffer& a,
     return false;
   }
 
-  // Every user of 'a' must be a predecessor of 'b' or 'b' itself.
   for (const BufferAlias& alias : points_to_analysis_->GetBufferAliases(a)) {
+    // Every user of 'a' must be a predecessor of 'b' or 'b' itself.
     for (auto user : alias.instruction()->users()) {
       if (DoesNotUseOperandBuffer(alias.instruction(), alias.index(), user,
                                   points_to_analysis())) {
@@ -114,6 +114,17 @@ bool BufferLiveness::live_range_strictly_before(const LogicalBuffer& a,
         return false;
       }
     }
+
+    // If the root instruction aliases the buffer 'a', the live range of 'a' is
+    // until the end of the computation and can never be strictly before another
+    // buffer defined in the same computation. This is needed to prevent the
+    // root instruction's buffers from being reused by later instructions even
+    // when the root is not the last instruction in the schedule.
+    if (alias.instruction()->parent()->root_instruction() ==
+            alias.instruction() &&
+        alias.instruction()->parent() == b.instruction()->parent()) {
+      return false;
+    }
   }
 
   // If 'b' is a user of 'a' then the buffers interfere unless 'a.instruction'
diff --git a/tensorflow/compiler/xla/service/buffer_liveness_test.cc b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
index bbb42d494b8003176d4911bacbe8a10dc5fc7c6a..f623aef67a4f98b447a9a15634a78deb60cfe6f1 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
@@ -167,11 +167,10 @@ TEST_F(BufferLivenessTest, MultipleEntryParameters_Sequential) {
 
   SequentialHloOrdering::HloModuleSequence sequence;
   sequence.insert({entry, {param0, negate, param1, exp, add}});
-  auto liveness = BufferLiveness::Run(
-                      module.get(),
-                      xla::MakeUnique<SequentialHloOrdering>(
-                          module.get(), sequence))
-                      .ConsumeValueOrDie();
+  auto liveness =
+      BufferLiveness::Run(module.get(), xla::MakeUnique<SequentialHloOrdering>(
+                                            module.get(), sequence))
+          .ConsumeValueOrDie();
 
   // Entry parameters interfere as if they are defined simultaneously at
   // the very beginning.
@@ -296,7 +295,7 @@ TEST_F(BufferLivenessTest, OverlappedBuffersSequentialOrder) {
   module_sequence.emplace(computation, order);
   auto liveness =
       BufferLiveness::Run(module.get(), xla::MakeUnique<SequentialHloOrdering>(
-          module.get(), module_sequence))
+                                            module.get(), module_sequence))
           .ConsumeValueOrDie();
 
   EXPECT_TRUE(InstructionsMayInterfere(*liveness, param, negate));
@@ -312,6 +311,48 @@ TEST_F(BufferLivenessTest, OverlappedBuffersSequentialOrder) {
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, add, exp));
 }
 
+TEST_F(BufferLivenessTest, RootInstructionIsNotLastInSequentialOrder) {
+  // Tests that when the root instruction is not the last instruction in the
+  // schedule, the live range of its buffers interfere with the buffers of the
+  // later instructions.
+  //
+  // Two sets of independent instructions are executed in the computation.
+  // param --> add (root)
+  // recv --> recv-done --> send --> send-done
+  //
+  // Sequential order:
+  //  param, add (root), recv, recv-done, send, send-done
+  auto builder = HloComputation::Builder(TestName());
+  auto param =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, vec_, "param"));
+  auto add = builder.AddInstruction(
+      HloInstruction::CreateBinary(vec_, HloOpcode::kAdd, param, param));
+  auto recv = builder.AddInstruction(
+      HloInstruction::CreateRecv(vec_, /*channel_id=*/0));
+  auto recv_done = builder.AddInstruction(HloInstruction::CreateRecvDone(recv));
+  auto send = builder.AddInstruction(
+      HloInstruction::CreateSend(recv_done, /*channel_id=*/1));
+  auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build(add));
+
+  SequentialHloOrdering::HloModuleSequence module_sequence;
+  std::vector<const HloInstruction*> order = {param,     add,  recv,
+                                              recv_done, send, send_done};
+  module_sequence.emplace(computation, order);
+  auto liveness =
+      BufferLiveness::Run(module.get(), xla::MakeUnique<SequentialHloOrdering>(
+                                            module.get(), module_sequence))
+          .ConsumeValueOrDie();
+
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, add));
+  // Check the root instruction (add) buffer interferes with the recv buffer.
+  EXPECT_TRUE(
+      liveness->MayInterfere(GetBuffer(*liveness, add, /*index=*/{}),
+                             GetBuffer(*liveness, recv, /*index=*/{0})));
+}
+
 TEST_F(BufferLivenessTest, TupleLiveOut) {
   // Verify MaybeLiveOut with nested tuples. Result of computation looks like:
   //
@@ -625,9 +666,8 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
 
     // Run BufferLiveness on 'module'.
     auto liveness =
-        BufferLiveness::Run(module.get(),
-                            xla::MakeUnique<DependencyHloOrdering>(
-                                module.get()))
+        BufferLiveness::Run(
+            module.get(), xla::MakeUnique<DependencyHloOrdering>(module.get()))
             .ConsumeValueOrDie();
     // Return whether or not buffers interference is detected between
     // 'tuple_param0' and 'tuple_root' at shape index '{1}'.
@@ -738,9 +778,8 @@ class DynamicUpdateSliceLivenessTest : public BufferLivenessTest {
     module->AddEmbeddedComputation(builder.Build());
     // Run BufferLiveness on 'module'.
     auto liveness =
-        BufferLiveness::Run(module.get(),
-                            xla::MakeUnique<DependencyHloOrdering>(
-                                module.get()))
+        BufferLiveness::Run(
+            module.get(), xla::MakeUnique<DependencyHloOrdering>(module.get()))
             .ConsumeValueOrDie();
     // Return whether or not buffers interference is detected between
     // 'tuple_param0' and 'tuple_root' at shape index '{1}'.
diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc
index 1adecdb939cb2c1259003d3be2c90b5a299b0f30..13eb02ca012f44b2b5ed7c6f5becb7d54b07c33c 100644
--- a/tensorflow/compiler/xla/service/call_graph.cc
+++ b/tensorflow/compiler/xla/service/call_graph.cc
@@ -54,6 +54,7 @@ std::ostream& operator<<(std::ostream& out, const CallContext& context) {
 CallContext GetInstructionCallContext(const HloInstruction* instruction) {
   switch (instruction->opcode()) {
     case HloOpcode::kCall:
+    case HloOpcode::kConditional:
     case HloOpcode::kWhile:
       return CallContext::kSequential;
     case HloOpcode::kMap:
diff --git a/tensorflow/compiler/xla/service/call_graph_test.cc b/tensorflow/compiler/xla/service/call_graph_test.cc
index 0395ea8c8b52315f7ca2221f412750ebadda2dd8..1ea7d538cd515c3098b6a1f03c6146d288330406 100644
--- a/tensorflow/compiler/xla/service/call_graph_test.cc
+++ b/tensorflow/compiler/xla/service/call_graph_test.cc
@@ -34,12 +34,13 @@ using ::testing::UnorderedElementsAre;
 class CallGraphTest : public HloTestBase {
  protected:
   // Build and return a trivial computation taking and returning a scalar.
-  std::unique_ptr<HloComputation> MakeScalarComputation() {
+  std::unique_ptr<HloComputation> MakeScalarComputation(
+      HloOpcode opcode = HloOpcode::kNegate) {
     HloComputation::Builder builder(TestName() + ".ScalarComputation");
     HloInstruction* param0 = builder.AddInstruction(
         HloInstruction::CreateParameter(0, kScalarShape, "param0"));
     builder.AddInstruction(
-        HloInstruction::CreateUnary(kScalarShape, HloOpcode::kNegate, param0));
+        HloInstruction::CreateUnary(kScalarShape, opcode, param0));
     return builder.Build();
   }
 
@@ -236,6 +237,54 @@ TEST_F(CallGraphTest, ContextBothComputations) {
   EXPECT_EQ(CallContext::kBoth, sub_node.context());
 }
 
+TEST_F(CallGraphTest, ComputationWithConditional) {
+  // Test a call graph of a module with a conditional.
+  auto module = CreateNewModule();
+  HloComputation* true_computation =
+      module->AddEmbeddedComputation(MakeScalarComputation(HloOpcode::kCeil));
+  HloComputation* false_computation =
+      module->AddEmbeddedComputation(MakeScalarComputation(HloOpcode::kFloor));
+
+  HloComputation::Builder builder(TestName());
+  HloInstruction* pred = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  HloInstruction* const1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(56.4f)));
+  HloInstruction* const2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(12.6f)));
+  HloInstruction* conditional =
+      builder.AddInstruction(HloInstruction::CreateConditional(
+          kScalarShape, pred, const1, true_computation, const2,
+          false_computation));
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
+
+  EXPECT_EQ(3, call_graph->nodes().size());
+
+  const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  EXPECT_EQ(entry_computation, entry_node.computation());
+  EXPECT_EQ(1, entry_node.callsites().size());
+
+  const CallSite& conditional_callsite = entry_node.callsites()[0];
+  EXPECT_EQ(conditional, conditional_callsite.instruction());
+  EXPECT_THAT(conditional_callsite.called_computations(),
+              UnorderedElementsAre(true_computation, false_computation));
+  EXPECT_EQ(CallContext::kSequential, conditional_callsite.context());
+  EXPECT_EQ(entry_node.GetCallSite(conditional), &conditional_callsite);
+
+  const CallGraphNode& true_node = call_graph->GetNode(true_computation);
+  EXPECT_TRUE(true_node.callees().empty());
+  EXPECT_EQ(1, true_node.callers().size());
+  EXPECT_EQ(entry_computation, true_node.callers()[0]);
+
+  const CallGraphNode& false_node = call_graph->GetNode(false_computation);
+  EXPECT_TRUE(false_node.callees().empty());
+  EXPECT_EQ(1, false_node.callers().size());
+  EXPECT_EQ(entry_computation, false_node.callers()[0]);
+}
+
 TEST_F(CallGraphTest, ComplexGraph) {
   // Test a call graph of a module with several computation called in various
   // contexts. The call graph looks like:
diff --git a/tensorflow/compiler/xla/service/call_inliner.cc b/tensorflow/compiler/xla/service/call_inliner.cc
index 3aa7f5c4d5829ccc0e8df697c1363754128ff436..482ccc5b67109258f544e5657ecfa0e8f62192c0 100644
--- a/tensorflow/compiler/xla/service/call_inliner.cc
+++ b/tensorflow/compiler/xla/service/call_inliner.cc
@@ -82,6 +82,10 @@ class SubcomputationInsertionVisitor : public DfsHloVisitorWithDefault {
     return outer_->ReplaceInstruction(call_, new_root);
   }
 
+  CallInliner::InlinedInstructionMap ConsumeInstructionMap() {
+    return std::move(subcomputation_hlo_to_new_hlo_);
+  }
+
  private:
   // Resolves the callee subcomputation_hlo to the new (inline) HLO in the
   // caller computation, or returns a NotFound error if that subcomputation HLO
@@ -112,13 +116,13 @@ class SubcomputationInsertionVisitor : public DfsHloVisitorWithDefault {
 
   HloInstruction* call_;
   HloComputation* outer_;
-  std::unordered_map<HloInstruction*, HloInstruction*>
-      subcomputation_hlo_to_new_hlo_;
+  CallInliner::InlinedInstructionMap subcomputation_hlo_to_new_hlo_;
 };
 
 }  // namespace
 
-/* static */ Status CallInliner::Inline(HloInstruction* call) {
+/* static */ StatusOr<CallInliner::InlinedInstructionMap> CallInliner::Inline(
+    HloInstruction* call) {
   TF_RET_CHECK(call->opcode() == HloOpcode::kCall)
       << "Instruction was not a call op: " << call->opcode();
   const auto& callees = call->called_computations();
@@ -126,7 +130,8 @@ class SubcomputationInsertionVisitor : public DfsHloVisitorWithDefault {
   HloComputation* callee = callees[0];
   // We visit the callee, cloning its body into its caller.
   SubcomputationInsertionVisitor visitor(call);
-  return callee->Accept(&visitor);
+  TF_RETURN_IF_ERROR(callee->Accept(&visitor));
+  return visitor.ConsumeInstructionMap();
 }
 
 StatusOr<bool> CallInliner::Run(HloModule* module) {
@@ -140,7 +145,7 @@ StatusOr<bool> CallInliner::Run(HloModule* module) {
           VLOG(1) << "Visiting callsite: " << callsite.ToString();
           if (callsite.instruction()->opcode() == HloOpcode::kCall) {
             HloInstruction* call = callsite.instruction();
-            TF_RETURN_IF_ERROR(Inline(call));
+            TF_RETURN_IF_ERROR(Inline(call).status());
             did_mutate = true;
           }
         }
diff --git a/tensorflow/compiler/xla/service/call_inliner.h b/tensorflow/compiler/xla/service/call_inliner.h
index 2dbd38bf1ac90d3efa1453e6af6f791668d5e72a..a8345a394d46c90a48305313dac0bcd9b06938ac 100644
--- a/tensorflow/compiler/xla/service/call_inliner.h
+++ b/tensorflow/compiler/xla/service/call_inliner.h
@@ -27,8 +27,12 @@ namespace xla {
 // called function, and proceed recursively.
 class CallInliner : public HloPassInterface {
  public:
-  // Inlines one call instruction.
-  static Status Inline(HloInstruction* call);
+  using InlinedInstructionMap =
+      std::unordered_map<HloInstruction*, HloInstruction*>;
+
+  // Inlines one call instruction.  Returns a mapping from the original
+  // instructions to their inlined versions.
+  static StatusOr<InlinedInstructionMap> Inline(HloInstruction* call);
 
   ~CallInliner() override = default;
   tensorflow::StringPiece name() const override { return "CallInliner"; }
diff --git a/tensorflow/compiler/xla/service/call_inliner_test.cc b/tensorflow/compiler/xla/service/call_inliner_test.cc
index 865ed993da121d26ceb61123f1822d93814cbb9b..738d00881dd057fc13c115006c15e8f5b6d14a1d 100644
--- a/tensorflow/compiler/xla/service/call_inliner_test.cc
+++ b/tensorflow/compiler/xla/service/call_inliner_test.cc
@@ -135,7 +135,7 @@ TEST_F(CallInlinerTest, InlineWithoutRunningPass) {
       HloInstruction::CreateCall(pred, {}, false_computation));
   auto computation = module->AddEntryComputation(call_false_builder.Build());
 
-  TF_ASSERT_OK(CallInliner::Inline(call));
+  TF_ASSERT_OK(CallInliner::Inline(call).status());
   EXPECT_THAT(computation->root_instruction(), op::Constant());
   EXPECT_THAT(computation->root_instruction()->control_successors(),
               ElementsAre(op::Constant()));
diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc
index 9e96898d9b4215e67c8686d372e4b4e6edd1d88b..dab73596e1639eed62151197048ee8d29570b20a 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.cc
+++ b/tensorflow/compiler/xla/service/compile_only_service.cc
@@ -101,12 +101,13 @@ CompileOnlyService::CompileAheadOfTime(
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<HloModuleConfig> module_config,
         CreateModuleConfig(*program_shape, instance.argument_layouts,
-                           &execution_options));
+                           &execution_options, *user_computation));
 
     TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> hlo_module,
                         computation_tracker_.BuildHloModule(
                             versioned_handle, *module_config,
                             /*include_unreachable_instructions=*/true));
+    TF_RETURN_IF_ERROR(MaybeDumpHloModule(*hlo_module));
     hlo_modules.push_back(std::move(hlo_module));
   }
 
diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h
index fc67330f5cbdbcb0d1a259d284599916a908d1fe..74fd24edf88d44b2dfdc87556b0af43987e69e08 100644
--- a/tensorflow/compiler/xla/service/compiler.h
+++ b/tensorflow/compiler/xla/service/compiler.h
@@ -72,8 +72,18 @@ class AotCompilationOptions {
   // Returns the ID of the platform to which these options apply.
   virtual perftools::gputools::Platform::Id PlatformId() const = 0;
 
+  // Optional allocator that may be used for allocating temp space on the device
+  // during compilation.
+  DeviceMemoryAllocator* device_allocator() const { return device_allocator_; }
+  void set_device_allocator(DeviceMemoryAllocator* device_allocator) {
+    device_allocator_ = device_allocator;
+  }
+
  protected:
   AotCompilationOptions() = default;
+
+ private:
+  DeviceMemoryAllocator* device_allocator_ = nullptr;
 };
 
 // Abstract compiler interface that is subclassed for compilation on a
@@ -99,9 +109,16 @@ class Compiler {
 
   // Runs Hlo passes to optimize the given Hlo module, returns the optimized
   // module.
+  //
+  // If device_allocator is not null, the compiler may use it to allocate temp
+  // space on the device for use during compilation.  For example, the compiler
+  // may allocate buffers on the device and then run variants of a given
+  // algorithm over those buffers, to see which variant is fastest.  Any space
+  // allocated should be deallocated before this function returns.
   virtual StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* executor) = 0;
+      perftools::gputools::StreamExecutor* executor,
+      DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles the HLO module for execution on a device given by the executor,
   // and returns an executable object or an error status. No HLO passes are
@@ -112,21 +129,27 @@ class Compiler {
   // The compiler may optionally specialize to the individual device
   // (not just type of device) indicated by the executor.
   //
+  // device_allocator is optional; see RunHloPasses.
+  //
   // Use the overload below to compile computations that run in parallel.
   virtual StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* executor) = 0;
+      perftools::gputools::StreamExecutor* executor,
+      DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles a set of HLO modules that can run in parallel, potentially
   // communicating data between the modules, and returns a corresponding
   // sequence of executable objects.
   //
+  // device_allocator is optional; see RunHloPasses.
+  //
   // TODO(b/68666782): Remove this method after adding support for multiple
   // modules to RunHloPasses and RunBackends.
   virtual StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::vector<std::unique_ptr<HloModule>> modules,
       std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-          stream_exec) = 0;
+          stream_exec,
+      DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles the HLO module for ahead-of-time execution.  This is intended for
   // use in static compilation.
diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc
index 3278fd5f064902459ded4d9367b5390cf8a63f27..153f062d015e49db11c4c9ae0a2a61e76c020f02 100644
--- a/tensorflow/compiler/xla/service/copy_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_runner.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
@@ -339,7 +340,7 @@ TEST_F(CopyInsertionTest, ElementOfNestedTupleParameter) {
            ShapeUtil::MakeShape(F32, {42})}),
       "param0"));
 
-  // The return value of the computation is the zero-th elemnt of the nested
+  // The return value of the computation is the zero-th element of the nested
   // tuple. This element is itself a tuple.
   auto gte = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
       ShapeUtil::GetSubshape(param->shape(), {0}), param, 0));
@@ -1723,8 +1724,242 @@ void BM_ParallelWhiles(int num_iters, int num_whiles) {
   }
 }
 
+std::unique_ptr<HloComputation> MakeBenchmarkWhileBody(
+    const int num_tuple_inputs) {
+  auto builder = HloComputation::Builder("benchmark_loop_body");
+  const Shape element_shape = ShapeUtil::MakeShape(F32, {});
+  std::vector<Shape> input_shape(num_tuple_inputs, element_shape);
+  const Shape loop_state_shape = ShapeUtil::MakeTupleShape(input_shape);
+  HloInstruction* param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, loop_state_shape, "loop_state"));
+  std::vector<HloInstruction*> gte_nodes(num_tuple_inputs);
+  for (int i = 0; i < num_tuple_inputs; ++i) {
+    gte_nodes[i] = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(element_shape, param, i));
+  }
+  builder.AddInstruction(HloInstruction::CreateTuple(gte_nodes));
+  return builder.Build();
+}
+
+void BM_ManyElementTuple(int num_iters, const int num_tuple_inputs) {
+  tensorflow::testing::StopTiming();
+  HloModuleConfig config;
+  config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+  CopyInsertion copy_insertion;
+  const Shape element_shape = ShapeUtil::MakeShape(F32, {});
+  std::vector<HloInstruction*> tuple_params(num_tuple_inputs);
+  for (int i = 0; i < num_iters; ++i) {
+    auto builder = HloComputation::Builder("BM_ParallelWhiles");
+    HloModule module("BM_ManyElementTuple", VersionedComputationHandle(),
+                     config);
+    for (int j = 0; j < num_tuple_inputs; ++j) {
+      tuple_params[j] = builder.AddInstruction(
+          HloInstruction::CreateParameter(j, element_shape, ""));
+    }
+    HloInstruction* init =
+        builder.AddInstruction(HloInstruction::CreateTuple(tuple_params));
+    HloComputation* condition =
+        module.AddEmbeddedComputation(MakeTrivialCondition(init->shape()));
+    HloComputation* body =
+        module.AddEmbeddedComputation(MakeBenchmarkWhileBody(num_tuple_inputs));
+    HloInstruction* xla_while = builder.AddInstruction(
+        HloInstruction::CreateWhile(init->shape(), condition, body, init));
+    builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+        ShapeUtil::MakeShape(F32, {}), xla_while, 0));
+    module.AddEntryComputation(builder.Build());
+    tensorflow::testing::StartTiming();
+    ASSERT_IS_OK(copy_insertion.Run(&module).status());
+    tensorflow::testing::StopTiming();
+  }
+}
+
 BENCHMARK(BM_SequentialWhiles)->Arg(512)->Arg(1024)->Arg(2048)->Arg(4096);
 BENCHMARK(BM_ParallelWhiles)->Arg(512)->Arg(1024)->Arg(2048)->Arg(4096);
+BENCHMARK(BM_ManyElementTuple)->Arg(1024)->Arg(12288);
+
+TEST_F(CopyInsertionTest, SimpleControlFlowTest) {
+  const string& hlo_string = R"(
+HloModule TestModule
+
+if-body.v5 {
+  constant.3 = s32[] constant(-1)
+  p.1 = (s32[], (s32[], s32[], s32[]), (s32[])) parameter(0)
+  get-tuple-element.18 = (s32[], s32[], s32[]) get-tuple-element(p.1), index=1
+  get-tuple-element.65 = s32[] get-tuple-element(get-tuple-element.18), index=0
+  get-tuple-element.66 = s32[] get-tuple-element(get-tuple-element.18), index=1
+  add.3 = s32[] add(get-tuple-element.65, get-tuple-element.66)
+  tuple.33 = (s32[]) tuple(add.3)
+  ROOT tuple.34 = (s32[], (s32[], s32[], s32[]), (s32[])) tuple(constant.3, get-tuple-element.18, tuple.33)
+}
+
+if-condition.v4 {
+  p.2 = (s32[], (s32[], s32[], s32[]), (s32[])) parameter(0)
+  get-tuple-element.67 = s32[] get-tuple-element(p.2), index=0
+  constant.4 = s32[] constant(0)
+  ROOT equal-to = pred[] equal-to(get-tuple-element.67, constant.4)
+}
+
+_functionalize_body_1__.v28 {
+  arg_tuple.4 = (s32[], s32[], s32[], s32[]) parameter(0)
+  get-tuple-element.68 = s32[] get-tuple-element(arg_tuple.4), index=0
+  constant.7 = s32[] constant(1)
+  add.4 = s32[] add(get-tuple-element.68, constant.7)
+  get-tuple-element.69 = s32[] get-tuple-element(arg_tuple.4), index=1
+  get-tuple-element.70 = s32[] get-tuple-element(arg_tuple.4), index=2
+  less-than-or-equal-to = pred[] less-than-or-equal-to(get-tuple-element.69, get-tuple-element.70)
+  constant.8 = s32[] constant(0)
+  select = s32[] select(less-than-or-equal-to, constant.8, constant.7)
+  get-tuple-element.71 = s32[] get-tuple-element(arg_tuple.4), index=3
+  tuple.35 = (s32[], s32[], s32[]) tuple(get-tuple-element.69, get-tuple-element.71, get-tuple-element.70)
+  tuple.36 = (s32[]) tuple(constant.8)
+  tuple.37 = (s32[], (s32[], s32[], s32[]), (s32[])) tuple(select, tuple.35, tuple.36)
+  while = (s32[], (s32[], s32[], s32[]), (s32[])) while(tuple.37), condition=if-condition.v4, body=if-body.v5
+  get-tuple-element.72 = (s32[]) get-tuple-element(while), index=2
+  get-tuple-element.73 = s32[] get-tuple-element(get-tuple-element.72), index=0
+  ROOT tuple.38 = (s32[], s32[], s32[], s32[]) tuple(add.4, get-tuple-element.69, get-tuple-element.70, get-tuple-element.73)
+}
+
+cond_wrapper.v3.1 {
+  inputs.1 = (s32[], s32[], s32[], s32[]) parameter(0)
+  get-tuple-element.75 = s32[] get-tuple-element(inputs.1), index=0
+  constant.11 = s32[] constant(7)
+  ROOT less-than.2 = pred[] less-than(get-tuple-element.75, constant.11)
+}
+
+_functionalize_body_2__.v25 {
+  arg_tuple.5 = (s32[], s32[], s32[], s32[], s32[]) parameter(0)
+  get-tuple-element.76 = s32[] get-tuple-element(arg_tuple.5), index=0
+  get-tuple-element.77 = s32[] get-tuple-element(arg_tuple.5), index=2
+  get-tuple-element.78 = s32[] get-tuple-element(arg_tuple.5), index=3
+  get-tuple-element.79 = s32[] get-tuple-element(arg_tuple.5), index=4
+  tuple.39 = (s32[], s32[], s32[], s32[]) tuple(get-tuple-element.76, get-tuple-element.77, get-tuple-element.78, get-tuple-element.79)
+  while.2 = (s32[], s32[], s32[], s32[]) while(tuple.39), condition=cond_wrapper.v3.1, body=_functionalize_body_1__.v28
+  get-tuple-element.80 = s32[] get-tuple-element(while.2), index=0
+  get-tuple-element.81 = s32[] get-tuple-element(arg_tuple.5), index=1
+  constant.12 = s32[] constant(1)
+  add.5 = s32[] add(get-tuple-element.81, constant.12)
+  get-tuple-element.82 = s32[] get-tuple-element(while.2), index=3
+  ROOT tuple.40 = (s32[], s32[], s32[], s32[], s32[]) tuple(get-tuple-element.80, add.5, get-tuple-element.77, get-tuple-element.78, get-tuple-element.82)
+}
+
+cond_wrapper.v3.2 {
+  inputs.2 = (s32[], s32[], s32[], s32[], s32[]) parameter(0)
+  get-tuple-element.83 = s32[] get-tuple-element(inputs.2), index=1
+  constant.13 = s32[] constant(5)
+  ROOT less-than.3 = pred[] less-than(get-tuple-element.83, constant.13)
+}
+
+ENTRY TestComputation {
+  arg_tuple.6 = (s32[], s32[], s32[], s32[], s32[]) parameter(0)
+  ROOT while.3 = (s32[], s32[], s32[], s32[], s32[]) while(arg_tuple.6), condition=cond_wrapper.v3.2, body=_functionalize_body_2__.v25
+}
+)";
+  auto module_or_status =
+      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
+  auto module = module_or_status.ConsumeValueOrDie();
+  InsertCopies(module.get());
+}
+
+TEST_F(CopyInsertionTest, ControlFlowTest) {
+  const string& hlo_string = R"(
+HloModule TestModule
+
+if-body.v5 {
+  constant.3 = s32[] constant(-1)
+  p.1 = (s32[], (s32[], s32[], s32[]), (s32[])) parameter(0)
+  get-tuple-element.18 = (s32[], s32[], s32[]) get-tuple-element(p.1), index=1
+  get-tuple-element.65 = s32[] get-tuple-element(get-tuple-element.18), index=0
+  get-tuple-element.66 = s32[] get-tuple-element(get-tuple-element.18), index=1
+  add.3 = s32[] add(get-tuple-element.65, get-tuple-element.66)
+  tuple.33 = (s32[]) tuple(add.3)
+  ROOT tuple.34 = (s32[], (s32[], s32[], s32[]), (s32[])) tuple(constant.3, get-tuple-element.18, tuple.33)
+}
+
+if-condition.v4 {
+  p.2 = (s32[], (s32[], s32[], s32[]), (s32[])) parameter(0)
+  get-tuple-element.67 = s32[] get-tuple-element(p.2), index=0
+  constant.4 = s32[] constant(0)
+  ROOT equal-to = pred[] equal-to(get-tuple-element.67, constant.4)
+}
+
+if-body.v5.1 {
+  constant.5 = s32[] constant(-1)
+  p.3 = (s32[], (s32[], s32[], s32[]), (s32[])) parameter(0)
+  get-tuple-element.68 = (s32[], s32[], s32[]) get-tuple-element(p.3), index=1
+  get-tuple-element.70 = s32[] get-tuple-element(get-tuple-element.68), index=2
+  multiply.1 = s32[] multiply(get-tuple-element.70, get-tuple-element.70)
+  tuple.35 = (s32[]) tuple(multiply.1)
+  ROOT tuple.36 = (s32[], (s32[], s32[], s32[]), (s32[])) tuple(constant.5, get-tuple-element.68, tuple.35)
+}
+
+if-condition.v4.1 {
+  p.4 = (s32[], (s32[], s32[], s32[]), (s32[])) parameter(0)
+  get-tuple-element.71 = s32[] get-tuple-element(p.4), index=0
+  constant.6 = s32[] constant(1)
+  ROOT equal-to.1 = pred[] equal-to(get-tuple-element.71, constant.6)
+}
+
+_functionalize_body_1__.v28 {
+  arg_tuple.4 = (s32[], s32[], s32[], s32[]) parameter(0)
+  get-tuple-element.72 = s32[] get-tuple-element(arg_tuple.4), index=0
+  constant.7 = s32[] constant(1)
+  add.4 = s32[] add(get-tuple-element.72, constant.7)
+  get-tuple-element.73 = s32[] get-tuple-element(arg_tuple.4), index=1
+  get-tuple-element.74 = s32[] get-tuple-element(arg_tuple.4), index=2
+  less-than-or-equal-to = pred[] less-than-or-equal-to(get-tuple-element.73, get-tuple-element.74)
+  constant.8 = s32[] constant(0)
+  select = s32[] select(less-than-or-equal-to, constant.8, constant.7)
+  get-tuple-element.75 = s32[] get-tuple-element(arg_tuple.4), index=3
+  tuple.37 = (s32[], s32[], s32[]) tuple(get-tuple-element.73, get-tuple-element.75, get-tuple-element.74)
+  tuple.38 = (s32[]) tuple(constant.8)
+  tuple.39 = (s32[], (s32[], s32[], s32[]), (s32[])) tuple(select, tuple.37, tuple.38)
+  while = (s32[], (s32[], s32[], s32[]), (s32[])) while(tuple.39), condition=if-condition.v4, body=if-body.v5
+  while.1 = (s32[], (s32[], s32[], s32[]), (s32[])) while(while), condition=if-condition.v4.1, body=if-body.v5.1
+  get-tuple-element.76 = (s32[]) get-tuple-element(while.1), index=2
+  get-tuple-element.77 = s32[] get-tuple-element(get-tuple-element.76), index=0
+  ROOT tuple.40 = (s32[], s32[], s32[], s32[]) tuple(add.4, get-tuple-element.73, get-tuple-element.74, get-tuple-element.77)
+}
+
+cond_wrapper.v3.1 {
+  inputs.1 = (s32[], s32[], s32[], s32[]) parameter(0)
+  get-tuple-element.78 = s32[] get-tuple-element(inputs.1), index=0
+  constant.11 = s32[] constant(7)
+  ROOT less-than.2 = pred[] less-than(get-tuple-element.78, constant.11)
+}
+
+_functionalize_body_2__.v25 {
+  arg_tuple.5 = (s32[], s32[], s32[], s32[], s32[]) parameter(0)
+  get-tuple-element.79 = s32[] get-tuple-element(arg_tuple.5), index=0
+  get-tuple-element.80 = s32[] get-tuple-element(arg_tuple.5), index=2
+  get-tuple-element.81 = s32[] get-tuple-element(arg_tuple.5), index=3
+  get-tuple-element.82 = s32[] get-tuple-element(arg_tuple.5), index=4
+  tuple.41 = (s32[], s32[], s32[], s32[]) tuple(get-tuple-element.79, get-tuple-element.80, get-tuple-element.81, get-tuple-element.82)
+  while.2 = (s32[], s32[], s32[], s32[]) while(tuple.41), condition=cond_wrapper.v3.1, body=_functionalize_body_1__.v28
+  get-tuple-element.83 = s32[] get-tuple-element(while.2), index=0
+  get-tuple-element.84 = s32[] get-tuple-element(arg_tuple.5), index=1
+  constant.12 = s32[] constant(1)
+  add.5 = s32[] add(get-tuple-element.84, constant.12)
+  get-tuple-element.85 = s32[] get-tuple-element(while.2), index=3
+  ROOT tuple.42 = (s32[], s32[], s32[], s32[], s32[]) tuple(get-tuple-element.83, add.5, get-tuple-element.80, get-tuple-element.81, get-tuple-element.85)
+}
+
+cond_wrapper.v3.2 {
+  inputs.2 = (s32[], s32[], s32[], s32[], s32[]) parameter(0)
+  get-tuple-element.86 = s32[] get-tuple-element(inputs.2), index=1
+  constant.13 = s32[] constant(5)
+  ROOT less-than.3 = pred[] less-than(get-tuple-element.86, constant.13)
+}
+
+ENTRY TestComputation {
+  arg_tuple.6 = (s32[], s32[], s32[], s32[], s32[]) parameter(0)
+  ROOT while.3 = (s32[], s32[], s32[], s32[], s32[]) while(arg_tuple.6), condition=cond_wrapper.v3.2, body=_functionalize_body_2__.v25
+}
+)";
+  auto module_or_status =
+      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
+  auto module = module_or_status.ConsumeValueOrDie();
+  InsertCopies(module.get());
+}
 
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index e1eed498f6adfdae9df1dbf183f7c0505afd4ea2..c13a0b1cdf0b5be0b69db98b2b9587f30ca4c304 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -81,14 +81,15 @@ cc_library(
         ":conv_canonicalization",
         ":cpu_copy_insertion",
         ":cpu_executable",
+        ":cpu_hlo_support_checker",
         ":cpu_instruction_fusion",
+        ":cpu_layout_assignment",
         ":cpu_options",
         ":cpu_parallelization_preparation",
         ":disassembler",
         ":dot_op_emitter",
         ":ir_emission_utils",
         ":ir_emitter",
-        ":layout_assignment",
         ":parallel_cpu_executable",
         ":parallel_task_assignment",
         ":simple_orc_jit",
@@ -100,16 +101,18 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:algebraic_simplifier",
-        "//tensorflow/compiler/xla/service:batchnorm_rewriter",
+        "//tensorflow/compiler/xla/service:batchnorm_expander",
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:buffer_liveness",
         "//tensorflow/compiler/xla/service:call_inliner",
+        "//tensorflow/compiler/xla/service:dot_decomposer",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_constant_folding",
         "//tensorflow/compiler/xla/service:hlo_cse",
         "//tensorflow/compiler/xla/service:hlo_dce",
+        "//tensorflow/compiler/xla/service:hlo_element_type_converter",
         "//tensorflow/compiler/xla/service:hlo_ordering",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/compiler/xla/service:hlo_pass_pipeline",
@@ -124,7 +127,9 @@ cc_library(
         "//tensorflow/compiler/xla/service:reshape_mover",
         "//tensorflow/compiler/xla/service:transpose_folding",
         "//tensorflow/compiler/xla/service:tuple_simplifier",
+        "//tensorflow/compiler/xla/service:while_loop_invariant_code_motion",
         "//tensorflow/compiler/xla/service:while_loop_simplifier",
+        "//tensorflow/compiler/xla/service:zero_sized_hlo_elimination",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",  # fixdeps: keep
         "//tensorflow/core:lib",  # fixdeps: keep
         "//tensorflow/core:stream_executor_no_cuda",
@@ -135,8 +140,6 @@ cc_library(
         "@llvm//:core",
         "@llvm//:mc",  # fixdeps: keep
         "@llvm//:object",
-        "@llvm//:powerpc_code_gen",  # fixdeps: keep
-        "@llvm//:powerpc_disassembler",  # fixdeps: keep
         "@llvm//:support",
         "@llvm//:target",  # fixdeps: keep
         "@llvm//:x86_code_gen",  # fixdeps: keep
@@ -147,19 +150,21 @@ cc_library(
 
 cc_library(
     name = "simple_orc_jit",
-    srcs = ["simple_orc_jit.cc"],
+    srcs = [
+        "simple_orc_jit.cc",
+        "windows_compatibility.cc",
+        "windows_compatibility.h",
+    ],
     hdrs = ["simple_orc_jit.h"],
     deps = [
         ":compiler_functor",
         ":cpu_runtime",
-        ":cpu_runtime_avx",
-        ":cpu_runtime_neon",
-        ":cpu_runtime_sse4_1",
         ":custom_call_target_registry",
         ":disassembler",
         ":external_constant_pool",
         ":orc_jit_memory_mapper",
         ":runtime_conv2d",
+        ":runtime_fft",
         ":runtime_fork_join",
         ":runtime_matmul",
         ":runtime_single_threaded_conv2d",
@@ -250,8 +255,11 @@ cc_library(
         ":dot_op_emitter",
         ":external_constant_pool",
         ":ir_emission_utils",
+        ":ir_function",
+        ":parallel_loop_emitter",
         ":shape_partition",
         ":simple_orc_jit",
+        ":target_machine_features",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
@@ -280,6 +288,54 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "target_machine_features",
+    srcs = [
+        "target_machine_features.cc",
+    ],
+    hdrs = ["target_machine_features.h"],
+    deps = [
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/core:lib",
+        "@llvm//:analysis",
+        "@llvm//:target",
+    ],
+)
+
+cc_library(
+    name = "ir_function",
+    srcs = ["ir_function.cc"],
+    hdrs = ["ir_function.h"],
+    deps = [
+        ":ir_emission_utils",
+        ":shape_partition",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/service/cpu:cpu_runtime",
+        "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+        "//tensorflow/core:lib",
+        "@llvm//:core",
+    ],
+)
+
+cc_library(
+    name = "parallel_loop_emitter",
+    srcs = ["parallel_loop_emitter.cc"],
+    hdrs = ["parallel_loop_emitter.h"],
+    deps = [
+        ":ir_emission_utils",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service/llvm_ir:ir_array",
+        "//tensorflow/compiler/xla/service/llvm_ir:llvm_loop",
+        "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+        "//tensorflow/compiler/xla/service/llvm_ir:loop_emitter",
+        "//tensorflow/core:lib",
+        "@llvm//:core",
+    ],
+)
+
 cc_library(
     name = "dot_op_emitter",
     srcs = ["dot_op_emitter.cc"],
@@ -287,6 +343,8 @@ cc_library(
     deps = [
         ":cpu_options",
         ":cpu_runtime",
+        ":target_machine_features",
+        ":vector_support_library",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:types",
@@ -298,7 +356,6 @@ cc_library(
         "//tensorflow/compiler/xla/service/llvm_ir:kernel_support_library",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_loop",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
-        "//tensorflow/compiler/xla/service/llvm_ir:vector_support_library",
         "//tensorflow/core:lib",
         "@llvm//:core",
     ],
@@ -336,7 +393,6 @@ cc_library(
         "@llvm//:mc",
         "@llvm//:mc_disassembler",
         "@llvm//:object",
-        "@llvm//:powerpc_disassembler",  # fixdeps: keep
         "@llvm//:support",
         "@llvm//:target",
         "@llvm//:x86_disassembler",  # fixdeps: keep
@@ -349,9 +405,6 @@ cc_library(
     hdrs = ["compiler_functor.h"],
     deps = [
         ":cpu_runtime",
-        ":cpu_runtime_avx",
-        ":cpu_runtime_neon",
-        ":cpu_runtime_sse4_1",
         ":disassembler",
         ":llvm_ir_runtime",
         "//tensorflow/compiler/xla:statusor",
@@ -371,43 +424,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "cpu_runtime_sse4_1",
-    srcs = ["cpu_runtime_sse4_1.cc"],
-    hdrs = ["cpu_runtime_sse4_1.h"],
-    copts = ["-DEIGEN_AVOID_STL_ARRAY"],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/core:framework_lite",
-        "//third_party/eigen3",
-    ],
-)
-
-cc_library(
-    name = "cpu_runtime_avx",
-    srcs = ["cpu_runtime_avx.cc"],
-    hdrs = ["cpu_runtime_avx.h"],
-    copts = ["-DEIGEN_AVOID_STL_ARRAY"],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/core:framework_lite",
-        "//third_party/eigen3",
-    ],
-)
-
-cc_library(
-    name = "cpu_runtime_neon",
-    srcs = ["cpu_runtime_neon.cc"],
-    hdrs = ["cpu_runtime_neon.h"],
-    # runtime_copts() enables -mfpu=neon
-    copts = ["-DEIGEN_AVOID_STL_ARRAY"] + runtime_copts(),
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/core:framework_lite",
-        "//third_party/eigen3",
-    ],
-)
-
 cc_library(
     name = "cpu_runtime",
     srcs = [
@@ -438,6 +454,7 @@ cc_library(
         "llvm_ir_runtime.h",
     ],
     deps = [
+        ":vector_support_library",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/core:lib",
         "@llvm//:core",
@@ -462,6 +479,24 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "runtime_fft",
+    srcs = [
+        "runtime_fft.cc",
+        "runtime_fft_impl.h",
+    ],
+    hdrs = ["runtime_fft.h"],
+    copts = runtime_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/compiler/xla:executable_run_options",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_lite",
+        "//third_party/eigen3",
+    ],
+)
+
 cc_library(
     name = "runtime_matvec",
     srcs = ["runtime_matvec.cc"],
@@ -615,13 +650,14 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla/service:hlo",
+        "@llvm//:core",
     ],
 )
 
 cc_library(
-    name = "layout_assignment",
-    srcs = ["layout_assignment.cc"],
-    hdrs = ["layout_assignment.h"],
+    name = "cpu_layout_assignment",
+    srcs = ["cpu_layout_assignment.cc"],
+    hdrs = ["cpu_layout_assignment.h"],
     deps = [
         ":dot_op_emitter",
         ":ir_emission_utils",
@@ -633,11 +669,11 @@ cc_library(
 )
 
 tf_cc_test(
-    name = "layout_assignment_test",
+    name = "cpu_layout_assignment_test",
     size = "small",
-    srcs = ["layout_assignment_test.cc"],
+    srcs = ["cpu_layout_assignment_test.cc"],
     deps = [
-        ":layout_assignment",
+        ":cpu_layout_assignment",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:shape_util",
@@ -763,6 +799,21 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "vector_support_library",
+    srcs = ["vector_support_library.cc"],
+    hdrs = ["vector_support_library.h"],
+    deps = [
+        ":target_machine_features",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+        "@llvm//:core",
+        "@llvm//:support",
+    ],
+)
+
 tf_cc_test(
     name = "cpu_copy_insertion_test",
     srcs = ["cpu_copy_insertion_test.cc"],
@@ -783,6 +834,32 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "cpu_hlo_support_checker",
+    srcs = ["cpu_hlo_support_checker.cc"],
+    hdrs = ["cpu_hlo_support_checker.h"],
+    deps = [
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_pass",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "cpu_hlo_support_checker_test",
+    srcs = ["cpu_hlo_support_checker_test.cc"],
+    deps = [
+        ":cpu_hlo_support_checker",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 
 filegroup(
diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
index 04b4a8c5c80eeefdbe10001ba5c462affbc9b21d..ed290fcdf8bb69f1bbad57fa5a0926376bc9405a 100644
--- a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
+++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
@@ -37,9 +37,6 @@ limitations under the License.
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
-#include "tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h"
-#include "tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h"
-#include "tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h"
 #include "tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -50,15 +47,6 @@ limitations under the License.
 namespace xla {
 namespace cpu {
 
-/* static */ CompilerFunctor::VectorIntrinsics
-CompilerFunctor::AllIntrinsics() {
-  VectorIntrinsics intrinsics;
-  intrinsics.sse_intrinsics = true;
-  intrinsics.avx_intrinsics = true;
-  intrinsics.neon_intrinsics = true;
-  return intrinsics;
-}
-
 /* Create filtered versions of the LLVM Pass Managers to filter out some
 of the expensive passes.
 Profiling:
@@ -192,89 +180,28 @@ operator()(llvm::Module& module) const {
       std::move(object_file), std::move(memory_buffer));
 }
 
-namespace {
-// Returns the set of vectorized library functions supported for the target.
-std::vector<llvm::VecDesc> VectorFunctionsForTargetLibraryInfoImpl(
-    llvm::Triple::ArchType arch, llvm::StringRef feature_string,
-    CompilerFunctor::VectorIntrinsics const& available_intrinsics) {
-  std::vector<llvm::VecDesc> vector_functions;
-
-  const llvm::VecDesc four_wide_vector_functions_neon[] = {
-      {"expf", runtime::kExpV4F32NEONSymbolName, 4},
-      {"llvm.exp.f32", runtime::kExpV4F32NEONSymbolName, 4},
-
-      {"logf", runtime::kLogV4F32NEONSymbolName, 4},
-      {"llvm.log.f32", runtime::kLogV4F32NEONSymbolName, 4},
-  };
-
-  const llvm::VecDesc four_wide_vector_functions_sse[] = {
-      {"expf", runtime::kExpV4F32SSESymbolName, 4},
-      {"llvm.exp.f32", runtime::kExpV4F32SSESymbolName, 4},
-
-      {"logf", runtime::kLogV4F32SSESymbolName, 4},
-      {"llvm.log.f32", runtime::kLogV4F32SSESymbolName, 4},
-  };
-
-  const llvm::VecDesc eight_wide_vector_functions_avx[] = {
-      {"expf", runtime::kExpV8F32AVXSymbolName, 8},
-      {"llvm.exp.f32", runtime::kExpV8F32AVXSymbolName, 8},
-
-      {"logf", runtime::kLogV8F32AVXSymbolName, 8},
-      {"llvm.log.f32", runtime::kLogV8F32AVXSymbolName, 8},
-  };
-
-  // These functions are generated by XLA as LLVM IR, so they're always
-  // available.
-  const llvm::VecDesc ir_vector_functions[] = {
+static std::vector<llvm::VecDesc> VectorFunctionsForTargetLibraryInfoImpl() {
+  std::vector<llvm::VecDesc> result = {
       {"tanhf", runtime::kTanhV4F32SymbolName, 4},
       {"llvm.tanh.f32", runtime::kTanhV4F32SymbolName, 4},
 
       {"tanhf", runtime::kTanhV8F32SymbolName, 8},
       {"llvm.tanh.f32", runtime::kTanhV8F32SymbolName, 8},
-  };
 
-  llvm::SmallVector<llvm::StringRef, 32> features;
-  feature_string.split(features, ',', -1, /*KeepEmpty=*/false);
-  auto has_feature = [&features](const llvm::StringRef feature) {
-    return std::find(features.begin(), features.end(), feature) !=
-           features.end();
-  };
+      {"expf", runtime::kExpV4F32SymbolName, 4},
+      {"llvm.exp.f32", runtime::kExpV4F32SymbolName, 4},
 
-  switch (arch) {
-    case llvm::Triple::x86:
-    case llvm::Triple::x86_64: {
-      if (has_feature("+sse4.1") && available_intrinsics.sse_intrinsics) {
-        vector_functions.insert(vector_functions.end(),
-                                std::begin(four_wide_vector_functions_sse),
-                                std::end(four_wide_vector_functions_sse));
-      }
-      if (has_feature("+avx") && available_intrinsics.avx_intrinsics) {
-        vector_functions.insert(vector_functions.end(),
-                                std::begin(eight_wide_vector_functions_avx),
-                                std::end(eight_wide_vector_functions_avx));
-      }
-      break;
-    }
-    case llvm::Triple::arm:
-    case llvm::Triple::aarch64: {
-      if (has_feature("+neon") && available_intrinsics.neon_intrinsics) {
-        vector_functions.insert(vector_functions.end(),
-                                std::begin(four_wide_vector_functions_neon),
-                                std::end(four_wide_vector_functions_neon));
-      }
-      break;
-    }
-    default:
-      break;
-  }
+      {"expf", runtime::kExpV8F32SymbolName, 8},
+      {"llvm.exp.f32", runtime::kExpV8F32SymbolName, 8},
 
-  vector_functions.insert(vector_functions.end(),
-                          std::begin(ir_vector_functions),
-                          std::end(ir_vector_functions));
+      {"logf", runtime::kLogV4F32SymbolName, 4},
+      {"llvm.log.f32", runtime::kLogV4F32SymbolName, 4},
 
-  return vector_functions;
+      {"logf", runtime::kLogV8F32SymbolName, 8},
+      {"llvm.log.f32", runtime::kLogV8F32SymbolName, 8},
+  };
+  return result;
 }
-}  // namespace
 
 void CompilerFunctor::AddTargetInfoPasses(
     llvm::legacy::PassManagerBase* passes) const {
@@ -282,9 +209,7 @@ void CompilerFunctor::AddTargetInfoPasses(
   auto target_library_info_impl =
       MakeUnique<llvm::TargetLibraryInfoImpl>(target_triple);
   target_library_info_impl->addVectorizableFunctions(
-      VectorFunctionsForTargetLibraryInfoImpl(
-          target_triple.getArch(), target_machine_->getTargetFeatureString(),
-          available_intrinsics_));
+      VectorFunctionsForTargetLibraryInfoImpl());
   passes->add(
       new llvm::TargetLibraryInfoWrapperPass(*target_library_info_impl));
   passes->add(createTargetTransformInfoWrapperPass(
diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.h b/tensorflow/compiler/xla/service/cpu/compiler_functor.h
index 8cdd049e7b773bdc455db627ff1749997d621ee4..1a8283a702223a7414c1ffcd99c1ac42c04ac068 100644
--- a/tensorflow/compiler/xla/service/cpu/compiler_functor.h
+++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.h
@@ -31,21 +31,10 @@ namespace cpu {
 // Orc JIT compile layer.
 class CompilerFunctor {
  public:
-  // Describes the set of vector intrinsics available to the generated code.
-  struct VectorIntrinsics {
-    bool sse_intrinsics;
-    bool avx_intrinsics;
-    bool neon_intrinsics;
-  };
-
-  // Returns a VectorIntrinsics where all intrinsics are available.
-  static VectorIntrinsics AllIntrinsics();
-
   explicit CompilerFunctor(
       llvm::TargetMachine* target_machine, const Disassembler* disassembler,
       int opt_level, bool optimize_for_size, bool enable_fast_math,
       bool disable_expensive_passes,
-      const VectorIntrinsics& available_intrinsics,
       LLVMCompiler::ModuleHook pre_optimization_hook = nullptr,
       LLVMCompiler::ModuleHook post_optimization_hook = nullptr)
       : target_machine_(target_machine),
@@ -54,7 +43,6 @@ class CompilerFunctor {
         optimize_for_size_(optimize_for_size),
         enable_fast_math_(enable_fast_math),
         disable_expensive_passes_(disable_expensive_passes),
-        available_intrinsics_(available_intrinsics),
         pre_optimization_hook_(pre_optimization_hook),
         post_optimization_hook_(post_optimization_hook) {}
 
@@ -78,7 +66,6 @@ class CompilerFunctor {
   const bool optimize_for_size_;
   const bool enable_fast_math_;
   const bool disable_expensive_passes_;
-  const VectorIntrinsics available_intrinsics_;
   LLVMCompiler::ModuleHook pre_optimization_hook_;
   LLVMCompiler::ModuleHook post_optimization_hook_;
 };
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index addd7284c593f3dcdd86b1745f9aef7b6a1c30c6..f9cc9651846cca7bd6ab7e9e61590cec4e2400da 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/TargetRegistry.h"
@@ -42,7 +43,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/protobuf_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
-#include "tensorflow/compiler/xla/service/batchnorm_rewriter.h"
+#include "tensorflow/compiler/xla/service/batchnorm_expander.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
@@ -50,24 +51,27 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/conv_canonicalization.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_executable.h"
+#include "tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h"
+#include "tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_options.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h"
 #include "tensorflow/compiler/xla/service/cpu/disassembler.h"
 #include "tensorflow/compiler/xla/service/cpu/dot_op_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emitter.h"
-#include "tensorflow/compiler/xla/service/cpu/layout_assignment.h"
 #include "tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h"
 #include "tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h"
 #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/dot_decomposer.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
 #include "tensorflow/compiler/xla/service/hlo_cse.h"
 #include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/service/hlo_element_type_converter.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
@@ -83,7 +87,9 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
 #include "tensorflow/compiler/xla/service/tuple_simplifier.h"
+#include "tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h"
 #include "tensorflow/compiler/xla/service/while_loop_simplifier.h"
+#include "tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -149,11 +155,6 @@ CpuCompiler::CpuCompiler() {
   LLVMInitializeAArch64TargetMC();
   LLVMInitializeAArch64AsmPrinter();
   LLVMInitializeAArch64Disassembler();
-  LLVMInitializePowerPCTarget();
-  LLVMInitializePowerPCTargetInfo();
-  LLVMInitializePowerPCTargetMC();
-  LLVMInitializePowerPCAsmPrinter();
-  LLVMInitializePowerPCDisassembler();
 }
 
 namespace {
@@ -166,42 +167,16 @@ namespace {
 // first module is compiled.
 std::once_flag llvm_command_line_options_initialized;
 
-void InitializeLLVMCommandLineOptions(const HloModuleConfig& config) {
-  auto options = config.debug_options().xla_backend_extra_options();
-  if (!options.empty()) {
-    std::vector<string> fake_argv_storage;
-    fake_argv_storage.push_back("");
-    for (const auto& it : options) {
-      // Skip options the XLA backend itself consumes.
-      if (!tensorflow::StringPiece(it.first).starts_with("xla_")) {
-        if (it.second.empty()) {
-          fake_argv_storage.push_back(it.first);
-        } else {
-          fake_argv_storage.push_back(it.first + "=" + it.second);
-        }
-      }
-    }
-
-    VLOG(2) << "Passing argv to LLVM:";
-    std::vector<const char*> fake_argv;
-    for (const auto& s : fake_argv_storage) {
-      fake_argv.push_back(s.c_str());
-      VLOG(2) << s;
-    }
-    llvm::cl::ParseCommandLineOptions(fake_argv.size(), &fake_argv[0]);
-  }
-}
-
 // This visitor records which HLO instructions should have profiling information
 // recorded.
 class CollectProfileCandidates : public DfsHloVisitorWithDefault {
  public:
-  static StatusOr<std::unordered_map<const HloInstruction*, size_t>>
+  static StatusOr<std::unordered_map<const HloInstruction*, int64>>
   GetCandidatesForComputation(
       HloComputation* computation,
       const std::unordered_map<const HloInstruction*, int64>&
           assigned_indices) {
-    std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx;
+    std::unordered_map<const HloInstruction*, int64> hlo_to_profile_idx;
     CollectProfileCandidates profile_candidates_for_computation(
         &hlo_to_profile_idx, assigned_indices);
     TF_RETURN_IF_ERROR(
@@ -211,7 +186,7 @@ class CollectProfileCandidates : public DfsHloVisitorWithDefault {
 
  private:
   CollectProfileCandidates(
-      std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx,
+      std::unordered_map<const HloInstruction*, int64>* hlo_to_profile_idx,
       const std::unordered_map<const HloInstruction*, int64>& assigned_indices)
       : hlo_to_profile_idx_(hlo_to_profile_idx),
         assigned_indices_(assigned_indices) {}
@@ -251,7 +226,7 @@ class CollectProfileCandidates : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
-  std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx_;
+  std::unordered_map<const HloInstruction*, int64>* hlo_to_profile_idx_;
   const std::unordered_map<const HloInstruction*, int64>& assigned_indices_;
 };
 }  // namespace
@@ -259,7 +234,8 @@ class CollectProfileCandidates : public DfsHloVisitorWithDefault {
 Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
   // Optimization pipeline.
   HloPassPipeline pipeline("CPU");
-  pipeline.AddInvariantChecker<HloVerifier>(ShapeSizeBytesFunction());
+  pipeline.AddInvariantChecker<HloVerifier>();
+  pipeline.AddPass<CpuHloSupportChecker>();
 
   ReducePrecisionInsertion::AddPasses(
       &pipeline, module->config().debug_options(),
@@ -272,14 +248,14 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
   // TODO(b/65775800): Fix wrong output bug in Call and remove the CallInliner
   // pass.
   pipeline.AddPass<CallInliner>();
-
+  pipeline.AddPass<DotDecomposer>();
   pipeline.AddPass<ConvCanonicalization>();
   {
     auto& pass =
         pipeline.AddPass<HloPassFix<HloPassPipeline>>("simplification");
-    pass.AddInvariantChecker<HloVerifier>(ShapeSizeBytesFunction());
+    pass.AddInvariantChecker<HloVerifier>();
 
-    pass.AddPass<BatchNormRewriter>(
+    pass.AddPass<BatchNormExpander>(
         /*rewrite_training_op=*/true,
         /*rewrite_inference_op=*/true,
         /*rewrite_grad_op=*/true,
@@ -288,6 +264,12 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
         /*is_layout_sensitive=*/false,
         [](const Shape&, const Shape&) { return false; },
         /*enable_dot_strength_reduction=*/false);
+
+    // BatchNormExpander can create zero-sized ops, so zero-sized HLO
+    // elimination has to come after that pass.
+    pipeline.AddPass<ZeroSizedHloElimination>();
+
+    pass.AddPass<WhileLoopInvariantCodeMotion>();
     pass.AddPass<TupleSimplifier>();
     pass.AddPass<WhileLoopSimplifier>();
     pass.AddPass<HloDCE>();
@@ -318,6 +300,7 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
       [](const Shape&, const Shape&) { return true; },
       /*enable_dot_strength_reduction=*/false);
   pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/true);
+  pipeline.AddPass<HloElementTypeConverter>(BF16, F32);
   // Outline ops in the entry computation into calls to subcomputations.
   const int max_parallelism =
       module->config().intra_op_parallelism_threads() > 0
@@ -435,11 +418,27 @@ Status InitializeModuleHooks(
   return Status::OK();
 }
 
+Status VerifyLlvmModule(const llvm::Module& llvm_module) {
+  XLA_SCOPED_LOGGING_TIMER("CpuCompiler - Running LLVM verifier");
+
+  std::string err;
+  llvm::raw_string_ostream err_stream(err);
+
+  // verifyModule() returns true if the module is broken.
+  TF_RET_CHECK(!llvm::verifyModule(llvm_module, &err_stream))
+      << "Invalid LLVM IR before optimizations:\n"
+      << err_stream.str()
+      << "\nThis probably indicates a bug in the HLO -> LLVM IR lowering. "
+         "Rerun with --xla_dump_ir_to to get the IR. ";
+  return Status::OK();
+}
+
 }  // namespace
 
 StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
     std::unique_ptr<HloModule> module,
-    perftools::gputools::StreamExecutor* /*stream_exec*/) {
+    perftools::gputools::StreamExecutor* /*stream_exec*/,
+    DeviceMemoryAllocator* /*device_allocator*/) {
   VLOG(2) << "Before optimization:";
   XLA_VLOG_LINES(2, module->ToString());
 
@@ -452,7 +451,8 @@ StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
 
 StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     std::unique_ptr<HloModule> module,
-    perftools::gputools::StreamExecutor* stream_exec) {
+    perftools::gputools::StreamExecutor* stream_exec,
+    DeviceMemoryAllocator* /*device_allocator*/) {
   const string timer_message =
       "Compiling [" + module->name() + "] for CPU using JIT";
   XLA_SCOPED_LOGGING_TIMER(timer_message);
@@ -460,7 +460,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   VLOG(1) << "Compiling: " << module->name();
   TF_RET_CHECK(stream_exec != nullptr);
   std::call_once(llvm_command_line_options_initialized,
-                 &InitializeLLVMCommandLineOptions, module->config());
+                 &llvm_ir::InitializeLLVMCommandLineOptions, module->config());
 
   ModuleHook pre_optimization_ir_hook;
   ModuleHook post_optimization_ir_hook;
@@ -483,17 +483,19 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   llvm_module->setDataLayout(jit->data_layout());
   llvm_module->setTargetTriple(jit->target_triple().getTriple());
 
-  HloComputation* computation = module->entry_computation();
-  std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx;
+  HloComputation* entry_computation = module->entry_computation();
+  std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx;
+  std::unordered_map<const HloComputation*, int64> computation_to_profile_idx;
   std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map;
-  std::unique_ptr<HloProfilePrinter> hlo_profile_printer;
+  std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data;
   if (module->config().hlo_profiling_enabled()) {
     hlo_profile_index_map = MakeUnique<HloProfileIndexMap>(*module);
 
     TF_ASSIGN_OR_RETURN(
-        hlo_to_profile_idx,
+        instruction_to_profile_idx,
         CollectProfileCandidates::GetCandidatesForComputation(
-            computation, hlo_profile_index_map->instruction_to_profile_idx()));
+            entry_computation,
+            hlo_profile_index_map->instruction_to_profile_idx()));
 
     auto shape_size_bytes = [](const Shape& shape) {
       // On the cpu, opaques are pointers.
@@ -504,8 +506,11 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     };
 
     HloCostAnalysis cost_analysis(shape_size_bytes);
-    hlo_profile_printer =
-        CreateHloProfilePrinter(*hlo_profile_index_map, cost_analysis);
+    TF_RETURN_IF_ERROR(entry_computation->Accept(&cost_analysis));
+    hlo_profile_printer_data =
+        CreateHloProfilePrinterData(*hlo_profile_index_map, cost_analysis);
+    computation_to_profile_idx =
+        hlo_profile_index_map->computation_to_profile_idx();
   }
 
   std::unique_ptr<Executable> cpu_executable;
@@ -514,8 +519,8 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   // ownership is std::moved.
   const bool embed_ir_in_executable =
       module->config().debug_options().xla_embed_ir_in_executable();
-  const string xla_dump_hlo_proto_to =
-      module->config().debug_options().xla_dump_hlo_proto_to();
+  const string xla_dump_optimized_hlo_proto_to =
+      module->config().debug_options().xla_dump_optimized_hlo_proto_to();
 
   if (options::CpuParallelBackendRequested(module->config())) {
     VLOG(1) << "Using parallel cpu backend";
@@ -528,17 +533,17 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     // uses data dependencies for determining order.
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<BufferAssignment> assignment,
-        BufferAssigner::Run(module.get(),
-                            xla::MakeUnique<DependencyHloOrdering>(module.get()),
-                            BufferSizeBytesFunction(), memory_alignment));
+        BufferAssigner::Run(
+            module.get(), xla::MakeUnique<DependencyHloOrdering>(module.get()),
+            BufferSizeBytesFunction(), memory_alignment));
     // BufferAssignment::ToString() includes a header, so no need for us to
     // print one ourselves.
     XLA_VLOG_LINES(2, assignment->ToString());
 
-    if (!xla_dump_hlo_proto_to.empty()) {
+    if (!xla_dump_optimized_hlo_proto_to.empty()) {
       HloProto proto = MakeHloProto(*module, *assignment);
       TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
-          proto, xla_dump_hlo_proto_to, module->name()));
+          proto, xla_dump_optimized_hlo_proto_to, module->name()));
     }
 
     // If we are using the parallel CPU backend, we need to create map from
@@ -546,7 +551,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     std::map<HloComputation*, HloInstruction*> parallel_computations;
     std::unordered_map<const HloInstruction*, std::unique_ptr<unsigned char[]>>
         aligned_constants;
-    for (auto instruction : computation->MakeInstructionPostOrder()) {
+    for (auto instruction : entry_computation->MakeInstructionPostOrder()) {
       // Parameters and constants don't get their own computation.
       if (instruction->opcode() == HloOpcode::kParameter) {
         continue;
@@ -554,7 +559,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
       if (instruction->opcode() == HloOpcode::kConstant) {
         // Copy the constant out of the ProtocolBuffer so that we can give it a
         // higher alignment.
-        const void* data = instruction->literal().InternalData();
+        const void* data = instruction->literal().untyped_data();
         int64 size = CpuExecutable::ShapeSizeBytes(instruction->shape());
         auto iter = aligned_constants.emplace(
             instruction, xla::MakeUnique<unsigned char[]>(size));
@@ -571,22 +576,15 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
       parallel_computations.emplace(to_apply, instruction);
     }
 
-    // We always profile the entire computation as a whole, even if hlo
-    // profiling is disabled.  When hlo profiling is diabled, we pass in a
-    // profile counter array of just one element, which corresponds to the whole
-    // computation.
-    size_t entry_computation_profile_idx =
-        hlo_profile_index_map ? hlo_profile_index_map->GetProfileIndexFor(
-                                    *module->entry_computation())
-                              : 0;
     IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
-                         hlo_to_profile_idx, entry_computation_profile_idx,
+                         std::move(instruction_to_profile_idx),
+                         std::move(computation_to_profile_idx),
                          jit->target_machine(), jit->external_constant_pool());
 
     std::unique_ptr<HloInstructionMap<string>> function_names(
         new HloInstructionMap<string>());
     for (auto embedded_computation :
-         computation->MakeEmbeddedComputationsList()) {
+         entry_computation->MakeEmbeddedComputationsList()) {
       if (embedded_computation->IsFusionComputation()) {
         continue;
       }
@@ -600,7 +598,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
           llvm::Function * ir_function,
           ir_emitter.EmitComputation(
               embedded_computation, embedded_computation->name(),
-              /*is_entry_computation=*/computation_is_parallel,
+              /*is_top_level_computation=*/computation_is_parallel,
               /*instruction_order=*/nullptr));
       // If this computation is parallel, remember it in the function name map.
       // This way we know what function to execute when we try to run code for
@@ -616,13 +614,14 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     if (embed_ir_in_executable) {
       ir_module_string = llvm_ir::DumpModuleToString(*llvm_module);
     }
+    TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module));
 
     // JIT compile the LLVM IR module to in-memory machine code.
     jit->AddModule(std::move(llvm_module));
     cpu_executable.reset(new ParallelCpuExecutable(
         std::move(jit), std::move(assignment), std::move(module),
         std::move(function_names), std::move(aligned_constants),
-        std::move(hlo_profile_printer), std::move(hlo_profile_index_map)));
+        std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map)));
 
     if (embed_ir_in_executable) {
       static_cast<CpuExecutable&>(*cpu_executable)
@@ -642,27 +641,19 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     // temporary buffers are required to run the computation.
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<BufferAssignment> assignment,
-        BufferAssigner::Run(
-            module.get(),
-            xla::MakeUnique<SequentialHloOrdering>(module.get(), module_sequence),
-            BufferSizeBytesFunction(), memory_alignment));
+        BufferAssigner::Run(module.get(),
+                            xla::MakeUnique<SequentialHloOrdering>(
+                                module.get(), module_sequence),
+                            BufferSizeBytesFunction(), memory_alignment));
     // BufferAssignment::ToString() includes a header, so no need for us to
     // print one ourselves.
     XLA_VLOG_LINES(2, assignment->ToString());
 
-    if (!xla_dump_hlo_proto_to.empty()) {
+    if (!xla_dump_optimized_hlo_proto_to.empty()) {
       HloProto proto = MakeHloProto(*module, *assignment);
       TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
-          proto, xla_dump_hlo_proto_to, module->name()));
+          proto, xla_dump_optimized_hlo_proto_to, module->name()));
     }
-    // We always profile the entire computation as a whole, even if hlo
-    // profiling is disabled.  When hlo profiling is diabled, we pass in a
-    // profile counter array of just one element, which corresponds to the whole
-    // computation.
-    size_t entry_computation_profile_idx =
-        hlo_profile_index_map ? hlo_profile_index_map->GetProfileIndexFor(
-                                    *module->entry_computation())
-                              : 0;
 
     // Each computation is a single function.  Emit all embedded computations
     // before the entry computation. The order of computations returned from
@@ -670,11 +661,12 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     // before a caller computation.
 
     IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
-                         hlo_to_profile_idx, entry_computation_profile_idx,
+                         std::move(instruction_to_profile_idx),
+                         std::move(computation_to_profile_idx),
                          jit->target_machine(), jit->external_constant_pool());
 
     for (auto embedded_computation :
-         computation->MakeEmbeddedComputationsList()) {
+         entry_computation->MakeEmbeddedComputationsList()) {
       if (embedded_computation->IsFusionComputation()) {
         continue;
       }
@@ -682,29 +674,33 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
           ir_emitter
               .EmitComputation(embedded_computation,
                                embedded_computation->name(),
-                               /*is_entry_computation=*/false,
+                               /*is_top_level_computation=*/false,
                                &module_sequence.at(embedded_computation))
               .status());
     }
-    string function_name_prefix =
-        computation->name().empty() ? "__compute" : computation->name();
+    string function_name_prefix = entry_computation->name().empty()
+                                      ? "__compute"
+                                      : entry_computation->name();
     TF_ASSIGN_OR_RETURN(
         llvm::Function * entry_function,
-        ir_emitter.EmitComputation(computation, function_name_prefix,
-                                   /*is_entry_computation=*/true,
-                                   &module_sequence.at(computation)));
+        ir_emitter.EmitComputation(entry_computation, function_name_prefix,
+                                   /*is_top_level_computation=*/true,
+                                   &module_sequence.at(entry_computation)));
 
     string function_name = llvm_ir::AsString(entry_function->getName());
     string ir_module_string;
     if (embed_ir_in_executable) {
       ir_module_string = llvm_ir::DumpModuleToString(*llvm_module);
     }
+    TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module));
+
+    XLA_VLOG_LINES(2, "LLVM IR:\n" + llvm_ir::DumpModuleToString(*llvm_module));
 
     // JIT compile the LLVM IR module to in-memory machine code.
     jit->AddModule(std::move(llvm_module));
     cpu_executable.reset(new CpuExecutable(
         std::move(jit), std::move(assignment), std::move(module), function_name,
-        std::move(hlo_profile_printer), std::move(hlo_profile_index_map)));
+        std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map)));
 
     if (embed_ir_in_executable) {
       static_cast<CpuExecutable&>(*cpu_executable)
@@ -721,7 +717,8 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
                                 const AotCompilationOptions& aot_options) {
   TF_RET_CHECK(!modules.empty());
   std::call_once(llvm_command_line_options_initialized,
-                 &InitializeLLVMCommandLineOptions, modules[0]->config());
+                 &llvm_ir::InitializeLLVMCommandLineOptions,
+                 modules[0]->config());
 
   // We can pass just one llvm::TargetOptions when we compile the LLVM module,
   // so we bail if the configs have conflicting flags. At the moment, the only
@@ -824,27 +821,28 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<BufferAssignment> assignment,
         BufferAssigner::Run(
-            module, xla::MakeUnique<SequentialHloOrdering>(module, module_sequence),
+            module,
+            xla::MakeUnique<SequentialHloOrdering>(module, module_sequence),
             BufferSizeBytesFunction(), memory_alignment));
     // BufferAssignment::ToString() includes a header, so no need for us to
     // print one ourselves.
     XLA_VLOG_LINES(2, assignment->ToString());
 
-    const string xla_dump_hlo_proto_to =
-        module->config().debug_options().xla_dump_hlo_proto_to();
-    if (!xla_dump_hlo_proto_to.empty()) {
+    const string xla_dump_optimized_hlo_proto_to =
+        module->config().debug_options().xla_dump_optimized_hlo_proto_to();
+    if (!xla_dump_optimized_hlo_proto_to.empty()) {
       HloProto proto = MakeHloProto(*module, *assignment);
       TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
-          proto, xla_dump_hlo_proto_to, module->name()));
+          proto, xla_dump_optimized_hlo_proto_to, module->name()));
     }
 
-    IrEmitter ir_emitter(
-        *module, *assignment, &llvm_module,
-        /*hlo_to_profile_idx=*/
-        std::unordered_map<const HloInstruction*, size_t>{},
-        /*entry_computation_profile_idx=*/tensorflow::gtl::nullopt,
-        target_machine.get(),
-        /*external_constant_pool=*/nullptr);
+    IrEmitter ir_emitter(*module, *assignment, &llvm_module,
+                         /*instruction_to_profile_idx=*/
+                         std::unordered_map<const HloInstruction*, int64>{},
+                         /*computation_to_profile_idx=*/
+                         std::unordered_map<const HloComputation*, int64>{},
+                         target_machine.get(),
+                         /*external_constant_pool=*/nullptr);
     HloComputation* computation = module->entry_computation();
     for (auto embedded_computation :
          computation->MakeEmbeddedComputationsList()) {
@@ -855,7 +853,7 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
           ir_emitter
               .EmitComputation(embedded_computation,
                                embedded_computation->name(),
-                               /*is_entry_computation=*/false,
+                               /*is_top_level_computation=*/false,
                                &module_sequence.at(embedded_computation))
               .status());
     }
@@ -863,7 +861,7 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
     TF_ASSIGN_OR_RETURN(
         llvm::Function * entry_function,
         ir_emitter.EmitComputation(computation, entry_point_name,
-                                   /*is_entry_computation=*/true,
+                                   /*is_top_level_computation=*/true,
                                    &module_sequence.at(computation)));
 
     CHECK(entry_function->getName() == llvm_ir::AsStringRef(entry_point_name));
@@ -874,14 +872,23 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
         *module, user_pre_optimization_hook_, user_post_optimization_hook_,
         &pre_optimization_ir_dump_hook, &post_optimization_ir_dump_hook));
 
+    // Run the LLVM verifier over the unoptimized LLVM IR.  If it fails, run the
+    // pre-optimization IR dump hook before returning.
+    {
+      Status verify_status = VerifyLlvmModule(llvm_module);
+      if (!verify_status.ok() && pre_optimization_ir_dump_hook) {
+        pre_optimization_ir_dump_hook(llvm_module).IgnoreError();
+      }
+      TF_RETURN_IF_ERROR(verify_status);
+    }
+
     Disassembler disassembler(*target_machine);
     CompilerFunctor compiler_functor(
         target_machine.get(), &disassembler, opt_level,
         options::OptimizeForSizeRequested(module->config()),
         module->config().debug_options().xla_enable_fast_math(),
         module->config().debug_options().xla_llvm_disable_expensive_passes(),
-        CompilerFunctor::AllIntrinsics(), pre_optimization_ir_dump_hook,
-        post_optimization_ir_dump_hook);
+        pre_optimization_ir_dump_hook, post_optimization_ir_dump_hook);
     llvm::object::OwningBinary<llvm::object::ObjectFile> object_file =
         compiler_functor(llvm_module);
     llvm::StringRef object_file_data_ref = object_file.getBinary()->getData();
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
index ebed7058d8f7968c6e03ef90d0da6b2325037eb0..3498139ab95d21383c6dc008ae5614b7bfe91148 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
@@ -118,11 +118,13 @@ class CpuCompiler : public LLVMCompiler {
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec) override;
+      perftools::gputools::StreamExecutor* stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec) override;
+      perftools::gputools::StreamExecutor* stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index e956f478b86d9816615e2902f5bbeae6d6384162..802d0a6fb46890b31d14b1fbf3b2e7d6520caccc 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -55,9 +55,9 @@ CpuExecutable::CpuExecutable(
     std::unique_ptr<const BufferAssignment> assignment,
     std::unique_ptr<const HloModule> hlo_module,
     const string& entry_function_name,
-    std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
     std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
-    : Executable(std::move(hlo_module), std::move(hlo_profile_printer),
+    : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
                  std::move(hlo_profile_index_map)),
       jit_(std::move(jit)),
       assignment_(std::move(assignment)) {
@@ -73,28 +73,6 @@ CpuExecutable::CpuExecutable(
       reinterpret_cast<ComputeFunctionType>(cantFail(sym.getAddress()));
 }
 
-// Given a pointer to an output buffer (following the CPU JIT calling
-// conventions), mark addresses that are "live". The initial pointer itself is
-// trivially live. If the shape of the buffer is a tuple, this analysis looks
-// into the tuple's elements and marks them live as well (since tuples keep
-// pointers to buffers) and also works recursively.  address is an in-memory
-// buffer address that contains some runtime XLA object.  shape is its
-// shape. marked_addresses is the set of live addresses to populate.
-static void MarkLiveAddressesInOutput(
-    const void* address, const Shape& shape,
-    std::unordered_set<const void*>* marked_addresses) {
-  marked_addresses->insert(address);
-  const uintptr_t* address_buffer = static_cast<const uintptr_t*>(address);
-  if (ShapeUtil::IsTuple(shape)) {
-    for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
-      const uintptr_t* element_address = address_buffer + i;
-      const void* element = reinterpret_cast<const void*>(*element_address);
-      MarkLiveAddressesInOutput(
-          element, ShapeUtil::GetTupleElementShape(shape, i), marked_addresses);
-    }
-  }
-}
-
 Status CpuExecutable::AllocateBuffers(
     DeviceMemoryAllocator* memory_allocator, int device_ordinal,
     std::vector<perftools::gputools::DeviceMemoryBase>* buffers) {
@@ -148,20 +126,6 @@ Status CpuExecutable::ExecuteComputeFunction(
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
     HloExecutionProfile* hlo_execution_profile) {
-  std::vector<se::DeviceMemoryBase> argument_buffers;
-  argument_buffers.reserve(arguments.size());
-  for (const auto* argument : arguments) {
-    argument_buffers.push_back(argument->buffer(/*index=*/{}));
-  }
-  return ExecuteComputeFunction(run_options, argument_buffers, buffers,
-                                hlo_execution_profile);
-}
-
-Status CpuExecutable::ExecuteComputeFunction(
-    const ExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
-    HloExecutionProfile* hlo_execution_profile) {
   // The calling convention for JITed functions is:
   //
   //  void function(void* result, const void* run_options, void** args_array,
@@ -177,23 +141,19 @@ Status CpuExecutable::ExecuteComputeFunction(
   //               determined by buffer analysis.
   //
   std::vector<const void*> args_array;
-  for (se::DeviceMemoryBase arg_mem : arguments) {
-    args_array.push_back(arg_mem.opaque());
+  for (const ShapedBuffer* argument : arguments) {
+    args_array.push_back(argument->root_buffer().opaque());
   }
 
   uint64 start_micros = tensorflow::Env::Default()->NowMicros();
 
-  // Allocate profiling counters for each hlo instruction that we would like to
-  // profile.  Even when not Hlo profiling, we allocate a counter for the entire
-  // computation, which we use to update ExecutionProfile below.
-  std::vector<int64>* profile_counters = nullptr;
-  std::vector<int64> profile_counter_for_entry_computation;
-  if (hlo_execution_profile) {
-    profile_counters = hlo_execution_profile->mutable_profile_counters();
-  } else {
-    profile_counters = &profile_counter_for_entry_computation;
-    profile_counter_for_entry_computation.push_back(0);
-  }
+  size_t profile_counters_size =
+      hlo_execution_profile ? hlo_execution_profile->profile_counters().size()
+                            : 0;
+  int64* profile_counters =
+      hlo_execution_profile
+          ? hlo_execution_profile->mutable_profile_counters()->data()
+          : nullptr;
 
   // Call the computation function following the calling convention.
   std::vector<void*> buffer_pointers;
@@ -208,7 +168,7 @@ Status CpuExecutable::ExecuteComputeFunction(
     VLOG(3) << tensorflow::strings::Printf(
         "  func(void* result, void* params[%zu], void* temps[%zu], "
         "uint64 profile_counters[%zu])",
-        args_array.size(), buffer_pointers.size(), profile_counters->size());
+        args_array.size(), buffer_pointers.size(), profile_counters_size);
     VLOG(3) << tensorflow::strings::Printf("    result = %p", result_buffer);
     auto ptr_printer = [](string* out, const void* p) {
       tensorflow::strings::StrAppend(out, tensorflow::strings::Printf("%p", p));
@@ -220,11 +180,11 @@ Status CpuExecutable::ExecuteComputeFunction(
         "    temps = [%s]",
         tensorflow::str_util::Join(buffer_pointers, ", ", ptr_printer).c_str());
     VLOG(3) << tensorflow::strings::Printf("    profile_counters = %p",
-                                           profile_counters->data());
+                                           profile_counters);
   }
 
   compute_function_(result_buffer, run_options, args_array.data(),
-                    buffer_pointers.data(), profile_counters->data());
+                    buffer_pointers.data(), profile_counters);
 
   uint64 end_micros = tensorflow::Env::Default()->NowMicros();
 
@@ -232,13 +192,11 @@ Status CpuExecutable::ExecuteComputeFunction(
     tensorflow::mutex_lock lock(mutex_);
     const double nanoseconds = (end_micros - start_micros) * 1000.0;
     execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
-
+    // If hlo profiling was disabled then the cycle count is left empty.
     if (hlo_execution_profile) {
       execution_profile_.set_compute_cycle_count(
           hlo_execution_profile->total_cycles_executed(
               *module().entry_computation()));
-    } else {
-      execution_profile_.set_compute_cycle_count(profile_counters->back());
     }
   }
 
@@ -246,11 +204,23 @@ Status CpuExecutable::ExecuteComputeFunction(
 }
 
 static void LogLiveAddresses(
-    const std::unordered_set<const void*>& marked_addresses) {
+    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
+    const std::vector<bool>& buffers_in_result) {
+  if (!VLOG_IS_ON(3)) {
+    return;
+  }
+
+  CHECK_EQ(buffers.size(), buffers_in_result.size());
+  std::vector<const void*> live_out_buffers;
+  for (int i = 0; i < buffers.size(); ++i) {
+    if (buffers_in_result[i]) {
+      live_out_buffers.push_back(buffers[i].opaque());
+    }
+  }
   VLOG(3) << "Live addresses in output marking found "
-          << marked_addresses.size() << " addresses:\n"
+          << live_out_buffers.size() << " addresses:\n"
           << tensorflow::str_util::Join(
-                 marked_addresses, ", ", [](string* out, const void* address) {
+                 live_out_buffers, ", ", [](string* out, const void* address) {
                    tensorflow::strings::StrAppend(
                        out, tensorflow::strings::Printf("%p", address));
                  });
@@ -259,13 +229,12 @@ static void LogLiveAddresses(
 static Status DeallocateTempBuffers(
     DeviceMemoryAllocator* allocator, se::Stream* stream,
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
-    const std::unordered_set<const void*>& marked_addresses) {
-  // Keep those marked live because they are referenced by the output of the
-  // computation and are needed by the service. They will be deallocated by the
-  // service.
+    const std::vector<bool>& buffers_in_result) {
+  // Keep those buffers in the output of the marked live because they are needed
+  // by the service. They will be deallocated by the service.
   for (size_t i = 0; i < buffers.size(); ++i) {
     se::DeviceMemoryBase alloc = buffers[i];
-    if (marked_addresses.count(alloc.opaque()) == 0 && !alloc.is_null()) {
+    if (!buffers_in_result[i] && !alloc.is_null()) {
       VLOG(3) << "CpuExecutable deallocating buffer #" << i << " ["
               << alloc.opaque() << "]";
       TF_RETURN_IF_ERROR(
@@ -276,33 +245,43 @@ static Status DeallocateTempBuffers(
   return Status::OK();
 }
 
-StatusOr<perftools::gputools::DeviceMemoryBase> CpuExecutable::ExecuteOnStream(
+StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::CreateResultShapedBuffer(
     const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
+    tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
+        allocated_buffers,
+    std::vector<bool>* buffers_in_result) {
   se::Stream* stream = run_options->stream();
-  DeviceMemoryAllocator* memory_allocator = run_options->allocator();
-  std::vector<se::DeviceMemoryBase> buffers(assignment_->Allocations().size());
+  auto result_buffer = MakeUnique<ShapedBuffer>(
+      /*on_host_shape=*/result_shape(), /*on_device_shape=*/result_shape(),
+      stream->parent()->platform(), stream->parent()->device_ordinal());
 
-  TF_RETURN_IF_ERROR(AllocateBuffers(
-      memory_allocator, stream->parent()->device_ordinal(), &buffers));
-  TF_RETURN_IF_ERROR(ExecuteComputeFunction(
-      &run_options->run_options(), arguments, buffers, hlo_execution_profile));
-
-  // Mark the buffers that are actually live (used in the output) when the
-  // computation finishes executing.
-  std::unordered_set<const void*> marked_addresses;
-  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
-                      assignment_->GetUniqueTopLevelOutputSlice());
-  se::DeviceMemoryBase top_level_output = buffers[result_slice.index()];
-  MarkLiveAddressesInOutput(top_level_output.opaque(), result_shape(),
-                            &marked_addresses);
-
-  LogLiveAddresses(marked_addresses);
-  TF_RETURN_IF_ERROR(DeallocateTempBuffers(memory_allocator, stream, buffers,
-                                           marked_addresses));
-
-  return top_level_output;
+  // Copy DeviceMemoryBase values which contain the array(s) of the result into
+  // the respective location in ShapedBuffer which is returned to the caller.
+  TF_RETURN_IF_ERROR(result_buffer->buffers().ForEachMutableElementWithStatus(
+      [&](const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
+        const auto& sources = this->GetRootPointsToSet().element(index);
+        // The points to set is unambiguous so the set should be a
+        // singleton.
+        CHECK_EQ(1, sources.size());
+        const LogicalBuffer* buffer_source = sources[0];
+        HloInstruction* src = buffer_source->instruction();
+
+        // The source for this result buffer can be a nested buffer such as
+        // a tuple element. The source instruction should have a
+        // non-parameter buffer assigned.
+        TF_ASSIGN_OR_RETURN(
+            const BufferAllocation::Slice slice,
+            this->assignment_->GetUniqueSlice(src, buffer_source->index()));
+        CHECK(!slice.allocation()->is_entry_computation_parameter());
+
+        const BufferAllocation::Index buffer_index = slice.index();
+        const se::DeviceMemoryBase& buffer = allocated_buffers[buffer_index];
+        CHECK(!buffer.is_null() || buffer.size() == 0);
+        *device_memory = buffer;
+        (*buffers_in_result)[buffer_index] = true;
+        return Status::OK();
+      }));
+  return std::move(result_buffer);
 }
 
 StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteOnStream(
@@ -317,67 +296,26 @@ StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteOnStream(
   DeviceMemoryAllocator* memory_allocator = run_options->allocator();
   std::vector<se::DeviceMemoryBase> buffers(assignment_->Allocations().size());
 
-  auto result_buffer =
-      MakeUnique<ShapedBuffer>(result_shape(), stream->parent()->platform(),
-                               stream->parent()->device_ordinal());
-
   TF_RETURN_IF_ERROR(AllocateBuffers(
       memory_allocator, stream->parent()->device_ordinal(), &buffers));
   TF_RETURN_IF_ERROR(ExecuteComputeFunction(
       &run_options->run_options(), arguments, buffers, hlo_execution_profile));
 
-  // Copy DeviceMemoryBase values which contain the array(s) of the result into
-  // the respective location in ShapedBuffer which is returned to the caller.
   std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
-  TF_RETURN_IF_ERROR(
-      result_buffer->mutable_shape_index_to_buffer_entry()
-          ->ForEachMutableElementWithStatus(
-              [&buffers, &buffers_in_result, &result_buffer, this](
-                  const ShapeIndex& index, size_t* buffer_entry) {
-                const auto& sources = this->GetRootPointsToSet().element(index);
-                // The points to set is unambiguous so the set should be a
-                // singleton.
-                CHECK_EQ(1, sources.size());
-                const LogicalBuffer* buffer_source = sources[0];
-                HloInstruction* src = buffer_source->instruction();
-
-                // The source for this result buffer can be a nested buffer
-                // such as a tuple element.
-
-                // The source instruction should have a non-parameter buffer
-                // assigned.
-                TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice slice,
-                                    this->assignment_->GetUniqueSlice(
-                                        src, buffer_source->index()));
-                CHECK(!slice.allocation()->is_entry_computation_parameter());
-
-                const BufferAllocation::Index buffer_index = slice.index();
-                const se::DeviceMemoryBase& buffer = buffers[buffer_index];
-                CHECK(!buffer.is_null() || buffer.size() == 0);
-                *buffer_entry = result_buffer->mutable_buffers()->size();
-                result_buffer->mutable_buffers()->push_back(buffer);
-                buffers_in_result[buffer_index] = true;
-                return Status::OK();
-              }));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<ShapedBuffer> result_buffer,
+      CreateResultShapedBuffer(run_options, buffers, &buffers_in_result));
 
   // Free all buffers not in the result.
-  for (size_t i = 0; i < buffers.size(); ++i) {
-    se::DeviceMemoryBase alloc = buffers[i];
-    if (!buffers_in_result[i] && !alloc.is_null()) {
-      VLOG(3) << "CpuExecutable deallocating buffer #" << i << " ["
-              << alloc.opaque() << "]";
-      TF_RETURN_IF_ERROR(memory_allocator->Deallocate(
-          stream->parent()->device_ordinal(), &alloc));
-    }
-  }
+  TF_RETURN_IF_ERROR(DeallocateTempBuffers(memory_allocator, stream, buffers,
+                                           buffers_in_result));
 
   return std::move(result_buffer);
 }
 
-StatusOr<perftools::gputools::DeviceMemoryBase>
-CpuExecutable::ExecuteAsyncOnStream(
+StatusOr<std::unique_ptr<ShapedBuffer>> CpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) {
+    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   if (hlo_profiling_enabled()) {
     return Unimplemented(
         "Asynchronous execution on stream with hlo profiling is not yet "
@@ -393,29 +331,25 @@ CpuExecutable::ExecuteAsyncOnStream(
   TF_RETURN_IF_ERROR(AllocateBuffers(
       memory_allocator, stream->parent()->device_ordinal(), &buffers));
 
-  // Mark the buffers that are actually live (used in the output) when the
-  // computation finishes executing.
-  std::unordered_set<const void*> marked_addresses;
-  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
-                      assignment_->GetUniqueTopLevelOutputSlice());
-  se::DeviceMemoryBase top_level_output = buffers[result_slice.index()];
-  MarkLiveAddressesInOutput(top_level_output.opaque(), result_shape(),
-                            &marked_addresses);
+  std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<ShapedBuffer> result_buffer,
+      CreateResultShapedBuffer(run_options, buffers, &buffers_in_result));
 
-  LogLiveAddresses(marked_addresses);
+  LogLiveAddresses(buffers, buffers_in_result);
 
   host_stream->EnqueueTask([this, run_options, arguments, buffers,
-                            marked_addresses, memory_allocator, stream]() {
+                            buffers_in_result, memory_allocator, stream]() {
     // Failing a CHECK here is not great, but I don't see an obvious way to
     // return a failed Status asynchronously.
     TF_CHECK_OK(ExecuteComputeFunction(&run_options->run_options(), arguments,
                                        buffers,
                                        /*hlo_execution_profile=*/nullptr));
     TF_CHECK_OK(DeallocateTempBuffers(memory_allocator, stream, buffers,
-                                      marked_addresses));
+                                      buffers_in_result));
   });
 
-  return top_level_output;
+  return std::move(result_buffer);
 }
 
 /*static*/ int64 CpuExecutable::ShapeSizeBytes(const Shape& shape) {
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 17ee2d673ee7cde1847bf29e2399e6033cb7e30e..267b89a10b3c038dc2048f0ad5b5b343c88ef0f9 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -51,25 +51,18 @@ class CpuExecutable : public Executable {
                 std::unique_ptr<const BufferAssignment> assignment,
                 std::unique_ptr<const HloModule> hlo_module,
                 const string& entry_function_name,
-                std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+                std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
                 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
   ~CpuExecutable() override {}
 
-  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
-      const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
-      HloExecutionProfile* hlo_execution_profile) override;
-
   StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteAsyncOnStream(
+  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments) override;
+      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
   // This should be called after set_ir_module_string.
   const string& ir_module_string() const { return ir_module_string_; }
@@ -108,13 +101,6 @@ class CpuExecutable : public Executable {
 
   // Calls the generated function performing the computation with the given
   // arguments using the supplied buffers.
-  Status ExecuteComputeFunction(
-      const ExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          buffers,
-      HloExecutionProfile* hlo_execution_profile);
   Status ExecuteComputeFunction(
       const ExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
@@ -122,6 +108,18 @@ class CpuExecutable : public Executable {
           buffers,
       HloExecutionProfile* hlo_execution_profile);
 
+  // Create a ShapedBuffer for holding the result of the computation. The
+  // addresses (DeviceMemoryBases) are set according to buffer assignment.
+  // 'buffers_in_result' should point to a vector of the same size as
+  // 'allocated_buffers'. An element in buffers_in_result is set to true if the
+  // corresponding buffer is live out of the computation (and thus contained in
+  // the returned ShapedBuffer).
+  StatusOr<std::unique_ptr<ShapedBuffer>> CreateResultShapedBuffer(
+      const ServiceExecutableRunOptions* run_options,
+      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
+          allocated_buffers,
+      std::vector<bool>* buffers_in_result);
+
   // Returns the points-to set of the root instruction of the entry
   // computation. Uses points-to analysis from buffer assignment.
   const PointsToSet& GetRootPointsToSet() const;
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7bd4741a04b1135d9780e0cf765b7b33378526e1
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc
@@ -0,0 +1,48 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.h"
+
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+
+StatusOr<bool> CpuHloSupportChecker::Run(HloModule* module) {
+  for (auto* computation : module->computations()) {
+    for (const auto& instruction : computation->instructions()) {
+      TF_RETURN_IF_ERROR(
+          ShapeUtil::ValidateShapeWithOptionalLayout(instruction->shape()));
+      TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+          instruction->shape(),
+          [&instruction](const Shape& subshape, const ShapeIndex&) {
+            if (LayoutUtil::IsSparseArray(subshape)) {
+              return xla::Unimplemented(
+                  "CPU backend does not support HLO instruction %s with shape "
+                  "containing a sparse layout: %s",
+                  instruction->ToString().c_str(),
+                  ShapeUtil::HumanStringWithLayout(instruction->shape())
+                      .c_str());
+            }
+            return Status::OK();
+          }));
+    }
+  }
+  return false;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.h b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.h
new file mode 100644
index 0000000000000000000000000000000000000000..2924b6365943f0a3ec998d7a77767a76cbb576ae
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_HLO_SUPPORT_CHECKER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_HLO_SUPPORT_CHECKER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// This pass should run early in the HLO pipeline and checks for HLO constructs
+// which are not supported by the CPU backend and cannot be removed via HLO
+// transformations (eg, sparse layouts).
+class CpuHloSupportChecker : public HloPassInterface {
+ public:
+  CpuHloSupportChecker() = default;
+  ~CpuHloSupportChecker() override = default;
+
+  tensorflow::StringPiece name() const override {
+    return "cpu_hlo_support_checker";
+  }
+
+  // Note: always returns false (no instructions are ever modified by this
+  // pass).
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_HLO_SUPPORT_CHECKER_H_
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0f463e6de623fc6ab43d685ff2a5d6882ba7b8a2
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker_test.cc
@@ -0,0 +1,72 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+using ::testing::HasSubstr;
+
+class CpuHloSupportCheckerTest : public HloTestBase {
+ protected:
+  CpuHloSupportChecker& checker() { return checker_; }
+
+ private:
+  CpuHloSupportChecker checker_;
+};
+
+TEST_F(CpuHloSupportCheckerTest, Add) {
+  HloComputation::Builder builder(TestName());
+  const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape, "param1"));
+  builder.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape, HloOpcode::kAdd, param0, param1));
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
+
+  TF_ASSERT_OK(checker().Run(module.get()).status());
+}
+
+TEST_F(CpuHloSupportCheckerTest, SparseUnimplemented) {
+  HloComputation::Builder builder(TestName());
+  const Shape sparse_shape = ShapeUtil::MakeShapeWithSparseLayout(F32, {10}, 2);
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, sparse_shape, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, sparse_shape, "param1"));
+  builder.AddInstruction(HloInstruction::CreateBinary(
+      sparse_shape, HloOpcode::kAdd, param0, param1));
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
+
+  Status status = checker().Run(module.get()).status();
+  ASSERT_EQ(status.code(), tensorflow::error::UNIMPLEMENTED);
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("CPU backend does not support"));
+  EXPECT_THAT(status.error_message(),
+              HasSubstr(ShapeUtil::HumanStringWithLayout(sparse_shape)));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
index f87ee3cecd932faac140636a3db7cd4aa0371b85..482e04052d5a914eab0e5bff2c7a83f3b698052f 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
@@ -26,7 +26,7 @@ int64 BytesInDimension(const Shape& shape, int64 dimension) {
          shape.dimensions(dimension);
 }
 
-bool IsFusile(const HloInstruction& hlo) {
+bool CanBeLoopFused(const HloInstruction& hlo) {
   // These are the only ones we fuse since we rely on effective elemental IR
   // generation.
   return hlo.IsElementwise() ||  //
@@ -42,6 +42,23 @@ bool IsFusile(const HloInstruction& hlo) {
          hlo.opcode() == HloOpcode::kTranspose;
 }
 
+bool IsMatrixVectorDot(const HloInstruction* hlo) {
+  const Shape& hlo_shape = hlo->shape();
+  return hlo->opcode() == HloOpcode::kDot && hlo_shape.dimensions_size() == 2 &&
+         (hlo_shape.dimensions(0) == 1 || hlo_shape.dimensions(1) == 1);
+}
+
+bool CanBeOutputFused(const HloInstruction* producer,
+                      const HloInstruction* consumer) {
+  return consumer->opcode() == HloOpcode::kAdd && IsMatrixVectorDot(producer) &&
+         producer->user_count() == 1;
+}
+
+bool CanBeOutputFusedIntoSomeOperand(const HloInstruction* consumer) {
+  return consumer->opcode() == HloOpcode::kAdd &&
+         (CanBeOutputFused(consumer->operand(0), consumer) ||
+          CanBeOutputFused(consumer->operand(1), consumer));
+}
 }  // namespace
 
 bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
@@ -52,7 +69,15 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
 
   constexpr int kFusionThresholdBytes = 16 * 1024;
 
-  if (!IsFusile(*producer)) {
+  if (CanBeOutputFused(producer, consumer)) {
+    return true;
+  }
+
+  if (CanBeOutputFusedIntoSomeOperand(producer)) {
+    return false;
+  }
+
+  if (!CanBeLoopFused(*producer)) {
     VLOG(2) << "Producer is not fusile.";
     return false;
   }
@@ -108,16 +133,13 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
     }
   }
 
-  if (consumer->opcode() == HloOpcode::kFusion) {
-    // InstructionFusion::ShouldFuse above only allows kLoop and kInput fusions.
-    // The CPU backend does not create kInput fusions, so we only expect to see
-    // kLoop here.
-    CHECK(consumer->fusion_kind() == HloInstruction::FusionKind::kLoop);
+  if (consumer->opcode() == HloOpcode::kFusion &&
+      consumer->fusion_kind() == HloInstruction::FusionKind::kLoop) {
     VLOG(2) << "Fusing: consumer is a fusion node.";
     return true;
   }
 
-  if (IsFusile(*consumer)) {
+  if (CanBeLoopFused(*consumer)) {
     VLOG(2) << "Fusing: consumer is elementwise or fusile.";
     return true;
   }
@@ -126,5 +148,11 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
   return false;
 }
 
+HloInstruction::FusionKind CpuInstructionFusion::ChooseKind(
+    const HloInstruction* producer, const HloInstruction* consumer) {
+  return CanBeOutputFused(producer, consumer)
+             ? HloInstruction::FusionKind::kOutput
+             : HloInstruction::FusionKind::kLoop;
+}
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h
index 0eca4c3473e1454fe5dbd8bf855b4418cf553a94..07aff34974e0cfa6c7a129f82017b280fb1ccd59 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h
@@ -30,6 +30,8 @@ class CpuInstructionFusion : public InstructionFusion {
 
  protected:
   bool ShouldFuse(HloInstruction* consumer, int64 operand_index) override;
+  HloInstruction::FusionKind ChooseKind(
+      const HloInstruction* producer, const HloInstruction* consumer) override;
 };
 
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
index b9e4d006d77ae76e33ac51440349400ea4eff118..595c3f55b321f47e2312b93e0c238c7637495d77 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
@@ -31,6 +31,14 @@ namespace {
 
 using InstructionFusionTest = HloTestBase;
 
+std::unique_ptr<HloInstruction> MakeDot(const Shape& shape, HloInstruction* lhs,
+                                        HloInstruction* rhs) {
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  return HloInstruction::CreateDot(shape, lhs, rhs, dot_dnums);
+}
+
 TEST_F(InstructionFusionTest, DotOperationFusion_Basic_0) {
   HloComputation::Builder builder(TestName());
   HloInstruction* arg0 = builder.AddInstruction(HloInstruction::CreateParameter(
@@ -40,8 +48,8 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Basic_0) {
 
   HloInstruction* exp0 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {1024, 256}), HloOpcode::kExp, arg0));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {1024, 1}), HloOpcode::kDot, exp0, arg1));
+  HloInstruction* dot = builder.AddInstruction(
+      MakeDot(ShapeUtil::MakeShape(F32, {1024, 1}), exp0, arg1));
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
@@ -59,8 +67,8 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Basic_1) {
 
   HloInstruction* exp1 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {256, 1024}), HloOpcode::kExp, arg1));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {1, 1024}), HloOpcode::kDot, arg0, exp1));
+  HloInstruction* dot = builder.AddInstruction(
+      MakeDot(ShapeUtil::MakeShape(F32, {1, 1024}), arg0, exp1));
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
@@ -80,8 +88,8 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Bitcast) {
       ShapeUtil::MakeShape(S32, {2, 512, 2, 128}), HloOpcode::kExp, arg0));
   HloInstruction* bitcast0 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {1024, 256}), HloOpcode::kBitcast, exp0));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {1024, 1}), HloOpcode::kDot, bitcast0, arg1));
+  HloInstruction* dot = builder.AddInstruction(
+      MakeDot(ShapeUtil::MakeShape(F32, {1024, 1}), bitcast0, arg1));
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
@@ -102,8 +110,8 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Reshape) {
   HloInstruction* reshape0 =
       builder.AddInstruction(HloInstruction::CreateReshape(
           ShapeUtil::MakeShape(S32, {1024, 256}), exp0));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {1024, 1}), HloOpcode::kDot, reshape0, arg1));
+  HloInstruction* dot = builder.AddInstruction(
+      MakeDot(ShapeUtil::MakeShape(F32, {1024, 1}), reshape0, arg1));
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
@@ -121,8 +129,8 @@ TEST_F(InstructionFusionTest, DotOperationFusion_TooLarge) {
 
   HloInstruction* exp1 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {256, 32 * 1024}), HloOpcode::kExp, arg1));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {1, 32 * 1024}), HloOpcode::kDot, arg0, exp1));
+  HloInstruction* dot = builder.AddInstruction(
+      MakeDot(ShapeUtil::MakeShape(F32, {1, 32 * 1024}), arg0, exp1));
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
@@ -140,8 +148,8 @@ TEST_F(InstructionFusionTest, DotOperationFusion_ElementReuse) {
 
   HloInstruction* exp1 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {256, 1024}), HloOpcode::kExp, arg1));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {2, 1024}), HloOpcode::kDot, arg0, exp1));
+  HloInstruction* dot = builder.AddInstruction(
+      MakeDot(ShapeUtil::MakeShape(F32, {2, 1024}), arg0, exp1));
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
@@ -162,8 +170,8 @@ TEST_F(InstructionFusionTest, DotOperationFusion_TransposeFusion) {
   HloInstruction* transpose1 =
       builder.AddInstruction(HloInstruction::CreateTranspose(
           ShapeUtil::MakeShape(S32, {256, 1024}), exp1, {1, 0}));
-  builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {1, 1024}), HloOpcode::kDot, arg0, transpose1));
+  builder.AddInstruction(
+      MakeDot(ShapeUtil::MakeShape(F32, {1, 1024}), arg0, transpose1));
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
@@ -188,7 +196,9 @@ class OpcodeFusionTest : public InstructionFusionTest {
   // Runs CPU instruction fusion on the given module, and tests that the result
   // contains a fused op at the root with exactly the given multiset of opcodes.
   void RunFusionAndCheckOpcodesWereFused(
-      HloModule* module, const std::multiset<HloOpcode>& expected_opcodes) {
+      HloModule* module, const std::multiset<HloOpcode>& expected_opcodes,
+      HloInstruction::FusionKind fusion_kind =
+          HloInstruction::FusionKind::kLoop) {
     auto computation = module->entry_computation();
     auto did_fusion = CpuInstructionFusion().Run(module);
     ASSERT_TRUE(did_fusion.ok());
@@ -196,7 +206,7 @@ class OpcodeFusionTest : public InstructionFusionTest {
 
     HloInstruction* root = computation->root_instruction();
     ASSERT_THAT(root, op::Fusion());
-    EXPECT_EQ(root->fusion_kind(), HloInstruction::FusionKind::kLoop);
+    EXPECT_EQ(root->fusion_kind(), fusion_kind);
 
     std::vector<HloOpcode> fused_opcodes(root->fused_instruction_count());
     std::transform(root->fused_instructions().begin(),
@@ -608,6 +618,88 @@ TEST_F(OpcodeFusionTest, ReuseViaImplicitBroadcastBinary) {
               Not(op::Fusion()));
 }
 
+void CreateComputationForDotAddOutputFusionTest(const string& test_name,
+                                                HloModule* module, int m, int k,
+                                                int n,
+                                                bool add_extra_use_for_dot) {
+  HloComputation::Builder builder(test_name);
+
+  Shape dot_lhs_shape = ShapeUtil::MakeShape(F32, {m, k});
+  Shape dot_rhs_shape = ShapeUtil::MakeShape(F32, {k, n});
+  Shape dot_shape = ShapeUtil::MakeShape(F32, {m, n});
+
+  auto* dot_lhs = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, dot_lhs_shape, "param0"));
+  auto* dot_rhs = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, dot_rhs_shape, "param1"));
+  auto* addend = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, dot_shape, "param2"));
+
+  auto* dot = builder.AddInstruction(
+      HloInstruction::CreateCanonicalDot(dot_shape, dot_lhs, dot_rhs));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(dot_shape, HloOpcode::kAdd, dot, addend));
+
+  if (add_extra_use_for_dot) {
+    builder.AddInstruction(
+        HloInstruction::CreateOutfeed(dot_shape, dot, "no_config"));
+  }
+
+  module->AddEntryComputation(builder.Build());
+}
+
+TEST_F(OpcodeFusionTest, DotAddOutputFusion_1x50x19) {
+  auto module = CreateNewModule();
+  CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/1,
+                                             /*k=*/50, /*n=*/19,
+                                             /*add_extra_use_for_dot=*/false);
+
+  RunFusionAndCheckOpcodesWereFused(
+      module.get(),
+      {HloOpcode::kDot, HloOpcode::kAdd, HloOpcode::kParameter,
+       HloOpcode::kParameter, HloOpcode::kParameter},
+      HloInstruction::FusionKind::kOutput);
+}
+
+TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1) {
+  auto module = CreateNewModule();
+  CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/19,
+                                             /*k=*/50, /*n=*/1,
+                                             /*add_extra_use_for_dot=*/false);
+
+  RunFusionAndCheckOpcodesWereFused(
+      module.get(),
+      {HloOpcode::kDot, HloOpcode::kAdd, HloOpcode::kParameter,
+       HloOpcode::kParameter, HloOpcode::kParameter},
+      HloInstruction::FusionKind::kOutput);
+}
+
+TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x19) {
+  auto module = CreateNewModule();
+  CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/19,
+                                             /*k=*/50, /*n=*/19,
+                                             /*add_extra_use_for_dot=*/false);
+
+  TF_ASSERT_OK_AND_ASSIGN(bool fused_something,
+                          CpuInstructionFusion().Run(module.get()));
+  EXPECT_FALSE(fused_something);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              Not(op::Fusion()));
+}
+
+TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1_multi_use) {
+  auto module = CreateNewModule();
+  CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/19,
+                                             /*k=*/50, /*n=*/1,
+                                             /*add_extra_use_for_dot=*/true);
+
+  TF_ASSERT_OK_AND_ASSIGN(bool fused_something,
+                          CpuInstructionFusion().Run(module.get()));
+  EXPECT_FALSE(fused_something);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              Not(op::Fusion()));
+}
+
 }  // namespace
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/layout_assignment.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
similarity index 55%
rename from tensorflow/compiler/xla/service/cpu/layout_assignment.cc
rename to tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
index 3f2d101959db50d9f775097f01d5a2ba25a0da8c..e8117377e61a4e21b8c45b929c518a18878fcb60 100644
--- a/tensorflow/compiler/xla/service/cpu/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/cpu/layout_assignment.h"
+#include "tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h"
 
 #include <numeric>
 
@@ -25,58 +25,77 @@ limitations under the License.
 namespace xla {
 namespace cpu {
 
-Status CpuLayoutAssignment::AddBackendConstraints(
-    LayoutConstraints* constraints) {
-  auto row_major_shape = [](const Shape& old_shape) {
-    Shape new_shape(old_shape);
-    std::vector<int64> dimension_order(new_shape.dimensions_size());
-    std::iota(dimension_order.rbegin(), dimension_order.rend(), 0);
-    *new_shape.mutable_layout() = LayoutUtil::MakeLayout(dimension_order);
-    return new_shape;
-  };
-  auto col_major_shape = [](const Shape& old_shape) {
-    Shape new_shape(old_shape);
-    std::vector<int64> dimension_order(new_shape.dimensions_size());
-    std::iota(dimension_order.begin(), dimension_order.end(), 0);
-    *new_shape.mutable_layout() = LayoutUtil::MakeLayout(dimension_order);
-    return new_shape;
-  };
-
-  // We want to change the layout of constant arrays to be column major when all
-  // of their users are dot operations that can be made faster with the flipped
-  // layout.  To avoid going quadriatic over the # of instructions, we cache
-  // this property in should_make_rhs_col_major -- it maps a constant to true if
-  // all of the users of said constant are dot operations that can be sped up.
-  // This cache is populated lazily as we encounter dot operations traversing
-  // the instruction stream.
-  tensorflow::gtl::FlatMap<const HloInstruction*, bool>
-      should_make_rhs_col_major_cache;
-  auto should_make_rhs_col_major = [&](const HloInstruction& instruction) {
-    if (ProfitableToImplementDotInUntiledLlvmIr(instruction) !=
-        DotInLlvmIrProfitable::kWithColumnMajorRhs) {
+// We want to change the layout of constant arrays to be column major when all
+// of their users are dot operations that can be made faster with the flipped
+// layout.  To avoid going quadriatic over the # of instructions, we cache this
+// property in should_make_rhs_col_major -- it maps a constant to true if all of
+// the users of said constant are dot operations that can be sped up.  This
+// cache is populated lazily as we encounter dot operations traversing the
+// instruction stream.
+
+namespace {
+using ::tensorflow::gtl::nullopt;
+using ::tensorflow::gtl::optional;
+
+using ShouldMakeOperandColMajorCache =
+    tensorflow::gtl::FlatMap<const HloInstruction*, bool>;
+}  // namespace
+
+static bool ShouldMakeAllUsersColMajor(const HloInstruction* instruction) {
+  for (auto* user : instruction->users()) {
+    optional<int64> operand_idx = ProfitableToMakeDotOperandColumnMajor(*user);
+    if (!operand_idx || user->operand(*operand_idx) != instruction ||
+        std::count(user->operands().begin(), user->operands().end(),
+                   instruction) != 1) {
       return false;
     }
+  }
+  return true;
+}
 
-    const auto* rhs = instruction.operand(1);
-    if (rhs->opcode() != HloOpcode::kConstant) {
-      return false;
-    }
+static optional<int64> ShouldMakeOperandColumnMajor(
+    ShouldMakeOperandColMajorCache* cache, const HloInstruction& instruction) {
+  optional<int64> operand_idx =
+      ProfitableToMakeDotOperandColumnMajor(instruction);
+  if (!operand_idx) {
+    return nullopt;
+  }
 
-    auto it = should_make_rhs_col_major_cache.find(rhs);
-    if (it != should_make_rhs_col_major_cache.end()) {
-      return it->second;
-    }
+  const HloInstruction* operand = instruction.operand(*operand_idx);
+  if (operand->opcode() != HloOpcode::kConstant) {
+    return nullopt;
+  }
 
-    bool result = std::all_of(
-        rhs->users().begin(), rhs->users().end(), [&](HloInstruction* user) {
-          return ProfitableToImplementDotInUntiledLlvmIr(*user) ==
-                     DotInLlvmIrProfitable::kWithColumnMajorRhs &&
-                 user->operand(0) != rhs;
-        });
+  auto it = cache->find(operand);
+  if (it == cache->end()) {
+    auto insert_result =
+        cache->insert({operand, ShouldMakeAllUsersColMajor(operand)});
+    CHECK(insert_result.second);
+    it = insert_result.first;
+  }
 
-    InsertOrDie(&should_make_rhs_col_major_cache, rhs, result);
-    return result;
-  };
+  return it->second ? operand_idx : nullopt;
+}
+
+static Shape RowMajorShape(const Shape& old_shape) {
+  Shape new_shape(old_shape);
+  std::vector<int64> dimension_order(new_shape.dimensions_size());
+  std::iota(dimension_order.rbegin(), dimension_order.rend(), 0);
+  *new_shape.mutable_layout() = LayoutUtil::MakeLayout(dimension_order);
+  return new_shape;
+}
+
+static Shape ColMajorShape(const Shape& old_shape) {
+  Shape new_shape(old_shape);
+  std::vector<int64> dimension_order(new_shape.dimensions_size());
+  std::iota(dimension_order.begin(), dimension_order.end(), 0);
+  *new_shape.mutable_layout() = LayoutUtil::MakeLayout(dimension_order);
+  return new_shape;
+}
+
+Status CpuLayoutAssignment::AddBackendConstraints(
+    LayoutConstraints* constraints) {
+  ShouldMakeOperandColMajorCache cache;
 
   const HloComputation* computation = constraints->computation();
   for (auto* instruction : computation->instructions()) {
@@ -91,9 +110,9 @@ Status CpuLayoutAssignment::AddBackendConstraints(
       //
       // These constraints are not hard constraints. Ideally, we should decide
       // which layouts to choose according to some cost model.
-      Shape output_shape(row_major_shape(convolution->shape()));
-      Shape input_shape(row_major_shape(lhs_instruction->shape()));
-      Shape filter_shape(row_major_shape(rhs_instruction->shape()));
+      Shape output_shape(RowMajorShape(convolution->shape()));
+      Shape input_shape(RowMajorShape(lhs_instruction->shape()));
+      Shape filter_shape(RowMajorShape(rhs_instruction->shape()));
 
       // Set layouts of the instructions' shapes.
       TF_RETURN_IF_ERROR(
@@ -102,11 +121,11 @@ Status CpuLayoutAssignment::AddBackendConstraints(
           constraints->SetOperandLayout(filter_shape, convolution, 1));
       TF_RETURN_IF_ERROR(
           constraints->SetInstructionLayout(output_shape, convolution));
-    } else if (should_make_rhs_col_major(*instruction)) {
-      auto* dot = instruction;
-      const auto& rhs_shape = dot->operand(1)->shape();
-      TF_RETURN_IF_ERROR(
-          constraints->SetOperandLayout(col_major_shape(rhs_shape), dot, 1));
+    } else if (optional<int64> op_idx =
+                   ShouldMakeOperandColumnMajor(&cache, *instruction)) {
+      const HloInstruction* op = instruction->operand(*op_idx);
+      TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
+          ColMajorShape(op->shape()), instruction, *op_idx));
     } else if (PotentiallyImplementedAsEigenDot(*instruction)) {
       const HloInstruction* dot = instruction;
       // In order to implement `dot` with Eigen dot, the layouts of the lhs,
@@ -114,17 +133,17 @@ Status CpuLayoutAssignment::AddBackendConstraints(
       //
       // These constraints are not hard constraints. Ideally, we should decide
       // which layouts to choose according to some cost model.
-      Shape output_shape(row_major_shape(dot->shape()));
+      Shape output_shape(RowMajorShape(dot->shape()));
 
       const HloInstruction* lhs_instruction = dot->operand(0);
-      Shape lhs_shape(row_major_shape(lhs_instruction->shape()));
+      Shape lhs_shape(RowMajorShape(lhs_instruction->shape()));
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(lhs_shape, dot, 0));
 
       // dot is a kDot or a kTransposeDot fusion node.  In the latter case, if
       // it represents X @ X, it may have just one operand.
       if (dot->operand_count() > 1) {
         const HloInstruction* rhs_instruction = dot->operand(1);
-        Shape rhs_shape(row_major_shape(rhs_instruction->shape()));
+        Shape rhs_shape(RowMajorShape(rhs_instruction->shape()));
         TF_RETURN_IF_ERROR(constraints->SetOperandLayout(rhs_shape, dot, 1));
       }
 
@@ -141,8 +160,12 @@ Status CpuLayoutAssignment::AddBackendConstraints(
         if (constraints->OperandBufferForwarded(instruction, operand_no)) {
           continue;
         }
+        // Skip operands with non-array shapes.
+        if (!ShapeUtil::IsArray(instruction->operand(operand_no)->shape())) {
+          continue;
+        }
         Shape operand_shape(
-            row_major_shape(instruction->operand(operand_no)->shape()));
+            RowMajorShape(instruction->operand(operand_no)->shape()));
         TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
             operand_shape, instruction, operand_no));
       }
diff --git a/tensorflow/compiler/xla/service/cpu/layout_assignment.h b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h
similarity index 86%
rename from tensorflow/compiler/xla/service/cpu/layout_assignment.h
rename to tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h
index 4fd8d68dd6b4f2a8b16f6c048743a996ea76a560..c8edbb9e15a5b6f9c574f5fe9d130d149499ebd2 100644
--- a/tensorflow/compiler/xla/service/cpu/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LAYOUT_ASSIGNMENT_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LAYOUT_ASSIGNMENT_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_LAYOUT_ASSIGNMENT_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_LAYOUT_ASSIGNMENT_H_
 
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/layout_assignment.h"
@@ -38,4 +38,4 @@ class CpuLayoutAssignment : public LayoutAssignment {
 }  // namespace cpu
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LAYOUT_ASSIGNMENT_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_LAYOUT_ASSIGNMENT_H_
diff --git a/tensorflow/compiler/xla/service/cpu/layout_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
similarity index 54%
rename from tensorflow/compiler/xla/service/cpu/layout_assignment_test.cc
rename to tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
index 1ea5e8c7fc4896512e62396d0a756cda44785f11..6ba030fff3bbc5f413bfb133114ceb5309b77672 100644
--- a/tensorflow/compiler/xla/service/cpu/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/cpu/layout_assignment.h"
+#include "tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h"
 
 #include <initializer_list>
 #include <memory>
@@ -40,6 +40,8 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 
+namespace op = xla::testing::opcode_matchers;
+
 namespace xla {
 namespace {
 
@@ -61,8 +63,8 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantRhsTensor) {
       HloInstruction::CreateParameter(0, lhs_shape, "param0"));
   auto dot_rhs = builder.AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateFromShape(rhs_shape)));
-  auto result = builder.AddInstruction(HloInstruction::CreateBinary(
-      result_shape, HloOpcode::kDot, dot_lhs, dot_rhs));
+  auto result = builder.AddInstruction(
+      HloInstruction::CreateCanonicalDot(result_shape, dot_lhs, dot_rhs));
 
   auto module = CreateNewModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
@@ -98,10 +100,10 @@ TEST_F(CpuLayoutAssignmentTest, MultipleDotsWithSameConstantRhsTensor0) {
       HloInstruction::CreateParameter(1, lhs_shape, "param1"));
   auto dot_rhs = builder.AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateFromShape(rhs_shape)));
-  auto dot_a_result = builder.AddInstruction(HloInstruction::CreateBinary(
-      result_shape, HloOpcode::kDot, dot_a_lhs, dot_rhs));
-  auto dot_b_result = builder.AddInstruction(HloInstruction::CreateBinary(
-      result_shape, HloOpcode::kDot, dot_b_lhs, dot_rhs));
+  auto dot_a_result = builder.AddInstruction(
+      HloInstruction::CreateCanonicalDot(result_shape, dot_a_lhs, dot_rhs));
+  auto dot_b_result = builder.AddInstruction(
+      HloInstruction::CreateCanonicalDot(result_shape, dot_b_lhs, dot_rhs));
   builder.AddInstruction(HloInstruction::CreateBinary(
       result_shape, HloOpcode::kAdd, dot_a_result, dot_b_result));
 
@@ -142,10 +144,10 @@ TEST_F(CpuLayoutAssignmentTest, MultipleDotsWithSameConstantRhsTensor1) {
       HloInstruction::CreateParameter(1, lhs_b_shape, "param1"));
   auto dot_rhs = builder.AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateFromShape(rhs_shape)));
-  auto dot_a_result = builder.AddInstruction(HloInstruction::CreateBinary(
-      result_a_shape, HloOpcode::kDot, dot_a_lhs, dot_rhs));
-  auto dot_b_result = builder.AddInstruction(HloInstruction::CreateBinary(
-      result_b_shape, HloOpcode::kDot, dot_b_lhs, dot_rhs));
+  auto dot_a_result = builder.AddInstruction(
+      HloInstruction::CreateCanonicalDot(result_a_shape, dot_a_lhs, dot_rhs));
+  auto dot_b_result = builder.AddInstruction(
+      HloInstruction::CreateCanonicalDot(result_b_shape, dot_b_lhs, dot_rhs));
   auto tuple_result = builder.AddInstruction(
       HloInstruction::CreateTuple({dot_a_result, dot_b_result}));
 
@@ -180,8 +182,8 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantLhsTensor) {
       HloInstruction::CreateConstant(Literal::CreateFromShape(lhs_shape)));
   auto dot_rhs = builder.AddInstruction(
       HloInstruction::CreateParameter(0, rhs_shape, "param0"));
-  auto dot_result = builder.AddInstruction(HloInstruction::CreateBinary(
-      result_shape, HloOpcode::kDot, dot_lhs, dot_rhs));
+  auto dot_result = builder.AddInstruction(
+      HloInstruction::CreateCanonicalDot(result_shape, dot_lhs, dot_rhs));
 
   auto module = CreateNewModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
@@ -220,8 +222,8 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantRhsTensorThroughGTE) {
       HloInstruction::CreateParameter(0, lhs_shape, "param0"));
   auto dot_rhs = builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(rhs_shape, constant, 1));
-  auto dot_result = builder.AddInstruction(HloInstruction::CreateBinary(
-      result_shape, HloOpcode::kDot, dot_lhs, dot_rhs));
+  auto dot_result = builder.AddInstruction(
+      HloInstruction::CreateCanonicalDot(result_shape, dot_lhs, dot_rhs));
 
   auto module = CreateNewModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
@@ -241,5 +243,172 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantRhsTensorThroughGTE) {
     EXPECT_NE(instruction->opcode(), HloOpcode::kCopy);
   }
 }
+
+struct DotOutputFusionLayoutAssignmentResult {
+  bool layout_assignment_changed_something;
+  const HloInstruction* dot_lhs_fusion_param;
+  const HloInstruction* dot_rhs_fusion_param;
+  const HloInstruction* addend_fusion_param;
+};
+
+static StatusOr<DotOutputFusionLayoutAssignmentResult> RunDotOutputFusion(
+    HloModule* module, const string& test_name, int m, int k, int n,
+    const int64 dot_operand_idx_in_add) {
+  DotOutputFusionLayoutAssignmentResult result;
+
+  CHECK(dot_operand_idx_in_add == 0 || dot_operand_idx_in_add == 1);
+
+  auto builder = HloComputation::Builder(test_name);
+
+  Shape dot_lhs_shape = ShapeUtil::MakeShape(F32, {m, k});
+  Shape dot_rhs_shape = ShapeUtil::MakeShape(F32, {k, n});
+  Shape dot_shape = ShapeUtil::MakeShape(F32, {m, n});
+
+  HloInstruction* dot_lhs = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, dot_lhs_shape, "param0"));
+  HloInstruction* addend = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, dot_shape, "param1"));
+  HloInstruction* dot_rhs = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateFromShape(dot_rhs_shape)));
+  HloInstruction* dot_result = builder.AddInstruction(
+      HloInstruction::CreateCanonicalDot(dot_shape, dot_lhs, dot_rhs));
+  HloInstruction* add_result;
+  if (dot_operand_idx_in_add == 0) {
+    add_result = builder.AddInstruction(HloInstruction::CreateBinary(
+        dot_shape, HloOpcode::kAdd, dot_result, addend));
+  } else {
+    add_result = builder.AddInstruction(HloInstruction::CreateBinary(
+        dot_shape, HloOpcode::kAdd, addend, dot_result));
+  }
+
+  HloComputation* computation = module->AddEntryComputation(builder.Build());
+
+  HloInstruction* fusion_instruction =
+      module->entry_computation()->AddInstruction(HloInstruction::CreateFusion(
+          dot_shape, HloInstruction::FusionKind::kOutput, add_result));
+  TF_RETURN_IF_ERROR(
+      computation->ReplaceInstruction(add_result, fusion_instruction));
+
+  HloInstruction* fused_add =
+      fusion_instruction->fused_instructions_computation()->root_instruction();
+  HloInstruction* fused_dot = fusion_instruction->FuseInstruction(dot_result);
+
+  TF_RETURN_IF_ERROR(
+      computation->RemoveInstructionAndUnusedOperands(dot_result));
+
+  ComputationLayout computation_layout(computation->ComputeProgramShape());
+  *computation_layout.mutable_parameter_layout(0) =
+      ShapeLayout(LayoutUtil::GetWithDefaultLayout(dot_lhs_shape));
+  *computation_layout.mutable_parameter_layout(1) =
+      ShapeLayout(LayoutUtil::GetWithDefaultLayout(dot_shape));
+  *computation_layout.mutable_result_layout() =
+      ShapeLayout(LayoutUtil::GetWithDefaultLayout(dot_shape));
+
+  result.dot_lhs_fusion_param =
+      fusion_instruction->operand(fused_dot->operand(0)->parameter_number());
+  result.dot_rhs_fusion_param =
+      fusion_instruction->operand(fused_dot->operand(1)->parameter_number());
+  result.addend_fusion_param = fusion_instruction->operand(
+      fused_add->operand(1 - dot_operand_idx_in_add)->parameter_number());
+
+  cpu::CpuLayoutAssignment layout_assignment(&computation_layout);
+  TF_ASSIGN_OR_RETURN(result.layout_assignment_changed_something,
+                      layout_assignment.Run(module));
+
+  return result;
+}
+
+static void AssertCorrectLayoutForDotOutputFusion(
+    const HloComputation* computation,
+    const DotOutputFusionLayoutAssignmentResult& layout_assignment_result,
+    bool expect_col_major_dot_rhs) {
+  Layout expected_dot_rhs_layout = expect_col_major_dot_rhs
+                                       ? LayoutUtil::MakeLayout({0, 1})
+                                       : LayoutUtil::MakeLayout({1, 0});
+  EXPECT_TRUE(LayoutUtil::Equal(
+      expected_dot_rhs_layout,
+      layout_assignment_result.dot_rhs_fusion_param->shape().layout()));
+
+  EXPECT_TRUE(LayoutUtil::Equal(
+      LayoutUtil::MakeLayout({1, 0}),
+      layout_assignment_result.dot_lhs_fusion_param->shape().layout()));
+
+  EXPECT_TRUE(LayoutUtil::Equal(
+      LayoutUtil::MakeLayout({1, 0}),
+      layout_assignment_result.addend_fusion_param->shape().layout()));
+  EXPECT_THAT(computation->instructions(), Each(Not(op::Copy())));
+}
+
+TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_0) {
+  std::unique_ptr<HloModule> module = CreateNewModule();
+  TF_ASSERT_OK_AND_ASSIGN(
+      DotOutputFusionLayoutAssignmentResult layout_assignment_result,
+      RunDotOutputFusion(module.get(), TestName(), /*m=*/1, /*k=*/50, /*n=*/19,
+                         /*dot_operand_idx_in_add=*/0));
+  ASSERT_TRUE(layout_assignment_result.layout_assignment_changed_something);
+  AssertCorrectLayoutForDotOutputFusion(module->entry_computation(),
+                                        layout_assignment_result,
+                                        /*expect_col_major_dot_rhs=*/true);
+}
+
+TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_1) {
+  std::unique_ptr<HloModule> module = CreateNewModule();
+  TF_ASSERT_OK_AND_ASSIGN(
+      DotOutputFusionLayoutAssignmentResult layout_assignment_result,
+      RunDotOutputFusion(module.get(), TestName(), /*m=*/1, /*k=*/50, /*n=*/19,
+                         /*dot_operand_idx_in_add=*/1));
+  ASSERT_TRUE(layout_assignment_result.layout_assignment_changed_something);
+  AssertCorrectLayoutForDotOutputFusion(module->entry_computation(),
+                                        layout_assignment_result,
+                                        /*expect_col_major_dot_rhs=*/true);
+}
+
+TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_0) {
+  std::unique_ptr<HloModule> module = CreateNewModule();
+  TF_ASSERT_OK_AND_ASSIGN(
+      DotOutputFusionLayoutAssignmentResult layout_assignment_result,
+      RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/1,
+                         /*dot_operand_idx_in_add=*/0));
+  ASSERT_TRUE(layout_assignment_result.layout_assignment_changed_something);
+  AssertCorrectLayoutForDotOutputFusion(module->entry_computation(),
+                                        layout_assignment_result,
+                                        /*expect_col_major_dot_rhs=*/false);
+}
+
+TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_1) {
+  std::unique_ptr<HloModule> module = CreateNewModule();
+  TF_ASSERT_OK_AND_ASSIGN(
+      DotOutputFusionLayoutAssignmentResult layout_assignment_result,
+      RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/1,
+                         /*dot_operand_idx_in_add=*/1));
+  ASSERT_TRUE(layout_assignment_result.layout_assignment_changed_something);
+  AssertCorrectLayoutForDotOutputFusion(module->entry_computation(),
+                                        layout_assignment_result,
+                                        /*expect_col_major_dot_rhs=*/false);
+}
+
+TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x19_dot_idx_0) {
+  std::unique_ptr<HloModule> module = CreateNewModule();
+  TF_ASSERT_OK_AND_ASSIGN(
+      DotOutputFusionLayoutAssignmentResult layout_assignment_result,
+      RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/19,
+                         /*dot_operand_idx_in_add=*/0));
+  ASSERT_TRUE(layout_assignment_result.layout_assignment_changed_something);
+  AssertCorrectLayoutForDotOutputFusion(module->entry_computation(),
+                                        layout_assignment_result,
+                                        /*expect_col_major_dot_rhs=*/false);
+}
+
+TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x19_dot_idx_1) {
+  std::unique_ptr<HloModule> module = CreateNewModule();
+  TF_ASSERT_OK_AND_ASSIGN(
+      DotOutputFusionLayoutAssignmentResult layout_assignment_result,
+      RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/19,
+                         /*dot_operand_idx_in_add=*/1));
+  ASSERT_TRUE(layout_assignment_result.layout_assignment_changed_something);
+  AssertCorrectLayoutForDotOutputFusion(module->entry_computation(),
+                                        layout_assignment_result,
+                                        /*expect_col_major_dot_rhs=*/false);
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
index 7908dc173d79a4a9dcb6127ac344267e27d2b5f2..1ef45dbec39a0880ebb123ba3fcd1fd6c89eb39a 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
@@ -37,6 +37,7 @@ extern const char* const kEigenMatMulF64SymbolName =
     "__xla_cpu_runtime_EigenMatMulF64";
 extern const char* const kEigenConvF32SymbolName =
     "__xla_cpu_runtime_EigenConvF32";
+extern const char* const kEigenFftSymbolName = "__xla_cpu_runtime_EigenFft";
 extern const char* const kEigenSingleThreadedMatMulF32SymbolName =
     "__xla_cpu_runtime_EigenSingleThreadedMatMulF32";
 extern const char* const kEigenSingleThreadedMatMulF64SymbolName =
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
index 2ade455b8a0a43dda8c93bbb79891439da2e4f75..3e1f08071119c938619d02777513e5b834077118 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
@@ -44,6 +44,7 @@ namespace runtime {
 extern const char* const kEigenMatMulF32SymbolName;
 extern const char* const kEigenMatMulF64SymbolName;
 extern const char* const kEigenConvF32SymbolName;
+extern const char* const kEigenFftSymbolName;
 extern const char* const kEigenSingleThreadedMatMulF32SymbolName;
 extern const char* const kEigenSingleThreadedMatMulF64SymbolName;
 extern const char* const kEigenSingleThreadedConvF32SymbolName;
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h
deleted file mode 100644
index acfada8540d89bb098bb0b04e109441e2123e678..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// This header declares functions which may be called by the generated code on
-// the CPU. Calls to these functions must be resolved explicitly in the JIT in
-// xla::cpu::SimpleResolver.  It also defines a per-CpuExecutable context
-// which is used to cache expensive state and resources utilized by the
-// aforementioned functions.
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_AVX_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_AVX_H_
-
-#include "tensorflow/core/platform/macros.h"
-
-namespace xla {
-namespace cpu {
-namespace runtime {
-
-extern const char *const kExpV8F32AVXSymbolName;
-extern const char *const kLogV8F32AVXSymbolName;
-
-typedef float V8F32AVX __attribute__((__vector_size__(32)));
-}  // namespace runtime
-}  // namespace cpu
-}  // namespace xla
-
-extern "C" {
-
-// The following functions are vectorized versions of a selection of libm
-// library functions.
-// References to these functions are created by the LLVM vectorizer.
-xla::cpu::runtime::V8F32AVX __xla_cpu_runtime_ExpV8F32AVX(
-    xla::cpu::runtime::V8F32AVX x) TF_ATTRIBUTE_WEAK;
-
-xla::cpu::runtime::V8F32AVX __xla_cpu_runtime_LogV8F32AVX(
-    xla::cpu::runtime::V8F32AVX x) TF_ATTRIBUTE_WEAK;
-}
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_AVX_H_
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.cc
deleted file mode 100644
index abe792b2787ce8baf56ee62585a0ab886d922a23..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h"
-
-#define EIGEN_USE_THREADS
-
-#include "third_party/eigen3/Eigen/Core"
-
-#ifdef __ARM_NEON__
-
-xla::cpu::runtime::V4F32NEON __xla_cpu_runtime_ExpV4F32NEON(
-    xla::cpu::runtime::V4F32NEON x) {
-  return Eigen::internal::pexp(x);
-}
-
-xla::cpu::runtime::V4F32NEON __xla_cpu_runtime_LogV4F32NEON(
-    xla::cpu::runtime::V4F32NEON x) {
-  Eigen::internal::Packet4f p = x;
-  return Eigen::internal::plog(p);
-}
-
-#endif  // __ARM_NEON__
-
-namespace xla {
-namespace cpu {
-namespace runtime {
-
-const char *const kExpV4F32NEONSymbolName = "__xla_cpu_runtime_ExpV4F32NEON";
-const char *const kLogV4F32NEONSymbolName = "__xla_cpu_runtime_LogV4F32NEON";
-
-}  // namespace runtime
-}  // namespace cpu
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h
deleted file mode 100644
index 75cb16b273973d2bf665d378084343fd612a2941..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_NEON_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_NEON_H_
-
-// This header declares functions which may be called by the generated code on
-// the CPU. Calls to these functions must be resolved explicitly in the JIT in
-// xla::cpu::SimpleResolver.
-
-#include "tensorflow/core/platform/macros.h"
-
-#ifdef __ARM_NEON__
-// For the other runtimes (AVX, SSE4.1) we define the vector type directly using
-// __attribute__((__vector_size__(*))).  Unfortunately, the typedef for the ARM
-// NEON SIMD types is not portable, so the type has to come from <arm_neon.h>
-#include <arm_neon.h>
-#endif  // __ARM_NEON__
-
-namespace xla {
-namespace cpu {
-namespace runtime {
-
-extern const char *const kExpV4F32NEONSymbolName;
-extern const char *const kLogV4F32NEONSymbolName;
-
-#ifdef __ARM_NEON__
-typedef float32x4_t V4F32NEON;
-#else
-// On non-ARM platforms ensure the declaration is present
-struct V4F32NEON;
-#endif  // __ARM_NEON__
-
-}  // namespace runtime
-}  // namespace cpu
-}  // namespace xla
-
-extern "C" {
-
-// The following functions are vectorized versions of a selection of libm
-// library functions.
-// References to these functions are created by the LLVM vectorizer.
-xla::cpu::runtime::V4F32NEON __xla_cpu_runtime_ExpV4F32NEON(
-    xla::cpu::runtime::V4F32NEON x) TF_ATTRIBUTE_WEAK;
-
-xla::cpu::runtime::V4F32NEON __xla_cpu_runtime_LogV4F32NEON(
-    xla::cpu::runtime::V4F32NEON x) TF_ATTRIBUTE_WEAK;
-}
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_NEON_H_
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.cc
deleted file mode 100644
index a9a45db5a424d2faecbd437542c41fbd7fdf0bb8..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h"
-
-#define EIGEN_USE_THREADS
-
-#include "third_party/eigen3/Eigen/Core"
-
-#ifdef __SSE4_1__
-
-xla::cpu::runtime::V4F32SSE __xla_cpu_runtime_ExpV4F32SSE(
-    xla::cpu::runtime::V4F32SSE x) {
-  Eigen::internal::Packet4f p = x;
-  return Eigen::internal::pexp(p);
-}
-
-xla::cpu::runtime::V4F32SSE __xla_cpu_runtime_LogV4F32SSE(
-    xla::cpu::runtime::V4F32SSE x) {
-  Eigen::internal::Packet4f p = x;
-  return Eigen::internal::plog(p);
-}
-
-#endif  // __SSE4_1__
-
-namespace xla {
-namespace cpu {
-namespace runtime {
-
-const char *const kExpV4F32SSESymbolName = "__xla_cpu_runtime_ExpV4F32SSE";
-const char *const kLogV4F32SSESymbolName = "__xla_cpu_runtime_LogV4F32SSE";
-
-}  // namespace runtime
-}  // namespace cpu
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h
deleted file mode 100644
index 96587d10d2b86e14ff6a7400fdf14ca0d994ddc5..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// This header declares functions which may be called by the generated code on
-// the CPU. Calls to these functions must be resolved explicitly in the JIT in
-// xla::cpu::SimpleResolver.  It also defines a per-CpuExecutable context
-// which is used to cache expensive state and resources utilized by the
-// aforementioned functions.
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_SSE4_1_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_SSE4_1_H_
-
-#include "tensorflow/core/platform/macros.h"
-
-namespace xla {
-namespace cpu {
-namespace runtime {
-
-extern const char *const kExpV4F32SSESymbolName;
-extern const char *const kLogV4F32SSESymbolName;
-
-typedef float V4F32SSE __attribute__((__vector_size__(16)));
-
-}  // namespace runtime
-}  // namespace cpu
-}  // namespace xla
-
-extern "C" {
-
-// The following functions are vectorized versions of a selection of libm
-// library functions.
-// References to these functions are created by the LLVM vectorizer.
-xla::cpu::runtime::V4F32SSE __xla_cpu_runtime_ExpV4F32SSE(
-    xla::cpu::runtime::V4F32SSE x) TF_ATTRIBUTE_WEAK;
-
-xla::cpu::runtime::V4F32SSE __xla_cpu_runtime_LogV4F32SSE(
-    xla::cpu::runtime::V4F32SSE x) TF_ATTRIBUTE_WEAK;
-}
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_SSE4_1_H_
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
index b53719fcc260d706eab3d7460c42af4a1b5e775f..f5e61aef534da57ce13d3ee9bbeaeaec31f53d2e 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
@@ -98,7 +98,7 @@ Status CpuTransferManager::TransferLiteralToInfeed(se::StreamExecutor* executor,
 
   if (!ShapeUtil::IsTuple(shape)) {
     int64 size = GetByteSizeRequirement(shape);
-    return TransferBufferToInfeed(executor, size, literal.InternalData());
+    return TransferBufferToInfeed(executor, size, literal.untyped_data());
   }
 
   if (ShapeUtil::IsNestedTuple(shape)) {
@@ -111,20 +111,20 @@ Status CpuTransferManager::TransferLiteralToInfeed(se::StreamExecutor* executor,
   // enqueue the resulting destination device addresses with the
   // infeed manager.
   std::vector<cpu::runtime::XfeedBuffer*> buffers;
-  buffers.reserve(literal.tuple_literals_size());
+  buffers.reserve(ShapeUtil::TupleElementCount(shape));
   auto cleanup = tensorflow::gtl::MakeCleanup([&buffers]() {
     for (cpu::runtime::XfeedBuffer* b : buffers) {
       b->Done(Cancelled("Failed to infeed buffer to device."));
     }
   });
 
-  for (const auto& tuple_element : literal.tuple_literals()) {
-    const Shape& tuple_element_shape = tuple_element.shape();
+  for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
+    const Shape& tuple_element_shape = ShapeUtil::GetSubshape(shape, {i});
     int64 tuple_element_size = GetByteSizeRequirement(tuple_element_shape);
     TF_ASSIGN_OR_RETURN(
         cpu::runtime::XfeedBuffer * buffer,
         TransferBufferToInfeedInternal(executor, tuple_element_size,
-                                       tuple_element.InternalData()));
+                                       literal.untyped_data({i})));
     buffers.push_back(buffer);
   }
 
@@ -187,14 +187,14 @@ Status CpuTransferManager::TransferLiteralFromOutfeed(
         literal_shape.element_type(), dimensions));
     TF_ASSIGN_OR_RETURN(Shape received_shape,
                         TransferArrayBufferFromOutfeed(
-                            executor, literal->MutableInternalData(), size));
+                            executor, literal->untyped_data(), size));
     TF_RET_CHECK(ShapeUtil::Compatible(received_shape, literal->shape()))
         << "Shape received from outfeed "
         << ShapeUtil::HumanString(received_shape)
         << " did not match the shape that was requested for outfeed: "
         << ShapeUtil::HumanString(literal_shape);
     TF_RET_CHECK(size == GetByteSizeRequirement(received_shape));
-    *literal->mutable_shape() = received_shape;
+    *literal->mutable_shape_do_not_use() = received_shape;
     return Status::OK();
   }
 
@@ -217,7 +217,7 @@ Status CpuTransferManager::TransferLiteralFromOutfeed(
     auto empty = Literal::CreateFromDimensions(
         tuple_element_shape.element_type(), dimensions);
     int64 size = GetByteSizeRequirement(tuple_element_shape);
-    buffer_data.push_back({empty->MutableInternalData(), size});
+    buffer_data.push_back({empty->untyped_data(), size});
     elements.push_back(std::move(empty));
   }
 
@@ -233,7 +233,7 @@ Status CpuTransferManager::TransferLiteralFromOutfeed(
                GetByteSizeRequirement(received_shape));
 
   for (int64 i = 0; i < literal_shape.tuple_shapes_size(); ++i) {
-    *elements[i]->mutable_shape() = received_shape.tuple_shapes(i);
+    *elements[i]->mutable_shape_do_not_use() = received_shape.tuple_shapes(i);
   }
   *literal = std::move(*Literal::MakeTupleOwned(std::move(elements)));
   TF_RET_CHECK(ShapeUtil::Equal(literal->shape(), literal_shape));
diff --git a/tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h b/tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h
index 2994642356d55df26c31553ef28dc653503d05be..664125ecc95ca5ac10be4201b9120ddbdb9b9821 100644
--- a/tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h
+++ b/tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CUSTOM_CALL_TARGET_REGISTRY_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CUSTOM_CALL_TARGET_REGISTRY_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CUSTOM_CALL_TARGET_REGISTRY_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CUSTOM_CALL_TARGET_REGISTRY_H_
 
 // This file is depended on by kernels that have to build for mobile devices.
 // For this reason, we avoid relying on TensorFlow and instead only use the
@@ -71,4 +71,4 @@ class RegisterCustomCallTarget {
 }  // namespace cpu
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CUSTOM_CALL_TARGET_REGISTRY_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CUSTOM_CALL_TARGET_REGISTRY_H_
diff --git a/tensorflow/compiler/xla/service/cpu/disassembler.h b/tensorflow/compiler/xla/service/cpu/disassembler.h
index b6feaa7e45cee26eb7f850081bd1fad2cb63b15c..5e302f88990ee4a3c37758881ecec4d6f71dd8e6 100644
--- a/tensorflow/compiler/xla/service/cpu/disassembler.h
+++ b/tensorflow/compiler/xla/service/cpu/disassembler.h
@@ -37,7 +37,7 @@ struct DisassemblerResult {
   DisassemblerResult(const string& text, size_t code_size_bytes)
       : text(text), code_size_bytes(code_size_bytes) {}
 
-  // The dissassembled text sections of the object file.
+  // The disassembled text sections of the object file.
   string text;
   // The total number of bytes of executable code in the object file.
   uint64_t code_size_bytes;
@@ -53,7 +53,7 @@ class Disassembler {
   // Returns a DisassemblerResult for the given object file, containing the
   // disassembled code.
   //
-  // If we couldnt' retrieve a disassembler for this platform, an error status
+  // If we couldn't retrieve a disassembler for this platform, an error status
   // is returned.
   StatusOr<DisassemblerResult> DisassembleObjectFile(
       const llvm::object::ObjectFile& object_file) const;
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 4c40dae5122b0853a72d6428fc120220e3a69237..c9fc586b9a4c06eb9e1f111d8f9bd2f717990aab 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -23,10 +23,11 @@ limitations under the License.
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Value.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
+#include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
+#include "tensorflow/compiler/xla/service/cpu/vector_support_library.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
-#include "tensorflow/compiler/xla/service/llvm_ir/vector_support_library.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -143,7 +144,8 @@ class ColumnMajorMatrixVectorProductEmitter {
   ColumnMajorMatrixVectorProductEmitter(PrimitiveType scalar_type,
                                         int64 tile_rows, int64 tile_cols,
                                         int64 m, int64 k, llvm::Value* lhs,
-                                        llvm::Value* rhs, llvm::Value* result,
+                                        llvm::Value* rhs, llvm::Value* addend,
+                                        llvm::Value* result,
                                         llvm::IRBuilder<>* ir_builder)
       : scalar_type_(scalar_type),
         tile_rows_(tile_rows),
@@ -152,6 +154,7 @@ class ColumnMajorMatrixVectorProductEmitter {
         k_(k),
         lhs_(lhs),
         rhs_(rhs),
+        addend_(addend),
         result_(result),
         ir_builder_(ir_builder),
         ksl_(ir_builder_),
@@ -173,7 +176,7 @@ class ColumnMajorMatrixVectorProductEmitter {
   }
 
   // Load a tile of values from the RHS.  For the RHS a "tile" is a contiguous
-  // sequnce of `count` values, each one broadcasted to the vector width.
+  // sequence of `count` values, each one broadcasted to the vector width.
   std::vector<llvm::Value*> LoadRhsTile(llvm::Value* offset, int64 count) {
     llvm::Value* base_pointer = vsl_.ComputeOffsetPointer(rhs_, offset);
     std::vector<llvm::Value*> result;
@@ -198,6 +201,7 @@ class ColumnMajorMatrixVectorProductEmitter {
   int64 k_;
   llvm::Value* lhs_;
   llvm::Value* rhs_;
+  llvm::Value* addend_;
   llvm::Value* result_;
   llvm::IRBuilder<>* ir_builder_;
   KernelSupportLibrary ksl_;
@@ -242,9 +246,10 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopTiled(
            /*step=*/tile_rows_, [&](llvm::Value* row) {
              std::vector<llvm::Value*> lhs_tile =
                  lhs_tile_loader->LoadTile(/*minor_dim_offset=*/row);
-             llvm::Value* accumulator = is_first_column
-                                            ? vsl_.GetZeroVector()
-                                            : vsl_.LoadVector(result_, row);
+             llvm::Value* accumulator =
+                 is_first_column ? (addend_ ? vsl_.LoadVector(addend_, row)
+                                            : vsl_.GetZeroVector())
+                                 : vsl_.LoadVector(result_, row);
              for (int i = 0; i < columns; i++) {
                accumulator = vsl_.MulAdd(lhs_tile[i], rhs_tile[i], accumulator);
              }
@@ -288,7 +293,18 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
                   ir_builder_->getInt1(is_first_tiled_column));
               ksl_.If(
                   setting_result_first_time,
-                  [&]() { vsl_.StoreScalar(product, result_, scalar_row); },
+                  /*true_block_generator=*/
+                  [&]() {
+                    if (addend_) {
+                      vsl_.StoreScalar(
+                          vsl_.Add(vsl_.LoadScalar(addend_, scalar_row),
+                                   product),
+                          result_, scalar_row);
+                    } else {
+                      vsl_.StoreScalar(product, result_, scalar_row);
+                    }
+                  },
+                  /*false_block_generator=*/
                   [&]() {
                     vsl_.StoreScalar(
                         vsl_.Add(vsl_.LoadScalar(result_, scalar_row), product),
@@ -353,7 +369,7 @@ class RowMajorMatrixVectorProductEmitter {
   RowMajorMatrixVectorProductEmitter(PrimitiveType scalar_type, int64 tile_rows,
                                      int64 tile_cols, int64 m, int64 k,
                                      llvm::Value* lhs, llvm::Value* rhs,
-                                     llvm::Value* result,
+                                     llvm::Value* addend, llvm::Value* result,
                                      llvm::IRBuilder<>* ir_builder)
       : scalar_type_(scalar_type),
         tile_rows_(tile_rows),
@@ -362,6 +378,7 @@ class RowMajorMatrixVectorProductEmitter {
         k_(k),
         lhs_(lhs),
         rhs_(rhs),
+        addend_(addend),
         result_(result),
         ir_builder_(ir_builder),
         ksl_(ir_builder_),
@@ -394,6 +411,7 @@ class RowMajorMatrixVectorProductEmitter {
   int64 k_;
   llvm::Value* lhs_;
   llvm::Value* rhs_;
+  llvm::Value* addend_;
   llvm::Value* result_;
   llvm::IRBuilder<>* ir_builder_;
   KernelSupportLibrary ksl_;
@@ -415,11 +433,32 @@ void RowMajorMatrixVectorProductEmitter::EmitOuterLoopBody(llvm::Value* row,
   EmitInnerLoopEpilogue(/*current_tile_row=*/row, /*rows=*/row_count,
                         &scalar_accumulators);
 
+  std::vector<llvm::Value*> accumulator_values;
+  std::transform(
+      vector_accumulators.begin(), vector_accumulators.end(),
+      std::back_inserter(accumulator_values),
+      [](const VectorVariable& vector_var) { return vector_var.Get(); });
+
+  std::vector<llvm::Value*> horizontal_sums;
+  if (row_count == vsl_.vector_size()) {
+    if (addend_) {
+      horizontal_sums = vsl_.ComputeHorizontalSums(
+          std::move(accumulator_values), vsl_.LoadVector(addend_, row));
+    } else {
+      horizontal_sums =
+          vsl_.ComputeHorizontalSums(std::move(accumulator_values));
+    }
+  } else {
+    horizontal_sums = vsl_.ComputeHorizontalSums(std::move(accumulator_values));
+  }
+
   for (int i = 0; i < row_count; i++) {
     llvm::Value* result_value =
-        vsl_.Add(vsl_.AddReduce(vector_accumulators[i].Get()),
-                 scalar_accumulators[i].Get());
+        vsl_.Add(horizontal_sums[i], scalar_accumulators[i].Get());
     llvm::Value* offset = ir_builder_->CreateAdd(ir_builder_->getInt64(i), row);
+    if (addend_ && row_count != vsl_.vector_size()) {
+      result_value = vsl_.Add(vsl_.LoadScalar(addend_, offset), result_value);
+    }
     vsl_.StoreScalar(result_value, result_, offset);
   }
 }
@@ -483,49 +522,52 @@ void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
 
 }  // namespace
 
-DotOpEmitter::DotOpEmitter(const HloInstruction& dot, bool transpose_lhs,
-                           bool transpose_rhs,
-                           const llvm_ir::IrArray& target_array,
-                           const llvm_ir::IrArray& lhs_array,
-                           const llvm_ir::IrArray& rhs_array,
-                           llvm::Value* executable_run_options_value,
-                           llvm::IRBuilder<>* ir_builder,
-                           const HloModuleConfig& hlo_module_config)
+DotOpEmitter::DotOpEmitter(
+    const HloInstruction& dot, bool transpose_lhs, bool transpose_rhs,
+    const llvm_ir::IrArray& target_array, const llvm_ir::IrArray& lhs_array,
+    const llvm_ir::IrArray& rhs_array, const llvm_ir::IrArray* addend_array,
+    llvm::Value* executable_run_options_value, llvm::IRBuilder<>* ir_builder,
+    const HloModuleConfig& hlo_module_config,
+    const TargetMachineFeatures& target_machine_features)
     : dot_(dot),
       transpose_lhs_(transpose_lhs),
       transpose_rhs_(transpose_rhs),
       target_array_(target_array),
       lhs_array_(lhs_array),
       rhs_array_(rhs_array),
+      addend_array_(addend_array),
       executable_run_options_value_(executable_run_options_value),
       ir_builder_(ir_builder),
-      hlo_module_config_(hlo_module_config) {}
+      hlo_module_config_(hlo_module_config),
+      target_machine_features_(target_machine_features) {}
 
 /* static */ tensorflow::Status DotOpEmitter::EmitDotOperation(
     const HloInstruction& dot, bool transpose_lhs, bool transpose_rhs,
     const llvm_ir::IrArray& target_array, const llvm_ir::IrArray& lhs_array,
-    const llvm_ir::IrArray& rhs_array,
+    const llvm_ir::IrArray& rhs_array, const llvm_ir::IrArray* addend_array,
     llvm::Value* executable_run_options_value, llvm::IRBuilder<>* ir_builder,
-    const HloModuleConfig& hlo_module_config) {
+    const HloModuleConfig& hlo_module_config,
+    const TargetMachineFeatures& target_machine_features) {
   PrimitiveType type = target_array.GetShape().element_type();
   TF_RET_CHECK(F32 == type || F64 == type || C64 == type);
   DotOpEmitter dot_emitter(dot, transpose_lhs, transpose_rhs, target_array,
-                           lhs_array, rhs_array, executable_run_options_value,
-                           ir_builder, hlo_module_config);
+                           lhs_array, rhs_array, addend_array,
+                           executable_run_options_value, ir_builder,
+                           hlo_module_config, target_machine_features);
   return dot_emitter.Emit();
 }
 
 bool DotOpEmitter::ShapesAreLegalForRuntimeDot() const { return true; }
 
 bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
-  if (dot_.shape().dimensions_size() != 2 ||
-      ProfitableToImplementDotInUntiledLlvmIr(dot_) ==
-          DotInLlvmIrProfitable::kYes) {
+  if (dot_.shape().dimensions_size() != 2) {
     return false;
   }
 
-  if (!primitive_util::IsFloatingPointType(dot_.shape().element_type()) &&
-      !primitive_util::IsIntegralType(dot_.shape().element_type())) {
+  PrimitiveType primitive_type = dot_.shape().element_type();
+
+  if (!primitive_util::IsFloatingPointType(primitive_type) &&
+      !primitive_util::IsIntegralType(primitive_type)) {
     return false;
   }
 
@@ -575,30 +617,76 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
   int64 tiling_factor = GetGemvTilingFactor();
   CHECK_GT(tiling_factor, 0);
 
+  llvm::Value* result_op = target_array_.GetBasePointer();
+  llvm::Value* lhs_op =
+      swap_operands ? rhs_array_.GetBasePointer() : lhs_array_.GetBasePointer();
+  llvm::Value* rhs_op =
+      swap_operands ? lhs_array_.GetBasePointer() : rhs_array_.GetBasePointer();
+
+  const bool enable_fast_math =
+      hlo_module_config_.debug_options().xla_enable_fast_math();
+  const bool optimize_for_size =
+      options::OptimizeForSizeRequested(hlo_module_config_);
+
+  const int target_vector_register_element_size =
+      target_machine_features_.vector_register_num_elements(
+          *ir_builder_->GetInsertBlock()->getParent(), primitive_type);
+
+  // We may not always know the vector register size for the target we're
+  // compiling against, in which case target_vector_register_element_size is 0.
+  // In these cases we choose a default LLVM IR register size.
+  const int kUnknownTargetVectorRegisterSize = 4;
+  const int vector_register_element_size =
+      target_vector_register_element_size == 0
+          ? kUnknownTargetVectorRegisterSize
+          : target_vector_register_element_size;
+
   if (is_column_major_matrix_vector) {
     VLOG(2) << "Emitting column major matrix-vector multiply with m = " << m
             << " and k = " << k;
-    ColumnMajorMatrixVectorProductEmitter emitter(
-        dot_.shape().element_type(), /*tile_rows=*/8,
-        /*tile_cols=*/tiling_factor, m, k,
-        swap_operands ? rhs_array_.GetBasePointer()
-                      : lhs_array_.GetBasePointer(),
-        swap_operands ? lhs_array_.GetBasePointer()
-                      : rhs_array_.GetBasePointer(),
-        target_array_.GetBasePointer(), ir_builder_);
-    emitter.Emit();
+    int64 tile_rows = vector_register_element_size;
+    int64 tile_cols = tiling_factor;
+
+    string kernel_name = tensorflow::strings::StrCat(
+        "col_major_gemv_", PrimitiveType_Name(primitive_type), "_", tile_rows,
+        "_", tile_cols, "_", m, "_", k, addend_array_ ? "_with_addend" : "");
+
+    KernelSupportLibrary::EmitAndCallOutlinedKernel(
+        /*enable_fast_math=*/enable_fast_math,
+        /*optimize_for_size=*/optimize_for_size, ir_builder_, kernel_name,
+        lhs_op, rhs_op,
+        addend_array_ ? addend_array_->GetBasePointer() : nullptr, result_op,
+        [this, tile_rows, tile_cols, m, k, primitive_type](
+            llvm::Value* lhs_op, llvm::Value* rhs_op, llvm::Value* addend_op,
+            llvm::Value* result_op) {
+          ColumnMajorMatrixVectorProductEmitter emitter(
+              primitive_type, tile_rows, tile_cols, m, k, lhs_op, rhs_op,
+              addend_op, result_op, ir_builder_);
+          emitter.Emit();
+        });
   } else {
     VLOG(2) << "Emitting row major matrix-vector multiply with m = " << m
             << " and k = " << k;
-    RowMajorMatrixVectorProductEmitter emitter(
-        dot_.shape().element_type(), /*tile_rows=*/tiling_factor,
-        /*tile_cols=*/8, m, k,
-        swap_operands ? rhs_array_.GetBasePointer()
-                      : lhs_array_.GetBasePointer(),
-        swap_operands ? lhs_array_.GetBasePointer()
-                      : rhs_array_.GetBasePointer(),
-        target_array_.GetBasePointer(), ir_builder_);
-    emitter.Emit();
+    int64 tile_rows = tiling_factor;
+    int64 tile_cols = vector_register_element_size;
+
+    string kernel_name = tensorflow::strings::StrCat(
+        "row_major_gemv_", PrimitiveType_Name(primitive_type), "_", tile_rows,
+        "_", tile_cols, "_", m, "_", k, addend_array_ ? "_with_addend" : "");
+
+    KernelSupportLibrary::EmitAndCallOutlinedKernel(
+        /*enable_fast_math=*/enable_fast_math,
+        /*optimize_for_size=*/optimize_for_size, ir_builder_, kernel_name,
+        lhs_op, rhs_op,
+        addend_array_ ? addend_array_->GetBasePointer() : nullptr, result_op,
+        [this, tile_rows, tile_cols, m, k, primitive_type](
+            llvm::Value* lhs_op, llvm::Value* rhs_op, llvm::Value* addend_op,
+            llvm::Value* result_op) {
+          RowMajorMatrixVectorProductEmitter emitter(
+              primitive_type, tile_rows, tile_cols, m, k, lhs_op, rhs_op,
+              addend_op, result_op, ir_builder_);
+          emitter.Emit();
+        });
   }
 
   return true;
@@ -641,6 +729,8 @@ tensorflow::Status DotOpEmitter::Emit() {
     return Status::OK();
   }
 
+  CHECK_EQ(addend_array_, nullptr);
+
   if (PotentiallyImplementedAsEigenDot(dot_)) {
     return EmitCallToRuntime();
   }
@@ -915,8 +1005,8 @@ DotOpEmitter::MatMultDims DotOpEmitter::GetMatMultDims() const {
   return {lhs_shape.dimensions(transpose_lhs_ ? 1 : 0),
           lhs_shape.dimensions(transpose_lhs_ ? 0 : 1),
           rhs_shape.dimensions(transpose_rhs_ ? 0 : 1),
-          lhs_shape.layout().minor_to_major(0) == 0,
-          rhs_shape.layout().minor_to_major(0) == 0};
+          LayoutUtil::Minor(lhs_shape.layout(), 0) == 0,
+          LayoutUtil::Minor(rhs_shape.layout(), 0) == 0};
 }
 
 llvm_ir::IrArray::Index DotOpEmitter::EmitOperandArrayLoopNest(
@@ -927,8 +1017,8 @@ llvm_ir::IrArray::Index DotOpEmitter::EmitOperandArrayLoopNest(
   // reduction dimension.
   std::vector<int64> dimensions;
   const Shape& shape = operand_array.GetShape();
-  for (int i = shape.layout().minor_to_major_size() - 1; i >= 0; --i) {
-    int64 dimension = shape.layout().minor_to_major(i);
+  for (int i = LayoutUtil::MinorToMajor(shape).size() - 1; i >= 0; --i) {
+    int64 dimension = LayoutUtil::Minor(shape.layout(), i);
     if (dimension != reduction_dimension) {
       dimensions.push_back(dimension);
     }
@@ -977,9 +1067,7 @@ bool PotentiallyImplementedAsEigenDot(const HloInstruction& hlo) {
       return false;
     }
 
-    if (ProfitableToImplementDotInUntiledLlvmIr(hlo) ==
-            DotInLlvmIrProfitable::kYes ||
-        ProfitableToImplementDotInTiledLlvmIr(hlo)) {
+    if (ProfitableToImplementDotInTiledLlvmIr(hlo)) {
       return false;
     }
 
@@ -1010,46 +1098,42 @@ bool PotentiallyImplementedAsEigenDot(const HloInstruction& hlo) {
   return false;
 }
 
-DotInLlvmIrProfitable ProfitableToImplementDotInUntiledLlvmIr(
-    const HloInstruction& dot) {
-  if (dot.opcode() == HloOpcode::kDot && dot.shape().dimensions_size() == 2) {
-    const Shape& result_shape = dot.shape();
-    // kReductionDimensionThresholdBytes was chosen to be 1/4 of a typical L1
-    // cache line size, so that we can have the reduction dimension of both the
-    // LHS and RHS matrices and still have some space "left over".  This needs
-    // to be tuned further.
-    const int64 kReductionDimensionThresholdBytes = 8 * 1024;
-    const bool single_threaded_eigen =
-        !dot.GetModule()->config().debug_options().xla_cpu_multi_thread_eigen();
-
-    // This is the point at which it is better to call into Eigen and shard the
-    // dot across multiple worker threads.  This is a rough estimate by running
-    // a matmult benchmark on my local machine, and it can be tuned further.
-    const int64 kMaxSingleThreadedFlops = 16 * 1024;
-
-    const int64 M = result_shape.dimensions(0);
-    const int64 N = result_shape.dimensions(1);
-    const int64 K = dot.operand(1)->shape().dimensions(0);
-    const int64 primitive_type_size =
-        ShapeUtil::ByteSizeOfPrimitiveType(result_shape.element_type());
-    if (M == 1 &&
-        K * primitive_type_size <= kReductionDimensionThresholdBytes &&
-        (single_threaded_eigen || M * K * N <= kMaxSingleThreadedFlops)) {
-      // Heuristics:
-      //
-      //  - Look for a configuration where we will likely be able to keep LHS in
-      //    L1 and do a cache-optimal traversal of RHS.
-      //
-      //  - Bail out on matrices that are large enough that Eigen can profitably
-      //    shard the computation across multiple cores.  This only applies when
-      //    multi-threading is enabled.
-      return LayoutUtil::IsMonotonicWithDim0Major(
-                 dot.operand(1)->shape().layout())
-                 ? DotInLlvmIrProfitable::kWithColumnMajorRhs
-                 : DotInLlvmIrProfitable::kYes;
+// For vector-matrix dot products, it is always profitable to make the Rhs
+// column major.
+tensorflow::gtl::optional<int64> ProfitableToMakeDotOperandColumnMajor(
+    const HloInstruction& hlo) {
+  if (hlo.opcode() == HloOpcode::kDot && hlo.shape().dimensions_size() == 2 &&
+      hlo.shape().dimensions(0) == 1) {
+    if (hlo.dot_dimension_numbers().rhs_contracting_dimensions(0) == 0) {
+      return 1;
+    }
+    return {};
+  }
+
+  if (hlo.opcode() == HloOpcode::kFusion &&
+      hlo.fusion_kind() == HloInstruction::FusionKind::kOutput) {
+    auto* fusion_root =
+        hlo.fused_instructions_computation()->root_instruction();
+    if (fusion_root->opcode() != HloOpcode::kAdd) {
+      return {};
+    }
+
+    for (auto* fusion_root_op : fusion_root->operands()) {
+      if (fusion_root_op->opcode() != HloOpcode::kDot) {
+        continue;
+      }
+      if (auto operand_num =
+              ProfitableToMakeDotOperandColumnMajor(*fusion_root_op)) {
+        auto* operand = fusion_root_op->operand(*operand_num);
+        if (operand->opcode() == HloOpcode::kParameter &&
+            operand->user_count() == 1) {
+          return operand->parameter_number();
+        }
+      }
     }
   }
-  return DotInLlvmIrProfitable::kNo;
+
+  return {};
 }
 
 bool ProfitableToImplementDotInTiledLlvmIr(const HloInstruction& dot) {
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
index c9168ccc0f6629c2a2bfbc7d4dc9c7ebab0a5708..9d748eb81f7850f3ccdb10f076eecfdc8326c05f 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "llvm/IR/IRBuilder.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_options.h"
+#include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
@@ -32,19 +33,11 @@ namespace cpu {
 
 bool PotentiallyImplementedAsEigenDot(const HloInstruction& hlo);
 
-enum class DotInLlvmIrProfitable { kYes, kNo, kWithColumnMajorRhs };
-
-// Returns a value to indicate if (and under what conditions) will lowering
-// |dot| as a untiled LLVM IR dot operation be profitable over calling into
-// Eigen or emitting a tiled LLVM IR implementation.  Possible return values
-// are:
-//
-//  * DotInLlvmIrProfitable::kYes - always profitable.
-//  * DotInLlvmIrProfitable::kNo - never profitable.
-//  * DotInLlvmIrProfitable::kWithColumnMajorRhs - only if we can manage to make
-//    the Rhs layout column major.
-DotInLlvmIrProfitable ProfitableToImplementDotInUntiledLlvmIr(
-    const HloInstruction& dot);
+// Returns the index for an operand to `hlo` that should ideally be column
+// major.  Returns nullopt if there is no such operand or if `hlo` is not a dot
+// or a fusion containing a dot.
+tensorflow::gtl::optional<int64> ProfitableToMakeDotOperandColumnMajor(
+    const HloInstruction& hlo);
 
 // Returns true to indicate that we can generate a tiled LLVM IR implementation
 // for |dot|.
@@ -57,21 +50,29 @@ class DotOpEmitter {
   // place the result in target_array. IR is emitted at current insert point of
   // the builder. Upon completion of the method, the insert point is set to the
   // end of all instructions emitted for this operation.
+  //
+  // If `addend_array` is not nullptr then it must be an array of the same
+  // dimensions as the result, and the result is computed as `addend_array` +
+  // dot(`lhs_array`, `rhs_array`).  A non-null `addend_array` is only supported
+  // for Matrix-vector products.
   static tensorflow::Status EmitDotOperation(
       const HloInstruction& dot, bool transpose_lhs, bool transpose_rhs,
       const llvm_ir::IrArray& target_array, const llvm_ir::IrArray& lhs_array,
-      const llvm_ir::IrArray& rhs_array,
+      const llvm_ir::IrArray& rhs_array, const llvm_ir::IrArray* addend_array,
       llvm::Value* executable_run_options_value, llvm::IRBuilder<>* ir_builder,
-      const HloModuleConfig& hlo_module_config);
+      const HloModuleConfig& hlo_module_config,
+      const TargetMachineFeatures& target_machine_features);
 
  private:
   DotOpEmitter(const HloInstruction& dot, bool transpose_lhs,
                bool transpose_rhs, const llvm_ir::IrArray& target_array,
                const llvm_ir::IrArray& lhs_array,
                const llvm_ir::IrArray& rhs_array,
+               const llvm_ir::IrArray* addend_array,
                llvm::Value* executable_run_options_value,
                llvm::IRBuilder<>* ir_builder,
-               const HloModuleConfig& hlo_module_config);
+               const HloModuleConfig& hlo_module_config,
+               const TargetMachineFeatures& target_machine_features);
 
   // Emits the IR to perform the dot operation.
   tensorflow::Status Emit();
@@ -140,9 +141,11 @@ class DotOpEmitter {
   const llvm_ir::IrArray& target_array_;
   const llvm_ir::IrArray& lhs_array_;
   const llvm_ir::IrArray& rhs_array_;
+  const llvm_ir::IrArray* addend_array_;
   llvm::Value* executable_run_options_value_;
   llvm::IRBuilder<>* ir_builder_;
   const HloModuleConfig& hlo_module_config_;
+  const TargetMachineFeatures& target_machine_features_;
 };
 
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
index ba693ec89ab7c4090f8c9d1e4d65f17a80d0ac55..99c5e16db70c6a203b4751c1ed8a106c0dc260e6 100644
--- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
@@ -33,8 +33,14 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitFloatUnaryOp(
   switch (op->opcode()) {
     case HloOpcode::kTanh: {
       PrimitiveType element_type = op->shape().element_type();
+      bool cast_result_to_fp16 = false;
       string function_name;
       switch (element_type) {
+        case F16:
+          cast_result_to_fp16 = true;
+          operand_value = ir_builder_->CreateFPCast(operand_value,
+                                                    ir_builder_->getFloatTy());
+          TF_FALLTHROUGH_INTENDED;
         case F32:
           function_name = "tanhf";
           break;
@@ -44,26 +50,61 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitFloatUnaryOp(
         default:
           return Unimplemented("tanh");
       }
-      // Create function type for the function.
-      llvm::FunctionType* function_type = llvm::FunctionType::get(
-          llvm_ir::PrimitiveTypeToIrType(element_type, module_),
-          llvm_ir::PrimitiveTypeToIrType(element_type, module_),
-          /*isVarArg=*/false);
-      // Create function declaration for 'tanhf'.
+      // Create a function declaration.
       llvm::Function* function =
           llvm::cast<llvm::Function>(module_->getOrInsertFunction(
-              llvm_ir::AsStringRef(function_name), function_type));
+              llvm_ir::AsStringRef(function_name), operand_value->getType(),
+              operand_value->getType()));
       function->setCallingConv(llvm::CallingConv::C);
       function->setDoesNotThrow();
       function->setDoesNotAccessMemory();
-      // Create instruction to call 'tanhf'.
-      return ir_builder_->CreateCall(function, operand_value);
+      // Create an instruction to call the function.
+      llvm::Value* result = ir_builder_->CreateCall(function, operand_value);
+      if (cast_result_to_fp16) {
+        result = ir_builder_->CreateFPCast(result, ir_builder_->getHalfTy());
+      }
+      return result;
     }
     default:
       return ElementalIrEmitter::EmitFloatUnaryOp(op, operand_value);
   }
 }
 
+StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitAtan2(
+    PrimitiveType prim_type, llvm::Value* lhs, llvm::Value* rhs) const {
+  string function_name;
+  bool cast_result_to_fp16 = false;
+  switch (prim_type) {
+    case F16:
+      cast_result_to_fp16 = true;
+      lhs = ir_builder_->CreateFPCast(lhs, ir_builder_->getFloatTy());
+      rhs = ir_builder_->CreateFPCast(rhs, ir_builder_->getFloatTy());
+      TF_FALLTHROUGH_INTENDED;
+    case F32:
+      function_name = "atan2f";
+      break;
+    case F64:
+      function_name = "atan2";
+      break;
+    default:
+      return Unimplemented("atan2");
+  }
+  // Create a function declaration.
+  llvm::Function* function =
+      llvm::cast<llvm::Function>(module_->getOrInsertFunction(
+          llvm_ir::AsStringRef(function_name), lhs->getType(), lhs->getType(),
+          rhs->getType()));
+  function->setCallingConv(llvm::CallingConv::C);
+  function->setDoesNotThrow();
+  function->setDoesNotAccessMemory();
+  // Create an instruction to call the function.
+  llvm::Value* result = ir_builder_->CreateCall(function, {lhs, rhs});
+  if (cast_result_to_fp16) {
+    result = ir_builder_->CreateFPCast(result, ir_builder_->getHalfTy());
+  }
+  return result;
+}
+
 llvm_ir::ElementGenerator CpuElementalIrEmitter::MakeElementGenerator(
     const HloInstruction* hlo,
     const HloToElementGeneratorMap& operand_to_generator) const {
diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h
index 7e9f27befb456c17581f556868712f92fd8fd083..4446dfd2821fb4b6e75f33694367392ecbcdd8bf 100644
--- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h
@@ -41,6 +41,8 @@ class CpuElementalIrEmitter : public ElementalIrEmitter {
  protected:
   StatusOr<llvm::Value*> EmitFloatUnaryOp(
       const HloInstruction* op, llvm::Value* operand_value) const override;
+  StatusOr<llvm::Value*> EmitAtan2(PrimitiveType prim_type, llvm::Value* lhs,
+                                   llvm::Value* rhs) const override;
 
   IrEmitter* ir_emitter_;
 };
diff --git a/tensorflow/compiler/xla/service/cpu/external_constant_pool.cc b/tensorflow/compiler/xla/service/cpu/external_constant_pool.cc
index c9f8e5584965d0c73771750e26bd63c401d5b0c0..7dcc4ca7fa08b478f24065275ffa69725dc51682 100644
--- a/tensorflow/compiler/xla/service/cpu/external_constant_pool.cc
+++ b/tensorflow/compiler/xla/service/cpu/external_constant_pool.cc
@@ -33,15 +33,12 @@ void ExternalConstantPool::Insert(string name, const Literal& literal,
   CHECK(entries_.find(name) == entries_.end());
 
   int64 literal_size = ShapeUtil::ByteSizeOf(literal.shape());
-  void* raw_pointer;
-  CHECK_EQ(
-      posix_memalign(&raw_pointer, std::max<size_t>(alignment, sizeof(void*)),
-                     literal_size),
-      0)
-      << "failed to allocate " << literal_size << " bytes with alignment of "
-      << alignment;
-
-  std::memcpy(raw_pointer, literal.InternalData(), literal_size);
+  void* raw_pointer = tensorflow::port::AlignedMalloc(
+      literal_size, std::max<size_t>(alignment, sizeof(void*)));
+  CHECK(raw_pointer != nullptr) << "failed to allocate " << literal_size
+                                << " bytes with alignment of " << alignment;
+
+  std::memcpy(raw_pointer, literal.untyped_data(), literal_size);
   entries_.emplace(std::move(name), static_cast<uint8*>(raw_pointer));
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/external_constant_pool.h b/tensorflow/compiler/xla/service/cpu/external_constant_pool.h
index ade28cbcbcfda05a9ad0adab1139bf316720e11f..8008a56df4dbf16e7b57aee8a344058bb0d5883d 100644
--- a/tensorflow/compiler/xla/service/cpu/external_constant_pool.h
+++ b/tensorflow/compiler/xla/service/cpu/external_constant_pool.h
@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_EXTERNAL_CONSTANT_POOL_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_EXTERNAL_CONSTANT_POOL_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_EXTERNAL_CONSTANT_POOL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_EXTERNAL_CONSTANT_POOL_H_
 
 #include <memory>
 
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/platform/mem.h"
 
 namespace xla {
 namespace cpu {
@@ -49,10 +50,10 @@ class ExternalConstantPool {
   const uint8* Find(const string& name);
 
  private:
-  // We need to `free()` pointers allocated into `entries_` since we allocate
-  // them with `posix_memalign`.
+  // We need to `AlignedFree` pointers allocated into `entries_` since we
+  // allocate them with `AlignedMalloc`.
   struct FreeDeleter {
-    void operator()(void* ptr) { free(ptr); }
+    void operator()(void* ptr) { tensorflow::port::AlignedFree(ptr); }
   };
 
   tensorflow::gtl::FlatMap<string, std::unique_ptr<uint8, FreeDeleter>>
@@ -61,4 +62,4 @@ class ExternalConstantPool {
 }  // namespace cpu
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_EXTERNAL_CONSTANT_POOL_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_EXTERNAL_CONSTANT_POOL_H_
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
index 3993779da636e519f8d8fded468c3271d27ee093..788217aab6172b4e548452b3f6ffd4197c163ce4 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
@@ -44,6 +44,9 @@ bool PotentiallyImplementedAsEigenConvolution(
       ShapeUtil::ElementIsComplex(kernel_shape)) {
     return false;
   }
+  if (window_util::HasWindowReversal(convolution.window())) {
+    return false;
+  }
 
   const ConvolutionDimensionNumbers& dnums =
       convolution.convolution_dimension_numbers();
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h
index ac361ddfb4c8d253ffb1c99200939f6324cad2bb..34b2003916933f5ec0a15d9e219063c0a912fa40 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_EMISSION_UTILS_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_EMISSION_UTILS_H_
 
+#include "llvm/IR/Value.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 
 namespace xla {
@@ -23,6 +24,19 @@ namespace cpu {
 
 bool PotentiallyImplementedAsEigenConvolution(
     const HloInstruction& convolution);
+
+// Dynamic loop bounds are specified as an array of dimension index
+// [start, limit) pairs of ir values (one for each partitioned outer dimension).
+//
+// EX: Let 'shape' = [8, 16, 32], with the loop bounds of the two-most major
+//     dimensions dynamic. Then 'dynamic_loop_bounds' will contain the
+//     following ir values for the two most-major dimensions:
+//       [dim0_index_start_ir_value, dim0_index_limit_ir_value]
+//       [dim1_index_start_ir_value, dim1_index_limit_ir_value]
+//
+// See IrFunction and ParallelLoopEmitter for details.
+using DynamicLoopBounds = std::vector<std::pair<llvm::Value*, llvm::Value*>>;
+
 }  // namespace cpu
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 502dd2e7387d701e69e1c7ecb67fbdac26c6b5de..d9eeb1c3bdc2a8058992de0e13045a240bf56b8d 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/platform/logging.h"
 // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -42,6 +43,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/dot_op_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/cpu/ir_function.h"
+#include "tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/shape_partition.h"
 #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
 #include "tensorflow/compiler/xla/service/elemental_ir_emitter.h"
@@ -59,6 +62,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@@ -69,6 +73,7 @@ namespace {
 using llvm_ir::AsStringRef;
 using llvm_ir::IrName;
 using llvm_ir::SetToFirstInsertPoint;
+namespace gtl = tensorflow::gtl;
 }  // namespace
 
 namespace cpu {
@@ -76,16 +81,16 @@ namespace cpu {
 IrEmitter::IrEmitter(
     const HloModule& hlo_module, const BufferAssignment& assignment,
     llvm::Module* llvm_module,
-    std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx,
-    tensorflow::gtl::optional<size_t> entry_computation_profile_idx,
+    std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx,
+    std::unordered_map<const HloComputation*, int64> computation_to_profile_idx,
     llvm::TargetMachine* target_machine,
     ExternalConstantPool* external_constant_pool)
     : assignment_(assignment),
       module_(llvm_module),
       arch_type_(llvm::Triple(llvm_module->getTargetTriple()).getArch()),
       ir_builder_(llvm_module->getContext()),
-      hlo_to_profile_idx_(std::move(hlo_to_profile_idx)),
-      entry_computation_profile_idx_(std::move(entry_computation_profile_idx)),
+      instruction_to_profile_idx_(std::move(instruction_to_profile_idx)),
+      computation_to_profile_idx_(std::move(computation_to_profile_idx)),
       alias_analysis_(hlo_module, assignment, &llvm_module->getContext()),
       hlo_module_config_(hlo_module.config()),
       parallel_cpu_backend_(
@@ -117,138 +122,33 @@ StatusOr<llvm::Function*> IrEmitter::EmitComputation(
   // readcyclecounter if it is unavailable.
   bool use_rdtscp = arch_type_ == llvm::Triple::ArchType::x86 ||
                     arch_type_ == llvm::Triple::ArchType::x86_64;
-  profiling_state_ = ProfilingState(is_top_level_computation_, use_rdtscp,
-                                    GetProfileCountersArgument());
+  profiling_state_ = ProfilingState(use_rdtscp, GetProfileCountersArgument());
   if (instruction_order == nullptr) {
     TF_RETURN_IF_ERROR(computation->Accept(this));
   } else {
     TF_RETURN_IF_ERROR(computation->AcceptOrdered(this, *instruction_order));
   }
-  InsertOrDie(&emitted_functions_, computation, compute_function_);
-
-  return compute_function_;
-}
-
-static llvm::Argument* GetArg(llvm::Function* f, int idx) {
-  llvm::Function::arg_iterator arg_iter = f->arg_begin();
-  std::advance(arg_iter, idx);
-  return &*arg_iter;
+  llvm::Function* ir_function = compute_function_->function();
+  InsertOrDie(&emitted_functions_, computation, ir_function);
+  // Delete 'compute_function', finalizing 'ir_function' and restoring caller
+  // IR insert point.
+  compute_function_.reset();
+  return ir_function;
 }
 
 void IrEmitter::InitializeIrFunction(const string& function_name) {
-  // The function signature is:
-  //   void function(i8* retval, i8* run_options, i8** params, i8** temps,
-  //                 i64* dynamic_loop_bounds, i64* prof_counters)
-  //
-  // retval: points to the returned value.
-  // params: address of an array with pointers to parameters.
-  // temps: address of an array with pointers to temporary buffers.
-  //
-  // Therefore, the generated function's signature (FunctionType) is statically
-  // determined - parameter unpacking is done in code generated into the
-  // function, rather than by a prologue dictated by the platform ABI.
-  //
-  //                      /--------------\
-  //   retval ----------> | return value |
-  //                      \--------------/
-  //
-  //                      /-------------------------------\
-  //   run_options -----> | xla::ExecutableRunOptions |
-  //                      \-------------------------------/
-  //
-  //                     /---------------------------------------------\
-  //   params -------->  |  param 0  |  param 1  | ..... |  param N-1  |
-  //                     |   addr    |   addr    |       |   addr      |
-  //                     \---------------------------------------------/
-  //                          |           |                   |
-  //                          |           |                   |
-  //                          V           V                   V
-  //                     /---------\  /---------\         /-----------\
-  //                     | param 0 |  | param 1 |         | param N-1 |
-  //                     \---------/  \---------/         \-----------/
-  //
-  //                     /---------------------------------------------\
-  //   temps --------->  |  temp  0  |  temp  1  | ..... |  temp  N-1  |
-  //                     |   addr    |   addr    |       |   addr      |
-  //                     \---------------------------------------------/
-  //                          |           |                   |
-  //                          |           |                   |
-  //                          V           V                   V
-  //                     /---------\  /---------\         /-----------\
-  //                     | temp  0 |  | temp  1 |         | temp  N-1 |
-  //                     \---------/  \---------/         \-----------/
-  //
-  //                        /--------------------------------------------\
-  // dynamic loop bounds -> | outer_dim0_start | outer_dim0_limit | .....|
-  //  (elided for aot)      \--------------------------------------------/
-  //
-  //                     /---------------------------------------------\
-  //   prof counters ->  | counter 0 | counter 1 | ..... | counter N-1 |
-  //  (elided for aot)   \---------------------------------------------/
-
-  // Even though the type of params and temps is void** in the host's view, in
-  // LLVM IR this is represented by i8*, similarly to void*. It's up to the code
-  // to use GEPs to unravel the indirection layers.
-  llvm::FunctionType* compute_function_type = llvm::FunctionType::get(
-      /*Result=*/llvm::Type::getVoidTy(module_->getContext()),
-      /*Params=*/GetComputeFunctionParams(),
-      /*isVarArg=*/false);
-
   // Functions with local linkage get an inlining bonus.  Because we know
   // a-priori that embedded functions (non-entry functions) will not have its
   // name resolved, give it local linkage.
   llvm::Function::LinkageTypes linkage =
       is_top_level_computation_ ? llvm::GlobalValue::ExternalLinkage
                                 : llvm::GlobalValue::InternalLinkage;
-  compute_function_ =
-      llvm::Function::Create(/*Ty=*/compute_function_type,
-                             /*Linkage=*/linkage,
-                             /*Name=*/AsStringRef(function_name),
-                             /*Module=*/module_);
-  compute_function_->setCallingConv(llvm::CallingConv::C);
-
-  // Set meaningful names for the function's arguments: useful for debugging.
-  llvm::Function::arg_iterator arg_iter = compute_function_->arg_begin();
-  arg_iter->setName("retval");
-  (++arg_iter)->setName("run_options");
-  (++arg_iter)->setName("params");
-  (++arg_iter)->setName("temps");
-  if (num_dynamic_loop_bounds_ > 0) {
-    (++arg_iter)->setName("dynamic_loop_bounds");
-  }
-  (++arg_iter)->setName("prof_counters");
-
-  // We know a-priori that the function arguments are guaranteed to point to
-  // disjoint objects.
-  llvm::Argument* retval = GetResultArgument();
-  for (llvm::Argument& argument : compute_function_->args()) {
-    // However, the return buffer aliases the temporaries and thus cannot be
-    // marked noalias.
-    if (&argument == retval) {
-      continue;
-    }
-    compute_function_->addAttribute(argument.getArgNo() + 1,
-                                    llvm::Attribute::NoAlias);
-  }
-
-  // Add the optize attribute to the function if optimizing for size. This
-  // controls internal behavior of some optimization passes (e.g. loop
-  // unrolling).
-  if (options::OptimizeForSizeRequested(hlo_module_config_)) {
-    compute_function_->addFnAttr(llvm::Attribute::OptimizeForSize);
-  }
-
-  if (hlo_module_config_.debug_options().xla_enable_fast_math()) {
-    compute_function_->addFnAttr("unsafe-fp-math", "true");
-    compute_function_->addFnAttr("no-infs-fp-math", "true");
-    compute_function_->addFnAttr("no-nans-fp-math", "true");
-    compute_function_->addFnAttr("no-signed-zeros-fp-math", "true");
-  }
-
-  ir_builder_.SetInsertPoint(llvm::BasicBlock::Create(
-      /*Context=*/module_->getContext(),
-      /*Name=*/"entry",
-      /*Parent=*/compute_function_));
+  // Create and initialize new IrFunction.
+  compute_function_.reset(
+      new IrFunction(function_name, linkage,
+                     options::OptimizeForSizeRequested(hlo_module_config_),
+                     hlo_module_config_.debug_options().xla_enable_fast_math(),
+                     module_, &ir_builder_, num_dynamic_loop_bounds_));
 }
 
 IrEmitter::~IrEmitter() {}
@@ -344,11 +244,12 @@ int IrEmitter::MinimumAlignmentForBufferSize(int64 buffer_size) {
 
 // Calculate the alignment of a buffer allocated for a given primitive type.
 int IrEmitter::MinimumAlignmentForPrimitiveType(PrimitiveType primitive_type) {
-  int64 buffer_size = ShapeUtil::ByteSizeOfPrimitiveType(primitive_type);
-  DCHECK_GE(buffer_size, 0);
-  DCHECK_LE(buffer_size, SIZE_MAX);
-
-  return MinimumAlignmentForBufferSize(buffer_size);
+  int64 byte_size = ShapeUtil::ByteSizeOfPrimitiveType(primitive_type);
+  DCHECK_GE(byte_size, 0);
+  // Largest scalar is a complex64 so we don't need to worry about the
+  // int64->int truncation here.
+  DCHECK_LE(byte_size, 8);
+  return byte_size;
 }
 
 int64 IrEmitter::ByteSizeOf(const Shape& shape) const {
@@ -357,6 +258,10 @@ int64 IrEmitter::ByteSizeOf(const Shape& shape) const {
 
 // Calculate the alignment of a buffer allocated for a given shape.
 int IrEmitter::MinimumAlignmentForShape(const Shape& shape) {
+  if (ShapeUtil::IsScalar(shape)) {
+    return MinimumAlignmentForPrimitiveType(shape.element_type());
+  }
+
   int64 buffer_size = ByteSizeOf(shape);
   DCHECK_GE(buffer_size, 0);
   DCHECK_LE(buffer_size, SIZE_MAX);
@@ -574,7 +479,7 @@ Status IrEmitter::HandleOutfeed(HloInstruction* outfeed) {
 
 Status IrEmitter::HandleSort(HloInstruction* sort) {
   // TODO(b/26783907): Implement sort on CPU.
-  return Unimplemented("Sort is not supported on CPU (b/26783907).");
+  return Unimplemented("Sort is not implemented on CPU.");
 }
 
 Status IrEmitter::HandleTuple(HloInstruction* tuple) {
@@ -588,7 +493,7 @@ Status IrEmitter::HandleTuple(HloInstruction* tuple) {
 }
 
 Status IrEmitter::HandleMap(HloInstruction* map) {
-  tensorflow::gtl::ArraySlice<HloInstruction*> operands(map->operands());
+  gtl::ArraySlice<HloInstruction*> operands(map->operands());
   HloComputation* function = map->to_apply();
   // The called computation should have been emitted previously.
   llvm::Function* mapped_ir_function = FindOrDie(emitted_functions_, function);
@@ -612,12 +517,12 @@ Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) {
   HloComputation* function = reduce_window->to_apply();
   TF_RETURN_IF_ERROR(ElementTypesSameAndSupported(
       /*instruction=*/*reduce_window, /*operands=*/{operand},
-      /*supported_types=*/{F32}));
+      /*supported_types=*/{F32, BF16}));
 
   // TODO(b/31410564): Implement dilation for reduce-window.
   if (window_util::HasDilation(window)) {
     return Unimplemented(
-        "Dilation for reduce-window not implemented on CPU. See b/31410564.");
+        "Dilation for ReduceWindow is not implemented on CPU.");
   }
 
   // The called computation should have been emitted previously.
@@ -720,8 +625,7 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) {
   // TODO(b/31410564): Implement dilation for select-and-scatter.
   if (window_util::HasDilation(window)) {
     return Unimplemented(
-        "Dilation for select-and-scatter not implemented on CPU. "
-        "See b/31410564.");
+        "Dilation for SelectAndScatter is not implemented on CPU. ");
   }
 
   // The select and scatter computations should have been emitted previously.
@@ -898,6 +802,24 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
   TF_RETURN_IF_ERROR(ElementTypesSameAndSupported(
       /*instruction=*/*dot, /*operands=*/{lhs, rhs},
       /*supported_types=*/{F32, F64, C64}));
+  const DotDimensionNumbers& dnums = dot->dot_dimension_numbers();
+  if (dnums.lhs_batch_dimensions_size() > 0 ||
+      dnums.rhs_batch_dimensions_size() > 0) {
+    return Unimplemented("Dot with batch dimensions not implemented.");
+  }
+
+  if (dnums.lhs_contracting_dimensions_size() != 1) {
+    // This is disallowed by ShapeInference today.
+    return Unimplemented(
+        "Dot with multiple contracting dimensions not implemented.");
+  }
+
+  if (dnums.lhs_contracting_dimensions(0) !=
+          std::min(lhs->shape().dimensions_size() - 1, 1) ||
+      dnums.rhs_contracting_dimensions(0) != 0) {
+    return Unimplemented(
+        "Dot with non-standard contracting dimensions not implemented.");
+  }
 
   llvm_ir::IrArray lhs_array(GetIrArrayFor(lhs));
   llvm_ir::IrArray rhs_array(GetIrArrayFor(rhs));
@@ -916,8 +838,9 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
   // Dot operation is complicated so we delegate to a helper class.
   return DotOpEmitter::EmitDotOperation(
       *dot, /*transpose_lhs=*/false, /*transpose_rhs=*/false, target_array,
-      lhs_array, rhs_array, GetExecutableRunOptionsArgument(), &ir_builder_,
-      hlo_module_config_);
+      lhs_array, rhs_array, /*addend_array=*/nullptr,
+      GetExecutableRunOptionsArgument(), &ir_builder_, hlo_module_config_,
+      target_machine_features_);
 }
 
 Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
@@ -1189,8 +1112,14 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
         llvm_ir::IrArray kernel_array(GetIrArrayFor(rhs));
         llvm_ir::IrArray::Index kernel_index(num_dims);
         for (int i = 0; i < num_spatial_dims; ++i) {
-          kernel_index[dnums.kernel_spatial_dimensions(i)] = kernel_spatial[i];
+          kernel_index[dnums.kernel_spatial_dimensions(i)] =
+              window.dimensions(i).window_reversal()
+                  ? ir_builder_.CreateNSWSub(
+                        ir_builder_.getInt64(window.dimensions(i).size() - 1),
+                        kernel_spatial[i])
+                  : kernel_spatial[i];
         }
+
         kernel_index[dnums.kernel_input_feature_dimension()] = input_feature;
         kernel_index[dnums.kernel_output_feature_dimension()] = output_feature;
 
@@ -1207,10 +1136,66 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
       });
 }
 
+Status IrEmitter::HandleFft(HloInstruction* fft) {
+  auto operand = fft->operand(0);
+  TF_RETURN_IF_ERROR(ElementTypesSameAndSupported(
+      /*instruction=*/*fft, /*operands=*/{operand},
+      /*supported_types=*/{F32, C64}));
+  TF_RET_CHECK(LayoutUtil::IsMonotonicWithDim0Major(operand->shape().layout()));
+  TF_RET_CHECK(LayoutUtil::IsMonotonicWithDim0Major(fft->shape().layout()));
+  VLOG(3) << "operand=" << ShapeUtil::HumanStringWithLayout(operand->shape());
+  VLOG(3) << "fft=" << ShapeUtil::HumanStringWithLayout(fft->shape());
+
+  llvm::Value* operand_address = GetEmittedValueFor(operand);
+  TF_RETURN_IF_ERROR(EmitTargetAddressForOp(fft));
+
+  const std::vector<int64>& fft_length = fft->fft_length();
+  int64 input_batch = 1;
+  for (int i = 0; i < fft->shape().dimensions_size() - fft_length.size(); i++) {
+    input_batch *= fft->shape().dimensions(i);
+  }
+
+  // Args have been computed, make the call.
+  llvm::Type* int8_ptr_type = ir_builder_.getInt8Ty()->getPointerTo();
+  llvm::Type* int32_type = ir_builder_.getInt32Ty();
+  llvm::Type* int64_type = ir_builder_.getInt64Ty();
+  llvm::FunctionType* fft_type = llvm::FunctionType::get(
+      ir_builder_.getVoidTy(),
+      {int8_ptr_type, int8_ptr_type, int8_ptr_type, int32_type, int32_type,
+       int64_type, int64_type, int64_type, int64_type},
+      /*isVarArg=*/false);
+  const char* fn_name = runtime::kEigenFftSymbolName;
+  llvm::Function* fft_func = llvm::cast<llvm::Function>(
+      module_->getOrInsertFunction(fn_name, fft_type));
+  fft_func->setCallingConv(llvm::CallingConv::C);
+  fft_func->setDoesNotThrow();
+  fft_func->setOnlyAccessesInaccessibleMemOrArgMem();
+  const int fft_rank = fft_length.size();
+  ir_builder_.CreateCall(
+      fft_func,
+      {GetExecutableRunOptionsArgument(),
+       ir_builder_.CreateBitCast(GetEmittedValueFor(fft), int8_ptr_type),
+       ir_builder_.CreateBitCast(operand_address, int8_ptr_type),
+       ir_builder_.getInt32(fft->fft_type()), ir_builder_.getInt32(fft_rank),
+       ir_builder_.getInt64(input_batch),
+       ir_builder_.getInt64(fft_rank > 0 ? fft_length[0] : 0),
+       ir_builder_.getInt64(fft_rank > 1 ? fft_length[1] : 0),
+       ir_builder_.getInt64(fft_rank > 2 ? fft_length[2] : 0)});
+
+  return Status::OK();
+}
+
 Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) {
+  if (hlo_module_config_.replica_count() == 1) {
+    // When there is a single replica, a cross replica sum is the identity
+    // function, and the buffer assignment expects a copy (we could eliminate
+    // these at the HLO level as an optimization).
+    TF_RETURN_IF_ERROR(EmitTargetAddressForOp(crs));
+    return EmitMemcpy(*crs->operand(0), *crs);
+  }
+
   // TODO(b/33011107): Support cross replica sum on CPU.
-  return Unimplemented(
-      "Cross replica sum not implemented on CPU. See b/33011107.");
+  return Unimplemented("CrossReplicaSum is not implemented on CPU.");
 }
 
 // Fills up the free variables in 'index_with_free_var' with values from
@@ -1240,205 +1225,6 @@ static llvm_ir::IrArray::Index FillReducedDimensionIndex(
   return index_with_free_var;
 }
 
-Status IrEmitter::HandleBatchNormTraining(HloInstruction* batch_norm_training) {
-  // The output of BatchNormTraining is a tuple of three element:
-  //   - An N-dimensional array containing normalized values.
-  //   - A 1 dimensional array containing the mean value for each feature.
-  //   - A 1 dimensional array containing the variance value for each feature.
-  HloInstruction* operand = batch_norm_training->operands()[0];
-  HloInstruction* scale = batch_norm_training->operands()[1];
-  HloInstruction* offset = batch_norm_training->operands()[2];
-  float epsilon = batch_norm_training->epsilon();
-  int64 feature_index = batch_norm_training->feature_index();
-  TF_RET_CHECK(ShapeUtil::IsTuple(batch_norm_training->shape()) &&
-               ShapeUtil::TupleElementCount(batch_norm_training->shape()) == 3);
-
-  const Shape& output_shape =
-      ShapeUtil::GetTupleElementShape(batch_norm_training->shape(), 0);
-  const Shape& feature_shape =
-      ShapeUtil::GetTupleElementShape(batch_norm_training->shape(), 1);
-
-  // Reduce vector of the non-feature dimensions.
-  std::vector<int64> dimensions_to_reduce;
-
-  for (int64 i = 0; i < operand->shape().dimensions_size(); ++i) {
-    if (i != feature_index) {
-      dimensions_to_reduce.push_back(i);
-    }
-  }
-
-  // Get the second and third allocations in the output tuple, which should be
-  // used to store the result of mean and variance value calculation.
-  TF_ASSIGN_OR_RETURN(
-      const BufferAllocation::Slice slice_mean,
-      assignment_.GetUniqueSlice(batch_norm_training, /*index=*/{1}));
-  TF_ASSIGN_OR_RETURN(
-      const BufferAllocation::Slice slice_var,
-      assignment_.GetUniqueSlice(batch_norm_training, /*index=*/{2}));
-  const int feature_count = output_shape.dimensions(feature_index);
-  const int size_in_elements = ShapeUtil::ElementsIn(output_shape);
-  TF_RET_CHECK(ShapeUtil::ElementsIn(operand->shape()) == size_in_elements);
-  const int elements_per_feature = size_in_elements / feature_count;
-
-  llvm::Value* mean = EmitTempBufferPointer(slice_mean, feature_shape);
-  llvm_ir::IrArray mean_array(mean, feature_shape);
-
-  llvm::Value* var = EmitTempBufferPointer(slice_var, feature_shape);
-  llvm_ir::IrArray var_array(var, feature_shape);
-
-  // This loop calculates mean and variance for each feature.
-  //
-  // In theory this could be swapped by multi-output fusion. We will evaluate
-  // this when it's ready.
-  //
-  // For variance calculation, we use a simplified formula so we can fuse the
-  // computation into the same loop to calculate mean: Var=E(X^2) - E(X)^2.
-  TF_RETURN_IF_ERROR(
-      llvm_ir::LoopEmitter(
-          [&](const llvm_ir::IrArray::Index& index) {
-            PrimitiveType element_type = operand->shape().element_type();
-            // Used to calculate E(X).
-            llvm::Value* sum_address = llvm_ir::EmitAllocaAtFunctionEntry(
-                llvm_ir::PrimitiveTypeToIrType(element_type, module_),
-                "sum_address", &ir_builder_,
-                MinimumAlignmentForPrimitiveType(element_type));
-
-            // Used to calculate E(X^2).
-            llvm::Value* sum_square_address =
-                llvm_ir::EmitAllocaAtFunctionEntry(
-                    llvm_ir::PrimitiveTypeToIrType(element_type, module_),
-                    "sum_square_address", &ir_builder_,
-                    MinimumAlignmentForPrimitiveType(element_type));
-
-            ir_builder_.CreateStore(
-                llvm::ConstantFP::get(ir_builder_.getFloatTy(), 0.0),
-                sum_address);
-
-            ir_builder_.CreateStore(
-                llvm::ConstantFP::get(ir_builder_.getFloatTy(), 0.0),
-                sum_square_address);
-
-            llvm_ir::ForLoopNest loops(IrName(batch_norm_training, "inner"),
-                                       &ir_builder_);
-
-            const llvm_ir::IrArray::Index reduced_dims_index =
-                loops.AddLoopsForShapeOnDimensions(
-                    operand->shape(), dimensions_to_reduce, "reduction_dim");
-
-            SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(),
-                                  &ir_builder_);
-
-            llvm_ir::IrArray operand_array(GetIrArrayFor(operand));
-            llvm_ir::IrArray::Index input_index =
-                FillReducedDimensionIndex(reduced_dims_index, index);
-            llvm::Value* new_value =
-                operand_array.EmitReadArrayElement(input_index, &ir_builder_);
-
-            llvm::Value* new_value_square =
-                ir_builder_.CreateFMul(new_value, new_value);
-
-            llvm::Value* current_sum = ir_builder_.CreateLoad(sum_address);
-            llvm::Value* current_sum_square =
-                ir_builder_.CreateLoad(sum_square_address);
-            // Update sum.
-            ir_builder_.CreateStore(
-                ir_builder_.CreateFAdd(current_sum, new_value), sum_address);
-
-            // Update sum square.
-            ir_builder_.CreateStore(
-                ir_builder_.CreateFAdd(current_sum_square, new_value_square),
-                sum_square_address);
-
-            SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(),
-                                  &ir_builder_);
-
-            llvm::Value* sum = ir_builder_.CreateLoad(sum_address);
-            llvm::Value* elements_per_feature_value = llvm::ConstantFP::get(
-                ir_builder_.getFloatTy(), elements_per_feature);
-            llvm::Value* mean =
-                ir_builder_.CreateFDiv(sum, elements_per_feature_value);
-            llvm::Value* mean_square = ir_builder_.CreateFMul(mean, mean);
-            llvm::Value* sum_square =
-                ir_builder_.CreateLoad(sum_square_address);
-
-            // Var=E(X^2) - E(X)^2.
-            llvm::Value* var = ir_builder_.CreateFSub(
-                ir_builder_.CreateFDiv(sum_square, elements_per_feature_value),
-                mean_square);
-
-            var_array.EmitWriteArrayElement(index, var, &ir_builder_);
-            return mean;
-          },
-          mean_array, &ir_builder_)
-          .EmitLoop(IrName(batch_norm_training, "mean_var")));
-
-  TF_RETURN_IF_ERROR(EmitTargetAddressForOp(batch_norm_training));
-  TF_ASSIGN_OR_RETURN(
-      const BufferAllocation::Slice slice,
-      assignment_.GetUniqueSlice(batch_norm_training, /*index=*/{0}));
-
-  llvm::Value* normalized = EmitTempBufferPointer(slice, output_shape);
-
-  llvm_ir::IrArray target_array(normalized, output_shape);
-
-  AddAliasingInformationToIrArray(*batch_norm_training, &target_array);
-
-  TF_RETURN_IF_ERROR(
-      llvm_ir::LoopEmitter(
-          [this, mean_array, var_array, epsilon, operand, dimensions_to_reduce,
-           feature_index, offset, scale](const llvm_ir::IrArray::Index& index) {
-            // The following logic normalizes the input value, scales and shifts
-            // it:
-            //
-            // normalized = (input - mean) / sqrt(variance + epsilon)
-            // result = normalized * scale + offset
-
-            // Current index in the feature dimension.
-            llvm_ir::IrArray::Index feature_index_value(1,
-                                                        index[feature_index]);
-
-            llvm::Value* mean = mean_array.EmitReadArrayElement(
-                feature_index_value, &ir_builder_);
-            llvm::Value* var = var_array.EmitReadArrayElement(
-                feature_index_value, &ir_builder_);
-
-            llvm_ir::IrArray operand_array(GetIrArrayFor(operand));
-            llvm::Value* input =
-                operand_array.EmitReadArrayElement(index, &ir_builder_);
-
-            llvm::Value* variance_with_epsilon = ir_builder_.CreateFAdd(
-                var, llvm::ConstantFP::get(ir_builder_.getFloatTy(), epsilon));
-            llvm::Function* func_llvm_sqrt = llvm::Intrinsic::getDeclaration(
-                module_, llvm::Intrinsic::sqrt, {ir_builder_.getFloatTy()});
-            llvm::Value* variance_sqrt =
-                ir_builder_.CreateCall(func_llvm_sqrt, {variance_with_epsilon});
-            llvm::Value* normalized = ir_builder_.CreateFDiv(
-                ir_builder_.CreateFSub(input, mean), variance_sqrt);
-            llvm_ir::IrArray offset_array(GetIrArrayFor(offset));
-            llvm::Value* offset = offset_array.EmitReadArrayElement(
-                feature_index_value, &ir_builder_);
-            llvm_ir::IrArray scale_array(GetIrArrayFor(scale));
-            llvm::Value* scale = scale_array.EmitReadArrayElement(
-                feature_index_value, &ir_builder_);
-            llvm::Value* result = ir_builder_.CreateFAdd(
-                ir_builder_.CreateFMul(normalized, scale), offset);
-
-            return result;
-          },
-          target_array, &ir_builder_)
-          .EmitLoop(IrName(batch_norm_training, "normalize")));
-
-  llvm_ir::EmitTuple(GetIrArrayFor(batch_norm_training),
-                     {normalized, mean, var}, &ir_builder_, module_);
-  return Status::OK();
-}
-
-Status IrEmitter::HandleBatchNormGrad(HloInstruction* batch_norm_grad) {
-  // TODO(b/62843645) Implement BatchNormGrad on CPU backend.
-  return Unimplemented(
-      "BatchNormGrad is not implemented on CPU. See b/62843645.");
-}
-
 Status IrEmitter::HandleParameter(HloInstruction* parameter) {
   VLOG(2) << "HandleParameter: " << parameter->ToString();
   auto param_number = parameter->parameter_number();
@@ -1452,15 +1238,20 @@ Status IrEmitter::HandleParameter(HloInstruction* parameter) {
   //
   // Where Param is the actual element type of the underlying buffer (for
   // example, float for an XLA F32 element type).
-  llvm::Argument* params = GetArg(compute_function_, 2);
+  llvm::Value* params = compute_function_->parameters_arg();
   llvm::Value* param_address_offset =
       llvm_ir::EmitBufferIndexingGEP(params, param_number, &ir_builder_);
   llvm::LoadInst* param_address_untyped =
       ir_builder_.CreateLoad(param_address_offset);
   param_address_untyped->setName(AsStringRef(IrName(parameter, "untyped")));
-  if (hlo_module_config_.debug_options()
+  if (is_top_level_computation_ &&
+      hlo_module_config_.debug_options()
           .xla_llvm_enable_invariant_load_metadata()) {
-    // We never reassign parameters, so this load is invariant.
+    // In the entry computation the parameter slots in the %params argument are
+    // invariant through program execution.  In computations that are called
+    // from the entry computation (via kWhile, kCall and kConditional) the
+    // parameter slots are *not* invariant since they're written to by their
+    // callers.
     param_address_untyped->setMetadata(
         llvm::LLVMContext::MD_invariant_load,
         llvm::MDNode::get(param_address_untyped->getContext(), /*MDs=*/{}));
@@ -1479,6 +1270,52 @@ Status IrEmitter::HandleParameter(HloInstruction* parameter) {
   return Status::OK();
 }
 
+// Returns true if the relative order of the unreduced dimensions stays the same
+// through the reduce operation.
+static bool ReductionPreservesLayout(const HloInstruction& reduce) {
+  DCHECK_EQ(reduce.opcode(), HloOpcode::kReduce);
+
+  // Maps dimensions that were not reduced from their dimension numbers in the
+  // source shape to their dimensions numbers in the destination shape.
+  //
+  // So if we reduce f32[A,B,C,D] on dimensions 1 and 2, this map contains
+  // [0->0, 3->1].
+  gtl::FlatMap<int64, int64> unreduced_dim_map;
+
+  gtl::FlatSet<int64> reduced_dims(reduce.dimensions().begin(),
+                                   reduce.dimensions().end());
+
+  const Shape& operand_shape = reduce.operand(0)->shape();
+  const Shape& result_shape = reduce.shape();
+
+  int64 delta = 0;
+  for (int64 i = 0; i < operand_shape.dimensions_size(); i++) {
+    if (reduced_dims.count(i)) {
+      delta++;
+    } else {
+      InsertOrDie(&unreduced_dim_map, i, i - delta);
+    }
+  }
+
+  // Iterate dimensions minor to major and check that the corresponding
+  // dimensions in the source and target shapes are equivalent.
+  int64 result_dim_idx = 0;
+  for (int64 operand_dim_idx = 0;
+       operand_dim_idx < operand_shape.dimensions_size(); operand_dim_idx++) {
+    int64 operand_dim = operand_shape.layout().minor_to_major(operand_dim_idx);
+    if (!reduced_dims.count(operand_dim)) {
+      if (FindOrDie(unreduced_dim_map, operand_dim) !=
+          result_shape.layout().minor_to_major(result_dim_idx++)) {
+        return false;
+      }
+    }
+  }
+
+  CHECK_EQ(result_dim_idx, result_shape.dimensions_size());
+
+  return true;
+}
+
 IrEmitter::ReductionGenerator IrEmitter::MatchReductionGenerator(
     HloComputation* function, string* failure_reason) const {
   CHECK_EQ(function->num_parameters(), 2);
@@ -1495,7 +1332,7 @@ IrEmitter::ReductionGenerator IrEmitter::MatchReductionGenerator(
   if (ShapeUtil::ElementIsComplex(root_shape)) {
     // TODO(b/65408531): Complex add could by done via bitcast to <float x [2N]>
     // Complex multiply would be more challenging. We could perhaps use a
-    // strided load to get all reals in a vector, all imags in a vector, or use
+    // strided load to get all reals in a vector, all images in a vector, or use
     // CreateShuffleVector on a bitcast to float x [2N].
     *failure_reason = "complex values not supported";
     return nullptr;
@@ -1587,13 +1424,9 @@ IrEmitter::ReductionGenerator IrEmitter::MatchReductionGenerator(
 
 IrEmitter::ShardedVectorType IrEmitter::CreateShardedVectorType(
     PrimitiveType element_type, unsigned element_count) {
-  // Here we assume that the largest register is a vector register.
-  int max_vector_register_size_in_bytes =
-      target_machine_features_.largest_register_size_in_bytes(
-          compute_function_);
-
   int vector_register_size_in_elements =
-      max_vector_register_size_in_bytes /
+      target_machine_features_.vector_register_byte_size(
+          *compute_function_->function()) /
       ShapeUtil::ByteSizeOfPrimitiveType(element_type);
 
   ShardedVectorType sharded_vector_type;
@@ -1646,7 +1479,7 @@ IrEmitter::EmitInnerLoopForVectorizedReduction(
     const ReductionGenerator& reduction_generator,
     const llvm_ir::IrArray::Index& output_index,
     const ShardedVectorType& accumulator_type, HloInstruction* init_value,
-    HloInstruction* arg, tensorflow::gtl::ArraySlice<int64> dimensions,
+    HloInstruction* arg, gtl::ArraySlice<int64> dimensions,
     unsigned element_alignment) {
   ShardedVector accumulator;
   accumulator.reserve(accumulator_type.size());
@@ -1748,23 +1581,14 @@ void IrEmitter::EmitShardedVectorStore(
   }
 }
 
-namespace {
-// TODO(sanjoy): This is duplicated in tensorflow/core/lib/core/arena.cc.
-// Extract out a common implementation to tensorflow/core/lib/math/math_util.h
-uint32 GCD(uint32 x, uint32 y) {
-  while (y != 0) {
-    uint32 r = x % y;
-    x = y;
-    y = r;
-  }
-  return x;
-}
-}  // namespace
-
 StatusOr<bool> IrEmitter::EmitVectorizedReduce(
     HloInstruction* reduce, HloInstruction* arg, HloInstruction* init_value,
-    tensorflow::gtl::ArraySlice<int64> dimensions, HloComputation* function,
+    gtl::ArraySlice<int64> dimensions, HloComputation* function,
     string* failure_reason) {
+  if (!ReductionPreservesLayout(*reduce)) {
+    return false;
+  }
+
   ReductionGenerator reduction_generator =
       MatchReductionGenerator(function, failure_reason);
   if (!reduction_generator) {
@@ -1781,11 +1605,12 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
 
   bool is_reduction_over_minor_dimension =
       std::find(dimensions.begin(), dimensions.end(),
-                arg->shape().layout().minor_to_major(0)) != dimensions.end();
+                LayoutUtil::Minor(arg->shape().layout(), 0)) !=
+      dimensions.end();
 
-  unsigned element_alignment =
-      GCD(ShapeUtil::ByteSizeOfPrimitiveType(reduce->shape().element_type()),
-          MinimumAlignmentForPrimitiveType(reduce->shape().element_type()));
+  unsigned element_alignment = tensorflow::MathUtil::GCD<unsigned>(
+      ShapeUtil::ByteSizeOfPrimitiveType(reduce->shape().element_type()),
+      MinimumAlignmentForPrimitiveType(reduce->shape().element_type()));
 
   if (is_reduction_over_minor_dimension) {
     // TODO(sanjoy): Implement vectorized reduction over the minor dimension.
@@ -1818,8 +1643,9 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
 
   llvm_ir::ForLoopNest loop_nest(IrName(reduce), &ir_builder_);
   llvm_ir::IrArray::Index array_index(reduce->shape().dimensions_size());
-  for (int i = reduce->shape().layout().minor_to_major_size() - 1; i > 0; --i) {
-    int64 dimension = reduce->shape().layout().minor_to_major(i);
+  for (int i = LayoutUtil::MinorToMajor(reduce->shape()).size() - 1; i > 0;
+       --i) {
+    int64 dimension = LayoutUtil::Minor(reduce->shape().layout(), i);
     int64 start_index = 0;
     int64 end_index = reduce->shape().dimensions(dimension);
     std::unique_ptr<llvm_ir::ForLoop> loop =
@@ -1828,7 +1654,7 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
     array_index[dimension] = loop->GetIndVarValue();
   }
 
-  int64 innermost_dimension = reduce->shape().layout().minor_to_major(0);
+  int64 innermost_dimension = LayoutUtil::Minor(reduce->shape().layout(), 0);
   int64 innermost_dimension_size =
       reduce->shape().dimensions(innermost_dimension);
 
@@ -1864,10 +1690,10 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
                            target_array);
 
     if (auto exit_terminator = loop->GetExitBasicBlock()->getTerminator()) {
-      CHECK_GT(reduce->shape().layout().minor_to_major_size(), 1);
+      CHECK_GT(LayoutUtil::MinorToMajor(reduce->shape()).size(), 1);
       ir_builder_.SetInsertPoint(exit_terminator);
     } else {
-      CHECK_EQ(reduce->shape().layout().minor_to_major_size(), 1);
+      CHECK_EQ(LayoutUtil::MinorToMajor(reduce->shape()).size(), 1);
       ir_builder_.SetInsertPoint(loop->GetExitBasicBlock());
     }
   }
@@ -1906,7 +1732,7 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
 Status IrEmitter::HandleReduce(HloInstruction* reduce) {
   auto arg = reduce->mutable_operand(0);
   auto init_value = reduce->mutable_operand(1);
-  tensorflow::gtl::ArraySlice<int64> dimensions(reduce->dimensions());
+  gtl::ArraySlice<int64> dimensions(reduce->dimensions());
   HloComputation* function = reduce->to_apply();
   if (!options::VectorizedReduceDisabled(hlo_module_config_)) {
     string vectorization_failure_reason;
@@ -1983,19 +1809,19 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce) {
 
 Status IrEmitter::HandleSend(HloInstruction* send) {
   // TODO(b/33942983): Support Send/Recv on CPU.
-  return Unimplemented("Send is not implemented on CPU. See b/33942983.");
+  return Unimplemented("Send is not implemented on CPU.");
 }
 
 Status IrEmitter::HandleSendDone(HloInstruction* send_done) {
   // TODO(b/33942983): Support Send/Recv on CPU.
-  return Unimplemented("Send-done is not implemented on CPU. See b/33942983.");
+  return Unimplemented("Send-done is not implemented on CPU.");
 }
 
 Status IrEmitter::HandleSlice(HloInstruction* slice) {
   VLOG(2) << "HandleSlice: " << slice->ToString();
   auto operand = slice->operand(0);
   // The code below emits a sequential loop nest. For the parallel backend, use
-  // EmitParallelTargetElementLoop() which respects dynamic loop bounds.
+  // ParallelLoopEmitter which respects dynamic loop bounds.
   if (ShouldEmitParallelLoopFor(*slice)) {
     return DefaultAction(slice);
   }
@@ -2026,8 +1852,8 @@ Status IrEmitter::HandleSlice(HloInstruction* slice) {
   //
   // * Implement the memcpy within the innermost loop.
 
-  tensorflow::gtl::FlatSet<int64> inner_dims;
-  for (int64 dim : layout.minor_to_major()) {
+  gtl::FlatSet<int64> inner_dims;
+  for (int64 dim : LayoutUtil::MinorToMajor(layout)) {
     if (operand->shape().dimensions(dim) != slice->shape().dimensions(dim)) {
       break;
     }
@@ -2054,7 +1880,7 @@ Status IrEmitter::HandleSlice(HloInstruction* slice) {
 
   // memcpy_dim is the innermost (in terms of layout) dimension for which the
   // slice does *not* just copy all the elements along the dimension.
-  const int64 memcpy_dim = layout.minor_to_major(inner_dims.size());
+  const int64 memcpy_dim = LayoutUtil::Minor(layout, inner_dims.size());
 
   const bool memcpy_is_contiguous = slice->slice_strides(memcpy_dim) == 1;
   // The number of logical elements that can be copied in a single call
@@ -2153,12 +1979,12 @@ Status IrEmitter::HandleDynamicUpdateSlice(
 
 Status IrEmitter::HandleRecv(HloInstruction* recv) {
   // TODO(b/33942983): Support Send/Recv on CPU.
-  return Unimplemented("Recv is not implemented on CPU. See b/33942983.");
+  return Unimplemented("Recv is not implemented on CPU.");
 }
 
 Status IrEmitter::HandleRecvDone(HloInstruction* recv_done) {
   // TODO(b/33942983): Support Send/Recv on CPU.
-  return Unimplemented("Recv-done is not implemented on CPU. See b/33942983.");
+  return Unimplemented("Recv-done is not implemented on CPU.");
 }
 
 Status IrEmitter::HandlePad(HloInstruction* pad) {
@@ -2167,10 +1993,10 @@ Status IrEmitter::HandlePad(HloInstruction* pad) {
   for (auto& padding_dimension : pad->padding_config().dimensions()) {
     if (padding_dimension.edge_padding_low() < 0 ||
         padding_dimension.edge_padding_high() < 0) {
-      return Unimplemented(
-          "Negative padding not supported in the CPU backend (b/34628603); "
-          "this should have been eliminated at the HLO level: %s",
-          pad->padding_config().ShortDebugString().c_str());
+      return InternalErrorStrCat(
+          "Encountered negative padding in IrEmitter on CPU. "
+          "This should have been eliminated at the HLO level. ",
+          pad->ToString());
     }
   }
 
@@ -2263,8 +2089,8 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) {
     TF_RETURN_IF_ERROR(DotOpEmitter::EmitDotOperation(
         *root, root->operand(0)->IsRank2Transpose(),
         root->operand(1)->IsRank2Transpose(), target_array, lhs_array,
-        rhs_array, GetExecutableRunOptionsArgument(), &ir_builder_,
-        hlo_module_config_));
+        rhs_array, /*addend_array=*/nullptr, GetExecutableRunOptionsArgument(),
+        &ir_builder_, hlo_module_config_, target_machine_features_));
     return Status::OK();
   } else if (llvm_ir::CanEmitFusedDynamicUpdateSliceInPlace(fusion,
                                                             assignment_)) {
@@ -2285,6 +2111,35 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) {
     TF_RETURN_IF_ERROR(fusion->fused_expression_root()->Accept(&fused_emitter));
 
     return EmitTargetElementLoop(fusion, fused_emitter.GetRootGenerator());
+  } else if (fusion->fusion_kind() == HloInstruction::FusionKind::kOutput) {
+    VLOG(3) << "HandleFusion kOutput";
+    int64 dot_op_index = root->operand(0)->opcode() == HloOpcode::kDot ? 0 : 1;
+    const HloInstruction* dot = root->operand(dot_op_index);
+    CHECK_EQ(dot->opcode(), HloOpcode::kDot)
+        << dot->ToString() << "  "
+        << fusion->fused_instructions_computation()->ToString();
+
+    int64 dot_lhs_param_number = dot->operand(0)->parameter_number();
+    int64 dot_rhs_param_number = dot->operand(1)->parameter_number();
+    int64 addend_param_number =
+        root->operand(1 - dot_op_index)->parameter_number();
+
+    Shape target_shape = fusion->shape();
+    TF_RETURN_IF_ERROR(EmitTargetAddressForOp(fusion));
+    llvm_ir::IrArray target_array = GetIrArrayFor(fusion);
+
+    llvm_ir::IrArray lhs_array(
+        GetIrArrayFor(fusion->operand(dot_lhs_param_number)));
+    llvm_ir::IrArray rhs_array(
+        GetIrArrayFor(fusion->operand(dot_rhs_param_number)));
+    llvm_ir::IrArray addend_array(
+        GetIrArrayFor(fusion->operand(addend_param_number)));
+
+    TF_RETURN_IF_ERROR(DotOpEmitter::EmitDotOperation(
+        *dot, /*transpose_lhs=*/false, /*transpose_rhs=*/false, target_array,
+        lhs_array, rhs_array, &addend_array, GetExecutableRunOptionsArgument(),
+        &ir_builder_, hlo_module_config_, target_machine_features_));
+    return Status::OK();
   } else {
     return Unimplemented("Fusion kind not implemented on CPU");
   }
@@ -2305,9 +2160,17 @@ Status IrEmitter::HandleCall(HloInstruction* call) {
       !parallel_cpu_backend_) {
     // ParallelTaskAssignment assigned partitions, emit call to
     // ParallelForkJoin.
-    TF_RETURN_IF_ERROR(EmitParallelForkJoin(parameter_addresses,
-                                            emitted_value_[call], computation,
-                                            call_ir_function));
+    std::vector<llvm::Value*> call_args = GetArrayFunctionCallArguments(
+        parameter_addresses, &ir_builder_, computation->name(),
+        /*return_value_buffer=*/emitted_value_[call],
+        /*exec_run_options_arg=*/GetExecutableRunOptionsArgument(),
+        /*temp_buffers_arg=*/GetTempBuffersArgument(),
+        /*profile_counters_arg=*/GetProfileCountersArgument());
+
+    HloInstruction* root = computation->root_instruction();
+    TF_RETURN_IF_ERROR(EmitCallToParallelForkJoin(
+        call_args, root->shape(), root->outer_dimension_partitions(),
+        &ir_builder_, call_ir_function, computation->name()));
   } else {
     EmitArrayFunctionCallInto(call_ir_function, parameter_addresses,
                               emitted_value_[call], computation->name());
@@ -2317,8 +2180,7 @@ Status IrEmitter::HandleCall(HloInstruction* call) {
 }
 
 Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) {
-  tensorflow::gtl::ArraySlice<HloInstruction*> operands(
-      custom_call->operands());
+  gtl::ArraySlice<HloInstruction*> operands(custom_call->operands());
   tensorflow::StringPiece custom_call_target(custom_call->custom_call_target());
   llvm::Type* i8_ptr_type = ir_builder_.getInt8PtrTy();
   llvm::AllocaInst* operands_alloca =
@@ -2410,7 +2272,7 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) {
   // Terminates the current block with a branch to a while header.
   llvm::BasicBlock* header_bb = llvm::BasicBlock::Create(
       module_->getContext(), AsStringRef(IrName(xla_while, "header")),
-      compute_function_);
+      compute_function_->function());
   ir_builder_.CreateBr(header_bb);
   ir_builder_.SetInsertPoint(header_bb);
 
@@ -2427,7 +2289,7 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) {
   // Branches to the body or to the while exit depending on the condition.
   llvm::BasicBlock* body_bb = llvm::BasicBlock::Create(
       module_->getContext(), AsStringRef(IrName(xla_while, "body")),
-      compute_function_);
+      compute_function_->function());
   llvm::BasicBlock* exit_bb = llvm::BasicBlock::Create(
       module_->getContext(), AsStringRef(IrName(xla_while, "exit")));
   ir_builder_.CreateCondBr(while_predicate, body_bb, exit_bb);
@@ -2442,15 +2304,14 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) {
   ir_builder_.CreateBr(header_bb);
 
   // Adds the exit block to the function and sets the insert point there.
-  compute_function_->getBasicBlockList().push_back(exit_bb);
+  compute_function_->function()->getBasicBlockList().push_back(exit_bb);
   ir_builder_.SetInsertPoint(exit_bb);
 
   return Status::OK();
 }
 
 StatusOr<bool> IrEmitter::EmitFastConcatenate(
-    HloInstruction* concatenate,
-    tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+    HloInstruction* concatenate, gtl::ArraySlice<HloInstruction*> operands,
     string* failure_reason) {
   if (ShouldEmitParallelLoopFor(*concatenate)) {
     *failure_reason =
@@ -2478,14 +2339,13 @@ StatusOr<bool> IrEmitter::EmitFastConcatenate(
 
   int64 concat_dim = concatenate->dimensions(0);
   const Layout& output_layout = output_shape.layout();
+  auto output_min2maj = LayoutUtil::MinorToMajor(output_layout);
   auto concat_dim_layout_itr =
-      std::find(output_layout.minor_to_major().begin(),
-                output_layout.minor_to_major().end(), concat_dim);
+      std::find(output_min2maj.begin(), output_min2maj.end(), concat_dim);
 
-  std::vector<int64> inner_dims(output_layout.minor_to_major().begin(),
-                                concat_dim_layout_itr);
+  std::vector<int64> inner_dims(output_min2maj.begin(), concat_dim_layout_itr);
   std::vector<int64> outer_dims(std::next(concat_dim_layout_itr),
-                                output_layout.minor_to_major().end());
+                                output_min2maj.end());
 
   llvm::Type* i8_ptr_type = ir_builder_.getInt8PtrTy();
   llvm::Type* i8_type = ir_builder_.getInt8Ty();
@@ -2560,7 +2420,7 @@ void IrEmitter::EmitTransferElements(llvm::Value* target, llvm::Value* source,
                                      const llvm_ir::IrArray& source_array) {
   unsigned primitive_type_size =
       ShapeUtil::ByteSizeOfPrimitiveType(primitive_type);
-  unsigned element_alignment = GCD(
+  unsigned element_alignment = tensorflow::MathUtil::GCD<unsigned>(
       primitive_type_size, MinimumAlignmentForPrimitiveType(primitive_type));
   llvm::Type* primitive_ptr_type = llvm::PointerType::getUnqual(
       llvm_ir::PrimitiveTypeToIrType(primitive_type, module_));
@@ -2590,8 +2450,7 @@ void IrEmitter::EmitTransferElements(llvm::Value* target, llvm::Value* source,
 }
 
 Status IrEmitter::HandleConcatenate(HloInstruction* concatenate) {
-  tensorflow::gtl::ArraySlice<HloInstruction*> operands(
-      concatenate->operands());
+  gtl::ArraySlice<HloInstruction*> operands(concatenate->operands());
   string failure_reason;
   TF_ASSIGN_OR_RETURN(
       bool successful,
@@ -2607,6 +2466,65 @@ Status IrEmitter::HandleConcatenate(HloInstruction* concatenate) {
   return DefaultAction(concatenate);
 }
 
+Status IrEmitter::HandleConditional(HloInstruction* conditional) {
+  auto pred = conditional->operand(0);
+  auto true_arg = conditional->operand(1);
+  auto false_arg = conditional->operand(2);
+  TF_RET_CHECK(ShapeUtil::IsScalar(pred->shape()) &&
+               pred->shape().element_type() == PRED)
+      << "Predicate on a Conditional must be bool; got: "
+      << ShapeUtil::HumanString(pred->shape());
+
+  HloComputation* true_computation = conditional->true_computation();
+  HloComputation* false_computation = conditional->false_computation();
+  TF_RET_CHECK(ShapeUtil::Equal(conditional->shape(),
+                                true_computation->root_instruction()->shape()))
+      << "Shape of conditional should be same as the shape of the true "
+      << "computation; got: " << ShapeUtil::HumanString(conditional->shape())
+      << " and "
+      << ShapeUtil::HumanString(true_computation->root_instruction()->shape());
+
+  TF_RET_CHECK(ShapeUtil::Equal(conditional->shape(),
+                                false_computation->root_instruction()->shape()))
+      << "Shape of conditional should be same as the shape of the false "
+      << "computation; got: " << ShapeUtil::HumanString(conditional->shape())
+      << " and "
+      << ShapeUtil::HumanString(false_computation->root_instruction()->shape());
+
+  llvm::Function* true_function =
+      FindOrDie(emitted_functions_, true_computation);
+  llvm::Function* false_function =
+      FindOrDie(emitted_functions_, false_computation);
+
+  TF_RETURN_IF_ERROR(EmitTargetAddressForOp(conditional));
+  llvm::Value* conditional_result = GetEmittedValueFor(conditional);
+
+  // Generating:
+  //   if (pred)
+  //     cond_result = true_computation(true_operand)
+  //   else
+  //     cond_result = false_computation(false_operand)
+  llvm::LoadInst* pred_value = ir_builder_.CreateLoad(
+      GetIrArrayFor(pred).GetBasePointer(), "load_predicate_value");
+  llvm::Value* pred_cond = ir_builder_.CreateICmpNE(
+      pred_value,
+      llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0),
+      "boolean_predicate");
+  llvm_ir::LlvmIfData if_data =
+      llvm_ir::EmitIfThenElse(pred_cond, "conditional", &ir_builder_);
+
+  SetToFirstInsertPoint(if_data.true_block, &ir_builder_);
+  EmitArrayFunctionCallInto(true_function, {GetEmittedValueFor(true_arg)},
+                            conditional_result, IrName(conditional, "_true"));
+
+  SetToFirstInsertPoint(if_data.false_block, &ir_builder_);
+  EmitArrayFunctionCallInto(false_function, {GetEmittedValueFor(false_arg)},
+                            conditional_result, IrName(conditional, "_false"));
+
+  SetToFirstInsertPoint(if_data.after_block, &ir_builder_);
+  return Status::OK();
+}
+
 Status IrEmitter::FinishVisit(HloInstruction* root) {
   // When this method is called, we should have already emitted an IR value for
   // the root (return) op. The IR value holds the address of the buffer holding
@@ -2618,57 +2536,51 @@ Status IrEmitter::FinishVisit(HloInstruction* root) {
   llvm::Value* root_value = GetEmittedValueFor(root);
   VLOG(2) << "  value: " << llvm_ir::DumpToString(*root_value);
 
-  llvm::Value* prof_counter = [&]() {
-    // For the parallel cpu backend, we record the total for each embedded
-    // computation callee with its caller kCall HLO.
-    if (parallel_cpu_backend_ && is_top_level_computation_) {
-      auto* computation = root->parent();
-      auto* entry_computation = computation->parent()->entry_computation();
-      if (computation != entry_computation) {
-        for (HloInstruction* instruction : entry_computation->instructions()) {
-          if (instruction->opcode() == HloOpcode::kCall &&
-              instruction->to_apply()->root_instruction() == root) {
-            return GetProfileCounterFor(*instruction);
-          }
+  auto record_complete_computation = [&](llvm::Value* prof_counter) {
+    if (prof_counter) {
+      profiling_state_.RecordCompleteComputation(&ir_builder_, prof_counter);
+    }
+  };
+
+  // For the parallel cpu backend, we record the total for each embedded
+  // computation callee with its caller kCall HLO.
+  if (parallel_cpu_backend_ && is_top_level_computation_) {
+    auto* computation = root->parent();
+    auto* entry_computation = computation->parent()->entry_computation();
+    if (computation != entry_computation) {
+      for (HloInstruction* instruction : entry_computation->instructions()) {
+        if (instruction->opcode() == HloOpcode::kCall &&
+            instruction->to_apply()->root_instruction() == root) {
+          record_complete_computation(GetProfileCounterFor(*instruction));
+          return Status::OK();
         }
       }
     }
-
-    // Otherwise we record the total computation cycles in a dedicated slot for
-    // the entry computation.
-    return GetProfileCounterForEntryComputation();
-  }();
-
-  if (prof_counter) {
-    profiling_state_.RecordCompleteComputation(&ir_builder_, prof_counter);
   }
-  ir_builder_.CreateRetVoid();
+
+  // For the entry computation this increment is cumulative of embedded
+  // computations since it includes cycles spent in computations invoked by
+  // While, Call etc.
+  record_complete_computation(GetProfileCounterFor(*root->parent()));
   return Status::OK();
 }
 
-llvm::Value* IrEmitter::GetProfileCounterFor(const HloInstruction& hlo) {
-  auto it = hlo_to_profile_idx_.find(&hlo);
-  if (it == hlo_to_profile_idx_.end()) {
+template <typename T>
+llvm::Value* IrEmitter::GetProfileCounterCommon(
+    const T& hlo,
+    const std::unordered_map<const T*, int64>& profile_index_map) {
+  auto it = profile_index_map.find(&hlo);
+  if (it == profile_index_map.end()) {
     return nullptr;
   }
 
-  size_t prof_counter_idx = it->second;
+  int64 prof_counter_idx = it->second;
   string counter_name = IrName("prof_counter", hlo.name());
   return ir_builder_.CreateGEP(GetProfileCountersArgument(),
                                ir_builder_.getInt64(prof_counter_idx),
                                AsStringRef(counter_name));
 }
 
-llvm::Value* IrEmitter::GetProfileCounterForEntryComputation() {
-  if (entry_computation_profile_idx_) {
-    return ir_builder_.CreateGEP(
-        GetProfileCountersArgument(),
-        ir_builder_.getInt64(*entry_computation_profile_idx_),
-        "prof_counter.computation");
-  }
-  return nullptr;
-}
-
 void IrEmitter::ProfilingState::UpdateProfileCounter(
     llvm::IRBuilder<>* ir_builder, llvm::Value* prof_counter,
     llvm::Value* cycle_end, llvm::Value* cycle_start) {
@@ -2731,8 +2643,7 @@ void IrEmitter::ProfilingState::RecordCycleDelta(llvm::IRBuilder<>* ir_builder,
 
 void IrEmitter::ProfilingState::RecordCompleteComputation(
     llvm::IRBuilder<>* ir_builder, llvm::Value* prof_counter) {
-  if (is_top_level_computation_ && last_read_cycle_end_ &&
-      first_read_cycle_start_) {
+  if (last_read_cycle_end_ && first_read_cycle_start_) {
     UpdateProfileCounter(ir_builder, prof_counter, last_read_cycle_end_,
                          first_read_cycle_start_);
   }
@@ -2740,7 +2651,7 @@ void IrEmitter::ProfilingState::RecordCompleteComputation(
 
 Status IrEmitter::Preprocess(HloInstruction* hlo) {
   VLOG(3) << "Visiting: " << hlo->ToString();
-  if (hlo_to_profile_idx_.count(hlo)) {
+  if (instruction_to_profile_idx_.count(hlo)) {
     profiling_state_.RecordCycleStart(&ir_builder_, hlo);
   }
   return Status::OK();
@@ -2783,43 +2694,16 @@ llvm::Type* IrEmitter::IrShapeType(const Shape& shape) {
   return llvm_ir::ShapeToIrType(shape, module_);
 }
 
-std::vector<llvm::Type*> IrEmitter::GetComputeFunctionParams() {
-  llvm::Type* i8_ptr_type = llvm::Type::getInt8PtrTy(module_->getContext());
-  llvm::Type* i8_ptr_ptr_type = i8_ptr_type->getPointerTo();
-  llvm::Type* i64_ptr_type = llvm::Type::getInt64PtrTy(module_->getContext());
-  std::vector<llvm::Type*> compute_function_params(
-      {i8_ptr_type, i8_ptr_type, i8_ptr_ptr_type, i8_ptr_ptr_type});
-  if (num_dynamic_loop_bounds_ > 0) {
-    compute_function_params.push_back(i64_ptr_type);
-  }
-  compute_function_params.push_back(i64_ptr_type);
-  return compute_function_params;
-}
-
-llvm::Argument* IrEmitter::GetResultArgument() {
-  return GetArg(compute_function_, 0);
-}
-
-llvm::Argument* IrEmitter::GetProfileCountersArgument() {
-  const int64 arg_index = num_dynamic_loop_bounds_ > 0 ? 5 : 4;
-  return GetArg(compute_function_, arg_index);
+llvm::Value* IrEmitter::GetProfileCountersArgument() {
+  return compute_function_->profile_counters_arg();
 }
 
 llvm::Value* IrEmitter::GetTempBuffersArgument() {
-  return GetArg(compute_function_, 3);
-}
-
-llvm::Value* IrEmitter::GetDynamicLoopBound(const int64 offset) {
-  CHECK_GT(num_dynamic_loop_bounds_, 0);
-  CHECK_LT(offset, num_dynamic_loop_bounds_ * 2);
-  llvm::Argument* loop_bounds_arg = GetArg(compute_function_, 4);
-  string name = tensorflow::strings::StrCat("dynamic_loop_bound_", offset);
-  return ir_builder_.CreateLoad(ir_builder_.CreateGEP(
-      loop_bounds_arg, ir_builder_.getInt64(offset), AsStringRef(name)));
+  return compute_function_->temp_buffers_arg();
 }
 
 llvm::Value* IrEmitter::GetExecutableRunOptionsArgument() {
-  return GetArg(compute_function_, 1);
+  return compute_function_->exec_run_options_arg();
 }
 
 llvm::Value* IrEmitter::EmitTempBufferPointer(
@@ -2850,10 +2734,14 @@ llvm::Value* IrEmitter::EmitTempBufferPointer(
       GetTempBuffersArgument(), slice.index(), &ir_builder_);
   llvm::LoadInst* tempbuf_address_base =
       ir_builder_.CreateLoad(tempbuf_address_ptr);
-  if (hlo_module_config_.debug_options()
+  if (is_top_level_computation_ &&
+      hlo_module_config_.debug_options()
           .xla_llvm_enable_invariant_load_metadata()) {
-    // Loading the address of a buffer is invariant of the point at which the
-    // load is executed in the program because we never reassign buffers.
+    // In the entry computation the parameter slots in the %params argument are
+    // invariant through program execution.  In computations that are called
+    // from the entry computation (via kWhile, kCall and kConditional) the
+    // parameter slots are *not* invariant since they're written to by their
+    // callers.
     tempbuf_address_base->setMetadata(
         llvm::LLVMContext::MD_invariant_load,
         llvm::MDNode::get(tempbuf_address_base->getContext(), /*MDs=*/{}));
@@ -2875,7 +2763,7 @@ llvm::Value* IrEmitter::EmitTempBufferPointer(
 // for a single element_type value, and loads it after call.
 llvm::Value* IrEmitter::EmitElementFunctionCall(
     llvm::Function* function, const Shape& return_shape,
-    tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
+    gtl::ArraySlice<llvm::Value*> parameter_addresses,
     tensorflow::StringPiece name) {
   llvm::Value* return_value_buffer = EmitArrayFunctionCall(
       function, return_shape, 1, parameter_addresses, name);
@@ -2884,42 +2772,6 @@ llvm::Value* IrEmitter::EmitElementFunctionCall(
       AsStringRef(tensorflow::strings::StrCat(name, "_return_value")));
 }
 
-// Emits code to allocate an array of parameter address pointers, and store
-// each address from 'parameter_addresses'.
-// Returns an array of compute function call arguments (including parameter
-// address buffer).
-std::vector<llvm::Value*> IrEmitter::GetArrayFunctionCallArguments(
-    tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
-    llvm::Value* return_value_buffer, tensorflow::StringPiece name) {
-  llvm::Value* parameter_addresses_buffer =
-      llvm_ir::EmitAllocaAtFunctionEntryWithCount(
-          ir_builder_.getInt8PtrTy(),
-          ir_builder_.getInt32(parameter_addresses.size()),
-          tensorflow::strings::StrCat(name, "_parameter_addresses"),
-          &ir_builder_);
-  for (size_t i = 0; i < parameter_addresses.size(); ++i) {
-    llvm::Value* parameter_as_i8ptr = ir_builder_.CreateBitCast(
-        parameter_addresses[i], ir_builder_.getInt8PtrTy(),
-        AsStringRef(tensorflow::strings::StrCat(name, "_parameter_", i,
-                                                "_address_as_i8ptr")));
-    llvm::Value* slot_in_param_adresses = ir_builder_.CreateInBoundsGEP(
-        parameter_addresses_buffer, {ir_builder_.getInt64(i)});
-    ir_builder_.CreateStore(parameter_as_i8ptr, slot_in_param_adresses);
-  }
-
-  const auto to_int8_ptr = [this](llvm::Value* ptr) {
-    return ir_builder_.CreatePointerCast(ptr, ir_builder_.getInt8PtrTy());
-  };
-  std::vector<llvm::Value*> arguments{
-      to_int8_ptr(return_value_buffer),
-      to_int8_ptr(GetExecutableRunOptionsArgument()),
-      parameter_addresses_buffer, GetTempBuffersArgument()};
-  if (auto* profile_counters = GetProfileCountersArgument()) {
-    arguments.push_back(profile_counters);
-  }
-  return arguments;
-}
-
 // Emits a core function call based on the following pseudo-code.
 //
 //   char** parameter_addresses_buffer =
@@ -2931,17 +2783,20 @@ std::vector<llvm::Value*> IrEmitter::GetArrayFunctionCallArguments(
 //                 temps)
 //   return return_value_buffer  -- address of the return value.
 void IrEmitter::EmitArrayFunctionCallInto(
-    llvm::Function* function,
-    tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
+    llvm::Function* function, gtl::ArraySlice<llvm::Value*> parameter_addresses,
     llvm::Value* return_value_buffer, tensorflow::StringPiece name) {
   ir_builder_.CreateCall(
-      function, GetArrayFunctionCallArguments(parameter_addresses,
-                                              return_value_buffer, name));
+      function, GetArrayFunctionCallArguments(
+                    parameter_addresses, &ir_builder_, name,
+                    /*return_value_buffer=*/return_value_buffer,
+                    /*exec_run_options_arg=*/GetExecutableRunOptionsArgument(),
+                    /*temp_buffers_arg=*/GetTempBuffersArgument(),
+                    /*profile_counters_arg=*/GetProfileCountersArgument()));
 }
 
 llvm::Value* IrEmitter::EmitArrayFunctionCall(
     llvm::Function* function, const Shape& return_shape, int64 element_count,
-    tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
+    gtl::ArraySlice<llvm::Value*> parameter_addresses,
     tensorflow::StringPiece name) {
   llvm::Value* elements =
       llvm::ConstantInt::get(ir_builder_.getInt64Ty(), element_count);
@@ -2956,117 +2811,13 @@ llvm::Value* IrEmitter::EmitArrayFunctionCall(
   return return_value_buffer;
 }
 
-// Emits a call to a runtime fork/join function which dispatches parallel
-// calls to 'parallel_function' (and joins threads before returning).
-Status IrEmitter::EmitParallelForkJoin(
-    tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
-    llvm::Value* output_address, HloComputation* computation,
-    llvm::Function* parallel_function) {
-  HloInstruction* root = computation->root_instruction();
-
-  // Build ParallelForkJoin function type.
-  std::vector<llvm::Type*> compute_function_params = GetComputeFunctionParams();
-  // Number of parallel compute functions.
-  compute_function_params.push_back(ir_builder_.getInt32Ty());
-  // Array of partitions. There is an array element for each
-  // partition x partition_dim x 2 (for dimension start and limit).
-  compute_function_params.push_back(
-      llvm::Type::getInt64PtrTy(module_->getContext()));
-  // Number of partitioned most-major dimensions in 'root.shape'.
-  compute_function_params.push_back(ir_builder_.getInt32Ty());
-  // Function pointer for compute function to be dispatched in parallel.
-  compute_function_params.push_back(
-      llvm::Type::getInt8PtrTy(module_->getContext()));
-
-  llvm::FunctionType* fork_join_type = llvm::FunctionType::get(
-      /*Result=*/llvm::Type::getVoidTy(module_->getContext()),
-      /*Params=*/compute_function_params,
-      /*isVarArg=*/false);
-
-  llvm::Function* fork_join_func =
-      llvm::cast<llvm::Function>(module_->getOrInsertFunction(
-          runtime::kParallelForkJoinSymbolName, fork_join_type));
-  fork_join_func->setCallingConv(llvm::CallingConv::C);
-  fork_join_func->setDoesNotThrow();
-
-  // Add common compute function arguments.
-  const string name = computation->name();
-  std::vector<llvm::Value*> arguments =
-      GetArrayFunctionCallArguments(parameter_addresses, output_address, name);
-
-  // Create ShapePartitionIterator to generate all partitions of 'root.shape'.
-  ShapePartitionIterator partition_iterator(root->shape(),
-                                            root->outer_dimension_partitions());
-  const int64 num_partitions = partition_iterator.GetTotalPartitionCount();
-  // Add argument specifying the number of parallel partitions.
-  arguments.push_back(ir_builder_.getInt32(num_partitions));
-
-  // The number of partitioned most-major dimensions in 'root.shape'.
-  const int32 num_partitioned_dims = root->outer_dimension_partitions().size();
-  // A dimension partition consists of two elements: [start_index, limit_index).
-  const int32 dim_partition_size = 2;
-  // Calculate array partition stride.
-  const int32 array_partition_stride =
-      num_partitioned_dims * dim_partition_size;
-  // Calculate the total number of elements in the partition array.
-  const int32 partition_array_size =
-      dim_partition_size * num_partitioned_dims * num_partitions;
-
-  // Store dimension partition values as llvm constants in 'partitions'.
-  // See comments in runtime_fork_join.cc for array layout description.
-  std::vector<llvm::Constant*> partitions(partition_array_size);
-  for (int32 i = 0; i < num_partitions; ++i) {
-    std::vector<std::pair<int64, int64>> dim_partitions =
-        partition_iterator.GetPartition(i);
-    CHECK_EQ(num_partitioned_dims, dim_partitions.size());
-    const int32 partition_index = i * array_partition_stride;
-    for (int32 j = 0; j < num_partitioned_dims; ++j) {
-      const std::pair<int64, int64>& dim_partition = dim_partitions[j];
-      const int32 index = partition_index + j * dim_partition_size;
-      // Store partition [dim_start, dim_limit) intervals for each dimension.
-      partitions[index] = ir_builder_.getInt64(dim_partition.first);
-      partitions[index + 1] =
-          ir_builder_.getInt64(dim_partition.first + dim_partition.second);
-    }
-  }
-
-  // Create global variable out of dimension partitions in 'partitions'.
-  llvm::ArrayType* partitions_array_type =
-      llvm::ArrayType::get(ir_builder_.getInt64Ty(), partition_array_size);
-  llvm::Constant* partitions_array =
-      llvm::ConstantArray::get(partitions_array_type, partitions);
-  llvm::GlobalVariable* global_partitions_array = new llvm::GlobalVariable(
-      /*Module=*/*module_,
-      /*Type=*/partitions_array_type,
-      /*isConstant=*/true,
-      /*Linkage=*/llvm::GlobalValue::PrivateLinkage,
-      /*Initializer=*/partitions_array,
-      /*Name=*/
-      AsStringRef(
-          tensorflow::strings::StrCat(name, "_parallel_dimension_partitions")));
-
-  // Add argument specifying parallel dimension partitions.
-  arguments.push_back(ir_builder_.CreateBitCast(
-      global_partitions_array,
-      llvm::Type::getInt64PtrTy(module_->getContext())));
-  // Add argument specifying the number of partitioned most-major dimensions.
-  arguments.push_back(ir_builder_.getInt32(num_partitioned_dims));
-  // Add argument for parallel compute function pointer.
-  arguments.push_back(
-      ir_builder_.CreateBitCast(parallel_function, ir_builder_.getInt8PtrTy()));
-  // Emit call to parallel fork/join.
-  ir_builder_.CreateCall(fork_join_func, arguments);
-
-  return Status::OK();
-}
-
 Status IrEmitter::EmitTargetAddressForOp(const HloInstruction* op) {
   llvm::Value* addr;
   const Shape& target_shape = op->shape();
   if (op == op->parent()->root_instruction()) {
     // For the root node, we write directly to the output buffer of the
     // function.
-    llvm::Argument* retval = GetResultArgument();
+    llvm::Argument* retval = compute_function_->result_arg();
     if (!ShapeUtil::IsNil(target_shape)) {
       llvm::AttrBuilder attr_builder;
       attr_builder.addAlignmentAttr(MinimumAlignmentForShape(target_shape));
@@ -3127,8 +2878,13 @@ Status IrEmitter::EmitTargetElementLoop(
 
   } else {
     if (ShouldEmitParallelLoopFor(*target_op)) {
-      TF_RETURN_IF_ERROR(EmitParallelTargetElementLoop(
-          target_shape, element_generator, IrName(target_op), &target_array));
+      // Emit code to read dynamic loop bounds from compute function argument.
+      std::vector<std::pair<llvm::Value*, llvm::Value*>> dynamic_loop_bounds =
+          compute_function_->GetDynamicLoopBounds();
+      // Emit parallel loop with dynamic loop bounds for most-major dimensions.
+      TF_RETURN_IF_ERROR(ParallelLoopEmitter(element_generator, target_array,
+                                             &dynamic_loop_bounds, &ir_builder_)
+                             .EmitLoop(IrName(target_op)));
     } else {
       TF_RETURN_IF_ERROR(
           llvm_ir::LoopEmitter(element_generator, target_array, &ir_builder_)
@@ -3138,60 +2894,6 @@ Status IrEmitter::EmitTargetElementLoop(
   return Status::OK();
 }
 
-Status IrEmitter::EmitParallelTargetElementLoop(
-    const Shape& target_shape,
-    const llvm_ir::ElementGenerator& element_generator,
-    tensorflow::StringPiece loop_name, llvm_ir::IrArray* target_array) {
-  CHECK(!ShapeUtil::IsTuple(target_shape));
-  CHECK(!ShapeUtil::IsScalar(target_shape));
-
-  // Emit code to read dynamic loop bounds from function argument 4.
-  std::vector<llvm::Value*> dynamic_loop_bounds(2 * num_dynamic_loop_bounds_);
-  for (int i = 0; i < 2 * num_dynamic_loop_bounds_; ++i) {
-    dynamic_loop_bounds[i] = GetDynamicLoopBound(i);
-  }
-
-  llvm_ir::ForLoopNest loop_nest(loop_name, &ir_builder_);
-  const int64 num_dims = target_shape.dimensions_size();
-  llvm_ir::IrArray::Index array_index(num_dims);
-
-  // Add loops from outer-most to inner-most dimensions.
-  for (int i = target_shape.layout().minor_to_major_size() - 1; i >= 0; --i) {
-    const int64 dimension = target_shape.layout().minor_to_major(i);
-    const int bounds_index = num_dims - 1 - i;
-    if (bounds_index < num_dynamic_loop_bounds_) {
-      // Emit dynamic loop bounds for this dimension. Dynamic loop bounds
-      // are read from ir function dynamic loop bounds argument.
-      llvm::Value* start_index = dynamic_loop_bounds[bounds_index * 2 + 0];
-      llvm::Value* end_index = dynamic_loop_bounds[bounds_index * 2 + 1];
-
-      std::unique_ptr<llvm_ir::ForLoop> loop = loop_nest.AddLoop(
-          /*suffix=*/tensorflow::strings::Printf("dim.%lld", dimension),
-          start_index, end_index);
-      array_index[dimension] = loop->GetIndVarValue();
-    } else {
-      // Emit static loop bounds for this dimension.
-      std::unique_ptr<llvm_ir::ForLoop> loop = loop_nest.AddLoop(
-          /*start_index=*/0,
-          /*end_index=*/target_shape.dimensions(dimension),
-          /*suffix=*/tensorflow::strings::Printf("dim.%lld", dimension));
-      array_index[dimension] = loop->GetIndVarValue();
-    }
-  }
-  // Point IR builder at inner loop BB.
-  SetToFirstInsertPoint(loop_nest.GetInnerLoopBodyBasicBlock(), &ir_builder_);
-
-  // Emit loop body.
-  TF_ASSIGN_OR_RETURN(llvm::Value * target_element,
-                      element_generator(array_index));
-  target_array->EmitWriteArrayElement(array_index, target_element,
-                                      &ir_builder_);
-  // Point IR builder at outer loop exit BB.
-  SetToFirstInsertPoint(loop_nest.GetOuterLoopExitBasicBlock(), &ir_builder_);
-
-  return Status::OK();
-}
-
 Status IrEmitter::EmitMemcpy(const HloInstruction& source,
                              const HloInstruction& destination) {
   llvm::Value* source_value = GetEmittedValueFor(&source);
@@ -3204,8 +2906,8 @@ Status IrEmitter::EmitMemcpy(const HloInstruction& source,
 
 Status IrEmitter::ElementTypesSameAndSupported(
     const HloInstruction& instruction,
-    tensorflow::gtl::ArraySlice<const HloInstruction*> operands,
-    tensorflow::gtl::ArraySlice<PrimitiveType> supported_types) {
+    gtl::ArraySlice<const HloInstruction*> operands,
+    gtl::ArraySlice<PrimitiveType> supported_types) {
   for (auto operand : operands) {
     TF_RET_CHECK(
         ShapeUtil::SameElementType(operands[0]->shape(), operand->shape()));
@@ -3249,37 +2951,5 @@ StatusOr<llvm::Value*> IrEmitter::EmitScalarCall(
                                  ShapeUtil::MakeShape(return_type, {}),
                                  argument_addrs, name);
 }
-
-unsigned TargetMachineFeatures::largest_register_size_in_bytes(
-    llvm::Function* function) {
-  auto itr = largest_register_size_in_bytes_.find(function);
-  if (itr != largest_register_size_in_bytes_.end()) {
-    return itr->second;
-  }
-
-  int result = largest_register_size_in_bytes_impl(function);
-
-  InsertOrDie(&largest_register_size_in_bytes_, function, result);
-  DCHECK_EQ(result, largest_register_size_in_bytes_.begin()->second);
-  return result;
-}
-
-unsigned TargetMachineFeatures::largest_register_size_in_bytes_impl(
-    llvm::Function* function) const {
-  auto register_info =
-      target_machine_->getSubtargetImpl(*function)->getRegisterInfo();
-
-  unsigned largest_register_size = 0;
-  for (const llvm::TargetRegisterClass* register_class :
-       register_info->regclasses()) {
-    if (register_class->isAllocatable()) {
-      largest_register_size =
-          std::max(largest_register_size,
-                   register_info->getRegSizeInBits(*register_class));
-    }
-  }
-
-  return largest_register_size / 8;
-}
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index 351c95278c17f536e56d9f085b938a9baea9cde1..509440251497cd7337284c39dae05c5f6c28e7c2 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <stddef.h>
 #include <map>
+#include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -30,6 +31,8 @@ limitations under the License.
 #include "llvm/Target/TargetMachine.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/cpu/external_constant_pool.h"
+#include "tensorflow/compiler/xla/service/cpu/ir_function.h"
+#include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -49,49 +52,6 @@ limitations under the License.
 
 namespace xla {
 namespace cpu {
-
-// Wraps an llvm::TargetMachine and parses out some information that feeds into
-// code LLVM IR generation decisions.
-//
-// Ideally we'd be able to use llvm::TargetTransformInfo here (since its
-// interface is pretty much a perfect fit for our use case), but obtaining an
-// instance of llvm::TargetTransformInfo outside an LLVM pass pipeline without
-// super-ugly hacks is difficult.
-//
-// TODO(b/66049221): See if the LLVM community will be receptive to exposing an
-// API that lets us directly create and use llvm::TargetTransformInfo instances
-// outside of a pass manager.
-class TargetMachineFeatures {
- public:
-  TargetMachineFeatures(llvm::TargetMachine* target_machine)
-      : target_machine_(target_machine) {}
-
-  // Return the vectorization factor, which is the number of bytes of data
-  // explicitly vectorized routines will try to process at once.
-  int vectorization_factor_in_bytes() const {
-    // Ideally this should be a function of the cache line size (which we can
-    // get from llvm::TargetTransformInfo::getCacheLineSize) of the target
-    // machine.  Guess a value of 128 bytes for now.
-    return 128;
-  }
-
-  // Return the size of the largest register size in bytes.  We need to pass in
-  // "function" since llvm functions can contain annotations for specializing
-  // them to specific micro-architectures (though currently XLA does not use
-  // this functionality).
-  //
-  // Ideally we should have been able to use
-  // llvm::TargetTransformInfo::getRegisterBitWidth(true) here.
-  unsigned largest_register_size_in_bytes(llvm::Function* function);
-
- private:
-  unsigned largest_register_size_in_bytes_impl(llvm::Function* function) const;
-
-  tensorflow::gtl::FlatMap<llvm::Function*, int>
-      largest_register_size_in_bytes_;
-  llvm::TargetMachine* target_machine_;
-};
-
 // This class is the top-level API for the XLA HLO --> LLVM IR compiler.  It
 // implements the DfsHloVisitor interface and emits HLO computations as LLVM IR
 // functions.
@@ -103,20 +63,21 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   // assignment: a BufferAssignment from which we know which temporary buffers
   //             are used by the HLO nodes.
   // llvm_module: the LLVM module to emit IR into.
-  // hlo_to_profile_idx: the mapping from HLO to its index in the profiling
-  //                     array.
-  // entry_computation_profile_idx: the index in the profiling array
-  //                                for the entry computation.
+  // instruction_to_profile_idx: the mapping from HLO instructions to their
+  //              index in the profiling array.
+  // computation_to_profile_idx: the mapping from HLO computations to their
+  //              index in the profiling array.
   // external_constant_pool: if non-null, points to an ExternalConstantPool
   //                         instance into which the Ir emitter can spill
   //                         constants.
-  IrEmitter(
-      const HloModule& hlo_module, const BufferAssignment& assignment,
-      llvm::Module* llvm_module,
-      std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx,
-      tensorflow::gtl::optional<size_t> entry_computation_profile_idx,
-      llvm::TargetMachine* target_machine,
-      ExternalConstantPool* external_constant_pool);
+  IrEmitter(const HloModule& hlo_module, const BufferAssignment& assignment,
+            llvm::Module* llvm_module,
+            std::unordered_map<const HloInstruction*, int64>
+                instruction_to_profile_idx,
+            std::unordered_map<const HloComputation*, int64>
+                computation_to_profile_idx,
+            llvm::TargetMachine* target_machine,
+            ExternalConstantPool* external_constant_pool);
   ~IrEmitter() override;
 
   // Emit and return the given HLO computation as an LLVM IR
@@ -163,8 +124,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   Status HandleSelect(HloInstruction* select) override;
   Status HandleDot(HloInstruction* dot) override;
   Status HandleConvolution(HloInstruction* convolution) override;
-  Status HandleBatchNormTraining(HloInstruction* batch_norm_training) override;
-  Status HandleBatchNormGrad(HloInstruction* batch_norm_grad) override;
+  Status HandleFft(HloInstruction* fft) override;
   Status HandleCrossReplicaSum(HloInstruction* crs) override;
   Status HandleInfeed(HloInstruction* infeed) override;
   Status HandleOutfeed(HloInstruction* outfeed) override;
@@ -189,6 +149,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   Status HandleCustomCall(HloInstruction* custom_call) override;
   Status HandleWhile(HloInstruction* xla_while) override;
   Status HandleConcatenate(HloInstruction* concatenate) override;
+  Status HandleConditional(HloInstruction* conditional) override;
   Status FinishVisit(HloInstruction* root) override;
 
   Status Preprocess(HloInstruction* hlo) override;
@@ -198,14 +159,23 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   // Private helper to initialize an IR function for the computation.
   void InitializeIrFunction(const string& function_name);
 
-  // Convenience function to generate a GEP into the profile counter parameter
-  // which would correspond to the index for a given HLO.
-  llvm::Value* GetProfileCounterFor(const HloInstruction& hlo);
+  template <typename T>
+  llvm::Value* GetProfileCounterCommon(
+      const T& hlo,
+      const std::unordered_map<const T*, int64>& profile_index_map);
+
+  // Convenience functions to generate a GEP into the profile counter parameter
+  // which would correspond to the index for a given HLO instruction or
+  // computation.
+  llvm::Value* GetProfileCounterFor(const HloInstruction& instruction) {
+    return GetProfileCounterCommon<HloInstruction>(instruction,
+                                                   instruction_to_profile_idx_);
+  }
 
-  // Convenience function to generate a GEP into the profile counter parameter
-  // corresponding to the index for the entry computation.  Returns nullptr if
-  // profiling the entry computation is disabled.
-  llvm::Value* GetProfileCounterForEntryComputation();
+  llvm::Value* GetProfileCounterFor(const HloComputation& computation) {
+    return GetProfileCounterCommon<HloComputation>(computation,
+                                                   computation_to_profile_idx_);
+  }
 
   // Gets the IR Value emitted previously for the given hlo.
   //
@@ -233,16 +203,9 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   // Convenience function to get the IR type matching the given shape.
   llvm::Type* IrShapeType(const Shape& shape);
 
-  // Returns an array of compute function parameter types.
-  std::vector<llvm::Type*> GetComputeFunctionParams();
-
-  // Get the llvm::Value* that represents the "retval" argument of the
-  // computation function being emitted by this emitter.
-  llvm::Argument* GetResultArgument();
-
   // Get the llvm::Value* that represents the "prof_counters" argument of the
   // computation function being emitted by this emitter.
-  llvm::Argument* GetProfileCountersArgument();
+  llvm::Value* GetProfileCountersArgument();
 
   // Get the xla::ExecutableRunOptions that represents the "run_options"
   // argument of the computation function being emitted by this emitter.
@@ -252,11 +215,6 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   // computation function being emitted by this emitter.
   llvm::Value* GetTempBuffersArgument();
 
-  // Emit ir to read and return the ir value for the dynamic loop bound at
-  // 'offset' from the "dynamic_loop_bounds" argument of the computation
-  // function being emitted by this emitter.
-  llvm::Value* GetDynamicLoopBound(const int64 offset);
-
   // Emits code that computes the address of the given temporary buffer to the
   // function. target_shape is the shape of this temporary buffer.
   // The returned Value's type is a pointer to element_type.
@@ -310,18 +268,6 @@ class IrEmitter : public DfsHloVisitorWithDefault {
       tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
       tensorflow::StringPiece name);
 
-  // Returns an array of compute function call arguments.
-  std::vector<llvm::Value*> GetArrayFunctionCallArguments(
-      tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
-      llvm::Value* return_value_buffer, tensorflow::StringPiece name);
-
-  // Emits a call to a runtime fork/join function which dispatches parallel
-  // calls to 'parallel_function' (and joins threads before returning).
-  Status EmitParallelForkJoin(
-      tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
-      llvm::Value* output_address, HloComputation* computation,
-      llvm::Function* parallel_function);
-
   // Verifies that the element types of all of the given operand instructions
   // match and are of one of the given supported types.
   Status ElementTypesSameAndSupported(
@@ -346,15 +292,6 @@ class IrEmitter : public DfsHloVisitorWithDefault {
       HloInstruction* target_op, tensorflow::StringPiece desc,
       const llvm_ir::ElementGenerator& element_generator);
 
-  // Emit IR to perform a computation for every element in a partition/slice of
-  // 'target_shape'. The loop bounds for the outer-dimension partitions are
-  // passed into the compute function as a runtime argument (accessible from
-  // GetDynamicLoopBound).
-  Status EmitParallelTargetElementLoop(
-      const Shape& target_shape,
-      const llvm_ir::ElementGenerator& element_generator,
-      tensorflow::StringPiece loop_name, llvm_ir::IrArray* target_array);
-
   // Emits a memcpy from the source instruction's result value to the
   // destination's.  Both source and destination must have an entry in the
   // emitted_value_ table.
@@ -476,13 +413,19 @@ class IrEmitter : public DfsHloVisitorWithDefault {
       thread_local_buffers_;
 
   // The following fields track the IR emission state. According to LLVM memory
-  // management rules, their memory is owned by the module.
-  llvm::Function* compute_function_;
+  // management rules, their memory is owned by the module (Note that IrFunction
+  // creates the encapsulated llvm::Function s.t. it is added to the llvm
+  // module's function list).
+  std::unique_ptr<IrFunction> compute_function_;
   llvm::IRBuilder<> ir_builder_;
 
-  // Maps HLOs to their index into the profile counter array.
-  std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx_;
-  const tensorflow::gtl::optional<size_t> entry_computation_profile_idx_;
+  // Maps HLO instructions to their index into the profile counter array.
+  const std::unordered_map<const HloInstruction*, int64>
+      instruction_to_profile_idx_;
+
+  // Maps HLO computations to their index into the profile counter array.
+  const std::unordered_map<const HloComputation*, int64>
+      computation_to_profile_idx_;
 
   // Maps HLOs to Values emitted for them.
   std::unordered_map<const HloInstruction*, llvm::Value*> emitted_value_;
@@ -490,7 +433,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   llvm_ir::AliasAnalysis alias_analysis_;
 
   // The number of root instruction outer dimensions used in parallel loop
-  // emission (EmitParallelTargetElementLoop).
+  // emission (ParallelLoopEmitter).
   int64 num_dynamic_loop_bounds_ = 0;
 
   // Returns whether the given instruction should be emitted as a parallel loop.
@@ -505,15 +448,9 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   // profiling a computation.
   class ProfilingState {
    public:
-    ProfilingState()
-        : is_top_level_computation_(false),
-          use_rdtscp_(false),
-          prof_counters_(nullptr) {}
-    ProfilingState(bool is_top_level_computation, bool use_rdtscp,
-                   llvm::Argument* prof_counters)
-        : is_top_level_computation_(is_top_level_computation),
-          use_rdtscp_(use_rdtscp),
-          prof_counters_(prof_counters) {}
+    ProfilingState() : use_rdtscp_(false), prof_counters_(nullptr) {}
+    ProfilingState(bool use_rdtscp, llvm::Value* prof_counters)
+        : use_rdtscp_(use_rdtscp), prof_counters_(prof_counters) {}
 
     // Record the cycle counter before an HLO executes.
     void RecordCycleStart(llvm::IRBuilder<>* ir_builder, HloInstruction* hlo);
@@ -535,15 +472,12 @@ class IrEmitter : public DfsHloVisitorWithDefault {
                               llvm::Value* cycle_start);
 
    private:
-    // Is this IrEmitter for a top-level computation?
-    bool is_top_level_computation_;
-
     // Should we use the x86-specific rdtscp or the generic readcyclecounter
     // intrinsic?
     bool use_rdtscp_;
 
     // The argument which corresponds to the profile counter buffer.
-    llvm::Argument* prof_counters_;
+    llvm::Value* prof_counters_;
 
     // The first read cycle counter in the program.
     llvm::Value* first_read_cycle_start_ = nullptr;
diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.cc b/tensorflow/compiler/xla/service/cpu/ir_function.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2d6f2f3818a7bd4424aaa7d918ca86abef15c0e9
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/ir_function.cc
@@ -0,0 +1,333 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <iterator>
+
+#include "tensorflow/compiler/xla/service/cpu/ir_function.h"
+
+#include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
+#include "tensorflow/compiler/xla/service/cpu/shape_partition.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+
+namespace xla {
+
+namespace {
+using llvm_ir::AsStringRef;
+}  // namespace
+
+namespace cpu {
+
+static std::vector<llvm::Type*> GetComputeFunctionParams(
+    llvm::Module* llvm_module, const int64 num_dynamic_loop_bounds) {
+  llvm::Type* i8_ptr_type = llvm::Type::getInt8PtrTy(llvm_module->getContext());
+  llvm::Type* i8_ptr_ptr_type = i8_ptr_type->getPointerTo();
+  llvm::Type* i64_ptr_type =
+      llvm::Type::getInt64PtrTy(llvm_module->getContext());
+  std::vector<llvm::Type*> compute_function_params(
+      {i8_ptr_type, i8_ptr_type, i8_ptr_ptr_type, i8_ptr_ptr_type});
+  if (num_dynamic_loop_bounds > 0) {
+    compute_function_params.push_back(i64_ptr_type);
+  }
+  compute_function_params.push_back(i64_ptr_type);
+  return compute_function_params;
+}
+
+IrFunction::IrFunction(const string& function_name,
+                       llvm::Function::LinkageTypes linkage,
+                       const bool optimize_for_size_requested,
+                       const bool enable_fast_math, llvm::Module* llvm_module,
+                       llvm::IRBuilder<>* ir_builder,
+                       int64 num_dynamic_loop_bounds)
+    : ir_builder_(ir_builder),
+      llvm_module_(llvm_module),
+      caller_insert_point_guard_(*ir_builder),
+      num_dynamic_loop_bounds_(num_dynamic_loop_bounds) {
+  Initialize(function_name, linkage, optimize_for_size_requested,
+             enable_fast_math);
+}
+
+IrFunction::~IrFunction() {
+  // Emit function return value.
+  ir_builder_->CreateRetVoid();
+}
+
+DynamicLoopBounds IrFunction::GetDynamicLoopBounds() {
+  DynamicLoopBounds dynamic_loop_bounds(num_dynamic_loop_bounds_);
+  for (int i = 0; i < num_dynamic_loop_bounds_; ++i) {
+    dynamic_loop_bounds[i].first = GetDynamicLoopBound(i * 2 + 0);
+    dynamic_loop_bounds[i].second = GetDynamicLoopBound(i * 2 + 1);
+  }
+  return dynamic_loop_bounds;
+}
+
+void IrFunction::Initialize(const string& function_name,
+                            llvm::Function::LinkageTypes linkage,
+                            const bool optimize_for_size_requested,
+                            const bool enable_fast_math) {
+  // The function signature is:
+  //   void function(i8* retval, i8* run_options, i8** params, i8** temps,
+  //                 i64* dynamic_loop_bounds, i64* prof_counters)
+  //
+  // retval: points to the returned value.
+  // params: address of an array with pointers to parameters.
+  // temps: address of an array with pointers to temporary buffers.
+  //
+  // Therefore, the generated function's signature (FunctionType) is statically
+  // determined - parameter unpacking is done in code generated into the
+  // function, rather than by a prologue dictated by the platform ABI.
+  //
+  //                      /--------------\
+  //   retval ----------> | return value |
+  //                      \--------------/
+  //
+  //                      /-------------------------------\
+  //   run_options -----> | xla::ExecutableRunOptions |
+  //                      \-------------------------------/
+  //
+  //                     /---------------------------------------------\
+  //   params -------->  |  param 0  |  param 1  | ..... |  param N-1  |
+  //                     |   addr    |   addr    |       |   addr      |
+  //                     \---------------------------------------------/
+  //                          |           |                   |
+  //                          |           |                   |
+  //                          V           V                   V
+  //                     /---------\  /---------\         /-----------\
+  //                     | param 0 |  | param 1 |         | param N-1 |
+  //                     \---------/  \---------/         \-----------/
+  //
+  //                     /---------------------------------------------\
+  //   temps --------->  |  temp  0  |  temp  1  | ..... |  temp  N-1  |
+  //                     |   addr    |   addr    |       |   addr      |
+  //                     \---------------------------------------------/
+  //                          |           |                   |
+  //                          |           |                   |
+  //                          V           V                   V
+  //                     /---------\  /---------\         /-----------\
+  //                     | temp  0 |  | temp  1 |         | temp  N-1 |
+  //                     \---------/  \---------/         \-----------/
+  //
+  //                        /--------------------------------------------\
+  // dynamic loop bounds -> | outer_dim0_start | outer_dim0_limit | .....|
+  //  (elided for aot)      \--------------------------------------------/
+  //
+  //                     /---------------------------------------------\
+  //   prof counters ->  | counter 0 | counter 1 | ..... | counter N-1 |
+  //                     \---------------------------------------------/
+
+  // Even though the type of params and temps is void** in the host's view, in
+  // LLVM IR this is represented by i8*, similarly to void*. It's up to the code
+  // to use GEPs to unravel the indirection layers.
+  llvm::FunctionType* function_type = llvm::FunctionType::get(
+      /*Result=*/llvm::Type::getVoidTy(llvm_module_->getContext()),
+      /*Params=*/
+      GetComputeFunctionParams(llvm_module_, num_dynamic_loop_bounds_),
+      /*isVarArg=*/false);
+
+  // Functions with local linkage get an inlining bonus.  Because we know
+  // a-priori that embedded functions (non-entry functions) will not have its
+  // name resolved, give it local linkage.
+  function_ =
+      llvm_ir::CreateFunction(function_type, linkage,
+                              /*enable_fast_math=*/enable_fast_math,
+                              /*optimize_for_size=*/optimize_for_size_requested,
+                              function_name, llvm_module_);
+
+  // Set meaningful names for the function's arguments: useful for debugging.
+  llvm::Function::arg_iterator arg_iter = function_->arg_begin();
+  arg_iter->setName("retval");
+  result_arg_ = &*arg_iter;
+  (++arg_iter)->setName("run_options");
+  exec_run_options_arg_ = &*arg_iter;
+  (++arg_iter)->setName("params");
+  parameters_arg_ = &*arg_iter;
+  (++arg_iter)->setName("temps");
+  temp_buffers_arg_ = &*arg_iter;
+  if (num_dynamic_loop_bounds_ > 0) {
+    (++arg_iter)->setName("dynamic_loop_bounds");
+    dynamic_loop_bounds_arg_ = &*arg_iter;
+  }
+  (++arg_iter)->setName("prof_counters");
+  profile_counters_arg_ = &*arg_iter;
+
+  // We know a-priori that the function arguments are guaranteed to point to
+  // disjoint objects.
+  llvm::Argument* retval = result_arg();
+  for (llvm::Argument& argument : function_->args()) {
+    // However, the return buffer aliases the temporaries and thus cannot be
+    // marked noalias.
+    if (&argument == retval) {
+      continue;
+    }
+    function_->addAttribute(argument.getArgNo() + 1, llvm::Attribute::NoAlias);
+  }
+
+  ir_builder_->SetInsertPoint(llvm::BasicBlock::Create(
+      /*Context=*/llvm_module_->getContext(),
+      /*Name=*/"entry",
+      /*Parent=*/function_));
+}
+
+llvm::Value* IrFunction::GetDynamicLoopBound(const int64 offset) {
+  CHECK_GT(num_dynamic_loop_bounds_, 0);
+  CHECK_LT(offset, num_dynamic_loop_bounds_ * 2);
+  string name = tensorflow::strings::StrCat("dynamic_loop_bound_", offset);
+  return ir_builder_->CreateLoad(
+      ir_builder_->CreateGEP(CHECK_NOTNULL(dynamic_loop_bounds_arg_),
+                             ir_builder_->getInt64(offset), AsStringRef(name)));
+}
+
+// Emits code to allocate an array of parameter address pointers, and store
+// each address from 'parameter_addresses'.
+// Returns an array of compute function call arguments (including parameter
+// address buffer).
+std::vector<llvm::Value*> GetArrayFunctionCallArguments(
+    tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
+    llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece name,
+    llvm::Value* return_value_buffer, llvm::Value* exec_run_options_arg,
+    llvm::Value* temp_buffers_arg, llvm::Value* profile_counters_arg) {
+  llvm::Value* parameter_addresses_buffer =
+      llvm_ir::EmitAllocaAtFunctionEntryWithCount(
+          ir_builder->getInt8PtrTy(),
+          ir_builder->getInt32(parameter_addresses.size()),
+          tensorflow::strings::StrCat(name, "_parameter_addresses"),
+          ir_builder);
+  for (size_t i = 0; i < parameter_addresses.size(); ++i) {
+    llvm::Value* parameter_as_i8ptr = ir_builder->CreateBitCast(
+        parameter_addresses[i], ir_builder->getInt8PtrTy(),
+        AsStringRef(tensorflow::strings::StrCat(name, "_parameter_", i,
+                                                "_address_as_i8ptr")));
+    llvm::Value* slot_in_param_addresses = ir_builder->CreateInBoundsGEP(
+        parameter_addresses_buffer, {ir_builder->getInt64(i)});
+    ir_builder->CreateStore(parameter_as_i8ptr, slot_in_param_addresses);
+  }
+
+  const auto to_int8_ptr = [=](llvm::Value* ptr) {
+    return ir_builder->CreatePointerCast(ptr, ir_builder->getInt8PtrTy());
+  };
+  std::vector<llvm::Value*> arguments{
+      to_int8_ptr(return_value_buffer), to_int8_ptr(exec_run_options_arg),
+      parameter_addresses_buffer, temp_buffers_arg};
+  if (profile_counters_arg != nullptr) {
+    arguments.push_back(profile_counters_arg);
+  }
+  return arguments;
+}
+
+// Emits a call to a runtime fork/join function which dispatches parallel
+// calls to 'parallel_function' (and joins threads before returning).
+Status EmitCallToParallelForkJoin(
+    const std::vector<llvm::Value*>& arguments, const Shape& shape,
+    const std::vector<int64>& dimension_partition_counts,
+    llvm::IRBuilder<>* ir_builder, llvm::Function* parallel_function,
+    const string& name) {
+  llvm::Module* module = ir_builder->GetInsertBlock()->getModule();
+
+  // Build ParallelForkJoin function type.
+  std::vector<llvm::Type*> compute_function_params =
+      GetComputeFunctionParams(module, /*num_dynamic_loop_bounds=*/0);
+  // Number of parallel compute functions.
+  compute_function_params.push_back(ir_builder->getInt32Ty());
+  // Array of partitions. There is an array element for each
+  // partition x partition_dim x 2 (for dimension start and limit).
+  compute_function_params.push_back(
+      llvm::Type::getInt64PtrTy(module->getContext()));
+  // Number of partitioned most-major dimensions in 'shape'.
+  compute_function_params.push_back(ir_builder->getInt32Ty());
+  // Function pointer for compute function to be dispatched in parallel.
+  compute_function_params.push_back(
+      llvm::Type::getInt8PtrTy(module->getContext()));
+
+  llvm::FunctionType* fork_join_type = llvm::FunctionType::get(
+      /*Result=*/llvm::Type::getVoidTy(module->getContext()),
+      /*Params=*/compute_function_params,
+      /*isVarArg=*/false);
+
+  llvm::Function* fork_join_func =
+      llvm::cast<llvm::Function>(module->getOrInsertFunction(
+          runtime::kParallelForkJoinSymbolName, fork_join_type));
+  fork_join_func->setCallingConv(llvm::CallingConv::C);
+  fork_join_func->setDoesNotThrow();
+
+  // Add common compute function arguments.
+  std::vector<llvm::Value*> fork_join_arguments(arguments);
+
+  // Create ShapePartitionIterator to generate all partitions of 'shape'.
+  ShapePartitionIterator partition_iterator(shape, dimension_partition_counts);
+  const int64 num_partitions = partition_iterator.GetTotalPartitionCount();
+  // Add argument specifying the number of parallel partitions.
+  fork_join_arguments.push_back(ir_builder->getInt32(num_partitions));
+
+  // The number of partitioned most-major dimensions in 'shape'.
+  const int32 num_partitioned_dims = dimension_partition_counts.size();
+  // A dimension partition consists of two elements: [start_index, limit_index).
+  const int32 dim_partition_size = 2;
+  // Calculate array partition stride.
+  const int32 array_partition_stride =
+      num_partitioned_dims * dim_partition_size;
+  // Calculate the total number of elements in the partition array.
+  const int32 partition_array_size =
+      dim_partition_size * num_partitioned_dims * num_partitions;
+
+  // Store dimension partition values as llvm constants in 'partitions'.
+  // See comments in runtime_fork_join.cc for array layout description.
+  std::vector<llvm::Constant*> partitions(partition_array_size);
+  for (int32 i = 0; i < num_partitions; ++i) {
+    std::vector<std::pair<int64, int64>> dim_partitions =
+        partition_iterator.GetPartition(i);
+    CHECK_EQ(num_partitioned_dims, dim_partitions.size());
+    const int32 partition_index = i * array_partition_stride;
+    for (int32 j = 0; j < num_partitioned_dims; ++j) {
+      const std::pair<int64, int64>& dim_partition = dim_partitions[j];
+      const int32 index = partition_index + j * dim_partition_size;
+      // Store partition [dim_start, dim_limit) intervals for each dimension.
+      partitions[index] = ir_builder->getInt64(dim_partition.first);
+      partitions[index + 1] =
+          ir_builder->getInt64(dim_partition.first + dim_partition.second);
+    }
+  }
+
+  // Create global variable out of dimension partitions in 'partitions'.
+  llvm::ArrayType* partitions_array_type =
+      llvm::ArrayType::get(ir_builder->getInt64Ty(), partition_array_size);
+  llvm::Constant* partitions_array =
+      llvm::ConstantArray::get(partitions_array_type, partitions);
+  llvm::GlobalVariable* global_partitions_array = new llvm::GlobalVariable(
+      /*M=*/*module,
+      /*Ty=*/partitions_array_type,
+      /*isConstant=*/true,
+      /*Linkage=*/llvm::GlobalValue::PrivateLinkage,
+      /*Initializer=*/partitions_array,
+      /*Name=*/
+      AsStringRef(
+          tensorflow::strings::StrCat(name, "_parallel_dimension_partitions")));
+
+  // Add argument specifying parallel dimension partitions.
+  fork_join_arguments.push_back(ir_builder->CreateBitCast(
+      global_partitions_array,
+      llvm::Type::getInt64PtrTy(module->getContext())));
+  // Add argument specifying the number of partitioned most-major dimensions.
+  fork_join_arguments.push_back(ir_builder->getInt32(num_partitioned_dims));
+  // Add argument for parallel compute function pointer.
+  fork_join_arguments.push_back(
+      ir_builder->CreateBitCast(parallel_function, ir_builder->getInt8PtrTy()));
+  // Emit call to parallel fork/join.
+  ir_builder->CreateCall(fork_join_func, fork_join_arguments);
+
+  return Status::OK();
+}
+
+}  // namespace cpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.h b/tensorflow/compiler/xla/service/cpu/ir_function.h
new file mode 100644
index 0000000000000000000000000000000000000000..557aa4a6bfc2ef70cafca4b226f8d8f15ea01e2b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/ir_function.h
@@ -0,0 +1,134 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_FUNCTION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_FUNCTION_H_
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+
+namespace xla {
+namespace cpu {
+
+// IrFunction creates and encapsulates an llvm::Function, exposing methods to
+// emitters for function and function argument access.
+// The llvm::Function is created with the standard function signature
+// used in the XLA CPU backend (see ir_function.cc for argument details).
+// In addtion IrFunction saves the callers IR insert point during contruction,
+// and restores it after desctruction.
+//
+// Example usage:
+//
+//    // Create and initialize new IrFunction.
+//    std::unique_ptr<IrFunction> compute_function(new IrFunction(...));
+//    // Emit IR for function body using IrFunction helper methods.
+//    ...
+//    // Store reference to llvm::Function for future invocation.
+//    ir_functions.push_back(compute_function.function());
+//    // Delete IrFunction (finalizes IR function and restores caller insertion
+//    // point).
+//    compute_function.reset();
+//
+
+class IrFunction {
+ public:
+  IrFunction(const string& function_name, llvm::Function::LinkageTypes linkage,
+             const bool optimize_for_size_requested,
+             const bool enable_fast_math, llvm::Module* llvm_module,
+             llvm::IRBuilder<>* ir_builder, int64 num_dynamic_loop_bounds);
+  ~IrFunction();
+
+  // Emit ir to read and return the set of ir values representing the dynamic
+  // loop bounds argument of this function.
+  // Each element in returned vector is a pair of ir values representing
+  // the loop bounds for a specific dimension, where the first element of the
+  // pair is the dimension start index, and the second element of the pair
+  // is the dimension limit.
+  // EX: [dimension_i_index_start_ir_value, dimension_i_index_limit_ir_value]
+  //
+  DynamicLoopBounds GetDynamicLoopBounds();
+
+  // Returns the encapculated llvm::Function.
+  llvm::Function* function() { return function_; }
+
+  // Get the llvm::Value* that represents this functions "retval" argument.
+  llvm::Argument* result_arg() { return result_arg_; }
+
+  // Get the xla::ExecutableRunOptions that represents this functions
+  // "run_options" argument.
+  llvm::Value* exec_run_options_arg() { return exec_run_options_arg_; }
+
+  // Get the llvm::Value* that represents this functions parameters argument.
+  llvm::Value* parameters_arg() { return parameters_arg_; }
+
+  // Get the llvm::Value* that represents this functions "temps" argument.
+  llvm::Value* temp_buffers_arg() { return temp_buffers_arg_; }
+
+  // Get the llvm::Value* that represents this functions "prof_counters"
+  // argument.
+  llvm::Value* profile_counters_arg() { return profile_counters_arg_; }
+
+ private:
+  // Initialize an llvm::Function with standard signature based on arguments.
+  void Initialize(const string& function_name,
+                  llvm::Function::LinkageTypes linkage,
+                  bool optimize_for_size_requested, bool enable_fast_math);
+
+  // Emit ir to read and return the ir value for the dynamic loop bound at
+  // 'offset' from the "dynamic_loop_bounds" argument of this function.
+  llvm::Value* GetDynamicLoopBound(int64 offset);
+
+  llvm::IRBuilder<>* ir_builder_;
+  llvm::Module* llvm_module_;
+  llvm::IRBuilder<>::InsertPointGuard caller_insert_point_guard_;
+
+  int64 num_dynamic_loop_bounds_ = 0;
+  // Encapsulated llvm::Function.
+  llvm::Function* function_;
+  // Function argument IR values.
+  llvm::Argument* result_arg_;
+  llvm::Value* exec_run_options_arg_;
+  llvm::Value* parameters_arg_;
+  llvm::Value* temp_buffers_arg_;
+  llvm::Value* dynamic_loop_bounds_arg_ = nullptr;
+  llvm::Value* profile_counters_arg_;
+};
+
+// Returns an array of compute function call argument ir values.
+std::vector<llvm::Value*> GetArrayFunctionCallArguments(
+    tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
+    llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece name,
+    llvm::Value* return_value_buffer, llvm::Value* exec_run_options_arg,
+    llvm::Value* temp_buffers_arg, llvm::Value* profile_counters_arg);
+
+// Emits a call to a runtime fork/join function which dispatches parallel
+// calls to 'parallel_function' (and joins threads before returning).
+Status EmitCallToParallelForkJoin(
+    const std::vector<llvm::Value*>& arguments, const Shape& shape,
+    const std::vector<int64>& dimension_partition_counts,
+    llvm::IRBuilder<>* ir_builder, llvm::Function* parallel_function,
+    const string& name);
+
+}  // namespace cpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_FUNCTION_H_
diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
index 81c29e4726c7be53b433be896f558f502e43c885..2e5cc96098241415b82f225afc81981f3e1069e0 100644
--- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
@@ -20,6 +20,8 @@ limitations under the License.
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Transforms/Utils/Cloning.h"
+#include "tensorflow/compiler/xla/service/cpu/vector_support_library.h"
+#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -28,6 +30,10 @@ namespace runtime {
 
 const char* const kTanhV4F32SymbolName = "__xla_cpu_runtime_TanhV4F32";
 const char* const kTanhV8F32SymbolName = "__xla_cpu_runtime_TanhV8F32";
+const char* const kExpV4F32SymbolName = "__xla_cpu_runtime_ExpV4F32";
+const char* const kExpV8F32SymbolName = "__xla_cpu_runtime_ExpV8F32";
+const char* const kLogV4F32SymbolName = "__xla_cpu_runtime_LogV4F32AVX";
+const char* const kLogV8F32SymbolName = "__xla_cpu_runtime_LogV8F32AVX";
 
 namespace {
 llvm::Function* EmitVectorF32TanhIfNeeded(llvm::Module* module,
@@ -42,62 +48,257 @@ llvm::Function* EmitVectorF32TanhIfNeeded(llvm::Module* module,
   }
 
   llvm::LLVMContext* context = &module->getContext();
-  llvm::Type* float_type = llvm::Type::getFloatTy(*context);
-  llvm::VectorType* vector_type =
-      llvm::VectorType::get(float_type, vector_width);
 
   llvm::BasicBlock* vector_tanh_body =
       llvm::BasicBlock::Create(*context, "body", vector_tanh_function);
 
   llvm::IRBuilder<> ir_builder(vector_tanh_body);
-
   llvm::FastMathFlags fast_math_flags;
   fast_math_flags.setFast();
   ir_builder.setFastMathFlags(fast_math_flags);
 
+  VectorSupportLibrary vsl(F32, vector_width, &ir_builder, "tanh_f32");
+
   llvm::Value* input = &*vector_tanh_function->arg_begin();
-  CHECK_EQ(input->getType(), vector_type);
+  CHECK_EQ(input->getType(), vsl.vector_type());
 
   // This implements the same rational interpolant as implemented in Eigen3.
-  llvm::Value* input_clamped = llvm_ir::EmitFloatMin(
-      llvm_ir::EmitFloatMax(input, llvm::ConstantFP::get(vector_type, -9.0),
-                            &ir_builder),
-      llvm::ConstantFP::get(vector_type, 9.0), &ir_builder);
-
-  std::array<float, 7> numerator_coeffs(
-      {{-2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
-        5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f,
-        4.89352455891786e-03f}});
-
-  std::array<float, 4> denominator_coeffs(
-      {{1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f,
-        4.89352518554385e-03f}});
-
-  llvm::Value* input_squared =
-      ir_builder.CreateFMul(input_clamped, input_clamped);
-  llvm::Value* numerator =
-      llvm::ConstantFP::get(vector_type, numerator_coeffs[0]);
+  llvm::Value* input_clamped =
+      vsl.Clamp(input, /*low=*/GetIeeeF32(-9.0), /*high=*/GetIeeeF32(9.0));
+
+  std::array<float, 7> numerator_coeffs{
+      -2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
+      5.12229709037114e-08f,  1.48572235717979e-05f, 6.37261928875436e-04f,
+      4.89352455891786e-03f};
+
+  std::array<float, 4> denominator_coeffs{
+      1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f,
+      4.89352518554385e-03f};
+
+  llvm::Value* input_squared = vsl.Mul(input_clamped, input_clamped);
+  llvm::Value* numerator = vsl.SplatFloat(GetIeeeF32(numerator_coeffs[0]));
   for (int i = 1; i < numerator_coeffs.size(); i++) {
-    numerator = ir_builder.CreateFAdd(
-        ir_builder.CreateFMul(input_squared, numerator),
-        llvm::ConstantFP::get(vector_type, numerator_coeffs[i]));
+    numerator =
+        vsl.MulAdd(input_squared, numerator, GetIeeeF32(numerator_coeffs[i]));
   }
-  numerator = ir_builder.CreateFMul(input_clamped, numerator);
 
-  llvm::Value* denominator =
-      llvm::ConstantFP::get(vector_type, denominator_coeffs[0]);
+  numerator = vsl.Mul(input_clamped, numerator);
+
+  llvm::Value* denominator = vsl.SplatFloat(GetIeeeF32(denominator_coeffs[0]));
   for (int i = 1; i < denominator_coeffs.size(); i++) {
-    denominator = ir_builder.CreateFAdd(
-        ir_builder.CreateFMul(input_squared, denominator),
-        llvm::ConstantFP::get(vector_type, denominator_coeffs[i]));
+    denominator = vsl.MulAdd(input_squared, denominator,
+                             GetIeeeF32(denominator_coeffs[i]));
   }
 
-  llvm::Value* result = ir_builder.CreateFDiv(numerator, denominator);
+  llvm::Value* result = vsl.Div(numerator, denominator);
   ir_builder.CreateRet(result);
 
   DCHECK(!llvm::verifyFunction(*vector_tanh_function));
   return vector_tanh_function;
 }
+
+llvm::Function* EmitVectorF32ExpIfNeeded(llvm::Module* module,
+                                         llvm::StringRef function_name,
+                                         int vector_width,
+                                         bool enable_fast_math) {
+  llvm::Function* vector_exp_function = module->getFunction(function_name);
+  if (vector_exp_function == nullptr) {
+    // If the function declaration is not present in the module, there can't be
+    // any calls to resolve.  Don't emit the function in this case.
+    return nullptr;
+  }
+
+  llvm::LLVMContext* context = &module->getContext();
+
+  llvm::BasicBlock* vector_exp_body =
+      llvm::BasicBlock::Create(*context, "body", vector_exp_function);
+
+  llvm::IRBuilder<> ir_builder(vector_exp_body);
+  llvm::FastMathFlags fast_math_flags;
+  fast_math_flags.setFast();
+  ir_builder.setFastMathFlags(fast_math_flags);
+
+  VectorSupportLibrary vsl(F32, vector_width, &ir_builder, "exp_f32");
+
+  // This implements the same polynomial approximation as implemented in Eigen3.
+
+  const llvm::APFloat half = GetIeeeF32(0.5);
+  const llvm::APFloat one = GetIeeeF32(1.0);
+
+  const llvm::APFloat exp_hi = GetIeeeF32(88.3762626647950);
+  const llvm::APFloat exp_lo = GetIeeeF32(-88.3762626647949);
+
+  const llvm::APFloat cephes_LOG2EF = GetIeeeF32(1.44269504088896341);
+  const llvm::APFloat cephes_exp_C1 = GetIeeeF32(0.693359375);
+  const llvm::APFloat cephes_exp_C2 = GetIeeeF32(-2.12194440e-4);
+
+  const llvm::APFloat cephes_exp_p0 = GetIeeeF32(1.9875691500E-4);
+  const llvm::APFloat cephes_exp_p1 = GetIeeeF32(1.3981999507E-3);
+  const llvm::APFloat cephes_exp_p2 = GetIeeeF32(8.3334519073E-3);
+  const llvm::APFloat cephes_exp_p3 = GetIeeeF32(4.1665795894E-2);
+  const llvm::APFloat cephes_exp_p4 = GetIeeeF32(1.6666665459E-1);
+  const llvm::APFloat cephes_exp_p5 = GetIeeeF32(5.0000001201E-1);
+
+  llvm::Value* input = &*vector_exp_function->arg_begin();
+  llvm::Value* input_clamped =
+      vsl.Clamp(input, /*low=*/exp_lo, /*high=*/exp_hi);
+  llvm::Value* fx = vsl.Floor(vsl.MulAdd(input_clamped, cephes_LOG2EF, half));
+  llvm::Value* tmp = vsl.Mul(cephes_exp_C1, fx);
+  llvm::Value* z = vsl.Mul(cephes_exp_C2, fx);
+  llvm::Value* x = vsl.Sub(input_clamped, tmp);
+  x = vsl.Sub(x, z);
+  z = vsl.Mul(x, x);
+
+  llvm::Value* y = vsl.MulAdd(x, cephes_exp_p0, cephes_exp_p1);
+  y = vsl.MulAdd(y, x, cephes_exp_p2);
+  y = vsl.MulAdd(y, x, cephes_exp_p3);
+  y = vsl.MulAdd(y, x, cephes_exp_p4);
+  y = vsl.MulAdd(y, x, cephes_exp_p5);
+  y = vsl.MulAdd(y, z, x);
+  y = vsl.Add(one, y);
+
+  // VectorSupportLibrary (intentionally) can't juggle more than one type at a
+  // time so drop down to IRBuilder for this bit.
+  llvm::Value* vector_constant_0x7f =
+      ir_builder.CreateVectorSplat(vector_width, ir_builder.getInt32(0x7f));
+  llvm::Value* vector_constant_23 =
+      ir_builder.CreateVectorSplat(vector_width, ir_builder.getInt32(23));
+  llvm::Type* i32_vector_type =
+      llvm::VectorType::get(ir_builder.getInt32Ty(), vector_width);
+  // fx is clamped so we don't have to worry about it being out of range for
+  // i32.
+  llvm::Value* emm0 = ir_builder.CreateFPToSI(fx, i32_vector_type);
+  emm0 = ir_builder.CreateAdd(emm0, vector_constant_0x7f);
+  emm0 = ir_builder.CreateShl(emm0, vector_constant_23);
+  llvm::Value* emm0_f32 = ir_builder.CreateBitCast(emm0, vsl.vector_type());
+
+  llvm::Value* result = vsl.Max(vsl.Mul(y, emm0_f32), input);
+
+  ir_builder.CreateRet(result);
+
+  DCHECK(!llvm::verifyFunction(*vector_exp_function));
+  return vector_exp_function;
+}
+
+llvm::Function* EmitVectorF32LogIfNeeded(llvm::Module* module,
+                                         llvm::StringRef function_name,
+                                         int vector_width,
+                                         bool enable_fast_math) {
+  llvm::Function* vector_log_function = module->getFunction(function_name);
+  if (vector_log_function == nullptr) {
+    // If the function declaration is not present in the module, there can't be
+    // any calls to resolve.  Don't emit the function in this case.
+    return nullptr;
+  }
+
+  llvm::LLVMContext* context = &module->getContext();
+
+  llvm::BasicBlock* vector_log_body =
+      llvm::BasicBlock::Create(*context, "body", vector_log_function);
+
+  llvm::IRBuilder<> ir_builder(vector_log_body);
+  llvm::FastMathFlags fast_math_flags;
+  fast_math_flags.setFast();
+  ir_builder.setFastMathFlags(fast_math_flags);
+
+  llvm::Value* input = &*vector_log_function->arg_begin();
+  VectorSupportLibrary vsl(F32, vector_width, &ir_builder, "log_f32");
+
+  const llvm::APFloat half = GetIeeeF32(0.5);
+  const llvm::APFloat one = GetIeeeF32(1.0);
+
+  // This implements the same polynomial approximation as implemented in Eigen3.
+  // Returns NaN for x < 0, -INF for x = 0
+  const llvm::APFloat cephes_SQRTHF = GetIeeeF32(0.707106781186547524);
+  const llvm::APFloat cephes_log_p0 = GetIeeeF32(7.0376836292E-2);
+  const llvm::APFloat cephes_log_p1 = GetIeeeF32(-1.1514610310E-1);
+  const llvm::APFloat cephes_log_p2 = GetIeeeF32(1.1676998740E-1);
+  const llvm::APFloat cephes_log_p3 = GetIeeeF32(-1.2420140846E-1);
+  const llvm::APFloat cephes_log_p4 = GetIeeeF32(+1.4249322787E-1);
+  const llvm::APFloat cephes_log_p5 = GetIeeeF32(-1.6668057665E-1);
+  const llvm::APFloat cephes_log_p6 = GetIeeeF32(+2.0000714765E-1);
+  const llvm::APFloat cephes_log_p7 = GetIeeeF32(-2.4999993993E-1);
+  const llvm::APFloat cephes_log_p8 = GetIeeeF32(+3.3333331174E-1);
+  const llvm::APFloat cephes_log_q1 = GetIeeeF32(-2.12194440e-4);
+  const llvm::APFloat cephes_log_q2 = GetIeeeF32(0.693359375);
+
+  // The smallest non denormalized float number.
+  const llvm::APFloat min_norm_pos = GetIeeeF32FromBitwiseRep(0x00800000);
+  const llvm::APFloat minus_inf = GetIeeeF32FromBitwiseRep(0xff800000);
+  const llvm::APFloat inv_mant_mask = GetIeeeF32FromBitwiseRep(~0x7f800000);
+
+  // invalid_mask is set if x is negative or NaN (and therefore output
+  // must be NaN).
+  llvm::Value* invalid_mask = vsl.FCmpULEMask(input, vsl.GetZeroVector());
+  llvm::Value* iszero_mask = vsl.FCmpEQMask(input, vsl.GetZeroVector());
+
+  // Cut off denormalized stuff.
+  input = vsl.Max(min_norm_pos, input);
+
+  // VectorSupportLibrary (intentionally) can't juggle more than one type at a
+  // time so drop down to IRBuilder for this bit.
+  llvm::Value* vector_constant_0x7f =
+      ir_builder.CreateVectorSplat(vector_width, ir_builder.getInt32(0x7f));
+  llvm::Value* vector_constant_23 =
+      ir_builder.CreateVectorSplat(vector_width, ir_builder.getInt32(23));
+  llvm::Type* i32_vector_type =
+      llvm::VectorType::get(ir_builder.getInt32Ty(), vector_width);
+
+  llvm::Value* emm0 = ir_builder.CreateLShr(
+      ir_builder.CreateBitCast(input, i32_vector_type), vector_constant_23);
+
+  // Keep only the fractional part.
+  input = vsl.FloatAnd(input, inv_mant_mask);
+  input = vsl.FloatOr(input, half);
+
+  emm0 = ir_builder.CreateSub(emm0, vector_constant_0x7f);
+  llvm::Value* e =
+      vsl.Add(one, ir_builder.CreateSIToFP(emm0, vsl.vector_type()));
+
+  // part2:
+  //   if( x < SQRTHF ) {
+  //     e -= 1;
+  //     x = x + x - 1.0;
+  //   } else { x = x - 1.0; }
+  llvm::Value* mask = vsl.FCmpOLTMask(input, cephes_SQRTHF);
+  llvm::Value* tmp = vsl.FloatAnd(input, mask);
+  input = vsl.Sub(input, one);
+  e = vsl.Sub(e, vsl.FloatAnd(mask, one));
+  input = vsl.Add(input, tmp);
+
+  llvm::Value* x2 = vsl.Mul(input, input);
+  llvm::Value* x3 = vsl.Mul(x2, input);
+
+  llvm::Value *y, *y1, *y2;
+  y = vsl.MulAdd(input, cephes_log_p0, cephes_log_p1);
+  y1 = vsl.MulAdd(input, cephes_log_p3, cephes_log_p4);
+  y2 = vsl.MulAdd(input, cephes_log_p6, cephes_log_p7);
+  y = vsl.MulAdd(y, input, cephes_log_p2);
+  y1 = vsl.MulAdd(y1, input, cephes_log_p5);
+  y2 = vsl.MulAdd(y2, input, cephes_log_p8);
+  y = vsl.MulAdd(y, x3, y1);
+  y = vsl.MulAdd(y, x3, y2);
+  y = vsl.Mul(y, x3);
+
+  y1 = vsl.Mul(cephes_log_q1, e);
+  tmp = vsl.Mul(half, x2);
+  y = vsl.Add(y, y1);
+  input = vsl.Sub(input, tmp);
+  y2 = vsl.Mul(cephes_log_q2, e);
+  input = vsl.Add(input, y);
+  input = vsl.Add(input, y2);
+
+  // Negative arg will be NAN, 0 will be -INF.
+  llvm::Value* or_lhs =
+      vsl.FloatAndNot(iszero_mask, vsl.FloatOr(input, invalid_mask));
+  llvm::Value* or_rhs = vsl.FloatAnd(iszero_mask, minus_inf);
+  llvm::Value* result = vsl.FloatOr(or_lhs, or_rhs);
+
+  ir_builder.CreateRet(result);
+
+  DCHECK(!llvm::verifyFunction(*vector_log_function));
+  return vector_log_function;
+}
 }  // namespace
 
 void RewriteIRRuntimeFunctions(llvm::Module* module, bool enable_fast_math) {
@@ -108,11 +309,28 @@ void RewriteIRRuntimeFunctions(llvm::Module* module, bool enable_fast_math) {
       EmitVectorF32TanhIfNeeded(module, kTanhV8F32SymbolName,
                                 /*vector_width=*/8, enable_fast_math);
 
+  auto* exp_v4f32 =
+      EmitVectorF32ExpIfNeeded(module, kExpV4F32SymbolName,
+                               /*vector_width=*/4, enable_fast_math);
+  auto* exp_v8f32 =
+      EmitVectorF32ExpIfNeeded(module, kExpV8F32SymbolName,
+                               /*vector_width=*/8, enable_fast_math);
+
+  auto* log_v4f32 =
+      EmitVectorF32LogIfNeeded(module, kLogV4F32SymbolName,
+                               /*vector_width=*/4, enable_fast_math);
+  auto* log_v8f32 =
+      EmitVectorF32LogIfNeeded(module, kLogV8F32SymbolName,
+                               /*vector_width=*/8, enable_fast_math);
+
   // Gather all the call sites, force inline them and then delete the vector
   // function bodies.
+  //
+  // TODO(b/73081976): Should we avoid inlining these intrinsics in some cases?
 
   std::vector<llvm::CallInst*> calls_to_inline;
-  for (auto* function : {tanh_v4f32, tanh_v8f32}) {
+  for (auto* function :
+       {tanh_v4f32, tanh_v8f32, exp_v4f32, exp_v8f32, log_v4f32, log_v8f32}) {
     if (function != nullptr) {
       for (auto* user : function->users()) {
         calls_to_inline.push_back(llvm::cast<llvm::CallInst>(user));
@@ -125,7 +343,8 @@ void RewriteIRRuntimeFunctions(llvm::Module* module, bool enable_fast_math) {
     CHECK(llvm::InlineFunction(call_to_inline, inline_function_info));
   }
 
-  for (auto* function : {tanh_v4f32, tanh_v8f32}) {
+  for (auto* function :
+       {tanh_v4f32, tanh_v8f32, exp_v4f32, exp_v8f32, log_v4f32, log_v8f32}) {
     if (function != nullptr) {
       function->eraseFromParent();
     }
diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h
index 7f31fb98b0d03c16ef40bff9822227e01f6be46b..5553972677512617ccb6ac4f57a4d33400b664e3 100644
--- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h
+++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LLVM_IR_RUNTINE_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LLVM_IR_RUNTINE_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LLVM_IR_RUNTIME_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LLVM_IR_RUNTIME_H_
 
 #include "llvm/IR/Module.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
@@ -25,6 +25,10 @@ namespace runtime {
 
 extern const char* const kTanhV4F32SymbolName;
 extern const char* const kTanhV8F32SymbolName;
+extern const char* const kExpV4F32SymbolName;
+extern const char* const kExpV8F32SymbolName;
+extern const char* const kLogV4F32SymbolName;
+extern const char* const kLogV8F32SymbolName;
 
 // The following CPU runtime functions have LLVM-IR only implementations:
 //
@@ -40,4 +44,4 @@ void RewriteIRRuntimeFunctions(llvm::Module* module, bool enable_fast_math);
 }  // namespace cpu
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LLVM_IR_RUNTINE_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LLVM_IR_RUNTIME_H_
diff --git a/tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.h b/tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.h
index 2d29550fd5bd659770cc6300e56b57bf1763e671..f8963841158b71a30aa926e3b2b153c42bf78eb1 100644
--- a/tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.h
+++ b/tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_ORC_JIT_MEMORY_MAPPER_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_ORC_JIT_MEMORY_MAPPER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_ORC_JIT_MEMORY_MAPPER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_ORC_JIT_MEMORY_MAPPER_H_
 
 #include <memory>
 
@@ -53,4 +53,4 @@ class Registrar {
 }  // namespace cpu
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_ORC_JIT_MEMORY_MAPPER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_ORC_JIT_MEMORY_MAPPER_H_
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
index 0077e344e2bd34aa598ee076220fee678f31b4ad..cd997f07890cdc1d9a546ede58cc1d992b6416ae 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
@@ -61,9 +61,9 @@ ParallelCpuExecutable::ParallelCpuExecutable(
     std::unique_ptr<const HloInstructionMap<string>> function_names,
     std::unordered_map<const HloInstruction*, std::unique_ptr<unsigned char[]>>
         aligned_constants,
-    std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
     std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
-    : Executable(std::move(hlo_module), std::move(hlo_profile_printer),
+    : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
                  std::move(hlo_profile_index_map)),
       jit_(std::move(jit)),
       assignment_(std::move(assignment)),
@@ -376,19 +376,6 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
     HloExecutionProfile* hlo_execution_profile) {
-  std::vector<se::DeviceMemoryBase> argument_buffers(arguments.size());
-  for (int i = 0; i < arguments.size(); ++i) {
-    argument_buffers[i] = arguments[i]->buffer(/*index=*/{});
-  }
-  return ExecuteComputeFunctions(run_options, argument_buffers, buffers,
-                                 hlo_execution_profile);
-}
-
-Status ParallelCpuExecutable::ExecuteComputeFunctions(
-    const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
-    HloExecutionProfile* hlo_execution_profile) {
   // Allocate profiling counters for each hlo instruction that we would like to
   // profile.
   std::vector<int64>* profile_counters = nullptr;
@@ -428,8 +415,9 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
     // just copy the existing buffer into the map containing instruction
     // results..
     if (instruction->opcode() == HloOpcode::kParameter) {
-      InsertOrDie(&results, instruction,
-                  arguments[instruction->parameter_number()].opaque());
+      InsertOrDie(
+          &results, instruction,
+          arguments[instruction->parameter_number()]->root_buffer().opaque());
     } else if (instruction->opcode() == HloOpcode::kConstant) {
       unsigned char* aligned_data =
           FindOrDie(aligned_constants_, instruction).get();
@@ -461,69 +449,6 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
   return Status::OK();
 }
 
-StatusOr<perftools::gputools::DeviceMemoryBase>
-ParallelCpuExecutable::ExecuteOnStream(
-    const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
-  se::Stream* stream = run_options->stream();
-  DeviceMemoryAllocator* memory_allocator = run_options->allocator();
-  VLOG(3) << "ExecuteOnStream arg size: " << arguments.size();
-  if (!arguments.empty()) {
-    VLOG(3) << "ExecuteOnStream arg[0]: " << arguments.at(0).opaque();
-  }
-
-  // Allocate the temporary buffers required for the computation.
-  se::StreamExecutor* stream_executor = stream->parent();
-  int device_ordinal = stream_executor->device_ordinal();
-  int64 buffer_count = assignment_->Allocations().size();
-  VLOG(3) << "temp buffer count: " << buffer_count;
-
-  std::vector<se::DeviceMemoryBase> device_allocations(
-      assignment_->Allocations().size());
-  TF_RETURN_IF_ERROR(AllocateBuffers(memory_allocator,
-                                     stream->parent()->device_ordinal(),
-                                     &device_allocations));
-
-  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
-                      assignment_->GetUniqueTopLevelOutputSlice());
-  const BufferAllocation::Index result_index = result_slice.index();
-  VLOG(3) << "result index: " << result_index;
-
-  TF_RETURN_IF_ERROR(ExecuteComputeFunctions(
-      run_options, arguments, device_allocations, hlo_execution_profile));
-
-  // Mark the buffers that are actually live (used in the output) when the
-  // computation finishes executing.
-  std::unordered_set<const void*> marked_addresses;
-  MarkLiveAddressesInOutput(device_allocations[result_index].opaque(),
-                            result_shape(), &marked_addresses);
-
-  VLOG(3) << "Live addresses in output marking found "
-          << marked_addresses.size() << " addresses:\n"
-          << tensorflow::str_util::Join(
-                 marked_addresses, ", ", [](string* out, const void* address) {
-                   tensorflow::strings::StrAppend(
-                       out, tensorflow::strings::Printf("%p", address));
-                 });
-
-  // Computation is done - deallocate temp buffers. Keep those marked
-  // live because they are referenced by the output of the computation
-  // and are needed by the service. They will be deallocated by the
-  // service.
-  for (size_t i = 0; i < device_allocations.size(); ++i) {
-    auto alloc = device_allocations[i];
-    if (marked_addresses.count(alloc.opaque()) == 0 &&
-        alloc.opaque() != nullptr) {
-      VLOG(3) << "ParallelCpuExecutable deallocating buffer #" << i << " ["
-              << alloc.opaque() << "]";
-      TF_RETURN_IF_ERROR(memory_allocator->Deallocate(device_ordinal, &alloc));
-    }
-  }
-
-  return device_allocations[result_index];
-}
-
 StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
@@ -536,9 +461,9 @@ StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
   DeviceMemoryAllocator* memory_allocator = run_options->allocator();
   std::vector<se::DeviceMemoryBase> buffers(assignment_->Allocations().size());
 
-  auto result_buffer =
-      MakeUnique<ShapedBuffer>(result_shape(), stream->parent()->platform(),
-                               stream->parent()->device_ordinal());
+  auto result_buffer = MakeUnique<ShapedBuffer>(
+      /*on_host_shape=*/result_shape(), /*on_device_shape=*/result_shape(),
+      stream->parent()->platform(), stream->parent()->device_ordinal());
 
   TF_RETURN_IF_ERROR(AllocateBuffers(
       memory_allocator, stream->parent()->device_ordinal(), &buffers));
@@ -549,37 +474,30 @@ StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
   // Copy DeviceMemoryBase values which into the respective location in
   // ShapedBuffer which is returned to the caller.
   std::vector<bool> buffers_in_result(assignment_->Allocations().size(), false);
-  TF_RETURN_IF_ERROR(
-      result_buffer->mutable_shape_index_to_buffer_entry()
-          ->ForEachMutableElementWithStatus(
-              [&buffers, &buffers_in_result, &result_buffer, this](
-                  const ShapeIndex& index, size_t* buffer_entry) {
-                  const auto& sources =
-                      this->GetRootPointsToSet().element(index);
-                  // The points to set is unambiguous so the set should be a
-                  // singleton.
-                  CHECK_EQ(1, sources.size());
-                  const LogicalBuffer* buffer_source = sources[0];
-                  HloInstruction* src = buffer_source->instruction();
-
-                  // The source for this result buffer can be a nested buffer
-                  // such as a tuple element.
-
-                  // The source instruction should have a non-parameter buffer
-                  // assigned.
-                  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice slice,
-                                      this->assignment_->GetUniqueSlice(
-                                          src, buffer_source->index()));
-                  CHECK(!slice.allocation()->is_entry_computation_parameter());
-
-                  const BufferAllocation::Index buffer_index = slice.index();
-                  const se::DeviceMemoryBase& buffer = buffers[buffer_index];
-                  CHECK(!buffer.is_null() || buffer.size() == 0);
-                  *buffer_entry = result_buffer->mutable_buffers()->size();
-                  result_buffer->mutable_buffers()->push_back(buffer);
-                  buffers_in_result[buffer_index] = true;
-                return Status::OK();
-              }));
+  TF_RETURN_IF_ERROR(result_buffer->buffers().ForEachMutableElementWithStatus(
+      [&](const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
+        const auto& sources = this->GetRootPointsToSet().element(index);
+
+        // The points to set is unambiguous so the set should be a singleton.
+        CHECK_EQ(1, sources.size());
+        const LogicalBuffer* buffer_source = sources[0];
+        HloInstruction* src = buffer_source->instruction();
+
+        // The source for this result buffer can be a nested buffer such as a
+        // tuple element. The source instruction should have a non-parameter
+        // buffer assigned.
+        TF_ASSIGN_OR_RETURN(
+            const BufferAllocation::Slice slice,
+            this->assignment_->GetUniqueSlice(src, buffer_source->index()));
+        CHECK(!slice.allocation()->is_entry_computation_parameter());
+
+        const BufferAllocation::Index buffer_index = slice.index();
+        const se::DeviceMemoryBase& buffer = buffers[buffer_index];
+        CHECK(!buffer.is_null() || buffer.size() == 0);
+        *device_memory = buffer;
+        buffers_in_result[buffer_index] = true;
+        return Status::OK();
+      }));
 
   // Free all buffers not in the result.
   for (size_t i = 0; i < buffers.size(); ++i) {
@@ -595,10 +513,10 @@ StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
   return std::move(result_buffer);
 }
 
-StatusOr<perftools::gputools::DeviceMemoryBase>
+StatusOr<std::unique_ptr<ShapedBuffer>>
 ParallelCpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) {
+    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   // TODO(b/30671675): Implement asynchronous execution mode.
   return Unimplemented(
       "Asynchronous execution on stream is not yet supported on CPU.");
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
index d65e3f42f3cb34eff005f34b51b81fd5c42974a3..c393e9b8ea39bfb4c605ebba8e2cd29726bc4af9 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
@@ -55,25 +55,18 @@ class ParallelCpuExecutable : public Executable {
       std::unordered_map<const HloInstruction*,
                          std::unique_ptr<unsigned char[]>>
           aligned_constants,
-      std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+      std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
       std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
   ~ParallelCpuExecutable() override {}
 
-  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
-      const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
-      HloExecutionProfile* hlo_execution_profile) override;
-
   StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteAsyncOnStream(
+  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments) override;
+      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
   // This should be called after set_ir_module_string.
   const string& ir_module_string() const { return ir_module_string_; }
@@ -108,13 +101,6 @@ class ParallelCpuExecutable : public Executable {
 
   // Calls the generated functions in 'function_names_', performing the
   // computation with the given arguments using the supplied buffers.
-  Status ExecuteComputeFunctions(
-      const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          buffers,
-      HloExecutionProfile* hlo_execution_profile);
   Status ExecuteComputeFunctions(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1e439cde11cf74272101b80c867a308e51ab26a6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
@@ -0,0 +1,76 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h"
+
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+
+namespace xla {
+namespace cpu {
+
+ParallelLoopEmitter::ParallelLoopEmitter(
+    const llvm_ir::ElementGenerator& target_element_generator,
+    const llvm_ir::IrArray& target_array,
+    const DynamicLoopBounds* dynamic_loop_bounds, llvm::IRBuilder<>* ir_builder)
+    : LoopEmitter(target_element_generator, target_array, ir_builder),
+      dynamic_loop_bounds_(dynamic_loop_bounds) {}
+
+llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
+    tensorflow::StringPiece loop_name) {
+  CHECK(!ShapeUtil::IsTuple(shape_));
+  CHECK(!ShapeUtil::IsScalar(shape_));
+
+  llvm_ir::ForLoopNest loop_nest(loop_name, ir_builder_);
+  const int64 num_dims = shape_.dimensions_size();
+  llvm_ir::IrArray::Index array_index(num_dims);
+
+  // Add loops from outer-most to inner-most dimensions.
+  for (int i = LayoutUtil::MinorToMajor(shape_).size() - 1; i >= 0; --i) {
+    const int64 dimension = LayoutUtil::Minor(shape_.layout(), i);
+    const int bounds_index = num_dims - 1 - i;
+    if (bounds_index < dynamic_loop_bounds_->size()) {
+      // Emit dynamic loop bounds for this dimension. Dynamic loop bounds
+      // are read from ir function dynamic loop bounds argument.
+      llvm::Value* start_index = (*dynamic_loop_bounds_)[bounds_index].first;
+      llvm::Value* end_index = (*dynamic_loop_bounds_)[bounds_index].second;
+
+      std::unique_ptr<llvm_ir::ForLoop> loop = loop_nest.AddLoop(
+          /*suffix=*/tensorflow::strings::Printf("dim.%lld", dimension),
+          start_index, end_index);
+      array_index[dimension] = loop->GetIndVarValue();
+    } else {
+      // Emit static loop bounds for this dimension.
+      std::unique_ptr<llvm_ir::ForLoop> loop = loop_nest.AddLoop(
+          /*start_index=*/0,
+          /*end_index=*/shape_.dimensions(dimension),
+          /*suffix=*/tensorflow::strings::Printf("dim.%lld", dimension));
+      array_index[dimension] = loop->GetIndVarValue();
+    }
+  }
+  // Point IR builder at inner loop BB.
+  llvm_ir::SetToFirstInsertPoint(loop_nest.GetInnerLoopBodyBasicBlock(),
+                                 ir_builder_);
+
+  // Set exit_bb_ to the exit block of the loop nest.
+  exit_bb_ = loop_nest.GetOuterLoopExitBasicBlock();
+  CHECK(exit_bb_ != nullptr);
+
+  return array_index;
+}
+
+}  // namespace cpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce92e36a944de33b991d97460f0b2e859ad56081
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
@@ -0,0 +1,73 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_LOOP_EMITTER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_LOOP_EMITTER_H_
+
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Value.h"
+#include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h"
+
+namespace xla {
+namespace cpu {
+
+// ParallelLoopEmitter emits a loop nest for the target array shape.
+// The outer loop bounds of the loop nest are passed as ir values at runtime
+// (specified in 'dynamic_loop_bounds'), and the inner loop bounds are static.
+// Dynamic loop bounds are specified as an array of dimension index
+// [start, limit) pairs of ir values (one for each partitioned outer dimension).
+//
+// EX: Let 'shape' = [8, 16, 32], with the loop bounds of the two-most major
+//     dimensions dynamic. Then 'dynamic_loop_bounds' will contain the
+//     following ir values for the two most-major dimensions:
+//       [dim0_index_start_ir_value, dim0_index_limit_ir_value]
+//       [dim1_index_start_ir_value, dim1_index_limit_ir_value]
+//
+// Code emitted by ParallelLoopEmitter will be called in a multi-threaded
+// context where each thread will be assigned a different set of outer dimension
+// partitions, and where all threads will collectively iterate over the
+// entire target array shape.
+//
+// Outer dimension partitions can be generated using the ShapePartitionAssigner
+// and ShapePartitionIterator utility classes from shape_partition.cc.
+//
+class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
+ public:
+  // Constructs a ParallelLoopEmitter which uses 'target_element_generator' to
+  // generate elements, 'dynamic_loop_bounds' to set the loop bounds of the
+  // most-major dimensions, and 'target_array.' shape to set the static loop
+  // bounds for the most-minor dimensions.
+  ParallelLoopEmitter(const llvm_ir::ElementGenerator& target_element_generator,
+                      const llvm_ir::IrArray& target_array,
+                      const DynamicLoopBounds* dynamic_loop_bounds,
+                      llvm::IRBuilder<>* ir_builder);
+
+  ParallelLoopEmitter(const ParallelLoopEmitter&) = delete;
+  ParallelLoopEmitter& operator=(const ParallelLoopEmitter&) = delete;
+  ~ParallelLoopEmitter() override = default;
+
+  llvm_ir::IrArray::Index EmitIndexAndSetExitBasicBlock(
+      tensorflow::StringPiece loop_name) override;
+
+ private:
+  const DynamicLoopBounds* dynamic_loop_bounds_;
+};
+
+}  // namespace cpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_LOOP_EMITTER_H_
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
index 4b44ac8941e222d5954121bbb9654062e41f55d6..deb21bf4ef5895cfdbec5c2449b6ce7b306a7008 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
@@ -126,7 +126,7 @@ int64 ParallelTaskAssignment::GetTargetParallelTaskCount(
     HloInstruction* instruction) {
   // Currently, we do not assign parallel tasks to instructions with at least
   // one of the following properties:
-  // *) Internal threading (library calls to kConv, kDot, and kCustomCall).
+  // *) Internal threading (library calls to kConv, kDot, kFft, kCustomCall).
   // *) Emit custom loops (kSelectAndScatter, FusionKind::kTransposeDot).
   // *) Tuple-shaped.
   // TODO(b/27458679) Parallelize instructions which are skipped here.
@@ -137,6 +137,7 @@ int64 ParallelTaskAssignment::GetTargetParallelTaskCount(
       instruction->opcode() == HloOpcode::kSelectAndScatter ||
       instruction->opcode() == HloOpcode::kGetTupleElement ||
       instruction->opcode() == HloOpcode::kBitcast ||
+      instruction->opcode() == HloOpcode::kFft ||
       (instruction->opcode() == HloOpcode::kConvolution &&
        PotentiallyImplementedAsEigenConvolution(*instruction)) ||
       PotentiallyImplementedAsEigenDot(*instruction) ||
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h
index 5801ec8d270cdaed7f2f65c24987a9ea643edb02..7140dabe516cd7ea9260456e994e8b63b68c60d6 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_TASK_ASSIGNMENT_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_TASK_ASSIGNMENT_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_TASK_ASSIGNMENT_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_TASK_ASSIGNMENT_H_
 
 #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -99,4 +99,4 @@ class ParallelTaskAssigner : public HloPassInterface {
 }  // namespace cpu
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_TASK_ASSIGNMENT_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_TASK_ASSIGNMENT_H_
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fft.cc b/tensorflow/compiler/xla/service/cpu/runtime_fft.cc
new file mode 100644
index 0000000000000000000000000000000000000000..848d2d22414e8fc9bca82de90f7676011d8992fd
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/runtime_fft.cc
@@ -0,0 +1,37 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/runtime_fft.h"
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/compiler/xla/executable_run_options.h"
+#include "tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h"
+#include "tensorflow/core/platform/dynamic_annotations.h"
+#include "tensorflow/core/platform/types.h"
+
+using tensorflow::int32;
+using tensorflow::int64;
+
+TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenFft(
+    const void* run_options_ptr, void* out, void* operand, int32 fft_type,
+    int32 fft_rank, int64 input_batch, int64 fft_length0, int64 fft_length1,
+    int64 fft_length2) {
+  const xla::ExecutableRunOptions* run_options =
+      static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
+  tensorflow::xla::EigenFftImpl(*run_options->intra_op_thread_pool(), out,
+                                operand, fft_type, fft_rank, input_batch,
+                                fft_length0, fft_length1, fft_length2);
+}
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fft.h b/tensorflow/compiler/xla/service/cpu/runtime_fft.h
new file mode 100644
index 0000000000000000000000000000000000000000..f20c5aa0aa2dcbc700f47c718e75baae18650d1a
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/runtime_fft.h
@@ -0,0 +1,31 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_FFT_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_FFT_H_
+
+#include "tensorflow/core/platform/types.h"
+
+extern "C" {
+
+extern void __xla_cpu_runtime_EigenFft(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, void* out,
+    void* operand, tensorflow::int32 fft_type, tensorflow::int32 fft_rank,
+    tensorflow::int64 input_batch, tensorflow::int64 fft_length0,
+    tensorflow::int64 fft_length1, tensorflow::int64 fft_length2);
+
+}  // extern "C"
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_FFT_H_
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..984cb0616e02475babad7160d0f43bb23de0b50e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h
@@ -0,0 +1,240 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_FFT_IMPL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_FFT_IMPL_H_
+
+#include <array>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/framework/numeric_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/platform/types.h"
+
+// 'tensorflow' namespace is used so that int64 and other types don't require
+// qualification.
+namespace tensorflow {
+namespace xla {
+
+namespace internal {
+
+// Computes either a forward or reverse complex-to-complex FFT.
+template <bool Forward, int FFTRank, typename EigenDevice>
+void EigenFftC2C(const EigenDevice& device, complex64* out, complex64* operand,
+                 int64 input_batch, int64 fft_length0, int64 fft_length1,
+                 int64 fft_length2) {
+  // Create the axes (which are always trailing).
+  const auto axes = Eigen::ArrayXi::LinSpaced(FFTRank, 1, FFTRank);
+  constexpr auto direction = Forward ? Eigen::FFT_FORWARD : Eigen::FFT_REVERSE;
+
+  const std::array<int64, 3> fft_shape = {
+      {fft_length0, fft_length1, fft_length2}};
+
+  Eigen::DSizes<Eigen::DenseIndex, FFTRank + 1> dims;
+  dims[0] = input_batch;
+  for (int i = 0; i < FFTRank; i++) {
+    dims[i + 1] = fft_shape[i];
+  }
+  const Eigen::TensorMap<Eigen::Tensor<complex64, FFTRank + 1, Eigen::RowMajor>,
+                         Eigen::Aligned>
+      input(operand, dims);
+  Eigen::TensorMap<Eigen::Tensor<complex64, FFTRank + 1, Eigen::RowMajor>,
+                   Eigen::Aligned>
+      output(out, dims);
+  output.device(device) = input.template fft<Eigen::BothParts, direction>(axes);
+}
+
+// Computes a forward real->complex FFT, slicing out redundant negative
+// frequencies from the innermost dimension.
+template <int FFTRank, typename EigenDevice>
+void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand,
+                 int64 input_batch, int64 fft_length0, int64 fft_length1,
+                 int64 fft_length2) {
+  const std::array<int64, 3> fft_shape = {
+      {fft_length0, fft_length1, fft_length2}};
+
+  Eigen::DSizes<Eigen::DenseIndex, FFTRank + 1> in_dims;
+  in_dims[0] = input_batch;
+  Eigen::DSizes<Eigen::DenseIndex, FFTRank + 1> out_dims;
+  out_dims[0] = input_batch;
+  TensorShape temp_shape{input_batch};
+  for (int i = 0; i < FFTRank; i++) {
+    in_dims[i + 1] = fft_shape[i];
+    out_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i];
+    temp_shape.AddDim(fft_shape[i]);
+  }
+  const Eigen::TensorMap<Eigen::Tensor<float, FFTRank + 1, Eigen::RowMajor>,
+                         Eigen::Aligned>
+      input(operand, in_dims);
+  Eigen::TensorMap<Eigen::Tensor<complex64, FFTRank + 1, Eigen::RowMajor>,
+                   Eigen::Aligned>
+      output(out, out_dims);
+
+  // Create the axes (which are always trailing).
+  const auto axes = Eigen::ArrayXi::LinSpaced(FFTRank, 1, FFTRank);
+
+  // Compute the full FFT using a temporary tensor.
+  Tensor temp(DataTypeToEnum<complex64>::v(), temp_shape);
+  auto full_fft = temp.flat_inner_dims<complex64, FFTRank + 1>();
+  const Eigen::DSizes<Eigen::DenseIndex, FFTRank + 1> zero_start_indices;
+  full_fft.device(device) =
+      input.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(axes);
+
+  // Slice away the negative frequency components.
+  output.device(device) = full_fft.slice(zero_start_indices, out_dims);
+}
+
+// Computes a reverse complex->real FFT, reconstructing redundant negative
+// frequencies using reverse conjugate on innermost dimension after doing IFFT
+// on outer dimensions.
+template <int FFTRank, typename EigenDevice>
+void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand,
+                 int64 input_batch, int64 fft_length0, int64 fft_length1,
+                 int64 fft_length2) {
+  const std::array<int64, 3> fft_shape = {
+      {fft_length0, fft_length1, fft_length2}};
+
+  Eigen::DSizes<Eigen::DenseIndex, FFTRank + 1> in_dims;
+  in_dims[0] = input_batch;
+  Eigen::DSizes<Eigen::DenseIndex, FFTRank + 1> out_dims;
+  out_dims[0] = input_batch;
+  TensorShape temp_shape{input_batch};
+  for (int i = 0; i < FFTRank; i++) {
+    in_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i];
+    out_dims[i + 1] = fft_shape[i];
+    temp_shape.AddDim(fft_shape[i]);
+  }
+  const Eigen::TensorMap<Eigen::Tensor<complex64, FFTRank + 1, Eigen::RowMajor>,
+                         Eigen::Aligned>
+      input(operand, in_dims);
+  Eigen::TensorMap<Eigen::Tensor<float, FFTRank + 1, Eigen::RowMajor>,
+                   Eigen::Aligned>
+      output(out, out_dims);
+
+  // Calculate the shape of the temporary tensor for the full FFT and the
+  // region we will slice from input given fft_shape. We slice input to
+  // fft_shape on its inner-most dimensions, except the last (which we
+  // slice to fft_shape[-1] / 2 + 1).
+  Tensor temp(DataTypeToEnum<complex64>::v(), temp_shape);
+  auto full_fft = temp.flat_inner_dims<complex64, FFTRank + 1>();
+
+  // Calculate the starting point and range of the source of
+  // negative frequency part.
+  auto neg_sizes = in_dims;
+  neg_sizes[FFTRank] = fft_shape[FFTRank - 1] - in_dims[FFTRank];
+  Eigen::DSizes<Eigen::DenseIndex, FFTRank + 1> neg_target_indices;
+  neg_target_indices[FFTRank] = in_dims[FFTRank];
+
+  const Eigen::DSizes<Eigen::DenseIndex, FFTRank + 1> zero_start_indices;
+  Eigen::DSizes<Eigen::DenseIndex, FFTRank + 1> neg_start_indices;
+  neg_start_indices[FFTRank] = 1;
+
+  full_fft.slice(zero_start_indices, in_dims).device(device) = input;
+
+  // First, conduct IFFTs on outer dimensions. We save computation (and
+  // avoid touching uninitialized memory) by slicing full_fft to the
+  // subregion we wrote input to.
+  if (FFTRank > 1) {
+    const auto outer_axes =
+        Eigen::ArrayXi::LinSpaced(FFTRank - 1, 1, FFTRank - 1);
+    full_fft.slice(zero_start_indices, in_dims).device(device) =
+        full_fft.slice(zero_start_indices, in_dims)
+            .template fft<Eigen::BothParts, Eigen::FFT_REVERSE>(outer_axes);
+  }
+
+  // Reconstruct the full FFT by appending reversed and conjugated
+  // spectrum as the negative frequency part.
+  Eigen::array<bool, FFTRank + 1> reverse_last_axis;
+  for (auto i = 0; i <= FFTRank; i++) {
+    reverse_last_axis[i] = i == FFTRank;
+  }
+
+  if (neg_sizes[FFTRank] != 0) {
+    full_fft.slice(neg_target_indices, neg_sizes).device(device) =
+        full_fft.slice(neg_start_indices, neg_sizes)
+            .reverse(reverse_last_axis)
+            .conjugate();
+  }
+
+  auto inner_axis = Eigen::array<int, 1>{FFTRank};
+  output.device(device) =
+      full_fft.template fft<Eigen::RealPart, Eigen::FFT_REVERSE>(inner_axis);
+}
+
+template <int FFTRank, typename EigenDevice>
+void EigenFftWithRank(const EigenDevice& device, void* out, void* operand,
+                      int32 fft_type, int64 input_batch, int64 fft_length0,
+                      int64 fft_length1, int64 fft_length2) {
+  CHECK(::xla::FftType_IsValid(fft_type)) << fft_type;
+  switch (fft_type) {
+    case ::xla::FftType::FFT:
+      EigenFftC2C<true, FFTRank, EigenDevice>(
+          device, static_cast<complex64*>(out),
+          static_cast<complex64*>(operand), input_batch, fft_length0,
+          fft_length1, fft_length2);
+      break;
+    case ::xla::FftType::IFFT:
+      EigenFftC2C<false, FFTRank, EigenDevice>(
+          device, static_cast<complex64*>(out),
+          static_cast<complex64*>(operand), input_batch, fft_length0,
+          fft_length1, fft_length2);
+      break;
+    case ::xla::FftType::RFFT:
+      EigenFftR2C<FFTRank, EigenDevice>(
+          device, static_cast<complex64*>(out), static_cast<float*>(operand),
+          input_batch, fft_length0, fft_length1, fft_length2);
+      break;
+    case ::xla::FftType::IRFFT:
+      EigenFftC2R<FFTRank, EigenDevice>(
+          device, static_cast<float*>(out), static_cast<complex64*>(operand),
+          input_batch, fft_length0, fft_length1, fft_length2);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported FFT type: " << fft_type;
+  }
+}
+
+}  // namespace internal
+
+template <typename EigenDevice>
+void EigenFftImpl(const EigenDevice& device, void* out, void* operand,
+                  int32 fft_type, int32 fft_rank, int64 input_batch,
+                  int64 fft_length0, int64 fft_length1, int64 fft_length2) {
+  switch (fft_rank) {
+    case 1:
+      internal::EigenFftWithRank<1, EigenDevice>(
+          device, out, operand, fft_type, input_batch, fft_length0, 0, 0);
+      break;
+    case 2:
+      internal::EigenFftWithRank<2, EigenDevice>(device, out, operand, fft_type,
+                                                 input_batch, fft_length0,
+                                                 fft_length1, 0);
+      break;
+    case 3:
+      internal::EigenFftWithRank<3, EigenDevice>(device, out, operand, fft_type,
+                                                 input_batch, fft_length0,
+                                                 fft_length1, fft_length2);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported FFT rank " << fft_rank;
+  }
+}
+
+}  // namespace xla
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_FFT_IMPL_H_
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fork_join.h b/tensorflow/compiler/xla/service/cpu/runtime_fork_join.h
index fcf1cc62078d3847435a2e75e3ca9d109cf8b200..1cf0ec6e3df400e35fa4e755a0b25b4ce7966e8f 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_fork_join.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_fork_join.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_FORK_JOIN_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_FORK_JOIN_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_FORK_JOIN_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_FORK_JOIN_H_
 
 #include "tensorflow/core/platform/types.h"
 
@@ -30,4 +30,4 @@ extern void __xla_cpu_runtime_ParallelForkJoin(
 
 }  // extern "C"
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_FORK_JOIN_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_FORK_JOIN_H_
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matvec.h b/tensorflow/compiler/xla/service/cpu/runtime_matvec.h
index cb7e0a81f09e2702de565012e1fcac8b7cd841ab..1bd8dfb377acc1f7cfbe9a92773f87f0ef25de3a 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_matvec.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_matvec.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATVEC_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATVEC_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATVEC_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATVEC_H_
 
 #include "tensorflow/core/platform/types.h"
 
@@ -42,4 +42,4 @@ void EigenMatVecF64(double* out, double* lhs, double* rhs, tensorflow::int64 m,
 
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATVEC_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATVEC_H_
diff --git a/tensorflow/compiler/xla/service/cpu/shape_partition.h b/tensorflow/compiler/xla/service/cpu/shape_partition.h
index 7a2d00421cfdc8e41ec48698a16665621de16bda..33d02b70e61e3311c9af934e80874939fbe3adae 100644
--- a/tensorflow/compiler/xla/service/cpu/shape_partition.h
+++ b/tensorflow/compiler/xla/service/cpu/shape_partition.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_SHAPE_PARTITION_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_SHAPE_PARTITION_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_SHAPE_PARTITION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_SHAPE_PARTITION_H_
 
 #include <vector>
 
@@ -102,4 +102,4 @@ class ShapePartitionIterator {
 }  // namespace cpu
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_SHAPE_PARTITION_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_SHAPE_PARTITION_H_
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index cda2783307925b77ac6d8cfe679c5b325db2befc..64d3a51f41676bbb4b59c9d272d22f52a87a0559 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -15,29 +15,28 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
 
-#include <dlfcn.h>
 #include <stdint.h>
 #include <algorithm>
 #include <list>
 #include <utility>
 
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/JITSymbol.h"
 #include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/Host.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
-#include "tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h"
-#include "tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h"
-#include "tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h"
 #include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
 #include "tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_conv2d.h"
+#include "tensorflow/compiler/xla/service/cpu/runtime_fft.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_fork_join.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
+#include "tensorflow/compiler/xla/service/cpu/windows_compatibility.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -46,7 +45,7 @@ namespace cpu {
 namespace {
 
 // A simple SymbolResolver that delegates to the host dynamic linker.
-class SimpleResolver : public llvm::JITSymbolResolver {
+class SimpleResolver : public llvm::LegacyJITSymbolResolver {
  public:
   explicit SimpleResolver(ExternalConstantPool* external_constant_pool)
       : external_constant_pool_(external_constant_pool) {}
@@ -99,15 +98,6 @@ llvm::StringRef GetHostCpuName() {
   cpu_name.consume_back("-avx512");
   return cpu_name;
 }
-
-CompilerFunctor::VectorIntrinsics GetAvailableIntrinsics() {
-  CompilerFunctor::VectorIntrinsics intrinsics;
-  intrinsics.sse_intrinsics = (&__xla_cpu_runtime_ExpV4F32SSE != nullptr);
-  intrinsics.avx_intrinsics = (&__xla_cpu_runtime_ExpV8F32AVX != nullptr);
-  intrinsics.neon_intrinsics = (&__xla_cpu_runtime_ExpV4F32NEON != nullptr);
-  return intrinsics;
-}
-
 }  // namespace
 
 SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options,
@@ -126,34 +116,57 @@ SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options,
                                 /*MAttrs=*/DetectMachineAttributes()))),
       disassembler_(*target_machine_),
       data_layout_(target_machine_->createDataLayout()),
-      object_layer_([] {
-        return std::make_shared<llvm::SectionMemoryManager>(
-            orc_jit_memory_mapper::GetInstance());
-      }),
-      compile_layer_(
-          object_layer_,
-          CompilerFunctor(target_machine_.get(), &disassembler_, opt_level,
-                          optimize_for_size, enable_fast_math,
-                          disable_expensive_passes, GetAvailableIntrinsics(),
-                          std::move(pre_optimization_hook),
-                          std::move(post_optimization_hook))) {
+      execution_session_(string_pool_),
+      symbol_resolver_(llvm::orc::createLegacyLookupResolver(
+          [this](const std::string& name) -> llvm::JITSymbol {
+            if (const uint8* from_constant_pool =
+                    external_constant_pool_.Find(string(name))) {
+              return llvm::JITEvaluatedSymbol(
+                  reinterpret_cast<uint64_t>(from_constant_pool),
+                  llvm::JITSymbolFlags::None);
+            }
+
+            void* func_addr = CustomCallTargetRegistry::Global()->Lookup(name);
+            if (func_addr == nullptr) {
+              return nullptr;
+            }
+            llvm::JITEvaluatedSymbol symbol_info(
+                reinterpret_cast<uint64_t>(func_addr),
+                llvm::JITSymbolFlags::None);
+            return symbol_info;
+          },
+          [](llvm::Error Err) {
+            cantFail(std::move(Err), "lookupFlags failed");
+          })),
+      object_layer_(
+          execution_session_,
+          [](llvm::orc::VModuleKey) {
+            return std::make_shared<llvm::SectionMemoryManager>(
+                orc_jit_memory_mapper::GetInstance());
+          },
+          [this](llvm::orc::VModuleKey K) { return symbol_resolver_; }),
+      compile_layer_(object_layer_,
+                     CompilerFunctor(target_machine_.get(), &disassembler_,
+                                     opt_level, optimize_for_size,
+                                     enable_fast_math, disable_expensive_passes,
+                                     std::move(pre_optimization_hook),
+                                     std::move(post_optimization_hook))) {
   VLOG(1) << "CPU target: " << target_machine_->getTargetCPU().str()
           << " features: " << target_machine_->getTargetFeatureString().str();
 }
 
-SimpleOrcJIT::ModuleHandleT SimpleOrcJIT::AddModule(
+SimpleOrcJIT::VModuleKeyT SimpleOrcJIT::AddModule(
     std::unique_ptr<llvm::Module> module) {
-  auto handle = cantFail(compile_layer_.addModule(
-      std::move(module), MakeUnique<SimpleResolver>(external_constant_pool())));
-  module_handles_.push_back(handle);
-  return handle;
+  auto key = execution_session_.allocateVModule();
+  cantFail(compile_layer_.addModule(key, std::move(module)));
+  module_keys_.push_back(key);
+  return key;
 }
 
-void SimpleOrcJIT::RemoveModule(SimpleOrcJIT::ModuleHandleT handle) {
-  module_handles_.erase(
-      std::remove(module_handles_.begin(), module_handles_.end(), handle),
-      module_handles_.end());
-  cantFail(compile_layer_.removeModule(handle));
+void SimpleOrcJIT::RemoveModule(SimpleOrcJIT::VModuleKeyT key) {
+  module_keys_.erase(std::remove(module_keys_.begin(), module_keys_.end(), key),
+                     module_keys_.end());
+  cantFail(compile_layer_.removeModule(key));
 }
 
 llvm::JITSymbol SimpleOrcJIT::FindSymbol(const std::string& name) {
@@ -165,10 +178,10 @@ llvm::JITSymbol SimpleOrcJIT::FindSymbol(const std::string& name) {
 
   // Resolve symbol from last module to first, allowing later redefinitions of
   // symbols shadow earlier ones.
-  for (auto& handle :
-       llvm::make_range(module_handles_.rbegin(), module_handles_.rend())) {
+  for (auto& key :
+       llvm::make_range(module_keys_.rbegin(), module_keys_.rend())) {
     if (auto symbol =
-            compile_layer_.findSymbolIn(handle, mangled_name,
+            compile_layer_.findSymbolIn(key, mangled_name,
                                         /*ExportedSymbolsOnly=*/true)) {
       return symbol;
     }
@@ -196,17 +209,12 @@ bool RegisterKnownJITSymbols() {
   REGISTER_CPU_RUNTIME_SYMBOL(AcquireInfeedBufferForDequeue);
   REGISTER_CPU_RUNTIME_SYMBOL(AcquireOutfeedBufferForPopulation);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenConvF32);
+  REGISTER_CPU_RUNTIME_SYMBOL(EigenFft);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF64);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF64);
-  REGISTER_CPU_RUNTIME_SYMBOL(ExpV4F32NEON);
-  REGISTER_CPU_RUNTIME_SYMBOL(ExpV4F32SSE);
-  REGISTER_CPU_RUNTIME_SYMBOL(ExpV8F32AVX);
-  REGISTER_CPU_RUNTIME_SYMBOL(LogV4F32NEON);
-  REGISTER_CPU_RUNTIME_SYMBOL(LogV4F32SSE);
-  REGISTER_CPU_RUNTIME_SYMBOL(LogV8F32AVX);
   REGISTER_CPU_RUNTIME_SYMBOL(ParallelForkJoin);
   REGISTER_CPU_RUNTIME_SYMBOL(ReleaseInfeedBufferAfterDequeue);
   REGISTER_CPU_RUNTIME_SYMBOL(ReleaseOutfeedBufferAfterPopulation);
@@ -253,15 +261,15 @@ bool RegisterKnownJITSymbols() {
   REGISTER_LIBM_SYMBOL(ilogb, int (*)(double));
   REGISTER_LIBM_SYMBOL(ldexp, double (*)(double, int));
   REGISTER_LIBM_SYMBOL(lgamma, double (*)(double));
-  REGISTER_LIBM_SYMBOL(llrint, long long (*)(double));
-  REGISTER_LIBM_SYMBOL(llround, long long (*)(double));
+  REGISTER_LIBM_SYMBOL(llrint, long long (*)(double));   // NOLINT(runtime/int)
+  REGISTER_LIBM_SYMBOL(llround, long long (*)(double));  // NOLINT(runtime/int)
   REGISTER_LIBM_SYMBOL(log, double (*)(double));
   REGISTER_LIBM_SYMBOL(log10, double (*)(double));
   REGISTER_LIBM_SYMBOL(log1p, double (*)(double));
   REGISTER_LIBM_SYMBOL(log2, double (*)(double));
   REGISTER_LIBM_SYMBOL(logb, double (*)(double));
-  REGISTER_LIBM_SYMBOL(lrint, long (*)(double));
-  REGISTER_LIBM_SYMBOL(lround, long (*)(double));
+  REGISTER_LIBM_SYMBOL(lrint, long (*)(double));   // NOLINT(runtime/int)
+  REGISTER_LIBM_SYMBOL(lround, long (*)(double));  // NOLINT(runtime/int)
   REGISTER_LIBM_SYMBOL(modf, double (*)(double, double*));
   REGISTER_LIBM_SYMBOL(nan, double (*)(const char*));
   REGISTER_LIBM_SYMBOL(nearbyint, double (*)(double));
@@ -272,10 +280,15 @@ bool RegisterKnownJITSymbols() {
   REGISTER_LIBM_SYMBOL(remquo, double (*)(double, double, int*));
   REGISTER_LIBM_SYMBOL(rint, double (*)(double));
   REGISTER_LIBM_SYMBOL(round, double (*)(double));
-  REGISTER_LIBM_SYMBOL(scalbln, double (*)(double, long));
+  REGISTER_LIBM_SYMBOL(scalbln,
+                       double (*)(double, long));  // NOLINT(runtime/int)
   REGISTER_LIBM_SYMBOL(scalbn, double (*)(double, int));
   REGISTER_LIBM_SYMBOL(sin, double (*)(double));
+#ifdef __APPLE__
+  REGISTER_LIBM_SYMBOL(__sincos, void (*)(double, double*, double*));
+#else
   REGISTER_LIBM_SYMBOL(sincos, void (*)(double, double*, double*));
+#endif
   REGISTER_LIBM_SYMBOL(sinh, double (*)(double));
   REGISTER_LIBM_SYMBOL(sqrt, double (*)(double));
   REGISTER_LIBM_SYMBOL(tan, double (*)(double));
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
index ded01e9e4d7442296f7406dd035e6ab385458238..50993afc8f73617a2c65310ae73b3ab00519f550 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
@@ -21,8 +21,10 @@ limitations under the License.
 #include <vector>
 
 #include "llvm/ADT/Triple.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
 #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
 #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Target/TargetMachine.h"
 #include "tensorflow/compiler/xla/service/cpu/compiler_functor.h"
@@ -48,7 +50,7 @@ class SimpleOrcJIT {
       std::function<llvm::object::OwningBinary<llvm::object::ObjectFile>(
           llvm::Module&)>;
   using CompileLayerT = llvm::orc::IRCompileLayer<ObjLayerT, CompileFtor>;
-  using ModuleHandleT = CompileLayerT::ModuleHandleT;
+  using VModuleKeyT = llvm::orc::VModuleKey;
 
   // Create a new JIT, targeting the host architecture.
   // The |target_options| parameter allows customization of certain code
@@ -78,12 +80,12 @@ class SimpleOrcJIT {
     return target_machine_->getTargetTriple();
   }
 
-  // Add a module to the JIT. Returns an opaque handle that can be used to later
+  // Add a module to the JIT. Returns an opaque key that can be used to later
   // remove this module.
-  ModuleHandleT AddModule(std::unique_ptr<llvm::Module> module);
+  VModuleKeyT AddModule(std::unique_ptr<llvm::Module> module);
 
   // Remove a module from the JIT and free the memory associated with it.
-  void RemoveModule(ModuleHandleT handle);
+  void RemoveModule(VModuleKeyT key);
 
   // Get the runtime address of the compiled symbol whose name is given. Returns
   // nullptr if the symbol cannot be found.
@@ -96,10 +98,13 @@ class SimpleOrcJIT {
   }
 
  private:
-  std::vector<ModuleHandleT> module_handles_;
+  std::vector<VModuleKeyT> module_keys_;
   std::unique_ptr<llvm::TargetMachine> target_machine_;
   const Disassembler disassembler_;
   const llvm::DataLayout data_layout_;
+  llvm::orc::SymbolStringPool string_pool_;
+  llvm::orc::ExecutionSession execution_session_;
+  std::shared_ptr<llvm::orc::SymbolResolver> symbol_resolver_;
   ObjLayerT object_layer_;
   CompileLayerT compile_layer_;
   ExternalConstantPool external_constant_pool_;
diff --git a/tensorflow/compiler/xla/service/cpu/target_machine_features.cc b/tensorflow/compiler/xla/service/cpu/target_machine_features.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eeb049737dddd11ef2ce229df772baec3ac03dd8
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/target_machine_features.cc
@@ -0,0 +1,35 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
+
+namespace xla {
+namespace cpu {
+
+llvm::TargetTransformInfo* TargetMachineFeatures::GetTargetTransformInfoFor(
+    const llvm::Function& function) const {
+  auto it = target_transform_info_cache_.find(&function);
+  if (it == target_transform_info_cache_.end()) {
+    auto emplace_result = target_transform_info_cache_.emplace(
+        &function, target_machine_->getTargetTransformInfo(function));
+    CHECK(emplace_result.second);
+    it = emplace_result.first;
+  }
+
+  return &it->second;
+}
+
+}  // namespace cpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/target_machine_features.h b/tensorflow/compiler/xla/service/cpu/target_machine_features.h
new file mode 100644
index 0000000000000000000000000000000000000000..703942615e552dccde7ddec8c8b90e8a486652af
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/target_machine_features.h
@@ -0,0 +1,84 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TARGET_MACHINE_FEATURES_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TARGET_MACHINE_FEATURES_H_
+
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+
+namespace xla {
+namespace cpu {
+
+// Wraps an llvm::TargetMachine and parses out some information that feeds into
+// LLVM IR code generation decisions.
+class TargetMachineFeatures {
+ public:
+  static constexpr int kX86AvxVectorByteSize = 32;
+
+  TargetMachineFeatures(llvm::TargetMachine* target_machine)
+      : target_machine_(target_machine) {}
+
+  // Return the vectorization factor, which is the number of bytes of data
+  // explicitly vectorized routines will try to process at once.
+  int vectorization_factor_in_bytes() const {
+    // Ideally this should be a function of the cache line size (which we can
+    // get from llvm::TargetTransformInfo::getCacheLineSize) of the target
+    // machine.  Guess a value of 128 bytes for now.
+    return 128;
+  }
+
+  // Return the size of the largest vector size in bytes.  We need to pass in
+  // "function" since llvm functions can contain annotations for specializing
+  // them to specific micro-architectures (though currently XLA does not use
+  // this functionality).
+  int vector_register_byte_size(const llvm::Function& function) const {
+    llvm::TargetTransformInfo* tti = GetTargetTransformInfoFor(function);
+    return tti->getRegisterBitWidth(/*Vector=*/true) / 8;
+  }
+
+  // Return the number of elements of type `type` that can fit into the largest
+  // vector register available.  We need to pass in "function" since llvm
+  // functions can contain annotations for specializing them to specific
+  // micro-architectures (though currently XLA does not use this functionality).
+  int vector_register_num_elements(const llvm::Function& function,
+                                   PrimitiveType type) const {
+    return vector_register_byte_size(function) /
+           (primitive_util::BitWidth(type) / 8);
+  }
+
+ private:
+  llvm::TargetTransformInfo* GetTargetTransformInfoFor(
+      const llvm::Function& function) const;
+
+  // This cache saves us from having to create a llvm::TargetTransformInfo for
+  // every call to GetTargetTransformInfoFor (creating a TargetTransformInfo
+  // costs one heap allocation on X86).
+  //
+  // This is mutated from within `GetTargetTransformInfoFor` which is
+  // semantically a getter (and thus `const`); and is therefore declared
+  // mutable.  Making this mutable is okay because it has cache semantics.
+  mutable tensorflow::gtl::FlatMap<const llvm::Function*,
+                                   llvm::TargetTransformInfo>
+      target_transform_info_cache_;
+  llvm::TargetMachine* target_machine_;
+};
+
+}  // namespace cpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TARGET_MACHINE_FEATURES_H_
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
new file mode 100644
index 0000000000000000000000000000000000000000..150db1cb6edec1af6724a8bca6a5f6272f1a7416
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
@@ -0,0 +1,424 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/vector_support_library.h"
+
+#include "llvm/Support/raw_ostream.h"
+#include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+
+namespace xla {
+namespace cpu {
+VectorSupportLibrary::VectorSupportLibrary(PrimitiveType primitive_type,
+                                           int64 vector_size,
+                                           llvm::IRBuilder<>* ir_builder,
+                                           std::string name)
+    : vector_size_(vector_size),
+      primitive_type_(primitive_type),
+      ir_builder_(ir_builder),
+      name_(std::move(name)) {
+  scalar_type_ = llvm_ir::PrimitiveTypeToIrType(
+      primitive_type, ir_builder_->GetInsertBlock()->getModule());
+  scalar_pointer_type_ = llvm::PointerType::getUnqual(scalar_type_);
+  vector_type_ = llvm::VectorType::get(scalar_type_, vector_size);
+  vector_pointer_type_ = llvm::PointerType::getUnqual(vector_type_);
+}
+
+static string TypeToString(llvm::Type* type) {
+  std::string o;
+  llvm::raw_string_ostream ostream(o);
+  type->print(ostream);
+  return ostream.str();
+}
+
+void VectorSupportLibrary::AssertCorrectTypes(
+    std::initializer_list<llvm::Value*> values) {
+  for (llvm::Value* v : values) {
+    llvm::Type* type = v->getType();
+    if (type != scalar_type() && type != vector_type()) {
+      LOG(FATAL) << "Expected either " << TypeToString(scalar_type()) << " or "
+                 << TypeToString(vector_type()) << " but got "
+                 << TypeToString(type);
+    }
+  }
+}
+
+llvm::Value* VectorSupportLibrary::Mul(llvm::Value* lhs, llvm::Value* rhs) {
+  AssertCorrectTypes({lhs, rhs});
+  return MulInternal(lhs, rhs);
+}
+
+llvm::Value* VectorSupportLibrary::MulInternal(llvm::Value* lhs,
+                                               llvm::Value* rhs) {
+  if (scalar_type_->isFloatingPointTy()) {
+    return ir_builder()->CreateFMul(lhs, rhs, name());
+  } else {
+    return ir_builder()->CreateMul(lhs, rhs, name());
+  }
+}
+
+llvm::Value* VectorSupportLibrary::Add(llvm::Value* lhs, llvm::Value* rhs) {
+  AssertCorrectTypes({lhs, rhs});
+  return AddInternal(lhs, rhs);
+}
+
+llvm::Value* VectorSupportLibrary::Sub(llvm::Value* lhs, llvm::Value* rhs) {
+  AssertCorrectTypes({lhs, rhs});
+  return ir_builder()->CreateFSub(lhs, rhs);
+}
+
+llvm::Value* VectorSupportLibrary::Max(llvm::Value* lhs, llvm::Value* rhs) {
+  AssertCorrectTypes({lhs, rhs});
+  if (scalar_type_->isFloatingPointTy()) {
+    return llvm_ir::EmitFloatMax(lhs, rhs, ir_builder_);
+  } else {
+    LOG(FATAL) << "Max for integers is unimplemented";
+  }
+}
+
+llvm::Value* VectorSupportLibrary::Floor(llvm::Value* a) {
+  AssertCorrectTypes({a});
+  return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::floor, {a},
+                                      {a->getType()}, ir_builder());
+}
+
+llvm::Value* VectorSupportLibrary::Div(llvm::Value* lhs, llvm::Value* rhs) {
+  AssertCorrectTypes({lhs, rhs});
+  if (scalar_type_->isFloatingPointTy()) {
+    return ir_builder()->CreateFDiv(lhs, rhs, name());
+  } else {
+    LOG(FATAL) << "Division for integers is unimplemented";
+  }
+}
+
+llvm::Value* VectorSupportLibrary::Clamp(llvm::Value* a,
+                                         const llvm::APFloat& low,
+                                         const llvm::APFloat& high) {
+  AssertCorrectTypes({a});
+  llvm::Type* type = a->getType();
+  CHECK(low.compare(high) == llvm::APFloat::cmpLessThan);
+  CHECK(scalar_type_->isFloatingPointTy());
+  return llvm_ir::EmitFloatMin(
+      llvm_ir::EmitFloatMax(a, GetConstantFloat(type, low), ir_builder_),
+      GetConstantFloat(type, high), ir_builder_);
+}
+
+llvm::Value* VectorSupportLibrary::FCmpEQMask(llvm::Value* lhs,
+                                              llvm::Value* rhs) {
+  AssertCorrectTypes({lhs, rhs});
+  return I1ToFloat(ir_builder()->CreateFCmpOEQ(lhs, rhs, name()));
+}
+
+llvm::Value* VectorSupportLibrary::FCmpOLTMask(llvm::Value* lhs,
+                                               llvm::Value* rhs) {
+  AssertCorrectTypes({lhs, rhs});
+  return I1ToFloat(ir_builder()->CreateFCmpOLT(lhs, rhs, name()));
+}
+
+llvm::Value* VectorSupportLibrary::FCmpULEMask(llvm::Value* lhs,
+                                               llvm::Value* rhs) {
+  AssertCorrectTypes({lhs, rhs});
+  return I1ToFloat(ir_builder()->CreateFCmpULE(lhs, rhs, name()));
+}
+
+llvm::Value* VectorSupportLibrary::I1ToFloat(llvm::Value* i1) {
+  bool is_vector = llvm::isa<llvm::VectorType>(i1->getType());
+  llvm::Type* integer_type = IntegerTypeForFloatSize(is_vector);
+  return ir_builder()->CreateBitCast(
+      ir_builder()->CreateSExt(i1, integer_type, name()),
+      is_vector ? vector_type() : scalar_type(), name());
+}
+
+llvm::Type* VectorSupportLibrary::IntegerTypeForFloatSize(bool vector) {
+  CHECK(scalar_type()->isFloatingPointTy());
+  const llvm::DataLayout& data_layout =
+      ir_builder()->GetInsertBlock()->getModule()->getDataLayout();
+  int64 float_size_bits = data_layout.getTypeSizeInBits(scalar_type());
+  llvm::Type* scalar_int_type = ir_builder()->getIntNTy(float_size_bits);
+  if (vector) {
+    return llvm::VectorType::get(scalar_int_type, vector_size());
+  } else {
+    return scalar_int_type;
+  }
+}
+
+llvm::Value* VectorSupportLibrary::BroadcastScalar(llvm::Value* x) {
+  CHECK_EQ(x->getType(), scalar_type());
+  return ir_builder()->CreateVectorSplat(vector_size(), x, name());
+}
+
+llvm::Value* VectorSupportLibrary::FloatAnd(llvm::Value* lhs,
+                                            llvm::Value* rhs) {
+  AssertCorrectTypes({lhs, rhs});
+  llvm::Type* int_type =
+      IntegerTypeForFloatSize(lhs->getType() == vector_type());
+  return ir_builder()->CreateBitCast(
+      ir_builder()->CreateAnd(
+          ir_builder()->CreateBitCast(lhs, int_type, name()),
+          ir_builder()->CreateBitCast(rhs, int_type, name()), name()),
+      vector_type());
+}
+
+llvm::Value* VectorSupportLibrary::FloatNot(llvm::Value* lhs) {
+  AssertCorrectTypes({lhs});
+  llvm::Type* int_type =
+      IntegerTypeForFloatSize(lhs->getType() == vector_type());
+  return ir_builder()->CreateBitCast(
+      ir_builder()->CreateNot(
+          ir_builder()->CreateBitCast(lhs, int_type, name()), name()),
+      vector_type());
+}
+
+llvm::Value* VectorSupportLibrary::FloatOr(llvm::Value* lhs, llvm::Value* rhs) {
+  AssertCorrectTypes({lhs, rhs});
+  llvm::Type* int_type =
+      IntegerTypeForFloatSize(lhs->getType() == vector_type());
+  return ir_builder()->CreateBitCast(
+      ir_builder()->CreateOr(ir_builder()->CreateBitCast(lhs, int_type, name()),
+                             ir_builder()->CreateBitCast(rhs, int_type, name()),
+                             name()),
+      vector_type(), name());
+}
+
+llvm::Value* VectorSupportLibrary::AddInternal(llvm::Value* lhs,
+                                               llvm::Value* rhs) {
+  if (scalar_type_->isFloatingPointTy()) {
+    return ir_builder()->CreateFAdd(lhs, rhs, name());
+  } else {
+    return ir_builder()->CreateAdd(lhs, rhs, name());
+  }
+}
+
+llvm::Value* VectorSupportLibrary::ComputeOffsetPointer(
+    llvm::Value* base_pointer, llvm::Value* offset_elements) {
+  if (base_pointer->getType() != scalar_pointer_type()) {
+    base_pointer = ir_builder()->CreateBitCast(base_pointer,
+                                               scalar_pointer_type(), name());
+  }
+  return ir_builder()->CreateInBoundsGEP(base_pointer, {offset_elements},
+                                         name());
+}
+
+llvm::Value* VectorSupportLibrary::LoadVector(llvm::Value* pointer) {
+  if (pointer->getType() != vector_pointer_type()) {
+    pointer =
+        ir_builder()->CreateBitCast(pointer, vector_pointer_type(), name());
+  }
+  return ir_builder()->CreateAlignedLoad(
+      pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_), name());
+}
+
+llvm::Value* VectorSupportLibrary::LoadScalar(llvm::Value* pointer) {
+  if (pointer->getType() != scalar_pointer_type()) {
+    pointer =
+        ir_builder()->CreateBitCast(pointer, scalar_pointer_type(), name());
+  }
+  return ir_builder()->CreateAlignedLoad(
+      pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_), name());
+}
+
+void VectorSupportLibrary::StoreVector(llvm::Value* value,
+                                       llvm::Value* pointer) {
+  AssertCorrectTypes({value});
+  if (pointer->getType() != vector_pointer_type()) {
+    pointer = ir_builder()->CreateBitCast(pointer, vector_pointer_type());
+  }
+  ir_builder()->CreateAlignedStore(
+      value, pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_));
+}
+
+void VectorSupportLibrary::StoreScalar(llvm::Value* value,
+                                       llvm::Value* pointer) {
+  AssertCorrectTypes({value});
+  if (pointer->getType() != scalar_pointer_type()) {
+    pointer =
+        ir_builder()->CreateBitCast(pointer, scalar_pointer_type(), name());
+  }
+  ir_builder()->CreateAlignedStore(
+      value, pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_));
+}
+
+llvm::Value* VectorSupportLibrary::LoadBroadcast(llvm::Value* pointer) {
+  if (pointer->getType() != scalar_pointer_type()) {
+    pointer =
+        ir_builder()->CreateBitCast(pointer, scalar_pointer_type(), name());
+  }
+  return ir_builder()->CreateVectorSplat(
+      vector_size(), ir_builder()->CreateLoad(pointer), name());
+}
+
+llvm::Value* VectorSupportLibrary::AddReduce(llvm::Value* vector) {
+  llvm::SmallVector<llvm::Constant*, 32> mask(vector_size(), nullptr);
+  for (unsigned i = vector_size(); i != 1; i >>= 1) {
+    // On every iteration, we shuffle half of the remaining lanes to the top
+    // half of shuffle, and add two old and the new vector.
+
+    for (unsigned j = 0; j < vector_size(); ++j) {
+      if (j < (i / 2)) {
+        mask[j] = ir_builder()->getInt32(i / 2 + j);
+      } else {
+        mask[j] = llvm::UndefValue::get(ir_builder()->getInt32Ty());
+      }
+    }
+
+    llvm::Value* half_remaining_lanes = ir_builder()->CreateShuffleVector(
+        vector, llvm::UndefValue::get(vector_type()),
+        llvm::ConstantVector::get(mask), "");
+    vector = Add(vector, half_remaining_lanes);
+  }
+
+  return ir_builder()->CreateExtractElement(vector, ir_builder()->getInt32(0),
+                                            name());
+}
+
+llvm::Value* VectorSupportLibrary::AvxStyleHorizontalAdd(llvm::Value* lhs,
+                                                         llvm::Value* rhs) {
+  CHECK_EQ(lhs->getType(), vector_type());
+  CHECK_EQ(rhs->getType(), vector_type());
+  CHECK_EQ(vector_size() % 2, 0);
+
+  llvm::SmallVector<llvm::Constant*, 32> mask_a, mask_b;
+
+  // Adding the values shuffled using mask_a and mask_b gives us the
+  // AVX-style horizontal add we want.  The masks work as documented
+  // in https://llvm.org/docs/LangRef.html#shufflevector-instruction
+  //
+  // Here are the masks for vector_width() == 8:
+  //
+  //    index: |0 |1 |2 | 3 |4 |5 | 6 | 7
+  //   --------+--+--+--+---+--+--+---+---
+  //   mask_a: |0 |2 |8 |10 |4 |6 |12 |14
+  //   mask_b: |1 |3 |9 |11 |5 |7 |13 |16
+  //
+  // So, as an example, the value at lane 3 of the result vector is
+  // the result of adding lane 10 and lane 11 in the combined lhs++rhs
+  // vector, which are the lanes 2 and 3 in the rhs vector.
+  for (int i = 0; i < vector_size(); i += 2) {
+    int increment = i < vector_size() / 2 ? 0 : (vector_size() / 2);
+    mask_a.push_back(ir_builder()->getInt32(increment + i));
+    mask_b.push_back(ir_builder()->getInt32(increment + i + 1));
+  }
+  for (int i = 0; i < vector_size(); i += 2) {
+    int increment = i < vector_size() / 2 ? (vector_size() / 2) : vector_size();
+    mask_a.push_back(ir_builder()->getInt32(increment + i));
+    mask_b.push_back(ir_builder()->getInt32(increment + i + 1));
+  }
+
+  llvm::Value* shuffle_0 = ir_builder()->CreateShuffleVector(
+      lhs, rhs, llvm::ConstantVector::get(mask_a));
+  llvm::Value* shuffle_1 = ir_builder()->CreateShuffleVector(
+      lhs, rhs, llvm::ConstantVector::get(mask_b));
+
+  return Add(shuffle_0, shuffle_1);
+}
+
+llvm::Value* VectorSupportLibrary::ExtractLowHalf(llvm::Value* vector) {
+  llvm::SmallVector<llvm::Constant*, 32> mask;
+  for (int i = 0; i < vector_size() / 2; i++) {
+    mask.push_back(ir_builder()->getInt32(i));
+  }
+
+  return ir_builder()->CreateShuffleVector(vector,
+                                           llvm::UndefValue::get(vector_type()),
+                                           llvm::ConstantVector::get(mask));
+}
+
+llvm::Value* VectorSupportLibrary::ExtractHighHalf(llvm::Value* vector) {
+  llvm::SmallVector<llvm::Constant*, 32> mask;
+  for (int i = 0; i < vector_size() / 2; i++) {
+    mask.push_back(ir_builder()->getInt32(i + vector_size() / 2));
+  }
+
+  return ir_builder()->CreateShuffleVector(vector,
+                                           llvm::UndefValue::get(vector_type()),
+                                           llvm::ConstantVector::get(mask));
+}
+
+std::vector<llvm::Value*> VectorSupportLibrary::ComputeHorizontalSums(
+    std::vector<llvm::Value*> vectors, llvm::Value* init_values) {
+  const int x86_avx_vector_elements =
+      TargetMachineFeatures::kX86AvxVectorByteSize / scalar_byte_size();
+  if (vector_size() == x86_avx_vector_elements &&
+      vectors.size() == x86_avx_vector_elements) {
+    return ComputeAvxOptimizedHorizontalSums(std::move(vectors), init_values);
+  }
+
+  std::vector<llvm::Value*> result;
+  std::transform(vectors.begin(), vectors.end(), std::back_inserter(result),
+                 [this](llvm::Value* vector) { return AddReduce(vector); });
+  if (init_values) {
+    for (int64 i = 0, e = result.size(); i < e; i++) {
+      result[i] = Add(result[i], ir_builder()->CreateExtractElement(
+                                     init_values, ir_builder()->getInt32(i)));
+    }
+  }
+  return result;
+}
+
+std::vector<llvm::Value*>
+VectorSupportLibrary::ComputeAvxOptimizedHorizontalSums(
+    std::vector<llvm::Value*> vectors, llvm::Value* init_values) {
+  while (vectors.size() != 2) {
+    std::vector<llvm::Value*> new_vectors;
+    for (int i = 0; i < vectors.size(); i += 2) {
+      new_vectors.push_back(AvxStyleHorizontalAdd(vectors[i], vectors[i + 1]));
+    }
+
+    vectors = std::move(new_vectors);
+  }
+
+  llvm::Value* low =
+      AddInternal(ExtractLowHalf(vectors[0]), ExtractHighHalf(vectors[0]));
+  if (init_values) {
+    low = AddInternal(ExtractLowHalf(init_values), low);
+  }
+  llvm::Value* high =
+      AddInternal(ExtractLowHalf(vectors[1]), ExtractHighHalf(vectors[1]));
+  if (init_values) {
+    high = AddInternal(ExtractHighHalf(init_values), high);
+  }
+
+  std::vector<llvm::Value*> results;
+  for (int i = 0; i < 8; i++) {
+    llvm::Value* scalar_result = ir_builder()->CreateExtractElement(
+        i < 4 ? low : high, ir_builder()->getInt32(i % 4), name());
+    results.push_back(scalar_result);
+  }
+
+  return results;
+}
+
+llvm::Value* VectorSupportLibrary::GetZeroVector() {
+  return llvm::Constant::getNullValue(vector_type());
+}
+
+llvm::Value* VectorSupportLibrary::GetZeroScalar() {
+  return llvm::Constant::getNullValue(scalar_type());
+}
+
+LlvmVariable::LlvmVariable(llvm::Type* type, llvm::IRBuilder<>* ir_builder)
+    : ir_builder_(ir_builder) {
+  alloca_ = llvm_ir::EmitAllocaAtFunctionEntry(type, "", ir_builder_);
+}
+
+llvm::Value* LlvmVariable::Get() const {
+  return ir_builder_->CreateLoad(alloca_);
+}
+
+void LlvmVariable::Set(llvm::Value* new_value) {
+  ir_builder_->CreateStore(new_value, alloca_);
+}
+}  // namespace cpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.h b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
new file mode 100644
index 0000000000000000000000000000000000000000..6479bf76aab581ae3ec2923d98dab53720cab203
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
@@ -0,0 +1,317 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_VECTOR_SUPPORT_LIBRARY_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_VECTOR_SUPPORT_LIBRARY_H_
+
+#include <string>
+
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Value.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+namespace cpu {
+
+// Simple wrappers around llvm::APFloat::APFloat to make the calling code more
+// obvious.
+
+inline llvm::APFloat GetIeeeF32(float f) { return llvm::APFloat(f); }
+inline llvm::APFloat GetIeeeF32FromBitwiseRep(int32 bitwise_value) {
+  return llvm::APFloat(llvm::APFloat::IEEEsingle(),
+                       llvm::APInt(/*numBits=*/32, /*val=*/bitwise_value));
+}
+
+// A thin wrapper around llvm_util.h to make code generating vector math flow
+// more readable.
+class VectorSupportLibrary {
+ public:
+  // This VectorSupportLibrary instance remembers `primitive_type` and
+  // `vector_size`, and these are implicitly used by the methods on this
+  // instance (i.e. LoadVector will load a vector of type <`vector_size` x
+  // `primitive_type`>).
+  VectorSupportLibrary(PrimitiveType primitive_type, int64 vector_size,
+                       llvm::IRBuilder<>* ir_builder, std::string name);
+
+  llvm::Value* Mul(llvm::Value* lhs, llvm::Value* rhs);
+  llvm::Value* Mul(int64 lhs, llvm::Value* rhs) {
+    return Mul(ir_builder()->getInt64(lhs), rhs);
+  }
+  llvm::Value* Mul(const llvm::APFloat& lhs, llvm::Value* rhs) {
+    return Mul(GetConstantFloat(rhs->getType(), lhs), rhs);
+  }
+
+  // If your call resolved to these then you probably wanted the versions taking
+  // APFloat.
+  llvm::Value* Mul(double lhs, llvm::Value* rhs) = delete;
+  llvm::Value* Mul(float lhs, llvm::Value* rhs) = delete;
+
+  llvm::Value* Add(llvm::Value* lhs, llvm::Value* rhs);
+  llvm::Value* Add(int64 lhs, llvm::Value* rhs) {
+    return Add(ir_builder()->getInt64(lhs), rhs);
+  }
+  llvm::Value* Add(const llvm::APFloat& lhs, llvm::Value* rhs) {
+    return Add(GetConstantFloat(rhs->getType(), lhs), rhs);
+  }
+
+  // If your call resolved to these then you probably wanted the versions taking
+  // APFloat.
+  llvm::Value* Add(double lhs, llvm::Value* rhs) = delete;
+  llvm::Value* Add(float lhs, llvm::Value* rhs) = delete;
+
+  llvm::Value* Sub(llvm::Value* lhs, llvm::Value* rhs);
+  llvm::Value* Sub(llvm::Value* lhs, const llvm::APFloat& rhs) {
+    return Sub(lhs, GetConstantFloat(lhs->getType(), rhs));
+  }
+  llvm::Value* Max(llvm::Value* lhs, llvm::Value* rhs);
+  llvm::Value* Max(const llvm::APFloat& lhs, llvm::Value* rhs) {
+    return Max(GetConstantFloat(rhs->getType(), lhs), rhs);
+  }
+  llvm::Value* Div(llvm::Value* lhs, llvm::Value* rhs);
+
+  llvm::Value* MulAdd(llvm::Value* a, llvm::Value* b, llvm::Value* c) {
+    return Add(c, Mul(a, b));
+  }
+
+  llvm::Value* MulAdd(llvm::Value* a, llvm::Value* b, const llvm::APFloat& c) {
+    return Add(GetConstantFloat(vector_type(), c), Mul(a, b));
+  }
+
+  llvm::Value* MulAdd(llvm::Value* a, const llvm::APFloat& b,
+                      const llvm::APFloat& c) {
+    return Add(GetConstantFloat(a->getType(), c),
+               Mul(a, GetConstantFloat(a->getType(), b)));
+  }
+
+  llvm::Value* Floor(llvm::Value* a);
+
+  llvm::Value* Clamp(llvm::Value* a, const llvm::APFloat& low,
+                     const llvm::APFloat& high);
+  llvm::Value* SplatFloat(const llvm::APFloat& d) {
+    return GetConstantFloat(vector_type(), d);
+  }
+
+  // These compare instructions return a floating point typed mask instead of an
+  // i1.  For instance, on a vector typed input, lanes where the predicate is
+  // true get a float with all ones and other lanes get a float with all zeros.
+  // This is slightly odd from the perspective of LLVM's type system, but it
+  // makes kernel IR generation code written using VectorSupportLibrary (its
+  // raison d'etre) less cluttered.
+
+  llvm::Value* FCmpEQMask(llvm::Value* lhs, llvm::Value* rhs);
+  llvm::Value* FCmpULEMask(llvm::Value* lhs, llvm::Value* rhs);
+  llvm::Value* FCmpOLTMask(llvm::Value* lhs, llvm::Value* rhs);
+  llvm::Value* FCmpOLTMask(llvm::Value* lhs, const llvm::APFloat& rhs) {
+    return FCmpOLTMask(lhs, GetConstantFloat(lhs->getType(), rhs));
+  }
+
+  // These boolean operations operate on the bitwise values of the floating
+  // point inputs.  They return a (vector of) float(s) but like in the mask
+  // generating predicates above this type system oddity makes the kernel IR
+  // generation code less cluttered.
+  llvm::Value* FloatAnd(llvm::Value* lhs, llvm::Value* rhs);
+  llvm::Value* FloatAnd(llvm::Value* lhs, const llvm::APFloat& rhs) {
+    return FloatAnd(lhs, GetConstantFloat(lhs->getType(), rhs));
+  }
+  llvm::Value* FloatOr(llvm::Value* lhs, llvm::Value* rhs);
+  llvm::Value* FloatOr(llvm::Value* lhs, const llvm::APFloat& rhs) {
+    return FloatOr(lhs, GetConstantFloat(lhs->getType(), rhs));
+  }
+  llvm::Value* FloatNot(llvm::Value* lhs);
+  llvm::Value* FloatAndNot(llvm::Value* lhs, llvm::Value* rhs) {
+    return FloatAnd(FloatNot(lhs), rhs);
+  }
+
+  llvm::Value* BroadcastScalar(llvm::Value* x);
+  llvm::Value* BroadcastScalar(const llvm::APFloat& d) {
+    return BroadcastScalar(GetConstantFloat(scalar_type(), d));
+  }
+
+  llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer,
+                                    llvm::Value* offset_elements);
+  llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer,
+                                    int64 offset_elements) {
+    return ComputeOffsetPointer(base_pointer,
+                                ir_builder()->getInt64(offset_elements));
+  }
+
+  llvm::Value* LoadVector(llvm::Value* pointer);
+
+  llvm::Value* LoadVector(llvm::Value* base_pointer,
+                          llvm::Value* offset_elements) {
+    return LoadVector(ComputeOffsetPointer(base_pointer, offset_elements));
+  }
+
+  llvm::Value* LoadVector(llvm::Value* base_pointer, int64 offset_elements) {
+    return LoadVector(base_pointer, ir_builder()->getInt64(offset_elements));
+  }
+
+  llvm::Value* LoadScalar(llvm::Value* pointer);
+
+  llvm::Value* LoadScalar(llvm::Value* base_pointer,
+                          llvm::Value* offset_elements) {
+    return LoadScalar(ComputeOffsetPointer(base_pointer, offset_elements));
+  }
+
+  llvm::Value* LoadScalar(llvm::Value* base_pointer, int64 offset_elements) {
+    return LoadScalar(base_pointer, ir_builder()->getInt64(offset_elements));
+  }
+
+  void StoreVector(llvm::Value* value, llvm::Value* pointer);
+
+  void StoreVector(llvm::Value* value, llvm::Value* base_pointer,
+                   llvm::Value* offset_elements) {
+    StoreVector(value, ComputeOffsetPointer(base_pointer, offset_elements));
+  }
+
+  void StoreVector(llvm::Value* value, llvm::Value* base_pointer,
+                   int64 offset_elements) {
+    StoreVector(value, base_pointer, ir_builder()->getInt64(offset_elements));
+  }
+
+  void StoreScalar(llvm::Value* value, llvm::Value* pointer);
+  void StoreScalar(llvm::Value* value, llvm::Value* base_pointer,
+                   llvm::Value* offset_elements) {
+    StoreScalar(value, ComputeOffsetPointer(base_pointer, offset_elements));
+  }
+
+  void StoreScalar(llvm::Value* value, llvm::Value* base_pointer,
+                   int64 offset_elements) {
+    StoreScalar(base_pointer, ir_builder()->getInt64(offset_elements));
+  }
+
+  llvm::Value* LoadBroadcast(llvm::Value* pointer);
+  llvm::Value* LoadBroadcast(llvm::Value* base_pointer,
+                             llvm::Value* offset_elements) {
+    return LoadBroadcast(ComputeOffsetPointer(base_pointer, offset_elements));
+  }
+  llvm::Value* LoadBroadcast(llvm::Value* base_pointer, int64 offset_elements) {
+    return LoadBroadcast(base_pointer, ir_builder()->getInt64(offset_elements));
+  }
+
+  // Compute the horizontal sum of each vector in `vectors`.  The i'th element
+  // in the result vector is the (scalar) horizontal sum of the i'th vector in
+  // `vectors`.  If `init_values` is not nullptr then the value in the i'th lane
+  // in `init_values` is added to the i'th horizontal sum.
+  std::vector<llvm::Value*> ComputeHorizontalSums(
+      std::vector<llvm::Value*> vectors, llvm::Value* init_values = nullptr);
+
+  llvm::Value* GetZeroVector();
+  llvm::Value* GetZeroScalar();
+
+  llvm::IRBuilder<>* ir_builder() const { return ir_builder_; }
+  int64 vector_size() const { return vector_size_; }
+  llvm::Type* vector_type() const { return vector_type_; }
+  llvm::Type* vector_pointer_type() const { return vector_pointer_type_; }
+  llvm::Type* scalar_type() const { return scalar_type_; }
+  llvm::Type* scalar_pointer_type() const { return scalar_pointer_type_; }
+  int64 scalar_byte_size() const {
+    return primitive_util::BitWidth(primitive_type_) / 8;
+  }
+
+  const std::string& name() const { return name_; }
+
+ private:
+  llvm::Value* ExtractLowHalf(llvm::Value*);
+  llvm::Value* ExtractHighHalf(llvm::Value*);
+
+  llvm::Value* MulInternal(llvm::Value* lhs, llvm::Value* rhs);
+  llvm::Value* AddInternal(llvm::Value* lhs, llvm::Value* rhs);
+
+  llvm::Value* AddReduce(llvm::Value* vector);
+
+  // Checks that each value in `values` is either of type scalar_type() or
+  // vector_type().  This LOG(FATAL)'s so it should only be called in cases
+  // where a mismatching type is a programmer bug.
+  void AssertCorrectTypes(std::initializer_list<llvm::Value*> values);
+
+  // Perform an X86 AVX style horizontal add between `lhs` and `rhs`.  The
+  // resulting IR for an 8-float wide vector is expected to lower to a single
+  // vhaddps instruction on a CPU that supports vhaddps, and not be too bad in
+  // other cases.
+  //
+  // For a vector width of 8, the result vector is computed as:
+  //   Result[0] = Lhs[0] + Lhs[1]
+  //   Result[1] = Lhs[2] + Lhs[3]
+  //   Result[2] = Rhs[0] + Rhs[1]
+  //   Result[3] = Rhs[2] + Rhs[3]
+  //   Result[4] = Lhs[4] + Lhs[5]
+  //   Result[5] = Lhs[6] + Lhs[7]
+  //   Result[6] = Rhs[4] + Rhs[5]
+  //   Result[7] = Rhs[6] + Rhs[7]
+  llvm::Value* AvxStyleHorizontalAdd(llvm::Value* lhs, llvm::Value* rhs);
+
+  std::vector<llvm::Value*> ComputeAvxOptimizedHorizontalSums(
+      std::vector<llvm::Value*> vectors, llvm::Value* init_values);
+
+  llvm::Type* IntegerTypeForFloatSize(bool vector);
+  llvm::Value* I1ToFloat(llvm::Value* i1);
+  llvm::Value* GetConstantFloat(llvm::Type* type, const llvm::APFloat& f) {
+    llvm::Constant* scalar_value = llvm::ConstantFP::get(type->getContext(), f);
+    if (llvm::isa<llvm::VectorType>(type)) {
+      return llvm::ConstantVector::getSplat(vector_size(), scalar_value);
+    }
+    return scalar_value;
+  }
+
+  int64 vector_size_;
+  PrimitiveType primitive_type_;
+  llvm::IRBuilder<>* ir_builder_;
+  llvm::Type* vector_type_;
+  llvm::Type* vector_pointer_type_;
+  llvm::Type* scalar_type_;
+  llvm::Type* scalar_pointer_type_;
+  std::string name_;
+};
+
+// This wraps an alloca-backed stack variable which LLVM's SSA construction pass
+// can later convert to a SSA value.
+class LlvmVariable {
+ public:
+  LlvmVariable(llvm::Type*, llvm::IRBuilder<>* ir_builder);
+
+  llvm::Value* Get() const;
+  void Set(llvm::Value* new_value);
+
+ private:
+  llvm::AllocaInst* alloca_;
+  llvm::IRBuilder<>* ir_builder_;
+};
+
+class VectorVariable : public LlvmVariable {
+ public:
+  VectorVariable(VectorSupportLibrary* vector_support,
+                 llvm::Value* initial_value)
+      : LlvmVariable(vector_support->vector_type(),
+                     vector_support->ir_builder()) {
+    Set(initial_value);
+  }
+};
+
+class ScalarVariable : public LlvmVariable {
+ public:
+  ScalarVariable(VectorSupportLibrary* vector_support,
+                 llvm::Value* initial_value)
+      : LlvmVariable(vector_support->scalar_type(),
+                     vector_support->ir_builder()) {
+    Set(initial_value);
+  }
+};
+}  // namespace cpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_VECTOR_SUPPORT_LIBRARY_H_
diff --git a/tensorflow/compiler/xla/service/cpu/windows_compatibility.cc b/tensorflow/compiler/xla/service/cpu/windows_compatibility.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ab308ee6cb16ba95e24694b59a4b5737765bbb8b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/windows_compatibility.cc
@@ -0,0 +1,32 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/windows_compatibility.h"
+
+#ifdef _MSC_VER
+
+#include <math.h>
+
+void sincos(double x, double *sinv, double *cosv) {
+  *sinv = sin(x);
+  *cosv = cos(x);
+}
+
+void sincosf(float x, float *sinv, float *cosv) {
+  *sinv = sinf(x);
+  *cosv = cosf(x);
+}
+
+#endif  // _MSC_VER
diff --git a/tensorflow/compiler/xla/service/cpu/windows_compatibility.h b/tensorflow/compiler/xla/service/cpu/windows_compatibility.h
new file mode 100644
index 0000000000000000000000000000000000000000..262f379d8b6017f4a7e0156b724bfee7e8ec5b9a
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/windows_compatibility.h
@@ -0,0 +1,31 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_WINDOWS_COMPATIBILITY_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_WINDOWS_COMPATIBILITY_H_
+
+#ifdef _MSC_VER
+
+extern "C" {
+
+// MSVC does not have sincos[f].
+void sincos(double x, double *sinv, double *cosv);
+void sincosf(float x, float *sinv, float *cosv);
+
+}
+
+#endif  // _MSC_VER
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_WINDOWS_COMPATIBILITY_H_
diff --git a/tensorflow/compiler/xla/service/cpu/xfeed_manager.cc b/tensorflow/compiler/xla/service/cpu/xfeed_manager.cc
index d0f214202908266371639af8f431ad8269ad0e35..47543b2082f55cf7b8cf60f1c5bbb16a0a609912 100644
--- a/tensorflow/compiler/xla/service/cpu/xfeed_manager.cc
+++ b/tensorflow/compiler/xla/service/cpu/xfeed_manager.cc
@@ -41,6 +41,8 @@ void XfeedQueueManager::EnqueueBuffersAtomically(
   tensorflow::mutex_lock l(mu_);
   bool was_empty = enqueued_buffers_.empty();
   for (XfeedBuffer* b : buffers) {
+    VLOG(3) << "Enqueueing " << queue_name_ << " buffer (of " << buffers.size()
+            << " buffers) with length: " << b->length();
     enqueued_buffers_.push_back(b);
   }
   if (was_empty && !buffers.empty()) {
@@ -54,9 +56,11 @@ void XfeedQueueManager::EnqueueBuffersAtomically(
 
 XfeedBuffer* XfeedQueueManager::BlockingDequeueBuffer() {
   tensorflow::mutex_lock l(mu_);
+  VLOG(3) << "Waiting for an available buffer.";
   while (enqueued_buffers_.empty()) {
     cv_.wait(l);
   }
+  VLOG(3) << "A buffer is available!";
   CHECK(current_buffer_ == nullptr);
   current_buffer_ = enqueued_buffers_.front();
   enqueued_buffers_.pop_front();
@@ -65,6 +69,9 @@ XfeedBuffer* XfeedQueueManager::BlockingDequeueBuffer() {
 
 void XfeedQueueManager::ReleaseCurrentBuffer(int32 length, void* data,
                                              StatusOr<Shape> shape) {
+  VLOG(3) << "Releasing buffer with shape: "
+          << (shape.ok() ? ShapeUtil::HumanString(shape.ValueOrDie())
+                         : "<error status>");
   tensorflow::mutex_lock l(mu_);
   CHECK(current_buffer_ != nullptr);
   CHECK_EQ(length, current_buffer_->length());
diff --git a/tensorflow/compiler/xla/service/cpu/xfeed_manager.h b/tensorflow/compiler/xla/service/cpu/xfeed_manager.h
index 6af55700052007a2ca419d52b63dddea2052bd0b..b4ace232607e14fbfec01d48946f0031d96cd027 100644
--- a/tensorflow/compiler/xla/service/cpu/xfeed_manager.h
+++ b/tensorflow/compiler/xla/service/cpu/xfeed_manager.h
@@ -50,7 +50,7 @@ class XfeedBuffer {
 // Reusable component for managing the infeed and outfeed queue state.
 class XfeedQueueManager {
  public:
-  XfeedQueueManager() = default;
+  XfeedQueueManager(string queue_name) : queue_name_(queue_name) {}
 
   // Calls the completion callback for any enqueued buffers that have
   // not been dequeued by the runtime, and empties the
@@ -86,6 +86,8 @@ class XfeedQueueManager {
   void ReleaseCurrentBuffer(int32 length, void* data, StatusOr<Shape> shape);
 
  private:
+  const string queue_name_;
+
   tensorflow::mutex mu_;
 
   // Condition variable that is signaled every time a buffer is
@@ -112,8 +114,8 @@ class XfeedManager {
   XfeedQueueManager* outfeed() { return &outfeed_; }
 
  private:
-  XfeedQueueManager infeed_;
-  XfeedQueueManager outfeed_;
+  XfeedQueueManager infeed_ = {"infeed"};
+  XfeedQueueManager outfeed_ = {"outfeed"};
 };
 
 }  // namespace runtime
diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.cc b/tensorflow/compiler/xla/service/device_memory_allocator.cc
index 2e4b0a5230516b5308aeed892de9a49565a09f2e..78e7aa48accdbb51a8477455f5f9c004828c068f 100644
--- a/tensorflow/compiler/xla/service/device_memory_allocator.cc
+++ b/tensorflow/compiler/xla/service/device_memory_allocator.cc
@@ -24,7 +24,7 @@ limitations under the License.
 namespace xla {
 
 StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator(
-    perftools::gputools::Platform* platform,
+    const perftools::gputools::Platform* platform,
     tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
         stream_executors)
     : DeviceMemoryAllocator(platform),
diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.h b/tensorflow/compiler/xla/service/device_memory_allocator.h
index 00caefab667cba6abfef200050ca18f229fc0320..39dfad84c1c1c1c461c24de555ecd919cea47d83 100644
--- a/tensorflow/compiler/xla/service/device_memory_allocator.h
+++ b/tensorflow/compiler/xla/service/device_memory_allocator.h
@@ -33,7 +33,7 @@ class DeviceMemoryAllocator {
  public:
   // Parameter platform indicates which platform the allocator allocates memory
   // on. Must be non-null.
-  explicit DeviceMemoryAllocator(perftools::gputools::Platform* platform)
+  explicit DeviceMemoryAllocator(const perftools::gputools::Platform* platform)
       : platform_(platform) {}
   virtual ~DeviceMemoryAllocator() {}
 
@@ -49,14 +49,14 @@ class DeviceMemoryAllocator {
       int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) = 0;
 
   // Return the platform that the allocator allocates memory on.
-  perftools::gputools::Platform* platform() const { return platform_; }
+  const perftools::gputools::Platform* platform() const { return platform_; }
 
   // Can we call Deallocate() as soon as a computation has been scheduled on
   // a stream, or do we have to wait for the computation to complete first?
   virtual bool AllowsAsynchronousDeallocation() const = 0;
 
  protected:
-  perftools::gputools::Platform* platform_;
+  const perftools::gputools::Platform* platform_;
 };
 
 // Default memory allocator for a platform which uses
@@ -64,7 +64,7 @@ class DeviceMemoryAllocator {
 class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator {
  public:
   StreamExecutorMemoryAllocator(
-      perftools::gputools::Platform* platform,
+      const perftools::gputools::Platform* platform,
       tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
           stream_executors);
 
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 91086fd4a5f68211ef56c2417bb0ef4a38de2cff..a803b3171f9afa6297553c5507c4f9aa45e420ab 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -103,6 +103,7 @@ class DfsHloVisitorBase {
     return HandleElementwiseBinary(hlo);
   }
   virtual Status HandleConvolution(HloInstructionPtr hlo) = 0;
+  virtual Status HandleFft(HloInstructionPtr fft) = 0;
   virtual Status HandleCrossReplicaSum(HloInstructionPtr hlo) = 0;
   virtual Status HandleCompare(HloInstructionPtr hlo) {
     return HandleElementwiseBinary(hlo);
@@ -247,6 +248,10 @@ class DfsHloVisitorBase {
   // affecting correctness.
   void ReserveVisitStates(int num) { visit_state_.Reserve(num); }
 
+  // Useful when we want to visit the same computation more than once with the
+  // same visitor.
+  void ResetVisitStates() { visit_state_.Reset(); }
+
   void SetVisitState(int id, VisitState state) {
     visit_state_.SetState(id, state);
   }
@@ -326,6 +331,7 @@ class DfsHloVisitorBase {
       *w = (*w & ~mask) | (static_cast<uint64>(state) << shift);
       DCHECK_EQ(GetState(id), state);
     }
+    void Reset() { states_.clear(); }
 
    private:
     static const uint32 kStatesPerWord = sizeof(uint64) / 2 /*bits per entry*/;
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index 133aa2509405738de8388708b0c61a82023e2738..170adb3d241b3648bc53f96dde9866f0b794f80a 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -85,6 +85,9 @@ class DfsHloVisitorWithDefaultBase
   Status HandleConvolution(HloInstructionPtr convolution) override {
     return DefaultAction(convolution);
   }
+  Status HandleFft(HloInstructionPtr fft) override {
+    return DefaultAction(fft);
+  }
   Status HandleCrossReplicaSum(HloInstructionPtr crs) override {
     return DefaultAction(crs);
   }
diff --git a/tensorflow/compiler/xla/service/dot_decomposer.cc b/tensorflow/compiler/xla/service/dot_decomposer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..12faed69677cd99c6ed82c8d13dad3138d9461b7
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dot_decomposer.cc
@@ -0,0 +1,185 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dot_decomposer.h"
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+namespace {
+
+// TODO(b/69062148) Remove this code when all backends support BatchDot
+// natively.
+Status DecomposeBatchDot(HloInstruction* dot) {
+  auto computation = dot->parent();
+  const DotDimensionNumbers& dnums = dot->dot_dimension_numbers();
+  HloInstruction* lhs = dot->mutable_operand(0);
+  HloInstruction* rhs = dot->mutable_operand(1);
+  const Shape& lhs_shape = lhs->shape();
+  const Shape& rhs_shape = rhs->shape();
+  const Shape& dot_shape = dot->shape();
+
+  // ShapeInference should guarantee that lhs/rhs batch dimensions match.
+  CHECK_EQ(dnums.lhs_batch_dimensions_size(),
+           dnums.rhs_batch_dimensions_size());
+  const int64 num_batch_dims = dnums.lhs_batch_dimensions_size();
+  // Calculate total batch size (note that ShapeInference requires that
+  // the batch dimensions are most-major).
+  int64 batch_size = 1;
+  for (int i = 0; i < num_batch_dims; ++i) {
+    CHECK_EQ(lhs_shape.dimensions(dnums.lhs_batch_dimensions(i)),
+             rhs_shape.dimensions(dnums.rhs_batch_dimensions(i)));
+    batch_size *= lhs_shape.dimensions(dnums.lhs_batch_dimensions(i));
+  }
+
+  // Set lhs/rhs_transpose.
+  CHECK_EQ(1, dnums.lhs_contracting_dimensions_size());
+  const int64 lhs_contracting_dim_number = dnums.lhs_contracting_dimensions(0);
+  const bool lhs_transpose = (lhs_contracting_dim_number - num_batch_dims) == 0;
+
+  CHECK_EQ(1, dnums.rhs_contracting_dimensions_size());
+  const int64 rhs_contracting_dim_number = dnums.rhs_contracting_dimensions(0);
+  const bool rhs_transpose = (rhs_contracting_dim_number - num_batch_dims) == 1;
+
+  // Compute R3 and R3 shapes for lhs.
+  PrimitiveType lhs_type = lhs_shape.element_type();
+  const int64 lhs_rows = lhs_shape.dimensions(num_batch_dims + 0);
+  const int64 lhs_cols = lhs_shape.dimensions(num_batch_dims + 1);
+  Shape lhs_shape_r3 =
+      ShapeUtil::MakeShape(lhs_type, {batch_size, lhs_rows, lhs_cols});
+  Shape lhs_slice_shape_r3 =
+      ShapeUtil::MakeShape(lhs_type, {1, lhs_rows, lhs_cols});
+  Shape lhs_slice_shape_r2 =
+      ShapeUtil::MakeShape(lhs_type, {lhs_rows, lhs_cols});
+
+  // Compute R3 and R3 shapes for rhs.
+  PrimitiveType rhs_type = rhs_shape.element_type();
+  const int64 rhs_rows = rhs_shape.dimensions(num_batch_dims + 0);
+  const int64 rhs_cols = rhs_shape.dimensions(num_batch_dims + 1);
+  Shape rhs_shape_r3 =
+      ShapeUtil::MakeShape(rhs_type, {batch_size, rhs_rows, rhs_cols});
+  Shape rhs_slice_shape_r3 =
+      ShapeUtil::MakeShape(rhs_type, {1, rhs_rows, rhs_cols});
+  Shape rhs_slice_shape_r2 =
+      ShapeUtil::MakeShape(rhs_type, {rhs_rows, rhs_cols});
+
+  // Compute R3 and R3 shapes for dot output.
+  PrimitiveType dot_type = dot_shape.element_type();
+  const int64 dot_rows = dot_shape.dimensions(num_batch_dims + 0);
+  const int64 dot_cols = dot_shape.dimensions(num_batch_dims + 1);
+  Shape dot_shape_r2 = ShapeUtil::MakeShape(dot_type, {dot_rows, dot_cols});
+  Shape dot_shape_r3 = ShapeUtil::MakeShape(dot_type, {1, dot_rows, dot_cols});
+  Shape concat_shape_r3 =
+      ShapeUtil::MakeShape(dot_type, {batch_size, dot_rows, dot_cols});
+
+  // Reshape lhs/rhs into R3.
+  auto lhs_r3 = computation->AddInstruction(
+      HloInstruction::CreateReshape(lhs_shape_r3, lhs));
+  auto rhs_r3 = computation->AddInstruction(
+      HloInstruction::CreateReshape(rhs_shape_r3, rhs));
+
+  // Loop through batch size, slicing out required lhs/rhs to compute each Dot.
+  std::vector<HloInstruction*> output_slices(batch_size);
+  for (int64 i = 0; i < batch_size; ++i) {
+    // Slice R3 shape from 'lhs' and reshape to R2.
+    auto lhs_slice_r3 = computation->AddInstruction(
+        HloInstruction::CreateSlice(lhs_slice_shape_r3, lhs_r3, {i, 0, 0},
+                                    {i + 1, lhs_rows, lhs_cols}, {1, 1, 1}));
+    auto lhs_slice_r2 = computation->AddInstruction(
+        HloInstruction::CreateReshape(lhs_slice_shape_r2, lhs_slice_r3));
+
+    // Slice R3 shape from 'rhs' and reshape to R2.
+    auto rhs_slice_r3 = computation->AddInstruction(
+        HloInstruction::CreateSlice(rhs_slice_shape_r3, rhs_r3, {i, 0, 0},
+                                    {i + 1, rhs_rows, rhs_cols}, {1, 1, 1}));
+    auto rhs_slice_r2 = computation->AddInstruction(
+        HloInstruction::CreateReshape(rhs_slice_shape_r2, rhs_slice_r3));
+
+    // Transpose lhs/rhs (if needed).
+    if (lhs_transpose) {
+      Shape lhs_slice_shape_r2_transpose =
+          ShapeUtil::MakeShape(lhs_type, {lhs_cols, lhs_rows});
+      lhs_slice_r2 =
+          computation->AddInstruction(HloInstruction::CreateTranspose(
+              lhs_slice_shape_r2_transpose, lhs_slice_r2, {1, 0}));
+    }
+    if (rhs_transpose) {
+      Shape rhs_slice_shape_r2_transpose =
+          ShapeUtil::MakeShape(rhs_type, {rhs_cols, rhs_rows});
+      rhs_slice_r2 =
+          computation->AddInstruction(HloInstruction::CreateTranspose(
+              rhs_slice_shape_r2_transpose, rhs_slice_r2, {1, 0}));
+    }
+
+    // Compute Dot of lhs/rhs R2 slices.
+    DotDimensionNumbers dot_dnums;
+    dot_dnums.add_lhs_contracting_dimensions(1);
+    dot_dnums.add_rhs_contracting_dimensions(0);
+    auto dot_r2 = computation->AddInstruction(HloInstruction::CreateDot(
+        dot_shape_r2, lhs_slice_r2, rhs_slice_r2, dot_dnums));
+
+    // Reshape Dot to R3 so we can concat along batch dimension.
+    auto dot_r3 = computation->AddInstruction(
+        HloInstruction::CreateReshape(dot_shape_r3, dot_r2));
+
+    output_slices[i] = dot_r3;
+  }
+
+  // Concatenate slices from 'output_slices' along batch dimension.
+  auto concat = computation->AddInstruction(
+      HloInstruction::CreateConcatenate(concat_shape_r3, output_slices, 0));
+  // Reshape output 'new_dot' to original dimensions.
+  auto new_dot = computation->AddInstruction(
+      HloInstruction::CreateReshape(dot_shape, concat));
+
+  // Replace all uses of 'dot' in 'computation' with 'new_dot'.
+  return computation->ReplaceInstruction(dot, new_dot);
+}
+
+}  // namespace
+
+StatusOr<bool> DotDecomposer::Run(HloModule* module) {
+  XLA_VLOG_LINES(2, "DotDecomposer ENTRY\n" + module->ToString());
+  // Gather all batch Dot operations.
+  std::vector<HloInstruction*> batch_dots;
+  for (auto* computation : module->MakeNonfusionComputations()) {
+    for (auto* instruction : computation->instructions()) {
+      if (instruction->opcode() != HloOpcode::kDot) {
+        continue;
+      }
+      const DotDimensionNumbers& dnums = instruction->dot_dimension_numbers();
+      if (dnums.lhs_batch_dimensions_size() > 0 && decompose_batch_dot_) {
+        batch_dots.push_back(instruction);
+      }
+    }
+  }
+  // Decompose each batch Dot in 'batch_dots'.
+  bool changed = false;
+  for (auto* dot : batch_dots) {
+    TF_RETURN_IF_ERROR(DecomposeBatchDot(dot));
+    changed = true;
+  }
+  XLA_VLOG_LINES(2, "DotDecompose EXIT\n" + module->ToString());
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dot_decomposer.h b/tensorflow/compiler/xla/service/dot_decomposer.h
new file mode 100644
index 0000000000000000000000000000000000000000..1959b687f16d6909a3283021c8635b3e65e6e412
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dot_decomposer.h
@@ -0,0 +1,44 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DOT_DECOMPOSER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_DOT_DECOMPOSER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// DotDecomposer is a pass which decomposes batch Dot operations into a
+// sequence of smaller (R2) Dot operations.
+class DotDecomposer : public HloPassInterface {
+ public:
+  // Decomposes batch Dot operations when 'decompose_batch_dot' is true.
+  DotDecomposer(bool decompose_batch_dot = true)
+      : decompose_batch_dot_(decompose_batch_dot) {}
+  ~DotDecomposer() = default;
+  tensorflow::StringPiece name() const override { return "dot_decomposer"; }
+
+  // Run DotDecomposer pass on computations in 'module'.
+  // Returns whether the 'module' was changed.
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  bool decompose_batch_dot_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DOT_DECOMPOSER_H_
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index b9407818cd8bc82aabd32ed02f61ef66fe442625..4468adbadbf823f1420a8b665a26f66cb7d36b43 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -50,11 +50,161 @@ using llvm_ir::IrName;
 using llvm_ir::SetToFirstInsertPoint;
 using tensorflow::strings::StrCat;
 
+namespace {
+
+llvm::Value* EmitReducePrecisionFloat(llvm::Value* x, int64 exponent_bits,
+                                      int64 mantissa_bits,
+                                      llvm::IRBuilder<>* ir_builder) {
+  // Integer and float types for casting and constant generation.
+  llvm::Type* float_type = x->getType();
+  llvm::IntegerType* int_type = ir_builder->getInt32Ty();
+
+  // Cast the input value to an integer for bitwise manipulation.
+  llvm::Value* x_as_int = ir_builder->CreateBitCast(x, int_type);
+
+  if (mantissa_bits < 23) {
+    // Last remaining mantissa bit.
+    const uint32_t last_mantissa_bit_mask = 1u << (23 - mantissa_bits);
+
+    // Compute rounding bias for round-to-nearest with ties to even.  This is
+    // equal to a base value of 0111... plus one bit if the last remaining
+    // mantissa bit is 1.
+    const uint32_t base_rounding_bias = (last_mantissa_bit_mask >> 1) - 1;
+    llvm::Value* x_last_mantissa_bit = ir_builder->CreateLShr(
+        ir_builder->CreateAnd(
+            x_as_int, llvm::ConstantInt::get(int_type, last_mantissa_bit_mask)),
+        (23 - mantissa_bits));
+    llvm::Value* x_rounding_bias = ir_builder->CreateAdd(
+        x_last_mantissa_bit,
+        llvm::ConstantInt::get(int_type, base_rounding_bias));
+
+    // Add rounding bias, and mask out truncated bits.  Note that the case
+    // where adding the rounding bias overflows into the exponent bits is
+    // correct; the non-masked mantissa bits will all be zero, and the
+    // exponent will be incremented by one.
+    const uint32_t truncation_mask = ~(last_mantissa_bit_mask - 1);
+    x_as_int = ir_builder->CreateAdd(x_as_int, x_rounding_bias);
+    x_as_int = ir_builder->CreateAnd(
+        x_as_int, llvm::ConstantInt::get(int_type, truncation_mask));
+  }
+
+  if (exponent_bits < 8) {
+    // Masks for f32 values.
+    const uint32_t f32_sign_bit_mask = 1u << 31;
+    const uint32_t f32_exp_bits_mask = 0xffu << 23;
+
+    // An exponent of 2^(n-1)-1 -- that is, 0111... with the zero in the most-
+    // significant bit -- is equal to 1.0f for all exponent sizes.  Adding
+    // 2^(n-1)-1 to this gives us the highest non-infinite exponent for a bit-
+    // size of n, and subtracting 2^(n-1)-1 from this gives us the lowest'
+    // exponent (corresponding to 0.0f).
+    //
+    // Thus, the f32 exponent corresponding to the highest non-infinite
+    // exponent for a bit size of n is (2^7-1) + 2^(n-1)-1, and the f32
+    // exponent corresponding to the lowest exponent for a bit size of n is
+    // (2^7-1) - 2^(n-1)-1.
+    //
+    // Note that we have already checked that exponents_bits >= 1.
+    const uint32_t f32_exponent_bias = (1 << 7) - 1;
+    const uint32_t reduced_exponent_bias = (1 << (exponent_bits - 1)) - 1;
+    const uint32_t reduced_max_exponent =
+        f32_exponent_bias + reduced_exponent_bias;
+    const uint32_t reduced_min_exponent =
+        f32_exponent_bias - reduced_exponent_bias;
+
+    // Do we overflow or underflow?
+    llvm::Value* x_exponent = ir_builder->CreateAnd(
+        x_as_int, llvm::ConstantInt::get(int_type, f32_exp_bits_mask));
+    llvm::Value* x_overflows = ir_builder->CreateICmpUGT(
+        x_exponent,
+        llvm::ConstantInt::get(int_type, reduced_max_exponent << 23));
+    llvm::Value* x_underflows = ir_builder->CreateICmpULE(
+        x_exponent,
+        llvm::ConstantInt::get(int_type, reduced_min_exponent << 23));
+
+    // Compute appropriately-signed values of zero and infinity.
+    llvm::Value* x_signed_zero = ir_builder->CreateAnd(
+        x_as_int, llvm::ConstantInt::get(int_type, f32_sign_bit_mask));
+    llvm::Value* x_signed_inf = ir_builder->CreateOr(
+        x_signed_zero, llvm::ConstantInt::get(int_type, f32_exp_bits_mask));
+
+    // Force to zero or infinity if overflow or underflow.  (Note that this
+    // truncates all denormal values to zero, rather than rounding them.)
+    x_as_int = ir_builder->CreateSelect(x_overflows, x_signed_inf, x_as_int);
+    x_as_int = ir_builder->CreateSelect(x_underflows, x_signed_zero, x_as_int);
+  }
+
+  // Cast the result back to a floating-point type.
+  llvm::Value* result = ir_builder->CreateBitCast(x_as_int, float_type);
+
+  // Correct result for NaN inputs.
+  //
+  // The exponent handling will "normalize" NaN values to infinities, which is
+  // undesirable (except in the case with no mantissa bits, in which case it
+  // is mandatory).  This logic also handles cases where mantissa-rounding
+  // causes a NaN's mantissa to overflow into the exponent bits, which would
+  // otherwise create an erroneous zero value.
+  //
+  // If the fast-math flags are set to assume no NaNs, the comparison is likely
+  // to be optimized away, so there's no point in even emitting it.
+  if (!ir_builder->getFastMathFlags().noNaNs()) {
+    llvm::Value* x_is_nan = ir_builder->CreateFCmpUNO(x, x);
+
+    if (mantissa_bits > 0) {
+      result = ir_builder->CreateSelect(x_is_nan, x, result);
+    } else {
+      result = ir_builder->CreateSelect(
+          x_is_nan, llvm::ConstantFP::getInfinity(float_type), result);
+    }
+  }
+  return result;
+}
+
+llvm::Value* EmitF32ToBF16(llvm::Value* f32_value,
+                           llvm::IRBuilder<>* ir_builder) {
+  auto reduced_precision = EmitReducePrecisionFloat(
+      f32_value,
+      /*exponent_bits=*/primitive_util::kBFloat16ExponentBits,
+      /*mantissa_bits=*/primitive_util::kBFloat16MantissaBits, ir_builder);
+  auto as_int32 =
+      ir_builder->CreateBitCast(reduced_precision, ir_builder->getInt32Ty());
+  auto shifted = ir_builder->CreateLShr(as_int32, 16);
+  auto truncated = ir_builder->CreateTrunc(shifted, ir_builder->getInt16Ty());
+  return ir_builder->CreateBitCast(truncated, ir_builder->getInt16Ty());
+}
+
+llvm::Value* EmitBF16ToF32(llvm::Value* bf16_value,
+                           llvm::IRBuilder<>* ir_builder) {
+  auto as_int16 =
+      ir_builder->CreateBitCast(bf16_value, ir_builder->getInt16Ty());
+  auto as_int32 = ir_builder->CreateZExt(as_int16, ir_builder->getInt32Ty());
+  auto shifted = ir_builder->CreateShl(as_int32, 16);
+  return ir_builder->CreateBitCast(shifted, ir_builder->getFloatTy());
+}
+
+llvm::Value* EmitIntegralToFloating(llvm::Value* integer_value,
+                                    PrimitiveType from_type,
+                                    PrimitiveType to_type, llvm::Module* module,
+                                    llvm::IRBuilder<>* ir_builder) {
+  if (primitive_util::IsSignedIntegralType(from_type)) {
+    return ir_builder->CreateSIToFP(
+        integer_value, llvm_ir::PrimitiveTypeToIrType(to_type, module));
+  } else {
+    CHECK(primitive_util::IsUnsignedIntegralType(from_type) ||
+          from_type == PRED);
+    return ir_builder->CreateUIToFP(
+        integer_value, llvm_ir::PrimitiveTypeToIrType(to_type, module));
+  }
+}
+
+}  // namespace
+
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitUnaryOp(
     const HloInstruction* op, llvm::Value* operand_value) const {
   if (op->opcode() == HloOpcode::kCopy) {
     return operand_value;
-  } else if (operand_value->getType()->isIntegerTy()) {
+  } else if (ShapeUtil::ElementIsIntegral(op->operand(0)->shape()) ||
+             op->operand(0)->shape().element_type() == PRED) {
     return EmitIntegerUnaryOp(op, operand_value);
   } else if (ShapeUtil::ElementIsComplex(op->operand(0)->shape())) {
     return EmitComplexUnaryOp(op, operand_value);
@@ -79,15 +229,14 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerUnaryOp(
             primitive_util::IsSignedIntegralType(to_type));
       }
       if (primitive_util::IsFloatingPointType(to_type)) {
-        if (primitive_util::IsSignedIntegralType(from_type)) {
-          return ir_builder_->CreateSIToFP(
-              operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
-        }
-        if (primitive_util::IsUnsignedIntegralType(from_type) ||
-            from_type == PRED) {
-          return ir_builder_->CreateUIToFP(
-              operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
+        if (to_type == BF16) {
+          return EmitF32ToBF16(
+              EmitIntegralToFloating(operand_value, from_type, F32, module_,
+                                     ir_builder_),
+              ir_builder_);
         }
+        return EmitIntegralToFloating(operand_value, from_type, to_type,
+                                      module_, ir_builder_);
       }
       if (primitive_util::IsComplexType(to_type)) {
         auto to_ir_component_type = llvm_ir::PrimitiveTypeToIrType(
@@ -207,6 +356,17 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
                 llvm_ir::PrimitiveTypeToIrType(to_component_type, module_)),
             nullptr);
       }
+      if (from_type == BF16) {
+        TF_RET_CHECK(to_type != BF16);
+        operand_value = EmitBF16ToF32(operand_value, ir_builder_);
+        from_type = F32;
+        if (from_type == to_type) {
+          return operand_value;
+        }
+      }
+      if (from_type == F32 && to_type == BF16) {
+        return EmitF32ToBF16(operand_value, ir_builder_);
+      }
       if (primitive_util::IsFloatingPointType(to_type)) {
         return ir_builder_->CreateFPCast(
             operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
@@ -244,21 +404,13 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
           primitive_util::BitWidth(to_type));
     }
     case HloOpcode::kExp:
-      return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::exp, {operand_value},
-                                          {operand_value->getType()},
-                                          ir_builder_);
+      return EmitExp(op->shape().element_type(), operand_value);
     case HloOpcode::kLog:
-      return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::log, {operand_value},
-                                          {operand_value->getType()},
-                                          ir_builder_);
+      return EmitLog(op->shape().element_type(), operand_value);
     case HloOpcode::kCos:
-      return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::cos, {operand_value},
-                                          {operand_value->getType()},
-                                          ir_builder_);
+      return EmitCos(op->shape().element_type(), operand_value);
     case HloOpcode::kSin:
-      return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sin, {operand_value},
-                                          {operand_value->getType()},
-                                          ir_builder_);
+      return EmitSin(op->shape().element_type(), operand_value);
     case HloOpcode::kFloor:
       return llvm_ir::EmitCallToIntrinsic(
           llvm::Intrinsic::floor, {operand_value}, {operand_value->getType()},
@@ -276,7 +428,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
           llvm::Intrinsic::round, {operand_value}, {operand_value->getType()},
           ir_builder_);
     case HloOpcode::kSign: {
-      // TODO(b/32151903): Ensure consistent sign behavior for -0.0
+      // TODO(b/32151903): Ensure consistent sign behavior for -0.0.
       auto type = operand_value->getType();
       auto zero = llvm::ConstantFP::get(type, 0.0);
       auto oeq = ir_builder_->CreateFCmpOEQ(operand_value, zero);
@@ -309,9 +461,25 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
     const HloInstruction* op, llvm::Value* operand_value) const {
+  PrimitiveType input_type = op->operand(0)->shape().element_type();
+  PrimitiveType component_type =
+      primitive_util::IsComplexType(input_type)
+          ? primitive_util::ComplexComponentType(input_type)
+          : input_type;
   switch (op->opcode()) {
-    // TODO(b/65209142): Angle/Log require atan2.
-    // case HloOpcode::kLog:  // log(a+bi) = .5*log(a^2+b^2) + i*atan2(b, a)
+    case HloOpcode::kLog: {
+      // log(a+bi) = .5*log(a^2+b^2) + i*atan2(b, a)
+      auto a = EmitExtractReal(operand_value);
+      auto b = EmitExtractImag(operand_value);
+      llvm::Type* llvm_ty = a->getType();
+      auto sum_sq = ir_builder_->CreateFAdd(ir_builder_->CreateFMul(a, a),
+                                            ir_builder_->CreateFMul(b, b));
+      TF_ASSIGN_OR_RETURN(auto log_sum_sq, EmitLog(component_type, sum_sq));
+      TF_ASSIGN_OR_RETURN(auto angle, EmitAtan2(component_type, b, a));
+      auto one_half = llvm::ConstantFP::get(llvm_ty, 0.5);
+      return EmitComposeComplex(
+          op, ir_builder_->CreateFMul(one_half, log_sum_sq), angle);
+    }
     case HloOpcode::kConvert: {
       PrimitiveType from_type = op->operand(0)->shape().element_type();
       TF_RET_CHECK(primitive_util::IsComplexType(from_type));
@@ -333,15 +501,12 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
     }
     case HloOpcode::kExp: {
       // e^(a+bi) = e^a*(cos(b)+sin(b)i)
-      auto exp_a = llvm_ir::EmitCallToIntrinsic(
-          llvm::Intrinsic::exp, {EmitExtractReal(operand_value)},
-          {EmitExtractReal(operand_value)->getType()}, ir_builder_);
-      auto cos_b = llvm_ir::EmitCallToIntrinsic(
-          llvm::Intrinsic::cos, {EmitExtractImag(operand_value)},
-          {EmitExtractImag(operand_value)->getType()}, ir_builder_);
-      auto sin_b = llvm_ir::EmitCallToIntrinsic(
-          llvm::Intrinsic::sin, {EmitExtractImag(operand_value)},
-          {EmitExtractImag(operand_value)->getType()}, ir_builder_);
+      TF_ASSIGN_OR_RETURN(
+          auto exp_a, EmitExp(component_type, EmitExtractReal(operand_value)));
+      TF_ASSIGN_OR_RETURN(
+          auto cos_b, EmitCos(component_type, EmitExtractImag(operand_value)));
+      TF_ASSIGN_OR_RETURN(
+          auto sin_b, EmitSin(component_type, EmitExtractImag(operand_value)));
       return EmitComposeComplex(op, ir_builder_->CreateFMul(exp_a, cos_b),
                                 ir_builder_->CreateFMul(exp_a, sin_b));
     }
@@ -356,16 +521,13 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
       auto a = EmitExtractReal(operand_value);
       auto b = EmitExtractImag(operand_value);
       auto type = a->getType();
-      auto exp_b = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::exp, {b},
-                                                {type}, ir_builder_);
+      TF_ASSIGN_OR_RETURN(auto exp_b, EmitExp(component_type, b));
       auto half_exp_b =
           ir_builder_->CreateFMul(llvm::ConstantFP::get(type, 0.5), exp_b);
       auto half_exp_neg_b =
           ir_builder_->CreateFDiv(llvm::ConstantFP::get(type, 0.5), exp_b);
-      auto cos_a = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::cos, {a},
-                                                {type}, ir_builder_);
-      auto sin_a = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sin, {a},
-                                                {type}, ir_builder_);
+      TF_ASSIGN_OR_RETURN(auto cos_a, EmitCos(component_type, a));
+      TF_ASSIGN_OR_RETURN(auto sin_a, EmitSin(component_type, a));
       return EmitComposeComplex(
           op,
           ir_builder_->CreateFMul(
@@ -386,16 +548,13 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
       auto a = EmitExtractReal(operand_value);
       auto b = EmitExtractImag(operand_value);
       auto type = a->getType();
-      auto exp_b = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::exp, {b},
-                                                {type}, ir_builder_);
+      TF_ASSIGN_OR_RETURN(auto exp_b, EmitExp(component_type, b));
       auto half_exp_b =
           ir_builder_->CreateFMul(llvm::ConstantFP::get(type, 0.5), exp_b);
       auto half_exp_neg_b =
           ir_builder_->CreateFDiv(llvm::ConstantFP::get(type, 0.5), exp_b);
-      auto cos_a = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::cos, {a},
-                                                {type}, ir_builder_);
-      auto sin_a = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sin, {a},
-                                                {type}, ir_builder_);
+      TF_ASSIGN_OR_RETURN(auto cos_a, EmitCos(component_type, a));
+      TF_ASSIGN_OR_RETURN(auto sin_a, EmitSin(component_type, a));
       return EmitComposeComplex(
           op,
           ir_builder_->CreateFMul(
@@ -403,6 +562,58 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
           ir_builder_->CreateFMul(
               cos_a, ir_builder_->CreateFSub(half_exp_b, half_exp_neg_b)));
     }
+    case HloOpcode::kTanh: {
+      /*
+      tanh=(exp(x)-exp(-x)) / (exp(x)+exp(-x))
+      e^(a+bi) = e^a*(cos(b)+sin(b)i)
+      so tanh=(((cos(b)+sin(b)i)e^a - (cos(-b)+sin(-b)i)e^-a)) /
+              (((cos(b)+sin(b)i)e^a + (cos(-b)+sin(-b)i)e^-a))
+      cos(b)=cos(-b), sin(-b)=-sin(b)
+      so tanh=(((cos(b)+sin(b)i)e^a - (cos(b)-sin(b)i)e^-a)) /
+              (((cos(b)+sin(b)i)e^a + (cos(b)-sin(b)i)e^-a))
+             =(cos(b)e^a+i*sin(b)e^a + cos(b)(-e^-a)+i*sin(b)e^-a) /
+              (cos(b)e^a+i*sin(b)e^a + cos(b)e^-a+i*sin(b)(-e^-a))
+             =(cos(b)(e^a-e^-a) + i*sin(b)(e^a+e^-a)) /
+              (cos(b)(e^a+e^-a) + i*sin(b)(e^a-e^-a))
+      This is a complex division, so we can multiply by denom_conj/denom_conj
+             =(cos(b)(e^a-e^-a) + i*sin(b)(e^a+e^-a)) *
+              (cos(b)(e^a+e^-a) - i*sin(b)(e^a-e^-a)) /
+              ((cos(b)(e^a+e^-a))^2 + (sin(b)(e^a-e^-a))^2)
+             =(cos(b)^2(e^(2a)-e^(-2a)) + sin(b)^2(e^(2a)-e^(-2a)) +
+               i*(cos(b)sin(b)(e^a+e^-a)^2 - cos(b)sin(b)(e^a-e^-a)^2)) /
+              ((cos(b)(e^a+e^-a))^2 + (sin(b)(e^a-e^-a))^2)
+      */
+      auto a = EmitExtractReal(operand_value);
+      auto b = EmitExtractImag(operand_value);
+      TF_ASSIGN_OR_RETURN(auto exp_a, EmitExp(component_type, a));
+      TF_ASSIGN_OR_RETURN(auto cos_b, EmitCos(component_type, b));
+      TF_ASSIGN_OR_RETURN(auto sin_b, EmitSin(component_type, b));
+      auto exp_neg_a = ir_builder_->CreateFDiv(
+          llvm::ConstantFP::get(exp_a->getType(), 1), exp_a);
+      auto exp_2a_minus_exp_neg_2a = ir_builder_->CreateFSub(
+          ir_builder_->CreateFMul(exp_a, exp_a),
+          ir_builder_->CreateFMul(exp_neg_a, exp_neg_a));
+      auto cos_b_sq = ir_builder_->CreateFMul(cos_b, cos_b);
+      auto sin_b_sq = ir_builder_->CreateFMul(sin_b, sin_b);
+      auto real_num = ir_builder_->CreateFAdd(
+          ir_builder_->CreateFMul(cos_b_sq, exp_2a_minus_exp_neg_2a),
+          ir_builder_->CreateFMul(sin_b_sq, exp_2a_minus_exp_neg_2a));
+      auto cos_b_sin_b = ir_builder_->CreateFMul(cos_b, sin_b);
+      auto exp_a_plus_exp_neg_a = ir_builder_->CreateFAdd(exp_a, exp_neg_a);
+      auto exp_a_plus_exp_neg_a_sq =
+          ir_builder_->CreateFMul(exp_a_plus_exp_neg_a, exp_a_plus_exp_neg_a);
+      auto exp_a_minus_exp_neg_a = ir_builder_->CreateFSub(exp_a, exp_neg_a);
+      auto exp_a_minus_exp_neg_a_sq =
+          ir_builder_->CreateFMul(exp_a_minus_exp_neg_a, exp_a_minus_exp_neg_a);
+      auto imag_num = ir_builder_->CreateFMul(
+          cos_b_sin_b, ir_builder_->CreateFSub(exp_a_plus_exp_neg_a_sq,
+                                               exp_a_minus_exp_neg_a_sq));
+      auto denom = ir_builder_->CreateFAdd(
+          ir_builder_->CreateFMul(cos_b_sq, exp_a_plus_exp_neg_a_sq),
+          ir_builder_->CreateFMul(sin_b_sq, exp_a_minus_exp_neg_a_sq));
+      return EmitComposeComplex(op, ir_builder_->CreateFDiv(real_num, denom),
+                                ir_builder_->CreateFDiv(imag_num, denom));
+    }
     case HloOpcode::kAbs: {
       auto sum_sq = ir_builder_->CreateFAdd(
           ir_builder_->CreateFMul(EmitExtractReal(operand_value),
@@ -449,7 +660,8 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitBinaryOp(
     const HloInstruction* op, llvm::Value* lhs_value,
     llvm::Value* rhs_value) const {
   PrimitiveType operand_type = op->operand(0)->shape().element_type();
-  if (lhs_value->getType()->isIntegerTy()) {
+  if (ShapeUtil::ElementIsIntegral(op->operand(0)->shape()) ||
+      operand_type == PRED) {
     return EmitIntegerBinaryOp(
         op, lhs_value, rhs_value,
         primitive_util::IsSignedIntegralType(operand_type));
@@ -464,7 +676,6 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatBinaryOp(
     const HloInstruction* op, llvm::Value* lhs_value,
     llvm::Value* rhs_value) const {
   switch (op->opcode()) {
-    // case HloOpcode::kAtan2:  // TODO(b/65209142): CPU atan2 support
     case HloOpcode::kComplex:
       return EmitComposeComplex(op, lhs_value, rhs_value);
     case HloOpcode::kAdd:
@@ -508,10 +719,9 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatBinaryOp(
     case HloOpcode::kMinimum:
       return EmitFloatMin(lhs_value, rhs_value);
     case HloOpcode::kPower:
-      return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::pow,
-                                          {lhs_value, rhs_value},
-                                          {lhs_value->getType()}, ir_builder_);
-
+      return EmitPow(op->shape().element_type(), lhs_value, rhs_value);
+    case HloOpcode::kAtan2:
+      return EmitAtan2(op->shape().element_type(), lhs_value, rhs_value);
     default:
       return Unimplemented("binary floating point op '%s'",
                            HloOpcodeString(op->opcode()).c_str());
@@ -607,9 +817,40 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexBinaryOp(
                                   EmitExtractImag(lhs_value),
                                   EmitExtractImag(rhs_value), ir_builder_));
 
-    // TODO(b/65209142): requires arg(z) -> requires atan|atan2 intrinsic
-    // case HloOpcode::kPower:
-    // // (a+bi)^(c+di) = exp(i(c+di)*arg(a+bi)) * (a*a+b*b)^(c/2+di/2)
+    case HloOpcode::kPower: {
+      // (a+bi)^(c+di) =
+      //    (a*a+b*b)^(0.5c) * exp(-d*atan2(b,a)) * (cos(q) + i*sin(q)),
+      //    where q = c*atan2(b,a)+0.5d*ln(a*a+b*b)
+      PrimitiveType component_type =
+          primitive_util::ComplexComponentType(op->shape().element_type());
+      auto a = EmitExtractReal(lhs_value);
+      auto b = EmitExtractImag(lhs_value);
+      auto c = EmitExtractReal(rhs_value);
+      auto d = EmitExtractImag(rhs_value);
+      auto aa_p_bb = ir_builder_->CreateFAdd(ir_builder_->CreateFMul(a, a),
+                                             ir_builder_->CreateFMul(b, b));
+      auto one_half = llvm::ConstantFP::get(a->getType(), 0.5);
+      auto half_c = ir_builder_->CreateFMul(one_half, c);
+
+      TF_ASSIGN_OR_RETURN(auto aa_p_bb_to_half_c,
+                          EmitPow(component_type, aa_p_bb, half_c));
+      auto neg_d = ir_builder_->CreateFNeg(d);
+      TF_ASSIGN_OR_RETURN(auto arg_lhs, EmitAtan2(component_type, b, a));
+      auto neg_d_arg_lhs = ir_builder_->CreateFMul(neg_d, arg_lhs);
+      TF_ASSIGN_OR_RETURN(auto e_to_neg_d_arg_lhs,
+                          EmitExp(component_type, neg_d_arg_lhs));
+      auto coeff =
+          ir_builder_->CreateFMul(aa_p_bb_to_half_c, e_to_neg_d_arg_lhs);
+      TF_ASSIGN_OR_RETURN(auto ln_aa_p_bb, EmitLog(component_type, aa_p_bb));
+      auto half_d = ir_builder_->CreateFMul(one_half, d);
+      auto q =
+          ir_builder_->CreateFAdd(ir_builder_->CreateFMul(c, arg_lhs),
+                                  ir_builder_->CreateFMul(half_d, ln_aa_p_bb));
+      TF_ASSIGN_OR_RETURN(auto cos_q, EmitCos(component_type, q));
+      TF_ASSIGN_OR_RETURN(auto sin_q, EmitSin(component_type, q));
+      return EmitComposeComplex(op, ir_builder_->CreateFMul(coeff, cos_q),
+                                ir_builder_->CreateFMul(coeff, sin_q));
+    }
     default:
       return Unimplemented("binary complex op '%s'",
                            HloOpcodeString(op->opcode()).c_str());
@@ -629,7 +870,10 @@ llvm::Value* ElementalIrEmitter::EmitFloatMin(llvm::Value* lhs_value,
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type,
                                                       llvm::Value* x) const {
   if (prim_type != F32) {
-    return Unimplemented("inverse erf only implemented for F32 (b/34339814)");
+    // TODO(b/34339814): Implement inverse erf for F64.
+    return Unimplemented(
+        "Inverse erf is only implemented for element "
+        "type F32.");
   }
   auto getFloat = [&](const float f) {
     return llvm::ConstantFP::get(ir_builder_->getFloatTy(), f);
@@ -712,116 +956,51 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitErfcInv(
   return EmitErfInv(prim_type, ir_builder_->CreateFSub(one, value));
 }
 
-StatusOr<llvm::Value*> ElementalIrEmitter::EmitReducePrecision(
-    const HloInstruction* hlo, llvm::Value* x) const {
-  if (hlo->operand(0)->shape().element_type() != F32) {
-    return Unimplemented("reduce-precision only implemented for F32");
-  }
-
-  // Integer and float types for casting and constant generation.
-  llvm::Type* float_type = x->getType();
-  llvm::IntegerType* int_type = ir_builder_->getInt32Ty();
-
-  // Cast the input value to an integer for bitwise manipulation.
-  llvm::Value* x_as_int = ir_builder_->CreateBitCast(x, int_type);
-
-  if (hlo->mantissa_bits() < 23) {
-    // Last remaining mantissa bit.
-    const uint32_t last_mantissa_bit_mask = 1u << (23 - hlo->mantissa_bits());
-
-    // Compute rounding bias for round-to-nearest with ties to even.  This is
-    // equal to a base value of 0111... plus one bit if the last remaining
-    // mantissa bit is 1.
-    const uint32_t base_rounding_bias = (last_mantissa_bit_mask >> 1) - 1;
-    llvm::Value* x_last_mantissa_bit = ir_builder_->CreateLShr(
-        ir_builder_->CreateAnd(
-            x_as_int, llvm::ConstantInt::get(int_type, last_mantissa_bit_mask)),
-        (23 - hlo->mantissa_bits()));
-    llvm::Value* x_rounding_bias = ir_builder_->CreateAdd(
-        x_last_mantissa_bit,
-        llvm::ConstantInt::get(int_type, base_rounding_bias));
-
-    // Add rounding bias, and mask out truncated bits.  Note that the case
-    // where adding the rounding bias overflows into the exponent bits is
-    // correct; the non-masked mantissa bits will all be zero, and the
-    // exponent will be incremented by one.
-    const uint32_t truncation_mask = ~(last_mantissa_bit_mask - 1);
-    x_as_int = ir_builder_->CreateAdd(x_as_int, x_rounding_bias);
-    x_as_int = ir_builder_->CreateAnd(
-        x_as_int, llvm::ConstantInt::get(int_type, truncation_mask));
-  }
-
-  if (hlo->exponent_bits() < 8) {
-    // Masks for f32 values.
-    const uint32_t f32_sign_bit_mask = 1u << 31;
-    const uint32_t f32_exp_bits_mask = 0xffu << 23;
-
-    // An exponent of 2^(n-1)-1 -- that is, 0111... with the zero in the most-
-    // significant bit -- is equal to 1.0f for all exponent sizes.  Adding
-    // 2^(n-1)-1 to this gives us the highest non-infinite exponent for a bit-
-    // size of n, and subtracting 2^(n-1)-1 from this gives us the lowest'
-    // exponent (corresponding to 0.0f).
-    //
-    // Thus, the f32 exponent corresponding to the highest non-infinite
-    // exponent for a bit size of n is (2^7-1) + 2^(n-1)-1, and the f32
-    // exponent corresponding to the lowest exponent for a bit size of n is
-    // (2^7-1) - 2^(n-1)-1.
-    //
-    // Note that we have already checked that exponents_bits >= 1.
-    const uint32_t f32_exponent_bias = (1 << 7) - 1;
-    const uint32_t reduced_exponent_bias =
-        (1 << (hlo->exponent_bits() - 1)) - 1;
-    const uint32_t reduced_max_exponent =
-        f32_exponent_bias + reduced_exponent_bias;
-    const uint32_t reduced_min_exponent =
-        f32_exponent_bias - reduced_exponent_bias;
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitLog(PrimitiveType prim_type,
+                                                   llvm::Value* value) const {
+  return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::log, {value},
+                                      {value->getType()}, ir_builder_);
+}
 
-    // Do we overflow or underflow?
-    llvm::Value* x_exponent = ir_builder_->CreateAnd(
-        x_as_int, llvm::ConstantInt::get(int_type, f32_exp_bits_mask));
-    llvm::Value* x_overflows = ir_builder_->CreateICmpUGT(
-        x_exponent,
-        llvm::ConstantInt::get(int_type, reduced_max_exponent << 23));
-    llvm::Value* x_underflows = ir_builder_->CreateICmpULE(
-        x_exponent,
-        llvm::ConstantInt::get(int_type, reduced_min_exponent << 23));
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitSin(PrimitiveType prim_type,
+                                                   llvm::Value* value) const {
+  return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sin, {value},
+                                      {value->getType()}, ir_builder_);
+}
 
-    // Compute appropriately-signed values of zero and infinity.
-    llvm::Value* x_signed_zero = ir_builder_->CreateAnd(
-        x_as_int, llvm::ConstantInt::get(int_type, f32_sign_bit_mask));
-    llvm::Value* x_signed_inf = ir_builder_->CreateOr(
-        x_signed_zero, llvm::ConstantInt::get(int_type, f32_exp_bits_mask));
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitCos(PrimitiveType prim_type,
+                                                   llvm::Value* value) const {
+  return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::cos, {value},
+                                      {value->getType()}, ir_builder_);
+}
 
-    // Force to zero or infinity if overflow or underflow.  (Note that this
-    // truncates all denormal values to zero, rather than rounding them.)
-    x_as_int = ir_builder_->CreateSelect(x_overflows, x_signed_inf, x_as_int);
-    x_as_int = ir_builder_->CreateSelect(x_underflows, x_signed_zero, x_as_int);
-  }
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitExp(PrimitiveType prim_type,
+                                                   llvm::Value* value) const {
+  return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::exp, {value},
+                                      {value->getType()}, ir_builder_);
+}
 
-  // Cast the result back to a floating-point type.
-  llvm::Value* result = ir_builder_->CreateBitCast(x_as_int, float_type);
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitPow(PrimitiveType prim_type,
+                                                   llvm::Value* lhs,
+                                                   llvm::Value* rhs) const {
+  return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::pow, {lhs, rhs},
+                                      {lhs->getType()}, ir_builder_);
+}
 
-  // Correct result for NaN inputs.
-  //
-  // The exponent handling will "normalize" NaN values to infinities, which is
-  // undesirable (except in the case with no mantissa bits, in which case it
-  // is mandatory).  This logic also handles cases where mantissa-rounding
-  // causes a NaN's mantissa to overflow into the exponent bits, which would
-  // otherwise create an erroneous zero value.
-  //
-  // If the fast-math flags are set to assume no NaNs, the comparison is likely
-  // to be optimized away, so there's no point in even emitting it.
-  if (!ir_builder_->getFastMathFlags().noNaNs()) {
-    llvm::Value* x_is_nan = ir_builder_->CreateFCmpUNO(x, x);
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitAtan2(PrimitiveType prim_type,
+                                                     llvm::Value* lhs,
+                                                     llvm::Value* rhs) const {
+  return Unimplemented("atan2");
+}
 
-    if (hlo->mantissa_bits() > 0) {
-      result = ir_builder_->CreateSelect(x_is_nan, x, result);
-    } else {
-      result = ir_builder_->CreateSelect(
-          x_is_nan, llvm::ConstantFP::getInfinity(float_type), result);
-    }
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitReducePrecision(
+    const HloInstruction* hlo, llvm::Value* x) const {
+  if (hlo->operand(0)->shape().element_type() != F32) {
+    return Unimplemented("reduce-precision only implemented for F32");
   }
-  return result;
+  return EmitReducePrecisionFloat(x, /*exponent_bits=*/hlo->exponent_bits(),
+                                  /*mantissa_bits=*/hlo->mantissa_bits(),
+                                  ir_builder_);
 }
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
@@ -864,17 +1043,9 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
           is_signed ? llvm::CmpInst::ICMP_SGE : llvm::CmpInst::ICMP_UGE,
           lhs_value, rhs_value, ir_builder_);
     case HloOpcode::kMinimum:
-      return ir_builder_->CreateSelect(
-          ir_builder_->CreateICmp(
-              is_signed ? llvm::ICmpInst::ICMP_SLE : llvm::ICmpInst::ICMP_ULE,
-              lhs_value, rhs_value),
-          lhs_value, rhs_value);
+      return EmitIntegralMin(lhs_value, rhs_value, is_signed);
     case HloOpcode::kMaximum:
-      return ir_builder_->CreateSelect(
-          ir_builder_->CreateICmp(
-              is_signed ? llvm::ICmpInst::ICMP_SGE : llvm::ICmpInst::ICMP_UGE,
-              lhs_value, rhs_value),
-          lhs_value, rhs_value);
+      return EmitIntegralMax(lhs_value, rhs_value, is_signed);
     case HloOpcode::kAnd:
       return ir_builder_->CreateAnd(lhs_value, rhs_value);
     case HloOpcode::kOr:
@@ -891,6 +1062,26 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
   }
 }
 
+llvm::Value* ElementalIrEmitter::EmitIntegralMax(llvm::Value* lhs_value,
+                                                 llvm::Value* rhs_value,
+                                                 bool is_signed) const {
+  return ir_builder_->CreateSelect(
+      ir_builder_->CreateICmp(
+          is_signed ? llvm::ICmpInst::ICMP_SGE : llvm::ICmpInst::ICMP_UGE,
+          lhs_value, rhs_value),
+      lhs_value, rhs_value);
+}
+
+llvm::Value* ElementalIrEmitter::EmitIntegralMin(llvm::Value* lhs_value,
+                                                 llvm::Value* rhs_value,
+                                                 bool is_signed) const {
+  return ir_builder_->CreateSelect(
+      ir_builder_->CreateICmp(
+          is_signed ? llvm::ICmpInst::ICMP_SLE : llvm::ICmpInst::ICMP_ULE,
+          lhs_value, rhs_value),
+      lhs_value, rhs_value);
+}
+
 llvm_ir::IrArray::Index ElementalIrEmitter::ElementwiseSourceIndex(
     const llvm_ir::IrArray::Index& target_index, const HloInstruction& hlo,
     int64 operand_no) const {
@@ -1088,14 +1279,6 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeRngElementGenerator(
                             get_next_uniform_float())));
         return ir_builder_->CreateFAdd(ir_builder_->CreateFMul(r, s), m);
       }
-      case RNG_BERNOULLI: {
-        TF_ASSIGN_OR_RETURN(llvm::Value * p,
-                            operand_to_generator.at(hlo->operand(0))(index));
-        return ir_builder_->CreateZExt(
-            ir_builder_->CreateFCmpOLT(get_next_uniform_float(), p),
-            llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(),
-                                           module_));
-      }
       default:
         return InvalidArgument(
             "unhandled distribution %s",
@@ -1195,7 +1378,18 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
         TF_ASSIGN_OR_RETURN(llvm::Value * max_value,
                             operand_to_generator.at(hlo->operand(2))(
                                 ElementwiseSourceIndex(index, *hlo, 2)));
-        return EmitFloatMin(max_value, EmitFloatMax(min_value, arg_value));
+        PrimitiveType prim_type = hlo->shape().element_type();
+        if (primitive_util::IsFloatingPointType(prim_type)) {
+          return EmitFloatMin(max_value, EmitFloatMax(min_value, arg_value));
+        } else if (primitive_util::IsIntegralType(prim_type)) {
+          bool is_signed = primitive_util::IsSignedIntegralType(prim_type);
+          return EmitIntegralMin(
+              max_value, EmitIntegralMax(min_value, arg_value, is_signed),
+              is_signed);
+        } else {
+          return Unimplemented("Clamp unimplemented for %s",
+                               PrimitiveType_Name(prim_type).c_str());
+        }
       };
     case HloOpcode::kReducePrecision:
       return [this, hlo, &operand_to_generator](
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
index cccb498f82936283a215370787907b293827ff2d..c516a826d9e382bc738e54635426db639d17108c 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
@@ -39,7 +39,7 @@ class ElementalIrEmitter {
         module_(module),
         hlo_module_config_(hlo_module_config) {}
 
-  virtual ~ElementalIrEmitter() {}
+  virtual ~ElementalIrEmitter() = default;
 
   virtual StatusOr<llvm::Value*> EmitUnaryOp(const HloInstruction* op,
                                              llvm::Value* operand_value) const;
@@ -86,12 +86,38 @@ class ElementalIrEmitter {
   virtual llvm::Value* EmitFloatMin(llvm::Value* lhs_value,
                                     llvm::Value* rhs_value) const;
 
+  llvm::Value* EmitIntegralMax(llvm::Value* lhs_value, llvm::Value* rhs_value,
+                               bool is_signed) const;
+
+  llvm::Value* EmitIntegralMin(llvm::Value* lhs_value, llvm::Value* rhs_value,
+                               bool is_signed) const;
+
   virtual StatusOr<llvm::Value*> EmitErfInv(PrimitiveType prim_type,
                                             llvm::Value* value) const;
 
   virtual StatusOr<llvm::Value*> EmitErfcInv(PrimitiveType prim_type,
                                              llvm::Value* value) const;
 
+  virtual StatusOr<llvm::Value*> EmitAtan2(PrimitiveType prim_type,
+                                           llvm::Value* lhs,
+                                           llvm::Value* rhs) const;
+
+  virtual StatusOr<llvm::Value*> EmitLog(PrimitiveType prim_type,
+                                         llvm::Value* value) const;
+
+  virtual StatusOr<llvm::Value*> EmitSin(PrimitiveType prim_type,
+                                         llvm::Value* value) const;
+
+  virtual StatusOr<llvm::Value*> EmitCos(PrimitiveType prim_type,
+                                         llvm::Value* value) const;
+
+  virtual StatusOr<llvm::Value*> EmitExp(PrimitiveType prim_type,
+                                         llvm::Value* value) const;
+
+  virtual StatusOr<llvm::Value*> EmitPow(PrimitiveType prim_type,
+                                         llvm::Value* lhs,
+                                         llvm::Value* rhs) const;
+
   virtual StatusOr<llvm::Value*> EmitReducePrecision(const HloInstruction* hlo,
                                                      llvm::Value* x) const;
 
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index 9c96d9eb30b5f9e51b7f5d82391c6b9f366898d6..90481c7a88f90edea5399ee44aee2d2c77fc115f 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -24,25 +24,25 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 
+using tensorflow::gtl::ArraySlice;
+
 namespace xla {
 
-StatusOr<std::vector<perftools::gputools::DeviceMemoryBase>>
+StatusOr<std::vector<std::unique_ptr<ShapedBuffer>>>
 Executable::ExecuteOnStreams(
-    tensorflow::gtl::ArraySlice<const ServiceExecutableRunOptions> run_options,
-    tensorflow::gtl::ArraySlice<
-        tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>>
-        arguments) {
+    ArraySlice<const ServiceExecutableRunOptions> run_options,
+    ArraySlice<ArraySlice<const ShapedBuffer*>> arguments) {
   TF_RET_CHECK(run_options.size() == arguments.size());
 
+  std::vector<std::unique_ptr<ShapedBuffer>> return_values(run_options.size());
+
   if (run_options.size() == 1) {
-    TF_ASSIGN_OR_RETURN(auto result,
+    TF_ASSIGN_OR_RETURN(return_values[0],
                         ExecuteOnStream(&run_options[0], arguments[0],
                                         /*hlo_execution_profile=*/nullptr));
-    return std::vector<perftools::gputools::DeviceMemoryBase>({result});
+    return std::move(return_values);
   }
 
-  std::vector<perftools::gputools::DeviceMemoryBase> return_values(
-      run_options.size());
   for (size_t i = 0; i < run_options.size(); ++i) {
     // We cannot BlockHostUntilDone() on the already-launched executions in case
     // of error, since if the executions communicate, the initially launched
@@ -52,9 +52,77 @@ Executable::ExecuteOnStreams(
   }
   for (const auto& options : run_options) {
     TF_RET_CHECK(options.stream() != nullptr);
-    options.stream()->BlockHostUntilDone();
+    TF_RETURN_IF_ERROR(options.stream()->BlockHostUntilDone());
+  }
+  return std::move(return_values);
+}
+
+StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper(
+    const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile,
+    ArraySlice<const ShapedBuffer*> arguments) {
+  perftools::gputools::Stream* stream = run_options->stream();
+  std::unique_ptr<perftools::gputools::Timer> timer;
+  if (profile != nullptr) {
+    timer.reset(new perftools::gputools::Timer(stream->parent()));
+    stream->InitTimer(timer.get()).ThenStartTimer(timer.get());
   }
-  return return_values;
+
+  VLOG(1) << "enqueueing executable on stream...";
+  // If the profiling flag isn't enabled, we pass nullptr as the profile to
+  // indicate profiling is not requested.
+  std::unique_ptr<HloExecutionProfile> profile_ptr =
+      module_config().debug_options().xla_hlo_profile() &&
+              hlo_profiling_enabled()
+          ? MakeUnique<HloExecutionProfile>(&hlo_profile_printer_data(),
+                                            &hlo_profile_index_map())
+          : nullptr;
+
+  StatusOr<std::unique_ptr<ShapedBuffer>> return_value =
+      ExecuteOnStream(run_options, arguments, profile_ptr.get());
+
+  if (profile != nullptr) {
+    VLOG(1) << "enqueueing 'stop timer' and blocking host until done...";
+    stream->ThenStopTimer(timer.get());
+    TF_RETURN_IF_ERROR(stream->BlockHostUntilDone());
+    VLOG(1) << "done with block-host-until-done";
+
+    // Merge in run-time profile information from execution_profile.
+    //
+    // TODO(b/71713097): This is buggy -- even though the mutex takes care of
+    // C++ level races, some other concurrent ExecuteOnStreamWrapper call could
+    // have rewritten the execution_profile before we get to it.
+    profile->MergeFrom(execution_profile());
+
+    // Overall execution time (in nanoseconds) from the executor timer.
+    if (stream->ok()) {
+      // Don't read timer->Nanoseconds() if the stream isn't OK -- that's
+      // illegal.
+      profile->set_compute_and_transfer_time_ns(timer->Nanoseconds());
+    }
+
+    // TODO(b/28123297): On GPU we end up including transfer time in
+    // the compute time this way. Instead, we should get the correct
+    // value by measuring it. Setting the field here at least lets
+    // benchmarks provide *some* value for GPU computations.
+    //
+    // TODO(b/28447609): The value in compute_and_transfer_time_ns is actually
+    // the compute time without the transfer time, so this way we get the
+    // correct compute time. We should instead have the correct value for
+    // compute_and_transfer_time and set compute_time to the compute time.
+    if (profile->compute_time_ns() == 0) {
+      profile->set_compute_time_ns(profile->compute_and_transfer_time_ns());
+    }
+  }
+
+  if (profile_ptr != nullptr) {
+    XLA_LOG_LINES(
+        tensorflow::INFO,
+        profile_ptr->ToString(stream->parent()->GetDeviceDescription()));
+    hlo_graph_dumper::MaybeDumpHloModule(module(), "Service::Execute",
+                                         profile_ptr.get());
+  }
+
+  return return_value;
 }
 
 Status Executable::DumpSessionModule() {
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 08862308c90af736c1adcaa9438973f858852506..0aee535ee780ef000bc5e9963ff48786b3a61eb2 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -44,13 +44,14 @@ namespace xla {
 // interface that is used for launching compiled programs across platforms.
 class Executable {
  public:
-  explicit Executable(std::unique_ptr<const HloModule> hlo_module,
-                      std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
-                      std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
+  explicit Executable(
+      std::unique_ptr<const HloModule> hlo_module,
+      std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
+      std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
       : hlo_module_(std::move(hlo_module)),
-        hlo_profile_printer_(std::move(hlo_profile_printer)),
+        hlo_profile_printer_data_(std::move(hlo_profile_printer_data)),
         hlo_profile_index_map_(std::move(hlo_profile_index_map)) {
-    CHECK_EQ(hlo_profile_printer_.get() == nullptr,
+    CHECK_EQ(hlo_profile_printer_data_.get() == nullptr,
              hlo_profile_index_map_.get() == nullptr);
   }
   virtual ~Executable() {}
@@ -61,16 +62,7 @@ class Executable {
   // If the hlo_execution_profile is provided as non-nullptr, profiling will be
   // enabled.
   //
-  // Returns the device memory region that a successful execution would
-  // populate.
-  virtual StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
-      const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
-      HloExecutionProfile* hlo_execution_profile) = 0;
-
-  // Overload of ExecuteOnStream which returns and takes arguments as
-  // ShapedBuffers. Used for LocalService execution.
+  // Returns a shaped buffer containing the result of the computation.
   virtual StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
@@ -78,21 +70,19 @@ class Executable {
 
   // Same as ExecuteOnStream(), but this call is non-blocking and returns as
   // soon as all of the operations are enqueued for launch on the stream.
-  virtual StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteAsyncOnStream(
+  virtual StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments) = 0;
+      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) = 0;
 
   // Same as ExecuteOnStream(), but runs this executable on multiple
   // streams. arguments[i] contains the arguments to the execution on
   // run_options[i]->stream() and the returned value is at index i of the
   // returned vector.
-  virtual StatusOr<std::vector<perftools::gputools::DeviceMemoryBase>>
-  ExecuteOnStreams(
+  virtual StatusOr<std::vector<std::unique_ptr<ShapedBuffer>>> ExecuteOnStreams(
       tensorflow::gtl::ArraySlice<const ServiceExecutableRunOptions>
           run_options,
       tensorflow::gtl::ArraySlice<
-          tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>>
+          tensorflow::gtl::ArraySlice<const ShapedBuffer*>>
           arguments);
 
   // Populates `hlo_execution_profile` from `executor`. This is implicit in any
@@ -107,13 +97,10 @@ class Executable {
 
   // Convenience wrapper for calling Executable::ExecuteOnStream. Sets up a
   // timer for the execution, sets up HLO profiling if enabled, and fills in the
-  // given ExecutionProfile if non-null.  The ExecuteOnStream overloads have
-  // different argument types and return types, so this method is templated on
-  // argument type and return type of the execute function.
-  template <typename ReturnT, typename ArgT>
-  StatusOr<ReturnT> ExecuteOnStreamWrapper(
+  // given ExecutionProfile if non-null.
+  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStreamWrapper(
       const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile,
-      const ArgT& arguments);
+      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments);
 
   // Returns the ExecutionProfile from executing on the device. This includes
   // the number of cycles taken for the computation or the compilation time.
@@ -130,9 +117,9 @@ class Executable {
         "Equality test on this executable is not implemented.");
   }
 
-  const HloProfilePrinter& hlo_profile_printer() const {
+  const HloProfilePrinterData& hlo_profile_printer_data() const {
     CHECK(hlo_profiling_enabled());
-    return *hlo_profile_printer_;
+    return *hlo_profile_printer_data_;
   }
 
   const HloProfileIndexMap& hlo_profile_index_map() const {
@@ -143,7 +130,9 @@ class Executable {
   // Returns whether this executable was compiled with HLO profilings support
   // enabled. If not, the caller should not expect an hlo_execution_profile
   // passed to ExecuteOnStream above to be populated during execution.
-  bool hlo_profiling_enabled() const { return hlo_profile_printer_ != nullptr; }
+  bool hlo_profiling_enabled() const {
+    return hlo_profile_printer_data_ != nullptr;
+  }
 
   const HloModule& module() const { return *hlo_module_; }
 
@@ -193,70 +182,10 @@ class Executable {
   // execution.
   int64 execution_count_ = 0;
 
-  std::unique_ptr<HloProfilePrinter> hlo_profile_printer_;
+  std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data_;
   std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map_;
 };
 
-template <typename ReturnT, typename ArgT>
-StatusOr<ReturnT> Executable::ExecuteOnStreamWrapper(
-    const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile,
-    const ArgT& arguments) {
-  perftools::gputools::Stream* stream = run_options->stream();
-  std::unique_ptr<perftools::gputools::Timer> timer;
-  if (profile != nullptr) {
-    timer.reset(new perftools::gputools::Timer(stream->parent()));
-    stream->InitTimer(timer.get()).ThenStartTimer(timer.get());
-  }
-
-  VLOG(1) << "enqueueing executable on stream...";
-  // If the profiling flag isn't enabled, we pass nullptr as the profile to
-  // indicate profiling is not requested.
-  std::unique_ptr<HloExecutionProfile> profile_ptr =
-      module_config().debug_options().xla_hlo_profile() &&
-              hlo_profiling_enabled()
-          ? MakeUnique<HloExecutionProfile>(&hlo_profile_printer(),
-                                            &hlo_profile_index_map())
-          : nullptr;
-
-  auto return_value =
-      ExecuteOnStream(run_options, arguments, profile_ptr.get());
-
-  if (profile != nullptr) {
-    VLOG(1) << "enqueueing 'stop timer' and blocking host until done...";
-    stream->ThenStopTimer(timer.get()).BlockHostUntilDone();
-    VLOG(1) << "done with block-host-until-done";
-
-    // Merge in run-time profile information from execution_profile.
-    profile->MergeFrom(execution_profile());
-
-    // Overall execution time (in nanoseconds) from the executor timer.
-    profile->set_compute_and_transfer_time_ns(timer->Nanoseconds());
-
-    // TODO(b/28123297): On GPU we end up including transfer time in
-    // the compute time this way. Instead, we should get the correct
-    // value by measuring it. Setting the field here at least lets
-    // benchmarks provide *some* value for GPU computations.
-    //
-    // TODO(b/28447609): The value in compute_and_transfer_time_ns is actually
-    // the compute time without the transfer time, so this way we get the
-    // correct compute time. We should instead have the correct value for
-    // compute_and_transfer_time and set compute_time to the compute time.
-    if (profile->compute_time_ns() == 0) {
-      profile->set_compute_time_ns(profile->compute_and_transfer_time_ns());
-    }
-  }
-
-  if (profile_ptr != nullptr) {
-    XLA_LOG_LINES(
-        tensorflow::INFO,
-        profile_ptr->ToString(stream->parent()->GetDeviceDescription()));
-    hlo_graph_dumper::MaybeDumpHloModule(module(), "Service::Execute",
-                                         profile_ptr.get());
-  }
-
-  return return_value;
-}
-
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_EXECUTABLE_H_
diff --git a/tensorflow/compiler/xla/service/execution_tracker.cc b/tensorflow/compiler/xla/service/execution_tracker.cc
index c225e62e3e11d2d01251b0f92272b0949eff8af1..2f0b9ed2bd98fbea4e67c0a30d5aa41ff6a06979 100644
--- a/tensorflow/compiler/xla/service/execution_tracker.cc
+++ b/tensorflow/compiler/xla/service/execution_tracker.cc
@@ -39,9 +39,7 @@ AsyncExecution::AsyncExecution(Backend* backend,
 
 tensorflow::Status AsyncExecution::BlockUntilDone() const {
   for (auto& stream : streams_) {
-    if (!stream->BlockHostUntilDone()) {
-      return InternalError("failed to block until done");
-    }
+    TF_RETURN_IF_ERROR(stream->BlockHostUntilDone());
   }
   return tensorflow::Status::OK();
 }
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph.cc b/tensorflow/compiler/xla/service/flatten_call_graph.cc
index dfba22a6c4c5cf071c2cd8621643b8da6587ee3b..2b6caa149439a86d6d047605099bc3ff7b295a8e 100644
--- a/tensorflow/compiler/xla/service/flatten_call_graph.cc
+++ b/tensorflow/compiler/xla/service/flatten_call_graph.cc
@@ -26,7 +26,10 @@ namespace xla {
 
 namespace {
 
-// Helper to replace the called computation at a while- or call-instruction.
+// Helper to replace the called computation at a while-, call-, or
+// conditional-instruction. This function replaces exactly one instance of
+// 'computation' with 'new_computation' even if 'instruction' calls
+// 'computation' more than once.
 void ReplaceCalledComputation(HloInstruction* instruction,
                               HloComputation* computation,
                               HloComputation* new_computation) {
@@ -45,6 +48,15 @@ void ReplaceCalledComputation(HloInstruction* instruction,
       instruction->set_to_apply(new_computation);
       break;
     }
+    case HloOpcode::kConditional: {
+      if (computation == instruction->true_computation()) {
+        instruction->set_true_computation(new_computation);
+      } else {
+        CHECK_EQ(computation, instruction->false_computation());
+        instruction->set_false_computation(new_computation);
+      }
+      break;
+    }
     default:
       LOG(FATAL) << "unexpected opcode: "
                  << HloOpcodeString(instruction->opcode());
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph_test.cc b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
index a68e90b7d009890012f94baa790d911871c9c960..d3854b40de3572a60df1ad99d8a4589f59ad7194 100644
--- a/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
+++ b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
@@ -223,5 +223,35 @@ TEST_F(FlattenCallGraphTest, FlattenCalls) {
   EXPECT_EQ(1, b_node.caller_callsites().size());
 }
 
+TEST_F(FlattenCallGraphTest, FlattenCallsInConditional) {
+  auto module = CreateNewModule();
+  HloComputation* sub_computation =
+      module->AddEmbeddedComputation(MakeScalarComputation());
+
+  // Create entry computation, which is a conditional that has the same
+  // computation in the true and false branch.
+  HloComputation::Builder builder(TestName());
+  auto pred = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(56.0f)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(12.0f)));
+  builder.AddInstruction(HloInstruction::CreateConditional(
+      kScalarShape, pred, constant1, sub_computation, constant2,
+      sub_computation));
+  module->AddEntryComputation(builder.Build());
+  EXPECT_EQ(2, module->computation_count());
+
+  TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module.get()));
+  EXPECT_TRUE(result);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
+  // The true and false computations must now be different.
+  EXPECT_EQ(3, module->computation_count());
+
+  const CallGraphNode& sub_node = call_graph->GetNode(sub_computation);
+  EXPECT_EQ(1, sub_node.caller_callsites().size());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
index 74aa77b4f165be76fbc0a8aa1a4a7e90a8e9acec..78dc0ad4fcd167c93f19d0c2b18ea72d666897ef 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
@@ -51,83 +51,7 @@ se::Platform::Id GenericTransferManager::PlatformId() const {
   return platform_id_;
 }
 
-Status GenericTransferManager::TransferLiteralFromDevice(
-    se::StreamExecutor* executor, const se::DeviceMemoryBase& source,
-    const Shape& device_shape, const Shape& literal_shape, Literal* literal) {
-  VLOG(2) << "transferring literal shape from device: "
-          << ShapeUtil::HumanString(literal_shape)
-          << "; device location: " << source.opaque();
-  TF_RET_CHECK(ShapeUtil::Compatible(device_shape, literal_shape));
-
-  // Tuples are a special case and contain one or more shapes inside of them to
-  // an arbitrary nesting depth.
-  if (device_shape.element_type() == TUPLE) {
-    *literal->mutable_shape() = literal_shape;
-    TF_ASSIGN_OR_RETURN(
-        std::vector<se::DeviceMemoryBase> element_buffers,
-        ShallowCopyTupleFromDevice(executor, source, device_shape));
-    TF_RET_CHECK(element_buffers.size() ==
-                 ShapeUtil::TupleElementCount(device_shape));
-    for (int64 i = 0; i < element_buffers.size(); ++i) {
-      const Shape& element_device_shape = device_shape.tuple_shapes(i);
-      const Shape& element_literal_shape = literal_shape.tuple_shapes(i);
-      Literal* element_literal = literal->add_tuple_literals();
-      // Recursively call TransferFromDevice to copy over the data in the
-      // element array.
-      TF_RETURN_IF_ERROR(TransferLiteralFromDevice(
-          executor, element_buffers[i], /*device_shape=*/element_device_shape,
-          /*literal_shape=*/element_literal_shape, element_literal));
-    }
-    return Status::OK();
-  }
-
-  *literal->mutable_shape() = device_shape;
-  literal->Reserve(ShapeUtil::ElementsIn(device_shape));
-  TF_RETURN_IF_ERROR(TransferBufferFromDevice(
-      executor, source, /*size=*/ShapeUtil::ByteSizeOf(device_shape),
-      /*destination=*/literal->MutableInternalData()));
-  if (!ShapeUtil::Equal(literal_shape, device_shape)) {
-    *literal = std::move(*literal->Relayout(literal_shape.layout()));
-  }
-  TF_RET_CHECK(ShapeUtil::Equal(literal_shape, literal->shape()));
-  return Status::OK();
-}
-
-StatusOr<std::vector<se::DeviceMemoryBase>>
-GenericTransferManager::ShallowCopyTupleFromDevice(
-    se::StreamExecutor* executor, const se::DeviceMemoryBase& source,
-    const Shape& shape) {
-  TF_RET_CHECK(ShapeUtil::IsTuple(shape));
-
-  // For devices which use the GenericTransferManager, a tuple is stored as an
-  // array of pointers to buffers. Copy the contents of the tuple buffer into
-  // a vector of void* pointers.
-  std::vector<void*> element_pointers(ShapeUtil::TupleElementCount(shape),
-                                      nullptr);
-  int64 tuple_size = ShapeUtil::ByteSizeOf(shape, pointer_size_);
-  auto copy_status = executor->SynchronousMemcpyD2H(source, tuple_size,
-                                                    element_pointers.data());
-  if (!copy_status.ok()) {
-    return AddStatus(
-        Status(static_cast<tensorflow::error::Code>(copy_status.code()),
-               copy_status.error_message()),
-        "failed transfer of tuple buffer " + ShapeUtil::HumanString(shape));
-  }
-
-  // Create a DeviceMemoryBase from each void* pointer.
-  std::vector<se::DeviceMemoryBase> destination;
-  for (size_t i = 0; i < element_pointers.size(); ++i) {
-    if (element_pointers[i] == nullptr &&
-        !ShapeUtil::HasZeroElements(shape.tuple_shapes(i))) {
-      return FailedPrecondition("tuple contains nullptr at element %lu", i);
-    }
-    destination.emplace_back(element_pointers[i],
-                             GetByteSizeRequirement(shape.tuple_shapes(i)));
-  }
-  return std::move(destination);
-}
-
-Status GenericTransferManager::WriteTuplePointersToDevice(
+Status GenericTransferManager::WriteSingleTupleIndexTable(
     perftools::gputools::StreamExecutor* executor,
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> elements,
     const Shape& shape, perftools::gputools::DeviceMemoryBase* region) {
@@ -145,16 +69,19 @@ StatusOr<std::unique_ptr<Literal>>
 GenericTransferManager::TransferLiteralFromDevice(
     se::StreamExecutor* executor, const ShapedBuffer& device_buffer) {
   VLOG(2) << "transferring literal from device ordinal "
-          << executor->device_ordinal() << "; device shape: "
-          << ShapeUtil::HumanStringWithLayout(device_buffer.shape())
-          << "; opaque: " << device_buffer.buffer(/*index=*/{}).opaque();
+          << executor->device_ordinal() << "; device buffer: " << device_buffer;
   TF_RET_CHECK(executor->device_ordinal() == device_buffer.device_ordinal());
 
+  // The on-host and on-device shape should always be the same for the generic
+  // transfer manager.
+  TF_RET_CHECK(ShapeUtil::Equal(device_buffer.on_device_shape(),
+                                device_buffer.on_host_shape()));
+
   std::unique_ptr<Literal> literal =
-      Literal::CreateFromShape(device_buffer.shape());
+      Literal::CreateFromShape(device_buffer.on_host_shape());
 
   TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
-      device_buffer.shape(),
+      device_buffer.on_host_shape(),
       [&](const Shape& subshape, const ShapeIndex& index) -> Status {
         if (!ShapeUtil::IsTuple(subshape)) {
           TF_RETURN_IF_ERROR(TransferBufferFromDevice(
@@ -162,7 +89,7 @@ GenericTransferManager::TransferLiteralFromDevice(
               /*source=*/device_buffer.buffer(index),
               /*size=*/GetByteSizeRequirement(subshape),
               /*destination=*/
-              literal->GetSubliteral(index).MutableInternalData()));
+              literal->untyped_data(index)));
         }
 
         return Status::OK();
@@ -175,33 +102,39 @@ Status GenericTransferManager::TransferLiteralToDevice(
     const ShapedBuffer& device_buffer) {
   const Shape& shape = literal.shape();
   VLOG(2) << "transferring literal shape to device: "
-          << ShapeUtil::HumanString(shape) << "; device location: "
-          << device_buffer.buffer(/*index=*/{}).opaque();
+          << ShapeUtil::HumanString(shape)
+          << "; device buffer: " << device_buffer;
+
+  // The on-host and on-device shape should always be the same for the generic
+  // transfer manager.
+  TF_RET_CHECK(ShapeUtil::Equal(device_buffer.on_device_shape(),
+                                device_buffer.on_host_shape()));
 
-  TF_RET_CHECK(ShapeUtil::Compatible(literal.shape(), device_buffer.shape()));
+  TF_RET_CHECK(
+      ShapeUtil::Compatible(literal.shape(), device_buffer.on_host_shape()));
   TF_RET_CHECK(executor->device_ordinal() == device_buffer.device_ordinal());
 
   TF_RETURN_IF_ERROR(WriteTupleIndexTables(executor, device_buffer));
 
   return ShapeUtil::ForEachSubshapeWithStatus(
-      device_buffer.shape(),
+      device_buffer.on_host_shape(),
       [&](const Shape& device_subshape, const ShapeIndex& index) -> Status {
         se::DeviceMemoryBase device_memory = device_buffer.buffer(index);
         if (ShapeUtil::IsArray(device_subshape)) {
           TF_RET_CHECK(GetByteSizeRequirement(device_subshape) ==
                        device_memory.size());
           // Element is array-shaped: transfer array data to device buffer.
-          const Literal& subliteral = literal.GetSubliteral(index);
+          const auto subliteral = LiteralView::Create(literal, index);
           std::unique_ptr<Literal> relayed_out_literal;
           const void* source;
           if (LayoutUtil::Equal(device_subshape.layout(),
                                 subliteral.shape().layout())) {
-            source = subliteral.InternalData();
+            source = subliteral.untyped_data();
           } else {
             // Relayout data before transferring.
             relayed_out_literal = subliteral.Relayout(device_subshape.layout(),
                                                       /*shape_index=*/{});
-            source = relayed_out_literal->InternalData();
+            source = relayed_out_literal->untyped_data();
           }
           return TransferBufferToDevice(
               executor,
@@ -212,33 +145,6 @@ Status GenericTransferManager::TransferLiteralToDevice(
       });
 }
 
-Status GenericTransferManager::TransferLiteralToDevice(
-    se::StreamExecutor* executor, const Literal& literal,
-    se::DeviceMemoryBase* destination) {
-  const Shape& shape = literal.shape();
-  VLOG(2) << "transferring literal shape to device: "
-          << ShapeUtil::HumanString(shape)
-          << "; device location: " << destination->opaque();
-
-  if (ShapeUtil::IsTuple(literal.shape())) {
-    std::vector<void*> tuple_elements_on_device;
-    for (const Literal& tuple_element : literal.tuple_literals()) {
-      se::DeviceMemoryBase allocation = executor->AllocateArray<uint8>(
-          GetByteSizeRequirement(tuple_element.shape()));
-      TF_RETURN_IF_ERROR(
-          TransferLiteralToDevice(executor, tuple_element, &allocation));
-      tuple_elements_on_device.push_back(allocation.opaque());
-    }
-    return TransferBufferToDevice(
-        executor, tuple_elements_on_device.size() * sizeof(void*),
-        tuple_elements_on_device.data(), destination);
-  }
-
-  return TransferBufferToDevice(executor,
-                                /*size=*/GetByteSizeRequirement(shape),
-                                /*source=*/literal.InternalData(), destination);
-}
-
 Status GenericTransferManager::TransferLiteralToInfeed(
     se::StreamExecutor* executor, const Literal& literal) {
   return Unimplemented("Generic transfer to Infeed");
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.h b/tensorflow/compiler/xla/service/generic_transfer_manager.h
index 50dca6aec5012f0b02cb54846b622f008600e48e..63a7c820cf4e5fbbdf870086a4fb5316ac50d10b 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.h
@@ -42,16 +42,6 @@ class GenericTransferManager : public TransferManager {
 
   perftools::gputools::Platform::Id PlatformId() const override;
 
-  Status TransferLiteralFromDevice(
-      perftools::gputools::StreamExecutor* executor,
-      const perftools::gputools::DeviceMemoryBase& source,
-      const Shape& device_shape, const Shape& literal_shape,
-      Literal* literal) override;
-
-  Status TransferLiteralToDevice(
-      perftools::gputools::StreamExecutor* executor, const Literal& literal,
-      perftools::gputools::DeviceMemoryBase* destination) override;
-
   StatusOr<std::unique_ptr<Literal>> TransferLiteralFromDevice(
       perftools::gputools::StreamExecutor* executor,
       const ShapedBuffer& device_buffer) override;
@@ -62,9 +52,6 @@ class GenericTransferManager : public TransferManager {
 
   Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
                                  const Literal& literal) override;
-  Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
-                                int64 size, const void* source) override;
-
   Status TransferLiteralFromOutfeed(
       perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
       Literal* literal) override;
@@ -73,16 +60,13 @@ class GenericTransferManager : public TransferManager {
       tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
           executors) override;
 
-  StatusOr<std::vector<perftools::gputools::DeviceMemoryBase>>
-  ShallowCopyTupleFromDevice(
-      perftools::gputools::StreamExecutor* executor,
-      const perftools::gputools::DeviceMemoryBase& source,
-      const Shape& shape) override;
-
   int64 GetByteSizeRequirement(const Shape& shape) const override;
 
  protected:
-  Status WriteTuplePointersToDevice(
+  Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
+                                int64 size, const void* source) override;
+
+  Status WriteSingleTupleIndexTable(
       perftools::gputools::StreamExecutor* executor,
       tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
           elements,
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index e57558b5788965214cadf5eab1024860f1a39ca1..9da4fb97fa27a238fead74985cb481a9be1f4a65 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -23,6 +23,15 @@ filegroup(
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
+cc_library(
+    name = "gpu_constants",
+    srcs = ["gpu_constants.cc"],
+    hdrs = ["gpu_constants.h"],
+    deps = [
+        "//tensorflow/compiler/xla:types",
+    ],
+)
+
 cc_library(
     name = "partition_assignment",
     srcs = [
@@ -120,9 +129,13 @@ cc_library(
     hdrs = [
         "ir_emitter.h",
         "ir_emitter_context.h",
+        "ir_emitter_nested.h",
+        "ir_emitter_unnested.h",
     ],
     deps = [
+        ":cudnn_convolution_runner",
         ":elemental_ir_emitter",
+        ":gpu_constants",
         ":gpu_executable",
         ":hlo_to_ir_bindings",
         ":ir_emission_utils",
@@ -203,6 +216,7 @@ cc_library(
     srcs = ["buffer_allocations.cc"],
     hdrs = ["buffer_allocations.h"],
     deps = [
+        ":gpu_constants",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
@@ -217,8 +231,11 @@ cc_library(
 cc_library(
     name = "gpu_executable",
     srcs = [
+        "conditional_thunk.cc",
         "convolution_thunk.cc",
         "copy_thunk.cc",
+        "cudnn_batchnorm_thunk.cc",
+        "fft_thunk.cc",
         "for_thunk.cc",
         "gemm_thunk.cc",
         "gpu_executable.cc",
@@ -230,8 +247,11 @@ cc_library(
         "while_thunk.cc",
     ],
     hdrs = [
+        "conditional_thunk.h",
         "convolution_thunk.h",
         "copy_thunk.h",
+        "cudnn_batchnorm_thunk.h",
+        "fft_thunk.h",
         "for_thunk.h",
         "gemm_thunk.h",
         "gpu_executable.h",
@@ -245,7 +265,9 @@ cc_library(
     ],
     deps = [
         ":buffer_allocations",
+        ":cudnn_convolution_runner",
         ":infeed_manager",
+        ":ir_emission_utils",
         ":partition_assignment",
         ":stream_assignment",
         "//tensorflow/compiler/xla:array2d",
@@ -269,6 +291,7 @@ cc_library(
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core/platform/default/build_config:cublas_plugin",
         "//tensorflow/core/platform/default/build_config:cudnn_plugin",
+        "//tensorflow/core/platform/default/build_config:cufft_plugin",
         "//tensorflow/core/platform/default/build_config:stream_executor_cuda",  # build_cleaner: keep
     ],
 )
@@ -290,9 +313,41 @@ cc_library(
 )
 
 cc_library(
-    name = "convolution_folding",
-    srcs = ["convolution_folding.cc"],
-    hdrs = ["convolution_folding.h"],
+    name = "cudnn_convolution_algorithm_picker",
+    srcs = ["cudnn_convolution_algorithm_picker.cc"],
+    hdrs = ["cudnn_convolution_algorithm_picker.h"],
+    deps = [
+        ":cudnn_convolution_runner",
+        ":gpu_executable",
+        ":ir_emission_utils",
+        "//tensorflow/compiler/xla/service:device_memory_allocator",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_pass",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+    ],
+)
+
+cc_library(
+    name = "cudnn_convolution_runner",
+    srcs = ["cudnn_convolution_runner.cc"],
+    hdrs = ["cudnn_convolution_runner.h"],
+    deps = [
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:stream_executor_no_cuda",
+    ],
+)
+
+cc_library(
+    name = "cudnn_convolution_rewriter",
+    srcs = ["cudnn_convolution_rewriter.cc"],
+    hdrs = ["cudnn_convolution_rewriter.h"],
     deps = [
         ":ir_emission_utils",
         "//tensorflow/compiler/xla:literal_util",
@@ -306,15 +361,18 @@ cc_library(
 )
 
 tf_cc_test(
-    name = "convolution_folding_test",
-    srcs = ["convolution_folding_test.cc"],
+    name = "cudnn_convolution_rewriter_test",
+    srcs = ["cudnn_convolution_rewriter_test.cc"],
     deps = [
-        ":convolution_folding",
+        ":cudnn_convolution_rewriter",
+        ":ir_emission_utils",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
         "//tensorflow/compiler/xla/service:shape_inference",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//tensorflow/core:test",
     ],
 )
@@ -427,15 +485,18 @@ cc_library(
     srcs = ["gpu_compiler.cc"],
     hdrs = ["gpu_compiler.h"],
     deps = [
-        ":convolution_folding",
+        ":cudnn_convolution_algorithm_picker",
+        ":cudnn_convolution_rewriter",
         ":fusion_merger",
+        ":gpu_constants",
         ":gpu_copy_insertion",
         ":gpu_executable",
+        ":gpu_hlo_support_checker",
+        ":gpu_layout_assignment",
         ":hlo_schedule",
         ":instruction_fusion",
         ":ir_emission_utils",
         ":ir_emitter",
-        ":layout_assignment",
         ":pad_insertion",
         ":partition_assignment",
         ":stream_assignment",
@@ -445,16 +506,18 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:algebraic_simplifier",
-        "//tensorflow/compiler/xla/service:batchnorm_rewriter",
+        "//tensorflow/compiler/xla/service:batchnorm_expander",
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:buffer_liveness",
         "//tensorflow/compiler/xla/service:call_inliner",
+        "//tensorflow/compiler/xla/service:dot_decomposer",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_constant_folding",
         "//tensorflow/compiler/xla/service:hlo_cse",
         "//tensorflow/compiler/xla/service:hlo_dce",
+        "//tensorflow/compiler/xla/service:hlo_element_type_converter",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/compiler/xla/service:hlo_pass_pipeline",
         "//tensorflow/compiler/xla/service:hlo_proto",
@@ -467,11 +530,14 @@ cc_library(
         "//tensorflow/compiler/xla/service:transpose_folding",
         "//tensorflow/compiler/xla/service:tuple_simplifier",
         "//tensorflow/compiler/xla/service:while_loop_simplifier",
+        "//tensorflow/compiler/xla/service:zero_sized_hlo_elimination",
+        "//tensorflow/compiler/xla/service/gpu:cudnn_batchnorm_rewriter",
         "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/core:cuda_libdevice_path",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:regexp_internal",
         "//tensorflow/core:stream_executor_no_cuda",
         "@llvm//:core",
         "@llvm//:support",
@@ -479,6 +545,18 @@ cc_library(
     alwayslink = True,  # Contains compiler registration
 )
 
+cc_library(
+    name = "cudnn_batchnorm_rewriter",
+    srcs = ["cudnn_batchnorm_rewriter.cc"],
+    hdrs = ["cudnn_batchnorm_rewriter.h"],
+    deps = [
+        ":ir_emission_utils",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_pass",
+    ],
+)
+
 cc_library(
     name = "infeed_manager",
     srcs = ["infeed_manager.cc"],
@@ -492,9 +570,9 @@ cc_library(
 )
 
 cc_library(
-    name = "layout_assignment",
-    srcs = ["layout_assignment.cc"],
-    hdrs = ["layout_assignment.h"],
+    name = "gpu_layout_assignment",
+    srcs = ["gpu_layout_assignment.cc"],
+    hdrs = ["gpu_layout_assignment.h"],
     deps = [
         ":ir_emission_utils",
         "//tensorflow/compiler/xla:shape_util",
@@ -508,17 +586,18 @@ cc_library(
 )
 
 tf_cc_test(
-    name = "layout_assignment_test",
-    srcs = ["layout_assignment_test.cc"],
+    name = "gpu_layout_assignment_test",
+    srcs = ["gpu_layout_assignment_test.cc"],
     deps = [
-        ":layout_assignment",
+        ":gpu_layout_assignment",
+        ":ir_emission_utils",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:computation_layout",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # build_cleaner: keep
     ],
 )
 
@@ -586,6 +665,32 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "gpu_hlo_support_checker",
+    srcs = ["gpu_hlo_support_checker.cc"],
+    hdrs = ["gpu_hlo_support_checker.h"],
+    deps = [
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_pass",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "gpu_hlo_support_checker_test",
+    srcs = ["gpu_hlo_support_checker_test.cc"],
+    deps = [
+        ":gpu_hlo_support_checker",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 
 filegroup(
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
index 9fdf717b5d463010e2709b6209c070f25555de72..2029c303d47e9a62135b003c3bd9be6f8b3438d4 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -48,6 +49,15 @@ StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
     // If buffer #i's address is already registered (e.g. external arguments or
     // result buffers), use that registered buffer.
     if (registered_buffers_.count(i)) {
+      se::DeviceMemoryBase address = FindOrDie(registered_buffers_, i);
+      if (reinterpret_cast<uintptr_t>(address.opaque()) %
+              kCudaMallocAlignBytes !=
+          0) {
+        return InternalError(
+            "Address of registered buffer %lld must be a multiple of %llx, but "
+            "was %p",
+            i, kCudaMallocAlignBytes, address.opaque());
+      }
       buffer_allocations->SetBuffer(i, FindOrDie(registered_buffers_, i));
       continue;
     }
@@ -67,6 +77,14 @@ StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
               tensorflow::strings::HumanReadableNumBytes(buffer_size).c_str(),
               i);
         }
+        if (reinterpret_cast<uintptr_t>(buffer_address.opaque()) %
+                kCudaMallocAlignBytes !=
+            0) {
+          return InternalError(
+              "Address returned by memory_allocator->Allocate must be a "
+              "multiple of %llx, but was %p",
+              kCudaMallocAlignBytes, buffer_address.opaque());
+        }
       }
       buffer_allocations->SetBuffer(i, buffer_address);
       if (allocation.IsPreallocatedTempBuffer()) {
@@ -80,6 +98,14 @@ StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
     }
   }
 
+  if (VLOG_IS_ON(2)) {
+    for (BufferAllocation::Index i = 0; i < num_buffers; ++i) {
+      const auto& buf = buffer_allocations->buffers_[i];
+      VLOG(2) << "Buffer " << i << " -> " << buf.opaque() << " (" << buf.size()
+              << "B)";
+    }
+  }
+
   return std::move(buffer_allocations);
 }
 
diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
new file mode 100644
index 0000000000000000000000000000000000000000..790ca535b11ee47724ef6227de40726d940d6153
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc
@@ -0,0 +1,72 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/conditional_thunk.h"
+
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+namespace gpu {
+
+ConditionalThunk::ConditionalThunk(
+    const BufferAllocation::Slice& predicate_buffer_index,
+    const BufferAllocation::Slice& true_operand_buffer_index,
+    const BufferAllocation::Slice& false_operand_buffer_index,
+    ThunkSequence true_thunk_sequence, ThunkSequence false_thunk_sequence,
+    const HloInstruction* hlo)
+    : Thunk(Kind::kConditional, hlo),
+      predicate_buffer_index_(predicate_buffer_index),
+      true_operand_buffer_index_(true_operand_buffer_index),
+      false_operand_buffer_index_(false_operand_buffer_index),
+      true_thunk_(std::move(true_thunk_sequence), hlo),
+      false_thunk_(std::move(false_thunk_sequence), hlo) {}
+
+Status ConditionalThunk::Initialize(const GpuExecutable& executable) {
+  TF_RETURN_IF_ERROR(true_thunk_.Initialize(executable));
+  TF_RETURN_IF_ERROR(false_thunk_.Initialize(executable));
+  return Status::OK();
+}
+
+Status ConditionalThunk::ExecuteOnStream(
+    const BufferAllocations& buffer_allocations,
+    perftools::gputools::Stream* stream) {
+  // Copy the predicate value from device.
+  bool predicate;
+  perftools::gputools::DeviceMemoryBase predicate_address =
+      buffer_allocations.GetDeviceAddress(predicate_buffer_index_);
+  stream->ThenMemcpy(&predicate, predicate_address, sizeof(bool));
+
+  Status block_status = stream->BlockHostUntilDone();
+  if (!block_status.ok()) {
+    return InternalError("Failed to retrieve predicate value on stream %p: %s.",
+                         stream, block_status.error_message().c_str());
+  }
+
+  // Execute the true or the false computation depending on the value of the
+  // predicate.
+  if (predicate) {
+    TF_RETURN_IF_ERROR(true_thunk_.ExecuteOnStream(buffer_allocations, stream));
+  } else {
+    TF_RETURN_IF_ERROR(
+        false_thunk_.ExecuteOnStream(buffer_allocations, stream));
+  }
+
+  return Status::OK();
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h
new file mode 100644
index 0000000000000000000000000000000000000000..7725c46a3b4b51af34a4dd977885353ff32c21f6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h
@@ -0,0 +1,65 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONDITIONAL_THUNK_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONDITIONAL_THUNK_H_
+
+#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
+#include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h"
+#include "tensorflow/compiler/xla/service/gpu/thunk.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+namespace gpu {
+
+// ConditionalThunk implements the conditional instruction on GPU by reading the
+// predicate of the conditional and executing the true or the false computation
+// depending on the value of the predicate.
+//
+// ConditionalThunk assumes that the buffers of the conditional result and the
+// result of the true and false computations share the same allocation. Also,
+// the buffers of the true operand of the conditional and that of the parameter
+// instruction of the true computation share the same allocation. Similarly, the
+// buffers of the false operand and that of the parameter instruction of the
+// false computation share the same allocation.
+class ConditionalThunk : public Thunk {
+ public:
+  ConditionalThunk(const BufferAllocation::Slice& predicate_buffer_index,
+                   const BufferAllocation::Slice& true_operand_buffer_index,
+                   const BufferAllocation::Slice& false_operand_buffer_index,
+                   ThunkSequence true_thunk_sequence,
+                   ThunkSequence false_thunk_sequence,
+                   const HloInstruction* hlo);
+
+  ConditionalThunk(const ConditionalThunk&) = delete;
+  ConditionalThunk& operator=(const ConditionalThunk&) = delete;
+
+  Status Initialize(const GpuExecutable& executable) override;
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         perftools::gputools::Stream* stream) override;
+
+ private:
+  BufferAllocation::Slice predicate_buffer_index_;
+  BufferAllocation::Slice true_operand_buffer_index_;
+  BufferAllocation::Slice false_operand_buffer_index_;
+  SequentialThunk true_thunk_;
+  SequentialThunk false_thunk_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONDITIONAL_THUNK_H_
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
index 037eec8ef59e1aeccdfc43dbb5c1a852403780d1..15bba49b73bce8eb4a18175f8874f05049119458 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <string>
 
+#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -36,364 +37,70 @@ using se::dnn::DataLayout;
 using se::dnn::FilterDescriptor;
 using se::dnn::FilterLayout;
 
-ConvolveScratchAllocator::ConvolveScratchAllocator(
-    int device_ordinal, DeviceMemoryAllocator* memory_allocator)
-    : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}
-
-ConvolveScratchAllocator::~ConvolveScratchAllocator() {
-  for (auto& allocated_buffer : allocated_buffers_) {
-    if (!memory_allocator_->Deallocate(device_ordinal_, &allocated_buffer)
-             .ok()) {
-      // The program can still continue with failed deallocation.
-      LOG(ERROR) << "Failed to deallocate the allocated buffer: "
-                 << allocated_buffer.opaque();
-    }
-  }
-}
-
-int64 ConvolveScratchAllocator::GetMemoryLimitInBytes(se::Stream* stream) {
-  constexpr int64 kConvolveScratchSize = 1LL << 32;  // 4GB by default.
-  return kConvolveScratchSize;
-}
-
-se::port::StatusOr<se::DeviceMemory<uint8>>
-ConvolveScratchAllocator::AllocateBytes(se::Stream* stream, int64 byte_size) {
-  CHECK_GE(byte_size, 0) << "byte_size must be positive.";
-  if (byte_size > GetMemoryLimitInBytes(stream)) {
-    return se::port::Status(
-        se::port::error::RESOURCE_EXHAUSTED,
-        tensorflow::strings::Printf(
-            "Allocating %lld bytes exceeds the memory limit of %lld bytes.",
-            byte_size, GetMemoryLimitInBytes(stream)));
-  }
-
-  auto status_or_memory =
-      memory_allocator_->Allocate(device_ordinal_, byte_size,
-                                  /*retry_on_failure=*/false);
-  if (!status_or_memory.ok()) {
-    return se::port::Status(se::port::error::RESOURCE_EXHAUSTED,
-                            tensorflow::strings::Printf(
-                                "Failed to allocate %lld bytes on device %d.",
-                                byte_size, device_ordinal_));
-  }
-  se::DeviceMemoryBase allocated_buffer = status_or_memory.ValueOrDie();
-  allocated_buffers_.push_back(allocated_buffer);
-  total_allocated_bytes_ += byte_size;
-  return se::DeviceMemory<uint8>(allocated_buffer);
-}
-
-string ConvolutionKindToString(
-    ConvolutionThunk::ConvolutionKind convolution_kind) {
-  switch (convolution_kind) {
-    case ConvolutionThunk::ConvolutionKind::kForward:
-      return "forward";
-    case ConvolutionThunk::ConvolutionKind::kBackwardFilter:
-      return "backward_filter";
-    case ConvolutionThunk::ConvolutionKind::kBackwardInput:
-      return "backward_input";
-  }
-  return "unknown convolution kind";
-}
-
 ConvolutionThunk::ConvolutionThunk(
-    ConvolutionKind convolution_kind,
-    const BufferAllocation::Slice& input_buffer,
+    CudnnConvKind convolution_kind, const BufferAllocation::Slice& input_buffer,
     const BufferAllocation::Slice& filter_buffer,
-    const BufferAllocation::Slice& output_buffer, const Shape& input_shape,
+    const BufferAllocation::Slice& output_buffer,
+    const BufferAllocation::Slice& tuple_result_buffer,
+    const BufferAllocation::Slice& scratch_buffer, const Shape& input_shape,
     const Shape& filter_shape, const Shape& output_shape, const Window& window,
-    const ConvolutionDimensionNumbers& dim_nums, const HloInstruction* hlo)
+    const ConvolutionDimensionNumbers& dim_nums, int64 algorithm,
+    bool tensor_ops_enabled, const HloInstruction* hlo)
     : Thunk(Kind::kConvolution, hlo),
       convolution_kind_(convolution_kind),
       input_buffer_(input_buffer),
       filter_buffer_(filter_buffer),
       output_buffer_(output_buffer),
+      tuple_result_buffer_(tuple_result_buffer),
+      scratch_buffer_(scratch_buffer),
       input_shape_(input_shape),
       filter_shape_(filter_shape),
       output_shape_(output_shape),
       window_(window),
-      dim_nums_(dim_nums) {}
+      dim_nums_(dim_nums),
+      algorithm_(algorithm),
+      tensor_ops_enabled_(tensor_ops_enabled) {}
 
-tensorflow::Status ConvolutionThunk::ExecuteOnStream(
+Status ConvolutionThunk::ExecuteOnStream(
     const BufferAllocations& buffer_allocations, se::Stream* stream) {
-  VLOG(3) << "Convolution kind: " << ConvolutionKindToString(convolution_kind_);
-  VLOG(3) << "input shape: { " << input_shape_.ShortDebugString() << " }";
-  VLOG(3) << "filter shape: { " << filter_shape_.ShortDebugString() << " }";
-  VLOG(3) << "Output shape: { " << output_shape_.ShortDebugString() << " }";
-  VLOG(3) << "Dim nums: { " << dim_nums_.ShortDebugString() << " }";
-  VLOG(3) << "Window: { " << window_.ShortDebugString() << " }";
-
-  const int num_dimensions = window_.dimensions_size();
-  CHECK_LE(num_dimensions, 3);
-  // cuDNN does not support 1D convolutions. We therefore express 1D
-  // convolutions as 2D convolutions where the first spatial dimension is 1.
-  // This matches the behavior of TF (see definition of conv1d in
-  // tensorflow/python/ops/nn_ops.py).
-  const int effective_num_dimensions = std::max(2, num_dimensions);
-
-  CHECK_EQ(F32, output_shape_.element_type());
-  CHECK_EQ(num_dimensions, dim_nums_.input_spatial_dimensions_size());
-  CHECK_EQ(num_dimensions, dim_nums_.kernel_spatial_dimensions_size());
-  CHECK_EQ(num_dimensions, dim_nums_.output_spatial_dimensions_size());
-  for (const WindowDimension& dim : window_.dimensions()) {
-    CHECK_EQ(dim.padding_low(), dim.padding_high());
-  }
-
-  // cuDNN's convolution APIs support the BDYX layout for activations/output and
-  // the OIYX layout for weights.
-  BatchDescriptor input_descriptor(effective_num_dimensions);
-  input_descriptor.set_layout(DataLayout::kBatchDepthYX)
-      .set_feature_map_count(
-          input_shape_.dimensions(dim_nums_.input_feature_dimension()))
-      .set_count(input_shape_.dimensions(dim_nums_.input_batch_dimension()));
-  for (int dim = 0; dim < num_dimensions; ++dim) {
-    // Note that the dimensions are reversed. The same holds below.
-    input_descriptor.set_spatial_dim(
-        static_cast<se::dnn::DimIndex>(effective_num_dimensions - dim - 1),
-        input_shape_.dimensions(dim_nums_.input_spatial_dimensions(dim)));
-  }
-
-  FilterDescriptor filter_descriptor(effective_num_dimensions);
-  filter_descriptor.set_layout(FilterLayout::kOutputInputYX)
-      .set_input_feature_map_count(
-          filter_shape_.dimensions(dim_nums_.kernel_input_feature_dimension()))
-      .set_output_feature_map_count(filter_shape_.dimensions(
-          dim_nums_.kernel_output_feature_dimension()));
-  for (int dim = 0; dim < num_dimensions; ++dim) {
-    filter_descriptor.set_spatial_dim(
-        static_cast<se::dnn::DimIndex>(effective_num_dimensions - dim - 1),
-        filter_shape_.dimensions(dim_nums_.kernel_spatial_dimensions(dim)));
-  }
-
-  ConvolutionDescriptor convolution_descriptor(effective_num_dimensions);
-  for (int dim = 0; dim < num_dimensions; ++dim) {
-    convolution_descriptor
-        .set_zero_padding(
-            static_cast<se::dnn::DimIndex>(effective_num_dimensions - dim - 1),
-            window_.dimensions(dim).padding_low())
-        .set_filter_stride(
-            static_cast<se::dnn::DimIndex>(effective_num_dimensions - dim - 1),
-            window_.dimensions(dim).stride());
-  }
-
-  BatchDescriptor output_descriptor(effective_num_dimensions);
-  output_descriptor.set_layout(DataLayout::kBatchDepthYX)
-      .set_feature_map_count(
-          output_shape_.dimensions(dim_nums_.output_feature_dimension()))
-      .set_count(output_shape_.dimensions(dim_nums_.output_batch_dimension()));
-  for (int dim = 0; dim < num_dimensions; ++dim) {
-    output_descriptor.set_spatial_dim(
-        static_cast<se::dnn::DimIndex>(effective_num_dimensions - dim - 1),
-        output_shape_.dimensions(dim_nums_.output_spatial_dimensions(dim)));
-  }
-
-  // Add a singleton dimension in the 1D convolution case.
-  if (num_dimensions == 1) {
-    input_descriptor.set_spatial_dim(static_cast<se::dnn::DimIndex>(0), 1);
-    output_descriptor.set_spatial_dim(static_cast<se::dnn::DimIndex>(0), 1);
-    filter_descriptor.set_spatial_dim(static_cast<se::dnn::DimIndex>(0), 1);
-    convolution_descriptor
-        .set_zero_padding(static_cast<se::dnn::DimIndex>(0), 0)
-        .set_filter_stride(static_cast<se::dnn::DimIndex>(0), 1);
-  }
-
   se::DeviceMemory<float> input_data(
       buffer_allocations.GetDeviceAddress(input_buffer_));
   se::DeviceMemory<float> filter_data(
       buffer_allocations.GetDeviceAddress(filter_buffer_));
   se::DeviceMemory<float> output_data(
       buffer_allocations.GetDeviceAddress(output_buffer_));
-  return ConvolveWithTune(input_descriptor, input_data, filter_descriptor,
-                          filter_data, output_descriptor, output_data,
-                          convolution_descriptor, buffer_allocations, stream);
-}
-
-tensorflow::Status ConvolutionThunk::Convolve(
-    const BatchDescriptor& input_descriptor, se::DeviceMemory<float> input_data,
-    const FilterDescriptor& filter_descriptor,
-    se::DeviceMemory<float> filter_data,
-    const BatchDescriptor& output_descriptor,
-    se::DeviceMemory<float> output_data,
-    const ConvolutionDescriptor& convolution_descriptor,
-    const se::dnn::AlgorithmConfig& algorithm_config, se::Stream* stream,
-    ConvolveScratchAllocator* scratch_allocator,
-    se::dnn::ProfileResult* profile_result) {
-  bool launch_ok;
-  switch (convolution_kind_) {
-    case ConvolutionKind::kBackwardFilter:
-      launch_ok =
-          stream
-              ->ThenConvolveBackwardFilterWithAlgorithm(
-                  input_descriptor, input_data, output_descriptor, output_data,
-                  convolution_descriptor, filter_descriptor, &filter_data,
-                  scratch_allocator, algorithm_config, profile_result)
-              .ok();
-      break;
-    case ConvolutionKind::kBackwardInput:
-      launch_ok = stream
-                      ->ThenConvolveBackwardDataWithAlgorithm(
-                          filter_descriptor, filter_data, output_descriptor,
-                          output_data, convolution_descriptor, input_descriptor,
-                          &input_data, scratch_allocator, algorithm_config,
-                          profile_result)
-                      .ok();
-      break;
-    case ConvolutionKind::kForward:
-      launch_ok =
-          stream
-              ->ThenConvolveWithAlgorithm(
-                  input_descriptor, input_data, filter_descriptor, filter_data,
-                  convolution_descriptor, output_descriptor, &output_data,
-                  scratch_allocator, algorithm_config, profile_result)
-              .ok();
-      break;
-  }
-  if (launch_ok) {
-    return tensorflow::Status::OK();
-  }
-  return InternalError(
-      "Unable to launch convolution for thunk %p with type %s and algorithm "
-      "(%lld, %lld)",
-      this, ConvolutionKindToString(convolution_kind_).c_str(),
-      algorithm_config.algorithm().algo_id(),
-      algorithm_config.algorithm_no_scratch().algo_id());
-}
-
-std::vector<AlgorithmDesc> ConvolutionThunk::GetAlgorithms(
-    bool with_winograd_nonfused, se::StreamExecutor* stream_exec) const {
-  std::vector<AlgorithmDesc> algorithms;
-  switch (convolution_kind_) {
-    case ConvolutionKind::kBackwardFilter:
-      CHECK(stream_exec->GetConvolveBackwardFilterAlgorithms(
-          with_winograd_nonfused, &algorithms));
-      break;
-    case ConvolutionKind::kBackwardInput:
-      CHECK(stream_exec->GetConvolveBackwardDataAlgorithms(
-          with_winograd_nonfused, &algorithms));
-      break;
-    case ConvolutionKind::kForward:
-      CHECK(stream_exec->GetConvolveAlgorithms(with_winograd_nonfused,
-                                               &algorithms));
-      break;
-  }
-  return algorithms;
-}
-
-static string AlgorithmToString(const se::dnn::AlgorithmDesc& algo) {
-  if (algo.tensor_ops_enabled()) {
-    return tensorflow::strings::StrCat(algo.algo_id(), "+TC");
-  }
-  return tensorflow::strings::StrCat(algo.algo_id());
-}
-
-// Determines whether we can safely perform a winograd non-fused convolution for
-// the given input and output descriptors.  This works around b/68264959, an
-// integer overflow in cuDNNv5 and cuDNNv6.
-static bool ShouldIncludeWinogradNonfusedAlgo(
-    const BatchDescriptor& input_descriptor,
-    const BatchDescriptor& output_descriptor) {
-  int64 batch = input_descriptor.count();
-  int64 in_depths = input_descriptor.feature_map_count();
-  int64 in_rows = input_descriptor.height();
-  int64 in_cols = input_descriptor.width();
-  int64 out_depths = output_descriptor.feature_map_count();
-
-  int64 total_size = 16 * std::ceil(batch / 16.0) *
-                     std::max(in_depths, out_depths) * in_cols * in_rows *
-                     sizeof(float);
-  int64 threshold = 1L << 31;
-
-  return total_size < threshold;
-}
-
-tensorflow::Status ConvolutionThunk::ConvolveWithTune(
-    const BatchDescriptor& input_descriptor, se::DeviceMemory<float> input_data,
-    const FilterDescriptor& filter_descriptor,
-    se::DeviceMemory<float> filter_data,
-    const BatchDescriptor& output_descriptor,
-    se::DeviceMemory<float> output_data,
-    const ConvolutionDescriptor& convolution_descriptor,
-    const BufferAllocations& buffer_allocations, se::Stream* stream) {
-  // TODO(b/29126320): Try cudnn v5's new auto-tuner when it's rolled out.
-  if (best_algorithm_.algorithm().is_default()) {
-    // Auto-tuning either is disabled or only happens in the first run of this
-    // function.
-    VLOG(2) << "Profiling for best convolution algorithm used for "
-               "ConvolutionThunk: "
-            << this;
-
-    bool with_winograd_nonfused =
-        ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor);
-
-    se::dnn::ProfileResult best_result;
-    se::dnn::ProfileResult best_result_without_scratch;
-    std::vector<AlgorithmDesc> algorithms =
-        GetAlgorithms(with_winograd_nonfused, stream->parent());
-    for (auto algorithm : algorithms) {
-      ConvolveScratchAllocator scratch_allocator(
-          buffer_allocations.device_ordinal(),
-          buffer_allocations.memory_allocator());
-      se::dnn::ProfileResult profile_result;
-      VLOG(3) << "Trying algorithm " << AlgorithmToString(algorithm)
-              << " for ConvolutionThunk: " << this;
-      bool launch_ok =
-          Convolve(input_descriptor, input_data, filter_descriptor, filter_data,
-                   output_descriptor, output_data, convolution_descriptor,
-                   se::dnn::AlgorithmConfig(algorithm, algorithm), stream,
-                   &scratch_allocator, &profile_result)
-              .ok();
-      if (launch_ok && profile_result.is_valid()) {
-        VLOG(3) << "Run of algorithm " << AlgorithmToString(algorithm)
-                << " for ConvolutionThunk " << this << " succeeded, taking "
-                << profile_result.elapsed_time_in_ms()
-                << "ms. (Best result: " << best_result.elapsed_time_in_ms()
-                << "ms)";
-        if (profile_result.elapsed_time_in_ms() <
-            best_result.elapsed_time_in_ms()) {
-          best_result = profile_result;
-        }
-        if (scratch_allocator.TotalAllocatedBytes() == 0 &&
-            profile_result.elapsed_time_in_ms() <
-                best_result_without_scratch.elapsed_time_in_ms()) {
-          best_result_without_scratch = profile_result;
-        }
-      } else {
-        VLOG(3) << "Run of algorithm " << AlgorithmToString(algorithm)
-                << " for ConvolutionThunk " << this << " failed.";
-      }
-    }
-
-    if (best_result.is_valid()) {
-      best_algorithm_.set_algorithm(best_result.algorithm());
-    } else {
-      LOG(ERROR) << "No convolution algorithm works with profiling. Fall back "
-                    "to the default algorithm.";
-      best_algorithm_.set_algorithm(AlgorithmDesc());
+  se::DeviceMemoryBase scratch =
+      buffer_allocations.GetDeviceAddress(scratch_buffer_);
+
+  se::dnn::AlgorithmConfig algorithm_config(
+      se::dnn::AlgorithmDesc(algorithm_, tensor_ops_enabled_));
+
+  TF_RETURN_IF_ERROR(RunCudnnConvolution(
+      convolution_kind_, input_shape_, filter_shape_, output_shape_, input_data,
+      filter_data, output_data, scratch, window_, dim_nums_, algorithm_config,
+      stream));
+
+  // Figure out which of output/input/filter is the result produced by this op,
+  // and write the result tuple.
+  void* result_ptr = [&] {
+    switch (convolution_kind_) {
+      case CudnnConvKind::kForward:
+        return output_data.opaque();
+      case CudnnConvKind::kBackwardInput:
+        return input_data.opaque();
+      case CudnnConvKind::kBackwardFilter:
+        return filter_data.opaque();
     }
+  }();
+  void* ptrs[] = {result_ptr, scratch.opaque()};
+  se::DeviceMemory<void*> tuple_addr(
+      buffer_allocations.GetDeviceAddress(tuple_result_buffer_));
+  stream->ThenMemcpyH2D<void*>(ptrs, &tuple_addr);
 
-    if (best_result_without_scratch.is_valid()) {
-      best_algorithm_.set_algorithm_no_scratch(
-          best_result_without_scratch.algorithm());
-    } else {
-      LOG(ERROR) << "No convolution algorithm without scratch works with "
-                    "profiling. Fall back "
-                    "to the default algorithm.";
-      best_algorithm_.set_algorithm_no_scratch(AlgorithmDesc());
-    }
-  }
-
-  {
-    VLOG(2) << "Using convolution algorithm ("
-            << AlgorithmToString(best_algorithm_.algorithm()) << ", "
-            << AlgorithmToString(best_algorithm_.algorithm_no_scratch())
-            << ") for ConvolutionThunk: " << this;
-    ConvolveScratchAllocator scratch_allocator(
-        buffer_allocations.device_ordinal(),
-        buffer_allocations.memory_allocator());
-    return Convolve(input_descriptor, input_data, filter_descriptor,
-                    filter_data, output_descriptor, output_data,
-                    convolution_descriptor, best_algorithm_, stream,
-                    &scratch_allocator, nullptr);
+  if (!stream->ok()) {
+    return InternalError("ConvolutionThunk::ExecuteOnStream failed.");
   }
+  return Status::OK();
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
index 5ac5db2f04b6796c6013a7f87dd40b485233baa6..900d9cb6243088b56a1825fb3ab8c06cf8d74726 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
@@ -18,89 +18,60 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
 #include "tensorflow/compiler/xla/service/gpu/thunk.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
 namespace xla {
 namespace gpu {
 
-// A one-time scratch allocator for forward and backward convolution. The
-// scratch buffers allocated are released on destruction.
-//
-// Not thread-safe.
-class ConvolveScratchAllocator : public perftools::gputools::ScratchAllocator {
- public:
-  ConvolveScratchAllocator(int device_ordinal,
-                           DeviceMemoryAllocator* memory_allocator);
-
-  ~ConvolveScratchAllocator() override;
-
-  int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override;
-
-  int64 TotalAllocatedBytes() { return total_allocated_bytes_; }
-
-  perftools::gputools::port::StatusOr<perftools::gputools::DeviceMemory<uint8>>
-  AllocateBytes(perftools::gputools::Stream* stream, int64 byte_size) override;
-
- private:
-  const int device_ordinal_;
-  DeviceMemoryAllocator* memory_allocator_;
-  std::vector<perftools::gputools::DeviceMemoryBase> allocated_buffers_;
-  int64 total_allocated_bytes_ = 0;
-};
-
 // This class stores everything that StreamExecutor needs to launch a BNN
 // convolution. It is generated by IrEmitter.
 //
 // This is thread-compatible.
 class ConvolutionThunk : public Thunk {
  public:
-  // ConvolutionThunk performs one of the following types of convolution.
-  enum class ConvolutionKind {
-    kBackwardFilter,  // Backward convolution for filter.
-    kBackwardInput,   // Backward convolution for input.
-    kForward,         // Forward convolution.
-  };
-
-  // Constructs a thunk for launching a DNN convolution.
+  // Constructs a thunk for launching a DNN convolution.  When run, it will
+  // write a tuple (result, scratch_memory) into `tuple_result_buffer`.
+  //
+  // `algorithm` is a cudnn algorithm number.  `algorithm == -1` indicates that
+  // we should use the default (i.e. baseline) cudnn algorithm.
+  //
+  // Note that "output" here doesn't refer to the output from running this
+  // thunk, but rather to the "output" of a hypothetical forward convolution
+  // that corresponds to this input+filter+output triple.  That is, the result
+  // generated by this thunk is "output" for forward convs, "input" for
+  // backward-input convs, and "filter" for backward-filter convs.
+  //
   // Semantics of null hlo_instruction argument are as in Thunk.
-  ConvolutionThunk(ConvolutionKind convolution_kind,
+  ConvolutionThunk(CudnnConvKind convolution_kind,
                    const BufferAllocation::Slice& input_buffer,
                    const BufferAllocation::Slice& filter_buffer,
                    const BufferAllocation::Slice& output_buffer,
+                   const BufferAllocation::Slice& tuple_result_buffer,
+                   const BufferAllocation::Slice& scratch_buffer,
                    const Shape& input_shape, const Shape& filter_shape,
                    const Shape& output_shape, const Window& window,
-                   const ConvolutionDimensionNumbers& dnums,
-                   const HloInstruction* hlo);
+                   const ConvolutionDimensionNumbers& dim_nums, int64 algorithm,
+                   bool tensor_ops_enabled, const HloInstruction* hlo);
 
   ConvolutionThunk(const ConvolutionThunk&) = delete;
   ConvolutionThunk& operator=(const ConvolutionThunk&) = delete;
 
-  // Does the convolution for the thunk on "stream". Auto-tuning happens on the
-  // first run of this function.
-  tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+  // Does the convolution for the thunk on "stream".
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         perftools::gputools::Stream* stream) override;
 
  private:
-  tensorflow::Status ConvolveWithTune(
-      const perftools::gputools::dnn::BatchDescriptor& input_descriptor,
-      perftools::gputools::DeviceMemory<float> input_data,
-      const perftools::gputools::dnn::FilterDescriptor& filter_descriptor,
-      perftools::gputools::DeviceMemory<float> filter_data,
-      const perftools::gputools::dnn::BatchDescriptor& output_descriptor,
-      perftools::gputools::DeviceMemory<float> output_data,
-      const perftools::gputools::dnn::ConvolutionDescriptor&
-          convolution_descriptor,
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream);
+  class ScratchAllocator;
 
-  tensorflow::Status Convolve(
+  Status Convolve(
       const perftools::gputools::dnn::BatchDescriptor& input_descriptor,
       perftools::gputools::DeviceMemory<float> input_data,
       const perftools::gputools::dnn::FilterDescriptor& filter_descriptor,
@@ -110,39 +81,27 @@ class ConvolutionThunk : public Thunk {
       const perftools::gputools::dnn::ConvolutionDescriptor&
           convolution_descriptor,
       const perftools::gputools::dnn::AlgorithmConfig& algorithm_config,
-      perftools::gputools::Stream* stream,
-      ConvolveScratchAllocator* scratch_allocator,
+      perftools::gputools::Stream* stream, ScratchAllocator* scratch_allocator,
       perftools::gputools::dnn::ProfileResult* profile_result);
 
-  // Returns the convolve algorithms that can be used for this ConvolutionThunk.
-  std::vector<perftools::gputools::dnn::AlgorithmDesc> GetAlgorithms(
-      bool with_winograd_nonfused,
-      perftools::gputools::StreamExecutor* stream_exec) const;
-
-  // Fastest cuDNN convolution algorithm for this thunk learned from
-  // auto-tuning. If auto-tuning is disabled or failed, best_algorithm_ is set
-  // to the default value indicating cuDNN's convolution will choose
-  // the best algorithm from some heuristics based on its parameters.
-  perftools::gputools::dnn::AlgorithmConfig best_algorithm_;
-
-  const ConvolutionKind convolution_kind_;
+  const CudnnConvKind convolution_kind_;
 
   const BufferAllocation::Slice input_buffer_;
   const BufferAllocation::Slice filter_buffer_;
   const BufferAllocation::Slice output_buffer_;
+  const BufferAllocation::Slice tuple_result_buffer_;
+  const BufferAllocation::Slice scratch_buffer_;
 
   const Shape input_shape_;
   const Shape filter_shape_;
   const Shape output_shape_;
 
   const Window window_;
-
   const ConvolutionDimensionNumbers dim_nums_;
+  int64 algorithm_;
+  bool tensor_ops_enabled_;
 };
 
-string ConvolutionKindToString(
-    ConvolutionThunk::ConvolutionKind convolution_kind);
-
 }  // namespace gpu
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..db6924c742e4a949a3e939b6d6659e92c2d1e312
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.cc
@@ -0,0 +1,219 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class Visitor : public DfsHloVisitorWithDefault {
+ public:
+  explicit Visitor(HloComputation* computation) : computation_(computation) {}
+
+  static bool Run(HloComputation* computation) {
+    Visitor visitor(computation);
+    TF_CHECK_OK(computation->Accept(&visitor));
+    return visitor.changed_;
+  }
+
+  Status DefaultAction(HloInstruction* /*hlo_instruction*/) override {
+    return Status::OK();
+  }
+
+  Status HandleBatchNormInference(HloInstruction* batch_norm) override;
+  Status HandleBatchNormTraining(HloInstruction* batch_norm) override;
+  Status HandleBatchNormGrad(HloInstruction* batch_norm) override;
+
+ private:
+  bool changed_ = false;
+  HloComputation* computation_;
+};
+
+// cudnn defines CUDNN_BN_MIN_EPSILON = 1e-5 as the minimum acceptable epsilon
+// for calls to its batchnorm ops.
+bool EpsilonInRange(HloInstruction* batch_norm) {
+  return batch_norm->epsilon() >= 1e-5;
+}
+
+Status Visitor::HandleBatchNormInference(HloInstruction* batch_norm) {
+  if (batch_norm->operand(0)->shape().element_type() != F32) {
+    VLOG(1) << "Not rewriting op with non-F32 element type: "
+            << batch_norm->ToString();
+    return Status::OK();
+  }
+
+  // cudnn errors out on zero-sized inputs.
+  if (ShapeUtil::ElementsIn(batch_norm->operand(0)->shape()) == 0) {
+    return Status::OK();
+  }
+
+  if (!EpsilonInRange(batch_norm)) {
+    return Status::OK();
+  }
+
+  HloInstruction* epsilon = computation_->AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0(batch_norm->epsilon())));
+  HloInstruction* feature_index =
+      computation_->AddInstruction(HloInstruction::CreateConstant(
+          Literal::CreateR0(batch_norm->feature_index())));
+
+  std::vector<HloInstruction*> operands(batch_norm->operands().begin(),
+                                        batch_norm->operands().end());
+  operands.push_back(epsilon);
+  operands.push_back(feature_index);
+
+  std::unique_ptr<HloInstruction> libcall = HloInstruction::CreateCustomCall(
+      batch_norm->shape(), operands, kCudnnBatchNormForwardInferenceCallTarget);
+  TF_RETURN_IF_ERROR(
+      computation_->ReplaceWithNewInstruction(batch_norm, std::move(libcall)));
+  changed_ = true;
+  return Status::OK();
+}
+
+Status Visitor::HandleBatchNormTraining(HloInstruction* batch_norm) {
+  if (batch_norm->operand(0)->shape().element_type() != F32) {
+    VLOG(1) << "Not rewriting op with non-F32 element type: "
+            << batch_norm->ToString();
+    return Status::OK();
+  }
+
+  // cudnn errors out on zero-sized inputs.
+  if (ShapeUtil::ElementsIn(batch_norm->operand(0)->shape()) == 0) {
+    return Status::OK();
+  }
+
+  if (!EpsilonInRange(batch_norm)) {
+    return Status::OK();
+  }
+
+  HloInstruction* epsilon = computation_->AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0(batch_norm->epsilon())));
+  HloInstruction* feature_index =
+      computation_->AddInstruction(HloInstruction::CreateConstant(
+          Literal::CreateR0(batch_norm->feature_index())));
+
+  std::vector<HloInstruction*> operands(batch_norm->operands().begin(),
+                                        batch_norm->operands().end());
+  operands.push_back(epsilon);
+  operands.push_back(feature_index);
+
+  HloInstruction* libcall =
+      computation_->AddInstruction(HloInstruction::CreateCustomCall(
+          batch_norm->shape(), operands,
+          kCudnnBatchNormForwardTrainingCallTarget));
+
+  // The cudnn libcall returns a tuple
+  //   {output, mean, rsqrt(variance + epsilon)},
+  // but the batchnorm HLO returns {output, mean, variance}.  Fix it up.
+  HloInstruction* inverse_stddev =
+      computation_->AddInstruction(HloInstruction::CreateGetTupleElement(
+          libcall->shape().tuple_shapes(2), libcall, 2));
+  HloInstruction* variance_plus_epsilon =
+      computation_->AddInstruction(HloInstruction::CreateBinary(
+          inverse_stddev->shape(), HloOpcode::kPower, inverse_stddev,
+          computation_->AddInstruction(
+              HloInstruction::CreateConstant(Literal::CreateR0<float>(-2)))));
+  HloInstruction* variance =
+      computation_->AddInstruction(HloInstruction::CreateBinary(
+          variance_plus_epsilon->shape(), HloOpcode::kSubtract,
+          variance_plus_epsilon, epsilon));
+
+  // Repackage the results.
+  std::unique_ptr<HloInstruction> new_tuple = HloInstruction::CreateTuple({
+      computation_->AddInstruction(HloInstruction::CreateGetTupleElement(
+          libcall->shape().tuple_shapes(0), libcall, 0)),
+      computation_->AddInstruction(HloInstruction::CreateGetTupleElement(
+          libcall->shape().tuple_shapes(1), libcall, 1)),
+      variance,
+  });
+
+  TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
+      batch_norm, std::move(new_tuple)));
+  changed_ = true;
+  return Status::OK();
+}
+
+Status Visitor::HandleBatchNormGrad(HloInstruction* batch_norm) {
+  if (batch_norm->operand(0)->shape().element_type() != F32) {
+    VLOG(1) << "Not rewriting op with non-F32 element type: "
+            << batch_norm->ToString();
+    return Status::OK();
+  }
+
+  // cudnn errors out on zero-sized inputs.
+  if (ShapeUtil::ElementsIn(batch_norm->operand(0)->shape()) == 0) {
+    return Status::OK();
+  }
+
+  if (!EpsilonInRange(batch_norm)) {
+    return Status::OK();
+  }
+
+  HloInstruction* epsilon = computation_->AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0(batch_norm->epsilon())));
+  HloInstruction* feature_index =
+      computation_->AddInstruction(HloInstruction::CreateConstant(
+          Literal::CreateR0(batch_norm->feature_index())));
+
+  // The cudnn libcall expects its input to be rsqrt(variance + epsilon), but
+  // the batchnorm HLO takes plain variance as input.  Fix it up.
+  HloInstruction* var_plus_epsilon =
+      computation_->AddInstruction(HloInstruction::CreateBinary(
+          batch_norm->operand(3)->shape(), HloOpcode::kAdd,
+          batch_norm->mutable_operand(3), epsilon));
+  HloInstruction* inverse_stddev =
+      computation_->AddInstruction(HloInstruction::CreateBinary(
+          var_plus_epsilon->shape(), HloOpcode::kPower, var_plus_epsilon,
+          computation_->AddInstruction(
+              HloInstruction::CreateConstant(Literal::CreateR0<float>(-.5)))));
+
+  std::vector<HloInstruction*> operands(batch_norm->operands().begin(),
+                                        batch_norm->operands().end());
+  operands[3] = inverse_stddev;
+  operands.push_back(epsilon);
+  operands.push_back(feature_index);
+
+  std::unique_ptr<HloInstruction> libcall = HloInstruction::CreateCustomCall(
+      batch_norm->shape(), operands, kCudnnBatchNormBackwardCallTarget);
+
+  TF_RETURN_IF_ERROR(
+      computation_->ReplaceWithNewInstruction(batch_norm, std::move(libcall)));
+  changed_ = true;
+  return Status::OK();
+}
+
+}  // anonymous namespace
+
+StatusOr<bool> CudnnBatchNormRewriter::Run(HloModule* module) {
+  VLOG(2) << "CudnnBatchNormRewriter::Run(), before:";
+  XLA_VLOG_LINES(2, module->ToString());
+
+  bool changed = false;
+  for (auto* comp : module->MakeNonfusionComputations()) {
+    if (Visitor::Run(comp)) {
+      changed = true;
+    }
+  }
+
+  VLOG(2) << "CudnnBatchNormRewriter::Run(), after:";
+  XLA_VLOG_LINES(2, module->ToString());
+  return changed;
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h
new file mode 100644
index 0000000000000000000000000000000000000000..e09cde9abf85454c7a020566cd8c2671ae12ffc3
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h
@@ -0,0 +1,66 @@
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_BATCHNORM_REWRITER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_BATCHNORM_REWRITER_H_
+
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+namespace gpu {
+
+// Rewrites BatchNorm HLOs into calls into cudnn where possible.
+//
+// A call into cudnn for performing a batchnorm op is represented as a
+// CustomCall HLO with custom_call_target equal to one of
+//
+//   - kCudnnBatchNormForwardInferenceCallTarget
+//   - kCudnnBatchNormForwardTrainingCallTarget, or
+//   - kCudnnBatchNormBackwardCallTarget.
+//
+// A CustomCall created by this pass has the same operands corresponding
+// batchnorm HLO, except the epsilon() and feature_index() properties of the
+// batchnorm HLO are converted into proper operands, added to the end of the
+// CustomCall's operands list.
+//
+// The inputs/outputs of the cudnn calls for BatchNormTraining and BatchNormGrad
+// do not correspond exactly to the HLOs.  In particular, the training cudnn
+// call returns 1/sqrt(variance + epsilon), while the HLO returns plain
+// variance.  Similarly, the grad cudnn call expects 1/sqrt(variance + epsilon)
+// as input, whereas the HLO expects plain variance.
+//
+// This pass adds HLOs in front of / behind the CustomCalls to fix up the
+// inputs/outputs as appropriate, and we rely on the AlgebraicSimplifier to
+// remove these where possible.
+//
+// Currently batchnorm ops over F32s are converted into cudnn calls, so long as
+// epsilon is not too small.  This pass leaves other batchnorm ops unmodified.
+//
+// The GPU backend does not implement a lowering for the batchnorm HLOs -- it
+// expects them to be lowered to cudnn calls via this pass or to HLO soup via
+// BatchNormRewriter.
+class CudnnBatchNormRewriter : public HloPassInterface {
+ public:
+  tensorflow::StringPiece name() const override {
+    return "cudnn_batchnorm_rewriter";
+  }
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_BATCHNORM_REWRITER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc
new file mode 100644
index 0000000000000000000000000000000000000000..58d9c8caff31e878487fbef01afce566e6187fd9
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc
@@ -0,0 +1,285 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h"
+
+#include <string>
+
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+namespace gpu {
+
+namespace se = ::perftools::gputools;
+namespace dnn = se::dnn;
+
+static std::pair<dnn::BatchDescriptor /*input_desc*/,
+                 dnn::BatchDescriptor /*scale_offset_desc*/>
+MakeDescriptors(const Shape& shape, int64 feature_index) {
+  std::vector<int64> logical_to_physical =
+      LayoutUtil::MakeLogicalToPhysical(shape.layout());
+
+  auto physical_dim_size = [&](int64 physical_dim) {
+    return shape.dimensions(LayoutUtil::Major(shape.layout(), physical_dim));
+  };
+
+  // Batchnorm only cares about the location of the depth (aka "feature") dim.
+  // The other dims are all treated the same.  Thus we can use the kBatchDepthYX
+  // cudnn layout for any XLA shape+layout, even XLA shapes that don't have
+  // exactly 4 dimensions: We put everything that comes before the feature dim
+  // into "batch", and everything that comes after the feature dim into "Y".
+  int64 batch_size = 1;
+  int64 y_size = 1;
+  int64 physical_dim;
+  for (physical_dim = 0; physical_dim != logical_to_physical[feature_index];
+       ++physical_dim) {
+    CHECK_LT(physical_dim, shape.dimensions_size());
+    batch_size *= physical_dim_size(physical_dim);
+  }
+  ++physical_dim;  // Skip the feature dimension.
+  for (; physical_dim < shape.dimensions_size(); ++physical_dim) {
+    y_size *= physical_dim_size(physical_dim);
+  }
+
+  dnn::BatchDescriptor input_desc;
+  input_desc.set_layout(dnn::DataLayout::kBatchDepthYX)
+      .set_count(batch_size)
+      .set_feature_map_count(shape.dimensions(feature_index))
+      .set_height(y_size)
+      .set_width(1);
+
+  dnn::BatchDescriptor scale_offset_desc;
+  scale_offset_desc.set_layout(dnn::DataLayout::kBatchDepthYX)
+      .set_feature_map_count(input_desc.feature_map_count())
+      .set_height(1)
+      .set_width(1)
+      .set_count(1);
+
+  return std::make_pair(input_desc, scale_offset_desc);
+}
+
+CudnnBatchNormForwardInferenceThunk::CudnnBatchNormForwardInferenceThunk(
+    const BufferAllocation::Slice& operand,
+    const BufferAllocation::Slice& scale, const BufferAllocation::Slice& offset,
+    const BufferAllocation::Slice& mean,
+    const BufferAllocation::Slice& variance, float epsilon, int64 feature_index,
+    const BufferAllocation::Slice& output, const HloInstruction* hlo)
+    : Thunk(Thunk::Kind::kCudnnBatchNormForwardInference, hlo),
+      operand_(operand),
+      scale_(scale),
+      offset_(offset),
+      mean_(mean),
+      variance_(variance),
+      epsilon_(epsilon),
+      feature_index_(feature_index),
+      output_(output) {
+  CHECK_EQ(hlo->opcode(), HloOpcode::kCustomCall);
+  CHECK_EQ(hlo->custom_call_target(),
+           kCudnnBatchNormForwardInferenceCallTarget);
+  CHECK(
+      LayoutUtil::LayoutsInShapesEqual(hlo->shape(), hlo->operand(0)->shape()));
+  CHECK_EQ(hlo->shape().element_type(), F32) << "Not yet implemented";
+}
+
+Status CudnnBatchNormForwardInferenceThunk::ExecuteOnStream(
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
+  dnn::BatchDescriptor operand_desc;
+  dnn::BatchDescriptor scale_offset_desc;
+  std::tie(operand_desc, scale_offset_desc) =
+      MakeDescriptors(hlo_instruction()->shape(), feature_index_);
+
+  se::DeviceMemory<float> output(buffer_allocations.GetDeviceAddress(output_));
+  stream->ThenBatchNormalizationForward(
+      se::DeviceMemory<float>(buffer_allocations.GetDeviceAddress(operand_)),
+      se::DeviceMemory<float>(buffer_allocations.GetDeviceAddress(scale_)),
+      se::DeviceMemory<float>(buffer_allocations.GetDeviceAddress(offset_)),
+      se::DeviceMemory<float>(buffer_allocations.GetDeviceAddress(mean_)),
+      se::DeviceMemory<float>(buffer_allocations.GetDeviceAddress(variance_)),
+      operand_desc,                //
+      scale_offset_desc,           //
+      epsilon_,                    //
+      &output,                     //
+      /*batch_mean=*/nullptr,      //
+      /*batch_var=*/nullptr,       //
+      /*saved_mean=*/nullptr,      //
+      /*saved_inv_var=*/nullptr,   //
+      /*is_training=*/false,       //
+      /*var_to_inv_var=*/nullptr,  //
+      /*inv_var_to_var=*/nullptr);
+  if (!stream->ok()) {
+    return InternalError("BatchNormalizationForward call failed.");
+  }
+  return Status::OK();
+}
+
+CudnnBatchNormForwardTrainingThunk::CudnnBatchNormForwardTrainingThunk(
+    const BufferAllocation::Slice& operand,
+    const BufferAllocation::Slice& scale, const BufferAllocation::Slice& offset,
+    float epsilon, int64 feature_index,
+    const BufferAllocation::Slice& output_data,
+    const BufferAllocation::Slice& output_mean,
+    const BufferAllocation::Slice& output_inv_stddev,
+    const BufferAllocation::Slice& output_tuple, const HloInstruction* hlo)
+    : Thunk(Thunk::Kind::kCudnnBatchNormForwardTraining, hlo),
+      operand_(operand),
+      scale_(scale),
+      offset_(offset),
+      epsilon_(epsilon),
+      feature_index_(feature_index),
+      output_data_(output_data),
+      output_mean_(output_mean),
+      output_inv_stddev_(output_inv_stddev),
+      output_tuple_(output_tuple) {
+  CHECK_EQ(hlo->opcode(), HloOpcode::kCustomCall);
+  CHECK_EQ(hlo->custom_call_target(), kCudnnBatchNormForwardTrainingCallTarget);
+  CHECK_EQ(hlo->shape().tuple_shapes_size(), 3);
+  CHECK(LayoutUtil::LayoutsInShapesEqual(hlo->shape().tuple_shapes(0),
+                                         hlo->operand(0)->shape()));
+  for (const auto& tuple_shape : hlo->shape().tuple_shapes()) {
+    CHECK_EQ(tuple_shape.element_type(), F32) << "Not yet implemented";
+  }
+}
+
+Status CudnnBatchNormForwardTrainingThunk::ExecuteOnStream(
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
+  dnn::BatchDescriptor operand_desc;
+  dnn::BatchDescriptor scale_offset_desc;
+  // The BatchNormTraining HLO outputs a tuple of three elements: output data,
+  // batch mean, and batch variance.  We want to make our descriptors based on
+  // the shape of the output data.
+  std::tie(operand_desc, scale_offset_desc) = MakeDescriptors(
+      hlo_instruction()->shape().tuple_shapes(0), feature_index_);
+
+  se::DeviceMemory<float> output_data(
+      buffer_allocations.GetDeviceAddress(output_data_));
+  se::DeviceMemory<float> output_mean(
+      buffer_allocations.GetDeviceAddress(output_mean_));
+  se::DeviceMemory<float> output_inv_stddev(
+      buffer_allocations.GetDeviceAddress(output_inv_stddev_));
+
+  se::DeviceMemory<float> null_device_ptr(nullptr);
+  stream->ThenBatchNormalizationForward(
+      se::DeviceMemory<float>(buffer_allocations.GetDeviceAddress(operand_)),
+      se::DeviceMemory<float>(buffer_allocations.GetDeviceAddress(scale_)),
+      se::DeviceMemory<float>(buffer_allocations.GetDeviceAddress(offset_)),
+      /*estimated_mean=*/null_device_ptr,
+      /*estimated_variance=*/null_device_ptr,
+      operand_desc,                          //
+      scale_offset_desc,                     //
+      epsilon_,                              //
+      &output_data,                          //
+      /*batch_mean=*/&null_device_ptr,       //
+      /*batch_var=*/&null_device_ptr,        //
+      /*saved_mean=*/&output_mean,           //
+      /*saved_inv_var=*/&output_inv_stddev,  //
+      /*is_training=*/true,                  //
+      /*var_to_inv_var=*/nullptr,            //
+      /*inv_var_to_var=*/nullptr);
+
+  // Write the tuple.
+  void* ptrs[] = {output_data.opaque(), output_mean.opaque(),
+                  output_inv_stddev.opaque()};
+  se::DeviceMemory<void*> tuple_addr(
+      buffer_allocations.GetDeviceAddress(output_tuple_));
+  stream->ThenMemcpyH2D<void*>(ptrs, &tuple_addr);
+
+  if (!stream->ok()) {
+    return InternalError("BatchNormalizationTraining call failed.");
+  }
+  return Status::OK();
+}
+
+CudnnBatchNormBackwardThunk::CudnnBatchNormBackwardThunk(
+    const BufferAllocation::Slice& operand,
+    const BufferAllocation::Slice& scale, const BufferAllocation::Slice& mean,
+    const BufferAllocation::Slice& inv_stddev,
+    const BufferAllocation::Slice& grad_output, float epsilon,
+    int64 feature_index, const BufferAllocation::Slice& output_grad_data,
+    const BufferAllocation::Slice& output_grad_scale,
+    const BufferAllocation::Slice& output_grad_offset,
+    const BufferAllocation::Slice& output_tuple, const HloInstruction* hlo)
+    : Thunk(Thunk::Kind::kCudnnBatchNormBackward, hlo),
+      operand_(operand),
+      scale_(scale),
+      mean_(mean),
+      inv_stddev_(inv_stddev),
+      grad_output_(grad_output),
+      epsilon_(epsilon),
+      feature_index_(feature_index),
+      output_grad_data_(output_grad_data),
+      output_grad_scale_(output_grad_scale),
+      output_grad_offset_(output_grad_offset),
+      output_tuple_(output_tuple) {
+  CHECK_EQ(hlo->opcode(), HloOpcode::kCustomCall);
+  CHECK_EQ(hlo->custom_call_target(), kCudnnBatchNormBackwardCallTarget);
+  CHECK_EQ(hlo->shape().tuple_shapes_size(), 3);
+  CHECK(LayoutUtil::LayoutsInShapesEqual(hlo->shape().tuple_shapes(0),
+                                         hlo->operand(0)->shape()));
+  CHECK(LayoutUtil::LayoutsInShapesEqual(hlo->shape().tuple_shapes(0),
+                                         hlo->operand(4)->shape()));
+  for (const auto& tuple_shape : hlo->shape().tuple_shapes()) {
+    CHECK_EQ(tuple_shape.element_type(), F32) << "Not yet implemented";
+  }
+}
+
+Status CudnnBatchNormBackwardThunk::ExecuteOnStream(
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
+  dnn::BatchDescriptor operand_desc;
+  dnn::BatchDescriptor scale_offset_desc;
+
+  // This call outputs a tuple of three elements: grad data, grad offset, and
+  // grad scale.  We want to make our descriptors based on the shape of the grad
+  // data.
+  std::tie(operand_desc, scale_offset_desc) = MakeDescriptors(
+      hlo_instruction()->shape().tuple_shapes(0), feature_index_);
+
+  se::DeviceMemory<float> output_grad_data(
+      buffer_allocations.GetDeviceAddress(output_grad_data_));
+  se::DeviceMemory<float> output_grad_scale(
+      buffer_allocations.GetDeviceAddress(output_grad_scale_));
+  se::DeviceMemory<float> output_grad_offset(
+      buffer_allocations.GetDeviceAddress(output_grad_offset_));
+
+  stream->ThenBatchNormalizationBackward(
+      se::DeviceMemory<float>(
+          buffer_allocations.GetDeviceAddress(grad_output_)),
+      se::DeviceMemory<float>(buffer_allocations.GetDeviceAddress(operand_)),
+      se::DeviceMemory<float>(buffer_allocations.GetDeviceAddress(scale_)),
+      se::DeviceMemory<float>(buffer_allocations.GetDeviceAddress(mean_)),
+      se::DeviceMemory<float>(buffer_allocations.GetDeviceAddress(inv_stddev_)),
+      operand_desc, scale_offset_desc, epsilon_, &output_grad_data,
+      &output_grad_scale, &output_grad_offset);
+
+  // Write the output tuple.
+  void* ptrs[] = {output_grad_data.opaque(), output_grad_scale.opaque(),
+                  output_grad_offset.opaque()};
+  se::DeviceMemory<void*> tuple_addr(
+      buffer_allocations.GetDeviceAddress(output_tuple_));
+  stream->ThenMemcpyH2D<void*>(ptrs, &tuple_addr);
+
+  if (!stream->ok()) {
+    return InternalError("BatchNormalizationBackward call failed.");
+  }
+  return Status::OK();
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h
new file mode 100644
index 0000000000000000000000000000000000000000..c5fbb6d8a3912d380172d496d8d35e80dc9f5c71
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h
@@ -0,0 +1,145 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_BATCHNORM_THUNK_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_BATCHNORM_THUNK_H_
+
+#include "tensorflow/compiler/xla/service/buffer_assignment.h"
+#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
+#include "tensorflow/compiler/xla/service/gpu/thunk.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace xla {
+namespace gpu {
+
+// This file contains thunks which call into cudnn to run the various flavors of
+// batch normalization: BatchNormInference, BatchNormTraining, and
+// BatchNormGrad, known to cudnn as BatchNormForwardInference,
+// BatchNormForwardTraining, and BatchNormBackward.
+//
+// As an alternative to using these thunks, XLA can decompose batchnorm HLOs
+// into smaller components using the BatchNormRewriter pass.  This can result in
+// faster code because those individual components can fuse into their
+// inputs/outputs, but it may also be slower if cudnn's batchnorm implementation
+// outperforms the code XLA generates for these components.
+//
+// Currently these thunks require that their inputs are F32s.
+//
+// Note that these thunks do not take full advantage of the cudnn batchnorm
+// functions.  For example, cudnn lets you bias and/or scale the input/output,
+// but these thunks don't currently support that.
+
+class CudnnBatchNormForwardInferenceThunk : public Thunk {
+ public:
+  CudnnBatchNormForwardInferenceThunk(const BufferAllocation::Slice& operand,
+                                      const BufferAllocation::Slice& scale,
+                                      const BufferAllocation::Slice& offset,
+                                      const BufferAllocation::Slice& mean,
+                                      const BufferAllocation::Slice& variance,
+                                      float epsilon, int64 feature_index,
+                                      const BufferAllocation::Slice& output,
+                                      const HloInstruction* hlo);
+
+  CudnnBatchNormForwardInferenceThunk(
+      const CudnnBatchNormForwardInferenceThunk&) = delete;
+  CudnnBatchNormForwardInferenceThunk& operator=(
+      const CudnnBatchNormForwardInferenceThunk&) = delete;
+
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         perftools::gputools::Stream* stream) override;
+
+ private:
+  BufferAllocation::Slice operand_;
+  BufferAllocation::Slice scale_;
+  BufferAllocation::Slice offset_;
+  BufferAllocation::Slice mean_;
+  BufferAllocation::Slice variance_;
+  float epsilon_;
+  int64 feature_index_;
+  BufferAllocation::Slice output_;
+};
+
+class CudnnBatchNormForwardTrainingThunk : public Thunk {
+ public:
+  CudnnBatchNormForwardTrainingThunk(
+      const BufferAllocation::Slice& operand,
+      const BufferAllocation::Slice& scale,
+      const BufferAllocation::Slice& offset, float epsilon, int64 feature_index,
+      const BufferAllocation::Slice& output_data,
+      const BufferAllocation::Slice& output_mean,
+      const BufferAllocation::Slice& output_inv_stddev,
+      const BufferAllocation::Slice& output_tuple, const HloInstruction* hlo);
+
+  CudnnBatchNormForwardTrainingThunk(
+      const CudnnBatchNormForwardTrainingThunk&) = delete;
+  CudnnBatchNormForwardTrainingThunk& operator=(
+      const CudnnBatchNormForwardTrainingThunk&) = delete;
+
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         perftools::gputools::Stream* stream) override;
+
+ private:
+  BufferAllocation::Slice operand_;
+  BufferAllocation::Slice scale_;
+  BufferAllocation::Slice offset_;
+  float epsilon_;
+  int64 feature_index_;
+  BufferAllocation::Slice output_data_;
+  BufferAllocation::Slice output_mean_;
+  BufferAllocation::Slice output_inv_stddev_;
+  BufferAllocation::Slice output_tuple_;
+};
+
+class CudnnBatchNormBackwardThunk : public Thunk {
+ public:
+  CudnnBatchNormBackwardThunk(const BufferAllocation::Slice& operand,
+                              const BufferAllocation::Slice& scale,
+                              const BufferAllocation::Slice& mean,
+                              const BufferAllocation::Slice& inv_stddev,
+                              const BufferAllocation::Slice& grad_output,
+                              float epsilon, int64 feature_index,
+                              const BufferAllocation::Slice& output_grad_data,
+                              const BufferAllocation::Slice& output_grad_scale,
+                              const BufferAllocation::Slice& output_grad_offset,
+                              const BufferAllocation::Slice& output_tuple,
+                              const HloInstruction* hlo);
+
+  CudnnBatchNormBackwardThunk(const CudnnBatchNormBackwardThunk&) = delete;
+  CudnnBatchNormBackwardThunk& operator=(const CudnnBatchNormBackwardThunk&) =
+      delete;
+
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         perftools::gputools::Stream* stream) override;
+
+ private:
+  BufferAllocation::Slice operand_;
+  BufferAllocation::Slice scale_;
+  BufferAllocation::Slice mean_;
+  BufferAllocation::Slice inv_stddev_;
+  BufferAllocation::Slice grad_output_;
+  float epsilon_;
+  int64 feature_index_;
+  BufferAllocation::Slice output_grad_data_;
+  BufferAllocation::Slice output_grad_scale_;
+  BufferAllocation::Slice output_grad_offset_;
+  BufferAllocation::Slice output_tuple_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_BATCHNORM_THUNK_H_
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c29aa31d4ee31c88ec6d315480d4258b190bbcff
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
@@ -0,0 +1,379 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h"
+#include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/core/lib/gtl/optional.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+namespace se = perftools::gputools;
+
+using se::DeviceMemoryBase;
+using se::dnn::AlgorithmConfig;
+using se::dnn::AlgorithmDesc;
+using tensorflow::gtl::nullopt;
+using tensorflow::gtl::optional;
+
+class ScratchAllocator : public se::ScratchAllocator {
+ public:
+  ScratchAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator)
+      : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}
+
+  ~ScratchAllocator() override;
+
+  int64 GetMemoryLimitInBytes(se::Stream* stream) override {
+    return 1LL << 32;  // 4GB.  TODO(jlebar): Tune this?
+  }
+  int64 TotalAllocatedBytes() { return total_allocated_bytes_; }
+
+  se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
+      se::Stream* stream, int64 byte_size) override;
+
+ private:
+  const int device_ordinal_;
+  DeviceMemoryAllocator* memory_allocator_;
+  std::vector<se::DeviceMemoryBase> allocated_buffers_;
+  int64 total_allocated_bytes_ = 0;
+};
+
+ScratchAllocator::~ScratchAllocator() {
+  for (auto& allocated_buffer : allocated_buffers_) {
+    if (!memory_allocator_->Deallocate(device_ordinal_, &allocated_buffer)
+             .ok()) {
+      // The program can still continue with failed deallocation.
+      LOG(ERROR) << "Failed to deallocate the allocated buffer: "
+                 << allocated_buffer.opaque();
+    }
+  }
+}
+
+se::port::StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes(
+    se::Stream* stream, int64 byte_size) {
+  CHECK_GE(byte_size, 0) << "byte_size must be positive.";
+  if (byte_size > GetMemoryLimitInBytes(stream)) {
+    return se::port::Status(
+        se::port::error::RESOURCE_EXHAUSTED,
+        tensorflow::strings::Printf(
+            "Allocating %lld bytes exceeds the memory limit of %lld bytes.",
+            byte_size, GetMemoryLimitInBytes(stream)));
+  }
+
+  auto status_or_memory =
+      memory_allocator_->Allocate(device_ordinal_, byte_size,
+                                  /*retry_on_failure=*/false);
+  if (!status_or_memory.ok()) {
+    return se::port::Status(se::port::error::RESOURCE_EXHAUSTED,
+                            tensorflow::strings::Printf(
+                                "Failed to allocate %lld bytes on device %d.",
+                                byte_size, device_ordinal_));
+  }
+  se::DeviceMemoryBase allocated_buffer = status_or_memory.ValueOrDie();
+  allocated_buffers_.push_back(allocated_buffer);
+  total_allocated_bytes_ += byte_size;
+  return se::DeviceMemory<uint8>(allocated_buffer);
+}
+
+// Determines whether we can safely perform a winograd non-fused convolution for
+// the given input and output shapes.  This works around b/68264959, an integer
+// overflow in cuDNNv5 and cuDNNv6.
+//
+// TODO(jlebar): We shouldn't need this check for cuDNNv7.
+bool ShouldIncludeWinogradNonfusedAlgo(
+    const Shape& input_shape, const Shape& output_shape,
+    const ConvolutionDimensionNumbers& dnums) {
+  int64 batch = input_shape.dimensions(dnums.input_batch_dimension());
+  int64 in_depths = input_shape.dimensions(dnums.input_feature_dimension());
+  int64 in_rows = input_shape.dimensions(dnums.input_spatial_dimensions(0));
+  int64 in_cols =
+      dnums.input_spatial_dimensions_size() == 1
+          ? 1
+          : input_shape.dimensions(dnums.input_spatial_dimensions(1));
+  int64 out_depths = output_shape.dimensions(dnums.output_feature_dimension());
+
+  int64 total_size = CeilOfRatio(batch, int64{16}) *
+                     std::max(in_depths, out_depths) * in_cols * in_rows *
+                     sizeof(float);
+
+  const int64 threshold = 1L << 31;
+  return total_size < threshold;
+}
+
+std::vector<AlgorithmDesc> GetAlgorithms(CudnnConvKind kind,
+                                         bool with_winograd_nonfused,
+                                         se::StreamExecutor* stream_exec_) {
+  std::vector<AlgorithmDesc> algorithms;
+  switch (kind) {
+    case CudnnConvKind::kBackwardFilter:
+      CHECK(stream_exec_->GetConvolveBackwardFilterAlgorithms(
+          with_winograd_nonfused, &algorithms));
+      break;
+    case CudnnConvKind::kBackwardInput:
+      CHECK(stream_exec_->GetConvolveBackwardDataAlgorithms(
+          with_winograd_nonfused, &algorithms));
+      break;
+    case CudnnConvKind::kForward:
+      CHECK(stream_exec_->GetConvolveAlgorithms(with_winograd_nonfused,
+                                                &algorithms));
+      break;
+  }
+
+  // Remove any algorithms with tensor math enabled.  These have lower precision
+  // than regular algorithms, and we don't yet have a way to turn this on/off in
+  // XLA.
+  algorithms.erase(std::remove_if(algorithms.begin(), algorithms.end(),
+                                  [&](const AlgorithmDesc& a) {
+                                    return a.tensor_ops_enabled();
+                                  }),
+                   algorithms.end());
+
+  return algorithms;
+}
+
+string AlgorithmToString(const AlgorithmDesc& algo) {
+  if (algo.tensor_ops_enabled()) {
+    return tensorflow::strings::StrCat(algo.algo_id(), "+TC");
+  }
+  return tensorflow::strings::StrCat(algo.algo_id());
+}
+
+string NumBytesToString(int64 bytes) {
+  return tensorflow::strings::StrCat(
+      tensorflow::strings::HumanReadableNumBytes(bytes), " (", bytes, "B)");
+}
+
+}  // anonymous namespace
+
+// We could have caching here so that we don't redo this work for two identical
+// convolutions.  Unfortunately our cache key would have to be a tuple
+// containing the protos passed to this function, and we have no utility for
+// hashing protos.  We could write our own hash functions, but they'd silently
+// break if we ever added a field to one of the protos.  Perhaps we could hack
+// using the binary-encoded proto as the hash key, on the assumption that two
+// protos being binary-equal is a sufficient, if not necessary, condition for
+// proper equality.  But that would still leave us open to having unnecessary
+// cache misses and doing extra work.  Overall, caching doesn't seem worth the
+// trouble, but we may want to revisit this if we ever find a model where
+// caching would speed up compilation a lot.
+optional<std::tuple<int64, bool, int64>>
+CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
+    CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
+    const Shape& output_shape, const Window& window,
+    const ConvolutionDimensionNumbers& dnums, HloInstruction* instr) {
+  // Create a stream for us to do our work on.
+  se::Stream stream{stream_exec_};
+  stream.Init();
+  const auto device_ordinal = stream_exec_->device_ordinal();
+
+  // allocator either points to this->allocator_ or, if that's null, to a
+  // StreamExecutorMemoryAllocator for stream_exec_.
+  DeviceMemoryAllocator* allocator;
+  optional<StreamExecutorMemoryAllocator> se_allocator;
+  if (allocator_ != nullptr) {
+    allocator = allocator_;
+  } else {
+    se_allocator.emplace(
+        stream_exec_->platform(),
+        tensorflow::gtl::ArraySlice<se::StreamExecutor*>({stream_exec_}));
+    allocator = &*se_allocator;
+  }
+
+  // Allocate space for the input, filter, and output of the convolution.  We
+  // use a ScratchAllocator for this instead of calling allocator_ directly so
+  // that our allocations don't leak.
+  //
+  // We don't put any data in these buffers, because (in theory, anyway) the
+  // speed of a conv isn't affected by the data being convolved.
+  ScratchAllocator input_output_allocator(device_ordinal, allocator);
+  se::port::StatusOr<DeviceMemoryBase> input_buf =
+      input_output_allocator.AllocateBytes(&stream,
+                                           ShapeUtil::ByteSizeOf(input_shape));
+  se::port::StatusOr<DeviceMemoryBase> filter_buf =
+      input_output_allocator.AllocateBytes(&stream,
+                                           ShapeUtil::ByteSizeOf(filter_shape));
+  se::port::StatusOr<DeviceMemoryBase> output_buf =
+      input_output_allocator.AllocateBytes(&stream,
+                                           ShapeUtil::ByteSizeOf(output_shape));
+  if (!input_buf.ok() || !filter_buf.ok() || !output_buf.ok()) {
+    LOG(WARNING)
+        << "Couldn't allocate space for input/filter/output of convolution "
+        << instr->ToString() << ".  Falling back to default algorithm.";
+    return nullopt;
+  }
+
+  const bool use_winograd_nonfused =
+      ShouldIncludeWinogradNonfusedAlgo(input_shape, output_shape, dnums);
+  se::dnn::ProfileResult best_result;
+  int64 best_result_bytes_used = 0;
+  for (const AlgorithmDesc& alg :
+       GetAlgorithms(kind, use_winograd_nonfused, stream_exec_)) {
+    ScratchAllocator scratch_allocator(device_ordinal, allocator);
+    se::dnn::ProfileResult profile_result;
+    VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for "
+            << instr->ToString();
+
+    bool launch_ok =
+        RunCudnnConvolution(kind, input_shape, filter_shape, output_shape,
+                            se::DeviceMemory<float>(input_buf.ValueOrDie()),
+                            se::DeviceMemory<float>(filter_buf.ValueOrDie()),
+                            se::DeviceMemory<float>(output_buf.ValueOrDie()),
+                            &scratch_allocator, window, dnums,
+                            AlgorithmConfig(alg), &stream, &profile_result)
+            .ok();
+
+    if (launch_ok && profile_result.is_valid()) {
+      int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
+      VLOG(3) << "Run of algorithm " << AlgorithmToString(alg)
+              << " succeeded, taking " << profile_result.elapsed_time_in_ms()
+              << "ms and using " << NumBytesToString(scratch_bytes_used)
+              << " of scratch (Best result: "
+              << best_result.elapsed_time_in_ms() << "ms, "
+              << NumBytesToString(best_result_bytes_used) << " of scratch)";
+      if (profile_result.elapsed_time_in_ms() <
+          best_result.elapsed_time_in_ms()) {
+        best_result = profile_result;
+        best_result_bytes_used = scratch_bytes_used;
+      }
+    } else {
+      VLOG(3) << "Run of algorithm " << AlgorithmToString(alg) << " failed.";
+    }
+  }
+  if (best_result.is_valid()) {
+    VLOG(2) << "Best algorithm for " << instr->ToString() << ": "
+            << AlgorithmToString(best_result.algorithm()) << ", takes "
+            << best_result.elapsed_time_in_ms() << "ms, and uses "
+            << best_result_bytes_used << "B of scratch memory.";
+    return std::make_tuple(best_result.algorithm().algo_id(),
+                           best_result.algorithm().tensor_ops_enabled(),
+                           best_result_bytes_used);
+  }
+
+  LOG(WARNING) << "All algorithms tried for convolution " << instr->ToString()
+               << " failed.  Falling back to default algorithm.";
+  return nullopt;
+}
+
+StatusOr<bool> CudnnConvolutionAlgorithmPicker::RunOnInstruction(
+    HloInstruction* instr) {
+  CHECK(IsCustomCallToDnnConvolution(*instr));
+
+  const auto& call_target = instr->custom_call_target();
+  const auto& lhs_shape = instr->operand(0)->shape();
+  const auto& rhs_shape = instr->operand(1)->shape();
+  const auto& conv_result_shape = instr->shape().tuple_shapes(0);
+  optional<std::tuple<int64, bool, int64>> alg_scratch_and_tc;
+  if (call_target == kCudnnConvForwardCallTarget) {
+    alg_scratch_and_tc = PickBestAlgorithm(
+        CudnnConvKind::kForward, /*input_shape=*/lhs_shape,
+        /*filter_shape=*/rhs_shape, /*output_shape=*/conv_result_shape,
+        instr->window(), instr->convolution_dimension_numbers(), instr);
+  } else if (call_target == kCudnnConvBackwardInputCallTarget) {
+    alg_scratch_and_tc = PickBestAlgorithm(
+        CudnnConvKind::kBackwardInput, /*input_shape=*/conv_result_shape,
+        /*filter_shape=*/rhs_shape, /*output_shape=*/lhs_shape, instr->window(),
+        instr->convolution_dimension_numbers(), instr);
+  } else if (call_target == kCudnnConvBackwardFilterCallTarget) {
+    alg_scratch_and_tc = PickBestAlgorithm(
+        CudnnConvKind::kBackwardFilter, /*input_shape=*/lhs_shape,
+        /*filter_shape=*/conv_result_shape, /*output_shape=*/rhs_shape,
+        instr->window(), instr->convolution_dimension_numbers(), instr);
+  } else {
+    LOG(FATAL) << "Unknown custom call target for cudnn conv: "
+               << instr->ToString();
+  }
+
+  if (!alg_scratch_and_tc.has_value()) {
+    return false;
+  }
+
+  int64 algorithm;
+  bool tensor_ops_enabled;
+  int64 scratch_bytes;
+
+  std::tie(algorithm, tensor_ops_enabled, scratch_bytes) = *alg_scratch_and_tc;
+
+  VLOG(1) << "Setting cudnn conv to use algorithm " << algorithm << " and "
+          << NumBytesToString(scratch_bytes)
+          << " of scratch memory: " << instr->ToString()
+          << " tensor_ops_enabled: " << tensor_ops_enabled;
+
+  // Replace instr with a new CustomCall which has the correct algorithm, and
+  // whose output shape has the appropriate amount of scratch memory.
+  HloComputation* computation = instr->parent();
+  Shape new_call_shape =
+      ShapeUtil::MakeTupleShape({instr->shape().tuple_shapes(0),
+                                 ShapeUtil::MakeShape(U8, {scratch_bytes})});
+  HloInstruction* algorithm_hlo = computation->AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<int64>(algorithm)));
+  HloInstruction* tensor_ops_enabled_hlo =
+      computation->AddInstruction(HloInstruction::CreateConstant(
+          Literal::CreateR0<bool>(tensor_ops_enabled)));
+
+  HloInstruction* new_call =
+      computation->AddInstruction(HloInstruction::CreateCustomCall(
+          new_call_shape,
+          {instr->mutable_operand(0), instr->mutable_operand(1), algorithm_hlo,
+           tensor_ops_enabled_hlo},
+          instr->custom_call_target()));
+  new_call->set_window(instr->window());
+  new_call->set_convolution_dimension_numbers(
+      instr->convolution_dimension_numbers());
+
+  // Repackage new_call so it has the same shape as the original call, namely
+  // (conv_result, u8[0]).
+  HloInstruction* new_tuple =
+      computation->AddInstruction(HloInstruction::CreateTuple(
+          {computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+               new_call_shape.tuple_shapes(0), new_call, 0)),
+           computation->AddInstruction(
+               HloInstruction::CreateConstant(Literal::CreateR1<uint8>({})))}));
+
+  TF_RETURN_IF_ERROR(instr->parent()->ReplaceInstruction(instr, new_tuple));
+  return true;
+}
+
+StatusOr<bool> CudnnConvolutionAlgorithmPicker::RunOnComputation(
+    HloComputation* computation) {
+  std::vector<HloInstruction*> convs;
+  for (auto* instr : computation->instructions()) {
+    if (IsCustomCallToDnnConvolution(*instr)) {
+      convs.push_back(instr);
+    }
+  }
+
+  bool changed = false;
+  for (auto* instr : convs) {
+    TF_ASSIGN_OR_RETURN(bool result, RunOnInstruction(instr));
+    changed |= result;
+  }
+  return changed;
+}
+
+StatusOr<bool> CudnnConvolutionAlgorithmPicker::Run(HloModule* module) {
+  bool changed = false;
+  for (HloComputation* computation : module->MakeNonfusionComputations()) {
+    TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation));
+    changed |= result;
+  }
+  return changed;
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
new file mode 100644
index 0000000000000000000000000000000000000000..516210ec2e500cf03774d27408300ac3346e7b4f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
@@ -0,0 +1,62 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_
+
+#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/core/lib/gtl/optional.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+namespace gpu {
+
+// Modifies CustomCalls to cudnn convolutions, choosing the best algorithm for
+// each and adding explicit scratch space to the CustomCalls.
+class CudnnConvolutionAlgorithmPicker : public HloPassInterface {
+ public:
+  // If the `allocator` parameter is not null, we will use it to allocate temp
+  // memory while timing the various convolution algorithms.  If it's null,
+  // we'll use the default allocator on the StreamExecutor.
+  CudnnConvolutionAlgorithmPicker(
+      perftools::gputools::StreamExecutor* stream_exec,
+      DeviceMemoryAllocator* allocator)
+      : stream_exec_(stream_exec), allocator_(allocator) {}
+
+  tensorflow::StringPiece name() const override {
+    return "cudnn-convolution-algorithm-picker";
+  }
+
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  StatusOr<bool> RunOnComputation(HloComputation* computation);
+  StatusOr<bool> RunOnInstruction(HloInstruction* instr);
+  tensorflow::gtl::optional<std::tuple<int64, bool, int64>> PickBestAlgorithm(
+      CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
+      const Shape& output_shape, const Window& window,
+      const ConvolutionDimensionNumbers& dnums, HloInstruction* instr);
+
+  perftools::gputools::StreamExecutor* stream_exec_;  // never null
+  DeviceMemoryAllocator* allocator_;                  // may be null
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_folding.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc
similarity index 77%
rename from tensorflow/compiler/xla/service/gpu/convolution_folding.cc
rename to tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc
index 828ae675d7ba60b4cee1c3f5312b069263d5a814..e0c73aa73acb7f3313eb54fb07390cb76590433e 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_folding.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/gpu/convolution_folding.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h"
 
 #include <numeric>
 #include <vector>
@@ -33,14 +33,32 @@ namespace xla {
 namespace gpu {
 
 namespace {
+
+bool CanImplementAsCudnnForwardConv(HloInstruction* conv) {
+  const ConvolutionDimensionNumbers& dnums =
+      conv->convolution_dimension_numbers();
+  if (dnums.input_spatial_dimensions_size() > 3) {
+    return false;
+  }
+
+  // CuDNN does not accept zero-element arguments
+  if (ShapeUtil::HasZeroElements(conv->operand(0)->shape()) ||
+      ShapeUtil::HasZeroElements(conv->operand(1)->shape())) {
+    return false;
+  }
+
+  if (window_util::HasWindowReversal(conv->window())) {
+    return false;
+  }
+  return true;
+}
+
 // Try to match a backward filter pattern that contains "conv".
 // Precondition: "conv" is a kConvolution.
-std::tuple<bool, std::vector<HloInstruction*>, Window,
-           ConvolutionDimensionNumbers>
-MatchBackwardFilter(HloInstruction* conv) {
+std::tuple<bool, Window, ConvolutionDimensionNumbers> MatchBackwardFilter(
+    HloInstruction* conv) {
   const auto no_match_result =
-      std::make_tuple(false, std::vector<HloInstruction*>(), Window(),
-                      ConvolutionDimensionNumbers());
+      std::make_tuple(false, Window(), ConvolutionDimensionNumbers());
   // Step 1: match the instruction pattern without considering the paddings and
   // dimension numbers just yet. We may need some generic pattern matcher
   // similar to third_party/llvm/llvm/include/llvm/IR/PatternMatch.h
@@ -55,19 +73,7 @@ MatchBackwardFilter(HloInstruction* conv) {
   //               v       v
   //              Convolution
   //                 conv
-  //                   |
-  //                   v
-  //               Transpose (optional if identity transposition)
   CHECK_EQ(HloOpcode::kConvolution, conv->opcode());
-  // If the forward convolution is followed by a transpose, we can fuse the
-  // transpose into the backward convolution as well.
-  HloInstruction* transpose = nullptr;
-  if (conv->user_count() == 1) {
-    HloInstruction* single_user = *conv->users().begin();
-    if (single_user->opcode() == HloOpcode::kTranspose) {
-      transpose = single_user;
-    }
-  }
 
   // Step 2: match paddings and dimension numbers of the forward convolution.
   const ConvolutionDimensionNumbers& conv_dnums =
@@ -75,6 +81,9 @@ MatchBackwardFilter(HloInstruction* conv) {
   auto input_batch_dim = conv_dnums.input_batch_dimension();
   auto input_feature_dim = conv_dnums.input_feature_dimension();
   auto input_spatial_dims = conv_dnums.input_spatial_dimensions();
+  auto kernel_input_feature_dim = conv_dnums.kernel_input_feature_dimension();
+  auto kernel_output_feature_dim = conv_dnums.kernel_output_feature_dimension();
+  auto kernel_spatial_dims = conv_dnums.kernel_spatial_dimensions();
   auto output_batch_dim = conv_dnums.output_batch_dimension();
   auto output_feature_dim = conv_dnums.output_feature_dimension();
   auto output_spatial_dims = conv_dnums.output_spatial_dimensions();
@@ -96,9 +105,14 @@ MatchBackwardFilter(HloInstruction* conv) {
       VLOG(1) << "Padding low should be non-negative.";
       return no_match_result;
     }
+    if (window_dim.window_reversal()) {
+      VLOG(1) << "Window reversal field not supported";
+      return no_match_result;
+    }
     // Padding high will be checked in Step 3.
   }
-  if (transpose == nullptr && !window_util::HasWindowDilation(conv->window())) {
+  if (input_batch_dim == output_batch_dim &&
+      !window_util::HasWindowDilation(conv->window())) {
     VLOG(1) << conv->ToString()
             << " is a regular forward convolution. No need "
                "to fold it to a backward filter convolution.";
@@ -169,64 +183,40 @@ MatchBackwardFilter(HloInstruction* conv) {
     }
   }
 
-  // To make future HLO passes easier, we canonicalize the fused expression by
-  // adding an identity transposition if it's omitted in the pattern.
-  if (transpose == nullptr) {
-    // Create an identity transposition with the same rank as the forward
-    // convolution.
-    HloComputation* parent_computation = conv->parent();
-    std::vector<int64> transpose_dimensions(ShapeUtil::Rank(conv->shape()));
-    std::iota(transpose_dimensions.begin(), transpose_dimensions.end(), 0);
-    transpose =
-        parent_computation->AddInstruction(HloInstruction::CreateTranspose(
-            conv->shape(), conv, transpose_dimensions));
-    TF_CHECK_OK(conv->ReplaceAllUsesWith(transpose));
-  }
-
   // Restore the dimension numbers of the backward convolution from the forward
   // convolution. The two activation dimensions are reversed (batch and
   // feature).
   ConvolutionDimensionNumbers backward_conv_dnums;
   backward_conv_dnums.set_input_batch_dimension(input_feature_dim);
   backward_conv_dnums.set_input_feature_dimension(input_batch_dim);
-  backward_conv_dnums.set_output_batch_dimension(output_feature_dim);
-  backward_conv_dnums.set_output_feature_dimension(output_batch_dim);
   for (int i = 0; i < input_spatial_dims.size(); ++i) {
     backward_conv_dnums.add_input_spatial_dimensions(input_spatial_dims[i]);
   }
-  for (int i = 0; i < output_spatial_dims.size(); ++i) {
-    backward_conv_dnums.add_output_spatial_dimensions(output_spatial_dims[i]);
+  backward_conv_dnums.set_output_batch_dimension(kernel_input_feature_dim);
+  backward_conv_dnums.set_output_feature_dimension(kernel_output_feature_dim);
+  for (int i = 0; i < kernel_spatial_dims.size(); ++i) {
+    backward_conv_dnums.add_output_spatial_dimensions(kernel_spatial_dims[i]);
   }
   // The dimension numbering of the output of the forward convolution (before
   // transposition) is the same as that of the activations (according to the
   // semantics of kConvolution). The batch dimension of the activations should
   // be treated as the input feature dimension, and the feature dimension should
   // be treated as the output feature.
-  //
-  // The output of the forward convolution needs to be transposed to fit into
-  // the dimension numbering of the weight gradients. This transposition maps
-  // dimension i to PositionInContainer(transpose->dimensions(), i).
-  backward_conv_dnums.set_kernel_input_feature_dimension(
-      PositionInContainer(transpose->dimensions(), output_batch_dim));
-  backward_conv_dnums.set_kernel_output_feature_dimension(
-      PositionInContainer(transpose->dimensions(), output_feature_dim));
+  backward_conv_dnums.set_kernel_input_feature_dimension(output_batch_dim);
+  backward_conv_dnums.set_kernel_output_feature_dimension(output_feature_dim);
   for (int i = 0; i < output_spatial_dims.size(); ++i) {
-    backward_conv_dnums.add_kernel_spatial_dimensions(
-        PositionInContainer(transpose->dimensions(), output_spatial_dims[i]));
+    backward_conv_dnums.add_kernel_spatial_dimensions(output_spatial_dims[i]);
   }
 
-  return std::make_tuple(true, std::vector<HloInstruction*>({transpose, conv}),
-                         backward_conv_window, backward_conv_dnums);
+  return std::make_tuple(true, backward_conv_window, backward_conv_dnums);
 }
 
 // Try to match a backward input pattern that contains "conv".
 // Precondition: "conv" is a kConvolution.
-std::tuple<bool, std::vector<HloInstruction*>, Window,
-           ConvolutionDimensionNumbers>
-MatchBackwardInput(HloInstruction* conv) {
+std::tuple<bool, Window, ConvolutionDimensionNumbers> MatchBackwardInput(
+    HloInstruction* conv) {
   const auto no_match_result =
-      std::make_tuple(false, std::vector<HloInstruction*>(), Window(),
-                      ConvolutionDimensionNumbers());
+      std::make_tuple(false, Window(), ConvolutionDimensionNumbers());
 
   // Match instruction pattern.
   CHECK_EQ(HloOpcode::kConvolution, conv->opcode());
@@ -275,6 +265,10 @@ MatchBackwardInput(HloInstruction* conv) {
               << " should have no window dilation.";
       return no_match_result;
     }
+    if (window_dim.window_reversal()) {
+      VLOG(1) << "Window reversal field not supported";
+      return no_match_result;
+    }
   }
 
   const auto& input_spatial_dims = dnums.input_spatial_dimensions();
@@ -395,58 +389,82 @@ MatchBackwardInput(HloInstruction* conv) {
   dnums.set_kernel_output_feature_dimension(
       conv->convolution_dimension_numbers().kernel_input_feature_dimension());
 
-  return std::make_tuple(true,
-                         std::vector<HloInstruction*>({conv, reverse_filter}),
-                         new_window, dnums);
+  return std::make_tuple(true, new_window, dnums);
 }
-}  // namespace
 
-StatusOr<bool> ConvolutionFolding::Run(HloModule* module) {
-  HloComputation* entry_computation = module->entry_computation();
-  std::vector<HloInstruction*> convs;
-  for (auto* hlo : entry_computation->instructions()) {
-    if (hlo->opcode() == HloOpcode::kConvolution) {
-      convs.push_back(hlo);
-    }
-  }
+// Tries to rewrite a single convolution into a call to cudnn.
+StatusOr<bool> RunOnInstruction(HloInstruction* conv) {
+  CHECK_EQ(conv->opcode(), HloOpcode::kConvolution);
 
-  bool changed = false;
-  for (HloInstruction* conv : convs) {
+  HloInstruction* custom_call = [&]() -> HloInstruction* {
     bool match;
-    std::vector<HloInstruction*> hlos_to_fuse;
     Window window;
     ConvolutionDimensionNumbers dnums;
-    std::tie(match, hlos_to_fuse, window, dnums) = MatchBackwardFilter(conv);
+
+    std::tie(match, window, dnums) = MatchBackwardFilter(conv);
     if (match) {
-      VLOG(2) << "Fuse instructions";
-      for (HloInstruction* hlo_to_fuse : hlos_to_fuse) {
-        VLOG(2) << "  " << hlo_to_fuse->ToString();
-      }
-      HloInstruction* backward_convolution =
-          entry_computation->CreateFusionInstructionForBackwardConvolution(
-              hlos_to_fuse, HloInstruction::FusionKind::kConvBackwardFilter,
-              window, dnums);
-      VLOG(2) << "to backward filter convolution";
-      VLOG(2) << "  " << backward_convolution->ToString();
-      changed = true;
-      continue;
+      return CreateCudnnConvBackwardFilter(
+          conv->shape(), conv->mutable_operand(0), conv->mutable_operand(1),
+          window, dnums);
     }
 
-    std::tie(match, hlos_to_fuse, window, dnums) = MatchBackwardInput(conv);
+    std::tie(match, window, dnums) = MatchBackwardInput(conv);
     if (match) {
-      VLOG(2) << "Fuse instructions";
-      for (HloInstruction* hlo_to_fuse : hlos_to_fuse) {
-        VLOG(2) << "  " << hlo_to_fuse->ToString();
-      }
-      HloInstruction* backward_convolution =
-          entry_computation->CreateFusionInstructionForBackwardConvolution(
-              hlos_to_fuse, HloInstruction::FusionKind::kConvBackwardInput,
-              window, dnums);
-      VLOG(2) << "to backward input convolution";
-      VLOG(2) << "  " << backward_convolution->ToString();
-      changed = true;
-      continue;
+      // Backward input conv subsumes the conv plus the reverse in operand 1.
+      HloInstruction* reverse = conv->mutable_operand(1);
+      CHECK_EQ(reverse->opcode(), HloOpcode::kReverse);
+      HloInstruction* rhs = reverse->mutable_operand(0);
+
+      return CreateCudnnConvBackwardInput(
+          conv->shape(), conv->mutable_operand(0), rhs, window, dnums);
+    }
+
+    // If all else fails, try a forward convolution.
+    if (CanImplementAsCudnnForwardConv(conv)) {
+      return CreateCudnnConvForward(conv->shape(), conv->mutable_operand(0),
+                                    conv->mutable_operand(1), conv->window(),
+                                    conv->convolution_dimension_numbers());
     }
+
+    return nullptr;
+  }();
+
+  if (custom_call == nullptr) {
+    return false;
+  }
+
+  // The CustomCall returns a tuple (conv_result, scratch_memory).  Extract out
+  // the conv result and replace `conv` with it.
+  TF_RETURN_IF_ERROR(conv->parent()->ReplaceWithNewInstruction(
+      conv,
+      HloInstruction::CreateGetTupleElement(conv->shape(), custom_call, 0)));
+  return true;
+}
+
+// Rewrites the convolutions in the given computation into calls to cudnn.
+// Returns true if it made any changes.
+StatusOr<bool> RunOnComputation(HloComputation* computation) {
+  std::vector<HloInstruction*> convs;
+  for (auto* hlo : computation->instructions()) {
+    if (hlo->opcode() == HloOpcode::kConvolution) {
+      convs.push_back(hlo);
+    }
+  }
+
+  bool changed = false;
+  for (HloInstruction* conv : convs) {
+    TF_ASSIGN_OR_RETURN(bool result, RunOnInstruction(conv));
+    changed |= result;
+  }
+  return changed;
+}
+}  // namespace
+
+StatusOr<bool> CudnnConvolutionRewriter::Run(HloModule* module) {
+  bool changed = false;
+  for (HloComputation* computation : module->MakeNonfusionComputations()) {
+    TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation));
+    changed |= result;
   }
   return changed;
 }
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h
new file mode 100644
index 0000000000000000000000000000000000000000..0c0578d88840fed1d77f7456c9acef27dec380f5
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_REWRITER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_REWRITER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+namespace gpu {
+
+// Rewrites plain convolutions, backwards-filter convolutions, and
+// backwards-input convolutions into CustomCall HLOs that call into cuDNN.
+class CudnnConvolutionRewriter : public HloPassInterface {
+ public:
+  tensorflow::StringPiece name() const override {
+    return "cudnn-convolution-rewriter";
+  }
+
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_REWRITER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter_test.cc
similarity index 80%
rename from tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc
rename to tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter_test.cc
index 112c496e1f6bd17f89ac389ccf0256846dfa1971..65588b6aaf24da628ea586eb52c462b78b8daaa7 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter_test.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,23 +13,29 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/gpu/convolution_folding.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h"
 
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
 namespace gpu {
+namespace {
 
-class ConvolutionFoldingTest : public HloTestBase {
+namespace op = xla::testing::opcode_matchers;
+
+class CudnnConvolutionRewriterTest : public HloTestBase {
  public:
-  ConvolutionFoldingTest() {
+  CudnnConvolutionRewriterTest() {
     for (int i = 0; i < 2; ++i) {
       WindowDimension* window_dim = default_conv_window_.add_dimensions();
       window_dim->set_size(1);
@@ -44,20 +50,21 @@ class ConvolutionFoldingTest : public HloTestBase {
     // the batch and feature dimension in the activations, and treat the batch
     // dimension in gradients as the input feature dimension in the filter.
     //
-    // TODO(jingyue): Add more tests on NCHW input order which TF also supports.
+    // TODO(jingyue): Add more tests on NCHW input order, which TF also
+    // supports.
     tf_default_dnums_for_backward_filter_.set_input_batch_dimension(3);
-    tf_default_dnums_for_backward_filter_.set_output_batch_dimension(3);
     tf_default_dnums_for_backward_filter_.set_input_feature_dimension(0);
-    tf_default_dnums_for_backward_filter_.set_output_feature_dimension(0);
     tf_default_dnums_for_backward_filter_.add_input_spatial_dimensions(1);
-    tf_default_dnums_for_backward_filter_.add_output_spatial_dimensions(1);
     tf_default_dnums_for_backward_filter_.add_input_spatial_dimensions(2);
-    tf_default_dnums_for_backward_filter_.add_output_spatial_dimensions(2);
     tf_default_dnums_for_backward_filter_.set_kernel_input_feature_dimension(0);
     tf_default_dnums_for_backward_filter_.set_kernel_output_feature_dimension(
         3);
     tf_default_dnums_for_backward_filter_.add_kernel_spatial_dimensions(1);
     tf_default_dnums_for_backward_filter_.add_kernel_spatial_dimensions(2);
+    tf_default_dnums_for_backward_filter_.add_output_spatial_dimensions(0);
+    tf_default_dnums_for_backward_filter_.add_output_spatial_dimensions(1);
+    tf_default_dnums_for_backward_filter_.set_output_batch_dimension(2);
+    tf_default_dnums_for_backward_filter_.set_output_feature_dimension(3);
 
     tf_default_dnums_for_backward_input_.set_input_batch_dimension(0);
     tf_default_dnums_for_backward_input_.set_output_batch_dimension(0);
@@ -74,9 +81,8 @@ class ConvolutionFoldingTest : public HloTestBase {
   }
 
  protected:
-  bool FoldConvolution(HloModule* module) {
-    ConvolutionFolding convolution_folding;
-    return convolution_folding.Run(module).ValueOrDie();
+  bool RunPass(HloModule* module) {
+    return CudnnConvolutionRewriter().Run(module).ValueOrDie();
   }
 
   // A convolution window with stride 1 and zero padding. The size fields are
@@ -86,7 +92,7 @@ class ConvolutionFoldingTest : public HloTestBase {
   ConvolutionDimensionNumbers tf_default_dnums_for_backward_input_;
 };
 
-TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithoutTranspose) {
+TEST_F(CudnnConvolutionRewriterTest, BackwardFilterConvolve) {
   HloComputation::Builder builder(TestName());
   HloInstruction* activations =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -108,14 +114,13 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithoutTranspose) {
   auto module = CreateNewModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(FoldConvolution(module.get()));
-  EXPECT_EQ(HloOpcode::kFusion,
-            entry_computation->root_instruction()->opcode());
-  EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardFilter ==
-              entry_computation->root_instruction()->fusion_kind());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
 }
 
-TEST_F(ConvolutionFoldingTest,
+TEST_F(CudnnConvolutionRewriterTest,
        BackwardFilterConvolveEquivalentToForwardConvolution) {
   HloComputation::Builder builder(TestName());
   HloInstruction* activations =
@@ -135,12 +140,17 @@ TEST_F(ConvolutionFoldingTest,
       tf_default_dnums_for_backward_filter_));
 
   auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
-  EXPECT_FALSE(FoldConvolution(module.get()));
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
 }
 
 // Extracted from block35 training.
-TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithPaddedActivations) {
+TEST_F(CudnnConvolutionRewriterTest,
+       BackwardFilterConvolveWithPaddedActivations) {
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* activations =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -155,26 +165,22 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithPaddedActivations) {
     conv_window.mutable_dimensions(i)->set_padding_low(1);
     conv_window.mutable_dimensions(i)->set_padding_high(1);
   }
-  HloInstruction* convolution =
-      builder.AddInstruction(HloInstruction::CreateConvolve(
-          ShapeUtil::MakeShape(F32, {32, 3, 3, 32}), activations, gradients,
-          conv_window, tf_default_dnums_for_backward_filter_));
-
-  builder.AddInstruction(HloInstruction::CreateTranspose(
-      ShapeUtil::MakeShape(F32, {3, 3, 32, 32}), convolution, {1, 2, 3, 0}));
+  builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeUtil::MakeShape(F32, {32, 3, 3, 32}), activations, gradients,
+      conv_window, tf_default_dnums_for_backward_filter_));
 
   auto module = CreateNewModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(FoldConvolution(module.get()));
-  EXPECT_EQ(HloOpcode::kFusion,
-            entry_computation->root_instruction()->opcode());
-  EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardFilter ==
-              entry_computation->root_instruction()->fusion_kind());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
 }
 
 // Extracted from inception v3 training.
-TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithPaddedGradients) {
+TEST_F(CudnnConvolutionRewriterTest,
+       BackwardFilterConvolveWithPaddedGradients) {
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* activations =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -189,25 +195,20 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithPaddedGradients) {
     conv_window.mutable_dimensions(i)->set_padding_high(-1);
     conv_window.mutable_dimensions(i)->set_window_dilation(2);
   }
-  HloInstruction* convolution =
-      builder.AddInstruction(HloInstruction::CreateConvolve(
-          ShapeUtil::MakeShape(F32, {320, 3, 3, 192}), activations, gradients,
-          conv_window, tf_default_dnums_for_backward_filter_));
-
-  builder.AddInstruction(HloInstruction::CreateTranspose(
-      ShapeUtil::MakeShape(F32, {3, 3, 192, 320}), convolution, {1, 2, 3, 0}));
+  builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeUtil::MakeShape(F32, {320, 3, 3, 192}), activations, gradients,
+      conv_window, tf_default_dnums_for_backward_filter_));
 
   auto module = CreateNewModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(FoldConvolution(module.get()));
-  EXPECT_EQ(HloOpcode::kFusion,
-            entry_computation->root_instruction()->opcode());
-  EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardFilter ==
-              entry_computation->root_instruction()->fusion_kind());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
 }
 
-TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithUnevenPadding) {
+TEST_F(CudnnConvolutionRewriterTest, BackwardFilterConvolveWithUnevenPadding) {
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* activations =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -222,25 +223,20 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithUnevenPadding) {
     // Uneven padding: padding_low=0, padding_high=1
     conv_window.mutable_dimensions(i)->set_padding_high(1);
   }
-  HloInstruction* convolution =
-      builder.AddInstruction(HloInstruction::CreateConvolve(
-          ShapeUtil::MakeShape(F32, {32, 2, 2, 32}), activations, gradients,
-          conv_window, tf_default_dnums_for_backward_filter_));
-
-  builder.AddInstruction(HloInstruction::CreateTranspose(
-      ShapeUtil::MakeShape(F32, {2, 2, 32, 32}), convolution, {1, 2, 3, 0}));
+  builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeUtil::MakeShape(F32, {32, 2, 2, 32}), activations, gradients,
+      conv_window, tf_default_dnums_for_backward_filter_));
 
   auto module = CreateNewModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(FoldConvolution(module.get()));
-  EXPECT_EQ(HloOpcode::kFusion,
-            entry_computation->root_instruction()->opcode());
-  EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardFilter ==
-              entry_computation->root_instruction()->fusion_kind());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
 }
 
-TEST_F(ConvolutionFoldingTest, BackwardInputConvolveEvenPadding) {
+TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolveEvenPadding) {
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* output =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -284,14 +280,15 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveEvenPadding) {
   auto module = CreateNewModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(FoldConvolution(module.get()));
-  EXPECT_EQ(HloOpcode::kFusion,
-            entry_computation->root_instruction()->opcode());
-  EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardInput ==
-              entry_computation->root_instruction()->fusion_kind());
+  EXPECT_TRUE(RunPass(module.get()));
+
+  ASSERT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardInputCallTarget), 0));
+  const HloInstruction* custom_call =
+      entry_computation->root_instruction()->operand(0);
   for (int i = 0; i < 2; ++i) {
-    const WindowDimension& window_dim =
-        entry_computation->root_instruction()->window().dimensions(i);
+    const WindowDimension& window_dim = custom_call->window().dimensions(i);
     // Low padding of the backward input convolution
     //   = kernel_size - 1 - low padding on gradients.
     EXPECT_EQ(3, window_dim.padding_low());
@@ -303,7 +300,7 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveEvenPadding) {
 // Convolve([abc], [x], base_dilation=2)
 //   = Convolve([abc], Reverse([x]), base_dilation=2)
 //   = BackwardInputConvolve([abc], [x], stride=2)
-TEST_F(ConvolutionFoldingTest, BackwardInputConvolve1x1Filter) {
+TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolve1x1Filter) {
   auto builder = HloComputation::Builder(TestName());
   // NHWC dimension order.
   HloInstruction* output =
@@ -328,17 +325,16 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolve1x1Filter) {
   auto module = CreateNewModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(FoldConvolution(module.get()));
-  EXPECT_EQ(HloOpcode::kFusion,
-            entry_computation->root_instruction()->opcode());
-  EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardInput ==
-              entry_computation->root_instruction()->fusion_kind());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardInputCallTarget), 0));
 }
 
 // BackwardInputConvolve([abc], [x], stride=1) is equivalent to
 // ForwardConvolve([abc], [x], stride=1). No need to fold it into backward input
 // convolution.
-TEST_F(ConvolutionFoldingTest,
+TEST_F(CudnnConvolutionRewriterTest,
        BackwardInputConvolve1x1FilterEquivalentToForwardConvolve) {
   auto builder = HloComputation::Builder(TestName());
   // NHWC dimension order.
@@ -359,8 +355,12 @@ TEST_F(ConvolutionFoldingTest,
       tf_default_dnums_for_backward_input_));
 
   auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
-  EXPECT_FALSE(FoldConvolution(module.get()));
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(
+      entry_computation->root_instruction(),
+      op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0));
 }
 
 // Extracted from Inception V3 training.
@@ -377,7 +377,8 @@ TEST_F(ConvolutionFoldingTest,
 //                     20x10x10x192
 //
 // Gradients are padded unevenly.
-TEST_F(ConvolutionFoldingTest, BackwardInputConvolveUnevenPaddingOnGradients) {
+TEST_F(CudnnConvolutionRewriterTest,
+       BackwardInputConvolveUnevenPaddingOnGradients) {
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* output =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -409,14 +410,14 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveUnevenPaddingOnGradients) {
   auto module = CreateNewModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(FoldConvolution(module.get()));
-  EXPECT_EQ(HloOpcode::kFusion,
-            entry_computation->root_instruction()->opcode());
-  EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardInput ==
-              entry_computation->root_instruction()->fusion_kind());
+  EXPECT_TRUE(RunPass(module.get()));
+  ASSERT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardInputCallTarget), 0));
+  const HloInstruction* custom_call =
+      entry_computation->root_instruction()->operand(0);
   for (int i = 0; i < 2; ++i) {
-    const WindowDimension& window_dim =
-        entry_computation->root_instruction()->window().dimensions(i);
+    const WindowDimension& window_dim = custom_call->window().dimensions(i);
     EXPECT_EQ(0, window_dim.padding_low());
     EXPECT_EQ(0, window_dim.padding_high());
     EXPECT_EQ(2, window_dim.stride());
@@ -425,7 +426,7 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveUnevenPaddingOnGradients) {
 
 // Similar to BackwardInputConvolveUnevenPadding, but the low padding of the
 // gradients exceeds kernel_size - 1. Therefore, this pattern cannot be fused.
-TEST_F(ConvolutionFoldingTest, BackwardInputConvolveLowPaddingTooLarge) {
+TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolveLowPaddingTooLarge) {
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* output =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -454,8 +455,12 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveLowPaddingTooLarge) {
                          .ValueOrDie()));
 
   auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
-  EXPECT_FALSE(FoldConvolution(module.get()));
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(
+      entry_computation->root_instruction(),
+      op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0));
 }
 
 // Extracted from //learning/brain/google/xla/benchmarks/resnet.py
@@ -472,7 +477,7 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveLowPaddingTooLarge) {
 //
 // We should fuse BC even though padding on activations is uneven, because
 // PadInsertion will canonicalize the fusion HLO.
-TEST_F(ConvolutionFoldingTest,
+TEST_F(CudnnConvolutionRewriterTest,
        BackwardInputConvolveUnevenPaddingOnActivations) {
   auto builder = HloComputation::Builder(TestName());
   // The gradients are in NCHW layout.
@@ -505,13 +510,12 @@ TEST_F(ConvolutionFoldingTest,
   auto module = CreateNewModule();
   const HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(FoldConvolution(module.get()));
-  const HloInstruction* backward_conv = entry_computation->root_instruction();
-  EXPECT_EQ(HloOpcode::kFusion, backward_conv->opcode());
-  EXPECT_TRUE(HloInstruction::FusionKind::kConvBackwardInput ==
-              backward_conv->fusion_kind());
+  EXPECT_TRUE(RunPass(module.get()));
+  ASSERT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardInputCallTarget), 0));
   const WindowDimension& backward_conv_col_dim =
-      backward_conv->window().dimensions(1);
+      entry_computation->root_instruction()->operand(0)->window().dimensions(1);
   EXPECT_EQ(0, backward_conv_col_dim.padding_low());
   EXPECT_EQ(1, backward_conv_col_dim.padding_high());
 }
@@ -527,7 +531,7 @@ TEST_F(ConvolutionFoldingTest,
 //
 // We currently don't fuse BC because PadInsertion doesn't support negative
 // padding on the gradients of backward convolution (b/32744257).
-TEST_F(ConvolutionFoldingTest,
+TEST_F(CudnnConvolutionRewriterTest,
        BackwardInputConvolveNegativePaddingHighOnActivations) {
   auto builder = HloComputation::Builder(TestName());
   // The gradients are in NCHW layout.
@@ -556,9 +560,14 @@ TEST_F(ConvolutionFoldingTest,
                          .ValueOrDie()));
 
   auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
-  EXPECT_FALSE(FoldConvolution(module.get()));
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(
+      entry_computation->root_instruction(),
+      op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0));
 }
 
+}  // anonymous namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc
new file mode 100644
index 0000000000000000000000000000000000000000..81695a6c326b922904330f33bc88260729ff67ee
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc
@@ -0,0 +1,224 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+namespace se = ::perftools::gputools;
+
+using se::DeviceMemory;
+using se::DeviceMemoryBase;
+using se::Stream;
+using se::dnn::AlgorithmConfig;
+using se::dnn::BatchDescriptor;
+using se::dnn::ConvolutionDescriptor;
+using se::dnn::DataLayout;
+using se::dnn::DimIndex;
+using se::dnn::FilterDescriptor;
+using se::dnn::FilterLayout;
+using se::dnn::ProfileResult;
+
+// A StreamExecutor ScratchAllocator that wraps a single XLA allocation,
+// returning it (in its entirety) the first time Allocate() is called.
+class ScratchBufAllocator : public se::ScratchAllocator {
+ public:
+  explicit ScratchBufAllocator(se::DeviceMemoryBase scratch)
+      : scratch_(scratch) {}
+
+  ~ScratchBufAllocator() override = default;
+
+  int64 GetMemoryLimitInBytes(se::Stream* /*stream*/) override {
+    return scratch_.size();
+  }
+
+  se::port::StatusOr<DeviceMemory<uint8>> AllocateBytes(
+      se::Stream* stream, int64 byte_size) override {
+    if (allocated_) {
+      return se::port::InternalError(
+          "Can't allocate twice from a ScratchBufAllocator.");
+    }
+    if (byte_size > scratch_.size()) {
+      return se::port::InternalError(tensorflow::strings::StrCat(
+          "Can't allocate ", byte_size,
+          " bytes from a ScratchBufAllocator of size ", scratch_.size()));
+    }
+
+    allocated_ = true;
+    return se::DeviceMemory<uint8>(scratch_);
+  }
+
+ private:
+  se::DeviceMemoryBase scratch_;
+  bool allocated_ = false;
+};
+
+}  // anonymous namespace
+
+string CudnnConvKindToString(CudnnConvKind kind) {
+  switch (kind) {
+    case CudnnConvKind::kForward:
+      return "forward";
+    case CudnnConvKind::kBackwardFilter:
+      return "backward_filter";
+    case CudnnConvKind::kBackwardInput:
+      return "backward_input";
+  }
+}
+
+Status RunCudnnConvolution(CudnnConvKind kind, const Shape& input_shape,
+                           const Shape& filter_shape, const Shape& output_shape,
+                           DeviceMemory<float> input_buf,
+                           DeviceMemory<float> filter_buf,
+                           DeviceMemory<float> output_buf,
+                           DeviceMemoryBase scratch_buf, const Window& window,
+                           const ConvolutionDimensionNumbers& dnums,
+                           AlgorithmConfig algorithm, Stream* stream,
+                           ProfileResult* profile_result /*= nullptr*/) {
+  ScratchBufAllocator scratch_allocator(scratch_buf);
+  return RunCudnnConvolution(kind, input_shape, filter_shape, output_shape,
+                             input_buf, filter_buf, output_buf,
+                             &scratch_allocator, window, dnums, algorithm,
+                             stream, profile_result);
+}
+
+Status RunCudnnConvolution(
+    CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
+    const Shape& output_shape, DeviceMemory<float> input_buf,
+    DeviceMemory<float> filter_buf, DeviceMemory<float> output_buf,
+    se::ScratchAllocator* scratch_allocator, const Window& window,
+    const ConvolutionDimensionNumbers& dnums, AlgorithmConfig algorithm,
+    Stream* stream, ProfileResult* profile_result /*= nullptr*/) {
+  VLOG(3) << "Convolution Algorithm: " << algorithm.algorithm().algo_id();
+  VLOG(3) << "tensor_ops_enabled: "
+          << algorithm.algorithm().tensor_ops_enabled();
+  VLOG(3) << "Convolution kind: " << CudnnConvKindToString(kind);
+  VLOG(3) << "input shape: { " << ShapeUtil::HumanString(input_shape) << " }";
+  VLOG(3) << "filter shape: { " << ShapeUtil::HumanString(filter_shape) << " }";
+  VLOG(3) << "Output shape: { " << ShapeUtil::HumanString(output_shape) << " }";
+  VLOG(3) << "Window: { " << window.ShortDebugString() << " }";
+  VLOG(3) << "Dim nums: { " << dnums.ShortDebugString() << " }";
+
+  const int num_dimensions = window.dimensions_size();
+  CHECK_LE(num_dimensions, 3);
+  // cuDNN does not support 1D convolutions. We therefore express 1D
+  // convolutions as 2D convolutions where the first spatial dimension is 1.
+  // This matches the behavior of TF (see definition of conv1d in
+  // tensorflow/python/ops/nn_ops.py).
+  const int effective_num_dimensions = std::max(2, num_dimensions);
+
+  CHECK_EQ(F32, output_shape.element_type())
+      << ShapeUtil::HumanString(output_shape);
+  CHECK_EQ(num_dimensions, dnums.input_spatial_dimensions_size());
+  CHECK_EQ(num_dimensions, dnums.kernel_spatial_dimensions_size());
+  CHECK_EQ(num_dimensions, dnums.output_spatial_dimensions_size());
+  for (const WindowDimension& dim : window.dimensions()) {
+    CHECK_EQ(dim.padding_low(), dim.padding_high());
+  }
+
+  // cuDNN's convolution APIs support the BDYX layout for activations/output and
+  // the OIYX layout for weights.
+  BatchDescriptor input_descriptor(effective_num_dimensions);
+  input_descriptor.set_layout(DataLayout::kBatchDepthYX)
+      .set_feature_map_count(
+          input_shape.dimensions(dnums.input_feature_dimension()))
+      .set_count(input_shape.dimensions(dnums.input_batch_dimension()));
+  for (int dim = 0; dim < num_dimensions; ++dim) {
+    // Note that the dimensions are reversed. The same holds below.
+    input_descriptor.set_spatial_dim(
+        static_cast<DimIndex>(effective_num_dimensions - dim - 1),
+        input_shape.dimensions(dnums.input_spatial_dimensions(dim)));
+  }
+
+  FilterDescriptor filter_descriptor(effective_num_dimensions);
+  filter_descriptor.set_layout(FilterLayout::kOutputInputYX)
+      .set_input_feature_map_count(
+          filter_shape.dimensions(dnums.kernel_input_feature_dimension()))
+      .set_output_feature_map_count(
+          filter_shape.dimensions(dnums.kernel_output_feature_dimension()));
+  for (int dim = 0; dim < num_dimensions; ++dim) {
+    filter_descriptor.set_spatial_dim(
+        static_cast<DimIndex>(effective_num_dimensions - dim - 1),
+        filter_shape.dimensions(dnums.kernel_spatial_dimensions(dim)));
+  }
+
+  ConvolutionDescriptor convolution_descriptor(effective_num_dimensions);
+  for (int dim = 0; dim < num_dimensions; ++dim) {
+    convolution_descriptor
+        .set_zero_padding(
+            static_cast<DimIndex>(effective_num_dimensions - dim - 1),
+            window.dimensions(dim).padding_low())
+        .set_filter_stride(
+            static_cast<DimIndex>(effective_num_dimensions - dim - 1),
+            window.dimensions(dim).stride());
+  }
+
+  BatchDescriptor output_descriptor(effective_num_dimensions);
+  output_descriptor.set_layout(DataLayout::kBatchDepthYX)
+      .set_feature_map_count(
+          output_shape.dimensions(dnums.output_feature_dimension()))
+      .set_count(output_shape.dimensions(dnums.output_batch_dimension()));
+  for (int dim = 0; dim < num_dimensions; ++dim) {
+    output_descriptor.set_spatial_dim(
+        static_cast<DimIndex>(effective_num_dimensions - dim - 1),
+        output_shape.dimensions(dnums.output_spatial_dimensions(dim)));
+  }
+
+  // Add a singleton dimension in the 1D convolution case.
+  if (num_dimensions == 1) {
+    input_descriptor.set_spatial_dim(static_cast<DimIndex>(0), 1);
+    output_descriptor.set_spatial_dim(static_cast<DimIndex>(0), 1);
+    filter_descriptor.set_spatial_dim(static_cast<DimIndex>(0), 1);
+    convolution_descriptor.set_zero_padding(static_cast<DimIndex>(0), 0)
+        .set_filter_stride(static_cast<DimIndex>(0), 1);
+  }
+
+  switch (kind) {
+    case CudnnConvKind::kForward:
+      stream->ThenConvolveWithAlgorithm(
+          input_descriptor, input_buf, filter_descriptor, filter_buf,
+          convolution_descriptor, output_descriptor, &output_buf,
+          scratch_allocator, algorithm, profile_result);
+      break;
+    case CudnnConvKind::kBackwardInput:
+      stream->ThenConvolveBackwardDataWithAlgorithm(
+          filter_descriptor, filter_buf, output_descriptor, output_buf,
+          convolution_descriptor, input_descriptor, &input_buf,
+          scratch_allocator, algorithm, profile_result);
+      break;
+    case CudnnConvKind::kBackwardFilter:
+      stream->ThenConvolveBackwardFilterWithAlgorithm(
+          input_descriptor, input_buf, output_descriptor, output_buf,
+          convolution_descriptor, filter_descriptor, &filter_buf,
+          scratch_allocator, algorithm, profile_result);
+      break;
+  }
+
+  if (!stream->ok()) {
+    return InternalError(
+        "Unable to launch convolution with type %s and algorithm (%lld, %lld)",
+        CudnnConvKindToString(kind).c_str(), algorithm.algorithm().algo_id(),
+        algorithm.algorithm_no_scratch().algo_id());
+  }
+  return Status::OK();
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h
new file mode 100644
index 0000000000000000000000000000000000000000..b101f76510c129fd22b246e5f0348848192ecbba
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h
@@ -0,0 +1,97 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_RUNNER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_RUNNER_H_
+
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+namespace gpu {
+
+// This file contains low-level routines for running cudnn convolutions.
+
+// Different types of convolutions supported by cudnn.
+//
+// A way to think about these is that a convolution is defined by three arrays
+// -- the "input", the "filter", and the "output" -- and given any two of these,
+// we can compute the third.  For example, a backward-input convolution takes as
+// input a filter and an "output" and produces an "input" such that if one were
+// to do a forward convolution of "input" using filter, the result would be
+// something with the same shape as "output".
+//
+// This way of thinking is not correct if you look at the values produced. For
+// example, a backward-input convolution is not actually the mathematical
+// inverse of a forward convolution.  But it's right as far as the shapes and
+// "connectivity" (i.e. which elements of the input affect which elements of
+// the output) are concerned.
+enum class CudnnConvKind {
+  kForward,         // input  + filter => output
+  kBackwardInput,   // filter + output => input
+  kBackwardFilter,  // input  + output => filter
+};
+
+// Converts a CudnnConvKind value to a string.
+string CudnnConvKindToString(CudnnConvKind kind);
+
+// Calls into cudnn to run the specified convolution.
+//
+// Note that depending on the value of CudnnConvKind, the result of this call
+// may be written into input_buf, filter_buf, or output_buf!
+//
+// At the moment we only support cudnn convolutions over floats.
+//
+// We provide one overload which takes a scratch buffer, and another which takes
+// an allocator which is responsible for allocating the scratch space.  In
+// theory the second one shouldn't be necessary -- users of this function could
+// just ask cudnn how much scratch space it needs for a particular convolution.
+// But in practice, StreamExecutor does not expose such an API, and in the name
+// of parsimony, perhaps it's better not to add it.  Instead, the first time you
+// call a convolution, you should call the version that takes a scratch
+// allocator and take note of how much memory is used.  The next time you call
+// the same conv, you can provide an explicitly preallocated scratch buffer of
+// that size, if you like.
+Status RunCudnnConvolution(
+    CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
+    const Shape& output_shape,
+    perftools::gputools::DeviceMemory<float> input_buf,
+    perftools::gputools::DeviceMemory<float> filter_buf,
+    perftools::gputools::DeviceMemory<float> output_buf,
+    perftools::gputools::DeviceMemoryBase scratch_buf, const Window& window,
+    const ConvolutionDimensionNumbers& dnums,
+    perftools::gputools::dnn::AlgorithmConfig algorithm,
+    perftools::gputools::Stream* stream,
+    perftools::gputools::dnn::ProfileResult* profile_result = nullptr);
+
+Status RunCudnnConvolution(
+    CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape,
+    const Shape& output_shape,
+    perftools::gputools::DeviceMemory<float> input_buf,
+    perftools::gputools::DeviceMemory<float> filter_buf,
+    perftools::gputools::DeviceMemory<float> output_buf,
+    perftools::gputools::ScratchAllocator* scratch_allocator,
+    const Window& window, const ConvolutionDimensionNumbers& dnums,
+    perftools::gputools::dnn::AlgorithmConfig algorithm,
+    perftools::gputools::Stream* stream,
+    perftools::gputools::dnn::ProfileResult* profile_result = nullptr);
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_RUNNER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
index 6bf00cfb8a53723ae9608093480bf2eed10144dd..5af7a77ea858563fbea05af8efd54f96a74aee93 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -72,9 +72,27 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitLibdeviceMathCall(
     tensorflow::gtl::ArraySlice<PrimitiveType> input_types,
     PrimitiveType output_type) const {
   // The libdevice math functions differentiate between "double" and "float" by
-  // appending an 'f' to the function's name.
+  // appending an 'f' to the function's name. libdevice doesn't have f16 math
+  // functions, so we convert the operands to f32 before calling the function
+  // and then convert the result back to f16.
   string munged_callee = callee_name;
+  bool cast_result_to_fp16 = false;
+  std::vector<llvm::Value*> converted_operands(operands.begin(),
+                                               operands.end());
+  std::vector<PrimitiveType> converted_input_types(input_types.begin(),
+                                                   input_types.end());
   switch (output_type) {
+    case F16:
+      cast_result_to_fp16 = true;
+      for (int64 i = 0; i < operands.size(); ++i) {
+        if (input_types[i] == F16) {
+          converted_operands[i] = ir_builder_->CreateFPCast(
+              converted_operands[i], ir_builder_->getFloatTy());
+          converted_input_types[i] = F32;
+        }
+      }
+      output_type = F32;
+      TF_FALLTHROUGH_INTENDED;
     case F32:
       StrAppend(&munged_callee, "f");
       break;
@@ -84,7 +102,13 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitLibdeviceMathCall(
       return Unimplemented("Bad type for libdevice math call: %s",
                            PrimitiveType_Name(output_type).c_str());
   }
-  return EmitMathCall(munged_callee, operands, input_types, output_type);
+  llvm::Value* result = EmitMathCall(munged_callee, converted_operands,
+                                     converted_input_types, output_type)
+                            .ValueOrDie();
+  if (cast_result_to_fp16) {
+    result = ir_builder_->CreateFPCast(result, ir_builder_->getHalfTy());
+  }
+  return result;
 }
 
 StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitLlvmIntrinsicMathCall(
@@ -92,10 +116,13 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitLlvmIntrinsicMathCall(
     tensorflow::gtl::ArraySlice<llvm::Value*> operands,
     tensorflow::gtl::ArraySlice<PrimitiveType> input_types,
     PrimitiveType output_type) const {
-  // llvm intrinsics differentiate between float/double functions via the ".f32"
-  // and ".f64" suffixes.
+  // llvm intrinsics differentiate between half/float/double functions via
+  // the suffixes ".f16", ".f32" and ".f64".
   string munged_callee = callee_name;
   switch (output_type) {
+    case F16:
+      StrAppend(&munged_callee, ".f16");
+      break;
     case F32:
       StrAppend(&munged_callee, ".f32");
       break;
@@ -135,10 +162,6 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitFloatBinaryOp(
   PrimitiveType rhs_input_type = op->operand(1)->shape().element_type();
   PrimitiveType output_type = op->shape().element_type();
   switch (op->opcode()) {
-    case HloOpcode::kAtan2:
-      return EmitLibdeviceMathCall("__nv_atan2", {lhs_value, rhs_value},
-                                   {lhs_input_type, rhs_input_type},
-                                   output_type);
     case HloOpcode::kRemainder: {
       return EmitLibdeviceMathCall("__nv_fmod", {lhs_value, rhs_value},
                                    {lhs_input_type, rhs_input_type},
@@ -199,29 +222,44 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitErfcInv(
   return EmitLibdeviceMathCall("__nv_erfcinv", {value}, {prim_type}, prim_type);
 }
 
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitLog(
+    PrimitiveType prim_type, llvm::Value* value) const {
+  return EmitLibdeviceMathCall("__nv_log", {value}, {prim_type}, prim_type);
+}
+
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitSin(
+    PrimitiveType prim_type, llvm::Value* value) const {
+  return EmitLibdeviceMathCall("__nv_sin", {value}, {prim_type}, prim_type);
+}
+
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitCos(
+    PrimitiveType prim_type, llvm::Value* value) const {
+  return EmitLibdeviceMathCall("__nv_cos", {value}, {prim_type}, prim_type);
+}
+
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitExp(
+    PrimitiveType prim_type, llvm::Value* value) const {
+  return EmitLibdeviceMathCall("__nv_exp", {value}, {prim_type}, prim_type);
+}
+
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitPow(PrimitiveType prim_type,
+                                                      llvm::Value* lhs,
+                                                      llvm::Value* rhs) const {
+  return EmitLibdeviceMathCall("__nv_pow", {lhs, rhs}, {prim_type, prim_type},
+                               prim_type);
+}
+
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitAtan2(
+    PrimitiveType prim_type, llvm::Value* lhs, llvm::Value* rhs) const {
+  return EmitLibdeviceMathCall("__nv_atan2", {lhs, rhs}, {prim_type, prim_type},
+                               prim_type);
+}
+
 StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitFloatUnaryOp(
     const HloInstruction* op, llvm::Value* operand_value) const {
   PrimitiveType input_type = op->operand(0)->shape().element_type();
   PrimitiveType output_type = op->shape().element_type();
   switch (op->opcode()) {
-    case HloOpcode::kExp:
-      return EmitLibdeviceMathCall("__nv_exp", {operand_value}, {input_type},
-                                   output_type);
-    case HloOpcode::kFloor:
-      return EmitLibdeviceMathCall("__nv_floor", {operand_value}, {input_type},
-                                   output_type);
-    case HloOpcode::kCeil:
-      return EmitLibdeviceMathCall("__nv_ceil", {operand_value}, {input_type},
-                                   output_type);
-    case HloOpcode::kLog:
-      return EmitLibdeviceMathCall("__nv_log", {operand_value}, {input_type},
-                                   output_type);
-    case HloOpcode::kCos:
-      return EmitLibdeviceMathCall("__nv_cos", {operand_value}, {input_type},
-                                   output_type);
-    case HloOpcode::kSin:
-      return EmitLibdeviceMathCall("__nv_sin", {operand_value}, {input_type},
-                                   output_type);
     case HloOpcode::kTanh:
       return EmitLibdeviceMathCall("__nv_tanh", {operand_value}, {input_type},
                                    output_type);
@@ -230,224 +268,6 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitFloatUnaryOp(
   }
 }
 
-StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitComplexBinaryOp(
-    const HloInstruction* op, llvm::Value* lhs_value,
-    llvm::Value* rhs_value) const {
-  PrimitiveType input_type = op->operand(0)->shape().element_type();
-  TF_RET_CHECK(primitive_util::IsComplexType(input_type));
-  PrimitiveType component_type =
-      primitive_util::ComplexComponentType(input_type);
-  switch (op->opcode()) {
-    case HloOpcode::kPower: {
-      // (a+bi)^(c+di) =
-      //    (a*a+b*b)^(0.5c) * exp(-d*atan2(b,a)) * (cos(q) + i*sin(q)),
-      //    where q = c*atan2(b,a)+0.5d*ln(a*a+b*b)
-      auto a = EmitExtractReal(lhs_value);
-      auto b = EmitExtractImag(lhs_value);
-      auto c = EmitExtractReal(rhs_value);
-      auto d = EmitExtractImag(rhs_value);
-      auto aa_p_bb = ir_builder_->CreateFAdd(ir_builder_->CreateFMul(a, a),
-                                             ir_builder_->CreateFMul(b, b));
-      auto one_half = llvm::ConstantFP::get(a->getType(), 0.5);
-      auto half_c = ir_builder_->CreateFMul(one_half, c);
-
-      TF_ASSIGN_OR_RETURN(
-          auto aa_p_bb_to_half_c,
-          EmitLibdeviceMathCall("__nv_pow", {aa_p_bb, half_c},
-                                {component_type, component_type},
-                                component_type));
-      auto neg_d = ir_builder_->CreateFNeg(d);
-      TF_ASSIGN_OR_RETURN(
-          auto arg_lhs, EmitLibdeviceMathCall("__nv_atan2", {b, a},
-                                              {component_type, component_type},
-                                              component_type));
-      auto neg_d_arg_lhs = ir_builder_->CreateFMul(neg_d, arg_lhs);
-      TF_ASSIGN_OR_RETURN(
-          auto e_to_neg_d_arg_lhs,
-          EmitLibdeviceMathCall("__nv_exp", {neg_d_arg_lhs}, {component_type},
-                                component_type));
-      auto coeff =
-          ir_builder_->CreateFMul(aa_p_bb_to_half_c, e_to_neg_d_arg_lhs);
-      TF_ASSIGN_OR_RETURN(
-          auto ln_aa_p_bb,
-          EmitLibdeviceMathCall("__nv_log", {aa_p_bb}, {component_type},
-                                component_type));
-      auto half_d = ir_builder_->CreateFMul(one_half, d);
-      auto q =
-          ir_builder_->CreateFAdd(ir_builder_->CreateFMul(c, arg_lhs),
-                                  ir_builder_->CreateFMul(half_d, ln_aa_p_bb));
-      TF_ASSIGN_OR_RETURN(
-          auto cos_q, EmitLibdeviceMathCall("__nv_cos", {q}, {component_type},
-                                            component_type));
-      TF_ASSIGN_OR_RETURN(
-          auto sin_q, EmitLibdeviceMathCall("__nv_sin", {q}, {component_type},
-                                            component_type));
-      return EmitComposeComplex(op, ir_builder_->CreateFMul(coeff, cos_q),
-                                ir_builder_->CreateFMul(coeff, sin_q));
-    }
-    default:
-      return ElementalIrEmitter::EmitComplexBinaryOp(op, lhs_value, rhs_value);
-  }
-}
-
-StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitComplexUnaryOp(
-    const HloInstruction* op, llvm::Value* operand_value) const {
-  PrimitiveType input_type = op->operand(0)->shape().element_type();
-  PrimitiveType component_type =
-      primitive_util::IsComplexType(input_type)
-          ? primitive_util::ComplexComponentType(input_type)
-          : input_type;
-
-  switch (op->opcode()) {
-    case HloOpcode::kLog: {
-      // log(a+bi) = .5*log(a^2+b^2) + i*atan2(b, a)
-      auto a = EmitExtractReal(operand_value);
-      auto b = EmitExtractImag(operand_value);
-      llvm::Type* llvm_ty = a->getType();
-      auto sum_sq = ir_builder_->CreateFAdd(ir_builder_->CreateFMul(a, a),
-                                            ir_builder_->CreateFMul(b, b));
-      TF_ASSIGN_OR_RETURN(
-          auto log_sum_sq,
-          EmitLibdeviceMathCall("__nv_log", {sum_sq}, {component_type},
-                                component_type));
-      TF_ASSIGN_OR_RETURN(
-          auto angle, EmitLibdeviceMathCall("__nv_atan2", {b, a},
-                                            {component_type, component_type},
-                                            component_type));
-      auto one_half = llvm::ConstantFP::get(llvm_ty, 0.5);
-      return EmitComposeComplex(
-          op, ir_builder_->CreateFMul(one_half, log_sum_sq), angle);
-    }
-    case HloOpcode::kExp: {
-      // e^(a+bi) = e^a*(cos(b)+sin(b)i)
-      auto b = EmitExtractImag(operand_value);
-      TF_ASSIGN_OR_RETURN(
-          auto exp_a,
-          EmitLibdeviceMathCall("__nv_exp", {EmitExtractReal(operand_value)},
-                                {component_type}, component_type));
-      TF_ASSIGN_OR_RETURN(
-          auto cos_b, EmitLibdeviceMathCall("__nv_cos", {b}, {component_type},
-                                            component_type));
-      TF_ASSIGN_OR_RETURN(
-          auto sin_b, EmitLibdeviceMathCall("__nv_sin", {b}, {component_type},
-                                            component_type));
-      return EmitComposeComplex(op, ir_builder_->CreateFMul(exp_a, cos_b),
-                                ir_builder_->CreateFMul(exp_a, sin_b));
-    }
-    case HloOpcode::kCos: {
-      // cos(a+bi) = .5(cos(a)*(e^-b+e^b) + i*sin(a)*(e^-b-e^b))
-      auto a = EmitExtractReal(operand_value);
-      auto llvm_ty = a->getType();
-      TF_ASSIGN_OR_RETURN(
-          auto exp_b,
-          EmitLibdeviceMathCall("__nv_exp", {EmitExtractImag(operand_value)},
-                                {component_type}, component_type));
-      TF_ASSIGN_OR_RETURN(
-          auto cos_a, EmitLibdeviceMathCall("__nv_cos", {a}, {component_type},
-                                            component_type));
-      TF_ASSIGN_OR_RETURN(
-          auto sin_a, EmitLibdeviceMathCall("__nv_sin", {a}, {component_type},
-                                            component_type));
-      auto half_exp_b =
-          ir_builder_->CreateFMul(llvm::ConstantFP::get(llvm_ty, 0.5), exp_b);
-      auto half_exp_neg_b =
-          ir_builder_->CreateFDiv(llvm::ConstantFP::get(llvm_ty, 0.5), exp_b);
-      return EmitComposeComplex(
-          op,
-          ir_builder_->CreateFMul(
-              cos_a, ir_builder_->CreateFAdd(half_exp_neg_b, half_exp_b)),
-          ir_builder_->CreateFMul(
-              sin_a, ir_builder_->CreateFSub(half_exp_neg_b, half_exp_b)));
-    }
-
-    case HloOpcode::kSin: {
-      // sin(a+bi) = 0.5(sin(a)*(e^b+e^-b) + i*cos(a)*(e^b-e^-b)
-      auto a = EmitExtractReal(operand_value);
-      auto llvm_ty = a->getType();
-      TF_ASSIGN_OR_RETURN(
-          auto exp_b,
-          EmitLibdeviceMathCall("__nv_exp", {EmitExtractImag(operand_value)},
-                                {component_type}, component_type));
-      TF_ASSIGN_OR_RETURN(
-          auto cos_a, EmitLibdeviceMathCall("__nv_cos", {a}, {component_type},
-                                            component_type));
-      TF_ASSIGN_OR_RETURN(
-          auto sin_a, EmitLibdeviceMathCall("__nv_sin", {a}, {component_type},
-                                            component_type));
-      auto half_exp_b =
-          ir_builder_->CreateFMul(llvm::ConstantFP::get(llvm_ty, 0.5), exp_b);
-      auto half_exp_neg_b =
-          ir_builder_->CreateFDiv(llvm::ConstantFP::get(llvm_ty, 0.5), exp_b);
-      return EmitComposeComplex(
-          op,
-          ir_builder_->CreateFMul(
-              sin_a, ir_builder_->CreateFAdd(half_exp_b, half_exp_neg_b)),
-          ir_builder_->CreateFMul(
-              cos_a, ir_builder_->CreateFSub(half_exp_b, half_exp_neg_b)));
-    }
-    case HloOpcode::kTanh: {
-      /*
-      tanh=(exp(x)-exp(-x)) / (exp(x)+exp(-x))
-      e^(a+bi) = e^a*(cos(b)+sin(b)i)
-      so tanh=(((cos(b)+sin(b)i)e^a - (cos(-b)+sin(-b)i)e^-a)) /
-              (((cos(b)+sin(b)i)e^a + (cos(-b)+sin(-b)i)e^-a))
-      cos(b)=cos(-b), sin(-b)=-sin(b)
-      so tanh=(((cos(b)+sin(b)i)e^a - (cos(b)-sin(b)i)e^-a)) /
-              (((cos(b)+sin(b)i)e^a + (cos(b)-sin(b)i)e^-a))
-             =(cos(b)e^a+i*sin(b)e^a + cos(b)(-e^-a)+i*sin(b)e^-a) /
-              (cos(b)e^a+i*sin(b)e^a + cos(b)e^-a+i*sin(b)(-e^-a))
-             =(cos(b)(e^a-e^-a) + i*sin(b)(e^a+e^-a)) /
-              (cos(b)(e^a+e^-a) + i*sin(b)(e^a-e^-a))
-      This is a complex division, so we can multiply by denom_conj/denom_conj
-             =(cos(b)(e^a-e^-a) + i*sin(b)(e^a+e^-a)) *
-              (cos(b)(e^a+e^-a) - i*sin(b)(e^a-e^-a)) /
-              ((cos(b)(e^a+e^-a))^2 + (sin(b)(e^a-e^-a))^2)
-             =(cos(b)^2(e^(2a)-e^(-2a)) + sin(b)^2(e^(2a)-e^(-2a)) +
-               i*(cos(b)sin(b)(e^a+e^-a)^2 - cos(b)sin(b)(e^a-e^-a)^2)) /
-              ((cos(b)(e^a+e^-a))^2 + (sin(b)(e^a-e^-a))^2)
-      */
-      auto a = EmitExtractReal(operand_value);
-      auto b = EmitExtractImag(operand_value);
-      TF_ASSIGN_OR_RETURN(
-          auto exp_a, EmitLibdeviceMathCall("__nv_exp", {a}, {component_type},
-                                            component_type));
-      TF_ASSIGN_OR_RETURN(
-          auto cos_b, EmitLibdeviceMathCall("__nv_cos", {b}, {component_type},
-                                            component_type));
-      TF_ASSIGN_OR_RETURN(
-          auto sin_b, EmitLibdeviceMathCall("__nv_sin", {b}, {component_type},
-                                            component_type));
-      auto exp_neg_a = ir_builder_->CreateFDiv(
-          llvm::ConstantFP::get(exp_a->getType(), 1), exp_a);
-      auto exp_2a_minus_exp_neg_2a = ir_builder_->CreateFSub(
-          ir_builder_->CreateFMul(exp_a, exp_a),
-          ir_builder_->CreateFMul(exp_neg_a, exp_neg_a));
-      auto cos_b_sq = ir_builder_->CreateFMul(cos_b, cos_b);
-      auto sin_b_sq = ir_builder_->CreateFMul(sin_b, sin_b);
-      auto real_num = ir_builder_->CreateFAdd(
-          ir_builder_->CreateFMul(cos_b_sq, exp_2a_minus_exp_neg_2a),
-          ir_builder_->CreateFMul(sin_b_sq, exp_2a_minus_exp_neg_2a));
-      auto cos_b_sin_b = ir_builder_->CreateFMul(cos_b, sin_b);
-      auto exp_a_plus_exp_neg_a = ir_builder_->CreateFAdd(exp_a, exp_neg_a);
-      auto exp_a_plus_exp_neg_a_sq =
-          ir_builder_->CreateFMul(exp_a_plus_exp_neg_a, exp_a_plus_exp_neg_a);
-      auto exp_a_minus_exp_neg_a = ir_builder_->CreateFSub(exp_a, exp_neg_a);
-      auto exp_a_minus_exp_neg_a_sq =
-          ir_builder_->CreateFMul(exp_a_minus_exp_neg_a, exp_a_minus_exp_neg_a);
-      auto imag_num = ir_builder_->CreateFMul(
-          cos_b_sin_b, ir_builder_->CreateFSub(exp_a_plus_exp_neg_a_sq,
-                                               exp_a_minus_exp_neg_a_sq));
-      auto denom = ir_builder_->CreateFAdd(
-          ir_builder_->CreateFMul(cos_b_sq, exp_a_plus_exp_neg_a_sq),
-          ir_builder_->CreateFMul(sin_b_sq, exp_a_minus_exp_neg_a_sq));
-      return EmitComposeComplex(op, ir_builder_->CreateFDiv(real_num, denom),
-                                ir_builder_->CreateFDiv(imag_num, denom));
-    }
-    default:
-      return ElementalIrEmitter::EmitComplexUnaryOp(op, operand_value);
-  }
-}
-
 llvm::Value* GpuElementalIrEmitter::EmitDeviceFunctionCall(
     const string& callee_name,
     tensorflow::gtl::ArraySlice<llvm::Value*> operands,
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
index 6a537d015209bc507af36b13eeb5d69ce58d8fea..77d4569b1e8e398005e8f517ff086a77aedd382d 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
@@ -54,20 +54,31 @@ class GpuElementalIrEmitter : public ElementalIrEmitter {
   StatusOr<llvm::Value*> EmitFloatUnaryOp(
       const HloInstruction* op, llvm::Value* operand_value) const override;
 
-  StatusOr<llvm::Value*> EmitComplexUnaryOp(
-      const HloInstruction* op, llvm::Value* operand_value) const override;
-
   StatusOr<llvm::Value*> EmitFloatBinaryOp(
       const HloInstruction* op, llvm::Value* lhs_value,
       llvm::Value* rhs_value) const override;
 
-  StatusOr<llvm::Value*> EmitComplexBinaryOp(
-      const HloInstruction* op, llvm::Value* lhs_value,
-      llvm::Value* rhs_value) const override;
-
   StatusOr<llvm::Value*> EmitErfcInv(PrimitiveType prim_type,
                                      llvm::Value* value) const override;
 
+  StatusOr<llvm::Value*> EmitLog(PrimitiveType prim_type,
+                                 llvm::Value* value) const override;
+
+  StatusOr<llvm::Value*> EmitSin(PrimitiveType prim_type,
+                                 llvm::Value* value) const override;
+
+  StatusOr<llvm::Value*> EmitCos(PrimitiveType prim_type,
+                                 llvm::Value* value) const override;
+
+  StatusOr<llvm::Value*> EmitExp(PrimitiveType prim_type,
+                                 llvm::Value* value) const override;
+
+  StatusOr<llvm::Value*> EmitPow(PrimitiveType prim_type, llvm::Value* lhs,
+                                 llvm::Value* rhs) const override;
+
+  StatusOr<llvm::Value*> EmitAtan2(PrimitiveType prim_type, llvm::Value* lhs,
+                                   llvm::Value* rhs) const override;
+
   llvm::Value* EmitThreadId() const override;
 
  private:
diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc
new file mode 100644
index 0000000000000000000000000000000000000000..66931bdc8b1030b2b2e7731ce6327c1e908d4ee6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc
@@ -0,0 +1,234 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/fft_thunk.h"
+
+#include <string>
+
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace se = ::perftools::gputools;
+
+namespace xla {
+namespace gpu {
+
+FftScratchAllocator::FftScratchAllocator(
+    int device_ordinal, DeviceMemoryAllocator* memory_allocator)
+    : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}
+
+FftScratchAllocator::~FftScratchAllocator() {
+  for (auto& allocated_buffer : allocated_buffers_) {
+    if (!memory_allocator_->Deallocate(device_ordinal_, &allocated_buffer)
+             .ok()) {
+      // The program can still continue with failed deallocation.
+      LOG(ERROR) << "Failed to deallocate the allocated buffer: "
+                 << allocated_buffer.opaque();
+    }
+  }
+}
+
+int64 FftScratchAllocator::GetMemoryLimitInBytes(se::Stream* stream) {
+  constexpr int64 kFftScratchSize = 1LL << 32;  // 4GB by default.
+  return kFftScratchSize;
+}
+
+se::port::StatusOr<se::DeviceMemory<uint8>> FftScratchAllocator::AllocateBytes(
+    se::Stream* stream, int64 byte_size) {
+  CHECK_GE(byte_size, 0) << "byte_size must be positive.";
+  if (byte_size > GetMemoryLimitInBytes(stream)) {
+    return se::port::Status(
+        se::port::error::RESOURCE_EXHAUSTED,
+        tensorflow::strings::Printf(
+            "Allocating %lld bytes exceeds the memory limit of %lld bytes.",
+            byte_size, GetMemoryLimitInBytes(stream)));
+  }
+
+  auto status_or_memory =
+      memory_allocator_->Allocate(device_ordinal_, byte_size,
+                                  /*retry_on_failure=*/false);
+  if (!status_or_memory.ok()) {
+    return tensorflow::errors::ResourceExhausted(
+        "Failed to allocate %lld bytes on device %d.", byte_size,
+        device_ordinal_);
+  }
+  se::DeviceMemoryBase allocated_buffer = status_or_memory.ValueOrDie();
+  allocated_buffers_.push_back(allocated_buffer);
+  total_allocated_bytes_ += byte_size;
+  return se::DeviceMemory<uint8>(allocated_buffer);
+}
+
+namespace {
+
+se::fft::Type FftTypeToSeType(FftType type) {
+  switch (type) {
+    case FftType::FFT:
+      return se::fft::Type::kC2CForward;
+    case FftType::IFFT:
+      return se::fft::Type::kC2CInverse;
+    case FftType::IRFFT:
+      return se::fft::Type::kC2R;
+    case FftType::RFFT:
+      return se::fft::Type::kR2C;
+    default:
+      LOG(FATAL) << "unsupported fft type";
+  }
+}
+
+string FftTypeToString(se::fft::Type type) {
+  switch (type) {
+    case se::fft::Type::kC2CForward:
+      return "FFT";
+    case se::fft::Type::kC2CInverse:
+      return "IFFT";
+    case se::fft::Type::kC2R:
+      return "IRFFT";
+    case se::fft::Type::kR2C:
+      return "RFFT";
+    default:
+      LOG(FATAL) << "unknown fft type";
+  }
+}
+
+}  // namespace
+
+FftThunk::FftThunk(FftType fft_type,
+                   tensorflow::gtl::ArraySlice<int64> fft_length,
+                   const BufferAllocation::Slice& input_buffer,
+                   const BufferAllocation::Slice& output_buffer,
+                   const Shape& input_shape, const Shape& output_shape,
+                   const HloInstruction* hlo)
+    : Thunk(Kind::kFft, hlo),
+      fft_type_(FftTypeToSeType(fft_type)),
+      fft_length_(fft_length.begin(), fft_length.end()),
+      scale_factor_(1.0f),
+      input_buffer_(input_buffer),
+      output_buffer_(output_buffer),
+      input_shape_(input_shape),
+      output_shape_(output_shape) {}
+
+tensorflow::Status FftThunk::ExecuteOnStream(
+    const BufferAllocations& buffer_allocations, se::Stream* stream) {
+  VLOG(3) << "FFT type: " << FftTypeToString(fft_type_);
+  VLOG(3) << "Input shape: " << ShapeUtil::HumanStringWithLayout(input_shape_);
+  VLOG(3) << "Output shape: "
+          << ShapeUtil::HumanStringWithLayout(output_shape_);
+
+  FftScratchAllocator scratch_allocator(buffer_allocations.device_ordinal(),
+                                        buffer_allocations.memory_allocator());
+
+  if (fft_plan_ == nullptr) {
+    const int64 fft_rank = fft_length_.size();
+    CHECK_LE(fft_rank, 3);
+    int batch_size = 1;
+    for (int i = 0; i < input_shape_.dimensions_size() - fft_rank; ++i) {
+      batch_size *= input_shape_.dimensions(i);
+    }
+    uint64 fft_length[3];
+    uint64 input_embed[3];
+    const uint64 input_stride = 1;
+    uint64 input_distance = 1;
+    uint64 output_embed[3];
+    const uint64 output_stride = 1;
+    uint64 output_distance = 1;
+
+    for (int i = 0; i < fft_rank; ++i) {
+      auto dim_offset = input_shape_.dimensions_size() - fft_rank + i;
+      fft_length[i] = static_cast<uint64>(fft_length_[i]);
+      input_embed[i] = input_shape_.dimensions(dim_offset);
+      input_distance *= input_shape_.dimensions(dim_offset);
+      output_embed[i] = output_shape_.dimensions(dim_offset);
+      output_distance *= output_shape_.dimensions(dim_offset);
+    }
+
+    constexpr bool kInPlaceFft = false;
+    fft_plan_ =
+        stream->parent()->AsFft()->CreateBatchedPlanWithScratchAllocator(
+            stream, fft_rank, fft_length, input_embed, input_stride,
+            input_distance, output_embed, output_stride, output_distance,
+            fft_type_, kInPlaceFft, batch_size, &scratch_allocator);
+    scale_factor_ = 1.0f / output_distance;
+  } else {
+    stream->parent()->AsFft()->UpdatePlanWithScratchAllocator(
+        stream, fft_plan_.get(), &scratch_allocator);
+  }
+
+  bool launch_ok;
+  switch (fft_type_) {
+    case se::fft::Type::kC2CForward: {
+      se::DeviceMemory<complex64> input_data(
+          buffer_allocations.GetDeviceAddress(input_buffer_));
+      se::DeviceMemory<complex64> output_data(
+          buffer_allocations.GetDeviceAddress(output_buffer_));
+      launch_ok =
+          stream->ThenFft(fft_plan_.get(), input_data, &output_data).ok();
+      break;
+    }
+    case se::fft::Type::kC2CInverse: {
+      se::DeviceMemory<complex64> input_data(
+          buffer_allocations.GetDeviceAddress(input_buffer_));
+      se::DeviceMemory<complex64> output_data(
+          buffer_allocations.GetDeviceAddress(output_buffer_));
+      launch_ok =
+          stream->ThenFft(fft_plan_.get(), input_data, &output_data).ok();
+      if (launch_ok) {
+        launch_ok =
+            stream
+                ->ThenBlasScal(ShapeUtil::ElementsIn(output_shape_),
+                               complex64(scale_factor_), &output_data, 1)
+                .ok();
+      }
+      break;
+    }
+    case se::fft::Type::kR2C: {
+      se::DeviceMemory<float> input_data(
+          buffer_allocations.GetDeviceAddress(input_buffer_));
+      se::DeviceMemory<complex64> output_data(
+          buffer_allocations.GetDeviceAddress(output_buffer_));
+      launch_ok =
+          stream->ThenFft(fft_plan_.get(), input_data, &output_data).ok();
+      break;
+    }
+    case se::fft::Type::kC2R: {
+      se::DeviceMemory<complex64> input_data(
+          buffer_allocations.GetDeviceAddress(input_buffer_));
+      se::DeviceMemory<float> output_data(
+          buffer_allocations.GetDeviceAddress(output_buffer_));
+      launch_ok =
+          stream->ThenFft(fft_plan_.get(), input_data, &output_data).ok();
+      if (launch_ok) {
+        launch_ok = stream
+                        ->ThenBlasScal(ShapeUtil::ElementsIn(output_shape_),
+                                       scale_factor_, &output_data, 1)
+                        .ok();
+      }
+      break;
+    }
+    default:
+      LOG(FATAL) << "unsupported fft type";
+  }
+  if (launch_ok) {
+    return tensorflow::Status::OK();
+  }
+  return InternalError("Unable to launch fft for thunk %p with type %s", this,
+                       FftTypeToString(fft_type_).c_str());
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.h b/tensorflow/compiler/xla/service/gpu/fft_thunk.h
new file mode 100644
index 0000000000000000000000000000000000000000..52fb8c376d7acea0f15aaa865c23fa2382717338
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.h
@@ -0,0 +1,98 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FFT_THUNK_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FFT_THUNK_H_
+
+#include "tensorflow/compiler/xla/service/buffer_assignment.h"
+#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
+#include "tensorflow/compiler/xla/service/gpu/thunk.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/optional.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+namespace gpu {
+
+// A one-time scratch allocator for FFT. The scratch buffers allocated are
+// released on destruction.
+//
+// Not thread-safe in that AllocateBytes, destructor are not locked.
+class FftScratchAllocator : public perftools::gputools::ScratchAllocator {
+ public:
+  FftScratchAllocator(int device_ordinal,
+                      DeviceMemoryAllocator* memory_allocator);
+
+  ~FftScratchAllocator() override;
+
+  int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override;
+
+  int64 TotalAllocatedBytes() { return total_allocated_bytes_; }
+
+  perftools::gputools::port::StatusOr<perftools::gputools::DeviceMemory<uint8>>
+  AllocateBytes(perftools::gputools::Stream* stream, int64 byte_size) override;
+
+ private:
+  const int device_ordinal_;
+  DeviceMemoryAllocator* memory_allocator_;
+  std::vector<perftools::gputools::DeviceMemoryBase> allocated_buffers_;
+  int64 total_allocated_bytes_ = 0;
+};
+
+// This class stores everything that StreamExecutor needs to launch an FFT.
+// It is generated by IrEmitter.
+//
+// This is thread-compatible.
+class FftThunk : public Thunk {
+ public:
+  // Constructs a thunk for launching an FFT on a stream.
+  // Semantics of null hlo_instruction argument are as in Thunk.
+  FftThunk(FftType fft_type, tensorflow::gtl::ArraySlice<int64> fft_length,
+           const BufferAllocation::Slice& input_buffer,
+           const BufferAllocation::Slice& output_buffer,
+           const Shape& input_shape, const Shape& output_shape,
+           const HloInstruction* hlo);
+
+  FftThunk(const FftThunk&) = delete;             // Cannot share fft_plan_
+  FftThunk& operator=(const FftThunk&) = delete;  // Cannot share fft_plan_
+
+  // Does the FFT for the thunk on "stream".
+  tensorflow::Status ExecuteOnStream(
+      const BufferAllocations& buffer_allocations,
+      perftools::gputools::Stream* stream) override;
+
+ private:
+  const perftools::gputools::fft::Type fft_type_;
+  const std::vector<int64> fft_length_;
+
+  float scale_factor_;
+
+  std::unique_ptr<perftools::gputools::fft::Plan> fft_plan_;
+
+  const BufferAllocation::Slice input_buffer_;
+  const BufferAllocation::Slice output_buffer_;
+
+  const Shape input_shape_;
+  const Shape output_shape_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FFT_THUNK_H_
diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.h b/tensorflow/compiler/xla/service/gpu/for_thunk.h
index 525a2af941e77a27c0e01543e00e8a4c3e4b9f62..832494d17e9c4e1d9e92e18ef331df1cf3689024 100644
--- a/tensorflow/compiler/xla/service/gpu/for_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/for_thunk.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FOR_THUNK_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FOR_THUNK_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FOR_THUNK_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FOR_THUNK_H_
 
 #include <vector>
 
@@ -49,4 +49,4 @@ class ForThunk : public Thunk {
 }  // namespace gpu
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FOR_THUNK_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FOR_THUNK_H_
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.h b/tensorflow/compiler/xla/service/gpu/fusion_merger.h
index bd720f8584f6254c43a3e2a1a5399aa919eebbc0..4c523a66de977cd32423b25f0d165c4f4ba51c4a 100644
--- a/tensorflow/compiler/xla/service/gpu/fusion_merger.h
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FUSION_MERGER_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FUSION_MERGER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FUSION_MERGER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FUSION_MERGER_H_
 
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
@@ -44,4 +44,4 @@ class FusionMerger : public HloPassInterface {
 }  // namespace gpu
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FUSION_MERGER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_FUSION_MERGER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
index e784046450ed1cca088770c65c786e80adda869f..8e3aebbc12b5e6d746700956b9743bc94db50167 100644
--- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
@@ -264,9 +264,9 @@ tensorflow::Status GemmThunk::ExecuteOnStream(
 
   auto make_descriptor = [this](se::DeviceMemoryBase data, const Shape& shape,
                                 bool transpose) -> MatrixDescriptor {
-    bool is_row_major = shape.layout().minor_to_major(0) != 0;
-    bool layout_mismatch = shape.layout().minor_to_major(0) !=
-                           output_shape_.layout().minor_to_major(0);
+    bool is_row_major = LayoutUtil::Minor(shape.layout(), 0) != 0;
+    bool layout_mismatch = LayoutUtil::Minor(shape.layout(), 0) !=
+                           LayoutUtil::Minor(output_shape_.layout(), 0);
     return MatrixDescriptor(data, transpose ^ layout_mismatch,
                             shape.dimensions(is_row_major),
                             shape.dimensions(!is_row_major));
@@ -320,7 +320,7 @@ tensorflow::Status GemmThunk::ExecuteOnStream(
   };
 
   bool launch_ok;
-  if (output_shape_.layout().minor_to_major(0) == 0) {
+  if (LayoutUtil::Minor(output_shape_.layout(), 0) == 0) {
     launch_ok = launch(
         lhs_descriptor, rhs_descriptor,
         MatrixDescriptor(output_data, false, output_num_rows, output_num_cols),
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
index 983cb872924f22be0dfad8aa9ad86f233b909c46..8c6a1f51a8a09ef78950dfe7e89994a3fe247f49 100644
--- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
@@ -52,6 +52,15 @@ class GemmThunk : public Thunk {
       const BufferAllocations& buffer_allocations,
       perftools::gputools::Stream* stream) override;
 
+  // Returns true if we'll perform autotuning if run on the given stream.  If
+  // so, we want the GPU to be quiescent during autotuning, so as not to
+  // introduce noise in our results.
+  bool ShouldHaltAllActivityBeforeRunning(
+      perftools::gputools::Stream* stream) override {
+    return autotune_results_.count(
+               stream->parent()->GetDeviceDescription().name()) != 0;
+  }
+
  private:
   const BufferAllocation::Slice lhs_buffer_;
   const BufferAllocation::Slice rhs_buffer_;
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index fcd73fd37a2d9ae3c24b56970e3e992da5944682..28ebd034ee0c89137f4e6eb417d8a37f4a00af7a 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -18,30 +18,37 @@ limitations under the License.
 #include <stdlib.h>
 #include <atomic>
 #include <functional>
+#include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
 #include <utility>
 
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
 #include "tensorflow/compiler/xla/protobuf_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
-#include "tensorflow/compiler/xla/service/batchnorm_rewriter.h"
+#include "tensorflow/compiler/xla/service/batchnorm_expander.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
+#include "tensorflow/compiler/xla/service/dot_decomposer.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
-#include "tensorflow/compiler/xla/service/gpu/convolution_folding.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h"
 #include "tensorflow/compiler/xla/service/gpu/fusion_merger.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/hlo_schedule.h"
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
-#include "tensorflow/compiler/xla/service/gpu/ir_emitter.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h"
-#include "tensorflow/compiler/xla/service/gpu/layout_assignment.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h"
 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
 #include "tensorflow/compiler/xla/service/gpu/pad_insertion.h"
 #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
@@ -52,6 +59,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
 #include "tensorflow/compiler/xla/service/hlo_cse.h"
 #include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/service/hlo_element_type_converter.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
@@ -64,6 +72,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
 #include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/service/while_loop_simplifier.h"
+#include "tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -74,9 +83,11 @@ limitations under the License.
 #include "tensorflow/core/platform/cuda_libdevice_path.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/regexp.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/subprocess.h"
 #include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
 
 namespace se = ::perftools::gputools;
 
@@ -90,14 +101,6 @@ namespace gpu {
 namespace {
 
 using tensorflow::port::Tracing;
-using tensorflow::strings::StrCat;
-
-// Any address of a variable residing in global memory or returned by one of the
-// memory allocation routines from the driver or runtime API is always aligned
-// to at least 256 bytes.
-//
-// http://docs.nvidia.com/cuda/cuda-c-programming-guide/#device-memory-accesses
-constexpr int64 kMemoryAlignment = 256;
 
 // Returns the directory containing nvvm libdevice files.  config_cuda_data_dir
 // should be equal to config().debug_options().xla_gpu_cuda_data_dir() of the
@@ -125,31 +128,46 @@ string GetLibdeviceDir(const string& config_cuda_data_dir) {
 }
 
 // Runs optimization passes on the given HLO module.
-tensorflow::Status OptimizeHloModule(
-    HloModule* hlo_module,
-    const HloCostAnalysis::ShapeSizeFunction& shape_size_function) {
+tensorflow::Status OptimizeHloModule(HloModule* hlo_module,
+                                     se::StreamExecutor* stream_exec,
+                                     DeviceMemoryAllocator* device_allocator) {
   {
     HloPassPipeline pipeline("optimization");
-    pipeline.AddInvariantChecker<HloVerifier>(shape_size_function);
+    pipeline.AddInvariantChecker<HloVerifier>();
+    pipeline.AddPass<GpuHloSupportChecker>();
     ReducePrecisionInsertion::AddPasses(
         &pipeline, hlo_module->config().debug_options(),
         ReducePrecisionInsertion::PassTiming::BEFORE_OPTIMIZATION);
 
     // TODO(b/64094172): make Call work on GPU instead of inlining.
     pipeline.AddPass<CallInliner>();
+    // Convert BF16 operations to F32 operations so that the GPU backend can
+    // support BF16 operations without directly implementing a BF16 lowering for
+    // most ops.
+    pipeline.AddPass<HloElementTypeConverter>(BF16, F32);
+    pipeline.AddPass<DotDecomposer>();
 
     {
       auto& pass =
           pipeline.AddPass<HloPassFix<HloPassPipeline>>("simplification");
-      pass.AddInvariantChecker<HloVerifier>(shape_size_function);
+      pass.AddInvariantChecker<HloVerifier>();
 
-      // TODO(b/62764704): Do not rewrite on GPU, use cuDNN's BatchNorm APIs
-      // instead.
-      pass.AddPass<BatchNormRewriter>(
+      // If cudnn batchnorms are enabled, rewrite batchnorm HLOs to cudnn calls
+      // where possible.  Not every batchnorm op can be implemented as a call to
+      // cudnn, so decompose any remaining batchnorm ops into a soup of HLOs.
+      if (hlo_module->config().debug_options().xla_gpu_use_cudnn_batchnorm()) {
+        pass.AddPass<CudnnBatchNormRewriter>();
+      }
+      pass.AddPass<BatchNormExpander>(
           /*rewrite_training_op=*/true,
           /*rewrite_inference_op=*/true,
           /*rewrite_grad_op=*/true,
           /*use_fusion=*/false);
+
+      // BatchNormExpander can create zero-sized ops, so zero-sized HLO
+      // elimination has to come after that pass.
+      pipeline.AddPass<ZeroSizedHloElimination>();
+
       pass.AddPass<AlgebraicSimplifier>(
           /*is_layout_sensitive=*/false,
           [](const Shape&, const Shape&) { return false; });
@@ -159,7 +177,7 @@ tensorflow::Status OptimizeHloModule(
       pass.AddPass<ReshapeMover>();
       pass.AddPass<HloConstantFolding>();
     }
-    pipeline.AddPass<ConvolutionFolding>();
+
     pipeline.AddPass<TransposeFolding>(
         [](const HloInstruction& dot,
            const TransposeFolding::OperandIndices& candidate_operands) {
@@ -171,16 +189,68 @@ tensorflow::Status OptimizeHloModule(
     pipeline.AddPass<HloDCE>();
     TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
   }
+
+  {
+    // Convert convolutions into CustomCalls to cudnn, then canonicalize them
+    // (PadInsertion).
+    HloPassPipeline pipeline("conv_canonicalization");
+    pipeline.AddInvariantChecker<HloVerifier>();
+    pipeline.AddPass<CudnnConvolutionRewriter>();
+    pipeline.AddPass<PadInsertion>();
+
+    // Choose the fastest algorithm for each conv.
+    //
+    // In theory doing this here is way too early: It needs to happen after
+    // layout assignment, because the layout of the inputs/outputs affects the
+    // speed of the conv.  But currently we only allow only one input/output
+    // layout when calling cudnn, so there's no ambiguity.
+    //
+    // We pick the algorithm at this early stage so we can generate better HLO.
+    // After CudnnConvolutionRewriter, our convolutions are CustomCalls which
+    // return a tuple (conv_result, scratch_memory), and the each conv uses 0
+    // bytes of scratch:
+    //
+    //   customcall = (f32[...], f32[0])
+    //   return gte(customcall, 0)
+    //
+    // The algorithm picker then chooses the best algorithm, and potentially
+    // increases the scratch space.  It replaces customcall with new_tuple,
+    // giving us the following:
+    //
+    //   new_customcall = (f32[...], f32[N])
+    //   new_tuple = tuple(gte(new_customcall, 0), constant f32[0])
+    //   return gte(new_tuple, 0)
+    //
+    // The new tuple and gte instructions then be simplified away, because
+    // nobody is expected to use the scratch value.
+    //
+    // However, if we were to run CudnnConvolutionAlgorithmPicker after layout
+    // assignment, fusion would already have run, and the gte(customcall, 0)
+    // would probably already be into a fusion node.  We can't simplify across
+    // HloComputation boundaries, so in this case we wouldn't be able to
+    // simplify away the new_tuple bits.
+    //
+    // We'll need to revisit this if we ever allow multiple layouts for the
+    // inputs/outputs of a cudnn convolution.
+    pipeline.AddPass<CudnnConvolutionAlgorithmPicker>(stream_exec,
+                                                      device_allocator);
+    // Clean up new_tuple described above.
+    pipeline.AddPass<TupleSimplifier>();
+    pipeline.AddPass<HloDCE>();
+
+    TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
+  }
+
   {
     HloPassFix<HloPassPipeline> fusion("fusion");
-    fusion.AddInvariantChecker<HloVerifier>(shape_size_function);
+    fusion.AddInvariantChecker<HloVerifier>();
     fusion.AddPass<GpuInstructionFusion>(/*may_duplicate=*/false);
     fusion.AddPass<GpuInstructionFusion>(/*may_duplicate=*/true);
     fusion.AddPass<FusionMerger>();
     TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status());
 
     HloPassPipeline reduce_pipeline("reduce-precision");
-    reduce_pipeline.AddInvariantChecker<HloVerifier>(shape_size_function);
+    reduce_pipeline.AddInvariantChecker<HloVerifier>();
     ReducePrecisionInsertion::AddPasses(
         &reduce_pipeline, hlo_module->config().debug_options(),
         ReducePrecisionInsertion::PassTiming::AFTER_FUSION);
@@ -198,19 +268,18 @@ tensorflow::Status OptimizeHloModule(
 
 // Modifies the given HLO module so that it will be accepted by IrEmitter.
 // Unlike optimization passes, the passes are necessary for correctness.
-tensorflow::Status PrepareHloModuleForIrEmitting(
-    HloModule* hlo_module,
-    const HloCostAnalysis::ShapeSizeFunction& shape_size_function) {
+tensorflow::Status PrepareHloModuleForIrEmitting(HloModule* hlo_module) {
   // In some cases, we have to place the result of an instruction in a temporary
   // buffer. For instance, the buffer that holds an external parameter is
   // assumed immutable at this point, and should not be reused for output
   // (b/27180329). Therefore, in that case, we set the output to be a copy of
   // the parameter.
   HloPassPipeline pipeline("GPU-ir-emit-prepare");
-  pipeline.AddInvariantChecker<HloVerifier>(shape_size_function);
-  pipeline.AddPass<PadInsertion>();
+  pipeline.AddInvariantChecker<HloVerifier>();
+
   pipeline.AddPass<GpuLayoutAssignment>(
       hlo_module->mutable_entry_computation_layout());
+
   // The LayoutAssignment pass may leave behind kCopy instructions which are
   // duplicate or NOPs, so remove them with algebraic simplification and CSE.
   pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(
@@ -229,6 +298,103 @@ tensorflow::Status PrepareHloModuleForIrEmitting(
   return pipeline.Run(hlo_module).status();
 }
 
+// Prints a warning if the ptxas at ptxas_path has known bugs.
+//
+// Only prints a warning the first time it's called for a particular value of
+// ptxas_path.
+void WarnIfBadPtxasVersion(const string& ptxas_path) {
+  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
+  static std::unordered_set<string>* seen_ptxas_paths GUARDED_BY(mu) =
+      new std::unordered_set<string>();
+
+  tensorflow::mutex_lock lock(mu);
+  if (!seen_ptxas_paths->insert(ptxas_path).second) {
+    // Already checked this ptx binary, nothing to do.
+    return;
+  }
+
+  tensorflow::SubProcess ptxas;
+  ptxas.SetProgram(ptxas_path, {ptxas_path, "--version"});
+  ptxas.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_PIPE);
+  if (!ptxas.Start()) {
+    LOG(WARNING) << "Couldn't invoke " << ptxas_path << " --version";
+    return;
+  }
+
+  string out;
+  int exit_code = ptxas.Communicate(/*stdin_input=*/nullptr, &out,
+                                    /*stderr_output=*/nullptr);
+  if (exit_code != 0) {
+    LOG(WARNING) << "Running " << ptxas_path << " --version returned "
+                 << exit_code;
+    return;
+  }
+
+  int64 vmaj, vmin, vdot;
+  string vmaj_str, vmin_str, vdot_str;
+  if (!RE2::PartialMatch(out, R"(\bV(\d+)\.(\d+)\.(\d+)\b)", &vmaj_str,
+                         &vmin_str, &vdot_str) ||
+      !tensorflow::strings::safe_strto64(vmaj_str, &vmaj) ||
+      !tensorflow::strings::safe_strto64(vmin_str, &vmin) ||
+      !tensorflow::strings::safe_strto64(vdot_str, &vdot)) {
+    LOG(WARNING) << "Couldn't parse ptxas version in output of " << ptxas_path
+                 << " --version:\n"
+                 << out;
+    return;
+  }
+
+  // ptxas 9.0 before 9.0.276 and ptxas 9.1 before 9.1.121 miscompile some
+  // address calculations with large offsets (e.g. "load ptr + large_constant"),
+  // b/70245379.
+  if ((vmaj == 9 && vmin == 0 && vdot < 276) ||
+      (vmaj == 9 && vmin == 1 && vdot < 121)) {
+    LOG(WARNING) << "*** WARNING *** You are using ptxas " << vmaj << "."
+                 << vmin << "." << vdot
+                 << ", which is in range [9.0.0, 9.0.276) + [9.1.0, 9.1.121). "
+                    "These versions are known to miscompile XLA code, leading "
+                    "to incorrect results or invalid-address errors.";
+  }
+}
+
+// Prints a warning if the ptx->sass JIT in the driver has known bugs.
+//
+// Using such a driver only a problem if we fail to use ptxas to compile our ptx
+// and have to use the driver instead, so you should only call this function if
+// we're going to use the driver JIT.
+//
+// Only prints a warning the first time it's called.
+void WarnIfBadDriverJITVersion() {
+  static std::once_flag run_once;
+  std::call_once(run_once, [] {
+    auto version_or_status = se::cuda::Diagnostician::FindKernelDriverVersion();
+    if (!version_or_status.ok()) {
+      LOG(WARNING) << "Couldn't read CUDA driver version.";
+      return;
+    }
+    se::cuda::DriverVersion version = version_or_status.ValueOrDie();
+
+    // The following versions of the driver JIT miscompile some address
+    // calculations with large offsets (e.g. "load ptr + large_constant"),
+    // b/70245379:
+    //
+    //  - 384.x before 384.108
+    //  - 387.x before 387.40
+    //  - 390.x before 390.10.
+    auto vmaj = std::get<0>(version);
+    auto vmin = std::get<1>(version);
+    if ((vmaj == 384 && vmin < 108) ||  //
+        (vmaj == 387 && vmin < 40) ||   //
+        (vmaj == 390 && vmin < 10)) {
+      LOG(WARNING)
+          << "*** WARNING *** Invoking the PTX->SASS JIT from driver version "
+          << se::cuda::DriverVersionToString(version)
+          << ", which is in range [384.0.0, 384.108.0) + [387.0.0, 387.40.0) + "
+             "[390.0.0, 390.10.0). These versions are known to miscompile XLA "
+             "code, leading to incorrect results or invalid-address errors.";
+    }
+  });
+}
+
 // Compiles the given PTX string using ptxas and returns the resulting machine
 // code (i.e. a cubin) as a byte array.
 StatusOr<std::vector<uint8>> CompilePtx(const string& ptx, int cc_major,
@@ -240,6 +406,8 @@ StatusOr<std::vector<uint8>> CompilePtx(const string& ptx, int cc_major,
   auto env = tensorflow::Env::Default();
   TF_RETURN_IF_ERROR(env->FileExists(ptxas_path));
 
+  WarnIfBadPtxasVersion(ptxas_path);
+
   // Write ptx into a temporary file.
   string ptx_path;
   if (!env->LocalTempFilename(&ptx_path)) {
@@ -263,8 +431,9 @@ StatusOr<std::vector<uint8>> CompilePtx(const string& ptx, int cc_major,
     tensorflow::Env::Default()->DeleteFile(cubin_path).IgnoreError();
   });
   tensorflow::SubProcess ptxas_info_dumper;
-  std::vector<string> ptxas_args = {ptxas_path, ptx_path, "-o", cubin_path,
-                                    StrCat("-arch=sm_", cc_major, cc_minor)};
+  std::vector<string> ptxas_args = {
+      ptxas_path, ptx_path, "-o", cubin_path,
+      tensorflow::strings::StrCat("-arch=sm_", cc_major, cc_minor)};
   if (VLOG_IS_ON(2)) {
     ptxas_args.push_back("-v");
   }
@@ -294,25 +463,28 @@ StatusOr<std::vector<uint8>> CompilePtx(const string& ptx, int cc_major,
 }  // namespace
 
 GpuCompiler::GpuCompiler()
-    : pointer_size_(llvm::DataLayout(kDataLayout).getPointerSize()) {}
+    : pointer_size_(llvm::DataLayout(kDataLayout)
+                        .getPointerSize(0 /* default address space */)) {}
 
 StatusOr<std::unique_ptr<HloModule>> GpuCompiler::RunHloPasses(
-    std::unique_ptr<HloModule> module, se::StreamExecutor* /*stream_exec*/) {
+    std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
+    DeviceMemoryAllocator* device_allocator) {
   XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunHloPasses");
   Tracing::TraceMe annotation("HLO Transforms", module->name(),
                               /*is_expensive=*/true);
-  TF_RETURN_IF_ERROR(OptimizeHloModule(module.get(), ShapeSizeBytesFunction()));
+  TF_RETURN_IF_ERROR(
+      OptimizeHloModule(module.get(), stream_exec, device_allocator));
   return std::move(module);
 }
 
 StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
-    std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec) {
+    std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
+    DeviceMemoryAllocator* device_allocator) {
   XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend");
 
   TF_RET_CHECK(stream_exec != nullptr);
 
-  TF_RETURN_IF_ERROR(
-      PrepareHloModuleForIrEmitting(module.get(), ShapeSizeBytesFunction()));
+  TF_RETURN_IF_ERROR(PrepareHloModuleForIrEmitting(module.get()));
 
   llvm::LLVMContext llvm_context;
   std::string buffer;
@@ -343,19 +515,21 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<BufferAssignment> buffer_assignment,
       BufferAssigner::Run(module.get(), hlo_schedule->ConsumeHloOrdering(),
-                          BufferSizeBytesFunction(), [](LogicalBuffer::Color) {
-                            return kMemoryAlignment;
+                          BufferSizeBytesFunction(),
+                          /*color_alignment=*/[](LogicalBuffer::Color) {
+                            return kCudaMallocAlignBytes;
                           }));
-  // BufferAssignment::ToString() includes a header, so no need for us to
-  // print one ourselves.
+  // BufferAssignment::Stats::ToString() and BufferAssignment::ToString()
+  // include headers, so no need for us to print them ourselves.
+  XLA_VLOG_LINES(1, buffer_assignment->GetStats().ToString());
   XLA_VLOG_LINES(2, buffer_assignment->ToString());
   XLA_VLOG_LINES(2, module->ToString());
-  const string xla_dump_hlo_proto_to =
-      module->config().debug_options().xla_dump_hlo_proto_to();
-  if (!xla_dump_hlo_proto_to.empty()) {
+  const string xla_dump_optimized_hlo_proto_to =
+      module->config().debug_options().xla_dump_optimized_hlo_proto_to();
+  if (!xla_dump_optimized_hlo_proto_to.empty()) {
     HloProto proto = MakeHloProto(*module, *buffer_assignment);
     TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
-        proto, xla_dump_hlo_proto_to, module->name()));
+        proto, xla_dump_optimized_hlo_proto_to, module->name()));
   }
 
   IrEmitterContext ir_emitter_context(module.get(), buffer_assignment.get(),
@@ -393,6 +567,20 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
         /*optimized=*/false));
   }
 
+  {
+    XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend - Running LLVM verifier");
+
+    std::string err;
+    llvm::raw_string_ostream err_stream(err);
+
+    // verifyModule() returns true if the module is broken.
+    TF_RET_CHECK(!llvm::verifyModule(llvm_module, &err_stream))
+        << "Invalid LLVM IR before optimizations:\n"
+        << err_stream.str()
+        << "\nThis probably indicates a bug in the HLO -> LLVM IR lowering. "
+           "Rerun with --xla_dump_ir_to to get the IR. ";
+  }
+
   string libdevice_dir;
   {
     tensorflow::mutex_lock lock(mutex_);
@@ -443,7 +631,7 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
   // Write PTX to IR dump directory, if IR dumping was requested.
   if (!ir_dump_directory.empty()) {
     const string ptx_outfile = tensorflow::io::JoinPath(
-        ir_dump_directory, StrCat(module->name(), ".ptx"));
+        ir_dump_directory, tensorflow::strings::StrCat(module->name(), ".ptx"));
     auto status = [&] {
       auto* env = tensorflow::Env::Default();
       TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(ir_dump_directory));
@@ -466,13 +654,14 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
   XLA_VLOG_LINES(2, thunk_schedule->ToString());
 
   std::unique_ptr<HloProfileIndexMap> profile_index_map;
-  std::unique_ptr<HloProfilePrinter> profile_printer;
+  std::unique_ptr<HloProfilePrinterData> profile_printer;
 
   if (module->config().hlo_profiling_enabled()) {
     HloCostAnalysis cost_analysis(ShapeSizeBytesFunction());
+    TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis));
     profile_index_map = MakeUnique<HloProfileIndexMap>(*module);
     profile_printer =
-        CreateHloProfilePrinter(*profile_index_map, cost_analysis);
+        CreateHloProfilePrinterData(*profile_index_map, cost_analysis);
   }
 
   auto* gpu_executable = new GpuExecutable(
@@ -541,6 +730,10 @@ std::vector<uint8> GpuCompiler::CompilePtxOrGetCachedResult(const string& ptx,
                    "GPU driver compile the ptx. "
                 << maybe_cubin.status();
           }
+
+          // We're going to use the driver to JIT our PTX->SASS, so warn if
+          // the JIT in the driver has known bugs.
+          WarnIfBadDriverJITVersion();
         }
       }
       cache_value->compilation_done = true;
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
index 18e34340205b6f51497e26c45520799d21c55a46..c352d4d8462fadb266c55ad437de998e86a6528e 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
@@ -51,11 +51,13 @@ class GpuCompiler : public LLVMCompiler {
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec) override;
+      perftools::gputools::StreamExecutor* stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module,
-      perftools::gputools::StreamExecutor* stream_exec) override;
+      perftools::gputools::StreamExecutor* stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> module,
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_constants.cc b/tensorflow/compiler/xla/service/gpu/gpu_constants.cc
new file mode 100644
index 0000000000000000000000000000000000000000..aa360c7f73de2f0f9cf59c22b552b8e60ddb3a87
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/gpu_constants.cc
@@ -0,0 +1,25 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
+
+namespace xla {
+namespace gpu {
+
+// http://docs.nvidia.com/cuda/cuda-c-programming-guide/#device-memory-accesses
+const int64 kCudaMallocAlignBytes = 256;
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_constants.h b/tensorflow/compiler/xla/service/gpu/gpu_constants.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb1ca4c6c95a23d2a08f5f9c3cbc85e7d47d4f89
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/gpu_constants.h
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_CONSTANTS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_CONSTANTS_H_
+
+#include "tensorflow/compiler/xla/types.h"
+
+namespace xla {
+namespace gpu {
+
+// Minimum alignment of cudaMalloc.  We require that buffers created by our
+// DeviceMemoryAllocator, and all input/output buffers, have this alignment.
+extern const int64 kCudaMallocAlignBytes;
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_CONSTANTS_H_
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
index 33d739b79d3664fec3586bbc924b7fa2e10d3256..916b556fd43a453a4da2c96217e74c367f8c7653 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
@@ -36,7 +36,7 @@ namespace gpu {
 
 StatusOr<HloInstruction*> GpuCopyInsertion::FindOrInsertCopy(
     HloInstruction* hlo) {
-  HloInstruction*& copy = inserted_copies_[hlo];
+  HloInstruction*& copy = hlo_to_copy_map_[hlo];
   if (copy == nullptr) {
     TF_ASSIGN_OR_RETURN(copy, hlo->parent()->DeepCopyInstruction(hlo));
   }
@@ -55,45 +55,71 @@ StatusOr<bool> GpuCopyInsertion::Run(HloModule* module) {
   // in IR.
   for (HloInstruction* hlo :
        module->entry_computation()->MakeInstructionPostOrder()) {
-    if (ImplementedAsLibraryCall(*hlo)) {
+    // Inserts a copy of hlo->operand(n) if it's a constant.
+    auto copy_operand_if_constant = [&](int64 n) -> Status {
+      HloInstruction* operand = hlo->mutable_operand(n);
+      TF_RET_CHECK(ShapeUtil::IsArray(operand->shape()));
+      const auto& values = dataflow->GetValueSet(operand).values();
+      if (std::any_of(values.begin(), values.end(), [](const HloValue* value) {
+            return value->defining_instruction()->opcode() ==
+                   HloOpcode::kConstant;
+          })) {
+        TF_ASSIGN_OR_RETURN(HloInstruction * copy, FindOrInsertCopy(operand));
+        TF_RETURN_IF_ERROR(hlo->ReplaceOperandWith(n, copy));
+        changed = true;
+      }
+      return Status::OK();
+    };
+
+    if (IsCustomCallToDnnBatchNorm(*hlo)) {
+      // The epsilon and feature_index operands to a CUDNN batchnorm op don't
+      // need to be materialized in memory -- in fact, they must be constants.
+      // These are the last two operands of all three batchnorm ops.
+      for (int64 i = 0; i < hlo->operand_count() - 2; ++i) {
+        TF_RETURN_IF_ERROR(copy_operand_if_constant(i));
+      }
+    } else if (IsCustomCallToDnnConvolution(*hlo)) {
+      // The last two arguments to a CUDNN convolution are two HLO constants for
+      // cudnn algorithm and tensor_ops_enabled flag, which shouldn't be copied.
+      for (int64 i = 0; i < hlo->operand_count() - 2; ++i) {
+        TF_RETURN_IF_ERROR(copy_operand_if_constant(i));
+      }
+    } else if (ImplementedAsLibraryCall(*hlo)) {
+      // For all other library calls, materialize all the operands into memory.
       for (int64 i = 0; i < hlo->operand_count(); ++i) {
-        HloInstruction* operand = hlo->mutable_operand(i);
-        TF_RET_CHECK(ShapeUtil::IsArray(operand->shape()));
-        const auto& values = dataflow->GetValueSet(operand).values();
-        if (std::any_of(values.begin(), values.end(),
-                        [](const HloValue* value) {
-                          return value->defining_instruction()->opcode() ==
-                                 HloOpcode::kConstant;
-                        })) {
-          TF_ASSIGN_OR_RETURN(HloInstruction * copy, FindOrInsertCopy(operand));
-          TF_RETURN_IF_ERROR(hlo->ReplaceOperandWith(i, copy));
-          changed = true;
-        }
+        TF_RETURN_IF_ERROR(copy_operand_if_constant(i));
       }
     }
   }
 
-  // Init values of a while node cannot be constants. Insert copies for any
-  // constants found at the operand of a while.
-  tensorflow::gtl::FlatSet<HloInstruction*> copied_constants;
+  // Init values of while and conditional nodes cannot be constants. Insert
+  // copies for any constants found at the operands of these nodes.
+  tensorflow::gtl::FlatSet<HloInstruction*> inserted_copies;
   for (HloComputation* computation : module->computations()) {
     for (HloInstruction* instruction : computation->instructions()) {
-      if (instruction->opcode() != HloOpcode::kWhile) {
+      if (instruction->opcode() != HloOpcode::kWhile &&
+          instruction->opcode() != HloOpcode::kConditional) {
         continue;
       }
-      for (auto& pair :
-               dataflow->GetInstructionValueSet(instruction->operand(0))) {
-        const HloValueSet& value_set = pair.second;
-        for (const HloValue* value : value_set.values()) {
-          if (value->defining_instruction()->opcode() ==
-              HloOpcode::kConstant &&
-              !ContainsKey(copied_constants, value->defining_instruction())) {
-            HloInstruction* constant = value->defining_instruction();
-            TF_ASSIGN_OR_RETURN(HloInstruction * copy,
-                                FindOrInsertCopy(constant));
-            TF_RETURN_IF_ERROR(constant->ReplaceAllUsesWith(copy));
-            copied_constants.insert(constant);
-            changed = true;
+      for (auto operand : instruction->operands()) {
+        // Skip the operands that have already been replaced with a copy in a
+        // previous iteration (which is possible when a constant is used as an
+        // operand in multiple places).
+        if (ContainsKey(inserted_copies, operand)) {
+          continue;
+        }
+        for (auto& pair : dataflow->GetInstructionValueSet(operand)) {
+          const HloValueSet& value_set = pair.second;
+          for (const HloValue* value : value_set.values()) {
+            if (value->defining_instruction()->IsConstant() &&
+                !ContainsKey(hlo_to_copy_map_, value->defining_instruction())) {
+              HloInstruction* constant = value->defining_instruction();
+              TF_ASSIGN_OR_RETURN(HloInstruction * copy,
+                                  FindOrInsertCopy(constant));
+              TF_RETURN_IF_ERROR(constant->ReplaceAllUsesWith(copy));
+              inserted_copies.insert(copy);
+              changed = true;
+            }
           }
         }
       }
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h
index 4d77f337e6eb20f7d79acc0829fde26bbe443f25..0c6f9b511f3aac5f62182273b827adcd068cd633 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h
@@ -32,13 +32,13 @@ class GpuCopyInsertion : public HloPassInterface {
   StatusOr<bool> Run(HloModule* module) override;
 
  protected:
-  // Returns a copy of `hlo`. Looks in inserted_copies_ first to avoid making
+  // Returns a copy of `hlo`. Looks in hlo_to_copy_map_ first to avoid making
   // duplicate copies.
   StatusOr<HloInstruction*> FindOrInsertCopy(HloInstruction* hlo);
 
   // A map containing all copies inserted to materialize operands of library
   // calls. The key is the copied instruction and the value is the copy.
-  tensorflow::gtl::FlatMap<HloInstruction*, HloInstruction*> inserted_copies_;
+  tensorflow::gtl::FlatMap<HloInstruction*, HloInstruction*> hlo_to_copy_map_;
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 0fd85e4fb057f144df93d53485570d67c66af0d4..623d6714de501000e38b7698620925f66425f157 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -66,10 +66,12 @@ class HloExecutionProfiler {
 
   // If profiling is enabled, sets the total cycle count on the profile from the
   // execution timer.
-  ~HloExecutionProfiler() {
+  void FinishExecution() {
+    CHECK(!finished_execution_) << "Call FinishExecution only once!";
+    finished_execution_ = true;
     if (do_profile_) {
       stream_->ThenStopTimer(execution_timer_.get());
-      stream_->BlockHostUntilDone();
+      stream_->BlockHostUntilDone().IgnoreError();
       profile_->set_total_cycles_executed(
           *computation_, execution_timer_->Nanoseconds() * clock_rate_ghz_);
     }
@@ -87,7 +89,7 @@ class HloExecutionProfiler {
   void FinishOperation(const HloInstruction* hlo_instruction) {
     if (do_profile_) {
       stream_->ThenStopTimer(per_op_timer_.get());
-      stream_->BlockHostUntilDone();
+      stream_->BlockHostUntilDone().IgnoreError();
       profile_->SetCyclesTakenBy(
           hlo_instruction, per_op_timer_->Nanoseconds() * clock_rate_ghz_);
     }
@@ -101,6 +103,7 @@ class HloExecutionProfiler {
   const HloComputation* computation_;
   std::unique_ptr<se::Timer> execution_timer_;
   std::unique_ptr<se::Timer> per_op_timer_;
+  bool finished_execution_ = false;
 };
 
 }  // namespace
@@ -113,9 +116,9 @@ GpuExecutable::GpuExecutable(
     std::unique_ptr<const ThunkSchedule> thunk_schedule,
     std::unique_ptr<const HloModule> hlo_module,
     std::unique_ptr<const BufferAssignment> assignment,
-    std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
     std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
-    : Executable(std::move(hlo_module), std::move(hlo_profile_printer),
+    : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
                  std::move(hlo_profile_index_map)),
       ptx_(ptx),
       cubin_(cubin),
@@ -143,9 +146,12 @@ Status GpuExecutable::ExecuteThunks(
   if (do_profile) {
     LOG(WARNING) << "PROFILING: profiling is enabled";
   }
+
   HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
                                 hlo_module_->entry_computation());
 
+  uint64 start_micros = tensorflow::Env::Default()->NowMicros();
+
   // Stream 0 indicates `main_stream` and substreams start from stream 1.
   std::vector<Pool<se::Stream>::SmartPtr> sub_streams;
   while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) {
@@ -155,6 +161,9 @@ Status GpuExecutable::ExecuteThunks(
         run_options->BorrowStream(main_stream->parent()->device_ordinal()));
   }
 
+  // The next event enqueued on stream N must not run until the thunk at
+  // last_blocking_thunk_for_stream[N] completes.
+  std::map<int32, const Thunk*> last_blocking_thunk_for_stream;
   std::map<const Thunk*, std::unique_ptr<se::Event>> thunk_to_finish_event;
   for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
     TF_RETURN_IF_ERROR(thunk->Initialize(*this));
@@ -167,15 +176,41 @@ Status GpuExecutable::ExecuteThunks(
       stream->ThenWaitFor(FindOrDie(thunk_to_finish_event, dependency).get());
     }
 
+    if (last_blocking_thunk_for_stream.count(stream_no)) {
+      stream->ThenWaitFor(FindOrDie(thunk_to_finish_event,
+                                    last_blocking_thunk_for_stream[stream_no])
+                              .get());
+      last_blocking_thunk_for_stream.erase(stream_no);
+    }
+
+    // If this thunk requests it, wait for all currently-executing thunks to
+    // finish.  This is useful e.g. if the thunk is about to perform autotuning.
+    if (thunk->ShouldHaltAllActivityBeforeRunning(stream)) {
+      TF_RETURN_IF_ERROR(main_stream->BlockHostUntilDone());
+      last_blocking_thunk_for_stream.clear();
+    }
+
     profiler.StartOperation();
     VLOG(2) << "Executing the thunk for "
-            << thunk->hlo_instruction()->ToString();
+            << thunk->hlo_instruction()->ToString() << " on stream "
+            << stream_no;
     TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream));
-    if (thunk_schedule_->Depended(thunk)) {
+    if (thunk_schedule_->Depended(thunk) || thunk->ShouldBlockFutureThunks()) {
       auto finish_event = MakeUnique<se::Event>(main_stream->parent());
       finish_event->Init();
       stream->ThenRecordEvent(finish_event.get());
       thunk_to_finish_event[thunk] = std::move(finish_event);
+
+      if (thunk->ShouldBlockFutureThunks()) {
+        // Set last_blocking_thunk_for_stream on all streams other than this one
+        // so that all other streams will wait for this thunk to complete before
+        // executing any events that occur later in the total order.
+        for (int32 i = 0; i < sub_streams.size() + 1; ++i) {
+          if (i != stream_no) {
+            last_blocking_thunk_for_stream[i] = thunk;
+          }
+        }
+      }
     }
     profiler.FinishOperation(thunk->hlo_instruction());
   }
@@ -184,90 +219,32 @@ Status GpuExecutable::ExecuteThunks(
   // Make sure kernels are completed before deallocating temporary buffers.
   // TODO(b/30100571): we could potentially postpone deallocating the temp
   // buffers until a different computation is executed.
-  if (block_host_until_done && !main_stream->BlockHostUntilDone()) {
-    return InternalError("Failed to complete all kernels launched on stream %p",
-                         main_stream);
+  if (block_host_until_done) {
+    Status block_status = main_stream->BlockHostUntilDone();
+    if (!block_status.ok()) {
+      return InternalError(
+          "Failed to complete all kernels launched on stream %p: %s",
+          main_stream, block_status.error_message().c_str());
+    }
   }
 
-  return Status::OK();
-}
+  profiler.FinishExecution();
+  uint64 end_micros = tensorflow::Env::Default()->NowMicros();
 
-StatusOr<se::DeviceMemoryBase> GpuExecutable::ExecuteOnStream(
-    const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
-  se::Stream* stream = run_options->stream();
-  DeviceMemoryAllocator* memory_allocator = run_options->allocator();
+  {
+    tensorflow::mutex_lock lock(mutex_);
+    const double nanoseconds = (end_micros - start_micros) * 1000.0;
+    execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
 
-  BufferAllocations::Builder buffer_allocations_builder;
-  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
-       ++i) {
-    const BufferAllocation& allocation = assignment_->GetAllocation(i);
-    if (allocation.is_entry_computation_parameter()) {
-      buffer_allocations_builder.RegisterBuffer(
-          i, arguments[allocation.parameter_number()]);
+    // If hlo profiling was disabled then the cycle count is left empty.
+    if (do_profile) {
+      execution_profile_.set_compute_cycle_count(
+          hlo_execution_profile->total_cycles_executed(
+              *module().entry_computation()));
     }
   }
-  se::StreamExecutor* executor = stream->parent();
-  TF_ASSIGN_OR_RETURN(
-      auto buffer_allocations,
-      buffer_allocations_builder.Build(*assignment_, executor->device_ordinal(),
-                                       memory_allocator));
-
-  bool block_host_until_done =
-      !memory_allocator->AllowsAsynchronousDeallocation();
-  TF_RETURN_IF_ERROR(ExecuteThunks(run_options, *buffer_allocations,
-                                   block_host_until_done,
-                                   hlo_execution_profile));
 
-  HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
-  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice output_slice,
-                      assignment_->GetUniqueTopLevelOutputSlice());
-  se::DeviceMemoryBase output_buffer_address =
-      buffer_allocations->GetDeviceAddress(output_slice.index());
-
-  if (ShapeUtil::IsTuple(root->shape())) {
-    std::set<se::DeviceMemoryBase> referred_by_output;
-    if (GetRootPointsToSet().IsAmbiguous()) {
-      // The points-to set of the root is ambiguous so we need to examine the
-      // result data to determine which buffers are contained in the result.
-      TF_ASSIGN_OR_RETURN(
-          TransferManager * transfer_manager,
-          TransferManager::GetForPlatform(executor->platform()));
-      TF_ASSIGN_OR_RETURN(referred_by_output,
-                          transfer_manager->GatherBufferPointersFromTuple(
-                              executor, output_buffer_address, root->shape()));
-    } else {
-      // The points-to set of the root is unambiguous so it's known statically
-      // which buffers are in the result. Gather these buffers using the root's
-      // points-to set.
-      TF_RETURN_IF_ERROR(GetRootPointsToSet().ForEachElementWithStatus(
-          [&referred_by_output, &buffer_allocations, this](
-              const ShapeIndex& /*index*/,
-              const PointsToSet::BufferList& buffers) {
-            // The points to set is unambiguous so the set should be a
-            // singleton. That is, we know exactly which instruction produced
-            // the array at this element.
-            CHECK_EQ(1, buffers.size());
-            HloInstruction* hlo = buffers[0]->instruction();
-            TF_ASSIGN_OR_RETURN(
-                const BufferAllocation::Slice slice,
-                this->assignment_->GetUniqueSlice(hlo, buffers[0]->index()));
-            CHECK(!slice.allocation()->is_entry_computation_parameter());
-            referred_by_output.insert(
-                buffer_allocations->GetDeviceAddress(slice.index()));
-            return Status::OK();
-          }));
-    }
-    TF_RETURN_IF_ERROR(
-        buffer_allocations->TearDown(referred_by_output, *assignment_));
-  } else {
-    // If the computation result is not a tuple, we can delete all temporary
-    // buffers that are not the output.
-    TF_RETURN_IF_ERROR(
-        buffer_allocations->TearDown({output_buffer_address}, *assignment_));
-  }
-  return output_buffer_address;
+  return Status::OK();
 }
 
 StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
@@ -285,9 +262,16 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
        ++i) {
     const BufferAllocation& allocation = assignment_->GetAllocation(i);
     if (allocation.is_entry_computation_parameter()) {
-      auto param_no = allocation.parameter_number();
-      buffer_allocations_builder.RegisterBuffer(
-          i, arguments[param_no]->buffer(/*index=*/{}));
+      // The caller must give us a buffer for ShapeIndex {} of every parameter.
+      // It can optionally give us a buffer for other ShapeIndices, but we
+      // ignore them: Because we can't rely on these sub-buffers' addresses
+      // being available, our generated code can't use them.  Instead, it must
+      // chase pointers starting at the tuple root.
+      if (allocation.param_shape_index().empty()) {
+        auto param_no = allocation.parameter_number();
+        buffer_allocations_builder.RegisterBuffer(
+            i, arguments[param_no]->root_buffer());
+      }
     }
   }
   se::StreamExecutor* executor = run_options->stream()->parent();
@@ -305,50 +289,46 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
   HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
   auto device_ordinal = executor->device_ordinal();
   auto shaped_buffer = MakeUnique<ShapedBuffer>(
-      root->shape(), executor->platform(), device_ordinal);
+      root->shape(), root->shape(), executor->platform(), device_ordinal);
 
   // Copy DeviceMemoryBase values which contain the array(s) of the result into
   // the respective location in ShapedBuffer.
   std::set<se::DeviceMemoryBase> buffers_in_result;
-  TF_RETURN_IF_ERROR(
-      shaped_buffer->mutable_shape_index_to_buffer_entry()
-          ->ForEachMutableElementWithStatus(
-              [&buffer_allocations, &buffers_in_result, &shaped_buffer, this](
-                  const ShapeIndex& index, size_t* buffer_entry) {
-                const auto& sources = this->GetRootPointsToSet().element(index);
-                // The points-to set is unambiguous so the set should be a
-                // singleton. That is, we know exactly which instruction
-                // produced the array at this element.
-                CHECK_EQ(1, sources.size());
-                auto src_hlo = sources[0]->instruction();
-
-                VLOG(4) << "Looking at: " << sources[0];
-
-                // The source instruction should have a non-parameter buffer
-                // assigned.
-                TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice slice,
-                                    this->assignment_->GetUniqueSlice(
-                                        src_hlo, sources[0]->index()));
-                CHECK(!slice.allocation()->is_entry_computation_parameter());
-
-                perftools::gputools::DeviceMemoryBase src_base =
-                    buffer_allocations->GetDeviceAddress(slice.index());
-                CHECK(!src_base.is_null() || src_base.size() == 0);
-                shaped_buffer->mutable_buffers()->push_back(src_base);
-                *buffer_entry = shaped_buffer->mutable_buffers()->size() - 1;
-
-                buffers_in_result.insert(src_base);
-                return Status::OK();
-              }));
+  TF_RETURN_IF_ERROR(shaped_buffer->buffers().ForEachMutableElementWithStatus(
+      [&buffer_allocations, &buffers_in_result, &shaped_buffer, this](
+          const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
+        const auto& sources = this->GetRootPointsToSet().element(index);
+        // The points-to set is unambiguous so the set should be a
+        // singleton. That is, we know exactly which instruction
+        // produced the array at this element.
+        CHECK_EQ(1, sources.size());
+        auto src_hlo = sources[0]->instruction();
+
+        VLOG(4) << "Looking at: " << sources[0];
+
+        // The source instruction should have a non-parameter buffer
+        // assigned.
+        TF_ASSIGN_OR_RETURN(
+            const BufferAllocation::Slice slice,
+            this->assignment_->GetUniqueSlice(src_hlo, sources[0]->index()));
+        CHECK(!slice.allocation()->is_entry_computation_parameter());
+
+        perftools::gputools::DeviceMemoryBase src_base =
+            buffer_allocations->GetDeviceAddress(slice.index());
+        CHECK(!src_base.is_null() || src_base.size() == 0);
+        *device_memory = src_base;
+        buffers_in_result.insert(src_base);
+        return Status::OK();
+      }));
   TF_RETURN_IF_ERROR(
       buffer_allocations->TearDown(buffers_in_result, *assignment_));
 
   return std::move(shaped_buffer);
 }
 
-StatusOr<se::DeviceMemoryBase> GpuExecutable::ExecuteAsyncOnStream(
+StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) {
+    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   // TODO(b/30671675): Implement asynchronous execution mode.
   return Unimplemented(
       "Asynchronous execution on stream is not yet supported on GPU.");
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index e7307e07c0b5608e31f15597d31d11c50f81c6d5..b19cfd43debd0a5490495d176fa2f1fcd625da07 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -54,7 +54,7 @@ class GpuExecutable : public Executable {
                 std::unique_ptr<const ThunkSchedule> thunk_schedule,
                 std::unique_ptr<const HloModule> hlo_module,
                 std::unique_ptr<const BufferAssignment> assignment,
-                std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+                std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
                 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
 
   // This should be called after set_ir_module_string.
@@ -72,24 +72,16 @@ class GpuExecutable : public Executable {
   // empty, in which case compilation is left up to the GPU driver.
   const std::vector<uint8>& cubin() const { return cubin_; }
 
-  // Both overloads of ExecuteOnStream will fail if the compute capability of
-  // the stream doesn't match the compute capability passed to this object's
-  // constructor.
-  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
-      const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
-      HloExecutionProfile* hlo_execution_profile) override;
-
+  // ExecuteOnStream will fail if the compute capability of the stream doesn't
+  // match the compute capability passed to this object's constructor.
   StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteAsyncOnStream(
+  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments) override;
+      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
   const Status EqualOrFail(const Executable& executable) {
     // TODO(b/62952745) Implement equality test on GPU executable.
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4944c41f7d8dc7a78a3cd094aee4d7087c74857e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc
@@ -0,0 +1,48 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h"
+
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+
+StatusOr<bool> GpuHloSupportChecker::Run(HloModule* module) {
+  for (auto* computation : module->computations()) {
+    for (const auto& instruction : computation->instructions()) {
+      TF_RETURN_IF_ERROR(
+          ShapeUtil::ValidateShapeWithOptionalLayout(instruction->shape()));
+      TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+          instruction->shape(),
+          [&instruction](const Shape& subshape, const ShapeIndex&) {
+            if (LayoutUtil::IsSparseArray(subshape)) {
+              return xla::Unimplemented(
+                  "GPU backend does not support HLO instruction %s with shape "
+                  "containing a sparse layout: %s",
+                  instruction->ToString().c_str(),
+                  ShapeUtil::HumanStringWithLayout(instruction->shape())
+                      .c_str());
+            }
+            return Status::OK();
+          }));
+    }
+  }
+  return false;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h
new file mode 100644
index 0000000000000000000000000000000000000000..d63e213d2b1efab4bcff75541cc5ab33d7a07976
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_HLO_SUPPORT_CHECKER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_HLO_SUPPORT_CHECKER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// his pass should run early in the HLO pipeline and checks for HLO constructs
+// which are not supported by the GPU backend and cannot be removed via HLO
+// transformations (eg, sparse layouts).
+class GpuHloSupportChecker : public HloPassInterface {
+ public:
+  GpuHloSupportChecker() = default;
+  ~GpuHloSupportChecker() override = default;
+
+  tensorflow::StringPiece name() const override {
+    return "gpu_hlo_support_checker";
+  }
+
+  // Note: always returns false (no instructions are ever modified by this
+  // pass).
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_HLO_SUPPORT_CHECKER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0a4089df4c954cafcbe241189ee79a0995683513
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker_test.cc
@@ -0,0 +1,72 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+using ::testing::HasSubstr;
+
+class GpuHloSupportCheckerTest : public HloTestBase {
+ protected:
+  GpuHloSupportChecker& checker() { return checker_; }
+
+ private:
+  GpuHloSupportChecker checker_;
+};
+
+TEST_F(GpuHloSupportCheckerTest, Add) {
+  HloComputation::Builder builder(TestName());
+  const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape, "param1"));
+  builder.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape, HloOpcode::kAdd, param0, param1));
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
+
+  TF_ASSERT_OK(checker().Run(module.get()).status());
+}
+
+TEST_F(GpuHloSupportCheckerTest, SparseUnimplemented) {
+  HloComputation::Builder builder(TestName());
+  const Shape sparse_shape = ShapeUtil::MakeShapeWithSparseLayout(F32, {10}, 2);
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, sparse_shape, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, sparse_shape, "param1"));
+  builder.AddInstruction(HloInstruction::CreateBinary(
+      sparse_shape, HloOpcode::kAdd, param0, param1));
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
+
+  Status status = checker().Run(module.get()).status();
+  ASSERT_EQ(status.code(), tensorflow::error::UNIMPLEMENTED);
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("GPU backend does not support"));
+  EXPECT_THAT(status.error_message(),
+              HasSubstr(ShapeUtil::HumanStringWithLayout(sparse_shape)));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
new file mode 100644
index 0000000000000000000000000000000000000000..89f1e625884568bf7370b3801d851ef4846c2a98
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
@@ -0,0 +1,249 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h"
+
+#include <memory>
+
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+namespace gpu {
+
+// cuDNN convolutions are called with specific layouts on the input, output,
+// and filter:
+//
+//   input: DataLayout::kBatchDepthYX
+//   output: DataLayout::kBatchDepthYX
+//   filter: FilterLayout::kOutputInputYX
+//
+// The order dimensions in the constant name is major-to-minor (eg, the
+// most-major dimension of the input is batch, most-minor is X). The
+// specific dimension numbers these named dimensions correspond to is
+// determined by the ConvolutionDimensionNumbers argument. Y is spatial
+// dimension 0, and X is spatial dimension 1.
+//
+// TODO(b/29399649): Be more flexible about handling layouts of cuDNN calls.
+static Status AddBackendConstraintsToDnnConvCustomCall(
+    HloInstruction* instr, LayoutConstraints* constraints) {
+  CHECK(IsCustomCallToDnnConvolution(*instr)) << instr->ToString();
+  Shape input_shape;
+  Shape filter_shape;
+  Shape output_shape;
+  const auto& target = instr->custom_call_target();
+  if (target == kCudnnConvForwardCallTarget) {
+    input_shape = instr->operand(0)->shape();
+    filter_shape = instr->operand(1)->shape();
+    output_shape = instr->shape().tuple_shapes(0);
+  } else if (target == kCudnnConvBackwardInputCallTarget) {
+    input_shape = instr->shape().tuple_shapes(0);
+    filter_shape = instr->operand(1)->shape();
+    output_shape = instr->operand(0)->shape();
+  } else if (target == kCudnnConvBackwardFilterCallTarget) {
+    input_shape = instr->operand(0)->shape();
+    filter_shape = instr->shape().tuple_shapes(0);
+    output_shape = instr->operand(1)->shape();
+  } else {
+    LOG(FATAL) << "Unexpected custom call target: "
+               << instr->custom_call_target();
+  }
+
+  // Construct minor-to-major dimension orders for operands and result.
+  // cuDNN's convolution APIs support the BDYX layout for activations/output
+  // and the OIYX layout for weights.
+  // TODO(b/29399649): Be more flexible about handling layouts of cuDNN
+  // calls after we switch to cuDNN v5.
+  const ConvolutionDimensionNumbers& dimension_numbers =
+      instr->convolution_dimension_numbers();
+  std::vector<int64> input_layout;
+  for (int i = dimension_numbers.input_spatial_dimensions_size() - 1; i >= 0;
+       --i) {
+    input_layout.push_back(dimension_numbers.input_spatial_dimensions(i));
+  }
+  input_layout.push_back(dimension_numbers.input_feature_dimension());
+  input_layout.push_back(dimension_numbers.input_batch_dimension());
+  *input_shape.mutable_layout() = LayoutUtil::MakeLayout(input_layout);
+
+  std::vector<int64> filter_layout;
+  for (int i = dimension_numbers.kernel_spatial_dimensions_size() - 1; i >= 0;
+       --i) {
+    filter_layout.push_back(dimension_numbers.kernel_spatial_dimensions(i));
+  }
+  filter_layout.push_back(dimension_numbers.kernel_input_feature_dimension());
+  filter_layout.push_back(dimension_numbers.kernel_output_feature_dimension());
+  *filter_shape.mutable_layout() = LayoutUtil::MakeLayout(filter_layout);
+
+  std::vector<int64> output_layout;
+  for (int i = dimension_numbers.output_spatial_dimensions_size() - 1; i >= 0;
+       --i) {
+    output_layout.push_back(dimension_numbers.output_spatial_dimensions(i));
+  }
+  output_layout.push_back(dimension_numbers.output_feature_dimension());
+  output_layout.push_back(dimension_numbers.output_batch_dimension());
+  *output_shape.mutable_layout() = LayoutUtil::MakeLayout(output_layout);
+
+  // The custom call returns a tuple of (actual_result, scratch_buffer);
+  // call_result_buf is the logical buffer for actual_result, the thing that
+  // contains the result of the conv call.
+  TF_ASSIGN_OR_RETURN(const LogicalBuffer* call_result_buf,
+                      constraints->points_to_analysis().GetBufferDefinedAt(
+                          instr, /*index=*/{0}));
+
+  // Set layouts of the instructions' shapes.
+  if (target == kCudnnConvForwardCallTarget) {
+    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(input_shape, instr, 0));
+    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(filter_shape, instr, 1));
+    TF_RETURN_IF_ERROR(
+        constraints->SetBufferLayout(output_shape.layout(), *call_result_buf));
+  } else if (target == kCudnnConvBackwardInputCallTarget) {
+    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(output_shape, instr, 0));
+    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(filter_shape, instr, 1));
+    TF_RETURN_IF_ERROR(
+        constraints->SetBufferLayout(input_shape.layout(), *call_result_buf));
+  } else if (target == kCudnnConvBackwardFilterCallTarget) {
+    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(input_shape, instr, 0));
+    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(output_shape, instr, 1));
+    TF_RETURN_IF_ERROR(
+        constraints->SetBufferLayout(filter_shape.layout(), *call_result_buf));
+  } else {
+    LOG(FATAL) << "Unexpected custom call target: "
+               << instr->custom_call_target();
+  }
+  return Status::OK();
+}
+
+Status GpuLayoutAssignment::AddBackendConstraints(
+    LayoutConstraints* constraints) {
+  for (auto* instruction : constraints->computation()->instructions()) {
+    if (IsCustomCallToDnnConvolution(*instruction)) {
+      TF_RETURN_IF_ERROR(
+          AddBackendConstraintsToDnnConvCustomCall(instruction, constraints));
+    }
+  }
+  return Status::OK();
+}
+
+bool GpuLayoutAssignment::CustomCallRequiresMajorFirstLayout(
+    const HloInstruction* instruction) {
+  // - Inputs to cudnn batchnorm custom calls don't need the major-first layout
+  //   (i.e. {n, n-1, ...0}) -- we can handle any layout.
+  // - Inputs to cudnn convolution require custom layouts handled in
+  //   AddBackendConstraints.
+  return !IsCustomCallToDnnBatchNorm(*instruction) &&
+         !IsCustomCallToDnnConvolution(*instruction);
+}
+
+Status GpuLayoutAssignment::PropagateOperandConstraint(
+    const OperandLayoutConstraint& layout_constraint,
+    LayoutConstraints* constraints) {
+  const HloInstruction* instruction = layout_constraint.instruction();
+
+  // cudnn batchnorm forward inference's result must have the same layout as its
+  // operand 0.
+  if (instruction->opcode() == HloOpcode::kCustomCall &&
+      instruction->custom_call_target() ==
+          kCudnnBatchNormForwardInferenceCallTarget &&
+      layout_constraint.operand_no() == 0) {
+    TF_RETURN_IF_ERROR(constraints->SetInstructionLayout(
+        layout_constraint.shape_layout().shape(), instruction));
+  }
+
+  // cudnn batchnorm forward training returns a tuple {output, mean,
+  // inverse-stddev}.  mean and inverse-stddev are rank 1 and so have only one
+  // possible layout, but output is not (necessarily) rank 1, and, like in
+  // batchnorm forward inference, must have the same layout as operand 0.
+  if (instruction->opcode() == HloOpcode::kCustomCall &&
+      instruction->custom_call_target() ==
+          kCudnnBatchNormForwardTrainingCallTarget &&
+      layout_constraint.operand_no() == 0) {
+    TF_ASSIGN_OR_RETURN(const LogicalBuffer* out_buf,
+                        constraints->points_to_analysis().GetBufferDefinedAt(
+                            instruction, /*index=*/{0}));
+    TF_RETURN_IF_ERROR(constraints->SetBufferLayout(
+        layout_constraint.shape_layout().layout(), *out_buf));
+  }
+
+  // Like forward training, cudnn batchnorm backward returns a tuple {output,
+  // mean, inverse-stddev}, and its operand 0 and 'output' must have the same
+  // layout.  In addition, its operand 0 and operand 4 -- the 'operand' and
+  // 'grad_output' parameters -- must have the same layout.
+  if (instruction->opcode() == HloOpcode::kCustomCall &&
+      instruction->custom_call_target() == kCudnnBatchNormBackwardCallTarget &&
+      (layout_constraint.operand_no() == 0 ||
+       layout_constraint.operand_no() == 4)) {
+    TF_ASSIGN_OR_RETURN(const LogicalBuffer* out_buf,
+                        constraints->points_to_analysis().GetBufferDefinedAt(
+                            instruction, /*index=*/{0}));
+    TF_RETURN_IF_ERROR(constraints->SetBufferLayout(
+        layout_constraint.shape_layout().layout(), *out_buf));
+
+    int64 operand_to_set = layout_constraint.operand_no() == 0 ? 4 : 0;
+    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
+        layout_constraint.shape_layout().shape(), instruction, operand_to_set));
+  }
+
+  return LayoutAssignment::PropagateOperandConstraint(layout_constraint,
+                                                      constraints);
+}
+
+Status GpuLayoutAssignment::PropagateBufferConstraint(
+    const BufferLayoutConstraint& buffer_constraint,
+    LayoutConstraints* constraints) {
+  const LogicalBuffer& buf = buffer_constraint.buffer();
+  const HloInstruction* instruction = buf.instruction();
+
+  Shape shape_with_layout = buf.shape();
+  *shape_with_layout.mutable_layout() = buffer_constraint.layout();
+
+  // Propagate output constraints to the operands of cudnn batchnorm ops.  This
+  // is the same as PropagateOperandConstraint, just in the other direction.  We
+  // need to both to fulfill our contract to LayoutAssignment.
+  if (instruction->opcode() == HloOpcode::kCustomCall &&
+      instruction->custom_call_target() ==
+          kCudnnBatchNormForwardInferenceCallTarget) {
+    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
+        shape_with_layout, instruction, /*operand_no=*/0));
+  }
+
+  if (instruction->opcode() == HloOpcode::kCustomCall &&
+      instruction->custom_call_target() ==
+          kCudnnBatchNormForwardTrainingCallTarget &&
+      buf.index() == ShapeIndex({0})) {
+    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
+        shape_with_layout, instruction, /*operand_no=*/0));
+  }
+  if (instruction->opcode() == HloOpcode::kCustomCall &&
+      instruction->custom_call_target() == kCudnnBatchNormBackwardCallTarget &&
+      buf.index() == ShapeIndex({0})) {
+    // batchnorm backward has two operands, "operand" and "grad_output" whose
+    // layouts must both match that of the result at tuple-index 0.
+    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
+        shape_with_layout, instruction, /*operand_no=*/0));
+    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
+        shape_with_layout, instruction, /*operand_no=*/4));
+  }
+
+  return LayoutAssignment::PropagateBufferConstraint(buffer_constraint,
+                                                     constraints);
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/layout_assignment.h b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h
similarity index 70%
rename from tensorflow/compiler/xla/service/gpu/layout_assignment.h
rename to tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h
index 169041eb85c633cb4f1f679bcea127714828308f..86a3a7111fd79494e469beecf3234f6cec9adb9c 100644
--- a/tensorflow/compiler/xla/service/gpu/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LAYOUT_ASSIGNMENT_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LAYOUT_ASSIGNMENT_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_LAYOUT_ASSIGNMENT_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_LAYOUT_ASSIGNMENT_H_
 
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/layout_assignment.h"
@@ -33,9 +33,17 @@ class GpuLayoutAssignment : public LayoutAssignment {
 
  protected:
   Status AddBackendConstraints(LayoutConstraints* constraints) override;
+  Status PropagateOperandConstraint(
+      const OperandLayoutConstraint& layout_constraint,
+      LayoutConstraints* constraints) override;
+  Status PropagateBufferConstraint(
+      const BufferLayoutConstraint& buffer_constraint,
+      LayoutConstraints* constraints) override;
+  bool CustomCallRequiresMajorFirstLayout(
+      const HloInstruction* instruction) override;
 };
 
 }  // namespace gpu
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LAYOUT_ASSIGNMENT_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_LAYOUT_ASSIGNMENT_H_
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4c45d2e94aebce5496da94841f6a1ae9015615c1
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
@@ -0,0 +1,328 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h"
+
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/service/computation_layout.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_layout.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+using LayoutAssignmentTest = HloTestBase;
+
+TEST_F(LayoutAssignmentTest, Elementwise) {
+  Shape ashape = ShapeUtil::MakeShape(F32, {42, 12});
+  Shape ashape_in_row_major(ashape);
+  Shape ashape_in_col_major(ashape);
+  *ashape_in_row_major.mutable_layout() = LayoutUtil::MakeLayout({1, 0});
+  *ashape_in_col_major.mutable_layout() = LayoutUtil::MakeLayout({0, 1});
+
+  // Enumerate all possible combinations of layouts.
+  for (const Shape& lhs_shape_with_layout :
+       {ashape_in_row_major, ashape_in_col_major}) {
+    for (const Shape& rhs_shape_with_layout :
+         {ashape_in_row_major, ashape_in_col_major}) {
+      for (const Shape& result_shape_with_layout :
+           {ashape_in_row_major, ashape_in_col_major}) {
+        // GpuLayoutAssignment should assign the same layout to "add" and its
+        // two operands.
+        auto builder = HloComputation::Builder(TestName());
+        auto x = builder.AddInstruction(
+            HloInstruction::CreateParameter(0, ashape, "x"));
+        auto y = builder.AddInstruction(
+            HloInstruction::CreateParameter(1, ashape, "y"));
+        auto add = builder.AddInstruction(
+            HloInstruction::CreateBinary(ashape, HloOpcode::kAdd, x, y));
+        auto module = CreateNewModule();
+        HloComputation* computation =
+            module->AddEntryComputation(builder.Build(add));
+
+        ComputationLayout computation_layout(
+            computation->ComputeProgramShape());
+        *computation_layout.mutable_parameter_layout(0) =
+            ShapeLayout(lhs_shape_with_layout);
+        *computation_layout.mutable_parameter_layout(1) =
+            ShapeLayout(rhs_shape_with_layout);
+        *computation_layout.mutable_result_layout() =
+            ShapeLayout(result_shape_with_layout);
+
+        GpuLayoutAssignment layout_assignment(&computation_layout);
+        EXPECT_TRUE(layout_assignment.Run(module.get()).ValueOrDie());
+
+        for (const HloInstruction* operand : add->operands()) {
+          EXPECT_TRUE(LayoutUtil::Equal(add->shape().layout(),
+                                        operand->shape().layout()));
+        }
+      }
+    }
+  }
+}
+
+// Returns a list shapes with all the possible layouts of this shape, including
+// a shape with no layout.
+std::vector<Shape> AllLayoutsOf(const Shape& s) {
+  std::vector<int64> layout_vec(s.dimensions_size());
+  std::iota(layout_vec.begin(), layout_vec.end(), 0);
+
+  std::vector<Shape> shapes;
+  shapes.push_back(s);
+  shapes.back().clear_layout();
+
+  do {
+    shapes.push_back(s);
+    *shapes.back().mutable_layout() = LayoutUtil::MakeLayout(layout_vec);
+  } while (std::next_permutation(layout_vec.begin(), layout_vec.end()));
+
+  return shapes;
+}
+
+TEST_F(LayoutAssignmentTest, BatchNormInference) {
+  const int64 kFeatureIndex = 1;
+
+  // The shape of the data operand to BatchNormInference and of the output of
+  // the BatchNormInference call.
+  Shape shape = ShapeUtil::MakeShape(F32, {42, 12, 1, 100});
+
+  // The shape of the scale, offset, mean, and variance inputs to
+  // BatchNormTraining.  These are rank 1, with as many elements are in the
+  // kFeatureIndex dim of shape.
+  Shape aux_shape =
+      ShapeUtil::MakeShape(F32, {shape.dimensions(kFeatureIndex)});
+
+  for (const Shape& input_shape : AllLayoutsOf(shape)) {
+    for (const Shape& result_shape : AllLayoutsOf(shape)) {
+      SCOPED_TRACE(tensorflow::strings::StrCat(
+          "input_shape=", ShapeUtil::HumanStringWithLayout(input_shape),
+          ", result_shape=", ShapeUtil::HumanStringWithLayout(result_shape)));
+
+      auto builder = HloComputation::Builder(TestName());
+      auto* operand = builder.AddInstruction(
+          HloInstruction::CreateParameter(0, shape, "operand"));
+      auto* scale = builder.AddInstruction(
+          HloInstruction::CreateParameter(1, aux_shape, "scale"));
+      auto* offset = builder.AddInstruction(
+          HloInstruction::CreateParameter(2, aux_shape, "offset"));
+      auto* mean = builder.AddInstruction(
+          HloInstruction::CreateParameter(3, aux_shape, "mean"));
+      auto* variance = builder.AddInstruction(
+          HloInstruction::CreateParameter(4, aux_shape, "variance"));
+
+      auto* epsilon = builder.AddInstruction(
+          HloInstruction::CreateConstant(Literal::CreateR0<float>(1)));
+      auto* feature_index =
+          builder.AddInstruction(HloInstruction::CreateConstant(
+              Literal::CreateR0<int64>(kFeatureIndex)));
+
+      auto* batchnorm = builder.AddInstruction(HloInstruction::CreateCustomCall(
+          shape,
+          {operand, scale, offset, mean, variance, epsilon, feature_index},
+          kCudnnBatchNormForwardInferenceCallTarget));
+
+      auto module = CreateNewModule();
+      HloComputation* computation =
+          module->AddEntryComputation(builder.Build(batchnorm));
+
+      ComputationLayout computation_layout(computation->ComputeProgramShape());
+
+      if (input_shape.has_layout()) {
+        *computation_layout.mutable_parameter_layout(0) =
+            ShapeLayout(input_shape);
+      }
+
+      if (result_shape.has_layout()) {
+        *computation_layout.mutable_result_layout() = ShapeLayout(result_shape);
+      }
+
+      GpuLayoutAssignment layout_assignment(&computation_layout);
+      EXPECT_TRUE(layout_assignment.Run(module.get()).ValueOrDie());
+
+      // The first operand to batchnorm should have the same layout as the
+      // result.
+      EXPECT_TRUE(LayoutUtil::Equal(batchnorm->operand(0)->shape().layout(),
+                                    batchnorm->shape().layout()))
+          << batchnorm->ToString();
+    }
+  }
+}
+
+TEST_F(LayoutAssignmentTest, BatchNormTraining) {
+  const int64 kFeatureIndex = 1;
+
+  // The shape of the data operand to BatchNormTraining.
+  Shape shape = ShapeUtil::MakeShape(F32, {42, 12, 1, 100});
+
+  // The shape of the offset and scale inputs to BatchNormTraining.  These are
+  // rank 1, with as many elements are in the kFeatureIndex dim of shape.
+  Shape offset_scale_shape =
+      ShapeUtil::MakeShape(F32, {shape.dimensions(kFeatureIndex)});
+
+  // Shape of the output of our BatchNormTraining op.
+  Shape batchnorm_shape = ShapeUtil::MakeTupleShape(
+      {shape, offset_scale_shape, offset_scale_shape});
+
+  // Enumerate all combinations of shapes.
+  for (const Shape& input_shape : AllLayoutsOf(shape)) {
+    for (const Shape& result_shape : AllLayoutsOf(shape)) {
+      SCOPED_TRACE(tensorflow::strings::StrCat(
+          "input_shape=", ShapeUtil::HumanStringWithLayout(input_shape),
+          ", result_shape=", ShapeUtil::HumanStringWithLayout(result_shape)));
+
+      auto builder = HloComputation::Builder(TestName());
+      auto* operand = builder.AddInstruction(
+          HloInstruction::CreateParameter(0, shape, "operand"));
+      auto* scale = builder.AddInstruction(
+          HloInstruction::CreateParameter(1, offset_scale_shape, "scale"));
+      auto* offset = builder.AddInstruction(
+          HloInstruction::CreateParameter(2, offset_scale_shape, "offset"));
+
+      auto* epsilon = builder.AddInstruction(
+          HloInstruction::CreateConstant(Literal::CreateR0<float>(1)));
+      auto* feature_index =
+          builder.AddInstruction(HloInstruction::CreateConstant(
+              Literal::CreateR0<int64>(kFeatureIndex)));
+
+      auto* batchnorm = builder.AddInstruction(HloInstruction::CreateCustomCall(
+          batchnorm_shape, {operand, scale, offset, epsilon, feature_index},
+          kCudnnBatchNormForwardTrainingCallTarget));
+
+      auto module = CreateNewModule();
+      HloComputation* computation =
+          module->AddEntryComputation(builder.Build(batchnorm));
+
+      ComputationLayout computation_layout(computation->ComputeProgramShape());
+
+      if (input_shape.has_layout()) {
+        *computation_layout.mutable_parameter_layout(0) =
+            ShapeLayout(input_shape);
+      }
+
+      if (result_shape.has_layout()) {
+        *computation_layout.mutable_result_layout() =
+            ShapeLayout(ShapeUtil::MakeTupleShape(
+                {result_shape, offset_scale_shape, offset_scale_shape}));
+      }
+
+      GpuLayoutAssignment layout_assignment(&computation_layout);
+      EXPECT_TRUE(layout_assignment.Run(module.get()).ValueOrDie());
+
+      // The first operand to batchnorm should have the same layout as the
+      // first element of the result tuple.
+      EXPECT_TRUE(
+          LayoutUtil::Equal(batchnorm->operand(0)->shape().layout(),
+                            batchnorm->shape().tuple_shapes(0).layout()))
+          << batchnorm->ToString();
+    }
+  }
+}
+
+TEST_F(LayoutAssignmentTest, BatchNormGrad) {
+  const int64 kFeatureIndex = 1;
+
+  // The shape of the data operand to BatchNormTraining.
+  Shape shape = ShapeUtil::MakeShape(F32, {42, 12, 1, 100});
+
+  // The shape of the scale, mean, and variance inputs to BatchNormGrad.  These
+  // are rank 1, with as many elements are in the kFeatureIndex dim of shape.
+  Shape scale_shape =
+      ShapeUtil::MakeShape(F32, {shape.dimensions(kFeatureIndex)});
+
+  // Shape of the output of our BatchNormGrad op.
+  Shape batchnorm_shape =
+      ShapeUtil::MakeTupleShape({shape, scale_shape, scale_shape});
+
+  // Enumerate all combinations of shapes plus whether we're constraining param
+  // 0 or param 4.
+  for (const Shape& input_shape : AllLayoutsOf(shape)) {
+    for (const Shape& result_shape : AllLayoutsOf(shape)) {
+      for (int constrained_param_no : {0, 4}) {
+        SCOPED_TRACE(tensorflow::strings::StrCat(
+            "input_shape=", ShapeUtil::HumanStringWithLayout(input_shape),
+            ", result_shape=", ShapeUtil::HumanStringWithLayout(result_shape)));
+
+        auto builder = HloComputation::Builder(TestName());
+        auto* operand = builder.AddInstruction(
+            HloInstruction::CreateParameter(0, shape, "operand"));
+        auto* scale = builder.AddInstruction(
+            HloInstruction::CreateParameter(1, scale_shape, "scale"));
+        auto* mean = builder.AddInstruction(
+            HloInstruction::CreateParameter(2, scale_shape, "mean"));
+        auto* var = builder.AddInstruction(
+            HloInstruction::CreateParameter(3, scale_shape, "var"));
+        auto* grad_offset = builder.AddInstruction(
+            HloInstruction::CreateParameter(4, shape, "var"));
+
+        auto* epsilon = builder.AddInstruction(
+            HloInstruction::CreateConstant(Literal::CreateR0<float>(1)));
+        auto* feature_index =
+            builder.AddInstruction(HloInstruction::CreateConstant(
+                Literal::CreateR0<int64>(kFeatureIndex)));
+
+        auto* batchnorm =
+            builder.AddInstruction(HloInstruction::CreateCustomCall(
+                batchnorm_shape,
+                {operand, scale, mean, var, grad_offset, epsilon,
+                 feature_index},
+                kCudnnBatchNormBackwardCallTarget));
+
+        auto module = CreateNewModule();
+        HloComputation* computation =
+            module->AddEntryComputation(builder.Build(batchnorm));
+
+        ComputationLayout computation_layout(
+            computation->ComputeProgramShape());
+
+        if (input_shape.has_layout()) {
+          *computation_layout.mutable_parameter_layout(constrained_param_no) =
+              ShapeLayout(input_shape);
+        }
+
+        if (result_shape.has_layout()) {
+          *computation_layout.mutable_result_layout() =
+              ShapeLayout(ShapeUtil::MakeTupleShape(
+                  {result_shape, scale_shape, scale_shape}));
+        }
+
+        GpuLayoutAssignment layout_assignment(&computation_layout);
+        EXPECT_TRUE(layout_assignment.Run(module.get()).ValueOrDie());
+
+        // The first and fourth operands to the batchnorm call should have the
+        // same layout as the first element of the result tuple.
+        EXPECT_TRUE(
+            LayoutUtil::Equal(batchnorm->operand(0)->shape().layout(),
+                              batchnorm->shape().tuple_shapes(0).layout()))
+            << batchnorm->ToString();
+        EXPECT_TRUE(
+            LayoutUtil::Equal(batchnorm->operand(4)->shape().layout(),
+                              batchnorm->shape().tuple_shapes(0).layout()))
+            << batchnorm->ToString();
+      }
+    }
+  }
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
index f0f036f7f381db15b84db85d3efeec5d8141884e..af9897769fda371e47af06c19abce9a06015e094 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
@@ -44,7 +44,7 @@ GpuTransferManager::GpuTransferManager()
     : GenericTransferManager(
           se::cuda::kCudaPlatformId,
           /*pointer_size=*/llvm::DataLayout(gpu::GpuCompiler::kDataLayout)
-              .getPointerSize()) {}
+              .getPointerSize(0 /* default address space */)) {}
 
 Status GpuTransferManager::TransferLiteralToInfeed(se::StreamExecutor* executor,
                                                    const Literal& literal) {
@@ -54,7 +54,7 @@ Status GpuTransferManager::TransferLiteralToInfeed(se::StreamExecutor* executor,
 
   if (!ShapeUtil::IsTuple(shape)) {
     int64 size = GetByteSizeRequirement(shape);
-    return TransferBufferToInfeed(executor, size, literal.InternalData());
+    return TransferBufferToInfeed(executor, size, literal.untyped_data());
   }
 
   if (ShapeUtil::IsNestedTuple(shape)) {
@@ -67,20 +67,21 @@ Status GpuTransferManager::TransferLiteralToInfeed(se::StreamExecutor* executor,
   // enqueue the resulting destination device addresses with the
   // infeed manager.
   std::vector<gpu::InfeedBuffer*> buffers;
-  buffers.reserve(literal.tuple_literals_size());
+  buffers.reserve(ShapeUtil::TupleElementCount(shape));
   auto cleanup = tensorflow::gtl::MakeCleanup([buffers]() {
     for (gpu::InfeedBuffer* b : buffers) {
       b->Done();
     }
   });
 
-  for (const auto& tuple_element : literal.tuple_literals()) {
-    const Shape& tuple_element_shape = tuple_element.shape();
+  for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
+    const Shape& tuple_element_shape =
+        ShapeUtil::GetTupleElementShape(shape, i);
     int64 tuple_element_size = GetByteSizeRequirement(tuple_element_shape);
     TF_ASSIGN_OR_RETURN(
         gpu::InfeedBuffer * buffer,
         TransferBufferToInfeedInternal(executor, tuple_element_size,
-                                       tuple_element.InternalData()));
+                                       literal.untyped_data({i})));
     buffers.push_back(buffer);
   }
 
@@ -105,12 +106,13 @@ Status GpuTransferManager::EnqueueBuffersToInfeed(
   // infeed requests, blocking on the stream might be
   // heavy-handed. Figure out if finer-grained acknowledgement is
   // possible.
-  if (!stream->BlockHostUntilDone()) {
+  Status block_status = stream->BlockHostUntilDone();
+  if (!block_status.ok()) {
     for (gpu::InfeedBuffer* b : buffers) {
       b->Done();
     }
-    return InternalError("Failed to complete data transfer on stream %p",
-                         stream);
+    return InternalError("Failed to complete data transfer on stream %p: %s",
+                         stream, block_status.error_message().c_str());
   }
 
   infeed_manager->EnqueueBuffers(buffers);
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
index c2115c49993ef71c4b6dd584e7e0498807666613..061210352cf12e6802d066d311fd2cb481673f15 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
@@ -22,12 +22,17 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 namespace gpu {
 
+using tensorflow::strings::StrAppend;
+using tensorflow::strings::StrCat;
+
 void HloToIrBindings::EmitBasePointersForHlos(
     tensorflow::gtl::ArraySlice<const HloInstruction*> io_hlos,
     tensorflow::gtl::ArraySlice<const HloInstruction*> non_io_hlos) {
@@ -191,7 +196,11 @@ static bool BuffersInvariantWithinConsumer(
 llvm_ir::IrArray HloToIrBindings::GetIrArray(const HloInstruction& hlo,
                                              const HloInstruction& consumer,
                                              const ShapeIndex& shape_index) {
-  llvm_ir::IrArray ir_array(GetBasePointer(hlo, shape_index),
+  llvm::Value* base_ptr = GetBasePointer(hlo, shape_index);
+  CHECK_NE(base_ptr, nullptr)
+      << "Buffer not assigned for shape_index " << shape_index.ToString()
+      << " of " << hlo.ToString();
+  llvm_ir::IrArray ir_array(base_ptr,
                             ShapeUtil::GetSubshape(hlo.shape(), shape_index));
   alias_analysis_.AddAliasingInformationToIrArray(hlo, &ir_array);
 
@@ -223,5 +232,54 @@ void HloToIrBindings::UnbindAllLocalIrValues() {
   }
 }
 
+string HloToIrBindings::ToString() const {
+  string s = StrCat("** HloToIrBindings **\n");
+  StrAppend(&s, "  is_nested_=", is_nested_, "\n");
+  StrAppend(&s,
+            "  temp_buffer_base_=", llvm_ir::DumpToString(*temp_buffer_base_),
+            "\n");
+
+  if (base_ptrs_.empty()) {
+    return s;
+  }
+
+  // Iterate over all computations in the module in topological order, and print
+  // out the base pointers we have in each computation in topological order.
+  for (const HloComputation* computation :
+       base_ptrs_.begin()->first->GetModule()->MakeComputationPostOrder()) {
+    bool is_first = true;
+    for (const HloInstruction* instr :
+         computation->MakeInstructionPostOrder()) {
+      auto it = base_ptrs_.find(instr);
+      if (it == base_ptrs_.end()) {
+        continue;
+      }
+      if (is_first) {
+        StrAppend(&s, "  Base pointers for computation ", computation->name(),
+                  ":\n");
+        is_first = false;
+      }
+      StrAppend(&s, "    ", instr->ToString());
+
+      const ShapeTree<llvm::Value*>& shape_tree = it->second;
+      if (!ShapeUtil::IsTuple(instr->shape())) {
+        const llvm::Value* val = shape_tree.begin()->second;
+        StrAppend(&s, " -> ", llvm_ir::DumpToString(*val), "\n");
+        continue;
+      }
+
+      StrAppend(&s, "\n");
+      for (auto shape_it = shape_tree.begin(); shape_it != shape_tree.end();
+           ++shape_it) {
+        llvm::Value* val = shape_it->second;
+        StrAppend(&s, "      ", shape_it->first.ToString(), " -> ",
+                  (val != nullptr ? llvm_ir::DumpToString(*val) : "null"),
+                  "\n");
+      }
+    }
+  }
+  return s;
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
index 62ae1769a1f2fb3b9acaf35bdf18a793232500b0..3d34311b4368d17cb074aaf33c71fc865e96387e 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
+++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
@@ -66,13 +66,14 @@ class HloToIrBindings {
   }
 
   llvm::Value* GetTempBufferBase() const { return temp_buffer_base_; }
+  void SetTempBufferBase(llvm::Value* v) { temp_buffer_base_ = v; }
 
   // A helper method that returns the base pointer of the IrArray containing the
   // output of "inst".at the given ShapeIndex.
   llvm::Value* GetBasePointer(const HloInstruction& hlo,
                               const ShapeIndex& shape_index = {}) const {
     auto it = base_ptrs_.find(&hlo);
-    CHECK(it != base_ptrs_.end());
+    CHECK(it != base_ptrs_.end()) << hlo.ToString();
     return it->second.element(shape_index);
   }
 
@@ -87,6 +88,8 @@ class HloToIrBindings {
                               const HloInstruction& consumer,
                               const ShapeIndex& shape_index = {});
 
+  string ToString() const;
+
  private:
   // Emits IR to resolve (possibly) recursive GetTupleElement instructions.
   llvm::Value* EmitGetTupleElement(const HloInstruction* gte,
@@ -111,7 +114,7 @@ class HloToIrBindings {
   std::unordered_map<const HloInstruction*, ShapeTree<llvm::Value*>> base_ptrs_;
 
   // The address of the memory block that contains all temporary buffers.
-  llvm::Value* temp_buffer_base_;
+  llvm::Value* temp_buffer_base_ = nullptr;
 
   llvm_ir::AliasAnalysis alias_analysis_;
 };
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
index e33e904692ca5ad41e17d2e165dbb40b6bd4aa33..2ac95ceb692447c7ac6dbbcd8b9a38876f7a77b6 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
@@ -30,9 +30,8 @@ InfeedThunk::InfeedThunk(
                              tuple_element_buffers.end()),
       destination_buffer_(destination_buffer) {}
 
-tensorflow::Status InfeedThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations,
-    perftools::gputools::Stream* stream) {
+Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                                    perftools::gputools::Stream* stream) {
   VLOG(2) << "Infeeding to GPU ";
 
   perftools::gputools::DeviceMemoryBase destination_address =
@@ -66,15 +65,16 @@ tensorflow::Status InfeedThunk::ExecuteOnStream(
                        buffer->length());
   }
 
-  if (!stream->BlockHostUntilDone()) {
-    return InternalError("Failed to complete data transfer on stream %p",
-                         stream);
+  Status block_status = stream->BlockHostUntilDone();
+  if (!block_status.ok()) {
+    return InternalError("Failed to complete data transfer on stream %p: %s",
+                         stream, block_status.error_message().c_str());
   }
 
   infeed_manager->ReleaseBuffers(infeed_buffers);
 
   VLOG(2) << "Infeeding to GPU complete";
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h
index 371d71f9dbdd21cb5f36cc3108c8f398a4a91c29..86918705fa0305217f11753e383200c7bd71474b 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h
@@ -43,9 +43,8 @@ class InfeedThunk : public Thunk {
   InfeedThunk(const InfeedThunk&) = delete;
   InfeedThunk& operator=(const InfeedThunk&) = delete;
 
-  tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         perftools::gputools::Stream* stream) override;
 
  private:
   const std::vector<BufferAllocation::Slice> tuple_element_buffers_;
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
index 1d47ffde4331868cbc8a8afb2d01b11e77a7fab0..2d6dad27a59978da6e4719afc50ebee5e641dde0 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
@@ -137,49 +137,6 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfDotUnfused) {
                    .ValueOrDie());
 }
 
-TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfConvolutionUnfused) {
-  HloComputation::Builder builder(TestName());
-  auto input = builder.AddInstruction(HloInstruction::CreateParameter(
-      0, ShapeUtil::MakeShape(F32, {1, 1, 1, 3}), "input"));
-  auto filter = builder.AddInstruction(HloInstruction::CreateParameter(
-      1, ShapeUtil::MakeShape(F32, {1, 1, 1, 2}), "filter"));
-
-  Window conv_window;
-  WindowDimension* conv_window_row = conv_window.add_dimensions();
-  conv_window_row->set_size(1);
-  WindowDimension* conv_window_col = conv_window.add_dimensions();
-  conv_window_col->set_size(2);
-  conv_window_col->set_padding_high(1);
-
-  ConvolutionDimensionNumbers conv_dnums;
-  conv_dnums.set_input_batch_dimension(0);
-  conv_dnums.set_output_batch_dimension(0);
-  conv_dnums.set_input_feature_dimension(1);
-  conv_dnums.set_output_feature_dimension(1);
-  conv_dnums.add_input_spatial_dimensions(2);
-  conv_dnums.add_output_spatial_dimensions(2);
-  conv_dnums.add_input_spatial_dimensions(3);
-  conv_dnums.add_output_spatial_dimensions(3);
-  conv_dnums.set_kernel_output_feature_dimension(0);
-  conv_dnums.set_kernel_input_feature_dimension(1);
-  conv_dnums.add_kernel_spatial_dimensions(2);
-  conv_dnums.add_kernel_spatial_dimensions(3);
-
-  auto conv = builder.AddInstruction(
-      HloInstruction::CreateConvolve(ShapeUtil::MakeShape(F32, {1, 1, 1, 3}),
-                                     input, filter, conv_window, conv_dnums));
-  auto transpose = builder.AddInstruction(HloInstruction::CreateTranspose(
-      ShapeUtil::MakeShape(F32, {3, 1, 1, 1}), conv, {3, 2, 1, 0}));
-  builder.AddInstruction(
-      HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {3}), transpose));
-
-  auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
-  EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
-                   .Run(module.get())
-                   .ValueOrDie());
-}
-
 TEST_F(InstructionFusionTest, GetTupleElementFused) {
   HloComputation::Builder builder(TestName());
   Shape data_shape = ShapeUtil::MakeShape(F32, {8});
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index 658fd05cd4b63c923d21b4a1de16468c0aeec65d..2f65edffea81db7dba1f8545f92b27ea622044e7 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -90,41 +90,93 @@ bool ImplementedAsGemm(const HloInstruction& hlo) {
   return false;
 }
 
-bool ImplementedAsDnnConvolution(const HloInstruction& hlo) {
-  // We can only do this if the HLO is unnested.
-  if (hlo.parent() != hlo.GetModule()->entry_computation()) {
+const char* const kCudnnBatchNormForwardInferenceCallTarget =
+    "__cudnn$batchNormalizationForwardInference";
+const char* const kCudnnBatchNormForwardTrainingCallTarget =
+    "__cudnn$batchNormalizationForwardTraining";
+const char* const kCudnnBatchNormBackwardCallTarget =
+    "__cudnn$batchNormalizationBackward";
+
+bool IsCustomCallToDnnBatchNorm(const HloInstruction& hlo) {
+  if (hlo.opcode() != HloOpcode::kCustomCall) {
     return false;
   }
+  const auto& target = hlo.custom_call_target();
+  return target == kCudnnBatchNormForwardInferenceCallTarget ||
+         target == kCudnnBatchNormForwardTrainingCallTarget ||
+         target == kCudnnBatchNormBackwardCallTarget;
+}
 
-  // Forward convolution.
-  if (hlo.opcode() == HloOpcode::kConvolution) {
-    const ConvolutionDimensionNumbers& dnums =
-        hlo.convolution_dimension_numbers();
-    if (dnums.input_spatial_dimensions_size() > 3) {
-      return false;
-    }
-
-    // CuDNN does not accept zero-element arguments
-    if (ShapeUtil::HasZeroElements(hlo.operand(0)->shape()) ||
-        ShapeUtil::HasZeroElements(hlo.operand(1)->shape())) {
-      return false;
-    }
+const char* const kCudnnConvForwardCallTarget = "__cudnn$convForward";
+const char* const kCudnnConvBackwardInputCallTarget =
+    "__cudnn$convBackwardInput";
+const char* const kCudnnConvBackwardFilterCallTarget =
+    "__cudnn$convBackwardFilter";
 
-    return true;
+bool IsCustomCallToDnnConvolution(const HloInstruction& hlo) {
+  if (hlo.opcode() != HloOpcode::kCustomCall) {
+    return false;
   }
+  const auto& target = hlo.custom_call_target();
+  return target == kCudnnConvForwardCallTarget ||
+         target == kCudnnConvBackwardInputCallTarget ||
+         target == kCudnnConvBackwardFilterCallTarget;
+}
 
-  // Backward convolution.
-  if (hlo.opcode() == HloOpcode::kFusion &&
-      (hlo.fusion_kind() == HloInstruction::FusionKind::kConvBackwardFilter ||
-       hlo.fusion_kind() == HloInstruction::FusionKind::kConvBackwardInput)) {
-    return true;
-  }
+bool ImplementedAsLibraryCall(const HloInstruction& hlo) {
+  return ImplementedAsGemm(hlo) || IsCustomCallToDnnBatchNorm(hlo) ||
+         IsCustomCallToDnnConvolution(hlo);
+}
 
-  return false;
+static HloInstruction* CreateCudnnConv(
+    const char* call_target, const Shape& shape, HloInstruction* lhs,
+    HloInstruction* rhs, const Window& window,
+    const ConvolutionDimensionNumbers& dnums) {
+  HloComputation* computation = lhs->parent();
+
+  // This call returns a tuple of (conv_result, scratch_memory), where
+  // conv_result is the actual result of the convolution, and scratch_memory is
+  // temporary memory used by cudnn.
+  //
+  // At the moment, we don't know how much scratch memory this conv is going to
+  // use, so we put u8[0] in this place.  Later on another pass will choose
+  // which conv algorithm to use, and at that point we'll modify the shape of
+  // this second tuple element.
+  Shape call_shape =
+      ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U8, {0})});
+
+  // Our CustomCall takes three arguments: The conv lhs and rhs, and the cudnn
+  // algorithm to use.  It's up to a later pass to choose the algorithm, so to
+  // indicate that we haven't yet made a choice, we speicfy -1 for that arg.
+  HloInstruction* negative_one = computation->AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<int64>(-1)));
+  HloInstruction* custom_call =
+      computation->AddInstruction(HloInstruction::CreateCustomCall(
+          call_shape, {lhs, rhs, negative_one}, call_target));
+  custom_call->set_window(window);
+  custom_call->set_convolution_dimension_numbers(dnums);
+  return custom_call;
 }
 
-bool ImplementedAsLibraryCall(const HloInstruction& hlo) {
-  return ImplementedAsGemm(hlo) || ImplementedAsDnnConvolution(hlo);
+HloInstruction* CreateCudnnConvForward(
+    const Shape& shape, HloInstruction* input, HloInstruction* kernel,
+    const Window& window, const ConvolutionDimensionNumbers& dnums) {
+  return CreateCudnnConv(kCudnnConvForwardCallTarget, shape, input, kernel,
+                         window, dnums);
+}
+
+HloInstruction* CreateCudnnConvBackwardInput(
+    const Shape& shape, HloInstruction* output, HloInstruction* reverse_filter,
+    const Window& window, const ConvolutionDimensionNumbers& dnums) {
+  return CreateCudnnConv(kCudnnConvBackwardInputCallTarget, shape, output,
+                         reverse_filter, window, dnums);
+}
+
+HloInstruction* CreateCudnnConvBackwardFilter(
+    const Shape& shape, HloInstruction* input, HloInstruction* output,
+    const Window& window, const ConvolutionDimensionNumbers& dnums) {
+  return CreateCudnnConv(kCudnnConvBackwardFilterCallTarget, shape, input,
+                         output, window, dnums);
 }
 
 bool IsReductionToVector(const HloInstruction& reduce) {
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
index 06c3205296e4546e39525ec093cc17e2fc375d0d..59455f389e733fee2d6cace7486f919a0c5e834e 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
@@ -22,6 +22,9 @@ limitations under the License.
 #include "llvm/IR/Value.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 
+// TODO(jlebar): Move functions related to cublas/cudnn to a separate file; they
+// don't belong in "ir_emission_utils".
+
 namespace xla {
 namespace gpu {
 
@@ -30,8 +33,85 @@ constexpr int64 kWarpSize = 32;
 // Returns true if `hlo` will be implemented as a call to BLAS gemm.
 bool ImplementedAsGemm(const HloInstruction& hlo);
 
-// Returns true if `hlo` will be implemented as a call to cuDNN convolution.
-bool ImplementedAsDnnConvolution(const HloInstruction& hlo);
+// A call to cuDNN for batch normalization is represented as CustomCall HLO with
+// a call target equal to one of these strings.
+//
+// The operands to and outputs of these calls are the same as those of the
+// corresponding HLOs, except:
+//
+//  - epsilon and feature_index are proper operands, at the end of the operands
+//    list.  They must be HLO constants.
+//  - The cuDNN forward training call returns inv_stddev =
+//    1/sqrt(variance + epsilon) in place of plain variance.
+//  - Similarly, BatchNormGrad accepts inv_stddev in place of the variance
+//    operand.
+extern const char* const kCudnnBatchNormForwardInferenceCallTarget;
+extern const char* const kCudnnBatchNormForwardTrainingCallTarget;
+extern const char* const kCudnnBatchNormBackwardCallTarget;
+
+// Returns true if `hlo` will be implemented as a call to a cuDNN batch
+// normalization routine.
+//
+// This returns true if `hlo` is a CustomCall HLO with a call target equal to
+// one of the kCudnnBatchNormFoo constants above, but returns *false* for HLOs
+// with one of the kBatchNorm opcodes, because these are lowered either to a
+// sequence of generic HLOs or to a cuDNN CustomCall.
+bool IsCustomCallToDnnBatchNorm(const HloInstruction& hlo);
+
+// A call to cuDNN for convolution (forward, backward filter, or backward input)
+// is represented as a CustomCall HLO with a call target equal to one of these
+// strings.
+//
+// These CustomCalls have window() and convolution_dimension_numbers() set like
+// regular convolution ops.  They have the same LHS and RHS operands, plus two
+// additional constant operands: an int64 operand for the cudnn algorithm and
+// a bool operand for whether tensor_ops is enabled. A value of -1 for the cudnn
+// algorithm means that the implementation is free to choose the best algorithm
+// it can.
+//
+// These calls output a tuple (conv_result, scratch_memory), where conv_result
+// is the actual result of the convolution, and scratch_memory is temporary
+// memory used by cudnn.  Callers shouldn't inspect scratch_memory, as its value
+// is not well-defined.
+//
+// CudnnConvolutionRewriter lowers kConvolution HLOs to these custom calls.
+// When it does so, it chooses algorithm -1 and 0 bytes of scratch space.  Later
+// on in the pipeline, CudnnConvolutionAlgorithmChooser chooses an explicit
+// algorithm for each conv and sets the amount of scratch space needed.
+//
+// (Representing the scratch memory as an output may seem strange at first, but
+// it's quite sensible, from a certain point of view.  The scratch buffer is a
+// location in memory that the conv can write into, but which it can't legally
+// read from, at least until it's written something first.  But that's exactly
+// the definition of an output buffer.)
+extern const char* const kCudnnConvForwardCallTarget;
+extern const char* const kCudnnConvBackwardInputCallTarget;
+extern const char* const kCudnnConvBackwardFilterCallTarget;
+
+// Returns true if `hlo` will be implemented as a call to a cuDNN convolution
+// routine.
+//
+// This returns true if `hlo` is a CustomCall HLO with a call target equal to
+// one of the kCudnnConvFoo constants above, but returns *false* for HLOs with a
+// kConvolution opcode.
+bool IsCustomCallToDnnConvolution(const HloInstruction& hlo);
+
+// Creates a CustomCall for a cudnn forward/backward-input/backward-filter conv.
+// Note that these CustomCalls return a tuple (conv_result, scratch_memory).  If
+// you want just the conv result, you'll need to get-tuple-element the value
+// returned by this function.
+//
+// The created cudnn call will use the default cudnn algorithm and no scratch
+// space.
+HloInstruction* CreateCudnnConvForward(
+    const Shape& shape, HloInstruction* input, HloInstruction* kernel,
+    const Window& window, const ConvolutionDimensionNumbers& dnums);
+HloInstruction* CreateCudnnConvBackwardInput(
+    const Shape& shape, HloInstruction* output, HloInstruction* reverse_filter,
+    const Window& window, const ConvolutionDimensionNumbers& dnums);
+HloInstruction* CreateCudnnConvBackwardFilter(
+    const Shape& shape, HloInstruction* input, HloInstruction* output,
+    const Window& window, const ConvolutionDimensionNumbers& dnums);
 
 // Returns true if `hlo` will be implemented as a library call, e.g. cuBLAS gemm
 // or cuDNN convolution.
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index 6e2bd4e11d3c4ff576edb0df3b724abebfc0e424..a3df67a87344d6ece2ea9047321ad9542c13f8cf 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -27,6 +27,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/service/elemental_ir_emitter.h"
 #include "tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h"
 #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
@@ -173,7 +175,7 @@ Status IrEmitter::EmitCallToNestedComputation(
   return Status::OK();
 }
 
-bool IrEmitter::MaybeEmitSpecialAtomicOperation(
+bool IrEmitter::MaybeEmitDirectAtomicOperation(
     const HloComputation& computation, llvm::Value* output_address,
     llvm::Value* source_address) {
   CHECK_EQ(2, computation.num_parameters());
@@ -233,102 +235,189 @@ bool IrEmitter::MaybeEmitSpecialAtomicOperation(
   return false;
 }
 
-Status IrEmitter::EmitAtomicOperationForNestedComputation(
-    const HloComputation& computation, llvm::Value* output_address,
-    llvm::Value* source_address) {
-  if (computation.num_parameters() != 2) {
-    // TODO(b/30258929): We only accept binary computations so far.
-    return Unimplemented(
-        "We only support atomic functions with exactly two parameters, but "
-        "computation %s has %lld.",
-        computation.name().c_str(), computation.num_parameters());
-  }
-
-  if (MaybeEmitSpecialAtomicOperation(computation, output_address,
-                                      source_address)) {
-    return Status::OK();
-  }
+// Implements atomic binary operations using atomic compare-and-swap
+// (atomicCAS) as follows:
+//   1. Reads the value from the memory pointed to by output_address and
+//     records it as old_output.
+//   2. Uses old_output as one of the source operand to perform the binary
+//     operation and stores the result in new_output.
+//   3. Calls atomicCAS which implements compare-and-swap as an atomic
+//     operation. In particular, atomicCAS reads the value from the memory
+//     pointed to by output_address, and compares the value with old_output. If
+//     the two values equal, new_output is written to the same memory location
+//     and true is returned to indicate that the atomic operation succeeds.
+//     Otherwise, the new value read from the memory is returned. In this case,
+//     the new value is copied to old_output, and steps 2. and 3. are repeated
+//     until atomicCAS succeeds.
+//
+// On Nvidia GPUs, atomicCAS can only operate on 32 bit and 64 bit integers. If
+// the element type of the binary operation is 32 bits or 64 bits, the integer
+// type of the same size is used for the atomicCAS operation. On the other hand,
+// if the element type is smaller than 32 bits, int32 is used for the atomicCAS
+// operation. In this case, atomicCAS reads and writes 32 bit values from
+// the memory, which is larger than the memory size required by the original
+// atomic binary operation. We mask off the last two bits of the output_address
+// and use the result as an address to read the 32 bit values from the memory.
+// This can avoid out of bound memory accesses if tensor buffers are 4 byte
+// aligned and have a size of 4N, an assumption that the runtime can guarantee.
+//
+// The pseudo code is shown below. Variables *_address are pointers to a memory
+// region with a size equal to the size of the atomicCAS operation, with the
+// exception that new_output_address is a pointer to a memory region with a size
+// equal to the element size of the binary operation.
+//
+//   element_size = sizeof(element_type);
+//   atomic_size = max(32, element_size);
+//   cas_new_output_address = alloca(atomic_size);
+//   cas_old_output_address = alloca(atomic_size);
+//   if (atomic_size != element_size) {
+//     atomic_address = output_address & ((int64)(-4));
+//     new_output_address = cas_new_output_address + (output_address & 3);
+//   } else {
+//     atomic_address = output_address;
+//     new_output_address = cas_new_output_address;
+//   }
+//
+//   *cas_old_output_address = *atomic_address;
+//   do {
+//     *cas_new_output_address = *cas_old_output_address;
+//     *new_output_address = operation(*new_output_address, *source_address);
+//     (*cas_old_output_address, success) =
+//       atomicCAS(atomic_address, *cas_old_output_address,
+//       *cas_new_output_address);
+//   } while (!success);
+//
+Status IrEmitter::EmitAtomicOperationUsingCAS(const HloComputation& computation,
+                                              llvm::Value* output_address,
+                                              llvm::Value* source_address) {
+  llvm::PointerType* output_address_type =
+      llvm::dyn_cast<llvm::PointerType>(output_address->getType());
+  CHECK_NE(output_address_type, nullptr);
+
+  // element_type is the data type for the binary operation.
+  llvm::Type* element_type = output_address_type->getPointerElementType();
+  int element_size = llvm_ir::GetSizeInBits(element_type);
+  llvm::Type* element_address_type = element_type->getPointerTo();
+
+  int atomic_size = (element_size < 32) ? 32 : element_size;
+  llvm::Type* atomic_type = ir_builder_.getIntNTy(atomic_size);
+  llvm::Type* atomic_address_type =
+      atomic_type->getPointerTo(output_address_type->getPointerAddressSpace());
+
+  // cas_old_output_address and cas_new_output_address point to the scratch
+  // memory where we store the old and new values for the repeated atomicCAS
+  // operations.
+  llvm::Value* cas_old_output_address = ir_builder_.CreateAlloca(
+      atomic_type, /*ArraySize=*/nullptr, "cas_old_output_address");
+  llvm::Value* cas_new_output_address = ir_builder_.CreateAlloca(
+      atomic_type, /*ArraySize=*/nullptr, "cas_new_output_address");
 
-  // Other binary computations can be made atomic as following (labels are basic
-  // block names used in the IR emitting code later).
-  //
-  // atomic_op_loop_preheader:
-  //   ...
-  //   source = *source_address;
-  //   old_output = *output_address;
-  //   do {
-  // atomic_op_loop_body_entry:
-  //     new_output = computation(old_output, source);
-  //     (old_output, success) =
-  //         atomicCAS(output_address, old_output, new_output);
-  //   } while (!success);
-  //
-  // atomic_op_loop_exit:
-  //   ...
-  //
-  // TODO(jingyue): Consider encapsulate the logic of emitting control flow to
-  // something similar to llvm_ir::ForLoop.
-  //
   // Emit preparation code to the preheader.
   llvm::BasicBlock* loop_preheader_bb = ir_builder_.GetInsertBlock();
-  llvm::Type* element_ir_type =
-      output_address->getType()->getPointerElementType();
-  // old_output = *output_address;
-  llvm::Value* old_output_location = ir_builder_.CreateAlloca(
-      element_ir_type, /*ArraySize=*/nullptr, "old_output_location");
-  ir_builder_.CreateStore(ir_builder_.CreateLoad(output_address, "old_output"),
-                          old_output_location);
+
+  llvm::Value* atomic_memory_address;
+  // binop_output_address points to the scratch memory that stores the
+  // result of the binary operation.
+  llvm::Value* binop_output_address;
+  if (element_size < 32) {
+    // Assume the element size is an integer number of bytes.
+    CHECK_EQ((element_size % sizeof(char)), 0);
+    llvm::Type* address_int_type =
+        module_->getDataLayout().getIntPtrType(output_address_type);
+    atomic_memory_address =
+        ir_builder_.CreatePtrToInt(output_address, address_int_type);
+    llvm::Value* mask = llvm::ConstantInt::get(address_int_type, 3);
+    llvm::Value* offset = ir_builder_.CreateAnd(atomic_memory_address, mask);
+    mask = llvm::ConstantInt::get(address_int_type, -4);
+    atomic_memory_address = ir_builder_.CreateAnd(atomic_memory_address, mask);
+    atomic_memory_address =
+        ir_builder_.CreateIntToPtr(atomic_memory_address, atomic_address_type);
+    binop_output_address = ir_builder_.CreateAdd(
+        ir_builder_.CreatePtrToInt(cas_new_output_address, address_int_type),
+        offset);
+    binop_output_address =
+        ir_builder_.CreateIntToPtr(binop_output_address, element_address_type);
+  } else {
+    atomic_memory_address =
+        ir_builder_.CreateBitCast(output_address, atomic_address_type);
+    binop_output_address =
+        ir_builder_.CreateBitCast(cas_new_output_address, element_address_type);
+  }
+
+  // Use the value from the memory that atomicCAS operates on to initialize
+  // cas_old_output.
+  llvm::Value* cas_old_output =
+      ir_builder_.CreateLoad(atomic_memory_address, "cas_old_output");
+  ir_builder_.CreateStore(cas_old_output, cas_old_output_address);
+
   llvm::BasicBlock* loop_exit_bb = loop_preheader_bb->splitBasicBlock(
       ir_builder_.GetInsertPoint(), "atomic_op_loop_exit");
-
-  // Emit the body of the loop that repeatedly invokes atomicCAS.
   llvm::BasicBlock* loop_body_bb =
       llvm::BasicBlock::Create(ir_builder_.getContext(), "atomic_op_loop_body",
                                ir_builder_.GetInsertBlock()->getParent());
   ir_builder_.SetInsertPoint(loop_body_bb);
   // Change preheader's successor from loop_exit_bb to loop_body_bb.
   loop_preheader_bb->getTerminator()->setSuccessor(0, loop_body_bb);
-  // new_output = computation(old_output, source);
-  llvm::Value* new_output_location = ir_builder_.CreateAlloca(
-      element_ir_type, /*ArraySize=*/nullptr, "new_output_location");
+
+  // Emit the body of the loop that repeatedly invokes atomicCAS.
+  //
+  // Use cas_old_output to initialize cas_new_output.
+  cas_old_output =
+      ir_builder_.CreateLoad(cas_old_output_address, "cas_old_output");
+  ir_builder_.CreateStore(cas_old_output, cas_new_output_address);
+  // Emits code to calculate new_output = operation(old_output, source);
   TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
-      computation, {old_output_location, source_address}, new_output_location));
-
-  // (old_output, success) = atomicCAS(output_address, old_output, new_output);
-  int num_bits = llvm_ir::GetSizeInBits(element_ir_type);
-  llvm::Type* element_int_ir_type = ir_builder_.getIntNTy(num_bits);
-  // cmpxchg accepts integer only, and bitcast refuses to operate on aggregate
-  // types, so we bitcast load and store addresses to intN* of the same bit
-  // width.
-  llvm::Value* old_output = ir_builder_.CreateLoad(
-      ir_builder_.CreateBitCast(old_output_location,
-                                element_int_ir_type->getPointerTo()),
-      "old_output");
-  llvm::Value* new_output = ir_builder_.CreateLoad(
-      ir_builder_.CreateBitCast(new_output_location,
-                                element_int_ir_type->getPointerTo()),
-      "new_output");
+      computation, {binop_output_address, source_address},
+      binop_output_address));
+
+  llvm::Value* cas_new_output =
+      ir_builder_.CreateLoad(cas_new_output_address, "cas_new_output");
+
+  // Emit code to perform the atomicCAS operation
+  // (cas_old_output, success) = atomicCAS(memory_address, cas_old_output,
+  //                                       cas_new_output);
   llvm::Value* ret_value = ir_builder_.CreateAtomicCmpXchg(
-      ir_builder_.CreateBitCast(output_address,
-                                element_int_ir_type->getPointerTo()),
-      old_output, new_output, llvm::AtomicOrdering::SequentiallyConsistent,
+      atomic_memory_address, cas_old_output, cas_new_output,
+      llvm::AtomicOrdering::SequentiallyConsistent,
       llvm::AtomicOrdering::SequentiallyConsistent);
-  // cmpxchg returns a pair. The first element is the original value at
-  // output_address and the second element is whether the swap is successful.
+
+  // Extract the memory value returned from atomicCAS and store it as
+  // cas_old_output.
   ir_builder_.CreateStore(
-      ir_builder_.CreateExtractValue(ret_value, 0, "old_output"),
-      ir_builder_.CreateBitCast(old_output_location,
-                                element_int_ir_type->getPointerTo()));
+      ir_builder_.CreateExtractValue(ret_value, 0, "cas_old_output"),
+      cas_old_output_address);
+  // Extract the success bit returned from atomicCAS and generate a
+  // conditional branch on the success bit.
   ir_builder_.CreateCondBr(
       ir_builder_.CreateExtractValue(ret_value, 1, "success"), loop_exit_bb,
       loop_body_bb);
 
-  // Restore the insertion point to the exit basic block so that the caller of
+  // Set the insertion point to the exit basic block so that the caller of
   // this method can continue emitting code to the right place.
   SetToFirstInsertPoint(loop_exit_bb, &ir_builder_);
   return Status::OK();
 }
 
+Status IrEmitter::EmitAtomicOperationForNestedComputation(
+    const HloComputation& computation, llvm::Value* output_address,
+    llvm::Value* source_address) {
+  if (computation.num_parameters() != 2) {
+    // TODO(b/30258929): We only accept binary computations so far.
+    return Unimplemented(
+        "We only support atomic functions with exactly two parameters, but "
+        "computation %s has %lld.",
+        computation.name().c_str(), computation.num_parameters());
+  }
+
+  if (MaybeEmitDirectAtomicOperation(computation, output_address,
+                                     source_address)) {
+    return Status::OK();
+  }
+
+  return EmitAtomicOperationUsingCAS(computation, output_address,
+                                     source_address);
+}
+
 Status IrEmitter::HandleSelect(HloInstruction* select) {
   auto pred = select->operand(0);
   auto on_true = select->operand(1);
@@ -518,10 +607,17 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
       "Hit a case for convolution that is not implemented on GPU.");
 }
 
+Status IrEmitter::HandleFft(HloInstruction* fft) {
+  if (ShapeUtil::HasZeroElements(fft->shape())) {
+    // Emit no code for an empty output.
+    return Status::OK();
+  }
+  return Unimplemented("Hit a case for fft that is not implemented on GPU.");
+}
+
 Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) {
   // TODO(b/33011107): Support cross replica sum on GPU.
-  return Unimplemented(
-      "Cross replica sum not implemented on GPU. See b/33011107.");
+  return Unimplemented("CrossReplicaSum is not implemented on GPU.");
 }
 
 Status IrEmitter::HandleParameter(HloInstruction* parameter) {
@@ -615,11 +711,13 @@ Status IrEmitter::HandleCustomCall(HloInstruction*) {
 }
 
 Status IrEmitter::HandleInfeed(HloInstruction*) {
-  return Unimplemented("Infeed is not supported on GPU (b/30467474).");
+  // TODO(b/30467474): Implement infeed on GPU.
+  return Unimplemented("Infeed is not supported on GPU.");
 }
 
 Status IrEmitter::HandleOutfeed(HloInstruction*) {
-  return Unimplemented("Outfeed is not supported on GPU (b/34359662).");
+  // TODO(b/34359662): Implement outfeed on GPU.
+  return Unimplemented("Outfeed is not supported on GPU.");
 }
 
 Status IrEmitter::HandleRng(HloInstruction* random) {
@@ -640,6 +738,29 @@ Status IrEmitter::HandleRng(HloInstruction* random) {
       .EmitLoop(IrName(random));
 }
 
+Status IrEmitter::HandleBatchNormInference(HloInstruction*) {
+  return Unimplemented(
+      "The GPU backend does not implement BatchNormInference directly.  It "
+      "should be lowered before IR emission to HLO-soup using "
+      "BatchNormRewriter or to a cudnn CustomCall using "
+      "CudnnBatchNormRewriter.");
+}
+
+Status IrEmitter::HandleBatchNormTraining(HloInstruction*) {
+  return Unimplemented(
+      "The GPU backend does not implement BatchNormTraining directly.  It "
+      "should be lowered before IR emission to HLO-soup using "
+      "BatchNormRewriter or to a cudnn CustomCall using "
+      "CudnnBatchNormRewriter.");
+}
+
+Status IrEmitter::HandleBatchNormGrad(HloInstruction*) {
+  return Unimplemented(
+      "The GPU backend does not implement BatchNormGrad directly.  It should "
+      "be lowered before IR emission to HLO-soup (using BatchNormRewriter) or "
+      "to a cudnn CustomCall using CudnnBatchNormRewriter.");
+}
+
 llvm_ir::IrArray::Index IrEmitter::EmitOperandArrayLoopNest(
     const llvm_ir::IrArray& operand_array, int64 reduction_dimension,
     tensorflow::StringPiece name_suffix, llvm_ir::ForLoopNest* loop_nest) {
@@ -648,8 +769,8 @@ llvm_ir::IrArray::Index IrEmitter::EmitOperandArrayLoopNest(
   // reduction dimension.
   std::vector<int64> dimensions;
   const Shape& shape = operand_array.GetShape();
-  for (int i = shape.layout().minor_to_major_size() - 1; i >= 0; --i) {
-    int64 dimension = shape.layout().minor_to_major(i);
+  for (int i = 0; i < LayoutUtil::MinorToMajor(shape).size(); ++i) {
+    int64 dimension = LayoutUtil::Major(shape.layout(), i);
     if (dimension != reduction_dimension) {
       dimensions.push_back(dimension);
     }
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
index 9c01f5b7c72f429822300af28bfd5261150d33d1..b0accc08d479258d65a18202122e4c9e90ff78d0 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
@@ -13,19 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// An XLA HLO graph may contain multiple computations. These computations
-// fall into two types, nested and unnested. We translate each nested
-// computation (e.g. the computation operand of a Map operator) to a device
-// function. For each unnested computation composed of top-level
-// HloInstructions, we generate a CUDA kernel for each HloInstruction.
-//
-// This file declares classes that translate an XLA HLO graph to LLVM IR for
-// GPUs. IrEmitterNested emits LLVM IR for nested computations, and
-// IrEmitterUnnested for unnested computations. The logic of emitting LLVM IR
-// for each individual HloInstruction is largely the same between these two
-// classes. Therefore, we implement the common logic in the Handle* functions in
-// the superclass IrEmitter.
-
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_H_
 
@@ -60,25 +47,35 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-// This class is the top-level API for the XLA HLO --> LLVM IR compiler.
-// It implements the DfsHloVisitor interface and emits an LLVM IR program that
-// implements the input HLO graph.
+// Abstract base class for translating HLO graphs to LLVM IR for a GPU.
+//
+// There are two concrete subclasses of IrEmitter: IrEmitterNested and
+// IrEmitterUnnested.  In the unnested variety, each HLO gets its own kernel
+// function, whereas in the nested version the whole computation is emitted as
+// one *non-kernel* function.
+//
+// In XLA, kernel functions never call other kernel functions.  This means that
+// if we have a kernel -- e.g. implementing a kReduce HLO -- that wants to use
+// an HLO computation as a "subroutine" -- e.g. the HLO computation that
+// specifies how to reduce two elements -- then the subroutine computation must
+// be emitted using IrEmitterNested.
 //
-// Note: if `T` is a subclass of `IrEmitter` and a handler is not overridden in
-//       either `IrEmitter` or `T`, the handler in `DfsHloVisitorWithDefault`
-//       calls `T::DefaultAction`.
+// Fusion nodes are a special case.  A fusion node is emitted using
+// IrEmitterUnnested, but the code is generated using FusedIrEmitter, which is
+// not a subclass of gpu::IrEmitter, and in fact is better understood as an IR
+// generator generator.  See comments on that class.
 class IrEmitter : public DfsHloVisitorWithDefault {
  public:
   IrEmitter(const IrEmitter&) = delete;
   IrEmitter& operator=(const IrEmitter&) = delete;
 
-  // The following methods implement the DfsHloVisitorWithDefault interface.
   Status DefaultAction(HloInstruction* hlo) override;
   Status HandleConstant(HloInstruction* constant) override;
   Status HandleBitcast(HloInstruction* bitcast) override;
   Status HandleGetTupleElement(HloInstruction* get_tuple_element) override;
   Status HandleDot(HloInstruction* dot) override;
   Status HandleConvolution(HloInstruction* convolution) override;
+  Status HandleFft(HloInstruction* fft) override;
   Status HandleCrossReplicaSum(HloInstruction* crs) override;
   Status HandleInfeed(HloInstruction* infeed) override;
   Status HandleOutfeed(HloInstruction* outfeed) override;
@@ -95,6 +92,9 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   Status HandleCall(HloInstruction* call) override;
   Status HandleCustomCall(HloInstruction* custom_call) override;
   Status HandleRng(HloInstruction* random) override;
+  Status HandleBatchNormInference(HloInstruction* batch_norm) override;
+  Status HandleBatchNormTraining(HloInstruction* batch_norm) override;
+  Status HandleBatchNormGrad(HloInstruction* batch_norm) override;
 
   Status FinishVisit(HloInstruction* root) override { return Status::OK(); }
 
@@ -185,9 +185,16 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   // be simply implemented using an LLVM atomic instruction. If "computation" is
   // one of this kind, emits code to do that and returns true; otherwise,
   // returns false.
-  bool MaybeEmitSpecialAtomicOperation(const HloComputation& computation,
-                                       llvm::Value* output_address,
-                                       llvm::Value* source_address);
+  bool MaybeEmitDirectAtomicOperation(const HloComputation& computation,
+                                      llvm::Value* output_address,
+                                      llvm::Value* source_address);
+
+  // A helper method for EmitAtomicOperationForNestedComputation. It implements
+  // binary atomic operations using atomicCAS with special handling to support
+  // small data types.
+  Status EmitAtomicOperationUsingCAS(const HloComputation& computation,
+                                     llvm::Value* output_address,
+                                     llvm::Value* source_address);
 
   StatusOr<llvm::Value*> ComputeNestedElement(
       const HloComputation& computation,
@@ -206,185 +213,6 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   std::map<const HloComputation*, llvm::Function*> computation_to_ir_function_;
 };
 
-// Emits LLVM IR for unnested computations. Each HloInstruction is translated to
-// a separate CUDA kernel. These kernels are inserted into the resultant module
-// sorted in reverse postorder of the XLA HLO graph.
-class IrEmitterUnnested : public IrEmitter {
- public:
-  IrEmitterUnnested(const HloModuleConfig& hlo_module_config,
-                    const HloComputation* hlo_computation,
-                    IrEmitterContext* ir_emitter_context);
-  IrEmitterUnnested(const IrEmitterUnnested&) = delete;
-  IrEmitterUnnested& operator=(const IrEmitterUnnested&) = delete;
-
-  // Transfers the ownship of thunk_sequence_ out.
-  std::unique_ptr<ThunkSequence> ConsumeThunkSequence() {
-    return std::move(thunk_sequence_);
-  }
-
-  Status DefaultAction(HloInstruction* hlo) override;
-
-  // IrEmitterUnnested handles the following instructions differently from
-  // IrEmitter.
-  Status HandleCopy(HloInstruction* copy) override;
-  Status HandleConvolution(HloInstruction* convolution) override;
-  Status HandleDot(HloInstruction* dot) override;
-  Status HandleFusion(HloInstruction* fusion) override;
-  Status HandleGetTupleElement(HloInstruction* get_tuple_element) override;
-  Status HandleReduce(HloInstruction* reduce) override;
-  Status HandleSelectAndScatter(HloInstruction* instruction) override;
-  Status HandleTuple(HloInstruction* tuple) override;
-  Status HandleWhile(HloInstruction* xla_while) override;
-  Status HandleInfeed(HloInstruction* xla_infeed) override;
-  Status HandleRng(HloInstruction* random) override;
-  Status HandleSelect(HloInstruction* select) override;
-
-  Status EmitTargetElementLoop(
-      const HloInstruction& hlo,
-      const llvm_ir::ElementGenerator& body_emitter) override;
-
-  // Same as `EmitTargetElementLoop`, but in given `thunk` rather than
-  // `LastThunk()`.
-  Status EmitTargetElementLoopInThunk(
-      const HloInstruction& hlo, const llvm_ir::ElementGenerator& body_emitter,
-      KernelThunk* thunk);
-
- private:
-  // Builds the appropriate thunk for the instruction hlo and returns the owning
-  // pointer to it. The caller needs to make sure `inst` outlives the lifetime
-  // of the returned Thunk object.
-  std::unique_ptr<Thunk> BuildThunk(const HloInstruction* hlo);
-
-  // Builds the prototype of the IR kernel for `inst` and adds it to the module.
-  llvm::Function* BuildKernelPrototype(
-      const HloInstruction& inst,
-      tensorflow::gtl::ArraySlice<const HloInstruction*> escaped_hlos);
-
-  // Emits the base pointers for `hlo` and its operands. `io_hlos` will store
-  // all input/output HLOs among `hlo` and its operands.
-  llvm::Function* EmitBasePointersForHloAndItsOperands(
-      const HloInstruction& hlo, std::vector<const HloInstruction*>* io_hlos);
-
-  // EmitColumnReduction and EmitRowReduction emit code for column and row
-  // reduction of a matrix and/or 3D tensor. Row and column reduction have
-  // different memory access pattern, so for performance their implementations
-  // are significantly different.
-  //
-  // Emits code that reduces a matrix of shape [height x width] to a vector of
-  // [width]. Other parameters have the same meaning as those of
-  // `EmitReductionToVector`. Note that input shape might not be
-  // [height x width], but can be bitcast to [height x weight] with "height"
-  // being the major dimension.
-  Status EmitColumnReduction(int64 height, int64 width, HloInstruction* reduce,
-                             const Shape& input_shape,
-                             const llvm_ir::ElementGenerator& input_gen,
-                             const llvm_ir::ElementGenerator& init_value_gen,
-                             HloComputation* reducer);
-
-  // Emits code that reduces a 3D tensor of shape [depth x height x width] to a
-  // vector of shape [height]. Other parameters have the same meaning as those
-  // of `EmitReductionToVector`. Note that input shape might not be
-  // [depth x height x width], but can be bitcast to [depth x height x weight]
-  // with "depth" being the most major dimension.
-  Status EmitRowReduction(int64 depth, int64 height, int64 width,
-                          HloInstruction* reduce, const Shape& input_shape,
-                          const llvm_ir::ElementGenerator& input_gen,
-                          const llvm_ir::ElementGenerator& init_value_gen,
-                          HloComputation* reducer);
-
-  // Figures out whether `reduce` is a row or column reduction, and which
-  // dimensions to reduce, and calls either `EmitRowReduction` or
-  // `EmitColumnReduction` as appropriate. `input_shape` is the shape of the
-  // input array, which is the operand of the Reduce instruction if unfused or
-  // of the Fusion instruction if fused. `input_gen` and `init_value_gen`
-  // generate elements of the input and the initial value. Other parameters mean
-  // the same as for `HandleReduce`.
-  //
-  // Prerequisite: `IsReductionToVector(*reduce)`
-  Status EmitReductionToVector(
-      HloInstruction* reduce, const Shape& input_shape,
-      const llvm_ir::ElementGenerator& input_gen,
-      const llvm_ir::ElementGenerator& init_value_gen,
-      tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce,
-      HloComputation* reducer);
-
-  // Emits code to initialize buffer of `inst` in given `thunk`.
-  Status EmitInitializer(const HloInstruction* inst, KernelThunk* thunk);
-
-  // Returns a KernelThunk that invokes the kernel emitted for `inst`. The
-  // caller needs to make sure `inst` outlives the lifetime of the returned
-  // Thunk object.
-  std::unique_ptr<Thunk> BuildKernelThunk(const HloInstruction* inst);
-
-  // Returns a ConvolutionThunk that calls DNN to implement `inst`.
-  std::unique_ptr<Thunk> BuildConvolutionThunk(const HloInstruction* inst);
-
-  // Returns a GemmThunk that calls gemm to implement `inst`. The caller needs
-  // to make sure `inst` outlives the lifetime of the returned Thunk object.
-  std::unique_ptr<Thunk> BuildGemmThunk(const HloInstruction* inst);
-
-  // Returns a thunk that calls host-to-device cuMemcpy to implement `inst`.
-  std::unique_ptr<Thunk> BuildHostToDeviceCopyThunk(const HloInstruction* inst);
-
-  // Returns a thunk that calls device-to-device cuMemcpy to implement `inst`.
-  std::unique_ptr<Thunk> BuildDeviceToDeviceCopyThunk(
-      const HloInstruction* inst);
-
-  // Returns an InfeedThunk that performs device-to-device memcpy to implement
-  // `inst`.
-  std::unique_ptr<Thunk> BuildInfeedThunk(const HloInstruction* inst);
-
-  // Returns a WhileThunk that invokes thunk sequences for 'condition' and
-  // 'body' sub-computations of while instruction 'hlo'.
-  std::unique_ptr<Thunk> BuildWhileThunk(const HloInstruction* hlo);
-
-  // Returns a ForThunk which executes 'loop_limit' invocations of a thunk
-  // sequence from the 'body' sub-computation of the while instruction 'hlo'.
-  std::unique_ptr<Thunk> BuildForThunk(const HloInstruction* hlo,
-                                       const int64 loop_limit);
-
-  Status Postprocess(HloInstruction* hlo) override;
-
-  // Returns the last generated thunk.
-  Thunk* LastThunk() const { return thunk_sequence_->back().get(); }
-
-  // The thunk sequence this IrEmitter generates for the input computation.
-  std::unique_ptr<ThunkSequence> thunk_sequence_;
-
-  // The HloComputation that this IrEmitter emits code for.
-  const HloComputation* hlo_computation_;
-};
-
-// Emits LLVM IR for a nested computation to the resultant function.
-class IrEmitterNested : public IrEmitter {
- public:
-  // Constructs an LLVM IR emitter for a nested HLO computation. `function` is
-  // the containing IR function this emitter produces IR to. See
-  // IrEmitter::IrEmitter for the meanings of other arguments.
-  IrEmitterNested(const HloModuleConfig& hlo_module_config,
-                  const HloComputation& nested_computation,
-                  IrEmitterContext* ir_emitter_context);
-  IrEmitterNested(const IrEmitterNested&) = delete;
-  IrEmitterNested& operator=(const IrEmitterNested&) = delete;
-
-  // Overrides the default empty implementation. Binds the given instruction
-  // "parameter" with the parameter of the IR function.
-  Status HandleParameter(HloInstruction* parameter) override;
-
-  llvm::Function* GetEmittedFunction() const { return emitted_function_; }
-
-  Status EmitTargetElementLoop(
-      const HloInstruction& hlo,
-      const llvm_ir::ElementGenerator& body_emitter) override;
-
- private:
-  llvm::Function* EmitBasePointersForNestedComputation(
-      const HloComputation& nested_computation,
-      std::vector<const HloInstruction*>* io_hlos);
-
-  llvm::Function* emitted_function_;
-};
-
 }  // namespace gpu
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc
index 5225ff36ff3a8a1b049479c34aa301de8724f73e..71aada080ae8df70bffce3e1854b5fbd833efd23 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc
@@ -16,12 +16,13 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
+#include "tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h"
+
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h"
-#include "tensorflow/compiler/xla/service/gpu/ir_emitter.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h
new file mode 100644
index 0000000000000000000000000000000000000000..ca11cf2c182b0600b931b19d2d7fb3983e36441a
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h
@@ -0,0 +1,72 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_NESTED_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_NESTED_H_
+
+#include "llvm/IR/Function.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emitter.h"
+
+namespace xla {
+namespace gpu {
+
+// Emits LLVM IR for a "nested computation" into a non-kernel device function.
+//
+// This is used to emit code for HloComputations that don't require a separate
+// kernel call.  For example, IrEmitterNested is used to emit code for a kReduce
+// HLO's elementwise reduction computation.  Notably, IrEmitterNested is *not*
+// used to emit code for fusion nodes -- fusion nodes use FusedIrEmitter, which
+// is a different beast altogether.
+//
+// IrEmitterNested generates a non-kernel function with the following
+// parameters:
+//
+//   - N pointers to the buffers of each of the N parameters to the computation,
+//   - a pointer to the output buffer of the computation, and
+//   - a pointer to the top-level temp buffer.
+//
+class IrEmitterNested : public IrEmitter {
+ public:
+  // Constructs an LLVM IR emitter for a nested HLO computation. `function` is
+  // the containing IR function this emitter produces IR to. See
+  // IrEmitter::IrEmitter for the meanings of other arguments.
+  IrEmitterNested(const HloModuleConfig& hlo_module_config,
+                  const HloComputation& nested_computation,
+                  IrEmitterContext* ir_emitter_context);
+  IrEmitterNested(const IrEmitterNested&) = delete;
+  IrEmitterNested& operator=(const IrEmitterNested&) = delete;
+
+  // Overrides the default empty implementation. Binds the given instruction
+  // "parameter" with the parameter of the IR function.
+  Status HandleParameter(HloInstruction* parameter) override;
+
+  llvm::Function* GetEmittedFunction() const { return emitted_function_; }
+
+  Status EmitTargetElementLoop(
+      const HloInstruction& hlo,
+      const llvm_ir::ElementGenerator& body_emitter) override;
+
+ private:
+  llvm::Function* EmitBasePointersForNestedComputation(
+      const HloComputation& nested_computation,
+      std::vector<const HloInstruction*>* io_hlos);
+
+  llvm::Function* emitted_function_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_NESTED_H_
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 1b863c9e3c51d6e757751154abd653cd1fdcb8a7..aa2a0a9800bab142481e1def785c9052526fcd8c 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -17,6 +17,8 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h"
+
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Function.h"
@@ -28,14 +30,18 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
+#include "tensorflow/compiler/xla/service/gpu/conditional_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/copy_thunk.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h"
+#include "tensorflow/compiler/xla/service/gpu/fft_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/for_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/gemm_thunk.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
 #include "tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h"
 #include "tensorflow/compiler/xla/service/gpu/infeed_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
-#include "tensorflow/compiler/xla/service/gpu/ir_emitter.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h"
 #include "tensorflow/compiler/xla/service/gpu/kernel_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h"
@@ -69,6 +75,10 @@ namespace gpu {
 namespace {
 
 using llvm_ir::IrName;
+using tensorflow::gtl::ArraySlice;
+using tensorflow::gtl::nullopt;
+using tensorflow::gtl::optional;
+using tensorflow::strings::StrCat;
 
 // If a dimensions is smaller than this, untiled transposition may be more
 // efficient.
@@ -123,12 +133,46 @@ void UpdateLaunchDimensions(const LaunchDimensions& launch_dims, Thunk* thunk,
   llvm::ConstantInt* threads_per_block_ir_value = llvm::ConstantInt::get(
       llvm::IntegerType::get(llvm_context, /*NumBits=*/32),
       launch_dims.threads_per_block());
+  // Our launch bounds are exact, so we can specify them as reqntidx rather than
+  // maxntidx.
   nvvm_annotations_node->addOperand(llvm::MDNode::get(
       llvm_context,
       {llvm::ConstantAsMetadata::get(ir_kernel),
-       llvm::MDString::get(llvm_context, "maxntidx"),
+       llvm::MDString::get(llvm_context, "reqntidx"),
        llvm::ConstantAsMetadata::get(threads_per_block_ir_value)}));
 }
+
+// Tries to get a Slice for the given instruction at the given index, but
+// returns nullopt if we might not know the slice's address at runtime without
+// dereferencing a containing tuple.
+//
+// In particular, when XLA accepts a parameter of tuple type, the caller has the
+// option of telling XLA what are the values inside of the tuple, or just giving
+// XLA a pointer to the top-level tuple and letting us chase the pointers on the
+// GPU.  We therefore cannot rely having these pointers to parameter sub-buffers
+// being present when we run the program.
+optional<BufferAllocation::Slice> GetKnownAtRuntimeSlice(
+    const HloInstruction* instr, const ShapeIndex& index,
+    const BufferAssignment& buffer_assn) {
+  auto maybe_slice = buffer_assn.GetUniqueSlice(instr, index);
+  if (!maybe_slice.ok()) {
+    return nullopt;
+  }
+  // BufferAllocation gives a slice and alloc to every buffer accessed by XLA,
+  // but we don't necessarily know the runtime address of sub-buffers of input
+  // parameters.
+  const BufferAllocation::Slice& slice = maybe_slice.ValueOrDie();
+  const BufferAllocation* alloc = slice.allocation();
+  if (alloc->IsInputOrOutput() && !alloc->maybe_live_out() &&
+      !alloc->param_shape_index().empty()) {
+    return nullopt;
+  }
+
+  // Otherwise, we will know the address of this slice at runtime without having
+  // to dereference a tuple.
+  return slice;
+}
+
 }  // namespace
 
 IrEmitterUnnested::IrEmitterUnnested(const HloModuleConfig& hlo_module_config,
@@ -146,16 +190,20 @@ Status IrEmitterUnnested::Postprocess(HloInstruction* hlo) {
 }
 
 namespace {
-bool ImplementedAsHostToDeviceMemcpy(const HloInstruction& hlo) {
-  // `hlo` needs to satisfy three conditions to be implemented as a
+bool ImplementedAsHostToDeviceMemcpy(const BufferAssignment& buffer_assignment,
+                                     const HloInstruction& hlo) {
+  // `hlo` needs to satisfy the following conditions to be implemented as a
   // host-to-device cuMemcpy.
   //
   // 1. `hlo` is a kCopy instruction.
   // 2. `hlo`'s only operand is a kConstant instruction.
   // 3. `hlo` and its operand have the same shape (thus the same layout too).
+  // 4. The address of `hlo`'s buffer is known at runtime (without dereferencing
+  //    pointers in a tuple).
   return hlo.opcode() == HloOpcode::kCopy &&
          hlo.operand(0)->opcode() == HloOpcode::kConstant &&
-         ShapeUtil::Equal(hlo.operand(0)->shape(), hlo.shape());
+         ShapeUtil::Equal(hlo.operand(0)->shape(), hlo.shape()) &&
+         GetKnownAtRuntimeSlice(&hlo, {}, buffer_assignment).has_value();
 }
 
 bool ImplementedAsDeviceToDeviceMemcpy(
@@ -169,52 +217,50 @@ bool ImplementedAsDeviceToDeviceMemcpy(
   //    instance) which means the source buffer also resides on the device.
   return hlo.opcode() == HloOpcode::kCopy &&
          ShapeUtil::Equal(hlo.operand(0)->shape(), hlo.shape()) &&
-         buffer_assignment.HasTopLevelAllocation(hlo.operand(0));
+         GetKnownAtRuntimeSlice(&hlo, {}, buffer_assignment).has_value() &&
+         GetKnownAtRuntimeSlice(hlo.operand(0), {}, buffer_assignment)
+             .has_value();
 }
 }  // namespace
 
 llvm::Function* IrEmitterUnnested::BuildKernelPrototype(
     const HloInstruction& inst,
-    tensorflow::gtl::ArraySlice<const HloInstruction*> escaped_hlos) {
+    tensorflow::gtl::ArraySlice<const BufferAllocation*> args) {
   // Compute the kernel name. The opcode string may contain "-" which cannot be
   // in a PTX function name, so sanitize the name before uniquifying it.
   string kernel_name = ir_emitter_context_->name_uniquer()->GetUniqueName(
       llvm_ir::SanitizeFunctionName(inst.name()));
 
-  // Create the kernel and adds it to the module.
+  // Create the kernel and add it to the module.
   llvm::Module* module = ir_emitter_context_->llvm_module();
   llvm::LLVMContext& context = module->getContext();
-  int num_escaped_hlos = escaped_hlos.size();
   llvm::FunctionType* kernel_type = llvm::FunctionType::get(
-      llvm::Type::getVoidTy(context),  // The type of function result.
-      std::vector<llvm::Type*>(num_escaped_hlos + 1,
-                               ir_builder_.getInt8PtrTy()),
-      false);  // Not a variadic argument function.
+      /*Result=*/llvm::Type::getVoidTy(context),
+      std::vector<llvm::Type*>(args.size(), ir_builder_.getInt8PtrTy()),
+      /*isVarArg=*/false);
   llvm::Function* kernel =
       llvm::Function::Create(kernel_type, llvm::GlobalValue::ExternalLinkage,
                              kernel_name.c_str(), module);
 
-  // Add dereferenceable information to each of the escaped HLO parameters.
-  for (size_t arg_no = 0; arg_no < escaped_hlos.size(); ++arg_no) {
-    const HloInstruction* escaped_hlo = escaped_hlos[arg_no];
-    const Shape& escaped_hlo_shape = escaped_hlo->shape();
-    int64 escaped_hlo_size = llvm_ir::ByteSizeOf(
-        escaped_hlo_shape, ir_emitter_context_->llvm_module()->getDataLayout());
-    kernel->addDereferenceableAttr(arg_no + 1, escaped_hlo_size);
-  }
-
-  // The last argument is a pointer to the temporary buffer memory block.
-  // We know that it doesn't alias any of the escaped arguments (the inputs +
-  // the result).  We also know how many bytes can be dereferenced in it.
-  const llvm::Argument& temp_buffer = *std::prev(kernel->arg_end());
-  int64 temp_buffer_arg_no = temp_buffer.getArgNo();
-  int64 temp_allocation_total_size =
-      ir_emitter_context_->buffer_assignment().temp_allocation_total_size();
-  if (temp_allocation_total_size != 0) {
-    kernel->addDereferenceableAttr(temp_buffer_arg_no + 1,
-                                   temp_allocation_total_size);
+  // Add dereferenceable and alignment information to each of the kernel's
+  // parameters.
+  auto arg_it = kernel->arg_begin();
+  for (size_t arg_no = 0; arg_no < args.size(); ++arg_no) {
+    const BufferAllocation* alloc = args[arg_no];
+    llvm::Argument* fn_arg = &*arg_it;
+    ++arg_it;
+
+    kernel->addDereferenceableAttr(arg_no + 1, alloc->size());
+    kernel->addParamAttr(
+        arg_no, llvm::Attribute::get(context, llvm::Attribute::Alignment,
+                                     kCudaMallocAlignBytes));
+
+    if (alloc->IsPreallocatedTempBuffer()) {
+      fn_arg->setName("temp_buf");
+    } else {
+      fn_arg->setName(llvm_ir::AsStringRef(StrCat("alloc", alloc->index())));
+    }
   }
-  kernel->addAttribute(temp_buffer_arg_no + 1, llvm::Attribute::NoAlias);
 
   // TODO(b/65380986): Investigate if adding fast math flags for generated
   // kernels makes sense.
@@ -230,10 +276,9 @@ llvm::Function* IrEmitterUnnested::BuildKernelPrototype(
 
   // Update the insert point to the entry basic block.
   llvm::BasicBlock* entry_bb =
-      llvm::BasicBlock::Create(context,
-                               "entry",  // The name of the basic block.
-                               kernel);  // The parent/owner of "entry_bb".
-  // Emit a "return void" at entry_bb's end, and sets the insert point before
+      llvm::BasicBlock::Create(context, /*Name=*/"entry", /*Parent=*/kernel);
+
+  // Emit a "return void" at entry_bb's end, and set the insert point before
   // that return instruction.
   ir_builder_.SetInsertPoint(llvm::ReturnInst::Create(context, entry_bb));
 
@@ -246,6 +291,11 @@ Status IrEmitterUnnested::DefaultAction(HloInstruction* hlo) {
 }
 
 Status IrEmitterUnnested::HandleDot(HloInstruction* dot) {
+  const DotDimensionNumbers& dnums = dot->dot_dimension_numbers();
+  if (dnums.lhs_batch_dimensions_size() > 0 ||
+      dnums.rhs_batch_dimensions_size() > 0) {
+    return Unimplemented("Dot with batch dimensions not implemented.");
+  }
   if (ImplementedAsGemm(*dot)) {
     thunk_sequence_->emplace_back(BuildGemmThunk(dot));
     return Status::OK();
@@ -254,15 +304,191 @@ Status IrEmitterUnnested::HandleDot(HloInstruction* dot) {
   return IrEmitter::HandleDot(dot);
 }
 
+Status IrEmitterUnnested::HandleConditional(HloInstruction* conditional) {
+  thunk_sequence_->emplace_back(BuildConditionalThunk(conditional));
+  return Status::OK();
+}
+
 Status IrEmitterUnnested::HandleConvolution(HloInstruction* convolution) {
-  if (ImplementedAsDnnConvolution(*convolution)) {
-    thunk_sequence_->emplace_back(BuildConvolutionThunk(convolution));
-    return Status::OK();
-  }
   thunk_sequence_->emplace_back(BuildKernelThunk(convolution));
   return IrEmitter::HandleConvolution(convolution);
 }
 
+Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) {
+  // A CustomCall on the GPU backend can either be a custom-call to a
+  // user-supplied kernel, or a call into a library like cudnn.
+
+  // Lower custom-calls to cudnn batchnorm ops to specialized thunks.  It's part
+  // of the contract of these cudnn batchnorm calls that the epsilon and
+  // feature_index operands be constants.
+  if (custom_call->custom_call_target() ==
+      kCudnnBatchNormForwardInferenceCallTarget) {
+    const HloInstruction* epsilon = custom_call->operand(5);
+    CHECK(epsilon->IsConstant());
+    float epsilon_value = epsilon->literal().Get<float>({});
+
+    const HloInstruction* feature_index = custom_call->operand(6);
+    CHECK(feature_index->IsConstant());
+    int64 feature_index_value = feature_index->literal().Get<int64>({});
+
+    thunk_sequence_->emplace_back(
+        MakeUnique<CudnnBatchNormForwardInferenceThunk>(
+            /*operand=*/GetAllocationSlice(*custom_call->operand(0)),
+            /*scale=*/GetAllocationSlice(*custom_call->operand(1)),
+            /*offset=*/GetAllocationSlice(*custom_call->operand(2)),
+            /*mean=*/GetAllocationSlice(*custom_call->operand(3)),
+            /*variance=*/GetAllocationSlice(*custom_call->operand(4)),
+            /*epsilon=*/epsilon_value,
+            /*feature_index=*/feature_index_value,
+            /*output=*/GetAllocationSlice(*custom_call),
+            /*hlo=*/custom_call));
+    return Status::OK();
+  }
+
+  if (custom_call->custom_call_target() ==
+      kCudnnBatchNormForwardTrainingCallTarget) {
+    const HloInstruction* epsilon = custom_call->operand(3);
+    CHECK(epsilon->IsConstant());
+    float epsilon_value = epsilon->literal().Get<float>({});
+
+    const HloInstruction* feature_index = custom_call->operand(4);
+    CHECK(feature_index->IsConstant());
+    int64 feature_index_value = feature_index->literal().Get<int64>({});
+
+    // BatchNormTraining returns a tuple of three elements: data, calculated
+    // mean, and calculated 1/sqrt(variance + epsilon).
+    const auto& assn = ir_emitter_context_->buffer_assignment();
+    auto output_data = assn.GetUniqueSlice(custom_call, {0}).ValueOrDie();
+    auto output_mean = assn.GetUniqueSlice(custom_call, {1}).ValueOrDie();
+    auto output_inv_stddev = assn.GetUniqueSlice(custom_call, {2}).ValueOrDie();
+    thunk_sequence_->emplace_back(
+        MakeUnique<CudnnBatchNormForwardTrainingThunk>(
+            /*operand=*/GetAllocationSlice(*custom_call->operand(0)),
+            /*scale=*/GetAllocationSlice(*custom_call->operand(1)),
+            /*offset=*/GetAllocationSlice(*custom_call->operand(2)),
+            /*epsilon=*/epsilon_value,
+            /*feature_index=*/feature_index_value,
+            /*output_data=*/output_data,
+            /*output_mean=*/output_mean,
+            /*output_inv_stddev=*/output_inv_stddev,
+            /*output_tuple=*/GetAllocationSlice(*custom_call),
+            /*hlo=*/custom_call));
+    return Status::OK();
+  }
+
+  if (custom_call->custom_call_target() == kCudnnBatchNormBackwardCallTarget) {
+    const HloInstruction* epsilon = custom_call->operand(5);
+    CHECK(epsilon->IsConstant());
+    float epsilon_value = epsilon->literal().Get<float>({});
+
+    const HloInstruction* feature_index = custom_call->operand(6);
+    CHECK(feature_index->IsConstant());
+    int64 feature_index_value = feature_index->literal().Get<int64>({});
+
+    // BatchNormGrad returns a tuple of three elements: grad_data, grad_scale,
+    // grad_offset.
+    const auto& assn = ir_emitter_context_->buffer_assignment();
+    auto output_grad_data = assn.GetUniqueSlice(custom_call, {0}).ValueOrDie();
+    auto output_grad_scale = assn.GetUniqueSlice(custom_call, {1}).ValueOrDie();
+    auto output_grad_offset =
+        assn.GetUniqueSlice(custom_call, {2}).ValueOrDie();
+    thunk_sequence_->emplace_back(MakeUnique<CudnnBatchNormBackwardThunk>(
+        /*operand=*/GetAllocationSlice(*custom_call->operand(0)),
+        /*scale=*/GetAllocationSlice(*custom_call->operand(1)),
+        /*mean=*/GetAllocationSlice(*custom_call->operand(2)),
+        /*inv_stddev=*/GetAllocationSlice(*custom_call->operand(3)),
+        /*grad_output=*/GetAllocationSlice(*custom_call->operand(4)),
+        /*epsilon=*/epsilon_value,
+        /*feature_index=*/feature_index_value,
+        /*output_grad_data=*/output_grad_data,
+        /*output_grad_scale=*/output_grad_scale,
+        /*output_grad_offset=*/output_grad_offset,
+        /*output_tuple=*/GetAllocationSlice(*custom_call),
+        /*hlo=*/custom_call));
+    return Status::OK();
+  }
+
+  if (IsCustomCallToDnnConvolution(*custom_call)) {
+    const auto& assn = ir_emitter_context_->buffer_assignment();
+    const auto& lhs_shape = custom_call->operand(0)->shape();
+    const auto& rhs_shape = custom_call->operand(1)->shape();
+    const auto& conv_result_shape = custom_call->shape().tuple_shapes(0);
+    auto lhs_slice = GetAllocationSlice(*custom_call->operand(0));
+    auto rhs_slice = GetAllocationSlice(*custom_call->operand(1));
+    auto tuple_result_slice = GetAllocationSlice(*custom_call);
+    auto conv_result_slice = assn.GetUniqueSlice(custom_call, {0}).ValueOrDie();
+    auto scratch_slice = assn.GetUniqueSlice(custom_call, {1}).ValueOrDie();
+
+    const HloInstruction* algorithm_inst = custom_call->operand(2);
+    CHECK(algorithm_inst->IsConstant()) << algorithm_inst->ToString();
+    int64 algorithm = algorithm_inst->literal().Get<int64>({});
+
+    const HloInstruction* tensor_ops_enabled_inst = custom_call->operand(3);
+    CHECK(tensor_ops_enabled_inst->IsConstant())
+        << tensor_ops_enabled_inst->ToString();
+    bool tensor_ops_enabled = tensor_ops_enabled_inst->literal().Get<bool>({});
+
+    const auto& target = custom_call->custom_call_target();
+    std::unique_ptr<ConvolutionThunk> thunk;
+    if (target == kCudnnConvForwardCallTarget) {
+      thunk = MakeUnique<ConvolutionThunk>(
+          CudnnConvKind::kForward,
+          /*input_buffer=*/lhs_slice,
+          /*filter_buffer=*/rhs_slice,
+          /*output_buffer=*/conv_result_slice,
+          /*tuple_result_buffer=*/tuple_result_slice,
+          /*scratch_buffer=*/scratch_slice,
+          /*input_shape=*/lhs_shape,
+          /*filter_shape=*/rhs_shape,
+          /*output_shape=*/conv_result_shape,  //
+          custom_call->window(), custom_call->convolution_dimension_numbers(),
+          algorithm, tensor_ops_enabled, custom_call);
+    } else if (target == kCudnnConvBackwardInputCallTarget) {
+      thunk = MakeUnique<ConvolutionThunk>(
+          CudnnConvKind::kBackwardInput,
+          /*input_buffer=*/conv_result_slice,
+          /*filter_buffer=*/rhs_slice,
+          /*output_buffer=*/lhs_slice,
+          /*tuple_result_buffer=*/tuple_result_slice,
+          /*scratch_buffer=*/scratch_slice,
+          /*input_shape=*/conv_result_shape,
+          /*filter_shape=*/rhs_shape,
+          /*output_shape=*/lhs_shape,  //
+          custom_call->window(), custom_call->convolution_dimension_numbers(),
+          algorithm, tensor_ops_enabled, custom_call);
+    } else if (target == kCudnnConvBackwardFilterCallTarget) {
+      thunk = MakeUnique<ConvolutionThunk>(
+          CudnnConvKind::kBackwardFilter,
+          /*input_buffer=*/lhs_slice,
+          /*filter_buffer=*/conv_result_slice,
+          /*output_buffer=*/rhs_slice,
+          /*tuple_result_buffer=*/tuple_result_slice,
+          /*scratch_buffer=*/scratch_slice,
+          /*input_shape=*/lhs_shape,
+          /*filter_shape=*/conv_result_shape,
+          /*output_shape=*/rhs_shape,  //
+          custom_call->window(), custom_call->convolution_dimension_numbers(),
+          algorithm, tensor_ops_enabled, custom_call);
+    } else {
+      LOG(FATAL) << "Unexpected custom call target: "
+                 << custom_call->custom_call_target();
+    }
+
+    thunk_sequence_->emplace_back(std::move(thunk));
+    return Status::OK();
+  }
+
+  return IrEmitter::HandleCustomCall(custom_call);
+}
+
+Status IrEmitterUnnested::HandleFft(HloInstruction* fft) {
+  TF_RET_CHECK(
+      LayoutUtil::IsMonotonicWithDim0Major(fft->operand(0)->shape().layout()));
+  TF_RET_CHECK(LayoutUtil::IsMonotonicWithDim0Major(fft->shape().layout()));
+  thunk_sequence_->emplace_back(BuildFftThunk(fft));
+  return Status::OK();
+}
+
 Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
   HloInstruction* root = fusion->fused_expression_root();
   // HandleFusion specializes reduction from a multi-dimensional array to a 1D
@@ -372,10 +598,6 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
     thunk_sequence_->emplace_back(BuildGemmThunk(fusion));
     return Status::OK();
   }
-  if (ImplementedAsDnnConvolution(*fusion)) {
-    thunk_sequence_->emplace_back(BuildConvolutionThunk(fusion));
-    return Status::OK();
-  }
   thunk_sequence_->emplace_back(BuildKernelThunk(fusion));
   return IrEmitter::HandleFusion(fusion);
 }
@@ -407,8 +629,8 @@ Shape MergeDimensions(tensorflow::gtl::ArraySlice<size_t> segs,
             (segs.size() == i ? shape.dimensions().size() : segs[i]),
         1, std::multiplies<int64>()));
   }
-  return ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(shape.element_type(),
-                                                          dimensions);
+  return ShapeUtil::MakeShapeWithDescendingLayout(shape.element_type(),
+                                                  dimensions);
 }
 
 // Returns whether the given shapes and permutation are a 0-2-1 transpose, and
@@ -421,20 +643,22 @@ std::tuple<bool, Shape, Shape> IsTranspose021(const Shape& a, const Shape& b) {
   CHECK(ShapeUtil::Compatible(a, b));
   std::vector<int64> perm(a.dimensions().size());
   {
-    std::vector<int64> layout_a(a.layout().minor_to_major().rbegin(),
-                                a.layout().minor_to_major().rend());
-    std::vector<int64> layout_b(b.layout().minor_to_major().rbegin(),
-                                b.layout().minor_to_major().rend());
+    auto layout_a_orig = LayoutUtil::MinorToMajor(a);
+    std::vector<int64> layout_a(layout_a_orig.rbegin(), layout_a_orig.rend());
+    auto layout_b_orig = LayoutUtil::MinorToMajor(b);
+    std::vector<int64> layout_b(layout_b_orig.rbegin(), layout_b_orig.rend());
     for (size_t i = 0; i < perm.size(); ++i) {
       perm[i] = PositionInContainer(layout_b, layout_a[i]);
     }
   }
   auto segs = ConsecutiveSegments(perm);
-  Shape norm_a = ShapeUtil::NormalizeShapeToMonotonicDim0MajorLayout(a);
-  Shape norm_b = ShapeUtil::NormalizeShapeToMonotonicDim0MajorLayout(b);
+  Shape norm_a =
+      ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(a);
+  Shape norm_b =
+      ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(b);
   if (3 == segs.size() && 0 == perm[0]) {
     Shape reduced_a = MergeDimensions(segs, norm_a);
-    Shape reduced_b = ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
+    Shape reduced_b = ShapeUtil::MakeShapeWithDescendingLayout(
         b.element_type(),
         Permute({0, 2, 1}, AsInt64Slice(reduced_a.dimensions())));
     return std::make_tuple(true, reduced_a, reduced_b);
@@ -448,10 +672,11 @@ std::tuple<bool, Shape, Shape> IsTranspose021(const Shape& a, const Shape& b) {
 bool AreShapesForTranspose021(const Shape& a, const Shape& b) {
   return 3 == b.dimensions().size() &&
          ShapeUtil::Compatible(
-             ShapeUtil::NormalizeShapeToMonotonicDim0MajorLayout(a),
+             ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(a),
              ShapeUtil::PermuteDimensions(
                  {0, 2, 1},
-                 ShapeUtil::NormalizeShapeToMonotonicDim0MajorLayout(b)));
+                 ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
+                     b)));
 }
 
 // Emits a tiled 0-2-1 transpose, assuming both input and output lain out from
@@ -483,9 +708,11 @@ int64 EmitTranspose021Tiled(llvm_ir::IrArray input, llvm_ir::IrArray output,
   CHECK(AreShapesForTranspose021(input.GetShape(), output.GetShape()));
 
   Shape input_shape =
-      ShapeUtil::NormalizeShapeToMonotonicDim0MajorLayout(input.GetShape());
+      ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
+          input.GetShape());
   Shape output_shape =
-      ShapeUtil::NormalizeShapeToMonotonicDim0MajorLayout(output.GetShape());
+      ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
+          output.GetShape());
   input = input.CastToShape(input_shape, builder);
   output = output.CastToShape(output_shape, builder);
 
@@ -603,7 +830,7 @@ int64 EmitTranspose021Tiled(llvm_ir::IrArray input, llvm_ir::IrArray output,
                   llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {},
                   builder))),
           builder->getInt64Ty(), /*isSigned=*/true, "block.id.x"),
-      ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
+      ShapeUtil::MakeShapeWithDescendingLayout(
           PRED /*arbitrary*/, AsInt64Slice(input_dims_in_tiles)),
       builder);
   const llvm_ir::IrArray::Index input_tile_origin = ({
@@ -672,7 +899,8 @@ int64 EmitTranspose021Tiled(llvm_ir::IrArray input, llvm_ir::IrArray output,
 }  // namespace
 
 Status IrEmitterUnnested::HandleCopy(HloInstruction* copy) {
-  if (ImplementedAsHostToDeviceMemcpy(*copy)) {
+  if (ImplementedAsHostToDeviceMemcpy(ir_emitter_context_->buffer_assignment(),
+                                      *copy)) {
     thunk_sequence_->emplace_back(BuildHostToDeviceCopyThunk(copy));
     return Status::OK();
   }
@@ -706,6 +934,194 @@ Status IrEmitterUnnested::HandleCopy(HloInstruction* copy) {
   return IrEmitter::HandleCopy(copy);
 }
 
+Status IrEmitterUnnested::EmitReductionToScalar(
+    HloInstruction* reduce, const Shape& input_shape,
+    const llvm_ir::ElementGenerator& input_gen,
+    const llvm_ir::ElementGenerator& init_value_gen, HloComputation* reducer) {
+  // Number of elements processed by a single thread.
+  constexpr int64 kTileSize = 16;
+  int64 num_elems = ShapeUtil::ElementsIn(input_shape);
+
+  // Round up the number of tiles to a multiple of the warp size.  This is
+  // necessary for correctness.  We launch one thread per tile, and if the
+  // number of threads isn't a multiple of the number of the warp size, our
+  // shuffles will read from inactive threads, producing undefined values.
+  int64 num_tiles =
+      RoundUpToNearest(CeilOfRatio(num_elems, kTileSize), kWarpSize);
+
+  // Check whether every thread will process a full tile's worth of elements
+  // without reading outside the bounds of the input.  If this is true, we can
+  // skip some bounds checks in the final algorithm.
+  bool all_threads_in_bounds = num_tiles * kTileSize == num_elems;
+
+  // __global__ void full_reduce_kernel() {
+  //   x_in_tiles = threadIdx.x + blockIdx.x * blockDim.x;
+  //   x = x_in_tiles * kTileSize;
+  //
+  //   partial_result = init_value;
+  //   if (all_threads_in_bounds || x + kTileSize <= num_elems) {
+  //     for (i = 0; i < kTileSize; ++i) {
+  //       partial_result = Reducer(partial_result, input[x + i]);
+  //     }
+  //   } else {
+  //     for (i = 0; i < kTileSize; ++i) {
+  //       if (x + i < num_elems) {
+  //         partial_result = Reducer(partial_result, input[x + i]);
+  //       }
+  //     }
+  //   }
+  //   for (i = warpSize / 2; i > 0; i /= 2) {
+  //     partial_result = Reducer(partial_result,
+  //                              __shfl_down(partial_result, i));
+  //   }
+  //   if (lane_id == 0) {
+  //     AtomicReducer(&output[y], partial_result);
+  //   }
+  // }
+  //
+  // // Choose num_blocks and threads_per_block such that:
+  // //
+  // //   num_blocks * threads_per_block =
+  // //     RoundUpToNextMultipleOf(Ceil(num_elems / kTileSize), warpSize),
+  // //
+  // // and threads_per_block is a multiple of warpSize.
+  // reduce_kernel<<<num_blocks, threads_per_block>>>();
+  //
+  auto loop_body_emitter =
+      [=](const llvm_ir::IrArray::Index& tile_index) -> Status {
+    llvm::Type* element_ir_type =
+        llvm_ir::PrimitiveTypeToIrType(input_shape.element_type(), module_);
+    llvm::Value* partial_reduction_result_address = ir_builder_.CreateAlloca(
+        element_ir_type, /*ArraySize=*/nullptr, "partial_reduction_result");
+    {
+      TF_ASSIGN_OR_RETURN(llvm::Value * init_ir_value,
+                          init_value_gen(llvm_ir::IrArray::Index({})));
+      ir_builder_.CreateStore(init_ir_value, partial_reduction_result_address);
+    }
+
+    llvm::Value* x_in_tiles = tile_index[0];
+
+    // Emit an inner for-loop that reduces the elements in the tile.
+    auto emit_tile_element_loop = [=](bool tile_in_bounds) -> Status {
+      std::unique_ptr<llvm_ir::ForLoop> tile_element_loop =
+          llvm_ir::ForLoop::EmitForLoop("element_id_in_tile",
+                                        ir_builder_.getInt64(0),
+                                        ir_builder_.getInt64(kTileSize),
+                                        ir_builder_.getInt64(1), &ir_builder_);
+
+      // Emit the body of the partial reduction loop.
+      llvm_ir::SetToFirstInsertPoint(tile_element_loop->GetBodyBasicBlock(),
+                                     &ir_builder_);
+      llvm::Value* x = ir_builder_.CreateNSWAdd(
+          ir_builder_.CreateNSWMul(x_in_tiles, ir_builder_.getInt64(kTileSize)),
+          tile_element_loop->GetIndVarValue());
+      // Unless we know the tile is entirely in bounds, we have to emit a
+      // x-in-bounds check before reading from the input.
+      if (!tile_in_bounds) {
+        llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(
+            ir_builder_.CreateICmpULT(x, ir_builder_.getInt64(num_elems)),
+            "x_in_bounds", &ir_builder_);
+
+        // Emit code that reads the input element and accumulates it to
+        // the partial reduction result.
+        llvm_ir::SetToFirstInsertPoint(if_data.true_block, &ir_builder_);
+      }
+      llvm_ir::IrArray::Index input_index(
+          /*linear=*/x, input_shape, &ir_builder_);
+      llvm::Value* input_address = ir_builder_.CreateAlloca(element_ir_type);
+      TF_ASSIGN_OR_RETURN(llvm::Value * input_ir_value, input_gen(input_index));
+      ir_builder_.CreateStore(input_ir_value, input_address);
+      return (EmitCallToNestedComputation(
+          *reducer, {partial_reduction_result_address, input_address},
+          partial_reduction_result_address));
+    };
+
+    // x_end = kTileSize + x_in_tiles * kTileSize, i.e., the location that's
+    // immediately beyond the tile.
+    llvm::Value* x_end = ir_builder_.CreateNSWAdd(
+        ir_builder_.getInt64(kTileSize),
+        ir_builder_.CreateNSWMul(x_in_tiles, ir_builder_.getInt64(kTileSize)));
+    // The tile is entirely in bound if all_threads_in_bounds or
+    // x_end <= num_elems.
+    llvm::Value* tile_in_bounds = ir_builder_.CreateOr(
+        ir_builder_.CreateICmpULE(x_end, ir_builder_.getInt64(num_elems)),
+        ir_builder_.getInt1(all_threads_in_bounds));
+    llvm_ir::LlvmIfData if_tile_in_bounds_data =
+        llvm_ir::EmitIfThenElse(tile_in_bounds, "tile_in_bounds", &ir_builder_);
+    llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.true_block,
+                                   &ir_builder_);
+    TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_bounds=*/true));
+    llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.false_block,
+                                   &ir_builder_);
+    TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_bounds=*/false));
+
+    // After the if-then-else statement on tile_in_bounds, emit calls to
+    // shfl_down that accumulate the partial reduction results of all threads
+    // from the warp.
+    llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.after_block,
+                                   &ir_builder_);
+    int bit_width = llvm_ir::GetSizeInBits(element_ir_type);
+    // bitcast cannot be applied to aggregate types (even packed ones), so we
+    // instead bitcast addresses of load/store to intN* of the same bit-width.
+    llvm::Type* shuffle_ir_type = element_ir_type->isStructTy()
+                                      ? ir_builder_.getIntNTy(bit_width)
+                                      : element_ir_type;
+    for (int shuffle_distance = kWarpSize / 2; shuffle_distance >= 1;
+         shuffle_distance /= 2) {
+      llvm::Value* partial_reduction_result = ir_builder_.CreateLoad(
+          ir_builder_.CreateBitCast(partial_reduction_result_address,
+                                    shuffle_ir_type->getPointerTo()),
+          "partial_reduction_result");
+      llvm::Value* result_from_other_lane = ir_builder_.CreateAlloca(
+          element_ir_type, nullptr, "result_from_other_lane");
+      ir_builder_.CreateStore(
+          EmitShuffleDown(partial_reduction_result,
+                          ir_builder_.getInt32(shuffle_distance), &ir_builder_),
+          ir_builder_.CreateBitCast(result_from_other_lane,
+                                    shuffle_ir_type->getPointerTo()));
+      TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
+          *reducer, {partial_reduction_result_address, result_from_other_lane},
+          partial_reduction_result_address));
+    }
+
+    const HloInstruction* output =
+        reduce->IsFused() ? reduce->parent()->FusionInstruction() : reduce;
+
+    // Emit an atomic operation that accumulates the partial reduction result of
+    // lane 0 (which holds the partially accumulated result for its warp) to the
+    // output element.
+    llvm::Value* lane_id = ir_builder_.CreateURem(
+        x_in_tiles, ir_builder_.getInt64(kWarpSize), "lane_id");
+    llvm_ir::LlvmIfData if_lane_id_is_zero_data = llvm_ir::EmitIfThenElse(
+        ir_builder_.CreateICmpEQ(lane_id, ir_builder_.getInt64(0)),
+        "lane_id_is_zero", &ir_builder_);
+    llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block,
+                                   &ir_builder_);
+    llvm::Value* output_address =
+        GetIrArray(*output, *output)
+            .EmitArrayElementAddress(
+                llvm_ir::IrArray::Index(/*linear=*/ir_builder_.getInt64(0),
+                                        output->shape(), &ir_builder_),
+                &ir_builder_, "output_element_address");
+    return EmitAtomicOperationForNestedComputation(
+        *reducer, output_address, partial_reduction_result_address);
+  };
+
+  // Emit a parallel loop that iterates through all input tiles, one per thread.
+  Shape tiled_input_shape = ShapeUtil::MakeShapeWithLayout(
+      reduce->shape().element_type(), {num_tiles}, {0});
+  LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
+      tiled_input_shape, ir_emitter_context_->device_description());
+  CHECK(LastThunk()->kind() == Thunk::Kind::kSequential);
+  UpdateLaunchDimensions(
+      launch_dimensions,
+      static_cast<SequentialThunk*>(LastThunk())->thunks().back().get(),
+      ir_emitter_context_->llvm_module());
+  return ParallelLoopEmitter(loop_body_emitter, tiled_input_shape,
+                             launch_dimensions, &ir_builder_)
+      .EmitLoop(IrName(reduce));
+}
+
 Status IrEmitterUnnested::EmitColumnReduction(
     int64 height, int64 width, HloInstruction* reduce, const Shape& input_shape,
     const llvm_ir::ElementGenerator& input_gen,
@@ -799,14 +1215,15 @@ Status IrEmitterUnnested::EmitColumnReduction(
         // input_shape to normalized_input_shape and a reshape from
         // normalized_input_shape to input_matrix_shape.
         const Shape normalized_input_shape =
-            ShapeUtil::NormalizeShapeToMonotonicDim0MajorLayout(input_shape);
+            ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
+                input_shape);
+        auto input_shape_min2maj = LayoutUtil::MinorToMajor(input_shape);
         const std::vector<int64> transpose_dimension_mapping(
-            input_shape.layout().minor_to_major().rbegin(),
-            input_shape.layout().minor_to_major().rend());
+            input_shape_min2maj.rbegin(), input_shape_min2maj.rend());
 
         const Shape input_matrix_shape =
-            ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
-                input_shape.element_type(), {height, width});
+            ShapeUtil::MakeShapeWithDescendingLayout(input_shape.element_type(),
+                                                     {height, width});
         const llvm_ir::IrArray::Index input_matrix_index(
             {y, x}, input_matrix_shape, &ir_builder_);
         const llvm_ir::IrArray::Index input_index =
@@ -901,7 +1318,7 @@ Status IrEmitterUnnested::EmitRowReduction(
   //
   // Three optimizations are performed.
   //
-  // 1. To coalesc global memory accesses, dilate the tile with a factor of 32
+  // 1. To coalesce global memory accesses, dilate the tile with a factor of 32
   // (i.e. the warp size). For example, suppose the width is 8x32=256. Instead
   // of making each tile consecutive, we let make tile 0 column
   // [0,32,64,...,224], tile 1 column [1,33,65,...,225], and so on. This ensures
@@ -1042,13 +1459,14 @@ Status IrEmitterUnnested::EmitRowReduction(
         // from input_shape to normalized_input_shape and a reshape from
         // normalized_input_shape to input_3d_tensor_shape.
         const Shape normalized_input_shape =
-            ShapeUtil::NormalizeShapeToMonotonicDim0MajorLayout(input_shape);
+            ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
+                input_shape);
+        auto input_shape_min2maj = LayoutUtil::MinorToMajor(input_shape);
         const std::vector<int64> transpose_dimension_mapping(
-            input_shape.layout().minor_to_major().rbegin(),
-            input_shape.layout().minor_to_major().rend());
+            input_shape_min2maj.rbegin(), input_shape_min2maj.rend());
         const Shape input_3d_tensor_shape =
-            ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
-                input_shape.element_type(), {depth, height, width});
+            ShapeUtil::MakeShapeWithDescendingLayout(input_shape.element_type(),
+                                                     {depth, height, width});
         const llvm_ir::IrArray::Index input_3d_tensor_index(
             {z, y, x}, input_3d_tensor_shape, &ir_builder_);
         const llvm_ir::IrArray::Index input_index =
@@ -1177,9 +1595,9 @@ Status IrEmitterUnnested::EmitReductionToVector(
   // whether another dimension is major or minor of them.
   std::sort(input_dims_to_keep.begin(), input_dims_to_keep.end(),
             [&input_shape](int64 dim_a, int64 dim_b) {
-              return PositionInContainer(input_shape.layout().minor_to_major(),
+              return PositionInContainer(LayoutUtil::MinorToMajor(input_shape),
                                          dim_a) <
-                     PositionInContainer(input_shape.layout().minor_to_major(),
+                     PositionInContainer(LayoutUtil::MinorToMajor(input_shape),
                                          dim_b);
             });
   // Now, if output rank is at least 1, `input_dims_to_keep.front()` is
@@ -1189,14 +1607,11 @@ Status IrEmitterUnnested::EmitReductionToVector(
   // the dimensions to keep are contiguous, by prerequisite of
   // `EmitReductionToVector`, we only need to check whether the minormost
   // dimension of the input is to keep.
-  //
-  // If the output is scalar, we could emit either a row or a column reduction.
-  // Some tests have shown scalar reduction is no more efficient as row
-  // reduction, and is simpler to emit as column reduction, so we emit a column
-  // reduction in this case.
-  if (input_dims_to_keep.empty() ||
-      input_dims_to_keep.front() ==
-          LayoutUtil::Minor(input_shape.layout(), 0)) {
+  if (input_dims_to_keep.empty()) {
+    return EmitReductionToScalar(reduce, input_shape, input_gen, init_value_gen,
+                                 reducer);
+  } else if (input_dims_to_keep.front() ==
+             LayoutUtil::Minor(input_shape.layout(), 0)) {
     // Column reduction. Treat the result of "input" as a matrix whose width
     // is the most minor dimension and height the product of other dimensions,
     // and treat "reduce" as a column reduction of the input matrix.
@@ -1224,14 +1639,14 @@ Status IrEmitterUnnested::EmitReductionToVector(
     int64 width = 1;
     for (int64 input_dim = 0; input_dim < ShapeUtil::Rank(input_shape);
          ++input_dim) {
-      if (PositionInContainer(input_shape.layout().minor_to_major(),
+      if (PositionInContainer(LayoutUtil::MinorToMajor(input_shape),
                               input_dim) >
-          PositionInContainer(input_shape.layout().minor_to_major(),
+          PositionInContainer(LayoutUtil::MinorToMajor(input_shape),
                               input_dims_to_keep.back())) {
         depth *= input_shape.dimensions(input_dim);
-      } else if (PositionInContainer(input_shape.layout().minor_to_major(),
+      } else if (PositionInContainer(LayoutUtil::MinorToMajor(input_shape),
                                      input_dim) <
-                 PositionInContainer(input_shape.layout().minor_to_major(),
+                 PositionInContainer(LayoutUtil::MinorToMajor(input_shape),
                                      input_dims_to_keep.front())) {
         width *= input_shape.dimensions(input_dim);
       }
@@ -1279,24 +1694,24 @@ Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) {
 }
 
 Status IrEmitterUnnested::HandleTuple(HloInstruction* tuple) {
-  tensorflow::gtl::ArraySlice<HloInstruction*> operands(tuple->operands());
-  bool all_tuple_elements_have_buffer = std::all_of(
-      operands.begin(), operands.end(), [this](HloInstruction* tuple_element) {
+  bool all_tuple_elements_have_buffer =
+      c_all_of(tuple->operands(), [&](HloInstruction* tuple_element) {
         return ir_emitter_context_->buffer_assignment().HasTopLevelAllocation(
             tuple_element);
       });
-  // Tuples (especially output tuples) can take too many tuple elements,
-  // causing the kernel emitted exceeds the parameter space limit
-  // (b/31336476). As an optimization, if all tuple elements have a buffer, we
-  // collect their buffer addresses in a host array, and then copy that array
-  // to the tuple's buffer.
+  // Tuples (especially tuples that are the final result of a computation) can
+  // be so huge that if we were to emit a kernel that took each tuple element as
+  // a parameter, we would exceed the max allowable number of parameters to a
+  // GPU kernel, b/31336476. As an optimization, if all tuple elements have a
+  // buffer, we collect their buffer addresses in a host array, and then copy
+  // that array to the tuple's buffer.
   //
   // Some tuple elements (e.g. const or bitcast of const) might not have a
-  // buffer -- their contents are stored in code. In that case, we fall back
-  // to emitting kernels which have access to their buffer addresses in code.
+  // buffer -- their contents are stored in code. In that case, we fall back to
+  // emitting kernels which have access to their buffer addresses in code.
   if (all_tuple_elements_have_buffer) {
     std::vector<BufferAllocation::Slice> tuple_element_buffers;
-    for (const HloInstruction* tuple_element : operands) {
+    for (const HloInstruction* tuple_element : tuple->operands()) {
       tuple_element_buffers.push_back(GetAllocationSlice(*tuple_element));
     }
     thunk_sequence_->emplace_back(MakeUnique<TupleThunk>(
@@ -1338,8 +1753,7 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
   // TODO(b/31410564): Implement dilation rate for select-and-scatter.
   if (window_util::HasDilation(window)) {
     return Unimplemented(
-        "Dilation for select-and-scatter not implemented on GPU. "
-        "See b/31410564.");
+        "Dilation for SelectAndScatter not implemented on GPU.");
   }
 
   // kSelectAndScatter is implemented as two kernel launches: the first launch
@@ -1548,62 +1962,202 @@ Status IrEmitterUnnested::HandleInfeed(HloInstruction* infeed) {
   return Status::OK();
 }
 
-llvm::Function* IrEmitterUnnested::EmitBasePointersForHloAndItsOperands(
-    const HloInstruction& hlo, std::vector<const HloInstruction*>* io_hlos) {
-  const BufferAssignment& buffer_assignment =
-      ir_emitter_context_->buffer_assignment();
-  // GetTupleElement instructions are implemented by emitting IR that indexes
-  // and loads the target tuple element pointer from its operand (possibly
-  // recursively). For this reason, GetTupleElement instructions are associated
-  // with their operand buffer in 'io_hlos' and 'non_io_hlos' below.
-  std::vector<const HloInstruction*> non_io_hlos;
-  for (const HloInstruction* operand : hlo.operands()) {
-    const HloInstruction* to_lookup = operand->LatestNonGteAncestor();
-    if (buffer_assignment.HasTopLevelAllocation(to_lookup) &&
-        buffer_assignment.GetUniqueTopLevelSlice(to_lookup)
-            .ConsumeValueOrDie()
-            .allocation()
-            ->IsInputOrOutput()) {
-      io_hlos->push_back(operand);
-    } else {
-      non_io_hlos.push_back(operand);
+// Figures out how to access the buffers for all subshapes of hlo's operands and
+// for hlo itself (i.e. all the buffers produced by HLO).
+//
+// Returns a map keyed on the pair {HloInstruction, ShapeIndex}.  The value for
+// this key is a pair {Slice, ShapeIndex}, where the slice tells you the root
+// buffer to look in, and the ShapeIndex describes how to dereference starting
+// at that buffer to get to the buffer in question.
+//
+// For example, if {hlo, {1}} is mapped to {slice, {3, 4}}, then the buffer for
+// hlo at ShapeIndex {1} (i.e. the buffer for the second tuple element of hlo)
+// is found at slice[3][4].  That is, slice is a void***, which we dereference
+// twice -- first at index 3, and then at index 4 -- to get the address of our
+// buffer.
+//
+// This function conservatively assumes that we'll touch all sub-buffers of
+// every operand and of the output.
+static std::map<std::pair<const HloInstruction*, ShapeIndex>,
+                std::pair<BufferAllocation::Slice, ShapeIndex>>
+GetHloBufferSlices(const HloInstruction* hlo,
+                   const BufferAssignment& buffer_assn) {
+  std::map<std::pair<const HloInstruction*, ShapeIndex>,
+           std::pair<BufferAllocation::Slice, ShapeIndex>>
+      slices;
+
+  // Tries to find a slice plus an array of indices i1, ..., iN such that the
+  // sub-buffer for instr at index can be found at slice[i1]...[iN].
+  auto find_slice_for = [&](const HloInstruction* instr,
+                            const ShapeIndex& index)
+      -> optional<std::pair<BufferAllocation::Slice, ShapeIndex>> {
+    // Simple, common case: Is the buffer for instr known at runtime?  If so,
+    // we're done.
+    auto slice = GetKnownAtRuntimeSlice(instr, index, buffer_assn);
+    if (slice.has_value()) {
+      return {{*slice, ShapeIndex()}};
     }
-  }
 
-  CHECK_NE(HloOpcode::kGetTupleElement, hlo.opcode());
-  if (buffer_assignment.HasTopLevelAllocation(&hlo) &&
-      buffer_assignment.GetUniqueTopLevelSlice(&hlo)
-          .ConsumeValueOrDie()
-          .allocation()
-          ->IsInputOrOutput()) {
-    io_hlos->push_back(&hlo);
-  } else {
-    non_io_hlos.push_back(&hlo);
+    // If we don't know the buffer for instr at index, see if we know the buffer
+    // for instr at index without its last element.  If so, we can dynamically
+    // find the buffer for instr by dereferencing a pointer in that buffer.
+    // Continue looking this way until we run out of elements in 'index'.
+    ShapeIndex new_index = index;
+    ShapeIndex gte_indices;
+    while (!new_index.empty()) {
+      gte_indices.push_front(new_index.back());
+      new_index.pop_back();
+      auto slice = GetKnownAtRuntimeSlice(instr, new_index, buffer_assn);
+      if (slice.has_value()) {
+        return {{*slice, gte_indices}};
+      }
+    }
+
+    // If *that* didn't work, check whether instr is a GTE instruction.  If it
+    // is, see if we can get a buffer for its parent, and continue walking up
+    // parents until we find a defined buffer or we hit something that's not a
+    // GTE.
+    const HloInstruction* parent = instr;
+    while (parent->opcode() == HloOpcode::kGetTupleElement) {
+      gte_indices.push_front(parent->tuple_index());
+      parent = parent->operand(0);
+
+      auto slice = GetKnownAtRuntimeSlice(parent, {}, buffer_assn);
+      if (slice.has_value()) {
+        return {{*slice, gte_indices}};
+      }
+    }
+
+    return nullopt;
+  };
+
+  // Adds entries for all subshapes of instr to `slices`.
+  auto add_slices_for = [&](const HloInstruction* instr) {
+    // GPU constants don't have buffers; don't bother looking for one.
+    if (instr->IsConstant()) {
+      return;
+    }
+
+    ShapeUtil::ForEachSubshape(
+        instr->shape(), [&](const Shape& /*shape*/, const ShapeIndex& index) {
+          if (slices.count({instr, index})) {
+            // HLOs can have duplicate operands; don't bother redoing work.
+            return;
+          }
+          auto maybe_slice = find_slice_for(instr, index);
+          if (maybe_slice.has_value()) {
+            slices[{instr, index}] = *maybe_slice;
+          } else {
+            VLOG(1) << "Couldn't find buffer for " << instr->ToString()
+                    << " at index " << index.ToString();
+          }
+        });
+  };
+
+  add_slices_for(hlo);
+  for (const HloInstruction* operand : hlo->operands()) {
+    // Conservatively assume we'll need the buffers for all subshapes of the
+    // operand.
+    add_slices_for(operand);
   }
 
-  llvm::Function* kernel = BuildKernelPrototype(hlo, *io_hlos);
-  // bindings_ is reused because the bindings of kConstant to their underlying
-  // llvm::Constant can be shared for all HLOs in this computation.
-  bindings_.EmitBasePointersForHlos(*io_hlos, non_io_hlos);
-  return kernel;
+  return slices;
 }
 
 std::unique_ptr<Thunk> IrEmitterUnnested::BuildKernelThunk(
     const HloInstruction* inst) {
-  std::vector<const HloInstruction*> io_hlos;
-  llvm::Function* kernel =
-      EmitBasePointersForHloAndItsOperands(*inst, &io_hlos);
+  const BufferAssignment& buffer_assn =
+      ir_emitter_context_->buffer_assignment();
+
+  std::map<std::pair<const HloInstruction*, ShapeIndex>,
+           std::pair<BufferAllocation::Slice, ShapeIndex>>
+      hlo_slices = GetHloBufferSlices(inst, buffer_assn);
+
+  // Figure out which buffer allocations need to be passed as arguments to our
+  // kernel.  This is simply all of the allocations referenced in hlo_slices,
+  // plus the XLA temp buffer (if we have it).  We always include the temp
+  // buffer because even if the kernel itself doesn't use it, a nested
+  // subcomputation within the kernel (e.g. a kMap's computation) might.
+  std::unordered_set<const BufferAllocation*> buffers_needed;
+  for (const auto& kv : hlo_slices) {
+    buffers_needed.insert(kv.second.first.allocation());
+  }
+  tensorflow::gtl::optional<const BufferAllocation*> temp_buffer;
+  for (const BufferAllocation& alloc : buffer_assn.Allocations()) {
+    if (alloc.IsPreallocatedTempBuffer()) {
+      if (!temp_buffer.has_value()) {
+        temp_buffer = &alloc;
+      } else {
+        LOG(FATAL) << "Multiple temp buffers found, but only one is allowed!";
+      }
+    }
+  }
+  if (temp_buffer.has_value()) {
+    buffers_needed.insert(*temp_buffer);
+  }
+
+  // We'll pass a pointer to each of the elements of `buffers` to our kernel, in
+  // this order.
+  std::vector<const BufferAllocation*> buffers(buffers_needed.begin(),
+                                               buffers_needed.end());
+  std::sort(buffers.begin(), buffers.end(),
+            [](const BufferAllocation* a, const BufferAllocation* b) {
+              return a->index() < b->index();
+            });
+
+  llvm::Function* kernel = BuildKernelPrototype(*inst, buffers);
+
+  // Build a map from a BufferAllocation to the corresponding argument in our
+  // kernel.
+  std::unordered_map<const BufferAllocation*, llvm::Value*> kernel_args;
+  {
+    auto arg_it = kernel->arg_begin();
+    auto buffers_it = buffers.begin();
+    for (; arg_it != kernel->arg_end(); ++arg_it, ++buffers_it) {
+      kernel_args[*buffers_it] = arg_it;
+    }
+  }
+
+  // For each buffer our kernel might want to touch, bind it to a value derived
+  // from our kernel args.
+  for (const auto& kv : hlo_slices) {
+    const HloInstruction* instr = kv.first.first;
+    const ShapeIndex& index = kv.first.second;
+    const BufferAllocation::Slice& slice = kv.second.first;
+    const ShapeIndex& gte_index = kv.second.second;
+
+    VLOG(3) << "Buffer for " << instr->ToString() << " at " << index.ToString()
+            << " is found in slice " << slice.ToString() << " at GTE index "
+            << gte_index.ToString();
+
+    llvm::Value* loc =
+        ir_builder_.CreateInBoundsGEP(kernel_args.at(slice.allocation()),
+                                      {ir_builder_.getInt64(slice.offset())});
+
+    // If gte_index is nonempty, we have to dereference `loc` to get to the
+    // value we're ultimately interested in.
+    llvm::Type* int8_double_pointer =
+        llvm::PointerType::get(ir_builder_.getInt8PtrTy(), /*AddressSpace=*/0);
+    for (int64 idx : gte_index) {
+      loc = ir_builder_.CreateBitCast(loc, int8_double_pointer);
+      loc = ir_builder_.CreateLoad(
+          ir_builder_.CreateInBoundsGEP(loc, {ir_builder_.getInt64(idx)}));
+    }
+
+    bindings_.BindHloToIrValue(*instr, loc, index);
+  }
 
-  // Compute the input buffer indices.
-  std::vector<BufferAllocation::Slice> io_buffers;
-  io_buffers.reserve(io_hlos.size());
-  for (const HloInstruction* io_hlo : io_hlos) {
-    io_buffers.push_back(GetAllocationSlice(*io_hlo->LatestNonGteAncestor()));
+  // Bind the temp buffer so that nested subcomputations can find it if they
+  // need.
+  if (temp_buffer.has_value()) {
+    bindings_.SetTempBufferBase(kernel_args.at(*temp_buffer));
+  } else {
+    bindings_.SetTempBufferBase(
+        llvm::ConstantPointerNull::get(ir_builder_.getInt8PtrTy()));
   }
 
-  // Create a KernelThunk that launches the kernel that implements "inst".
-  return MakeUnique<KernelThunk>(io_buffers,
-                                 llvm_ir::AsString(kernel->getName()), inst);
+  return MakeUnique<KernelThunk>(buffers, llvm_ir::AsString(kernel->getName()),
+                                 inst);
 }
 
 std::unique_ptr<Thunk> IrEmitterUnnested::BuildHostToDeviceCopyThunk(
@@ -1611,7 +2165,7 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildHostToDeviceCopyThunk(
   const HloInstruction* operand = inst->operand(0);
   CHECK_EQ(HloOpcode::kConstant, operand->opcode());
   return MakeUnique<HostToDeviceCopyThunk>(
-      /*source_address=*/operand->literal().InternalData(),
+      /*source_address=*/operand->literal().untyped_data(),
       /*destination_buffer=*/GetAllocationSlice(*inst),
       /*mem_size=*/
       llvm_ir::ByteSizeOf(operand->shape(),
@@ -1692,50 +2246,14 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildGemmThunk(
   LOG(FATAL) << "Cannot build a GemmThunk for " << inst->ToString();
 }
 
-std::unique_ptr<Thunk> IrEmitterUnnested::BuildConvolutionThunk(
+std::unique_ptr<Thunk> IrEmitterUnnested::BuildFftThunk(
     const HloInstruction* inst) {
-  const HloInstruction* lhs = inst->operand(0);
-  const HloInstruction* rhs = inst->operand(1);
-  if (inst->opcode() == HloOpcode::kConvolution) {
-    // Forward covolution.
-    return MakeUnique<ConvolutionThunk>(
-        ConvolutionThunk::ConvolutionKind::kForward,
-        /*input_buffer=*/GetAllocationSlice(*lhs),
-        /*filter_buffer=*/GetAllocationSlice(*rhs),
-        /*output_buffer=*/GetAllocationSlice(*inst),
-        /*input_shape=*/lhs->shape(),
-        /*filter_shape=*/rhs->shape(),
-        /*output_shape=*/inst->shape(), inst->window(),
-        inst->convolution_dimension_numbers(), inst);
-  }
-
-  // Backward filter convolution, which takes the input (activations) and the
-  // gradients, and computes the filter.
-  CHECK_EQ(HloOpcode::kFusion, inst->opcode());
-  switch (inst->fusion_kind()) {
-    case HloInstruction::FusionKind::kConvBackwardFilter:
-      return MakeUnique<ConvolutionThunk>(
-          ConvolutionThunk::ConvolutionKind::kBackwardFilter,
-          /*input_buffer=*/GetAllocationSlice(*lhs),
-          /*filter_buffer=*/GetAllocationSlice(*inst),
-          /*output_buffer=*/GetAllocationSlice(*rhs),
-          /*input_shape=*/lhs->shape(),
-          /*filter_shape=*/inst->shape(),
-          /*output_shape=*/rhs->shape(), inst->window(),
-          inst->convolution_dimension_numbers(), inst);
-    case HloInstruction::FusionKind::kConvBackwardInput:
-      return MakeUnique<ConvolutionThunk>(
-          ConvolutionThunk::ConvolutionKind::kBackwardInput,
-          /*input_buffer=*/GetAllocationSlice(*inst),
-          /*filter_buffer=*/GetAllocationSlice(*rhs),
-          /*output_buffer=*/GetAllocationSlice(*lhs),
-          /*input_shape=*/inst->shape(),
-          /*filter_shape=*/rhs->shape(),
-          /*output_shape=*/lhs->shape(), inst->window(),
-          inst->convolution_dimension_numbers(), inst);
-    default:
-      LOG(FATAL) << "Not a convolution-fusion";
-  }
+  const HloInstruction* operand = inst->operand(0);
+  return MakeUnique<FftThunk>(inst->fft_type(), inst->fft_length(),
+                              /*input_buffer=*/GetAllocationSlice(*operand),
+                              /*output_buffer=*/GetAllocationSlice(*inst),
+                              /*input_shape=*/operand->shape(),
+                              /*output_shape=*/inst->shape(), inst);
 }
 
 Status IrEmitterUnnested::EmitInitializer(const HloInstruction* hlo,
@@ -1773,6 +2291,24 @@ Status IrEmitterUnnested::EmitInitializer(const HloInstruction* hlo,
 
 namespace {
 
+// Checks that the buffers corresponding to the given two HLOs share the same
+// allocation.
+Status CheckHloBuffersShareAllocation(
+    const HloInstruction* a, const HloInstruction* b, const ShapeIndex& index,
+    const BufferAssignment& buffer_assignment) {
+  const BufferAllocation::Slice slice_a =
+      buffer_assignment.GetUniqueSlice(a, index).ConsumeValueOrDie();
+  const BufferAllocation::Slice slice_b =
+      buffer_assignment.GetUniqueSlice(b, index).ConsumeValueOrDie();
+  if (slice_a != slice_b) {
+    return InternalError(
+        "instruction %s %s does not share allocation with instruction %s %s",
+        a->ToString().c_str(), slice_a.ToString().c_str(),
+        b->ToString().c_str(), slice_b.ToString().c_str());
+  }
+  return Status::OK();
+}
+
 // Checks that all buffers used during while loop iteration share the same
 // buffer allocation. This includes buffers for while result, while init
 // operand, condition parameter, body parameter and body result.
@@ -1782,37 +2318,65 @@ Status CheckWhileBuffersShareAllocation(
     const BufferAssignment& buffer_assignment) {
   return ShapeUtil::ForEachSubshapeWithStatus(
       xla_while->shape(),
-      [&buffer_assignment, &xla_while](const Shape& /*subshape*/,
-                                       const ShapeIndex& index) -> Status {
-        auto check = [&buffer_assignment](const HloInstruction* a,
-                                          const HloInstruction* b,
-                                          const ShapeIndex& index) -> Status {
-          const BufferAllocation::Slice slice_a =
-              buffer_assignment.GetUniqueSlice(a, index).ConsumeValueOrDie();
-          const BufferAllocation::Slice slice_b =
-              buffer_assignment.GetUniqueSlice(b, index).ConsumeValueOrDie();
-          if (slice_a != slice_b) {
-            return InternalError(
-                "instruction %s %s does not share allocation with "
-                "instruction %s %s",
-                a->ToString().c_str(), slice_a.ToString().c_str(),
-                b->ToString().c_str(), slice_b.ToString().c_str());
-          }
-          return Status::OK();
-        };
+      [&](const Shape& /*subshape*/, const ShapeIndex& index) -> Status {
         const HloInstruction* condition_parameter =
             xla_while->while_condition()->parameter_instruction(0);
         const HloComputation* body = xla_while->while_body();
         const HloInstruction* body_parameter = body->parameter_instruction(0);
         const HloInstruction* body_result = body->root_instruction();
-        TF_RETURN_IF_ERROR(check(xla_while, xla_while->operand(0), index));
-        TF_RETURN_IF_ERROR(check(xla_while, condition_parameter, index));
-        TF_RETURN_IF_ERROR(check(xla_while, body_parameter, index));
-        TF_RETURN_IF_ERROR(check(xla_while, body_result, index));
+        TF_RETURN_IF_ERROR(CheckHloBuffersShareAllocation(
+            xla_while, xla_while->operand(0), index, buffer_assignment));
+        TF_RETURN_IF_ERROR(CheckHloBuffersShareAllocation(
+            xla_while, condition_parameter, index, buffer_assignment));
+        TF_RETURN_IF_ERROR(CheckHloBuffersShareAllocation(
+            xla_while, body_parameter, index, buffer_assignment));
+        TF_RETURN_IF_ERROR(CheckHloBuffersShareAllocation(
+            xla_while, body_result, index, buffer_assignment));
         return Status::OK();
       });
 }
 
+// Checks that the buffers used in a conditional instruction are shared with the
+// operands and result as follows:
+//   * The result buffer of the conditional should share the allocation with the
+//     result buffers of the true and false computations.
+//   * The buffer of operand 1 should share the allocation with the buffer of
+//     the parameter 0 instruction of the true computation.
+//   * The buffer of operand 2 should share the allocation with the buffer of
+//     the parameter 0 instruction of the false computation.
+Status CheckConditionalBuffersShareAllocation(
+    const HloInstruction* conditional,
+    const BufferAssignment& buffer_assignment) {
+  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+      conditional->shape(),
+      [&](const Shape& /*subshape*/, const ShapeIndex& index) -> Status {
+        TF_RETURN_IF_ERROR(CheckHloBuffersShareAllocation(
+            conditional, conditional->true_computation()->root_instruction(),
+            index, buffer_assignment));
+        TF_RETURN_IF_ERROR(CheckHloBuffersShareAllocation(
+            conditional, conditional->false_computation()->root_instruction(),
+            index, buffer_assignment));
+        return Status::OK();
+      }));
+  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+      conditional->operand(1)->shape(),
+      [&](const Shape& /*subshape*/, const ShapeIndex& index) -> Status {
+        return CheckHloBuffersShareAllocation(
+            conditional->operand(1),
+            conditional->true_computation()->parameter_instruction(0), index,
+            buffer_assignment);
+      }));
+  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+      conditional->operand(2)->shape(),
+      [&](const Shape& /*subshape*/, const ShapeIndex& index) -> Status {
+        return CheckHloBuffersShareAllocation(
+            conditional->operand(2),
+            conditional->false_computation()->parameter_instruction(0), index,
+            buffer_assignment);
+      }));
+  return Status::OK();
+}
+
 }  // namespace
 
 std::unique_ptr<Thunk> IrEmitterUnnested::BuildWhileThunk(
@@ -1855,9 +2419,36 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildForThunk(
                               ir_emitter_body.ConsumeThunkSequence(), hlo);
 }
 
+std::unique_ptr<Thunk> IrEmitterUnnested::BuildConditionalThunk(
+    const HloInstruction* hlo) {
+  // Check that the buffers used in conditional are shared with the operands and
+  // result appropriately.
+  TF_CHECK_OK(CheckConditionalBuffersShareAllocation(
+      hlo, ir_emitter_context_->buffer_assignment()));
+
+  HloComputation* true_computation = hlo->true_computation();
+  IrEmitterUnnested ir_emitter_true(hlo_module_config_, true_computation,
+                                    ir_emitter_context_);
+  TF_CHECK_OK(true_computation->root_instruction()->Accept(&ir_emitter_true));
+
+  HloComputation* false_computation = hlo->false_computation();
+  IrEmitterUnnested ir_emitter_false(hlo_module_config_, false_computation,
+                                     ir_emitter_context_);
+  TF_CHECK_OK(false_computation->root_instruction()->Accept(&ir_emitter_false));
+
+  return MakeUnique<ConditionalThunk>(
+      GetAllocationSlice(*hlo->operand(0)),
+      GetAllocationSlice(*hlo->operand(1)),
+      GetAllocationSlice(*hlo->operand(2)),
+      std::move(*ir_emitter_true.ConsumeThunkSequence()),
+      std::move(*ir_emitter_false.ConsumeThunkSequence()), hlo);
+}
+
 Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
     const HloInstruction& hlo,
     const llvm_ir::ElementGenerator& element_generator, KernelThunk* thunk) {
+  VLOG(3) << bindings_.ToString();
+
   const Shape& element_shape = hlo.IsMultiOutputFusion()
                                    ? ShapeUtil::GetSubshape(hlo.shape(), {0})
                                    : hlo.shape();
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
new file mode 100644
index 0000000000000000000000000000000000000000..688760efbd2c725a4bf48e45eb6f2734b63d25e1
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -0,0 +1,205 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_UNNESTED_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_UNNESTED_H_
+
+#include "tensorflow/compiler/xla/service/gpu/ir_emitter.h"
+#include "tensorflow/compiler/xla/service/gpu/thunk.h"
+
+namespace xla {
+namespace gpu {
+
+// Emits LLVM IR for an "unnested computation".
+//
+// An unnested computation is an HloComputation which you run by executing one
+// or more kernels for each HloInstruction it contains.  Examples of unnested
+// computations:
+//
+//  - An HloModule's root computation,
+//  - The body of an HLO while loop,
+//  - The true/false computation of an HLO conditional.
+//
+// Note the opportunity for confusion -- the while loop's computation is nested
+// within the root computation, but it's emitted using IrEmitterUnnested!  Don't
+// think about it too hard.
+//
+// Examples of things that are not unnested computations:
+//
+//  - The reducer of a kReduce HLO.  This is emited using IrEmitterNested.
+//  - The body of a fusion node.  IrEmitterUnenested emits the relevant code
+//    within a kernel function using FusedIrEmitter.  (FusedIrEmitter is not
+//    really an IrEmitter, but is more an "IR generator generator".)
+//
+class IrEmitterUnnested : public IrEmitter {
+ public:
+  IrEmitterUnnested(const HloModuleConfig& hlo_module_config,
+                    const HloComputation* hlo_computation,
+                    IrEmitterContext* ir_emitter_context);
+  IrEmitterUnnested(const IrEmitterUnnested&) = delete;
+  IrEmitterUnnested& operator=(const IrEmitterUnnested&) = delete;
+
+  // Transfers the ownship of thunk_sequence_ out.
+  std::unique_ptr<ThunkSequence> ConsumeThunkSequence() {
+    return std::move(thunk_sequence_);
+  }
+
+  Status DefaultAction(HloInstruction* hlo) override;
+
+  // IrEmitterUnnested handles the following instructions differently from
+  // IrEmitter.
+  Status HandleCopy(HloInstruction* copy) override;
+  Status HandleConditional(HloInstruction* conditional) override;
+  Status HandleConvolution(HloInstruction* convolution) override;
+  Status HandleCustomCall(HloInstruction* custom_call) override;
+  Status HandleDot(HloInstruction* dot) override;
+  Status HandleFft(HloInstruction* fft) override;
+  Status HandleFusion(HloInstruction* fusion) override;
+  Status HandleGetTupleElement(HloInstruction* get_tuple_element) override;
+  Status HandleReduce(HloInstruction* reduce) override;
+  Status HandleSelectAndScatter(HloInstruction* instruction) override;
+  Status HandleTuple(HloInstruction* tuple) override;
+  Status HandleWhile(HloInstruction* xla_while) override;
+  Status HandleInfeed(HloInstruction* xla_infeed) override;
+  Status HandleRng(HloInstruction* random) override;
+  Status HandleSelect(HloInstruction* select) override;
+
+  Status EmitTargetElementLoop(
+      const HloInstruction& hlo,
+      const llvm_ir::ElementGenerator& body_emitter) override;
+
+  // Same as `EmitTargetElementLoop`, but in given `thunk` rather than
+  // `LastThunk()`.
+  Status EmitTargetElementLoopInThunk(
+      const HloInstruction& hlo, const llvm_ir::ElementGenerator& body_emitter,
+      KernelThunk* thunk);
+
+ private:
+  // Builds the appropriate thunk for the instruction hlo and returns the owning
+  // pointer to it. The caller needs to make sure `inst` outlives the lifetime
+  // of the returned Thunk object.
+  std::unique_ptr<Thunk> BuildThunk(const HloInstruction* hlo);
+
+  // Builds the prototype of the IR kernel for `inst` and adds it to the module.
+  // This kernel takes as arguments pointers to the given buffer allocations.
+  llvm::Function* BuildKernelPrototype(
+      const HloInstruction& inst,
+      tensorflow::gtl::ArraySlice<const BufferAllocation*> args);
+
+  // EmitColumnReduction and EmitRowReduction emit code for column and row
+  // reduction of a matrix and/or 3D tensor. Row and column reduction have
+  // different memory access pattern, so for performance their implementations
+  // are significantly different.
+  //
+  // Emits code that reduces a matrix of shape [height x width] to a vector of
+  // [width]. Other parameters have the same meaning as those of
+  // `EmitReductionToVector`. Note that input shape might not be
+  // [height x width], but can be bitcast to [height x weight] with "height"
+  // being the major dimension.
+  Status EmitColumnReduction(int64 height, int64 width, HloInstruction* reduce,
+                             const Shape& input_shape,
+                             const llvm_ir::ElementGenerator& input_gen,
+                             const llvm_ir::ElementGenerator& init_value_gen,
+                             HloComputation* reducer);
+
+  // Emits code that reduces a 3D tensor of shape [depth x height x width] to a
+  // vector of shape [height]. Other parameters have the same meaning as those
+  // of `EmitReductionToVector`. Note that input shape might not be
+  // [depth x height x width], but can be bitcast to [depth x height x weight]
+  // with "depth" being the most major dimension.
+  Status EmitRowReduction(int64 depth, int64 height, int64 width,
+                          HloInstruction* reduce, const Shape& input_shape,
+                          const llvm_ir::ElementGenerator& input_gen,
+                          const llvm_ir::ElementGenerator& init_value_gen,
+                          HloComputation* reducer);
+
+  // Emits code that reduces a tensor of arbitrary rank to a scalar.
+  Status EmitReductionToScalar(HloInstruction* reduce, const Shape& input_shape,
+                               const llvm_ir::ElementGenerator& input_gen,
+                               const llvm_ir::ElementGenerator& init_value_gen,
+                               HloComputation* reducer);
+
+  // Figures out whether `reduce` is a row or column reduction, and which
+  // dimensions to reduce, and calls either `EmitRowReduction` or
+  // `EmitColumnReduction` as appropriate. `input_shape` is the shape of the
+  // input array, which is the operand of the Reduce instruction if unfused or
+  // of the Fusion instruction if fused. `input_gen` and `init_value_gen`
+  // generate elements of the input and the initial value. Other parameters mean
+  // the same as for `HandleReduce`.
+  //
+  // Prerequisite: `IsReductionToVector(*reduce)`
+  Status EmitReductionToVector(
+      HloInstruction* reduce, const Shape& input_shape,
+      const llvm_ir::ElementGenerator& input_gen,
+      const llvm_ir::ElementGenerator& init_value_gen,
+      tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce,
+      HloComputation* reducer);
+
+  // Emits code to initialize buffer of `inst` in given `thunk`.
+  Status EmitInitializer(const HloInstruction* inst, KernelThunk* thunk);
+
+  // Returns a KernelThunk that invokes the kernel emitted for `inst`. The
+  // caller needs to make sure `inst` outlives the lifetime of the returned
+  // Thunk object.
+  std::unique_ptr<Thunk> BuildKernelThunk(const HloInstruction* inst);
+
+  // Returns a FftThunk that calls cuFFT to implement `inst`.
+  std::unique_ptr<Thunk> BuildFftThunk(const HloInstruction* inst);
+
+  // Returns a GemmThunk that calls gemm to implement `inst`. The caller needs
+  // to make sure `inst` outlives the lifetime of the returned Thunk object.
+  std::unique_ptr<Thunk> BuildGemmThunk(const HloInstruction* inst);
+
+  // Returns a thunk that calls host-to-device cuMemcpy to implement `inst`.
+  std::unique_ptr<Thunk> BuildHostToDeviceCopyThunk(const HloInstruction* inst);
+
+  // Returns a thunk that calls device-to-device cuMemcpy to implement `inst`.
+  std::unique_ptr<Thunk> BuildDeviceToDeviceCopyThunk(
+      const HloInstruction* inst);
+
+  // Returns an InfeedThunk that performs device-to-device memcpy to implement
+  // `inst`.
+  std::unique_ptr<Thunk> BuildInfeedThunk(const HloInstruction* inst);
+
+  // Returns a WhileThunk that invokes thunk sequences for 'condition' and
+  // 'body' sub-computations of while instruction 'hlo'.
+  std::unique_ptr<Thunk> BuildWhileThunk(const HloInstruction* hlo);
+
+  // Returns a ForThunk which executes 'loop_limit' invocations of a thunk
+  // sequence from the 'body' sub-computation of the while instruction 'hlo'.
+  std::unique_ptr<Thunk> BuildForThunk(const HloInstruction* hlo,
+                                       const int64 loop_limit);
+
+  // Returns a ConditionalThunk that executes the thunk sequence for
+  // 'true_computation' or 'false_computation' depending on the value of the
+  // predicate in the given conditional instruction.
+  std::unique_ptr<Thunk> BuildConditionalThunk(const HloInstruction* hlo);
+
+  Status Postprocess(HloInstruction* hlo) override;
+
+  // Returns the last generated thunk.
+  Thunk* LastThunk() const { return thunk_sequence_->back().get(); }
+
+  // The thunk sequence this IrEmitter generates for the input computation.
+  std::unique_ptr<ThunkSequence> thunk_sequence_;
+
+  // The HloComputation that this IrEmitter emits code for.
+  const HloComputation* hlo_computation_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_UNNESTED_H_
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
index 96606993696354f36e143b3b994bbe6afb902df3..c20a781a33fe89af4740ed31dd5bfb1a64473057 100644
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
@@ -29,10 +29,10 @@ namespace xla {
 namespace gpu {
 
 KernelThunk::KernelThunk(
-    tensorflow::gtl::ArraySlice<BufferAllocation::Slice> io_buffers,
+    tensorflow::gtl::ArraySlice<const BufferAllocation*> args,
     const string& kernel_name, const HloInstruction* hlo_instruction)
     : Thunk(Kind::kKernel, hlo_instruction),
-      io_buffers_(io_buffers.begin(), io_buffers.end()),
+      args_(args.begin(), args.end()),
       kernel_name_(kernel_name) {}
 
 tensorflow::Status KernelThunk::Initialize(const GpuExecutable& executable) {
@@ -42,7 +42,7 @@ tensorflow::Status KernelThunk::Initialize(const GpuExecutable& executable) {
     return tensorflow::Status::OK();
   }
 
-  loader_spec_.reset(new se::MultiKernelLoaderSpec(io_buffers_.size() + 1));
+  loader_spec_.reset(new se::MultiKernelLoaderSpec(args_.size()));
   tensorflow::StringPiece ptx = executable.ptx();
   // Convert tensorflow::StringPiece to se::port::StringPiece because
   // StreamExecutor uses the latter.
@@ -81,15 +81,16 @@ tensorflow::Status KernelThunk::ExecuteOnStream(
     kernel = &it->second;
   }
 
+  VLOG(3) << "Launching " << kernel->name();
   // Launch the kernel with potentially multiple blocks and threads.
   static constexpr int kKernelArgsLimit = 1024;
   auto kernel_args = MakeUnique<se::KernelArgsArray<kKernelArgsLimit>>();
-  for (const BufferAllocation::Slice io_buffer : io_buffers_) {
-    kernel_args->add_device_memory_argument(
-        buffer_allocations.GetDeviceAddress(io_buffer));
+  for (const BufferAllocation* arg : args_) {
+    const auto& buf = buffer_allocations.GetDeviceAddress(arg->index());
+    kernel_args->add_device_memory_argument(buf);
+    VLOG(3) << "  Arg: alloc #" << arg->index() << ": " << buf.opaque() << " ("
+            << buf.size() << "B)";
   }
-  kernel_args->add_device_memory_argument(
-      buffer_allocations.GetTempBufferBase());
   if (!stream->parent()->Launch(
           stream, se::ThreadDim(launch_dimensions.threads_per_block()),
           se::BlockDim(launch_dimensions.block_count()), *kernel,
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
index 350b5aaf360b0dad7f7b04d73f4c32bad55d3ce9..9ae455e2fcc253a7a08ff95764721048a16b0bf7 100644
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
@@ -46,7 +46,7 @@ class KernelThunk : public Thunk {
   // Constructs a thunk for the given kernel.
   //
   // `hlo_instruction` is as in Thunk. Other arguments are as the class members.
-  KernelThunk(tensorflow::gtl::ArraySlice<BufferAllocation::Slice> io_buffers,
+  KernelThunk(tensorflow::gtl::ArraySlice<const BufferAllocation*> args,
               const string& kernel_name, const HloInstruction* hlo_instruction);
   KernelThunk(const KernelThunk&) = delete;
   KernelThunk& operator=(const KernelThunk&) = delete;
@@ -63,8 +63,8 @@ class KernelThunk : public Thunk {
       perftools::gputools::Stream* stream) override;
 
  private:
-  // The indices of the input/output buffers.
-  const std::vector<BufferAllocation::Slice> io_buffers_;
+  // Buffers passed to the kernel as arguments.
+  const std::vector<const BufferAllocation*> args_;
 
   // Entry kernel name for the computation.
   const string kernel_name_;
diff --git a/tensorflow/compiler/xla/service/gpu/layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/layout_assignment.cc
deleted file mode 100644
index d475c4171b56ceedf5fdbda8b4d6221af844261c..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/gpu/layout_assignment.cc
+++ /dev/null
@@ -1,153 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/gpu/layout_assignment.h"
-
-#include <memory>
-
-#include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/errors.h"
-
-namespace xla {
-namespace gpu {
-
-Status GpuLayoutAssignment::AddBackendConstraints(
-    LayoutConstraints* constraints) {
-  for (auto* instruction : constraints->computation()->instructions()) {
-    // cuDNN is called with specific layouts on the input, output, and filter:
-    //
-    //   input: DataLayout::kBatchDepthYX
-    //   output: DataLayout::kBatchDepthYX
-    //   filter: FilterLayout::kOutputInputYX
-    //
-    // The order dimensions in the constant name is major-to-minor (eg, the
-    // most-major dimension of the input is batch, most-minor is X). The
-    // specific dimension numbers these named dimensions correspond to is
-    // determined by the ConvolutionDimensionNumbers argument. Y is spatial
-    // dimension 0, and X is spatial dimension 1.
-    //
-    // TODO(b/29399649): Be more flexible about handling layouts of cuDNN calls.
-    if (ImplementedAsDnnConvolution(*instruction)) {
-      HloInstruction* input = nullptr;
-      HloInstruction* filter = nullptr;
-      HloInstruction* output = nullptr;
-      if (instruction->opcode() == HloOpcode::kConvolution) {
-        input = instruction->mutable_operand(0);
-        filter = instruction->mutable_operand(1);
-        output = instruction;
-      } else {
-        CHECK_EQ(HloOpcode::kFusion, instruction->opcode());
-        switch (instruction->fusion_kind()) {
-          case HloInstruction::FusionKind::kConvBackwardFilter:
-            // filter = BackwardFilterConvolve(input, output)
-            input = instruction->mutable_operand(0);
-            filter = instruction;
-            output = instruction->mutable_operand(1);
-            break;
-          case HloInstruction::FusionKind::kConvBackwardInput:
-            // input = BackwardInputConvolve(output, filter)
-            input = instruction;
-            filter = instruction->mutable_operand(1);
-            output = instruction->mutable_operand(0);
-            break;
-          default:
-            LOG(FATAL) << "Not a convolution-fusion";
-        }
-      }
-
-      // Construct minor-to-major dimension orders for operands and result.
-      // cuDNN's convolution APIs support the BDYX layout for activations/output
-      // and the OIYX layout for weights.
-      // TODO(b/29399649): Be more flexible about handling layouts of cuDNN
-      // calls after we switch to cuDNN v5.
-      const ConvolutionDimensionNumbers& dimension_numbers =
-          instruction->convolution_dimension_numbers();
-      std::vector<int64> input_layout;
-      for (int i = dimension_numbers.input_spatial_dimensions_size() - 1;
-           i >= 0; --i) {
-        input_layout.push_back(dimension_numbers.input_spatial_dimensions(i));
-      }
-      input_layout.push_back(dimension_numbers.input_feature_dimension());
-      input_layout.push_back(dimension_numbers.input_batch_dimension());
-      Shape input_shape(input->shape());
-      *input_shape.mutable_layout() = LayoutUtil::MakeLayout(input_layout);
-
-      std::vector<int64> filter_layout;
-      for (int i = dimension_numbers.kernel_spatial_dimensions_size() - 1;
-           i >= 0; --i) {
-        filter_layout.push_back(dimension_numbers.kernel_spatial_dimensions(i));
-      }
-      filter_layout.push_back(
-          dimension_numbers.kernel_input_feature_dimension());
-      filter_layout.push_back(
-          dimension_numbers.kernel_output_feature_dimension());
-      Shape filter_shape(filter->shape());
-      *filter_shape.mutable_layout() = LayoutUtil::MakeLayout(filter_layout);
-
-      std::vector<int64> output_layout;
-      for (int i = dimension_numbers.output_spatial_dimensions_size() - 1;
-           i >= 0; --i) {
-        output_layout.push_back(dimension_numbers.output_spatial_dimensions(i));
-      }
-      output_layout.push_back(dimension_numbers.output_feature_dimension());
-      output_layout.push_back(dimension_numbers.output_batch_dimension());
-      Shape output_shape(output->shape());
-      *output_shape.mutable_layout() = LayoutUtil::MakeLayout(output_layout);
-
-      // Set layouts of the instructions' shapes.
-      if (instruction->opcode() == HloOpcode::kConvolution) {
-        TF_RETURN_IF_ERROR(
-            constraints->SetOperandLayout(input_shape, output, 0));
-        TF_RETURN_IF_ERROR(
-            constraints->SetOperandLayout(filter_shape, output, 1));
-        TF_RETURN_IF_ERROR(
-            constraints->SetInstructionLayout(output_shape, output));
-      } else {
-        CHECK_EQ(HloOpcode::kFusion, instruction->opcode());
-        switch (instruction->fusion_kind()) {
-          case HloInstruction::FusionKind::kConvBackwardFilter:
-            // filter = BackwardFilterConvolve(input, output)
-            TF_RETURN_IF_ERROR(
-                constraints->SetOperandLayout(input_shape, filter, 0));
-            TF_RETURN_IF_ERROR(
-                constraints->SetInstructionLayout(filter_shape, filter));
-            TF_RETURN_IF_ERROR(
-                constraints->SetOperandLayout(output_shape, filter, 1));
-            break;
-          case HloInstruction::FusionKind::kConvBackwardInput:
-            // input = BackwardInputConvolve(output, filter)
-            TF_RETURN_IF_ERROR(
-                constraints->SetInstructionLayout(input_shape, input));
-            TF_RETURN_IF_ERROR(
-                constraints->SetOperandLayout(output_shape, input, 0));
-            TF_RETURN_IF_ERROR(
-                constraints->SetOperandLayout(filter_shape, input, 1));
-            break;
-          default:
-            LOG(FATAL) << "Not a convolution-fusion";
-        }
-      }
-    }
-  }
-  return Status::OK();
-}
-
-}  // namespace gpu
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/layout_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/layout_assignment_test.cc
deleted file mode 100644
index ac206b89d329d7e4ac91ee51162c9694f6899d78..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/gpu/layout_assignment_test.cc
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/gpu/layout_assignment.h"
-
-#include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/service/computation_layout.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/shape_layout.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-
-namespace xla {
-namespace gpu {
-namespace {
-
-using LayoutAssignmentTest = HloTestBase;
-
-TEST_F(LayoutAssignmentTest, Elementwise) {
-  Shape ashape = ShapeUtil::MakeShape(F32, {42, 12});
-  Shape ashape_in_row_major(ashape);
-  Shape ashape_in_col_major(ashape);
-  *ashape_in_row_major.mutable_layout() = LayoutUtil::MakeLayout({1, 0});
-  *ashape_in_col_major.mutable_layout() = LayoutUtil::MakeLayout({0, 1});
-
-  // Enumerate all possible combinations of layouts.
-  for (const Shape& lhs_shape_with_layout :
-       {ashape_in_row_major, ashape_in_col_major}) {
-    for (const Shape& rhs_shape_with_layout :
-         {ashape_in_row_major, ashape_in_col_major}) {
-      for (const Shape& result_shape_with_layout :
-           {ashape_in_row_major, ashape_in_col_major}) {
-        // GpuLayoutAssignment should assign the same layout to "add" and its
-        // two operands.
-        auto builder = HloComputation::Builder(TestName());
-        auto x = builder.AddInstruction(
-            HloInstruction::CreateParameter(0, ashape, "x"));
-        auto y = builder.AddInstruction(
-            HloInstruction::CreateParameter(1, ashape, "y"));
-        auto add = builder.AddInstruction(
-            HloInstruction::CreateBinary(ashape, HloOpcode::kAdd, x, y));
-        auto module = CreateNewModule();
-        HloComputation* computation =
-            module->AddEntryComputation(builder.Build(add));
-
-        ComputationLayout computation_layout(
-            computation->ComputeProgramShape());
-        *computation_layout.mutable_parameter_layout(0) =
-            ShapeLayout(lhs_shape_with_layout);
-        *computation_layout.mutable_parameter_layout(1) =
-            ShapeLayout(rhs_shape_with_layout);
-        *computation_layout.mutable_result_layout() =
-            ShapeLayout(result_shape_with_layout);
-
-        GpuLayoutAssignment layout_assignment(&computation_layout);
-        EXPECT_TRUE(layout_assignment.Run(module.get()).ValueOrDie());
-
-        for (const HloInstruction* operand : add->operands()) {
-          EXPECT_TRUE(LayoutUtil::Equal(add->shape().layout(),
-                                        operand->shape().layout()));
-        }
-      }
-    }
-  }
-}
-
-}  // namespace
-}  // namespace gpu
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
index 059943d48cd34b0ac487b91c3f3079ee3f761229..cfabae791d26d0eb49826085ad7ad166a19109a1 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
@@ -440,7 +440,7 @@ StatusOr<string> CompileModuleToPtx(llvm::Module* module,
 
 // One-time module initializer.
 // Must be called only once -- DO NOT CALL DIRECTLY.
-void GPUBackendInit() {
+void GPUBackendInit(const HloModuleConfig& hlo_module_config) {
   // Feed all customized flags here, so we can override them with llvm_cl_opts
   // without redeploy the compiler for development purpose.
 
@@ -466,6 +466,8 @@ void GPUBackendInit() {
   // between those loads.
   FeedLLVMWithFlags({"-memdep-block-scan-limit=500"});
 
+  llvm_ir::InitializeLLVMCommandLineOptions(hlo_module_config);
+
   // Initialize the NVPTX target; it's the only target we link with, so call its
   // specific initialization functions instead of the catch-all InitializeAll*.
   LLVMInitializeNVPTXTarget();
@@ -485,7 +487,7 @@ StatusOr<string> CompileToPtx(llvm::Module* module,
                               const HloModuleConfig& hlo_module_config,
                               const string& libdevice_dir_path) {
   static std::once_flag backend_init_flag;
-  std::call_once(backend_init_flag, GPUBackendInit);
+  std::call_once(backend_init_flag, GPUBackendInit, hlo_module_config);
 
   string ptx;
   {
diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
index 11290eda4ffcd579c03acd531b493bb7b1d34ed4..25846dc6cd4633c7becb6e62d6bc9585348a6eac 100644
--- a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
+++ b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
@@ -27,8 +27,8 @@ namespace gpu {
 
 namespace {
 bool IsForwardConvolutionCanonical(const HloInstruction& conv) {
-  CHECK_EQ(HloOpcode::kConvolution, conv.opcode());
-  return window_util::HasEvenPadding(conv.window()) &&
+  CHECK_EQ(conv.custom_call_target(), kCudnnConvForwardCallTarget);
+  return window_util::HasSymmetricPadding(conv.window()) &&
          !window_util::HasNegativePadding(conv.window()) &&
          !window_util::HasDilation(conv.window());
 }
@@ -43,10 +43,16 @@ HloInstruction* MaybePaddedAndSlicedInput(
     const Window& conv_window, const ConvolutionDimensionNumbers& conv_dnums,
     HloInstruction* input) {
   HloComputation* computation = input->parent();
-  if (!window_util::HasEvenPadding(conv_window) ||
+  if (!window_util::HasSymmetricPadding(conv_window) ||
       window_util::HasBaseDilation(conv_window)) {
     // If padding is uneven or has dilation, we insert a kPad instruction that
     // applies positive padding and dilation.
+    //
+    // TODO(phawkins): If conv_window has asymmetric padding, perhaps instead of
+    // moving all the padding into an explicit pad op, we should keep as much
+    // padding inside of cudnn as possible, on the assumption that padding
+    // within cudnn is basically free, whereas a kPad's cost increases as the
+    // amount of padding increases.
     PaddingConfig padding_config =
         MakeNoPaddingConfig(input->shape().dimensions_size());
     for (size_t i = 0; i < conv_dnums.input_spatial_dimensions().size(); ++i) {
@@ -167,14 +173,17 @@ bool PadInsertion::CanonicalizeForwardConvolution(HloInstruction* conv) {
     dim->set_window_dilation(1);
   }
 
+  // The conv CustomCall returns a tuple (conv_result, scratch_buffer).  Extract
+  // out the shape of conv_result.
+  Shape old_conv_shape = conv->shape().tuple_shapes(0);
+
   VLOG(1) << "Canonicalizing forward conv";
-  auto new_conv = HloInstruction::CreateConvolve(
-      conv->shape(), new_input, new_kernel, new_conv_window,
-      conv->convolution_dimension_numbers());
+  auto new_conv = CreateCudnnConvForward(old_conv_shape, new_input, new_kernel,
+                                         new_conv_window,
+                                         conv->convolution_dimension_numbers());
   VLOG(1) << "Replacing:\n  " << conv->ToString() << "\nwith:\n  "
           << new_conv->ToString();
-  TF_CHECK_OK(
-      conv->parent()->ReplaceWithNewInstruction(conv, std::move(new_conv)));
+  TF_CHECK_OK(conv->parent()->ReplaceInstruction(conv, new_conv));
   return true;
 }
 
@@ -190,7 +199,9 @@ void IncreasePaddingHighBy(int64 delta, WindowDimension* window_dim) {
 
 bool PadInsertion::CanonicalizeBackwardFilterConvolution(
     HloInstruction* backward_conv) {
-  if (window_util::HasEvenPadding(backward_conv->window())) {
+  CHECK_EQ(backward_conv->custom_call_target(),
+           kCudnnConvBackwardFilterCallTarget);
+  if (window_util::HasSymmetricPadding(backward_conv->window())) {
     return false;
   }
 
@@ -202,16 +213,11 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution(
   //   ABCD0 = Pad(ABCD, padding_high=1)
   //   BackwardFilterConv(ABCD0, xyz, padding_low=pading_high=1)
   // We choose the lesser of padding_low and padding_high as the new padding.
-  HloInstruction* transpose = backward_conv->fused_expression_root();
-  HloInstruction* forward_conv = transpose->mutable_operand(0);
   HloInstruction* input = backward_conv->mutable_operand(0);
-  Window new_forward_conv_window = forward_conv->window();
   Window new_backward_conv_window = backward_conv->window();
   // input_padding_config is the config of the kPad to be inserted.
   PaddingConfig input_padding_config =
       MakeNoPaddingConfig(ShapeUtil::Rank(input->shape()));
-  ConvolutionDimensionNumbers forward_conv_dnums =
-      forward_conv->convolution_dimension_numbers();
   ConvolutionDimensionNumbers backward_conv_dnums =
       backward_conv->convolution_dimension_numbers();
   for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) {
@@ -223,11 +229,7 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution(
       // cuDNN convolution (which doesn't support negative padding) to fail.
       return false;
     }
-    // If the backward convolution has uneven padding on the activations, we
-    // move some padding on the larger end to "internal" padding, so that the
-    // backward convolution produces larger weight gradients which get sliced
-    // later. Therefore, the amount of new padding (low or high) is the minimum
-    // of the amount of old padding low and old padding high.
+    // Compute the new, even padding for the backward conv operation.
     int64 new_conv_padding = std::min(padding_low, padding_high);
     int64 dim = backward_conv_dnums.input_spatial_dimensions(i);
     input_padding_config.mutable_dimensions(dim)->set_edge_padding_low(
@@ -238,14 +240,9 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution(
     // Since we move some padding from the backward convolution to the kPad, we
     // need to accordingly reduce the padding amount of the backward convolution
     // and its inner forward convolution.
-    IncreasePaddingLowBy(-(padding_low - new_conv_padding),
-                         new_backward_conv_window.mutable_dimensions(i));
-    IncreasePaddingHighBy(-(padding_high - new_conv_padding),
-                          new_backward_conv_window.mutable_dimensions(i));
-    IncreasePaddingLowBy(-(padding_low - new_conv_padding),
-                         new_forward_conv_window.mutable_dimensions(i));
-    IncreasePaddingHighBy(-(padding_high - new_conv_padding),
-                          new_forward_conv_window.mutable_dimensions(i));
+    auto* new_dim = new_backward_conv_window.mutable_dimensions(i);
+    new_dim->set_padding_low(new_conv_padding);
+    new_dim->set_padding_high(new_conv_padding);
   }
 
   // Create a new backward convolution replacing the old one.
@@ -261,28 +258,12 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution(
               .ConsumeValueOrDie(),
           input, padding, input_padding_config));
 
-  HloInstruction* new_forward_conv =
-      computation->AddInstruction(HloInstruction::CreateConvolve(
-          ShapeInference::InferConvolveShape(
-              padded_input->shape(), output->shape(), new_forward_conv_window,
-              forward_conv_dnums)
-              .ConsumeValueOrDie(),
-          padded_input, output, new_forward_conv_window, forward_conv_dnums));
-
-  HloInstruction* new_transpose =
-      computation->AddInstruction(HloInstruction::CreateTranspose(
-          ShapeInference::InferTransposeShape(new_forward_conv->shape(),
-                                              transpose->dimensions())
-              .ConsumeValueOrDie(),
-          new_forward_conv, transpose->dimensions()));
-
-  // Fuse the new forward convolution and the new transpose to the new backward
-  // convolution.
-  HloInstruction* new_backward_conv =
-      computation->CreateFusionInstructionForBackwardConvolution(
-          {new_transpose, new_forward_conv},
-          HloInstruction::FusionKind::kConvBackwardFilter,
-          new_backward_conv_window, backward_conv_dnums);
+  // The shape of the backward_conv CustomCall is a tuple (conv_result,
+  // scratch_buffer).  Extract out the shape of conv_result.
+  Shape backward_conv_shape = backward_conv->shape().tuple_shapes(0);
+  HloInstruction* new_backward_conv = CreateCudnnConvBackwardFilter(
+      backward_conv_shape, padded_input, output, new_backward_conv_window,
+      backward_conv_dnums);
 
   VLOG(1) << "Canonicalizing backward filter conv";
   VLOG(1) << "Replacing:\n  " << backward_conv->ToString() << "\nwith:\n  "
@@ -295,18 +276,19 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution(
 
 bool PadInsertion::CanonicalizeBackwardInputConvolution(
     HloInstruction* backward_conv) {
-  if (window_util::HasEvenPadding(backward_conv->window())) {
+  if (window_util::HasSymmetricPadding(backward_conv->window())) {
     return false;
   }
 
-  HloInstruction* forward_conv = backward_conv->fused_expression_root();
-  HloInstruction* reverse_filter = forward_conv->mutable_operand(1);
-  Window new_forward_conv_window = forward_conv->window();
   Window new_backward_conv_window = backward_conv->window();
-  ConvolutionDimensionNumbers forward_conv_dnums =
-      forward_conv->convolution_dimension_numbers();
   ConvolutionDimensionNumbers backward_conv_dnums =
       backward_conv->convolution_dimension_numbers();
+
+  // The backward_conv CustomCall returns a tuple (conv_result, scratch_memory).
+  // Get the shape of conv_result.
+  Shape backward_conv_shape = backward_conv->shape().tuple_shapes(0);
+
+  Shape new_backward_conv_shape = backward_conv_shape;
   for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) {
     int64 padding_low = backward_conv->window().dimensions(i).padding_low();
     int64 padding_high = backward_conv->window().dimensions(i).padding_high();
@@ -325,41 +307,38 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution(
     // where the amount of padding low is larger, we can canonicalize it to
     //   [B A] = BackwardInputConvolve([a b], [x y z], padding=(low=1,high=1))
     //   [A] = Slice([B A])
-    // For consistency, we need to increase the low padding of the inner
-    // convolution by 1 as well because the input is larger now.
     if (padding_low > padding_high) {
       IncreasePaddingLowBy(padding_high - padding_low,
                            new_backward_conv_window.mutable_dimensions(i));
-      IncreasePaddingLowBy(padding_low - padding_high,
-                           new_forward_conv_window.mutable_dimensions(i));
     } else if (padding_low < padding_high) {
       IncreasePaddingHighBy(padding_low - padding_high,
                             new_backward_conv_window.mutable_dimensions(i));
-      IncreasePaddingHighBy(padding_high - padding_low,
-                            new_forward_conv_window.mutable_dimensions(i));
     }
+    // Decreasing the padding by X *increases* the size of our output by X.
+    int64 dim = backward_conv_dnums.output_spatial_dimensions(i);
+    new_backward_conv_shape.set_dimensions(
+        dim, new_backward_conv_shape.dimensions(dim) +
+                 std::abs(padding_low - padding_high));
   }
 
   // Create a new backward convolution replacing the old one.
   HloComputation* computation = backward_conv->parent();
   HloInstruction* output = backward_conv->mutable_operand(0);
   HloInstruction* filter = backward_conv->mutable_operand(1);
-  HloInstruction* new_reverse_filter =
-      computation->AddInstruction(HloInstruction::CreateReverse(
-          filter->shape(), filter, reverse_filter->dimensions()));
-  HloInstruction* new_forward_conv =
-      computation->AddInstruction(HloInstruction::CreateConvolve(
-          ShapeInference::InferConvolveShape(
-              output->shape(), new_reverse_filter->shape(),
-              new_forward_conv_window, forward_conv_dnums)
-              .ConsumeValueOrDie(),
-          output, new_reverse_filter, new_forward_conv_window,
-          forward_conv_dnums));
+
+  HloInstruction* new_backward_conv_call = CreateCudnnConvBackwardInput(
+      new_backward_conv_shape, output, filter, new_backward_conv_window,
+      backward_conv_dnums);
+
+  // The CustomCall created above returns a tuple (conv_result, scratch_memory).
+  // Extract out the two elements.
   HloInstruction* new_backward_conv =
-      computation->CreateFusionInstructionForBackwardConvolution(
-          {new_forward_conv, new_reverse_filter},
-          HloInstruction::FusionKind::kConvBackwardInput,
-          new_backward_conv_window, backward_conv_dnums);
+      computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+          new_backward_conv_shape, new_backward_conv_call, 0));
+  HloInstruction* new_backward_conv_scratch =
+      computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+          new_backward_conv_call->shape().tuple_shapes(1),
+          new_backward_conv_call, 1));
 
   // Slice the new backward convolution.
   //
@@ -387,22 +366,25 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution(
   }
 
   // Replace the old backward convolution with the slice.
-  CHECK(ShapeUtil::Compatible(
+  Shape slice_shape =
       ShapeInference::InferSliceShape(new_backward_conv->shape(), start_indices,
                                       limit_indices, strides)
-          .ConsumeValueOrDie(),
-      backward_conv->shape()));
+          .ConsumeValueOrDie();
+  CHECK(ShapeUtil::Compatible(slice_shape, backward_conv_shape))
+      << ShapeUtil::HumanString(slice_shape) << " vs "
+      << ShapeUtil::HumanString(backward_conv_shape);
 
-  auto slice =
-      HloInstruction::CreateSlice(backward_conv->shape(), new_backward_conv,
-                                  start_indices, limit_indices, strides);
+  HloInstruction* slice = computation->AddInstruction(
+      HloInstruction::CreateSlice(backward_conv_shape, new_backward_conv,
+                                  start_indices, limit_indices, strides));
+  HloInstruction* new_tuple = computation->AddInstruction(
+      HloInstruction::CreateTuple({slice, new_backward_conv_scratch}));
 
   VLOG(1) << "Canonicalizing backward input conv";
   VLOG(1) << "Replacing:\n  " << backward_conv->ToString() << "\nwith:\n  "
-          << slice->ToString();
+          << new_tuple->ToString();
 
-  TF_CHECK_OK(
-      computation->ReplaceWithNewInstruction(backward_conv, std::move(slice)));
+  TF_CHECK_OK(computation->ReplaceInstruction(backward_conv, new_tuple));
   return true;
 }
 
@@ -410,18 +392,17 @@ StatusOr<bool> PadInsertion::Run(HloModule* module) {
   bool changed = false;
   for (HloInstruction* instruction :
        module->entry_computation()->MakeInstructionPostOrder()) {
-    if (instruction->opcode() == HloOpcode::kConvolution) {
-      changed |= CanonicalizeForwardConvolution(instruction);
-    } else if (instruction->opcode() == HloOpcode::kFusion) {
-      switch (instruction->fusion_kind()) {
-        case HloInstruction::FusionKind::kConvBackwardFilter:
-          changed |= CanonicalizeBackwardFilterConvolution(instruction);
-          break;
-        case HloInstruction::FusionKind::kConvBackwardInput:
-          changed |= CanonicalizeBackwardInputConvolution(instruction);
-          break;
-        default:
-          break;
+    if (IsCustomCallToDnnConvolution(*instruction)) {
+      const auto& target = instruction->custom_call_target();
+      if (target == kCudnnConvForwardCallTarget) {
+        changed |= CanonicalizeForwardConvolution(instruction);
+      } else if (target == kCudnnConvBackwardFilterCallTarget) {
+        changed |= CanonicalizeBackwardFilterConvolution(instruction);
+      } else if (target == kCudnnConvBackwardInputCallTarget) {
+        changed |= CanonicalizeBackwardInputConvolution(instruction);
+      } else {
+        LOG(FATAL) << "Unknown custom call target for cudnn conv: "
+                   << instruction->ToString();
       }
     }
   }
diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
index 457e6094d90413440658452937bff2ccfe6cbe5c..388dcc008b07a76ff9ed07df04181e49a8734f51 100644
--- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
@@ -88,6 +88,23 @@ llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
           /*HasNUW=*/true, /*HasNSW=*/true),
       thread_id, "linear_index", /*HasNUW=*/true, /*HasNSW=*/true);
 
+  // Add an @llvm.assume(linear_index < threads_per_block * num_blocks).
+  //
+  // This might seem obvious from the computation above, but LLVM does not
+  // currently determine the range of linear_index precisely.  InstCombine uses
+  // known-bits, which, when applied to the task of determining a value's range,
+  // is imprecise for everything other than powers of 2.  And
+  // CorrelatedValuePropagation is, as a cost-saving measure, disabled for
+  // conditions in the same basic block as their operands.
+  llvm_ir::EmitCallToIntrinsic(
+      llvm::Intrinsic::assume,
+      {ir_builder_->CreateICmpULT(
+          linear_index,
+          ir_builder_->getInt64(launch_dimensions_.threads_per_block() *
+                                launch_dimensions_.block_count()),
+          "linear_index_in_range")},
+      {}, ir_builder_);
+
   auto if_in_bounds = llvm_ir::EmitIfThenElse(
       ir_builder_->CreateICmpULT(
           linear_index, ir_builder_->getInt64(ShapeUtil::ElementsIn(shape_))),
diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
index 934e7e1919f08a16daf09ec634e2f9dc0c7cc723..8ed63a854a74fc06c3c389f40fe1f5970885deac 100644
--- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
@@ -42,6 +42,11 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
                       const LaunchDimensions& launch_dimensions,
                       llvm::IRBuilder<>* ir_builder);
 
+  // Constructs a loop emitter for a loop that generates on element of each of N
+  // arrays on each iteration.
+  //
+  // This is used in multi-output fusion.  target_element_generator should
+  // produce a struct with N elements, one for each of target_arrays.
   ParallelLoopEmitter(
       const llvm_ir::ElementGenerator& target_element_generator,
       tensorflow::gtl::ArraySlice<llvm_ir::IrArray> target_arrays,
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
index d0d2deee24848184278e3e51dcaa3bb673b5fadc..6cf280df05496716a0780d61ded92efd9982734c 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
@@ -44,37 +44,41 @@ std::ostream& operator<<(std::ostream& out,
 
 // Calculates the launch dimensions used to invoke `hlo`.
 LaunchDimensions CalculateLaunchDimensions(
-    const Shape& shape, const se::DeviceDescription& device_desc,
-    PartitionStrategy partition_strategy) {
-  int64 warp_size = device_desc.threads_per_warp();
-
+    const Shape& shape, const se::DeviceDescription& device_desc) {
   int64 num_elements = ShapeUtil::ElementsIn(shape);
   if (num_elements <= 1) {
     return LaunchDimensions();
   }
 
-  // Calculate the number of threads per block.
-  // Initialize threads_per_block as the threads-per-block limit.
-  int64 threads_per_block = device_desc.threads_per_block_limit();
-  VLOG(2) << "Initial # of threads per block = " << threads_per_block;
-
-  if (partition_strategy == PartitionStrategy::kLatency) {
-    // Limit the thread count to allow maximum number of registers per thread.
-    // TODO(b/28560520): We don't have to assume the emitted kernel will use up
-    // all the registers. We could use ptxas to examine the actual number of
-    // register used, and set the thread count accordingly.
-    int64 threads_per_block_limit_due_to_registers =
-        device_desc.registers_per_core_limit() /
-        device_desc.registers_per_thread_limit();
-    CHECK_NE(0, threads_per_block_limit_due_to_registers);
-    if (threads_per_block_limit_due_to_registers < threads_per_block) {
-      threads_per_block =
-          // Make `threads_per_block` a multiple of warp size to use GPU
-          // efficiently.
-          warp_size *
-          std::max(1LL, threads_per_block_limit_due_to_registers / warp_size);
-      VLOG(2) << "Update # of threads per block due to register pressure = "
-              << threads_per_block;
+  // Since we don't do any inter-warp communication, we're free to choose any
+  // block size we want, subject to hardware constraints.  We choose the
+  // smallest block size that allows the GPU to reach full occupancy (assuming
+  // the kernel uses sufficiently few registers).  This gives us max performance
+  // when the kernel uses few registers, and lets us scale down gracefully as
+  // the kernel uses more registers.
+  //
+  // Specifically, we choose the number of threads per block such that
+  //
+  //   <num threads per block> * <max blocks per core> = <max threads per core>
+
+  auto threads_per_core = device_desc.threads_per_core_limit();
+  auto blocks_per_core = device_desc.blocks_per_core_limit();
+  int64 threads_per_block;
+  if (threads_per_core != 0 && blocks_per_core != 0) {
+    threads_per_block = device_desc.threads_per_core_limit() /
+                        device_desc.blocks_per_core_limit();
+  } else {
+    static std::atomic<int64> log_count{0};
+    if (log_count.fetch_add(1) < 8) {
+      LOG(WARNING) << "Attempting to calculate launch dimensions for GPU "
+                      "without full information about its capabilities.  "
+                      "StreamExecutor's PopulateDeviceDescription should be "
+                      "updated for this device.";
+    }
+    threads_per_block = device_desc.threads_per_warp();
+    if (threads_per_block == 0) {
+      // Fall back to *something* if we can't even get num threads per warp.
+      threads_per_block = 32;
     }
   }
 
@@ -84,8 +88,6 @@ LaunchDimensions CalculateLaunchDimensions(
             << threads_per_block << ") because the latter is smaller.";
   }
 
-  // Calculate the block count. We copy the strategy used by Eigen:
-  // eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
   int64 block_count = CeilOfRatio(num_elements, threads_per_block);
   VLOG(2) << tensorflow::strings::Printf(
       "Initialized the block count to ceil(# of elements / threads per "
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.h b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
index 8f7fce884acc93fd39510ad0826b819a6d9731a7..0bf463a6ef95d5a32784838c08ad239752fd1acf 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.h
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
@@ -30,14 +30,6 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-enum class PartitionStrategy {
-  // Optimized for latency by allowing maximum number of registers per thread.
-  kLatency,
-  // Optimized for throughput. This may limit registers per thread and cause
-  // longer latency.
-  kThroughput
-};
-
 // Encapsulates the launch dimensions of a kernel, e.g., the block count and the
 // number of threads per block.
 class LaunchDimensions {
@@ -66,8 +58,7 @@ std::ostream& operator<<(std::ostream& out,
 
 LaunchDimensions CalculateLaunchDimensions(
     const Shape& shape,
-    const perftools::gputools::DeviceDescription& device_desc,
-    PartitionStrategy partition_strategy = PartitionStrategy::kLatency);
+    const perftools::gputools::DeviceDescription& device_desc);
 
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h
index 0ff27888ad72f8190400c22a9086d1965448662c..2c3032d79be221e8cacb178ffb1817459b603cc0 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk.h
@@ -41,8 +41,13 @@ class GpuExecutable;
 class Thunk {
  public:
   enum class Kind {
+    kConditional,
     kConvolution,
     kCopy,
+    kCudnnBatchNormBackward,
+    kCudnnBatchNormForwardInference,
+    kCudnnBatchNormForwardTraining,
+    kFft,
     kGemm,
     kInfeed,
     kKernel,
@@ -70,6 +75,29 @@ class Thunk {
     return tensorflow::Status::OK();
   }
 
+  // Users of Thunk should call ShouldHaltAllActivityBeforeRunning(stream)
+  // before calling ExecuteOnStream(stream).  If it returns true, it's the
+  // user's responsibility to wait for all activity on the GPU to finish before
+  // calling ExecuteOnStream.
+  //
+  // This value is not required to be constant for a given Thunk.  For example,
+  // a Thunk that performs autotuning may return true for its first run and
+  // false thereafter.
+  virtual bool ShouldHaltAllActivityBeforeRunning(
+      perftools::gputools::Stream* /*stream*/) {
+    return false;
+  }
+
+  // Indicates whether thunks scheduled after this one should wait for this one
+  // to complete before running. For example, a convolution thunk creates a
+  // scratch allocator, then kicks off a convolution in cudnn via the stream
+  // executor. When the stream executor call returns, the scratch allocator goes
+  // out of scope, and the scratch memory is deallocated. In this case, the
+  // convolution thunk needs to return true so that future thunks wait for the
+  // convolution thunk to avoid reusing the deallocated memory until the
+  // convolution thunk is done with it.
+  virtual bool ShouldBlockFutureThunks() { return false; }
+
   // Execute the kernel for the thunk on the given stream. This method must be
   // called after Initialize and can be called multiple times over Thunk's
   // lifetime. Stream argument must be non-null.
diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc
index 0d2412096abf7838b7b0e7617811c789f507a4a1..c21559af6d2e5dfb5aaf62afcdcaed514e0914c9 100644
--- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc
@@ -34,16 +34,14 @@ WhileThunk::WhileThunk(
       body_thunk_sequence_(
           MakeUnique<SequentialThunk>(std::move(*body_thunk_sequence), hlo)) {}
 
-tensorflow::Status WhileThunk::Initialize(const GpuExecutable& executable) {
+Status WhileThunk::Initialize(const GpuExecutable& executable) {
   TF_RETURN_IF_ERROR(condition_thunk_sequence_->Initialize(executable));
   TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status WhileThunk::ExecuteOnStream(
-    const BufferAllocations& buffer_allocations,
-    perftools::gputools::Stream* stream) {
-
+Status WhileThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                                   perftools::gputools::Stream* stream) {
   perftools::gputools::DeviceMemoryBase condition_result_data =
       buffer_allocations.GetDeviceAddress(condition_result_buffer_index_);
 
@@ -55,9 +53,11 @@ tensorflow::Status WhileThunk::ExecuteOnStream(
     // Copy the result of condition computation and break the loop if 'false'.
     bool condition_result;
     stream->ThenMemcpy(&condition_result, condition_result_data, sizeof(bool));
-    if (!stream->BlockHostUntilDone()) {
+    Status block_status = stream->BlockHostUntilDone();
+    if (!block_status.ok()) {
       return InternalError(
-          "Failed to complete all kernels launched on stream %p", stream);
+          "Failed to complete all kernels launched on stream %p: %s", stream,
+          block_status.error_message().c_str());
     }
 
     if (!condition_result) {
@@ -68,7 +68,7 @@ tensorflow::Status WhileThunk::ExecuteOnStream(
     TF_RETURN_IF_ERROR(
         body_thunk_sequence_->ExecuteOnStream(buffer_allocations, stream));
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.h b/tensorflow/compiler/xla/service/gpu/while_thunk.h
index 95ed5497cea4fa3ba5dcdc6762cbd53cec88339a..4c9f45de9e42494df58706d0a4a3eb0c4220b8b8 100644
--- a/tensorflow/compiler/xla/service/gpu/while_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/while_thunk.h
@@ -45,10 +45,9 @@ class WhileThunk : public Thunk {
   WhileThunk(const WhileThunk&) = delete;
   WhileThunk& operator=(const WhileThunk&) = delete;
 
-  tensorflow::Status Initialize(const GpuExecutable& executable) override;
-  tensorflow::Status ExecuteOnStream(
-      const BufferAllocations& buffer_allocations,
-      perftools::gputools::Stream* stream) override;
+  Status Initialize(const GpuExecutable& executable) override;
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         perftools::gputools::Stream* stream) override;
 
  private:
   const BufferAllocation::Slice condition_result_buffer_index_;
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer.cc b/tensorflow/compiler/xla/service/gpu/while_transformer.cc
index ccdd1717593e4fa7c1d1deb3f0f9ebfab1bf7209..e6caec8625f0d622dbb92bcc20802d254fe23f94 100644
--- a/tensorflow/compiler/xla/service/gpu/while_transformer.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_transformer.cc
@@ -44,7 +44,7 @@ namespace {
 //
 //            Parameter
 //               |
-//   Const  GetTupleElemet
+//   Const  GetTupleElement
 //      \   /
 //       Add (root)
 //
@@ -62,7 +62,7 @@ namespace {
 //                                &tagged_instructions));
 //
 // Instructions that are "tagged" with a context-specific string will
-// be returned in 'tagged_instructions' for further procesing (i.e. parsing
+// be returned in 'tagged_instructions' for further processing (i.e. parsing
 // constants or recording the tuple_index).
 //
 class ExprTree {
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer.h b/tensorflow/compiler/xla/service/gpu/while_transformer.h
index a4f527fce0e4e280e24efc1f33ea68a0b71555b9..fe3a954e1828ee4a323872eea81f64c7e780ad24 100644
--- a/tensorflow/compiler/xla/service/gpu/while_transformer.h
+++ b/tensorflow/compiler/xla/service/gpu/while_transformer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_WHILE_TRANSFORMER_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_WHILE_TRANSFORMER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_WHILE_TRANSFORMER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_WHILE_TRANSFORMER_H_
 
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -40,4 +40,4 @@ StatusOr<std::tuple<int64, int64, int64>> CanTransformWhileToFor(
 }  // namespace gpu
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_GPU_WHILE_TRANSFORMER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_WHILE_TRANSFORMER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
index f16daa0b5481474e754c880ead1945297ca50168..2f290f61bd527e9827472a78256f015e066e44be 100644
--- a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
@@ -117,9 +117,7 @@ class WhileTransformerTest : public HloTestBase {
   }
 
   void RunCopyInsertionPass() {
-    HloVerifier verifier([](const Shape& shape) {
-      return ShapeUtil::ByteSizeOf(shape, /*pointer_size=*/sizeof(void*));
-    });
+    HloVerifier verifier;
     TF_ASSERT_OK(verifier.Run(module_.get()).status());
     CopyInsertion copy_insertion;
     TF_ASSERT_OK(copy_insertion.Run(module_.get()).status());
diff --git a/tensorflow/compiler/xla/service/graphviz_example.cc b/tensorflow/compiler/xla/service/graphviz_example.cc
index 049e8d80d80c835bca4a4d38592564ba82a3ecf9..05017008e2ddbe0b9e78d06275fdec5d08d94bfa 100644
--- a/tensorflow/compiler/xla/service/graphviz_example.cc
+++ b/tensorflow/compiler/xla/service/graphviz_example.cc
@@ -108,8 +108,11 @@ std::unique_ptr<HloModule> MakeBigGraph() {
       HloInstruction::CreateUnary(vshape, HloOpcode::kCopy, param_v0));
   auto clamp = builder.AddInstruction(HloInstruction::CreateTernary(
       vshape, HloOpcode::kClamp, copy, param_v1, param_v2));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto dot = builder.AddInstruction(
-      HloInstruction::CreateBinary(vshape, HloOpcode::kDot, clamp, param_v0));
+      HloInstruction::CreateDot(vshape, clamp, param_v0, dot_dnums));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({dot, param_s, clamp}));
   auto scalar = builder.AddInstruction(
diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc
index 34e2f7ee206c6a74073d8f4e867e862feb4aff49..cde5877e29f36abc61c5417ce960e2c7699e2749 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator.cc
@@ -64,10 +64,8 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
     std::unique_ptr<HeapAlgorithm> algorithm, const HloModule& module,
     const SequentialHloOrdering::HloModuleSequence& module_sequence,
     const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_fn,
-    const FlatSet<const LogicalBuffer*>* buffers_to_assign) {
-  HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign,
-                     &module_sequence);
+    const LogicalBuffer::SizeFunction& size_fn, const Options& options) {
+  HeapSimulator heap(std::move(algorithm), size_fn, options, &module_sequence);
   const HloComputation* entry_computation = module.entry_computation();
   const std::vector<const HloInstruction*>& instruction_sequence =
       FindOrDie(module_sequence, entry_computation);
@@ -81,9 +79,8 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
     std::unique_ptr<HeapAlgorithm> algorithm, const HloComputation& computation,
     const std::vector<const HloInstruction*>& instruction_sequence,
     const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_fn,
-    const FlatSet<const LogicalBuffer*>* buffers_to_assign) {
-  HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign,
+    const LogicalBuffer::SizeFunction& size_fn, const Options& options) {
+  HeapSimulator heap(std::move(algorithm), size_fn, options,
                      /*module_sequence=*/nullptr);
   TF_RETURN_IF_ERROR(heap.RunComputation(computation, instruction_sequence,
                                          points_to_analysis));
@@ -199,15 +196,17 @@ Status HeapSimulator::RunComputation(
       // We can only share with the operand buffer if it is about to be freed;
       // we must be the last user of the buffer.
       bool shared = false;
-      for (const LogicalBuffer* operand_buffer : operand_buffers_to_free) {
-        if (buffer->instruction()->IsUserOf(operand_buffer->instruction()) &&
-            buffer->instruction()->opcode() != HloOpcode::kCopy &&
-            CanShareOperandBufferWithUser(
-                operand_buffer->instruction(), operand_buffer->index(),
-                buffer->instruction(), buffer->index(), points_to_analysis)) {
-          ShareBuffer(buffer, operand_buffer, instruction);
-          shared = true;
-          break;
+      if (options_.may_reuse_operand_buffers) {
+        for (const LogicalBuffer* operand_buffer : operand_buffers_to_free) {
+          if (buffer->instruction()->IsUserOf(operand_buffer->instruction()) &&
+              buffer->instruction()->opcode() != HloOpcode::kCopy &&
+              CanShareOperandBufferWithUser(
+                  operand_buffer->instruction(), operand_buffer->index(),
+                  buffer->instruction(), buffer->index(), points_to_analysis)) {
+            ShareBuffer(buffer, operand_buffer, instruction);
+            shared = true;
+            break;
+          }
         }
       }
 
@@ -266,13 +265,12 @@ Status HeapSimulator::RunComputation(
 
 HeapSimulator::HeapSimulator(
     std::unique_ptr<HeapAlgorithm> algorithm,
-    const LogicalBuffer::SizeFunction& size_fn,
-    const FlatSet<const LogicalBuffer*>* buffers_to_assign,
+    const LogicalBuffer::SizeFunction& size_fn, const Options& options,
     const SequentialHloOrdering::HloModuleSequence* module_sequence)
     : no_fragmentation_stats_(MakeUnique<NoFragmentationStatsHeap>()),
       algorithm_(std::move(algorithm)),
       size_fn_(size_fn),
-      buffers_to_assign_(buffers_to_assign),
+      options_(options),
       module_sequence_(module_sequence) {
   debug_trace_.set_whole_module_simulation(module_sequence_ != nullptr);
 }
@@ -280,13 +278,16 @@ HeapSimulator::HeapSimulator(
 HeapSimulator::~HeapSimulator() {}
 
 bool HeapSimulator::IgnoreBuffer(const LogicalBuffer* buffer) const {
-  // Buffers for constants are ignored, as with BufferAssigner.  Also ignore
-  // buffers that we're not meant to assign.
+  // Buffers for constants are ignored unless the alloc_constants option is
+  // set. Also ignore buffers that we're not meant to assign.
   //
   // TODO(b/32248867): For consistency, constants should get allocations.
-  return buffer->instruction()->opcode() == HloOpcode::kConstant ||
-         (buffers_to_assign_ != nullptr &&
-          buffers_to_assign_->count(buffer) == 0);
+  if (!options_.alloc_constants &&
+      buffer->instruction()->opcode() == HloOpcode::kConstant) {
+    return true;
+  }
+  return options_.buffers_to_assign != nullptr &&
+         options_.buffers_to_assign->count(buffer) == 0;
 }
 
 // Alloc always calls the underlying heap algorithm.
@@ -400,8 +401,8 @@ HeapSimulator::Result HeapSimulator::Finish() {
     }
     // If we were told to assign specific buffers, make sure we've assigned
     // exactly that many buffers.
-    if (buffers_to_assign_ != nullptr) {
-      CHECK_EQ(buffers_to_assign_->size(), result.chunk_map.size());
+    if (options_.buffers_to_assign != nullptr) {
+      CHECK_EQ(options_.buffers_to_assign->size(), result.chunk_map.size());
     }
   }
 
diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h
index a03ad2f37cf5ede35275ea019ab3d5998fb85d0a..636f19dd39f09721bd82fc4b44785f196f281ad7 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.h
+++ b/tensorflow/compiler/xla/service/heap_simulator.h
@@ -67,6 +67,23 @@ class HeapSimulator {
     HeapSimulatorTrace debug_trace;
   };
 
+  // The different options to be passed to the Run() APIs.
+  struct Options {
+    Options()
+        : may_reuse_operand_buffers(true),
+          alloc_constants(false),
+          buffers_to_assign(nullptr) {}
+
+    // Whether a buffer about to be Free()-ed, can be recycled for a new born
+    // one, hence collapsing Free()+Alloc() calls (default true).
+    bool may_reuse_operand_buffers;
+    // Whether to issue Alloc() and Free() calls for constants (default false).
+    bool alloc_constants;
+    // If 'buffers_to_assign' is provided, only those buffers are assigned
+    // offsets, otherwise all buffers defined by the instructions are assigned.
+    const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign;
+  };
+
   // Run the heap simulation with the given algorithm, assuming the given
   // module_sequence, which must contain a topologically-consistent total
   // ordering of all instructions within each computation. The result is invalid
@@ -76,15 +93,12 @@ class HeapSimulator {
   // to running on a per-computation basis, since we can re-use buffer space for
   // called sub-computations.
   //
-  // If 'buffers_to_assign' is provided, only those buffers are assigned
-  // offsets, otherwise all buffers defined by the instructions are assigned.
   static StatusOr<Result> Run(
       std::unique_ptr<HeapAlgorithm> algorithm, const HloModule& module,
       const SequentialHloOrdering::HloModuleSequence& module_sequence,
       const TuplePointsToAnalysis& points_to_analysis,
       const LogicalBuffer::SizeFunction& size_fn,
-      const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign =
-          nullptr);
+      const Options& options = Options());
 
   // Same as above, but runs on a single computation. The 'instruction_sequence'
   // must contain a topologically-consistent total ordering of all instructions
@@ -96,8 +110,7 @@ class HeapSimulator {
       const std::vector<const HloInstruction*>& instruction_sequence,
       const TuplePointsToAnalysis& points_to_analysis,
       const LogicalBuffer::SizeFunction& size_fn,
-      const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign =
-          nullptr);
+      const Options& options = Options());
 
  private:
   // If 'module_sequence' is non-null, it is used to find kCall and kWhile
@@ -105,8 +118,7 @@ class HeapSimulator {
   // be run recursively. I.e. the simulation is run over the whole module.
   HeapSimulator(
       std::unique_ptr<HeapAlgorithm> algorithm,
-      const LogicalBuffer::SizeFunction& size_fn,
-      const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign,
+      const LogicalBuffer::SizeFunction& size_fn, const Options& options,
       const SequentialHloOrdering::HloModuleSequence* module_sequence);
   ~HeapSimulator();
 
@@ -130,7 +142,7 @@ class HeapSimulator {
   const std::unique_ptr<HeapAlgorithm> no_fragmentation_stats_;
   const std::unique_ptr<HeapAlgorithm> algorithm_;
   const LogicalBuffer::SizeFunction size_fn_;
-  const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign_;
+  const Options options_;
   const SequentialHloOrdering::HloModuleSequence* module_sequence_;
 
   // In addition to Alloc and Free, the heap simulator exposes a concept of
@@ -264,7 +276,7 @@ class LazyBestFitHeap : public HeapAlgorithm {
   enum { kLazyAllocOffset = -1 };
 
   struct OrderChunkByIncreasingSize {
-    bool operator()(const Chunk& a, const Chunk& b) {
+    bool operator()(const Chunk& a, const Chunk& b) const {
       if (a.size != b.size) return a.size < b.size;
       return a.offset < b.offset;
     }
diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc
index 17b926c8748e45b55f380e7595711b9e7a748f64..387b649a731ebcbfd8307807469f39f22d192b06 100644
--- a/tensorflow/compiler/xla/service/heap_simulator_test.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc
@@ -259,8 +259,11 @@ TEST_F(HeapSimulatorTest, MultiplyDot) {
       HloInstruction::CreateParameter(2, f32scalar_, "paramY"));
   auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec4_, HloOpcode::kMultiply, paramA, paramX));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto dot = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, mul, paramY));
+      HloInstruction::CreateDot(f32vec4_, mul, paramY, dot_dnums));
 
   // The buffer for dot is the output, and it cannot be shared with the buffer
   // for mul, since dot isn't elementwise.
@@ -292,8 +295,11 @@ TEST_F(HeapSimulatorTest, MultiplyDotAdd) {
       HloInstruction::CreateParameter(2, f32scalar_, "paramY"));
   auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec4_, HloOpcode::kMultiply, paramA, paramX));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto dot = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, mul, paramY));
+      HloInstruction::CreateDot(f32vec4_, mul, paramY, dot_dnums));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(f32vec4_, HloOpcode::kAdd, dot, paramA));
 
@@ -327,10 +333,13 @@ TEST_F(HeapSimulatorTest, MultiplyDotDot) {
       HloInstruction::CreateParameter(2, f32scalar_, "paramY"));
   auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec4_, HloOpcode::kMultiply, paramA, paramX));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto dot0 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, mul, paramY));
+      HloInstruction::CreateDot(f32vec4_, mul, paramY, dot_dnums));
   auto dot1 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, dot0, paramY));
+      HloInstruction::CreateDot(f32vec4_, dot0, paramY, dot_dnums));
 
   // The buffer for dot1 is the output.  No buffers can be shared.  The buffer
   // for mul is freed before the end, since it's no longer used after dot0
@@ -365,10 +374,13 @@ TEST_F(HeapSimulatorTest, MultiplyDotDotTuple) {
       HloInstruction::CreateParameter(2, f32scalar_, "paramY"));
   auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec4_, HloOpcode::kMultiply, paramA, paramX));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto dot0 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, mul, paramY));
+      HloInstruction::CreateDot(f32vec4_, mul, paramY, dot_dnums));
   auto dot1 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, dot0, paramY));
+      HloInstruction::CreateDot(f32vec4_, dot0, paramY, dot_dnums));
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({dot0, dot1}));
 
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index e984bdb5f75f714fb7b4453a97178158d9b8a8b8..36db711c6c3570efdf678261ad38bbdb08cf94aa 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -36,6 +36,9 @@ option cc_enable_arenas = true;
 
 // Serialization of HloInstruction.
 message HloInstructionProto {
+  reserved 10;
+  reserved "parameter_name";
+
   string name = 1;
   string opcode = 2;
   xla.Shape shape = 3;
@@ -50,9 +53,8 @@ message HloInstructionProto {
   // Literal, only present for kConstant.
   xla.LiteralProto literal = 8;
 
-  // Parameter info, only present for kParameter.
+  // Parameter number is only present for kParameter.
   int64 parameter_number = 9;
-  string parameter_name = 10;
 
   // Fusion state, only present for kFusion.
   string fusion_kind = 11;
@@ -118,6 +120,15 @@ message HloInstructionProto {
 
   // Shape of outfeed request.
   xla.Shape outfeed_shape = 29;
+
+  // Describes the dimension numbers used for a dot operation
+  xla.DotDimensionNumbers dot_dimension_numbers = 30;
+
+  // FFT type (FFT, IFFT, etc).
+  xla.FftType fft_type = 31;
+
+  // FFT length.
+  repeated int64 fft_length = 32;
 }
 
 // Serialization of HloComputation.
@@ -189,6 +200,7 @@ message BufferAllocationProto {
   bool is_reusable = 4;
   bool is_entry_computation_parameter = 5;
   int64 parameter_number = 6;
+  repeated int64 parameter_shape_index = 10;
   bool maybe_live_out = 7;
   int64 color = 8;
   repeated Assigned assigned = 9;
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index c215cc48d60b93a88d64b7c4aecb2aa3bb460443..5432419e4a2dd2916da32ac6566851bf52fd68ca 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -131,9 +131,9 @@ Status HloComputation::RemoveParameter(int64 param_no) {
 
   while (param_no < param_instructions_.size()) {
     param_instruction = param_instructions_[param_no];
-    string param_name = param_instruction->parameter_name();
+    string param_name = param_instruction->name();
     // Fusion parameters are named foo.param_1, bar.param_2, etc. We are
-    // renumbering the parameters so replace the final number in the name with
+    // renumbering the parameters, so replace the final number in the name with
     // the updated value.
     const string param_underscore = ".param_";
     size_t index = param_name.rfind(param_underscore);
@@ -176,10 +176,6 @@ bool HloComputation::IsRemovable(const HloInstruction* instruction) {
     return false;
   }
 
-  if (instruction->HasSideEffect()) {
-    return false;
-  }
-
   return true;
 }
 
@@ -207,7 +203,8 @@ Status HloComputation::RemoveInstructionAndUnusedOperands(
     worklist.pop();
 
     if (removed.count(item) != 0 || item->user_count() != 0 ||
-        item == root_instruction() || !IsRemovable(item)) {
+        item == root_instruction() || !IsRemovable(item) ||
+        item->HasSideEffect()) {
       continue;
     }
     for (int i = 0; i < item->operand_count(); ++i) {
@@ -367,26 +364,27 @@ std::list<HloComputation*> HloComputation::MakeEmbeddedComputationsList()
   return post_order;
 }
 
-string HloComputation::ToString(int nested_level,
-                                bool include_large_constants) const {
+string HloComputation::ToString(const HloPrintOptions& options) const {
   std::ostringstream s;
-  for (int i = 0; i < nested_level; i++) {
+  for (int i = 0; i < options.indent_amount(); i++) {
     s << "    ";
   }
-  s << "%" << name() << " " << ShapeUtil::HumanString(ComputeProgramShape())
-    << " {\n";
+  if (options.print_percent()) {
+    s << "%";
+  }
+  s << name();
+  if (options.print_program_shape()) {
+    s << " " << ShapeUtil::HumanString(ComputeProgramShape());
+  }
+  s << " {\n";
   for (const HloInstruction* instruction : MakeInstructionPostOrder()) {
-    for (int i = 0; i < nested_level; i++) {
+    for (int i = 0; i < options.indent_amount(); i++) {
       s << "    ";
     }
     s << "  " << (instruction == root_instruction_ ? "ROOT " : "")
-      << instruction->ToString(
-             /*compact_operands=*/false,
-             /*include_metadata=*/true,
-             /*include_large_constants=*/include_large_constants)
-      << "\n";
+      << instruction->ToString(options) << "\n";
   }
-  for (int i = 0; i < nested_level; i++) {
+  for (int i = 0; i < options.indent_amount(); i++) {
     s << "    ";
   }
   s << "}";
@@ -463,20 +461,6 @@ HloInstruction* HloComputation::CreateFusionInstruction(
   return fusion_instruction;
 }
 
-HloInstruction* HloComputation::CreateFusionInstructionForBackwardConvolution(
-    tensorflow::gtl::ArraySlice<HloInstruction*> instructions_to_fuse,
-    HloInstruction::FusionKind fusion_kind, const Window& window,
-    const ConvolutionDimensionNumbers& conv_dnums) {
-  CHECK(HloInstruction::FusionKind::kConvBackwardFilter == fusion_kind ||
-        HloInstruction::FusionKind::kConvBackwardInput == fusion_kind);
-  HloInstruction* root = instructions_to_fuse.front();
-  HloInstruction* fusion_instruction =
-      AddInstruction(HloInstruction::CreateFusionForBackwardConvolution(
-          root->shape(), fusion_kind, window, conv_dnums, root));
-  FuseInstructionsInto(instructions_to_fuse, fusion_instruction);
-  return fusion_instruction;
-}
-
 StatusOr<HloInstruction*> HloComputation::DeepCopyHelper(
     HloInstruction* instruction, const ShapeTree<bool>* indices_to_copy,
     ShapeTree<HloInstruction*>* copies_added, ShapeIndex* index) {
@@ -543,7 +527,7 @@ ProgramShape HloComputation::ComputeProgramShape() const {
 
   for (auto* param_instruction : param_instructions_) {
     *program_shape.add_parameters() = param_instruction->shape();
-    *program_shape.add_parameter_names() = param_instruction->parameter_name();
+    *program_shape.add_parameter_names() = param_instruction->name();
   }
   *program_shape.mutable_result() = root_instruction_->shape();
 
@@ -579,8 +563,11 @@ Status HloComputation::ReplaceWithNewInstruction(
 
 Status HloComputation::ReplaceInstruction(HloInstruction* old_instruction,
                                           HloInstruction* new_instruction) {
-  TF_RET_CHECK(ShapeUtil::Compatible(old_instruction->shape(),
-                                     new_instruction->shape()));
+  TF_RET_CHECK(
+      ShapeUtil::Compatible(old_instruction->shape(), new_instruction->shape()))
+      << ShapeUtil::HumanString(old_instruction->shape()) << " vs "
+      << ShapeUtil::HumanString(new_instruction->shape());
+
   VLOG(10) << "transformed " << old_instruction->ToString() << " to "
            << new_instruction->ToString();
   // Try to add metadata for HLO instructions that are created to replace
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index 353b30bc69d98556311635d6097e3d6ad5fb2aaa..061c59abe5e315917161ed737f89de53d71bb1b6 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -138,8 +138,11 @@ class HloComputation {
   void UniquifyName(NameUniquer* name_uniquer);
 
   // Return a string representation of the computation.
-  string ToString(int nested_level = 0,
-                  bool include_large_constants = false) const;
+  //
+  // (We express the default options using an overload rather than a default
+  // param because gdb ignores default params, but does resolve overloads.)
+  string ToString() const { return ToString(HloPrintOptions()); }
+  string ToString(const HloPrintOptions& options) const;
 
   // Returns a serialized representation of this computation.
   HloComputationProto ToProto() const;
@@ -221,15 +224,6 @@ class HloComputation {
       tensorflow::gtl::ArraySlice<HloInstruction*> instructions_to_fuse,
       HloInstruction::FusionKind fusion_kind);
 
-  // Creates a fusion instruction that represents a backward convolution. This
-  // is similar to CreateFusionInstruction but takes window and conv_dnums which
-  // indicate the window and convolution dimension numbers of the backward
-  // convolution.
-  HloInstruction* CreateFusionInstructionForBackwardConvolution(
-      tensorflow::gtl::ArraySlice<HloInstruction*> instructions_to_fuse,
-      HloInstruction::FusionKind fusion_kind, const Window& window,
-      const ConvolutionDimensionNumbers& conv_dnums);
-
   // Create a deep copy of the given instruction and return the instruction
   // producing the copied result. All instructions performing the copy are added
   // to the computation. For array-shaped values, this method trivially returns
@@ -313,11 +307,17 @@ class HloComputation {
           replacements,
       HloModule* module = nullptr, const string& suffix = "clone");
 
-  // Returns true if the given instruction can be removed from the
-  // computation. Instructions such as parameters and send/receive instructions
-  // cannot be removed without violating invariants of the HLO computation or
-  // module with the exception of fusion computation.  A parameter instruction
-  // is removable for a fusion computation.
+  // Returns true if the given instruction can be removed from the computation.
+  // Parameter instructions cannot be removed without violating invariants of
+  // the HLO computation with the exception of fusion computation. A parameter
+  // instruction is removable for a fusion computation.
+  //
+  // Note that IsRemovable() is a necessariy condition to remove an instruction
+  // rather than a sufficient condition. For example, instructions with
+  // side-effect (e.g., Send, Infeed) may be removed from a computation, but the
+  // transformation must guarantee the invariants relevant to the instructions
+  // still hold (e.g., Send and Recv must be removed together to make each
+  // channel complete).
   bool IsRemovable(const HloInstruction* instruction);
 
   // Returns true if this computation has a side effect. A computation has a
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index 6fcc01dd64d1ac041e99eedb8b1de476409b257d..9cd5a1e2b71a7aa768e478289e8e4cc13030fcc3 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -201,10 +201,11 @@ Status HloCostAnalysis::HandleCopy(const HloInstruction*) {
 Status HloCostAnalysis::HandleDot(const HloInstruction* dot) {
   const Shape& lhs_shape = dot->operand(0)->shape();
   const Shape& rhs_shape = dot->operand(1)->shape();
+  const DotDimensionNumbers& dnums = dot->dot_dimension_numbers();
   // Count of elements along the reduction dimension (last dimension for the
   // rhs).
-  int64 reduction_width = lhs_shape.dimensions(ShapeUtil::Rank(lhs_shape) - 1);
-
+  int64 reduction_width =
+      lhs_shape.dimensions(dnums.lhs_contracting_dimensions(0));
   // First divide by reduction width before multiplying by rhs elements to avoid
   // overflow.
   int64 fma_count;
@@ -391,13 +392,35 @@ Status HloCostAnalysis::HandleConvolution(const HloInstruction* convolution) {
   return Status::OK();
 }
 
+Status HloCostAnalysis::HandleFft(const HloInstruction* fft) {
+  auto real_shape =
+      ShapeUtil::IsTuple(fft->operand(0)->shape())
+          ? ShapeUtil::GetTupleElementShape(fft->operand(0)->shape(), 0)
+          : fft->operand(0)->shape();
+  constexpr int kFmaPerComplexMul = 4;
+  int64 log_factors = 1;
+  for (int64 dim : fft->fft_length()) {
+    log_factors *= tensorflow::Log2Floor(dim);
+  }
+  current_properties_[kFlopsKey] = kFmaFlops * kFmaPerComplexMul * log_factors *
+                                   ShapeUtil::ElementsIn(real_shape);
+  return Status::OK();
+}
+
 Status HloCostAnalysis::HandleCrossReplicaSum(const HloInstruction* crs) {
   // We assume 2 replicas, so that each output element is the sum of two input
   // elements.
   //
   // TODO(b/33004697): Compute correct cost here, taking the actual number of
   // replicas into account.
-  current_properties_[kFlopsKey] = ShapeUtil::ElementsIn(crs->shape());
+  double flops = 0.0;
+  ShapeUtil::ForEachSubshape(
+      crs->shape(), [&, this](const Shape& subshape, const ShapeIndex&) {
+        if (ShapeUtil::IsArray(subshape)) {
+          flops += ShapeUtil::ElementsIn(subshape);
+        }
+      });
+  current_properties_[kFlopsKey] = flops;
   return Status::OK();
 }
 
@@ -446,7 +469,13 @@ Status HloCostAnalysis::HandleCall(const HloInstruction* call) {
 }
 
 Status HloCostAnalysis::HandleCustomCall(const HloInstruction*) {
-  return Unimplemented("Custom-call is not implemented for HLO cost analysis.");
+  // We can't do anything sane with CustomCalls, since we don't know what they
+  // do, and returning an error status will stop iteration over this
+  // computation, which is probably also not what we want.  So just punt and
+  // return OK.  This will cause all of the properties to be reported as 0,
+  // which is fine.
+  current_should_compute_bottleneck_time_ = false;
+  return Status::OK();
 }
 
 Status HloCostAnalysis::HandleSort(const HloInstruction* sort) {
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index fade19522cf0c30eab037aa355de1f9203f80014..e5783539e5436f09fa58bf7889118380ee90fea0 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -67,6 +67,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
   Status HandleCopy(const HloInstruction* copy) override;
   Status HandleDot(const HloInstruction* dot) override;
   Status HandleConvolution(const HloInstruction* convolution) override;
+  Status HandleFft(const HloInstruction* fft) override;
   Status HandleCrossReplicaSum(const HloInstruction* crs) override;
   Status HandleInfeed(const HloInstruction* infeed) override;
   Status HandleOutfeed(const HloInstruction* outfeed) override;
diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc
index d35ba19a730555433099072c51ca5cf3774d4b99..279edd4ba8772a9c576f76f554de8ec68631b953 100644
--- a/tensorflow/compiler/xla/service/hlo_cse.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
 
 namespace xla {
 
@@ -91,6 +92,10 @@ bool CombineConstants(HloComputation* computation, bool is_layout_sensitive) {
 
 StatusOr<bool> HloCSE::Run(HloModule* module) {
   bool changed = false;
+  const std::function<bool(const HloInstruction*, const HloInstruction*)>
+      eq_instructions = std::equal_to<const HloInstruction*>();
+  const std::function<bool(const HloComputation*, const HloComputation*)>
+      eq_computations = std::equal_to<const HloComputation*>();
   for (auto* computation : module->computations()) {
     changed |= CombineConstants(computation, is_layout_sensitive_);
 
@@ -110,11 +115,12 @@ StatusOr<bool> HloCSE::Run(HloModule* module) {
       // of this instruction.
       const HloInstruction* operand = instruction->operand(0);
 
-      std::vector<HloInstruction*> equivalent_instructions;
+      tensorflow::gtl::InlinedVector<HloInstruction*, 8>
+          equivalent_instructions;
       for (HloInstruction* user : operand->users()) {
-        if (user != instruction && user->Identical(*instruction) &&
-            (!is_layout_sensitive_ ||
-             ShapeUtil::Equal(user->shape(), instruction->shape()))) {
+        if (user != instruction &&
+            user->Identical(*instruction, eq_instructions, eq_computations,
+                            is_layout_sensitive_)) {
           equivalent_instructions.push_back(user);
         }
       }
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
index 3f34b9ceb34abc89fca5b896bb8fbe3a06cd6ed4..ccbbe8f1966d59b4ab2904dcc6ea724aaf4a7603 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
@@ -154,7 +154,11 @@ bool HloDataflowAnalysis::Phi(
     tensorflow::gtl::ArraySlice<const InstructionValueSet*> inputs) {
   CHECK(ssa_form_);
   VLOG(4) << "Phi(" << instruction->name() << ")";
-
+  VLOG(5) << "instruction value set = "
+          << GetInstructionValueSet(instruction).ToString();
+  for (const InstructionValueSet* input : inputs) {
+    VLOG(5) << "input value set = " << input->ToString();
+  }
   for (const InstructionValueSet* input : inputs) {
     DCHECK(ShapeUtil::Compatible(instruction->shape(), input->shape()));
   }
@@ -171,9 +175,14 @@ bool HloDataflowAnalysis::Phi(
         value_set.values().size() == 1 ? value_set.values()[0] : nullptr;
 
     // Construct a vector of unique value IDs of the inputs.
+    // Don't add value ids where the input is equal to the definition.
     std::vector<HloValue::Id> input_value_ids;
     for (const InstructionValueSet* input : inputs) {
       for (const HloValue* value : input->element(index).values()) {
+        if (value->defining_instruction() == instruction &&
+            value->defining_index() == index) {
+          continue;
+        }
         input_value_ids.push_back(value->id());
       }
     }
@@ -190,6 +199,7 @@ bool HloDataflowAnalysis::Phi(
          current_value->defining_instruction() == instruction &&
          current_value->defining_index() == index);
     if (current_value_defined_here) {
+      VLOG(5) << "current_value_defined_here: " << current_value->ToString();
       CHECK(current_value->is_phi());
       auto it = std::find(input_value_ids.begin(), input_value_ids.end(),
                           current_value->id());
@@ -197,7 +207,7 @@ bool HloDataflowAnalysis::Phi(
         input_value_ids.erase(it);
       }
     }
-
+    VLOG(5) << "after input_value_ids.size = " << input_value_ids.size();
     if (input_value_ids.empty()) {
       // A value set which has at least one element should never have its value
       // set reduced to zero elements. During dataflow value sets only can go
@@ -276,6 +286,23 @@ bool HloDataflowAnalysis::UpdateBitcastValueSet(HloInstruction* bitcast) {
   return false;
 }
 
+bool HloDataflowAnalysis::UpdateSliceValueSet(HloInstruction* slice) {
+  CHECK_EQ(slice->opcode(), HloOpcode::kSlice);
+  if (!slice->IsInPlaceSlice()) {
+    return false;
+  }
+  // If this slice is lowered to an in-place version, then it forwards the
+  // operand value to the output.
+  const InstructionValueSet& operand_set =
+      GetInstructionValueSet(slice->operand(0));
+  InstructionValueSet& slice_set = GetInstructionValueSet(slice);
+  if (operand_set != slice_set) {
+    slice_set = operand_set;
+    return true;
+  }
+  return false;
+}
+
 bool HloDataflowAnalysis::UpdateSendValueSet(HloInstruction* send) {
   CHECK_EQ(send->opcode(), HloOpcode::kSend);
   bool changed = false;
@@ -333,6 +360,21 @@ bool HloDataflowAnalysis::UpdateCallValueSet(HloInstruction* call) {
   return false;
 }
 
+bool HloDataflowAnalysis::UpdateConditionalValueSet(
+    HloInstruction* conditional) {
+  CHECK_EQ(conditional->opcode(), HloOpcode::kConditional);
+  std::vector<const InstructionValueSet*> inputs = {
+      &GetInstructionValueSet(
+          conditional->true_computation()->root_instruction()),
+      &GetInstructionValueSet(
+          conditional->false_computation()->root_instruction())};
+  // A phi-node is not defined for a kConditional instruction even though it
+  // represents a join point. This is because the current approach is to define
+  // a phi-node only for kWhile to account for the dataflow through back-edges
+  // and deal with the ambiguity in other cases.
+  return GetInstructionValueSet(conditional).AssignUnionOf(inputs);
+}
+
 bool HloDataflowAnalysis::UpdateCopyValueSet(HloInstruction* copy) {
   CHECK_EQ(copy->opcode(), HloOpcode::kCopy);
   bool changed = false;
@@ -394,7 +436,7 @@ bool HloDataflowAnalysis::UpdateParameterValueSet(HloInstruction* parameter) {
   CHECK_EQ(call_graph_node.context(), CallContext::kSequential);
 
   std::vector<const InstructionValueSet*> inputs;
-  bool called_from_while = false;
+  bool need_phi = false;
   for (const CallSite& callsite : call_graph_node.caller_callsites()) {
     if (callsite.instruction()->opcode() == HloOpcode::kCall) {
       // The operand values of a call instruction are forwarded to the
@@ -416,14 +458,32 @@ bool HloDataflowAnalysis::UpdateParameterValueSet(HloInstruction* parameter) {
         inputs.push_back(&GetInstructionValueSet(
             callsite.instruction()->while_body()->root_instruction()));
       }
-      called_from_while = true;
+      need_phi = true;
+    } else if (callsite.instruction()->opcode() == HloOpcode::kConditional) {
+      CHECK_EQ(parameter->parameter_number(), 0);
+      auto conditional = callsite.instruction();
+      // Conditional has 3 operands. Operand 0 is the predicate, operand 1 is
+      // the argument to the true computation and operand 2 is the argument to
+      // the false computation.
+      //
+      // If the parameter belongs to conditional's true computation, then
+      // operand 1 is forwarded to this parameter instruction. If the parameter
+      // belongs to conditional's false computation, then operand 2 is forwarded
+      // to this parameter instruction.
+      if (parameter->parent() == conditional->true_computation()) {
+        inputs.push_back(&GetInstructionValueSet(conditional->operand(1)));
+      } else {
+        CHECK_EQ(parameter->parent(), conditional->false_computation());
+        inputs.push_back(&GetInstructionValueSet(conditional->operand(2)));
+      }
+      need_phi = true;
     } else {
       LOG(FATAL) << "CallContext::kSequential computations should only be "
-                    "called from call or while instructions";
+                    "called from call, while, or conditional instructions";
     }
   }
 
-  if (ssa_form_ && called_from_while) {
+  if (ssa_form_ && need_phi) {
     return Phi(parameter, inputs);
   } else {
     return GetInstructionValueSet(parameter).AssignUnionOf(inputs);
@@ -494,6 +554,8 @@ bool HloDataflowAnalysis::UpdateInstructionValueSet(
   switch (instruction->opcode()) {
     case HloOpcode::kBitcast:
       return UpdateBitcastValueSet(instruction);
+    case HloOpcode::kSlice:
+      return UpdateSliceValueSet(instruction);
     case HloOpcode::kCopy:
       return UpdateCopyValueSet(instruction);
     case HloOpcode::kGetTupleElement:
@@ -512,6 +574,8 @@ bool HloDataflowAnalysis::UpdateInstructionValueSet(
       return UpdateSendValueSet(instruction);
     case HloOpcode::kRecvDone:
       return UpdateRecvDoneValueSet(instruction);
+    case HloOpcode::kConditional:
+      return UpdateConditionalValueSet(instruction);
     default:
       // Instruction does not forward HloValues (it defines all values in its
       // output). No update is necessary.
@@ -521,16 +585,23 @@ bool HloDataflowAnalysis::UpdateInstructionValueSet(
 
 void HloDataflowAnalysis::Propagate() {
   std::queue<HloInstruction*> worklist;
+  tensorflow::gtl::FlatSet<HloInstruction*> workset;
+  auto add_to_worklist = [&worklist, &workset](HloInstruction* instruction) {
+    if (workset.insert(instruction).second) {
+      worklist.push(instruction);
+    }
+  };
 
   for (HloComputation* computation : module_->computations()) {
     for (HloInstruction* instruction : computation->instructions()) {
-      worklist.push(instruction);
+      add_to_worklist(instruction);
     }
   }
 
   while (!worklist.empty()) {
     HloInstruction* instruction = worklist.front();
     worklist.pop();
+    workset.erase(workset.find(instruction));
 
     VLOG(3) << "Worklist top: " << instruction->name();
     VLOG(3) << ToString();
@@ -544,19 +615,38 @@ void HloDataflowAnalysis::Propagate() {
     VLOG(4) << "New value set for " << instruction->name() << ": "
             << GetInstructionValueSet(instruction);
 
-    // Instruction value was updated. Add users to work list.
+    // Instruction value was updated. Add users to work list if we haven't
+    // already.
     for (HloInstruction* user : instruction->users()) {
-      worklist.push(user);
+      add_to_worklist(user);
 
       // If user sequentially calls a computation, then the respective
       // parameter(s) of the computation need to be updated.
-      for (HloComputation* called_computation : user->called_computations()) {
-        const CallGraphNode& call_graph_node =
-            call_graph_->GetNode(called_computation);
-        if (call_graph_node.context() == CallContext::kSequential) {
-          for (int64 operand_number : user->OperandIndices(instruction)) {
-            worklist.push(
-                called_computation->parameter_instruction(operand_number));
+      if (user->opcode() == HloOpcode::kConditional) {
+        // If operand 0 is the use of instruction, then no parameters need to be
+        // updated, since that is the predicate of the conditional.
+        // If operand 1 is the use of instruction, then the true_computation's
+        // parameter need to be updated.
+        // If operand 2 is the use of instruction, then the false_computation's
+        // parameter need to be updated.
+        //
+        // Note that the same instruction can be used in both operand 1 and
+        // operand 2.
+        if (user->operand(1) == instruction) {
+          add_to_worklist(user->true_computation()->parameter_instruction(0));
+        }
+        if (user->operand(2) == instruction) {
+          add_to_worklist(user->false_computation()->parameter_instruction(0));
+        }
+      } else {
+        for (HloComputation* called_computation : user->called_computations()) {
+          const CallGraphNode& call_graph_node =
+              call_graph_->GetNode(called_computation);
+          if (call_graph_node.context() == CallContext::kSequential) {
+            for (int64 operand_number : user->OperandIndices(instruction)) {
+              add_to_worklist(
+                  called_computation->parameter_instruction(operand_number));
+            }
           }
         }
       }
@@ -568,14 +658,15 @@ void HloDataflowAnalysis::Propagate() {
       const CallGraphNode& call_graph_node =
           call_graph_->GetNode(instruction->parent());
       for (const CallSite& callsite : call_graph_node.caller_callsites()) {
-        if (callsite.instruction()->opcode() == HloOpcode::kCall) {
-          worklist.push(callsite.instruction());
+        if ((callsite.instruction()->opcode() == HloOpcode::kCall) ||
+            (callsite.instruction()->opcode() == HloOpcode::kConditional)) {
+          add_to_worklist(callsite.instruction());
         } else if (callsite.instruction()->opcode() == HloOpcode::kWhile) {
           // Add the while itself, and the body and condition parameters.
-          worklist.push(callsite.instruction());
-          worklist.push(
+          add_to_worklist(callsite.instruction());
+          add_to_worklist(
               callsite.instruction()->while_body()->parameter_instruction(0));
-          worklist.push(
+          add_to_worklist(
               callsite.instruction()->while_condition()->parameter_instruction(
                   0));
         }
@@ -634,8 +725,14 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() {
             define_all_values();
           }
           break;
+        case HloOpcode::kSlice:
+          if (!instruction->IsInPlaceSlice()) {
+            define_all_values();
+          }
+          break;
         case HloOpcode::kWhile:
         case HloOpcode::kCall:
+        case HloOpcode::kConditional:
         case HloOpcode::kGetTupleElement:
           // These instructions define no values. The values in their output
           // flow from their operands or from cross computation dataflow.
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
index dfd81ae951042f7a4d6d3c24af4d5b7e046c272d..89d318188f0855c7924836a51cfe98d531e08cb4 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
@@ -145,7 +145,9 @@ class HloDataflowAnalysis {
   // Updates the value set for a particular instruction type. Returns whether
   // the instruction value set changed.
   bool UpdateBitcastValueSet(HloInstruction* bitcast);
+  bool UpdateSliceValueSet(HloInstruction* slice);
   bool UpdateCallValueSet(HloInstruction* call);
+  bool UpdateConditionalValueSet(HloInstruction* conditional);
   bool UpdateCopyValueSet(HloInstruction* copy);
   bool UpdateGetTupleElementValueSet(HloInstruction* gte);
   bool UpdateParameterValueSet(HloInstruction* parameter);
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
index f08f0b1d6833b028baa5f997929a17eb5abae205..e714b2567fd1b3eab607a19f0bb7e3288150dc64 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
@@ -34,6 +34,7 @@ limitations under the License.
 namespace xla {
 namespace {
 
+using ::testing::ElementsAre;
 using ::testing::UnorderedElementsAre;
 
 // Test is parameterized on a bool which is whether the dataflow analysis is
@@ -77,11 +78,23 @@ class HloDataflowAnalysisTest : public HloTestBase,
                                  analysis_->GetValueDefinedAt(b), *analysis_);
   }
 
+  std::unique_ptr<HloComputation> CreateR0F32UnaryOpComputation(
+      HloOpcode opcode) {
+    HloComputation::Builder builder(TestName() + "." + HloOpcodeString(opcode));
+    HloInstruction* param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, scalar_shape_, "param0"));
+    builder.AddInstruction(
+        HloInstruction::CreateUnary(scalar_shape_, opcode, param0));
+    return builder.Build();
+  }
+
   std::unique_ptr<HloModule> module_;
   std::unique_ptr<HloDataflowAnalysis> analysis_;
 
   const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {});
   const Shape vector_shape_ = ShapeUtil::MakeShape(F32, {42});
+  const Shape tuple_shape_ = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {}), ShapeUtil::MakeShape(F32, {})});
 };
 
 TEST_P(HloDataflowAnalysisTest, BinaryOperation) {
@@ -1528,6 +1541,315 @@ TEST_P(HloDataflowAnalysisTest, EmbeddedComputationInterference) {
   EXPECT_TRUE(InstructionsMayInterfere(ordering, negate, embedded_log));
 }
 
+TEST_P(HloDataflowAnalysisTest, ConditionalWithIdentity) {
+  // Test conditional with identity computations in both true and false cases.
+  //
+  // true_computation(F32[] %true_param):
+  //   return %true_param
+  //
+  // false_computation(F32[] %false_param):
+  //   return %false_param
+  //
+  // entry:
+  //   %pred = Constant(true)
+  //   %constant1 = Constant(56.0)
+  //   %constant2 = Constant(12.0)
+  //   return Conditional(%pred, %constant1, true_computation,
+  //                      %constant2, false_computation)
+
+  auto true_builder = HloComputation::Builder(TestName() + "_true");
+  auto true_param = true_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "true_param"));
+  HloComputation* true_computation =
+      module_->AddEmbeddedComputation(true_builder.Build());
+
+  auto false_builder = HloComputation::Builder(TestName() + "_false");
+  auto false_param = false_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "false_param"));
+  HloComputation* false_computation =
+      module_->AddEmbeddedComputation(false_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto pred = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(56.0f)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(12.0f)));
+  auto conditional = builder.AddInstruction(HloInstruction::CreateConditional(
+      scalar_shape_, pred, constant1, true_computation, constant2,
+      false_computation));
+  module_->AddEntryComputation(builder.Build());
+
+  const HloDataflowAnalysis& analysis = RunAnalysis(GetParam());
+
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(pred));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant1));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant2));
+
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(true_param));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(false_param));
+
+  EXPECT_EQ(analysis.GetUniqueValueAt(true_param),
+            analysis.GetValueDefinedAt(constant1));
+  EXPECT_EQ(analysis.GetUniqueValueAt(false_param),
+            analysis.GetValueDefinedAt(constant2));
+
+  EXPECT_THAT(analysis.GetValueDefinedAt(pred).uses(),
+              ElementsAre(HloUse{conditional, 0, {}}));
+  EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(),
+              ElementsAre(HloUse{conditional, 1, {}}));
+  EXPECT_THAT(analysis.GetValueDefinedAt(constant2).uses(),
+              ElementsAre(HloUse{conditional, 2, {}}));
+
+  EXPECT_EQ(analysis.values().size(), 3);
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(conditional));
+  EXPECT_THAT(HloValuesAt(conditional),
+              UnorderedElementsAre(analysis.GetValueDefinedAt(constant1),
+                                   analysis.GetValueDefinedAt(constant2)));
+}
+
+TEST_P(HloDataflowAnalysisTest, ConditionalTakingTupleOperand) {
+  // Test conditional with true and false computations taking a tuple operand.
+  //
+  // true_computation((F32[], F32[]) %true_param):
+  //   %true_x = GetTupleElement(%true_param, 0)
+  //   %true_y = GetTupleElement(%true_param, 1)
+  //   return Add(%true_x, %true_y)
+  //
+  // false_computation((F32[], F32[]) %false_param):
+  //   %false_x = GetTupleElement(%false_param, 0)
+  //   %false_y = GetTupleElement(%false_param, 1)
+  //   return Subtract(%false_x, %false_y)
+  //
+  // entry:
+  //   %pred = Constant(true)
+  //   %constant1 = Constant(56.0)
+  //   %constant2 = Constant(12.0)
+  //   %tuple_operand = Tuple(%constant1, %constant2)
+  //   return Conditional(%pred, %tuple_operand, true_computation,
+  //                      %tuple_operand, false_computation)
+
+  auto true_builder = HloComputation::Builder(TestName() + "_true");
+  auto true_param = true_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape_, "true_param"));
+  auto true_x = true_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, true_param, 0));
+  auto true_y = true_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, true_param, 1));
+  auto add = true_builder.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape_, HloOpcode::kAdd, true_x, true_y));
+  HloComputation* true_computation =
+      module_->AddEmbeddedComputation(true_builder.Build());
+
+  auto false_builder = HloComputation::Builder(TestName() + "_false");
+  auto false_param = false_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape_, "false_param"));
+  auto false_x = false_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, false_param, 0));
+  auto false_y = false_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, false_param, 1));
+  auto sub = false_builder.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape_, HloOpcode::kSubtract, false_x, false_y));
+  HloComputation* false_computation =
+      module_->AddEmbeddedComputation(false_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto pred = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(56.0f)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(12.0f)));
+  auto tuple_operand = builder.AddInstruction(
+      HloInstruction::CreateTuple({constant1, constant2}));
+  auto conditional = builder.AddInstruction(HloInstruction::CreateConditional(
+      scalar_shape_, pred, tuple_operand, true_computation, tuple_operand,
+      false_computation));
+  module_->AddEntryComputation(builder.Build());
+
+  const HloDataflowAnalysis& analysis = RunAnalysis(GetParam());
+
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(pred));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant1));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant2));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(tuple_operand));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(add));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(sub));
+
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(true_param));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(false_param));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(true_x));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(true_y));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(false_x));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(false_y));
+
+  EXPECT_EQ(analysis.GetUniqueValueAt(true_param),
+            analysis.GetValueDefinedAt(tuple_operand));
+  EXPECT_EQ(analysis.GetUniqueValueAt(false_param),
+            analysis.GetValueDefinedAt(tuple_operand));
+  EXPECT_EQ(analysis.GetUniqueValueAt(true_x),
+            analysis.GetValueDefinedAt(constant1));
+  EXPECT_EQ(analysis.GetUniqueValueAt(true_y),
+            analysis.GetValueDefinedAt(constant2));
+  EXPECT_EQ(analysis.GetUniqueValueAt(false_x),
+            analysis.GetValueDefinedAt(constant1));
+  EXPECT_EQ(analysis.GetUniqueValueAt(false_y),
+            analysis.GetValueDefinedAt(constant2));
+
+  EXPECT_THAT(analysis.GetValueDefinedAt(pred).uses(),
+              ElementsAre(HloUse{conditional, 0, {}}));
+  EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(),
+              UnorderedElementsAre(HloUse{conditional, 1, {0}},
+                                   HloUse{conditional, 2, {0}},
+                                   HloUse{add, 0, {}}, HloUse{sub, 0, {}}));
+  EXPECT_THAT(analysis.GetValueDefinedAt(constant2).uses(),
+              UnorderedElementsAre(HloUse{conditional, 1, {1}},
+                                   HloUse{conditional, 2, {1}},
+                                   HloUse{add, 1, {}}, HloUse{sub, 1, {}}));
+  EXPECT_THAT(analysis.GetValueDefinedAt(tuple_operand).uses(),
+              UnorderedElementsAre(
+                  HloUse{conditional, 1, {}}, HloUse{conditional, 2, {}},
+                  HloUse{true_x, 0, {}}, HloUse{true_y, 0, {}},
+                  HloUse{false_x, 0, {}}, HloUse{false_y, 0, {}}));
+
+  EXPECT_EQ(analysis.values().size(), 6);
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(conditional));
+  EXPECT_THAT(HloValuesAt(conditional),
+              UnorderedElementsAre(analysis.GetValueDefinedAt(add),
+                                   analysis.GetValueDefinedAt(sub)));
+}
+
+TEST_P(HloDataflowAnalysisTest, NestedConditionals) {
+  // computation1(F32[] %param1):
+  //   %ceil = Ceil(%param1)
+  //   return %ceil
+  //
+  // computation2(F32[] %param2):
+  //   %floor = Floor(%param2)
+  //   return %floor
+  //
+  // computation3(F32[] %param3):
+  //   %negate = Negate(%param3)
+  //   return %negate
+  //
+  // inner_conditional((PRED, F32[], F32[]) %param_cond):
+  //   %pred_cond = GetTupleElement(%param_cond, 0)
+  //   %true_operand_cond = GetTupleElement(%param_cond, 1)
+  //   %false_opearnd_cond = GetTupleElement(%param_cond, 2)
+  //   return Conditional(%pred_cond, %true_operand_cond, computation1,
+  //                      %false_operand_cond, computation2)
+  //
+  // entry:
+  //   %pred1 = Constant(true)
+  //   %pred2 = Constant(false)
+  //   %constant1 = Constant(1.1);
+  //   %constant2 = Constant(2.2);
+  //   %constant3 = Constant(3.3);
+  //   return Conditional(%pred1, (%pred2, %constant1, %constant2),
+  //                      inner_conditional, %constant3, computation3)
+
+  auto computation1 = module_->AddEmbeddedComputation(
+      CreateR0F32UnaryOpComputation(HloOpcode::kCeil));
+  auto computation2 = module_->AddEmbeddedComputation(
+      CreateR0F32UnaryOpComputation(HloOpcode::kFloor));
+  auto computation3 = module_->AddEmbeddedComputation(
+      CreateR0F32UnaryOpComputation(HloOpcode::kNegate));
+
+  // Build inner_conditional computation.
+  const Shape scalar_bool_shape = ShapeUtil::MakeShape(PRED, {});
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {scalar_bool_shape, scalar_shape_, scalar_shape_});
+  auto inner_builder =
+      HloComputation::Builder(TestName() + "_inner_conditional");
+  auto param_cond = inner_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_param_shape, "param_cond"));
+  auto pred_cond = inner_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_bool_shape, param_cond, 0));
+  auto true_operand_cond = inner_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param_cond, 1));
+  auto false_operand_cond = inner_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param_cond, 2));
+  auto inner_conditional =
+      inner_builder.AddInstruction(HloInstruction::CreateConditional(
+          scalar_shape_, pred_cond, true_operand_cond, computation1,
+          false_operand_cond, computation2));
+  auto inner_conditional_computation =
+      module_->AddEmbeddedComputation(inner_builder.Build());
+
+  // Build entry computation.
+  auto builder = HloComputation::Builder(TestName());
+  auto pred1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
+  auto pred2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.1f)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.2f)));
+  auto constant3 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(3.3f)));
+  auto tuple_operand = builder.AddInstruction(
+      HloInstruction::CreateTuple({pred2, constant1, constant2}));
+  auto conditional = builder.AddInstruction(HloInstruction::CreateConditional(
+      scalar_shape_, pred1, tuple_operand, inner_conditional_computation,
+      constant3, computation3));
+  module_->AddEntryComputation(builder.Build());
+
+  const HloDataflowAnalysis& analysis = RunAnalysis(GetParam());
+
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(pred1));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(pred2));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant1));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant2));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant3));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(tuple_operand));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(computation1->root_instruction()));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(computation2->root_instruction()));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(computation3->root_instruction()));
+
+  auto computation1_param = computation1->parameter_instruction(0);
+  auto computation2_param = computation2->parameter_instruction(0);
+  auto computation3_param = computation3->parameter_instruction(0);
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(computation1_param));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(computation2_param));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(computation3_param));
+  EXPECT_EQ(analysis.GetUniqueValueAt(computation1_param),
+            analysis.GetValueDefinedAt(constant1));
+  EXPECT_EQ(analysis.GetUniqueValueAt(computation2_param),
+            analysis.GetValueDefinedAt(constant2));
+  EXPECT_EQ(analysis.GetUniqueValueAt(computation3_param),
+            analysis.GetValueDefinedAt(constant3));
+
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(param_cond));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(pred_cond));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(true_operand_cond));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(false_operand_cond));
+  EXPECT_EQ(analysis.GetUniqueValueAt(param_cond),
+            analysis.GetValueDefinedAt(tuple_operand));
+  EXPECT_EQ(analysis.GetUniqueValueAt(pred_cond),
+            analysis.GetValueDefinedAt(pred2));
+  EXPECT_EQ(analysis.GetUniqueValueAt(true_operand_cond),
+            analysis.GetValueDefinedAt(constant1));
+  EXPECT_EQ(analysis.GetUniqueValueAt(false_operand_cond),
+            analysis.GetValueDefinedAt(constant2));
+
+  EXPECT_EQ(analysis.values().size(), 9);
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(inner_conditional));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(conditional));
+  EXPECT_THAT(
+      HloValuesAt(inner_conditional),
+      UnorderedElementsAre(
+          analysis.GetValueDefinedAt(computation1->root_instruction()),
+          analysis.GetValueDefinedAt(computation2->root_instruction())));
+  EXPECT_THAT(
+      HloValuesAt(conditional),
+      UnorderedElementsAre(
+          analysis.GetValueDefinedAt(computation1->root_instruction()),
+          analysis.GetValueDefinedAt(computation2->root_instruction()),
+          analysis.GetValueDefinedAt(computation3->root_instruction())));
+}
+
 INSTANTIATE_TEST_CASE_P(HloDataflowAnalysisInstantiation,
                         HloDataflowAnalysisTest,
                         ::testing::Values(false, true));
diff --git a/tensorflow/compiler/xla/service/hlo_dce.cc b/tensorflow/compiler/xla/service/hlo_dce.cc
index 40e67c87807b3e13d8ac09206bf6be02e4f9ff31..1e5f0f797a13fd7e7ce1cc934387a274a74153bc 100644
--- a/tensorflow/compiler/xla/service/hlo_dce.cc
+++ b/tensorflow/compiler/xla/service/hlo_dce.cc
@@ -55,7 +55,8 @@ StatusOr<bool> HloDCE::Run(HloModule* module) {
     for (auto* instruction : computation->instructions()) {
       if (instruction->user_count() == 0 &&
           live_instructions.count(instruction) == 0 &&
-          computation->IsRemovable(instruction)) {
+          computation->IsRemovable(instruction) &&
+          !instruction->HasSideEffect()) {
         dead_roots.push_back(instruction);
       }
     }
diff --git a/tensorflow/compiler/xla/service/hlo_dce_test.cc b/tensorflow/compiler/xla/service/hlo_dce_test.cc
index d54b9a27087a42fd23eab0bd06e8deaca567312b..5a56607a665c4cbeb7b2572f182b88e890602968 100644
--- a/tensorflow/compiler/xla/service/hlo_dce_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dce_test.cc
@@ -70,6 +70,26 @@ TEST_F(HloDceTest, NoDeadCode) {
   EXPECT_EQ(3, computation->instruction_count());
 }
 
+TEST_F(HloDceTest, InstructionsWithSideEffect) {
+  // Verify that side-effect instructions (Send in this test) are not removed.
+  auto builder = HloComputation::Builder(TestName());
+  auto constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
+  builder.AddInstruction(
+      HloInstruction::CreateSend(constant, /*channel_id=*/0));
+  builder.AddInstruction(HloInstruction::CreateTuple({}));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_EQ(3, computation->instruction_count());
+
+  HloDCE dce;
+  EXPECT_FALSE(dce.Run(module.get()).ValueOrDie());
+
+  EXPECT_EQ(3, computation->instruction_count());
+}
+
 TEST_F(HloDceTest, DeadParameters) {
   // Verify that dead parameters are not removed, but use of the dead parameters
   // are.
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c782d1b0add17c70e0f54826917df251d5a613e2
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
@@ -0,0 +1,207 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_element_type_converter.h"
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_query.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+namespace {
+
+HloInstruction* ToElementType(HloInstruction* hlo, PrimitiveType type) {
+  if (hlo->shape().element_type() != type) {
+    Shape shape = ShapeUtil::ChangeElementType(hlo->shape(), type);
+    hlo = hlo->parent()->AddInstruction(
+        HloInstruction::CreateConvert(shape, hlo));
+  }
+  CHECK_EQ(hlo->shape().element_type(), type);
+  return hlo;
+}
+
+bool HasOperandType(HloInstruction* hlo, PrimitiveType type) {
+  for (HloInstruction* operand : hlo->operands()) {
+    if (operand->shape().element_type() == type) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Finds out the Tuple Shape of the new instruction after converting the element
+// type of the operands of the original instruction from `from_type` to
+// `to_type`.
+//
+// This routine assumes the resulting `shape` of the original instruction is a
+// non-nested tuple. This assumption is currently safe as only kTuple, kInfeed,
+// kOutfeed, kCall, kCustomCall and kBatchNorm* HLO instructions can produce
+// results with tuple shapes, and this routine is only called to convert the
+// result shapes of kBatchNorm* HLO instructions, which are non-nested tuples.
+Shape GetConvertedTupleShape(const Shape& shape, PrimitiveType from_type,
+                             PrimitiveType to_type) {
+  std::vector<Shape> new_tuple_subshapes;
+  for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
+    Shape subshape = ShapeUtil::GetTupleElementShape(shape, i);
+    CHECK(!ShapeUtil::IsTuple(subshape));
+    if (subshape.element_type() == from_type) {
+      subshape = ShapeUtil::ChangeElementType(subshape, to_type);
+    }
+    new_tuple_subshapes.push_back(subshape);
+  }
+  return ShapeUtil::MakeTupleShape(new_tuple_subshapes);
+}
+
+// Converts the elements of the result of `hlo` to produce a new tuple with
+// shape `to_shape`.
+//
+// This routine assumes `hlo` is an instruction that produces a non-nested Tuple
+// as a result.
+HloInstruction* ConvertTupleElements(HloInstruction* hlo,
+                                     const Shape& to_shape) {
+  const Shape& shape = hlo->shape();
+  HloComputation* computation = hlo->parent();
+  std::vector<HloInstruction*> tuple_elements;
+  for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
+    const Shape& ele_shape = ShapeUtil::GetTupleElementShape(shape, i);
+    HloInstruction* element = computation->AddInstruction(
+        HloInstruction::CreateGetTupleElement(ele_shape, hlo, i));
+    const Shape& to_ele_shape = ShapeUtil::GetTupleElementShape(to_shape, i);
+    CHECK(!ShapeUtil::IsTuple(ele_shape));
+    if (ele_shape.element_type() != to_ele_shape.element_type()) {
+      element = computation->AddInstruction(
+          HloInstruction::CreateConvert(to_ele_shape, element));
+    }
+    tuple_elements.push_back(element);
+  }
+  return computation->AddInstruction(
+      HloInstruction::CreateTuple(tuple_elements));
+}
+
+}  // namespace
+
+HloElementTypeConverter::HloElementTypeConverter(
+    PrimitiveType eliminate_type, PrimitiveType replace_with_type)
+    : eliminate_type_(eliminate_type), replace_with_type_(replace_with_type) {}
+
+// This routine converts the arithmetic operations in the given module that use
+// eliminate_type_ to operations that use replace_with_type_.
+StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
+  XLA_VLOG_LINES(
+      3, "HloElementTypeConverter::Run(), before:\n" + module->ToString());
+
+  if (eliminate_type_ == replace_with_type_) {
+    return false;
+  }
+
+  bool changed = false;
+  for (auto* computation : module->computations()) {
+    for (auto* hlo : computation->MakeInstructionPostOrder()) {
+      const auto opcode = hlo->opcode();
+      // These are ops where it does not make sense to convert them.
+      if (opcode == HloOpcode::kParameter || opcode == HloOpcode::kConstant ||
+          opcode == HloOpcode::kTuple || opcode == HloOpcode::kConvert ||
+          opcode == HloOpcode::kGetTupleElement ||
+          opcode == HloOpcode::kInfeed || opcode == HloOpcode::kOutfeed) {
+        continue;
+      }
+
+      // We cannot change a CustomCall since we have no way of adjusting the
+      // called binary to expect the updated type.
+      if (opcode == HloOpcode::kCustomCall) {
+        continue;
+      }
+
+      // These are ops with embedded computations where it suffices to convert
+      // the embedded computations instead of converting the ops themselves.
+      if (opcode == HloOpcode::kWhile || opcode == HloOpcode::kCall ||
+          opcode == HloOpcode::kFusion || opcode == HloOpcode::kMap ||
+          opcode == HloOpcode::kReduce || opcode == HloOpcode::kReduceWindow ||
+          opcode == HloOpcode::kSelectAndScatter ||
+          opcode == HloOpcode::kConditional) {
+        continue;
+      }
+      TF_RET_CHECK(hlo->called_computations().empty()) << hlo->ToString();
+
+      if (!HasOperandType(hlo, eliminate_type_)) {
+        // If this CHECK fires, then this was an instruction that does not take
+        // the elimination type as an operand but it does return it. This pass
+        // does not have a feature to change the output type in that case, so
+        // instead of silently failing to eliminate the type, it fails loudly.
+        TF_RET_CHECK(hlo->shape().element_type() != eliminate_type_);
+        continue;
+      }
+
+      // Handle instructions that perform arithmetic operations and contain
+      // operands with eliminate_type_.
+      //
+      // First, convert the operands with eliminate_type_ to operands with
+      // replace_with_type_.
+      std::vector<HloInstruction*> new_operands;
+      for (HloInstruction* operand : hlo->operands()) {
+        if (operand->shape().element_type() == eliminate_type_) {
+          operand = ToElementType(operand, replace_with_type_);
+        }
+        new_operands.push_back(operand);
+      }
+
+      // Then find out the result type of the new instruction with the same
+      // opcode but using the converted operands, create the new instruction,
+      // and convert the result of the new instruction back to match the result
+      // type of the original instruction.
+      HloInstruction* new_hlo;
+      if (hlo->shape().element_type() == eliminate_type_) {
+        Shape shape =
+            ShapeUtil::ChangeElementType(hlo->shape(), replace_with_type_);
+        new_hlo = computation->AddInstruction(
+            hlo->CloneWithNewOperands(shape, new_operands, hlo->GetModule()));
+        new_hlo = ToElementType(new_hlo, eliminate_type_);
+      } else if (ShapeUtil::IsTuple(hlo->shape())) {
+        Shape old_shape = hlo->shape();
+        Shape new_shape = GetConvertedTupleShape(hlo->shape(), eliminate_type_,
+                                                 replace_with_type_);
+        new_hlo = computation->AddInstruction(hlo->CloneWithNewOperands(
+            new_shape, new_operands, hlo->GetModule()));
+        // Convert the elements of the result of `new_hlo` to produce a new
+        // tuple with shape `old_shape`.
+        new_hlo = ConvertTupleElements(new_hlo, old_shape);
+      } else {
+        new_hlo = computation->AddInstruction(hlo->CloneWithNewOperands(
+            hlo->shape(), new_operands, hlo->GetModule()));
+      }
+
+      TF_RETURN_IF_ERROR(computation->ReplaceInstruction(hlo, new_hlo));
+      changed = true;
+    }
+  }
+  XLA_VLOG_LINES(
+      2, "HloElementTypeConverter::Run(), after:\n" + module->ToString());
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.h b/tensorflow/compiler/xla/service/hlo_element_type_converter.h
new file mode 100644
index 0000000000000000000000000000000000000000..2b109225d0b192e5c9e4f6d841377ffad8078dc2
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.h
@@ -0,0 +1,49 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_ELEMENT_TYPE_CONVERTER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_ELEMENT_TYPE_CONVERTER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// A pass that eliminates certain element types as the input or output of ops by
+// inserting Convert ops. This allows a backend to support an element type while
+// only actually implementing the Convert op for that element type. This is
+// generally not the fastest approach, but it works.
+class HloElementTypeConverter : public HloPassInterface {
+ public:
+  // eliminate_type is the type to eliminate as the input or output of ops,
+  // using Convert ops to replace it with replace_with_type.
+  HloElementTypeConverter(PrimitiveType eliminate_type,
+                          PrimitiveType replace_with_type);
+
+  tensorflow::StringPiece name() const override {
+    return "element_type_converter";
+  }
+
+  // Returns the pass on the module and returns whether the module was modified.
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  PrimitiveType eliminate_type_;
+  PrimitiveType replace_with_type_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_ELEMENT_TYPE_CONVERTER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cb94d9f19b825d1321263a4737b66a6bf198a772
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
@@ -0,0 +1,121 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_element_type_converter.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+
+namespace xla {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+
+class HloElementTypeConverterTest : public HloTestBase {
+ public:
+  std::unique_ptr<HloModule> CreateModuleFromHloString(
+      const string& hlo_string) {
+    return HloRunner::CreateModuleFromString(hlo_string,
+                                             GetDebugOptionsForTest())
+        .ValueOrDie();
+  }
+};
+
+TEST_F(HloElementTypeConverterTest, CustomCallsNotConverted) {
+  const string& hlo_string = R"(
+    HloModule custom_call
+    ENTRY CustomCall {
+      constant = bf16[1]{0} constant({12345})
+      ROOT custom-call = bf16[1,2,3]{0,2,1} custom-call(constant),
+           custom_call_target="foo"
+    }
+  )";
+  auto module = CreateModuleFromHloString(hlo_string);
+  HloElementTypeConverter type_converter(BF16, F32);
+  TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
+  EXPECT_FALSE(converted);
+}
+
+TEST_F(HloElementTypeConverterTest, InfeedsOutfeedsNotConverted) {
+  const string& hlo_string = R"(
+    HloModule InfeedOutfeed
+    ENTRY RoundTrip16MiBR1.v2 {
+      ROOT infeed = bf16[4]{0} infeed()
+      outfeed = () outfeed(infeed)
+    }
+  )";
+  auto module = CreateModuleFromHloString(hlo_string);
+  HloElementTypeConverter type_converter(BF16, F32);
+  TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
+  EXPECT_FALSE(converted);
+}
+
+TEST_F(HloElementTypeConverterTest, OperationsInNestedTuplesConverted) {
+  const string& hlo_string = R"(
+    HloModule NestedTuples
+    ENTRY NestedTuples.v5 {
+      constant.4 = bf16[] constant(42)
+      constant.2 = f32[2]{0} constant({1, 2})
+      constant.3 = bf16[] constant(42)
+      add = bf16[] add(constant.2, constant.3)
+      tuple = (f32[2]{0}, bf16[]) tuple(constant.2, add)
+      constant.5 = bf16[2]{0} constant({22, 44})
+      ROOT tuple.1 = ((f32[2]{0}, bf16[]), bf16[2]{0}) tuple(tuple, constant.5)
+    }
+  )";
+
+  auto module = CreateModuleFromHloString(hlo_string);
+  HloElementTypeConverter type_converter(BF16, F32);
+  TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
+  EXPECT_TRUE(converted);
+  const HloInstruction* bf16_op =
+      module->entry_computation()->root_instruction()->operand(0)->operand(1);
+  EXPECT_THAT(bf16_op, op::Convert(op::Add(op::Constant(), op::Convert())));
+}
+
+TEST_F(HloElementTypeConverterTest, BatchNormGradBF16Converted) {
+  const string& hlo_string = R"(
+    HloModule BatchNormGrad
+    ENTRY BatchNormGrad.v6 {
+      constant.4 = bf16[2,2,2,1]{3,2,1,0} constant(bf16[2,2,2,1] { { /*i0=0*/ 
+      { /*i1=0*/ {0}, {0} }, { /*i1=1*/ {0}, {0} } }, { /*i0=1*/ { /*i1=0*/ {0},
+      {0} }, { /*i1=1*/ {0}, {0} } } })
+      constant.5 = bf16[2]{0} constant({1, 1})
+      constant.6 = bf16[2]{0} constant({0, 0})
+      constant.7 = bf16[2]{0} constant({1, 1})
+      constant.8 = bf16[2,2,2,1]{3,2,1,0} constant(bf16[2,2,2,1] { { /*i0=0*/
+      { /*i1=0*/ {1}, {2} }, { /*i1=1*/ {3}, {4} } }, { /*i0=1*/ { /*i1=0*/
+      {5}, {6} }, { /*i1=1*/ {7}, {8} } } })
+      ROOT batch-norm-grad = (bf16[2,2,2,1]{3,2,1,0}, bf16[2]{0}, bf16[2]{0})
+      batch-norm-grad(constant.4, constant.5, constant.6, constant.7,
+      constant.8), epsilon=0, feature_index=2
+    }
+  )";
+
+  auto module = CreateModuleFromHloString(hlo_string);
+  HloElementTypeConverter type_converter(BF16, F32);
+  TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
+  EXPECT_TRUE(converted);
+  const HloInstruction* tuple_instr =
+      module->entry_computation()->root_instruction();
+  ::testing::Matcher<const ::xla::HloInstruction*> batch_norm =
+      op::BatchNormGrad();
+  EXPECT_THAT(tuple_instr,
+              op::Tuple(op::Convert(op::GetTupleElement(batch_norm, 0)),
+                        op::Convert(op::GetTupleElement(batch_norm, 1)),
+                        op::Convert(op::GetTupleElement(batch_norm, 2))));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index e693d167a1f96f65b894d07fb2c8f33e61ff8c49..81212cda4266ec820230d0d84fc2a395edaf411e 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_query.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
@@ -39,9 +40,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/bitmap.h"
+#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
@@ -165,13 +168,67 @@ StatusOr<std::unique_ptr<Literal>> ElementWiseUnaryOpImpl(
   return std::move(result);
 }
 
+// For one particular placement of a window in a base shape (the placement is
+// represented as `window_count_index`), iterates inside the window. Translates
+// the window index into base index. If the base index is within bound, call `f`
+// with the base index.
+void IterateThroughWindow(
+    const Shape& window_shape, const Window& window, const Shape& base_shape,
+    const tensorflow::gtl::ArraySlice<int64>& window_count_index,
+    const std::function<void(const std::vector<int64>&)>& f) {
+  const int64 rank = ShapeUtil::Rank(base_shape);
+  DimensionVector window_index(rank);
+  std::fill(window_index.begin(), window_index.end(), 0);
+  do {
+    std::vector<int64> base_index(rank);
+    bool out_of_bound = false;
+    for (int64 i = 0; i < rank; ++i) {
+      base_index[i] = window_count_index[i] * window.dimensions(i).stride() +
+                      window_index[i] - window.dimensions(i).padding_low();
+      if (base_index[i] < 0 || base_index[i] >= base_shape.dimensions(i)) {
+        out_of_bound = true;
+        break;
+      }
+    }
+    if (!out_of_bound) {
+      f(base_index);
+    }
+  } while (IndexUtil::BumpIndices(window_shape, &window_index));
+}
+
 }  // namespace
 
-template <typename ReturnT>
+template <typename ReturnT, typename ElementwiseT>
 class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
  public:
   explicit TypedVisitor(HloEvaluator* p) : parent_(p) {}
 
+  // The following higher-order functions convert a function with ElementwiseT
+  // to a function with ReturnT.
+  std::function<ReturnT(ReturnT)> ConvertUnaryFunction(
+      const std::function<ElementwiseT(ElementwiseT)>& unary_op) {
+    return [&unary_op](ReturnT arg) {
+      return static_cast<ReturnT>(unary_op(static_cast<ElementwiseT>(arg)));
+    };
+  }
+  std::function<ReturnT(ReturnT, ReturnT)> ConvertBinaryFunction(
+      const std::function<ElementwiseT(ElementwiseT, ElementwiseT)>&
+          binary_op) {
+    return [&binary_op](ReturnT arg1, ReturnT arg2) {
+      return static_cast<ReturnT>(binary_op(static_cast<ElementwiseT>(arg1),
+                                            static_cast<ElementwiseT>(arg2)));
+    };
+  }
+  std::function<ReturnT(ReturnT, ReturnT, ReturnT)> ConvertTernaryFunction(
+      const std::function<ElementwiseT(ElementwiseT, ElementwiseT,
+                                       ElementwiseT)>& ternary_op) {
+    return [&ternary_op](ReturnT arg1, ReturnT arg2, ReturnT arg3) {
+      return static_cast<ReturnT>(ternary_op(static_cast<ElementwiseT>(arg1),
+                                             static_cast<ElementwiseT>(arg2),
+                                             static_cast<ElementwiseT>(arg3)));
+    };
+  }
+
   Status DefaultAction(HloInstruction* hlo_instruction) override {
     return Unimplemented("unhandled HLO ops for HloEvaluator: %s.",
                          HloOpcodeString(hlo_instruction->opcode()).c_str());
@@ -197,24 +254,25 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
                               is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleAbs(HloInstruction* abs) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[abs],
-                        ElementWiseUnaryOp(abs, [](NativeT elem_operand) {
+                        ElementWiseUnaryOp(abs, [](ElementwiseT elem_operand) {
                           return std::abs(elem_operand);
                         }));
     return Status::OK();
   }
 
   Status HandleAbs(HloInstruction* abs) override {
-    return HandleAbs<ReturnT>(abs);
+    return HandleAbs<ElementwiseT>(abs);
   }
 
   template <
       typename NativeT,
       typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleRound(HloInstruction* round) {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[round],
-                        ElementWiseUnaryOp(round, [](ReturnT elem_operand) {
-                          return std::round(elem_operand);
-                        }));
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[round],
+        ElementWiseUnaryOp(round, [](ElementwiseT elem_operand) {
+          return std::round(elem_operand);
+        }));
     return Status::OK();
   }
 
@@ -233,7 +291,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     parent_->evaluated_[broadcast] =
         Literal::CreateFromShape(broadcast->shape());
     auto output = parent_->evaluated_[broadcast].get();
-    auto operand_to_broadcast =
+    const Literal& operand_to_broadcast =
         parent_->GetEvaluatedLiteralFor(broadcast->operand(0));
     std::vector<int64> broadcast_indices(
         ShapeUtil::Rank(broadcast->operand(0)->shape()), 0);
@@ -264,7 +322,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
       typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleCeil(HloInstruction* ceil) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[ceil],
-                        ElementWiseUnaryOp(ceil, [](ReturnT elem_operand) {
+                        ElementWiseUnaryOp(ceil, [](ElementwiseT elem_operand) {
                           return std::ceil(elem_operand);
                         }));
     return Status::OK();
@@ -299,7 +357,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
 
   Status HandleExp(HloInstruction* exp) override {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[exp],
-                        ElementWiseUnaryOp(exp, [](ReturnT elem_operand) {
+                        ElementWiseUnaryOp(exp, [](ElementwiseT elem_operand) {
                           return std::exp(elem_operand);
                         }));
     return Status::OK();
@@ -309,10 +367,11 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
       typename NativeT,
       typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleFloor(HloInstruction* floor) {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[floor],
-                        ElementWiseUnaryOp(floor, [](ReturnT elem_operand) {
-                          return std::floor(elem_operand);
-                        }));
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[floor],
+        ElementWiseUnaryOp(floor, [](ElementwiseT elem_operand) {
+          return std::floor(elem_operand);
+        }));
     return Status::OK();
   }
 
@@ -329,7 +388,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
 
   Status HandleLog(HloInstruction* log) override {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[log],
-                        ElementWiseUnaryOp(log, [](ReturnT elem_operand) {
+                        ElementWiseUnaryOp(log, [](ElementwiseT elem_operand) {
                           return std::log(elem_operand);
                         }));
     return Status::OK();
@@ -341,7 +400,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
                 !std::is_same<NativeT, bool>::value>::type* = nullptr>
   Status HandleNot(HloInstruction* not_) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[not_],
-                        ElementWiseUnaryOp(not_, [](ReturnT elem_operand) {
+                        ElementWiseUnaryOp(not_, [](ElementwiseT elem_operand) {
                           return ~elem_operand;
                         }));
     return Status::OK();
@@ -351,7 +410,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
                                   NativeT>::value>::type* = nullptr>
   Status HandleNot(HloInstruction* not_) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[not_],
-                        ElementWiseUnaryOp(not_, [](ReturnT elem_operand) {
+                        ElementWiseUnaryOp(not_, [](ElementwiseT elem_operand) {
                           return !elem_operand;
                         }));
     return Status::OK();
@@ -362,7 +421,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
                 nullptr>
   Status HandleNot(HloInstruction* not_) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[not_],
-                        ElementWiseUnaryOp(not_, [](ReturnT elem_operand) {
+                        ElementWiseUnaryOp(not_, [](ElementwiseT elem_operand) {
                           return !elem_operand;
                         }));
     return Status::OK();
@@ -376,7 +435,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleNot(HloInstruction* not_) override {
-    return HandleNot<ReturnT>(not_);
+    return HandleNot<ElementwiseT>(not_);
   }
 
   template <typename NativeT,
@@ -385,10 +444,11 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
                 !std::is_floating_point<NativeT>::value>::type* = nullptr>
   Status HandleNegate(HloInstruction* negate) {
     using type = typename std::make_unsigned<NativeT>::type;
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[negate],
-                        ElementWiseUnaryOp(negate, [](ReturnT elem_operand) {
-                          return NativeT(-type(elem_operand));
-                        }));
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[negate],
+        ElementWiseUnaryOp(negate, [](ElementwiseT elem_operand) {
+          return NativeT(-type(elem_operand));
+        }));
     return Status::OK();
   }
 
@@ -397,10 +457,10 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
                 !std::is_signed<NativeT>::value ||
                 std::is_floating_point<NativeT>::value>::type* = nullptr>
   Status HandleNegate(HloInstruction* negate) {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[negate],
-                        ElementWiseUnaryOp(negate, [](ReturnT elem_operand) {
-                          return -elem_operand;
-                        }));
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[negate],
+        ElementWiseUnaryOp(
+            negate, [](ElementwiseT elem_operand) { return -elem_operand; }));
     return Status::OK();
   }
 
@@ -413,9 +473,9 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
       typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleSign(HloInstruction* sign) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[sign],
-                        ElementWiseUnaryOp(sign, [](ReturnT elem_operand) {
-                          return (ReturnT(0) < elem_operand) -
-                                 (elem_operand < ReturnT(0));
+                        ElementWiseUnaryOp(sign, [](ElementwiseT elem_operand) {
+                          return (ElementwiseT(0) < elem_operand) -
+                                 (elem_operand < ElementwiseT(0));
                         }));
     return Status::OK();
   }
@@ -425,9 +485,9 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleSign(HloInstruction* sign) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[sign],
-                        ElementWiseUnaryOp(sign, [](ReturnT elem_operand) {
+                        ElementWiseUnaryOp(sign, [](ElementwiseT elem_operand) {
                           auto abs_val = std::abs(elem_operand);
-                          return 0 == abs_val ? ReturnT(0)
+                          return 0 == abs_val ? ElementwiseT(0)
                                               : elem_operand / abs_val;
                         }));
     return Status::OK();
@@ -437,9 +497,30 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return HandleSign<ReturnT>(sign);
   }
 
+  template <typename NativeT, typename std::enable_if<std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
+  Status HandleAtan2(HloInstruction* atan2) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[atan2],
+                        ElementWiseBinaryOp(atan2, [](ElementwiseT lhs_elem,
+                                                      ElementwiseT rhs_elem) {
+                          return std::atan2(lhs_elem, rhs_elem);
+                        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT, typename std::enable_if<!std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
+  Status HandleAtan2(HloInstruction* atan2) {
+    return InvalidArgument("Unsupported type for Atan2");
+  }
+
+  Status HandleAtan2(HloInstruction* atan2) override {
+    return HandleAtan2<ElementwiseT>(atan2);
+  }
+
   Status HandleTanh(HloInstruction* tanh) override {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[tanh],
-                        ElementWiseUnaryOp(tanh, [](ReturnT elem_operand) {
+                        ElementWiseUnaryOp(tanh, [](ElementwiseT elem_operand) {
                           return std::tanh(elem_operand);
                         }));
     return Status::OK();
@@ -453,9 +534,10 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     using type = typename std::make_unsigned<NativeT>::type;
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[multiply],
-        ElementWiseBinaryOp(multiply, [](ReturnT lhs_elem, ReturnT rhs_elem) {
-          return NativeT(type(lhs_elem) * type(rhs_elem));
-        }));
+        ElementWiseBinaryOp(multiply,
+                            [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) {
+                              return NativeT(type(lhs_elem) * type(rhs_elem));
+                            }));
     return Status::OK();
   }
 
@@ -467,40 +549,42 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   Status HandleMultiply(HloInstruction* multiply) {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[multiply],
-        ElementWiseBinaryOp(multiply, [](ReturnT lhs_elem, ReturnT rhs_elem) {
-          return lhs_elem * rhs_elem;
-        }));
+        ElementWiseBinaryOp(multiply,
+                            [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) {
+                              return lhs_elem * rhs_elem;
+                            }));
     return Status::OK();
   }
 
   Status HandleMultiply(HloInstruction* multiply) override {
-    return HandleMultiply<ReturnT>(multiply);
+    return HandleMultiply<ElementwiseT>(multiply);
   }
 
   Status HandleSubtract(HloInstruction* subtract) override {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[subtract],
-        ElementWiseBinaryOp(subtract, [](ReturnT lhs_elem, ReturnT rhs_elem) {
-          return lhs_elem - rhs_elem;
-        }));
+        ElementWiseBinaryOp(subtract,
+                            [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) {
+                              return lhs_elem - rhs_elem;
+                            }));
     return Status::OK();
   }
 
   Status HandleAdd(HloInstruction* add) override {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[add],
-        ElementWiseBinaryOp(add, [](ReturnT lhs_elem, ReturnT rhs_elem) {
-          return lhs_elem + rhs_elem;
-        }));
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[add],
+                        ElementWiseBinaryOp(add, [](ElementwiseT lhs_elem,
+                                                    ElementwiseT rhs_elem) {
+                          return lhs_elem + rhs_elem;
+                        }));
     return Status::OK();
   }
 
   Status HandleDivide(HloInstruction* divide) override {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[divide],
-        ElementWiseBinaryOp(divide, [](ReturnT lhs_elem, ReturnT rhs_elem) {
-          return lhs_elem / rhs_elem;
-        }));
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[divide],
+                        ElementWiseBinaryOp(divide, [](ElementwiseT lhs_elem,
+                                                       ElementwiseT rhs_elem) {
+                          return lhs_elem / rhs_elem;
+                        }));
     return Status::OK();
   }
 
@@ -510,7 +594,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   Status HandleMaximum(HloInstruction* maximum) {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[maximum],
-        ElementWiseBinaryOp(maximum, [](ReturnT lhs, ReturnT rhs) {
+        ElementWiseBinaryOp(maximum, [](ElementwiseT lhs, ElementwiseT rhs) {
           return std::fmax(lhs, rhs);
         }));
     return Status::OK();
@@ -524,18 +608,18 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleMaximum(HloInstruction* maximum) override {
-    return HandleMaximum<ReturnT>(maximum);
+    return HandleMaximum<ElementwiseT>(maximum);
   }
 
   template <
       typename NativeT,
       typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleMinimum(HloInstruction* minimum) {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[minimum],
-        ElementWiseBinaryOp(minimum, [](ReturnT lhs_el, ReturnT rhs_el) {
-          return std::fmin(lhs_el, rhs_el);
-        }));
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[minimum],
+                        ElementWiseBinaryOp(minimum, [](ElementwiseT lhs_el,
+                                                        ElementwiseT rhs_el) {
+                          return std::fmin(lhs_el, rhs_el);
+                        }));
     return Status::OK();
   }
 
@@ -547,15 +631,15 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleMinimum(HloInstruction* minimum) override {
-    return HandleMinimum<ReturnT>(minimum);
+    return HandleMinimum<ElementwiseT>(minimum);
   }
 
   Status HandlePower(HloInstruction* power) override {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[power],
-        ElementWiseBinaryOp(power, [](ReturnT lhs_el, ReturnT rhs_el) {
-          return std::pow(lhs_el, rhs_el);
-        }));
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[power],
+                        ElementWiseBinaryOp(power, [](ElementwiseT lhs_el,
+                                                      ElementwiseT rhs_el) {
+                          return std::pow(lhs_el, rhs_el);
+                        }));
     return Status::OK();
   }
 
@@ -563,11 +647,11 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
       typename NativeT,
       typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleRemainder(HloInstruction* remainder) {
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[remainder],
-        ElementWiseBinaryOp(remainder, [](ReturnT lhs_el, ReturnT rhs_el) {
-          return std::fmod(lhs_el, rhs_el);
-        }));
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[remainder],
+                        ElementWiseBinaryOp(remainder, [](ElementwiseT lhs_el,
+                                                          ElementwiseT rhs_el) {
+                          return std::fmod(lhs_el, rhs_el);
+                        }));
     return Status::OK();
   }
 
@@ -579,7 +663,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleRemainder(HloInstruction* remainder) override {
-    return HandleRemainder<ReturnT>(remainder);
+    return HandleRemainder<ElementwiseT>(remainder);
   }
 
   template <typename NativeT,
@@ -588,7 +672,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   Status HandleAnd(HloInstruction* and_) {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[and_],
-        ElementWiseBinaryOp(and_, [](ReturnT lhs_el, ReturnT rhs_el) {
+        ElementWiseBinaryOp(and_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) {
           return lhs_el & rhs_el;
         }));
     return Status::OK();
@@ -599,7 +683,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   Status HandleAnd(HloInstruction* and_) {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[and_],
-        ElementWiseBinaryOp(and_, [](ReturnT lhs_el, ReturnT rhs_el) {
+        ElementWiseBinaryOp(and_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) {
           return lhs_el && rhs_el;
         }));
     return Status::OK();
@@ -613,7 +697,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleAnd(HloInstruction* and_) override {
-    return HandleAnd<ReturnT>(and_);
+    return HandleAnd<ElementwiseT>(and_);
   }
 
   template <typename NativeT,
@@ -622,7 +706,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   Status HandleOr(HloInstruction* or_) {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[or_],
-        ElementWiseBinaryOp(or_, [](ReturnT lhs_el, ReturnT rhs_el) {
+        ElementWiseBinaryOp(or_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) {
           return lhs_el | rhs_el;
         }));
     return Status::OK();
@@ -633,7 +717,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   Status HandleOr(HloInstruction* or_) {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[or_],
-        ElementWiseBinaryOp(or_, [](ReturnT lhs_el, ReturnT rhs_el) {
+        ElementWiseBinaryOp(or_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) {
           return lhs_el || rhs_el;
         }));
     return Status::OK();
@@ -647,7 +731,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleOr(HloInstruction* or_) override {
-    return HandleOr<ReturnT>(or_);
+    return HandleOr<ElementwiseT>(or_);
   }
 
   template <typename NativeT,
@@ -672,7 +756,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleShiftLeft(HloInstruction* shl) override {
-    return HandleShiftLeft<ReturnT>(shl);
+    return HandleShiftLeft<ElementwiseT>(shl);
   }
   template <typename NativeT,
             typename std::enable_if<
@@ -698,7 +782,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleShiftRightArithmetic(HloInstruction* shra) override {
-    return HandleShiftRightArithmetic<ReturnT>(shra);
+    return HandleShiftRightArithmetic<ElementwiseT>(shra);
   }
 
   template <typename NativeT,
@@ -725,19 +809,21 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleShiftRightLogical(HloInstruction* shrl) override {
-    return HandleShiftRightLogical<ReturnT>(shrl);
+    return HandleShiftRightLogical<ElementwiseT>(shrl);
   }
 
   template <
       typename NativeT,
       typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleClamp(HloInstruction* clamp) {
-    std::function<ReturnT(ReturnT, ReturnT, ReturnT)> clamp_op =
-        [](ReturnT low, ReturnT value, ReturnT high) {
+    std::function<ElementwiseT(ElementwiseT, ElementwiseT, ElementwiseT)>
+        clamp_op = [](ElementwiseT low, ElementwiseT value, ElementwiseT high) {
           return std::fmax(low, std::fmin(value, high));
         };
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[clamp],
-                        ElementWiseTernaryOp(clamp, std::move(clamp_op)));
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[clamp],
+        ElementwiseTernaryOp(clamp,
+                             std::move(ConvertTernaryFunction(clamp_op))));
     return Status::OK();
   }
 
@@ -749,7 +835,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleClamp(HloInstruction* clamp) override {
-    return HandleClamp<ReturnT>(clamp);
+    return HandleClamp<ElementwiseT>(clamp);
   }
 
   Status HandleSelect(HloInstruction* select) override {
@@ -762,7 +848,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
           return on_false;
         };
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[select],
-                        ElementWiseTernaryOp(select, std::move(select_op)));
+                        ElementwiseTernaryOp(select, std::move(select_op)));
     return Status::OK();
   }
 
@@ -780,7 +866,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
         << " but is inferred to be: "
         << ShapeUtil::HumanString(inferred_return_shape);
 
-    auto operand_literal = parent_->GetEvaluatedLiteralFor(operand);
+    const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
     auto result = Literal::CreateFromShape(result_shape);
 
     TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
@@ -860,7 +946,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     DimensionVector rhs_spatial_index(dnums.kernel_spatial_dimensions_size());
 
     auto func = [&](tensorflow::gtl::ArraySlice<int64> out_index) {
-      ReturnT result_val = static_cast<ReturnT>(0);
+      ElementwiseT result_val = static_cast<ElementwiseT>(0);
 
       std::fill(lhs_index.begin(), lhs_index.end(), 0);
       std::fill(rhs_index.begin(), rhs_index.end(), 0);
@@ -889,14 +975,21 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
                 out_index[output_spatial_dim] * window_dim.stride() -
                 window_dim.padding_low() +
                 rhs_spatial_index[ki] * window_dim.window_dilation();
-            // Skip if the lhs (input) index is to be dilated.
-            if (undilated_index % window_dim.base_dilation() != 0) {
+            // Skip if the lhs (input) index is to be dilated.  As an
+            // optimization, skip this mod if there's no dilation.
+            if (window_dim.base_dilation() > 1 &&
+                undilated_index % window_dim.base_dilation() != 0) {
               goto cnt;
             }
 
-            // Calculate the actual lhs (input) index after dilation.
-            lhs_index[input_spatial_dim] =
-                undilated_index / window_dim.base_dilation();
+            // Calculate the actual lhs (input) index after dilation.  As an
+            // optimization, skip this integer divide if there's no dilation.
+            if (window_dim.base_dilation() > 1) {
+              lhs_index[input_spatial_dim] =
+                  undilated_index / window_dim.base_dilation();
+            } else {
+              lhs_index[input_spatial_dim] = undilated_index;
+            }
 
             // Skip if input index is not in bound.
             if (!(lhs_index[input_spatial_dim] >= 0 &&
@@ -911,13 +1004,14 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
                     : rhs_spatial_index[ki];
           }
 
-          result_val += lhs_literal.Get<ReturnT>(lhs_index) *
-                        rhs_literal.Get<ReturnT>(rhs_index);
+          result_val +=
+              static_cast<ElementwiseT>(lhs_literal.Get<ReturnT>(lhs_index)) *
+              static_cast<ElementwiseT>(rhs_literal.Get<ReturnT>(rhs_index));
         }
       cnt : {}
       } while (IndexUtil::BumpIndices(window_shape, &rhs_spatial_index));
 
-      return result_val;
+      return static_cast<ReturnT>(result_val);
     };
 
     auto result = Literal::CreateFromShape(result_shape);
@@ -934,61 +1028,126 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     CHECK(ShapeUtil::IsArray(lhs->shape()));
     CHECK(ShapeUtil::IsArray(rhs->shape()));
 
-    // Dot only supports operands of rank 1 and 2.
-    const auto dot_rank = ShapeUtil::Rank(dot->shape());
+    const auto& dnums = dot->dot_dimension_numbers();
+
     const auto lhs_rank = ShapeUtil::Rank(lhs->shape());
     const auto rhs_rank = ShapeUtil::Rank(rhs->shape());
-    CHECK(lhs_rank > 0 && lhs_rank <= 2);
-    CHECK(rhs_rank > 0 && rhs_rank <= 2);
-    CHECK_EQ(dot_rank, lhs_rank + rhs_rank - 2);
 
     CHECK(ShapeUtil::SameElementType(lhs->shape(), rhs->shape()));
     CHECK(ShapeUtil::SameElementType(lhs->shape(), dot->shape()));
 
-    // Check contracted dimensions are the same.
-    //
-    // Determine the index of the contracted dimensions for input tensors.
-    // dimensions -1 of lhs and dimension 0 of rhs are contracted.
-    const int64 lhs_contracted_dimension =
-        ShapeUtil::GetDimensionNumber(lhs->shape(), -1);
-    const int64 rhs_contracted_dimension = 0;
-    CHECK_EQ(lhs->shape().dimensions(lhs_contracted_dimension),
-             rhs->shape().dimensions(rhs_contracted_dimension))
+    // There must be 1 and only 1 Contracting dimension for lhs and rhs.
+    CHECK_EQ(dnums.lhs_contracting_dimensions_size(), 1);
+    CHECK_EQ(dnums.rhs_contracting_dimensions_size(), 1);
+    const int64 lhs_contracting_dimension = dnums.lhs_contracting_dimensions(0);
+    const int64 rhs_contracting_dimension = dnums.rhs_contracting_dimensions(0);
+    // Contracted dimension sizes must be the same.
+    CHECK_EQ(lhs->shape().dimensions(lhs_contracting_dimension),
+             rhs->shape().dimensions(rhs_contracting_dimension))
         << "lhs contracted dimension: "
-        << lhs->shape().dimensions(lhs_contracted_dimension)
+        << lhs->shape().dimensions(lhs_contracting_dimension)
         << " rhs contracted dimension: "
-        << rhs->shape().dimensions(rhs_contracted_dimension);
+        << rhs->shape().dimensions(rhs_contracting_dimension);
     const int64 contracted_dimension_size =
-        lhs->shape().dimensions(lhs_contracted_dimension);
+        lhs->shape().dimensions(lhs_contracting_dimension);
 
     const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
     const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
 
     auto result = Literal::CreateFromShape(dot->shape());
-    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
-        [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
-          ReturnT result_val = static_cast<ReturnT>(0);
 
-          std::vector<int64> lhs_index(lhs_rank, 0);
-          std::vector<int64> rhs_index(rhs_rank, 0);
-          // Set index for non-contracted dimension for lhs and rhs.
-          if (lhs_rank > 1) {
-            lhs_index[0] = multi_index[0];
+    CHECK_EQ(dnums.lhs_batch_dimensions_size(),
+             dnums.rhs_batch_dimensions_size());
+
+    std::vector<int64> lhs_non_contracting_dims;
+    for (int64 i = 0; i < lhs_rank; i++) {
+      if (i != lhs_contracting_dimension) {
+        lhs_non_contracting_dims.push_back(i);
+      }
+    }
+
+    std::vector<int64> rhs_non_batch_non_contracting_dims;
+    tensorflow::gtl::FlatSet<int64> batch_dims_set(
+        dnums.rhs_batch_dimensions().begin(),
+        dnums.rhs_batch_dimensions().end());
+    for (int64 i = 0; i < rhs_rank; i++) {
+      if (i != rhs_contracting_dimension && batch_dims_set.count(i) == 0) {
+        rhs_non_batch_non_contracting_dims.push_back(i);
+      }
+    }
+
+    const int64 batch_dim_size = dnums.lhs_batch_dimensions_size();
+    const int64 lhs_non_contracting_size = lhs_non_contracting_dims.size();
+
+    DimensionVector lhs_index(lhs_rank);
+    DimensionVector rhs_index(rhs_rank);
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+        [&](tensorflow::gtl::ArraySlice<int64> result_index) {
+          ElementwiseT result_val = static_cast<ElementwiseT>(0);
+
+          // Find the corresponding non-contracting indices for lhs and rhs.
+          //
+          // For `result_index`, its batch dimension, if exists, will be at the
+          // same dimension as the batch dimension of lhs and rhs. More
+          // specifically:
+          // - For lhs, the non-contracting dimensions, including the batch
+          // dimension have the same index as the `result_index`.
+          // - For rhs, the batch dimension is set seperately from other
+          // non-contracting dimensions, since these other non-contracting
+          // dimensions in rhs follow the non-contracting dimensions of lhs in
+          // the resulting index.
+          //
+          // As an example, for a resulting index:
+          //  result_index [result_batch, result_x, result_y]
+          // the effecting lhs and rhs indices are:
+          //  lhs [result_batch, lhs_non_contracting_dim, contracting_dim
+          //  rhs [result_batch, contracting_dim, rhs_non_contracting_dim]
+          // `result_x` is only affected by the lhs_non_contracting_dim and
+          // likewise `result_y` only depends on rhs_non_contracting_dim.
+          //
+          // so we can look up the lhs and rhs indices by:
+          //
+          // lhs:
+          //  batch index is the same as `result_batch`.
+          //    non-contracting dimension is the same as
+          //    result_index[lhs_non_contracting_dim]
+          // rhs:
+          //  batch index: the same as `result_batch`.
+          //  non-contracting dimension index: *not* the same as
+          //    result_index[rhs_non_contractng_dim], since the
+          //    non-contracting dimensions of lhs are included in the
+          //    result_index first. Instead, the non_contracting_dim of rhs must
+          //    be calculated as following:
+          //      lhs_non_contracting_dimensions_size +
+          //      (rhs_non_batch_non_contracting_dim - batch_dim_size) - 1
+          //
+          //    Note that (rhs_non_batch_contracting_dim - batch_dim_size) is
+          //    the index offset to the result_index that only depends on
+          //    the non_batch and non-contracting dimensions of rhs. -1 at the
+          //    end translates size to index.
+          for (auto i : lhs_non_contracting_dims) {
+            lhs_index[i] = result_index[i];
+          }
+          for (auto i : dnums.rhs_batch_dimensions()) {
+            rhs_index[i] = result_index[i];
           }
-          if (rhs_rank > 1) {
-            rhs_index[1] = multi_index[multi_index.size() - 1];
+          for (auto i : rhs_non_batch_non_contracting_dims) {
+            const int64 rhs_non_batch_non_contracting_dim =
+                lhs_non_contracting_size + (i - batch_dim_size) - 1;
+            rhs_index[i] = result_index[rhs_non_batch_non_contracting_dim];
           }
 
           // Accumulates resulting product along the contracted dimension.
           for (int64 i = 0; i < contracted_dimension_size; ++i) {
-            lhs_index[lhs_contracted_dimension] = i;
-            rhs_index[rhs_contracted_dimension] = i;
+            lhs_index[lhs_contracting_dimension] = i;
+            rhs_index[rhs_contracting_dimension] = i;
 
-            result_val += lhs_literal.Get<ReturnT>(lhs_index) *
-                          rhs_literal.Get<ReturnT>(rhs_index);
+            result_val +=
+                static_cast<ElementwiseT>(lhs_literal.Get<ReturnT>(lhs_index)) *
+                static_cast<ElementwiseT>(rhs_literal.Get<ReturnT>(rhs_index));
           }
 
-          return result_val;
+          return static_cast<ReturnT>(result_val);
         }));
 
     parent_->evaluated_[dot] = std::move(result);
@@ -1021,7 +1180,8 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
           return scalar;
         }));
 
-    auto evaluated_operand = parent_->GetEvaluatedLiteralFor(pad->operand(0));
+    const Literal& evaluated_operand =
+        parent_->GetEvaluatedLiteralFor(pad->operand(0));
 
     std::vector<int64> input_index(ShapeUtil::Rank(evaluated_operand.shape()),
                                    0);
@@ -1174,6 +1334,97 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  template <typename NativeT>
+  StatusOr<std::unique_ptr<Literal>> MapImpl(HloInstruction* map) {
+    auto operands = map->operands();
+    HloComputation* computation = map->to_apply();
+
+    auto result = Literal::CreateFromShape(map->shape());
+
+    HloEvaluator embedded_evaluator;
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+        [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
+          std::vector<std::unique_ptr<Literal>> arg_literals;
+          arg_literals.reserve(operands.size());
+
+          // Construct scalar literal parameters to be passed to the map
+          // computation.
+          for (auto operand : operands) {
+            const Literal& arg_literal =
+                parent_->GetEvaluatedLiteralFor(operand);
+
+            auto curr_val = arg_literal.Get<NativeT>(multi_index);
+            auto curr_val_literal = Literal::CreateR0<NativeT>(curr_val);
+
+            arg_literals.push_back(std::move(curr_val_literal));
+          }
+
+          std::unique_ptr<Literal> computed_result =
+              embedded_evaluator
+                  .Evaluate<std::unique_ptr<Literal>>(*computation,
+                                                      arg_literals)
+                  .ConsumeValueOrDie();
+          // Clear visit states so that the we can use the evaluate again on
+          // the same computation.
+          embedded_evaluator.ResetVisitStates();
+
+          return computed_result->Get<ReturnT>({});
+        }));
+    return std::move(result);
+  }
+
+  Status HandleMap(HloInstruction* map) override {
+    switch (map->operand(0)->shape().element_type()) {
+      case PRED: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<bool>(map));
+        break;
+      }
+      case U8: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<uint8>(map));
+        break;
+      }
+      case U32: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<uint32>(map));
+        break;
+      }
+      case U64: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<uint64>(map));
+        break;
+      }
+      case S8: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<int8>(map));
+        break;
+      }
+      case S32: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<int32>(map));
+        break;
+      }
+      case S64: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<int64>(map));
+        break;
+      }
+      case F32: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<float>(map));
+        break;
+      }
+      case F64: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<double>(map));
+        break;
+      }
+      case C64: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<complex64>(map));
+        break;
+      }
+      default:
+        LOG(FATAL) << "HandleMap: unhandled primitive type for "
+                      "input operand: "
+                   << PrimitiveType_Name(
+                          map->operand(0)->shape().element_type());
+    }
+
+    return Status::OK();
+  }
+
   Status HandleReduce(HloInstruction* reduce) override {
     auto arg = reduce->operand(0);
     auto init_value = reduce->operand(1);
@@ -1220,6 +1471,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
       }
     }
 
+    HloEvaluator embedded_evaluator;
     // For each resulting dimension, calculate and assign computed value.
     TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
         [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
@@ -1239,13 +1491,12 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
             std::vector<const Literal*> args = {curr_val_literal.get(),
                                                 result_val_literal.get()};
 
-            // We need a new visitor for each evaluation, so that the same
-            // computation can be visited more than once (with different
-            // inputs).
-            HloEvaluator embedded_evaluator;
             std::unique_ptr<Literal> computed_result =
-                embedded_evaluator.Evaluate(*function, args)
+                embedded_evaluator.Evaluate<const Literal*>(*function, args)
                     .ConsumeValueOrDie();
+            // Clear visit states so that the we can use the evaluate again on
+            // the same computation.
+            embedded_evaluator.ResetVisitStates();
 
             // Assign computed result to result_val.
             result_val = computed_result->Get<ReturnT>({});
@@ -1263,6 +1514,111 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  Status HandleSelectAndScatter(HloInstruction* select_and_scatter) override {
+    auto operand = select_and_scatter->operand(0);
+    auto source = select_and_scatter->operand(1);
+    const Window& window = select_and_scatter->window();
+
+    const Literal& init_literal =
+        parent_->GetEvaluatedLiteralFor(select_and_scatter->operand(2));
+    TF_RET_CHECK(ShapeUtil::IsScalar(init_literal.shape()));
+    auto init_scalar = init_literal.Get<ReturnT>({});
+
+    auto result = Literal::CreateFromShape(select_and_scatter->shape());
+
+    // Initialize result array with the init value.
+    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
+        [&](tensorflow::gtl::ArraySlice<int64> output_index) {
+          return init_scalar;
+        }));
+
+    std::vector<int64> window_dimension_sizes;
+    for (const auto& window_dimension : window.dimensions()) {
+      window_dimension_sizes.push_back(window_dimension.size());
+    }
+    const Shape window_shape = ShapeUtil::MakeShape(
+        operand->shape().element_type(), window_dimension_sizes);
+
+    HloComputation* select = select_and_scatter->select();
+    HloComputation* scatter = select_and_scatter->scatter();
+
+    const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
+    const Literal& source_literal = parent_->GetEvaluatedLiteralFor(source);
+
+    int64 rank = ShapeUtil::Rank(operand_literal.shape());
+
+    HloEvaluator embedded_evaluator;
+    DimensionVector source_index(rank);
+
+    std::fill(source_index.begin(), source_index.end(), 0);
+    do {
+      // For each element in `source`, we place a window in `operand`. For each
+      // window placement, we iterate inside the window twice:
+      //
+      // 1. Find the selected index by applying `select` function to all
+      // elements. E.g., If the `select` function is GreaterEqual, the first
+      // iteration through the window finds the biggest value and returns its
+      // index.
+      //
+      // 2. Using the selected index, scatter value from `source` to result. We
+      // do this by iterating through the window, and compare each index with
+      // the selected index.
+      tensorflow::gtl::optional<ReturnT> selected_val;
+      tensorflow::gtl::optional<std::vector<int64>> selected_index;
+
+      IterateThroughWindow(
+          window_shape, window, operand_literal.shape(), source_index,
+          [&](const std::vector<int64>& operand_index) {
+            auto curr_val = operand_literal.Get<ReturnT>(operand_index);
+            if (!selected_val) {
+              selected_val = curr_val;
+              selected_index = operand_index;
+            }
+            const auto curr_val_literal = Literal::CreateR0<ReturnT>(curr_val);
+            const auto selected_val_literal =
+                Literal::CreateR0<ReturnT>(*selected_val);
+
+            const std::vector<const Literal*> args = {
+                curr_val_literal.get(), selected_val_literal.get()};
+            std::unique_ptr<Literal> computed_result =
+                embedded_evaluator.Evaluate<const Literal*>(*select, args)
+                    .ConsumeValueOrDie();
+            bool selected = computed_result->Get<bool>({});
+            if (selected) {
+              selected_val = curr_val;
+              selected_index = operand_index;
+            }
+            embedded_evaluator.ResetVisitStates();
+          });
+
+      IterateThroughWindow(
+          window_shape, window, operand_literal.shape(), source_index,
+          [&](const std::vector<int64>& operand_index) {
+            if (std::equal(operand_index.begin(), operand_index.end(),
+                           selected_index->begin())) {
+              auto source = source_literal.Get<ReturnT>(source_index);
+              auto scattered = result->Get<ReturnT>(operand_index);
+              const auto source_literal = Literal::CreateR0<ReturnT>(source);
+              const auto scattered_literal =
+                  Literal::CreateR0<ReturnT>(scattered);
+
+              const std::vector<const Literal*> args = {
+                  source_literal.get(), scattered_literal.get()};
+              std::unique_ptr<Literal> computed_result =
+                  embedded_evaluator.Evaluate<const Literal*>(*scatter, args)
+                      .ConsumeValueOrDie();
+              result->Set(operand_index, computed_result->Get<ReturnT>({}));
+              // Clear visit states so that the we can use the evaluator again
+              // on the same computation.
+              embedded_evaluator.ResetVisitStates();
+            }
+          });
+    } while (IndexUtil::BumpIndices(source->shape(), &source_index));
+
+    parent_->evaluated_[select_and_scatter] = std::move(result);
+    return Status::OK();
+  }
+
   Status HandleReduceWindow(HloInstruction* reduce_window) override {
     auto operand = reduce_window->operand(0);
     const Window& window = reduce_window->window();
@@ -1302,6 +1658,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     DimensionVector window_index(window.dimensions_size());
     DimensionVector operand_index(ShapeUtil::Rank(operand_literal.shape()));
 
+    HloEvaluator embedded_evaluator;
     // For each resulting dimension, calculate and assign computed value.
     TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
         [&](tensorflow::gtl::ArraySlice<int64> output_index) {
@@ -1310,39 +1667,28 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
           std::fill(window_index.begin(), window_index.end(), 0);
           std::fill(operand_index.begin(), operand_index.end(), 0);
 
-          do {
-            // Set curr_val to 0 if out of bound (padded).
-            ReturnT curr_val = static_cast<ReturnT>(0);
-            bool out_of_bound = false;
-            for (int i = 0; i < operand_index.size(); ++i) {
-              operand_index[i] =
-                  output_index[i] * window.dimensions(i).stride() +
-                  window_index[i] - window.dimensions(i).padding_low();
-              if (operand_index[i] < 0 ||
-                  operand_index[i] >= operand_literal.shape().dimensions(i)) {
-                out_of_bound = true;
-                break;
-              }
-            }
-            if (!out_of_bound) {
-              curr_val = operand_literal.Get<ReturnT>(operand_index);
-            }
-            // Evaluate computation with specified literal operands.
-            const auto curr_val_literal = Literal::CreateR0<ReturnT>(curr_val);
-            const auto result_val_literal =
-                Literal::CreateR0<ReturnT>(result_val);
-            const std::vector<const Literal*> args = {curr_val_literal.get(),
-                                                      result_val_literal.get()};
-            // We need a new visitor for each evaluation, so that the same
-            // computation can be visited more than once (with different
-            // inputs).
-            HloEvaluator embedded_evaluator;
-            std::unique_ptr<Literal> computed_result =
-                embedded_evaluator.Evaluate(*function, args)
-                    .ConsumeValueOrDie();
-
-            result_val = computed_result->Get<ReturnT>({});
-          } while (IndexUtil::BumpIndices(window_shape, &window_index));
+          IterateThroughWindow(
+              window_shape, window, operand_literal.shape(), output_index,
+              [&](const std::vector<int64>& operand_index) {
+                auto curr_val = operand_literal.Get<ReturnT>(operand_index);
+
+                // Evaluate computation with specified literal operands.
+                const auto curr_val_literal =
+                    Literal::CreateR0<ReturnT>(curr_val);
+                const auto result_val_literal =
+                    Literal::CreateR0<ReturnT>(result_val);
+                const std::vector<const Literal*> args = {
+                    curr_val_literal.get(), result_val_literal.get()};
+                std::unique_ptr<Literal> computed_result =
+                    embedded_evaluator.Evaluate<const Literal*>(*function, args)
+                        .ConsumeValueOrDie();
+
+                // Clear visit states so that the we can use the evaluate again
+                // on the same computation.
+                embedded_evaluator.ResetVisitStates();
+
+                result_val = computed_result->Get<ReturnT>({});
+              });
 
           return result_val;
         }));
@@ -1364,7 +1710,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
         << ShapeUtil::HumanString(inferred_return_shape);
 
     const int64 rank = ShapeUtil::Rank(operand->shape());
-    auto operand_literal = parent_->GetEvaluatedLiteralFor(operand);
+    const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
     auto func = [&](tensorflow::gtl::ArraySlice<int64> out_index) {
       DimensionVector operand_index(rank);
       for (int64 i = 0; i < rank; ++i) {
@@ -1385,7 +1731,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
                                   NativeT>::value>::type* = nullptr>
   Status HandleSin(HloInstruction* sin) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[sin],
-                        ElementWiseUnaryOp(sin, [](ReturnT elem_operand) {
+                        ElementWiseUnaryOp(sin, [](ElementwiseT elem_operand) {
                           return std::sin(elem_operand);
                         }));
     return Status::OK();
@@ -1400,14 +1746,14 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleSin(HloInstruction* sin) override {
-    return HandleSin<ReturnT>(sin);
+    return HandleSin<ElementwiseT>(sin);
   }
 
   template <typename NativeT, typename std::enable_if<std::is_floating_point<
                                   NativeT>::value>::type* = nullptr>
   Status HandleCos(HloInstruction* cos) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[cos],
-                        ElementWiseUnaryOp(cos, [](ReturnT elem_operand) {
+                        ElementWiseUnaryOp(cos, [](ElementwiseT elem_operand) {
                           return std::cos(elem_operand);
                         }));
     return Status::OK();
@@ -1422,7 +1768,116 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleCos(HloInstruction* cos) override {
-    return HandleCos<ReturnT>(cos);
+    return HandleCos<ElementwiseT>(cos);
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_same<
+                                  float, NativeT>::value>::type* = nullptr>
+  Status HandleReducePrecision(HloInstruction* reduce_precision) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[reduce_precision],
+        ElementWiseUnaryOp(reduce_precision, [reduce_precision](
+                                                 ElementwiseT elem) {
+          uint32_t value_as_int = tensorflow::bit_cast<uint32_t>(elem);
+          const uint32_t mantissa_bits = reduce_precision->mantissa_bits();
+          const uint32_t exponent_bits = reduce_precision->exponent_bits();
+
+          // Code is based on the CPU/GPU implementation in LLVM-emitting code.
+          //
+          // Bits in float type:
+          //   mantissa : bits [0:22]
+          //   exponent : bits [23:30]
+          //   sign     : bits [31]
+          if (mantissa_bits < 23) {
+            const uint32_t last_mantissa_bit_mask = 1u << (23 - mantissa_bits);
+
+            // Compute rounding bias for round-to-nearest with ties to even.
+            // This is equal to a base value of 0111... plus one bit if the last
+            // remaining mantissa bit is 1.
+            const uint32_t base_rounding_bias =
+                (last_mantissa_bit_mask >> 1) - 1;
+            const uint32_t x_last_mantissa_bit =
+                (value_as_int & last_mantissa_bit_mask) >> (23 - mantissa_bits);
+            const uint32_t x_rounding_bias =
+                x_last_mantissa_bit + base_rounding_bias;
+
+            // Add rounding bias, and mask out truncated bits.  Note that the
+            // case where adding the rounding bias overflows into the exponent
+            // bits is correct; the non-masked mantissa bits will all be zero,
+            // and the exponent will be incremented by one.
+            const uint32_t truncation_mask = ~(last_mantissa_bit_mask - 1);
+            value_as_int = value_as_int + x_rounding_bias;
+            value_as_int = value_as_int & truncation_mask;
+          }
+          if (exponent_bits < 8) {
+            // Masks for f32 values.
+            const uint32_t f32_sign_bit_mask = 1u << 31;
+            const uint32_t f32_exp_bits_mask = 0xffu << 23;
+
+            // An exponent of 2^(n-1)-1 -- that is, 0111... with the zero in the
+            // most- significant bit -- is equal to 1.0f for all exponent sizes.
+            // Adding 2^(n-1)-1 to this gives us the highest non-infinite
+            // exponent for a bit- size of n, and subtracting 2^(n-1)-1 from
+            // this gives us the lowest' exponent (corresponding to 0.0f).
+            //
+            // Thus, the f32 exponent corresponding to the highest non-infinite
+            // exponent for a bit size of n is (2^7-1) + 2^(n-1)-1, and the f32
+            // exponent corresponding to the lowest exponent for a bit size of n
+            // is (2^7-1) - 2^(n-1)-1.
+            //
+            // Note that we have already checked that exponents_bits >= 1.
+            const uint32_t f32_exponent_bias = (1 << 7) - 1;
+            const uint32_t reduced_exponent_bias =
+                (1 << (exponent_bits - 1)) - 1;
+            const uint32_t reduced_max_exponent =
+                f32_exponent_bias + reduced_exponent_bias;
+            const uint32_t reduced_min_exponent =
+                f32_exponent_bias - reduced_exponent_bias;
+
+            // Do we overflow or underflow?
+            const uint32_t x_exponent = value_as_int & f32_exp_bits_mask;
+            const bool x_overflows = x_exponent > (reduced_max_exponent << 23);
+            const bool x_underflows =
+                x_exponent <= (reduced_min_exponent << 23);
+
+            // Compute appropriately-signed values of zero and infinity.
+            const uint32_t x_signed_zero = value_as_int & f32_sign_bit_mask;
+            const uint32_t x_signed_inf = x_signed_zero | f32_exp_bits_mask;
+
+            // Force to zero or infinity if overflow or underflow.  (Note that
+            // this truncates all denormal values to zero, rather than rounding
+            // them.)
+            value_as_int = x_overflows ? x_signed_inf : value_as_int;
+            value_as_int = x_underflows ? x_signed_zero : value_as_int;
+          }
+
+          float reduced_result = tensorflow::bit_cast<float>(value_as_int);
+          if (std::isnan(elem)) {
+            reduced_result = mantissa_bits > 0
+                                 ? elem
+                                 : std::numeric_limits<float>::infinity();
+          }
+          return reduced_result;
+        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_same<
+                                  double, NativeT>::value>::type* = nullptr>
+  Status HandleReducePrecision(HloInstruction* reduce_precision) {
+    return InvalidArgument("Double not supported for reduce precision");
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<std::is_integral<NativeT>::value ||
+                              is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleReducePrecision(HloInstruction* reduce_precision) {
+    return InvalidArgument("Unsupported type for reduce precision");
+  }
+
+  Status HandleReducePrecision(HloInstruction* reduce_precision) override {
+    return HandleReducePrecision<ElementwiseT>(reduce_precision);
   }
 
  private:
@@ -1430,8 +1885,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   StatusOr<std::unique_ptr<Literal>> DynamicSlice(
       const Literal& operand_literal, const Literal& start_indices_literal,
       const Shape& result_shape) {
-    const auto& start_indices_typed =
-        start_indices_literal.GetArraySlice<IndexT>();
+    auto start_indices_typed = start_indices_literal.data<IndexT>();
     std::vector<int64> start(start_indices_typed.begin(),
                              start_indices_typed.end());
 
@@ -1459,12 +1913,11 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
   StatusOr<std::unique_ptr<Literal>> DynamicUpdateSlice(
       const Literal& operand_literal, const Literal& update_literal,
       const Literal& start_indices_literal) {
-    const auto& start_indices_typed =
-        start_indices_literal.GetArraySlice<IndexT>();
+    auto start_indices_typed = start_indices_literal.data<IndexT>();
     const std::vector<int64> start(start_indices_typed.begin(),
                                    start_indices_typed.end());
 
-    auto result = MakeUnique<Literal>(operand_literal);
+    auto result = operand_literal.CloneToUnique();
     std::vector<int64> result_index(ShapeUtil::Rank(result->shape()), 0);
 
     auto func = [&](const std::vector<int64>& update_index) {
@@ -1487,22 +1940,27 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
 
   StatusOr<std::unique_ptr<Literal>> ElementWiseUnaryOp(
       HloInstruction* instruction,
-      const std::function<ReturnT(ReturnT)>& unary_op) {
+      const std::function<ElementwiseT(ElementwiseT)>& unary_op) {
     const Literal& operand_literal =
         parent_->GetEvaluatedLiteralFor(instruction->operand(0));
-    return ElementWiseUnaryOpImpl<ReturnT, ReturnT>(instruction, unary_op,
-                                                    operand_literal);
+    TF_ASSIGN_OR_RETURN(
+        auto result_literal,
+        (ElementWiseUnaryOpImpl<ReturnT, ReturnT>(
+            instruction, ConvertUnaryFunction(unary_op), operand_literal)));
+
+    return std::move(result_literal);
   }
 
   StatusOr<std::unique_ptr<Literal>> ElementWiseBinaryOp(
       HloInstruction* instruction,
-      const std::function<ReturnT(ReturnT, ReturnT)>& binary_op) {
+      const std::function<ElementwiseT(ElementwiseT, ElementwiseT)>&
+          binary_op) {
     const auto shape = instruction->shape();
     const auto* lhs = instruction->operand(0);
     const auto* rhs = instruction->operand(1);
 
-    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is
-    // removed.
+    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast
+    // is removed.
     if (!(ShapeUtil::SameDimensions(shape, rhs->shape()) &&
           ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()))) {
       return Unimplemented(
@@ -1520,14 +1978,15 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
 
     TF_RETURN_IF_ERROR(result->Populate<ReturnT>(
         [&](tensorflow::gtl::ArraySlice<int64> multi_index) {
-          return binary_op(lhs_literal.Get<ReturnT>(multi_index),
-                           rhs_literal.Get<ReturnT>(multi_index));
+          return ConvertBinaryFunction(binary_op)(
+              lhs_literal.Get<ReturnT>(multi_index),
+              rhs_literal.Get<ReturnT>(multi_index));
         }));
     return std::move(result);
   }
 
   template <typename LhsType, typename RhsType, typename EhsType>
-  StatusOr<std::unique_ptr<Literal>> ElementWiseTernaryOp(
+  StatusOr<std::unique_ptr<Literal>> ElementwiseTernaryOp(
       HloInstruction* instruction,
       const std::function<ReturnT(LhsType, RhsType, EhsType)>& ternary_op) {
     const auto shape = instruction->shape();
@@ -1589,9 +2048,11 @@ HloEvaluator::HloEvaluator() {
   typed_visitors_[F64] = MakeUnique<TypedVisitor<double>>(this);
   typed_visitors_[C64] = MakeUnique<TypedVisitor<complex64>>(this);
 
-  typed_visitors_[BF16] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
-    return Unimplemented("HloEvaluator: unhandled primitive type: BF16.");
-  });
+  // Most of the evaluator computations we use don't support BF16 (e.g.,
+  // std::ceil, std::tanh). To make evaluator work with BF16, we set all
+  // elementwise computations to be done in F32 and do BF16<->F32 conversion
+  // around the input and the output of the computations.
+  typed_visitors_[BF16] = MakeUnique<TypedVisitor<bfloat16, float>>(this);
   typed_visitors_[TUPLE] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
     return Unimplemented("HloEvaluator: unhandled primitive type: TUPLE.");
   });
@@ -1600,41 +2061,53 @@ HloEvaluator::HloEvaluator() {
   });
 }
 
+template <typename LiteralPtr>
 StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
     const HloModule& module,
-    tensorflow::gtl::ArraySlice<const Literal*> arg_literals) {
+    tensorflow::gtl::ArraySlice<LiteralPtr> arg_literals) {
   XLA_VLOG_LINES(2, "HloEvaluator::Evaluate module:\n" + module.ToString());
 
-  arg_literals_ = arg_literals;
   evaluated_.clear();
+  arg_literals_.clear();
+  for (const auto& literal_ptr : arg_literals) {
+    arg_literals_.push_back(&*literal_ptr);
+  }
 
   TF_RETURN_IF_ERROR(module.entry_computation()->Accept(this));
 
-  return MakeUnique<Literal>(
-      GetEvaluatedLiteralFor(module.entry_computation()->root_instruction()));
+  return GetEvaluatedLiteralFor(module.entry_computation()->root_instruction())
+      .CloneToUnique();
 }
 
+template <typename LiteralPtr>
 StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
     const HloComputation& computation,
-    tensorflow::gtl::ArraySlice<const Literal*> arg_literals) {
+    tensorflow::gtl::ArraySlice<LiteralPtr> arg_literals) {
   XLA_VLOG_LINES(
       2, "HloEvaluator::Evaluate computation:\n" + computation.ToString());
-  arg_literals_ = arg_literals;
+
   evaluated_.clear();
+  arg_literals_.clear();
+  for (const auto& literal_ptr : arg_literals) {
+    arg_literals_.push_back(&*literal_ptr);
+  }
 
   TF_RETURN_IF_ERROR(computation.Accept(this));
-  return MakeUnique<Literal>(
-      GetEvaluatedLiteralFor(computation.root_instruction()));
+  return GetEvaluatedLiteralFor(computation.root_instruction()).CloneToUnique();
 }
 
+template <typename LiteralPtr>
 StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
     HloInstruction* instruction,
-    tensorflow::gtl::ArraySlice<const Literal*> operands) {
+    tensorflow::gtl::ArraySlice<LiteralPtr> arg_literals) {
   TF_RET_CHECK(hlo_query::AllOperandsAreParametersOrConstants(*instruction));
   TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(instruction->shape()));
 
-  arg_literals_ = operands;
   evaluated_.clear();
+  arg_literals_.clear();
+  for (const auto& literal_ptr : arg_literals) {
+    arg_literals_.push_back(&*literal_ptr);
+  }
 
   // Evaluate operands of Parameter type against the input literals which
   // caches the evaluated literal results.
@@ -1645,14 +2118,14 @@ StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
               << input_literal->ToString();
       TF_RET_CHECK(ShapeUtil::Equal(operand->shape(), input_literal->shape()));
 
-      evaluated_[operand] = MakeUnique<Literal>(*input_literal);
+      evaluated_[operand] = input_literal->CloneToUnique();
     }
   }
 
   TF_RETURN_IF_ERROR(Preprocess(instruction));
   TF_RETURN_IF_ERROR(instruction->Visit(this));
   TF_RETURN_IF_ERROR(Postprocess(instruction));
-  return MakeUnique<Literal>(GetEvaluatedLiteralFor(instruction));
+  return GetEvaluatedLiteralFor(instruction).CloneToUnique();
 }
 
 StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
@@ -1673,7 +2146,7 @@ StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
   TF_RETURN_IF_ERROR(Preprocess(instruction));
   TF_RETURN_IF_ERROR(instruction->Visit(this));
   TF_RETURN_IF_ERROR(Postprocess(instruction));
-  return MakeUnique<Literal>(GetEvaluatedLiteralFor(instruction));
+  return GetEvaluatedLiteralFor(instruction).CloneToUnique();
 }
 
 std::unique_ptr<Literal> HloEvaluator::TryEvaluate(
@@ -1722,11 +2195,15 @@ StatusOr<std::unique_ptr<Literal>> HloEvaluator::EvaluateWithSubstitutions(
 }
 
 Status HloEvaluator::HandleParameter(HloInstruction* parameter) {
+  CHECK_LT(parameter->parameter_number(), arg_literals_.size());
   const Literal* input_literal = arg_literals_[parameter->parameter_number()];
   VLOG(2) << "Parameter evaluated to: " << input_literal->ToString();
-  DCHECK(ShapeUtil::Equal(parameter->shape(), input_literal->shape()));
+  DCHECK(ShapeUtil::Equal(parameter->shape(), input_literal->shape()))
+      << "parameter shape is: " << ShapeUtil::HumanString(parameter->shape())
+      << ", but input literal shape is: "
+      << ShapeUtil::HumanString(input_literal->shape());
 
-  evaluated_[parameter] = MakeUnique<Literal>(*input_literal);
+  evaluated_[parameter] = input_literal->CloneToUnique();
   return Status::OK();
 }
 
@@ -1749,8 +2226,8 @@ Status HloEvaluator::HandleTranspose(HloInstruction* transpose) {
 Status HloEvaluator::HandleConcatenate(HloInstruction* concatenate) {
   tensorflow::gtl::ArraySlice<HloInstruction*> operands(
       concatenate->operands());
-  // The result concatenate dimension is going to be the sum of all concatenate
-  // dimensions of the operands taking part of the operation.
+  // The result concatenate dimension is going to be the sum of all
+  // concatenate dimensions of the operands taking part of the operation.
   const Shape& reference_shape = operands[0]->shape();
   CHECK(!ShapeUtil::IsTuple(reference_shape));
   const int64 rank = ShapeUtil::Rank(reference_shape);
@@ -1777,7 +2254,7 @@ Status HloEvaluator::HandleConcatenate(HloInstruction* concatenate) {
 
   for (auto operand : operands) {
     const Shape& operand_shape = operand->shape();
-    TF_RETURN_IF_ERROR(result_literal->Copy(
+    TF_RETURN_IF_ERROR(result_literal->CopySliceFrom(
         GetEvaluatedLiteralFor(operand), source_indices, dest_indices,
         AsInt64Slice(operand_shape.dimensions())));
     dest_indices[concat_dim] +=
@@ -1935,16 +2412,17 @@ Status HloEvaluator::HandleGetTupleElement(HloInstruction* get_tuple_element) {
 
   const Literal& operand_tuple_literal = GetEvaluatedLiteralFor(operand);
 
-  evaluated_[get_tuple_element] =
-      MakeUnique<Literal>(operand_tuple_literal.tuple_literals(index));
-
-  return Status::OK();
+  evaluated_[get_tuple_element] = MakeUnique<Literal>(
+      ShapeUtil::GetTupleElementShape(operand->shape(), index));
+  return evaluated_[get_tuple_element]->CopyFrom(operand_tuple_literal,
+                                                 /*dest_shape_index=*/{},
+                                                 /*src_shape_index=*/{index});
 }
 
 Status HloEvaluator::HandleCopy(HloInstruction* copy) {
   TF_RET_CHECK(ShapeUtil::Compatible(copy->shape(), copy->operand(0)->shape()));
 
-  auto result = MakeUnique<Literal>(GetEvaluatedLiteralFor(copy->operand(0)));
+  auto result = GetEvaluatedLiteralFor(copy->operand(0)).CloneToUnique();
   evaluated_[copy] = std::move(result);
   return Status::OK();
 }
@@ -1960,4 +2438,30 @@ Status HloEvaluator::Postprocess(HloInstruction* hlo) {
   return Status::OK();
 }
 
+// Explicit instantiation of templatized Evaluate* methods.
+//
+template StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate<
+    const Literal*>(const HloModule& module,
+                    tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
+template StatusOr<std::unique_ptr<Literal>>
+HloEvaluator::Evaluate<std::unique_ptr<Literal>>(
+    const HloModule& module,
+    tensorflow::gtl::ArraySlice<std::unique_ptr<Literal>> arg_literals);
+
+template StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate<
+    const Literal*>(const HloComputation& computation,
+                    tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
+template StatusOr<std::unique_ptr<Literal>>
+HloEvaluator::Evaluate<std::unique_ptr<Literal>>(
+    const HloComputation& computation,
+    tensorflow::gtl::ArraySlice<std::unique_ptr<Literal>> arg_literals);
+
+template StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate<
+    const Literal*>(HloInstruction* instruction,
+                    tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
+template StatusOr<std::unique_ptr<Literal>>
+HloEvaluator::Evaluate<std::unique_ptr<Literal>>(
+    HloInstruction* instruction,
+    tensorflow::gtl::ArraySlice<std::unique_ptr<Literal>> arg_literals);
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index 7557aaa2484d184555411a79d8dce2c9241427b0..3b2b697e492a78a06a4e5ae6bf056ff8676f2ff5 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
 
 #include <memory>
 
@@ -42,9 +42,12 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // Precondition: The indices of arg_literals correspond to the parameter
   // numbers of the HLO parameters in the computation. See comment below for an
   // example.
+  // `LiteralPtr` accepts either std::unique_ptr<Literal> or const Literal*
+  // type.
+  template <typename LiteralPtr>
   StatusOr<std::unique_ptr<Literal>> Evaluate(
       const HloModule& module,
-      tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
+      tensorflow::gtl::ArraySlice<LiteralPtr> arg_literals);
 
   // Evaluates an HLO computation and an array of pointers to literals.
   // Returns the evaluated result as a literal if successful.
@@ -62,9 +65,12 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // where Parameter0 has parameter_number 0 and Parameter1 has parameter_number
   // 1 in this computation. The input literals array will then have its first
   // literal map to Parameter0 and the second map to Parameter1.
+  // `LiteralPtr` accepts either std::unique_ptr<Literal> or const Literal*
+  // type.
+  template <typename LiteralPtr>
   StatusOr<std::unique_ptr<Literal>> Evaluate(
       const HloComputation& computation,
-      tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
+      tensorflow::gtl::ArraySlice<LiteralPtr> arg_literals);
 
   // Evaluates a single HLO instruction and an array of pointers to literals.
   // Return the evaluated result as literal if successful.
@@ -72,10 +78,12 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // 1. argument literals correspond to the input instruction's parameters in
   // their post-ordering.
   // 2. the instruction's operands must be of either Parameter or Constant type.
-  // TODO(b/35950897): implement more ops other than element-wise ops.
+  // `LiteralPtr` accepts either std::unique_ptr<Literal> or const Literal*
+  // type.
+  template <typename LiteralPtr>
   StatusOr<std::unique_ptr<Literal>> Evaluate(
       HloInstruction* instruction,
-      tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
+      tensorflow::gtl::ArraySlice<LiteralPtr> arg_literals);
 
   // Evaluates a single HLO instruction with constant operands.
   // Returns the evaluated result as literal if successful.
@@ -100,12 +108,16 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
  protected:
   // Templated DfsHloVisitor. Typically ReturnT here indicates the resulting
   // literal type of each evaluated Handle* method of a TypedVisitor.
-  // There are however a few notable exceptions to this is rule, notably:
+  // There are however a few notable exceptions to this rule, notably:
   // - HandleCompare and HandleIsFinite: where the resulting literal type is
   // always boolean.
   // These operations are handled outside of the parent HloEvaluator handlers
   // instead of from within TypedVisitor.
-  template <typename ReturnT>
+  //
+  // Type params:
+  //   - ReturnT: The type of input and output of each operation.
+  //   - ElementwiseT: The type in which internal computation are done.
+  template <typename ReturnT, typename ElementwiseT = ReturnT>
   class TypedVisitor;
 
   // Wraps around instruction handling to infer types before dispatching to
@@ -134,6 +146,7 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   Status HandleIsFinite(HloInstruction* is_finite) override;
 
   Status HandleCompare(HloInstruction* compare) override;
+
   Status HandleTuple(HloInstruction* tuple) override;
 
   Status HandleGetTupleElement(HloInstruction* get_tuple_element) override;
@@ -167,17 +180,19 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // TODO(b/35950897): have better memory management here to free instructions
   // that are no longer a parent for any other subsequent instruction in
   // post-orderring.
+  // Must be cleared for each evaluation.
   tensorflow::gtl::FlatMap<const HloInstruction*, std::unique_ptr<Literal>>
       evaluated_;
 
-  // Stores input literals, assuming they are in post-order. Literals are not
-  // owned by this class, and they must outlive the lifetime of the instance of
-  // this class.
-  tensorflow::gtl::ArraySlice<const Literal*> arg_literals_;
+  // Caches pointers to input literals, assuming they are in post-order.
+  // Literals are not owned by this class, and they must outlive the lifetime of
+  // each invocation to the Evaluate* method.
+  // Must be cleared for each evaluation.
+  std::vector<const Literal*> arg_literals_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(HloEvaluator);
 };
 
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index b2c4351896764fa8683e91396f526d97ba208df6..97765d65909cee192f65069777f8f195081603b2 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -25,8 +25,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_element_type_converter.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -35,15 +37,33 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 namespace {
 
-class HloEvaluatorTest : public HloVerifiedTestBase {
+static std::array<bool, 2> use_bf16_params{true, false};
+
+class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
+                         public HloVerifiedTestBase {
  protected:
-  HloEvaluatorTest() { evaluator_ = MakeUnique<HloEvaluator>(); }
+  HloEvaluatorTest() : use_bfloat16_(GetParam()) {
+    evaluator_ = MakeUnique<HloEvaluator>();
+  }
+
+  std::unique_ptr<Literal> Evaluate(
+      tensorflow::gtl::ArraySlice<const Literal*> arg_literals = {}) {
+    if (use_bfloat16_) {
+      // In BF16 mode, we convert all F32 type to BF16 and evaluate the module.
+      auto type_converter = HloElementTypeConverter(F32, BF16);
+      type_converter.Run(&module()).ValueOrDie();
+    }
+    return evaluator_->Evaluate(*module().entry_computation(), arg_literals)
+        .ConsumeValueOrDie();
+  }
 
   std::unique_ptr<HloEvaluator> evaluator_;
 
@@ -52,12 +72,11 @@ class HloEvaluatorTest : public HloVerifiedTestBase {
     HloComputation::Builder b(TestName());
     auto c1 =
         b.AddInstruction(HloInstruction::CreateConstant(std::move(input)));
-    auto instruction = b.AddInstruction(
+    b.AddInstruction(
         HloInstruction::CreateUnary(expected->shape(), opcode, c1));
     module().AddEntryComputation(b.Build());
 
-    std::unique_ptr<Literal> result =
-        evaluator_->Evaluate(instruction, {}).ConsumeValueOrDie();
+    std::unique_ptr<Literal> result = Evaluate();
 
     auto element_type = expected->shape().element_type();
     if (element_type == F32 || element_type == F64) {
@@ -74,20 +93,24 @@ class HloEvaluatorTest : public HloVerifiedTestBase {
     HloComputation::Builder b(TestName());
     auto c1 = b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs)));
     auto c2 = b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs)));
-    auto instruction = b.AddInstruction(
+    b.AddInstruction(
         HloInstruction::CreateBinary(expected->shape(), opcode, c1, c2));
     module().AddEntryComputation(b.Build());
 
-    std::unique_ptr<Literal> result =
-        evaluator_->Evaluate(instruction, {}).ConsumeValueOrDie();
+    std::unique_ptr<Literal> result = Evaluate();
 
     LiteralTestUtil::ExpectEqual(*expected, *result);
   }
+
+  bool use_bfloat16_;
 };
 
+#define XLA_TYPED_TEST_P(test_case_name, test_name, test_type1) \
+  TEST_P(test_case_name, test_name)
+
 // Verifies that HloEvaluator evaluates a HLO instruction that performs clamp
 // with 3 operands.
-TEST_F(HloEvaluatorTest, DoesClamp) {
+TEST_P(HloEvaluatorTest, DoesClamp) {
   auto low = Literal::CreateR2<float>({{0.f, 2.f}, {2.f, 4.f}});
   auto value = Literal::CreateR2<float>({{0.f, 5.f}, {0.f, 4.f}});
   auto high = Literal::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
@@ -97,19 +120,18 @@ TEST_F(HloEvaluatorTest, DoesClamp) {
   auto c1 = b.AddInstruction(HloInstruction::CreateConstant(std::move(low)));
   auto c2 = b.AddInstruction(HloInstruction::CreateConstant(std::move(value)));
   auto c3 = b.AddInstruction(HloInstruction::CreateConstant(std::move(high)));
-  auto instruction = b.AddInstruction(
+  b.AddInstruction(
       HloInstruction::CreateTernary(shape, HloOpcode::kClamp, c1, c2, c3));
   module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(instruction, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR2<float>({{0, 4}, {2, 4}});
 
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, DISABLED_DoesClampSpecialBroadcast) {
+TEST_P(HloEvaluatorTest, DISABLED_DoesClampSpecialBroadcast) {
   auto low = Literal::CreateR0<float>(0.f);
   auto value = Literal::CreateR2<float>({{-1.f, 0.f}, {1.f, 2.f}});
   auto high = Literal::CreateR0<float>(1.f);
@@ -119,12 +141,11 @@ TEST_F(HloEvaluatorTest, DISABLED_DoesClampSpecialBroadcast) {
   auto c1 = b.AddInstruction(HloInstruction::CreateConstant(std::move(low)));
   auto c2 = b.AddInstruction(HloInstruction::CreateConstant(std::move(value)));
   auto c3 = b.AddInstruction(HloInstruction::CreateConstant(std::move(high)));
-  auto instruction = b.AddInstruction(
+  b.AddInstruction(
       HloInstruction::CreateTernary(shape, HloOpcode::kClamp, c1, c2, c3));
   module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(instruction, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR2<float>({{0, 0}, {1, 1}});
 
@@ -133,7 +154,7 @@ TEST_F(HloEvaluatorTest, DISABLED_DoesClampSpecialBroadcast) {
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs select
 // with 3 operands.
-TEST_F(HloEvaluatorTest, DoesSelect) {
+TEST_P(HloEvaluatorTest, DoesSelect) {
   auto pred = Literal::CreateR2<bool>({{true, false}, {false, true}});
   auto on_true = Literal::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
   auto on_false = Literal::CreateR2<float>({{0.f, 5.f}, {0.f, 4.f}});
@@ -145,12 +166,11 @@ TEST_F(HloEvaluatorTest, DoesSelect) {
       b.AddInstruction(HloInstruction::CreateConstant(std::move(on_true)));
   auto c3 =
       b.AddInstruction(HloInstruction::CreateConstant(std::move(on_false)));
-  auto instruction = b.AddInstruction(
+  b.AddInstruction(
       HloInstruction::CreateTernary(shape, HloOpcode::kSelect, c1, c2, c3));
   module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(instruction, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate({});
 
   auto expected = Literal::CreateR2<float>({{2, 5}, {0, 4}});
 
@@ -159,7 +179,7 @@ TEST_F(HloEvaluatorTest, DoesSelect) {
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise addition with 2 operands.
-TEST_F(HloEvaluatorTest, DoesAdd) {
+TEST_P(HloEvaluatorTest, DoesAdd) {
   auto lhs = Literal::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = Literal::CreateR2<int64>({{2, 4}, {4, 4}});
   auto expected = Literal::CreateR2<int64>({{3, 4}, {-96, 8}});
@@ -168,7 +188,7 @@ TEST_F(HloEvaluatorTest, DoesAdd) {
 }
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise and with 2 operands.
-TEST_F(HloEvaluatorTest, DoesAnd) {
+TEST_P(HloEvaluatorTest, DoesAnd) {
   auto lhs = Literal::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = Literal::CreateR2<int64>({{2, 4}, {4, 4}});
   auto expected = Literal::CreateR2<int64>({{0, 0}, {4, 4}});
@@ -177,7 +197,7 @@ TEST_F(HloEvaluatorTest, DoesAnd) {
 }
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise or with 2 operands.
-TEST_F(HloEvaluatorTest, DoesOr) {
+TEST_P(HloEvaluatorTest, DoesOr) {
   auto lhs = Literal::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = Literal::CreateR2<int64>({{2, 4}, {4, 4}});
   auto expected = Literal::CreateR2<int64>({{3, 4}, {-100, 4}});
@@ -186,7 +206,7 @@ TEST_F(HloEvaluatorTest, DoesOr) {
 }
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise multiply with 2 operands.
-TEST_F(HloEvaluatorTest, DoesMultiply) {
+TEST_P(HloEvaluatorTest, DoesMultiply) {
   auto lhs = Literal::CreateR2<int32>({{-1, 0}, {-100, 4}});
   auto rhs = Literal::CreateR2<int32>(
       {{std::numeric_limits<int32>::min(), 4}, {4, 4}});
@@ -197,14 +217,14 @@ TEST_F(HloEvaluatorTest, DoesMultiply) {
 }
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise divide with 2 operands.
-TEST_F(HloEvaluatorTest, DoesDivideInt64) {
+TEST_P(HloEvaluatorTest, DoesDivideInt64) {
   auto lhs = Literal::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = Literal::CreateR2<int64>({{2, 4}, {4, 4}});
   auto expected = Literal::CreateR2<int64>({{0, 0}, {-25, 1}});
   TestBinaryOp(HloOpcode::kDivide, std::move(expected), std::move(lhs),
                std::move(rhs));
 }
-TEST_F(HloEvaluatorTest, DoesDivideDouble) {
+TEST_P(HloEvaluatorTest, DoesDivideDouble) {
   auto lhs = Literal::CreateR2<double>({{1.0, 0.0}, {-100.0, 4.0}});
   auto rhs = Literal::CreateR2<double>({{2.2, 4.0}, {4.0, 4.0}});
   auto expected =
@@ -215,40 +235,41 @@ TEST_F(HloEvaluatorTest, DoesDivideDouble) {
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise abs op with 1 operand.
-TEST_F(HloEvaluatorTest, DoesAbsR2) {
+TEST_P(HloEvaluatorTest, DoesAbsR2) {
   auto operand = Literal::CreateR2<int64>({{1, -20}, {-100, 4}});
   auto expected = Literal::CreateR2<int64>({{1, 20}, {100, 4}});
   TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand));
 }
-TEST_F(HloEvaluatorTest, DoesAbsR0) {
+TEST_P(HloEvaluatorTest, DoesAbsR0) {
   auto operand = Literal::CreateR0<float>(-1.0f);
   auto expected = Literal::CreateR0<float>(1.0f);
   TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand));
 }
-TEST_F(HloEvaluatorTest, DoesAbsR1WithZeroSize) {
+TEST_P(HloEvaluatorTest, DoesAbsR1WithZeroSize) {
   auto operand = Literal::CreateR1<float>({});
   auto expected = Literal::CreateR1<float>({});
   TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand));
 }
-TEST_F(HloEvaluatorTest, DoesNegateR2) {
+TEST_P(HloEvaluatorTest, DoesNegateR2) {
   auto operand = Literal::CreateR2<int32>(
       {{0, std::numeric_limits<int32>::min()}, {-1, 4}});
   auto expected =
       Literal::CreateR2<int32>({{0, std::numeric_limits<int>::min()}, {1, -4}});
   TestUnaryOp(HloOpcode::kNegate, std::move(expected), std::move(operand));
 }
-TEST_F(HloEvaluatorTest, DoesCosR2) {
+TEST_P(HloEvaluatorTest, DoesCosR2) {
   auto operand = Literal::CreateR2<float>({{0, M_PI}, {-M_PI, 2 * M_PI}});
   auto expected = Literal::CreateR2<float>({{1, -1}, {-1, 1}});
-  TestUnaryOp(HloOpcode::kCos, std::move(expected), std::move(operand));
+  TestUnaryOp(HloOpcode::kCos, std::move(expected), std::move(operand),
+              use_bfloat16_ ? 0x1.0P-5 : 0x1.0P-20);
 }
-TEST_F(HloEvaluatorTest, DoesSinR2) {
+TEST_P(HloEvaluatorTest, DoesSinR2) {
   auto operand = Literal::CreateR2<float>({{0, M_PI}, {-M_PI, 2 * M_PI}});
   auto expected = Literal::CreateR2<float>({{0, 0}, {0, 0}});
   TestUnaryOp(HloOpcode::kSin, std::move(expected), std::move(operand),
-              0x1.0P-20);
+              use_bfloat16_ ? 0x1.0P-5 : 0x1.0P-20);
 }
-TEST_F(HloEvaluatorTest, DoesNotR2) {
+TEST_P(HloEvaluatorTest, DoesNotR2) {
   auto operand =
       Literal::CreateR2<int32>({{0, std::numeric_limits<int>::min()},
                                 {-1, std::numeric_limits<int>::max()}});
@@ -259,7 +280,7 @@ TEST_F(HloEvaluatorTest, DoesNotR2) {
 }
 // Verifies that HloEvaluator evaluates a HLO Computation with non-parameter nor
 // constant operands.
-TEST_F(HloEvaluatorTest, DoesTraverseInstructions) {
+TEST_P(HloEvaluatorTest, DoesTraverseInstructions) {
   auto lhs = Literal::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = Literal::CreateR2<int64>({{2, 4}, {4, 4}});
   auto rhs2 = Literal::CreateR2<int64>({{1, -20}, {-100, 4}});
@@ -279,10 +300,9 @@ TEST_F(HloEvaluatorTest, DoesTraverseInstructions) {
       b.AddInstruction(HloInstruction::CreateParameter(2, shape, "rhs2"));
   b.AddInstruction(HloInstruction::CreateBinary(shape, HloOpcode::kAdd,
                                                 lhs_instruction, param_rhs2));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, args).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate(args);
 
   auto expected = Literal::CreateR2<int64>({{4, -16}, {-196, 12}});
 
@@ -290,7 +310,7 @@ TEST_F(HloEvaluatorTest, DoesTraverseInstructions) {
 }
 
 // Verifies Reshape operation is correctly evaluated.
-TEST_F(HloEvaluatorTest, DoesReshape) {
+TEST_P(HloEvaluatorTest, DoesReshape) {
   HloComputation::Builder b(TestName());
   const int64 dimensions[] = {11, 8, 7, 5, 9};
   TF_ASSERT_OK_AND_ASSIGN(auto literal,
@@ -304,21 +324,20 @@ TEST_F(HloEvaluatorTest, DoesReshape) {
   const int64 permutation[] = {1, 2, 0, 4, 3};
   b.AddInstruction(
       HloInstruction::CreateTranspose(shape, literal_instruction, permutation));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate({});
 
   using NativeT = typename primitive_util::PrimitiveTypeToNative<F32>::type;
   result->EachCell<NativeT>(
       [&](tensorflow::gtl::ArraySlice<int64> indices, NativeT value) {
         std::vector<int64> rindexes = Permute(permutation, indices);
-        EXPECT_TRUE(value == literal_clone->Get<NativeT>(rindexes));
+        EXPECT_NEAR(value, literal_clone->Get<NativeT>(rindexes), 0x1.0P-5);
       });
 }
 
 // Verifies Broadcast operation is correctly evaluated.
-TEST_F(HloEvaluatorTest, DoesBroadcast) {
+TEST_P(HloEvaluatorTest, DoesBroadcast) {
   HloComputation::Builder b(TestName());
   auto input_literal = Literal::CreateR2<int32>({{1, 2}, {3, 4}, {5, 6}});
   auto output_literal = Literal::CreateR3<int32>(
@@ -327,15 +346,14 @@ TEST_F(HloEvaluatorTest, DoesBroadcast) {
       HloInstruction::CreateConstant(std::move(input_literal)));
   b.AddInstruction(HloInstruction::CreateBroadcast(
       output_literal->shape(), literal_instruction, {1, 2}));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate({});
 
   LiteralTestUtil::ExpectEqual(*result, *output_literal);
 }
 
-TEST_F(HloEvaluatorTest, DoesBroadcastScalar) {
+TEST_P(HloEvaluatorTest, DoesBroadcastScalar) {
   HloComputation::Builder b(TestName());
   auto input_literal = Literal::CreateR0<int32>(111);
   auto output_literal = Literal::CreateR2<int32>(
@@ -347,15 +365,14 @@ TEST_F(HloEvaluatorTest, DoesBroadcastScalar) {
   b.AddInstruction(HloInstruction::CreateBroadcast(
       output_literal->shape(), literal_instruction,
       /*broadcast_dimensions=*/{}));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate({});
 
   LiteralTestUtil::ExpectEqual(*result, *output_literal);
 }
 
-TEST_F(HloEvaluatorTest, DoesConcatenateSimple) {
+TEST_P(HloEvaluatorTest, DoesConcatenateSimple) {
   HloComputation::Builder b(TestName());
 
   HloInstruction* operand1 = b.AddInstruction(HloInstruction::CreateConstant(
@@ -368,17 +385,16 @@ TEST_F(HloEvaluatorTest, DoesConcatenateSimple) {
   Shape shape = ShapeUtil::MakeShape(S64, {4, 2});
   b.AddInstruction(HloInstruction::CreateConcatenate(shape, operands, 0));
 
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected =
       Literal::CreateR2<int64>({{-1, -2}, {100, 200}, {-2, -3}, {-100, -200}});
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) {
+TEST_P(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) {
   HloComputation::Builder b(TestName());
 
   HloInstruction* operand1 = b.AddInstruction(
@@ -391,16 +407,15 @@ TEST_F(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) {
   Shape shape = ShapeUtil::MakeShape(S64, {2});
   b.AddInstruction(HloInstruction::CreateConcatenate(shape, operands, 0));
 
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR1<int64>({100, 200});
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, ConvertWithSameLayout) {
+TEST_P(HloEvaluatorTest, ConvertWithSameLayout) {
   HloComputation::Builder b(TestName());
 
   auto input_literal = Literal::CreateR2<int32>({{1, 2}, {3, 4}, {5, 6}});
@@ -412,15 +427,14 @@ TEST_F(HloEvaluatorTest, ConvertWithSameLayout) {
   HloInstruction* constant = b.AddInstruction(
       HloInstruction::CreateConstant(std::move(input_literal)));
   b.AddInstruction(HloInstruction::CreateConvert(expected->shape(), constant));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   LiteralTestUtil::ExpectEqual(*result, *expected);
 }
 
-TEST_F(HloEvaluatorTest, ConvertWithDifferentLayout) {
+TEST_P(HloEvaluatorTest, ConvertWithDifferentLayout) {
   HloComputation::Builder b(TestName());
 
   auto input_literal = Literal::CreateR2WithLayout<int32>(
@@ -433,10 +447,9 @@ TEST_F(HloEvaluatorTest, ConvertWithDifferentLayout) {
   HloInstruction* constant = b.AddInstruction(
       HloInstruction::CreateConstant(std::move(input_literal)));
   b.AddInstruction(HloInstruction::CreateConvert(expected->shape(), constant));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   LiteralTestUtil::ExpectEqual(*result, *expected);
 }
@@ -454,7 +467,7 @@ PaddingConfig CreatePaddingConfig(
   return padding_config;
 }
 
-TEST_F(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) {
+TEST_P(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) {
   auto operand = Literal::CreateR2<int32>({{}, {}});
   HloComputation::Builder b(TestName());
   auto operand_instruction =
@@ -467,11 +480,11 @@ TEST_F(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) {
 
   auto padding_config = CreatePaddingConfig({{{1, 0, 2}}, {{0, 2, 1}}});
   Shape shape = ShapeUtil::MakeShape(S32, {5, 2});
-  auto pad_instruction = b.AddInstruction(HloInstruction::CreatePad(
+  b.AddInstruction(HloInstruction::CreatePad(
       shape, operand_instruction, padding_value_instruction, padding_config));
   module().AddEntryComputation(b.Build());
 
-  auto result = evaluator_->Evaluate(pad_instruction).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR2<int32>(
       {{10, 10}, {10, 10}, {10, 10}, {10, 10}, {10, 10}});
@@ -479,7 +492,7 @@ TEST_F(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) {
+TEST_P(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) {
   HloComputation::Builder b(TestName());
 
   Array4D<float> input_array(3, 2, 1, 1, {1, 2, 3, 4, 5, 6});
@@ -496,10 +509,9 @@ TEST_F(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) {
       CreatePaddingConfig({{{1, 0, 2}}, {{0, 2, 1}}, {{0, 0, 0}}, {{0, 0, 0}}});
   b.AddInstruction(HloInstruction::CreatePad(
       shape, input_instruction, pad_instruction, r4_padding_on_dim0_dim1));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected_array = MakeUnique<Array4D<float>>(8, 5, 1, 1);
   expected_array->Fill(kPadValue);
@@ -515,7 +527,7 @@ TEST_F(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, NegativePadding2D) {
+TEST_P(HloEvaluatorTest, NegativePadding2D) {
   HloComputation::Builder b(TestName());
 
   // input_array:
@@ -541,10 +553,9 @@ TEST_F(HloEvaluatorTest, NegativePadding2D) {
                                              pad_value_instruction,
                                              r2_padding_on_dim0_dim1));
 
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   // f32[1,5] { 7.0, 2.718, 2.718, 2.718, 2.718 }
   auto expected_array = MakeUnique<Array2D<float>>(1, 5);
@@ -555,10 +566,10 @@ TEST_F(HloEvaluatorTest, NegativePadding2D) {
   (*expected_array)(0, 4) = 2.718f;
   auto expected = Literal::CreateR2FromArray2D<float>(*expected_array);
 
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  LiteralTestUtil::ExpectNear(*expected, *result, ErrorSpec(0x1.0P-5));
 }
 
-TEST_F(HloEvaluatorTest, NegativeAndInteriorPadding2D) {
+TEST_P(HloEvaluatorTest, NegativeAndInteriorPadding2D) {
   HloComputation::Builder b(TestName());
 
   // f32[4,3] {
@@ -587,10 +598,9 @@ TEST_F(HloEvaluatorTest, NegativeAndInteriorPadding2D) {
                                              pad_value_instruction,
                                              r2_padding_on_dim0_dim1));
 
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected_array = MakeUnique<Array2D<float>>(0, 9);
   auto expected = Literal::CreateR2FromArray2D<float>(*expected_array);
@@ -598,7 +608,7 @@ TEST_F(HloEvaluatorTest, NegativeAndInteriorPadding2D) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, DotRank2AndRank1) {
+TEST_P(HloEvaluatorTest, DotRank2AndRank1) {
   HloComputation::Builder b(TestName());
 
   // lhs:
@@ -621,12 +631,14 @@ TEST_F(HloEvaluatorTest, DotRank2AndRank1) {
       b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
 
   Shape shape = ShapeUtil::MakeShape(F32, {4, 2});
-  b.AddInstruction(HloInstruction::CreateBinary(
-      shape, HloOpcode::kDot, lhs_instruction, rhs_instruction));
-  auto computation = module().AddEntryComputation(b.Build());
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  b.AddInstruction(HloInstruction::CreateDot(shape, lhs_instruction,
+                                             rhs_instruction, dot_dnums));
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   // clang-format off
   auto expected_array = Array2D<float>({
@@ -641,7 +653,7 @@ TEST_F(HloEvaluatorTest, DotRank2AndRank1) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, DotRank1AndRank2) {
+TEST_P(HloEvaluatorTest, DotRank1AndRank2) {
   HloComputation::Builder b(TestName());
 
   // lhs:
@@ -664,19 +676,21 @@ TEST_F(HloEvaluatorTest, DotRank1AndRank2) {
       b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
 
   Shape shape = ShapeUtil::MakeShape(F32, {2});
-  b.AddInstruction(HloInstruction::CreateBinary(
-      shape, HloOpcode::kDot, lhs_instruction, rhs_instruction));
-  auto computation = module().AddEntryComputation(b.Build());
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(0);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  b.AddInstruction(HloInstruction::CreateDot(shape, lhs_instruction,
+                                             rhs_instruction, dot_dnums));
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR1<float>({22.f, 28.f});
 
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, DotRank2AndRank2) {
+TEST_P(HloEvaluatorTest, DotRank2AndRank2) {
   HloComputation::Builder b(TestName());
 
   // lhs:
@@ -705,12 +719,14 @@ TEST_F(HloEvaluatorTest, DotRank2AndRank2) {
       b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
 
   Shape shape = ShapeUtil::MakeShape(F32, {4, 2});
-  b.AddInstruction(HloInstruction::CreateBinary(
-      shape, HloOpcode::kDot, lhs_instruction, rhs_instruction));
-  auto computation = module().AddEntryComputation(b.Build());
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  b.AddInstruction(HloInstruction::CreateDot(shape, lhs_instruction,
+                                             rhs_instruction, dot_dnums));
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected_array = Array2D<float>({
       {22.f, 28.f},
@@ -723,7 +739,7 @@ TEST_F(HloEvaluatorTest, DotRank2AndRank2) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, SimpleConv1D) {
+TEST_P(HloEvaluatorTest, SimpleConv1D) {
   HloComputation::Builder b(TestName());
 
   Array3D<float> lhs_array = {{{1, 2, 3}}};
@@ -761,10 +777,9 @@ TEST_F(HloEvaluatorTest, SimpleConv1D) {
   const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 3});
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, window, dnums));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   Array3D<float> expected_array = {{{11.f, 18.f, 9.f}}};
   auto expected = Literal::CreateR3FromArray3D<float>(expected_array);
@@ -772,7 +787,7 @@ TEST_F(HloEvaluatorTest, SimpleConv1D) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) {
+TEST_P(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) {
   HloComputation::Builder b(TestName());
 
   Array4D<float> lhs_array(1, 1, 4, 4);
@@ -816,10 +831,9 @@ TEST_F(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) {
   const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 4, 4});
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, window, dnums));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   Array4D<float> expected_array(1, 1, 4, 4);
   // clang-format off
@@ -835,7 +849,7 @@ TEST_F(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, Conv2DGeneralDimensionsReversed) {
+TEST_P(HloEvaluatorTest, Conv2DGeneralDimensionsReversed) {
   HloComputation::Builder b(TestName());
 
   // clang-format off
@@ -900,21 +914,22 @@ TEST_F(HloEvaluatorTest, Conv2DGeneralDimensionsReversed) {
   const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, window, dnums));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   // clang-format off
   // Result dimensions: [feature=1, height=1, batch=1, width=2]
   Array4D<float> expected_array({{{{2514, 2685}}}});
+  Array4D<float> expected_array_bf16({{{{2512, 2672}}}});
   // clang-format on
-  auto expected = Literal::CreateR4FromArray4D<float>(expected_array);
+  auto expected = Literal::CreateR4FromArray4D<float>(
+      use_bfloat16_ ? expected_array_bf16 : expected_array);
 
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, Conv2DGeneralDimensions) {
+TEST_P(HloEvaluatorTest, Conv2DGeneralDimensions) {
   HloComputation::Builder b(TestName());
 
   // clang-format off
@@ -976,21 +991,22 @@ TEST_F(HloEvaluatorTest, Conv2DGeneralDimensions) {
   const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, window, dnums));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   // clang-format off
   // Result dimensions: [feature=1, height=1, batch=1, width=2]
   Array4D<float> expected_array({{{{2514, 2685}}}});
+  Array4D<float> expected_array_bf16({{{{2512, 2672}}}});
   // clang-format on
-  auto expected = Literal::CreateR4FromArray4D<float>(expected_array);
+  auto expected = Literal::CreateR4FromArray4D<float>(
+      use_bfloat16_ ? expected_array_bf16 : expected_array);
 
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) {
+TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) {
   HloComputation::Builder b(TestName());
 
   Array4D<float> lhs_array(1, 1, 4, 4);
@@ -1034,10 +1050,9 @@ TEST_F(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) {
   const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 7, 7});
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, window, dnums));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   Array4D<float> expected_array(1, 1, 7, 7);
   expected_array.FillWithYX(Array2D<float>({
@@ -1054,7 +1069,7 @@ TEST_F(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) {
+TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) {
   HloComputation::Builder b(TestName());
 
   Array4D<float> lhs_array(1, 1, 4, 4);
@@ -1098,10 +1113,9 @@ TEST_F(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) {
   const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 8, 8});
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, window, dnums));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   Array4D<float> expected_array(1, 1, 8, 8);
   expected_array.FillWithYX(Array2D<float>({
@@ -1119,7 +1133,7 @@ TEST_F(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest,
+TEST_P(HloEvaluatorTest,
        DilatedWindowAndBaseConv2DWithDifferentLowAndHighPaddingAndStrides) {
   HloComputation::Builder b(TestName());
 
@@ -1170,10 +1184,9 @@ TEST_F(HloEvaluatorTest,
   const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 9, 3});
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, window, dnums));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   Array4D<float> expected_array(1, 1, 9, 3);
   expected_array.FillWithYX(Array2D<float>({
@@ -1192,7 +1205,7 @@ TEST_F(HloEvaluatorTest,
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, ReduceAdd) {
+TEST_P(HloEvaluatorTest, ReduceAdd) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1225,17 +1238,16 @@ TEST_F(HloEvaluatorTest, ReduceAdd) {
       HloInstruction::CreateReduce(shape, arg_instruction, init_value,
                                    /*dimensions_to_reduce=*/{1}, add_func));
 
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR1<float>({6, 18});
 
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, ReduceWindowMax) {
+TEST_P(HloEvaluatorTest, ReduceWindowMax) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1278,15 +1290,15 @@ TEST_F(HloEvaluatorTest, ReduceWindowMax) {
   b.AddInstruction(HloInstruction::CreateReduceWindow(
       shape, arg_instruction, init_value, window, max_func));
 
-  auto computation = module().AddEntryComputation(b.Build());
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  module().AddEntryComputation(b.Build());
+
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR2<float>({{6, 7}});
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, ReduceWindowAdd) {
+TEST_P(HloEvaluatorTest, ReduceWindowAdd) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1335,21 +1347,21 @@ TEST_F(HloEvaluatorTest, ReduceWindowAdd) {
   b.AddInstruction(HloInstruction::CreateReduceWindow(
       shape, arg_instruction, init_value, window, add_func));
 
-  auto computation = module().AddEntryComputation(b.Build());
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  module().AddEntryComputation(b.Build());
+
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR2<float>({{1, 3, 5}, {5, 11, 13}});
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, ReduceWindowAdd6D) {
+TEST_P(HloEvaluatorTest, ReduceWindowAdd6D) {
   HloComputation::Builder b(TestName());
 
   // arg: f32[4,4,4,4,4,4] full of ones. Using small dims to limit run-time.
   std::vector<int64> input_dims(6, 4);
   std::unique_ptr<Literal> arg_literal =
-      Literal::CreateFullWithMonotonicDim0MajorLayout<float>(input_dims, 1.0f);
+      Literal::CreateFullWithDescendingLayout<float>(input_dims, 1.0f);
 
   HloInstruction* arg_instruction =
       b.AddInstruction(HloInstruction::CreateConstant(std::move(arg_literal)));
@@ -1396,17 +1408,17 @@ TEST_F(HloEvaluatorTest, ReduceWindowAdd6D) {
   b.AddInstruction(HloInstruction::CreateReduceWindow(
       shape, arg_instruction, init_value, window, add_func));
 
-  auto computation = module().AddEntryComputation(b.Build());
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  module().AddEntryComputation(b.Build());
+
+  std::unique_ptr<Literal> result = Evaluate();
 
   std::vector<int64> output_dims = {4, 3, 3, 3, 4, 4};
   std::unique_ptr<Literal> result_literal =
-      Literal::CreateFullWithMonotonicDim0MajorLayout<float>(output_dims, 8.0f);
+      Literal::CreateFullWithDescendingLayout<float>(output_dims, 8.0f);
   LiteralTestUtil::ExpectEqual(*result_literal, *result);
 }
 
-TEST_F(HloEvaluatorTest, StridedSlice) {
+TEST_P(HloEvaluatorTest, StridedSlice) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1427,10 +1439,9 @@ TEST_F(HloEvaluatorTest, StridedSlice) {
                                                /*start_indices=*/{0, 2},
                                                /*limit_indices=*/{3, 5},
                                                /*strides=*/{2, 3}));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR2<float>({
       {3},
@@ -1440,7 +1451,7 @@ TEST_F(HloEvaluatorTest, StridedSlice) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, DynamicSlice) {
+TEST_P(HloEvaluatorTest, DynamicSlice) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1461,10 +1472,9 @@ TEST_F(HloEvaluatorTest, DynamicSlice) {
   Shape shape = ShapeUtil::MakeShape(F32, {2, 3});
   b.AddInstruction(HloInstruction::CreateDynamicSlice(shape, operand,
                                                       start_indices, {2, 3}));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR2<float>({
       {2, 3, 4},
@@ -1476,7 +1486,7 @@ TEST_F(HloEvaluatorTest, DynamicSlice) {
 
 // Verifies that the HloEvaluator's implementation goes along with existing
 // backends' behavior, although this is not required by the spec.
-TEST_F(HloEvaluatorTest, DynamicSliceModSlice) {
+TEST_P(HloEvaluatorTest, DynamicSliceModSlice) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1497,10 +1507,9 @@ TEST_F(HloEvaluatorTest, DynamicSliceModSlice) {
   Shape shape = ShapeUtil::MakeShape(F32, {2, 3});
   b.AddInstruction(HloInstruction::CreateDynamicSlice(shape, operand,
                                                       start_indices, {2, 3}));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR2<float>({
       {2, 3, 4},
@@ -1510,7 +1519,7 @@ TEST_F(HloEvaluatorTest, DynamicSliceModSlice) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, DynamicSliceUpdate) {
+TEST_P(HloEvaluatorTest, DynamicSliceUpdate) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1534,10 +1543,9 @@ TEST_F(HloEvaluatorTest, DynamicSliceUpdate) {
   Shape shape = ShapeUtil::MakeShape(F64, {2, 3});
   b.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
       shape, operand, update, start_indices));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR2<double>({
       {1, -2, -3},
@@ -1547,7 +1555,7 @@ TEST_F(HloEvaluatorTest, DynamicSliceUpdate) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, SetAndGetTuples) {
+TEST_P(HloEvaluatorTest, SetAndGetTuples) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1570,9 +1578,9 @@ TEST_F(HloEvaluatorTest, SetAndGetTuples) {
   Shape shape = ShapeUtil::MakeShape(F64, {2, 3});
   b.AddInstruction(HloInstruction::CreateGetTupleElement(shape, tuple, 1));
 
-  auto computation = module().AddEntryComputation(b.Build());
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  module().AddEntryComputation(b.Build());
+
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto expected = Literal::CreateR2<double>({
       {1, 2, 3},
@@ -1582,7 +1590,7 @@ TEST_F(HloEvaluatorTest, SetAndGetTuples) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, SetAndGetNestedTuples) {
+TEST_P(HloEvaluatorTest, SetAndGetNestedTuples) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1609,9 +1617,9 @@ TEST_F(HloEvaluatorTest, SetAndGetNestedTuples) {
   b.AddInstruction(
       HloInstruction::CreateGetTupleElement(tuple2->shape(), outer_tuple, 1));
 
-  auto computation = module().AddEntryComputation(b.Build());
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  module().AddEntryComputation(b.Build());
+
+  std::unique_ptr<Literal> result = Evaluate();
 
   auto result_inner_literal =
       Literal::CreateR2FromArray2D<double>(*operand_array);
@@ -1623,7 +1631,7 @@ TEST_F(HloEvaluatorTest, SetAndGetNestedTuples) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, Reverse) {
+TEST_P(HloEvaluatorTest, Reverse) {
   HloComputation::Builder b(TestName());
 
   // Input shape is float[4x3x2x1].
@@ -1649,10 +1657,9 @@ TEST_F(HloEvaluatorTest, Reverse) {
 
   const Shape shape = ShapeUtil::MakeShape(F32, {4, 3, 2, 1});
   b.AddInstruction(HloInstruction::CreateReverse(shape, operand, {0, 1}));
-  auto computation = module().AddEntryComputation(b.Build());
+  module().AddEntryComputation(b.Build());
 
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+  std::unique_ptr<Literal> result = Evaluate();
 
   // clang-format off
   auto expected = Literal::CreateR4FromArray4D<float>({
@@ -1677,7 +1684,7 @@ TEST_F(HloEvaluatorTest, Reverse) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
-TEST_F(HloEvaluatorTest, EvaluateWithSubstitutions) {
+TEST_P(HloEvaluatorTest, EvaluateWithSubstitutions) {
   HloComputation::Builder b(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4});
 
@@ -1700,7 +1707,7 @@ TEST_F(HloEvaluatorTest, EvaluateWithSubstitutions) {
 
 // Check that EvaluateWithSubstitutions works if one of the operands to the op
 // we're evaluating is a constant.
-TEST_F(HloEvaluatorTest, EvaluateWithSubstitutionsWithConstantOperand) {
+TEST_P(HloEvaluatorTest, EvaluateWithSubstitutionsWithConstantOperand) {
   HloComputation::Builder b(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4});
 
@@ -1722,5 +1729,8 @@ TEST_F(HloEvaluatorTest, EvaluateWithSubstitutionsWithConstantOperand) {
                                *result.ValueOrDie());
 }
 
+INSTANTIATE_TEST_CASE_P(HloEvaluatorTest_Instantiation, HloEvaluatorTest,
+                        ::testing::ValuesIn(use_bf16_params));
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
index ba75e2ef1b485f015f3b8f8dbd76f214d6ab0130..f0df93b61d29c1535d8a89fbd65e669de5b43729 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
@@ -32,7 +32,7 @@ HloProfileIndexMap::HloProfileIndexMap(const HloModule& module) {
     InsertOrDie(&computation_to_profile_idx_, computation,
                 current_profile_index++);
     for (const HloInstruction* instruction : computation->instructions()) {
-      // For simplicity we track all instrutions here, but we could skip
+      // For simplicity we track all instructions here, but we could skip
       // non-executing instructions like constants and parameters.
       InsertOrDie(&instruction_to_profile_idx_, instruction,
                   current_profile_index++);
@@ -40,82 +40,75 @@ HloProfileIndexMap::HloProfileIndexMap(const HloModule& module) {
   }
 }
 
-std::unique_ptr<HloProfilePrinter> CreateHloProfilePrinter(
+std::unique_ptr<HloProfilePrinterData> CreateHloProfilePrinterData(
     const HloProfileIndexMap& hlo_profile_index_map,
     const HloCostAnalysis& cost_analysis) {
-  using HloComputationInfo = HloProfilePrinter::HloComputationInfo;
-  using HloInstructionInfo = HloProfilePrinter::HloInstructionInfo;
-
-  HloComputationInfo* computation_infos =
-      new HloComputationInfo[hlo_profile_index_map.computation_count()];
-
-  // There are two "indices" in play here.  The first one is the index of the
-  // HloComputationInfo or HloInstructionInfo in the array that contains said
-  // HloComputationInfo or HloInstructionInfo.  The second index is the index of
-  // the HloComputationInfo or HloInstructionInfo in the profile counters array,
-  // as decided by hlo_profile_index_map.  The latter index is always referred
-  // to as "profile_index".
-
-  size_t computation_index_in_static_data = 0;
-  size_t max_profile_index = hlo_profile_index_map.total_count();
-  for (const auto& pair : hlo_profile_index_map.computation_to_profile_idx()) {
-    CHECK_LT(pair.second, max_profile_index);
+  using HloComputationInfo = HloProfilePrinterData::HloComputationInfo;
+  using HloInstructionInfo = HloProfilePrinterData::HloInstructionInfo;
+
+  size_t profile_counters_size = hlo_profile_index_map.total_count();
+
+  std::unique_ptr<HloProfilePrinterData> profile_printer_data =
+      MakeUnique<HloProfilePrinterData>();
+  profile_printer_data->set_profile_counters_size(profile_counters_size);
+  profile_printer_data->mutable_computation_infos()->Reserve(
+      hlo_profile_index_map.computation_count());
+
+  const auto& computation_to_profile_idx_map =
+      hlo_profile_index_map.computation_to_profile_idx();
+
+  // computation_to_profile_idx_map's order is not deterministic so create a
+  // deterministic computation_and_profile_idx_list so that we end up with a
+  // deterministic HloProfilePrinterData protobuf.
+
+  std::vector<std::pair<const HloComputation*, int64>>
+      computation_and_profile_idx_list(computation_to_profile_idx_map.begin(),
+                                       computation_to_profile_idx_map.end());
+
+  // The profile indices were computed deterministically in
+  // HloProfileIndexMap::HloProfileIndexMap.
+  c_sort(computation_and_profile_idx_list,
+         [](const std::pair<const HloComputation*, int64>& left,
+            const std::pair<const HloComputation*, int64>& right) {
+           return left.second < right.second;
+         });
+
+  for (const auto& pair : computation_and_profile_idx_list) {
+    CHECK_LT(pair.second, profile_counters_size);
     const HloComputation* computation = pair.first;
-    size_t current_computation_index = computation_index_in_static_data++;
     HloComputationInfo* computation_info =
-        &computation_infos[current_computation_index];
+        profile_printer_data->add_computation_infos();
 
-    computation_info->name = strdup(computation->name().c_str());
-    computation_info->profile_index = pair.second;
-    computation_info->instructions =
-        new HloInstructionInfo[computation->instruction_count()];
-    computation_info->instructions_size = computation->instruction_count();
+    computation_info->set_name(computation->name());
+    computation_info->set_profile_index(pair.second);
+    computation_info->mutable_instruction_infos()->Reserve(
+        computation->instruction_count());
 
-    size_t instruction_index_in_static_data = 0;
     for (const HloInstruction* hlo : computation->instructions()) {
-      HloProfilePrinter::HloInstructionInfo* instruction_info =
-          &computation_info->instructions[instruction_index_in_static_data++];
-      instruction_info->long_name = strdup(hlo->ToString().c_str());
-      instruction_info->short_name =
-          strdup(hlo->ToString(/*compact_operands=*/true).c_str());
-      instruction_info->category = strdup(hlo->ToCategory().c_str());
-      instruction_info->flop_count = cost_analysis.flop_count(*hlo);
-      instruction_info->transcendental_count =
-          cost_analysis.transcendental_count(*hlo);
-      instruction_info->bytes_accessed = cost_analysis.bytes_accessed(*hlo);
-      instruction_info->optimal_seconds = cost_analysis.optimal_seconds(*hlo);
-      instruction_info->profile_index =
-          hlo_profile_index_map.GetProfileIndexFor(*hlo);
-      CHECK_LT(instruction_info->profile_index, max_profile_index);
+      HloInstructionInfo* instruction_info =
+          computation_info->add_instruction_infos();
+      instruction_info->set_long_name(hlo->ToString());
+      instruction_info->set_short_name(
+          hlo->ToString(HloPrintOptions().set_compact_operands(true)));
+      instruction_info->set_category(hlo->ToCategory());
+      instruction_info->set_flop_count(cost_analysis.flop_count(*hlo));
+      instruction_info->set_transcendental_count(
+          cost_analysis.transcendental_count(*hlo));
+      instruction_info->set_bytes_accessed(cost_analysis.bytes_accessed(*hlo));
+      instruction_info->set_optimal_seconds(
+          cost_analysis.optimal_seconds(*hlo));
+      instruction_info->set_profile_index(
+          hlo_profile_index_map.GetProfileIndexFor(*hlo));
     }
   }
 
-  auto deleter = [](HloProfilePrinter::HloComputationInfo* computation_infos,
-                    int64 computation_infos_size) {
-    for (int64 i = 0; i < computation_infos_size; i++) {
-      HloInstructionInfo* instruction_infos = computation_infos[i].instructions;
-      for (int64 j = 0; j < computation_infos[i].instructions_size; j++) {
-        // We can't make instruction_infos[j].long_name etc. non-const pointers
-        // since they may point into static storage, so we have a const_cast
-        // here.
-        free(const_cast<char*>(instruction_infos[j].long_name));
-        free(const_cast<char*>(instruction_infos[j].short_name));
-        free(const_cast<char*>(instruction_infos[j].category));
-      }
-      delete[] instruction_infos;
-      free(const_cast<char*>(computation_infos[i].name));
-    }
-    delete[] computation_infos;
-  };
-
-  return MakeUnique<HloProfilePrinter>(
-      computation_infos, hlo_profile_index_map.computation_count(), deleter);
+  return profile_printer_data;
 }
 
 HloExecutionProfile::HloExecutionProfile(
-    const HloProfilePrinter* hlo_profile_printer,
+    const HloProfilePrinterData* hlo_profile_printer_data,
     const HloProfileIndexMap* hlo_profile_index_map)
-    : hlo_profile_printer_(*hlo_profile_printer),
+    : hlo_profile_printer_data_(*hlo_profile_printer_data),
       hlo_profile_index_map_(*hlo_profile_index_map),
       profile_counters_(
           /*count*/ hlo_profile_index_map_.total_count(),
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h
index 470fd4ce3c205d84152238f4b18daad77e403f68..6fb91b9bef9d1df82b8806ce79cc147823edeb3d 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.h
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h
@@ -77,8 +77,8 @@ class HloProfileIndexMap {
   std::unordered_map<const HloComputation*, int64> computation_to_profile_idx_;
 };
 
-// Create an instance of `HloProfilePrinter` that owns its memory.
-std::unique_ptr<HloProfilePrinter> CreateHloProfilePrinter(
+// Create an instance of `HloProfilePrinterData`.
+std::unique_ptr<HloProfilePrinterData> CreateHloProfilePrinterData(
     const HloProfileIndexMap& hlo_profile_index_map,
     const HloCostAnalysis& cost_analysis);
 
@@ -90,7 +90,7 @@ class HloExecutionProfile {
  public:
   using DeviceDescription = perftools::gputools::DeviceDescription;
 
-  HloExecutionProfile(const HloProfilePrinter* hlo_profile_printer,
+  HloExecutionProfile(const HloProfilePrinterData* hlo_profile_printer_data,
                       const HloProfileIndexMap* hlo_profile_index_map);
 
   // Record how many cycles this HLO took to execute.
@@ -117,17 +117,19 @@ class HloExecutionProfile {
   // debugging; e.g. emits cycle counts, execution time at the nominal device
   // frequency, and the effective throughput given the provided cost_analysis
   // for the operations in a given computation. Returns an empty string if it
-  // wasn't possible to generate a printable version. cost_analysis should be a
-  // clean analysis that can be used to visit the computation.
+  // wasn't possible to generate a printable version.
   string ToString(const DeviceDescription& device_description) const {
-    return hlo_profile_printer_.ToString(profile_counters_.data(),
-                                         device_description.clock_rate_ghz());
+    return PrintHloProfile(hlo_profile_printer_data_, profile_counters_.data(),
+                           device_description.clock_rate_ghz());
   }
 
   std::vector<int64>* mutable_profile_counters() { return &profile_counters_; }
+  const std::vector<int64>& profile_counters() const {
+    return profile_counters_;
+  }
 
  private:
-  const HloProfilePrinter& hlo_profile_printer_;
+  const HloProfilePrinterData& hlo_profile_printer_data_;
   const HloProfileIndexMap& hlo_profile_index_map_;
 
   // Stores per-Hlo profile counters.  This is the only thing that changes when
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
index b1e6729e2bccad4bdbe075a635d8a9b1ede6fecb..a0cb28246d3be541e798e85552436f64a3521f22 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
@@ -73,8 +73,8 @@ TEST_F(HloExecutionProfileTest, Basic) {
 
   HloCostAnalysis cost_analysis(shape_size_function);
   HloProfileIndexMap profile_index_map(*hlo_module);
-  std::unique_ptr<HloProfilePrinter> profile_printer =
-      CreateHloProfilePrinter(profile_index_map, cost_analysis);
+  std::unique_ptr<HloProfilePrinterData> profile_printer =
+      CreateHloProfilePrinterData(profile_index_map, cost_analysis);
   HloExecutionProfile execution_profile(profile_printer.get(),
                                         &profile_index_map);
 
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 84187d578346eafd5e32727a15f5eab9cc79feef..44fcd36370dcd0cf77601aa1cd2b92810947bd5f 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/numbers.h"
@@ -508,8 +509,17 @@ stylesheet="
 
     // The "to_node" value may be a NULL, indicating that this points to the
     // "root" tag rather than a normal node.
-    int64 from_node_id = node_ids_.at(from_node);
-    int64 to_node_id = to_node ? node_ids_.at(to_node) : root_node_id_;
+    int64 from_node_id =
+        tensorflow::gtl::FindWithDefault(node_ids_, from_node, -1);
+    if (from_node_id == -1) {
+      LOG(FATAL) << from_node->name() << " was added to edges but not to nodes";
+    }
+    int64 to_node_id =
+        to_node ? tensorflow::gtl::FindWithDefault(node_ids_, to_node, -1)
+                : root_node_id_;
+    if (to_node != nullptr && to_node_id == -1) {
+      LOG(FATAL) << to_node->name() << " was added to edges but not to nodes";
+    }
 
     add_hover_css_rule("node", from_node_id, kBlue);
     add_hover_css_rule("node", to_node_id, kRed);
@@ -653,12 +663,15 @@ string HloDotDumper::DumpComputation(const HloComputation* comp) {
 
 string HloDotDumper::DumpRootTag() {
   const HloInstruction* from = GetNodeForEdge(computation_->root_instruction());
-  auto from_id = InstructionId(from);
 
-  if (!filter_.Show(from)) {
+  // We didn't display constants as separate nodes; so if the root is a
+  // constant, we don't add root tag or edge for it.
+  if (!filter_.Show(from) || from->opcode() == HloOpcode::kConstant) {
     return "";
   }
 
+  auto from_id = InstructionId(from);
+
   // The ID of the root computation is otherwise unused, so it makes a good ID
   // to use for the root-tag node.  However, the edge_ids_ map requires a
   // HloInstruction* pointer for the 'to' value, so we use a NULL value there
@@ -784,7 +797,7 @@ string HloDotDumper::GetInstructionNodeInlinedOperands(
 
     // Otherwise, print e.g. "%constant.42 (s32[100])".
     string constant_name;
-    if (tensorflow::StringPiece(constant->name()).starts_with("%constant")) {
+    if (tensorflow::StringPiece(constant->name()).starts_with("constant")) {
       constant_name = constant->name();
     } else {
       constant_name = StrCat("constant ", constant->name());
@@ -948,6 +961,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
       return kGreen;
     case HloOpcode::kConvolution:
     case HloOpcode::kDot:
+    case HloOpcode::kFft:
       return kDarkBlue;
     case HloOpcode::kReducePrecision:
       return kRed;
@@ -1000,7 +1014,7 @@ string HloDotDumper::GetInstructionNodeLabel(const HloInstruction* instr) {
   // The HLO instruction name contains usually the opcode, e.g. "%add.42" is
   // an add instruction.  In this case we render just the name.
   if (tensorflow::StringPiece(instr->name())
-          .starts_with(StrCat("%", HloOpcodeString(instr->opcode())))) {
+          .starts_with(HloOpcodeString(instr->opcode()))) {
     return Printf("<b>%s</b>", HtmlLikeStringSanitize(instr->name()));
   }
   string extended_opcode =
@@ -1036,62 +1050,32 @@ string HloDotDumper::GetInstructionNodeMetadata(const HloInstruction* instr) {
 }
 
 string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) {
-  string opcode_specific_info = [&]() -> string {
-    switch (instr->opcode()) {
-      case HloOpcode::kRng:
-        return RandomDistribution_Name(instr->random_distribution());
-      case HloOpcode::kConvolution:
-        return StrCat(
-            HtmlLikeStringSanitize(
-                instr->ConvolutionDimensionNumbersToString()),
-            "<br/>",
-            HtmlLikeStringSanitize(window_util::ToString(instr->window())));
-      case HloOpcode::kBroadcast:
-      case HloOpcode::kTranspose:
-      case HloOpcode::kReduce:
-        return Printf("dims={%s}", Join(instr->dimensions(), ","));
-      case HloOpcode::kGetTupleElement:
-        return Printf("index=%lld", instr->tuple_index());
-      case HloOpcode::kBatchNormTraining:
-      case HloOpcode::kBatchNormGrad:
-        return Printf("feature_index=%lld", instr->feature_index());
-      case HloOpcode::kCustomCall:
-        return Printf("custom_call_target=%s", instr->custom_call_target());
-      case HloOpcode::kSlice:
-        return std::all_of(instr->slice_strides().begin(),
-                           instr->slice_strides().end(),
-                           [](int64 stride) { return stride == 1; })
-                   ? ""
-                   : StrCat("stride=", VectorString(instr->slice_strides()));
-      case HloOpcode::kSend:
-      case HloOpcode::kSendDone:
-      case HloOpcode::kRecv:
-      case HloOpcode::kRecvDone:
-        return StrCat("channel_id=", instr->channel_id());
-      default:
-        return "";
-    }
-  }();
-
   std::vector<string> lines;
-  if (!opcode_specific_info.empty()) {
-    lines.push_back(opcode_specific_info);
-  }
-  if (instr->has_sharding()) {
-    lines.push_back(StrCat("sharding=", instr->sharding().ToString()));
+
+  // Get the instruction's extra attributes excluding the names of its
+  // subcomputations, since those are drawn explicitly in the graph.
+  for (const auto& line : instr->ExtraAttributesToString(
+           HloPrintOptions().set_print_subcomputation_references(false))) {
+    lines.push_back(HtmlLikeStringSanitize(line));
   }
+
   // Show the shape and layout of the instruction, unless it's an inlined fusion
   // node -- there the shape and layout is present in the output node.
   if (instr->opcode() != HloOpcode::kFusion ||
       !ShouldShowFusionSubcomputation(instr)) {
-    string instr_shape = ShapeUtil::HumanString(instr->shape());
-
-    // Show layout of non-tuple shapes with more than one dimension.
-    if (LayoutUtil::HasLayout(instr->shape()) &&
-        instr->shape().dimensions_size() > 1 &&
-        !ShapeUtil::IsTuple(instr->shape())) {
-      StrAppend(&instr_shape, "{",
-                Join(instr->shape().layout().minor_to_major(), ","), "}");
+    // Show layout of instructions with more than one dimension.  Don't show
+    // layout on tuples or tensors with just one dimension (which only have one
+    // possible layout) to avoid visual noise.
+    bool shape_is_multidim = false;
+    ShapeUtil::ForEachSubshape(instr->shape(),
+                               [&](const Shape& s, const ShapeIndex&) {
+                                 shape_is_multidim |= s.dimensions_size() > 1;
+                               });
+    string instr_shape;
+    if (instr->opcode() != HloOpcode::kTuple && shape_is_multidim) {
+      instr_shape = ShapeUtil::HumanStringWithLayout(instr->shape());
+    } else {
+      instr_shape = ShapeUtil::HumanString(instr->shape());
     }
 
     // Some instructions have giant tuples as their shapes, so truncate the
@@ -1353,19 +1337,16 @@ string SaveGraph(const string& graph,
       file_extension = ".pbtxt";
       break;
   }
-  string path = JoinPath(
-      dest_path, StrCat("hlo_graph_", output_num++, ".XXXXXX", file_extension));
+  string path = JoinPath(dest_path, StrCat("hlo_graph_", output_num++, "."));
   auto status = Status::OK();
-  int fd = mkstemps(&path[0], file_extension.length());
-  if (fd < 0) {
+  auto env = tensorflow::Env::Default();
+  if (!env->CreateUniqueFileName(&path, file_extension)) {
     status =
         Status(tensorflow::error::Code::UNKNOWN,
                StrCat("Failed to create temporary file to dump HLO graph: ",
                       strerror(errno)));
   } else {
-    status =
-        tensorflow::WriteStringToFile(tensorflow::Env::Default(), path, graph);
-    close(fd);
+    status = tensorflow::WriteStringToFile(env, path, graph);
   }
   if (!status.ok()) {
     LOG(WARNING) << "Saving HLO graph failed: " << status;
@@ -1438,15 +1419,18 @@ void DumpText(const HloModule& module, const string& label,
       do_prefix ? StrCat(prefix, "-", label, ".txt") : StrCat(label, ".txt");
   string path = JoinPath(directory_path, filename);
   TF_CHECK_OK(WriteStringToFile(
-      env, path, module.ToString(/*include_large_constants=*/true)));
+      env, path,
+      module.ToString(HloPrintOptions().set_print_large_constants(true))));
   LOG(INFO) << "dumping module '" << module.name() << "' to " << path;
 }
 
 string MaybeDumpHloModule(const HloModule& module, const string& label,
                           const HloExecutionProfile* profile) {
-  VLOG(2) << "MaybeDumpHloModule called on module " << module.name();
-  string graph_url;
   const DebugOptions& debug_options = module.config().debug_options();
+  VLOG(2) << "MaybeDumpHloModule called on module " << module.name()
+          << " with generate_hlo_graph regex \""
+          << debug_options.xla_generate_hlo_graph() << "\"";
+  string graph_url;
   if (!debug_options.xla_generate_hlo_graph().empty() &&
       RE2::PartialMatch(module.name(),
                         debug_options.xla_generate_hlo_graph())) {
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc
index 8e1531c87f9c6e133e2d6763b046b1d5dcbcd09f..1f00aa41dc783f9e5657f5fa654884a31fae0fe7 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc
@@ -117,5 +117,18 @@ TEST(HloGraphDumperTest, NestedFusion) {
       HasSubstr(inner_sum->name()));
 }
 
+TEST(HloGraphDumperTest, Constant) {
+  HloComputation::Builder b("b");
+  auto instruction = b.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(-42)));
+  instruction->set_name("i_am_a_constant_root_instruction");
+  HloModule m(TestName());
+  HloComputation* root_computation = m.AddEntryComputation(b.Build());
+  string graph = hlo_graph_dumper::DumpGraph(
+      *root_computation, /*label=*/"an_empty_graph", DebugOptions());
+  EXPECT_THAT(graph, HasSubstr("an_empty_graph"));
+  EXPECT_THAT(graph, Not(HasSubstr("i_am_a_constant_root_instruction")));
+}
+
 }  // anonymous namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index c30c4326547bbeae4f7054974f0d3fade65e3382..0981f1f4fe57751d5b7059b4b08099385369e4b9 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -101,10 +101,10 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
 
   instruction->metadata_ = proto.metadata();
   if (proto.has_literal()) {
-    instruction->literal_ = MakeUnique<Literal>(proto.literal());
+    TF_ASSIGN_OR_RETURN(instruction->literal_,
+                        Literal::CreateFromProto(proto.literal()));
   }
   instruction->parameter_number_ = proto.parameter_number();
-  instruction->parameter_name_ = proto.parameter_name();
 
   instruction->tuple_index_ = proto.tuple_index();
   for (int64 dimension : proto.dimensions()) {
@@ -118,6 +118,10 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
         MakeUnique<ConvolutionDimensionNumbers>(
             proto.convolution_dimension_numbers());
   }
+  if (proto.has_dot_dimension_numbers()) {
+    instruction->dot_dimension_numbers_ =
+        MakeUnique<DotDimensionNumbers>(proto.dot_dimension_numbers());
+  }
   for (const HloInstructionProto::SliceDimensions& slice_dimensions :
        proto.slice_dimensions()) {
     instruction->slice_starts_.push_back(slice_dimensions.start());
@@ -141,6 +145,10 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
   instruction->infeed_config_ = proto.infeed_config();
   instruction->custom_call_target_ = proto.custom_call_target();
   instruction->outfeed_shape_ = proto.outfeed_shape();
+  instruction->fft_type_ = proto.fft_type();
+  for (int64 fft_len : proto.fft_length()) {
+    instruction->fft_length_.push_back(fft_len);
+  }
 
   return std::move(instruction);
 }
@@ -150,7 +158,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
   auto instruction =
       WrapUnique(new HloInstruction(HloOpcode::kParameter, shape));
   instruction->parameter_number_ = parameter_number;
-  instruction->parameter_name_ = name;
   instruction->name_ = name;
   return instruction;
 }
@@ -160,8 +167,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
   auto instruction =
       WrapUnique(new HloInstruction(HloOpcode::kTrace, ShapeUtil::MakeNil()));
   instruction->operands_.push_back(operand);
-  instruction->literal_.reset(new Literal);
-  instruction->literal_->append_u8s(tag);
+  instruction->literal_ = Literal::CreateR1U8(tag);
   return instruction;
 }
 
@@ -332,6 +338,41 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
   return instruction;
 }
 
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateFft(
+    const Shape& shape, HloInstruction* operand, FftType fft_type,
+    tensorflow::gtl::ArraySlice<int64> fft_length) {
+  auto instruction = WrapUnique(new HloInstruction(HloOpcode::kFft, shape));
+  instruction->AppendOperand(operand);
+  instruction->fft_type_ = fft_type;
+  instruction->fft_length_.assign(fft_length.begin(), fft_length.end());
+  return instruction;
+}
+
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateDot(
+    const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
+    const DotDimensionNumbers& dimension_numbers) {
+  auto instruction = WrapUnique(new HloInstruction(HloOpcode::kDot, shape));
+  instruction->AppendOperand(lhs);
+  instruction->AppendOperand(rhs);
+  instruction->dot_dimension_numbers_ =
+      MakeUnique<DotDimensionNumbers>(dimension_numbers);
+  return instruction;
+}
+
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateCanonicalDot(
+    const Shape& shape, HloInstruction* lhs, HloInstruction* rhs) {
+  CHECK_EQ(ShapeUtil::Rank(lhs->shape()), 2);
+  CHECK_EQ(ShapeUtil::Rank(rhs->shape()), 2);
+
+  auto instruction = WrapUnique(new HloInstruction(HloOpcode::kDot, shape));
+  instruction->AppendOperand(lhs);
+  instruction->AppendOperand(rhs);
+  instruction->dot_dimension_numbers_ = MakeUnique<DotDimensionNumbers>();
+  instruction->dot_dimension_numbers_->add_lhs_contracting_dimensions(1);
+  instruction->dot_dimension_numbers_->add_rhs_contracting_dimensions(0);
+  return instruction;
+}
+
 /* static */ std::unique_ptr<HloInstruction>
 HloInstruction::CreateReducePrecision(const Shape& shape,
                                       HloInstruction* operand,
@@ -346,12 +387,9 @@ HloInstruction::CreateReducePrecision(const Shape& shape,
 }
 
 /* static */ std::unique_ptr<HloInstruction>
-HloInstruction::CreateCrossReplicaSum(const Shape& shape,
-                                      HloInstruction* operand) {
-  auto instruction =
-      WrapUnique(new HloInstruction(HloOpcode::kCrossReplicaSum, shape));
-  instruction->AppendOperand(operand);
-  return instruction;
+HloInstruction::CreateCrossReplicaSum(
+    const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands) {
+  return CreateNary(shape, HloOpcode::kCrossReplicaSum, operands);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateInfeed(
@@ -366,6 +404,9 @@ HloInstruction::CreateCrossReplicaSum(const Shape& shape,
     tensorflow::StringPiece outfeed_config) {
   std::unique_ptr<HloInstruction> instruction =
       WrapUnique(new HloInstruction(HloOpcode::kOutfeed, ShapeUtil::MakeNil()));
+  CHECK(ShapeUtil::Compatible(operand->shape(), shape))
+      << "Outfeed shape " << shape << " must be compatible with operand shape "
+      << operand->shape();
   instruction->AppendOperand(operand);
   instruction->outfeed_config_ = outfeed_config.ToString();
   instruction->outfeed_shape_ = shape;
@@ -631,6 +672,58 @@ HloInstruction::CreateSelectAndScatter(
   return instruction;
 }
 
+/* static */ std::unique_ptr<HloInstruction>
+HloInstruction::CreateBroadcastSequence(
+    const Shape& output_shape, HloInstruction* operand,
+    const std::function<HloInstruction*(std::unique_ptr<HloInstruction>)>&
+        adder) {
+  CHECK(ShapeUtil::IsScalar(operand->shape()) ||
+        ShapeUtil::Rank(operand->shape()) == ShapeUtil::Rank(output_shape));
+  Shape broadcast_shape = ShapeUtil::ChangeElementType(
+      output_shape, operand->shape().element_type());
+  // Do explicit broadcast for scalar.
+  if (ShapeUtil::IsScalar(operand->shape())) {
+    auto broadcast =
+        HloInstruction::CreateBroadcast(broadcast_shape, operand, {});
+    broadcast->set_metadata(operand->metadata());
+    if (operand->has_sharding()) {
+      broadcast->set_sharding(operand->sharding());
+    }
+    return broadcast;
+  }
+  // Do explicit broadcast for degenerate broadcast.
+  std::vector<int64> broadcast_dimensions;
+  std::vector<int64> reshaped_dimensions;
+  for (int i = 0; i < ShapeUtil::Rank(operand->shape()); i++) {
+    if (operand->shape().dimensions(i) == output_shape.dimensions(i)) {
+      broadcast_dimensions.push_back(i);
+      reshaped_dimensions.push_back(operand->shape().dimensions(i));
+    } else {
+      CHECK_EQ(operand->shape().dimensions(i), 1)
+          << "An explicit broadcast sequence requires the broadcasted "
+             "dimensions to be trivial; operand: "
+          << operand->ToString() << "; output_shape: " << output_shape;
+    }
+  }
+  // Eliminate the size one dimensions.
+  HloInstruction* reshaped_operand = adder(HloInstruction::CreateReshape(
+      ShapeUtil::MakeShape(operand->shape().element_type(),
+                           reshaped_dimensions),
+      operand));
+  reshaped_operand->set_metadata(operand->metadata());
+  if (operand->has_sharding()) {
+    reshaped_operand->set_sharding(operand->sharding());
+  }
+  // Broadcast 'reshape' up to the larger size.
+  auto broadcast = HloInstruction::CreateBroadcast(
+      broadcast_shape, reshaped_operand, broadcast_dimensions);
+  broadcast->set_metadata(operand->metadata());
+  if (operand->has_sharding()) {
+    broadcast->set_sharding(operand->sharding());
+  }
+  return broadcast;
+}
+
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreatePad(
     const Shape& shape, HloInstruction* operand, HloInstruction* padding_value,
     const PaddingConfig& padding_config) {
@@ -670,10 +763,23 @@ HloInstruction::CreateSelectAndScatter(
   return instruction;
 }
 
+// We put the fusion kind into the instruction's name for transpose-dot fusions,
+// since those fusions are really just describing a type of dot rather than
+// generating a novel computation.
+static string FusionNodeName(HloInstruction::FusionKind fusion_kind) {
+  switch (fusion_kind) {
+    case HloInstruction::FusionKind::kTransposeDot:
+      return "dot_fusion";
+    default:
+      return "fusion";
+  }
+}
+
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateFusion(
     const Shape& shape, FusionKind fusion_kind, HloInstruction* fused_root) {
   auto instruction = WrapUnique(new HloInstruction(HloOpcode::kFusion, shape));
   instruction->fusion_kind_ = fusion_kind;
+  instruction->name_ = FusionNodeName(fusion_kind);
   instruction->set_parent(fused_root->parent());
   instruction->set_metadata(fused_root->metadata());
   instruction->CloneAndFuseInternal(fused_root);
@@ -689,23 +795,12 @@ HloInstruction::CreateSelectAndScatter(
     instruction->AppendOperand(operand);
   }
   instruction->fusion_kind_ = fusion_kind;
+  instruction->name_ = FusionNodeName(fusion_kind);
   instruction->called_computations_.push_back(fusion_computation);
   fusion_computation->SetFusionInstruction(instruction.get());
   return instruction;
 }
 
-/* static */ std::unique_ptr<HloInstruction>
-HloInstruction::CreateFusionForBackwardConvolution(
-    const Shape& shape, FusionKind fusion_kind, const Window& window,
-    const ConvolutionDimensionNumbers& conv_dnums, HloInstruction* fused_root) {
-  std::unique_ptr<HloInstruction> fusion =
-      CreateFusion(shape, fusion_kind, fused_root);
-  fusion->window_ = MakeUnique<Window>(window);
-  fusion->convolution_dimension_numbers_ =
-      MakeUnique<ConvolutionDimensionNumbers>(conv_dnums);
-  return fusion;
-}
-
 void HloInstruction::MergeFusionInstruction(
     HloInstruction* instruction_to_merge) {
   CHECK_EQ(opcode_, HloOpcode::kFusion);
@@ -985,6 +1080,7 @@ bool HloInstruction::HasSideEffect() const {
     case HloOpcode::kSendDone:
     case HloOpcode::kRecv:
     case HloOpcode::kRecvDone:
+    case HloOpcode::kRng:
     case HloOpcode::kInfeed:
     case HloOpcode::kOutfeed:
     case HloOpcode::kTrace:
@@ -1086,7 +1182,6 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kLe:
     case HloOpcode::kLt:
     case HloOpcode::kNe:
-    case HloOpcode::kDot:
     case HloOpcode::kMaximum:
     case HloOpcode::kMinimum:
     case HloOpcode::kPower:
@@ -1138,9 +1233,16 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       clone = CreateConvolve(shape, new_operands[0], new_operands[1], *window_,
                              *convolution_dimension_numbers_);
       break;
-    case HloOpcode::kCrossReplicaSum:
+    case HloOpcode::kDot:
+      CHECK_EQ(new_operands.size(), 2);
+      clone = CreateDot(shape, new_operands[0], new_operands[1],
+                        *dot_dimension_numbers_);
+      break;
+    case HloOpcode::kFft:
       CHECK_EQ(new_operands.size(), 1);
-      clone = CreateCrossReplicaSum(shape, new_operands[0]);
+      return CreateFft(shape, new_operands[0], fft_type_, fft_length_);
+    case HloOpcode::kCrossReplicaSum:
+      clone = CreateCrossReplicaSum(shape, new_operands);
       break;
     case HloOpcode::kGetTupleElement:
       CHECK_EQ(new_operands.size(), 1);
@@ -1215,7 +1317,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       clone = CloneFusionWithNewOperands(shape, new_operands, module);
       break;
     case HloOpcode::kParameter:
-      clone = CreateParameter(parameter_number_, shape, parameter_name_);
+      clone = CreateParameter(parameter_number_, shape, name_);
       break;
     case HloOpcode::kBatchNormTraining:
       CHECK_EQ(new_operands.size(), 3);
@@ -1244,10 +1346,27 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
                                   new_operands[4], epsilon(), feature_index());
       break;
     case HloOpcode::kConditional:
-    case HloOpcode::kRecv:
-    case HloOpcode::kRecvDone:
+      CHECK_EQ(new_operands.size(), 3);
+      clone = CreateConditional(shape, new_operands[0], new_operands[1],
+                                true_computation(), new_operands[2],
+                                false_computation());
+      break;
     case HloOpcode::kSend:
+      CHECK_EQ(new_operands.size(), 1);
+      clone = CreateSend(new_operands[0], channel_id());
+      break;
     case HloOpcode::kSendDone:
+      CHECK_EQ(new_operands.size(), 1);
+      clone = CreateSendDone(new_operands[0]);
+      break;
+    case HloOpcode::kRecv:
+      CHECK_EQ(new_operands.size(), 0);
+      clone = CreateRecv(shape, channel_id());
+      break;
+    case HloOpcode::kRecvDone:
+      CHECK_EQ(new_operands.size(), 1);
+      clone = CreateRecvDone(new_operands[0]);
+      break;
     case HloOpcode::kTrace:
       LOG(FATAL) << "Not yet implemented, clone: " << HloOpcodeString(opcode_);
   }
@@ -1492,8 +1611,9 @@ bool HloInstruction::HasConstantOperand() const {
 
 bool HloInstruction::IdenticalSlowPath(
     const HloInstruction& other,
-    std::function<bool(const HloComputation*, const HloComputation*)>
-        eq_computations) const {
+    const std::function<bool(const HloComputation*, const HloComputation*)>&
+        eq_computations,
+    const std::function<bool(const Shape&, const Shape&)>& eq_shapes) const {
   // Perform opcode specific checks.
   switch (opcode()) {
     // The result of these instructions only depend upon their opcode and
@@ -1509,7 +1629,6 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kCos:
     case HloOpcode::kCrossReplicaSum:
     case HloOpcode::kDivide:
-    case HloOpcode::kDot:
     case HloOpcode::kEq:
     case HloOpcode::kExp:
     case HloOpcode::kFloor:
@@ -1542,8 +1661,12 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kTuple:
       return true;
 
-    // These opcodes have complex or special behavior so just return false.
     case HloOpcode::kFusion:
+      return fusion_kind() == other.fusion_kind() &&
+             eq_computations(fused_instructions_computation(),
+                             other.fused_instructions_computation());
+
+    // These opcodes have complex or special behavior so just return false.
     case HloOpcode::kRng:
     case HloOpcode::kTrace:
     case HloOpcode::kWhile:
@@ -1553,7 +1676,7 @@ bool HloInstruction::IdenticalSlowPath(
       return parameter_number() == other.parameter_number() &&
              // Check the shape too because `this` and `other` may be in
              // different HloComputations.
-             ShapeUtil::Compatible(shape(), other.shape());
+             eq_shapes(shape(), other.shape());
 
     case HloOpcode::kBatchNormTraining:
     case HloOpcode::kBatchNormInference:
@@ -1582,6 +1705,15 @@ bool HloInstruction::IdenticalSlowPath(
              protobuf_util::ProtobufEquals(
                  convolution_dimension_numbers(),
                  other.convolution_dimension_numbers());
+    // Check dot dimension numbers.
+    case HloOpcode::kDot:
+      return protobuf_util::ProtobufEquals(dot_dimension_numbers(),
+                                           other.dot_dimension_numbers());
+
+    // FFT has various types & lengths.
+    case HloOpcode::kFft:
+      return fft_type() == other.fft_type() &&
+             fft_length() == other.fft_length();
 
     // Reduction results are determined by the reduction dimension and the
     // reduction computation.
@@ -1600,18 +1732,18 @@ bool HloInstruction::IdenticalSlowPath(
              protobuf_util::ProtobufEquals(window(), other.window());
 
     case HloOpcode::kReshape:
-      return ShapeUtil::Compatible(shape(), other.shape());
+      return eq_shapes(shape(), other.shape());
 
     // Transpose result is determined by the final shape and the permutation.
     case HloOpcode::kTranspose:
-      return ShapeUtil::Compatible(shape(), other.shape()) &&
+      return eq_shapes(shape(), other.shape()) &&
              dimensions() == other.dimensions();
 
     // Remaining instructions with special values.
     case HloOpcode::kBitcast:
-      return ShapeUtil::Equal(shape(), other.shape());
+      return eq_shapes(shape(), other.shape());
     case HloOpcode::kBroadcast:
-      return ShapeUtil::Compatible(shape(), other.shape()) &&
+      return eq_shapes(shape(), other.shape()) &&
              dimensions() == other.dimensions();
     case HloOpcode::kConcatenate:
       return dimensions() == other.dimensions();
@@ -1625,10 +1757,10 @@ bool HloInstruction::IdenticalSlowPath(
              slice_limits_ == other.slice_limits_ &&
              slice_strides_ == other.slice_strides_;
     case HloOpcode::kDynamicSlice:
-      return ShapeUtil::Compatible(shape(), other.shape()) &&
+      return eq_shapes(shape(), other.shape()) &&
              dynamic_slice_sizes_ == other.dynamic_slice_sizes_;
     case HloOpcode::kDynamicUpdateSlice:
-      return ShapeUtil::Compatible(shape(), other.shape());
+      return eq_shapes(shape(), other.shape());
     case HloOpcode::kCall:
     case HloOpcode::kMap:
       return eq_computations(to_apply(), other.to_apply());
@@ -1636,9 +1768,11 @@ bool HloInstruction::IdenticalSlowPath(
       return custom_call_target_ == other.custom_call_target_;
     case HloOpcode::kReverse:
       return dimensions() == other.dimensions();
+    case HloOpcode::kConditional:
+      return eq_computations(true_computation(), other.true_computation()) &&
+             eq_computations(false_computation(), other.false_computation());
 
     // These opcodes are not yet supported.
-    case HloOpcode::kConditional:
     case HloOpcode::kInfeed:
     case HloOpcode::kOutfeed:
     case HloOpcode::kSort:
@@ -1671,7 +1805,8 @@ void HloInstruction::RemoveUser(HloInstruction* user) {
 
 Status HloInstruction::ReplaceUseWith(HloInstruction* user,
                                       HloInstruction* new_producer) {
-  TF_RET_CHECK(ShapeUtil::Compatible(shape(), new_producer->shape()))
+  TF_RET_CHECK(
+      ShapeUtil::CompatibleIgnoringFpPrecision(shape(), new_producer->shape()))
       << "this shape: " << ShapeUtil::HumanString(shape())
       << ", replacement shape: "
       << ShapeUtil::HumanString(new_producer->shape());
@@ -1694,8 +1829,8 @@ Status HloInstruction::ReplaceOperandWith(int64 operand_num,
   TF_RET_CHECK(operand_num >= 0);
   TF_RET_CHECK(operand_num < operand_count());
   HloInstruction* old_operand = mutable_operand(operand_num);
-  TF_RET_CHECK(
-      ShapeUtil::Compatible(old_operand->shape(), new_operand->shape()))
+  TF_RET_CHECK(ShapeUtil::CompatibleIgnoringFpPrecision(old_operand->shape(),
+                                                        new_operand->shape()))
       << old_operand->shape().ShortDebugString() << " is not compatible with "
       << new_operand->shape().ShortDebugString();
   operands_[operand_num] = new_operand;
@@ -1882,16 +2017,23 @@ string HloInstruction::SignatureString() const {
   return StrCat("(", operands, ") -> ", ShapeUtil::HumanString(shape()));
 }
 
-string HloInstruction::ToString(bool compact_operands, bool include_metadata,
-                                bool include_large_constants) const {
+namespace {
+
+string PrintName(const string& name, const HloPrintOptions& options) {
+  return StrCat(options.print_percent() ? "%" : "", name);
+}
+
+}  // namespace
+
+string HloInstruction::ToString(const HloPrintOptions& options) const {
   string result =
-      StrCat("%", name(), " = ", ShapeUtil::HumanStringWithLayout(shape()), " ",
-             HloOpcodeString(opcode()), "(",
-             OperandsToString(compact_operands, include_large_constants), ")");
-  for (const string& extra : ExtraAttributesToString()) {
+      StrCat(PrintName(name(), options), " = ",
+             ShapeUtil::HumanStringWithLayout(shape()), " ",
+             HloOpcodeString(opcode()), "(", OperandsToString(options), ")");
+  for (const string& extra : ExtraAttributesToString(options)) {
     StrAppend(&result, ", ", extra);
   }
-  if (include_metadata &&
+  if (options.print_metadata() &&
       (!metadata_.op_type().empty() || !metadata_.op_name().empty() ||
        !metadata_.source_file().empty())) {
     StrAppend(&result, ", metadata={", xla::OpMetadataToString(metadata_), "}");
@@ -1899,14 +2041,13 @@ string HloInstruction::ToString(bool compact_operands, bool include_metadata,
   return result;
 }
 
-string HloInstruction::OperandsToString(bool compact,
-                                        bool include_large_constants) const {
+string HloInstruction::OperandsToString(const HloPrintOptions& options) const {
   string operands;
   if (opcode() == HloOpcode::kConstant) {
     // For constants, show the actual value in place of an empty operand list.
     if ((!ShapeUtil::IsTuple(shape()) &&
          ShapeUtil::ElementsIn(shape()) <= 10) ||
-        include_large_constants) {
+        options.print_large_constants()) {
       // Literal::ToString emits multidimensional arrays over multiple
       // lines. Compact this into one line by stripping out white space.
       string tmp = literal().ToString();
@@ -1931,14 +2072,19 @@ string HloInstruction::OperandsToString(bool compact,
   } else {
     tensorflow::gtl::ArraySlice<HloInstruction*> slice(operands_);
     const int64 kMaxOperandsToShowIfCompact = 4;
-    if (compact && slice.size() > kMaxOperandsToShowIfCompact) {
+    if (options.compact_operands() &&
+        slice.size() > kMaxOperandsToShowIfCompact) {
       slice.remove_suffix(slice.size() - kMaxOperandsToShowIfCompact);
     }
     operands = Join(slice, ", ", [&](string* out, HloInstruction* operand) {
-      *out += ShapeUtil::HumanStringWithLayout(operand->shape());
-      if (!compact) {
-        StrAppend(out, " %", operand->name());
+      std::vector<string> str;
+      if (options.print_operand_shape()) {
+        str.push_back(ShapeUtil::HumanStringWithLayout(operand->shape()));
+      }
+      if (!options.compact_operands()) {
+        str.push_back(PrintName(operand->name(), options));
       }
+      StrAppend(out, Join(str, " "));
     });
     const int64 remaining = operands_.size() - slice.size();
     if (slice.size() != operands_.size()) {
@@ -1948,7 +2094,8 @@ string HloInstruction::OperandsToString(bool compact,
   return operands;
 }
 
-std::vector<string> HloInstruction::ExtraAttributesToString() const {
+std::vector<string> HloInstruction::ExtraAttributesToString(
+    const HloPrintOptions& options) const {
   std::vector<string> extra;
   if (opcode() == HloOpcode::kFusion) {
     extra.push_back(StrCat("kind=", xla::ToString(fusion_kind())));
@@ -1990,23 +2137,42 @@ std::vector<string> HloInstruction::ExtraAttributesToString() const {
   if (convolution_dimension_numbers_ != nullptr) {
     extra.push_back(ConvolutionDimensionNumbersToString());
   }
-
-  if (opcode() == HloOpcode::kWhile) {
-    extra.push_back(StrCat("condition=%", while_condition()->name()));
-    extra.push_back(StrCat("body=%", while_body()->name()));
-  } else if (opcode() == HloOpcode::kSelectAndScatter) {
-    extra.push_back(StrCat("select=%", select()->name()));
-    extra.push_back(StrCat("scatter=%", scatter()->name()));
-  } else if (opcode() == HloOpcode::kCall || opcode() == HloOpcode::kMap ||
-             opcode() == HloOpcode::kReduceWindow ||
-             opcode() == HloOpcode::kReduce) {
-    extra.push_back(StrCat("to_apply=%", to_apply()->name()));
-  } else if (!called_computations().empty()) {
-    extra.push_back(StrCat(
-        "calls=", Join(called_computations(), ", ",
-                       [](string* out, const HloComputation* computation) {
-                         StrAppend(out, "%", computation->name());
-                       })));
+  if (dot_dimension_numbers_ != nullptr) {
+    extra.push_back(DotDimensionNumbersToString());
+  }
+  if (opcode() == HloOpcode::kFft) {
+    extra.push_back(StrCat("fft_type=", FftType_Name(fft_type())));
+    extra.push_back(StrCat("fft_length={", Join(fft_length(), ","), "}"));
+  }
+
+  if (options.print_subcomputation_references()) {
+    if (opcode() == HloOpcode::kWhile) {
+      extra.push_back(
+          StrCat("condition=", PrintName(while_condition()->name(), options)));
+      extra.push_back(
+          StrCat("body=", PrintName(while_body()->name(), options)));
+    } else if (opcode() == HloOpcode::kSelectAndScatter) {
+      extra.push_back(StrCat("select=", PrintName(select()->name(), options)));
+      extra.push_back(
+          StrCat("scatter=", PrintName(scatter()->name(), options)));
+    } else if (opcode() == HloOpcode::kConditional) {
+      extra.push_back(StrCat("true_computation=",
+                             PrintName(true_computation()->name(), options)));
+      extra.push_back(StrCat("false_computation=",
+                             PrintName(false_computation()->name(), options)));
+    } else if (opcode() == HloOpcode::kCall || opcode() == HloOpcode::kMap ||
+               opcode() == HloOpcode::kReduceWindow ||
+               opcode() == HloOpcode::kReduce) {
+      extra.push_back(
+          StrCat("to_apply=", PrintName(to_apply()->name(), options)));
+    } else if (!called_computations().empty()) {
+      extra.push_back(StrCat(
+          "calls=", Join(called_computations(), ", ",
+                         [&](string* out, const HloComputation* computation) {
+                           StrAppend(out,
+                                     PrintName(computation->name(), options));
+                         })));
+    }
   }
 
   if (opcode() == HloOpcode::kSend || opcode() == HloOpcode::kRecv ||
@@ -2023,8 +2189,9 @@ std::vector<string> HloInstruction::ExtraAttributesToString() const {
   if (!control_predecessors_.empty()) {
     extra.push_back(StrCat("control-predecessors={",
                            Join(control_predecessors_, ", ",
-                                [](string* out, HloInstruction* pre) {
-                                  StrAppend(out, "%", pre->name());
+                                [&](string* out, HloInstruction* pre) {
+                                  StrAppend(out,
+                                            PrintName(pre->name(), options));
                                 }),
                            "}"));
   }
@@ -2035,6 +2202,22 @@ std::vector<string> HloInstruction::ExtraAttributesToString() const {
     extra.push_back(
         StrCat("outfeed_config=\"", CEscape(outfeed_config_), "\""));
   }
+  if (opcode() == HloOpcode::kRng) {
+    extra.push_back(
+        StrCat("distribution=", RandomDistributionToString(distribution_)));
+  }
+  if (opcode() == HloOpcode::kReducePrecision) {
+    extra.push_back(StrCat("exponent_bits=", exponent_bits_));
+    extra.push_back(StrCat("mantissa_bits=", mantissa_bits_));
+  }
+
+  // By contract, we print the custom call target even if
+  // !options.print_subcomputation_references(), because the call target is not
+  // an HloComputation.
+  if (opcode() == HloOpcode::kCustomCall) {
+    extra.push_back(
+        StrCat("custom_call_target=\"", CEscape(custom_call_target_), "\""));
+  }
   return extra;
 }
 
@@ -2064,7 +2247,6 @@ HloInstructionProto HloInstruction::ToProto() const {
     *proto.mutable_literal() = literal_->ToProto();
   }
   proto.set_parameter_number(parameter_number_);
-  proto.set_parameter_name(parameter_name_);
   if (opcode() == HloOpcode::kFusion) {
     proto.set_fusion_kind(xla::ToString(fusion_kind()));
     *proto.mutable_fused_instructions_computation() =
@@ -2086,6 +2268,9 @@ HloInstructionProto HloInstruction::ToProto() const {
     *proto.mutable_convolution_dimension_numbers() =
         *convolution_dimension_numbers_;
   }
+  if (dot_dimension_numbers_ != nullptr) {
+    *proto.mutable_dot_dimension_numbers() = *dot_dimension_numbers_;
+  }
   for (int i = 0; i < slice_starts_.size(); ++i) {
     auto* slice_dimension = proto.add_slice_dimensions();
     slice_dimension->set_start(slice_starts_[i]);
@@ -2110,6 +2295,10 @@ HloInstructionProto HloInstruction::ToProto() const {
   proto.set_infeed_config(infeed_config_);
   proto.set_custom_call_target(custom_call_target_);
   *proto.mutable_outfeed_shape() = outfeed_shape_;
+  proto.set_fft_type(fft_type_);
+  for (int64 fft_len : fft_length_) {
+    proto.add_fft_length(fft_len);
+  }
 
   return proto;
 }
@@ -2131,42 +2320,27 @@ string HloInstruction::ToCategory() const {
     return category;
   }
 
+  // Give transpose-dot and backwards-conv fusions the categories "dot" and
+  // "convolution" so they match the categories of proper kDot and kConvolution
+  // ops.  These fusion categories are really just a way of expressing a
+  // particular kind of dot or conv, so they should have the same category as a
+  // vanilla dot/conv.
   if (opcode() == HloOpcode::kFusion) {
-    if (operands().size() == 2) {
-      bool saw_rank_1 = false;
-      bool saw_higher_rank = false;
-      for (const auto* operand : operands()) {
-        if (!ShapeUtil::IsTuple(operand->shape())) {
-          saw_rank_1 |= ShapeUtil::Rank(operand->shape()) == 1;
-          saw_higher_rank |= ShapeUtil::Rank(operand->shape()) > 1;
-        }
-      }
-      if (saw_rank_1 && saw_higher_rank) {
-        return "rank-1-broadcast binary fusion";
-      }
-    }
     switch (fusion_kind()) {
       case FusionKind::kLoop:
-        if (IsElementwise()) {
-          return "elementwise fusion";
-        } else {
-          return "non-elementwise fusion";
-        }
+        return "loop fusion";
       case FusionKind::kInput:
         return "input fusion";
       case FusionKind::kOutput:
         return "output fusion";
       case FusionKind::kTransposeDot:
-        return "dot fusion";
-      case FusionKind::kConvBackwardFilter:
-      case FusionKind::kConvBackwardInput:
-        return "convolution fusion";
+        return "dot";
       case FusionKind::kCustom:
         return "custom fusion";
     }
   }
 
-  if (IsElementwise() && opcode() != HloOpcode::kFusion) {
+  if (IsElementwise()) {
     return "non-fusion elementwise";
   }
 
@@ -2182,7 +2356,7 @@ void HloInstruction::set_tracing(HloInstruction* trace_instruction) {
 string HloInstruction::TracingTag() const {
   CHECK_EQ(HloOpcode::kTrace, opcode());
   CHECK(literal_ != nullptr);
-  return literal_->u8s_string();
+  return literal_->GetR1U8AsString();
 }
 
 bool HloInstruction::IsFused() const { return parent_->IsFusionComputation(); }
@@ -2325,6 +2499,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleSelect(this);
     case HloOpcode::kConvolution:
       return visitor->HandleConvolution(this);
+    case HloOpcode::kFft:
+      return visitor->HandleFft(this);
     case HloOpcode::kCrossReplicaSum:
       return visitor->HandleCrossReplicaSum(this);
     case HloOpcode::kTuple:
@@ -2933,10 +3109,6 @@ string ToString(HloInstruction::FusionKind kind) {
       return "kOutput";
     case HloInstruction::FusionKind::kTransposeDot:
       return "kTransposeDot";
-    case HloInstruction::FusionKind::kConvBackwardFilter:
-      return "kConvBackwardFilter";
-    case HloInstruction::FusionKind::kConvBackwardInput:
-      return "kConvBackwardInput";
     case HloInstruction::FusionKind::kCustom:
       return "kCustom";
   }
@@ -2956,12 +3128,6 @@ StatusOr<HloInstruction::FusionKind> StringToFusionKind(
   if (kind_name == "kTransposeDot") {
     return HloInstruction::FusionKind::kTransposeDot;
   }
-  if (kind_name == "kConvBackwardFilter") {
-    return HloInstruction::FusionKind::kConvBackwardFilter;
-  }
-  if (kind_name == "kConvBackwardInput") {
-    return HloInstruction::FusionKind::kConvBackwardInput;
-  }
   if (kind_name == "kCustom") {
     return HloInstruction::FusionKind::kCustom;
   }
@@ -3001,6 +3167,28 @@ string OpMetadataToString(const OpMetadata& metadata) {
   return Join(result, " ");
 }
 
+string RandomDistributionToString(const RandomDistribution& distribution) {
+  return tensorflow::str_util::Lowercase(RandomDistribution_Name(distribution));
+}
+
+StatusOr<RandomDistribution> StringToRandomDistribution(const string& name) {
+  static std::unordered_map<string, RandomDistribution>* map = [] {
+    static auto* map = new std::unordered_map<string, RandomDistribution>;
+    for (int i = 0; i < RandomDistribution_ARRAYSIZE; i++) {
+      if (RandomDistribution_IsValid(i)) {
+        auto value = static_cast<RandomDistribution>(i);
+        (*map)[RandomDistributionToString(value)] = value;
+      }
+    }
+    return map;
+  }();
+  auto found = map->find(tensorflow::str_util::Lowercase(name));
+  if (found == map->end()) {
+    return InvalidArgument("Unknown distribution");
+  }
+  return found->second;
+}
+
 std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind) {
   return os << ToString(kind);
 }
@@ -3047,10 +3235,39 @@ string HloInstruction::ConvolutionDimensionNumbersToString() const {
   result += "_";
   append_dims(rhs_dims, operand(1)->shape());
   result += "->";
-  append_dims(output_dims, shape());
+
+  // A convolution can be represented as a kConvolution HLO or as a CustomCall
+  // that returns a tuple, the first element of which is the result of the
+  // convolution.
+  Shape this_shape =
+      ShapeUtil::IsTuple(shape()) ? shape().tuple_shapes(0) : shape();
+  append_dims(output_dims, this_shape);
   return result;
 }
 
+string HloInstruction::DotDimensionNumbersToString() const {
+  std::vector<string> result;
+  if (dot_dimension_numbers_ == nullptr) {
+    return "";
+  }
+  const DotDimensionNumbers& dnums = *dot_dimension_numbers_;
+  if (!dnums.lhs_batch_dimensions().empty()) {
+    result.push_back(StrCat("lhs_batch_dims={",
+                            Join(dnums.lhs_batch_dimensions(), ","), "}"));
+  }
+  result.push_back(StrCat("lhs_contracting_dims={",
+                          Join(dnums.lhs_contracting_dimensions(), ","), "}"));
+
+  if (!dnums.rhs_batch_dimensions().empty()) {
+    result.push_back(StrCat("rhs_batch_dims={",
+                            Join(dnums.rhs_batch_dimensions(), ","), "}"));
+  }
+  result.push_back(StrCat("rhs_contracting_dims={",
+                          Join(dnums.rhs_contracting_dimensions(), ","), "}"));
+
+  return Join(result, ", ");
+}
+
 bool HloInstruction::CouldBeBitcast() const {
   switch (opcode_) {
     case HloOpcode::kTranspose:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index cda8b07c61e2b36a83184648f6f3744deeb86812..3170746157fbcfa7d0a7eaba6d226d46691105f9 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include <iosfwd>
 #include <list>
 #include <memory>
+#include <set>
 #include <string>
 #include <tuple>
 #include <unordered_map>
@@ -56,21 +57,119 @@ namespace xla {
 class HloComputation;
 class HloModule;
 
+// A bunch of switches that control how the hlo text should be printed.
+class HloPrintOptions {
+ public:
+  // Constructs the default print options: don't print large constants, don't
+  // compact operands, no indentation.
+  HloPrintOptions()
+      : print_large_constants_(false),
+        print_subcomputation_references_(true),
+        print_metadata_(true),
+        compact_operands_(false),
+        print_operand_shape_(true),
+        print_program_shape_(true),
+        print_percent_(true),
+        indent_amount_(0) {}
+
+  static HloPrintOptions ShortParsable() {
+    return HloPrintOptions()
+        .set_print_large_constants(true)
+        .set_print_subcomputation_references(true)
+        .set_print_metadata(false)
+        .set_print_operand_shape(false)
+        .set_print_program_shape(false)
+        .set_print_percent(false);
+  }
+
+  // If true, large constants will be printed out.
+  HloPrintOptions& set_print_large_constants(bool value) {
+    print_large_constants_ = value;
+    return *this;
+  }
+
+  // If true, the names of subcomputations (e.g. a fusion node's fused
+  // computation) won't be printed.  This makes the resulting text not parsable.
+  //
+  // A CustomCall's call target is printed even if
+  // print_subcomputation_references is false, because the call target isn't an
+  // HloComputation.
+  HloPrintOptions& set_print_subcomputation_references(bool value) {
+    print_subcomputation_references_ = value;
+    return *this;
+  }
+
+  // If true, metatdata will be printed.
+  HloPrintOptions& set_print_metadata(bool value) {
+    print_metadata_ = value;
+    return *this;
+  }
+
+  // If true, operands' shapes will be printed.
+  HloPrintOptions& set_print_operand_shape(bool value) {
+    print_operand_shape_ = value;
+    return *this;
+  }
+
+  // If true, program shape of hlo computations will be printed.
+  HloPrintOptions& set_print_program_shape(bool value) {
+    print_program_shape_ = value;
+    return *this;
+  }
+
+  // If true, names will be printed with prefix '%'.
+  HloPrintOptions& set_print_percent(bool value) {
+    print_percent_ = value;
+    return *this;
+  }
+
+  // If true, only a part of operands will be printed out, and their names will
+  // be omitted (note that in this case the text will not be parsable).
+  HloPrintOptions& set_compact_operands(bool value) {
+    compact_operands_ = value;
+    return *this;
+  }
+
+  // The indent of the hlo text block.
+  HloPrintOptions& set_indent_amount(int value) {
+    indent_amount_ = value;
+    return *this;
+  }
+
+  bool print_large_constants() const { return print_large_constants_; }
+  bool print_subcomputation_references() const {
+    return print_subcomputation_references_;
+  }
+  bool print_metadata() const { return print_metadata_; }
+  bool compact_operands() const { return compact_operands_; }
+  bool print_operand_shape() const { return print_operand_shape_; }
+  bool print_program_shape() const { return print_program_shape_; }
+  bool print_percent() const { return print_percent_; }
+  int indent_amount() const { return indent_amount_; }
+
+ private:
+  bool print_large_constants_;
+  bool print_subcomputation_references_;
+  bool print_metadata_;
+  bool compact_operands_;
+  bool print_operand_shape_;
+  bool print_program_shape_;
+  bool print_percent_;
+  int indent_amount_;
+};
+
 // HLO instructions are the IR used by the high-level compiler.
 class HloInstruction {
  public:
   enum class FusionKind {
-    kLoop,                // Fused into a loop.
-    kInput,               // Op's input is fused into the op itself.
-    kOutput,              // Op's output is fused into the op itself.
-                          // REQUIRES: At least one operand buffer must be able
-                          // to alias the output buffer.
-    kTransposeDot,        // Fused into a dot with transposed operands.
-    kConvBackwardFilter,  // Fused into a backward filter convolution.
-    kConvBackwardInput,   // Fused into a backward input convolution.
-
-    kCustom,  // Custom category for backend-specific fusions that
-              // do not match any of the more specific ones.
+    kLoop,          // Fused into a loop.
+    kInput,         // Op's input is fused into the op itself.
+    kOutput,        // Op's output is fused into the op itself.
+                    // REQUIRES: At least one operand buffer must be able
+                    // to alias the output buffer.
+    kTransposeDot,  // Fused into a dot with transposed operands.
+    kCustom,        // Custom category for backend-specific fusions that
+                    // do not match any of the more specific ones.
   };
 
   ~HloInstruction();
@@ -160,6 +259,23 @@ class HloInstruction {
       const Window& window,
       const ConvolutionDimensionNumbers& dimension_numbers);
 
+  // Creates an FFT op, of the type indicated by fft_type.
+  static std::unique_ptr<HloInstruction> CreateFft(
+      const Shape& shape, HloInstruction* operand, FftType fft_type,
+      tensorflow::gtl::ArraySlice<int64> fft_length);
+
+  // Creates a dot op with operands 'lhs' and 'rhs' with contracting and batch
+  // dimensions specified in 'dimension_numbers'.
+  static std::unique_ptr<HloInstruction> CreateDot(
+      const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
+      const DotDimensionNumbers& dimension_numbers);
+
+  // Creates a dot op with operands 'lhs' and 'rhs' that contracts dimension 1
+  // of the LHS with dimension 0 of the RHS with no batch dimensions.  Both LHS
+  // and the RHS must be of rank 2.
+  static std::unique_ptr<HloInstruction> CreateCanonicalDot(
+      const Shape& shape, HloInstruction* lhs, HloInstruction* rhs);
+
   // Creates a reduce-precision op, where operand is the data to reduce in
   // precision, and exponent_bits and mantissa_bits describe the precision to
   // reduce it to.
@@ -169,7 +285,8 @@ class HloInstruction {
 
   // Creates a cross replica sum op.
   static std::unique_ptr<HloInstruction> CreateCrossReplicaSum(
-      const Shape& shape, HloInstruction* operand);
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> operands);
 
   // Creates a conversion instruction, where operand is the data to convert and
   // shape is the target shape for the conversion.
@@ -289,6 +406,20 @@ class HloInstruction {
       const Shape& shape, HloInstruction* operand,
       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
 
+  // Creates a sequence of instructions that performs an explicit broadcast of
+  // the operand to the target shape.
+  //
+  // Interior HLOs are passed to "adder", but the "root" HLO of the sequence is
+  // returned as a unique_ptr for API consistency with other factory methods in
+  // this interface.
+  //
+  // TODO(b/72173833) Ideally HloComputations would always be present, and so
+  // the adder being passed by the caller would not be necessary.
+  static std::unique_ptr<HloInstruction> CreateBroadcastSequence(
+      const Shape& output_shape, HloInstruction* operand,
+      const std::function<HloInstruction*(std::unique_ptr<HloInstruction>)>&
+          adder);
+
   // Creates a pad instruction, where the operand is padded on the edges and
   // between the elements with the given padding value.
   static std::unique_ptr<HloInstruction> CreatePad(
@@ -332,14 +463,6 @@ class HloInstruction {
       tensorflow::gtl::ArraySlice<HloInstruction*> operands,
       HloComputation* fusion_computation);
 
-  // Creates a fusion instruction that represents backward convolution. This is
-  // similar to CreateFusion, but with extra arguments indicating the window and
-  // dimemsion mapping of the backward convolution.
-  static std::unique_ptr<HloInstruction> CreateFusionForBackwardConvolution(
-      const Shape& shape, FusionKind fusion_kind, const Window& window,
-      const ConvolutionDimensionNumbers& conv_dnums,
-      HloInstruction* fused_root);
-
   // Creates a call instruction that applies the given computation on the given
   // operands. "shape" is the resultant shape.
   static std::unique_ptr<HloInstruction> CreateCall(
@@ -421,7 +544,7 @@ class HloInstruction {
   Status RemoveControlDependencyTo(HloInstruction* instruction);
 
   // Returns the set of control predecessors (successors) of this
-  // instruction. Control predecessors (sucessors) must execute before (after)
+  // instruction. Control predecessors (successors) must execute before (after)
   // the current instruction.
   const std::vector<HloInstruction*>& control_predecessors() const {
     return control_predecessors_;
@@ -431,28 +554,42 @@ class HloInstruction {
   }
 
   // Returns true if "other" performs the same computation as this instruction.
-  // Layout of the instructions' output array is not considered.
   bool Identical(
       const HloInstruction& other,
-      std::function<bool(const HloInstruction*, const HloInstruction*)>
+      const std::function<bool(const HloInstruction*, const HloInstruction*)>&
           eq_operands = std::equal_to<const HloInstruction*>(),
-      std::function<bool(const HloComputation*, const HloComputation*)>
-          eq_computations = std::equal_to<const HloComputation*>()) const {
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations = std::equal_to<const HloComputation*>(),
+      bool layout_sensitive = true) const {
     // An instruction is always identical to itself.
     if (this == &other) {
       return true;
     }
 
-    // Identical instruction must have the same opcode and identical operands.
-    // In general, there is no need to check shape because shape is inferred
-    // from the shape of the operands.
-    if (opcode() != other.opcode() ||
-        !ContainersEqual(operands(), other.operands(),
-                         std::move(eq_operands))) {
+    // Identical instruction must have the same opcode, shape, and identical
+    // operands.
+    if (opcode() != other.opcode()) {
+      return false;
+    }
+    using EqShapeFuncType = bool (*)(const Shape&, const Shape&);
+    EqShapeFuncType eq_shapes =
+        layout_sensitive ? ShapeUtil::Equal : ShapeUtil::Compatible;
+    if (!eq_shapes(shape(), other.shape())) {
+      return false;
+    }
+    if (operands().size() != other.operands().size()) {
       return false;
     }
 
-    return IdenticalSlowPath(other, eq_computations);
+    // Use an explicit loop rather than ContainerEquals, because copying around
+    // std::functions may be too expensive in some cases.
+    for (size_t i = 0; i < operands().size(); ++i) {
+      if (!eq_operands(operand(i), other.operand(i))) {
+        return false;
+      }
+    }
+
+    return IdenticalSlowPath(other, eq_computations, eq_shapes);
   }
 
   // Returns whether the instruction has a constant operand.
@@ -540,16 +677,6 @@ class HloInstruction {
     return parameter_number_;
   }
 
-  const string& parameter_name() const {
-    CHECK_EQ(HloOpcode::kParameter, opcode_);
-    return parameter_name_;
-  }
-
-  void set_parameter_name(const string& str) {
-    CHECK_EQ(HloOpcode::kParameter, opcode_);
-    parameter_name_ = str;
-  }
-
   // Returns the dimension sizes or numbers associated with this instruction.
   //
   // Precondition: opcode() is one of: concatenate, reduce, broadcast, reshape,
@@ -637,18 +764,20 @@ class HloInstruction {
   string SignatureString() const;
 
   // Returns a debugging string that represents this instruction.
-  string ToString(bool compact_operands = false, bool include_metadata = true,
-                  bool include_large_constants = false) const;
+  //
+  // (We express the default options using an overload rather than a default
+  // param because gdb ignores default params, but does resolve overloads.)
+  string ToString() const { return ToString(HloPrintOptions()); }
+  string ToString(const HloPrintOptions& options) const;
 
   // Components of the ToString() representation:
 
   // Returns a string representation of the operand list.
-  string OperandsToString(bool compact, bool include_large_constants) const;
+  string OperandsToString(const HloPrintOptions& options) const;
 
   // Returns string representation of op-specific attributes.
-  std::vector<string> ExtraAttributesToString() const;
-
-  string ToStringNoMetadata() const { return ToString(false, false); }
+  std::vector<string> ExtraAttributesToString(
+      const HloPrintOptions& options) const;
 
   // As ToString, but returns a shorter string.
   string ToShortString() const;
@@ -676,13 +805,15 @@ class HloInstruction {
   // Returns feature_index field associated with the instruction. The index
   // represents the index of the feature dimension.
   //
-  // Precondition: opcode() == HloOpcode::kBatchNormTraining
+  // Precondition: opcode() is one of kBatchNormTraining, kBatchNormInference,
+  // or kBatchNormGrad.
   int64 feature_index() const { return feature_index_; }
 
   // Returns a epsilon value associated with the instruction. The is a small
   // number added to the variance to avoid divide-by-zero error.
   //
-  // Precondition: opcode() == HloOpcode::kBatchNormTraining
+  // Precondition: opcode() is one of kBatchNormTraining, kBatchNormInference,
+  // or kBatchNormGrad.
   float epsilon() const { return epsilon_; }
 
   // Returns the infeed configuration string. The infeed configuration includes
@@ -749,8 +880,8 @@ class HloInstruction {
   // Returns true if this instruction is a fusion instruction that generates
   // multiple outputs.
   const bool IsMultiOutputFusion() const {
-    return (opcode() == HloOpcode::kFusion &&
-            fused_expression_root()->opcode() == HloOpcode::kTuple);
+    return opcode() == HloOpcode::kFusion &&
+           fused_expression_root()->opcode() == HloOpcode::kTuple;
   }
 
   FusionKind fusion_kind() const {
@@ -856,6 +987,17 @@ class HloInstruction {
   }
   const std::vector<int64>& slice_strides() const { return slice_strides_; }
 
+  // Returns the flag that describes whether a slice must be lowered into an
+  // offset into the original operand.
+  bool IsInPlaceSlice() const { return is_in_place_slice_; }
+
+  // Sets and returns the flag that describes whether a slice must be lowered
+  // into an offset into the original operand.
+  bool SetIsInPlaceSlice(bool value) {
+    is_in_place_slice_ = value;
+    return value;
+  }
+
   // Returns the size of the slice in the given dimension for a dynamic
   // slice node.
   //
@@ -905,16 +1047,45 @@ class HloInstruction {
     return *padding_config_;
   }
 
-  // Returns data on the dimension numbers used for a convolution
-  // operation.
+  // Returns data on the dimension numbers used for a convolution operation,
+  // which may be a kConvolution instruction or a kCustomCall that implements a
+  // convolution.
   const ConvolutionDimensionNumbers& convolution_dimension_numbers() const {
     CHECK(convolution_dimension_numbers_ != nullptr);
     return *convolution_dimension_numbers_;
   }
 
+  // Sets the convolution dimension numbers on this instruction.  In general you
+  // shouldn't need to call this; instead, specify the convolution dimension
+  // numbers when you create the instruction.
+  void set_convolution_dimension_numbers(
+      const ConvolutionDimensionNumbers& dnums) {
+    convolution_dimension_numbers_ =
+        MakeUnique<ConvolutionDimensionNumbers>(dnums);
+  }
+
+  FftType fft_type() const {
+    CHECK_EQ(HloOpcode::kFft, opcode_);
+    return fft_type_;
+  }
+
+  const std::vector<int64>& fft_length() const {
+    CHECK_EQ(HloOpcode::kFft, opcode_);
+    return fft_length_;
+  }
+
   // Returns the dump string of the convolution dimension numbers.
   string ConvolutionDimensionNumbersToString() const;
 
+  // Returns data on the dimension numbers used for a dot operation.
+  const DotDimensionNumbers& dot_dimension_numbers() const {
+    CHECK(dot_dimension_numbers_ != nullptr);
+    return *dot_dimension_numbers_;
+  }
+
+  // Returns the dump string of the dot dimension numbers.
+  string DotDimensionNumbersToString() const;
+
   // Returns the random distribution for this rng node.
   //
   // Precondition: opcode() == HloOpcode::kRng
@@ -1006,10 +1177,9 @@ class HloInstruction {
   std::tuple<bool, std::vector<int64>, std::vector<int64>>
   ReshapeMerelyInsertsOrDeletes1SizedDimensions() const;
 
-  // Returns a string identifier for this instruction. If no string identifier
-  // has been explicitly set, then the identifier is the serialized pointer to
-  // this instruction.
+  // Gets/sets the string identifier for this instruction.
   const string& name() const { return name_; }
+  void set_name(tensorflow::StringPiece name) { name_ = name.ToString(); }
 
   // Use the given NameUniquer to select a unique name for the instruction based
   // on the instruction's existing name.
@@ -1068,10 +1238,14 @@ class HloInstruction {
   class FusionReusesParamElements;
 
   // See comments on Identical().
+  // eq_shapes() is used to check shapes for equality, and would normally be
+  // expected to be ShapeUtil::Equals or ShapeUtil::Compatible, depending on
+  // whether we want a layout-sensitive check or not.
   bool IdenticalSlowPath(
       const HloInstruction& other,
-      std::function<bool(const HloComputation*, const HloComputation*)>
-          eq_computations) const;
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations,
+      const std::function<bool(const Shape&, const Shape&)>& eq_shapes) const;
 
   // Creates an n-ary elementwise operation.
   static std::unique_ptr<HloInstruction> CreateNary(
@@ -1173,11 +1347,23 @@ class HloInstruction {
   // Describes the dimension numbers used for a convolution.
   std::unique_ptr<ConvolutionDimensionNumbers> convolution_dimension_numbers_;
 
+  // Describes the dimension numbers used for a dot.
+  std::unique_ptr<DotDimensionNumbers> dot_dimension_numbers_;
+
+  // Describes FFT type for an FFT instruction.
+  FftType fft_type_ = FftType::FFT;
+
+  // Indicates the FFT length for an FFT instruction.
+  std::vector<int64> fft_length_;
+
   // Describes the [begin, end) index range for a slice.
   std::vector<int64> slice_starts_;
   std::vector<int64> slice_limits_;
   std::vector<int64> slice_strides_;
 
+  // Describes whether the slice can be lowered to an offset into the operand.
+  bool is_in_place_slice_ = false;
+
   // The bit sizes for a reduce-precision operation.
   int32 exponent_bits_ = 0;
   int32 mantissa_bits_ = 0;
@@ -1198,7 +1384,6 @@ class HloInstruction {
 
   // For parameter instructions this field holds the parameter number.
   int64 parameter_number_ = 0;
-  string parameter_name_;
 
   // Name of a global symbol to call, only present for kCustomCall.
   string custom_call_target_;
@@ -1267,9 +1452,12 @@ string ToString(HloInstruction::FusionKind kind);
 StatusOr<HloInstruction::FusionKind> StringToFusionKind(
     const string& kind_name);
 
-// Custom stringification functions for protos that live inside HloInstruction.
+// Custom (de)stringification functions for protos that live inside
+// HloInstruction.
 string PaddingConfigToString(const PaddingConfig& padding);
 string OpMetadataToString(const OpMetadata& metadata);
+string RandomDistributionToString(const RandomDistribution& distribution);
+StatusOr<RandomDistribution> StringToRandomDistribution(const string& name);
 
 std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind);
 
@@ -1295,6 +1483,10 @@ template <typename ValueT>
 using ConstHloInstructionMap =
     std::map<const HloInstruction*, ValueT, HloPtrComparator>;
 
+using HloInstructionSet = std::set<HloInstruction*, HloPtrComparator>;
+using ConstHloInstructionSet =
+    std::set<const HloInstruction*, HloPtrComparator>;
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INSTRUCTION_H_
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index 76b12fc8d3aadc0a874ce059851666fbcd6a4e94..94e9bfe56eb445ec0b459a55342cd3cc4c6f68ef 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -712,8 +712,8 @@ TEST_F(HloInstructionTest, PreserveOutfeedShapeThroughClone) {
           {1, 2},
           {3, 4},
       })));
-  auto shape10 = ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {1, 0});
-  auto shape01 = ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {0, 1});
+  auto shape10 = ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {1, 0});
+  auto shape01 = ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {0, 1});
   auto outfeed10 = builder.AddInstruction(
       HloInstruction::CreateOutfeed(shape10, constant, ""));
   auto outfeed01 = builder.AddInstruction(
@@ -825,17 +825,42 @@ TEST_F(HloInstructionTest, ComplexFusionOp) {
   EXPECT_THAT(c1->users(), ElementsAre(fusion));
 }
 
-// Convenience function for comparing two HloInstructions inside of
-// std::unique_ptrs.
-static bool Identical(std::unique_ptr<HloInstruction> instruction1,
-                      std::unique_ptr<HloInstruction> instruction2) {
+// Convenience function for comparing two HloInstructions.
+static bool Identical(const HloInstruction& instruction1,
+                      const HloInstruction& instruction2) {
   // Verify Identical is reflexive for both instructions.
-  EXPECT_TRUE(instruction1->Identical(*instruction1));
-  EXPECT_TRUE(instruction2->Identical(*instruction2));
+  EXPECT_TRUE(instruction1.Identical(instruction1));
+  EXPECT_TRUE(instruction2.Identical(instruction2));
 
-  bool is_equal = instruction1->Identical(*instruction2);
+  bool is_equal = instruction1.Identical(instruction2);
   // Verify Identical is symmetric.
-  EXPECT_EQ(is_equal, instruction2->Identical(*instruction1));
+  EXPECT_EQ(is_equal, instruction2.Identical(instruction1));
+  return is_equal;
+}
+
+// Convenience function for comparing two HloInstructions for structural
+// equality.
+static bool StructuralEqual(const HloInstruction& instruction1,
+                            const HloInstruction& instruction2) {
+  auto eq_operand_shapes = [](const HloInstruction* a,
+                              const HloInstruction* b) {
+    return ShapeUtil::Equal(a->shape(), b->shape());
+  };
+  auto eq_computations = [](const HloComputation* a, const HloComputation* b) {
+    return *a == *b;
+  };
+
+  // Verify Identical is reflexive for both instructions.
+  EXPECT_TRUE(
+      instruction1.Identical(instruction1, eq_operand_shapes, eq_computations));
+  EXPECT_TRUE(
+      instruction2.Identical(instruction2, eq_operand_shapes, eq_computations));
+
+  bool is_equal =
+      instruction1.Identical(instruction2, eq_operand_shapes, eq_computations);
+  // Verify Identical is symmetric.
+  EXPECT_EQ(is_equal, instruction2.Identical(instruction1, eq_operand_shapes,
+                                             eq_computations));
   return is_equal;
 }
 
@@ -858,42 +883,42 @@ TEST_F(HloInstructionTest, IdenticalInstructions) {
 
   // Operations which only depend on their operands and opcode.
   EXPECT_TRUE(
-      Identical(HloInstruction::CreateUnary(shape, HloOpcode::kCopy, op1),
-                HloInstruction::CreateUnary(shape, HloOpcode::kCopy, op1)));
+      Identical(*HloInstruction::CreateUnary(shape, HloOpcode::kCopy, op1),
+                *HloInstruction::CreateUnary(shape, HloOpcode::kCopy, op1)));
   EXPECT_FALSE(
-      Identical(HloInstruction::CreateUnary(shape, HloOpcode::kCopy, op1),
-                HloInstruction::CreateUnary(shape, HloOpcode::kCopy, op2)));
+      Identical(*HloInstruction::CreateUnary(shape, HloOpcode::kCopy, op1),
+                *HloInstruction::CreateUnary(shape, HloOpcode::kCopy, op2)));
   EXPECT_FALSE(
-      Identical(HloInstruction::CreateUnary(shape, HloOpcode::kCopy, op1),
-                HloInstruction::CreateUnary(shape, HloOpcode::kNegate, op1)));
+      Identical(*HloInstruction::CreateUnary(shape, HloOpcode::kCopy, op1),
+                *HloInstruction::CreateUnary(shape, HloOpcode::kNegate, op1)));
 
   // Tuples.
-  EXPECT_TRUE(Identical(HloInstruction::CreateTuple({op1, op2}),
-                        HloInstruction::CreateTuple({op1, op2})));
-  EXPECT_FALSE(Identical(HloInstruction::CreateTuple({op1, op2}),
-                         HloInstruction::CreateTuple({op2, op1})));
+  EXPECT_TRUE(Identical(*HloInstruction::CreateTuple({op1, op2}),
+                        *HloInstruction::CreateTuple({op1, op2})));
+  EXPECT_FALSE(Identical(*HloInstruction::CreateTuple({op1, op2}),
+                         *HloInstruction::CreateTuple({op2, op1})));
 
   // Broadcasts.
-  EXPECT_TRUE(Identical(HloInstruction::CreateBroadcast(shape, op1, {0, 1}),
-                        HloInstruction::CreateBroadcast(shape, op1, {0, 1})));
-  EXPECT_FALSE(Identical(HloInstruction::CreateBroadcast(shape, op1, {0, 1}),
-                         HloInstruction::CreateBroadcast(shape, op1, {1, 0})));
+  EXPECT_TRUE(Identical(*HloInstruction::CreateBroadcast(shape, op1, {0, 1}),
+                        *HloInstruction::CreateBroadcast(shape, op1, {0, 1})));
+  EXPECT_FALSE(Identical(*HloInstruction::CreateBroadcast(shape, op1, {0, 1}),
+                         *HloInstruction::CreateBroadcast(shape, op1, {1, 0})));
   Shape bcast_shape1 = ShapeUtil::MakeShape(F32, {2, 2, 42});
   Shape bcast_shape2 = ShapeUtil::MakeShape(F32, {2, 2, 123});
   EXPECT_FALSE(
-      Identical(HloInstruction::CreateBroadcast(bcast_shape1, op1, {0, 1}),
-                HloInstruction::CreateBroadcast(bcast_shape2, op1, {0, 1})));
+      Identical(*HloInstruction::CreateBroadcast(bcast_shape1, op1, {0, 1}),
+                *HloInstruction::CreateBroadcast(bcast_shape2, op1, {0, 1})));
 
   // Binary operands.
   EXPECT_TRUE(Identical(
-      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, op1, op2),
-      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, op1, op2)));
+      *HloInstruction::CreateBinary(shape, HloOpcode::kAdd, op1, op2),
+      *HloInstruction::CreateBinary(shape, HloOpcode::kAdd, op1, op2)));
   EXPECT_FALSE(Identical(
-      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, op1, op2),
-      HloInstruction::CreateBinary(shape, HloOpcode::kDivide, op2, op1)));
+      *HloInstruction::CreateBinary(shape, HloOpcode::kAdd, op1, op2),
+      *HloInstruction::CreateBinary(shape, HloOpcode::kDivide, op2, op1)));
   EXPECT_FALSE(Identical(
-      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, op1, op2),
-      HloInstruction::CreateBinary(shape, HloOpcode::kDivide, op1, op2)));
+      *HloInstruction::CreateBinary(shape, HloOpcode::kAdd, op1, op2),
+      *HloInstruction::CreateBinary(shape, HloOpcode::kDivide, op1, op2)));
 }
 
 TEST_F(HloInstructionTest, FunctionVisitor) {
@@ -1068,8 +1093,11 @@ TEST_F(HloInstructionTest, CloneOfFusionPreservesShape) {
       builder.AddInstruction(HloInstruction::CreateParameter(1, s2, "y"));
   HloInstruction* reshape =
       builder.AddInstruction(HloInstruction::CreateTranspose(s2t, y, {1, 0}));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   HloInstruction* dot = builder.AddInstruction(
-      HloInstruction::CreateBinary(sout, HloOpcode::kDot, x, reshape));
+      HloInstruction::CreateDot(sout, x, reshape, dot_dnums));
 
   HloModule module(TestName());
   auto* computation = module.AddEntryComputation(builder.Build());
@@ -1086,49 +1114,71 @@ TEST_F(HloInstructionTest, CloneOfFusionPreservesShape) {
       ShapeUtil::Equal(root->operand(1)->shape(), root2->operand(1)->shape()));
   EXPECT_TRUE(ShapeUtil::Equal(root->operand(1)->operand(0)->shape(),
                                root2->operand(1)->operand(0)->shape()));
+  EXPECT_TRUE(StructuralEqual(*fusion, *fusion2));
 }
 
-TEST_F(HloInstructionTest, IsRandomFusable) {
-  auto shape = ShapeUtil::MakeShape(F32, {2, 2});
-  {
-    auto builder = HloComputation::Builder(TestName());
-    auto hlo_module = CreateNewModule();
-    auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
-        Literal::CreateR0<float>(0.0)));
-    auto const1 = builder.AddInstruction(HloInstruction::CreateConstant(
-        Literal::CreateR0<float>(1.0)));
-    auto rng = builder.AddInstruction(HloInstruction::CreateRng(
-        shape, RandomDistribution::RNG_NORMAL, {const0, const1}));
-
-    auto* computation = hlo_module->AddEntryComputation(builder.Build());
-    computation->CreateFusionInstruction({rng, const0, const1},
-      HloInstruction::FusionKind::kLoop);
-
-    auto* root = computation->root_instruction();
-
-    EXPECT_EQ(HloOpcode::kFusion, root->opcode());
-  }
-  {
-    auto builder = HloComputation::Builder(TestName());
-    auto hlo_module = CreateNewModule();
-    auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
-        Literal::CreateR0<float>(0.0)));
-    auto const1 = builder.AddInstruction(HloInstruction::CreateConstant(
-        Literal::CreateR0<float>(1.0)));
-    auto rng = builder.AddInstruction(HloInstruction::CreateRng(
-        shape, RandomDistribution::RNG_NORMAL, {const0, const1}));
-    builder.AddInstruction(HloInstruction::CreateUnary(
-        shape, HloOpcode::kNegate, rng));
-    auto* computation = hlo_module->AddEntryComputation(builder.Build());
-    computation->CreateFusionInstruction({rng, const0, const1},
-      HloInstruction::FusionKind::kLoop);
-
-    auto* root = computation->root_instruction();
-
-    EXPECT_EQ(HloOpcode::kFusion, root->operand(0)->opcode());
-  }
+TEST_F(HloInstructionTest, FusionEquality) {
+  HloModule module(TestName());
+  HloComputation::Builder builder(TestName());
+
+  // Create two fusion instructions containing a single unary operation.
+  auto parameter =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "x"));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, parameter));
+  auto neg = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, parameter));
+  auto* computation = module.AddEntryComputation(builder.Build());
+  auto* fusion = computation->CreateFusionInstruction(
+      {exp}, HloInstruction::FusionKind::kLoop);
+  auto* fusion2 = computation->CreateFusionInstruction(
+      {neg}, HloInstruction::FusionKind::kLoop);
+  EXPECT_FALSE(StructuralEqual(*fusion, *fusion2));
+
+  auto clone = fusion->Clone();
+  EXPECT_TRUE(StructuralEqual(*fusion, *clone));
 }
 
+TEST_F(HloInstructionTest, NestedFusionEquality) {
+  HloModule module(TestName());
+  HloComputation::Builder builder(TestName());
+
+  // Build a nested fusion computation.
+  Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});
+  auto a = builder.AddInstruction(HloInstruction::CreateConstant(
+      Literal::CreateR2<float>({{1.0, 0.0}, {0.0, 1.0}})));
+  auto b = builder.AddInstruction(HloInstruction::CreateConstant(
+      Literal::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
+  auto b_t = builder.AddInstruction(
+      HloInstruction::CreateTranspose(data_shape, b, {1, 0}));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto dot = builder.AddInstruction(
+      HloInstruction::CreateDot(data_shape, a, b_t, dot_dnums));
+  auto one = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  auto add_operand = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape, one, {1}));
+  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+      data_shape, HloOpcode::kAdd, dot, add_operand));
+  auto sub = builder.AddInstruction(HloInstruction::CreateBinary(
+      data_shape, HloOpcode::kSubtract, dot, add_operand));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(data_shape, HloOpcode::kMultiply, add, sub));
+  auto computation = module.AddEntryComputation(builder.Build());
+
+  auto nested_fusion = computation->CreateFusionInstruction(
+      {dot, b_t}, HloInstruction::FusionKind::kTransposeDot);
+
+  auto fusion = computation->CreateFusionInstruction(
+      {add, nested_fusion}, HloInstruction::FusionKind::kOutput);
+  auto fusion2 = computation->CreateFusionInstruction(
+      {sub, nested_fusion}, HloInstruction::FusionKind::kOutput);
+  auto clone = fusion->Clone();
+  EXPECT_TRUE(StructuralEqual(*fusion, *clone));
+  EXPECT_FALSE(StructuralEqual(*fusion, *fusion2));
+}
 
 TEST_F(HloInstructionTest, CloneSuffixNames) {
   // Test that the suffix string added to cloned instructions is not
@@ -1169,7 +1219,7 @@ TEST_F(HloInstructionTest, CloneSuffixNames) {
 }
 
 TEST_F(HloInstructionTest, Stringification) {
-  // Tests stringification of a simple op, fusion, and while.
+  // Tests stringification of a simple op, fusion, while, and conditional.
   const Shape s1 = ShapeUtil::MakeShape(F32, {5, 10});
   const Shape s2 = ShapeUtil::MakeShape(F32, {20, 10});
   const Shape s2t = ShapeUtil::MakeShape(F32, {10, 20});
@@ -1182,12 +1232,17 @@ TEST_F(HloInstructionTest, Stringification) {
       builder.AddInstruction(HloInstruction::CreateParameter(1, s2, "y"));
   HloInstruction* reshape =
       builder.AddInstruction(HloInstruction::CreateTranspose(s2t, y, {1, 0}));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   HloInstruction* dot = builder.AddInstruction(
-      HloInstruction::CreateBinary(sout, HloOpcode::kDot, x, reshape));
+      HloInstruction::CreateDot(sout, x, reshape, dot_dnums));
+
+  auto options = HloPrintOptions().set_print_metadata(false);
 
-  EXPECT_EQ(dot->ToString(false, false),
+  EXPECT_EQ(dot->ToString(options),
             "%dot = f32[5,20]{1,0} dot(f32[5,10]{1,0} %x, f32[10,20]{1,0} "
-            "%transpose)");
+            "%transpose), lhs_contracting_dims={1}, rhs_contracting_dims={0}");
 
   HloModule module(TestName());
   auto* computation = module.AddEntryComputation(builder.Build());
@@ -1195,15 +1250,25 @@ TEST_F(HloInstructionTest, Stringification) {
       {dot, reshape}, HloInstruction::FusionKind::kTransposeDot);
 
   EXPECT_EQ(
-      fusion->ToString(false, false),
-      "%fusion = f32[5,20]{1,0} fusion(f32[5,10]{1,0} %x, "
+      fusion->ToString(options),
+      "%dot_fusion = f32[5,20]{1,0} fusion(f32[5,10]{1,0} %x, "
       "f32[20,10]{1,0} %y), kind=kTransposeDot, calls=%fused_computation");
 
   HloInstruction* loop = builder.AddInstruction(
       HloInstruction::CreateWhile(sout, computation, computation, x));
-  EXPECT_EQ(loop->ToString(false, false),
+  EXPECT_EQ(loop->ToString(options),
             "%while = f32[5,20]{1,0} while(f32[5,10]{1,0} %x), "
             "condition=%TransposeDot, body=%TransposeDot");
+
+  auto pred = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
+  HloInstruction* conditional =
+      builder.AddInstruction(HloInstruction::CreateConditional(
+          sout, pred, x, computation, x, computation));
+  EXPECT_EQ(conditional->ToString(options),
+            "%conditional = f32[5,20]{1,0} conditional(pred[] %constant, "
+            "f32[5,10]{1,0} %x, f32[5,10]{1,0} %x), "
+            "true_computation=%TransposeDot, false_computation=%TransposeDot");
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc
index 4255d6086625dfb9a045e4431e968a5ee0106ac7..bc74c4bc10cad20eab20b5caf8550b17048a5276 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers.cc
@@ -102,6 +102,36 @@ bool HloGetTupleElementMatcher::MatchAndExplain(
   return true;
 }
 
+void HloCustomCallMatcher::DescribeTo(std::ostream* os) const {
+  HloMatcher::DescribeTo(os);
+  *os << " with call target that ";
+  call_target_matcher_.DescribeTo(os);
+}
+
+bool HloCustomCallMatcher::MatchAndExplain(
+    const HloInstruction* instruction,
+    ::testing::MatchResultListener* listener) const {
+  if (!HloMatcher::MatchAndExplain(instruction, listener)) {
+    return false;
+  }
+  ::testing::StringMatchResultListener sub_listener;
+  bool result = ExplainMatchResult(
+      call_target_matcher_, instruction->custom_call_target(), &sub_listener);
+  if (sub_listener.str().empty()) {
+    sub_listener << " that ";
+
+    std::stringstream desc_stream;
+    if (result) {
+      call_target_matcher_.DescribeTo(&desc_stream);
+    } else {
+      call_target_matcher_.DescribeNegationTo(&desc_stream);
+    }
+    sub_listener << desc_stream.str();
+  }
+  *listener << "custom-call with call target" << sub_listener.str();
+  return result;
+}
+
 }  // namespace testing
 
 void PrintTo(const HloInstruction* inst, ::std::ostream* os) {
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index 992f55788b4900949f4994ba5b7be015bcd0d3de..103f04a2cb7a1a5ae877d8bf259692f7cbed3408 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -56,8 +56,8 @@ class HloParameterMatcher : public HloMatcher {
 // index to match.
 class HloGetTupleElementMatcher : public HloMatcher {
  public:
-  explicit HloGetTupleElementMatcher(
-      ::testing::Matcher<const HloInstruction*> operand, int64 tuple_index)
+  HloGetTupleElementMatcher(::testing::Matcher<const HloInstruction*> operand,
+                            int64 tuple_index)
       : HloMatcher(HloOpcode::kGetTupleElement, /*operands=*/{operand}),
         tuple_index_(tuple_index) {}
 
@@ -68,6 +68,24 @@ class HloGetTupleElementMatcher : public HloMatcher {
   int64 tuple_index_;
 };
 
+// Custom matcher for custom-call instructions, which accepts a matcher for its
+// call target.
+class HloCustomCallMatcher : public HloMatcher {
+ public:
+  HloCustomCallMatcher(
+      ::testing::Matcher<string> call_target_matcher,
+      std::vector<::testing::Matcher<const HloInstruction*>> operands)
+      : HloMatcher(HloOpcode::kCustomCall, operands),
+        call_target_matcher_(call_target_matcher) {}
+
+  bool MatchAndExplain(const HloInstruction* instruction,
+                       ::testing::MatchResultListener* listener) const override;
+  void DescribeTo(std::ostream* os) const override;
+
+ private:
+  ::testing::Matcher<string> call_target_matcher_;
+};
+
 // HloInstruction* matchers for opcode and operands. Example:
 //   namespace op = xla::opcode_matchers;
 //   EXPECT_THAT(instruction,
@@ -83,6 +101,7 @@ HLO_MATCHER(Abs);
 HLO_MATCHER(Add);
 HLO_MATCHER(Bitcast);
 HLO_MATCHER(Broadcast);
+HLO_MATCHER(BatchNormGrad);
 HLO_MATCHER(Call);
 HLO_MATCHER(Ceil);
 HLO_MATCHER(Clamp);
@@ -93,7 +112,6 @@ HLO_MATCHER(Convert);
 HLO_MATCHER(Convolution);
 HLO_MATCHER(Copy);
 HLO_MATCHER(CrossReplicaSum);
-HLO_MATCHER(CustomCall);
 HLO_MATCHER(Divide);
 HLO_MATCHER(Dot);
 HLO_MATCHER(DynamicSlice);
@@ -183,6 +201,36 @@ inline ::testing::Matcher<const ::xla::HloInstruction*> GetTupleElement() {
       new ::xla::testing::HloMatcher(HloOpcode::kGetTupleElement, {}));
 }
 
+// - CustomCall(T, operand1, ..., operandN) matches a CustomCall with call
+//   target T and the given operands.
+//
+// - CustomCall(operand1, ..., operandN) matches any CustomCall HLO with the
+//   given operands.
+//
+// - CustomCall() matches any CustomCall HLO at all.
+template <typename... M>
+inline ::testing::Matcher<const ::xla::HloInstruction*> CustomCall(
+    ::testing::Matcher<string> call_target_matcher, M... operands) {
+  return ::testing::MakeMatcher(new ::xla::testing::HloCustomCallMatcher(
+      call_target_matcher, {operands...}));
+}
+// This overload of CustomCall(A, B, C, ...) exists iff A is not convertible to
+// ::testing::Matcher<string>.  In that case, we want to prefer the overload
+// above.
+template <typename FirstM, typename... M,
+          typename Dummy = typename std::enable_if<
+              !std::is_convertible<FirstM, ::testing::Matcher<string>>::value,
+              void>::type*>
+inline ::testing::Matcher<const ::xla::HloInstruction*> CustomCall(
+    FirstM operands_first, M... operands_rest) {
+  return ::testing::MakeMatcher(new ::xla::testing::HloMatcher(
+      HloOpcode::kCustomCall, {operands_first, operands_rest...}));
+}
+inline ::testing::Matcher<const ::xla::HloInstruction*> CustomCall() {
+  return ::testing::MakeMatcher(
+      new ::xla::testing::HloMatcher(HloOpcode::kCustomCall, {}));
+}
+
 #undef HLO_MATCHER
 }  // namespace opcode_matchers
 
diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
index 1465d1cacdc971a04c620bc48bed33239a67a955..1c21703a45e11914854153bc14fabd85e9ea57f2 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
@@ -23,6 +23,12 @@ using ::testing::Eq;
 namespace xla {
 namespace {
 
+string DescribeHloMatcher(const ::testing::Matcher<const HloInstruction*>& m) {
+  std::stringstream ss;
+  m.DescribeTo(&ss);
+  return ss.str();
+}
+
 template <typename M, typename T>
 string Explain(const T& t, const M& m) {
   ::testing::StringMatchResultListener listener;
@@ -67,5 +73,32 @@ TEST(HloMatchersTest, Test) {
          "add"));
 }
 
+TEST(HloMatchersTest, CustomCallMatcher) {
+  auto c1 = HloInstruction::CreateConstant(Literal::CreateR1<float>({1, 2, 3}));
+  auto c2 = HloInstruction::CreateConstant(Literal::CreateR1<int32>({1, 2, 3}));
+  auto call = HloInstruction::CreateCustomCall(
+      ShapeUtil::MakeShape(F32, {1}), {c1.get(), c2.get()}, "foo_target");
+
+  EXPECT_THAT(call.get(), op::CustomCall());
+  EXPECT_THAT(call.get(), op::CustomCall(c1.get(), c2.get()));
+  EXPECT_THAT(call.get(), op::CustomCall("foo_target"));
+  EXPECT_THAT(call.get(), op::CustomCall("foo_target", c1.get(), c2.get()));
+  EXPECT_THAT(call.get(), op::CustomCall(::testing::StartsWith("foo")));
+  EXPECT_THAT(call.get(),
+              op::CustomCall(::testing::Not(::testing::StartsWith("bar"))));
+
+  // Wrong number of operands.
+  EXPECT_THAT(call.get(), ::testing::Not(op::CustomCall(c1.get())));
+
+  // Call target does not match.
+  EXPECT_THAT(call.get(),
+              ::testing::Not(op::CustomCall(::testing::StartsWith("bar"))));
+
+  EXPECT_THAT(Explain(call.get(), op::CustomCall("bar")),
+              R"(custom-call with call target that isn't equal to "bar")");
+  EXPECT_THAT(DescribeHloMatcher(op::CustomCall("foo_target")),
+              R"(custom-call with call target that is equal to "foo_target")");
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index faaf73ea1ce5c77b0522cb3276b4efd78aabde16..60270b0595dcfca8f1fcea5ab0914428880f35b5 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -35,14 +35,19 @@ namespace xla {
 HloModule::HloModule(const string& name,
                      const VersionedComputationHandle& entry_computation_handle,
                      const HloModuleConfig& config)
-    : name_(name),
+    : name_(NameUniquer::GetSanitizedName(name)),
       config_(config),
       has_entry_computation_handle_(true),
-      entry_computation_handle_(entry_computation_handle) {}
+      entry_computation_handle_(entry_computation_handle),
+      unique_id_(next_unique_module_id_++) {}
 
-HloModule::HloModule(const string& name) : name_(name) {}
+HloModule::HloModule(const string& name)
+    : name_(NameUniquer::GetSanitizedName(name)),
+      unique_id_(next_unique_module_id_++) {}
 HloModule::HloModule(const string& name, const HloModuleConfig& config)
-    : name_(name), config_(config) {}
+    : name_(NameUniquer::GetSanitizedName(name)),
+      config_(config),
+      unique_id_(next_unique_module_id_++) {}
 
 HloComputation* HloModule::AddComputationInternal(
     std::unique_ptr<HloComputation> computation, bool is_entry,
@@ -170,17 +175,14 @@ void HloModule::ReplaceComputations(
   computations_ = std::move(new_computations);
 }
 
-string HloModule::ToString(bool include_large_constants) const {
+string HloModule::ToString(const HloPrintOptions& options) const {
   std::ostringstream s;
-  s << "HloModule " << name() << ":\n\n";
+  s << "HloModule " << name() << "\n\n";
   for (const HloComputation* computation : MakeComputationPostOrder()) {
     if (computation == entry_computation()) {
       s << "ENTRY ";
     }
-    s << computation->ToString(
-             /*nested_level=*/0,
-             /*include_large_constants=*/include_large_constants)
-      << "\n\n";
+    s << computation->ToString(options) << "\n\n";
   }
   return s.str();
 }
@@ -232,8 +234,8 @@ StatusOr<ProgramShape> ProgramShapeFromProto(const HloModuleProto& module) {
           << "Entry computation has more than one parameter instruction "
              "with parameter number "
           << instruction.parameter_number();
-      parameters[instruction.parameter_number()] = {
-          instruction.parameter_name(), &instruction.shape()};
+      parameters[instruction.parameter_number()] = {instruction.name(),
+                                                    &instruction.shape()};
     }
   }
   TF_RET_CHECK(root != nullptr)
@@ -459,6 +461,14 @@ HloInstruction* HloModule::OutlineExpressionFromComputation(
   return call;
 }
 
+int64 HloModule::instruction_count() const {
+  int64 n = 0;
+  for (const auto& computation : computations_) {
+    n += computation->instruction_count();
+  }
+  return n;
+}
+
 std::list<HloComputation*> HloModule::MakeComputationPostOrder() const {
   // First determine all root computations by building a set of nonroot
   // computations (computations which are called by an instruction in the
@@ -517,7 +527,15 @@ std::unique_ptr<HloModule> HloModule::Clone(const string& suffix) const {
 
   std::unordered_map<HloComputation*, HloComputation*> clone_map;
   for (auto& computation : computations_) {
-    auto cloned_computation = computation->Clone(suffix);
+    if (computation->IsFusionComputation()) {
+      // Cloning of a fused computation is handled by its fusion instruction.
+      continue;
+    }
+
+    // When cloning a computation, pass in the new module, so that for any
+    // fusion instruction in this computation, the fused computation will be
+    // deep cloned to the new module.
+    auto cloned_computation = computation->Clone(suffix, module.get());
     InsertOrDie(&clone_map, computation.get(), cloned_computation.get());
 
     if (entry_computation_ == computation.get()) {
@@ -531,8 +549,15 @@ std::unique_ptr<HloModule> HloModule::Clone(const string& suffix) const {
     for (auto* instruction : cloned_computation->instructions()) {
       // Rewrite instruction's called_computation to point to the cloned
       // computations.
-      instruction->ReplaceCalledComputations(
-          [&](HloComputation* hlo) { return FindOrDie(clone_map, hlo); });
+      instruction->ReplaceCalledComputations([&](HloComputation* hlo) {
+        if (hlo->IsFusionComputation()) {
+          // Cloning of a fused computation has already been handled when its
+          // fusion instruction is cloned. So this hlo computation is already
+          // the cloned one.
+          return hlo;
+        }
+        return FindOrDie(clone_map, hlo);
+      });
     }
   }
   return module;
@@ -543,4 +568,6 @@ uint64 HloModule::RandomNew64() const {
   return rng_();
 }
 
+/* static */ std::atomic<int> HloModule::next_unique_module_id_(0);
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index 5141e7bc8d4cf0ef4cd83310772e0c5d66b5da12..4bfe8d89ce0a285de6d05d4867aaa6b266d78d12 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MODULE_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MODULE_H_
 
+#include <atomic>
 #include <list>
 #include <memory>
 #include <random>
@@ -98,6 +99,10 @@ class HloModule {
     return config_.mutable_entry_computation_layout();
   }
 
+  ComputationLayout entry_computation_layout() const {
+    return config_.entry_computation_layout();
+  }
+
   const VersionedComputationHandle& entry_computation_handle() const {
     return entry_computation_handle_;
   }
@@ -125,6 +130,9 @@ class HloModule {
   // Gets the number of computations in this module.
   int64 computation_count() const { return computations_.size(); }
 
+  // Gets the number of instructions in this module.
+  int64 instruction_count() const;
+
   // Compute and return a post order of all computations in the module. The sort
   // is defined like so: if computation A has an instruction which calls
   // computation B, then A will appear after B in the sort.
@@ -143,7 +151,12 @@ class HloModule {
 
   const HloModuleConfig& config() const { return config_; }
 
-  string ToString(bool include_large_constants = false) const;
+  // Return a string representation of the module.
+  //
+  // (We express the default options using an overload rather than a default
+  // param because gdb ignores default params, but does resolve overloads.)
+  string ToString() const { return ToString(HloPrintOptions()); }
+  string ToString(const HloPrintOptions& options) const;
 
   // Convert an HloModule to or from a proto.
   HloModuleProto ToProto() const;
@@ -189,6 +202,10 @@ class HloModule {
   // this point are guaranteed to be in the range [0..NumUniqueInstructionIds())
   int NumUniqueInstructionIds() const { return next_unique_id_; }
 
+  // Returns an id that is unique to this module across all modules created over
+  // the lifetime of this process.
+  int unique_id() const { return unique_id_; }
+
  private:
   HloComputation* AddComputationInternal(
       std::unique_ptr<HloComputation> computation, bool is_entry,
@@ -215,6 +232,11 @@ class HloModule {
   NameUniquer computation_name_uniquer_{/*separator=*/"."};
   NameUniquer instruction_name_uniquer_{/*separator=*/"."};
   int next_unique_id_ = 0;
+
+  // Used to keep track of the next unique module id that should be assigned.
+  static std::atomic<int> next_unique_module_id_;
+  // A unique id to label modules with.
+  int unique_id_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module_test.cc b/tensorflow/compiler/xla/service/hlo_module_test.cc
index bf6440d66cac0d3a929c377202b212aba262f887..7f28a804bfec9c2f1bbb5fa08f7dd4e68be14d35 100644
--- a/tensorflow/compiler/xla/service/hlo_module_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_test.cc
@@ -105,6 +105,48 @@ TEST_F(HloModuleTest, CloneTest) {
   }
 }
 
+TEST_F(HloModuleTest, CloneHasFusion) {
+  auto module = CreateNewModule();
+
+  // Create the fused computation.
+  HloComputation* fused_computation;
+  {
+    auto b = HloComputation::Builder("Fused");
+    auto x = b.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "x"));
+    b.AddInstruction(
+        HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, x, x));
+    fused_computation = module->AddEmbeddedComputation(b.Build());
+  }
+
+  // Create the entry computation.
+  {
+    auto b = HloComputation::Builder("Entry");
+    auto input = b.AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
+    b.AddInstruction(
+        HloInstruction::CreateFusion(r0f32_, HloInstruction::FusionKind::kInput,
+                                     /*operands=*/{input}, fused_computation));
+    module->AddEntryComputation(b.Build());
+  }
+
+  auto post_order = module->MakeComputationPostOrder();
+  auto cloned_module = module->Clone("copy");
+  auto post_order_copied = cloned_module->MakeComputationPostOrder();
+
+  EXPECT_EQ(post_order.size(), post_order_copied.size());
+  for (auto origin = post_order.begin(), copied = post_order_copied.begin();
+       origin != post_order.end() && copied != post_order_copied.end();
+       ++origin, ++copied) {
+    if ((*origin)->name() == "Fused") {
+      // Clone of the fused computation is handled when its fusion instruction
+      // is cloned, which always use suffix ".clone".
+      EXPECT_EQ((*origin)->name() + ".clone", (*copied)->name());
+    } else {
+      EXPECT_EQ((*origin)->name() + ".copy", (*copied)->name());
+    }
+  }
+}
+
 TEST_F(HloModuleTest, DiamondComputationsPostOrder) {
   // Create a module with a diamond call graph of computations.
   auto module = CreateNewModule();
@@ -135,14 +177,21 @@ TEST_F(HloModuleTest, LargeConstantToString) {
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(
-      "HloModule LargeConstantToString:\n\nENTRY %Constant () -> f32[16] {\n  "
+      "HloModule LargeConstantToString\n\nENTRY %Constant () -> f32[16] {\n  "
       "ROOT %constant = f32[16]{0} constant({...})\n}\n\n",
-      module->ToString(/*include_large_constants=*/false));
+      module->ToString(HloPrintOptions().set_print_large_constants(false)));
+
   EXPECT_EQ(
-      "HloModule LargeConstantToString:\n\nENTRY %Constant () -> f32[16] {\n  "
+      "HloModule LargeConstantToString\n\nENTRY %Constant () -> f32[16] {\n  "
       "ROOT %constant = f32[16]{0} constant({42, 42, 42, 42, 42, 42, 42, 42, "
       "42, 42, 42, 42, 42, 42, 42, 42})\n}\n\n",
-      module->ToString(/*include_large_constants=*/true));
+      module->ToString(HloPrintOptions().set_print_large_constants(true)));
+}
+
+TEST_F(HloModuleTest, UniqueModuleId) {
+  auto module_a = CreateNewModule();
+  auto module_b = CreateNewModule();
+  EXPECT_NE(module_a->unique_id(), module_b->unique_id());
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index f3f79357582ac7661a532e94031acdbca0b86784..3d64523a79fc50638fdf378b5d521a5cd4482b90 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -73,6 +73,7 @@ namespace xla {
   V(kDynamicUpdateSlice, "dynamic-update-slice")             \
   V(kEq, "equal-to", kHloOpcodeIsComparison)                 \
   V(kExp, "exponential")                                     \
+  V(kFft, "fft")                                             \
   V(kFloor, "floor")                                         \
   V(kFusion, "fusion", kHloOpcodeIsVariadic)                 \
   V(kGe, "greater-than-or-equal-to", kHloOpcodeIsComparison) \
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc
index 6f6e679a21870e46da85963c3b2998465ac43420..68e3c9618c1fe9daacb0aee3ee98862c8b9e4bc4 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering.cc
@@ -249,7 +249,7 @@ bool PredecessorHloOrdering::ExecutesBeforeInSameComputation(
 string PredecessorHloOrdering::ToStringHelper(const string& name) const {
   std::vector<string> pieces;
   pieces.push_back(name);
-  for (auto* computation : module_->computations()) {
+  for (auto* computation : module_->MakeNonfusionComputations()) {
     pieces.push_back(tensorflow::strings::Printf("computation %s:",
                                                  computation->name().c_str()));
     const auto all = computation->MakeInstructionPostOrder();
diff --git a/tensorflow/compiler/xla/service/hlo_ordering_test.cc b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
index 33bafd05c15c47abaa313f92eb53a791de43d7d9..aba66114de649ce7667ae77174e9c4073b010b90 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_scheduling.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
@@ -310,5 +311,56 @@ TEST_F(HloOrderingTest, ValuesInWhileComputations) {
       *dataflow));
 }
 
+// Regression test for HloOrdering::ToString() crashing when fed a computation
+// containing a fusion node.
+TEST_F(HloOrderingTest, ToStringDoesNotCrash) {
+  const char* module_str = R"(
+HloModule test_module
+
+body.v8 {
+  prev.1 = (s32[], f32[3]{0}, f32[3]{0}, f32[3]{0}) parameter(0)
+  get-tuple-element.4 = s32[] get-tuple-element(prev.1), index=0
+  constant.1 = s32[] constant(1)
+  add = s32[] add(get-tuple-element.4, constant.1)
+  get-tuple-element.5 = f32[3]{0} get-tuple-element(prev.1), index=3
+  get-tuple-element.6 = f32[3]{0} get-tuple-element(prev.1), index=1
+  get-tuple-element.7 = f32[3]{0} get-tuple-element(prev.1), index=2
+  ROOT tuple = (s32[], f32[3]{0}, f32[3]{0}, f32[3]{0}) tuple(add, get-tuple-element.5, get-tuple-element.6, get-tuple-element.7)
+}
+
+condition.v4 {
+  constant.2 = s32[] constant(2)
+  prev.2 = (s32[], f32[3]{0}, f32[3]{0}, f32[3]{0}) parameter(0)
+  get-tuple-element.8 = s32[] get-tuple-element(prev.2), index=0
+  ROOT greater-than = pred[] greater-than(constant.2, get-tuple-element.8)
+}
+
+fused_computation {
+  get-tuple-element.5.param_1 = f32[3]{0} parameter(1)
+  get-tuple-element.6.param_2 = f32[3]{0} parameter(2)
+  add.4 = f32[3]{0} add(get-tuple-element.5.param_1, get-tuple-element.6.param_2)
+  get-tuple-element.7.param_1.1 = f32[3]{0} parameter(0)
+  ROOT add.5 = f32[3]{0} add(add.4, get-tuple-element.7.param_1.1)
+}
+
+ENTRY while.v11 {
+  constant.5 = s32[] constant(0)
+  constant.6 = f32[3]{0} constant({1, 1, 1})
+  constant.7 = f32[3]{0} constant({2, 2, 2})
+  constant.8 = f32[3]{0} constant({3, 3, 3})
+  tuple.1 = (s32[], f32[3]{0}, f32[3]{0}, f32[3]{0}) tuple(constant.5, constant.6, constant.7, constant.8)
+  while = (s32[], f32[3]{0}, f32[3]{0}, f32[3]{0}) while(tuple.1), condition=condition.v4, body=body.v8
+  get-tuple-element.9 = f32[3]{0} get-tuple-element(while), index=3
+  get-tuple-element.10 = f32[3]{0} get-tuple-element(while), index=1
+  get-tuple-element.11 = f32[3]{0} get-tuple-element(while), index=2
+  ROOT fusion = f32[3]{0} fusion(get-tuple-element.9, get-tuple-element.10, get-tuple-element.11), kind=kLoop, calls=fused_computation
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(module_str));
+  DependencyHloOrdering ordering(module.get());
+  ordering.ToString();  // Shouldn't crash.
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
index 53bd46a641afcba1b9551895955742e74a9f374b..5120775737bfa32bbb656421216f2b3fbef590ea 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <functional>
 
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
+#include "tensorflow/compiler/xla/service/hlo_proto_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -32,12 +33,28 @@ using ::tensorflow::strings::StrCat;
 namespace xla {
 
 namespace {
-void DumpModule(const HloModule& module,
-                const string& message) {
+void DumpModuleGraph(const HloModule& module, const string& message) {
   hlo_graph_dumper::MaybeDumpHloModule(module, message);
   VLOG(3) << "HLO " << message << ":";
   XLA_VLOG_LINES(3, module.ToString());
 }
+
+void DumpModuleProto(const HloModule& module, const string& dump_to,
+                     const string& pipeline_name, const string& pass_name) {
+  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
+  static auto* const module_id_to_pass_number =
+      new tensorflow::gtl::FlatMap<int64, int64>();
+
+  tensorflow::mutex_lock lock(mu);
+  const int64 pass_number = (*module_id_to_pass_number)[module.unique_id()]++;
+
+  const string mod_name = SanitizeFileName(tensorflow::strings::Printf(
+      "module_%04d.%04lld.%s.after_%s", module.unique_id(), pass_number,
+      pipeline_name.c_str(), pass_name.c_str()));
+
+  TF_QCHECK_OK(protobuf_util::DumpProtoToDirectory(MakeHloProto(module),
+                                                   dump_to, mod_name));
+}
 }  // namespace
 
 StatusOr<bool> HloPassPipeline::Run(HloModule* module) {
@@ -78,6 +95,13 @@ StatusOr<bool> HloPassPipeline::Run(HloModule* module) {
   string message;
   TF_RETURN_IF_ERROR(
       run_invariant_checkers(StrCat("before running pipeline: ", name())));
+  const string xla_dump_per_pass_hlo_proto_to =
+      module->config().debug_options().xla_dump_per_pass_hlo_proto_to();
+  if (!xla_dump_per_pass_hlo_proto_to.empty()) {
+    DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to, name().ToString(),
+                    "pipeline_start");
+  }
+
   for (auto& pass : passes_) {
     if (disabled_passes.count(pass->name().ToString()) > 0) {
       VLOG(1) << "  Skipping HLO pass " << pass->name()
@@ -90,17 +114,21 @@ StatusOr<bool> HloPassPipeline::Run(HloModule* module) {
     // Emit label containing: "after foo-pass, before bar-pass".
     message.clear();
     StrAppend(&message, prefix, ", before ", pass->name());
-    DumpModule(*module, message);
+    DumpModuleGraph(*module, message);
 
     TF_ASSIGN_OR_RETURN(bool changed_this_pass, pass->Run(module));
     TF_RETURN_IF_ERROR(
         run_invariant_checkers(StrCat("after running pass: ", pass->name())));
+    if (!xla_dump_per_pass_hlo_proto_to.empty()) {
+      DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to,
+                      name().ToString(), pass->name().ToString());
+    }
 
     changed |= changed_this_pass;
     prefix.clear();
     StrAppend(&prefix, name(), ": after ", pass->name());
   }
-  DumpModule(*module, prefix + ", pipeline end");
+  DumpModuleGraph(*module, prefix + ", pipeline end");
   return changed;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer.cc b/tensorflow/compiler/xla/service/hlo_profile_printer.cc
index e944ad15139af0d2f98e8e68d3d48303f47ecf1c..dcc22793015147aaf3229875078b2989e4ef7559 100644
--- a/tensorflow/compiler/xla/service/hlo_profile_printer.cc
+++ b/tensorflow/compiler/xla/service/hlo_profile_printer.cc
@@ -18,20 +18,20 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/human_readable_profile_builder.h"
 
 namespace xla {
-string HloProfilePrinter::ToString(const int64* counters,
-                                   double clock_rate_ghz) const {
+string PrintHloProfile(const HloProfilePrinterData& hlo_profile_printer_data,
+                       const int64* counters, double clock_rate_ghz) {
+  using HloComputationInfo = HloProfilePrinterData::HloComputationInfo;
+  using HloInstructionInfo = HloProfilePrinterData::HloInstructionInfo;
+
   string result;
 
-  for (int computation_idx = 0; computation_idx < computation_infos_size_;
-       computation_idx++) {
-    const HloComputationInfo& computation = computation_infos_[computation_idx];
-    const HloInstructionInfo* instructions_begin = computation.instructions;
-    const HloInstructionInfo* instructions_end =
-        computation.instructions + computation.instructions_size;
+  for (const HloComputationInfo& computation_info :
+       hlo_profile_printer_data.computation_infos()) {
+    const auto& instruction_infos = computation_info.instruction_infos();
     bool any_instruction_profiled =
-        std::any_of(instructions_begin, instructions_end,
+        std::any_of(instruction_infos.begin(), instruction_infos.end(),
                     [&](const HloInstructionInfo& instruction_info) {
-                      return counters[instruction_info.profile_index] != 0;
+                      return counters[instruction_info.profile_index()] != 0;
                     });
 
     if (!any_instruction_profiled) {
@@ -41,16 +41,19 @@ string HloProfilePrinter::ToString(const int64* counters,
     // Once we start using this in AOT for real, we will probably need a more
     // minimal version of HumanReadableProfileBuilder.
     HumanReadableProfileBuilder builder(
-        computation.name, counters[computation.profile_index], clock_rate_ghz);
+        computation_info.name(), counters[computation_info.profile_index()],
+        clock_rate_ghz);
 
-    for (const auto* instruction = instructions_begin;
-         instruction != instructions_end; instruction++) {
+    for (const auto& instruction_info : instruction_infos) {
       builder.AddOp(
-          /*op_name=*/instruction->long_name,
-          /*short_name=*/instruction->short_name, instruction->category,
-          counters[instruction->profile_index], instruction->flop_count,
-          instruction->transcendental_count, instruction->bytes_accessed,
-          instruction->optimal_seconds);
+          /*op_name=*/instruction_info.long_name(),
+          /*short_name=*/instruction_info.short_name(),
+          instruction_info.category(),
+          counters[instruction_info.profile_index()],
+          instruction_info.flop_count(),
+          instruction_info.transcendental_count(),
+          instruction_info.bytes_accessed(),
+          instruction_info.optimal_seconds());
     }
 
     result += builder.ToString();
@@ -58,10 +61,4 @@ string HloProfilePrinter::ToString(const int64* counters,
 
   return result;
 }
-
-HloProfilePrinter::~HloProfilePrinter() {
-  if (deleter_) {
-    deleter_(computation_infos_, computation_infos_size_);
-  }
-}
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer.h b/tensorflow/compiler/xla/service/hlo_profile_printer.h
index 316753a82ab2a9b5459b71c723a8e817ee2cacbf..b72325c7554acad258c2da55a18e5e18ec1b06a6 100644
--- a/tensorflow/compiler/xla/service/hlo_profile_printer.h
+++ b/tensorflow/compiler/xla/service/hlo_profile_printer.h
@@ -13,85 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_
 
 #include <cstdint>
 #include <string>
 #include <vector>
 
+#include "tensorflow/compiler/xla/service/hlo_profile_printer_data.pb.h"
 #include "tensorflow/compiler/xla/types.h"
 
 namespace xla {
-// Instances of this class can pretty-print profile counters gathered from
-// running an XLA computation without having access to the backing module.
-class HloProfilePrinter {
- public:
-  // Holds meta information about an HloInstruction.
-  //
-  // The pointer-typed fields can be owning or non-owning -- this decision is
-  // manifested as the deleter_ function in the containing HloProfilePrinter.
-  struct HloInstructionInfo {
-    // Textual information for pretty printing.
-    const char* long_name;
-    const char* short_name;
-    const char* category;
-
-    // Metrics computed by HloCostAnalysis.
-    float flop_count;
-    float transcendental_count;
-    float bytes_accessed;
-    float optimal_seconds;
-
-    // The index into the profile counters array for the HloInstruction
-    // corresponding to this HloInstructionInfo.
-    int64 profile_index;
-  };
-
-  // Holds meta information about an HloComputation.
-  //
-  // The pointer-typed fields can be owning or non-owning -- this decision is
-  // manifested as the deleter_ function in the containing HloProfilePrinter.
-  struct HloComputationInfo {
-    const char* name;
-
-    // The index into the profile counters array for the HloInstruction
-    // corresponding to this HloComputationInfo.
-    int64 profile_index;
-
-    HloInstructionInfo* instructions;
-    int64 instructions_size;
-  };
-
-  HloProfilePrinter(
-      HloComputationInfo* computation_infos, int64 computation_infos_size,
-      std::function<void(HloComputationInfo*, int64)> deleter = nullptr)
-      : computation_infos_(computation_infos),
-        computation_infos_size_(computation_infos_size),
-        deleter_(std::move(deleter)) {}
-
-  HloProfilePrinter(HloProfilePrinter&& other) {
-    std::swap(other.computation_infos_, computation_infos_);
-    std::swap(other.computation_infos_size_, computation_infos_size_);
-    std::swap(other.deleter_, deleter_);
-  }
-
-  HloProfilePrinter(const HloProfilePrinter&) = delete;
-  HloProfilePrinter& operator=(const HloProfilePrinter&) = delete;
-
-  // Convert the profile counter sequence `counters` to a human readable string
-  // representation.
-  string ToString(const int64* counters, double clock_rate_ghz) const;
-
-  ~HloProfilePrinter();
-
- private:
-  // The `computation_infos_` field can be owning or non-owning -- this decision
-  // is manifested as the deleter_ function.
-  HloComputationInfo* computation_infos_ = nullptr;
-  int64 computation_infos_size_ = 0;
-  std::function<void(HloComputationInfo*, int64)> deleter_;
-};
+// Pretty-print an array of profile counters using hlo_profile_printer_data.
+string PrintHloProfile(const HloProfilePrinterData& hlo_profile_printer_data,
+                       const int64* counters, double clock_rate_ghz);
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer_data.proto b/tensorflow/compiler/xla/service/hlo_profile_printer_data.proto
new file mode 100644
index 0000000000000000000000000000000000000000..9f22b733fe1d676b177039a9d7a3064b8638d7bc
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_profile_printer_data.proto
@@ -0,0 +1,60 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+syntax = "proto3";
+
+package xla;
+
+option cc_enable_arenas = true;
+
+// Describes how to pretty-print a profile counter array gathered for a specific
+// HloModule.
+message HloProfilePrinterData {
+  // Pretty-printer information about an HloInstruction.
+  message HloInstructionInfo {
+    string long_name = 1;
+    string short_name = 2;
+    string category = 3;
+
+    // Metrics computed by HloCostAnalysis.
+    float flop_count = 4;
+    float transcendental_count = 5;
+    float bytes_accessed = 6;
+    float optimal_seconds = 7;
+
+    // The index into the profile counters array for the HloInstruction
+    // corresponding to this HloInstructionInfo.
+    int64 profile_index = 8;
+  }
+
+  // Pretty-printer information about an HloComputation.
+  message HloComputationInfo {
+    string name = 1;
+
+    // The index into the profile counters array for the HloComputation
+    // corresponding to this HloComputationInfo.
+    int64 profile_index = 2;
+
+    // HloInstructionInfos for every HloInstruction in the HloComputation for
+    // corresponding to this HloComputattionInfo.
+    repeated HloInstructionInfo instruction_infos = 3;
+  }
+
+  // HloComputationInfos for every HloComputation in the HloModule.
+  repeated HloComputationInfo computation_infos = 1;
+
+  // The size of the profile counters array we will pretty-print.
+  int64 profile_counters_size = 2;
+}
diff --git a/tensorflow/compiler/xla/service/hlo_proto_util.cc b/tensorflow/compiler/xla/service/hlo_proto_util.cc
index 727ad0178c6227cd2e64c31a4618e781671b9393..78e6a101c10a1e812e3e2631d520139fd0bc425c 100644
--- a/tensorflow/compiler/xla/service/hlo_proto_util.cc
+++ b/tensorflow/compiler/xla/service/hlo_proto_util.cc
@@ -19,15 +19,20 @@ namespace xla {
 
 HloProto MakeHloProto(const HloModule& module,
                       const BufferAssignment& assignment) {
-  HloModuleProto proto_module = module.ToProto();
   HloOrderingProto proto_ordering =
       assignment.liveness().hlo_ordering().ToProto();
   BufferAssignmentProto proto_assignment = assignment.ToProto();
-  HloProto proto;
-  proto.mutable_hlo_module()->Swap(&proto_module);
+  HloProto proto = MakeHloProto(module);
   proto.mutable_hlo_ordering()->Swap(&proto_ordering);
   proto.mutable_buffer_assignment()->Swap(&proto_assignment);
   return proto;
 }
 
+HloProto MakeHloProto(const HloModule& module) {
+  HloModuleProto proto_module = module.ToProto();
+  HloProto proto;
+  proto.mutable_hlo_module()->Swap(&proto_module);
+  return proto;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_proto_util.h b/tensorflow/compiler/xla/service/hlo_proto_util.h
index 603259a11fcdca59f58653d9a7a164c983711a57..320288fdb9aa0810b306b1d78bd1ff4cfc366ed2 100644
--- a/tensorflow/compiler/xla/service/hlo_proto_util.h
+++ b/tensorflow/compiler/xla/service/hlo_proto_util.h
@@ -31,6 +31,10 @@ namespace xla {
 HloProto MakeHloProto(const HloModule& module,
                       const BufferAssignment& assignment);
 
+// Returns a serialized representation of the HLO state, but buffer assignment
+// will not be included in the output.
+HloProto MakeHloProto(const HloModule& module);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROTO_UTIL_H_
diff --git a/tensorflow/compiler/xla/service/hlo_reachability.h b/tensorflow/compiler/xla/service/hlo_reachability.h
index d7bdac9c86579f19afbba133772c2c50894853d1..553ec11f6f9a2997ab7113f9b8241e04c7fe20d5 100644
--- a/tensorflow/compiler/xla/service/hlo_reachability.h
+++ b/tensorflow/compiler/xla/service/hlo_reachability.h
@@ -30,11 +30,17 @@ namespace xla {
 
 class HloInstruction;
 
-// A class for computing and representing reachability between HloInstructions.
+// A class for representing reachability between HloInstructions.
+//
+// !!! THIS CLASS DOES NOT COMPUTE REACHABILITY !!! It has an adjacency matrix
+// and it is up to the user of the class to set the adjacency matrix such that
+// it represents reachability, i.e. such that it is transitive. That the graph
+// be transitive is thus not an invariant of this class, but it is required for
+// the name of the class and its methods to make sense.
 class HloReachabilityMap {
  public:
-  // Sets up an empty reachable matrix for the full set of instructions
-  // specified in 'instructions'.
+  // Sets up a graph with no edges and where the nodes correspond to the given
+  // instructions.
   explicit HloReachabilityMap(const std::list<HloInstruction*>& instructions);
 
   // Set the reachability set of 'instruction' to the union of the reachability
@@ -42,17 +48,33 @@ class HloReachabilityMap {
   // 'x' is not 'instruction' will return true iff IsReachable(x, input) is true
   // for some 'input' in 'inputs'. Also sets 'instruction' to be reachable from
   // itself. Returns whether the reachability set of 'instruction' changed.
+  //
+  // !!! THIS FUNCTION DOES NOT COMPUTE REACHABILITY !!! It sets the adjacency
+  // vector in the internal graph of this HloReachabilityMap for the given
+  // instruction and does not transitively update any other part of the
+  // adjacency matrix.
   bool SetReachabilityToUnion(
       tensorflow::gtl::ArraySlice<const HloInstruction*> inputs,
       const HloInstruction* instruction);
 
   // Sets entry so that IsReachable(a, b) will return true
+  //
+  // !!! THIS FUNCTION DOES NOT COMPUTE REACHABILITY !!! It sets the adjacency
+  // matrix in the internal graph of this HloReachabilityMap to have an edge
+  // from a to b and does not transitively update any other part of the
+  // adjacency matrix.
   void SetReachable(const HloInstruction* a, const HloInstruction* b);
 
   // Returns true if "b" is reachable from "a"
+  //
+  // Note that this function only correctly answers queries about reachability
+  // if the set of edges that have been provided to this class are transitive.
   bool IsReachable(const HloInstruction* a, const HloInstruction* b) const;
 
   // Returns true if "b" is reachable from "a" or "a" is reachable from "b"
+  //
+  // Note that this function only correctly answers queries about reachability
+  // if the set of edges that have been provided to this class are transitive.
   bool IsConnected(const HloInstruction* a, const HloInstruction* b) const;
 
  private:
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index 017f996bc4d1902c81f96425b7bc28d52622df0f..c6b4dc0368d92fd477decdfb38045f74f8696803 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -566,7 +566,9 @@ Status MemoryUsageTracker::BeginInstruction(Item* item) {
   VLOG(3) << "  memory usage = " << memory_usage_;
   VLOG(10) << ToString();
 
-  DCHECK(Check());
+  if (VLOG_IS_ON(1)) {
+    DCHECK(Check());
+  }
   return Status::OK();
 }
 
@@ -603,8 +605,9 @@ Status MemoryUsageTracker::EndInstruction() {
   VLOG(3) << "  memory usage = " << memory_usage_;
   VLOG(10) << ToString();
 
-  DCHECK(Check());
-
+  if (VLOG_IS_ON(1)) {
+    DCHECK(Check());
+  }
   return Status::OK();
 }
 
@@ -1021,7 +1024,9 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
 
       HloInstruction* best = best_item->instruction;
       VLOG(1) << "Rematerializing instruction " << best->name() << " (saving "
-              << memory_tracker.MemoryReducedIfRematerialized(best_item) << ")";
+              << HumanReadableNumBytes(
+                     memory_tracker.MemoryReducedIfRematerialized(best_item))
+              << ")";
       changed = true;
       remat_count++;
 
@@ -1101,8 +1106,8 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
         net_instructions_added++;
       }
 
-      VLOG(3) << "memory_usage after rematerialization = "
-              << memory_tracker.memory_usage();
+      VLOG(1) << "memory_usage after rematerialization = "
+              << HumanReadableNumBytes(memory_tracker.memory_usage());
     }
 
     const CallSite* callsite = call_graph_node.GetCallSite(instruction);
@@ -1208,11 +1213,12 @@ StatusOr<bool> HloRematerialization::Run(
 
   XLA_VLOG_LINES(3, "Before HloRematerialization:\n" + module->ToString());
   // Create initial sequence of HLO instructions.
-  TF_ASSIGN_OR_RETURN(*sequence,
-                      CreateMemoryMinimizingSequence(
-                          *module, [this](const LogicalBuffer& buffer) {
-                            return size_function_(buffer.shape());
-                          }));
+  TF_ASSIGN_OR_RETURN(*sequence, CreateMemoryMinimizingSequence(
+                                     *module,
+                                     [this](const LogicalBuffer& buffer) {
+                                       return size_function_(buffer.shape());
+                                     },
+                                     scheduler_algorithm_));
   // Compute peak memory usage of all computations in the module called in a
   // sequential context.
   call_graph_ = CallGraph::Build(module);
@@ -1313,9 +1319,10 @@ StatusOr<bool> HloRematerialization::Run(
 /* static */ StatusOr<bool> HloRematerialization::RematerializeAndSchedule(
     const HloRematerialization::ShapeSizeFunction& size_function,
     int64 memory_limit_bytes, HloModule* hlo_module,
+    SchedulerAlgorithm scheduler_algorithm,
     SequentialHloOrdering::HloModuleSequence* sequence,
     RematerializationSizes* sizes) {
-  HloRematerialization remat(size_function);
+  HloRematerialization remat(scheduler_algorithm, size_function);
   return remat.Run(hlo_module, sequence, memory_limit_bytes, sizes);
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h
index 11f79a6d4158c6251c2faf63e9cac4e742440863..52553439033a3bcfa4b472f13f9cd4b1ecf5ed96 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.h
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h
@@ -20,6 +20,7 @@
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_scheduling.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 
 namespace xla {
@@ -65,12 +66,15 @@ class HloRematerialization {
   // code generation.
   static StatusOr<bool> RematerializeAndSchedule(
       const ShapeSizeFunction& size_function, int64 memory_limit_bytes,
-      HloModule* hlo_module, SequentialHloOrdering::HloModuleSequence* sequence,
+      HloModule* hlo_module, SchedulerAlgorithm scheduler_algorithm,
+      SequentialHloOrdering::HloModuleSequence* sequence,
       RematerializationSizes* sizes = nullptr);
 
  protected:
-  HloRematerialization(const ShapeSizeFunction& size_function)
-      : size_function_(size_function) {}
+  HloRematerialization(SchedulerAlgorithm scheduler_algorithm,
+                       const ShapeSizeFunction& size_function)
+      : scheduler_algorithm_(scheduler_algorithm),
+        size_function_(size_function) {}
   ~HloRematerialization() {}
 
   // Runs rematerialization on the given module. Returns whether the module was
@@ -103,6 +107,9 @@ class HloRematerialization {
   StatusOr<int64> CalledComputationsMemoryUsage(
       const HloInstruction* instruction) const;
 
+  // Selects an algorithm to use for HLO scheduling.
+  SchedulerAlgorithm scheduler_algorithm_;
+
   // Function which computes the size of the top-level buffer of a shape.
   const ShapeSizeFunction size_function_;
 
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
index d88aa4bb567c6c5f6eab54f12239bf7040339c39..1b7d26dde501a6a0955d62ea0938e0683a32d49d 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
@@ -158,11 +158,11 @@ TEST_F(HloRematerializationTest, SingleComputation) {
   SequentialHloOrdering::HloModuleSequence sequence;
   // Computation requires 16KB without rematerialization, but uses only 12KB
   // with rematerialization so pick a memory limit between these values (14KB).
-  TF_ASSERT_OK_AND_ASSIGN(
-      bool changed,
-      HloRematerialization::RematerializeAndSchedule(
-          ByteSizeOf,
-          /*memory_limit_bytes=*/14 * 1024, module.get(), &sequence));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          HloRematerialization::RematerializeAndSchedule(
+                              ByteSizeOf,
+                              /*memory_limit_bytes=*/14 * 1024, module.get(),
+                              SchedulerAlgorithm::kAuto, &sequence));
   EXPECT_TRUE(changed);
 
   // Root should not have changed.
@@ -191,11 +191,11 @@ TEST_F(HloRematerializationTest, SingleComputationNoRematerialization) {
   EXPECT_EQ(computation->instruction_count(), 7);
 
   SequentialHloOrdering::HloModuleSequence sequence;
-  TF_ASSERT_OK_AND_ASSIGN(
-      bool changed,
-      HloRematerialization::RematerializeAndSchedule(
-          ByteSizeOf,
-          /*memory_limit_bytes=*/20 * 1024, module.get(), &sequence));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          HloRematerialization::RematerializeAndSchedule(
+                              ByteSizeOf,
+                              /*memory_limit_bytes=*/20 * 1024, module.get(),
+                              SchedulerAlgorithm::kAuto, &sequence));
 
   // No instructions should have been materialized.
   EXPECT_FALSE(changed);
@@ -232,11 +232,11 @@ TEST_F(HloRematerializationTest, RematerializeAroundWhile) {
   // while so the peak memory use of the module is 18KB. Set the memory limit a
   // bit lower (17KB) to force rematerialization of the entry computation.
   SequentialHloOrdering::HloModuleSequence sequence;
-  TF_ASSERT_OK_AND_ASSIGN(
-      bool changed,
-      HloRematerialization::RematerializeAndSchedule(
-          ByteSizeOf,
-          /*memory_limit_bytes=*/17 * 1024, module.get(), &sequence));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          HloRematerialization::RematerializeAndSchedule(
+                              ByteSizeOf,
+                              /*memory_limit_bytes=*/17 * 1024, module.get(),
+                              SchedulerAlgorithm::kAuto, &sequence));
   EXPECT_TRUE(changed);
 
   // Only the entry computation should have a rematerialized instruction added.
@@ -268,11 +268,11 @@ TEST_F(HloRematerializationTest, RematerializeEntryAndWhileBody) {
   EXPECT_EQ(body_computation->instruction_count(), 7);
 
   SequentialHloOrdering::HloModuleSequence sequence;
-  TF_ASSERT_OK_AND_ASSIGN(
-      bool changed,
-      HloRematerialization::RematerializeAndSchedule(
-          ByteSizeOf,
-          /*memory_limit_bytes=*/15 * 1024, module.get(), &sequence));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          HloRematerialization::RematerializeAndSchedule(
+                              ByteSizeOf,
+                              /*memory_limit_bytes=*/15 * 1024, module.get(),
+                              SchedulerAlgorithm::kAuto, &sequence));
   EXPECT_TRUE(changed);
 
   // Both computations should have a rematerialized instruction added.
@@ -310,11 +310,11 @@ TEST_F(HloRematerializationTest, RematerializeNestedComputations) {
   // If all computations are maximally rematerialized then peak memory usage is
   // ~12K so pick something slightly larger.
   SequentialHloOrdering::HloModuleSequence sequence;
-  TF_ASSERT_OK_AND_ASSIGN(
-      bool changed,
-      HloRematerialization::RematerializeAndSchedule(
-          ByteSizeOf,
-          /*memory_limit_bytes=*/13 * 1024, module.get(), &sequence));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          HloRematerialization::RematerializeAndSchedule(
+                              ByteSizeOf,
+                              /*memory_limit_bytes=*/13 * 1024, module.get(),
+                              SchedulerAlgorithm::kAuto, &sequence));
   EXPECT_TRUE(changed);
 
   // All computations should have a rematerialized instruction added.
@@ -323,6 +323,76 @@ TEST_F(HloRematerializationTest, RematerializeNestedComputations) {
   EXPECT_EQ(inner_computation->instruction_count(), 8);
 }
 
+TEST_F(HloRematerializationTest, RngNotRematerialized) {
+  // Test that a single rng is not rematerialized:
+  //
+  // Entry computation:
+  //   F32[] %param = {...}
+  //   F32[1024] rng = rng(param)
+  //   F32[1024] tanh = tanh(rng)
+  //   F32[1024] exp = exp(rng)
+  //   F32[1024] add_0 = add(rng, tanh)              // LIVE: add_0 + rng +
+  //                                                 //       tanh + exp
+  //
+  //   F32[1024] add_1 = add(rng, add(exp, add_0))   // LIVE: add_1 + add_0 +
+  //                                                 //       rng + tanh + exp
+  //
+  //   F32[1024] add_2 = add(rng, add(tanh, add_1))  // LIVE: add_2 + add_1 +
+  //                                                 //       rng + tanh + exp
+  auto module = CreateNewModule();
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+  auto rng = builder.AddInstruction(HloInstruction::CreateRng(
+      vec1024_shape_, RandomDistribution::RNG_UNIFORM, {param, param}));
+  auto tanh = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec1024_shape_, HloOpcode::kTanh, rng));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec1024_shape_, HloOpcode::kExp, rng));
+  auto add_0 = builder.AddInstruction(
+      HloInstruction::CreateBinary(vec1024_shape_, HloOpcode::kAdd, rng, tanh));
+  auto add_1 = builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, rng,
+      builder.AddInstruction(HloInstruction::CreateBinary(
+          vec1024_shape_, HloOpcode::kAdd, exp, add_0))));
+  builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, rng,
+      builder.AddInstruction(HloInstruction::CreateBinary(
+          vec1024_shape_, HloOpcode::kAdd, tanh, add_1))));
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+
+  auto count_rngs = [](const HloComputation* computation) {
+    int64 rng_count = 0;
+    for (auto* instruction : computation->instructions()) {
+      if (instruction->opcode() == HloOpcode::kRng) {
+        ++rng_count;
+      }
+    }
+    return rng_count;
+  };
+  // Before rematerialization there should be a single broadcast rng in
+  // the graph.
+  ASSERT_EQ(count_rngs(entry_computation), 1);
+  const int64 original_instruction_count =
+      entry_computation->instruction_count();
+  SequentialHloOrdering::HloModuleSequence sequence;
+  // Pick a memory limit some where between 24KB (initial peak memory including
+  // parameter and output) and 20KB (peak memory possible with
+  // rematerialization).
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool changed, HloRematerialization::RematerializeAndSchedule(
+                        ByteSizeOf,
+                        /*memory_limit_bytes=*/4 * ByteSizeOf(vec1024_shape_),
+                        module.get(), SchedulerAlgorithm::kAuto, &sequence));
+  EXPECT_TRUE(changed);
+  // The rng should not have been rematerialized.
+  EXPECT_EQ(count_rngs(entry_computation), 1);
+  // There should have been rematerialization.
+  EXPECT_GT(entry_computation->instruction_count(), original_instruction_count);
+}
+
 TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) {
   // Test that a single instruction is rematerialized several times. Module:
   //
@@ -406,11 +476,11 @@ TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) {
   // Pick a memory limit some where between 24KB (initial peak memory including
   // parameter and output) and 20KB (peak memory possible with
   // rematerialization).
-  TF_ASSERT_OK_AND_ASSIGN(
-      bool changed,
-      HloRematerialization::RematerializeAndSchedule(
-          ByteSizeOf,
-          /*memory_limit_bytes=*/22 * 1024, module.get(), &sequence));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          HloRematerialization::RematerializeAndSchedule(
+                              ByteSizeOf,
+                              /*memory_limit_bytes=*/22 * 1024, module.get(),
+                              SchedulerAlgorithm::kAuto, &sequence));
   EXPECT_TRUE(changed);
 
   // The broadcast should have been rematerialized 3 times.
@@ -503,11 +573,11 @@ TEST_P(IndirectUseTest, IndirectUseNotRematerialized) {
   // Pick a memory limit some where between 24KB (initial peak memory including
   // parameter and output) and 20KB (peak memory possible with
   // rematerialization).
-  TF_ASSERT_OK_AND_ASSIGN(
-      bool changed,
-      HloRematerialization::RematerializeAndSchedule(
-          ByteSizeOf,
-          /*memory_limit_bytes=*/22 * 1024, module.get(), &sequence));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          HloRematerialization::RematerializeAndSchedule(
+                              ByteSizeOf,
+                              /*memory_limit_bytes=*/22 * 1024, module.get(),
+                              SchedulerAlgorithm::kAuto, &sequence));
   // Rematerialization should only occur if the rematerializable instruction has
   // no indirect uses.
   if (indirectly_used) {
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 6b6d48233a7da50927207b8334186ee5105db268..41b079eb799d06321a31f7d7ae0630dc8d58c46b 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -40,21 +40,18 @@ namespace se = ::perftools::gputools;
 namespace xla {
 
 /*static*/ StatusOr<std::unique_ptr<HloModule>>
-HloRunner::ReadModuleFromHloProtoFile(const std::string& filename,
-                                      const DebugOptions& debug_options) {
-  HloProto proto;
-
-  const Status s =
-      tensorflow::ReadBinaryProto(tensorflow::Env::Default(), filename, &proto);
+HloRunner::CreateModuleFromString(const tensorflow::StringPiece hlo_string,
+                                  const DebugOptions& debug_options) {
+  HloModuleConfig config;
+  config.set_debug_options(debug_options);
+  return tools::Parse(hlo_string, config);
+}
 
-  if (!s.ok()) {
-    const Status s2 =
-        tensorflow::ReadTextProto(tensorflow::Env::Default(), filename, &proto);
-    if (!s2.ok()) {
-      return Status(s2.code(), s.error_message() + "\n" + s2.error_message());
-    }
-  }
+namespace {
 
+// Creates an HloModule from the given proto.
+StatusOr<std::unique_ptr<HloModule>> HloProtoToModule(
+    const HloProto& proto, const DebugOptions& debug_options) {
   TF_ASSIGN_OR_RETURN(
       HloModuleConfig config,
       HloModule::CreateModuleConfigFromProto(proto.hlo_module()));
@@ -64,9 +61,29 @@ HloRunner::ReadModuleFromHloProtoFile(const std::string& filename,
   return std::move(module);
 }
 
+}  // namespace
+
 /*static*/ StatusOr<std::unique_ptr<HloModule>>
-HloRunner::ReadModuleFromHloTextDumpFile(const std::string& filename,
+HloRunner::ReadModuleFromBinaryProtoFile(const std::string& filename,
                                          const DebugOptions& debug_options) {
+  HloProto proto;
+  TF_RETURN_IF_ERROR(tensorflow::ReadBinaryProto(tensorflow::Env::Default(),
+                                                 filename, &proto));
+  return HloProtoToModule(proto, debug_options);
+}
+
+/*static*/ StatusOr<std::unique_ptr<HloModule>>
+HloRunner::ReadModuleFromTextProtoFile(const std::string& filename,
+                                       const DebugOptions& debug_options) {
+  HloProto proto;
+  TF_RETURN_IF_ERROR(
+      tensorflow::ReadTextProto(tensorflow::Env::Default(), filename, &proto));
+  return HloProtoToModule(proto, debug_options);
+}
+
+/*static*/ StatusOr<std::unique_ptr<HloModule>>
+HloRunner::ReadModuleFromHloTextFile(const std::string& filename,
+                                     const DebugOptions& debug_options) {
   string hlo_string;
   TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
                                                   filename, &hlo_string));
@@ -75,19 +92,6 @@ HloRunner::ReadModuleFromHloTextDumpFile(const std::string& filename,
   return tools::Parse(hlo_string, config);
 }
 
-/*static*/ StatusOr<std::unique_ptr<HloModule>> HloRunner::ReadModule(
-    const std::string& filename, const DebugOptions& debug_options) {
-  auto module = HloRunner::ReadModuleFromHloProtoFile(filename, debug_options);
-  if (module.ok()) {
-    return module;
-  }
-  const std::string e = module.status().error_message();
-  module = HloRunner::ReadModuleFromHloTextDumpFile(filename, debug_options);
-  return module.ok() ? std::move(module)
-                     : Status(module.status().code(),
-                              e + "\n" + module.status().error_message());
-}
-
 // Define this in .cc file to avoid having to include eigen or forward declare
 // these types in the header.
 struct HloRunner::EigenThreadPoolWrapper {
@@ -104,31 +108,29 @@ HloRunner::HloRunner(se::Platform* platform) {
   VLOG(1) << "Created HloRunner for platform: " << platform->Name();
 }
 
-HloRunner::~HloRunner() {
-  // Deallocate all the memory allocated during the tests.
-  for (auto& allocation : allocations_) {
-    backend().default_stream_executor()->Deallocate(&allocation);
-  }
-}
+HloRunner::~HloRunner() {}
 
-StatusOr<se::DeviceMemoryBase> HloRunner::Execute(
+StatusOr<std::unique_ptr<Literal>> HloRunner::ExecuteInternal(
     std::unique_ptr<HloModule> module,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
-    Shape* result_shape, bool run_hlo_passes) {
+    const tensorflow::gtl::ArraySlice<Literal*> arguments,
+    bool run_hlo_passes) {
   if (run_hlo_passes) {
     TF_ASSIGN_OR_RETURN(
         module, backend().compiler()->RunHloPasses(
-                    std::move(module), backend().default_stream_executor()));
+                    std::move(module), backend().default_stream_executor(),
+                    /*device_allocator=*/nullptr));
   }
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable,
       backend().compiler()->RunBackend(std::move(module),
-                                       backend().default_stream_executor()));
+                                       backend().default_stream_executor(),
+                                       /*device_allocator=*/nullptr));
 
   se::Stream stream(backend().default_stream_executor());
   stream.Init();
 
   ExecutableRunOptions run_options;
+  run_options.set_device_ordinal(backend().default_device_ordinal());
   run_options.set_stream(&stream);
   run_options.set_allocator(backend().memory_allocator());
   run_options.set_inter_op_thread_pool(backend().inter_op_thread_pool());
@@ -138,73 +140,43 @@ StatusOr<se::DeviceMemoryBase> HloRunner::Execute(
   ServiceExecutableRunOptions service_run_options(
       run_options, backend().StreamBorrower(),
       backend().inter_op_thread_pool());
-  TF_ASSIGN_OR_RETURN(
-      se::DeviceMemoryBase result,
-      executable->ExecuteOnStream(&service_run_options, arguments,
-                                  /*hlo_execution_profile=*/nullptr));
-  TF_RET_CHECK(stream.BlockHostUntilDone());
 
-  allocations_.push_back(result);
-
-  *result_shape = executable->result_shape();
-
-  if (ShapeUtil::IsTuple(*result_shape)) {
-    // We must record element buffers of tuples as well to avoid leaks.
-    DCHECK(!ShapeUtil::IsNestedTuple(*result_shape));
+  // Copy arguments to device.
+  std::vector<std::unique_ptr<ScopedShapedBuffer>> argument_buffers;
+  std::vector<ShapedBuffer*> argument_buffer_ptrs;
+  for (Literal* argument : arguments) {
     TF_ASSIGN_OR_RETURN(
-        std::vector<se::DeviceMemoryBase> element_buffers,
-        backend().transfer_manager()->ShallowCopyTupleFromDevice(
-            backend().default_stream_executor(), result, *result_shape));
-
-    // A tuple may contain the same buffer in more than one element. Keep track
-    // of the buffers already added to avoid duplicates in allocations_.
-    std::set<void*> added_opaques;
-    for (auto element_buffer : element_buffers) {
-      if (added_opaques.count(element_buffer.opaque()) == 0) {
-        CHECK(element_buffer.opaque() != nullptr);
-        added_opaques.insert(element_buffer.opaque());
-        allocations_.push_back(element_buffer);
-      }
-    }
+        std::unique_ptr<ScopedShapedBuffer> argument_buffer,
+        backend().transfer_manager()->AllocateScopedShapedBuffer(
+            argument->shape(), run_options.allocator(),
+            run_options.device_ordinal()));
+    TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
+        stream.parent(), *argument, *argument_buffer));
+    argument_buffers.push_back(std::move(argument_buffer));
+    argument_buffer_ptrs.push_back(argument_buffers.back().get());
   }
 
-  return result;
-}
-
-StatusOr<se::DeviceMemoryBase> HloRunner::TransferToDevice(
-    const Literal& literal) {
-  // Allocate memory on the device using the stream executor.
-  int64 allocation_size =
-      backend().transfer_manager()->GetByteSizeRequirement(literal.shape());
-  se::DeviceMemoryBase allocation =
-      backend().default_stream_executor()->AllocateArray<uint8>(
-          allocation_size);
-  allocations_.push_back(allocation);
-
-  TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
-      backend().default_stream_executor(), literal, &allocation));
-
-  return allocation;
-}
-
-StatusOr<std::unique_ptr<Literal>> HloRunner::TransferFromDevice(
-    const Shape& shape, se::DeviceMemoryBase device_base) {
-  auto literal = MakeUnique<Literal>();
-  TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralFromDevice(
-      backend().default_stream_executor(), device_base, shape, shape,
-      literal.get()));
-  return std::move(literal);
-}
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<ShapedBuffer> result,
+      executable->ExecuteOnStream(&service_run_options, argument_buffer_ptrs,
+                                  /*hlo_execution_profile=*/nullptr));
 
-StatusOr<std::unique_ptr<Literal>> HloRunner::ExecuteAndTransfer(
-    std::unique_ptr<HloModule> module,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
-    bool run_hlo_passes) {
-  Shape result_shape;
+  // Create a ScopedShapedBuffer of the result to manage deallocation. This will
+  // deallocate all the device memory when it goes out of scope.
   TF_ASSIGN_OR_RETURN(
-      se::DeviceMemoryBase device_base,
-      Execute(std::move(module), arguments, &result_shape, run_hlo_passes));
-  return TransferFromDevice(result_shape, device_base);
+      std::unique_ptr<ScopedShapedBuffer> scoped_result,
+      ScopedShapedBuffer::MakeScoped(result.get(), run_options.allocator()));
+
+  auto result_literal = backend().transfer_manager()->TransferLiteralFromDevice(
+      stream.parent(), *scoped_result);
+  if (result_literal.ok()) {
+    VLOG(4) << "Executed binary and got result: "
+            << result_literal.ValueOrDie()->ToString();
+  } else {
+    VLOG(4) << "Executed binary and got status: "
+            << result_literal.status().ToString();
+  }
+  return result_literal;
 }
 
 Backend& HloRunner::backend() {
diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h
index 95cddafc91ff40948efc4b0744343d994cf84f3a..cbaebc68bee708090b8ccb2eae19b556c4d6d453 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.h
+++ b/tensorflow/compiler/xla/service/hlo_runner.h
@@ -35,7 +35,8 @@ namespace xla {
 
 // A base class for running an HloModule. This executes the given HloModule on a
 // certain backend directly without using the client interface. HloModule can be
-// explicitly built, or loaded from a serialization file (e.g., hlo proto file).
+// explicitly built, or loaded from a serialization file (e.g., hlo proto
+// file), or parsed from a hlo textual IR string.
 class HloRunner {
  public:
   HloRunner();
@@ -44,56 +45,34 @@ class HloRunner {
 
   ~HloRunner();
 
+  // Converts an HloModule from the given hlo textual IR string (in
+  // HloModule::ToString format).
+  static StatusOr<std::unique_ptr<HloModule>> CreateModuleFromString(
+      const tensorflow::StringPiece hlo_string,
+      const DebugOptions& debug_options);
+
   // Reads the proto file in xla.HloProto format, creates and returns the
-  // HloModule. Will try to parse the filename as binary proto, then try as
-  // text proto if that fails.
-  static StatusOr<std::unique_ptr<HloModule>> ReadModuleFromHloProtoFile(
+  // HloModule.
+  static StatusOr<std::unique_ptr<HloModule>> ReadModuleFromBinaryProtoFile(
+      const std::string& filename, const DebugOptions& debug_options);
+  static StatusOr<std::unique_ptr<HloModule>> ReadModuleFromTextProtoFile(
       const std::string& filename, const DebugOptions& debug_options);
 
   // Reads the hlo text dump file in HloModule::ToString format, creates and
   // returns the HloModule.
-  static StatusOr<std::unique_ptr<HloModule>> ReadModuleFromHloTextDumpFile(
-      const std::string& filename, const DebugOptions& debug_options);
-
-  // Tries to parse the filename specified first as binary proto format, then
-  // as a textual proto format, then textual IR, then gives up if both fail.
-  // ReadModuleFromHloProtoFile or ReadModuleFromHloTextDumpFile should be used
-  // explicitly when you know the format, this if you don't.
-  static StatusOr<std::unique_ptr<HloModule>> ReadModule(
+  static StatusOr<std::unique_ptr<HloModule>> ReadModuleFromHloTextFile(
       const std::string& filename, const DebugOptions& debug_options);
 
   // Executes the given module with given literals as input and returns the
   // result as a Literal. The LiteralPtr type accepts Literal* or
   // std::unique_ptr<Literal>.
-  // If run_hlo_passes is true, the module will be executed without Hlo
+  //
+  // If run_hlo_passes is false, the module will be executed without Hlo
   // optimization.
   template <typename LiteralPtr>
   StatusOr<std::unique_ptr<Literal>> Execute(
       std::unique_ptr<HloModule> module,
-      const tensorflow::gtl::ArraySlice<LiteralPtr> literals,
-      bool run_hlo_passes = true);
-
-  // Executes the given module and returns a global data handle.
-  StatusOr<perftools::gputools::DeviceMemoryBase> Execute(
-      std::unique_ptr<HloModule> module,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
-      Shape* result_shape, bool run_hlo_passes = true);
-
-  // Transfers the given literal to the device and returns the data handle.
-  StatusOr<perftools::gputools::DeviceMemoryBase> TransferToDevice(
-      const Literal& literal);
-
-  // Transfers the array referred to by the given handle from the device and
-  // returns as a Literal.
-  StatusOr<std::unique_ptr<Literal>> TransferFromDevice(
-      const Shape& shape, perftools::gputools::DeviceMemoryBase device_base);
-
-  // Executes the given module and return the result as a Literal.
-  StatusOr<std::unique_ptr<Literal>> ExecuteAndTransfer(
-      std::unique_ptr<HloModule> module,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
+      const tensorflow::gtl::ArraySlice<LiteralPtr> arguments,
       bool run_hlo_passes = true);
 
   // If backend is not created in the constructor, creates and returns the
@@ -104,9 +83,12 @@ class HloRunner {
   Backend& backend();
 
  private:
-  struct EigenThreadPoolWrapper;
+  StatusOr<std::unique_ptr<Literal>> ExecuteInternal(
+      std::unique_ptr<HloModule> module,
+      const tensorflow::gtl::ArraySlice<Literal*> arguments,
+      bool run_hlo_passes = true);
 
-  std::vector<perftools::gputools::DeviceMemoryBase> allocations_;
+  struct EigenThreadPoolWrapper;
 
   std::unique_ptr<EigenThreadPoolWrapper> thread_pool_wrapper_;
 
@@ -116,15 +98,14 @@ class HloRunner {
 template <typename LiteralPtr>
 StatusOr<std::unique_ptr<Literal>> HloRunner::Execute(
     std::unique_ptr<HloModule> module,
-    const tensorflow::gtl::ArraySlice<LiteralPtr> literals,
+    const tensorflow::gtl::ArraySlice<LiteralPtr> arguments,
     bool run_hlo_passes) {
-  std::vector<perftools::gputools::DeviceMemoryBase> arguments;
-  for (const auto& literal : literals) {
-    TF_ASSIGN_OR_RETURN(perftools::gputools::DeviceMemoryBase argument,
-                        TransferToDevice(*literal));
-    arguments.push_back(argument);
+  // Construct a vector of plain pointers for the arguments.
+  std::vector<Literal*> argument_pointers;
+  for (const auto& argument : arguments) {
+    argument_pointers.push_back(&*argument);
   }
-  return ExecuteAndTransfer(std::move(module), arguments, run_hlo_passes);
+  return ExecuteInternal(std::move(module), argument_pointers, run_hlo_passes);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.cc b/tensorflow/compiler/xla/service/hlo_scheduling.cc
index 8ccbcaeee4a9c9e94b344231953e20ac8f4b2053..5f5a930dad002c215a5332286ade97ef19cc67af 100644
--- a/tensorflow/compiler/xla/service/hlo_scheduling.cc
+++ b/tensorflow/compiler/xla/service/hlo_scheduling.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_scheduling.h"
 
+#include <queue>
 #include <utility>
 #include <vector>
 
@@ -31,6 +32,8 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 
+using ::tensorflow::strings::HumanReadableNumBytes;
+
 namespace xla {
 
 StatusOr<int64> MinimumMemoryForSequence(
@@ -215,32 +218,26 @@ class ListScheduler {
       }
     }
 
-    std::list<ReadyListEntry> ready_list;
+    auto priority_comparator = [this](const ReadyListEntry& lhs,
+                                      const ReadyListEntry& rhs) {
+      return GetPriority(lhs) < GetPriority(rhs);
+    };
+    std::priority_queue<ReadyListEntry, std::vector<ReadyListEntry>,
+                        decltype(priority_comparator)>
+        ready_queue(priority_comparator);
     for (auto* instruction : computation_.instructions()) {
       // Instruction with no operands or control predecessors will
       // not be in the map.
       if (unscheduled_pred_count.count(instruction) == 0) {
-        ready_list.push_back(MakeReadyListEntry(instruction));
+        ready_queue.emplace(MakeReadyListEntry(instruction));
       }
     }
 
-    while (!ready_list.empty()) {
-      // Select the highest priority HLO instruction from the ready list.
-      auto best_it = ready_list.begin();
-      Priority best_priority = GetPriority(*best_it);
-      for (auto ready_it = std::next(ready_list.begin());
-           ready_it != ready_list.end(); ++ready_it) {
-        Priority priority = GetPriority(*ready_it);
-        if (priority > best_priority) {
-          best_it = ready_it;
-          best_priority = priority;
-        }
-      }
-
+    while (!ready_queue.empty()) {
       // Remove the selected instruction from the ready list and add it to the
       // schedule.
-      const HloInstruction* best = best_it->instruction;
-      ready_list.erase(best_it);
+      const HloInstruction* best = ready_queue.top().instruction;
+      ready_queue.pop();
       schedule.push_back(best);
       scheduled_instructions_.insert(best);
 
@@ -255,7 +252,7 @@ class ListScheduler {
         int64 pred_count = --unscheduled_pred_count.at(inst);
         CHECK_GE(pred_count, 0);
         if (pred_count == 0) {
-          ready_list.push_back(MakeReadyListEntry(inst));
+          ready_queue.emplace(MakeReadyListEntry(inst));
         }
       };
       // TODO(b/34466113): Replace this and above with successors() or
@@ -367,7 +364,17 @@ StatusOr<int64> MinimumMemoryForComputation(
 StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
     const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_function) {
+    const LogicalBuffer::SizeFunction& size_function,
+    SchedulerAlgorithm algorithm) {
+  VLOG(2) << "Computation: " << computation.name();
+  if (algorithm == SchedulerAlgorithm::kListSchedule) {
+    return ListScheduler::Run(computation, points_to_analysis, size_function);
+  }
+  if (algorithm == SchedulerAlgorithm::kDfsSchedule) {
+    return RunDFSMemoryScheduler(computation, points_to_analysis,
+                                 size_function);
+  }
+
   // We try both a list-scheduler based ordering and a DFS based ordering, and
   // choose whichever returns a lower min-memory, not accounting for
   // fragmentation.
@@ -382,7 +389,7 @@ StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
       const int64 list_memory,
       MinimumMemoryForComputation(computation, list_sequence,
                                   points_to_analysis, size_function));
-  VLOG(2) << "Min-memory list sequence: " << list_memory << " bytes";
+  VLOG(2) << "Min-memory list sequence: " << HumanReadableNumBytes(list_memory);
 
   TF_ASSIGN_OR_RETURN(
       std::vector<const HloInstruction*> dfs_sequence,
@@ -391,13 +398,15 @@ StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
       const int64 dfs_memory,
       MinimumMemoryForComputation(computation, dfs_sequence, points_to_analysis,
                                   size_function));
-  VLOG(2) << "Min-memory dfs sequence: " << dfs_memory << " bytes";
+  VLOG(2) << "Min-memory dfs sequence: " << HumanReadableNumBytes(dfs_memory);
 
   if (list_memory <= dfs_memory) {
-    VLOG(2) << "Chose min-memory list sequence: " << list_memory << " bytes";
+    VLOG(2) << "Chose min-memory list sequence: "
+            << HumanReadableNumBytes(list_memory);
     return list_sequence;
   } else {
-    VLOG(2) << "Chose min-memory dfs sequence: " << dfs_memory << " bytes";
+    VLOG(2) << "Chose min-memory dfs sequence: "
+            << HumanReadableNumBytes(dfs_memory);
     return dfs_sequence;
   }
 }
@@ -405,27 +414,30 @@ StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
 }  // namespace
 
 StatusOr<SequentialHloOrdering::HloModuleSequence>
-CreateMemoryMinimizingSequence(
-    const HloModule& module, const LogicalBuffer::SizeFunction& size_function) {
+CreateMemoryMinimizingSequence(const HloModule& module,
+                               const LogicalBuffer::SizeFunction& size_function,
+                               SchedulerAlgorithm algorithm) {
   SequentialHloOrdering::HloModuleSequence sequence;
   TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
                       TuplePointsToAnalysis::Run(&module));
   for (const auto* computation : module.MakeNonfusionComputations()) {
-    TF_ASSIGN_OR_RETURN(sequence[computation],
-                        CreateMemoryMinimizingSequence(
-                            *computation, *points_to_analysis, size_function));
+    TF_ASSIGN_OR_RETURN(
+        sequence[computation],
+        CreateMemoryMinimizingSequence(*computation, *points_to_analysis,
+                                       size_function, algorithm));
   }
   return sequence;
 }
 
 StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
     const HloComputation& computation,
-    const LogicalBuffer::SizeFunction& size_function) {
+    const LogicalBuffer::SizeFunction& size_function,
+    SchedulerAlgorithm algorithm) {
   CHECK(!computation.IsFusionComputation());
   TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
                       TuplePointsToAnalysis::Run(computation.parent()));
   return CreateMemoryMinimizingSequence(computation, *points_to_analysis,
-                                        size_function);
+                                        size_function, algorithm);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.h b/tensorflow/compiler/xla/service/hlo_scheduling.h
index ec92a56b962152b15981f868369683144aa7c76a..1d1eb1e064f75c2220b39e84b010e720a0c37880 100644
--- a/tensorflow/compiler/xla/service/hlo_scheduling.h
+++ b/tensorflow/compiler/xla/service/hlo_scheduling.h
@@ -33,17 +33,28 @@ StatusOr<int64> MinimumMemoryForSequence(
     const SequentialHloOrdering::HloModuleSequence& module_sequence,
     const LogicalBuffer::SizeFunction& size_function);
 
+enum class SchedulerAlgorithm {
+  kListSchedule,
+  kDfsSchedule,
+
+  // Selects the available scheduler algorithm that had the minimum memory in
+  // the resulting sequence (a la MinimumMemoryForSequence).
+  kAuto,
+};
+
 // Returns an HloModuleSequence which seeks to minimize the memory required for
 // the computation. size_function is the function returning the number of bytes
 // required for a LogicalBuffer.
 StatusOr<SequentialHloOrdering::HloModuleSequence>
 CreateMemoryMinimizingSequence(
-    const HloModule& module, const LogicalBuffer::SizeFunction& size_function);
+    const HloModule& module, const LogicalBuffer::SizeFunction& size_function,
+    SchedulerAlgorithm algorithm = SchedulerAlgorithm::kAuto);
 
 // Overload of above that computes the sequence for a single computation.
 StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
     const HloComputation& computation,
-    const LogicalBuffer::SizeFunction& size_function);
+    const LogicalBuffer::SizeFunction& size_function,
+    SchedulerAlgorithm algorithm = SchedulerAlgorithm::kAuto);
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index d1adec31c21fe55001db4d522ddda27dd538bc95..447c2446668253c932b44b51b2db22bfd47f9957 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -246,7 +246,8 @@ Status HloSharding::ValidateNonTuple(const Shape& shape,
   // The tile rank must be the same as the input rank.
   if (ShapeUtil::Rank(shape) != ShapeUtil::Rank(tile_shape_)) {
     return tensorflow::errors::InvalidArgument(
-        "Tile rank is different to the input rank");
+        "Tile rank is different to the input rank. sharding=", ToString(),
+        ", input_shape=", ShapeUtil::HumanString(shape));
   }
 
   // The tile shape must not be the same as the input shape without maximal_
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h
index 1a6988a2dc872a39ff6b0551adf7ddb871f0d72a..7263198385cf0c84b1dac1e15177dcac99adaafb 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.h
+++ b/tensorflow/compiler/xla/service/hlo_sharding.h
@@ -80,6 +80,17 @@ class HloSharding {
     return HloSharding(flattened_list);
   }
 
+  // Creates a new sharding for a tuple type. The requested tuple shape must not
+  // be nested. For nested tuples, use the ShapeTree overload.
+  static HloSharding Tuple(const Shape& tuple_shape,
+                           tensorflow::gtl::ArraySlice<HloSharding> shardings) {
+    CHECK(ShapeUtil::IsTuple(tuple_shape));
+    CHECK(!ShapeUtil::IsNestedTuple(tuple_shape));
+    std::vector<HloSharding> flattened_list(shardings.begin(), shardings.end());
+    CHECK_EQ(flattened_list.size(), ShapeUtil::TupleElementCount(tuple_shape));
+    return HloSharding(flattened_list);
+  }
+
   // Create a new sharding from a protobuf OpSharding.
   static StatusOr<HloSharding> FromProto(const OpSharding& proto);
 
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
index 101a710d1cad9401134fdfe1d0ec9df241bc01e1..3dc733940fc89952bd5e75a9b28d9cbf356f8000 100644
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
+++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
@@ -166,7 +166,7 @@ void HloTfGraphBuilder::SetNodeAttrs(const HloInstruction* instruction,
       layout_string = ShapeUtil::HumanStringWithLayout(instruction->shape());
     } else {
       layout_string = StrCat(
-          "{", Join(instruction->shape().layout().minor_to_major(), ","), "}");
+          "{", Join(LayoutUtil::MinorToMajor(instruction->shape()), ","), "}");
     }
     attrs["layout"].set_s(layout_string);
   }
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h
index 9aa3e501d5f85e3b61b20555e3d13c5687f33f2f..c4876b852e32d34693202f4023aa20ad2b301ffd 100644
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h
+++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
 
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/xla.pb.h"
@@ -56,4 +56,4 @@ class HloTfGraphBuilder {
 }  // namespace hlo_graph_dumper
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 15188c4057eca8eea1805e599cd020c045fdd10a..e2b3bb9d71497c352b0b92add2d2f6b4b777bee8 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -13,413 +13,529 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <set>
+
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
-#include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
 
 namespace xla {
 
-namespace {
+Status ShapeVerifier::HandleElementwiseUnary(HloInstruction* hlo) {
+  return CheckUnaryShape(hlo);
+}
 
-// Visitor which verifies that the output shape is correctly set. Verifies
-// against the inferred shape for the instruction.
-// TODO(b/26024837): Check output shape for all instruction types.
-class ShapeVerifier : public DfsHloVisitor {
- public:
-  explicit ShapeVerifier(
-      const std::function<int64(const Shape&)>& shape_size_fn)
-      : shape_size_fn_(shape_size_fn) {}
-
-  Status HandleElementwiseUnary(HloInstruction* hlo) override {
-    return CheckUnaryShape(hlo);
-  }
+Status ShapeVerifier::HandleElementwiseBinary(HloInstruction* hlo) {
+  return CheckBinaryShape(hlo);
+}
 
-  Status HandleElementwiseBinary(HloInstruction* hlo) override {
-    return CheckBinaryShape(hlo);
-  }
+Status ShapeVerifier::HandleClamp(HloInstruction* clamp) {
+  return CheckTernaryShape(clamp);
+}
 
-  Status HandleClamp(HloInstruction* clamp) override {
-    return CheckTernaryShape(clamp);
-  }
+Status ShapeVerifier::HandleSelect(HloInstruction* select) {
+  return CheckTernaryShape(select);
+}
 
-  Status HandleSelect(HloInstruction* select) override {
-    return CheckTernaryShape(select);
+Status ShapeVerifier::HandleConcatenate(HloInstruction* concatenate) {
+  std::vector<const Shape*> operand_shapes;
+  for (const HloInstruction* operand : concatenate->operands()) {
+    operand_shapes.push_back(&operand->shape());
   }
+  return CheckShape(concatenate,
+                    ShapeInference::InferConcatOpShape(
+                        operand_shapes, concatenate->concatenate_dimension()));
+}
 
-  Status HandleConcatenate(HloInstruction* concatenate) override {
-    std::vector<const Shape*> operand_shapes;
-    for (const HloInstruction* operand : concatenate->operands()) {
-      operand_shapes.push_back(&operand->shape());
-    }
-    return CheckShape(
-        concatenate, ShapeInference::InferConcatOpShape(
-                         operand_shapes, concatenate->concatenate_dimension()));
-  }
+Status ShapeVerifier::HandleConvert(HloInstruction* convert) {
+  return CheckShape(convert, ShapeInference::InferConvertShape(
+                                 convert->operand(0)->shape(),
+                                 convert->shape().element_type()));
+}
 
-  Status HandleConvert(HloInstruction* convert) override {
-    return CheckShape(convert, ShapeInference::InferConvertShape(
-                                   convert->operand(0)->shape(),
-                                   convert->shape().element_type()));
-  }
+Status ShapeVerifier::HandleBitcastConvert(HloInstruction* convert) {
+  return CheckShape(convert, ShapeInference::InferBitcastConvertShape(
+                                 convert->operand(0)->shape(),
+                                 convert->shape().element_type()));
+}
 
-  Status HandleBitcastConvert(HloInstruction* convert) override {
-    return CheckShape(convert, ShapeInference::InferBitcastConvertShape(
-                                   convert->operand(0)->shape(),
-                                   convert->shape().element_type()));
-  }
+Status ShapeVerifier::HandleCopy(HloInstruction* copy) {
+  return CheckUnaryShape(copy);
+}
 
-  Status HandleCopy(HloInstruction* copy) override {
-    return CheckUnaryShape(copy);
-  }
+Status ShapeVerifier::HandleDot(HloInstruction* dot) {
+  TF_ASSIGN_OR_RETURN(const Shape expected,
+                      ShapeInference::InferDotOpShape(
+                          dot->operand(0)->shape(), dot->operand(1)->shape(),
+                          dot->dot_dimension_numbers()));
+  return CheckShape(dot, expected);
+}
 
-  Status HandleDot(HloInstruction* dot) override {
-    return CheckBinaryShape(dot);
-  }
+Status ShapeVerifier::HandleConvolution(HloInstruction* convolution) {
+  TF_ASSIGN_OR_RETURN(
+      const Shape expected,
+      ShapeInference::InferConvolveShape(
+          convolution->operand(0)->shape(), convolution->operand(1)->shape(),
+          convolution->window(), convolution->convolution_dimension_numbers()));
+  return CheckShape(convolution, expected);
+}
 
-  Status HandleConvolution(HloInstruction* convolution) override {
-    TF_ASSIGN_OR_RETURN(
-        const Shape expected,
-        ShapeInference::InferConvolveShape(
-            convolution->operand(0)->shape(), convolution->operand(1)->shape(),
-            convolution->window(),
-            convolution->convolution_dimension_numbers()));
-    return CheckShape(convolution, expected);
-  }
+Status ShapeVerifier::HandleFft(HloInstruction* fft) {
+  TF_ASSIGN_OR_RETURN(
+      const Shape expected,
+      ShapeInference::InferFftShape(fft->operand(0)->shape(), fft->fft_type(),
+                                    fft->fft_length()));
+  return CheckShape(fft, expected);
+}
 
-  Status HandleCrossReplicaSum(HloInstruction* crs) override {
-    return CheckShape(crs, ShapeInference::InferCrossReplicaSumShape(
-                               crs->operand(0)->shape()));
+Status ShapeVerifier::HandleCrossReplicaSum(HloInstruction* crs) {
+  std::vector<const Shape*> operand_shapes;
+  for (const HloInstruction* operand : crs->operands()) {
+    operand_shapes.push_back(&operand->shape());
   }
+  return CheckShape(crs,
+                    ShapeInference::InferCrossReplicaSumShape(operand_shapes));
+}
 
-  Status HandleReducePrecision(HloInstruction* reduce_precision) override {
-    return CheckShape(reduce_precision,
-                      ShapeInference::InferReducePrecisionShape(
-                          reduce_precision->operand(0)->shape(),
-                          reduce_precision->exponent_bits(),
-                          reduce_precision->mantissa_bits()));
-  }
+Status ShapeVerifier::HandleReducePrecision(HloInstruction* reduce_precision) {
+  return CheckShape(reduce_precision, ShapeInference::InferReducePrecisionShape(
+                                          reduce_precision->operand(0)->shape(),
+                                          reduce_precision->exponent_bits(),
+                                          reduce_precision->mantissa_bits()));
+}
 
-  Status HandleInfeed(HloInstruction*) override {
-    return tensorflow::Status::OK();
-  }
+Status ShapeVerifier::HandleInfeed(HloInstruction*) {
+  return tensorflow::Status::OK();
+}
 
-  Status HandleOutfeed(HloInstruction*) override {
-    return tensorflow::Status::OK();
-  }
+Status ShapeVerifier::HandleOutfeed(HloInstruction* outfeed) {
+  // Outfeed has a separate shape field for the value which is outfed to the
+  // host. The shape of the instruction itself is always nil because the outfeed
+  // produces no HLO value in the graph.
+  if (!ShapeUtil::Compatible(outfeed->outfeed_shape(),
+                             outfeed->operand(0)->shape())) {
+    return InvalidArgument(
+        "Expected outfeed to have shape compatible with operand's shape %s, "
+        "actual shape is %s:\n%s",
+        ShapeUtil::HumanString(outfeed->operand(0)->shape()).c_str(),
+        ShapeUtil::HumanString(outfeed->outfeed_shape()).c_str(),
+        outfeed->ToString().c_str());
+  }
+  return CheckShape(outfeed, ShapeUtil::MakeNil());
+}
 
-  Status HandleRng(HloInstruction*) override {
-    return tensorflow::Status::OK();
-  }
+Status ShapeVerifier::HandleRng(HloInstruction*) {
+  return tensorflow::Status::OK();
+}
 
-  Status HandleReverse(HloInstruction* reverse) override {
-    return CheckShape(
-        reverse, ShapeInference::InferReverseShape(reverse->operand(0)->shape(),
-                                                   reverse->dimensions()));
-  }
+Status ShapeVerifier::HandleReverse(HloInstruction* reverse) {
+  return CheckShape(
+      reverse, ShapeInference::InferReverseShape(reverse->operand(0)->shape(),
+                                                 reverse->dimensions()));
+}
 
-  Status HandleSort(HloInstruction* sort) override {
-    return CheckUnaryShape(sort);
-  }
+Status ShapeVerifier::HandleSort(HloInstruction* sort) {
+  return CheckUnaryShape(sort);
+}
 
-  Status HandleConstant(HloInstruction* constant) override {
-    return CheckShape(constant, constant->literal().shape());
-  }
+Status ShapeVerifier::HandleConstant(HloInstruction* constant) {
+  return CheckShape(constant, constant->literal().shape());
+}
 
-  Status HandleGetTupleElement(HloInstruction* get_tuple_element) override {
-    return CheckShape(get_tuple_element,
-                      ShapeInference::InferGetTupleElementShape(
-                          get_tuple_element->operand(0)->shape(),
-                          get_tuple_element->tuple_index()));
-  }
+Status ShapeVerifier::HandleGetTupleElement(HloInstruction* get_tuple_element) {
+  return CheckShape(get_tuple_element,
+                    ShapeInference::InferGetTupleElementShape(
+                        get_tuple_element->operand(0)->shape(),
+                        get_tuple_element->tuple_index()));
+}
 
-  Status HandleReduce(HloInstruction* reduce) override {
-    return CheckShape(
-        reduce,
-        ShapeInference::InferReduceShape(
-            reduce->operand(0)->shape(), reduce->operand(1)->shape(),
-            reduce->dimensions(), reduce->to_apply()->ComputeProgramShape()));
-  }
+Status ShapeVerifier::HandleReduce(HloInstruction* reduce) {
+  return CheckShape(
+      reduce,
+      ShapeInference::InferReduceShape(
+          reduce->operand(0)->shape(), reduce->operand(1)->shape(),
+          reduce->dimensions(), reduce->to_apply()->ComputeProgramShape()));
+}
 
-  Status HandleBitcast(HloInstruction* bitcast) override {
-    // Bitcasts can be any shape, as long as the size matches the operand size.
-    TF_RET_CHECK(shape_size_fn_(bitcast->shape()) ==
-                 shape_size_fn_(bitcast->operand(0)->shape()));
-    return tensorflow::Status::OK();
-  }
+Status ShapeVerifier::HandleBitcast(HloInstruction* bitcast) {
+  return tensorflow::Status::OK();
+}
 
-  Status HandleBroadcast(HloInstruction* broadcast) override {
-    // HLO broadcast has no exact analog at the proto level so there is no
-    // ShapeInference method. Check the output shape explicitly.
-    const Shape& operand_shape = broadcast->operand(0)->shape();
-    TF_RET_CHECK(ShapeUtil::Rank(operand_shape) ==
-                 broadcast->dimensions().size());
-    for (int64 operand_dimension = 0;
-         operand_dimension < ShapeUtil::Rank(operand_shape);
-         ++operand_dimension) {
-      int64 output_dimension = broadcast->dimensions()[operand_dimension];
-      TF_RET_CHECK(broadcast->shape().dimensions(output_dimension) ==
-                   operand_shape.dimensions(operand_dimension));
-    }
-    return tensorflow::Status::OK();
+Status ShapeVerifier::HandleBroadcast(HloInstruction* broadcast) {
+  // HLO broadcast has no exact analog at the proto level so there is no
+  // ShapeInference method. Check the output shape explicitly.
+  const Shape& operand_shape = broadcast->operand(0)->shape();
+  // Check for mixed precision.
+  TF_RETURN_IF_ERROR(CheckShape(broadcast, broadcast->shape()));
+  TF_RET_CHECK(ShapeUtil::Rank(operand_shape) ==
+               broadcast->dimensions().size());
+  for (int64 operand_dimension = 0;
+       operand_dimension < ShapeUtil::Rank(operand_shape);
+       ++operand_dimension) {
+    int64 output_dimension = broadcast->dimensions()[operand_dimension];
+    TF_RET_CHECK(broadcast->shape().dimensions(output_dimension) ==
+                 operand_shape.dimensions(operand_dimension))
+        << broadcast->ToString() << " operand shape " << operand_shape;
   }
+  return tensorflow::Status::OK();
+}
 
-  Status HandleReshape(HloInstruction* reshape) override {
-    TF_RET_CHECK(ShapeUtil::ElementsIn(reshape->shape()) ==
-                 ShapeUtil::ElementsIn(reshape->operand(0)->shape()));
-    return tensorflow::Status::OK();
-  }
+Status ShapeVerifier::HandleReshape(HloInstruction* reshape) {
+  // Check for mixed precision.
+  TF_RETURN_IF_ERROR(CheckShape(reshape, reshape->shape()));
+  TF_RET_CHECK(ShapeUtil::ElementsIn(reshape->shape()) ==
+               ShapeUtil::ElementsIn(reshape->operand(0)->shape()));
+  return tensorflow::Status::OK();
+}
 
-  Status HandleTranspose(HloInstruction* transpose) override {
-    return CheckShape(transpose, ShapeInference::InferTransposeShape(
-                                     transpose->operand(0)->shape(),
-                                     transpose->dimensions()));
-  }
+Status ShapeVerifier::HandleTranspose(HloInstruction* transpose) {
+  return CheckShape(
+      transpose, ShapeInference::InferTransposeShape(
+                     transpose->operand(0)->shape(), transpose->dimensions()));
+}
 
-  Status HandleParameter(HloInstruction*) override {
-    return tensorflow::Status::OK();
-  }
+Status ShapeVerifier::HandleParameter(HloInstruction*) {
+  return tensorflow::Status::OK();
+}
 
-  Status HandleFusion(HloInstruction*) override {
-    return tensorflow::Status::OK();
-  }
+Status ShapeVerifier::HandleFusion(HloInstruction*) {
+  return tensorflow::Status::OK();
+}
 
-  Status HandleCall(HloInstruction* call) override {
-    // The shape of kCall should match the shape of the computation it calls.
-    return CheckShape(call, call->to_apply()->ComputeProgramShape().result());
-  }
+Status ShapeVerifier::HandleCall(HloInstruction* call) {
+  // The shape of kCall should match the shape of the computation it calls.
+  return CheckShape(call, call->to_apply()->ComputeProgramShape().result());
+}
 
-  Status HandleCustomCall(HloInstruction*) override {
-    return tensorflow::Status::OK();
-  }
+Status ShapeVerifier::HandleCustomCall(HloInstruction*) {
+  return tensorflow::Status::OK();
+}
 
-  Status HandleSlice(HloInstruction* slice) override {
-    return CheckShape(slice,
-                      ShapeInference::InferSliceShape(
-                          slice->operand(0)->shape(), slice->slice_starts(),
-                          slice->slice_limits(), slice->slice_strides()));
-  }
+Status ShapeVerifier::HandleSlice(HloInstruction* slice) {
+  return CheckShape(slice,
+                    ShapeInference::InferSliceShape(
+                        slice->operand(0)->shape(), slice->slice_starts(),
+                        slice->slice_limits(), slice->slice_strides()));
+}
 
-  Status HandleDynamicSlice(HloInstruction* dynamic_slice) override {
-    return CheckShape(dynamic_slice, ShapeInference::InferDynamicSliceShape(
-                                         dynamic_slice->operand(0)->shape(),
-                                         dynamic_slice->operand(1)->shape(),
-                                         dynamic_slice->dynamic_slice_sizes()));
-  }
+Status ShapeVerifier::HandleDynamicSlice(HloInstruction* dynamic_slice) {
+  return CheckShape(dynamic_slice, ShapeInference::InferDynamicSliceShape(
+                                       dynamic_slice->operand(0)->shape(),
+                                       dynamic_slice->operand(1)->shape(),
+                                       dynamic_slice->dynamic_slice_sizes()));
+}
 
-  Status HandleDynamicUpdateSlice(
-      HloInstruction* dynamic_update_slice) override {
-    return CheckShape(dynamic_update_slice,
-                      ShapeInference::InferDynamicUpdateSliceShape(
-                          dynamic_update_slice->operand(0)->shape(),
-                          dynamic_update_slice->operand(1)->shape(),
-                          dynamic_update_slice->operand(2)->shape()));
-  }
+Status ShapeVerifier::HandleDynamicUpdateSlice(
+    HloInstruction* dynamic_update_slice) {
+  return CheckShape(dynamic_update_slice,
+                    ShapeInference::InferDynamicUpdateSliceShape(
+                        dynamic_update_slice->operand(0)->shape(),
+                        dynamic_update_slice->operand(1)->shape(),
+                        dynamic_update_slice->operand(2)->shape()));
+}
 
-  Status HandleTuple(HloInstruction* tuple) override {
-    return CheckVariadicShape(tuple);
-  }
+Status ShapeVerifier::HandleTuple(HloInstruction* tuple) {
+  return CheckVariadicShape(tuple);
+}
 
-  Status HandleMap(HloInstruction* map) override {
-    std::vector<const Shape*> operand_shapes;
-    int64 max_operand_rank = 0;
-    for (const HloInstruction* operand : map->operands()) {
-      operand_shapes.push_back(&operand->shape());
-      max_operand_rank =
-          std::max(max_operand_rank, ShapeUtil::Rank(operand->shape()));
-    }
-    // TODO(b/65689298) Remove code below once Map is generalized to accept
-    // arbitrary map dimensions.
-    std::vector<int64> map_dims(max_operand_rank);
-    std::iota(map_dims.begin(), map_dims.end(), 0);
-    return CheckShape(
-        map,
-        ShapeInference::InferMapShape(
-            operand_shapes, map->to_apply()->ComputeProgramShape(), map_dims));
-  }
+Status ShapeVerifier::HandleMap(HloInstruction* map) {
+  std::vector<const Shape*> operand_shapes;
+  int64 max_operand_rank = 0;
+  for (const HloInstruction* operand : map->operands()) {
+    operand_shapes.push_back(&operand->shape());
+    max_operand_rank =
+        std::max(max_operand_rank, ShapeUtil::Rank(operand->shape()));
+  }
+  // TODO(b/65689298) Remove code below once Map is generalized to accept
+  // arbitrary map dimensions.
+  std::vector<int64> map_dims(max_operand_rank);
+  std::iota(map_dims.begin(), map_dims.end(), 0);
+  return CheckShape(map, ShapeInference::InferMapShape(
+                             operand_shapes,
+                             map->to_apply()->ComputeProgramShape(), map_dims));
+}
 
-  Status HandleReduceWindow(HloInstruction* reduce_window) override {
-    return CheckShape(
-        reduce_window,
-        ShapeInference::InferReduceWindowShape(
-            reduce_window->operand(0)->shape(),
-            reduce_window->operand(1)->shape(), reduce_window->window(),
-            reduce_window->to_apply()->ComputeProgramShape()));
-  }
+Status ShapeVerifier::HandleReduceWindow(HloInstruction* reduce_window) {
+  return CheckShape(
+      reduce_window,
+      ShapeInference::InferReduceWindowShape(
+          reduce_window->operand(0)->shape(),
+          reduce_window->operand(1)->shape(), reduce_window->window(),
+          reduce_window->to_apply()->ComputeProgramShape()));
+}
 
-  Status HandleSelectAndScatter(HloInstruction* instruction) override {
-    return CheckShape(
-        instruction,
-        ShapeInference::InferSelectAndScatterShape(
-            instruction->operand(0)->shape(),
-            instruction->select()->ComputeProgramShape(), instruction->window(),
-            instruction->operand(1)->shape(), instruction->operand(2)->shape(),
-            instruction->scatter()->ComputeProgramShape()));
-  }
+Status ShapeVerifier::HandleSelectAndScatter(HloInstruction* instruction) {
+  return CheckShape(
+      instruction,
+      ShapeInference::InferSelectAndScatterShape(
+          instruction->operand(0)->shape(),
+          instruction->select()->ComputeProgramShape(), instruction->window(),
+          instruction->operand(1)->shape(), instruction->operand(2)->shape(),
+          instruction->scatter()->ComputeProgramShape()));
+}
 
-  Status HandleWhile(HloInstruction* xla_while) override {
-    // The shape of kWhile should match the shape of the body computation it
-    // calls.
-    return CheckShape(xla_while,
-                      xla_while->while_body()->ComputeProgramShape().result());
-  }
+Status ShapeVerifier::HandleWhile(HloInstruction* xla_while) {
+  // The shape of kWhile should match the shape of the body computation it
+  // calls.
+  return CheckShape(xla_while,
+                    xla_while->while_body()->ComputeProgramShape().result());
+}
 
-  Status HandleConditional(HloInstruction* conditional) override {
-    TF_RETURN_IF_ERROR(CheckShape(
-        conditional,
-        conditional->true_computation()->ComputeProgramShape().result()));
-    return CheckShape(
-        conditional,
-        conditional->false_computation()->ComputeProgramShape().result());
-  }
+Status ShapeVerifier::HandleConditional(HloInstruction* conditional) {
+  TF_RETURN_IF_ERROR(CheckShape(
+      conditional,
+      conditional->true_computation()->ComputeProgramShape().result()));
+  return CheckShape(
+      conditional,
+      conditional->false_computation()->ComputeProgramShape().result());
+}
 
-  Status HandlePad(HloInstruction* pad) override {
-    return CheckShape(pad,
-                      ShapeInference::InferPadShape(pad->operand(0)->shape(),
-                                                    pad->operand(1)->shape(),
-                                                    pad->padding_config()));
-  }
+Status ShapeVerifier::HandlePad(HloInstruction* pad) {
+  return CheckShape(pad, ShapeInference::InferPadShape(pad->operand(0)->shape(),
+                                                       pad->operand(1)->shape(),
+                                                       pad->padding_config()));
+}
 
-  Status HandleSend(HloInstruction* send) override {
-    TF_RET_CHECK(send->users().size() == 1);
-    const HloInstruction* send_done = send->users().front();
-    TF_RET_CHECK(send_done->opcode() == HloOpcode::kSendDone);
-    TF_RETURN_IF_ERROR(CheckSameChannel(send, send_done));
-    return CheckShape(
-        send, ShapeUtil::MakeTupleShape(
-                  {send->operand(0)->shape(), ShapeUtil::MakeShape(U32, {})}));
-  }
+Status ShapeVerifier::HandleSend(HloInstruction* send) {
+  TF_RET_CHECK(send->users().size() == 1);
+  const HloInstruction* send_done = send->users().front();
+  TF_RET_CHECK(send_done->opcode() == HloOpcode::kSendDone);
+  TF_RETURN_IF_ERROR(CheckSameChannel(send, send_done));
+  return CheckShape(
+      send, ShapeUtil::MakeTupleShape(
+                {send->operand(0)->shape(), ShapeUtil::MakeShape(U32, {})}));
+}
 
-  Status HandleSendDone(HloInstruction* send_done) override {
-    TF_RET_CHECK(send_done->operands().size() == 1);
-    const HloInstruction* send = send_done->operand(0);
-    TF_RET_CHECK(send->opcode() == HloOpcode::kSend);
-    TF_RETURN_IF_ERROR(CheckSameChannel(send, send_done));
-    return CheckShape(send_done, ShapeUtil::MakeNil());
-  }
+Status ShapeVerifier::HandleSendDone(HloInstruction* send_done) {
+  TF_RET_CHECK(send_done->operands().size() == 1);
+  const HloInstruction* send = send_done->operand(0);
+  TF_RET_CHECK(send->opcode() == HloOpcode::kSend);
+  TF_RETURN_IF_ERROR(CheckSameChannel(send, send_done));
+  return CheckShape(send_done, ShapeUtil::MakeNil());
+}
 
-  Status HandleRecv(HloInstruction* recv) override {
-    TF_RET_CHECK(recv->users().size() == 1);
-    const HloInstruction* recv_done = recv->users().front();
-    TF_RET_CHECK(recv_done->opcode() == HloOpcode::kRecvDone);
-    TF_RETURN_IF_ERROR(CheckSameChannel(recv, recv_done));
-    return CheckShape(recv,
-                      ShapeUtil::MakeTupleShape(
-                          {recv_done->shape(), ShapeUtil::MakeShape(U32, {})}));
-  }
+Status ShapeVerifier::HandleRecv(HloInstruction* recv) {
+  TF_RET_CHECK(recv->users().size() == 1);
+  const HloInstruction* recv_done = recv->users().front();
+  TF_RET_CHECK(recv_done->opcode() == HloOpcode::kRecvDone);
+  TF_RETURN_IF_ERROR(CheckSameChannel(recv, recv_done));
+  return CheckShape(recv,
+                    ShapeUtil::MakeTupleShape(
+                        {recv_done->shape(), ShapeUtil::MakeShape(U32, {})}));
+}
 
-  Status HandleRecvDone(HloInstruction* recv_done) override {
-    TF_RET_CHECK(recv_done->operands().size() == 1);
-    const HloInstruction* recv = recv_done->operand(0);
-    TF_RET_CHECK(recv->opcode() == HloOpcode::kRecv);
-    TF_RETURN_IF_ERROR(CheckSameChannel(recv, recv_done));
-    return CheckShape(recv_done, recv->shape().tuple_shapes(0));
-  }
+Status ShapeVerifier::HandleRecvDone(HloInstruction* recv_done) {
+  TF_RET_CHECK(recv_done->operands().size() == 1);
+  const HloInstruction* recv = recv_done->operand(0);
+  TF_RET_CHECK(recv->opcode() == HloOpcode::kRecv);
+  TF_RETURN_IF_ERROR(CheckSameChannel(recv, recv_done));
+  return CheckShape(recv_done, recv->shape().tuple_shapes(0));
+}
 
-  Status HandleBatchNormTraining(HloInstruction* batch_norm_training) override {
-    return CheckShape(batch_norm_training,
-                      ShapeInference::InferBatchNormTrainingShape(
-                          batch_norm_training->operand(0)->shape(),
-                          batch_norm_training->operand(1)->shape(),
-                          batch_norm_training->operand(2)->shape(),
-                          batch_norm_training->feature_index()));
-  }
+Status ShapeVerifier::HandleBatchNormTraining(
+    HloInstruction* batch_norm_training) {
+  return CheckShape(batch_norm_training,
+                    ShapeInference::InferBatchNormTrainingShape(
+                        batch_norm_training->operand(0)->shape(),
+                        batch_norm_training->operand(1)->shape(),
+                        batch_norm_training->operand(2)->shape(),
+                        batch_norm_training->feature_index()));
+}
 
-  Status HandleBatchNormInference(
-      HloInstruction* batch_norm_inference) override {
-    return CheckShape(batch_norm_inference,
-                      ShapeInference::InferBatchNormInferenceShape(
-                          batch_norm_inference->operand(0)->shape(),
-                          batch_norm_inference->operand(1)->shape(),
-                          batch_norm_inference->operand(2)->shape(),
-                          batch_norm_inference->operand(3)->shape(),
-                          batch_norm_inference->operand(4)->shape(),
-                          batch_norm_inference->feature_index()));
-  }
+Status ShapeVerifier::HandleBatchNormInference(
+    HloInstruction* batch_norm_inference) {
+  return CheckShape(batch_norm_inference,
+                    ShapeInference::InferBatchNormInferenceShape(
+                        batch_norm_inference->operand(0)->shape(),
+                        batch_norm_inference->operand(1)->shape(),
+                        batch_norm_inference->operand(2)->shape(),
+                        batch_norm_inference->operand(3)->shape(),
+                        batch_norm_inference->operand(4)->shape(),
+                        batch_norm_inference->feature_index()));
+}
 
-  Status HandleBatchNormGrad(HloInstruction* batch_norm_grad) override {
-    return CheckShape(batch_norm_grad, ShapeInference::InferBatchNormGradShape(
-                                           batch_norm_grad->operand(0)->shape(),
-                                           batch_norm_grad->operand(1)->shape(),
-                                           batch_norm_grad->operand(2)->shape(),
-                                           batch_norm_grad->operand(3)->shape(),
-                                           batch_norm_grad->operand(4)->shape(),
-                                           batch_norm_grad->feature_index()));
-  }
+Status ShapeVerifier::HandleBatchNormGrad(HloInstruction* batch_norm_grad) {
+  return CheckShape(batch_norm_grad, ShapeInference::InferBatchNormGradShape(
+                                         batch_norm_grad->operand(0)->shape(),
+                                         batch_norm_grad->operand(1)->shape(),
+                                         batch_norm_grad->operand(2)->shape(),
+                                         batch_norm_grad->operand(3)->shape(),
+                                         batch_norm_grad->operand(4)->shape(),
+                                         batch_norm_grad->feature_index()));
+}
 
-  Status FinishVisit(HloInstruction*) override {
-    return tensorflow::Status::OK();
-  }
+namespace {
 
- private:
-  // Check the instruction's shape against the given expected shape and return
-  // an appropriate error if there is a mismatch.
-  Status CheckShape(const HloInstruction* instruction,
-                    const Shape& expected_shape) {
-    if (!ShapeUtil::Compatible(instruction->shape(), expected_shape)) {
-      return InvalidArgument(
-          "Expected instruction to have shape compatible with %s, actual "
-          "shape is %s:\n%s",
-          ShapeUtil::HumanString(expected_shape).c_str(),
-          ShapeUtil::HumanString(instruction->shape()).c_str(),
-          instruction->ToString().c_str());
+// Checks that the instruction does not have mixed precision floating point
+// inputs.
+Status CheckMixedPrecisionOperands(const HloInstruction* instruction) {
+  switch (instruction->opcode()) {
+    // White list the following opcodes for mixed-precision check, because they
+    // involve data pass through or grouping via tuples, where the precisions
+    // of buffers can be different.
+    case HloOpcode::kCall:
+    case HloOpcode::kConditional:
+    case HloOpcode::kConstant:
+    case HloOpcode::kCrossReplicaSum:
+    case HloOpcode::kCustomCall:
+    case HloOpcode::kFusion:
+    case HloOpcode::kGetTupleElement:
+    case HloOpcode::kInfeed:
+    case HloOpcode::kOutfeed:
+    case HloOpcode::kParameter:
+    case HloOpcode::kRecv:
+    case HloOpcode::kRecvDone:
+    case HloOpcode::kReducePrecision:
+    case HloOpcode::kSelect:
+    case HloOpcode::kSend:
+    case HloOpcode::kSendDone:
+    case HloOpcode::kTuple:
+    case HloOpcode::kWhile:
+      break;
+    default: {
+      PrimitiveType fp_type = PRIMITIVE_TYPE_INVALID;
+      for (auto operand : instruction->operands()) {
+        TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+            operand->shape(),
+            [&](const Shape& subshape, const ShapeIndex& index) {
+              if (!ShapeUtil::ElementIsFloating(subshape)) {
+                return Status::OK();
+              }
+              if (fp_type == PRIMITIVE_TYPE_INVALID) {
+                fp_type = subshape.element_type();
+              } else if (fp_type != subshape.element_type()) {
+                return FailedPrecondition(
+                    "Seen floating point types of different precisions in "
+                    "%s, but mixed precision is disallowed.",
+                    instruction->ToString().c_str());
+              }
+              return Status::OK();
+            }));
+      }
     }
-    return tensorflow::Status::OK();
   }
+  return Status::OK();
+}
 
-  // Overload which takes a StatusOr to reduce boilerplate in the caller.
-  Status CheckShape(const HloInstruction* instruction,
-                    const StatusOr<Shape>& expected_shape_status) {
-    if (!expected_shape_status.ok()) {
-      Status s = expected_shape_status.status();
-      tensorflow::errors::AppendToMessage(&s, ", for instruction ",
-                                          instruction->ToString());
-      return s;
-    }
-    return CheckShape(instruction, expected_shape_status.ValueOrDie());
-  }
+}  // namespace
 
-  // Check a unary (binary, etc) instruction's shape against the inferred shape.
-  Status CheckUnaryShape(const HloInstruction* instruction) {
-    return CheckShape(instruction,
-                      ShapeInference::InferUnaryOpShape(
-                          instruction->opcode(), instruction->operand(0)));
-  }
-  Status CheckBinaryShape(const HloInstruction* instruction) {
-    return CheckShape(instruction,
-                      ShapeInference::InferBinaryOpShape(
-                          instruction->opcode(), instruction->operand(0),
-                          instruction->operand(1)));
-  }
-  Status CheckTernaryShape(const HloInstruction* instruction) {
-    return CheckShape(instruction,
-                      ShapeInference::InferTernaryOpShape(
-                          instruction->opcode(), instruction->operand(0),
-                          instruction->operand(1), instruction->operand(2)));
+Status ShapeVerifier::CheckShape(const HloInstruction* instruction,
+                                 const Shape& inferred_shape) {
+  // If allow_mixed_precision_ is false, check if there are operands with
+  // different precisions. We need this check because ShapeInference allows
+  // mixed precision inputs.
+  if (!allow_mixed_precision_) {
+    TF_RETURN_IF_ERROR(CheckMixedPrecisionOperands(instruction));
+  }
+
+  // Check if the output shape matches the expected shape.
+  bool compatible;
+  // We treat BF16 and F32 as compatible types if mixed precision is allowed,
+  // but only when the instruction defines the BF16/F32 buffer.
+  switch (instruction->opcode()) {
+    case HloOpcode::kSelect:
+      if (ShapeUtil::IsTuple(inferred_shape) || !allow_mixed_precision_) {
+        // Select only defines the top-level buffer, which in this case is the
+        // tuple, so we cannot allow mixed precision.
+        compatible =
+            ShapeUtil::Compatible(instruction->shape(), inferred_shape);
+      } else {
+        compatible = ShapeUtil::CompatibleIgnoringFpPrecision(
+            instruction->shape(), inferred_shape);
+      }
+      break;
+    case HloOpcode::kGetTupleElement:
+    case HloOpcode::kTuple:
+      // Tuple and GetTupleElement do not define BF16/F32 buffers, so mixed
+      // precision is disallowed.
+    case HloOpcode::kConstant:
+    case HloOpcode::kBitcast:
+    case HloOpcode::kBitcastConvert:
+    case HloOpcode::kCall:
+    case HloOpcode::kConditional:
+    case HloOpcode::kConvert:
+    case HloOpcode::kCustomCall:
+    case HloOpcode::kInfeed:
+    case HloOpcode::kOutfeed:
+    case HloOpcode::kParameter:
+    case HloOpcode::kRecv:
+    case HloOpcode::kRecvDone:
+    case HloOpcode::kSend:
+    case HloOpcode::kSendDone:
+    case HloOpcode::kWhile:
+      // The above opcodes should match the expected shapes exactly.
+      compatible = ShapeUtil::Compatible(instruction->shape(), inferred_shape);
+      break;
+    default:
+      if (allow_mixed_precision_) {
+        compatible = ShapeUtil::CompatibleIgnoringFpPrecision(
+            instruction->shape(), inferred_shape);
+      } else {
+        compatible =
+            ShapeUtil::Compatible(instruction->shape(), inferred_shape);
+      }
   }
-  Status CheckVariadicShape(const HloInstruction* instruction) {
-    return CheckShape(instruction,
-                      ShapeInference::InferVariadicOpShape(
-                          instruction->opcode(), instruction->operands()));
+  if (!compatible) {
+    return InvalidArgument(
+        "Expected instruction to have shape compatible with %s, actual "
+        "shape is %s:\n%s",
+        ShapeUtil::HumanString(inferred_shape).c_str(),
+        ShapeUtil::HumanString(instruction->shape()).c_str(),
+        instruction->ToString().c_str());
   }
+  return tensorflow::Status::OK();
+}
 
-  // Checks if the given two instructions shares the same channel id.
-  Status CheckSameChannel(const HloInstruction* instr1,
-                          const HloInstruction* instr2) {
-    if (instr1->channel_id() != instr2->channel_id()) {
-      return FailedPrecondition(
-          "Expected to have the same channel id, actual channel ids are: %s "
-          "(%lld), %s (%lld)",
-          instr1->ToString().c_str(), instr1->channel_id(),
-          instr2->ToString().c_str(), instr2->channel_id());
-    }
-    return tensorflow::Status::OK();
+Status ShapeVerifier::CheckShape(const HloInstruction* instruction,
+                                 const StatusOr<Shape>& inferred_shape_status) {
+  if (!inferred_shape_status.ok()) {
+    Status s = inferred_shape_status.status();
+    tensorflow::errors::AppendToMessage(&s, ", for instruction ",
+                                        instruction->ToString());
+    return s;
   }
+  return CheckShape(instruction, inferred_shape_status.ValueOrDie());
+}
 
-  // Returns the size of a Shape in bytes.
-  const std::function<int64(const Shape&)> shape_size_fn_;
-};
+Status ShapeVerifier::CheckUnaryShape(const HloInstruction* instruction) {
+  return CheckShape(instruction,
+                    ShapeInference::InferUnaryOpShape(instruction->opcode(),
+                                                      instruction->operand(0)));
+}
+
+Status ShapeVerifier::CheckBinaryShape(const HloInstruction* instruction) {
+  return CheckShape(
+      instruction, ShapeInference::InferBinaryOpShape(instruction->opcode(),
+                                                      instruction->operand(0),
+                                                      instruction->operand(1)));
+}
+
+Status ShapeVerifier::CheckTernaryShape(const HloInstruction* instruction) {
+  return CheckShape(instruction,
+                    ShapeInference::InferTernaryOpShape(
+                        instruction->opcode(), instruction->operand(0),
+                        instruction->operand(1), instruction->operand(2)));
+}
+
+Status ShapeVerifier::CheckVariadicShape(const HloInstruction* instruction) {
+  return CheckShape(instruction,
+                    ShapeInference::InferVariadicOpShape(
+                        instruction->opcode(), instruction->operands()));
+}
+
+// Checks if the given two instructions shares the same channel id.
+Status ShapeVerifier::CheckSameChannel(const HloInstruction* instr1,
+                                       const HloInstruction* instr2) {
+  if (instr1->channel_id() != instr2->channel_id()) {
+    return FailedPrecondition(
+        "Expected to have the same channel id, actual channel ids are: %s "
+        "(%lld), %s (%lld)",
+        instr1->ToString().c_str(), instr1->channel_id(),
+        instr2->ToString().c_str(), instr2->channel_id());
+  }
+  return tensorflow::Status::OK();
+}
 
 string ComputationsToString(
     tensorflow::gtl::ArraySlice<HloComputation*> computations) {
@@ -429,7 +545,62 @@ string ComputationsToString(
       });
 }
 
-}  // namespace
+// Verifies various invariants about the structure of the HLO:
+//
+// (1) each instruction has a non-null parent() set to the HloComputation which
+//     contains it.
+//
+// (2) each computation has a non-null parent() set to the HloModule which
+//     contains it.
+//
+// (3) the operands of each instruction are in the same computation as the
+//     instruction.
+Status VerifyHloStructure(HloModule* module) {
+  for (const HloComputation* computation : module->computations()) {
+    if (computation->parent() == nullptr) {
+      return FailedPrecondition("Computation %s has a null parent pointer",
+                                computation->name().c_str());
+    }
+    if (computation->parent() != module) {
+      return FailedPrecondition(
+          "Computation %s parent() does not point to parent module",
+          computation->name().c_str());
+    }
+
+    for (const HloInstruction* instruction : computation->instructions()) {
+      if (instruction->parent() == nullptr) {
+        return FailedPrecondition("Instruction %s has a null parent pointer",
+                                  instruction->name().c_str());
+      }
+      if (instruction->parent() != computation) {
+        return FailedPrecondition(
+            "Instruction %s parent() does not point to parent computation",
+            instruction->name().c_str());
+      }
+    }
+  }
+
+  // Check that operands are in the same computation separately from verifying
+  // parent() correctness so conditions like a null HloInstruction::parent() are
+  // identified and reported explicitly above rather than reporting a mismatched
+  // operand.
+  for (const HloComputation* computation : module->computations()) {
+    for (const HloInstruction* instruction : computation->instructions()) {
+      for (int i = 0; i < instruction->operand_count(); ++i) {
+        const HloInstruction* operand = instruction->operand(i);
+        if (operand->parent() != instruction->parent()) {
+          return FailedPrecondition(
+              "Operand %d (%s) of instruction %s is in a different "
+              "computation: %s vs %s",
+              i, operand->name().c_str(), instruction->name().c_str(),
+              operand->parent()->name().c_str(),
+              instruction->parent()->name().c_str());
+        }
+      }
+    }
+  }
+  return tensorflow::Status::OK();
+}
 
 Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const {
   // The parent fusion instruction of the fusion computation must be 'fusion'.
@@ -549,8 +720,9 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const {
 }
 
 StatusOr<bool> HloVerifier::Run(HloModule* module) {
+  TF_RETURN_IF_ERROR(VerifyHloStructure(module));
+
   tensorflow::gtl::FlatMap<string, const HloInstruction*> instructions;
-  ShapeVerifier shape_verifier(shape_size_fn_);
 
   for (auto* computation : module->computations()) {
     for (const auto& instruction : computation->instructions()) {
@@ -630,7 +802,8 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) {
       instructions[instruction->name()] = instruction;
     }
 
-    TF_RETURN_IF_ERROR(computation->Accept(&shape_verifier));
+    std::unique_ptr<ShapeVerifier> shape_verifier = shape_verifier_factory_();
+    TF_RETURN_IF_ERROR(computation->Accept(shape_verifier.get()));
   }
 
   return false;
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index e35a7f3642ccf91df37f69a3a11bd8c8e428b846..7eccf834bbd3ac6af0d5762a7241758b416a3523 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -13,19 +13,124 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_VERIFIER_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_VERIFIER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_VERIFIER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_VERIFIER_H_
 
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 
+#include "tensorflow/compiler/xla/service/shape_inference.h"
+
 namespace xla {
 
+// Visitor which verifies that the output shape is correctly set. Verifies
+// against the inferred shape for the instruction.
+// TODO(b/26024837): Check output shape for all instruction types.
+class ShapeVerifier : public DfsHloVisitor {
+ public:
+  explicit ShapeVerifier() : allow_mixed_precision_(false) {}
+  explicit ShapeVerifier(bool allow_mixed_precision)
+      : allow_mixed_precision_(allow_mixed_precision) {}
+
+  Status HandleElementwiseUnary(HloInstruction* hlo) override;
+  Status HandleElementwiseBinary(HloInstruction* hlo) override;
+  Status HandleClamp(HloInstruction* clamp) override;
+  Status HandleSelect(HloInstruction* select) override;
+  Status HandleConcatenate(HloInstruction* concatenate) override;
+  Status HandleConvert(HloInstruction* convert) override;
+  Status HandleBitcastConvert(HloInstruction* convert) override;
+  Status HandleCopy(HloInstruction* copy) override;
+  Status HandleDot(HloInstruction* dot) override;
+  Status HandleConvolution(HloInstruction* convolution) override;
+  Status HandleFft(HloInstruction* fft) override;
+  Status HandleCrossReplicaSum(HloInstruction* crs) override;
+  Status HandleReducePrecision(HloInstruction* reduce_precision) override;
+  Status HandleInfeed(HloInstruction*) override;
+  Status HandleOutfeed(HloInstruction*) override;
+  Status HandleRng(HloInstruction*) override;
+  Status HandleReverse(HloInstruction* reverse) override;
+  Status HandleSort(HloInstruction* sort) override;
+  Status HandleConstant(HloInstruction* constant) override;
+  Status HandleGetTupleElement(HloInstruction* get_tuple_element) override;
+  Status HandleReduce(HloInstruction* reduce) override;
+  Status HandleBitcast(HloInstruction* bitcast) override;
+  Status HandleBroadcast(HloInstruction* broadcast) override;
+  Status HandleReshape(HloInstruction* reshape) override;
+  Status HandleTranspose(HloInstruction* transpose) override;
+  Status HandleParameter(HloInstruction*) override;
+  Status HandleFusion(HloInstruction*) override;
+  Status HandleCall(HloInstruction* call) override;
+  Status HandleCustomCall(HloInstruction*) override;
+  Status HandleSlice(HloInstruction* slice) override;
+  Status HandleDynamicSlice(HloInstruction* dynamic_slice) override;
+  Status HandleDynamicUpdateSlice(
+      HloInstruction* dynamic_update_slice) override;
+  Status HandleTuple(HloInstruction* tuple) override;
+  Status HandleMap(HloInstruction* map) override;
+  Status HandleReduceWindow(HloInstruction* reduce_window) override;
+  Status HandleSelectAndScatter(HloInstruction* instruction) override;
+  Status HandleWhile(HloInstruction* xla_while) override;
+  Status HandleConditional(HloInstruction* conditional) override;
+  Status HandlePad(HloInstruction* pad) override;
+  Status HandleSend(HloInstruction* send) override;
+  Status HandleSendDone(HloInstruction* send_done) override;
+  Status HandleRecv(HloInstruction* recv) override;
+  Status HandleRecvDone(HloInstruction* recv_done) override;
+  Status HandleBatchNormTraining(HloInstruction* batch_norm_training) override;
+  Status HandleBatchNormInference(
+      HloInstruction* batch_norm_inference) override;
+  Status HandleBatchNormGrad(HloInstruction* batch_norm_grad) override;
+
+  Status FinishVisit(HloInstruction*) override {
+    return tensorflow::Status::OK();
+  }
+
+ protected:
+  // Check the instruction's shape against the shape given by ShapeInference
+  // and return an appropriate error if there is a mismatch.
+  Status CheckShape(const HloInstruction* instruction,
+                    const Shape& inferred_shape);
+
+  // Overload which takes a StatusOr to reduce boilerplate in the caller.
+  Status CheckShape(const HloInstruction* instruction,
+                    const StatusOr<Shape>& inferred_shape_status);
+
+  // Check a unary (binary, etc) instruction's shape against the inferred shape.
+  Status CheckUnaryShape(const HloInstruction* instruction);
+  Status CheckBinaryShape(const HloInstruction* instruction);
+  Status CheckTernaryShape(const HloInstruction* instruction);
+  Status CheckVariadicShape(const HloInstruction* instruction);
+
+  // Checks if the given two instructions shares the same channel id.
+  Status CheckSameChannel(const HloInstruction* instr1,
+                          const HloInstruction* instr2);
+
+ private:
+  // Whether the inputs and output of an instruction can contain both F32s and
+  // BF16s. Tuples that include both F32s and BF16s are allowed regardless of
+  // this flag.
+  bool allow_mixed_precision_;
+};
+
 // HLO pass that verifies invariants of HLO instructions for each computation in
 // the module.
 class HloVerifier : public HloPassInterface {
  public:
-  explicit HloVerifier(const std::function<int64(const Shape&)>& shape_size_fn)
-      : shape_size_fn_(shape_size_fn) {}
+  using ShapeVerifierFactory = std::function<std::unique_ptr<ShapeVerifier>()>;
+
+  // Uses standard shape inference.
+  explicit HloVerifier()
+      : shape_verifier_factory_(
+            [] { return MakeUnique<ShapeVerifier>(false); }) {}
+
+  explicit HloVerifier(bool allow_mixed_precision)
+      : shape_verifier_factory_([allow_mixed_precision] {
+          return MakeUnique<ShapeVerifier>(allow_mixed_precision);
+        }) {}
+
+  // Uses custom shape verification.
+  explicit HloVerifier(ShapeVerifierFactory shape_verifier_factory)
+      : shape_verifier_factory_(std::move(shape_verifier_factory)) {}
+
   ~HloVerifier() override = default;
   tensorflow::StringPiece name() const override { return "verifier"; }
 
@@ -37,10 +142,13 @@ class HloVerifier : public HloPassInterface {
   // CHECKs various invariants of a fusion instruction.
   Status CheckFusionInstruction(HloInstruction* fusion) const;
 
-  // Returns the size of a Shape in bytes.
-  const std::function<int64(const Shape&)> shape_size_fn_;
+  // Creates a ShapeVerifier that checks that shapes match inferred
+  // expectations.  This is a factory function because ShapeVerifier,  Note that
+  // ShapeVerifier, being a DfsHloVisitor, is stateful.  We want a clean object
+  // for each run of the verifier.
+  ShapeVerifierFactory shape_verifier_factory_;
 };
 
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_VERIFIER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_VERIFIER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c92db0be14dceb32ea86521dcc99b8f63738e4a5
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
@@ -0,0 +1,127 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_verifier.h"
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+using ::testing::HasSubstr;
+
+using HloVerifierTest = HloTestBase;
+
+TEST_F(HloVerifierTest, NullInstructionParent) {
+  HloComputation::Builder builder(TestName());
+  const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+  HloInstruction* param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "param"));
+  HloInstruction* negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(scalar_shape, HloOpcode::kNegate, param));
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
+
+  TF_ASSERT_OK(verifier().Run(module.get()).status());
+
+  negate->set_parent(nullptr);
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(), HasSubstr("has a null parent pointer"));
+}
+
+TEST_F(HloVerifierTest, NullComputationParent) {
+  HloComputation::Builder builder(TestName());
+  const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+  HloInstruction* param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "param"));
+  builder.AddInstruction(
+      HloInstruction::CreateUnary(scalar_shape, HloOpcode::kNegate, param));
+  auto module = CreateNewModule();
+  HloComputation* computation = module->AddEntryComputation(builder.Build());
+
+  TF_ASSERT_OK(verifier().Run(module.get()).status());
+
+  computation->set_parent(nullptr);
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(), HasSubstr("has a null parent pointer"));
+}
+
+TEST_F(HloVerifierTest, DifferentOperandParents) {
+  HloComputation::Builder builder(TestName());
+  const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+  HloInstruction* param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "param"));
+  HloInstruction* negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(scalar_shape, HloOpcode::kNegate, param));
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
+
+  HloComputation::Builder emb_builder(TestName());
+  HloInstruction* emb_param = emb_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "param"));
+  module->AddEmbeddedComputation(emb_builder.Build());
+
+  TF_ASSERT_OK(verifier().Run(module.get()).status());
+  TF_ASSERT_OK(negate->ReplaceOperandWith(0, emb_param));
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("is in a different computation"));
+}
+
+TEST_F(HloVerifierTest, ResetsShapeVerifierState) {
+  HloComputation::Builder builder(TestName());
+  Shape s1 = ShapeUtil::MakeShape(F32, {1});
+  Shape s2 = ShapeUtil::MakeShape(F32, {2});
+
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, s1, "param"));
+
+  // Create an add instruction with the incorrect shape.
+  HloInstruction* add = builder.AddInstruction(
+      HloInstruction::CreateBinary(s2, HloOpcode::kAdd, param, param));
+
+  // In order to trigger the bug we're checking for, the instruction with the
+  // bad shape can't be the root of the computation.
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(s2, HloOpcode::kMultiply, add, add));
+
+  auto module = CreateNewModule();
+  module->AddEntryComputation(builder.Build());
+
+  // Run the verifier twice.  It should fail both times, because it shouldn't
+  // carry state in its DFS visitor between runs.
+  EXPECT_FALSE(verifier().Run(module.get()).status().ok());
+  EXPECT_FALSE(verifier().Run(module.get()).status().ok());
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
index b7c40fdeeb157fc74900bd9cf9d68a06a2cb1d56..13e4557317f74b3fb46f07fb91c339fd2f34752f 100644
--- a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
+++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
@@ -25,6 +25,7 @@ namespace xla {
 using tensorflow::strings::Appendf;
 using tensorflow::strings::HumanReadableElapsedTime;
 using tensorflow::strings::HumanReadableNumBytes;
+using tensorflow::strings::Printf;
 using tensorflow::strings::StrAppend;
 
 string HumanReadableProfileBuilder::ToString() const {
@@ -43,7 +44,12 @@ string HumanReadableProfileBuilder::ToString() const {
     } else {
       bytes_per_sec =
           HumanReadableNumBytes(op.bytes_accessed / CyclesToSeconds(op.cycles));
-      bytes_per_cycle = HumanReadableNumBytes(op.bytes_accessed / op.cycles);
+      if (op.bytes_accessed > op.cycles) {
+        bytes_per_cycle = HumanReadableNumBytes(op.bytes_accessed / op.cycles);
+      } else {
+        bytes_per_cycle =
+            Printf("%.3fB", static_cast<float>(op.bytes_accessed) / op.cycles);
+      }
     }
 
     double cycles_percent = 0;
diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover.cc b/tensorflow/compiler/xla/service/implicit_broadcast_remover.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ada21345014dac70d61129aaf7bbc7466a7db914
--- /dev/null
+++ b/tensorflow/compiler/xla/service/implicit_broadcast_remover.cc
@@ -0,0 +1,124 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/implicit_broadcast_remover.h"
+
+#include <algorithm>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+namespace {
+
+// Visitor for removing implicit broadcasts.
+class ImplicitBroadcastVisitor : public DfsHloVisitorWithDefault {
+ public:
+  Status DefaultAction(HloInstruction* hlo_instruction) override {
+    return Status::OK();
+  }
+
+  Status HandleElementwiseBinary(HloInstruction* hlo) override {
+    return ReplaceImplicitBroadcastOperands(hlo);
+  }
+
+  Status HandleClamp(HloInstruction* hlo) override {
+    // Clamp is the only element-wise ternary operation.
+    return ReplaceImplicitBroadcastOperands(hlo);
+  }
+
+  // Returns whether any modification has been made to any visited instruction.
+  bool changed() const { return changed_; }
+
+ private:
+  // Iterates through the operands of 'hlo' and replace any operands which are
+  // implicitly broadcast with the equivalent sequence of broadcast and reshape
+  // instructions. An operand is considered to be implicitly broadcast if the
+  // operand shape does have the same dimensions as the shape of 'hlo'.
+  Status ReplaceImplicitBroadcastOperands(HloInstruction* hlo) {
+    auto fadd = [hlo](std::unique_ptr<HloInstruction> x) {
+      return hlo->parent()->AddInstruction(std::move(x));
+    };
+    std::vector<HloInstruction*> operands;
+    bool operands_changed = false;
+    for (int i = 0; i < hlo->operand_count(); ++i) {
+      HloInstruction* operand = hlo->mutable_operand(i);
+      if (!ShapeUtil::SameDimensions(hlo->shape(), operand->shape())) {
+        HloInstruction* new_operand = hlo->parent()->AddInstruction(
+            HloInstruction::CreateBroadcastSequence(hlo->shape(), operand,
+                                                    fadd));
+        operands.push_back(new_operand);
+        operands_changed = true;
+      } else {
+        operands.push_back(operand);
+      }
+    }
+    if (operands_changed) {
+      // Create a new HLO instruction because the HloInstruction::Replace*
+      // methods check that the shape does not change with the replacement.
+      HloInstruction* new_hlo = hlo->parent()->AddInstruction(
+          hlo->CloneWithNewOperands(hlo->shape(), operands));
+      TF_RETURN_IF_ERROR(hlo->ReplaceAllUsesWith(new_hlo));
+      changed_ = true;
+    }
+    return Status::OK();
+  }
+
+  bool changed_ = false;
+};
+
+}  // namespace
+
+StatusOr<bool> ImplicitBroadcastRemover::Run(HloModule* module) {
+  VLOG(1) << "Removing implicit broadcast from module " << module->name();
+  XLA_VLOG_LINES(2,
+                 "Before removing implicit broadcasts:\n" + module->ToString());
+
+  ImplicitBroadcastVisitor visitor;
+  for (HloComputation* computation : module->computations()) {
+    TF_RETURN_IF_ERROR(computation->Accept(&visitor));
+  }
+
+  if (visitor.changed()) {
+    // HLO instructions with implicitly broadcast operands are cloned and left
+    // for dead. Remove them.
+    HloDCE dce;
+    TF_RETURN_IF_ERROR(dce.Run(module).status());
+  }
+
+  XLA_VLOG_LINES(2,
+                 "After removing implicit broadcasts:\n" + module->ToString());
+
+  return visitor.changed();
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover.h b/tensorflow/compiler/xla/service/implicit_broadcast_remover.h
new file mode 100644
index 0000000000000000000000000000000000000000..aa325dc8a353c5bfbfded0c2774c66bfcc71c9cb
--- /dev/null
+++ b/tensorflow/compiler/xla/service/implicit_broadcast_remover.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_IMPLICIT_BROADCAST_REMOVER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_IMPLICIT_BROADCAST_REMOVER_H_
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// Pass which replaces all implicit broadcasts with their equivalent sequence of
+// explicit broadcast and reshape instructions.
+class ImplicitBroadcastRemover : public HloPassInterface {
+ public:
+  ImplicitBroadcastRemover() {}
+  ~ImplicitBroadcastRemover() override {}
+
+  tensorflow::StringPiece name() const override {
+    return "implicit-broadcast-remover";
+  }
+
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_IMPLICIT_BROADCAST_REMOVER_H_
diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc b/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8c7b38dd1bf73e0be7b669d7215812aaef1cee17
--- /dev/null
+++ b/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc
@@ -0,0 +1,176 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/implicit_broadcast_remover.h"
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+
+namespace op = xla::testing::opcode_matchers;
+
+namespace xla {
+namespace {
+
+class ImplicitBroadcastRemoverTest : public HloVerifiedTestBase {
+ protected:
+  ImplicitBroadcastRemover remover_;
+};
+
+TEST_F(ImplicitBroadcastRemoverTest, NoImplicitBroadcast) {
+  auto builder = HloComputation::Builder(TestName());
+
+  const Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
+  auto param0 =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0"));
+  auto param1 =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1"));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1));
+
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
+
+  EXPECT_FALSE(remover_.Run(&module()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Add(op::Parameter(), op::Parameter()));
+}
+
+TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcast) {
+  auto builder = HloComputation::Builder(TestName());
+
+  const Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
+  auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {}), "scalar_param"));
+  auto param1 =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1"));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kPower, param0, param1));
+
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+
+  EXPECT_FALSE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
+
+  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+  root = computation->root_instruction();
+
+  EXPECT_THAT(root, op::Power(op::Broadcast(op::Parameter()), op::Parameter()));
+
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
+}
+
+TEST_F(ImplicitBroadcastRemoverTest, DegenerateDimensionBroadcast) {
+  auto builder = HloComputation::Builder(TestName());
+
+  const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6});
+  auto param0 =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0"));
+  auto param1 = builder.AddInstruction(HloInstruction::CreateParameter(
+      1, ShapeUtil::MakeShape(F32, {1, 4, 1}), "p1"));
+  builder.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kSubtract, param0, param1));
+
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_THAT(root, op::Subtract(op::Parameter(),
+                                 op::Broadcast(op::Reshape(op::Parameter()))));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
+}
+
+TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcastToDegenerateDimensions) {
+  auto builder = HloComputation::Builder(TestName());
+
+  const Shape shape = ShapeUtil::MakeShape(F32, {1, 4, 1});
+  auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {}), "scalar_param"));
+  auto param1 =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1"));
+  builder.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kSubtract, param0, param1));
+
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_THAT(root,
+              op::Subtract(op::Broadcast(op::Parameter()), op::Parameter()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
+}
+
+TEST_F(ImplicitBroadcastRemoverTest, TernaryDegenerateDimensionBroadcast) {
+  auto builder = HloComputation::Builder(TestName());
+
+  const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6, 8});
+  auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {1, 4, 1, 8}), "p0"));
+  auto param1 = builder.AddInstruction(HloInstruction::CreateParameter(
+      1, ShapeUtil::MakeShape(F32, {1, 1, 6, 8}), "p1"));
+  auto param2 = builder.AddInstruction(HloInstruction::CreateParameter(
+      2, ShapeUtil::MakeShape(F32, {2, 1, 6, 8}), "p2"));
+  builder.AddInstruction(HloInstruction::CreateTernary(shape, HloOpcode::kClamp,
+                                                       param0, param1, param2));
+
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_THAT(root, op::Clamp(op::Broadcast(op::Reshape(op::Parameter())),
+                              op::Broadcast(op::Reshape(op::Parameter())),
+                              op::Broadcast(op::Reshape(op::Parameter()))));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(2)->shape()));
+}
+
+TEST_F(ImplicitBroadcastRemoverTest,
+       TernaryScalarAndDegenerateDimensionBroadcast) {
+  auto builder = HloComputation::Builder(TestName());
+
+  const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6});
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0"));
+  auto param1 = builder.AddInstruction(HloInstruction::CreateParameter(
+      1, ShapeUtil::MakeShape(F32, {1, 4, 6}), "p1"));
+  auto param2 =
+      builder.AddInstruction(HloInstruction::CreateParameter(2, shape, "p2"));
+  builder.AddInstruction(HloInstruction::CreateTernary(shape, HloOpcode::kClamp,
+                                                       param0, param1, param2));
+
+  HloComputation* computation = module().AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_THAT(root, op::Clamp(op::Broadcast(op::Parameter()),
+                              op::Broadcast(op::Reshape(op::Parameter())),
+                              op::Parameter()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(2)->shape()));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index ba901b99e4f3c72c84c1ecdf4e19e58ad9ab6506..90e1f0acdc4cdeda280dabaab2df66b181d0f407 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -100,6 +100,7 @@ namespace xla {
     case HloOpcode::kDivide:
     case HloOpcode::kDot:
     case HloOpcode::kExp:
+    case HloOpcode::kFft:
     case HloOpcode::kFusion:
     case HloOpcode::kLog:
     case HloOpcode::kMap:
diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD
index 2704a805a91b93c69b751cdb61305ea7780f0ef2..0819ab3b90b2360c6b0b2afaa89f322afe566eb3 100644
--- a/tensorflow/compiler/xla/service/interpreter/BUILD
+++ b/tensorflow/compiler/xla/service/interpreter/BUILD
@@ -92,6 +92,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_execution_profile",
         "//tensorflow/compiler/xla/service:hlo_module_config",
         "//tensorflow/compiler/xla/service:shaped_buffer",
+        "//tensorflow/compiler/xla/service:transfer_manager",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
     ],
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index dc63a2224d659fa427d4d1a30c5dc0f94d643b36..9171e859c6f84ceef9664aa1eb90a07c87dfab40 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -44,41 +44,26 @@ namespace interpreter {
 namespace se = ::perftools::gputools;
 namespace sep = ::perftools::gputools::interpreter;
 
-/*
- * Run optimization passes on the module. The graph is transformed by
- * each pass in the optimization pipeline. The service subdirectory
- * contains useful optimization passes.
- */
 Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {
   HloPassPipeline pipeline("Interpreter");
-  pipeline.AddPass<Inliner>();
-  pipeline.AddPass<HloSubcomputationUnification>();
-  pipeline.AddPass<HloCSE>(false);
-
-  pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(
-      false, [](const Shape&, const Shape&) { return false; });
-  pipeline.AddPass<WhileLoopSimplifier>();
-  pipeline.AddPass<ReshapeMover>();
-  pipeline.AddPass<HloConstantFolding>();
-  pipeline.AddPass<HloCSE>(true);
+
   pipeline.AddPass<LayoutAssignment>(
       hlo_module->mutable_entry_computation_layout());
 
-  pipeline.AddPass<HloDCE>();
-  pipeline.AddPass<FlattenCallGraph>();
   return pipeline.Run(hlo_module).status();
 }
 
 StatusOr<std::unique_ptr<HloModule>> InterpreterCompiler::RunHloPasses(
-    std::unique_ptr<HloModule> hlo_module,
-    se::StreamExecutor* /*stream_exec*/) {
+    std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* /*stream_exec*/,
+    DeviceMemoryAllocator* /*device_allocator*/) {
   VLOG(1) << "Run hlo passes on graph " << hlo_module->name();
   TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get()));
   return std::move(hlo_module);
 }
 
 StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::RunBackend(
-    std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec) {
+    std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
+    DeviceMemoryAllocator* /*device_allocator*/) {
   TF_RET_CHECK(stream_exec != nullptr);
 
   VLOG(1) << "Run backend " << hlo_module->name();
@@ -96,7 +81,8 @@ StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::RunBackend(
 
 StatusOr<std::vector<std::unique_ptr<Executable>>> InterpreterCompiler::Compile(
     std::vector<std::unique_ptr<HloModule>> /*hlo_modules*/,
-    std::vector<std::vector<se::StreamExecutor*>> /*stream_execs*/) {
+    std::vector<std::vector<se::StreamExecutor*>> /*stream_execs*/,
+    DeviceMemoryAllocator* /*device_allocator*/) {
   return tensorflow::errors::Unimplemented(
       "Compilation of multiple HLO modules is not supported on Interpreter.");
 }
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.h b/tensorflow/compiler/xla/service/interpreter/compiler.h
index 278cf5184227ae25518b1d46c0e16e4cce7bd1a8..c8660c04d86a82e7dfcfd1658310c2a0e4fa0083 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.h
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.h
@@ -45,16 +45,19 @@ class InterpreterCompiler : public Compiler {
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> hlo_module,
-      perftools::gputools::StreamExecutor* stream_exec) override;
+      perftools::gputools::StreamExecutor* stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> hlo_module,
-      perftools::gputools::StreamExecutor* stream_exec) override;
+      perftools::gputools::StreamExecutor* stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::vector<std::unique_ptr<HloModule>> hlo_modules,
       std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-          stream_exec) override;
+          stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> hlo_modules,
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index 9183a1d1bfb8c2f6e1933c004f9c9f5f9ad8eced..0cb9b5d8107cd8bf468b07d5fe2a22930d9e8b8c 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_evaluator.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/interpreter/executor.h"
+#include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -38,7 +39,6 @@ namespace xla {
 namespace interpreter {
 
 namespace se = ::perftools::gputools;
-namespace sep = ::perftools::gputools::interpreter;
 
 InterpreterExecutable::InterpreterExecutable(
     std::unique_ptr<const HloModule> hlo_module)
@@ -47,44 +47,18 @@ InterpreterExecutable::InterpreterExecutable(
 
 InterpreterExecutable::~InterpreterExecutable() {}
 
-static se::DeviceMemoryBase AllocateSingleOutput(
-    sep::InterpreterExecutor* executor, const Literal& literal) {
-  int64 size(xla::ShapeUtil::ByteSizeOf(literal.shape()));
-  void* buf = executor->Allocate(size);
-  const void* src = literal.InternalData();
-  memcpy(buf, src, size);
-  return se::DeviceMemoryBase(buf, size);
-}
-
-static se::DeviceMemoryBase AllocateOutputBuffer(
-    sep::InterpreterExecutor* executor, const Literal& literal) {
-  const Shape& shape = literal.shape();
-  if (shape.element_type() != xla::TUPLE) {
-    return AllocateSingleOutput(executor, literal);
-  } else {
-    int64 size(xla::ShapeUtil::ByteSizeOf(shape, sizeof(void*)));
-    void** buf = reinterpret_cast<void**>(executor->Allocate(size));
-    void** buf_rc = buf;
-    for (int64 n = 0; n < xla::ShapeUtil::TupleElementCount(shape); n++) {
-      se::DeviceMemoryBase out =
-          AllocateSingleOutput(executor, literal.tuple_literals(n));
-      *buf++ = out.opaque();
-    }
-
-    return se::DeviceMemoryBase(buf_rc, size);
-  }
-}
-
-StatusOr<se::DeviceMemoryBase> InterpreterExecutable::ExecuteOnStream(
+StatusOr<std::unique_ptr<ShapedBuffer>> InterpreterExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
+    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     HloExecutionProfile* hlo_execution_profile) {
   se::Stream* stream = run_options->stream();
+  se::StreamExecutor* executor = stream->parent();
+  const se::Platform* platform = executor->platform();
 
   VLOG(1) << "Execute " << module().name();
   if (VLOG_IS_ON(2)) {
     for (const auto& a : arguments) {
-      VLOG(2) << "-- argument " << a.opaque();
+      VLOG(2) << "-- argument " << *a;
     }
   }
 
@@ -96,33 +70,32 @@ StatusOr<se::DeviceMemoryBase> InterpreterExecutable::ExecuteOnStream(
         "Mismatch between argument count and graph parameter count.");
   }
 
-  // Create the arguments as an vector of XLA literals
+  TF_ASSIGN_OR_RETURN(TransferManager * transfer_manager,
+                      TransferManager::GetForPlatform(platform));
+
+  // Transform the ShapedBuffer arguments into literals which the evaluator
+  // consumes.
   std::vector<std::unique_ptr<Literal>> arg_literals;
-  std::vector<Literal*> arg_literals_ptrs;
   for (int64 p = 0; p < computation->num_parameters(); ++p) {
-    // Create the input literal for the parameter
-    HloInstruction* param = computation->parameter_instruction(p);
-    arg_literals.emplace_back(Literal::CreateFromShape(param->shape()));
-    arg_literals_ptrs.push_back(arg_literals.back().get());
-
-    // Copy in the data from the stream_executor buffers
-    void* buffer = arg_literals.back()->MutableInternalData();
-    memcpy(buffer, arguments[p].opaque(),
-           ShapeUtil::ByteSizeOf(param->shape()));
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<Literal> arg_literal,
+        transfer_manager->TransferLiteralFromDevice(executor, *arguments[p]));
+    arg_literals.push_back(std::move(arg_literal));
   }
 
   // Execute the graph using the HloEvaluator.
   HloEvaluator evaluator;
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> output,
-                      evaluator.Evaluate(*computation, arg_literals_ptrs));
-
-  // Copy the result into the return buffer
-  perftools::gputools::StreamExecutor* executor(stream->parent());
-  sep::InterpreterExecutor* interpreter_executor(
-      static_cast<sep::InterpreterExecutor*>(executor->implementation()));
-
-  se::DeviceMemoryBase ret =
-      AllocateOutputBuffer(interpreter_executor, *(output.get()));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<Literal> result_literal,
+      evaluator.Evaluate<std::unique_ptr<Literal>>(*computation, arg_literals));
+
+  // Transform the result literal back into a ShapedBuffer.
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<ShapedBuffer> result,
+                      transfer_manager->AllocateShapedBuffer(
+                          result_literal->shape(), run_options->allocator(),
+                          run_options->device_ordinal()));
+  TF_RETURN_IF_ERROR(transfer_manager->TransferLiteralToDevice(
+      executor, *result_literal, *result));
 
   uint64 end_micros = tensorflow::Env::Default()->NowMicros();
 
@@ -132,20 +105,13 @@ StatusOr<se::DeviceMemoryBase> InterpreterExecutable::ExecuteOnStream(
     execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
   }
 
-  return ret;
-}
-
-StatusOr<std::unique_ptr<ShapedBuffer>> InterpreterExecutable::ExecuteOnStream(
-    const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
-  return tensorflow::errors::Unimplemented(
-      "ExecuteOnStream is not yet supported on Interpreter.");
+  return std::move(result);
 }
 
-StatusOr<se::DeviceMemoryBase> InterpreterExecutable::ExecuteAsyncOnStream(
+StatusOr<std::unique_ptr<ShapedBuffer>>
+InterpreterExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) {
+    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
   return tensorflow::errors::Unimplemented(
       "ExecuteAsyncOnStream is not yet supported on Interpreter.");
 }
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h
index 0e87eb90bff4b896fc4bc0efc4fa7b851631be6f..410110a1adf04c83001c38ed03f5d60dd203dc7e 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.h
+++ b/tensorflow/compiler/xla/service/interpreter/executable.h
@@ -43,21 +43,14 @@ class InterpreterExecutable : public Executable {
   InterpreterExecutable(std::unique_ptr<const HloModule> hlo_module);
   ~InterpreterExecutable() override;
 
-  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
-      const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
-      HloExecutionProfile* hlo_execution_profile) override;
-
   StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteAsyncOnStream(
+  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments) override;
+      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
   static int64 ShapeSizeBytes(const Shape& shape);
 
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.cc b/tensorflow/compiler/xla/service/interpreter/executor.cc
index 0bb3259ef43915067e614e72038387e8300ecc41..68371910d76f42c0b6d4b1adad9d6a83bdb858e6 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executor.cc
@@ -85,7 +85,7 @@ bool InterpreterExecutor::HostCallback(Stream *stream,
 bool InterpreterExecutor::CreateStreamDependency(Stream *dependent,
                                                  Stream *other) {
   AsExecutorStream(dependent)->EnqueueTask(
-      [other]() { other->BlockHostUntilDone(); });
+      [other]() { SE_CHECK_OK(other->BlockHostUntilDone()); });
   AsExecutorStream(dependent)->BlockUntilDone();
   return true;
 }
@@ -100,9 +100,9 @@ bool InterpreterExecutor::StopTimer(Stream *stream, Timer *timer) {
   return true;
 }
 
-bool InterpreterExecutor::BlockHostUntilDone(Stream *stream) {
+port::Status InterpreterExecutor::BlockHostUntilDone(Stream *stream) {
   AsExecutorStream(stream)->BlockUntilDone();
-  return true;
+  return port::Status::OK();
 }
 
 DeviceDescription *InterpreterExecutor::PopulateDeviceDescription() const {
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h
index c59b2ccb1505b78be0c459ac9311428d65cc7e44..c5d07e906dafb033905c50c604069e80e1ce80cd 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.h
+++ b/tensorflow/compiler/xla/service/interpreter/executor.h
@@ -157,7 +157,7 @@ class InterpreterExecutor : public internal::StreamExecutorInterface {
   bool StartTimer(Stream *stream, Timer *timer) override;
   bool StopTimer(Stream *stream, Timer *timer) override;
 
-  bool BlockHostUntilDone(Stream *stream) override;
+  port::Status BlockHostUntilDone(Stream *stream) override;
 
   int PlatformDeviceCount() override { return 1; }
 
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index 7eda7c2284c2457703fcfcd4226172e41dd4ae01..fce135ef61a7868386b869def1a79167c428d928 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -61,8 +61,8 @@ std::ostream& operator<<(std::ostream& out,
 
 BufferLayoutConstraint::BufferLayoutConstraint(const Layout& layout,
                                                const LogicalBuffer& buffer,
-                                               bool mandatory)
-    : LayoutConstraint(mandatory), layout_(layout), buffer_(&buffer) {
+                                               bool mandatory, bool dfs)
+    : LayoutConstraint(mandatory, dfs), layout_(layout), buffer_(&buffer) {
   CHECK(LayoutUtil::ValidateLayoutForShape(layout, buffer.shape()).ok());
 }
 
@@ -74,14 +74,17 @@ string BufferLayoutConstraint::ToString() const {
 
 OperandLayoutConstraint::OperandLayoutConstraint(
     const ShapeLayout& shape_layout, const HloInstruction* instruction,
-    int64 operand_no, bool mandatory)
-    : LayoutConstraint(mandatory),
+    int64 operand_no, bool mandatory, bool dfs)
+    : LayoutConstraint(mandatory, dfs),
       shape_layout_(shape_layout),
       instruction_(instruction),
       operand_no_(operand_no) {
   CHECK(shape_layout_.LayoutIsSet());
   CHECK(ShapeUtil::Compatible(shape_layout.shape(),
-                              instruction->operand(operand_no)->shape()));
+                              instruction->operand(operand_no)->shape()))
+      << shape_layout.shape() << " is not compatible with "
+      << instruction->operand(operand_no)->shape() << " (for operand "
+      << operand_no << " of instruction " << instruction->ToString() << ")";
 }
 
 string OperandLayoutConstraint::ToString() const {
@@ -131,7 +134,7 @@ bool LayoutConstraints::OperandBufferForwarded(
 
 Status LayoutConstraints::SetBufferLayout(const Layout& layout,
                                           const LogicalBuffer& buffer,
-                                          bool mandatory) {
+                                          bool mandatory, bool dfs) {
   VLOG(3) << "SetBufferLayout : " << buffer << " : "
           << LayoutUtil::HumanString(layout);
 
@@ -168,10 +171,11 @@ Status LayoutConstraints::SetBufferLayout(const Layout& layout,
   if (!overwrite) {
     iter = buffer_constraints_
                .insert(std::make_pair(
-                   &buffer, BufferLayoutConstraint(layout, buffer, mandatory)))
+                   &buffer,
+                   BufferLayoutConstraint(layout, buffer, mandatory, dfs)))
                .first;
   } else {
-    iter->second = BufferLayoutConstraint(layout, buffer, /*mandatory=*/true);
+    iter->second = BufferLayoutConstraint(layout, buffer, mandatory, dfs);
   }
   added_constraints_.push_back(&iter->second);
 
@@ -185,7 +189,8 @@ Status LayoutConstraints::SetBufferLayout(const Layout& layout,
 
 Status LayoutConstraints::SetOperandLayout(const Shape& shape_with_layout,
                                            const HloInstruction* instruction,
-                                           int64 operand_no, bool mandatory) {
+                                           int64 operand_no, bool mandatory,
+                                           bool dfs) {
   VLOG(3) << "SetOperandLayout : " << instruction->name() << ", operand "
           << operand_no << " : "
           << ShapeUtil::HumanStringWithLayout(shape_with_layout);
@@ -223,12 +228,12 @@ Status LayoutConstraints::SetOperandLayout(const Shape& shape_with_layout,
   if (iter == operand_constraints_.end()) {
     auto pair = std::make_pair(
         key, OperandLayoutConstraint(ShapeLayout(shape_with_layout),
-                                     instruction, operand_no, mandatory));
+                                     instruction, operand_no, mandatory, dfs));
     iter = operand_constraints_.insert(pair).first;
   } else {
     iter->second =
         OperandLayoutConstraint(ShapeLayout(shape_with_layout), instruction,
-                                operand_no, /*mandatory=*/true);
+                                operand_no, mandatory, dfs);
   }
   added_constraints_.push_back(&iter->second);
 
@@ -237,16 +242,17 @@ Status LayoutConstraints::SetOperandLayout(const Shape& shape_with_layout,
 
 Status LayoutConstraints::SetArrayOperandLayout(
     const Layout& layout, const HloInstruction* instruction, int64 operand_no,
-    bool mandatory) {
+    bool mandatory, bool dfs) {
   const HloInstruction* operand = instruction->operand(operand_no);
   TF_RET_CHECK(ShapeUtil::IsArray(operand->shape()));
   Shape shape(operand->shape());
   *shape.mutable_layout() = layout;
   TF_RETURN_IF_ERROR(LayoutUtil::ValidateLayoutInShape(shape));
-  return SetOperandLayout(shape, instruction, operand_no, mandatory);
+  return SetOperandLayout(shape, instruction, operand_no, mandatory, dfs);
 }
 
-Status LayoutConstraints::SetResultLayout(const Shape& shape_with_layout) {
+Status LayoutConstraints::SetResultLayout(const Shape& shape_with_layout,
+                                          bool dfs) {
   VLOG(3) << "SetResultLayout : "
           << ShapeUtil::HumanStringWithLayout(shape_with_layout);
 
@@ -264,14 +270,15 @@ Status LayoutConstraints::SetResultLayout(const Shape& shape_with_layout) {
   }
 
   result_constraint_.reset(
-      new ResultLayoutConstraint(ShapeLayout(shape_with_layout)));
+      new ResultLayoutConstraint(ShapeLayout(shape_with_layout), dfs));
   added_constraints_.push_back(result_constraint_.get());
 
   return Status::OK();
 }
 
 Status LayoutConstraints::SetInstructionLayout(
-    const Shape& shape_with_layout, const HloInstruction* instruction) {
+    const Shape& shape_with_layout, const HloInstruction* instruction,
+    bool mandatory, bool dfs) {
   VLOG(3) << "SetInstructionLayout : " << instruction->name() << ", "
           << ShapeUtil::HumanStringWithLayout(shape_with_layout);
 
@@ -287,8 +294,8 @@ Status LayoutConstraints::SetInstructionLayout(
   // instruction.
   return ShapeUtil::ForEachSubshapeWithStatus(
       shape_with_layout,
-      [this, instruction](const Shape& subshape,
-                          const ShapeIndex& index) -> Status {
+      [this, instruction, mandatory](const Shape& subshape,
+                                     const ShapeIndex& index) -> Status {
         // The precondition for this method is that the instruction defines all
         // buffers in its output.
         auto buffers =
@@ -297,7 +304,7 @@ Status LayoutConstraints::SetInstructionLayout(
         CHECK_EQ(buffers[0]->instruction(), instruction);
 
         if (ShapeUtil::IsArray(subshape)) {
-          return SetBufferLayout(subshape.layout(), *buffers[0]);
+          return SetBufferLayout(subshape.layout(), *buffers[0], mandatory);
         } else {
           return Status::OK();
         }
@@ -369,8 +376,9 @@ string LayoutConstraints::ToString() const {
 }
 
 Status LayoutAssignment::AddMandatoryConstraints(
-    const ComputationLayout& computation_layout, HloComputation* computation,
-    LayoutConstraints* constraints) {
+    const ComputationLayout& computation_layout,
+    const ChannelLayoutConstraints* channel_constraints,
+    HloComputation* computation, LayoutConstraints* constraints) {
   VLOG(3) << "Adding mandatory layout constraints to computation "
           << computation->name();
 
@@ -390,8 +398,7 @@ Status LayoutAssignment::AddMandatoryConstraints(
       // Constrain the input to the Outfeed instruction to be the expected
       // layout of the Outfeed.
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
-          instruction->outfeed_shape(), instruction, 0,
-          /*mandatory=*/true));
+          instruction->outfeed_shape(), instruction, 0));
     } else if (instruction->opcode() == HloOpcode::kParameter) {
       // Parameter layouts must match the respective layout in
       // ComputationLayout.
@@ -403,6 +410,37 @@ Status LayoutAssignment::AddMandatoryConstraints(
       TF_RETURN_IF_ERROR(
           constraints->SetInstructionLayout(*shape_with_layout, instruction));
     }
+
+    if (instruction->opcode() == HloOpcode::kSend ||
+        instruction->opcode() == HloOpcode::kRecv) {
+      CHECK(channel_constraints)
+          << "Multi-module layout assignment requires ChannelLayoutConstraints";
+      int64 channel_id = instruction->channel_id();
+      if (!channel_constraints->IsChannelConstrained(channel_id)) {
+        continue;
+      }
+      if (instruction->opcode() == HloOpcode::kSend) {
+        // TODO(b/68493863): Change to use SetOperandLayout().
+        const Shape send_buffer_shape = instruction->operand(0)->shape();
+        TF_RET_CHECK(ShapeUtil::IsArray(send_buffer_shape));
+        Shape new_buffer_shape = channel_constraints->LayoutShapeForChannel(
+            send_buffer_shape, instruction->channel_id());
+        TF_RETURN_IF_ERROR(constraints->SetInstructionLayout(
+            new_buffer_shape, instruction->operand(0)));
+      } else {
+        const Shape recv_buffer_shape =
+            ShapeUtil::GetTupleElementShape(instruction->shape(), 0);
+        TF_RET_CHECK(ShapeUtil::IsArray(recv_buffer_shape));
+        TF_ASSIGN_OR_RETURN(
+            const LogicalBuffer* buffer,
+            constraints->points_to_analysis().GetBufferDefinedAt(instruction,
+                                                                 {0}));
+        Shape new_shape = channel_constraints->LayoutShapeForChannel(
+            recv_buffer_shape, instruction->channel_id());
+        TF_RETURN_IF_ERROR(
+            constraints->SetBufferLayout(new_shape.layout(), *buffer));
+      }
+    }
   }
 
   // Constrain layouts of instructions which call computations which have
@@ -422,7 +460,7 @@ Status LayoutAssignment::AddMandatoryConstraints(
       for (int64 i = 0; i < instruction->operand_count(); ++i) {
         TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
             called_computation_layout.parameter_layout(i).shape(), instruction,
-            i, /*mandatory=*/true));
+            i));
       }
     } else if (instruction->opcode() == HloOpcode::kWhile) {
       // Layout of input and output of kWhile instruction must be equal and must
@@ -473,20 +511,16 @@ Status LayoutAssignment::AddMandatoryConstraints(
       TF_RETURN_IF_ERROR(constraints->SetInstructionLayout(
           body_layout.result_shape(), instruction));
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
-          body_layout.result_shape(), instruction, 0,
-          /*mandatory=*/true));
+          body_layout.result_shape(), instruction, 0));
     } else if (instruction->opcode() == HloOpcode::kCustomCall) {
+      if (!CustomCallRequiresMajorFirstLayout(instruction)) {
+        continue;
+      }
       // Add constraints for kCustomCall instruction operands and instructions.
-      // For now we only support row major layouts for all inputs and outputs.
-      auto row_major_shape = [](const Shape& old_shape) {
-        Shape new_shape(old_shape);
-        std::vector<int64> dimension_order(new_shape.dimensions_size());
-        std::iota(dimension_order.rbegin(), dimension_order.rend(), 0);
-        *new_shape.mutable_layout() = LayoutUtil::MakeLayout(dimension_order);
-        return new_shape;
-      };
-
-      Shape result_shape(row_major_shape(instruction->shape()));
+      // For now we only support major-first layouts for all inputs and outputs.
+      Shape result_shape = ShapeUtil::MakeShapeWithDescendingLayout(
+          instruction->shape().element_type(),
+          AsInt64Slice(instruction->shape().dimensions()));
       TF_RETURN_IF_ERROR(
           constraints->SetInstructionLayout(result_shape, instruction));
       for (int64 i = 0; i < instruction->operand_count(); ++i) {
@@ -496,9 +530,12 @@ Status LayoutAssignment::AddMandatoryConstraints(
           continue;
         }
 
-        Shape row_major_operand_shape(row_major_shape(operand_shape));
+        Shape row_major_operand_shape =
+            ShapeUtil::MakeShapeWithDescendingLayout(
+                operand_shape.element_type(),
+                AsInt64Slice(operand_shape.dimensions()));
         TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
-            row_major_operand_shape, instruction, i, /*mandatory=*/true));
+            row_major_operand_shape, instruction, i));
       }
     }
   }
@@ -530,9 +567,11 @@ Status CheckCallLayout(HloInstruction* call,
 Status CheckCustomCallLayout(HloInstruction* custom_call) {
   for (const HloInstruction* operand : custom_call->operands()) {
     TF_RET_CHECK(
+        ShapeUtil::IsOpaque(operand->shape()) ||
         LayoutUtil::IsMonotonicWithDim0Major(operand->shape().layout()));
   }
   TF_RET_CHECK(
+      ShapeUtil::IsOpaque(custom_call->shape()) ||
       LayoutUtil::IsMonotonicWithDim0Major(custom_call->shape().layout()));
   return Status::OK();
 }
@@ -601,11 +640,9 @@ Status CheckConstantLayout(HloInstruction* constant) {
   return Status::OK();
 }
 
-// Check that all layouts in the module have been set and satisfy all necessary
-// conditions.
-Status CheckLayouts(
-    HloModule* module,
-    const std::map<HloComputation*, ComputationLayout>& computation_layouts) {
+}  // namespace
+
+Status LayoutAssignment::CheckLayouts(HloModule* module) {
   TF_ASSIGN_OR_RETURN(auto points_to_analysis,
                       TuplePointsToAnalysis::Run(module));
   for (auto* computation : module->MakeNonfusionComputations()) {
@@ -649,10 +686,12 @@ Status CheckLayouts(
         case HloOpcode::kCall:
           TF_RETURN_IF_ERROR(CheckCallLayout(
               instruction,
-              FindOrDie(computation_layouts, instruction->to_apply())));
+              FindOrDie(computation_layouts_, instruction->to_apply())));
           break;
         case HloOpcode::kCustomCall:
-          TF_RETURN_IF_ERROR(CheckCustomCallLayout(instruction));
+          if (CustomCallRequiresMajorFirstLayout(instruction)) {
+            TF_RETURN_IF_ERROR(CheckCustomCallLayout(instruction));
+          }
           break;
         case HloOpcode::kFusion:
           TF_RETURN_IF_ERROR(CheckFusionLayout(instruction));
@@ -660,7 +699,7 @@ Status CheckLayouts(
         case HloOpcode::kParameter:
           TF_RETURN_IF_ERROR(CheckParameterLayout(
               instruction,
-              FindOrDie(computation_layouts, instruction->parent())));
+              FindOrDie(computation_layouts_, instruction->parent())));
           break;
         case HloOpcode::kConstant:
           TF_RETURN_IF_ERROR(CheckConstantLayout(instruction));
@@ -668,8 +707,8 @@ Status CheckLayouts(
         case HloOpcode::kWhile:
           TF_RETURN_IF_ERROR(CheckWhileLayout(
               instruction,
-              FindOrDie(computation_layouts, instruction->while_condition()),
-              FindOrDie(computation_layouts, instruction->while_body())));
+              FindOrDie(computation_layouts_, instruction->while_condition()),
+              FindOrDie(computation_layouts_, instruction->while_body())));
           break;
         default:
           break;
@@ -681,17 +720,18 @@ Status CheckLayouts(
   // computation root.
   TF_RET_CHECK(ShapeUtil::Equal(
       module->entry_computation()->root_instruction()->shape(),
-      FindOrDie(computation_layouts, module->entry_computation())
+      FindOrDie(computation_layouts_, module->entry_computation())
           .result_layout()
           .shape()));
 
   return Status::OK();
 }
 
-}  // namespace
-
-LayoutAssignment::LayoutAssignment(ComputationLayout* entry_computation_layout)
-    : entry_computation_layout_(entry_computation_layout) {
+LayoutAssignment::LayoutAssignment(
+    ComputationLayout* entry_computation_layout,
+    ChannelLayoutConstraints* channel_constraints)
+    : entry_computation_layout_(entry_computation_layout),
+      channel_layout_constraints_(channel_constraints) {
   VLOG(1) << "entry computation layout given to layout assignment: "
           << entry_computation_layout_->ToString();
   // Layouts of all parameter instructions must be set.
@@ -711,8 +751,8 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
     int64 operand_no) {
   const HloInstruction* operand = instruction->operand(operand_no);
 
-  CHECK(ShapeUtil::IsArray(instruction->shape()) &&
-        ShapeUtil::IsArray(operand->shape()));
+  CHECK(ShapeUtil::IsArray(instruction->shape()));
+  CHECK(ShapeUtil::IsArray(operand->shape()));
 
   if (instruction->IsElementwiseOnOperand(operand_no) &&
       !ShapeUtil::IsScalar(operand->shape()) &&
@@ -742,7 +782,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
     const Shape& output_shape = instruction->shape();
     Shape output_shape_with_layout = ShapeUtil::MakeShapeWithLayout(
         output_shape.element_type(), AsInt64Slice(output_shape.dimensions()),
-        AsInt64Slice(output_layout.minor_to_major()));
+        LayoutUtil::MinorToMajor(output_layout));
     Shape operand_shape = operand->shape();
     *operand_shape.mutable_layout() =
         LayoutUtil::GetDefaultLayoutForShape(operand_shape);
@@ -771,7 +811,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
     int64 rank = ShapeUtil::Rank(instruction->shape());
     std::vector<int64> new_minor_to_major(rank);
     for (int64 i = 0; i < rank; ++i) {
-      int64 output_dim = output_layout.minor_to_major(i);
+      int64 output_dim = LayoutUtil::Minor(output_layout, i);
       int64 operand_dim = instruction->dimensions(output_dim);
       new_minor_to_major[i] = operand_dim;
     }
@@ -814,7 +854,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
     Shape operand_shape_with_layout = ShapeUtil::MakeShapeWithLayout(
         operand->shape().element_type(),
         AsInt64Slice(operand->shape().dimensions()),
-        AsInt64Slice(operand_layout.minor_to_major()));
+        LayoutUtil::MinorToMajor(operand_layout));
     Shape output_shape = user->shape();
     *output_shape.mutable_layout() =
         LayoutUtil::GetDefaultLayoutForShape(output_shape);
@@ -844,7 +884,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
     std::vector<int64> new_minor_to_major(rank);
     auto inverse_dimensions = InversePermutation(user->dimensions());
     for (int64 i = 0; i < rank; ++i) {
-      int64 operand_dim = operand_layout.minor_to_major(i);
+      int64 operand_dim = LayoutUtil::Minor(operand_layout, i);
       int64 user_dim = inverse_dimensions[operand_dim];
       new_minor_to_major[i] = user_dim;
     }
@@ -869,7 +909,11 @@ Status LayoutAssignment::PropagateConstraints(LayoutConstraints* constraints) {
   auto add_new_constraints_to_worklist = [constraints, &worklist]() {
     // Add constraints to the front of the deque for DFS ordering.
     for (auto* constraint : constraints->ConsumeAddedConstraints()) {
-      worklist.push_front(constraint);
+      if (constraint->dfs()) {
+        worklist.push_front(constraint);
+      } else {
+        worklist.push_back(constraint);
+      }
     }
   };
   add_new_constraints_to_worklist();
@@ -1198,7 +1242,8 @@ Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout,
 // instruction itself.
 Status SetFusionLayouts(HloInstruction* fusion) {
   TF_RET_CHECK(fusion->opcode() == HloOpcode::kFusion);
-  for (auto* fused_instruction : fusion->fused_instructions()) {
+  for (auto* fused_instruction :
+       fusion->fused_instructions_computation()->MakeInstructionPostOrder()) {
     if (fused_instruction->opcode() == HloOpcode::kParameter) {
       const HloInstruction* fusion_operand =
           fusion->operand(fused_instruction->parameter_number());
@@ -1213,11 +1258,22 @@ Status SetFusionLayouts(HloInstruction* fusion) {
           ShapeUtil::Compatible(fusion->shape(), fused_instruction->shape()));
       TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
           fusion->shape(), fused_instruction->mutable_shape()));
-    } else if (fused_instruction->opcode() != HloOpcode::kConstant &&
-               fused_instruction->opcode() != HloOpcode::kGetTupleElement &&
-               fused_instruction->opcode() != HloOpcode::kInfeed) {
-      // Internal fused instructions with the exception of constants
-      // and infeed need no layout.
+    } else if (fused_instruction->opcode() == HloOpcode::kGetTupleElement) {
+      // A GTE inherits its layout from its operand (which should ultimately be
+      // a parameter).
+      TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
+          fused_instruction->operand(0)->shape().tuple_shapes(
+              fused_instruction->tuple_index()),
+          fused_instruction->mutable_shape()));
+    } else if (fused_instruction->opcode() == HloOpcode::kConstant) {
+      // Give constants the layout of their literal.
+      TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
+          fused_instruction->literal().shape(),
+          fused_instruction->mutable_shape()));
+    } else if (fused_instruction->opcode() == HloOpcode::kInfeed) {
+      // Nop; leave the infeed layout alone.
+    } else {
+      // Other instructions don't have layouts inside of fusion nodes.
       LayoutUtil::ClearLayout(fused_instruction->mutable_shape());
     }
   }
@@ -1303,8 +1359,8 @@ Status LayoutAssignment::AssignLayouts(const LayoutConstraints& constraints,
     TF_RET_CHECK(LayoutUtil::HasLayout(instruction->shape()));
   }
 
-  // Copy the root instrucion's result if the it does not match the result
-  // layout constraint
+  // Copy the root instruction's result if its layout does not match the result
+  // layout constraint.
   if (constraints.ResultLayout() != nullptr &&
       !constraints.ResultLayout()->MatchesLayoutInShape(
           computation->root_instruction()->shape())) {
@@ -1321,7 +1377,8 @@ Status LayoutAssignment::AssignLayouts(const LayoutConstraints& constraints,
 Status LayoutAssignment::RunOnComputation(
     const ComputationLayout& computation_layout,
     const TuplePointsToAnalysis& points_to_analysis,
-    HloComputation* computation) {
+    HloComputation* computation,
+    ChannelLayoutConstraints* channel_constraints) {
   DCHECK(computation_layout.LayoutIsSet());
   InsertOrDie(&computation_layouts_, computation, computation_layout);
   VLOG(2) << "LayoutAssignment::RunOnComputation(" << computation->name()
@@ -1333,13 +1390,13 @@ Status LayoutAssignment::RunOnComputation(
 
   // Add constraints required for correctness on all backends (eg, entry
   // parameter layout constraints).
-  TF_RETURN_IF_ERROR(
-      AddMandatoryConstraints(computation_layout, computation, &constraints));
+  TF_RETURN_IF_ERROR(AddMandatoryConstraints(
+      computation_layout, channel_constraints, computation, &constraints));
 
   // Add any backend-specific constraints.
   TF_RETURN_IF_ERROR(AddBackendConstraints(&constraints));
 
-  // Propagates layouts from an HLO to its neighbors.
+  // Propagates layouts from mandatory and backend constraints.
   TF_RETURN_IF_ERROR(PropagateConstraints(&constraints));
 
   // While any unconstrained buffers remain, pick an arbitrary buffer, give it a
@@ -1373,7 +1430,20 @@ Status LayoutAssignment::RunOnComputation(
   // All logical buffers should have constraints at this point. All that
   // remains is assign the constraints to the buffers and infer layouts for
   // aliased buffers.
-  return AssignLayouts(constraints, computation);
+  TF_RETURN_IF_ERROR(AssignLayouts(constraints, computation));
+
+  // Record the layouts assigned for any communication ops in
+  // channel_constraints so that they are constrained for future modules.
+  for (HloInstruction* instruction : computation->instructions()) {
+    if (instruction->opcode() == HloOpcode::kSend) {
+      channel_constraints->ConstrainChannel(
+          instruction->channel_id(), instruction->operand(0)->shape().layout());
+    } else if (instruction->opcode() == HloOpcode::kRecvDone) {
+      channel_constraints->ConstrainChannel(instruction->channel_id(),
+                                            instruction->shape().layout());
+    }
+  }
+  return Status::OK();
 }
 
 StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
@@ -1391,24 +1461,39 @@ StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
   // Assign layouts to computations in an order such that a callee computation
   // is handled before its caller computation. This ensures that the layout of
   // all callers of a computation will agree.
+  std::list<HloComputation*> computation_post_order =
+      module->MakeComputationPostOrder();
   for (auto* computation : module->MakeComputationPostOrder()) {
-    if (computation == module->entry_computation()) {
-      TF_RETURN_IF_ERROR(RunOnComputation(*entry_computation_layout_,
-                                          *points_to_analysis,
-                                          module->entry_computation()));
-    } else if (computation->IsFusionComputation()) {
+    if (computation->IsFusionComputation()) {
       continue;
+    }
+    // Clear existing layouts of the instructions.  All layouts must be assigned
+    // by the LayoutAssignment pass, except for those on infeeds, parameters,
+    // and the computation result. The latter two are specified in
+    // computation_layout, so we only need to keep the existing layouts for
+    // infeeds.  Clearing the layouts here avoids hiding potential bugs in the
+    // layout assignment pass that may accidently use the existing layout.
+    for (HloInstruction* instruction : computation->instructions()) {
+      if (instruction->opcode() != HloOpcode::kInfeed) {
+        LayoutUtil::ClearLayout(instruction->mutable_shape());
+      }
+    }
+    if (computation == module->entry_computation()) {
+      TF_RETURN_IF_ERROR(RunOnComputation(
+          *entry_computation_layout_, *points_to_analysis,
+          module->entry_computation(), channel_layout_constraints_));
     } else {
       ComputationLayout computation_layout(computation->ComputeProgramShape());
       // Setting all embedded computations to the default layout is potentially
       // suboptimal.
       computation_layout.SetToDefaultLayout();
       TF_RETURN_IF_ERROR(RunOnComputation(computation_layout,
-                                          *points_to_analysis, computation));
+                                          *points_to_analysis, computation,
+                                          channel_layout_constraints_));
     }
   }
 
-  TF_RETURN_IF_ERROR(CheckLayouts(module, computation_layouts_));
+  TF_RETURN_IF_ERROR(CheckLayouts(module));
 
   VLOG(3) << "After layout assignment:";
   XLA_VLOG_LINES(3, module->ToString());
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index 0b97fba744923b8afc3fb539566b68f1bca47d38..29018584487cabfd740d7914625c2a50f552d6ff 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -46,7 +46,8 @@ namespace xla {
 // gathered together in LayoutConstraints object.
 class LayoutConstraint {
  public:
-  LayoutConstraint(bool mandatory) : mandatory_(mandatory) {}
+  LayoutConstraint(bool mandatory, bool dfs)
+      : mandatory_(mandatory), dfs_(dfs) {}
   virtual ~LayoutConstraint() = default;
 
   virtual string ToString() const = 0;
@@ -54,8 +55,12 @@ class LayoutConstraint {
   // True if this constraint cannot be overwritten by a different constraint.
   bool mandatory() const { return mandatory_; }
 
+  // When true, propagate in DFS. When false, constraint will propagate in BFS.
+  bool dfs() const { return dfs_; }
+
  private:
   bool mandatory_;
+  bool dfs_;
 };
 
 std::ostream& operator<<(std::ostream& out, const LayoutConstraint& constraint);
@@ -65,7 +70,7 @@ std::ostream& operator<<(std::ostream& out, const LayoutConstraint& constraint);
 class BufferLayoutConstraint : public LayoutConstraint {
  public:
   BufferLayoutConstraint(const Layout& layout, const LogicalBuffer& buffer,
-                         bool mandatory);
+                         bool mandatory, bool dfs);
 
   const LogicalBuffer& buffer() const { return *buffer_; }
   const Layout& layout() const { return layout_; }
@@ -86,7 +91,7 @@ class OperandLayoutConstraint : public LayoutConstraint {
  public:
   OperandLayoutConstraint(const ShapeLayout& shape_layout,
                           const HloInstruction* instruction, int64 operand_no,
-                          bool mandatory);
+                          bool mandatory, bool dfs);
 
   const ShapeLayout& shape_layout() const { return shape_layout_; }
   const HloInstruction* instruction() const { return instruction_; }
@@ -106,8 +111,10 @@ class OperandLayoutConstraint : public LayoutConstraint {
 // Constraint on the layout of the result of the entry computation.
 class ResultLayoutConstraint : public LayoutConstraint {
  public:
-  explicit ResultLayoutConstraint(const ShapeLayout& shape_layout)
-      : LayoutConstraint(/*mandatory=*/true), shape_layout_(shape_layout) {}
+  explicit ResultLayoutConstraint(const ShapeLayout& shape_layout,
+                                  bool dfs = false)
+      : LayoutConstraint(/*mandatory=*/true, dfs),
+        shape_layout_(shape_layout) {}
 
   const ShapeLayout& shape_layout() const { return shape_layout_; }
   string ToString() const override;
@@ -157,23 +164,25 @@ class LayoutConstraints {
   // operand of the instruction, or the layout of the result of the computation,
   // respectively.
   Status SetBufferLayout(const Layout& layout, const LogicalBuffer& buffer,
-                         bool mandatory = true);
+                         bool mandatory = true, bool dfs = true);
   Status SetOperandLayout(const Shape& shape_with_layout,
                           const HloInstruction* instruction, int64 operand_no,
-                          bool mandatory = true);
-  Status SetResultLayout(const Shape& shape_with_layout);
+                          bool mandatory = true, bool dfs = true);
+  Status SetResultLayout(const Shape& shape_with_layout, bool dfs = true);
 
   // Convenience wrapper around SetOperandLayout for setting the layout of a
   // operand using a Layout object. The operand must be array-shaped.
   Status SetArrayOperandLayout(const Layout& layout,
                                const HloInstruction* instruction,
-                               int64 operand_no, bool mandatory = true);
+                               int64 operand_no, bool mandatory = true,
+                               bool dfs = true);
 
   // Convenience wrapper around SetBufferLayout. Sets the layouts of all buffers
   // created by the instruction to the layouts in the given shape. The
   // instruction must define every logical buffer in its output.
   Status SetInstructionLayout(const Shape& shape_with_layout,
-                              const HloInstruction* instruction);
+                              const HloInstruction* instruction,
+                              bool mandatory = true, bool dfs = true);
 
   // Returns true if any buffer in the given operand is forwarded to the output
   // of the given instruction. For example, the Tuple instruction forwards the
@@ -215,13 +224,62 @@ class LayoutConstraints {
   HloComputation* computation_;
 };
 
+// Contains constraints on the layout of channels; sends and recvs.
+class ChannelLayoutConstraints {
+ public:
+  // Construct an empty constraint set.
+  ChannelLayoutConstraints() {}
+
+  // Returns true if channel_id has a layout constraint.
+  bool IsChannelConstrained(int64 channel_id) const {
+    return constraints_.count(channel_id) > 0;
+  }
+
+  // Given `shape`, apply the layout for `channel_id`. `channel_id` must already
+  // be constrained.
+  Shape LayoutShapeForChannel(Shape shape, int64 channel_id) const {
+    CHECK(IsChannelConstrained(channel_id));
+    *shape.mutable_layout() = constraints_.at(channel_id);
+    return shape;
+  }
+
+  // Returns the layout constraint for `channel_id`, which must already be
+  // constrained.
+  Layout LayoutForChannel(int64 channel_id) const {
+    CHECK(IsChannelConstrained(channel_id));
+    return constraints_.at(channel_id);
+  }
+
+  // Adds a new layout constraint for `channel_id`. If a constraint for
+  // `channel_id` already exists, this operation requires that the new layout is
+  // the same as the previously constrained layout.
+  void ConstrainChannel(int64 channel_id, const Layout& layout) {
+    CHECK(!IsChannelConstrained(channel_id) ||
+          LayoutUtil::Equal(layout, constraints_[channel_id]));
+    constraints_[channel_id] = layout;
+  }
+
+ private:
+  std::unordered_map<int64, Layout> constraints_;
+};
+
 // HLO pass which assigns layouts to all instructions in the HLO module while
 // satisfying all necessary invariants and minimizing cost.
 class LayoutAssignment : public HloPassInterface {
  public:
   // entry_computation_layout is modified to populate a layout for the result in
   // the case that no particular layout is requested.
-  explicit LayoutAssignment(ComputationLayout* entry_computation_layout);
+  //
+  // channel_constraints is both an input and output. Any sends or recvs that
+  // are present in channel_constraints will be layed out as constrained. Any
+  // unconstrained sends or recvs will be layed out as locally optimal and their
+  // layout will be added as a constraint to channel_constraints.
+  //
+  // If channel_constraints is nullptr, no kSend or kRecvs must be contained
+  // within any module passed to `Run`.
+  explicit LayoutAssignment(
+      ComputationLayout* entry_computation_layout,
+      ChannelLayoutConstraints* channel_constraints = nullptr);
   ~LayoutAssignment() override {}
   tensorflow::StringPiece name() const override { return "layout-assignment"; }
 
@@ -247,6 +305,19 @@ class LayoutAssignment : public HloPassInterface {
       const ResultLayoutConstraint& layout_constraint,
       LayoutConstraints* constraints);
 
+  // By default LayoutAssignment ensures that inputs and outputs of CustomCalls
+  // have the "major-first" layout (i.e.  {n, n-1, ..., 0}).
+  //
+  // If this function returns true, LayoutAssignment does not set a layout for
+  // the given CustomCall.  It's up to the backend to set one in
+  // AddBackendConstraints, if necessary.
+  //
+  // Precondition: instruction->opcode() == HloOpcode::kCustomCall.
+  virtual bool CustomCallRequiresMajorFirstLayout(
+      const HloInstruction* /*instruction*/) {
+    return true;
+  }
+
   // Called after layouts of an instruction have been finalized to allow
   // subclasses to check for platform specific assumptions.
   virtual Status Verify(const HloInstruction* instruction) {
@@ -283,9 +354,10 @@ class LayoutAssignment : public HloPassInterface {
  private:
   // Adds constraints which must be satisfied for correctness on all
   // backends. Called once prior to propagating constraints.
-  Status AddMandatoryConstraints(const ComputationLayout& computation_layout,
-                                 HloComputation* computation,
-                                 LayoutConstraints* constraints);
+  Status AddMandatoryConstraints(
+      const ComputationLayout& computation_layout,
+      const ChannelLayoutConstraints* channel_constraints,
+      HloComputation* computation, LayoutConstraints* constraints);
 
   // This method can be overridden to add backend-specific constraints to the
   // layout of the instructions of a computation. This method is called after
@@ -301,7 +373,8 @@ class LayoutAssignment : public HloPassInterface {
   // constrained.
   Status RunOnComputation(const ComputationLayout& computation_layout,
                           const TuplePointsToAnalysis& points_to_analysis,
-                          HloComputation* computation);
+                          HloComputation* computation,
+                          ChannelLayoutConstraints* channel_constraints);
 
   // Assign layouts to the instructions of a computation which satisfy the given
   // layout constraints. Copies may be added to satisfy the constraints. The
@@ -315,7 +388,12 @@ class LayoutAssignment : public HloPassInterface {
   // required for correctness.
   Status PropagateConstraints(LayoutConstraints* constraints);
 
+  // Check that all layouts in the module have been set and satisfy all
+  // necessary conditions.
+  Status CheckLayouts(HloModule* module);
+
   ComputationLayout* entry_computation_layout_;
+  ChannelLayoutConstraints* channel_layout_constraints_;
 
  protected:
   // Map containing the layouts of all computations assigned so
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index d51c0d1dfb727801d6d2a8328eba60838373479f..e269a13459f1146f1d2952870399827d9e705e38 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -35,9 +35,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 
 namespace op = xla::testing::opcode_matchers;
@@ -587,5 +589,74 @@ TEST_F(LayoutAssignmentTest, TransposeToBitcastToUser) {
   EXPECT_TRUE(ShapeUtil::TransposeIsBitcast(transpose->operand(0)->shape(),
                                             transpose->shape(), {2, 3, 0, 1}));
 }
+
+// A GTE inside of a fusion node inherits the layout of its operand (which
+// should, if we keep following operands, eventually be a parameter).
+TEST_F(LayoutAssignmentTest, GTEInheritsLayoutFromOperand) {
+  const char* module_str = R"(
+    HloModule test_module
+
+    fused_computation {
+      fparam = (f32[2,2,2], (f32[2,2,2], f32[2,2,2])) parameter(0)
+      gte0 = f32[2,2,2] get-tuple-element(fparam), index=0
+      gte1 = (f32[2,2,2], f32[2,2,2]) get-tuple-element(fparam), index=1
+      gte1a = f32[2,2,2] get-tuple-element(gte1), index=0
+      gte1b = f32[2,2,2] get-tuple-element(gte1), index=1
+      add = f32[2,2,2] add(gte1a, gte1b)
+      ROOT fresult = f32[2,2,2] add(gte0, add)
+    }
+
+    ENTRY entry_computation {
+      param = (f32[2,2,2], (f32[2,2,2], f32[2,2,2])) parameter(0)
+      ROOT fusion =
+        f32[2,2,2] fusion(param), kind=kLoop, calls=fused_computation
+    }
+  )";
+
+  auto module = tools::Parse(module_str).ValueOrDie();
+  ComputationLayout computation_layout(
+      module->entry_computation()->ComputeProgramShape());
+  Shape param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {0, 1, 2}),
+       ShapeUtil::MakeTupleShape({
+           ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {1, 2, 0}),
+           ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {2, 0, 1}),
+       })});
+  TF_ASSERT_OK(
+      computation_layout.mutable_parameter_layout(0)->CopyLayoutFromShape(
+          param_shape));
+  computation_layout.mutable_result_layout()->ResetLayout(
+      LayoutUtil::MakeLayout({2, 1, 0}));
+  AssignLayouts(module.get(), &computation_layout);
+
+  HloComputation* fused_computation = *std::find_if(
+      module->computations().begin(), module->computations().end(),
+      [](const HloComputation* c) { return c->name() == "fused_computation"; });
+
+  auto fused_instr = [&](const string& name) {
+    auto it = std::find_if(
+        fused_computation->instructions().begin(),
+        fused_computation->instructions().end(),
+        [&](const HloInstruction* i) { return i->name() == name; });
+    CHECK(it != fused_computation->instructions().end());
+    return *it;
+  };
+
+  EXPECT_THAT(fused_instr("gte0")->shape().layout().minor_to_major(),
+              ElementsAre(0, 1, 2));
+  EXPECT_THAT(
+      fused_instr("gte1")->shape().tuple_shapes(0).layout().minor_to_major(),
+      ElementsAre(1, 2, 0));
+  EXPECT_THAT(
+      fused_instr("gte1")->shape().tuple_shapes(1).layout().minor_to_major(),
+      ElementsAre(2, 0, 1));
+  EXPECT_THAT(fused_instr("gte1a")->shape().layout().minor_to_major(),
+              ElementsAre(1, 2, 0));
+  EXPECT_THAT(fused_instr("gte1b")->shape().layout().minor_to_major(),
+              ElementsAre(2, 0, 1));
+  EXPECT_THAT(fused_instr("fresult")->shape().layout().minor_to_major(),
+              ElementsAre(2, 1, 0));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/liveness_util_test.cc b/tensorflow/compiler/xla/service/liveness_util_test.cc
index 476e86fa72ad691cda52097c953ba15132f206a7..2c2a02f6375343d67dfb155bbb03729ff6e490d2 100644
--- a/tensorflow/compiler/xla/service/liveness_util_test.cc
+++ b/tensorflow/compiler/xla/service/liveness_util_test.cc
@@ -277,8 +277,11 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedDotAdd) {
   auto b = builder.AddInstruction(HloInstruction::CreateConstant(
       Literal::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
 
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto dot = builder.AddInstruction(
-      HloInstruction::CreateBinary(data_shape, HloOpcode::kDot, a, b));
+      HloInstruction::CreateDot(data_shape, a, b, dot_dnums));
 
   auto one = builder.AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
@@ -312,8 +315,11 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedTransposeDotAdd) {
   auto b_t = builder.AddInstruction(
       HloInstruction::CreateTranspose(data_shape, b, {1, 0}));
 
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto dot = builder.AddInstruction(
-      HloInstruction::CreateBinary(data_shape, HloOpcode::kDot, a, b_t));
+      HloInstruction::CreateDot(data_shape, a, b_t, dot_dnums));
 
   auto one = builder.AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
diff --git a/tensorflow/compiler/xla/service/llvm_compiler.cc b/tensorflow/compiler/xla/service/llvm_compiler.cc
index 34f3419269abbc73cd0ddb13c723a8da38ab19ff..f98fc0400a7d827a29dcddc5eecf9a4a01e76590 100644
--- a/tensorflow/compiler/xla/service/llvm_compiler.cc
+++ b/tensorflow/compiler/xla/service/llvm_compiler.cc
@@ -18,8 +18,8 @@ limitations under the License.
 namespace xla {
 StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile(
     std::vector<std::unique_ptr<HloModule>> modules,
-    std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-        stream_execs) {
+    std::vector<std::vector<perftools::gputools::StreamExecutor*>> stream_execs,
+    DeviceMemoryAllocator* device_allocator) {
   std::vector<std::unique_ptr<Executable>> result;
   for (size_t i = 0; i < modules.size(); i++) {
     if (stream_execs[i].size() != 1) {
@@ -27,10 +27,12 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile(
           "Model partitioning not implemented for the CPU/GPU compilers!");
     }
 
-    TF_ASSIGN_OR_RETURN(
-        modules[i], RunHloPasses(std::move(modules[i]), stream_execs[i][0]));
+    TF_ASSIGN_OR_RETURN(modules[i],
+                        RunHloPasses(std::move(modules[i]), stream_execs[i][0],
+                                     device_allocator));
     TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
-                        RunBackend(std::move(modules[i]), stream_execs[i][0]));
+                        RunBackend(std::move(modules[i]), stream_execs[i][0],
+                                   device_allocator));
     result.push_back(std::move(executable));
   }
 
diff --git a/tensorflow/compiler/xla/service/llvm_compiler.h b/tensorflow/compiler/xla/service/llvm_compiler.h
index c5393cef4f961c5d04c32d0d4291732b8ec702f1..d74e81bb7f622ac5e89203a3d02ca5ad839da07e 100644
--- a/tensorflow/compiler/xla/service/llvm_compiler.h
+++ b/tensorflow/compiler/xla/service/llvm_compiler.h
@@ -60,17 +60,20 @@ class LLVMCompiler : public Compiler {
   // Bring in
   //   StatusOr<std::unique_ptr<Executable>> RunBackend(
   //       std::unique_ptr<HloModule> module,
-  //       perftools::gputools::StreamExecutor* stream_exec)
+  //       perftools::gputools::StreamExecutor* stream_exec,
+  //       DeviceMemoryAllocator* device_allocator)
   //   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
   //       std::unique_ptr<HloModule> module,
-  //       perftools::gputools::StreamExecutor* stream_exec)
+  //       perftools::gputools::StreamExecutor* stream_exec,
+  //       DeviceMemoryAllocator* device_allocator)
   using Compiler::RunBackend;
   using Compiler::RunHloPasses;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::vector<std::unique_ptr<HloModule>> modules,
       std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-          stream_execs) override;
+          stream_execs,
+      DeviceMemoryAllocator* device_allocator) override;
 
  protected:
   ModuleHook user_pre_optimization_hook_;
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index d878061f724de1c82f8285b0f082d0be4d5778df..37261ed1e665ebed9685751161a412ad114a9e96 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -48,11 +48,13 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_module_config",
         "//tensorflow/compiler/xla/service:name_uniquer",
         "//tensorflow/core:lib",
         "@llvm//:core",
         "@llvm//:support",
         "@llvm//:target",
+        "@llvm//:transform_utils",
     ],
 )
 
@@ -156,18 +158,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "vector_support_library",
-    srcs = ["vector_support_library.cc"],
-    hdrs = ["vector_support_library.h"],
-    deps = [
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
-        "@llvm//:core",
-    ],
-)
-
 cc_library(
     name = "kernel_support_library",
     srcs = ["kernel_support_library.cc"],
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
index 9ad7cd82cb8ca862fd7acec3dfb12c9fd61f6e27..b3b6026ef17daa184c0a015fdea618597ef068b3 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
@@ -32,8 +32,23 @@ limitations under the License.
 
 namespace xla {
 
-// Unlike IrEmitter, this creates host functions which emit IR to generate the
-// output element at the given index. It is used to generate fused operations.
+// FusedIrEmitter is used to generate code for fusion nodes.
+//
+// Unlike IrEmitter and its ilk, which directly create LLVM IR in an LLVM
+// Module, FusedIrEmitter is better understood as "IR generator generator".
+// FusedIrEmitter recursively creates a generator (a host function) which the
+// compiler can invoke at a later time.  Invoking the generator emits LLVM IR
+// that, when run, produces the value at a particular index of the output.
+//
+// After building this generator, the compiler creates a loop (or its moral
+// equivalent, e.g. a GPU kernel) and calls the generator from within the loop.
+// This generates code that produces each element of the output.
+//
+// This class handles both vanilla fusion and multi-output fusion.  In the MOF
+// case, the fusion node ends with a kTuple instruction, and the generator
+// created produces an LLVM struct with N elements, one for each element of the
+// arrays in the tuple.  It follows that the arrays in the tuple must have the
+// same length.
 class FusedIrEmitter : public DfsHloVisitorWithDefault {
  public:
   using Generator = llvm_ir::ElementGenerator;
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
index 7224bd689842d89563b374f3db3d4e314be18764..6384c7f46f5ebbedaeda232b40095611a5d738a4 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
@@ -39,13 +39,27 @@ IrArray::Index::Index(llvm::Value* linear, const Shape& shape,
       << "Shape " << ShapeUtil::HumanStringWithLayout(shape)
       << " should have a layout.";
   int64 divisor = 1;
-  for (int64 dimension : layout_.minor_to_major()) {
+  for (int64 i = 0; i < layout_.minor_to_major_size(); ++i) {
+    int64 dimension = layout_.minor_to_major(i);
     int64 size_of_current_dimension = shape.dimensions(dimension);
-    // Emit IR instructions that compute
-    //   (linear_index / divisor) % current_dimension
-    multidim_[dimension] = ir_builder->CreateURem(
-        ir_builder->CreateUDiv(linear, ir_builder->getInt64(divisor)),
-        ir_builder->getInt64(size_of_current_dimension));
+
+    // If i is not the last dimension, compute
+    //   (linear_index / divisor) % current_dimension.
+    // If i is the last dimension, we can skip the mod, because we assume that
+    // linear is in bounds.
+    //
+    // TODO(jlebar): We could add bounds checks here and elsewhere in this file,
+    // guarded under some sort of xla-memcheck flag.  This might be particularly
+    // useful because cuda-memcheck can't help us much in XLA: Most of our
+    // memory lives in one big allocation, so cuda-memcheck can't detect
+    // out-of-bounds accesses.
+    auto* quot = ir_builder->CreateUDiv(linear, ir_builder->getInt64(divisor));
+    if (i < layout_.minor_to_major_size() - 1) {
+      multidim_[dimension] = ir_builder->CreateURem(
+          quot, ir_builder->getInt64(size_of_current_dimension));
+    } else {
+      multidim_[dimension] = quot;
+    }
     divisor *= size_of_current_dimension;
   }
 }
@@ -244,8 +258,8 @@ llvm::Value* IrArray::EmitArrayElementAddress(
   //
   //   getelementptr base_ptr_, 0, most major index, ..., most minor index
   std::vector<llvm::Value*> gep_indices(1, ir_builder->getInt64(0));
-  for (int64 i = shape_->layout().minor_to_major_size() - 1; i >= 0; --i) {
-    int64 dimension = shape_->layout().minor_to_major(i);
+  for (int64 i = 0; i < LayoutUtil::MinorToMajor(*shape_).size(); ++i) {
+    int64 dimension = LayoutUtil::Major(shape_->layout(), i);
     gep_indices.push_back(actual_index[dimension]);
   }
   return ir_builder->CreateInBoundsGEP(base_ptr_, gep_indices,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
index 29cc0f81bd2c06538e28d1b593ee6a897fea0f27..23d2d4e87d26f4988ebddcf20f5a27af6a7fe0d6 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
 
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 
 namespace xla {
 void KernelSupportLibrary::For(
@@ -62,4 +63,72 @@ void KernelSupportLibrary::If(
   false_block_generator();
   llvm_ir::SetToLastInsertPoint(if_data.after_block, ir_builder_);
 }
+
+void KernelSupportLibrary::EmitAndCallOutlinedKernel(
+    bool enable_fast_math, bool optimize_for_size,
+    llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name,
+    KernelSupportLibrary::ArgumentVector arguments,
+    const std::function<void(KernelSupportLibrary::ArgumentVector)>&
+        kernel_body_generator) {
+  llvm::Module* module = ir_builder->GetInsertBlock()->getModule();
+  llvm::Function* function =
+      module->getFunction(llvm_ir::AsStringRef(kernel_name));
+
+  int64 null_arg_idx = -1;
+  std::vector<llvm::Value*> sanitized_args;
+  sanitized_args.reserve(arguments.size());
+  for (int64 i = 0, e = arguments.size(); i < e; i++) {
+    if (arguments[i]) {
+      sanitized_args.push_back(arguments[i]);
+    } else {
+      CHECK_EQ(null_arg_idx, -1);
+      null_arg_idx = i;
+    }
+  }
+
+  if (!function) {
+    VLOG(2) << "Generating kernel for " << kernel_name;
+    std::vector<llvm::Type*> arg_types;
+    std::transform(sanitized_args.begin(), sanitized_args.end(),
+                   std::back_inserter(arg_types),
+                   [](llvm::Value* arg) { return arg->getType(); });
+
+    auto* function_type = llvm::FunctionType::get(
+        ir_builder->getVoidTy(), arg_types, /*isVarArg=*/false);
+
+    function = llvm_ir::CreateFunction(
+        function_type, llvm::GlobalValue::InternalLinkage,
+        /*enable_fast_math=*/enable_fast_math,
+        /*optimize_for_size=*/optimize_for_size, kernel_name, module);
+
+    llvm::IRBuilder<>::InsertPointGuard guard(*ir_builder);
+
+    auto* entry_bb =
+        llvm::BasicBlock::Create(ir_builder->getContext(), "entry", function);
+    auto* return_inst = llvm::ReturnInst::Create(ir_builder->getContext(),
+                                                 /*retVal=*/nullptr, entry_bb);
+    // Set the insert point to before return_inst.
+    ir_builder->SetInsertPoint(return_inst);
+
+    std::vector<llvm::Value*> arg_values;
+    /*
+     * clang on OSX doesn't like std::transform or range for loop here.
+     * See https://github.com/tensorflow/tensorflow/issues/15196
+     */
+    for (llvm::Function::arg_iterator arg = function->arg_begin(),
+                                      arg_e = function->arg_end();
+         arg != arg_e; ++arg) {
+      arg_values.push_back(arg);
+    }
+    if (null_arg_idx != -1) {
+      arg_values.insert(arg_values.begin() + null_arg_idx, nullptr);
+    }
+    kernel_body_generator(arg_values);
+  } else {
+    VLOG(3) << "Re-using kernel for " << kernel_name;
+  }
+
+  ir_builder->CreateCall(function, llvm_ir::AsArrayRef(sanitized_args));
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
index 9bafb7b57740b7acd0286c113c8a0585c0f93689..1c00b2aabd182da72e78d2c9c01cbe70cfd8e33c 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_
 
 #include <string>
 
@@ -118,6 +118,60 @@ class KernelSupportLibrary {
           const std::function<void()>& true_block_generator,
           const std::function<void()>& false_block_generator = []() {});
 
+  using ArgumentVector = tensorflow::gtl::ArraySlice<llvm::Value*>;
+
+  // Generates the following control flow structure:
+  //
+  //  define @`kernel_name`(arg0, arg1, ... arg`arguments.size()`) {
+  //    kernel_body_generator({arg0, arg1, ... arg`arguments.size()`});
+  //  }
+  //
+  //  ...
+  //  call @`kernel_name`(arguments[0], arguments[1] ...)
+  //  ...
+  //
+  // If a function called `kernel_name` is already present in the module then
+  // that function is re-used.  In that sense we're using the llvm::Module as a
+  // cache of outlined kernels, keyed by function name.
+  //
+  // If any of the values in `arguments` is nullptr (i.e. a nullptr
+  // llvm::Value*) then we ignore it when generating LLVM IR, and instead pass
+  // in a nullptr llvm::Value* in its position to `kernel_body_generator`.
+  // Currently we only support at most one nullptr value in `arguments`.
+  static void EmitAndCallOutlinedKernel(
+      bool enable_fast_math, bool optimize_for_size,
+      llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name,
+      ArgumentVector arguments,
+      const std::function<void(ArgumentVector)>& kernel_body_generator);
+
+  // Thin wrappers around the more general EmitAndCallOutlinedKernel above.
+  static void EmitAndCallOutlinedKernel(
+      bool enable_fast_math, bool optimize_for_size,
+      llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name,
+      llvm::Value* arg0, llvm::Value* arg1, llvm::Value* arg2,
+      const std::function<void(llvm::Value*, llvm::Value*, llvm::Value*)>&
+          kernel_body_generator) {
+    EmitAndCallOutlinedKernel(
+        enable_fast_math, optimize_for_size, ir_builder, kernel_name,
+        {arg0, arg1, arg2}, [&](ArgumentVector args) {
+          kernel_body_generator(args[0], args[1], args[2]);
+        });
+  }
+
+  static void EmitAndCallOutlinedKernel(
+      bool enable_fast_math, bool optimize_for_size,
+      llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name,
+      llvm::Value* arg0, llvm::Value* arg1, llvm::Value* arg2,
+      llvm::Value* arg3,
+      const std::function<void(llvm::Value*, llvm::Value*, llvm::Value*,
+                               llvm::Value*)>& kernel_body_generator) {
+    EmitAndCallOutlinedKernel(
+        enable_fast_math, optimize_for_size, ir_builder, kernel_name,
+        {arg0, arg1, arg2, arg3}, [&](ArgumentVector args) {
+          kernel_body_generator(args[0], args[1], args[2], args[3]);
+        });
+  }
+
  private:
   llvm::IRBuilder<>* ir_builder_;
   bool prevent_unrolling_;
@@ -125,4 +179,4 @@ class KernelSupportLibrary {
 };
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index cd0c4a371e2b1cd0e1c52b77e47e8b081ab8e836..22141e7e00756483957f9cd4bc065a64556e854c 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -20,9 +20,11 @@ limitations under the License.
 #include <vector>
 
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
@@ -61,6 +63,16 @@ llvm::StringRef AsStringRef(tensorflow::StringPiece str) {
   return llvm::StringRef(str.data(), str.size());
 }
 
+std::unique_ptr<llvm::Module> DropConstantInitializers(
+    const llvm::Module& module) {
+  std::unique_ptr<llvm::Module> cloned_module = CloneModule(&module);
+  for (llvm::GlobalVariable& global_var : cloned_module->globals()) {
+    global_var.setInitializer(nullptr);
+    global_var.setLinkage(llvm::GlobalValue::LinkageTypes::ExternalLinkage);
+  }
+  return cloned_module;
+}
+
 string DumpModuleToString(const llvm::Module& module) {
   std::string buffer_string;
   llvm::raw_string_ostream ostream(buffer_string);
@@ -142,7 +154,16 @@ llvm::Type* PrimitiveTypeToIrType(PrimitiveType element_type,
       return llvm::Type::getInt8Ty(module->getContext());
     case S16:
     case U16:
+    case BF16:
+      // For BF16 we just need some type that is 16 bits wide so that it will
+      // take up the right amount of space in memory. LLVM does not have a BF16
+      // type (the LLVM half type is IEEE 16 bit floating point, not bfloat), so
+      // we can't map it directly to an LLVM type. We will not map a BF16
+      // addition to an addition on this type (int16) - this is just the type
+      // used for storage.
       return llvm::Type::getInt16Ty(module->getContext());
+    case F16:
+      return llvm::Type::getHalfTy(module->getContext());
     case S32:
     case U32:
       return llvm::Type::getInt32Ty(module->getContext());
@@ -200,8 +221,8 @@ llvm::Type* ShapeToIrType(const Shape& shape, llvm::Module* module) {
   if (ShapeUtil::IsTuple(shape)) {
     // A tuple buffer is an array of pointers.
     result_type = llvm::ArrayType::get(result_type, shape.tuple_shapes_size());
-  } else {
-    for (int64 dimension : shape.layout().minor_to_major()) {
+  } else if (ShapeUtil::IsArray(shape)) {
+    for (int64 dimension : LayoutUtil::MinorToMajor(shape)) {
       result_type =
           llvm::ArrayType::get(result_type, shape.dimensions(dimension));
     }
@@ -280,6 +301,16 @@ llvm::Constant* LiteralToConstant(const Literal& literal, int64 dimension_index,
         value = llvm::ConstantFP::get(ir_element_type,
                                       literal.Get<float>(*multi_index));
         break;
+      case BF16:
+        value = llvm::ConstantInt::get(
+            ir_element_type,
+            tensorflow::bit_cast<uint16>(literal.Get<bfloat16>(*multi_index)));
+        break;
+      case F16:
+        value = llvm::ConstantFP::get(
+            ir_element_type,
+            static_cast<float>(literal.Get<half>(*multi_index)));
+        break;
       case F64:
         value = llvm::ConstantFP::get(ir_element_type,
                                       literal.Get<double>(*multi_index));
@@ -304,7 +335,7 @@ llvm::Constant* LiteralToConstant(const Literal& literal, int64 dimension_index,
   // decrements with each recursive call. We want to iterate through the
   // dimensions in major-to-minor order as we recurse so just index into
   // minor_to_major to get the dimension number for this level of the recursion.
-  int64 dimension = shape.layout().minor_to_major(dimension_index);
+  int64 dimension = LayoutUtil::Minor(shape.layout(), dimension_index);
 
   // Recursively call LiteralToConstant to construct subarrays for the
   // more-minor dimensions. Gather the subarrays into a vector for bundling into
@@ -320,7 +351,7 @@ llvm::Constant* LiteralToConstant(const Literal& literal, int64 dimension_index,
   if (elements.empty()) {
     element_type = ir_element_type;
     for (int i = 0; i < dimension_index; ++i) {
-      int64 index = shape.layout().minor_to_major(i);
+      int64 index = LayoutUtil::Minor(shape.layout(), i);
       element_type =
           llvm::ArrayType::get(element_type, shape.dimensions(index));
     }
@@ -653,6 +684,19 @@ static string GetProcessUniqueIrFileName(tensorflow::StringPiece prefix) {
   return uniquer->GetUniqueName(prefix);
 }
 
+static Status CreateAndWriteStringToFile(const string& directory_name,
+                                         const string& file_name,
+                                         const string& text) {
+  std::unique_ptr<tensorflow::WritableFile> f;
+  TF_RETURN_IF_ERROR(
+      tensorflow::Env::Default()->RecursivelyCreateDir(directory_name));
+  TF_RETURN_IF_ERROR(
+      tensorflow::Env::Default()->NewWritableFile(file_name, &f));
+  TF_RETURN_IF_ERROR(f->Append(text));
+  TF_RETURN_IF_ERROR(f->Close());
+  return Status::OK();
+}
+
 Status DumpIRToDirectory(const string& directory_name,
                          const string& hlo_module_name,
                          const llvm::Module& llvm_module, bool optimized) {
@@ -667,13 +711,70 @@ Status DumpIRToDirectory(const string& directory_name,
       directory_name,
       tensorflow::strings::StrCat(unique_and_safe_file_name, ".ll"));
 
-  std::unique_ptr<tensorflow::WritableFile> f;
-  TF_RETURN_IF_ERROR(
-      tensorflow::Env::Default()->RecursivelyCreateDir(directory_name));
-  TF_RETURN_IF_ERROR(
-      tensorflow::Env::Default()->NewWritableFile(ir_file_name, &f));
-  TF_RETURN_IF_ERROR(f->Append(DumpModuleToString(llvm_module)));
-  return f->Close();
+  // For some models the embedded constants can be huge, so also dump the module
+  // with the constants stripped to get IR that is easier to manipulate.
+  string ir_no_constant_initializers_file_name = tensorflow::io::JoinPath(
+      directory_name,
+      tensorflow::strings::StrCat(unique_and_safe_file_name, "-noconst.ll"));
+
+  TF_RETURN_IF_ERROR(CreateAndWriteStringToFile(
+      directory_name, ir_file_name, DumpModuleToString(llvm_module)));
+  return CreateAndWriteStringToFile(
+      directory_name, ir_no_constant_initializers_file_name,
+      DumpModuleToString(*DropConstantInitializers(llvm_module)));
+}
+
+llvm::Function* CreateFunction(llvm::FunctionType* function_type,
+                               llvm::GlobalValue::LinkageTypes linkage,
+                               bool enable_fast_math, bool optimize_for_size,
+                               tensorflow::StringPiece name,
+                               llvm::Module* module) {
+  llvm::Function* function =
+      llvm::Function::Create(function_type, linkage, AsStringRef(name), module);
+  function->setCallingConv(llvm::CallingConv::C);
+  function->addFnAttr("no-frame-pointer-elim", "false");
+
+  if (enable_fast_math) {
+    function->addFnAttr("unsafe-fp-math", "true");
+    function->addFnAttr("no-infs-fp-math", "true");
+    function->addFnAttr("no-nans-fp-math", "true");
+    function->addFnAttr("no-signed-zeros-fp-math", "true");
+  }
+
+  // Add the optize attribute to the function if optimizing for size. This
+  // controls internal behavior of some optimization passes (e.g. loop
+  // unrolling).
+  if (optimize_for_size) {
+    function->addFnAttr(llvm::Attribute::OptimizeForSize);
+  }
+
+  return function;
+}
+
+void InitializeLLVMCommandLineOptions(const HloModuleConfig& config) {
+  auto options = config.debug_options().xla_backend_extra_options();
+  if (!options.empty()) {
+    std::vector<string> fake_argv_storage;
+    fake_argv_storage.push_back("");
+    for (const auto& it : options) {
+      // Skip options the XLA backend itself consumes.
+      if (!tensorflow::StringPiece(it.first).starts_with("xla_")) {
+        if (it.second.empty()) {
+          fake_argv_storage.push_back(it.first);
+        } else {
+          fake_argv_storage.push_back(it.first + "=" + it.second);
+        }
+      }
+    }
+
+    VLOG(2) << "Passing argv to LLVM:";
+    std::vector<const char*> fake_argv;
+    for (const auto& s : fake_argv_storage) {
+      fake_argv.push_back(s.c_str());
+      VLOG(2) << s;
+    }
+    llvm::cl::ParseCommandLineOptions(fake_argv.size(), &fake_argv[0]);
+  }
 }
 
 }  // namespace llvm_ir
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
index 063ead2b647d8fc5cc4f67004aaded80a2191fe9..4a10ec466dae6fdb56546fb8d8b353dcff6a5b8d 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
@@ -29,6 +29,7 @@ limitations under the License.
 #include "llvm/Support/raw_ostream.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -281,6 +282,16 @@ Status DumpIRToDirectory(const string& directory_name,
                          const string& hlo_module_name,
                          const llvm::Module& llvm_module, bool optimized);
 
+llvm::Function* CreateFunction(llvm::FunctionType* function_type,
+                               llvm::GlobalValue::LinkageTypes linkage,
+                               bool enable_fast_math, bool optimize_for_size,
+                               tensorflow::StringPiece name,
+                               llvm::Module* module);
+
+// Extracts the xla_backend_extra_options from `config` and passes those that
+// don't start with xla_ to LLVM.
+void InitializeLLVMCommandLineOptions(const HloModuleConfig& config);
+
 }  // namespace llvm_ir
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
index 6fa4cd08c9e0ac30b83c0e2b49d98d930c2e15df..b6b918ec78a27b90325f72eea14b97f9aee43c54 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
@@ -51,37 +51,40 @@ LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator,
       shape_(target_array.GetShape()),
       ir_builder_(ir_builder) {}
 
+static LoopEmitter::BodyEmitter MakeBodyEmitterForMultiOutputFusion(
+    const ElementGenerator& target_element_generator,
+    const std::vector<IrArray>& target_arrays, llvm::IRBuilder<>* ir_builder) {
+  return [=](const llvm_ir::IrArray::Index array_index) {
+    TF_ASSIGN_OR_RETURN(llvm::Value * target_element,
+                        target_element_generator(array_index));
+    CHECK(target_element->getType()->isStructTy())
+        << "This BodyEmitter is for multi-output fusion, but target element "
+           "generator does not produce values of struct type.";
+    CHECK_EQ(target_element->getType()->getStructNumElements(),
+             target_arrays.size());
+
+    for (int64 i = 0; i < target_arrays.size(); ++i) {
+      target_arrays[i].EmitWriteArrayElement(
+          array_index, ir_builder->CreateExtractValue(target_element, i),
+          ir_builder);
+    }
+    return Status::OK();
+  };
+}
+
 LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator,
                          tensorflow::gtl::ArraySlice<IrArray> target_arrays,
                          llvm::IRBuilder<>* ir_builder)
-    : body_emitter_([=](const llvm_ir::IrArray::Index array_index)
-                        -> ::tensorflow::Status {
-        // Convert target_element_generator to a BodyEmitter.
-        TF_ASSIGN_OR_RETURN(llvm::Value * target_element,
-                            target_element_generator(array_index));
-        if (target_arrays.size() == 1) {
-          target_arrays[0].EmitWriteArrayElement(array_index, target_element,
-                                                 ir_builder);
-          return tensorflow::Status::OK();
-        }
-
-        for (int64 i = 0; i < target_arrays.size(); ++i) {
-          target_arrays[i].EmitWriteArrayElement(
-              array_index, ir_builder_->CreateExtractValue(target_element, i),
-              ir_builder);
-        }
-        return tensorflow::Status::OK();
-      }),
+    : body_emitter_(MakeBodyEmitterForMultiOutputFusion(
+          target_element_generator,
+          std::vector<IrArray>(target_arrays.begin(), target_arrays.end()),
+          ir_builder)),
+      shape_(target_arrays[0].GetShape()),
       ir_builder_(ir_builder) {
-  if (target_arrays.size() > 1) {
-    // The sanity check for multiple outputs.
-    shape_ = target_arrays[0].GetShape();
-    for (int64 i = 1; i < target_arrays.size(); ++i) {
-      const Shape& element_shape = target_arrays[i].GetShape();
-      CHECK(ShapeUtil::SameDimensions(shape_, element_shape));
-    }
-  } else {
-    shape_ = target_arrays[0].GetShape();
+  // Sanity check: In multi-output fusion, all shapes produced must have the
+  // same dimensions.
+  for (const IrArray& array : target_arrays) {
+    CHECK(ShapeUtil::SameDimensions(shape_, array.GetShape()));
   }
 }
 
@@ -99,8 +102,8 @@ IrArray::Index LoopEmitter::EmitIndexAndSetExitBasicBlock(
   // dimension (of the target shape).
   ForLoopNest loop_nest(loop_name, ir_builder_);
   IrArray::Index array_index(shape_.dimensions_size());
-  for (int i = shape_.layout().minor_to_major_size() - 1; i >= 0; --i) {
-    int64 dimension = shape_.layout().minor_to_major(i);
+  for (int i = 0; i < LayoutUtil::MinorToMajor(shape_).size(); ++i) {
+    int64 dimension = LayoutUtil::Major(shape_.layout(), i);
     std::unique_ptr<ForLoop> loop = loop_nest.AddLoop(
         /*start_index=*/0,
         /*end_index=*/shape_.dimensions(dimension),
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
index 1ef1dc246442041698d96f6aff48794c8788f1d1..0fc528439a0d5bf8382dfcf2d8b3051f8900bf1d 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
@@ -47,10 +47,16 @@ class LoopEmitter {
   // element of the given target array.
   LoopEmitter(const ElementGenerator& target_element_generator,
               const IrArray& target_array, llvm::IRBuilder<>* ir_builder);
-  // Same as previous method except emits multiple targets in an array.
+
+  // Constructs a LoopEmitter that emits one element into each of N separate
+  // arrays on each iteration of the loop.
+  //
+  // This is used for multi-output fusion.  target_element_generator must
+  // produce an LLVM struct with N elements.
   LoopEmitter(const ElementGenerator& target_element_generator,
               tensorflow::gtl::ArraySlice<IrArray> target_arrays,
               llvm::IRBuilder<>* ir_builder);
+
   LoopEmitter(const LoopEmitter&) = delete;
   LoopEmitter& operator=(const LoopEmitter&) = delete;
   virtual ~LoopEmitter() = default;
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ops.h b/tensorflow/compiler/xla/service/llvm_ir/ops.h
index f72f482e3128c61e53cc454e7da8b5795ba6f695..175b081e84d31779b15560cb0998011fe046ca01 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ops.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_OPS_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_OPS_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_OPS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_OPS_H_
 
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/elemental_ir_emitter.h"
@@ -90,4 +90,4 @@ Status EmitParallelFusedDynamicUpdateSliceInPlace(
 }  // namespace llvm_ir
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_OPS_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_OPS_H_
diff --git a/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.cc b/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.cc
deleted file mode 100644
index e8c6a83618eaa8430521197f1c166cb7eb11a28e..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.cc
+++ /dev/null
@@ -1,150 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/llvm_ir/vector_support_library.h"
-
-#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
-
-namespace xla {
-VectorSupportLibrary::VectorSupportLibrary(PrimitiveType primitive_type,
-                                           int64 vector_size,
-                                           llvm::IRBuilder<>* ir_builder,
-                                           std::string name)
-    : vector_size_(vector_size),
-      primitive_type_(primitive_type),
-      ir_builder_(ir_builder),
-      name_(std::move(name)) {
-  scalar_type_ = llvm_ir::PrimitiveTypeToIrType(
-      primitive_type, ir_builder_->GetInsertBlock()->getModule());
-  scalar_pointer_type_ = llvm::PointerType::getUnqual(scalar_type_);
-  vector_type_ = llvm::VectorType::get(scalar_type_, vector_size);
-  vector_pointer_type_ = llvm::PointerType::getUnqual(vector_type_);
-}
-
-llvm::Value* VectorSupportLibrary::Mul(llvm::Value* lhs, llvm::Value* rhs) {
-  if (scalar_type_->isFloatingPointTy()) {
-    return ir_builder()->CreateFMul(lhs, rhs, name());
-  } else {
-    return ir_builder()->CreateMul(lhs, rhs, name());
-  }
-}
-
-llvm::Value* VectorSupportLibrary::Add(llvm::Value* lhs, llvm::Value* rhs) {
-  if (scalar_type_->isFloatingPointTy()) {
-    return ir_builder()->CreateFAdd(lhs, rhs, name());
-  } else {
-    return ir_builder()->CreateAdd(lhs, rhs, name());
-  }
-}
-
-llvm::Value* VectorSupportLibrary::ComputeOffsetPointer(
-    llvm::Value* base_pointer, llvm::Value* offset_elements) {
-  if (base_pointer->getType() != scalar_pointer_type()) {
-    base_pointer = ir_builder()->CreateBitCast(base_pointer,
-                                               scalar_pointer_type(), name());
-  }
-  return ir_builder()->CreateInBoundsGEP(base_pointer, {offset_elements},
-                                         name());
-}
-
-llvm::Value* VectorSupportLibrary::LoadVector(llvm::Value* pointer) {
-  if (pointer->getType() != vector_pointer_type()) {
-    pointer =
-        ir_builder()->CreateBitCast(pointer, vector_pointer_type(), name());
-  }
-  return ir_builder()->CreateAlignedLoad(
-      pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_), name());
-}
-
-llvm::Value* VectorSupportLibrary::LoadScalar(llvm::Value* pointer) {
-  if (pointer->getType() != scalar_pointer_type()) {
-    pointer =
-        ir_builder()->CreateBitCast(pointer, scalar_pointer_type(), name());
-  }
-  return ir_builder()->CreateAlignedLoad(
-      pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_), name());
-}
-
-void VectorSupportLibrary::StoreVector(llvm::Value* value,
-                                       llvm::Value* pointer) {
-  if (pointer->getType() != vector_pointer_type()) {
-    pointer = ir_builder()->CreateBitCast(pointer, vector_pointer_type());
-  }
-  ir_builder()->CreateAlignedStore(
-      value, pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_));
-}
-
-void VectorSupportLibrary::StoreScalar(llvm::Value* value,
-                                       llvm::Value* pointer) {
-  if (pointer->getType() != scalar_pointer_type()) {
-    pointer =
-        ir_builder()->CreateBitCast(pointer, scalar_pointer_type(), name());
-  }
-  ir_builder()->CreateAlignedStore(
-      value, pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_));
-}
-
-llvm::Value* VectorSupportLibrary::LoadBroadcast(llvm::Value* pointer) {
-  if (pointer->getType() != scalar_pointer_type()) {
-    pointer =
-        ir_builder()->CreateBitCast(pointer, scalar_pointer_type(), name());
-  }
-  return ir_builder()->CreateVectorSplat(
-      vector_size(), ir_builder()->CreateLoad(pointer), name());
-}
-
-llvm::Value* VectorSupportLibrary::AddReduce(llvm::Value* vector) {
-  llvm::SmallVector<llvm::Constant*, 32> mask(vector_size(), nullptr);
-  for (unsigned i = vector_size(); i != 1; i >>= 1) {
-    // On every iteration, we shuffle half of the remaining lanes to the top
-    // half of shuffle, and add two old and the new vector.
-
-    for (unsigned j = 0; j < vector_size(); ++j) {
-      if (j < (i / 2)) {
-        mask[j] = ir_builder()->getInt32(i / 2 + j);
-      } else {
-        mask[j] = llvm::UndefValue::get(ir_builder()->getInt32Ty());
-      }
-    }
-
-    llvm::Value* half_remaining_lanes = ir_builder()->CreateShuffleVector(
-        vector, llvm::UndefValue::get(vector_type()),
-        llvm::ConstantVector::get(mask), "");
-    vector = Add(vector, half_remaining_lanes);
-  }
-
-  return ir_builder()->CreateExtractElement(vector, ir_builder()->getInt32(0),
-                                            name());
-}
-
-llvm::Value* VectorSupportLibrary::GetZeroVector() {
-  return llvm::Constant::getNullValue(vector_type());
-}
-
-llvm::Value* VectorSupportLibrary::GetZeroScalar() {
-  return llvm::Constant::getNullValue(scalar_type());
-}
-
-LlvmVariable::LlvmVariable(llvm::Type* type, llvm::IRBuilder<>* ir_builder)
-    : ir_builder_(ir_builder) {
-  alloca_ = llvm_ir::EmitAllocaAtFunctionEntry(type, "", ir_builder_);
-}
-
-llvm::Value* LlvmVariable::Get() { return ir_builder_->CreateLoad(alloca_); }
-
-void LlvmVariable::Set(llvm::Value* new_value) {
-  ir_builder_->CreateStore(new_value, alloca_);
-}
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.h b/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.h
deleted file mode 100644
index 3072677ab05aa91c736baaa0dc3023329d810a52..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.h
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_VECTOR_SUPPORT_LIBRARY_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_VECTOR_SUPPORT_LIBRARY_H_
-
-#include <string>
-
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Value.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-
-namespace xla {
-// A thin wrapper around llvm_util.h to make code generating vector math flow
-// more readable.
-class VectorSupportLibrary {
- public:
-  // This VectorSupportLibrary instance remembers `primitive_type` and
-  // `vector_size`, and these are implicitly used by the methods on this
-  // instance (i.e. LoadVector will load a vector of type <`vector_size` x
-  // `primitive_type`>).
-  VectorSupportLibrary(PrimitiveType primitive_type, int64 vector_size,
-                       llvm::IRBuilder<>* ir_builder, std::string name);
-
-  llvm::Value* Mul(llvm::Value* lhs, llvm::Value* rhs);
-  llvm::Value* Mul(int64 lhs, llvm::Value* rhs) {
-    return Mul(ir_builder()->getInt64(lhs), rhs);
-  }
-
-  llvm::Value* Add(llvm::Value* lhs, llvm::Value* rhs);
-  llvm::Value* Add(int64 lhs, llvm::Value* rhs) {
-    return Add(ir_builder()->getInt64(lhs), rhs);
-  }
-
-  llvm::Value* MulAdd(llvm::Value* a, llvm::Value* b, llvm::Value* c) {
-    return Add(c, Mul(a, b));
-  }
-
-  llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer,
-                                    llvm::Value* offset_elements);
-  llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer,
-                                    int64 offset_elements) {
-    return ComputeOffsetPointer(base_pointer,
-                                ir_builder()->getInt64(offset_elements));
-  }
-
-  llvm::Value* LoadVector(llvm::Value* pointer);
-
-  llvm::Value* LoadVector(llvm::Value* base_pointer,
-                          llvm::Value* offset_elements) {
-    return LoadVector(ComputeOffsetPointer(base_pointer, offset_elements));
-  }
-
-  llvm::Value* LoadVector(llvm::Value* base_pointer, int64 offset_elements) {
-    return LoadVector(base_pointer, ir_builder()->getInt64(offset_elements));
-  }
-
-  llvm::Value* LoadScalar(llvm::Value* pointer);
-
-  llvm::Value* LoadScalar(llvm::Value* base_pointer,
-                          llvm::Value* offset_elements) {
-    return LoadScalar(ComputeOffsetPointer(base_pointer, offset_elements));
-  }
-
-  llvm::Value* LoadScalar(llvm::Value* base_pointer, int64 offset_elements) {
-    return LoadScalar(base_pointer, ir_builder()->getInt64(offset_elements));
-  }
-
-  void StoreVector(llvm::Value* value, llvm::Value* pointer);
-
-  void StoreVector(llvm::Value* value, llvm::Value* base_pointer,
-                   llvm::Value* offset_elements) {
-    StoreVector(value, ComputeOffsetPointer(base_pointer, offset_elements));
-  }
-
-  void StoreVector(llvm::Value* value, llvm::Value* base_pointer,
-                   int64 offset_elements) {
-    StoreVector(value, base_pointer, ir_builder()->getInt64(offset_elements));
-  }
-
-  void StoreScalar(llvm::Value* value, llvm::Value* pointer);
-  void StoreScalar(llvm::Value* value, llvm::Value* base_pointer,
-                   llvm::Value* offset_elements) {
-    StoreScalar(value, ComputeOffsetPointer(base_pointer, offset_elements));
-  }
-
-  void StoreScalar(llvm::Value* value, llvm::Value* base_pointer,
-                   int64 offset_elements) {
-    StoreScalar(base_pointer, ir_builder()->getInt64(offset_elements));
-  }
-
-  llvm::Value* LoadBroadcast(llvm::Value* pointer);
-  llvm::Value* LoadBroadcast(llvm::Value* base_pointer,
-                             llvm::Value* offset_elements) {
-    return LoadBroadcast(ComputeOffsetPointer(base_pointer, offset_elements));
-  }
-  llvm::Value* LoadBroadcast(llvm::Value* base_pointer, int64 offset_elements) {
-    return LoadBroadcast(base_pointer, ir_builder()->getInt64(offset_elements));
-  }
-
-  llvm::Value* AddReduce(llvm::Value* vector);
-
-  llvm::Value* GetZeroVector();
-  llvm::Value* GetZeroScalar();
-
-  llvm::IRBuilder<>* ir_builder() const { return ir_builder_; }
-  int64 vector_size() const { return vector_size_; }
-  llvm::Type* vector_type() const { return vector_type_; }
-  llvm::Type* vector_pointer_type() const { return vector_pointer_type_; }
-  llvm::Type* scalar_type() const { return scalar_type_; }
-  llvm::Type* scalar_pointer_type() const { return scalar_pointer_type_; }
-
-  const std::string& name() const { return name_; }
-
- private:
-  int64 vector_size_;
-  PrimitiveType primitive_type_;
-  llvm::IRBuilder<>* ir_builder_;
-  llvm::Type* vector_type_;
-  llvm::Type* vector_pointer_type_;
-  llvm::Type* scalar_type_;
-  llvm::Type* scalar_pointer_type_;
-  std::string name_;
-};
-
-// This wraps an alloca-backed stack variable which LLVM's SSA construction pass
-// can later convert to a SSA value.
-class LlvmVariable {
- public:
-  LlvmVariable(llvm::Type*, llvm::IRBuilder<>* ir_builder);
-
-  llvm::Value* Get();
-  void Set(llvm::Value* new_value);
-
- private:
-  llvm::AllocaInst* alloca_;
-  llvm::IRBuilder<>* ir_builder_;
-};
-
-class VectorVariable : public LlvmVariable {
- public:
-  VectorVariable(VectorSupportLibrary* vector_support,
-                 llvm::Value* initial_value)
-      : LlvmVariable(vector_support->vector_type(),
-                     vector_support->ir_builder()) {
-    Set(initial_value);
-  }
-};
-
-class ScalarVariable : public LlvmVariable {
- public:
-  ScalarVariable(VectorSupportLibrary* vector_support,
-                 llvm::Value* initial_value)
-      : LlvmVariable(vector_support->scalar_type(),
-                     vector_support->ir_builder()) {
-    Set(initial_value);
-  }
-};
-}  // namespace xla
-
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_VECTOR_SUPPORT_LIBRARY_H_
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 06f43bd3cb2376d34a3104133c868c4f4e5cc730..07f989d4faea199e812e54d2ae74d3ff9e7fa19a 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "tensorflow/compiler/xla/client/executable_build_options.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/backend.h"
@@ -71,7 +72,7 @@ LocalService::LocalService(const ServiceOptions& options,
 StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
     const ComputationHandle& computation,
     const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
-    const Shape* result_layout, int device_ordinal) {
+    const ExecutableBuildOptions& build_options) {
   TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
                       computation_tracker_.Resolve(computation));
   VersionedComputationHandle versioned_handle =
@@ -84,27 +85,47 @@ StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
   // Validate incoming layouts.
   if (argument_layouts.size() != program_shape->parameters_size()) {
     return InvalidArgument(
-        "invalid number of arguments for computation: expected %d, got %zu",
+        "Invalid number of arguments for computation: expected %d, got %zu.",
         program_shape->parameters_size(), argument_layouts.size());
   }
   for (int i = 0; i < argument_layouts.size(); ++i) {
     const Shape& argument_shape = *argument_layouts[i];
     TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(argument_shape));
     if (!ShapeUtil::Compatible(argument_shape, program_shape->parameters(i))) {
+      tensorflow::gtl::optional<const OpMetadata*> metadata =
+          user_computation->ParameterMetadata(i);
+      auto metadata_string = [&metadata]() -> string {
+        if (!metadata.has_value()) {
+          return "";
+        }
+        CHECK(metadata.value() != nullptr);
+        const OpMetadata& m = *metadata.value();
+        if (!m.source_file().empty()) {
+          return tensorflow::strings::Printf(
+              " (%s:%d)", m.source_file().c_str(), m.source_line());
+        }
+        return "";
+      };
       return InvalidArgument(
-          "invalid argument shape for argument %d, expected %s, got %s", i,
+          "Invalid argument shape for argument %d%s, expected %s, got %s.", i,
+          metadata_string().c_str(),
           ShapeUtil::HumanString(program_shape->parameters(i)).c_str(),
           ShapeUtil::HumanString(argument_shape).c_str());
     }
   }
-  if (result_layout != nullptr) {
-    TF_RETURN_IF_ERROR(
-        ValidateResultShapeWithLayout(*result_layout, program_shape->result()));
+  if (build_options.result_layout() != nullptr) {
+    TF_RETURN_IF_ERROR(ValidateResultShapeWithLayout(
+        *build_options.result_layout(), program_shape->result()));
   }
 
   ExecutionOptions execution_options = CreateDefaultExecutionOptions();
-  if (result_layout != nullptr) {
-    *execution_options.mutable_shape_with_output_layout() = *result_layout;
+  if (build_options.generate_hlo_graph().has_value()) {
+    execution_options.mutable_debug_options()->set_xla_generate_hlo_graph(
+        build_options.generate_hlo_graph().value());
+  }
+  if (build_options.result_layout() != nullptr) {
+    *execution_options.mutable_shape_with_output_layout() =
+        *build_options.result_layout();
   } else {
     *execution_options.mutable_shape_with_output_layout() =
         program_shape->result();
@@ -113,15 +134,22 @@ StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
   }
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModuleConfig> module_config,
-      CreateModuleConfig(*program_shape, argument_layouts, &execution_options));
+      CreateModuleConfig(*program_shape, argument_layouts, &execution_options,
+                         *user_computation));
 
-  TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
-                      execute_backend_->stream_executor(device_ordinal));
+  TF_ASSIGN_OR_RETURN(
+      se::StreamExecutor * executor,
+      execute_backend_->stream_executor(build_options.device_ordinal()));
 
-  std::vector<perftools::gputools::DeviceMemoryBase> argument_buffers(
-      argument_layouts.size());
   return BuildExecutable(versioned_handle, std::move(module_config),
-                         argument_buffers, execute_backend_.get(), executor);
+                         execute_backend_.get(), executor,
+                         build_options.device_allocator());
+}
+
+StatusOr<int> LocalService::ReplicaNumberToDeviceOrdinal(int replica_number) {
+  return backend().computation_placer()->DeviceId(
+      replica_number, /*computation=*/0, options_.number_of_replicas(),
+      /*computation_count=*/1);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h
index 52c4346385eb663baa6e7579d7b3883ba084205b..15e120685e1be9190d49fdaf5ed6706bdf991a6c 100644
--- a/tensorflow/compiler/xla/service/local_service.h
+++ b/tensorflow/compiler/xla/service/local_service.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <memory>
 
+#include "tensorflow/compiler/xla/client/executable_build_options.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
@@ -41,11 +42,20 @@ class LocalService : public Service {
 
   // Builds an Executable with the given argument layouts and options. If
   // result_layout is non-null, then the executable is compiled to produce a
-  // result of the given layout.
+  // result of the given layout.  If device_allocator is non-null, then the
+  // compiler may use it to allocate temp space on the device.  The compiler is
+  // responsible for freeing any memory it allocates this way.
   StatusOr<std::unique_ptr<Executable>> CompileExecutable(
       const ComputationHandle& computation,
       const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
-      const Shape* result_layout, int device_ordinal);
+      const ExecutableBuildOptions& options);
+
+  // Returns the device ordinal that corresponds to the given replica number.
+  //
+  // This returns an error if there is not a one-to-one correspondence of
+  // replicas to device ordinals, but is useful as a short term mechanism for
+  // the "easy" case where a single replica is a single device.
+  StatusOr<int> ReplicaNumberToDeviceOrdinal(int replica_number);
 
  private:
   explicit LocalService(const ServiceOptions& options,
diff --git a/tensorflow/compiler/xla/service/logical_buffer_analysis.h b/tensorflow/compiler/xla/service/logical_buffer_analysis.h
index 598d08b7203b25b194dfc3b3125ec58c96b2cd4c..f4c63dd86b4d8a6f598d46047012e4e5bc7b3d7e 100644
--- a/tensorflow/compiler/xla/service/logical_buffer_analysis.h
+++ b/tensorflow/compiler/xla/service/logical_buffer_analysis.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_ANALYSIS_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_ANALYSIS_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_ANALYSIS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_ANALYSIS_H_
 
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -90,4 +90,4 @@ class LogicalBufferAnalysis : public DfsHloVisitorWithDefault {
 
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_ANALYSIS_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_ANALYSIS_H_
diff --git a/tensorflow/compiler/xla/service/name_uniquer.cc b/tensorflow/compiler/xla/service/name_uniquer.cc
index a0d08c288dbcc45e83a36ce7b094b04a9dbae532..7d8c05fffa4ab11d7dbf9956d2cb7ebd5bcdd3c4 100644
--- a/tensorflow/compiler/xla/service/name_uniquer.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer.cc
@@ -17,12 +17,44 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 
+namespace {
+
+bool IsAllowed(char character) {
+  auto c = static_cast<unsigned char>(character);
+  return (isalnum(c) != 0) || c == '_' || c == '.' || c == '-';
+}
+
+}  // namespace
+
+NameUniquer::NameUniquer(const string& separator) {
+  CHECK(std::all_of(separator.begin(), separator.end(), IsAllowed))
+      << "separator should comprises allowed characters only";
+  separator_ = separator;
+}
+
+/*static*/ string NameUniquer::GetSanitizedName(const string& name) {
+  string result = name;
+  CHECK(!result.empty()) << "name should not be empty";
+  char c = static_cast<unsigned char>(result[0]);
+  if (!isalpha(c) && c != '_') {
+    result[0] = '_';
+  }
+  for (int i = 1; i < result.length(); i++) {
+    if (!IsAllowed(result[i])) {
+      result[i] = '_';
+    }
+  }
+  return result;
+}
+
 string NameUniquer::GetUniqueName(tensorflow::StringPiece prefix) {
   string root = prefix.empty() ? "name" : prefix.ToString();
+  root = GetSanitizedName(root);
 
   // Strip away numeric suffix (if any). Only recognize separator if it is in
   // the middle of the name.
diff --git a/tensorflow/compiler/xla/service/name_uniquer.h b/tensorflow/compiler/xla/service/name_uniquer.h
index ed379b52258463b960dea788721c2c4325ef0260..4139c2700b25e8600182a034a8ac6f4f041c12e6 100644
--- a/tensorflow/compiler/xla/service/name_uniquer.h
+++ b/tensorflow/compiler/xla/service/name_uniquer.h
@@ -28,14 +28,21 @@ namespace xla {
 // Simple stateful class that helps generate "unique" names. To use it, simply
 // call GetUniqueName as many times as needed. The names returned by
 // GetUniqueName are guaranteed to be distinct for this instance of the class.
+// Note that the names will be sanitized to match regexp
+// "[a-zA-Z_][a-zA-Z0-9_.-]*".
 class NameUniquer {
  public:
-  explicit NameUniquer(const string& separator = "__")
-      : separator_(separator) {}
+  // The separator must contain allowed characters only: "[a-zA-Z0-9_.-]".
+  explicit NameUniquer(const string& separator = "__");
 
-  // Get a unique name in a string, with an optional prefix for convenience.
+  // Get a sanitized unique name in a string, with an optional prefix for
+  // convenience.
   string GetUniqueName(tensorflow::StringPiece prefix = "");
 
+  // Sanitizes and returns the name. Unallowed characters will be replaced with
+  // '_'. The result will match the regexp "[a-zA-Z_][a-zA-Z0-9_.-]*".
+  static string GetSanitizedName(const string& name);
+
  private:
   // The string to use to separate the prefix of the name from the uniquing
   // integer value.
diff --git a/tensorflow/compiler/xla/service/name_uniquer_test.cc b/tensorflow/compiler/xla/service/name_uniquer_test.cc
index 9f0747a6e2175a968d8f3661ac51512009e86f29..4258cf16876ab46dce6df062ab701b1b1a4a7580 100644
--- a/tensorflow/compiler/xla/service/name_uniquer_test.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer_test.cc
@@ -60,12 +60,30 @@ TEST_F(NameUniquerTest, NumericSuffixes) {
   EXPECT_EQ("bar", uniquer.GetUniqueName("bar.-1000"));
   EXPECT_EQ("bar.1", uniquer.GetUniqueName("bar.-2000"));
   EXPECT_EQ("bar.2", uniquer.GetUniqueName("bar.1"));
+}
+
+TEST_F(NameUniquerTest, Sanitize) {
+  NameUniquer uniquer("_");
+
+  EXPECT_EQ("foo", uniquer.GetUniqueName("foo"));
+  EXPECT_EQ("foo_1", uniquer.GetUniqueName("foo"));
+  EXPECT_EQ("foo.54", uniquer.GetUniqueName("foo.54"));
+  EXPECT_EQ("foo_54", uniquer.GetUniqueName("foo_54"));
+  EXPECT_EQ("foo_54.1", uniquer.GetUniqueName("foo_54.1"));
+  EXPECT_EQ("foo_55", uniquer.GetUniqueName("foo"));
+
+  // Invalid characters will be replaced with '_'.
+  EXPECT_EQ("bar", uniquer.GetUniqueName("bar<-1000"));
+  EXPECT_EQ("bar_1", uniquer.GetUniqueName("bar<-2000"));
+  EXPECT_EQ("bar_2", uniquer.GetUniqueName("bar_1"));
 
   // Separator is only recognized in the middle of the prefix.
-  EXPECT_EQ(".10", uniquer.GetUniqueName(".10"));
-  EXPECT_EQ(".10.1", uniquer.GetUniqueName(".10"));
-  EXPECT_EQ("foobar.", uniquer.GetUniqueName("foobar."));
-  EXPECT_EQ("foobar..1", uniquer.GetUniqueName("foobar."));
+  EXPECT_EQ("_10", uniquer.GetUniqueName(
+                       ".10"));  // the leading '.' is replaced with '_'.
+  EXPECT_EQ("_10_1", uniquer.GetUniqueName(".10"));
+  EXPECT_EQ("_10_2", uniquer.GetUniqueName("_10"));
+  EXPECT_EQ("foobar_", uniquer.GetUniqueName("foobar_"));
+  EXPECT_EQ("foobar__1", uniquer.GetUniqueName("foobar_"));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/platform_util.cc b/tensorflow/compiler/xla/service/platform_util.cc
index 63f3bfb36cedeb44b190e1e8a5584d334f94b585..aa974ee61a27de9c19e97d8a6eb48f9261ce4bd9 100644
--- a/tensorflow/compiler/xla/service/platform_util.cc
+++ b/tensorflow/compiler/xla/service/platform_util.cc
@@ -33,10 +33,32 @@ namespace se = ::perftools::gputools;
 
 namespace xla {
 
+using tensorflow::str_util::Lowercase;
+
 // Minimum supported CUDA compute capability is 3.5.
 constexpr int kMinCudaComputeCapabilityMajor = 3;
 constexpr int kMinCudaComputeCapabilityMinor = 5;
 
+// The name of the interpreter platform.
+constexpr char kInterpreter[] = "interpreter";
+
+namespace {
+
+string CanonicalPlatformName(const string& name) {
+  string platform_str = Lowercase(name);
+  // "cpu" and "host" mean the same thing.
+  if (platform_str == "cpu") {
+    platform_str = "host";
+  }
+  // "gpu" and "cuda" mean the same thing.
+  if (platform_str == "gpu") {
+    platform_str = "cuda";
+  }
+  return platform_str;
+}
+
+}  // namespace
+
 /* static */ StatusOr<std::vector<se::Platform*>>
 PlatformUtil::GetSupportedPlatforms() {
   se::MultiPlatformManager::PlatformMap platform_map;
@@ -78,7 +100,7 @@ PlatformUtil::GetSupportedPlatforms() {
   return platforms;
 }
 
-/* static */ StatusOr<se::Platform*> PlatformUtil::GetDefaultPlatform() {
+/* static */ StatusOr<se::Platform*> PlatformUtil::GetSolePlatform() {
   TF_ASSIGN_OR_RETURN(auto platforms, GetSupportedPlatforms());
   if (platforms.empty()) {
     return NotFound("no platforms found");
@@ -87,26 +109,42 @@ PlatformUtil::GetSupportedPlatforms() {
   }
 
   // Multiple platforms present and we can't pick a reasonable default.
-  auto l = [](string* out, const se::Platform* p) { out->append(p->Name()); };
-  string platforms_string = tensorflow::str_util::Join(platforms, ", ", l);
+  string platforms_string = tensorflow::str_util::Join(
+      platforms, ", ",
+      [](string* out, const se::Platform* p) { out->append(p->Name()); });
   return InvalidArgument(
       "must specify platform because more than one platform found: %s",
       platforms_string.c_str());
 }
 
-/*static*/ StatusOr<se::Platform*> PlatformUtil::GetPlatform(
-    const string& platform_name) {
-  using tensorflow::str_util::Lowercase;
-  string platform_str = Lowercase(platform_name);
-  // "cpu" and "host" mean the same thing.
-  if (platform_str == "cpu") {
-    platform_str = "host";
-  }
-  // "gpu" and "cuda" mean the same thing.
-  if (platform_str == "gpu") {
-    platform_str = "cuda";
+/* static */ StatusOr<se::Platform*> PlatformUtil::GetDefaultPlatform() {
+  TF_ASSIGN_OR_RETURN(auto platforms, GetSupportedPlatforms());
+  if (platforms.empty()) {
+    return NotFound("no platforms found");
+  } else if (platforms.size() == 1) {
+    return platforms[0];
+  } else if (platforms.size() == 2) {
+    for (int i = 0; i < 2; i++) {
+      if (Lowercase(platforms[i]->Name()) == kInterpreter &&
+          Lowercase(platforms[1 - i]->Name()) != kInterpreter) {
+        return platforms[1 - i];
+      }
+    }
   }
 
+  // Multiple platforms present and we can't pick a reasonable default.
+  string platforms_string = tensorflow::str_util::Join(
+      platforms, ", ",
+      [](string* out, const se::Platform* p) { out->append(p->Name()); });
+  return InvalidArgument(
+      "must specify platform because more than one platform (except for the "
+      "interpreter platform) found: %s",
+      platforms_string.c_str());
+}
+
+/*static*/ StatusOr<se::Platform*> PlatformUtil::GetPlatform(
+    const string& platform_name) {
+  string platform_str = CanonicalPlatformName(platform_name);
   TF_ASSIGN_OR_RETURN(auto platforms, PlatformUtil::GetSupportedPlatforms());
   for (se::Platform* platform : platforms) {
     if (Lowercase(platform->Name()) == platform_str) {
@@ -116,6 +154,32 @@ PlatformUtil::GetSupportedPlatforms() {
   return InvalidArgument("platform %s not found", platform_name.c_str());
 }
 
+/*static*/ StatusOr<se::Platform*> PlatformUtil::GetPlatformExceptFor(
+    const string& platform_name) {
+  string platform_str = CanonicalPlatformName(platform_name);
+
+  TF_ASSIGN_OR_RETURN(auto platforms, PlatformUtil::GetSupportedPlatforms());
+  std::vector<se::Platform*> matched;
+  for (se::Platform* platform : platforms) {
+    if (Lowercase(platform->Name()) != platform_name) {
+      matched.push_back(platform);
+    }
+  }
+  if (matched.empty()) {
+    return InvalidArgument("unable to find platform that is not %s",
+                           platform_name.c_str());
+  }
+  if (matched.size() == 1) {
+    return matched[0];
+  }
+  string matched_string = tensorflow::str_util::Join(
+      matched, ", ",
+      [](string* out, const se::Platform* p) { out->append(p->Name()); });
+  return InvalidArgument(
+      "found multiple platforms %s, but expected one platform except for %s",
+      matched_string.c_str(), platform_name.c_str());
+}
+
 // Returns whether the device underlying the given StreamExecutor is supported
 // by XLA.
 static bool IsDeviceSupported(se::StreamExecutor* executor) {
diff --git a/tensorflow/compiler/xla/service/platform_util.h b/tensorflow/compiler/xla/service/platform_util.h
index a59d4ffe87f568ac786e4b2d3bf6983bc0d4695a..69188820a70707d9c9be10b20fb7de92ad4d9873 100644
--- a/tensorflow/compiler/xla/service/platform_util.h
+++ b/tensorflow/compiler/xla/service/platform_util.h
@@ -37,16 +37,28 @@ class PlatformUtil {
   static StatusOr<std::vector<perftools::gputools::Platform*>>
   GetSupportedPlatforms();
 
-  // Convenience function which returns the default supported platform. If
+  // Convenience function which returns the default supported platform for
+  // tests. If exactly one supported platform is present, then this platform is
+  // the default platform. If exactly two platforms are present and one of them
+  // is the interpreter platform, then the other platform is the default
+  // platform. Otherwise returns an error.
+  static StatusOr<perftools::gputools::Platform*> GetDefaultPlatform();
+
+  // Convenience function which returns the sole supported platform. If
   // exactly one supported platform is present, then this platform is the
   // default platform. Otherwise returns an error.
-  static StatusOr<perftools::gputools::Platform*> GetDefaultPlatform();
+  static StatusOr<perftools::gputools::Platform*> GetSolePlatform();
 
   // Returns the platform according to the given name. Returns error if there is
   // no such platform.
   static StatusOr<perftools::gputools::Platform*> GetPlatform(
       const string& platform_name);
 
+  // Returns exactly one platform that does not have given name. Returns error
+  // if there is no such platform, or there are multiple such platforms.
+  static StatusOr<perftools::gputools::Platform*> GetPlatformExceptFor(
+      const string& platform_name);
+
   // Returns a vector of StreamExecutors for the given platform. The vector is
   // indexed by device ordinal (device numbering used by StreamExecutor). If an
   // element is nullptr, then the device is present by not supported by XLA.
diff --git a/tensorflow/compiler/xla/service/reshape_mover.cc b/tensorflow/compiler/xla/service/reshape_mover.cc
index 0fb90230f2f39a841973361f63d17af579a1342b..e62bafc50b0e1270702621c9ea7b2ee43e001fe0 100644
--- a/tensorflow/compiler/xla/service/reshape_mover.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover.cc
@@ -101,8 +101,9 @@ HloInstruction* FirstNonScalarAndNonTrivialReshapeOperand(
         IsReshapeOrTranspose(operand) &&
         !CanTriviallyChangeShape(operand->operand(0))) {
       VLOG(5) << "Found first non-scalar and non-trivial reshape operand of "
-              << hlo->ToStringNoMetadata() << ":\n\t"
-              << operand->ToStringNoMetadata();
+              << hlo->ToString(HloPrintOptions().set_print_metadata(false))
+              << ":\n\t"
+              << operand->ToString(HloPrintOptions().set_print_metadata(false));
       return operand;
     }
   }
@@ -133,8 +134,9 @@ bool AreEquivalentReshapes(const HloInstruction* a, const HloInstruction* b) {
 bool AllOperandsHaveEasyShapeChanges(
     const HloInstruction* instruction,
     const HloInstruction* first_reshape_operand) {
+  auto print_no_metadata = HloPrintOptions().set_print_metadata(false);
   VLOG(3) << "** Checking whether all operands have easy shape changes: "
-          << instruction->ToStringNoMetadata();
+          << instruction->ToString(print_no_metadata);
   // Check whether all operands:
   //    0. Have the same dimensions as the output -- if not, it may be
   //       implicitly broadcast, which can confound the movement's
@@ -151,21 +153,21 @@ bool AllOperandsHaveEasyShapeChanges(
       VLOG(5) << "Operand shape differs from output shape; may be "
                  "implicitly broadcast, so preventing "
                  "movement\n\toperand: "
-              << operand->ToStringNoMetadata()
-              << "\n\tinstruction: " << instruction->ToStringNoMetadata();
+              << operand->ToString(print_no_metadata) << "\n\tinstruction: "
+              << instruction->ToString(print_no_metadata);
       return false;
     }
 
     if (AreEquivalentReshapes(first_reshape_operand, operand)) {
       VLOG(5) << "Are equivalent reshapes:\n\tfirst_reshape_operand: "
-              << first_reshape_operand->ToStringNoMetadata()
-              << "\n\toperand: " << operand->ToStringNoMetadata();
+              << first_reshape_operand->ToString(print_no_metadata)
+              << "\n\toperand: " << operand->ToString(print_no_metadata);
       continue;
     }
 
     if (CanTriviallyChangeShape(operand)) {
       VLOG(5) << "Operand can trivially change shape: "
-              << operand->ToStringNoMetadata();
+              << operand->ToString(print_no_metadata);
       continue;
     }
 
@@ -173,12 +175,12 @@ bool AllOperandsHaveEasyShapeChanges(
     // well.
     VLOG(5) << "Operand is neither equalivant to the first Reshape operand"
                "nor can trivially change shape: "
-            << operand->ToStringNoMetadata();
+            << operand->ToString(print_no_metadata);
     return false;
   }
 
   VLOG(3) << "All operands have easy shape changes: "
-          << instruction->ToStringNoMetadata();
+          << instruction->ToString(print_no_metadata);
   return true;
 }
 
@@ -250,11 +252,13 @@ StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
     return false;
   }
 
+  auto print_no_metadata = HloPrintOptions().set_print_metadata(false);
   // At this point we've decided to sink reshape/transpose operands.
   const Shape& new_operand_shape = first_reshape_operand->operand(0)->shape();
   VLOG(3) << "** Sinking reshape or transpose: "
-          << instruction->ToStringNoMetadata() << "\n\tfirst reshape operand: "
-          << first_reshape_operand->ToStringNoMetadata()
+          << instruction->ToString(print_no_metadata)
+          << "\n\tfirst reshape operand: "
+          << first_reshape_operand->ToString(print_no_metadata)
           << "\n\tnew operand shape: "
           << ShapeUtil::HumanString(new_operand_shape);
 
@@ -267,7 +271,7 @@ StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
       continue;
     }
     VLOG(3) << "Updating operand #" << i << ": "
-            << operands[i]->ToStringNoMetadata();
+            << operands[i]->ToString(print_no_metadata);
     operands[i] = UpdateOperand(computation, first_reshape_operand,
                                 new_operand_shape, operands[i]);
   }
@@ -298,7 +302,7 @@ StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
   switch (first_reshape_operand->opcode()) {
     case HloOpcode::kReshape:
       VLOG(3) << "Creating new reshape for new elementwise op: "
-              << new_elementwise->ToStringNoMetadata();
+              << new_elementwise->ToString(print_no_metadata);
       new_reshape =
           HloInstruction::CreateReshape(instruction->shape(), new_elementwise);
       break;
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index d997cab83f8c2bc74632e49f23e690ffb17b901a..98dfc89867ab33788c4cc837a66d6751a1ef2507 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -34,8 +34,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/service/hlo_proto_util.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/service/session.pb.h"
+#include "tensorflow/compiler/xla/service/source_map_util.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/shape_layout.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -55,46 +57,38 @@ namespace se = ::perftools::gputools;
 
 using ::tensorflow::strings::Printf;
 using ::tensorflow::strings::StrCat;
+using ::xla::source_map_util::InvalidParameterArgument;
 
 namespace xla {
 
 namespace {
 
-// Copies the contents of an Allocation into a Literal proto.
-tensorflow::Status LiteralFromAllocation(const Allocation* allocation,
-                                         const Shape& literal_shape,
-                                         Literal* literal) {
-  TF_ASSIGN_OR_RETURN(
-      se::StreamExecutor * executor,
-      allocation->backend()->stream_executor(allocation->device_ordinal()));
-  return allocation->backend()->transfer_manager()->TransferLiteralFromDevice(
-      executor, allocation->device_memory(), allocation->shape(), literal_shape,
-      literal);
-}
-
 // Records the arguments used to invoke a computation in a SessionModule
 // proto.
 tensorflow::Status RecordArguments(
-    const tensorflow::gtl::ArraySlice<const Allocation*> arg_allocations,
+    const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
+    se::StreamExecutor* executor, TransferManager* transfer_manager,
     SessionModule* module) {
   module->clear_arguments();
-  for (const Allocation* allocation : arg_allocations) {
-    Literal argument;
-    TF_RETURN_IF_ERROR(
-        LiteralFromAllocation(allocation, allocation->shape(), &argument));
-    *module->add_arguments() = argument.ToProto();
+  for (const ShapedBuffer* argument : arguments) {
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<Literal> literal,
+        transfer_manager->TransferLiteralFromDevice(executor, *argument));
+    *module->add_arguments() = literal->ToProto();
   }
   return tensorflow::Status::OK();
 }
 
 // Records the result of a computation in a SessionModule proto.
-tensorflow::Status RecordResult(const Allocation* result_allocation,
+tensorflow::Status RecordResult(const ShapedBuffer& result,
+                                se::StreamExecutor* executor,
+                                TransferManager* transfer_manager,
                                 SessionModule* module) {
   module->clear_result();
-  Literal result;
-  TF_RETURN_IF_ERROR(LiteralFromAllocation(
-      result_allocation, result_allocation->shape(), &result));
-  *module->mutable_result() = result.ToProto();
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<Literal> literal,
+      transfer_manager->TransferLiteralFromDevice(executor, result));
+  *module->mutable_result() = literal->ToProto();
   return tensorflow::Status::OK();
 }
 
@@ -152,7 +146,9 @@ int ServiceOptions::intra_op_parallelism_threads() const {
 
 Service::Service(const ServiceOptions& options,
                  std::unique_ptr<Backend> execute_backend)
-    : options_(options), execute_backend_(std::move(execute_backend)) {
+    : options_(options),
+      allocation_tracker_(execute_backend.get()),
+      execute_backend_(std::move(execute_backend)) {
   CHECK_GT(options_.number_of_replicas(), 0);
   if (execute_backend_) {
     if (execute_backend_->device_count() > 0) {
@@ -235,41 +231,40 @@ tensorflow::Status Service::ValidateResultShapeWithLayout(
   return ShapeUtil::ValidateShape(shape_with_layout);
 }
 
-StatusOr<std::vector<const Allocation*>> Service::ResolveAndValidateArguments(
+StatusOr<std::vector<const ShapedBuffer*>> Service::ResolveAndValidateArguments(
     tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
-    const Backend* backend, int device_ordinal) {
-  std::vector<const Allocation*> allocations;
+    int device_ordinal) {
+  std::vector<const ShapedBuffer*> shaped_buffers;
   for (size_t i = 0; i < arguments.size(); ++i) {
-    auto allocation_status = allocation_tracker_.Resolve(*arguments[i]);
-    if (!allocation_status.ok()) {
-      return Status(allocation_status.status().code(),
-                    StrCat(allocation_status.status().error_message(), ", ",
+    auto buffer_status = allocation_tracker_.Resolve(*arguments[i]);
+    if (!buffer_status.ok()) {
+      return Status(buffer_status.status().code(),
+                    StrCat(buffer_status.status().error_message(), ", ",
                            "failed to resolve allocation for parameter ", i));
     }
-    const Allocation* allocation = allocation_status.ValueOrDie();
+    const ShapedBuffer* shaped_buffer = buffer_status.ValueOrDie();
 
     // Verify allocation is same platform and device as the execution.
-    if (allocation->backend() != backend ||
-        allocation->device_ordinal() != device_ordinal) {
+    if (shaped_buffer->platform() != execute_backend_->platform() ||
+        shaped_buffer->device_ordinal() != device_ordinal) {
       return InvalidArgument(
-          "argument %lu is on device %s but computation will be executed "
+          "argument %lu is on device %s:%d but computation will be executed "
           "on device %s",
-          i,
-          allocation->backend()
-              ->device_name(allocation->device_ordinal())
-              .c_str(),
-          backend->device_name(device_ordinal).c_str());
+          i, shaped_buffer->platform()->Name().c_str(),
+          shaped_buffer->device_ordinal(),
+          execute_backend_->device_name(device_ordinal).c_str());
     }
 
-    allocations.push_back(allocation);
+    shaped_buffers.push_back(shaped_buffer);
   }
-  return allocations;
+  return shaped_buffers;
 }
 
 StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     const ProgramShape& program_shape,
     tensorflow::gtl::ArraySlice<const Shape*> argument_shapes,
-    const ExecutionOptions* execution_options) {
+    const ExecutionOptions* execution_options,
+    const UserComputation& user_computation) {
   auto config = MakeUnique<HloModuleConfig>(program_shape);
   auto* computation_layout = config->mutable_entry_computation_layout();
 
@@ -283,8 +278,10 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     // ProgramShape.
     if (!ShapeUtil::Compatible(*argument_shapes[i],
                                program_shape.parameters(i))) {
-      return InvalidArgument(
-          "computation expects parameter %d to have shape %s, given shape %s",
+      return InvalidParameterArgument(
+          *user_computation.ParameterMetadata(i).value(),
+          "Argument does not match shape of computation parameter %d: want %s, "
+          "got %s",
           i, ShapeUtil::HumanString(program_shape.parameters(i)).c_str(),
           ShapeUtil::HumanString(*argument_shapes[i]).c_str());
     }
@@ -325,20 +322,23 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
 
 StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     const ProgramShape& program_shape,
-    tensorflow::gtl::ArraySlice<const Allocation*> arguments,
-    const ExecutionOptions& execution_options) {
+    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
+    const ExecutionOptions& execution_options,
+    const UserComputation& user_computation) {
   std::vector<const Shape*> argument_shapes;
   for (const auto* arg : arguments) {
-    argument_shapes.push_back(&arg->shape());
+    argument_shapes.push_back(&arg->on_host_shape());
   }
-  return CreateModuleConfig(program_shape, argument_shapes, &execution_options);
+  return CreateModuleConfig(program_shape, argument_shapes, &execution_options,
+                            user_computation);
 }
 
 StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
     std::vector<VersionedComputationHandle> versioned_handles,
     std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
     Backend* backend,
-    std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors) {
+    std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+    DeviceMemoryAllocator* device_allocator) {
   VLOG(1) << Printf("BuildExecutable on service %p", this);
 
   // Dump computation proto state if flag is set.
@@ -384,7 +384,8 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
 
   TF_ASSIGN_OR_RETURN(
       std::vector<std::unique_ptr<Executable>> executables,
-      backend->compiler()->Compile(std::move(modules), std::move(executors)));
+      backend->compiler()->Compile(std::move(modules), std::move(executors),
+                                   device_allocator));
 
   for (size_t i = 0; i < versioned_handles.size(); ++i) {
     if (!module_configs[i]->debug_options().xla_dump_executions_to().empty()) {
@@ -397,10 +398,8 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
 
 StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
     const VersionedComputationHandle& versioned_handle,
-    std::unique_ptr<HloModuleConfig> module_config,
-    const tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-        arguments,
-    Backend* backend, se::StreamExecutor* executor) {
+    std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
+    se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator) {
   VLOG(1) << Printf("BuildExecutable on service %p with handle %s", this,
                     versioned_handle.ToString().c_str());
 
@@ -430,12 +429,15 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
                                           /*include_unreachable_instructions=*/
                                           true));
 
-  TF_ASSIGN_OR_RETURN(
-      module, backend->compiler()->RunHloPasses(std::move(module), executor));
+  TF_RETURN_IF_ERROR(MaybeDumpHloModule(*module));
 
   TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<Executable> executable,
-      backend->compiler()->RunBackend(std::move(module), executor));
+      module, backend->compiler()->RunHloPasses(std::move(module), executor,
+                                                device_allocator));
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
+                      backend->compiler()->RunBackend(
+                          std::move(module), executor, device_allocator));
 
   if (!other_directory_path.empty()) {
     executable->set_session_module(std::move(session_module));
@@ -446,11 +448,9 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
 
 StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
     const VersionedComputationHandle& versioned_handle,
-    std::unique_ptr<HloModuleConfig> module_config,
-    const tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-        arguments,
-    Backend* backend, perftools::gputools::StreamExecutor* executor,
-    ExecutionProfile* profile) {
+    std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
+    perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile,
+    DeviceMemoryAllocator* device_allocator) {
   std::shared_ptr<Executable> executable =
       compilation_cache_.LookUp(versioned_handle, *module_config);
 
@@ -471,8 +471,8 @@ StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
   HloModuleConfig original_module_config = *module_config;
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable_unique_ptr,
-      BuildExecutable(versioned_handle, std::move(module_config), arguments,
-                      backend, executor));
+      BuildExecutable(versioned_handle, std::move(module_config), backend,
+                      executor, device_allocator));
 
   if (profile != nullptr) {
     uint64 end_micros = tensorflow::Env::Default()->NowMicros();
@@ -489,9 +489,7 @@ StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
 StatusOr<std::vector<GlobalDataHandle>>
 Service::ExecuteParallelAndRegisterResult(
     tensorflow::gtl::ArraySlice<Executable*> executables,
-    tensorflow::gtl::ArraySlice<
-        std::vector<perftools::gputools::DeviceMemoryBase>>
-        arguments,
+    tensorflow::gtl::ArraySlice<std::vector<const ShapedBuffer*>> arguments,
     Backend* backend, tensorflow::gtl::ArraySlice<DeviceHandle> device_handles,
     tensorflow::gtl::ArraySlice<string> result_tags,
     ExecutionProfile* profile) {
@@ -547,7 +545,7 @@ Service::ExecuteParallelAndRegisterResult(
 
       // Asynchronously launch the computation.
       TF_ASSIGN_OR_RETURN(
-          perftools::gputools::DeviceMemoryBase result,
+          std::unique_ptr<ShapedBuffer> result,
           executables[i]->ExecuteAsyncOnStream(&run_options, arguments[i]));
 
       if (replica == 0 && profile != nullptr) {
@@ -557,17 +555,20 @@ Service::ExecuteParallelAndRegisterResult(
       // All replicas share the same device address for the result allocation,
       // so only one of the replicas need to register the result handle.
       if (replica == 0) {
-        result_handles.push_back(allocation_tracker_.Register(
-            backend, replicas[0]->device_ordinal(), result,
-            executables[i]->result_shape(), result_tags[i]));
+        TF_ASSIGN_OR_RETURN(
+            GlobalDataHandle handle,
+            allocation_tracker_.Register(std::move(result), result_tags[i]));
+        result_handles.push_back(handle);
       }
     }
   }
 
   // Wait for all executions to complete.
   for (int64 i = 0; i < streams.size(); ++i) {
-    if (!streams[i]->BlockHostUntilDone()) {
-      return InternalError("failed to complete execution for stream %lld", i);
+    Status block_status = streams[i]->BlockHostUntilDone();
+    if (!block_status.ok()) {
+      return InternalError("failed to complete execution for stream %lld: %s",
+                           i, block_status.error_message().c_str());
     }
   }
 
@@ -578,7 +579,7 @@ Service::ExecuteParallelAndRegisterResult(
     se::Stream* stream = index_to_profiled_stream.second;
     Executable* executable = executables[device];
     const HloModule& module = executable->module();
-    HloExecutionProfile hlo_profile(&executable->hlo_profile_printer(),
+    HloExecutionProfile hlo_profile(&executable->hlo_profile_printer_data(),
                                     &executable->hlo_profile_index_map());
     TF_RETURN_IF_ERROR(
         executable->PopulateExecutionProfile(&hlo_profile, stream->parent()));
@@ -625,8 +626,7 @@ Service::ExecuteParallelAndRegisterResult(
 
 StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
     Executable* executable,
-    const tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-        arguments,
+    const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     Backend* backend, perftools::gputools::StreamExecutor* executor,
     const string& result_tag, ExecutionProfile* profile) {
   // Set up streams.
@@ -651,6 +651,7 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
   for (const Pool<se::Stream>::SmartPtr& stream : streams) {
     ExecutableRunOptions options;
     options.set_stream(stream.get());
+    options.set_device_ordinal(stream->parent()->device_ordinal());
     options.set_allocator(backend->memory_allocator());
     options.set_inter_op_thread_pool(backend->inter_op_thread_pool());
     options.set_intra_op_thread_pool(
@@ -660,24 +661,21 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
                              backend->inter_op_thread_pool());
   }
 
-  perftools::gputools::DeviceMemoryBase result;
+  std::unique_ptr<ShapedBuffer> result;
   if (options_.number_of_replicas() == 1) {
-    TF_ASSIGN_OR_RETURN(
-        result, executable->ExecuteOnStreamWrapper<se::DeviceMemoryBase>(
-                    &run_options[0], profile, arguments));
+    TF_ASSIGN_OR_RETURN(result, executable->ExecuteOnStreamWrapper(
+                                    &run_options[0], profile, arguments));
   } else {
-    std::vector<
-        tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>>
+    // TODO(b/69985541): Support profiling also on this path.
+    std::vector<tensorflow::gtl::ArraySlice<const ShapedBuffer*>>
         repeated_arguments(options_.number_of_replicas(), arguments);
 
     TF_ASSIGN_OR_RETURN(auto results, executable->ExecuteOnStreams(
                                           run_options, repeated_arguments));
     TF_RET_CHECK(!results.empty());
-    result = results[0];
+    result = std::move(results[0]);
   }
-  return allocation_tracker_.Register(backend, executor->device_ordinal(),
-                                      result, executable->result_shape(),
-                                      result_tag);
+  return allocation_tracker_.Register(std::move(result), result_tag);
 }
 
 tensorflow::Status Service::SetReturnValue(const SetReturnValueRequest* arg,
@@ -691,7 +689,7 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
                                             ExecuteParallelResponse* result) {
   VLOG(1) << "running execute-parallel request: " << arg->ShortDebugString();
 
-  std::vector<std::vector<se::DeviceMemoryBase>> all_arguments;
+  std::vector<std::vector<const ShapedBuffer*>> all_arguments;
   std::vector<std::vector<perftools::gputools::StreamExecutor*>> all_executors;
   std::vector<VersionedComputationHandle> versioned_handles;
   std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
@@ -748,20 +746,16 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
     // In the case of partitioned computations, assume all arguments go on the
     // zeroth core.
     TF_ASSIGN_OR_RETURN(
-        std::vector<const Allocation*> arg_allocations,
-        ResolveAndValidateArguments(request.arguments(), execute_backend_.get(),
+        std::vector<const ShapedBuffer*> arguments,
+        ResolveAndValidateArguments(request.arguments(),
                                     executors[0]->device_ordinal()));
-    std::vector<se::DeviceMemoryBase> arguments;
-    arguments.reserve(arg_allocations.size());
-    for (const Allocation* allocation : arg_allocations) {
-      arguments.push_back(allocation->device_memory());
-    }
 
     // Create an HloModuleConfig object for the computation, given the shape of
     // the program and the argument allocations.
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                        CreateModuleConfig(*program_shape, arg_allocations,
-                                           request.execution_options()));
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<HloModuleConfig> module_config,
+        CreateModuleConfig(*program_shape, arguments,
+                           request.execution_options(), *user_computation));
     VLOG(3) << "ExecuteParallel created HloModuleConfig computation layout: "
             << module_config->entry_computation_layout().ToString();
 
@@ -780,10 +774,14 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
 
   // Build the user computations into HloModules and compile to generate the
   // executables.
+  //
+  // TODO(jlebar): There's currently no way to pass a device allocator to
+  // ExecuteParallel, so we have to pass a null device_allocator below.
   TF_ASSIGN_OR_RETURN(
       std::vector<std::unique_ptr<Executable>> executables,
       BuildExecutables(versioned_handles, std::move(module_configs),
-                       execute_backend_.get(), all_executors));
+                       execute_backend_.get(), all_executors,
+                       /*device_allocator=*/nullptr));
   std::vector<Executable*> executable_ptrs;
   executable_ptrs.reserve(executables.size());
   for (const auto& executable : executables) {
@@ -863,35 +861,31 @@ tensorflow::Status Service::Execute(const ExecuteRequest* arg,
       user_computation->ComputeProgramShape(versioned_handle.version));
 
   TF_ASSIGN_OR_RETURN(
-      std::vector<const Allocation*> arg_allocations,
-      ResolveAndValidateArguments(arg->arguments(), execute_backend_.get(),
+      std::vector<const ShapedBuffer*> arguments,
+      ResolveAndValidateArguments(arg->arguments(),
                                   execute_backend_->default_device_ordinal()));
 
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(*program_shape, arg_allocations,
-                                         arg->execution_options()));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloModuleConfig> module_config,
+      CreateModuleConfig(*program_shape, arguments, arg->execution_options(),
+                         *user_computation));
 
   VLOG(3) << "Execute created HloModuleConfig computation layout: "
           << module_config->entry_computation_layout().ToString();
 
-  std::vector<se::DeviceMemoryBase> arguments;
-  arguments.reserve(arg_allocations.size());
-  for (const Allocation* allocation : arg_allocations) {
-    arguments.push_back(allocation->device_memory());
-  }
-
   TF_ASSIGN_OR_RETURN(
       std::shared_ptr<Executable> executable,
       BuildAndCacheExecutable(versioned_handle, std::move(module_config),
-                              arguments, execute_backend_.get(),
+                              execute_backend_.get(),
                               execute_backend_->default_stream_executor(),
                               result->mutable_profile()));
 
   if (executable->dumping()) {
     executable->session_module()->set_execution_platform(
         execute_backend_->platform()->Name());
-    TF_RETURN_IF_ERROR(
-        RecordArguments(arg_allocations, executable->session_module()));
+    TF_RETURN_IF_ERROR(RecordArguments(
+        arguments, execute_backend_->default_stream_executor(),
+        execute_backend_->transfer_manager(), executable->session_module()));
   }
 
   TF_ASSIGN_OR_RETURN(
@@ -902,10 +896,11 @@ tensorflow::Status Service::Execute(const ExecuteRequest* arg,
           "result of " + user_computation->name(), result->mutable_profile()));
 
   if (executable->dumping()) {
-    TF_ASSIGN_OR_RETURN(const Allocation* result_allocation,
+    TF_ASSIGN_OR_RETURN(const ShapedBuffer* result_buffer,
                         allocation_tracker_.Resolve(result->output()));
-    TF_RETURN_IF_ERROR(
-        RecordResult(result_allocation, executable->session_module()));
+    TF_RETURN_IF_ERROR(RecordResult(
+        *result_buffer, execute_backend_->default_stream_executor(),
+        execute_backend_->transfer_manager(), executable->session_module()));
     TF_RETURN_IF_ERROR(executable->DumpSessionModule());
   }
 
@@ -931,31 +926,25 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
       user_computation->ComputeProgramShape(versioned_handle.version));
 
   TF_ASSIGN_OR_RETURN(
-      std::vector<const Allocation*> arg_allocations,
-      ResolveAndValidateArguments(arg->arguments(), execute_backend_.get(),
+      std::vector<const ShapedBuffer*> arguments,
+      ResolveAndValidateArguments(arg->arguments(),
                                   execute_backend_->default_device_ordinal()));
 
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(*program_shape, arg_allocations,
-                                         arg->execution_options()));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloModuleConfig> module_config,
+      CreateModuleConfig(*program_shape, arguments, arg->execution_options(),
+                         *user_computation));
 
   VLOG(3) << "ExecuteAsync created HloModuleConfig computation layout: "
           << module_config->entry_computation_layout().ToString();
 
-  std::vector<se::DeviceMemoryBase> arguments;
-  arguments.reserve(arg_allocations.size());
-  for (const Allocation* allocation : arg_allocations) {
-    arguments.push_back(allocation->device_memory());
-  }
-
   ExecutionProfile profile;
 
   TF_ASSIGN_OR_RETURN(
       std::shared_ptr<Executable> executable,
-      BuildAndCacheExecutable(versioned_handle, std::move(module_config),
-                              arguments, execute_backend_.get(),
-                              execute_backend_->default_stream_executor(),
-                              &profile));
+      BuildAndCacheExecutable(
+          versioned_handle, std::move(module_config), execute_backend_.get(),
+          execute_backend_->default_stream_executor(), &profile));
 
   TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_,
                                               SingleComputationDeviceHandle()));
@@ -970,7 +959,7 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
     streams.push_back(std::move(stream));
   }
 
-  perftools::gputools::DeviceMemoryBase result_data;
+  std::unique_ptr<ShapedBuffer> result_buffer;
   for (const Pool<se::Stream>::SmartPtr& stream : streams) {
     ExecutableRunOptions options;
     options.set_stream(stream.get());
@@ -983,19 +972,19 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
         options, execute_backend_->StreamBorrower());
 
     TF_ASSIGN_OR_RETURN(
-        perftools::gputools::DeviceMemoryBase this_result_data,
+        std::unique_ptr<ShapedBuffer> this_result_buffer,
         executable->ExecuteAsyncOnStream(&service_options, arguments));
 
     // Take the first result.
-    if (result_data == nullptr) {
-      result_data = this_result_data;
+    if (result_buffer == nullptr) {
+      result_buffer = std::move(this_result_buffer);
     }
   }
 
-  auto output = allocation_tracker_.Register(
-      execute_backend_.get(), execute_backend_->default_device_ordinal(),
-      result_data, executable->result_shape(),
-      "result of " + user_computation->name());
+  TF_ASSIGN_OR_RETURN(
+      GlobalDataHandle output,
+      allocation_tracker_.Register(std::move(result_buffer),
+                                   "result of " + user_computation->name()));
 
   *result->mutable_execution() = execution_tracker_.Register(
       execute_backend_.get(), std::move(streams), profile, output);
@@ -1022,37 +1011,58 @@ tensorflow::Status Service::WaitForExecution(const WaitForExecutionRequest* arg,
 
 tensorflow::Status Service::TransferToClient(const TransferToClientRequest* arg,
                                              TransferToClientResponse* result) {
-  TF_ASSIGN_OR_RETURN(const Allocation* allocation,
+  TF_ASSIGN_OR_RETURN(const ShapedBuffer* shaped_buffer,
                       allocation_tracker_.Resolve(arg->data()));
 
-  const Shape* literal_shape;
+  const Shape* return_shape;
   if (arg->has_shape_with_layout()) {
     if (!LayoutUtil::HasLayout(arg->shape_with_layout())) {
       return InvalidArgument("shape_with_layout must have layout if present.");
     }
-    literal_shape = &arg->shape_with_layout();
+    return_shape = &arg->shape_with_layout();
   } else {
-    literal_shape = &allocation->shape();
+    return_shape = &shaped_buffer->on_host_shape();
   }
 
-  Literal literal;
-  TF_RETURN_IF_ERROR(
-      LiteralFromAllocation(allocation, *literal_shape, &literal));
-  *result->mutable_literal() = literal.ToProto();
+  TF_ASSIGN_OR_RETURN(
+      se::StreamExecutor * executor,
+      execute_backend_->stream_executor(shaped_buffer->device_ordinal()));
+
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<Literal> result_literal,
+      execute_backend_->transfer_manager()->TransferLiteralFromDevice(
+          executor, *shaped_buffer));
+
+  if (LayoutUtil::LayoutsInShapesEqual(*return_shape,
+                                       result_literal->shape())) {
+    *result->mutable_literal() = result_literal->ToProto();
+  } else {
+    *result->mutable_literal() =
+        result_literal->Relayout(*return_shape)->ToProto();
+  }
   return tensorflow::Status::OK();
 }
 
+namespace {
+
+// Creates a clone of the given shaped buffer with the given device ordinal. The
+// shape and DeviceMemoryBase values of the clone are identical to the original.
+std::unique_ptr<ShapedBuffer> CloneShapedBufferOnDevice(
+    const ShapedBuffer& shaped_buffer, int device_ordinal) {
+  auto clone = MakeUnique<ShapedBuffer>(
+      shaped_buffer.on_host_shape(), shaped_buffer.on_device_shape(),
+      shaped_buffer.platform(), device_ordinal);
+  clone->buffers() = shaped_buffer.buffers();
+  return clone;
+}
+
+}  // namespace
+
 tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg,
                                              TransferToServerResponse* result) {
-  Literal literal = Literal(arg->literal());
-  const Shape& shape = literal.shape();
-
-  if (ShapeUtil::IsTuple(shape) && options_.number_of_replicas() > 1) {
-    // TODO(b/32990684): Tuple transfers to host end up allocating further
-    // buffers - implement that correctly.
-    return Unimplemented(
-        "Tuple transfers to the device not supported with replication.");
-  }
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> literal,
+                      Literal::CreateFromProto(arg->literal()));
+  const Shape& shape = literal->shape();
 
   std::vector<se::StreamExecutor*> replicas;
   if (arg->has_device_handle()) {
@@ -1063,25 +1073,38 @@ tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg,
         replicas, Replicas(*execute_backend_, SingleComputationDeviceHandle()));
   }
 
-  // Allocate memory on the device, using the stream executor. The size of the
-  // allocation is obtained by examining the shape of the literal passed from
-  // the client. An allocation handle is returned in the response.
-  int64 allocation_size =
-      execute_backend_->transfer_manager()->GetByteSizeRequirement(shape);
-
-  TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase allocation,
-                      execute_backend_->memory_allocator()->Allocate(
-                          replicas[0]->device_ordinal(), allocation_size));
-
-  *result->mutable_data() = allocation_tracker_.Register(
-      execute_backend_.get(), replicas[0]->device_ordinal(), allocation, shape,
-      StrCat("TransferToServer literal of size ", allocation_size));
+  // All memory allocation is done on the first replica. The allocations in all
+  // other replicas mirror the firsts'.
+  int master_device_ordinal = replicas[0]->device_ordinal();
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<ShapedBuffer> shaped_buffer,
+      execute_backend_->transfer_manager()->AllocateShapedBuffer(
+          shape, execute_backend_->memory_allocator(), master_device_ordinal));
 
+  // Transfer the data to the replicas.
   for (se::StreamExecutor* executor : replicas) {
-    TF_RETURN_IF_ERROR(
-        execute_backend_->transfer_manager()->TransferLiteralToDevice(
-            executor, literal, &allocation));
+    if (executor->device_ordinal() == master_device_ordinal) {
+      TF_RETURN_IF_ERROR(
+          execute_backend_->transfer_manager()->TransferLiteralToDevice(
+              executor, *literal, *shaped_buffer));
+    } else {
+      // The replica is not the master. Create an cloned shaped buffer with
+      // the replica's device ordinal. This is required because
+      // TransferLiteralToDevice verifies that the device ordinal of the shaped
+      // buffer matches that of the executor.
+      std::unique_ptr<ShapedBuffer> clone =
+          CloneShapedBufferOnDevice(*shaped_buffer, executor->device_ordinal());
+      TF_RETURN_IF_ERROR(
+          execute_backend_->transfer_manager()->TransferLiteralToDevice(
+              executor, *literal, *clone));
+    }
   }
+  TF_ASSIGN_OR_RETURN(
+      *result->mutable_data(),
+      allocation_tracker_.Register(std::move(shaped_buffer),
+                                   StrCat("TransferToServer literal of shape ",
+                                          ShapeUtil::HumanString(shape))));
+
   return tensorflow::Status::OK();
 }
 
@@ -1109,8 +1132,10 @@ tensorflow::Status Service::TransferToInfeed(const TransferToInfeedRequest* arg,
     executor = replicas[arg->replica_id()];
   }
 
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> literal,
+                      Literal::CreateFromProto(arg->literal()));
   return execute_backend_->transfer_manager()->TransferLiteralToInfeed(
-      executor, Literal(arg->literal()));
+      executor, *literal);
 }
 
 tensorflow::Status Service::TransferFromOutfeed(
@@ -1185,7 +1210,22 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
       bool is_constant,
       user_computation->IsConstant(arg->operand(), arg->parameters_size()));
   if (!is_constant) {
-    return InvalidArgument("Operand to ComputeConstant depends on parameter.");
+    StatusOr<const OperationRequest*> op_request_status =
+        user_computation->LookUpRequestForErrorReporting(arg->operand());
+    string op_request_string = "<unknown operation>";
+    if (op_request_status.ok()) {
+      op_request_string = op_request_status.ValueOrDie()->ShortDebugString();
+    }
+    return InvalidArgument(
+        "Operand to ComputeConstant depends on a parameter.\n\n"
+        "  op requested for constant evaluation: %s\n\n"
+        "This is an internal error that typically happens when the XLA user "
+        "(e.g. TensorFlow) is attempting to determine a value that must be a "
+        "compile-time constant (e.g. an array dimension) but it is not capable "
+        "of being evaluated at XLA compile time.\n\n"
+        "Please file a usability bug with the framework being used (e.g. "
+        "TensorFlow).",
+        op_request_string.c_str());
   }
 
   // We can't use ComputeProgramShape because it checks that all parameter
@@ -1213,7 +1253,8 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
   }
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(program_shape, {}, execution_options));
+                      CreateModuleConfig(program_shape, {}, execution_options,
+                                         *user_computation));
 
   // Exclude dead parameter instructions for the purpose of computing constants.
   TF_ASSIGN_OR_RETURN(
@@ -1222,18 +1263,16 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
                                           /*include_unreachable_instructions=*/
                                           false));
 
-  std::vector<Literal> parameters(arg->parameters_size());
+  std::vector<std::unique_ptr<Literal>> parameters(arg->parameters_size());
   for (int64 i = 0; i < arg->parameters_size(); ++i) {
-    parameters[i] = Literal(arg->parameters(i));
+    TF_ASSIGN_OR_RETURN(parameters[i],
+                        Literal::CreateFromProto(arg->parameters(i)));
   }
-  std::vector<const Literal*> parameter_ptrs;
-  std::transform(parameters.begin(), parameters.end(),
-                 std::back_inserter(parameter_ptrs),
-                 [](const Literal& literal) { return &literal; });
-
   HloEvaluator evaluator;
-  TF_ASSIGN_OR_RETURN(auto result_literal,
-                      evaluator.Evaluate(*module, parameter_ptrs));
+  TF_ASSIGN_OR_RETURN(
+      auto result_literal,
+      evaluator.Evaluate<std::unique_ptr<Literal>>(*module, parameters));
+
   // Since the shape_with_output_layout option in ExecutionOption is
   // non-effective to the Evaluator results, explicit relayout here.
   if (arg->has_output_layout()) {
@@ -1246,9 +1285,9 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
 
 tensorflow::Status Service::GetShape(const GetShapeRequest* arg,
                                      GetShapeResponse* result) {
-  TF_ASSIGN_OR_RETURN(const Allocation* allocation,
+  TF_ASSIGN_OR_RETURN(const ShapedBuffer* buffer,
                       allocation_tracker_.Resolve(arg->data()));
-  *result->mutable_shape() = allocation->shape();
+  *result->mutable_shape() = buffer->on_host_shape();
   return tensorflow::Status::OK();
 }
 
@@ -1357,6 +1396,17 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
       handle_status =
           computation->AddConcatenateInstruction(arg->concatenate_request());
       break;
+    case OpRequest::kConditionalRequest: {
+      TF_ASSIGN_OR_RETURN(UserComputation * true_computation,
+                          computation_tracker_.Resolve(
+                              arg->conditional_request().true_computation()));
+      TF_ASSIGN_OR_RETURN(UserComputation * false_computation,
+                          computation_tracker_.Resolve(
+                              arg->conditional_request().false_computation()));
+      handle_status = computation->AddConditionalInstruction(
+          arg->conditional_request(), *true_computation, *false_computation);
+      break;
+    }
     case OpRequest::kConstantRequest:
       handle_status =
           computation->AddConstantInstruction(arg->constant_request());
@@ -1381,6 +1431,9 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
       handle_status =
           computation->AddCustomCallInstruction(arg->custom_call_request());
       break;
+    case OpRequest::kDotRequest:
+      handle_status = computation->AddDotInstruction(arg->dot_request());
+      break;
     case OpRequest::kDynamicSliceRequest:
       handle_status =
           computation->AddDynamicSliceInstruction(arg->dynamic_slice_request());
@@ -1389,6 +1442,9 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
       handle_status = computation->AddDynamicUpdateSliceInstruction(
           arg->dynamic_update_slice_request());
       break;
+    case OpRequest::kFftRequest:
+      handle_status = computation->AddFftInstruction(arg->fft_request());
+      break;
     case OpRequest::kGetTupleElementRequest:
       handle_status = computation->AddGetTupleElementInstruction(
           arg->get_tuple_element_request());
@@ -1397,9 +1453,9 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
       handle_status = computation->AddInfeedInstruction(arg->infeed_request());
       break;
     case OpRequest::kOutfeedRequest:
-      TF_RETURN_IF_ERROR(
-          computation->AddOutfeedInstruction(arg->outfeed_request()));
-      return tensorflow::Status::OK();
+      handle_status =
+          computation->AddOutfeedInstruction(arg->outfeed_request());
+      break;
     case OpRequest::kMapRequest: {
       TF_ASSIGN_OR_RETURN(
           UserComputation * to_apply,
@@ -1501,8 +1557,10 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
       handle_status = computation->AddRecvInstruction(arg->recv_request());
       break;
     }
+    case OpRequest::OP_NOT_SET:
+      return InvalidArgument("XLA service received OpRequest with OP_NOT_SET");
     default:
-      return InvalidArgument("Unsupported operation");
+      return InvalidArgument("Unsupported operation in XLA service");
   }
   TF_ASSIGN_OR_RETURN(*result->mutable_output(), handle_status);
 
@@ -1560,4 +1618,15 @@ StatusOr<std::vector<perftools::gputools::StreamExecutor*>> Service::Replicas(
   return replicas;
 }
 
+Status Service::MaybeDumpHloModule(const HloModule& module) const {
+  const string xla_dump_unoptimized_hlo_proto_to =
+      module.config().debug_options().xla_dump_unoptimized_hlo_proto_to();
+  if (xla_dump_unoptimized_hlo_proto_to.empty()) {
+    return Status::OK();
+  }
+  HloProto proto = MakeHloProto(module);
+  return protobuf_util::DumpProtoToDirectory(
+      proto, xla_dump_unoptimized_hlo_proto_to, module.name());
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index 47f4f0ade594089aa71717ef1e122886b0a6c7ac..6ce241971156599aaa25aea1b0caac0e1bd5379c 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -250,8 +250,9 @@ class Service : public ServiceInterface {
   // class.
   StatusOr<std::unique_ptr<HloModuleConfig>> CreateModuleConfig(
       const ProgramShape& program_shape,
-      tensorflow::gtl::ArraySlice<const Allocation*> arguments,
-      const ExecutionOptions& execution_options);
+      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
+      const ExecutionOptions& execution_options,
+      const UserComputation& user_computation);
 
  protected:
   friend class LocalExecutable;
@@ -265,25 +266,29 @@ class Service : public ServiceInterface {
 
   // Resolves the given argument handles in the allocation tracker and returns
   // the corresponding allocations. The function also verifies that each
-  // allocation matches the given backend and device ordinal.
-  StatusOr<std::vector<const Allocation*>> ResolveAndValidateArguments(
+  // allocation matches the execution platform and device ordinal.
+  StatusOr<std::vector<const ShapedBuffer*>> ResolveAndValidateArguments(
       tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
-      const Backend* backend, int device_ordinal);
+      int device_ordinal);
 
   // Create a Hlo module config for the given program shape and arguments.
   // execution_options is optional; if not given a default is used.
   StatusOr<std::unique_ptr<HloModuleConfig>> CreateModuleConfig(
       const ProgramShape& program_shape,
       tensorflow::gtl::ArraySlice<const Shape*> argument_shapes,
-      const ExecutionOptions* execution_options);
+      const ExecutionOptions* execution_options,
+      const UserComputation& user_computation);
 
   // Builds an Executable for the given parameters.
+  //
+  // If device_allocator is not null, the compiler may use it to allocate temp
+  // buffers, which the compiler is responsible for freeing.  The allocator
+  // given here need not match the allocator used when running the executable.
   StatusOr<std::unique_ptr<Executable>> BuildExecutable(
       const VersionedComputationHandle& versioned_handle,
-      std::unique_ptr<HloModuleConfig> module_config,
-      const tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
-      Backend* backend, perftools::gputools::StreamExecutor* executor);
+      std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
+      perftools::gputools::StreamExecutor* executor,
+      DeviceMemoryAllocator* device_allocator = nullptr);
 
   // Same as BuildExecutable() above, but builds a list of Executables for the
   // given computations that may interact with each other.
@@ -291,18 +296,17 @@ class Service : public ServiceInterface {
       std::vector<VersionedComputationHandle> versioned_handles,
       std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
       Backend* backend,
-      std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors);
+      std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+      DeviceMemoryAllocator* device_allocator);
 
   // Similar to BuildExecutable, but look in the compilation cache for the
   // executable first. If the executable is not in the cache, it is built and
   // inserted into the cache.
   StatusOr<std::shared_ptr<Executable>> BuildAndCacheExecutable(
       const VersionedComputationHandle& versioned_handle,
-      std::unique_ptr<HloModuleConfig> module_config,
-      const tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
-      Backend* backend, perftools::gputools::StreamExecutor* executor,
-      ExecutionProfile* profile);
+      std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
+      perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile,
+      DeviceMemoryAllocator* device_allocator = nullptr);
 
   // Runs the given executable with the given arguments and register the result
   // in the allocation tracker. The handle of the result from the tracker is
@@ -310,8 +314,7 @@ class Service : public ServiceInterface {
   // ExecutionProfile object which will be filled in with profile data.
   StatusOr<GlobalDataHandle> ExecuteAndRegisterResult(
       Executable* executable,
-      const tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
+      const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       Backend* backend, perftools::gputools::StreamExecutor* executor,
       const string& result_tag, ExecutionProfile* profile);
 
@@ -320,9 +323,7 @@ class Service : public ServiceInterface {
   // from the tracker are returned.
   StatusOr<std::vector<GlobalDataHandle>> ExecuteParallelAndRegisterResult(
       tensorflow::gtl::ArraySlice<Executable*> executables,
-      tensorflow::gtl::ArraySlice<
-          std::vector<perftools::gputools::DeviceMemoryBase>>
-          arguments,
+      tensorflow::gtl::ArraySlice<std::vector<const ShapedBuffer*>> arguments,
       Backend* backend,
       tensorflow::gtl::ArraySlice<DeviceHandle> device_handles,
       tensorflow::gtl::ArraySlice<string> result_tags,
@@ -347,6 +348,8 @@ class Service : public ServiceInterface {
   StatusOr<std::vector<perftools::gputools::StreamExecutor*>> Replicas(
       const Backend& backend, const DeviceHandle& device_handle) const;
 
+  Status MaybeDumpHloModule(const HloModule& module) const;
+
   // Returns the device handle that represents the replicated device for a
   // single computation that is not model-parallelized.
   DeviceHandle SingleComputationDeviceHandle() const;
diff --git a/tensorflow/compiler/xla/service/service_executable_run_options.h b/tensorflow/compiler/xla/service/service_executable_run_options.h
index 017e5ef09ed2f52b862821e9408540d188a1edf5..6c1f8feac7ed4423051cf2737be57dcfab508671 100644
--- a/tensorflow/compiler/xla/service/service_executable_run_options.h
+++ b/tensorflow/compiler/xla/service/service_executable_run_options.h
@@ -30,6 +30,9 @@ class ServiceExecutableRunOptions {
   using StreamBorrower =
       std::function<StatusOr<Pool<perftools::gputools::Stream>::SmartPtr>(int)>;
 
+  ServiceExecutableRunOptions()
+      : ServiceExecutableRunOptions(ExecutableRunOptions()) {}
+
   explicit ServiceExecutableRunOptions(
       ExecutableRunOptions run_options, StreamBorrower borrow_stream = nullptr,
       tensorflow::thread::ThreadPool* xla_intra_op_thread_pool = nullptr)
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index 3df1911d07cf0cd123604b1fac63923a725a37c6..004889b5f216015ee1e1308702b2bf4cb0deb344 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -36,6 +37,9 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 
+using tensorflow::str_util::Join;
+using tensorflow::strings::Printf;
+
 namespace xla {
 
 namespace {
@@ -90,8 +94,6 @@ BinaryOperation OpcodeToBinaryOperation(HloOpcode opcode) {
       return BINOP_ATAN2;
     case HloOpcode::kComplex:
       return BINOP_COMPLEX;
-    case HloOpcode::kDot:
-      return BINOP_DOT;
     case HloOpcode::kMultiply:
       return BINOP_MUL;
     case HloOpcode::kAdd:
@@ -207,7 +209,8 @@ tensorflow::Status VerifyReducerShape(const ProgramShape& reducer_shape,
   }
 
   // Check that init_value's shape is suitable for reducer_shape.
-  if (!ShapeUtil::Compatible(accumulator_shape, init_value_shape)) {
+  if (!ShapeUtil::CompatibleIgnoringFpPrecision(accumulator_shape,
+                                                init_value_shape)) {
     return InvalidArgument(
         "Reduction function's accumulator shape differs from the "
         "init_value shape: %s vs %s",
@@ -218,8 +221,8 @@ tensorflow::Status VerifyReducerShape(const ProgramShape& reducer_shape,
   // Check that the inputs can be passed in as the second argument.
   const Shape& input_element_shape =
       ShapeUtil::MakeShape(input_element_type, {});
-  if (!ShapeUtil::Compatible(input_element_shape,
-                             reducer_shape.parameters(1))) {
+  if (!ShapeUtil::CompatibleIgnoringFpPrecision(input_element_shape,
+                                                reducer_shape.parameters(1))) {
     return InvalidArgument(
         "Reduction function's second parameter shape differs from the "
         "input type element type: %s vs %s",
@@ -229,7 +232,8 @@ tensorflow::Status VerifyReducerShape(const ProgramShape& reducer_shape,
 
   // Currently the accumulator and inputs must be the same type,
   // though that restriction could be relaxed.
-  if (!ShapeUtil::Compatible(accumulator_shape, reducer_shape.parameters(1))) {
+  if (!ShapeUtil::CompatibleIgnoringFpPrecision(accumulator_shape,
+                                                reducer_shape.parameters(1))) {
     return InvalidArgument(
         "Reduction function's second parameter shape currently must "
         "match the result shape. Got %s vs %s",
@@ -392,11 +396,13 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
                            dimension);
   }
   const Shape* arg_shape = nullptr;
+  PrimitiveType element_type = PRIMITIVE_TYPE_INVALID;
   for (const Shape* shape : arg_shapes) {
     TF_RETURN_IF_ERROR(
         ExpectNotTupleOrOpaque(*shape, "operand of concatenation"));
     if (!arg_shape) {
       arg_shape = shape;
+      element_type = arg_shape->element_type();
       continue;
     }
     if (ShapeUtil::Rank(*arg_shape) != ShapeUtil::Rank(*shape)) {
@@ -407,7 +413,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
           ShapeUtil::HumanString(*arg_shape).c_str(), ShapeUtil::Rank(*shape),
           ShapeUtil::HumanString(*shape).c_str());
     }
-    if (arg_shape->element_type() != shape->element_type()) {
+    if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(*arg_shape, *shape)) {
       return InvalidArgument(
           "cannot concatenate arrays with different element types: %s vs %s",
           PrimitiveType_Name(arg_shape->element_type()).c_str(),
@@ -429,6 +435,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
             ShapeUtil::HumanString(*shape).c_str(), dimension);
       }
     }
+    element_type = ShapeUtil::HigherPrecisionElementType(*shape, *arg_shape);
   }
 
   std::vector<int64> new_dimensions(arg_shape->dimensions().begin(),
@@ -436,7 +443,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
   for (size_t i = 1; i < arg_shapes.size(); ++i) {
     new_dimensions[dimension] += arg_shapes[i]->dimensions(dimension);
   }
-  return ShapeUtil::MakeShape(arg_shape->element_type(), new_dimensions);
+  return ShapeUtil::MakeShape(element_type, new_dimensions);
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferConvertShape(
@@ -534,7 +541,8 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
         ShapeUtil::HumanString(operand_shape).c_str(),
         padding_config.ShortDebugString().c_str());
   }
-  if (operand_shape.element_type() != padding_value_shape.element_type()) {
+  if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(operand_shape,
+                                                     padding_value_shape)) {
     return InvalidArgument(
         "the element types of the operands to pad do not match");
   }
@@ -546,11 +554,118 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
                     std::max<int64>(operand_shape.dimensions(i) - 1, 0LL) *
                         padding_config.dimensions(i).interior_padding();
   }
-  return ShapeUtil::MakeShape(operand_shape.element_type(), dimensions);
+  return ShapeUtil::MakeShape(
+      ShapeUtil::HigherPrecisionElementType(operand_shape, padding_value_shape),
+      dimensions);
+}
+
+// Current DotDimensionNumbers Requirements:
+//
+// Contracting Dimensions:
+// *) Exactly one contracting dimension on both lhs and rhs.
+// *) Contracting dimension size must be the same on both lhs and rhs.
+// *) Contracting dimension numbers do not need to be the same (i.e. transposes
+//    are passed on to emitter implementations).
+//
+// Batch Dimensions:
+// *) Same number of batch dimensions on both lhs and rhs.
+// *) Same batch dimension numbers (and sizes) on both lhs and rhs.
+// *) Batch dimension numbers must be ordered before contracting and
+//    non-contracting/non-batch dimension numbers.
+//
+// Non-Contracting-Non-Batch Dimensions:
+// *) Can be 0 (matrix-vector) or 1 (matrix-matrix).
+//
+
+namespace {
+
+Status ValidateDotDimensionNumbers(
+    const Shape& lhs, const Shape& rhs,
+    const DotDimensionNumbers& dimension_numbers) {
+  // Check that dimension numbers are in range.
+  auto dims_in_range =
+      [](const int64 rank, tensorflow::gtl::ArraySlice<int64> contracting_dims,
+         tensorflow::gtl::ArraySlice<int64> batch_dims) -> bool {
+    auto in_range = [&rank](int64 i) -> bool { return 0 <= i && i < rank; };
+    return std::all_of(contracting_dims.begin(), contracting_dims.end(),
+                       in_range) &&
+           std::all_of(batch_dims.begin(), batch_dims.end(), in_range);
+  };
+
+  tensorflow::gtl::ArraySlice<int64> lhs_contracting_dimensions =
+      AsInt64Slice(dimension_numbers.lhs_contracting_dimensions());
+  tensorflow::gtl::ArraySlice<int64> rhs_contracting_dimensions =
+      AsInt64Slice(dimension_numbers.rhs_contracting_dimensions());
+  tensorflow::gtl::ArraySlice<int64> lhs_batch_dimensions =
+      AsInt64Slice(dimension_numbers.lhs_batch_dimensions());
+  tensorflow::gtl::ArraySlice<int64> rhs_batch_dimensions =
+      AsInt64Slice(dimension_numbers.rhs_batch_dimensions());
+
+  if (!dims_in_range(ShapeUtil::Rank(lhs), lhs_contracting_dimensions,
+                     lhs_batch_dimensions) ||
+      !dims_in_range(ShapeUtil::Rank(rhs), rhs_contracting_dimensions,
+                     rhs_batch_dimensions)) {
+    return InvalidArgument("A dimension number is out of range in dot: %s",
+                           dimension_numbers.DebugString().c_str());
+  }
+
+  // Check that dimension numbers are unique.
+  auto dims_unique = [](tensorflow::gtl::ArraySlice<int64> contracting_dims,
+                        tensorflow::gtl::ArraySlice<int64> batch_dims) -> bool {
+    tensorflow::gtl::FlatSet<int64> dim_set;
+    auto is_unique = [&dim_set](int64 i) -> bool {
+      return dim_set.insert(i).second;
+    };
+    return std::all_of(contracting_dims.begin(), contracting_dims.end(),
+                       is_unique) &&
+           std::all_of(batch_dims.begin(), batch_dims.end(), is_unique);
+  };
+
+  if (!dims_unique(lhs_contracting_dimensions, lhs_batch_dimensions) ||
+      !dims_unique(rhs_contracting_dimensions, rhs_batch_dimensions)) {
+    return InvalidArgument("A dimension number is not unique in dot: %s",
+                           dimension_numbers.DebugString().c_str());
+  }
+
+  // Check that the count of non-contracting-non-batch dimensions is in {0, 1}.
+  const int64 lhs_non_contracting_non_batch_dims =
+      ShapeUtil::Rank(lhs) -
+      dimension_numbers.lhs_contracting_dimensions_size() -
+      dimension_numbers.lhs_batch_dimensions_size();
+  const int64 rhs_non_contracting_non_batch_dims =
+      ShapeUtil::Rank(rhs) -
+      dimension_numbers.rhs_contracting_dimensions_size() -
+      dimension_numbers.rhs_batch_dimensions_size();
+  if (lhs_non_contracting_non_batch_dims < 0 ||
+      lhs_non_contracting_non_batch_dims > 1 ||
+      rhs_non_contracting_non_batch_dims < 0 ||
+      rhs_non_contracting_non_batch_dims > 1) {
+    return InvalidArgument(
+        "batch and contracting dimension number mismatch "
+        "with rank ");
+  }
+
+  // Check that batch dimension numbers are ordered before all others, and
+  // that they are monotonically increasing.
+  std::vector<int64> batch_dim_numbers(lhs_batch_dimensions.size());
+  std::iota(batch_dim_numbers.begin(), batch_dim_numbers.end(), 0);
+  if (!std::equal(batch_dim_numbers.begin(), batch_dim_numbers.end(),
+                  lhs_batch_dimensions.begin()) ||
+      !std::equal(batch_dim_numbers.begin(), batch_dim_numbers.end(),
+                  rhs_batch_dimensions.begin())) {
+    return InvalidArgument(
+        "batch dimension numbers must precede non-batch dimensions and be"
+        "monotonically increasing.");
+  }
+
+  return Status::OK();
 }
 
-/* static */ StatusOr<Shape> ShapeInference::InferDotOpShape(const Shape& lhs,
-                                                             const Shape& rhs) {
+}  // namespace
+
+/* static */ StatusOr<Shape> ShapeInference::InferDotOpShape(
+    const Shape& lhs, const Shape& rhs,
+    const DotDimensionNumbers& dimension_numbers) {
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(lhs, "lhs of dot"));
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(rhs, "rhs of dot"));
 
@@ -566,45 +681,71 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
   };
 
   // Check if both element types are the same.
-  if (lhs.element_type() != rhs.element_type()) {
+  if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) {
     return fail("element types do not match");
   }
 
-  if (ShapeUtil::Rank(lhs) < 1 || ShapeUtil::Rank(lhs) > 2 ||
-      ShapeUtil::Rank(rhs) < 1 || ShapeUtil::Rank(rhs) > 2) {
-    return fail("dot only supports rank 1 or 2");
+  if ((ShapeUtil::Rank(lhs) < 1) || (ShapeUtil::Rank(rhs) < 1)) {
+    return fail("dot only supports rank 1 or above.");
   }
 
-  // Determine the index of the contracted dimensions for input tensors.
-  // dimensions -1 of lhs and dimension 0 of rhs are contracted.
-  int64 lhs_contracted_dimension = ShapeUtil::GetDimensionNumber(lhs, -1);
-  int64 rhs_contracted_dimension = 0;
+  // Validate basic properties of dot dimension numbers.
+  TF_RETURN_IF_ERROR(ValidateDotDimensionNumbers(lhs, rhs, dimension_numbers));
+
+  // Check that there is only one contracting dimension for both lhs and rhs.
+  if (dimension_numbers.lhs_contracting_dimensions_size() !=
+          dimension_numbers.rhs_contracting_dimensions_size() ||
+      dimension_numbers.lhs_contracting_dimensions_size() != 1) {
+    return fail("must specify one contracting dimension for both lhs and rhs.");
+  }
 
-  // Check if the contracted dimension sizes are the same.
-  if ((lhs_contracted_dimension < ShapeUtil::Rank(lhs) &&
-       rhs_contracted_dimension < ShapeUtil::Rank(rhs)) &&
-      lhs.dimensions(lhs_contracted_dimension) !=
-          rhs.dimensions(rhs_contracted_dimension)) {
-    return fail("contracted dimensions mismatch");
+  // Check that contracting dimension sizes match.
+  const int64 lhs_contracting_dimension =
+      dimension_numbers.lhs_contracting_dimensions(0);
+  const int64 rhs_contracting_dimension =
+      dimension_numbers.rhs_contracting_dimensions(0);
+  if (lhs.dimensions(lhs_contracting_dimension) !=
+      rhs.dimensions(rhs_contracting_dimension)) {
+    return fail("contracting dimension sizes do not match.");
+  }
+
+  // Check that number of batch dimensions match.
+  if (dimension_numbers.lhs_batch_dimensions_size() !=
+      dimension_numbers.rhs_batch_dimensions_size()) {
+    return fail("must the same number of batch dimensions for lhs and rhs.");
+  }
+
+  // Check that batch dimension numbers and sizes match.
+  for (int64 i = 0; i < dimension_numbers.lhs_batch_dimensions_size(); ++i) {
+    if (dimension_numbers.lhs_batch_dimensions(i) !=
+            dimension_numbers.rhs_batch_dimensions(i) ||
+        lhs.dimensions(dimension_numbers.lhs_batch_dimensions(i)) !=
+            rhs.dimensions(dimension_numbers.rhs_batch_dimensions(i))) {
+      return fail("batch dimension numbers and sizes must match for lhs/rhs.");
+    }
   }
 
   // The ranks of lhs and rhs are decremented by 1 respectively due to the
   // contraction, and added for the rank of the result. When an input tensor is
   // a scalar, its contribution to the rank of the result is 0.
   // Generate the result dimensions in order, rhs dimensions followed by lhs
-  // dimensions except the contracted dimensions.
+  // dimensions except the contracted and batch dimensions.
   std::vector<int64> dimensions;
+  std::unordered_set<int64> rhs_batch_dims(
+      dimension_numbers.rhs_batch_dimensions().begin(),
+      dimension_numbers.rhs_batch_dimensions().end());
   for (int64 i = 0; i < ShapeUtil::Rank(lhs); i++) {
-    if (i != lhs_contracted_dimension) {
+    if (i != lhs_contracting_dimension) {
       dimensions.push_back(lhs.dimensions(i));
     }
   }
   for (int64 i = 0; i < ShapeUtil::Rank(rhs); i++) {
-    if (i != rhs_contracted_dimension) {
+    if (i != rhs_contracting_dimension && rhs_batch_dims.count(i) == 0) {
       dimensions.push_back(rhs.dimensions(i));
     }
   }
-  Shape result = ShapeUtil::MakeShape(lhs.element_type(), dimensions);
+  Shape result = ShapeUtil::MakeShape(
+      ShapeUtil::HigherPrecisionElementType(lhs, rhs), dimensions);
 
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(result));
   VLOG(2) << "inferred dot shape: " << ShapeUtil::HumanString(result);
@@ -635,7 +776,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
                              ShapeUtil::HumanString(rhs).c_str());
     }
   }
-  return ShapeUtil::MakeShape(lhs.element_type(), output_dimensions);
+  return ShapeUtil::MakeShape(ShapeUtil::HigherPrecisionElementType(lhs, rhs),
+                              output_dimensions);
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferInDimBroadcastShape(
@@ -697,6 +839,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   // specified in broadcast_dimensions are then changed to match the
   // corresponding dimension size in smaller_shape.
   Shape output_shape(larger_shape);
+  output_shape.set_element_type(
+      ShapeUtil::HigherPrecisionElementType(larger_shape, smaller_shape));
 
   for (int i = 0; i < smaller_shape.dimensions_size(); ++i) {
     int64 dimension_to_match = broadcast_dimensions.at(i);
@@ -746,7 +890,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   TF_RETURN_IF_ERROR(
       ExpectNotTupleOrOpaque(rhs, "rhs of elementwise binary operation"));
 
-  if (!ShapeUtil::SameElementType(lhs, rhs)) {
+  if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) {
     return InvalidArgument(
         "binary op %s with different element types: %s and %s",
         BinaryOperation_Name(operation).c_str(),
@@ -765,10 +909,11 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     }
   }
 
-  if (ShapeUtil::Compatible(lhs, rhs)) {
+  if (ShapeUtil::CompatibleIgnoringFpPrecision(lhs, rhs)) {
     // If the shapes are the same other than layout, the output shape is the
     // same (elementwise op).
-    return lhs;
+    return ShapeUtil::ChangeElementType(
+        lhs, ShapeUtil::HigherPrecisionElementType(lhs, rhs));
   }
 
   if (ShapeUtil::Rank(lhs) == ShapeUtil::Rank(rhs)) {
@@ -805,7 +950,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       "inferring shape for <%s>(%s, %s) with broadcast_dimensions={%s}",
       BinaryOperation_Name(operation).c_str(),
       ShapeUtil::HumanString(lhs).c_str(), ShapeUtil::HumanString(rhs).c_str(),
-      tensorflow::str_util::Join(broadcast_dimensions, ", ").c_str());
+      Join(broadcast_dimensions, ", ").c_str());
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs));
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs));
 
@@ -816,8 +961,6 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       rhs, tensorflow::strings::StrCat("rhs of binary operation ",
                                        BinaryOperation_Name(operation))));
   switch (operation) {
-    case BINOP_DOT:
-      return InferDotOpShape(lhs, rhs);
     case BINOP_MAX:
     case BINOP_MIN:
     case BINOP_SUB:
@@ -843,7 +986,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       TF_ASSIGN_OR_RETURN(const Shape& shape,
                           InferElementwiseBinaryOpShape(operation, lhs, rhs,
                                                         broadcast_dimensions));
-      if (lhs.element_type() == F32) {
+      if (lhs.element_type() == F32 && rhs.element_type() == F32) {
         return ShapeUtil::ChangeElementType(shape, C64);
       } else {
         return Unimplemented("complex component type not supported");
@@ -948,12 +1091,13 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     TF_RETURN_IF_ERROR(
         ExpectNotTupleOrOpaque(*arg_shapes[i], "operand of map"));
 
-    if (ShapeUtil::Compatible(*arg_shapes[i], *arg_shape)) {
+    if (ShapeUtil::CompatibleIgnoringFpPrecision(*arg_shapes[i], *arg_shape)) {
       continue;
     }
     if (!ShapeUtil::IsTuple(*arg_shapes[i]) &&
         !ShapeUtil::IsTuple(*arg_shape) &&
-        ShapeUtil::SameElementType(*arg_shapes[i], *arg_shape)) {
+        ShapeUtil::SameElementTypeIgnoringFpPrecision(*arg_shapes[i],
+                                                      *arg_shape)) {
       if (ShapeUtil::IsScalar(*arg_shapes[i])) {
         continue;
       }
@@ -970,7 +1114,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "Map operation requires all operands to have the same shape; got: "
         "%s",
-        tensorflow::str_util::Join(pieces, ", ").c_str());
+        Join(pieces, ", ").c_str());
   }
 
   // Check that dimensions.size == arg_shape.dimensions_size() (we currently
@@ -987,7 +1131,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     if (dimensions[i] != i) {
       return InvalidArgument(
           "Map requires monotonically increasing dimension numbers, found: %s ",
-          tensorflow::str_util::Join(dimensions, ", ").c_str());
+          Join(dimensions, ", ").c_str());
     }
   }
 
@@ -1018,7 +1162,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
           i, ShapeUtil::HumanString(parameter_shape).c_str());
     }
 
-    if (parameter_shape.element_type() != arg_shape->element_type()) {
+    if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(parameter_shape,
+                                                       *arg_shape)) {
       return InvalidArgument(
           "mapped computation's parameter type has to match argument element "
           "type; got parameter %d shape: %s, argument shape: %s",
@@ -1091,7 +1236,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         PrimitiveType_Name(operand_shape.element_type()).c_str());
   }
 
-  if (!ShapeUtil::SameElementType(offset_shape, operand_shape)) {
+  if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(offset_shape,
+                                                     operand_shape)) {
     return InvalidArgument(
         "The inputs should have the same element type for batch-norm-training, "
         "but the shape of offset factor is %s "
@@ -1100,7 +1246,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         PrimitiveType_Name(operand_shape.element_type()).c_str());
   }
 
-  if (!ShapeUtil::SameElementType(scale_shape, operand_shape)) {
+  if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(scale_shape,
+                                                     operand_shape)) {
     return InvalidArgument(
         "The inputs should have the same element type for batch-norm-training, "
         "but the shape of scale factor is %s "
@@ -1199,7 +1346,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         PrimitiveType_Name(operand_shape.element_type()).c_str());
   }
 
-  if (!ShapeUtil::SameElementType(offset_shape, operand_shape)) {
+  if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(offset_shape,
+                                                     operand_shape)) {
     return InvalidArgument(
         "The inputs should have the same element type for "
         "batch-norm-inference, "
@@ -1209,7 +1357,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         PrimitiveType_Name(operand_shape.element_type()).c_str());
   }
 
-  if (!ShapeUtil::SameElementType(scale_shape, operand_shape)) {
+  if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(scale_shape,
+                                                     operand_shape)) {
     return InvalidArgument(
         "The inputs should have the same element type for "
         "batch-norm-inference, "
@@ -1219,7 +1368,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         PrimitiveType_Name(operand_shape.element_type()).c_str());
   }
 
-  if (!ShapeUtil::SameElementType(mean_shape, operand_shape)) {
+  if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(mean_shape,
+                                                     operand_shape)) {
     return InvalidArgument(
         "The inputs should have the same element type for "
         "batch-norm-inference, "
@@ -1229,7 +1379,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         PrimitiveType_Name(operand_shape.element_type()).c_str());
   }
 
-  if (!ShapeUtil::SameElementType(variance_shape, operand_shape)) {
+  if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(variance_shape,
+                                                     operand_shape)) {
     return InvalidArgument(
         "The inputs should have the same element type for "
         "batch-norm-inference, "
@@ -1351,7 +1502,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         PrimitiveType_Name(output_grad_shape.element_type()).c_str());
   }
 
-  if (!ShapeUtil::SameElementType(output_grad_shape, operand_shape)) {
+  if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(output_grad_shape,
+                                                     operand_shape)) {
     return InvalidArgument(
         "The inputs should have the same element type for batch-norm-grad, "
         "but the element type of output_grad is %s "
@@ -1360,7 +1512,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         PrimitiveType_Name(operand_shape.element_type()).c_str());
   }
 
-  if (!ShapeUtil::SameElementType(scale_shape, operand_shape)) {
+  if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(scale_shape,
+                                                     operand_shape)) {
     return InvalidArgument(
         "The inputs should have the same element type for batch-norm-grad, "
         "but the element type of scale factor is %s "
@@ -1369,7 +1522,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         PrimitiveType_Name(operand_shape.element_type()).c_str());
   }
 
-  if (!ShapeUtil::SameElementType(mean_shape, operand_shape)) {
+  if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(mean_shape,
+                                                     operand_shape)) {
     return InvalidArgument(
         "The inputs should have the same element type for batch-norm-grad, "
         "but the element type of mean is %s "
@@ -1378,7 +1532,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         PrimitiveType_Name(operand_shape.element_type()).c_str());
   }
 
-  if (!ShapeUtil::SameElementType(var_shape, operand_shape)) {
+  if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(var_shape,
+                                                     operand_shape)) {
     return InvalidArgument(
         "The inputs should have the same element type for batch-norm-grad, "
         "but the element type of mean is %s "
@@ -1439,7 +1594,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(lhs, "lhs of convolution"));
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(rhs, "rhs of convolution"));
 
-  if (!ShapeUtil::SameElementType(lhs, rhs)) {
+  if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) {
     return InvalidArgument(
         "Convolution with different element types: %s and %s",
         ShapeUtil::HumanString(lhs).c_str(),
@@ -1584,15 +1739,107 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     dimensions[dnums.output_spatial_dimensions(i)] =
         window_output_shape.dimensions(i);
   }
+  return ShapeUtil::MakeShape(ShapeUtil::HigherPrecisionElementType(lhs, rhs),
+                              dimensions);
+}
 
-  return ShapeUtil::MakeShape(lhs.element_type(), dimensions);
+/* static */ StatusOr<Shape> ShapeInference::InferFftShape(
+    const Shape& in, const FftType fft_type,
+    const tensorflow::gtl::ArraySlice<int64> fft_length) {
+  const int64 fft_rank = fft_length.size();
+  if (fft_rank < 1 || fft_rank > 3) {
+    return InvalidArgument("FFT only supports ranks 1-3, but got %lld",
+                           fft_rank);
+  }
+#define RET_CHECK_RANK(x)                              \
+  if (x.dimensions_size() < fft_rank) {                \
+    return InvalidArgument(                            \
+        "FFT of rank %lld requires input of at least " \
+        "same rank; got input of rank %d",             \
+        fft_rank, x.dimensions_size());                \
+  }
+  switch (fft_type) {
+    case FFT:
+    case IFFT:
+      if (in.element_type() != C64) {
+        return InvalidArgument("%s requires C64 input type, found %s",
+                               FftType_Name(fft_type).c_str(),
+                               PrimitiveType_Name(in.element_type()).c_str());
+      }
+      RET_CHECK_RANK(in);
+      return in;
+    case RFFT: {
+      if (in.element_type() != F32) {
+        return InvalidArgument("RFFT requires F32 input type, found %s",
+                               PrimitiveType_Name(in.element_type()).c_str());
+      }
+      RET_CHECK_RANK(in);
+      for (int i = 0; i < fft_rank; i++) {
+        if (in.dimensions(in.dimensions_size() - fft_rank + i) !=
+            fft_length[i]) {
+          return InvalidArgument(
+              "RFFT requires innermost dimensions match fft_length but "
+              "dimension %lld is %lld and should be %lld",
+              in.dimensions_size() - fft_rank + i,
+              in.dimensions(in.dimensions_size() - fft_rank + i),
+              fft_length[i]);
+        }
+      }
+      Shape result = ShapeUtil::ChangeElementType(in, C64);
+      result.set_dimensions(result.dimensions_size() - 1,
+                            fft_length[fft_rank - 1] / 2 + 1);
+      return result;
+    }
+    case IRFFT: {
+      if (in.element_type() != C64) {
+        return InvalidArgument("IRFFT requires C64 input type, found %s",
+                               PrimitiveType_Name(in.element_type()).c_str());
+      }
+      RET_CHECK_RANK(in);
+      Shape result = ShapeUtil::ComplexComponentShape(in);
+      for (int i = 0; i < fft_rank - 1; i++) {
+        if (in.dimensions(in.dimensions_size() - fft_rank + i) !=
+            fft_length[i]) {
+          return InvalidArgument(
+              "IRFFT requires all but one innermost dimensions match "
+              "fft_length, but dimension %lld is %lld and should be %lld",
+              in.dimensions_size() - fft_rank + i,
+              in.dimensions(in.dimensions_size() - fft_rank + i),
+              fft_length[i]);
+        }
+      }
+      if (in.dimensions(in.dimensions_size() - 1) !=
+          fft_length[fft_rank - 1] / 2 + 1) {
+        return InvalidArgument(
+            "IRFFT requires innermost dimension matches fft_length/2+1, but "
+            "dimension %d is %lld and should be %lld",
+            in.dimensions_size() - 1, in.dimensions(in.dimensions_size() - 1),
+            fft_length[fft_rank - 1] / 2 + 1);
+      }
+      result.set_dimensions(result.dimensions_size() - 1,
+                            fft_length[fft_rank - 1]);
+      return result;
+    }
+    default:
+      LOG(FATAL) << "Unexpected fft_type: " << fft_type;
+  }
+#undef RET_CHECK_RANK
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferCrossReplicaSumShape(
-    const Shape& operand) {
-  TF_RETURN_IF_ERROR(
-      ExpectNotTupleOrOpaque(operand, "operand of cross replica sum"));
-  return operand;
+    tensorflow::gtl::ArraySlice<const Shape*> operand_shapes) {
+  for (const Shape* operand_shape : operand_shapes) {
+    TF_RETURN_IF_ERROR(
+        ExpectNotTupleOrOpaque(*operand_shape, "operand of cross replica sum"));
+  }
+  if (operand_shapes.size() == 1) {
+    return *operand_shapes[0];
+  }
+  std::vector<Shape> operand_shape_values;
+  for (const Shape* operand_shape : operand_shapes) {
+    operand_shape_values.push_back(*operand_shape);
+  }
+  return ShapeUtil::MakeTupleShape(operand_shape_values);
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferReduceShape(
@@ -1655,16 +1902,16 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   }
   const Shape& operand_element_shape =
       ShapeUtil::MakeShape(operand_shape.element_type(), {});
-  if (!ShapeUtil::Compatible(operand_element_shape,
-                             select_shape.parameters(0))) {
+  if (!ShapeUtil::CompatibleIgnoringFpPrecision(operand_element_shape,
+                                                select_shape.parameters(0))) {
     return InvalidArgument(
         "select function's first parameter shape currently must "
         "match the operand element shape. Got %s vs %s",
         ShapeUtil::HumanString(select_shape.parameters(0)).c_str(),
         ShapeUtil::HumanString(operand_element_shape).c_str());
   }
-  if (!ShapeUtil::Compatible(operand_element_shape,
-                             select_shape.parameters(1))) {
+  if (!ShapeUtil::CompatibleIgnoringFpPrecision(operand_element_shape,
+                                                select_shape.parameters(1))) {
     return InvalidArgument(
         "select function's second parameter shape currently must "
         "match the operand element shape. Got %s vs %s",
@@ -1681,7 +1928,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
                       InferWindowOutputShape(operand_shape, window,
                                              operand_shape.element_type(),
                                              /*allow_negative_padding=*/false));
-  if (!ShapeUtil::Compatible(source_shape, window_result_shape)) {
+  if (!ShapeUtil::CompatibleIgnoringFpPrecision(source_shape,
+                                                window_result_shape)) {
     return InvalidArgument(
         "source shape does not match the shape of window-reduced operand: "
         "source(%s), window-reduced operand(%s)",
@@ -1695,21 +1943,28 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     const Shape& arg, tensorflow::gtl::ArraySlice<int64> starts,
     tensorflow::gtl::ArraySlice<int64> limits,
     tensorflow::gtl::ArraySlice<int64> strides) {
+  auto error = [&](const string& message) {
+    return InvalidArgument(
+        "%s in slice operation; argument shape: %s; starts: {%s}; limits: "
+        "{%s}; strides: {%s}",
+        message.c_str(), ShapeUtil::HumanString(arg).c_str(),
+        Join(starts, ",").c_str(), Join(limits, ",").c_str(),
+        Join(strides, ",").c_str());
+  };
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(arg, "operand of slice"));
   VLOG(2) << tensorflow::strings::Printf(
       "slicing shape %s starts={%s} limits={%s}",
-      ShapeUtil::HumanString(arg).c_str(),
-      tensorflow::str_util::Join(starts, ", ").c_str(),
-      tensorflow::str_util::Join(limits, ", ").c_str());
+      ShapeUtil::HumanString(arg).c_str(), Join(starts, ", ").c_str(),
+      Join(limits, ", ").c_str());
 
   if (starts.size() != limits.size()) {
-    return InvalidArgument("slice start and limit sizes differ: %zu vs %zu",
-                           starts.size(), limits.size());
+    return error(Printf("slice start and limit sizes differ: %zu vs %zu",
+                        starts.size(), limits.size()));
   }
 
   if (starts.size() != strides.size()) {
-    return InvalidArgument("slice start and strides sizes differ: %zu vs %zu",
-                           starts.size(), strides.size());
+    return error(Printf("slice start and strides sizes differ: %zu vs %zu",
+                        starts.size(), strides.size()));
   }
 
   if (starts.size() != ShapeUtil::Rank(arg)) {
@@ -1728,20 +1983,20 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
                              start_index);
     }
     if (limit_index > arg.dimensions(dimension)) {
-      return InvalidArgument(
-          "limit index (%lld) must be less than or equal to dimension "
-          "size (%lld)",
-          limit_index, arg.dimensions(dimension));
+      return error(
+          Printf("limit index (%lld) must be less than or equal to dimension "
+                 "size (%lld)",
+                 limit_index, arg.dimensions(dimension)));
     }
     VLOG(2) << tensorflow::strings::Printf("starts[%lld] = %lld", dimension,
                                            start_index);
     VLOG(2) << tensorflow::strings::Printf("limits[%lld] = %lld", dimension,
                                            limit_index);
     if (start_index > limit_index) {
-      return InvalidArgument(
-          "limit index (%lld) must be greater or equal to "
-          "start index (%lld) in slice with positive stride",
-          limit_index, start_index);
+      return error(
+          Printf("limit index (%lld) must be greater or equal to "
+                 "start index (%lld) in slice with positive stride",
+                 limit_index, start_index));
     }
     if (stride <= 0) {
       return InvalidArgument("stride (%lld) must be positive", stride);
@@ -1764,7 +2019,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       "slicing shape %s at dynamic start_indices %s with slice_sizes={%s}",
       ShapeUtil::HumanString(operand_shape).c_str(),
       ShapeUtil::HumanString(start_indices_shape).c_str(),
-      tensorflow::str_util::Join(slice_sizes, ", ").c_str());
+      Join(slice_sizes, ", ").c_str());
 
   if (ShapeUtil::Rank(start_indices_shape) != 1) {
     return InvalidArgument(
@@ -1857,7 +2112,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         ShapeUtil::Rank(update_shape), ShapeUtil::Rank(operand_shape));
   }
 
-  if (operand_shape.element_type() != update_shape.element_type()) {
+  if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(operand_shape,
+                                                     update_shape)) {
     return InvalidArgument(
         "dynamic update slice update element type does not match argument. "
         "operand.element_type: %s vs update.element_type: %s",
@@ -1958,6 +2214,64 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   return init;
 }
 
+/* static */ StatusOr<Shape> ShapeInference::InferConditionalShape(
+    const Shape& predicate, const Shape& true_operand,
+    const Shape& false_operand, const ProgramShape& true_computation,
+    const ProgramShape& false_computation) {
+  if (!ShapeUtil::ShapeIs(predicate, PRED, {})) {
+    return InvalidArgument("predicate must be a boolean; got %s.",
+                           ShapeUtil::HumanString(predicate).c_str());
+  }
+
+  if (true_computation.parameters_size() != 1) {
+    return InvalidArgument("true_computation must take 1 argument; got %d.",
+                           true_computation.parameters_size());
+  }
+  if (!ShapeUtil::Compatible(true_computation.parameters(0), true_operand)) {
+    auto true_shape_string = [&]() {
+      return tensorflow::strings::Printf(
+          "true_operand: %s; true_computation: %s",
+          ShapeUtil::HumanString(true_operand).c_str(),
+          ShapeUtil::HumanString(true_computation).c_str());
+    };
+    return InvalidArgument(
+        "true_operand must match the shape of the only parameter of "
+        "true_computation: got %s.",
+        true_shape_string().c_str());
+  }
+
+  if (false_computation.parameters_size() != 1) {
+    return InvalidArgument("false_computation must take 1 argument; got %d.",
+                           false_computation.parameters_size());
+  }
+  if (!ShapeUtil::Compatible(false_computation.parameters(0), false_operand)) {
+    auto false_shape_string = [&]() {
+      return tensorflow::strings::Printf(
+          "false_operand: %s; false_computation: %s",
+          ShapeUtil::HumanString(false_operand).c_str(),
+          ShapeUtil::HumanString(false_computation).c_str());
+    };
+    return InvalidArgument(
+        "false_operand must match the shape of the only parameter of "
+        "false_computation: got %s.",
+        false_shape_string().c_str());
+  }
+  if (!ShapeUtil::Compatible(true_computation.result(),
+                             false_computation.result())) {
+    auto shape_string = [&]() {
+      return tensorflow::strings::Printf(
+          "true_computation result: %s; false_computation result: %s.",
+          ShapeUtil::HumanString(true_computation.result()).c_str(),
+          ShapeUtil::HumanString(false_computation.result()).c_str());
+    };
+    return InvalidArgument(
+        "the result of true_computation and false_computation must have the "
+        "same shape: got %s.",
+        shape_string().c_str());
+  }
+  return true_computation.result();
+}
+
 /* static */ StatusOr<Shape> ShapeInference::InferBroadcastShape(
     const Shape& operand, tensorflow::gtl::ArraySlice<int64> broadcast_sizes) {
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(operand, "operand of broadcast"));
@@ -2003,8 +2317,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     return InvalidArgument(
         "Reshape dimensions [%s] are not a permutation of the operand "
         "dimensions (operand shape is %s).",
-        tensorflow::str_util::Join(dimensions, ",").c_str(),
-        ShapeUtil::HumanString(operand).c_str());
+        Join(dimensions, ",").c_str(), ShapeUtil::HumanString(operand).c_str());
   }
 
   return inferred_shape;
@@ -2036,24 +2349,26 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(min, "clamp min"));
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(operand, "clamp operand"));
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(max, "clamp max"));
-  if (!ShapeUtil::SameElementType(min, operand) ||
-      !ShapeUtil::SameElementType(max, operand)) {
+  if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(min, operand) ||
+      !ShapeUtil::SameElementTypeIgnoringFpPrecision(max, operand)) {
     return InvalidArgument("clamp op with different operand types: %s, %s, %s",
                            ShapeUtil::HumanString(min).c_str(),
                            ShapeUtil::HumanString(operand).c_str(),
                            ShapeUtil::HumanString(max).c_str());
   }
-  if (((ShapeUtil::Compatible(min, operand) || ShapeUtil::IsScalar(min)) &&
-       (ShapeUtil::Compatible(max, operand) || ShapeUtil::IsScalar(max)))) {
+  if (((ShapeUtil::CompatibleIgnoringFpPrecision(min, operand) ||
+        ShapeUtil::IsScalar(min)) &&
+       (ShapeUtil::CompatibleIgnoringFpPrecision(max, operand) ||
+        ShapeUtil::IsScalar(max)))) {
     return operand;
   }
   if (ShapeUtil::IsScalar(operand)) {
-    if (ShapeUtil::Compatible(min, max)) {
-      return min;
+    if (ShapeUtil::CompatibleIgnoringFpPrecision(min, max)) {
+      return ShapeUtil::ChangeElementType(min, operand.element_type());
     } else if (ShapeUtil::IsScalar(min)) {
-      return max;
+      return ShapeUtil::ChangeElementType(max, operand.element_type());
     } else if (ShapeUtil::IsScalar(max)) {
-      return min;
+      return ShapeUtil::ChangeElementType(min, operand.element_type());
     }
   }
   return Unimplemented(
@@ -2066,7 +2381,15 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
 // broadcast from all operands, not just the predicate.
 /* static */ StatusOr<Shape> ShapeInference::InferSelectShape(
     const Shape& pred, const Shape& on_true, const Shape& on_false) {
-  if (!ShapeUtil::Compatible(on_true, on_false)) {
+  bool compatible;
+  if (ShapeUtil::IsTuple(on_true)) {
+    // Select only defines the top-level buffer, so if it's a tuple, the two
+    // input must match exactly.
+    compatible = ShapeUtil::Compatible(on_true, on_false);
+  } else {
+    compatible = ShapeUtil::CompatibleIgnoringFpPrecision(on_true, on_false);
+  }
+  if (!compatible) {
     return InvalidArgument(
         "operands to select must be the same shape; got %s and %s",
         ShapeUtil::HumanString(on_true).c_str(),
@@ -2081,7 +2404,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     // By this stage we know that pred's element type is PRED. Therefore, this
     // check restricts pred to be a PRED scalar, or a PRED array with the same
     // dimensions as on_true and on_false.
-    return on_true;
+    return ShapeUtil::ChangeElementType(
+        on_true, ShapeUtil::HigherPrecisionElementType(on_true, on_false));
   } else {
     return Unimplemented(
         "select operation with non-scalar predicate with dimensionality "
@@ -2096,8 +2420,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   // The applied function's arity equals the number of arguments.
   if (arg_shapes.size() != to_apply.parameters_size()) {
     string computation_signature = ShapeUtil::HumanString(to_apply);
-    string argument_shapes = tensorflow::str_util::Join(
-        arg_shapes, ", ", [](string* out, const Shape* shape) {
+    string argument_shapes =
+        Join(arg_shapes, ", ", [](string* out, const Shape* shape) {
           tensorflow::strings::StrAppend(out, ShapeUtil::HumanString(*shape));
         });
     return InvalidArgument(
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index 0aadb98a407c2160b60e686f6f3ea250bb9e838f..b39151ebbc19f5d0b702a80da5069f58c8dfb07d 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -109,8 +109,15 @@ class ShapeInference {
       const Shape& lhs, const Shape& rhs, const Window& window,
       const ConvolutionDimensionNumbers& dimension_numbers);
 
-  // Infers the shape produced a cross replica sum with the given operand shape.
-  static StatusOr<Shape> InferCrossReplicaSumShape(const Shape& operand);
+  // Infers the shape produced by the given FFT type on the given operand.
+  static StatusOr<Shape> InferFftShape(
+      const Shape& in, FftType fft_type,
+      tensorflow::gtl::ArraySlice<int64> fft_length);
+
+  // Infers the shape produced a cross replica sum with the given operand
+  // shapes.
+  static StatusOr<Shape> InferCrossReplicaSumShape(
+      tensorflow::gtl::ArraySlice<const Shape*> operand_shapes);
 
   // Infers the shape produced by applying the given reduction computation
   // shape to the given input operand shape.
@@ -178,6 +185,12 @@ class ShapeInference {
                                          const ProgramShape& body,
                                          const Shape& init);
 
+  // Infers the shape produced by a conditional operation.
+  static StatusOr<Shape> InferConditionalShape(
+      const Shape& predicate, const Shape& true_operand,
+      const Shape& false_operand, const ProgramShape& true_computation,
+      const ProgramShape& false_computation);
+
   // Infers the shape produced by a broadcast operation.
   static StatusOr<Shape> InferBroadcastShape(
       const Shape& operand, tensorflow::gtl::ArraySlice<int64> broadcast_sizes);
@@ -229,11 +242,13 @@ class ShapeInference {
       tensorflow::gtl::ArraySlice<const Shape*> arg_shapes,
       const ProgramShape& to_apply);
 
- private:
   // Helper that infers the shape produced by performing a dot operation with
   // the given LHS and RHS shapes.
-  static StatusOr<Shape> InferDotOpShape(const Shape& lhs, const Shape& rhs);
+  static StatusOr<Shape> InferDotOpShape(
+      const Shape& lhs, const Shape& rhs,
+      const DotDimensionNumbers& dimension_numbers);
 
+ private:
   // Helper that infers the shape produced by performing an element-wise binary
   // operation with the given LHS and RHS shapes.
   // Note: By "element-wise" we mean operations that look at a single element in
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc
index be93c879c0b7fd74c3b93e28c6dc0f5c656a522a..026c021165785bd3945d6a846dae446ad45da9b7 100644
--- a/tensorflow/compiler/xla/service/shape_inference_test.cc
+++ b/tensorflow/compiler/xla/service/shape_inference_test.cc
@@ -898,8 +898,11 @@ TEST_F(ShapeInferenceTest, BroadcastScalar) {
 
 // scalar <dot> vector: error
 TEST_F(ShapeInferenceTest, ScalarDotVector) {
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto inferred_status =
-      ShapeInference::InferBinaryOpShape(BINOP_DOT, f32_, vector_32_, {});
+      ShapeInference::InferDotOpShape(f32_, vector_32_, dot_dnums);
   ASSERT_FALSE(inferred_status.ok());
   ASSERT_THAT(inferred_status.status().error_message(),
               HasSubstr("dot only supports rank"));
@@ -907,61 +910,199 @@ TEST_F(ShapeInferenceTest, ScalarDotVector) {
 
 // 3D <dot> 2D: error
 TEST_F(ShapeInferenceTest, DotWithRankHigherThanTwo) {
-  auto inferred_status = ShapeInference::InferBinaryOpShape(
-      BINOP_DOT, ShapeUtil::MakeShape(F32, {32, 32, 32}), matrix_32_64_, {});
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto inferred_status = ShapeInference::InferDotOpShape(
+      ShapeUtil::MakeShape(F32, {32, 32, 32}), matrix_32_64_, dot_dnums);
   ASSERT_FALSE(inferred_status.ok());
   ASSERT_THAT(inferred_status.status().error_message(),
-              HasSubstr("dot only supports rank"));
+              HasSubstr("batch and contracting dimension number mismatch"));
 }
 
 // vector <dot> vector -> scalar
 TEST_F(ShapeInferenceTest, VectorDotVector) {
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(0);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto inferred_status =
-      ShapeInference::InferBinaryOpShape(BINOP_DOT, vector_64_, vector_64_, {});
+      ShapeInference::InferDotOpShape(vector_64_, vector_64_, dot_dnums);
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(f32_, inferred_status.ValueOrDie()));
   auto inferred_status_mismatch =
-      ShapeInference::InferBinaryOpShape(BINOP_DOT, vector_64_, vector_32_, {});
+      ShapeInference::InferDotOpShape(vector_64_, vector_32_, dot_dnums);
   ASSERT_FALSE(inferred_status_mismatch.ok());
 }
 
 // matrix <dot> vector -> vector
 TEST_F(ShapeInferenceTest, MatrixDotVector) {
-  auto inferred_status = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_DOT, matrix_32_64_, vector_64_, {});
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(matrix_32_64_, vector_64_, dot_dnums);
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(inferred_status.ValueOrDie(), vector_32_));
-  auto inferred_status_mismatch = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_DOT, matrix_32_64_, vector_32_, {});
+  auto inferred_status_mismatch =
+      ShapeInference::InferDotOpShape(matrix_32_64_, vector_32_, dot_dnums);
   ASSERT_FALSE(inferred_status_mismatch.ok());
 }
 
 // vector <dot> matrix -> vector
 TEST_F(ShapeInferenceTest, VectorDotMatrix) {
-  auto inferred_status = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_DOT, vector_32_, matrix_32_64_, {});
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(0);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(vector_32_, matrix_32_64_, dot_dnums);
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(inferred_status.ValueOrDie(), vector_64_));
-  auto inferred_status_mismatch = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_DOT, vector_64_, matrix_32_64_, {});
+  auto inferred_status_mismatch =
+      ShapeInference::InferDotOpShape(vector_64_, matrix_32_64_, dot_dnums);
   ASSERT_FALSE(inferred_status_mismatch.ok());
 }
 
 // matrix <dot> matrix -> matrix
 TEST_F(ShapeInferenceTest, MatrixDotMatrix) {
-  auto inferred_status_match = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_DOT, matrix_32_64_, matrix_64_48_, {});
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto inferred_status_match =
+      ShapeInference::InferDotOpShape(matrix_32_64_, matrix_64_48_, dot_dnums);
   ASSERT_IS_OK(inferred_status_match.status());
   ASSERT_TRUE(
       ShapeUtil::Equal(inferred_status_match.ValueOrDie(), matrix_32_48_))
       << "inferred: "
       << ShapeUtil::HumanString(inferred_status_match.ValueOrDie())
       << " expected: " << ShapeUtil::HumanString(matrix_64_48_);
-  auto inferred_status_mismatch = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_DOT, matrix_32_64_, matrix_32_64_, {});
+  auto inferred_status_mismatch =
+      ShapeInference::InferDotOpShape(matrix_32_64_, matrix_32_64_, dot_dnums);
   ASSERT_FALSE(inferred_status_mismatch.ok());
 }
 
+// BatchMatMul with two batch dimensions and one contracting dimension.
+TEST_F(ShapeInferenceTest, DotGeneral) {
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {5, 2, 11, 3});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {5, 2, 3, 14});
+  Shape output_shape = ShapeUtil::MakeShape(F32, {5, 2, 11, 14});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(3);
+  dot_dnums.add_lhs_batch_dimensions(0);
+  dot_dnums.add_lhs_batch_dimensions(1);
+
+  dot_dnums.add_rhs_contracting_dimensions(2);
+  dot_dnums.add_rhs_batch_dimensions(0);
+  dot_dnums.add_rhs_batch_dimensions(1);
+
+  auto inferred_status_match =
+      ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
+  ASSERT_IS_OK(inferred_status_match.status());
+  ASSERT_TRUE(
+      ShapeUtil::Equal(inferred_status_match.ValueOrDie(), output_shape))
+      << "inferred: "
+      << ShapeUtil::HumanString(inferred_status_match.ValueOrDie())
+      << " expected: " << ShapeUtil::HumanString(output_shape);
+}
+
+// BatchMatMul with two contracting dimensions fails.
+TEST_F(ShapeInferenceTest, DotWithTwoContractingDimsFails) {
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3, 2});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 3, 14});
+  Shape output_shape = ShapeUtil::MakeShape(F32, {2, 11, 14});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(2);
+  dot_dnums.add_lhs_contracting_dimensions(3);
+  dot_dnums.add_lhs_batch_dimensions(0);
+
+  dot_dnums.add_rhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_batch_dimensions(0);
+
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("must specify one contracting dimension for both "
+                        "lhs and rhs"));
+}
+
+// BatchMatMul with different batch dimension sizes fails.
+TEST_F(ShapeInferenceTest, DotWithMisatchedBatchDimSizesFails) {
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {3, 3, 14});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(2);
+  dot_dnums.add_lhs_batch_dimensions(0);
+
+  dot_dnums.add_rhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_batch_dimensions(0);
+
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("batch dimension numbers and sizes must match"));
+}
+
+// BatchMatMul with different batch dimension numbers fails.
+TEST_F(ShapeInferenceTest, DotWithMisatchedBatchDimNumbersFails) {
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {3, 2, 14});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(2);
+  dot_dnums.add_lhs_batch_dimensions(0);
+
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  dot_dnums.add_rhs_batch_dimensions(1);
+
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("batch dimension numbers must precede non-batch"));
+}
+
+// BatchMatMul with out-of-range dimension numbers fails.
+TEST_F(ShapeInferenceTest, DotWithContractingDimNumberOutOfRange) {
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 3, 14});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(3);
+  dot_dnums.add_lhs_batch_dimensions(0);
+
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  dot_dnums.add_rhs_batch_dimensions(1);
+
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("A dimension number is out of range"));
+}
+
+// BatchMatMul with non-unique dimension numbers fails.
+TEST_F(ShapeInferenceTest, DotWithContractingNonUniqueDimNumber) {
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 3, 14});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(0);
+  dot_dnums.add_lhs_batch_dimensions(0);
+
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  dot_dnums.add_rhs_batch_dimensions(1);
+
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("A dimension number is not unique"));
+}
+
 TEST_F(ShapeInferenceTest, BinOpBroadcastMatrixVector) {
   // Test variations of broadcasting a vector for a binary add with a
   // matrix.
@@ -1296,5 +1437,95 @@ TEST_F(ShapeInferenceTest, Transpose) {
                                     ShapeUtil::MakeShape(F32, {3, 4, 5, 2})));
 }
 
+TEST_F(ShapeInferenceTest, Conditional) {
+  auto inferred_status0 = ShapeInference::InferConditionalShape(
+      pred_, vector_32_, vector_64_,
+      ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+      ShapeUtil::MakeProgramShape({vector_64_}, f32_));
+  EXPECT_IS_OK(inferred_status0.status());
+  EXPECT_TRUE(ShapeUtil::Equal(f32_, inferred_status0.ValueOrDie()));
+
+  auto inferred_status1 = ShapeInference::InferConditionalShape(
+      pred_, matrix_32_48_, vector_32_,
+      ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_64_),
+      ShapeUtil::MakeProgramShape({vector_32_}, vector_64_));
+  EXPECT_IS_OK(inferred_status1.status());
+  EXPECT_TRUE(ShapeUtil::Equal(vector_64_, inferred_status1.ValueOrDie()));
+
+  auto tuple_f32_v32 = ShapeUtil::MakeTupleShape({f32_, vector_32_});
+  auto inferred_status2 = ShapeInference::InferConditionalShape(
+      pred_, matrix_32_48_, tuple_f32_v32,
+      ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_),
+      ShapeUtil::MakeProgramShape({tuple_f32_v32}, vector_32_));
+  EXPECT_IS_OK(inferred_status2.status());
+  EXPECT_TRUE(ShapeUtil::Equal(vector_32_, inferred_status2.ValueOrDie()));
+
+  auto inferred_status_error0 = ShapeInference::InferConditionalShape(
+      s32_, vector_32_, vector_64_,
+      ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+      ShapeUtil::MakeProgramShape({vector_64_}, f32_));
+  EXPECT_FALSE(inferred_status_error0.ok());
+  EXPECT_THAT(inferred_status_error0.status().error_message(),
+              HasSubstr("predicate must be a boolean"));
+
+  auto inferred_status_error1 = ShapeInference::InferConditionalShape(
+      pred_, ShapeUtil::MakeTupleShape({f32_, vector_32_}), matrix_32_48_,
+      ShapeUtil::MakeProgramShape({f32_, vector_32_}, vector_32_),
+      ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_));
+  EXPECT_FALSE(inferred_status_error1.ok());
+  EXPECT_THAT(inferred_status_error1.status().error_message(),
+              HasSubstr("true_computation must take 1 argument"));
+
+  auto inferred_status_error2 = ShapeInference::InferConditionalShape(
+      pred_, vector_32_, vector_64_,
+      ShapeUtil::MakeProgramShape({vector_64_}, f32_),
+      ShapeUtil::MakeProgramShape({vector_64_}, f32_));
+  EXPECT_FALSE(inferred_status_error2.ok());
+  EXPECT_THAT(inferred_status_error2.status().error_message(),
+              HasSubstr("true_operand must match the shape of the only "
+                        "parameter of true_computation"));
+
+  auto inferred_status_error3 = ShapeInference::InferConditionalShape(
+      pred_, matrix_32_48_, ShapeUtil::MakeTupleShape({f32_, vector_32_}),
+      ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_),
+      ShapeUtil::MakeProgramShape({f32_, vector_32_}, vector_32_));
+  EXPECT_FALSE(inferred_status_error3.ok());
+  EXPECT_THAT(inferred_status_error3.status().error_message(),
+              HasSubstr("false_computation must take 1 argument"));
+
+  auto inferred_status_error4 = ShapeInference::InferConditionalShape(
+      pred_, vector_32_, vector_64_,
+      ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+      ShapeUtil::MakeProgramShape({vector_32_}, f32_));
+  EXPECT_FALSE(inferred_status_error4.ok());
+  EXPECT_THAT(inferred_status_error4.status().error_message(),
+              HasSubstr("false_operand must match the shape of the only "
+                        "parameter of false_computation"));
+
+  auto inferred_status_error5 = ShapeInference::InferConditionalShape(
+      pred_, vector_32_, vector_64_,
+      ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+      ShapeUtil::MakeProgramShape({vector_64_}, vector_32_));
+  EXPECT_FALSE(inferred_status_error5.ok());
+  EXPECT_THAT(inferred_status_error5.status().error_message(),
+              HasSubstr("the result of true_computation and false_computation "
+                        "must have the same shape"));
+}
+
+TEST_F(ShapeInferenceTest, BadSlice) {
+  auto arg = ShapeUtil::MakeShape(F32, {4});
+  StatusOr<Shape> statusor =
+      ShapeInference::InferSliceShape(arg, {0}, {5}, {1});
+  ASSERT_FALSE(statusor.ok());
+
+  LOG(INFO) << statusor.status();
+
+  EXPECT_THAT(statusor.status().error_message(),
+              HasSubstr("less than or equal to dimension size"))
+      << statusor.status();
+  EXPECT_THAT(statusor.status().error_message(), HasSubstr("argument shape"))
+      << statusor.status();
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc
index a7539a1a11d2bbd62c780890c6730dbb212307c4..c679d401c3691b14a43ce77cbe953cd4c64a9e92 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer.cc
@@ -34,58 +34,32 @@ namespace xla {
 
 using ::tensorflow::strings::Appendf;
 
-/* static */ StatusOr<std::unique_ptr<ShapedBuffer>>
-ShapedBuffer::MakeArrayShapedBuffer(const Shape& shape,
-                                    const se::Platform* platform,
-                                    int device_ordinal,
-                                    const se::DeviceMemoryBase& buffer) {
-  if (ShapeUtil::IsTuple(shape)) {
-    return InvalidArgument("Shape must be an array: %s",
-                           ShapeUtil::HumanStringWithLayout(shape).c_str());
-  }
-  auto shaped_buffer =
-      MakeUnique<ShapedBuffer>(shape, platform, device_ordinal);
-  *shaped_buffer->mutable_shape_index_to_buffer_entry()->mutable_element({}) =
-      0;
-  *shaped_buffer->mutable_buffers() = {buffer};
-  return std::move(shaped_buffer);
-}
-
-ShapedBuffer::ShapedBuffer(const Shape& shape, const se::Platform* platform,
-                           int device_ordinal)
-    : shape_(shape),
+ShapedBuffer::ShapedBuffer(const Shape& on_host_shape,
+                           const Shape& on_device_shape,
+                           const se::Platform* platform, int device_ordinal)
+    : on_host_shape_(on_host_shape),
+      on_device_shape_(on_device_shape),
       platform_(platform),
       device_ordinal_(device_ordinal),
-      shape_index_to_buffer_entry_(shape) {}
+      buffers_(on_device_shape) {}
 
 void ShapedBuffer::clear() {
-  for (se::DeviceMemoryBase& memory_base : buffers_) {
+  for (auto& pair : buffers_) {
     // A default constructed DeviceMemoryBase is a null pointer.
-    memory_base = se::DeviceMemoryBase();
+    pair.second = se::DeviceMemoryBase();
   }
 }
 
-void ShapedBuffer::AddBufferAtIndex(
-    const perftools::gputools::DeviceMemoryBase& buffer,
-    const ShapeIndex& shape_index) {
-  *mutable_shape_index_to_buffer_entry()->mutable_element(shape_index) =
-      buffers().size();
-  mutable_buffers()->push_back(buffer);
-}
-
-const se::DeviceMemoryBase& ShapedBuffer::buffer(
-    const ShapeIndex& index) const {
-  return buffers_[shape_index_to_buffer_entry_.element(index)];
-}
-
-se::DeviceMemoryBase* ShapedBuffer::mutable_buffer(const ShapeIndex& index) {
-  return &buffers_[shape_index_to_buffer_entry_.element(index)];
-}
-
 string ShapedBuffer::ToString() const {
-  string s = "ShapedBuffer(" + platform_->Name() + "):\n";
+  string s = tensorflow::strings::StrCat(
+      "ShapedBuffer(", platform_->Name(), ":", device_ordinal(),
+      "), on-host shape=" + ShapeUtil::HumanStringWithLayout(on_host_shape()),
+      ", on-device shape=" +
+          ShapeUtil::HumanStringWithLayout(on_device_shape()),
+      ":\n");
   ShapeUtil::ForEachSubshape(
-      shape(), [this, &s](const Shape& subshape, const ShapeIndex& index) {
+      on_device_shape(),
+      [this, &s](const Shape& subshape, const ShapeIndex& index) {
         string shape_str;
         if (ShapeUtil::IsTuple(subshape)) {
           shape_str = "tuple";
@@ -105,53 +79,24 @@ std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer) {
   return out;
 }
 
-/* static */ StatusOr<std::unique_ptr<ScopedShapedBuffer>>
-ScopedShapedBuffer::Allocate(
-    const Shape& shape, DeviceMemoryAllocator* allocator, int device_ordinal,
-    const std::function<int64(const Shape&)>& shape_size_fn) {
-  if (!LayoutUtil::HasLayout(shape)) {
-    return InvalidArgument("Shape must have a layout: %s",
-                           ShapeUtil::HumanStringWithLayout(shape).c_str());
-  }
-  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(shape));
-  auto shaped_buffer =
-      WrapUnique(new ScopedShapedBuffer(shape, allocator, device_ordinal));
-
-  // Allocate an appropriate sized buffer for each element in the shape
-  // including the tuple pointer arrays.
-  for (auto& pair : shaped_buffer->shape_index_to_buffer_entry_) {
-    const ShapeIndex& index = pair.first;
-    size_t& buffer_entry = pair.second;
-    TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase memory_base,
-                        shaped_buffer->allocator_->Allocate(
-                            shaped_buffer->device_ordinal(),
-                            shape_size_fn(ShapeUtil::GetSubshape(
-                                shaped_buffer->shape(), index))));
-    shaped_buffer->buffers_.push_back(memory_base);
-    buffer_entry = shaped_buffer->buffers_.size() - 1;
-  }
-
-  return std::move(shaped_buffer);
-}
-
 /* static */
 StatusOr<std::unique_ptr<ScopedShapedBuffer>> ScopedShapedBuffer::MakeScoped(
     ShapedBuffer* shaped_buffer, DeviceMemoryAllocator* allocator) {
   auto scoped_buffer = WrapUnique(new ScopedShapedBuffer(
-      shaped_buffer->shape(), allocator, shaped_buffer->device_ordinal()));
+      shaped_buffer->on_host_shape(), shaped_buffer->on_device_shape(),
+      allocator, shaped_buffer->device_ordinal()));
   scoped_buffer->buffers_ = shaped_buffer->buffers();
-  scoped_buffer->shape_index_to_buffer_entry_ =
-      shaped_buffer->shape_index_to_buffer_entry();
-
   shaped_buffer->clear();
 
   return std::move(scoped_buffer);
 }
 
-ScopedShapedBuffer::ScopedShapedBuffer(const Shape& shape,
+ScopedShapedBuffer::ScopedShapedBuffer(const Shape& on_host_shape,
+                                       const Shape& on_device_shape,
                                        DeviceMemoryAllocator* allocator,
                                        int device_ordinal)
-    : ShapedBuffer(shape, allocator->platform(), device_ordinal),
+    : ShapedBuffer(on_host_shape, on_device_shape, allocator->platform(),
+                   device_ordinal),
       allocator_(allocator) {}
 
 ScopedShapedBuffer::~ScopedShapedBuffer() {
@@ -159,7 +104,8 @@ ScopedShapedBuffer::~ScopedShapedBuffer() {
   // in the shape (eg, a tuple with a repeated element) so keep track of what
   // has been deallocated.
   std::set<void*> deallocated_opaques;
-  for (se::DeviceMemoryBase& memory_base : buffers_) {
+  for (auto& pair : buffers_) {
+    se::DeviceMemoryBase& memory_base = pair.second;
     if (!memory_base.is_null() &&
         deallocated_opaques.count(memory_base.opaque()) == 0) {
       deallocated_opaques.insert(memory_base.opaque());
@@ -170,13 +116,10 @@ ScopedShapedBuffer::~ScopedShapedBuffer() {
 }
 
 std::unique_ptr<ShapedBuffer> ScopedShapedBuffer::release() {
-  auto shaped_buffer =
-      MakeUnique<ShapedBuffer>(shape(), platform(), device_ordinal());
-
-  *shaped_buffer->mutable_buffers() = buffers();
-  *shaped_buffer->mutable_shape_index_to_buffer_entry() =
-      shape_index_to_buffer_entry();
+  auto shaped_buffer = MakeUnique<ShapedBuffer>(
+      on_host_shape(), on_device_shape(), platform(), device_ordinal());
 
+  shaped_buffer->buffers() = buffers();
   clear();
 
   return shaped_buffer;
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h
index fa88caa13ff734995e8ab0925f17d0d3c26b8fda..d397e47d2ca734458c7dc99baa5c81b16d0fd72b 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.h
+++ b/tensorflow/compiler/xla/service/shaped_buffer.h
@@ -31,61 +31,68 @@ limitations under the License.
 namespace xla {
 
 // Class which encapsulates a buffer or set of buffers containing data of a
-// particular XLA shape. Used for zero-copy execution interface for a
-// XLA client running in the same process as the service (LocalClient),
+// particular XLA shape.
 class ShapedBuffer {
  public:
-  // Convenience method which creates a ShapedBuffer of array shape (not a
-  // tuple). Its single buffer pointer is set to the given value "buffer". The
-  // given buffer must be large enough to store the given shape as given by
-  // ShapeUtil::ByteSizeOf.
-  static StatusOr<std::unique_ptr<ShapedBuffer>> MakeArrayShapedBuffer(
-      const Shape& shape, const perftools::gputools::Platform* platform,
-      int device_ordinal, const perftools::gputools::DeviceMemoryBase& buffer);
-
-  ShapedBuffer(const Shape& shape,
+  // Construct a ShapedBuffer with null DeviceMemoryBases at each index. The
+  // shape of the data on the host and the device may differ because the device
+  // may have a different representation for different data types. Therefore,
+  // both the on-host and on-device shape are required. The on-device shape
+  // determines the number of device allocations (DeviceMemoryBase) held by the
+  // ShapedBuffer.
+  ShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape,
                const perftools::gputools::Platform* platform,
                int device_ordinal);
 
-  const Shape& shape() const { return shape_; }
+  // Returns the shape of the on-host representation of the data held by this
+  // ShapedBuffer.
+  const Shape& on_host_shape() const { return on_host_shape_; }
+
+  // Returns the shape of the on-device representation of the data held by this
+  // ShapedBuffer.
+  const Shape& on_device_shape() const { return on_device_shape_; }
+
   const perftools::gputools::Platform* platform() const { return platform_; }
   int device_ordinal() const { return device_ordinal_; }
 
+  // Return the root buffer of the shape (shape index {}).
+  const perftools::gputools::DeviceMemoryBase& root_buffer() const {
+    return buffer(/*index=*/{});
+  }
+
   // Returns the buffer at the given shape index where index is defined as in
   // ShapeUtil::GetSubshape.
   const perftools::gputools::DeviceMemoryBase& buffer(
-      const ShapeIndex& index) const;
-  perftools::gputools::DeviceMemoryBase* mutable_buffer(
-      const ShapeIndex& index);
-
-  // Returns the underlying structure which stores the buffer pointers.
-  const std::vector<perftools::gputools::DeviceMemoryBase>& buffers() const {
-    return buffers_;
+      const ShapeIndex& index) const {
+    return buffers_.element(index);
   }
-  std::vector<perftools::gputools::DeviceMemoryBase>* mutable_buffers() {
-    return &buffers_;
+
+  // Sets the device memory buffer at the given index.
+  void set_buffer(const perftools::gputools::DeviceMemoryBase& buffer,
+                  const ShapeIndex& index) {
+    *buffers_.mutable_element(index) = buffer;
   }
 
-  // Returns the tree of indices which map to buffer pointers.
-  const ShapeTree<size_t>& shape_index_to_buffer_entry() const {
-    return shape_index_to_buffer_entry_;
+  // Returns the underlying ShapeTree containing all the device addresses in the
+  // ShapedBuffer.
+  const ShapeTree<perftools::gputools::DeviceMemoryBase>& buffers() const {
+    return buffers_;
   }
-  ShapeTree<size_t>* mutable_shape_index_to_buffer_entry() {
-    return &shape_index_to_buffer_entry_;
+  ShapeTree<perftools::gputools::DeviceMemoryBase>& buffers() {
+    return buffers_;
   }
 
   // Set all device memory pointers in the object to null.
   void clear();
 
-  // Adds a new buffer at the given shape index.
-  void AddBufferAtIndex(const perftools::gputools::DeviceMemoryBase& buffer,
-                        const ShapeIndex& shape_index);
-
   string ToString() const;
 
  protected:
-  // The shape of the device buffer with layout.
-  const Shape shape_;
+  // The shape of the data when represented on the host.
+  const Shape on_host_shape_;
+
+  // The shape of the data on the device.
+  const Shape on_device_shape_;
 
   // The platform the memory is allocated on.
   const perftools::gputools::Platform* platform_;
@@ -93,14 +100,8 @@ class ShapedBuffer {
   // The device the memory is allocated on.
   const int device_ordinal_;
 
-  // The list of DeviceMemoryBase pointers representing this shape.
-  // Note that there can be a many to one relationship between tuple elements
-  // and buffers.  To account for this, shape_index_to_buffer_entry_ allows us
-  // to make from a position in a shape to an index into this list.
-  std::vector<perftools::gputools::DeviceMemoryBase> buffers_;
-
-  // The tree of indices into buffers_.
-  ShapeTree<size_t> shape_index_to_buffer_entry_;
+  // The tree of device buffers. Its shape is on_device_shape().
+  ShapeTree<perftools::gputools::DeviceMemoryBase> buffers_;
 };
 
 std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer);
@@ -110,20 +111,16 @@ std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer);
 // destructed.
 class ScopedShapedBuffer : public ShapedBuffer {
  public:
-  // Return a newly allocated ScopedShapedBuffer of an arbitrary shape. Array
-  // buffers (leaves in the shape) are allocated and uninitialized. Tuple
-  // buffers (if any) are allocated and initialized to the backend-specific
-  // representation of an array of pointers to the tuple elements.
-  static StatusOr<std::unique_ptr<ScopedShapedBuffer>> Allocate(
-      const Shape& shape, DeviceMemoryAllocator* allocator, int device_ordinal,
-      const std::function<int64(const Shape&)>& shape_size_fn);
-
   // Takes a ShapedBuffer and returns a ScopedShapedBuffer which manages the
   // deallocation of the device memory held in the shaped buffer. All device
   // memory pointers in the given ShapedBuffer are set to null.
   static StatusOr<std::unique_ptr<ScopedShapedBuffer>> MakeScoped(
       ShapedBuffer* shaped_buffer, DeviceMemoryAllocator* allocator);
 
+  // Create a ScopedShapedBuffer with null DeviceMemoryBases at each index.
+  ScopedShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape,
+                     DeviceMemoryAllocator* allocator, int device_ordinal);
+
   // Return the allocator used to allocate the device memory held in this
   // ScopedShapedBuffer.
   DeviceMemoryAllocator* memory_allocator() const { return allocator_; }
@@ -138,8 +135,6 @@ class ScopedShapedBuffer : public ShapedBuffer {
   virtual ~ScopedShapedBuffer();
 
  protected:
-  ScopedShapedBuffer(const Shape& shape, DeviceMemoryAllocator* allocator,
-                     int device_ordinal);
   ScopedShapedBuffer(const ScopedShapedBuffer&) = delete;
   void operator=(const ScopedShapedBuffer&) = delete;
 
diff --git a/tensorflow/compiler/xla/service/source_map_util.cc b/tensorflow/compiler/xla/service/source_map_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8cbaac7b3760717bcacb57adc8782a5755c0aa6d
--- /dev/null
+++ b/tensorflow/compiler/xla/service/source_map_util.cc
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/source_map_util.h"
+
+#include "tensorflow/compiler/xla/util.h"
+
+namespace xla {
+namespace source_map_util {
+namespace {
+
+Status InvalidParameterArgumentV(const OpMetadata& op_metadata,
+                                 const char* format, va_list args) {
+  string message;
+  tensorflow::strings::Appendv(&message, format, args);
+  if (!op_metadata.source_file().empty()) {
+    tensorflow::strings::Appendf(&message, " (%s:%d)",
+                                 op_metadata.source_file().c_str(),
+                                 op_metadata.source_line());
+  }
+  return InvalidArgument("%s", message.c_str());
+}
+
+}  // namespace
+
+Status InvalidParameterArgument(const OpMetadata& op_metadata,
+                                const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  Status result = InvalidParameterArgumentV(op_metadata, format, args);
+  va_end(args);
+  return result;
+}
+
+Status InvalidParameterArgument(Executable* executable, int parameter_number,
+                                const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  if (executable != nullptr && executable->has_module()) {
+    const HloModule& module = executable->module();
+    const HloComputation& computation = *module.entry_computation();
+    HloInstruction* param = computation.parameter_instruction(parameter_number);
+    const OpMetadata& metadata = param->metadata();
+    Status result = InvalidParameterArgumentV(metadata, format, args);
+    va_end(args);
+    return result;
+  }
+  Status result = InvalidArgumentV(format, args);
+  va_end(args);
+  return result;
+}
+
+}  // namespace source_map_util
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/source_map_util.h b/tensorflow/compiler/xla/service/source_map_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..a776d745f4e56ca4f3d2480740259832bbc85011
--- /dev/null
+++ b/tensorflow/compiler/xla/service/source_map_util.h
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SOURCE_MAP_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_SOURCE_MAP_UTIL_H_
+
+#include "tensorflow/compiler/xla/service/executable.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace xla {
+namespace source_map_util {
+
+// Creates an INVALID_ARUGMENT status with the given format string.
+//
+// Also, attempts to extract the OpMetadata for parameter_number on executable
+// and append it to the status message for source mapping to user code.
+//
+// executable may be nullptr, but parameter_number should not be out of bounds
+// or a CHECK-failure may occur.
+Status InvalidParameterArgument(Executable* executable, int parameter_number,
+                                const char* format, ...)
+    TF_PRINTF_ATTRIBUTE(3, 4);
+
+// As above, but takes the parameter metadata directly instead of extracting it
+// from the executable.
+Status InvalidParameterArgument(const OpMetadata& op_metadata,
+                                const char* format, ...)
+    TF_PRINTF_ATTRIBUTE(2, 3);
+
+}  // namespace source_map_util
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SOURCE_MAP_UTIL_H_
diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc
index d5f53ad56fb019d0ae7c27fc28706f05614ece68..2f36e2b16e0f2eed10aef811dd3cceeba6a5b8a9 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/transfer_manager.cc
@@ -40,6 +40,45 @@ TransferManager::GetPlatformTransferManagers() {
   return r;
 }
 
+Status TransferManager::TransferArrayToDevice(
+    perftools::gputools::StreamExecutor* executor, const Literal& literal,
+    const perftools::gputools::DeviceMemoryBase& dest) {
+  const Shape on_device_shape = HostShapeToDeviceShape(literal.shape());
+  TF_RET_CHECK(ShapeUtil::IsArray(on_device_shape))
+      << "On-device representation of "
+      << ShapeUtil::HumanString(literal.shape())
+      << " is not an array: " << ShapeUtil::HumanString(on_device_shape);
+  if (dest.size() < GetByteSizeRequirement(on_device_shape)) {
+    return FailedPrecondition(
+        "Allocation on device not large enough for array: "
+        "%lld < %lld",
+        dest.size(), GetByteSizeRequirement(on_device_shape));
+  }
+  ShapedBuffer shaped_buffer(/*on_host_shape=*/literal.shape(), on_device_shape,
+                             executor->platform(), executor->device_ordinal());
+  shaped_buffer.set_buffer(dest, /*index=*/{});
+  return TransferLiteralToDevice(executor, literal, shaped_buffer);
+}
+
+StatusOr<std::unique_ptr<Literal>> TransferManager::TransferArrayFromDevice(
+    perftools::gputools::StreamExecutor* executor, const Shape& shape,
+    const perftools::gputools::DeviceMemoryBase& source) {
+  TF_RET_CHECK(ShapeUtil::Equal(HostShapeToDeviceShape(shape), shape))
+      << "Shape " << ShapeUtil::HumanString(shape)
+      << " has a differently shaped representation on-device: "
+      << ShapeUtil::HumanString(HostShapeToDeviceShape(shape));
+  if (source.size() < GetByteSizeRequirement(shape)) {
+    return FailedPrecondition(
+        "Allocation on device not large enough for array: "
+        "%lld < %lld",
+        source.size(), GetByteSizeRequirement(shape));
+  }
+  ShapedBuffer shaped_buffer(/*on_host_shape=*/shape, shape,
+                             executor->platform(), executor->device_ordinal());
+  shaped_buffer.set_buffer(source, /*index=*/{});
+  return TransferLiteralFromDevice(executor, shaped_buffer);
+}
+
 /* static */ void TransferManager::RegisterTransferManager(
     se::Platform::Id platform_id,
     TransferManagerCreationFunction creation_function) {
@@ -75,14 +114,12 @@ TransferManager::GetPlatformTransferManagers() {
 Status TransferManager::WriteTupleIndexTables(
     perftools::gputools::StreamExecutor* executor,
     const ShapedBuffer& device_buffer) {
-  VLOG(2) << "Writing tuple index tables to ShapedBuffer rooted at "
-          << device_buffer.buffer(/*index=*/{}).opaque()
-          << "; shape: " << ShapeUtil::HumanString(device_buffer.shape());
+  VLOG(2) << "Writing tuple index tables for " << device_buffer;
 
   TF_RET_CHECK(executor->device_ordinal() == device_buffer.device_ordinal());
 
   return ShapeUtil::ForEachSubshapeWithStatus(
-      device_buffer.shape(),
+      device_buffer.on_device_shape(),
       [&](const Shape& device_subshape, const ShapeIndex& index) -> Status {
         if (ShapeUtil::IsTuple(device_subshape)) {
           se::DeviceMemoryBase device_memory = device_buffer.buffer(index);
@@ -97,7 +134,7 @@ Status TransferManager::WriteTupleIndexTables(
             elements.push_back(device_buffer.buffer(element_index));
             element_index.pop_back();
           }
-          return WriteTuplePointersToDevice(executor, elements, device_subshape,
+          return WriteSingleTupleIndexTable(executor, elements, device_subshape,
                                             &device_memory);
         }
 
@@ -143,31 +180,43 @@ Status TransferManager::TransferBufferToDevice(
   return Status::OK();
 }
 
-StatusOr<std::set<se::DeviceMemoryBase>>
-TransferManager::GatherBufferPointersFromTuple(
-    se::StreamExecutor* executor, const se::DeviceMemoryBase& source,
-    const Shape& shape) {
-  TF_RET_CHECK(ShapeUtil::IsTuple(shape));
-
-  std::set<se::DeviceMemoryBase> buffer_pointers;
-  buffer_pointers.insert(source);
-
-  TF_ASSIGN_OR_RETURN(std::vector<se::DeviceMemoryBase> tuple_elements,
-                      ShallowCopyTupleFromDevice(executor, source, shape));
-  for (auto i = 0; i < tuple_elements.size(); ++i) {
-    const Shape& element_shape = shape.tuple_shapes(i);
-    if (ShapeUtil::IsTuple(element_shape)) {
-      TF_ASSIGN_OR_RETURN(
-          std::set<se::DeviceMemoryBase> buffer_pointers_in_element,
-          GatherBufferPointersFromTuple(executor, tuple_elements[i],
-                                        element_shape));
-      buffer_pointers.insert(buffer_pointers_in_element.begin(),
-                             buffer_pointers_in_element.end());
-    } else {
-      buffer_pointers.insert(tuple_elements[i]);
-    }
+StatusOr<std::unique_ptr<ShapedBuffer>> TransferManager::AllocateShapedBuffer(
+    const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
+    int device_ordinal) {
+  if (!LayoutUtil::HasLayout(on_host_shape)) {
+    return InvalidArgument(
+        "Shape must have a layout: %s",
+        ShapeUtil::HumanStringWithLayout(on_host_shape).c_str());
+  }
+  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(on_host_shape));
+  const Shape on_device_shape = HostShapeToDeviceShape(on_host_shape);
+  TF_RET_CHECK(LayoutUtil::HasLayout(on_device_shape));
+
+  auto shaped_buffer = WrapUnique(new ShapedBuffer(
+      on_host_shape, on_device_shape, allocator->platform(), device_ordinal));
+
+  // Allocate an appropriate sized buffer for each element in the shape
+  // including the tuple pointer arrays.
+  for (auto& pair : shaped_buffer->buffers()) {
+    const ShapeIndex& index = pair.first;
+    se::DeviceMemoryBase& memory_base = pair.second;
+    const Shape& subshape = ShapeUtil::GetSubshape(on_device_shape, index);
+    TF_ASSIGN_OR_RETURN(memory_base,
+                        allocator->Allocate(shaped_buffer->device_ordinal(),
+                                            GetByteSizeRequirement(subshape)));
   }
-  return std::move(buffer_pointers);
+
+  return std::move(shaped_buffer);
+}
+
+StatusOr<std::unique_ptr<ScopedShapedBuffer>>
+TransferManager::AllocateScopedShapedBuffer(const Shape& on_host_shape,
+                                            DeviceMemoryAllocator* allocator,
+                                            int device_ordinal) {
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<ShapedBuffer> unscoped_buffer,
+      AllocateShapedBuffer(on_host_shape, allocator, device_ordinal));
+  return ScopedShapedBuffer::MakeScoped(unscoped_buffer.get(), allocator);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h
index fdc123e54eb7f754c12510bef551b98da01b585d..9f2b5c4aecf0b52f610171e0c2755de577b2bd9e 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@@ -44,55 +44,47 @@ class TransferManager {
   // Returns the ID of the platform that this transfer manager acts on.
   virtual perftools::gputools::Platform::Id PlatformId() const = 0;
 
-  // Transfers the region into the provided literal using the provided
-  // executor. device_shape is the shape, including layout, of the data on the
-  // device, while literal_shape will be the shape for the literal. device_shape
-  // and literal_shape must be compatible, but need not have the same layout.
-  // TODO(b/66694934): Remove TransferLiteral* methods which accept bare
-  // DeviceMemoryBase.
-  virtual Status TransferLiteralFromDevice(
-      perftools::gputools::StreamExecutor* executor,
-      const perftools::gputools::DeviceMemoryBase& region,
-      const Shape& device_shape, const Shape& literal_shape,
-      Literal* literal) = 0;
-
-  // Transfers the given literal into the provided region output parameter,
-  // using the given executor.
-  virtual Status TransferLiteralToDevice(
-      perftools::gputools::StreamExecutor* executor, const Literal& literal,
-      perftools::gputools::DeviceMemoryBase* region) = 0;
-
-  // Transfers the data held in the given ShapedBuffer into the provided literal
-  // using the provided executor. literal_shape will be the shape for the
-  // literal. The shape of the ShapedBuffer and literal_shape must be
-  // compatible, but need not have the same layout.
+  // Returns the shape of the on-device representation for the given shape on
+  // the host. This is intended for use with ShapedBuffer where buffers are
+  // pre-allocated by the host, e.g. TransferLiteralToDevice, without the user
+  // needing to consider device-specific behaviors.
+  virtual Shape HostShapeToDeviceShape(const Shape& host_shape) const {
+    return host_shape;
+  }
+
+  // Returns a literal containing the data held in the given ShapedBuffer.
+  // using the provided executor. The optional literal_shape will be the shape
+  // for the literal. The shape of the ShapedBuffer and
+  // DeviceShape(literal_shape) must be compatible, but need not have the same
+  // layout.
   virtual StatusOr<std::unique_ptr<Literal>> TransferLiteralFromDevice(
       perftools::gputools::StreamExecutor* executor,
       const ShapedBuffer& device_buffer) = 0;
 
   // Transfers the given literal into the previously allocated device memory
-  // represented by the given ShapedBuffer using the given executor.
+  // represented by the given ShapedBuffer using the given executor. The shape
+  // of the ShapedBuffer and DeviceShape(literal.shape()) must be compatible,
+  // but need not have the same layout
   virtual Status TransferLiteralToDevice(
       perftools::gputools::StreamExecutor* executor, const Literal& literal,
       const ShapedBuffer& device_buffer) = 0;
 
+  // Convenience methods for transferring an array to or from the device at a
+  // known address. This avoids having to construct a ShapedBuffer just to
+  // transfer an array at a known address.
+  Status TransferArrayToDevice(
+      perftools::gputools::StreamExecutor* executor, const Literal& literal,
+      const perftools::gputools::DeviceMemoryBase& dest);
+  StatusOr<std::unique_ptr<Literal>> TransferArrayFromDevice(
+      perftools::gputools::StreamExecutor* executor, const Shape& shape,
+      const perftools::gputools::DeviceMemoryBase& source);
+
   // Transfers the given literal into the Infeed interface of the device,
   // using the given executor.
   virtual Status TransferLiteralToInfeed(
       perftools::gputools::StreamExecutor* executor,
       const Literal& literal) = 0;
 
-  // Transfer a memory block of the given size from 'source' buffer to the
-  // Infeed interface of the device using the given executor.
-  //
-  // size is the size to transfer from source in bytes.
-  //
-  // source is the source data that must be in the target-dependent layout that
-  // the Infeed HLO used in the computation expects.
-  virtual Status TransferBufferToInfeed(
-      perftools::gputools::StreamExecutor* executor, int64 size,
-      const void* source) = 0;
-
   // Transfers the given literal from the Outfeed interface of the device,
   // using the given executor.
   virtual Status TransferLiteralFromOutfeed(
@@ -104,37 +96,26 @@ class TransferManager {
       tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
           executor) = 0;
 
-  // Shallow copy a tuple from the device and create a DeviceMemoryBase object
-  // for each element in the tuple. A DeviceMemoryBase object refers to the
-  // buffer containing the data of that element. The DeviceMemoryBase objects
-  // are returned as a vector.
-  virtual StatusOr<std::vector<perftools::gputools::DeviceMemoryBase>>
-  ShallowCopyTupleFromDevice(
-      perftools::gputools::StreamExecutor* executor,
-      const perftools::gputools::DeviceMemoryBase& source,
-      const Shape& shape) = 0;
-
   // Given an allocated ShapedBuffer, constructs the tuple index table(s) in
   // each buffer of the given ShapedBuffer corresponding to tuple shapes. If the
   // ShapedBuffer is array-shaped this method does nothing.
   Status WriteTupleIndexTables(perftools::gputools::StreamExecutor* executor,
                                const ShapedBuffer& device_buffer);
 
-  // Returns all buffer pointers that the tuple `source` refers to. Unlike
-  // ShallowCopyTupleFromDevice, this function gather buffer pointers in nested
-  // tuples as well. Also, the returned DeviceMemoryBase objects are
-  // deduplicated.
-  StatusOr<std::set<perftools::gputools::DeviceMemoryBase>>
-  GatherBufferPointersFromTuple(
-      perftools::gputools::StreamExecutor* executor,
-      const perftools::gputools::DeviceMemoryBase& source, const Shape& shape);
-
   // Determines the byte size requirement for the given shape on the underlying
   // architecture. This will be used to allocate an appropriately sized memory
   // region for a host-to-device transfer.
   virtual int64 GetByteSizeRequirement(const Shape& shape) const = 0;
 
-  typedef std::unique_ptr<TransferManager> (*TransferManagerCreationFunction)();
+  // Allocate a ShapedBuffer which can hold data with the given on-host
+  // shape. The on-device shape may be different as indicated by
+  // HostShapeToDeviceShape.
+  StatusOr<std::unique_ptr<ShapedBuffer>> AllocateShapedBuffer(
+      const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
+      int device_ordinal);
+  StatusOr<std::unique_ptr<ScopedShapedBuffer>> AllocateScopedShapedBuffer(
+      const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
+      int device_ordinal);
 
   /////
   // The TransferManager class also serves as a point to register objects for
@@ -144,6 +125,7 @@ class TransferManager {
   // assumed to be a singleton, so no ownership is transferred.
   //
   // Precondition: a platform kind must not be registered more than once.
+  typedef std::unique_ptr<TransferManager> (*TransferManagerCreationFunction)();
   static void RegisterTransferManager(
       perftools::gputools::Platform::Id platform_id,
       TransferManagerCreationFunction transfer_manager);
@@ -154,6 +136,17 @@ class TransferManager {
       const perftools::gputools::Platform* platform);
 
  protected:
+  // Transfer a memory block of the given size from 'source' buffer to the
+  // Infeed interface of the device using the given executor.
+  //
+  // size is the size to transfer from source in bytes.
+  //
+  // source is the source data that must be in the target-dependent layout that
+  // the Infeed HLO used in the computation expects.
+  virtual Status TransferBufferToInfeed(
+      perftools::gputools::StreamExecutor* executor, int64 size,
+      const void* source) = 0;
+
   // Transfer a memory block of the given size from the device source into the
   // 'destination' buffer.
   //
@@ -172,10 +165,9 @@ class TransferManager {
       const void* source, perftools::gputools::DeviceMemoryBase* destination);
 
   // Writes the given device-memory pointers in 'elements' to the given region
-  // to construct a tuple in the platform-specific tuple representation. This
-  // can handle nested tuples as well. In the nested case, the element
-  // DeviceMemoryBase points to another array of pointers on the device.
-  virtual Status WriteTuplePointersToDevice(
+  // to construct a tuple index table in the platform-specific tuple
+  // representation.
+  virtual Status WriteSingleTupleIndexTable(
       perftools::gputools::StreamExecutor* executor,
       tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
           elements,
diff --git a/tensorflow/compiler/xla/service/transpose_folding.cc b/tensorflow/compiler/xla/service/transpose_folding.cc
index fb55d4e5433ce666a061256691ea08ee56fde396..83185ac49e9b7c386d10d1cbc4e20dcdfdfd6cae 100644
--- a/tensorflow/compiler/xla/service/transpose_folding.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding.cc
@@ -42,7 +42,7 @@ TransposeFolding::OperandIndices CanFoldOperandsIntoDot(
   TransposeFolding::OperandIndices operand_set;
   for (int64 i = 0; i < dot.operand_count(); ++i) {
     auto& operand = *dot.operand(i);
-    if (operand.IsRank2Transpose() && operand.user_count() == 1) {
+    if (operand.IsRank2Transpose()) {
       operand_set.push_back(i);
     }
   }
@@ -61,8 +61,7 @@ TransposeFolding::OperandIndices CanFoldOperandsIntoConvolution(
   TransposeFolding::OperandIndices operand_set;
   for (int64 i = 0; i < convolution.operand_count(); ++i) {
     auto& operand = *convolution.operand(i);
-    if (operand.opcode() == HloOpcode::kTranspose &&
-        operand.user_count() == 1) {
+    if (operand.opcode() == HloOpcode::kTranspose) {
       operand_set.push_back(i);
     }
   }
@@ -102,6 +101,10 @@ bool FoldTransposeIntoConvolution(InstructionOperandsPair pair) {
   auto& convolution = *pair.first;
   auto& operand_indices = pair.second;
 
+  if (operand_indices.empty()) {
+    return false;
+  }
+
   const ConvolutionDimensionNumbers& dnums =
       convolution.convolution_dimension_numbers();
   ConvolutionDimensionNumbers new_dnums = dnums;
@@ -121,8 +124,9 @@ bool FoldTransposeIntoConvolution(InstructionOperandsPair pair) {
         transpose_dimensions[dnums.input_batch_dimension()]);
     new_dnums.set_input_feature_dimension(
         transpose_dimensions[dnums.input_feature_dimension()]);
-    for (const auto& spatial_dimension : dnums.input_spatial_dimensions()) {
-      CHECK_EQ(spatial_dimension, transpose_dimensions[spatial_dimension]);
+    for (auto& input_spatial_dimension :
+         *new_dnums.mutable_input_spatial_dimensions()) {
+      input_spatial_dimension = transpose_dimensions[input_spatial_dimension];
     }
     new_lhs = &transpose_operand;
   } else {
diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc
index 6ac32e88f1f4af4743990daecd6c1f66a4e32763..caa1a111ad880b9dee62c1c94e32e8275c196fbf 100644
--- a/tensorflow/compiler/xla/service/transpose_folding_test.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc
@@ -64,9 +64,12 @@ TEST_F(TransposeFoldingTest, FoldDotTranspose) {
   HloInstruction* transpose_y =
       builder.AddInstruction(HloInstruction::CreateTranspose(
           ShapeUtil::MakeShape(F32, {3, 2}), y, {1, 0}));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {2, 2}), /*opcode=*/HloOpcode::kDot,
-      /*lhs=*/x, /*rhs=*/transpose_y));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  HloInstruction* dot = builder.AddInstruction(
+      HloInstruction::CreateDot(ShapeUtil::MakeShape(F32, {2, 2}), /*lhs=*/x,
+                                /*rhs=*/transpose_y, dot_dnums));
 
   HloModule module("test_module");
   HloComputation* entry_computation =
@@ -104,9 +107,12 @@ TEST_F(TransposeFoldingTest, FoldDotTransposeConstant) {
   HloInstruction* transpose1 =
       builder.AddInstruction(HloInstruction::CreateTranspose(
           ShapeUtil::MakeShape(F32, {2, 3}), const1, {1, 0}));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {1, 3}), /*opcode=*/HloOpcode::kDot,
-      /*lhs=*/transpose0, /*rhs=*/transpose1));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateDot(
+      ShapeUtil::MakeShape(F32, {1, 3}),
+      /*lhs=*/transpose0, /*rhs=*/transpose1, dot_dnums));
 
   HloModule module("test_module");
   HloComputation* entry_computation =
@@ -169,9 +175,12 @@ TEST_F(TransposeFoldingTest, FoldDotTransposeInWhile) {
   HloInstruction* transpose_y =
       builder.AddInstruction(HloInstruction::CreateTranspose(
           ShapeUtil::MakeShape(F32, {3, 2}), y, {1, 0}));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {2, 2}), /*opcode=*/HloOpcode::kDot,
-      /*lhs=*/x, /*rhs=*/transpose_y));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  HloInstruction* dot = builder.AddInstruction(
+      HloInstruction::CreateDot(ShapeUtil::MakeShape(F32, {2, 2}), /*lhs=*/x,
+                                /*rhs=*/transpose_y, dot_dnums));
 
   HloModule module("test_module");
   HloComputation* entry_computation =
@@ -376,5 +385,69 @@ TEST_F(TransposeFoldingTest, FoldConvTransposeLhs) {
       new_conv->convolution_dimension_numbers().output_spatial_dimensions(1));
 }
 
+// Test that a transpose of every dimension in the activations gets folded into
+// convolution.
+TEST_F(TransposeFoldingTest, FoldConvComplexTransposeLhs) {
+  auto builder = HloComputation::Builder("entry_computation");
+  HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {3, 2, 1, 1}),
+      /*name=*/"x"));
+  HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {2, 3, 1, 1}),
+      /*name=*/"y"));
+  HloInstruction* transpose_x =
+      builder.AddInstruction(HloInstruction::CreateTranspose(
+          ShapeUtil::MakeShape(F32, {2, 3, 1, 1}), x, {1, 0, 3, 2}));
+  auto dnums = ComputationBuilder::CreateDefaultConvDimensionNumbers();
+  Window window;
+  for (int i = 0; i < 2; ++i) {
+    WindowDimension* dim = window.add_dimensions();
+    dim->set_padding_low(0);
+    dim->set_padding_high(0);
+    dim->set_base_dilation(1);
+    dim->set_window_dilation(1);
+    dim->set_stride(1);
+    dim->set_size(y->shape().dimensions(dnums.kernel_spatial_dimensions(i)));
+  }
+  StatusOr<Shape> conv_shape = ShapeInference::InferConvolveShape(
+      transpose_x->shape(), y->shape(), window, dnums);
+  EXPECT_IS_OK(conv_shape);
+  HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
+      conv_shape.ValueOrDie(), transpose_x, y, window, dnums));
+
+  HloModule module("test_module");
+  HloComputation* entry_computation =
+      module.AddEntryComputation(builder.Build(conv));
+  FoldTranspose(&module);
+
+  // Instructions after folding: x, y, and the convolution.
+  std::unordered_set<HloInstruction*> instruction_set(
+      entry_computation->instructions().begin(),
+      entry_computation->instructions().end());
+  EXPECT_EQ(1, instruction_set.erase(x)) << "x is not in entry_computation.";
+  EXPECT_EQ(1, instruction_set.erase(y)) << "y is not in entry_computation.";
+  EXPECT_EQ(1, instruction_set.size())
+      << "entry_computation should contain exactly 3 instructions.";
+  HloInstruction* new_conv = *instruction_set.begin();
+  EXPECT_EQ(HloOpcode::kConvolution, new_conv->opcode());
+  EXPECT_EQ(dnums.input_feature_dimension(),
+            new_conv->convolution_dimension_numbers().input_batch_dimension());
+  EXPECT_EQ(
+      dnums.input_batch_dimension(),
+      new_conv->convolution_dimension_numbers().input_feature_dimension());
+  EXPECT_EQ(
+      dnums.input_spatial_dimensions(0),
+      new_conv->convolution_dimension_numbers().input_spatial_dimensions(1));
+  EXPECT_EQ(
+      dnums.input_spatial_dimensions(1),
+      new_conv->convolution_dimension_numbers().input_spatial_dimensions(0));
+  EXPECT_EQ(
+      dnums.output_spatial_dimensions(0),
+      new_conv->convolution_dimension_numbers().output_spatial_dimensions(0));
+  EXPECT_EQ(
+      dnums.output_spatial_dimensions(1),
+      new_conv->convolution_dimension_numbers().output_spatial_dimensions(1));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
index 0c848566478a25d4862cb0698e029dacd71f7a6a..657a8fe09ae9df906d695f7f49df72500d611792 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
@@ -273,6 +273,16 @@ Status TuplePointsToAnalysis::HandleBitcast(HloInstruction* bitcast) {
   return Status::OK();
 }
 
+Status TuplePointsToAnalysis::HandleSlice(HloInstruction* slice) {
+  // A kSlice instruction aliases its operand if the backend lowers it to an
+  // in-place implementation.
+  if (slice->IsInPlaceSlice()) {
+    CreateCopiedPointsToSet(slice, slice->operand(0));
+    return Status::OK();
+  }
+  return DefaultAction(slice);
+}
+
 Status TuplePointsToAnalysis::HandleRecvDone(HloInstruction* recv_done) {
   // RecvDone aliases its input (Recv) tuple element {0} to its output.
   PointsToSet& points_to_set = CreateEmptyPointsToSet(recv_done);
@@ -427,10 +437,15 @@ bool TuplePointsToAnalysis::InstructionDefinesBufferAtIndex(
 
 Status TuplePointsToAnalysis::VerifyBuffer(const LogicalBuffer& buffer) const {
   if (!InstructionDefinesBufferAtIndex(buffer.instruction(), buffer.index())) {
-    return FailedPrecondition(
-        "LogicalBuffer %s is ill-defined: instruction %s does not define a "
-        "buffer at that index",
-        buffer.ToString().c_str(), buffer.instruction()->name().c_str());
+    // kSlice ops that are lowered to an in-place version are expected to not
+    // define their output buffer.
+    if (buffer.instruction()->opcode() != HloOpcode::kSlice ||
+        !buffer.instruction()->IsInPlaceSlice()) {
+      return FailedPrecondition(
+          "LogicalBuffer %s is ill-defined: instruction %s does not define a "
+          "buffer at that index",
+          buffer.ToString().c_str(), buffer.instruction()->name().c_str());
+    }
   }
 
   if (buffer.id() < 0 ||
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
index 8928de107eed8c40bbe2130e26fe83ca3802d2f6..c3743b150168ebcf1051050dc511e50c43108c4f 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
@@ -199,12 +199,10 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
   StatusOr<const LogicalBuffer*> GetBufferDefinedAt(
       const HloInstruction* instruction, const ShapeIndex& index) const;
 
-  // Return a vector containing all BufferAliases of the given logical buffer
-  // This trivially includes the BufferAlias with same instruction and index as
-  // the logical buffer itself, so the returned vector is never empty.  The
-  // buffer alias set is the inverse of the points-to set. That is,
-  // LogicalBuffer B is in the points-to set of instruction I at index N iff
-  // instruction I, index N is a BufferAlias of B.
+  // Return a (possibly empty) vector containing all BufferAliases of the given
+  // logical buffer The buffer alias set is the inverse of the points-to set.
+  // That is, LogicalBuffer B is in the points-to set of instruction I at index
+  // N iff instruction I, index N is a BufferAlias of B.
   using BufferAliasVector = tensorflow::gtl::InlinedVector<BufferAlias, 1>;
   const BufferAliasVector& GetBufferAliases(const LogicalBuffer& buffer) const;
 
@@ -250,6 +248,7 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
   Status HandleTuple(HloInstruction* tuple) override;
   Status HandleGetTupleElement(HloInstruction* get_tuple_element) override;
   Status HandleBitcast(HloInstruction* bitcast) override;
+  Status HandleSlice(HloInstruction* slice) override;
   Status HandleCopy(HloInstruction* copy) override;
   Status HandleRecvDone(HloInstruction* recv_done) override;
   Status HandleSend(HloInstruction* send) override;
diff --git a/tensorflow/compiler/xla/service/tuple_util.cc b/tensorflow/compiler/xla/service/tuple_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4a530bb0b20582b303f4af969514748b46fd5064
--- /dev/null
+++ b/tensorflow/compiler/xla/service/tuple_util.cc
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/tuple_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+
+namespace xla {
+
+/*static*/ HloInstruction* TupleUtil::ExtractPrefix(HloInstruction* input_tuple,
+                                                    int64 elements) {
+  CHECK(ShapeUtil::IsTuple(input_tuple->shape()));
+
+  HloComputation* computation = input_tuple->parent();
+  const Shape& input_shape = input_tuple->shape();
+
+  std::vector<HloInstruction*> tuple_elements;
+  tuple_elements.reserve(elements);
+  for (int i = 0; i < elements; i++) {
+    tuple_elements.push_back(
+        computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+            input_shape.tuple_shapes(i), input_tuple, i)));
+  }
+
+  return computation->AddInstruction(
+      HloInstruction::CreateTuple(tuple_elements));
+}
+
+/*static*/ HloInstruction* TupleUtil::AppendSuffix(
+    HloInstruction* input_tuple,
+    tensorflow::gtl::ArraySlice<HloInstruction*> trailing_values) {
+  CHECK(ShapeUtil::IsTuple(input_tuple->shape()));
+
+  HloComputation* computation = input_tuple->parent();
+  const Shape& input_shape = input_tuple->shape();
+  std::vector<HloInstruction*> tuple_elements;
+  tuple_elements.reserve(input_shape.tuple_shapes_size());
+  for (int i = 0; i < input_shape.tuple_shapes_size(); i++) {
+    tuple_elements.push_back(
+        computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+            input_shape.tuple_shapes(i), input_tuple, i)));
+  }
+  tuple_elements.insert(tuple_elements.end(), trailing_values.begin(),
+                        trailing_values.end());
+  return computation->AddInstruction(
+      HloInstruction::CreateTuple(tuple_elements));
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/tuple_util.h b/tensorflow/compiler/xla/service/tuple_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..e5ff9aaa8357fe8e4777d6dee37bbec72e144c06
--- /dev/null
+++ b/tensorflow/compiler/xla/service/tuple_util.h
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_TUPLE_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_TUPLE_UTIL_H_
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+
+namespace xla {
+class TupleUtil {
+ public:
+  // Generates HLO instructions to get a prefix tuple from `input_tuple` (which
+  // must be of tuple shape) of length `elements`.  Returns the root of the
+  // graph of instructions generated.
+  //
+  // The instructions are generated into the computation containing
+  // `input_tuple`.
+  static HloInstruction* ExtractPrefix(HloInstruction* input_tuple,
+                                       int64 elements);
+
+  // Generates HLO instructions to create a tuple that consists of the values in
+  // `trailing_values` appended to `input_tuple` (which must be of tuple shape).
+  // Returns the root of the graph of instructions generated.
+  //
+  // The instructions are generated into the computation containing
+  // `input_tuple`.
+  static HloInstruction* AppendSuffix(
+      HloInstruction* input_tuple,
+      tensorflow::gtl::ArraySlice<HloInstruction*> trailing_values);
+};
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_TUPLE_UTIL_H_
diff --git a/tensorflow/compiler/xla/service/tuple_util_test.cc b/tensorflow/compiler/xla/service/tuple_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..754fd8ef169231827eeb5bfd72aeb596644ca767
--- /dev/null
+++ b/tensorflow/compiler/xla/service/tuple_util_test.cc
@@ -0,0 +1,81 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/tuple_util.h"
+
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+
+namespace xla {
+namespace {
+
+namespace op = ::xla::testing::opcode_matchers;
+
+StatusOr<std::unique_ptr<HloModule>> GetParsedModule(
+    HloComputation** entry_computation, HloInstruction** param0,
+    HloInstruction** param1) {
+  const char* const hlo_string = R"(
+HloModule Module
+
+ENTRY entry {
+  p0 = (f32[32,32]{1,0},f32[32,32]{1,0},f32[32,32]{1,0}) parameter(0)
+  ROOT p1 = f32[32,32]{1,0} parameter(1)
+}
+)";
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
+                      tools::Parse(hlo_string));
+
+  *entry_computation = module->entry_computation();
+  *param0 = (*entry_computation)->parameter_instruction(0);
+  *param1 = (*entry_computation)->parameter_instruction(1);
+
+  return std::move(module);
+}
+
+TEST(TupleUtilTest, ExtractPrefix) {
+  HloInstruction *param0, *param1;
+  HloComputation* entry_computation;
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<HloModule> module,
+      GetParsedModule(&entry_computation, &param0, &param1));
+
+  HloInstruction* prefix = TupleUtil::ExtractPrefix(param0, 2);
+
+  EXPECT_THAT(prefix, op::Tuple(op::GetTupleElement(op::Parameter(0), 0),
+                                op::GetTupleElement(op::Parameter(0), 1)));
+}
+
+TEST(TupleUtilTest, AppendSuffix) {
+  HloInstruction *param0, *param1;
+  HloComputation* entry_computation;
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<HloModule> module,
+      GetParsedModule(&entry_computation, &param0, &param1));
+
+  HloInstruction* with_suffix =
+      TupleUtil::AppendSuffix(param0, {param1, param1});
+
+  EXPECT_THAT(with_suffix, op::Tuple(op::GetTupleElement(op::Parameter(0), 0),
+                                     op::GetTupleElement(op::Parameter(0), 1),
+                                     op::GetTupleElement(op::Parameter(0), 2),
+                                     op::Parameter(1), op::Parameter(1)));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc
index 4e90491b55a5688e37cbabae0843f584578add55..fead9b92362bcd1974f2dff6e030bc47dfc5aa85 100644
--- a/tensorflow/compiler/xla/service/user_computation.cc
+++ b/tensorflow/compiler/xla/service/user_computation.cc
@@ -88,8 +88,6 @@ HloOpcode BinaryOperationToHloOpcode(BinaryOperation binop) {
       return HloOpcode::kAtan2;
     case BINOP_COMPLEX:
       return HloOpcode::kComplex;
-    case BINOP_DOT:
-      return HloOpcode::kDot;
     case BINOP_MUL:
       return HloOpcode::kMultiply;
     case BINOP_ADD:
@@ -371,14 +369,6 @@ StatusOr<ComputationDataHandle> UserComputation::AddRngInstruction(
 
   // Check the number of parameters per RNG distribution.
   switch (rng_request.distribution()) {
-    case RandomDistribution::RNG_BERNOULLI:
-      if (rng_request.parameter_size() != 1) {
-        return InvalidArgument(
-            "RNG distribution (%s) expects 1 parameters, but got %d",
-            RandomDistribution_Name(rng_request.distribution()).c_str(),
-            rng_request.parameter_size());
-      }
-      break;
     case RandomDistribution::RNG_NORMAL:
     case RandomDistribution::RNG_UNIFORM:
       if (rng_request.parameter_size() != 2) {
@@ -765,6 +755,54 @@ StatusOr<ComputationDataHandle> UserComputation::AddWhileInstruction(
   return handle;
 }
 
+StatusOr<ComputationDataHandle> UserComputation::AddConditionalInstruction(
+    const ConditionalRequest& conditional_request,
+    const UserComputation& true_computation,
+    const UserComputation& false_computation) {
+  tensorflow::mutex_lock lock(mutex_);
+
+  TF_ASSIGN_OR_RETURN(const OperationRequest* pred,
+                      LookUpRequest(conditional_request.predicate()));
+  TF_ASSIGN_OR_RETURN(const OperationRequest* true_operand,
+                      LookUpRequest(conditional_request.true_operand()));
+  TF_ASSIGN_OR_RETURN(const OperationRequest* false_operand,
+                      LookUpRequest(conditional_request.false_operand()));
+
+  VersionedComputationHandle::Version true_computation_version =
+      true_computation.version();
+  TF_ASSIGN_OR_RETURN(
+      std::shared_ptr<const ProgramShape> true_computation_shape,
+      true_computation.ComputeProgramShape(true_computation_version));
+
+  VersionedComputationHandle::Version false_computation_version =
+      false_computation.version();
+  TF_ASSIGN_OR_RETURN(
+      std::shared_ptr<const ProgramShape> false_computation_shape,
+      false_computation.ComputeProgramShape(false_computation_version));
+
+  TF_ASSIGN_OR_RETURN(Shape inferred_shape,
+                      ShapeInference::InferConditionalShape(
+                          pred->output_shape(), true_operand->output_shape(),
+                          false_operand->output_shape(),
+                          *true_computation_shape, *false_computation_shape));
+
+  ComputationDataHandle handle = CreateComputationDataHandle();
+
+  OperationRequest& request =
+      (*session_computation_.mutable_requests())[handle.handle()];
+  *request.mutable_output_handle() = handle;
+  *request.mutable_output_shape() = inferred_shape;
+  request.add_embedded_computation_versions(true_computation_version);
+  request.add_embedded_computation_versions(false_computation_version);
+  *request.mutable_request()->mutable_conditional_request() =
+      conditional_request;
+
+  VLOG(1) << "AddConditionalInstruction (" << GetVersionedHandleInternal()
+          << "), data handle " << handle.handle() << ": "
+          << conditional_request.ShortDebugString();
+  return handle;
+}
+
 StatusOr<ComputationDataHandle> UserComputation::AddBroadcastInstruction(
     const BroadcastRequest& broadcast_request) {
   tensorflow::mutex_lock lock(mutex_);
@@ -1075,6 +1113,31 @@ StatusOr<ComputationDataHandle> UserComputation::AddConvolveInstruction(
   return handle;
 }
 
+StatusOr<ComputationDataHandle> UserComputation::AddFftInstruction(
+    const FftRequest& fft_request) {
+  tensorflow::mutex_lock lock(mutex_);
+
+  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
+                      LookUpRequest(fft_request.operand()));
+  TF_ASSIGN_OR_RETURN(Shape shape,
+                      ShapeInference::InferFftShape(
+                          operand->output_shape(), fft_request.fft_type(),
+                          AsInt64Slice(fft_request.fft_length())));
+
+  const ComputationDataHandle handle = CreateComputationDataHandle();
+
+  OperationRequest& request =
+      (*session_computation_.mutable_requests())[handle.handle()];
+  *request.mutable_output_handle() = handle;
+  *request.mutable_output_shape() = shape;
+  *request.mutable_request()->mutable_fft_request() = fft_request;
+
+  VLOG(1) << "AddFftInstruction (" << GetVersionedHandleInternal()
+          << "), data handle " << handle.handle() << ": "
+          << fft_request.ShortDebugString();
+  return handle;
+}
+
 StatusOr<ComputationDataHandle> UserComputation::AddCrossReplicaSumInstruction(
     const CrossReplicaSumRequest& cross_replica_sum_request) {
   tensorflow::mutex_lock lock(mutex_);
@@ -1082,7 +1145,7 @@ StatusOr<ComputationDataHandle> UserComputation::AddCrossReplicaSumInstruction(
   TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
                       LookUpRequest(cross_replica_sum_request.operand()));
   TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferCrossReplicaSumShape(
-                                       operand->output_shape()));
+                                       {&operand->output_shape()}));
 
   ComputationDataHandle handle = CreateComputationDataHandle();
 
@@ -1122,7 +1185,7 @@ StatusOr<ComputationDataHandle> UserComputation::AddInfeedInstruction(
   return handle;
 }
 
-Status UserComputation::AddOutfeedInstruction(
+StatusOr<ComputationDataHandle> UserComputation::AddOutfeedInstruction(
     const OutfeedRequest& outfeed_request) {
   tensorflow::mutex_lock lock(mutex_);
 
@@ -1134,8 +1197,6 @@ Status UserComputation::AddOutfeedInstruction(
   // Verify that operand is valid.
   TF_RETURN_IF_ERROR(LookUpRequest(outfeed_request.operand()).status());
 
-  // No handle is returned, but a handle must be assigned to this instruction
-  // for computation versioning.
   ComputationDataHandle handle = CreateComputationDataHandle();
   OperationRequest& request =
       (*session_computation_.mutable_requests())[handle.handle()];
@@ -1146,7 +1207,7 @@ Status UserComputation::AddOutfeedInstruction(
   VLOG(1) << "AddOutfeedInstruction (" << GetVersionedHandleInternal()
           << "), data handle " << handle.handle() << ": "
           << outfeed_request.ShortDebugString();
-  return Status::OK();
+  return handle;
 }
 
 StatusOr<ComputationDataHandle> UserComputation::AddCallInstruction(
@@ -1192,6 +1253,14 @@ StatusOr<ComputationDataHandle> UserComputation::AddCustomCallInstruction(
     TF_RETURN_IF_ERROR(LookUpRequest(handle).status());
   }
 
+  if (tensorflow::StringPiece(custom_call_request.call_target_name())
+          .starts_with("$")) {
+    return InvalidArgument(
+        "Invalid custom_call_target \"%s\": Call targets that start with '$' "
+        "are reserved for internal use.",
+        custom_call_request.call_target_name().c_str());
+  }
+
   const ComputationDataHandle handle = CreateComputationDataHandle();
 
   OperationRequest& request =
@@ -1207,6 +1276,33 @@ StatusOr<ComputationDataHandle> UserComputation::AddCustomCallInstruction(
   return handle;
 }
 
+StatusOr<ComputationDataHandle> UserComputation::AddDotInstruction(
+    const DotRequest& dot_request) {
+  tensorflow::mutex_lock lock(mutex_);
+
+  TF_ASSIGN_OR_RETURN(const OperationRequest* lhs,
+                      LookUpRequest(dot_request.lhs()));
+  TF_ASSIGN_OR_RETURN(const OperationRequest* rhs,
+                      LookUpRequest(dot_request.rhs()));
+
+  TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferDotOpShape(
+                                       lhs->output_shape(), rhs->output_shape(),
+                                       dot_request.dimension_numbers()));
+
+  const ComputationDataHandle handle = CreateComputationDataHandle();
+
+  OperationRequest& request =
+      (*session_computation_.mutable_requests())[handle.handle()];
+  *request.mutable_output_handle() = handle;
+  *request.mutable_output_shape() = shape;
+  *request.mutable_request()->mutable_dot_request() = dot_request;
+
+  VLOG(1) << "AddDotInstruction (" << GetVersionedHandleInternal()
+          << "), data handle " << handle.handle() << ": "
+          << dot_request.ShortDebugString();
+  return handle;
+}
+
 StatusOr<ComputationDataHandle> UserComputation::AddUnaryInstruction(
     const UnaryOpRequest& unary_request) {
   tensorflow::mutex_lock lock(mutex_);
@@ -1433,7 +1529,7 @@ StatusOr<const OperationRequest*> LookUpRequest(
   return &session_computation.requests().at(handle_value);
 }
 
-// Returns the OperationRequestion corresponding to the root (result) of the
+// Returns the OperationRequest corresponding to the root (result) of the
 // session computation.
 StatusOr<const OperationRequest*> GetRoot(
     VersionedComputationHandle::Version version,
@@ -1479,8 +1575,8 @@ UserComputation::ComputeProgramShape(
             request.request().parameter_request();
         int64 param_no = parameter_request.parameter();
         // Parameters may be out of order so expand ProgramShape parameters
-        // until
-        // it is at least large enough to hold the current parameter number.
+        // until it is at least large enough to hold the current parameter
+        // number.
         while (program_shape->parameters_size() <= param_no) {
           program_shape->add_parameters();
           program_shape->add_parameter_names();
@@ -1594,6 +1690,13 @@ void PureFunctionalVisitor(const SessionComputation& session_computation,
       break;
     }
 
+    case OpRequest::kFftRequest: {
+      const FftRequest& fft_request = request.request().fft_request();
+      PureFunctionalVisitor(session_computation, fft_request.operand(),
+                            num_parameters, visited, is_functional);
+      break;
+    }
+
     case OpRequest::kCrossReplicaSumRequest: {
       // TODO(b/33009255): Implmement constant folding for cross replica sum.
       *is_functional = false;
@@ -1629,6 +1732,15 @@ void PureFunctionalVisitor(const SessionComputation& session_computation,
       break;
     }
 
+    case OpRequest::kDotRequest: {
+      const DotRequest& dot_request = request.request().dot_request();
+      PureFunctionalVisitor(session_computation, dot_request.lhs(),
+                            num_parameters, visited, is_functional);
+      PureFunctionalVisitor(session_computation, dot_request.rhs(),
+                            num_parameters, visited, is_functional);
+      break;
+    }
+
     case OpRequest::kSendRequest: {
       *is_functional = false;
       break;
@@ -1757,6 +1869,23 @@ void PureFunctionalVisitor(const SessionComputation& session_computation,
       break;
     }
 
+    case OpRequest::kConditionalRequest: {
+      const ConditionalRequest& conditional_request =
+          request.request().conditional_request();
+      PureFunctionalVisitor(session_computation,
+                            conditional_request.predicate(), num_parameters,
+                            visited, is_functional);
+      PureFunctionalVisitor(session_computation,
+                            conditional_request.true_operand(), num_parameters,
+                            visited, is_functional);
+      PureFunctionalVisitor(session_computation,
+                            conditional_request.false_operand(), num_parameters,
+                            visited, is_functional);
+      // TODO(b/32495713): We aren't checking the true and false computations
+      // themselves.
+      break;
+    }
+
     case OpRequest::kTernaryOpRequest: {
       const TernaryOpRequest& ternary_op_request =
           request.request().ternary_op_request();
@@ -1868,6 +1997,9 @@ void PureFunctionalVisitor(const SessionComputation& session_computation,
     default:
       LOG(FATAL) << "Unexpected request type: " << request.request().op_case();
   }
+  if (!*is_functional) {
+    VLOG(1) << "Non-functional: " << request.request().DebugString();
+  }
   visited->insert(handle.handle());
 }
 
@@ -1985,6 +2117,21 @@ UserComputation::GetEmbeddedComputations(
           break;
         }
 
+        case OpRequest::kConditionalRequest: {
+          CHECK_EQ(2, request.embedded_computation_versions_size());
+          const ConditionalRequest& conditional_request =
+              request.request().conditional_request();
+          const VersionedComputationHandle true_computation_versioned_handle = {
+              conditional_request.true_computation(),
+              request.embedded_computation_versions(0)};
+          computations.push_back(true_computation_versioned_handle);
+          const VersionedComputationHandle false_computation_versioned_handle =
+              {conditional_request.false_computation(),
+               request.embedded_computation_versions(1)};
+          computations.push_back(false_computation_versioned_handle);
+          break;
+        }
+
         default:
           // No embedded computation.
           break;
@@ -2000,6 +2147,24 @@ UserComputation::GetEmbeddedComputations(
   return computations;
 }
 
+StatusOr<const OperationRequest*>
+UserComputation::LookUpRequestForErrorReporting(
+    const ComputationDataHandle& handle) const {
+  tensorflow::mutex_lock lock(mutex_);
+  return LookUpRequest(handle);
+}
+
+tensorflow::gtl::optional<const OpMetadata*> UserComputation::ParameterMetadata(
+    int parameter_number) const {
+  tensorflow::mutex_lock lock(mutex_);
+  auto it = parameters_.find(parameter_number);
+  if (it == parameters_.end()) {
+    return tensorflow::gtl::nullopt;
+  }
+  OperationRequest* op = it->second;
+  return &op->request().metadata();
+}
+
 Status UserComputation::RemapEmbeddedComputations(
     const std::map<int64, ComputationHandle>& old_to_new) {
   auto update = [&old_to_new](ComputationHandle* to_update) -> Status {
@@ -2071,6 +2236,16 @@ Status UserComputation::RemapEmbeddedComputations(
         TF_RETURN_IF_ERROR(update(while_request->mutable_body()));
         break;
       }
+      case OpRequest::kConditionalRequest: {
+        TF_RET_CHECK(2 == request.embedded_computation_versions_size());
+        ConditionalRequest* conditional_request =
+            request.mutable_request()->mutable_conditional_request();
+        TF_RETURN_IF_ERROR(
+            update(conditional_request->mutable_true_computation()));
+        TF_RETURN_IF_ERROR(
+            update(conditional_request->mutable_false_computation()));
+        break;
+      }
       default:
         // No embedded computation.
         TF_RET_CHECK(0 == request.embedded_computation_versions_size());
@@ -2274,6 +2449,12 @@ static void ForEachOperand(
       break;
     }
 
+    case OpRequest::kFftRequest: {
+      const FftRequest& fft_request = request.request().fft_request();
+      apply(fft_request.operand());
+      break;
+    }
+
     case OpRequest::kBatchNormTrainingRequest: {
       const BatchNormTrainingRequest& batch_norm_training_request =
           request.request().batch_norm_training_request();
@@ -2417,6 +2598,15 @@ static void ForEachOperand(
       break;
     }
 
+    case OpRequest::kConditionalRequest: {
+      const ConditionalRequest& conditional_request =
+          request.request().conditional_request();
+      apply(conditional_request.predicate());
+      apply(conditional_request.true_operand());
+      apply(conditional_request.false_operand());
+      break;
+    }
+
     case OpRequest::kTernaryOpRequest: {
       const TernaryOpRequest& ternary_op_request =
           request.request().ternary_op_request();
@@ -2453,6 +2643,13 @@ static void ForEachOperand(
       break;
     }
 
+    case OpRequest::kDotRequest: {
+      const DotRequest& dot_request = request.request().dot_request();
+      apply(dot_request.rhs());
+      apply(dot_request.lhs());
+      break;
+    }
+
     case OpRequest::kUnaryOpRequest: {
       const UnaryOpRequest& unary_op_request =
           request.request().unary_op_request();
@@ -2571,48 +2768,11 @@ HloComputation* ComputationLowerer::ResolveComputation(
 
 HloInstruction* ComputationLowerer::ImplicitBroadcastToExplicitBroadcast(
     HloInstruction* operand, const Shape& output_shape) {
-  CHECK(ShapeUtil::IsScalar(operand->shape()) ||
-        ShapeUtil::Rank(operand->shape()) == ShapeUtil::Rank(output_shape));
-  Shape broadcast_shape = ShapeUtil::MakeShape(
-      operand->shape().element_type(), AsInt64Slice(output_shape.dimensions()));
-  // Do explicit broadcast for scalar.
-  if (ShapeUtil::IsScalar(operand->shape())) {
-    HloInstruction* broadcast = hlo_builder_.AddInstruction(
-        HloInstruction::CreateBroadcast(broadcast_shape, operand, {}));
-    broadcast->set_metadata(operand->metadata());
-    if (operand->has_sharding()) {
-      broadcast->set_sharding(operand->sharding());
-    }
-    return broadcast;
-  }
-  // Do explicit broadcast for degenerate broadcast.
-  std::vector<int64> broadcast_dimensions;
-  std::vector<int64> reshaped_dimensions;
-  for (int i = 0; i < ShapeUtil::Rank(operand->shape()); i++) {
-    if (operand->shape().dimensions(i) == output_shape.dimensions(i)) {
-      broadcast_dimensions.push_back(i);
-      reshaped_dimensions.push_back(operand->shape().dimensions(i));
-    }
-  }
-  // Eliminate the size one dimensions.
-  HloInstruction* reshaped_operand =
-      hlo_builder_.AddInstruction(HloInstruction::CreateReshape(
-          ShapeUtil::MakeShape(operand->shape().element_type(),
-                               reshaped_dimensions),
-          operand));
-  reshaped_operand->set_metadata(operand->metadata());
-  if (operand->has_sharding()) {
-    reshaped_operand->set_sharding(operand->sharding());
-  }
-  // Broadcast 'reshape' up to the larger size.
-  HloInstruction* broadcast =
-      hlo_builder_.AddInstruction(HloInstruction::CreateBroadcast(
-          broadcast_shape, reshaped_operand, broadcast_dimensions));
-  broadcast->set_metadata(operand->metadata());
-  if (operand->has_sharding()) {
-    broadcast->set_sharding(operand->sharding());
-  }
-  return broadcast;
+  auto fadd = [this](std::unique_ptr<HloInstruction> x) {
+    return hlo_builder_.AddInstruction(std::move(x));
+  };
+  return fadd(
+      HloInstruction::CreateBroadcastSequence(output_shape, operand, fadd));
 }
 
 void ComputationLowerer::Visit(
@@ -2653,7 +2813,8 @@ void ComputationLowerer::Visit(
       const ConstantRequest& constant_request =
           request.request().constant_request();
       hlo_instruction = add_instruction(HloInstruction::CreateConstant(
-          Literal(constant_request.literal()).CloneToUnique()));
+          Literal::CreateFromProto(constant_request.literal())
+              .ConsumeValueOrDie()));
       break;
     }
 
@@ -2732,13 +2893,31 @@ void ComputationLowerer::Visit(
       break;
     }
 
+    case OpRequest::kFftRequest: {
+      const FftRequest& fft_request = request.request().fft_request();
+      HloInstruction* operand = lookup_instruction(fft_request.operand());
+      hlo_instruction = add_instruction(HloInstruction::CreateFft(
+          request.output_shape(), operand, fft_request.fft_type(),
+          AsInt64Slice(fft_request.fft_length())));
+      break;
+    }
+
+    case OpRequest::kDotRequest: {
+      const DotRequest& dot_request = request.request().dot_request();
+      HloInstruction* lhs = lookup_instruction(dot_request.lhs());
+      HloInstruction* rhs = lookup_instruction(dot_request.rhs());
+      hlo_instruction = add_instruction(HloInstruction::CreateDot(
+          request.output_shape(), lhs, rhs, dot_request.dimension_numbers()));
+      break;
+    }
+
     case OpRequest::kCrossReplicaSumRequest: {
       const CrossReplicaSumRequest& cross_replica_sum_request =
           request.request().cross_replica_sum_request();
       HloInstruction* operand =
           lookup_instruction(cross_replica_sum_request.operand());
       hlo_instruction = add_instruction(HloInstruction::CreateCrossReplicaSum(
-          request.output_shape(), operand));
+          request.output_shape(), {operand}));
       break;
     }
 
@@ -3021,6 +3200,30 @@ void ComputationLowerer::Visit(
       break;
     }
 
+    case OpRequest::kConditionalRequest: {
+      const ConditionalRequest& conditional_request =
+          request.request().conditional_request();
+      CHECK_EQ(2, request.embedded_computation_versions_size());
+      VersionedComputationHandle::Version true_computation_version =
+          request.embedded_computation_versions(0);
+      HloComputation* true_computation = ResolveComputation(
+          conditional_request.true_computation(), true_computation_version);
+      VersionedComputationHandle::Version false_computation_version =
+          request.embedded_computation_versions(1);
+      HloComputation* false_computation = ResolveComputation(
+          conditional_request.false_computation(), false_computation_version);
+      HloInstruction* predicate =
+          lookup_instruction(conditional_request.predicate());
+      HloInstruction* true_operand =
+          lookup_instruction(conditional_request.true_operand());
+      HloInstruction* false_operand =
+          lookup_instruction(conditional_request.false_operand());
+      hlo_instruction = add_instruction(HloInstruction::CreateConditional(
+          request.output_shape(), predicate, true_operand, true_computation,
+          false_operand, false_computation));
+      break;
+    }
+
     case OpRequest::kTernaryOpRequest: {
       const TernaryOpRequest& ternary_op_request =
           request.request().ternary_op_request();
@@ -3151,8 +3354,7 @@ void ComputationLowerer::Visit(
         lhs = (lhs == operand_to_broadcast) ? broadcasted_operand : lhs;
         rhs = (rhs == operand_to_broadcast) ? broadcasted_operand : rhs;
       }
-      if (debug_options_.xla_eliminate_hlo_implicit_broadcast() &&
-          binary_op_request.binop() != BINOP_DOT) {
+      if (debug_options_.xla_eliminate_hlo_implicit_broadcast()) {
         if (!ShapeUtil::SameDimensions(request.output_shape(), lhs->shape())) {
           // lhs side is being implicitly broadcast. Change to explicit.
           lhs =
diff --git a/tensorflow/compiler/xla/service/user_computation.h b/tensorflow/compiler/xla/service/user_computation.h
index 317c631dca2e1ebe6f3c8fbaf1a3e94106034f79..54bb24d6d7fe7aa8cc7c684795e40464e4eb6614 100644
--- a/tensorflow/compiler/xla/service/user_computation.h
+++ b/tensorflow/compiler/xla/service/user_computation.h
@@ -133,6 +133,10 @@ class UserComputation {
   StatusOr<ComputationDataHandle> AddConvolveInstruction(
       const ConvolveRequest& convolve_request);
 
+  // Enqueues an FFT instruction onto this user computation.
+  StatusOr<ComputationDataHandle> AddFftInstruction(
+      const FftRequest& fft_request);
+
   // Enqueues a cross replica sum instruction onto this user computation.
   StatusOr<ComputationDataHandle> AddCrossReplicaSumInstruction(
       const CrossReplicaSumRequest& cross_replica_sum_request);
@@ -142,7 +146,8 @@ class UserComputation {
       const InfeedRequest& infeed_request);
 
   // Enqueues an outfeed instruction onto this user computation.
-  Status AddOutfeedInstruction(const OutfeedRequest& outfeed_request);
+  StatusOr<ComputationDataHandle> AddOutfeedInstruction(
+      const OutfeedRequest& outfeed_request);
 
   // Enqueues a call instruction onto this user computation.
   StatusOr<ComputationDataHandle> AddCallInstruction(
@@ -153,6 +158,10 @@ class UserComputation {
   StatusOr<ComputationDataHandle> AddCustomCallInstruction(
       const CustomCallRequest& custom_call_request);
 
+  // Enqueues a dot instruction onto this user computation.
+  StatusOr<ComputationDataHandle> AddDotInstruction(
+      const DotRequest& dot_request);
+
   // Enqueues a broadcast instruction onto this user computation.
   StatusOr<ComputationDataHandle> AddBroadcastInstruction(
       const BroadcastRequest& broadcast_request);
@@ -216,6 +225,12 @@ class UserComputation {
       const UserComputation& condition_computation,
       const UserComputation& body_computation);
 
+  // Enqueues a conditional instruction on this user computation.
+  StatusOr<ComputationDataHandle> AddConditionalInstruction(
+      const ConditionalRequest& conditional_request,
+      const UserComputation& true_computation,
+      const UserComputation& false_computation);
+
   // Enqueues a Send instruction onto this user computation.
   Status AddSendInstruction(const SendRequest& send_request);
 
@@ -307,6 +322,23 @@ class UserComputation {
   SessionComputation CloneSessionComputation(
       VersionedComputationHandle::Version version) const;
 
+  // Warning: typically we don't want to look up computation data handles until
+  // the computation is finished being built, for consistency purposes. We
+  // expose this routine for error reporting purposes so that we can provide
+  // more meaningful error messages from the XLA service layer.
+  //
+  // Returns the operation request that the handle comes from.
+  StatusOr<const OperationRequest*> LookUpRequestForErrorReporting(
+      const ComputationDataHandle& handle) const;
+
+  // Retrieves the parameter metadata for the given parameter number.
+  //
+  // If the parameter number is invalid for this computation, nullopt is
+  // returned. When the return value has_value(), nullptr will never be
+  // the held value.
+  tensorflow::gtl::optional<const OpMetadata*> ParameterMetadata(
+      int parameter_number) const;
+
  private:
   // Warning: dangerous mutating operation that doesn't respect versioning.
   // This is only used at initialization time when constructing from a
diff --git a/tensorflow/compiler/xla/service/user_computation_test.cc b/tensorflow/compiler/xla/service/user_computation_test.cc
index 5afaf226ae0cce7e9afc966c6b4adf838aeebc91..2fa163953f638c0038e9f6bb11ce2a3742e0558c 100644
--- a/tensorflow/compiler/xla/service/user_computation_test.cc
+++ b/tensorflow/compiler/xla/service/user_computation_test.cc
@@ -65,8 +65,10 @@ TEST_F(UserComputationTest, SimpleComputation) {
 
   OutfeedRequest outfeed_request;
   *outfeed_request.mutable_operand() = constant_handle;
+  *outfeed_request.mutable_shape() = kVectorShape;
   outfeed_request.set_outfeed_config("abc");
-  TF_ASSERT_OK(computation.AddOutfeedInstruction(outfeed_request));
+  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle outfeed_handle,
+                          computation.AddOutfeedInstruction(outfeed_request));
 
   auto hlo_resolver = [](const VersionedComputationHandle& handle) {
     return nullptr;
@@ -334,50 +336,5 @@ TEST_F(UserComputationTest, EliminateDegenerateBroadcastAfterIndimBroadcast) {
               operands[1]->opcode() == HloOpcode::kBroadcast);
 }
 
-TEST_F(UserComputationTest, SkipDotInEliminatingImplicitBroadcast) {
-  auto debug_options = DebugOptions();
-  debug_options.set_xla_eliminate_hlo_implicit_broadcast(true);
-
-  //  %a = Param({1, 3});
-  //  %b = Param({3, 1});
-  //  %dot = Dot(%a, %b);
-  ComputationHandle handle;
-  handle.set_handle(123);
-  UserComputation computation("TheComputation", handle);
-
-  ParameterRequest a_request;
-  *a_request.mutable_shape() = ShapeUtil::MakeShape(F32, {1, 3});
-  a_request.set_name("a");
-  a_request.set_parameter(0);
-  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle a_handle,
-                          computation.AddParameterInstruction(a_request));
-
-  ParameterRequest b_request;
-  *b_request.mutable_shape() = ShapeUtil::MakeShape(F32, {3, 1});
-  b_request.set_name("b");
-  b_request.set_parameter(1);
-  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle b_handle,
-                          computation.AddParameterInstruction(b_request));
-
-  BinaryOpRequest dot;
-  dot.set_binop(BINOP_DOT);
-  *dot.mutable_lhs() = a_handle;
-  *dot.mutable_rhs() = b_handle;
-  TF_ASSERT_OK(computation.AddBinaryInstruction(dot).status());
-
-  auto hlo_resolver = [](const VersionedComputationHandle& handle) {
-    return nullptr;
-  };
-  VersionedComputationHandle latest_version = computation.GetVersionedHandle();
-
-  // Build the HLO computation.
-  TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<HloComputation> hlo_computation,
-      computation.BuildHloComputation(latest_version.version, hlo_resolver,
-                                      debug_options));
-
-  EXPECT_EQ(3, hlo_computation->instruction_count());
-}
-
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a5f9b01f011ce04f1114c74391a967c62f015221
--- /dev/null
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
@@ -0,0 +1,296 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h"
+#include "tensorflow/compiler/xla/service/tuple_util.h"
+#include "tensorflow/compiler/xla/service/while_util.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+
+namespace xla {
+
+using tensorflow::gtl::FlatMap;
+using tensorflow::gtl::FlatSet;
+using tensorflow::gtl::InlinedVector;
+
+// Copies `to_hoist` to the computation containing `while_instr`, hoisting its
+// operands as needed.  All of its transitive operands are expected to be either
+// in `hoisted_instructions` or `unhoisted_invariant_instructions`.  This
+// function hoists the operands in `unhoisted_invariant_instructions` and moves
+// them into `hoisted_instructions`.
+static void CreateLoopInvariantCopy(
+    FlatMap<HloInstruction*, HloInstruction*>* hoisted_instructions,
+    FlatSet<HloInstruction*>* unhoisted_invariant_instructions,
+    HloInstruction* while_instr, HloInstruction* to_hoist) {
+  HloComputation* parent_of_while = while_instr->parent();
+  HloComputation* while_body = while_instr->while_body();
+
+  struct DFSFrame {
+    HloInstruction* instruction;
+    int64 operand_index;
+  };
+
+  InlinedVector<DFSFrame, 8> dfs_stack;
+  dfs_stack.push_back({to_hoist, 0});
+
+  HloInstruction* while_body_param = while_body->parameter_instruction(0);
+  HloInstruction* while_operand = while_instr->mutable_operand(0);
+
+  do {
+    DFSFrame* frame = &dfs_stack.back();
+    if (frame->operand_index == frame->instruction->operand_count()) {
+      HloInstruction* old_instruction = frame->instruction;
+
+      // All of the operands for old_instruction have been cloned, so it is
+      // time to clone old_instruction itself.
+
+      auto get_new_operand = [&](HloInstruction* old_operand) {
+        return old_operand == while_body_param
+                   ? while_operand
+                   : FindOrDie(*hoisted_instructions, old_operand);
+      };
+
+      InlinedVector<HloInstruction*, 4> new_operands;
+      c_transform(old_instruction->operands(), std::back_inserter(new_operands),
+                  get_new_operand);
+
+      HloInstruction* new_instruction =
+          parent_of_while->AddInstruction(old_instruction->CloneWithNewOperands(
+              old_instruction->shape(), new_operands));
+
+      InsertOrDie(hoisted_instructions, old_instruction, new_instruction);
+
+      // Approximately half of the instructions that would normally be present
+      // in unhoisted_invariant_instructions are constants.  We save a bit of
+      // compile time by not putting these in the hashtable.
+      CHECK_EQ(unhoisted_invariant_instructions->erase(old_instruction),
+               to_hoist != old_instruction &&
+                   old_instruction->opcode() != HloOpcode::kConstant);
+      dfs_stack.pop_back();
+      continue;
+    }
+
+    HloInstruction* next_operand =
+        frame->instruction->mutable_operand(frame->operand_index++);
+    if (hoisted_instructions->count(next_operand) ||
+        next_operand == while_body_param) {
+      continue;
+    }
+
+    dfs_stack.push_back({next_operand, 0});
+  } while (!dfs_stack.empty());
+}
+
+// Returns true if `instruction` is worth hoisting only if it lets us hoist some
+// instruction using it.  The rationale is that hoisting these instructions will
+// prevent simplification and fusion in the while body.
+static bool NotWorthHoistingIndividually(const HloInstruction& instruction) {
+  switch (instruction.opcode()) {
+    default:
+      return false;
+
+    case HloOpcode::kBitcast:
+    case HloOpcode::kBroadcast:
+    case HloOpcode::kConstant:
+    case HloOpcode::kReverse:
+    case HloOpcode::kSlice:
+    case HloOpcode::kTuple:
+      return true;
+
+    case HloOpcode::kTranspose:
+      return ShapeUtil::TransposeIsBitcast(
+          /*input_shape=*/instruction.operand(0)->shape(),
+          /*output_shape=*/instruction.shape(), instruction.dimensions());
+
+    case HloOpcode::kReshape:
+      return ShapeUtil::ReshapeIsBitcast(
+          /*input_shape=*/instruction.operand(0)->shape(),
+          /*output_shape=*/instruction.shape());
+  }
+}
+
+// Populates `gte_set` with the GetTupleElement instructions in `while_body`
+// that access elements in the parameter tuple that don't change across
+// iterations.  Assumes `while_body` is the body computation of the while loop
+// in question.
+static void GatherInvariantGTEs(HloComputation* while_body,
+                                FlatSet<HloInstruction*>* gte_set) {
+  const HloInstruction::InstructionVector root_operands =
+      while_body->root_instruction()->operands();
+  for (int i = 0; i < root_operands.size(); i++) {
+    HloInstruction* instr = root_operands[i];
+    if (instr->opcode() == HloOpcode::kGetTupleElement &&
+        instr->tuple_index() == i &&
+        instr->operand(0) == while_body->parameter_instruction(0) &&
+        ShapeUtil::IsArray(instr->shape())) {
+      InsertOrDie(gte_set, instr);
+    }
+  }
+}
+
+static StatusOr<bool> TryHoistingInvariantInstructionsFromWhileBody(
+    HloInstruction* while_instr) {
+  auto print_no_metadata = HloPrintOptions{}.set_print_metadata(false);
+
+  if (!ShapeUtil::IsTuple(while_instr->shape())) {
+    // This restriction leaves one interesting pattern on the table:
+    //
+    //  while_body(f32[1024, 1024] %param) {
+    //    %value = expensive_op(%param)
+    //    outfeed(%value)
+    //    ROOT = %param
+    //  }
+    //
+    // If we see that pattern in the while, instead of generalizing this
+    // algorithm to work with non-tuples, we should instead add a pass that
+    // canonicalizes while loops like the above to use a tuple state.
+    return false;
+  }
+
+  string while_instr_name = while_instr->ToString(print_no_metadata);
+  VLOG(2) << "Trying to hoist from " << while_instr_name;
+
+  HloComputation* while_body = while_instr->while_body();
+
+  // Maps instructions in the while body to instructions hoisted outside the
+  // while that compute the same value.
+  FlatMap<HloInstruction*, HloInstruction*> hoisted_instructions;
+
+  // Contains instructions that can be legally hoisted, but were deemed to be
+  // unprofitable to be hoisted alone by NotWorthHoistingIndividually.  When we
+  // hoist an instruction in this set, we move it from
+  // unhoisted_invariant_instructions to hoisted_instructions.
+  FlatSet<HloInstruction*> unhoisted_invariant_instructions;
+
+  // Invariant GTE's axiomatically satisfy the constraints for
+  // unhoisted_invariant_instructions -- they can be legally hoisted, but there
+  // is no benefit to hoisting them unless something that uses it is also
+  // hoisted.
+  GatherInvariantGTEs(while_body, &unhoisted_invariant_instructions);
+
+  if (unhoisted_invariant_instructions.empty()) {
+    // There are no obviously loop invariant elements in the state being
+    // threaded through the while loop so give up.  In theory this precondition
+    // is too strong -- we could have code that e.g. permutes the elements in
+    // the while state but uses a select to pick the same value on every
+    // iteration.
+    return false;
+  }
+
+  // instructions_to_replace[i] is hoisted into a loop invariant instruction
+  // replacement_instructions[i].
+  std::vector<HloInstruction*> instructions_to_replace;
+  std::vector<HloInstruction*> replacement_instructions;
+
+  for (auto* instruction : while_body->MakeInstructionPostOrder()) {
+    if (instruction->HasSideEffect() ||
+        instruction->opcode() == HloOpcode::kParameter ||
+        !instruction->control_predecessors().empty() ||
+        !instruction->control_successors().empty()) {
+      continue;
+    }
+
+    auto is_invariant = [&](HloInstruction* op) {
+      return hoisted_instructions.find(op) != hoisted_instructions.end() ||
+             unhoisted_invariant_instructions.count(op) ||
+             op->opcode() == HloOpcode::kConstant;
+    };
+
+    if (!c_all_of(instruction->operands(), is_invariant)) {
+      continue;
+    }
+
+    if (NotWorthHoistingIndividually(*instruction)) {
+      VLOG(2) << "Adding " << instruction->ToString(print_no_metadata)
+              << " to unhoisted invariant set.";
+      // Approximately half of the instructions that reach this point are
+      // constants.  We save a bit of compile time by not putting these in the
+      // hashtable.
+      if (instruction->opcode() != HloOpcode::kConstant) {
+        InsertOrDie(&unhoisted_invariant_instructions, instruction);
+      }
+      continue;
+    }
+
+    VLOG(2) << "Hoisting " << instruction->ToString(print_no_metadata);
+
+    CreateLoopInvariantCopy(&hoisted_instructions,
+                            &unhoisted_invariant_instructions, while_instr,
+                            instruction);
+
+    instructions_to_replace.push_back(instruction);
+    replacement_instructions.push_back(
+        FindOrDie(hoisted_instructions, instruction));
+  }
+
+  if (instructions_to_replace.empty()) {
+    return false;
+  }
+
+  TF_ASSIGN_OR_RETURN(
+      WhileUtil::MakeInstructionsLiveInResult live_in_instructions_result,
+      WhileUtil::MakeInstructionsLiveIn(while_instr, replacement_instructions));
+
+  HloComputation* new_while_body =
+      live_in_instructions_result.new_while_instr->while_body();
+
+  for (int i = 0; i < instructions_to_replace.size(); i++) {
+    HloInstruction* instruction_to_replace_in_new_while =
+        FindOrDie(live_in_instructions_result.while_body_instruction_map,
+                  instructions_to_replace[i]);
+    TF_RETURN_IF_ERROR(new_while_body->ReplaceInstruction(
+        instruction_to_replace_in_new_while,
+        live_in_instructions_result.while_body_live_in_values[i]));
+  }
+
+  VLOG(1) << "Hoisted " << instructions_to_replace.size()
+          << " instructions from " << while_instr_name;
+
+  return true;
+}
+
+StatusOr<bool> WhileLoopInvariantCodeMotion::Run(HloModule* module) {
+  bool changed = false;
+  std::vector<HloInstruction*> while_instrs;
+  for (auto* comp : module->computations()) {
+    c_copy_if(comp->instructions(), std::back_inserter(while_instrs),
+              [](const HloInstruction* instr) {
+                return instr->opcode() == HloOpcode::kWhile;
+              });
+  }
+
+  for (HloInstruction* while_instr : while_instrs) {
+    // Right now we only hoist computations from the while body, but
+    // TryHoistingInvariantInstructionsFromWhileBody can be generalized to
+    // optimize the condition computation too, if needed.
+    //
+    // The transform we do here is a pessmization for while loops that execute
+    // zero times*, but at this time we expect those to be rare.  If this
+    // becomes a problem we can consider using the conditional HLO to avoid
+    // doing extra work for while loops with zero trip count.
+    //
+    // * We delete while loops that have a zero trip count, so this would have
+    //   to be a while loop with a somewhat opaque condition expression.
+
+    TF_ASSIGN_OR_RETURN(
+        bool result,
+        TryHoistingInvariantInstructionsFromWhileBody(while_instr));
+    changed |= result;
+  }
+  return changed;
+}
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h
new file mode 100644
index 0000000000000000000000000000000000000000..8c4b765b0003c48cfacb9d28e7c8259ac0927d66
--- /dev/null
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_INVARIANT_CODE_MOTION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_INVARIANT_CODE_MOTION_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// HLO pass that rewrites while loops to hoist loop invariant instructions in
+// the while body into the computation that contains the while instruction.
+
+class WhileLoopInvariantCodeMotion : public HloPassInterface {
+ public:
+  ~WhileLoopInvariantCodeMotion() override = default;
+
+  tensorflow::StringPiece name() const override {
+    return "while-loop-invariant-code-motion";
+  }
+  StatusOr<bool> Run(HloModule* module) override;
+};
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_INVARIANT_CODE_MOTION_H_
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..799340fda905fb7d40b19b4cb79bb0fcb5629fd3
--- /dev/null
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
@@ -0,0 +1,442 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h"
+
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+
+class WhileLoopInvariantCodeMotionTest : public HloVerifiedTestBase {
+ public:
+  // Makes a computation which has one parameter, of the given shape, and always
+  // returns PRED[]{true}.  This is useful as a dummy loop condition.
+  HloComputation* MakeAlwaysTrueComputation(const Shape& param_shape,
+                                            HloModule* module);
+};
+
+static void FindOnlyWhileInstruction(HloComputation* computation,
+                                     HloInstruction** while_instruction) {
+  *while_instruction = nullptr;
+  for (auto* instr : computation->instructions()) {
+    if (instr->opcode() == HloOpcode::kWhile) {
+      ASSERT_EQ(*while_instruction, nullptr);
+      *while_instruction = instr;
+    }
+  }
+
+  ASSERT_NE(*while_instruction, nullptr);
+}
+
+HloComputation* WhileLoopInvariantCodeMotionTest::MakeAlwaysTrueComputation(
+    const Shape& param_shape, HloModule* module) {
+  HloComputation::Builder builder(TestName() + ".always_true");
+  builder.AddInstruction(
+      HloInstruction::CreateParameter(0, param_shape, "param"));
+  builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
+  return module->AddEmbeddedComputation(builder.Build());
+}
+
+TEST_F(WhileLoopInvariantCodeMotionTest, HoistOneInvariantOperation) {
+  auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
+  Shape while_shape =
+      ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32, scalar_s32});
+
+  HloComputation* while_body = [&]() {
+    HloComputation::Builder builder(TestName() + ".while_body");
+    HloInstruction* param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, while_shape, "param"));
+    HloInstruction* gte_0 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_s32, param, 0));
+    HloInstruction* gte_1 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_s32, param, 1));
+    HloInstruction* add_result =
+        builder.AddInstruction(HloInstruction::CreateBinary(
+            scalar_s32, HloOpcode::kAdd, gte_0, gte_1));
+    builder.AddInstruction(
+        HloInstruction::CreateTuple({gte_0, gte_1, add_result}));
+
+    return module().AddEmbeddedComputation(builder.Build());
+  }();
+
+  HloComputation::Builder builder(TestName());
+  auto* init_value = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, while_shape, "init_value"));
+  builder.AddInstruction(HloInstruction::CreateWhile(
+      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
+      while_body, init_value));
+  HloComputation* entry_computation =
+      module().AddEntryComputation(builder.Build());
+  TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
+                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+  EXPECT_TRUE(simplified_loop);
+
+  HloInstruction* transformed_while;
+  FindOnlyWhileInstruction(entry_computation, &transformed_while);
+
+  EXPECT_THAT(entry_computation->instructions(), Contains(op::Add()));
+  EXPECT_THAT(transformed_while->while_body()->instructions(),
+              Each(Not(op::Add())));
+}
+
+TEST_F(WhileLoopInvariantCodeMotionTest, HoistInvariantOperationTree) {
+  auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
+  Shape while_shape =
+      ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32, scalar_s32});
+
+  HloComputation* while_body = [&]() {
+    HloComputation::Builder builder(TestName() + ".while_body");
+    HloInstruction* param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, while_shape, "param"));
+    HloInstruction* gte_0 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_s32, param, 0));
+    HloInstruction* gte_1 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_s32, param, 1));
+    HloInstruction* gte_2_loop_variant = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_s32, param, 2));
+
+    HloInstruction* add_result =
+        builder.AddInstruction(HloInstruction::CreateBinary(
+            scalar_s32, HloOpcode::kAdd, gte_0, gte_1));
+    HloInstruction* mul_result =
+        builder.AddInstruction(HloInstruction::CreateBinary(
+            scalar_s32, HloOpcode::kMultiply, add_result, gte_1));
+    HloInstruction* negate_result =
+        builder.AddInstruction(HloInstruction::CreateUnary(
+            scalar_s32, HloOpcode::kNegate, mul_result));
+    HloInstruction* constant = builder.AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR0<int32>(4)));
+    HloInstruction* sub_result =
+        builder.AddInstruction(HloInstruction::CreateBinary(
+            scalar_s32, HloOpcode::kSubtract, negate_result, constant));
+    HloInstruction* divide_result =
+        builder.AddInstruction(HloInstruction::CreateBinary(
+            scalar_s32, HloOpcode::kDivide, sub_result, gte_2_loop_variant));
+    builder.AddInstruction(
+        HloInstruction::CreateTuple({gte_0, gte_1, divide_result}));
+
+    return module().AddEmbeddedComputation(builder.Build());
+  }();
+
+  HloComputation::Builder builder(TestName());
+  auto* init_value = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, while_shape, "init_value"));
+  builder.AddInstruction(HloInstruction::CreateWhile(
+      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
+      while_body, init_value));
+  HloComputation* entry_computation =
+      module().AddEntryComputation(builder.Build());
+  TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
+                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+  EXPECT_TRUE(simplified_loop);
+
+  HloInstruction* transformed_while;
+  FindOnlyWhileInstruction(entry_computation, &transformed_while);
+
+  EXPECT_THAT(entry_computation->instructions(),
+              AllOf(Contains(op::Add()), Contains(op::Multiply()),
+                    Contains(op::Negate()), Contains(op::Subtract()),
+                    Contains(op::Constant()),
+
+                    // The division had a loop varying operand so that better
+                    // not be hoisted.
+                    Not(Contains(op::Divide()))));
+
+  EXPECT_THAT(transformed_while->while_body()->instructions(),
+              Each(Not(AnyOf(op::Add(), op::Multiply(), op::Negate(),
+                             op::Subtract(), op::Constant()))));
+
+  EXPECT_THAT(transformed_while->while_body()->instructions(),
+              Contains(op::Divide()));
+}
+
+TEST_F(WhileLoopInvariantCodeMotionTest,
+       DontHoistTriviallyLoopVaryingComputation) {
+  // Basic negative test: the add expression is not loop invariant.
+  auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
+  Shape while_shape = ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32});
+
+  HloComputation* while_body = [&]() {
+    HloComputation::Builder builder(TestName() + ".while_body");
+    HloInstruction* param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, while_shape, "param"));
+    HloInstruction* gte_0 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_s32, param, 0));
+    HloInstruction* gte_1 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_s32, param, 1));
+    HloInstruction* add_result =
+        builder.AddInstruction(HloInstruction::CreateBinary(
+            scalar_s32, HloOpcode::kAdd, gte_0, gte_1));
+    builder.AddInstruction(HloInstruction::CreateTuple({gte_0, add_result}));
+
+    return module().AddEmbeddedComputation(builder.Build());
+  }();
+
+  HloComputation::Builder builder(TestName());
+  auto* init_value = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, while_shape, "init_value"));
+  auto* while_inst = builder.AddInstruction(HloInstruction::CreateWhile(
+      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
+      while_body, init_value));
+
+  module().AddEntryComputation(builder.Build());
+
+  TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
+                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+  EXPECT_FALSE(simplified_loop);
+
+  EXPECT_THAT(while_inst->while_body()->instructions(), Contains(op::Add()));
+}
+
+TEST_F(WhileLoopInvariantCodeMotionTest,
+       DontHoistLoopVaryingComputationWithAlternatingTuples) {
+  auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
+  Shape while_shape =
+      ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32, scalar_s32});
+
+  HloComputation* while_body = [&]() {
+    HloComputation::Builder builder(TestName() + ".while_body");
+    HloInstruction* param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, while_shape, "param"));
+    HloInstruction* gte_0 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_s32, param, 0));
+    HloInstruction* gte_1 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_s32, param, 1));
+    HloInstruction* add_result =
+        builder.AddInstruction(HloInstruction::CreateBinary(
+            scalar_s32, HloOpcode::kAdd, gte_0, gte_1));
+    builder.AddInstruction(
+        HloInstruction::CreateTuple({gte_1, gte_0, add_result}));
+
+    return module().AddEmbeddedComputation(builder.Build());
+  }();
+
+  HloComputation::Builder builder(TestName());
+  auto* init_value = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, while_shape, "init_value"));
+  auto* while_inst = builder.AddInstruction(HloInstruction::CreateWhile(
+      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
+      while_body, init_value));
+
+  module().AddEntryComputation(builder.Build());
+  TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
+                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+  EXPECT_FALSE(simplified_loop);
+
+  EXPECT_THAT(while_inst->while_body()->instructions(), Contains(op::Add()));
+}
+
+TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistInstructionWithSideEffects) {
+  auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
+  Shape while_shape = ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32});
+
+  HloComputation* while_body = [&]() {
+    HloComputation::Builder builder(TestName() + ".while_body");
+    HloInstruction* param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, while_shape, "param"));
+    HloInstruction* gte_0 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_s32, param, 0));
+    HloInstruction* gte_1 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_s32, param, 1));
+    builder.AddInstruction(
+        HloInstruction::CreateOutfeed(scalar_s32, gte_0, ""));
+    builder.AddInstruction(HloInstruction::CreateTuple({gte_0, gte_1}));
+
+    return module().AddEmbeddedComputation(builder.Build());
+  }();
+
+  HloComputation::Builder builder(TestName());
+  auto* init_value = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, while_shape, "init_value"));
+  auto* while_inst = builder.AddInstruction(HloInstruction::CreateWhile(
+      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
+      while_body, init_value));
+
+  module().AddEntryComputation(builder.Build());
+
+  TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
+                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+  EXPECT_FALSE(simplified_loop);
+
+  EXPECT_THAT(while_inst->while_body()->instructions(),
+              Contains(op::Outfeed()));
+}
+
+TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistBitcastAlone) {
+  // The bitcast's user, an outfeed, can't be hoisted, so don't hoist the
+  // bitcast either.
+  auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
+  auto scalar_f32 = ShapeUtil::MakeShape(F32, {});
+  Shape while_shape = ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32});
+
+  HloComputation* while_body = [&]() {
+    HloComputation::Builder builder(TestName() + ".while_body");
+    HloInstruction* param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, while_shape, "param"));
+    HloInstruction* gte_0 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_s32, param, 0));
+    HloInstruction* gte_1 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_s32, param, 1));
+    HloInstruction* bitcast_inst = builder.AddInstruction(
+        HloInstruction::CreateUnary(scalar_f32, HloOpcode::kBitcast, gte_0));
+    builder.AddInstruction(
+        HloInstruction::CreateOutfeed(scalar_f32, bitcast_inst, ""));
+    builder.AddInstruction(HloInstruction::CreateTuple({gte_0, gte_1}));
+
+    return module().AddEmbeddedComputation(builder.Build());
+  }();
+
+  HloComputation::Builder builder(TestName());
+  auto* init_value = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, while_shape, "init_value"));
+  auto* while_inst = builder.AddInstruction(HloInstruction::CreateWhile(
+      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
+      while_body, init_value));
+
+  module().AddEntryComputation(builder.Build());
+
+  TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
+                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+  EXPECT_FALSE(simplified_loop);
+
+  EXPECT_THAT(while_inst->while_body()->instructions(),
+              Contains(op::Outfeed()));
+  EXPECT_THAT(while_inst->while_body()->instructions(),
+              Contains(op::Bitcast()));
+}
+
+TEST_F(WhileLoopInvariantCodeMotionTest, HoistBitcastIfNeeded) {
+  // The bitcast's user can be hoisted, so hoist the bitcast too.
+  auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
+  auto scalar_f32 = ShapeUtil::MakeShape(F32, {});
+  Shape while_shape =
+      ShapeUtil::MakeTupleShape({scalar_s32, scalar_f32, scalar_f32});
+
+  HloComputation* while_body = [&]() {
+    HloComputation::Builder builder(TestName() + ".while_body");
+    HloInstruction* param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, while_shape, "param"));
+    HloInstruction* gte_0 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_s32, param, 0));
+    HloInstruction* gte_1 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_f32, param, 1));
+    HloInstruction* bitcast_inst = builder.AddInstruction(
+        HloInstruction::CreateUnary(scalar_f32, HloOpcode::kBitcast, gte_0));
+    HloInstruction* add_inst =
+        builder.AddInstruction(HloInstruction::CreateBinary(
+            scalar_f32, HloOpcode::kAdd, bitcast_inst, gte_1));
+    builder.AddInstruction(
+        HloInstruction::CreateTuple({gte_0, gte_1, add_inst}));
+
+    return module().AddEmbeddedComputation(builder.Build());
+  }();
+
+  HloComputation::Builder builder(TestName());
+  auto* init_value = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, while_shape, "init_value"));
+  builder.AddInstruction(HloInstruction::CreateWhile(
+      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
+      while_body, init_value));
+
+  HloComputation* entry_computation =
+      module().AddEntryComputation(builder.Build());
+
+  TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
+                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+  EXPECT_TRUE(simplified_loop);
+
+  HloInstruction* transformed_while;
+  FindOnlyWhileInstruction(entry_computation, &transformed_while);
+
+  EXPECT_THAT(transformed_while->while_body()->instructions(),
+              Each(Not(op::Add())));
+  EXPECT_THAT(transformed_while->while_body()->instructions(),
+              Each(Not(op::Bitcast())));
+  EXPECT_THAT(entry_computation->instructions(), Contains(op::Add()));
+  EXPECT_THAT(entry_computation->instructions(), Contains(op::Bitcast()));
+}
+
+TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistControlDependencies) {
+  auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
+  Shape while_shape =
+      ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32, scalar_s32});
+
+  HloComputation* while_body;
+  {
+    HloComputation::Builder builder(TestName() + ".while_body");
+    HloInstruction* param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, while_shape, "param"));
+    HloInstruction* gte_0 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_s32, param, 0));
+    HloInstruction* gte_1 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_s32, param, 1));
+    HloInstruction* add_result =
+        builder.AddInstruction(HloInstruction::CreateBinary(
+            scalar_s32, HloOpcode::kAdd, gte_0, gte_1));
+    TF_ASSERT_OK(param->AddControlDependencyTo(add_result));
+    builder.AddInstruction(
+        HloInstruction::CreateTuple({gte_0, gte_1, add_result}));
+
+    while_body = module().AddEmbeddedComputation(builder.Build());
+  }
+
+  HloComputation::Builder builder(TestName());
+  auto* init_value = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, while_shape, "init_value"));
+  builder.AddInstruction(HloInstruction::CreateWhile(
+      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
+      while_body, init_value));
+  module().AddEntryComputation(builder.Build());
+  TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
+                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+  EXPECT_FALSE(simplified_loop);
+}
+
+TEST_F(WhileLoopInvariantCodeMotionTest, BodyHasNonTupleRoot) {
+  auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
+  Shape while_shape = ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32});
+
+  HloComputation* while_body = [&]() {
+    HloComputation::Builder builder(TestName() + ".passthrough");
+    HloInstruction* param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, while_shape, "param"));
+    HloComputation* result = module().AddEmbeddedComputation(builder.Build());
+
+    result->AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_s32, param, 1));
+    return result;
+  }();
+
+  HloComputation::Builder builder(TestName());
+  auto* init_value = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, while_shape, "init_value"));
+  builder.AddInstruction(HloInstruction::CreateWhile(
+      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
+      while_body, init_value));
+  module().AddEntryComputation(builder.Build());
+  TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
+                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+  EXPECT_FALSE(simplified_loop);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
index b38ee907d70e29093c5cef718e1432663015728b..981de9b2200a9ae8938db21299580f510834d2f0 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
@@ -236,7 +236,7 @@ static optional<int64> GetLoopTripCount(HloInstruction* while_op) {
       VLOG(2) << "Couldn't evaluate while cond: " << result.status();
       return nullopt;
     }
-    return result.ValueOrDie()->GetArraySlice<bool>() ==
+    return result.ValueOrDie()->data<bool>() ==
            tensorflow::gtl::ArraySlice<bool>{true};
   };
 
@@ -289,7 +289,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   // Don't try this transformation if the while loop isn't removable, since if
   // it succeeds ultimately we're going to have to replace the old while loop
   // with a new one.
-  if (!while_op->parent()->IsRemovable(while_op)) {
+  if (!while_op->parent()->IsRemovable(while_op) || while_op->HasSideEffect()) {
     VLOG(2) << "Can't remove dead parameters from non-removable while op.";
     return false;
   }
@@ -306,6 +306,13 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
     return false;
   }
 
+  if (while_body_root->opcode() != HloOpcode::kTuple) {
+    VLOG(2) << "While body's root is not a tuple(...) instruction.";
+    return false;
+  }
+
+  auto print_no_metadata = HloPrintOptions().set_print_metadata(false);
+
   // Bail if param0 of while_cond or while_body has users which aren't of type
   // get-tuple-element.
   for (const HloInstruction* instr : {while_body->parameter_instruction(0),
@@ -313,9 +320,10 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
     for (const HloInstruction* user : instr->users()) {
       if (user->opcode() != HloOpcode::kGetTupleElement) {
         VLOG(2) << "Cowardly refusing to analyze while loop with "
-                << instr->ToStringNoMetadata()
-                << " used by non-GTE instruction " << user->ToStringNoMetadata()
-                << " in computation " << instr->parent()->name();
+                << instr->ToString(print_no_metadata)
+                << " used by non-GTE instruction "
+                << user->ToString(print_no_metadata) << " in computation "
+                << instr->parent()->name();
         return false;
       }
     }
@@ -351,7 +359,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
 
       used_tuple_indices.insert(user->tuple_index());
       if (used_tuple_indices.size() == tuple_size) {
-        VLOG(2) << "Loop " << while_op->ToStringNoMetadata()
+        VLOG(2) << "Loop " << while_op->ToString(print_no_metadata)
                 << " uses all of its inputs; no simplification possible.";
         return false;
       }
@@ -375,7 +383,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
       used_tuple_indices.insert(i);
 
       if (used_tuple_indices.size() == tuple_size) {
-        VLOG(2) << "Loop " << while_op->ToStringNoMetadata()
+        VLOG(2) << "Loop " << while_op->ToString(print_no_metadata)
                 << " uses all of its inputs; no simplification possible.";
         return false;
       }
@@ -387,7 +395,8 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   CHECK_LT(used_tuple_indices.size(), tuple_size);
 
   VLOG(1) << "Eliminating " << tuple_size - used_tuple_indices.size()
-          << " elements from tuple of " << while_op->ToStringNoMetadata();
+          << " elements from tuple of "
+          << while_op->ToString(print_no_metadata);
 
   // Build up maps from the old/new to the new/old tuple indices.
   std::vector<int64> new_to_old_tuple_idx(used_tuple_indices.begin(),
@@ -431,7 +440,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
         continue;
       }
       CHECK_EQ(user->opcode(), HloOpcode::kGetTupleElement)
-          << user->ToStringNoMetadata();
+          << user->ToString(print_no_metadata);
 
       int64 old_idx = user->tuple_index();
       auto new_idx_iter = old_to_new_tuple_idx.find(old_idx);
@@ -446,14 +455,14 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
         CHECK(user->user_count() == 0 ||
               user->user_count() == 1 &&
                   user->users().front() == while_body_root)
-            << "Instruction " << user->ToStringNoMetadata()
+            << "Instruction " << user->ToString(print_no_metadata)
             << " should be unused (except by root of while body), but has "
                "users: {"
             << tensorflow::str_util::Join(
                    user->users(), ", ",
-                   [](string* out, const HloInstruction* instr) {
+                   [&](string* out, const HloInstruction* instr) {
                      tensorflow::strings::StrAppend(
-                         out, instr->ToStringNoMetadata());
+                         out, instr->ToString(print_no_metadata));
                    })
             << "}";
 
@@ -555,10 +564,12 @@ static StatusOr<bool> TryRemoveWhileLoop(HloInstruction* while_op) {
   //
   // This is not a fundamental limitation.  The control operands can be moved
   // onto the new HLOs after simplification, and any side-effecting ops inside
-  // the loop aren't removed, just cloned and added back to the loop.
-  // Nevertheless our infrastructure sees loop simplification as removal of
-  // these nodes and currently doesn't allow it.
-  if (!while_op->parent()->IsRemovable(while_op)) {
+  // the loop aren't removed, just cloned and added back to the loop.  But
+  // moving an op out of the loop also removes implicit control dependencies
+  // between the op and the ops outside the loop, so we'd have to add those back
+  // for things like infeed/outfeed.  It gets complicated.  So for now we just
+  // avoid it.
+  if (!while_op->parent()->IsRemovable(while_op) || while_op->HasSideEffect()) {
     VLOG(2) << "Not attempting to remove while loop it is not removable: "
             << while_op->ToShortString();
     return false;
@@ -586,7 +597,9 @@ static StatusOr<bool> TryRemoveWhileLoop(HloInstruction* while_op) {
     auto call_op = computation->AddInstruction(HloInstruction::CreateCall(
         while_op->shape(), while_op->operands(), while_op->while_body()));
     TF_RETURN_IF_ERROR(computation->ReplaceInstruction(while_op, call_op));
-    TF_RETURN_IF_ERROR(CallInliner::Inline(call_op));
+    TF_ASSIGN_OR_RETURN(auto inlined_instructions_map,
+                        CallInliner::Inline(call_op));
+    (void)inlined_instructions_map;
     return true;
   }
   return false;
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.h b/tensorflow/compiler/xla/service/while_loop_simplifier.h
index 50dac32a4ab0a5de756c1ddf5e62c3560e54a079..d3d55634c97bbdf3f81321d8089bb808c411340b 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.h
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_SIMPLIFIER_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_SIMPLIFIER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_SIMPLIFIER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_SIMPLIFIER_H_
 
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
@@ -41,4 +41,4 @@ class WhileLoopSimplifier : public HloPassInterface {
 
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_SIMPLIFIER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_SIMPLIFIER_H_
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
index d99b31dc0037968bc88d5f22d53309a6a4546963..c5183f8d3aee99696ed4114c3f7e451888222137 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
@@ -418,5 +418,32 @@ TEST_F(WhileLoopSimplifierTest, RemoveUnusedOperand) {
                      op::GetTupleElement(op::Parameter(0), /*tuple_index=*/1)));
 }
 
+TEST_F(WhileLoopSimplifierTest, BodyHasNonTupleRoot) {
+  auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
+  Shape while_shape = ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32});
+
+  HloComputation* while_body = [&]() {
+    HloComputation::Builder builder(TestName() + ".passthrough");
+    HloInstruction* param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, while_shape, "param"));
+    HloComputation* result = module().AddEmbeddedComputation(builder.Build());
+
+    result->AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_s32, param, 1));
+    return result;
+  }();
+
+  HloComputation::Builder builder(TestName());
+  auto* init_value = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, while_shape, "init_value"));
+  builder.AddInstruction(HloInstruction::CreateWhile(
+      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
+      while_body, init_value));
+  module().AddEntryComputation(builder.Build());
+  TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
+                          WhileLoopSimplifier{}.Run(&module()));
+  EXPECT_FALSE(simplified_loop);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/while_util.cc b/tensorflow/compiler/xla/service/while_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e20b25e4a08a946f6b58575a4d4e557744f8035c
--- /dev/null
+++ b/tensorflow/compiler/xla/service/while_util.cc
@@ -0,0 +1,140 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/while_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/tuple_util.h"
+
+namespace xla {
+
+static StatusOr<HloComputation*> WidenWhileCondition(
+    HloComputation* narrow_condition, const Shape& wide_shape) {
+  const Shape& narrow_shape =
+      narrow_condition->parameter_instruction(0)->shape();
+
+  HloComputation* wide_while_cond = [&]() {
+    HloComputation::Builder builder(
+        tensorflow::strings::StrCat("wide.", narrow_condition->name()));
+    builder.AddInstruction(
+        HloInstruction::CreateParameter(0, wide_shape, "wide_param"));
+
+    // This is needed so that the root instruction is shaped as a PRED[] -- we
+    // need to get this right to begin with since we can't mutate the type of
+    // the root instruction later.  We later change the root instruction to
+    // something more appropriate.
+    builder.AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+    return narrow_condition->parent()->AddEmbeddedComputation(builder.Build());
+  }();
+
+  HloInstruction* truncated_parameter =
+      TupleUtil::ExtractPrefix(wide_while_cond->parameter_instruction(0),
+                               narrow_shape.tuple_shapes_size());
+  HloInstruction* call_narrow_cond = wide_while_cond->AddInstruction(
+      HloInstruction::CreateCall(ShapeUtil::MakeShape(PRED, {}),
+                                 {truncated_parameter}, narrow_condition));
+
+  wide_while_cond->set_root_instruction(call_narrow_cond);
+
+  TF_RETURN_IF_ERROR(CallInliner::Inline(call_narrow_cond).status());
+  return wide_while_cond;
+}
+
+static StatusOr<std::pair<HloComputation*, CallInliner::InlinedInstructionMap>>
+WidenWhileBody(HloComputation* narrow_body, const Shape& wide_shape) {
+  const Shape& narrow_shape = narrow_body->parameter_instruction(0)->shape();
+
+  HloComputation* wide_while_body = [&]() {
+    HloComputation::Builder builder(
+        tensorflow::strings::StrCat("wide.", narrow_body->name()));
+    builder.AddInstruction(
+        HloInstruction::CreateParameter(0, wide_shape, "wide_param"));
+    return narrow_body->parent()->AddEmbeddedComputation(builder.Build());
+  }();
+
+  HloInstruction* wide_parameter = wide_while_body->parameter_instruction(0);
+  HloInstruction* truncated_parameter = TupleUtil::ExtractPrefix(
+      wide_parameter, narrow_shape.tuple_shapes_size());
+  HloInstruction* call_narrow_body =
+      wide_while_body->AddInstruction(HloInstruction::CreateCall(
+          narrow_shape, {truncated_parameter}, narrow_body));
+
+  std::vector<HloInstruction*> live_through_values;
+  for (int i = narrow_shape.tuple_shapes_size();
+       i < wide_shape.tuple_shapes_size(); i++) {
+    live_through_values.push_back(
+        wide_while_body->AddInstruction(HloInstruction::CreateGetTupleElement(
+            wide_shape.tuple_shapes(i), wide_parameter, i)));
+  }
+
+  wide_while_body->set_root_instruction(
+      TupleUtil::AppendSuffix(call_narrow_body, live_through_values));
+
+  TF_ASSIGN_OR_RETURN(auto inlined_instructions_map,
+                      CallInliner::Inline(call_narrow_body));
+  return {{wide_while_body, std::move(inlined_instructions_map)}};
+}
+
+/*static*/ StatusOr<WhileUtil::MakeInstructionsLiveInResult>
+WhileUtil::MakeInstructionsLiveIn(
+    HloInstruction* while_instr,
+    tensorflow::gtl::ArraySlice<HloInstruction*> instructions) {
+  CHECK(ShapeUtil::IsTuple(while_instr->shape()));
+
+  int64 elements_in_old_while_shape = while_instr->shape().tuple_shapes_size();
+  Shape new_while_shape = while_instr->shape();
+  for (auto* instruction : instructions) {
+    *new_while_shape.add_tuple_shapes() = instruction->shape();
+  }
+
+  TF_ASSIGN_OR_RETURN(
+      HloComputation * new_while_condition,
+      WidenWhileCondition(while_instr->while_condition(), new_while_shape));
+
+  HloComputation* new_while_body;
+  CallInliner::InlinedInstructionMap inlined_instructions_map;
+  TF_ASSIGN_OR_RETURN(
+      std::tie(new_while_body, inlined_instructions_map),
+      WidenWhileBody(while_instr->while_body(), new_while_shape));
+
+  HloInstruction* new_while_init =
+      TupleUtil::AppendSuffix(while_instr->mutable_operand(0), instructions);
+  HloComputation* containing_computation = while_instr->parent();
+  HloInstruction* new_while = containing_computation->AddInstruction(
+      HloInstruction::CreateWhile(new_while_shape, new_while_condition,
+                                  new_while_body, new_while_init));
+  TF_RETURN_IF_ERROR(containing_computation->ReplaceInstruction(
+      while_instr, TupleUtil::ExtractPrefix(
+                       new_while, while_instr->shape().tuple_shapes_size())));
+
+  HloInstruction* while_body_param = new_while_body->parameter_instruction(0);
+  std::vector<HloInstruction*> live_in_instructions;
+  for (int64 i = elements_in_old_while_shape;
+       i < new_while_shape.tuple_shapes_size(); i++) {
+    live_in_instructions.push_back(
+        new_while_body->AddInstruction(HloInstruction::CreateGetTupleElement(
+            instructions[i - elements_in_old_while_shape]->shape(),
+            while_body_param, i)));
+  }
+
+  WhileUtil::MakeInstructionsLiveInResult result;
+
+  result.new_while_instr = new_while;
+  result.while_body_live_in_values = std::move(live_in_instructions);
+  result.while_body_instruction_map = std::move(inlined_instructions_map);
+
+  return std::move(result);
+}
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/while_util.h b/tensorflow/compiler/xla/service/while_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..3600b5a80d26e37fdb7d5173c3b8743734306390
--- /dev/null
+++ b/tensorflow/compiler/xla/service/while_util.h
@@ -0,0 +1,58 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_UTIL_H_
+
+#include "tensorflow/compiler/xla/service/call_inliner.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+
+namespace xla {
+class WhileUtil {
+ public:
+  // Holds a return value from MakeInstructionsLiveIn.
+  struct MakeInstructionsLiveInResult {
+    // The new while operation that has the requested values live in.
+    HloInstruction* new_while_instr;
+
+    // The i'th element of `while_body_live_in_values` is an instruction in the
+    // while body that holds the i'th *newly added* live in value at runtime.
+    std::vector<HloInstruction*> while_body_live_in_values;
+
+    // `while_body_instruction_map` maps instructions in the original while body
+    // to the corresponding instructions in the body for the newly created while
+    // operation.
+    CallInliner::InlinedInstructionMap while_body_instruction_map;
+  };
+
+  // Replaces `while_instr` with a new while instruction that is equivalent to
+  // `while_instr`, except that it has all of the HLO instructions in
+  // `instructions` as live-in, loop invariant values.  These new live in values
+  // are represented as new elements appended to the parameter of the while
+  // loop, which must be of tuple shape.  GetTupleElement instructions computing
+  // each new live in value is returned in the `while_body_live_in_values`
+  // vector.
+  //
+  // Precondition: `while_instr` must have a tuple shaped state.
+  //
+  // Every instruction in `instructions` must be contained in the computation
+  // that contains `while_instr`.
+  static StatusOr<MakeInstructionsLiveInResult> MakeInstructionsLiveIn(
+      HloInstruction* while_instr,
+      tensorflow::gtl::ArraySlice<HloInstruction*> instructions);
+};
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_UTIL_H_
diff --git a/tensorflow/compiler/xla/service/while_util_test.cc b/tensorflow/compiler/xla/service/while_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cf0d0db99bd92b6b364b4e28e56a0902d4065963
--- /dev/null
+++ b/tensorflow/compiler/xla/service/while_util_test.cc
@@ -0,0 +1,130 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/while_util.h"
+
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+
+namespace xla {
+namespace {
+
+namespace op = ::xla::testing::opcode_matchers;
+
+StatusOr<std::unique_ptr<HloModule>> GetParsedModule(
+    HloComputation** entry_computation, HloInstruction** param0,
+    HloInstruction** param1, HloInstruction** param2) {
+  const char* const hlo_string = R"(
+HloModule ModuleWithWhile
+
+while_body {
+  ROOT p_body = (f32[32,32]{1,0}, f32[32,32]{1,0}) parameter(0)
+}
+
+while_condition {
+  p_cond = f32[32,32]{1,0} parameter(0)
+  ROOT result = pred[] constant(true)
+}
+
+ENTRY entry {
+  p_entry_0 = f32[32,32]{1,0} parameter(0)
+  p_entry_1 = s32[32,32]{1,0} parameter(1)
+  p_entry_2 = s64[32,32]{1,0} parameter(2)
+  while_init = (f32[32,32]{1,0}, f32[32,32]{1,0}) tuple(p_entry_0, p_entry_0)
+  ROOT while = (f32[32,32]{1,0}, f32[32,32]{1,0}) while(while_init), condition=while_condition, body=while_body
+}
+)";
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
+                      tools::Parse(hlo_string));
+
+  *entry_computation = module->entry_computation();
+  *param0 = (*entry_computation)->parameter_instruction(0);
+  *param1 = (*entry_computation)->parameter_instruction(1);
+  *param2 = (*entry_computation)->parameter_instruction(2);
+
+  return std::move(module);
+}
+
+TEST(WhileUtil, MakeZeroInstructionsLiveOp) {
+  HloInstruction *param0, *param1, *param2;
+  HloComputation* entry_computation;
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<HloModule> module,
+      GetParsedModule(&entry_computation, &param0, &param1, &param2));
+
+  HloInstruction* while_instr = entry_computation->root_instruction();
+  ASSERT_EQ(while_instr->opcode(), HloOpcode::kWhile);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      WhileUtil::MakeInstructionsLiveInResult make_live_in_result,
+      WhileUtil::MakeInstructionsLiveIn(while_instr, /*instructions=*/{}));
+
+  HloInstruction* new_while_instr = make_live_in_result.new_while_instr;
+
+  EXPECT_THAT(
+      entry_computation->root_instruction(),
+      op::Tuple(op::GetTupleElement(::testing::Eq(new_while_instr), 0),
+                op::GetTupleElement(::testing::Eq(new_while_instr), 1)));
+
+  auto param_reconstructed =
+      op::Tuple(op::GetTupleElement(op::Parameter(0), 0),
+                op::GetTupleElement(op::Parameter(0), 1));
+
+  EXPECT_THAT(new_while_instr->while_body()->root_instruction(),
+              op::Tuple(op::GetTupleElement(param_reconstructed, 0),
+                        op::GetTupleElement(param_reconstructed, 1)));
+}
+
+TEST(WhileUtilTest, MakeTwoInstructionsLive) {
+  HloInstruction *param0, *param1, *param2;
+  HloComputation* entry_computation;
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<HloModule> module,
+      GetParsedModule(&entry_computation, &param0, &param1, &param2));
+
+  HloInstruction* while_instr = entry_computation->root_instruction();
+  ASSERT_EQ(while_instr->opcode(), HloOpcode::kWhile);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      WhileUtil::MakeInstructionsLiveInResult make_live_in_result,
+      WhileUtil::MakeInstructionsLiveIn(while_instr,
+                                        /*instructions=*/{param0, param1}));
+
+  HloInstruction* new_while_instr = make_live_in_result.new_while_instr;
+
+  XLA_VLOG_LINES(3, module->ToString());
+
+  EXPECT_THAT(
+      entry_computation->root_instruction(),
+      op::Tuple(op::GetTupleElement(::testing::Eq(new_while_instr), 0),
+                op::GetTupleElement(::testing::Eq(new_while_instr), 1)));
+
+  auto first_half_param_reconstructed =
+      op::Tuple(op::GetTupleElement(op::Parameter(0), 0),
+                op::GetTupleElement(op::Parameter(0), 1));
+
+  EXPECT_THAT(new_while_instr->while_body()->root_instruction(),
+              op::Tuple(op::GetTupleElement(first_half_param_reconstructed, 0),
+                        op::GetTupleElement(first_half_param_reconstructed, 1),
+                        op::GetTupleElement(op::Parameter(0), 2),
+                        op::GetTupleElement(op::Parameter(0), 3)));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc
new file mode 100644
index 0000000000000000000000000000000000000000..aa40b5cb264803097f52966d6f61f1f41b6b3017
--- /dev/null
+++ b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc
@@ -0,0 +1,50 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h"
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+StatusOr<bool> ZeroSizedHloElimination::Run(HloModule* module) {
+  bool changed = false;
+  for (HloComputation* comp : module->MakeNonfusionComputations()) {
+    for (HloInstruction* instruction : comp->MakeInstructionPostOrder()) {
+      if (instruction->HasSideEffect() ||
+          ShapeUtil::IsTuple(instruction->shape())) {
+        continue;
+      }
+      if (comp->IsRemovable(instruction) &&
+          ShapeUtil::HasZeroElements(instruction->shape())) {
+        TF_RETURN_IF_ERROR(comp->ReplaceWithNewInstruction(
+            instruction, HloInstruction::CreateConstant(
+                             Literal::CreateFromShape(instruction->shape()))));
+        changed = true;
+      }
+    }
+  }
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_folding.h b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h
similarity index 70%
rename from tensorflow/compiler/xla/service/gpu/convolution_folding.h
rename to tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h
index f9c898721f8dd6b8b7e74c82bb2085cc437eaad5..063e312df66ce9cba0fa9f49c2fc6026ba6b74aa 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_folding.h
+++ b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h
@@ -13,25 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONVOLUTION_FOLDING_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONVOLUTION_FOLDING_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_ZERO_SIZED_HLO_ELIMINATION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_ZERO_SIZED_HLO_ELIMINATION_H_
 
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 
+// HLO pass that replaces zero sized Hlos with an zero sized constant literal.
 namespace xla {
-namespace gpu {
-
-class ConvolutionFolding : public HloPassInterface {
+class ZeroSizedHloElimination : public HloPassInterface {
  public:
+  StatusOr<bool> Run(HloModule* module) override;
   tensorflow::StringPiece name() const override {
-    return "convolution-folding";
+    return "zero_sized_hlo_elimination";
   }
-
-  StatusOr<bool> Run(HloModule* module) override;
 };
-
-}  // namespace gpu
 }  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CONVOLUTION_FOLDING_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_ZERO_SIZED_HLO_ELIMINATION_H_
diff --git a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4f8cdc1e0e73cdaa8675fc945ba3dbe19ce3da7d
--- /dev/null
+++ b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc
@@ -0,0 +1,77 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h"
+
+#include <memory>
+#include <unordered_set>
+#include <vector>
+
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/shape_inference.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+namespace {
+class ZeroSizedHloEliminationTest : public HloTestBase {
+ protected:
+  ZeroSizedHloEliminationTest()
+      : HloTestBase(),
+        builder_("zero_sized_computation"),
+        zero_sized_param_(
+            builder_.AddInstruction(HloInstruction::CreateParameter(
+                0, ShapeUtil::MakeShape(F32, {3, 0}), "zero sized param"))) {}
+
+  StatusOr<bool> RunZeroSizedElimination() {
+    HloModule module("zero_sized_elimination_test_module");
+    module.AddEntryComputation(builder_.Build());
+    return ZeroSizedHloElimination{}.Run(&module);
+  }
+
+  HloComputation::Builder builder_;
+  HloInstruction* zero_sized_param_;
+};
+
+TEST_F(ZeroSizedHloEliminationTest, EliminatedZeroSizedOp) {
+  builder_.AddInstruction(HloInstruction::CreateUnary(
+      zero_sized_param_->shape(), HloOpcode::kTanh, zero_sized_param_));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunZeroSizedElimination());
+  EXPECT_TRUE(changed);
+}
+
+TEST_F(ZeroSizedHloEliminationTest, DoesNotEliminateParameter) {
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunZeroSizedElimination());
+  EXPECT_FALSE(changed);
+}
+
+TEST_F(ZeroSizedHloEliminationTest, DoesNotEliminateSideEffects) {
+  builder_.AddInstruction(HloInstruction::CreateSend(zero_sized_param_, 0));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunZeroSizedElimination());
+  EXPECT_FALSE(changed);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/shape_layout.cc b/tensorflow/compiler/xla/shape_layout.cc
index 5bf9842a6ce7be747f58c10f302f85c6f82ac6f9..789eba5780d37e1fd4d80ec881855951c8bba0eb 100644
--- a/tensorflow/compiler/xla/shape_layout.cc
+++ b/tensorflow/compiler/xla/shape_layout.cc
@@ -32,13 +32,13 @@ tensorflow::Status ShapeLayout::CopyLayoutFromShape(const Shape& other_shape) {
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status ShapeLayout::AssignLayoutToShape(Shape* other_shape) const {
-  if (!ShapeUtil::Compatible(*other_shape, shape_)) {
+tensorflow::Status ShapeLayout::AssignLayoutToShape(Shape* to_shape) const {
+  if (!ShapeUtil::Compatible(*to_shape, shape_)) {
     return InvalidArgument("Shape %s is not compatible with shape %s",
-                           ShapeUtil::HumanString(*other_shape).c_str(),
+                           ShapeUtil::HumanString(*to_shape).c_str(),
                            ShapeUtil::HumanString(shape()).c_str());
   }
-  *other_shape = shape_;
+  *to_shape = shape_;
   return tensorflow::Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/shape_layout.h b/tensorflow/compiler/xla/shape_layout.h
index 92564660f21bf1b596c4b9ca04c07eaca27ed192..4c83750f3e6f3c735db66d8e0b86ae3f43e5ca11 100644
--- a/tensorflow/compiler/xla/shape_layout.h
+++ b/tensorflow/compiler/xla/shape_layout.h
@@ -38,18 +38,19 @@ class ShapeLayout {
   explicit ShapeLayout(const Shape& shape) : shape_(shape) {}
 
   // Assigns the layouts in this ShapeLayout to the Layout fields of the given
-  // shape. 'shape' and the shape of the ShapeLayout object must be compatible.
-  tensorflow::Status AssignLayoutToShape(Shape* shape) const;
+  // shape. 'to_shape' and the shape of the ShapeLayout object must be
+  // compatible.
+  tensorflow::Status AssignLayoutToShape(Shape* to_shape) const;
 
   // Returns true if the Layouts in this ShapeLayout match the layouts in the
   // given shape. Returns false otherwise. If the given shape is not compatible
   // with the ShapeLayout's shape, then false is returned.
   bool MatchesLayoutInShape(const Shape& shape) const;
 
-  // Copies the layout from the given shape into this ShapeLayout. 'shape' must
-  // be compatible with the ShapeLayout's shape, and 'shape' must have a layout
-  // (LayoutUtil::HasLayout).
-  tensorflow::Status CopyLayoutFromShape(const Shape& shape);
+  // Copies the layout from the given shape into this ShapeLayout. 'other_shape'
+  // must be compatible with the ShapeLayout's shape, and 'other_shape' must
+  // have a layout (LayoutUtil::HasLayout).
+  tensorflow::Status CopyLayoutFromShape(const Shape& other_shape);
 
   // Clears (Layout::Clear) all the Layouts stored in this object.
   void Clear();
diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h
index bf8d19015079f2ce0bd450594040ed818f94b66b..d752619bd65751779c24f061e44e206d66b01465 100644
--- a/tensorflow/compiler/xla/shape_tree.h
+++ b/tensorflow/compiler/xla/shape_tree.h
@@ -238,7 +238,7 @@ class ShapeTree {
   //           (or compatible).
   //   index : the index of the element in the shape. See ShapeUtil::GetSubshape
   //           for definition of index.
-  //   data : The data value at this elemnt.
+  //   data : The data value at this element.
   template <typename Fn>
   void ForEachElement(const Fn& func) const;
 
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index 74fa0b2f2e740310be23661caef3f19e24e4087b..604e0173e789348923316174873f58058eaf2815 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <algorithm>
 #include <functional>
 #include <numeric>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 
@@ -58,36 +59,47 @@ std::ostream& operator<<(std::ostream& out, const ShapeIndex& shape_index) {
   return out;
 }
 
+std::ostream& operator<<(std::ostream& out, const ShapeIndexView& shape_index) {
+  out << shape_index.ToString();
+  return out;
+}
+
 namespace {
 
 // Recursive helper for comparing the equality of two shapes. Returns true if
 // the shapes are the same. If compare_layouts is true, then layouts must also
 // match.
 bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) {
-  if (ShapeUtil::IsTuple(lhs)) {
-    return ShapeUtil::IsTuple(rhs) &&
+  if (ShapeUtil::IsTuple(lhs) || ShapeUtil::IsTuple(rhs)) {
+    return ShapeUtil::IsTuple(lhs) && ShapeUtil::IsTuple(rhs) &&
            ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(),
                            [=](const Shape& l, const Shape& r) {
                              return CompareShapes(l, r, compare_layouts);
                            });
+  } else if (ShapeUtil::IsOpaque(lhs) || ShapeUtil::IsOpaque(rhs)) {
+    return ShapeUtil::IsOpaque(lhs) && ShapeUtil::IsOpaque(rhs);
   }
-  // Explicitly compare the fields rather than using MessageDifferencer because
-  // we want empty layouts to be treated identically to missing layouts.
+
   if (compare_layouts) {
-    if (!ContainersEqual(lhs.layout().minor_to_major(),
-                         rhs.layout().minor_to_major())) {
-      VLOG(3) << "CompareShapes: lhs layout != rhs layout";
+    if (lhs.layout().format() != rhs.layout().format()) {
       return false;
     }
-    if (!ContainersEqual(lhs.layout().padded_dimensions(),
-                         rhs.layout().padded_dimensions())) {
-      VLOG(3)
-          << "CompareShapes: lhs padded_dimensions != rhs padded_dimensions";
-      return false;
-    }
-    if (lhs.layout().padding_value() != rhs.layout().padding_value()) {
-      VLOG(3) << "CompareShapes: lhs padding value != rhs padding_value";
-      return false;
+    if (LayoutUtil::IsDenseArray(lhs)) {
+      if (!ContainersEqual(LayoutUtil::MinorToMajor(lhs),
+                           LayoutUtil::MinorToMajor(rhs))) {
+        VLOG(3) << "CompareShapes: lhs layout != rhs layout";
+        return false;
+      }
+      if (!ContainersEqual(lhs.layout().padded_dimensions(),
+                           rhs.layout().padded_dimensions())) {
+        VLOG(3)
+            << "CompareShapes: lhs padded_dimensions != rhs padded_dimensions";
+        return false;
+      }
+      if (lhs.layout().padding_value() != rhs.layout().padding_value()) {
+        VLOG(3) << "CompareShapes: lhs padding value != rhs padding_value";
+        return false;
+      }
     }
   }
 
@@ -141,7 +153,8 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
 }
 
 /* static */ int64 ShapeUtil::Rank(const Shape& shape) {
-  CHECK(!ShapeUtil::IsTuple(shape)) << "Tuples do not have a rank";
+  CHECK(!ShapeUtil::IsTuple(shape))
+      << "Tuples do not have a rank, shape: " << shape;
   return shape.dimensions_size();
 }
 
@@ -182,20 +195,32 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
       .ValueOrDie();
 }
 
-/* static */ Shape ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
+/* static */ Shape ShapeUtil::MakeShapeWithDescendingLayout(
     PrimitiveType element_type, tensorflow::gtl::ArraySlice<int64> dimensions) {
   std::vector<int64> layout(dimensions.size());
   std::iota(layout.rbegin(), layout.rend(), static_cast<int64>(0));
   return MakeShapeWithLayout(element_type, dimensions, layout);
 }
 
-/* static */ Shape ShapeUtil::NormalizeShapeToMonotonicDim0MajorLayout(
+/* static */ Shape ShapeUtil::MakeShapeWithSparseLayout(
+    PrimitiveType element_type, tensorflow::gtl::ArraySlice<int64> dimensions,
+    int64 max_sparse_elements) {
+  DCHECK_NE(TUPLE, element_type);
+  DCHECK_NE(OPAQUE, element_type);
+  Shape shape = ShapeUtil::MakeShape(element_type, dimensions);
+  *shape.mutable_layout() = LayoutUtil::MakeSparseLayout(max_sparse_elements);
+  TF_DCHECK_OK(ShapeUtil::ValidateShape(shape));
+  return shape;
+}
+
+/* static */ Shape
+ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
     const Shape& shape) {
   std::vector<int64> dims(shape.dimensions_size());
   for (int i = 0; i < shape.dimensions_size(); ++i) {
     dims[i] = shape.dimensions(LayoutUtil::Major(shape.layout(), i));
   }
-  return MakeShapeWithMonotonicDim0MajorLayout(shape.element_type(), dims);
+  return MakeShapeWithDescendingLayout(shape.element_type(), dims);
 }
 
 /* static */ void ShapeUtil::PopulateShape(
@@ -235,6 +260,7 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
 }
 
 /* static */ void ShapeUtil::AppendMajorDimension(int bound, Shape* shape) {
+  CHECK(LayoutUtil::IsDenseArray(*shape));
   shape->mutable_layout()->add_minor_to_major(Rank(*shape));
   shape->add_dimensions(bound);
   TF_DCHECK_OK(ValidateShape(*shape));
@@ -329,6 +355,14 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
   return MakeTupleShape(new_elements);
 }
 
+// Returns the shape of a real or imaginary component.
+/* static */ Shape ShapeUtil::ComplexComponentShape(
+    const Shape& complex_shape) {
+  CHECK(ElementIsComplex(complex_shape)) << HumanString(complex_shape);
+  return ChangeElementType(complex_shape, primitive_util::ComplexComponentType(
+                                              complex_shape.element_type()));
+}
+
 /* static */ bool ShapeUtil::ShapeIs(const Shape& shape,
                                      PrimitiveType element_type,
                                      std::initializer_list<int64> dimensions) {
@@ -336,7 +370,7 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
 }
 
 /* static */ int64 ShapeUtil::ElementsIn(const Shape& shape) {
-  CHECK(!IsTuple(shape));
+  CHECK(!IsTuple(shape)) << ShapeUtil::HumanString(shape);
   CHECK_EQ(shape.dimensions_size(), Rank(shape));
   return std::accumulate<decltype(shape.dimensions().begin()), int64>(
       shape.dimensions().begin(), shape.dimensions().end(), 1LL,
@@ -352,7 +386,7 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
 }
 
 /* static */ string ShapeUtil::HumanString(const Shape& shape) {
-  if (shape.element_type() == TUPLE) {
+  if (IsTuple(shape)) {
     string text = "(";
     const char* prefix = "";
     for (const Shape& elem_shape : shape.tuple_shapes()) {
@@ -396,10 +430,30 @@ const string& LowercasePrimitiveTypeName(PrimitiveType s) {
   static PrimitiveTypeNameGenerator* gen = new PrimitiveTypeNameGenerator();
   return gen->LowercaseName(s);
 }
+
+StatusOr<PrimitiveType> StringToPrimitiveType(const string& name) {
+  static std::unordered_map<string, PrimitiveType>* name_to_type = [] {
+    static auto* map = new std::unordered_map<string, PrimitiveType>;
+    for (int i = 0; i < PrimitiveType_ARRAYSIZE; i++) {
+      if (PrimitiveType_IsValid(i)) {
+        auto value = static_cast<PrimitiveType>(i);
+        (*map)[LowercasePrimitiveTypeName(value)] = value;
+      }
+    }
+    return map;
+  }();
+  auto found = name_to_type->find(name);
+  if (found == name_to_type->end()) {
+    return InvalidArgument("Invalid element type string: \"%s\".",
+                           name.c_str());
+  }
+  return found->second;
+}
+
 }  // namespace
 
 /* static */ string ShapeUtil::HumanStringWithLayout(const Shape& shape) {
-  if (shape.element_type() == TUPLE) {
+  if (IsTuple(shape)) {
     string text = "(";
     const char* prefix = "";
     for (const Shape& elem_shape : shape.tuple_shapes()) {
@@ -421,8 +475,6 @@ const string& LowercasePrimitiveTypeName(PrimitiveType s) {
       if (LayoutUtil::HasLayout(shape)) {
         tensorflow::strings::StrAppend(&result,
                                        LayoutUtil::HumanString(shape.layout()));
-      } else {
-        tensorflow::strings::StrAppend(&result, "{no layout}");
       }
     }
     return result;
@@ -470,26 +522,35 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
 
   string element_type_string;
   string dimensions_string;
+  string format_string;
   string layout_string;
   // tensorflow::StringPiece is not compatible with internal RE2 StringPiece, so
   // we convert in to the RE2-consumable type and then consume the corresponding
   // amount from our StringPiece type.
   tensorflow::RegexpStringPiece s_consumable(s->data(), s->size());
-  if (RE2::Consume(&s_consumable,
-                   "^(\\w*\\d*)\\[([\\d,]*)\\](?:\\s*{([\\d,]*)})?",
-                   &element_type_string, &dimensions_string, &layout_string)) {
+  if (RE2::Consume(
+          &s_consumable,
+          "^(\\w*\\d*)\\[([\\d,]*)\\](?:\\s*(dense|sparse)?\\s*{([\\d,]+)})?",
+          &element_type_string, &dimensions_string, &format_string,
+          &layout_string)) {
     size_t consumed = s->size() - s_consumable.size();
     s->remove_prefix(consumed);
+    auto string_to_int64 = [&s](const string& input) -> StatusOr<int64> {
+      int64 element;
+      if (!tensorflow::strings::safe_strto64(input.c_str(), &element)) {
+        return InvalidArgument(
+            "Invalid s64 value in parsed shape string: \"%s\" in \"%s\"",
+            input.c_str(), s->ToString().c_str());
+      }
+      return element;
+    };
+
     auto comma_list_to_int64s =
-        [&s](const string& input) -> StatusOr<std::vector<int64>> {
+        [&s,
+         string_to_int64](const string& input) -> StatusOr<std::vector<int64>> {
       std::vector<int64> results;
       for (const string& piece : tensorflow::str_util::Split(input, ',')) {
-        int64 element;
-        if (!tensorflow::strings::safe_strto64(piece.c_str(), &element)) {
-          return InvalidArgument(
-              "Invalid s64 value in parsed shape string: \"%s\" in \"%s\"",
-              piece.c_str(), s->ToString().c_str());
-        }
+        TF_ASSIGN_OR_RETURN(int64 element, string_to_int64(piece));
         results.push_back(element);
       }
       return results;
@@ -500,31 +561,32 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
                         comma_list_to_int64s(dimensions_string));
 
     // Extract the primitive element type.
-    PrimitiveType primitive_type = PRIMITIVE_TYPE_INVALID;
-    for (PrimitiveType i =
-             static_cast<PrimitiveType>(PRIMITIVE_TYPE_INVALID + 1);
-         i < TUPLE; i = static_cast<PrimitiveType>(i + 1)) {
-      if (tensorflow::str_util::Lowercase(PrimitiveType_Name(i)) ==
-          element_type_string) {
-        primitive_type = i;
-        break;
-      }
-    }
-    if (primitive_type == PRIMITIVE_TYPE_INVALID) {
+    TF_ASSIGN_OR_RETURN(const PrimitiveType primitive_type,
+                        StringToPrimitiveType(element_type_string));
+    if (primitive_type == PRIMITIVE_TYPE_INVALID || primitive_type == TUPLE ||
+        primitive_type == OPAQUE) {
       return InvalidArgument("Invalid element type string: \"%s\".",
                              element_type_string.c_str());
     }
 
     Shape result;
-    if (layout_string.empty()) {
+    if (format_string.empty() && layout_string.empty()) {
       // Create a shape without a layout set.
       result = ShapeUtil::MakeShape(primitive_type, dimensions);
-    } else {
+    } else if (format_string == "sparse") {
+      TF_ASSIGN_OR_RETURN(int64 max_elements, string_to_int64(layout_string));
+      result = ShapeUtil::MakeShapeWithSparseLayout(primitive_type, dimensions,
+                                                    max_elements);
+    } else if (format_string.empty() || format_string == "dense") {
       // Extract the layout minor-to-major and set it.
       TF_ASSIGN_OR_RETURN(std::vector<int64> min2maj,
                           comma_list_to_int64s(layout_string));
       TF_ASSIGN_OR_RETURN(result, MakeShapeWithLayoutInternal(
                                       primitive_type, dimensions, min2maj));
+    } else {
+      // This should not be reached.
+      LOG(FATAL) << "Unhandled condition when parsing shape; format: \""
+                 << format_string << "\", layout: \"" << layout_string << "\"";
     }
     TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(result));
     return std::move(result);
@@ -537,7 +599,12 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
 
 /* static */ StatusOr<Shape> ShapeUtil::ParseShapeString(
     tensorflow::StringPiece s) {
-  return ParseShapeStringInternal(&s);
+  TF_ASSIGN_OR_RETURN(Shape shape, ParseShapeStringInternal(&s));
+  if (!s.empty()) {
+    return InvalidArgument("Invalid shape string to parse: \"%s\"",
+                           s.ToString().c_str());
+  }
+  return shape;
 }
 
 /* static */ bool ShapeUtil::SameDimensions(const Shape& lhs,
@@ -563,6 +630,19 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
   return SameDimensions(lhs, rhs);
 }
 
+/* static */ bool ShapeUtil::CompatibleIgnoringFpPrecision(const Shape& lhs,
+                                                           const Shape& rhs) {
+  if (lhs.element_type() == TUPLE) {
+    return rhs.element_type() == TUPLE &&
+           ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(),
+                           CompatibleIgnoringFpPrecision);
+  }
+  if (SameElementTypeIgnoringFpPrecision(lhs, rhs)) {
+    return CompatibleIgnoringElementType(lhs, rhs);
+  }
+  return false;
+}
+
 /* static */ int64 ShapeUtil::GetDimension(const Shape& shape,
                                            int64 dimension_number) {
   return shape.dimensions(GetDimensionNumber(shape, dimension_number));
@@ -622,23 +702,55 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
   TF_DCHECK_OK(ValidateShape(shape));
   DCHECK_NE(OPAQUE, shape.element_type());
   if (shape.element_type() == TUPLE) {
-    CHECK_GT(pointer_size, 0);
-    return pointer_size * shape.tuple_shapes_size();
+    return ByteSizeOfTupleIndexTable(shape, pointer_size);
   }
+  int64 byte_size = ByteSizeOfElements(shape);
+  if (LayoutUtil::IsSparseArray(shape)) {
+    byte_size += ByteSizeOfSparseIndices(shape);
+  }
+  return byte_size;
+}
+
+/* static */ int64 ShapeUtil::ByteSizeOfTupleIndexTable(const Shape& shape,
+                                                        int64 pointer_size) {
+  TF_DCHECK_OK(ValidateShape(shape));
+  DCHECK_EQ(TUPLE, shape.element_type());
+  CHECK_GT(pointer_size, 0);
+  return pointer_size * shape.tuple_shapes_size();
+}
+
+/* static */ int64 ShapeUtil::ByteSizeOfElements(const Shape& shape) {
+  TF_DCHECK_OK(ValidateShape(shape));
+  DCHECK(ShapeUtil::IsArray(shape));
   int64 allocated_element_count;
-  if (shape.layout().padded_dimensions_size() > 0) {
-    CHECK_EQ(Rank(shape), shape.layout().padded_dimensions_size());
-    allocated_element_count = 1;
-    for (int64 dimension_size : shape.layout().padded_dimensions()) {
-      allocated_element_count *= dimension_size;
-    }
+
+  if (LayoutUtil::IsSparseArray(shape)) {
+    allocated_element_count = LayoutUtil::MaxSparseElements(shape.layout());
   } else {
-    allocated_element_count = ElementsIn(shape);
+    CHECK(LayoutUtil::IsDenseArray(shape));
+    tensorflow::gtl::ArraySlice<int64> padded_dimensions =
+        LayoutUtil::PaddedDimensions(shape);
+    if (!padded_dimensions.empty()) {
+      CHECK_EQ(Rank(shape), padded_dimensions.size());
+      allocated_element_count = 1;
+      for (int64 dimension_size : padded_dimensions) {
+        allocated_element_count *= dimension_size;
+      }
+    } else {
+      allocated_element_count = ElementsIn(shape);
+    }
   }
   return allocated_element_count *
          ByteSizeOfPrimitiveType(shape.element_type());
 }
 
+/* static */ int64 ShapeUtil::ByteSizeOfSparseIndices(const Shape& shape) {
+  TF_DCHECK_OK(ValidateShape(shape));
+  DCHECK(LayoutUtil::IsSparseArray(shape));
+  return LayoutUtil::MaxSparseElements(shape.layout()) *
+         ShapeUtil::Rank(shape) * sizeof(int64);
+}
+
 /* static */ Status ShapeUtil::ValidateShapeWithOptionalLayoutInternal(
     const Shape& shape) {
   if (shape.element_type() == TUPLE) {
@@ -694,9 +806,9 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
   return LayoutUtil::ValidateLayoutInShape(shape);
 }
 
-/* static */ Shape ShapeUtil::ChangeElementType(const Shape& shape,
+/* static */ Shape ShapeUtil::ChangeElementType(const Shape& original,
                                                 PrimitiveType type) {
-  Shape new_shape = shape;
+  Shape new_shape = original;
   new_shape.set_element_type(type);
   return new_shape;
 }
@@ -705,7 +817,8 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
                                                  ShapeIndexView index) {
   const Shape* return_shape = &shape;
   for (auto i : index) {
-    CHECK(IsTuple(*return_shape));
+    CHECK(IsTuple(*return_shape))
+        << "Invalid index " << index << " for shape " << shape;
     return_shape = &return_shape->tuple_shapes(i);
   }
   return *return_shape;
@@ -863,7 +976,9 @@ Status ForEachMutableSubshapeHelper(
     new_shape.add_dimensions(dim);
   }
   if (shape.has_layout()) {
+    CHECK(LayoutUtil::IsDenseArray(shape));
     Layout* new_layout = new_shape.mutable_layout();
+    new_layout->set_format(DENSE);
     new_layout->clear_minor_to_major();
     for (auto index : Permute(permutation, shape.layout().minor_to_major())) {
       new_layout->add_minor_to_major(index);
@@ -1117,9 +1232,9 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
     // as input_shape/output_shape and the dimension-0-major layout. These two
     // shapes are used for conversion between logical linear indices and
     // multi-dimensional indices.
-    Shape input_shape_dim0_major = MakeShapeWithMonotonicDim0MajorLayout(
+    Shape input_shape_dim0_major = MakeShapeWithDescendingLayout(
         input_shape.element_type(), AsInt64Slice(input_shape.dimensions()));
-    Shape output_shape_dim0_major = MakeShapeWithMonotonicDim0MajorLayout(
+    Shape output_shape_dim0_major = MakeShapeWithDescendingLayout(
         output_shape.element_type(), AsInt64Slice(output_shape.dimensions()));
 
     for (int64 input_dim = 0; input_dim < Rank(input_shape); ++input_dim) {
@@ -1290,6 +1405,7 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
   shape.mutable_dimensions()->erase(shape.dimensions().begin() + dim_to_delete);
   if (LayoutUtil::HasLayout(shape)) {
     Layout* layout = shape.mutable_layout();
+    layout->set_format(DENSE);
     for (size_t i = 0; i < layout->minor_to_major().size();) {
       if (layout->minor_to_major(i) == dim_to_delete) {
         layout->mutable_minor_to_major()->erase(
@@ -1319,4 +1435,9 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
   return shape;
 }
 
+std::ostream& operator<<(std::ostream& out, const Shape& shape) {
+  out << ShapeUtil::HumanString(shape);
+  return out;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index 2ea1bd95cb571134ab1e1dda37fbc887a1fa06b2..19b1aa93bd373ebd5f502d0dca56c9b31ab4fd7f 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -22,6 +22,8 @@ limitations under the License.
 #include <initializer_list>
 #include <string>
 
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -61,6 +63,9 @@ class ShapeIndex {
   void push_back(int64 value) { indices_.push_back(value); }
   void pop_back() { indices_.pop_back(); }
 
+  // push_front is O(n^2), but shapes don't usually have a ton of dimensions.
+  void push_front(int64 value) { indices_.insert(indices_.begin(), value); }
+
   std::vector<int64>::const_iterator begin() const { return indices_.begin(); }
   std::vector<int64>::const_iterator end() const { return indices_.end(); }
   std::vector<int64>::iterator begin() { return indices_.begin(); }
@@ -133,6 +138,7 @@ class ShapeIndexView {
 };
 
 std::ostream& operator<<(std::ostream& out, const ShapeIndex& shape_index);
+std::ostream& operator<<(std::ostream& out, const ShapeIndexView& shape_index);
 
 // Namespaced collection of (static) shape utilities.
 //
@@ -141,7 +147,10 @@ std::ostream& operator<<(std::ostream& out, const ShapeIndex& shape_index);
 class ShapeUtil {
  public:
   // Returns the number of elements are contained within the provided shape;
-  // e.g. for rank 0 (scalars) the result is always 1.
+  // e.g. for rank 0 (scalars) the result is always 1. Note that sparse shapes
+  // may not actually be able to store this number of elements. See
+  // LayoutUtil::MaxSparseElements(shape) to obtain the maximum number of
+  // elements that can be stored in a sparse shape.
   // Precondition: !IsTuple(shape)
   static int64 ElementsIn(const Shape& shape);
 
@@ -162,6 +171,27 @@ class ShapeUtil {
   // Precondition: !ShapeUtil::IsOpaque(shape) && !ShapeUtil::IsTuple(shape)
   static int64 ByteSizeOfPrimitiveType(PrimitiveType primitive_type);
 
+  // Returns the number of bytes required to store the tuple member pointers for
+  // a allocation of shape. The `shape` must be a TUPLE shape, and
+  // `pointer_size` must be larger than zero.
+  static int64 ByteSizeOfTupleIndexTable(const Shape& shape,
+                                         int64 pointer_size);
+
+  // Returns the number of bytes required for the elements in an allocation of
+  // `shape`, which must be an array shape. The return value does not include
+  // the bytes needed to store sparse indices. Dense shapes use a separate
+  // memory location for each element, and so for these shapes,
+  // `ByteSizeOf(shape) == ByteSizeOfElements(shape)`. For dense shapes, this
+  // size also includes padding if present in the layout. For sparse shapes,
+  // `ByteSizeOf(shape) == ByteSizeOfElements(shape) +
+  // ByteSizeOfSparseindices(shape)`.
+  static int64 ByteSizeOfElements(const Shape& shape);
+
+  // Returns the number of bytes required for the sparse indices in an
+  // allocation of shape. The shape must be an array shape. The return value
+  // does not include the bytes needed to store sparse indices.
+  static int64 ByteSizeOfSparseIndices(const Shape& shape);
+
   // Returns a human-readable string that represents the given shape, with or
   // without layout. e.g. "f32[42x12] {0, 1}" or "f32[64]".
   static string HumanString(const Shape& shape);
@@ -170,7 +200,7 @@ class ShapeUtil {
   // As above, but for program shapes, returns a string for the form:
   //
   // (param_name: f32[42x12], ...) -> f32[24x42]
-  static string HumanString(const ProgramShape& shape);
+  static string HumanString(const ProgramShape& program_shape);
 
   // Parses a ShapeUtil::HumanString-format shape string back into a shape
   // object.
@@ -185,6 +215,31 @@ class ShapeUtil {
     return lhs.element_type() == rhs.element_type();
   }
 
+  // As SameElementType, but allows floating point types to have different
+  // precisions.
+  static bool SameElementTypeIgnoringFpPrecision(const Shape& a,
+                                                 const Shape& b) {
+    if (ElementIsFloating(a) && ElementIsFloating(b)) {
+      return true;
+    }
+    return ShapeUtil::SameElementType(a, b);
+  }
+
+  // Returns the higher-precision element type if a and b are both floating
+  // point types; otherwise, checks that that they have the same element type
+  // and returns it.
+  static PrimitiveType HigherPrecisionElementType(const Shape& a,
+                                                  const Shape& b) {
+    if (SameElementType(a, b)) {
+      return a.element_type();
+    }
+    CHECK(SameElementTypeIgnoringFpPrecision(a, b));
+    return primitive_util::BitWidth(a.element_type()) <
+                   primitive_util::BitWidth(b.element_type())
+               ? b.element_type()
+               : a.element_type();
+  }
+
   // Returns true if the rank, dimension sizes, and element type are
   // identical. Layout is ignored. Tuple elements are compared recursively for
   // compatibility.
@@ -195,6 +250,10 @@ class ShapeUtil {
   // compatibility.
   static bool CompatibleIgnoringElementType(const Shape& lhs, const Shape& rhs);
 
+  // As Compatible, but allow one of lhs and rhs to be BF16 while the other
+  // being F32. Tuple elements are compared recursively for compatibility.
+  static bool CompatibleIgnoringFpPrecision(const Shape& lhs, const Shape& rhs);
+
   // Returns whether the lhs and rhs shapes are identical protobufs.
   static bool Equal(const Shape& lhs, const Shape& rhs);
 
@@ -267,14 +326,22 @@ class ShapeUtil {
       PrimitiveType element_type, tensorflow::gtl::ArraySlice<int64> dimensions,
       tensorflow::gtl::ArraySlice<int64> minor_to_major);
 
-  // Constructs a new shape with major-first layout.
-  static Shape MakeShapeWithMonotonicDim0MajorLayout(
+  static Shape MakeShapeWithSparseLayout(
+      PrimitiveType element_type, tensorflow::gtl::ArraySlice<int64> dimensions,
+      int64 max_sparse_elements);
+
+  // Constructs a new shape with major-first layout (i.e. {n, n-1, ..., 0}).
+  static Shape MakeShapeWithDescendingLayout(
       PrimitiveType element_type,
       tensorflow::gtl::ArraySlice<int64> dimensions);
 
-  // Returns a new shape with major-first layout that has the same layout of
-  // elements with a different shape.
-  static Shape NormalizeShapeToMonotonicDim0MajorLayout(const Shape& shape);
+  // Returns a new Shape based on the given Shape with low-dimension-major
+  // layout (i.e. {n, n-1, ..., 0}, like Fortran), and with the dimensions
+  // rearranged so that it has the same in-memory layout as the given shape.
+  //
+  // For example, transforms f32[B,H,W,C]{0,3,2,1} to f32[H,W,C,B]{3,2,1,0}.
+  static Shape MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
+      const Shape& shape);
 
   // As MakeShape, but the object to write to is passed in.
   static void PopulateShape(PrimitiveType element_type,
@@ -324,7 +391,8 @@ class ShapeUtil {
     return shape.element_type() == OPAQUE;
   }
 
-  // Returns whether the shape is an array.
+  // Returns whether the shape is an array.  Note that scalars are considered
+  // arrays.
   static bool IsArray(const Shape& shape) {
     return !IsTuple(shape) && !IsOpaque(shape);
   }
@@ -351,6 +419,10 @@ class ShapeUtil {
   // shape. E.g. a tuple like (f32, s32, u32) would slice via 1,3 to (s32, u32).
   static Shape SliceTuple(const Shape& tuple, int64 start, int64 limit);
 
+  // Returns the shape of the real/imaginary components of the given complex
+  // shape.
+  static Shape ComplexComponentShape(const Shape& complex_shape);
+
   // Shorthand for testing whether a shape is of a given element type and
   // sequence of dimensions.
   //
@@ -502,8 +574,7 @@ class ShapeUtil {
     CHECK_EQ(Rank(shape), base.size());
     CHECK_EQ(incr.size(), base.size());
     CHECK_EQ(count.size(), base.size());
-    const Layout& layout = shape.layout();
-    const int64 rank = layout.minor_to_major_size();
+    const int64 rank = LayoutUtil::MinorToMajor(shape).size();
     // Allows handling R0 arrays, such that the visitor function will be called
     // once with the proper empty indexes.
     int64 n = -1;
@@ -511,7 +582,7 @@ class ShapeUtil {
     while (n < rank && visitor_function(indexes)) {
       // Increments dimensions in minor to major order.
       for (n = 0; n < rank; ++n) {
-        int64 dim = layout.minor_to_major(n);
+        int64 dim = LayoutUtil::Minor(shape.layout(), n);
         indexes[dim] += incr[dim];
         if (indexes[dim] < base[dim] + count[dim]) {
           break;
@@ -529,6 +600,8 @@ class ShapeUtil {
   TF_DISALLOW_COPY_AND_ASSIGN(ShapeUtil);
 };
 
+std::ostream& operator<<(std::ostream& out, const Shape& shape);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SHAPE_UTIL_H_
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index 4bce7ca51d0534cbcad6faac12818c5f3e94b29e..4db97d45b20b86dc60531845c6e28a223203ff7f 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 
 #include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -71,7 +72,8 @@ TEST(ShapeUtilTest, Rank4DimensionIndexing) {
 
 TEST(ShapeUtilTest, ParseShapeStringR2F32) {
   string shape_string = "f32[123,456]";
-  Shape actual = ShapeUtil::ParseShapeString(shape_string).ValueOrDie();
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual,
+                          ShapeUtil::ParseShapeString(shape_string));
   Shape expected = ShapeUtil::MakeShape(F32, {123, 456});
   ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
       << "expected: " << ShapeUtil::HumanString(expected)
@@ -80,7 +82,8 @@ TEST(ShapeUtilTest, ParseShapeStringR2F32) {
 
 TEST(ShapeUtilTest, ParseShapeStringTupleOfArrays) {
   string shape_string = "(f32[1572864],s8[5120,1024])";
-  Shape actual = ShapeUtil::ParseShapeString(shape_string).ValueOrDie();
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual,
+                          ShapeUtil::ParseShapeString(shape_string));
   Shape expected =
       ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {1572864}),
                                  ShapeUtil::MakeShape(S8, {5120, 1024})});
@@ -91,7 +94,8 @@ TEST(ShapeUtilTest, ParseShapeStringTupleOfArrays) {
 
 TEST(ShapeUtilTest, ParseShapeStringNestedTuple) {
   string shape_string = "(f32[1],(f32[2]), f32[3])";
-  Shape actual = ShapeUtil::ParseShapeString(shape_string).ValueOrDie();
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual,
+                          ShapeUtil::ParseShapeString(shape_string));
   Shape expected = ShapeUtil::MakeTupleShape({
       ShapeUtil::MakeShape(F32, {1}),
       ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {2})}),
@@ -102,6 +106,47 @@ TEST(ShapeUtilTest, ParseShapeStringNestedTuple) {
       << "actual:   " << ShapeUtil::HumanString(actual);
 }
 
+TEST(ShapeUtilTest, ParseShapeStringWithLayout) {
+  string shape_string = "f32[123,456]{0,1}";
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual,
+                          ShapeUtil::ParseShapeString(shape_string));
+  Shape expected = ShapeUtil::MakeShapeWithLayout(F32, {123, 456}, {0, 1});
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual:   " << ShapeUtil::HumanString(actual);
+}
+
+TEST(ShapeUtilTest, ParseShapeStringWithExplicitDenseLayout) {
+  string shape_string = "f32[123,456]dense{0,1}";
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual,
+                          ShapeUtil::ParseShapeString(shape_string));
+  Shape expected = ShapeUtil::MakeShapeWithLayout(F32, {123, 456}, {0, 1});
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual:   " << ShapeUtil::HumanString(actual);
+}
+
+TEST(ShapeUtilTest, ParseShapeStringWithSparseLayout) {
+  string shape_string = "f32[123,456]sparse{10}";
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual,
+                          ShapeUtil::ParseShapeString(shape_string));
+  Shape expected = ShapeUtil::MakeShapeWithSparseLayout(F32, {123, 456}, 10);
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual: " << ShapeUtil::HumanString(actual);
+}
+
+TEST(ShapeUtilTest, ParseInvalidShapeString) {
+  string shape_strings[] = {
+      "f32[123,456]foobar{0,1}", "f32[123,456]sparse{0,1}", "f32[123,456]{foo}",
+      "f32[123,456]dense{foo}",  "f32[123,456]sparse{foo}",
+  };
+  for (const string& shape_string : shape_strings) {
+    StatusOr<Shape> result = ShapeUtil::ParseShapeString(shape_string);
+    ASSERT_FALSE(result.ok()) << "shape: " << shape_string;
+  }
+}
+
 TEST(ShapeUtilTest, CompatibleIdenticalShapes) {
   Shape shape1 = ShapeUtil::MakeShape(F32, {3, 2});
   Shape shape2 = ShapeUtil::MakeShape(F32, {3, 2});
@@ -125,6 +170,18 @@ TEST(ShapeUtilTest, CompatibleNotIdenticalShapes) {
   EXPECT_TRUE(ShapeUtil::Compatible(shape_1, shape_2));
 }
 
+TEST(ShapeUtilTest, CompatibleIgnoringFpPrecision) {
+  Shape shape1 = ShapeUtil::MakeShape(BF16, {3, 2});
+  Shape shape2 = ShapeUtil::MakeShape(F32, {3, 2});
+  ASSERT_TRUE(ShapeUtil::CompatibleIgnoringFpPrecision(shape1, shape2));
+}
+
+TEST(ShapeUtilTest, IncompatibleIgnoringFpPrecision) {
+  Shape shape1 = ShapeUtil::MakeShape(BF16, {3, 2});
+  Shape shape2 = ShapeUtil::MakeShape(F32, {2, 2});
+  ASSERT_FALSE(ShapeUtil::CompatibleIgnoringFpPrecision(shape1, shape2));
+}
+
 TEST(ShapeUtilTest, IncompatibleDifferentElementShapes) {
   Shape shape_1 = ShapeUtil::MakeShape(F32, {3, 2});
   Shape shape_2 = ShapeUtil::MakeShape(PRED, {3, 2});
@@ -139,6 +196,14 @@ TEST(ShapeUtilTest, CompatibleTuples) {
   EXPECT_TRUE(ShapeUtil::Compatible(tuple1, tuple2));
 }
 
+TEST(ShapeUtilTest, CompatibleTuplesIgnoringFpPrecision) {
+  Shape tuple1 = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(BF16, {3, 2}), ShapeUtil::MakeShape(F32, {4, 5})});
+  Shape tuple2 = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F64, {3, 2}), ShapeUtil::MakeShape(BF16, {4, 5})});
+  EXPECT_TRUE(ShapeUtil::CompatibleIgnoringFpPrecision(tuple1, tuple2));
+}
+
 TEST(ShapeUtilTest, IncompatibleTuplesWithSwappedElements) {
   Shape tuple1 = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(PRED, {4, 5}), ShapeUtil::MakeShape(F32, {3, 2})});
@@ -148,6 +213,14 @@ TEST(ShapeUtilTest, IncompatibleTuplesWithSwappedElements) {
   EXPECT_FALSE(ShapeUtil::CompatibleIgnoringElementType(tuple1, tuple2));
 }
 
+TEST(ShapeUtilTest, IncompatibleTuplesIgnoringFpPrecision) {
+  Shape tuple1 = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(BF16, {4, 5}), ShapeUtil::MakeShape(F32, {3, 2})});
+  Shape tuple2 = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {3, 2}), ShapeUtil::MakeShape(BF16, {4, 5})});
+  EXPECT_FALSE(ShapeUtil::CompatibleIgnoringFpPrecision(tuple1, tuple2));
+}
+
 TEST(ShapeUtilTest, IncompatibleTuplesWithDifferentPrimitiveType) {
   Shape tuple1 = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(PRED, {4, 5}), ShapeUtil::MakeShape(F32, {3, 2})});
@@ -165,20 +238,6 @@ TEST(ShapeUtilTest, IncompatibleTuplesWithDifferentDimensions) {
   EXPECT_FALSE(ShapeUtil::Compatible(tuple1, tuple2));
 }
 
-TEST(ShapeUtilTest, EmptyLayoutEqualsMissingLayout) {
-  // A shape with a missing layout should be equal to a shape with an empty
-  // layout.
-  Shape scalar1 = ShapeUtil::MakeShape(F32, {});
-  Shape scalar2 = ShapeUtil::MakeShape(F32, {});
-
-  EXPECT_TRUE(ShapeUtil::Equal(scalar1, scalar2));
-
-  scalar1.clear_layout();    // Remove layout field.
-  scalar2.mutable_layout();  // Create empty layout field.
-
-  EXPECT_TRUE(ShapeUtil::Equal(scalar1, scalar2));
-}
-
 TEST(ShapeUtilTest, CompareShapesWithPaddedDimensionsMismatch) {
   Shape shape1 = ShapeUtil::MakeShape(F32, {20, 30});
   shape1.mutable_layout()->add_padded_dimensions(10);
@@ -199,17 +258,17 @@ TEST(ShapeUtilTest, CompareShapesWithPaddingValueMismatch) {
   EXPECT_FALSE(ShapeUtil::Equal(shape1, shape2));
 }
 
-TEST(ShapeUtilTest, ScalarUnpopulatedLayoutEqualsScalarLayout) {
-  Shape scalar_unpopulated = ShapeUtil::MakeShape(F32, {});
-  scalar_unpopulated.clear_layout();
-  ASSERT_FALSE(scalar_unpopulated.has_layout())
-      << ShapeUtil::HumanStringWithLayout(scalar_unpopulated);
+TEST(ShapeUtilTest, ScalarDefaultLayoutEqualsScalarEmptyMin2Maj) {
+  Shape scalar_default_layout = ShapeUtil::MakeShape(F32, {});
+  ASSERT_TRUE(scalar_default_layout.has_layout())
+      << ShapeUtil::HumanStringWithLayout(scalar_default_layout);
 
-  const Shape scalar_populated = ShapeUtil::MakeShapeWithLayout(F32, {}, {});
-  ASSERT_TRUE(scalar_populated.has_layout())
-      << ShapeUtil::HumanStringWithLayout(scalar_populated);
+  const Shape scalar_empty_min2maj =
+      ShapeUtil::MakeShapeWithLayout(F32, {}, {});
+  ASSERT_TRUE(scalar_empty_min2maj.has_layout())
+      << ShapeUtil::HumanStringWithLayout(scalar_empty_min2maj);
 
-  EXPECT_TRUE(ShapeUtil::Equal(scalar_unpopulated, scalar_populated));
+  EXPECT_TRUE(ShapeUtil::Equal(scalar_default_layout, scalar_empty_min2maj));
 }
 
 TEST(ShapeUtilTest, ByteSizeOfWithoutPadding) {
diff --git a/tensorflow/compiler/xla/sparse_index_array.cc b/tensorflow/compiler/xla/sparse_index_array.cc
new file mode 100644
index 0000000000000000000000000000000000000000..31844abd89a020c87c403353374a80fb639a3244
--- /dev/null
+++ b/tensorflow/compiler/xla/sparse_index_array.cc
@@ -0,0 +1,110 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/sparse_index_array.h"
+
+#include "tensorflow/compiler/xla/index_util.h"
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+
+namespace xla {
+
+SparseIndexArray::SparseIndexArray() : rank_(0), max_indices_(0) {}
+
+SparseIndexArray::SparseIndexArray(int64 max_indices, int64 rank,
+                                   std::vector<int64> indices)
+    : indices_(std::move(indices)), rank_(rank), max_indices_(max_indices) {
+  CHECK_GT(rank_, 0);
+  CHECK_EQ(indices_.size() % rank_, 0)
+      << "indices_.size(): " << indices_.size() << ", rank_: " << rank_;
+  CHECK_LT(index_count(), max_indices_);
+}
+
+SparseIndexArray::SparseIndexArray(int64 max_indices, int64 rank,
+                                   tensorflow::gtl::ArraySlice<int64> indices)
+    : SparseIndexArray(max_indices, rank,
+                       std::vector<int64>(indices.begin(), indices.end())) {}
+
+SparseIndexArray::SparseIndexArray(int64 max_indices,
+                                   const Array2D<int64>& indices)
+    : SparseIndexArray(max_indices, indices.n2(),
+                       std::vector<int64>(indices.begin(), indices.end())) {}
+
+int64 SparseIndexArray::index_count() const {
+  CHECK_GT(rank_, 0);
+  CHECK_EQ(indices_.size() % rank_, 0);
+  return indices_.size() / rank_;
+}
+
+tensorflow::gtl::ArraySlice<int64> SparseIndexArray::At(
+    int64 sparse_element_number) const {
+  CHECK_GT(rank_, 0);
+  CHECK_GE(sparse_element_number, 0);
+  CHECK_LE(rank_ * sparse_element_number + rank_, indices_.size());
+  return tensorflow::gtl::ArraySlice<int64>(
+      indices_.data() + rank_ * sparse_element_number, rank_);
+}
+
+tensorflow::gtl::MutableArraySlice<int64> SparseIndexArray::At(
+    int64 sparse_element_number) {
+  CHECK_GT(rank_, 0);
+  CHECK_GE(sparse_element_number, 0);
+  CHECK_LE(rank_ * sparse_element_number + rank_, indices_.size());
+  return tensorflow::gtl::MutableArraySlice<int64>(
+      indices_.data() + rank_ * sparse_element_number, rank_);
+}
+
+void SparseIndexArray::Append(tensorflow::gtl::ArraySlice<int64> index) {
+  CHECK_GT(rank_, 0);
+  CHECK_EQ(index.size(), rank_);
+  indices_.insert(indices_.end(), index.begin(), index.end());
+}
+
+void SparseIndexArray::Clear() { indices_.clear(); }
+
+void SparseIndexArray::Resize(int64 num_indices) {
+  CHECK_GT(rank_, 0);
+  indices_.resize(rank_ * num_indices);
+}
+
+bool SparseIndexArray::Validate(const Shape& shape) const {
+  if (rank_ == 0 || rank_ != ShapeUtil::Rank(shape)) {
+    return false;
+  }
+  int64 num_indices = index_count();
+  if (num_indices > LayoutUtil::MaxSparseElements(shape.layout())) {
+    return false;
+  }
+  if (num_indices < 2) {
+    return true;
+  }
+  tensorflow::gtl::ArraySlice<int64> last = At(0);
+  if (!IndexUtil::IndexInBounds(shape, last)) {
+    return false;
+  }
+  for (int64 n = 1; n < num_indices; ++n) {
+    tensorflow::gtl::ArraySlice<int64> next = At(n);
+    if (!IndexUtil::IndexInBounds(shape, next)) {
+      return false;
+    }
+    if (IndexUtil::CompareIndices(last, next) >= 0) {
+      return false;
+    }
+    last = next;
+  }
+  return true;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/sparse_index_array.h b/tensorflow/compiler/xla/sparse_index_array.h
new file mode 100644
index 0000000000000000000000000000000000000000..f2ce22d6721ff8da46f741ccedc2a63dea5994c8
--- /dev/null
+++ b/tensorflow/compiler/xla/sparse_index_array.h
@@ -0,0 +1,176 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Utility class for managing sparse array indices.
+
+#ifndef TENSORFLOW_COMPILER_XLA_SPARSE_INDEX_ARRAY_H_
+#define TENSORFLOW_COMPILER_XLA_SPARSE_INDEX_ARRAY_H_
+
+#include <vector>
+
+#include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/index_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+
+namespace xla {
+
+// Encapsulates the array of indices for a sparse array.  A SparseIndexArray
+// contain indices for up to `max_indices` elements of a sparse array.  Each
+// sparse index is an array of `rank` int64 value that gives the location of a
+// value within a sparse array.  Note that the dimensions of the array are not
+// checked (except for the rank).  To avoid confusion, we refer to the position
+// of an index within a SparseIndexArray as a sparse index number.
+class SparseIndexArray {
+ public:
+  SparseIndexArray();
+  SparseIndexArray(const SparseIndexArray&) = default;
+  SparseIndexArray(SparseIndexArray&&) = default;
+  SparseIndexArray& operator=(const SparseIndexArray&) = default;
+  SparseIndexArray& operator=(SparseIndexArray&&) = default;
+
+  // Constructs a SparseIndexArray that can hold up to `max_indices` sparse
+  // indices, with an initial contents obtained from the given array.  The rank
+  // is taken from the minor dimension of the array.  The major dimension of the
+  // array must not exceed `max_indices`.
+  SparseIndexArray(int64 max_indices, const Array2D<int64>& indices);
+
+  // Like above, but the array is flattened.  For example, the following are
+  // equivalent:
+  //
+  //  SparseIndexArray(10, 3,
+  //                   Array2D{
+  //                     {0, 1, 2},
+  //                     {3, 4, 5},
+  //                     {6, 7, 8},
+  //                     {9, 10, 11},
+  //                   })
+  //
+  //  SparseIndexArray(10, 3,
+  //                   {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11})
+  //
+  SparseIndexArray(int64 max_indices, int64 rank,
+                   std::vector<int64> indices = {});
+  SparseIndexArray(int64 max_indices, int64 rank,
+                   tensorflow::gtl::ArraySlice<int64> indices);
+
+  // Returns the number of elements represented by the indices stored in the
+  // array.
+  int64 index_count() const;
+
+  // Returns a slice that refers to the given sparse index number. The argument
+  // must be in the range [0, element_count()).
+  tensorflow::gtl::ArraySlice<int64> At(int64 sparse_element_number) const;
+  tensorflow::gtl::MutableArraySlice<int64> At(int64 sparse_element_number);
+
+  // Adds the given index at the end of the array.  The new size of the
+  // SparseIndexArray must not exceed `max_indices`.
+  void Append(tensorflow::gtl::ArraySlice<int64> index);
+
+  // Removes all indices from the array.
+  void Clear();
+
+  // Resizes the array to contain the given number of sparse indices.  The new
+  // size must be smaller than `max_indices`.  If the new size is larger than
+  // the old size, the value of the new indices is not specified.
+  void Resize(int64 num_indices);
+
+  // Returns true iff all indices are unique and occur in sorted order, and are
+  // valid for the given shape.
+  bool Validate(const Shape& shape) const;
+
+  int64 rank() const { return rank_; }
+  int64 max_indices() const { return max_indices_; }
+
+  // Returns a pointer to the int64 array that holds the sparse indices.
+  tensorflow::gtl::MutableArraySlice<int64> mutable_data() { return &indices_; }
+  tensorflow::gtl::ArraySlice<int64> data() const { return indices_; }
+
+  // Sorts this sparse index array along with the set of corresponding values.
+  // The indices and values are sorted in the lexicographic order of the
+  // indices, from smallest to largest.
+  //
+  // For example:
+  //
+  //   std::vector<float> v{10.0, 11.0, 12.0};
+  //   SparseIndexArray a(10, 3,
+  //                      {{3, 4, 5},
+  //                       {1, 2, 3},
+  //                       {2, 3, 4}});
+  //   a.SortWithValues(&v);
+  //   // Prints "11.0, 12.0, 10.0":
+  //   std::cout << v[0] << ", " << v[1] << ", " << v[2] << std::endl;
+  //
+  template <typename NativeT>
+  void SortWithValues(tensorflow::gtl::MutableArraySlice<NativeT> values);
+
+ private:
+  std::vector<int64> indices_;
+  int64 rank_;
+  int64 max_indices_;
+};
+
+template <typename NativeT>
+void SparseIndexArray::SortWithValues(
+    tensorflow::gtl::MutableArraySlice<NativeT> values) {
+  int64 num_elements = index_count();
+  CHECK_EQ(values.size(), num_elements);
+  std::vector<int64> sort_order;
+  sort_order.reserve(num_elements);
+  for (int64 i = 0; i < num_elements; ++i) {
+    sort_order.push_back(i);
+  }
+  auto sort_order_less = [this](int64 lhs, int64 rhs) {
+    return IndexUtil::CompareIndices(At(lhs), At(rhs)) < 0;
+  };
+  std::sort(sort_order.begin(), sort_order.end(), sort_order_less);
+
+  // Reorder the array elements according to sort_order.  Work through the array
+  // and follow cycles so we can do the reorder in-place.
+  tensorflow::gtl::InlinedVector<int64, 8> saved_index(rank());
+  for (int64 i = 0; i < num_elements; ++i) {
+    // sort_order[i] == -1 indicates the element has already been copied.
+    if (sort_order[i] < 0) {
+      continue;
+    } else if (i == sort_order[i]) {
+      // The element is already in sorted order.
+      sort_order[i] = -1;
+      continue;
+    }
+
+    std::copy_n(At(i).begin(), rank(), saved_index.begin());
+    NativeT saved_value = values[i];
+    int64 j = i;
+    for (;;) {
+      if (sort_order[j] == i) {
+        std::copy_n(saved_index.begin(), rank(), At(j).begin());
+        values[j] = saved_value;
+        sort_order[j] = -1;
+        break;
+      }
+
+      std::copy_n(At(sort_order[j]).begin(), rank(), At(j).begin());
+      values[j] = values[sort_order[j]];
+
+      int64 k = sort_order[j];
+      sort_order[j] = -1;
+      j = k;
+    }
+  }
+}
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SPARSE_INDEX_ARRAY_H_
diff --git a/tensorflow/compiler/xla/sparse_index_array_test.cc b/tensorflow/compiler/xla/sparse_index_array_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7377f88958dcb7daf3d3f4f0e07966fdc9294580
--- /dev/null
+++ b/tensorflow/compiler/xla/sparse_index_array_test.cc
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/sparse_index_array.h"
+
+#include <vector>
+
+#include "tensorflow/compiler/xla/test.h"
+
+namespace xla {
+namespace {
+
+TEST(SparseIndexArrayTest, Sort) {
+  SparseIndexArray a(10, 3);
+  a.Append({2, 3, 4});
+  a.Append({3, 4, 5});
+  a.Append({1, 2, 3});
+  a.Append({5, 6, 7});
+  a.Append({4, 5, 6});
+  a.Append({6, 7, 8});
+  std::vector<double> values = {
+      12.0, 13.0, 11.0, 15.0, 14.0, 16.0,
+  };
+  a.SortWithValues<double>(&values);
+  ASSERT_EQ(a.data(), std::vector<int64>({1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5,
+                                          6, 7, 6, 7, 8}));
+  ASSERT_EQ(values, std::vector<double>({11.0, 12.0, 13.0, 14.0, 15.0, 16.0}));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/status_macros.h b/tensorflow/compiler/xla/status_macros.h
index 5e5550563d02de99ddefbeb8ee8e1bf98afdcdbf..e51dd64e2a3dc7c359918cb08c6c94b2b4d9e91b 100644
--- a/tensorflow/compiler/xla/status_macros.h
+++ b/tensorflow/compiler/xla/status_macros.h
@@ -196,18 +196,8 @@ class StatusAdaptorForMacros {
 #define TF_STATUS_MACROS_CONCAT_NAME(x, y) TF_STATUS_MACROS_CONCAT_IMPL(x, y)
 #define TF_STATUS_MACROS_CONCAT_IMPL(x, y) x##y
 
-#define TF_ASSIGN_OR_RETURN(...)                                             \
-  TF_STATUS_MACRO_GET_VARIADIC_IMPL(__VA_ARGS__, TF_ASSIGN_OR_RETURN_IMPL_3, \
-                                    TF_ASSIGN_OR_RETURN_IMPL_2)              \
-  (__VA_ARGS__)
-
-#define TF_STATUS_MACRO_GET_VARIADIC_IMPL(_1, _2, _3, NAME, ...) NAME
-
-#define TF_ASSIGN_OR_RETURN_IMPL_2(lhs, rexpr) \
-  TF_ASSIGN_OR_RETURN_IMPL_3(lhs, rexpr)
-
-#define TF_ASSIGN_OR_RETURN_IMPL_3(lhs, rexpr) \
-  TF_ASSIGN_OR_RETURN_IMPL(                    \
+#define TF_ASSIGN_OR_RETURN(lhs, rexpr) \
+  TF_ASSIGN_OR_RETURN_IMPL(             \
       TF_STATUS_MACROS_CONCAT_NAME(_status_or_value, __COUNTER__), lhs, rexpr)
 
 #define TF_ASSIGN_OR_RETURN_IMPL(statusor, lhs, rexpr) \
diff --git a/tensorflow/compiler/xla/statusor_internals.h b/tensorflow/compiler/xla/statusor_internals.h
index a2fda5bb3c6f11c20fc45c57885b1ce7523db81d..14636bd144bc0a155fc96c5a350c658fd2dadfe6 100644
--- a/tensorflow/compiler/xla/statusor_internals.h
+++ b/tensorflow/compiler/xla/statusor_internals.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_STATUSOR_INTERNALS_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_STATUSOR_INTERNALS_H_
+#ifndef TENSORFLOW_COMPILER_XLA_STATUSOR_INTERNALS_H_
+#define TENSORFLOW_COMPILER_XLA_STATUSOR_INTERNALS_H_
 
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/core/platform/macros.h"
@@ -242,4 +242,4 @@ struct TraitsBase<false, false> {
 }  // namespace internal_statusor
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_STATUSOR_INTERNALS_H_
+#endif  // TENSORFLOW_COMPILER_XLA_STATUSOR_INTERNALS_H_
diff --git a/tensorflow/compiler/xla/statusor_test.cc b/tensorflow/compiler/xla/statusor_test.cc
index 5fa2211ac66177514ac8ecabfa8791e7c8c014a2..f9d25945bc617507735fb6c4d011c39723497f69 100644
--- a/tensorflow/compiler/xla/statusor_test.cc
+++ b/tensorflow/compiler/xla/statusor_test.cc
@@ -32,26 +32,26 @@ namespace {
 class Base1 {
  public:
   virtual ~Base1() {}
-  int pad;
+  int pad_;
 };
 
 class Base2 {
  public:
   virtual ~Base2() {}
-  int yetotherpad;
+  int yetotherpad_;
 };
 
 class Derived : public Base1, public Base2 {
  public:
   ~Derived() override {}
-  int evenmorepad;
+  int evenmorepad_;
 };
 
 class CopyNoAssign {
  public:
-  explicit CopyNoAssign(int value) : foo(value) {}
-  CopyNoAssign(const CopyNoAssign& other) : foo(other.foo) {}
-  int foo;
+  explicit CopyNoAssign(int value) : foo_(value) {}
+  CopyNoAssign(const CopyNoAssign& other) : foo_(other.foo_) {}
+  int foo_;
 
  private:
   const CopyNoAssign& operator=(const CopyNoAssign&);
@@ -253,7 +253,7 @@ TEST(StatusOr, TestCopyCtorNonAssignable) {
   StatusOr<CopyNoAssign> original(value);
   StatusOr<CopyNoAssign> copy(original);
   EXPECT_EQ(copy.status(), original.status());
-  EXPECT_EQ(original.ValueOrDie().foo, copy.ValueOrDie().foo);
+  EXPECT_EQ(original.ValueOrDie().foo_, copy.ValueOrDie().foo_);
 }
 
 TEST(StatusOr, TestCopyCtorStatusOKConverting) {
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index addce9019b340f9489a25dbdd2437f4d71740b95..5ff774075259e819718bcb91af4092129a6df582 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -69,6 +69,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_dataflow_analysis",
         "//tensorflow/compiler/xla/service:hlo_verifier",
         "//tensorflow/compiler/xla/service:transfer_manager",
         "//tensorflow/core:lib",
@@ -104,7 +105,9 @@ cc_library(
     hdrs = ["hlo_test_base.h"],
     deps = [
         ":literal_test_util",
+        ":test_utils",
         "//tensorflow/compiler/xla:shape_layout",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
@@ -114,6 +117,10 @@ cc_library(
         "//tensorflow/compiler/xla/service:computation_layout",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_runner",
+        "//tensorflow/compiler/xla/service:hlo_verifier",
+        "//tensorflow/compiler/xla/service:interpreter_plugin",  # reference backend
+        "//tensorflow/compiler/xla/service:platform_util",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
@@ -338,6 +345,24 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "xla_hlo_profile_test",
+    srcs = ["xla_hlo_profile_test.cc"],
+    deps = [
+        "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/service:platform_util",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:regexp_internal",
+        "//tensorflow/core:test",
+    ],
+)
+
 xla_test(
     name = "axpy_simple_test",
     srcs = ["axpy_simple_test.cc"],
@@ -354,6 +379,7 @@ xla_test(
 xla_test(
     name = "map_test",
     srcs = ["map_test.cc"],
+    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal_util",
@@ -382,6 +408,7 @@ xla_test(
     name = "params_test",
     srcs = ["params_test.cc"],
     shard_count = 30,
+    tags = ["optonly"],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal_util",
@@ -430,6 +457,22 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "conditional_test",
+    srcs = ["conditional_test.cc"],
+    deps = [
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 xla_test(
     name = "unary_op_test",
     srcs = ["unary_op_test.cc"],
@@ -532,9 +575,30 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "exhaustive_f32_elementwise_op_test",
+    srcs = ["exhaustive_f32_elementwise_op_test.cc"],
+    backends = [
+        "cpu",
+        "gpu",
+    ],
+    shard_count = 48,
+    tags = [
+        "enormous",
+        "manual",
+    ],
+    deps = [
+        ":client_library_test_base",
+        ":literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+    ],
+)
+
 xla_test(
     name = "reduce_precision_test",
     srcs = ["reduce_precision_test.cc"],
+    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal_util",
@@ -557,6 +621,9 @@ xla_test(
 xla_test(
     name = "dot_operation_test",
     srcs = ["dot_operation_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -579,6 +646,9 @@ xla_test(
 xla_test(
     name = "dot_operation_runtime_test",
     srcs = ["dot_operation_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -773,11 +843,6 @@ xla_test(
 xla_test(
     name = "bfloat16_test",
     srcs = ["bfloat16_test.cc"],
-    blacklisted_backends = [
-        "cpu",
-        "cpu_parallel",
-        "gpu",
-    ],
     shard_count = 40,
     deps = [
         ":test_utils",
@@ -807,6 +872,31 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "half_test",
+    srcs = ["half_test.cc"],
+    backends = [
+        # TODO(b/72509305): Flaky (fails with SEGV) as of 2018-01-25
+        # "cpu",
+        "gpu",
+    ],
+    deps = [
+        ":test_utils",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla/client:computation",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 xla_test(
     name = "slice_test",
     srcs = ["slice_test.cc"],
@@ -961,7 +1051,10 @@ xla_test(
     name = "reduce_window_test",
     timeout = "long",
     srcs = [],
-    tags = ["optonly"],
+    tags = [
+        "enable_for_xla_interpreter",
+        "optonly",
+    ],
     xla_test_library_deps = [":reduce_window_test_library"],
     deps = [],
 )
@@ -970,6 +1063,10 @@ xla_test(
     name = "select_and_scatter_test",
     timeout = "long",
     srcs = ["select_and_scatter_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+        "optonly",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal_util",
@@ -1008,6 +1105,19 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "reduce_hlo_test",
+    srcs = ["reduce_hlo_test.cc"],
+    deps = [
+        ":client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 xla_test(
     name = "call_test",
     srcs = ["call_test.cc"],
@@ -1036,9 +1146,10 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
     ],
@@ -1364,6 +1475,31 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "execution_profile_test",
+    srcs = ["execution_profile_test.cc"],
+    deps = [
+        ":client_library_test_base",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
+xla_test(
+    name = "execution_profile_test_with_xla_hlo_profile",
+    srcs = ["execution_profile_test.cc"],
+    args = ["--xla_hlo_profile"],
+    deps = [
+        ":client_library_test_base",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
 xla_test(
     name = "replay_test",
     srcs = ["replay_test.cc"],
@@ -1456,6 +1592,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_runner",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -1482,6 +1619,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_runner",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -1676,6 +1814,45 @@ xla_test(
     ],
 )
 
+# A demo of textual IR based test.
+xla_test(
+    name = "sample_text_test",
+    srcs = ["sample_text_test.cc"],
+    # You can leave this empty if you want to test all supported backends.
+    backends = [
+        "cpu",
+        "gpu",
+    ],
+    deps = [
+        ":hlo_test_base",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+    ],
+)
+
+# A demo of test that loads an hlo module from a file and compares results on gpu and cpu.
+tf_cc_test(
+    name = "sample_file_test",
+    srcs = ["sample_file_test.cc"],
+    data = ["isolated_convolution.hlo"],
+    tags = ["requires-gpu-sm35"],
+    deps = [
+        ":hlo_test_base",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/service:cpu_plugin",  # reference backend
+        "//tensorflow/compiler/xla/service:gpu_plugin",  # test backend
+        "//tensorflow/compiler/xla/service:platform_util",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 
 filegroup(
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index c6e8b24d1211743d07878d388522feacf9c0e7f1..7e9005001db34d403ea923eb9c152d114bf32803 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -1879,20 +1879,73 @@ XLA_TEST_F(ArrayElementwiseOpTest, ClampF32ScalarVector) {
   auto min_scalar = builder.ConstantR0<float>(0.0f);
   auto min_vector = builder.ConstantR1<float>({1.0f, -6.5f, 1.0f, 2.25f, 0.0f});
   auto arg_vector = builder.ConstantR1<float>({2.0f, 10.0f, -5.0f, 1.0f, 4.0f});
-  auto arg_scalar = builder.ConstantR1<float>({2.0f, 10.0f, -5.0f, 1.0f, 4.0f});
   auto max_scalar = builder.ConstantR0<float>(3.0f);
   auto max_vector = builder.ConstantR1<float>({3.0f, 0.5f, 25.5f, 5.0f, 123.0});
   // Perform clamp with broadcasted scalar and vector.
   auto clamp = builder.Add(
       builder.Add(builder.Clamp(min_vector, arg_vector, max_scalar),
                   builder.Clamp(min_scalar, arg_vector, max_vector)),
-      builder.Add(builder.Clamp(min_vector, arg_scalar, max_vector),
-                  builder.Clamp(min_scalar, arg_scalar, max_vector)));
+      builder.Add(builder.Clamp(min_vector, arg_vector, max_vector),
+                  builder.Clamp(min_scalar, arg_vector, max_scalar)));
 
-  ComputeAndCompareR1<float>(&builder, {8.0f, 4.5f, 2.0f, 6.5f, 15.0f}, {},
+  ComputeAndCompareR1<float>(&builder, {8.0f, 7.0f, 2.0f, 6.5f, 14.0f}, {},
                              error_spec_);
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, ClampS32Vector) {
+  ComputationBuilder builder(client_, TestName());
+  auto min_vector = builder.ConstantR1<int32>({1, -6, 1, 2, 0, -5});
+  auto arg_vector = builder.ConstantR1<int32>({2, 10, -5, 1, 4, 10});
+  auto max_vector = builder.ConstantR1<int32>({3, 0, 25, 5, 123, -1});
+  auto clamp = builder.Clamp(min_vector, arg_vector, max_vector);
+
+  ComputeAndCompareR1<int32>(&builder, {2, 0, 1, 2, 4, -1}, {});
+}
+
+XLA_TEST_F(ArrayElementwiseOpTest, ClampS32ScalarVector) {
+  ComputationBuilder builder(client_, TestName());
+  auto min_scalar = builder.ConstantR0<int32>(0);
+  auto min_vector = builder.ConstantR1<int32>({1, -6, 1, 2, 0});
+  auto arg_vector = builder.ConstantR1<int32>({2, 10, -5, 1, 4});
+  auto max_scalar = builder.ConstantR0<int32>(3);
+  auto max_vector = builder.ConstantR1<int32>({3, 1, 25, 5, 123});
+  // Perform clamp with broadcasted scalar and vector.
+  auto clamp = builder.Add(
+      builder.Add(builder.Clamp(min_vector, arg_vector, max_scalar),
+                  builder.Clamp(min_scalar, arg_vector, max_vector)),
+      builder.Add(builder.Clamp(min_vector, arg_vector, max_vector),
+                  builder.Clamp(min_scalar, arg_vector, max_scalar)));
+
+  ComputeAndCompareR1<int32>(&builder, {8, 8, 2, 6, 14}, {});
+}
+
+XLA_TEST_F(ArrayElementwiseOpTest, ClampU32Vector) {
+  ComputationBuilder builder(client_, TestName());
+  auto min_vector = builder.ConstantR1<uint32>({1, 2, 1, 2, 0, ~0u - 4});
+  auto arg_vector = builder.ConstantR1<uint32>({2, 10, 5, 1, 4, 10});
+  auto max_vector = builder.ConstantR1<uint32>({3, 5, 25, 5, 123, ~0u});
+  auto clamp = builder.Clamp(min_vector, arg_vector, max_vector);
+
+  ComputeAndCompareR1<uint32>(&builder, {2, 5, 5, 2, 4, ~0u - 4}, {});
+}
+
+XLA_TEST_F(ArrayElementwiseOpTest, ClampU32ScalarVector) {
+  ComputationBuilder builder(client_, TestName());
+  auto min_scalar = builder.ConstantR0<uint32>(0);
+  auto min_vector = builder.ConstantR1<uint32>({1, 0, 1, 2, 0});
+  auto arg_vector = builder.ConstantR1<uint32>({2, 10, 0, 1, 4});
+  auto max_scalar = builder.ConstantR0<uint32>(3);
+  auto max_vector = builder.ConstantR1<uint32>({3, 1, 25, 5, 123});
+  // Perform clamp with broadcasted scalar and vector.
+  auto clamp = builder.Add(
+      builder.Add(builder.Clamp(min_vector, arg_vector, max_scalar),
+                  builder.Clamp(min_scalar, arg_vector, max_vector)),
+      builder.Add(builder.Clamp(min_vector, arg_vector, max_vector),
+                  builder.Clamp(min_scalar, arg_vector, max_scalar)));
+
+  ComputeAndCompareR1<uint32>(&builder, {8, 8, 2, 6, 14}, {});
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, AddTwoParametersF32s) {
   ComputationBuilder builder(client_, TestName());
 
@@ -1971,6 +2024,18 @@ XLA_TEST_F(ArrayElementwiseOpTest, SinF32s) {
                              error_spec_);
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, Atan2F32s) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<float>({0.0f, 5.0f, 0.0f, -3.0f, 2.0f, -8.0f});
+  auto b = builder.ConstantR1<float>({6.0f, 0.0f, -4.0f, 0.0f, 2.0f, 8.0f});
+  auto atan = builder.Atan2(a, b);
+
+  ComputeAndCompareR1<float>(
+      &builder,
+      {0.0f, 1.57079633f, 3.14159265f, -1.57079633f, 0.78539816f, -0.78539816f},
+      {}, error_spec_);
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, TanhF32s) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<float>({-2.5f, 3.14f, 2.25f});
@@ -1983,47 +2048,117 @@ XLA_TEST_F(ArrayElementwiseOpTest, TanhF32s) {
 XLA_TEST_F(ArrayElementwiseOpTest, TanhF32sVector) {
   // This is like the test ArrayElementwiseOpTest.TanhF32s above, except that
   // the input tensor is large enough to exercise the vectorized tanh
-  // implementation.
-  ComputationBuilder builder(client_, TestName());
-  auto input_literal = Literal::CreateR2<float>(
-      {{1.02, -0.32, 0.85, 0.90, 1.23, -0.91, -0.49, 0.80},
-       {-0.67, 0.16, -0.07, 0.39, -0.41, 0.04, 1.36, 1.25},
-       {0.41, 0.65, -1.08, 0.32, -1.45, -0.77, -1.09, 0.91},
-       {-1.03, -0.30, -1.11, -1.17, 1.50, -0.85, 0.04, 1.02},
-       {0.34, -0.61, 0.41, 0.07, -0.02, 1.42, -0.62, 0.81},
-       {0.08, 0.81, -0.30, 1.17, -0.65, -0.44, 0.92, 1.26},
-       {-1.29, 1.35, 0.08, -1.24, -0.92, 0.49, 1.17, -0.45},
-       {-1.31, -1.44, -0.13, -1.31, -0.79, 1.41, 1.21, 1.05}});
-  auto input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+  // implementation on XLA CPU.
+  ComputationBuilder builder(client_, TestName());
+  auto input_literal = Literal::CreateR1<float>(
+      {1.02,  -0.32, 0.85,  0.90,  1.23,  -0.91, -0.49, 0.80,  -0.67, 0.16,
+       -0.07, 0.39,  -0.41, 0.04,  1.36,  1.25,  0.41,  0.65,  -1.08, 0.32,
+       -1.45, -0.77, -1.09, 0.91,  -1.03, -0.30, -1.11, -1.17, 1.50,  -0.85,
+       0.04,  1.02,  0.34,  -0.61, 0.41,  0.07,  -0.02, 1.42,  -0.62, 0.81,
+       0.08,  0.81,  -0.30, 1.17,  -0.65, -0.44, 0.92,  1.26,  -1.29, 1.35,
+       0.08,  -1.24, -0.92, 0.49,  1.17,  -0.45, -1.31, -1.44, -0.13, -1.31,
+       -0.79, 1.41,  1.21,  1.05});
+  TF_ASSERT_OK_AND_ASSIGN(auto input_data,
+                          client_->TransferToServer(*input_literal));
 
   auto input = builder.Parameter(0, input_literal->shape(), "input");
   builder.Tanh(input);
 
-  ComputeAndCompareR2<float>(
+  ComputeAndCompareR1<float>(
       &builder,
-      {{0.77009583, -0.30665702, 0.69070244, 0.71401149, 0.84400684,
-        -0.71985596, -0.45764771, 0.66664988},
-       {-0.58278900, 0.16050975, -0.06770509, 0.36843640, -0.38476998,
-        0.04018109, 0.87562293, 0.84788644},
-       {0.38603750, 0.57294142, -0.79140943, 0.31032649, -0.89590985,
-        -0.64770776, -0.79625875, 0.72234446},
-       {-0.77389336, -0.28871772, -0.80428445, -0.82541436, 0.90456349,
-        -0.68856895, 0.03877772, 0.76877952},
-       {0.32561871, -0.54546672, 0.39072621, 0.07273290, -0.01924866,
-        0.88924897, -0.55283129, 0.67183107},
-       {0.08006320, 0.66944766, -0.29068485, 0.82573754, -0.57170743,
-        -0.41581789, 0.72739530, 0.85025692},
-       {-0.85931867, 0.87357593, 0.07782833, -0.84597743, -0.72748238,
-        0.45396307, 0.82449573, -0.42462519},
-       {-0.86363792, -0.89368379, -0.12621804, -0.86445558, -0.65565848,
-        0.88789743, 0.83566397, 0.78287679}},
+      {0.77009583,  -0.30665702, 0.69070244,  0.71401149,  0.84400684,
+       -0.71985596, -0.45764771, 0.66664988,  -0.58278900, 0.16050975,
+       -0.06770509, 0.36843640,  -0.38476998, 0.04018109,  0.87562293,
+       0.84788644,  0.38603750,  0.57294142,  -0.79140943, 0.31032649,
+       -0.89590985, -0.64770776, -0.79625875, 0.72234446,  -0.77389336,
+       -0.28871772, -0.80428445, -0.82541436, 0.90456349,  -0.68856895,
+       0.03877772,  0.76877952,  0.32561871,  -0.54546672, 0.39072621,
+       0.07273290,  -0.01924866, 0.88924897,  -0.55283129, 0.67183107,
+       0.08006320,  0.66944766,  -0.29068485, 0.82573754,  -0.57170743,
+       -0.41581789, 0.72739530,  0.85025692,  -0.85931867, 0.87357593,
+       0.07782833,  -0.84597743, -0.72748238, 0.45396307,  0.82449573,
+       -0.42462519, -0.86363792, -0.89368379, -0.12621804, -0.86445558,
+       -0.65565848, 0.88789743,  0.83566397,  0.78287679},
       {input_data.get()},
       // The error spec is unusually high here to account for the fact that we
       // use a rational interpolant to approximate tanh.
       ErrorSpec(0.004, 0.004));
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, ExpF32sVector) {
+  // The input tensor is large enough to exercise the vectorized exp
+  // implementation on XLA CPU.
+  ComputationBuilder builder(client_, TestName());
+
+  // Just to help make sense of the scales here -- exp(89) saturates float32 and
+  // exp(-10) is smaller than our error spec.
+  std::unique_ptr<Literal> input_literal = Literal::CreateR1<float>(
+      {1.02,   -0.32,  0.85,   0.9,    1.23,   -0.91,  -0.49, 0.8,    -1.31,
+       -1.44,  -0.13,  -1.31,  -0.79,  1.41,   1.21,   1.05,  -195.6, -194.5,
+       -193.4, -192.3, -191.2, -190.1, -189.0, -187.9, -19.6, -18.5,  -17.4,
+       -16.3,  -15.2,  -14.1,  -13.0,  -11.9,  -10.8,  -9.7,  -8.6,   -7.5,
+       -6.4,   -5.3,   -4.2,   -3.1,   -2.0,   -0.9,   0.2,   1.3,    2.4,
+       3.5,    4.6,    5.7,    6.8,    7.9,    9.0,    10.1,  11.2,   12.3,
+       13.4,   14.5,   15.6,   16.7,   17.8,   18.9,   20.0,  21.1,   22.2,
+       23.3,   24.4,   25.5,   26.6,   27.7,   28.8,   29.9,  31.0,   32.1,
+       68.4,   69.5,   70.6,   71.7,   72.8,   73.9,   75.0,  76.1,   77.2,
+       78.3,   79.4,   80.5,   81.6,   82.7,   83.8,   84.9,  85.2,   86.3,
+       86.4,   86.5,   87.6,   87.7,   87.8,   87.9});
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
+                          client_->TransferToServer(*input_literal));
+
+  auto input = builder.Parameter(0, input_literal->shape(), "input");
+  builder.Exp(input);
+
+  std::vector<float> expected_result;
+  int64 input_size = input_literal->shape().dimensions(0);
+  expected_result.reserve(input_size);
+  for (int64 i = 0; i < input_size; i++) {
+    expected_result.push_back(std::exp(input_literal->Get<float>({i})));
+  }
+
+  ComputeAndCompareR1<float>(&builder, expected_result, {input_data.get()},
+                             error_spec_);
+}
+
+XLA_TEST_F(ArrayElementwiseOpTest, LogF32sVector) {
+  // The input tensor is large enough to exercise the vectorized exp
+  // implementation on XLA CPU.
+  ComputationBuilder builder(client_, TestName());
+
+  std::unique_ptr<Literal> input_literal = Literal::CreateR1<float>(
+      {-1.29,    -1.41,    -1.25,    -13.5,    -11.7,    -17.9,    -198,
+       -167,     1.29,     1.41,     1.25,     13.5,     11.7,     17.9,
+       198,      167,      1.27e+03, 1.33e+03, 1.74e+03, 1.6e+04,  1.84e+04,
+       1.74e+04, 1.89e+05, 1.9e+05,  1.93e+06, 1.98e+06, 1.65e+06, 1.97e+07,
+       1.66e+07, 1e+07,    1.98e+08, 1.96e+08, 1.64e+09, 1.58e+09, 1.64e+09,
+       1.44e+10, 1.5e+10,  1.99e+10, 1.17e+11, 1.08e+11, 1.08e+12, 1.38e+12,
+       1.4e+12,  1.03e+13, 1.6e+13,  1.99e+13, 1.26e+14, 1.51e+14, 1.33e+15,
+       1.41e+15, 1.63e+15, 1.39e+16, 1.21e+16, 1.27e+16, 1.28e+17, 1.62e+17,
+       2e+18,    1.96e+18, 1.81e+18, 1.99e+19, 1.86e+19, 1.61e+19, 1.71e+20,
+       1.47e+20, 1.83e+21, 1.33e+21, 1.3e+21,  1.35e+22, 1.84e+22, 1.02e+22,
+       1.81e+23, 1.02e+23, 1.89e+24, 1.49e+24, 1.08e+24, 1.95e+25, 1.1e+25,
+       1.62e+25, 1.2e+26,  1.41e+26, 1.93e+27, 1.66e+27, 1.62e+27, 1.05e+28,
+       1.5e+28,  1.79e+28, 1.36e+29, 1.95e+29, 1.5e+30,  1.81e+30, 1.34e+30,
+       1.7e+31,  1.44e+31, 1.1e+31,  1.4e+32,  1.67e+32, 1.96e+33, 1.11e+33,
+       1.19e+33, 1.61e+34, 1.05e+34, 1.88e+34, 1.67e+35, 1.7e+35});
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
+                          client_->TransferToServer(*input_literal));
+
+  auto input = builder.Parameter(0, input_literal->shape(), "input");
+  builder.Log(input);
+
+  std::vector<float> expected_result;
+  int64 input_size = input_literal->shape().dimensions(0);
+  expected_result.reserve(input_size);
+  for (int64 i = 0; i < input_size; i++) {
+    expected_result.push_back(std::log(input_literal->Get<float>({i})));
+  }
+
+  ComputeAndCompareR1<float>(&builder, expected_result, {input_data.get()},
+                             error_spec_);
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, AddChainFoldLeft) {
   // a ------ (add) --------- (add)
   //         /               /
@@ -2520,9 +2655,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, R4_16x16x2x2_Plus_R1_16) {
   std::iota(r1.begin(), r1.end(), 1.0);
 
   ComputationBuilder builder(client_, TestName());
-  std::unique_ptr<Literal> a_literal = Literal::CreateR4FromArray4D(r4);
-  *a_literal->mutable_shape()->mutable_layout() =
-      LayoutUtil::MakeLayout({0, 1, 2, 3});
+  std::unique_ptr<Literal> a_literal = Literal::CreateR4FromArray4DWithLayout(
+      r4, LayoutUtil::MakeLayout({0, 1, 2, 3}));
   auto a = builder.ConstantLiteral(*a_literal);
   auto b = builder.ConstantR1<float>(r1);
   builder.Add(a, b, {1});
diff --git a/tensorflow/compiler/xla/tests/axpy_simple_test.cc b/tensorflow/compiler/xla/tests/axpy_simple_test.cc
index 627a9c3e7d9f6eb8d360228362ea5adf12c6c798..3f6fd7c65d3360a622dbf754833009fb20410535 100644
--- a/tensorflow/compiler/xla/tests/axpy_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/axpy_simple_test.cc
@@ -62,6 +62,10 @@ TEST_F(AxpySimpleTest, AxpyTenValues) {
   auto ax = builder.Mul(alpha, x);
   auto axpy = builder.Add(ax, y);
 
+  TF_ASSERT_OK_AND_ASSIGN(ProgramShape shape, builder.GetProgramShape());
+
+  EXPECT_EQ("() -> f32[10]", ShapeUtil::HumanString(shape));
+
   std::vector<float> expected = {
       1.85840735, -1.85840735, 2.28318531,   -2.28318531,  -6.42477796,
       6.42477796, 10.56637061, -10.56637061, -14.70796327, 14.70796327};
diff --git a/tensorflow/compiler/xla/tests/batch_normalization_test.cc b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
index 028d1251b455b82a291c236f7866e52e27d3590e..28ab9654997728fbafd6610af840e721e72cce5a 100644
--- a/tensorflow/compiler/xla/tests/batch_normalization_test.cc
+++ b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
@@ -39,6 +39,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/math/math_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -46,9 +48,13 @@ limitations under the License.
 namespace xla {
 namespace {
 
-class BatchNormalizationTest : public ClientLibraryTestBase {
+class BatchNormalizationTest
+    : public ClientLibraryTestBase,
+      public ::testing::WithParamInterface<bool /*use_cudnn_batchnorm*/> {
  protected:
   BatchNormalizationTest() : input_array_(kSamples, kZ, kY, kX) {
+    mutable_debug_options()->set_xla_gpu_use_cudnn_batchnorm(GetParam());
+
     Array2D<float> pz({
         // z0 z1
         {-1.0f, 4.1f},  // p0
@@ -56,7 +62,7 @@ class BatchNormalizationTest : public ClientLibraryTestBase {
         {5.0f, 4.4f},   // p2
     });
     input_array_.FillWithPZ(pz);
-    input_literal_ = *Literal::CreateR4FromArray4D(input_array_);
+    input_literal_ = std::move(*Literal::CreateR4FromArray4D(input_array_));
     CHECK_EQ(kSamples, input_array_.planes());
     CHECK_EQ(kZ, input_array_.depth());
     CHECK_EQ(kY, input_array_.height());
@@ -73,7 +79,18 @@ class BatchNormalizationTest : public ClientLibraryTestBase {
   const ErrorSpec error_spec_{0.001, 0.001};
 };
 
-TEST_F(BatchNormalizationTest, SubtractInZ) {
+// If testing the GPU backend, run the tests twice, with and without cudnn
+// batchnorm.  Otherwise, just run the tests once -- the value of this flag
+// doesn't matter.
+#ifdef XLA_TEST_BACKEND_GPU
+INSTANTIATE_TEST_CASE_P(BatchNormalizationTestInstance, BatchNormalizationTest,
+                        ::testing::Bool());
+#else
+INSTANTIATE_TEST_CASE_P(BatchNormalizationTestInstance, BatchNormalizationTest,
+                        ::testing::Values(false));
+#endif
+
+XLA_TEST_P(BatchNormalizationTest, SubtractInZ) {
   ComputationBuilder builder(client_, "subtract_in_z_one_sample");
   auto x = builder.ConstantLiteral(input_literal_);
   auto y = builder.ConstantR1<float>({3.14, 4.25});
@@ -89,22 +106,24 @@ TEST_F(BatchNormalizationTest, SubtractInZ) {
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
 }
 
-TEST_F(BatchNormalizationTest, SquareTesseractElementwise) {
+XLA_TEST_P(BatchNormalizationTest, SquareTesseractElementwise) {
   ComputationBuilder builder(client_, "square_tesseract_elementwise");
   auto x = builder.ConstantLiteral(input_literal_);
   builder.SquareF32(x);
 
+  using tensorflow::MathUtil;
+
   Array4D<float> expected(kSamples, kZ, kY, kX);
   Array2D<float> expected_pz({
-      {std::pow(-1.0f, 2.0f), std::pow(4.1f, 2.0f)},
-      {std::pow(2.0f, 2.0f), std::pow(4.1f, 2.0f)},
-      {std::pow(5.0f, 2.0f), std::pow(4.4f, 2.0f)},
+      {MathUtil::IPow(-1.0f, 2), MathUtil::IPow(4.1f, 2)},
+      {MathUtil::IPow(2.0f, 2), MathUtil::IPow(4.1f, 2)},
+      {MathUtil::IPow(5.0f, 2), MathUtil::IPow(4.4f, 2)},
   });
   expected.FillWithPZ(expected_pz);
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
 }
 
-TEST_F(BatchNormalizationTest, SumToZ) {
+XLA_TEST_P(BatchNormalizationTest, SumToZ) {
   ComputationBuilder builder(client_, "sum_to_z");
   auto input_activations = builder.ConstantLiteral(input_literal_);
   Computation add = CreateScalarAddComputation(F32, &builder);
@@ -116,7 +135,7 @@ TEST_F(BatchNormalizationTest, SumToZ) {
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
 }
 
-TEST_F(BatchNormalizationTest, SquareAndReduce) {
+XLA_TEST_P(BatchNormalizationTest, SquareAndReduce) {
   ComputationBuilder builder(client_, "square_and_reduce");
   auto input_activations = builder.ConstantLiteral(input_literal_);
   auto set_means = builder.ConstantR1<float>({2.f, 4.2f});
@@ -131,7 +150,7 @@ TEST_F(BatchNormalizationTest, SquareAndReduce) {
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
 }
 
-TEST_F(BatchNormalizationTest, VarianceToStddev) {
+XLA_TEST_P(BatchNormalizationTest, VarianceToStddev) {
   ComputationBuilder builder(client_, "variance_to_stddev");
   auto variance = builder.ConstantR1<float>({6.f, .02f});
   auto sqrt = builder.SqrtF32(variance);
@@ -142,7 +161,7 @@ TEST_F(BatchNormalizationTest, VarianceToStddev) {
 
 // Compare against a forward batch normalization example in the NN spec
 // reference.
-TEST_F(BatchNormalizationTest, SpecComparisonForward) {
+XLA_TEST_P(BatchNormalizationTest, SpecComparisonForward) {
   ComputationBuilder builder(client_, "batch_normalize_per_spec");
   auto input_activations =
       builder.CheckShape(builder.ConstantLiteral(input_literal_),
@@ -198,19 +217,227 @@ TEST_F(BatchNormalizationTest, SpecComparisonForward) {
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
 }
 
+XLA_TEST_P(BatchNormalizationTest, BasicTraining) {
+  const int kFeatureIndex = 3;
+  ComputationBuilder builder(client_, TestName());
+
+  auto operand = builder.ConstantR4FromArray4D<float>(
+      {{{{1.f, 2.f}}, {{3.f, 4.f}}}, {{{5.f, 6.f}}, {{7.f, 8.f}}}});
+
+  auto scale = builder.ConstantR1<float>({2.0f, 3.0f});
+
+  auto offset = builder.ConstantR1<float>({1.0f, 2.0f});
+
+  auto tuple = builder.BatchNormTraining(operand, scale, offset,
+                                         /*epsilon=*/0.001, kFeatureIndex);
+
+  auto expected = Literal::MakeTuple(
+      {Literal::CreateR4<float>({{{{-1.6f, -2.0f}}, {{0.1f, 0.6f}}},
+                                 {{{1.9f, 3.3f}}, {{3.7f, 6.0f}}}})
+           .get(),
+       Literal::CreateR1<float>({4, 5}).get(),
+       Literal::CreateR1<float>({5, 5}).get()});
+
+  ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.1));
+}
+
+XLA_TEST_P(BatchNormalizationTest, BasicTrainingOnSublane) {
+  const int kFeatureIndex = 2;
+  ComputationBuilder builder(client_, TestName());
+
+  auto operand = builder.ConstantR4FromArray4D<float>(
+      {{{{1.f}, {2.f}}, {{3.f}, {4.f}}}, {{{5.f}, {6.f}}, {{7.f}, {8.f}}}});
+
+  auto scale = builder.ConstantR1<float>({2.0f, 3.0f});
+
+  auto offset = builder.ConstantR1<float>({1.0f, 2.0f});
+
+  auto tuple = builder.BatchNormTraining(operand, scale, offset,
+                                         /*epsilon=*/0.001, kFeatureIndex);
+
+  auto expected = Literal::MakeTuple(
+      {Literal::CreateR4<float>({{{{-1.6f}, {-2.0f}}, {{0.1f}, {0.6f}}},
+                                 {{{1.9f}, {3.3f}}, {{3.7f}, {6.0f}}}})
+           .get(),
+       Literal::CreateR1<float>({4, 5}).get(),
+       Literal::CreateR1<float>({5, 5}).get()});
+
+  ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.1));
+}
+
+XLA_TEST_P(BatchNormalizationTest, TrainingWithFeatureOnLowDimension) {
+  // Use 0 dimension as feature, tests layout analyzer.
+  const int kFeatureIndex = 0;
+  ComputationBuilder builder(client_, TestName());
+
+  ComputationDataHandle h0;
+  auto operand = CreateR3Parameter<float>(Array3D<float>(260, 2, 2, 1.0f),
+                                          /*parameter_number=*/0, "operand",
+                                          &builder, &h0);
+  ComputationDataHandle h1;
+  auto scale =
+      CreateR1Parameter<float>(std::vector<float>(260, 1.0f),
+                               /*parameter_number=*/1, "scale", &builder, &h1);
+  ComputationDataHandle h2;
+  auto offset =
+      CreateR1Parameter<float>(std::vector<float>(260, 1.0f),
+                               /*parameter_number=*/2, "offset", &builder, &h2);
+
+  auto tuple = builder.BatchNormTraining(h0, h1, h2,
+                                         /*epsilon=*/1, kFeatureIndex);
+
+  auto expected = Literal::MakeTuple(
+      {Literal::CreateR3FromArray3D<float>(Array3D<float>(260, 2, 2, 1.0f))
+           .get(),
+       Literal::CreateR1<float>(std::vector<float>(260, 1.0f)).get(),
+       Literal::CreateR1<float>(std::vector<float>(260, 0.0f)).get()});
+
+  ComputeAndCompareTuple(&builder, *expected,
+                         {operand.get(), scale.get(), offset.get()},
+                         ErrorSpec(0.1));
+}
+
+XLA_TEST_P(BatchNormalizationTest, LargeEpsilonTest) {
+  // Test the correctness of choosing a large epsilon value.
+  const int kFeatureIndex = 2;
+  ComputationBuilder builder(client_, TestName());
+
+  ComputationDataHandle h0;
+  auto operand = CreateR3Parameter<float>({{{0.0f}, {10.0f}, {20.0f}, {30.0f}}},
+                                          /*parameter_number=*/0, "operand",
+                                          &builder, &h0);
+  ComputationDataHandle h1;
+  auto scale =
+      CreateR1Parameter<float>(std::vector<float>(1, 1.0f),
+                               /*parameter_number=*/1, "scale", &builder, &h1);
+  ComputationDataHandle h2;
+  auto offset =
+      CreateR1Parameter<float>(std::vector<float>(1, 0.0f),
+                               /*parameter_number=*/2, "offset", &builder, &h2);
+
+  // var = 125, mean = 15, epsilon = -100
+  auto tuple = builder.BatchNormTraining(h0, h1, h2,
+                                         /*epsilon=*/-100, kFeatureIndex);
+
+  auto expected = Literal::MakeTuple(
+      {Literal::CreateR3FromArray3D<float>({{{-3.0f}, {-1.0f}, {1.0f}, {3.0f}}})
+           .get(),
+       Literal::CreateR1<float>(std::vector<float>(1, 15.0f)).get(),
+       Literal::CreateR1<float>(std::vector<float>(1, 125.0f)).get()});
+
+  ComputeAndCompareTuple(&builder, *expected,
+                         {operand.get(), scale.get(), offset.get()},
+                         ErrorSpec(0.1));
+}
+
+XLA_TEST_P(BatchNormalizationTest, BatchNormGradBasic) {
+  const int kFeatureIndex = 2;
+  ComputationBuilder builder(client_, TestName());
+
+  auto operand =
+      builder.ConstantR4FromArray4D<float>(Array4D<float>(2, 2, 2, 1, 0.0f));
+
+  auto scale = builder.ConstantR1<float>({1.0f, 1.0f});
+
+  auto mean = builder.ConstantR1<float>({0.0f, 0.0f});
+
+  auto var = builder.ConstantR1<float>({1.0f, 1.0f});
+
+  auto grad_output = builder.ConstantR4FromArray4D<float>(
+      {{{{1.f}, {2.f}}, {{3.f}, {4.f}}}, {{{5.f}, {6.f}}, {{7.f}, {8.f}}}});
+
+  builder.BatchNormGrad(operand, scale, mean, var, grad_output,
+                        /*epsilon=*/0.0, kFeatureIndex);
+
+  auto expected = Literal::MakeTuple(
+      {Literal::CreateR4<float>({{{{-3.f}, {-3.f}}, {{-1.f}, {-1.f}}},
+                                 {{{1.f}, {1.f}}, {{3.f}, {3.f}}}})
+           .get(),
+       Literal::CreateR1<float>({0, 0}).get(),
+       Literal::CreateR1<float>({16, 20}).get()});
+
+  ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.1));
+}
+
 struct BatchNormTestParam {
   std::vector<int64> bounds;
   int64 feature_index;
   float random_value_mean;
   float random_value_var;
+  bool use_cudnn_batchnorm;
+
+  friend ::std::ostream& operator<<(::std::ostream& os,
+                                    const BatchNormTestParam& p) {
+    os << "bounds={" << tensorflow::str_util::Join(p.bounds, ", ") << "}, ";
+    os << "feature_index=" << p.feature_index << ", ";
+    os << "random_value_mean=" << p.random_value_mean << ", ";
+    os << "random_value_var=" << p.random_value_var;
+
+    // Don't print use_cudnn_batchnorm when it's false, because most backends
+    // never set it to true.
+    if (p.use_cudnn_batchnorm) {
+      os << ", use_cudnn_batchnorm=true";
+    }
+    return os;
+  }
 };
 
 // Tests to test the fused operation of BatchNorm.
-class BatchNormTest : public ClientLibraryTestBase,
-                      public ::testing::WithParamInterface<BatchNormTestParam> {
+class BatchNormTestManySizes
+    : public ClientLibraryTestBase,
+      public ::testing::WithParamInterface<BatchNormTestParam> {
+ public:
+  BatchNormTestManySizes() {
+    mutable_debug_options()->set_xla_gpu_use_cudnn_batchnorm(
+        GetParam().use_cudnn_batchnorm);
+  }
 };
 
-XLA_TEST_P(BatchNormTest, RandomizedTests) {
+std::vector<BatchNormTestParam> BuildBatchNormTestParams() {
+  std::vector<BatchNormTestParam> params;
+
+  auto add_testcase = [&](std::vector<int64> bounds, int64 feature_index,
+                          float random_value_mean, float random_value_var) {
+    BatchNormTestParam p{bounds, feature_index, random_value_mean,
+                         random_value_var, /*use_cudnn_batchnorm=*/false};
+    params.push_back(p);
+
+    // If testing the GPU backend, also run with cudnn batchnorm enabled.
+#ifdef XLA_TEST_BACKEND_GPU
+    p.use_cudnn_batchnorm = true;
+    params.push_back(p);
+#endif
+  };
+
+  add_testcase({2, 2, 2, 2}, 0, 100.2f, 200.0f);
+  add_testcase({2, 2, 2, 2}, 3, 300.f, 400.0f);
+
+  add_testcase({1, 10, 1, 1}, 0, 10.1f, 20.1f);
+  add_testcase({10, 10, 10, 10}, 1, 3.14f, 314.15f);
+  add_testcase({10, 10, 10, 10}, 2, 666.6f, 777.7f);
+  add_testcase({10, 10, 10, 10}, 1, -666.6f, 777.7f);
+  add_testcase({10, 10, 10, 10}, 2, 0.f, 777.7f);
+  add_testcase({1, 1, 10, 130}, 2, 0.f, 777.7f);
+  add_testcase({1, 1, 130, 11}, 2, 0.f, 777.7f);
+  add_testcase({1, 1, 10, 1}, 3, 888.8f, 9.9f);
+
+  add_testcase({24, 129, 1, 2}, 2, 10000, 10000);
+  add_testcase({24, 129, 1, 2}, 3, 10000, 10000);
+
+  // Feature on low dimension to trigger relayout, check that internal logical
+  // to physical dimension calculation is correct after relayout.
+  add_testcase({1, 2, 3, 4}, 0, 100, 100);
+
+  // Zero-sized tensor.
+  add_testcase({1, 0, 100, 42}, 0, 100, 100);
+
+  return params;
+}
+
+INSTANTIATE_TEST_CASE_P(BatchNormTest_Instantiation, BatchNormTestManySizes,
+                        ::testing::ValuesIn(BuildBatchNormTestParams()));
+
+XLA_TEST_P(BatchNormTestManySizes, RandomizedTrainingTests) {
   float epsilon = 0.001;
   ComputationBuilder builder(client_, TestName());
   const std::vector<int64>& bounds = GetParam().bounds;
@@ -286,9 +513,9 @@ XLA_TEST_P(BatchNormTest, RandomizedTests) {
   auto offset_activations =
       builder.Parameter(2, offset_literal->shape(), "scale");
 
-  auto expected = *Literal::MakeTuple({expected_normalized.get(),
-                                       Literal::CreateR1<float>(mean).get(),
-                                       Literal::CreateR1<float>(var).get()});
+  auto expected = Literal::MakeTuple({expected_normalized.get(),
+                                      Literal::CreateR1<float>(mean).get(),
+                                      Literal::CreateR1<float>(var).get()});
 
   std::unique_ptr<GlobalData> input_data =
       client_->TransferToServer(*input_literal).ConsumeValueOrDie();
@@ -300,13 +527,17 @@ XLA_TEST_P(BatchNormTest, RandomizedTests) {
   builder.BatchNormTraining(input_activations, scale_activations,
                             offset_activations, epsilon, feature_index);
 
+  // Run all HLO passes during this test.  In particular, ClientLibraryTestBase
+  // disables constant folding, but we want it enabled for our zero-sized tensor
+  // testcase.
+  execution_options_.mutable_debug_options()->clear_xla_disable_hlo_passes();
   ComputeAndCompareTuple(
-      &builder, expected,
+      &builder, *expected,
       {input_data.get(), scale_data.get(), offset_data.get()},
       ErrorSpec(0.01, 1));
 }
 
-XLA_TEST_P(BatchNormTest, RandomizedInferencingTests) {
+XLA_TEST_P(BatchNormTestManySizes, RandomizedInferencingTests) {
   float epsilon = 0.001;
   ComputationBuilder builder(client_, TestName());
   const std::vector<int64>& bounds = GetParam().bounds;
@@ -402,6 +633,11 @@ XLA_TEST_P(BatchNormTest, RandomizedInferencingTests) {
                              offset_activations, mean_activations,
                              variance_activations, epsilon, feature_index);
 
+  // Run all HLO passes during this test.  In particular, ClientLibraryTestBase
+  // disables constant folding, but we want it enabled for our zero-sized tensor
+  // testcase.
+  execution_options_.mutable_debug_options()->clear_xla_disable_hlo_passes();
+
   ComputeAndCompareR4<float>(
       &builder, expected,
       {input_data.get(), scale_data.get(), offset_data.get(), mean_data.get(),
@@ -409,7 +645,7 @@ XLA_TEST_P(BatchNormTest, RandomizedInferencingTests) {
       ErrorSpec(0.01, 1));
 }
 
-XLA_TEST_P(BatchNormTest, RandomizedGradTests) {
+XLA_TEST_P(BatchNormTestManySizes, RandomizedGradTests) {
   float epsilon = 0.001;
   ComputationBuilder builder(client_, TestName());
   const std::vector<int64>& bounds = GetParam().bounds;
@@ -447,7 +683,11 @@ XLA_TEST_P(BatchNormTest, RandomizedGradTests) {
   std::vector<float> mean(feature_bound);
 
   for (int64 i = 0; i < feature_bound; ++i) {
-    mean[i] = sum[i] / num_elements_per_feature;
+    if (num_elements_per_feature > 0) {
+      mean[i] = sum[i] / num_elements_per_feature;
+    } else {
+      mean[i] = 0;
+    }
   }
 
   std::vector<float> mean_square(feature_bound);
@@ -457,7 +697,11 @@ XLA_TEST_P(BatchNormTest, RandomizedGradTests) {
 
   std::vector<float> square_mean(feature_bound);
   for (int64 i = 0; i < feature_bound; ++i) {
-    square_mean[i] = sum_squared[i] / num_elements_per_feature;
+    if (num_elements_per_feature > 0) {
+      square_mean[i] = sum_squared[i] / num_elements_per_feature;
+    } else {
+      square_mean[i] = 0;
+    }
   }
 
   std::vector<float> var(feature_bound);
@@ -535,8 +779,12 @@ XLA_TEST_P(BatchNormTest, RandomizedGradTests) {
       grad_activation, scale4D, [](float a, float b) { return a * b; });
 
   grad_activation = *ReferenceUtil::MapArray4D(
-      grad_activation, rsqrt_var_add_epsilon,
-      [=](float a, float b) { return a * b / num_elements_per_feature; });
+      grad_activation, rsqrt_var_add_epsilon, [=](float a, float b) {
+        if (num_elements_per_feature > 0) {
+          return a * b / num_elements_per_feature;
+        }
+        return 0.f;
+      });
 
   auto expected_grad_activation =
       Literal::CreateR4FromArray4D<float>(grad_activation);
@@ -571,179 +819,20 @@ XLA_TEST_P(BatchNormTest, RandomizedGradTests) {
                                  grad_output_parameter, epsilon, feature_index);
 
   auto expected =
-      *Literal::MakeTuple({expected_grad_activation.get(),
-                           Literal::CreateR1<float>(grad_scale).get(),
-                           Literal::CreateR1<float>(grad_offset).get()});
+      Literal::MakeTuple({expected_grad_activation.get(),
+                          Literal::CreateR1<float>(grad_scale).get(),
+                          Literal::CreateR1<float>(grad_offset).get()});
+
+  // Run all HLO passes during this test.  In particular, ClientLibraryTestBase
+  // disables constant folding, but we want it enabled for our zero-sized tensor
+  // testcase.
+  execution_options_.mutable_debug_options()->clear_xla_disable_hlo_passes();
 
-  ComputeAndCompareTuple(&builder, expected,
+  ComputeAndCompareTuple(&builder, *expected,
                          {input_data.get(), scale_data.get(), mean_data.get(),
                           var_data.get(), grad_output_data.get()},
                          ErrorSpec(0.01, 1));
 }
 
-INSTANTIATE_TEST_CASE_P(
-    BatchNormTest_Instantiation, BatchNormTest,
-    ::testing::Values(BatchNormTestParam{{2, 2, 2, 2}, 0, 100.2f, 200.0f},
-                      BatchNormTestParam{{2, 2, 2, 2}, 3, 300.f, 400.0f},
-
-                      BatchNormTestParam{{1, 10, 1, 1}, 0, 10.1f, 20.1f},
-                      BatchNormTestParam{{10, 10, 10, 10}, 1, 3.14f, 314.15f},
-                      BatchNormTestParam{{10, 10, 10, 10}, 2, 666.6f, 777.7f},
-                      BatchNormTestParam{{10, 10, 10, 10}, 1, -666.6f, 777.7f},
-                      BatchNormTestParam{{10, 10, 10, 10}, 2, 0.f, 777.7f},
-                      BatchNormTestParam{{1, 1, 10, 130}, 2, 0.f, 777.7f},
-                      BatchNormTestParam{{1, 1, 130, 11}, 2, 0.f, 777.7f},
-                      BatchNormTestParam{{1, 1, 10, 1}, 3, 888.8f, 9.9f},
-
-                      BatchNormTestParam{{24, 129, 1, 2}, 2, 10000, 10000},
-                      BatchNormTestParam{{24, 129, 1, 2}, 3, 10000, 10000},
-
-                      // Feature on low dimension to trigger relayout, test
-                      // internal logical to physical dimension calculation
-                      // is correct after relayout.
-                      BatchNormTestParam{{1, 2, 3, 4}, 0, 100, 100}));
-
-XLA_TEST_F(BatchNormTest, BasicTraining) {
-  const int kFeatureIndex = 3;
-  ComputationBuilder builder(client_, TestName());
-
-  auto operand = builder.ConstantR4FromArray4D<float>(
-      {{{{1.f, 2.f}}, {{3.f, 4.f}}}, {{{5.f, 6.f}}, {{7.f, 8.f}}}});
-
-  auto scale = builder.ConstantR1<float>({2.0f, 3.0f});
-
-  auto offset = builder.ConstantR1<float>({1.0f, 2.0f});
-
-  auto tuple = builder.BatchNormTraining(operand, scale, offset,
-                                         /*epsilon=*/0.001, kFeatureIndex);
-
-  auto expected = *Literal::MakeTuple(
-      {Literal::CreateR4<float>({{{{-1.6f, -2.0f}}, {{0.1f, 0.6f}}},
-                                 {{{1.9f, 3.3f}}, {{3.7f, 6.0f}}}})
-           .get(),
-       Literal::CreateR1<float>({4, 5}).get(),
-       Literal::CreateR1<float>({5, 5}).get()});
-
-  ComputeAndCompareTuple(&builder, expected, {}, ErrorSpec(0.1));
-}
-
-XLA_TEST_F(BatchNormTest, BasicTrainingOnSublane) {
-  const int kFeatureIndex = 2;
-  ComputationBuilder builder(client_, TestName());
-
-  auto operand = builder.ConstantR4FromArray4D<float>(
-      {{{{1.f}, {2.f}}, {{3.f}, {4.f}}}, {{{5.f}, {6.f}}, {{7.f}, {8.f}}}});
-
-  auto scale = builder.ConstantR1<float>({2.0f, 3.0f});
-
-  auto offset = builder.ConstantR1<float>({1.0f, 2.0f});
-
-  auto tuple = builder.BatchNormTraining(operand, scale, offset,
-                                         /*epsilon=*/0.001, kFeatureIndex);
-
-  auto expected = *Literal::MakeTuple(
-      {Literal::CreateR4<float>({{{{-1.6f}, {-2.0f}}, {{0.1f}, {0.6f}}},
-                                 {{{1.9f}, {3.3f}}, {{3.7f}, {6.0f}}}})
-           .get(),
-       Literal::CreateR1<float>({4, 5}).get(),
-       Literal::CreateR1<float>({5, 5}).get()});
-
-  ComputeAndCompareTuple(&builder, expected, {}, ErrorSpec(0.1));
-}
-
-XLA_TEST_F(BatchNormTest, DISABLED_ON_GPU(TrainingWithFeatureOnLowDimension)) {
-  // Use 0 dimension as feature, tests layout analyzer.
-  const int kFeatureIndex = 0;
-  ComputationBuilder builder(client_, TestName());
-
-  ComputationDataHandle h0;
-  auto operand = CreateR3Parameter<float>(Array3D<float>(260, 2, 2, 1.0f),
-                                          /*parameter_number=*/0, "operand",
-                                          &builder, &h0);
-  ComputationDataHandle h1;
-  auto scale =
-      CreateR1Parameter<float>(std::vector<float>(260, 1.0f),
-                               /*parameter_number=*/1, "scale", &builder, &h1);
-  ComputationDataHandle h2;
-  auto offset =
-      CreateR1Parameter<float>(std::vector<float>(260, 1.0f),
-                               /*parameter_number=*/2, "offset", &builder, &h2);
-
-  auto tuple = builder.BatchNormTraining(h0, h1, h2,
-                                         /*epsilon=*/1, kFeatureIndex);
-
-  auto expected = *Literal::MakeTuple(
-      {Literal::CreateR3FromArray3D<float>(Array3D<float>(260, 2, 2, 1.0f))
-           .get(),
-       Literal::CreateR1<float>(std::vector<float>(260, 1.0f)).get(),
-       Literal::CreateR1<float>(std::vector<float>(260, 0.0f)).get()});
-
-  ComputeAndCompareTuple(&builder, expected,
-                         {operand.get(), scale.get(), offset.get()},
-                         ErrorSpec(0.1));
-}
-
-XLA_TEST_F(BatchNormTest, LargeEpsilonTest) {
-  // Test the correctness of choosing a large epsilon value.
-  const int kFeatureIndex = 2;
-  ComputationBuilder builder(client_, TestName());
-
-  ComputationDataHandle h0;
-  auto operand = CreateR3Parameter<float>({{{0.0f}, {10.0f}, {20.0f}, {30.0f}}},
-                                          /*parameter_number=*/0, "operand",
-                                          &builder, &h0);
-  ComputationDataHandle h1;
-  auto scale =
-      CreateR1Parameter<float>(std::vector<float>(1, 1.0f),
-                               /*parameter_number=*/1, "scale", &builder, &h1);
-  ComputationDataHandle h2;
-  auto offset =
-      CreateR1Parameter<float>(std::vector<float>(1, 0.0f),
-                               /*parameter_number=*/2, "offset", &builder, &h2);
-
-  // var = 125, mean = 15, epsilon = -100
-  auto tuple = builder.BatchNormTraining(h0, h1, h2,
-                                         /*epsilon=*/-100, kFeatureIndex);
-
-  auto expected = *Literal::MakeTuple(
-      {Literal::CreateR3FromArray3D<float>({{{-3.0f}, {-1.0f}, {1.0f}, {3.0f}}})
-           .get(),
-       Literal::CreateR1<float>(std::vector<float>(1, 15.0f)).get(),
-       Literal::CreateR1<float>(std::vector<float>(1, 125.0f)).get()});
-
-  ComputeAndCompareTuple(&builder, expected,
-                         {operand.get(), scale.get(), offset.get()},
-                         ErrorSpec(0.1));
-}
-
-XLA_TEST_F(BatchNormTest, BatchNormGradBasic) {
-  const int kFeatureIndex = 2;
-  ComputationBuilder builder(client_, TestName());
-
-  auto operand =
-      builder.ConstantR4FromArray4D<float>(Array4D<float>(2, 2, 2, 1, 0.0f));
-
-  auto scale = builder.ConstantR1<float>({1.0f, 1.0f});
-
-  auto mean = builder.ConstantR1<float>({0.0f, 0.0f});
-
-  auto var = builder.ConstantR1<float>({1.0f, 1.0f});
-
-  auto grad_output = builder.ConstantR4FromArray4D<float>(
-      {{{{1.f}, {2.f}}, {{3.f}, {4.f}}}, {{{5.f}, {6.f}}, {{7.f}, {8.f}}}});
-
-  builder.BatchNormGrad(operand, scale, mean, var, grad_output,
-                        /*epsilon=*/0.0, kFeatureIndex);
-
-  auto expected = *Literal::MakeTuple(
-      {Literal::CreateR4<float>({{{{-3.f}, {-3.f}}, {{-1.f}, {-1.f}}},
-                                 {{{1.f}, {1.f}}, {{3.f}, {3.f}}}})
-           .get(),
-       Literal::CreateR1<float>({0, 0}).get(),
-       Literal::CreateR1<float>({16, 20}).get()});
-
-  ComputeAndCompareTuple(&builder, expected, {}, ErrorSpec(0.1));
-}
-
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/bfloat16_test.cc b/tensorflow/compiler/xla/tests/bfloat16_test.cc
index a1c53ef2aa95c7d2a9d46483dfda22a05ff0cf1a..b853dfaa15d7ff2e21048a5a6a486d22c5a05416 100644
--- a/tensorflow/compiler/xla/tests/bfloat16_test.cc
+++ b/tensorflow/compiler/xla/tests/bfloat16_test.cc
@@ -61,6 +61,15 @@ XLA_TEST_F(Bfloat16Test, ScalarOperation) {
                                 error_spec_);
 }
 
+XLA_TEST_F(Bfloat16Test, LogOperation) {
+  ComputationBuilder builder(client_, TestName());
+  auto x = builder.ConstantR0<bfloat16>(static_cast<bfloat16>(4.0f));
+  builder.Log(x);
+
+  ComputeAndCompareR0<bfloat16>(&builder, static_cast<bfloat16>(1.387f), {},
+                                error_spec_);
+}
+
 XLA_TEST_F(Bfloat16Test, NegateScalarF16) {
   ComputationBuilder builder(client_, TestName());
   builder.Neg(builder.ConstantR0<bfloat16>(static_cast<bfloat16>(2.1f)));
@@ -88,10 +97,11 @@ XLA_TEST_F(Bfloat16Test, BatchNormTraining) {
   auto tuple = builder.BatchNormTraining(operand, scale, offset,
                                          /*epsilon=*/0.001, kFeatureIndex);
 
-  auto expected = *Literal::MakeTuple(
+  auto expected = Literal::MakeTuple(
       {Literal::CreateR4<bfloat16>(
-           {{{{static_cast<bfloat16>(-1.7f)}, {static_cast<bfloat16>(-2.04f)}},
-             {{static_cast<bfloat16>(0.105f)}, {static_cast<bfloat16>(0.65f)}}},
+           {{{{static_cast<bfloat16>(-1.6875f)},
+              {static_cast<bfloat16>(-2.04f)}},
+             {{static_cast<bfloat16>(0.105f)}, {static_cast<bfloat16>(0.66f)}}},
             {{{static_cast<bfloat16>(1.89f)}, {static_cast<bfloat16>(3.35f)}},
              {{static_cast<bfloat16>(3.7f)}, {static_cast<bfloat16>(6.04f)}}}})
            .get(),
@@ -102,7 +112,7 @@ XLA_TEST_F(Bfloat16Test, BatchNormTraining) {
            {static_cast<bfloat16>(5), static_cast<bfloat16>(5)})
            .get()});
 
-  ComputeAndCompareTuple(&builder, expected, {}, ErrorSpec(0.01));
+  ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.01));
 }
 
 XLA_TEST_F(Bfloat16Test, BatchNormGrad) {
@@ -130,7 +140,7 @@ XLA_TEST_F(Bfloat16Test, BatchNormGrad) {
   builder.BatchNormGrad(operand, scale, mean, var, grad_output,
                         /*epsilon=*/0.0, kFeatureIndex);
 
-  auto expected = *Literal::MakeTuple(
+  auto expected = Literal::MakeTuple(
       {Literal::CreateR4<bfloat16>(
            {{{{static_cast<bfloat16>(-3.f)}, {static_cast<bfloat16>(-3.f)}},
              {{static_cast<bfloat16>(-1.f)}, {static_cast<bfloat16>(-1.f)}}},
@@ -144,7 +154,7 @@ XLA_TEST_F(Bfloat16Test, BatchNormGrad) {
            {static_cast<bfloat16>(16), static_cast<bfloat16>(20)})
            .get()});
 
-  ComputeAndCompareTuple(&builder, expected, {}, ErrorSpec(0.01));
+  ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.01));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/broadcast_test.cc b/tensorflow/compiler/xla/tests/broadcast_test.cc
index 0294628a127c9d506e6387d0b80f3da583c5a174..6ebbf7191833ef85ee4a48cc96c0a3be38c71228 100644
--- a/tensorflow/compiler/xla/tests/broadcast_test.cc
+++ b/tensorflow/compiler/xla/tests/broadcast_test.cc
@@ -87,11 +87,11 @@ XLA_TEST_F(BroadcastTest, BroadcastVectorTo2D) {
 
   LiteralTestUtil::ExpectNear(
       *Literal::CreateR2<float>({{1.0, 1.0}, {2.0, 2.0}, {3.0, 3.0}}),
-      result->tuple_literals(0), error_spec_);
+      LiteralView::Create(*result, {0}), error_spec_);
 
   LiteralTestUtil::ExpectNear(
       *Literal::CreateR2<float>({{1.0, 2.0, 3.0}, {1.0, 2.0, 3.0}}),
-      result->tuple_literals(1), error_spec_);
+      LiteralView::Create(*result, {1}), error_spec_);
 }
 
 XLA_TEST_F(BroadcastTest, Broadcast2DTo2D) {
diff --git a/tensorflow/compiler/xla/tests/check_execution_arity_test.cc b/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
index 659660d91e519b428d28ced8591d05b4e4d45f53..f594cc10ac6496f710d03f0b0b134e6dd3b6d38f 100644
--- a/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
+++ b/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
@@ -104,7 +104,8 @@ XLA_TEST_F(CheckExecutionArityTest, CheckArgumentShapes) {
   ASSERT_FALSE(status.ok());
   ASSERT_EQ(status.status().code(), tensorflow::error::INVALID_ARGUMENT);
   ASSERT_THAT(status.status().error_message(),
-              ContainsRegex("expects parameter 0"));
+              ContainsRegex(
+                  "Argument does not match shape of computation parameter 0"));
 
   // Shape mismatch in parameter 1 (rank)
   status = client_->Execute(computation, {f32_data.get(), f32_data.get()},
@@ -112,7 +113,8 @@ XLA_TEST_F(CheckExecutionArityTest, CheckArgumentShapes) {
   ASSERT_FALSE(status.ok());
   ASSERT_EQ(status.status().code(), tensorflow::error::INVALID_ARGUMENT);
   ASSERT_THAT(status.status().error_message(),
-              ContainsRegex("expects parameter 1"));
+              ContainsRegex(
+                  "Argument does not match shape of computation parameter 1"));
 
   // Shape mismatch in parameter 1 (element type)
   status = client_->Execute(computation, {f32_data.get(), u8_4_data.get()},
@@ -120,7 +122,8 @@ XLA_TEST_F(CheckExecutionArityTest, CheckArgumentShapes) {
   ASSERT_FALSE(status.ok());
   ASSERT_EQ(status.status().code(), tensorflow::error::INVALID_ARGUMENT);
   ASSERT_THAT(status.status().error_message(),
-              ContainsRegex("expects parameter 1"));
+              ContainsRegex(
+                  "Argument does not match shape of computation parameter 1"));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index 15bd273e9b69f9c177a4ec6b5c9f0e1dccee7fc1..a677986cd926cc0054d8f36abc98ccac33dc043d 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -251,8 +251,17 @@ ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts(
 
 tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
     ComputationBuilder* builder, const Literal& expected,
-    tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments_passed_in,
     const Shape* shape_with_layout) {
+  std::vector<GlobalData*> arguments(arguments_passed_in.begin(),
+                                     arguments_passed_in.end());
+  if (!arguments_.empty()) {
+    CHECK(arguments.empty());
+    for (const auto& argument : arguments_) {
+      arguments.push_back(argument.get());
+    }
+  }
+
   TF_ASSIGN_OR_RETURN(auto computation, builder->Build());
   if (ShapeUtil::ElementIsFloating(expected.shape()) ||
       ShapeUtil::ElementIsComplex(expected.shape())) {
@@ -267,12 +276,17 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
   const Literal* expected_ptr = &expected;
   std::unique_ptr<Literal> converted_expected;
   Shape layout_shape;
-  if (expected.shape().element_type() == F32 && use_bfloat16_) {
+  if (use_bfloat16_) {
     converted_expected = LiteralTestUtil::ConvertF32ToBF16(expected);
     expected_ptr = converted_expected.get();
     if (shape_with_layout != nullptr) {
       layout_shape = *shape_with_layout;
-      layout_shape.set_element_type(BF16);
+      ShapeUtil::ForEachMutableSubshape(
+          &layout_shape, [&](Shape* subshape, const ShapeIndex& /*index*/) {
+            if (subshape->element_type() == F32) {
+              subshape->set_element_type(BF16);
+            }
+          });
       shape_with_layout = &layout_shape;
     }
   }
@@ -295,8 +309,17 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
 
 tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
     ComputationBuilder* builder, const Literal& expected,
-    tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error,
-    const Shape* shape_with_layout) {
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments_passed_in,
+    ErrorSpec error, const Shape* shape_with_layout) {
+  std::vector<GlobalData*> arguments(arguments_passed_in.begin(),
+                                     arguments_passed_in.end());
+  if (!arguments_.empty()) {
+    CHECK(arguments.empty());
+    for (const auto& argument : arguments_) {
+      arguments.push_back(argument.get());
+    }
+  }
+
   TF_RET_CHECK(ShapeUtil::ElementIsFloating(expected.shape()) ||
                ShapeUtil::ElementIsComplex(expected.shape()));
   TF_ASSIGN_OR_RETURN(auto computation, builder->Build());
@@ -305,13 +328,17 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
   const Literal* expected_ptr = &expected;
   std::unique_ptr<Literal> converted_expected;
   Shape layout_shape;
-  if (expected.shape().element_type() == F32 && use_bfloat16_) {
+  if (use_bfloat16_) {
     converted_expected = LiteralTestUtil::ConvertF32ToBF16(expected);
     expected_ptr = converted_expected.get();
-    layout_shape.set_element_type(BF16);
     if (shape_with_layout != nullptr) {
       layout_shape = *shape_with_layout;
-      layout_shape.set_element_type(BF16);
+      ShapeUtil::ForEachMutableSubshape(
+          &layout_shape, [&](Shape* subshape, const ShapeIndex& /*index*/) {
+            if (subshape->element_type() == F32) {
+              subshape->set_element_type(BF16);
+            }
+          });
       shape_with_layout = &layout_shape;
     }
   }
@@ -348,7 +375,7 @@ void ClientLibraryTestBase::ComputeAndCompareR1U8(
   VLOG(1) << "expected: " << expected_literal->ToString();
   VLOG(1) << "actual:   " << actual->ToString();
 
-  EXPECT_EQ(expected, actual->u8s_string());
+  EXPECT_EQ(expected, actual->GetR1U8AsString());
 }
 
 void ClientLibraryTestBase::ComputeAndCompareTuple(
@@ -360,7 +387,7 @@ void ClientLibraryTestBase::ComputeAndCompareTuple(
     return;
   }
   auto actual = actual_status.ConsumeValueOrDie();
-  LiteralTestUtil::ExpectEqualTuple(expected, *actual);
+  LiteralTestUtil::ExpectEqual(expected, *actual);
 }
 
 void ClientLibraryTestBase::ComputeAndCompareTuple(
@@ -372,7 +399,7 @@ void ClientLibraryTestBase::ComputeAndCompareTuple(
     return;
   }
   auto actual = actual_status.ConsumeValueOrDie();
-  LiteralTestUtil::ExpectNearTuple(expected, *actual, error);
+  LiteralTestUtil::ExpectNear(expected, *actual, error);
 }
 
 void ClientLibraryTestBase::ComputeAndCompare(
@@ -499,17 +526,41 @@ std::unique_ptr<GlobalData>
 ClientLibraryTestBase::CreateParameterAndTransferLiteral(
     int64 parameter_number, const Literal& literal, const string& name,
     ComputationBuilder* builder, ComputationDataHandle* data_handle) {
+  return CreateParameterAndTransferLiteral(parameter_number, literal, name,
+                                           nullptr, builder, data_handle);
+}
+
+std::unique_ptr<GlobalData>
+ClientLibraryTestBase::CreateParameterAndTransferLiteral(
+    int64 parameter_number, const Literal& literal, const string& name,
+    const DeviceHandle* device_handle, ComputationBuilder* builder,
+    ComputationDataHandle* data_handle) {
   const Literal* param_literal = &literal;
   std::unique_ptr<Literal> converted_literal;
-  if (use_bfloat16_ && literal.shape().element_type() == F32) {
+  if (use_bfloat16_) {
     converted_literal = LiteralTestUtil::ConvertF32ToBF16(literal);
     param_literal = converted_literal.get();
   }
   std::unique_ptr<GlobalData> data =
-      client_->TransferToServer(*param_literal).ConsumeValueOrDie();
+      client_->TransferToServer(*param_literal, device_handle)
+          .ConsumeValueOrDie();
   *data_handle =
       builder->Parameter(parameter_number, param_literal->shape(), name);
   return data;
 }
 
+ComputationDataHandle ClientLibraryTestBase::AddParam(
+    const Literal& argument, ComputationBuilder* builder) {
+  ComputationDataHandle data_handle;
+  arguments_.push_back(CreateParameterAndTransferLiteral(
+      arguments_.size(), argument, "", builder, &data_handle));
+  return data_handle;
+}
+
+ComputationDataHandle ClientLibraryTestBase::CreateConstantFromLiteral(
+    const Literal& literal, ComputationBuilder* builder) {
+  return builder->ConstantLiteral(
+      use_bfloat16_ ? *LiteralTestUtil::ConvertF32ToBF16(literal) : literal);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index 1d27880fb1413adbbe691b5d12cadcd85fbe5d92..ba0319990bc04196386e6812b0a03671676698ec 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -43,6 +43,23 @@ limitations under the License.
 
 namespace xla {
 
+// Sets the use_bfloat16 on a container of test cases according to the values in
+// use_bfloat16_params. Generates one set of test cases for each values in
+// use_bfloat16_params with that value. Returns the result.
+template <typename TestCase>
+std::vector<TestCase> ExpandUseBfloat16(
+    tensorflow::gtl::ArraySlice<bool> use_bfloat16_params,
+    tensorflow::gtl::ArraySlice<TestCase> specs) {
+  std::vector<TestCase> expanded;
+  for (bool use_bfloat16 : use_bfloat16_params) {
+    for (const auto& spec : specs) {
+      expanded.push_back(spec);
+      expanded.back().use_bfloat16 = use_bfloat16;
+    }
+  }
+  return expanded;
+}
+
 // A client library test establishes an in-process XLA client connection.
 class ClientLibraryTestBase : public ::testing::Test {
  protected:
@@ -194,7 +211,7 @@ class ClientLibraryTestBase : public ::testing::Test {
       tensorflow::gtl::ArraySlice<GlobalData*> arguments);
   void ComputeAndCompareTuple(
       ComputationBuilder* builder, const Literal& expected,
-      tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec abs_error);
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error);
 
   // Convenience method for running a built computation and comparing the result
   // with the HloEvaluator.
@@ -253,6 +270,51 @@ class ClientLibraryTestBase : public ::testing::Test {
       int64 parameter_number, const Literal& literal, const string& name,
       ComputationBuilder* builder, ComputationDataHandle* data_handle);
 
+  // As above, but the caller can specify the device that the literal is
+  // transferred to. If device_handle is nullptr, the literal will be
+  // transferred to the default device.
+  std::unique_ptr<GlobalData> CreateParameterAndTransferLiteral(
+      int64 parameter_number, const Literal& literal, const string& name,
+      const DeviceHandle* device_handle, ComputationBuilder* builder,
+      ComputationDataHandle* data_handle);
+
+  // Creates a parameter instruction and sets the value that will be passed to
+  // the computation as specified. This function must be used for all parameters
+  // or none and no parameters must be passed when invoking the computation if
+  // using this mechanism. If using this mechanism, then each parameter must be
+  // set exactly once. The first added parameter gets index 0, then 1 and so on.
+  ComputationDataHandle AddParam(const Literal& argument,
+                                 ComputationBuilder* builder);
+
+  template <class T>
+  ComputationDataHandle AddParam(const Array<T>& argument,
+                                 ComputationBuilder* builder) {
+    return AddParam(*Literal::CreateFromArray(argument), builder);
+  }
+
+  // Creates a constant instruction with the given literal. When the
+  // use_bfloat16 flag is set but the literal has F32 elements, the elements
+  // will be converted to BF16s.
+  ComputationDataHandle CreateConstantFromLiteral(const Literal& literal,
+                                                  ComputationBuilder* builder);
+
+  // Creates a constant instruction with the given array. When the use_bfloat16
+  // flag is set but the array has float elements, the elements will be
+  // converted to bfloat16s.
+  template <typename NativeT>
+  ComputationDataHandle CreateConstantFromArray(const Array<NativeT>& array,
+                                                ComputationBuilder* builder) {
+    return CreateConstantFromLiteral(*Literal::CreateFromArray(array), builder);
+  }
+
+  // Same as CreateConstantFromArray, but for scalars.
+  template <typename NativeT>
+  ComputationDataHandle CreateConstantFromScalar(NativeT value,
+                                                 ComputationBuilder* builder) {
+    return CreateConstantFromLiteral(*Literal::CreateR0<NativeT>(value),
+                                     builder);
+  }
+
   // Creates a parameter instruction that wraps a given value and then stores
   // into "data_handle" the global handle for that parameter.
   //
@@ -315,6 +377,9 @@ class ClientLibraryTestBase : public ::testing::Test {
   bool use_bfloat16() const { return use_bfloat16_; }
   void set_use_bfloat16(bool value) { use_bfloat16_ = value; }
 
+  // The float type used in this test, BF16 or F32 according to use_bfloat16.
+  PrimitiveType FloatType() const { return use_bfloat16_ ? BF16 : F32; }
+
   Client* client_;
   ExecutionOptions execution_options_;
 
@@ -344,6 +409,9 @@ class ClientLibraryTestBase : public ::testing::Test {
   // Whether to run tests with all float-type input/output converted to
   // bfloat16.
   bool use_bfloat16_ = false;
+
+  // Arguments to be passed to the computation when it runs.
+  std::vector<std::unique_ptr<GlobalData>> arguments_;
 };
 
 template <typename NativeT>
@@ -363,6 +431,7 @@ void ClientLibraryTestBase::ComputeAndCompareR0(
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
                     std::is_same<NativeT, bfloat16>::value ||
+                    std::is_same<NativeT, half>::value ||
                     std::is_same<NativeT, complex64>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
@@ -388,6 +457,7 @@ void ClientLibraryTestBase::ComputeAndCompareR1(
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
                     std::is_same<NativeT, bfloat16>::value ||
+                    std::is_same<NativeT, half>::value ||
                     std::is_same<NativeT, complex64>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
@@ -413,6 +483,7 @@ void ClientLibraryTestBase::ComputeAndCompareR2(
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
                     std::is_same<NativeT, bfloat16>::value ||
+                    std::is_same<NativeT, half>::value ||
                     std::is_same<NativeT, complex64>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
@@ -438,6 +509,7 @@ void ClientLibraryTestBase::ComputeAndCompareR3(
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
                     std::is_same<NativeT, bfloat16>::value ||
+                    std::is_same<NativeT, half>::value ||
                     std::is_same<NativeT, complex64>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
@@ -463,6 +535,7 @@ void ClientLibraryTestBase::ComputeAndCompareR4(
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
                     std::is_same<NativeT, bfloat16>::value ||
+                    std::is_same<NativeT, half>::value ||
                     std::is_same<NativeT, complex64>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc
index 8853ed9e5780672d4006c326291767b8b5253f56..045148cdd11da94ae4789a753efca95c6aaa1f27 100644
--- a/tensorflow/compiler/xla/tests/client_test.cc
+++ b/tensorflow/compiler/xla/tests/client_test.cc
@@ -36,7 +36,7 @@ namespace {
 
 class ClientTest : public ClientLibraryTestBase {};
 
-TEST_F(ClientTest, ExecuteWithLayout) {
+XLA_TEST_F(ClientTest, ExecuteWithLayout) {
   ComputationBuilder b(client_, TestName());
 
   std::vector<std::vector<int64>> layouts = {{0, 1}, {1, 0}};
@@ -68,7 +68,7 @@ TEST_F(ClientTest, ExecuteWithLayout) {
   }
 }
 
-TEST_F(ClientTest, ExecuteWithTupleLayout) {
+XLA_TEST_F(ClientTest, ExecuteWithTupleLayout) {
   ComputationBuilder b(client_, TestName());
 
   b.Tuple({b.ConstantR2<int32>({{1, 2}, {3, 4}}),
@@ -90,9 +90,9 @@ TEST_F(ClientTest, ExecuteWithTupleLayout) {
       auto result,
       client_->ExecuteAndTransfer(computation, {}, &execution_options));
   LiteralTestUtil::ExpectR2Equal<int32>({{1, 2}, {3, 4}},
-                                        result->tuple_literals(0));
+                                        LiteralView::Create(*result, {0}));
   LiteralTestUtil::ExpectR2Equal<int32>({{10, 20}, {30, 40}},
-                                        result->tuple_literals(1));
+                                        LiteralView::Create(*result, {1}));
 
   EXPECT_TRUE(ShapeUtil::IsTuple(result->shape()));
   EXPECT_EQ(2, ShapeUtil::TupleElementCount(result->shape()));
@@ -107,7 +107,8 @@ TEST_F(ClientTest, ExecuteWithTupleLayout) {
                                      /*minor_to_major=*/{1, 0})));
 }
 
-TEST_F(ClientTest, DISABLED_ON_CPU_PARALLEL(DISABLED_ON_GPU(ExecuteParallel))) {
+XLA_TEST_F(ClientTest,
+        DISABLED_ON_CPU_PARALLEL(DISABLED_ON_GPU(ExecuteParallel))) {
   Computation add_with_one_arg, mul_with_two_args, dot_with_one_arg;
   Shape shape = ShapeUtil::MakeShape(S32, {2, 2});
 
diff --git a/tensorflow/compiler/xla/tests/codegen_test_base.cc b/tensorflow/compiler/xla/tests/codegen_test_base.cc
index e472408dcf7ed5fec74e886fd0092ce47ee2e7eb..022641394f113ef28e7c53058385d77572822213 100644
--- a/tensorflow/compiler/xla/tests/codegen_test_base.cc
+++ b/tensorflow/compiler/xla/tests/codegen_test_base.cc
@@ -21,9 +21,11 @@ StatusOr<std::unique_ptr<Executable>> CodegenTestBase::CompileToExecutable(
     std::unique_ptr<HloModule> hlo_module) {
   TF_ASSIGN_OR_RETURN(hlo_module, backend().compiler()->RunHloPasses(
                                       std::move(hlo_module),
-                                      backend().default_stream_executor()));
+                                      backend().default_stream_executor(),
+                                      /*device_allocator=*/nullptr));
   return backend().compiler()->RunBackend(std::move(hlo_module),
-                                          backend().default_stream_executor());
+                                          backend().default_stream_executor(),
+                                          /*device_allocator=*/nullptr);
 }
 
 StatusOr<std::unique_ptr<AotCompilationResult>>
diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc
index 5226a78386824a94572d3e5cc3329677108a910a..ec2c580670cfac14ba42e8c9a836c86551af4b89 100644
--- a/tensorflow/compiler/xla/tests/compute_constant_test.cc
+++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc
@@ -149,7 +149,7 @@ TEST_F(ComputeConstantTest, Param) {
     auto computation = b.Add(param, b.ConstantR0<float>(1.5f));
 
     std::vector<Literal> arguments;
-    arguments.emplace_back(*Literal::CreateR0(42.5f));
+    arguments.push_back(std::move(*Literal::CreateR0(42.5f)));
     EXPECT_TRUE(IsConstant(computation, &b, arguments.size()));
 
     auto value =
@@ -168,7 +168,7 @@ TEST_F(ComputeConstantTest, DirectParamMissing) {
 
     auto value = ComputeConstantScalar<float>(client, computation, &b);
     EXPECT_TRUE(tensorflow::StringPiece(value.status().ToString())
-                    .contains("depends on parameter"))
+                    .contains("depends on a parameter"))
         << value.status();
   }
 }
@@ -184,7 +184,7 @@ TEST_F(ComputeConstantTest, IndirectParamMissing) {
 
     auto value = ComputeConstantScalar<float>(client, computation, &b);
     EXPECT_TRUE(tensorflow::StringPiece(value.status().ToString())
-                    .contains("depends on parameter"))
+                    .contains("depends on a parameter"))
         << value.status();
   }
 }
diff --git a/tensorflow/compiler/xla/tests/conditional_test.cc b/tensorflow/compiler/xla/tests/conditional_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bc821674820fb128823786d7149037fc59b22ab6
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/conditional_test.cc
@@ -0,0 +1,575 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+
+namespace xla {
+namespace {
+
+class ConditionalOpTest : public ClientLibraryTestBase {
+ protected:
+  Computation CreateR0ConstantComputation(float value) {
+    ComputationBuilder builder(client_, "Constant");
+    builder.Parameter(0, empty_tuple_, "tuple");
+    builder.ConstantR0<float>(value);
+    auto build_status = builder.Build();
+    EXPECT_IS_OK(build_status.status());
+    return build_status.ConsumeValueOrDie();
+  }
+
+  Computation CreateR0IdentityComputation() {
+    ComputationBuilder builder(client_, "Identity");
+    builder.Parameter(0, r0f32_, "x");
+    auto build_status = builder.Build();
+    EXPECT_IS_OK(build_status.status());
+    return build_status.ConsumeValueOrDie();
+  }
+
+  Computation CreateCeilComputation(const Shape& shape) {
+    ComputationBuilder builder(client_, "Ceil");
+    auto param = builder.Parameter(0, shape, "param");
+    builder.Ceil(param);
+    auto build_status = builder.Build();
+    EXPECT_IS_OK(build_status.status());
+    return build_status.ConsumeValueOrDie();
+  }
+
+  Computation CreateR0CeilComputation() {
+    return CreateCeilComputation(r0f32_);
+  }
+
+  Computation CreateR1CeilComputation() {
+    return CreateCeilComputation(r1s2f32_);
+  }
+
+  Computation CreateFloorComputation(const Shape& shape) {
+    ComputationBuilder builder(client_, "Floor");
+    auto param = builder.Parameter(0, shape, "param");
+    builder.Floor(param);
+    auto build_status = builder.Build();
+    EXPECT_IS_OK(build_status.status());
+    return build_status.ConsumeValueOrDie();
+  }
+
+  Computation CreateR0FloorComputation() {
+    return CreateFloorComputation(r0f32_);
+  }
+
+  Computation CreateR1FloorComputation() {
+    return CreateFloorComputation(r1s2f32_);
+  }
+
+  Computation CreateTupleCeilComputation(const string& computation_name,
+                                         const Shape& tuple_shape) {
+    ComputationBuilder builder(client_, computation_name);
+    auto tuple = builder.Parameter(0, tuple_shape, "tuple");
+    auto x = builder.GetTupleElement(tuple, 0);
+    auto y = builder.GetTupleElement(tuple, 1);
+    auto x_ceil = builder.Ceil(x);
+    auto y_ceil = builder.Ceil(y);
+    builder.Tuple({x_ceil, y_ceil});
+    auto build_status = builder.Build();
+    EXPECT_IS_OK(build_status.status());
+    return build_status.ConsumeValueOrDie();
+  }
+
+  Computation CreateR0TupleCeilComputation() {
+    return CreateTupleCeilComputation("CeilR0", tuple_2_r0f32_);
+  }
+
+  Computation CreateR1TupleCeilComputation() {
+    return CreateTupleCeilComputation("CeilR1", tuple_2_r1s2f32_);
+  }
+
+  Computation CreateTupleFloorComputation(const string& computation_name,
+                                          const Shape& tuple_shape) {
+    ComputationBuilder builder(client_, computation_name);
+    auto tuple = builder.Parameter(0, tuple_shape, "tuple");
+    auto x = builder.GetTupleElement(tuple, 0);
+    auto y = builder.GetTupleElement(tuple, 1);
+    auto x_floor = builder.Floor(x);
+    auto y_floor = builder.Floor(y);
+    builder.Tuple({x_floor, y_floor});
+    auto build_status = builder.Build();
+    EXPECT_IS_OK(build_status.status());
+    return build_status.ConsumeValueOrDie();
+  }
+
+  Computation CreateR0TupleFloorComputation() {
+    return CreateTupleFloorComputation("FloorR0", tuple_2_r0f32_);
+  }
+
+  Computation CreateR1TupleFloorComputation() {
+    return CreateTupleFloorComputation("FloorR1", tuple_2_r1s2f32_);
+  }
+
+  Computation CreateTupleAddComputation(const string& computation_name,
+                                        const Shape& tuple_shape) {
+    ComputationBuilder builder(client_, computation_name);
+    auto tuple = builder.Parameter(0, tuple_shape, "tuple");
+    auto x = builder.GetTupleElement(tuple, 0);
+    auto y = builder.GetTupleElement(tuple, 1);
+    builder.Add(x, y);
+    auto build_status = builder.Build();
+    EXPECT_IS_OK(build_status.status());
+    return build_status.ConsumeValueOrDie();
+  }
+
+  Computation CreateR0TupleAddComputation() {
+    return CreateTupleAddComputation("AddR0", tuple_2_r0f32_);
+  }
+
+  Computation CreateR1TupleAddComputation() {
+    return CreateTupleAddComputation("AddR1", tuple_2_r1s2f32_);
+  }
+
+  Computation CreateTupleSubComputation(const string& computation_name,
+                                        const Shape& tuple_shape) {
+    ComputationBuilder builder(client_, computation_name);
+    auto tuple = builder.Parameter(0, tuple_shape, "tuple");
+    auto x = builder.GetTupleElement(tuple, 0);
+    auto y = builder.GetTupleElement(tuple, 1);
+    builder.Sub(x, y);
+    auto build_status = builder.Build();
+    EXPECT_IS_OK(build_status.status());
+    return build_status.ConsumeValueOrDie();
+  }
+
+  Computation CreateR0TupleSubComputation() {
+    return CreateTupleSubComputation("SubR0", tuple_2_r0f32_);
+  }
+
+  Computation CreateR1TupleSubComputation() {
+    return CreateTupleSubComputation("SubR1", tuple_2_r1s2f32_);
+  }
+
+  Shape r0f32_ = ShapeUtil::MakeShape(F32, {});
+  Shape r1s2f32_ = ShapeUtil::MakeShape(F32, {2});
+  Shape tuple_2_r0f32_ = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {}), ShapeUtil::MakeShape(F32, {})});
+  Shape tuple_2_r1s2f32_ = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {2}), ShapeUtil::MakeShape(F32, {2})});
+  Shape empty_tuple_ = ShapeUtil::MakeTupleShape({});
+  ErrorSpec error_spec_{0.001};
+};
+
+// Test true and false computations that do not take any parameters.
+XLA_TEST_F(ConditionalOpTest, Parameters0) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(true);
+  auto operands = builder.Tuple({});
+  auto true_computation = CreateR0ConstantComputation(56.0f);
+  auto false_computation = CreateR0ConstantComputation(12.0f);
+  auto result = builder.Conditional(pred, operands, true_computation, operands,
+                                    false_computation);
+
+  ComputeAndCompareR0<float>(&builder, 56.0f, {}, error_spec_);
+}
+
+// Test true and false computations that take in 1 parameter.
+XLA_TEST_F(ConditionalOpTest, Parameters1) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR0<float>(56.0f);
+  auto operand2 = builder.ConstantR0<float>(12.0f);
+  auto identity = CreateR0IdentityComputation();
+  auto result =
+      builder.Conditional(pred, operand1, identity, operand2, identity);
+
+  ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
+}
+
+// Test conditional with two different computations in the true and false cases
+// that take in different arguments.
+XLA_TEST_F(ConditionalOpTest, DiffComputationsDiffArgs) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR0<float>(56.4f);
+  auto operand2 = builder.ConstantR0<float>(12.6f);
+  auto result = builder.Conditional(pred, operand1, CreateR0CeilComputation(),
+                                    operand2, CreateR0FloorComputation());
+
+  ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
+}
+
+// Test conditional with two different computations in the true and false cases
+// that take in the same arguments.
+XLA_TEST_F(ConditionalOpTest, DiffComputationsSameArg) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operand = builder.ConstantR0<float>(12.6f);
+  auto result = builder.Conditional(pred, operand, CreateR0CeilComputation(),
+                                    operand, CreateR0FloorComputation());
+
+  ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
+}
+
+// Test conditional with the same computation in the true and false cases but
+// take in different arguments.
+XLA_TEST_F(ConditionalOpTest, SameComputationDiffArgs) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR0<float>(56.4f);
+  auto operand2 = builder.ConstantR0<float>(12.6f);
+  auto floor = CreateR0FloorComputation();
+  auto result = builder.Conditional(pred, operand1, floor, operand2, floor);
+
+  ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
+}
+
+// Test conditional with the same computation in the true and false cases that
+// take in the same arguments.
+XLA_TEST_F(ConditionalOpTest, SameComputationSameArg) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operand = builder.ConstantR0<float>(12.6f);
+  auto floor = CreateR0FloorComputation();
+  auto result = builder.Conditional(pred, operand, floor, operand, floor);
+
+  ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
+}
+
+// Test conditional with different instances of the same computation in the true
+// and false cases.
+XLA_TEST_F(ConditionalOpTest, SameComputationDiffInstances) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR0<float>(56.4f);
+  auto operand2 = builder.ConstantR0<float>(12.6f);
+  auto result = builder.Conditional(pred, operand1, CreateR0FloorComputation(),
+                                    operand2, CreateR0FloorComputation());
+
+  ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
+}
+
+// Test the case when a call invokes a computation that contains a conditional.
+XLA_TEST_F(ConditionalOpTest, ConditionalWithCall) {
+  Shape r0bool = ShapeUtil::MakeShape(PRED, {});
+  ComputationBuilder inner_builder(client_, TestName() + ".inner_conditional");
+  auto pred_cond = inner_builder.Parameter(0, r0bool, "param0");
+  auto true_operand = inner_builder.Parameter(1, r0f32_, "param1");
+  auto false_operand = inner_builder.Parameter(2, r0f32_, "param2");
+  inner_builder.Conditional(pred_cond, true_operand, CreateR0CeilComputation(),
+                            false_operand, CreateR0FloorComputation());
+  auto inner_builder_result = inner_builder.Build();
+
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR0<float>(56.4f);
+  auto operand2 = builder.ConstantR0<float>(12.6f);
+  builder.Call(inner_builder_result.ConsumeValueOrDie(),
+               {pred, operand1, operand2});
+
+  ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
+}
+
+// Test true and false computations that take in 2 parameters and predicate is
+// true.
+XLA_TEST_F(ConditionalOpTest, Parameters2TrueBranch) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(true);
+  auto operand1 = builder.ConstantR0<float>(56.0f);
+  auto operand2 = builder.ConstantR0<float>(12.0f);
+  auto operands = builder.Tuple({operand1, operand2});
+  auto result =
+      builder.Conditional(pred, operands, CreateR0TupleAddComputation(),
+                          operands, CreateR0TupleSubComputation());
+
+  ComputeAndCompareR0<float>(&builder, 68.0f, {}, error_spec_);
+}
+
+// Test true and false computations that take in 2 parameters and predicate is
+// false.
+XLA_TEST_F(ConditionalOpTest, Parameters2FalseBranch) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR0<float>(56.0f);
+  auto operand2 = builder.ConstantR0<float>(12.0f);
+  auto operands = builder.Tuple({operand1, operand2});
+  auto result =
+      builder.Conditional(pred, operands, CreateR0TupleAddComputation(),
+                          operands, CreateR0TupleSubComputation());
+
+  ComputeAndCompareR0<float>(&builder, 44.0f, {}, error_spec_);
+}
+
+// Test true and false computations that take in 2 array parameters and
+// predicate is true.
+XLA_TEST_F(ConditionalOpTest, Parameters2ArrayTrueBranch) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(true);
+  auto operand1 = builder.ConstantR1<float>({24.0f, 56.0f});
+  auto operand2 = builder.ConstantR1<float>({10.0f, 11.0f});
+  auto operands = builder.Tuple({operand1, operand2});
+  auto result =
+      builder.Conditional(pred, operands, CreateR1TupleAddComputation(),
+                          operands, CreateR1TupleSubComputation());
+
+  ComputeAndCompareR1<float>(&builder, {34.0f, 67.0f}, {}, error_spec_);
+}
+
+// Test true and false computations that take in 2 array parameters and
+// predicate is false.
+XLA_TEST_F(ConditionalOpTest, Parameters2ArrayFalseBranch) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR1<float>({24.0f, 56.0f});
+  auto operand2 = builder.ConstantR1<float>({10.0f, 11.0f});
+  auto operands = builder.Tuple({operand1, operand2});
+  auto result =
+      builder.Conditional(pred, operands, CreateR1TupleAddComputation(),
+                          operands, CreateR1TupleSubComputation());
+
+  ComputeAndCompareR1<float>(&builder, {14.0f, 45.0f}, {}, error_spec_);
+}
+
+// Test true and false computations that return a tuple of scalars.
+XLA_TEST_F(ConditionalOpTest, ReturnTupleOfScalars) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operands = builder.Tuple(
+      {builder.ConstantR0<float>(12.2f), builder.ConstantR0<float>(25.6f)});
+  builder.Conditional(pred, operands, CreateR0TupleCeilComputation(), operands,
+                      CreateR0TupleFloorComputation());
+
+  ComputeAndCompareTuple(
+      &builder,
+      *Literal::MakeTuple({Literal::CreateR0<float>(12.0f).get(),
+                           Literal::CreateR0<float>(25.0f).get()}),
+      {}, error_spec_);
+}
+
+// Test true and false computations that return a tuple of arrays.
+XLA_TEST_F(ConditionalOpTest, ReturnTupleOfArrays) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(true);
+  auto operands = builder.Tuple({builder.ConstantR1<float>({12.2f, 15.8f}),
+                                 builder.ConstantR1<float>({25.6f, 29.2f})});
+  builder.Conditional(pred, operands, CreateR1TupleCeilComputation(), operands,
+                      CreateR1TupleFloorComputation());
+
+  ComputeAndCompareTuple(
+      &builder,
+      *Literal::MakeTuple({Literal::CreateR1<float>({13.0f, 16.0f}).get(),
+                           Literal::CreateR1<float>({26.0f, 30.0f}).get()}),
+      {}, error_spec_);
+}
+
+// Test true and false computations that return a tuple of a predicate, a
+// scalar, and an array.
+XLA_TEST_F(ConditionalOpTest, ReturnTupleofPredicateScalarArray) {
+  ComputationBuilder true_builder(client_, TestName() + ".true");
+  {
+    true_builder.Parameter(0, empty_tuple_, "tuple");
+    auto true_pred = true_builder.ConstantR0<bool>(true);
+    auto true_scalar = true_builder.ConstantR0<float>(12.2f);
+    auto true_array = true_builder.ConstantR1<float>({12.8f, 14.6f});
+    true_builder.Tuple({true_pred, true_scalar, true_array});
+  }
+  auto true_builder_result = true_builder.Build();
+  EXPECT_IS_OK(true_builder_result.status());
+
+  ComputationBuilder false_builder(client_, TestName() + ".false");
+  {
+    false_builder.Parameter(0, empty_tuple_, "tuple");
+    auto false_pred = false_builder.ConstantR0<bool>(false);
+    auto false_scalar = false_builder.ConstantR0<float>(25.6f);
+    auto false_array = false_builder.ConstantR1<float>({26.4f, 32.6f});
+    false_builder.Tuple({false_pred, false_scalar, false_array});
+  }
+  auto false_builder_result = false_builder.Build();
+  EXPECT_IS_OK(false_builder_result.status());
+
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(true);
+  auto operands = builder.Tuple({});
+  builder.Conditional(pred, operands, true_builder_result.ConsumeValueOrDie(),
+                      operands, false_builder_result.ConsumeValueOrDie());
+
+  ComputeAndCompareTuple(
+      &builder,
+      *Literal::MakeTuple({Literal::CreateR0<bool>(true).get(),
+                           Literal::CreateR0<float>(12.2f).get(),
+                           Literal::CreateR1<float>({12.8f, 14.6f}).get()}),
+      {}, error_spec_);
+}
+
+// Test true and false computations that return a nested tuple.
+XLA_TEST_F(ConditionalOpTest, ReturnNestedTuple) {
+  ComputationBuilder true_builder(client_, TestName() + ".true");
+  {
+    true_builder.Parameter(0, empty_tuple_, "tuple");
+    auto true_constant1 = true_builder.ConstantR0<float>(12.2f);
+    auto true_constant2 = true_builder.ConstantR1<float>({12.8f, 14.6f});
+    auto true_constant3 = true_builder.ConstantR1<float>({25.4f, 29.8f});
+    auto true_constant4 = true_builder.ConstantR0<float>(35.6f);
+    true_builder.Tuple({true_builder.Tuple({true_constant1, true_constant2}),
+                        true_builder.Tuple({true_constant3, true_constant4})});
+  }
+  auto true_builder_result = true_builder.Build();
+  EXPECT_IS_OK(true_builder_result.status());
+
+  ComputationBuilder false_builder(client_, TestName() + ".false");
+  {
+    false_builder.Parameter(0, empty_tuple_, "tuple");
+    auto false_constant1 = false_builder.ConstantR0<float>(46.6f);
+    auto false_constant2 = false_builder.ConstantR1<float>({54.4f, 58.4f});
+    auto false_constant3 = false_builder.ConstantR1<float>({62.1f, 67.4f});
+    auto false_constant4 = false_builder.ConstantR0<float>(9.3f);
+    false_builder.Tuple(
+        {false_builder.Tuple({false_constant1, false_constant2}),
+         false_builder.Tuple({false_constant3, false_constant4})});
+  }
+  auto false_builder_result = false_builder.Build();
+  EXPECT_IS_OK(false_builder_result.status());
+
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operands = builder.Tuple({});
+  builder.Conditional(pred, operands, true_builder_result.ConsumeValueOrDie(),
+                      operands, false_builder_result.ConsumeValueOrDie());
+
+  ComputeAndCompareTuple(
+      &builder,
+      *Literal::MakeTuple(
+          {Literal::MakeTuple({Literal::CreateR0<float>(46.6f).get(),
+                               Literal::CreateR1<float>({54.4f, 58.4f}).get()})
+               .get(),
+           Literal::MakeTuple({Literal::CreateR1<float>({62.1f, 67.4f}).get(),
+                               Literal::CreateR0<float>(9.3f).get()})
+               .get()}),
+      {}, error_spec_);
+}
+
+// Test conditional that takes in scalar operands in the form of external
+// params.
+XLA_TEST_F(ConditionalOpTest, ScalarOperandsFromExternalParams) {
+  Shape r0bool = ShapeUtil::MakeShape(PRED, {});
+  ComputationBuilder builder(client_, TestName());
+
+  ComputationDataHandle pred, operand1, operand2;
+  auto pred_arg = CreateR0Parameter<bool>(true, 0, "pred", &builder, &pred);
+  auto operand1_param =
+      CreateR0Parameter<float>(56.3f, 1, "operand1", &builder, &operand1);
+  auto operand2_param =
+      CreateR0Parameter<float>(12.7f, 2, "operand2", &builder, &operand2);
+  auto result = builder.Conditional(pred, operand1, CreateR0CeilComputation(),
+                                    operand2, CreateR0FloorComputation());
+
+  ComputeAndCompareR0<float>(
+      &builder, 57.0f,
+      {pred_arg.get(), operand1_param.get(), operand2_param.get()},
+      error_spec_);
+}
+
+// Test conditional that takes in array operands in the form of external params.
+XLA_TEST_F(ConditionalOpTest, ArrayOperandsFromExternalParams) {
+  Shape r0bool = ShapeUtil::MakeShape(PRED, {});
+  ComputationBuilder builder(client_, TestName());
+
+  ComputationDataHandle pred, operand1, operand2;
+  auto pred_arg = CreateR0Parameter<bool>(false, 0, "pred", &builder, &pred);
+  auto operand1_param = CreateR1Parameter<float>({24.3f, 56.7f}, 1, "operand1",
+                                                 &builder, &operand1);
+  auto operand2_param = CreateR1Parameter<float>({10.2f, 11.6f}, 2, "operand2",
+                                                 &builder, &operand2);
+  auto result = builder.Conditional(pred, operand1, CreateR1CeilComputation(),
+                                    operand2, CreateR1FloorComputation());
+
+  ComputeAndCompareR1<float>(
+      &builder, {10.0f, 11.0f},
+      {pred_arg.get(), operand1_param.get(), operand2_param.get()},
+      error_spec_);
+}
+
+// Test the case where one conditional is nested within another.
+XLA_TEST_F(ConditionalOpTest, NestedConditionals) {
+  ComputationBuilder inner_builder(client_, TestName() + ".inner_conditional");
+  {
+    Shape r0bool = ShapeUtil::MakeShape(PRED, {});
+    Shape tuple_shape = ShapeUtil::MakeTupleShape({r0bool, r0f32_, r0f32_});
+    auto param0 = inner_builder.Parameter(0, tuple_shape, "param0");
+    auto pred_cond = inner_builder.GetTupleElement(param0, 0);
+    auto true_operand = inner_builder.GetTupleElement(param0, 1);
+    auto false_operand = inner_builder.GetTupleElement(param0, 2);
+    inner_builder.Conditional(pred_cond, true_operand,
+                              CreateR0CeilComputation(), false_operand,
+                              CreateR0FloorComputation());
+  }
+  auto inner_builder_result = inner_builder.Build();
+  EXPECT_IS_OK(inner_builder_result.status());
+
+  ComputationBuilder builder(client_, TestName());
+  auto pred1 = builder.ConstantR0<bool>(true);
+  auto pred2 = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR0<float>(1.1f);
+  auto operand2 = builder.ConstantR0<float>(12.2f);
+  auto operand3 = builder.ConstantR0<float>(43.3f);
+  auto tuple_operand = builder.Tuple({pred2, operand1, operand2});
+  builder.Conditional(pred1, tuple_operand,
+                      inner_builder_result.ConsumeValueOrDie(), operand3,
+                      CreateR0IdentityComputation());
+
+  ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
+}
+
+XLA_TEST_F(ConditionalOpTest, ConditionalInNestedComputation) {
+  ComputationBuilder inner_builder(client_, TestName() + ".inner_conditional");
+  {
+    Shape r0bool = ShapeUtil::MakeShape(PRED, {});
+    Shape tuple_shape = ShapeUtil::MakeTupleShape({r0bool, r0f32_, r0f32_});
+    auto param0 = inner_builder.Parameter(0, tuple_shape, "param0");
+    auto pred_cond = inner_builder.GetTupleElement(param0, 0);
+    auto true_operand = inner_builder.GetTupleElement(param0, 1);
+    auto false_operand = inner_builder.GetTupleElement(param0, 2);
+    inner_builder.Conditional(pred_cond, true_operand,
+                              CreateR0CeilComputation(), false_operand,
+                              CreateR0FloorComputation());
+  }
+  auto inner_builder_result = inner_builder.Build();
+  EXPECT_IS_OK(inner_builder_result.status());
+
+  ComputationBuilder builder(client_, TestName());
+  auto pred2 = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR0<float>(1.1f);
+  auto operand2 = builder.ConstantR0<float>(12.2f);
+  auto tuple_operand = builder.Tuple({pred2, operand1, operand2});
+  builder.Call(inner_builder_result.ConsumeValueOrDie(), {tuple_operand});
+
+  ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
+}
+
+// Test a mismatch in the shape of the true operand and true computation.
+XLA_TEST_F(ConditionalOpTest, ShapeMismatch) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(true);
+  auto operand1 = builder.ConstantR0<float>(56.0f);
+  auto operand2 = builder.ConstantR0<float>(12.0f);
+  auto operands = builder.Tuple({operand1, operand2});
+  builder.Conditional(pred, operands, CreateR1TupleAddComputation(), operands,
+                      CreateR0TupleSubComputation());
+
+  auto result = builder.Build();
+  EXPECT_FALSE(result.ok());
+  EXPECT_THAT(result.status().error_message(),
+              ::testing::HasSubstr("true_operand must match the shape of the "
+                                   "only parameter of true_computation"));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/constants_test.cc b/tensorflow/compiler/xla/tests/constants_test.cc
index 97bd1553664a6c0fcb097b441ec42efb4eaa9cc2..35aa3f6d696297efb7d95d826ed75a504a24529d 100644
--- a/tensorflow/compiler/xla/tests/constants_test.cc
+++ b/tensorflow/compiler/xla/tests/constants_test.cc
@@ -141,11 +141,12 @@ TEST_F(ConstantsTest, Small_3x2x1x1) {
       {5.0f, 4.4f},   // p2
   });
   input_array.FillWithPZ(pz);
-  Literal input_literal = *Literal::CreateR4FromArray4D(input_array);
+  std::unique_ptr<Literal> input_literal =
+      Literal::CreateR4FromArray4D(input_array);
 
   {
     ComputationBuilder builder(client_, TestName());
-    builder.ConstantLiteral(input_literal);
+    builder.ConstantLiteral(*input_literal);
     ComputeAndCompareR4<float>(&builder, input_array, {}, error_spec_);
   }
 
@@ -165,10 +166,10 @@ TEST_F(ConstantsTest, DISABLED_TupleConstant) {
 
   std::unique_ptr<Literal> result = ExecuteAndTransferOrDie(&builder, {});
 
-  LiteralTestUtil::ExpectR2Near<float>({{1.0}, {2.0}},
-                                       result->tuple_literals(0), error_spec_);
-  LiteralTestUtil::ExpectR1Near<float>({2.0, 42.0}, result->tuple_literals(1),
-                                       error_spec_);
+  LiteralTestUtil::ExpectR2Near<float>(
+      {{1.0}, {2.0}}, LiteralView::Create(*result, {0}), error_spec_);
+  LiteralTestUtil::ExpectR1Near<float>(
+      {2.0, 42.0}, LiteralView::Create(*result, {1}), error_spec_);
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index 2924c08615fa706bb19addf04bf58e1d5dd5a659..0ceb9aff378ae8aa8098be9360310b1d78d31ab2 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -105,8 +105,8 @@ TEST_F(ConvolutionTest, Convolve_1x1x1x2_1x1x1x2_Valid) {
   }));
 
   ComputeAndCompare(&builder, conv,
-                    {*Literal::CreateFromArray(input_data),
-                     *Literal::CreateFromArray(filter_data)},
+                    {std::move(*Literal::CreateFromArray(input_data)),
+                     std::move(*Literal::CreateFromArray(filter_data))},
                     error_spec_);
 }
 
@@ -136,8 +136,8 @@ TEST_F(ConvolutionTest, Convolve_1x1x4x4_1x1x2x2_Valid) {
   }));
   // clang-format on
   ComputeAndCompare(&builder, conv,
-                    {*Literal::CreateFromArray(input_data),
-                     *Literal::CreateFromArray(filter_data)},
+                    {std::move(*Literal::CreateFromArray(input_data)),
+                     std::move(*Literal::CreateFromArray(filter_data))},
                     error_spec_);
 }
 
@@ -167,8 +167,8 @@ TEST_F(ConvolutionTest, Convolve_1x1x4x4_1x1x2x2_Same) {
   }));
   // clang-format on
   ComputeAndCompare(&builder, conv,
-                    {*Literal::CreateFromArray(input_data),
-                     *Literal::CreateFromArray(filter_data)},
+                    {std::move(*Literal::CreateFromArray(input_data)),
+                     std::move(*Literal::CreateFromArray(filter_data))},
                     error_spec_);
 }
 
@@ -200,8 +200,8 @@ TEST_F(ConvolutionTest, Convolve_1x1x4x4_1x1x3x3_Same) {
   }));
   // clang-format on
   ComputeAndCompare(&builder, conv,
-                    {*Literal::CreateFromArray(input_data),
-                     *Literal::CreateFromArray(filter_data)},
+                    {std::move(*Literal::CreateFromArray(input_data)),
+                     std::move(*Literal::CreateFromArray(filter_data))},
                     error_spec_);
 }
 
@@ -501,10 +501,10 @@ XLA_TEST_P(ConvolveWithAndWithoutCanonicalization,
   Array2D<float> expected_result(29, 10);
   expected_result.Fill(0);
 
-  ComputeAndCompare(
-      &builder, conv,
-      {*Literal::CreateFromArray(param0), *Literal::CreateFromArray(param1)},
-      error_spec_);
+  ComputeAndCompare(&builder, conv,
+                    {std::move(*Literal::CreateFromArray(param0)),
+                     std::move(*Literal::CreateFromArray(param1))},
+                    error_spec_);
 }
 
 INSTANTIATE_TEST_CASE_P(ConvolveWithAndWithoutCanonicalization_Instantiation,
@@ -608,5 +608,28 @@ INSTANTIATE_TEST_CASE_P(
 
 );
 
+TEST_F(ConvolutionTest, Convolve_bf16_1x1x1x2_1x1x1x2_Valid) {
+  ComputationBuilder builder(client_, TestName());
+  Shape input_shape = ShapeUtil::MakeShape(BF16, {1, 1, 1, 2});
+  Shape filter_shape = ShapeUtil::MakeShape(BF16, {1, 1, 1, 2});
+  auto input = builder.Parameter(0, input_shape, "input");
+  auto filter = builder.Parameter(1, filter_shape, "filter");
+  auto conv = builder.Conv(input, filter, {1, 1}, Padding::kValid);
+
+  Array4D<bfloat16> input_data(1, 1, 1, 2);
+  input_data.FillWithYX(Array2D<bfloat16>({
+      {bfloat16(1), bfloat16(2)},
+  }));
+  Array4D<bfloat16> filter_data(1, 1, 1, 2);
+  filter_data.FillWithYX(Array2D<bfloat16>({
+      {bfloat16(5), bfloat16(6)},
+  }));
+
+  ComputeAndCompare(&builder, conv,
+                    {std::move(*Literal::CreateFromArray(input_data)),
+                     std::move(*Literal::CreateFromArray(filter_data))},
+                    error_spec_);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/copy_test.cc b/tensorflow/compiler/xla/tests/copy_test.cc
index bcb85b04eefa349df1c055e010d584b85b55a4a8..ece7c3b05e7fafa299db7f9cbf50610c8204f95e 100644
--- a/tensorflow/compiler/xla/tests/copy_test.cc
+++ b/tensorflow/compiler/xla/tests/copy_test.cc
@@ -40,7 +40,7 @@ class CopyOpTest : public HloTestBase {
   void TestCopyOp(const Literal& literal) {
     auto builder = HloComputation::Builder(TestName());
     auto constant = builder.AddInstruction(
-        HloInstruction::CreateConstant(MakeUnique<Literal>(literal)));
+        HloInstruction::CreateConstant(literal.CloneToUnique()));
     builder.AddInstruction(HloInstruction::CreateUnary(
         constant->shape(), HloOpcode::kCopy, constant));
     auto computation = builder.Build();
@@ -56,9 +56,13 @@ class CopyOpTest : public HloTestBase {
                                 tensorflow::gtl::ArraySlice<int64> permutation);
 };
 
-XLA_TEST_F(CopyOpTest, CopyR0Bool) { TestCopyOp(*Literal::CreateR0<bool>(true)); }
+XLA_TEST_F(CopyOpTest, CopyR0Bool) {
+  TestCopyOp(*Literal::CreateR0<bool>(true));
+}
 
-XLA_TEST_F(CopyOpTest, CopyR1S0U32) { TestCopyOp(*Literal::CreateR1<uint32>({})); }
+XLA_TEST_F(CopyOpTest, CopyR1S0U32) {
+  TestCopyOp(*Literal::CreateR1<uint32>({}));
+}
 
 XLA_TEST_F(CopyOpTest, CopyR1S3U32) {
   TestCopyOp(*Literal::CreateR1<uint32>({1, 2, 3}));
@@ -85,7 +89,6 @@ XLA_TEST_F(CopyOpTest, CopyParameterScalar) {
   // Copy literal to device to use as parameter.
   auto literal = Literal::CreateR0<float>(42.0);
   Shape shape = literal->shape();
-  auto constant_device_base = TransferToDevice(*literal);
 
   auto param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, shape, "param0"));
@@ -98,7 +101,7 @@ XLA_TEST_F(CopyOpTest, CopyParameterScalar) {
   module->AddEntryComputation(std::move(computation));
 
   std::unique_ptr<Literal> result =
-      ExecuteAndTransfer(std::move(module), {constant_device_base});
+      ExecuteAndTransfer(std::move(module), {literal.get()});
   LiteralTestUtil::ExpectR0Near<float>(42.0f, *result, error_spec_);
 }
 
@@ -129,7 +132,8 @@ XLA_TEST_F(CopyOpTest, CopyConstantR2DifferentLayouts) {
   std::unique_ptr<Literal> literal =
       Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
   // Reverse the minor-to-major order of the literal.
-  Layout* literal_layout = literal->mutable_shape()->mutable_layout();
+  Layout* literal_layout =
+      literal->mutable_shape_do_not_use()->mutable_layout();
   ASSERT_EQ(2, literal_layout->minor_to_major_size());
   literal_layout->mutable_minor_to_major()->SwapElements(0, 1);
 
diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc
index 74f73a1ddc15be033e52b0b45f9961e5dc3a1ecb..2d847a66b0ae7c8f09fa0cb181a4c84ea99be5b1 100644
--- a/tensorflow/compiler/xla/tests/custom_call_test.cc
+++ b/tensorflow/compiler/xla/tests/custom_call_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -128,5 +129,19 @@ XLA_TEST_F(CustomCallTest,
       Array3D<float>{{{2, 3}, {4, 5}}, {{3, 4}, {5, 6}}}, *result);
 }
 
+class CustomCallClientAPITest : public ClientLibraryTestBase {};
+
+// When using the client API, CustomCall targets can't begin with '$' -- these
+// are reserved for internal use.
+XLA_TEST_F(CustomCallClientAPITest, IllegalCustomCallTarget) {
+  ComputationBuilder builder(client_, TestName());
+  auto call = builder.CustomCall("$illegal", /*operands=*/{},
+                                 ShapeUtil::MakeShape(F32, {1}));
+
+  StatusOr<std::unique_ptr<GlobalData>> result =
+      Execute(&builder, /*arguments=*/{});
+  EXPECT_FALSE(result.ok());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index bfb04fd9f9bf6887c4462cb00fee00250517f5c4..6b0c04c2c083bbfce267dd92d24ef15c06186d26 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -51,8 +51,6 @@ class DotOperationTest : public ClientLibraryTestBase {
   template <typename Element>
   void TestNonsquareMatrixDot(bool lhs_row_major = false,
                               bool rhs_row_major = false);
-  void TestMatrixDot(int M, int K, int N, bool lhs_row_major = false,
-                     bool rhs_row_major = false);
 };
 
 XLA_TEST_F(DotOperationTest, ZeroElementVectorDotF32) {
@@ -199,158 +197,182 @@ void DotOperationTest::TestSquareMatrixDot(bool lhs_row_major,
       &builder, expected, {lhs_handle.get(), rhs_handle.get()}, error_spec_);
 }
 
-void DotOperationTest::TestMatrixDot(int M, int K, int N, bool lhs_row_major,
-                                     bool rhs_row_major) {
-  std::unique_ptr<Array2D<float>> lhs_data =
-      MakeLinspaceArray2D(0.0, 1.0, M, K);
-  std::unique_ptr<Literal> lhs_lit = Literal::CreateR2FromArray2DWithLayout(
-      *lhs_data,
-      LayoutUtil::MakeLayout(MinorToMajorForIsRowMajor(lhs_row_major)));
-  auto lhs_handle = client_->TransferToServer(*lhs_lit).ConsumeValueOrDie();
+struct DotTestParam {
+  int m;
+  int k;
+  int n;
+  bool dot_lhs_row_major;
+  bool dot_rhs_row_major;
+  bool has_addend;
+  bool addend_row_major;
+};
+
+string PrintDotTestParam(
+    const ::testing::TestParamInfo<DotTestParam>& test_param) {
+  const DotTestParam& param = test_param.param;
+  if (param.has_addend) {
+    return tensorflow::strings::StrCat(param.m, "x", param.k, "x", param.n,
+                                       "_MajorToMinor",
+                                       param.dot_lhs_row_major ? "T" : "F",
+                                       param.dot_rhs_row_major ? "T" : "F",
+                                       param.addend_row_major ? "T" : "F");
+  } else {
+    return tensorflow::strings::StrCat(param.m, "x", param.k, "x", param.n,
+                                       "_MajorToMinor",
+                                       param.dot_lhs_row_major ? "T" : "F",
+                                       param.dot_rhs_row_major ? "T" : "F");
+  }
+}
 
-  std::unique_ptr<Array2D<float>> rhs_data =
-      MakeLinspaceArray2D(0.0, 1.0, K, N);
-  std::unique_ptr<Literal> rhs_lit = Literal::CreateR2FromArray2DWithLayout(
-      *rhs_data,
-      LayoutUtil::MakeLayout(MinorToMajorForIsRowMajor(rhs_row_major)));
-  auto rhs_handle = client_->TransferToServer(*rhs_lit).ConsumeValueOrDie();
+class ParametricDotTest : public DotOperationTest,
+                          public ::testing::WithParamInterface<DotTestParam> {};
+
+XLA_TEST_P(ParametricDotTest, TestF32) {
+  DotTestParam param = GetParam();
+
+  std::unique_ptr<Array2D<float>> dot_lhs_data =
+      MakeLinspaceArray2D(0.0, 1.0, param.m, param.k);
+  std::unique_ptr<Literal> dot_lhs_lit = Literal::CreateR2FromArray2DWithLayout(
+      *dot_lhs_data, LayoutUtil::MakeLayout(
+                         MinorToMajorForIsRowMajor(param.dot_lhs_row_major)));
+  std::unique_ptr<GlobalData> dot_lhs_handle =
+      client_->TransferToServer(*dot_lhs_lit).ConsumeValueOrDie();
+
+  std::unique_ptr<Array2D<float>> dot_rhs_data =
+      MakeLinspaceArray2D(0.0, 1.0, param.k, param.n);
+  std::unique_ptr<Literal> dot_rhs_lit = Literal::CreateR2FromArray2DWithLayout(
+      *dot_rhs_data, LayoutUtil::MakeLayout(
+                         MinorToMajorForIsRowMajor(param.dot_rhs_row_major)));
+  std::unique_ptr<GlobalData> dot_rhs_handle =
+      client_->TransferToServer(*dot_rhs_lit).ConsumeValueOrDie();
+
+  std::unique_ptr<Array2D<float>> addend_data;
+  std::unique_ptr<Literal> addend_lit;
+  std::unique_ptr<GlobalData> addend_handle;
+
+  if (param.has_addend) {
+    addend_data = MakeLinspaceArray2D(0.0, 1.0, param.m, param.n);
+    addend_lit = Literal::CreateR2FromArray2DWithLayout(
+        *addend_data, LayoutUtil::MakeLayout(
+                          MinorToMajorForIsRowMajor(param.addend_row_major)));
+    addend_handle = client_->TransferToServer(*addend_lit).ConsumeValueOrDie();
+  }
 
   ComputationBuilder builder(client_, TestName());
   auto prim_type = primitive_util::NativeToPrimitiveType<float>();
   auto result = builder.Dot(
-      builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {M, K}), "lhs"),
-      builder.Parameter(1, ShapeUtil::MakeShape(prim_type, {K, N}), "rhs"));
-
-  std::unique_ptr<Array2D<float>> expected =
-      ReferenceUtil::MatmulArray2D(*lhs_data, *rhs_data);
-
-  ComputeAndCompareR2<float>(&builder, *expected,
-                             {lhs_handle.get(), rhs_handle.get()},
-                             ErrorSpec(0.3, 3e-3));
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_12_117_7_MinorToMajorTF) {
-  TestMatrixDot(12, 117, 7, true, false);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_12_117_7_MinorToMajorFT) {
-  TestMatrixDot(12, 117, 7, false, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_12_117_7_MinorToMajorTT) {
-  TestMatrixDot(12, 117, 7, true, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_12_117_7_MinorToMajorFF) {
-  TestMatrixDot(12, 117, 7, false, false);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_270_270_520_MinorToMajorTT) {
-  TestMatrixDot(270, 270, 520, true, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_270_270_520_MinorToMajorTF) {
-  TestMatrixDot(270, 270, 520, true, false);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_270_270_520_MinorToMajorFT) {
-  TestMatrixDot(270, 270, 520, false, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_270_270_520_MinorToMajorFF) {
-  TestMatrixDot(270, 270, 520, false, false);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_260_3_520_MinorToMajorTT) {
-  TestMatrixDot(269, 3, 520, true, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_260_3_520_MinorToMajorTF) {
-  TestMatrixDot(260, 3, 520, true, false);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_260_3_520_MinorToMajorFT) {
-  TestMatrixDot(260, 3, 520, false, true);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixDotF32_260_3_520_MinorToMajorFF) {
-  TestMatrixDot(260, 3, 520, false, false);
-}
-
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x8x8) {
-  TestMatrixDot(1, 8, 8, true, true);
-}
+      builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {param.m, param.k}),
+                        "dot_lhs"),
+      builder.Parameter(1, ShapeUtil::MakeShape(prim_type, {param.k, param.n}),
+                        "dot_rhs"));
+
+  if (param.has_addend) {
+    result = builder.Add(
+        result,
+        builder.Parameter(
+            2, ShapeUtil::MakeShape(prim_type, {param.m, param.n}), "addend"));
+  }
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x130x8) {
-  TestMatrixDot(1, 130, 8, true, true);
-}
+  std::unique_ptr<Array2D<float>> expected;
+  if (param.has_addend) {
+    expected = ReferenceUtil::ApplyElementwise2D(
+        std::plus<float>(),
+        *ReferenceUtil::MatmulArray2D(*dot_lhs_data, *dot_rhs_data),
+        *addend_data);
+  } else {
+    expected = ReferenceUtil::MatmulArray2D(*dot_lhs_data, *dot_rhs_data);
+  }
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x8x130) {
-  TestMatrixDot(1, 8, 130, true, true);
-}
+  std::vector<GlobalData*> args = {dot_lhs_handle.get(), dot_rhs_handle.get()};
+  if (param.has_addend) {
+    args.push_back(addend_handle.get());
+  }
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x290x130) {
-  TestMatrixDot(1, 290, 130, true, true);
+  ComputeAndCompareR2<float>(&builder, *expected, args, ErrorSpec(0.3, 3e-3));
 }
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_2x1x1) {
-  TestMatrixDot(2, 1, 1, true, true);
-}
+std::vector<DotTestParam> CreateDotTestParameters() {
+  std::vector<DotTestParam> params;
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_8x8x1) {
-  TestMatrixDot(8, 8, 1, true, true);
-}
+  auto add_matrix_matrix_dot_test = [&](int m, int k, int n) {
+    for (bool lhs_row_major : {true, false}) {
+      for (bool rhs_row_major : {true, false}) {
+        params.push_back({/*m=*/m, /*k=*/k, /*n=*/n,
+                          /*dot_lhs_row_major=*/lhs_row_major,
+                          /*dot_rhs_row_major=*/rhs_row_major,
+                          /*has_addend=*/false, /*addend_row_major=*/true});
+      }
+    }
+  };
+
+  auto add_matrix_vector_dot_test = [&](int k, int n) {
+    for (bool has_addend : {false, true}) {
+      params.push_back({/*m=*/1, /*k=*/k, /*n=*/n,
+                        /*dot_lhs_row_major=*/true, /*dot_rhs_row_major=*/true,
+                        /*has_addend=*/has_addend, /*addend_row_major=*/true});
+      if (n != 1) {
+        params.push_back(
+            {/*m=*/n, /*k=*/k, /*n=*/1,
+             /*dot_lhs_row_major=*/true, /*dot_rhs_row_major=*/true,
+             /*has_addend=*/has_addend, /*addend_row_major=*/true});
+      }
+    }
+  };
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_16x1x1) {
-  TestMatrixDot(16, 1, 1, true, true);
-}
+  add_matrix_matrix_dot_test(/*m=*/12, /*k=*/117, /*n=*/7);
+  add_matrix_matrix_dot_test(/*m=*/270, /*k=*/270, /*n=*/520);
+  add_matrix_matrix_dot_test(/*m=*/260, /*k=*/3, /*n=*/520);
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_16x3x1) {
-  TestMatrixDot(16, 3, 1, true, true);
-}
+  add_matrix_vector_dot_test(/*k=*/8, /*n=*/8);
+  add_matrix_vector_dot_test(/*k=*/130, /*n=*/8);
+  add_matrix_vector_dot_test(/*k=*/8, /*n=*/130);
+  add_matrix_vector_dot_test(/*k=*/290, /*n=*/130);
+  add_matrix_vector_dot_test(/*k=*/1, /*n=*/1);
+  add_matrix_vector_dot_test(/*k=*/1, /*n=*/16);
+  add_matrix_vector_dot_test(/*k=*/3, /*n=*/16);
+  add_matrix_vector_dot_test(/*k=*/3, /*n=*/3);
+  add_matrix_vector_dot_test(/*k=*/29, /*n=*/29);
+  add_matrix_vector_dot_test(/*k=*/8, /*n=*/2);
+  add_matrix_vector_dot_test(/*k=*/2, /*n=*/8);
+  add_matrix_vector_dot_test(/*k=*/259, /*n=*/258);
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_3x3x1) {
-  TestMatrixDot(3, 3, 1, true, true);
+  return params;
 }
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_29x29x1) {
-  TestMatrixDot(29, 29, 1, true, true);
-}
+INSTANTIATE_TEST_CASE_P(DotTests, ParametricDotTest,
+                        ::testing::ValuesIn(CreateDotTestParameters()),
+                        PrintDotTestParam);
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x8x2) {
-  TestMatrixDot(1, 8, 2, true, true);
+XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorFF) {
+  TestSquareMatrixDot<float>(false, false);
 }
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x2x8) {
-  TestMatrixDot(1, 2, 8, true, true);
+XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorFT) {
+  TestSquareMatrixDot<float>(false, true);
 }
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_259x258x1) {
-  TestMatrixDot(259, 258, 1, true, true);
+XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorTF) {
+  TestSquareMatrixDot<float>(true, false);
 }
 
-XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_259x258x1_FT) {
-  TestMatrixDot(259, 258, 1, false, true);
+XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorTT) {
+  TestSquareMatrixDot<float>(true, true);
 }
 
-XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorFF) {
-  constexpr bool kLhsRowMajor = false;
-  constexpr bool kRhsRowMajor = false;
-  TestSquareMatrixDot<float>(kLhsRowMajor, kRhsRowMajor);
+XLA_TEST_F(DotOperationTest, SquareMatrixDotC64MinorToMajorFF) {
+  TestSquareMatrixDot<complex64>(false, false);
 }
 
-XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorFT) {
-  TestSquareMatrixDot<float>(false, true);
+XLA_TEST_F(DotOperationTest, SquareMatrixDotC64MinorToMajorFT) {
+  TestSquareMatrixDot<complex64>(false, true);
 }
 
-XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorTF) {
-  TestSquareMatrixDot<float>(true, false);
+XLA_TEST_F(DotOperationTest, SquareMatrixDotC64MinorToMajorTF) {
+  TestSquareMatrixDot<complex64>(true, false);
 }
 
-TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorTT) {
-  constexpr bool kLhsRowMajor = true;
-  constexpr bool kRhsRowMajor = true;
-  TestSquareMatrixDot<float>(kLhsRowMajor, kRhsRowMajor);
+XLA_TEST_F(DotOperationTest, SquareMatrixDotC64MinorToMajorTT) {
+  TestSquareMatrixDot<complex64>(true, true);
 }
 
 XLA_TEST_F(DotOperationTest, SquareMatrixDotF64) {
@@ -498,9 +520,39 @@ XLA_TEST_F(DotOperationTest, BatchMatMul) {
 
   ComputeAndCompareR4<float>(
       &builder,
-      /*expected=*/{{{{1300, 2400}, {13, 24}}, {{11400, 13600}, {114, 136}}},
-                    {{{42900, 79200}, {429, 792}},
-                     {{250800, 299200}, {2508, 2992}}}},
+      /*expected=*/
+      {{{{1300, 2400}, {13, 24}}, {{11400, 13600}, {114, 136}}},
+       {{{42900, 79200}, {429, 792}}, {{250800, 299200}, {2508, 2992}}}},
+      {x_data.get(), y_data.get()}, error_spec_);
+}
+
+XLA_TEST_F(DotOperationTest, GeneralMatMul) {
+  ComputationBuilder builder(client_, TestName());
+  auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2, 2}), "x");
+  auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2, 2}), "y");
+
+  DotDimensionNumbers dnums;
+  dnums.add_lhs_contracting_dimensions(2);
+  dnums.add_rhs_contracting_dimensions(1);
+  dnums.add_lhs_batch_dimensions(0);
+  dnums.add_rhs_batch_dimensions(0);
+
+  auto out = builder.DotGeneral(x, y, dnums);
+
+  auto x_data = client_
+                    ->TransferToServer(*Literal::CreateR3<float>(
+                        {{{1.0, 2.0}, {3.0, 4.0}}, {{5.0, 6.0}, {7.0, 8.0}}}))
+                    .ConsumeValueOrDie();
+
+  auto y_data = client_
+                    ->TransferToServer(*Literal::CreateR3<float>(
+                        {{{1.0, 0.0}, {0.0, 1.0}}, {{1.0, 0.0}, {0.0, 1.0}}}))
+                    .ConsumeValueOrDie();
+
+  ComputeAndCompareR3<float>(
+      &builder,
+      /*expected=*/
+      {{{1.0, 2.0}, {3.0, 4.0}}, {{5.0, 6.0}, {7.0, 8.0}}},
       {x_data.get(), y_data.get()}, error_spec_);
 }
 
@@ -561,5 +613,95 @@ TEST_F(DotOperationTest, TransposeFolding) {
   }
 }
 
+TEST_F(DotOperationTest, DotOfConcatOptimizationWithConstLHS) {
+  auto prim_type = primitive_util::NativeToPrimitiveType<float>();
+
+  std::unique_ptr<Array2D<float>> constant_lhs_array(new Array2D<float>(
+      {{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}}));
+
+  ComputationBuilder builder(client_, TestName());
+  auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array);
+  auto rhs_arg_0 = builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {2, 2}),
+                                     "rhs_arg_0");
+  auto rhs_arg_1 = builder.Parameter(1, ShapeUtil::MakeShape(prim_type, {3, 2}),
+                                     "rhs_arg_1");
+  auto rhs_arg_2 = builder.Parameter(2, ShapeUtil::MakeShape(prim_type, {1, 2}),
+                                     "rhs_arg_2");
+  auto result = builder.Dot(
+      lhs_constant, builder.ConcatInDim({rhs_arg_0, rhs_arg_1, rhs_arg_2}, 0));
+
+  std::unique_ptr<Array2D<float>> arg_0_value_array(
+      new Array2D<float>({{1.0, 2.0}, {3.0, 4.0}}));
+  std::unique_ptr<Array2D<float>> arg_1_value_array(
+      new Array2D<float>({{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}));
+  std::unique_ptr<Array2D<float>> arg_2_value_array(
+      new Array2D<float>({{1.0, 2.0}}));
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto arg_0_value,
+      client_->TransferToServer(
+          *Literal::CreateR2FromArray2D<float>(*arg_0_value_array)));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto arg_1_value,
+      client_->TransferToServer(
+          *Literal::CreateR2FromArray2D<float>(*arg_1_value_array)));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto arg_2_value,
+      client_->TransferToServer(
+          *Literal::CreateR2FromArray2D<float>(*arg_2_value_array)));
+
+  Array2D<float> expected({{53.0, 74.0}, {45.0, 66.0}});
+  ComputeAndCompareR2<float>(
+      &builder, expected,
+      {arg_0_value.get(), arg_1_value.get(), arg_2_value.get()}, error_spec_);
+}
+
+TEST_F(DotOperationTest, DotOfConcatOptimizationWithConstRHS) {
+  auto prim_type = primitive_util::NativeToPrimitiveType<float>();
+
+  std::unique_ptr<Array2D<float>> constant_rhs_array(
+      new Array2D<float>({{1.0, 2.0},
+                          {3.0, 4.0},
+                          {5.0, 6.0},
+                          {6.0, 5.0},
+                          {4.0, 3.0},
+                          {2.0, 1.0}}));
+
+  ComputationBuilder builder(client_, TestName());
+  auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array);
+  auto lhs_arg_0 = builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {2, 2}),
+                                     "lhs_arg_0");
+  auto lhs_arg_1 = builder.Parameter(1, ShapeUtil::MakeShape(prim_type, {2, 3}),
+                                     "lhs_arg_1");
+  auto lhs_arg_2 = builder.Parameter(2, ShapeUtil::MakeShape(prim_type, {2, 1}),
+                                     "lhs_arg_2");
+  auto result = builder.Dot(
+      builder.ConcatInDim({lhs_arg_0, lhs_arg_1, lhs_arg_2}, 1), rhs_constant);
+
+  std::unique_ptr<Array2D<float>> arg_0_value_array(
+      new Array2D<float>({{1.0, 2.0}, {3.0, 4.0}}));
+  std::unique_ptr<Array2D<float>> arg_1_value_array(
+      new Array2D<float>({{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}));
+  std::unique_ptr<Array2D<float>> arg_2_value_array(
+      new Array2D<float>({{1.0}, {2.0}}));
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto arg_0_value,
+      client_->TransferToServer(
+          *Literal::CreateR2FromArray2D<float>(*arg_0_value_array)));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto arg_1_value,
+      client_->TransferToServer(
+          *Literal::CreateR2FromArray2D<float>(*arg_1_value_array)));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto arg_2_value,
+      client_->TransferToServer(
+          *Literal::CreateR2FromArray2D<float>(*arg_2_value_array)));
+
+  Array2D<float> expected({{38.0, 36.0}, {93.0, 91.0}});
+  ComputeAndCompareR2<float>(
+      &builder, expected,
+      {arg_0_value.get(), arg_1_value.get(), arg_2_value.get()}, error_spec_);
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index 8baaf39e3cf8fa7f6fa4a0224c1297f82e0d92aa..877dc7db0eec229a7119b3627f177a33ed0d971b 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -51,12 +51,16 @@ class DynamicSliceTest : public ClientLibraryTestBase {
     RunR1<IndexT, DataT>({0, 1, 2, 3, 4, 5, 6, 7}, {2}, {3}, {2, 3, 4});
     // Slice at dimension boundaries.
     RunR1<IndexT, DataT>({0, 1, 2, 3, 4, 5, 6, 7}, {5}, {3}, {5, 6, 7});
-    // Slice at dimension boundaries, but with sizes that cause indices to wrap.
-    RunR1<IndexT, DataT>({0, 1, 2, 3, 4, 5, 6, 7}, {6}, {4}, {6, 7, 0, 1});
     // Zero element slice.
     RunR1<IndexT, DataT>({0, 1, 2, 3, 4, 5, 6, 7}, {2}, {0}, {});
   }
 
+  template <typename IndexT, typename DataT>
+  void TestR1Wrap() {
+    // Slice at dimension boundaries, but with sizes that cause indices to wrap.
+    RunR1<IndexT, DataT>({0, 1, 2, 3, 4, 5, 6, 7}, {6}, {4}, {6, 7, 0, 1});
+  }
+
   template <typename IndexT, typename DataT>
   void TestR2() {
     // Slice at dimension start.
@@ -68,15 +72,19 @@ class DynamicSliceTest : public ClientLibraryTestBase {
     // Slice at dimension boundaries.
     RunR2<IndexT, DataT>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {1, 1}, {2, 1},
                          {{5}, {8}});
-    // Slice at dimension boundaries, but with sizes that cause indices to wrap.
-    RunR2<IndexT, DataT>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {1, 1}, {3, 3},
-                         {{5, 6, 4}, {8, 9, 7}, {2, 3, 1}});
     // Zero element slice: 2x0.
     RunR2<IndexT, DataT>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {0, 0}, {2, 0},
                          {{}, {}});
     // Zero element slice: 0x2.
     RunR2<IndexT, DataT>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {0, 0}, {0, 2},
-                         Array2D<DataT>(0, 2));
+                         Array2D<int>(0, 2));
+  }
+
+  template <typename IndexT, typename DataT>
+  void TestR2Wrap() {
+    // Slice at dimension boundaries, but with sizes that cause indices to wrap.
+    RunR2<IndexT, DataT>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {1, 1}, {3, 3},
+                         {{5, 6, 4}, {8, 9, 7}, {2, 3, 1}});
   }
 
   template <typename IndexT, typename DataT>
@@ -97,85 +105,119 @@ class DynamicSliceTest : public ClientLibraryTestBase {
        {{7, 8}, {9, 10}, {11, 12}}},
       {0, 1, 1}, {2, 2, 1},
       {{{4}, {6}}, {{10}, {12}}});
+    // clang-format on
+  }
 
+  template <typename IndexT, typename DataT>
+  void TestR3Wrap() {
     // Slice at dimension boundaries, but with sizes that cause indices to wrap.
     RunR3<IndexT, DataT>(
       {{{1, 2}, {3, 4}, {5, 6}},
        {{7, 8}, {9, 10}, {11, 12}}},
       {0, 2, 1}, {2, 1, 2},
       {{{6, 5}}, {{12, 11}}});
-
-    // clang-format on
   }
 
   template <typename IndexT, typename DataT>
-  void RunR1(tensorflow::gtl::ArraySlice<DataT> input_values,
+  void RunR1(tensorflow::gtl::ArraySlice<int> input_values_int,
              const std::vector<IndexT> slice_starts,
              const std::vector<int64>& slice_sizes,
-             tensorflow::gtl::ArraySlice<DataT> expected_values) {
+             tensorflow::gtl::ArraySlice<int> expected_values_int) {
+    // bfloat16 has explicit constructors, so it does not implicitly convert the
+    // way built-in types do, which is why we can't take the parameter as an
+    // ArraySlice<DataT>. We also can't convert it to a vector, because
+    // vector<bool> is special so that it cannot be an ArraySlice<bool>, which
+    // is what the code below wants. So instead we do this.
+    Literal input_values =
+        std::move(*Literal::CreateR1(input_values_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+    Literal expected_values =
+        std::move(*Literal::CreateR1(expected_values_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+
     ComputationBuilder builder(client_, TestName());
     // Initialize and transfer dynamic slice start indices parameter.
     ComputationDataHandle starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
-    auto input = builder.ConstantR1<DataT>(input_values);
+    auto input = builder.ConstantLiteral(input_values);
     builder.DynamicSlice(input, starts, slice_sizes);
     // Run computation and compare against expected values.
-    ComputeAndCompareR1<DataT>(&builder, expected_values, {start_data.get()});
+    ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()});
   }
 
   template <typename IndexT, typename DataT>
-  void RunR2(const Array2D<DataT>& input_values,
+  void RunR2(const Array2D<int>& input_values_int,
              const std::vector<IndexT> slice_starts,
              const std::vector<int64>& slice_sizes,
-             const Array2D<DataT>& expected_values) {
+             const Array2D<int>& expected_values_int) {
+    Literal input_values =
+        std::move(*Literal::CreateR2FromArray2D(input_values_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+    Literal expected_values =
+        std::move(*Literal::CreateR2FromArray2D(expected_values_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+
     ComputationBuilder builder(client_, TestName());
     // Initialize and transfer dynamic slice start indices parameter.
     ComputationDataHandle starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
-    auto input = builder.ConstantR2FromArray2D<DataT>(input_values);
+    auto input = builder.ConstantLiteral(input_values);
     builder.DynamicSlice(input, starts, slice_sizes);
     // Run computation and compare against expected values.
-    ComputeAndCompareR2<DataT>(&builder, expected_values, {start_data.get()});
+    ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()});
   }
 
   template <typename IndexT, typename DataT>
-  void RunR3(const Array3D<DataT>& input_values,
+  void RunR3(const Array3D<int>& input_values_int,
              const std::vector<IndexT> slice_starts,
              const std::vector<int64>& slice_sizes,
-             const Array3D<DataT>& expected_values) {
+             const Array3D<int>& expected_values_int) {
+    Literal input_values =
+        std::move(*Literal::CreateR3FromArray3D(input_values_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+    Literal expected_values =
+        std::move(*Literal::CreateR3FromArray3D(expected_values_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+
     ComputationBuilder builder(client_, TestName());
     // Initialize and transfer dynamic slice start indices parameter.
     ComputationDataHandle starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
-    auto input = builder.ConstantR3FromArray3D<DataT>(input_values);
+    auto input = builder.ConstantLiteral(input_values);
     builder.DynamicSlice(input, starts, slice_sizes);
     // Run computation and compare against expected values.
-    ComputeAndCompareR3<DataT>(&builder, expected_values, {start_data.get()});
+    ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()});
   }
 };
 
+XLA_TEST_F(DynamicSliceTest, Int32R1BF16) { TestR1<int32, bfloat16>(); }
 XLA_TEST_F(DynamicSliceTest, Int32R1) { TestR1<int32, int32>(); }
-
+XLA_TEST_F(DynamicSliceTest, Int32R1Wrap) { TestR1Wrap<int32, int32>(); }
 XLA_TEST_F(DynamicSliceTest, Int64R1) { TestR1<int64, float>(); }
-
 XLA_TEST_F(DynamicSliceTest, UInt64R1) { TestR1<uint64, double>(); }
 
-XLA_TEST_F(DynamicSliceTest, Int32R2) { TestR2<int32, float>(); }
-
+XLA_TEST_F(DynamicSliceTest, Int32R2BF16) { TestR2<int32, bfloat16>(); }
+XLA_TEST_F(DynamicSliceTest, Int32R2) { TestR2<int32, int32>(); }
+XLA_TEST_F(DynamicSliceTest, Int32R2Wrap) { TestR2Wrap<int32, int32>(); }
 XLA_TEST_F(DynamicSliceTest, Int64R2) { TestR2<int64, double>(); }
-
 XLA_TEST_F(DynamicSliceTest, UInt64R2) { TestR2<uint64, int32>(); }
 
-XLA_TEST_F(DynamicSliceTest, Int32R3) { TestR3<int32, int32>(); }
-
+XLA_TEST_F(DynamicSliceTest, Int32R3BF16) { TestR3<int32, bfloat16>(); }
+XLA_TEST_F(DynamicSliceTest, Int32R3) { TestR3<int32, float>(); }
+XLA_TEST_F(DynamicSliceTest, Int32R3Wrap) { TestR3Wrap<int32, float>(); }
 XLA_TEST_F(DynamicSliceTest, Int64R3) { TestR3<int64, float>(); }
-
 XLA_TEST_F(DynamicSliceTest, UInt64R3) { TestR3<uint64, double>(); }
 
 XLA_TEST_F(DynamicSliceTest, Int32R1Pred) {
@@ -213,7 +255,7 @@ XLA_TEST_F(DynamicSliceTest, Int32R2Pred) {
   // Zero element slice: 0x2.
   RunR2<int32, bool>(
       {{true, false, true}, {false, false, true}, {true, true, false}}, {0, 0},
-      {0, 2}, Array2D<bool>(0, 2));
+      {0, 2}, Array2D<int>(0, 2));
 }
 
 XLA_TEST_F(DynamicSliceTest, Int32R3Pred) {
@@ -300,107 +342,154 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
   }
 
   template <typename IndexT, typename DataT>
-  void RunR1(tensorflow::gtl::ArraySlice<DataT> input_values,
-             tensorflow::gtl::ArraySlice<DataT> update_values,
+  void RunR1(tensorflow::gtl::ArraySlice<int> input_values_int,
+             tensorflow::gtl::ArraySlice<int> update_values_int,
              const std::vector<IndexT> slice_starts,
-             tensorflow::gtl::ArraySlice<DataT> expected_values) {
+             tensorflow::gtl::ArraySlice<int> expected_values_int) {
+    Literal input_values =
+        std::move(*Literal::CreateR1(input_values_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+    Literal update_values =
+        std::move(*Literal::CreateR1(update_values_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+    Literal expected_values =
+        std::move(*Literal::CreateR1(expected_values_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+
     ComputationBuilder builder(client_, TestName());
     // Initialize and transfer dynamic slice start indices parameter.
     ComputationDataHandle starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
-    auto input = builder.ConstantR1<DataT>(input_values);
-    auto update = builder.ConstantR1<DataT>(update_values);
+    auto input = builder.ConstantLiteral(input_values);
+    auto update = builder.ConstantLiteral(update_values);
     builder.DynamicUpdateSlice(input, update, starts);
     // Run computation and compare against expected values.
-    ComputeAndCompareR1<DataT>(&builder, expected_values, {start_data.get()});
+    ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()});
   }
 
   template <typename IndexT, typename DataT>
-  void RunR2(const Array2D<DataT>& input_values,
-             const Array2D<DataT>& update_values,
+  void RunR2(const Array2D<int>& input_values_int,
+             const Array2D<int>& update_values_int,
              const std::vector<IndexT> slice_starts,
-             const Array2D<DataT>& expected_values) {
+             const Array2D<int>& expected_values_int) {
+    Literal input_values =
+        std::move(*Literal::CreateR2FromArray2D(input_values_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+    Literal update_values =
+        std::move(*Literal::CreateR2FromArray2D(update_values_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+    Literal expected_values =
+        std::move(*Literal::CreateR2FromArray2D(expected_values_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+
     ComputationBuilder builder(client_, TestName());
     // Initialize and transfer dynamic slice start indices parameter.
     ComputationDataHandle starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
-    auto input = builder.ConstantR2FromArray2D<DataT>(input_values);
-    auto update = builder.ConstantR2FromArray2D<DataT>(update_values);
+    auto input = builder.ConstantLiteral(input_values);
+    auto update = builder.ConstantLiteral(update_values);
     builder.DynamicUpdateSlice(input, update, starts);
     // Run computation and compare against expected values.
-    ComputeAndCompareR2<DataT>(&builder, expected_values, {start_data.get()});
+    ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()});
   }
 
   template <typename IndexT, typename DataT>
-  void RunR3(const Array3D<DataT>& input_values,
-             const Array3D<DataT>& update_values,
+  void RunR3(const Array3D<int>& input_values_int,
+             const Array3D<int>& update_values_int,
              const std::vector<IndexT> slice_starts,
-             const Array3D<DataT>& expected_values) {
+             const Array3D<int>& expected_values_int) {
+    Literal input_values =
+        std::move(*Literal::CreateR3FromArray3D(input_values_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+    Literal update_values =
+        std::move(*Literal::CreateR3FromArray3D(update_values_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+    Literal expected_values =
+        std::move(*Literal::CreateR3FromArray3D(expected_values_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+
     ComputationBuilder builder(client_, TestName());
     // Initialize and transfer dynamic slice start indices parameter.
     ComputationDataHandle starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
-    auto input = builder.ConstantR3FromArray3D<DataT>(input_values);
-    auto update = builder.ConstantR3FromArray3D<DataT>(update_values);
+    auto input = builder.ConstantLiteral(input_values);
+    auto update = builder.ConstantLiteral(update_values);
     builder.DynamicUpdateSlice(input, update, starts);
     // Run computation and compare against expected values.
-    ComputeAndCompareR3<DataT>(&builder, expected_values, {start_data.get()});
+    ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()});
   }
 
+  template <class T>
   void RunR3Contiguous(std::vector<int32> operand_shape, int32 index,
                        int32 size) {
+#ifdef XLA_TEST_BACKEND_CPU_PARALLEL
+    // TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10.
+    if (std::is_same<bfloat16, T>::value) {
+      return;
+    }
+#endif
+
     const int32 kSeq = operand_shape[0];
     const int32 kBatch = operand_shape[1];
     const int32 kDim = operand_shape[2];
-    Array3D<float> input_values(kSeq, kBatch, kDim);
-    Array3D<float> update_values(size, kBatch, kDim);
-    Array3D<float> expected_values(kSeq, kBatch, kDim);
+    Array3D<T> input_values(kSeq, kBatch, kDim);
+    Array3D<T> update_values(size, kBatch, kDim);
+    Array3D<T> expected_values(kSeq, kBatch, kDim);
 
-    input_values.FillIota(0);
-    float val = 1000;
-    update_values.FillIota(val);
+    input_values.FillIota(static_cast<T>(0));
+    T value = static_cast<T>(10);
+    update_values.FillIota(static_cast<T>(value));
 
     // TODO(b/34128753) Expected values may vary depending on backend when
     // the update wraps. According to documentation, the results are technically
     // implementation specific where the update is out of bounds, and hence
     // we don't really know what to pass into ComputeAndCompareR3.
-    expected_values.FillIota(0);
+    expected_values.FillIota(static_cast<T>(0));
     for (int i = 0; i < size; i++) {
       for (int j = 0; j < kBatch; j++) {
         for (int k = 0; k < kDim; k++) {
-          expected_values((index + i) % kSeq, j, k) = val++;
+          expected_values((index + i) % kSeq, j, k) = value++;
         }
       }
     }
     if (VLOG_IS_ON(1)) {
-      DumpArray<float>("input", input_values);
-      DumpArray<float>("update", update_values);
-      DumpArray<float>("expected", expected_values);
+      DumpArray<T>("input", input_values);
+      DumpArray<T>("update", update_values);
+      DumpArray<T>("expected", expected_values);
     }
 
     // Build dynamic slice computation.
     ComputationBuilder builder(client_, TestName());
     // Initialize and transfer input parameter.
     ComputationDataHandle input;
-    std::unique_ptr<GlobalData> input_data = CreateR3Parameter<float>(
-        input_values, 0, "input_values", &builder, &input);
+    std::unique_ptr<GlobalData> input_data =
+        CreateR3Parameter<T>(input_values, 0, "input_values", &builder, &input);
     // Initialize and transfer update parameter.
     ComputationDataHandle update;
-    std::unique_ptr<GlobalData> update_data = CreateR3Parameter<float>(
+    std::unique_ptr<GlobalData> update_data = CreateR3Parameter<T>(
         update_values, 1, "update_values", &builder, &update);
     auto starts = builder.ConstantR1<int32>({index, 0, 0});
     builder.DynamicUpdateSlice(input, update, starts);
 
     // Run computation and compare against expected values.
-    ComputeAndCompareR3<float>(&builder, expected_values,
-                               {input_data.get(), update_data.get()},
-                               ErrorSpec(0.000001));
+    ComputeAndCompareR3<T>(&builder, expected_values,
+                           {input_data.get(), update_data.get()},
+                           ErrorSpec(0.000001));
   }
 
   template <typename NativeT>
@@ -411,28 +500,35 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
   }
 };
 
+// TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10.
+XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32R1BF16)) {
+  TestR1<int32, bfloat16>();
+}
 XLA_TEST_F(DynamicUpdateSliceTest, Int32R1) { TestR1<int32, float>(); }
-
 XLA_TEST_F(DynamicUpdateSliceTest, Int64R1) { TestR1<int64, float>(); }
-
 XLA_TEST_F(DynamicUpdateSliceTest, UInt64R1) { TestR1<uint64, double>(); }
 
+// TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10.
+XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32R2BF16)) {
+  TestR2<int32, bfloat16>();
+}
 XLA_TEST_F(DynamicUpdateSliceTest, Int32R2) { TestR2<int32, float>(); }
-
 XLA_TEST_F(DynamicUpdateSliceTest, Int64R2) { TestR2<int64, int64>(); }
-
 XLA_TEST_F(DynamicUpdateSliceTest, UInt64R2) { TestR2<uint64, int32>(); }
 
+// TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10.
+XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32R3BF16)) {
+  TestR3<int32, bfloat16>();
+}
 XLA_TEST_F(DynamicUpdateSliceTest, Int32R3) { TestR3<int32, float>(); }
-
 XLA_TEST_F(DynamicUpdateSliceTest, Int64R3) { TestR3<int64, int64>(); }
-
 XLA_TEST_F(DynamicUpdateSliceTest, UInt64R3) { TestR3<uint64, uint64>(); }
 
+XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32WrapBF16)) {
+  TestWrap<int32, bfloat16>();
+}
 XLA_TEST_F(DynamicUpdateSliceTest, Int32Wrap) { TestWrap<int32, float>(); }
-
 XLA_TEST_F(DynamicUpdateSliceTest, Int64Wrap) { TestWrap<int64, int64>(); }
-
 XLA_TEST_F(DynamicUpdateSliceTest, UInt64Wrap) { TestWrap<uint64, uint64>(); }
 
 XLA_TEST_F(DynamicUpdateSliceTest, Int32R1Pred) {
@@ -498,36 +594,70 @@ XLA_TEST_F(DynamicUpdateSliceTest, Int32R3Pred) {
 XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousSingleElement) {
   // Single element, no wrap.
   std::vector<int32> operand_shape({4, 5, 2});
-  RunR3Contiguous(operand_shape, /*index=*/1, /*size=*/1);
+  RunR3Contiguous<float>(operand_shape, /*index=*/1, /*size=*/1);
+}
+
+XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousSingleElementBF16) {
+  // Single element, no wrap.
+  std::vector<int32> operand_shape({4, 5, 2});
+  RunR3Contiguous<bfloat16>(operand_shape, /*index=*/1, /*size=*/1);
 }
 
 XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousMultipleElements) {
   // Multiple element, no wrap.
   std::vector<int32> operand_shape({4, 5, 2});
-  RunR3Contiguous(operand_shape, /*index=*/1, /*size=*/2);
+  RunR3Contiguous<float>(operand_shape, /*index=*/1, /*size=*/2);
+}
+
+XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousMultipleElementsBF16) {
+  // Multiple element, no wrap.
+  std::vector<int32> operand_shape({4, 5, 2});
+  RunR3Contiguous<bfloat16>(operand_shape, /*index=*/1, /*size=*/2);
 }
 
 XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousMultipleWrapping) {
   // Multiple element, wrapping.
   std::vector<int32> operand_shape({4, 5, 2});
-  RunR3Contiguous(operand_shape, /*index=*/3, /*size=*/2);
+  RunR3Contiguous<float>(operand_shape, /*index=*/3, /*size=*/2);
+}
+
+XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousMultipleWrappingBF16) {
+  // Multiple element, wrapping.
+  std::vector<int32> operand_shape({4, 5, 2});
+  RunR3Contiguous<bfloat16>(operand_shape, /*index=*/3, /*size=*/2);
 }
 
 XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousTooLarge) {
   // Multiple element, update size larger than operand.
   std::vector<int32> operand_shape({4, 5, 2});
-  RunR3Contiguous(operand_shape, /*index=*/5, /*size=*/2);
+  RunR3Contiguous<float>(operand_shape, /*index=*/5, /*size=*/2);
+}
+
+XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousTooLargeBF16) {
+  // Multiple element, update size larger than operand.
+  std::vector<int32> operand_shape({4, 5, 2});
+  RunR3Contiguous<bfloat16>(operand_shape, /*index=*/5, /*size=*/2);
 }
 
 XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousUnaligned) {
   std::vector<int32> operand_shape({3, 123, 247});
-  RunR3Contiguous(operand_shape, /*index=*/1, /*size=*/1);
+  RunR3Contiguous<float>(operand_shape, /*index=*/1, /*size=*/1);
+}
+
+XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousUnalignedBF16) {
+  std::vector<int32> operand_shape({3, 123, 247});
+  RunR3Contiguous<bfloat16>(operand_shape, /*index=*/1, /*size=*/1);
 }
 
 // TODO(b/34134076) Disabled on GPU 2016-01-06 due to out-of-memory error.
 XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_GPU(R3ContiguousLarger)) {
   std::vector<int32> operand_shape({32, 128, 1024});
-  RunR3Contiguous(operand_shape, /*index=*/7, /*size=*/1);
+  RunR3Contiguous<float>(operand_shape, /*index=*/7, /*size=*/1);
+}
+
+XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_GPU(R3ContiguousLargerBF16)) {
+  std::vector<int32> operand_shape({32, 128, 1024});
+  RunR3Contiguous<bfloat16>(operand_shape, /*index=*/7, /*size=*/1);
 }
 
 void BM_DynamicSlice(int num_iters) {
@@ -559,20 +689,20 @@ void BM_DynamicSlice(int num_iters) {
   auto computation = builder.Build().ConsumeValueOrDie();
 
   // Initialize and transfer parameter buffer.
-  auto shape_size_fn = [client](const Shape& shape) {
-    return client->backend().transfer_manager()->GetByteSizeRequirement(shape);
-  };
-  auto buffer = ScopedShapedBuffer::Allocate(start_indices_shape, &allocator, 0,
-                                             shape_size_fn)
+  auto buffer = client->backend()
+                    .transfer_manager()
+                    ->AllocateScopedShapedBuffer(
+                        start_indices_shape, &allocator, /*device_ordinal=*/0)
                     .ConsumeValueOrDie();
 
   auto start_indices_literal = Literal::CreateR1<int32>({0, 1, 2, 3});
   ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice(
-      executors[device_ordinal], *start_indices_literal,
-      buffer->mutable_buffer({})));
+      executors[device_ordinal], *start_indices_literal, *buffer));
 
   std::unique_ptr<LocalExecutable> executable =
-      client->Compile(computation, {&buffer->shape()}, ExecutableBuildOptions())
+      client
+          ->Compile(computation, {&buffer->on_host_shape()},
+                    ExecutableBuildOptions())
           .ConsumeValueOrDie();
 
   // Run some warm-up executions.
diff --git a/tensorflow/compiler/xla/tests/execution_profile_test.cc b/tensorflow/compiler/xla/tests/execution_profile_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..644cbbf40f296eb2a574ae568b4f32aa3d0bd12f
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/execution_profile_test.cc
@@ -0,0 +1,71 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/global_data.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+
+class ExecutionProfileTest : public ClientLibraryTestBase {};
+
+XLA_TEST_F(ExecutionProfileTest,
+           DISABLED_ON_CPU_PARALLEL(ExecuteWithExecutionProfile)) {
+  Shape shape = ShapeUtil::MakeShape(F32, {256, 256});
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<GlobalData> input,
+      client_->TransferToServer(
+          *Literal::CreateR2F32Linspace(1e0, 1e5, 256, 256)));
+
+  ComputationBuilder b(client_, TestName() + ".add");
+  b.Dot(b.Parameter(0, shape, "param_0"), b.Parameter(1, shape, "param_1"));
+  TF_ASSERT_OK_AND_ASSIGN(Computation dot_product, b.Build());
+
+  ExecutionProfile execution_profile;
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<GlobalData> data,
+      client_->Execute(dot_product, {input.get(), input.get()},
+                       &execution_options_, &execution_profile));
+
+  VLOG(3) << "execution_profile.compute_cycle_count() = "
+          << execution_profile.compute_cycle_count();
+  VLOG(3) << "execution_profile.compute_and_transfer_time_ns() = "
+          << execution_profile.compute_and_transfer_time_ns();
+  VLOG(3) << "execution_profile.compute_time_ns() = "
+          << execution_profile.compute_time_ns();
+
+  bool hlo_profiling_enabled =
+      execution_options_.debug_options().xla_hlo_profile();
+
+  // If HLO profiling is enabled we always expect cycle count to be populated.
+  // If HLO profiling is disabled then depending on the backend the cycle count
+  // may or may not be populated.
+  if (hlo_profiling_enabled) {
+    EXPECT_GT(execution_profile.compute_cycle_count(), 0);
+  }
+
+  EXPECT_GT(execution_profile.compute_and_transfer_time_ns(), 0);
+  EXPECT_GT(execution_profile.compute_time_ns(), 0);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto computed, client_->Transfer(*data, &shape));
+  (void)computed;
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6fe7737de7af349dca2931b52d62dbc03b14e0b3
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
@@ -0,0 +1,128 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/core/lib/core/casts.h"
+
+namespace xla {
+namespace {
+class ExhaustiveF32ElementwiseOpTest
+    : public ClientLibraryTestBase,
+      public ::testing::WithParamInterface<std::pair<int64, int64>> {
+ protected:
+  ErrorSpec error_spec_{0.0001, 0.0001, /*relaxed_nans=*/true};
+
+  template <typename EnqueueOpTy>
+  void ExhaustivelyTestF32Op(EnqueueOpTy enqueue_op,
+                             float (*evaluate_op)(float),
+                             std::pair<int64, int64> known_incorrect_range) {
+    int64 begin, end;
+    std::tie(begin, end) = GetParam();
+    int64 input_size = end - begin;
+    LOG(INFO) << "Checking range [" << begin << ", " << end << ")";
+
+    ComputationBuilder builder(client_, TestName());
+
+    std::unique_ptr<Literal> input_literal =
+        Literal::CreateFromDimensions(F32, {input_size});
+    for (int64 i = begin; i < end; i++) {
+      if (i >= known_incorrect_range.first &&
+          i < known_incorrect_range.second) {
+        // If the operation is known to be buggy on a specific input clamp that
+        // input to 0 under the assumption that the op is at least correct on 0.
+        input_literal->Set({i - begin}, 0.0f);
+      } else {
+        input_literal->Set({i - begin}, tensorflow::bit_cast<float, int>(i));
+      }
+    }
+
+    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
+                            client_->TransferToServer(*input_literal));
+
+    auto input = builder.Parameter(0, input_literal->shape(), "input");
+    enqueue_op(&builder, input);
+
+    std::vector<float> expected_result;
+    expected_result.reserve(input_size);
+    for (int64 i = 0; i < input_size; i++) {
+      expected_result.push_back(evaluate_op(input_literal->Get<float>({i})));
+    }
+
+    ComputeAndCompareR1<float>(&builder, expected_result, {input_data.get()},
+                               error_spec_);
+  }
+};
+
+XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, LogF32) {
+#ifdef XLA_TEST_BACKEND_CPU
+  // TODO(b/73141998): The vectorized Log implementation gives results outside
+  // our error spec in this range (these numbers are bitwise representations of
+  // floats expressed as a zero extended int64):
+  std::pair<int64, int64> known_incorrect_range = {1, 8315654};
+#else
+  std::pair<int64, int64> known_incorrect_range = {0, 0};
+#endif
+
+  ExhaustivelyTestF32Op(
+      [](ComputationBuilder* builder, const ComputationDataHandle& input) {
+        builder->Log(input);
+      },
+      std::log, known_incorrect_range);
+}
+
+XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, ExpF32) {
+#ifdef XLA_TEST_BACKEND_CPU
+  // TODO(b/73142289): The vectorized Exp implementation gives results outside
+  // our error spec in this range (these numbers are bitwise representations of
+  // floats expressed as a zero extended int64):
+  std::pair<int64, int64> known_incorrect_range = {1107296256 + 11583654,
+                                                   1107296256 + 11629080};
+#else
+  std::pair<int64, int64> known_incorrect_range = {0, 0};
+#endif
+
+  ExhaustivelyTestF32Op(
+      [](ComputationBuilder* builder, const ComputationDataHandle& input) {
+        builder->Exp(input);
+      },
+      std::exp, known_incorrect_range);
+}
+
+XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, TanhF32) {
+  ExhaustivelyTestF32Op(
+      [](ComputationBuilder* builder, const ComputationDataHandle& input) {
+        builder->Tanh(input);
+      },
+      std::tanh, /*known_incorrect_range=*/{0, 0});
+}
+
+std::vector<std::pair<int64, int64>> CreateExhaustiveParameters() {
+  // We break up the 2^32-element space into small'ish chunks to keep peak
+  // memory usage low.
+  std::vector<std::pair<int64, int64>> result;
+  const int64 step = 1 << 25;
+  for (int64 i = 0; i < (1l << 32); i += step) {
+    result.push_back({i, i + step});
+  }
+  return result;
+}
+
+INSTANTIATE_TEST_CASE_P(ExhaustiveF32ElementwiseOpTestInstance,
+                        ExhaustiveF32ElementwiseOpTest,
+                        ::testing::ValuesIn(CreateExhaustiveParameters()));
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/filecheck.h b/tensorflow/compiler/xla/tests/filecheck.h
index 493ff7414bde31b18a39a5098925d9c991529b00..3830d5a44d2ca483fbe839231b0136d13033b48b 100644
--- a/tensorflow/compiler/xla/tests/filecheck.h
+++ b/tensorflow/compiler/xla/tests/filecheck.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_TESTS_FILECHECK_H_
-#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_TESTS_FILECHECK_H_
+#ifndef TENSORFLOW_COMPILER_XLA_TESTS_FILECHECK_H_
+#define TENSORFLOW_COMPILER_XLA_TESTS_FILECHECK_H_
 
 #include <string>
 
@@ -30,4 +30,4 @@ StatusOr<bool> RunFileCheck(const string& input, const string& pattern);
 
 }  // namespace xla
 
-#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_TESTS_FILECHECK_H_
+#endif  // TENSORFLOW_COMPILER_XLA_TESTS_FILECHECK_H_
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index 2686afccc216095345dbb7b43e916fbbe7c8ea39..a292eab1d198fbf69c6dc81c780487ea46756f72 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -816,7 +816,8 @@ void BM_ParallelFusion(int num_iters) {
   std::unique_ptr<LocalExecutable> executable =
       client
           ->Compile(computation,
-                    {&buffer0->shape(), &buffer1->shape(), &buffer2->shape()},
+                    {&buffer0->on_host_shape(), &buffer1->on_host_shape(),
+                     &buffer2->on_host_shape()},
                     ExecutableBuildOptions())
           .ConsumeValueOrDie();
 
diff --git a/tensorflow/compiler/xla/tests/half_test.cc b/tensorflow/compiler/xla/tests/half_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ec2f49d43bd8cee84c6b0abe1892e8b2278eefeb
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/half_test.cc
@@ -0,0 +1,257 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cmath>
+#include <vector>
+
+#include "tensorflow/compiler/xla/client/computation.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+
+// Tests the handling of the basic mathematics operations with F16 operands.
+
+namespace xla {
+namespace {
+
+class HalfTestBase : public ClientLibraryTestBase {
+ protected:
+  const ErrorSpec error_spec_{0.001, 0.001};
+  // Number of elements in the input buffers.
+  static const int kNumElements = 4;
+};
+
+using UnaryBuildFuncTy =
+    std::function<void(ComputationBuilder*, const ComputationDataHandle& src)>;
+
+struct UnaryOpTestParam {
+  std::function<half(half)> compute_func;
+  UnaryBuildFuncTy build_func;
+};
+
+class UnaryOpTest : public HalfTestBase,
+                    public ::testing::WithParamInterface<UnaryOpTestParam> {};
+
+XLA_TEST_P(UnaryOpTest, Ops) {
+  std::vector<half> x({half(1.4), half(-2.3), half(3.2), half(-4.1)});
+  ComputationBuilder builder(client_, TestName());
+  ComputationDataHandle x_opnd;
+  auto x_data = CreateR1Parameter<half>(x, /*parameter_number=*/0, "x",
+                                        &builder, &x_opnd);
+
+  std::function<half(half)> compute_func = GetParam().compute_func;
+  std::vector<half> expected;
+  for (int64 i = 0; i < x.size(); ++i) {
+    expected.push_back(compute_func(x[i]));
+  }
+
+  UnaryBuildFuncTy build_func = GetParam().build_func;
+  build_func(&builder, x_opnd);
+
+  ComputeAndCompareR1<half>(&builder, expected, {x_data.get()}, error_spec_);
+}
+
+half sign_imp(half value) {
+  const float x(std::move(value));
+  return half((x < .0) ? -1 : (x > .0));
+}
+
+half round_imp(half value) {
+  return half(round(static_cast<float>(std::move(value))));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    half, UnaryOpTest,
+    ::testing::Values(UnaryOpTestParam{[](half x) { return abs(x); },
+                                       &ComputationBuilder::Abs},
+                      UnaryOpTestParam{[](half x) { return round_imp(x); },
+                                       &ComputationBuilder::Round},
+                      UnaryOpTestParam{[](half x) { return ceil(x); },
+                                       &ComputationBuilder::Ceil},
+                      UnaryOpTestParam{[](half x) { return cos(x); },
+                                       &ComputationBuilder::Cos},
+                      UnaryOpTestParam{[](half x) { return exp(x); },
+                                       &ComputationBuilder::Exp},
+                      UnaryOpTestParam{[](half x) { return floor(x); },
+                                       &ComputationBuilder::Floor},
+                      UnaryOpTestParam{[](half x) { return log(x); },
+                                       &ComputationBuilder::Log},
+                      UnaryOpTestParam{[](half x) { return -x; },
+                                       &ComputationBuilder::Neg},
+                      UnaryOpTestParam{[](half x) { return sign_imp(x); },
+                                       &ComputationBuilder::Sign},
+                      UnaryOpTestParam{[](half x) { return sin(x); },
+                                       &ComputationBuilder::Sin},
+                      UnaryOpTestParam{[](half x) { return tanh(x); },
+                                       &ComputationBuilder::Tanh}
+
+                      ));
+
+struct UnaryPredTestParam {
+  std::function<bool(half)> compute_func;
+  UnaryBuildFuncTy build_func;
+};
+
+class UnaryPredTest : public HalfTestBase,
+                      public ::testing::WithParamInterface<UnaryPredTestParam> {
+};
+
+XLA_TEST_P(UnaryPredTest, Ops) {
+  std::vector<half> x({half(1.4), half(-2.3), half(3.2), half(-4.1)});
+  ComputationBuilder builder(client_, TestName());
+  ComputationDataHandle x_opnd;
+  auto x_data = CreateR1Parameter<half>(x, /*parameter_number=*/0, "x",
+                                        &builder, &x_opnd);
+
+  std::function<bool(half)> compute_func = GetParam().compute_func;
+  CHECK_EQ(kNumElements, x.size());
+  bool expected[kNumElements];
+  for (int64 i = 0; i < x.size(); ++i) {
+    expected[i] = compute_func(x[i]);
+  }
+
+  UnaryBuildFuncTy build_func = GetParam().build_func;
+  build_func(&builder, x_opnd);
+
+  ComputeAndCompareR1<bool>(&builder, expected, {x_data.get()});
+}
+
+INSTANTIATE_TEST_CASE_P(half, UnaryPredTest,
+                        ::testing::Values(UnaryPredTestParam{
+                            [](half x) { return isfinite(x); },
+                            &ComputationBuilder::IsFinite}));
+
+using BinaryBuildFuncTy = std::function<void(
+    ComputationBuilder*, const ComputationDataHandle& x,
+    const ComputationDataHandle& y, tensorflow::gtl::ArraySlice<int64>)>;
+
+struct BinaryOpTestParam {
+  std::function<half(half, half)> compute_func;
+  BinaryBuildFuncTy build_func;
+};
+
+class BinaryOpTest : public HalfTestBase,
+                     public ::testing::WithParamInterface<BinaryOpTestParam> {};
+
+XLA_TEST_P(BinaryOpTest, Ops) {
+  std::vector<half> x({half(1.0), half(2.0), half(3.0), half(-4.0)});
+  std::vector<half> y({half(0.4), half(-0.3), half(0.2), half(0.1)});
+  ComputationBuilder builder(client_, TestName());
+  ComputationDataHandle x_opnd;
+  auto x_data = CreateR1Parameter<half>(x, /*parameter_number=*/0, "x",
+                                        &builder, &x_opnd);
+
+  ComputationDataHandle y_opnd;
+  auto y_data = CreateR1Parameter<half>(y, /*parameter_number=*/1, "y",
+                                        &builder, &y_opnd);
+
+  std::function<half(half, half)> compute_func = GetParam().compute_func;
+  std::vector<half> expected;
+  for (int64 i = 0; i < x.size(); ++i) {
+    expected.push_back(compute_func(x[i], y[i]));
+  }
+
+  BinaryBuildFuncTy build_func = GetParam().build_func;
+  build_func(&builder, x_opnd, y_opnd, {});
+
+  ComputeAndCompareR1<half>(&builder, expected, {x_data.get(), y_data.get()},
+                            error_spec_);
+}
+
+half atan2_imp(half x, half y) {
+  return half(atan2(static_cast<float>(std::move(x)),
+                    static_cast<float>(std::move(y))));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    half, BinaryOpTest,
+    ::testing::Values(
+        BinaryOpTestParam{[](half x, half y) { return x + y; },
+                          &ComputationBuilder::Add},
+        BinaryOpTestParam{[](half x, half y) { return atan2_imp(x, y); },
+                          &ComputationBuilder::Atan2},
+        BinaryOpTestParam{[](half x, half y) { return x / y; },
+                          &ComputationBuilder::Div},
+        BinaryOpTestParam{[](half x, half y) { return max(x, y); },
+                          &ComputationBuilder::Max},
+        BinaryOpTestParam{[](half x, half y) { return min(x, y); },
+                          &ComputationBuilder::Min},
+        BinaryOpTestParam{[](half x, half y) { return x * y; },
+                          &ComputationBuilder::Mul},
+        BinaryOpTestParam{[](half x, half y) { return pow(x, y); },
+                          &ComputationBuilder::Pow},
+        BinaryOpTestParam{[](half x, half y) { return x - y; },
+                          &ComputationBuilder::Sub}
+
+        ));
+
+struct BinaryPredTestParam {
+  std::function<bool(half, half)> compute_func;
+  BinaryBuildFuncTy build_func;
+};
+
+class BinaryPredTest
+    : public HalfTestBase,
+      public ::testing::WithParamInterface<BinaryPredTestParam> {};
+
+XLA_TEST_P(BinaryPredTest, Ops) {
+  std::vector<half> x({half(1.0), half(2.0), half(0.2), half(-4.0)});
+  std::vector<half> y({half(0.4), half(-0.3), half(0.2), half(0.1)});
+  ComputationBuilder builder(client_, TestName());
+  ComputationDataHandle x_opnd;
+  auto x_data = CreateR1Parameter<half>(x, /*parameter_number=*/0, "x",
+                                        &builder, &x_opnd);
+
+  ComputationDataHandle y_opnd;
+  auto y_data = CreateR1Parameter<half>(y, /*parameter_number=*/1, "y",
+                                        &builder, &y_opnd);
+
+  std::function<bool(half, half)> compute_func = GetParam().compute_func;
+  CHECK_EQ(kNumElements, x.size());
+  bool expected[kNumElements];
+  for (int64 i = 0; i < x.size(); ++i) {
+    expected[i] = compute_func(x[i], y[i]);
+  }
+
+  BinaryBuildFuncTy build_func = GetParam().build_func;
+  build_func(&builder, x_opnd, y_opnd, {});
+
+  ComputeAndCompareR1<bool>(&builder, expected, {x_data.get(), y_data.get()});
+}
+
+INSTANTIATE_TEST_CASE_P(
+    half, BinaryPredTest,
+    ::testing::Values(BinaryPredTestParam{[](half x, half y) { return x == y; },
+                                          &ComputationBuilder::Eq},
+                      BinaryPredTestParam{[](half x, half y) { return x != y; },
+                                          &ComputationBuilder::Ne},
+                      BinaryPredTestParam{[](half x, half y) { return x >= y; },
+                                          &ComputationBuilder::Ge},
+                      BinaryPredTestParam{[](half x, half y) { return x > y; },
+                                          &ComputationBuilder::Gt},
+                      BinaryPredTestParam{[](half x, half y) { return x <= y; },
+                                          &ComputationBuilder::Le},
+                      BinaryPredTestParam{[](half x, half y) { return x < y; },
+                                          &ComputationBuilder::Lt}
+
+                      ));
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index d73c05ff92578209143e0679558848160cae99bd..9f5806c5e16c30cf198027cffab5f78c315cb957 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -15,13 +15,22 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 
+#include <memory>
 #include <set>
 #include <string>
 #include <utility>
 
+#include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -30,44 +39,235 @@ namespace se = ::perftools::gputools;
 
 namespace xla {
 
+namespace {
+
+using tensorflow::StringPiece;
+using tensorflow::gtl::ArraySlice;
+using tensorflow::gtl::optional;
+
+constexpr char kInterpreter[] = "interpreter";
+
+// Helper functions to get test and reference platforms.
+se::Platform* GetReferencePlatform() {
+  auto result = PlatformUtil::GetPlatform(kInterpreter);
+  TF_CHECK_OK(result.status()) << "could not get interpreter platform";
+  return result.ValueOrDie();
+}
+
+se::Platform* GetTestPlatform() {
+  auto result = PlatformUtil::GetDefaultPlatform();
+  TF_CHECK_OK(result.status()) << "could not get test platform";
+  return result.ValueOrDie();
+}
+
+bool ProgramShapesEqual(const ProgramShape& lhs, const ProgramShape& rhs) {
+  if (lhs.parameters_size() != rhs.parameters_size()) {
+    return false;
+  }
+  for (int i = 0; i < lhs.parameters_size(); i++) {
+    if (!ShapeUtil::Equal(lhs.parameters(i), rhs.parameters(i))) {
+      return false;
+    }
+  }
+  return ShapeUtil::Equal(lhs.result(), rhs.result());
+}
+
+ProgramShape GetProgramShapeWithLayout(const HloModule& module) {
+  ProgramShape program_shape;
+  const auto* entry = module.entry_computation();
+  for (const auto* param : entry->parameter_instructions()) {
+    *program_shape.add_parameters() = param->shape();
+    *program_shape.add_parameter_names() = param->name();
+  }
+  *program_shape.mutable_result() = entry->root_instruction()->shape();
+  return program_shape;
+}
+
+}  // namespace
+
+HloTestBase::HloTestBase()
+    : HloTestBase(GetTestPlatform(), GetReferencePlatform()) {}
+
+HloTestBase::HloTestBase(se::Platform* test_platform,
+                         se::Platform* reference_platform)
+    : test_runner_(test_platform), reference_runner_(reference_platform) {
+  hlo_verifier_ = MakeUnique<HloVerifier>();
+}
+
 /* static */
 std::unique_ptr<HloModule> HloTestBase::CreateNewModule() {
   HloModuleConfig config;
+  config.set_debug_options(GetDebugOptionsForTest());
+  return MakeUnique<HloModule>(TestName(), VersionedComputationHandle(),
+                               config);
+}
 
+/*static*/ DebugOptions HloTestBase::GetDebugOptionsForTest() {
   auto debug_options = legacy_flags::GetDebugOptionsFromFlags();
   // TODO(b/38354253): Change tests to use Parameters instead of Constants.
   debug_options.add_xla_disable_hlo_passes("constant_folding");
+  return debug_options;
+}
 
-  config.set_debug_options(debug_options);
-
-  return MakeUnique<HloModule>(TestName(), VersionedComputationHandle(),
-                               config);
+StatusOr<std::unique_ptr<Literal>> HloTestBase::Execute(
+    std::unique_ptr<HloModule> module,
+    tensorflow::gtl::ArraySlice<Literal*> arguments) {
+  return test_runner_.Execute(std::move(module), arguments);
 }
 
-StatusOr<perftools::gputools::DeviceMemoryBase> HloTestBase::Execute(
+std::unique_ptr<Literal> HloTestBase::ExecuteAndTransfer(
     std::unique_ptr<HloModule> module,
-    tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-        arguments,
-    Shape* result_shape) {
-  return runner_.Execute(std::move(module), arguments, result_shape);
+    tensorflow::gtl::ArraySlice<Literal*> arguments) {
+  return test_runner_.Execute(std::move(module), arguments).ValueOrDie();
 }
 
-se::DeviceMemoryBase HloTestBase::TransferToDevice(const Literal& literal) {
-  return runner_.TransferToDevice(literal).ValueOrDie();
+StatusOr<std::unique_ptr<HloModule>> HloTestBase::MakeReferenceModule(
+    const HloModule& test_module,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  std::unique_ptr<HloModule> reference_module = test_module.Clone();
+  const auto& program_shape = GetProgramShapeWithLayout(test_module);
+
+  if (reference_preprocessor != nullptr) {
+    reference_preprocessor(reference_module.get());
+    if (!ProgramShapesEqual(program_shape,
+                            GetProgramShapeWithLayout(*reference_module))) {
+      return InvalidArgument(
+          "reference preprocessor must not modify the program shape");
+    }
+  }
+  TF_RETURN_IF_ERROR(VerifyHloModule(*reference_runner_.backend().platform(),
+                                     reference_module.get()));
+  return std::move(reference_module);
 }
 
-std::unique_ptr<Literal> HloTestBase::TransferFromDevice(
-    const Shape& shape, se::DeviceMemoryBase device_base) {
-  return runner_.TransferFromDevice(shape, device_base).ValueOrDie();
+template <typename LiteralPtr>
+StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
+    std::unique_ptr<HloModule> module, const ArraySlice<LiteralPtr> arguments,
+    const optional<ErrorSpec>& error, bool run_hlo_passes,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  static_assert(
+      std::is_same<Literal*, LiteralPtr>::value ||
+          std::is_same<std::unique_ptr<Literal>, LiteralPtr>::value,
+      "The LiteralPtr type only accepts Literal* or std::unique_ptr<Literal>.");
+  TF_RETURN_IF_ERROR(
+      VerifyHloModule(*test_runner_.backend().platform(), module.get()));
+  TF_ASSIGN_OR_RETURN(auto reference_module,
+                      MakeReferenceModule(*module, reference_preprocessor));
+
+  // Execute on two backends.
+  TF_ASSIGN_OR_RETURN(
+      auto test,
+      test_runner_.Execute(std::move(module), arguments, run_hlo_passes));
+  TF_ASSIGN_OR_RETURN(auto reference,
+                      reference_runner_.Execute(std::move(reference_module),
+                                                arguments, run_hlo_passes));
+  return LiteralTestUtil::NearOrEqual(/*expected=*/*reference, /*actual=*/*test,
+                                      error);
 }
 
-std::unique_ptr<Literal> HloTestBase::ExecuteAndTransfer(
-    std::unique_ptr<HloModule> module,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) {
-  return runner_.ExecuteAndTransfer(std::move(module), arguments).ValueOrDie();
+template <typename LiteralPtr>
+::testing::AssertionResult HloTestBase::RunAndCompare(
+    std::unique_ptr<HloModule> module, const ArraySlice<LiteralPtr> arguments,
+    const optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  auto result =
+      RunAndCompareInternal(std::move(module), arguments, error,
+                            /*run_hlo_passes=*/true, reference_preprocessor);
+  if (!result.ok()) {
+    return ::testing::AssertionFailure() << result.status();
+  }
+  return result.ValueOrDie();
+}
+
+template <typename LiteralPtr>
+::testing::AssertionResult HloTestBase::RunAndCompareNoHloPasses(
+    std::unique_ptr<HloModule> module, const ArraySlice<LiteralPtr> arguments,
+    const optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  auto result =
+      RunAndCompareInternal(std::move(module), arguments, error,
+                            /*run_hlo_passes=*/false, reference_preprocessor);
+  if (!result.ok()) {
+    return ::testing::AssertionFailure() << result.status();
+  }
+  return result.ValueOrDie();
+}
+
+::testing::AssertionResult HloTestBase::RunAndCompare(
+    std::unique_ptr<HloModule> module, const optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  const auto& fake_arguments =
+      MakeFakeArguments(module.get()).ConsumeValueOrDie();
+  return RunAndCompare<std::unique_ptr<Literal>>(
+      std::move(module), fake_arguments, error, reference_preprocessor);
+}
+
+::testing::AssertionResult HloTestBase::RunAndCompareNoHloPasses(
+    std::unique_ptr<HloModule> module, const optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  const auto& fake_arguments =
+      MakeFakeArguments(module.get()).ConsumeValueOrDie();
+  return RunAndCompareNoHloPasses<std::unique_ptr<Literal>>(
+      std::move(module), fake_arguments, error, reference_preprocessor);
+}
+
+::testing::AssertionResult HloTestBase::RunAndCompare(
+    const StringPiece hlo_string,
+    const tensorflow::gtl::optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  auto module_or_status =
+      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
+  if (!module_or_status.ok()) {
+    return ::testing::AssertionFailure()
+           << "Error while parsing HLO text format: "
+           << module_or_status.status().ToString();
+  }
+  return RunAndCompare(module_or_status.ConsumeValueOrDie(), error,
+                       reference_preprocessor);
+}
+
+::testing::AssertionResult HloTestBase::RunAndCompareFromFile(
+    const string& filename, const tensorflow::gtl::optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  auto module_or_status =
+      HloRunner::ReadModuleFromHloTextFile(filename, GetDebugOptionsForTest());
+  if (!module_or_status.ok()) {
+    return ::testing::AssertionFailure()
+           << "failed reading hlo module from file";
+  }
+  return RunAndCompare(module_or_status.ConsumeValueOrDie(), error,
+                       reference_preprocessor);
+}
+
+::testing::AssertionResult HloTestBase::RunAndCompareNoHloPasses(
+    const StringPiece hlo_string,
+    const tensorflow::gtl::optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  auto module_or_status =
+      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
+  if (!module_or_status.ok()) {
+    return ::testing::AssertionFailure()
+           << "Error while parsing HLO text format: "
+           << module_or_status.status().ToString();
+  }
+  return RunAndCompareNoHloPasses(module_or_status.ConsumeValueOrDie(), error,
+                                  reference_preprocessor);
+}
+
+::testing::AssertionResult HloTestBase::RunAndCompareNoHloPassesFromFile(
+    const string& filename, const tensorflow::gtl::optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  auto module_or_status =
+      HloRunner::ReadModuleFromHloTextFile(filename, GetDebugOptionsForTest());
+  if (!module_or_status.ok()) {
+    return ::testing::AssertionFailure()
+           << "failed reading hlo module from file";
+  }
+  return RunAndCompareNoHloPasses(module_or_status.ConsumeValueOrDie(), error,
+                                  reference_preprocessor);
 }
 
-Backend& HloTestBase::backend() { return runner_.backend(); }
+Backend& HloTestBase::backend() { return test_runner_.backend(); }
 
 /* static */
 string HloTestBase::TestName() {
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 7f068dce36be3546298de2f06bf6d33446d07ca2..4aea9fc9fd027231106e529eb16bcd43f23fbe1c 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -24,52 +24,150 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_runner.h"
+#include "tensorflow/compiler/xla/service/hlo_verifier.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_layout.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
 
-// A base class for tests which build and run HLO code. This is a lower level of
-// abstraction than using the client interface and enables, for one, explicitly
-// building a graph of HLO instructions to run.
+// A base class for tests which build and/or run HLO code. The class includes
+// support for running an HLO module on two platforms and compare the results.
+// This is a lower level of abstraction than using the client interface and
+// enables, for one, explicitly building a graph of HLO instructions to run.
+//
+// This can also be used to write text/file-based test cases. Note that the test
+// target is responsible for linking the needed backends. A covenient way to do
+// this is to make it an xla_test: it will generate test targets linking with
+// the respective backends, which will be used as the test backend; the
+// interpreter backend is already linked with hlo_test_base so it will be the
+// default reference backend. For example, if you want to compare both cpu vs.
+// interpreter, and gpu vs. interpreter, you can:
+//
+//  xla_test (
+//    name = "sample_text_test",
+//    srcs = ["sample_text_test.cc"],
+//    backends = [
+//      "cpu",
+//      "gpu",
+//    ],
+//    deps = [
+//      "//third_party/tensorflow/compiler/xla/tests:hlo_test_base",
+//      ...
+//    ],
+//  )
+//
+// For a more detailed example, see "../tests/sample_text_test.cc".
 class HloTestBase : public ::testing::Test {
  protected:
-  HloTestBase() {}
+  // This uses the interpreter backend as the reference backend and
+  // automatically finds another supported backend as the test backend. If the
+  // interpreter is the only supported backend, it will be both the test backend
+  // and the reference backend.
+  HloTestBase();
+
+  // If your test doesn't use interpreter as the reference backend, you can use
+  // this constructor. Note that your test target is responsible for linking in
+  // both needed backends.
+  HloTestBase(::perftools::gputools::Platform* test_platform,
+              ::perftools::gputools::Platform* reference_platform);
 
   ~HloTestBase() override {}
 
   // Creates a new HLO module for a test. The module created will have
   // TestName() for its name; it will also automatically populate its debug
-  // options from command-line flags. It's recommended to use this method to
-  // create all HloModules for tests.
+  // options from command-line flags. If you want a fresh HloModule object and
+  // then add HloComputations to it, it's recommended to use this method in your
+  // tests.
   static std::unique_ptr<HloModule> CreateNewModule();
 
-  // Executes the given module and returns a global data handle.
-  StatusOr<perftools::gputools::DeviceMemoryBase> Execute(
+  // Populates debug options from command-line flags and adjusts the options for
+  // testing. It is recommended to use this when you need to pass in
+  // DebugOptions, e.g. when creating a module from a string or a file.
+  static DebugOptions GetDebugOptionsForTest();
+
+  // Executes the given module and return the result as a Literal.
+  StatusOr<std::unique_ptr<Literal>> Execute(
       std::unique_ptr<HloModule> module,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments,
-      Shape* result_shape);
+      tensorflow::gtl::ArraySlice<Literal*> arguments);
 
-  // Transfers the given literal to the device and returns the data handle.
-  perftools::gputools::DeviceMemoryBase TransferToDevice(
-      const Literal& literal);
+  std::unique_ptr<Literal> ExecuteAndTransfer(
+      std::unique_ptr<HloModule> module,
+      tensorflow::gtl::ArraySlice<Literal*> arguments);
+
+  // Executes the given hlo module on two backends and compares results.
+  //
+  // 'arguments': the input of the hlo module. The LiteralPtr type accepts
+  // Literal* or std::unique_ptr<Literal>.
+  //
+  // 'error': if has value, expects the results to be near (within the error
+  // bound). Otherwise, expects the results to be equal.
+  //
+  // 'reference_preprocessor': the module should be ready to run on the test
+  // backend, but it might need to be tailored so that it is able to run on the
+  // reference backend. Note that the program shape of the module must not be
+  // modified.
+  template <typename LiteralPtr>
+  ::testing::AssertionResult RunAndCompare(
+      std::unique_ptr<HloModule> module,
+      const tensorflow::gtl::ArraySlice<LiteralPtr> arguments,
+      const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
+
+  // Same as above, except that the module will be executed without Hlo
+  // optimization.
+  template <typename LiteralPtr>
+  ::testing::AssertionResult RunAndCompareNoHloPasses(
+      std::unique_ptr<HloModule> module,
+      const tensorflow::gtl::ArraySlice<LiteralPtr> arguments,
+      const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
 
-  // Transfers the array referred to by the given handle from the device and
-  // returns as a Literal.
-  std::unique_ptr<Literal> TransferFromDevice(
-      const Shape& shape, perftools::gputools::DeviceMemoryBase device_base);
+  // Executes an hlo module with fake inputs and compares the results.
+  ::testing::AssertionResult RunAndCompare(
+      std::unique_ptr<HloModule> module,
+      const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
 
-  // Executes the given module and return the result as a Literal.
-  std::unique_ptr<Literal> ExecuteAndTransfer(
+  // Same as above, except that the module will be executed without Hlo
+  // optimization.
+  ::testing::AssertionResult RunAndCompareNoHloPasses(
       std::unique_ptr<HloModule> module,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments);
+      const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
+
+  // Convenient wrappers for executing and comparing an hlo module with fake
+  // input. Module can be passed in directly, or parsed from an hlo_string,
+  // or loaded from a file.
+  ::testing::AssertionResult RunAndCompare(
+      const tensorflow::StringPiece hlo_string,
+      const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
+  ::testing::AssertionResult RunAndCompareFromFile(
+      const string& filename, const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
+  ::testing::AssertionResult RunAndCompareNoHloPasses(
+      const tensorflow::StringPiece hlo_string,
+      const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
+  ::testing::AssertionResult RunAndCompareNoHloPassesFromFile(
+      const string& filename, const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
 
   // Convenience method to force the layout of a given parameter in a module.
   // The layout of parameter number 'param_no' in the 'module' is set to
@@ -99,14 +197,38 @@ class HloTestBase : public ::testing::Test {
         ->Clear();
   }
 
+  // Return an HLO verifier constructed for the test backend.
+  HloVerifier& verifier() const { return *hlo_verifier_; }
+
   static string TestName();
 
-  // Returns the backend owned by the HloRunner.
+  // Returns the backend owned by the test runner.
   Backend& backend();
 
-  HloRunner runner_;
+  HloRunner test_runner_;
+  HloRunner reference_runner_;
+
+  std::unique_ptr<HloVerifier> hlo_verifier_;
 
   ErrorSpec error_spec_{0.0001};
+
+ private:
+  // Given the test module, makes a reference module that is ready to run on the
+  // reference platform. This assumes that the given module is ready to run on
+  // the test platform.
+  StatusOr<std::unique_ptr<HloModule>> MakeReferenceModule(
+      const HloModule& test_module,
+      const std::function<void(HloModule*)>& reference_preprocessor);
+
+  // Runs the module on two platforms with or without running hlo passes and
+  // compares the results. Returns whether the results are near or equal. If any
+  // error happens before the results are computed, returns the error status.
+  template <typename LiteralPtr>
+  StatusOr<::testing::AssertionResult> RunAndCompareInternal(
+      std::unique_ptr<HloModule> module,
+      const tensorflow::gtl::ArraySlice<LiteralPtr> arguments,
+      const tensorflow::gtl::optional<ErrorSpec>& error, bool run_hlo_passes,
+      const std::function<void(HloModule*)>& reference_preprocessor);
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
index 31060b9e80fcd50aefdedca27c70ec8a9b8be743..506091ddd8d1d8e6519525bb7031f4e8b296b5fb 100644
--- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
@@ -23,15 +23,8 @@ limitations under the License.
 
 namespace xla {
 
-/*static*/ int64 HloVerifiedTestBase::DefaultShapeSize(const Shape& shape) {
-  constexpr int64 kPointerSize = sizeof(void*);
-  if (ShapeUtil::IsOpaque(shape)) {
-    return kPointerSize;
-  }
-  return ShapeUtil::ByteSizeOf(shape, kPointerSize);
-}
-
-HloVerifiedTestBase::HloVerifiedTestBase() : shape_size_fn_(DefaultShapeSize) {}
+HloVerifiedTestBase::HloVerifiedTestBase()
+    : shape_verifier_(MakeUnique<ShapeVerifier>()) {}
 
 HloVerifiedTestBase::~HloVerifiedTestBase() {
   // We can't call the ASSERT or EXPECT test macros in destructors, so we
@@ -47,7 +40,7 @@ void HloVerifiedTestBase::TearDown() {
       << "TearDown called more than once; it should be called exactly once.";
   tear_down_called_ = true;
   if (module_) {
-    HloVerifier verifier(shape_size_fn_);
+    HloVerifier verifier;
     xla::StatusOr<bool> mutated = verifier.Run(module_.get());
     if (!mutated.ok()) {
       ADD_FAILURE() << "HloVerifier failed: " << mutated.status();
diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.h b/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
index b3d6b5af3b46f932707abf309669d23c327d1334..492688bf7d682cf991cb8c09399492a0437f651b 100644
--- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
@@ -28,14 +28,13 @@ namespace xla {
 // A base class for HLO tests that stores a default HloModule, and automatically
 // performs verification on that module on tear-down.
 class HloVerifiedTestBase : public HloTestBase {
- public:
-  // Returns the size in bytes of the given shape, using a default pointer size.
-  static int64 DefaultShapeSize(const Shape& shape);
-
  protected:
   HloVerifiedTestBase();
   ~HloVerifiedTestBase() override;
 
+  // Constructs a default shape verifier.
+  std::unique_ptr<ShapeVerifier> MakeShapeVerifier();
+
   // Performs verification on the default HloModule returned by module().
   // Automatically called by the testing framework for each test.
   //
@@ -47,14 +46,14 @@ class HloVerifiedTestBase : public HloTestBase {
   HloModule& module();
 
   // Sets the shape-size function used during hlo verification. If this isn't
-  // called, DefaultShapeSize is used instead.
-  void SetShapeSizeFn(std::function<int64(const Shape&)> shape_size_fn) {
-    shape_size_fn_ = std::move(shape_size_fn);
+  // called, a default ShapeVerifier is used instead.
+  void SetShapeVerifier(std::unique_ptr<ShapeVerifier> shape_verifier) {
+    shape_verifier_ = std::move(shape_verifier);
   }
 
  private:
   std::unique_ptr<HloModule> module_;  // Lazily populated. Access via module().
-  std::function<int64(const Shape&)> shape_size_fn_;
+  std::unique_ptr<ShapeVerifier> shape_verifier_;
   bool tear_down_called_ = false;
 };
 
diff --git a/tensorflow/compiler/xla/tests/isolated_convolution.hlo b/tensorflow/compiler/xla/tests/isolated_convolution.hlo
new file mode 100644
index 0000000000000000000000000000000000000000..9452780930efbb1ecc13b35cd4ab53678d36c37f
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/isolated_convolution.hlo
@@ -0,0 +1,8 @@
+HloModule convolution.167:
+
+ENTRY %convolution.167 (parameter.0: f32[16,28,28,128], parameter.1: f32[3,3,128,128]) -> f32[16,28,28,128] {
+  %parameter.0 = f32[16,28,28,128]{3,0,2,1} parameter(0)
+  %parameter.1 = f32[3,3,128,128]{3,2,1,0} parameter(1)
+  ROOT %convolution.167 = f32[16,28,28,128]{3,0,2,1} convolution(f32[16,28,28,128]{3,0,2,1} %parameter.0, f32[3,3,128,128]{3,2,1,0} %parameter.1), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01oi->b01f
+}
+
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc
index 6aa27e5470d22a8c6698389a720a38e9ea254617..5aa71a9261dbd414d1499f15c9b83cd63b634b49 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@@ -57,7 +57,8 @@ namespace xla {
     }
     for (int i = 0; i < expected.tuple_shapes_size(); ++i) {
       ::testing::AssertionResult result =
-          EqualShapes(expected.tuple_shapes(i), actual.tuple_shapes(i));
+          EqualShapes(expected.tuple_shapes(i), actual.tuple_shapes(i))
+          << "mismatch in tuple index " << i;
       if (!result) {
         return result;
       }
@@ -100,36 +101,57 @@ namespace xla {
   ASSERT_EQ(expected.ShortDebugString(), actual.ShortDebugString());
 }
 
+namespace {
+
+// Return a literal with all arrays of type FromNativeT converted to type
+// ToNativeT in the given literal.
+template <typename FromNativeT, typename ToNativeT>
+std::unique_ptr<Literal> ConvertType(const Literal& literal) {
+  // First construct shape of the result.
+  Shape result_shape(literal.shape());
+  ShapeUtil::ForEachMutableSubshape(
+      &result_shape, [](Shape* subshape, const ShapeIndex&) {
+        if (subshape->element_type() ==
+            primitive_util::NativeToPrimitiveType<FromNativeT>()) {
+          subshape->set_element_type(
+              primitive_util::NativeToPrimitiveType<ToNativeT>());
+        }
+      });
+  auto result = MakeUnique<Literal>(result_shape);
+
+  // Then copy over the data from 'literal' converting FromNativeT values to
+  // ToNativeT values as necessary.
+  ShapeUtil::ForEachSubshape(
+      literal.shape(),
+      [&](const Shape& subshape, const ShapeIndex& shape_index) {
+        if (ShapeUtil::IsArray(subshape)) {
+          if (subshape.element_type() ==
+              primitive_util::NativeToPrimitiveType<FromNativeT>()) {
+            auto src = literal.data<FromNativeT>(shape_index);
+            auto dest = result->data<ToNativeT>(shape_index);
+            for (int64 i = 0; i < src.size(); ++i) {
+              dest[i] = static_cast<ToNativeT>(src[i]);
+            }
+          } else {
+            TF_CHECK_OK(result->CopyFrom(literal,
+                                         /*dest_shape_index=*/shape_index,
+                                         /*src_shape_index=*/shape_index));
+          }
+        }
+      });
+  return result;
+}
+
+}  // namespace
+
 /* static */ std::unique_ptr<Literal> LiteralTestUtil::ConvertBF16ToF32(
-    const Literal& bf16_literal) {
-  CHECK_EQ(bf16_literal.shape().element_type(), BF16);
-  Shape converted_shape = bf16_literal.shape();
-  converted_shape.set_element_type(F32);
-  auto converted = Literal::CreateFromShape(converted_shape);
-  if (!ShapeUtil::HasZeroElements(converted_shape)) {
-    std::vector<int64> index(converted_shape.dimensions_size(), 0);
-    do {
-      converted->Set<float>(
-          index, static_cast<float>(bf16_literal.Get<bfloat16>(index)));
-    } while (IndexUtil::BumpIndices(converted_shape, &index));
-  }
-  return converted;
+    const Literal& literal) {
+  return ConvertType<bfloat16, float>(literal);
 }
 
 /* static */ std::unique_ptr<Literal> LiteralTestUtil::ConvertF32ToBF16(
-    const Literal& f32_literal) {
-  CHECK_EQ(f32_literal.shape().element_type(), F32);
-  Shape converted_shape = f32_literal.shape();
-  converted_shape.set_element_type(BF16);
-  auto converted = Literal::CreateFromShape(converted_shape);
-  if (!ShapeUtil::HasZeroElements(converted_shape)) {
-    std::vector<int64> index(converted_shape.dimensions_size(), 0);
-    do {
-      converted->Set<bfloat16>(
-          index, static_cast<bfloat16>(f32_literal.Get<float>(index)));
-    } while (IndexUtil::BumpIndices(converted_shape, &index));
-  }
-  return converted;
+    const Literal& literal) {
+  return ConvertType<float, bfloat16>(literal);
 }
 
 namespace {
@@ -279,6 +301,9 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
     case BF16:
       match = ExpectLiteralsEqual<bfloat16>(expected, actual, &multi_index, 0);
       break;
+    case F16:
+      match = ExpectLiteralsEqual<half>(expected, actual, &multi_index, 0);
+      break;
     case F32:
       match = ExpectLiteralsEqual<float>(expected, actual, &multi_index, 0);
       break;
@@ -290,9 +315,14 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
       break;
     case TUPLE: {
       bool tuple_match = true;
-      for (int i = 0; i < actual.tuple_literals_size(); ++i) {
-        auto result =
-            Equal(expected.tuple_literals(i), actual.tuple_literals(i));
+      for (int i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) {
+        SCOPED_TRACE(tensorflow::strings::StrCat(
+            "Tuple index ", i, " in ",
+            ShapeUtil::HumanString(expected.shape())));
+
+        // Create LiteralViews of the expected and actual elements.
+        auto result = Equal(LiteralView::Create(expected, {i}),
+                            LiteralView::Create(actual, {i}));
         tuple_match = tuple_match ? !!result : false;
       }
       match = tuple_match;
@@ -313,25 +343,6 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
   return result;
 }
 
-/* static */ void LiteralTestUtil::ExpectEqualTuple(const Literal& expected,
-                                                    const Literal& actual) {
-  VLOG(1) << "expected: " << expected.ToString();
-  VLOG(1) << "actual:   " << actual.ToString();
-
-  ASSERT_TRUE(ShapeUtil::IsTuple(expected.shape()));
-  ASSERT_TRUE(ShapeUtil::IsTuple(actual.shape()));
-  AssertEqualShapes(expected.shape(), actual.shape());
-  for (uint64 i = 0; i < expected.tuple_literals_size(); ++i) {
-    const auto& expected_element = expected.tuple_literals(i);
-    const auto& actual_element = actual.tuple_literals(i);
-    if (ShapeUtil::IsTuple(expected_element.shape())) {
-      ExpectEqualTuple(expected_element, actual_element);
-    } else {
-      ExpectEqual(expected_element, actual_element);
-    }
-  }
-}
-
 namespace {
 
 // Helper class for comparing floating-point literals within an error bound.
@@ -344,9 +355,9 @@ class NearComparator {
   // temporary files on failure. Returns true if  literals match.
   bool ExpectNear(const Literal& expected, const Literal& actual) {
     VLOG(1) << "expected:";
-    XLA_VLOG_LINES(1, expected.ToString());
+    XLA_VLOG_LINES(1, TruncateHugeLiteral(expected));
     VLOG(1) << "actual:";
-    XLA_VLOG_LINES(1, actual.ToString());
+    XLA_VLOG_LINES(1, TruncateHugeLiteral(actual));
 
     // If the shapes mismatch, we simply fail the expectation instead of
     // printing out data, as it's a type error rather than a value error.
@@ -365,16 +376,21 @@ class NearComparator {
     abs_expected_miscompare_sum_ = 0.0;
     max_rel_err_ = 0.0;
     max_abs_err_ = 0.0;
-    *miscompares_.mutable_shape() =
-        ShapeUtil::ChangeElementType(actual.shape(), PRED);
-    miscompares_.mutable_preds()->resize(
-        ShapeUtil::ElementsIn(miscompares_.shape()), false);
+    first_linear_index_ = -1;
+    last_linear_index_ = -1;
+    max_rel_linear_index_ = -1;
+    max_abs_linear_index_ = -1;
+    miscompares_ = Literal(ShapeUtil::ChangeElementType(actual.shape(), PRED));
+    miscompares_.PopulateWithValue(false);
     multi_index_.resize(expected.shape().dimensions_size(), 0);
 
     switch (expected.shape().element_type()) {
       case BF16:
         ExpectLiteralsNear<bfloat16>(expected, actual, 0);
         break;
+      case F16:
+        ExpectLiteralsNear<half>(expected, actual, 0);
+        break;
       case F32:
         ExpectLiteralsNear<float>(expected, actual, 0);
         break;
@@ -393,21 +409,33 @@ class NearComparator {
     if (num_miscompares_ > 0) {
       if (!VLOG_IS_ON(1)) {
         LOG(INFO) << "expected: " << ShapeUtil::HumanString(expected.shape())
-                  << " " << expected.ToString();
+                  << " " << TruncateHugeLiteral(expected);
         LOG(INFO) << "actual:   " << ShapeUtil::HumanString(actual.shape())
-                  << " " << actual.ToString();
+                  << " " << TruncateHugeLiteral(actual);
+        LOG(INFO) << "Dumping literals to temp files...";
+        WriteLiteralToTempFile(expected, "expected");
+        WriteLiteralToTempFile(actual, "actual");
+        WriteLiteralToTempFile(miscompares_, "miscompares");
       }
       EXPECT_TRUE(num_miscompares_ == 0)
           << "\nmax relative mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(max_rel_multi_index_)
+          << LiteralTestUtil::MultiIndexAsString(
+                 IndexUtil::LinearIndexToMultidimensionalIndex(
+                     actual.shape(), max_rel_linear_index_))
           << "\nmaximum relative error " << max_rel_err_
           << "\nmax absolute mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(max_abs_multi_index_)
+          << LiteralTestUtil::MultiIndexAsString(
+                 IndexUtil::LinearIndexToMultidimensionalIndex(
+                     actual.shape(), max_abs_linear_index_))
           << "\nmaximum absolute error " << max_abs_err_
           << "\nfirst mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(first_multi_index_)
+          << LiteralTestUtil::MultiIndexAsString(
+                 IndexUtil::LinearIndexToMultidimensionalIndex(
+                     actual.shape(), first_linear_index_))
           << "\nlast mismatch at index "
-          << LiteralTestUtil::MultiIndexAsString(last_multi_index_)
+          << LiteralTestUtil::MultiIndexAsString(
+                 IndexUtil::LinearIndexToMultidimensionalIndex(
+                     actual.shape(), last_linear_index_))
           << "\ntotal absolute error " << abs_diff_sum_
           << "\ntotal absolute error of miscompares "
           << abs_diff_miscompare_sum_ << "\ntotal relative error "
@@ -415,18 +443,18 @@ class NearComparator {
           << "\ntotal relative error of miscompares "
           << (abs_diff_miscompare_sum_ / abs_expected_miscompare_sum_)
           << "\nfailure count " << num_miscompares_;
-
-      WriteLiteralToTempFile(expected, "expected");
-      WriteLiteralToTempFile(actual, "actual");
-      WriteLiteralToTempFile(miscompares_, "miscompares");
     }
     return num_miscompares_ == 0;
   }
 
  private:
   template <typename NativeT>
-  bool NanMismatch(NativeT lhs, NativeT rhs) {
-    return std::isnan(lhs) != std::isnan(rhs);
+  bool NanMismatch(NativeT expected, NativeT actual, bool relaxed_nans) {
+    if (relaxed_nans) {
+      return !std::isnan(expected) && std::isnan(actual);
+    } else {
+      return std::isnan(expected) != std::isnan(actual);
+    }
   }
 
   template <typename NativeT>
@@ -446,57 +474,94 @@ class NearComparator {
       return true;
     }
 
-    float abs_diff = std::abs(actual - expected);
-    float rel_err = abs_diff / std::abs(expected);
+    const float abs_diff = std::abs(actual - expected);
+    const float rel_err = abs_diff / std::abs(expected);
+    const bool nan_mismatch =
+        NanMismatch<NativeT>(expected, actual, error_.relaxed_nans);
+    const bool mismatch =
+        (nan_mismatch || (abs_diff >= error_.abs && rel_err >= error_.rel));
+    return !mismatch;
+  }
+
+  // Assumes that expected vs actual fail ExpectValuesNear.
+  template <typename NativeT>
+  void UpdateAndLogMiscompares(const NativeT expected, const NativeT actual,
+                               const Shape& shape, const int64 linear_index) {
+    const float abs_diff = std::abs(actual - expected);
+    const float rel_err = abs_diff / std::abs(expected);
     abs_diff_sum_ += abs_diff;
     abs_expected_sum_ += std::abs(expected);
-    if (rel_err > max_rel_err_) {
+    if (rel_err > max_rel_err_ || std::isnan(rel_err)) {
       max_rel_err_ = rel_err;
-      max_rel_multi_index_ = multi_index_;
+      max_rel_linear_index_ = linear_index;
     }
-    if (abs_diff > max_abs_err_) {
+    if (abs_diff > max_abs_err_ || std::isnan(abs_diff)) {
       max_abs_err_ = abs_diff;
-      max_abs_multi_index_ = multi_index_;
+      max_abs_linear_index_ = linear_index;
     }
-    VLOG(10) << tensorflow::strings::Printf(
-        "index %s abs_diff %f rel_err %f",
-        LiteralTestUtil::MultiIndexAsString(multi_index_).c_str(), abs_diff,
-        rel_err);
-    bool nan_mismatch = NanMismatch<NativeT>(expected, actual);
-    bool mismatch =
-        (nan_mismatch || (abs_diff >= error_.abs && rel_err >= error_.rel));
-    if (mismatch) {
-      abs_diff_miscompare_sum_ += abs_diff;
-      abs_expected_miscompare_sum_ += std::abs(expected);
-      const int64 kMaxFailures = 2;
-      if (num_miscompares_ < kMaxFailures) {
-        ::testing::Message msg;
-        msg << "mismatch at index "
-            << LiteralTestUtil::MultiIndexAsString(multi_index_) << " abs diff "
-            << abs_diff << " rel err " << rel_err << " failure #"
-            << num_miscompares_;
-        ExpectNear<NativeT>(expected, actual, msg);
-      } else if (num_miscompares_ == kMaxFailures) {
-        LOG(ERROR)
-            << "reached max 'loud' failure count; silently proceeding...";
-      }
-      if (num_miscompares_ == 0) {
-        first_multi_index_ = multi_index_;
-      }
-      num_miscompares_++;
-      last_multi_index_ = multi_index_;
+    if (VLOG_IS_ON(10)) {
+      VLOG(10) << tensorflow::strings::Printf(
+          "index %s abs_diff %f rel_err %f",
+          LiteralTestUtil::MultiIndexAsString(
+              IndexUtil::LinearIndexToMultidimensionalIndex(shape,
+                                                            linear_index))
+              .c_str(),
+          abs_diff, rel_err);
     }
-    return !mismatch;
+    abs_diff_miscompare_sum_ += abs_diff;
+    abs_expected_miscompare_sum_ += std::abs(expected);
+    const int64 kMaxFailures = 2;
+    if (num_miscompares_ < kMaxFailures) {
+      const auto multi_index =
+          IndexUtil::LinearIndexToMultidimensionalIndex(shape, linear_index);
+      ::testing::Message msg;
+      msg << "mismatch at index "
+          << LiteralTestUtil::MultiIndexAsString(multi_index) << " abs diff "
+          << abs_diff << " rel err " << rel_err << " failure #"
+          << num_miscompares_;
+      ExpectNear<NativeT>(expected, actual, msg);
+    } else if (num_miscompares_ == kMaxFailures) {
+      LOG(ERROR) << "reached max 'loud' failure count; silently proceeding...";
+    }
+    if (num_miscompares_ == 0) {
+      first_linear_index_ = linear_index;
+    }
+    num_miscompares_++;
+    last_linear_index_ = linear_index;
+    miscompares_.data<bool>()[linear_index] = true;
   }
 
   // Recursive function which compares the two given literals elementwise.
   template <typename NativeT>
   void ExpectLiteralsNear(const Literal& expected, const Literal& actual,
                           int64 dimension) {
+    // Fast path optimization for the case were layouts match.
+    if (LayoutUtil::Equal(actual.shape().layout(), expected.shape().layout())) {
+      tensorflow::gtl::ArraySlice<const NativeT> expected_data =
+          expected.data<NativeT>();
+      tensorflow::gtl::ArraySlice<const NativeT> actual_data =
+          actual.data<NativeT>();
+      const int64 len = expected_data.size();
+      for (int64 i = 0; i < len; ++i) {
+        const bool near = ExpectValuesNear(expected_data[i], actual_data[i]);
+        if (!near) {
+          UpdateAndLogMiscompares<NativeT>(expected_data[i], actual_data[i],
+                                           actual.shape(), i);
+        }
+      }
+      return;
+    }
+
     if (dimension == expected.shape().dimensions_size()) {
       bool near = ExpectValuesNear(expected.Get<NativeT>(multi_index_),
                                    actual.Get<NativeT>(multi_index_));
-      miscompares_.Set<bool>(multi_index_, !near);
+      if (!near) {
+        UpdateAndLogMiscompares<NativeT>(
+            expected.Get<NativeT>(multi_index_),
+            actual.Get<NativeT>(multi_index_), actual.shape(),
+            IndexUtil::MultidimensionalIndexToLinearIndex(actual.shape(),
+                                                          multi_index_));
+      }
     } else {
       for (int64 i = 0; i < expected.shape().dimensions(dimension); ++i) {
         multi_index_[dimension] = i;
@@ -517,6 +582,32 @@ class NearComparator {
     LOG(ERROR) << "wrote to " << name << " file: " << filename;
   }
 
+  // Gets the total element count.  For tuples, this is not the count of tuple
+  // elements, but the sum of elements of each tuple element.
+  int64 RecursiveElementCount(const Shape& shape) {
+    if (ShapeUtil::IsTuple(shape)) {
+      const int64 tuple_elements = ShapeUtil::TupleElementCount(shape);
+      int64 total = 0;
+      for (int64 i = 0; i < tuple_elements; ++i) {
+        total +=
+            RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i));
+      }
+      return total;
+    } else {
+      return ShapeUtil::ElementsIn(shape);
+    }
+  }
+
+  // Calling ToString on a literal with over 100 million elements takes around
+  // 3 minutes.  The utility of printing a literal with >1000 elements is
+  // questionable, especially when writing the Literal proto to disk is orders
+  // of magnitude faster.
+  string TruncateHugeLiteral(const Literal& literal) {
+    return RecursiveElementCount(literal.shape()) < 1000
+               ? literal.ToString()
+               : "[TRUNCATED, Literal with more than 1000 values]";
+  }
+
   ErrorSpec error_;
 
   // Number of element miscomparisons encountered so far.
@@ -537,16 +628,18 @@ class NearComparator {
   double abs_expected_miscompare_sum_;
   float max_rel_err_;
   float max_abs_err_;
-  std::vector<int64> first_multi_index_;
-  std::vector<int64> last_multi_index_;
-  std::vector<int64> max_rel_multi_index_;
-  std::vector<int64> max_abs_multi_index_;
+  int64 first_linear_index_;
+  int64 last_linear_index_;
+  int64 max_rel_linear_index_;
+  int64 max_abs_linear_index_;
 };
 
 template <>
-bool NearComparator::NanMismatch<complex64>(complex64 lhs, complex64 rhs) {
-  return std::isnan(lhs.real()) != std::isnan(rhs.real()) ||
-         std::isnan(lhs.imag()) != std::isnan(rhs.imag());
+bool NearComparator::NanMismatch<complex64>(complex64 expected,
+                                            complex64 actual,
+                                            bool relaxed_nans) {
+  return NanMismatch(expected.real(), actual.real(), relaxed_nans) ||
+         NanMismatch(expected.imag(), actual.imag(), relaxed_nans);
 }
 
 template <>
@@ -567,14 +660,64 @@ bool NearComparator::ExpectValuesNear<bfloat16>(bfloat16 expected,
                           static_cast<float>(actual));
 }
 
+template <>
+bool NearComparator::ExpectValuesNear<half>(half expected, half actual) {
+  return ExpectValuesNear(static_cast<float>(std::move(expected)),
+                          static_cast<float>(std::move(actual)));
+}
+
+template <>
+void NearComparator::UpdateAndLogMiscompares<bfloat16>(
+    const bfloat16 expected, const bfloat16 actual, const Shape& shape,
+    const int64 linear_index) {
+  UpdateAndLogMiscompares(static_cast<float>(expected),
+                          static_cast<float>(actual), shape, linear_index);
+}
+
+template <>
+void NearComparator::UpdateAndLogMiscompares<half>(half expected, half actual,
+                                                   const Shape& shape,
+                                                   const int64 linear_index) {
+  UpdateAndLogMiscompares(static_cast<float>(std::move(expected)),
+                          static_cast<float>(std::move(actual)), shape,
+                          linear_index);
+}
+
 }  // namespace
 
 /* static */ ::testing::AssertionResult LiteralTestUtil::Near(
     const Literal& expected, const Literal& actual, const ErrorSpec& error) {
-  NearComparator comparator(error);
-  return comparator.ExpectNear(expected, actual)
-             ? ::testing::AssertionSuccess()
-             : ::testing::AssertionFailure() << "values were not near";
+  ::testing::AssertionResult err =
+      EqualShapes(expected.shape(), actual.shape());
+  if (!err) {
+    return err;
+  }
+
+  if (ShapeUtil::IsTuple(expected.shape())) {
+    for (int64 i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) {
+      SCOPED_TRACE(tensorflow::strings::StrCat(
+          "Tuple index ", i, " in ", ShapeUtil::HumanString(expected.shape())));
+      const auto expected_element = LiteralView::Create(expected, {i});
+      const auto actual_element = LiteralView::Create(actual, {i});
+
+      ::testing::AssertionResult res =
+          Near(expected_element, actual_element, error);
+      if (err && !res) {
+        err = res;
+      }
+    }
+    return err;
+  }
+
+  if (ShapeUtil::ElementIsFloating(expected.shape()) ||
+      ShapeUtil::ElementIsComplex(expected.shape())) {
+    NearComparator comparator(error);
+    return comparator.ExpectNear(expected, actual)
+               ? ::testing::AssertionSuccess()
+               : ::testing::AssertionFailure() << "values were not near";
+  }
+
+  return Equal(expected, actual);
 }
 
 /* static */ void LiteralTestUtil::ExpectNear(const Literal& expected,
@@ -587,47 +730,21 @@ bool NearComparator::ExpectValuesNear<bfloat16>(bfloat16 expected,
               : tensorflow::strings::StrCat("\nmessage: ", message));
 }
 
-/* static */ ::testing::AssertionResult LiteralTestUtil::NearTuple(
-    const Literal& expected, const Literal& actual, const ErrorSpec& error) {
-  VLOG(1) << "expected: " << expected.ToString();
-  VLOG(1) << "actual:   " << actual.ToString();
-
-  if (!ShapeUtil::IsTuple(expected.shape()) ||
-      !ShapeUtil::IsTuple(actual.shape())) {
-    return ::testing::AssertionFailure()
-           << "tuples expected expected shape = "
-           << expected.shape().ShortDebugString()
-           << " actual shape = " << actual.shape().ShortDebugString();
-  }
-  AssertEqualShapes(expected.shape(), actual.shape());
-  for (uint64 i = 0; i < expected.tuple_literals_size(); ++i) {
-    const auto& expected_element = expected.tuple_literals(i);
-    const auto& actual_element = actual.tuple_literals(i);
-    if (ShapeUtil::IsTuple(expected_element.shape())) {
-      auto ret = NearTuple(expected_element, actual_element, error);
-      if (!ret) {
-        return ret;
-      }
-    } else if (ShapeUtil::ElementIsFloating(expected_element.shape())) {
-      auto ret = Near(expected_element, actual_element, error);
-      if (!ret) {
-        return ret;
-      }
-    } else {
-      auto ret = Equal(expected_element, actual_element);
-      if (!ret) {
-        return ret;
-      }
-    }
+/*static*/ ::testing::AssertionResult LiteralTestUtil::NearOrEqual(
+    const Literal& expected, const Literal& actual,
+    const tensorflow::gtl::optional<ErrorSpec>& error) {
+  if (error.has_value()) {
+    VLOG(1) << "Expects near";
+    return Near(expected, actual, *error);
   }
-
-  return ::testing::AssertionSuccess();
+  VLOG(1) << "Expects equal";
+  return Equal(expected, actual);
 }
 
-/* static */ void LiteralTestUtil::ExpectNearTuple(const Literal& expected,
-                                                   const Literal& actual,
-                                                   const ErrorSpec& error) {
-  EXPECT_TRUE(NearTuple(expected, actual, error));
+/*static*/ void LiteralTestUtil::ExpectNearOrEqual(
+    const Literal& expected, const Literal& actual,
+    const tensorflow::gtl::optional<ErrorSpec>& error) {
+  EXPECT_TRUE(NearOrEqual(expected, actual, error));
 }
 
 /* static */ string LiteralTestUtil::MultiIndexAsString(
@@ -644,10 +761,10 @@ bool NearComparator::ExpectValuesNear<bfloat16>(bfloat16 expected,
     new_num_elements *= new_dimensions[i];
   }
   CHECK_EQ(ShapeUtil::ElementsIn(literal.shape()), new_num_elements);
+  CHECK_EQ(new_dimensions.size(), minor_to_major.size());
 
-  auto new_literal = MakeUnique<Literal>();
-  *new_literal->mutable_shape() =
-      ShapeUtil::MakeShape(literal.shape().element_type(), new_dimensions);
+  auto new_literal = MakeUnique<Literal>(
+      ShapeUtil::MakeShape(literal.shape().element_type(), new_dimensions));
 
   // Create a new shape with the given minor-to-major layout. This shape is used
   // solely for converting linear address to multi-dimensional addresses when
@@ -655,9 +772,6 @@ bool NearComparator::ExpectValuesNear<bfloat16>(bfloat16 expected,
   Shape shape_with_layout = new_literal->shape();
   *shape_with_layout.mutable_layout() = LayoutUtil::MakeLayout(minor_to_major);
 
-  // Allocate space in the new literal.
-  new_literal->Reserve(ShapeUtil::ElementsIn(literal.shape()));
-
   // Copy data into new literal, element-by-element.
   for (int64 i = 0; i < ShapeUtil::ElementsIn(literal.shape()); ++i) {
     std::vector<int64> from_multi_index =
@@ -697,6 +811,10 @@ bool NearComparator::ExpectValuesNear<bfloat16>(bfloat16 expected,
         new_literal->Set<double>(to_multi_index,
                                  literal.Get<double>(from_multi_index));
         break;
+      case C64:
+        new_literal->Set<complex64>(to_multi_index,
+                                    literal.Get<complex64>(from_multi_index));
+        break;
       default:
         LOG(FATAL) << "Unhandled primitive element type: "
                    << PrimitiveType_Name(literal.shape().element_type());
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.h b/tensorflow/compiler/xla/tests/literal_test_util.h
index 6e4add2690fd958d555eab3cef51cdbbd01819c9..7b757a4bd7e7592583b7596b4305ddb7e6c52d75 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.h
+++ b/tensorflow/compiler/xla/tests/literal_test_util.h
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -39,10 +40,16 @@ namespace xla {
 
 // Structure describing permissible absolute and relative error bounds.
 struct ErrorSpec {
-  explicit ErrorSpec(float aabs, float arel = 0) : abs(aabs), rel(arel) {}
+  explicit ErrorSpec(float aabs, float arel = 0, bool relaxed_nans = false)
+      : abs(aabs), rel(arel), relaxed_nans(relaxed_nans) {}
 
   float abs;  // Absolute error bound.
   float rel;  // Relative error bound.
+
+  // If relaxed_nans is true then any result is valid if we are expecting NaNs.
+  // In effect, this allows the tested operation to produce incorrect results
+  // for inputs outside its mathematical domain.
+  bool relaxed_nans;
 };
 
 // Utility class for making expectations/assertions related to XLA literals.
@@ -59,10 +66,14 @@ class LiteralTestUtil {
   static void AssertEqualShapesAndLayouts(const Shape& expected,
                                           const Shape& actual);
 
-  // Converts a bfloat16 literal to a float literal.
+  // If the given literal's data type is bfloat16, converts it to a float
+  // literal; otherwise, returns a copy of it. If the literal is a tuple,
+  // recursively converts its elements.
   static std::unique_ptr<Literal> ConvertBF16ToF32(const Literal& bf16_literal);
 
-  // Converts a float literal to a bfloat16 literal.
+  // If the given literal's data type is float, converts it to a bfloat16
+  // literal; otherwise, returns a copy of it. If the literal is a tuple,
+  // recursively converts its elements.
   static std::unique_ptr<Literal> ConvertF32ToBF16(const Literal& f32_literal);
 
   // Asserts that the expected and actual literals are (bitwise) equal for all
@@ -106,13 +117,18 @@ class LiteralTestUtil {
   static void ExpectR4EqualArray4D(const Array4D<NativeT>& expected,
                                    const Literal& actual);
 
-  // Expects that the values of the elements in the expected and actual tuples
-  // are equal. Tuples are matched recursively.
-  static void ExpectEqualTuple(const Literal& expected, const Literal& actual);
-
   // Asserts that the expected and actual literals are within the given error
   // bound for all elements. Also, asserts that the rank, dimensions sizes, and
-  // bounds are equivalent. Only supported for floating point values.
+  // bounds are equivalent.
+  //
+  // Tuples are matched recursively.  When comparing tensors of
+  // non-floating-point type, checks for exact equality, ignoring the ErroSpec.
+  //
+  // If the shape of the literals is neither a complex/floating-point tensor nor
+  // a tuple which contains a complex/floating-point tensor, Near() is
+  // equivalent to Equal().  We don't raise an error in this case, because we
+  // want to allow callers to call Near() even if they have no preconceptions
+  // about the shapes being compared.
   static ::testing::AssertionResult Near(
       const Literal& expected, const Literal& actual,
       const ErrorSpec& error) TF_MUST_USE_RESULT;
@@ -161,17 +177,18 @@ class LiteralTestUtil {
                                   const Literal& actual,
                                   const ErrorSpec& error);
 
-  // Returns whether the values of the elements in the expected and actual
-  // tuples are within the given error bound. Tuples are matched recursively.
-  // If the elements of the tuple are not floating-point types, the error spec
-  // is ignored and exact equality is checked.
-  static ::testing::AssertionResult NearTuple(
+  // If the error spec is given, returns whether the expected and the actual are
+  // within the error bound; otherwise, returns whether they are equal. Tuples
+  // will be compared recursively.
+  static ::testing::AssertionResult NearOrEqual(
       const Literal& expected, const Literal& actual,
-      const ErrorSpec& error) TF_MUST_USE_RESULT;
+      const tensorflow::gtl::optional<ErrorSpec>& error) TF_MUST_USE_RESULT;
 
-  // Expects that the expected and actual values are near.
-  static void ExpectNearTuple(const Literal& expected, const Literal& actual,
-                              const ErrorSpec& error);
+  // If the error spec is given, expects the expected and the actual to be near;
+  // otherwise, expects them to be equal. Tuples will be compared recursively.
+  static void ExpectNearOrEqual(
+      const Literal& expected, const Literal& actual,
+      const tensorflow::gtl::optional<ErrorSpec>& error);
 
   // Returns a multi-dimensional index as a string. For example: '{7, 8}' will
   // be returned for a 2-dimensional index with dimension 0 index equal to 7,
diff --git a/tensorflow/compiler/xla/tests/literal_test_util_test.cc b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
index 2acf27ed390b0732ba40fcf505c746bd7d8b651e..3a421f8458268a14dcdd84889bcae4990c095ea4 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util_test.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
@@ -83,18 +83,43 @@ TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) {
     LiteralProto literal_proto;
     TF_CHECK_OK(tensorflow::ReadBinaryProto(tensorflow::Env::Default(), result,
                                             &literal_proto));
-    Literal literal(literal_proto);
+    std::unique_ptr<Literal> literal =
+        Literal::CreateFromProto(literal_proto).ConsumeValueOrDie();
     if (result.find("expected") != string::npos) {
-      EXPECT_EQ("2", literal.ToString());
+      EXPECT_EQ("2", literal->ToString());
     } else if (result.find("actual") != string::npos) {
-      EXPECT_EQ("4", literal.ToString());
+      EXPECT_EQ("4", literal->ToString());
     } else if (result.find("miscompares") != string::npos) {
-      EXPECT_EQ("true", literal.ToString());
+      EXPECT_EQ("true", literal->ToString());
     } else {
       FAIL() << "unknown file in temporary directory: " << result;
     }
   }
 }
 
+TEST(LiteralTestUtilTest, NearComparatorR1) {
+  auto a =
+      Literal::CreateR1<float>({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8});
+  auto b =
+      Literal::CreateR1<float>({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8});
+  EXPECT_TRUE(LiteralTestUtil::Near(*a, *b, ErrorSpec{0.0001}));
+}
+
+TEST(LiteralTestUtilTest, NearComparatorR1Nan) {
+  auto a =
+      Literal::CreateR1<float>({0.0, 0.1, 0.2, 0.3, NAN, 0.5, 0.6, 0.7, 0.8});
+  auto b =
+      Literal::CreateR1<float>({0.0, 0.1, 0.2, 0.3, NAN, 0.5, 0.6, 0.7, 0.8});
+  EXPECT_TRUE(LiteralTestUtil::Near(*a, *b, ErrorSpec{0.0001}));
+}
+
+TEST(LiteralTestUtil, NearComparatorDifferentLengths) {
+  auto a =
+      Literal::CreateR1<float>({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8});
+  auto b = Literal::CreateR1<float>({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7});
+  EXPECT_FALSE(LiteralTestUtil::Near(*a, *b, ErrorSpec{0.0001}));
+  EXPECT_FALSE(LiteralTestUtil::Near(*b, *a, ErrorSpec{0.0001}));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
index b5b95967ff9162301a092f3a57996e0f3f78658f..7e92439c494b677f718a63c71c20828d65bebef4 100644
--- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
+++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
@@ -74,7 +74,8 @@ class LLVMCompilerTest : public ::testing::Test {
 
     ASSERT_TRUE(compiler
                     ->RunBackend(std::move(hlo_module),
-                                 backend_->default_stream_executor())
+                                 backend_->default_stream_executor(),
+                                 /*device_allocator=*/nullptr)
                     .ok());
 
     // Test that hooks were called.
@@ -98,7 +99,8 @@ class LLVMCompilerTest : public ::testing::Test {
     executors.push_back({backend_->default_stream_executor()});
     executors.push_back({backend_->default_stream_executor()});
 
-    EXPECT_IS_OK(compiler->Compile(std::move(modules), std::move(executors)));
+    EXPECT_IS_OK(compiler->Compile(std::move(modules), std::move(executors),
+                                   /*device_allocator=*/nullptr));
   }
 
  private:
diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test.cc b/tensorflow/compiler/xla/tests/local_client_aot_test.cc
index 569d5944cab0ae8f6a7b58a651285d20d4f9d019..47cab796041e9669affaebd7866d0d80100730f1 100644
--- a/tensorflow/compiler/xla/tests/local_client_aot_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_aot_test.cc
@@ -44,8 +44,7 @@ TEST_F(LocalClientAotTest, Constant) {
   OpaqueData opaque_data{100, 20, 3};
   void* parameters[] = {&opaque_data};
   float out = 0;
-  char tmp[4] = {0};
-  void* temporary_buffers[] = {nullptr, &out, &tmp};
+  void* temporary_buffers[] = {nullptr, &out};
   SumAndDouble(&out, &run_options, parameters, temporary_buffers);
   EXPECT_EQ(out, 246.0f);
 
diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
index 0cd44a72b5818c1bf66fd4cd1929572038596b47..3704ddd8010bf727b75ff81b63605e8b7ffe2ca8 100644
--- a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
+++ b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
@@ -63,8 +63,6 @@ int main(int argc, char** argv) {
     triple_string = "x86_64-apple-macosx";
   } else if (target_cpu == "arm") {
     triple_string = "aarch64-none-linux-gnu";
-  } else if (target_cpu == "ppc") {
-    triple_string = "powerpc64le-unknown-linux-gnu";
   } else if (target_cpu == "local") {
     triple_string = xla::llvm_ir::AsString(llvm::sys::getDefaultTargetTriple());
   } else {
@@ -89,10 +87,9 @@ int main(int argc, char** argv) {
   // It's lame to hard-code the buffer assignments, but we need
   // local_client_aot_test.cc to be able to easily invoke the function.
   CHECK_EQ(result->result_buffer_index(), 1);
-  CHECK_EQ(result->buffer_sizes().size(), 3);
+  CHECK_EQ(result->buffer_sizes().size(), 2);
   CHECK_EQ(result->buffer_sizes()[0], -1);             // param buffer
   CHECK_EQ(result->buffer_sizes()[1], sizeof(float));  // result buffer
-  CHECK_EQ(result->buffer_sizes()[2], sizeof(float));  // temp buffer
   if (triple.isOSBinFormatELF()) {
     // Check the ELF magic.
     CHECK_EQ(result->object_file_data()[0], 0x7F);
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index ad71d40197fe48b4343ee5f5f7f71b282a05cbf5..2462ea39f914b1dbb525ea777a48d9ce66035638 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -138,13 +138,13 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentInputLayouts) {
   // Create x as a col-major array.
   auto x_array = LiteralToShapedBuffer(*Literal::CreateR2WithLayout(
       {{1.0f, 2.0f}, {3.0f, 4.0f}}, LayoutUtil::MakeLayout({0, 1})));
-  EXPECT_TRUE(LayoutUtil::Equal(x_array->shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(x_array->on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({0, 1})));
 
   // Create y as a row-major array.
   auto y_array = LiteralToShapedBuffer(*Literal::CreateR2WithLayout(
       {{10.0f, 20.0f}, {30.0f, 40.0f}}, LayoutUtil::MakeLayout({1, 0})));
-  EXPECT_TRUE(LayoutUtil::Equal(y_array->shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(y_array->on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({1, 0})));
 
   std::unique_ptr<ScopedShapedBuffer> result_colmaj =
@@ -179,7 +179,7 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentOutputLayouts) {
       DefaultExecutableBuildOptions().set_result_layout(
           ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{2, 2}, {0, 1})),
       DefaultExecutableRunOptions());
-  EXPECT_TRUE(LayoutUtil::Equal(result_colmaj->shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(result_colmaj->on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({0, 1})));
   LiteralTestUtil::ExpectR2Near<float>({{11.0f, 22.0f}, {33.0f, 44.0f}},
                                        *ShapedBufferToLiteral(*result_colmaj),
@@ -191,7 +191,7 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentOutputLayouts) {
       DefaultExecutableBuildOptions().set_result_layout(
           ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{2, 2}, {1, 0})),
       DefaultExecutableRunOptions());
-  EXPECT_TRUE(LayoutUtil::Equal(result_rowmaj->shape().layout(),
+  EXPECT_TRUE(LayoutUtil::Equal(result_rowmaj->on_device_shape().layout(),
                                 LayoutUtil::MakeLayout({1, 0})));
   LiteralTestUtil::ExpectR2Near<float>({{11.0f, 22.0f}, {33.0f, 44.0f}},
                                        *ShapedBufferToLiteral(*result_rowmaj),
@@ -213,16 +213,17 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResult) {
   std::unique_ptr<ScopedShapedBuffer> result =
       ExecuteLocallyOrDie(computation, {x_array.get(), y_array.get()});
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result->shape()));
-  EXPECT_EQ(3, ShapeUtil::TupleElementCount(result->shape()));
+  EXPECT_TRUE(ShapeUtil::IsTuple(result->on_host_shape()));
+  EXPECT_EQ(3, ShapeUtil::TupleElementCount(result->on_host_shape()));
 
   std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
-  LiteralTestUtil::ExpectR2Equal<float>({{1.0f, 2.0f}, {3.0f, 4.0f}},
-                                        result_literal->tuple_literals(0));
-  LiteralTestUtil::ExpectR2Equal<float>({{10.0f, 20.0f}, {30.0f, 40.0f}},
-                                        result_literal->tuple_literals(1));
-  LiteralTestUtil::ExpectR2Equal<float>({{1.0f, 2.0f}, {3.0f, 4.0f}},
-                                        result_literal->tuple_literals(2));
+  LiteralTestUtil::ExpectR2Equal<float>(
+      {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {0}));
+  LiteralTestUtil::ExpectR2Equal<float>(
+      {{10.0f, 20.0f}, {30.0f, 40.0f}},
+      LiteralView::Create(*result_literal, {1}));
+  LiteralTestUtil::ExpectR2Equal<float>(
+      {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {2}));
 }
 
 XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) {
@@ -241,19 +242,21 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) {
   std::unique_ptr<ScopedShapedBuffer> result =
       ExecuteLocallyOrDie(computation, {x_array.get(), y_array.get()});
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result->shape()));
-  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result->shape()));
+  EXPECT_TRUE(ShapeUtil::IsTuple(result->on_host_shape()));
+  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result->on_host_shape()));
 
   std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
-  LiteralTestUtil::ExpectR2Equal<float>({{1.0f, 2.0f}, {3.0f, 4.0f}},
-                                        result_literal->tuple_literals(1));
-  const Literal& inner_tuple_literal = result_literal->tuple_literals(0);
-  LiteralTestUtil::ExpectR2Equal<float>({{1.0f, 2.0f}, {3.0f, 4.0f}},
-                                        inner_tuple_literal.tuple_literals(0));
-  LiteralTestUtil::ExpectR2Equal<float>({{10.0f, 20.0f}, {30.0f, 40.0f}},
-                                        inner_tuple_literal.tuple_literals(1));
-  LiteralTestUtil::ExpectR2Equal<float>({{1.0f, 2.0f}, {3.0f, 4.0f}},
-                                        inner_tuple_literal.tuple_literals(2));
+  LiteralTestUtil::ExpectR2Equal<float>(
+      {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {1}));
+  LiteralTestUtil::ExpectR2Equal<float>(
+      {{1.0f, 2.0f}, {3.0f, 4.0f}},
+      LiteralView::Create(*result_literal, {0, 0}));
+  LiteralTestUtil::ExpectR2Equal<float>(
+      {{10.0f, 20.0f}, {30.0f, 40.0f}},
+      LiteralView::Create(*result_literal, {0, 1}));
+  LiteralTestUtil::ExpectR2Equal<float>(
+      {{1.0f, 2.0f}, {3.0f, 4.0f}},
+      LiteralView::Create(*result_literal, {0, 2}));
 }
 
 XLA_TEST_F(LocalClientExecuteTest, TupleResultWithLayout) {
@@ -278,10 +281,10 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResultWithLayout) {
       DefaultExecutableRunOptions());
 
   std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
-  LiteralTestUtil::ExpectR2Equal<float>({{1.0f, 2.0f}, {3.0f, 4.0f}},
-                                        result_literal->tuple_literals(0));
-  LiteralTestUtil::ExpectR2Equal<float>({{1.0f, 2.0f}, {3.0f, 4.0f}},
-                                        result_literal->tuple_literals(1));
+  LiteralTestUtil::ExpectR2Equal<float>(
+      {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {0}));
+  LiteralTestUtil::ExpectR2Equal<float>(
+      {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {1}));
 }
 
 XLA_TEST_F(LocalClientExecuteTest, TupleArguments) {
@@ -320,14 +323,15 @@ XLA_TEST_F(LocalClientExecuteTest, TupleArguments) {
   std::unique_ptr<ScopedShapedBuffer> result =
       ExecuteLocallyOrDie(computation, {x_buffer.get(), y_buffer.get()});
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result->shape()));
-  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result->shape()));
+  EXPECT_TRUE(ShapeUtil::IsTuple(result->on_host_shape()));
+  EXPECT_EQ(2, ShapeUtil::TupleElementCount(result->on_host_shape()));
 
   std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
-  LiteralTestUtil::ExpectR2Equal<float>({{56.0f, 46.0f}, {36.0f, 26.0f}},
-                                        result_literal->tuple_literals(0));
-  LiteralTestUtil::ExpectR1Equal<float>({40.0f, 71.0f, 117.0f},
-                                        result_literal->tuple_literals(1));
+  LiteralTestUtil::ExpectR2Equal<float>(
+      {{56.0f, 46.0f}, {36.0f, 26.0f}},
+      LiteralView::Create(*result_literal, {0}));
+  LiteralTestUtil::ExpectR1Equal<float>(
+      {40.0f, 71.0f, 117.0f}, LiteralView::Create(*result_literal, {1}));
 }
 
 XLA_TEST_F(LocalClientExecuteTest, NestedTupleArgument) {
@@ -365,10 +369,10 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleArgument) {
       ExecuteLocallyOrDie(computation, {arg_buffer.get()});
 
   std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
-  LiteralTestUtil::ExpectR2Equal<float>({{-1.0, -2.0}, {-3.0, -4}},
-                                        result_literal->tuple_literals(0));
-  LiteralTestUtil::ExpectR1Equal<float>({264.0, 73.0, 133.0},
-                                        result_literal->tuple_literals(1));
+  LiteralTestUtil::ExpectR2Equal<float>(
+      {{-1.0, -2.0}, {-3.0, -4}}, LiteralView::Create(*result_literal, {0}));
+  LiteralTestUtil::ExpectR1Equal<float>(
+      {264.0, 73.0, 133.0}, LiteralView::Create(*result_literal, {1}));
 }
 
 XLA_TEST_F(LocalClientExecuteTest, PassingTupleResultBackIntoComputation) {
@@ -395,18 +399,19 @@ XLA_TEST_F(LocalClientExecuteTest, PassingTupleResultBackIntoComputation) {
   std::unique_ptr<ScopedShapedBuffer> result_0 =
       ExecuteLocallyOrDie(computation, {arg_buffer.get()});
   std::unique_ptr<Literal> result_0_literal = ShapedBufferToLiteral(*result_0);
-  LiteralTestUtil::ExpectR2Equal<float>({{-1.0, -2.0}, {-3.0, -4.0}},
-                                        result_0_literal->tuple_literals(0));
-  LiteralTestUtil::ExpectR2Equal<float>({{22.0, 6.0}, {8.0, 10}},
-                                        result_0_literal->tuple_literals(1));
+  LiteralTestUtil::ExpectR2Equal<float>(
+      {{-1.0, -2.0}, {-3.0, -4.0}},
+      LiteralView::Create(*result_0_literal, {0}));
+  LiteralTestUtil::ExpectR2Equal<float>(
+      {{22.0, 6.0}, {8.0, 10}}, LiteralView::Create(*result_0_literal, {1}));
 
   std::unique_ptr<ScopedShapedBuffer> result_1 =
       ExecuteLocallyOrDie(computation, {result_0.get()});
   std::unique_ptr<Literal> result_1_literal = ShapedBufferToLiteral(*result_1);
-  LiteralTestUtil::ExpectR2Equal<float>({{1.0, 2.0}, {3.0, 4.0}},
-                                        result_1_literal->tuple_literals(0));
-  LiteralTestUtil::ExpectR2Equal<float>({{44.0, 12.0}, {16.0, 20}},
-                                        result_1_literal->tuple_literals(1));
+  LiteralTestUtil::ExpectR2Equal<float>(
+      {{1.0, 2.0}, {3.0, 4.0}}, LiteralView::Create(*result_1_literal, {0}));
+  LiteralTestUtil::ExpectR2Equal<float>(
+      {{44.0, 12.0}, {16.0, 20}}, LiteralView::Create(*result_1_literal, {1}));
 }
 
 XLA_TEST_F(LocalClientExecuteTest, LargeTuple) {
@@ -455,7 +460,8 @@ XLA_TEST_F(LocalClientExecuteTest, LargeTuple) {
 
   for (int i = 0; i < kElementCount; ++i) {
     LiteralTestUtil::ExpectR1Near<float>(
-        {2.0f * i, 0.0f}, result_literal->tuple_literals(i), error_spec_);
+        {2.0f * i, 0.0f}, LiteralView::Create(*result_literal, {i}),
+        error_spec_);
   }
 }
 
@@ -512,8 +518,8 @@ XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_CPU_PARALLEL(LargeNestedTuple)) {
   for (int i = 0; i < kFanout; ++i) {
     for (int j = 0; j < kFanout; ++j) {
       LiteralTestUtil::ExpectR0Near<float>(
-          i + j + i * kFanout + j,
-          result_literal->tuple_literals(i).tuple_literals(j), error_spec_);
+          i + j + i * kFanout + j, LiteralView::Create(*result_literal, {i, j}),
+          error_spec_);
     }
   }
 }
@@ -554,11 +560,12 @@ XLA_TEST_F(LocalClientExecuteTest, DeepTuple) {
       ExecuteLocallyOrDie(computation, {arg_buffer.get()});
   std::unique_ptr<Literal> result_literal = ShapedBufferToLiteral(*result);
 
-  const Literal* result_element = result_literal.get();
+  ShapeIndex index;
   for (int i = 0; i < kTupleDepth; ++i) {
-    result_element = &result_element->tuple_literals(0);
+    index.push_back(0);
   }
-  LiteralTestUtil::ExpectR0Equal<float>(165.0, *result_element);
+  LiteralTestUtil::ExpectR0Equal<float>(
+      165.0, LiteralView::Create(*result_literal, index));
 }
 
 XLA_TEST_F(LocalClientExecuteTest, InvalidNumberOfArguments) {
@@ -575,7 +582,7 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidNumberOfArguments) {
 
   EXPECT_FALSE(execute_status.ok());
   EXPECT_THAT(execute_status.status().error_message(),
-              ContainsRegex("invalid number of arguments"));
+              ContainsRegex("Invalid number of arguments"));
 }
 
 XLA_TEST_F(LocalClientExecuteTest, IncorrectArgumentShape) {
@@ -591,7 +598,7 @@ XLA_TEST_F(LocalClientExecuteTest, IncorrectArgumentShape) {
 
   EXPECT_FALSE(execute_status.ok());
   EXPECT_THAT(execute_status.status().error_message(),
-              ContainsRegex("invalid argument shape"))
+              ContainsRegex("Invalid argument shape"))
       << execute_status.status();
 }
 
@@ -763,10 +770,10 @@ XLA_TEST_F(LocalClientExecuteTest, SelectBetweenTuples) {
   std::unique_ptr<ScopedShapedBuffer> result =
       ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {});
   std::unique_ptr<Literal> tuple_literal = ShapedBufferToLiteral(*result);
-  LiteralTestUtil::ExpectR1Equal<float>({2.0f, 4.0f, 6.0f},
-                                        tuple_literal->tuple_literals(0));
-  LiteralTestUtil::ExpectR1Equal<float>({1.0f, 2.0f, 3.0f},
-                                        tuple_literal->tuple_literals(1));
+  LiteralTestUtil::ExpectR1Equal<float>(
+      {2.0f, 4.0f, 6.0f}, LiteralView::Create(*tuple_literal, {0}));
+  LiteralTestUtil::ExpectR1Equal<float>(
+      {1.0f, 2.0f, 3.0f}, LiteralView::Create(*tuple_literal, {1}));
 }
 
 XLA_TEST_F(LocalClientExecuteTest, CompileExecutable) {
@@ -906,20 +913,18 @@ void BM_LocalClientOverhead(int num_iters) {
   builder.Add(x, x);
   auto computation = builder.Build().ConsumeValueOrDie();
 
-  auto shape_size_fn = [client](const Shape& shape) {
-    return client->backend().transfer_manager()->GetByteSizeRequirement(shape);
-  };
-  auto buffer = ScopedShapedBuffer::Allocate(
-                    shape, &allocator, /*device_ordinal=*/0, shape_size_fn)
-                    .ConsumeValueOrDie();
+  auto buffer =
+      transfer_manager
+          ->AllocateScopedShapedBuffer(shape, &allocator, /*device_ordinal=*/0)
+          .ConsumeValueOrDie();
   auto literal = Literal::CreateR2<float>({{0, 0, 0}, {0, 0, 0}});
   ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice(
-      executors[device_ordinal], *literal, buffer->mutable_buffer({})));
+      executors[device_ordinal], *literal, *buffer));
 
   const int kWarmups = 2;
 
-  auto executable_status = client->Compile(computation, {&buffer->shape()},
-                                           ExecutableBuildOptions());
+  auto executable_status = client->Compile(
+      computation, {&buffer->on_host_shape()}, ExecutableBuildOptions());
   ASSERT_IS_OK(executable_status);
   std::unique_ptr<LocalExecutable> executable =
       executable_status.ConsumeValueOrDie();
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index 062a9246e49598d5d03dce8c1f437138923449bf..96b976d25d75d35f46adfd104a03aceb363661eb 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -188,7 +188,7 @@ LocalClientTestBase::ExecuteLocally(
     const ExecutableRunOptions& run_options) {
   std::vector<const Shape*> argument_layouts(arguments.size());
   for (int i = 0; i < arguments.size(); ++i) {
-    argument_layouts[i] = &arguments[i]->shape();
+    argument_layouts[i] = &arguments[i]->on_host_shape();
   }
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<LocalExecutable> executable,
diff --git a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
index 0fb87c3c2ccbad387d46016cfad4e7d3cc537dcc..6c86dd5b9ef673c9facffafa37e00a859ce82010 100644
--- a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
@@ -221,5 +221,77 @@ INSTANTIATE_TEST_CASE_P(MatOpsDotAddTestInstances, MatOpsDotAddTest,
                         ::testing::Combine(::testing::Bool(), ::testing::Bool(),
                                            ::testing::Bool()));
 
+class MatOpsDotAddTest_bf16
+    : public ClientLibraryTestBase,
+      public ::testing::WithParamInterface<std::tuple<bool, bool, bool>> {};
+
+TEST_P(MatOpsDotAddTest_bf16, Dot_Add_2x2_2x2) {
+  bool row_major = std::get<0>(GetParam());
+  bool add_lhs = std::get<1>(GetParam());
+  bool transpose = std::get<2>(GetParam());
+  Array2D<bfloat16> lhs(
+      {{bfloat16(1.0f), bfloat16(2.0f)}, {bfloat16(3.0), bfloat16(4.0)}});
+  Array2D<bfloat16> rhs(
+      {{bfloat16(10.0f), bfloat16(11.0f)}, {bfloat16(12.0f), bfloat16(13.0f)}});
+
+  auto minor_to_major = [](bool row_major) -> std::vector<int64> {
+    return {row_major ? 1 : 0, row_major ? 0 : 1};
+  };
+
+  auto prim_type = primitive_util::NativeToPrimitiveType<bfloat16>();
+  Shape lhs_shape =
+      ShapeUtil::MakeShape(prim_type, {lhs.height(), lhs.width()});
+  Shape rhs_shape =
+      ShapeUtil::MakeShape(prim_type, {rhs.height(), rhs.width()});
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto lhs_handle,
+      client_->TransferToServer(
+          *Literal::CreateR2FromArray2DWithLayout<bfloat16>(
+              lhs, LayoutUtil::MakeLayout(minor_to_major(row_major)))));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto rhs_handle,
+      client_->TransferToServer(
+          *Literal::CreateR2FromArray2DWithLayout<bfloat16>(
+              rhs, LayoutUtil::MakeLayout(minor_to_major(row_major)))));
+
+  ComputationBuilder builder(client_, TestName());
+  auto lhs_arg = builder.Parameter(0, lhs_shape, "lhs");
+  auto lhs_mat_arg = lhs_arg;
+  if (transpose) {
+    lhs_mat_arg = builder.Transpose(lhs_mat_arg, {1, 0});
+  }
+  auto rhs_arg = builder.Parameter(1, rhs_shape, "rhs");
+  auto result = builder.Dot(lhs_mat_arg, rhs_arg);
+  Array2D<bfloat16> expected;
+  if (add_lhs) {
+    result = builder.Add(result, lhs_arg);
+    if (transpose) {
+      expected = Array2D<bfloat16>(
+          {{bfloat16(47), bfloat16(52)}, {bfloat16(71), bfloat16(78)}});
+    } else {
+      expected = Array2D<bfloat16>(
+          {{bfloat16(35), bfloat16(39)}, {bfloat16(81), bfloat16(89)}});
+    }
+  } else {
+    result = builder.Add(result, rhs_arg);
+    if (transpose) {
+      expected = Array2D<bfloat16>(
+          {{bfloat16(56), bfloat16(61)}, {bfloat16(80), bfloat16(87)}});
+    } else {
+      expected = Array2D<bfloat16>(
+          {{bfloat16(44), bfloat16(48)}, {bfloat16(90), bfloat16(98)}});
+    }
+  }
+
+  ComputeAndCompareR2<bfloat16>(&builder, expected,
+                                {lhs_handle.get(), rhs_handle.get()},
+                                ErrorSpec(1e-6));
+}
+
+INSTANTIATE_TEST_CASE_P(MatOpsDotAddTestInstances, MatOpsDotAddTest_bf16,
+                        ::testing::Combine(::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Bool()));
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
index 22d2b917a1d55f4f453e21c2d8fea38e32ff796b..0a603f4954badd12adf3144320789a5edd0d9c6c 100644
--- a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_runner.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
@@ -35,6 +36,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -76,8 +78,11 @@ class MultiOutputFusionTest : public HloTestBase {
         elem_shape2, HloOpcode::kAdd, broadcast, param1));
     HloInstruction* sub = builder.AddInstruction(HloInstruction::CreateBinary(
         elem_shape2, HloOpcode::kSubtract, param1, broadcast));
+    DotDimensionNumbers dot_dnums;
+    dot_dnums.add_lhs_contracting_dimensions(1);
+    dot_dnums.add_rhs_contracting_dimensions(0);
     HloInstruction* dot = builder.AddInstruction(
-        HloInstruction::CreateBinary(elem_shape2, HloOpcode::kDot, sub, add2));
+        HloInstruction::CreateDot(elem_shape2, sub, add2, dot_dnums));
     auto computation = hlo_module->AddEntryComputation(builder.Build(dot));
 
     if (manual_fusion) {
@@ -96,14 +101,13 @@ class MultiOutputFusionTest : public HloTestBase {
           nullptr);
     }
 
-    Literal input;
-    input.PopulateWithValue<float>(2.5f, {size, size});
-    auto p1 = TransferToDevice(input);
-    auto p0 = TransferToDevice(*Literal::CreateR0<float>(-9.0f));
+    Literal arg1(ShapeUtil::MakeShape(F32, {size, size}));
+    arg1.PopulateWithValue<float>(2.5f);
 
-    Literal expect;
-    expect.PopulateWithValue<float>(size * 1.5f * 3.5f, {size, size});
-    auto actual = ExecuteAndTransfer(std::move(hlo_module), {p0, p1});
+    Literal expect(ShapeUtil::MakeShape(F32, {size, size}));
+    expect.PopulateWithValue<float>(size * 1.5f * 3.5f);
+    auto actual = ExecuteAndTransfer(
+        std::move(hlo_module), {Literal::CreateR0<float>(-9.0f).get(), &arg1});
     LiteralTestUtil::ExpectNear(expect, *actual, error_spec_);
   }
 
@@ -133,8 +137,11 @@ class MultiOutputFusionTest : public HloTestBase {
     HloInstruction* reshape =
         builder.AddInstruction(HloInstruction::CreateReshape(
             ShapeUtil::MakeShape(F32, {size, 1}), add));
-    HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-        ShapeUtil::MakeShape(F32, {1}), HloOpcode::kDot, sub, reshape));
+    DotDimensionNumbers dot_dnums;
+    dot_dnums.add_lhs_contracting_dimensions(0);
+    dot_dnums.add_rhs_contracting_dimensions(0);
+    HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateDot(
+        ShapeUtil::MakeShape(F32, {1}), sub, reshape, dot_dnums));
     auto computation = hlo_module->AddEntryComputation(builder.Build(dot));
 
     if (manual_fusion) {
@@ -154,14 +161,13 @@ class MultiOutputFusionTest : public HloTestBase {
                nullptr);
     }
 
-    Literal input0, input1;
-    input0.PopulateWithValue<float>(2.5f, {size});
-    input1.PopulateWithValue<double>(1, {size});
-    auto p0 = TransferToDevice(input0);
-    auto p1 = TransferToDevice(input1);
+    Literal input0(ShapeUtil::MakeShape(F32, {size}));
+    input0.PopulateWithValue(2.5f);
+    Literal input1(ShapeUtil::MakeShape(F64, {size}));
+    input1.PopulateWithValue(1.);
 
-    Literal expect = *Literal::CreateR1<float>({size * 1.5f * 3.5f});
-    auto actual = ExecuteAndTransfer(std::move(hlo_module), {p0, p1});
+    Literal expect = std::move(*Literal::CreateR1<float>({size * 1.5f * 3.5f}));
+    auto actual = ExecuteAndTransfer(std::move(hlo_module), {&input0, &input1});
     LiteralTestUtil::ExpectNear(expect, *actual, error_spec_);
   }
 };
@@ -172,5 +178,38 @@ XLA_TEST_F(MultiOutputFusionTest, 2DFusionSize129) { RunTest2D(true, 129); }
 XLA_TEST_F(MultiOutputFusionTest, DiffentTypesNoFusion) { RunTest1D(false, 8); }
 XLA_TEST_F(MultiOutputFusionTest, DiffentTypesFusion) { RunTest1D(true, 8); }
 
+XLA_TEST_F(MultiOutputFusionTest, FusionNodeIsRoot) {
+  const char* testcase = R"(
+    HloModule m
+
+    fused_computation {
+      x.param_0 = (((s32[]), f32[]), (f32[], s32[])) parameter(0)
+      gte.3 = ((s32[]), f32[]) get-tuple-element(x.param_0), index=0
+      gte.2 = (s32[]) get-tuple-element(gte.3), index=0
+      gte.4 = s32[] get-tuple-element(gte.2), index=0
+      copy = s32[] copy(gte.4)
+      ROOT tuple = (s32[]) tuple(copy)
+    }
+
+    ENTRY thing.v3 {
+      x = (((s32[]), f32[]), (f32[], s32[])) parameter(0)
+      ROOT fusion = (s32[]) fusion(x), kind=kLoop, calls=fused_computation
+    }
+  )";
+  auto module =
+      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
+          .ValueOrDie();
+  auto param = Literal::MakeTupleOwned(
+      Literal::MakeTupleOwned(
+          Literal::MakeTupleOwned(Literal::CreateR0<int32>(42)),
+          Literal::CreateR0<float>(1.0)),
+      Literal::MakeTupleOwned(Literal::CreateR0<float>(3.0),
+                              Literal::CreateR0<int32>(4)));
+  TF_ASSERT_OK_AND_ASSIGN(auto result,
+                          Execute(std::move(module), {param.get()}));
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *result, *Literal::MakeTupleOwned(Literal::CreateR0<int32>(42))));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/pad_test.cc b/tensorflow/compiler/xla/tests/pad_test.cc
index 3fd83a4c3b104831f03366339fb7b8b5d816a3f7..8cef8dd34dc7b16b1e58ded67d6b6a4ba79f20db 100644
--- a/tensorflow/compiler/xla/tests/pad_test.cc
+++ b/tensorflow/compiler/xla/tests/pad_test.cc
@@ -33,6 +33,14 @@ limitations under the License.
 namespace xla {
 namespace {
 
+#ifdef XLA_BACKEND_SUPPORTS_BFLOAT16
+// Tests both F32 and BF16.
+static std::array<bool, 2> use_bfloat16_params{false, true};
+#else
+// Only tests F32.
+static std::array<bool, 1> use_bfloat16_params{false};
+#endif
+
 class PadTest : public ClientLibraryTestBase {
  protected:
   PadTest() {
@@ -61,8 +69,22 @@ class PadTest : public ClientLibraryTestBase {
   PaddingConfig r4_padding_on_dim0_dim1_;
 };
 
+class PadTestFloat : public PadTest,
+                     public ::testing::WithParamInterface<bool> {
+ protected:
+  PadTestFloat() { set_use_bfloat16(GetParam()); }
+
+  ErrorSpec DefaultErrorSpec() const {
+    if (use_bfloat16()) {
+      return ErrorSpec(1e-3, 1e-3);
+    } else {
+      return ErrorSpec(1e-5, 1e-5);
+    }
+  }
+};
+
 // Tests a Pad() with a zero-element input and output.
-XLA_TEST_F(PadTest, Pad1DS0ToS0Array) {
+XLA_TEST_P(PadTestFloat, Pad1DS0ToS0Array) {
   ComputationBuilder b(client_, TestName());
   // Set up the padding configuration {low: 0, high: 0, interior: 0}.
   PaddingConfig padding_config;
@@ -71,12 +93,13 @@ XLA_TEST_F(PadTest, Pad1DS0ToS0Array) {
   dimension->set_edge_padding_high(0);
   dimension->set_interior_padding(0);
 
-  b.Pad(b.ConstantR1<float>({}), b.ConstantR0<float>(0.1), padding_config);
-  ComputeAndCompareR1<float>(&b, {}, {}, ErrorSpec(0.0001));
+  b.Pad(AddParam(*Literal::CreateR1<float>({}), &b),
+        AddParam(*Literal::CreateR0<float>(0.1), &b), padding_config);
+  ComputeAndCompareR1<float>(&b, {}, {}, DefaultErrorSpec());
 }
 
 // Tests a Pad() with a zero-element input but a non-zero-element output.
-XLA_TEST_F(PadTest, Pad1DS0ToS5Array) {
+XLA_TEST_P(PadTestFloat, Pad1DS0ToS5Array) {
   ComputationBuilder b(client_, TestName());
   // Set up the padding configuration {low: 3, high: 0, interior: 1}.
   PaddingConfig padding_config;
@@ -85,12 +108,13 @@ XLA_TEST_F(PadTest, Pad1DS0ToS5Array) {
   dimension->set_edge_padding_high(4);
   dimension->set_interior_padding(7);
 
-  b.Pad(b.ConstantR1<float>({}), b.ConstantR0<float>(0.1), padding_config);
+  b.Pad(AddParam(*Literal::CreateR1<float>({}), &b),
+        AddParam(*Literal::CreateR0<float>(0.1), &b), padding_config);
   ComputeAndCompareR1<float>(&b, std::vector<float>(5, 0.1), {},
-                             ErrorSpec(0.0001));
+                             DefaultErrorSpec());
 }
 
-XLA_TEST_F(PadTest, Pad1DS3Array) {
+XLA_TEST_P(PadTestFloat, Pad1DS3Array) {
   ComputationBuilder b(client_, TestName());
   // Set up the padding configuration {low: 3, high: 0, interior: 1}.
   PaddingConfig padding_config;
@@ -99,21 +123,21 @@ XLA_TEST_F(PadTest, Pad1DS3Array) {
   dimension->set_edge_padding_high(0);
   dimension->set_interior_padding(1);
 
-  b.Pad(b.ConstantR1<float>({1, 2, 3}), b.ConstantR0<float>(0.1),
-        padding_config);
+  b.Pad(AddParam(*Literal::CreateR1<float>({1, 2, 3}), &b),
+        AddParam(*Literal::CreateR0<float>(0.1), &b), padding_config);
   std::vector<float> expected({0.1, 0.1, 0.1, 1, 0.1, 2, 0.1, 3});
-  ComputeAndCompareR1<float>(&b, expected, {}, ErrorSpec(0.0001));
+  ComputeAndCompareR1<float>(&b, expected, {}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(PadTest, Pad4D_2x0x3x2_FloatArray) {
+XLA_TEST_P(PadTestFloat, Pad4D_2x0x3x2_FloatArray) {
   ComputationBuilder b(client_, TestName());
-  b.Pad(b.ConstantR4FromArray4D<float>(Array4D<float>(2, 0, 3, 2)),
-        b.ConstantR0<float>(1.5), r4_padding_on_dim0_dim1_);
+  b.Pad(AddParam(Array4D<float>(2, 0, 3, 2), &b),
+        AddParam(*Literal::CreateR0<float>(1.5), &b), r4_padding_on_dim0_dim1_);
   ComputeAndCompareR4<float>(&b, Array4D<float>(5, 2, 3, 2, 1.5f), {},
-                             ErrorSpec(0.0001));
+                             DefaultErrorSpec());
 }
 
-TEST_F(PadTest, Pad4DFloat_1x1x3x2_Array) {
+TEST_P(PadTestFloat, Pad4DFloat_1x1x3x2_Array) {
   ComputationBuilder b(client_, TestName());
   auto input = MakeUnique<Array4D<float>>(1, 1, 3, 2);
   Array2D<float> input_xy({
@@ -123,7 +147,7 @@ TEST_F(PadTest, Pad4DFloat_1x1x3x2_Array) {
   });
   input->FillWithYX(input_xy);
 
-  b.Pad(b.ConstantR4FromArray4D<float>(*input), b.ConstantR0<float>(1.5),
+  b.Pad(AddParam(*input, &b), AddParam(*Literal::CreateR0<float>(1.5), &b),
         r4_padding_on_dim0_dim1_);
 
   auto expected = MakeUnique<Array4D<float>>(2, 3, 3, 2);
@@ -134,15 +158,15 @@ TEST_F(PadTest, Pad4DFloat_1x1x3x2_Array) {
   (*expected)(1, 0, 1, 1) = 4.0f;
   (*expected)(1, 0, 2, 0) = 5.0f;
   (*expected)(1, 0, 2, 1) = 6.0f;
-  ComputeAndCompareR4<float>(&b, *expected, {}, ErrorSpec(0.0001));
+  ComputeAndCompareR4<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
-TEST_F(PadTest, Pad4DFloatArrayWithInteriorPadding) {
+TEST_P(PadTestFloat, Pad4DFloatArrayWithInteriorPadding) {
   ComputationBuilder b(client_, TestName());
 
   const float pad_value = 1.5f;
   Array4D<float> input(3, 2, 1, 1, {1, 2, 3, 4, 5, 6});
-  b.Pad(b.ConstantR4FromArray4D<float>(input), b.ConstantR0<float>(pad_value),
+  b.Pad(AddParam(input, &b), AddParam(*Literal::CreateR0<float>(pad_value), &b),
         r4_padding_on_dim0_dim1_);
 
   auto expected = MakeUnique<Array4D<float>>(8, 5, 1, 1);
@@ -156,7 +180,7 @@ TEST_F(PadTest, Pad4DFloatArrayWithInteriorPadding) {
   ComputeAndCompareR4<float>(&b, *expected, {}, ErrorSpec(0.0001));
 }
 
-TEST_F(PadTest, Pad4DFloatArrayMinorFirstSmall) {
+TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstSmall) {
   ComputationBuilder b(client_, TestName());
 
   PaddingConfig padding_config;
@@ -184,7 +208,8 @@ TEST_F(PadTest, Pad4DFloatArrayMinorFirstSmall) {
   auto input = Literal::CreateR4FromArray4D<float>(input_array);
   input = input->Relayout(layout);
 
-  b.Pad(b.ConstantLiteral(*input), b.ConstantR0(pad_value), padding_config);
+  b.Pad(AddParam(*input, &b),
+        AddParam(*Literal::CreateR0<float>(pad_value), &b), padding_config);
 
   Array4D<float> expected_array(1, 1, 5, 8);
   expected_array.Fill(pad_value);
@@ -197,7 +222,7 @@ TEST_F(PadTest, Pad4DFloatArrayMinorFirstSmall) {
   ComputeAndCompareR4<float>(&b, expected_array, {}, ErrorSpec(0.0001));
 }
 
-XLA_TEST_F(PadTest, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) {
+XLA_TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) {
   ComputationBuilder b(client_, TestName());
 
   PaddingConfig padding_config;
@@ -229,7 +254,8 @@ XLA_TEST_F(PadTest, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) {
   auto input = Literal::CreateR4FromArray4D<float>(input_array);
   input = input->Relayout(layout);
 
-  b.Pad(b.ConstantLiteral(*input), b.ConstantR0(pad_value), padding_config);
+  b.Pad(AddParam(*input, &b),
+        AddParam(*Literal::CreateR0<float>(pad_value), &b), padding_config);
 
   Array4D<float> expected_array(1, 25, 17, 11);
   expected_array.Fill(pad_value);
@@ -249,7 +275,7 @@ XLA_TEST_F(PadTest, Pad4DU8Array) {
   });
   input->FillWithYX(input_xy);
 
-  b.Pad(b.ConstantR4FromArray4D<uint8>(*input), b.ConstantR0<uint8>(35),
+  b.Pad(AddParam(*input, &b), b.ConstantR0<uint8>(35),
         r4_padding_on_dim0_dim1_);
 
   auto expected = MakeUnique<Array4D<uint8>>(2, 3, 3, 2);
@@ -277,8 +303,7 @@ XLA_TEST_F(PadTest, Pad4DPredArray) {
   auto ones = MakeUnique<Array4D<int32>>(2, 3, 3, 2);
   zeros->Fill(0);
   ones->Fill(1);
-  b.Select(padded, b.ConstantR4FromArray4D<int32>(*ones),
-           b.ConstantR4FromArray4D<int32>(*zeros));
+  b.Select(padded, AddParam(*ones, &b), AddParam(*zeros, &b));
 
   auto expected = MakeUnique<Array4D<int32>>(2, 3, 3, 2);
   expected->Fill(0);
@@ -291,10 +316,12 @@ XLA_TEST_F(PadTest, Pad4DPredArray) {
   ComputeAndCompareR4<int32>(&b, *expected, {});
 }
 
-XLA_TEST_F(PadTest, Large2DPad) {
+XLA_TEST_P(PadTestFloat, Large2DPad) {
   ComputationBuilder b(client_, TestName());
 
-  auto input = b.Parameter(0, ShapeUtil::MakeShape(F32, {4, 4}), "input");
+  auto ones = MakeUnique<Array2D<float>>(4, 4);
+  ones->Fill(1.0f);
+  auto input = AddParam(*ones, &b);
   PaddingConfig padding_config = MakeNoPaddingConfig(2);
   for (int dim : {0, 1}) {
     padding_config.mutable_dimensions(dim)->set_edge_padding_low(
@@ -302,25 +329,22 @@ XLA_TEST_F(PadTest, Large2DPad) {
     padding_config.mutable_dimensions(dim)->set_edge_padding_high(58 +
                                                                   100 * dim);
   }
-  auto padded = b.Pad(input, b.ConstantR0<float>(0.0f), padding_config);
-
-  auto ones = MakeUnique<Array2D<float>>(4, 4);
-  ones->Fill(1.0f);
-  auto input_literal = Literal::CreateR2FromArray2D<float>(*ones);
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(0.0f), &b),
+                      padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*ones, padding_config, 0.0f);
-  ComputeAndCompareR2<float>(&b, *expected, {input_data.get()});
+  ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(PadTest, AllTypes2DPad) {
+XLA_TEST_P(PadTestFloat, AllTypes2DPad) {
   ComputationBuilder b(client_, TestName());
 
   constexpr int64 in_rows = 35;
   constexpr int64 in_cols = 35;
-  auto input =
-      b.Parameter(0, ShapeUtil::MakeShape(F32, {in_rows, in_cols}), "input");
+  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
+  operand->FillUnique(0.0f);
+  auto input = AddParam(*operand, &b);
+
   PaddingConfig padding_config = MakeNoPaddingConfig(2);
   padding_config.mutable_dimensions(0)->set_edge_padding_low(7);
   padding_config.mutable_dimensions(0)->set_edge_padding_high(5);
@@ -328,20 +352,14 @@ XLA_TEST_F(PadTest, AllTypes2DPad) {
   padding_config.mutable_dimensions(1)->set_edge_padding_low(6);
   padding_config.mutable_dimensions(1)->set_edge_padding_high(4);
   padding_config.mutable_dimensions(1)->set_interior_padding(2);
-  auto padded = b.Pad(input, b.ConstantR0<float>(3.14f), padding_config);
-
-  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
-  operand->FillUnique(0.0f);
-  auto input_literal = Literal::CreateR2FromArray2D<float>(*operand);
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(3.14f), &b),
+                      padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 3.14f);
-  ComputeAndCompareR2<float>(&b, *expected, {input_data.get()},
-                             ErrorSpec{0.0001});
+  ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(PadTest, High2DPad) {
+XLA_TEST_P(PadTestFloat, High2DPad) {
   ComputationBuilder b(client_, TestName());
 
   constexpr int64 in_rows = 129;
@@ -349,8 +367,9 @@ XLA_TEST_F(PadTest, High2DPad) {
   constexpr int64 low_padding = 0;
   int64 high_padding[2] = {5, 7};
   constexpr int64 interior_padding = 0;
-  auto input =
-      b.Parameter(0, ShapeUtil::MakeShape(F32, {in_rows, in_cols}), "input");
+  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
+  operand->FillUnique(1.0f);
+  auto input = AddParam(*operand, &b);
   PaddingConfig padding_config = MakeNoPaddingConfig(2);
   for (int dim : {0, 1}) {
     padding_config.mutable_dimensions(dim)->set_edge_padding_low(low_padding);
@@ -359,20 +378,15 @@ XLA_TEST_F(PadTest, High2DPad) {
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         interior_padding);
   }
-  auto padded = b.Pad(input, b.ConstantR0<float>(2.718f), padding_config);
+  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b),
+                      padding_config);
 
-  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
-  operand->FillUnique(1.0f);
-  auto input_literal = Literal::CreateR2FromArray2D<float>(*operand);
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
 
-  ComputeAndCompareR2<float>(&b, *expected, {input_data.get()},
-                             ErrorSpec(0.0001));
+  ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(PadTest, NegativePadding2D) {
+XLA_TEST_P(PadTestFloat, NegativePadding2D) {
   ComputationBuilder b(client_, TestName());
 
   constexpr int64 in_rows = 129;
@@ -380,8 +394,9 @@ XLA_TEST_F(PadTest, NegativePadding2D) {
   int64 low_padding[2] = {-1, -2};
   int64 high_padding[2] = {-3, 4};
   constexpr int64 interior_padding = 0;
-  auto input =
-      b.Parameter(0, ShapeUtil::MakeShape(F32, {in_rows, in_cols}), "input");
+  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
+  operand->FillUnique(1.0f);
+  auto input = AddParam(*operand, &b);
   PaddingConfig padding_config = MakeNoPaddingConfig(2);
   for (int dim : {0, 1}) {
     padding_config.mutable_dimensions(dim)->set_edge_padding_low(
@@ -391,20 +406,15 @@ XLA_TEST_F(PadTest, NegativePadding2D) {
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         interior_padding);
   }
-  auto padded = b.Pad(input, b.ConstantR0<float>(2.718f), padding_config);
+  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b),
+                      padding_config);
 
-  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
-  operand->FillUnique(1.0f);
-  auto input_literal = Literal::CreateR2FromArray2D<float>(*operand);
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
 
-  ComputeAndCompareR2<float>(&b, *expected, {input_data.get()},
-                             ErrorSpec(0.0001));
+  ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(PadTest, NegativeAndInteriorPadding2D) {
+XLA_TEST_P(PadTestFloat, NegativeAndInteriorPadding2D) {
   ComputationBuilder b(client_, TestName());
 
   constexpr int64 in_rows = 8;
@@ -412,8 +422,9 @@ XLA_TEST_F(PadTest, NegativeAndInteriorPadding2D) {
   int64 low_padding[2] = {4, -1};
   int64 high_padding[2] = {-2, -4};
   int64 interior_padding[2] = {1, 2};
-  auto input =
-      b.Parameter(0, ShapeUtil::MakeShape(F32, {in_rows, in_cols}), "input");
+  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
+  operand->FillUnique(1.0f);
+  auto input = AddParam(*operand, &b);
   PaddingConfig padding_config = MakeNoPaddingConfig(2);
   for (int dim : {0, 1}) {
     padding_config.mutable_dimensions(dim)->set_edge_padding_low(
@@ -423,44 +434,40 @@ XLA_TEST_F(PadTest, NegativeAndInteriorPadding2D) {
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         interior_padding[dim]);
   }
-  auto padded = b.Pad(input, b.ConstantR0<float>(2.718f), padding_config);
+  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b),
+                      padding_config);
 
-  auto operand = MakeUnique<Array2D<float>>(in_rows, in_cols);
-  operand->FillUnique(1.0f);
-  auto input_literal = Literal::CreateR2FromArray2D<float>(*operand);
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
 
-  ComputeAndCompareR2<float>(&b, *expected, {input_data.get()},
-                             ErrorSpec(0.0001));
+  ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
 // Regression test for b/31827337.
-XLA_TEST_F(PadTest, ReducePad) {
+XLA_TEST_P(PadTestFloat, ReducePad) {
   ComputationBuilder b(client_, TestName());
-  auto input = b.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2, 2, 2}), "input");
+  auto ones = MakeUnique<Array4D<float>>(2, 2, 2, 2);
+  ones->Fill(1.0);
+  auto input = AddParam(*ones, &b);
 
-  Computation add_f32 = CreateScalarAddComputation(F32, &b);
-  auto reduce = b.Reduce(input, b.ConstantR0<float>(0.0), add_f32, {0});
+  Computation add = CreateScalarAddComputation(FloatType(), &b);
+  auto reduce =
+      b.Reduce(input, AddParam(*Literal::CreateR0<float>(0.0), &b), add, {0});
 
   PaddingConfig padding_config = MakeNoPaddingConfig(3);
   padding_config.mutable_dimensions(0)->set_edge_padding_low(1);
   padding_config.mutable_dimensions(0)->set_edge_padding_high(1);
-  auto pad = b.Pad(reduce, b.ConstantR0<float>(0.0), padding_config);
-
-  auto ones = MakeUnique<Array4D<float>>(2, 2, 2, 2);
-  ones->Fill(1.0);
-  auto input_literal = Literal::CreateR4FromArray4D<float>(*ones);
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+  auto padded = b.Pad(reduce, AddParam(*Literal::CreateR0<float>(0.0f), &b),
+                      padding_config);
 
   Array3D<float> expected({{{0.0, 0.0}, {0.0, 0.0}},
                            {{2.0, 2.0}, {2.0, 2.0}},
                            {{2.0, 2.0}, {2.0, 2.0}},
                            {{0.0, 0.0}, {0.0, 0.0}}});
-  ComputeAndCompareR3<float>(&b, expected, {input_data.get()});
+  ComputeAndCompareR3<float>(&b, expected, {}, DefaultErrorSpec());
 }
 
+INSTANTIATE_TEST_CASE_P(PadTestFloatInstantiation, PadTestFloat,
+                        ::testing::ValuesIn(use_bfloat16_params));
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/params_test.cc b/tensorflow/compiler/xla/tests/params_test.cc
index b7f62b8aa167b2d9ef1bb2fa83af5aaeda1d6652..bb7e800df84121f2045141bc366c34b94ba694ea 100644
--- a/tensorflow/compiler/xla/tests/params_test.cc
+++ b/tensorflow/compiler/xla/tests/params_test.cc
@@ -334,10 +334,109 @@ XLA_TEST_F(ParamsTest, DISABLED_ON_CPU(DISABLED_ON_GPU(
   ComputeAndCompareTuple(&builder, *Literal::MakeTuple(ptrs), param_data);
 }
 
+// Test large number of parameters flowing into a while-loop.
+// Construct conceptually the following HLO graph:
+//
+// p0 = parameter(0)
+// p1 = parameter(1)
+// ...
+// pN = parameter(N)
+// result = while (false) {
+//   p0 += (1, 1);
+//   p1 += (1, 1);
+//   ...
+//   pN += (1, 1)
+// }
+// result = {p0, p1, ..., pN}
+//
+// TODO(b/70173746): Times out during compilation on GPU and CPU backends as of
+// 2017-12-12.
+XLA_TEST_F(ParamsTest,
+           DISABLED_ON_CPU(DISABLED_ON_GPU(ManyParametersIntoWhileLoop))) {
+  ComputationBuilder builder(client_, TestName());
+
+  std::vector<std::unique_ptr<GlobalData>> param_data_owner;
+  constexpr int kParamCount = 1900;
+  std::vector<ComputationDataHandle> params;
+  std::vector<Shape> parameter_shapes;
+  for (int i = 0; i < kParamCount; ++i) {
+    std::unique_ptr<Literal> literal = Literal::CreateR1<int32>({i, i});
+    param_data_owner.push_back(
+        std::move(client_->TransferToServer(*literal)).ValueOrDie());
+    ComputationDataHandle param =
+        builder.Parameter(i, literal->shape(), "param");
+    params.push_back(param);
+    parameter_shapes.push_back(literal->shape());
+  }
+
+  // Add bool parameter for the loop condition. Use a parameter HLO instead of a
+  // constant because DCE may eliminate the while-body otherwise.
+  std::unique_ptr<Literal> bool_literal = Literal::CreateR0<bool>(false);
+  param_data_owner.push_back(
+      std::move(client_->TransferToServer(*bool_literal)).ValueOrDie());
+  ComputationDataHandle bool_param =
+      builder.Parameter(kParamCount, bool_literal->shape(), "bool_param");
+  params.push_back(bool_param);
+  parameter_shapes.push_back(bool_literal->shape());
+
+  auto init = builder.Tuple(params);
+
+  // Create a computation for the condition: while(bool_param).
+  Shape while_shape = ShapeUtil::MakeTupleShape(parameter_shapes);
+  Computation condition;
+  {
+    ComputationBuilder builder(client_, "condition");
+    auto condition_parameter =
+        builder.Parameter(0, while_shape, "condition_parameter");
+    builder.GetTupleElement(condition_parameter, kParamCount);
+    condition = builder.Build().ConsumeValueOrDie();
+  }
+
+  // Create a computation for the body.
+  // Add {1, 1} to the each tuple element.
+  Computation body;
+  {
+    ComputationBuilder builder(client_, "body");
+    auto body_parameter = builder.Parameter(0, while_shape, "body_parameter");
+    std::vector<ComputationDataHandle> updates;
+    for (int i = 0; i < kParamCount; ++i) {
+      auto add = builder.Add(builder.GetTupleElement(body_parameter, i),
+                             builder.ConstantR1<int32>({1, 1}));
+      updates.push_back(add);
+    }
+    // Add bool parameter.
+    updates.push_back(builder.GetTupleElement(body_parameter, kParamCount));
+
+    builder.Tuple(updates);
+    body = builder.Build().ConsumeValueOrDie();
+  }
+
+  auto loop = builder.While(condition, body, init);
+
+  std::vector<ComputationDataHandle> outputs;
+  for (int i = 0; i < kParamCount; ++i) {
+    outputs.push_back(builder.GetTupleElement(loop, i));
+  }
+  builder.Tuple(outputs);
+
+  std::vector<GlobalData*> param_data;
+  param_data.reserve(param_data_owner.size());
+  for (const std::unique_ptr<GlobalData>& data : param_data_owner) {
+    param_data.push_back(data.get());
+  }
+
+  std::vector<std::unique_ptr<Literal>> elements;
+  std::vector<const Literal*> ptrs;
+  for (int i = 0; i < kParamCount; ++i) {
+    elements.push_back(Literal::CreateR1<int32>({i, i}));
+    ptrs.push_back(elements.back().get());
+  }
+  ComputeAndCompareTuple(&builder, *Literal::MakeTuple(ptrs), param_data);
+}
+
 #endif
 
-XLA_TEST_F(ParamsTest,
-           DISABLED_ON_CPU_PARALLEL(TupleOfR1ParametersAddedTogether)) {
+XLA_TEST_F(ParamsTest, TupleOfR1ParametersAddedTogether) {
   ComputationBuilder builder(client_, TestName());
 
   Shape r1f32_3 = ShapeUtil::MakeShape(F32, {3});
@@ -363,10 +462,8 @@ XLA_TEST_F(ParamsTest,
 // Verifies that passing a 2x2 with {0, 1} layout returns the same value back
 // when (transferred to the server and) passed through a parameter.
 XLA_TEST_F(ParamsTest, R2_2x2_Layout_01) {
-  std::unique_ptr<Literal> literal = Literal::CreateR2<float>({
-      {1, 2}, {3, 4},
-  });
-  *literal->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
+  std::unique_ptr<Literal> literal = Literal::CreateR2WithLayout<float>(
+      {{1, 2}, {3, 4}}, LayoutUtil::MakeLayout({0, 1}));
   ComputationBuilder builder(client_, TestName());
   builder.Parameter(0, literal->shape(), "input");
 
@@ -377,10 +474,8 @@ XLA_TEST_F(ParamsTest, R2_2x2_Layout_01) {
 
 // As above, but for {1, 0} layout.
 XLA_TEST_F(ParamsTest, R2_2x2_Layout_10) {
-  std::unique_ptr<Literal> literal = Literal::CreateR2<float>({
-      {1, 3}, {2, 4},
-  });
-  *literal->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({1, 0});
+  std::unique_ptr<Literal> literal = Literal::CreateR2WithLayout<float>(
+      {{1, 3}, {2, 4}}, LayoutUtil::MakeLayout({1, 0}));
   ComputationBuilder builder(client_, TestName());
   builder.Parameter(0, literal->shape(), "input");
 
@@ -401,7 +496,7 @@ XLA_TEST_F(ParamsTest, R2_2x2_TryToPassReverseLayoutToParameter) {
         original.layout().minor_to_major().begin(),
         original.layout().minor_to_major().end());
     std::reverse(original_layout.begin(), original_layout.end());
-    *literal->mutable_shape()->mutable_layout() =
+    *literal->mutable_shape_do_not_use()->mutable_layout() =
         LayoutUtil::MakeLayout(original_layout);
     ASSERT_EQ(2, literal->Get<float>({0, 1}));
   }
diff --git a/tensorflow/compiler/xla/tests/prng_test.cc b/tensorflow/compiler/xla/tests/prng_test.cc
index 209f063cc5a34648453d12deae79f261b95dc3b4..6aafb9fa6cb2175c478f0e9a5e16f5808cbea590 100644
--- a/tensorflow/compiler/xla/tests/prng_test.cc
+++ b/tensorflow/compiler/xla/tests/prng_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <limits>
 #include <memory>
 
 #include "tensorflow/compiler/xla/client/computation_builder.h"
@@ -36,65 +37,42 @@ namespace {
 class PrngTest : public ClientLibraryTestBase {
  protected:
   template <typename T>
-  void UniformTest(T a, T b, tensorflow::gtl::ArraySlice<int64> dims);
-  void BernoulliTest(float p, tensorflow::gtl::ArraySlice<int64> dims);
+  std::unique_ptr<Literal> UniformTest(T a, T b,
+                                       tensorflow::gtl::ArraySlice<int64> dims,
+                                       int64 seed = 42);
 
   // Computes the χ² statistic of a sample of the discrete uniform distribution
   // of the given range size. `expected_count` is the number of times each
   // possible value is expected to be generated. Thus, the sample size is
   // `range_size * expected_count`.
-  double UniformChiSquared(int32 range_size, int32 expected_count);
+  double UniformChiSquared(int32 range_size, int32 expected_count,
+                           int64 seed = 42);
 };
 
 template <typename T>
-void PrngTest::UniformTest(T a, T b, tensorflow::gtl::ArraySlice<int64> dims) {
+std::unique_ptr<Literal> PrngTest::UniformTest(
+    T a, T b, tensorflow::gtl::ArraySlice<int64> dims, int64 seed) {
   ComputationBuilder builder(client_, TestName());
   builder.RngUniform(
       builder.ConstantR0<T>(a), builder.ConstantR0<T>(b),
       ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<T>(), dims));
 
-  SetSeed(42);
+  SetSeed(seed);
   auto actual = ExecuteAndTransferOrDie(&builder, /*arguments=*/{});
   EXPECT_THAT(dims, ::testing::ElementsAreArray(actual->shape().dimensions()));
   actual->EachCell<T>([=](tensorflow::gtl::ArraySlice<int64>, T value) {
     EXPECT_LE(a, value);
     EXPECT_LT(value, b);
   });
-}
-
-void PrngTest::BernoulliTest(float p, tensorflow::gtl::ArraySlice<int64> dims) {
-  ComputationBuilder builder(client_, TestName());
-  auto shape = ShapeUtil::MakeShape(U32, dims);
-  builder.RngBernoulli(builder.ConstantR0<float>(p), shape);
-
-  TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build());
-  ExecutionOptions execution_options = execution_options_;
-  execution_options.set_seed(42);
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto actual, client_->ExecuteAndTransfer(computation, /*arguments=*/{},
-                                               &execution_options));
-  EXPECT_THAT(dims, ::testing::ElementsAreArray(actual->shape().dimensions()));
-  int32 sum = 0;
-  actual->EachCell<uint32>(
-      [&sum](tensorflow::gtl::ArraySlice<int64>, uint32 value) {
-        EXPECT_TRUE(value == 0 || value == 1);
-        sum += value;
-      });
-  int32 total = ShapeUtil::ElementsIn(shape);
-  float p_tilde = sum / static_cast<float>(total);
-
-  // Test within expected range using normal approximation. The test uses a
-  // fixed seed and has a fixed output per p and backend. Using the normal
-  // approximation as this test is invoked for different `p` and the different
-  // backends could use different random number generators and produce different
-  // values. Choose 95% confidence level, so that z_{1-\alpha/2} = 1.96.
-  float normal_approximation_term = 1.96 * sqrt(p * (1 - p) / total);
-  EXPECT_GE(p_tilde, p - normal_approximation_term);
-  EXPECT_LE(p_tilde, p + normal_approximation_term);
+  return actual;
 }
 
 // Uniform random number generation tests
 XLA_TEST_F(PrngTest, ScalarU01) { UniformTest<float>(0, 1, {}); }
+XLA_TEST_F(PrngTest, ScalarU01limits) {
+  UniformTest<float>(std::numeric_limits<float>::min(),
+                     std::numeric_limits<float>::max(), {});
+}
 XLA_TEST_F(PrngTest, ZeroValuesU01) { UniformTest<float>(0, 1, {0}); }
 XLA_TEST_F(PrngTest, TenValuesU01) { UniformTest<float>(0, 1, {10}); }
 XLA_TEST_F(PrngTest, TenValuesU37) { UniformTest<float>(3, 7, {10}); }
@@ -102,6 +80,56 @@ XLA_TEST_F(PrngTest, ZeroValuesR2) { UniformTest<float>(0, 1, {0, 20}); }
 XLA_TEST_F(PrngTest, LargeU01) { UniformTest<float>(0, 1, {0x100, 0x100}); }
 XLA_TEST_F(PrngTest, TwelveValuesU524) { UniformTest<int32>(5, 24, {12}); }
 
+// TODO(b/71543667): Fix Rng ops on LLVM backends.
+XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU_PARALLEL(
+                         DISABLED_ON_CPU(ScalarBF16Tests)))) {
+  for (int64 seed = 0; seed < 100; ++seed) {
+    // The largest negative number smaller than zero in bf16 that's not
+    // denormalized.
+    int32 low_raw = 0x80800000;
+    const float low = reinterpret_cast<const float&>(low_raw);
+    float high = 0.0f;
+    UniformTest<bfloat16>(static_cast<bfloat16>(low),
+                          static_cast<bfloat16>(high), {}, /*seed=*/seed);
+
+    // Test odd and even values.
+    UniformTest<bfloat16>(static_cast<bfloat16>(32.75),
+                          static_cast<bfloat16>(33), {}, /*seed=*/seed);
+    UniformTest<bfloat16>(static_cast<bfloat16>(32.50),
+                          static_cast<bfloat16>(32.75), {}, /*seed=*/seed);
+    UniformTest<bfloat16>(static_cast<bfloat16>(-33.00),
+                          static_cast<bfloat16>(-32.75), {}, /*seed=*/seed);
+    UniformTest<bfloat16>(static_cast<bfloat16>(-32.75),
+                          static_cast<bfloat16>(-32.50), {}, /*seed=*/seed);
+  }
+}
+
+// TODO(b/71543667): Fix Rng ops on LLVM backends.
+XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU(
+                         DISABLED_ON_CPU_PARALLEL(ScalarBF16CountTests)))) {
+  // There are 3 BF16 values in the range of [32.25, 33): 32.25, 32.5, 32.75,
+  // they should get similar counts.
+  bfloat16 low = static_cast<bfloat16>(32.25);
+  bfloat16 high = static_cast<bfloat16>(33);
+  bfloat16 interval = static_cast<bfloat16>(0.25);
+  std::vector<int32> counts(static_cast<int64>((high - low) / interval), 0);
+
+  constexpr int64 count = 100;
+  for (int64 seed = 0; seed < count; ++seed) {
+    auto result = UniformTest<bfloat16>(low, high, {}, /*seed=*/seed);
+    result->Literal::EachCell<bfloat16>(
+        [&](tensorflow::gtl::ArraySlice<int64>, bfloat16 value) {
+          int64 index = static_cast<int64>((value - low) / interval);
+          counts[index]++;
+        });
+  }
+  // Each bucket should have similar amount of counts. That is, not more than
+  // 10% of total counts. This mostly tests that we don't fall into a 1:2:2
+  // distribution, which yields 20% expected difference.
+  EXPECT_LT(std::abs(counts[0] - counts[1]), count * 0.1);
+  EXPECT_LT(std::abs(counts[1] - counts[2]), count * 0.1);
+}
+
 namespace {
 template <typename T>
 T Square(T x) {
@@ -109,7 +137,8 @@ T Square(T x) {
 }
 }  // namespace
 
-double PrngTest::UniformChiSquared(int32 range_size, int32 expected_count) {
+double PrngTest::UniformChiSquared(int32 range_size, int32 expected_count,
+                                   int64 seed) {
   int32 sample_size = range_size * expected_count;
 
   ComputationBuilder builder(client_, TestName());
@@ -117,7 +146,7 @@ double PrngTest::UniformChiSquared(int32 range_size, int32 expected_count) {
                      builder.ConstantR0<int32>(range_size),
                      ShapeUtil::MakeShape(S32, {sample_size}));
 
-  SetSeed(42);
+  SetSeed(seed);
   auto actual = ExecuteAndTransferOrDie(&builder, /*arguments=*/{});
   std::vector<int32> counts(range_size, 0);
   actual->EachCell<int32>([&counts](tensorflow::gtl::ArraySlice<int64>,
@@ -181,10 +210,12 @@ XLA_TEST_F(PrngTest, MapUsingRng) {
                        computation,
                        /*arguments=*/{param0_data.get()}, &execution_options));
 
-  EXPECT_EQ(actual->f32s_size(), param0_literal->f32s_size());
-  for (int i = 0; i < param0_literal->f32s_size(); ++i) {
-    EXPECT_GE(actual->f32s(i), param0_literal->f32s(i));
-    EXPECT_LT(actual->f32s(i), param0_literal->f32s(i) + 1.0f);
+  EXPECT_EQ(ShapeUtil::ElementsIn(actual->shape()),
+            ShapeUtil::ElementsIn(param0_literal->shape()));
+  for (int i = 0; i < ShapeUtil::ElementsIn(actual->shape()); ++i) {
+    EXPECT_GE(actual->data<float>()[i], param0_literal->data<float>()[i]);
+    EXPECT_LT(actual->data<float>()[i],
+              param0_literal->data<float>()[i] + 1.0f);
   }
 }
 
@@ -250,10 +281,6 @@ XLA_TEST_F(PrngTest, PassInGlobalRngSeed) {
   LiteralTestUtil::ExpectNotEqual(*result5, *result6);
 }
 
-// Bernoulli random number generation tests
-XLA_TEST_F(PrngTest, HundredValuesB10p5) { BernoulliTest(0.5, {100}); }
-XLA_TEST_F(PrngTest, HundredValuesB10p1) { BernoulliTest(0.1, {100}); }
-
 XLA_TEST_F(PrngTest, TenValuesN01) {
   ComputationBuilder builder(client_, TestName());
   builder.RngNormal(builder.ConstantR0<float>(0), builder.ConstantR0<float>(1),
diff --git a/tensorflow/compiler/xla/tests/reduce_hlo_test.cc b/tensorflow/compiler/xla/tests/reduce_hlo_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c0a2c0ca4cb8414e0771a541b9f963f9aedc8376
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/reduce_hlo_test.cc
@@ -0,0 +1,132 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <array>
+
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+// Tests the Reduce HLO in ways that can't be done using the ComputationBuilder
+// API.
+
+namespace xla {
+namespace {
+
+namespace str_util = tensorflow::str_util;
+namespace strings = tensorflow::strings;
+
+struct ReduceLayout {
+  std::array<int64, 4> input_minor_to_major;
+  std::array<int64, 3> output_minor_to_major;
+
+  string ToString() const {
+    return strings::StrCat(str_util::Join(input_minor_to_major, "x"), "_",
+                           str_util::Join(output_minor_to_major, "x"));
+  }
+};
+
+string PrintReduceLayout(
+    ::testing::TestParamInfo<ReduceLayout> reduce_layout_param) {
+  return reduce_layout_param.param.ToString();
+}
+
+void PrintTo(const ReduceLayout& reduce_layout, ::std::ostream* os) {
+  *os << reduce_layout.ToString();
+}
+
+class ReduceWithLayoutTest
+    : public HloTestBase,
+      public ::testing::WithParamInterface<ReduceLayout> {};
+
+StatusOr<std::unique_ptr<HloModule>> GetParsedModule() {
+  const char* const hlo_string = R"(
+HloModule BadReduce
+
+Sum {
+  x.1 = f32[] parameter(0)
+  y.1 = f32[] parameter(1)
+  ROOT add.1 = f32[] add(x.1, y.1)
+}
+
+ENTRY reduce.1 {
+  parameter = f32[2,2,2,3]{3,2,1,0} parameter(0)
+  init_value = f32[] constant(0)
+  reduce = f32[2,2,3]{2,1,0} reduce(parameter, init_value), dimensions={1}, to_apply=Sum
+  ROOT copy = f32[2,2,3]{2,1,0} copy(reduce)
+}
+)";
+
+  return tools::Parse(hlo_string);
+}
+
+// TODO(b/72454718): XLA:GPU does not support executing code compiled without
+// optimizations.
+XLA_TEST_P(ReduceWithLayoutTest, DISABLED_ON_GPU(Reduce)) {
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module, GetParsedModule());
+  HloInstruction* reduce_instruction =
+      module->entry_computation()->root_instruction()->mutable_operand(0);
+  ASSERT_EQ(reduce_instruction->opcode(), HloOpcode::kReduce);
+
+  const ReduceLayout& reduce_layout = GetParam();
+
+  Shape* reduce_output_shape = reduce_instruction->mutable_shape();
+  *reduce_output_shape->mutable_layout() =
+      LayoutUtil::MakeLayout(reduce_layout.output_minor_to_major);
+
+  Shape* reduce_input_shape =
+      reduce_instruction->mutable_operand(0)->mutable_shape();
+  *reduce_input_shape->mutable_layout() =
+      LayoutUtil::MakeLayout(reduce_layout.input_minor_to_major);
+
+  std::unique_ptr<Literal> reduce_input =
+      Literal::CreateR4<float>({{ /*i0=0*/
+                                 {/*i1=0*/
+                                  {-0.246092796, -0.179497838, -0.161181688},
+                                  {-0.151643038, -0.240213156, -0.198156}},
+                                 {/*i1=1*/
+                                  {-0.14222312, -0.162200093, -0.193907976},
+                                  {-0.239411, -0.198166847, -0.172471642}}},
+                                { /*i0=1*/
+                                 {/*i1=0*/
+                                  {-0.22965157, -0.218723893, -0.129257083},
+                                  {-0.188762426, -0.16123569, -0.181166649}},
+                                 {/*i1=1*/
+                                  {-0.241772294, -0.245131493, -0.160247207},
+                                  {-0.179881215, -0.23383224, -0.121976733}}}});
+
+  EXPECT_TRUE(RunAndCompareNoHloPasses(std::move(module), ErrorSpec(1e-5)));
+}
+
+INSTANTIATE_TEST_CASE_P(ReduceWithLayoutTest_Instantiation,
+                        ReduceWithLayoutTest,
+                        ::testing::Values(                           //
+                            ReduceLayout{{3, 2, 1, 0}, {0, 1, 2}},   //
+                            ReduceLayout{{3, 2, 1, 0}, {0, 2, 1}},   //
+                            ReduceLayout{{3, 2, 1, 0}, {1, 2, 0}},   //
+                            ReduceLayout{{3, 2, 1, 0}, {1, 0, 2}},   //
+                            ReduceLayout{{3, 2, 1, 0}, {2, 0, 1}},   //
+                            ReduceLayout{{3, 2, 1, 0}, {2, 1, 0}},   //
+                            ReduceLayout{{3, 1, 2, 0}, {1, 2, 0}},   //
+                            ReduceLayout{{1, 2, 3, 0}, {1, 0, 2}},   //
+                            ReduceLayout{{0, 2, 1, 3}, {2, 0, 1}}),  //
+                        PrintReduceLayout);
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/reduce_precision_test.cc b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
index 4756ba096896806ece8fe35d18c4eaef041b8830..dc7ce3253cee255a7949326fa5b49fc8917432b8 100644
--- a/tensorflow/compiler/xla/tests/reduce_precision_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
@@ -249,7 +249,9 @@ INSTANTIATE_TEST_CASE_P(ReducePrecisionAccuracyTest,
 // ReducePrecisionInsertion passes.
 class ReducePrecisionInsertionTest : public ClientLibraryTestBase {};
 
-XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionBeforeFusion) {
+// The interpreter has no fusion pass, so skip this test.
+XLA_TEST_F(ReducePrecisionInsertionTest,
+           DISABLED_ON_INTERPRETER(ReducePrecisionBeforeFusion)) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
@@ -276,7 +278,9 @@ XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionBeforeFusion) {
   ComputeAndCompareR1<float>(&builder, {0.0f}, {a_data.get()});
 }
 
-XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedAfterFusion) {
+// The interpreter has no fusion pass, so skip this test.
+XLA_TEST_F(ReducePrecisionInsertionTest,
+           DISABLED_ON_INTERPRETER(ReducePrecisionSkippedAfterFusion)) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
@@ -300,7 +304,9 @@ XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedAfterFusion) {
   ComputeAndCompareR1<float>(&builder, {-1.00001f}, {a_data.get()});
 }
 
-XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionAddedAfterFusion) {
+// The interpreter has no fusion pass, so skip this test.
+XLA_TEST_F(ReducePrecisionInsertionTest,
+           DISABLED_ON_INTERPRETER(ReducePrecisionAddedAfterFusion)) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
@@ -322,7 +328,9 @@ XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionAddedAfterFusion) {
   ComputeAndCompareR1<float>(&builder, {-1.0f}, {a_data.get()});
 }
 
-XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedFusionContains) {
+// The interpreter has no fusion pass, so skip this test.
+XLA_TEST_F(ReducePrecisionInsertionTest,
+           DISABLED_ON_INTERPRETER(ReducePrecisionSkippedFusionContains)) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
@@ -345,7 +353,9 @@ XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionSkippedFusionContains) {
   ComputeAndCompareR1<float>(&builder, {-1.00001f}, {a_data.get()});
 }
 
-XLA_TEST_F(ReducePrecisionInsertionTest, ReducePrecisionAddedFusionContains) {
+// The interpreter has no fusion pass, so skip this test.
+XLA_TEST_F(ReducePrecisionInsertionTest,
+           DISABLED_ON_INTERPRETER(ReducePrecisionAddedFusionContains)) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index 7bc3185c367f076c9a7d211c9799557e1a91d92f..50d7b5074d201d2292cf90224ef4cd37efdbb8d3 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -143,6 +143,55 @@ class ReduceTest : public ClientLibraryTestBase {
     ComputeAndCompareR0<bool>(&builder, expected, {input_global_data.get()});
   }
 
+  // Reduce predicate tensor with dimension rows * cols to dimension cols, to
+  // test the implementation of atomic operations on misaligned small data
+  // types.
+  template <int64 cols>
+  void RunR2ToR1PredTest(bool and_reduce, int64 rows, int64 minor = 1,
+                         int64 major = 0) {
+    ComputationBuilder builder(client_, TestName());
+    const Shape input_shape = ShapeUtil::MakeShape(U8, {rows, cols});
+    auto input = builder.Parameter(0, input_shape, "input");
+    auto input_pred = builder.Eq(input, builder.ConstantR0<uint8>(1));
+
+    ComputationDataHandle init_value;
+    Computation reduce_op;
+    if (and_reduce) {
+      init_value = builder.ConstantR0<bool>(true);
+      reduce_op = CreateScalarAndComputation(&builder);
+    } else {
+      init_value = builder.ConstantR0<bool>(false);
+      reduce_op = CreateScalarOrComputation(&builder);
+    }
+
+    builder.Reduce(input_pred, init_value, reduce_op,
+                   /*dimensions_to_reduce=*/{0});
+
+    Array2D<uint8> input_data(rows, cols);
+    input_data.FillRandom(0, 1);
+    std::unique_ptr<Literal> input_literal =
+        Literal::CreateR2FromArray2D(input_data);
+    input_literal =
+        input_literal->Relayout(LayoutUtil::MakeLayout({minor, major}));
+    std::unique_ptr<GlobalData> input_global_data =
+        client_->TransferToServer(*input_literal).ConsumeValueOrDie();
+
+    std::array<bool, cols> expected;
+    for (int64 colno = 0; colno < cols; ++colno) {
+      bool column_sum = and_reduce ? true : false;
+      for (int64 rowno = 0; rowno < rows; ++rowno) {
+        if (and_reduce) {
+          column_sum = column_sum && input_data(rowno, colno);
+        } else {
+          column_sum = column_sum || input_data(rowno, colno);
+        }
+      }
+      expected[colno] = column_sum;
+    }
+
+    ComputeAndCompareR1<bool>(&builder, expected, {input_global_data.get()});
+  }
+
   // Runs an R2 => R0 reduction test with the given number of (rows, cols).
   void RunR2ToR0Test(int64 rows, int64 cols, int64 minor = 1, int64 major = 0) {
     ComputationBuilder builder(client_, TestName());
@@ -352,15 +401,13 @@ XLA_TEST_F(ReduceTest, ReduceR2_111x50_01_To_R1) {
 XLA_TEST_F(ReduceTest, ReduceR2_1024x1024_To_R1) { RunR2ToR1Test(1024, 1024); }
 XLA_TEST_F(ReduceTest, ReduceR2_1000x1500_To_R1) { RunR2ToR1Test(1000, 1500); }
 
-// TODO(b/34969189): Invalid CAS generated on GPU.
-XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(AndReduceAllOnesR1_10_Pred)) {
+XLA_TEST_F(ReduceTest, AndReduceAllOnesR1_10_Pred) {
   constexpr int element_count = 10;
   std::vector<int> input(element_count, 1);
   RunR1ToR0PredTest(/*and_reduce=*/true, input);
 }
 
-// TODO(b/34969189): Invalid CAS generated on GPU.
-XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(AndReduceOnesAndZerosR1_10_Pred)) {
+XLA_TEST_F(ReduceTest, AndReduceOnesAndZerosR1_10_Pred) {
   constexpr int element_count = 10;
   std::vector<int> input(element_count);
   for (int i = 0; i < element_count; ++i) {
@@ -369,15 +416,13 @@ XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(AndReduceOnesAndZerosR1_10_Pred)) {
   RunR1ToR0PredTest(/*and_reduce=*/true, input);
 }
 
-// TODO(b/34969189): Invalid CAS generated on GPU.
-XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(OrReduceAllOnesR1_10_Pred)) {
+XLA_TEST_F(ReduceTest, OrReduceAllOnesR1_10_Pred) {
   constexpr int element_count = 10;
   std::vector<int> input(element_count, 1);
   RunR1ToR0PredTest(/*and_reduce=*/false, input);
 }
 
-// TODO(b/34969189): Invalid CAS generated on GPU.
-XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(OrReduceOnesAndZerosR1_10_Pred)) {
+XLA_TEST_F(ReduceTest, OrReduceOnesAndZerosR1_10_Pred) {
   constexpr int element_count = 10;
   std::vector<int> input(element_count);
   for (int i = 0; i < element_count; ++i) {
@@ -449,6 +494,26 @@ XLA_TEST_F(ReduceTest, TransposeAndReduceElementwiseR2_111x50_To_R1) {
                              ErrorSpec(0.01, 1e-4));
 }
 
+// Test that algebraic simplifier does not incorrectly fold a transpose into a
+// reduction operation.
+XLA_TEST_F(ReduceTest, TransposeAndReduceR3_12x111x50_To_R2) {
+  ComputationBuilder builder(client_, TestName());
+  Computation add_f32 = CreateScalarAddComputation(F32, &builder);
+  const Shape input_shape = ShapeUtil::MakeShape(F32, {12, 111, 50});
+  ComputationDataHandle input = builder.Parameter(0, input_shape, "input");
+  ComputationDataHandle zero = builder.ConstantR0<float>(0.0);
+  ComputationDataHandle transpose =
+      builder.Transpose(input, /*permutation=*/{1, 0, 2});
+  ComputationDataHandle reduce =
+      builder.Reduce(transpose, zero, add_f32, /*dimensions_to_reduce=*/{0});
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> input_data,
+                          MakeFakeLiteral(input_shape));
+
+  ComputeAndCompare(&builder, reduce, {std::move(*input_data)},
+                    ErrorSpec(0.01, 1e-4));
+}
+
 XLA_TEST_F(ReduceTest, Reshape_111x2x25Reduce_111x50_To_R1) {
   const int64 rows = 111, cols = 50;
 
@@ -812,5 +877,12 @@ XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(OperationOnConstantAsInitValue)) {
   ComputeAndCompareR0<float>(&builder, 4.0f, {b_data.get()});
 }
 
+XLA_TEST_F(ReduceTest, ReduceAndPredR2_128x64_To_R1) {
+  RunR2ToR1PredTest</*cols=64*/ 64>(/*and_reduce=true*/ true, /*rows=128*/ 128);
+}
+XLA_TEST_F(ReduceTest, ReduceOrPredR2_64x32_To_R1) {
+  RunR2ToR1PredTest</*cols=32*/ 32>(/*and_reduce=false*/ false, /*rows=64*/ 64);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 0601a1466bd87ab721443e0da725006e2d73e392..b11b64e40a582150d6adf29e915cd70b4bcb982b 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -41,16 +41,40 @@ limitations under the License.
 namespace xla {
 namespace {
 
-class ReduceWindowTest : public ClientLibraryTestBase {
+#ifdef XLA_BACKEND_SUPPORTS_BFLOAT16
+// Tests both F32 and BF16.
+static std::array<bool, 2> use_bfloat16_params{false, true};
+#else
+// Only tests F32.
+static std::array<bool, 1> use_bfloat16_params{false};
+#endif
+
+class ReduceWindowTestBase : public ClientLibraryTestBase {
  public:
-  ReduceWindowTest() : builder_(client_, TestName()) {}
+  ErrorSpec DefaultErrorSpec() const {
+    if (use_bfloat16()) {
+      return ErrorSpec(1e-1, 5e-2);
+    } else {
+      return ErrorSpec(1e-3, 1e-3);
+    }
+  }
+};
+
+class ReduceWindowTest : public ::testing::WithParamInterface<bool>,
+                         public ReduceWindowTestBase {
+ public:
+  ReduceWindowTest() : builder_(client_, TestName()) {
+    set_use_bfloat16(GetParam());
+  }
 
   void ReduceWindowAdd(const ComputationDataHandle& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
-    builder_.ReduceWindow(input, builder_.ConstantR0<float>(0.0f),
-                          CreateScalarAddComputation(F32, &builder_),
+    auto init =
+        CreateConstantFromLiteral(*Literal::CreateR0<float>(0.0f), &builder_);
+    builder_.ReduceWindow(input, init,
+                          CreateScalarAddComputation(FloatType(), &builder_),
                           window_dimensions, window_strides, padding);
   }
 
@@ -58,30 +82,32 @@ class ReduceWindowTest : public ClientLibraryTestBase {
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
-    builder_.ReduceWindow(
-        input, builder_.ConstantLiteral(Literal::MinValue(F32)),
-        CreateScalarMax(), window_dimensions, window_strides, padding);
+    auto init = CreateConstantFromLiteral(Literal::MinValue(F32), &builder_);
+    builder_.ReduceWindow(input, init, CreateScalarMax(), window_dimensions,
+                          window_strides, padding);
   }
 
   void ReduceWindowMin(const ComputationDataHandle& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
-    builder_.ReduceWindow(input,
-                          builder_.ConstantLiteral(Literal::MaxValue(F32)),
-                          CreateScalarMinComputation(F32, &builder_),
+    auto init = CreateConstantFromLiteral(Literal::MaxValue(F32), &builder_);
+    builder_.ReduceWindow(input, init,
+                          CreateScalarMinComputation(FloatType(), &builder_),
                           window_dimensions, window_strides, padding);
   }
 
   ComputationBuilder builder_;
 };
 
-TEST_F(ReduceWindowTest, MismatchedRanksGivesErrorStatus) {
-  const auto input = builder_.ConstantR1<float>({1, 1, 1, 1});
-  const auto init_value = builder_.ConstantR0<float>(0);
+TEST_P(ReduceWindowTest, MismatchedRanksGivesErrorStatus) {
+  const auto input = CreateConstantFromLiteral(
+      *Literal::CreateR1<float>({1, 1, 1, 1}), &builder_);
+  const auto init_value =
+      CreateConstantFromLiteral(*Literal::CreateR0<float>(0), &builder_);
   TF_ASSERT_OK(builder_.first_error());
   builder_.ReduceWindow(input, init_value,
-                        CreateScalarAddComputation(F32, &builder_),
+                        CreateScalarAddComputation(FloatType(), &builder_),
                         /*window_dimensions=*/{1, 2},
                         /*window_strides=*/{1}, Padding::kValid);
   ASSERT_EQ(builder_.first_error().code(), tensorflow::error::INVALID_ARGUMENT)
@@ -91,88 +117,106 @@ TEST_F(ReduceWindowTest, MismatchedRanksGivesErrorStatus) {
 }
 
 // Regression test for b/68964348.
-TEST_F(ReduceWindowTest, R0ReduceWindow) {
-  auto input = builder_.ConstantR0<float>(42);
-  auto init = builder_.ConstantR0<float>(1.0);
-  builder_.ReduceWindow(input, init, CreateScalarAddComputation(F32, &builder_),
+TEST_P(ReduceWindowTest, R0ReduceWindow) {
+  const auto input =
+      CreateConstantFromLiteral(*Literal::CreateR0<float>(42.0), &builder_);
+  const auto init =
+      CreateConstantFromLiteral(*Literal::CreateR0<float>(1.0), &builder_);
+  builder_.ReduceWindow(input, init,
+                        CreateScalarAddComputation(FloatType(), &builder_),
                         /*window_dimensions=*/{},
                         /*window_strides=*/{}, Padding::kSame);
-  ComputeAndCompareR0<float>(&builder_, 43, {}, ErrorSpec(0.00001));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateR0<float>(43.0), {},
+                           ErrorSpec(0.00001));
 }
 
-TEST_F(ReduceWindowTest, Min3In5Stride2) {
-  const auto input = builder_.ConstantR1<float>({10000, 1000, 100, 10, 1});
+TEST_P(ReduceWindowTest, Min3In5Stride2) {
+  const auto input = CreateConstantFromLiteral(
+      *Literal::CreateR1<float>({10000, 1000, 100, 10, 1}), &builder_);
   ReduceWindowMin(input, {3}, {2}, Padding::kValid);
-  ComputeAndCompareR1<float>(&builder_, {100, 1}, {}, ErrorSpec(0.0001));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateR1<float>({100, 1}), {},
+                           ErrorSpec(0.00001));
 }
 
-XLA_TEST_F(ReduceWindowTest, ZeroElementSmall) {
-  Array4D<float> input_array(1, 0, 2, 1);
+TEST_P(ReduceWindowTest, Min3In5Stride1WithSamePadding) {
+  const auto input = CreateConstantFromLiteral(
+      *Literal::CreateR1<float>({10000, 1000, 100, 10, 1}), &builder_);
+  ReduceWindowMin(input, /*window_dimensions=*/{3}, /*window_strides=*/{1},
+                  Padding::kSame);
+  ComputeAndCompareLiteral(&builder_,
+                           *Literal::CreateR1<float>({1000, 100, 10, 1, 1}), {},
+                           ErrorSpec(0.00001));
+}
 
-  const auto input = builder_.ConstantR4FromArray4D<float>(input_array);
+XLA_TEST_P(ReduceWindowTest, ZeroElementSmall) {
+  Array4D<float> input_array(1, 0, 2, 1);
+  const auto input = CreateConstantFromArray(input_array, &builder_);
   Padding padding = Padding::kSame;
   ReduceWindowAdd(input, {1, 1, 2, 1}, {1, 1, 1, 1}, padding);
 
   auto res = ReferenceUtil::ReduceWindow4DAdd(input_array, 0.0f, {1, 1, 2, 1},
                                               {1, 1, 1, 1}, padding);
 
-  ComputeAndCompareR4<float>(&builder_, *res, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res), {},
+                           DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, NonSquareSmall) {
+TEST_P(ReduceWindowTest, NonSquareSmall) {
   Array4D<float> input_array(1, 2, 2, 1);
-  input_array.FillRandom(2.f);
+  input_array.FillRandom(2.f, 2.f);
+  const auto input = CreateConstantFromArray(input_array, &builder_);
 
-  const auto input = builder_.ConstantR4FromArray4D<float>(input_array);
   Padding padding = Padding::kSame;
   ReduceWindowAdd(input, {1, 1, 2, 1}, {1, 1, 1, 1}, padding);
 
   auto res = ReferenceUtil::ReduceWindow4DAdd(input_array, 0.0f, {1, 1, 2, 1},
                                               {1, 1, 1, 1}, padding);
 
-  ComputeAndCompareR4<float>(&builder_, *res, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res), {},
+                           DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, MiddleDimsSmall) {
+TEST_P(ReduceWindowTest, MiddleDimsSmall) {
   Array4D<float> input_array(1, 3, 3, 1);
-  input_array.FillRandom(2.f);
-
-  const auto input = builder_.ConstantR4FromArray4D<float>(input_array);
+  input_array.FillRandom(2.f, 2.f);
+  const auto input = CreateConstantFromArray(input_array, &builder_);
   Padding padding = Padding::kSame;
   ReduceWindowAdd(input, {1, 1, 1, 1}, {1, 2, 2, 1}, padding);
 
   auto res = ReferenceUtil::ReduceWindow4DAdd(input_array, 0.0f, {1, 1, 1, 1},
                                               {1, 2, 2, 1}, padding);
 
-  ComputeAndCompareR4<float>(&builder_, *res, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res), {},
+                           DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, Along2ndMinorDim) {
+TEST_P(ReduceWindowTest, Along2ndMinorDim) {
   Array4D<float> input_array(3, 6, 7, 32);
-  input_array.FillRandom(2.f);
+  input_array.FillRandom(2.f, 2.f);
+  const auto input = CreateConstantFromArray(input_array, &builder_);
 
   // The parameters of this reduction mimic feature norm (e.g. LRN).
   int lrn_diameter = 7;  // diameter = 2*radius + 1 --> must be odd
-  const auto input = builder_.ConstantR4FromArray4D<float>(input_array);
   Padding padding = Padding::kSame;
   ReduceWindowAdd(input, {1, 1, lrn_diameter, 1}, {1, 1, 1, 1}, padding);
 
   auto res = ReferenceUtil::ReduceWindow4DAdd(
       input_array, 0.0f, {1, 1, lrn_diameter, 1}, {1, 1, 1, 1}, padding);
 
-  ComputeAndCompareR4<float>(&builder_, *res, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res), {},
+                           DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, AmongMajor2Dims) {
+TEST_P(ReduceWindowTest, AmongMajor2Dims) {
   Array4D<float> input_array(4, 4, 6, 8);
   input_array.FillWithMinorDimNum();
+  const auto input_data_handle =
+      CreateConstantFromArray(input_array, &builder_);
 
   int win_len = 3;
   int win_stride = 1;
 
   Padding padding = Padding::kSame;
-  const auto input_data_handle =
-      builder_.ConstantR4FromArray4D<float>(input_array);
   // Reduce only along the x and y dimensions, according to the win_len.
   ReduceWindowAdd(input_data_handle, {win_len, win_len, 1, 1},
                   {win_stride, win_stride, 1, 1}, padding);
@@ -180,18 +224,20 @@ TEST_F(ReduceWindowTest, AmongMajor2Dims) {
   auto result = ReferenceUtil::ReduceWindow4DAdd(
       input_array, 0.0f, {win_len, win_len, 1, 1},
       {win_stride, win_stride, 1, 1}, padding);
-  ComputeAndCompareR4<float>(&builder_, *result, {}, ErrorSpec(1e-3, 1e-3));
+
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {},
+                           DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, AmongMajor2DimsMediumSize) {
+TEST_P(ReduceWindowTest, AmongMajor2DimsMediumSize) {
   Array4D<float> input_array(9, 12, 4, 89);
-  input_array.FillRandom(2.0f);
+  input_array.FillRandom(2.f, 2.f);
 
   int win_len = 3;
   int win_stride = 2;
 
   const auto input_data_handle =
-      builder_.ConstantR4FromArray4D<float>(input_array);
+      CreateConstantFromArray(input_array, &builder_);
 
   Padding padding = Padding::kSame;
   // Reduce only along the x and y dimensions, according to the win_len.
@@ -202,137 +248,57 @@ TEST_F(ReduceWindowTest, AmongMajor2DimsMediumSize) {
       input_array, 0.0f, {win_len, win_len, 1, 1},
       {win_stride, win_stride, 1, 1}, padding);
 
-  ComputeAndCompareR4<float>(&builder_, *result, {}, ErrorSpec(1e-3, 1e-3));
-}
-
-// TODO(b/32173947): Test support for arbitrary-sized padding.
-TEST_F(ReduceWindowTest, DISABLED_AmongMajor2DimsMediumSizeLargePadding) {
-  Array4D<float> input_array(9, 12, 4, 89);  // simulate Dim0IsMinor layout
-  input_array.FillRandom(2.0f);
-
-  int64 rank = 4;
-  int win_len = 3;
-  int win_stride = 2;
-
-  const auto input_data_handle =
-      builder_.ConstantR4FromArray4D<float>(input_array);
-
-  Padding padding = Padding::kSame;
-  // Reduce only along the x and y dimensions, according to the win_len.
-  // Create padding vector with large padding values in the reduction dims.
-  std::vector<std::pair<int64, int64>> low_high_padding;
-  low_high_padding.resize(rank, {4, 4});
-
-  builder_.ReduceWindowWithGeneralPadding(
-      input_data_handle, builder_.ConstantR0<float>(0.0f),
-      CreateScalarAddComputation(F32, &builder_), {win_len, win_len, 1, 1},
-      {win_stride, win_stride, 1, 1}, low_high_padding);
-
-  auto result = ReferenceUtil::ReduceWindow4DAdd(
-      input_array, 0.0f, {win_len, win_len, 1, 1},
-      {win_stride, win_stride, 1, 1}, padding);
-
-  ComputeAndCompareR4<float>(&builder_, *result, {}, ErrorSpec(1e-3, 1e-3));
-}
-
-XLA_TEST_F(ReduceWindowTest, Add1x1x2In2x1x2) {
-  Array3D<float> input_array(2, 1, 2);
-  input_array(0, 0, 0) = 1000;
-  input_array(0, 0, 1) = 100;
-  input_array(1, 0, 0) = 10;
-  input_array(1, 0, 1) = 1;
-  auto input = builder_.ConstantR3FromArray3D<float>(input_array);
-
-  ReduceWindowAdd(input, {1, 1, 2}, {1, 1, 1}, Padding::kValid);
-
-  Array3D<float> expected(2, 1, 1);
-  expected(0, 0, 0) = 1100;
-  expected(1, 0, 0) = 11;
-  ComputeAndCompareR3<float>(&builder_, expected, {}, ErrorSpec(0.0001));
-}
-
-XLA_TEST_F(ReduceWindowTest, Add1x1x2In2x1x3Stride1x1x2) {
-  Array3D<float> input_array(2, 1, 3);
-  input_array(0, 0, 0) = 100;
-  input_array(0, 0, 1) = 10;
-  input_array(0, 0, 2) = 1;
-  input_array(1, 0, 0) = 500;
-  input_array(1, 0, 1) = 50;
-  input_array(1, 0, 2) = 5;
-  auto input = builder_.ConstantR3FromArray3D<float>(input_array);
-
-  ReduceWindowAdd(input, {1, 1, 2}, {1, 1, 2}, Padding::kValid);
-
-  Array3D<float> expected(2, 1, 1);
-  expected(0, 0, 0) = 110;
-  expected(1, 0, 0) = 550;
-  ComputeAndCompareR3<float>(&builder_, expected, {}, ErrorSpec(0.0001));
-}
-
-XLA_TEST_F(ReduceWindowTest, Add1x1x2In2x1x3SamePad) {
-  Array3D<float> input_array(2, 1, 3);
-  input_array(0, 0, 0) = 100;
-  input_array(0, 0, 1) = 10;
-  input_array(0, 0, 2) = 1;
-  input_array(1, 0, 0) = 500;
-  input_array(1, 0, 1) = 50;
-  input_array(1, 0, 2) = 5;
-  auto input = builder_.ConstantR3FromArray3D<float>(input_array);
-
-  ReduceWindowAdd(input, {1, 1, 2}, {1, 1, 1}, Padding::kSame);
-
-  Array3D<float> expected(2, 1, 3);
-  expected(0, 0, 0) = 110;
-  expected(0, 0, 1) = 11;
-  expected(0, 0, 2) = 1;
-  expected(1, 0, 0) = 550;
-  expected(1, 0, 1) = 55;
-  expected(1, 0, 2) = 5;
-  ComputeAndCompareR3<float>(&builder_, expected, {}, ErrorSpec(0.0001));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {},
+                           DefaultErrorSpec());
 }
 
 // Tests a reduction function that is not a simple add/min/max/etc.
-XLA_TEST_F(ReduceWindowTest, NonstandardReduceFunction) {
+XLA_TEST_P(ReduceWindowTest, NonstandardReduceFunction) {
   Array4D<float> input_array(1, 2, 2, 1);
   input_array(0, 0, 0, 0) = 1;
   input_array(0, 0, 1, 0) = 2;
   input_array(0, 1, 0, 0) = 3;
   input_array(0, 1, 1, 0) = 4;
+  const auto input = CreateConstantFromArray(input_array, &builder_);
 
-  const auto input = builder_.ConstantR4FromArray4D<float>(input_array);
   Padding padding = Padding::kValid;
-
-  const Shape scalar = ShapeUtil::MakeShape(F32, {});
+  const Shape scalar = ShapeUtil::MakeShape(FloatType(), {});
   auto b = builder_.CreateSubBuilder("unusual");
   auto lhs = b->Parameter(0, scalar, "lhs");
   auto rhs = b->Parameter(1, scalar, "rhs");
-  b->Min(b->Add(lhs, rhs), b->ConstantR0<float>(8.0f));
+  b->Min(b->Add(lhs, rhs),
+         CreateConstantFromLiteral(*Literal::CreateR0<float>(8.0f), b.get()));
   Computation reduce_fn = b->BuildAndNoteError();
 
-  builder_.ReduceWindow(input, builder_.ConstantR0<float>(3.0f), reduce_fn,
-                        /*window_dimensions=*/{1, 1, 2, 1},
-                        /*window_strides=*/{1, 1, 1, 1}, padding);
+  builder_.ReduceWindow(
+      input,
+      CreateConstantFromLiteral(*Literal::CreateR0<float>(0.0f), &builder_),
+      reduce_fn,
+      /*window_dimensions=*/{1, 1, 2, 1},
+      /*window_strides=*/{1, 1, 1, 1}, padding);
 
   const auto reduce_func = [](float arg1, float arg2) {
     return std::min<float>(arg1 + arg2, 8.0f);
   };
 
   auto expected =
-      ReferenceUtil::ReduceWindow4DGeneric(input_array, 3.0f, reduce_func,
+      ReferenceUtil::ReduceWindow4DGeneric(input_array, 0.0f, reduce_func,
                                            /*window=*/{1, 1, 2, 1},
                                            /*stride=*/{1, 1, 1, 1}, padding);
 
-  ComputeAndCompareR4<float>(&builder_, *expected, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*expected), {},
+                           DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, R4UnitWindow) {
+TEST_P(ReduceWindowTest, R4UnitWindow) {
   Array4D<float> input_array(13, 12, 8, 15);
-  input_array.Fill(1.0f);
+  input_array.FillRandom(2.f, 2.f);
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({0, 3, 2, 1}));
-  ComputationDataHandle input =
-      builder_.Parameter(0, input_literal->shape(), "operand");
+  ComputationDataHandle input;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "parameter", &builder_, &input);
 
   Padding padding = Padding::kSame;
   ReduceWindowAdd(input, {1, 1, 7, 1}, {1, 4, 1, 1}, padding);
@@ -340,15 +306,11 @@ TEST_F(ReduceWindowTest, R4UnitWindow) {
   auto res = ReferenceUtil::ReduceWindow4DAdd(input_array, 0.0f, {1, 1, 7, 1},
                                               {1, 4, 1, 1}, padding);
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
-                          client_->TransferToServer(*input_literal));
-  ComputeAndCompareR4<float>(&builder_, *res, {input_data.get()},
-                             ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res),
+                           {input_data.get()}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(HloTestBase, R6AddMultipleStrides) {
-  auto b = HloComputation::Builder(TestName());
-
+XLA_TEST_P(ReduceWindowTest, R6AddMultipleStrides) {
   std::vector<int64> input_dims(6, 8);
   auto shape = ShapeUtil::MakeShape(F32, input_dims);
 
@@ -358,56 +320,15 @@ XLA_TEST_F(HloTestBase, R6AddMultipleStrides) {
   };
   TF_EXPECT_OK(arg_literal->Populate<float>(generator));
 
-  auto input =
-      b.AddInstruction(HloInstruction::CreateConstant(std::move(arg_literal)));
-
-  auto init_value = b.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.f)));
-
-  HloComputation::Builder add_computation("add");
-  Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
-  auto param_lhs = add_computation.AddInstruction(
-      HloInstruction::CreateParameter(0, scalar_shape, "lhs"));
-  auto param_rhs = add_computation.AddInstruction(
-      HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
-  add_computation.AddInstruction(HloInstruction::CreateBinary(
-      scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
-
-  auto module = CreateNewModule();
-  auto add_func = module->AddEmbeddedComputation(add_computation.Build());
-
-  WindowDimension trivial_dim;
-  trivial_dim.set_size(1);
-  trivial_dim.set_stride(1);
-  trivial_dim.set_padding_low(0);
-  trivial_dim.set_padding_high(0);
-  trivial_dim.set_window_dilation(1);
-  trivial_dim.set_base_dilation(1);
-
-  WindowDimension active_dim;
-  active_dim.set_size(3);
-  active_dim.set_stride(1);
-  active_dim.set_padding_low(0);
-  active_dim.set_padding_high(0);
-  active_dim.set_window_dilation(1);
-  active_dim.set_base_dilation(1);
-
-  Window window;
-  *window.add_dimensions() = active_dim;
-  *window.add_dimensions() = trivial_dim;
-  *window.add_dimensions() = active_dim;
-  *window.add_dimensions() = active_dim;
-  *window.add_dimensions() = trivial_dim;
-  *window.add_dimensions() = trivial_dim;
-
-  // Non-monotonic output layout with minor dims trivial.
+  const auto input = CreateConstantFromLiteral(*arg_literal, &builder_);
+
+  Padding padding = Padding::kValid;
+  ReduceWindowAdd(input, {3, 1, 3, 3, 1, 1}, {1, 1, 1, 1, 1, 1}, padding);
+
   std::vector<int64> output_layout = {1, 5, 3, 2, 0, 4};
   std::vector<int64> output_dims = {6, 8, 6, 6, 8, 8};
   Shape result_shape =
       ShapeUtil::MakeShapeWithLayout(F32, output_dims, output_layout);
-  b.AddInstruction(HloInstruction::CreateReduceWindow(
-      result_shape, input, init_value, window, add_func));
-
   std::unique_ptr<Literal> expected = Literal::CreateFromShape(result_shape);
   auto out_generator =
       [&](tensorflow::gtl::ArraySlice<int64> indexes) -> float {
@@ -415,82 +336,37 @@ XLA_TEST_F(HloTestBase, R6AddMultipleStrides) {
   };
   TF_EXPECT_OK(expected->Populate<float>(out_generator));
 
-  module->AddEntryComputation(b.Build());
-  auto actual = ExecuteAndTransfer(std::move(module), {});
-
-  LiteralTestUtil::ExpectNear(*actual, *expected, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *expected, {}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(HloTestBase, R6Add) {
-  auto b = HloComputation::Builder(TestName());
-
+XLA_TEST_P(ReduceWindowTest, R6Add) {
   std::vector<int64> input_dims(6, 8);
+  auto shape = ShapeUtil::MakeShape(F32, input_dims);
+
   std::unique_ptr<Literal> arg_literal =
-      Literal::CreateFullWithMonotonicDim0MajorLayout<float>(input_dims, 1.0f);
-  auto input =
-      b.AddInstruction(HloInstruction::CreateConstant(std::move(arg_literal)));
-
-  auto init_value = b.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.f)));
-
-  HloComputation::Builder add_computation("add");
-  Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
-  auto param_lhs = add_computation.AddInstruction(
-      HloInstruction::CreateParameter(0, scalar_shape, "lhs"));
-  auto param_rhs = add_computation.AddInstruction(
-      HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
-  add_computation.AddInstruction(HloInstruction::CreateBinary(
-      scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
-
-  auto module = CreateNewModule();
-  auto add_func = module->AddEmbeddedComputation(add_computation.Build());
-
-  WindowDimension trivial_dim;
-  trivial_dim.set_size(1);
-  trivial_dim.set_stride(1);
-  trivial_dim.set_padding_low(0);
-  trivial_dim.set_padding_high(0);
-  trivial_dim.set_window_dilation(1);
-  trivial_dim.set_base_dilation(1);
-
-  WindowDimension active_dim;
-  active_dim.set_size(3);
-  active_dim.set_stride(1);
-  active_dim.set_padding_low(0);
-  active_dim.set_padding_high(0);
-  active_dim.set_window_dilation(1);
-  active_dim.set_base_dilation(1);
-
-  Window window;
-  *window.add_dimensions() = trivial_dim;
-  *window.add_dimensions() = trivial_dim;
-  *window.add_dimensions() = active_dim;
-  *window.add_dimensions() = active_dim;
-  *window.add_dimensions() = trivial_dim;
-  *window.add_dimensions() = trivial_dim;
-
-  Shape shape = ShapeUtil::MakeShape(F32, {8, 8, 6, 6, 8, 8});
-  b.AddInstruction(HloInstruction::CreateReduceWindow(shape, input, init_value,
-                                                      window, add_func));
+      Literal::CreateFullWithDescendingLayout<float>(input_dims, 1.0f);
+
+  const auto input = CreateConstantFromLiteral(*arg_literal, &builder_);
+
+  Padding padding = Padding::kValid;
+  ReduceWindowAdd(input, {1, 1, 3, 3, 1, 1}, {1, 1, 1, 1, 1, 1}, padding);
 
   std::vector<int64> output_dims = {8, 8, 6, 6, 8, 8};
   std::unique_ptr<Literal> expected =
-      Literal::CreateFullWithMonotonicDim0MajorLayout<float>(output_dims, 9.0f);
-
-  module->AddEntryComputation(b.Build());
-  auto actual = ExecuteAndTransfer(std::move(module), {});
+      Literal::CreateFullWithDescendingLayout<float>(output_dims, 9.0f);
 
-  LiteralTestUtil::ExpectNear(*actual, *expected, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *expected, {}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(ReduceWindowTest, R4SecondMinorStride) {
+XLA_TEST_P(ReduceWindowTest, R4SecondMinorStride) {
   Array4D<float> input_array(2, 1, 27, 119);
   input_array.FillRandom(2.0f);
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle input =
-      builder_.Parameter(0, input_literal->shape(), "operand");
+  ComputationDataHandle input;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "parameter", &builder_, &input);
 
   int win_len = 1;
   int stride = 8;
@@ -500,20 +376,19 @@ XLA_TEST_F(ReduceWindowTest, R4SecondMinorStride) {
   auto res = ReferenceUtil::ReduceWindow4DAdd(
       input_array, 0.0f, {1, 1, win_len, 1}, {1, 1, stride, 1}, padding);
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
-                          client_->TransferToServer(*input_literal));
-  ComputeAndCompareR4<float>(&builder_, *res, {input_data.get()},
-                             ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res),
+                           {input_data.get()}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(ReduceWindowTest, R4SecondMinorUnitStride) {
+XLA_TEST_P(ReduceWindowTest, R4SecondMinorUnitStride) {
   Array4D<float> input_array(3, 2, 4, 64);
   input_array.FillRandom(2.0f);
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle input =
-      builder_.Parameter(0, input_literal->shape(), "operand");
+  ComputationDataHandle input;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "parameter", &builder_, &input);
 
   int win_len = 3;
   int stride = 1;
@@ -523,20 +398,19 @@ XLA_TEST_F(ReduceWindowTest, R4SecondMinorUnitStride) {
   auto res = ReferenceUtil::ReduceWindow4DAdd(
       input_array, 0.0f, {1, 1, win_len, 1}, {1, 1, stride, 1}, padding);
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
-                          client_->TransferToServer(*input_literal));
-  ComputeAndCompareR4<float>(&builder_, *res, {input_data.get()},
-                             ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res),
+                           {input_data.get()}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(ReduceWindowTest, R4SecondMinorWin) {
+XLA_TEST_P(ReduceWindowTest, R4SecondMinorWin) {
   Array4D<float> input_array(1, 3, 12, 200);
   input_array.FillRandom(2.0f);
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle input =
-      builder_.Parameter(0, input_literal->shape(), "operand");
+  ComputationDataHandle input;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "parameter", &builder_, &input);
 
   int win_len = 8;
   int stride = 5;
@@ -546,13 +420,11 @@ XLA_TEST_F(ReduceWindowTest, R4SecondMinorWin) {
   auto res = ReferenceUtil::ReduceWindow4DAdd(
       input_array, 0.0f, {1, 1, win_len, 1}, {1, 1, stride, 1}, padding);
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
-                          client_->TransferToServer(*input_literal));
-  ComputeAndCompareR4<float>(&builder_, *res, {input_data.get()},
-                             ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res),
+                           {input_data.get()}, DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, AmongMajor2DimsMultipleMinor) {
+TEST_P(ReduceWindowTest, AmongMajor2DimsMultipleMinor) {
   Array4D<float> input_array(6, 4, 10, 130);
   input_array.FillRandom(2.0f);
 
@@ -561,7 +433,7 @@ TEST_F(ReduceWindowTest, AmongMajor2DimsMultipleMinor) {
 
   Padding padding = Padding::kSame;
   const auto input_data_handle =
-      builder_.ConstantR4FromArray4D<float>(input_array);
+      CreateConstantFromArray(input_array, &builder_);
   // Reduce only along the x and y dimensions, according to the win_len.
   ReduceWindowAdd(input_data_handle, {win_len, win_len, 1, 1},
                   {win_stride, win_stride, 1, 1}, padding);
@@ -569,36 +441,59 @@ TEST_F(ReduceWindowTest, AmongMajor2DimsMultipleMinor) {
   auto result = ReferenceUtil::ReduceWindow4DAdd(
       input_array, 0.0f, {win_len, win_len, 1, 1},
       {win_stride, win_stride, 1, 1}, padding);
-  ComputeAndCompareR4<float>(&builder_, *result, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {},
+                           DefaultErrorSpec());
 }
 
-XLA_TEST_F(ReduceWindowTest, Add24In1152_NoOverlap) {
+XLA_TEST_P(ReduceWindowTest, Add24In1152_NoOverlap) {
   std::vector<float> input_vector(128 * 9, 1);
-  const auto input = builder_.ConstantR1<float>(input_vector);
+  const auto input = CreateConstantFromLiteral(
+      *Literal::CreateR1<float>(input_vector), &builder_);
   ReduceWindowAdd(input, {32}, {128}, Padding::kValid);
-  ComputeAndCompareR1<float>(&builder_, {32, 32, 32, 32, 32, 32, 32, 32, 32},
-                             {}, ErrorSpec(0.0001));
+  ComputeAndCompareLiteral(
+      &builder_,
+      *Literal::CreateR1<float>({32, 32, 32, 32, 32, 32, 32, 32, 32}), {},
+      DefaultErrorSpec());
 }
 
-XLA_TEST_F(ReduceWindowTest, Add128In128Stride128) {
-  const auto input = builder_.ConstantR1<float>(
-      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+XLA_TEST_P(ReduceWindowTest, Add128In128Stride128) {
+  std::vector<float> input_vector{
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+  const auto input = CreateConstantFromLiteral(
+      *Literal::CreateR1<float>(input_vector), &builder_);
   ReduceWindowAdd(input, {128}, {128}, Padding::kValid);
-  ComputeAndCompareR1<float>(&builder_, {1088}, {}, ErrorSpec(0.0001));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateR1<float>({1088}), {},
+                           DefaultErrorSpec());
+}
+
+XLA_TEST_P(ReduceWindowTest, Add128In128) {
+  std::vector<float> input_vector{
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+  const auto input = CreateConstantFromLiteral(
+      *Literal::CreateR1<float>(input_vector), &builder_);
+  ReduceWindowAdd(input, {128}, {1}, Padding::kValid);
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateR1<float>({1088}), {},
+                           DefaultErrorSpec());
 }
 
 // Regression test for a bug that appeared in Inception (b/34784899).
-TEST_F(ReduceWindowTest, R2ReduceWindowInceptionFromBroadcast) {
+TEST_P(ReduceWindowTest, R2ReduceWindowInceptionFromBroadcast) {
   Array2D<float> input_array(14, 14, 1.0f);
-  ComputationDataHandle input =
-      builder_.Broadcast(builder_.ConstantLiteral(Literal::One(F32)), {14, 14});
+  const auto input = CreateConstantFromArray(input_array, &builder_);
 
   int win_len = 3;
   int stride = 1;
@@ -608,13 +503,14 @@ TEST_F(ReduceWindowTest, R2ReduceWindowInceptionFromBroadcast) {
   auto res = ReferenceUtil::ReduceWindow2DAdd(
       input_array, 0.0f, {win_len, win_len}, {stride, stride}, padding);
 
-  ComputeAndCompareR2<float>(&builder_, *res, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray<float>(*res),
+                           {}, DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, R2ReduceWindowNonOverlappingFromBroadcast) {
+TEST_P(ReduceWindowTest, R2ReduceWindowNonOverlappingFromBroadcast) {
   Array2D<float> input_array(6, 4, 1.0f);
-  ComputationDataHandle input =
-      builder_.Broadcast(builder_.ConstantLiteral(Literal::One(F32)), {6, 4});
+  ComputationDataHandle input = builder_.Broadcast(
+      CreateConstantFromLiteral(Literal::One(F32), &builder_), {6, 4});
 
   Padding padding = Padding::kSame;
   ReduceWindowAdd(input, {4, 2}, {3, 3}, padding);
@@ -622,9 +518,13 @@ TEST_F(ReduceWindowTest, R2ReduceWindowNonOverlappingFromBroadcast) {
   auto res = ReferenceUtil::ReduceWindow2DAdd(input_array, 0.0f, {4, 2}, {3, 3},
                                               padding);
 
-  ComputeAndCompareR2<float>(&builder_, *res, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray<float>(*res),
+                           {}, DefaultErrorSpec());
 }
 
+INSTANTIATE_TEST_CASE_P(ReduceWindowTestInstance, ReduceWindowTest,
+                        ::testing::ValuesIn(use_bfloat16_params));
+
 enum Reducer { kAdd, kMax };
 
 struct R4ReduceWindowTestData {
@@ -633,35 +533,43 @@ struct R4ReduceWindowTestData {
   int64 strides[4];
   int64 pad_low[4];
   int64 pad_high[4];
+  int64 layout[4];
 
   Reducer reducer;
 };
 
 string R4ReduceWindowTestDataToString(
-    const ::testing::TestParamInfo<R4ReduceWindowTestData>& data) {
+    const ::testing::TestParamInfo<
+        ::testing::tuple<R4ReduceWindowTestData, bool>>& data) {
+  const auto& param = ::testing::get<0>(data.param);
   string str = tensorflow::strings::StrCat(
-      "base_bounds_",
-      tensorflow::str_util::Join(data.param.base_bounds, "x"),  //
+      "base_bounds_", tensorflow::str_util::Join(param.base_bounds, "x"),  //
       "__window_bounds_",
-      tensorflow::str_util::Join(data.param.window_bounds, "x"),            //
-      "__strides_", tensorflow::str_util::Join(data.param.strides, "x"),    //
-      "__pad_low_", tensorflow::str_util::Join(data.param.pad_low, "x"),    //
-      "__pad_high_", tensorflow::str_util::Join(data.param.pad_high, "x"),  //
-      (data.param.reducer == kAdd) ? "add" : "max");
-  CHECK(data.param.reducer == kAdd || data.param.reducer == kMax);
+      tensorflow::str_util::Join(param.window_bounds, "x"),            //
+      "__strides_", tensorflow::str_util::Join(param.strides, "x"),    //
+      "__pad_low_", tensorflow::str_util::Join(param.pad_low, "x"),    //
+      "__pad_high_", tensorflow::str_util::Join(param.pad_high, "x"),  //
+      "__layout_", tensorflow::str_util::Join(param.layout, "_"),      //
+      (param.reducer == kAdd) ? "_add" : "_max");
+  CHECK(param.reducer == kAdd || param.reducer == kMax);
 
   // Test names are not allowed to contain the '-' character.
   std::replace(str.begin(), str.end(), '-', 'n');
+  if (::testing::get<1>(data.param)) {
+    str = tensorflow::strings::StrCat(str, "_bfloat16");
+  }
   return str;
 }
 
-class R4ReduceWindowTest
-    : public ClientLibraryTestBase,
-      public ::testing::WithParamInterface<R4ReduceWindowTestData> {
+class R4ReduceWindowTest : public ReduceWindowTestBase,
+                           public ::testing::WithParamInterface<
+                               ::testing::tuple<R4ReduceWindowTestData, bool>> {
  protected:
+  R4ReduceWindowTest() { set_use_bfloat16(::testing::get<1>(GetParam())); }
+
   void DoIt() {
     ComputationBuilder b(client_, TestName());
-    const auto& param = GetParam();
+    const auto& param = ::testing::get<0>(GetParam());
 
     const float kInitValue = 0.0f;
 
@@ -669,24 +577,26 @@ class R4ReduceWindowTest
                          param.base_bounds[2], param.base_bounds[3]);
     input.FillIota(1);
     std::unique_ptr<Literal> input_literal =
-        Literal::CreateR4FromArray4D(input);
-    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_arg,
-                            client_->TransferToServer(*input_literal));
+        Literal::CreateR4FromArray4DWithLayout(
+            input, LayoutUtil::MakeLayout(param.layout));
+    ComputationDataHandle parameter;
+    auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
+                                                       &b, &parameter);
 
     std::vector<std::pair<int64, int64>> padding(4);
     for (int i = 0; i < 4; ++i) {
       padding[i] = {param.pad_low[i], param.pad_high[i]};
     }
 
-    auto parameter = b.Parameter(0, input_literal->shape(), "p0");
-    auto pad_value = b.ConstantR0<float>(kInitValue);
+    auto init_value =
+        CreateConstantFromLiteral(*Literal::CreateR0(kInitValue), &b);
     CHECK(param.reducer == kAdd || param.reducer == kMax);
     auto computation = param.reducer == kAdd
-                           ? CreateScalarAddComputation(F32, &b)
-                           : CreateScalarMaxComputation(F32, &b);
+                           ? CreateScalarAddComputation(FloatType(), &b)
+                           : CreateScalarMaxComputation(FloatType(), &b);
     b.ReduceWindowWithGeneralPadding(
         /*operand=*/parameter,
-        /*init_value=*/pad_value,
+        /*init_value=*/init_value,
         /*computation=*/computation,
         /*window_dimensions=*/param.window_bounds,
         /*window_strides=*/param.strides,
@@ -704,8 +614,13 @@ class R4ReduceWindowTest
             /*window=*/param.window_bounds,
             /*stride=*/param.strides,
             /*padding=*/padding);
-    ComputeAndCompareR4<float>(&b, *expected, {input_arg.get()},
-                               ErrorSpec(1e-3, 1e-3));
+    std::unique_ptr<Literal> expected_literal =
+        Literal::CreateFromArray(*expected);
+    const Shape& expected_shape_with_layout = ShapeUtil::MakeShapeWithLayout(
+        input_literal->shape().element_type(),
+        AsInt64Slice(expected_literal->shape().dimensions()), param.layout);
+    ComputeAndCompareLiteral(&b, *expected_literal, {input_arg.get()},
+                             DefaultErrorSpec(), &expected_shape_with_layout);
   }
 };
 
@@ -719,6 +634,16 @@ const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
                            /*strides=*/{1, 1, 1, 1},
                            /*pad_low=*/{0, 0, 0, 0},
                            /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
+                           /*reducer=*/kAdd},
+
+    // Arbitrary padding (not kSame or kValid).
+    R4ReduceWindowTestData{/*base_bounds=*/{9, 12, 4, 89},
+                           /*window_bounds=*/{3, 3, 1, 1},
+                           /*strides=*/{2, 2, 1, 1},
+                           /*pad_low=*/{4, 4, 0, 0},
+                           /*pad_high=*/{4, 4, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kAdd},
 
     // Zero base bound edge case.
@@ -727,6 +652,7 @@ const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
                            /*strides=*/{1, 1, 1, 1},
                            /*pad_low=*/{0, 0, 0, 0},
                            /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kAdd},
 
     // With non-1x1 window.
@@ -735,6 +661,7 @@ const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
                            /*strides=*/{1, 1, 1, 1},
                            /*pad_low=*/{0, 0, 0, 0},
                            /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kAdd},
 
     // With max instead of add.
@@ -743,6 +670,7 @@ const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
                            /*strides=*/{1, 1, 1, 1},
                            /*pad_low=*/{0, 0, 0, 0},
                            /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kMax},
 
     // With stride.
@@ -751,6 +679,7 @@ const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
                            /*strides=*/{2, 4, 1, 1},
                            /*pad_low=*/{0, 0, 0, 0},
                            /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kAdd},
 
     // With low padding.
@@ -759,6 +688,7 @@ const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
                            /*strides=*/{2, 2, 1, 1},
                            /*pad_low=*/{3, 2, 0, 0},
                            /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kAdd},
 
     // With high padding.
@@ -767,6 +697,7 @@ const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
                            /*strides=*/{2, 2, 1, 1},
                            /*pad_low=*/{0, 0, 0, 0},
                            /*pad_high=*/{2, 3, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kAdd},
 
     // Window touches both sides of the padding simultaneously.
@@ -775,6 +706,7 @@ const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
                            /*strides=*/{1, 1, 1, 1},
                            /*pad_low=*/{1, 1, 0, 0},
                            /*pad_high=*/{1, 1, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kAdd},
 
     // Window is entirely in the padding for some positions.
@@ -783,6 +715,7 @@ const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
                            /*strides=*/{1, 1, 1, 1},
                            /*pad_low=*/{4, 4, 0, 0},
                            /*pad_high=*/{4, 4, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kAdd},
 
     // Zero base bound with padding edge case.
@@ -791,6 +724,7 @@ const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
                            /*strides=*/{1, 1, 1, 1},
                            /*pad_low=*/{0, 1, 0, 0},
                            /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kAdd},
 
     // With stride, low padding and high padding.
@@ -799,6 +733,7 @@ const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
                            /*strides=*/{3, 1, 1, 1},
                            /*pad_low=*/{10, 1, 0, 0},
                            /*pad_high=*/{2, 3, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kAdd},
 
     // With second minor dimension == 9.
@@ -807,6 +742,7 @@ const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
                            /*strides=*/{1, 1, 1, 1},
                            /*pad_low=*/{0, 0, 0, 0},
                            /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kAdd},
 
     // With minor dimension == 129.
@@ -815,6 +751,7 @@ const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
                            /*strides=*/{1, 1, 1, 1},
                            /*pad_low=*/{0, 0, 0, 0},
                            /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kAdd},
 
     // With minor dims reduction and non-overlapped stride.
@@ -823,6 +760,7 @@ const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
                            /*strides=*/{1, 1, 2, 2},
                            /*pad_low=*/{0, 0, 0, 0},
                            /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kAdd},
 
     // With minor dims reduction and overlapped stride.
@@ -830,25 +768,29 @@ const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
                            /*window_bounds=*/{1, 1, 4, 4},
                            /*strides=*/{1, 1, 2, 2},
                            /*pad_low=*/{0, 0, 0, 0},
-                           /*pad_high=*/{0, 0, 0, 0},
+                           /*pad_high=*/{1, 0, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kAdd},
 };
 
-INSTANTIATE_TEST_CASE_P(R4ReduceWindowTestInstantiation, R4ReduceWindowTest,
-                        ::testing::ValuesIn(kR4ReduceWindowTestValues),
-                        R4ReduceWindowTestDataToString);
+INSTANTIATE_TEST_CASE_P(
+    R4ReduceWindowTestInstantiation, R4ReduceWindowTest,
+    ::testing::Combine(::testing::ValuesIn(kR4ReduceWindowTestValues),
+                       ::testing::ValuesIn(use_bfloat16_params)),
+    R4ReduceWindowTestDataToString);
 
 class R4ReduceWindowLargeTest : public R4ReduceWindowTest {};
 
-XLA_TEST_P(R4ReduceWindowLargeTest, DoIt) { DoIt(); }
+XLA_TEST_P(R4ReduceWindowLargeTest, DISABLED_ON_INTERPRETER(DoIt)) { DoIt(); }
 
 // Test cases that are large/slow/failed.
 const R4ReduceWindowTestData kR4ReduceWindowLargeTestValues[] = {
     R4ReduceWindowTestData{/*base_bounds=*/{28, 28, 256, 128},
-                           /*window_bounds=*/{3, 3, 1, 1},
-                           /*strides=*/{1, 1, 1, 1},
+                           /*window_bounds=*/{3, 3, 1, 5},
+                           /*strides=*/{1, 1, 1, 5},
                            /*pad_low=*/{1, 1, 0, 0},
                            /*pad_high=*/{1, 1, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kMax},
 
     R4ReduceWindowTestData{/*base_bounds=*/{112, 112, 64, 128},
@@ -856,13 +798,163 @@ const R4ReduceWindowTestData kR4ReduceWindowLargeTestValues[] = {
                            /*strides=*/{2, 2, 1, 1},
                            /*pad_low=*/{0, 0, 0, 0},
                            /*pad_high=*/{1, 1, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
                            /*reducer=*/kAdd},
+
+    R4ReduceWindowTestData{/*base_bounds=*/{1, 1, 32768 - 3, 2},
+                           /*window_bounds=*/{1, 1, 4, 1},
+                           /*strides=*/{1, 1, 4, 1},
+                           /*pad_low=*/{0, 0, 1, 0},
+                           /*pad_high=*/{0, 0, 2, 0},
+                           /*layout=*/{3, 2, 1, 0},
+                           /*reducer=*/kMax},
 };
 
-INSTANTIATE_TEST_CASE_P(R4ReduceWindowLargeTestInstantiation,
-                        R4ReduceWindowLargeTest,
-                        ::testing::ValuesIn(kR4ReduceWindowLargeTestValues),
-                        R4ReduceWindowTestDataToString);
+INSTANTIATE_TEST_CASE_P(
+    R4ReduceWindowLargeTestInstantiation, R4ReduceWindowLargeTest,
+    ::testing::Combine(::testing::ValuesIn(kR4ReduceWindowLargeTestValues),
+                       ::testing::ValuesIn(use_bfloat16_params)),
+    R4ReduceWindowTestDataToString);
+
+class R4ReduceWindowAnyDimsTest : public R4ReduceWindowTest {};
+
+// TODO(b/72234705): Fix the test cases failed on CPU and GPU.
+XLA_TEST_P(R4ReduceWindowAnyDimsTest,
+           DISABLED_ON_CPU_PARALLEL(DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt)))) {
+  DoIt();
+}
+
+const R4ReduceWindowTestData kR4ReduceWindowAnyDimsTestValues[] = {
+    R4ReduceWindowTestData{/*base_bounds=*/{4, 6, 17, 140},
+                           /*window_bounds=*/{2, 3, 4, 5},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{0, 0, 0, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
+                           /*reducer=*/kAdd},
+    R4ReduceWindowTestData{/*base_bounds=*/{4, 6, 17, 140},
+                           /*window_bounds=*/{2, 3, 1, 1},
+                           /*strides=*/{1, 1, 1, 1},
+                           /*pad_low=*/{0, 0, 0, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{3, 2, 1, 0},
+                           /*reducer=*/kMax},
+    // With 0321 layout.
+    R4ReduceWindowTestData{/*base_bounds=*/{4, 6, 17, 140},
+                           /*window_bounds=*/{2, 3, 4, 5},
+                           /*strides=*/{1, 2, 3, 4},
+                           /*pad_low=*/{0, 0, 0, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{0, 3, 2, 1},
+                           /*reducer=*/kAdd},
+
+    // With 0123 layout.
+    R4ReduceWindowTestData{/*base_bounds=*/{4, 6, 17, 23},
+                           /*window_bounds=*/{2, 3, 7, 9},
+                           /*strides=*/{1, 2, 5, 8},
+                           /*pad_low=*/{0, 0, 0, 0},
+                           /*pad_high=*/{0, 0, 0, 0},
+                           /*layout=*/{0, 1, 2, 3},
+                           /*reducer=*/kAdd},
+};
+
+INSTANTIATE_TEST_CASE_P(
+    R4ReduceWindowAnyDimsTestInstantiation, R4ReduceWindowAnyDimsTest,
+    ::testing::Combine(::testing::ValuesIn(kR4ReduceWindowAnyDimsTestValues),
+                       ::testing::ValuesIn(use_bfloat16_params)),
+    R4ReduceWindowTestDataToString);
+
+struct R3ReduceWindowTestData {
+  int64 base_bounds[3];
+  int64 window_bounds[3];
+  int64 strides[3];
+  int64 layout[3];
+  Padding padding;
+  Reducer reducer;
+} kR3TestCases[] = {
+    {/*base_bounds=*/{2, 1, 2}, /*window_bounds=*/{1, 1, 2},
+     /*strides=*/{1, 1, 1}, /*layout=*/{2, 1, 0},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+    {/*base_bounds=*/{4, 3, 3}, /*window_bounds=*/{2, 2, 2},
+     /*strides=*/{2, 2, 2}, /*layout=*/{2, 1, 0},
+     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+    {/*base_bounds=*/{4, 3, 3}, /*window_bounds=*/{2, 2, 2},
+     /*strides=*/{2, 2, 2}, /*layout=*/{2, 1, 0},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+    {/*base_bounds=*/{6, 21, 3}, /*window_bounds=*/{2, 3, 2},
+     /*strides=*/{1, 2, 2}, /*layout=*/{2, 1, 0},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+    {/*base_bounds=*/{10, 21, 129}, /*window_bounds=*/{2, 9, 1},
+     /*strides=*/{5, 2, 1}, /*layout=*/{2, 1, 0},
+     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+    {/*base_bounds=*/{6, 21, 3}, /*window_bounds=*/{2, 3, 2},
+     /*strides=*/{1, 2, 2}, /*layout=*/{0, 1, 2},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+    {/*base_bounds=*/{6, 21, 3}, /*window_bounds=*/{2, 3, 2},
+     /*strides=*/{1, 2, 2}, /*layout=*/{1, 0, 2},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+};
+
+string R3ReduceWindowTestDataToString(
+    const ::testing::TestParamInfo<
+        ::testing::tuple<R3ReduceWindowTestData, bool>>& data) {
+  const auto& param = ::testing::get<0>(data.param);
+  string str = tensorflow::strings::StrCat(
+      "base_bounds_", tensorflow::str_util::Join(param.base_bounds, "x"),
+      "__window_bounds_", tensorflow::str_util::Join(param.window_bounds, "x"),
+      "__strides_", tensorflow::str_util::Join(param.strides, "x"),
+      "__padding_", param.padding == Padding::kSame ? "same" : "valid",
+      "__layout_", param.layout[0], "_", param.layout[1], "_", param.layout[2],
+      "__reducer_", param.reducer == kAdd ? "add" : "max");
+  if (::testing::get<1>(data.param)) {
+    str = tensorflow::strings::StrCat(str, "_bfloat16");
+  }
+  return str;
+}
+
+class R3ReduceWindowTest : public ReduceWindowTestBase,
+                           public ::testing::WithParamInterface<
+                               ::testing::tuple<R3ReduceWindowTestData, bool>> {
+ protected:
+  R3ReduceWindowTest() { set_use_bfloat16(::testing::get<1>(GetParam())); }
+};
+
+TEST_P(R3ReduceWindowTest, Add) {
+  ComputationBuilder b(client_, TestName());
+  const auto& param = ::testing::get<0>(GetParam());
+  CHECK(param.reducer == kAdd);
+
+  const float kInitValue = 0.0f;
+  Array3D<float> input(param.base_bounds[0], param.base_bounds[1],
+                       param.base_bounds[2], 1.0f);
+  std::unique_ptr<Literal> input_literal =
+      Literal::CreateR3FromArray3DWithLayout(
+          input, LayoutUtil::MakeLayout(param.layout));
+
+  ComputationDataHandle parameter;
+  auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
+                                                     &b, &parameter);
+  auto init_value =
+      CreateConstantFromLiteral(*Literal::CreateR0(kInitValue), &b);
+  b.ReduceWindow(/*operand=*/parameter,
+                 /*init_value=*/init_value,
+                 /*computation=*/CreateScalarAddComputation(FloatType(), &b),
+                 /*window_dimensions=*/param.window_bounds,
+                 /*window_strides=*/param.strides, /*padding=*/param.padding);
+
+  auto expected = ReferenceUtil::ReduceWindow3DAdd(
+      /*operand=*/input, /*init=*/kInitValue, /*window=*/param.window_bounds,
+      /*stride=*/param.strides, /*padding=*/param.padding);
+
+  ComputeAndCompareLiteral(&b, *Literal::CreateFromArray(*expected),
+                           {input_arg.get()}, DefaultErrorSpec());
+}
+
+INSTANTIATE_TEST_CASE_P(
+    R3ReduceWindowTestInstantiation, R3ReduceWindowTest,
+    ::testing::Combine(::testing::ValuesIn(kR3TestCases),
+                       ::testing::ValuesIn(use_bfloat16_params)),
+    R3ReduceWindowTestDataToString);
 
 struct R2ReduceWindowTestData {
   int64 base_bounds[2];
@@ -910,130 +1002,217 @@ struct R2ReduceWindowTestData {
 };
 
 string R2ReduceWindowTestDataToString(
-    const ::testing::TestParamInfo<R2ReduceWindowTestData>& data) {
+    const ::testing::TestParamInfo<
+        ::testing::tuple<R2ReduceWindowTestData, bool>>& data) {
+  const auto& param = ::testing::get<0>(data.param);
   string str = tensorflow::strings::StrCat(
-      "base_bounds_",
-      tensorflow::str_util::Join(data.param.base_bounds, "x"),  //
+      "base_bounds_", tensorflow::str_util::Join(param.base_bounds, "x"),  //
       "__window_bounds_",
-      tensorflow::str_util::Join(data.param.window_bounds, "x"),              //
-      "__strides_", tensorflow::str_util::Join(data.param.strides, "x"),      //
-      "__padding_", data.param.padding == Padding::kSame ? "same" : "valid",  //
-      "__layout_", data.param.layout[0], "_", data.param.layout[1],           //
-      "__reducer_", data.param.reducer == kAdd ? "add" : "max");
+      tensorflow::str_util::Join(param.window_bounds, "x"),              //
+      "__strides_", tensorflow::str_util::Join(param.strides, "x"),      //
+      "__padding_", param.padding == Padding::kSame ? "same" : "valid",  //
+      "__layout_", param.layout[0], "_", param.layout[1],                //
+      "__reducer_", param.reducer == kAdd ? "add" : "max");
+  if (::testing::get<1>(data.param)) {
+    str = tensorflow::strings::StrCat(str, "_bfloat16");
+  }
   return str;
 }
 
-class R2ReduceWindowTest
-    : public ClientLibraryTestBase,
-      public ::testing::WithParamInterface<R2ReduceWindowTestData> {};
+class R2ReduceWindowTest : public ReduceWindowTestBase,
+                           public ::testing::WithParamInterface<
+                               ::testing::tuple<R2ReduceWindowTestData, bool>> {
+ protected:
+  R2ReduceWindowTest() { set_use_bfloat16(::testing::get<1>(GetParam())); }
 
-TEST_P(R2ReduceWindowTest, Add) {
-  ComputationBuilder b(client_, TestName());
-  const auto& param = GetParam();
-  CHECK(param.reducer == kAdd);
+  void DoIt() {
+    ComputationBuilder b(client_, TestName());
+    const auto& param = ::testing::get<0>(GetParam());
+    CHECK(param.reducer == kAdd);
 
-  const float kInitValue = 0.0f;
-  Array2D<float> input(param.base_bounds[0], param.base_bounds[1], 1.0f);
-  std::unique_ptr<Literal> input_literal =
-      Literal::CreateR2FromArray2DWithLayout(
-          input, LayoutUtil::MakeLayout(param.layout));
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_arg,
-                          client_->TransferToServer(*input_literal));
-  b.ReduceWindow(/*operand=*/
-                 b.Parameter(0, input_literal->shape(), "p0"),
-                 /*init_value=*/b.ConstantR0<float>(kInitValue),
-                 /*computation=*/CreateScalarAddComputation(F32, &b),
-                 /*window_dimensions=*/param.window_bounds,
-                 /*window_strides=*/param.strides, /*padding=*/param.padding);
+    const float kInitValue = 0.0f;
+    Array2D<float> input(param.base_bounds[0], param.base_bounds[1], 1.0f);
+    std::unique_ptr<Literal> input_literal =
+        Literal::CreateR2FromArray2DWithLayout(
+            input, LayoutUtil::MakeLayout(param.layout));
+
+    ComputationDataHandle parameter;
+    auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
+                                                       &b, &parameter);
+    auto init_value =
+        CreateConstantFromLiteral(*Literal::CreateR0(kInitValue), &b);
+    b.ReduceWindow(/*operand=*/parameter,
+                   /*init_value=*/init_value,
+                   /*computation=*/CreateScalarAddComputation(FloatType(), &b),
+                   /*window_dimensions=*/param.window_bounds,
+                   /*window_strides=*/param.strides, /*padding=*/param.padding);
+
+    auto expected = ReferenceUtil::ReduceWindow2DAdd(
+        /*operand=*/input, /*init=*/kInitValue, /*window=*/param.window_bounds,
+        /*stride=*/param.strides, /*padding=*/param.padding);
+
+    ComputeAndCompareLiteral(&b, *Literal::CreateFromArray(*expected),
+                             {input_arg.get()}, DefaultErrorSpec());
+  }
+};
 
-  auto expected = ReferenceUtil::ReduceWindow2DAdd(
-      /*operand=*/input, /*init=*/kInitValue, /*window=*/param.window_bounds,
-      /*stride=*/param.strides, /*padding=*/param.padding);
+TEST_P(R2ReduceWindowTest, DoIt) { DoIt(); }
+
+INSTANTIATE_TEST_CASE_P(
+    R2ReduceWindowTestInstantiation, R2ReduceWindowTest,
+    ::testing::Combine(::testing::ValuesIn(kR2TestCases),
+                       ::testing::ValuesIn(use_bfloat16_params)),
+    R2ReduceWindowTestDataToString);
+
+class R2ReduceWindowFailingCpuGpuBf16Test : public R2ReduceWindowTest {};
 
-  ComputeAndCompareR2<float>(&b, *expected, {input_arg.get()},
-                             ErrorSpec(1e-3, 1e-3));
+// TODO(b/72234705): Fix the test cases failed on CPU and GPU.
+XLA_TEST_P(R2ReduceWindowFailingCpuGpuBf16Test,
+           DISABLED_ON_CPU_PARALLEL(DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt)))) {
+  DoIt();
 }
 
-INSTANTIATE_TEST_CASE_P(R2ReduceWindowTestInstantiation, R2ReduceWindowTest,
-                        ::testing::ValuesIn(kR2TestCases),
-                        R2ReduceWindowTestDataToString);
+const R2ReduceWindowTestData kR2FailingValuesCpuGpuBf16Test[] = {
+    {/*base_bounds=*/{8, 128}, /*window_bounds=*/{8, 128},
+     /*strides=*/{1, 1}, /*layout=*/{1, 0},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+};
+
+INSTANTIATE_TEST_CASE_P(
+    R2ReduceWindowFailingInstantiation, R2ReduceWindowFailingCpuGpuBf16Test,
+    ::testing::Combine(::testing::ValuesIn(kR2FailingValuesCpuGpuBf16Test),
+                       ::testing::ValuesIn(use_bfloat16_params)),
+    R2ReduceWindowTestDataToString);
 
 struct R1ReduceWindowTestData {
   int64 base_bounds[1];
   int64 window_bounds[1];
   int64 strides[1];
-  Padding padding;
+  int64 pad_low[1];
+  int64 pad_high[1];
   Reducer reducer;
 } kR1TestCases[] = {
     {/*base_bounds=*/{1}, /*window_bounds=*/{1},
      /*strides=*/{1},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/{xla::MakePadding({1}, {1}, {1}, Padding::kValid)[0].first},
+     /*pad_high=*/{xla::MakePadding({1}, {1}, {1}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kAdd},
 
     {/*base_bounds=*/{3}, /*window_bounds=*/{3},
      /*strides=*/{1},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/{xla::MakePadding({3}, {3}, {1}, Padding::kValid)[0].first},
+     /*pad_high=*/{xla::MakePadding({3}, {3}, {1}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kAdd},
 
     {/*base_bounds=*/{3}, /*window_bounds=*/{2},
      /*strides=*/{1},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/{xla::MakePadding({3}, {2}, {1}, Padding::kValid)[0].first},
+     /*pad_high=*/{xla::MakePadding({3}, {2}, {1}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kAdd},
 
     {/*base_bounds=*/{5}, /*window_bounds=*/{1},
      /*strides=*/{1},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kMax},
+     /*pad_low=*/{xla::MakePadding({5}, {1}, {1}, Padding::kValid)[0].first},
+     /*pad_high=*/{xla::MakePadding({5}, {1}, {1}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kMax},
 
     {/*base_bounds=*/{16}, /*window_bounds=*/{4},
      /*strides=*/{4},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kMax},
+     /*pad_low=*/{xla::MakePadding({16}, {4}, {4}, Padding::kValid)[0].first},
+     /*pad_high=*/{xla::MakePadding({16}, {4}, {4}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kMax},
 
     {/*base_bounds=*/{16}, /*window_bounds=*/{4},
      /*strides=*/{3},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/{xla::MakePadding({16}, {4}, {3}, Padding::kValid)[0].first},
+     /*pad_high=*/{xla::MakePadding({16}, {4}, {3}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kAdd},
 
-    {/*base_bounds=*/{128 * 2}, /*window_bounds=*/{30},
+    {/*base_bounds=*/{128 * 2},
+     /*window_bounds=*/{30},
      /*strides=*/{27},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
-
-    {/*base_bounds=*/{128 * 17}, /*window_bounds=*/{7},
+     /*pad_low=*/
+     {xla::MakePadding({128 * 2}, {30}, {27}, Padding::kValid)[0].first},
+     /*pad_high=*/
+     {xla::MakePadding({128 * 2}, {30}, {27}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{128 * 17},
+     /*window_bounds=*/{7},
      /*strides=*/{64},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
-
-    {/*base_bounds=*/{128 * 2}, /*window_bounds=*/{32},
+     /*pad_low=*/
+     {xla::MakePadding({128 * 17}, {7}, {64}, Padding::kValid)[0].first},
+     /*pad_high=*/
+     {xla::MakePadding({128 * 17}, {7}, {64}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{128 * 2},
+     /*window_bounds=*/{32},
      /*strides=*/{56},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/
+     {xla::MakePadding({128 * 2}, {32}, {56}, Padding::kValid)[0].first},
+     /*pad_high=*/
+     {xla::MakePadding({128 * 2}, {32}, {56}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kAdd},
 
     {/*base_bounds=*/{3}, /*window_bounds=*/{2},
      /*strides=*/{1},
-     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/{xla::MakePadding({3}, {2}, {1}, Padding::kSame)[0].first},
+     /*pad_high=*/{xla::MakePadding({3}, {2}, {1}, Padding::kSame)[0].second},
+     /*reducer=*/Reducer::kAdd},
 
     {/*base_bounds=*/{5}, /*window_bounds=*/{3},
      /*strides=*/{2},
-     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/{xla::MakePadding({5}, {3}, {2}, Padding::kSame)[0].first},
+     /*pad_high=*/{xla::MakePadding({5}, {3}, {2}, Padding::kSame)[0].second},
+     /*reducer=*/Reducer::kAdd},
 
     {/*base_bounds=*/{16}, /*window_bounds=*/{4},
      /*strides=*/{3},
-     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/{xla::MakePadding({16}, {4}, {3}, Padding::kSame)[0].first},
+     /*pad_high=*/{xla::MakePadding({16}, {4}, {3}, Padding::kSame)[0].second},
+     /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{5}, /*window_bounds=*/{5},
+     /*strides=*/{1},
+     /*pad_low=*/{0},
+     /*pad_high=*/{5},
+     /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{5}, /*window_bounds=*/{5},
+     /*strides=*/{1},
+     /*pad_low=*/{5},
+     /*pad_high=*/{0},
+     /*reducer=*/Reducer::kAdd},
 };
 
 string R1ReduceWindowTestDataToString(
-    const ::testing::TestParamInfo<R1ReduceWindowTestData>& data) {
+    const ::testing::TestParamInfo<
+        ::testing::tuple<R1ReduceWindowTestData, bool>>& data) {
+  const auto& param = ::testing::get<0>(data.param);
   string str = tensorflow::strings::StrCat(
-      "base_bounds_",
-      tensorflow::str_util::Join(data.param.base_bounds, "x"),  //
-      "__window_bounds_",
-      tensorflow::str_util::Join(data.param.window_bounds, "x"),              //
-      "__strides_", tensorflow::str_util::Join(data.param.strides, "x"),      //
-      "__padding_", data.param.padding == Padding::kSame ? "same" : "valid",  //
-      "__reducer_", data.param.reducer == kAdd ? "add" : "max");
+      "base_bounds_", tensorflow::str_util::Join(param.base_bounds, "x"),
+      "__window_bounds_", tensorflow::str_util::Join(param.window_bounds, "x"),
+      "__strides_", tensorflow::str_util::Join(param.strides, "x"),
+      "__pad_low_", tensorflow::str_util::Join(param.pad_low, "x"),
+      "__pad_high_", tensorflow::str_util::Join(param.pad_high, "x"),
+      "__reducer_", param.reducer == kAdd ? "add" : "max");
+  if (::testing::get<1>(data.param)) {
+    str = tensorflow::strings::StrCat(str, "_bfloat16");
+  }
   return str;
 }
 
-class R1ReduceWindowTest
-    : public ClientLibraryTestBase,
-      public ::testing::WithParamInterface<R1ReduceWindowTestData> {};
+class R1ReduceWindowTest : public ReduceWindowTestBase,
+                           public ::testing::WithParamInterface<
+                               ::testing::tuple<R1ReduceWindowTestData, bool>> {
+ protected:
+  R1ReduceWindowTest() { set_use_bfloat16(::testing::get<1>(GetParam())); }
+};
 
 TEST_P(R1ReduceWindowTest, DoIt) {
   ComputationBuilder b(client_, TestName());
-  const auto& param = GetParam();
+  const auto& param = ::testing::get<0>(GetParam());
   CHECK(param.reducer == kAdd || param.reducer == kMax);
 
   const float kInitValue = 0.0f;
@@ -1041,18 +1220,24 @@ TEST_P(R1ReduceWindowTest, DoIt) {
   std::iota(std::begin(input_vector), std::end(input_vector), 0);
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR1(tensorflow::gtl::ArraySlice<float>(input_vector));
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_arg,
-                          client_->TransferToServer(*input_literal));
+  ComputationDataHandle parameter;
+  auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
+                                                     &b, &parameter);
+
+  std::vector<std::pair<int64, int64>> padding(1);
+  padding[0] = {param.pad_low[0], param.pad_high[0]};
 
   auto computation = param.reducer == kAdd
-                         ? CreateScalarAddComputation(F32, &b)
-                         : CreateScalarMaxComputation(F32, &b);
-  b.ReduceWindow(/*operand=*/
-                 b.Parameter(0, input_literal->shape(), "p0"),
-                 /*init_value=*/b.ConstantR0<float>(kInitValue),
-                 /*computation=*/computation,
-                 /*window_dimensions=*/param.window_bounds,
-                 /*window_strides=*/param.strides, /*padding=*/param.padding);
+                         ? CreateScalarAddComputation(FloatType(), &b)
+                         : CreateScalarMaxComputation(FloatType(), &b);
+  auto init_value =
+      CreateConstantFromLiteral(*Literal::CreateR0(kInitValue), &b);
+  b.ReduceWindowWithGeneralPadding(
+      /*operand=*/parameter,
+      /*init_value=*/init_value,
+      /*computation=*/computation,
+      /*window_dimensions=*/param.window_bounds,
+      /*window_strides=*/param.strides, /*padding=*/padding);
 
   auto reduce_func = param.reducer == kAdd
                          ? +[](float a, float b) { return a + b; }
@@ -1062,14 +1247,73 @@ TEST_P(R1ReduceWindowTest, DoIt) {
       /*init=*/kInitValue,
       /*reduce_func=*/reduce_func,
       /*window=*/param.window_bounds,
-      /*stride=*/param.strides, /*padding=*/param.padding);
+      /*stride=*/param.strides,
+      /*padding=*/padding);
 
-  ComputeAndCompareR1<float>(&b, tensorflow::gtl::ArraySlice<float>(*expected),
-                             {input_arg.get()}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&b, *Literal::CreateR1<float>(*expected),
+                           {input_arg.get()}, DefaultErrorSpec());
+}
+
+INSTANTIATE_TEST_CASE_P(
+    R1ReduceWindowTestInstantiation, R1ReduceWindowTest,
+    ::testing::Combine(::testing::ValuesIn(kR1TestCases),
+                       ::testing::ValuesIn(use_bfloat16_params)),
+    R1ReduceWindowTestDataToString);
+
+// Test class for text-based test cases. Note that this compares with the
+// results on the interpreter backend.
+class ReduceWindowTextTest : public HloTestBase {};
+
+TEST_F(ReduceWindowTextTest, R2General256x384) {
+  const string& hlo_string = R"(
+HloModule R2Window
+mul {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT mul = f32[] multiply(lhs, rhs)
+}
+ENTRY R2Window {
+  operand = f32[256,384]{1,0} parameter(0)
+  constant = f32[] constant(1)
+  ROOT reduce-window = f32[256,384]{1,0} reduce-window(operand, constant), window={size=2x3 pad=0_1x1_1}, to_apply=mul
+}
+)";
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001}));
+}
+
+TEST_F(ReduceWindowTextTest, R2General256x384Layout01) {
+  const string& hlo_string = R"(
+HloModule R2Window
+mul {
+lhs = f32[] parameter(0)
+rhs = f32[] parameter(1)
+ROOT mul = f32[] multiply(lhs, rhs)
+}
+ENTRY R2Window {
+operand = f32[256,384]{0,1} parameter(0)
+constant = f32[] constant(1)
+ROOT reduce-window = f32[256,384]{0,1} reduce-window(operand, constant), window={size=2x3 pad=0_1x1_1}, to_apply=mul
+}
+)";
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001}));
+}
+
+TEST_F(ReduceWindowTextTest, R2General2x5) {
+  const string& hlo_string = R"(
+HloModule R2Window
+mul {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT mul = f32[] multiply(lhs, rhs)
+}
+ENTRY R2Window {
+  operand = f32[2,5]{1,0} parameter(0)
+  constant = f32[] constant(1)
+  ROOT reduce-window = f32[3,5]{1,0} reduce-window(operand, constant), window={size=2x1 pad=0_2x0_0}, to_apply=mul
+}
+)";
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001}));
 }
 
-INSTANTIATE_TEST_CASE_P(R1ReduceWindowTestInstantiation, R1ReduceWindowTest,
-                        ::testing::ValuesIn(kR1TestCases),
-                        R1ReduceWindowTestDataToString);
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc
index d235b9a1580ecbd6b82a69fca53d259912ff375e..f7b04debd4f5c40a904e32c832b6fc384a03c33b 100644
--- a/tensorflow/compiler/xla/tests/reshape_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_test.cc
@@ -41,326 +41,467 @@ limitations under the License.
 namespace xla {
 namespace {
 
-class ReshapeTest : public ClientLibraryTestBase {
+// Use a bool parameter to indicate whether to use bfloat16.
+class ReshapeTest : public ::testing::WithParamInterface<bool>,
+                    public ClientLibraryTestBase {
  public:
+  ReshapeTest() { set_use_bfloat16(GetParam()); }
+
   ErrorSpec zero_error_spec_{0.0};
 };
 
 // Collapses 2-dimensional pseudo-scalar (single-element array) to 1 dimension.
-XLA_TEST_F(ReshapeTest, CollapseTrivial1x1) {
+XLA_TEST_P(ReshapeTest, CollapseTrivial1x1) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2<float>({{1.0}});
-  builder.Collapse(/*operand=*/a, /*dimensions=*/{0, 1});
-
-  ComputeAndCompareR1<float>(&builder, {1.0f}, {}, zero_error_spec_);
+  Array2D<float> input_array(1, 1);
+  input_array.Fill(1.0f);
+  auto input_literal = Literal::CreateR2FromArray2D(input_array);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
+
+  auto expected_literal = Literal::CreateR1<float>({1.0f});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, CollapseTrivialR1EmptyDims) {
+XLA_TEST_P(ReshapeTest, CollapseTrivialR1EmptyDims) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR1<float>({1.0});
-  builder.Collapse(/*operand=*/a, /*dimensions=*/{});
-
-  ComputeAndCompareR1<float>(&builder, {1.0f}, {}, zero_error_spec_);
+  auto input_literal = Literal::CreateR1<float>({1.0f});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{});
+
+  auto expected_literal = Literal::CreateR1<float>({1.0f});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, CollapseTrivialR1OnlyDim) {
+XLA_TEST_P(ReshapeTest, CollapseTrivialR1OnlyDim) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR1<float>({1.0});
-  builder.Collapse(/*operand=*/a, /*dimensions=*/{0});
-
-  ComputeAndCompareR1<float>(&builder, {1.0f}, {}, zero_error_spec_);
+  auto input_literal = Literal::CreateR1<float>({1.0f});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0});
+
+  auto expected_literal = Literal::CreateR1<float>({1.0f});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Collapses 2-dimensional pseudo-scalar (single-element array) to scalar.
-XLA_TEST_F(ReshapeTest, SingleElementArrayToScalar) {
+XLA_TEST_P(ReshapeTest, SingleElementArrayToScalar) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2<float>({{1.0}});
-  auto reshape =
-      builder.Reshape(/*operand=*/a, /*dimensions=*/{0, 1}, /*new_sizes=*/{});
+  Array2D<float> input_array(1, 1);
+  input_array.Fill(1.0f);
+  auto input_literal = Literal::CreateR2FromArray2D(input_array);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
+                                                 &builder, &parameter);
+  auto reshape = builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
+                                 /*new_sizes=*/{});
   auto new_shape = builder.GetShape(reshape).ConsumeValueOrDie();
 
-  ComputeAndCompareR0<float>(&builder, 1.0f, {}, zero_error_spec_);
+  auto expected_literal = Literal::CreateR0<float>(1.0f);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, ScalarToSingleElementArray) {
+XLA_TEST_P(ReshapeTest, ScalarToSingleElementArray) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> param0_literal = Literal::CreateR0<float>(1.0f);
-  std::unique_ptr<GlobalData> param0_data =
-      client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
-
-  auto a = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param0");
-  a = builder.Neg(a);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *param0_literal, "param0",
+                                                 &builder, &parameter);
+  auto a = builder.Neg(parameter);
   auto reshape =
       builder.Reshape(/*operand=*/a, /*dimensions=*/{}, /*new_sizes=*/{1});
 
-  ComputeAndCompareR1<float>(&builder, {-1.0f}, {param0_data.get()},
-                             zero_error_spec_);
+  auto expected_literal = Literal::CreateR1<float>({-1.0f});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, Trivial0x3) {
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 3));
-  auto result = builder.Collapse(/*operand=*/a, /*dimensions=*/{0, 1});
-
-  ComputeAndCompareR1<float>(&builder, {}, {}, zero_error_spec_);
+  Array2D<float> input_array(0, 3);
+  auto input_literal = Literal::CreateR2FromArray2D(input_array);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
+  auto expected_literal = Literal::CreateR1<float>({});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // TODO(b/29185393): Make this work with the GPU backend. The GPU backend
 // does not handle zero-sized shapes correctly. Failed last on 2017-05-15
 // with an incorrect result rank.
-XLA_TEST_F(ReshapeTest, DISABLED_ON_GPU(Trivial0x3WithParameter)) {
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3WithParameter)) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR2FromArray2D<float>(Array2D<float>(0, 3));
-  std::unique_ptr<GlobalData> param0_data =
-      client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
-
-  auto a = builder.Parameter(0, ShapeUtil::MakeShape(F32, {0, 3}), "param0");
-  auto result = builder.Collapse(/*operand=*/a, /*dimensions=*/{0, 1});
-
-  ComputeAndCompareR1<float>(&builder, {}, {param0_data.get()},
-                             zero_error_spec_);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *param0_literal, "param0",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
+  auto expected_literal = Literal::CreateR1<float>({});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, Trivial3x0) {
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial3x0)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2FromArray2D<float>(Array2D<float>(3, 0));
-  auto result = builder.Collapse(/*operand=*/a, /*dimensions=*/{0, 1});
-
-  ComputeAndCompareR1<float>(&builder, {}, {}, zero_error_spec_);
+  Array2D<float> input_array(3, 0);
+  auto input_literal = Literal::CreateR2FromArray2D(input_array);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
+  auto expected_literal = Literal::CreateR1<float>({});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Collapses a 2-dimensional row vector to 1 dimension.
-XLA_TEST_F(ReshapeTest, Trivial1x3) {
+XLA_TEST_P(ReshapeTest, Trivial1x3) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2<float>({{1.0f, 2.0f, 3.0f}});
-  auto result = builder.Collapse(/*operand=*/a, /*dimensions=*/{0, 1});
-
-  ComputeAndCompareR1<float>(&builder, {1.0f, 2.0f, 3.0f}, {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateR2<float>({{1.0f, 2.0f, 3.0f}});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
+  auto expected_literal = Literal::CreateR1<float>({1.0f, 2.0f, 3.0f});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Collapses a 2-dimensional column vector to 1 dimension.
-XLA_TEST_F(ReshapeTest, Trivial3x1) {
+XLA_TEST_P(ReshapeTest, Trivial3x1) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2<float>({{1.0f}, {2.0f}, {3.0f}});
-  auto result = builder.Collapse(/*operand=*/a, /*dimensions=*/{0, 1});
-
-  ComputeAndCompareR1<float>(&builder, {1.0f, 2.0f, 3.0f}, {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateR2<float>({{1.0f}, {2.0f}, {3.0f}});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
+  auto expected_literal = Literal::CreateR1<float>({1.0f, 2.0f, 3.0f});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+//
 // Splits an empty vector into an empty matrix.
-XLA_TEST_F(ReshapeTest, R1ToR2_0_To_2x0) {
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(R1ToR2_0_To_2x0)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR1<float>({});
-  auto result =
-      builder.Reshape(/*operand=*/a, /*dimensions=*/{0}, /*new_sizes=*/{2, 0});
-  ComputeAndCompareR2<float>(&builder, Array2D<float>(2, 0), {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateR1<float>({});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0},
+                  /*new_sizes=*/{2, 0});
+  auto expected_literal = Literal::CreateR2<float>({{}, {}});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Splits a vector into a matrix.
-XLA_TEST_F(ReshapeTest, R1ToR2_6_To_2x3) {
+XLA_TEST_P(ReshapeTest, R1ToR2_6_To_2x3) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR1<float>({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
-  auto result =
-      builder.Reshape(/*operand=*/a, /*dimensions=*/{0}, /*new_sizes=*/{2, 3});
-  Array2D<float> expected_2x3({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}});
-  ComputeAndCompareR2<float>(&builder, expected_2x3, {}, zero_error_spec_);
+  auto input_literal =
+      Literal::CreateR1<float>({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0},
+                  /*new_sizes=*/{2, 3});
+  auto expected_literal =
+      Literal::CreateR2<float>({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+//
 // Transposes a 2x0 array to a 0x2 array.
-XLA_TEST_F(ReshapeTest, Reshape0x2To2x0) {
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Reshape0x2To2x0)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 2));
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{0, 1},
-                                /*new_sizes=*/{2, 0});
-
-  ComputeAndCompareR2<float>(&builder, Array2D<float>(2, 0), {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(Array2D<float>(0, 2));
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
+                  /*new_sizes=*/{2, 0});
+  auto expected_literal = Literal::CreateR2<float>({{}, {}});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Transposes a 2-dimensional row vector to a column vector.
-XLA_TEST_F(ReshapeTest, ReshapeRowToCol) {
+XLA_TEST_P(ReshapeTest, ReshapeRowToCol) {
   ComputationBuilder builder(client_, TestName());
   auto simple = MakeLinspaceArray2D(1.0f, 3.0f, 1, 3);
-  auto a = builder.ConstantR2FromArray2D<float>(*simple);
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{0, 1},
-                                /*new_sizes=*/{3, 1});
+  auto input_literal = Literal::CreateFromArray(*simple);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
+                  /*new_sizes=*/{3, 1});
 
   auto expected = ReferenceUtil::TransposeArray2D(*simple);
-  ComputeAndCompareR2<float>(&builder, *expected, {}, zero_error_spec_);
+  auto expected_literal = Literal::CreateFromArray(*expected);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Transposes a 2-dimensional array.
-XLA_TEST_F(ReshapeTest, TransposeAsReshape) {
+XLA_TEST_P(ReshapeTest, TransposeAsReshape) {
   ComputationBuilder builder(client_, TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
-  auto a = builder.ConstantR2FromArray2D<float>(*a4x3);
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{1, 0},
-                                /*new_sizes=*/{3, 4});
-
-  auto expected3x4 = ReferenceUtil::TransposeArray2D(*a4x3);
-  ComputeAndCompareR2<float>(&builder, *expected3x4, {}, zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(*a4x3);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0},
+                  /*new_sizes=*/{3, 4});
+
+  auto expected = ReferenceUtil::TransposeArray2D(*a4x3);
+  auto expected_literal = Literal::CreateFromArray(*expected);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+//
 // Transposes a 0x4 array with ComputationBuilder::Trans.
-XLA_TEST_F(ReshapeTest, Transpose0x4) {
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Transpose0x4)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 4));
-  auto result = builder.Transpose(a, {1, 0});
-
-  ComputeAndCompareR2<float>(&builder, Array2D<float>(4, 0), {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(Array2D<float>(0, 4));
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Transpose(parameter, {1, 0});
+  auto expected_literal = Literal::CreateR2<float>({{}, {}, {}, {}});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Transposes a 2-dimensional array with ComputationBuilder::Trans.
-XLA_TEST_F(ReshapeTest, Transpose4x3) {
+XLA_TEST_P(ReshapeTest, Transpose4x3) {
   ComputationBuilder builder(client_, TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
-  auto a = builder.ConstantR2FromArray2D<float>(*a4x3);
-  auto result = builder.Transpose(a, {1, 0});
-
-  auto expected3x4 = ReferenceUtil::TransposeArray2D(*a4x3);
-  ComputeAndCompareR2<float>(&builder, *expected3x4, {}, zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(*a4x3);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Transpose(parameter, {1, 0});
+
+  auto expected = ReferenceUtil::TransposeArray2D(*a4x3);
+  auto expected_literal = Literal::CreateFromArray(*expected);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+//
 // Reshapes an empty 2-dimensional array with dimensions that are not just a
 // rearrangement of the originals (split), but no reordering (no shuffle).
-XLA_TEST_F(ReshapeTest, ReshapeSplitNoShuffleZeroElements) {
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitNoShuffleZeroElements)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2FromArray2D<float>(Array2D<float>(6, 0));
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{0, 1},
-                                /*new_sizes=*/{2, 3, 0, 0});
-
-  ComputeAndCompareR4<float>(&builder, Array4D<float>(2, 3, 0, 0), {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(Array2D<float>(6, 0));
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
+                  /*new_sizes=*/{2, 3, 0, 0});
+  auto expected_literal = Literal::CreateFromArray(Array4D<float>(2, 3, 0, 0));
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, ReshapeR4ToR2ZeroElements) {
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeR4ToR2ZeroElements)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR4FromArray4D<float>(Array4D<float>(2, 3, 4, 0));
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{0, 1, 2, 3},
-                                /*new_sizes=*/{24, 0});
-
-  ComputeAndCompareR2<float>(&builder, Array2D<float>(24, 0), {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(Array4D<float>(2, 3, 4, 0));
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2, 3},
+                  /*new_sizes=*/{24, 0});
+  auto expected_literal = Literal::CreateFromArray(Array2D<float>(24, 0));
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Reshapes a 2-dimensional array with dimensions that are not just a
 // rearrangement of the originals (split), but no reordering (no shuffle).
-XLA_TEST_F(ReshapeTest, ReshapeSplitNoShuffle) {
+XLA_TEST_P(ReshapeTest, ReshapeSplitNoShuffle) {
   ComputationBuilder builder(client_, TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
-  auto a = builder.ConstantR2FromArray2D<float>(*a4x3);
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{0, 1},
-                                /*new_sizes=*/{2, 6});
-
-  auto expected2x6 = MakeLinspaceArray2D(1.0f, 12.0f, 2, 6);
-  ComputeAndCompareR2<float>(&builder, *expected2x6, {}, zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(*a4x3);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
+                  /*new_sizes=*/{2, 6});
+
+  auto expected = MakeLinspaceArray2D(1.0f, 12.0f, 2, 6);
+  auto expected_literal = Literal::CreateFromArray(*expected);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-// Reshapes a 2-dimensional array with dimensions that are not just a
-// rearrangement of the originals (split), and reorder the input (shuffle).
-XLA_TEST_F(ReshapeTest, ReshapeSplitAndShuffleZeroElements) {
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+//
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitAndShuffleZeroElements)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 6));
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{1, 0},
-                                /*new_sizes=*/{3, 0});
-
-  ComputeAndCompareR2<float>(&builder, Array2D<float>(3, 0), {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(Array2D<float>(0, 6));
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0},
+                  /*new_sizes=*/{3, 0});
+  auto expected_literal = Literal::CreateFromArray(Array2D<float>(3, 0));
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Reshapes a 2-dimensional array with dimensions that are not just a
 // rearrangement of the originals (split), and reorder the input (shuffle).
-XLA_TEST_F(ReshapeTest, ReshapeSplitAndShuffle) {
+XLA_TEST_P(ReshapeTest, ReshapeSplitAndShuffle) {
   ComputationBuilder builder(client_, TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
-  auto a = builder.ConstantR2FromArray2D<float>(*a4x3);
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{1, 0},
-                                /*new_sizes=*/{2, 6});
-
-  Array2D<float> expected2x6({{1.0f, 4.0f, 7.0f, 10.0f, 2.0f, 5.0f},
-                              {8.0f, 11.0f, 3.0f, 6.0f, 9.0f, 12.0f}});
-  ComputeAndCompareR2<float>(&builder, expected2x6, {}, zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(*a4x3);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0},
+                  /*new_sizes=*/{2, 6});
+  Array2D<float> expected({{1.0f, 4.0f, 7.0f, 10.0f, 2.0f, 5.0f},
+                           {8.0f, 11.0f, 3.0f, 6.0f, 9.0f, 12.0f}});
+  auto expected_literal = Literal::CreateFromArray(expected);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // The following tests use the same input 3D array; they test the examples we
 // show for the Reshape operation in the operation_semantics document.
 // TODO(b/34503277): find a way to show this code in the documentation without
 // duplication on the TF documentation server.
-Array3D<int> v_array_for_doc_R3_tests({{{10, 11, 12}, {15, 16, 17}},
-                                       {{20, 21, 22}, {25, 26, 27}},
-                                       {{30, 31, 32}, {35, 36, 37}},
-                                       {{40, 41, 42}, {45, 46, 47}}});
-
-XLA_TEST_F(ReshapeTest, DocR3_R1_Collapse_012) {
-  ComputationBuilder builder(client_, TestName());
-  auto v = builder.ConstantR3FromArray3D<int>(v_array_for_doc_R3_tests);
-  auto result = builder.Reshape(/*operand=*/v, /*dimensions=*/{0, 1, 2},
-                                /*new_sizes=*/{24});
-  ComputeAndCompareR1<int>(&builder,
-                           {10, 11, 12, 15, 16, 17, 20, 21, 22, 25, 26, 27,
-                            30, 31, 32, 35, 36, 37, 40, 41, 42, 45, 46, 47},
-                           {});
-}
-
-XLA_TEST_F(ReshapeTest, DocR3_R2_Collapse_012_Refine_83) {
-  ComputationBuilder builder(client_, TestName());
-  auto v = builder.ConstantR3FromArray3D<int>(v_array_for_doc_R3_tests);
-  auto result = builder.Reshape(/*operand=*/v, /*dimensions=*/{0, 1, 2},
-                                /*new_sizes=*/{8, 3});
-  Array2D<int> expected({{10, 11, 12},
-                         {15, 16, 17},
-                         {20, 21, 22},
-                         {25, 26, 27},
-                         {30, 31, 32},
-                         {35, 36, 37},
-                         {40, 41, 42},
-                         {45, 46, 47}});
-  ComputeAndCompareR2<int>(&builder, expected, {});
-}
-
-XLA_TEST_F(ReshapeTest, DocR3_R1_Collapse_120) {
-  ComputationBuilder builder(client_, TestName());
-  auto v = builder.ConstantR3FromArray3D<int>(v_array_for_doc_R3_tests);
-  auto result = builder.Reshape(/*operand=*/v, /*dimensions=*/{1, 2, 0},
-                                /*new_sizes=*/{24});
-  ComputeAndCompareR1<int>(&builder,
-                           {10, 20, 30, 40, 11, 21, 31, 41, 12, 22, 32, 42,
-                            15, 25, 35, 45, 16, 26, 36, 46, 17, 27, 37, 47},
-                           {});
-}
-
-XLA_TEST_F(ReshapeTest, DocR3_R2_Collapse_120_Refine_83) {
-  ComputationBuilder builder(client_, TestName());
-  auto v = builder.ConstantR3FromArray3D<int>(v_array_for_doc_R3_tests);
-  auto result = builder.Reshape(/*operand=*/v, /*dimensions=*/{1, 2, 0},
-                                /*new_sizes=*/{8, 3});
-  Array2D<int> expected({{10, 20, 30},
-                         {40, 11, 21},
-                         {31, 41, 12},
-                         {22, 32, 42},
-                         {15, 25, 35},
-                         {45, 16, 26},
-                         {36, 46, 17},
-                         {27, 37, 47}});
-  ComputeAndCompareR2<int>(&builder, expected, {});
-}
-
-XLA_TEST_F(ReshapeTest, DocR3_R3_Collapse_120_Refine_262) {
-  ComputationBuilder builder(client_, TestName());
-  auto v = builder.ConstantR3FromArray3D<int>(v_array_for_doc_R3_tests);
-  auto result = builder.Reshape(/*operand=*/v, /*dimensions=*/{1, 2, 0},
-                                /*new_sizes=*/{2, 6, 2});
-  Array3D<int> expected(
+static Array3D<float> ArrayForDocR3Tests() {
+  return Array3D<float>({{{10, 11, 12}, {15, 16, 17}},
+                         {{20, 21, 22}, {25, 26, 27}},
+                         {{30, 31, 32}, {35, 36, 37}},
+                         {{40, 41, 42}, {45, 46, 47}}});
+}
+
+XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_012) {
+  ComputationBuilder builder(client_, TestName());
+  auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2},
+                  /*new_sizes=*/{24});
+  auto expected_literal = Literal::CreateR1<float>(
+      {10, 11, 12, 15, 16, 17, 20, 21, 22, 25, 26, 27,
+       30, 31, 32, 35, 36, 37, 40, 41, 42, 45, 46, 47});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
+}
+
+XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_012_Refine_83) {
+  ComputationBuilder builder(client_, TestName());
+  auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2},
+                  /*new_sizes=*/{8, 3});
+  auto expected_literal = Literal::CreateR2<float>({{10, 11, 12},
+                                                    {15, 16, 17},
+                                                    {20, 21, 22},
+                                                    {25, 26, 27},
+                                                    {30, 31, 32},
+                                                    {35, 36, 37},
+                                                    {40, 41, 42},
+                                                    {45, 46, 47}});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
+}
+
+XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_120) {
+  ComputationBuilder builder(client_, TestName());
+  auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0},
+                  /*new_sizes=*/{24});
+  auto expected_literal = Literal::CreateR1<float>(
+      {10, 20, 30, 40, 11, 21, 31, 41, 12, 22, 32, 42,
+       15, 25, 35, 45, 16, 26, 36, 46, 17, 27, 37, 47});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
+}
+
+XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_120_Refine_83) {
+  ComputationBuilder builder(client_, TestName());
+  auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0},
+                  /*new_sizes=*/{8, 3});
+  auto expected_literal = Literal::CreateR2<float>({{10, 20, 30},
+                                                    {40, 11, 21},
+                                                    {31, 41, 12},
+                                                    {22, 32, 42},
+                                                    {15, 25, 35},
+                                                    {45, 16, 26},
+                                                    {36, 46, 17},
+                                                    {27, 37, 47}});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
+}
+
+XLA_TEST_P(ReshapeTest, DocR3_R3_Collapse_120_Refine_262) {
+  ComputationBuilder builder(client_, TestName());
+  auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0},
+                  /*new_sizes=*/{2, 6, 2});
+  auto expected_literal = Literal::CreateR3<float>(
       {{{10, 20}, {30, 40}, {11, 21}, {31, 41}, {12, 22}, {32, 42}},
        {{15, 25}, {35, 45}, {16, 26}, {36, 46}, {17, 27}, {37, 47}}});
-  ComputeAndCompareR3<int>(&builder, expected, {});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Collapses the low dimensions of a 4D tensor to get a 2D matrix, without
@@ -378,23 +519,26 @@ XLA_TEST_F(ReshapeTest, DocR3_R3_Collapse_120_Refine_262) {
 // Then we collapse Z be collapsed so we just end up with planes:
 //
 // 1 2 3 4 5 6 1 2 3 4 5 6
-XLA_TEST_F(ReshapeTest, FullyConnectedCollapse) {
+XLA_TEST_P(ReshapeTest, FullyConnectedCollapse) {
   ComputationBuilder builder(client_, TestName());
   Array4D<float> t2x2x2x3(2, 2, 2, 3);
   auto filler2x3 = MakeLinspaceArray2D(1.0f, 6.0f, 2, 3);
   t2x2x2x3.FillWithYX(*filler2x3);
-  auto a = builder.ConstantR4FromArray4D<float>(t2x2x2x3);
-  auto result = builder.Collapse(/*operand=*/a, /*dimensions=*/{1, 2, 3});
-
-  Array2D<float> expected2x12(
+  auto input_literal = Literal::CreateFromArray(t2x2x2x3);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{1, 2, 3});
+  auto expected_literal = Literal::CreateR2<float>(
       {{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
        {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
         6.0f}});
-  ComputeAndCompareR2<float>(&builder, expected2x12, {}, zero_error_spec_);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // As above, but uses reshape directly.
-XLA_TEST_F(ReshapeTest, FullyConnectedCollapseDesugared) {
+XLA_TEST_P(ReshapeTest, FullyConnectedCollapseDesugared) {
   ComputationBuilder builder(client_, TestName());
   Array4D<float> t(2, 1, 2, 2);
   t(0, 0, 0, 0) = 0;
@@ -405,52 +549,68 @@ XLA_TEST_F(ReshapeTest, FullyConnectedCollapseDesugared) {
   t(1, 0, 0, 1) = 5;
   t(1, 0, 1, 0) = 6;
   t(1, 0, 1, 1) = 7;
-  auto a = builder.ConstantR4FromArray4D<float>(t);
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{0, 1, 2, 3},
-                                /*new_sizes=*/{2, 4});
-
-  Array2D<float> expected({{0, 1, 2, 3}, {4, 5, 6, 7}});
-  ComputeAndCompareR2<float>(&builder, expected, {}, zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(t);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2, 3},
+                  /*new_sizes=*/{2, 4});
+
+  auto expected_literal =
+      Literal::CreateR2<float>({{0, 1, 2, 3}, {4, 5, 6, 7}});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Reshape various ranks to a scalar.
-XLA_TEST_F(ReshapeTest, ToScalar) {
+XLA_TEST_P(ReshapeTest, ToScalar) {
   for (int rank = 0; rank < 8; ++rank) {
     ComputationBuilder b(client_, TestName());
-    auto input = Literal::CreateR1<float>({83.0f});
     std::vector<int64> ones(rank, 1);  // this is {1, ..., 1}.
     std::vector<int64> dimensions(rank);
     std::iota(dimensions.begin(), dimensions.end(), 0);
-    *input->mutable_shape() = ShapeUtil::MakeShape(F32, ones);
-    b.Reshape(b.ConstantLiteral(*input), dimensions, {});
+    Literal input_literal(ShapeUtil::MakeShape(F32, ones));
+    std::vector<int64> zeros(rank, 0);  // this is {0, ..., 0}.
+    input_literal.Set<float>(zeros, 83.0f);
 
-    ComputeAndCompareR0<float>(&b, 83.0f, {}, zero_error_spec_);
+    ComputationDataHandle parameter;
+    auto input = CreateParameterAndTransferLiteral(0, input_literal, "input",
+                                                   &b, &parameter);
+    b.Reshape(parameter, dimensions, {});
+
+    auto expected_literal = Literal::CreateR0<float>(83.0f);
+    ComputeAndCompareLiteral(&b, *expected_literal, {input.get()},
+                             zero_error_spec_);
   }
 }
 
-XLA_TEST_F(ReshapeTest, BadDimensions) {
+XLA_TEST_P(ReshapeTest, BadDimensions) {
   ComputationBuilder b(client_, TestName());
-  b.Reshape(b.ConstantR1<int32>({1}), {}, {});
+  auto input_literal = Literal::CreateR1<float>({1.0f});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &b,
+                                                 &parameter);
+  b.Reshape(parameter, {}, {});
   EXPECT_THAT(
       ExecuteToString(&b, {}),
       ::testing::HasSubstr("not a permutation of the operand dimensions"));
 }
 
-XLA_TEST_F(ReshapeTest, BadNewSizes) {
+XLA_TEST_P(ReshapeTest, BadNewSizes) {
   ComputationBuilder b(client_, TestName());
-  b.Reshape(b.ConstantR1<int32>({1, 2}), {1}, {});
+  auto input_literal = Literal::CreateR1<float>({1.0f, 2.0f});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &b,
+                                                 &parameter);
+  b.Reshape(parameter, {1}, {});
   EXPECT_THAT(ExecuteToString(&b, {}),
               ::testing::HasSubstr("mismatched element counts"));
 }
 
-XLA_TEST_F(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
-  const Shape parameter_shape = ShapeUtil::MakeShape(F32, {2, 2, 2, 2});
+XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, parameter_shape, "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 8});
-
   // clang-format off
-  auto literal = Literal::CreateR4FromArray4DWithLayout(Array4D<float>{
+  auto input_literal = Literal::CreateR4FromArray4DWithLayout(Array4D<float>{
     {
       {
         {0, 1},
@@ -474,8 +634,12 @@ XLA_TEST_F(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
   },
        LayoutUtil::MakeLayout({0, 1, 2, 3}));
   // clang-format on
-  std::unique_ptr<GlobalData> input =
-      client_->TransferToServer(*literal).ConsumeValueOrDie();
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 8});
+
   Array2D<float> expected_array({
       {0, 1, 2, 3, 100, 101, 102, 103},
       {222, 333, 444, 555, 666, 777, 888, 999},
@@ -484,72 +648,75 @@ XLA_TEST_F(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
   Computation computation = builder.Build().ConsumeValueOrDie();
   ExecutionOptions execution_options = execution_options_;
   *execution_options.mutable_shape_with_output_layout() =
-      ShapeUtil::MakeShapeWithLayout(F32, {2, 8}, {1, 0});
+      ShapeUtil::MakeShapeWithLayout(use_bfloat16() ? BF16 : F32, {2, 8},
+                                     {1, 0});
   std::unique_ptr<Literal> actual =
       client_
           ->ExecuteAndTransfer(computation, {input.get()}, &execution_options)
           .ConsumeValueOrDie();
   std::unique_ptr<Literal> expected =
       Literal::CreateR2FromArray2D<float>(expected_array);
+  if (use_bfloat16()) {
+    expected = LiteralTestUtil::ConvertF32ToBF16(*expected);
+  }
   LiteralTestUtil::ExpectEqual(*expected, *actual);
 }
 
-XLA_TEST_F(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) {
-  std::unique_ptr<Literal> input = Literal::CreateR2<float>({
+XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) {
+  ComputationBuilder builder(client_, TestName());
+  std::unique_ptr<Literal> input_literal = Literal::CreateR2<float>({
       {0, 1, 2, 3, 4, 5, 6, 7},
       {100, 101, 102, 103, 104, 105, 106, 107},
       {200, 201, 202, 203, 204, 205, 206, 207},
   });
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input).ConsumeValueOrDie();
-
-  ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1}, /*new_sizes=*/{3, 2, 1, 4});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1}, /*new_sizes=*/{3, 2, 1, 4});
 
   // clang-format off
-  Array4D<float> expected = {
+  auto expected_literal = Literal::CreateR4<float>({
     {{{0, 1, 2, 3}},
      {{4, 5, 6, 7}}},
     {{{100, 101, 102, 103}},
      {{104, 105, 106, 107}}},
     {{{200, 201, 202, 203}},
      {{204, 205, 206, 207}}}
-  };
+  });
   // clang-format on
-  ComputeAndCompareR4<float>(&builder, expected, {input_data.get()},
-                             zero_error_spec_);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Tests R2->R4 reshape with the reshape dimensions {1, 0}.
-XLA_TEST_F(ReshapeTest, R2ToR4_3x8_To_3x2x1x4_Dimensions_10) {
-  std::unique_ptr<Literal> input = Literal::CreateR2<float>({
+XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4_Dimensions_10) {
+  ComputationBuilder builder(client_, TestName());
+  std::unique_ptr<Literal> input_literal = Literal::CreateR2<float>({
       {0, 1, 2, 3, 4, 5, 6, 7},
       {100, 101, 102, 103, 104, 105, 106, 107},
       {200, 201, 202, 203, 204, 205, 206, 207},
   });
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input).ConsumeValueOrDie();
-
-  ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{1, 0}, /*new_sizes=*/{3, 2, 1, 4});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{1, 0}, /*new_sizes=*/{3, 2, 1, 4});
 
   // clang-format off
-  Array4D<float> expected = {
+  auto expected_literal = Literal::CreateR4<float>({
     {{{0, 100, 200, 1}},
      {{101, 201, 2, 102}}},
     {{{202, 3, 103, 203}},
      {{4, 104, 204, 5}}},
     {{{105, 205, 6, 106}},
      {{206, 7, 107, 207}}}
-  };
+  });
   // clang-format on
-  ComputeAndCompareR4<float>(&builder, expected, {input_data.get()},
-                             zero_error_spec_);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
+XLA_TEST_P(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
+  ComputationBuilder builder(client_, TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input(2, 1, 1, 1);
@@ -559,12 +726,10 @@ XLA_TEST_F(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
-  ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 1});
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 1});
 
   std::unique_ptr<Literal> expected =
       LiteralTestUtil::Reshape({2, 1}, {1, 0}, *input_literal);
@@ -572,7 +737,8 @@ XLA_TEST_F(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
                            zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
+XLA_TEST_P(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
+  ComputationBuilder builder(client_, TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input(2, 1, 4, 1);
@@ -582,12 +748,10 @@ XLA_TEST_F(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
-  ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{4, 2});
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{4, 2});
 
   std::unique_ptr<Literal> expected =
       LiteralTestUtil::Reshape({4, 2}, {1, 0}, *input_literal);
@@ -596,7 +760,8 @@ XLA_TEST_F(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
 }
 
 // Tests R4->R2 reshape with the reshape dimensions {0, 2, 1, 3}.
-XLA_TEST_F(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
+XLA_TEST_P(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
+  ComputationBuilder builder(client_, TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input(5, 10, 2, 3);
@@ -606,12 +771,11 @@ XLA_TEST_F(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
-  ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 2, 1, 3}, /*new_sizes=*/{5, 60});
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 2, 1, 3},
+                  /*new_sizes=*/{5, 60});
 
   Array2D<float> expected_array(5, 60);
   input.Each([&](tensorflow::gtl::ArraySlice<int64> indices, float* cell) {
@@ -619,10 +783,12 @@ XLA_TEST_F(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
         *cell;
   });
   auto expected = Literal::CreateR2FromArray2D(expected_array);
-  ComputeAndCompareLiteral(&builder, *expected, {input_data.get()});
+  ComputeAndCompareLiteral(&builder, *expected, {input_data.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, NoopReshape) {
+XLA_TEST_P(ReshapeTest, NoopReshape) {
+  ComputationBuilder builder(client_, TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input_array(2, 3, 5, 7);
@@ -632,18 +798,17 @@ XLA_TEST_F(ReshapeTest, NoopReshape) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({1, 2, 3, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
-  ComputationBuilder builder(client_, TestName());
-  auto input = builder.Parameter(0, input_literal->shape(), "input");
-  builder.Reshape(input, /*dimensions=*/{3, 0, 1, 2},
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{3, 0, 1, 2},
                   /*new_sizes=*/{7, 2, 3, 5});
   Computation computation = builder.Build().ConsumeValueOrDie();
 
   ExecutionOptions execution_options = execution_options_;
   *execution_options.mutable_shape_with_output_layout() =
-      ShapeUtil::MakeShapeWithLayout(F32, {7, 2, 3, 5}, {2, 3, 0, 1});
+      ShapeUtil::MakeShapeWithLayout(use_bfloat16() ? BF16 : F32, {7, 2, 3, 5},
+                                     {2, 3, 0, 1});
   std::unique_ptr<Literal> output_literal =
       client_
           ->ExecuteAndTransfer(computation, {input_data.get()},
@@ -652,35 +817,43 @@ XLA_TEST_F(ReshapeTest, NoopReshape) {
 
   // Since the reshape is a no-op, verify that it does not change the underlying
   // data.
-  EXPECT_EQ(tensorflow::gtl::ArraySlice<float>(input_literal->f32s()),
-            tensorflow::gtl::ArraySlice<float>(output_literal->f32s()));
+  if (use_bfloat16()) {
+    auto expected = LiteralTestUtil::ConvertF32ToBF16(*input_literal);
+    EXPECT_EQ(expected->data<bfloat16>(), output_literal->data<bfloat16>());
+  } else {
+    EXPECT_EQ(input_literal->data<float>(), output_literal->data<float>());
+  }
 }
 
-XLA_TEST_F(ReshapeTest, R4ToR4Reshape_Trivial) {
-  auto literal_1x2x3x4 = Literal::CreateR4(
+XLA_TEST_P(ReshapeTest, R4ToR4Reshape_Trivial) {
+  ComputationBuilder builder(client_, TestName());
+  auto literal_1x2x3x4 = Literal::CreateR4<float>(
       {{{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}},
         {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}});
 
-  ComputationBuilder builder(client_, TestName());
-  auto input = builder.ConstantLiteral(*literal_1x2x3x4);
-  builder.Reshape(input, /*dimensions=*/{0, 1, 2, 3},
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *literal_1x2x3x4, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3},
                   /*new_sizes=*/{1, 2, 3, 4});
 
-  ComputeAndCompareLiteral(&builder, *literal_1x2x3x4, {});
+  ComputeAndCompareLiteral(&builder, *literal_1x2x3x4, {input.get()});
 }
 
-XLA_TEST_F(ReshapeTest, R4ToR4Reshape) {
-  auto literal_1x2x3x4 = Literal::CreateR4(
+XLA_TEST_P(ReshapeTest, R4ToR4Reshape) {
+  auto literal_1x2x3x4 = Literal::CreateR4<float>(
       {{{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}},
         {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}});
 
   ComputationBuilder builder(client_, TestName());
-  auto input = builder.ConstantLiteral(*literal_1x2x3x4);
-  builder.Reshape(input, /*dimensions=*/{1, 3, 2, 0},
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *literal_1x2x3x4, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{1, 3, 2, 0},
                   /*new_sizes=*/{2, 4, 3, 1});
 
   // clang-format off
-  auto expected_2x4x3x1 = Literal::CreateR4(
+  auto expected_2x4x3x1 = Literal::CreateR4<float>(
       {{{{1}, {5}, {9}},
         {{2}, {6}, {10}},
         {{3}, {7}, {11}},
@@ -691,10 +864,10 @@ XLA_TEST_F(ReshapeTest, R4ToR4Reshape) {
         {{16}, {20}, {24}}}});
   // clang-format on
 
-  ComputeAndCompareLiteral(&builder, *expected_2x4x3x1, {});
+  ComputeAndCompareLiteral(&builder, *expected_2x4x3x1, {input.get()});
 }
 
-XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeSimple) {
+XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeSimple) {
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   std::vector<int64> bounds = {2, 2, 2, 2};
@@ -706,12 +879,12 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeSimple) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds);
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
+                  /*new_sizes=*/new_bounds);
 
   std::unique_ptr<Literal> expected =
       LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal)
@@ -723,7 +896,7 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeSimple) {
                            zero_error_spec_, &expected->shape());
 }
 
-XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) {
+XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) {
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   std::vector<int64> bounds = {1, 1, 250, 300};
@@ -735,12 +908,12 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds);
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
+                  /*new_sizes=*/new_bounds);
 
   std::unique_ptr<Literal> expected =
       LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal)
@@ -752,7 +925,7 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) {
                            zero_error_spec_, &expected->shape());
 }
 
-XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) {
+XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) {
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   std::vector<int64> bounds = {5, 5, 1, 10};
@@ -764,12 +937,12 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds);
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
+                  /*new_sizes=*/new_bounds);
 
   std::unique_ptr<Literal> expected =
       LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal)
@@ -781,7 +954,7 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) {
                            zero_error_spec_, &expected->shape());
 }
 
-XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) {
+XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) {
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   // This happens in NN-Builder MNIST.
@@ -794,12 +967,12 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds);
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
+                  /*new_sizes=*/new_bounds);
 
   std::unique_ptr<Literal> expected =
       LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal)
@@ -811,7 +984,7 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) {
                            zero_error_spec_, &expected->shape());
 }
 
-XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeTrivialR2) {
+XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeTrivialR2) {
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   std::vector<int64> bounds = {3, 3, 1, 3};
@@ -823,12 +996,12 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeTrivialR2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({0, 1, 2, 3}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{1, 0, 2, 3}, /*new_sizes=*/new_bounds);
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{1, 0, 2, 3},
+                  /*new_sizes=*/new_bounds);
 
   std::unique_ptr<Literal> expected =
       LiteralTestUtil::Reshape(new_bounds, {1, 0, 2, 3}, *input_literal)
@@ -840,5 +1013,12 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeTrivialR2) {
                            zero_error_spec_, &expected->shape());
 }
 
+#ifdef XLA_BACKEND_SUPPORTS_BFLOAT16
+INSTANTIATE_TEST_CASE_P(ReshapeTestInstance, ReshapeTest, ::testing::Bool());
+#else
+INSTANTIATE_TEST_CASE_P(ReshapeTestInstance, ReshapeTest,
+                        ::testing::ValuesIn(std::vector<bool>{false}));
+#endif
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/reverse_test.cc b/tensorflow/compiler/xla/tests/reverse_test.cc
index 1f6cfc85ccd25bb22db51411f7376489c14c3603..8fc841f14087cdea02fe44cdaea521ff92122aec 100644
--- a/tensorflow/compiler/xla/tests/reverse_test.cc
+++ b/tensorflow/compiler/xla/tests/reverse_test.cc
@@ -28,56 +28,89 @@ limitations under the License.
 namespace xla {
 namespace {
 
-class ReverseTest : public ClientLibraryTestBase {};
-
-// Tests the reverse operation on a scalar.
-XLA_TEST_F(ReverseTest, ReverseScalar) {
-  ComputationBuilder b(client_, TestName());
-  float input = 3.5f;
-  b.Rev(b.ConstantR0<float>(input), {});
-  ComputeAndCompareR0<float>(&b, input, {});
-}
-
-// Tests the reverse operation on a 0x0 float array on both dimensions.
-XLA_TEST_F(ReverseTest, Reverse0x0FloatArray) {
-  ComputationBuilder b(client_, TestName());
-  b.Rev(b.ConstantR2FromArray2D<float>(Array2D<float>(0, 0)), {0, 1});
-  ComputeAndCompareR2<float>(&b, Array2D<float>(0, 0), {});
-}
-
-// Tests the reverse operation on a 0x1 float array on both dimensions.
-XLA_TEST_F(ReverseTest, Reverse0x1FloatArray) {
-  ComputationBuilder b(client_, TestName());
-  b.Rev(b.ConstantR2FromArray2D<float>(Array2D<float>(0, 1)), {0, 1});
-  ComputeAndCompareR2<float>(&b, Array2D<float>(0, 1), {});
+#ifdef XLA_BACKEND_SUPPORTS_BFLOAT16
+// Tests both F32 and BF16.
+static std::array<bool, 2> use_bfloat16_params{false, true};
+#else
+// Only tests F32.
+static std::array<bool, 1> use_bfloat16_params{false};
+#endif
+
+struct ReverseSpec {
+  tensorflow::gtl::ArraySlice<int64> input_dims;
+  tensorflow::gtl::ArraySlice<int64> reversal;
+  bool use_bfloat16;
+
+  string ToTestCaseName() const {
+    return tensorflow::strings::Printf(
+        "reverse_%s_in_dims_%s_%s",
+        tensorflow::str_util::Join(input_dims, "x").c_str(),
+        tensorflow::str_util::Join(reversal, "x").c_str(),
+        use_bfloat16 ? "bf16" : "f32");
+  }
+};
+
+static std::vector<ReverseSpec> GetTestCases() {
+  // clang-format off
+  return ExpandUseBfloat16<ReverseSpec>(
+      use_bfloat16_params,
+      {{{}, {}},
+        {{0, 0}, {0, 1}},
+        {{0, 1}, {0, 1}},
+        {{1, 0}, {0, 1}},
+        {{1, 1}, {0, 1}},
+        {{2, 0, 4, 3}, {0, 2}},
+        {{2, 0, 4, 3}, {1, 3}},
+        {{1, 2, 3, 4}, {0, 3}},
+        {{4, 3, 2, 1}, {0, 1}},
+      });
+  // clang-format on
 }
 
-// Tests the reverse operation on a 1x0 float array on both dimensions.
-XLA_TEST_F(ReverseTest, Reverse1x0FloatArray) {
-  ComputationBuilder b(client_, TestName());
-  b.Rev(b.ConstantR2FromArray2D<float>(Array2D<float>(1, 0)), {0, 1});
-  ComputeAndCompareR2<float>(&b, Array2D<float>(1, 0), {});
+void PrintTo(const ReverseSpec& spec, std::ostream* os) {
+  *os << spec.ToTestCaseName();
 }
 
-// Tests the reverse operation on a 1x1 float array on both dimensions.
-XLA_TEST_F(ReverseTest, Reverse1x1FloatArray) {
-  ComputationBuilder b(client_, TestName());
-  Array2D<float> input({{3.5f}});
-  b.Rev(b.ConstantR2FromArray2D<float>(input), {0, 1});
-  ComputeAndCompareR2<float>(&b, input, {});
+class FloatReverseTest : public ClientLibraryTestBase,
+                         public ::testing::WithParamInterface<ReverseSpec> {
+ public:
+  FloatReverseTest() { set_use_bfloat16(GetParam().use_bfloat16); }
+};
+
+TEST_P(FloatReverseTest, Reverses) {
+  const ReverseSpec& spec = GetParam();
+  std::vector<float> input_vector(
+      ShapeUtil::ElementsIn(ShapeUtil::MakeShape(F32, spec.input_dims)));
+  std::iota(input_vector.begin(), input_vector.end(), 0.0);
+  auto r1_literal = Literal::CreateR1<float>(input_vector);
+  auto input_literal = r1_literal->Reshape(spec.input_dims).ConsumeValueOrDie();
+
+  ComputationBuilder builder(client_, TestName());
+  auto a = AddParam(*input_literal, &builder);
+  builder.Rev(a, spec.reversal);
+
+  std::unique_ptr<Literal> expected = input_literal->CloneToUnique();
+  std::vector<int64> output_indices(spec.input_dims.size());
+  expected->EachCell<float>(
+      [&](tensorflow::gtl::ArraySlice<int64> indices, float) {
+        for (int64 i = 0; i < indices.size(); ++i) {
+          output_indices[i] = indices[i];
+        }
+        float value = input_literal->Get<float>(indices);
+        for (int64 dim : spec.reversal) {
+          output_indices[dim] = (spec.input_dims[dim] - 1) - indices[dim];
+        }
+        expected->Set<float>(output_indices, value);
+      });
+  ComputeAndCompareLiteral(&builder, *expected, {});
 }
 
-XLA_TEST_F(ReverseTest, Reverse2x0x4x3FloatArrayDim02) {
-  ComputationBuilder b(client_, TestName());
-  b.Rev(b.ConstantR4FromArray4D<float>(Array4D<float>(2, 0, 4, 3)), {0, 2});
-  ComputeAndCompareR4<float>(&b, Array4D<float>(2, 0, 4, 3), {});
-}
+INSTANTIATE_TEST_CASE_P(FloatReverseInstance, FloatReverseTest,
+                        ::testing::ValuesIn(GetTestCases()),
+                        ::testing::PrintToStringParamName());
 
-XLA_TEST_F(ReverseTest, Reverse2x0x4x3FloatArrayDim13) {
-  ComputationBuilder b(client_, TestName());
-  b.Rev(b.ConstantR4FromArray4D<float>(Array4D<float>(2, 0, 4, 3)), {1, 3});
-  ComputeAndCompareR4<float>(&b, Array4D<float>(2, 0, 4, 3), {});
-}
+// A simple test class which not templated by float precision.
+class ReverseTest : public ClientLibraryTestBase {};
 
 // Tests the reverse operation on a 4D U8 array on dimension 0 and 3.
 XLA_TEST_F(ReverseTest, Reverse4DU8ArrayOnDim23) {
diff --git a/tensorflow/compiler/xla/tests/sample_file_test.cc b/tensorflow/compiler/xla/tests/sample_file_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..31b104f4e37f77d47f56ff8183ee1de1cc22e44d
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/sample_file_test.cc
@@ -0,0 +1,51 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This demonstrates how to use hlo_test_base to create a file based testcase
+// and compare results on gpu and cpu.
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+class SampleFileTest : public HloTestBase {
+ protected:
+  SampleFileTest()
+      : HloTestBase(
+            /*test_platform=*/PlatformUtil::GetPlatform("gpu").ValueOrDie(),
+            /*reference_platform=*/PlatformUtil::GetPlatform("cpu")
+                .ValueOrDie()) {}
+};
+
+TEST_F(SampleFileTest, Convolution) {
+  const string& filename = "compiler/xla/tests/isolated_convolution.hlo";
+  string test_srcdir = tensorflow::testing::TensorFlowSrcRoot();
+  EXPECT_TRUE(RunAndCompareFromFile(
+      tensorflow::io::JoinPath(test_srcdir, filename), ErrorSpec{0.01}));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/sample_text_test.cc b/tensorflow/compiler/xla/tests/sample_text_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b4f2b74e3dc9e80f50454b28eb6f2502cef3e681
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/sample_text_test.cc
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This demonstrates how to use hlo_test_base to create textual IR based
+// testcases.
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/gtl/optional.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+using tensorflow::gtl::nullopt;
+
+class SampleTextTest : public HloTestBase {};
+
+TEST_F(SampleTextTest, Axpy) {
+  const string& hlo_string = R"(
+HloModule axpy_module:
+ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
+  %alpha = f32[] parameter(0)
+  %broadcast = f32[2,4]{1,0} broadcast(f32[] %alpha), dimensions={}
+  %x = f32[2,4]{1,0} parameter(1)
+  %multiply = f32[2,4]{1,0} multiply(f32[2,4]{1,0} %broadcast, f32[2,4]{1,0} %x)
+  %y = f32[2,4]{1,0} parameter(2)
+  ROOT %add = f32[2,4]{1,0} add(f32[2,4]{1,0} %multiply, f32[2,4]{1,0} %y)
+}
+)";
+  EXPECT_TRUE(RunAndCompareNoHloPasses(hlo_string, ErrorSpec{0.0001}));
+}
+
+TEST_F(SampleTextTest, Tuple) {
+  const string& hlo_string = R"(
+HloModule TupleCreate_module:
+ENTRY %TupleCreate.v4 (v1: f32[], v2: f32[3], v3: f32[2,3]) -> (f32[], f32[3], f32[2,3]) {
+  %v1 = f32[] parameter(0)
+  %v2 = f32[3]{0} parameter(1)
+  %v3 = f32[2,3]{1,0} parameter(2)
+  ROOT %tuple = (f32[], f32[3]{0}, f32[2,3]{1,0}) tuple(f32[] %v1, f32[3]{0} %v2, f32[2,3]{1,0} %v3)
+}
+)";
+  EXPECT_TRUE(RunAndCompare(hlo_string, nullopt));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/scalar_computations_test.cc b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
index b5e7570778ffeca66cc15d7cd2b153639637a647..4da6ee91607941b395b00befc98a10e7c17746ed 100644
--- a/tensorflow/compiler/xla/tests/scalar_computations_test.cc
+++ b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
@@ -69,6 +69,13 @@ class ScalarComputationsTest : public ClientLibraryTestBase {
   }
 };
 
+XLA_TEST_F(ScalarComputationsTest, ReturnScalarF32) {
+  ComputationBuilder builder(client_, TestName());
+  builder.ConstantR0<float>(2.1f);
+
+  ComputeAndCompareR0<float>(&builder, 2.1f, {}, error_spec_);
+}
+
 XLA_TEST_F(ScalarComputationsTest, NegateScalarF32) {
   ComputationBuilder builder(client_, TestName());
   builder.Neg(builder.ConstantR0<float>(2.1f));
@@ -730,7 +737,61 @@ XLA_TEST_F(ScalarComputationsTest, PowScalar) {
   ComputeAndCompareR0<float>(&builder, 8.0, {}, error_spec_);
 }
 
-XLA_TEST_F(ScalarComputationsTest, ClampScalarHigh) {
+XLA_TEST_F(ScalarComputationsTest, ClampScalarHighS32) {
+  ComputationBuilder builder(client_, TestName());
+  builder.Clamp(builder.ConstantR0<int32>(-1),  // The lower bound.
+                builder.ConstantR0<int32>(5),   // The operand to be clamped.
+                builder.ConstantR0<int32>(3));  // The upper bound.
+
+  ComputeAndCompareR0<int32>(&builder, 3, {});
+}
+
+XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleS32) {
+  ComputationBuilder builder(client_, TestName());
+  builder.Clamp(builder.ConstantR0<int32>(-1),  // The lower bound.
+                builder.ConstantR0<int32>(2),   // The operand to be clamped.
+                builder.ConstantR0<int32>(3));  // The upper bound.
+
+  ComputeAndCompareR0<int32>(&builder, 2, {});
+}
+
+XLA_TEST_F(ScalarComputationsTest, ClampScalarLowS32) {
+  ComputationBuilder builder(client_, TestName());
+  builder.Clamp(builder.ConstantR0<int32>(-1),  // The lower bound.
+                builder.ConstantR0<int32>(-5),  // The operand to be clamped.
+                builder.ConstantR0<int32>(3));  // The upper bound.
+
+  ComputeAndCompareR0<int32>(&builder, -1, {});
+}
+
+XLA_TEST_F(ScalarComputationsTest, ClampScalarHighU32) {
+  ComputationBuilder builder(client_, TestName());
+  builder.Clamp(builder.ConstantR0<uint32>(1),   // The lower bound.
+                builder.ConstantR0<uint32>(5),   // The operand to be clamped.
+                builder.ConstantR0<uint32>(3));  // The upper bound.
+
+  ComputeAndCompareR0<uint32>(&builder, 3, {});
+}
+
+XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleU32) {
+  ComputationBuilder builder(client_, TestName());
+  builder.Clamp(builder.ConstantR0<uint32>(1),   // The lower bound.
+                builder.ConstantR0<uint32>(2),   // The operand to be clamped.
+                builder.ConstantR0<uint32>(3));  // The upper bound.
+
+  ComputeAndCompareR0<uint32>(&builder, 2, {});
+}
+
+XLA_TEST_F(ScalarComputationsTest, ClampScalarLowU32) {
+  ComputationBuilder builder(client_, TestName());
+  builder.Clamp(builder.ConstantR0<uint32>(1),   // The lower bound.
+                builder.ConstantR0<uint32>(0),   // The operand to be clamped.
+                builder.ConstantR0<uint32>(3));  // The upper bound.
+
+  ComputeAndCompareR0<uint32>(&builder, 1, {});
+}
+
+XLA_TEST_F(ScalarComputationsTest, ClampScalarHighF32) {
   ComputationBuilder builder(client_, TestName());
   builder.Clamp(builder.ConstantR0<float>(2.0f),   // The lower bound.
                 builder.ConstantR0<float>(5.0f),   // The operand to be clamped.
@@ -739,7 +800,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarHigh) {
   ComputeAndCompareR0<float>(&builder, 3.0, {}, error_spec_);
 }
 
-XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddle) {
+XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleF32) {
   ComputationBuilder builder(client_, TestName());
   builder.Clamp(builder.ConstantR0<float>(2.0f),   // The lower bound.
                 builder.ConstantR0<float>(2.5f),   // The operand to be clamped.
@@ -748,7 +809,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddle) {
   ComputeAndCompareR0<float>(&builder, 2.5, {}, error_spec_);
 }
 
-XLA_TEST_F(ScalarComputationsTest, ClampScalarLow) {
+XLA_TEST_F(ScalarComputationsTest, ClampScalarLowF32) {
   ComputationBuilder builder(client_, TestName());
   builder.Clamp(builder.ConstantR0<float>(2.0f),   // The lower bound.
                 builder.ConstantR0<float>(-5.0f),  // The operand to be clamped.
@@ -845,5 +906,12 @@ XLA_TEST_F(ScalarComputationsTest, SqrtF320) {
   ComputeAndCompareR0<float>(&builder, 0.0f, {zero_data.get()}, error_spec_);
 }
 
+XLA_TEST_F(ScalarComputationsTest, RoundScalar) {
+  ComputationBuilder builder(client_, TestName());
+  builder.Round(builder.ConstantR0<float>(1.4f));
+
+  ComputeAndCompareR0<float>(&builder, 1.0f, {}, error_spec_);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
index 62ff349e9c011e0eb845192013a74aeb0956b791..9ee94b8571e5fc8789b60501462986967ce909a0 100644
--- a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
+++ b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
@@ -39,8 +39,8 @@ namespace xla {
 namespace {
 
 struct SelectAndScatterTestParam {
-  Array4D<float> operand_shape;
-  Array4D<float> source_shape;
+  std::vector<int64> operand_shape;
+  std::vector<int64> source_shape;
   Padding padding_type;
   tensorflow::gtl::ArraySlice<int64> window_dimensions;
   tensorflow::gtl::ArraySlice<int64> window_strides;
@@ -69,83 +69,132 @@ class SelectAndScatterTest
   Computation min_f32_;
 };
 
-XLA_TEST_P(SelectAndScatterTest, R4Randomized) {
-  Array4D<float> o(GetParam().operand_shape);
+XLA_TEST_P(SelectAndScatterTest, ParamTest) {
+  auto operand_shape = GetParam().operand_shape;
+  Array<float> o(operand_shape);
   o.FillRandom(1.5f);
-  auto operand = builder_.ConstantR4FromArray4D(o);
+  auto operand = builder_.ConstantFromArray(o);
 
-  Array4D<float> s(GetParam().source_shape);
+  auto source_shape = GetParam().source_shape;
+  Array<float> s(source_shape);
   s.FillRandom(12.0f);
-  auto source = builder_.ConstantR4FromArray4D(s);
-
-  builder_.SelectAndScatter(operand, ge_f32_, GetParam().window_dimensions,
-                            GetParam().window_strides, GetParam().padding_type,
-                            source, builder_.ConstantR0<float>(0.0f), add_f32_);
+  auto source = builder_.ConstantFromArray(s);
 
-  auto e = ReferenceUtil::SelectAndScatter4DGePlus(
-      o, s, 0.0f, GetParam().window_dimensions, GetParam().window_strides,
-      GetParam().padding_type == Padding::kSame);
+  auto select_and_scatter = builder_.SelectAndScatter(
+      operand, ge_f32_, GetParam().window_dimensions, GetParam().window_strides,
+      GetParam().padding_type, source, builder_.ConstantR0<float>(0.0f),
+      add_f32_);
 
-  ComputeAndCompareR4<float>(&builder_, *e, {}, ErrorSpec(1e-5));
+  ComputeAndCompare(&builder_, select_and_scatter, {}, ErrorSpec(1e-5));
 }
 
 INSTANTIATE_TEST_CASE_P(
     SelectAndScatterTest_Instantiation, SelectAndScatterTest,
-    ::testing::Values(SelectAndScatterTestParam{{6, 6, 256, 128},
-                                                {3, 3, 256, 128},
-                                                Padding::kSame,
-                                                {3, 3, 1, 1},
-                                                {2, 2, 1, 1}},
-                      SelectAndScatterTestParam{{7, 7, 256, 128},
-                                                {3, 3, 256, 128},
-                                                Padding::kValid,
-                                                {3, 3, 1, 1},
-                                                {2, 2, 1, 1}},
-                      SelectAndScatterTestParam{{6, 7, 256, 128},
-                                                {3, 3, 256, 128},
-                                                Padding::kValid,
-                                                {2, 3, 1, 1},
-                                                {2, 2, 1, 1}},
-                      SelectAndScatterTestParam{{6, 7, 256, 128},
-                                                {2, 3, 256, 128},
-                                                Padding::kValid,
-                                                {2, 3, 1, 1},
-                                                {3, 2, 1, 1}},
-                      SelectAndScatterTestParam{{9, 9, 16, 128},
-                                                {3, 3, 16, 128},
-                                                Padding::kValid,
-                                                {3, 3, 1, 1},
-                                                {3, 3, 1, 1}},
-                      SelectAndScatterTestParam{{3, 3, 4, 4},
-                                                {1, 1, 4, 4},
-                                                Padding::kValid,
-                                                {3, 3, 1, 1},
-                                                {3, 3, 1, 1}},
-                      SelectAndScatterTestParam{{3, 3, 4, 4},
-                                                {1, 1, 4, 4},
-                                                Padding::kValid,
-                                                {3, 3, 1, 1},
-                                                {3, 3, 1, 1}},
-                      SelectAndScatterTestParam{{9, 3, 4, 4},
-                                                {3, 1, 4, 4},
-                                                Padding::kValid,
-                                                {3, 3, 1, 1},
-                                                {3, 3, 1, 1}},
-                      SelectAndScatterTestParam{{7, 3, 4, 4},
-                                                {3, 1, 4, 4},
-                                                Padding::kValid,
-                                                {3, 3, 1, 1},
-                                                {2, 3, 1, 1}},
-                      SelectAndScatterTestParam{{1, 1, 5, 5},
-                                                {1, 1, 5, 5},
-                                                Padding::kSame,
-                                                {3, 3, 1, 1},
-                                                {3, 3, 1, 1}},
-                      SelectAndScatterTestParam{{7, 7, 8, 256},
-                                                {4, 4, 8, 256},
-                                                Padding::kSame,
-                                                {2, 2, 1, 1},
-                                                {2, 2, 1, 1}}));
+    ::testing::Values(
+        SelectAndScatterTestParam{{6, 6, 6, 4, 4},
+                                  {3, 3, 3, 4, 4},
+                                  Padding::kSame,
+                                  {3, 3, 3, 1, 1},
+                                  {2, 2, 2, 1, 1}},
+        SelectAndScatterTestParam{{7, 7, 7, 4, 4},
+                                  {3, 3, 3, 4, 4},
+                                  Padding::kValid,
+                                  {3, 3, 3, 1, 1},
+                                  {2, 2, 2, 1, 1}},
+
+        SelectAndScatterTestParam{{8, 8, 8, 4, 4},
+                                  {1, 3, 3, 4, 4},
+                                  Padding::kValid,
+                                  {8, 4, 4, 1, 1},
+                                  {1, 2, 2, 1, 1}},
+        SelectAndScatterTestParam{{6, 6, 256, 128},
+                                  {3, 3, 256, 128},
+                                  Padding::kSame,
+                                  {3, 3, 1, 1},
+                                  {2, 2, 1, 1}},
+        SelectAndScatterTestParam{{7, 7, 256, 128},
+                                  {3, 3, 256, 128},
+                                  Padding::kValid,
+                                  {3, 3, 1, 1},
+                                  {2, 2, 1, 1}},
+        SelectAndScatterTestParam{{6, 7, 256, 128},
+                                  {3, 3, 256, 128},
+                                  Padding::kValid,
+                                  {2, 3, 1, 1},
+                                  {2, 2, 1, 1}},
+        SelectAndScatterTestParam{{6, 7, 256, 128},
+                                  {2, 3, 256, 128},
+                                  Padding::kValid,
+                                  {2, 3, 1, 1},
+                                  {3, 2, 1, 1}},
+        SelectAndScatterTestParam{{9, 9, 16, 128},
+                                  {3, 3, 16, 128},
+                                  Padding::kValid,
+                                  {3, 3, 1, 1},
+                                  {3, 3, 1, 1}},
+        SelectAndScatterTestParam{{3, 3, 4, 4},
+                                  {1, 1, 4, 4},
+                                  Padding::kValid,
+                                  {3, 3, 1, 1},
+                                  {3, 3, 1, 1}},
+        SelectAndScatterTestParam{{3, 3, 4, 4},
+                                  {1, 1, 4, 4},
+                                  Padding::kValid,
+                                  {3, 3, 1, 1},
+                                  {3, 3, 1, 1}},
+        SelectAndScatterTestParam{{9, 3, 4, 4},
+                                  {3, 1, 4, 4},
+                                  Padding::kValid,
+                                  {3, 3, 1, 1},
+                                  {3, 3, 1, 1}},
+        SelectAndScatterTestParam{{7, 3, 4, 4},
+                                  {3, 1, 4, 4},
+                                  Padding::kValid,
+                                  {3, 3, 1, 1},
+                                  {2, 3, 1, 1}},
+        SelectAndScatterTestParam{{1, 1, 5, 5},
+                                  {1, 1, 5, 5},
+                                  Padding::kSame,
+                                  {3, 3, 1, 1},
+                                  {3, 3, 1, 1}},
+        SelectAndScatterTestParam{{7, 7, 8, 256},
+                                  {4, 4, 8, 256},
+                                  Padding::kSame,
+                                  {2, 2, 1, 1},
+                                  {2, 2, 1, 1}},
+        SelectAndScatterTestParam{
+            {6, 4, 4}, {3, 4, 4}, Padding::kSame, {3, 1, 1}, {2, 1, 1}},
+        SelectAndScatterTestParam{
+            {6, 256, 128}, {3, 256, 128}, Padding::kSame, {3, 1, 1}, {2, 1, 1}},
+        SelectAndScatterTestParam{{7, 256, 128},
+                                  {3, 256, 128},
+                                  Padding::kValid,
+                                  {3, 1, 1},
+                                  {2, 1, 1}},
+        SelectAndScatterTestParam{{6, 256, 128},
+                                  {3, 256, 128},
+                                  Padding::kValid,
+                                  {2, 1, 1},
+                                  {2, 1, 1}},
+        SelectAndScatterTestParam{{6, 256, 128},
+                                  {2, 256, 128},
+                                  Padding::kValid,
+                                  {2, 1, 1},
+                                  {3, 1, 1}},
+        SelectAndScatterTestParam{
+            {9, 16, 128}, {3, 16, 128}, Padding::kValid, {3, 1, 1}, {3, 1, 1}},
+        SelectAndScatterTestParam{
+            {3, 4, 4}, {1, 4, 4}, Padding::kValid, {3, 1, 1}, {3, 1, 1}},
+        SelectAndScatterTestParam{
+            {3, 4, 4}, {1, 4, 4}, Padding::kValid, {3, 1, 1}, {3, 1, 1}},
+        SelectAndScatterTestParam{
+            {9, 4, 4}, {3, 4, 4}, Padding::kValid, {3, 1, 1}, {3, 1, 1}},
+        SelectAndScatterTestParam{
+            {7, 4, 4}, {3, 4, 4}, Padding::kValid, {3, 1, 1}, {2, 1, 1}},
+        SelectAndScatterTestParam{
+            {1, 5, 5}, {1, 5, 5}, Padding::kSame, {3, 1, 1}, {3, 1, 1}},
+        SelectAndScatterTestParam{
+            {7, 8, 256}, {4, 8, 256}, Padding::kSame, {2, 1, 1}, {2, 1, 1}}));
 
 // Test for F32 1D array, with a zero-element input.
 XLA_TEST_F(SelectAndScatterTest, R1S0F32) {
diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc
index c21124750ad512cad69b1483e708613ee2857ac0..ac163df127e0087c02777fa3d5ce7970c51b97b9 100644
--- a/tensorflow/compiler/xla/tests/slice_test.cc
+++ b/tensorflow/compiler/xla/tests/slice_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -33,7 +34,6 @@ namespace xla {
 namespace {
 
 using ::tensorflow::str_util::Join;
-using ::tensorflow::strings::StrCat;
 
 class SliceTest : public ClientLibraryTestBase {};
 
@@ -211,6 +211,13 @@ class SliceR1Test : public ClientLibraryTestBase,
   }
 };
 
+string SliceR1TestDataToString(const ::testing::TestParamInfo<R1Spec>& data) {
+  const R1Spec& spec = data.param;
+  return ::tensorflow::strings::Printf("%lld_%lld_%lld_%lld", spec.input_dim0,
+                                       spec.slice_start, spec.slice_limit,
+                                       spec.slice_stride);
+}
+
 XLA_TEST_P(SliceR1Test, DoIt_F32) { Run<float>(GetParam()); }
 
 XLA_TEST_P(SliceR1Test, DoIt_F64) { Run<double>(GetParam()); }
@@ -223,30 +230,66 @@ XLA_TEST_P(SliceR1Test, DoIt_U64) { Run<uint64>(GetParam()); }
 
 XLA_TEST_P(SliceR1Test, DoIt_S64) { Run<int64>(GetParam()); }
 
-INSTANTIATE_TEST_CASE_P(                          //
-    SliceR1TestInstantiation,                     //
-    SliceR1Test,                                  //
-    ::testing::Values(                            //
-        R1Spec{10, 0, 0, 1},                      //
-        R1Spec{10, 7, 7, 1},                      //
-        R1Spec{10, 2, 4, 1},                      //
-        R1Spec{10, 2, 4, 2},                      //
-        R1Spec{10, 0, 10, 1},                     //
-        R1Spec{1024, 1024 - 4, 1024, 1},          //
-        R1Spec{4096, 7, 7 + 1024, 1},             //
-        R1Spec{10, 0, 10, 2},                     //
-        R1Spec{10, 0, 10, 3},                     //
-        R1Spec{10, 0, 10, 4},                     //
-        R1Spec{10, 0, 10, 5},                     //
-        R1Spec{10, 0, 10, 10},                    //
-        R1Spec{500, 200, 400, 7},                 //
-        R1Spec{4096, 1, 4095, 3},                 //
-        R1Spec{2047, 1024 - 24, 1024 + 160, 31},  //
-        R1Spec{2047, 1, 2046, 3 * 128},           //
-        R1Spec{4096, 1024 + 3, 4095, 500},        //
-        R1Spec{8192, 0, 8192, 1024 * 3 + 400}     //
-        )                                         //
+// Tests for R1 slice ops.
+// The format for each testcase is {input size, start, limit, stride}.
+// clang-format off
+INSTANTIATE_TEST_CASE_P(
+    SliceR1TestInstantiation,
+    SliceR1Test,
+    ::testing::Values(
+        R1Spec{10, 0, 0, 1},
+        R1Spec{10, 7, 7, 1},
+        R1Spec{10, 0, 5, 1},
+        R1Spec{10, 3, 5, 1},
+        R1Spec{10, 0, 10, 1},
+        R1Spec{1024, 0, 5, 1},
+        R1Spec{1024, 3, 5, 1},
+        R1Spec{1024 + 17, 0, 5, 1},
+        R1Spec{1024 + 17, 3, 5, 1},
+        R1Spec{1024 + 17, 1024, 1024 + 6, 1},
+        R1Spec{1024 + 17, 1024 + 1, 1024 + 6, 1},
+        R1Spec{1024, 1024 - 4, 1024, 1},
+        R1Spec{4 * 1024, 7, 7 + 1024, 1},
+        R1Spec{4 * 1024, 0, 4 * 1024, 1},
+        R1Spec{4 * 1024, 1, 4 * 1024 - 1, 1},
+        R1Spec{4 * 1024, 1024, 3 * 1024, 1},
+        R1Spec{4 * 1024, 1024 + 1, 3 * 1024 - 1, 1},
+        R1Spec{16 * 1024, 0, 5, 1},
+        R1Spec{16 * 1024, 3, 5, 1},
+        R1Spec{16 * 1024 + 17, 0, 5, 1},
+        R1Spec{16 * 1024 + 17, 3, 5, 1},
+        R1Spec{16 * 1024 + 17, 16 * 1024, 16 * 1024 + 6, 1},
+        R1Spec{16 * 1024 + 17, 16 * 1024 + 1, 16 * 1024 + 6, 1},
+        R1Spec{16 * 1024, 4 * 1024 - 17, 8 * 1024 - 18, 1},
+        R1Spec{64 * 1024, 0, 64 * 1024, 1},
+        R1Spec{64 * 1024, 1, 64 * 1024 - 1, 1},
+        R1Spec{64 * 1024, 1024, 63 * 1024, 1},
+        R1Spec{64 * 1024, 1024 + 1, 63 * 1024 - 1, 1},
+        R1Spec{64 * 1024, 32 * 1024, 33 * 1024, 1},
+        R1Spec{64 * 1024, 32 * 1024 + 1, 33 * 1024 - 1, 1},
+        R1Spec{64 * 1024, 32 * 1024 - 17, 36 * 1024 - 18, 1},
+// TODO(b/69425338): This uses too much memory on GPU.
+#ifndef XLA_TEST_BACKEND_GPU
+        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024, 12 * 1024 * 1024, 1},
+        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024 + 1, 12 * 1024 * 1024 - 1, 1},
+        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024 - 1, 12 * 1024 * 1024 + 1, 1},
+#endif
+        R1Spec{10, 2, 4, 2},
+        R1Spec{10, 0, 10, 2},
+        R1Spec{10, 0, 10, 3},
+        R1Spec{10, 0, 10, 4},
+        R1Spec{10, 0, 10, 5},
+        R1Spec{10, 0, 10, 10},
+        R1Spec{500, 200, 400, 7},
+        R1Spec{4096, 1, 4095, 3},
+        R1Spec{2047, 1024 - 24, 1024 + 160, 31},
+        R1Spec{2047, 1, 2046, 3 * 128},
+        R1Spec{4096, 1024 + 3, 4095, 500},
+        R1Spec{8192, 0, 8192, 1024 * 3 + 400}
+        ),
+    SliceR1TestDataToString
 );
+// clang-format on
 
 struct R2Spec {
   int64 input_dim0;
@@ -339,7 +382,7 @@ struct R4Spec {
 
 string R4SpecToString(const ::testing::TestParamInfo<R4Spec>& data) {
   const R4Spec& spec = data.param;
-  return StrCat(                                   //
+  return tensorflow::strings::StrCat(              //
       "input_", Join(spec.input_dims, "x"),        //
       "__layout_", Join(spec.input_layout, ""),    //
       "__starts_", Join(spec.slice_starts, "x"),   //
diff --git a/tensorflow/compiler/xla/tests/test_macros.h b/tensorflow/compiler/xla/tests/test_macros.h
index 28a2d0198a707cec1aa5e0fbed341ee9b2a927f7..cc4eaf62f50d1fa622c705fab810fe1e1b0fbf08 100644
--- a/tensorflow/compiler/xla/tests/test_macros.h
+++ b/tensorflow/compiler/xla/tests/test_macros.h
@@ -36,6 +36,7 @@ limitations under the License.
 #define DISABLED_ON_CPU(X) X
 #define DISABLED_ON_CPU_PARALLEL(X) X
 #define DISABLED_ON_GPU(X) X
+#define DISABLED_ON_INTERPRETER(X) X
 
 // We need this macro instead of pasting directly to support nesting
 // the DISABLED_ON_FOO macros, as in the definition of DISABLED_ON_CPU.
@@ -62,6 +63,11 @@ limitations under the License.
 # define DISABLED_ON_GPU(X) XLA_TEST_PASTE(DISABLED_, X)
 #endif  // XLA_TEST_BACKEND_GPU
 
+#ifdef XLA_TEST_BACKEND_INTERPRETER
+# undef DISABLED_ON_INTERPRETER
+# define DISABLED_ON_INTERPRETER(X) XLA_TEST_PASTE(DISABLED_, X)
+#endif  // XLA_TEST_BACKEND_INTERPRETER
+
 // clang-format on
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index 0d56c9f48363d0569921d7c76050dcc66208931b..b060fb13b1451aab30cfca73bea0a4a598a9fa3a 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 
@@ -23,118 +24,117 @@ namespace xla {
 namespace {
 
 template <typename FloatT>
-void PopulateWithRandomFloatingPointData(Literal* literal) {
+void PopulateWithRandomFloatingPointData(Literal* literal,
+                                         std::minstd_rand0* engine) {
   CHECK_EQ(literal->shape().element_type(),
            primitive_util::NativeToPrimitiveType<FloatT>());
-  std::minstd_rand0 engine;
-  std::uniform_real_distribution<FloatT> generator(0.0f, 1.0f);
+  // Create uniform numbers between 1 and 1.125 to avoid creating denormal
+  // numbers.
+  std::uniform_real_distribution<FloatT> generator(1.0f, 1.125f);
+  const bool should_index_bias = ShapeUtil::ElementsIn(literal->shape()) > 1000;
   TF_CHECK_OK(literal->Populate<FloatT>(
+      [&](tensorflow::gtl::ArraySlice<int64> indices) {
+        // Generate a random uniform number from -0.0625 and 0.0625 and bias it
+        // with a position dependent number with mean 0.037109375. These number
+        // should allow for long chains of accumulation without being too close
+        // to zero or too large to accumulate all numbers accurately. Only do
+        // this for large literals where the number of elements is much greater
+        // than 47 otherwise only negative values are produced.
+        //
+        // The value is positionally biased using a product of the indices. Add
+        // one to each index value to avoid collapsing to zero if any of the
+        // indices are zero.
+        int64 index_product = 1;
+        for (int64 i : indices) {
+          index_product *= (1 + i);
+        }
+        const int64 negative_bias = should_index_bias ? 47 : 0;
+        FloatT index_bias =
+            static_cast<FloatT>(index_product % 113 - negative_bias) /
+            static_cast<FloatT>(256.0f);
+        return (generator(*engine) - 1.0625) + index_bias;
+      }));
+}
+
+// The standard library does not have a case for bfloat16, unsurprisingly, so we
+// handle that one specially.
+template <>
+void PopulateWithRandomFloatingPointData<bfloat16>(Literal* literal,
+                                                   std::minstd_rand0* engine) {
+  CHECK_EQ(literal->shape().element_type(), BF16);
+  std::uniform_real_distribution<float> generator(-0.9f, 1.0f);
+  TF_CHECK_OK(literal->Populate<bfloat16>(
       [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
-        return generator(engine);
+        return static_cast<bfloat16>(generator(*engine));
       }));
 }
 
 template <typename IntT>
-void PopulateWithRandomIntegralData(Literal* literal) {
+void PopulateWithRandomIntegralData(Literal* literal,
+                                    std::minstd_rand0* engine) {
   CHECK_EQ(literal->shape().element_type(),
            primitive_util::NativeToPrimitiveType<IntT>());
-  std::minstd_rand0 engine;
   std::uniform_int_distribution<IntT> generator(
       std::numeric_limits<IntT>::lowest(), std::numeric_limits<IntT>::max());
   TF_CHECK_OK(literal->Populate<IntT>(
       [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
-        return generator(engine);
+        return generator(*engine);
       }));
 }
 
-bool LooksLikeSum(const HloInstruction& instruction) {
-  return instruction.opcode() == HloOpcode::kAdd &&
-         instruction.operand(0)->opcode() == HloOpcode::kParameter &&
-         instruction.operand(1)->opcode() == HloOpcode::kParameter &&
-         instruction.operand(0) != instruction.operand(1);
-}
-
-// Given an instruction and operand number, replace the given operand with
-// a Literal Constant Zero. Handle the case of a fusion instruction by
-// replacing the fusion's parent's parameter with a Literal Constant Zero,
-// unless the fusion's parent is itself a fusion.
-Status MaybeReplaceParameterInputWithZero(HloInstruction* const instruction,
-                                          const int64 operand_number) {
-  CHECK_LT(operand_number, instruction->operand_count());
-  if (instruction->operand(operand_number)->opcode() != HloOpcode::kParameter) {
-    return Status::OK();
-  }
-
-  HloComputation* const computation = instruction->parent();
-  std::unique_ptr<HloInstruction> zero = HloInstruction::CreateConstant(
-      MakeUnique<Literal>(Literal::Zero(instruction->shape().element_type())));
-
-  if (computation->IsFusionComputation()) {
-    HloInstruction* const fusion_instruction = computation->FusionInstruction();
-    if (fusion_instruction->IsFused()) {
-      return Unimplemented(
-          "Unable to replace fused parameter of fusion instruction");
-    }
-    TF_RETURN_IF_ERROR(fusion_instruction->ReplaceOperandWith(
-        instruction->operand(operand_number)->parameter_number(),
-        fusion_instruction->parent()->AddInstruction(std::move(zero))));
-  } else {
-    TF_RETURN_IF_ERROR(instruction->ReplaceOperandWith(
-        operand_number, computation->AddInstruction(std::move(zero))));
-  }
-  return Status::OK();
-}
-
-}  // namespace
-
-StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape) {
+// Similar to MakeFakeLiteral but takes a random number generator engine to
+// enable reusing the engine across randomly generated literals.
+StatusOr<std::unique_ptr<Literal>> MakeFakeLiteralInternal(
+    const Shape& shape, std::minstd_rand0* engine) {
   if (ShapeUtil::IsTuple(shape)) {
     std::vector<std::unique_ptr<Literal>> elements;
     for (const Shape& element_shape : shape.tuple_shapes()) {
       TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> element,
-                          MakeFakeLiteral(element_shape));
+                          MakeFakeLiteralInternal(element_shape, engine));
       elements.push_back(std::move(element));
     }
     return Literal::MakeTupleOwned(std::move(elements));
   }
   std::unique_ptr<Literal> literal = Literal::CreateFromShape(shape);
   switch (shape.element_type()) {
+    case BF16:
+      PopulateWithRandomFloatingPointData<bfloat16>(literal.get(), engine);
+      break;
     case F32:
-      PopulateWithRandomFloatingPointData<float>(literal.get());
+      PopulateWithRandomFloatingPointData<float>(literal.get(), engine);
       break;
     case F64:
-      PopulateWithRandomFloatingPointData<double>(literal.get());
+      PopulateWithRandomFloatingPointData<double>(literal.get(), engine);
       break;
     case S8:
-      PopulateWithRandomIntegralData<int8>(literal.get());
+      PopulateWithRandomIntegralData<int8>(literal.get(), engine);
       break;
     case U8:
-      PopulateWithRandomIntegralData<uint8>(literal.get());
+      PopulateWithRandomIntegralData<uint8>(literal.get(), engine);
       break;
     case S16:
-      PopulateWithRandomIntegralData<int16>(literal.get());
+      PopulateWithRandomIntegralData<int16>(literal.get(), engine);
       break;
     case U16:
-      PopulateWithRandomIntegralData<uint16>(literal.get());
+      PopulateWithRandomIntegralData<uint16>(literal.get(), engine);
       break;
     case S32:
-      PopulateWithRandomIntegralData<int32>(literal.get());
+      PopulateWithRandomIntegralData<int32>(literal.get(), engine);
       break;
     case U32:
-      PopulateWithRandomIntegralData<uint32>(literal.get());
+      PopulateWithRandomIntegralData<uint32>(literal.get(), engine);
       break;
     case S64:
-      PopulateWithRandomIntegralData<int64>(literal.get());
+      PopulateWithRandomIntegralData<int64>(literal.get(), engine);
       break;
     case U64:
-      PopulateWithRandomIntegralData<uint64>(literal.get());
+      PopulateWithRandomIntegralData<uint64>(literal.get(), engine);
       break;
     case PRED: {
       std::uniform_int_distribution<int> generator(0, 1);
-      std::minstd_rand0 engine;
       TF_CHECK_OK(literal->Populate<bool>(
           [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
-            return generator(engine);
+            return generator(*engine);
           }));
       break;
     }
@@ -145,43 +145,162 @@ StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape) {
   return std::move(literal);
 }
 
-StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
-    const HloModule& module) {
-  std::vector<std::unique_ptr<Literal>> arguments;
-  for (const ShapeLayout& shape_layout :
-       module.config().entry_computation_layout().parameter_layouts()) {
-    TF_ASSIGN_OR_RETURN(auto literal, MakeFakeLiteral(shape_layout.shape()));
-    arguments.push_back(std::move(literal));
+// Matches binary addition computations.
+bool LooksLikeSum(const HloComputation& computation) {
+  const HloInstruction* const root = computation.root_instruction();
+  return root->opcode() == HloOpcode::kAdd &&
+         computation.num_parameters() == 2 &&
+         root->operand(0)->opcode() == HloOpcode::kParameter &&
+         root->operand(1)->opcode() == HloOpcode::kParameter &&
+         root->operand(0) != root->operand(1);
+}
+
+// Reduce, ReduceWindow, and SelectAndScatter ops may use binary addition,
+// which requires an init_value of 0 rather than a random value.
+bool NeedsZeroInitValue(const HloUse& use) {
+  const HloInstruction* const instruction = use.instruction;
+  const HloOpcode opcode = instruction->opcode();
+  const int64 op_num = use.operand_number;
+  return (
+      ((opcode == HloOpcode::kReduce || opcode == HloOpcode::kReduceWindow) &&
+       op_num == 1 && LooksLikeSum(*instruction->to_apply())) ||
+      (opcode == HloOpcode::kSelectAndScatter && op_num == 2 &&
+       LooksLikeSum(*instruction->scatter())));
+}
+
+// Generate random values that are constrained to the input_shape minus the
+// output_shape so as not to produce wrapping slices, for instance.
+std::unique_ptr<Literal> MakeRandomNonwrappingSliceIndex(
+    const Shape& input_shape, const Shape& slice_shape,
+    std::minstd_rand0* engine) {
+  const int64 rank = ShapeUtil::Rank(input_shape);
+  std::vector<int32> start_indices(rank);
+  for (int i = 0; i < rank; ++i) {
+    const int32 upper_bound = ShapeUtil::GetDimension(input_shape, i) -
+                              ShapeUtil::GetDimension(slice_shape, i);
+    std::uniform_int_distribution<int32> generator(0, upper_bound);
+    start_indices[i] = generator(*engine);
   }
-  return std::move(arguments);
+  return Literal::CreateR1<int32>(start_indices);
 }
 
-Status ReplaceInitsWithConstants(HloModule* const module) {
-  for (HloComputation* const computation : module->computations()) {
-    for (HloInstruction* const instruction : computation->instructions()) {
+// Use dataflow analysis on each parameter to see if there are uses that would
+// be problematic when generating input data.  Returns the list of instructions
+// that correspond to their uses.
+//
+// Should be paired with the CreateLiteralForConstrainedUses() function below.
+std::vector<HloInstruction*> FindConstrainedUses(
+    const HloDataflowAnalysis& dataflow, const HloInstruction& param) {
+  std::vector<HloInstruction*> constrained_uses;
+  for (const auto& pair : dataflow.GetInstructionValueSet(&param)) {
+    const HloValue& value = dataflow.GetUniqueValueAt(&param, pair.first);
+    for (const HloUse& use : value.uses()) {
+      HloInstruction* instruction = use.instruction;
       const HloOpcode opcode = instruction->opcode();
-      if ((opcode == HloOpcode::kReduce ||
-           opcode == HloOpcode::kReduceWindow) &&
-          LooksLikeSum(*instruction->to_apply()->root_instruction())) {
-        TF_RETURN_IF_ERROR(MaybeReplaceParameterInputWithZero(instruction, 1));
-      } else if (opcode == HloOpcode::kSelectAndScatter &&
-                 LooksLikeSum(*instruction->scatter()->root_instruction())) {
-        TF_RETURN_IF_ERROR(MaybeReplaceParameterInputWithZero(instruction, 2));
+      const int64 op_num = use.operand_number;
+      if ((opcode == HloOpcode::kDynamicSlice && op_num == 1) ||
+          (opcode == HloOpcode::kDynamicUpdateSlice && op_num == 2)) {
+        constrained_uses.push_back(instruction);
+      } else if (opcode == HloOpcode::kFusion) {
+        const HloInstruction* const to_analyze =
+            instruction->fused_parameter(op_num);
+        auto fused_uses = FindConstrainedUses(dataflow, *to_analyze);
+        constrained_uses.insert(constrained_uses.end(), fused_uses.begin(),
+                                fused_uses.end());
+      } else if (NeedsZeroInitValue(use)) {
+        constrained_uses.push_back(instruction);
+      } else if (opcode == HloOpcode::kConvert ||
+                 opcode == HloOpcode::kReducePrecision) {
+        auto converted_uses = FindConstrainedUses(dataflow, *instruction);
+        constrained_uses.insert(constrained_uses.end(), converted_uses.begin(),
+                                converted_uses.end());
       }
     }
   }
-  return Status::OK();
+  return constrained_uses;
+}
+
+// Given a parameter, generate a random Literal to use as input if there exist
+// no constrained uses in the dataflow graph.  If such constraints exist,
+// generate a constrained literal (either bounded in the case of indices, or
+// zero in the case of init_values for reductions).
+StatusOr<std::unique_ptr<Literal>> CreateLiteralForConstrainedUses(
+    const tensorflow::gtl::ArraySlice<HloInstruction*> constrained_uses,
+    const HloInstruction& param, std::minstd_rand0* engine) {
+  HloInstruction* needs_index = nullptr;
+  HloInstruction* needs_zero = nullptr;
+  for (HloInstruction* use : constrained_uses) {
+    switch (use->opcode()) {
+      case HloOpcode::kDynamicSlice:
+      case HloOpcode::kDynamicUpdateSlice:
+        if (needs_index != nullptr &&
+            !ShapeUtil::Equal(needs_index->shape(), use->shape())) {
+          return Unimplemented(
+              "Conflicting operand generation slice index constraints\n");
+        }
+        needs_index = use;
+        break;
+
+      case HloOpcode::kReduce:
+      case HloOpcode::kReduceWindow:
+      case HloOpcode::kSelectAndScatter:
+        needs_zero = use;
+        break;
+
+      default:
+        return Unimplemented(
+            "Constrained operand generation not implemented for %s.",
+            use->ToString().c_str());
+    }
+  }
+  if (needs_index != nullptr && needs_zero != nullptr) {
+    return Unimplemented(
+        "Conflicting operand generation constraints.\nNeeds index: %s\nNeeds "
+        "zero: %s\n",
+        needs_index->ToString().c_str(), needs_zero->ToString().c_str());
+  }
+  if (needs_index != nullptr) {
+    return MakeRandomNonwrappingSliceIndex(needs_index->operand(0)->shape(),
+                                           needs_index->shape(), engine);
+  } else if (needs_zero != nullptr) {
+    return Literal::CreateFromShape(param.shape());
+  } else {
+    return MakeFakeLiteralInternal(param.shape(), engine);
+  }
+}
+
+// Given a module entry parameter, use the dataflow analysis to see if a
+// special case literal must be created, or if we can generate fake data.
+StatusOr<std::unique_ptr<Literal>> MakeConstrainedArgument(
+    const HloDataflowAnalysis& dataflow, const HloInstruction& param,
+    std::minstd_rand0* engine) {
+  const auto constrained_uses = FindConstrainedUses(dataflow, param);
+  return CreateLiteralForConstrainedUses(constrained_uses, param, engine);
+}
+
+}  // namespace
+
+StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape) {
+  std::minstd_rand0 engine;
+  return MakeFakeLiteralInternal(shape, &engine);
+}
+
+StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
+    HloModule* const module) {
+  TF_ASSIGN_OR_RETURN(auto dataflow, HloDataflowAnalysis::Run(module));
+  const auto params = module->entry_computation()->parameter_instructions();
+  std::minstd_rand0 engine;
+  std::vector<std::unique_ptr<Literal>> arguments(params.size());
+  for (int i = 0; i < params.size(); ++i) {
+    TF_ASSIGN_OR_RETURN(
+        arguments[i], MakeConstrainedArgument(*dataflow, *params[i], &engine));
+  }
+  return std::move(arguments);
 }
 
 Status VerifyHloModule(const perftools::gputools::Platform& platform,
                        HloModule* const module) {
-  return HloVerifier(
-             std::bind(
-                 &TransferManager::GetByteSizeRequirement,
-                 TransferManager::GetForPlatform(&platform).ConsumeValueOrDie(),
-                 std::placeholders::_1))
-      .Run(module)
-      .status();
+  return HloVerifier().Run(module).status();
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/test_utils.h b/tensorflow/compiler/xla/tests/test_utils.h
index 9aca162a185e5b22888229555b7bce88769c79a6..0fb024ffb074f1c90b75022bc7f5a8b58b03c0c2 100644
--- a/tensorflow/compiler/xla/tests/test_utils.h
+++ b/tensorflow/compiler/xla/tests/test_utils.h
@@ -60,13 +60,11 @@ StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape);
 
 // Generates a vector of arguments containing fake data. The number, shape and
 // layout of the arguments is appropriate for given HLO module.
+//
+// Will handle special cases such as making sure that indices used for dynamic
+// slices are bounded, reduces that call adds use 0 as an init value, etc.
 StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
-    const HloModule& module);
-
-// Reductions using Adds, ReduceWindow, and SelectAndScatter, require their
-// init_value to be replaced with the constant 0.0f when testing, otherwise we
-// may generate a bad init_value when looking at the op in isolation.
-Status ReplaceInitsWithConstants(HloModule* const module);
+    HloModule* const module);
 
 // Check that a given module satisfies various constraints before trying to
 // execute it.
diff --git a/tensorflow/compiler/xla/tests/transfer_manager_test.cc b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
index f2a64749482e5f5a8c5d72034fb7a4eee07baf48..268ba338f2e6740a1d1a046d5a85494f3cf2e9f8 100644
--- a/tensorflow/compiler/xla/tests/transfer_manager_test.cc
+++ b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
@@ -46,9 +46,10 @@ class TransferManagerTest : public LocalClientTestBase {
   ~TransferManagerTest() override = default;
 
   std::unique_ptr<ScopedShapedBuffer> AllocateDeviceBuffer(const Shape& shape) {
-    return ScopedShapedBuffer::Allocate(
-               shape, GetOrCreateAllocator(local_client_->platform()),
-               /*device_ordinal=*/0, shape_size_fn_)
+    return transfer_manager_
+        ->AllocateScopedShapedBuffer(
+            shape, GetOrCreateAllocator(local_client_->platform()),
+            /*device_ordinal=*/0)
         .ValueOrDie();
   }
 
@@ -118,7 +119,7 @@ XLA_TEST_F(TransferManagerTest, TransferR1U8) {
                           transfer_manager_->TransferLiteralFromDevice(
                               stream_executor_, *device_buffer));
 
-  EXPECT_EQ(result->u8s_string(), test_string);
+  EXPECT_EQ(result->GetR1U8AsString(), test_string);
 }
 
 XLA_TEST_F(TransferManagerTest, TransferR2F32) {
@@ -211,5 +212,39 @@ XLA_TEST_F(TransferManagerTest, TransferNestedTuple) {
   LiteralTestUtil::ExpectEqual(*literal, *result);
 }
 
+XLA_TEST_F(TransferManagerTest, TransferComplexValue) {
+  std::unique_ptr<Literal> literal = Literal::CreateR1<complex64>(
+      {complex64(1.0f, 2.0f), complex64(42.0f, -123.4f)});
+  auto device_buffer = AllocateDeviceBuffer(literal->shape());
+
+  // Round trip literal through device.
+  ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
+      stream_executor_, *literal, *device_buffer));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
+                          transfer_manager_->TransferLiteralFromDevice(
+                              stream_executor_, *device_buffer));
+
+  LiteralTestUtil::ExpectEqual(*literal, *result);
+}
+
+XLA_TEST_F(TransferManagerTest, TransferComplexValueInTuple) {
+  std::unique_ptr<Literal> literal = Literal::MakeTuple(
+      {Literal::CreateR1<complex64>(
+           {complex64(1.0f, 2.0f), complex64(42.0f, -123.4f)})
+           .get(),
+       Literal::CreateR1<int32>({1, 2, 3, 4, 5, 6}).get(),
+       Literal::CreateR0<complex64>(complex64(0.3f, -0.4f)).get()});
+  auto device_buffer = AllocateDeviceBuffer(literal->shape());
+
+  // Round trip literal through device.
+  ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
+      stream_executor_, *literal, *device_buffer));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
+                          transfer_manager_->TransferLiteralFromDevice(
+                              stream_executor_, *device_buffer));
+
+  LiteralTestUtil::ExpectEqual(*literal, *result);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index 5a012c93d64f6a6fca73aa422e20cf238c945ce9..2029312f94a14bc81706368b9ecfc2727fd9fe4c 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -57,6 +57,20 @@ XLA_TEST_F(TupleTest, TupleConstant) {
   ComputeAndCompareTuple(&builder, *value, {}, error_spec_);
 }
 
+// Tests a tuple made of scalar constants.
+XLA_TEST_F(TupleTest, TupleScalarConstant) {
+  ComputationBuilder builder(client_, TestName());
+
+  const float constant_scalar1 = 7.3f;
+  const float constant_scalar2 = 1.2f;
+  auto value =
+      Literal::MakeTuple({Literal::CreateR0<float>(constant_scalar1).get(),
+                          Literal::CreateR0<float>(constant_scalar2).get()});
+
+  auto result = builder.ConstantLiteral(*value);
+  ComputeAndCompareTuple(&builder, *value, {}, error_spec_);
+}
+
 // Tests the creation of tuple data.
 XLA_TEST_F(TupleTest, TupleCreate) {
   ComputationBuilder builder(client_, TestName());
@@ -180,8 +194,7 @@ XLA_TEST_F(TupleTest, TupleGTEToTuple) {
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
 }
 
-// TODO(b/68395210): GPU does not tolerate ambiguous top-level buffers.
-XLA_TEST_F(TupleTest, DISABLED_ON_GPU(SelectBetweenPredTuples)) {
+XLA_TEST_F(TupleTest, SelectBetweenPredTuples) {
   ComputationBuilder b(client_, TestName());
   ComputationDataHandle v1, v2;
 
@@ -445,5 +458,61 @@ XLA_TEST_F(TupleTest, GetTupleElementOfNestedTuple) {
   ComputeAndCompareR1<float>(&builder, expected, arguments, ErrorSpec(1e-5));
 }
 
+XLA_TEST_F(TupleTest, ComplexTuples) {
+  ComputationBuilder builder(client_, TestName());
+  {
+    Shape c64r0 = ShapeUtil::MakeShape(C64, {});
+    Shape c64r1 = ShapeUtil::MakeShape(C64, {2});
+    Shape c64r2 = ShapeUtil::MakeShape(C64, {3, 2});
+    Shape arg0_shape = ShapeUtil::MakeTupleShape(
+        {c64r0, ShapeUtil::MakeTupleShape({c64r1, c64r2})});
+    auto input0 = builder.Parameter(0, arg0_shape, "input0");
+    auto t0 = builder.GetTupleElement(input0, 0);
+    auto t1 = builder.GetTupleElement(input0, 1);
+    auto t10 = builder.GetTupleElement(t1, 0);
+    auto t11 = builder.GetTupleElement(t1, 1);
+    auto sum = builder.Add(builder.Add(t10, t11, {1}), t0);
+    auto input1 = builder.Parameter(1, c64r1, "input1");
+    auto prod = builder.Mul(input1, sum, {1});
+    builder.Tuple({builder.Tuple({prod, sum}),
+                   builder.ConstantR0<complex64>({123, 456})});
+  }
+
+  std::unique_ptr<GlobalData> arg0 =
+      client_
+          ->TransferToServer(*Literal::MakeTuple(
+              {Literal::CreateR0<complex64>({1, 2}).get(),
+               Literal::MakeTuple(
+                   {Literal::CreateR1<complex64>({{10, 20}, {30, 40}}).get(),
+                    Literal::CreateR2<complex64>(
+                        {{{100, 200}, {300, 400}},
+                         {{1000, 2000}, {3000, 4000}},
+                         {{10000, 20000}, {30000, 40000}}})
+                        .get()})
+                   .get()}))
+          .ConsumeValueOrDie();
+  std::unique_ptr<GlobalData> arg1 =
+      client_
+          ->TransferToServer(*Literal::CreateR1<complex64>({{1, 2}, {1, -2}}))
+          .ConsumeValueOrDie();
+  auto sum = Literal::CreateR2<complex64>({{{111, 222}, {331, 442}},
+                                           {{1011, 2022}, {3031, 4042}},
+                                           {{10011, 20022}, {30031, 40042}}});
+  auto prod = Literal::CreateFromShape(sum->shape());
+  ASSERT_TRUE(prod->Populate<complex64>(
+                      [&sum](tensorflow::gtl::ArraySlice<int64> indexes) {
+                        return sum->Get<complex64>(indexes) *
+                               (indexes[indexes.size() - 1] == 0
+                                    ? complex64(1, 2)
+                                    : complex64(1, -2));
+                      })
+                  .ok());
+  auto expected =
+      Literal::MakeTuple({Literal::MakeTuple({prod.get(), sum.get()}).get(),
+                          Literal::CreateR0<complex64>({123, 456}).get()});
+  ComputeAndCompareTuple(&builder, *expected, {arg0.get(), arg1.get()},
+                         error_spec_);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/unary_op_test.cc b/tensorflow/compiler/xla/tests/unary_op_test.cc
index fa4192e9281784a4a3063601afe89fba6a9dac18..835e2d7e5594d7c8c6e523f9806e32dce23a87e9 100644
--- a/tensorflow/compiler/xla/tests/unary_op_test.cc
+++ b/tensorflow/compiler/xla/tests/unary_op_test.cc
@@ -215,5 +215,23 @@ XLA_TEST_F(UnaryOpTest, SignAbsTestR2) {
   ComputeAndCompareR2<float>(&builder, {{0, 0}, {0, 0}}, {});
 }
 
+XLA_TEST_F(UnaryOpTest, ConvertElementTypePredToS32) {
+  ComputationBuilder builder(client_, TestName());
+  auto lhs = builder.ConstantR1<int32>({0, 1});
+  auto rhs = builder.ConstantR1<int32>({1, 1});
+  builder.ConvertElementType(builder.Eq(lhs, rhs), S32);
+
+  ComputeAndCompareR1<int32>(&builder, {0, 1}, {});
+}
+
+XLA_TEST_F(UnaryOpTest, ConvertElementTypePredToF32) {
+  ComputationBuilder builder(client_, TestName());
+  auto lhs = builder.ConstantR1<int32>({0, 1});
+  auto rhs = builder.ConstantR1<int32>({1, 1});
+  builder.ConvertElementType(builder.Eq(lhs, rhs), F32);
+
+  ComputeAndCompareR1<float>(&builder, {0.0, 1.0}, {});
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index 49f673f5f0bf9b844ab4030383784208b4e2c58a..52157b837c383205f77a030ef98b2fd03a41aff5 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -357,8 +357,7 @@ TEST_F(WhileTest, WhileWithVectorResultIntoTuple) {
   ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.0001));
 }
 
-// TODO(b/63003356): 11-06-2017: fails on all back-ends with incorrect result.
-TEST_F(WhileTest, DISABLED_WhileWithPermutationAndTupleResult) {
+TEST_F(WhileTest, WhileWithPermutationAndTupleResult) {
   std::vector<Shape> shape_elements = {
       ShapeUtil::MakeShape(S32, {}), ShapeUtil::MakeShape(F32, {3}),
       ShapeUtil::MakeShape(F32, {3}), ShapeUtil::MakeShape(F32, {3})};
@@ -411,8 +410,7 @@ TEST_F(WhileTest, DISABLED_WhileWithPermutationAndTupleResult) {
   ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.0001));
 }
 
-// TODO(b/63003356): 11-06-2017: fails on all back-ends with incorrect result.
-TEST_F(WhileTest, DISABLED_WhileWithPermutationAndVectorResult) {
+TEST_F(WhileTest, WhileWithPermutationAndVectorResult) {
   std::vector<Shape> shape_elements = {
       ShapeUtil::MakeShape(S32, {}), ShapeUtil::MakeShape(F32, {3}),
       ShapeUtil::MakeShape(F32, {3}), ShapeUtil::MakeShape(F32, {3})};
@@ -565,6 +563,53 @@ TEST_F(WhileTest, WhileWithPredicateTupleResult) {
   ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0));
 }
 
+TEST_F(WhileTest, WhileWithTupleConstantScalarResult) {
+  std::vector<Shape> shape_elements = {ShapeUtil::MakeShape(S32, {}),
+                                       ShapeUtil::MakeShape(S32, {})};
+  Shape result_shape = ShapeUtil::MakeTupleShape(shape_elements);
+
+  // Create a computation for the condition.
+  // Repeat for 5 iterations.
+  Computation condition;
+  {
+    ComputationBuilder builder(client_, "condition");
+    auto prev = builder.Parameter(0, result_shape, "prev");
+    auto iteration = builder.GetTupleElement(prev, 0);
+    builder.Gt(builder.ConstantR0<int32>(5), iteration);
+    condition = builder.Build().ConsumeValueOrDie();
+  }
+
+  // Create a computation for the body.
+  // Add 1 to the iteration variable and set the other tuple element to a
+  // constant.
+  Computation body;
+  {
+    ComputationBuilder builder(client_, "body");
+    auto prev = builder.Parameter(0, result_shape, "prev");
+    auto iteration = builder.GetTupleElement(prev, 0);
+    auto result =
+        builder.Tuple({builder.Add(iteration, builder.ConstantR0<int32>(1)),
+                       builder.ConstantR0<int32>(7)});
+    body = builder.Build().ConsumeValueOrDie();
+  }
+
+  // Create a While node with computations for the condition and the body.
+  ComputationBuilder builder(client_, "while");
+  auto init = builder.Tuple(
+      {builder.ConstantR0<int32>(0), builder.ConstantR0<int32>(7)});
+  auto result = builder.While(condition, body, init);
+  VLOG(2) << "while = "
+          << ShapeUtil::HumanString(
+                 *builder.GetShape(result).ConsumeValueOrDie());
+
+  auto expected_counter = Literal::CreateR0<int32>(5);
+  auto expected_data = Literal::CreateR0<int32>(7);
+  auto expected =
+      Literal::MakeTuple({expected_counter.get(), expected_data.get()});
+  VLOG(2) << "expected = " << ShapeUtil::HumanString(expected->shape());
+  ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.0001));
+}
+
 // Tests two while nodes when the result type T is a Tuple and the second
 // while node uses the result of the first while node which is used in two
 // nodes.
@@ -913,8 +958,7 @@ TEST_F(WhileTest, WhileWithPrngScalarResult) {
   }
 }
 
-// TODO(b/34969189) Fails with bad AtomicCmpSwap on GPU on 2017-09-11.
-TEST_F(WhileTest, DISABLED_ON_GPU(WhileThatSwapsParameterWithTupleElement)) {
+TEST_F(WhileTest, WhileThatSwapsParameterWithTupleElement) {
   auto element_shape = ShapeUtil::MakeShape(F32, {2});
 
   ComputationBuilder outer(client_, "outer");
@@ -950,8 +994,7 @@ TEST_F(WhileTest, DISABLED_ON_GPU(WhileThatSwapsParameterWithTupleElement)) {
                          ErrorSpec(1e-6));
 }
 
-// TODO(b/34969189) Fails with bad AtomicCmpSwap on GPU on 2017-09-11.
-TEST_F(WhileTest, DISABLED_ON_GPU(WhileThatSwapsParameterWithBroadcast)) {
+TEST_F(WhileTest, WhileThatSwapsParameterWithBroadcast) {
   auto element_shape = ShapeUtil::MakeShape(F32, {2});
 
   ComputationBuilder outer(client_, "outer");
@@ -1164,6 +1207,50 @@ TEST_F(WhileTest, WhileWithCallInsideCondition) {
   ComputeAndCompareR0<int32>(&builder, 5, {});
 }
 
+TEST_F(WhileTest, WhileWithLoopInvariantOperation) {
+  auto matrix_shape = ShapeUtil::MakeShape(F32, {2, 2});
+  auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
+  auto while_shape = ShapeUtil::MakeTupleShape(
+      {scalar_s32, matrix_shape, matrix_shape, matrix_shape});
+
+  // Create a computation for the condition: repeat for 5 iterations.
+  Computation condition;
+  {
+    ComputationBuilder builder(client_, "condition");
+    auto state = builder.Parameter(0, while_shape, "state");
+    builder.Gt(builder.ConstantR0<int32>(5), builder.GetTupleElement(state, 0));
+    TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
+  }
+
+  Computation body;
+  {
+    ComputationBuilder builder(client_, "body");
+    auto state = builder.Parameter(0, while_shape, "state");
+    auto indvar = builder.GetTupleElement(state, 0);
+    auto input_0 = builder.GetTupleElement(state, 1);
+    auto input_1 = builder.GetTupleElement(state, 2);
+    auto output = builder.Tanh(builder.Dot(input_0, input_1));
+    auto indvar_next = builder.Add(indvar, builder.ConstantR0<int32>(1));
+    auto tuple_result = builder.Tuple({indvar_next, input_0, input_1, output});
+    TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
+  }
+
+  ComputationBuilder builder(client_, TestName());
+  auto matrix_input = builder.Parameter(0, matrix_shape, "matrix");
+  auto init = builder.Tuple(
+      {builder.ConstantR0<int32>(0), matrix_input, matrix_input, matrix_input});
+  auto while_instruction = builder.While(condition, body, init);
+  builder.GetTupleElement(while_instruction, 3);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto param_value,
+                          client_->TransferToServer(*Literal::CreateR2<float>(
+                              {{1.0, 2.0}, {-1.0, -2.0}})));
+
+  ComputeAndCompareR2<float>(
+      &builder, {{-0.76159416, -0.96402758}, {0.76159416, 0.96402758}},
+      {param_value.get()}, ErrorSpec(4e-5));
+}
+
 void BM_WhileLoop(int num_iters) {
   // Benchmark a simple kernel to measure while loop overheads.
   tensorflow::testing::StopTiming();
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9ad2a1985331b80625dd0687ea052300bc99e440
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -0,0 +1,362 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/platform/regexp.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+namespace se = ::perftools::gputools;
+namespace gtl = ::tensorflow::gtl;
+
+class HloProfileTest : public ClientLibraryTestBase {};
+
+struct ParsedProfileOutputLine {
+  int64 cycles;
+  string cycles_percentage;
+  double usec;
+  string flops;
+  string trops;
+  string bytes_per_sec;
+  string bytes_per_cycle;
+  string opcode;
+};
+
+::testing::AssertionResult HasFlops(
+    const ParsedProfileOutputLine& parsed_line) {
+  if (RE2::FullMatch(parsed_line.flops, "[0-9.TGMk]+FLOP/s")) {
+    return ::testing::AssertionSuccess()
+           << "'flops' field present in  " << parsed_line.opcode << ": '"
+           << parsed_line.flops << "'";
+  }
+
+  return ::testing::AssertionFailure()
+         << "'flops' field absent in  " << parsed_line.opcode << ": '"
+         << parsed_line.flops << "'";
+}
+
+::testing::AssertionResult HasTrops(
+    const ParsedProfileOutputLine& parsed_line) {
+  if (RE2::FullMatch(parsed_line.trops, "[0-9.TGMk]+TROP/s")) {
+    return ::testing::AssertionSuccess()
+           << "'trops' field present in  " << parsed_line.opcode << ": '"
+           << parsed_line.trops << "'";
+  }
+
+  return ::testing::AssertionFailure()
+         << "'trops' field absent in  " << parsed_line.opcode << ": '"
+         << parsed_line.trops << "'";
+}
+
+Status ParseOneProfileOutputLine(
+    const string& line, bool expect_hlo,
+    gtl::FlatMap<string, ParsedProfileOutputLine>* parsed_results) {
+  string separator = "[^:]*:: +";
+  string match_percentage = "\\d+\\.\\d\\d%";
+  string match_cycles = "(\\d+) cycles +\\( *(" + match_percentage + ")\\)";
+  string match_usecs = "([0-9.]+) usec";
+  string match_flops = "([^ ]+)";
+  string match_trops = "([^ ]+)";
+  string match_bytes_per_sec = "([0-9.TGMKi]+)B/s";
+  string match_bytes_per_cycle = "([0-9.TGMKi]+)B/cycle";
+
+  // The underlined part is what we're trying to match with match_opcode:
+  //
+  //   %dot33 = f32[256,256]{1,0} dot(...)
+  //                              ^^^
+
+  string match_opcode =
+      expect_hlo ? "%[^=]+= [^ ]+ ([^(]+)\\(.*" : "(\\[total\\])";
+  string regexp_pattern = tensorflow::strings::StrCat(
+      " +", match_cycles, separator, match_usecs, separator, match_flops,
+      separator, match_trops, separator, match_bytes_per_sec, separator,
+      match_bytes_per_cycle, separator, match_opcode);
+
+  ParsedProfileOutputLine parsed_line;
+  bool matched = RE2::FullMatch(
+      line, regexp_pattern, &parsed_line.cycles, &parsed_line.cycles_percentage,
+      &parsed_line.usec, &parsed_line.flops, &parsed_line.trops,
+      &parsed_line.bytes_per_sec, &parsed_line.bytes_per_cycle,
+      &parsed_line.opcode);
+  if (!matched) {
+    return tensorflow::errors::InvalidArgument(
+        "Input did not match regexp.  Input: ", line,
+        ", Regexp: ", regexp_pattern);
+  }
+
+  InsertOrDie(parsed_results, parsed_line.opcode, parsed_line);
+
+  return Status::OK();
+}
+
+// Returns void so that we can ASSERT.
+void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
+                            const Computation& computation,
+                            const Shape& lhs_arg_shape,
+                            const Shape& rhs_arg_shape) {
+  LocalService* service = ClientLibrary::GetXlaService(client->platform());
+  Backend* backend = service->mutable_backend();
+  se::StreamExecutor* executor = backend->default_stream_executor();
+  DeviceMemoryAllocator* allocator = backend->memory_allocator();
+  auto* transfer_manager = backend->transfer_manager();
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<ScopedShapedBuffer> lhs_arg,
+      transfer_manager->AllocateScopedShapedBuffer(
+          lhs_arg_shape, allocator, backend->default_device_ordinal()));
+  TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice(
+      executor, *Literal::CreateFromShape(lhs_arg_shape), *lhs_arg));
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<ScopedShapedBuffer> rhs_arg,
+      transfer_manager->AllocateScopedShapedBuffer(
+          rhs_arg_shape, allocator, backend->default_device_ordinal()));
+  TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice(
+      executor, *Literal::CreateFromShape(rhs_arg_shape), *rhs_arg));
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<LocalExecutable> local_executable,
+      client->Compile(computation, {&lhs_arg_shape, &rhs_arg_shape},
+                      ExecutableBuildOptions()));
+
+  Executable* executable = local_executable->executable();
+  HloExecutionProfile hlo_execution_profile(
+      &executable->hlo_profile_printer_data(),
+      &executable->hlo_profile_index_map());
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      Backend::StreamPtr stream_ptr,
+      backend->BorrowStream(backend->default_device_ordinal()));
+  ExecutableRunOptions exec_run_options;
+  exec_run_options.set_stream(stream_ptr.get());
+  exec_run_options.set_allocator(backend->memory_allocator());
+  exec_run_options.set_intra_op_thread_pool(
+      backend->eigen_intra_op_thread_pool_device());
+  ServiceExecutableRunOptions run_options(
+      exec_run_options, /*borrow_stream=*/nullptr,
+      backend->eigen_intra_op_thread_pool());
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto execution_result,
+      executable->ExecuteOnStream(&run_options, {lhs_arg.get(), rhs_arg.get()},
+                                  &hlo_execution_profile));
+  (void)execution_result;
+
+  *profile_output =
+      hlo_execution_profile.ToString(executor->GetDeviceDescription());
+
+  XLA_VLOG_LINES(4, *profile_output);
+}
+
+// TODO(b/71364943): This test exposes a bug in the parallel CPU backend.
+XLA_TEST_F(HloProfileTest, DISABLED_ON_CPU_PARALLEL(ProfileSingleComputation)) {
+  const int64 m = 256, k = 256, n = 256;
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {m, k});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {m, k});
+
+  TF_ASSERT_OK_AND_ASSIGN(se::Platform * platform,
+                          PlatformUtil::GetDefaultPlatform());
+  TF_ASSERT_OK_AND_ASSIGN(LocalClient * client,
+                          ClientLibrary::GetOrCreateLocalClient(platform));
+
+  ComputationBuilder builder(client, TestName());
+  auto result = builder.Tanh(builder.Add(
+      builder.Parameter(0, ShapeUtil::MakeShape(F32, {m, k}), "dot_lhs"),
+      builder.Parameter(1, ShapeUtil::MakeShape(F32, {k, n}), "dot_rhs")));
+
+  TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build());
+
+  string profile_output;
+  ExecuteAndFetchProfile(&profile_output, client, computation, lhs_shape,
+                         rhs_shape);
+
+  std::vector<string> profile_output_lines =
+      tensorflow::str_util::Split(profile_output, '\n');
+
+  gtl::FlatMap<string, ParsedProfileOutputLine> parsed_profile_lines;
+
+  TF_ASSERT_OK(ParseOneProfileOutputLine(
+      profile_output_lines[1], /*expect_hlo=*/false, &parsed_profile_lines));
+
+  TF_ASSERT_OK(ParseOneProfileOutputLine(
+      profile_output_lines[2], /*expect_hlo=*/true, &parsed_profile_lines));
+
+  TF_ASSERT_OK(ParseOneProfileOutputLine(
+      profile_output_lines[3], /*expect_hlo=*/true, &parsed_profile_lines));
+
+  TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine total_profile,
+                          MaybeFind(parsed_profile_lines, "[total]"));
+  TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine dot_profile,
+                          MaybeFind(parsed_profile_lines, "add"));
+  TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine tanh_profile,
+                          MaybeFind(parsed_profile_lines, "tanh"));
+
+  EXPECT_GT(total_profile.cycles, 0);
+  EXPECT_EQ(total_profile.cycles_percentage, "100.00%");
+
+  EXPECT_TRUE(HasFlops(total_profile));
+  EXPECT_TRUE(HasTrops(total_profile));
+
+  EXPECT_GT(total_profile.cycles, dot_profile.cycles);
+  EXPECT_NE(dot_profile.cycles_percentage, "0.00%");
+  EXPECT_NE(dot_profile.cycles_percentage, "100.00%");
+
+  EXPECT_TRUE(HasFlops(dot_profile));
+  EXPECT_FALSE(HasTrops(dot_profile));
+
+  EXPECT_GT(total_profile.cycles, tanh_profile.cycles);
+  EXPECT_NE(tanh_profile.cycles_percentage, "0.00%");
+  EXPECT_NE(tanh_profile.cycles_percentage, "100.00%");
+
+  EXPECT_FALSE(HasFlops(tanh_profile));
+  EXPECT_TRUE(HasTrops(tanh_profile));
+}
+
+// TODO(b/71364943): This test exposes a bug in the parallel CPU backend.
+//
+// TODO(b/71544591): The GPU backend does not record cycles spent in on Hlo
+// instructions "interior" to while nodes.
+XLA_TEST_F(HloProfileTest,
+           DISABLED_ON_GPU(DISABLED_ON_CPU_PARALLEL(ProfileWhileComputation))) {
+  const int64 size = 256;
+  Shape matrix_shape = ShapeUtil::MakeShape(F32, {size, size});
+  Shape while_result_shape =
+      ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(S32, {}), matrix_shape});
+
+  TF_ASSERT_OK_AND_ASSIGN(se::Platform * platform,
+                          PlatformUtil::GetDefaultPlatform());
+  TF_ASSERT_OK_AND_ASSIGN(LocalClient * client,
+                          ClientLibrary::GetOrCreateLocalClient(platform));
+
+  Computation condition;
+  {
+    ComputationBuilder builder(client, "condition");
+    auto state = builder.Parameter(0, while_result_shape, "state");
+    auto iteration = builder.GetTupleElement(state, 0);
+    builder.Gt(builder.ConstantR0<int32>(5), iteration);
+    TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
+  }
+
+  Computation body;
+  {
+    ComputationBuilder builder(client, "body");
+    auto state = builder.Parameter(0, while_result_shape, "state");
+    auto matrix = builder.GetTupleElement(state, 1);
+    auto next_iteration = builder.Add(builder.GetTupleElement(state, 0),
+                                      builder.ConstantR0<int32>(1));
+    builder.Tuple({next_iteration, builder.Add(matrix, matrix)});
+    TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
+  }
+
+  ComputationBuilder builder(client, TestName());
+  auto initial_while_state =
+      builder.Tuple({builder.ConstantR0<int32>(0),
+                     builder.Parameter(0, matrix_shape, "initial_value")});
+  auto while_result = builder.While(condition, body, initial_while_state);
+  builder.Add(builder.GetTupleElement(while_result, 1),
+              builder.Parameter(1, matrix_shape, "other_value"));
+
+  TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build());
+
+  string profile_output;
+  ExecuteAndFetchProfile(&profile_output, client, computation, matrix_shape,
+                         matrix_shape);
+
+  std::vector<string> profile_output_lines =
+      tensorflow::str_util::Split(profile_output, '\n');
+
+  auto while_body_profile_start =
+      std::find_if(profile_output_lines.begin(), profile_output_lines.end(),
+                   [](tensorflow::StringPiece s) {
+                     return s.starts_with("Execution profile for body");
+                   });
+
+  ASSERT_NE(while_body_profile_start, profile_output_lines.end());
+
+  gtl::FlatMap<string, ParsedProfileOutputLine> parsed_profile_lines;
+
+  TF_ASSERT_OK(
+      ParseOneProfileOutputLine(*std::next(while_body_profile_start, 1),
+                                /*expect_hlo=*/false, &parsed_profile_lines));
+
+  TF_ASSERT_OK(
+      ParseOneProfileOutputLine(*std::next(while_body_profile_start, 2),
+                                /*expect_hlo=*/true, &parsed_profile_lines));
+
+  TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine total_while_body_profile,
+                          MaybeFind(parsed_profile_lines, "[total]"));
+  TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine dot_profile,
+                          MaybeFind(parsed_profile_lines, "add"));
+
+  EXPECT_GT(total_while_body_profile.cycles, 0);
+  EXPECT_EQ(total_while_body_profile.opcode, "[total]");
+  EXPECT_EQ(total_while_body_profile.cycles_percentage, "100.00%");
+
+  EXPECT_GT(total_while_body_profile.cycles, dot_profile.cycles);
+  EXPECT_NE(dot_profile.cycles_percentage, "0.00%");
+  EXPECT_NE(dot_profile.cycles_percentage, "100.00%");
+}
+}  // namespace
+}  // namespace xla
+
+static std::pair<int, char**> AddXlaHloProfileFlag(int argc, char** argv) {
+  // Intentional "leak".
+  char** new_argv = new char*[argc + 2];
+  for (int i = 0; i < argc; i++) {
+    new_argv[i] = argv[i];
+  }
+
+  // We do it this way (as opposed to piping in a modified DebugOptions
+  // instance) for better end-to-end integration testing.
+  new_argv[argc] = strdup("--xla_hlo_profile");
+
+  // Fusion can change the Hlo instructions that show up in the final Hlo
+  // executable, so block it here.
+  new_argv[argc + 1] = strdup("--xla_disable_hlo_passes=fusion");
+  return {argc + 2, new_argv};
+}
+
+GTEST_API_ int main(int argc, char** argv) {
+  std::vector<tensorflow::Flag> flag_list;
+  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
+  std::tie(argc, argv) = AddXlaHloProfileFlag(argc, argv);
+
+  auto usage = tensorflow::Flags::Usage(argv[0], flag_list);
+  if (!tensorflow::Flags::Parse(&argc, argv, flag_list)) {
+    LOG(ERROR) << "\n" << usage;
+    return 2;
+  }
+
+  testing::InitGoogleTest(&argc, argv);
+  if (argc > 1) {
+    LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
+    return 2;
+  }
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/compiler/xla/text_literal_reader.cc b/tensorflow/compiler/xla/text_literal_reader.cc
index 4d060895d357493327ec50b38016478c65fef94d..6fa4c48e11d1102367b21bc21d4734466495ef0e 100644
--- a/tensorflow/compiler/xla/text_literal_reader.cc
+++ b/tensorflow/compiler/xla/text_literal_reader.cc
@@ -102,9 +102,9 @@ StatusOr<std::unique_ptr<Literal>> TextLiteralReader::ReadAllLines() {
         ShapeUtil::HumanString(shape).c_str());
   }
 
-  auto result = MakeUnique<Literal>();
+  auto result = MakeUnique<Literal>(shape);
   const float fill = std::numeric_limits<float>::quiet_NaN();
-  result->PopulateWithValue<float>(fill, AsInt64Slice(shape.dimensions()));
+  result->PopulateWithValue<float>(fill);
   std::vector<tensorflow::StringPiece> pieces;
   std::vector<tensorflow::StringPiece> coordinates;
   std::vector<int64> coordinate_values;
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
index 5ede37b8737bd4fa6235464ddeb6382af17c8a80..b82f1c81c84b487c1661af5267b9123da97bb107 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
@@ -85,10 +85,12 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
     for (int i = 0; i < program_shape->parameters_size(); ++i) {
       layouts.push_back(&program_shape->parameters(i));
     }
+    ExecutableBuildOptions build_options;
+    build_options.set_device_ordinal(0);
+    build_options.set_result_layout(program_shape->result());
     StatusOr<std::unique_ptr<Executable>> executable =
         local_service->CompileExecutable(computation.handle(), layouts,
-                                         &program_shape->result(),
-                                         /*device_ordinal=*/0);
+                                         build_options);
 
     const HloModule& module = executable.ValueOrDie()->module();
 
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
index 78d8fb1f4330aed899ca917e66fae819a002b3a9..05c0fdf97d27c09eb2bbb0f265b5b2a5982ca7b1 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
@@ -60,16 +60,19 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args, bool compile) {
       for (int i = 0; i < program_shape->parameters_size(); ++i) {
         layouts.push_back(&program_shape->parameters(i));
       }
+
+      ExecutableBuildOptions build_options;
+      build_options.set_device_ordinal(0);
+      build_options.set_result_layout(program_shape->result());
       StatusOr<std::unique_ptr<Executable>> executable =
           local_service->CompileExecutable(computation.handle(), layouts,
-                                           &program_shape->result(),
-                                           /*device_ordinal=*/0);
+                                           build_options);
 
       const HloModule& module = executable.ValueOrDie()->module();
 
       fprintf(stdout, "HLO compiled for %s backend:\n%s\n",
               local_service->backend().platform()->Name().c_str(),
-              module.ToString().c_str());
+              module.ToString(HloPrintOptions::ShortParsable()).c_str());
     } else {
       const ComputationTracker& tracker = local_service->computation_tracker();
       UserComputation* user_computation =
@@ -80,7 +83,8 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args, bool compile) {
           tracker.BuildHloModule(versioned_handle, HloModuleConfig())
               .ConsumeValueOrDie();
 
-      fprintf(stdout, "%s\n", module->ToString().c_str());
+      fprintf(stdout, "%s\n",
+              module->ToString(HloPrintOptions::ShortParsable()).c_str());
     }
   }
 }
diff --git a/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc b/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc
index 4e02e17db65c0a4220672733be8319e1a0cc4f0f..8460ae3e4991ee091af72d2553a8491f627c722e 100644
--- a/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc
+++ b/tensorflow/compiler/xla/tools/hlo_proto_to_json.cc
@@ -19,7 +19,7 @@ limitations under the License.
 //
 // Reads one serilized Hlo module, convert it into JSON format and dump into
 // some output directory. some_binaray_proto is obtained by serializing Hlo
-// module to disk using --xla_dump_hlo_proto_to debug optoin.
+// module to disk using --xla_dump_optimized_hlo_proto_to debug option.
 
 #include <stdio.h>
 #include <string>
diff --git a/tensorflow/compiler/xla/tools/parser/BUILD b/tensorflow/compiler/xla/tools/parser/BUILD
index ce936af6c3376387c1ed9fa48da23b8af537f6e5..97aacf6b39f83978e732060817cd93ede81ca782 100644
--- a/tensorflow/compiler/xla/tools/parser/BUILD
+++ b/tensorflow/compiler/xla/tools/parser/BUILD
@@ -34,9 +34,9 @@ cc_library(
     deps = [
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/core:lib",
         "//tensorflow/core:regexp_internal",
     ],
diff --git a/tensorflow/compiler/xla/tools/parser/README.md b/tensorflow/compiler/xla/tools/parser/README.md
index 6232967f5f04cbf316d985357ae84c28335531e2..f0f3dd7785c13e505e1eb6d4c8cd4bad157c4993 100644
--- a/tensorflow/compiler/xla/tools/parser/README.md
+++ b/tensorflow/compiler/xla/tools/parser/README.md
@@ -1,24 +1,26 @@
-# HloModule string syntax
-
-TODO: Support all subcomputations (for fusion, reduce, ...).
-
-TODO: Support all extra attributes, e.g. dimensions, strides.
+# HLO Text Syntax
 
 ```yacc
 hlo_module
   : 'HloModule' name computations
   ;
 
+/* If no computation is marked as ENTRY, the last computation will be the entry
+computation of the module.*/
 computations
   : computation
   | computation computations
   ;
 
 computation
-  : 'ENTRY' name param_list '->' shape instruction_list
-  | name param_list '->' shape instruction_list
+  : 'ENTRY' name param_list_to_shape instruction_list
+  | name param_list_to_shape instruction_list
+  | 'ENTRY' name instruction_list
+  | name instruction_list
   ;
 
+/* If no instruction is marked as ROOT, the last instruction will be the root of
+its computation. */
 instruction_list
   : '{' instruction_list1 '}'
   ;
@@ -41,6 +43,7 @@ operands1
   ;
 operand
   : shape name
+  | name
   ;
 
 attributes
@@ -60,6 +63,10 @@ attribute_value
   | '{' sub_attributes '}'
   ;
 
+param_list_to_shape
+  : param_list '->' shape
+  ;
+
 param_list
   : '(' param_list1 ')'
   ;
@@ -84,6 +91,7 @@ tuple_elements
 name
   : identifier ':'
   | '%' identifier
+  | identifier
   ;
 
 identifier
@@ -108,7 +116,29 @@ non_tuple
   | rank2345
   ;
 rank2345
-  : shape nested_array
+  : shape sparse_or_nested_array
+  ;
+sparse_or_nested_array
+  : sparse_array
+  | nested_array
+  ;
+sparse_array
+  : '{' sparse_array1 '}'
+  ;
+sparse_array1
+  : sparse_array_item
+  | sparse_array1 ',' sparse_array_item
+  ;
+sparse_array_item
+  : multi_index ':' scalar
+  ;
+multi_index
+  : kInt
+  | '[' multi_index1 ']'
+  ;
+multi_index1
+  : kInt
+  | multi_index1 ',' kInt
   ;
 
 ```
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_lexer.cc b/tensorflow/compiler/xla/tools/parser/hlo_lexer.cc
index 56744440db1b17aa1cc8823feb1bad279f8f4f75..fc0e4444521247734fc240a03da669244fe1a6a4 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_lexer.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_lexer.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include <unordered_map>
 
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -153,21 +152,21 @@ TokKind HloLexer::LexToken() {
   }
 }
 
-// Lex a shape, name, keyword, opcode, attribute name, or the dim labels
-// pattern.
+// Lex a shape, name, keyword, attribute name, the dim labels pattern, and
+// other identifiers.
 //
 // shape    ::= ([a-zA-Z0-9_]*[0-9]*)\[([0-9,]*)\](?:\s*{([0-9,]*)})?
 // name     ::= [a-zA-Z_][a-zA-Z0-9_.-]*:
 // keyword  ::= HloModule, ENTRY, ...
-// opcode   ::= add, greater-than, ...
 // attribute_name ::= condition, body, dimensions, ...
 // dim_labels_pattern ::= [0-9bf]{2,}_[0-9io]{2,}->[0-9bf]{2,}
+// identifiers ::= other cases that match [a-zA-Z_][a-zA-Z0-9_.-]*
 TokKind HloLexer::LexIdentifier() {
   {
     auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end());
     // 'consumable' will be advanced iff its prefix matches the pattern.
     static LazyRE2 shape_pattern = {
-        R"(^(\w*\d*)\[([\d,]*)\](?:{([\d,]*)})?)"};
+        R"(^(\w*\d*)\[([\d,]*)\](?:(dense|sparse)?{([\d,]+)})?)"};
     if (RE2::Consume(&consumable, *shape_pattern)) {
       auto status_or_shape = ShapeUtil::ParseShapeString(
           StringPieceFromPointers(token_start_, consumable.begin()));
@@ -220,20 +219,6 @@ TokKind HloLexer::LexIdentifier() {
 
 #undef KEYWORD
 
-  // See if this is an opcode.
-  auto opcode = StringToHloOpcode(identifier.ToString());
-  if (opcode.ok()) {
-    opcode_val_ = opcode.ValueOrDie();
-    return TokKind::kOpcode;
-  }
-
-  // See if this is an fusion kind.
-  auto kind = xla::StringToFusionKind(identifier.ToString());
-  if (kind.ok()) {
-    fusion_kind_val_ = kind.ValueOrDie();
-    return TokKind::kFusionKind;
-  }
-
   {
     auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end());
     static LazyRE2 dim_labels_pattern = {
@@ -244,8 +229,9 @@ TokKind HloLexer::LexIdentifier() {
       return TokKind::kDimLabels;
     }
   }
-  current_ptr_ = token_start_ + 1;
-  return TokKind::kError;
+
+  str_val_ = identifier.ToString();
+  return TokKind::kIdent;
 }
 
 // Lex names after a % character.
@@ -271,7 +257,8 @@ TokKind HloLexer::LexPercent() {
 // fp without exp ::= [-]?([0-9]+[.][0-9]*|[0-9]*[.][0-9]+)
 // dim_labels_pattern ::= [0-9bf]{2,}_[0-9io]{2,}->[0-9bf]{2,}
 // dxd_pattern ::= [0-9]+(x[0-9]+)+
-// pad_pattern ::= [0-9]+_[0-9]+(_[0-9]+)?(x[0-9]+_[0-9]+(_[0-9]+)?)*
+// pad_pattern ::=
+//   [-]?[0-9]+_[-]?[0-9]+(_[0-9]+)?(x[-]?[0-9]+_[-]?[0-9]+(_[0-9]+)?)*
 // int ::=  [-]?[0-9]+
 // negative inf ::= '-inf'
 TokKind HloLexer::LexNumberOrPattern() {
@@ -289,7 +276,7 @@ TokKind HloLexer::LexNumberOrPattern() {
       R"([0-9bf]{2,}_[0-9io]{2,}->[0-9bf]{2,})"};
   static LazyRE2 dxd_pattern = {R"([0-9]+(x[0-9]+)+)"};
   static LazyRE2 pad_pattern = {
-      R"([0-9]+_[0-9]+(_[0-9]+)?(x[0-9]+_[0-9]+(_[0-9]+)?)*)"};
+      R"([-]?[0-9]+_[-]?[0-9]+(_[0-9]+)?(x[-]?[0-9]+_[-]?[0-9]+(_[0-9]+)?)*)"};
 
   if (RE2::Consume(&consumable, *dim_labels_pattern)) {
     current_ptr_ = consumable.begin();
@@ -326,18 +313,43 @@ TokKind HloLexer::LexNumberOrPattern() {
   return TokKind::kError;
 }
 
-StringPiece HloLexer::GetCurrentLine() const {
-  const char* start = token_start_;
-  const char* end = current_ptr_;
-  if (!CanDereference(start) || !CanDereference(end)) {
-    return "LINE OUT OF RANGE";
+std::pair<unsigned, unsigned> HloLexer::GetLineAndColumn(LocTy location) const {
+  unsigned line_no = 1;
+  const char* start = buf_.begin();
+  const char* ptr = start;
+  if (line_no_cache_.last_query && CanDereference(line_no_cache_.last_query) &&
+      line_no_cache_.last_query <= location) {
+    ptr = line_no_cache_.last_query;
+    line_no = line_no_cache_.line_no_of_query;
   }
-  while (start > buf_.begin() && *start != '\n') {
-    start--;
+  for (; ptr != location; ptr++) {
+    if (*ptr == '\n') {
+      line_no++;
+    }
   }
-  while (end < buf_.end() && *end != '\n') {
-    end++;
+
+  // Update the line number cache.
+  line_no_cache_.last_query = ptr;
+  line_no_cache_.line_no_of_query = line_no;
+  size_t line_offset = StringPieceFromPointers(start, ptr).rfind('\n');
+  if (line_offset == StringPiece::npos) {
+    line_offset = 0;
   }
+  return {line_no, ptr - start - line_offset};
+}
+
+StringPiece HloLexer::GetLine(LocTy loc) const {
+  if (!CanDereference(loc)) {
+    return "LINE OUT OF RANGE";
+  }
+  size_t line_start =
+      StringPieceFromPointers(buf_.begin(), loc + 1).rfind('\n');
+  const char* start = line_start == StringPiece::npos
+                          ? buf_.begin()
+                          : buf_.begin() + line_start + 1;
+  size_t line_end = StringPieceFromPointers(loc, buf_.end()).find('\n');
+  const char* end = line_end == StringPiece::npos ? buf_.end() : loc + line_end;
+
   return StringPieceFromPointers(start, end);
 }
 
@@ -428,14 +440,12 @@ string TokKindToString(TokKind kind) {
       return "kDxD";
     case TokKind::kPad:
       return "kPad";
+    case TokKind::kIdent:
+      return "kIdent";
     case TokKind::kString:
       return "kString";
     case TokKind::kShape:
       return "kShape";
-    case TokKind::kOpcode:
-      return "kOpcode";
-    case TokKind::kFusionKind:
-      return "kFusionKind";
     case TokKind::kInt:
       return "kInt";
     case TokKind::kDecimal:
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_lexer.h b/tensorflow/compiler/xla/tools/parser/hlo_lexer.h
index 5c9d1bf3912584040dc5260cc6730247d439fd60..27880b9b8afbfa58abfedc3b2cecd5236b78a6d6 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_lexer.h
+++ b/tensorflow/compiler/xla/tools/parser/hlo_lexer.h
@@ -18,9 +18,8 @@ limitations under the License.
 
 #include <string>
 
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/tools/parser/hlo_token.h"
+#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/logging.h"
@@ -48,6 +47,7 @@ class HloLexer {
       case TokKind::kDxD:
       case TokKind::kPad:
       case TokKind::kString:
+      case TokKind::kIdent:
         return str_val_;
       default:
         LOG(FATAL) << "This token does not have string value";
@@ -57,14 +57,6 @@ class HloLexer {
     CHECK(GetKind() == TokKind::kShape);
     return shape_val_;
   }
-  HloOpcode GetOpcodeVal() const {
-    CHECK(GetKind() == TokKind::kOpcode);
-    return opcode_val_;
-  }
-  HloInstruction::FusionKind GetFusionKindVal() const {
-    CHECK(GetKind() == TokKind::kFusionKind);
-    return fusion_kind_val_;
-  }
   int64 GetInt64Val() const {
     CHECK(GetKind() == TokKind::kInt);
     return int64_val_;
@@ -74,8 +66,16 @@ class HloLexer {
     return decimal_val_;
   }
 
-  // Returns the line of text that is currently being lexed.
-  tensorflow::StringPiece GetCurrentLine() const;
+  typedef const char* LocTy;
+
+  // Returns the location of the current token.
+  LocTy GetLoc() const { return token_start_; }
+
+  // Returns the line and column of a location in the buffer.
+  std::pair<unsigned, unsigned> GetLineAndColumn(LocTy location) const;
+
+  // Returns the whole line given the location.
+  tensorflow::StringPiece GetLine(LocTy loc) const;
 
  private:
   // Returns the current character. If it's neither the end of input buffer nor
@@ -114,10 +114,15 @@ class HloLexer {
   TokKind current_kind_;
   string str_val_;
   Shape shape_val_;
-  HloOpcode opcode_val_;
-  HloInstruction::FusionKind fusion_kind_val_;
   int64 int64_val_;
   double decimal_val_;
+
+  struct LineNoCacheTy {
+    const char* last_query;
+    unsigned line_no_of_query;
+  };
+  // This caches the line number of the previous query.
+  mutable LineNoCacheTy line_no_cache_{nullptr, 0};
 };
 
 }  // namespace tools
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
index 47979ec6f361789f29e8f7ff47793747330551fc..89def5d5610cb9522a69297668b443b8c4e03fb5 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
@@ -40,6 +41,8 @@ const double kF16max = 65504;
 // Parser for the HloModule::ToString() format text.
 class HloParser {
  public:
+  using LocTy = HloLexer::LocTy;
+
   explicit HloParser(StringPiece str, const HloModuleConfig& config)
       : lexer_(str), config_(config) {}
 
@@ -56,7 +59,7 @@ class HloParser {
   // ParseXXX returns false if an error occurred.
   bool ParseHloModule();
   bool ParseComputations();
-  bool ParseComputation();
+  bool ParseComputation(HloComputation** entry_computation);
   bool ParseInstructionList(HloComputation::Builder* builder,
                             string* root_name);
   bool ParseInstruction(HloComputation::Builder* builder, string* root_name);
@@ -65,6 +68,13 @@ class HloParser {
   bool ParseTupleLiteral(std::unique_ptr<Literal>* literal, const Shape& shape);
   bool ParseNonTupleLiteral(std::unique_ptr<Literal>* literal,
                             const Shape& shape);
+  bool ParseDenseLiteral(std::unique_ptr<Literal>* literal, const Shape& shape);
+  bool ParseSparseLiteral(std::unique_ptr<Literal>* literal,
+                          const Shape& shape);
+  template <typename LiteralNativeT>
+  bool ParseSparseLiteralHelper(std::unique_ptr<Literal>* literal,
+                                const Shape& shape);
+
   // Sets the sub-value of literal at the given index to the given value. The
   // literal's shape must have the default layout.
   bool SetValueInLiteral(int64 value, int64 linear_index, Literal* literal);
@@ -96,6 +106,7 @@ class HloParser {
     kString,
     kBracedInt64List,
     kHloComputation,
+    kFftType,
     kWindow,
     kConvolutionDimensionNumbers,
     kSharding,
@@ -104,6 +115,7 @@ class HloParser {
     kPaddingConfig,
     kMetadata,
     kFusionKind,
+    kDistribution,
   };
 
   struct AttrConfig {
@@ -167,20 +179,30 @@ class HloParser {
   bool ParseInt64List(const TokKind start, const TokKind end,
                       const TokKind delim, std::vector<int64>* result);
 
+  bool ParseParamListToShape(Shape* shape, LocTy* shape_loc);
   bool ParseParamList();
   bool ParseName(string* result);
   bool ParseAttributeName(string* result);
   bool ParseString(string* result);
   bool ParseShape(Shape* result);
   bool ParseOpcode(HloOpcode* result);
+  bool ParseFftType(FftType* result);
   bool ParseFusionKind(HloInstruction::FusionKind* result);
+  bool ParseRandomDistribution(RandomDistribution* result);
   bool ParseInt64(int64* result);
   bool ParseDouble(double* result);
   bool ParseBool(bool* result);
   bool ParseToken(TokKind kind, const string& msg);
 
+  // Returns true if the current token is the beginning of a shape.
+  bool CanBeShape();
+  // Returns true if the current token is the beginning of a
+  // param_list_to_shape.
+  bool CanBeParamListToShape();
+
   // Logs the current parsing line and the given message. Always returns false.
   bool TokenError(StringPiece msg);
+  bool Error(LocTy loc, StringPiece msg);
 
   // If the current token is 'kind', eats it (i.e. lexes the next token) and
   // returns true.
@@ -191,31 +213,47 @@ class HloParser {
 
   // Adds the instruction to the pool. Returns false and emits an error if the
   // instruction already exists.
-  bool AddInstruction(const string& name, HloInstruction* instruction);
+  bool AddInstruction(const string& name, HloInstruction* instruction,
+                      LocTy name_loc);
   // Adds the computation to the pool. Returns false and emits an error if the
   // computation already exists.
-  bool AddComputation(const string& name, HloComputation* computation);
+  bool AddComputation(const string& name, HloComputation* computation,
+                      LocTy name_loc);
 
-  // The map from the instruction name to the instruction. This does not own the
-  // instructions.
-  std::unordered_map<string, HloInstruction*> instruction_pool_;
-  std::unordered_map<string, HloComputation*> computation_pool_;
+  // The map from the instruction/computation name to the
+  // instruction/computation itself and it's location. This does not own the
+  // pointers.
+  std::unordered_map<string, std::pair<HloInstruction*, LocTy>>
+      instruction_pool_;
+  std::unordered_map<string, std::pair<HloComputation*, LocTy>>
+      computation_pool_;
 
   HloLexer lexer_;
   std::unique_ptr<HloModule> module_;
+  std::vector<std::unique_ptr<HloComputation>> computations_;
   const HloModuleConfig config_;
   std::vector<string> error_;
 };
 
-bool HloParser::TokenError(StringPiece msg) {
-  const string error =
-      StrCat("was parsing \"", lexer_.GetCurrentLine(), "\"; token ",
-             TokKindToString(lexer_.GetKind()), "; ", msg);
-  VLOG(1) << "TokenError: " << error;
-  error_.push_back(error);
+bool HloParser::Error(LocTy loc, StringPiece msg) {
+  auto line_col = lexer_.GetLineAndColumn(loc);
+  const unsigned line = line_col.first;
+  const unsigned col = line_col.second;
+  std::vector<string> error_lines;
+  error_lines.push_back(
+      StrCat("was parsing ", line, ":", col, ": error: ", msg));
+  error_lines.push_back(lexer_.GetLine(loc).ToString());
+  error_lines.push_back(col == 0 ? "" : StrCat(string(col - 1, ' '), "^"));
+
+  error_.push_back(tensorflow::str_util::Join(error_lines, "\n"));
+  VLOG(1) << "Error: " << error_.back();
   return false;
 }
 
+bool HloParser::TokenError(StringPiece msg) {
+  return Error(lexer_.GetLoc(), msg);
+}
+
 bool HloParser::Run() {
   lexer_.Lex();
   return ParseHloModule();
@@ -241,46 +279,110 @@ bool HloParser::ParseHloModule() {
 
 // computations ::= (computation)+
 bool HloParser::ParseComputations() {
+  HloComputation* entry_computation = nullptr;
   do {
-    if (!ParseComputation()) {
+    if (!ParseComputation(&entry_computation)) {
       return false;
     }
   } while (lexer_.GetKind() != TokKind::kEof);
+
+  for (int i = 0; i < computations_.size(); i++) {
+    // If entry_computation is not nullptr, it means the computation it pointed
+    // to is marked with "ENTRY"; otherwise, no computation is marked with
+    // "ENTRY", and we use the last computation as the entry computation. We
+    // add the non-entry computations as embedded computations to the module.
+    if ((entry_computation != nullptr &&
+         computations_[i].get() != entry_computation) ||
+        (entry_computation == nullptr && i != computations_.size() - 1)) {
+      module_->AddEmbeddedComputation(std::move(computations_[i]));
+      continue;
+    }
+    auto computation =
+        module_->AddEntryComputation(std::move(computations_[i]));
+    // The parameters and result layouts were set to default layout. Here we
+    // set the layouts to what the hlo text says.
+    for (int p = 0; p < computation->num_parameters(); p++) {
+      const Shape& param_shape = computation->parameter_instruction(p)->shape();
+      if (param_shape.has_layout()) {
+        module_->mutable_entry_computation_layout()
+            ->mutable_parameter_layout(p)
+            ->ResetLayout(param_shape.layout());
+      }
+    }
+    const Shape& result_shape = computation->root_instruction()->shape();
+    if (result_shape.has_layout()) {
+      module_->mutable_entry_computation_layout()
+          ->mutable_result_layout()
+          ->ResetLayout(result_shape.layout());
+    }
+  }
+
   return true;
 }
 
-// computation ::= ('ENTRY')? name param_list '->' shape instruction_list
-bool HloParser::ParseComputation() {
+// computation ::= ('ENTRY')? name (param_list_to_shape)? instruction_list
+bool HloParser::ParseComputation(HloComputation** entry_computation) {
+  LocTy maybe_entry_loc = lexer_.GetLoc();
   const bool is_entry_computation = EatIfPresent(TokKind::kw_ENTRY);
+
   string name;
+  LocTy name_loc = lexer_.GetLoc();
   if (!ParseName(&name)) {
     return false;
   }
   auto builder = MakeUnique<HloComputation::Builder>(name);
 
+  LocTy shape_loc = nullptr;
   Shape shape;
+  if (CanBeParamListToShape() && !ParseParamListToShape(&shape, &shape_loc)) {
+    return false;
+  }
+
   string root_name;
-  if (!ParseParamList() || !ParseToken(TokKind::kArrow, "expects '->'") ||
-      !ParseShape(&shape) || !ParseInstructionList(builder.get(), &root_name)) {
+  if (!ParseInstructionList(builder.get(), &root_name)) {
     return false;
   }
 
-  HloInstruction* root =
-      tensorflow::gtl::FindPtrOrNull(instruction_pool_, root_name);
+  std::pair<HloInstruction*, LocTy>* root_node =
+      tensorflow::gtl::FindOrNull(instruction_pool_, root_name);
   // This means some instruction was marked as ROOT but we didn't find it in the
   // pool, which should not happen.
-  if (!root_name.empty() && root == nullptr) {
+  if (!root_name.empty() && root_node == nullptr) {
     LOG(FATAL) << "instruction " << root_name
                << " was marked as ROOT but the parser has not seen it before";
   }
+
+  HloInstruction* root = root_node == nullptr ? nullptr : root_node->first;
   // Now root can be either an existing instruction or a nullptr. If it's a
   // nullptr, the implementation of Builder will set the last instruction as
   // root instruction.
-  HloComputation* computation =
-      is_entry_computation
-          ? module_->AddEntryComputation(builder->Build(root))
-          : module_->AddEmbeddedComputation(builder->Build(root));
-  return AddComputation(name, computation);
+  computations_.emplace_back(builder->Build(root));
+  HloComputation* computation = computations_.back().get();
+
+  if (!root) {
+    root = computation->root_instruction();
+  } else {
+    CHECK_EQ(root, computation->root_instruction());
+  }
+
+  // If param_list_to_shape was present, check compatibility.
+  if (shape_loc != nullptr && !ShapeUtil::Compatible(root->shape(), shape)) {
+    return Error(
+        shape_loc,
+        StrCat("Shape of computation ", name, ", ",
+               ShapeUtil::HumanString(shape),
+               ", is not compatible with that of its root instruction ",
+               root_name, ", ", ShapeUtil::HumanString(root->shape())));
+  }
+
+  if (is_entry_computation) {
+    if (*entry_computation != nullptr) {
+      return Error(maybe_entry_loc, "expects only one ENTRY");
+    }
+    *entry_computation = computation;
+  }
+
+  return AddComputation(name, computation, name_loc);
 }
 
 // instruction_list ::= '{' instruction_list1 '}'
@@ -307,13 +409,21 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
   Shape shape;
   HloOpcode opcode;
   std::vector<HloInstruction*> operands;
+
+  LocTy maybe_root_loc = lexer_.GetLoc();
   bool is_root = EatIfPresent(TokKind::kw_ROOT);
+
+  const LocTy name_loc = lexer_.GetLoc();
   if (!ParseName(&name) ||
       !ParseToken(TokKind::kEqual, "expects '=' in instruction") ||
       !ParseShape(&shape) || !ParseOpcode(&opcode)) {
     return false;
   }
+
   if (is_root) {
+    if (!root_name->empty()) {
+      return Error(maybe_root_loc, "one computation should have only one ROOT");
+    }
     *root_name = name;
   }
 
@@ -395,7 +505,6 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     case HloOpcode::kLe:
     case HloOpcode::kLt:
     case HloOpcode::kNe:
-    case HloOpcode::kDot:
     case HloOpcode::kMaximum:
     case HloOpcode::kMinimum:
     case HloOpcode::kPower:
@@ -444,12 +553,11 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kCrossReplicaSum: {
-      if (!ParseOperands(&operands, /*expected_size=*/1) ||
-          !ParseAttributes(attrs)) {
+      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
         return false;
       }
       instruction = builder->AddInstruction(
-          HloInstruction::CreateCrossReplicaSum(shape, operands[0]));
+          HloInstruction::CreateCrossReplicaSum(shape, operands));
       break;
     }
     case HloOpcode::kReshape: {
@@ -590,6 +698,20 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
           shape, /*lhs=*/operands[0], /*rhs=*/operands[1], *window, *dnums));
       break;
     }
+    case HloOpcode::kFft: {
+      optional<FftType> fft_type;
+      optional<std::vector<int64>> fft_length;
+      attrs["fft_type"] = {/*required=*/true, AttrTy::kFftType, &fft_type};
+      attrs["fft_length"] = {/*required=*/true, AttrTy::kBracedInt64List,
+                             &fft_length};
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(HloInstruction::CreateFft(
+          shape, operands[0], *fft_type, *fft_length));
+      break;
+    }
     case HloOpcode::kBroadcast: {
       optional<std::vector<int64>> broadcast_dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
@@ -813,18 +935,113 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
         return false;
       }
       instruction = builder->AddInstruction(HloInstruction::CreateOutfeed(
-          shape, operands[0], config ? *config : ""));
+          operands[0]->shape(), operands[0], config ? *config : ""));
+      break;
+    }
+    case HloOpcode::kRng: {
+      optional<RandomDistribution> distribution;
+      attrs["distribution"] = {/*required=*/true, AttrTy::kDistribution,
+                               &distribution};
+      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateRng(shape, *distribution, operands));
+      break;
+    }
+    case HloOpcode::kReducePrecision: {
+      optional<int64> exponent_bits;
+      optional<int64> mantissa_bits;
+      attrs["exponent_bits"] = {/*required=*/true, AttrTy::kInt64,
+                                &exponent_bits};
+      attrs["mantissa_bits"] = {/*required=*/true, AttrTy::kInt64,
+                                &mantissa_bits};
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction =
+          builder->AddInstruction(HloInstruction::CreateReducePrecision(
+              shape, operands[0], static_cast<int>(*exponent_bits),
+              static_cast<int>(*mantissa_bits)));
+      break;
+    }
+    case HloOpcode::kConditional: {
+      optional<HloComputation*> true_computation;
+      optional<HloComputation*> false_computation;
+      attrs["true_computation"] = {/*required=*/true, AttrTy::kHloComputation,
+                                   &true_computation};
+      attrs["false_computation"] = {/*required=*/true, AttrTy::kHloComputation,
+                                    &false_computation};
+      if (!ParseOperands(&operands, /*expected_size=*/3) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(HloInstruction::CreateConditional(
+          shape, /*pred=*/operands[0],
+          /*true_computation_arg=*/operands[1], *true_computation,
+          /*false_computation_arg=*/operands[2], *false_computation));
+      break;
+    }
+    case HloOpcode::kCustomCall: {
+      optional<string> custom_call_target;
+      attrs["custom_call_target"] = {/*required=*/true, AttrTy::kString,
+                                     &custom_call_target};
+      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(HloInstruction::CreateCustomCall(
+          shape, operands, *custom_call_target));
+      break;
+    }
+    case HloOpcode::kDot: {
+      optional<std::vector<int64>> lhs_contracting_dims;
+      attrs["lhs_contracting_dims"] = {
+          /*required=*/false, AttrTy::kBracedInt64List, &lhs_contracting_dims};
+      optional<std::vector<int64>> rhs_contracting_dims;
+      attrs["rhs_contracting_dims"] = {
+          /*required=*/false, AttrTy::kBracedInt64List, &rhs_contracting_dims};
+      optional<std::vector<int64>> lhs_batch_dims;
+      attrs["lhs_batch_dims"] = {/*required=*/false, AttrTy::kBracedInt64List,
+                                 &lhs_batch_dims};
+      optional<std::vector<int64>> rhs_batch_dims;
+      attrs["rhs_batch_dims"] = {/*required=*/false, AttrTy::kBracedInt64List,
+                                 &rhs_batch_dims};
+
+      if (!ParseOperands(&operands, /*expected_size=*/2) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+
+      DotDimensionNumbers dnum;
+      if (lhs_contracting_dims) {
+        *dnum.mutable_lhs_contracting_dimensions() = {
+            lhs_contracting_dims->begin(), lhs_contracting_dims->end()};
+      }
+      if (rhs_contracting_dims) {
+        *dnum.mutable_rhs_contracting_dimensions() = {
+            rhs_contracting_dims->begin(), rhs_contracting_dims->end()};
+      }
+      if (lhs_batch_dims) {
+        *dnum.mutable_lhs_batch_dimensions() = {lhs_batch_dims->begin(),
+                                                lhs_batch_dims->end()};
+      }
+      if (rhs_batch_dims) {
+        *dnum.mutable_rhs_batch_dimensions() = {rhs_batch_dims->begin(),
+                                                rhs_batch_dims->end()};
+      }
+
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateDot(shape, operands[0], operands[1], dnum));
       break;
     }
-    case HloOpcode::kConditional:
-    case HloOpcode::kCustomCall:
-    case HloOpcode::kReducePrecision:
-    case HloOpcode::kRng:
     case HloOpcode::kTrace:
       return TokenError(StrCat("parsing not yet implemented for op: ",
                                HloOpcodeString(opcode)));
   }
 
+  instruction->set_name(name);
+
   // Add common attrs (sharding, control predecessors) to the instruction, if
   // they were seen.
   if (sharding) {
@@ -835,15 +1052,15 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     for (auto* pre : *predecessors) {
       Status status = pre->AddControlDependencyTo(instruction);
       if (!status.ok()) {
-        return TokenError(StrCat("error adding control dependency for: ", name,
-                                 " status: ", status.ToString()));
+        return Error(name_loc, StrCat("error adding control dependency for: ",
+                                      name, " status: ", status.ToString()));
       }
     }
   }
   if (metadata) {
     instruction->set_metadata(*metadata);
   }
-  return AddInstruction(name, instruction);
+  return AddInstruction(name, instruction, name_loc);
 }  // NOLINT(readability/fn_size)
 
 // ::= '{' (single_sharding | tuple_sharding) '}'
@@ -889,6 +1106,7 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
     return false;
   }
 
+  LocTy loc = lexer_.GetLoc();
   bool maximal = false;
   bool replicated = false;
   std::vector<int64> devices;
@@ -956,34 +1174,35 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
 
   if (replicated) {
     if (!devices.empty()) {
-      return TokenError(
-          "replicated shardings should not have any devices assigned");
+      return Error(loc,
+                   "replicated shardings should not have any devices assigned");
     }
     if (!ShapeUtil::Equal(tile_shape, Shape())) {
-      return TokenError(
-          "replicated shardings should not have any tile shape set");
+      return Error(loc,
+                   "replicated shardings should not have any tile shape set");
     }
     sharding->set_type(OpSharding::Type::OpSharding_Type_REPLICATED);
   } else if (maximal) {
     if (devices.size() != 1) {
-      return TokenError(
-          "maximal shardings should have exactly one device assigned");
+      return Error(loc,
+                   "maximal shardings should have exactly one device assigned");
     }
     if (!ShapeUtil::Equal(tile_shape, Shape())) {
-      return TokenError("maximal shardings should not have any tile shape set");
+      return Error(loc, "maximal shardings should not have any tile shape set");
     }
     sharding->set_type(OpSharding::Type::OpSharding_Type_MAXIMAL);
     sharding->add_tile_assignment_devices(devices[0]);
   } else {
     if (devices.size() <= 1) {
-      return TokenError(
-          "non-maximal shardings must have more than one device assigned");
+      return Error(
+          loc, "non-maximal shardings must have more than one device assigned");
     }
     if (ShapeUtil::Equal(tile_shape, Shape())) {
-      return TokenError("non-maximal shardings should have a tile shape set");
+      return Error(loc, "non-maximal shardings should have a tile shape set");
     }
     if (tile_assignment_dimensions.empty()) {
-      return TokenError(
+      return Error(
+          loc,
           "non-maximal shardings must have a tile assignment list including "
           "dimensions");
     }
@@ -1008,22 +1227,23 @@ bool HloParser::ParseInstructionNames(
                   "expects '{' at the beginning of instruction name list")) {
     return false;
   }
+  LocTy loc = lexer_.GetLoc();
   do {
     string name;
     if (!ParseName(&name)) {
-      return TokenError("expects a instruction name");
+      return Error(loc, "expects a instruction name");
     }
-    HloInstruction* instr =
-        tensorflow::gtl::FindPtrOrNull(instruction_pool_, name);
+    std::pair<HloInstruction*, LocTy>* instr =
+        tensorflow::gtl::FindOrNull(instruction_pool_, name);
     if (!instr) {
       return TokenError(
           Printf("instruction '%s' is not defined", name.c_str()));
     }
-    instructions->push_back(instr);
+    instructions->push_back(instr->first);
   } while (EatIfPresent(TokKind::kComma));
 
   return ParseToken(TokKind::kRbrace,
-                    "expects '}' at the end of control instructions");
+                    "expects '}' at the end of instruction name list");
 }
 
 bool HloParser::SetValueInLiteral(int64 value, int64 linear_index,
@@ -1058,6 +1278,8 @@ bool HloParser::SetValueInLiteral(double value, int64 linear_index,
   switch (shape.element_type()) {
     case F16:
       return SetValueInLiteralHelper<half>(value, linear_index, literal);
+    case BF16:
+      return SetValueInLiteralHelper<bfloat16>(value, linear_index, literal);
     case F32:
       return SetValueInLiteralHelper<float>(value, linear_index, literal);
     case F64:
@@ -1096,7 +1318,8 @@ bool HloParser::SetValueInLiteralHelper(ParsedElemT value, int64 linear_index,
        (std::numeric_limits<ParsedElemT>::infinity() == value ||
         -std::numeric_limits<ParsedElemT>::infinity() == value))) {
     // Skip range checking for non-finite value.
-  } else if (literal->shape().element_type() == F16) {
+  } else if (literal->shape().element_type() == F16 ||
+             literal->shape().element_type() == BF16) {
     if (value > kF16max || value < -kF16max) {
       return TokenError(StrCat(
           "value ", value, " is out of range for literal's primitive type ",
@@ -1112,7 +1335,7 @@ bool HloParser::SetValueInLiteralHelper(ParsedElemT value, int64 linear_index,
         PrimitiveType_Name(literal->shape().element_type())));
   }
 
-  literal->GetMutableArraySlice<LiteralNativeT>().at(linear_index) =
+  literal->data<LiteralNativeT>().at(linear_index) =
       static_cast<LiteralNativeT>(value);
   return true;
 }
@@ -1179,9 +1402,19 @@ bool HloParser::ParseTupleLiteral(std::unique_ptr<Literal>* literal,
 // non_tuple
 //   ::= rank01
 //   ::= rank2345
-// rank2345 ::= shape nested_array
+// rank2345 ::= shape sparse_or_nested_array
 bool HloParser::ParseNonTupleLiteral(std::unique_ptr<Literal>* literal,
                                      const Shape& shape) {
+  if (LayoutUtil::IsSparseArray(shape)) {
+    return ParseSparseLiteral(literal, shape);
+  }
+
+  CHECK(LayoutUtil::IsDenseArray(shape));
+  return ParseDenseLiteral(literal, shape);
+}
+
+bool HloParser::ParseDenseLiteral(std::unique_ptr<Literal>* literal,
+                                  const Shape& shape) {
   const int64 rank = ShapeUtil::Rank(shape);
   if (rank > 1 && !EatShapeAndCheckCompatible(shape)) {
     return false;
@@ -1282,26 +1515,28 @@ bool HloParser::ParseNonTupleLiteral(std::unique_ptr<Literal>* literal,
           }
           lexer_.Lex();
         } else if (primitive_util::IsIntegralType(shape.element_type())) {
+          LocTy loc = lexer_.GetLoc();
           int64 value;
           if (!ParseInt64(&value)) {
-            return TokenError(StrCat("expects integer for primitive type: ",
+            return Error(loc, StrCat("expects integer for primitive type: ",
                                      PrimitiveType_Name(shape.element_type())));
           }
           if (!SetValueInLiteral(value, linear_index++, literal->get())) {
             return false;
           }
         } else if (primitive_util::IsFloatingPointType(shape.element_type())) {
+          LocTy loc = lexer_.GetLoc();
           double value;
           if (!ParseDouble(&value)) {
-            return TokenError(
-                StrCat("expect floating point value for primitive type: ",
-                       PrimitiveType_Name(shape.element_type())));
+            return Error(
+                loc, StrCat("expect floating point value for primitive type: ",
+                            PrimitiveType_Name(shape.element_type())));
           }
           if (!SetValueInLiteral(value, linear_index++, literal->get())) {
             return false;
           }
         } else {
-          return TokenError(StrCat("unsupported premitive type ",
+          return TokenError(StrCat("unsupported primitive type ",
                                    PrimitiveType_Name(shape.element_type())));
         }
         break;
@@ -1313,11 +1548,147 @@ bool HloParser::ParseNonTupleLiteral(std::unique_ptr<Literal>* literal,
   return true;
 }
 
+bool HloParser::ParseSparseLiteral(std::unique_ptr<Literal>* literal,
+                                   const Shape& shape) {
+  if (!EatShapeAndCheckCompatible(shape)) {
+    return false;
+  }
+
+  switch (shape.element_type()) {
+    case PRED:
+      return ParseSparseLiteralHelper<uint8>(literal, shape);
+    case S8:
+      return ParseSparseLiteralHelper<int8>(literal, shape);
+    case S16:
+      return ParseSparseLiteralHelper<int16>(literal, shape);
+    case S32:
+      return ParseSparseLiteralHelper<int32>(literal, shape);
+    case S64:
+      return ParseSparseLiteralHelper<int64>(literal, shape);
+    case U8:
+      return ParseSparseLiteralHelper<uint8>(literal, shape);
+    case U16:
+      return ParseSparseLiteralHelper<uint16>(literal, shape);
+    case U32:
+      return ParseSparseLiteralHelper<uint32>(literal, shape);
+    case U64:
+      return ParseSparseLiteralHelper<uint64>(literal, shape);
+    case F16:
+      return ParseSparseLiteralHelper<half>(literal, shape);
+    case F32:
+      return ParseSparseLiteralHelper<float>(literal, shape);
+    case BF16:
+      return ParseSparseLiteralHelper<bfloat16>(literal, shape);
+    case F64:
+      return ParseSparseLiteralHelper<double>(literal, shape);
+    default:
+      return Error(lexer_.GetLoc(),
+                   StrCat("invalid primitive type for sparse literal: ",
+                          PrimitiveType_Name(shape.element_type())));
+  }
+}
+
+template <typename LiteralNativeT>
+bool HloParser::ParseSparseLiteralHelper(std::unique_ptr<Literal>* literal,
+                                         const Shape& shape) {
+  std::vector<int64> index;
+
+  int64 rank = ShapeUtil::Rank(shape);
+
+  *literal = MakeUnique<Literal>(shape);
+
+  if (!ParseToken(TokKind::kLbrace,
+                  "expects '{' at the beginning of a sparse literal")) {
+    return false;
+  }
+
+  for (;;) {
+    if (lexer_.GetKind() == TokKind::kRbrace) {
+      lexer_.Lex();
+      break;
+    }
+
+    LocTy index_loc = lexer_.GetLoc();
+    index.clear();
+    if (lexer_.GetKind() == TokKind::kInt) {
+      int64 single_index = lexer_.GetInt64Val();
+      lexer_.Lex();
+      if (rank != 1) {
+        return Error(
+            index_loc,
+            StrCat("invalid single-dimensional index for shape with rank ",
+                   rank, ": ", single_index));
+      }
+      index.push_back(single_index);
+    } else {
+      if (!ParseInt64List(TokKind::kLsquare, TokKind::kRsquare, TokKind::kComma,
+                          &index)) {
+        return false;
+      }
+      if (index.size() != rank) {
+        return Error(
+            index_loc,
+            StrCat("invalid multi-dimension index for shape with rank ", rank,
+                   ": [", tensorflow::str_util::Join(index, ", "), "]"));
+      }
+    }
+    if (!ParseToken(TokKind::kColon,
+                    "expects ':' after after the sparse array index and before "
+                    "the sparse array value")) {
+      return false;
+    }
+    LocTy value_loc = lexer_.GetLoc();
+    LiteralNativeT value;
+    if (lexer_.GetKind() == TokKind::kw_true ||
+        lexer_.GetKind() == TokKind::kw_false) {
+      value = static_cast<LiteralNativeT>(lexer_.GetKind() == TokKind::kw_true);
+      lexer_.Lex();
+    } else if (primitive_util::IsIntegralType(shape.element_type())) {
+      int64 value_s64;
+      if (!ParseInt64(&value_s64)) {
+        return Error(value_loc,
+                     StrCat("expects integer for primitive type: ",
+                            PrimitiveType_Name(shape.element_type())));
+      }
+      value = static_cast<LiteralNativeT>(value_s64);
+    } else if (primitive_util::IsFloatingPointType(shape.element_type())) {
+      double value_f64;
+      if (!ParseDouble(&value_f64)) {
+        return Error(value_loc,
+                     StrCat("expects floating point value for primitive type: ",
+                            PrimitiveType_Name(shape.element_type())));
+      }
+      value = static_cast<LiteralNativeT>(value_f64);
+    } else {
+      LOG(FATAL) << "Unexpected element type: "
+                 << PrimitiveType_Name(shape.element_type());
+    }
+    if (lexer_.GetKind() != TokKind::kRbrace &&
+        !ParseToken(TokKind::kComma,
+                    "expects ',' separator between sparse array elements")) {
+      return false;
+    }
+
+    if ((*literal)->sparse_element_count() + 1 ==
+        LayoutUtil::MaxSparseElements(shape.layout())) {
+      return Error(
+          lexer_.GetLoc(),
+          StrCat("number of sparse elements exceeds maximum for layout: ",
+                 ShapeUtil::HumanStringWithLayout(shape)));
+    }
+
+    (*literal)->AppendSparseElement(index, value);
+  }
+
+  (*literal)->SortSparseElements();
+  return true;
+}
+
 // operands ::= '(' operands1 ')'
 // operands1
 //   ::= /*empty*/
 //   ::= operand (, operand)*
-// operand ::= shape name
+// operand ::= (shape)? name
 bool HloParser::ParseOperands(std::vector<HloInstruction*>* operands) {
   if (!ParseToken(TokKind::kLparen,
                   "expects '(' at the beginning of operands")) {
@@ -1327,17 +1698,23 @@ bool HloParser::ParseOperands(std::vector<HloInstruction*>* operands) {
     // empty
   } else {
     do {
-      Shape shape;
+      LocTy loc = lexer_.GetLoc();
       string name;
-      if (!ParseShape(&shape) || !ParseName(&name)) {
+      if (CanBeShape()) {
+        Shape shape;
+        if (!ParseShape(&shape)) {
+          return false;
+        }
+      }
+      if (!ParseName(&name)) {
         return false;
       }
-      HloInstruction* instruction =
-          tensorflow::gtl::FindPtrOrNull(instruction_pool_, name);
+      std::pair<HloInstruction*, LocTy>* instruction =
+          tensorflow::gtl::FindOrNull(instruction_pool_, name);
       if (!instruction) {
-        return TokenError(StrCat("instruction does not exist: ", name));
+        return Error(loc, StrCat("instruction does not exist: ", name));
       }
-      operands->push_back(instruction);
+      operands->push_back(instruction->first);
     } while (EatIfPresent(TokKind::kComma));
   }
   return ParseToken(TokKind::kRparen, "expects ')' at the end of operands");
@@ -1345,11 +1722,12 @@ bool HloParser::ParseOperands(std::vector<HloInstruction*>* operands) {
 
 bool HloParser::ParseOperands(std::vector<HloInstruction*>* operands,
                               const int expected_size) {
+  LocTy loc = lexer_.GetLoc();
   if (!ParseOperands(operands)) {
     return false;
   }
   if (expected_size != operands->size()) {
-    return TokenError(StrCat("expects ", expected_size, " operands, but has ",
+    return Error(loc, StrCat("expects ", expected_size, " operands, but has ",
                              operands->size(), " operands"));
   }
   return true;
@@ -1358,6 +1736,7 @@ bool HloParser::ParseOperands(std::vector<HloInstruction*>* operands,
 // sub_attributes ::= '{' (','? attribute)* '}'
 bool HloParser::ParseSubAttributes(
     const std::unordered_map<string, AttrConfig>& attrs) {
+  LocTy loc = lexer_.GetLoc();
   if (!ParseToken(TokKind::kLbrace, "expects '{' to start sub attributes")) {
     return false;
   }
@@ -1376,7 +1755,7 @@ bool HloParser::ParseSubAttributes(
   for (const auto& attr_it : attrs) {
     if (attr_it.second.required &&
         seen_attrs.find(attr_it.first) == seen_attrs.end()) {
-      return TokenError(Printf("sub-attribute %s is expected but not seen",
+      return Error(loc, Printf("sub-attribute %s is expected but not seen",
                                attr_it.first.c_str()));
     }
   }
@@ -1386,6 +1765,7 @@ bool HloParser::ParseSubAttributes(
 // attributes ::= (',' attribute)*
 bool HloParser::ParseAttributes(
     const std::unordered_map<string, AttrConfig>& attrs) {
+  LocTy loc = lexer_.GetLoc();
   std::unordered_set<string> seen_attrs;
   while (EatIfPresent(TokKind::kComma)) {
     if (!ParseAttributeHelper(attrs, &seen_attrs)) {
@@ -1396,7 +1776,7 @@ bool HloParser::ParseAttributes(
   for (const auto& attr_it : attrs) {
     if (attr_it.second.required &&
         seen_attrs.find(attr_it.first) == seen_attrs.end()) {
-      return TokenError(Printf("attribute %s is expected but not seen",
+      return Error(loc, Printf("attribute %s is expected but not seen",
                                attr_it.first.c_str()));
     }
   }
@@ -1406,21 +1786,23 @@ bool HloParser::ParseAttributes(
 bool HloParser::ParseAttributeHelper(
     const std::unordered_map<string, AttrConfig>& attrs,
     std::unordered_set<string>* seen_attrs) {
+  LocTy loc = lexer_.GetLoc();
   string name;
   if (!ParseAttributeName(&name)) {
-    return TokenError("error parsing attributes");
+    return Error(loc, "error parsing attributes");
   }
   VLOG(1) << "Parsing attribute " << name;
   if (!seen_attrs->insert(name).second) {
-    return TokenError(Printf("attribute %s already exists", name.c_str()));
+    return Error(loc, Printf("attribute %s already exists", name.c_str()));
   }
   auto attr_it = attrs.find(name);
   if (attr_it == attrs.end()) {
-    return TokenError(Printf("unexpected attribute %s", name.c_str()));
+    return Error(loc, Printf("unexpected attribute %s", name.c_str()));
   }
   AttrTy attr_type = attr_it->second.attr_type;
   void* attr_out_ptr = attr_it->second.result;
   bool success = [&] {
+    LocTy attr_loc = lexer_.GetLoc();
     switch (attr_type) {
       case AttrTy::kInt64: {
         int64 result;
@@ -1436,7 +1818,7 @@ bool HloParser::ParseAttributeHelper(
           return false;
         }
         if (result != static_cast<int32>(result)) {
-          return TokenError("value out of range for int32");
+          return Error(attr_loc, "value out of range for int32");
         }
         static_cast<optional<int32>*>(attr_out_ptr)
             ->emplace(static_cast<int32>(result));
@@ -1449,7 +1831,7 @@ bool HloParser::ParseAttributeHelper(
         }
         if (result > std::numeric_limits<float>::max() ||
             result < std::numeric_limits<float>::lowest()) {
-          return TokenError("value out of range for float");
+          return Error(attr_loc, "value out of range for float");
         }
         static_cast<optional<float>*>(attr_out_ptr)
             ->emplace(static_cast<float>(result));
@@ -1463,6 +1845,14 @@ bool HloParser::ParseAttributeHelper(
         static_cast<optional<HloComputation*>*>(attr_out_ptr)->emplace(result);
         return true;
       }
+      case AttrTy::kFftType: {
+        FftType result;
+        if (!ParseFftType(&result)) {
+          return false;
+        }
+        static_cast<optional<FftType>*>(attr_out_ptr)->emplace(result);
+        return true;
+      }
       case AttrTy::kWindow: {
         Window result;
         if (!ParseWindow(&result)) {
@@ -1548,23 +1938,35 @@ bool HloParser::ParseAttributeHelper(
         static_cast<optional<OpMetadata>*>(attr_out_ptr)->emplace(result);
         return true;
       }
+      case AttrTy::kDistribution: {
+        RandomDistribution result;
+        if (!ParseRandomDistribution(&result)) {
+          return false;
+        }
+        static_cast<optional<RandomDistribution>*>(attr_out_ptr)
+            ->emplace(result);
+        return true;
+      }
     }
   }();
   if (!success) {
-    return TokenError(Printf("error parsing attribute %s", name.c_str()));
+    return Error(loc, Printf("error parsing attribute %s", name.c_str()));
   }
   return true;
 }
 
 bool HloParser::ParseComputationName(HloComputation** value) {
   string name;
+  LocTy loc = lexer_.GetLoc();
   if (!ParseName(&name)) {
-    return TokenError("expects computation name");
+    return Error(loc, "expects computation name");
   }
-  *value = tensorflow::gtl::FindPtrOrNull(computation_pool_, name);
-  if (*value == nullptr) {
-    return TokenError(StrCat("computation does not exist: ", name));
+  std::pair<HloComputation*, LocTy>* computation =
+      tensorflow::gtl::FindOrNull(computation_pool_, name);
+  if (computation == nullptr) {
+    return Error(loc, StrCat("computation does not exist: ", name));
   }
+  *value = computation->first;
   return true;
 }
 
@@ -1572,6 +1974,7 @@ bool HloParser::ParseComputationName(HloComputation** value) {
 // The subattributes can appear in any order. 'size=' is required, others are
 // optional.
 bool HloParser::ParseWindow(Window* window) {
+  LocTy loc = lexer_.GetLoc();
   if (!ParseToken(TokKind::kLbrace, "expected '{' to start window attribute")) {
     return false;
   }
@@ -1581,10 +1984,12 @@ bool HloParser::ParseWindow(Window* window) {
   std::vector<std::vector<int64>> pad;
   std::vector<int64> lhs_dilate;
   std::vector<int64> rhs_dilate;
+  std::vector<int64> rhs_reversal;
   while (lexer_.GetKind() != TokKind::kRbrace) {
+    LocTy attr_loc = lexer_.GetLoc();
     string field_name;
     if (!ParseAttributeName(&field_name)) {
-      return TokenError("expects sub-attributes in window");
+      return Error(attr_loc, "expects sub-attributes in window");
     }
     bool ok = [&] {
       if (field_name == "size") {
@@ -1602,7 +2007,10 @@ bool HloParser::ParseWindow(Window* window) {
       if (field_name == "pad") {
         return ParseWindowPad(&pad);
       }
-      return TokenError(StrCat("unexpected attribute name: ", field_name));
+      if (field_name == "rhs_reversal") {
+        return ParseDxD("rhs_reversal", &rhs_reversal);
+      }
+      return Error(attr_loc, StrCat("unexpected attribute name: ", field_name));
     }();
     if (!ok) {
       return false;
@@ -1610,20 +2018,20 @@ bool HloParser::ParseWindow(Window* window) {
   }
 
   if (size.empty()) {
-    return TokenError(
-        "sub-attribute 'size=' is required in the window attribute");
+    return Error(loc,
+                 "sub-attribute 'size=' is required in the window attribute");
   }
   if (!stride.empty() && stride.size() != size.size()) {
-    return TokenError("expects 'stride=' has the same size as 'size='");
+    return Error(loc, "expects 'stride=' has the same size as 'size='");
   }
   if (!lhs_dilate.empty() && lhs_dilate.size() != size.size()) {
-    return TokenError("expects 'lhs_dilate=' has the same size as 'size='");
+    return Error(loc, "expects 'lhs_dilate=' has the same size as 'size='");
   }
   if (!rhs_dilate.empty() && rhs_dilate.size() != size.size()) {
-    return TokenError("expects 'rhs_dilate=' has the same size as 'size='");
+    return Error(loc, "expects 'rhs_dilate=' has the same size as 'size='");
   }
   if (!pad.empty() && pad.size() != size.size()) {
-    return TokenError("expects 'pad=' has the same size as 'size='");
+    return Error(loc, "expects 'pad=' has the same size as 'size='");
   }
 
   for (int i = 0; i < size.size(); i++) {
@@ -1638,6 +2046,8 @@ bool HloParser::ParseWindow(Window* window) {
         lhs_dilate.empty() ? 1 : lhs_dilate[i]);
     window->mutable_dimensions(i)->set_window_dilation(
         rhs_dilate.empty() ? 1 : rhs_dilate[i]);
+    window->mutable_dimensions(i)->set_window_reversal(
+        rhs_reversal.empty() ? false : (rhs_reversal[i] == 1));
   }
   return ParseToken(TokKind::kRbrace, "expected '}' to end window attribute");
 }
@@ -1769,7 +2179,7 @@ bool HloParser::ParseConvolutionDimensionNumbers(
 //
 //  {[2:3:4], [5:6:7], [8:9]}
 //
-// The the parsed result will be:
+// The parsed result will be:
 //
 //  {/*starts=*/{2, 5, 8}, /*limits=*/{3, 6, 9}, /*strides=*/{4, 7, 1}}
 //
@@ -1783,20 +2193,19 @@ bool HloParser::ParseSliceRanges(SliceRanges* result) {
     return ParseToken(TokKind::kRbrace, "expects '}' to end ranges");
   }
   do {
+    LocTy loc = lexer_.GetLoc();
     ranges.emplace_back();
     if (!ParseInt64List(TokKind::kLsquare, TokKind::kRsquare, TokKind::kColon,
                         &ranges.back())) {
       return false;
     }
-  } while (EatIfPresent(TokKind::kComma));
-
-  for (const auto& range : ranges) {
+    const auto& range = ranges.back();
     if (range.size() != 2 && range.size() != 3) {
-      return TokenError(Printf(
-          "expects [start:limit:step] or [start:limit], but sees %ld elements.",
-          range.size()));
+      return Error(loc, Printf("expects [start:limit:step] or [start:limit], "
+                               "but sees %ld elements.",
+                               range.size()));
     }
-  }
+  } while (EatIfPresent(TokKind::kComma));
 
   for (const auto& range : ranges) {
     result->starts.push_back(range[0]);
@@ -1832,6 +2241,19 @@ bool HloParser::ParseInt64List(const TokKind start, const TokKind end,
       end, StrCat("expects an int64 list to end with ", TokKindToString(end)));
 }
 
+// param_list_to_shape ::= param_list '->' shape
+bool HloParser::ParseParamListToShape(Shape* shape, LocTy* shape_loc) {
+  if (!ParseParamList() || !ParseToken(TokKind::kArrow, "expects '->'")) {
+    return false;
+  }
+  *shape_loc = lexer_.GetLoc();
+  return ParseShape(shape);
+}
+
+bool HloParser::CanBeParamListToShape() {
+  return lexer_.GetKind() == TokKind::kLparen;
+}
+
 // param_list ::= '(' param_list1 ')'
 // param_list1
 //   ::= /*empty*/
@@ -1848,8 +2270,8 @@ bool HloParser::ParseParamList() {
   } else {
     do {
       Shape shape;
-      if (!ParseToken(TokKind::kName, "expects name in parameter") ||
-          !ParseShape(&shape)) {
+      string name;
+      if (!ParseName(&name) || !ParseShape(&shape)) {
         return false;
       }
     } while (EatIfPresent(TokKind::kComma));
@@ -1888,9 +2310,17 @@ bool HloParser::ParseShape(Shape* result) {
   return true;
 }
 
+bool HloParser::CanBeShape() {
+  // A non-tuple shape starts with a kShape token; a tuple shape starts with
+  // '('.
+  return lexer_.GetKind() == TokKind::kShape ||
+         lexer_.GetKind() == TokKind::kLparen;
+}
+
 bool HloParser::ParseName(string* result) {
   VLOG(1) << "ParseName";
-  if (lexer_.GetKind() != TokKind::kName) {
+  if (lexer_.GetKind() != TokKind::kIdent &&
+      lexer_.GetKind() != TokKind::kName) {
     return TokenError("expects name");
   }
   *result = lexer_.GetStrVal();
@@ -1918,15 +2348,16 @@ bool HloParser::ParseString(string* result) {
 }
 
 bool HloParser::ParseDxD(const string& name, std::vector<int64>* result) {
+  LocTy loc = lexer_.GetLoc();
   if (!result->empty()) {
-    return TokenError(
-        Printf("sub-attribute '%s=' already exists", name.c_str()));
+    return Error(loc,
+                 Printf("sub-attribute '%s=' already exists", name.c_str()));
   }
   // 1D
   if (lexer_.GetKind() == TokKind::kInt) {
     int64 number;
     if (!ParseInt64(&number)) {
-      return TokenError(Printf("expects sub-attribute '%s=i'", name.c_str()));
+      return Error(loc, Printf("expects sub-attribute '%s=i'", name.c_str()));
     }
     result->push_back(number);
     return true;
@@ -1935,8 +2366,8 @@ bool HloParser::ParseDxD(const string& name, std::vector<int64>* result) {
   if (lexer_.GetKind() == TokKind::kDxD) {
     string str = lexer_.GetStrVal();
     if (!SplitAndParseAsInts(str, 'x', result)) {
-      return TokenError(
-          Printf("expects sub-attribute '%s=ixj...'", name.c_str()));
+      return Error(loc,
+                   Printf("expects sub-attribute '%s=ixj...'", name.c_str()));
     }
     lexer_.Lex();
     return true;
@@ -1945,8 +2376,9 @@ bool HloParser::ParseDxD(const string& name, std::vector<int64>* result) {
 }
 
 bool HloParser::ParseWindowPad(std::vector<std::vector<int64>>* pad) {
+  LocTy loc = lexer_.GetLoc();
   if (!pad->empty()) {
-    return TokenError("sub-attribute 'pad=' already exists");
+    return Error(loc, "sub-attribute 'pad=' already exists");
   }
   if (lexer_.GetKind() != TokKind::kPad) {
     return TokenError("expects window pad pattern, e.g., '0_0x3_3'");
@@ -1957,8 +2389,8 @@ bool HloParser::ParseWindowPad(std::vector<std::vector<int64>>* pad) {
     std::vector<int64> low_high;
     if (!SplitAndParseAsInts(padding_str[i], '_', &low_high) ||
         low_high.size() != 2) {
-      return TokenError(
-          "expects padding_low and padding_high separated by '_'");
+      return Error(loc,
+                   "expects padding_low and padding_high separated by '_'");
     }
     pad->push_back(low_high);
   }
@@ -1974,15 +2406,16 @@ bool HloParser::ParsePaddingConfig(PaddingConfig* padding) {
   if (lexer_.GetKind() != TokKind::kPad) {
     return TokenError("expects padding config, e.g., '0_0_0x3_3_1'");
   }
+  LocTy loc = lexer_.GetLoc();
   string str = lexer_.GetStrVal();
   std::vector<string> padding_str = Split(str, 'x');
   for (const auto& padding_dim_str : padding_str) {
     std::vector<int64> padding_dim;
     if (!SplitAndParseAsInts(padding_dim_str, '_', &padding_dim) ||
         (padding_dim.size() != 2 && padding_dim.size() != 3)) {
-      return TokenError(
-          "expects padding config pattern like 'low_high_interior' or "
-          "'low_high'");
+      return Error(loc,
+                   "expects padding config pattern like 'low_high_interior' or "
+                   "'low_high'");
     }
     auto* dim = padding->add_dimensions();
     dim->set_edge_padding_low(padding_dim[0]);
@@ -2024,20 +2457,64 @@ bool HloParser::ParseMetadata(OpMetadata* metadata) {
 
 bool HloParser::ParseOpcode(HloOpcode* result) {
   VLOG(1) << "ParseOpcode";
-  if (lexer_.GetKind() != TokKind::kOpcode) {
+  if (lexer_.GetKind() != TokKind::kIdent) {
     return TokenError("expects opcode");
   }
-  *result = lexer_.GetOpcodeVal();
+  string val = lexer_.GetStrVal();
+  auto status_or_result = StringToHloOpcode(val);
+  if (!status_or_result.ok()) {
+    return TokenError(
+        Printf("expects opcode but sees: %s, error: %s", val.c_str(),
+               status_or_result.status().error_message().c_str()));
+  }
+  *result = status_or_result.ValueOrDie();
+  lexer_.Lex();
+  return true;
+}
+
+bool HloParser::ParseFftType(FftType* result) {
+  VLOG(1) << "ParseFftType";
+  if (lexer_.GetKind() != TokKind::kIdent) {
+    return TokenError("expects fft type");
+  }
+  string val = lexer_.GetStrVal();
+  if (!FftType_Parse(val, result) || !FftType_IsValid(*result)) {
+    return TokenError(Printf("expects fft type but sees: %s", val.c_str()));
+  }
   lexer_.Lex();
   return true;
 }
 
 bool HloParser::ParseFusionKind(HloInstruction::FusionKind* result) {
   VLOG(1) << "ParseFusionKind";
-  if (lexer_.GetKind() != TokKind::kFusionKind) {
+  if (lexer_.GetKind() != TokKind::kIdent) {
     return TokenError("expects fusion kind");
   }
-  *result = lexer_.GetFusionKindVal();
+  string val = lexer_.GetStrVal();
+  auto status_or_result = StringToFusionKind(val);
+  if (!status_or_result.ok()) {
+    return TokenError(
+        Printf("expects fusion kind but sees: %s, error: %s", val.c_str(),
+               status_or_result.status().error_message().c_str()));
+  }
+  *result = status_or_result.ValueOrDie();
+  lexer_.Lex();
+  return true;
+}
+
+bool HloParser::ParseRandomDistribution(RandomDistribution* result) {
+  VLOG(1) << "ParseRandomDistribution";
+  if (lexer_.GetKind() != TokKind::kIdent) {
+    return TokenError("expects random distribution");
+  }
+  string val = lexer_.GetStrVal();
+  auto status_or_result = StringToRandomDistribution(val);
+  if (!status_or_result.ok()) {
+    return TokenError(
+        Printf("expects random distribution but sees: %s, error: %s",
+               val.c_str(), status_or_result.status().error_message().c_str()));
+  }
+  *result = status_or_result.ValueOrDie();
   lexer_.Lex();
   return true;
 }
@@ -2103,20 +2580,24 @@ bool HloParser::EatIfPresent(TokKind kind) {
   return true;
 }
 
-bool HloParser::AddInstruction(const string& name,
-                               HloInstruction* instruction) {
-  auto result = instruction_pool_.insert({name, instruction});
+bool HloParser::AddInstruction(const string& name, HloInstruction* instruction,
+                               LocTy name_loc) {
+  auto result = instruction_pool_.insert({name, {instruction, name_loc}});
   if (!result.second) {
-    return TokenError(StrCat("instruction already exists: ", name));
+    Error(name_loc, StrCat("instruction already exists: ", name));
+    return Error(/*loc=*/result.first->second.second,
+                 "instruction previously defined here");
   }
   return true;
 }
 
-bool HloParser::AddComputation(const string& name,
-                               HloComputation* computation) {
-  auto result = computation_pool_.insert({name, computation});
+bool HloParser::AddComputation(const string& name, HloComputation* computation,
+                               LocTy name_loc) {
+  auto result = computation_pool_.insert({name, {computation, name_loc}});
   if (!result.second) {
-    return TokenError(StrCat("computation already exists: ", name));
+    Error(name_loc, StrCat("computation already exists: ", name));
+    return Error(/*loc=*/result.first->second.second,
+                 "computation previously defined here");
   }
   return true;
 }
@@ -2127,7 +2608,7 @@ StatusOr<std::unique_ptr<HloModule>> Parse(StringPiece str,
                                            const HloModuleConfig& config) {
   HloParser parser(str, config);
   if (!parser.Run()) {
-    return InvalidArgument("Syntax error: %s", parser.GetError().c_str());
+    return InvalidArgument("Syntax error:\n%s", parser.GetError().c_str());
   }
   return parser.ConsumeHloModule();
 }
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
index 90cdb87a1ebcf59d291eebd52963a130f19f4403..b8c6b59204f897c7dc07b846370b5b776a19a808 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
@@ -25,7 +25,6 @@ namespace tools {
 namespace {
 
 using tensorflow::StringPiece;
-using tensorflow::strings::StrCat;
 
 struct TestData {
   string test_name;
@@ -46,7 +45,7 @@ std::vector<TestData> CreateTestCases() {
 // ax + y
 {
 "AxpyParam",
-R"(HloModule axpy_module:
+R"(HloModule axpy_module
 
 ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
   %alpha = f32[] parameter(0)
@@ -62,7 +61,7 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
 // pred constant
 {
 "ConstantPred",
-R"(HloModule constant_pred_module:
+R"(HloModule constant_pred_module
 
 ENTRY %constant_pred () -> pred[] {
   ROOT %constant = pred[] constant(true), metadata={op_type="const" op_name="\"it\'s not a problem\n" source_file="path/to/test.cc" source_line=68}
@@ -73,7 +72,7 @@ ENTRY %constant_pred () -> pred[] {
 // s32 constant
 {
 "ConstantS32",
-R"(HloModule constant_s32_module:
+R"(HloModule constant_s32_module
 
 ENTRY %constant_s32 () -> s32[] {
   ROOT %constant = s32[] constant(-42)
@@ -84,7 +83,7 @@ ENTRY %constant_s32 () -> s32[] {
 // f32 constant, but the value is not a decimal
 {
 "ConstantF32",
-R"(HloModule ConstantF32_module:
+R"(HloModule ConstantF32_module
 
 ENTRY %ConstantF32.v4 () -> f32[] {
   ROOT %constant = f32[] constant(42)
@@ -95,7 +94,7 @@ ENTRY %ConstantF32.v4 () -> f32[] {
 // f32 constant, rank 1 empty array.
 {
 "ConstantF32R1Empty",
-R"(HloModule ConstantF32Empty_module:
+R"(HloModule ConstantF32Empty_module
 
 ENTRY %ConstantF32Empty.v4 () -> f32[0] {
   ROOT %constant = f32[0]{0} constant({})
@@ -106,7 +105,7 @@ ENTRY %ConstantF32Empty.v4 () -> f32[0] {
 // f32 constant, rank 4 empty array.
 {
 "ConstantF32R4Empty",
-R"(HloModule ConstantF32R4Empty_module:
+R"(HloModule ConstantF32R4Empty_module
 
 ENTRY %ConstantF32R4Empty.v4 () -> f32[2,0,4,3] {
   ROOT %constant = f32[2,0,4,3]{3,2,1,0} constant(f32[2,0,4,3] { { /*i0=0*/ }, { /*i0=1*/ } })
@@ -117,7 +116,7 @@ ENTRY %ConstantF32R4Empty.v4 () -> f32[2,0,4,3] {
 // constant 4D
 {
 "Constant4D",
-R"(HloModule Small_3x2x1x1_module:
+R"(HloModule Small_3x2x1x1_module
 
 ENTRY %Small_3x2x1x1.v1 () -> f32[3,2,1,1] {
   ROOT %constant = f32[3,2,1,1]{3,2,1,0} constant(f32[3,2,1,1] { { /*i0=0*/ { /*i1=0*/ {-1} }, { /*i1=1*/ {4.1} } }, { /*i0=1*/ { /*i1=0*/ {2} }, { /*i1=1*/ {4.1} } }, { /*i0=2*/ { /*i1=0*/ {5} }, { /*i1=1*/ {4.4} } } })
@@ -128,7 +127,7 @@ ENTRY %Small_3x2x1x1.v1 () -> f32[3,2,1,1] {
 // non-finite constants: nan, inf, -inf
 {
 "ConstantNonFinite",
-R"(HloModule IsFiniteR1F32s_module:
+R"(HloModule IsFiniteR1F32s_module
 
 ENTRY %IsFiniteR1F32s.v2 () -> pred[6] {
   %constant = f32[6]{0} constant({nan, 7, nan, -1, inf, -inf})
@@ -140,18 +139,29 @@ ENTRY %IsFiniteR1F32s.v2 () -> pred[6] {
 // constant f16
 {
 "ConstantF16",
-R"(HloModule ConstantF16_module:
+R"(HloModule ConstantF16_module
 
 ENTRY %ConstantF16.v4 () -> f16[] {
   ROOT %constant = f16[] constant(500)
 }
 
+)"
+},
+// bf16
+{
+"BF16",
+R"(HloModule BF16
+
+ENTRY %BF16.v4 () -> bf16[] {
+  ROOT %constant = bf16[] constant(500)
+}
+
 )"
 },
 // constant + constant
 {
 "AddConstants",
-R"(HloModule add_constants_module:
+R"(HloModule add_constants_module
 
 ENTRY %add_constants () -> f32[] {
   %constant = f32[] constant(3.14)
@@ -163,7 +173,7 @@ ENTRY %add_constants () -> f32[] {
 // tuple constant
 {
 "TupleConstant",
-R"(HloModule TupleConstant_module:
+R"(HloModule TupleConstant_module
 
 ENTRY %TupleConstant.v1 () -> (f32[2,1], f32[2]) {
   ROOT %constant = (f32[2,1]{1,0}, f32[2]{0}) constant((f32[2,1], f32[2]) ( f32[2,1] { { 1 }, { 2 } }, {2, 42} ))
@@ -174,7 +184,7 @@ ENTRY %TupleConstant.v1 () -> (f32[2,1], f32[2]) {
 // v1 > v2 ? v1 : v2
 {
 "SelectR1F32",
-R"(HloModule SelectR1F32WithCmpR1F32sFromParamsSmall_module:
+R"(HloModule SelectR1F32WithCmpR1F32sFromParamsSmall_module
 
 ENTRY %SelectR1F32WithCmpR1F32sFromParamsSmall.v4 (v1: f32[4], v2: f32[4]) -> f32[4] {
   %v1 = f32[4]{0} parameter(0), sharding={maximal device=1}
@@ -188,7 +198,7 @@ ENTRY %SelectR1F32WithCmpR1F32sFromParamsSmall.v4 (v1: f32[4], v2: f32[4]) -> f3
 // empty tuple
 {
 "EmptyTupleCreate",
-R"(HloModule EmptyTupleCreate_module:
+R"(HloModule EmptyTupleCreate_module
 
 ENTRY %EmptyTupleCreate.v1 () -> () {
   ROOT %tuple = () tuple()
@@ -199,7 +209,7 @@ ENTRY %EmptyTupleCreate.v1 () -> () {
 // tuple
 {
 "TupleCreate",
-R"(HloModule TupleCreate_module:
+R"(HloModule TupleCreate_module
 
 ENTRY %TupleCreate.v4 (v1: f32[], v2: f32[3], v3: f32[2,3]) -> (f32[], f32[3], f32[2,3]) {
   %v1 = f32[] parameter(0)
@@ -212,7 +222,7 @@ ENTRY %TupleCreate.v4 (v1: f32[], v2: f32[3], v3: f32[2,3]) -> (f32[], f32[3], f
 },
 {
 "ShardedTupleCreate",
-R"(HloModule ShardedTupleCreate_module:
+R"(HloModule ShardedTupleCreate_module
 
 ENTRY %ShardedTupleCreate.v4 (v1: f32[], v2: f32[3], v3: f32[2,3]) -> (f32[], f32[3], f32[2,3]) {
   %v1 = f32[] parameter(0)
@@ -227,7 +237,7 @@ ENTRY %ShardedTupleCreate.v4 (v1: f32[], v2: f32[3], v3: f32[2,3]) -> (f32[], f3
 // while (result < 5) { result = result + 1; }
 {
 "WhileWithScalarS32Result",
-R"(HloModule WhileWithScalarS32Result_module:
+R"(HloModule WhileWithScalarS32Result_module
 
 %body.v3 (prev.1: s32[]) -> s32[] {
   %constant = s32[] constant(1)
@@ -251,7 +261,7 @@ ENTRY %WhileWithScalarS32Result.v2 () -> s32[] {
 // send and recv
 {
 "SendRecv",
-R"(HloModule TwoSendRecvBothWayRecvFist_module:
+R"(HloModule TwoSendRecvBothWayRecvFist_module
 
 ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
   %recv = (f32[], u32[]) recv(), channel_id=15, sharding={maximal device=1}
@@ -266,7 +276,7 @@ ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
 // get-tuple-element
 {
 "GetTupleElement",
-R"(HloModule GetTupleElement_module:
+R"(HloModule GetTupleElement_module
 
 ENTRY %GetTupleElement.v4 () -> s32[2,3] {
   %constant = f32[3]{0} constant({1, 2, 3})
@@ -280,7 +290,7 @@ ENTRY %GetTupleElement.v4 () -> s32[2,3] {
 // call
 {
 "Call",
-R"(HloModule CallR0F32IdentityScalar_module:
+R"(HloModule CallR0F32IdentityScalar_module
 
 %Identity.v1 (x: f32[]) -> f32[] {
   ROOT %x = f32[] parameter(0)
@@ -296,7 +306,7 @@ ENTRY %CallR0F32IdentityScalar.v2 () -> f32[] {
 // reduce window
 {
 "ReduceWindow",
-R"(HloModule R4UnitWindow_module:
+R"(HloModule R4UnitWindow_module
 
 %add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] {
   %lhs = f32[] parameter(0)
@@ -315,7 +325,7 @@ ENTRY %R4UnitWindow.v3 (operand: f32[13,12,8,15]) -> f32[13,3,8,15] {
 // reduce window on scalar
 {
 "ReduceWindowScalar",
-R"(HloModule reduce_window_scalar:
+R"(HloModule reduce_window_scalar
 
 %add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] {
   %lhs = f32[] parameter(0)
@@ -334,7 +344,7 @@ ENTRY %R4UnitWindowScalar () -> f32[] {
 // convolution
 {
 "Convolution",
-R"(HloModule Convolve1D1Window_0_module:
+R"(HloModule Convolve1D1Window_0_module
 
 ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] {
   %input = f32[1,2,1]{2,1,0} parameter(0)
@@ -348,7 +358,7 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2
 // convolution rank 2
 {
 "ConvolutionR2",
-R"(HloModule ConvolveR2_module:
+R"(HloModule ConvolveR2_module
 
 ENTRY %ConvolveR2.v3 (input: f32[1,2], filter: f32[1,1]) -> f32[1,2] {
   %input = f32[1,2]{1,0} parameter(0)
@@ -356,12 +366,25 @@ ENTRY %ConvolveR2.v3 (input: f32[1,2], filter: f32[1,1]) -> f32[1,2] {
   ROOT %convolution = f32[1,2]{0,1} convolution(f32[1,2]{1,0} %input, f32[1,1]{1,0} %filter), dim_labels=bf_io->bf
 }
 
+)"
+},
+// convolution backward
+{
+"ConvolutionBackward",
+R"(HloModule ConvolveBackward_module
+
+ENTRY %ConvolveBackward (input: f32[128,7,7,512], filter: f32[3,3,512,512]) -> f32[128,14,14,512] {
+  %input = f32[128,7,7,512]{0,3,2,1} parameter(0)
+  %filter = f32[3,3,512,512]{3,2,1,0} parameter(1)
+  ROOT %convolution-base-dilated = f32[128,14,14,512]{0,3,2,1} convolution(f32[128,7,7,512]{0,3,2,1} %input, f32[3,3,512,512]{3,2,1,0} %filter), window={size=3x3 pad=1_2x1_2 lhs_dilate=2x2 rhs_reversal=1x1}, dim_labels=b01f_01oi->b01f
+}
+
 )"
 },
 // reverse(constant)
 {
 "Reverse4D",
-R"(HloModule Reverse4DFloatArrayOnDim01_module:
+R"(HloModule Reverse4DFloatArrayOnDim01_module
 
 ENTRY %Reverse4DFloatArrayOnDim01.v2 () -> f32[4,3,2,1] {
   %constant = f32[4,3,2,1]{0,1,2,3} constant(f32[4,3,2,1] { { /*i0=0*/ { /*i1=0*/ {1}, {2} }, { /*i1=1*/ {3}, {4} }, { /*i1=2*/ {5}, {6} } }, { /*i0=1*/ { /*i1=0*/ {7}, {8} }, { /*i1=1*/ {9}, {10} }, { /*i1=2*/ {11}, {12} } }, { /*i0=2*/ { /*i1=0*/ {13}, {14} }, { /*i1=1*/ {15}, {16} }, { /*i1=2*/ {17}, {18} } }, { /*i0=3*/ { /*i1=0*/ {19}, {20} }, { /*i1=1*/ {21}, {22} }, { /*i1=2*/ {23}, {24} } } })
@@ -373,7 +396,7 @@ ENTRY %Reverse4DFloatArrayOnDim01.v2 () -> f32[4,3,2,1] {
 // concat
 {
 "Concat",
-R"(HloModule Concat2x3With2x5_module:
+R"(HloModule Concat2x3With2x5_module
 
 ENTRY %Concat2x3With2x5.v3 () -> f32[2,8] {
   %constant = f32[2,3]{1,0} constant(f32[2,3] { { 0, 1, 2 }, { 1000, 1001, 1002 } })
@@ -381,50 +404,12 @@ ENTRY %Concat2x3With2x5.v3 () -> f32[2,8] {
   ROOT %concatenate = f32[2,8]{1,0} concatenate(f32[2,3]{1,0} %constant, f32[2,5]{1,0} %constant.1), dimensions={1}
 }
 
-)"
-},
-// map
-{
-"Map",
-R"(HloModule MapBinaryAdder_module:
-
-%add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] {
-  %lhs = f32[] parameter(0)
-  %rhs = f32[] parameter(1)
-  ROOT %add = f32[] add(f32[] %lhs, f32[] %rhs)
-}
-
-ENTRY %MapBinaryAdder.v3 (param0: f32[4], param1: f32[4]) -> f32[4] {
-  %param0 = f32[4]{0} parameter(0)
-  %param1 = f32[4]{0} parameter(1)
-  ROOT %map = f32[4]{0} map(f32[4]{0} %param0, f32[4]{0} %param1), to_apply=%add_F32.v3
-}
-
-)"
-},
-// reduce
-{
-"Reduce",
-R"(HloModule ReduceR3ToR2_module:
-
-%add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] {
-  %lhs = f32[] parameter(0)
-  %rhs = f32[] parameter(1)
-  ROOT %add = f32[] add(f32[] %lhs, f32[] %rhs)
-}
-
-ENTRY %ReduceR3ToR2.v3 (input: f32[8,16,256]) -> f32[8,16] {
-  %input = f32[8,16,256]{2,1,0} parameter(0)
-  %constant = f32[] constant(0)
-  ROOT %reduce = f32[8,16]{1,0} reduce(f32[8,16,256]{2,1,0} %input, f32[] %constant), dimensions={2}, to_apply=%add_F32.v3
-}
-
 )"
 },
 // select and scatter
 {
 "SelectAndScatter",
-R"(HloModule R4F32OverlapSmall_module:
+R"(HloModule R4F32OverlapSmall_module
 
 %ge_F32.v3 (lhs: f32[], rhs: f32[]) -> pred[] {
   %lhs = f32[] parameter(0)
@@ -450,7 +435,7 @@ ENTRY %R4F32OverlapSmall.v4 () -> f32[4,5,1,1] {
 // select and scatter on scalar
 {
 "SelectAndScatterScalar",
-R"(HloModule select_and_scatter_scalar:
+R"(HloModule select_and_scatter_scalar
 
 %ge_F32.v3 (lhs: f32[], rhs: f32[]) -> pred[] {
   %lhs = f32[] parameter(0)
@@ -476,7 +461,7 @@ ENTRY %SelectAndScatterScalar () -> f32[] {
 // slice
 {
 "Slice",
-R"(HloModule slice_module:
+R"(HloModule slice_module
 
 ENTRY %slice.v2 (p0: f32[3,3,4,4]) -> f32[3,3,2,4] {
   %p0 = f32[3,3,4,4]{3,2,1,0} parameter(0)
@@ -488,7 +473,7 @@ ENTRY %slice.v2 (p0: f32[3,3,4,4]) -> f32[3,3,2,4] {
 // slice, no stride
 {
 "SliceNoStride",
-R"(HloModule Slice3x3x3_To_1x3x3_F32_module:
+R"(HloModule Slice3x3x3_To_1x3x3_F32_module
 
 ENTRY %Slice3x3x3_To_1x3x3_F32.v2 () -> f32[1,3,3] {
   %constant = f32[3,3,3]{2,1,0} constant(f32[3,3,3] { { { 0, 1, 2 }, { 3, 4, 5 }, { 6, 7, 8 } }, { { 9, 10, 11 }, { 12, 13, 14 }, { 15, 16, 17 } }, { { 18, 19, 20 }, { 21, 22, 23 }, { 24, 25, 26 } } })
@@ -500,7 +485,7 @@ ENTRY %Slice3x3x3_To_1x3x3_F32.v2 () -> f32[1,3,3] {
 // slice R0
 {
 "SliceR0",
-R"(HloModule SliceR0_module:
+R"(HloModule SliceR0_module
 
 ENTRY %SliceR0.v2 () -> s32[] {
   %constant = s32[] constant(1)
@@ -512,7 +497,7 @@ ENTRY %SliceR0.v2 () -> s32[] {
 // transpose
 {
 "Transpose",
-R"(HloModule Transpose_module:
+R"(HloModule Transpose_module
 
 ENTRY %Transpose.v2 () -> s32[1,2,3] {
   %constant = s32[1,2,3]{2,1,0} constant(s32[1,2,3] { { { 1, 2, 3 }, { 4, 5, 6 } } })
@@ -524,7 +509,7 @@ ENTRY %Transpose.v2 () -> s32[1,2,3] {
 // Dynamic slice
 {
 "DynamicSlice",
-R"(HloModule DynamicSlice_module:
+R"(HloModule DynamicSlice_module
 
 ENTRY %DynamicSlice.v5 (original_parameter: s32[2,2,258], start_index: s32[1]) -> s32[2,2,258] {
   %original_parameter = s32[2,2,258]{2,1,0} parameter(0)
@@ -539,7 +524,7 @@ ENTRY %DynamicSlice.v5 (original_parameter: s32[2,2,258], start_index: s32[1]) -
 // Dynamic update slice
 {
 "DynamicUpdateSlice",
-R"(HloModule DynamicUpdateSlice_module:
+R"(HloModule DynamicUpdateSlice_module
 
 ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_indices: s32[4]) -> s32[1,1,25,1] {
   %input = s32[1,1,25,1]{3,2,1,0} parameter(0)
@@ -553,7 +538,7 @@ ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_
 // batch norm training
 {
 "BatchNormTraining",
-R"(HloModule BasicTraining_module:
+R"(HloModule BasicTraining_module
 
 ENTRY %BasicTraining.v4 () -> (f32[2,2,1,2], f32[2], f32[2]) {
   %constant = f32[2,2,1,2]{3,2,1,0} constant(f32[2,2,1,2] { { /*i0=0*/ { /*i1=0*/ {1, 2} }, { /*i1=1*/ {3, 4} } }, { /*i0=1*/ { /*i1=0*/ {5, 6} }, { /*i1=1*/ {7, 8} } } })
@@ -567,7 +552,7 @@ ENTRY %BasicTraining.v4 () -> (f32[2,2,1,2], f32[2], f32[2]) {
 // batch norm inference
 {
 "BatchNormInference",
-R"(HloModule BatchNormInference_module:
+R"(HloModule BatchNormInference_module
 
 ENTRY %BatchNormInference.v6 (input: f32[2,2,2,2], offset: f32[2], scale: f32[2], mean: f32[2], variance: f32[2]) -> f32[2,2,2,2] {
   %input = f32[2,2,2,2]{3,2,1,0} parameter(0)
@@ -583,7 +568,7 @@ ENTRY %BatchNormInference.v6 (input: f32[2,2,2,2], offset: f32[2], scale: f32[2]
 // batch norm grad
 {
 "BatchNormGrad",
-R"(HloModule BatchNormGrad_module:
+R"(HloModule BatchNormGrad_module
 
 ENTRY %BatchNormGrad.v4 (input: f32[2,2,2,2], scale: f32[2], mean: f32[2], variance: f32[2], grad_output: f32[2,2,2,2]) -> (f32[2,2,2,2], f32[2], f32[2]) {
   %input = f32[2,2,2,2]{3,2,1,0} parameter(0)
@@ -594,12 +579,60 @@ ENTRY %BatchNormGrad.v4 (input: f32[2,2,2,2], scale: f32[2], mean: f32[2], varia
   ROOT %batch-norm-grad = (f32[2,2,2,2]{3,2,1,0}, f32[2]{0}, f32[2]{0}) batch-norm-grad(f32[2,2,2,2]{3,2,1,0} %input, f32[2]{0} %scale, f32[2]{0} %mean, f32[2]{0} %variance, f32[2,2,2,2]{3,2,1,0} %grad_output), epsilon=0.001, feature_index=0
 }
 
+)"
+},
+// fft
+{
+"Fft",
+R"(HloModule Fft_module
+
+ENTRY %Fft (input: c64[8,32]) -> c64[8,32] {
+  %input = c64[8,32]{1,0} parameter(0)
+  ROOT %fft = c64[8,32]{1,0} fft(c64[8,32]{1,0} %input), fft_type=FFT, fft_length={32}
+}
+
+)"
+},
+// ifft
+{
+"Ifft2d",
+R"(HloModule Ifft2d_module
+
+ENTRY %Ifft2d (input: c64[5,8,32]) -> c64[5,8,32] {
+  %input = c64[5,8,32]{2,1,0} parameter(0)
+  ROOT %fft = c64[5,8,32]{2,1,0} fft(c64[5,8,32]{2,1,0} %input), fft_type=IFFT, fft_length={8,32}
+}
+
+)"
+},
+// rfft2d
+{
+"Rfft2d",
+R"(HloModule Rfft2d_module
+
+ENTRY %Rfft2d (input: f32[5,64,32]) -> c64[5,64,17] {
+  %input = f32[5,64,32]{2,1,0} parameter(0)
+  ROOT %fft = c64[5,64,17]{2,1,0} fft(f32[5,64,32]{2,1,0} %input), fft_type=RFFT, fft_length={64,32}
+}
+
+)"
+},
+// irfft3d
+{
+"Irfft3d",
+R"(HloModule Irfft3d_module
+
+ENTRY %Irfft3d (input: c64[5,64,128,33]) -> f32[5,64,128,64] {
+  %input = c64[5,64,128,33]{3,2,1,0} parameter(0)
+  ROOT %fft = f32[5,64,128,64]{3,2,1,0} fft(c64[5,64,128,33]{3,2,1,0} %input), fft_type=IRFFT, fft_length={64,128,64}
+}
+
 )"
 },
 // pad
 {
 "Pad",
-R"(HloModule Pad1DS3Array_module:
+R"(HloModule Pad1DS3Array_module
 
 ENTRY %Pad1DS3Array.v3 () -> f32[8] {
   %constant = f32[3]{0} constant({1, 2, 3})
@@ -612,7 +645,7 @@ ENTRY %Pad1DS3Array.v3 () -> f32[8] {
 // pad has interior
 {
 "PadHasInterior",
-R"(HloModule PadHasInterior_module:
+R"(HloModule PadHasInterior_module
 
 ENTRY %PadHasInterior.v3 (input: f32[1,25,7,7]) -> f32[1,25,17,11] {
   %input = f32[1,25,7,7]{3,2,1,0} parameter(0)
@@ -620,12 +653,25 @@ ENTRY %PadHasInterior.v3 (input: f32[1,25,7,7]) -> f32[1,25,17,11] {
   ROOT %pad = f32[1,25,17,11]{3,2,1,0} pad(f32[1,25,7,7]{3,2,1,0} %input, f32[] %constant), padding=0_0_0x0_0_0x2_2_1x2_2_0
 }
 
+)"
+},
+// Negative padding
+{
+"PadHasNegativePadding",
+R"(HloModule PadHasNegativePadding_module
+
+ENTRY %PadHasNegativePadding (input: f32[1,25,7,7,10]) -> f32[1,15,6,3,29] {
+  %input = f32[1,25,7,7,10]{4,3,2,1,0} parameter(0)
+  %constant = f32[] constant(-5.123)
+  ROOT %pad = f32[1,15,6,3,29]{4,3,2,1,0} pad(f32[1,25,7,7,10]{4,3,2,1,0} %input, f32[] %constant), padding=0_0_0x0_-10_0x0_-1_0x-2_-2_0x-1_-1_3
+}
+
 )"
 },
 // fusion
 {
 "Fusion",
-R"(HloModule fusion_module:
+R"(HloModule fusion_module
 
 %fused_computation (constant.param_0: f32[3,2,1,1], constant.1.param_1: f32[2]) -> f32[3,2,1,1] {
   %constant.param_0 = f32[3,2,1,1]{3,2,1,0} parameter(0)
@@ -640,22 +686,182 @@ ENTRY %fusion.v3 () -> f32[3,2,1,1] {
   ROOT %fusion = f32[3,2,1,1]{3,2,1,0} fusion(f32[3,2,1,1]{3,2,1,0} %constant, f32[2]{0} %constant.1), kind=kLoop, calls=%fused_computation
 }
 
+)"
+},
+{
+"Sparse",
+R"(HloModule sparse_f32
+
+ENTRY %sparse () -> f32[2,3,4] {
+  ROOT %foo = f32[2,3,4]sparse{10} constant(f32[2,3,4]{[0, 1, 2]: 1, [1, 2, 3]: 2, [2, 3, 4]: 3})
+}
+
+)"
+},
+{
+"SparseEmpty",
+R"(HloModule sparse_f32_empty
+
+ENTRY %sparse_f32_empty () -> f32[2,3,4] {
+  ROOT %foo = f32[2,3,4]sparse{10} constant(f32[2,3,4]{})
+}
+
+)"
+},
+{
+"SparseR1",
+R"(HloModule sparse_f32_r1
+
+ENTRY %sparse_f32_r1 () -> f32[9] {
+  ROOT %foo = f32[9]sparse{10} constant(f32[9]{1: 2, 3: 4, 5: 6})
+}
+
+)"
+},
+  });
+  // clang-format on
+}
+
+std::vector<TestData> CreateShortTestCases() {
+  // clang-format off
+  return std::vector<TestData>({
+// map
+{
+"Map",
+R"(HloModule MapBinaryAdder_module
+
+add_F32.v3 {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT add = f32[] add(lhs, rhs)
+}
+
+ENTRY MapBinaryAdder.v3 {
+  param0 = f32[4]{0} parameter(0)
+  param1 = f32[4]{0} parameter(1)
+  ROOT map = f32[4]{0} map(param0, param1), to_apply=add_F32.v3
+}
+
+)"
+},
+// reduce
+{
+"Reduce",
+R"(HloModule ReduceR3ToR2_module
+
+add_F32.v3 {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT add = f32[] add(lhs, rhs)
+}
+
+ENTRY ReduceR3ToR2.v3 {
+  input = f32[8,16,256]{2,1,0} parameter(0)
+  constant = f32[] constant(0)
+  ROOT reduce = f32[8,16]{1,0} reduce(input, constant), dimensions={2}, to_apply=add_F32.v3
+}
+
 )"
 },
 // infeed/outfeed
 {
 "InfeedOutfeed",
-R"(HloModule outfeed_module:
+R"(HloModule outfeed_module
+
+ENTRY InfeedToOutfeed {
+  infeed = (u32[3]{0}, pred[]) infeed()
+  outfeed = () outfeed(infeed)
+  ROOT infeed.1 = (u32[3]{0}, pred[]) infeed()
+  outfeed.1 = () outfeed(infeed.1)
+}
+
+)"
+},
+// Rng
+{
+"Rng",
+R"(HloModule rng_module
+
+ENTRY Rng {
+  constant = f32[] constant(0)
+  constant.1 = f32[] constant(1)
+  ROOT rng = f32[8]{0} rng(constant, constant.1), distribution=rng_uniform
+}
+
+)"
+},
+// Reduce precision
+{
+"ReducePrevison",
+R"(HloModule reduce_precision
+
+ENTRY ReducePrecision {
+  constant = f32[1]{0} constant({3.14159})
+  ROOT reduce-precision = f32[1]{0} reduce-precision(constant), exponent_bits=8, mantissa_bits=10
+}
+
+)"
+},
+// Conditional
+{
+"Conditional",
+R"(HloModule conditional
+
+Negate {
+  x = f32[] parameter(0)
+  ROOT negate = f32[] negate(x)
+}
+
+Identity {
+  y = f32[] parameter(0)
+  ROOT copy = f32[] copy(y)
+}
 
-ENTRY %InfeedToOutfeed () -> (u32[3], pred[]) {
-  %infeed = (u32[3]{0}, pred[]) infeed()
-  %outfeed = () outfeed((u32[3]{0}, pred[]) %infeed)
-  ROOT %infeed.1 = (u32[3]{0}, pred[]) infeed()
-  %outfeed.1 = () outfeed((u32[3]{0}, pred[]) %infeed.1)
+ENTRY Parameters1.v4 {
+  constant = pred[] constant(true)
+  constant.1 = f32[] constant(56)
+  constant.2 = f32[] constant(12)
+  ROOT conditional = f32[] conditional(constant, constant.1, constant.2), true_computation=Negate, false_computation=Identity
 }
 
 )"
+},
+// CustomCall
+{
+"CustomCall",
+R"(HloModule custom_call
+
+ENTRY CustomCall {
+  constant = f32[1]{0} constant({12345})
+  ROOT custom-call = f32[1,2,3]{0,2,1} custom-call(constant), custom_call_target="foo\"bar"
+}
+
+)"
+},
+// Variables with non-default names
+{
+"NonDefaultNames",
+R"(HloModule add_constants_module
+
+ENTRY add_constants {
+  foo = f32[] constant(3.14)
+  ROOT bar = f32[] add(foo, foo)
 }
+
+)"
+},
+{
+"Dot",
+R"(HloModule dot
+
+ENTRY dot {
+  a = f32[2,10]{1,0} parameter(0)
+  b = f32[10,3]{1,0} parameter(1)
+  ROOT dot = f32[2,3]{1,0} dot(a, b), lhs_batch_dims={0}, lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+
+)"
+},
   });
   // clang-format on
 }
@@ -674,18 +880,35 @@ class HloParserTest : public ::testing::Test,
   void ExpectEqual() {
     const string& original = GetParam().module_string;
     auto result = Parse(original);
-    TF_EXPECT_OK(result.status());
+    TF_ASSERT_OK(result.status());
+    EXPECT_EQ(original, result.ValueOrDie()->ToString(
+                            HloPrintOptions().set_print_large_constants(true)));
+  }
+};
+
+class HloParserShortTest : public HloParserTest {
+ protected:
+  void ExpectEqualShort() {
+    const string& original = GetParam().module_string;
+    auto result = Parse(original);
+    TF_ASSERT_OK(result.status());
     EXPECT_EQ(original,
-              result.ValueOrDie()->ToString(/*include_large_constants=*/true));
+              result.ValueOrDie()->ToString(HloPrintOptions::ShortParsable()));
   }
 };
 
 TEST_P(HloParserTest, Run) { ExpectEqual(); }
 
+TEST_P(HloParserShortTest, Run) { ExpectEqualShort(); }
+
 INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation, HloParserTest,
                         ::testing::ValuesIn(CreateTestCases()),
                         TestDataToString);
 
+INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation, HloParserShortTest,
+                        ::testing::ValuesIn(CreateShortTestCases()),
+                        TestDataToString);
+
 TEST_F(HloParserTest, Empty) {
   const string original = "";
   auto result = Parse(original);
@@ -749,7 +972,7 @@ ENTRY %blabla (x: f32[]) -> pred[] {
 }
 
 TEST_F(HloParserTest, MoreConstants) {
-  const string original = R"(HloModule SelectScalarS32True_module:
+  const string original = R"(HloModule SelectScalarS32True_module
 
 ENTRY %SelectScalarS32True.v4 () -> s32[] {
   %constant.2 = pred[] constant(true)
@@ -766,7 +989,7 @@ ENTRY %SelectScalarS32True.v4 () -> s32[] {
 }
 
 TEST_F(HloParserTest, LiteralDimensionsMismatch_1) {
-  const string original = R"(HloModule some_2_module:
+  const string original = R"(HloModule some_2_module
 
 ENTRY %some_2 () -> f32[2] {
   ROOT %constant = f32[2]{0} constant({1,{2}})
@@ -780,7 +1003,7 @@ ENTRY %some_2 () -> f32[2] {
 }
 
 TEST_F(HloParserTest, LiteralDimensionsMismatch_2) {
-  const string original = R"(HloModule some_2x3_module:
+  const string original = R"(HloModule some_2x3_module
 
 ENTRY %some_2x3 () -> f32[2,3] {
   ROOT %constant = f32[2,3]{1,0} constant(f32[2,3] {1, 2, 3, 4, 5, 6})
@@ -794,7 +1017,7 @@ ENTRY %some_2x3 () -> f32[2,3] {
 }
 
 TEST_F(HloParserTest, LiteralDimensionsMismatch_3) {
-  const string original = R"(HloModule some_2x3x2_module:
+  const string original = R"(HloModule some_2x3x2_module
 
 ENTRY %some_2x3x2 () -> f32[2,3,2] {
   ROOT %constant = f32[2,3,2]{2,1,0} constant(f32[2,3,2] {{{1, 2}, {3, 4}, {5, 6}, {7, 8}, {9, 10}, {11, 12}}})
@@ -809,7 +1032,7 @@ ENTRY %some_2x3x2 () -> f32[2,3,2] {
 
 TEST_F(HloParserTest, ConstantF16Overflow) {
   const string original =
-      R"(HloModule ConstantF16Overflow_module:
+      R"(HloModule ConstantF16Overflow_module
 
 ENTRY %ConstantF16Overflow.v4 () -> f16[] {
   ROOT %constant = f16[] constant(-65505)
@@ -823,7 +1046,7 @@ ENTRY %ConstantF16Overflow.v4 () -> f16[] {
 }
 
 TEST_F(HloParserTest, ConstantWithExp) {
-  const string original = R"(HloModule ConstantWithExp_module:
+  const string original = R"(HloModule ConstantWithExp_module
 
 ENTRY %ConstantWithExp.v4 () -> f32[] {
   %constant.1 = f32[] constant(3e+2)
@@ -838,7 +1061,7 @@ ENTRY %ConstantWithExp.v4 () -> f32[] {
 }
 
 TEST_F(HloParserTest, AttibutesAnyOrder) {
-  const string original = R"(HloModule any_order_module:
+  const string original = R"(HloModule any_order_module
 
 ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] {
   %input = f32[1,2,1]{2,1,0} parameter(0)
@@ -852,7 +1075,7 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2
 }
 
 TEST_F(HloParserTest, InvalidDimLabels) {
-  string prefix = R"(HloModule invalid_dim_labels_module:
+  string prefix = R"(HloModule invalid_dim_labels_module
 
 ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] {
   %input = f32[1,2,1]{2,1,0} parameter(0)
@@ -864,19 +1087,21 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2
 
 )";
 
-  ExpectHasSubstr(Parse(StrCat(prefix, ",dim_labels=00_01_10", suffix))
-                      .status()
-                      .error_message(),
-                  "expects dim labels pattern");
+  ExpectHasSubstr(
+      Parse(tensorflow::strings::StrCat(prefix, ",dim_labels=00_01_10", suffix))
+          .status()
+          .error_message(),
+      "expects dim labels pattern");
 
-  ExpectHasSubstr(Parse(StrCat(prefix, ",dim_labels=010_1100->010", suffix))
+  ExpectHasSubstr(Parse(tensorflow::strings::StrCat(
+                            prefix, ",dim_labels=010_1100->010", suffix))
                       .status()
                       .error_message(),
                   "must have the same rank");
 }
 
 TEST_F(HloParserTest, UnexpectedAttribute) {
-  const string original = R"(HloModule unexpected_attr_module:
+  const string original = R"(HloModule unexpected_attr_module
 
 ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
   %recv = (f32[], u32[]) recv(), channel_id=15
@@ -892,7 +1117,7 @@ ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
 }
 
 TEST_F(HloParserTest, MissingAttribute) {
-  const string original = R"(HloModule missing_attr_module:
+  const string original = R"(HloModule missing_attr_module
 
 ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
   %recv = (f32[], u32[]) recv(), channel_id=15
@@ -908,7 +1133,7 @@ ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
 }
 
 TEST_F(HloParserTest, PredecessorUndefined) {
-  const string original = R"(HloModule pre_not_found_module:
+  const string original = R"(HloModule pre_not_found_module
 
 ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
   %recv = (f32[], u32[]) recv(), channel_id=15
@@ -924,7 +1149,7 @@ ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
 }
 
 TEST_F(HloParserTest, SliceAllowOmitStride1) {
-  const string original = R"(HloModule slice_module:
+  const string original = R"(HloModule slice_module
 
 ENTRY %slice.v2 (p0: f32[3,3,4,4]) -> f32[3,3,2,4] {
   %p0 = f32[3,3,4,4]{3,2,1,0} parameter(0)
@@ -936,7 +1161,7 @@ ENTRY %slice.v2 (p0: f32[3,3,4,4]) -> f32[3,3,2,4] {
 }
 
 TEST_F(HloParserTest, PaddingConfigIsNotWindowPad) {
-  const string original = R"(HloModule window_pad_module:
+  const string original = R"(HloModule window_pad_module
 
 ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] {
   %input = f32[1,2,1]{2,1,0} parameter(0)
@@ -951,7 +1176,7 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2
 }
 
 TEST_F(HloParserTest, CommaBetweenSubAttributes) {
-  const string original = R"(HloModule test_comma_module:
+  const string original = R"(HloModule test_comma_module
 
 ENTRY %test_comma.v4 () -> f32[] {
   ROOT %constant = f32[] constant(-4.2), metadata={source_line=5, op_type="::const"}
@@ -961,6 +1186,124 @@ ENTRY %test_comma.v4 () -> f32[] {
   TF_EXPECT_OK(Parse(original).status());
 }
 
+TEST_F(HloParserTest, ComputationShapeDoesNotMatchRootShape) {
+  const string original = R"(HloModule custom_call:
+
+ENTRY %CustomCall () -> f32[1] {
+  %constant = f32[1]{0} constant({12345})
+  ROOT %foo = f32[1,2,3]{0,2,1} custom-call(f32[1]{0} %constant), custom_call_target="foo\"bar"
+})";
+  ExpectHasSubstr(Parse(original).status().error_message(),
+                  "Shape of computation CustomCall, f32[1], is not compatible "
+                  "with that of its root instruction foo, f32[1,2,3]");
+}
+
+TEST_F(HloParserTest, EntryComputationWithLayout) {
+  const string original = R"(HloModule layout:
+add_F32.v3 {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT add = f32[] add(lhs, rhs)
+}
+
+ENTRY %Reduce (input: f32[8,16,256]) -> f32[8,16] {
+  input = f32[8,16,256]{0,1,2} parameter(0)
+  constant = f32[] constant(0)
+  ROOT reduce = f32[8,16]{0,1} reduce(input, constant), dimensions={2}, to_apply=add_F32.v3
+})";
+
+  auto module = Parse(original);
+  TF_ASSERT_OK(module.status());
+  auto program_layout = module.ValueOrDie()->entry_computation_layout();
+  ASSERT_EQ(program_layout.parameter_count(), 1);
+  auto param_layout = program_layout.parameter_layout(0).layout();
+  auto result_layout = program_layout.result_layout().layout();
+  EXPECT_TRUE(
+      LayoutUtil::Equal(LayoutUtil::MakeLayout({0, 1, 2}), param_layout))
+      << "actual layout of parameter(0) is "
+      << LayoutUtil::HumanString(param_layout);
+  EXPECT_TRUE(LayoutUtil::Equal(LayoutUtil::MakeLayout({0, 1}), result_layout))
+      << "actual layout of result is "
+      << LayoutUtil::HumanString(result_layout);
+}
+
+TEST_F(HloParserTest, NoEntry) {
+  const string original = R"(HloModule no_entry:
+c1 {
+  const1 = f32[1]{0} constant({12345})
+}
+c2 {
+  const2 = f32[1]{0} constant({67890})
+})";
+  auto module = Parse(original);
+  TF_ASSERT_OK(module.status());
+  EXPECT_EQ(module.ValueOrDie()->entry_computation()->name(), "c2");
+}
+
+TEST_F(HloParserTest, NoRoot) {
+  const string original = R"(HloModule no_root:
+ENTRY consts {
+  first = f32[1]{0} constant({12345})
+  last = f32[1]{0} constant({67890})
+})";
+  auto module = Parse(original);
+  TF_ASSERT_OK(module.status());
+  EXPECT_EQ(
+      module.ValueOrDie()->entry_computation()->root_instruction()->name(),
+      "last");
+}
+
+TEST_F(HloParserTest, MultipleEntries) {
+  const string original = R"(HloModule multiple_entries:
+ENTRY c1 {
+  const1 = f32[1]{0} constant({12345})
+}
+ENTRY c2 {
+  const2 = f32[1]{0} constant({67890})
+})";
+  ExpectHasSubstr(Parse(original).status().error_message(),
+                  "expects only one ENTRY");
+}
+
+TEST_F(HloParserTest, MultipleRoots) {
+  const string original = R"(HloModule multiple_roots:
+ENTRY consts {
+  ROOT const1 = f32[1]{0} constant({12345})
+  ROOT const2 = f32[1]{0} constant({12345})
+})";
+  ExpectHasSubstr(Parse(original).status().error_message(),
+                  "one computation should have only one ROOT");
+}
+
+TEST_F(HloParserTest, InstructionExists) {
+  const string original = R"(HloModule comp_exists
+c1 {
+  instr = f32[1]{0} constant({12345})
+}
+c2 {
+  instr = f32[1]{0} constant({67890})
+})";
+
+  ExpectHasSubstr(Parse(original).status().error_message(),
+                  R"(was parsing 3:3: error: instruction previously defined here
+  instr = f32[1]{0} constant({12345})
+  ^)");
+}
+
+TEST_F(HloParserTest, ComputationExists) {
+  const string original = R"(HloModule comp_exists
+comp {
+  const1 = f32[1]{0} constant({12345})
+}
+comp {
+  const2 = f32[1]{0} constant({67890})
+})";
+  ExpectHasSubstr(Parse(original).status().error_message(),
+                  R"(was parsing 2:1: error: computation previously defined here
+comp {
+^)");
+}
+
 }  // namespace
 }  // namespace tools
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_token.h b/tensorflow/compiler/xla/tools/parser/hlo_token.h
index 07e48804d053f31bdff6678f09ee2c1e3b731e0f..7928bee5c2097f353b182095a555c334d7b69c95 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_token.h
+++ b/tensorflow/compiler/xla/tools/parser/hlo_token.h
@@ -18,6 +18,9 @@ limitations under the License.
 
 #include <string>
 
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/platform/types.h"
+
 namespace xla {
 namespace tools {
 
@@ -60,10 +63,9 @@ enum class TokKind {
   kDimLabels,      // [0-9bf]{2,}_[0-9io]{2,}->[0-9bf]{2,}
   kDxD,            // [0-9]+(x[0-9]+)+
   kPad,            // [0-9]+_[0-9]+(_[0-9]+)?(x[0-9]+_[0-9]+(_[0-9]+)?)*
+  kIdent,          // other identifiers
   kString,         // "abcd\"\n"
   kShape,          // f32[2,3]{1,0}
-  kOpcode,         // add
-  kFusionKind,     // kLoop, kOutput, ...
   kInt,            // 42
   kDecimal,        // 4.2
 };
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index ec3f6a0471e2ae965846f5ef7560e448fe9d8073..eda5effbb92db92c9317a956497a00c0ec15c27c 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -59,25 +59,33 @@ namespace xla {
 namespace tools {
 namespace {
 
+// Command-line opts to this tool.  See main() for descriptions of these
+// fields.
+struct Options {
+  string fake_infeed_shape;
+  bool use_fake_data = false;
+  bool print_result = true;
+  int num_runs = 1;
+};
+
 // Invokes the given computation passing arbitrary data for every (unbound)
 // parameter if use_fake_data, Otherwise use recorded data if available.
 //
 // Similarly, infeeds fake data of shape fake_infeed_shape if it is provided;
 // otherwise, no infeed is performed.
 StatusOr<std::unique_ptr<Literal>> ReplayComputation(
-    const SessionModule& module, int num_runs,
-    tensorflow::StringPiece fake_infeed_shape, bool use_fake_data,
-    Client* client) {
+    const SessionModule& module, Client* client, const Options& opts) {
   TF_ASSIGN_OR_RETURN(Computation computation, client->LoadSnapshot(module));
 
   std::vector<std::unique_ptr<GlobalData>> arguments;
-  if (use_fake_data) {
+  if (opts.use_fake_data) {
     arguments = MakeFakeArgumentsOrDie(computation, client);
   } else {  // use recorded data if available
     for (const auto& proto : module.arguments()) {
-      Literal literal(proto);
+      TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Literal> literal,
+                          Literal::CreateFromProto(proto));
       TF_ASSIGN_OR_RETURN(std::unique_ptr<GlobalData> data,
-                          client->TransferToServer(literal));
+                          client->TransferToServer(*literal));
       arguments.push_back(std::move(data));
     }
   }
@@ -86,12 +94,12 @@ StatusOr<std::unique_ptr<Literal>> ReplayComputation(
   // concurrent infeed occur via the fake_infeed_shape.
   tensorflow::gtl::optional<tensorflow::thread::ThreadPool> pool;
 
-  if (!fake_infeed_shape.empty()) {
+  if (!opts.fake_infeed_shape.empty()) {
     pool.emplace(tensorflow::Env::Default(), "infeed",
                  /*num_threads=*/1);
-    pool->Schedule([fake_infeed_shape, client]() {
+    pool->Schedule([opts, client]() {
       StatusOr<Shape> shape_status =
-          ShapeUtil::ParseShapeString(fake_infeed_shape);
+          ShapeUtil::ParseShapeString(opts.fake_infeed_shape);
       TF_CHECK_OK(shape_status.status());
       Shape shape = std::move(shape_status).ValueOrDie();
       StatusOr<std::unique_ptr<Literal>> data_status = MakeFakeLiteral(shape);
@@ -112,19 +120,19 @@ StatusOr<std::unique_ptr<Literal>> ReplayComputation(
   // Run the computation num_runs times, and return the result from the last
   // execution.
   std::unique_ptr<Literal> result;
-  for (int i = 0; i < num_runs; ++i) {
+  for (int i = 0; i < opts.num_runs; ++i) {
     ExecutionProfile profile;
-    if (use_fake_data) {
-      // If using fake data, execute the computation but don't bother retrieving
-      // the result -- presumably it's uninteresting, since our data is fake.
+    if (opts.print_result) {
+      TF_ASSIGN_OR_RETURN(result, client->ExecuteAndTransfer(
+                                      computation, execute_arguments,
+                                      /*execution_options=*/nullptr, &profile));
+    } else {
+      // If we're not printing the result, execute the computation but don't
+      // bother retrieving the result.  This can be a significant speedup.
       TF_RETURN_IF_ERROR(client
                              ->Execute(computation, execute_arguments,
                                        /*execution_options=*/nullptr, &profile)
                              .status());
-    } else {
-      TF_ASSIGN_OR_RETURN(result, client->ExecuteAndTransfer(
-                                      computation, execute_arguments,
-                                      /*execution_options=*/nullptr, &profile));
     }
     LOG(INFO) << "Execution took "
               << static_cast<double>(profile.compute_time_ns()) / 1e9 << "s";
@@ -133,16 +141,15 @@ StatusOr<std::unique_ptr<Literal>> ReplayComputation(
   return std::move(result);
 }
 
-int RealMain(tensorflow::gtl::ArraySlice<char*> args, int num_runs,
-             tensorflow::StringPiece fake_infeed_shape, bool use_fake_data) {
+int RealMain(tensorflow::gtl::ArraySlice<char*> args, const Options& opts) {
   Client* client = ClientLibrary::LocalClientOrDie();
   tensorflow::Env* env = tensorflow::Env::Default();
   int exit_status = EXIT_SUCCESS;
   for (char* arg : args) {
     SessionModule module;
     TF_CHECK_OK(tensorflow::ReadBinaryProto(env, arg, &module));
-    StatusOr<std::unique_ptr<Literal>> result_status = ReplayComputation(
-        module, num_runs, fake_infeed_shape, use_fake_data, client);
+    StatusOr<std::unique_ptr<Literal>> result_status =
+        ReplayComputation(module, client, opts);
     if (!result_status.ok()) {
       fprintf(stderr, "%s: error: %s\n", arg,
               result_status.status().ToString().c_str());
@@ -156,12 +163,16 @@ int RealMain(tensorflow::gtl::ArraySlice<char*> args, int num_runs,
               ShapeUtil::HumanString(result->shape()).c_str(),
               result->ToString().c_str());
       if (module.has_result()) {
+        std::unique_ptr<Literal> literal =
+            Literal::CreateFromProto(module.result()).ConsumeValueOrDie();
         fprintf(stdout, "was %s:%s\n",
                 ShapeUtil::HumanString(module.result().shape()).c_str(),
-                Literal(module.result()).ToString().c_str());
+                literal->ToString().c_str());
       }
     }
   }
+
+  ClientLibrary::DestroyLocalInstances();
   return exit_status;
 }
 
@@ -170,16 +181,15 @@ int RealMain(tensorflow::gtl::ArraySlice<char*> args, int num_runs,
 }  // namespace xla
 
 int main(int argc, char** argv) {
-  // Flags
-  xla::string fake_infeed_shape;
-  bool use_fake_data = false;
-  int num_runs = 1;
+  xla::tools::Options opts;
   const std::vector<tensorflow::Flag> flag_list = {
-      tensorflow::Flag("use_fake_data", &use_fake_data,
+      tensorflow::Flag("use_fake_data", &opts.use_fake_data,
                        "Replay computation using fake data"),
-      tensorflow::Flag("num_runs", &num_runs,
+      tensorflow::Flag("print_result", &opts.print_result,
+                       "Print the result of the computation to stdout"),
+      tensorflow::Flag("num_runs", &opts.num_runs,
                        "Number of times to run each computation"),
-      tensorflow::Flag("fake_infeed_shape", &fake_infeed_shape,
+      tensorflow::Flag("fake_infeed_shape", &opts.fake_infeed_shape,
                        "Shape of fake data to construct for (infinite) infeed"),
   };
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
@@ -191,5 +201,5 @@ int main(int argc, char** argv) {
 
   tensorflow::gtl::ArraySlice<char*> args(argv, argc);
   args.pop_front();  // Pop off the binary name, argv[0]
-  return xla::tools::RealMain(args, num_runs, fake_infeed_shape, use_fake_data);
+  return xla::tools::RealMain(args, opts);
 }
diff --git a/tensorflow/compiler/xla/tools/show_literal.cc b/tensorflow/compiler/xla/tools/show_literal.cc
index b50cb5e28eac14ed99af566939f8bd64e393ff64..fe8e72ba32bb4493b2751cfdfeb977f271092f9c 100644
--- a/tensorflow/compiler/xla/tools/show_literal.cc
+++ b/tensorflow/compiler/xla/tools/show_literal.cc
@@ -40,7 +40,8 @@ int main(int argc, char **argv) {
   xla::LiteralProto literal_proto;
   TF_CHECK_OK(tensorflow::ReadBinaryProto(tensorflow::Env::Default(), argv[1],
                                           &literal_proto));
-  xla::Literal literal(literal_proto);
+  std::unique_ptr<xla::Literal> literal =
+      xla::Literal::CreateFromProto(literal_proto).ConsumeValueOrDie();
   LOG(INFO) << "literal: " << literal_proto.ShortDebugString();
-  fprintf(stderr, "%s\n", literal.ToString().c_str());
+  fprintf(stderr, "%s\n", literal->ToString().c_str());
 }
diff --git a/tensorflow/compiler/xla/tools/show_text_literal.cc b/tensorflow/compiler/xla/tools/show_text_literal.cc
index bbe9902aa17a585c4bad5b732330305dfdd45302..8525873e913185554d18df8c8c3584bfcdcdcabe 100644
--- a/tensorflow/compiler/xla/tools/show_text_literal.cc
+++ b/tensorflow/compiler/xla/tools/show_text_literal.cc
@@ -39,13 +39,13 @@ int main(int argc, char **argv) {
   std::unique_ptr<xla::Literal> literal =
       xla::TextLiteralReader::ReadPath(argv[1]).ConsumeValueOrDie();
 
-  LOG(INFO) << "literal: " << literal->ShortDebugString();
+  LOG(INFO) << "literal: " << *literal;
   fprintf(stderr, "%s\n", literal->ToString().c_str());
   if (literal->shape().element_type() == xla::F32) {
-    float min =
-        *std::min_element(literal->f32s().begin(), literal->f32s().end());
-    float max =
-        *std::max_element(literal->f32s().begin(), literal->f32s().end());
+    float min = *std::min_element(literal->data<float>().begin(),
+                                  literal->data<float>().end());
+    float max = *std::max_element(literal->data<float>().begin(),
+                                  literal->data<float>().end());
     fprintf(stderr, "min: %a=%f\n", min, min);
     fprintf(stderr, "max: %a=%f\n", max, max);
   }
diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc
index e595df3052c3de64de503d7627eff72dcba177ee..1f0c626bbb2d64ef4e67c9ec51485ae96ae73d04 100644
--- a/tensorflow/compiler/xla/util.cc
+++ b/tensorflow/compiler/xla/util.cc
@@ -30,9 +30,7 @@ limitations under the License.
 #include "tensorflow/core/platform/stacktrace.h"
 
 namespace xla {
-namespace {
 
-// Logs the provided status message with a backtrace.
 Status WithLogBacktrace(const Status& status) {
   CHECK(!status.ok());
   VLOG(1) << status.ToString();
@@ -40,8 +38,6 @@ Status WithLogBacktrace(const Status& status) {
   return status;
 }
 
-}  // namespace
-
 ScopedLoggingTimer::ScopedLoggingTimer(const string& label, bool enabled)
     : enabled(enabled), label(label) {
   if (enabled) {
@@ -74,13 +70,18 @@ Status AppendStatus(Status prior, tensorflow::StringPiece context) {
 // Implementation note: we can't common these out (without using macros) because
 // they all need to va_start/va_end their varargs in their frame.
 
-Status InvalidArgument(const char* format, ...) {
+Status InvalidArgumentV(const char* format, va_list args) {
   string message;
+  tensorflow::strings::Appendv(&message, format, args);
+  return WithLogBacktrace(tensorflow::errors::InvalidArgument(message));
+}
+
+Status InvalidArgument(const char* format, ...) {
   va_list args;
   va_start(args, format);
-  tensorflow::strings::Appendv(&message, format, args);
+  Status result = InvalidArgumentV(format, args);
   va_end(args);
-  return WithLogBacktrace(tensorflow::errors::InvalidArgument(message));
+  return result;
 }
 
 Status Unimplemented(const char* format, ...) {
@@ -191,9 +192,9 @@ std::vector<int64> ComposePermutations(tensorflow::gtl::ArraySlice<int64> p1,
   return output;
 }
 
-bool IsIdentityPermutation(tensorflow::gtl::ArraySlice<int64> p) {
-  for (int64 i = 0; i < p.size(); ++i) {
-    if (p[i] != i) {
+bool IsIdentityPermutation(tensorflow::gtl::ArraySlice<int64> permutation) {
+  for (int64 i = 0; i < permutation.size(); ++i) {
+    if (permutation[i] != i) {
       return false;
     }
   }
@@ -338,7 +339,7 @@ std::vector<std::pair<int64, int64>> CommonFactors(
 
 string SanitizeFileName(string file_name) {
   for (char& c : file_name) {
-    if (c == '/' || c == '\\' || c == '[' || c == ']') {
+    if (c == '/' || c == '\\' || c == '[' || c == ']' || c == ' ') {
       c = '_';
     }
   }
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index b722095d1f38bf8a984c3ce9092a65f8e0baa911..08df5b12b3a53a138f56705531baa3333b23c5d8 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -40,6 +40,13 @@ limitations under the License.
 
 namespace xla {
 
+// Logs the provided status message with a backtrace.
+//
+// For use by Status-factories, logs a backtrace at the point where the status
+// is created, such that we can use --vmodule=util=1 to see all status
+// creation backtraces.
+Status WithLogBacktrace(const Status& status);
+
 // Ranks greater than 8 are very rare, so use InlinedVector<int64, 8> to store
 // the bounds and indices. And for the rare cases of ranks greater than 8,
 // the InlinedVector will just behave like an std::vector<> and allocate the
@@ -207,6 +214,27 @@ Status ResourceExhausted(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2);
 Status NotFound(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2);
 Status Unavailable(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2);
 
+// Passed-varargs variant of the InvalidArgument factory above.
+Status InvalidArgumentV(const char* format, va_list args);
+
+template <typename... Args>
+Status UnimplementedStrCat(Args&&... concat) {
+  return Unimplemented(
+      "%s", tensorflow::strings::StrCat(std::forward<Args>(concat)...).c_str());
+}
+
+template <typename... Args>
+Status InternalErrorStrCat(Args&&... concat) {
+  return InternalError(
+      "%s", tensorflow::strings::StrCat(std::forward<Args>(concat)...).c_str());
+}
+
+template <typename... Args>
+Status ResourceExhaustedStrCat(Args&&... concat) {
+  return ResourceExhausted(
+      "%s", tensorflow::strings::StrCat(std::forward<Args>(concat)...).c_str());
+}
+
 // Splits the lines of the original, replaces leading whitespace with the prefix
 // given by "indentation", and returns the string joined by newlines again. As a
 // side effect, any additional trailing whitespace is removed.
@@ -239,11 +267,14 @@ std::vector<T> Permute(tensorflow::gtl::ArraySlice<int64> permutation,
 
 // Override of the above that works around compile failures with gcc 7.1.1.
 // For details see https://github.com/tensorflow/tensorflow/issues/10843
+// Hide this workaround from MSVC as it causes ambiguous error.
+#ifndef _MSC_VER
 template <typename T>
 std::vector<T> Permute(tensorflow::gtl::ArraySlice<int64> permutation,
                        const std::vector<T>& input) {
   return Permute<std::vector, T>(permutation, input);
 }
+#endif
 
 // Inverts a permutation, i.e., output_permutation[input_permutation[i]] = i.
 std::vector<int64> InversePermutation(
@@ -329,7 +360,7 @@ T CeilOfRatio(T dividend, T divisor) {
 }
 
 // Rounds the value up to a multiple of the divisor by first calling CeilOfRatio
-// then multiplying by the divisor. For example: RoundUpToMultiple(13, 8) => 16
+// then multiplying by the divisor. For example: RoundUpToNearest(13, 8) => 16
 template <typename T>
 T RoundUpToNearest(T value, T divisor) {
   return CeilOfRatio(value, divisor) * divisor;
@@ -337,7 +368,7 @@ T RoundUpToNearest(T value, T divisor) {
 
 // Rounds the value down to a multiple of the divisor by first calling
 // FloorOfRatio then multiplying by the divisor. For example:
-// RoundUpToMultiple(13, 8) => 8
+// RoundDownToNearest(13, 8) => 8
 template <typename T>
 T RoundDownToNearest(T value, T divisor) {
   return FloorOfRatio(value, divisor) * divisor;
@@ -395,6 +426,33 @@ std::vector<std::pair<int64, int64>> CommonFactors(
 // Removes illegal characters from filenames.
 string SanitizeFileName(string file_name);
 
+template <typename Container, typename Predicate>
+bool c_all_of(Container container, Predicate predicate) {
+  return std::all_of(std::begin(container), std::end(container), predicate);
+}
+
+template <typename InputContainer, typename OutputIterator,
+          typename UnaryOperation>
+OutputIterator c_transform(InputContainer input_container,
+                           OutputIterator output_iterator,
+                           UnaryOperation unary_op) {
+  return std::transform(std::begin(input_container), std::end(input_container),
+                        output_iterator, unary_op);
+}
+
+template <class InputContainer, class OutputIterator, class UnaryPredicate>
+OutputIterator c_copy_if(InputContainer input_container,
+                         OutputIterator output_iterator,
+                         UnaryPredicate predicate) {
+  return std::copy_if(std::begin(input_container), std::end(input_container),
+                      output_iterator, predicate);
+}
+
+template <class InputContainer, class Comparator>
+void c_sort(InputContainer& input_container, Comparator comparator) {
+  std::sort(input_container.begin(), input_container.end(), comparator);
+}
+
 }  // namespace xla
 
 #define XLA_LOG_LINES(SEV, STRING) \
diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc
index 2e0eba8de0100fb4e7e45348618febd778c88c9a..93284b80f9e1f82c4b18dc7388754d5c01a7740c 100644
--- a/tensorflow/compiler/xla/window_util.cc
+++ b/tensorflow/compiler/xla/window_util.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@@ -25,6 +26,28 @@ limitations under the License.
 namespace xla {
 namespace window_util {
 
+Window MakeWindow(tensorflow::gtl::ArraySlice<int64> sizes) {
+  Window window;
+  for (int64 size : sizes) {
+    auto* dimension = window.add_dimensions();
+    dimension->set_size(size);
+    dimension->set_stride(1);
+    dimension->set_base_dilation(1);
+    dimension->set_window_dilation(1);
+  }
+  return window;
+}
+
+PaddingConfig MakeSymmetricPadding(tensorflow::gtl::ArraySlice<int64> sizes) {
+  PaddingConfig config;
+  for (int64 size : sizes) {
+    auto* dimension = config.add_dimensions();
+    dimension->set_edge_padding_low(size);
+    dimension->set_edge_padding_high(size);
+  }
+  return config;
+}
+
 /* static */ string ToString(const WindowDimension& dim) {
   using tensorflow::strings::StrAppend;
   using tensorflow::strings::StrCat;
@@ -88,6 +111,11 @@ string ToString(const Window& window) {
       return StrCat(dim.window_dilation());
     });
   }
+  if (HasWindowReversal(window)) {
+    add_field(" rhs_reversal", [](const WindowDimension& dim) {
+      return StrCat(dim.window_reversal() ? 1 : 0);
+    });
+  }
   return str;
 }
 
@@ -109,13 +137,21 @@ bool HasPadding(const Window& window) {
   return false;
 }
 
-bool HasEvenPadding(const Window& window) {
+bool HasSymmetricPadding(const Window& window) {
   return std::all_of(window.dimensions().begin(), window.dimensions().end(),
                      [](const WindowDimension& dim) {
                        return dim.padding_low() == dim.padding_high();
                      });
 }
 
+bool HasSymmetricPadding(const PaddingConfig& padding_config) {
+  return std::all_of(padding_config.dimensions().begin(),
+                     padding_config.dimensions().end(),
+                     [](const PaddingConfig::PaddingConfigDimension& dim) {
+                       return dim.edge_padding_low() == dim.edge_padding_high();
+                     });
+}
+
 bool HasNegativePadding(const Window& window) {
   return std::any_of(window.dimensions().begin(), window.dimensions().end(),
                      [](const WindowDimension& dim) {
@@ -141,10 +177,25 @@ bool HasWindowDilation(const Window& window) {
   return false;
 }
 
+bool HasWindowReversal(const Window& window) {
+  for (const auto& dim : window.dimensions()) {
+    if (dim.window_reversal()) {
+      return true;
+    }
+  }
+  return false;
+}
+
 bool HasDilation(const Window& window) {
   return HasBaseDilation(window) || HasWindowDilation(window);
 }
 
+bool IsInactiveWindowDimension(const Window& window, int64 logical_dim) {
+  const WindowDimension& window_dim = window.dimensions(logical_dim);
+  return window_dim.size() == 1 && window_dim.stride() == 1 &&
+         window_dim.padding_low() == 0 && window_dim.padding_high() == 0;
+}
+
 int64 DilatedBound(int64 bound, int64 dilation) {
   CHECK_GE(bound, 0);
   CHECK_GE(dilation, 1);
diff --git a/tensorflow/compiler/xla/window_util.h b/tensorflow/compiler/xla/window_util.h
index 235cb2d59d451a25dc4f824ab488f8cef6b03bfb..ba473e2c8c35202865a9a4981da7653fe1d6f552 100644
--- a/tensorflow/compiler/xla/window_util.h
+++ b/tensorflow/compiler/xla/window_util.h
@@ -18,10 +18,21 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 
 namespace xla {
 namespace window_util {
 
+// Creates a window with the given sizes in the dimensions and all strides set
+// to 1.
+Window MakeWindow(tensorflow::gtl::ArraySlice<int64> sizes);
+
+// Creates a padding config with symmetrical padding in each dimension, of value
+// given by sizes; e.g. {0, 1, 2} would create a R3 padding config that had zero
+// pixels of padding in dimension 0, one pixel of padding symmetrically, on each
+// side of dimension 1, and two pixels of padding symmetrically on dimension 2.
+PaddingConfig MakeSymmetricPadding(tensorflow::gtl::ArraySlice<int64> sizes);
+
 string ToString(const WindowDimension& dim);
 string ToString(const Window& window);
 
@@ -32,13 +43,24 @@ string ToString(const Window& window);
 
 bool HasStride(const Window& window);
 bool HasPadding(const Window& window);
-bool HasEvenPadding(const Window& window);
+bool HasSymmetricPadding(const Window& window);
 bool HasNegativePadding(const Window& window);
 
+// As with HasSymmetricPadding(Window) above, returns whether the "padding low"
+// is equivalent to the "padding high" for all dimensions, but works on a
+// padding configuration.
+bool HasSymmetricPadding(const PaddingConfig& padding_config);
+
 bool HasBaseDilation(const Window& window);
 bool HasWindowDilation(const Window& window);
 bool HasDilation(const Window& window);
 
+bool HasWindowReversal(const Window& window);
+
+// Returns true if the given logical dimension is inactive in the sense that it
+// has window bound 1, no striding and no padding.
+bool IsInactiveWindowDimension(const Window& window, int64 logical_dim);
+
 // Returns the new bound after dilation.
 //
 // If a window with the given bound in some dimension is dilated with the given
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index 127e5e81ac6d21945c7125ef913d236e8892758e..56162ab44e2e0e3e4478fe631888f243332dc1d8 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -82,8 +82,9 @@ message DebugOptions {
   // Dump all HLO modules as text into the provided directory path.
   string xla_generate_hlo_text_to = 7;
 
-  // Dump compilation artifacts in binary proto into this directory.
-  string xla_dump_hlo_proto_to = 8;
+  // Dump Hlo after all hlo passes are executed as proto binary into this
+  // directory.
+  string xla_dump_optimized_hlo_proto_to = 8;
 
   // Instrument the computation to collect per-HLO cycle counts.
   bool xla_hlo_profile = 9;
@@ -175,6 +176,18 @@ message DebugOptions {
   // assignments, if available.
   bool xla_hlo_tfgraph_device_scopes = 93;
 
+  // If true, the GPU backend is free to use cudnn for HLO batch normalization
+  // ops.
+  bool xla_gpu_use_cudnn_batchnorm = 94;
+
+  // Dump HLO before any hlo passes are executed as proto binary into this
+  // directory.
+  string xla_dump_unoptimized_hlo_proto_to = 95;
+
+  // Dump HLO after each pass as an HloProto in binary file format into this
+  // directory.
+  string xla_dump_per_pass_hlo_proto_to = 96;
+
   // Extra options to pass to the compilation backend; specific interpretation
   // of these values is left to the backend.
   map<string, string> xla_backend_extra_options = 500;
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index 2ba1a2d904e45e582ee4e8a4ea889ee69d55e747..3aea0217539b89b5d60ecfaf2605eee4b69af728 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -114,6 +114,17 @@ message PaddingConfig {
   repeated PaddingConfigDimension dimensions = 1;
 }
 
+// A format specifies the method used by a layout to store an array in memory.
+enum Format {
+  INVALID_FORMAT = 0;
+  // The default layout, with exactly one storage location per element (ignoring
+  // padding).
+  DENSE = 1;
+  // A sparsely encoded layout, providing only the index/value pairs of non-zero
+  // elements.
+  SPARSE = 2;
+}
+
 // A layout describes how the array is placed in (1D) memory space.  This
 // includes the minor-to-major ordering of dimensions within a shape, as well as
 // any padding present in those dimensions.
@@ -124,21 +135,30 @@ message PaddingConfig {
 //
 // See the XLA documentation for more information on shapes and layouts.
 message Layout {
+  // The method used to store the data in memory. The format determines which of
+  // the other fields are used by the layout.
+  Format format = 4;
+
   // Sequence of dimension numbers, from minor (fastest varying index) to major
   // (slowest varying index). This field is required.
   repeated int64 minor_to_major = 1;
 
-  // The width to which the layout of each dimension is padded up
-  // to. If present, the size of the padded_dimensions must equal the
-  // rank of the shape. The padding appears at the end of a dimension,
-  // not at the beginning. This kind of padding, unlike padding in
-  // e.g. convolution, is not part of the shape.
+  // The width to which the layout of each dimension is padded up to. If
+  // present, the size of the padded_dimensions must equal the rank of the
+  // shape. The padding appears at the end of a dimension, not at the
+  // beginning. This kind of padding, unlike padding in e.g. convolution, is not
+  // part of the shape. This field must be unset unless the format is DENSE.
   repeated int64 padded_dimensions = 2;
 
-  // Describes the values in the padding specified by
-  // padded_dimensions.
+  // Describes the values in the padding specified by padded_dimensions. This
+  // field must be unset unless the format is DENSE.
   PaddingValue padding_value = 3;
 
+  // The maximum number of elements that can be stored for SPARSE formats.  This
+  // can be used to determine the maximum size in bytes of arrays stored in
+  // memory.  This field must be unset unless the format is SPARSE.
+  int64 max_sparse_elements = 5;
+
   // Important: if any field is added, be sure to modify ShapeUtil::Equal()
   // appropriately to account for the new field.
 }
@@ -321,7 +341,8 @@ message LiteralProto {
   // The F16s and BF16s are encoded in little endian byte order
   bytes f16s = 11;
   bytes bf16s = 13;
-  // Next = 14
+  repeated int64 sparse_indices = 14;
+  // Next = 15
 }
 
 message WindowDimension {
@@ -498,6 +519,23 @@ message CustomCallRequest {
   Shape shape = 4;
 }
 
+message DotDimensionNumbers {
+  // The dimension numbers that represent the 'lhs' contracting dimensions.
+  repeated int64 lhs_contracting_dimensions = 1;
+  // The dimension numbers that represent the 'rhs' contracting dimensions.
+  repeated int64 rhs_contracting_dimensions = 2;
+  // The dimension numbers that represent the 'lhs' batch dimensions.
+  repeated int64 lhs_batch_dimensions = 3;
+  // The dimension numbers that represent the 'rhs' batch dimensions.
+  repeated int64 rhs_batch_dimensions = 4;
+};
+
+message DotRequest {
+  ComputationDataHandle lhs = 2;
+  ComputationDataHandle rhs = 3;
+  DotDimensionNumbers dimension_numbers = 4;
+}
+
 message MapRequest {
   repeated ComputationDataHandle operands = 2;
   ComputationHandle to_apply = 3;
@@ -651,6 +689,14 @@ message ConcatenateRequest {
   int64 dimension = 3;
 }
 
+message ConditionalRequest {
+  ComputationDataHandle predicate = 2;
+  ComputationDataHandle true_operand = 3;
+  ComputationHandle true_computation = 4;
+  ComputationDataHandle false_operand = 5;
+  ComputationHandle false_computation = 6;
+}
+
 message WhileRequest {
   ComputationHandle condition = 2;
   ComputationHandle body = 3;
@@ -732,9 +778,6 @@ enum BinaryOperation {
   BINOP_LT = 9;
   BINOP_NE = 10;
 
-  // Dot product, matrix multiply.
-  BINOP_DOT = 12;
-
   // Element-wise maximum.
   BINOP_MAX = 14;
 
@@ -780,9 +823,7 @@ enum RandomDistribution {
   // parameter[0] and standard deviation parameter[1].
   RNG_NORMAL = 2;
 
-  // Creates a Bernoulli-distribution-generated random number with mean
-  // parameter[0].
-  RNG_BERNOULLI = 3;
+  // Next: 4
 }
 
 message RngRequest {
@@ -885,6 +926,7 @@ message OpRequest {
     ConvolveRequest convolve_request = 8;
     CrossReplicaSumRequest cross_replica_sum_request = 9;
     CustomCallRequest custom_call_request = 10;
+    DotRequest dot_request = 43;
     DynamicSliceRequest dynamic_slice_request = 11;
     DynamicUpdateSliceRequest dynamic_update_slice_request = 12;
     GetTupleElementRequest get_tuple_element_request = 13;
@@ -914,7 +956,8 @@ message OpRequest {
     BatchNormInferenceRequest batch_norm_inference_request = 38;
     FftRequest fft_request = 41;
     ConvertRequest bitcast_convert_request = 42;
-    // Next: 43
+    ConditionalRequest conditional_request = 44;
+    // Next: 45
   }
 }
 
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index b7ade951150412e0ad3f72c235f0677e68fce66e..bab37e8906e5c648acdc1556da7e5f4601776ff5 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -6,10 +6,17 @@ licenses(["notice"])  # Apache 2.0
 package(default_visibility = ["//tensorflow:__subpackages__"])
 
 load("//third_party/mpi:mpi.bzl", "if_mpi")
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt")
 
 py_library(
     name = "contrib_py",
-    srcs = glob(["**/*.py"]),
+    srcs = glob(
+        ["**/*.py"],
+        exclude = [
+            "**/*_test.py",
+        ],
+    ),
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
@@ -18,7 +25,9 @@ py_library(
         "//tensorflow/contrib/bayesflow:bayesflow_py",
         "//tensorflow/contrib/boosted_trees:init_py",
         "//tensorflow/contrib/cloud:cloud_py",
+        "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
+        "//tensorflow/contrib/coder:coder_ops_py",
         "//tensorflow/contrib/compiler:compiler_py",
         "//tensorflow/contrib/copy_graph:copy_graph_py",
         "//tensorflow/contrib/crf:crf_py",
@@ -29,6 +38,7 @@ py_library(
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow/contrib/estimator:estimator_py",
         "//tensorflow/contrib/factorization:factorization_py",
+        "//tensorflow/contrib/feature_column:feature_column_py",
         "//tensorflow/contrib/ffmpeg:ffmpeg_ops_py",
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/fused_conv:fused_conv_py",
@@ -41,6 +51,7 @@ py_library(
         "//tensorflow/contrib/image:single_image_random_dot_stereograms_py",
         "//tensorflow/contrib/input_pipeline:input_pipeline_py",
         "//tensorflow/contrib/integrate:integrate_py",
+        "//tensorflow/contrib/kafka",
         "//tensorflow/contrib/keras",
         "//tensorflow/contrib/kernel_methods",
         "//tensorflow/contrib/kfac",
@@ -48,6 +59,7 @@ py_library(
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/learn",
         "//tensorflow/contrib/legacy_seq2seq:seq2seq_py",
+        "//tensorflow/contrib/libsvm",
         "//tensorflow/contrib/linalg:linalg_py",
         "//tensorflow/contrib/linear_optimizer:sdca_estimator_py",
         "//tensorflow/contrib/linear_optimizer:sdca_ops_py",
@@ -60,13 +72,15 @@ py_library(
         "//tensorflow/contrib/metrics:metrics_py",
         "//tensorflow/contrib/model_pruning",
         "//tensorflow/contrib/nccl:nccl_py",
-        "//tensorflow/contrib/ndlstm",
         "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_py",
         "//tensorflow/contrib/nn:nn_py",
         "//tensorflow/contrib/opt:opt_py",
+        "//tensorflow/contrib/periodic_resample:init_py",
         "//tensorflow/contrib/predictor",
         "//tensorflow/contrib/quantization:quantization_py",
         "//tensorflow/contrib/quantize:quantize_graph",
+        "//tensorflow/contrib/py2tf",
+        "//tensorflow/contrib/receptive_field:receptive_field_py",
         "//tensorflow/contrib/reduce_slice_ops:reduce_slice_ops_py",
         "//tensorflow/contrib/remote_fused_graph/pylib:remote_fused_graph_ops_py",
         "//tensorflow/contrib/resampler:resampler_py",
@@ -94,20 +108,22 @@ py_library(
         "//tensorflow/contrib/training:training_py",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:util",
-    ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_ops_py"]),
+    ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]) + if_tensorrt([
+        "//tensorflow/contrib/tensorrt:init_py",
+    ]),
 )
 
 cc_library(
     name = "contrib_kernels",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/batching:batch_ops_kernels",
         "//tensorflow/contrib/boosted_trees:boosted_trees_kernels",
+        "//tensorflow/contrib/coder:all_kernels",
         "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_kernels",
+        "//tensorflow/contrib/data/kernels:dataset_kernels",
         "//tensorflow/contrib/factorization/kernels:all_kernels",
         "//tensorflow/contrib/input_pipeline:input_pipeline_ops_kernels",
         "//tensorflow/contrib/layers:sparse_feature_cross_op_kernel",
-        "//tensorflow/contrib/nccl:nccl_kernels",
         "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_ops_kernels",
         "//tensorflow/contrib/rnn:all_kernels",
         "//tensorflow/contrib/seq2seq:beam_search_ops_kernels",
@@ -115,19 +131,23 @@ cc_library(
         "//tensorflow/contrib/tensor_forest:stats_ops_kernels",
         "//tensorflow/contrib/tensor_forest:tensor_forest_kernels",
         "//tensorflow/contrib/text:all_kernels",
-    ],
+    ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]) + if_cuda([
+        "//tensorflow/contrib/nccl:nccl_kernels",
+    ]),
 )
 
 cc_library(
     name = "contrib_ops_op_lib",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/batching:batch_ops_op_lib",
         "//tensorflow/contrib/boosted_trees:boosted_trees_ops_op_lib",
+        "//tensorflow/contrib/coder:all_ops",
         "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_ops_op_lib",
+        "//tensorflow/contrib/data:dataset_ops_op_lib",
         "//tensorflow/contrib/factorization:all_ops",
         "//tensorflow/contrib/framework:all_ops",
         "//tensorflow/contrib/input_pipeline:input_pipeline_ops_op_lib",
+        "//tensorflow/contrib/kafka:kafka_ops_op_lib",
         "//tensorflow/contrib/layers:sparse_feature_cross_op_op_lib",
         "//tensorflow/contrib/nccl:nccl_ops_op_lib",
         "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_ops_op_lib",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index 1eda1abfcf779ece7af3dbf2554c2a0a8c2611e9..4f6f539027b040de7554d09fe9118ff97aa006f8 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -19,9 +19,11 @@ from __future__ import division
 from __future__ import print_function
 
 # Add projects here, they will show up under tf.contrib.
+from tensorflow.contrib import batching
 from tensorflow.contrib import bayesflow
 from tensorflow.contrib import cloud
 from tensorflow.contrib import cluster_resolver
+from tensorflow.contrib import coder
 from tensorflow.contrib import compiler
 from tensorflow.contrib import copy_graph
 from tensorflow.contrib import crf
@@ -31,6 +33,7 @@ from tensorflow.contrib import deprecated
 from tensorflow.contrib import distributions
 from tensorflow.contrib import estimator
 from tensorflow.contrib import factorization
+from tensorflow.contrib import feature_column
 from tensorflow.contrib import framework
 from tensorflow.contrib import gan
 from tensorflow.contrib import graph_editor
@@ -55,6 +58,7 @@ from tensorflow.contrib import model_pruning
 from tensorflow.contrib import nccl
 from tensorflow.contrib import nn
 from tensorflow.contrib import opt
+from tensorflow.contrib import periodic_resample
 from tensorflow.contrib import predictor
 from tensorflow.contrib import quantization
 from tensorflow.contrib import quantize
@@ -80,14 +84,14 @@ from tensorflow.contrib import training
 from tensorflow.contrib import util
 from tensorflow.contrib.eager.python import tfe as eager
 from tensorflow.contrib.lite.python import lite
-from tensorflow.contrib.ndlstm import python as ndlstm
+from tensorflow.contrib.receptive_field import receptive_field_api as receptive_field
 from tensorflow.contrib.remote_fused_graph import pylib as remote_fused_graph
 from tensorflow.contrib.specs import python as specs
 from tensorflow.contrib.summary import summary
 
 from tensorflow.python.util.lazy_loader import LazyLoader
-ffmpeg = LazyLoader("ffmpeg",
-                    globals(), "tensorflow.contrib.ffmpeg")
+ffmpeg = LazyLoader("ffmpeg", globals(),
+                    "tensorflow.contrib.ffmpeg")
 del LazyLoader
 
 del absolute_import
diff --git a/tensorflow/contrib/all_reduce/python/all_reduce.py b/tensorflow/contrib/all_reduce/python/all_reduce.py
index a5057da9fd43a88575813613d6ac9d17fd2b2e28..6658f0d9c13f6db17b25354cde2593d57f104f17 100644
--- a/tensorflow/contrib/all_reduce/python/all_reduce.py
+++ b/tensorflow/contrib/all_reduce/python/all_reduce.py
@@ -48,7 +48,7 @@ def _flatten_tensors(tensors):
   if shape.ndims is None:
     raise ValueError("At least one of the tensors in 'tensors' must have "
                      "statically known rank.")
-  if len(shape) > 1:
+  if len(shape) != 1:
     reshaped = []
     for t in tensors:
       with ops.colocate_with(t):
@@ -289,7 +289,7 @@ def build_ring_all_reduce(input_tensors, num_workers, num_subchunks,
                                        chunks_by_dev)
   if pad_len > 0:
     output_tensors = _strip_padding(output_tensors, pad_len)
-  if len(shape) > 1:
+  if len(shape) != 1:
     output_tensors = _reshape_tensors(output_tensors, shape)
   return output_tensors
 
@@ -466,7 +466,7 @@ def build_recursive_hd_all_reduce(input_tensors, red_op, un_op=None):
   if un_op:
     reduced_shards = [un_op(t) for t in reduced_shards]
   output_tensors = _build_recursive_hd_scatter(reduced_shards, devices)
-  if len(shape) > 1:
+  if len(shape) != 1:
     output_tensors = _reshape_tensors(output_tensors, shape)
   return output_tensors
 
@@ -578,7 +578,7 @@ def build_shuffle_all_reduce(input_tensors, gather_devices, red_op, un_op=None):
   reduced_shards = _build_shuffle_gather(input_tensors, gather_devices,
                                          red_op, un_op)
   output_tensors = _build_shuffle_scatter(reduced_shards, dst_devices)
-  if len(shape) > 1:
+  if len(shape) != 1:
     output_tensors = _reshape_tensors(output_tensors, shape)
   return output_tensors
 
@@ -744,21 +744,21 @@ def _build_nccl_hybrid(input_tensors, red_op, upper_level_f):
   level_2_output = upper_level_f(up_values)
   # Third stage: propagate within each worker using NCCL Broadcast
   for w in range(0, num_workers):
-    dst_devices = per_worker_devices[w][1:]
-    send_op, dst_tensors = nccl.broadcast(level_2_output[w], dst_devices)
-    # NOTE: need control dependency to ensure send_op executes
-    with ops.control_dependencies([send_op]):
-      with ops.device(per_worker_devices[w][0]):
-        dst_tensors.insert(0, array_ops.identity(level_2_output[w]))
-        down_values[w] = dst_tensors
+    dst_tensors = []
+    with ops.device(per_worker_devices[w][0]):
+      broadcast_src = nccl.broadcast(array_ops.identity(level_2_output[w]))
+    for d in per_worker_devices[w]:
+      with ops.device(d):
+        dst_tensors.append(array_ops.identity(broadcast_src))
+    down_values[w] = dst_tensors
   output_tensors = [v for sublist in down_values for v in sublist]
-  if len(shape) > 1:
+  if len(shape) != 1:
     output_tensors = _reshape_tensors(output_tensors, shape)
   return output_tensors
 
 
 def _reduce_non_singleton(input_tensors, red_f, un_op):
-  """If input_tenors has more than one element apply red_f, else apply un_op."""
+  """If input_tensors has more than one element apply red_f, else apply un_op."""
   if len(input_tensors) > 1:
     return red_f(input_tensors)
   else:
@@ -831,7 +831,7 @@ def _build_shuffle_hybrid(input_tensors, gather_devices, red_op, upper_level_f):
   for w in range(0, num_workers):
     output_tensors += _build_shuffle_scatter(
         [level_2_output[w]], per_worker_devices[w])
-  if len(shape) > 1:
+  if len(shape) != 1:
     output_tensors = _reshape_tensors(output_tensors, shape)
   return output_tensors
 
diff --git a/tensorflow/contrib/all_reduce/python/all_reduce_test.py b/tensorflow/contrib/all_reduce/python/all_reduce_test.py
index 0802b2736909c2a6f075ea2eac6d4dd3ab2918d8..47bab0a3670a90644972b2c961954a3036b8ecba 100644
--- a/tensorflow/contrib/all_reduce/python/all_reduce_test.py
+++ b/tensorflow/contrib/all_reduce/python/all_reduce_test.py
@@ -119,7 +119,7 @@ class AllReduceTest(test_util.TensorFlowTestCase):
   def _buildInitialVars(self, shape, dev_list):
     values = []
     num_devices = len(dev_list)
-    dim = np.prod(shape)
+    dim = np.prod(shape) if shape else 1
     for d in range(0, num_devices):
       with ops.device(dev_list[d]):
         npt = np.zeros(shape).astype(np.float32)
@@ -164,6 +164,7 @@ class AllReduceTest(test_util.TensorFlowTestCase):
                     (num_workers, num_gpus, shape, subdiv, elapsed))
 
   def testRingAllReduce(self):
+    self._testRingAllReduce(1, 2, [], 1)
     self._testRingAllReduce(1, 2, [8], 1)
     self._testRingAllReduce(1, 2, [4, 4], 1)
     self._testRingAllReduce(6, 1, [8], 1)
@@ -192,6 +193,7 @@ class AllReduceTest(test_util.TensorFlowTestCase):
                     "elapsed=%f" % (num_workers, num_gpus, shape, elapsed))
 
   def testShuffleAllReduce(self):
+    self._testShuffleAllReduce(1, 2, [], 1)
     self._testShuffleAllReduce(1, 2, [8], 1)
     self._testShuffleAllReduce(1, 2, [4, 4], 1)
     self._testShuffleAllReduce(1, 8, [32], 1)
diff --git a/tensorflow/contrib/android/README.md b/tensorflow/contrib/android/README.md
index f49e5857fe5255c2459793cb1389052a2ff5f88f..db37bcf73d144eb81c32a461a276d10be7e2d193 100644
--- a/tensorflow/contrib/android/README.md
+++ b/tensorflow/contrib/android/README.md
@@ -15,9 +15,9 @@ For prebuilt libraries, see the
 page for a recent build.
 
 The TensorFlow Inference Interface is also available as a
-[JCenter package](https://bintray.com/google/tensorflow/tensorflow-android) and
-can be included quite simply in your android project with a couple of lines in
-the project's `build.gradle` file:
+[JCenter package](https://bintray.com/google/tensorflow/tensorflow)
+(see the tensorflow-android directory) and can be included quite simply in your
+android project with a couple of lines in the project's `build.gradle` file:
 
 ```
 allprojects {
@@ -32,9 +32,9 @@ dependencies {
 ```
 
 This will tell Gradle to use the
-[latest version](https://bintray.com/google/tensorflow/tensorflow-android/_latestVersion)
+[latest version](https://bintray.com/google/tensorflow/tensorflow/_latestVersion)
 of the TensorFlow AAR that has been released to
-[https://bintray.com/google/tensorflow/tensorflow-android](https://bintray.com/google/tensorflow/tensorflow-android).
+[JCenter](https://jcenter.bintray.com/org/tensorflow/tensorflow-android/).
 You may replace the `+` with an explicit version label if you wish to
 use a specific release of TensorFlow in your app.
 
@@ -81,6 +81,11 @@ For documentation on building a self-contained AAR file with cmake, see
 [tensorflow/contrib/android/cmake](cmake).
 
 
+### Makefile
+
+For documentation on building native TF libraries with make, including a CUDA-enabled variant for devices like the Nvidia Shield TV, see [tensorflow/contrib/makefile/README.md](../makefile/README.md)
+
+
 ## AssetManagerFileSystem
 
 This directory also contains a TensorFlow filesystem supporting the Android
diff --git a/tensorflow/contrib/android/asset_manager_filesystem.h b/tensorflow/contrib/android/asset_manager_filesystem.h
index 2b43939f148e360945e5d488d148fcb2c13008a6..665304b5eef1f8a3633c8c522259e20d744b1808 100644
--- a/tensorflow/contrib/android/asset_manager_filesystem.h
+++ b/tensorflow/contrib/android/asset_manager_filesystem.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_ANDROID_ASSET_MANAGER_FILESYSTEM_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_ANDROID_ASSET_MANAGER_FILESYSTEM_H_
+#ifndef TENSORFLOW_CONTRIB_ANDROID_ASSET_MANAGER_FILESYSTEM_H_
+#define TENSORFLOW_CONTRIB_ANDROID_ASSET_MANAGER_FILESYSTEM_H_
 
 #include <android/asset_manager.h>
 #include <android/asset_manager_jni.h>
@@ -79,4 +79,4 @@ class AssetManagerFileSystem : public FileSystem {
 };
 
 }  // namespace tensorflow
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_ANDROID_ASSET_MANAGER_FILESYSTEM_H_
+#endif  // TENSORFLOW_CONTRIB_ANDROID_ASSET_MANAGER_FILESYSTEM_H_
diff --git a/tensorflow/contrib/android/cmake/CMakeLists.txt b/tensorflow/contrib/android/cmake/CMakeLists.txt
index aba356d6167658f125001cbed6e3190c716ee7d6..a115d1610e2334a6626f29674f3dd195e3a3c648 100644
--- a/tensorflow/contrib/android/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/android/cmake/CMakeLists.txt
@@ -34,6 +34,8 @@ add_library(lib_tf STATIC IMPORTED )
 set_target_properties(lib_tf PROPERTIES IMPORTED_LOCATION
         ${PREBUILT_DIR}/lib/libtensorflow-core.a)
 # Change to compile flags should be replicated into bazel build file
+# TODO: Consider options other than -O2 for binary size.
+#       e.g. -Os for gcc, and -Oz for clang.
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIS_SLIM_BUILD \
                      -std=c++11 -fno-rtti -fno-exceptions \
                      -O2 -Wno-narrowing -fomit-frame-pointer \
diff --git a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
index dc5b9fb88742d78d0f40207b589e29451a6358dd..abddadac5bcace9b1f992b69bdcc69c24b29cd13 100644
--- a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
+++ b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
@@ -194,6 +194,11 @@ public class TensorFlowInferenceInterface {
    * @param outputNames A list of output nodes which should be filled by the inference pass.
    */
   public void run(String[] outputNames, boolean enableStats) {
+    run(outputNames, enableStats, new String[] {});
+  }
+
+  /** An overloaded version of runInference that allows supplying targetNodeNames as well */
+  public void run(String[] outputNames, boolean enableStats, String[] targetNodeNames) {
     // Release any Tensors from the previous run calls.
     closeFetches();
 
@@ -204,6 +209,11 @@ public class TensorFlowInferenceInterface {
       runner.fetch(tid.name, tid.outputIndex);
     }
 
+    // Add targets.
+    for (String t : targetNodeNames) {
+      runner.addTarget(t);
+    }
+
     // Run the session.
     try {
       if (enableStats) {
diff --git a/tensorflow/contrib/android/jni/run_stats_jni.cc b/tensorflow/contrib/android/jni/run_stats_jni.cc
index 119fa9cd2c378d2ba2383ea8b0e09e1b6083d84e..707853b59befc2625145ad96952fbf9f66d62b43 100644
--- a/tensorflow/contrib/android/jni/run_stats_jni.cc
+++ b/tensorflow/contrib/android/jni/run_stats_jni.cc
@@ -21,8 +21,8 @@ limitations under the License.
 #include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/util/stat_summarizer.h"
 
-using tensorflow::StatSummarizer;
 using tensorflow::RunMetadata;
+using tensorflow::StatSummarizer;
 
 namespace {
 StatSummarizer* requireHandle(JNIEnv* env, jlong handle) {
diff --git a/tensorflow/contrib/batching/BUILD b/tensorflow/contrib/batching/BUILD
index a111cfecb366fe245150cc71d2c43662d0d69090..ee67909133fc26ba98355db05a4b90d3dfa6b97b 100644
--- a/tensorflow/contrib/batching/BUILD
+++ b/tensorflow/contrib/batching/BUILD
@@ -12,7 +12,7 @@ cc_library(
     name = "batch_scheduler_hdrs",
     hdrs = ["batch_scheduler.h"],
     deps = [
-        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core/kernels/batching_util:batch_scheduler_hdrs",
     ],
 )
 
@@ -20,18 +20,7 @@ cc_library(
     name = "batch_scheduler",
     hdrs = ["batch_scheduler.h"],
     deps = [
-        "//tensorflow/core:lib",
-    ],
-)
-
-tf_cc_test(
-    name = "batch_scheduler_test",
-    srcs = ["batch_scheduler_test.cc"],
-    deps = [
-        ":batch_scheduler",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
+        "//tensorflow/core/kernels/batching_util:batch_scheduler",
     ],
 )
 
@@ -39,9 +28,7 @@ cc_library(
     name = "shared_batch_scheduler_hdrs",
     hdrs = ["shared_batch_scheduler.h"],
     deps = [
-        ":batch_scheduler_hdrs",
-        "//tensorflow/contrib/batching/util:periodic_function_dynamic",
-        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core/kernels/batching_util:shared_batch_scheduler_hdrs",
     ],
 )
 
@@ -49,46 +36,16 @@ cc_library(
     name = "shared_batch_scheduler",
     hdrs = ["shared_batch_scheduler.h"],
     deps = [
-        ":batch_scheduler",
-        "//tensorflow/contrib/batching/util:periodic_function_dynamic",
-        "//tensorflow/core:lib",
+        "//tensorflow/core/kernels/batching_util:shared_batch_scheduler",
     ],
     alwayslink = 1,
 )
 
-tf_cc_test(
-    name = "shared_batch_scheduler_test",
-    srcs = ["shared_batch_scheduler_test.cc"],
-    deps = [
-        ":shared_batch_scheduler",
-        "//tensorflow/contrib/batching/test_util:fake_clock_env",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-    ],
-)
-
 cc_library(
     name = "adaptive_shared_batch_scheduler",
     hdrs = ["adaptive_shared_batch_scheduler.h"],
     deps = [
-        ":batch_scheduler",
-        "//tensorflow/contrib/batching/util:periodic_function_dynamic",
-        "//tensorflow/core:lib",
-    ],
-)
-
-tf_cc_test(
-    name = "adaptive_shared_batch_scheduler_test",
-    srcs = ["adaptive_shared_batch_scheduler_test.cc"],
-    tags = ["manual"],  # b/69013768
-    deps = [
-        ":adaptive_shared_batch_scheduler",
-        "//tensorflow/contrib/batching/test_util:fake_clock_env",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
+        "//tensorflow/core/kernels/batching_util:adaptive_shared_batch_scheduler",
     ],
 )
 
@@ -96,34 +53,7 @@ cc_library(
     name = "basic_batch_scheduler",
     hdrs = ["basic_batch_scheduler.h"],
     deps = [
-        ":shared_batch_scheduler",
-    ],
-)
-
-tf_cc_test(
-    name = "basic_batch_scheduler_test",
-    srcs = ["basic_batch_scheduler_test.cc"],
-    deps = [
-        ":basic_batch_scheduler",
-        ":batch_scheduler",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-    ],
-)
-
-tf_cc_test(
-    name = "basic_batch_scheduler_benchmark",
-    srcs = ["basic_batch_scheduler_benchmark.cc"],
-    tags = [
-        "local",
-        "manual",
-    ],
-    deps = [
-        ":basic_batch_scheduler",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:tensorflow",
-        "//tensorflow/core:test",
+        "//tensorflow/core/kernels/batching_util:basic_batch_scheduler",
     ],
 )
 
@@ -137,48 +67,14 @@ load(
 )
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
-tf_custom_op_library(
-    name = "python/ops/_batch_ops.so",
-    srcs = ["ops/batch_ops.cc"],
-    deps = [
-        "//tensorflow/contrib/batching/kernels:batch_kernels",
-    ],
-)
-
-tf_gen_op_libs(
-    op_lib_names = ["batch_ops"],
-)
-
-tf_gen_op_wrapper_py(
-    name = "batch_ops",
-    deps = [":batch_ops_op_lib"],
-)
-
-tf_kernel_library(
-    name = "batch_ops_kernels",
-    deps = [
-        "//tensorflow/contrib/batching/kernels:batch_kernels",
-        "//tensorflow/contrib/batching/util:periodic_function",
-        "//tensorflow/core/kernels:concat_lib",
-        "//tensorflow/core/kernels:ops_util",
-        "//tensorflow/core/kernels:split_lib",
-    ],
-    alwayslink = 1,
-)
-
-tf_custom_op_py_library(
+py_library(
     name = "batch_py",
     srcs = glob(["python/ops/*.py"]) + ["__init__.py"],
-    dso = [":python/ops/_batch_ops.so"],
-    kernels = [
-        ":batch_ops_kernels",
-        ":batch_ops_op_lib",
-    ],
     srcs_version = "PY2AND3",
     deps = [
-        ":batch_ops",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:batch_ops_gen",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:gradients",
@@ -188,6 +84,14 @@ tf_custom_op_py_library(
     ],
 )
 
+cc_library(
+    name = "batch_ops_kernels",
+    deps = [
+        "//tensorflow/core/kernels:batch_kernels",
+    ],
+    alwayslink = 1,
+)
+
 py_test(
     name = "batch_ops_test",
     size = "small",
@@ -203,6 +107,7 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework",
         "//tensorflow/python:gradients",
         "//tensorflow/python:script_ops",
     ],
diff --git a/tensorflow/contrib/batching/adaptive_shared_batch_scheduler.h b/tensorflow/contrib/batching/adaptive_shared_batch_scheduler.h
index 6ed177e001758ad8c566c7965e1ec10ae5235fc8..86250e6692004a12a1fa338767a5db1e4c2e4195 100644
--- a/tensorflow/contrib/batching/adaptive_shared_batch_scheduler.h
+++ b/tensorflow/contrib/batching/adaptive_shared_batch_scheduler.h
@@ -13,450 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
+#ifndef TENSORFLOW_CONTRIB_BATCHING_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
+#define TENSORFLOW_CONTRIB_BATCHING_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
 
-#include <functional>
-#include <memory>
-#include <queue>
-#include <unordered_map>
-#include <vector>
+#include "tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h"
 
-#include "tensorflow/contrib/batching/batch_scheduler.h"
-#include "tensorflow/contrib/batching/util/periodic_function.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/cpu_info.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/thread_annotations.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace tensorflow {
-namespace serving {
-namespace internal {
-template <typename TaskType>
-class ASBSBatch;
-
-template <typename TaskType>
-class ASBSQueue;
-}  // namespace internal
-
-// Shared batch scheduler designed to minimize latency. The scheduler keeps
-// track of a number of queues (one per model or model version) which are
-// continuously enqueuing requests. The scheduler groups the requests into
-// batches which it periodically sends off for processing (see
-// shared_batch_scheduler.h for more details). The AdaptiveSharedBatchScheduler
-// prioritizes batches by age (i.e. the batch's oldest request) irrespective of
-// queue. The scheduler will process the oldest batch at an adjustable rate,
-// regardless of batch size. The user can provide feedback to help set this rate
-// to achieve some goal (i.e. minimize overall latency, limit cpu usage, etc).
-//
-// The rate (or rather, the corresponding period) is adjusted each time a batch
-// is processed, using an exponentially weighted moving average to smooth
-// potentially noisy feedback:
-// ewma_feedback = ((N - 1) * ewma_feedback + feedback()) / N
-// period *= (1 + K * emwa_feedback)
-//
-// Some potential use cases:
-// Hardware Accelerators (GPUs & TPUs) - If some phase of batch processing
-//   involves serial processing by a device, from a latency perspective it is
-//   desirable to keep the device evenly loaded, avoiding the need to wait for
-//   the device to process prior batches.
-//   feedback = num_pending_on_device() - desired_pending.
-// CPU utilization - If the batch processing is cpu dominated, you can reap
-//   latency gains when underutilized by increasing the processing rate, but
-//   back the rate off when the load increases to avoid overload.
-//   feedback = cpu_rate() - desired_cpu_rate.
-
-template <typename TaskType>
-class AdaptiveSharedBatchScheduler
-    : public std::enable_shared_from_this<
-          AdaptiveSharedBatchScheduler<TaskType>> {
- public:
-  struct Options {
-    // The name to use for the pool of batch threads.
-    string thread_pool_name = {"batch_threads"};
-    // Number of batch processing threads; equivalently the maximum number of
-    // concurrently running batches.
-    int64 num_batch_threads = port::NumSchedulableCPUs();
-    // The environment to use (typically only overridden by test code).
-    Env* env = Env::Default();
-    // Initial batch scheduling period in microseconds. Will be altered for
-    // non-zero rate_feedback.
-    double initial_scheduling_period_micros = 500;
-    // Minimum batch scheduling period in microseconds. Recommend setting this
-    // value greater than 0, otherwise it may take a while to recover from a
-    // sustained time of negative scheduling_period_feedback (which may occur
-    // under low load).
-    double min_scheduling_period_micros = 100;
-    // Maximum batch scheduling period in microseconds.
-    double max_scheduling_period_micros = 10000;
-    // Feedback function used to modify the scheduling period each time a batch
-    // is scheduled.  Should return values roughly O(1), with positive values
-    // resulting in an increased period.
-    std::function<double()> scheduling_period_feedback{[] { return 0.; }};
-    // To handle potentially noisy scheduling_period_feedback, the period is
-    // adjusted using an exponentially weighted moving average over the previous
-    // feedback_smoothing_batches batches.  Must be greater than 0.
-    int64 feedback_smoothing_batches = 10;
-  };
-
-  // Ownership is shared between the caller of Create() and any queues created
-  // via AddQueue().
-  static Status Create(
-      const Options& options,
-      std::shared_ptr<AdaptiveSharedBatchScheduler<TaskType>>* scheduler);
-
-  struct QueueOptions {
-    // Maximum size of each batch.
-    int max_batch_size = 1000;
-    // Maximum number of enqueued (i.e. non-scheduled) batches.
-    int max_enqueued_batches = 10;
-  };
-
-  using BatchProcessor = std::function<void(std::unique_ptr<Batch<TaskType>>)>;
-
-  // Adds queue (and its callback) to be managed by this scheduler.
-  Status AddQueue(const QueueOptions& options,
-                  BatchProcessor process_batch_callback,
-                  std::unique_ptr<BatchScheduler<TaskType>>* queue);
-
- private:
-  // access to AddBatch, RemoveQueue, GetEnv.
-  friend class internal::ASBSQueue<TaskType>;
-
-  explicit AdaptiveSharedBatchScheduler(const Options& options);
-
-  // Batch scheduling function which runs every scheduling_period_ microseconds.
-  void ProcessOneBatch();
-
-  // Notifies scheduler of non-empty batch which is eligible for processing.
-  void AddBatch(internal::ASBSBatch<TaskType>*);
-
-  // Removes queue from scheduler.
-  void RemoveQueue(const internal::ASBSQueue<TaskType>* queue);
-
-  Env* GetEnv() const { return options_.env; }
-
-  const Options options_;
-
-  struct BatchCompare {
-    bool operator()(const internal::ASBSBatch<TaskType>* a,
-                    const internal::ASBSBatch<TaskType>* b);
-  };
-
-  // Collection of batches added by AddBatch, ordered by age. Owned by scheduler
-  // until they are released for processing.
-  std::priority_queue<const internal::ASBSBatch<TaskType>*,
-                      std::vector<internal::ASBSBatch<TaskType>*>, BatchCompare>
-      batches_ GUARDED_BY(mu_);
-
-  // Unowned queues and callbacks added by AddQueue.
-  std::unordered_map<const internal::ASBSQueue<TaskType>*, BatchProcessor>
-      queues_and_callbacks_ GUARDED_BY(mu_);
-
-  mutex mu_;
-
-  // Responsible for running ProcessOneBatch. PeriodicFunction was used in order
-  // to check for deletion so that the thread can be shut down.
-  std::unique_ptr<PeriodicFunction> scheduling_thread_;
-
-  // Responsible for running the batch processing callbacks.
-  std::unique_ptr<thread::ThreadPool> batch_thread_pool_;
-
-  // Time interval in microseconds between successive ProcessOneBatch calls.
-  double scheduling_period_;
-
-  // Exponentially weighted moving average of
-  // options_.scheduling_period_feedback() evaluated in each ProcessOneBatch
-  // call.
-  double ewma_feedback_ = 0;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(AdaptiveSharedBatchScheduler);
-};
-
-//////////////////////////////////////////////////////////
-// Implementation details follow. API users need not read.
-
-namespace internal {
-// Consolidates tasks into batches, passing them off to the
-// AdaptiveSharedBatchScheduler for processing.
-template <typename TaskType>
-class ASBSQueue : public BatchScheduler<TaskType> {
- public:
-  using QueueOptions =
-      typename AdaptiveSharedBatchScheduler<TaskType>::QueueOptions;
-
-  ASBSQueue(std::shared_ptr<AdaptiveSharedBatchScheduler<TaskType>> scheduler,
-            const QueueOptions& options);
-
-  ~ASBSQueue() override;
-
-  // Adds task to current batch. Fails if the task size is larger than the batch
-  // size or if the current batch is full and this queue's number of outstanding
-  // batches is at its maximum.
-  Status Schedule(std::unique_ptr<TaskType>* task) override;
-
-  // Number of tasks waiting to be scheduled.
-  size_t NumEnqueuedTasks() const override;
-
-  // Number of size 1 tasks which could currently be scheduled without failing.
-  size_t SchedulingCapacity() const override;
-
-  // Notifies queue that a batch is about to be scheduled; the queue should not
-  // place any more tasks in this batch.
-  void ReleaseBatch(const ASBSBatch<TaskType>* batch);
-
- private:
-  std::shared_ptr<AdaptiveSharedBatchScheduler<TaskType>> scheduler_;
-  const QueueOptions options_;
-  // Owned by scheduler_.
-  ASBSBatch<TaskType>* current_batch_ GUARDED_BY(mu_) = nullptr;
-  int64 num_enqueued_batches_ GUARDED_BY(mu_) = 0;
-  int64 num_enqueued_tasks_ GUARDED_BY(mu_) = 0;
-  mutable mutex mu_;
-  TF_DISALLOW_COPY_AND_ASSIGN(ASBSQueue);
-};
-
-// Batch which remembers when and by whom it was created.
-template <typename TaskType>
-class ASBSBatch : public Batch<TaskType> {
- public:
-  ASBSBatch(ASBSQueue<TaskType>* queue, int64 creation_time_micros)
-      : queue_(queue), creation_time_micros_(creation_time_micros) {}
-
-  ~ASBSBatch() override {}
-
-  ASBSQueue<TaskType>* queue() const { return queue_; }
-
-  int64 creation_time_micros() const { return creation_time_micros_; }
-
- private:
-  ASBSQueue<TaskType>* queue_;
-  const int64 creation_time_micros_;
-  TF_DISALLOW_COPY_AND_ASSIGN(ASBSBatch);
-};
-}  // namespace internal
-
-// ---------------- AdaptiveSharedBatchScheduler ----------------
-
-template <typename TaskType>
-Status AdaptiveSharedBatchScheduler<TaskType>::Create(
-    const Options& options,
-    std::shared_ptr<AdaptiveSharedBatchScheduler<TaskType>>* scheduler) {
-  if (options.num_batch_threads < 1) {
-    return errors::InvalidArgument("num_batch_threads must be positive; was ",
-                                   options.num_batch_threads);
-  }
-  if (options.min_scheduling_period_micros < 0) {
-    return errors::InvalidArgument(
-        "min_scheduling_period_micros must be >= 0; was ",
-        options.min_scheduling_period_micros);
-  }
-  if (options.min_scheduling_period_micros >
-      options.initial_scheduling_period_micros) {
-    return errors::InvalidArgument(
-        "initial_scheduling_period_micros (",
-        options.initial_scheduling_period_micros,
-        ") must be >= min_scheduling_period_micros (",
-        options.min_scheduling_period_micros, ")");
-  }
-  if (options.initial_scheduling_period_micros >
-      options.max_scheduling_period_micros) {
-    return errors::InvalidArgument(
-        "initial_scheduling_period_micros (",
-        options.initial_scheduling_period_micros,
-        ") must be <= max_scheduling_period_micros (",
-        options.max_scheduling_period_micros, ")");
-  }
-  if (options.feedback_smoothing_batches < 1) {
-    return errors::InvalidArgument(
-        "feedback_smoothing_batches must be positive; was ",
-        options.feedback_smoothing_batches);
-  }
-  scheduler->reset(new AdaptiveSharedBatchScheduler<TaskType>(options));
-  return Status::OK();
-}
-
-template <typename TaskType>
-AdaptiveSharedBatchScheduler<TaskType>::AdaptiveSharedBatchScheduler(
-    const Options& options)
-    : options_(options),
-      scheduling_period_(options.initial_scheduling_period_micros) {
-  PeriodicFunction::Options opts;
-  opts.thread_name_prefix = "scheduling_thread";
-  opts.env = GetEnv();
-  scheduling_thread_.reset(
-      new PeriodicFunction([this] { ProcessOneBatch(); }, 0, opts));
-  batch_thread_pool_.reset(new thread::ThreadPool(
-      GetEnv(), options.thread_pool_name, options.num_batch_threads));
-}
-
-template <typename TaskType>
-Status AdaptiveSharedBatchScheduler<TaskType>::AddQueue(
-    const QueueOptions& options, BatchProcessor process_batch_callback,
-    std::unique_ptr<BatchScheduler<TaskType>>* queue) {
-  if (options.max_batch_size <= 0) {
-    return errors::InvalidArgument("max_batch_size must be positive; was ",
-                                   options.max_batch_size);
-  }
-  if (options.max_enqueued_batches <= 0) {
-    return errors::InvalidArgument(
-        "max_enqueued_batches must be positive; was ",
-        options.max_enqueued_batches);
-  }
-  internal::ASBSQueue<TaskType>* asbs_queue_raw;
-  queue->reset(asbs_queue_raw = new internal::ASBSQueue<TaskType>(
-                   this->shared_from_this(), options));
-  mutex_lock l(mu_);
-  queues_and_callbacks_[asbs_queue_raw] = process_batch_callback;
-  return Status::OK();
-}
-
-template <typename TaskType>
-void AdaptiveSharedBatchScheduler<TaskType>::AddBatch(
-    internal::ASBSBatch<TaskType>* batch) {
-  mutex_lock l(mu_);
-  batches_.push(batch);
-}
-
-template <typename TaskType>
-void AdaptiveSharedBatchScheduler<TaskType>::RemoveQueue(
-    const internal::ASBSQueue<TaskType>* queue) {
-  mutex_lock l(mu_);
-  queues_and_callbacks_.erase(queue);
-}
-
-template <typename TaskType>
-void AdaptiveSharedBatchScheduler<TaskType>::ProcessOneBatch() {
-  static const double kFeedbackMultiplier = .001;
-  internal::ASBSBatch<TaskType>* batch = nullptr;
-  BatchProcessor callback;
-  const int64 start_time_micros = GetEnv()->NowMicros();
-  {
-    mutex_lock l(mu_);
-    if (!batches_.empty()) {
-      batch = batches_.top();
-      batches_.pop();
-      callback = queues_and_callbacks_[batch->queue()];
-    }
-  }
-  if (batch != nullptr) {
-    double feedback = options_.scheduling_period_feedback();
-    const int64 N = options_.feedback_smoothing_batches;
-    ewma_feedback_ = ((N - 1) * ewma_feedback_ + feedback) / N;
-    scheduling_period_ *= (1 + kFeedbackMultiplier * ewma_feedback_);
-    if (scheduling_period_ < options_.min_scheduling_period_micros) {
-      scheduling_period_ = options_.min_scheduling_period_micros;
-    } else if (scheduling_period_ > options_.max_scheduling_period_micros) {
-      scheduling_period_ = options_.max_scheduling_period_micros;
-    }
-    // Queue may destroy itself after ReleaseBatch is called.
-    batch->queue()->ReleaseBatch(batch);
-    batch_thread_pool_->Schedule([callback, batch] {
-      callback(std::unique_ptr<Batch<TaskType>>(batch));
-    });
-  }
-  const int64 sleep_time =
-      scheduling_period_ - (GetEnv()->NowMicros() - start_time_micros);
-  if (sleep_time > 0) {
-    GetEnv()->SleepForMicroseconds(sleep_time);
-  }
-}
-
-template <typename TaskType>
-bool AdaptiveSharedBatchScheduler<TaskType>::BatchCompare::operator()(
-    const internal::ASBSBatch<TaskType>* a,
-    const internal::ASBSBatch<TaskType>* b) {
-  return a->creation_time_micros() > b->creation_time_micros();
-}
-
-// ---------------- ASBSQueue ----------------
-
-namespace internal {
-template <typename TaskType>
-ASBSQueue<TaskType>::ASBSQueue(
-    std::shared_ptr<AdaptiveSharedBatchScheduler<TaskType>> scheduler,
-    const QueueOptions& options)
-    : scheduler_(scheduler), options_(options) {}
-
-template <typename TaskType>
-ASBSQueue<TaskType>::~ASBSQueue() {
-  // Wait until last batch has been scheduled.
-  const int kSleepMicros = 1000;
-  for (;;) {
-    {
-      mutex_lock l(mu_);
-      if (num_enqueued_batches_ == 0) {
-        break;
-      }
-    }
-    scheduler_->GetEnv()->SleepForMicroseconds(kSleepMicros);
-  }
-  scheduler_->RemoveQueue(this);
-}
-
-template <typename TaskType>
-Status ASBSQueue<TaskType>::Schedule(std::unique_ptr<TaskType>* task) {
-  ASBSBatch<TaskType>* new_batch = nullptr;
-  size_t size = (*task)->size();
-  if (size > options_.max_batch_size) {
-    return errors::InvalidArgument("Task size ", size,
-                                   " is larger than maximum batch size ",
-                                   options_.max_batch_size);
-  }
-  {
-    mutex_lock l(mu_);
-    // Current batch is full, create another if allowed.
-    if (current_batch_ &&
-        current_batch_->size() + size > options_.max_batch_size) {
-      if (num_enqueued_batches_ >= options_.max_enqueued_batches) {
-        return errors::Unavailable("The batch scheduling queue is full");
-      }
-      current_batch_->Close();
-      current_batch_ = nullptr;
-    }
-    if (!current_batch_) {
-      num_enqueued_batches_++;
-      current_batch_ = new_batch =
-          new ASBSBatch<TaskType>(this, scheduler_->GetEnv()->NowMicros());
-    }
-    current_batch_->AddTask(std::move(*task));
-    num_enqueued_tasks_++;
-  }
-  if (new_batch != nullptr) scheduler_->AddBatch(new_batch);
-  return Status::OK();
-}
-
-template <typename TaskType>
-void ASBSQueue<TaskType>::ReleaseBatch(const ASBSBatch<TaskType>* batch) {
-  mutex_lock l(mu_);
-  num_enqueued_batches_--;
-  num_enqueued_tasks_ -= batch->num_tasks();
-  if (batch == current_batch_) {
-    current_batch_->Close();
-    current_batch_ = nullptr;
-  }
-}
-
-template <typename TaskType>
-size_t ASBSQueue<TaskType>::NumEnqueuedTasks() const {
-  mutex_lock l(mu_);
-  return num_enqueued_tasks_;
-}
-
-template <typename TaskType>
-size_t ASBSQueue<TaskType>::SchedulingCapacity() const {
-  mutex_lock l(mu_);
-  const int current_batch_capacity =
-      current_batch_ ? options_.max_batch_size - current_batch_->size() : 0;
-  const int spare_batches =
-      options_.max_enqueued_batches - num_enqueued_batches_;
-  return spare_batches * options_.max_batch_size + current_batch_capacity;
-}
-}  // namespace internal
-}  // namespace serving
-}  // namespace tensorflow
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
+#endif  // TENSORFLOW_CONTRIB_BATCHING_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
diff --git a/tensorflow/contrib/batching/basic_batch_scheduler.h b/tensorflow/contrib/batching/basic_batch_scheduler.h
index 9d3805fbaf39978159dd2f4a754e6d41a07acf6a..d9b37da6933aa0847c229607f43d1d5d121a928c 100644
--- a/tensorflow/contrib/batching/basic_batch_scheduler.h
+++ b/tensorflow/contrib/batching/basic_batch_scheduler.h
@@ -13,252 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_BASIC_BATCH_SCHEDULER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_BASIC_BATCH_SCHEDULER_H_
+#ifndef TENSORFLOW_CONTRIB_BATCHING_BASIC_BATCH_SCHEDULER_H_
+#define TENSORFLOW_CONTRIB_BATCHING_BASIC_BATCH_SCHEDULER_H_
 
-#include <stddef.h>
-#include <cstddef>
-#include <functional>
-#include <memory>
-#include <string>
+#include "tensorflow/core/kernels/batching_util/basic_batch_scheduler.h"
 
-#include "tensorflow/contrib/batching/shared_batch_scheduler.h"
-
-namespace tensorflow {
-namespace serving {
-
-// A BatchScheduler implementation geared toward handling a single request type
-// running on a specific set of hardware resources. A typical scenario is one in
-// which all requests invoke the same machine-learned model on one GPU.
-//
-// If there are, say, two GPUs and two models each bound to one of the GPUs, one
-// could use two BasicBatchScheduler instances to schedule the two model/GPU
-// combinations independently. If multiple models must share a given GPU or
-// other hardware resource, consider using SharedBatchScheduler instead.
-//
-//
-// PARAMETERS AND BEHAVIOR:
-//
-// BasicBatchScheduler runs a fixed pool of threads, which it uses to process
-// batches of tasks. It enforces a maximum batch size, and enqueues a bounded
-// number of tasks. If the queue is nearly empty, such that a full batch cannot
-// be formed, when a thread becomes free, it anyway schedules a batch
-// immediately if a task has been in the queue for longer than a given timeout
-// parameter. If the timeout parameter is set to 0, then the batch threads will
-// always be kept busy (unless there are zero tasks waiting to be processed).
-//
-// For online serving, it is recommended to set the maximum number of enqueued
-// batches worth of tasks equal to the number of batch threads, which allows
-// enqueuing of enough tasks s.t. if every thread becomes available it can be
-// kept busy, but no more. For bulk processing jobs and throughput-oriented
-// benchmarks, you may want to set it much higher.
-//
-// When Schedule() is called, if the queue is full the call will fail with an
-// UNAVAILABLE error (after which the client may retry again later). If the call
-// succeeds, the maximum time the task will spend in the queue before being
-// placed in a batch and assigned to a thread for processing, is the greater of:
-//  - the maximum time to process ceil(max_enqueued_batches/num_batch_threads)
-//    (1 in the recommended configuration) batches of previously-submitted tasks
-//  - the configured timeout parameter (which can be 0, as mentioned above)
-//
-// Unlike StreamingBatchScheduler, when BasicBatchScheduler assigns a batch to a
-// thread, it closes the batch. The process-batch callback may assume that every
-// batch it receives is closed at the outset.
-//
-//
-// RECOMMENDED USE-CASES:
-//
-// BasicBatchScheduler is suitable for use-cases that feature a single kind of
-// request (e.g. a server performing inference with a single machine-learned
-// model, possibly evolving over time), with loose versioning semantics.
-// Concretely, the following conditions should hold:
-//
-//  A. All requests batched onto a given resource (e.g. a hardware accelerator,
-//     or a pool accelerators) are of the same type. For example, they all
-//     invoke the same machine-learned model.
-//
-//     These variations are permitted:
-//      - The model may reside in a single servable, or it may be spread across
-//        multiple servables that are used in unison (e.g. a vocabulary lookup
-//        table servable and a tensorflow session servable).
-//      - The model's servable(s) may be static, or they may evolve over time
-//        (successive servable versions).
-//      - Zero or more of the servables are used in the request thread; the rest
-//        are used in the batch thread. In our running example, the vocabulary
-//        lookups and tensorflow runs may both be performed in the batch thread,
-//        or alternatively the vocabulary lookup may occur in the request thread
-//        with only the tensorflow run performed in the batch thread.
-//
-//     In contrast, BasicBatchScheduler is not a good fit if the server
-//     hosts multiple distinct models running on a pool accelerators, with each
-//     request specifying which model it wants to use. BasicBatchScheduler
-//     has no facility to time-multiplex the batch threads across multiple
-//     models in a principled way. More basically, it cannot ensure that a given
-//     batch doesn't contain a mixture of requests for different models.
-//
-//  B. Requests do not specify a particular version of the servable(s) that must
-//     be used. Instead, each request is content to use the "latest" version.
-//
-//     BasicBatchScheduler does not constrain which requests get grouped
-//     together into a batch, so using this scheduler there is no way to achieve
-//     cohesion of versioned requests to version-specific batches.
-//
-//  C. No servable version coordination needs to be performed between the
-//     request threads and the batch threads. Often, servables are only used in
-//     the batch threads, in which case this condition trivially holds. If
-//     servables are used in both threads, then the use-case must tolerate
-//     version skew across the servables used in the two kinds of threads.
-//
-//
-// EXAMPLE USE-CASE FLOW:
-//
-// For such use-cases, request processing via BasicBatchScheduler generally
-// follows this flow (given for illustration; variations are possible):
-//  1. Optionally perform some pre-processing on each request in the request
-//     threads.
-//  2. Route the requests to the batch scheduler, as batching::Task objects.
-//     (Since all requests are of the same type and are not versioned, the
-//     scheduler is free to group them into batches arbitrarily.)
-//  3. Merge the requests into a single batched representation B.
-//  4. Obtain handles to the servable(s) needed to process B. The simplest
-//     approach is to obtain the latest version of each servable. Alternatively,
-//     if cross-servable consistency is required (e.g. the vocabulary lookup
-//     table's version number must match that of the tensorflow session),
-//     identify an appropriate version number and obtain the servable handles
-//     accordingly.
-//  5. Process B using the obtained servable handles, and split the result into
-//     individual per-request units.
-//  6. Perform any post-processing in the batch thread and/or request thread.
-//
-//
-// PERFORMANCE TUNING: See README.md.
-//
-template <typename TaskType>
-class BasicBatchScheduler : public BatchScheduler<TaskType> {
- public:
-  // TODO(b/25089730): Tune defaults based on best practices as they develop.
-  // (Keep them mirrored to the ones in SharedBatchScheduler::QueueOptions and
-  // SharedBatchScheduler::Options.)
-  struct Options {
-    // The maximum size of each batch.
-    //
-    // The scheduler may form batches of any size between 1 and this number
-    // (inclusive). If there is a need to quantize the batch sizes, i.e. only
-    // submit batches whose size is in a small set of allowed sizes, that can be
-    // done by adding padding in the process-batch callback.
-    int max_batch_size = 1000;
-
-    // If a task has been enqueued for this amount of time (in microseconds),
-    // and a thread is available, the scheduler will immediately form a batch
-    // from enqueued tasks and assign the batch to the thread for processing,
-    // even if the batch's size is below 'max_batch_size'.
-    //
-    // This parameter offers a way to bound queue latency, so that a task isn't
-    // stuck in the queue indefinitely waiting for enough tasks to arrive to
-    // make a full batch. (The latency bound is given in the class documentation
-    // above.)
-    //
-    // The goal is to smooth out batch sizes under low request rates, and thus
-    // avoid latency spikes.
-    int64 batch_timeout_micros = 0;
-
-    // The name to use for the pool of batch threads.
-    string thread_pool_name = {"batch_threads"};
-
-    // The number of threads to use to process batches.
-    // Must be >= 1, and should be tuned carefully.
-    int num_batch_threads = port::NumSchedulableCPUs();
-
-    // The maximum allowable number of enqueued (accepted by Schedule() but
-    // not yet being processed on a batch thread) tasks in terms of batches.
-    // If this limit is reached, Schedule() will return an UNAVAILABLE error.
-    // See the class documentation above for guidelines on how to tune this
-    // parameter.
-    int max_enqueued_batches = 10;
-
-    // The following options are typically only overridden by test code.
-
-    // The environment to use.
-    Env* env = Env::Default();
-  };
-  static Status Create(const Options& options,
-                       std::function<void(std::unique_ptr<Batch<TaskType>>)>
-                           process_batch_callback,
-                       std::unique_ptr<BasicBatchScheduler>* scheduler);
-
-  ~BasicBatchScheduler() override = default;
-
-  Status Schedule(std::unique_ptr<TaskType>* task) override;
-  size_t NumEnqueuedTasks() const override;
-  size_t SchedulingCapacity() const override;
-
- private:
-  explicit BasicBatchScheduler(
-      std::unique_ptr<BatchScheduler<TaskType>> shared_scheduler_queue);
-
-  // This class is merely a thin wrapper around a SharedBatchScheduler with a
-  // single queue.
-  std::unique_ptr<BatchScheduler<TaskType>> shared_scheduler_queue_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(BasicBatchScheduler);
-};
-
-//////////
-// Implementation details follow. API users need not read.
-
-template <typename TaskType>
-Status BasicBatchScheduler<TaskType>::Create(
-    const Options& options,
-    std::function<void(std::unique_ptr<Batch<TaskType>>)>
-        process_batch_callback,
-    std::unique_ptr<BasicBatchScheduler>* scheduler) {
-  typename SharedBatchScheduler<TaskType>::Options shared_scheduler_options;
-  shared_scheduler_options.thread_pool_name = options.thread_pool_name;
-  shared_scheduler_options.num_batch_threads = options.num_batch_threads;
-  shared_scheduler_options.env = options.env;
-  std::shared_ptr<SharedBatchScheduler<TaskType>> shared_scheduler;
-  TF_RETURN_IF_ERROR(SharedBatchScheduler<TaskType>::Create(
-      shared_scheduler_options, &shared_scheduler));
-
-  typename SharedBatchScheduler<TaskType>::QueueOptions
-      shared_scheduler_queue_options;
-  shared_scheduler_queue_options.max_batch_size = options.max_batch_size;
-  shared_scheduler_queue_options.batch_timeout_micros =
-      options.batch_timeout_micros;
-  shared_scheduler_queue_options.max_enqueued_batches =
-      options.max_enqueued_batches;
-  std::unique_ptr<BatchScheduler<TaskType>> shared_scheduler_queue;
-  TF_RETURN_IF_ERROR(shared_scheduler->AddQueue(shared_scheduler_queue_options,
-                                                process_batch_callback,
-                                                &shared_scheduler_queue));
-
-  scheduler->reset(
-      new BasicBatchScheduler<TaskType>(std::move(shared_scheduler_queue)));
-  return Status::OK();
-}
-
-template <typename TaskType>
-Status BasicBatchScheduler<TaskType>::Schedule(
-    std::unique_ptr<TaskType>* task) {
-  return shared_scheduler_queue_->Schedule(task);
-}
-
-template <typename TaskType>
-size_t BasicBatchScheduler<TaskType>::NumEnqueuedTasks() const {
-  return shared_scheduler_queue_->NumEnqueuedTasks();
-}
-
-template <typename TaskType>
-size_t BasicBatchScheduler<TaskType>::SchedulingCapacity() const {
-  return shared_scheduler_queue_->SchedulingCapacity();
-}
-
-template <typename TaskType>
-BasicBatchScheduler<TaskType>::BasicBatchScheduler(
-    std::unique_ptr<BatchScheduler<TaskType>> shared_scheduler_queue)
-    : shared_scheduler_queue_(std::move(shared_scheduler_queue)) {}
-
-}  // namespace serving
-}  // namespace tensorflow
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_BASIC_BATCH_SCHEDULER_H_
+#endif  // TENSORFLOW_CONTRIB_BATCHING_BASIC_BATCH_SCHEDULER_H_
diff --git a/tensorflow/contrib/batching/batch_scheduler.h b/tensorflow/contrib/batching/batch_scheduler.h
index a5072f439abad3c5db79a514a7f2baff0b021b39..8e94e1fd8b969d4fef8dbc8c322557f9da3833e6 100644
--- a/tensorflow/contrib/batching/batch_scheduler.h
+++ b/tensorflow/contrib/batching/batch_scheduler.h
@@ -13,264 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// Abstractions for processing small tasks in a batched fashion, to reduce
-// processing times and costs that can be amortized across multiple tasks.
-//
-// The core class is BatchScheduler, which groups tasks into batches.
-//
-// BatchScheduler encapsulates logic for aggregating multiple tasks into a
-// batch, and kicking off processing of a batch on a thread pool it manages.
-//
-// This file defines an abstract BatchScheduler class.
+#ifndef TENSORFLOW_CONTRIB_BATCHING_BATCH_SCHEDULER_H_
+#define TENSORFLOW_CONTRIB_BATCHING_BATCH_SCHEDULER_H_
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_BATCH_SCHEDULER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_BATCH_SCHEDULER_H_
+#include "tensorflow/core/kernels/batching_util/batch_scheduler.h"
 
-#include <stddef.h>
-#include <algorithm>
-#include <functional>
-#include <memory>
-#include <utility>
-#include <vector>
-
-#include "tensorflow/core/lib/core/notification.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/thread_annotations.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace tensorflow {
-namespace serving {
-
-// The abstract superclass for a unit of work to be done as part of a batch.
-//
-// An implementing subclass typically contains (or points to):
-//  (a) input data;
-//  (b) a thread-safe completion signal (e.g. a Notification);
-//  (c) a place to store the outcome (success, or some error), upon completion;
-//  (d) a place to store the output data, upon success.
-//
-// Items (b), (c) and (d) are typically non-owned pointers to data homed
-// elsewhere, because a task's ownership gets transferred to a BatchScheduler
-// (see below) and it may be deleted as soon as it is done executing.
-class BatchTask {
- public:
-  virtual ~BatchTask() = default;
-
-  // Returns the size of the task, in terms of how much it contributes to the
-  // size of a batch. (A batch's size is the sum of its task sizes.)
-  virtual size_t size() const = 0;
-};
-
-// A thread-safe collection of BatchTasks, to be executed together in some
-// fashion.
-//
-// At a given time, a batch is either "open" or "closed": an open batch can
-// accept new tasks; a closed one cannot. A batch is monotonic: initially it is
-// open and tasks can be added to it; then it is closed and its set of tasks
-// remains fixed for the remainder of its life. A closed batch cannot be re-
-// opened. Tasks can never be removed from a batch.
-//
-// Type parameter TaskType must be a subclass of BatchTask.
-template <typename TaskType>
-class Batch {
- public:
-  Batch() = default;
-  virtual ~Batch();  // Blocks until the batch is closed.
-
-  // Appends 'task' to the batch. After calling AddTask(), the newly-added task
-  // can be accessed via task(num_tasks()-1) or mutable_task(num_tasks()-1).
-  // Dies if the batch is closed.
-  void AddTask(std::unique_ptr<TaskType> task);
-
-  // Removes the most recently added task. Returns nullptr if the batch is
-  // empty.
-  std::unique_ptr<TaskType> RemoveTask();
-
-  // Returns the number of tasks in the batch.
-  int num_tasks() const;
-
-  // Returns true iff the batch contains 0 tasks.
-  bool empty() const;
-
-  // Returns a reference to the ith task (in terms of insertion order).
-  const TaskType& task(int i) const;
-
-  // Returns a pointer to the ith task (in terms of insertion order).
-  TaskType* mutable_task(int i);
-
-  // Returns the sum of the task sizes.
-  size_t size() const;
-
-  // Returns true iff the batch is currently closed.
-  bool IsClosed() const;
-
-  // Blocks until the batch is closed.
-  void WaitUntilClosed() const;
-
-  // Marks the batch as closed. Dies if called more than once.
-  void Close();
-
- private:
-  mutable mutex mu_;
-
-  // The tasks in the batch.
-  std::vector<std::unique_ptr<TaskType>> tasks_ GUARDED_BY(mu_);
-
-  // The sum of the sizes of the tasks in 'tasks_'.
-  size_t size_ GUARDED_BY(mu_) = 0;
-
-  // Whether the batch has been closed.
-  Notification closed_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(Batch);
-};
-
-// An abstract batch scheduler class. Collects individual tasks into batches,
-// and processes each batch on a pool of "batch threads" that it manages. The
-// actual logic for processing a batch is accomplished via a callback.
-//
-// Type parameter TaskType must be a subclass of BatchTask.
-template <typename TaskType>
-class BatchScheduler {
- public:
-  virtual ~BatchScheduler() = default;
-
-  // Submits a task to be processed as part of a batch.
-  //
-  // Ownership of '*task' is transferred to the callee iff the method returns
-  // Status::OK. In that case, '*task' is left as nullptr. Otherwise, '*task' is
-  // left as-is.
-  //
-  // If no batch processing capacity is available to process this task at the
-  // present time, and any task queue maintained by the implementing subclass is
-  // full, this method returns an UNAVAILABLE error code. The client may retry
-  // later.
-  //
-  // Other problems, such as the task size being larger than the maximum batch
-  // size, yield other, permanent error types.
-  //
-  // In all cases, this method returns "quickly" without blocking for any
-  // substantial amount of time. If the method returns Status::OK, the task is
-  // processed asynchronously, and any errors that occur during the processing
-  // of the batch that includes the task can be reported to 'task'.
-  virtual Status Schedule(std::unique_ptr<TaskType>* task) = 0;
-
-  // Returns the number of tasks that have been scheduled (i.e. accepted by
-  // Schedule()), but have yet to be handed to a thread for execution as part of
-  // a batch. Note that this returns the number of tasks, not the aggregate task
-  // size (so if there is one task of size 3 and one task of size 5, this method
-  // returns 2 rather than 8).
-  virtual size_t NumEnqueuedTasks() const = 0;
-
-  // Returns a guaranteed number of size 1 tasks that can be Schedule()d without
-  // getting an UNAVAILABLE error. In a typical implementation, returns the
-  // available space on a queue.
-  //
-  // There are two important caveats:
-  //  1. The guarantee does not extend to varying-size tasks due to possible
-  //     internal fragmentation of batches.
-  //  2. The guarantee only holds in a single-thread environment or critical
-  //     section, i.e. if an intervening thread cannot call Schedule().
-  //
-  // This method is useful for monitoring, or for guaranteeing a future slot in
-  // the schedule (but being mindful about the caveats listed above).
-  virtual size_t SchedulingCapacity() const = 0;
-};
-
-//////////
-// Implementation details follow. API users need not read.
-
-template <typename TaskType>
-Batch<TaskType>::~Batch() {
-  WaitUntilClosed();
-}
-
-template <typename TaskType>
-void Batch<TaskType>::AddTask(std::unique_ptr<TaskType> task) {
-  DCHECK(!IsClosed());
-  {
-    mutex_lock l(mu_);
-    size_ += task->size();
-    tasks_.push_back(std::move(task));
-  }
-}
-
-template <typename TaskType>
-std::unique_ptr<TaskType> Batch<TaskType>::RemoveTask() {
-  {
-    mutex_lock l(mu_);
-    if (tasks_.empty()) {
-      return nullptr;
-    }
-    std::unique_ptr<TaskType> task = std::move(tasks_.back());
-    tasks_.pop_back();
-    return task;
-  }
-}
-
-template <typename TaskType>
-int Batch<TaskType>::num_tasks() const {
-  {
-    mutex_lock l(mu_);
-    return tasks_.size();
-  }
-}
-
-template <typename TaskType>
-bool Batch<TaskType>::empty() const {
-  {
-    mutex_lock l(mu_);
-    return tasks_.empty();
-  }
-}
-
-template <typename TaskType>
-const TaskType& Batch<TaskType>::task(int i) const {
-  DCHECK_GE(i, 0);
-  {
-    mutex_lock l(mu_);
-    DCHECK_LT(i, tasks_.size());
-    return *tasks_[i].get();
-  }
-}
-
-template <typename TaskType>
-TaskType* Batch<TaskType>::mutable_task(int i) {
-  DCHECK_GE(i, 0);
-  {
-    mutex_lock l(mu_);
-    DCHECK_LT(i, tasks_.size());
-    return tasks_[i].get();
-  }
-}
-
-template <typename TaskType>
-size_t Batch<TaskType>::size() const {
-  {
-    mutex_lock l(mu_);
-    return size_;
-  }
-}
-
-template <typename TaskType>
-bool Batch<TaskType>::IsClosed() const {
-  return const_cast<Notification*>(&closed_)->HasBeenNotified();
-}
-
-template <typename TaskType>
-void Batch<TaskType>::WaitUntilClosed() const {
-  const_cast<Notification*>(&closed_)->WaitForNotification();
-}
-
-template <typename TaskType>
-void Batch<TaskType>::Close() {
-  closed_.Notify();
-}
-
-}  // namespace serving
-}  // namespace tensorflow
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_BATCH_SCHEDULER_H_
+#endif  // TENSORFLOW_CONTRIB_BATCHING_BATCH_SCHEDULER_H_
diff --git a/tensorflow/contrib/batching/kernels/BUILD b/tensorflow/contrib/batching/kernels/BUILD
deleted file mode 100644
index 6e53dd9a5fc0201c5ed91d1eaf07f940e341fb5e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/batching/kernels/BUILD
+++ /dev/null
@@ -1,34 +0,0 @@
-# Description:
-#   Contains kernels for the batching ops.
-
-package(default_visibility = ["//tensorflow:__subpackages__"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-cc_library(
-    name = "batch_kernels",
-    srcs = ["batch_kernels.cc"],
-    deps = [
-        "//tensorflow/contrib/batching:shared_batch_scheduler_hdrs",
-        "//tensorflow/contrib/batching/util:periodic_function_dynamic",
-        "//tensorflow/core:framework_headers_lib",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/kernels:concat_lib_hdrs",
-        "//tensorflow/core/kernels:ops_util_hdrs",
-        "//tensorflow/core/kernels:split_lib_hdrs",
-    ],
-    alwayslink = 1,
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/batching/ops/batch_ops.cc b/tensorflow/contrib/batching/ops/batch_ops.cc
deleted file mode 100644
index 85e0ccba4aa372bdc21fb194263569b8b787bb6c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/batching/ops/batch_ops.cc
+++ /dev/null
@@ -1,164 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/framework/common_shape_fns.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/shape_inference.h"
-
-namespace tensorflow {
-
-REGISTER_OP("Batch")
-    .Input("in_tensors: T")
-    .Output("batched_tensors: T")
-    .Output("batch_index: int64")
-    .Output("id: int64")
-    .Attr("num_batch_threads: int")
-    .Attr("max_batch_size: int")
-    .Attr("batch_timeout_micros: int")
-    .Attr("allowed_batch_sizes: list(int) = []")
-    .Attr("grad_timeout_micros: int")
-    .Attr("container: string = ''")
-    .Attr("shared_name: string = ''")
-    .Attr("batching_queue: string = ''")
-    .Attr("T: list(type)")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      std::vector<shape_inference::ShapeHandle> in_shapes;
-      TF_RETURN_IF_ERROR(c->input("in_tensors", &in_shapes));
-      std::vector<shape_inference::ShapeHandle> out_shapes(in_shapes.size());
-      for (int i = 0; i < in_shapes.size(); ++i) {
-        TF_RETURN_IF_ERROR(
-            c->ReplaceDim(in_shapes[i], 0, c->UnknownDim(), &out_shapes[i]));
-      }
-      TF_RETURN_IF_ERROR(c->set_output("batched_tensors", out_shapes));
-      TF_RETURN_IF_ERROR(c->set_output("id", {c->Scalar()}));
-      TF_RETURN_IF_ERROR(c->set_output(
-          "batch_index",
-          {c->MakeShape({shape_inference::DimensionOrConstant(c->UnknownDim()),
-                         shape_inference::DimensionOrConstant(3)})}));
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Batches all input tensors nondeterministically.
-
-When many instances of this Op are being run concurrently with the same
-container/shared_name in the same device, some will output zero-shaped Tensors
-and others will output Tensors of size up to max_batch_size.
-
-All Tensors in in_tensors are batched together (so, for example, labels and
-features should be batched with a single instance of this operation.
-
-Each invocation of batch emits an `id` scalar which will be used to identify
-this particular invocation when doing unbatch or its gradient.
-
-Each op which emits a non-empty batch will also emit a non-empty batch_index
-Tensor, which, is a [K, 3] matrix where each row contains the invocation's id,
-start, and length of elements of each set of Tensors present in batched_tensors.
-
-Batched tensors are concatenated along the first dimension, and all tensors in
-in_tensors must have the first dimension of the same size.
-
-in_tensors: The tensors to be batched.
-num_batch_threads: Number of scheduling threads for processing batches of work.
- Determines the number of batches processed in parallel.
-max_batch_size: Batch sizes will never be bigger than this.
-batch_timeout_micros: Maximum number of microseconds to wait before outputting
- an incomplete batch.
-allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, does
- nothing. Otherwise, supplies a list of batch sizes, causing the op to pad
- batches up to one of those sizes. The entries must increase monotonically, and
- the final entry must equal max_batch_size.
-grad_timeout_micros: The timeout to use for the gradient. See Unbatch.
-batched_tensors: Either empty tensors or a batch of concatenated Tensors.
-batch_index: If out_tensors is non-empty, has information to invert it.
-container: Controls the scope of sharing of this batch.
-id: always contains a scalar with a unique ID for this invocation of Batch.
-shared_name: Concurrently running instances of batch in the same device with the
- same container and shared_name will batch their elements together. If left
- empty, the op name will be used as the shared name.
-T: the types of tensors to be batched.
-)doc");
-
-REGISTER_OP("Unbatch")
-    .Input("batched_tensor: T")
-    .Input("batch_index: int64")
-    .Input("id: int64")
-    .Output("unbatched_tensor: T")
-    .Attr("timeout_micros: int")
-    .Attr("container: string = ''")
-    .Attr("shared_name: string = ''")
-    .Attr("T: type")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle out_shape;
-      TF_RETURN_IF_ERROR(
-          c->ReplaceDim(c->input(0), 0, c->UnknownDim(), &out_shape));
-      c->set_output(0, out_shape);
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Reverses the operation of Batch for a single output Tensor.
-
-An instance of Unbatch either receives an empty batched_tensor, in which case it
-asynchronously waits until the values become available from a concurrently
-running instance of Unbatch with the same container and shared_name, or receives
-a non-empty batched_tensor in which case it finalizes all other concurrently
-running instances and outputs its own element from the batch.
-
-batched_tensor: The possibly transformed output of Batch. The size of the first
- dimension should remain unchanged by the transformations for the operation to
- work.
-batch_index: The matching batch_index obtained from Batch.
-id: The id scalar emitted by Batch.
-unbatched_tensor: The Tensor corresponding to this execution.
-timeout_micros: Maximum amount of time (in microseconds) to wait to receive the
- batched input tensor associated with a given invocation of the op.
-container: Container to control resource sharing.
-shared_name: Instances of Unbatch with the same container and shared_name are
- assumed to possibly belong to the same batch. If left empty, the op name will
- be used as the shared name.
-)doc");
-
-REGISTER_OP("UnbatchGrad")
-    .Input("original_input: T")
-    .Input("batch_index: int64")
-    .Input("grad: T")
-    .Input("id: int64")
-    .Output("batched_grad: T")
-    .Attr("container: string = ''")
-    .Attr("shared_name: string = ''")
-    .Attr("T: type")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      c->set_output(0, c->UnknownShapeOfRank(c->Rank(c->input(2))));
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Gradient of Unbatch.
-
-Acts like Batch but using the given batch_index index of batching things as they
-become available. This ensures that the gradients are propagated back in the
-same session which did the forward pass.
-
-original_input: The input to the Unbatch operation this is the gradient of.
-batch_index: The batch_index given to the Unbatch operation this is the gradient
-of.
-grad: The downstream gradient.
-id: The id scalar emitted by Batch.
-batched_grad: The return value, either an empty tensor or the batched gradient.
-container: Container to control resource sharing.
-shared_name: Instances of UnbatchGrad with the same container and shared_name
- are assumed to possibly belong to the same batch. If left empty, the op name
- will be used as the shared name.
-  )doc");
-
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/batching/python/ops/batch_ops.py b/tensorflow/contrib/batching/python/ops/batch_ops.py
index cee4d7b4a9710e285957f27ace7c2762c473c5c7..921d6917a4e478c3e60771fdc3ae99febc33d2e3 100644
--- a/tensorflow/contrib/batching/python/ops/batch_ops.py
+++ b/tensorflow/contrib/batching/python/ops/batch_ops.py
@@ -18,18 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.batching.ops import gen_batch_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_batch_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
-from tensorflow.contrib.batching.ops.gen_batch_ops import *
+from tensorflow.python.ops.gen_batch_ops import *
 # pylint: enable=wildcard-import
-from tensorflow.contrib.util import loader
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import resource_loader
-
-
-_batch_ops = loader.load_op_library(
-    resource_loader.get_path_to_datafile("_batch_ops.so"))
 
 
 @ops.RegisterGradient("Batch")
@@ -59,10 +53,13 @@ def _UnbatchGrad(op, grad):   # pylint: disable=invalid-name
   ]
 
 
-def batch_function(num_batch_threads, max_batch_size, batch_timeout_micros,
+def batch_function(num_batch_threads,
+                   max_batch_size,
+                   batch_timeout_micros,
                    allowed_batch_sizes=None,
                    grad_timeout_micros=60 * 1000 * 1000,
-                   unbatch_timeout_micros=60 * 1000 * 1000):
+                   unbatch_timeout_micros=60 * 1000 * 1000,
+                   max_enqueued_batches=10):
   """Batches the computation done by the decorated function.
 
   So, for example, in the following code
@@ -100,6 +97,7 @@ def batch_function(num_batch_threads, max_batch_size, batch_timeout_micros,
      documentation of the unbatch op for more details. Defaults to 60s.
     unbatch_timeout_micros: The timeout to use for unbatching. See the
      documentation of the unbatch op for more details. Defaults to 60s.
+    max_enqueued_batches: The maximum depth of the batch queue. Defaults to 10.
 
   Returns:
     The decorated function will return the unbatched computation output Tensors.
@@ -117,6 +115,7 @@ def batch_function(num_batch_threads, max_batch_size, batch_timeout_micros,
             num_batch_threads=num_batch_threads,
             max_batch_size=max_batch_size,
             batch_timeout_micros=batch_timeout_micros,
+            max_enqueued_batches=max_enqueued_batches,
             allowed_batch_sizes=allowed_batch_sizes,
             grad_timeout_micros=grad_timeout_micros,
             shared_name=name)
diff --git a/tensorflow/contrib/batching/shared_batch_scheduler.h b/tensorflow/contrib/batching/shared_batch_scheduler.h
index 41a3f99137ade2552432fee62ddce17d064148a4..83a59695d7db7e0a24fb437a3ea71a4d9e23c93f 100644
--- a/tensorflow/contrib/batching/shared_batch_scheduler.h
+++ b/tensorflow/contrib/batching/shared_batch_scheduler.h
@@ -13,688 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_SHARED_BATCH_SCHEDULER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_SHARED_BATCH_SCHEDULER_H_
+#ifndef TENSORFLOW_CONTRIB_BATCHING_SHARED_BATCH_SCHEDULER_H_
+#define TENSORFLOW_CONTRIB_BATCHING_SHARED_BATCH_SCHEDULER_H_
 
-#include <stddef.h>
-#include <deque>
-#include <functional>
-#include <list>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
+#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
 
-#include "tensorflow/contrib/batching/batch_scheduler.h"
-#include "tensorflow/contrib/batching/util/periodic_function.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/cpu_info.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/thread_annotations.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace tensorflow {
-namespace serving {
-namespace internal {
-template <typename TaskType>
-class Queue;
-}  // namespace internal
-}  // namespace serving
-}  // namespace tensorflow
-
-namespace tensorflow {
-namespace serving {
-
-// A batch scheduler for server instances that service multiple request types
-// (e.g. multiple machine-learned models, or multiple versions of a model served
-// concurrently), or even multiple distinct tasks for a given request. The
-// scheduler multiplexes batches of different kinds of tasks onto a fixed-size
-// thread pool (each batch contains tasks of a single type), in a carefully
-// controlled manner. A common configuration is to set the number of threads
-// equal to the number of hardware accelerator units, in which case the
-// scheduler takes care of multiplexing the task types onto the shared hardware,
-// in a manner that is both fair and efficient.
-//
-// Semantically, SharedBatchScheduler behaves like having N instances of
-// BasicBatchScheduler (see basic_batch_scheduler.h), one per task type. The
-// difference is that under the covers there is a single shared thread pool,
-// instead of N independent ones, with their sharing deliberately coordinated.
-//
-// SharedBatchScheduler does not implement the BatchScheduler API; rather, it
-// presents an abstraction of "queues", where each queue coresponds to one type
-// of task. Tasks submitted to a given queue are placed in their own batches,
-// and cannot be mixed with other tasks. Queues can be added and deleted
-// dynamically, to accommodate e.g. versions of a model being brought up and
-// down over the lifetime of a server.
-//
-// The batch thread pool round-robins through the queues, running one batch
-// from a queue and then moving to the next queue. Each queue behaves like a
-// BasicBatchScheduler instance, in the sense that it has maximum batch size and
-// timeout parameters, which govern when a batch is eligible to be processed.
-//
-// Each queue is independently configured with a maximum size (in terms of the
-// maximum number of batches worth of enqueued tasks). For online serving, it is
-// recommended that the queue sizes be configured such that the sum of the sizes
-// of the active queues roughly equal the number of batch threads. (The idea is
-// that if all threads become available at roughly the same time, there will be
-// enough enqueued work for them to take on, but no more.)
-//
-// If queue sizes are configured in the manner suggested above, the maximum time
-// a task can spend in a queue before being placed in a batch and assigned to a
-// thread for processing, is the greater of:
-//  - the maximum time to process one batch of tasks from any active queue
-//  - the configured timeout parameter for the task's queue (which can be 0)
-//
-// For bulk processing jobs and throughput-oriented benchmarks, you may want to
-// set the maximum queue size to a large value.
-//
-// TODO(b/26539183): Support queue servicing policies other than round-robin.
-// E.g. let each queue specify a "share" (an int >= 1), so e.g. with queues A
-// and B having shares 1 and 2 respectively, the servicing pattern is ABBABB...
-//
-//
-// PERFORMANCE TUNING: See README.md.
-//
-template <typename TaskType>
-class SharedBatchScheduler
-    : public std::enable_shared_from_this<SharedBatchScheduler<TaskType>> {
- public:
-  // TODO(b/25089730): Tune defaults based on best practices as they develop.
-  struct Options {
-    // The name to use for the pool of batch threads.
-    string thread_pool_name = {"batch_threads"};
-
-    // The number of threads to use to process batches.
-    // Must be >= 1, and should be tuned carefully.
-    int num_batch_threads = port::NumSchedulableCPUs();
-
-    // The environment to use.
-    // (Typically only overridden by test code.)
-    Env* env = Env::Default();
-  };
-  // Ownership is shared between the caller of Create() and any queues created
-  // via AddQueue().
-  static Status Create(
-      const Options& options,
-      std::shared_ptr<SharedBatchScheduler<TaskType>>* scheduler);
-
-  ~SharedBatchScheduler();
-
-  // Adds a queue to which tasks may be submitted. The returned queue implements
-  // the BatchScheduler API. Each queue has its own set of scheduling options,
-  // and its own callback to process batches of tasks submitted to the queue.
-  //
-  // The returned queue's destructor blocks until all tasks submitted to it have
-  // been processed.
-  struct QueueOptions {
-    // The maximum size of each batch.
-    //
-    // The scheduler may form batches of any size between 1 and this number
-    // (inclusive). If there is a need to quantize the batch sizes, i.e. only
-    // submit batches whose size is in a small set of allowed sizes, that can be
-    // done by adding padding in the process-batch callback.
-    int max_batch_size = 1000;
-
-    // If a task has been enqueued for this amount of time (in microseconds),
-    // and a thread is available, the scheduler will immediately form a batch
-    // from enqueued tasks and assign the batch to the thread for processing,
-    // even if the batch's size is below 'max_batch_size'.
-    //
-    // This parameter offers a way to bound queue latency, so that a task isn't
-    // stuck in the queue indefinitely waiting for enough tasks to arrive to
-    // make a full batch. (The latency bound is given in the class documentation
-    // above.)
-    //
-    // The goal is to smooth out batch sizes under low request rates, and thus
-    // avoid latency spikes.
-    int64 batch_timeout_micros = 0;
-
-    // The maximum allowable number of enqueued (accepted by Schedule() but
-    // not yet being processed on a batch thread) tasks in terms of batches.
-    // If this limit is reached, Schedule() will return an UNAVAILABLE error.
-    // See the class documentation above for guidelines on how to tune this
-    // parameter.
-    int max_enqueued_batches = 10;
-  };
-  Status AddQueue(const QueueOptions& options,
-                  std::function<void(std::unique_ptr<Batch<TaskType>>)>
-                      process_batch_callback,
-                  std::unique_ptr<BatchScheduler<TaskType>>* queue);
-
- private:
-  explicit SharedBatchScheduler(const Options& options);
-
-  // The code executed in 'batch_threads_'. Obtains a batch to process from the
-  // queue pointed to by 'next_queue_to_schedule_', and processes it. If that
-  // queue declines to provide a batch to process, moves onto the next queue. If
-  // no queues provide a batch to process, just sleeps briefly and exits.
-  void ThreadLogic();
-
-  const Options options_;
-
-  mutex mu_;
-
-  // A list of queues. (We use std::list instead of std::vector to ensure that
-  // iterators are not invalidated by adding/removing elements. It also offers
-  // efficient removal of elements from the middle.)
-  using QueueList = std::list<std::unique_ptr<internal::Queue<TaskType>>>;
-
-  // All "active" queues, i.e. ones that either:
-  //  - have not been removed, or
-  //  - have been removed but are not yet empty.
-  QueueList queues_ GUARDED_BY(mu_);
-
-  // An iterator over 'queues_', pointing to the queue from which the next
-  // available batch thread should grab work.
-  typename QueueList::iterator next_queue_to_schedule_ GUARDED_BY(mu_);
-
-  // Used by idle batch threads to wait for work to enter the system. Notified
-  // whenever a batch becomes schedulable.
-  condition_variable schedulable_batch_cv_;
-
-  // Threads that process batches obtained from the queues.
-  std::vector<std::unique_ptr<PeriodicFunction>> batch_threads_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(SharedBatchScheduler);
-};
-
-//////////
-// Implementation details follow. API users need not read.
-
-namespace internal {
-
-// A task queue for SharedBatchScheduler. Accepts tasks and accumulates them
-// into batches, and dispenses those batches to be processed via a "pull"
-// interface. The queue's behavior is governed by maximum batch size, timeout
-// and maximum queue length parameters; see their documentation in
-// SharedBatchScheduler.
-//
-// The queue is implemented as a deque of batches, with these invariants:
-//  - The number of batches is between 1 and 'options_.max_enqueued_batches'.
-//  - The back-most batch is open; the rest are closed.
-//
-// Submitted tasks are added to the open batch. If that batch doesn't have room
-// but the queue isn't full, then that batch is closed and a new open batch is
-// started.
-//
-// Batch pull requests are handled by dequeuing the front-most batch if it is
-// closed. If the front-most batch is open (i.e. the queue contains only one
-// batch) and has reached the timeout, it is immediately closed and returned;
-// otherwise no batch is returned for the request.
-template <typename TaskType>
-class Queue {
- public:
-  using ProcessBatchCallback =
-      std::function<void(std::unique_ptr<Batch<TaskType>>)>;
-  using SchedulableBatchCallback = std::function<void()>;
-  Queue(const typename SharedBatchScheduler<TaskType>::QueueOptions& options,
-        Env* env, ProcessBatchCallback process_batch_callback,
-        SchedulableBatchCallback schdulable_batch_callback);
-
-  // Illegal to destruct unless the queue is empty.
-  ~Queue();
-
-  // Submits a task to the queue, with the same semantics as
-  // BatchScheduler::Schedule().
-  Status Schedule(std::unique_ptr<TaskType>* task);
-
-  // Returns the number of enqueued tasks, with the same semantics as
-  // BatchScheduler::NumEnqueuedTasks().
-  size_t NumEnqueuedTasks() const;
-
-  // Returns the queue capacity, with the same semantics as
-  // BatchScheduler::SchedulingCapacity().
-  size_t SchedulingCapacity() const;
-
-  // Called by a thread that is ready to process a batch, to request one from
-  // this queue. Either returns a batch that is ready to be processed, or
-  // nullptr if the queue declines to schedule a batch at this time. If it
-  // returns a batch, the batch is guaranteed to be closed.
-  std::unique_ptr<Batch<TaskType>> ScheduleBatch();
-
-  // Processes a batch that has been returned earlier by ScheduleBatch().
-  void ProcessBatch(std::unique_ptr<Batch<TaskType>> batch);
-
-  // Determines whether the queue is empty, i.e. has no tasks waiting or being
-  // processed.
-  bool IsEmpty() const;
-
-  // Marks the queue closed, and waits until it is empty.
-  void CloseAndWaitUntilEmpty();
-
-  bool closed() const {
-    mutex_lock l(mu_);
-    return closed_;
-  }
-
- private:
-  // Same as IsEmpty(), but assumes the caller already holds a lock on 'mu_'.
-  bool IsEmptyInternal() const EXCLUSIVE_LOCKS_REQUIRED(mu_);
-
-  // Closes the open batch residing at the back of 'batches_', and inserts a
-  // fresh open batch behind it.
-  void StartNewBatch() EXCLUSIVE_LOCKS_REQUIRED(mu_);
-
-  // Determines whether the open batch residing at the back of 'batches_' is
-  // currently schedulable.
-  bool IsOpenBatchSchedulable() const EXCLUSIVE_LOCKS_REQUIRED(mu_);
-
-  const typename SharedBatchScheduler<TaskType>::QueueOptions options_;
-
-  // The environment to use.
-  Env* env_;
-
-  // A callback invoked to processes a batch of work units. Always invoked from
-  // a batch thread.
-  ProcessBatchCallback process_batch_callback_;
-
-  // A callback invoked to notify the scheduler that a new batch has become
-  // schedulable.
-  SchedulableBatchCallback schedulable_batch_callback_;
-
-  mutable mutex mu_;
-
-  // Whether this queue can accept new tasks. This variable is monotonic: it
-  // starts as false, and then at some point gets set to true and remains true
-  // for the duration of this object's life.
-  bool closed_ GUARDED_BY(mu_) = false;
-
-  // The enqueued batches. See the invariants in the class comments above.
-  std::deque<std::unique_ptr<Batch<TaskType>>> batches_ GUARDED_BY(mu_);
-
-  // The time at which the first task was added to the open (back-most) batch
-  // in 'batches_'. Valid iff that batch contains at least one task.
-  uint64 open_batch_start_time_micros_ GUARDED_BY(mu_);
-
-  // Whether this queue contains a batch that is eligible to be scheduled. Used
-  // to keep track of when to call 'schedulable_batch_callback_'.
-  bool schedulable_batch_ GUARDED_BY(mu_) = false;
-
-  // The number of batches currently being processed by batch threads.
-  // Incremented in ScheduleBatch() and decremented in ProcessBatch().
-  int num_batches_being_processed_ GUARDED_BY(mu_) = 0;
-
-  // Used by CloseAndWaitUntilEmpty() to wait until the queue is empty, for the
-  // case in which the queue is not empty when CloseAndWaitUntilEmpty() starts.
-  // When ProcessBatch() dequeues the last batch and makes the queue empty, if
-  // 'empty_notification_' is non-null it calls 'empty_notification_->Notify()'.
-  Notification* empty_notification_ GUARDED_BY(mu_) = nullptr;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(Queue);
-};
-
-// A RAII-style object that points to a Queue and implements
-// the BatchScheduler API. To be handed out to clients who call AddQueue().
-template <typename TaskType>
-class QueueHandle : public BatchScheduler<TaskType> {
- public:
-  QueueHandle(std::shared_ptr<SharedBatchScheduler<TaskType>> scheduler,
-              Queue<TaskType>* queue);
-  ~QueueHandle() override;
-
-  Status Schedule(std::unique_ptr<TaskType>* task) override;
-  size_t NumEnqueuedTasks() const override;
-  size_t SchedulingCapacity() const override;
-
- private:
-  // The scheduler that owns 'queue_'.
-  std::shared_ptr<SharedBatchScheduler<TaskType>> scheduler_;
-
-  // The queue this handle wraps. Owned by 'scheduler_', which keeps it alive at
-  // least until this class's destructor closes it.
-  Queue<TaskType>* queue_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(QueueHandle);
-};
-
-}  // namespace internal
-
-template <typename TaskType>
-Status SharedBatchScheduler<TaskType>::Create(
-    const Options& options,
-    std::shared_ptr<SharedBatchScheduler<TaskType>>* scheduler) {
-  if (options.num_batch_threads < 1) {
-    return errors::InvalidArgument("num_batch_threads must be positive; was ",
-                                   options.num_batch_threads);
-  }
-  scheduler->reset(new SharedBatchScheduler<TaskType>(options));
-  return Status::OK();
-}
-
-template <typename TaskType>
-SharedBatchScheduler<TaskType>::~SharedBatchScheduler() {
-  // Wait until the batch threads finish clearing out and deleting the closed
-  // queues.
-  for (;;) {
-    {
-      mutex_lock l(mu_);
-      if (queues_.empty()) {
-        break;
-      }
-    }
-    const int64 kSleepTimeMicros = 100;
-    options_.env->SleepForMicroseconds(kSleepTimeMicros);
-  }
-  // Delete the batch threads before allowing state the threads may access (e.g.
-  // 'mu_') to be deleted.
-  batch_threads_.clear();
-}
-
-template <typename TaskType>
-Status SharedBatchScheduler<TaskType>::AddQueue(
-    const QueueOptions& options,
-    std::function<void(std::unique_ptr<Batch<TaskType>>)>
-        process_batch_callback,
-    std::unique_ptr<BatchScheduler<TaskType>>* queue) {
-  if (options.max_batch_size <= 0) {
-    return errors::InvalidArgument("max_batch_size must be positive; was ",
-                                   options.max_batch_size);
-  }
-  if (options.batch_timeout_micros < 0) {
-    return errors::InvalidArgument(
-        "batch_timeout_micros must be non-negative; was ",
-        options.batch_timeout_micros);
-  }
-  if (options.max_enqueued_batches < 0) {
-    return errors::InvalidArgument(
-        "max_enqueued_batches must be non-negative; was ",
-        options.max_enqueued_batches);
-  }
-
-  auto schedulable_batch_callback = [this] {
-    mutex_lock l(mu_);
-    schedulable_batch_cv_.notify_one();
-  };
-  auto internal_queue =
-      std::unique_ptr<internal::Queue<TaskType>>(new internal::Queue<TaskType>(
-          options, options_.env, process_batch_callback,
-          schedulable_batch_callback));
-  auto handle = std::unique_ptr<BatchScheduler<TaskType>>(
-      new internal::QueueHandle<TaskType>(this->shared_from_this(),
-                                          internal_queue.get()));
-  {
-    mutex_lock l(mu_);
-    queues_.push_back(std::move(internal_queue));
-    if (next_queue_to_schedule_ == queues_.end()) {
-      next_queue_to_schedule_ = queues_.begin();
-    }
-  }
-  *queue = std::move(handle);
-  return Status::OK();
-}
-
-template <typename TaskType>
-SharedBatchScheduler<TaskType>::SharedBatchScheduler(const Options& options)
-    : options_(options), next_queue_to_schedule_(queues_.end()) {
-  // Kick off the batch threads.
-  PeriodicFunction::Options periodic_fn_options;
-  periodic_fn_options.thread_name_prefix =
-      strings::StrCat(options.thread_pool_name, "_");
-  for (int i = 0; i < options.num_batch_threads; ++i) {
-    std::unique_ptr<PeriodicFunction> thread(new PeriodicFunction(
-        [this] { this->ThreadLogic(); },
-        0 /* function invocation interval time */, periodic_fn_options));
-    batch_threads_.push_back(std::move(thread));
-  }
-}
-
-template <typename TaskType>
-void SharedBatchScheduler<TaskType>::ThreadLogic() {
-  // A batch to process next (or nullptr if no work to do).
-  std::unique_ptr<Batch<TaskType>> batch_to_process;
-  // The queue with which 'batch_to_process' is associated.
-  internal::Queue<TaskType>* queue_for_batch = nullptr;
-  {
-    mutex_lock l(mu_);
-
-    const int num_queues = queues_.size();
-    for (int num_queues_tried = 0;
-         batch_to_process == nullptr && num_queues_tried < num_queues;
-         ++num_queues_tried) {
-      DCHECK(next_queue_to_schedule_ != queues_.end());
-
-      // If a closed queue responds to ScheduleBatch() with nullptr, the queue
-      // will never yield any further batches so we can drop it. To avoid a
-      // race, we take a snapshot of the queue's closedness state *before*
-      // calling ScheduleBatch().
-      const bool queue_closed = (*next_queue_to_schedule_)->closed();
-
-      // Ask '*next_queue_to_schedule_' if it wants us to process a batch.
-      batch_to_process = (*next_queue_to_schedule_)->ScheduleBatch();
-      if (batch_to_process != nullptr) {
-        queue_for_batch = next_queue_to_schedule_->get();
-      }
-
-      // Advance 'next_queue_to_schedule_'.
-      if (queue_closed && (*next_queue_to_schedule_)->IsEmpty() &&
-          batch_to_process == nullptr) {
-        // We've encountered a closed queue with no work to do. Drop it.
-        DCHECK_NE(queue_for_batch, next_queue_to_schedule_->get());
-        next_queue_to_schedule_ = queues_.erase(next_queue_to_schedule_);
-      } else {
-        ++next_queue_to_schedule_;
-      }
-      if (next_queue_to_schedule_ == queues_.end() && !queues_.empty()) {
-        // We've hit the end. Wrap to the first queue.
-        next_queue_to_schedule_ = queues_.begin();
-      }
-    }
-
-    if (batch_to_process == nullptr) {
-      // We couldn't find any work to do. Wait until a new batch becomes
-      // schedulable, or some time has elapsed, before checking again.
-      const int64 kTimeoutMillis = 1;  // The smallest accepted granule of time.
-      WaitForMilliseconds(&l, &schedulable_batch_cv_, kTimeoutMillis);
-      return;
-    }
-  }
-
-  queue_for_batch->ProcessBatch(std::move(batch_to_process));
-}
-
-namespace internal {
-
-template <typename TaskType>
-Queue<TaskType>::Queue(
-    const typename SharedBatchScheduler<TaskType>::QueueOptions& options,
-    Env* env, ProcessBatchCallback process_batch_callback,
-    SchedulableBatchCallback schedulable_batch_callback)
-    : options_(options),
-      env_(env),
-      process_batch_callback_(process_batch_callback),
-      schedulable_batch_callback_(schedulable_batch_callback) {
-  // Create an initial, open batch.
-  batches_.emplace_back(new Batch<TaskType>);
-}
-
-template <typename TaskType>
-Queue<TaskType>::~Queue() {
-  mutex_lock l(mu_);
-  DCHECK(IsEmptyInternal());
-
-  // Close the (empty) open batch, so its destructor doesn't block.
-  batches_.back()->Close();
-}
-
-template <typename TaskType>
-Status Queue<TaskType>::Schedule(std::unique_ptr<TaskType>* task) {
-  if ((*task)->size() > options_.max_batch_size) {
-    return errors::InvalidArgument("Task size ", (*task)->size(),
-                                   " is larger than maximum batch size ",
-                                   options_.max_batch_size);
-  }
-
-  bool notify_of_schedulable_batch = false;
-  {
-    mutex_lock l(mu_);
-
-    DCHECK(!closed_);
-
-    if (batches_.back()->size() + (*task)->size() > options_.max_batch_size) {
-      if (batches_.size() >= options_.max_enqueued_batches) {
-        return errors::Unavailable(
-            "The batch scheduling queue to which this task was submitted is "
-            "full");
-      }
-      StartNewBatch();
-    }
-    if (batches_.back()->empty()) {
-      open_batch_start_time_micros_ = env_->NowMicros();
-    }
-    batches_.back()->AddTask(std::move(*task));
-
-    if (!schedulable_batch_) {
-      if (batches_.size() > 1 || IsOpenBatchSchedulable()) {
-        schedulable_batch_ = true;
-        notify_of_schedulable_batch = true;
-      }
-    }
-  }
-
-  if (notify_of_schedulable_batch) {
-    schedulable_batch_callback_();
-  }
-
-  return Status::OK();
-}
-
-template <typename TaskType>
-size_t Queue<TaskType>::NumEnqueuedTasks() const {
-  mutex_lock l(mu_);
-  size_t num_enqueued_tasks = 0;
-  for (const auto& batch : batches_) {
-    num_enqueued_tasks += batch->num_tasks();
-  }
-  return num_enqueued_tasks;
-}
-
-template <typename TaskType>
-size_t Queue<TaskType>::SchedulingCapacity() const {
-  mutex_lock l(mu_);
-  const int num_new_batches_schedulable =
-      options_.max_enqueued_batches - batches_.size();
-  const int open_batch_capacity =
-      options_.max_batch_size - batches_.back()->size();
-  return (num_new_batches_schedulable * options_.max_batch_size) +
-         open_batch_capacity;
-}
-
-template <typename TaskType>
-std::unique_ptr<Batch<TaskType>> Queue<TaskType>::ScheduleBatch() {
-  // The batch to schedule, which we may populate below. (If left as nullptr,
-  // that means we are electing not to schedule a batch at this time.)
-  std::unique_ptr<Batch<TaskType>> batch_to_schedule;
-
-  {
-    mutex_lock l(mu_);
-
-    // Consider closing the open batch at this time, to schedule it.
-    if (batches_.size() == 1 && IsOpenBatchSchedulable()) {
-      StartNewBatch();
-    }
-
-    if (batches_.size() >= 2) {
-      // There is at least one closed batch that is ready to be scheduled.
-      ++num_batches_being_processed_;
-      batch_to_schedule = std::move(batches_.front());
-      batches_.pop_front();
-    } else {
-      schedulable_batch_ = false;
-    }
-  }
-
-  return batch_to_schedule;
-}
-
-template <typename TaskType>
-void Queue<TaskType>::ProcessBatch(std::unique_ptr<Batch<TaskType>> batch) {
-  process_batch_callback_(std::move(batch));
-
-  {
-    mutex_lock l(mu_);
-    --num_batches_being_processed_;
-    if (empty_notification_ != nullptr && IsEmptyInternal()) {
-      empty_notification_->Notify();
-    }
-  }
-}
-
-template <typename TaskType>
-bool Queue<TaskType>::IsEmpty() const {
-  mutex_lock l(mu_);
-  return IsEmptyInternal();
-}
-
-template <typename TaskType>
-void Queue<TaskType>::CloseAndWaitUntilEmpty() {
-  Notification empty;
-  {
-    mutex_lock l(mu_);
-    closed_ = true;
-    if (IsEmptyInternal()) {
-      empty.Notify();
-    } else {
-      // Arrange for ProcessBatch() to notify when the queue becomes empty.
-      empty_notification_ = &empty;
-    }
-  }
-  empty.WaitForNotification();
-}
-
-template <typename TaskType>
-bool Queue<TaskType>::IsEmptyInternal() const {
-  return num_batches_being_processed_ == 0 && batches_.size() == 1 &&
-         batches_.back()->empty();
-}
-
-template <typename TaskType>
-void Queue<TaskType>::StartNewBatch() {
-  batches_.back()->Close();
-  batches_.emplace_back(new Batch<TaskType>);
-}
-
-template <typename TaskType>
-bool Queue<TaskType>::IsOpenBatchSchedulable() const {
-  Batch<TaskType>* open_batch = batches_.back().get();
-  if (open_batch->empty()) {
-    return false;
-  }
-  return closed_ || open_batch->size() >= options_.max_batch_size ||
-         env_->NowMicros() >=
-             open_batch_start_time_micros_ + options_.batch_timeout_micros;
-}
-
-template <typename TaskType>
-QueueHandle<TaskType>::QueueHandle(
-    std::shared_ptr<SharedBatchScheduler<TaskType>> scheduler,
-    Queue<TaskType>* queue)
-    : scheduler_(scheduler), queue_(queue) {}
-
-template <typename TaskType>
-QueueHandle<TaskType>::~QueueHandle() {
-  queue_->CloseAndWaitUntilEmpty();
-}
-
-template <typename TaskType>
-Status QueueHandle<TaskType>::Schedule(std::unique_ptr<TaskType>* task) {
-  return queue_->Schedule(task);
-}
-
-template <typename TaskType>
-size_t QueueHandle<TaskType>::NumEnqueuedTasks() const {
-  return queue_->NumEnqueuedTasks();
-}
-
-template <typename TaskType>
-size_t QueueHandle<TaskType>::SchedulingCapacity() const {
-  return queue_->SchedulingCapacity();
-}
-
-}  // namespace internal
-
-}  // namespace serving
-}  // namespace tensorflow
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_SHARED_BATCH_SCHEDULER_H_
+#endif  // TENSORFLOW_CONTRIB_BATCHING_SHARED_BATCH_SCHEDULER_H_
diff --git a/tensorflow/contrib/batching/test_util/BUILD b/tensorflow/contrib/batching/test_util/BUILD
index d1ced0d8c367f44b520a9bba2db8a3e0969bab4c..6db627faad1df4a4b73082e74e7754829ff2b514 100644
--- a/tensorflow/contrib/batching/test_util/BUILD
+++ b/tensorflow/contrib/batching/test_util/BUILD
@@ -22,11 +22,9 @@ filegroup(
 cc_library(
     name = "fake_clock_env",
     testonly = 1,
-    srcs = ["fake_clock_env.cc"],
     hdrs = ["fake_clock_env.h"],
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/core:lib",
-        "//tensorflow/core:tensorflow",
+        "//tensorflow/core/kernels/batching_util:fake_clock_env",
     ],
 )
diff --git a/tensorflow/contrib/batching/test_util/fake_clock_env.h b/tensorflow/contrib/batching/test_util/fake_clock_env.h
index 35cafcb73c51feb4e9e15a61d1830c8ef6bc3e0f..40a39a5569854350c72a47102f3dac07b362ce8e 100644
--- a/tensorflow/contrib/batching/test_util/fake_clock_env.h
+++ b/tensorflow/contrib/batching/test_util/fake_clock_env.h
@@ -13,64 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_TEST_UTIL_FAKE_CLOCK_ENV_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_TEST_UTIL_FAKE_CLOCK_ENV_H_
+#ifndef TENSORFLOW_CONTRIB_BATCHING_TEST_UTIL_FAKE_CLOCK_ENV_H_
+#define TENSORFLOW_CONTRIB_BATCHING_TEST_UTIL_FAKE_CLOCK_ENV_H_
 
-#include <functional>
-#include <string>
-#include <vector>
+#include "tensorflow/core/kernels/batching_util/fake_clock_env.h"
 
-#include "tensorflow/core/lib/core/notification.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/thread_annotations.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace tensorflow {
-namespace serving {
-namespace test_util {
-
-// An Env implementation with a fake clock for NowMicros() and
-// SleepForMicroseconds(). The clock doesn't advance on its own; it advances via
-// an explicit Advance() method.
-// All other Env virtual methods pass through to a wrapped Env.
-class FakeClockEnv : public EnvWrapper {
- public:
-  explicit FakeClockEnv(Env* wrapped);
-  ~FakeClockEnv() override = default;
-
-  // Advance the clock by a certain number of microseconds.
-  void AdvanceByMicroseconds(int micros);
-
-  // Blocks until there is a sleeping thread that is scheduled to wake up at
-  // the given (absolute) time.
-  void BlockUntilSleepingThread(uint64 wake_time);
-
-  // Blocks until there are at least num_threads sleeping.
-  void BlockUntilThreadsAsleep(int num_threads);
-
-  // Methods that this class implements.
-  uint64 NowMicros() override;
-  void SleepForMicroseconds(int64 micros) override;
-
- private:
-  mutex mu_;
-
-  uint64 current_time_ GUARDED_BY(mu_) = 0;
-
-  struct SleepingThread {
-    uint64 wake_time;
-    Notification* wake_notification;
-  };
-  std::vector<SleepingThread> sleeping_threads_ GUARDED_BY(mu_);
-
-  TF_DISALLOW_COPY_AND_ASSIGN(FakeClockEnv);
-};
-
-}  // namespace test_util
-}  // namespace serving
-}  // namespace tensorflow
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_TEST_UTIL_FAKE_CLOCK_ENV_H_
+#endif  // TENSORFLOW_CONTRIB_BATCHING_TEST_UTIL_FAKE_CLOCK_ENV_H_
diff --git a/tensorflow/contrib/batching/util/BUILD b/tensorflow/contrib/batching/util/BUILD
index f33a08cb817e9f2832be953ef6ff1aba04c4c288..2a84a7712a8fa66e89db41ff4e7ebe4f620029ca 100644
--- a/tensorflow/contrib/batching/util/BUILD
+++ b/tensorflow/contrib/batching/util/BUILD
@@ -22,12 +22,11 @@ filegroup(
 
 cc_library(
     name = "periodic_function_dynamic",
-    srcs = ["periodic_function.cc"],
     hdrs = ["periodic_function.h"],
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/core:framework_headers_lib",
-        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/kernels/batching_util:periodic_function_dynamic",
+        "//third_party/eigen3",
     ],
 )
 
@@ -36,17 +35,6 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":periodic_function_dynamic",
-        "//tensorflow/core:lib",
-    ],
-)
-
-tf_cc_test(
-    name = "periodic_function_test",
-    srcs = ["periodic_function_test.cc"],
-    deps = [
-        ":periodic_function_dynamic",
-        "//tensorflow/contrib/batching/test_util:fake_clock_env",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
+        "//tensorflow/core/kernels/batching_util:periodic_function",
     ],
 )
diff --git a/tensorflow/contrib/batching/util/periodic_function.h b/tensorflow/contrib/batching/util/periodic_function.h
index 2c032d802fe5f23a267db28dc869a253f16afc34..aa2ed0a385125fa090a7a56b6339a87eb2d57b1f 100644
--- a/tensorflow/contrib/batching/util/periodic_function.h
+++ b/tensorflow/contrib/batching/util/periodic_function.h
@@ -12,121 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_BATCHING_UTIL_PERIODIC_FUNCTION_H_
+#define TENSORFLOW_CONTRIB_BATCHING_UTIL_PERIODIC_FUNCTION_H_
 
-// PeriodicFunction will periodically call the given function with a specified
-// period in a background thread.  After Start() returns, the thread is
-// guaranteed to have started. The destruction of the class causes the
-// background thread to be destroyed as well.  Start() should not be called more
-// than once.
-//
-// PeriodicFunction runs the function as soon as any previous run both is
-// complete and was started more than "interval_micros" earlier.  Thus, runs are
-// both serialized, and normally have a period of "interval_micros" if no run
-// exceeds the time.
-//
-// Note that, if the function takes longer than two interval_micross to finish,
-// then PeriodicFunction will "skip" at least one call to the function.  For
-// instance, if the period is 50ms and the function starts runs at time 0 for
-// 150ms, then the function will immediately start executing again at time 150,
-// but there will be no function runs corresponding to times 50 or 100.  This is
-// especially important to remember when using an environment with a simulated
-// clock: advancing simulated time atomically over N interval_micross will not
-// cause the function to be called N times.
-//
-// This object is thread-safe.
-//
-// Example:
-//
-//   class Foo {
-//    public:
-//     Foo() : periodic_function_([this]() { Bar(); },
-//                               1000 /* 1000us == 1ms*/) {
-//     }
-//
-//    private:
-//     void Bar() { ... }
-//
-//     PeriodicFunction periodic_function_;
-//   };
+#include "tensorflow/core/kernels/batching_util/periodic_function.h"
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_UTIL_PERIODIC_FUNCTION_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_UTIL_PERIODIC_FUNCTION_H_
-
-#include <functional>
-#include <memory>
-#include <string>
-
-#include "tensorflow/core/lib/core/notification.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/thread_annotations.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace tensorflow {
-namespace serving {
-
-namespace internal {
-class PeriodicFunctionTestAccess;
-}
-
-class PeriodicFunction {
- public:
-  // Provides the ability to customize several aspects of the PeriodicFunction.
-  // Passed to constructor of PeriodicFunction.
-  struct Options {
-    Options() {}
-
-    // Any standard thread options, such as stack size, should
-    // be passed via "thread_options".
-    ThreadOptions thread_options;
-
-    // Specifies the thread name prefix (see the description in class
-    // Thread).
-    string thread_name_prefix = "periodic_function";
-
-    // The environment to use. Does not take ownership, but must remain alive
-    // for as long as the PeriodicFunction exists.
-    Env* env = Env::Default();
-
-    // Specifies the length of sleep before the first invocation of the
-    // function.
-    // This can be used for adding a random jitter to avoid synchronous behavior
-    // across multiple periodic functions.
-    int64 startup_delay_micros = 0;
-  };
-
-  // Also starts the background thread which will be calling the function.
-  PeriodicFunction(const std::function<void()>& function, int64 interval_micros,
-                   const Options& options = Options());
-
-  ~PeriodicFunction();
-
- private:
-  friend class internal::PeriodicFunctionTestAccess;
-
-  // Notifies the background thread to stop.
-  void NotifyStop();
-
-  // (Blocking.) Loops forever calling "function_" every "interval_micros_".
-  void RunLoop(int64 start) LOCKS_EXCLUDED(mutex_);
-
-  const std::function<void()> function_;  // Actual client function
-  const int64 interval_micros_;    // Interval between calls.
-  const Options options_;
-
-  // Protects state below.
-  mutable mutex mutex_;
-  // Used to notify the thread to stop.
-  Notification stop_thread_;
-
-  // Thread for running "function_"
-  std::unique_ptr<Thread> thread_ GUARDED_BY(mutex_) = nullptr;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(PeriodicFunction);
-};
-
-}  // namespace serving
-}  // namespace tensorflow
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BATCHING_UTIL_PERIODIC_FUNCTION_H_
+#endif  // TENSORFLOW_CONTRIB_BATCHING_UTIL_PERIODIC_FUNCTION_H_
diff --git a/tensorflow/contrib/bayesflow/BUILD b/tensorflow/contrib/bayesflow/BUILD
index a262d4aecdbb69dfcd8b88bc0a09060500d6b1c9..74712aeb67c3f0a31def78f25a0298f9c02c9590 100644
--- a/tensorflow/contrib/bayesflow/BUILD
+++ b/tensorflow/contrib/bayesflow/BUILD
@@ -99,6 +99,25 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "layers_conv_variational_test",
+    size = "small",
+    srcs = ["python/kernel_tests/layers_conv_variational_test.py"],
+    additional_deps = [
+        ":bayesflow_py",
+        "//third_party/py/numpy",
+        "//tensorflow/contrib/distributions:distributions_py",
+        "//tensorflow/python/ops/distributions",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn_ops",
+    ],
+)
+
 cuda_py_test(
     name = "layers_dense_variational_test",
     size = "small",
@@ -118,6 +137,26 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "mcmc_diagnostics_test",
+    size = "small",
+    srcs = ["python/kernel_tests/mcmc_diagnostics_test.py"],
+    additional_deps = [
+        ":bayesflow_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:spectral_ops_test_util",
+        "//tensorflow/contrib/distributions:distributions_py",
+        "//tensorflow/python/ops/distributions",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:random_seed",
+    ],
+)
+
 cuda_py_test(
     name = "monte_carlo_test",
     size = "small",
@@ -156,6 +195,7 @@ cuda_py_test(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
+    tags = ["no_mac"],  # b/73192243
 )
 
 cuda_py_test(
@@ -198,6 +238,46 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:random_seed",
     ],
+    tags = ["notsan"],
+)
+
+cuda_py_test(
+    name = "variable_utils_test",
+    size = "small",
+    srcs = ["python/kernel_tests/variable_utils_test.py"],
+    additional_deps = [
+        ":bayesflow_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "variational_sgd_optimizer_test",
+    size = "small",
+    srcs = ["python/kernel_tests/variational_sgd_optimizer_test.py"],
+    additional_deps = [
+        ":bayesflow_py",
+        "//third_party/py/numpy",
+        "//tensorflow/contrib/distributions:distributions_py",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/python/ops/distributions",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:random_seed",
+    ],
+    tags = ["notsan"],
 )
 
 filegroup(
diff --git a/tensorflow/contrib/bayesflow/__init__.py b/tensorflow/contrib/bayesflow/__init__.py
index 95b9452b1ada60c44672f37800ced2133d2bd8b2..528c4fbacd06c7b0defa0e32bd24a98b2bc07b64 100644
--- a/tensorflow/contrib/bayesflow/__init__.py
+++ b/tensorflow/contrib/bayesflow/__init__.py
@@ -26,9 +26,11 @@ from tensorflow.contrib.bayesflow.python.ops import custom_grad
 from tensorflow.contrib.bayesflow.python.ops import halton_sequence
 from tensorflow.contrib.bayesflow.python.ops import hmc
 from tensorflow.contrib.bayesflow.python.ops import layers
+from tensorflow.contrib.bayesflow.python.ops import mcmc_diagnostics
 from tensorflow.contrib.bayesflow.python.ops import metropolis_hastings
 from tensorflow.contrib.bayesflow.python.ops import monte_carlo
 from tensorflow.contrib.bayesflow.python.ops import optimizers
+from tensorflow.contrib.bayesflow.python.ops import variable_utils
 # pylint: enable=unused-import,line-too-long
 
 from tensorflow.python.util.all_util import remove_undocumented
@@ -42,10 +44,12 @@ _allowed_symbols = [
     'hmc',
     'layers',
     'metropolis_hastings',
+    'mcmc_diagnostics',
     'monte_carlo',
     'optimizers',
     'special_math',
     'stochastic_variables',
+    'variable_utils',
     'variational_inference',
 ]
 
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
index b1f108e5f01e4945ee83d8262f1d99877f0fe9f0..5bd834e56245ab4d874544cfd014fe59ae521ea8 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
@@ -12,40 +12,53 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for Hamiltonian Monte Carlo.
-"""
+"""Tests for Hamiltonian Monte Carlo."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+
 import numpy as np
-from scipy import special
 from scipy import stats
 
 from tensorflow.contrib.bayesflow.python.ops import hmc
+from tensorflow.contrib.bayesflow.python.ops.hmc_impl import _compute_energy_change
+from tensorflow.contrib.bayesflow.python.ops.hmc_impl import _leapfrog_integrator
 
+from tensorflow.contrib.distributions.python.ops import independent as independent_lib
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_linalg_ops
+from tensorflow.python.ops import gradients_impl as gradients_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import gamma as gamma_lib
+from tensorflow.python.ops.distributions import normal as normal_lib
 from tensorflow.python.platform import test
-from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.platform import tf_logging as logging_ops
+
+
+def _reduce_variance(x, axis=None, keepdims=False):
+  sample_mean = math_ops.reduce_mean(x, axis, keepdims=True)
+  return math_ops.reduce_mean(
+      math_ops.squared_difference(x, sample_mean), axis, keepdims)
 
 
-# TODO(b/66964210): Test float16.
 class HMCTest(test.TestCase):
 
   def setUp(self):
     self._shape_param = 5.
     self._rate_param = 10.
-    self._expected_x = (special.digamma(self._shape_param)
-                        - np.log(self._rate_param))
-    self._expected_exp_x = self._shape_param / self._rate_param
 
     random_seed.set_random_seed(10003)
     np.random.seed(10003)
 
+  def assertAllFinite(self, x):
+    self.assertAllEqual(np.ones_like(x).astype(bool), np.isfinite(x))
+
   def _log_gamma_log_prob(self, x, event_dims=()):
     """Computes log-pdf of a log-gamma random variable.
 
@@ -60,63 +73,46 @@ class HMCTest(test.TestCase):
                                self._rate_param * math_ops.exp(x),
                                event_dims)
 
-  def _log_gamma_log_prob_grad(self, x, event_dims=()):
-    """Computes log-pdf and gradient of a log-gamma random variable.
-
-    Args:
-      x: Value of the random variable.
-      event_dims: Dimensions not to treat as independent. Default is (),
-        i.e., all dimensions are independent.
-
-    Returns:
-      log_prob: The log-pdf up to a normalizing constant.
-      grad: The gradient of the log-pdf with respect to x.
-    """
-    return (math_ops.reduce_sum(self._shape_param * x -
-                                self._rate_param * math_ops.exp(x),
-                                event_dims),
-            self._shape_param - self._rate_param * math_ops.exp(x))
-
-  def _n_event_dims(self, x_shape, event_dims):
-    return np.prod([int(x_shape[i]) for i in event_dims])
-
-  def _integrator_conserves_energy(self, x, event_dims, sess,
+  def _integrator_conserves_energy(self, x, independent_chain_ndims, sess,
                                    feed_dict=None):
-    def potential_and_grad(x):
-      log_prob, grad = self._log_gamma_log_prob_grad(x, event_dims)
-      return -log_prob, -grad
-
-    step_size = array_ops.placeholder(np.float32, [], name='step_size')
-    hmc_lf_steps = array_ops.placeholder(np.int32, [], name='hmc_lf_steps')
+    step_size = array_ops.placeholder(np.float32, [], name="step_size")
+    hmc_lf_steps = array_ops.placeholder(np.int32, [], name="hmc_lf_steps")
 
     if feed_dict is None:
       feed_dict = {}
     feed_dict[hmc_lf_steps] = 1000
 
-    m = random_ops.random_normal(array_ops.shape(x))
-    potential_0, grad_0 = potential_and_grad(x)
-    old_energy = potential_0 + 0.5 * math_ops.reduce_sum(m * m,
-                                                         event_dims)
-
-    _, new_m, potential_1, _ = (
-        hmc.leapfrog_integrator(step_size, hmc_lf_steps, x,
-                                m, potential_and_grad, grad_0))
+    event_dims = math_ops.range(independent_chain_ndims,
+                                array_ops.rank(x))
 
-    new_energy = potential_1 + 0.5 * math_ops.reduce_sum(new_m * new_m,
+    m = random_ops.random_normal(array_ops.shape(x))
+    log_prob_0 = self._log_gamma_log_prob(x, event_dims)
+    grad_0 = gradients_ops.gradients(log_prob_0, x)
+    old_energy = -log_prob_0 + 0.5 * math_ops.reduce_sum(m**2., event_dims)
+
+    new_m, _, log_prob_1, _ = _leapfrog_integrator(
+        current_momentums=[m],
+        target_log_prob_fn=lambda x: self._log_gamma_log_prob(x, event_dims),
+        current_state_parts=[x],
+        step_sizes=[step_size],
+        num_leapfrog_steps=hmc_lf_steps,
+        current_target_log_prob=log_prob_0,
+        current_grads_target_log_prob=grad_0)
+    new_m = new_m[0]
+
+    new_energy = -log_prob_1 + 0.5 * math_ops.reduce_sum(new_m * new_m,
                                                          event_dims)
 
     x_shape = sess.run(x, feed_dict).shape
-    n_event_dims = self._n_event_dims(x_shape, event_dims)
-    feed_dict[step_size] = 0.1 / n_event_dims
-    old_energy_val, new_energy_val = sess.run([old_energy, new_energy],
-                                              feed_dict)
-    logging.vlog(1, 'average energy change: {}'.format(
-        abs(old_energy_val - new_energy_val).mean()))
-
-    self.assertAllEqual(np.ones_like(new_energy_val, dtype=np.bool),
-                        abs(old_energy_val - new_energy_val) < 1.)
-
-  def _integrator_conserves_energy_wrapper(self, event_dims):
+    event_size = np.prod(x_shape[independent_chain_ndims:])
+    feed_dict[step_size] = 0.1 / event_size
+    old_energy_, new_energy_ = sess.run([old_energy, new_energy],
+                                        feed_dict)
+    logging_ops.vlog(1, "average energy relative change: {}".format(
+        (1. - new_energy_ / old_energy_).mean()))
+    self.assertAllClose(old_energy_, new_energy_, atol=0., rtol=0.02)
+
+  def _integrator_conserves_energy_wrapper(self, independent_chain_ndims):
     """Tests the long-term energy conservation of the leapfrog integrator.
 
     The leapfrog integrator is symplectic, so for sufficiently small step
@@ -124,135 +120,310 @@ class HMCTest(test.TestCase):
     the energy of the system blowing up or collapsing.
 
     Args:
-      event_dims: A tuple of dimensions that should not be treated as
-        independent. This allows for multiple chains to be run independently
-        in parallel. Default is (), i.e., all dimensions are independent.
+      independent_chain_ndims: Python `int` scalar representing the number of
+        dims associated with independent chains.
     """
-    with self.test_session() as sess:
-      x_ph = array_ops.placeholder(np.float32, name='x_ph')
-
-      feed_dict = {x_ph: np.zeros([50, 10, 2])}
-      self._integrator_conserves_energy(x_ph, event_dims, sess, feed_dict)
+    with self.test_session(graph=ops.Graph()) as sess:
+      x_ph = array_ops.placeholder(np.float32, name="x_ph")
+      feed_dict = {x_ph: np.random.rand(50, 10, 2)}
+      self._integrator_conserves_energy(x_ph, independent_chain_ndims,
+                                        sess, feed_dict)
 
   def testIntegratorEnergyConservationNullShape(self):
-    self._integrator_conserves_energy_wrapper([])
+    self._integrator_conserves_energy_wrapper(0)
 
   def testIntegratorEnergyConservation1(self):
-    self._integrator_conserves_energy_wrapper([1])
+    self._integrator_conserves_energy_wrapper(1)
 
   def testIntegratorEnergyConservation2(self):
-    self._integrator_conserves_energy_wrapper([2])
-
-  def testIntegratorEnergyConservation12(self):
-    self._integrator_conserves_energy_wrapper([1, 2])
-
-  def testIntegratorEnergyConservation012(self):
-    self._integrator_conserves_energy_wrapper([0, 1, 2])
-
-  def _chain_gets_correct_expectations(self, x, event_dims, sess,
-                                       feed_dict=None):
+    self._integrator_conserves_energy_wrapper(2)
+
+  def testIntegratorEnergyConservation3(self):
+    self._integrator_conserves_energy_wrapper(3)
+
+  def testSampleChainSeedReproducibleWorksCorrectly(self):
+    with self.test_session(graph=ops.Graph()) as sess:
+      num_results = 10
+      independent_chain_ndims = 1
+
+      def log_gamma_log_prob(x):
+        event_dims = math_ops.range(independent_chain_ndims,
+                                    array_ops.rank(x))
+        return self._log_gamma_log_prob(x, event_dims)
+
+      kwargs = dict(
+          target_log_prob_fn=log_gamma_log_prob,
+          current_state=np.random.rand(4, 3, 2),
+          step_size=0.1,
+          num_leapfrog_steps=2,
+          num_burnin_steps=150,
+          seed=52,
+      )
+
+      samples0, kernel_results0 = hmc.sample_chain(
+          **dict(list(kwargs.items()) + list(dict(
+              num_results=2 * num_results,
+              num_steps_between_results=0).items())))
+
+      samples1, kernel_results1 = hmc.sample_chain(
+          **dict(list(kwargs.items()) + list(dict(
+              num_results=num_results,
+              num_steps_between_results=1).items())))
+
+      [
+          samples0_,
+          samples1_,
+          target_log_prob0_,
+          target_log_prob1_,
+      ] = sess.run([
+          samples0,
+          samples1,
+          kernel_results0.current_target_log_prob,
+          kernel_results1.current_target_log_prob,
+      ])
+      self.assertAllClose(samples0_[::2], samples1_,
+                          atol=1e-5, rtol=1e-5)
+      self.assertAllClose(target_log_prob0_[::2], target_log_prob1_,
+                          atol=1e-5, rtol=1e-5)
+
+  def _chain_gets_correct_expectations(self, x, independent_chain_ndims,
+                                       sess, feed_dict=None):
+    counter = collections.Counter()
     def log_gamma_log_prob(x):
+      counter["target_calls"] += 1
+      event_dims = math_ops.range(independent_chain_ndims,
+                                  array_ops.rank(x))
       return self._log_gamma_log_prob(x, event_dims)
 
-    step_size = array_ops.placeholder(np.float32, [], name='step_size')
-    hmc_lf_steps = array_ops.placeholder(np.int32, [], name='hmc_lf_steps')
-    hmc_n_steps = array_ops.placeholder(np.int32, [], name='hmc_n_steps')
+    num_results = array_ops.placeholder(
+        np.int32, [], name="num_results")
+    step_size = array_ops.placeholder(
+        np.float32, [], name="step_size")
+    num_leapfrog_steps = array_ops.placeholder(
+        np.int32, [], name="num_leapfrog_steps")
 
     if feed_dict is None:
       feed_dict = {}
-    feed_dict.update({step_size: 0.1,
-                      hmc_lf_steps: 2,
-                      hmc_n_steps: 300})
-
-    sample_chain, acceptance_prob_chain = hmc.chain([hmc_n_steps],
-                                                    step_size,
-                                                    hmc_lf_steps,
-                                                    x, log_gamma_log_prob,
-                                                    event_dims)
-
-    acceptance_probs, samples = sess.run([acceptance_prob_chain, sample_chain],
-                                         feed_dict)
-    samples = samples[feed_dict[hmc_n_steps] // 2:]
-    expected_x_est = samples.mean()
-    expected_exp_x_est = np.exp(samples).mean()
-
-    logging.vlog(1, 'True      E[x, exp(x)]: {}\t{}'.format(
-        self._expected_x, self._expected_exp_x))
-    logging.vlog(1, 'Estimated E[x, exp(x)]: {}\t{}'.format(
-        expected_x_est, expected_exp_x_est))
-    self.assertNear(expected_x_est, self._expected_x, 2e-2)
-    self.assertNear(expected_exp_x_est, self._expected_exp_x, 2e-2)
-    self.assertTrue((acceptance_probs > 0.5).all())
-    self.assertTrue((acceptance_probs <= 1.0).all())
-
-  def _chain_gets_correct_expectations_wrapper(self, event_dims):
-    with self.test_session() as sess:
-      x_ph = array_ops.placeholder(np.float32, name='x_ph')
-
-      feed_dict = {x_ph: np.zeros([50, 10, 2])}
-      self._chain_gets_correct_expectations(x_ph, event_dims, sess,
-                                            feed_dict)
+    feed_dict.update({num_results: 150,
+                      step_size: 0.05,
+                      num_leapfrog_steps: 2})
+
+    samples, kernel_results = hmc.sample_chain(
+        num_results=num_results,
+        target_log_prob_fn=log_gamma_log_prob,
+        current_state=x,
+        step_size=step_size,
+        num_leapfrog_steps=num_leapfrog_steps,
+        num_burnin_steps=150,
+        seed=42)
+
+    self.assertAllEqual(dict(target_calls=2), counter)
+
+    expected_x = (math_ops.digamma(self._shape_param)
+                  - np.log(self._rate_param))
+
+    expected_exp_x = self._shape_param / self._rate_param
+
+    acceptance_probs_, samples_, expected_x_ = sess.run(
+        [kernel_results.acceptance_probs, samples, expected_x],
+        feed_dict)
+
+    actual_x = samples_.mean()
+    actual_exp_x = np.exp(samples_).mean()
+
+    logging_ops.vlog(1, "True      E[x, exp(x)]: {}\t{}".format(
+        expected_x_, expected_exp_x))
+    logging_ops.vlog(1, "Estimated E[x, exp(x)]: {}\t{}".format(
+        actual_x, actual_exp_x))
+    self.assertNear(actual_x, expected_x_, 2e-2)
+    self.assertNear(actual_exp_x, expected_exp_x, 2e-2)
+    self.assertAllEqual(np.ones_like(acceptance_probs_, np.bool),
+                        acceptance_probs_ > 0.5)
+    self.assertAllEqual(np.ones_like(acceptance_probs_, np.bool),
+                        acceptance_probs_ <= 1.)
+
+  def _chain_gets_correct_expectations_wrapper(self, independent_chain_ndims):
+    with self.test_session(graph=ops.Graph()) as sess:
+      x_ph = array_ops.placeholder(np.float32, name="x_ph")
+      feed_dict = {x_ph: np.random.rand(50, 10, 2)}
+      self._chain_gets_correct_expectations(x_ph, independent_chain_ndims,
+                                            sess, feed_dict)
 
   def testHMCChainExpectationsNullShape(self):
-    self._chain_gets_correct_expectations_wrapper([])
+    self._chain_gets_correct_expectations_wrapper(0)
 
   def testHMCChainExpectations1(self):
-    self._chain_gets_correct_expectations_wrapper([1])
+    self._chain_gets_correct_expectations_wrapper(1)
 
   def testHMCChainExpectations2(self):
-    self._chain_gets_correct_expectations_wrapper([2])
-
-  def testHMCChainExpectations12(self):
-    self._chain_gets_correct_expectations_wrapper([1, 2])
-
-  def _kernel_leaves_target_invariant(self, initial_draws, event_dims,
+    self._chain_gets_correct_expectations_wrapper(2)
+
+  def testKernelResultsUsingTruncatedDistribution(self):
+    def log_prob(x):
+      return array_ops.where(
+          x >= 0.,
+          -x - x**2,  # Non-constant gradient.
+          array_ops.fill(x.shape, math_ops.cast(-np.inf, x.dtype)))
+    # This log_prob has the property that it is likely to attract
+    # the HMC flow toward, and below, zero...but for x <=0,
+    # log_prob(x) = -inf, which should result in rejection, as well
+    # as a non-finite log_prob.  Thus, this distribution gives us an opportunity
+    # to test out the kernel results ability to correctly capture rejections due
+    # to finite AND non-finite reasons.
+    # Why use a non-constant gradient?  This ensures the leapfrog integrator
+    # will not be exact.
+
+    num_results = 1000
+    # Large step size, will give rejections due to integration error in addition
+    # to rejection due to going into a region of log_prob = -inf.
+    step_size = 0.1
+    num_leapfrog_steps = 5
+    num_chains = 2
+
+    with self.test_session(graph=ops.Graph()) as sess:
+
+      # Start multiple independent chains.
+      initial_state = ops.convert_to_tensor([0.1] * num_chains)
+
+      states, kernel_results = hmc.sample_chain(
+          num_results=num_results,
+          target_log_prob_fn=log_prob,
+          current_state=initial_state,
+          step_size=step_size,
+          num_leapfrog_steps=num_leapfrog_steps,
+          seed=42)
+
+      states_, kernel_results_ = sess.run([states, kernel_results])
+      pstates_ = kernel_results_.proposed_state
+
+      neg_inf_mask = np.isneginf(kernel_results_.proposed_target_log_prob)
+
+      # First:  Test that the mathematical properties of the above log prob
+      # function in conjunction with HMC show up as expected in kernel_results_.
+
+      # We better have log_prob = -inf some of the time.
+      self.assertLess(0, neg_inf_mask.sum())
+      # We better have some rejections due to something other than -inf.
+      self.assertLess(neg_inf_mask.sum(), (~kernel_results_.is_accepted).sum())
+      # We better have been accepted a decent amount, even near the end of the
+      # chain, or else this HMC run just got stuck at some point.
+      self.assertLess(
+          0.1, kernel_results_.is_accepted[int(0.9 * num_results):].mean())
+      # We better not have any NaNs in proposed state or log_prob.
+      # We may have some NaN in grads, which involve multiplication/addition due
+      # to gradient rules.  This is the known "NaN grad issue with tf.where."
+      self.assertAllEqual(np.zeros_like(states_),
+                          np.isnan(kernel_results_.proposed_target_log_prob))
+      self.assertAllEqual(np.zeros_like(states_),
+                          np.isnan(states_))
+      # We better not have any +inf in states, grads, or log_prob.
+      self.assertAllEqual(np.zeros_like(states_),
+                          np.isposinf(kernel_results_.proposed_target_log_prob))
+      self.assertAllEqual(
+          np.zeros_like(states_),
+          np.isposinf(kernel_results_.proposed_grads_target_log_prob[0]))
+      self.assertAllEqual(np.zeros_like(states_),
+                          np.isposinf(states_))
+
+      # Second:  Test that kernel_results is congruent with itself and
+      # acceptance/rejection of states.
+
+      # Proposed state is negative iff proposed target log prob is -inf.
+      np.testing.assert_array_less(pstates_[neg_inf_mask], 0.)
+      np.testing.assert_array_less(0., pstates_[~neg_inf_mask])
+
+      # Acceptance probs are zero whenever proposed state is negative.
+      self.assertAllEqual(
+          np.zeros_like(pstates_[neg_inf_mask]),
+          kernel_results_.acceptance_probs[neg_inf_mask])
+
+      # The move is accepted ==> state = proposed state.
+      self.assertAllEqual(
+          states_[kernel_results_.is_accepted],
+          pstates_[kernel_results_.is_accepted],
+      )
+      # The move was rejected <==> state[t] == state[t - 1].
+      for t in range(1, num_results):
+        for i in range(num_chains):
+          if kernel_results_.is_accepted[t, i]:
+            self.assertNotEqual(states_[t, i], states_[t - 1, i])
+          else:
+            self.assertEqual(states_[t, i], states_[t - 1, i])
+
+  def _kernel_leaves_target_invariant(self, initial_draws,
+                                      independent_chain_ndims,
                                       sess, feed_dict=None):
     def log_gamma_log_prob(x):
+      event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x))
       return self._log_gamma_log_prob(x, event_dims)
 
     def fake_log_prob(x):
       """Cooled version of the target distribution."""
       return 1.1 * log_gamma_log_prob(x)
 
-    step_size = array_ops.placeholder(np.float32, [], name='step_size')
+    step_size = array_ops.placeholder(np.float32, [], name="step_size")
 
     if feed_dict is None:
       feed_dict = {}
 
     feed_dict[step_size] = 0.4
 
-    sample, acceptance_probs, _, _ = hmc.kernel(step_size, 5, initial_draws,
-                                                log_gamma_log_prob, event_dims)
-    bad_sample, bad_acceptance_probs, _, _ = hmc.kernel(
-        step_size, 5, initial_draws, fake_log_prob, event_dims)
-    (acceptance_probs_val, bad_acceptance_probs_val, initial_draws_val,
-     updated_draws_val, fake_draws_val) = sess.run([acceptance_probs,
-                                                    bad_acceptance_probs,
-                                                    initial_draws, sample,
-                                                    bad_sample], feed_dict)
+    sample, kernel_results = hmc.kernel(
+        target_log_prob_fn=log_gamma_log_prob,
+        current_state=initial_draws,
+        step_size=step_size,
+        num_leapfrog_steps=5,
+        seed=43)
+
+    bad_sample, bad_kernel_results = hmc.kernel(
+        target_log_prob_fn=fake_log_prob,
+        current_state=initial_draws,
+        step_size=step_size,
+        num_leapfrog_steps=5,
+        seed=44)
+
+    [
+        acceptance_probs_,
+        bad_acceptance_probs_,
+        initial_draws_,
+        updated_draws_,
+        fake_draws_,
+    ] = sess.run([
+        kernel_results.acceptance_probs,
+        bad_kernel_results.acceptance_probs,
+        initial_draws,
+        sample,
+        bad_sample,
+    ], feed_dict)
+
     # Confirm step size is small enough that we usually accept.
-    self.assertGreater(acceptance_probs_val.mean(), 0.5)
-    self.assertGreater(bad_acceptance_probs_val.mean(), 0.5)
+    self.assertGreater(acceptance_probs_.mean(), 0.5)
+    self.assertGreater(bad_acceptance_probs_.mean(), 0.5)
+
     # Confirm step size is large enough that we sometimes reject.
-    self.assertLess(acceptance_probs_val.mean(), 0.99)
-    self.assertLess(bad_acceptance_probs_val.mean(), 0.99)
-    _, ks_p_value_true = stats.ks_2samp(initial_draws_val.flatten(),
-                                        updated_draws_val.flatten())
-    _, ks_p_value_fake = stats.ks_2samp(initial_draws_val.flatten(),
-                                        fake_draws_val.flatten())
-    logging.vlog(1, 'acceptance rate for true target: {}'.format(
-        acceptance_probs_val.mean()))
-    logging.vlog(1, 'acceptance rate for fake target: {}'.format(
-        bad_acceptance_probs_val.mean()))
-    logging.vlog(1, 'K-S p-value for true target: {}'.format(ks_p_value_true))
-    logging.vlog(1, 'K-S p-value for fake target: {}'.format(ks_p_value_fake))
+    self.assertLess(acceptance_probs_.mean(), 0.99)
+    self.assertLess(bad_acceptance_probs_.mean(), 0.99)
+
+    _, ks_p_value_true = stats.ks_2samp(initial_draws_.flatten(),
+                                        updated_draws_.flatten())
+    _, ks_p_value_fake = stats.ks_2samp(initial_draws_.flatten(),
+                                        fake_draws_.flatten())
+
+    logging_ops.vlog(1, "acceptance rate for true target: {}".format(
+        acceptance_probs_.mean()))
+    logging_ops.vlog(1, "acceptance rate for fake target: {}".format(
+        bad_acceptance_probs_.mean()))
+    logging_ops.vlog(1, "K-S p-value for true target: {}".format(
+        ks_p_value_true))
+    logging_ops.vlog(1, "K-S p-value for fake target: {}".format(
+        ks_p_value_fake))
     # Make sure that the MCMC update hasn't changed the empirical CDF much.
     self.assertGreater(ks_p_value_true, 1e-3)
     # Confirm that targeting the wrong distribution does
     # significantly change the empirical CDF.
     self.assertLess(ks_p_value_fake, 1e-6)
 
-  def _kernel_leaves_target_invariant_wrapper(self, event_dims):
+  def _kernel_leaves_target_invariant_wrapper(self, independent_chain_ndims):
     """Tests that the kernel leaves the target distribution invariant.
 
     Draws some independent samples from the target distribution,
@@ -264,86 +435,429 @@ class HMCTest(test.TestCase):
     does change the target distribution. (And that we can detect that.)
 
     Args:
-      event_dims: A tuple of dimensions that should not be treated as
-        independent. This allows for multiple chains to be run independently
-        in parallel. Default is (), i.e., all dimensions are independent.
+      independent_chain_ndims: Python `int` scalar representing the number of
+        dims associated with independent chains.
     """
-    with self.test_session() as sess:
+    with self.test_session(graph=ops.Graph()) as sess:
       initial_draws = np.log(np.random.gamma(self._shape_param,
                                              size=[50000, 2, 2]))
       initial_draws -= np.log(self._rate_param)
-      x_ph = array_ops.placeholder(np.float32, name='x_ph')
+      x_ph = array_ops.placeholder(np.float32, name="x_ph")
 
       feed_dict = {x_ph: initial_draws}
 
-      self._kernel_leaves_target_invariant(x_ph, event_dims, sess,
-                                           feed_dict)
-
-  def testKernelLeavesTargetInvariantNullShape(self):
-    self._kernel_leaves_target_invariant_wrapper([])
+      self._kernel_leaves_target_invariant(x_ph, independent_chain_ndims,
+                                           sess, feed_dict)
 
   def testKernelLeavesTargetInvariant1(self):
-    self._kernel_leaves_target_invariant_wrapper([1])
+    self._kernel_leaves_target_invariant_wrapper(1)
 
   def testKernelLeavesTargetInvariant2(self):
-    self._kernel_leaves_target_invariant_wrapper([2])
+    self._kernel_leaves_target_invariant_wrapper(2)
 
-  def testKernelLeavesTargetInvariant12(self):
-    self._kernel_leaves_target_invariant_wrapper([1, 2])
+  def testKernelLeavesTargetInvariant3(self):
+    self._kernel_leaves_target_invariant_wrapper(3)
+
+  def _ais_gets_correct_log_normalizer(self, init, independent_chain_ndims,
+                                       sess, feed_dict=None):
+    counter = collections.Counter()
 
-  def _ais_gets_correct_log_normalizer(self, init, event_dims, sess,
-                                       feed_dict=None):
     def proposal_log_prob(x):
-      return math_ops.reduce_sum(-0.5 * x * x - 0.5 * np.log(2*np.pi),
-                                 event_dims)
+      counter["proposal_calls"] += 1
+      event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x))
+      return -0.5 * math_ops.reduce_sum(x**2. + np.log(2 * np.pi),
+                                        axis=event_dims)
 
     def target_log_prob(x):
+      counter["target_calls"] += 1
+      event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x))
       return self._log_gamma_log_prob(x, event_dims)
 
     if feed_dict is None:
       feed_dict = {}
 
-    w, _, _ = hmc.ais_chain(200, 0.5, 2, init, target_log_prob,
-                            proposal_log_prob, event_dims)
-
-    w_val = sess.run(w, feed_dict)
-    init_shape = sess.run(init, feed_dict).shape
-    normalizer_multiplier = np.prod([init_shape[i] for i in event_dims])
-
-    true_normalizer = -self._shape_param * np.log(self._rate_param)
-    true_normalizer += special.gammaln(self._shape_param)
-    true_normalizer *= normalizer_multiplier
-
-    n_weights = np.prod(w_val.shape)
-    normalized_w = np.exp(w_val - true_normalizer)
-    standard_error = np.std(normalized_w) / np.sqrt(n_weights)
-    logging.vlog(1, 'True normalizer {}, estimated {}, n_weights {}'.format(
-        true_normalizer, np.log(normalized_w.mean()) + true_normalizer,
-        n_weights))
-    self.assertNear(normalized_w.mean(), 1.0, 4.0 * standard_error)
-
-  def _ais_gets_correct_log_normalizer_wrapper(self, event_dims):
+    num_steps = 200
+
+    _, ais_weights, _ = hmc.sample_annealed_importance_chain(
+        proposal_log_prob_fn=proposal_log_prob,
+        num_steps=num_steps,
+        target_log_prob_fn=target_log_prob,
+        step_size=0.5,
+        current_state=init,
+        num_leapfrog_steps=2,
+        seed=45)
+
+    # We have three calls because the calculation of `ais_weights` entails
+    # another call to the `convex_combined_log_prob_fn`. We could refactor
+    # things to avoid this, if needed (eg, b/72994218).
+    self.assertAllEqual(dict(target_calls=3, proposal_calls=3), counter)
+
+    event_shape = array_ops.shape(init)[independent_chain_ndims:]
+    event_size = math_ops.reduce_prod(event_shape)
+
+    log_true_normalizer = (
+        -self._shape_param * math_ops.log(self._rate_param)
+        + math_ops.lgamma(self._shape_param))
+    log_true_normalizer *= math_ops.cast(event_size, log_true_normalizer.dtype)
+
+    log_estimated_normalizer = (math_ops.reduce_logsumexp(ais_weights)
+                                - np.log(num_steps))
+
+    ratio_estimate_true = math_ops.exp(ais_weights - log_true_normalizer)
+    ais_weights_size = array_ops.size(ais_weights)
+    standard_error = math_ops.sqrt(
+        _reduce_variance(ratio_estimate_true)
+        / math_ops.cast(ais_weights_size, ratio_estimate_true.dtype))
+
+    [
+        ratio_estimate_true_,
+        log_true_normalizer_,
+        log_estimated_normalizer_,
+        standard_error_,
+        ais_weights_size_,
+        event_size_,
+    ] = sess.run([
+        ratio_estimate_true,
+        log_true_normalizer,
+        log_estimated_normalizer,
+        standard_error,
+        ais_weights_size,
+        event_size,
+    ], feed_dict)
+
+    logging_ops.vlog(1, "        log_true_normalizer: {}\n"
+                        "   log_estimated_normalizer: {}\n"
+                        "           ais_weights_size: {}\n"
+                        "                 event_size: {}\n".format(
+                            log_true_normalizer_,
+                            log_estimated_normalizer_,
+                            ais_weights_size_,
+                            event_size_))
+    self.assertNear(ratio_estimate_true_.mean(), 1., 4. * standard_error_)
+
+  def _ais_gets_correct_log_normalizer_wrapper(self, independent_chain_ndims):
     """Tests that AIS yields reasonable estimates of normalizers."""
-    with self.test_session() as sess:
-      x_ph = array_ops.placeholder(np.float32, name='x_ph')
-
+    with self.test_session(graph=ops.Graph()) as sess:
+      x_ph = array_ops.placeholder(np.float32, name="x_ph")
       initial_draws = np.random.normal(size=[30, 2, 1])
-      feed_dict = {x_ph: initial_draws}
-
-      self._ais_gets_correct_log_normalizer(x_ph, event_dims, sess,
-                                            feed_dict)
-
-  def testAISNullShape(self):
-    self._ais_gets_correct_log_normalizer_wrapper([])
+      self._ais_gets_correct_log_normalizer(
+          x_ph,
+          independent_chain_ndims,
+          sess,
+          feed_dict={x_ph: initial_draws})
 
   def testAIS1(self):
-    self._ais_gets_correct_log_normalizer_wrapper([1])
+    self._ais_gets_correct_log_normalizer_wrapper(1)
 
   def testAIS2(self):
-    self._ais_gets_correct_log_normalizer_wrapper([2])
-
-  def testAIS12(self):
-    self._ais_gets_correct_log_normalizer_wrapper([1, 2])
-
-if __name__ == '__main__':
+    self._ais_gets_correct_log_normalizer_wrapper(2)
+
+  def testAIS3(self):
+    self._ais_gets_correct_log_normalizer_wrapper(3)
+
+  def testSampleAIChainSeedReproducibleWorksCorrectly(self):
+    with self.test_session(graph=ops.Graph()) as sess:
+      independent_chain_ndims = 1
+      x = np.random.rand(4, 3, 2)
+
+      def proposal_log_prob(x):
+        event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x))
+        return -0.5 * math_ops.reduce_sum(x**2. + np.log(2 * np.pi),
+                                          axis=event_dims)
+
+      def target_log_prob(x):
+        event_dims = math_ops.range(independent_chain_ndims, array_ops.rank(x))
+        return self._log_gamma_log_prob(x, event_dims)
+
+      ais_kwargs = dict(
+          proposal_log_prob_fn=proposal_log_prob,
+          num_steps=200,
+          target_log_prob_fn=target_log_prob,
+          step_size=0.5,
+          current_state=x,
+          num_leapfrog_steps=2,
+          seed=53)
+
+      _, ais_weights0, _ = hmc.sample_annealed_importance_chain(
+          **ais_kwargs)
+
+      _, ais_weights1, _ = hmc.sample_annealed_importance_chain(
+          **ais_kwargs)
+
+      [ais_weights0_, ais_weights1_] = sess.run([
+          ais_weights0, ais_weights1])
+
+      self.assertAllClose(ais_weights0_, ais_weights1_,
+                          atol=1e-5, rtol=1e-5)
+
+  def testNanRejection(self):
+    """Tests that an update that yields NaN potentials gets rejected.
+
+    We run HMC with a target distribution that returns NaN
+    log-likelihoods if any element of x < 0, and unit-scale
+    exponential log-likelihoods otherwise. The exponential potential
+    pushes x towards 0, ensuring that any reasonably large update will
+    push us over the edge into NaN territory.
+    """
+    def _unbounded_exponential_log_prob(x):
+      """An exponential distribution with log-likelihood NaN for x < 0."""
+      per_element_potentials = array_ops.where(
+          x < 0.,
+          array_ops.fill(array_ops.shape(x), x.dtype.as_numpy_dtype(np.nan)),
+          -x)
+      return math_ops.reduce_sum(per_element_potentials)
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      initial_x = math_ops.linspace(0.01, 5, 10)
+      updated_x, kernel_results = hmc.kernel(
+          target_log_prob_fn=_unbounded_exponential_log_prob,
+          current_state=initial_x,
+          step_size=2.,
+          num_leapfrog_steps=5,
+          seed=46)
+      initial_x_, updated_x_, acceptance_probs_ = sess.run(
+          [initial_x, updated_x, kernel_results.acceptance_probs])
+
+      logging_ops.vlog(1, "initial_x = {}".format(initial_x_))
+      logging_ops.vlog(1, "updated_x = {}".format(updated_x_))
+      logging_ops.vlog(1, "acceptance_probs = {}".format(acceptance_probs_))
+
+      self.assertAllEqual(initial_x_, updated_x_)
+      self.assertEqual(acceptance_probs_, 0.)
+
+  def testNanFromGradsDontPropagate(self):
+    """Test that update with NaN gradients does not cause NaN in results."""
+    def _nan_log_prob_with_nan_gradient(x):
+      return np.nan * math_ops.reduce_sum(x)
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      initial_x = math_ops.linspace(0.01, 5, 10)
+      updated_x, kernel_results = hmc.kernel(
+          target_log_prob_fn=_nan_log_prob_with_nan_gradient,
+          current_state=initial_x,
+          step_size=2.,
+          num_leapfrog_steps=5,
+          seed=47)
+      initial_x_, updated_x_, acceptance_probs_ = sess.run(
+          [initial_x, updated_x, kernel_results.acceptance_probs])
+
+      logging_ops.vlog(1, "initial_x = {}".format(initial_x_))
+      logging_ops.vlog(1, "updated_x = {}".format(updated_x_))
+      logging_ops.vlog(1, "acceptance_probs = {}".format(acceptance_probs_))
+
+      self.assertAllEqual(initial_x_, updated_x_)
+      self.assertEqual(acceptance_probs_, 0.)
+
+      self.assertAllFinite(
+          gradients_ops.gradients(updated_x, initial_x)[0].eval())
+      self.assertAllEqual([True], [g is None for g in gradients_ops.gradients(
+          kernel_results.proposed_grads_target_log_prob, initial_x)])
+      self.assertAllEqual([False], [g is None for g in gradients_ops.gradients(
+          kernel_results.proposed_grads_target_log_prob,
+          kernel_results.proposed_state)])
+
+      # Gradients of the acceptance probs and new log prob are not finite.
+      # self.assertAllFinite(
+      #     gradients_ops.gradients(acceptance_probs, initial_x)[0].eval())
+      # self.assertAllFinite(
+      #     gradients_ops.gradients(new_log_prob, initial_x)[0].eval())
+
+  def _testChainWorksDtype(self, dtype):
+    with self.test_session(graph=ops.Graph()) as sess:
+      states, kernel_results = hmc.sample_chain(
+          num_results=10,
+          target_log_prob_fn=lambda x: -math_ops.reduce_sum(x**2., axis=-1),
+          current_state=np.zeros(5).astype(dtype),
+          step_size=0.01,
+          num_leapfrog_steps=10,
+          seed=48)
+      states_, acceptance_probs_ = sess.run(
+          [states, kernel_results.acceptance_probs])
+      self.assertEqual(dtype, states_.dtype)
+      self.assertEqual(dtype, acceptance_probs_.dtype)
+
+  def testChainWorksIn64Bit(self):
+    self._testChainWorksDtype(np.float64)
+
+  def testChainWorksIn16Bit(self):
+    self._testChainWorksDtype(np.float16)
+
+  def testChainWorksCorrelatedMultivariate(self):
+    dtype = np.float32
+    true_mean = dtype([0, 0])
+    true_cov = dtype([[1, 0.5],
+                      [0.5, 1]])
+    num_results = 2000
+    counter = collections.Counter()
+    with self.test_session(graph=ops.Graph()) as sess:
+      def target_log_prob(x, y):
+        counter["target_calls"] += 1
+        # Corresponds to unnormalized MVN.
+        # z = matmul(inv(chol(true_cov)), [x, y] - true_mean)
+        z = array_ops.stack([x, y], axis=-1) - true_mean
+        z = array_ops.squeeze(
+            gen_linalg_ops.matrix_triangular_solve(
+                np.linalg.cholesky(true_cov),
+                z[..., array_ops.newaxis]),
+            axis=-1)
+        return -0.5 * math_ops.reduce_sum(z**2., axis=-1)
+      states, _ = hmc.sample_chain(
+          num_results=num_results,
+          target_log_prob_fn=target_log_prob,
+          current_state=[dtype(-2), dtype(2)],
+          step_size=[0.5, 0.5],
+          num_leapfrog_steps=2,
+          num_burnin_steps=200,
+          num_steps_between_results=1,
+          seed=54)
+      self.assertAllEqual(dict(target_calls=2), counter)
+      states = array_ops.stack(states, axis=-1)
+      self.assertEqual(num_results, states.shape[0].value)
+      sample_mean = math_ops.reduce_mean(states, axis=0)
+      x = states - sample_mean
+      sample_cov = math_ops.matmul(x, x, transpose_a=True) / dtype(num_results)
+      [sample_mean_, sample_cov_] = sess.run([
+          sample_mean, sample_cov])
+      self.assertAllClose(true_mean, sample_mean_,
+                          atol=0.05, rtol=0.)
+      self.assertAllClose(true_cov, sample_cov_,
+                          atol=0., rtol=0.1)
+
+
+class _EnergyComputationTest(object):
+
+  def testHandlesNanFromPotential(self):
+    with self.test_session(graph=ops.Graph()) as sess:
+      x = [1, np.inf, -np.inf, np.nan]
+      target_log_prob, proposed_target_log_prob = [
+          self.dtype(x.flatten()) for x in np.meshgrid(x, x)]
+      num_chains = len(target_log_prob)
+      dummy_momentums = [-1, 1]
+      momentums = [self.dtype([dummy_momentums] * num_chains)]
+      proposed_momentums = [self.dtype([dummy_momentums] * num_chains)]
+
+      target_log_prob = ops.convert_to_tensor(target_log_prob)
+      momentums = [ops.convert_to_tensor(momentums[0])]
+      proposed_target_log_prob = ops.convert_to_tensor(proposed_target_log_prob)
+      proposed_momentums = [ops.convert_to_tensor(proposed_momentums[0])]
+
+      energy = _compute_energy_change(
+          target_log_prob,
+          momentums,
+          proposed_target_log_prob,
+          proposed_momentums,
+          independent_chain_ndims=1)
+      grads = gradients_ops.gradients(energy, momentums)
+
+      [actual_energy, grads_] = sess.run([energy, grads])
+
+      # Ensure energy is `inf` (note: that's positive inf) in weird cases and
+      # finite otherwise.
+      expected_energy = self.dtype([0] + [np.inf]*(num_chains - 1))
+      self.assertAllEqual(expected_energy, actual_energy)
+
+      # Ensure gradient is finite.
+      self.assertAllEqual(np.ones_like(grads_).astype(np.bool),
+                          np.isfinite(grads_))
+
+  def testHandlesNanFromKinetic(self):
+    with self.test_session(graph=ops.Graph()) as sess:
+      x = [1, np.inf, -np.inf, np.nan]
+      momentums, proposed_momentums = [
+          [np.reshape(self.dtype(x), [-1, 1])]
+          for x in np.meshgrid(x, x)]
+      num_chains = len(momentums[0])
+      target_log_prob = np.ones(num_chains, self.dtype)
+      proposed_target_log_prob = np.ones(num_chains, self.dtype)
+
+      target_log_prob = ops.convert_to_tensor(target_log_prob)
+      momentums = [ops.convert_to_tensor(momentums[0])]
+      proposed_target_log_prob = ops.convert_to_tensor(proposed_target_log_prob)
+      proposed_momentums = [ops.convert_to_tensor(proposed_momentums[0])]
+
+      energy = _compute_energy_change(
+          target_log_prob,
+          momentums,
+          proposed_target_log_prob,
+          proposed_momentums,
+          independent_chain_ndims=1)
+      grads = gradients_ops.gradients(energy, momentums)
+
+      [actual_energy, grads_] = sess.run([energy, grads])
+
+      # Ensure energy is `inf` (note: that's positive inf) in weird cases and
+      # finite otherwise.
+      expected_energy = self.dtype([0] + [np.inf]*(num_chains - 1))
+      self.assertAllEqual(expected_energy, actual_energy)
+
+      # Ensure gradient is finite.
+      g = grads_[0].reshape([len(x), len(x)])[:, 0]
+      self.assertAllEqual(np.ones_like(g).astype(np.bool), np.isfinite(g))
+
+      # The remaining gradients are nan because the momentum was itself nan or
+      # inf.
+      g = grads_[0].reshape([len(x), len(x)])[:, 1:]
+      self.assertAllEqual(np.ones_like(g).astype(np.bool), np.isnan(g))
+
+
+class EnergyComputationTest16(test.TestCase, _EnergyComputationTest):
+  dtype = np.float16
+
+
+class EnergyComputationTest32(test.TestCase, _EnergyComputationTest):
+  dtype = np.float32
+
+
+class EnergyComputationTest64(test.TestCase, _EnergyComputationTest):
+  dtype = np.float64
+
+
+class _HMCHandlesLists(object):
+
+  def testStateParts(self):
+    with self.test_session(graph=ops.Graph()) as sess:
+      dist_x = normal_lib.Normal(loc=self.dtype(0), scale=self.dtype(1))
+      dist_y = independent_lib.Independent(
+          gamma_lib.Gamma(concentration=self.dtype([1, 2]),
+                          rate=self.dtype([0.5, 0.75])),
+          reinterpreted_batch_ndims=1)
+      def target_log_prob(x, y):
+        return dist_x.log_prob(x) + dist_y.log_prob(y)
+      x0 = [dist_x.sample(seed=1), dist_y.sample(seed=2)]
+      samples, _ = hmc.sample_chain(
+          num_results=int(2e3),
+          target_log_prob_fn=target_log_prob,
+          current_state=x0,
+          step_size=0.85,
+          num_leapfrog_steps=3,
+          num_burnin_steps=int(250),
+          seed=49)
+      actual_means = [math_ops.reduce_mean(s, axis=0) for s in samples]
+      actual_vars = [_reduce_variance(s, axis=0) for s in samples]
+      expected_means = [dist_x.mean(), dist_y.mean()]
+      expected_vars = [dist_x.variance(), dist_y.variance()]
+      [
+          actual_means_,
+          actual_vars_,
+          expected_means_,
+          expected_vars_,
+      ] = sess.run([
+          actual_means,
+          actual_vars,
+          expected_means,
+          expected_vars,
+      ])
+      self.assertAllClose(expected_means_, actual_means_, atol=0.05, rtol=0.16)
+      self.assertAllClose(expected_vars_, actual_vars_, atol=0., rtol=0.25)
+
+
+class HMCHandlesLists32(_HMCHandlesLists, test.TestCase):
+  dtype = np.float32
+
+
+class HMCHandlesLists64(_HMCHandlesLists, test.TestCase):
+  dtype = np.float64
+
+
+if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/layers_conv_variational_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/layers_conv_variational_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..750afb6654311fea30a1dc6b31b20aa3b4160ae2
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/layers_conv_variational_test.py
@@ -0,0 +1,521 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for convolutional Bayesian layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.bayesflow.python.ops import layers_conv_variational as prob_layers_lib
+from tensorflow.contrib.bayesflow.python.ops import layers_util as prob_layers_util
+from tensorflow.contrib.distributions.python.ops import independent as independent_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.platform import test
+
+
+class Counter(object):
+  """Helper class to manage incrementing a counting `int`."""
+
+  def __init__(self):
+    self._value = -1
+
+  @property
+  def value(self):
+    return self._value
+
+  def __call__(self):
+    self._value += 1
+    return self._value
+
+
+class MockDistribution(independent_lib.Independent):
+  """Monitors layer calls to the underlying distribution."""
+
+  def __init__(self, result_sample, result_log_prob, loc=None, scale=None):
+    self.result_sample = result_sample
+    self.result_log_prob = result_log_prob
+    self.result_loc = loc
+    self.result_scale = scale
+    self.result_distribution = normal_lib.Normal(loc=0.0, scale=1.0)
+    if loc is not None and scale is not None:
+      self.result_distribution = normal_lib.Normal(loc=self.result_loc,
+                                                   scale=self.result_scale)
+    self.called_log_prob = Counter()
+    self.called_sample = Counter()
+    self.called_loc = Counter()
+    self.called_scale = Counter()
+
+  def log_prob(self, *args, **kwargs):
+    self.called_log_prob()
+    return self.result_log_prob
+
+  def sample(self, *args, **kwargs):
+    self.called_sample()
+    return self.result_sample
+
+  @property
+  def distribution(self):  # for dummy check on Independent(Normal)
+    return self.result_distribution
+
+  @property
+  def loc(self):
+    self.called_loc()
+    return self.result_loc
+
+  @property
+  def scale(self):
+    self.called_scale()
+    return self.result_scale
+
+
+class MockKLDivergence(object):
+  """Monitors layer calls to the divergence implementation."""
+
+  def __init__(self, result):
+    self.result = result
+    self.args = []
+    self.called = Counter()
+
+  def __call__(self, *args, **kwargs):
+    self.called()
+    self.args.append(args)
+    return self.result
+
+
+class ConvVariational(test.TestCase):
+
+  def _testKLPenaltyKernel(self, layer_class):
+    with self.test_session():
+      layer = layer_class(filters=2, kernel_size=3)
+      if layer_class in (prob_layers_lib.Conv1DReparameterization,
+                         prob_layers_lib.Conv1DFlipout):
+        inputs = random_ops.random_uniform([2, 3, 1], seed=1)
+      elif layer_class in (prob_layers_lib.Conv2DReparameterization,
+                           prob_layers_lib.Conv2DFlipout):
+        inputs = random_ops.random_uniform([2, 3, 3, 1], seed=1)
+      elif layer_class in (prob_layers_lib.Conv3DReparameterization,
+                           prob_layers_lib.Conv3DFlipout):
+        inputs = random_ops.random_uniform([2, 3, 3, 3, 1], seed=1)
+
+      # No keys.
+      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+      self.assertEqual(len(losses), 0)
+      self.assertListEqual(layer.losses, losses)
+
+      _ = layer(inputs)
+
+      # Yes keys.
+      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+      self.assertEqual(len(losses), 1)
+      self.assertListEqual(layer.losses, losses)
+
+  def _testKLPenaltyBoth(self, layer_class):
+    def _make_normal(dtype, *args):  # pylint: disable=unused-argument
+      return normal_lib.Normal(
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.))
+    with self.test_session():
+      layer = layer_class(
+          filters=2,
+          kernel_size=3,
+          bias_posterior_fn=prob_layers_util.default_mean_field_normal_fn(),
+          bias_prior_fn=_make_normal)
+      if layer_class in (prob_layers_lib.Conv1DReparameterization,
+                         prob_layers_lib.Conv1DFlipout):
+        inputs = random_ops.random_uniform([2, 3, 1], seed=1)
+      elif layer_class in (prob_layers_lib.Conv2DReparameterization,
+                           prob_layers_lib.Conv2DFlipout):
+        inputs = random_ops.random_uniform([2, 3, 3, 1], seed=1)
+      elif layer_class in (prob_layers_lib.Conv3DReparameterization,
+                           prob_layers_lib.Conv3DFlipout):
+        inputs = random_ops.random_uniform([2, 3, 3, 3, 1], seed=1)
+
+      # No keys.
+      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+      self.assertEqual(len(losses), 0)
+      self.assertListEqual(layer.losses, losses)
+
+      _ = layer(inputs)
+
+      # Yes keys.
+      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+      self.assertEqual(len(losses), 2)
+      self.assertListEqual(layer.losses, losses)
+
+  def _testConvSetUp(self, layer_class, batch_size, depth=None,
+                     height=None, width=None, channels=None, filters=None,
+                     **kwargs):
+    seed = Counter()
+    if layer_class in (prob_layers_lib.Conv1DReparameterization,
+                       prob_layers_lib.Conv1DFlipout):
+      inputs = random_ops.random_uniform(
+          [batch_size, width, channels], seed=seed())
+      kernel_size = (2,)
+    elif layer_class in (prob_layers_lib.Conv2DReparameterization,
+                         prob_layers_lib.Conv2DFlipout):
+      inputs = random_ops.random_uniform(
+          [batch_size, height, width, channels], seed=seed())
+      kernel_size = (2, 2)
+    elif layer_class in (prob_layers_lib.Conv3DReparameterization,
+                         prob_layers_lib.Conv3DFlipout):
+      inputs = random_ops.random_uniform(
+          [batch_size, depth, height, width, channels], seed=seed())
+      kernel_size = (2, 2, 2)
+
+    kernel_shape = kernel_size + (channels, filters)
+    kernel_posterior = MockDistribution(
+        loc=random_ops.random_uniform(kernel_shape, seed=seed()),
+        scale=random_ops.random_uniform(kernel_shape, seed=seed()),
+        result_log_prob=random_ops.random_uniform(kernel_shape, seed=seed()),
+        result_sample=random_ops.random_uniform(kernel_shape, seed=seed()))
+    kernel_prior = MockDistribution(
+        result_log_prob=random_ops.random_uniform(kernel_shape, seed=seed()),
+        result_sample=random_ops.random_uniform(kernel_shape, seed=seed()))
+    kernel_divergence = MockKLDivergence(
+        result=random_ops.random_uniform(kernel_shape, seed=seed()))
+
+    bias_size = (filters,)
+    bias_posterior = MockDistribution(
+        result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
+        result_sample=random_ops.random_uniform(bias_size, seed=seed()))
+    bias_prior = MockDistribution(
+        result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
+        result_sample=random_ops.random_uniform(bias_size, seed=seed()))
+    bias_divergence = MockKLDivergence(
+        result=random_ops.random_uniform(bias_size, seed=seed()))
+
+    layer = layer_class(
+        filters=filters,
+        kernel_size=kernel_size,
+        padding="SAME",
+        kernel_posterior_fn=lambda *args: kernel_posterior,
+        kernel_posterior_tensor_fn=lambda d: d.sample(seed=42),
+        kernel_prior_fn=lambda *args: kernel_prior,
+        kernel_divergence_fn=kernel_divergence,
+        bias_posterior_fn=lambda *args: bias_posterior,
+        bias_posterior_tensor_fn=lambda d: d.sample(seed=43),
+        bias_prior_fn=lambda *args: bias_prior,
+        bias_divergence_fn=bias_divergence,
+        **kwargs)
+
+    outputs = layer(inputs)
+
+    kl_penalty = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+    return (kernel_posterior, kernel_prior, kernel_divergence,
+            bias_posterior, bias_prior, bias_divergence,
+            layer, inputs, outputs, kl_penalty, kernel_shape)
+
+  def _testConvReparameterization(self, layer_class):
+    batch_size, depth, height, width, channels, filters = 2, 4, 4, 4, 3, 5
+    with self.test_session() as sess:
+      (kernel_posterior, kernel_prior, kernel_divergence,
+       bias_posterior, bias_prior, bias_divergence, layer, inputs,
+       outputs, kl_penalty, kernel_shape) = self._testConvSetUp(
+           layer_class, batch_size,
+           depth=depth, height=height, width=width, channels=channels,
+           filters=filters)
+
+      convolution_op = nn_ops.Convolution(
+          tensor_shape.TensorShape(inputs.shape),
+          filter_shape=tensor_shape.TensorShape(kernel_shape),
+          padding="SAME")
+      expected_outputs = convolution_op(inputs, kernel_posterior.result_sample)
+      expected_outputs = nn.bias_add(expected_outputs,
+                                     bias_posterior.result_sample,
+                                     data_format="NHWC")
+
+      [
+          expected_outputs_, actual_outputs_,
+          expected_kernel_, actual_kernel_,
+          expected_kernel_divergence_, actual_kernel_divergence_,
+          expected_bias_, actual_bias_,
+          expected_bias_divergence_, actual_bias_divergence_,
+      ] = sess.run([
+          expected_outputs, outputs,
+          kernel_posterior.result_sample, layer.kernel_posterior_tensor,
+          kernel_divergence.result, kl_penalty[0],
+          bias_posterior.result_sample, layer.bias_posterior_tensor,
+          bias_divergence.result, kl_penalty[1],
+      ])
+
+      self.assertAllClose(
+          expected_kernel_, actual_kernel_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_bias_, actual_bias_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_outputs_, actual_outputs_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_kernel_divergence_, actual_kernel_divergence_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_bias_divergence_, actual_bias_divergence_,
+          rtol=1e-6, atol=0.)
+
+      self.assertAllEqual(
+          [[kernel_posterior.distribution,
+            kernel_prior.distribution,
+            kernel_posterior.result_sample]],
+          kernel_divergence.args)
+
+      self.assertAllEqual(
+          [[bias_posterior.distribution,
+            bias_prior.distribution,
+            bias_posterior.result_sample]],
+          bias_divergence.args)
+
+  def _testConvFlipout(self, layer_class):
+    batch_size, depth, height, width, channels, filters = 2, 4, 4, 4, 3, 5
+    with self.test_session() as sess:
+      (kernel_posterior, kernel_prior, kernel_divergence,
+       bias_posterior, bias_prior, bias_divergence, layer, inputs,
+       outputs, kl_penalty, kernel_shape) = self._testConvSetUp(
+           layer_class, batch_size,
+           depth=depth, height=height, width=width, channels=channels,
+           filters=filters, seed=44)
+
+      convolution_op = nn_ops.Convolution(
+          tensor_shape.TensorShape(inputs.shape),
+          filter_shape=tensor_shape.TensorShape(kernel_shape),
+          padding="SAME")
+
+      expected_kernel_posterior_affine = normal_lib.Normal(
+          loc=array_ops.zeros_like(kernel_posterior.result_loc),
+          scale=kernel_posterior.result_scale)
+      expected_kernel_posterior_affine_tensor = (
+          expected_kernel_posterior_affine.sample(seed=42))
+
+      expected_outputs = convolution_op(
+          inputs, kernel_posterior.distribution.loc)
+
+      input_shape = array_ops.shape(inputs)
+      output_shape = array_ops.shape(expected_outputs)
+      batch_shape = array_ops.expand_dims(input_shape[0], 0)
+      channels = input_shape[-1]
+      rank = len(inputs.get_shape()) - 2
+
+      sign_input = random_ops.random_uniform(
+          array_ops.concat([batch_shape,
+                            array_ops.expand_dims(channels, 0)], 0),
+          minval=0,
+          maxval=2,
+          dtype=dtypes.int32,
+          seed=layer.seed)
+      sign_input = math_ops.cast(2 * sign_input - 1, inputs.dtype)
+      sign_output = random_ops.random_uniform(
+          array_ops.concat([batch_shape,
+                            array_ops.expand_dims(filters, 0)], 0),
+          minval=0,
+          maxval=2,
+          dtype=dtypes.int32,
+          seed=distribution_util.gen_new_seed(
+              layer.seed, salt="conv_flipout"))
+      sign_output = math_ops.cast(2 * sign_output - 1, inputs.dtype)
+      for _ in range(rank):
+        sign_input = array_ops.expand_dims(sign_input, 1)  # 2D ex: (B, 1, 1, C)
+        sign_output = array_ops.expand_dims(sign_output, 1)
+
+      sign_input = array_ops.tile(  # tile for element-wise op broadcasting
+          sign_input,
+          [1] + [input_shape[i + 1] for i in range(rank)] + [1])
+      sign_output = array_ops.tile(
+          sign_output,
+          [1] + [output_shape[i + 1] for i in range(rank)] + [1])
+
+      perturbed_inputs = convolution_op(
+          inputs * sign_input, expected_kernel_posterior_affine_tensor)
+      perturbed_inputs *= sign_output
+
+      expected_outputs += perturbed_inputs
+      expected_outputs = nn.bias_add(expected_outputs,
+                                     bias_posterior.result_sample,
+                                     data_format="NHWC")
+
+      [
+          expected_outputs_, actual_outputs_,
+          expected_kernel_divergence_, actual_kernel_divergence_,
+          expected_bias_, actual_bias_,
+          expected_bias_divergence_, actual_bias_divergence_,
+      ] = sess.run([
+          expected_outputs, outputs,
+          kernel_divergence.result, kl_penalty[0],
+          bias_posterior.result_sample, layer.bias_posterior_tensor,
+          bias_divergence.result, kl_penalty[1],
+      ])
+
+      self.assertAllClose(
+          expected_bias_, actual_bias_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_outputs_, actual_outputs_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_kernel_divergence_, actual_kernel_divergence_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_bias_divergence_, actual_bias_divergence_,
+          rtol=1e-6, atol=0.)
+
+      self.assertAllEqual(
+          [[kernel_posterior.distribution, kernel_prior.distribution, None]],
+          kernel_divergence.args)
+
+      self.assertAllEqual(
+          [[bias_posterior.distribution,
+            bias_prior.distribution,
+            bias_posterior.result_sample]],
+          bias_divergence.args)
+
+  def _testRandomConvFlipout(self, layer_class):
+    batch_size, depth, height, width, channels, filters = 2, 4, 4, 4, 3, 5
+    with self.test_session() as sess:
+      seed = Counter()
+      if layer_class in (prob_layers_lib.Conv1DReparameterization,
+                         prob_layers_lib.Conv1DFlipout):
+        inputs = random_ops.random_uniform(
+            [batch_size, width, channels], seed=seed())
+        kernel_size = (2,)
+      elif layer_class in (prob_layers_lib.Conv2DReparameterization,
+                           prob_layers_lib.Conv2DFlipout):
+        inputs = random_ops.random_uniform(
+            [batch_size, height, width, channels], seed=seed())
+        kernel_size = (2, 2)
+      elif layer_class in (prob_layers_lib.Conv3DReparameterization,
+                           prob_layers_lib.Conv3DFlipout):
+        inputs = random_ops.random_uniform(
+            [batch_size, depth, height, width, channels], seed=seed())
+        kernel_size = (2, 2, 2)
+
+      kernel_shape = kernel_size + (channels, filters)
+      bias_size = (filters,)
+
+      kernel_posterior = MockDistribution(
+          loc=random_ops.random_uniform(
+              kernel_shape, seed=seed()),
+          scale=random_ops.random_uniform(
+              kernel_shape, seed=seed()),
+          result_log_prob=random_ops.random_uniform(
+              kernel_shape, seed=seed()),
+          result_sample=random_ops.random_uniform(
+              kernel_shape, seed=seed()))
+      bias_posterior = MockDistribution(
+          loc=random_ops.random_uniform(
+              bias_size, seed=seed()),
+          scale=random_ops.random_uniform(
+              bias_size, seed=seed()),
+          result_log_prob=random_ops.random_uniform(
+              bias_size, seed=seed()),
+          result_sample=random_ops.random_uniform(
+              bias_size, seed=seed()))
+      layer_one = layer_class(
+          filters=filters,
+          kernel_size=kernel_size,
+          padding="SAME",
+          kernel_posterior_fn=lambda *args: kernel_posterior,
+          kernel_posterior_tensor_fn=lambda d: d.sample(seed=42),
+          bias_posterior_fn=lambda *args: bias_posterior,
+          bias_posterior_tensor_fn=lambda d: d.sample(seed=43),
+          seed=44)
+      layer_two = layer_class(
+          filters=filters,
+          kernel_size=kernel_size,
+          padding="SAME",
+          kernel_posterior_fn=lambda *args: kernel_posterior,
+          kernel_posterior_tensor_fn=lambda d: d.sample(seed=42),
+          bias_posterior_fn=lambda *args: bias_posterior,
+          bias_posterior_tensor_fn=lambda d: d.sample(seed=43),
+          seed=45)
+
+      outputs_one = layer_one(inputs)
+      outputs_two = layer_two(inputs)
+
+      outputs_one_, outputs_two_ = sess.run([
+          outputs_one, outputs_two])
+
+      self.assertLess(np.sum(np.isclose(outputs_one_, outputs_two_)),
+                      np.prod(outputs_one_.shape))
+
+  def testKLPenaltyKernelConv1DReparameterization(self):
+    self._testKLPenaltyKernel(prob_layers_lib.Conv1DReparameterization)
+
+  def testKLPenaltyKernelConv2DReparameterization(self):
+    self._testKLPenaltyKernel(prob_layers_lib.Conv2DReparameterization)
+
+  def testKLPenaltyKernelConv3DReparameterization(self):
+    self._testKLPenaltyKernel(prob_layers_lib.Conv3DReparameterization)
+
+  def testKLPenaltyKernelConv1DFlipout(self):
+    self._testKLPenaltyKernel(prob_layers_lib.Conv1DFlipout)
+
+  def testKLPenaltyKernelConv2DFlipout(self):
+    self._testKLPenaltyKernel(prob_layers_lib.Conv2DFlipout)
+
+  def testKLPenaltyKernelConv3DFlipout(self):
+    self._testKLPenaltyKernel(prob_layers_lib.Conv3DFlipout)
+
+  def testKLPenaltyBothConv1DReparameterization(self):
+    self._testKLPenaltyBoth(prob_layers_lib.Conv1DReparameterization)
+
+  def testKLPenaltyBothConv2DReparameterization(self):
+    self._testKLPenaltyBoth(prob_layers_lib.Conv2DReparameterization)
+
+  def testKLPenaltyBothConv3DReparameterization(self):
+    self._testKLPenaltyBoth(prob_layers_lib.Conv3DReparameterization)
+
+  def testKLPenaltyBothConv1DFlipout(self):
+    self._testKLPenaltyBoth(prob_layers_lib.Conv1DFlipout)
+
+  def testKLPenaltyBothConv2DFlipout(self):
+    self._testKLPenaltyBoth(prob_layers_lib.Conv2DFlipout)
+
+  def testKLPenaltyBothConv3DFlipout(self):
+    self._testKLPenaltyBoth(prob_layers_lib.Conv3DFlipout)
+
+  def testConv1DReparameterization(self):
+    self._testConvReparameterization(prob_layers_lib.Conv1DReparameterization)
+
+  def testConv2DReparameterization(self):
+    self._testConvReparameterization(prob_layers_lib.Conv2DReparameterization)
+
+  def testConv3DReparameterization(self):
+    self._testConvReparameterization(prob_layers_lib.Conv3DReparameterization)
+
+  def testConv1DFlipout(self):
+    self._testConvFlipout(prob_layers_lib.Conv1DFlipout)
+
+  def testConv2DFlipout(self):
+    self._testConvFlipout(prob_layers_lib.Conv2DFlipout)
+
+  def testConv3DFlipout(self):
+    self._testConvFlipout(prob_layers_lib.Conv3DFlipout)
+
+  def testRandomConv1DFlipout(self):
+    self._testRandomConvFlipout(prob_layers_lib.Conv1DFlipout)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/layers_dense_variational_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/layers_dense_variational_test.py
index 50358fd1c2b7635ffe2d08c5af3219bb0a11498b..342f38ccec7ec74db1b393d6cdc22300205cc547 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/layers_dense_variational_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/layers_dense_variational_test.py
@@ -18,11 +18,18 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.bayesflow.python.ops import layers_dense_variational_impl as prob_layers_lib
+import numpy as np
+
+from tensorflow.contrib.bayesflow.python.ops import layers_dense_variational as prob_layers_lib
+from tensorflow.contrib.bayesflow.python.ops import layers_util as prob_layers_util
+from tensorflow.contrib.distributions.python.ops import independent as independent_lib
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.ops.distributions import util as distribution_util
 from tensorflow.python.platform import test
 
 
@@ -41,14 +48,18 @@ class Counter(object):
     return self._value
 
 
-class MockDistribution(normal_lib.Normal):
-  """Monitors DenseVariational calls to the underlying distribution."""
+class MockDistribution(independent_lib.Independent):
+  """Monitors layer calls to the underlying distribution."""
 
   def __init__(self, result_sample, result_log_prob, loc=None, scale=None):
     self.result_sample = result_sample
     self.result_log_prob = result_log_prob
     self.result_loc = loc
     self.result_scale = scale
+    self.result_distribution = normal_lib.Normal(loc=0.0, scale=1.0)
+    if loc is not None and scale is not None:
+      self.result_distribution = normal_lib.Normal(loc=self.result_loc,
+                                                   scale=self.result_scale)
     self.called_log_prob = Counter()
     self.called_sample = Counter()
     self.called_loc = Counter()
@@ -62,6 +73,10 @@ class MockDistribution(normal_lib.Normal):
     self.called_sample()
     return self.result_sample
 
+  @property
+  def distribution(self):  # for dummy check on Independent(Normal)
+    return self.result_distribution
+
   @property
   def loc(self):
     self.called_loc()
@@ -74,7 +89,7 @@ class MockDistribution(normal_lib.Normal):
 
 
 class MockKLDivergence(object):
-  """Monitors DenseVariational calls to the divergence implementation."""
+  """Monitors layer calls to the divergence implementation."""
 
   def __init__(self, result):
     self.result = result
@@ -87,94 +102,125 @@ class MockKLDivergence(object):
     return self.result
 
 
-class DenseVariationalLocalReparametrization(test.TestCase):
+class DenseVariational(test.TestCase):
 
-  def testKLPenaltyKernel(self):
+  def _testKLPenaltyKernel(self, layer_class):
     with self.test_session():
-      dense_vi = prob_layers_lib.DenseVariational(units=2)
+      layer = layer_class(units=2)
       inputs = random_ops.random_uniform([2, 3], seed=1)
 
       # No keys.
-      loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
-      self.assertEqual(len(loss_keys), 0)
-      self.assertListEqual(dense_vi.losses, loss_keys)
+      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+      self.assertEqual(len(losses), 0)
+      self.assertListEqual(layer.losses, losses)
 
-      _ = dense_vi(inputs)
+      _ = layer(inputs)
 
       # Yes keys.
-      loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
-      self.assertEqual(len(loss_keys), 1)
-      self.assertListEqual(dense_vi.losses, loss_keys)
+      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+      self.assertEqual(len(losses), 1)
+      self.assertListEqual(layer.losses, losses)
 
-  def testKLPenaltyBoth(self):
+  def _testKLPenaltyBoth(self, layer_class):
     def _make_normal(dtype, *args):  # pylint: disable=unused-argument
       return normal_lib.Normal(
           loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.))
     with self.test_session():
-      dense_vi = prob_layers_lib.DenseVariational(
+      layer = layer_class(
           units=2,
-          bias_posterior_fn=prob_layers_lib.default_mean_field_normal_fn(),
+          bias_posterior_fn=prob_layers_util.default_mean_field_normal_fn(),
           bias_prior_fn=_make_normal)
       inputs = random_ops.random_uniform([2, 3], seed=1)
 
       # No keys.
-      loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
-      self.assertEqual(len(loss_keys), 0)
-      self.assertListEqual(dense_vi.losses, loss_keys)
+      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+      self.assertEqual(len(losses), 0)
+      self.assertListEqual(layer.losses, losses)
 
-      _ = dense_vi(inputs)
+      _ = layer(inputs)
 
       # Yes keys.
-      loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
-      self.assertEqual(len(loss_keys), 2)
-      self.assertListEqual(dense_vi.losses, loss_keys)
-
-  def testVariationalNonLocal(self):
+      losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+      self.assertEqual(len(losses), 2)
+      self.assertListEqual(layer.losses, losses)
+
+  def _testDenseSetUp(self, layer_class, batch_size, in_size, out_size,
+                      **kwargs):
+    seed = Counter()
+    inputs = random_ops.random_uniform([batch_size, in_size], seed=seed())
+
+    kernel_size = [in_size, out_size]
+    kernel_posterior = MockDistribution(
+        loc=random_ops.random_uniform(kernel_size, seed=seed()),
+        scale=random_ops.random_uniform(kernel_size, seed=seed()),
+        result_log_prob=random_ops.random_uniform(kernel_size, seed=seed()),
+        result_sample=random_ops.random_uniform(kernel_size, seed=seed()))
+    kernel_prior = MockDistribution(
+        result_log_prob=random_ops.random_uniform(kernel_size, seed=seed()),
+        result_sample=random_ops.random_uniform(kernel_size, seed=seed()))
+    kernel_divergence = MockKLDivergence(
+        result=random_ops.random_uniform(kernel_size, seed=seed()))
+
+    bias_size = [out_size]
+    bias_posterior = MockDistribution(
+        result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
+        result_sample=random_ops.random_uniform(bias_size, seed=seed()))
+    bias_prior = MockDistribution(
+        result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
+        result_sample=random_ops.random_uniform(bias_size, seed=seed()))
+    bias_divergence = MockKLDivergence(
+        result=random_ops.random_uniform(bias_size, seed=seed()))
+
+    layer = layer_class(
+        units=out_size,
+        kernel_posterior_fn=lambda *args: kernel_posterior,
+        kernel_posterior_tensor_fn=lambda d: d.sample(seed=42),
+        kernel_prior_fn=lambda *args: kernel_prior,
+        kernel_divergence_fn=kernel_divergence,
+        bias_posterior_fn=lambda *args: bias_posterior,
+        bias_posterior_tensor_fn=lambda d: d.sample(seed=43),
+        bias_prior_fn=lambda *args: bias_prior,
+        bias_divergence_fn=bias_divergence,
+        **kwargs)
+
+    outputs = layer(inputs)
+
+    kl_penalty = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+    return (kernel_posterior, kernel_prior, kernel_divergence,
+            bias_posterior, bias_prior, bias_divergence,
+            layer, inputs, outputs, kl_penalty)
+
+  def testKLPenaltyKernelReparameterization(self):
+    self._testKLPenaltyKernel(prob_layers_lib.DenseReparameterization)
+
+  def testKLPenaltyKernelLocalReparameterization(self):
+    self._testKLPenaltyKernel(prob_layers_lib.DenseLocalReparameterization)
+
+  def testKLPenaltyKernelFlipout(self):
+    self._testKLPenaltyKernel(prob_layers_lib.DenseFlipout)
+
+  def testKLPenaltyBothReparameterization(self):
+    self._testKLPenaltyBoth(prob_layers_lib.DenseReparameterization)
+
+  def testKLPenaltyBothLocalReparameterization(self):
+    self._testKLPenaltyBoth(prob_layers_lib.DenseLocalReparameterization)
+
+  def testKLPenaltyBothFlipout(self):
+    self._testKLPenaltyBoth(prob_layers_lib.DenseFlipout)
+
+  def testDenseReparameterization(self):
     batch_size, in_size, out_size = 2, 3, 4
     with self.test_session() as sess:
-      seed = Counter()
-      inputs = random_ops.random_uniform([batch_size, in_size], seed=seed())
-
-      kernel_size = [in_size, out_size]
-      kernel_posterior = MockDistribution(
-          result_log_prob=random_ops.random_uniform(kernel_size, seed=seed()),
-          result_sample=random_ops.random_uniform(kernel_size, seed=seed()))
-      kernel_prior = MockDistribution(
-          result_log_prob=random_ops.random_uniform(kernel_size, seed=seed()),
-          result_sample=random_ops.random_uniform(kernel_size, seed=seed()))
-      kernel_divergence = MockKLDivergence(
-          result=random_ops.random_uniform(kernel_size, seed=seed()))
-
-      bias_size = [out_size]
-      bias_posterior = MockDistribution(
-          result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
-          result_sample=random_ops.random_uniform(bias_size, seed=seed()))
-      bias_prior = MockDistribution(
-          result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
-          result_sample=random_ops.random_uniform(bias_size, seed=seed()))
-      bias_divergence = MockKLDivergence(
-          result=random_ops.random_uniform(bias_size, seed=seed()))
+      (kernel_posterior, kernel_prior, kernel_divergence,
+       bias_posterior, bias_prior, bias_divergence, layer, inputs,
+       outputs, kl_penalty) = self._testDenseSetUp(
+           prob_layers_lib.DenseReparameterization,
+           batch_size, in_size, out_size)
 
       expected_outputs = (
           math_ops.matmul(inputs, kernel_posterior.result_sample) +
           bias_posterior.result_sample)
 
-      dense_vi = prob_layers_lib.DenseVariational(
-          units=2,
-          kernel_use_local_reparameterization=False,
-          kernel_posterior_fn=lambda *args: kernel_posterior,
-          kernel_posterior_tensor_fn=lambda d: d.sample(seed=42),
-          kernel_prior_fn=lambda *args: kernel_prior,
-          kernel_divergence_fn=kernel_divergence,
-          bias_posterior_fn=lambda *args: bias_posterior,
-          bias_posterior_tensor_fn=lambda d: d.sample(seed=43),
-          bias_prior_fn=lambda *args: bias_prior,
-          bias_divergence_fn=bias_divergence)
-
-      outputs = dense_vi(inputs)
-
-      kl_penalty = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
-
       [
           expected_outputs_, actual_outputs_,
           expected_kernel_, actual_kernel_,
@@ -183,9 +229,9 @@ class DenseVariationalLocalReparametrization(test.TestCase):
           expected_bias_divergence_, actual_bias_divergence_,
       ] = sess.run([
           expected_outputs, outputs,
-          kernel_posterior.result_sample, dense_vi.kernel.posterior_tensor,
+          kernel_posterior.result_sample, layer.kernel_posterior_tensor,
           kernel_divergence.result, kl_penalty[0],
-          bias_posterior.result_sample, dense_vi.bias.posterior_tensor,
+          bias_posterior.result_sample, layer.bias_posterior_tensor,
           bias_divergence.result, kl_penalty[1],
       ])
 
@@ -206,40 +252,25 @@ class DenseVariationalLocalReparametrization(test.TestCase):
           rtol=1e-6, atol=0.)
 
       self.assertAllEqual(
-          [[kernel_posterior, kernel_prior, kernel_posterior.result_sample]],
+          [[kernel_posterior.distribution,
+            kernel_prior.distribution,
+            kernel_posterior.result_sample]],
           kernel_divergence.args)
 
       self.assertAllEqual(
-          [[bias_posterior, bias_prior, bias_posterior.result_sample]],
+          [[bias_posterior.distribution,
+            bias_prior.distribution,
+            bias_posterior.result_sample]],
           bias_divergence.args)
 
-  def testVariationalLocal(self):
+  def testDenseLocalReparameterization(self):
     batch_size, in_size, out_size = 2, 3, 4
     with self.test_session() as sess:
-      seed = Counter()
-      inputs = random_ops.random_uniform([batch_size, in_size], seed=seed())
-
-      kernel_size = [in_size, out_size]
-      kernel_posterior = MockDistribution(
-          loc=random_ops.random_uniform(kernel_size, seed=seed()),
-          scale=random_ops.random_uniform(kernel_size, seed=seed()),
-          result_log_prob=random_ops.random_uniform(kernel_size, seed=seed()),
-          result_sample=random_ops.random_uniform(kernel_size, seed=seed()))
-      kernel_prior = MockDistribution(
-          result_log_prob=random_ops.random_uniform(kernel_size, seed=seed()),
-          result_sample=random_ops.random_uniform(kernel_size, seed=seed()))
-      kernel_divergence = MockKLDivergence(
-          result=random_ops.random_uniform(kernel_size, seed=seed()))
-
-      bias_size = [out_size]
-      bias_posterior = MockDistribution(
-          result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
-          result_sample=random_ops.random_uniform(bias_size, seed=seed()))
-      bias_prior = MockDistribution(
-          result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
-          result_sample=random_ops.random_uniform(bias_size, seed=seed()))
-      bias_divergence = MockKLDivergence(
-          result=random_ops.random_uniform(bias_size, seed=seed()))
+      (kernel_posterior, kernel_prior, kernel_divergence,
+       bias_posterior, bias_prior, bias_divergence, layer, inputs,
+       outputs, kl_penalty) = self._testDenseSetUp(
+           prob_layers_lib.DenseLocalReparameterization,
+           batch_size, in_size, out_size)
 
       expected_kernel_posterior_affine = normal_lib.Normal(
           loc=math_ops.matmul(inputs, kernel_posterior.result_loc),
@@ -250,21 +281,80 @@ class DenseVariationalLocalReparametrization(test.TestCase):
       expected_outputs = (expected_kernel_posterior_affine_tensor +
                           bias_posterior.result_sample)
 
-      dense_vi = prob_layers_lib.DenseVariational(
-          units=2,
-          kernel_use_local_reparameterization=True,
-          kernel_posterior_fn=lambda *args: kernel_posterior,
-          kernel_posterior_tensor_fn=lambda d: d.sample(seed=42),
-          kernel_prior_fn=lambda *args: kernel_prior,
-          kernel_divergence_fn=kernel_divergence,
-          bias_posterior_fn=lambda *args: bias_posterior,
-          bias_posterior_tensor_fn=lambda d: d.sample(seed=43),
-          bias_prior_fn=lambda *args: bias_prior,
-          bias_divergence_fn=bias_divergence)
+      [
+          expected_outputs_, actual_outputs_,
+          expected_kernel_divergence_, actual_kernel_divergence_,
+          expected_bias_, actual_bias_,
+          expected_bias_divergence_, actual_bias_divergence_,
+      ] = sess.run([
+          expected_outputs, outputs,
+          kernel_divergence.result, kl_penalty[0],
+          bias_posterior.result_sample, layer.bias_posterior_tensor,
+          bias_divergence.result, kl_penalty[1],
+      ])
 
-      outputs = dense_vi(inputs)
+      self.assertAllClose(
+          expected_bias_, actual_bias_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_outputs_, actual_outputs_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_kernel_divergence_, actual_kernel_divergence_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_bias_divergence_, actual_bias_divergence_,
+          rtol=1e-6, atol=0.)
+
+      self.assertAllEqual(
+          [[kernel_posterior.distribution,
+            kernel_prior.distribution,
+            None]],
+          kernel_divergence.args)
+
+      self.assertAllEqual(
+          [[bias_posterior.distribution,
+            bias_prior.distribution,
+            bias_posterior.result_sample]],
+          bias_divergence.args)
+
+  def testDenseFlipout(self):
+    batch_size, in_size, out_size = 2, 3, 4
+    with self.test_session() as sess:
+      (kernel_posterior, kernel_prior, kernel_divergence,
+       bias_posterior, bias_prior, bias_divergence, layer, inputs,
+       outputs, kl_penalty) = self._testDenseSetUp(
+           prob_layers_lib.DenseFlipout,
+           batch_size, in_size, out_size, seed=44)
+
+      expected_kernel_posterior_affine = normal_lib.Normal(
+          loc=array_ops.zeros_like(kernel_posterior.result_loc),
+          scale=kernel_posterior.result_scale)
+      expected_kernel_posterior_affine_tensor = (
+          expected_kernel_posterior_affine.sample(seed=42))
 
-      kl_penalty = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+      sign_input = random_ops.random_uniform(
+          [batch_size, in_size],
+          minval=0,
+          maxval=2,
+          dtype=dtypes.int32,
+          seed=layer.seed)
+      sign_input = math_ops.cast(2 * sign_input - 1, inputs.dtype)
+      sign_output = random_ops.random_uniform(
+          [batch_size, out_size],
+          minval=0,
+          maxval=2,
+          dtype=dtypes.int32,
+          seed=distribution_util.gen_new_seed(
+              layer.seed, salt="dense_flipout"))
+      sign_output = math_ops.cast(2 * sign_output - 1, inputs.dtype)
+      perturbed_inputs = math_ops.matmul(
+          inputs * sign_input, expected_kernel_posterior_affine_tensor)
+      perturbed_inputs *= sign_output
+
+      expected_outputs = math_ops.matmul(inputs, kernel_posterior.result_loc)
+      expected_outputs += perturbed_inputs
+      expected_outputs += bias_posterior.result_sample
 
       [
           expected_outputs_, actual_outputs_,
@@ -274,7 +364,7 @@ class DenseVariationalLocalReparametrization(test.TestCase):
       ] = sess.run([
           expected_outputs, outputs,
           kernel_divergence.result, kl_penalty[0],
-          bias_posterior.result_sample, dense_vi.bias.posterior_tensor,
+          bias_posterior.result_sample, layer.bias_posterior_tensor,
           bias_divergence.result, kl_penalty[1],
       ])
 
@@ -292,13 +382,62 @@ class DenseVariationalLocalReparametrization(test.TestCase):
           rtol=1e-6, atol=0.)
 
       self.assertAllEqual(
-          [[kernel_posterior, kernel_prior, None]],
+          [[kernel_posterior.distribution, kernel_prior.distribution, None]],
           kernel_divergence.args)
 
       self.assertAllEqual(
-          [[bias_posterior, bias_prior, bias_posterior.result_sample]],
+          [[bias_posterior.distribution,
+            bias_prior.distribution,
+            bias_posterior.result_sample]],
           bias_divergence.args)
 
+  def testRandomDenseFlipout(self):
+    batch_size, in_size, out_size = 2, 3, 4
+    with self.test_session() as sess:
+      seed = Counter()
+      inputs = random_ops.random_uniform([batch_size, in_size], seed=seed())
+
+      kernel_posterior = MockDistribution(
+          loc=random_ops.random_uniform(
+              [in_size, out_size], seed=seed()),
+          scale=random_ops.random_uniform(
+              [in_size, out_size], seed=seed()),
+          result_log_prob=random_ops.random_uniform(
+              [in_size, out_size], seed=seed()),
+          result_sample=random_ops.random_uniform(
+              [in_size, out_size], seed=seed()))
+      bias_posterior = MockDistribution(
+          loc=random_ops.random_uniform(
+              [out_size], seed=seed()),
+          scale=random_ops.random_uniform(
+              [out_size], seed=seed()),
+          result_log_prob=random_ops.random_uniform(
+              [out_size], seed=seed()),
+          result_sample=random_ops.random_uniform(
+              [out_size], seed=seed()))
+      layer_one = prob_layers_lib.DenseFlipout(
+          units=out_size,
+          kernel_posterior_fn=lambda *args: kernel_posterior,
+          kernel_posterior_tensor_fn=lambda d: d.sample(seed=42),
+          bias_posterior_fn=lambda *args: bias_posterior,
+          bias_posterior_tensor_fn=lambda d: d.sample(seed=43),
+          seed=44)
+      layer_two = prob_layers_lib.DenseFlipout(
+          units=out_size,
+          kernel_posterior_fn=lambda *args: kernel_posterior,
+          kernel_posterior_tensor_fn=lambda d: d.sample(seed=42),
+          bias_posterior_fn=lambda *args: bias_posterior,
+          bias_posterior_tensor_fn=lambda d: d.sample(seed=43),
+          seed=45)
+
+      outputs_one = layer_one(inputs)
+      outputs_two = layer_two(inputs)
+
+      outputs_one_, outputs_two_ = sess.run([
+          outputs_one, outputs_two])
+
+      self.assertLess(np.sum(np.isclose(outputs_one_, outputs_two_)), out_size)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/mcmc_diagnostics_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/mcmc_diagnostics_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..52e36e135d95c1ec919c710f35d59073c2134d05
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/mcmc_diagnostics_test.py
@@ -0,0 +1,445 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for MCMC diagnostic utilities."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.bayesflow.python.ops import mcmc_diagnostics_impl as mcmc_diagnostics
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import spectral_ops_test_util
+from tensorflow.python.platform import test
+
+rng = np.random.RandomState(42)
+
+
+class _EffectiveSampleSizeTest(object):
+
+  @property
+  def use_static_shape(self):
+    raise NotImplementedError(
+        "Subclass failed to implement `use_static_shape`.")
+
+  def _check_versus_expected_effective_sample_size(self,
+                                                   x_,
+                                                   expected_ess,
+                                                   sess,
+                                                   atol=1e-2,
+                                                   rtol=1e-2,
+                                                   filter_threshold=None,
+                                                   filter_beyond_lag=None):
+    x = array_ops.placeholder_with_default(
+        input=x_, shape=x_.shape if self.use_static_shape else None)
+    ess = mcmc_diagnostics.effective_sample_size(
+        x,
+        filter_threshold=filter_threshold,
+        filter_beyond_lag=filter_beyond_lag)
+    if self.use_static_shape:
+      self.assertAllEqual(x.shape[1:], ess.shape)
+
+    ess_ = sess.run(ess)
+
+    self.assertAllClose(
+        np.ones_like(ess_) * expected_ess, ess_, atol=atol, rtol=rtol)
+
+  def testIidRank1NormalHasFullEssMaxLags10(self):
+    # With a length 5000 iid normal sequence, and filter_beyond_lag = 10, we
+    # should have a good estimate of ESS, and it should be close to the full
+    # sequence length of 5000.
+    # The choice of filter_beyond_lag = 10 is a short cutoff, reasonable only
+    # since we know the correlation length should be zero right away.
+    with self.test_session() as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        self._check_versus_expected_effective_sample_size(
+            x_=rng.randn(5000).astype(np.float32),
+            expected_ess=5000,
+            sess=sess,
+            filter_beyond_lag=10,
+            filter_threshold=None,
+            rtol=0.3)
+
+  def testIidRank2NormalHasFullEssMaxLags10(self):
+    # See similar test for Rank1Normal for reasoning.
+    with self.test_session() as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        self._check_versus_expected_effective_sample_size(
+            x_=rng.randn(5000, 2).astype(np.float32),
+            expected_ess=5000,
+            sess=sess,
+            filter_beyond_lag=10,
+            filter_threshold=None,
+            rtol=0.3)
+
+  def testIidRank1NormalHasFullEssMaxLagThresholdZero(self):
+    # With a length 5000 iid normal sequence, and filter_threshold = 0,
+    # we should have a super-duper estimate of ESS, and it should be very close
+    # to the full sequence length of 5000.
+    # The choice of filter_beyond_lag = 0 means we cutoff as soon as the
+    # auto-corris below zero.  This should happen very quickly, due to the fact
+    # that the theoretical auto-corr is [1, 0, 0,...]
+    with self.test_session() as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        self._check_versus_expected_effective_sample_size(
+            x_=rng.randn(5000).astype(np.float32),
+            expected_ess=5000,
+            sess=sess,
+            filter_beyond_lag=None,
+            filter_threshold=0.,
+            rtol=0.1)
+
+  def testIidRank2NormalHasFullEssMaxLagThresholdZero(self):
+    # See similar test for Rank1Normal for reasoning.
+    with self.test_session() as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        self._check_versus_expected_effective_sample_size(
+            x_=rng.randn(5000, 2).astype(np.float32),
+            expected_ess=5000,
+            sess=sess,
+            filter_beyond_lag=None,
+            filter_threshold=0.,
+            rtol=0.1)
+
+  def testLength10CorrelationHasEssOneTenthTotalLengthUsingMaxLags50(self):
+    # Create x_, such that
+    #   x_[i] = iid_x_[0], i = 0,...,9
+    #   x_[i] = iid_x_[1], i = 10,..., 19,
+    #   and so on.
+    iid_x_ = rng.randn(5000, 1).astype(np.float32)
+    x_ = (iid_x_ * np.ones((5000, 10)).astype(np.float32)).reshape((50000,))
+    with self.test_session() as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        self._check_versus_expected_effective_sample_size(
+            x_=x_,
+            expected_ess=50000 // 10,
+            sess=sess,
+            filter_beyond_lag=50,
+            filter_threshold=None,
+            rtol=0.2)
+
+  def testLength10CorrelationHasEssOneTenthTotalLengthUsingMaxLagsThresholdZero(
+      self):
+    # Create x_, such that
+    #   x_[i] = iid_x_[0], i = 0,...,9
+    #   x_[i] = iid_x_[1], i = 10,..., 19,
+    #   and so on.
+    iid_x_ = rng.randn(5000, 1).astype(np.float32)
+    x_ = (iid_x_ * np.ones((5000, 10)).astype(np.float32)).reshape((50000,))
+    with self.test_session() as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        self._check_versus_expected_effective_sample_size(
+            x_=x_,
+            expected_ess=50000 // 10,
+            sess=sess,
+            filter_beyond_lag=None,
+            filter_threshold=0.,
+            rtol=0.1)
+
+  def testListArgs(self):
+    # x_ has correlation length 10 ==> ESS = N / 10
+    # y_ has correlation length 1  ==> ESS = N
+    iid_x_ = rng.randn(5000, 1).astype(np.float32)
+    x_ = (iid_x_ * np.ones((5000, 10)).astype(np.float32)).reshape((50000,))
+    y_ = rng.randn(50000).astype(np.float32)
+    states = [x_, x_, y_, y_]
+    filter_threshold = [0., None, 0., None]
+    filter_beyond_lag = [None, 5, None, 5]
+
+    # See other tests for reasoning on tolerance.
+    with self.test_session() as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        ess = mcmc_diagnostics.effective_sample_size(
+            states,
+            filter_threshold=filter_threshold,
+            filter_beyond_lag=filter_beyond_lag)
+        ess_ = sess.run(ess)
+    self.assertAllEqual(4, len(ess_))
+
+    self.assertAllClose(50000 // 10, ess_[0], rtol=0.3)
+    self.assertAllClose(50000 // 10, ess_[1], rtol=0.3)
+    self.assertAllClose(50000, ess_[2], rtol=0.1)
+    self.assertAllClose(50000, ess_[3], rtol=0.1)
+
+  def testMaxLagsThresholdLessThanNeg1SameAsNone(self):
+    # Setting both means we filter out items R_k from the auto-correlation
+    # sequence if k > filter_beyond_lag OR k >= j where R_j < filter_threshold.
+
+    # x_ has correlation length 10.
+    iid_x_ = rng.randn(500, 1).astype(np.float32)
+    x_ = (iid_x_ * np.ones((500, 10)).astype(np.float32)).reshape((5000,))
+    with self.test_session() as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        x = array_ops.placeholder_with_default(
+            input=x_, shape=x_.shape if self.use_static_shape else None)
+
+        ess_none_none = mcmc_diagnostics.effective_sample_size(
+            x, filter_threshold=None, filter_beyond_lag=None)
+        ess_none_200 = mcmc_diagnostics.effective_sample_size(
+            x, filter_threshold=None, filter_beyond_lag=200)
+        ess_neg2_200 = mcmc_diagnostics.effective_sample_size(
+            x, filter_threshold=-2., filter_beyond_lag=200)
+        ess_neg2_none = mcmc_diagnostics.effective_sample_size(
+            x, filter_threshold=-2., filter_beyond_lag=None)
+        ess_none_none_, ess_none_200_, ess_neg2_200_, ess_neg2_none_ = sess.run(
+            [ess_none_none, ess_none_200, ess_neg2_200, ess_neg2_none])
+
+        # filter_threshold=-2 <==> filter_threshold=None.
+        self.assertAllClose(ess_none_none_, ess_neg2_none_)
+        self.assertAllClose(ess_none_200_, ess_neg2_200_)
+
+  def testMaxLagsArgsAddInAnOrManner(self):
+    # Setting both means we filter out items R_k from the auto-correlation
+    # sequence if k > filter_beyond_lag OR k >= j where R_j < filter_threshold.
+
+    # x_ has correlation length 10.
+    iid_x_ = rng.randn(500, 1).astype(np.float32)
+    x_ = (iid_x_ * np.ones((500, 10)).astype(np.float32)).reshape((5000,))
+    with self.test_session() as sess:
+      with spectral_ops_test_util.fft_kernel_label_map():
+        x = array_ops.placeholder_with_default(
+            input=x_, shape=x_.shape if self.use_static_shape else None)
+
+        ess_1_9 = mcmc_diagnostics.effective_sample_size(
+            x, filter_threshold=1., filter_beyond_lag=9)
+        ess_1_none = mcmc_diagnostics.effective_sample_size(
+            x, filter_threshold=1., filter_beyond_lag=None)
+        ess_none_9 = mcmc_diagnostics.effective_sample_size(
+            x, filter_threshold=1., filter_beyond_lag=9)
+        ess_1_9_, ess_1_none_, ess_none_9_ = sess.run(
+            [ess_1_9, ess_1_none, ess_none_9])
+
+        # Since R_k = 1 for k < 10, and R_k < 1 for k >= 10,
+        # filter_threshold = 1 <==> filter_beyond_lag = 9.
+        self.assertAllClose(ess_1_9_, ess_1_none_)
+        self.assertAllClose(ess_1_9_, ess_none_9_)
+
+
+class EffectiveSampleSizeStaticTest(test.TestCase, _EffectiveSampleSizeTest):
+
+  @property
+  def use_static_shape(self):
+    return True
+
+
+class EffectiveSampleSizeDynamicTest(test.TestCase, _EffectiveSampleSizeTest):
+
+  @property
+  def use_static_shape(self):
+    return False
+
+
+class _PotentialScaleReductionTest(object):
+
+  @property
+  def use_static_shape(self):
+    raise NotImplementedError(
+        "Subclass failed to impliment `use_static_shape`.")
+
+  def testListOfStatesWhereFirstPassesSecondFails(self):
+    """Simple test showing API with two states.  Read first!."""
+    n_samples = 1000
+
+    # state_0 is two scalar chains taken from iid Normal(0, 1).  Will pass.
+    state_0 = rng.randn(n_samples, 2)
+
+    # state_1 is three 4-variate chains taken from Normal(0, 1) that have been
+    # shifted.  Since every chain is shifted, they are not the same, and the
+    # test should fail.
+    offset = np.array([1., -1., 2.]).reshape(3, 1)
+    state_1 = rng.randn(n_samples, 3, 4) + offset
+
+    rhat = mcmc_diagnostics.potential_scale_reduction(
+        chains_states=[state_0, state_1], independent_chain_ndims=1)
+
+    self.assertIsInstance(rhat, list)
+    with self.test_session() as sess:
+      rhat_0_, rhat_1_ = sess.run(rhat)
+
+    # r_hat_0 should be close to 1, meaning test is passed.
+    self.assertAllEqual((), rhat_0_.shape)
+    self.assertAllClose(1., rhat_0_, rtol=0.02)
+
+    # r_hat_1 should be greater than 1.2, meaning test has failed.
+    self.assertAllEqual((4,), rhat_1_.shape)
+    self.assertAllEqual(np.ones_like(rhat_1_).astype(bool), rhat_1_ > 1.2)
+
+  def check_results(self, state_, independent_chain_shape, should_pass):
+    sample_ndims = 1
+    independent_chain_ndims = len(independent_chain_shape)
+    with self.test_session():
+      state = array_ops.placeholder_with_default(
+          input=state_, shape=state_.shape if self.use_static_shape else None)
+
+      rhat = mcmc_diagnostics.potential_scale_reduction(
+          state, independent_chain_ndims=independent_chain_ndims)
+
+      if self.use_static_shape:
+        self.assertAllEqual(
+            state_.shape[sample_ndims + independent_chain_ndims:], rhat.shape)
+
+      rhat_ = rhat.eval()
+      if should_pass:
+        self.assertAllClose(np.ones_like(rhat_), rhat_, atol=0, rtol=0.02)
+      else:
+        self.assertAllEqual(np.ones_like(rhat_).astype(bool), rhat_ > 1.2)
+
+  def iid_normal_chains_should_pass_wrapper(self,
+                                            sample_shape,
+                                            independent_chain_shape,
+                                            other_shape,
+                                            dtype=np.float32):
+    """Check results with iid normal chains."""
+
+    state_shape = sample_shape + independent_chain_shape + other_shape
+    state_ = rng.randn(*state_shape).astype(dtype)
+
+    # The "other" dimensions do not have to be identical, just independent, so
+    # force them to not be identical.
+    if other_shape:
+      state_ *= rng.rand(*other_shape).astype(dtype)
+
+    self.check_results(state_, independent_chain_shape, should_pass=True)
+
+  def testPassingIIDNdimsAreIndependentOneOtherZero(self):
+    self.iid_normal_chains_should_pass_wrapper(
+        sample_shape=[10000], independent_chain_shape=[4], other_shape=[])
+
+  def testPassingIIDNdimsAreIndependentOneOtherOne(self):
+    self.iid_normal_chains_should_pass_wrapper(
+        sample_shape=[10000], independent_chain_shape=[3], other_shape=[7])
+
+  def testPassingIIDNdimsAreIndependentOneOtherTwo(self):
+    self.iid_normal_chains_should_pass_wrapper(
+        sample_shape=[10000], independent_chain_shape=[2], other_shape=[5, 7])
+
+  def testPassingIIDNdimsAreIndependentTwoOtherTwo64Bit(self):
+    self.iid_normal_chains_should_pass_wrapper(
+        sample_shape=[10000],
+        independent_chain_shape=[2, 3],
+        other_shape=[5, 7],
+        dtype=np.float64)
+
+  def offset_normal_chains_should_fail_wrapper(
+      self, sample_shape, independent_chain_shape, other_shape):
+    """Check results with normal chains that are offset from each other."""
+
+    state_shape = sample_shape + independent_chain_shape + other_shape
+    state_ = rng.randn(*state_shape)
+
+    # Add a significant offset to the different (formerly iid) chains.
+    offset = np.linspace(
+        0, 2, num=np.prod(independent_chain_shape)).reshape([1] * len(
+            sample_shape) + independent_chain_shape + [1] * len(other_shape))
+    state_ += offset
+
+    self.check_results(state_, independent_chain_shape, should_pass=False)
+
+  def testFailingOffsetNdimsAreSampleOneIndependentOneOtherOne(self):
+    self.offset_normal_chains_should_fail_wrapper(
+        sample_shape=[10000], independent_chain_shape=[2], other_shape=[5])
+
+
+class PotentialScaleReductionStaticTest(test.TestCase,
+                                        _PotentialScaleReductionTest):
+
+  @property
+  def use_static_shape(self):
+    return True
+
+  def testIndependentNdimsLessThanOneRaises(self):
+    with self.assertRaisesRegexp(ValueError, "independent_chain_ndims"):
+      mcmc_diagnostics.potential_scale_reduction(
+          rng.rand(2, 3, 4), independent_chain_ndims=0)
+
+
+class PotentialScaleReductionDynamicTest(test.TestCase,
+                                         _PotentialScaleReductionTest):
+
+  @property
+  def use_static_shape(self):
+    return False
+
+
+class _ReduceVarianceTest(object):
+
+  @property
+  def use_static_shape(self):
+    raise NotImplementedError(
+        "Subclass failed to impliment `use_static_shape`.")
+
+  def check_versus_numpy(self, x_, axis, biased, keepdims):
+    with self.test_session():
+      x_ = np.asarray(x_)
+      x = array_ops.placeholder_with_default(
+          input=x_, shape=x_.shape if self.use_static_shape else None)
+      var = mcmc_diagnostics._reduce_variance(
+          x, axis=axis, biased=biased, keepdims=keepdims)
+      np_var = np.var(x_, axis=axis, ddof=0 if biased else 1, keepdims=keepdims)
+
+      if self.use_static_shape:
+        self.assertAllEqual(np_var.shape, var.shape)
+
+      var_ = var.eval()
+      # We will mask below, which changes shape, so check shape explicitly here.
+      self.assertAllEqual(np_var.shape, var_.shape)
+
+      # We get NaN when we divide by zero due to the size being the same as ddof
+      nan_mask = np.isnan(np_var)
+      if nan_mask.any():
+        self.assertTrue(np.isnan(var_[nan_mask]).all())
+      self.assertAllClose(np_var[~nan_mask], var_[~nan_mask], atol=0, rtol=0.02)
+
+  def testScalarBiasedTrue(self):
+    self.check_versus_numpy(x_=-1.234, axis=None, biased=True, keepdims=False)
+
+  def testScalarBiasedFalse(self):
+    # This should result in NaN.
+    self.check_versus_numpy(x_=-1.234, axis=None, biased=False, keepdims=False)
+
+  def testShape2x3x4AxisNoneBiasedFalseKeepdimsFalse(self):
+    self.check_versus_numpy(
+        x_=rng.randn(2, 3, 4), axis=None, biased=True, keepdims=False)
+
+  def testShape2x3x4Axis1BiasedFalseKeepdimsTrue(self):
+    self.check_versus_numpy(
+        x_=rng.randn(2, 3, 4), axis=1, biased=True, keepdims=True)
+
+  def testShape2x3x4x5Axis13BiasedFalseKeepdimsTrue(self):
+    self.check_versus_numpy(
+        x_=rng.randn(2, 3, 4, 5), axis=1, biased=True, keepdims=True)
+
+  def testShape2x3x4x5Axis13BiasedFalseKeepdimsFalse(self):
+    self.check_versus_numpy(
+        x_=rng.randn(2, 3, 4, 5), axis=1, biased=False, keepdims=False)
+
+
+class ReduceVarianceTestStaticShape(test.TestCase, _ReduceVarianceTest):
+
+  @property
+  def use_static_shape(self):
+    return True
+
+
+class ReduceVarianceTestDynamicShape(test.TestCase, _ReduceVarianceTest):
+
+  @property
+  def use_static_shape(self):
+    return False
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/sgld_optimizer_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/sgld_optimizer_test.py
index 66793383fdd5c71f136900197a91be6966e2f8c7..756c25683bd4b0c8c77e9e28485ca2a85582999c 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/sgld_optimizer_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/sgld_optimizer_test.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -36,9 +36,9 @@ class SGLDOptimizerTest(test.TestCase):
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
         grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
         decay_rate = 0.53
-        sgd_op = SGLDOptimizer(
-            3.0, preconditioner_decay_rate=decay_rate).apply_gradients(
-                zip([grads0, grads1], [var0, var1]))
+        sgd_optimizer = SGLDOptimizer(3.0, preconditioner_decay_rate=decay_rate)
+        sgd_op = sgd_optimizer.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
         self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
@@ -54,6 +54,7 @@ class SGLDOptimizerTest(test.TestCase):
             decay_rate + (1 - decay_rate) * 0.01**2 + 1e-8))
         self.assertAllCloseAccordingToType(
             [3.0 - 3.0 * grads_scaled, 4.0 - 3.0 * grads_scaled], var1.eval())
+        self.assertAllCloseAccordingToType(1, sgd_optimizer._counter.eval())
 
   def testBasicMultiInstance(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -102,6 +103,8 @@ class SGLDOptimizerTest(test.TestCase):
                             sgd_optimizer2.variable_scope)
         self.assertNotEqual(sgd_optimizer.variable_scope.name,
                             sgd_optimizer2.variable_scope.name)
+        self.assertAllCloseAccordingToType(1, sgd_optimizer._counter.eval())
+        self.assertAllCloseAccordingToType(1, sgd_optimizer2._counter.eval())
 
   def testTensorLearningRate(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/variable_utils_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/variable_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f978cf86417dc5ff5412a3eee584330a266e0964
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/variable_utils_test.py
@@ -0,0 +1,135 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for utility functions related to managing `tf.Variable`s."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import warnings
+
+import numpy as np
+
+from tensorflow.contrib.bayesflow.python.ops import variable_utils
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variable_scope as varscope_ops
+from tensorflow.python.ops import variables as variables_ops
+from tensorflow.python.platform import test
+
+
+def test_fn(x):
+  x = ops.convert_to_tensor(x, name="x")
+  dtype = x.dtype.as_numpy_dtype
+  s = x.shape.as_list()
+  z = varscope_ops.get_variable(
+      name="z",
+      dtype=dtype,
+      initializer=np.arange(np.prod(s)).reshape(s).astype(dtype))
+  y = varscope_ops.get_variable(
+      name="y",
+      dtype=dtype,
+      initializer=np.arange(np.prod(s)).reshape(s).astype(dtype)**2)
+  return x + y + z
+
+
+class _WrapCallableTest(object):
+
+  def testDefaultArgsWorkCorrectly(self):
+    with self.test_session():
+      x = constant_op.constant(self.dtype([0.1, 0.2]))
+      wrapped_fn, vars_args = variable_utils.externalize_variables_as_args(
+          test_fn, [x])
+
+      varscope_ops.get_variable_scope().reuse_variables()
+
+      result = wrapped_fn(self.dtype(2), [3, 4, 5], 0.5)
+
+      y_actual = varscope_ops.get_variable("y", dtype=self.dtype)
+      z_actual = varscope_ops.get_variable("z", dtype=self.dtype)
+
+      variables_ops.global_variables_initializer().run()
+      result_ = result.eval()
+
+      self.assertEqual(self.dtype, result_.dtype)
+      self.assertAllEqual([5.5, 6.5, 7.5], result_)
+      self.assertAllEqual([y_actual, z_actual], vars_args)
+
+  def testNonDefaultArgsWorkCorrectly(self):
+    with self.test_session():
+      x = constant_op.constant(self.dtype([0.1, 0.2]))
+
+      _ = test_fn(self.dtype([0., 0.]))   # Needed to create vars.
+      varscope_ops.get_variable_scope().reuse_variables()
+
+      y_actual = varscope_ops.get_variable("y", dtype=self.dtype)
+
+      wrapped_fn, vars_args = variable_utils.externalize_variables_as_args(
+          test_fn, [x], possible_ancestor_vars=[y_actual])
+
+      result = wrapped_fn(self.dtype([2, 3]), 0.5)  # x, y
+
+      variables_ops.global_variables_initializer().run()
+      result_ = result.eval()
+
+      self.assertEqual(self.dtype, result_.dtype)
+      self.assertAllEqual([2.5, 4.5], result_)
+      self.assertAllEqual([y_actual], vars_args)
+
+  def testWarnings(self):
+    with self.test_session():
+      x = constant_op.constant(self.dtype([0.1, 0.2]))
+      wrapped_fn, _ = variable_utils.externalize_variables_as_args(
+          test_fn, [x], possible_ancestor_vars=[])
+      varscope_ops.get_variable_scope().reuse_variables()
+      with warnings.catch_warnings(record=True) as w:
+        wrapped_fn(self.dtype(2))
+      w = sorted(w, key=lambda w: str(w.message))
+      self.assertEqual(2, len(w))
+      self.assertRegexpMatches(
+          str(w[0].message),
+          r"Variable .* 'y:0' .* not found in bypass dict.")
+      self.assertRegexpMatches(
+          str(w[1].message),
+          r"Variable .* 'z:0' .* not found in bypass dict.")
+
+  def testExceptions(self):
+    with self.test_session():
+      x = constant_op.constant(self.dtype([0.1, 0.2]))
+      wrapped_fn, _ = variable_utils.externalize_variables_as_args(
+          test_fn,
+          [x],
+          possible_ancestor_vars=[],
+          assert_variable_override=True)
+      varscope_ops.get_variable_scope().reuse_variables()
+      with self.assertRaisesRegexp(ValueError, r"not found"):
+        wrapped_fn(self.dtype(2))
+
+
+class WrapCallableTest16(test.TestCase, _WrapCallableTest):
+  dtype = np.float16
+
+
+class WrapCallableTest32(test.TestCase, _WrapCallableTest):
+  dtype = np.float32
+
+
+class WrapCallableTest64(test.TestCase, _WrapCallableTest):
+  dtype = np.float64
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/variational_sgd_optimizer_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/variational_sgd_optimizer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..83c64dbe0fd586edcb784a5c09a4c133aaa99cff
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/variational_sgd_optimizer_test.py
@@ -0,0 +1,268 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional test for GradientDescent."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from tensorflow.contrib.bayesflow.python.ops.optimizers import VariationalSGDOptimizer
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class VariationalSGDOptimizerTest(test.TestCase):
+
+  def testBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.1, 2.1], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        decay_rate = 0.53
+        sgd_op = VariationalSGDOptimizer(
+            1,
+            1,
+            preconditioner_decay_rate=decay_rate,
+            max_learning_rate=3.0,
+            burnin_max_learning_rate=3.0,
+            use_single_learning_rate=True).apply_gradients(
+                zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        self.assertAllCloseAccordingToType([1.1 - 3.0 * 0.1, 2.1 - 3.0 * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           var1.eval())
+
+  def testBasicMultiInstance(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.1, 2.1], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        vara = variables.Variable([1.1, 2.1], dtype=dtype)
+        varb = variables.Variable([3.0, 4.0], dtype=dtype)
+        gradsa = constant_op.constant([0.1, 0.1], dtype=dtype)
+        gradsb = constant_op.constant([0.01, 0.01], dtype=dtype)
+        decay_rate = 0.5
+        batch_size = 2
+        total_num_examples = 10
+        optimizer = VariationalSGDOptimizer(
+            batch_size,
+            total_num_examples,
+            max_learning_rate=1.0,
+            burnin_max_learning_rate=3.0,
+            preconditioner_decay_rate=decay_rate)
+        sgd_op = optimizer.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        optimizer2 = VariationalSGDOptimizer(
+            batch_size,
+            total_num_examples,
+            max_learning_rate=1.0,
+            burnin_max_learning_rate=10.0,
+            burnin=0,
+            preconditioner_decay_rate=decay_rate)
+        sgd_op2 = optimizer2.apply_gradients(
+            zip([gradsa, gradsb], [vara, varb]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.1, 2.1], vara.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], varb.eval())
+
+        # Run 1 step of sgd
+        sgd_op.run()
+        sgd_op2.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([1.1 - 3. * 0.1, 2.1 - 3. * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([1.1 - 0.1, 2.1 - 0.1], vara.eval())
+
+        self.assertAllCloseAccordingToType([3.0 - 3. * 0.01, 4.0 - 3. * 0.01],
+                                           var1.eval())
+        self.assertAllCloseAccordingToType([3.0 - 0.01, 4.0 - 0.01],
+                                           varb.eval())
+        self.assertNotEqual(optimizer.variable_scope,
+                            optimizer2.variable_scope)
+        self.assertNotEqual(optimizer.variable_scope.name,
+                            optimizer2.variable_scope.name)
+        self.assertAllCloseAccordingToType(1, optimizer._counter.eval())
+        self.assertAllCloseAccordingToType(1, optimizer2._counter.eval())
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.1, 2.1], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        lrate = constant_op.constant(3.0)
+        decay_rate = 0.5
+        batch_size = 2
+        total_num_examples = 10
+        sgd_op = VariationalSGDOptimizer(
+            batch_size,
+            total_num_examples,
+            max_learning_rate=lrate,
+            burnin=0,
+            preconditioner_decay_rate=decay_rate).apply_gradients(
+                zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([1.1 - 3.0 * 0.1, 2.1 - 3.0 * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           var1.eval())
+
+  def testTensorDecayLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.1, 2.1], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        lrate = variables.Variable(3.0)
+        lrate_decay_op = lrate.assign_add(-3.)
+        decay_rate = 0.5
+        batch_size = 2
+        total_num_examples = 10
+        optimizer = VariationalSGDOptimizer(
+            batch_size,
+            total_num_examples,
+            max_learning_rate=lrate,
+            burnin=0,
+            preconditioner_decay_rate=decay_rate)
+        sgd_op = optimizer.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([1.1 - 3.0 * 0.1, 2.1 - 3.0 * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           var1.eval())
+        # Update learning rate to 0
+        lrate_decay_op.eval()
+        sgd_op.run()
+        # Validate params haven't changed
+        self.assertAllCloseAccordingToType([1.1 - 3.0 * 0.1, 2.1 - 3.0 * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           var1.eval())
+        lrate_decay_op.eval()
+
+        with self.assertRaises(errors.InvalidArgumentError):
+          sgd_op.run()
+
+  def testGradWrtRef(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        opt = VariationalSGDOptimizer(1, 1, max_learning_rate=1.0)
+        values = [1.0, 3.0]
+        vars_ = [variables.Variable([v], dtype=dtype) for v in values]
+        grads_and_vars = opt.compute_gradients(vars_[0] + vars_[1], vars_)
+        variables.global_variables_initializer().run()
+        for grad, _ in grads_and_vars:
+          self.assertAllCloseAccordingToType([1.0], grad.eval())
+
+  def testWithGlobalStep(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        global_step = variables.Variable(0, trainable=False)
+        var0 = variables.Variable([1.1, 2.1], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        decay_rate = 0.1
+        batch_size = 2
+        total_num_examples = 10
+        sgd_optimizer = VariationalSGDOptimizer(
+            batch_size,
+            total_num_examples,
+            max_learning_rate=3.0,
+            burnin=0,
+            preconditioner_decay_rate=decay_rate)
+        sgd_op = sgd_optimizer.apply_gradients(
+            zip([grads0, grads1], [var0, var1]), global_step=global_step)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+
+        # Validate updated params and global_step
+        self.assertAllCloseAccordingToType([1.1 - 3.0 * 0.1, 2.1 - 3.0 * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           var1.eval())
+        self.assertAllCloseAccordingToType(1, global_step.eval())
+        self.assertAllCloseAccordingToType(1, sgd_optimizer._counter.eval())
+
+  def testSparseBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([[1.1], [2.1]], dtype=dtype)
+        var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
+            constant_op.constant([0]), constant_op.constant([2, 1]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant([0.01], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]), constant_op.constant([2, 1]))
+        decay_rate = 0.1
+        batch_size = 2
+        total_num_examples = 10
+        sgd_op = VariationalSGDOptimizer(
+            batch_size,
+            total_num_examples,
+            max_learning_rate=3.0,
+            burnin=0,
+            preconditioner_decay_rate=decay_rate).apply_gradients(
+                zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.1], [2.1]], var0.eval())
+        self.assertAllCloseAccordingToType([[3.0], [4.0]], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([[1.1 - 3.0 * 0.1], [2.1]],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType(
+            [[3.0 - 3.0 * 0], [4.0 - 3.0 * 0.01]], var1.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py b/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py
index ee3719232d8796c338247320fd8ef832a41df12b..d44fe6529a7ff0da0c6747e193fdb98a272a8da3 100644
--- a/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/custom_grad_impl.py
@@ -31,8 +31,7 @@ __all__ = [
 ]
 
 
-def custom_gradient(fx, gx, x, axis=(),
-                    fx_gx_manually_stopped=False,
+def custom_gradient(fx, gx, x, axis=(), fx_gx_manually_stopped=False,
                     name=None):
   """Enables specifying a custom gradient.
 
@@ -43,7 +42,8 @@ def custom_gradient(fx, gx, x, axis=(),
   h(x) = x * stop_gradient(g(x)) + stop_gradient(f(x) - x * g(x))
   ```
 
-  is such that `h(x) = stop(f(x))` and `grad[h(x), x] = stop_gradient(g(x)).`
+  is such that `h(x) = stop_gradient(f(x))` and `grad[h(x), x] =
+  stop_gradient(g(x)).`
 
   In addition to scalar-domain/scalar-range functions, this function also
   supports tensor-domain/scalar-range functions. However, in the latter case it
diff --git a/tensorflow/contrib/bayesflow/python/ops/hmc.py b/tensorflow/contrib/bayesflow/python/ops/hmc.py
index 977d42fc16bb91777a76c45ac24f3c5dc587f5fe..7fd5652c5c3e085b23c05baef6e3a42b7a42e08f 100644
--- a/tensorflow/contrib/bayesflow/python/ops/hmc.py
+++ b/tensorflow/contrib/bayesflow/python/ops/hmc.py
@@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Hamiltonian Monte Carlo, a gradient-based MCMC algorithm.
-"""
+"""Hamiltonian Monte Carlo, a gradient-based MCMC algorithm."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -24,11 +23,9 @@ from tensorflow.contrib.bayesflow.python.ops.hmc_impl import *  # pylint: disabl
 from tensorflow.python.util import all_util
 
 _allowed_symbols = [
-    'chain',
-    'kernel',
-    'leapfrog_integrator',
-    'leapfrog_step',
-    'ais_chain'
+    "sample_chain",
+    "sample_annealed_importance_chain",
+    "kernel",
 ]
 
 all_util.remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py b/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py
index 333dce929530adceb30dcb63653a5bd009c059e0..f724910c59315867a42a56fab3deb36f5d3adb7a 100644
--- a/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py
@@ -14,183 +14,343 @@
 # ==============================================================================
 """Hamiltonian Monte Carlo, a gradient-based MCMC algorithm.
 
-@@chain
-@@update
-@@leapfrog_integrator
-@@leapfrog_step
-@@ais_chain
+@@sample_chain
+@@sample_annealed_importance_chain
+@@kernel
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import numpy as np
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import gradients_impl as gradients_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.ops.distributions import util as distributions_util
 
 __all__ = [
-    'chain',
-    'kernel',
-    'leapfrog_integrator',
-    'leapfrog_step',
-    'ais_chain'
+    "sample_chain",
+    "sample_annealed_importance_chain",
+    "kernel",
 ]
 
 
-def _make_potential_and_grad(target_log_prob_fn):
-  def potential_and_grad(x):
-    log_prob_result = -target_log_prob_fn(x)
-    grad_result = gradients_impl.gradients(math_ops.reduce_sum(log_prob_result),
-                                           x)[0]
-    return log_prob_result, grad_result
-  return potential_and_grad
-
-
-def chain(n_iterations, step_size, n_leapfrog_steps, initial_x,
-          target_log_prob_fn, event_dims=(), name=None):
+KernelResults = collections.namedtuple(
+    "KernelResults",
+    [
+        "acceptance_probs",
+        "current_grads_target_log_prob",  # "Current result" means "accepted".
+        "current_target_log_prob",  # "Current result" means "accepted".
+        "energy_change",
+        "is_accepted",
+        "proposed_grads_target_log_prob",
+        "proposed_state",
+        "proposed_target_log_prob",
+        "random_positive",
+    ])
+
+
+def _make_dummy_kernel_results(
+    dummy_state,
+    dummy_target_log_prob,
+    dummy_grads_target_log_prob):
+  return KernelResults(
+      acceptance_probs=dummy_target_log_prob,
+      current_grads_target_log_prob=dummy_grads_target_log_prob,
+      current_target_log_prob=dummy_target_log_prob,
+      energy_change=dummy_target_log_prob,
+      is_accepted=array_ops.ones_like(dummy_target_log_prob, dtypes.bool),
+      proposed_grads_target_log_prob=dummy_grads_target_log_prob,
+      proposed_state=dummy_state,
+      proposed_target_log_prob=dummy_target_log_prob,
+      random_positive=dummy_target_log_prob,
+  )
+
+
+def sample_chain(
+    num_results,
+    target_log_prob_fn,
+    current_state,
+    step_size,
+    num_leapfrog_steps,
+    num_burnin_steps=0,
+    num_steps_between_results=0,
+    seed=None,
+    current_target_log_prob=None,
+    current_grads_target_log_prob=None,
+    name=None):
   """Runs multiple iterations of one or more Hamiltonian Monte Carlo chains.
 
-  Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC)
-  algorithm that takes a series of gradient-informed steps to produce
-  a Metropolis proposal. This function samples from an HMC Markov
-  chain whose initial state is `initial_x` and whose stationary
-  distribution has log-density `target_log_prob_fn()`.
-
-  This function can update multiple chains in parallel. It assumes
-  that all dimensions of `initial_x` not specified in `event_dims` are
-  independent, and should therefore be updated independently. The
-  output of `target_log_prob_fn()` should sum log-probabilities across
-  all event dimensions. Slices along dimensions not in `event_dims`
-  may have different target distributions; this is up to
+  Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC) algorithm
+  that takes a series of gradient-informed steps to produce a Metropolis
+  proposal. This function samples from an HMC Markov chain at `current_state`
+  and whose stationary distribution has log-unnormalized-density
   `target_log_prob_fn()`.
 
-  This function basically just wraps `hmc.kernel()` in a tf.scan() loop.
+  This function samples from multiple chains in parallel. It assumes that the
+  the leftmost dimensions of (each) `current_state` (part) index an independent
+  chain.  The function `target_log_prob_fn()` sums log-probabilities across
+  event dimensions (i.e., current state (part) rightmost dimensions). Each
+  element of the output of `target_log_prob_fn()` represents the (possibly
+  unnormalized) log-probability of the joint distribution over (all) the current
+  state (parts).
 
-  Args:
-    n_iterations: Integer number of Markov chain updates to run.
-    step_size: Scalar step size or array of step sizes for the
-      leapfrog integrator. Broadcasts to the shape of
-      `initial_x`. Larger step sizes lead to faster progress, but
-      too-large step sizes make rejection exponentially more likely.
-      When possible, it's often helpful to match per-variable step
-      sizes to the standard deviations of the target distribution in
-      each variable.
-    n_leapfrog_steps: Integer number of steps to run the leapfrog
-      integrator for. Total progress per HMC step is roughly
-      proportional to step_size * n_leapfrog_steps.
-    initial_x: Tensor of initial state(s) of the Markov chain(s).
-    target_log_prob_fn: Python callable which takes an argument like `initial_x`
-      and returns its (possibly unnormalized) log-density under the target
-      distribution.
-    event_dims: List of dimensions that should not be treated as
-      independent. This allows for multiple chains to be run independently
-      in parallel. Default is (), i.e., all dimensions are independent.
-    name: Python `str` name prefixed to Ops created by this function.
+  The `current_state` can be represented as a single `Tensor` or a `list` of
+  `Tensors` which collectively represent the current state. When specifying a
+  `list`, one must also specify a list of `step_size`s.
 
-  Returns:
-    acceptance_probs: Tensor with the acceptance probabilities for each
-      iteration. Has shape matching `target_log_prob_fn(initial_x)`.
-    chain_states: Tensor with the state of the Markov chain at each iteration.
-      Has shape `[n_iterations, initial_x.shape[0],...,initial_x.shape[-1]`.
+  Note: `target_log_prob_fn` is called exactly twice.
+
+  Only one out of every `num_steps_between_samples + 1` steps is included in the
+  returned results. This "thinning" comes at a cost of reduced statistical
+  power, while reducing memory requirements and autocorrelation. For more
+  discussion see [1].
+
+  [1]: "Statistically efficient thinning of a Markov chain sampler."
+       Art B. Owen. April 2017.
+       http://statweb.stanford.edu/~owen/reports/bestthinning.pdf
 
   #### Examples:
 
-  ```python
-  # Sampling from a standard normal (note `log_joint()` is unnormalized):
-  def log_joint(x):
-    return tf.reduce_sum(-0.5 * tf.square(x))
-  chain, acceptance_probs = hmc.chain(1000, 0.5, 2, tf.zeros(10), log_joint,
-                                      event_dims=[0])
-  # Discard first half of chain as warmup/burn-in
-  warmed_up = chain[500:]
-  mean_est = tf.reduce_mean(warmed_up, 0)
-  var_est = tf.reduce_mean(tf.square(warmed_up), 0) - tf.square(mean_est)
-  ```
+  ##### Sample from a diagonal-variance Gaussian.
 
   ```python
-  # Sampling from a diagonal-variance Gaussian:
-  variances = tf.linspace(1., 3., 10)
-  def log_joint(x):
-    return tf.reduce_sum(-0.5 / variances * tf.square(x))
-  chain, acceptance_probs = hmc.chain(1000, 0.5, 2, tf.zeros(10), log_joint,
-                                      event_dims=[0])
-  # Discard first half of chain as warmup/burn-in
-  warmed_up = chain[500:]
-  mean_est = tf.reduce_mean(warmed_up, 0)
-  var_est = tf.reduce_mean(tf.square(warmed_up), 0) - tf.square(mean_est)
+  tfd = tf.contrib.distributions
+
+  def make_likelihood(true_variances):
+    return tfd.MultivariateNormalDiag(
+        scale_diag=tf.sqrt(true_variances))
+
+  dims = 10
+  dtype = np.float32
+  true_variances = tf.linspace(dtype(1), dtype(3), dims)
+  likelihood = make_likelihood(true_variances)
+
+  states, kernel_results = hmc.sample_chain(
+      num_results=1000,
+      target_log_prob_fn=likelihood.log_prob,
+      current_state=tf.zeros(dims),
+      step_size=0.5,
+      num_leapfrog_steps=2,
+      num_burnin_steps=500)
+
+  # Compute sample stats.
+  sample_mean = tf.reduce_mean(states, axis=0)
+  sample_var = tf.reduce_mean(
+      tf.squared_difference(states, sample_mean),
+      axis=0)
   ```
 
-  ```python
-  # Sampling from factor-analysis posteriors with known factors W:
-  # mu[i, j] ~ Normal(0, 1)
-  # x[i] ~ Normal(matmul(mu[i], W), I)
-  def log_joint(mu, x, W):
-    prior = -0.5 * tf.reduce_sum(tf.square(mu), 1)
-    x_mean = tf.matmul(mu, W)
-    likelihood = -0.5 * tf.reduce_sum(tf.square(x - x_mean), 1)
-    return prior + likelihood
-  chain, acceptance_probs = hmc.chain(1000, 0.1, 2,
-                                      tf.zeros([x.shape[0], W.shape[0]]),
-                                      lambda mu: log_joint(mu, x, W),
-                                      event_dims=[1])
-  # Discard first half of chain as warmup/burn-in
-  warmed_up = chain[500:]
-  mean_est = tf.reduce_mean(warmed_up, 0)
-  var_est = tf.reduce_mean(tf.square(warmed_up), 0) - tf.square(mean_est)
+  ##### Sampling from factor-analysis posteriors with known factors.
+
+  I.e.,
+
+  ```none
+  for i=1..n:
+    w[i] ~ Normal(0, eye(d))            # prior
+    x[i] ~ Normal(loc=matmul(w[i], F))  # likelihood
   ```
 
+  where `F` denotes factors.
+
   ```python
-  # Sampling from the posterior of a Bayesian regression model.:
-
-  # Run 100 chains in parallel, each with a different initialization.
-  initial_beta = tf.random_normal([100, x.shape[1]])
-  chain, acceptance_probs = hmc.chain(1000, 0.1, 10, initial_beta,
-                                      log_joint_partial, event_dims=[1])
-  # Discard first halves of chains as warmup/burn-in
-  warmed_up = chain[500:]
-  # Averaging across samples within a chain and across chains
-  mean_est = tf.reduce_mean(warmed_up, [0, 1])
-  var_est = tf.reduce_mean(tf.square(warmed_up), [0, 1]) - tf.square(mean_est)
+  tfd = tf.contrib.distributions
+
+  def make_prior(dims, dtype):
+    return tfd.MultivariateNormalDiag(
+        loc=tf.zeros(dims, dtype))
+
+  def make_likelihood(weights, factors):
+    return tfd.MultivariateNormalDiag(
+        loc=tf.tensordot(weights, factors, axes=[[0], [-1]]))
+
+  # Setup data.
+  num_weights = 10
+  num_factors = 4
+  num_chains = 100
+  dtype = np.float32
+
+  prior = make_prior(num_weights, dtype)
+  weights = prior.sample(num_chains)
+  factors = np.random.randn(num_factors, num_weights).astype(dtype)
+  x = make_likelihood(weights, factors).sample(num_chains)
+
+  def target_log_prob(w):
+    # Target joint is: `f(w) = p(w, x | factors)`.
+    return prior.log_prob(w) + make_likelihood(w, factors).log_prob(x)
+
+  # Get `num_results` samples from `num_chains` independent chains.
+  chains_states, kernels_results = hmc.sample_chain(
+      num_results=1000,
+      target_log_prob_fn=target_log_prob,
+      current_state=tf.zeros([num_chains, dims], dtype),
+      step_size=0.1,
+      num_leapfrog_steps=2,
+      num_burnin_steps=500)
+
+  # Compute sample stats.
+  sample_mean = tf.reduce_mean(chains_states, axis=[0, 1])
+  sample_var = tf.reduce_mean(
+      tf.squared_difference(chains_states, sample_mean),
+      axis=[0, 1])
   ```
+
+  Args:
+    num_results: Integer number of Markov chain draws.
+    target_log_prob_fn: Python callable which takes an argument like
+      `current_state` (or `*current_state` if it's a list) and returns its
+      (possibly unnormalized) log-density under the target distribution.
+    current_state: `Tensor` or Python `list` of `Tensor`s representing the
+      current state(s) of the Markov chain(s). The first `r` dimensions index
+      independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`.
+    step_size: `Tensor` or Python `list` of `Tensor`s representing the step size
+      for the leapfrog integrator. Must broadcast with the shape of
+      `current_state`. Larger step sizes lead to faster progress, but too-large
+      step sizes make rejection exponentially more likely. When possible, it's
+      often helpful to match per-variable step sizes to the standard deviations
+      of the target distribution in each variable.
+    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
+      for. Total progress per HMC step is roughly proportional to `step_size *
+      num_leapfrog_steps`.
+    num_burnin_steps: Integer number of chain steps to take before starting to
+      collect results.
+      Default value: 0 (i.e., no burn-in).
+    num_steps_between_results: Integer number of chain steps between collecting
+      a result. Only one out of every `num_steps_between_samples + 1` steps is
+      included in the returned results. This "thinning" comes at a cost of
+      reduced statistical power, while reducing memory requirements and
+      autocorrelation. For more discussion see [1].
+      Default value: 0 (i.e., no subsampling).
+    seed: Python integer to seed the random number generator.
+    current_target_log_prob: (Optional) `Tensor` representing the value of
+      `target_log_prob_fn` at the `current_state`. The only reason to specify
+      this argument is to reduce TF graph size.
+      Default value: `None` (i.e., compute as needed).
+    current_grads_target_log_prob: (Optional) Python list of `Tensor`s
+      representing gradient of `target_log_prob` at the `current_state` and wrt
+      the `current_state`. Must have same shape as `current_state`. The only
+      reason to specify this argument is to reduce TF graph size.
+      Default value: `None` (i.e., compute as needed).
+    name: Python `str` name prefixed to Ops created by this function.
+      Default value: `None` (i.e., "hmc_sample_chain").
+
+  Returns:
+    accepted_states: Tensor or Python list of `Tensor`s representing the
+      state(s) of the Markov chain(s) at each result step. Has same shape as
+      input `current_state` but with a prepended `num_results`-size dimension.
+    kernel_results: `collections.namedtuple` of internal calculations used to
+      advance the chain.
   """
-  with ops.name_scope(name, 'hmc_chain', [n_iterations, step_size,
-                                          n_leapfrog_steps, initial_x]):
-    initial_x = ops.convert_to_tensor(initial_x, name='initial_x')
-    non_event_shape = array_ops.shape(target_log_prob_fn(initial_x))
-
-    def body(a, _):
-      updated_x, acceptance_probs, log_prob, grad = kernel(
-          step_size, n_leapfrog_steps, a[0], target_log_prob_fn, event_dims,
-          a[2], a[3])
-      return updated_x, acceptance_probs, log_prob, grad
-
-    potential_and_grad = _make_potential_and_grad(target_log_prob_fn)
-    potential, grad = potential_and_grad(initial_x)
-    return functional_ops.scan(body, array_ops.zeros(n_iterations),
-                               (initial_x, array_ops.zeros(non_event_shape),
-                                -potential, -grad))[:2]
-
-
-def ais_chain(n_iterations, step_size, n_leapfrog_steps, initial_x,
-              target_log_prob_fn, proposal_log_prob_fn, event_dims=(),
-              name=None):
+  with ops.name_scope(
+      name, "hmc_sample_chain",
+      [num_results, current_state, step_size, num_leapfrog_steps,
+       num_burnin_steps, num_steps_between_results, seed,
+       current_target_log_prob, current_grads_target_log_prob]):
+    with ops.name_scope("initialize"):
+      [
+          current_state,
+          step_size,
+          current_target_log_prob,
+          current_grads_target_log_prob,
+      ] = _prepare_args(
+          target_log_prob_fn,
+          current_state,
+          step_size,
+          current_target_log_prob,
+          current_grads_target_log_prob)
+      num_results = ops.convert_to_tensor(
+          num_results,
+          dtype=dtypes.int32,
+          name="num_results")
+      num_leapfrog_steps = ops.convert_to_tensor(
+          num_leapfrog_steps,
+          dtype=dtypes.int32,
+          name="num_leapfrog_steps")
+      num_burnin_steps = ops.convert_to_tensor(
+          num_burnin_steps,
+          dtype=dtypes.int32,
+          name="num_burnin_steps")
+      num_steps_between_results = ops.convert_to_tensor(
+          num_steps_between_results,
+          dtype=dtypes.int32,
+          name="num_steps_between_results")
+
+    def _run_chain(num_steps, current_state, kernel_results):
+      """Runs the chain(s) for `num_steps`."""
+      def _loop_body(iter_, current_state, kernel_results):
+        return [iter_ + 1] + list(kernel(
+            target_log_prob_fn,
+            current_state,
+            step_size,
+            num_leapfrog_steps,
+            seed,
+            kernel_results.current_target_log_prob,
+            kernel_results.current_grads_target_log_prob))
+      while_loop_kwargs = dict(
+          cond=lambda iter_, *args: iter_ < num_steps,
+          body=_loop_body,
+          loop_vars=[
+              np.int32(0),
+              current_state,
+              kernel_results,
+          ],
+      )
+      if seed is not None:
+        while_loop_kwargs["parallel_iterations"] = 1
+      return control_flow_ops.while_loop(
+          **while_loop_kwargs)[1:]  # Lop-off "iter_".
+
+    def _scan_body(args_list, iter_):
+      """Closure which implements `tf.scan` body."""
+      current_state, kernel_results = args_list
+      return _run_chain(
+          1 + array_ops.where(math_ops.equal(iter_, 0),
+                              num_burnin_steps,
+                              num_steps_between_results),
+          current_state,
+          kernel_results)
+
+    scan_kwargs = dict(
+        fn=_scan_body,
+        elems=math_ops.range(num_results),  # iter_: used to choose burnin.
+        initializer=[
+            current_state,
+            _make_dummy_kernel_results(
+                current_state,
+                current_target_log_prob,
+                current_grads_target_log_prob),
+        ])
+    if seed is not None:
+      scan_kwargs["parallel_iterations"] = 1
+    return functional_ops.scan(**scan_kwargs)
+
+
+def sample_annealed_importance_chain(
+    proposal_log_prob_fn,
+    num_steps,
+    target_log_prob_fn,
+    current_state,
+    step_size,
+    num_leapfrog_steps,
+    seed=None,
+    name=None):
   """Runs annealed importance sampling (AIS) to estimate normalizing constants.
 
-  This routine uses Hamiltonian Monte Carlo to sample from a series of
+  This function uses Hamiltonian Monte Carlo to sample from a series of
   distributions that slowly interpolates between an initial "proposal"
-  distribution
+  distribution:
 
   `exp(proposal_log_prob_fn(x) - proposal_log_normalizer)`
 
-  and the target distribution
+  and the target distribution:
 
   `exp(target_log_prob_fn(x) - target_log_normalizer)`,
 
@@ -199,112 +359,203 @@ def ais_chain(n_iterations, step_size, n_leapfrog_steps, initial_x,
   normalizing constants of the initial distribution and the target
   distribution:
 
-  E[exp(w)] = exp(target_log_normalizer - proposal_log_normalizer).
-
-  Args:
-    n_iterations: Integer number of Markov chain updates to run. More
-      iterations means more expense, but smoother annealing between q
-      and p, which in turn means exponentially lower variance for the
-      normalizing constant estimator.
-    step_size: Scalar step size or array of step sizes for the
-      leapfrog integrator. Broadcasts to the shape of
-      `initial_x`. Larger step sizes lead to faster progress, but
-      too-large step sizes make rejection exponentially more likely.
-      When possible, it's often helpful to match per-variable step
-      sizes to the standard deviations of the target distribution in
-      each variable.
-    n_leapfrog_steps: Integer number of steps to run the leapfrog
-      integrator for. Total progress per HMC step is roughly
-      proportional to step_size * n_leapfrog_steps.
-    initial_x: Tensor of initial state(s) of the Markov chain(s). Must
-      be a sample from q, or results will be incorrect.
-    target_log_prob_fn: Python callable which takes an argument like `initial_x`
-      and returns its (possibly unnormalized) log-density under the target
-      distribution.
-    proposal_log_prob_fn: Python callable that returns the log density of the
-      initial distribution.
-    event_dims: List of dimensions that should not be treated as
-      independent. This allows for multiple chains to be run independently
-      in parallel. Default is (), i.e., all dimensions are independent.
-    name: Python `str` name prefixed to Ops created by this function.
+  `E[exp(ais_weights)] = exp(target_log_normalizer - proposal_log_normalizer)`.
 
-  Returns:
-    ais_weights: Tensor with the estimated weight(s). Has shape matching
-      `target_log_prob_fn(initial_x)`.
-    chain_states: Tensor with the state(s) of the Markov chain(s) the final
-      iteration. Has shape matching `initial_x`.
-    acceptance_probs: Tensor with the acceptance probabilities for the final
-      iteration. Has shape matching `target_log_prob_fn(initial_x)`.
+  Note: `proposal_log_prob_fn` and `target_log_prob_fn` are called exactly three
+  times (although this may be reduced to two times, in the future).
 
   #### Examples:
 
+  ##### Estimate the normalizing constant of a log-gamma distribution.
+
   ```python
-  # Estimating the normalizing constant of a log-gamma distribution:
-  def proposal_log_prob(x):
-    # Standard normal log-probability. This is properly normalized.
-    return tf.reduce_sum(-0.5 * tf.square(x) - 0.5 * np.log(2 * np.pi), 1)
-  def target_log_prob(x):
-    # Unnormalized log-gamma(2, 3) distribution.
-    # True normalizer is (lgamma(2) - 2 * log(3)) * x.shape[1]
-    return tf.reduce_sum(2. * x - 3. * tf.exp(x), 1)
+  tfd = tf.contrib.distributions
+
   # Run 100 AIS chains in parallel
-  initial_x = tf.random_normal([100, 20])
-  w, _, _ = hmc.ais_chain(1000, 0.2, 2, initial_x, target_log_prob,
-                          proposal_log_prob, event_dims=[1])
-  log_normalizer_estimate = tf.reduce_logsumexp(w) - np.log(100)
+  num_chains = 100
+  dims = 20
+  dtype = np.float32
+
+  proposal = tfd.MultivatiateNormalDiag(
+     loc=tf.zeros([dims], dtype=dtype))
+
+  target = tfd.TransformedDistribution(
+    distribution=tfd.Gamma(concentration=dtype(2),
+                           rate=dtype(3)),
+    bijector=tfd.bijectors.Invert(tfd.bijectors.Exp()),
+    event_shape=[dims])
+
+  chains_state, ais_weights, kernels_results = (
+      hmc.sample_annealed_importance_chain(
+          proposal_log_prob_fn=proposal.log_prob,
+          num_steps=1000,
+          target_log_prob_fn=target.log_prob,
+          step_size=0.2,
+          current_state=proposal.sample(num_chains),
+          num_leapfrog_steps=2))
+
+  log_estimated_normalizer = (tf.reduce_logsumexp(ais_weights)
+                              - np.log(num_chains))
+  log_true_normalizer = tf.lgamma(2.) - 2. * tf.log(3.)
   ```
 
+  ##### Estimate marginal likelihood of a Bayesian regression model.
+
   ```python
-  # Estimating the marginal likelihood of a Bayesian regression model:
-  base_measure = -0.5 * np.log(2 * np.pi)
-  def proposal_log_prob(x):
-    # Standard normal log-probability. This is properly normalized.
-    return tf.reduce_sum(-0.5 * tf.square(x) + base_measure, 1)
-  def regression_log_joint(beta, x, y):
-    # This function returns a vector whose ith element is log p(beta[i], y | x).
-    # Each row of beta corresponds to the state of an independent Markov chain.
-    log_prior = tf.reduce_sum(-0.5 * tf.square(beta) + base_measure, 1)
-    means = tf.matmul(beta, x, transpose_b=True)
-    log_likelihood = tf.reduce_sum(-0.5 * tf.square(y - means) +
-                                   base_measure, 1)
-    return log_prior + log_likelihood
-  def log_joint_partial(beta):
-    return regression_log_joint(beta, x, y)
+  tfd = tf.contrib.distributions
+
+  def make_prior(dims, dtype):
+    return tfd.MultivariateNormalDiag(
+        loc=tf.zeros(dims, dtype))
+
+  def make_likelihood(weights, x):
+    return tfd.MultivariateNormalDiag(
+        loc=tf.tensordot(weights, x, axes=[[0], [-1]]))
+
   # Run 100 AIS chains in parallel
-  initial_beta = tf.random_normal([100, x.shape[1]])
-  w, beta_samples, _ = hmc.ais_chain(1000, 0.1, 2, initial_beta,
-                                     log_joint_partial, proposal_log_prob,
-                                     event_dims=[1])
-  log_normalizer_estimate = tf.reduce_logsumexp(w) - np.log(100)
+  num_chains = 100
+  dims = 10
+  dtype = np.float32
+
+  # Make training data.
+  x = np.random.randn(num_chains, dims).astype(dtype)
+  true_weights = np.random.randn(dims).astype(dtype)
+  y = np.dot(x, true_weights) + np.random.randn(num_chains)
+
+  # Setup model.
+  prior = make_prior(dims, dtype)
+  def target_log_prob_fn(weights):
+    return prior.log_prob(weights) + make_likelihood(weights, x).log_prob(y)
+
+  proposal = tfd.MultivariateNormalDiag(
+      loc=tf.zeros(dims, dtype))
+
+  weight_samples, ais_weights, kernel_results = (
+      hmc.sample_annealed_importance_chain(
+        num_steps=1000,
+        proposal_log_prob_fn=proposal.log_prob,
+        target_log_prob_fn=target_log_prob_fn
+        current_state=tf.zeros([num_chains, dims], dtype),
+        step_size=0.1,
+        num_leapfrog_steps=2))
+  log_normalizer_estimate = (tf.reduce_logsumexp(ais_weights)
+                             - np.log(num_chains))
   ```
+
+  Args:
+    proposal_log_prob_fn: Python callable that returns the log density of the
+      initial distribution.
+    num_steps: Integer number of Markov chain updates to run. More
+      iterations means more expense, but smoother annealing between q
+      and p, which in turn means exponentially lower variance for the
+      normalizing constant estimator.
+    target_log_prob_fn: Python callable which takes an argument like
+      `current_state` (or `*current_state` if it's a list) and returns its
+      (possibly unnormalized) log-density under the target distribution.
+    current_state: `Tensor` or Python `list` of `Tensor`s representing the
+      current state(s) of the Markov chain(s). The first `r` dimensions index
+      independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`.
+    step_size: `Tensor` or Python `list` of `Tensor`s representing the step size
+      for the leapfrog integrator. Must broadcast with the shape of
+      `current_state`. Larger step sizes lead to faster progress, but too-large
+      step sizes make rejection exponentially more likely. When possible, it's
+      often helpful to match per-variable step sizes to the standard deviations
+      of the target distribution in each variable.
+    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
+      for. Total progress per HMC step is roughly proportional to `step_size *
+      num_leapfrog_steps`.
+    seed: Python integer to seed the random number generator.
+    name: Python `str` name prefixed to Ops created by this function.
+      Default value: `None` (i.e., "hmc_sample_annealed_importance_chain").
+
+  Returns:
+    accepted_state: `Tensor` or Python list of `Tensor`s representing the
+      state(s) of the Markov chain(s) at the final iteration. Has same shape as
+      input `current_state`.
+    ais_weights: Tensor with the estimated weight(s). Has shape matching
+      `target_log_prob_fn(current_state)`.
+    kernel_results: `collections.namedtuple` of internal calculations used to
+      advance the chain.
   """
-  with ops.name_scope(name, 'hmc_ais_chain',
-                      [n_iterations, step_size, n_leapfrog_steps, initial_x]):
-    non_event_shape = array_ops.shape(target_log_prob_fn(initial_x))
-
-    beta_series = math_ops.linspace(0., 1., n_iterations+1)[1:]
-    def _body(a, beta):  # pylint: disable=missing-docstring
-      def log_prob_beta(x):
-        return ((1 - beta) * proposal_log_prob_fn(x) +
-                beta * target_log_prob_fn(x))
-      last_x = a[0]
-      w = a[2]
-      w += (1. / n_iterations) * (target_log_prob_fn(last_x) -
-                                  proposal_log_prob_fn(last_x))
-      # TODO(b/66917083): There's an opportunity for gradient reuse here.
-      updated_x, acceptance_probs, _, _ = kernel(step_size, n_leapfrog_steps,
-                                                 last_x, log_prob_beta,
-                                                 event_dims)
-      return updated_x, acceptance_probs, w
-
-    x, acceptance_probs, w = functional_ops.scan(
-        _body, beta_series, (initial_x, array_ops.zeros(non_event_shape),
-                             array_ops.zeros(non_event_shape)))
-  return w[-1], x[-1], acceptance_probs[-1]
-
-
-def kernel(step_size, n_leapfrog_steps, x, target_log_prob_fn, event_dims=(),
-           x_log_prob=None, x_grad=None, name=None):
+  def make_convex_combined_log_prob_fn(iter_):
+    def _fn(*args):
+      p = proposal_log_prob_fn(*args)
+      t = target_log_prob_fn(*args)
+      dtype = p.dtype.base_dtype
+      beta = (math_ops.cast(iter_ + 1, dtype)
+              / math_ops.cast(num_steps, dtype))
+      return (1. - beta) * p + beta * t
+    return _fn
+
+  with ops.name_scope(
+      name, "hmc_sample_annealed_importance_chain",
+      [num_steps, current_state, step_size, num_leapfrog_steps, seed]):
+    with ops.name_scope("initialize"):
+      [
+          current_state,
+          step_size,
+          current_log_prob,
+          current_grads_log_prob,
+      ] = _prepare_args(
+          make_convex_combined_log_prob_fn(iter_=0),
+          current_state,
+          step_size,
+          description="convex_combined_log_prob")
+      num_steps = ops.convert_to_tensor(
+          num_steps,
+          dtype=dtypes.int32,
+          name="num_steps")
+      num_leapfrog_steps = ops.convert_to_tensor(
+          num_leapfrog_steps,
+          dtype=dtypes.int32,
+          name="num_leapfrog_steps")
+    def _loop_body(iter_, ais_weights, current_state, kernel_results):
+      """Closure which implements `tf.while_loop` body."""
+      current_state_parts = (list(current_state)
+                             if _is_list_like(current_state)
+                             else [current_state])
+      # TODO(b/72994218): Consider refactoring things to avoid this unecessary
+      # call.
+      ais_weights += ((target_log_prob_fn(*current_state_parts)
+                       - proposal_log_prob_fn(*current_state_parts))
+                      / math_ops.cast(num_steps, ais_weights.dtype))
+      return [iter_ + 1, ais_weights] + list(kernel(
+          make_convex_combined_log_prob_fn(iter_),
+          current_state,
+          step_size,
+          num_leapfrog_steps,
+          seed,
+          kernel_results.current_target_log_prob,
+          kernel_results.current_grads_target_log_prob))
+
+    while_loop_kwargs = dict(
+        cond=lambda iter_, *args: iter_ < num_steps,
+        body=_loop_body,
+        loop_vars=[
+            np.int32(0),  # iter_
+            array_ops.zeros_like(current_log_prob),  # ais_weights
+            current_state,
+            _make_dummy_kernel_results(current_state,
+                                       current_log_prob,
+                                       current_grads_log_prob),
+        ])
+    if seed is not None:
+      while_loop_kwargs["parallel_iterations"] = 1
+
+    [ais_weights, current_state, kernel_results] = control_flow_ops.while_loop(
+        **while_loop_kwargs)[1:]  # Lop-off "iter_".
+
+    return [current_state, ais_weights, kernel_results]
+
+
+def kernel(target_log_prob_fn,
+           current_state,
+           step_size,
+           num_leapfrog_steps,
+           seed=None,
+           current_target_log_prob=None,
+           current_grads_target_log_prob=None,
+           name=None):
   """Runs one iteration of Hamiltonian Monte Carlo.
 
   Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC)
@@ -312,324 +563,623 @@ def kernel(step_size, n_leapfrog_steps, x, target_log_prob_fn, event_dims=(),
   a Metropolis proposal. This function applies one step of HMC to
   randomly update the variable `x`.
 
-  This function can update multiple chains in parallel. It assumes
-  that all dimensions of `x` not specified in `event_dims` are
-  independent, and should therefore be updated independently. The
-  output of `target_log_prob_fn()` should sum log-probabilities across
-  all event dimensions. Slices along dimensions not in `event_dims`
-  may have different target distributions; for example, if
-  `event_dims == (1,)`, then `x[0, :]` could have a different target
-  distribution from x[1, :]. This is up to `target_log_prob_fn()`.
-
-  Args:
-    step_size: Scalar step size or array of step sizes for the
-      leapfrog integrator. Broadcasts to the shape of
-      `x`. Larger step sizes lead to faster progress, but
-      too-large step sizes make rejection exponentially more likely.
-      When possible, it's often helpful to match per-variable step
-      sizes to the standard deviations of the target distribution in
-      each variable.
-    n_leapfrog_steps: Integer number of steps to run the leapfrog
-      integrator for. Total progress per HMC step is roughly
-      proportional to step_size * n_leapfrog_steps.
-    x: Tensor containing the value(s) of the random variable(s) to update.
-    target_log_prob_fn: Python callable which takes an argument like `initial_x`
-      and returns its (possibly unnormalized) log-density under the target
-      distribution.
-    event_dims: List of dimensions that should not be treated as
-      independent. This allows for multiple chains to be run independently
-      in parallel. Default is (), i.e., all dimensions are independent.
-    x_log_prob (optional): Tensor containing the cached output of a previous
-      call to `target_log_prob_fn()` evaluated at `x` (such as that provided by
-      a previous call to `kernel()`). Providing `x_log_prob` and
-      `x_grad` saves one gradient computation per call to `kernel()`.
-    x_grad (optional): Tensor containing the cached gradient of
-      `target_log_prob_fn()` evaluated at `x` (such as that provided by
-      a previous call to `kernel()`). Providing `x_log_prob` and
-      `x_grad` saves one gradient computation per call to `kernel()`.
-    name: Python `str` name prefixed to Ops created by this function.
-
-  Returns:
-    updated_x: The updated variable(s) x. Has shape matching `initial_x`.
-    acceptance_probs: Tensor with the acceptance probabilities for the final
-      iteration. This is useful for diagnosing step size problems etc. Has
-      shape matching `target_log_prob_fn(initial_x)`.
-    new_log_prob: The value of `target_log_prob_fn()` evaluated at `updated_x`.
-    new_grad: The value of the gradient of `target_log_prob_fn()` evaluated at
-      `updated_x`.
+  This function can update multiple chains in parallel. It assumes that all
+  leftmost dimensions of `current_state` index independent chain states (and are
+  therefore updated independently). The output of `target_log_prob_fn()` should
+  sum log-probabilities across all event dimensions. Slices along the rightmost
+  dimensions may have different target distributions; for example,
+  `current_state[0, :]` could have a different target distribution from
+  `current_state[1, :]`. This is up to `target_log_prob_fn()`. (The number of
+  independent chains is `tf.size(target_log_prob_fn(*current_state))`.)
 
   #### Examples:
 
+  ##### Simple chain with warm-up.
+
   ```python
+  tfd = tf.contrib.distributions
+
   # Tuning acceptance rates:
+  dtype = np.float32
   target_accept_rate = 0.631
-  def target_log_prob(x):
-    # Standard normal
-    return tf.reduce_sum(-0.5 * tf.square(x))
-  initial_x = tf.zeros([10])
-  initial_log_prob = target_log_prob(initial_x)
-  initial_grad = tf.gradients(initial_log_prob, initial_x)[0]
-  # Algorithm state
-  x = tf.Variable(initial_x, name='x')
-  step_size = tf.Variable(1., name='step_size')
-  last_log_prob = tf.Variable(initial_log_prob, name='last_log_prob')
-  last_grad = tf.Variable(initial_grad, name='last_grad')
-  # Compute updates
-  new_x, acceptance_prob, log_prob, grad = hmc.kernel(step_size, 3, x,
-                                                      target_log_prob,
-                                                      event_dims=[0],
-                                                      x_log_prob=last_log_prob)
-  x_update = tf.assign(x, new_x)
-  log_prob_update = tf.assign(last_log_prob, log_prob)
-  grad_update = tf.assign(last_grad, grad)
-  step_size_update = tf.assign(step_size,
-                               tf.where(acceptance_prob > target_accept_rate,
-                                        step_size * 1.01, step_size / 1.01))
-  adaptive_updates = [x_update, log_prob_update, grad_update, step_size_update]
-  sampling_updates = [x_update, log_prob_update, grad_update]
-
-  sess = tf.Session()
-  sess.run(tf.global_variables_initializer())
+  num_warmup_iter = 500
+  num_chain_iter = 500
+
+  x = tf.get_variable(name="x", initializer=dtype(1))
+  step_size = tf.get_variable(name="step_size", initializer=dtype(1))
+
+  target = tfd.Normal(loc=dtype(0), scale=dtype(1))
+
+  new_x, other_results = hmc.kernel(
+      target_log_prob_fn=target.log_prob,
+      current_state=x,
+      step_size=step_size,
+      num_leapfrog_steps=3)[:4]
+
+  x_update = x.assign(new_x)
+
+  step_size_update = step_size.assign_add(
+      step_size * tf.where(
+        other_results.acceptance_probs > target_accept_rate,
+        0.01, -0.01))
+
+  warmup = tf.group([x_update, step_size_update])
+
+  tf.global_variables_initializer().run()
+
+  sess.graph.finalize()  # No more graph building.
+
   # Warm up the sampler and adapt the step size
-  for i in xrange(500):
-    sess.run(adaptive_updates)
+  for _ in xrange(num_warmup_iter):
+    sess.run(warmup)
+
   # Collect samples without adapting step size
-  samples = np.zeros([500, 10])
-  for i in xrange(500):
-    x_val, _ = sess.run([new_x, sampling_updates])
-    samples[i] = x_val
+  samples = np.zeros([num_chain_iter])
+  for i in xrange(num_chain_iter):
+    _, x_, target_log_prob_, grad_ = sess.run([
+        x_update,
+        x,
+        other_results.target_log_prob,
+        other_results.grads_target_log_prob])
+    samples[i] = x_
+
+  print(samples.mean(), samples.std())
+  ```
+
+  ##### Sample from more complicated posterior.
+
+  I.e.,
+
+  ```none
+    W ~ MVN(loc=0, scale=sigma * eye(dims))
+    for i=1...num_samples:
+        X[i] ~ MVN(loc=0, scale=eye(dims))
+      eps[i] ~ Normal(loc=0, scale=1)
+        Y[i] = X[i].T * W + eps[i]
   ```
 
   ```python
-  # Empirical-Bayes estimation of a hyperparameter by MCMC-EM:
-
-  # Problem setup
-  N = 150
-  D = 10
-  x = np.random.randn(N, D).astype(np.float32)
-  true_sigma = 0.5
-  true_beta = true_sigma * np.random.randn(D).astype(np.float32)
-  y = x.dot(true_beta) + np.random.randn(N).astype(np.float32)
-
-  def log_prior(beta, log_sigma):
-    return tf.reduce_sum(-0.5 / tf.exp(2 * log_sigma) * tf.square(beta) -
-                         log_sigma)
-  def regression_log_joint(beta, log_sigma, x, y):
-    # This function returns log p(beta | log_sigma) + log p(y | x, beta).
-    means = tf.matmul(tf.expand_dims(beta, 0), x, transpose_b=True)
-    means = tf.squeeze(means)
-    log_likelihood = tf.reduce_sum(-0.5 * tf.square(y - means))
-    return log_prior(beta, log_sigma) + log_likelihood
-  def log_joint_partial(beta):
-    return regression_log_joint(beta, log_sigma, x, y)
-  # Our estimate of log(sigma)
-  log_sigma = tf.Variable(0., name='log_sigma')
-  # The state of the Markov chain
-  beta = tf.Variable(tf.random_normal([x.shape[1]]), name='beta')
-  new_beta, _, _, _ = hmc.kernel(0.1, 5, beta, log_joint_partial,
-                                 event_dims=[0])
-  beta_update = tf.assign(beta, new_beta)
+  tfd = tf.contrib.distributions
+
+  def make_training_data(num_samples, dims, sigma):
+    dt = np.asarray(sigma).dtype
+    zeros = tf.zeros(dims, dtype=dt)
+    x = tfd.MultivariateNormalDiag(
+        loc=zeros).sample(num_samples, seed=1)
+    w = tfd.MultivariateNormalDiag(
+        loc=zeros,
+        scale_identity_multiplier=sigma).sample(seed=2)
+    noise = tfd.Normal(
+        loc=dt(0),
+        scale=dt(1)).sample(num_samples, seed=3)
+    y = tf.tensordot(x, w, axes=[[1], [0]]) + noise
+    return y, x, w
+
+  def make_prior(sigma, dims):
+    # p(w | sigma)
+    return tfd.MultivariateNormalDiag(
+        loc=tf.zeros([dims], dtype=sigma.dtype),
+        scale_identity_multiplier=sigma)
+
+  def make_likelihood(x, w):
+    # p(y | x, w)
+    return tfd.MultivariateNormalDiag(
+        loc=tf.tensordot(x, w, axes=[[1], [0]]))
+
+  # Setup assumptions.
+  dtype = np.float32
+  num_samples = 150
+  dims = 10
+  num_iters = int(5e3)
+
+  true_sigma = dtype(0.5)
+  y, x, true_weights = make_training_data(num_samples, dims, true_sigma)
+
+  # Estimate of `log(true_sigma)`.
+  log_sigma = tf.get_variable(name="log_sigma", initializer=dtype(0))
+  sigma = tf.exp(log_sigma)
+
+  # State of the Markov chain.
+  weights = tf.get_variable(
+      name="weights",
+      initializer=np.random.randn(dims).astype(dtype))
+
+  prior = make_prior(sigma, dims)
+
+  def joint_log_prob_fn(w):
+    # f(w) = log p(w, y | x)
+    return prior.log_prob(w) + make_likelihood(x, w).log_prob(y)
+
+  weights_update = weights.assign(
+      hmc.kernel(target_log_prob_fn=joint_log_prob,
+                 current_state=weights,
+                 step_size=0.1,
+                 num_leapfrog_steps=5)[0])
+
+  with tf.control_dependencies([weights_update]):
+    loss = -prior.log_prob(weights)
+
   optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
-  with tf.control_dependencies([beta_update]):
-    log_sigma_update = optimizer.minimize(-log_prior(beta, log_sigma),
-                                          var_list=[log_sigma])
-
-  sess = tf.Session()
-  sess.run(tf.global_variables_initializer())
-  log_sigma_history = np.zeros(1000)
-  for i in xrange(1000):
-    log_sigma_val, _ = sess.run([log_sigma, log_sigma_update])
-    log_sigma_history[i] = log_sigma_val
-  # Should converge to something close to true_sigma
-  plt.plot(np.exp(log_sigma_history))
+  log_sigma_update = optimizer.minimize(loss, var_list=[log_sigma])
+
+  sess.graph.finalize()  # No more graph building.
+
+  tf.global_variables_initializer().run()
+
+  sigma_history = np.zeros(num_iters, dtype)
+  weights_history = np.zeros([num_iters, dims], dtype)
+
+  for i in xrange(num_iters):
+    _, sigma_, weights_, _ = sess.run([log_sigma_update, sigma, weights])
+    weights_history[i, :] = weights_
+    sigma_history[i] = sigma_
+
+  true_weights_ = sess.run(true_weights)
+
+  # Should converge to something close to true_sigma.
+  plt.plot(sigma_history);
+  plt.ylabel("sigma");
+  plt.xlabel("iteration");
   ```
-  """
-  with ops.name_scope(name, 'hmc_kernel', [step_size, n_leapfrog_steps, x]):
-    potential_and_grad = _make_potential_and_grad(target_log_prob_fn)
-
-    x_shape = array_ops.shape(x)
-    m = random_ops.random_normal(x_shape)
-
-    kinetic_0 = 0.5 * math_ops.reduce_sum(math_ops.square(m), event_dims)
-
-    if (x_log_prob is not None) and (x_grad is not None):
-      log_potential_0, grad_0 = -x_log_prob, -x_grad  # pylint: disable=invalid-unary-operand-type
-    else:
-      if x_log_prob is not None:
-        logging.warn('x_log_prob was provided, but x_grad was not,'
-                     ' so x_log_prob was not used.')
-      if x_grad is not None:
-        logging.warn('x_grad was provided, but x_log_prob was not,'
-                     ' so x_grad was not used.')
-      log_potential_0, grad_0 = potential_and_grad(x)
-
-    new_x, new_m, log_potential_1, grad_1 = leapfrog_integrator(
-        step_size, n_leapfrog_steps, x, m, potential_and_grad, grad_0)
-
-    kinetic_1 = 0.5 * math_ops.reduce_sum(math_ops.square(new_m), event_dims)
-
-    # TODO(mhoffman): It seems like there may be an opportunity for nans here.
-    # I'm delaying addressing this because we're going to refactor this part
-    # to use the more general Metropolis abstraction anyway.
-    acceptance_probs = math_ops.exp(math_ops.minimum(0., log_potential_0 -
-                                                     log_potential_1 +
-                                                     kinetic_0 - kinetic_1))
-    accepted = math_ops.cast(
-        random_ops.random_uniform(array_ops.shape(acceptance_probs)) <
-        acceptance_probs, np.float32)
-    new_log_prob = (-log_potential_0 * (1. - accepted) -
-                    log_potential_1 * accepted)
-
-    # TODO(b/65738010): This should work, but it doesn't for now.
-    # reduced_shape = math_ops.reduced_shape(x_shape, event_dims)
-    reduced_shape = array_ops.shape(math_ops.reduce_sum(x, event_dims,
-                                                        keep_dims=True))
-    accepted = array_ops.reshape(accepted, reduced_shape)
-    new_x = x * (1. - accepted) + new_x * accepted
-    new_grad = -grad_0 * (1. - accepted) - grad_1 * accepted
-
-  return new_x, acceptance_probs, new_log_prob, new_grad
-
-
-def leapfrog_integrator(step_size, n_steps, initial_position, initial_momentum,
-                        potential_and_grad, initial_grad, name=None):
-  """Applies `n_steps` steps of the leapfrog integrator.
-
-  This just wraps `leapfrog_step()` in a `tf.while_loop()`, reusing
-  gradient computations where possible.
 
   Args:
-    step_size: Scalar step size or array of step sizes for the
-      leapfrog integrator. Broadcasts to the shape of
-      `initial_position`. Larger step sizes lead to faster progress, but
-      too-large step sizes lead to larger discretization error and
-      worse energy conservation.
-    n_steps: Number of steps to run the leapfrog integrator.
-    initial_position: Tensor containing the value(s) of the position variable(s)
-      to update.
-    initial_momentum: Tensor containing the value(s) of the momentum variable(s)
-      to update.
-    potential_and_grad: Python callable that takes a position tensor like
-      `initial_position` and returns the potential energy and its gradient at
-      that position.
-    initial_grad: Tensor with the value of the gradient of the potential energy
-      at `initial_position`.
+    target_log_prob_fn: Python callable which takes an argument like
+      `current_state` (or `*current_state` if it's a list) and returns its
+      (possibly unnormalized) log-density under the target distribution.
+    current_state: `Tensor` or Python `list` of `Tensor`s representing the
+      current state(s) of the Markov chain(s). The first `r` dimensions index
+      independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`.
+    step_size: `Tensor` or Python `list` of `Tensor`s representing the step size
+      for the leapfrog integrator. Must broadcast with the shape of
+      `current_state`. Larger step sizes lead to faster progress, but too-large
+      step sizes make rejection exponentially more likely. When possible, it's
+      often helpful to match per-variable step sizes to the standard deviations
+      of the target distribution in each variable.
+    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
+      for. Total progress per HMC step is roughly proportional to `step_size *
+      num_leapfrog_steps`.
+    seed: Python integer to seed the random number generator.
+    current_target_log_prob: (Optional) `Tensor` representing the value of
+      `target_log_prob_fn` at the `current_state`. The only reason to
+      specify this argument is to reduce TF graph size.
+      Default value: `None` (i.e., compute as needed).
+    current_grads_target_log_prob: (Optional) Python list of `Tensor`s
+      representing gradient of `current_target_log_prob` at the `current_state`
+      and wrt the `current_state`. Must have same shape as `current_state`. The
+      only reason to specify this argument is to reduce TF graph size.
+      Default value: `None` (i.e., compute as needed).
     name: Python `str` name prefixed to Ops created by this function.
+      Default value: `None` (i.e., "hmc_kernel").
 
   Returns:
-    updated_position: Updated value of the position.
-    updated_momentum: Updated value of the momentum.
-    new_potential: Potential energy of the new position. Has shape matching
-      `potential_and_grad(initial_position)`.
-    new_grad: Gradient from potential_and_grad() evaluated at the new position.
-      Has shape matching `initial_position`.
-
-  Example: Simple quadratic potential.
-  ```python
-  def potential_and_grad(position):
-    return tf.reduce_sum(0.5 * tf.square(position)), position
-  position = tf.placeholder(np.float32)
-  momentum = tf.placeholder(np.float32)
-  potential, grad = potential_and_grad(position)
-  new_position, new_momentum, new_potential, new_grad = hmc.leapfrog_integrator(
-    0.1, 3, position, momentum, potential_and_grad, grad)
-
-  sess = tf.Session()
-  position_val = np.random.randn(10)
-  momentum_val = np.random.randn(10)
-  potential_val, grad_val = sess.run([potential, grad],
-                                     {position: position_val})
-  positions = np.zeros([100, 10])
-  for i in xrange(100):
-    position_val, momentum_val, potential_val, grad_val = sess.run(
-      [new_position, new_momentum, new_potential, new_grad],
-      {position: position_val, momentum: momentum_val})
-    positions[i] = position_val
-  # Should trace out sinusoidal dynamics.
-  plt.plot(positions[:, 0])
-  ```
+    accepted_state: Tensor or Python list of `Tensor`s representing the state(s)
+      of the Markov chain(s) at each result step. Has same shape as
+      `current_state`.
+    kernel_results: `collections.namedtuple` of internal calculations used to
+      advance the chain.
+
+  Raises:
+    ValueError: if there isn't one `step_size` or a list with same length as
+      `current_state`.
   """
-  def leapfrog_wrapper(step_size, x, m, grad, l):
-    x, m, _, grad = leapfrog_step(step_size, x, m, potential_and_grad, grad)
-    return step_size, x, m, grad, l + 1
+  with ops.name_scope(
+      name, "hmc_kernel",
+      [current_state, step_size, num_leapfrog_steps, seed,
+       current_target_log_prob, current_grads_target_log_prob]):
+    with ops.name_scope("initialize"):
+      [current_state_parts, step_sizes, current_target_log_prob,
+       current_grads_target_log_prob] = _prepare_args(
+           target_log_prob_fn, current_state, step_size,
+           current_target_log_prob, current_grads_target_log_prob,
+           maybe_expand=True)
+      independent_chain_ndims = distributions_util.prefer_static_rank(
+          current_target_log_prob)
+      current_momentums = []
+      for s in current_state_parts:
+        current_momentums.append(random_ops.random_normal(
+            shape=array_ops.shape(s),
+            dtype=s.dtype.base_dtype,
+            seed=seed))
+        seed = distributions_util.gen_new_seed(
+            seed, salt="hmc_kernel_momentums")
+
+      num_leapfrog_steps = ops.convert_to_tensor(
+          num_leapfrog_steps,
+          dtype=dtypes.int32,
+          name="num_leapfrog_steps")
+    [
+        proposed_momentums,
+        proposed_state_parts,
+        proposed_target_log_prob,
+        proposed_grads_target_log_prob,
+    ] = _leapfrog_integrator(current_momentums,
+                             target_log_prob_fn,
+                             current_state_parts,
+                             step_sizes,
+                             num_leapfrog_steps,
+                             current_target_log_prob,
+                             current_grads_target_log_prob)
+
+    energy_change = _compute_energy_change(current_target_log_prob,
+                                           current_momentums,
+                                           proposed_target_log_prob,
+                                           proposed_momentums,
+                                           independent_chain_ndims)
+
+    # u < exp(min(-energy, 0)),  where u~Uniform[0,1)
+    # ==> -log(u) >= max(e, 0)
+    # ==> -log(u) >= e
+    # (Perhaps surprisingly, we don't have a better way to obtain a random
+    # uniform from positive reals, i.e., `tf.random_uniform(minval=0,
+    # maxval=np.inf)` won't work.)
+    random_uniform = random_ops.random_uniform(
+        shape=array_ops.shape(energy_change),
+        dtype=energy_change.dtype,
+        seed=seed)
+    random_positive = -math_ops.log(random_uniform)
+    is_accepted = random_positive >= energy_change
+
+    accepted_target_log_prob = array_ops.where(is_accepted,
+                                               proposed_target_log_prob,
+                                               current_target_log_prob)
+
+    accepted_state_parts = [_choose(is_accepted,
+                                    proposed_state_part,
+                                    current_state_part,
+                                    independent_chain_ndims)
+                            for current_state_part, proposed_state_part
+                            in zip(current_state_parts, proposed_state_parts)]
+
+    accepted_grads_target_log_prob = [
+        _choose(is_accepted,
+                proposed_grad,
+                grad,
+                independent_chain_ndims)
+        for proposed_grad, grad
+        in zip(proposed_grads_target_log_prob, current_grads_target_log_prob)]
+
+    maybe_flatten = lambda x: x if _is_list_like(current_state) else x[0]
+    return [
+        maybe_flatten(accepted_state_parts),
+        KernelResults(
+            acceptance_probs=math_ops.exp(math_ops.minimum(-energy_change, 0.)),
+            current_grads_target_log_prob=accepted_grads_target_log_prob,
+            current_target_log_prob=accepted_target_log_prob,
+            energy_change=energy_change,
+            is_accepted=is_accepted,
+            proposed_grads_target_log_prob=proposed_grads_target_log_prob,
+            proposed_state=maybe_flatten(proposed_state_parts),
+            proposed_target_log_prob=proposed_target_log_prob,
+            random_positive=random_positive,
+        ),
+    ]
+
+
+def _leapfrog_integrator(current_momentums,
+                         target_log_prob_fn,
+                         current_state_parts,
+                         step_sizes,
+                         num_leapfrog_steps,
+                         current_target_log_prob=None,
+                         current_grads_target_log_prob=None,
+                         name=None):
+  """Applies `num_leapfrog_steps` of the leapfrog integrator.
+
+  Assumes a simple quadratic kinetic energy function: `0.5 ||momentum||**2`.
 
-  def counter_fn(a, b, c, d, counter):  # pylint: disable=unused-argument
-    return counter < n_steps
+  #### Examples:
 
-  with ops.name_scope(name, 'leapfrog_integrator',
-                      [step_size, n_steps, initial_position, initial_momentum,
-                       initial_grad]):
-    _, new_x, new_m, new_grad, _ = control_flow_ops.while_loop(
-        counter_fn, leapfrog_wrapper, [step_size, initial_position,
-                                       initial_momentum, initial_grad,
-                                       array_ops.constant(0)], back_prop=False)
-    # We're counting on the runtime to eliminate this redundant computation.
-    new_potential, new_grad = potential_and_grad(new_x)
-  return new_x, new_m, new_potential, new_grad
+  ##### Simple quadratic potential.
 
+  ```python
+  tfd = tf.contrib.distributions
 
-def leapfrog_step(step_size, position, momentum, potential_and_grad, grad,
-                  name=None):
-  """Applies one step of the leapfrog integrator.
+  dims = 10
+  num_iter = int(1e3)
+  dtype = np.float32
 
-  Assumes a simple quadratic kinetic energy function: 0.5 * ||momentum||^2.
+  position = tf.placeholder(np.float32)
+  momentum = tf.placeholder(np.float32)
+
+  [
+      new_momentums,
+      new_positions,
+  ] = hmc._leapfrog_integrator(
+      current_momentums=[momentum],
+      target_log_prob_fn=tfd.MultivariateNormalDiag(
+          loc=tf.zeros(dims, dtype)).log_prob,
+      current_state_parts=[position],
+      step_sizes=0.1,
+      num_leapfrog_steps=3)[:2]
+
+  sess.graph.finalize()  # No more graph building.
+
+  momentum_ = np.random.randn(dims).astype(dtype)
+  position_ = np.random.randn(dims).astype(dtype)
+
+  positions = np.zeros([num_iter, dims], dtype)
+  for i in xrange(num_iter):
+    position_, momentum_ = sess.run(
+        [new_momentums[0], new_position[0]],
+        feed_dict={position: position_, momentum: momentum_})
+    positions[i] = position_
+
+  plt.plot(positions[:, 0]);  # Sinusoidal.
+  ```
 
   Args:
-    step_size: Scalar step size or array of step sizes for the
-      leapfrog integrator. Broadcasts to the shape of
-      `position`. Larger step sizes lead to faster progress, but
-      too-large step sizes lead to larger discretization error and
-      worse energy conservation.
-    position: Tensor containing the value(s) of the position variable(s)
-      to update.
-    momentum: Tensor containing the value(s) of the momentum variable(s)
-      to update.
-    potential_and_grad: Python callable that takes a position tensor like
-      `position` and returns the potential energy and its gradient at that
-      position.
-    grad: Tensor with the value of the gradient of the potential energy
-      at `position`.
+    current_momentums: Tensor containing the value(s) of the momentum
+      variable(s) to update.
+    target_log_prob_fn: Python callable which takes an argument like
+      `*current_state_parts` and returns its (possibly unnormalized) log-density
+      under the target distribution.
+    current_state_parts: Python `list` of `Tensor`s representing the current
+      state(s) of the Markov chain(s). The first `independent_chain_ndims` of
+      the `Tensor`(s) index different chains.
+    step_sizes: Python `list` of `Tensor`s representing the step size for the
+      leapfrog integrator. Must broadcast with the shape of
+      `current_state_parts`.  Larger step sizes lead to faster progress, but
+      too-large step sizes make rejection exponentially more likely. When
+      possible, it's often helpful to match per-variable step sizes to the
+      standard deviations of the target distribution in each variable.
+    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
+      for. Total progress per HMC step is roughly proportional to `step_size *
+      num_leapfrog_steps`.
+    current_target_log_prob: (Optional) `Tensor` representing the value of
+      `target_log_prob_fn(*current_state_parts)`. The only reason to specify
+      this argument is to reduce TF graph size.
+      Default value: `None` (i.e., compute as needed).
+    current_grads_target_log_prob: (Optional) Python list of `Tensor`s
+      representing gradient of `target_log_prob_fn(*current_state_parts`) wrt
+      `current_state_parts`. Must have same shape as `current_state_parts`. The
+      only reason to specify this argument is to reduce TF graph size.
+      Default value: `None` (i.e., compute as needed).
     name: Python `str` name prefixed to Ops created by this function.
+      Default value: `None` (i.e., "hmc_leapfrog_integrator").
 
   Returns:
-    updated_position: Updated value of the position.
-    updated_momentum: Updated value of the momentum.
-    new_potential: Potential energy of the new position. Has shape matching
-      `potential_and_grad(position)`.
-    new_grad: Gradient from potential_and_grad() evaluated at the new position.
-      Has shape matching `position`.
-
-  Example: Simple quadratic potential.
-  ```python
-  def potential_and_grad(position):
-    # Simple quadratic potential
-    return tf.reduce_sum(0.5 * tf.square(position)), position
-  position = tf.placeholder(np.float32)
-  momentum = tf.placeholder(np.float32)
-  potential, grad = potential_and_grad(position)
-  new_position, new_momentum, new_potential, new_grad = hmc.leapfrog_step(
-    0.1, position, momentum, potential_and_grad, grad)
-
-  sess = tf.Session()
-  position_val = np.random.randn(10)
-  momentum_val = np.random.randn(10)
-  potential_val, grad_val = sess.run([potential, grad],
-                                     {position: position_val})
-  positions = np.zeros([100, 10])
-  for i in xrange(100):
-    position_val, momentum_val, potential_val, grad_val = sess.run(
-      [new_position, new_momentum, new_potential, new_grad],
-      {position: position_val, momentum: momentum_val})
-    positions[i] = position_val
-  # Should trace out sinusoidal dynamics.
-  plt.plot(positions[:, 0])
-  ```
+    proposed_momentums: Updated value of the momentum.
+    proposed_state_parts: Tensor or Python list of `Tensor`s representing the
+      state(s) of the Markov chain(s) at each result step. Has same shape as
+      input `current_state_parts`.
+    proposed_target_log_prob: `Tensor` representing the value of
+      `target_log_prob_fn` at `accepted_state`.
+    proposed_grads_target_log_prob: Gradient of `proposed_target_log_prob` wrt
+      `accepted_state`.
+
+  Raises:
+    ValueError: if `len(momentums) != len(state_parts)`.
+    ValueError: if `len(state_parts) != len(step_sizes)`.
+    ValueError: if `len(state_parts) != len(grads_target_log_prob)`.
+    TypeError: if `not target_log_prob.dtype.is_floating`.
   """
-  with ops.name_scope(name, 'leapfrog_step', [step_size, position, momentum,
-                                              grad]):
-    momentum -= 0.5 * step_size * grad
-    position += step_size * momentum
-    potential, grad = potential_and_grad(position)
-    momentum -= 0.5 * step_size * grad
-
-  return position, momentum, potential, grad
+  def _loop_body(step,
+                 current_momentums,
+                 current_state_parts,
+                 ignore_current_target_log_prob,  # pylint: disable=unused-argument
+                 current_grads_target_log_prob):
+    return [step + 1] + list(_leapfrog_step(current_momentums,
+                                            target_log_prob_fn,
+                                            current_state_parts,
+                                            step_sizes,
+                                            current_grads_target_log_prob))
+
+  with ops.name_scope(
+      name, "hmc_leapfrog_integrator",
+      [current_momentums, current_state_parts, step_sizes, num_leapfrog_steps,
+       current_target_log_prob, current_grads_target_log_prob]):
+    if len(current_momentums) != len(current_state_parts):
+      raise ValueError("`momentums` must be in one-to-one correspondence "
+                       "with `state_parts`")
+    num_leapfrog_steps = ops.convert_to_tensor(num_leapfrog_steps,
+                                               name="num_leapfrog_steps")
+    current_target_log_prob, current_grads_target_log_prob = (
+        _maybe_call_fn_and_grads(
+            target_log_prob_fn,
+            current_state_parts,
+            current_target_log_prob,
+            current_grads_target_log_prob))
+    return control_flow_ops.while_loop(
+        cond=lambda iter_, *args: iter_ < num_leapfrog_steps,
+        body=_loop_body,
+        loop_vars=[
+            np.int32(0),  # iter_
+            current_momentums,
+            current_state_parts,
+            current_target_log_prob,
+            current_grads_target_log_prob,
+        ],
+        back_prop=False)[1:]  # Lop-off "iter_".
+
+
+def _leapfrog_step(current_momentums,
+                   target_log_prob_fn,
+                   current_state_parts,
+                   step_sizes,
+                   current_grads_target_log_prob,
+                   name=None):
+  """Applies one step of the leapfrog integrator."""
+  with ops.name_scope(
+      name, "_leapfrog_step",
+      [current_momentums, current_state_parts, step_sizes,
+       current_grads_target_log_prob]):
+    proposed_momentums = [m + 0.5 * ss * g for m, ss, g
+                          in zip(current_momentums,
+                                 step_sizes,
+                                 current_grads_target_log_prob)]
+    proposed_state_parts = [x + ss * m for x, ss, m
+                            in zip(current_state_parts,
+                                   step_sizes,
+                                   proposed_momentums)]
+    proposed_target_log_prob = target_log_prob_fn(*proposed_state_parts)
+    if not proposed_target_log_prob.dtype.is_floating:
+      raise TypeError("`target_log_prob_fn` must produce a `Tensor` "
+                      "with `float` `dtype`.")
+    proposed_grads_target_log_prob = gradients_ops.gradients(
+        proposed_target_log_prob, proposed_state_parts)
+    if any(g is None for g in proposed_grads_target_log_prob):
+      raise ValueError(
+          "Encountered `None` gradient. Does your target `target_log_prob_fn` "
+          "access all `tf.Variable`s via `tf.get_variable`?\n"
+          "  current_state_parts: {}\n"
+          "  proposed_state_parts: {}\n"
+          "  proposed_grads_target_log_prob: {}".format(
+              current_state_parts,
+              proposed_state_parts,
+              proposed_grads_target_log_prob))
+    proposed_momentums = [m + 0.5 * ss * g for m, ss, g
+                          in zip(proposed_momentums,
+                                 step_sizes,
+                                 proposed_grads_target_log_prob)]
+    return [
+        proposed_momentums,
+        proposed_state_parts,
+        proposed_target_log_prob,
+        proposed_grads_target_log_prob,
+    ]
+
+
+def _compute_energy_change(current_target_log_prob,
+                           current_momentums,
+                           proposed_target_log_prob,
+                           proposed_momentums,
+                           independent_chain_ndims,
+                           name=None):
+  """Helper to `kernel` which computes the energy change."""
+  with ops.name_scope(
+      name, "compute_energy_change",
+      ([current_target_log_prob, proposed_target_log_prob,
+        independent_chain_ndims] +
+       current_momentums + proposed_momentums)):
+    # Abbreviate lk0=log_kinetic_energy and lk1=proposed_log_kinetic_energy
+    # since they're a mouthful and lets us inline more.
+    lk0, lk1 = [], []
+    for current_momentum, proposed_momentum in zip(current_momentums,
+                                                   proposed_momentums):
+      axis = math_ops.range(independent_chain_ndims,
+                            array_ops.rank(current_momentum))
+      lk0.append(_log_sum_sq(current_momentum, axis))
+      lk1.append(_log_sum_sq(proposed_momentum, axis))
+
+    lk0 = -np.log(2.) + math_ops.reduce_logsumexp(array_ops.stack(lk0, axis=-1),
+                                                  axis=-1)
+    lk1 = -np.log(2.) + math_ops.reduce_logsumexp(array_ops.stack(lk1, axis=-1),
+                                                  axis=-1)
+    lp0 = -current_target_log_prob   # log_potential
+    lp1 = -proposed_target_log_prob  # proposed_log_potential
+    x = array_ops.stack([lp1, math_ops.exp(lk1), -lp0, -math_ops.exp(lk0)],
+                        axis=-1)
+
+    # The sum is NaN if any element is NaN or we see both +Inf and -Inf.
+    # Thus we will replace such rows with infinite energy change which implies
+    # rejection. Recall that float-comparisons with NaN are always False.
+    is_sum_determinate = (
+        math_ops.reduce_all(math_ops.is_finite(x) | (x >= 0.), axis=-1) &
+        math_ops.reduce_all(math_ops.is_finite(x) | (x <= 0.), axis=-1))
+    is_sum_determinate = array_ops.tile(
+        is_sum_determinate[..., array_ops.newaxis],
+        multiples=array_ops.concat([
+            array_ops.ones(array_ops.rank(is_sum_determinate),
+                           dtype=dtypes.int32),
+            [4],
+        ], axis=0))
+    x = array_ops.where(is_sum_determinate,
+                        x,
+                        array_ops.fill(array_ops.shape(x),
+                                       value=x.dtype.as_numpy_dtype(np.inf)))
+
+    return math_ops.reduce_sum(x, axis=-1)
+
+
+def _choose(is_accepted,
+            accepted,
+            rejected,
+            independent_chain_ndims,
+            name=None):
+  """Helper to `kernel` which expand_dims `is_accepted` to apply tf.where."""
+  def _expand_is_accepted_like(x):
+    with ops.name_scope("_choose"):
+      expand_shape = array_ops.concat([
+          array_ops.shape(is_accepted),
+          array_ops.ones([array_ops.rank(x) - array_ops.rank(is_accepted)],
+                         dtype=dtypes.int32),
+      ], axis=0)
+      multiples = array_ops.concat([
+          array_ops.ones([array_ops.rank(is_accepted)], dtype=dtypes.int32),
+          array_ops.shape(x)[independent_chain_ndims:],
+      ], axis=0)
+      m = array_ops.tile(array_ops.reshape(is_accepted, expand_shape),
+                         multiples)
+      m.set_shape(x.shape)
+      return m
+  with ops.name_scope(name, "_choose", values=[
+      is_accepted, accepted, rejected, independent_chain_ndims]):
+    return array_ops.where(_expand_is_accepted_like(accepted),
+                           accepted,
+                           rejected)
+
+
+def _maybe_call_fn_and_grads(fn,
+                             fn_arg_list,
+                             fn_result=None,
+                             grads_fn_result=None,
+                             description="target_log_prob"):
+  """Helper which computes `fn_result` and `grads` if needed."""
+  fn_arg_list = (list(fn_arg_list) if _is_list_like(fn_arg_list)
+                 else [fn_arg_list])
+  if fn_result is None:
+    fn_result = fn(*fn_arg_list)
+  if not fn_result.dtype.is_floating:
+    raise TypeError("`{}` must be a `Tensor` with `float` `dtype`.".format(
+        description))
+  if grads_fn_result is None:
+    grads_fn_result = gradients_ops.gradients(
+        fn_result, fn_arg_list)
+  if len(fn_arg_list) != len(grads_fn_result):
+    raise ValueError("`{}` must be in one-to-one correspondence with "
+                     "`grads_{}`".format(*[description]*2))
+  if any(g is None for g in grads_fn_result):
+    raise ValueError("Encountered `None` gradient.")
+  return fn_result, grads_fn_result
+
+
+def _prepare_args(target_log_prob_fn, state, step_size,
+                  target_log_prob=None, grads_target_log_prob=None,
+                  maybe_expand=False, description="target_log_prob"):
+  """Helper which processes input args to meet list-like assumptions."""
+  state_parts = list(state) if _is_list_like(state) else [state]
+  state_parts = [ops.convert_to_tensor(s, name="state")
+                 for s in state_parts]
+  target_log_prob, grads_target_log_prob = _maybe_call_fn_and_grads(
+      target_log_prob_fn,
+      state_parts,
+      target_log_prob,
+      grads_target_log_prob,
+      description)
+  step_sizes = list(step_size) if _is_list_like(step_size) else [step_size]
+  step_sizes = [
+      ops.convert_to_tensor(
+          s, name="step_size", dtype=target_log_prob.dtype)
+      for s in step_sizes]
+  if len(step_sizes) == 1:
+    step_sizes *= len(state_parts)
+  if len(state_parts) != len(step_sizes):
+    raise ValueError("There should be exactly one `step_size` or it should "
+                     "have same length as `current_state`.")
+  maybe_flatten = lambda x: x if maybe_expand or _is_list_like(state) else x[0]
+  return [
+      maybe_flatten(state_parts),
+      maybe_flatten(step_sizes),
+      target_log_prob,
+      grads_target_log_prob,
+  ]
+
+
+def _is_list_like(x):
+  """Helper which returns `True` if input is `list`-like."""
+  return isinstance(x, (tuple, list))
+
+
+def _log_sum_sq(x, axis=None):
+  """Computes log(sum(x**2))."""
+  return math_ops.reduce_logsumexp(2. * math_ops.log(math_ops.abs(x)), axis)
diff --git a/tensorflow/contrib/bayesflow/python/ops/layers.py b/tensorflow/contrib/bayesflow/python/ops/layers.py
index dcead38af826a12e776160bdb251ba021e6b953c..a742b7c1aa593d6c08bf9d8d597c99c9fc4e7aed 100644
--- a/tensorflow/contrib/bayesflow/python/ops/layers.py
+++ b/tensorflow/contrib/bayesflow/python/ops/layers.py
@@ -23,13 +23,43 @@ from __future__ import print_function
 
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
-from tensorflow.contrib.bayesflow.python.ops.layers_dense_variational_impl import *
+from tensorflow.contrib.bayesflow.python.ops.layers_conv_variational import *
+from tensorflow.contrib.bayesflow.python.ops.layers_dense_variational import *
+from tensorflow.contrib.bayesflow.python.ops.layers_util import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
-    'DenseVariational',
-    'dense_variational',
+    'Convolution1DReparameterization',
+    'Convolution2DReparameterization',
+    'Convolution3DReparameterization',
+    'Convolution1DFlipout',
+    'Convolution2DFlipout',
+    'Convolution3DFlipout',
+    'Conv1DReparameterization',
+    'Conv2DReparameterization',
+    'Conv3DReparameterization',
+    'Conv1DFlipout',
+    'Conv2DFlipout',
+    'Conv3DFlipout',
+    'convolution1d_reparameterization',
+    'convolution2d_reparameterization',
+    'convolution3d_reparameterization',
+    'convolution1d_flipout',
+    'convolution2d_flipout',
+    'convolution3d_flipout',
+    'conv1d_reparameterization',
+    'conv2d_reparameterization',
+    'conv3d_reparameterization',
+    'conv1d_flipout',
+    'conv2d_flipout',
+    'conv3d_flipout',
+    'DenseReparameterization',
+    'DenseLocalReparameterization',
+    'DenseFlipout',
+    'dense_reparameterization',
+    'dense_local_reparameterization',
+    'dense_flipout',
     'default_loc_scale_fn',
     'default_mean_field_normal_fn',
 ]
diff --git a/tensorflow/contrib/bayesflow/python/ops/layers_conv_variational.py b/tensorflow/contrib/bayesflow/python/ops/layers_conv_variational.py
new file mode 100644
index 0000000000000000000000000000000000000000..7723cfb442712626ff415f1412e3362f2392ce9f
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/ops/layers_conv_variational.py
@@ -0,0 +1,2943 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Convolutional variational layer classes and their functional aliases.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.bayesflow.python.ops import layers_util
+from tensorflow.contrib.distributions.python.ops import independent as independent_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.layers import base as layers_lib
+from tensorflow.python.layers import utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import standard_ops
+from tensorflow.python.ops.distributions import kullback_leibler as kl_lib
+from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.ops.distributions import util as distribution_util
+
+
+class _ConvVariational(layers_lib.Layer):
+  """Abstract nD convolution layer (private, used as implementation base).
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. It may also include a bias addition and activation function
+  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
+  distributions.
+
+  By default, the layer implements a stochastic forward pass via
+  sampling from the kernel and bias posteriors,
+  ```none
+  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
+  ```
+  where f denotes the layer's calculation.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Arguments:
+    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of n integers, specifying the
+      length of the convolution window.
+    strides: An integer or tuple/list of n integers,
+      specifying the stride length of the convolution.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, ..., channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, ...)`.
+    dilation_rate: An integer or tuple/list of n integers, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any `strides` value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Optional regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    name: A string, the name of the layer.
+
+  Properties:
+    rank: Python integer, dimensionality of convolution.
+    filters: Python integer, dimensionality of the output space.
+    kernel_size: Size of the convolution window.
+    strides: Stride length of convolution.
+    padding: Python string describing padding approach.
+    data_format: Python string describing input data's dimensions.
+    dilation_rate: Dilation rate for an atrous convolution.
+    activation: Activation function (`callable`).
+    activity_regularizer: Regularizer function for the output.
+    kernel_posterior_fn: `callable` returning posterior.
+    kernel_posterior_tensor_fn: `callable` operating on posterior.
+    kernel_prior_fn: `callable` returning prior.
+    kernel_divergence_fn: `callable` returning divergence.
+    bias_posterior_fn: `callable` returning posterior.
+    bias_posterior_tensor_fn: `callable` operating on posterior.
+    bias_prior_fn: `callable` returning prior.
+    bias_divergence_fn: `callable` returning divergence.
+  """
+
+  def __init__(
+      self,
+      rank,
+      filters,
+      kernel_size,
+      strides=1,
+      padding="valid",
+      data_format="channels_last",
+      dilation_rate=1,
+      activation=None,
+      activity_regularizer=None,
+      trainable=True,
+      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+      kernel_posterior_tensor_fn=lambda d: d.sample(),
+      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+      bias_posterior_tensor_fn=lambda d: d.sample(),
+      bias_prior_fn=None,
+      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      name=None,
+      **kwargs):
+    super(_ConvVariational, self).__init__(
+        trainable=trainable,
+        name=name,
+        activity_regularizer=activity_regularizer,
+        **kwargs)
+    self.rank = rank
+    self.filters = filters
+    self.kernel_size = utils.normalize_tuple(kernel_size, rank, "kernel_size")
+    self.strides = utils.normalize_tuple(strides, rank, "strides")
+    self.padding = utils.normalize_padding(padding)
+    self.data_format = utils.normalize_data_format(data_format)
+    self.dilation_rate = utils.normalize_tuple(
+        dilation_rate, rank, "dilation_rate")
+    self.activation = activation
+    self.input_spec = layers_lib.InputSpec(ndim=self.rank + 2)
+    self.kernel_posterior_fn = kernel_posterior_fn
+    self.kernel_posterior_tensor_fn = kernel_posterior_tensor_fn
+    self.kernel_prior_fn = kernel_prior_fn
+    self.kernel_divergence_fn = kernel_divergence_fn
+    self.bias_posterior_fn = bias_posterior_fn
+    self.bias_posterior_tensor_fn = bias_posterior_tensor_fn
+    self.bias_prior_fn = bias_prior_fn
+    self.bias_divergence_fn = bias_divergence_fn
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if self.data_format == "channels_first":
+      channel_axis = 1
+    else:
+      channel_axis = -1
+    if input_shape[channel_axis].value is None:
+      raise ValueError("The channel dimension of the inputs "
+                       "should be defined. Found `None`.")
+    input_dim = input_shape[channel_axis].value
+    kernel_shape = self.kernel_size + (input_dim, self.filters)
+    dtype = dtypes.as_dtype(self.dtype)
+
+    # Must have a posterior kernel.
+    self.kernel_posterior = self.kernel_posterior_fn(
+        dtype, kernel_shape, "kernel_posterior",
+        self.trainable, self.add_variable)
+
+    if self.kernel_prior_fn is None:
+      self.kernel_prior = None
+    else:
+      self.kernel_prior = self.kernel_prior_fn(
+          dtype, kernel_shape, "kernel_prior",
+          self.trainable, self.add_variable)
+    self._built_kernel_divergence = False
+
+    if self.bias_posterior_fn is None:
+      self.bias_posterior = None
+    else:
+      self.bias_posterior = self.bias_posterior_fn(
+          dtype, (self.filters,), "bias_posterior",
+          self.trainable, self.add_variable)
+
+    if self.bias_prior_fn is None:
+      self.bias_prior = None
+    else:
+      self.bias_prior = self.bias_prior_fn(
+          dtype, (self.filters,), "bias_prior",
+          self.trainable, self.add_variable)
+    self._built_bias_divergence = False
+
+    self.input_spec = layers_lib.InputSpec(ndim=self.rank + 2,
+                                           axes={channel_axis: input_dim})
+    self._convolution_op = nn_ops.Convolution(
+        input_shape,
+        filter_shape=tensor_shape.TensorShape(kernel_shape),
+        dilation_rate=self.dilation_rate,
+        strides=self.strides,
+        padding=self.padding.upper(),
+        data_format=utils.convert_data_format(self.data_format,
+                                              self.rank + 2))
+
+    self.built = True
+
+  def call(self, inputs):
+    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
+
+    outputs = self._apply_variational_kernel(inputs)
+    outputs = self._apply_variational_bias(outputs)
+    if self.activation is not None:
+      outputs = self.activation(outputs)
+    if not self._built_kernel_divergence:
+      kernel_posterior = self.kernel_posterior
+      kernel_prior = self.kernel_prior
+      if isinstance(self.kernel_posterior, independent_lib.Independent):
+        kernel_posterior = kernel_posterior.distribution
+      if isinstance(self.kernel_prior, independent_lib.Independent):
+        kernel_prior = kernel_prior.distribution
+      self._apply_divergence(self.kernel_divergence_fn,
+                             kernel_posterior,
+                             kernel_prior,
+                             self.kernel_posterior_tensor,
+                             name="divergence_kernel")
+      self._built_kernel_divergence = True
+    if not self._built_bias_divergence:
+      bias_posterior = self.bias_posterior
+      bias_prior = self.bias_prior
+      if isinstance(self.bias_posterior, independent_lib.Independent):
+        bias_posterior = bias_posterior.distribution
+      if isinstance(self.bias_prior, independent_lib.Independent):
+        bias_prior = bias_prior.distribution
+      self._apply_divergence(self.bias_divergence_fn,
+                             bias_posterior,
+                             bias_prior,
+                             self.bias_posterior_tensor,
+                             name="divergence_bias")
+      self._built_bias_divergence = True
+    return outputs
+
+  def _apply_variational_bias(self, inputs):
+    if self.bias_posterior is None:
+      self.bias_posterior_tensor = None
+      return inputs
+    self.bias_posterior_tensor = self.bias_posterior_tensor_fn(
+        self.bias_posterior)
+    outputs = inputs
+    if self.data_format == "channels_first":
+      if self.rank == 1:
+        # nn.bias_add does not accept a 1D input tensor.
+        bias = array_ops.reshape(self.bias_posterior_tensor,
+                                 (1, self.filters, 1))
+        outputs += bias
+      if self.rank == 2:
+        outputs = nn.bias_add(outputs,
+                              self.bias_posterior_tensor,
+                              data_format="NCHW")
+      if self.rank == 3:
+        # As of Mar 2017, direct addition is significantly slower than
+        # bias_add when computing gradients. To use bias_add, we collapse Z
+        # and Y into a single dimension to obtain a 4D input tensor.
+        outputs_shape = outputs.shape.as_list()
+        outputs_4d = array_ops.reshape(outputs,
+                                       [outputs_shape[0], outputs_shape[1],
+                                        outputs_shape[2] * outputs_shape[3],
+                                        outputs_shape[4]])
+        outputs_4d = nn.bias_add(outputs_4d,
+                                 self.bias_posterior_tensor,
+                                 data_format="NCHW")
+        outputs = array_ops.reshape(outputs_4d, outputs_shape)
+    else:
+      outputs = nn.bias_add(outputs,
+                            self.bias_posterior_tensor,
+                            data_format="NHWC")
+    return outputs
+
+  def _apply_divergence(self, divergence_fn, posterior, prior,
+                        posterior_tensor, name):
+    if (divergence_fn is None or
+        posterior is None or
+        prior is None):
+      divergence = None
+      return
+    divergence = standard_ops.identity(
+        divergence_fn(
+            posterior, prior, posterior_tensor),
+        name=name)
+    self.add_loss(divergence)
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == "channels_last":
+      space = input_shape[1:-1]
+      new_space = []
+      for i in range(len(space)):
+        new_dim = utils.conv_output_length(
+            space[i],
+            self.kernel_size[i],
+            padding=self.padding,
+            stride=self.strides[i],
+            dilation=self.dilation_rate[i])
+        new_space.append(new_dim)
+      return tensor_shape.TensorShape([input_shape[0]] + new_space +
+                                      [self.filters])
+    else:
+      space = input_shape[2:]
+      new_space = []
+      for i in range(len(space)):
+        new_dim = utils.conv_output_length(
+            space[i],
+            self.kernel_size[i],
+            padding=self.padding,
+            stride=self.strides[i],
+            dilation=self.dilation_rate[i])
+        new_space.append(new_dim)
+      return tensor_shape.TensorShape([input_shape[0], self.filters] +
+                                      new_space)
+
+
+class _ConvReparameterization(_ConvVariational):
+  """Abstract nD convolution layer (private, used as implementation base).
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. It may also include a bias addition and activation function
+  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
+  distributions.
+
+  By default, the layer implements a stochastic forward pass via
+  sampling from the kernel and bias posteriors,
+  ```none
+  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
+  ```
+  where f denotes the layer's calculation. It uses the reparameterization
+  estimator [1], which performs a Monte Carlo approximation of the
+  distribution integrating over the `kernel` and `bias`.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Arguments:
+    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of n integers, specifying the
+      length of the convolution window.
+    strides: An integer or tuple/list of n integers,
+      specifying the stride length of the convolution.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, ..., channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, ...)`.
+    dilation_rate: An integer or tuple/list of n integers, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any `strides` value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Optional regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    name: A string, the name of the layer.
+
+  Properties:
+    rank: Python integer, dimensionality of convolution.
+    filters: Python integer, dimensionality of the output space.
+    kernel_size: Size of the convolution window.
+    strides: Stride length of convolution.
+    padding: Python string describing padding approach.
+    data_format: Python string describing input data's dimensions.
+    dilation_rate: Dilation rate for an atrous convolution.
+    activation: Activation function (`callable`).
+    activity_regularizer: Regularizer function for the output.
+    kernel_posterior_fn: `callable` returning posterior.
+    kernel_posterior_tensor_fn: `callable` operating on posterior.
+    kernel_prior_fn: `callable` returning prior.
+    kernel_divergence_fn: `callable` returning divergence.
+    bias_posterior_fn: `callable` returning posterior.
+    bias_posterior_tensor_fn: `callable` operating on posterior.
+    bias_prior_fn: `callable` returning prior.
+    bias_divergence_fn: `callable` returning divergence.
+
+  [1]: "Auto-Encoding Variational Bayes."
+        Diederik P. Kingma, Max Welling.
+        International Conference on Learning Representations, 2014.
+  """
+
+  def __init__(
+      self,
+      rank,
+      filters,
+      kernel_size,
+      strides=1,
+      padding="valid",
+      data_format="channels_last",
+      dilation_rate=1,
+      activation=None,
+      activity_regularizer=None,
+      trainable=True,
+      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+      kernel_posterior_tensor_fn=lambda d: d.sample(),
+      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+      bias_posterior_tensor_fn=lambda d: d.sample(),
+      bias_prior_fn=None,
+      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      name=None,
+      **kwargs):
+    super(_ConvReparameterization, self).__init__(
+        rank=rank,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        activation=activation,
+        activity_regularizer=activity_regularizer,
+        trainable=trainable,
+        kernel_posterior_fn=kernel_posterior_fn,
+        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+        kernel_prior_fn=kernel_prior_fn,
+        kernel_divergence_fn=kernel_divergence_fn,
+        bias_posterior_fn=bias_posterior_fn,
+        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+        bias_prior_fn=bias_prior_fn,
+        bias_divergence_fn=bias_divergence_fn,
+        name=name, **kwargs)
+
+  def _apply_variational_kernel(self, inputs):
+    self.kernel_posterior_tensor = self.kernel_posterior_tensor_fn(
+        self.kernel_posterior)
+    self.kernel_posterior_affine = None
+    self.kernel_posterior_affine_tensor = None
+    outputs = self._convolution_op(inputs, self.kernel_posterior_tensor)
+    return outputs
+
+
+class Conv1DReparameterization(_ConvReparameterization):
+  """1D convolution layer (e.g. temporal convolution).
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. It may also include a bias addition and activation function
+  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
+  distributions.
+
+  By default, the layer implements a stochastic forward pass via
+  sampling from the kernel and bias posteriors,
+  ```none
+  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
+  ```
+  where f denotes the layer's calculation. It uses the reparameterization
+  estimator [1], which performs a Monte Carlo approximation of the
+  distribution integrating over the `kernel` and `bias`.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Arguments:
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of a single integer, specifying the
+      length of the 1D convolution window.
+    strides: An integer or tuple/list of a single integer,
+      specifying the stride length of the convolution.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, length, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, length)`.
+    dilation_rate: An integer or tuple/list of a single integer, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any `strides` value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Optional regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+    name: A string, the name of the layer.
+
+  Properties:
+    filters: Python integer, dimensionality of the output space.
+    kernel_size: Size of the convolution window.
+    strides: Stride length of convolution.
+    padding: Python string describing padding approach.
+    data_format: Python string describing input data's dimensions.
+    dilation_rate: Dilation rate for an atrous convolution.
+    activation: Activation function (`callable`).
+    activity_regularizer: Regularizer function for the output.
+    kernel_posterior_fn: `callable` returning posterior.
+    kernel_posterior_tensor_fn: `callable` operating on posterior.
+    kernel_prior_fn: `callable` returning prior.
+    kernel_divergence_fn: `callable` returning divergence.
+    bias_posterior_fn: `callable` returning posterior.
+    bias_posterior_tensor_fn: `callable` operating on posterior.
+    bias_prior_fn: `callable` returning prior.
+    bias_divergence_fn: `callable` returning divergence.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tf.reshape(features, [-1, 128, 1])
+  net = tfp.layers.Conv1DReparameterization(64,
+                                            kernel_size=5,
+                                            padding="SAME",
+                                            activation=tf.nn.relu)(net)
+  net = tf.reshape(net, [-1, 128 * 64])
+  logits = tfp.layers.DenseReparameterization(10)(net)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses reparameterization gradients to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+
+  [1]: "Auto-Encoding Variational Bayes."
+        Diederik P. Kingma, Max Welling.
+        International Conference on Learning Representations, 2014.
+  """
+
+  def __init__(
+      self,
+      filters,
+      kernel_size,
+      strides=1,
+      padding="valid",
+      data_format="channels_last",
+      dilation_rate=1,
+      activation=None,
+      activity_regularizer=None,
+      trainable=True,
+      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+      kernel_posterior_tensor_fn=lambda d: d.sample(),
+      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+      bias_posterior_tensor_fn=lambda d: d.sample(),
+      bias_prior_fn=None,
+      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      name=None,
+      **kwargs):
+    super(Conv1DReparameterization, self).__init__(
+        rank=1,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        activation=activation,
+        activity_regularizer=activity_regularizer,
+        trainable=trainable,
+        kernel_posterior_fn=kernel_posterior_fn,
+        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+        kernel_prior_fn=kernel_prior_fn,
+        kernel_divergence_fn=kernel_divergence_fn,
+        bias_posterior_fn=bias_posterior_fn,
+        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+        bias_prior_fn=bias_prior_fn,
+        bias_divergence_fn=bias_divergence_fn,
+        name=name, **kwargs)
+
+
+def conv1d_reparameterization(
+    inputs,
+    filters,
+    kernel_size,
+    strides=1,
+    padding="valid",
+    data_format="channels_last",
+    dilation_rate=1,
+    activation=None,
+    activity_regularizer=None,
+    trainable=True,
+    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+    kernel_posterior_tensor_fn=lambda d: d.sample(),
+    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+    bias_posterior_tensor_fn=lambda d: d.sample(),
+    bias_prior_fn=None,
+    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    name=None,
+    reuse=None):
+  """Functional interface for 1D convolution layer (e.g. temporal convolution).
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. It may also include a bias addition and activation function
+  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
+  distributions.
+
+  By default, the layer implements a stochastic forward pass via
+  sampling from the kernel and bias posteriors,
+  ```none
+  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
+  ```
+  where f denotes the layer's calculation. It uses the reparameterization
+  estimator [1], which performs a Monte Carlo approximation of the
+  distribution integrating over the `kernel` and `bias`.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Arguments:
+    inputs: Tensor input.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of a single integer, specifying the
+      length of the 1D convolution window.
+    strides: An integer or tuple/list of a single integer,
+      specifying the stride length of the convolution.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, length, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, length)`.
+    dilation_rate: An integer or tuple/list of a single integer, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any `strides` value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Optional regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+    name: A string, the name of the layer.
+    reuse: Boolean, whether to reuse the weights of a previous layer
+      by the same name.
+
+  Returns:
+    Output tensor.
+
+  Raises:
+    ValueError: if eager execution is enabled.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tf.reshape(features, [-1, 128, 1])
+  net = tfp.layers.conv1d_reparameterization(net,
+                                             filters=64,
+                                             kernel_size=5,
+                                             padding="SAME",
+                                             activation=tf.nn.relu)
+  net = tf.reshape(net, [-1, 128 * 64])
+  logits = tfp.layers.dense_reparameterization(net, 10)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses reparameterization gradients to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+
+  [1]: "Auto-Encoding Variational Bayes."
+        Diederik P. Kingma, Max Welling.
+        International Conference on Learning Representations, 2014.
+  """
+  layer = Conv1DReparameterization(
+      filters=filters,
+      kernel_size=kernel_size,
+      strides=strides,
+      padding=padding,
+      data_format=data_format,
+      dilation_rate=dilation_rate,
+      activation=activation,
+      activity_regularizer=activity_regularizer,
+      trainable=trainable,
+      kernel_posterior_fn=kernel_posterior_fn,
+      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+      kernel_prior_fn=kernel_prior_fn,
+      kernel_divergence_fn=kernel_divergence_fn,
+      bias_posterior_fn=bias_posterior_fn,
+      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+      bias_prior_fn=bias_prior_fn,
+      bias_divergence_fn=bias_divergence_fn,
+      name=name,
+      dtype=inputs.dtype.base_dtype,
+      _scope=name,
+      _reuse=reuse)
+  return layer.apply(inputs)
+
+
+class Conv2DReparameterization(_ConvReparameterization):
+  """2D convolution layer (e.g. spatial convolution over images).
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. It may also include a bias addition and activation function
+  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
+  distributions.
+
+  By default, the layer implements a stochastic forward pass via
+  sampling from the kernel and bias posteriors,
+  ```none
+  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
+  ```
+  where f denotes the layer's calculation. It uses the reparameterization
+  estimator [1], which performs a Monte Carlo approximation of the
+  distribution integrating over the `kernel` and `bias`.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Arguments:
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of 2 integers, specifying the
+      height and width of the 2D convolution window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 2 integers,
+      specifying the strides of the convolution along the height and width.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, height, width)`.
+
+    dilation_rate: An integer or tuple/list of 2 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Optional regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+    name: A string, the name of the layer.
+
+  Properties:
+    filters: Python integer, dimensionality of the output space.
+    kernel_size: Size of the convolution window.
+    strides: Stride length of convolution.
+    padding: Python string describing padding approach.
+    data_format: Python string describing input data's dimensions.
+    dilation_rate: Dilation rate for an atrous convolution.
+    activation: Activation function (`callable`).
+    activity_regularizer: Regularizer function for the output.
+    kernel_posterior_fn: `callable` returning posterior.
+    kernel_posterior_tensor_fn: `callable` operating on posterior.
+    kernel_prior_fn: `callable` returning prior.
+    kernel_divergence_fn: `callable` returning divergence.
+    bias_posterior_fn: `callable` returning posterior.
+    bias_posterior_tensor_fn: `callable` operating on posterior.
+    bias_prior_fn: `callable` returning prior.
+    bias_divergence_fn: `callable` returning divergence.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tf.reshape(features, [-1, 32, 32, 3])
+  net = tfp.layers.Conv2DReparameterization(64,
+                                            kernel_size=5,
+                                            padding="SAME",
+                                            activation=tf.nn.relu)(net)
+  net = tf.layers.MaxPooling2D(pool_size=2,
+                               strides=2,
+                               padding="SAME")(net)
+  net = tf.reshape(net, [-1, 8 * 8 * 64])
+  logits = tfp.layers.DenseReparameterization(10)(net)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses reparameterization gradients to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+
+  [1]: "Auto-Encoding Variational Bayes."
+        Diederik P. Kingma, Max Welling.
+        International Conference on Learning Representations, 2014.
+  """
+
+  def __init__(
+      self,
+      filters,
+      kernel_size,
+      strides=(1, 1),
+      padding="valid",
+      data_format="channels_last",
+      dilation_rate=(1, 1),
+      activation=None,
+      activity_regularizer=None,
+      trainable=True,
+      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+      kernel_posterior_tensor_fn=lambda d: d.sample(),
+      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+      bias_posterior_tensor_fn=lambda d: d.sample(),
+      bias_prior_fn=None,
+      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      name=None,
+      **kwargs):
+    super(Conv2DReparameterization, self).__init__(
+        rank=2,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        activation=activation,
+        activity_regularizer=activity_regularizer,
+        trainable=trainable,
+        kernel_posterior_fn=kernel_posterior_fn,
+        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+        kernel_prior_fn=kernel_prior_fn,
+        kernel_divergence_fn=kernel_divergence_fn,
+        bias_posterior_fn=bias_posterior_fn,
+        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+        bias_prior_fn=bias_prior_fn,
+        bias_divergence_fn=bias_divergence_fn,
+        name=name, **kwargs)
+
+
+def conv2d_reparameterization(
+    inputs,
+    filters,
+    kernel_size,
+    strides=(1, 1),
+    padding="valid",
+    data_format="channels_last",
+    dilation_rate=(1, 1),
+    activation=None,
+    activity_regularizer=None,
+    trainable=True,
+    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+    kernel_posterior_tensor_fn=lambda d: d.sample(),
+    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+    bias_posterior_tensor_fn=lambda d: d.sample(),
+    bias_prior_fn=None,
+    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    name=None,
+    reuse=None):
+  """Functional interface for the 2D convolution layer.
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. It may also include a bias addition and activation function
+  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
+  distributions.
+
+  By default, the layer implements a stochastic forward pass via
+  sampling from the kernel and bias posteriors,
+  ```none
+  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
+  ```
+  where f denotes the layer's calculation. It uses the reparameterization
+  estimator [1], which performs a Monte Carlo approximation of the
+  distribution integrating over the `kernel` and `bias`.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Arguments:
+    inputs: Tensor input.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of 2 integers, specifying the
+      height and width of the 2D convolution window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 2 integers,
+      specifying the strides of the convolution along the height and width.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, height, width)`.
+
+    dilation_rate: An integer or tuple/list of 2 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Optional regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+    name: A string, the name of the layer.
+    reuse: Boolean, whether to reuse the weights of a previous layer
+      by the same name.
+
+  Returns:
+    Output tensor.
+
+  Raises:
+    ValueError: if eager execution is enabled.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tf.reshape(features, [-1, 32, 32, 3])
+  net = tfp.layers.conv2d_reparameterization(net,
+                                             filters=64,
+                                             kernel_size=5,
+                                             padding="SAME",
+                                             activation=tf.nn.relu)
+  net = tf.layers.max_pooling2d(net,
+                                pool_size=2,
+                                strides=2,
+                                padding="SAME")
+  net = tf.reshape(net, [-1, 8 * 8 * 64])
+  logits = tfp.layers.dense_reparameterization(net, 10)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses reparameterization gradients to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+
+  [1]: "Auto-Encoding Variational Bayes."
+        Diederik P. Kingma, Max Welling.
+        International Conference on Learning Representations, 2014.
+  """
+  layer = Conv2DReparameterization(
+      filters=filters,
+      kernel_size=kernel_size,
+      strides=strides,
+      padding=padding,
+      data_format=data_format,
+      dilation_rate=dilation_rate,
+      activation=activation,
+      activity_regularizer=activity_regularizer,
+      trainable=trainable,
+      kernel_posterior_fn=kernel_posterior_fn,
+      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+      kernel_prior_fn=kernel_prior_fn,
+      kernel_divergence_fn=kernel_divergence_fn,
+      bias_posterior_fn=bias_posterior_fn,
+      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+      bias_prior_fn=bias_prior_fn,
+      bias_divergence_fn=bias_divergence_fn,
+      name=name,
+      dtype=inputs.dtype.base_dtype,
+      _scope=name,
+      _reuse=reuse)
+  return layer.apply(inputs)
+
+
+class Conv3DReparameterization(_ConvReparameterization):
+  """3D convolution layer (e.g. spatial convolution over volumes).
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. It may also include a bias addition and activation function
+  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
+  distributions.
+
+  By default, the layer implements a stochastic forward pass via
+  sampling from the kernel and bias posteriors,
+  ```none
+  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
+  ```
+  where f denotes the layer's calculation. It uses the reparameterization
+  estimator [1], which performs a Monte Carlo approximation of the
+  distribution integrating over the `kernel` and `bias`.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Arguments:
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of 3 integers, specifying the
+      depth, height and width of the 3D convolution window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 3 integers,
+      specifying the strides of the convolution along the depth,
+      height and width.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, depth, height, width, channels)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, channels, depth, height, width)`.
+    dilation_rate: An integer or tuple/list of 3 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Optional regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+    name: A string, the name of the layer.
+
+  Properties:
+    filters: Python integer, dimensionality of the output space.
+    kernel_size: Size of the convolution window.
+    strides: Stride length of convolution.
+    padding: Python string describing padding approach.
+    data_format: Python string describing input data's dimensions.
+    dilation_rate: Dilation rate for an atrous convolution.
+    activation: Activation function (`callable`).
+    activity_regularizer: Regularizer function for the output.
+    kernel_posterior_fn: `callable` returning posterior.
+    kernel_posterior_tensor_fn: `callable` operating on posterior.
+    kernel_prior_fn: `callable` returning prior.
+    kernel_divergence_fn: `callable` returning divergence.
+    bias_posterior_fn: `callable` returning posterior.
+    bias_posterior_tensor_fn: `callable` operating on posterior.
+    bias_prior_fn: `callable` returning prior.
+    bias_divergence_fn: `callable` returning divergence.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tf.reshape(features, [-1, 256, 32, 32, 3])
+  net = tfp.layers.Conv3DReparameterization(64,
+                                            kernel_size=5,
+                                            padding="SAME",
+                                            activation=tf.nn.relu)(net)
+  net = tf.layers.MaxPooling2D(pool_size=2,
+                               strides=2,
+                               padding="SAME")(net)
+  net = tf.reshape(net, [-1, 256 * 8 * 8 * 64])
+  logits = tfp.layers.DenseReparameterization(10)(net)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses reparameterization gradients to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+
+  [1]: "Auto-Encoding Variational Bayes."
+        Diederik P. Kingma, Max Welling.
+        International Conference on Learning Representations, 2014.
+  """
+
+  def __init__(
+      self,
+      filters,
+      kernel_size,
+      strides=(1, 1, 1),
+      padding="valid",
+      data_format="channels_last",
+      dilation_rate=(1, 1, 1),
+      activation=None,
+      activity_regularizer=None,
+      trainable=True,
+      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+      kernel_posterior_tensor_fn=lambda d: d.sample(),
+      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+      bias_posterior_tensor_fn=lambda d: d.sample(),
+      bias_prior_fn=None,
+      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      name=None,
+      **kwargs):
+    super(Conv3DReparameterization, self).__init__(
+        rank=3,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        activation=activation,
+        activity_regularizer=activity_regularizer,
+        trainable=trainable,
+        kernel_posterior_fn=kernel_posterior_fn,
+        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+        kernel_prior_fn=kernel_prior_fn,
+        kernel_divergence_fn=kernel_divergence_fn,
+        bias_posterior_fn=bias_posterior_fn,
+        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+        bias_prior_fn=bias_prior_fn,
+        bias_divergence_fn=bias_divergence_fn,
+        name=name, **kwargs)
+
+
+def conv3d_reparameterization(
+    inputs,
+    filters,
+    kernel_size,
+    strides=(1, 1, 1),
+    padding="valid",
+    data_format="channels_last",
+    dilation_rate=(1, 1, 1),
+    activation=None,
+    activity_regularizer=None,
+    trainable=True,
+    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+    kernel_posterior_tensor_fn=lambda d: d.sample(),
+    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+    bias_posterior_tensor_fn=lambda d: d.sample(),
+    bias_prior_fn=None,
+    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    name=None,
+    reuse=None):
+  """Functional interface for the 3D convolution layer.
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. It may also include a bias addition and activation function
+  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
+  distributions.
+
+  By default, the layer implements a stochastic forward pass via
+  sampling from the kernel and bias posteriors,
+  ```none
+  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
+  ```
+  where f denotes the layer's calculation. It uses the reparameterization
+  estimator [1], which performs a Monte Carlo approximation of the
+  distribution integrating over the `kernel` and `bias`.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Arguments:
+    inputs: Tensor input.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of 3 integers, specifying the
+      depth, height and width of the 3D convolution window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 3 integers,
+      specifying the strides of the convolution along the depth,
+      height and width.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, depth, height, width, channels)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, channels, depth, height, width)`.
+    dilation_rate: An integer or tuple/list of 3 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Optional regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+    name: A string, the name of the layer.
+    reuse: Boolean, whether to reuse the weights of a previous layer
+      by the same name.
+
+  Returns:
+    Output tensor.
+
+  Raises:
+    ValueError: if eager execution is enabled.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tf.reshape(features, [-1, 256, 32, 32, 3])
+  net = tfp.layers.conv3d_reparameterization(net,
+                                             filters=64,
+                                             kernel_size=5,
+                                             padding="SAME",
+                                             activation=tf.nn.relu)
+  net = tf.layers.max_pooling2d(net,
+                                pool_size=2,
+                                strides=2,
+                                padding="SAME")
+  net = tf.reshape(net, [-1, 256 * 8 * 8 * 64])
+  logits = tfp.layers.dense_reparameterization(net, 10)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses reparameterization gradients to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+
+  [1]: "Auto-Encoding Variational Bayes."
+        Diederik P. Kingma, Max Welling.
+        International Conference on Learning Representations, 2014.
+  """
+  layer = Conv3DReparameterization(
+      filters=filters,
+      kernel_size=kernel_size,
+      strides=strides,
+      padding=padding,
+      data_format=data_format,
+      dilation_rate=dilation_rate,
+      activation=activation,
+      activity_regularizer=activity_regularizer,
+      trainable=trainable,
+      kernel_posterior_fn=kernel_posterior_fn,
+      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+      kernel_prior_fn=kernel_prior_fn,
+      kernel_divergence_fn=kernel_divergence_fn,
+      bias_posterior_fn=bias_posterior_fn,
+      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+      bias_prior_fn=bias_prior_fn,
+      bias_divergence_fn=bias_divergence_fn,
+      name=name,
+      dtype=inputs.dtype.base_dtype,
+      _scope=name,
+      _reuse=reuse)
+  return layer.apply(inputs)
+
+
+class _ConvFlipout(_ConvVariational):
+  """Abstract nD convolution layer (private, used as implementation base).
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. It may also include a bias addition and activation function
+  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
+  distributions.
+
+  By default, the layer implements a stochastic forward pass via
+  sampling from the kernel and bias posteriors,
+  ```none
+  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
+  ```
+  where f denotes the layer's calculation. It uses the Flipout
+  estimator [1], which performs a Monte Carlo approximation of the
+  distribution integrating over the `kernel` and `bias`. Flipout uses
+  roughly twice as many floating point operations as the
+  reparameterization estimator but has the advantage of significantly
+  lower variance.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Arguments:
+    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of n integers, specifying the
+      length of the convolution window.
+    strides: An integer or tuple/list of n integers,
+      specifying the stride length of the convolution.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, ..., channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, ...)`.
+    dilation_rate: An integer or tuple/list of n integers, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any `strides` value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Optional regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    seed: Python scalar `int` which initializes the random number
+      generator. Default value: `None` (i.e., use global seed).
+    name: A string, the name of the layer.
+
+  Properties:
+    rank: Python integer, dimensionality of convolution.
+    filters: Python integer, dimensionality of the output space.
+    kernel_size: Size of the convolution window.
+    strides: Stride length of convolution.
+    padding: Python string describing padding approach.
+    data_format: Python string describing input data's dimensions.
+    dilation_rate: Dilation rate for an atrous convolution.
+    activation: Activation function (`callable`).
+    activity_regularizer: Regularizer function for the output.
+    kernel_posterior_fn: `callable` returning posterior.
+    kernel_posterior_tensor_fn: `callable` operating on posterior.
+    kernel_prior_fn: `callable` returning prior.
+    kernel_divergence_fn: `callable` returning divergence.
+    bias_posterior_fn: `callable` returning posterior.
+    bias_posterior_tensor_fn: `callable` operating on posterior.
+    bias_prior_fn: `callable` returning prior.
+    bias_divergence_fn: `callable` returning divergence.
+    seed: Python integer, used to create random seeds.
+
+  [1]: "Flipout: Efficient Pseudo-Independent Weight Perturbations on
+        Mini-Batches."
+        Anonymous. OpenReview, 2017.
+        https://openreview.net/forum?id=rJnpifWAb
+  """
+
+  def __init__(
+      self,
+      rank,
+      filters,
+      kernel_size,
+      strides=1,
+      padding="valid",
+      data_format="channels_last",
+      dilation_rate=1,
+      activation=None,
+      activity_regularizer=None,
+      trainable=True,
+      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+      kernel_posterior_tensor_fn=lambda d: d.sample(),
+      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+      bias_posterior_tensor_fn=lambda d: d.sample(),
+      bias_prior_fn=None,
+      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      seed=None,
+      name=None,
+      **kwargs):
+    super(_ConvFlipout, self).__init__(
+        rank=rank,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        activation=activation,
+        activity_regularizer=activity_regularizer,
+        trainable=trainable,
+        kernel_posterior_fn=kernel_posterior_fn,
+        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+        kernel_prior_fn=kernel_prior_fn,
+        kernel_divergence_fn=kernel_divergence_fn,
+        bias_posterior_fn=bias_posterior_fn,
+        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+        bias_prior_fn=bias_prior_fn,
+        bias_divergence_fn=bias_divergence_fn,
+        name=name, **kwargs)
+    self.seed = seed
+
+  def _apply_variational_kernel(self, inputs):
+    if (not isinstance(self.kernel_posterior, independent_lib.Independent) or
+        not isinstance(self.kernel_posterior.distribution, normal_lib.Normal)):
+      raise TypeError(
+          "`{}` requires "
+          "`kernel_posterior_fn` produce an instance of "
+          "`tf.distributions.Independent(tf.distributions.Normal)` "
+          "(saw: \"{}\").".format(
+              type(self).__name__, self.kernel_posterior.name))
+    self.kernel_posterior_affine = normal_lib.Normal(
+        loc=array_ops.zeros_like(self.kernel_posterior.distribution.loc),
+        scale=self.kernel_posterior.distribution.scale)
+    self.kernel_posterior_affine_tensor = (
+        self.kernel_posterior_tensor_fn(self.kernel_posterior_affine))
+    self.kernel_posterior_tensor = None
+
+    outputs = self._convolution_op(
+        inputs, self.kernel_posterior.distribution.loc)
+
+    input_shape = array_ops.shape(inputs)
+    output_shape = array_ops.shape(outputs)
+    batch_shape = array_ops.expand_dims(input_shape[0], 0)
+    channels = input_shape[-1]
+
+    sign_input = layers_util.random_sign(
+        array_ops.concat([batch_shape,
+                          array_ops.expand_dims(channels, 0)], 0),
+        dtype=inputs.dtype,
+        seed=self.seed)
+    sign_output = layers_util.random_sign(
+        array_ops.concat([batch_shape,
+                          array_ops.expand_dims(self.filters, 0)], 0),
+        dtype=inputs.dtype,
+        seed=distribution_util.gen_new_seed(
+            self.seed, salt="conv_flipout"))
+    for _ in range(self.rank):
+      sign_input = array_ops.expand_dims(sign_input, 1)  # 2D ex: (B, 1, 1, C)
+      sign_output = array_ops.expand_dims(sign_output, 1)
+
+    sign_input = array_ops.tile(  # tile for element-wise op broadcasting
+        sign_input,
+        [1] + [input_shape[i + 1] for i in range(self.rank)] + [1])
+    sign_output = array_ops.tile(
+        sign_output,
+        [1] + [output_shape[i + 1] for i in range(self.rank)] + [1])
+
+    perturbed_inputs = self._convolution_op(
+        inputs * sign_input, self.kernel_posterior_affine_tensor) * sign_output
+
+    outputs += perturbed_inputs
+    return outputs
+
+
+class Conv1DFlipout(_ConvFlipout):
+  """1D convolution layer (e.g. temporal convolution) with Flipout.
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. It may also include a bias addition and activation function
+  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
+  distributions.
+
+  By default, the layer implements a stochastic forward pass via
+  sampling from the kernel and bias posteriors,
+  ```none
+  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
+  ```
+  where f denotes the layer's calculation. It uses the Flipout
+  estimator [1], which performs a Monte Carlo approximation of the
+  distribution integrating over the `kernel` and `bias`. Flipout uses
+  roughly twice as many floating point operations as the
+  reparameterization estimator but has the advantage of significantly
+  lower variance.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Arguments:
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of a single integer, specifying the
+      length of the 1D convolution window.
+    strides: An integer or tuple/list of a single integer,
+      specifying the stride length of the convolution.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, length, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, length)`.
+    dilation_rate: An integer or tuple/list of a single integer, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any `strides` value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Optional regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+    seed: Python scalar `int` which initializes the random number
+      generator. Default value: `None` (i.e., use global seed).
+    name: A string, the name of the layer.
+
+  Properties:
+    filters: Python integer, dimensionality of the output space.
+    kernel_size: Size of the convolution window.
+    strides: Stride length of convolution.
+    padding: Python string describing padding approach.
+    data_format: Python string describing input data's dimensions.
+    dilation_rate: Dilation rate for an atrous convolution.
+    activation: Activation function (`callable`).
+    activity_regularizer: Regularizer function for the output.
+    kernel_posterior_fn: `callable` returning posterior.
+    kernel_posterior_tensor_fn: `callable` operating on posterior.
+    kernel_prior_fn: `callable` returning prior.
+    kernel_divergence_fn: `callable` returning divergence.
+    bias_posterior_fn: `callable` returning posterior.
+    bias_posterior_tensor_fn: `callable` operating on posterior.
+    bias_prior_fn: `callable` returning prior.
+    bias_divergence_fn: `callable` returning divergence.
+    seed: Python integer, used to create random seeds.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tf.reshape(features, [-1, 128, 1])
+  net = tfp.layers.Conv1DFlipout(64,
+                                 kernel_size=5,
+                                 padding="SAME",
+                                 activation=tf.nn.relu)(net)
+  net = tf.reshape(net, [-1, 128 * 64])
+  logits = tfp.layers.DenseFlipout(10)(net)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses the Flipout gradient estimator to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+
+  [1]: "Flipout: Efficient Pseudo-Independent Weight Perturbations on
+        Mini-Batches."
+        Anonymous. OpenReview, 2017.
+        https://openreview.net/forum?id=rJnpifWAb
+  """
+
+  def __init__(
+      self,
+      filters,
+      kernel_size,
+      strides=1,
+      padding="valid",
+      data_format="channels_last",
+      dilation_rate=1,
+      activation=None,
+      activity_regularizer=None,
+      trainable=True,
+      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+      kernel_posterior_tensor_fn=lambda d: d.sample(),
+      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+      bias_posterior_tensor_fn=lambda d: d.sample(),
+      bias_prior_fn=None,
+      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      seed=None,
+      name=None,
+      **kwargs):
+    super(Conv1DFlipout, self).__init__(
+        rank=1,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        activation=activation,
+        activity_regularizer=activity_regularizer,
+        trainable=trainable,
+        kernel_posterior_fn=kernel_posterior_fn,
+        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+        kernel_prior_fn=kernel_prior_fn,
+        kernel_divergence_fn=kernel_divergence_fn,
+        bias_posterior_fn=bias_posterior_fn,
+        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+        bias_prior_fn=bias_prior_fn,
+        bias_divergence_fn=bias_divergence_fn,
+        seed=seed,
+        name=name, **kwargs)
+
+
+def conv1d_flipout(
+    inputs,
+    filters,
+    kernel_size,
+    strides=1,
+    padding="valid",
+    data_format="channels_last",
+    dilation_rate=1,
+    activation=None,
+    activity_regularizer=None,
+    trainable=True,
+    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+    kernel_posterior_tensor_fn=lambda d: d.sample(),
+    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+    bias_posterior_tensor_fn=lambda d: d.sample(),
+    bias_prior_fn=None,
+    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    seed=None,
+    name=None,
+    reuse=None):
+  """Functional interface for 1D convolution layer (e.g. temporal convolution).
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. It may also include a bias addition and activation function
+  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
+  distributions.
+
+  By default, the layer implements a stochastic forward pass via
+  sampling from the kernel and bias posteriors,
+  ```none
+  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
+  ```
+  where f denotes the layer's calculation. It uses the Flipout
+  estimator [1], which performs a Monte Carlo approximation of the
+  distribution integrating over the `kernel` and `bias`. Flipout uses
+  roughly twice as many floating point operations as the
+  reparameterization estimator but has the advantage of significantly
+  lower variance.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Arguments:
+    inputs: Tensor input.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of a single integer, specifying the
+      length of the 1D convolution window.
+    strides: An integer or tuple/list of a single integer,
+      specifying the stride length of the convolution.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, length, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, length)`.
+    dilation_rate: An integer or tuple/list of a single integer, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any `strides` value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Optional regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+    seed: Python scalar `int` which initializes the random number
+      generator. Default value: `None` (i.e., use global seed).
+    name: A string, the name of the layer.
+    reuse: Boolean, whether to reuse the weights of a previous layer
+      by the same name.
+
+  Returns:
+    Output tensor.
+
+  Raises:
+    ValueError: if eager execution is enabled.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tf.reshape(features, [-1, 128, 1])
+  net = tfp.layers.conv1d_flipout(net,
+                                  filters=64,
+                                  kernel_size=5,
+                                  padding="SAME",
+                                  activation=tf.nn.relu)
+  net = tf.reshape(net, [-1, 128 * 64])
+  logits = tfp.layers.dense_flipout(net, 10)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses the Flipout gradient estimator to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+
+  [1]: "Flipout: Efficient Pseudo-Independent Weight Perturbations on
+        Mini-Batches."
+        Anonymous. OpenReview, 2017.
+        https://openreview.net/forum?id=rJnpifWAb
+  """
+  layer = Conv1DFlipout(
+      filters=filters,
+      kernel_size=kernel_size,
+      strides=strides,
+      padding=padding,
+      data_format=data_format,
+      dilation_rate=dilation_rate,
+      activation=activation,
+      activity_regularizer=activity_regularizer,
+      trainable=trainable,
+      kernel_posterior_fn=kernel_posterior_fn,
+      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+      kernel_prior_fn=kernel_prior_fn,
+      kernel_divergence_fn=kernel_divergence_fn,
+      bias_posterior_fn=bias_posterior_fn,
+      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+      bias_prior_fn=bias_prior_fn,
+      bias_divergence_fn=bias_divergence_fn,
+      seed=seed,
+      name=name,
+      dtype=inputs.dtype.base_dtype,
+      _scope=name,
+      _reuse=reuse)
+  return layer.apply(inputs)
+
+
+class Conv2DFlipout(_ConvFlipout):
+  """2D convolution layer (e.g. spatial convolution over images) with Flipout.
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. It may also include a bias addition and activation function
+  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
+  distributions.
+
+  By default, the layer implements a stochastic forward pass via
+  sampling from the kernel and bias posteriors,
+  ```none
+  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
+  ```
+  where f denotes the layer's calculation. It uses the Flipout
+  estimator [1], which performs a Monte Carlo approximation of the
+  distribution integrating over the `kernel` and `bias`. Flipout uses
+  roughly twice as many floating point operations as the
+  reparameterization estimator but has the advantage of significantly
+  lower variance.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Arguments:
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of 2 integers, specifying the
+      height and width of the 2D convolution window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 2 integers,
+      specifying the strides of the convolution along the height and width.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, height, width)`.
+
+    dilation_rate: An integer or tuple/list of 2 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Optional regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+    seed: Python scalar `int` which initializes the random number
+      generator. Default value: `None` (i.e., use global seed).
+    name: A string, the name of the layer.
+
+  Properties:
+    filters: Python integer, dimensionality of the output space.
+    kernel_size: Size of the convolution window.
+    strides: Stride length of convolution.
+    padding: Python string describing padding approach.
+    data_format: Python string describing input data's dimensions.
+    dilation_rate: Dilation rate for an atrous convolution.
+    activation: Activation function (`callable`).
+    activity_regularizer: Regularizer function for the output.
+    kernel_posterior_fn: `callable` returning posterior.
+    kernel_posterior_tensor_fn: `callable` operating on posterior.
+    kernel_prior_fn: `callable` returning prior.
+    kernel_divergence_fn: `callable` returning divergence.
+    bias_posterior_fn: `callable` returning posterior.
+    bias_posterior_tensor_fn: `callable` operating on posterior.
+    bias_prior_fn: `callable` returning prior.
+    bias_divergence_fn: `callable` returning divergence.
+    seed: Python integer, used to create random seeds.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tf.reshape(features, [-1, 32, 32, 3])
+  net = tfp.layers.Conv2DFlipout(64,
+                                 kernel_size=5,
+                                 padding="SAME",
+                                 activation=tf.nn.relu)(net)
+  net = tf.layers.MaxPooling2D(pool_size=2,
+                               strides=2,
+                               padding="SAME")(net)
+  net = tf.reshape(net, [-1, 8 * 8 * 64])
+  logits = tfp.layers.DenseFlipout(10)(net)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses the Flipout gradient estimator to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+
+  [1]: "Flipout: Efficient Pseudo-Independent Weight Perturbations on
+        Mini-Batches."
+        Anonymous. OpenReview, 2017.
+        https://openreview.net/forum?id=rJnpifWAb
+  """
+
+  def __init__(
+      self,
+      filters,
+      kernel_size,
+      strides=(1, 1),
+      padding="valid",
+      data_format="channels_last",
+      dilation_rate=(1, 1),
+      activation=None,
+      activity_regularizer=None,
+      trainable=True,
+      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+      kernel_posterior_tensor_fn=lambda d: d.sample(),
+      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+      bias_posterior_tensor_fn=lambda d: d.sample(),
+      bias_prior_fn=None,
+      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      seed=None,
+      name=None,
+      **kwargs):
+    super(Conv2DFlipout, self).__init__(
+        rank=2,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        activation=activation,
+        activity_regularizer=activity_regularizer,
+        trainable=trainable,
+        kernel_posterior_fn=kernel_posterior_fn,
+        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+        kernel_prior_fn=kernel_prior_fn,
+        kernel_divergence_fn=kernel_divergence_fn,
+        bias_posterior_fn=bias_posterior_fn,
+        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+        bias_prior_fn=bias_prior_fn,
+        bias_divergence_fn=bias_divergence_fn,
+        seed=seed,
+        name=name, **kwargs)
+
+
+def conv2d_flipout(
+    inputs,
+    filters,
+    kernel_size,
+    strides=(1, 1),
+    padding="valid",
+    data_format="channels_last",
+    dilation_rate=(1, 1),
+    activation=None,
+    activity_regularizer=None,
+    trainable=True,
+    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+    kernel_posterior_tensor_fn=lambda d: d.sample(),
+    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+    bias_posterior_tensor_fn=lambda d: d.sample(),
+    bias_prior_fn=None,
+    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    seed=None,
+    name=None,
+    reuse=None):
+  """Functional interface for the 2D convolution layer.
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. It may also include a bias addition and activation function
+  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
+  distributions.
+
+  By default, the layer implements a stochastic forward pass via
+  sampling from the kernel and bias posteriors,
+  ```none
+  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
+  ```
+  where f denotes the layer's calculation. It uses the Flipout
+  estimator [1], which performs a Monte Carlo approximation of the
+  distribution integrating over the `kernel` and `bias`. Flipout uses
+  roughly twice as many floating point operations as the
+  reparameterization estimator but has the advantage of significantly
+  lower variance.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Arguments:
+    inputs: Tensor input.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of 2 integers, specifying the
+      height and width of the 2D convolution window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 2 integers,
+      specifying the strides of the convolution along the height and width.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, height, width)`.
+
+    dilation_rate: An integer or tuple/list of 2 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Optional regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+    seed: Python scalar `int` which initializes the random number
+      generator. Default value: `None` (i.e., use global seed).
+    name: A string, the name of the layer.
+    reuse: Boolean, whether to reuse the weights of a previous layer
+      by the same name.
+
+  Returns:
+    Output tensor.
+
+  Raises:
+    ValueError: if eager execution is enabled.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tf.reshape(features, [-1, 32, 32, 3])
+  net = tfp.layers.conv2d_flipout(net,
+                                  filters=64,
+                                  kernel_size=5,
+                                  padding="SAME",
+                                  activation=tf.nn.relu)
+  net = tf.layers.max_pooling2d(net,
+                                pool_size=2,
+                                strides=2,
+                                padding="SAME")
+  net = tf.reshape(net, [-1, 8 * 8 * 64])
+  logits = tfp.layers.dense_flipout(net, 10)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses the Flipout gradient estimator to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+
+  [1]: "Flipout: Efficient Pseudo-Independent Weight Perturbations on
+        Mini-Batches."
+        Anonymous. OpenReview, 2017.
+        https://openreview.net/forum?id=rJnpifWAb
+  """
+  layer = Conv2DFlipout(
+      filters=filters,
+      kernel_size=kernel_size,
+      strides=strides,
+      padding=padding,
+      data_format=data_format,
+      dilation_rate=dilation_rate,
+      activation=activation,
+      activity_regularizer=activity_regularizer,
+      trainable=trainable,
+      kernel_posterior_fn=kernel_posterior_fn,
+      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+      kernel_prior_fn=kernel_prior_fn,
+      kernel_divergence_fn=kernel_divergence_fn,
+      bias_posterior_fn=bias_posterior_fn,
+      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+      bias_prior_fn=bias_prior_fn,
+      bias_divergence_fn=bias_divergence_fn,
+      seed=seed,
+      name=name,
+      dtype=inputs.dtype.base_dtype,
+      _scope=name,
+      _reuse=reuse)
+  return layer.apply(inputs)
+
+
+class Conv3DFlipout(_ConvFlipout):
+  """3D convolution layer (e.g. spatial convolution over volumes) with Flipout.
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. It may also include a bias addition and activation function
+  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
+  distributions.
+
+  By default, the layer implements a stochastic forward pass via
+  sampling from the kernel and bias posteriors,
+  ```none
+  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
+  ```
+  where f denotes the layer's calculation. It uses the Flipout
+  estimator [1], which performs a Monte Carlo approximation of the
+  distribution integrating over the `kernel` and `bias`. Flipout uses
+  roughly twice as many floating point operations as the
+  reparameterization estimator but has the advantage of significantly
+  lower variance.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Arguments:
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of 3 integers, specifying the
+      depth, height and width of the 3D convolution window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 3 integers,
+      specifying the strides of the convolution along the depth,
+      height and width.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, depth, height, width, channels)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, channels, depth, height, width)`.
+    dilation_rate: An integer or tuple/list of 3 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Optional regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+    seed: Python scalar `int` which initializes the random number
+      generator. Default value: `None` (i.e., use global seed).
+    name: A string, the name of the layer.
+
+  Properties:
+    filters: Python integer, dimensionality of the output space.
+    kernel_size: Size of the convolution window.
+    strides: Stride length of convolution.
+    padding: Python string describing padding approach.
+    data_format: Python string describing input data's dimensions.
+    dilation_rate: Dilation rate for an atrous convolution.
+    activation: Activation function (`callable`).
+    activity_regularizer: Regularizer function for the output.
+    kernel_posterior_fn: `callable` returning posterior.
+    kernel_posterior_tensor_fn: `callable` operating on posterior.
+    kernel_prior_fn: `callable` returning prior.
+    kernel_divergence_fn: `callable` returning divergence.
+    bias_posterior_fn: `callable` returning posterior.
+    bias_posterior_tensor_fn: `callable` operating on posterior.
+    bias_prior_fn: `callable` returning prior.
+    bias_divergence_fn: `callable` returning divergence.
+    seed: Python integer, used to create random seeds.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tf.reshape(features, [-1, 256, 32, 32, 3])
+  net = tfp.layers.Conv3DFlipout(64,
+                                 kernel_size=5,
+                                 padding="SAME",
+                                 activation=tf.nn.relu)(net)
+  net = tf.layers.MaxPooling2D(pool_size=2,
+                               strides=2,
+                               padding="SAME")(net)
+  net = tf.reshape(net, [-1, 256 * 8 * 8 * 64])
+  logits = tfp.layers.DenseFlipout(10)(net)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses the Flipout gradient estimator to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+
+  [1]: "Flipout: Efficient Pseudo-Independent Weight Perturbations on
+        Mini-Batches."
+        Anonymous. OpenReview, 2017.
+        https://openreview.net/forum?id=rJnpifWAb
+  """
+
+  def __init__(
+      self,
+      filters,
+      kernel_size,
+      strides=(1, 1, 1),
+      padding="valid",
+      data_format="channels_last",
+      dilation_rate=(1, 1, 1),
+      activation=None,
+      activity_regularizer=None,
+      trainable=True,
+      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+      kernel_posterior_tensor_fn=lambda d: d.sample(),
+      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+      bias_posterior_tensor_fn=lambda d: d.sample(),
+      bias_prior_fn=None,
+      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      seed=None,
+      name=None,
+      **kwargs):
+    super(Conv3DFlipout, self).__init__(
+        rank=3,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        activation=activation,
+        activity_regularizer=activity_regularizer,
+        trainable=trainable,
+        kernel_posterior_fn=kernel_posterior_fn,
+        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+        kernel_prior_fn=kernel_prior_fn,
+        kernel_divergence_fn=kernel_divergence_fn,
+        bias_posterior_fn=bias_posterior_fn,
+        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+        bias_prior_fn=bias_prior_fn,
+        bias_divergence_fn=bias_divergence_fn,
+        seed=seed,
+        name=name, **kwargs)
+
+
+def conv3d_flipout(
+    inputs,
+    filters,
+    kernel_size,
+    strides=(1, 1, 1),
+    padding="valid",
+    data_format="channels_last",
+    dilation_rate=(1, 1, 1),
+    activation=None,
+    activity_regularizer=None,
+    trainable=True,
+    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+    kernel_posterior_tensor_fn=lambda d: d.sample(),
+    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+    bias_posterior_tensor_fn=lambda d: d.sample(),
+    bias_prior_fn=None,
+    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    seed=None,
+    name=None,
+    reuse=None):
+  """Functional interface for the 3D convolution layer.
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. It may also include a bias addition and activation function
+  on the outputs. It assumes the `kernel` and/or `bias` are drawn from
+  distributions.
+
+  By default, the layer implements a stochastic forward pass via
+  sampling from the kernel and bias posteriors,
+  ```none
+  outputs = f(inputs; kernel, bias), kernel, bias ~ posterior
+  ```
+  where f denotes the layer's calculation. It uses the Flipout
+  estimator [1], which performs a Monte Carlo approximation of the
+  distribution integrating over the `kernel` and `bias`. Flipout uses
+  roughly twice as many floating point operations as the
+  reparameterization estimator but has the advantage of significantly
+  lower variance.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Arguments:
+    inputs: Tensor input.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of 3 integers, specifying the
+      depth, height and width of the 3D convolution window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 3 integers,
+      specifying the strides of the convolution along the depth,
+      height and width.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, depth, height, width, channels)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, channels, depth, height, width)`.
+    dilation_rate: An integer or tuple/list of 3 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Optional regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+    seed: Python scalar `int` which initializes the random number
+      generator. Default value: `None` (i.e., use global seed).
+    name: A string, the name of the layer.
+    reuse: Boolean, whether to reuse the weights of a previous layer
+      by the same name.
+
+  Returns:
+    Output tensor.
+
+  Raises:
+    ValueError: if eager execution is enabled.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tf.reshape(features, [-1, 256, 32, 32, 3])
+  net = tfp.layers.conv3d_flipout(net,
+                                  filters=64,
+                                  kernel_size=5,
+                                  padding="SAME",
+                                  activation=tf.nn.relu)
+  net = tf.layers.max_pooling2d(net,
+                                pool_size=2,
+                                strides=2,
+                                padding="SAME")
+  net = tf.reshape(net, [-1, 256 * 8 * 8 * 64])
+  logits = tfp.layers.dense_flipout(net, 10)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses the Flipout gradient estimator to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+
+  [1]: "Flipout: Efficient Pseudo-Independent Weight Perturbations on
+        Mini-Batches."
+        Anonymous. OpenReview, 2017.
+        https://openreview.net/forum?id=rJnpifWAb
+  """
+  layer = Conv3DFlipout(
+      filters=filters,
+      kernel_size=kernel_size,
+      strides=strides,
+      padding=padding,
+      data_format=data_format,
+      dilation_rate=dilation_rate,
+      activation=activation,
+      activity_regularizer=activity_regularizer,
+      trainable=trainable,
+      kernel_posterior_fn=kernel_posterior_fn,
+      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+      kernel_prior_fn=kernel_prior_fn,
+      kernel_divergence_fn=kernel_divergence_fn,
+      bias_posterior_fn=bias_posterior_fn,
+      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+      bias_prior_fn=bias_prior_fn,
+      bias_divergence_fn=bias_divergence_fn,
+      seed=seed,
+      name=name,
+      dtype=inputs.dtype.base_dtype,
+      _scope=name,
+      _reuse=reuse)
+  return layer.apply(inputs)
+
+
+# Aliases
+
+Convolution1DReparameterization = Conv1DReparameterization
+Convolution2DReparameterization = Conv2DReparameterization
+Convolution3DReparameterization = Conv3DReparameterization
+convolution1d_reparameterization = conv1d_reparameterization
+convolution2d_reparameterization = conv2d_reparameterization
+convolution3d_reparameterization = conv3d_reparameterization
+Convolution1DFlipout = Conv1DFlipout
+Convolution2DFlipout = Conv2DFlipout
+Convolution3DFlipout = Conv3DFlipout
+convolution1d_flipout = conv1d_flipout
+convolution2d_flipout = conv2d_flipout
+convolution3d_flipout = conv3d_flipout
diff --git a/tensorflow/contrib/bayesflow/python/ops/layers_dense_variational.py b/tensorflow/contrib/bayesflow/python/ops/layers_dense_variational.py
new file mode 100644
index 0000000000000000000000000000000000000000..591a8e553de0c194786c7ee8693665f762711b2d
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/ops/layers_dense_variational.py
@@ -0,0 +1,1176 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Dense Bayesian layer using KL-divergence based variational inference.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.bayesflow.python.ops import layers_util
+from tensorflow.contrib.distributions.python.ops import independent as independent_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.layers import base as layers_lib
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import standard_ops
+from tensorflow.python.ops.distributions import kullback_leibler as kl_lib
+from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.ops.distributions import util as distribution_util
+
+
+class _DenseVariational(layers_lib.Layer):
+  """Abstract densely-connected class (private, used as implementation base).
+
+  This layer implements the Bayesian variational inference analogue to
+  a dense layer by assuming the `kernel` and/or the `bias` are drawn
+  from distributions. By default, the layer implements a stochastic
+  forward pass via sampling from the kernel and bias posteriors,
+
+  ```none
+  kernel, bias ~ posterior
+  outputs = activation(matmul(inputs, kernel) + bias)
+  ```
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Args:
+    units: Integer or Long, dimensionality of the output space.
+    activation: Activation function (`callable`). Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    name: Python `str`, the name of the layer. Layers with the same name will
+      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
+      such cases.
+    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
+      layer by the same name.
+
+  Properties:
+    units: Python integer, dimensionality of the output space.
+    activation: Activation function (`callable`).
+    activity_regularizer: Regularizer function for the output.
+    kernel_posterior_fn: `callable` returning posterior.
+    kernel_posterior_tensor_fn: `callable` operating on posterior.
+    kernel_prior_fn: `callable` returning prior.
+    kernel_divergence_fn: `callable` returning divergence.
+    bias_posterior_fn: `callable` returning posterior.
+    bias_posterior_tensor_fn: `callable` operating on posterior.
+    bias_prior_fn: `callable` returning prior.
+    bias_divergence_fn: `callable` returning divergence.
+  """
+
+  def __init__(
+      self,
+      units,
+      activation=None,
+      activity_regularizer=None,
+      trainable=True,
+      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+      kernel_posterior_tensor_fn=lambda d: d.sample(),
+      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+      bias_posterior_tensor_fn=lambda d: d.sample(),
+      bias_prior_fn=None,
+      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      name=None,
+      **kwargs):
+    super(_DenseVariational, self).__init__(
+        trainable=trainable,
+        name=name,
+        activity_regularizer=activity_regularizer,
+        **kwargs)
+    self.units = units
+    self.activation = activation
+    self.input_spec = layers_lib.InputSpec(min_ndim=2)
+    self.kernel_posterior_fn = kernel_posterior_fn
+    self.kernel_posterior_tensor_fn = kernel_posterior_tensor_fn
+    self.kernel_prior_fn = kernel_prior_fn
+    self.kernel_divergence_fn = kernel_divergence_fn
+    self.bias_posterior_fn = bias_posterior_fn
+    self.bias_posterior_tensor_fn = bias_posterior_tensor_fn
+    self.bias_prior_fn = bias_prior_fn
+    self.bias_divergence_fn = bias_divergence_fn
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    in_size = input_shape.with_rank_at_least(2)[-1].value
+    if in_size is None:
+      raise ValueError("The last dimension of the inputs to `Dense` "
+                       "should be defined. Found `None`.")
+    self._input_spec = layers_lib.InputSpec(min_ndim=2, axes={-1: in_size})
+    dtype = dtypes.as_dtype(self.dtype)
+
+    # Must have a posterior kernel.
+    self.kernel_posterior = self.kernel_posterior_fn(
+        dtype, [in_size, self.units], "kernel_posterior",
+        self.trainable, self.add_variable)
+
+    if self.kernel_prior_fn is None:
+      self.kernel_prior = None
+    else:
+      self.kernel_prior = self.kernel_prior_fn(
+          dtype, [in_size, self.units], "kernel_prior",
+          self.trainable, self.add_variable)
+    self._built_kernel_divergence = False
+
+    if self.bias_posterior_fn is None:
+      self.bias_posterior = None
+    else:
+      self.bias_posterior = self.bias_posterior_fn(
+          dtype, [self.units], "bias_posterior",
+          self.trainable, self.add_variable)
+
+    if self.bias_prior_fn is None:
+      self.bias_prior = None
+    else:
+      self.bias_prior = self.bias_prior_fn(
+          dtype, [self.units], "bias_prior",
+          self.trainable, self.add_variable)
+    self._built_bias_divergence = False
+
+    self.built = True
+
+  def call(self, inputs):
+    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
+
+    outputs = self._apply_variational_kernel(inputs)
+    outputs = self._apply_variational_bias(outputs)
+    if self.activation is not None:
+      outputs = self.activation(outputs)  # pylint: disable=not-callable
+    if not self._built_kernel_divergence:
+      kernel_posterior = self.kernel_posterior
+      kernel_prior = self.kernel_prior
+      if isinstance(self.kernel_posterior, independent_lib.Independent):
+        kernel_posterior = kernel_posterior.distribution
+      if isinstance(self.kernel_prior, independent_lib.Independent):
+        kernel_prior = kernel_prior.distribution
+      self._apply_divergence(self.kernel_divergence_fn,
+                             kernel_posterior,
+                             kernel_prior,
+                             self.kernel_posterior_tensor,
+                             name="divergence_kernel")
+      self._built_kernel_divergence = True
+    if not self._built_bias_divergence:
+      bias_posterior = self.bias_posterior
+      bias_prior = self.bias_prior
+      if isinstance(self.bias_posterior, independent_lib.Independent):
+        bias_posterior = bias_posterior.distribution
+      if isinstance(self.bias_prior, independent_lib.Independent):
+        bias_prior = bias_prior.distribution
+      self._apply_divergence(self.bias_divergence_fn,
+                             bias_posterior,
+                             bias_prior,
+                             self.bias_posterior_tensor,
+                             name="divergence_bias")
+      self._built_bias_divergence = True
+    return outputs
+
+  def _apply_variational_bias(self, inputs):
+    if self.bias_posterior is None:
+      self.bias_posterior_tensor = None
+      return inputs
+    self.bias_posterior_tensor = self.bias_posterior_tensor_fn(
+        self.bias_posterior)
+    return nn.bias_add(inputs, self.bias_posterior_tensor)
+
+  def _apply_divergence(self, divergence_fn, posterior, prior,
+                        posterior_tensor, name):
+    if (divergence_fn is None or
+        posterior is None or
+        prior is None):
+      divergence = None
+      return
+    divergence = standard_ops.identity(
+        divergence_fn(
+            posterior, prior, posterior_tensor),
+        name=name)
+    self.add_loss(divergence)
+
+  def _matmul(self, inputs, kernel):
+    if inputs.shape.ndims <= 2:
+      return standard_ops.matmul(inputs, kernel)
+    # To handle broadcasting, we must use `tensordot`.
+    return standard_ops.tensordot(inputs, kernel, axes=[[-1], [0]])
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).with_rank_at_least(2)
+    if input_shape[-1].value is None:
+      raise ValueError(
+          "The innermost dimension of input_shape must be defined, "
+          "but saw: {}".format(input_shape))
+    return input_shape[:-1].concatenate(self.units)
+
+
+class DenseReparameterization(_DenseVariational):
+  """Densely-connected layer class with reparameterization estimator.
+
+  This layer implements the Bayesian variational inference analogue to
+  a dense layer by assuming the `kernel` and/or the `bias` are drawn
+  from distributions. By default, the layer implements a stochastic
+  forward pass via sampling from the kernel and bias posteriors,
+
+  ```none
+  kernel, bias ~ posterior
+  outputs = activation(matmul(inputs, kernel) + bias)
+  ```
+
+  It uses the reparameterization estimator [1], which performs a Monte Carlo
+  approximation of the distribution integrating over the `kernel` and
+  `bias`.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Args:
+    units: Integer or Long, dimensionality of the output space.
+    activation: Activation function (`callable`). Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    name: Python `str`, the name of the layer. Layers with the same name will
+      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
+      such cases.
+    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
+      layer by the same name.
+
+  Properties:
+    units: Python integer, dimensionality of the output space.
+    activation: Activation function (`callable`).
+    activity_regularizer: Regularizer function for the output.
+    kernel_posterior_fn: `callable` returning posterior.
+    kernel_posterior_tensor_fn: `callable` operating on posterior.
+    kernel_prior_fn: `callable` returning prior.
+    kernel_divergence_fn: `callable` returning divergence.
+    bias_posterior_fn: `callable` returning posterior.
+    bias_posterior_tensor_fn: `callable` operating on posterior.
+    bias_prior_fn: `callable` returning prior.
+    bias_divergence_fn: `callable` returning divergence.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tfp.layers.DenseReparameterization(
+      512, activation=tf.nn.relu)(features)
+  logits = tfp.layers.DenseReparameterization(10)(net)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses reparameterization gradients to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+
+  [1]: "Auto-Encoding Variational Bayes."
+        Diederik P. Kingma, Max Welling.
+        International Conference on Learning Representations, 2014.
+  """
+
+  def __init__(
+      self,
+      units,
+      activation=None,
+      activity_regularizer=None,
+      trainable=True,
+      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+      kernel_posterior_tensor_fn=lambda d: d.sample(),
+      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      bias_posterior_fn=layers_util.default_mean_field_normal_fn(
+          is_singular=True),
+      bias_posterior_tensor_fn=lambda d: d.sample(),
+      bias_prior_fn=None,
+      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      name=None,
+      **kwargs):
+    super(DenseReparameterization, self).__init__(
+        units=units,
+        activation=activation,
+        activity_regularizer=activity_regularizer,
+        trainable=trainable,
+        kernel_posterior_fn=kernel_posterior_fn,
+        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+        kernel_prior_fn=kernel_prior_fn,
+        kernel_divergence_fn=kernel_divergence_fn,
+        bias_posterior_fn=bias_posterior_fn,
+        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+        bias_prior_fn=bias_prior_fn,
+        bias_divergence_fn=bias_divergence_fn,
+        name=name,
+        **kwargs)
+
+  def _apply_variational_kernel(self, inputs):
+    self.kernel_posterior_tensor = self.kernel_posterior_tensor_fn(
+        self.kernel_posterior)
+    self.kernel_posterior_affine = None
+    self.kernel_posterior_affine_tensor = None
+    return self._matmul(inputs, self.kernel_posterior_tensor)
+
+
+def dense_reparameterization(
+    inputs,
+    units,
+    activation=None,
+    activity_regularizer=None,
+    trainable=True,
+    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+    kernel_posterior_tensor_fn=lambda d: d.sample(),
+    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    bias_posterior_fn=layers_util.default_mean_field_normal_fn(is_singular=True),  # pylint: disable=line-too-long
+    bias_posterior_tensor_fn=lambda d: d.sample(),
+    bias_prior_fn=None,
+    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    name=None,
+    reuse=None):
+  """Densely-connected layer with reparameterization estimator.
+
+  This layer implements the Bayesian variational inference analogue to
+  a dense layer by assuming the `kernel` and/or the `bias` are drawn
+  from distributions. By default, the layer implements a stochastic
+  forward pass via sampling from the kernel and bias posteriors,
+
+  ```none
+  kernel, bias ~ posterior
+  outputs = activation(matmul(inputs, kernel) + bias)
+  ```
+
+  It uses the reparameterization estimator [1], which performs a Monte Carlo
+  approximation of the distribution integrating over the `kernel` and
+  `bias`.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Args:
+    inputs: Tensor input.
+    units: Integer or Long, dimensionality of the output space.
+    activation: Activation function (`callable`). Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    name: Python `str`, the name of the layer. Layers with the same name will
+      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
+      such cases.
+    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
+      layer by the same name.
+
+  Returns:
+    output: `Tensor` representing a the affine transformed input under a random
+      draw from the surrogate posterior distribution.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tfp.layers.dense_reparameterization(
+      features, 512, activation=tf.nn.relu)
+  logits = tfp.layers.dense_reparameterization(net, 10)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses reparameterization gradients to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+
+  [1]: "Auto-Encoding Variational Bayes."
+        Diederik P. Kingma, Max Welling.
+        International Conference on Learning Representations, 2014.
+  """
+  layer = DenseReparameterization(
+      units,
+      activation=activation,
+      activity_regularizer=activity_regularizer,
+      trainable=trainable,
+      kernel_posterior_fn=kernel_posterior_fn,
+      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+      kernel_prior_fn=kernel_prior_fn,
+      kernel_divergence_fn=kernel_divergence_fn,
+      bias_posterior_fn=bias_posterior_fn,
+      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+      bias_prior_fn=bias_prior_fn,
+      bias_divergence_fn=bias_divergence_fn,
+      name=name,
+      dtype=inputs.dtype.base_dtype,
+      _scope=name,
+      _reuse=reuse)
+  return layer.apply(inputs)
+
+
+class DenseLocalReparameterization(_DenseVariational):
+  """Densely-connected layer class with local reparameterization estimator.
+
+  This layer implements the Bayesian variational inference analogue to
+  a dense layer by assuming the `kernel` and/or the `bias` are drawn
+  from distributions. By default, the layer implements a stochastic
+  forward pass via sampling from the kernel and bias posteriors,
+
+  ```none
+  kernel, bias ~ posterior
+  outputs = activation(matmul(inputs, kernel) + bias)
+  ```
+
+  It uses the local reparameterization estimator [1], which performs a
+  Monte Carlo approximation of the distribution on the hidden units
+  induced by the `kernel` and `bias`.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Args:
+    units: Integer or Long, dimensionality of the output space.
+    activation: Activation function (`callable`). Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    name: Python `str`, the name of the layer. Layers with the same name will
+      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
+      such cases.
+    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
+      layer by the same name.
+
+  Properties:
+    units: Python integer, dimensionality of the output space.
+    activation: Activation function (`callable`).
+    activity_regularizer: Regularizer function for the output.
+    kernel_posterior_fn: `callable` returning posterior.
+    kernel_posterior_tensor_fn: `callable` operating on posterior.
+    kernel_prior_fn: `callable` returning prior.
+    kernel_divergence_fn: `callable` returning divergence.
+    bias_posterior_fn: `callable` returning posterior.
+    bias_posterior_tensor_fn: `callable` operating on posterior.
+    bias_prior_fn: `callable` returning prior.
+    bias_divergence_fn: `callable` returning divergence.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tfp.layers.DenseLocalReparameterization(
+      512, activation=tf.nn.relu)(features)
+  logits = tfp.layers.DenseLocalReparameterization(10)(net)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses local reparameterization gradients to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+
+  [1]: "Variational Dropout and the Local Reparameterization Trick."
+        Diederik P. Kingma, Tim Salimans, Max Welling.
+        Neural Information Processing Systems, 2015.
+  """
+
+  def __init__(
+      self,
+      units,
+      activation=None,
+      activity_regularizer=None,
+      trainable=True,
+      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+      kernel_posterior_tensor_fn=lambda d: d.sample(),
+      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      bias_posterior_fn=layers_util.default_mean_field_normal_fn(
+          is_singular=True),
+      bias_posterior_tensor_fn=lambda d: d.sample(),
+      bias_prior_fn=None,
+      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      name=None,
+      **kwargs):
+    super(DenseLocalReparameterization, self).__init__(
+        units=units,
+        activation=activation,
+        activity_regularizer=activity_regularizer,
+        trainable=trainable,
+        kernel_posterior_fn=kernel_posterior_fn,
+        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+        kernel_prior_fn=kernel_prior_fn,
+        kernel_divergence_fn=kernel_divergence_fn,
+        bias_posterior_fn=bias_posterior_fn,
+        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+        bias_prior_fn=bias_prior_fn,
+        bias_divergence_fn=bias_divergence_fn,
+        name=name,
+        **kwargs)
+
+  def _apply_variational_kernel(self, inputs):
+    if (not isinstance(self.kernel_posterior, independent_lib.Independent) or
+        not isinstance(self.kernel_posterior.distribution, normal_lib.Normal)):
+      raise TypeError(
+          "`DenseLocalReparameterization` requires "
+          "`kernel_posterior_fn` produce an instance of "
+          "`tf.distributions.Independent(tf.distributions.Normal)` "
+          "(saw: \"{}\").".format(self.kernel_posterior.name))
+    self.kernel_posterior_affine = normal_lib.Normal(
+        loc=self._matmul(inputs, self.kernel_posterior.distribution.loc),
+        scale=standard_ops.sqrt(self._matmul(
+            standard_ops.square(inputs),
+            standard_ops.square(self.kernel_posterior.distribution.scale))))
+    self.kernel_posterior_affine_tensor = (
+        self.kernel_posterior_tensor_fn(self.kernel_posterior_affine))
+    self.kernel_posterior_tensor = None
+    return self.kernel_posterior_affine_tensor
+
+
+def dense_local_reparameterization(
+    inputs,
+    units,
+    activation=None,
+    activity_regularizer=None,
+    trainable=True,
+    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+    kernel_posterior_tensor_fn=lambda d: d.sample(),
+    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    bias_posterior_fn=layers_util.default_mean_field_normal_fn(
+        is_singular=True),
+    bias_posterior_tensor_fn=lambda d: d.sample(),
+    bias_prior_fn=None,
+    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    name=None,
+    reuse=None):
+  """Densely-connected layer with local reparameterization estimator.
+
+  This layer implements the Bayesian variational inference analogue to
+  a dense layer by assuming the `kernel` and/or the `bias` are drawn
+  from distributions. By default, the layer implements a stochastic
+  forward pass via sampling from the kernel and bias posteriors,
+
+  ```none
+  kernel, bias ~ posterior
+  outputs = activation(matmul(inputs, kernel) + bias)
+  ```
+
+  It uses the local reparameterization estimator [1], which performs a
+  Monte Carlo approximation of the distribution on the hidden units
+  induced by the `kernel` and `bias`.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Args:
+    inputs: Tensor input.
+    units: Integer or Long, dimensionality of the output space.
+    activation: Activation function (`callable`). Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    name: Python `str`, the name of the layer. Layers with the same name will
+      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
+      such cases.
+    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
+      layer by the same name.
+
+  Returns:
+    output: `Tensor` representing a the affine transformed input under a random
+      draw from the surrogate posterior distribution.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tfp.layers.dense_local_reparameterization(
+      features, 512, activation=tf.nn.relu)
+  logits = tfp.layers.dense_local_reparameterization(net, 10)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses local reparameterization gradients to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+
+  [1]: "Variational Dropout and the Local Reparameterization Trick."
+        Diederik P. Kingma, Tim Salimans, Max Welling.
+        Neural Information Processing Systems, 2015.
+  """
+  layer = DenseLocalReparameterization(
+      units,
+      activation=activation,
+      activity_regularizer=activity_regularizer,
+      trainable=trainable,
+      kernel_posterior_fn=kernel_posterior_fn,
+      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+      kernel_prior_fn=kernel_prior_fn,
+      kernel_divergence_fn=kernel_divergence_fn,
+      bias_posterior_fn=bias_posterior_fn,
+      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+      bias_prior_fn=bias_prior_fn,
+      bias_divergence_fn=bias_divergence_fn,
+      name=name,
+      dtype=inputs.dtype.base_dtype,
+      _scope=name,
+      _reuse=reuse)
+  return layer.apply(inputs)
+
+
+class DenseFlipout(_DenseVariational):
+  """Densely-connected layer class with Flipout estimator.
+
+  This layer implements the Bayesian variational inference analogue to
+  a dense layer by assuming the `kernel` and/or the `bias` are drawn
+  from distributions. By default, the layer implements a stochastic
+  forward pass via sampling from the kernel and bias posteriors,
+
+  ```none
+  kernel, bias ~ posterior
+  outputs = activation(matmul(inputs, kernel) + bias)
+  ```
+
+  It uses the Flipout estimator [1], which performs a Monte Carlo
+  approximation of the distribution integrating over the `kernel` and
+  `bias`. Flipout uses roughly twice as many floating point operations
+  as the reparameterization estimator but has the advantage of
+  significantly lower variance.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Args:
+    units: Integer or Long, dimensionality of the output space.
+    activation: Activation function (`callable`). Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    seed: Python scalar `int` which initializes the random number
+      generator. Default value: `None` (i.e., use global seed).
+    name: Python `str`, the name of the layer. Layers with the same name will
+      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
+      such cases.
+    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
+      layer by the same name.
+
+  Properties:
+    units: Python integer, dimensionality of the output space.
+    activation: Activation function (`callable`).
+    activity_regularizer: Regularizer function for the output.
+    kernel_posterior_fn: `callable` returning posterior.
+    kernel_posterior_tensor_fn: `callable` operating on posterior.
+    kernel_prior_fn: `callable` returning prior.
+    kernel_divergence_fn: `callable` returning divergence.
+    bias_posterior_fn: `callable` returning posterior.
+    bias_posterior_tensor_fn: `callable` operating on posterior.
+    bias_prior_fn: `callable` returning prior.
+    bias_divergence_fn: `callable` returning divergence.
+    seed: Python integer, used to create random seeds.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tfp.layers.DenseFlipout(
+      512, activation=tf.nn.relu)(features)
+  logits = tfp.layers.DenseFlipout(10)(net)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses the Flipout gradient estimator to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+
+  [1]: "Flipout: Efficient Pseudo-Independent Weight Perturbations on
+        Mini-Batches."
+        Anonymous. OpenReview, 2017.
+        https://openreview.net/forum?id=rJnpifWAb
+  """
+
+  def __init__(
+      self,
+      units,
+      activation=None,
+      activity_regularizer=None,
+      trainable=True,
+      kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+      kernel_posterior_tensor_fn=lambda d: d.sample(),
+      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      bias_posterior_fn=layers_util.default_mean_field_normal_fn(
+          is_singular=True),
+      bias_posterior_tensor_fn=lambda d: d.sample(),
+      bias_prior_fn=None,
+      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      seed=None,
+      name=None,
+      **kwargs):
+    super(DenseFlipout, self).__init__(
+        units=units,
+        activation=activation,
+        activity_regularizer=activity_regularizer,
+        trainable=trainable,
+        kernel_posterior_fn=kernel_posterior_fn,
+        kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+        kernel_prior_fn=kernel_prior_fn,
+        kernel_divergence_fn=kernel_divergence_fn,
+        bias_posterior_fn=bias_posterior_fn,
+        bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+        bias_prior_fn=bias_prior_fn,
+        bias_divergence_fn=bias_divergence_fn,
+        name=name,
+        **kwargs)
+    self.seed = seed
+
+  def _apply_variational_kernel(self, inputs):
+    if (not isinstance(self.kernel_posterior, independent_lib.Independent) or
+        not isinstance(self.kernel_posterior.distribution, normal_lib.Normal)):
+      raise TypeError(
+          "`DenseFlipout` requires "
+          "`kernel_posterior_fn` produce an instance of "
+          "`tf.distributions.Independent(tf.distributions.Normal)` "
+          "(saw: \"{}\").".format(self.kernel_posterior.name))
+    self.kernel_posterior_affine = normal_lib.Normal(
+        loc=array_ops.zeros_like(self.kernel_posterior.distribution.loc),
+        scale=self.kernel_posterior.distribution.scale)
+    self.kernel_posterior_affine_tensor = (
+        self.kernel_posterior_tensor_fn(self.kernel_posterior_affine))
+    self.kernel_posterior_tensor = None
+
+    input_shape = array_ops.shape(inputs)
+    batch_shape = input_shape[:-1]
+
+    sign_input = layers_util.random_sign(
+        input_shape,
+        dtype=inputs.dtype,
+        seed=self.seed)
+    sign_output = layers_util.random_sign(
+        array_ops.concat([batch_shape,
+                          array_ops.expand_dims(self.units, 0)], 0),
+        dtype=inputs.dtype,
+        seed=distribution_util.gen_new_seed(
+            self.seed, salt="dense_flipout"))
+    perturbed_inputs = self._matmul(
+        inputs * sign_input, self.kernel_posterior_affine_tensor) * sign_output
+
+    outputs = self._matmul(inputs, self.kernel_posterior.distribution.loc)
+    outputs += perturbed_inputs
+    return outputs
+
+
+def dense_flipout(
+    inputs,
+    units,
+    activation=None,
+    activity_regularizer=None,
+    trainable=True,
+    kernel_posterior_fn=layers_util.default_mean_field_normal_fn(),
+    kernel_posterior_tensor_fn=lambda d: d.sample(),
+    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    bias_posterior_fn=layers_util.default_mean_field_normal_fn(
+        is_singular=True),
+    bias_posterior_tensor_fn=lambda d: d.sample(),
+    bias_prior_fn=None,
+    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    seed=None,
+    name=None,
+    reuse=None):
+  """Densely-connected layer with Flipout estimator.
+
+  This layer implements the Bayesian variational inference analogue to
+  a dense layer by assuming the `kernel` and/or the `bias` are drawn
+  from distributions. By default, the layer implements a stochastic
+  forward pass via sampling from the kernel and bias posteriors,
+
+  ```none
+  kernel, bias ~ posterior
+  outputs = activation(matmul(inputs, kernel) + bias)
+  ```
+
+  It uses the Flipout estimator [1], which performs a Monte Carlo
+  approximation of the distribution integrating over the `kernel` and
+  `bias`. Flipout uses roughly twice as many floating point operations
+  as the reparameterization estimator but has the advantage of
+  significantly lower variance.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  distributions.
+
+  Args:
+    inputs: Tensor input.
+    units: Integer or Long, dimensionality of the output space.
+    activation: Activation function (`callable`). Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    seed: Python scalar `int` which initializes the random number
+      generator. Default value: `None` (i.e., use global seed).
+    name: Python `str`, the name of the layer. Layers with the same name will
+      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
+      such cases.
+    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
+      layer by the same name.
+
+  Returns:
+    output: `Tensor` representing a the affine transformed input under a random
+      draw from the surrogate posterior distribution.
+
+  #### Examples
+
+  We illustrate a Bayesian neural network with [variational inference](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods),
+  assuming a dataset of `features` and `labels`.
+
+  ```python
+  tfp = tf.contrib.bayesflow
+
+  net = tfp.layers.dense_flipout(
+      features, 512, activation=tf.nn.relu)
+  logits = tfp.layers.dense_flipout(net, 10)
+  neg_log_likelihood = tf.nn.softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits)
+  kl = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
+  loss = neg_log_likelihood + kl
+  train_op = tf.train.AdamOptimizer().minimize(loss)
+  ```
+
+  It uses the Flipout gradient estimator to minimize the
+  Kullback-Leibler divergence up to a constant, also known as the
+  negative Evidence Lower Bound. It consists of the sum of two terms:
+  the expected negative log-likelihood, which we approximate via
+  Monte Carlo; and the KL divergence, which is added via regularizer
+  terms which are arguments to the layer.
+
+  [1]: "Flipout: Efficient Pseudo-Independent Weight Perturbations on
+        Mini-Batches."
+        Anonymous. OpenReview, 2017.
+        https://openreview.net/forum?id=rJnpifWAb
+  """
+  layer = DenseFlipout(
+      units,
+      activation=activation,
+      activity_regularizer=activity_regularizer,
+      trainable=trainable,
+      kernel_posterior_fn=kernel_posterior_fn,
+      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+      kernel_prior_fn=kernel_prior_fn,
+      kernel_divergence_fn=kernel_divergence_fn,
+      bias_posterior_fn=bias_posterior_fn,
+      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+      bias_prior_fn=bias_prior_fn,
+      bias_divergence_fn=bias_divergence_fn,
+      seed=seed,
+      name=name,
+      dtype=inputs.dtype.base_dtype,
+      _scope=name,
+      _reuse=reuse)
+  return layer.apply(inputs)
diff --git a/tensorflow/contrib/bayesflow/python/ops/layers_dense_variational_impl.py b/tensorflow/contrib/bayesflow/python/ops/layers_dense_variational_impl.py
deleted file mode 100644
index b05ce0ffc1dd55ffb029b339a846a9aa5c877620..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/layers_dense_variational_impl.py
+++ /dev/null
@@ -1,797 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Dense Bayesian layer using KL-divergence based variational inference.
-
-@@DenseVariational
-@@dense_variational
-
-@@default_loc_scale_fn
-@@default_mean_field_normal_fn
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.distributions.python.ops import deterministic as deterministic_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.layers import base as layers_lib
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import standard_ops
-from tensorflow.python.ops.distributions import kullback_leibler as kl_lib
-from tensorflow.python.ops.distributions import normal as normal_lib
-
-
-__all__ = [
-    "DenseVariational",
-    "dense_variational",
-    "default_loc_scale_fn",
-    "default_mean_field_normal_fn",
-]
-
-
-def default_loc_scale_fn(
-    is_singular=False,
-    loc_initializer=init_ops.random_normal_initializer(stddev=0.1),
-    untransformed_scale_initializer=init_ops.random_normal_initializer(
-        mean=-3., stddev=0.1),
-    loc_regularizer=None,
-    untransformed_scale_regularizer=None,
-    loc_constraint=None,
-    untransformed_scale_constraint=None):
-  """Makes closure which creates `loc`, `scale` params from `tf.get_variable`.
-
-  This function produces a closure which produces `loc`, `scale` using
-  `tf.get_variable`. The closure accepts the following arguments:
-
-    dtype: Type of parameter's event.
-    shape: Python `list`-like representing the parameter's event shape.
-    name: Python `str` name prepended to any created (or existing)
-      `tf.Variable`s.
-    trainable: Python `bool` indicating all created `tf.Variable`s should be
-      added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`.
-    add_variable_fn: `tf.get_variable`-like `callable` used to create (or
-      access existing) `tf.Variable`s.
-
-  Args:
-    is_singular: Python `bool` indicating if `scale is None`. Default: `False`.
-    loc_initializer: Initializer function for the `loc` parameters.
-      The default is `tf.random_normal_initializer(mean=0., stddev=0.1)`.
-    untransformed_scale_initializer: Initializer function for the `scale`
-      parameters. Default value: `tf.random_normal_initializer(mean=-3.,
-      stddev=0.1)`. This implies the softplus transformed result has mean
-      approximately `0.05` and std. deviation approximately `0.005`.
-    loc_regularizer: Regularizer function for the `loc` parameters.
-      The default (`None`) is to use the `tf.get_variable` default.
-    untransformed_scale_regularizer: Regularizer function for the `scale`
-      parameters. The default (`None`) is to use the `tf.get_variable` default.
-    loc_constraint: An optional projection function to be applied to the
-      loc after being updated by an `Optimizer`. The function must take as input
-      the unprojected variable and must return the projected variable (which
-      must have the same shape). Constraints are not safe to use when doing
-      asynchronous distributed training.
-      The default (`None`) is to use the `tf.get_variable` default.
-    untransformed_scale_constraint: An optional projection function to be
-      applied to the `scale` parameters after being updated by an `Optimizer`
-      (e.g. used to implement norm constraints or value constraints). The
-      function must take as input the unprojected variable and must return the
-      projected variable (which must have the same shape). Constraints are not
-      safe to use when doing asynchronous distributed training. The default
-      (`None`) is to use the `tf.get_variable` default.
-
-  Returns:
-    default_loc_scale_fn: Python `callable` which instantiates `loc`, `scale`
-    parameters from args: `dtype, shape, name, trainable, add_variable_fn`.
-  """
-  def _fn(dtype, shape, name, trainable, add_variable_fn):
-    """Creates `loc`, `scale` parameters."""
-    loc = add_variable_fn(
-        name=name + "_loc",
-        shape=shape,
-        initializer=loc_initializer,
-        regularizer=loc_regularizer,
-        constraint=loc_constraint,
-        dtype=dtype,
-        trainable=trainable)
-    if is_singular:
-      return loc, None
-    untransformed_scale = add_variable_fn(
-        name=name + "_untransformed_scale",
-        shape=shape,
-        initializer=untransformed_scale_initializer,
-        regularizer=untransformed_scale_regularizer,
-        constraint=untransformed_scale_constraint,
-        dtype=dtype,
-        trainable=trainable)
-    scale = (np.finfo(dtype.as_numpy_dtype).eps +
-             nn_ops.softplus(untransformed_scale))
-    return loc, scale
-  return _fn
-
-
-def default_mean_field_normal_fn(
-    is_singular=False,
-    loc_initializer=None,
-    untransformed_scale_initializer=None,
-    loc_regularizer=None,
-    untransformed_scale_regularizer=None,
-    loc_constraint=None,
-    untransformed_scale_constraint=None):
-  """Creates a function to build Normal distributions with trainable params.
-
-  This function produces a closure which produces `tf.distributions.Normal`
-  parameterized by a loc` and `scale` each created using `tf.get_variable`. The
-  produced closure accepts the following arguments:
-
-    name: Python `str` name prepended to any created (or existing)
-      `tf.Variable`s.
-    shape: Python `list`-like representing the parameter's event shape.
-    dtype: Type of parameter's event.
-    trainable: Python `bool` indicating all created `tf.Variable`s should be
-      added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`.
-    add_variable_fn: `tf.get_variable`-like `callable` used to create (or
-      access existing) `tf.Variable`s.
-
-  Args:
-    is_singular: Python `bool` if `True`, forces the special case limit of
-      `scale->0`, i.e., a `Deterministic` distribution.
-    loc_initializer: Initializer function for the `loc` parameters.
-      If `None` (default), values are initialized using the default
-      initializer used by `tf.get_variable`.
-    untransformed_scale_initializer: Initializer function for the `scale`
-      parameters. If `None` (default), values are initialized using the default
-      initializer used by `tf.get_variable`.
-    loc_regularizer: Regularizer function for the `loc` parameters.
-    untransformed_scale_regularizer: Regularizer function for the `scale`
-      parameters.
-    loc_constraint: An optional projection function to be applied to the
-      loc after being updated by an `Optimizer`. The function must take as input
-      the unprojected variable and must return the projected variable (which
-      must have the same shape). Constraints are not safe to use when doing
-      asynchronous distributed training.
-    untransformed_scale_constraint: An optional projection function to be
-      applied to the `scale` parameters after being updated by an `Optimizer`
-      (e.g. used to implement norm constraints or value constraints). The
-      function must take as input the unprojected variable and must return the
-      projected variable (which must have the same shape). Constraints are not
-      safe to use when doing asynchronous distributed training.
-
-  Returns:
-    make_normal_fn: Python `callable` which creates a `tf.distributions.Normal`
-      using from args: `dtype, shape, name, trainable, add_variable_fn`.
-  """
-  loc_scale_fn_ = default_loc_scale_fn(
-      is_singular,
-      loc_initializer,
-      untransformed_scale_initializer,
-      loc_regularizer,
-      untransformed_scale_regularizer,
-      loc_constraint,
-      untransformed_scale_constraint)
-  def _fn(dtype, shape, name, trainable, add_variable_fn):
-    """Creates a batch of `Deterministic` or `Normal` distributions."""
-    loc, scale = loc_scale_fn_(dtype, shape, name, trainable, add_variable_fn)
-    if scale is None:
-      return deterministic_lib.Deterministic(loc=loc)
-    return normal_lib.Normal(loc=loc, scale=scale)
-  return _fn
-
-
-class DenseVariational(layers_lib.Layer):
-  """Densely-connected variational class.
-
-  This layer implements the Bayesian variational inference analogue to:
-  `outputs = activation(matmul(inputs, kernel) + bias)`
-  by assuming the `kernel` and/or the `bias` are random variables.
-
-  The layer implements a stochastic dense calculation by making a Monte Carlo
-  approximation of a [variational Bayesian method based on KL divergence](
-  https://en.wikipedia.org/wiki/Variational_Bayesian_methods), i.e.,
-
-  ```none
-  -log p(y|x) = -log int_{R**d} p(y|x,w) p(w) dw
-              = -log int_{R**d} p(y,w|x) q(w|x) / q(w|x) dw
-             <= E_q(W|x)[-log p(y,W|x) + log q(W|x)]       # Jensen's
-              = E_q(W|x)[-log p(y|x,W)] + KL[q(W|x), p(W)]
-             ~= m**-1 sum{ -log(y|x,w[j]) : w[j] ~ q(W|x), j=1..m }
-                 + KL[q(W|x), p(W)]
-  ```
-
-  where `W` denotes the (independent) `kernel` and `bias` random variables, `w`
-  is a random variate or outcome of `W`, `y` is the label, `x` is the evidence`,
-  and `~=` denotes an approximation which becomes exact as `m->inf`. The above
-  bound is sometimes referred to as the negative Evidence Lower BOund or
-  negative [ELBO](https://arxiv.org/abs/1601.00670). In context of a DNN, this
-  layer is appropriate to use when the final loss is a negative log-likelihood.
-
-  The Monte-Carlo sum portion is used for the feed-forward calculation of the
-  DNN. The KL divergence portion can be added to the final loss via:
-  `loss += sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))`.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  random variables (which together comprise `W`).
-
-  Args:
-    units: Integer or Long, dimensionality of the output space.
-    activation: Activation function (`callable`). Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_use_local_reparameterization: Python `bool` indicating whether
-      `kernel` calculation should employ the Local Reparameterization Trick.
-      When `True`, `kernel_posterior_fn` must create an instance of
-      `tf.distributions.Normal`.
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    name: Python `str`, the name of the layer. Layers with the same name will
-      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
-      such cases.
-    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
-      layer by the same name.
-
-  Properties:
-    units: Python integer, dimensionality of the output space.
-    activation: Activation function (`callable`).
-    activity_regularizer: Regularizer function for the output.
-    kernel_use_local_reparameterization: Python `bool` indicating whether
-      `kernel` calculation should employ the Local Reparameterization Trick.
-    kernel: `VariationalKernelParamater` instance containing all `kernel`
-      related properties and `callable`s.
-    bias: `VariationalParameter` instance containing all `kernel`
-      related properties and `callable`s.
-  """
-
-  def __init__(
-      self,
-      units,
-      activation=None,
-      activity_regularizer=None,
-      trainable=True,
-      kernel_use_local_reparameterization=True,
-      kernel_posterior_fn=default_mean_field_normal_fn(),
-      kernel_posterior_tensor_fn=lambda d: d.sample(),
-      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      bias_posterior_fn=default_mean_field_normal_fn(is_singular=True),
-      bias_posterior_tensor_fn=lambda d: d.sample(),
-      bias_prior_fn=None,
-      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-      name=None,
-      **kwargs):
-    super(DenseVariational, self).__init__(
-        trainable=trainable,
-        name=name,
-        activity_regularizer=activity_regularizer,
-        **kwargs)
-    self._units = units
-    self._activation = activation
-    self._input_spec = layers_lib.InputSpec(min_ndim=2)
-    self._kernel_use_local_reparameterization = (
-        kernel_use_local_reparameterization)
-    self._kernel = VariationalKernelParameter(
-        kernel_posterior_fn,
-        kernel_posterior_tensor_fn,
-        kernel_prior_fn,
-        kernel_divergence_fn)
-    self._bias = VariationalParameter(
-        bias_posterior_fn,
-        bias_posterior_tensor_fn,
-        bias_prior_fn,
-        bias_divergence_fn)
-
-  @property
-  def units(self):
-    return self._units
-
-  @property
-  def activation(self):
-    return self._activation
-
-  @property
-  def input_spec(self):
-    return self._input_spec
-
-  @input_spec.setter
-  def input_spec(self, value):
-    self._input_spec = value
-
-  @property
-  def kernel_use_local_reparameterization(self):
-    return self._kernel_use_local_reparameterization
-
-  @property
-  def kernel(self):
-    return self._kernel
-
-  @property
-  def bias(self):
-    return self._bias
-
-  def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    in_size = input_shape.with_rank_at_least(2)[-1].value
-    if in_size is None:
-      raise ValueError("The last dimension of the inputs to `Dense` "
-                       "should be defined. Found `None`.")
-    self._input_spec = layers_lib.InputSpec(min_ndim=2, axes={-1: in_size})
-    dtype = dtypes.as_dtype(self.dtype)
-
-    # Must have a posterior kernel.
-    self.kernel.posterior = self.kernel.posterior_fn(
-        dtype, [in_size, self.units], "kernel_posterior",
-        self.trainable, self.add_variable)
-
-    if self.kernel.prior_fn is None:
-      self.kernel_prior = None
-    else:
-      self.kernel.prior = self.kernel.prior_fn(
-          dtype, [in_size, self.units], "kernel_prior",
-          self.trainable, self.add_variable)
-    self._built_kernel_divergence = False
-
-    if self.bias.posterior_fn is None:
-      self.bias.posterior = None
-    else:
-      self.bias.posterior = self.bias.posterior_fn(
-          dtype, [self.units], "bias_posterior",
-          self.trainable, self.add_variable)
-
-    if self.bias.prior_fn is None:
-      self.bias.prior = None
-    else:
-      self.bias.prior = self.bias.prior_fn(
-          dtype, [self.units], "bias_prior",
-          self.trainable, self.add_variable)
-    self._built_bias_divergence = False
-
-    self.built = True
-
-  def call(self, inputs):
-    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
-
-    outputs = self._apply_variational_kernel(inputs)
-    outputs = self._apply_variational_bias(outputs)
-    if self.activation is not None:
-      outputs = self.activation(outputs)  # pylint: disable=not-callable
-    if not self._built_kernel_divergence:
-      self._apply_divergence(self.kernel, name="divergence_kernel")
-      self._built_kernel_divergence = True
-    if not self._built_bias_divergence:
-      self._apply_divergence(self.bias, name="divergence_bias")
-      self._built_bias_divergence = True
-    return outputs
-
-  def _apply_variational_kernel(self, inputs):
-    if not self.kernel_use_local_reparameterization:
-      self.kernel.posterior_tensor = self.kernel.posterior_tensor_fn(
-          self.kernel.posterior)
-      self.kernel.posterior_affine = None
-      self.kernel.posterior_affine_tensor = None
-      return self._matmul(inputs, self.kernel.posterior_tensor)
-    if not isinstance(self.kernel.posterior, normal_lib.Normal):
-      raise TypeError("`kernel_use_local_reparameterization=True` requires "
-                      "`kernel_posterior_fn` produce an instance of "
-                      "`tf.distributions.Normal` (saw: \"{}\").".format(
-                          type(self.kernel.posterior).__name__))
-    self.kernel.posterior_affine = normal_lib.Normal(
-        loc=self._matmul(inputs, self.kernel.posterior.loc),
-        scale=standard_ops.sqrt(self._matmul(
-            standard_ops.square(inputs),
-            standard_ops.square(self.kernel.posterior.scale))))
-    self.kernel.posterior_affine_tensor = (
-        self.kernel.posterior_tensor_fn(self.kernel.posterior_affine))
-    self.kernel.posterior_tensor = None
-    return self.kernel.posterior_affine_tensor
-
-  def _apply_variational_bias(self, inputs):
-    if self.bias.posterior is None:
-      self.bias.posterior_tensor = None
-      return inputs
-    self.bias.posterior_tensor = self.bias.posterior_tensor_fn(
-        self.bias.posterior)
-    return nn.bias_add(inputs, self.bias.posterior_tensor)
-
-  def _apply_divergence(self, param, name):
-    if (param.divergence_fn is None or
-        param.posterior is None or
-        param.prior is None):
-      param.divergence = None
-      return
-    param.divergence = standard_ops.identity(
-        param.divergence_fn(
-            param.posterior, param.prior, param.posterior_tensor),
-        name=name)
-    self.add_loss(param.divergence)
-
-  def _matmul(self, inputs, kernel):
-    if inputs.shape.ndims <= 2:
-      return standard_ops.matmul(inputs, kernel)
-    # To handle broadcasting, we must use `tensordot`.
-    return standard_ops.tensordot(inputs, kernel, axes=[[-1], [0]])
-
-  def _compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).with_rank_at_least(2)
-    if input_shape[-1].value is None:
-      raise ValueError(
-          "The innermost dimension of input_shape must be defined, "
-          "but saw: {}".format(input_shape))
-    return input_shape[:-1].concatenate(self.units)
-
-
-def dense_variational(
-    inputs,
-    units,
-    activation=None,
-    activity_regularizer=None,
-    trainable=True,
-    kernel_use_local_reparameterization=True,
-    kernel_posterior_fn=default_mean_field_normal_fn(),
-    kernel_posterior_tensor_fn=lambda d: d.sample(),
-    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
-        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
-    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-    bias_posterior_fn=default_mean_field_normal_fn(is_singular=True),
-    bias_posterior_tensor_fn=lambda d: d.sample(),
-    bias_prior_fn=None,
-    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
-    name=None,
-    reuse=None):
-  """Densely-connected variational layer.
-
-  This layer implements the Bayesian variational inference analogue to:
-  `outputs = activation(matmul(inputs, kernel) + bias)`
-  by assuming the `kernel` and/or the `bias` are random variables.
-
-  The layer implements a stochastic dense calculation by making a Monte Carlo
-  approximation of a [variational Bayesian method based on KL divergence](
-  https://en.wikipedia.org/wiki/Variational_Bayesian_methods), i.e.,
-
-  ```none
-  -log p(y|x) = -log int_{R**d} p(y|x,w) p(w) dw
-              = -log int_{R**d} p(y,w|x) q(w|x) / q(w|x) dw
-             <= E_q(W|x)[-log p(y,W|x) + log q(W|x)]       # Jensen's
-              = E_q(W|x)[-log p(y|x,W)] + KL[q(W|x), p(W)]
-             ~= m**-1 sum{ -log(y|x,w[j]) : w[j] ~ q(W|x), j=1..m }
-                 + KL[q(W|x), p(W)]
-  ```
-
-  where `W` denotes the (independent) `kernel` and `bias` random variables, `w`
-  is a random variate or outcome of `W`, `y` is the label, `x` is the evidence`,
-  and `~=` denotes an approximation which becomes exact as `m->inf`. The above
-  bound is sometimes referred to as the negative Evidence Lower BOund or
-  negative [ELBO](https://arxiv.org/abs/1601.00670). In context of a DNN, this
-  layer is appropriate to use when the final loss is a negative log-likelihood.
-
-  The Monte-Carlo sum portion is used for the feed-forward calculation of the
-  DNN. The KL divergence portion can be added to the final loss via:
-  `loss += sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))`.
-
-  The arguments permit separate specification of the surrogate posterior
-  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
-  random variables (which together comprise `W`).
-
-  Args:
-    inputs: Tensor input.
-    units: Integer or Long, dimensionality of the output space.
-    activation: Activation function (`callable`). Set it to None to maintain a
-      linear activation.
-    activity_regularizer: Regularizer function for the output.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    kernel_use_local_reparameterization: Python `bool` indicating whether
-      `kernel` calculation should employ the Local Reparameterization Trick.
-      When `True`, `kernel_posterior_fn` must create an instance of
-      `tf.distributions.Normal`.
-    kernel_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `kernel` parameter. Default value:
-      `default_mean_field_normal_fn()`.
-    kernel_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    kernel_prior_fn: Python `callable` which creates `tf.distributions`
-      instance. See `default_mean_field_normal_fn` docstring for required
-      parameter signature.
-      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
-    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    bias_posterior_fn: Python `callable` which creates
-      `tf.distributions.Distribution` instance representing the surrogate
-      posterior of the `bias` parameter. Default value:
-      `default_mean_field_normal_fn(is_singular=True)` (which creates an
-      instance of `tf.distributions.Deterministic`).
-    bias_posterior_tensor_fn: Python `callable` which takes a
-      `tf.distributions.Distribution` instance and returns a representative
-      value. Default value: `lambda d: d.sample()`.
-    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
-      See `default_mean_field_normal_fn` docstring for required parameter
-      signature. Default value: `None` (no prior, no variational inference)
-    bias_divergence_fn: Python `callable` which takes the surrogate posterior
-      distribution, prior distribution and random variate sample(s) from the
-      surrogate posterior and computes or approximates the KL divergence. The
-      distributions are `tf.distributions.Distribution`-like instances and the
-      sample is a `Tensor`.
-    name: Python `str`, the name of the layer. Layers with the same name will
-      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
-      such cases.
-    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
-      layer by the same name.
-
-  Returns:
-    output: `Tensor` representing a the affine transformed input under a random
-      draw from the surrogate posterior distribution.
-  """
-  layer = DenseVariational(
-      units,
-      activation=activation,
-      activity_regularizer=activity_regularizer,
-      trainable=trainable,
-      kernel_use_local_reparameterization=(
-          kernel_use_local_reparameterization),
-      kernel_posterior_fn=kernel_posterior_fn,
-      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
-      kernel_prior_fn=kernel_prior_fn,
-      kernel_divergence_fn=kernel_divergence_fn,
-      bias_posterior_fn=bias_posterior_fn,
-      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
-      bias_prior_fn=bias_prior_fn,
-      bias_divergence_fn=bias_divergence_fn,
-      name=name,
-      dtype=inputs.dtype.base_dtype,
-      _scope=name,
-      _reuse=reuse)
-  return layer.apply(inputs)
-
-
-class NotSet(object):
-  """Helper to track whether a `VariationalParameter` value has been set."""
-  pass
-
-
-class VariationalParameter(object):
-  """Struct-like container of variational parameter properties.
-
-  A `VariationalParameter` is intitialized with Python `callable`s which set the
-  value of correspondingly named members. Corresponding values have "set once"
-  semantics, i.e., once set to any value they are immutable.
-  """
-
-  def __init__(
-      self,
-      posterior_fn,
-      posterior_tensor_fn,
-      prior_fn,
-      divergence_fn):
-    """Creates the `VariationalParameter` struct-like object.
-
-    Args:
-      posterior_fn: Python `callable` which creates a
-        `tf.distribution.Distribution` like object representing the posterior
-        distribution. See `VariationalParameter.posterior_fn` for `callable`'s
-        required parameters.
-      posterior_tensor_fn: Python `callable` which computes a `Tensor`
-        which represents the `posterior`.
-      prior_fn: Python `callable` which creates a
-        `tf.distribution.Distribution` like object representing the prior
-        distribution. See `VariationalParameter.prior_fn` for `callable`'s
-        required parameters.
-      divergence_fn: Python `callable` which computes the KL divergence from
-        `posterior` to `prior`. See `VariationalParameter.divergence_fn` for
-        required `callable`'s parameters.
-    """
-    self._posterior_fn = posterior_fn
-    self._posterior = NotSet()
-    self._posterior_tensor_fn = posterior_tensor_fn
-    self._posterior_tensor = NotSet()
-    self._prior_fn = prior_fn
-    self._prior = NotSet()
-    self._divergence_fn = divergence_fn
-    self._divergence = NotSet()
-    self._init_helper()
-
-  @property
-  def posterior_fn(self):
-    """`callable` which creates `tf.distributions.Distribution`-like posterior.
-
-    The `callable` must accept the following parameters:
-      name: Python `str` name prepended to any created (or existing)
-        `tf.Variable`s.
-      shape: Python `list`-like representing the parameter's event shape.
-      dtype: Type of parameter's event.
-      trainable: Python `bool` indicating all created `tf.Variable`s should be
-        added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`.
-      add_variable_fn: `tf.get_variable`-like `callable` used to create (or
-        access existing) `tf.Variable`s.
-
-    Returns:
-      posterior_fn: The Python `callable` specified in `__init__`.
-    """
-    return self._posterior_fn
-
-  @property
-  def posterior(self):
-    """`tf.distributions.Distribution`-like instance representing posterior."""
-    return self._posterior
-
-  @posterior.setter
-  def posterior(self, value):
-    """One-time setter of the `posterior` distribution."""
-    if not isinstance(self._posterior, NotSet):
-      raise ValueError("Cannot override already set attribute.")
-    self._posterior = value
-
-  @property
-  def posterior_tensor_fn(self):
-    """Creates `Tensor` representing the `posterior` distribution.
-
-    The `callable` must accept the following parameters:
-      posterior: `tf.distributions.Distribution`-like instance.
-
-    Returns:
-      posterior_tensor_fn: The Python `callable` specified in
-        `__init__`.
-    """
-    return self._posterior_tensor_fn
-
-  @property
-  def posterior_tensor(self):
-    """`Tensor` representing the `posterior` distribution."""
-    return self._posterior_tensor
-
-  @posterior_tensor.setter
-  def posterior_tensor(self, value):
-    """One-time setter of the `posterior_tensor`."""
-    if not isinstance(self._posterior_tensor, NotSet):
-      raise ValueError("Cannot override already set attribute.")
-    self._posterior_tensor = value
-
-  @property
-  def prior_fn(self):
-    """`callable` which creates `tf.distributions.Distribution`-like prior.
-
-    The `callable` must accept the following parameters:
-      name: Python `str` name prepended to any created (or existing)
-        `tf.Variable`s.
-      shape: Python `list`-like representing the parameter's event shape.
-      dtype: Type of parameter's event.
-      trainable: Python `bool` indicating all created `tf.Variable`s should be
-        added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`.
-      add_variable_fn: `tf.get_variable`-like `callable` used to create (or
-        access existing) `tf.Variable`s.
-
-    Returns:
-      prior_fn: The Python `callable` specified in `__init__`.
-    """
-    return self._prior_fn
-
-  @property
-  def prior(self):
-    """`tf.distributions.Distribution`-like instance representing posterior."""
-    return self._prior
-
-  @prior.setter
-  def prior(self, value):
-    """One-time setter of the `prior` distribution."""
-    if not isinstance(self._prior, NotSet):
-      raise ValueError("Cannot override already set attribute.")
-    self._prior = value
-
-  @property
-  def divergence_fn(self):
-    """`callable` which computes KL-divergence `Tensor` from posterior to prior.
-
-    The `callable` must accept the following parameters:
-      posterior: `tf.distributions.Distribution`-like instance.
-      prior: `tf.distributions.Distribution`-like instance.
-      posterior_tensor: `Tensor` representing value of posterior.
-
-    Returns:
-      divergence_fn: The Python `callable` specified in `__init__`.
-    """
-    return self._divergence_fn
-
-  @property
-  def divergence(self):
-    """`Tensor` representing KL-divergence from posterior to prior."""
-    return self._divergence
-
-  @divergence.setter
-  def divergence(self, value):
-    """One-time setter of the `divergence`."""
-    if not isinstance(self._divergence, NotSet):
-      raise ValueError("Cannot override already set attribute.")
-    self._divergence = value
-
-  def _init_helper(self):
-    pass
-
-
-class VariationalKernelParameter(VariationalParameter):
-  """Struct-like container of variational kernel properties.
-
-  A `VariationalKernelParameter` is intitialized with Python `callable`s which
-  set the value of correspondingly named members. Corresponding values have "set
-  once" semantics, i.e., once set to any value they are immutable.
-  """
-
-  @property
-  def posterior_affine(self):
-    """`tf.distributions.Distribution` affine transformed posterior."""
-    return self._posterior_affine
-
-  @posterior_affine.setter
-  def posterior_affine(self, value):
-    """One-time setter of `posterior_affine`."""
-    if not isinstance(self._posterior_affine, NotSet):
-      raise ValueError("Cannot override already set attribute.")
-    self._posterior_affine = value
-
-  @property
-  def posterior_affine_tensor(self):
-    """`Tensor` representing the `posterior_affine` distribution."""
-    return self._posterior_affine_tensor
-
-  @posterior_affine_tensor.setter
-  def posterior_affine_tensor(self, value):
-    """One-time setter of the `posterior_affine_tensor`."""
-    if not isinstance(self._posterior_affine_tensor, NotSet):
-      raise ValueError("Cannot override already set attribute.")
-    self._posterior_affine_tensor = value
-
-  def _init_helper(self):
-    self._posterior_affine = NotSet()
-    self._posterior_affine_tensor = NotSet()
diff --git a/tensorflow/contrib/bayesflow/python/ops/layers_util.py b/tensorflow/contrib/bayesflow/python/ops/layers_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c1fb203f7328e8260e49b4326d813fbe133613e
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/ops/layers_util.py
@@ -0,0 +1,191 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for probabilistic layers.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import deterministic as deterministic_lib
+from tensorflow.contrib.distributions.python.ops import independent as independent_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import normal as normal_lib
+
+
+def default_loc_scale_fn(
+    is_singular=False,
+    loc_initializer=init_ops.random_normal_initializer(stddev=0.1),
+    untransformed_scale_initializer=init_ops.random_normal_initializer(
+        mean=-3., stddev=0.1),
+    loc_regularizer=None,
+    untransformed_scale_regularizer=None,
+    loc_constraint=None,
+    untransformed_scale_constraint=None):
+  """Makes closure which creates `loc`, `scale` params from `tf.get_variable`.
+
+  This function produces a closure which produces `loc`, `scale` using
+  `tf.get_variable`. The closure accepts the following arguments:
+
+    dtype: Type of parameter's event.
+    shape: Python `list`-like representing the parameter's event shape.
+    name: Python `str` name prepended to any created (or existing)
+      `tf.Variable`s.
+    trainable: Python `bool` indicating all created `tf.Variable`s should be
+      added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`.
+    add_variable_fn: `tf.get_variable`-like `callable` used to create (or
+      access existing) `tf.Variable`s.
+
+  Args:
+    is_singular: Python `bool` indicating if `scale is None`. Default: `False`.
+    loc_initializer: Initializer function for the `loc` parameters.
+      The default is `tf.random_normal_initializer(mean=0., stddev=0.1)`.
+    untransformed_scale_initializer: Initializer function for the `scale`
+      parameters. Default value: `tf.random_normal_initializer(mean=-3.,
+      stddev=0.1)`. This implies the softplus transformed result has mean
+      approximately `0.05` and std. deviation approximately `0.005`.
+    loc_regularizer: Regularizer function for the `loc` parameters.
+      The default (`None`) is to use the `tf.get_variable` default.
+    untransformed_scale_regularizer: Regularizer function for the `scale`
+      parameters. The default (`None`) is to use the `tf.get_variable` default.
+    loc_constraint: An optional projection function to be applied to the
+      loc after being updated by an `Optimizer`. The function must take as input
+      the unprojected variable and must return the projected variable (which
+      must have the same shape). Constraints are not safe to use when doing
+      asynchronous distributed training.
+      The default (`None`) is to use the `tf.get_variable` default.
+    untransformed_scale_constraint: An optional projection function to be
+      applied to the `scale` parameters after being updated by an `Optimizer`
+      (e.g. used to implement norm constraints or value constraints). The
+      function must take as input the unprojected variable and must return the
+      projected variable (which must have the same shape). Constraints are not
+      safe to use when doing asynchronous distributed training. The default
+      (`None`) is to use the `tf.get_variable` default.
+
+  Returns:
+    default_loc_scale_fn: Python `callable` which instantiates `loc`, `scale`
+    parameters from args: `dtype, shape, name, trainable, add_variable_fn`.
+  """
+  def _fn(dtype, shape, name, trainable, add_variable_fn):
+    """Creates `loc`, `scale` parameters."""
+    loc = add_variable_fn(
+        name=name + "_loc",
+        shape=shape,
+        initializer=loc_initializer,
+        regularizer=loc_regularizer,
+        constraint=loc_constraint,
+        dtype=dtype,
+        trainable=trainable)
+    if is_singular:
+      return loc, None
+    untransformed_scale = add_variable_fn(
+        name=name + "_untransformed_scale",
+        shape=shape,
+        initializer=untransformed_scale_initializer,
+        regularizer=untransformed_scale_regularizer,
+        constraint=untransformed_scale_constraint,
+        dtype=dtype,
+        trainable=trainable)
+    scale = (np.finfo(dtype.as_numpy_dtype).eps +
+             nn_ops.softplus(untransformed_scale))
+    return loc, scale
+  return _fn
+
+
+def default_mean_field_normal_fn(
+    is_singular=False,
+    loc_initializer=None,
+    untransformed_scale_initializer=None,
+    loc_regularizer=None,
+    untransformed_scale_regularizer=None,
+    loc_constraint=None,
+    untransformed_scale_constraint=None):
+  """Creates a function to build Normal distributions with trainable params.
+
+  This function produces a closure which produces `tf.distributions.Normal`
+  parameterized by a loc` and `scale` each created using `tf.get_variable`. The
+  produced closure accepts the following arguments:
+
+    name: Python `str` name prepended to any created (or existing)
+      `tf.Variable`s.
+    shape: Python `list`-like representing the parameter's event shape.
+    dtype: Type of parameter's event.
+    trainable: Python `bool` indicating all created `tf.Variable`s should be
+      added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`.
+    add_variable_fn: `tf.get_variable`-like `callable` used to create (or
+      access existing) `tf.Variable`s.
+
+  Args:
+    is_singular: Python `bool` if `True`, forces the special case limit of
+      `scale->0`, i.e., a `Deterministic` distribution.
+    loc_initializer: Initializer function for the `loc` parameters.
+      If `None` (default), values are initialized using the default
+      initializer used by `tf.get_variable`.
+    untransformed_scale_initializer: Initializer function for the `scale`
+      parameters. If `None` (default), values are initialized using the default
+      initializer used by `tf.get_variable`.
+    loc_regularizer: Regularizer function for the `loc` parameters.
+    untransformed_scale_regularizer: Regularizer function for the `scale`
+      parameters.
+    loc_constraint: An optional projection function to be applied to the
+      loc after being updated by an `Optimizer`. The function must take as input
+      the unprojected variable and must return the projected variable (which
+      must have the same shape). Constraints are not safe to use when doing
+      asynchronous distributed training.
+    untransformed_scale_constraint: An optional projection function to be
+      applied to the `scale` parameters after being updated by an `Optimizer`
+      (e.g. used to implement norm constraints or value constraints). The
+      function must take as input the unprojected variable and must return the
+      projected variable (which must have the same shape). Constraints are not
+      safe to use when doing asynchronous distributed training.
+
+  Returns:
+    make_normal_fn: Python `callable` which creates a `tf.distributions.Normal`
+      using from args: `dtype, shape, name, trainable, add_variable_fn`.
+  """
+  loc_scale_fn_ = default_loc_scale_fn(
+      is_singular,
+      loc_initializer,
+      untransformed_scale_initializer,
+      loc_regularizer,
+      untransformed_scale_regularizer,
+      loc_constraint,
+      untransformed_scale_constraint)
+  def _fn(dtype, shape, name, trainable, add_variable_fn):
+    """Creates multivariate `Deterministic` or `Normal` distribution."""
+    loc, scale = loc_scale_fn_(dtype, shape, name, trainable, add_variable_fn)
+    if scale is None:
+      dist = deterministic_lib.Deterministic(loc=loc)
+    else:
+      dist = normal_lib.Normal(loc=loc, scale=scale)
+    reinterpreted_batch_ndims = array_ops.shape(dist.batch_shape_tensor())[0]
+    return independent_lib.Independent(
+        dist, reinterpreted_batch_ndims=reinterpreted_batch_ndims)
+  return _fn
+
+
+def random_sign(shape, dtype=dtypes.float32, seed=None):
+  """Draw values from {-1, 1} uniformly, i.e., Rademacher distribution."""
+  random_bernoulli = random_ops.random_uniform(shape, minval=0, maxval=2,
+                                               dtype=dtypes.int32,
+                                               seed=seed)
+  return math_ops.cast(2 * random_bernoulli - 1, dtype)
diff --git a/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics.py b/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3a645eafc249d1c39e0d4a238ae7ec8755c78d8
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics.py
@@ -0,0 +1,32 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for Markov Chain Monte Carlo (MCMC) sampling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# go/tf-wildcard-import
+# pylint: disable=wildcard-import
+from tensorflow.contrib.bayesflow.python.ops.mcmc_diagnostics_impl import *
+# pylint: enable=wildcard-import
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    "effective_sample_size",
+    "potential_scale_reduction",
+]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics_impl.py b/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..0424b6952bc89ce7fe5b00b0135c9a5fe1faa8cf
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/ops/mcmc_diagnostics_impl.py
@@ -0,0 +1,400 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for Markov Chain Monte Carlo (MCMC) sampling.
+
+@@effective_sample_size
+@@potential_scale_reduction
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distributions.python.ops import sample_stats
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+__all__ = [
+    "effective_sample_size",
+    "potential_scale_reduction",
+]
+
+
+def effective_sample_size(states,
+                          filter_threshold=0.,
+                          filter_beyond_lag=None,
+                          name=None):
+  """Estimate a lower bound on effective sample size for each independent chain.
+
+  Roughly speaking, "effective sample size" (ESS) is the size of an iid sample
+  with the same variance as `state`.
+
+  More precisely, given a stationary sequence of possibly correlated random
+  variables `X_1, X_2,...,X_N`, each identically distributed ESS is the number
+  such that
+
+  ```Variance{ N**-1 * Sum{X_i} } = ESS**-1 * Variance{ X_1 }.```
+
+  If the sequence is uncorrelated, `ESS = N`.  In general, one should expect
+  `ESS <= N`, with more highly correlated sequences having smaller `ESS`.
+
+  #### Example of using ESS to estimate standard error.
+
+  ```
+  tfd = tf.contrib.distributions
+  tfb = tf.contrib.bayesflow
+
+  target = tfd.MultivariateNormalDiag(scale_diag=[1., 2.])
+
+  # Get 1000 states from one chain.
+  states = tfb.hmc.sample_chain(
+      num_results=1000,
+      target_log_prob_fn=target.log_prob,
+      current_state=tf.constant([0., 0.]),
+      step_size=0.05,
+      num_leapfrog_steps=20,
+      num_burnin_steps=200)
+  states.shape
+  ==> (1000, 2)
+
+  ess = effective_sample_size(states)
+  ==> Shape (2,) Tensor
+
+  mean, variance = tf.nn.moments(states, axis=0)
+  standard_error = tf.sqrt(variance / ess)
+  ```
+
+  Some math shows that, with `R_k` the auto-correlation sequence,
+  `R_k := Covariance{X_1, X_{1+k}} / Variance{X_1}`, we have
+
+  ```ESS(N) =  N / [ 1 + 2 * ( (N - 1) / N * R_1 + ... + 1 / N * R_{N-1}  ) ]```
+
+  This function estimates the above by first estimating the auto-correlation.
+  Since `R_k` must be estimated using only `N - k` samples, it becomes
+  progressively noisier for larger `k`.  For this reason, the summation over
+  `R_k` should be truncated at some number `filter_beyond_lag < N`.  Since many
+  MCMC methods generate chains where `R_k > 0`, a reasonable critera is to
+  truncate at the first index where the estimated auto-correlation becomes
+  negative.
+
+  The arguments `filter_beyond_lag`, `filter_threshold` are filters intended to
+  remove noisy tail terms from `R_k`.  They combine in an "OR" manner meaning
+  terms are removed if they were to be filtered under the `filter_beyond_lag` OR
+  `filter_threshold` criteria.
+
+  Args:
+    states:  `Tensor` or list of `Tensor` objects.  Dimension zero should index
+      identically distributed states.
+    filter_threshold:  `Tensor` or list of `Tensor` objects.
+      Must broadcast with `state`.  The auto-correlation sequence is truncated
+      after the first appearance of a term less than `filter_threshold`.
+      Setting to `None` means we use no threshold filter.  Since `|R_k| <= 1`,
+      setting to any number less than `-1` has the same effect.
+    filter_beyond_lag:  `Tensor` or list of `Tensor` objects.  Must be
+      `int`-like and scalar valued.  The auto-correlation sequence is truncated
+      to this length.  Setting to `None` means we do not filter based on number
+      of lags.
+    name:  `String` name to prepend to created ops.
+
+  Returns:
+    ess:  `Tensor` or list of `Tensor` objects.  The effective sample size of
+      each component of `states`.  Shape will be `states.shape[1:]`.
+
+  Raises:
+    ValueError:  If `states` and `filter_threshold` or `states` and
+      `filter_beyond_lag` are both lists with different lengths.
+  """
+  states_was_list = _is_list_like(states)
+
+  # Convert all args to lists.
+  if not states_was_list:
+    states = [states]
+
+  filter_beyond_lag = _broadcast_maybelist_arg(states, filter_beyond_lag,
+                                               "filter_beyond_lag")
+  filter_threshold = _broadcast_maybelist_arg(states, filter_threshold,
+                                              "filter_threshold")
+
+  # Process items, one at a time.
+  with ops.name_scope(name, "effective_sample_size"):
+    ess_list = [
+        _effective_sample_size_single_state(s, ml, mlt)
+        for (s, ml, mlt) in zip(states, filter_beyond_lag, filter_threshold)
+    ]
+
+  if states_was_list:
+    return ess_list
+  return ess_list[0]
+
+
+def _effective_sample_size_single_state(states, filter_beyond_lag,
+                                        filter_threshold):
+  """ESS computation for one single Tensor argument."""
+
+  with ops.name_scope(
+      "effective_sample_size_single_state",
+      values=[states, filter_beyond_lag, filter_threshold]):
+
+    states = ops.convert_to_tensor(states, name="states")
+    dt = states.dtype
+
+    # filter_beyond_lag == None ==> auto_corr is the full sequence.
+    auto_corr = sample_stats.auto_correlation(
+        states, axis=0, max_lags=filter_beyond_lag)
+    if filter_threshold is not None:
+      filter_threshold = ops.convert_to_tensor(
+          filter_threshold, dtype=dt, name="filter_threshold")
+      # Get a binary mask to zero out values of auto_corr below the threshold.
+      #   mask[i, ...] = 1 if auto_corr[j, ...] > threshold for all j <= i,
+      #   mask[i, ...] = 0, otherwise.
+      # So, along dimension zero, the mask will look like [1, 1, ..., 0, 0,...]
+      # Building step by step,
+      #   Assume auto_corr = [1, 0.5, 0.0, 0.3], and filter_threshold = 0.2.
+      # Step 1:  mask = [False, False, True, False]
+      mask = auto_corr < filter_threshold
+      # Step 2:  mask = [0, 0, 1, 1]
+      mask = math_ops.cast(mask, dtype=dt)
+      # Step 3:  mask = [0, 0, 1, 2]
+      mask = math_ops.cumsum(mask, axis=0)
+      # Step 4:  mask = [1, 1, 0, 0]
+      mask = math_ops.maximum(1. - mask, 0.)
+      auto_corr *= mask
+
+    # With R[k] := auto_corr[k, ...],
+    # ESS = N / {1 + 2 * Sum_{k=1}^N (N - k) / N * R[k]}
+    #     = N / {-1 + 2 * Sum_{k=0}^N (N - k) / N * R[k]} (since R[0] = 1)
+    #     approx N / {-1 + 2 * Sum_{k=0}^M (N - k) / N * R[k]}
+    # where M is the filter_beyond_lag truncation point chosen above.
+
+    # Get the factor (N - k) / N, and give it shape [M, 1,...,1], having total
+    # ndims the same as auto_corr
+    n = _axis_size(states, axis=0)
+    k = math_ops.range(0., _axis_size(auto_corr, axis=0))
+    nk_factor = (n - k) / n
+    if auto_corr.shape.ndims is not None:
+      new_shape = [-1] + [1] * (auto_corr.shape.ndims - 1)
+    else:
+      new_shape = array_ops.concat(
+          ([-1],
+           array_ops.ones([array_ops.rank(auto_corr) - 1], dtype=dtypes.int32)),
+          axis=0)
+    nk_factor = array_ops.reshape(nk_factor, new_shape)
+
+    return n / (-1 + 2 * math_ops.reduce_sum(nk_factor * auto_corr, axis=0))
+
+
+def potential_scale_reduction(chains_states,
+                              independent_chain_ndims=1,
+                              name=None):
+  """Gelman and Rubin's potential scale reduction factor for chain convergence.
+
+  Given `N > 1` states from each of `C > 1` independent chains, the potential
+  scale reduction factor, commonly referred to as R-hat, measures convergence of
+  the chains (to the same target) by testing for equality of means.
+  Specifically, R-hat measures the degree to which variance (of the means)
+  between chains exceeds what one would expect if the chains were identically
+  distributed.  See [1], [2].
+
+  Some guidelines:
+
+  * The initial state of the chains should be drawn from a distribution
+    overdispersed with respect to the target.
+  * If all chains converge to the target, then as `N --> infinity`, R-hat --> 1.
+    Before that, R-hat > 1 (except in pathological cases, e.g. if the chain
+    paths were identical).
+  * The above holds for any number of chains `C > 1`.  Increasing `C` does
+    improves effectiveness of the diagnostic.
+  * Sometimes, R-hat < 1.2 is used to indicate approximate convergence, but of
+    course this is problem depedendent.  See [2].
+  * R-hat only measures non-convergence of the mean. If higher moments, or other
+    statistics are desired, a different diagnostic should be used.  See [2].
+
+  #### Examples
+
+  Diagnosing convergence by monitoring 10 chains that each attempt to
+  sample from a 2-variate normal.
+
+  ```python
+  tfd = tf.contrib.distributions
+  tfb = tf.contrib.bayesflow
+
+  target = tfd.MultivariateNormalDiag(scale_diag=[1., 2.])
+
+  # Get 10 (2x) overdispersed initial states.
+  initial_state = target.sample(10) * 2.
+  ==> (10, 2)
+
+  # Get 1000 samples from the 10 independent chains.
+  chains_states, _ = tfb.hmc.sample_chain(
+      num_results=1000,
+      target_log_prob_fn=target.log_prob,
+      current_state=initial_state,
+      step_size=0.05,
+      num_leapfrog_steps=20,
+      num_burnin_steps=200)
+  chains_states.shape
+  ==> (1000, 10, 2)
+
+  rhat = tfb.mcmc_diagnostics.potential_scale_reduction(
+      chains_states, independent_chain_ndims=1)
+
+  # The second dimension needed a longer burn-in.
+  rhat.eval()
+  ==> [1.05, 1.3]
+  ```
+
+  To see why R-hat is reasonable, let `X` be a random variable drawn uniformly
+  from the combined states (combined over all chains).  Then, in the limit
+  `N, C --> infinity`, with `E`, `Var` denoting expectation and variance,
+
+  ```R-hat = ( E[Var[X | chain]] + Var[E[X | chain]] ) / E[Var[X | chain]].```
+
+  Using the law of total variance, the numerator is the variance of the combined
+  states, and the denominator is the total variance minus the variance of the
+  the individual chain means.  If the chains are all drawing from the same
+  distribution, they will have the same mean, and thus the ratio should be one.
+
+  [1] "Inference from Iterative Simulation Using Multiple Sequences"
+      Andrew Gelman and Donald B. Rubin
+      Statist. Sci. Volume 7, Number 4 (1992), 457-472.
+  [2] "General Methods for Monitoring Convergence of Iterative Simulations"
+      Stephen P. Brooks and Andrew Gelman
+      Journal of Computational and Graphical Statistics, 1998. Vol 7, No. 4.
+
+  Args:
+    chains_states:  `Tensor` or Python `list` of `Tensor`s representing the
+      state(s) of a Markov Chain at each result step.  The `ith` state is
+      assumed to have shape `[Ni, Ci1, Ci2,...,CiD] + A`.
+      Dimension `0` indexes the `Ni > 1` result steps of the Markov Chain.
+      Dimensions `1` through `D` index the `Ci1 x ... x CiD` independent
+      chains to be tested for convergence to the same target.
+      The remaining dimensions, `A`, can have any shape (even empty).
+    independent_chain_ndims: Integer type `Tensor` with value `>= 1` giving the
+      number of giving the number of dimensions, from `dim = 1` to `dim = D`,
+      holding independent chain results to be tested for convergence.
+    name: `String` name to prepend to created ops.  Default:
+      `potential_scale_reduction`.
+
+  Returns:
+    `Tensor` or Python `list` of `Tensor`s representing the R-hat statistic for
+    the state(s).  Same `dtype` as `state`, and shape equal to
+    `state.shape[1 + independent_chain_ndims:]`.
+
+  Raises:
+    ValueError:  If `independent_chain_ndims < 1`.
+  """
+  chains_states_was_list = _is_list_like(chains_states)
+  if not chains_states_was_list:
+    chains_states = [chains_states]
+
+  # tensor_util.constant_value returns None iff a constant value (as a numpy
+  # array) is not efficiently computable.  Therefore, we try constant_value then
+  # check for None.
+  icn_const_ = tensor_util.constant_value(
+      ops.convert_to_tensor(independent_chain_ndims))
+  if icn_const_ is not None:
+    independent_chain_ndims = icn_const_
+    if icn_const_ < 1:
+      raise ValueError(
+          "Argument `independent_chain_ndims` must be `>= 1`, found: {}".format(
+              independent_chain_ndims))
+
+  with ops.name_scope(name, "potential_scale_reduction"):
+    rhat_list = [
+        _potential_scale_reduction_single_state(s, independent_chain_ndims)
+        for s in chains_states
+    ]
+
+  if chains_states_was_list:
+    return rhat_list
+  return rhat_list[0]
+
+
+def _potential_scale_reduction_single_state(state, independent_chain_ndims):
+  """potential_scale_reduction for one single state `Tensor`."""
+  with ops.name_scope(
+      "potential_scale_reduction_single_state",
+      values=[state, independent_chain_ndims]):
+    # We assume exactly one leading dimension indexes e.g. correlated samples
+    # from each Markov chain.
+    state = ops.convert_to_tensor(state, name="state")
+    sample_ndims = 1
+
+    sample_axis = math_ops.range(0, sample_ndims)
+    chain_axis = math_ops.range(sample_ndims,
+                                sample_ndims + independent_chain_ndims)
+    sample_and_chain_axis = math_ops.range(
+        0, sample_ndims + independent_chain_ndims)
+
+    n = _axis_size(state, sample_axis)
+    m = _axis_size(state, chain_axis)
+
+    # In the language of [2],
+    # B / n is the between chain variance, the variance of the chain means.
+    # W is the within sequence variance, the mean of the chain variances.
+    b_div_n = _reduce_variance(
+        math_ops.reduce_mean(state, sample_axis, keepdims=True),
+        sample_and_chain_axis,
+        biased=False)
+    w = math_ops.reduce_mean(
+        _reduce_variance(state, sample_axis, keepdims=True, biased=True),
+        sample_and_chain_axis)
+
+    # sigma^2_+ is an estimate of the true variance, which would be unbiased if
+    # each chain was drawn from the target.  c.f. "law of total variance."
+    sigma_2_plus = w + b_div_n
+
+    return ((m + 1.) / m) * sigma_2_plus / w - (n - 1.) / (m * n)
+
+
+# TODO(b/72873233) Move some variant of this to sample_stats.
+def _reduce_variance(x, axis=None, biased=True, keepdims=False):
+  with ops.name_scope("reduce_variance"):
+    x = ops.convert_to_tensor(x, name="x")
+    mean = math_ops.reduce_mean(x, axis=axis, keepdims=True)
+    biased_var = math_ops.reduce_mean(
+        math_ops.squared_difference(x, mean), axis=axis, keepdims=keepdims)
+    if biased:
+      return biased_var
+    n = _axis_size(x, axis)
+    return (n / (n - 1.)) * biased_var
+
+
+def _axis_size(x, axis=None):
+  """Get number of elements of `x` in `axis`, as type `x.dtype`."""
+  if axis is None:
+    return math_ops.cast(array_ops.size(x), x.dtype)
+  return math_ops.cast(
+      math_ops.reduce_prod(array_ops.gather(array_ops.shape(x), axis)), x.dtype)
+
+
+def _is_list_like(x):
+  """Helper which returns `True` if input is `list`-like."""
+  return isinstance(x, (tuple, list))
+
+
+def _broadcast_maybelist_arg(states, secondary_arg, name):
+  """Broadcast a listable secondary_arg to that of states."""
+  if _is_list_like(secondary_arg):
+    if len(secondary_arg) != len(states):
+      raise ValueError("Argument `%s` was a list of different length ({}) than "
+                       "`states` ({})".format(name, len(states)))
+  else:
+    secondary_arg = [secondary_arg] * len(states)
+
+  return secondary_arg
diff --git a/tensorflow/contrib/bayesflow/python/ops/optimizers.py b/tensorflow/contrib/bayesflow/python/ops/optimizers.py
index ee32e6b5c3d9efaeaf73436638c5eea55f2cfc70..fb70628d1083836281e9327e83e109493276c64f 100644
--- a/tensorflow/contrib/bayesflow/python/ops/optimizers.py
+++ b/tensorflow/contrib/bayesflow/python/ops/optimizers.py
@@ -24,11 +24,13 @@ from __future__ import print_function
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.contrib.bayesflow.python.ops.sgld_optimizer import *
+from tensorflow.contrib.bayesflow.python.ops.variational_sgd_optimizer import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
     'SGLDOptimizer',
+    'VariationalSGDOptimizer',
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bayesflow/python/ops/sgld_optimizer.py b/tensorflow/contrib/bayesflow/python/ops/sgld_optimizer.py
index 5d36ea7a2b51aa45cdc253992a2a58634c068987..7786656398e3c87704227be95b3cd23a38785249 100644
--- a/tensorflow/contrib/bayesflow/python/ops/sgld_optimizer.py
+++ b/tensorflow/contrib/bayesflow/python/ops/sgld_optimizer.py
@@ -189,6 +189,10 @@ class SGLDOptimizer(optimizer.Optimizer):
         new_grad,
         use_locking=self._use_locking).op
 
+  def _finish(self, update_ops, name_scope):
+    update_ops.append([self._counter.assign_add(1)])
+    return control_flow_ops.group(*update_ops, name=name_scope)
+
   @property
   def variable_scope(self):
     """Variable scope of all calls to `tf.get_variable`."""
diff --git a/tensorflow/contrib/bayesflow/python/ops/variable_utils.py b/tensorflow/contrib/bayesflow/python/ops/variable_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..eadf6f4d5fa1c776e2c71c66c4b64b8f5ac98359
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/ops/variable_utils.py
@@ -0,0 +1,29 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility functions related to managing `tf.Variable`s."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# go/tf-wildcard-import
+from tensorflow.contrib.bayesflow.python.ops.variable_utils_impl import *  # pylint: disable=wildcard-import,unused-wildcard-import,g-importing-member
+from tensorflow.python.util import all_util
+
+_allowed_symbols = [
+    "externalize_variables_as_args",
+]
+
+all_util.remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bayesflow/python/ops/variable_utils_impl.py b/tensorflow/contrib/bayesflow/python/ops/variable_utils_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca3d75b5bfee093449026c7d1d62e3bdeff6b096
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/ops/variable_utils_impl.py
@@ -0,0 +1,157 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility functions related to managing `tf.Variable`s.
+
+@@externalize_variables_as_args
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import warnings
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gradients_impl as gradients_ops
+from tensorflow.python.ops import variable_scope as varscope_ops
+from tensorflow.python.ops import variables as variables_ops
+
+__all__ = [
+    "externalize_variables_as_args",
+]
+
+
+# Cause all warnings to always be triggered.
+# Not having this means subsequent calls wont trigger the warning.
+warnings.simplefilter("always")
+
+
+def externalize_variables_as_args(fn,
+                                  fn_args=(),
+                                  ancestor_variables=None,
+                                  possible_ancestor_vars=None,
+                                  assert_variable_override=False,
+                                  name=None):
+  """"Converts variables within a callable into explicit args.
+
+  Makes a new callable from `fn` which has arguments `list(fn_args) +
+  list(ancestor_variables)`. If `ancestor_variables` is not specified, it is
+  inferred by checking which of `possible_ancestor_vars` actually influences the
+  return value of `fn` (concretely, gradient of `fn(*fn_args)` is not `None`).
+  By default `possible_ancestor_vars` is `tf.trainable_variables() +
+  tf.get_collection(tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)`.
+
+  #### Examples:
+
+  ```python
+  num_samples = 2
+  num_dims = 1
+  dtype = np.float32
+
+  def foo(x):
+    x = tf.convert_to_tensor(x, dtype=dtype, name="x")
+    s = x.shape.as_list()
+    y = tf.get_variable(
+        name="y",
+        dtype=dtype,
+        initializer=np.arange(np.prod(s)).reshape(s).astype(dtype))
+    return x + y
+
+  x = tf.constant(dtype([0.1, 0.2]))
+
+  wrapped_foo, discovered_ancestor_variables = (
+      externalize_variables_as_args(foo, [x]))
+
+  new_x = dtype([[1.], [2.]])
+  new_y = dtype([[3.], [4.]])
+  new_result = wrapped_foo(new_x, new_y)
+  # ==> [[4.], [6.]]
+
+  discovered_ancestor_variables == [tf.get_variable("y", dtype)]
+  # ==> [True]
+  ```
+
+  Args:
+    fn: Python callable which returns a `Tensor` and accepts `*fn_args`.
+    fn_args: Python list of args to `fn`. Represents dummy arguments passed to
+      `fn` to trace its execution; actual values are unimportant. These args are
+      only used to construct the output of `fn` and to resolve the ancestor
+      `tf.Variable`s.
+      Default value: `()` (i.e., `fn` takes no args).
+    ancestor_variables: Python list of `tf.Variable`s. When `None` the list is
+      expanded to non-`None` gradients of `fn(*fn_args)`. By directly providing
+      the `ancestor_variables` the internal call to `fn` is avoided.
+      Default value: `None` (i.e., `tf.Variable` dependencies are discovered).
+    possible_ancestor_vars: Python list of possible `tf.Variable`s which might
+      be a dependency of computing `fn(*fn_args)`.
+      Default value: `None` (i.e., expanded as described above).
+    assert_variable_override: Python `bool` indicating that not finding a
+      `tf.Variable` in the override list is an exception.
+      Default value: `False` (i.e., missing a `Variable` triggers a `warning`).
+    name: Python `str` name prefixed to Ops created by this function.
+      Default value: `None` (i.e., "externalize_variables_as_args").
+
+  Returns:
+    wrapped_fn: Python callable taking arguments like
+      `*(list(fn_args) + discovered_ancestor_variables)`.
+    discovered_ancestor_variables: Python list of `tf.Variable`s known to be a
+      dependency of `fn(*fn_args)`.
+
+  Raises:
+    ValueError: if `assert_variable_override` is `True` and `Variable` is
+      requested but not overridden.
+  """
+  def _make_bypassing_custom_getter_fn(new_var_dict):
+    """Return dict value rather than what would otherwise be dict key."""
+    def _custom_getter(getter, *args, **kwargs):
+      v = getter(*args, **kwargs)
+      new_v = new_var_dict.get(v, None)
+      if new_v is None:
+        msg = "Variable \"{}\" not found in bypass dict.".format(v)
+        if assert_variable_override:
+          raise ValueError(msg)
+        warnings.warn(msg)
+        return v
+      return new_v
+    return _custom_getter
+
+  with ops.name_scope(name, "externalize_variables_as_args"):
+    if ancestor_variables is not None and not ancestor_variables:
+      return fn, ()
+    if ancestor_variables is None:
+      y = fn(*fn_args)  # Side-effect: adds trainable vars.
+      if possible_ancestor_vars is None:
+        possible_ancestor_vars = (
+            variables_ops.trainable_variables() +
+            ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
+      # TODO(b/72873296): Add a dedicated op for identifying ancestors.
+      ancestors = [v for g, v
+                   in zip(gradients_ops.gradients(y, possible_ancestor_vars),
+                          possible_ancestor_vars)
+                   if g is not None]
+      ancestor_variables = sorted(ancestors, key=lambda v: v.name)
+  n = len(fn_args)
+  def _fn(*args):
+    with ops.name_scope("wrapped_fn"):
+      vars_dict = dict(
+          (k, ops.convert_to_tensor(
+              v, dtype=k.dtype.base_dtype, name=k.op.name))
+          for k, v in zip(ancestor_variables, args[n:]))
+      with varscope_ops.variable_scope(
+          varscope_ops.get_variable_scope(),
+          reuse=True,
+          custom_getter=_make_bypassing_custom_getter_fn(vars_dict)):
+        return fn(*args[:n])
+  return _fn, ancestor_variables
diff --git a/tensorflow/contrib/bayesflow/python/ops/variational_sgd_optimizer.py b/tensorflow/contrib/bayesflow/python/ops/variational_sgd_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d5f0cfe9713a011b32c5aba8d429847d81f33e2
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/ops/variational_sgd_optimizer.py
@@ -0,0 +1,279 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""An optimizer module for constant stochastic gradient descent."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope as varscope_ops
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_ops
+
+
+class VariationalSGDOptimizer(optimizer.Optimizer):
+  """An optimizer module for constant stochastic gradient descent.
+
+  This implements an optimizer module for the constant stochastic gradient
+  descent algorithm [1].  The optimization variable is regarded as an
+  approximate sample from the posterior .
+
+  Note: If a prior is included in the loss, it should be scaled by
+  `1/num_pseudo_batches`, where num_pseudo_batches is the number of minibatches
+  in the data.  I.e., it should be divided by the `num_pseudo_batches` term
+  described below.
+
+  [1]: "Stochastic Gradient Descent as Approximate Bayesian Inference
+       Stephan Mandt, Matthew D. Hoffman, David M. Blei.
+       ArXiv:1704.04289, 2017. https://arxiv.org/abs/1704.04289
+
+  Args:
+    batch_size: Scalar `int`-like `Tensor`. The number of examples in a
+      minibatch in the data set. Note: Assumes the loss is taken as the mean
+      over a minibatch. Otherwise if the sum was taken set this to 1.
+    total_num_examples: Scalar `int`-like `Tensor`. The total number of examples
+      in the data set.
+    max_learning_rate: Scalar `float`-like `Tensor`. A maximum allowable
+      effective coordinate-wise learning rate. The algorithm scales down any
+      effective learning rate (i.e. after preconditioning) that is larger than
+      this. (Default: `1`)
+    preconditioner_decay_rate: Scalar `float`-like `Tensor`. The exponential
+      decay rate of the rescaling of the preconditioner (RMSprop). (This is
+      "alpha" in [1]). Should be smaller than but nearly `1` to approximate
+      sampling from the posterior. (Default: `0.95`)
+    burnin: Scalar `int`-like `Tensor`. The number of iterations to collect
+      gradient statistics to update the preconditioner before starting to draw
+      noisy samples. (Default: `25`)
+    burnin_max_learning_rate: Scalar `float`-like `Tensor`. Maximum learning
+      rate to use during the burnin period.
+      (Default: `1e-8`)
+    use_single_learning_rate: Boolean Indicates whether one single learning
+      rate is used or coordinate_wise learning rates are used.
+      (Default: `False`)
+    name: Python `str` describing ops managed by this function.
+      (Default: `"VariationalSGDOptimizer"`)
+    variable_scope: Variable scope used for calls to `tf.get_variable`.
+      If `None`, a new variable scope is created using name
+      `ops.get_default_graph().unique_name(name or default_name)`.
+
+  Raises:
+    InvalidArgumentError: If preconditioner_decay_rate is a `Tensor` not in
+      `(0,1]`.
+  """
+
+  def __init__(self,
+               batch_size,
+               total_num_examples,
+               max_learning_rate=1.0,
+               preconditioner_decay_rate=0.95,
+               burnin=25,
+               burnin_max_learning_rate=1e-6,
+               use_single_learning_rate=False,
+               name=None,
+               variable_scope=None):
+    default_name = 'VariationalSGDOptimizer'
+    with ops.name_scope(name, default_name, [
+        max_learning_rate, preconditioner_decay_rate, batch_size, burnin,
+        burnin_max_learning_rate
+    ]):
+      if variable_scope is None:
+        var_scope_name = ops.get_default_graph().unique_name(
+            name or default_name)
+        with varscope_ops.variable_scope(var_scope_name) as scope:
+          self._variable_scope = scope
+      else:
+        self._variable_scope = variable_scope
+
+      self._preconditioner_decay_rate = ops.convert_to_tensor(
+          preconditioner_decay_rate, name='preconditioner_decay_rate')
+      self._batch_size = ops.convert_to_tensor(batch_size, name='batch_size')
+      self._total_num_examples = ops.convert_to_tensor(
+          total_num_examples, name='total_num_examples')
+      self._burnin = ops.convert_to_tensor(burnin, name='burnin')
+      self._burnin_max_learning_rate = ops.convert_to_tensor(
+          burnin_max_learning_rate, name='burnin_max_learning_rate')
+      self._max_learning_rate = ops.convert_to_tensor(
+          max_learning_rate, name='max_learning_rate')
+      self._use_single_learning_rate = use_single_learning_rate
+
+      with varscope_ops.variable_scope(self._variable_scope):
+        self._counter = varscope_ops.get_variable(
+            'counter', initializer=0, trainable=False)
+
+      self._preconditioner_decay_rate = control_flow_ops.with_dependencies([
+          check_ops.assert_non_negative(
+              self._preconditioner_decay_rate,
+              message='`preconditioner_decay_rate` must be non-negative'),
+          check_ops.assert_less_equal(
+              self._preconditioner_decay_rate,
+              1.,
+              message='`preconditioner_decay_rate` must be at most 1.'),
+      ], self._preconditioner_decay_rate)
+
+      self._batch_size = control_flow_ops.with_dependencies([
+          check_ops.assert_greater(
+              self._batch_size,
+              0,
+              message='`batch_size` must be greater than zero')
+      ], self._batch_size)
+
+      self._total_num_examples = control_flow_ops.with_dependencies([
+          check_ops.assert_greater(
+              self._total_num_examples,
+              0,
+              message='`total_num_examples` must be greater than zero')
+      ], self._total_num_examples)
+
+      self._burnin = control_flow_ops.with_dependencies([
+          check_ops.assert_non_negative(
+              self._burnin, message='`burnin` must be non-negative'),
+          check_ops.assert_integer(
+              self._burnin, message='`burnin` must be an integer')
+      ], self._burnin)
+
+      self._burnin_max_learning_rate = control_flow_ops.with_dependencies([
+          check_ops.assert_non_negative(
+              self._burnin_max_learning_rate,
+              message='`burnin_max_learning_rate` must be non-negative')
+      ], self._burnin_max_learning_rate)
+
+      self._max_learning_rate = control_flow_ops.with_dependencies([
+          check_ops.assert_non_negative(
+              self._max_learning_rate,
+              message='`max_learning_rate` must be non-negative')
+      ], self._max_learning_rate)
+
+      super(VariationalSGDOptimizer, self).__init__(
+          use_locking=False, name=name or default_name)
+
+  def _create_slots(self, var_list):
+    for v in var_list:
+      init_moment = init_ops.zeros_initializer(dtype=v.dtype)
+      self._get_or_make_slot_with_initializer(
+          v, init_moment, v.get_shape(), v.dtype, 'first_moment', self._name)
+      self._get_or_make_slot_with_initializer(
+          v, init_moment, v.get_shape(), v.dtype, 'second_moment', self._name)
+
+  def _prepare(self):
+    self._decay_tensor = ops.convert_to_tensor(
+        self._preconditioner_decay_rate, name='preconditioner_decay_rate')
+    self._batch_size_tensor = ops.convert_to_tensor(
+        self._batch_size, name='batch_size_tensor')
+
+    super(VariationalSGDOptimizer, self)._prepare()
+
+  def _get_coordinatewise_learning_rate(self, grad, var):
+    # Compute the learning rate using a moving average for the diagonal of BB^T
+    avg_first = self.get_slot(var, 'first_moment')
+    avg_second = self.get_slot(var, 'second_moment')
+    decay_tensor = math_ops.cast(self._decay_tensor, var.dtype)
+    batch_size = math_ops.cast(self._batch_size_tensor, var.dtype)
+
+    # Create an estimator for the moving average of gradient mean and variance
+    # via Welford's algorithm
+    if isinstance(grad, ops.Tensor):
+      delta = grad - avg_first
+      first_moment_update = avg_first.assign_add(
+          array_ops.where(self._counter < 1, math_ops.cast(1, var.dtype),
+                          1. - decay_tensor) * delta)
+
+      with ops.control_dependencies([first_moment_update]):
+        second_moment_update = avg_second.assign_add(
+            math_ops.cast(self._counter < 1, var.dtype) *
+            -(1. - decay_tensor) * (
+                avg_second - decay_tensor  * math_ops.square(delta)))
+      diag_preconditioner = control_flow_ops.with_dependencies(
+          [second_moment_update],
+          clip_ops.clip_by_value(avg_second, 1e-12, 1e12))
+    elif isinstance(grad, ops.IndexedSlices):
+      delta = grad.values - array_ops.gather_nd(avg_first, grad.indices)
+      first_moment_update = state_ops.scatter_add(
+          avg_first,
+          grad.indices,
+          array_ops.where(self._counter < 1,
+                          math_ops.cast(1., var.dtype),
+                          1. - decay_tensor) * delta)
+
+      with ops.control_dependencies([first_moment_update]):
+        avg_second = state_ops.scatter_add(
+            avg_second,
+            grad.indices,
+            math_ops.cast(self._counter < 1, var.dtype) *
+            -(1. - decay_tensor) * (
+                array_ops.gather_nd(avg_second, grad.indices) - decay_tensor *
+                math_ops.square(delta)))
+        avg_second = array_ops.gather_nd(avg_second, grad.indices)
+        # TODO(b/70783772)
+        diag_preconditioner = clip_ops.clip_by_value(avg_second, 1e-12, 1e12)
+    else:
+      raise errors.InvalidArgumentError(
+          None, None, 'grad must of type Tensor or IndexedSlice')
+
+    diag_preconditioner *= batch_size
+
+    if self._use_single_learning_rate:
+      diag_preconditioner = math_ops.reduce_mean(diag_preconditioner)
+
+    # From Theorem 2 Corollary 1 of Mandt et al. 2017
+    return 2. * batch_size / (
+        math_ops.cast(self._total_num_examples, var.dtype.base_dtype) *
+        diag_preconditioner)
+
+  def _apply_dense(self, grad, var):
+
+    max_learning_rate = array_ops.where(self._counter < self._burnin,
+                                        self._burnin_max_learning_rate,
+                                        self._max_learning_rate)
+
+    learn_rates = clip_ops.clip_by_value(
+        self._get_coordinatewise_learning_rate(grad, var), 0.0,
+        math_ops.cast(max_learning_rate, var.dtype.base_dtype))
+
+    newgrad = grad * learn_rates
+    return training_ops.apply_gradient_descent(
+        var,
+        math_ops.cast(1.0, var.dtype),
+        newgrad,
+        use_locking=self._use_locking).op
+
+  def _apply_sparse(self, grad, var):
+
+    max_learning_rate = array_ops.where(self._counter < self._burnin,
+                                        self._burnin_max_learning_rate,
+                                        self._max_learning_rate)
+
+    learn_rate = clip_ops.clip_by_value(
+        self._get_coordinatewise_learning_rate(grad, var), 0.0,
+        math_ops.cast(max_learning_rate, var.dtype))
+    delta = grad.values * learn_rate
+
+    return state_ops.scatter_sub(var, grad.indices, delta,
+                                 use_locking=self._use_locking)
+
+  def _finish(self, update_ops, name_scope):
+    update_ops.append([self._counter.assign_add(1)])
+    return control_flow_ops.group(*update_ops, name=name_scope)
+
+  @property
+  def variable_scope(self):
+    """Variable scope of all calls to `tf.get_variable`."""
+    return self._variable_scope
diff --git a/tensorflow/contrib/boosted_trees/BUILD b/tensorflow/contrib/boosted_trees/BUILD
index 7072f56420ac9e576b20b62c0aa67498857403a7..6fdcd0f996ee011842a5add79f06264a28a2145c 100644
--- a/tensorflow/contrib/boosted_trees/BUILD
+++ b/tensorflow/contrib/boosted_trees/BUILD
@@ -196,6 +196,7 @@ py_test(
     name = "quantile_ops_test",
     size = "small",
     srcs = ["python/kernel_tests/quantile_ops_test.py"],
+    shard_count = 3,
     srcs_version = "PY2AND3",
     deps = [
         ":quantile_ops_py",
@@ -601,6 +602,7 @@ py_library(
         ":init_py",
         "//tensorflow/contrib/boosted_trees:gbdt_batch",
         "//tensorflow/contrib/boosted_trees/estimator_batch:custom_export_strategy",
+        "//tensorflow/contrib/boosted_trees/estimator_batch:dnn_tree_combined_estimator",
         "//tensorflow/contrib/boosted_trees/estimator_batch:init_py",
         "//tensorflow/contrib/boosted_trees/estimator_batch:trainer_hooks",
         "//tensorflow/contrib/boosted_trees/lib:categorical_split_handler",
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
index 7792c7127c0285dc2eb5b213da054674f6a81d64..289f5bb3140974d8c37f4938ceef27275b099f9a 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
@@ -50,6 +50,7 @@ py_library(
     deps = [
         "//tensorflow/contrib/learn",
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:training",
@@ -129,3 +130,38 @@ py_library(
         "//tensorflow/python:math_ops",
     ],
 )
+
+py_library(
+    name = "dnn_tree_combined_estimator",
+    srcs = ["dnn_tree_combined_estimator.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":trainer_hooks",
+        "//tensorflow/contrib/boosted_trees:gbdt_batch",
+        "//tensorflow/contrib/boosted_trees:model_ops_py",
+        "//tensorflow/contrib/learn",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+    ],
+)
+
+py_test(
+    name = "dnn_tree_combined_estimator_test",
+    size = "small",
+    srcs = ["dnn_tree_combined_estimator_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_gpu",
+        "no_pip_gpu",
+        "notsan",
+    ],
+    deps = [
+        ":dnn_tree_combined_estimator",
+        "//tensorflow/contrib/boosted_trees:gbdt_batch",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+    ],
+)
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
index ef8dee91b6cc05c4c3dd5eb3c81de4fb65b473e3..31f5c444817b9b82723c86bea3504d4934e57eb8 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
@@ -33,6 +33,8 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.saved_model import loader as saved_model_loader
 from tensorflow.python.saved_model import tag_constants
 
+_SPARSE_FLOAT_FEATURE_NAME_TEMPLATE = "%s_%d"
+
 
 def make_custom_export_strategy(name,
                                 convert_fn,
@@ -147,13 +149,15 @@ def convert_to_universal_format(dtec, sorted_feature_names,
           inequality_test.threshold.float_value = split.threshold
         elif node_type == "sparse_float_binary_split_default_left":
           split = gtflow_node.sparse_float_binary_split_default_left.split
-          node.default_direction = (
-              generic_tree_model_pb2.BinaryNode.LEFT)
-          # TODO(nponomareva): adjust this id assignement when we allow multi-
-          # column sparse tensors.
+          node.default_direction = (generic_tree_model_pb2.BinaryNode.LEFT)
           feature_id = split.feature_column + num_dense
           inequality_test = node.inequality_left_child_test
-          inequality_test.feature_id.id.value = sorted_feature_names[feature_id]
+          inequality_test.feature_id.id.value = (
+              _SPARSE_FLOAT_FEATURE_NAME_TEMPLATE %
+              (sorted_feature_names[feature_id], split.dimension_id))
+          model_and_features.features.pop(sorted_feature_names[feature_id])
+          (model_and_features.features[inequality_test.feature_id.id.value]
+           .SetInParent())
           inequality_test.type = (
               generic_tree_model_pb2.InequalityTest.LESS_OR_EQUAL)
           inequality_test.threshold.float_value = split.threshold
@@ -165,7 +169,12 @@ def convert_to_universal_format(dtec, sorted_feature_names,
           # column sparse tensors.
           feature_id = split.feature_column + num_dense
           inequality_test = node.inequality_left_child_test
-          inequality_test.feature_id.id.value = sorted_feature_names[feature_id]
+          inequality_test.feature_id.id.value = (
+              _SPARSE_FLOAT_FEATURE_NAME_TEMPLATE %
+              (sorted_feature_names[feature_id], split.dimension_id))
+          model_and_features.features.pop(sorted_feature_names[feature_id])
+          (model_and_features.features[inequality_test.feature_id.id.value]
+           .SetInParent())
           inequality_test.type = (
               generic_tree_model_pb2.InequalityTest.LESS_OR_EQUAL)
           inequality_test.threshold.float_value = split.threshold
@@ -201,10 +210,14 @@ def _get_feature_importances(dtec, feature_names, num_dense_floats,
         split_column = feature_names[split.feature_column]
       elif node_type == "sparse_float_binary_split_default_left":
         split = tree_node.sparse_float_binary_split_default_left.split
-        split_column = feature_names[split.feature_column + num_dense_floats]
+        split_column = _SPARSE_FLOAT_FEATURE_NAME_TEMPLATE % (
+            feature_names[split.feature_column + num_dense_floats],
+            split.dimension_id)
       elif node_type == "sparse_float_binary_split_default_right":
         split = tree_node.sparse_float_binary_split_default_right.split
-        split_column = feature_names[split.feature_column + num_dense_floats]
+        split_column = _SPARSE_FLOAT_FEATURE_NAME_TEMPLATE % (
+            feature_names[split.feature_column + num_dense_floats],
+            split.dimension_id)
       elif node_type == "categorical_id_binary_split":
         split = tree_node.categorical_id_binary_split
         split_column = feature_names[split.feature_column + num_dense_floats +
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py
index 4ed18b2d34c5af47826ab1c058f5d13797593bd4..67ec0e16bf815e9dbea6567cc87c3980a825a004 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the conversion code from GTFlow format to Chauffeur."""
+"""Tests for the conversion code and for feature importances export.
+
+Tests that cover conversion from TFBT format to a tensorflow.contrib.
+decision_tree generic_tree_model format and feature importances export.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -95,10 +99,31 @@ class ConvertModelTest(test_util.TensorFlowTestCase):
           }
         }
       }
+      nodes {
+        sparse_float_binary_split_default_right {
+          split {
+            feature_column: 1
+            dimension_id:3
+            threshold: -0.4
+            left_id: 7
+            right_id: 8
+          }
+        }
+        node_metadata {
+            gain: 3600
+        }
+      }
+      nodes {
+        leaf {
+          vector {
+            value: 0.36
+          }
+        }
+      }
       nodes {
         leaf {
           vector {
-            value: 0.3
+            value: 18
           }
         }
       }
@@ -108,17 +133,25 @@ class ConvertModelTest(test_util.TensorFlowTestCase):
     """
     dtec = tree_config_pb2.DecisionTreeEnsembleConfig()
     text_format.Merge(dtec_str, dtec)
-    feature_columns = ["feature_b", "feature_a", "feature_d"]
+    feature_columns = [
+        "feature_b",
+        "feature_a",
+        "feature_a_m",
+        "feature_d",
+    ]
     return dtec, feature_columns
 
   def testConvertModel(self):
     dtec, feature_columns = self._make_trees()
+    # Assume 2 sparse float columns, one with 1 dimension, the second one with
+    # 5 dimensions.
     # The feature columns in the order they were added.
     out = custom_export_strategy.convert_to_universal_format(
-        dtec, feature_columns, 1, 1,
-        1)
+        dtec, feature_columns, 1, 2, 1)
+    # Features a and a_m are sparse float features, a_m is multidimensional.
     expected_tree = """
-    features { key: "feature_a" }
+    features { key: "feature_a_0" }
+    features { key: "feature_a_m_3" }
     features { key: "feature_b" }
     features { key: "feature_d" }
     model {
@@ -169,7 +202,6 @@ class ConvertModelTest(test_util.TensorFlowTestCase):
                   }
                 }
               }
-
               nodes {
                 node_id {
                   value: 1
@@ -196,7 +228,7 @@ class ConvertModelTest(test_util.TensorFlowTestCase):
                   inequality_left_child_test {
                     feature_id {
                       id {
-                        value: "feature_a"
+                        value: "feature_a_0"
                       }
                     }
                     threshold {
@@ -259,14 +291,51 @@ class ConvertModelTest(test_util.TensorFlowTestCase):
                 node_id {
                   value: 6
                 }
+                binary_node {
+                  left_child_id {
+                    value: 7
+                  }
+                  right_child_id {
+                    value: 8
+                  }
+                  default_direction: RIGHT
+                  inequality_left_child_test {
+                      feature_id {
+                        id {
+                          value: "feature_a_m_3"
+                        }
+                      }
+                      threshold {
+                        float_value: -0.4
+                      }
+                  }
+                }
+              }
+              nodes {
+                node_id {
+                  value: 7
+                }
                 leaf {
                   vector {
                     value {
-                      float_value: 0.03
+                      float_value: 0.036
                     }
                   }
                 }
               }
+              nodes {
+                node_id {
+                  value: 8
+                }
+                leaf {
+                  vector {
+                    value {
+                      float_value: 1.8
+                    }
+                  }
+                }
+              }
+
             }
           }
           submodel_id {
@@ -280,12 +349,15 @@ class ConvertModelTest(test_util.TensorFlowTestCase):
   def testFeatureImportance(self):
     dtec, feature_columns = self._make_trees()
     feature_importances = custom_export_strategy._get_feature_importances(
-        dtec, feature_columns, 1, 1, 1)
-    self.assertItemsEqual(["feature_b", "feature_a", "feature_d"],
-                          feature_importances.keys())
+        dtec, feature_columns, 1, 2, 1)
+    self.assertItemsEqual(
+        ["feature_b", "feature_a_0", "feature_a_m_3", "feature_d"],
+        feature_importances.keys())
     self.assertAlmostEqual(50.0, feature_importances["feature_b"], places=4)
-    self.assertAlmostEqual(50.0, feature_importances["feature_a"], places=4)
+    self.assertAlmostEqual(50.0, feature_importances["feature_a_0"], places=4)
     self.assertAlmostEqual(50.0, feature_importances["feature_d"], places=4)
+    self.assertAlmostEqual(
+        360.0, feature_importances["feature_a_m_3"], places=4)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
new file mode 100644
index 0000000000000000000000000000000000000000..cec3892b57655dc967b4e7926f7f5a6a30084487
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
@@ -0,0 +1,515 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TensorFlow estimators for combined DNN + GBDT training model.
+
+The combined model trains a DNN first, then trains boosted trees to boost the
+logits of the DNN. The input layer of the DNN (including the embeddings learned
+over sparse features) can optionally be provided to the boosted trees as
+an additional input feature.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.contrib import layers
+from tensorflow.contrib.boosted_trees.estimator_batch import trainer_hooks
+from tensorflow.contrib.boosted_trees.python.ops import model_ops
+from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch
+from tensorflow.contrib.layers.python.layers import optimizers
+from tensorflow.contrib.learn.python.learn.estimators import estimator
+from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
+from tensorflow.contrib.learn.python.learn.estimators import model_fn
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.summary import summary
+from tensorflow.python.training import training_util
+
+
+_DNN_LEARNING_RATE = 0.001
+
+
+def _get_optimizer(optimizer):
+  if callable(optimizer):
+    return optimizer()
+  else:
+    return optimizer
+
+
+def _add_hidden_layer_summary(value, tag):
+  summary.scalar("%s_fraction_of_zero_values" % tag, nn.zero_fraction(value))
+  summary.histogram("%s_activation" % tag, value)
+
+
+def _dnn_tree_combined_model_fn(
+    features, labels, mode, head, dnn_hidden_units,
+    dnn_feature_columns, tree_learner_config, num_trees,
+    tree_examples_per_layer,
+    config=None, dnn_optimizer="Adagrad",
+    dnn_activation_fn=nn.relu, dnn_dropout=None,
+    dnn_input_layer_partitioner=None,
+    dnn_input_layer_to_tree=True, dnn_steps_to_train=10000,
+    tree_feature_columns=None,
+    tree_center_bias=True):
+  """DNN and GBDT combined model_fn.
+
+  Args:
+    features: `dict` of `Tensor` objects.
+    labels: Labels used to train on.
+    mode: Mode we are in. (TRAIN/EVAL/INFER)
+    head: A `Head` instance.
+    dnn_hidden_units: List of hidden units per layer.
+    dnn_feature_columns: An iterable containing all the feature columns
+      used by the model's DNN.
+    tree_learner_config: A config for the tree learner.
+    num_trees: Number of trees to grow model to after training DNN.
+    tree_examples_per_layer: Number of examples to accumulate before
+      growing the tree a layer. This value has a big impact on model
+      quality and should be set equal to the number of examples in
+      training dataset if possible. It can also be a function that computes
+      the number of examples based on the depth of the layer that's
+      being built.
+    config: `RunConfig` of the estimator.
+    dnn_optimizer: string, `Optimizer` object, or callable that defines the
+      optimizer to use for training the DNN. If `None`, will use the Adagrad
+      optimizer with default learning rate of 0.001.
+    dnn_activation_fn: Activation function applied to each layer of the DNN.
+      If `None`, will use `tf.nn.relu`.
+    dnn_dropout: When not `None`, the probability to drop out a given
+      unit in the DNN.
+    dnn_input_layer_partitioner: Partitioner for input layer of the DNN.
+      Defaults to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
+    dnn_input_layer_to_tree: Whether to provide the DNN's input layer
+    as a feature to the tree.
+    dnn_steps_to_train: Number of steps to train dnn for before switching
+      to gbdt.
+    tree_feature_columns: An iterable containing all the feature columns
+      used by the model's boosted trees. If dnn_input_layer_to_tree is
+      set to True, these features are in addition to dnn_feature_columns.
+    tree_center_bias: Whether a separate tree should be created for
+      first fitting the bias.
+
+  Returns:
+    A `ModelFnOps` object.
+  Raises:
+    ValueError: if inputs are not valid.
+  """
+  if not isinstance(features, dict):
+    raise ValueError("features should be a dictionary of `Tensor`s. "
+                     "Given type: {}".format(type(features)))
+
+  if not dnn_feature_columns:
+    raise ValueError("dnn_feature_columns must be specified")
+
+  # Build DNN Logits.
+  dnn_parent_scope = "dnn"
+  dnn_partitioner = dnn_input_layer_partitioner or (
+      partitioned_variables.min_max_variable_partitioner(
+          max_partitions=config.num_ps_replicas,
+          min_slice_size=64 << 20))
+
+  with variable_scope.variable_scope(
+      dnn_parent_scope,
+      values=tuple(six.itervalues(features)),
+      partitioner=dnn_partitioner):
+
+    with variable_scope.variable_scope(
+        "input_from_feature_columns",
+        values=tuple(six.itervalues(features)),
+        partitioner=dnn_partitioner) as input_layer_scope:
+      input_layer = layers.input_from_feature_columns(
+          columns_to_tensors=features,
+          feature_columns=dnn_feature_columns,
+          weight_collections=[dnn_parent_scope],
+          scope=input_layer_scope)
+    previous_layer = input_layer
+    for layer_id, num_hidden_units in enumerate(dnn_hidden_units):
+      with variable_scope.variable_scope(
+          "hiddenlayer_%d" % layer_id,
+          values=(previous_layer,)) as hidden_layer_scope:
+        net = layers.fully_connected(
+            previous_layer,
+            num_hidden_units,
+            activation_fn=dnn_activation_fn,
+            variables_collections=[dnn_parent_scope],
+            scope=hidden_layer_scope)
+        if dnn_dropout is not None and mode == model_fn.ModeKeys.TRAIN:
+          net = layers.dropout(net, keep_prob=(1.0 - dnn_dropout))
+      _add_hidden_layer_summary(net, hidden_layer_scope.name)
+      previous_layer = net
+    with variable_scope.variable_scope(
+        "logits",
+        values=(previous_layer,)) as logits_scope:
+      dnn_logits = layers.fully_connected(
+          previous_layer,
+          head.logits_dimension,
+          activation_fn=None,
+          variables_collections=[dnn_parent_scope],
+          scope=logits_scope)
+    _add_hidden_layer_summary(dnn_logits, logits_scope.name)
+
+    def _dnn_train_op_fn(loss):
+      """Returns the op to optimize the loss."""
+      return optimizers.optimize_loss(
+          loss=loss,
+          global_step=training_util.get_global_step(),
+          learning_rate=_DNN_LEARNING_RATE,
+          optimizer=_get_optimizer(dnn_optimizer),
+          name=dnn_parent_scope,
+          variables=ops.get_collection(
+              ops.GraphKeys.TRAINABLE_VARIABLES,
+              scope=dnn_parent_scope),
+          # Empty summaries to prevent optimizers from logging training_loss.
+          summaries=[])
+
+  # Build Tree Logits.
+  global_step = training_util.get_global_step()
+  with ops.device(global_step.device):
+    ensemble_handle = model_ops.tree_ensemble_variable(
+        stamp_token=0,
+        tree_ensemble_config="",  # Initialize an empty ensemble.
+        name="ensemble_model")
+
+  tree_features = features.copy()
+  if dnn_input_layer_to_tree:
+    tree_features["dnn_input_layer"] = input_layer
+    tree_feature_columns.append(layers.real_valued_column("dnn_input_layer"))
+  gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
+      is_chief=config.is_chief,
+      num_ps_replicas=config.num_ps_replicas,
+      ensemble_handle=ensemble_handle,
+      center_bias=tree_center_bias,
+      examples_per_layer=tree_examples_per_layer,
+      learner_config=tree_learner_config,
+      feature_columns=tree_feature_columns,
+      logits_dimension=head.logits_dimension,
+      features=tree_features)
+
+  with ops.name_scope("gbdt"):
+    predictions_dict = gbdt_model.predict(mode)
+    tree_logits = predictions_dict["predictions"]
+
+    def _tree_train_op_fn(loss):
+      """Returns the op to optimize the loss."""
+      update_op = gbdt_model.train(loss, predictions_dict, labels)
+      with ops.control_dependencies(
+          [update_op]), (ops.colocate_with(global_step)):
+        update_op = state_ops.assign_add(global_step, 1).op
+        return update_op
+
+  tree_train_logits = dnn_logits + tree_logits
+
+  def _no_train_op_fn(loss):
+    """Returns a no-op."""
+    del loss
+    return control_flow_ops.no_op()
+
+  model_fn_ops = head.create_model_fn_ops(
+      features=features,
+      mode=mode,
+      labels=labels,
+      train_op_fn=_no_train_op_fn,
+      logits=tree_train_logits)
+  dnn_train_op = head.create_model_fn_ops(
+      features=features,
+      mode=mode,
+      labels=labels,
+      train_op_fn=_dnn_train_op_fn,
+      logits=dnn_logits).train_op
+  tree_train_op = head.create_model_fn_ops(
+      features=tree_features,
+      mode=mode,
+      labels=labels,
+      train_op_fn=_tree_train_op_fn,
+      logits=tree_train_logits).train_op
+
+  if tree_center_bias:
+    num_trees += 1
+  finalized_trees, attempted_trees = gbdt_model.get_number_of_trees_tensor()
+
+  model_fn_ops.training_hooks.extend([
+      trainer_hooks.SwitchTrainOp(
+          dnn_train_op, dnn_steps_to_train, tree_train_op),
+      trainer_hooks.StopAfterNTrees(
+          num_trees, attempted_trees, finalized_trees)])
+
+  return model_fn_ops
+
+
+class DNNBoostedTreeCombinedClassifier(estimator.Estimator):
+  """A classifier that uses a combined DNN/GBDT model."""
+
+  def __init__(self,
+               dnn_hidden_units,
+               dnn_feature_columns,
+               tree_learner_config,
+               num_trees,
+               tree_examples_per_layer,
+               n_classes=2,
+               weight_column_name=None,
+               model_dir=None,
+               config=None,
+               label_name=None,
+               label_keys=None,
+               feature_engineering_fn=None,
+               dnn_optimizer="Adagrad",
+               dnn_activation_fn=nn.relu,
+               dnn_dropout=None,
+               dnn_input_layer_partitioner=None,
+               dnn_input_layer_to_tree=True,
+               dnn_steps_to_train=10000,
+               tree_feature_columns=None,
+               tree_center_bias=True):
+    """Initializes a DNNBoostedTreeCombinedClassifier instance.
+
+    Args:
+      dnn_hidden_units: List of hidden units per layer for DNN.
+      dnn_feature_columns: An iterable containing all the feature columns
+        used by the model's DNN.
+      tree_learner_config: A config for the tree learner.
+      num_trees: Number of trees to grow model to after training DNN.
+      tree_examples_per_layer: Number of examples to accumulate before
+        growing the tree a layer. This value has a big impact on model
+        quality and should be set equal to the number of examples in
+        training dataset if possible. It can also be a function that computes
+        the number of examples based on the depth of the layer that's
+        being built.
+      n_classes: The number of label classes.
+      weight_column_name: The name of weight column.
+      model_dir: Directory for model exports.
+      config: `RunConfig` of the estimator.
+      label_name: String, name of the key in label dict. Can be null if label
+        is a tensor (single headed models).
+      label_keys: Optional list of strings with size `[n_classes]` defining the
+        label vocabulary. Only supported for `n_classes` > 2.
+      feature_engineering_fn: Feature engineering function. Takes features and
+        labels which are the output of `input_fn` and returns features and
+        labels which will be fed into the model.
+      dnn_optimizer: string, `Optimizer` object, or callable that defines the
+        optimizer to use for training the DNN. If `None`, will use the Adagrad
+        optimizer with default learning rate.
+      dnn_activation_fn: Activation function applied to each layer of the DNN.
+        If `None`, will use `tf.nn.relu`.
+      dnn_dropout: When not `None`, the probability to drop out a given
+        unit in the DNN.
+      dnn_input_layer_partitioner: Partitioner for input layer of the DNN.
+        Defaults to `min_max_variable_partitioner` with `min_slice_size`
+        64 << 20.
+      dnn_input_layer_to_tree: Whether to provide the DNN's input layer
+      as a feature to the tree.
+      dnn_steps_to_train: Number of steps to train dnn for before switching
+        to gbdt.
+      tree_feature_columns: An iterable containing all the feature columns
+        used by the model's boosted trees. If dnn_input_layer_to_tree is
+        set to True, these features are in addition to dnn_feature_columns.
+      tree_center_bias: Whether a separate tree should be created for
+        first fitting the bias.
+    """
+    head = head_lib.multi_class_head(
+        n_classes=n_classes,
+        label_name=label_name,
+        label_keys=label_keys,
+        weight_column_name=weight_column_name,
+        enable_centered_bias=False)
+
+    def _model_fn(features, labels, mode, config):
+      return _dnn_tree_combined_model_fn(
+          features, labels, mode, head, dnn_hidden_units, dnn_feature_columns,
+          tree_learner_config, num_trees, tree_examples_per_layer, config,
+          dnn_optimizer, dnn_activation_fn, dnn_dropout,
+          dnn_input_layer_partitioner, dnn_input_layer_to_tree,
+          dnn_steps_to_train,
+          tree_feature_columns, tree_center_bias)
+
+    super(DNNBoostedTreeCombinedClassifier, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir,
+        config=config, feature_engineering_fn=feature_engineering_fn)
+
+
+class DNNBoostedTreeCombinedRegressor(estimator.Estimator):
+  """A regressor that uses a combined DNN/GBDT model."""
+
+  def __init__(self,
+               dnn_hidden_units,
+               dnn_feature_columns,
+               tree_learner_config,
+               num_trees,
+               tree_examples_per_layer,
+               weight_column_name=None,
+               model_dir=None,
+               config=None,
+               label_name=None,
+               label_dimension=1,
+               feature_engineering_fn=None,
+               dnn_optimizer="Adagrad",
+               dnn_activation_fn=nn.relu,
+               dnn_dropout=None,
+               dnn_input_layer_partitioner=None,
+               dnn_input_layer_to_tree=True,
+               dnn_steps_to_train=10000,
+               tree_feature_columns=None,
+               tree_center_bias=True):
+    """Initializes a DNNBoostedTreeCombinedRegressor instance.
+
+    Args:
+      dnn_hidden_units: List of hidden units per layer for DNN.
+      dnn_feature_columns: An iterable containing all the feature columns
+        used by the model's DNN.
+      tree_learner_config: A config for the tree learner.
+      num_trees: Number of trees to grow model to after training DNN.
+      tree_examples_per_layer: Number of examples to accumulate before
+        growing the tree a layer. This value has a big impact on model
+        quality and should be set equal to the number of examples in
+        training dataset if possible. It can also be a function that computes
+        the number of examples based on the depth of the layer that's
+        being built.
+      weight_column_name: The name of weight column.
+      model_dir: Directory for model exports.
+      config: `RunConfig` of the estimator.
+      label_name: String, name of the key in label dict. Can be null if label
+        is a tensor (single headed models).
+      label_dimension: Number of regression labels per example. This is the size
+        of the last dimension of the labels `Tensor` (typically, this has shape
+        `[batch_size, label_dimension]`).
+      feature_engineering_fn: Feature engineering function. Takes features and
+        labels which are the output of `input_fn` and returns features and
+        labels which will be fed into the model.
+      dnn_optimizer: string, `Optimizer` object, or callable that defines the
+        optimizer to use for training the DNN. If `None`, will use the Adagrad
+        optimizer with default learning rate.
+      dnn_activation_fn: Activation function applied to each layer of the DNN.
+        If `None`, will use `tf.nn.relu`.
+      dnn_dropout: When not `None`, the probability to drop out a given
+        unit in the DNN.
+      dnn_input_layer_partitioner: Partitioner for input layer of the DNN.
+        Defaults to `min_max_variable_partitioner` with `min_slice_size`
+        64 << 20.
+      dnn_input_layer_to_tree: Whether to provide the DNN's input layer
+      as a feature to the tree.
+      dnn_steps_to_train: Number of steps to train dnn for before switching
+        to gbdt.
+      tree_feature_columns: An iterable containing all the feature columns
+        used by the model's boosted trees. If dnn_input_layer_to_tree is
+        set to True, these features are in addition to dnn_feature_columns.
+      tree_center_bias: Whether a separate tree should be created for
+        first fitting the bias.
+    """
+    head = head_lib.regression_head(
+        label_name=label_name,
+        label_dimension=label_dimension,
+        weight_column_name=weight_column_name,
+        enable_centered_bias=False)
+
+    # num_classes needed for GradientBoostedDecisionTreeModel
+    if label_dimension == 1:
+      tree_learner_config.num_classes = 2
+    else:
+      tree_learner_config.num_classes = label_dimension
+
+    def _model_fn(features, labels, mode, config):
+      return _dnn_tree_combined_model_fn(
+          features, labels, mode, head, dnn_hidden_units, dnn_feature_columns,
+          tree_learner_config, num_trees, tree_examples_per_layer, config,
+          dnn_optimizer, dnn_activation_fn, dnn_dropout,
+          dnn_input_layer_partitioner, dnn_input_layer_to_tree,
+          dnn_steps_to_train, tree_feature_columns, tree_center_bias)
+
+    super(DNNBoostedTreeCombinedRegressor, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir,
+        config=config, feature_engineering_fn=feature_engineering_fn)
+
+
+class DNNBoostedTreeCombinedEstimator(estimator.Estimator):
+  """An estimator that uses a combined DNN/GBDT model.
+
+  Useful for training with user specified `Head`.
+  """
+
+  def __init__(self,
+               dnn_hidden_units,
+               dnn_feature_columns,
+               tree_learner_config,
+               num_trees,
+               tree_examples_per_layer,
+               head,
+               model_dir=None,
+               config=None,
+               feature_engineering_fn=None,
+               dnn_optimizer="Adagrad",
+               dnn_activation_fn=nn.relu,
+               dnn_dropout=None,
+               dnn_input_layer_partitioner=None,
+               dnn_input_layer_to_tree=True,
+               dnn_steps_to_train=10000,
+               tree_feature_columns=None,
+               tree_center_bias=True):
+    """Initializes a DNNBoostedTreeCombinedEstimator instance.
+
+    Args:
+      dnn_hidden_units: List of hidden units per layer for DNN.
+      dnn_feature_columns: An iterable containing all the feature columns
+        used by the model's DNN.
+      tree_learner_config: A config for the tree learner.
+      num_trees: Number of trees to grow model to after training DNN.
+      tree_examples_per_layer: Number of examples to accumulate before
+        growing the tree a layer. This value has a big impact on model
+        quality and should be set equal to the number of examples in
+        training dataset if possible. It can also be a function that computes
+        the number of examples based on the depth of the layer that's
+        being built.
+      head: `Head` instance.
+      model_dir: Directory for model exports.
+      config: `RunConfig` of the estimator.
+      feature_engineering_fn: Feature engineering function. Takes features and
+        labels which are the output of `input_fn` and returns features and
+        labels which will be fed into the model.
+      dnn_optimizer: string, `Optimizer` object, or callable that defines the
+        optimizer to use for training the DNN. If `None`, will use the Adagrad
+        optimizer with default learning rate.
+      dnn_activation_fn: Activation function applied to each layer of the DNN.
+        If `None`, will use `tf.nn.relu`.
+      dnn_dropout: When not `None`, the probability to drop out a given
+        unit in the DNN.
+      dnn_input_layer_partitioner: Partitioner for input layer of the DNN.
+        Defaults to `min_max_variable_partitioner` with `min_slice_size`
+        64 << 20.
+      dnn_input_layer_to_tree: Whether to provide the DNN's input layer
+      as a feature to the tree.
+      dnn_steps_to_train: Number of steps to train dnn for before switching
+        to gbdt.
+      tree_feature_columns: An iterable containing all the feature columns
+        used by the model's boosted trees. If dnn_input_layer_to_tree is
+        set to True, these features are in addition to dnn_feature_columns.
+      tree_center_bias: Whether a separate tree should be created for
+        first fitting the bias.
+    """
+    def _model_fn(features, labels, mode, config):
+      return _dnn_tree_combined_model_fn(
+          features, labels, mode, head, dnn_hidden_units, dnn_feature_columns,
+          tree_learner_config, num_trees, tree_examples_per_layer, config,
+          dnn_optimizer, dnn_activation_fn, dnn_dropout,
+          dnn_input_layer_partitioner, dnn_input_layer_to_tree,
+          dnn_steps_to_train,
+          tree_feature_columns, tree_center_bias)
+
+    super(DNNBoostedTreeCombinedEstimator, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir,
+        config=config, feature_engineering_fn=feature_engineering_fn)
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..83d58c561008e8a5a69eb503d1605bb9e940f281
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
@@ -0,0 +1,105 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for combined DNN + GBDT estimators."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tempfile
+
+from tensorflow.contrib.boosted_trees.estimator_batch import dnn_tree_combined_estimator as estimator
+from tensorflow.contrib.boosted_trees.proto import learner_pb2
+from tensorflow.contrib.layers.python.layers import feature_column
+from tensorflow.contrib.learn.python.learn.estimators import estimator_test_utils
+from tensorflow.contrib.learn.python.learn.estimators import run_config
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import googletest
+
+
+def _train_input_fn():
+  features = {
+      "x": constant_op.constant([[2.], [1.], [1.]])
+  }
+  label = constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)
+  return features, label
+
+
+def _eval_input_fn():
+  features = {
+      "x": constant_op.constant([[1.], [2.], [2.]])
+  }
+  label = constant_op.constant([[0], [1], [1]], dtype=dtypes.int32)
+  return features, label
+
+
+class DNNBoostedTreeCombinedTest(test_util.TensorFlowTestCase):
+
+  def testClassifierContract(self):
+    estimator_test_utils.assert_estimator_contract(
+        self, estimator.DNNBoostedTreeCombinedClassifier)
+
+  def testRegressorContract(self):
+    estimator_test_utils.assert_estimator_contract(
+        self, estimator.DNNBoostedTreeCombinedRegressor)
+
+  def testEstimatorContract(self):
+    estimator_test_utils.assert_estimator_contract(
+        self, estimator.DNNBoostedTreeCombinedEstimator)
+
+  def testNoDNNFeatureColumns(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        "dnn_feature_columns must be specified"):
+      classifier = estimator.DNNBoostedTreeCombinedClassifier(
+          dnn_hidden_units=[1],
+          dnn_feature_columns=[],
+          tree_learner_config=learner_config,
+          num_trees=1,
+          tree_examples_per_layer=3,
+          n_classes=2)
+      classifier.fit(input_fn=_train_input_fn, steps=5)
+
+  def testFitAndEvaluateDontThrowException(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 1
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    classifier = estimator.DNNBoostedTreeCombinedClassifier(
+        dnn_hidden_units=[1],
+        dnn_feature_columns=[feature_column.real_valued_column("x")],
+        tree_learner_config=learner_config,
+        num_trees=1,
+        tree_examples_per_layer=3,
+        n_classes=2,
+        model_dir=model_dir,
+        config=config,
+        dnn_steps_to_train=10,
+        dnn_input_layer_to_tree=False,
+        tree_feature_columns=[feature_column.real_valued_column("x")])
+
+    classifier.fit(input_fn=_train_input_fn, steps=15)
+    classifier.evaluate(input_fn=_eval_input_fn, steps=1)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/trainer_hooks.py b/tensorflow/contrib/boosted_trees/estimator_batch/trainer_hooks.py
index 79193fffc3d3fa97e20a12181bf20e6ad86dcb58..2e4151cac40f770e2bece70d752122eb7f34dd40 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/trainer_hooks.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/trainer_hooks.py
@@ -24,6 +24,7 @@ from tensorflow.contrib.learn.python.learn import session_run_hook
 from tensorflow.contrib.learn.python.learn.session_run_hook import SessionRunArgs
 from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import training_util
 from tensorflow.python.training.summary_io import SummaryWriterCache
@@ -175,3 +176,40 @@ class StopAfterNTrees(session_run_hook.SessionRunHook):
       logging.info("Requesting stop since we have reached %d trees.",
                    num_finalized_trees)
       run_context.request_stop()
+
+
+class SwitchTrainOp(session_run_hook.SessionRunHook):
+  """Hook that switches the train op after specified number of steps.
+
+  Hook that replaces the train op depending on the number of steps of training
+  that have taken place. The first_train_op is used till train_steps steps
+  are reached. Thereafter the second_train_op is used.
+  """
+
+  def __init__(self, first_train_op, train_steps, second_train_op):
+    """Initializes a `SwitchTrainOp`."""
+    self._first_train_op = first_train_op
+    self._second_train_op = second_train_op
+    self._train_steps = train_steps
+
+  def _get_train_op_for_global_step(self, current_step):
+    """Gets train_op for current global step."""
+    if current_step < self._train_steps:
+      return self._first_train_op
+    return self._second_train_op
+
+  def begin(self):
+    self._global_step_tensor = training_util.get_global_step()
+    self._current_train_op = control_flow_ops.no_op()
+    if self._global_step_tensor is None:
+      raise RuntimeError(
+          "Global step should be created to use SwitchTrainOp.")
+
+  def before_run(self, run_context):  # pylint: disable=unused-argument
+    return session_run_hook.SessionRunArgs(
+        {"global_step": self._global_step_tensor,
+         "train_op": self._current_train_op})
+
+  def after_run(self, run_context, run_values):
+    self._current_train_op = self._get_train_op_for_global_step(
+        run_values.results["global_step"])
diff --git a/tensorflow/contrib/boosted_trees/examples/boston_combined.py b/tensorflow/contrib/boosted_trees/examples/boston_combined.py
new file mode 100644
index 0000000000000000000000000000000000000000..e04b56afbfd266dc13a5b0d78d171ea273415ee3
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/examples/boston_combined.py
@@ -0,0 +1,165 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Regression on Boston housing data using DNNBoostedTreeCombinedRegressor.
+
+  Example Usage:
+
+  python tensorflow/contrib/boosted_trees/examples/boston_combined.py \
+  --batch_size=404 --output_dir="/tmp/boston" \
+  --dnn_hidden_units="8,4" --dnn_steps_to_train=1000 \
+  --tree_depth=4 --tree_learning_rate=0.1 \
+  --num_trees=100 --tree_l2=0.001 --num_eval_steps=1 \
+  --vmodule=training_ops=1
+
+  When training is done, mean squared error on eval data is reported.
+  Point tensorboard to the directory for the run to see how the training
+  progresses:
+
+  tensorboard --logdir=/tmp/boston
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import sys
+import tensorflow as tf
+
+from tensorflow.contrib.boosted_trees.estimator_batch.dnn_tree_combined_estimator import DNNBoostedTreeCombinedRegressor
+from tensorflow.contrib.boosted_trees.proto import learner_pb2
+from tensorflow.contrib.layers.python.layers import feature_column
+from tensorflow.contrib.learn.python.learn import learn_runner
+from tensorflow.contrib.learn.python.learn.utils import input_fn_utils
+from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils
+
+_BOSTON_NUM_FEATURES = 13
+
+
+def _get_estimator(output_dir, feature_cols):
+  """Configures DNNBoostedTreeCombinedRegressor based on flags."""
+  learner_config = learner_pb2.LearnerConfig()
+  learner_config.learning_rate_tuner.fixed.learning_rate = (
+      FLAGS.tree_learning_rate)
+  learner_config.regularization.l1 = 0.0
+  learner_config.regularization.l2 = FLAGS.tree_l2
+  learner_config.constraints.max_tree_depth = FLAGS.tree_depth
+
+  run_config = tf.contrib.learn.RunConfig(save_summary_steps=1)
+
+  # Create a DNNBoostedTreeCombinedRegressor estimator.
+  estimator = DNNBoostedTreeCombinedRegressor(
+      dnn_hidden_units=[int(x) for x in FLAGS.dnn_hidden_units.split(",")],
+      dnn_feature_columns=feature_cols,
+      tree_learner_config=learner_config,
+      num_trees=FLAGS.num_trees,
+      # This should be the number of examples. For large datasets it can be
+      # larger than the batch_size.
+      tree_examples_per_layer=FLAGS.batch_size,
+      model_dir=output_dir,
+      config=run_config,
+      dnn_input_layer_to_tree=True,
+      dnn_steps_to_train=FLAGS.dnn_steps_to_train)
+  return estimator
+
+
+def _make_experiment_fn(output_dir):
+  """Creates experiment for DNNBoostedTreeCombinedRegressor."""
+  (x_train, y_train), (x_test,
+                       y_test) = tf.keras.datasets.boston_housing.load_data()
+
+  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={"x": x_train},
+      y=y_train,
+      batch_size=FLAGS.batch_size,
+      num_epochs=None,
+      shuffle=True)
+  eval_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={"x": x_test}, y=y_test, num_epochs=1, shuffle=False)
+
+  feature_columns = [
+      feature_column.real_valued_column("x", dimension=_BOSTON_NUM_FEATURES)
+  ]
+  feature_spec = tf.contrib.layers.create_feature_spec_for_parsing(
+      feature_columns)
+  serving_input_fn = input_fn_utils.build_parsing_serving_input_fn(feature_spec)
+  export_strategies = [
+      saved_model_export_utils.make_export_strategy(serving_input_fn)]
+  return tf.contrib.learn.Experiment(
+      estimator=_get_estimator(output_dir, feature_columns),
+      train_input_fn=train_input_fn,
+      eval_input_fn=eval_input_fn,
+      train_steps=None,
+      eval_steps=FLAGS.num_eval_steps,
+      eval_metrics=None,
+      export_strategies=export_strategies)
+
+
+def main(unused_argv):
+  learn_runner.run(
+      experiment_fn=_make_experiment_fn,
+      output_dir=FLAGS.output_dir,
+      schedule="train_and_evaluate")
+
+
+if __name__ == "__main__":
+  tf.logging.set_verbosity(tf.logging.INFO)
+  parser = argparse.ArgumentParser()
+  # Define the list of flags that users can change.
+  parser.add_argument(
+      "--batch_size",
+      type=int,
+      default=1000,
+      help="The batch size for reading data.")
+  parser.add_argument(
+      "--output_dir",
+      type=str,
+      required=True,
+      help="Choose the dir for the output.")
+  parser.add_argument(
+      "--num_eval_steps",
+      type=int,
+      default=1,
+      help="The number of steps to run evaluation for.")
+  # Flags for configuring DNNBoostedTreeCombinedRegressor.
+  parser.add_argument(
+      "--dnn_hidden_units",
+      type=str,
+      default="8,4",
+      help="Hidden layers for DNN.")
+  parser.add_argument(
+      "--dnn_steps_to_train",
+      type=int,
+      default=1000,
+      help="Number of steps to train DNN.")
+  parser.add_argument(
+      "--tree_depth", type=int, default=4, help="Maximum depth of trees.")
+  parser.add_argument(
+      "--tree_l2", type=float, default=1.0, help="l2 regularization per batch.")
+  parser.add_argument(
+      "--tree_learning_rate",
+      type=float,
+      default=0.1,
+      help=("Learning rate (shrinkage weight) with which each "
+            "new tree is added."))
+  parser.add_argument(
+      "--num_trees",
+      type=int,
+      default=None,
+      required=True,
+      help="Number of trees to grow before stopping.")
+
+  FLAGS, unparsed = parser.parse_known_args()
+  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/contrib/boosted_trees/kernels/model_ops.cc b/tensorflow/contrib/boosted_trees/kernels/model_ops.cc
index 4b5d5ba0de6c3995ee2da7a44ab0ba099cbf1b35..754b7bc3270d647fc381033b769eadd7b791771e 100644
--- a/tensorflow/contrib/boosted_trees/kernels/model_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/model_ops.cc
@@ -48,8 +48,9 @@ class CreateTreeEnsembleVariableOp : public OpKernel {
     if (!result->InitFromSerialized(tree_ensemble_config_t->scalar<string>()(),
                                     stamp_token)) {
       result->Unref();
-      OP_REQUIRES(context, false, errors::InvalidArgument(
-                                      "Unable to parse tree ensemble config."));
+      OP_REQUIRES(
+          context, false,
+          errors::InvalidArgument("Unable to parse tree ensemble config."));
     }
 
     // Only create one, if one does not exist already. Report status for all
diff --git a/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc b/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc
index f8086b0c2bb93eae6af0336bbe33fc23f8fcde22..b3fe38614e05801b223f0c96f7a70ce7e432a70b 100644
--- a/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc
@@ -47,8 +47,8 @@ namespace boosted_trees {
 using boosted_trees::learner::LearnerConfig;
 using boosted_trees::learner::LearningRateConfig;
 using boosted_trees::learner::LearningRateDropoutDrivenConfig;
-using boosted_trees::models::MultipleAdditiveTrees;
 using boosted_trees::models::DecisionTreeEnsembleResource;
+using boosted_trees::models::MultipleAdditiveTrees;
 using boosted_trees::utils::DropoutUtils;
 using boosted_trees::utils::TensorUtils;
 
diff --git a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
index 8600c8c53caa5fd4274ba6730fc764d8315d680c..0f4c2298f56be48bb32f52d5d44cff8afe284f1e 100644
--- a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
@@ -36,22 +36,21 @@
 namespace tensorflow {
 
 using ::boosted_trees::QuantileConfig;
-using boosted_trees::utils::TensorUtils;
 using boosted_trees::QuantileStreamResource;
+using boosted_trees::utils::TensorUtils;
 
 namespace {
 const char* const kExampleWeightsName = "example_weights";
 const char* const kMaxElementsName = "max_elements";
-const char* const kHandleName = "handle";
 const char* const kNextStampTokenName = "next_stamp_token";
 const char* const kStampTokenName = "stamp_token";
 const char* const kAreBucketsReadyName = "are_buckets_ready";
+const char* const kGenerateQuantiles = "generate_quantiles";
 // Names for sparse arguments.
 const char* const kNumSparseFeaturesName = "num_sparse_features";
 const char* const kSparseBucketsName = "sparse_buckets";
 const char* const kSparseValuesName = "sparse_values";
 const char* const kSparseIndicesName = "sparse_indices";
-const char* const kSparseStreamsStateName = "sparse_streams_state";
 const char* const kSparseSummariesName = "sparse_summaries";
 const char* const kSparseConfigName = "sparse_config";
 const char* const kSparseOutputTensorName = "sparse_quantiles";
@@ -59,7 +58,6 @@ const char* const kSparseOutputTensorName = "sparse_quantiles";
 const char* const kDenseBucketsName = "dense_buckets";
 const char* const kDenseConfigName = "dense_config";
 const char* const kDenseOutputTensorName = "dense_quantiles";
-const char* const kDenseStreamsStateName = "dense_streams_state";
 const char* const kDenseSummariesName = "dense_summaries";
 const char* const kDenseValuesName = "dense_values";
 const char* const kNumDenseFeaturesName = "num_dense_features";
@@ -182,6 +180,16 @@ std::vector<float> GenerateBoundaries(const QuantileStream& stream,
   return boundaries;
 }
 
+// Generates quantiles on a finalized QuantileStream.
+std::vector<float> GenerateQuantiles(const QuantileStream& stream,
+                                     int num_quantiles) {
+  // Do not de-dup boundaries. Exactly num_quantiles+1 boundary values
+  // will be returned.
+  std::vector<float> boundaries = stream.GenerateQuantiles(num_quantiles);
+  CHECK_EQ(boundaries.size(), num_quantiles + 1);
+  return boundaries;
+}
+
 // Copies quantiles to output list.
 void CopyBoundaries(OpKernelContext* const context,
                     const std::vector<float>& boundaries, const int64 index,
@@ -224,6 +232,8 @@ class CreateQuantileAccumulatorOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->GetAttr(kNumQuantilesName, &num_quantiles_));
     OP_REQUIRES_OK(context, context->GetAttr(kMaxElementsName, &max_elements_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr(kGenerateQuantiles, &generate_quantiles_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -231,9 +241,9 @@ class CreateQuantileAccumulatorOp : public OpKernel {
     // other exceptions. If one already exists, it unrefs the new one.
     const Tensor* stamp_token_t;
     OP_REQUIRES_OK(context, context->input(kStampTokenName, &stamp_token_t));
-    auto result =
-        new QuantileStreamResource(epsilon_, num_quantiles_, max_elements_,
-                                   stamp_token_t->scalar<int64>()());
+    auto result = new QuantileStreamResource(epsilon_, num_quantiles_,
+                                             max_elements_, generate_quantiles_,
+                                             stamp_token_t->scalar<int64>()());
     auto status = CreateResource(context, HandleFromInput(context, 0), result);
     if (!status.ok() && status.code() != tensorflow::error::ALREADY_EXISTS) {
       OP_REQUIRES(context, false, status);
@@ -246,6 +256,7 @@ class CreateQuantileAccumulatorOp : public OpKernel {
   // An upperbound on the number of enteries that the summaries might have
   // for a feature.
   int64 max_elements_;
+  bool generate_quantiles_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("CreateQuantileAccumulator").Device(DEVICE_CPU),
@@ -373,7 +384,7 @@ class MakeQuantileSummariesOp : public OpKernel {
         protobuf::Arena arena;
         ::boosted_trees::QuantileSummaryState* summary_proto =
             protobuf::Arena::CreateMessage<
-            ::boosted_trees::QuantileSummaryState>(&arena);
+                ::boosted_trees::QuantileSummaryState>(&arena);
         const auto& summary = stream.GetFinalSummary();
         CopySummaryToProto(summary, summary_proto);
         // Output to tensor.
@@ -597,10 +608,15 @@ class QuantileAccumulatorFlushOp : public OpKernel {
         << "Passed stamp token: " << stamp_token << " "
         << "Current token: " << streams_resource->stamp();
     QuantileStream* stream = streams_resource->stream(stamp_token);
+    bool generate_quantiles = streams_resource->generate_quantiles();
     stream->Finalize();
+
     streams_resource->set_boundaries(
         stamp_token,
-        GenerateBoundaries(*stream, streams_resource->num_quantiles()));
+        generate_quantiles
+            ? GenerateQuantiles(*stream, streams_resource->num_quantiles())
+            : GenerateBoundaries(*stream, streams_resource->num_quantiles()));
+
     streams_resource->Reset(next_stamp_token);
   }
 };
diff --git a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
index 18b4abd654ea3541d646a43ac901aca1a678446f..44a8ffaf4b2f5a9c11b3abc46ce55a18c80ad318 100644
--- a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
@@ -34,10 +34,10 @@
 
 namespace tensorflow {
 
+using boosted_trees::learner::LearnerConfig_MultiClassStrategy;
 using boosted_trees::learner::SplitInfo;
 using boosted_trees::learner::stochastic::GradientStats;
 using boosted_trees::learner::stochastic::NodeStats;
-using boosted_trees::learner::LearnerConfig_MultiClassStrategy;
 
 namespace {
 const int32 DUMMY_FEATURE_DIMENSION = -1;
@@ -47,9 +47,8 @@ class BaseBuildSplitOp : public OpKernel {
  public:
   explicit BaseBuildSplitOp(OpKernelConstruction* const context)
       : OpKernel(context) {
-    OP_REQUIRES_OK(
-        context,
-        context->GetAttr("feature_column_group_id", &feature_column_group_id_));
+    OP_REQUIRES_OK(context, context->GetAttr("feature_column_group_id",
+                                             &feature_column_group_id_));
     OP_REQUIRES_OK(context,
                    context->GetAttr("l1_regularization", &l1_regularization_));
     OP_REQUIRES_OK(context,
diff --git a/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc b/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
index a9a229c8ae0c26bba5f0a684dad7e546298577bb..90a0655201f8cb8df6fc6417cb51216dec91b4d7 100644
--- a/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
@@ -134,10 +134,9 @@ void SerializeScalarAccumulatorToOutput(
     OpKernelContext* context) {
   int64 num_slots = accumulator_resource.values().size();
   Tensor* partition_ids_t = nullptr;
-  OP_REQUIRES_OK(
-      context,
-      context->allocate_output("output_partition_ids", TensorShape({num_slots}),
-                               &partition_ids_t));
+  OP_REQUIRES_OK(context, context->allocate_output("output_partition_ids",
+                                                   TensorShape({num_slots}),
+                                                   &partition_ids_t));
   auto partition_ids = partition_ids_t->vec<int32>();
 
   // Feature ids tensor has ids of feature columns and their dimensions.
@@ -149,15 +148,14 @@ void SerializeScalarAccumulatorToOutput(
 
   Tensor* gradients_t = nullptr;
   OP_REQUIRES_OK(
-      context,
-      context->allocate_output("output_gradients", TensorShape({num_slots}),
-                               &gradients_t));
+      context, context->allocate_output(
+                   "output_gradients", TensorShape({num_slots}), &gradients_t));
   auto gradients = gradients_t->vec<float>();
 
   Tensor* hessians_t = nullptr;
-  OP_REQUIRES_OK(context,
-                 context->allocate_output(
-                     "output_hessians", TensorShape({num_slots}), &hessians_t));
+  OP_REQUIRES_OK(
+      context, context->allocate_output("output_hessians",
+                                        TensorShape({num_slots}), &hessians_t));
   auto hessians = hessians_t->vec<float>();
 
   int i = 0;
@@ -177,10 +175,9 @@ void SerializeTensorAccumulatorToOutput(
     OpKernelContext* context) {
   int64 num_slots = accumulator_resource.values().size();
   Tensor* partition_ids_t = nullptr;
-  OP_REQUIRES_OK(
-      context,
-      context->allocate_output("output_partition_ids", TensorShape({num_slots}),
-                               &partition_ids_t));
+  OP_REQUIRES_OK(context, context->allocate_output("output_partition_ids",
+                                                   TensorShape({num_slots}),
+                                                   &partition_ids_t));
   auto partition_ids = partition_ids_t->vec<int32>();
 
   Tensor* feature_ids_t = nullptr;
@@ -202,9 +199,8 @@ void SerializeTensorAccumulatorToOutput(
   int64 num_hessian_elements = hessian_shape.num_elements();
   hessian_shape.InsertDim(0, num_slots);
   Tensor* hessians_t = nullptr;
-  OP_REQUIRES_OK(
-      context,
-      context->allocate_output("output_hessians", hessian_shape, &hessians_t));
+  OP_REQUIRES_OK(context, context->allocate_output("output_hessians",
+                                                   hessian_shape, &hessians_t));
   auto hessians = hessians_t->flat_outer_dims<float>();
 
   int i = 0;
diff --git a/tensorflow/contrib/boosted_trees/kernels/training_ops.cc b/tensorflow/contrib/boosted_trees/kernels/training_ops.cc
index c77d90e243c304ec8e9a10a0b63401f9bd825c3e..7f8dea1d3c2a04b725843f6e2932a0cdfbc7733c 100644
--- a/tensorflow/contrib/boosted_trees/kernels/training_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/training_ops.cc
@@ -361,10 +361,27 @@ class GrowTreeEnsembleOp : public OpKernel {
     // Increment attempt stats.
     ensemble_resource->IncrementAttempts();
 
+    // In case we want to do feature selection and we have reached the limit,
+    // build a list of handlers used so far to avoid adding new features.
+    std::vector<int64> allowed_handlers;
+    if (learner_config_.constraints().max_number_of_unique_feature_columns() >
+        0) {
+      allowed_handlers = ensemble_resource->GetUsedHandlers();
+      // TODO(soroush): We can disable handlers that are not going to be used to
+      // avoid unnecessary computations.
+      if (allowed_handlers.size() <
+          learner_config_.constraints()
+              .max_number_of_unique_feature_columns()) {
+        // We have not reached the limit yet. Empty the list of allow features
+        // which means we can keep adding new features.
+        allowed_handlers.clear();
+      }
+    }
+
     // Find best splits for each active partition.
     std::map<int32, SplitCandidate> best_splits;
-    FindBestSplitsPerPartition(context, partition_ids_list, gains_list,
-                               splits_list, &best_splits);
+    FindBestSplitsPerPartition(context, allowed_handlers, partition_ids_list,
+                               gains_list, splits_list, &best_splits);
 
     // No-op if no new splits can be considered.
     if (best_splits.empty()) {
@@ -381,7 +398,8 @@ class GrowTreeEnsembleOp : public OpKernel {
 
     // Split tree nodes.
     for (auto& split_entry : best_splits) {
-      SplitTreeNode(split_entry.first, &split_entry.second, tree_config);
+      SplitTreeNode(split_entry.first, &split_entry.second, tree_config,
+                    ensemble_resource);
     }
 
     // Post-prune finalized tree if needed.
@@ -403,12 +421,20 @@ class GrowTreeEnsembleOp : public OpKernel {
   // Helper method which effectively does a reduce over all split candidates
   // and finds the best split for each partition.
   void FindBestSplitsPerPartition(
-      OpKernelContext* const context, const OpInputList& partition_ids_list,
-      const OpInputList& gains_list, const OpInputList& splits_list,
+      OpKernelContext* const context,
+      const std::vector<int64>& allowed_handlers,  // Empty means all handlers.
+      const OpInputList& partition_ids_list, const OpInputList& gains_list,
+      const OpInputList& splits_list,
       std::map<int32, SplitCandidate>* best_splits) {
     // Find best split per partition going through every feature candidate.
     // TODO(salehay): Is this worth parallelizing?
     for (int64 handler_id = 0; handler_id < num_handlers_; ++handler_id) {
+      if (!allowed_handlers.empty()) {
+        if (!std::binary_search(allowed_handlers.begin(),
+                                allowed_handlers.end(), handler_id)) {
+          continue;
+        }
+      }
       const auto& partition_ids = partition_ids_list[handler_id].vec<int32>();
       const auto& gains = gains_list[handler_id].vec<float>();
       const auto& splits = splits_list[handler_id].vec<string>();
@@ -592,8 +618,10 @@ class GrowTreeEnsembleOp : public OpKernel {
 
   // Helper method to split a tree node and append its respective
   // leaf children given the split candidate.
-  void SplitTreeNode(const int32 node_id, SplitCandidate* split,
-                     boosted_trees::trees::DecisionTreeConfig* tree_config) {
+  void SplitTreeNode(
+      const int32 node_id, SplitCandidate* split,
+      boosted_trees::trees::DecisionTreeConfig* tree_config,
+      boosted_trees::models::DecisionTreeEnsembleResource* ensemble_resource) {
     // No-op if we have no real node.
     CHECK(node_id < tree_config->nodes_size())
         << "Invalid node " << node_id << " to split.";
@@ -633,6 +661,9 @@ class GrowTreeEnsembleOp : public OpKernel {
     // Replace node in tree.
     (*tree_config->mutable_nodes(node_id)) =
         *split->split_info.mutable_split_node();
+    if (learner_config_.constraints().max_number_of_unique_feature_columns()) {
+      ensemble_resource->MaybeAddUsedHandler(split->handler_id);
+    }
   }
 
   void PruneTree(boosted_trees::trees::DecisionTreeConfig* tree_config) {
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
index 72e20aaa127cda592bd314786cddb925cc87a075..7df514cd207c5e781f3b4abaa2020016b197669d 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
@@ -436,7 +436,7 @@ def dense_make_stats_update(is_active, are_buckets_ready, float_column,
     quantized_feature = quantile_ops.quantiles([float_column], [],
                                                [quantile_buckets], [], [])
     quantized_feature = math_ops.cast(quantized_feature[0], dtypes.int64)
-    quantized_feature = array_ops.squeeze(quantized_feature)
+    quantized_feature = array_ops.squeeze(quantized_feature, axis=0)
     return (example_partition_ids, quantized_feature, gradients, hessians)
 
   def not_ready_inputs_fn():
@@ -468,7 +468,7 @@ def sparse_make_stats_update(
                                                [sparse_column_indices])
 
     quantized_feature = math_ops.cast(quantized_feature[1], dtypes.int64)
-    quantized_feature = array_ops.squeeze(quantized_feature)
+    quantized_feature = array_ops.squeeze(quantized_feature, axis=0)
 
     example_indices, _ = array_ops.split(
         sparse_column_indices, num_or_size_splits=2, axis=1)
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py
index ee16a5f838a65f20db4436eb86527518621b6d8d..54d03018d9e266beabbbabd78ebbb80cfe689c04 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py
@@ -1121,6 +1121,87 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase):
     self.assertEqual(len(gains), 0)
     self.assertEqual(len(splits), 0)
 
+  def testDegenerativeCase(self):
+    with self.test_session() as sess:
+      # One data example only, one leaf and thus one quantile bucket.The same
+      # situation is when all examples have the same values. This case was
+      # causing before a failure.
+      gradients = array_ops.constant([0.2])
+      hessians = array_ops.constant([0.12])
+      example_partitions = array_ops.constant([1], dtype=dtypes.int32)
+      indices = array_ops.constant([[0, 0]], dtype=dtypes.int64)
+      values = array_ops.constant([0.58])
+      sparse_column = sparse_tensor.SparseTensor(indices, values, [1, 1])
+
+      gradient_shape = tensor_shape.scalar()
+      hessian_shape = tensor_shape.scalar()
+      class_id = -1
+
+      split_handler = ordinal_split_handler.SparseSplitHandler(
+          l1_regularization=0,
+          l2_regularization=2,
+          tree_complexity_regularization=0,
+          min_node_weight=0,
+          epsilon=0.01,
+          num_quantiles=2,
+          feature_column_group_id=0,
+          sparse_float_column=sparse_column,
+          init_stamp_token=0,
+          gradient_shape=gradient_shape,
+          hessian_shape=hessian_shape,
+          multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS)
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      empty_gradients, empty_hessians = get_empty_tensors(
+          gradient_shape, hessian_shape)
+      example_weights = array_ops.ones([1, 1], dtypes.float32)
+
+      update_1 = split_handler.update_stats_sync(
+          0,
+          example_partitions,
+          gradients,
+          hessians,
+          empty_gradients,
+          empty_hessians,
+          example_weights,
+          is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_1]):
+        are_splits_ready = split_handler.make_splits(0, 1, class_id)[0]
+
+      with ops.control_dependencies([are_splits_ready]):
+        update_2 = split_handler.update_stats_sync(
+            1,
+            example_partitions,
+            gradients,
+            hessians,
+            empty_gradients,
+            empty_hessians,
+            example_weights,
+            is_active=array_ops.constant([True, True]))
+      with ops.control_dependencies([update_2]):
+        are_splits_ready2, partitions, gains, splits = (
+            split_handler.make_splits(1, 2, class_id))
+        are_splits_ready, are_splits_ready2, partitions, gains, splits = (
+            sess.run([
+                are_splits_ready, are_splits_ready2, partitions, gains, splits
+            ]))
+
+    # During the first iteration, inequality split handlers are not going to
+    # have any splits. Make sure that we return not_ready in that case.
+    self.assertFalse(are_splits_ready)
+    self.assertTrue(are_splits_ready2)
+
+    self.assertAllEqual([1], partitions)
+    self.assertAllEqual([0.0], gains)
+
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[0])
+    split_node = split_info.split_node.sparse_float_binary_split_default_left
+
+    self.assertEqual(0, split_node.split.feature_column)
+
+    self.assertAllClose(0.58, split_node.split.threshold)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/class-partition-key.h b/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/class-partition-key.h
index e1bef0278846e7ff6abc91e8c57f780af45e8b41..3c54868951a6db93a8b685c8da4dfc78996b7b1f 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/class-partition-key.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/class-partition-key.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_CLASS_PARTITION_KEY_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_CLASS_PARTITION_KEY_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_CLASS_PARTITION_KEY_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_CLASS_PARTITION_KEY_H_
 
 #include "tensorflow/core/lib/hash/hash.h"
 
@@ -58,4 +58,4 @@ struct ClassPartitionKey {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_CLASS_PARTITION_KEY_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_CLASS_PARTITION_KEY_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/feature-stats-accumulator.h b/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/feature-stats-accumulator.h
index 3814edb5675be74794a08e00becb649f8fc53fdb..ec4e7c52bb5f4536a50192e1b5fcc019dd7b2511 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/feature-stats-accumulator.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/feature-stats-accumulator.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_FEATURE_STATS_ACCUMULATOR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_FEATURE_STATS_ACCUMULATOR_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_FEATURE_STATS_ACCUMULATOR_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_FEATURE_STATS_ACCUMULATOR_H_
 
 #include <unordered_map>
 #include <vector>
@@ -79,4 +79,4 @@ class FeatureStatsAccumulator {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_FEATURE_STATS_ACCUMULATOR_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_ACCUMULATORS_FEATURE_STATS_ACCUMULATOR_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner.h b/tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner.h
index aed0d9fdac108dff4576cc1563dae420340387be..37a71037041445e6a6fcf6290015b93cffef1618 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_PARTITIONERS_EXAMPLE_PARTITIONER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_PARTITIONERS_EXAMPLE_PARTITIONER_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_PARTITIONERS_EXAMPLE_PARTITIONER_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_PARTITIONERS_EXAMPLE_PARTITIONER_H_
 
 #include <vector>
 #include "tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h"
@@ -50,4 +50,4 @@ class ExamplePartitioner {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_PARTITIONERS_EXAMPLE_PARTITIONER_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_PARTITIONERS_EXAMPLE_PARTITIONER_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/feature-split-candidate.h b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/feature-split-candidate.h
index 339c2e0fded10e6a7b140da62e152e2868ffd164..382b85cf0b2c146f82fa79551c569b9c70d9b7a6 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/feature-split-candidate.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/feature-split-candidate.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 //
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_FEATURE_SPLIT_CANDIDATE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_FEATURE_SPLIT_CANDIDATE_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_FEATURE_SPLIT_CANDIDATE_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_FEATURE_SPLIT_CANDIDATE_H_
 
 #include "tensorflow/contrib/boosted_trees/lib/learner/common/stats/split-stats.h"
 #include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"
@@ -58,4 +58,4 @@ struct FeatureSplitCandidate {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_FEATURE_SPLIT_CANDIDATE_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_FEATURE_SPLIT_CANDIDATE_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/gradient-stats.h b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/gradient-stats.h
index 34e3ddb777242553d62035a51f1aec33d0f9ba54..3dd03215d88abc223a2d081d11901ffd3fb7aaa9 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/gradient-stats.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/gradient-stats.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_GRADIENT_STATS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_GRADIENT_STATS_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_GRADIENT_STATS_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_GRADIENT_STATS_H_
 
 #include <math.h>
 
@@ -190,4 +190,4 @@ inline GradientStats operator-(const GradientStats& a, const GradientStats& b) {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_GRADIENT_STATS_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_GRADIENT_STATS_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h
index 642a183aec5c7e591579fa5ee91d45729bfb624d..cd925f6b65e569538212e9c26aef0abc8482960b 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_NODE_STATS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_NODE_STATS_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_NODE_STATS_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_NODE_STATS_H_
 
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/Eigen/Eigenvalues"
@@ -298,4 +298,4 @@ struct NodeStats {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_NODE_STATS_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_NODE_STATS_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc
index f867e77d3ef0609774628b2a9c36ca52bcf2a957..8bca132acfde9397942b198db9a8d4c0e4d74897 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc
@@ -17,8 +17,8 @@
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/platform/test.h"
 
-using tensorflow::test::AsTensor;
 using std::vector;
+using tensorflow::test::AsTensor;
 
 namespace tensorflow {
 namespace boosted_trees {
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/split-stats.h b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/split-stats.h
index 054ccd9a8cd0be0c48b14cca013f15677deba900..81ee2774bdab91f492064455055181c56ef6a065 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/common/stats/split-stats.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/split-stats.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_SPLIT_STATS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_SPLIT_STATS_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_SPLIT_STATS_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_SPLIT_STATS_H_
 
 #include <string>
 
@@ -81,4 +81,4 @@ struct SplitStats {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_SPLIT_STATS_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_SPLIT_STATS_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.h b/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.h
index ee29a8aa797b96d41ec2d77bf831ee287d5443e7..cc3dc226cdbc88fc7010ada1e7f0e6c0a3913c5f 100644
--- a/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.h
+++ b/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_MODELS_MULTIPLE_ADDITIVE_TREES_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_MODELS_MULTIPLE_ADDITIVE_TREES_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_MODELS_MULTIPLE_ADDITIVE_TREES_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_MODELS_MULTIPLE_ADDITIVE_TREES_H_
 
 #include <vector>
 
@@ -45,4 +45,4 @@ class MultipleAdditiveTrees {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_MODELS_MULTIPLE_ADDITIVE_TREES_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_MODELS_MULTIPLE_ADDITIVE_TREES_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h
index 70037d5bd8f446bdbbfcc468edb8a76c05e4fab7..804b218f1c08338df80f8dd2e6135f5d92b9928e 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_BUFFER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_BUFFER_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_BUFFER_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_BUFFER_H_
 
 #include <algorithm>
 #include <unordered_map>
@@ -129,4 +129,4 @@ constexpr decltype(CompareFn())
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_BUFFER_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_BUFFER_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
index fd577ad712f228fa8016a48942511a3263aae5da..8ad97fedc923ac50bcaad86e0ba2c2e46df6821b 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_
 
+#include <cmath>
 #include <memory>
 #include <vector>
-#include <cmath>
 
 #include "tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h"
 #include "tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h"
@@ -322,4 +322,4 @@ WeightedQuantilesStream<ValueType, WeightType, CompareFn>::GetQuantileSpecs(
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_STREAM_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h
index c329c6d4f7363a7738b06648943fe1dbd065cce5..aec232f3cbb096f0aa51e4362a821882391f8027 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_SUMMARY_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_SUMMARY_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_SUMMARY_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_SUMMARY_H_
 
 #include <cstring>
 #include <vector>
@@ -334,4 +334,4 @@ constexpr decltype(CompareFn())
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_SUMMARY_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_QUANTILES_WEIGHTED_QUANTILES_SUMMARY_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/testutil/batch_features_testutil.h b/tensorflow/contrib/boosted_trees/lib/testutil/batch_features_testutil.h
index d95878ec87b9e903930d2016bb573eee2573f776..b98190b10dc88d5bba9023e771844a2bd6c9a45d 100644
--- a/tensorflow/contrib/boosted_trees/lib/testutil/batch_features_testutil.h
+++ b/tensorflow/contrib/boosted_trees/lib/testutil/batch_features_testutil.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_BATCH_FEATURES_TESTUTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_BATCH_FEATURES_TESTUTIL_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_BATCH_FEATURES_TESTUTIL_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_BATCH_FEATURES_TESTUTIL_H_
 
 #include "tensorflow/contrib/boosted_trees/lib/utils/batch_features.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -42,4 +42,4 @@ void RandomlyInitializeBatchFeatures(
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_BATCH_FEATURES_TESTUTIL_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_BATCH_FEATURES_TESTUTIL_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc b/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc
index cbe26ba918d384ad903fb854ca3e88e84d16a923..705b65e9db9f1aed9af1be153240d57e163c2d5b 100644
--- a/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc
+++ b/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc
@@ -22,9 +22,9 @@ namespace tensorflow {
 namespace boosted_trees {
 namespace testutil {
 
+using boosted_trees::trees::DenseFloatBinarySplit;
 using tensorflow::boosted_trees::trees::DecisionTreeConfig;
 using tensorflow::boosted_trees::trees::TreeNode;
-using boosted_trees::trees::DenseFloatBinarySplit;
 
 namespace {
 
diff --git a/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.h b/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.h
index 5e12429ba778344edda623d149e017661f1e0222..1838b4cee21afb5df72a9b902f0ec0ce6f7ac627 100644
--- a/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.h
+++ b/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_RANDOM_TREE_GEN_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_RANDOM_TREE_GEN_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_RANDOM_TREE_GEN_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_RANDOM_TREE_GEN_H_
 
 #include <memory>
 
@@ -72,4 +72,4 @@ class RandomTreeGen {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_RANDOM_TREE_GEN_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_RANDOM_TREE_GEN_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h
index 604ff02744b25b136bd935bf85635731730effe8..43526c229a65d45a2b0ced4aa1262d489526fc7b 100644
--- a/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h
+++ b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TREES_DECISION_TREE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TREES_DECISION_TREE_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TREES_DECISION_TREE_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TREES_DECISION_TREE_H_
 
 #include "tensorflow/contrib/boosted_trees/lib/utils/example.h"
 #include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"  // NOLINT
@@ -46,4 +46,4 @@ class DecisionTree {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TREES_DECISION_TREE_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TREES_DECISION_TREE_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h b/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
index badc629a118f768d5aa25ef1b94b8190e6910c7f..da5e7448519cb7f4092f7bbbe1b526271008ec22 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_BATCH_FEATURES_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_BATCH_FEATURES_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_BATCH_FEATURES_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_BATCH_FEATURES_H_
 
 #include <vector>
 #include "tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.h"
@@ -92,4 +92,4 @@ class BatchFeatures {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_BATCH_FEATURES_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_BATCH_FEATURES_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc b/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc
index 9de3e32b097a151b3bd6f5c30df2db0938b65e9c..609519e8b1153a27d987c5f9ca9bfcc9ee6717d6 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/batch_features_test.cc
@@ -25,8 +25,8 @@ namespace boosted_trees {
 namespace utils {
 namespace {
 
-using test::AsTensor;
 using errors::InvalidArgument;
+using test::AsTensor;
 
 class BatchFeaturesTest : public ::testing::Test {};
 
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc
index 38f0151255bbf4fcd87f1d0d76fd111649ee4a12..db34db998a7442c69f2ab468f4557d991429f4ee 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.cc
@@ -23,10 +23,10 @@
 #include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/platform/logging.h"
 
+using tensorflow::Status;
 using tensorflow::boosted_trees::learner::LearningRateDropoutDrivenConfig;
 using tensorflow::random::PhiloxRandom;
 using tensorflow::random::SimplePhilox;
-using tensorflow::Status;
 
 namespace tensorflow {
 namespace boosted_trees {
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.h b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.h
index c3f1c918ca5f603cf9470071017d8ee384dc9320..928bfbfe5c9394ab4083aabced4c8e1149bb10aa 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_DROPOUT_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_DROPOUT_UTILS_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_DROPOUT_UTILS_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_DROPOUT_UTILS_H_
 
 #include <unordered_set>
 #include <vector>
@@ -74,4 +74,4 @@ class DropoutUtils {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_DROPOUT_UTILS_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_DROPOUT_UTILS_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc
index ce7632e58987f5890beaded5dd305724f950e1e8..02f972c8e00e8229426ac53d8f20765484787b6e 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/dropout_utils_test.cc
@@ -26,9 +26,9 @@
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/env.h"
 
+using std::unordered_set;
 using tensorflow::boosted_trees::learner::LearningRateDropoutDrivenConfig;
 using tensorflow::boosted_trees::trees::DecisionTreeEnsembleConfig;
-using std::unordered_set;
 
 namespace tensorflow {
 namespace boosted_trees {
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/example.h b/tensorflow/contrib/boosted_trees/lib/utils/example.h
index 54f60e1dee49a4a40b84fcc6e042fac1858aa187..1371ff337f78dd1c38f2bd0ba86911642f3aeb3e 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/example.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/example.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 // =============================================================================
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLE_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLE_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLE_H_
 
 #include <algorithm>
 #include <unordered_set>
@@ -131,4 +131,4 @@ struct Example {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLE_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLE_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.h b/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.h
index 5b33c8158879ec65425ac77b5338ee98fbdf07db..1b654e1c44e545fb97216ad950f3cd2d3240ffd0 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 // =============================================================================
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLES_ITERABLE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLES_ITERABLE_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLES_ITERABLE_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLES_ITERABLE_H_
 
 #include <vector>
 
@@ -205,4 +205,4 @@ class ExamplesIterable {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLES_ITERABLE_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLES_ITERABLE_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/macros.h b/tensorflow/contrib/boosted_trees/lib/utils/macros.h
index 28ea0a4dc191af66ced574d78d9873cc8335f491..9a53fb2ef7d0581986885f3bc8233d91b67c0166 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/macros.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/macros.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 // =============================================================================
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_MACROS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_MACROS_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_MACROS_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_MACROS_H_
 
 #include "tensorflow/core/platform/macros.h"
 
@@ -23,4 +23,4 @@
     return (STATUS);                              \
   }
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_MACROS_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_MACROS_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/optional_value.h b/tensorflow/contrib/boosted_trees/lib/utils/optional_value.h
index c141fe059d48072c6c4495535eafec9633616d21..b2166f53d7a037fb8ec53d5295b98bb82b17d4c7 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/optional_value.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/optional_value.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 // =============================================================================
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_OPTIONAL_VALUE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_OPTIONAL_VALUE_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_OPTIONAL_VALUE_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_OPTIONAL_VALUE_H_
 
 #include "tensorflow/core/platform/logging.h"
 
@@ -44,4 +44,4 @@ class OptionalValue {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_OPTIONAL_VALUE_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_OPTIONAL_VALUE_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/parallel_for.h b/tensorflow/contrib/boosted_trees/lib/utils/parallel_for.h
index c80431b5587cecc0bce22f6150a69d30397529da..ec06787e1db69514c9e60f6d152f3b0c7de23842 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/parallel_for.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/parallel_for.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LIB_UTILS_PARALLEL_FOR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LIB_UTILS_PARALLEL_FOR_H_
+#ifndef TENSORFLOW_CONTRIB_LIB_UTILS_PARALLEL_FOR_H_
+#define TENSORFLOW_CONTRIB_LIB_UTILS_PARALLEL_FOR_H_
 
 #include "tensorflow/core/lib/core/threadpool.h"
 
@@ -30,4 +30,4 @@ void ParallelFor(int64 batch_size, int64 desired_parallelism,
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LIB_UTILS_PARALLEL_FOR_H_
+#endif  // TENSORFLOW_CONTRIB_LIB_UTILS_PARALLEL_FOR_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/random.h b/tensorflow/contrib/boosted_trees/lib/utils/random.h
index 6dd55fcacc42b88116737ab6fb413852ffc1473d..546d344f5585458f10699a644621f0adf26b6446 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/random.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/random.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LIB_UTILS_RANDOM_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LIB_UTILS_RANDOM_H_
+#ifndef TENSORFLOW_CONTRIB_LIB_UTILS_RANDOM_H_
+#define TENSORFLOW_CONTRIB_LIB_UTILS_RANDOM_H_
 
 #include "tensorflow/core/lib/random/simple_philox.h"
 
@@ -36,4 +36,4 @@ inline int32 PoissonBootstrap(random::SimplePhilox* rng) {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LIB_UTILS_RANDOM_H_
+#endif  // TENSORFLOW_CONTRIB_LIB_UTILS_RANDOM_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.cc b/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.cc
index 0d46565a1962b88cbb267f3d6043610758790578..1297aa884938f2f099a32568acc80c6cd8162651 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.cc
@@ -51,7 +51,7 @@ class IndicesRowIterator
     return tmp;
   }
 
-  reference operator*() { return iter_->ix()(row_idx_, 0); }
+  reference operator*() const { return iter_->ix()(row_idx_, 0); }
 
   pointer operator->() { return &iter_->ix()(row_idx_, 0); }
 
@@ -97,7 +97,7 @@ class IndicesRowIterator
   }
 
   bool operator<(const IndicesRowIterator& other) const {
-	return (row_idx_ < other.row_idx_);
+    return (row_idx_ < other.row_idx_);
   }
 
   bool operator==(const IndicesRowIterator& other) const {
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.h b/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.h
index 9664c9d1c6a0c0c8b1bbd1506944c54d2310c611..87fb1fbf5ae3cc6bcf25f68a180d1d9b21ef4d6f 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 // =============================================================================
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_SPARSE_COLUMN_ITERABLE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_SPARSE_COLUMN_ITERABLE_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_SPARSE_COLUMN_ITERABLE_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_SPARSE_COLUMN_ITERABLE_H_
 
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -127,4 +127,4 @@ class SparseColumnIterable {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_SPARSE_COLUMN_ITERABLE_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_SPARSE_COLUMN_ITERABLE_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/tensor_utils.h b/tensorflow/contrib/boosted_trees/lib/utils/tensor_utils.h
index 58f5e5a0d18788375cd8166d1fcbdc7c294ba5e2..475d3718eccc2b23260b7cf5286abdd31ef1bad6 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/tensor_utils.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/tensor_utils.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 // =============================================================================
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_TENSOR_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_TENSOR_UTILS_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_TENSOR_UTILS_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_TENSOR_UTILS_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -57,4 +57,4 @@ class TensorUtils {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_TENSOR_UTILS_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_TENSOR_UTILS_H_
diff --git a/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc b/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
index 1fa70bafddb0c94f47d006d5694bea941edaddf9..ae99d53a2cf805d70d60746cd44f73f7fd9dc6e2 100644
--- a/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
+++ b/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
@@ -19,8 +19,8 @@
 
 namespace tensorflow {
 namespace boosted_trees {
-using shape_inference::InferenceContext;
 using shape_inference::DimensionHandle;
+using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
 REGISTER_RESOURCE_HANDLE_OP(QuantileStreamResource);
@@ -39,6 +39,7 @@ REGISTER_OP("CreateQuantileAccumulator")
     .Attr("max_elements: int = 1099511627776")  // 1 << 40
     .Attr("epsilon: float")
     .Attr("num_quantiles: int")
+    .Attr("generate_quantiles: bool=False")
     .Input("quantile_accumulator_handle: resource")
     .Input("stamp_token: int64")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
diff --git a/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc
index 0d27ddaf3a1d540efee268c2bcca217077ff5871..5d0ebbf73ce1272b51a475f67984db3a181b7130 100644
--- a/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc
+++ b/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc
@@ -18,9 +18,9 @@
 
 namespace tensorflow {
 
+using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
-using shape_inference::DimensionHandle;
 
 REGISTER_OP("BuildDenseInequalitySplits")
     .Attr("feature_column_group_id: int")
diff --git a/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc b/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc
index 0354f7853cbedf22d0a299273b4dbd225b3121ab..179505eef01f79bb149137400468b84285fe478a 100644
--- a/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc
+++ b/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc
@@ -19,9 +19,9 @@
 
 namespace tensorflow {
 namespace boosted_trees {
+using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
-using shape_inference::DimensionHandle;
 
 REGISTER_RESOURCE_HANDLE_OP(StatsAccumulatorScalarResource);
 
diff --git a/tensorflow/contrib/boosted_trees/proto/learner.proto b/tensorflow/contrib/boosted_trees/proto/learner.proto
index 919e7cd81427c27cf892bc77998f52406d2bcf15..d84ba7438e7f03685d5bafca52ff8283f0fce898 100644
--- a/tensorflow/contrib/boosted_trees/proto/learner.proto
+++ b/tensorflow/contrib/boosted_trees/proto/learner.proto
@@ -22,6 +22,10 @@ message TreeConstraintsConfig {
 
   // Min hessian weight per node.
   float min_node_weight = 2;
+
+  // Maximum number of unique features used in the tree. Zero means there is no
+  // limit.
+  int64 max_number_of_unique_feature_columns = 3;
 }
 
 // LearningRateConfig describes all supported learning rate tuners.
diff --git a/tensorflow/contrib/boosted_trees/proto/tree_config.proto b/tensorflow/contrib/boosted_trees/proto/tree_config.proto
index fc570c1083d01a65760a456c109dad93afd9f62a..4407c4d981785a279b6296f4726a221cacb4c5b1 100644
--- a/tensorflow/contrib/boosted_trees/proto/tree_config.proto
+++ b/tensorflow/contrib/boosted_trees/proto/tree_config.proto
@@ -128,6 +128,10 @@ message GrowingMetadata {
   // Number of layers that we have attempted to build. After pruning, these
   // layers might have been removed.
   int64 num_layers_attempted = 2;
+
+  // Sorted list of column handlers that have been used in at least one split
+  // so far.
+  repeated int64 used_handler_ids = 3;
 }
 
 // DecisionTreeEnsembleConfig describes an ensemble of decision trees.
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
index 888d5c57ed33446c8b6f18d2d1e393647613d132..81f58de28cbe98bb996c6665114eeb0030ee52f9 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
@@ -106,9 +106,11 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
     |   6   |  16   |  [16, 17, 18, 19, 20, 21]
     """
 
+    num_quantiles = 3
     with self.test_session() as sess:
       accumulator = quantile_ops.QuantileAccumulator(
-          init_stamp_token=0, num_quantiles=3, epsilon=0.001, name="q1")
+          init_stamp_token=0, num_quantiles=num_quantiles,
+          epsilon=0.001, name="q1")
       resources.initialize_resources(resources.shared_resources()).run()
     input_column = array_ops.placeholder(dtypes.float32)
     weights = array_ops.placeholder(dtypes.float32)
@@ -131,8 +133,128 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
       buckets, are_ready_flush = (sess.run(
           [buckets, are_ready_flush]))
       self.assertEqual(True, are_ready_flush)
+      self.assertEqual(num_quantiles + 1, len(buckets))
       self.assertAllEqual([1, 86., 170., 253.], buckets)
 
+  def testStreamingQuantileBucketsLowPrecisionInput(self):
+    """Tests inputs that simulate low precision float16 values."""
+
+    num_quantiles = 3
+    # set generate_quantiles to True since the test will generate fewer
+    # boundaries otherwise.
+    with self.test_session() as sess:
+      accumulator = quantile_ops.QuantileAccumulator(
+          init_stamp_token=0, num_quantiles=num_quantiles,
+          epsilon=0.001, name="q1", generate_quantiles=True)
+      resources.initialize_resources(resources.shared_resources()).run()
+    input_column = array_ops.placeholder(dtypes.float32)
+    weights = array_ops.placeholder(dtypes.float32)
+    update = accumulator.add_summary(
+        stamp_token=0,
+        column=input_column,
+        example_weights=weights)
+
+    with self.test_session() as sess:
+      # This input is generated by integer in the range [2030, 2060]
+      # but represented by with float16 precision. Integers <= 2048 are
+      # exactly represented, whereas  numbers > 2048 are rounded; and hence
+      # numbers > 2048 are repeated. For precision loss / rounding, see:
+      # https://en.wikipedia.org/wiki/Half-precision_floating-point_format.
+      #
+      # The intent of the test is not handling of float16 values, but to
+      # validate the number of buckets is returned, in cases where  the input
+      # may contain repeated values.
+      inputs = [
+          2030.0, 2031.0, 2032.0, 2033.0, 2034.0, 2035.0, 2036.0, 2037.0,
+          2038.0, 2039.0, 2040.0, 2041.0, 2042.0, 2043.0, 2044.0, 2045.0,
+          2046.0, 2047.0, 2048.0, 2048.0, 2050.0, 2052.0, 2052.0, 2052.0,
+          2054.0, 2056.0, 2056.0, 2056.0, 2058.0, 2060.0
+      ]
+      sess.run(update,
+               {input_column: inputs,
+                weights: [1] * len(inputs)})
+
+    with self.test_session() as sess:
+      sess.run(accumulator.flush(stamp_token=0, next_stamp_token=1))
+      are_ready_flush, buckets = (accumulator.get_buckets(stamp_token=1))
+      buckets, are_ready_flush = (sess.run(
+          [buckets, are_ready_flush]))
+      self.assertEqual(True, are_ready_flush)
+      self.assertEqual(num_quantiles + 1, len(buckets))
+      self.assertAllEqual([2030, 2040, 2050, 2060], buckets)
+
+  def _testStreamingQuantileBucketsHelper(
+      self, inputs, num_quantiles=3, expected_buckets=None):
+    """Helper to test quantile buckets on different inputs."""
+
+    # set generate_quantiles to True since the test will generate fewer
+    # boundaries otherwise.
+    with self.test_session() as sess:
+      accumulator = quantile_ops.QuantileAccumulator(
+          init_stamp_token=0, num_quantiles=num_quantiles,
+          epsilon=0.001, name="q1", generate_quantiles=True)
+      resources.initialize_resources(resources.shared_resources()).run()
+    input_column = array_ops.placeholder(dtypes.float32)
+    weights = array_ops.placeholder(dtypes.float32)
+    update = accumulator.add_summary(
+        stamp_token=0,
+        column=input_column,
+        example_weights=weights)
+
+    with self.test_session() as sess:
+      sess.run(update,
+               {input_column: inputs,
+                weights: [1] * len(inputs)})
+
+    with self.test_session() as sess:
+      sess.run(accumulator.flush(stamp_token=0, next_stamp_token=1))
+      are_ready_flush, buckets = (accumulator.get_buckets(stamp_token=1))
+      buckets, are_ready_flush = (sess.run(
+          [buckets, are_ready_flush]))
+      self.assertEqual(True, are_ready_flush)
+      # By default, use 3 quantiles, 4 boundaries for simplicity.
+      self.assertEqual(num_quantiles + 1, len(buckets))
+      if expected_buckets:
+        self.assertAllEqual(buckets, expected_buckets)
+
+  def testStreamingQuantileBucketsRepeatedSingleValue(self):
+    inputs = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+    self._testStreamingQuantileBucketsHelper(inputs)
+
+  def testStreamingQ2antileBucketsRepeatedTwoValues(self):
+    inputs = [1, 1, 1, 2, 2, 2, 2, 2, 1, 1]
+    self._testStreamingQuantileBucketsHelper(inputs)
+
+  def testStreamingQ2antileBucketsRepeatedTwoValuesUnbalanced(self):
+    inputs = [7, 7, 7, 2, 7, 7, 2, 2, 7, 7]
+    self._testStreamingQuantileBucketsHelper(inputs)
+
+  def testStreamingQuantileBucketsFewerInputstThanBuckets(self):
+    inputs = [5]
+    self._testStreamingQuantileBucketsHelper(inputs)
+
+  def testStreamingQuantileBucketsEqualDistributionInSequence(self):
+    # Input pattern is of the form [1, 1, 1, 2, 2, 2, 3, 3, 3, ...]
+    ones = 100 * [1]
+    inputs = []
+    for i in range(1, 101):
+      inputs += [i * k for k in ones]
+    # Expect 100 equally spaced buckets.
+    expected_buckets = range(1, 101)
+    self._testStreamingQuantileBucketsHelper(
+        inputs, num_quantiles=99, expected_buckets=expected_buckets)
+
+  def testStreamingQuantileBucketsEqualDistributionInterleaved(self):
+    # Input pattern is of the form [1, 2, 3, 1, 2, 3, 1, 2, 3, ...]
+    sequence = range(1, 101)
+    inputs = []
+    for _ in range(1, 101):
+      inputs += sequence
+    # Expect 100 equally spaced buckets.
+    expected_buckets = range(1, 101)
+    self._testStreamingQuantileBucketsHelper(
+        inputs, num_quantiles=99, expected_buckets=expected_buckets)
+
   def testStreamingQuantileBuckets(self):
     """Sets up the quantile summary op test as follows.
 
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py
index c2e65b643df90e88aadb0bb9acaf692da35b1a16..8ca1aabacaf53b66aaba184962922294427d6803 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py
@@ -63,7 +63,7 @@ def _gen_learner_config(num_classes,
   if dropout_prob_of_skipping is not None:
     config.learning_rate_tuner.dropout.dropout_prob_of_skipping = (
         dropout_prob_of_skipping)
-  return config.SerializeToString()
+  return config
 
 
 def _gen_dense_split_info(fc, threshold, left_weight, right_weight):
@@ -145,7 +145,7 @@ class CenterTreeEnsembleBiasOpTest(test_util.TensorFlowTestCase):
           pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
           growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE,
           # Dropout does not change anything here.
-          dropout_probability=0.5)
+          dropout_probability=0.5).SerializeToString()
 
       # Center bias for the initial step.
       grads = constant_op.constant([0.4, -0.3])
@@ -296,7 +296,7 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
           growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE,
           # Dropout does not change anything here, tree is not finalized.
-          dropout_probability=0.5)
+          dropout_probability=0.5).SerializeToString()
 
       # Prepare handler inputs.
       # Note that handlers 1 & 3 have the same gain but different splits.
@@ -443,7 +443,7 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
           growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE,
           # Dropout does not change anything here - tree is not finalized.
-          dropout_probability=0.5)
+          dropout_probability=0.5).SerializeToString()
 
       # Prepare handler inputs.
       # Handler 1 only has a candidate for partition 1, handler 2 has candidates
@@ -632,7 +632,8 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           max_depth=1,
           min_node_weight=0,
           pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
-          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
+          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE).SerializeToString(
+          )
 
       # Prepare handler inputs.
       handler1_partitions = np.array([0], dtype=np.int32)
@@ -772,7 +773,8 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           max_depth=1,
           min_node_weight=0,
           pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
-          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
+          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE).SerializeToString(
+          )
 
       # Prepare handler inputs.
       # All handlers have negative gain.
@@ -837,7 +839,8 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           max_depth=1,
           min_node_weight=0,
           pruning_mode=learner_pb2.LearnerConfig.POST_PRUNE,
-          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
+          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE).SerializeToString(
+          )
 
       # Prepare handler inputs.
       # Note that handlers 1 & 3 have the same gain but different splits.
@@ -943,7 +946,8 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           max_depth=2,
           min_node_weight=0,
           pruning_mode=learner_pb2.LearnerConfig.POST_PRUNE,
-          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
+          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE).SerializeToString(
+          )
 
       # Prepare handler inputs.
       # All handlers have negative gain.
@@ -1090,7 +1094,8 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           max_depth=2,
           min_node_weight=0,
           pruning_mode=learner_pb2.LearnerConfig.POST_PRUNE,
-          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
+          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE).SerializeToString(
+          )
 
       # Prepare handler inputs.
       # Second handler has positive gain.
@@ -1330,7 +1335,7 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
           growing_mode=learner_pb2.LearnerConfig.LAYER_BY_LAYER,
           # Dropout will have no effect, since the tree will not be fully grown.
-          dropout_probability=1.0)
+          dropout_probability=1.0).SerializeToString()
 
       # Prepare handler inputs.
       # Handler 1 only has a candidate for partition 1, handler 2 has candidates
@@ -1538,7 +1543,7 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
           min_node_weight=0,
           pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
           growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE,
-          dropout_probability=1.0)
+          dropout_probability=1.0).SerializeToString()
 
       # Prepare handler inputs.
       handler1_partitions = np.array([0], dtype=np.int32)
@@ -1583,6 +1588,301 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(
           2, tree_ensemble_config.tree_metadata[2].num_tree_weight_updates)
 
+  def testGrowExistingEnsembleTreeWithFeatureSelectionCanStillGrow(self):
+    """Test growing a tree with feature selection."""
+    with self.test_session() as session:
+      # Create existing ensemble with one root split and one bias tree.
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      text_format.Merge("""
+        trees {
+          nodes {
+            leaf {
+              vector {
+                value: -0.32
+                value: 0.28
+              }
+            }
+          }
+        }
+        trees {
+          nodes {
+            categorical_id_binary_split {
+              feature_column: 3
+              feature_id: 7
+              left_id: 1
+              right_id: 2
+            }
+            node_metadata {
+              gain: 1.3
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: 2.3
+              }
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: -0.9
+              }
+            }
+          }
+        }
+        tree_weights: 0.7
+        tree_weights: 1
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        tree_metadata {
+          num_tree_weight_updates: 5
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        growing_metadata {
+          num_trees_attempted: 2
+          num_layers_attempted: 2
+          used_handler_ids: 2
+          used_handler_ids: 5
+        }
+      """, tree_ensemble_config)
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="tree_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = _gen_learner_config(
+          num_classes=2,
+          l1_reg=0,
+          l2_reg=0,
+          tree_complexity=0,
+          max_depth=1,
+          min_node_weight=0,
+          pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
+          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
+      # There are 2 handler_ids in used_handler_ids already but one of them
+      # is handler 2, so we can still grow trees.
+      learner_config.constraints.max_number_of_unique_feature_columns = 2
+      learner_config = learner_config.SerializeToString()
+      # Prepare handler inputs.
+      handler1_partitions = np.array([0], dtype=np.int32)
+      handler1_gains = np.array([7.62], dtype=np.float32)
+      handler1_split = [_gen_dense_split_info(5, 0.52, -4.375, 7.143)]
+      handler2_partitions = np.array([0], dtype=np.int32)
+      handler2_gains = np.array([0.63], dtype=np.float32)
+      handler2_split = [_gen_dense_split_info(2, 0.23, -0.6, 0.24)]
+      handler3_partitions = np.array([0], dtype=np.int32)
+      handler3_gains = np.array([7.62], dtype=np.float32)
+      handler3_split = [_gen_categorical_split_info(8, 7, -4.375, 7.143)]
+
+      # Grow tree ensemble.
+      grow_op = training_ops.grow_tree_ensemble(
+          tree_ensemble_handle,
+          stamp_token=0,
+          next_stamp_token=1,
+          learning_rate=1,
+          partition_ids=[
+              handler1_partitions, handler2_partitions, handler3_partitions
+          ],
+          gains=[handler1_gains, handler2_gains, handler3_gains],
+          splits=[handler1_split, handler2_split, handler3_split],
+          learner_config=learner_config,
+          dropout_seed=123,
+          center_bias=True)
+      session.run(grow_op)
+
+      # Expect a new tree to be added with the split from handler 1.
+      _, serialized = session.run(
+          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
+      tree_ensemble_config.ParseFromString(serialized)
+      self.assertEqual(3, len(tree_ensemble_config.trees))
+      self.assertEqual(
+          2, len(tree_ensemble_config.growing_metadata.used_handler_ids))
+
+  def testGrowExistingEnsembleTreeWithFeatureSelectionEmptyEnsemble(self):
+    """Test growing a tree with feature selection with empty ensemble."""
+    with self.test_session() as session:
+      # Create existing ensemble with one root split and one bias tree.
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="tree_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = _gen_learner_config(
+          num_classes=2,
+          l1_reg=0,
+          l2_reg=0,
+          tree_complexity=0,
+          max_depth=1,
+          min_node_weight=0,
+          pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
+          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
+      learner_config.constraints.max_number_of_unique_feature_columns = 2
+      learner_config = learner_config.SerializeToString()
+      # Prepare handler inputs.
+      handler1_partitions = np.array([0], dtype=np.int32)
+      handler1_gains = np.array([7.62], dtype=np.float32)
+      handler1_split = [_gen_dense_split_info(5, 0.52, -4.375, 7.143)]
+      handler2_partitions = np.array([0], dtype=np.int32)
+      handler2_gains = np.array([0.63], dtype=np.float32)
+      handler2_split = [_gen_dense_split_info(2, 0.23, -0.6, 0.24)]
+      handler3_partitions = np.array([0], dtype=np.int32)
+      handler3_gains = np.array([7.62], dtype=np.float32)
+      handler3_split = [_gen_categorical_split_info(8, 7, -4.375, 7.143)]
+
+      # Grow tree ensemble.
+      grow_op = training_ops.grow_tree_ensemble(
+          tree_ensemble_handle,
+          stamp_token=0,
+          next_stamp_token=1,
+          learning_rate=1,
+          partition_ids=[
+              handler1_partitions, handler2_partitions, handler3_partitions
+          ],
+          gains=[handler1_gains, handler2_gains, handler3_gains],
+          splits=[handler1_split, handler2_split, handler3_split],
+          learner_config=learner_config,
+          dropout_seed=123,
+          center_bias=True)
+      session.run(grow_op)
+
+      _, serialized = session.run(
+          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
+      tree_ensemble_config.ParseFromString(serialized)
+      self.assertEqual(1, len(tree_ensemble_config.trees))
+      self.assertEqual(
+          1, len(tree_ensemble_config.growing_metadata.used_handler_ids))
+
+  def testGrowExistingEnsembleTreeWithFeatureSelectionCantGrow(self):
+    """Test growing a tree with feature selection with empty ensemble."""
+    with self.test_session() as session:
+      # Create existing ensemble with one root split and one bias tree.
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      text_format.Merge("""
+        trees {
+          nodes {
+            leaf {
+              vector {
+                value: -0.32
+                value: 0.28
+              }
+            }
+          }
+        }
+        trees {
+          nodes {
+            categorical_id_binary_split {
+              feature_column: 3
+              feature_id: 7
+              left_id: 1
+              right_id: 2
+            }
+            node_metadata {
+              gain: 1.3
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: 2.3
+              }
+            }
+          }
+          nodes {
+            leaf {
+              sparse_vector {
+                index: 0
+                value: -0.9
+              }
+            }
+          }
+        }
+        tree_weights: 0.7
+        tree_weights: 1
+        tree_metadata {
+          num_tree_weight_updates: 1
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        tree_metadata {
+          num_tree_weight_updates: 5
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        growing_metadata {
+          num_trees_attempted: 2
+          num_layers_attempted: 2
+          used_handler_ids: 4
+          used_handler_ids: 5
+        }
+      """, tree_ensemble_config)
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="tree_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = _gen_learner_config(
+          num_classes=2,
+          l1_reg=0,
+          l2_reg=0,
+          tree_complexity=0,
+          max_depth=1,
+          min_node_weight=0,
+          pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
+          growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
+      learner_config.constraints.max_number_of_unique_feature_columns = 2
+      learner_config = learner_config.SerializeToString()
+      # Prepare handler inputs.
+      handler1_partitions = np.array([0], dtype=np.int32)
+      handler1_gains = np.array([7.62], dtype=np.float32)
+      handler1_split = [_gen_dense_split_info(5, 0.52, -4.375, 7.143)]
+      handler2_partitions = np.array([0], dtype=np.int32)
+      handler2_gains = np.array([0.63], dtype=np.float32)
+      handler2_split = [_gen_dense_split_info(2, 0.23, -0.6, 0.24)]
+      handler3_partitions = np.array([0], dtype=np.int32)
+      handler3_gains = np.array([7.62], dtype=np.float32)
+      handler3_split = [_gen_categorical_split_info(8, 7, -4.375, 7.143)]
+
+      # Grow tree ensemble.
+      grow_op = training_ops.grow_tree_ensemble(
+          tree_ensemble_handle,
+          stamp_token=0,
+          next_stamp_token=1,
+          learning_rate=1,
+          partition_ids=[
+              handler1_partitions, handler2_partitions, handler3_partitions
+          ],
+          gains=[handler1_gains, handler2_gains, handler3_gains],
+          splits=[handler1_split, handler2_split, handler3_split],
+          learner_config=learner_config,
+          dropout_seed=123,
+          center_bias=True)
+      session.run(grow_op)
+
+      _, serialized = session.run(
+          model_ops.tree_ensemble_serialize(tree_ensemble_handle))
+      tree_ensemble_config.ParseFromString(serialized)
+      # We can't grow a tree since we have reached the limit of 2 unique
+      # features [4, 5] and the only available splits are from
+      # handlers [0, 1, 2].
+      self.assertEqual(2, len(tree_ensemble_config.trees))
+      self.assertEqual(
+          2, len(tree_ensemble_config.growing_metadata.used_handler_ids))
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py b/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py
index 23168bf4935e92bcb5072348361ae04861641b6d..7a5f329b7ab3216972180ccbb4c85f2537175422 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py
@@ -81,32 +81,32 @@ def _scheduled_stamp_resource_op_runner(batch, stamp):
   if not batch:
     return
   arg_keys = set(batch[0].args.keys())
-  grouped_args = collections.defaultdict(list)
+  grouped_args = collections.OrderedDict()
   resource_handles = []
   # Check that the set of arguments is the same across all the scheduled ops.
   for op in batch:
     if set(op.args.keys()) != arg_keys:
       raise ValueError("Mismatching arguments: %s, %s.", op.args, arg_keys)
     for key in arg_keys:
-      grouped_args[key].append(op.args[key])
+      grouped_args.setdefault(key, []).append(op.args[key])
     resource_handles.append(op.resource_handle)
   # Move all the inputs to the op device in one RPC.
-  grouped_args = {
-      k: _move_tensors(v, resource_handles[0].device)
-      for k, v in grouped_args.items()
-  }
+  grouped_args = collections.OrderedDict(
+      (k, _move_tensors(v, resource_handles[0].device))
+      for k, v in sorted(grouped_args.items()))
   with ops.device(resource_handles[0].device):
     return batch[0].op(resource_handles, stamp, **grouped_args)
 
 
 def run_handler_scheduled_ops(per_handler_ops, stamp, worker_device):
   """Given a dictionary of ops for each handler, runs them in batch."""
-  batched_ops = collections.defaultdict(list)
+  batched_ops = collections.OrderedDict()
   # Group the ops by their batching_key. Ops that share the same batching key
   # can be executed together.
   for handler in per_handler_ops.keys():
     for op in per_handler_ops[handler]:
-      batched_ops[(op.batching_key(), op.batch_runner_fn())].append(op)
+      key = (op.batching_key(), op.batch_runner_fn())
+      batched_ops.setdefault(key, []).append(op)
   op_results = {}
   for batch in batched_ops.values():
     # Run each of the batched ops using its runner.
diff --git a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
index 7e8e15e7d8c89d1adaa472b1da7e8bb3c73ca17e..97d57e8b23608d4c3a8719426a75056fc6417d1d 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
@@ -45,18 +45,24 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
                init_stamp_token,
                epsilon,
                num_quantiles,
+               max_elements=None,
                name=None,
-               container=None):
+               container=None,
+               generate_quantiles=False):
     """Creates a QuantileAccumulator object.
 
     Args:
       init_stamp_token: The initial value for the stamp token.
       epsilon: Error bound on the quantile computation.
       num_quantiles: Number of quantiles to produce from the final summary.
+      max_elements: Maximum number of elements added to the accumulator.
       name: the name to save the accumulator under.
       container: An optional `string`. Defaults to `""`
+      generate_quantiles: Generate quantiles instead of approximate boundaries.
+        If true, exactly `num_quantiles` will be produced in the final summary.
     """
     self._epsilon = epsilon
+    self._generate_quantiles = generate_quantiles
 
     name = _PATTERN.sub("", name)
     with ops.name_scope(name, "QuantileAccumulator") as name:
@@ -67,7 +73,9 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
           self._quantile_accumulator_handle,
           init_stamp_token,
           epsilon=epsilon,
-          num_quantiles=num_quantiles)
+          max_elements=max_elements,
+          num_quantiles=num_quantiles,
+          generate_quantiles=generate_quantiles)
       is_initialized_op = gen_quantile_ops.quantile_accumulator_is_initialized(
           self._quantile_accumulator_handle)
     resources.register_resource(self._quantile_accumulator_handle,
@@ -173,7 +181,14 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
         summaries=summary)
 
   def flush(self, stamp_token, next_stamp_token):
-    """Finalizes quantile summary stream and resets it for next iteration."""
+    """Finalizes quantile summary stream and resets it for next iteration.
+
+    Args:
+      stamp_token: Exepcted current token.
+      next_stamp_token: Next value for the token.
+    Returns:
+      A list of quantiles or approximate boundaries.
+    """
     return gen_quantile_ops.quantile_accumulator_flush(
         quantile_accumulator_handle=self._quantile_accumulator_handle,
         stamp_token=stamp_token,
diff --git a/tensorflow/contrib/boosted_trees/python/training/__init__.py b/tensorflow/contrib/boosted_trees/python/training/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b569ac5fdb60e0907c322ad73aca65645e548d94
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/training/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""training module under boosted_trees."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/__init__.py b/tensorflow/contrib/boosted_trees/python/training/functions/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1750117cd7c311515b4bca6882d55f496daac0e
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""functions module under boosted_trees."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index 6094dae6b59d8b05bb12a28cf167a536e6825287..f0b66dcbbe1c5167b9993e66b30b1dc8a839c380 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import copy
 
 from tensorflow.contrib import learn
@@ -163,7 +164,7 @@ def extract_features(features, feature_columns):
     scope = "gbdt"
     with variable_scope.variable_scope(scope):
       feature_columns = list(feature_columns)
-      transformed_features = {}
+      transformed_features = collections.OrderedDict()
       for fc in feature_columns:
         # pylint: disable=protected-access
         if isinstance(fc, feature_column_lib._EmbeddingColumn):
@@ -322,9 +323,11 @@ class GradientBoostedDecisionTreeModel(object):
     self._feature_columns = feature_columns
     self._learner_config_serialized = learner_config.SerializeToString()
     self._attempted_trees = variables.Variable(
-        initial_value=array_ops.zeros([], dtypes.int64), trainable=False)
+        initial_value=array_ops.zeros([], dtypes.int64), trainable=False,
+        name="attempted_trees")
     self._finalized_trees = variables.Variable(
-        initial_value=array_ops.zeros([], dtypes.int64), trainable=False)
+        initial_value=array_ops.zeros([], dtypes.int64), trainable=False,
+        name="finalized_trees")
     if not features:
       raise ValueError("Features dictionary must be specified.")
     (fc_names, dense_floats, sparse_float_indices, sparse_float_values,
@@ -679,13 +682,13 @@ class GradientBoostedDecisionTreeModel(object):
                               control_flow_ops.no_op))
 
     # Update handler stats.
-    handler_reads = {}
+    handler_reads = collections.OrderedDict()
     for handler in handlers:
       handler_reads[handler] = handler.scheduled_reads()
 
     handler_results = batch_ops_utils.run_handler_scheduled_ops(
         handler_reads, ensemble_stamp, worker_device)
-    per_handler_updates = {}
+    per_handler_updates = collections.OrderedDict()
     # Two values per handler. First one is if the handler is active for the
     # current layer. The second one is if the handler is going to be active
     # for the next layer.
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
index 16e24d97ddee0751e0b808b89080074c1b4baba7..dba51d4f527792d2a8dedc693f74c07119fd231d 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
@@ -912,8 +912,10 @@ class GbdtTest(test_util.TensorFlowTestCase):
       self.assertEqual(1,
                        len(output.trees[0].nodes[2].leaf.sparse_vector.index))
       self.assertEqual(3, output.trees[0].nodes[2].leaf.sparse_vector.index[0])
-      self.assertAlmostEqual(
-          0.893284678459, output.trees[0].nodes[2].leaf.sparse_vector.value[0])
+      self.assertAllClose(
+          0.893284678459,
+          output.trees[0].nodes[2].leaf.sparse_vector.value[0],
+          atol=1e-4, rtol=1e-4)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/boosted_trees/python/utils/__init__.py b/tensorflow/contrib/boosted_trees/python/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ceb150c26552584d631948f5eef2fedfa690894
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/python/utils/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""utils module under boosted_trees."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/boosted_trees/python/utils/losses.py b/tensorflow/contrib/boosted_trees/python/utils/losses.py
index 1e8b3ac08a74a94a0e5729e42ace91398a7b5c94..ab7ac2aba605db22a8ed370049b27d55cf1d413a 100644
--- a/tensorflow/contrib/boosted_trees/python/utils/losses.py
+++ b/tensorflow/contrib/boosted_trees/python/utils/losses.py
@@ -78,7 +78,7 @@ def per_example_maxent_loss(labels, weights, logits, num_classes, eps=1e-15):
 
   # Calculate softmax probabilities for each class.
   unnormalized_probs = math_ops.exp(logits)
-  normalizers = math_ops.reduce_sum(unnormalized_probs, 1, keep_dims=True)
+  normalizers = math_ops.reduce_sum(unnormalized_probs, 1, keepdims=True)
   softmax_predictions = math_ops.divide(unnormalized_probs,
                                         math_ops.add(normalizers, eps))
 
@@ -120,7 +120,7 @@ def per_example_squared_loss(labels, weights, predictions):
     update_op: An update operation to update the loss's internal state.
   """
   unweighted_loss = math_ops.reduce_sum(
-      math_ops.square(predictions - labels), 1, keep_dims=True)
+      math_ops.square(predictions - labels), 1, keepdims=True)
 
   return unweighted_loss * weights, control_flow_ops.no_op()
 
diff --git a/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h b/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h
index 284ad5cdb9abf374650940ade7bb36663d72c0dd..3ebf28ea442edf87815c39971ae9e01a2a8aae9a 100644
--- a/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h
+++ b/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_DECISION_TREE_ENSEMBLE_RESOURCE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_DECISION_TREE_ENSEMBLE_RESOURCE_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_DECISION_TREE_ENSEMBLE_RESOURCE_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_DECISION_TREE_ENSEMBLE_RESOURCE_H_
 
 #include "tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h"
 #include "tensorflow/contrib/boosted_trees/resources/stamped_resource.h"
@@ -111,6 +111,35 @@ class DecisionTreeEnsembleResource : public StampedResource {
     return decision_tree_ensemble_->tree_weights(index);
   }
 
+  void MaybeAddUsedHandler(const int32 handler_id) {
+    protobuf::RepeatedField<protobuf_int64>* used_ids =
+        decision_tree_ensemble_->mutable_growing_metadata()
+            ->mutable_used_handler_ids();
+    protobuf::RepeatedField<protobuf_int64>::iterator first =
+        std::lower_bound(used_ids->begin(), used_ids->end(), handler_id);
+    if (first == used_ids->end()) {
+      used_ids->Add(handler_id);
+      return;
+    }
+    if (handler_id == *first) {
+      // It is a duplicate entry.
+      return;
+    }
+    used_ids->Add(handler_id);
+    std::rotate(first, used_ids->end() - 1, used_ids->end());
+  }
+
+  std::vector<int64> GetUsedHandlers() const {
+    std::vector<int64> result;
+    result.reserve(
+        decision_tree_ensemble_->growing_metadata().used_handler_ids().size());
+    for (int64 h :
+         decision_tree_ensemble_->growing_metadata().used_handler_ids()) {
+      result.push_back(h);
+    }
+    return result;
+  }
+
   // Sets the weight of i'th tree, and increment num_updates in tree_metadata.
   void SetTreeWeight(const int32 index, const float weight,
                      const int32 increment_num_updates) {
@@ -150,4 +179,4 @@ class DecisionTreeEnsembleResource : public StampedResource {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_DECISION_TREE_ENSEMBLE_RESOURCE_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_DECISION_TREE_ENSEMBLE_RESOURCE_H_
diff --git a/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h b/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h
index fb29f79e578e8e52b67de631c527be35b7772b41..fdaaae7f472c8f564ab45a8366d3746cbf1158ee 100644
--- a/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h
+++ b/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_QUANTILE_STREAM_RESOURCE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_QUANTILE_STREAM_RESOURCE_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_QUANTILE_STREAM_RESOURCE_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_QUANTILE_STREAM_RESOURCE_H_
 
 #include "tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h"
 #include "tensorflow/contrib/boosted_trees/proto/quantiles.pb.h"  // NOLINT
@@ -32,12 +32,14 @@ using QuantileStream =
 class QuantileStreamResource : public StampedResource {
  public:
   QuantileStreamResource(const float epsilon, const int32 num_quantiles,
-                         const int64 max_elements, int64 stamp_token)
+                         const int64 max_elements, bool generate_quantiles,
+                         int64 stamp_token)
       : stream_(epsilon, max_elements),
         are_buckets_ready_(false),
         epsilon_(epsilon),
         num_quantiles_(num_quantiles),
-        max_elements_(max_elements) {
+        max_elements_(max_elements),
+        generate_quantiles_(generate_quantiles) {
     set_stamp(stamp_token);
   }
 
@@ -74,6 +76,11 @@ class QuantileStreamResource : public StampedResource {
     are_buckets_ready_ = are_buckets_ready;
   }
 
+  bool generate_quantiles() const { return generate_quantiles_; }
+  void set_generate_quantiles(bool generate_quantiles) {
+    generate_quantiles_ = generate_quantiles;
+  }
+
  private:
   ~QuantileStreamResource() override {}
 
@@ -95,10 +102,15 @@ class QuantileStreamResource : public StampedResource {
   const int32 num_quantiles_;
   // An upper-bound for the number of elements.
   int64 max_elements_;
+
+  // Generate quantiles instead of approximate boundaries.
+  // If true, exactly `num_quantiles` will be produced in the final summary.
+  bool generate_quantiles_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(QuantileStreamResource);
 };
 
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_QUANTILE_STREAM_RESOURCE_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_QUANTILE_STREAM_RESOURCE_H_
diff --git a/tensorflow/contrib/boosted_trees/resources/stamped_resource.h b/tensorflow/contrib/boosted_trees/resources/stamped_resource.h
index aabeeb98516eda6f7e8e7e296d6860fe5d8d5ec3..957bbe8d61d3dd32adba1a7f0cf840c69bce6273 100644
--- a/tensorflow/contrib/boosted_trees/resources/stamped_resource.h
+++ b/tensorflow/contrib/boosted_trees/resources/stamped_resource.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_STAMPED_RESOURCE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_STAMPED_RESOURCE_H_
+#ifndef TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_STAMPED_RESOURCE_H_
+#define TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_STAMPED_RESOURCE_H_
 
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -39,4 +39,4 @@ class StampedResource : public ResourceBase {
 
 }  // namespace boosted_trees
 }  // namespace tensorflow
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_STAMPED_RESOURCE_H_
+#endif  // TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_STAMPED_RESOURCE_H_
diff --git a/tensorflow/contrib/cloud/BUILD b/tensorflow/contrib/cloud/BUILD
index aa8f5ed12bc6f779e3c1a923b9225ec283189747..fe8bd072afd43a64fa62a65bd8900b5a98dbe761 100644
--- a/tensorflow/contrib/cloud/BUILD
+++ b/tensorflow/contrib/cloud/BUILD
@@ -60,9 +60,7 @@ tf_py_test(
     size = "small",
     srcs = ["python/ops/bigquery_reader_ops_test.py"],
     additional_deps = [
-        ":bigquery_reader_ops_op_lib",
         ":cloud_py",
-        "//tensorflow/contrib/cloud/kernels:bigquery_reader_ops",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.cc b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.cc
index 51821f6653550afd2d2e8a49b7337ff8ba0b5489..1bfd27305d569668a0bd67d876e59eec082296b3 100644
--- a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.cc
+++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/numbers.h"
 
 namespace tensorflow {
-
 namespace {
 
 constexpr size_t kBufferSize = 1024 * 1024;  // In bytes.
@@ -40,33 +39,6 @@ Status ParseJson(StringPiece json, Json::Value* result) {
   return Status::OK();
 }
 
-string ColumnTypeToString(BigQueryTableAccessor::ColumnType enum_type) {
-  switch (enum_type) {
-    case BigQueryTableAccessor::ColumnType::kRecord:
-      return "RECORD";
-    case BigQueryTableAccessor::ColumnType::kString:
-      return "STRING";
-    case BigQueryTableAccessor::ColumnType::kBytes:
-      return "BYTES";
-    case BigQueryTableAccessor::ColumnType::kInteger:
-      return "INTEGER";
-    case BigQueryTableAccessor::ColumnType::kFloat:
-      return "FLOAT";
-    case BigQueryTableAccessor::ColumnType::kBoolean:
-      return "BOOLEAN";
-    case BigQueryTableAccessor::ColumnType::kTimestamp:
-      return "TIMESTAMP";
-    case BigQueryTableAccessor::ColumnType::kDate:
-      return "DATE";
-    case BigQueryTableAccessor::ColumnType::kTime:
-      return "TIME";
-    case BigQueryTableAccessor::ColumnType::kDatetime:
-      return "DATETIME";
-    case BigQueryTableAccessor::ColumnType::kNone:
-      return "NONE";
-  }
-}
-
 Status ParseColumnType(const string& type,
                        BigQueryTableAccessor::ColumnType* enum_type) {
   if (type == "RECORD") {
@@ -202,22 +174,21 @@ Status BigQueryTableAccessor::ReadRow(int64* row_id, Example* example) {
     std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
     std::vector<char> output_buffer;
     output_buffer.reserve(kBufferSize);
-    TF_RETURN_IF_ERROR(request->Init());
 
     // The first time that we access BigQuery there is no page token. After that
     // we use the page token (which returns rows faster).
     if (!next_page_token_.empty()) {
-      TF_RETURN_IF_ERROR(request->SetUri(strings::StrCat(
+      request->SetUri(strings::StrCat(
           BigQueryUriPrefix(), "data?maxResults=", ComputeMaxResultsArg(),
-          "&pageToken=", request->EscapeString(next_page_token_))));
+          "&pageToken=", request->EscapeString(next_page_token_)));
       first_buffered_row_index_ += row_buffer_.size();
     } else {
-      TF_RETURN_IF_ERROR(request->SetUri(strings::StrCat(
+      request->SetUri(strings::StrCat(
           BigQueryUriPrefix(), "data?maxResults=", ComputeMaxResultsArg(),
-          "&startIndex=", first_buffered_row_index_)));
+          "&startIndex=", first_buffered_row_index_));
     }
-    TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
-    TF_RETURN_IF_ERROR(request->SetResultBuffer(&output_buffer));
+    request->AddAuthBearerHeader(auth_token);
+    request->SetResultBuffer(&output_buffer);
     TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when reading rows from ",
                                     FullTableName());
 
@@ -293,10 +264,9 @@ Status BigQueryTableAccessor::ReadSchema() {
   std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
   std::vector<char> output_buffer;
   output_buffer.reserve(kBufferSize);
-  TF_RETURN_IF_ERROR(request->Init());
-  TF_RETURN_IF_ERROR(request->SetUri(BigQueryUriPrefix()));
-  TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
-  TF_RETURN_IF_ERROR(request->SetResultBuffer(&output_buffer));
+  request->SetUri(BigQueryUriPrefix());
+  request->AddAuthBearerHeader(auth_token);
+  request->SetResultBuffer(&output_buffer);
   TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when reading schema for ",
                                   FullTableName());
 
diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.h b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.h
index 7d0eee59ae2f47503c4f8994ef356ce0dc336733..b349063715c903c982cfe2fb116b6525e35ff63b 100644
--- a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.h
+++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_PARTITION_ACCESSOR_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_PARTITION_ACCESSOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_PARTITION_ACCESSOR_H_
+#define TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_PARTITION_ACCESSOR_H_
 
 #include <map>
 #include <memory>
@@ -205,4 +205,4 @@ class BigQueryTableAccessor {
 };
 
 }  // namespace tensorflow
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_PARTITION_ACCESSOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_PARTITION_ACCESSOR_H_
diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h
index b2b11f4f57800d55ebc86273fcda71e673ff143a..fea6b15640ded74432f35112bc5d5d68e641c9dc 100644
--- a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h
+++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test_data.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_
+#define TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_
 
 #include <string>
 
@@ -399,6 +399,6 @@ const string kTestEmptyRow = R"({
     }]}]})";
 
 }  // namespace
-}  // namepsace tensorflow
+}  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CLOUD_BIGQUERY_TABLE_ACCESSOR_TEST_DATA_H_
diff --git a/tensorflow/contrib/cluster_resolver/BUILD b/tensorflow/contrib/cluster_resolver/BUILD
index 15abd2be0385eb776ff4f76484133efb6e34f076..80e18a43a71cc9d6c9e2ccf5836e50c6427a30f6 100644
--- a/tensorflow/contrib/cluster_resolver/BUILD
+++ b/tensorflow/contrib/cluster_resolver/BUILD
@@ -34,6 +34,7 @@ py_library(
         ":cluster_resolver_py",
         ":gce_cluster_resolver_py",
         ":tpu_cluster_resolver_py",
+        "//tensorflow/python:util",
     ],
 )
 
diff --git a/tensorflow/contrib/cluster_resolver/__init__.py b/tensorflow/contrib/cluster_resolver/__init__.py
index d17501e87e79158b1602ac6ddecc091bd86f2c2d..b4d8cd4a7cf42e910e7506dbeec8656a2cef62eb 100644
--- a/tensorflow/contrib/cluster_resolver/__init__.py
+++ b/tensorflow/contrib/cluster_resolver/__init__.py
@@ -26,3 +26,15 @@ from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import
 from tensorflow.contrib.cluster_resolver.python.training.gce_cluster_resolver import GceClusterResolver
 from tensorflow.contrib.cluster_resolver.python.training.tpu_cluster_resolver import TPUClusterResolver
 # pylint: enable=wildcard-import,unused-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    'ClusterResolver',
+    'SimpleClusterResolver',
+    'UnionClusterResolver',
+    'GceClusterResolver',
+    'TPUClusterResolver',
+]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index c74da9cabd6816bc9c7891e32937534cff2d677d..a6a6e642e4e4c721b94821a70d55d6fe931347d6 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -18,6 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+
+from six.moves.urllib.request import Request
+from six.moves.urllib.request import urlopen
+
 from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver
 from tensorflow.python.training.server_lib import ClusterSpec
 
@@ -38,10 +42,16 @@ class TPUClusterResolver(ClusterResolver):
   Cloud Platform project.
   """
 
+  def _requestComputeMetadata(self, path):
+    req = Request('http://metadata/computeMetadata/v1/%s' % path,
+                  headers={'Metadata-Flavor': 'Google'})
+    resp = urlopen(req)
+    return resp.read()
+
   def __init__(self,
-               project,
-               zone,
                tpu_names,
+               zone=None,
+               project=None,
                job_name='tpu_worker',
                credentials='default',
                service=None):
@@ -51,9 +61,13 @@ class TPUClusterResolver(ClusterResolver):
     for the IP addresses and ports of each Cloud TPU listed.
 
     Args:
-      project: Name of the GCP project containing Cloud TPUs
-      zone: Zone where the TPUs are located
       tpu_names: A list of names of the target Cloud TPUs.
+      zone: Zone where the TPUs are located. If omitted or empty, we will assume
+        that the zone of the TPU is the same as the zone of the GCE VM, which we
+        will try to discover from the GCE metadata service.
+      project: Name of the GCP project containing Cloud TPUs. If omitted or
+        empty, we will try to discover the project name of the GCE VM from the
+        GCE metadata service.
       job_name: Name of the TensorFlow job the TPUs belong to.
       credentials: GCE Credentials. If None, then we use default credentials
         from the oauth2client
@@ -65,6 +79,13 @@ class TPUClusterResolver(ClusterResolver):
       ImportError: If the googleapiclient is not installed.
     """
 
+    if not project:
+      project = self._requestComputeMetadata('/project/project-id')
+
+    if not zone:
+      zone_path = self._requestComputeMetadata('/instance/zone')
+      zone = zone_path.split('/')[-1]
+
     self._project = project
     self._zone = zone
     self._tpu_names = tpu_names
@@ -122,7 +143,8 @@ class TPUClusterResolver(ClusterResolver):
       request = self._service.projects().locations().nodes().get(name=full_name)
       response = request.execute()
 
-      instance_url = '%s:%s' % (response['ipAddress'], response['port'])
-      worker_list.append(instance_url)
+      if 'health' in response and response['health'] == 'HEALTHY':
+        instance_url = '%s:%s' % (response['ipAddress'], response['port'])
+        worker_list.append(instance_url)
 
     return ClusterSpec({self._job_name: worker_list})
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
index db7419be06b58e1c5737f69f2c7fd9fee44b9d95..4fd34629cf74f90869c77b8cb098d3c585a49404 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
@@ -48,6 +48,15 @@ class MockNodeClass(object):
     return MockRequestClass(name, self._tpu_map)
 
 
+def mock_request_compute_metadata(cls, *args, **kwargs):
+  del cls, kwargs  # Unused.
+  if args[0] == '/project/project-id':
+    return 'test-project'
+  elif args[0] == '/instance/zone':
+    return 'projects/test-project/locations/us-central1-c'
+  return ''
+
+
 class TPUClusterResolverTest(test.TestCase):
 
   def _verifyClusterSpecEquality(self, cluster_spec, expected_proto):
@@ -89,11 +98,37 @@ class TPUClusterResolverTest(test.TestCase):
 
     return mock_client
 
+  @mock.patch.object(TPUClusterResolver,
+                     '_requestComputeMetadata',
+                     mock_request_compute_metadata)
+  def testRetrieveProjectAndZoneFromMetadata(self):
+    tpu_map = {
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
+            'ipAddress': '10.1.2.3',
+            'port': '8470',
+            'health': 'HEALTHY'
+        }
+    }
+
+    tpu_cluster_resolver = TPUClusterResolver(
+        project=None,
+        zone=None,
+        tpu_names=['test-tpu-1'],
+        credentials=None,
+        service=self.mock_service_client(tpu_map=tpu_map))
+
+    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+    expected_proto = """
+    job { name: 'tpu_worker' tasks { key: 0 value: '10.1.2.3:8470' } }
+    """
+    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+
   def testSimpleSuccessfulRetrieval(self):
     tpu_map = {
         'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
             'ipAddress': '10.1.2.3',
-            'port': '8470'
+            'port': '8470',
+            'health': 'HEALTHY'
         }
     }
 
@@ -114,11 +149,13 @@ class TPUClusterResolverTest(test.TestCase):
     tpu_map = {
         'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
             'ipAddress': '10.1.2.3',
-            'port': '8470'
+            'port': '8470',
+            'health': 'HEALTHY'
         },
         'projects/test-project/locations/us-central1-c/nodes/test-tpu-2': {
             'ipAddress': '10.4.5.6',
-            'port': '8470'
+            'port': '8470',
+            'health': 'HEALTHY'
         }
     }
 
@@ -136,15 +173,54 @@ class TPUClusterResolverTest(test.TestCase):
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
 
+  def testHealthyTpuNodeRetrieval(self):
+    tpu_map = {
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
+            'ipAddress': '10.1.2.3',
+            'port': '8470',
+            'health': 'HEALTHY'
+        },
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-2': {
+            'ipAddress': '10.4.5.6',
+            'port': '8470',
+        },
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-3': {
+            'ipAddress': '10.7.8.9',
+            'port': '8470',
+            'health': 'UNHEALTHY'
+        }
+    }
+
+    tpu_cluster_resolver = TPUClusterResolver(
+        project='test-project',
+        zone='us-central1-c',
+        tpu_names=['test-tpu-2', 'test-tpu-1', 'test-tpu-3'],
+        credentials=None,
+        service=self.mock_service_client(tpu_map=tpu_map))
+
+    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+    expected_proto = """
+    job {
+      name: 'tpu_worker'
+      tasks {
+        key: 0
+        value: '10.1.2.3:8470'
+      }
+    }
+    """
+    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+
   def testGetMasterMultipleEntries(self):
     tpu_map = {
         'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
             'ipAddress': '10.1.2.3',
-            'port': '8470'
+            'port': '8470',
+            'health': 'HEALTHY'
         },
         'projects/test-project/locations/us-central1-c/nodes/test-tpu-2': {
             'ipAddress': '10.4.5.6',
-            'port': '8470'
+            'port': '8470',
+            'health': 'HEALTHY'
         }
     }
 
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index ba708673b0d562f928230f427406147ab22f0007..16317f538f3890661f1b59ea39fe67dcf04d0d0a 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -18,7 +18,6 @@ cmake_policy(SET CMP0022 NEW)
 
 # Options
 option(tensorflow_VERBOSE "Enable for verbose output" OFF)
-option(tensorflow_ENABLE_GPU "Enable GPU support" OFF)
 option(tensorflow_ENABLE_SSL_SUPPORT "Enable boringssl support" OFF)
 option(tensorflow_ENABLE_GRPC_SUPPORT "Enable gRPC support" ON)
 option(tensorflow_ENABLE_HDFS_SUPPORT "Enable HDFS support" OFF)
@@ -34,6 +33,13 @@ option(tensorflow_BUILD_SHARED_LIB "Build TensorFlow as a shared library" OFF)
 option(tensorflow_OPTIMIZE_FOR_NATIVE_ARCH "Enable compiler optimizations for the native processor architecture (if available)" ON)
 option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions")
 option(tensorflow_ENABLE_SNAPPY_SUPPORT "Enable SNAPPY compression support" ON)
+option(tensorflow_DISABLE_EIGEN_FORCEINLINE "Disable forceinline, to speed up build on windows." OFF)
+
+# GPU, CUDA and cuDNN options
+option(tensorflow_ENABLE_GPU "Enable GPU support" OFF)
+set(tensorflow_CUDA_VERSION "9.0" CACHE STRING "CUDA version to build against")
+set(tensorflow_CUDNN_VERSION "7" CACHE STRING "cuDNN version to build against")
+
 if(HAIKU)
 	option(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE "Enable PIE support" OFF)
 else()
@@ -46,6 +52,7 @@ if (NOT WIN32)
   # for targets that link ${CMAKE_THREAD_LIBS_INIT}.
   find_package (Threads)
 
+  # Options for linking CUDA/CUDNN libraries
   option(tensorflow_PATH_STATIC_LIB "Additional library search path for libcudnn_static.a, libnccl_static.a, libculibos.a" /usr/local/cuda/lib64/)
   option(tensorflow_CUDNN_INCLUDE "cudnn.h header install path" /usr/include/)
   if (NOT tensorflow_CUDNN_INCLUDE)
@@ -53,12 +60,28 @@ if (NOT WIN32)
     set(tensorflow_CUDNN_INCLUDE /usr/include)
   endif (NOT tensorflow_CUDNN_INCLUDE)
   option(tensorflow_PATH_CUDNN_STATIC_LIB "Override PATH_STATIC_LIB for libcudnn_static.a" ${tensorflow_PATH_STATIC_LIB})
+  if (NOT tensorflow_PATH_CUDNN_STATIC_LIB)
+    # option's default value is OFF. Fill it with real default values
+    set (tensorflow_PATH_CUDNN_STATIC_LIB ${tensorflow_PATH_STATIC_LIB})
+  endif (NOT tensorflow_PATH_CUDNN_STATIC_LIB)
   option(tensorflow_PATH_NCCL_STATIC_LIB "Override PATH_STATIC_LIB for libnccl_static.a" ${tensorflow_PATH_STATIC_LIB})
+  if (NOT tensorflow_PATH_NCCL_STATIC_LIB)
+    # option's default value is OFF. Fill it with real default values
+    set (tensorflow_PATH_NCCL_STATIC_LIB ${tensorflow_PATH_STATIC_LIB})
+  endif (NOT tensorflow_PATH_NCCL_STATIC_LIB)
   option(tensorflow_CUDA_LIBRARY_PATH "Designate the default CUDA library paths" /usr/local/cuda/lib64)
   if (NOT tensorflow_CUDA_LIBRARY_PATH)
     # option's default value is OFF. Fill it with real default values
     set(tensorflow_CUDA_LIBRARY_PATH /usr/local/cuda/lib64)
   endif (NOT tensorflow_CUDA_LIBRARY_PATH)
+
+  # Options for linking other libraries
+  option(systemlib_ZLIB "Use the system installed library as shared objects instead of downloading ZLIB and statically linking to it: ZLIB" OFF)
+
+  option(systemlib_ALL "Turn on every possible systemlib_* options" OFF)
+  if (systemlib_ALL)
+    set (systmelib_ZLIB ON)
+  endif (systemlib_ALL)
 endif()
 
 if (WIN32)
@@ -92,6 +115,13 @@ else()
 	set(CMAKE_POSITION_INDEPENDENT_CODE OFF)
 endif()
 
+# TODO(jart): We should make this only apply to snapfn.cc
+add_definitions(-DSQLITE_OMIT_LOAD_EXTENSION)
+
+if (tensorflow_DISABLE_EIGEN_FORCEINLINE)
+  add_definitions(-DEIGEN_STRONG_INLINE=inline)
+endif()
+
 add_definitions(-DEIGEN_AVOID_STL_ARRAY)
 if(WIN32)
   add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC)
@@ -113,6 +143,9 @@ if(WIN32)
   set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /D_ITERATOR_DEBUG_LEVEL=0")
   set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /D_ITERATOR_DEBUG_LEVEL=0")
   set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /D_ITERATOR_DEBUG_LEVEL=0")
+
+  # Try to avoid flaky failures due to failed generation of generate.stamp files.
+  set(CMAKE_SUPPRESS_REGENERATION ON)
 endif()
 
 if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
@@ -160,13 +193,14 @@ include(protobuf)
 include(re2)
 include(cub)
 include(sqlite)
-include(double_conversion)
 if (tensorflow_BUILD_CC_TESTS)
   include(googletest)
 endif()
 
+add_definitions(${ADD_CFLAGS})
+link_directories(${ADD_LINK_DIRECTORY})
+
 set(tensorflow_EXTERNAL_LIBRARIES
-    ${zlib_STATIC_LIBRARIES}
     ${gif_STATIC_LIBRARIES}
     ${png_STATIC_LIBRARIES}
     ${jpeg_STATIC_LIBRARIES}
@@ -179,8 +213,16 @@ set(tensorflow_EXTERNAL_LIBRARIES
     ${protobuf_STATIC_LIBRARIES}
     ${re2_STATIC_LIBRARIES}
     ${sqlite_STATIC_LIBRARIES}
-    ${double_conversion_STATIC_LIBRARIES}
 )
+
+if (systemlib_ZLIB)
+  set(tensorflow_EXTERNAL_LIBRARIES ${tensorflow_EXTERNAL_LIBRARIES}
+      ${ZLIB_LIBRARIES})
+else (systemlib_ZLIB)
+  set(tensorflow_EXTERNAL_LIBRARIES ${tensorflow_EXTERNAL_LIBRARIES}
+    ${zlib_STATIC_LIBRARIES})
+endif (systemlib_ZLIB)
+
 set(tensorflow_EXTERNAL_DEPENDENCIES
     zlib_copy_headers_to_destination
     gif_copy_headers_to_destination
@@ -198,7 +240,6 @@ set(tensorflow_EXTERNAL_DEPENDENCIES
     fft2d
     re2
     sqlite_copy_headers_to_destination
-    double_conversion
 )
 
 include_directories(
@@ -221,7 +262,6 @@ include_directories(
     ${PROTOBUF_INCLUDE_DIRS}
     ${re2_INCLUDE_DIR}
     ${sqlite_INCLUDE_DIR}
-    ${double_conversion_INCLUDE_DIR}
 )
 
 if(tensorflow_ENABLE_SSL_SUPPORT)
@@ -266,7 +306,21 @@ if (tensorflow_ENABLE_GPU)
     list(APPEND CMAKE_LIBRARY_PATH "${tensorflow_CUDA_LIBRARY_PATH}/stubs")
   endif (NOT WIN32)
 
-  find_package(CUDA 8.0 REQUIRED)
+  # later command will make use of the value in tensorflow_CUDA_VERSION
+  find_package(CUDA ${tensorflow_CUDA_VERSION} REQUIRED EXACT)
+
+  # Test compatibility of compiler on CUDA
+  try_compile(CUDA_TEST_COMPILE_C
+    ${CMAKE_CURRENT_BINARY_DIR}/tests/cuda
+    ${CMAKE_CURRENT_SOURCE_DIR}/tests/cuda/compatibility_test.c
+    CMAKE_FLAGS -DINCLUDE_DIRECTORIES=${CUDA_INCLUDE_DIRS})
+  try_compile(CUDA_TEST_COMPILE_CXX
+    ${CMAKE_CURRENT_BINARY_DIR}/tests/cuda
+    ${CMAKE_CURRENT_SOURCE_DIR}/tests/cuda/compatibility_test.cc
+    CMAKE_FLAGS -DINCLUDE_DIRECTORIES=${CUDA_INCLUDE_DIRS})
+  if(NOT (CUDA_TEST_COMPILE_C AND CUDA_TEST_COMPILE_CXX))
+    message(FATAL_ERROR "Selected compiler (or version) is not supported for CUDA")
+  endif()
 
   # by default we assume compute cabability 3.5 and 5.2. If you change this change it in
   # CUDA_NVCC_FLAGS and cuda_config.h below
@@ -320,13 +374,16 @@ if (tensorflow_ENABLE_GPU)
       ${CUDA_curand_LIBRARY} ${CUDA_cupti_LIBRARY} ${CUDA_cusolver_LIBRARY} ${cudnn_STATIC_LIBRARY} ${culibos_STATIC_LIBRARY} ${nccl_STATIC_LIBRARY})
   endif (WIN32)
 
+  # Remove "." from CUDA version variable.
+  string(REPLACE "." "" short_CUDA_VER ${tensorflow_CUDA_VERSION})
+
   # create cuda_config.h
   FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h
     "#ifndef CUDA_CUDA_CONFIG_H_\n"
     "#define CUDA_CUDA_CONFIG_H_\n"
     "#define TF_CUDA_CAPABILITIES CudaVersion(\"3.0\"),CudaVersion(\"3.5\"),CudaVersion(\"5.2\")\n"
-    "#define TF_CUDA_VERSION \"64_80\"\n"
-    "#define TF_CUDNN_VERSION \"64_6\"\n"
+    "#define TF_CUDA_VERSION \"64_${short_CUDA_VER}\"\n"
+    "#define TF_CUDNN_VERSION \"64_${tensorflow_CUDNN_VERSION}\"\n"
     "#define TF_CUDA_TOOLKIT_PATH \"${CUDA_TOOLKIT_ROOT_DIR}\"\n"
     "#endif  // CUDA_CUDA_CONFIG_H_\n"
   )
@@ -341,6 +398,8 @@ if (tensorflow_ENABLE_GPU)
       ${CUDA_TOOLKIT_TARGET_DIR}/include/cufft.h ${CUDA_TOOLKIT_TARGET_DIR}/include/curand.h
       ${CUDA_TOOLKIT_TARGET_DIR}/include/cuda_runtime_api.h
       ${CUDA_TOOLKIT_TARGET_DIR}/include/cusolverDn.h
+      ${CUDA_TOOLKIT_TARGET_DIR}/include/cuda_fp16.h
+      ${CUDA_TOOLKIT_TARGET_DIR}/include/device_functions.h
       DESTINATION ${tensorflow_source_dir}/third_party/gpus/cuda/include
     )
   else(WIN32)
@@ -364,15 +423,15 @@ if (tensorflow_ENABLE_GPU)
   if(WIN32)
     set(tensorflow_BUILD_INFO_FLAGS --build_config cuda --key_value
       msvcp_dll_name=msvcp140.dll
-      cudart_dll_name=cudart64_80.dll
-      cuda_version_number=8.0
+      cudart_dll_name=cudart64_${short_CUDA_VER}.dll
+      cuda_version_number=${tensorflow_CUDA_VERSION}
       nvcuda_dll_name=nvcuda.dll
-      cudnn_dll_name=cudnn64_6.dll
-      cudnn_version_number=6)
+      cudnn_dll_name=cudnn64_${tensorflow_CUDNN_VERSION}.dll
+      cudnn_version_number=${tensorflow_CUDNN_VERSION})
   else(WIN32)
     set(tensorflow_BUILD_INFO_FLAGS --build_config cuda --key_value
-      cuda_version_number=8.0
-      cudnn_version_number=6)
+	    cuda_version_number=${tensorflow_CUDA_VERSION}
+	    cudnn_version_number=${tensorflow_CUDNN_VERSION})
   endif(WIN32)
 else(tensorflow_ENABLE_GPU)
   set(tensorflow_BUILD_INFO_FLAGS --build_config cpu --key_value
@@ -387,10 +446,8 @@ endif()
 
 # Let's get to work!
 include(tf_core_framework.cmake)
-# NOTE: Disabled until issue #3996 is fixed.
-# include(tf_stream_executor.cmake)
 if (tensorflow_ENABLE_GPU)
-    include(tf_stream_executor.cmake)
+  include(tf_stream_executor.cmake)
 endif()
 
 include(tf_core_cpu.cmake)
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index 4ddfec5960d2b759bacb376202cd8dab6ef2b024..8f85a75ee466dbac524a1266dc2522109ca77cd5 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -19,23 +19,6 @@ for instructions on how to install a pre-built TensorFlow package on Windows.
 ### Current known limitations
 * It is not possible to load a custom Op library.
 * GCS file system is not supported.
-* The following Ops are not currently implemented:
- - Dequantize
- - QuantizeAndDequantize
- - QuantizedAvgPool
- - QuantizedBatchNomWithGlobalNormalization
- - QuantizedBiasAdd
- - QuantizedConcat
- - QuantizedConv2D
- - QuantizedMatmul
- - QuantizedMaxPoo
- - QuantizeDownAndShrinkRange
- - QuantizedRelu
- - QuantizedRelu6
- - QuantizedReshape
- - QuantizeV2
- - RequantizationRange
- - Requantize
 
 ## Building with CMake
 
@@ -47,7 +30,7 @@ bindings.
 
 * CMake version 3.5 or later.
 
-* [Git](http://git-scm.com)
+* [Git](https://git-scm.com)
 
 * [SWIG](http://www.swig.org/download.html)
 
@@ -65,7 +48,7 @@ bindings.
 
 * Microsoft Windows 10
   - Microsoft Visual Studio Enterprise 2015 with Visual C++ 2015
-  - [Anaconda 4.1.1 (Python 3.5 64-bit)](https://www.continuum.io/downloads)
+  - [Anaconda 4.1.1 (Python 3.5 64-bit)](https://www.anaconda.com/download/)
   - [Git for Windows version 2.9.2.windows.1](https://git-scm.com/download/win)
   - [swigwin-3.0.10](http://www.swig.org/download.html)
   - [NVidia CUDA Toolkit 8.0](https://developer.nvidia.com/cuda-downloads)
diff --git a/tensorflow/contrib/cmake/external/boringssl.cmake b/tensorflow/contrib/cmake/external/boringssl.cmake
index cca8444e2ae9952ea7c69a9392580ead715d363b..3c4bb01e24fd121c9d0fc3594cc25de37af0e8a1 100644
--- a/tensorflow/contrib/cmake/external/boringssl.cmake
+++ b/tensorflow/contrib/cmake/external/boringssl.cmake
@@ -37,13 +37,10 @@ ExternalProject_Add(boringssl
     GIT_TAG ${boringssl_TAG}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     # BUILD_IN_SOURCE 1
+    BUILD_BYPRODUCTS ${boringssl_STATIC_LIBRARIES}
     INSTALL_COMMAND ""
     CMAKE_CACHE_ARGS
-        if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-        	-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-        else()
-        	-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
-        endif()
+        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
         -DCMAKE_BUILD_TYPE:STRING=Release
         -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
 )
diff --git a/tensorflow/contrib/cmake/external/double_conversion.cmake b/tensorflow/contrib/cmake/external/double_conversion.cmake
deleted file mode 100644
index 527ccdc8d887cb4c2e7d2412c99a8bc682568472..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/cmake/external/double_conversion.cmake
+++ /dev/null
@@ -1,54 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-include (ExternalProject)
-
-set(double_conversion_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/double_conversion/src/double_conversion)
-set(double_conversion_URL https://github.com/google/double-conversion.git)
-set(double_conversion_TAG 5664746)
-set(double_conversion_BUILD ${double_conversion_INCLUDE_DIR})
-set(double_conversion_LIBRARIES ${double_conversion_BUILD}/double-conversion/libdouble-conversion.so)
-set(double_conversion_INCLUDES ${double_conversion_BUILD})
-
-if(WIN32)
-  set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/double-conversion/$(Configuration)/double-conversion.lib)
-else()
-  set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/double-conversion/libdouble-conversion.a)
-endif()
-
-set(double_conversion_HEADERS
-    "${double_conversion_INCLUDE_DIR}/double-conversion/bignum-dtoa.h"
-    "${double_conversion_INCLUDE_DIR}/double-conversion/cached-powers.h"
-    "${double_conversion_INCLUDE_DIR}/double-conversion/double-conversion.h"
-    "${double_conversion_INCLUDE_DIR}/double-conversion/fixed-dtoa.h"
-    "${double_conversion_INCLUDE_DIR}/double-conversion/strtod.h"
-    "${double_conversion_INCLUDE_DIR}/double-conversion/bignum.h"
-    "${double_conversion_INCLUDE_DIR}/double-conversion/diy-fp.h"
-    "${double_conversion_INCLUDE_DIR}/double-conversion/fast-dtoa.h"
-    "${double_conversion_INCLUDE_DIR}/double-conversion/ieee.h"
-    "${double_conversion_INCLUDE_DIR}/double-conversion/utils.h"
-)
-
-ExternalProject_Add(double_conversion
-    PREFIX double_conversion
-    GIT_REPOSITORY ${double_conversion_URL}
-    GIT_TAG ${double_conversion_TAG}
-    DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
-    BUILD_IN_SOURCE 1
-    INSTALL_COMMAND ""
-    CMAKE_CACHE_ARGS
-        -DCMAKE_BUILD_TYPE:STRING=Release
-        -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-)
diff --git a/tensorflow/contrib/cmake/external/farmhash.cmake b/tensorflow/contrib/cmake/external/farmhash.cmake
index 0cd0c1030c73d5218411f281d2b077af217e8275..d51569bc213f2bd354571a00910714e787120951 100644
--- a/tensorflow/contrib/cmake/external/farmhash.cmake
+++ b/tensorflow/contrib/cmake/external/farmhash.cmake
@@ -33,6 +33,7 @@ if(WIN32)
       URL_HASH ${farmhash_HASH}
       DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
       BUILD_IN_SOURCE 1
+      BUILD_BYPRODUCTS ${farmhash_STATIC_LIBRARIES}
       PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/farmhash/CMakeLists.txt ${farmhash_BUILD}
       INSTALL_DIR ${farmhash_INSTALL}
       CMAKE_CACHE_ARGS
diff --git a/tensorflow/contrib/cmake/external/fft2d.cmake b/tensorflow/contrib/cmake/external/fft2d.cmake
index d3af2a46761c0f7f0b5db134af8400fc93f2f095..a7bc50d5bcd4384d5c943d681fd7cd6fa1ffa796 100644
--- a/tensorflow/contrib/cmake/external/fft2d.cmake
+++ b/tensorflow/contrib/cmake/external/fft2d.cmake
@@ -29,6 +29,7 @@ if(WIN32)
       URL_HASH ${fft2d_HASH}
       DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
       BUILD_IN_SOURCE 1
+      BUILD_BYPRODUCTS ${fft2d_STATIC_LIBRARIES}
       PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/fft2d/CMakeLists.txt ${fft2d_BUILD}/src/fft2d/CMakeLists.txt
       INSTALL_DIR ${fft2d_INSTALL}
       CMAKE_CACHE_ARGS
diff --git a/tensorflow/contrib/cmake/external/gemmlowp.cmake b/tensorflow/contrib/cmake/external/gemmlowp.cmake
index 3b146657bfc9bdd54db14839195af45972e67aff..a235442dc5c0a07e249653381436eeae81575883 100644
--- a/tensorflow/contrib/cmake/external/gemmlowp.cmake
+++ b/tensorflow/contrib/cmake/external/gemmlowp.cmake
@@ -14,8 +14,8 @@
 # ==============================================================================
 include (ExternalProject)
 
-set(gemmlowp_URL https://mirror.bazel.build/github.com/google/gemmlowp/archive/010bb3e71a26ca1d0884a167081d092b43563996.zip)
-set(gemmlowp_HASH SHA256=dd2557072bde12141419cb8320a9c25e6ec41a8ae53c2ac78c076a347bb46d9d)
+set(gemmlowp_URL https://github.com/google/gemmlowp/archive/6a2a90822e8546fc2bfa7044de0faf1c1cb4862f.zip)
+set(gemmlowp_HASH SHA256=3447948d219f3270383766bbe08942888c0eb4e0ca6663c0e0548502ec5bb77d)
 set(gemmlowp_BUILD ${CMAKE_CURRENT_BINARY_DIR}/gemmlowp/src/gemmlowp)
 set(gemmlowp_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/gemmlowp/src/gemmlowp)
 
diff --git a/tensorflow/contrib/cmake/external/gif.cmake b/tensorflow/contrib/cmake/external/gif.cmake
index 3d53c51fffcec1602a3b5553cdf3b225e3b0ae46..e1f8d13f8ea47b83e4a1840afac7398ef226eb45 100644
--- a/tensorflow/contrib/cmake/external/gif.cmake
+++ b/tensorflow/contrib/cmake/external/gif.cmake
@@ -33,6 +33,7 @@ if(WIN32)
       PREFIX gif
       URL ${gif_URL}
       URL_HASH ${gif_HASH}
+      BUILD_BYPRODUCTS ${gif_STATIC_LIBRARIES}
       PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_SOURCE_DIR}/patches/gif/CMakeLists.txt ${gif_BUILD}
       INSTALL_DIR ${gif_INSTALL}
       DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
diff --git a/tensorflow/contrib/cmake/external/googletest.cmake b/tensorflow/contrib/cmake/external/googletest.cmake
index d09bb02890f25a0312e62c876c1729e57a059e82..7cc5ae6390934773635cf7a4dff77a3cbfb41ba1 100644
--- a/tensorflow/contrib/cmake/external/googletest.cmake
+++ b/tensorflow/contrib/cmake/external/googletest.cmake
@@ -20,8 +20,13 @@ set(googletest_BUILD ${CMAKE_CURRENT_BINARY_DIR}/googletest/)
 set(googletest_TAG ec44c6c1675c25b9827aacd08c02433cccde7780)
 
 if(WIN32)
-  set(googletest_STATIC_LIBRARIES
-      ${CMAKE_CURRENT_BINARY_DIR}/googletest/src/googletest/googletest/$(Configuration)/gtest.lib)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    set(googletest_STATIC_LIBRARIES
+        ${CMAKE_CURRENT_BINARY_DIR}/googletest/src/googletest/googletest/$(Configuration)/gtest.lib)
+  else()
+    set(googletest_STATIC_LIBRARIES
+        ${CMAKE_CURRENT_BINARY_DIR}/googletest/src/googletest/googletest/gtest.lib)
+  endif()
 else()
   set(googletest_STATIC_LIBRARIES
       ${CMAKE_CURRENT_BINARY_DIR}/googletest/src/googletest/googletest/${CMAKE_BUILD_TYPE}/gtest.a)
@@ -33,6 +38,7 @@ ExternalProject_Add(googletest
     GIT_TAG ${googletest_TAG}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     BUILD_IN_SOURCE 1
+    BUILD_BYPRODUCTS ${googletest_STATIC_LIBRARIES}
     #PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_SOURCE_DIR}/patches/grpc/CMakeLists.txt ${GRPC_BUILD}
     INSTALL_COMMAND ""
     CMAKE_CACHE_ARGS
diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index 41ea0b48a4600d7ca2dd2f4a61c14ec0cc5b4734..a9f43a3ecba4830533efcc13f8c4c1c61fe1ef78 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -17,13 +17,20 @@ include (ExternalProject)
 set(GRPC_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/include)
 set(GRPC_URL https://github.com/grpc/grpc.git)
 set(GRPC_BUILD ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc)
-set(GRPC_TAG 54e8f37e537794c2d814c1604c1282125f64f093)
+set(GRPC_TAG 730b778632e79cc3c96ad237f282d687ee325ce7)
 
 if(WIN32)
-  set(grpc_STATIC_LIBRARIES
-      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/Release/grpc++_unsecure.lib
-      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/Release/grpc_unsecure.lib
-      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/Release/gpr.lib)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    set(grpc_STATIC_LIBRARIES
+        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/Release/grpc++_unsecure.lib
+        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/Release/grpc_unsecure.lib
+        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/Release/gpr.lib)
+  else()
+    set(grpc_STATIC_LIBRARIES
+        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/grpc++_unsecure.lib
+        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/grpc_unsecure.lib
+        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/gpr.lib)
+  endif()
 else()
   set(grpc_STATIC_LIBRARIES
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc++_unsecure.a
@@ -40,6 +47,7 @@ ExternalProject_Add(grpc
     GIT_TAG ${GRPC_TAG}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     BUILD_IN_SOURCE 1
+    BUILD_BYPRODUCTS ${grpc_STATIC_LIBRARIES}
     BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release --target grpc++_unsecure
     COMMAND ${CMAKE_COMMAND} --build . --config Release --target grpc_cpp_plugin
     INSTALL_COMMAND ""
diff --git a/tensorflow/contrib/cmake/external/highwayhash.cmake b/tensorflow/contrib/cmake/external/highwayhash.cmake
index 2c23bef8a331de356c93dbf9d0e91d8bb13bd6c8..a6e8a38d8c2ee3deb5453c264e0c5eb23248301f 100644
--- a/tensorflow/contrib/cmake/external/highwayhash.cmake
+++ b/tensorflow/contrib/cmake/external/highwayhash.cmake
@@ -42,6 +42,7 @@ ExternalProject_Add(highwayhash
     GIT_TAG ${highwayhash_TAG}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     BUILD_IN_SOURCE 1
+    BUILD_BYPRODUCTS ${highwayhash_STATIC_LIBRARIES}
     PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/highwayhash/CMakeLists.txt ${highwayhash_BUILD}
     INSTALL_DIR ${highwayhash_INSTALL}
     CMAKE_CACHE_ARGS
diff --git a/tensorflow/contrib/cmake/external/jemalloc.cmake b/tensorflow/contrib/cmake/external/jemalloc.cmake
index 198ba13e64e4b6df57c4325a0104b1a6745d173a..afadcc007d66414be3306e91e7186a00b6e587ce 100644
--- a/tensorflow/contrib/cmake/external/jemalloc.cmake
+++ b/tensorflow/contrib/cmake/external/jemalloc.cmake
@@ -24,8 +24,11 @@ if (WIN32)
         ${jemalloc_INCLUDE_DIRS} 
         ${CMAKE_CURRENT_BINARY_DIR}/jemalloc/src/jemalloc/include/msvc_compat
     )
-    set(jemalloc_ADDITIONAL_CMAKE_OPTIONS -A x64)
-    set(jemalloc_STATIC_LIBRARIES ${jemalloc_BUILD}/Release/jemalloc.lib)
+    if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+        set(jemalloc_STATIC_LIBRARIES ${jemalloc_BUILD}/Release/jemalloc.lib)
+    else()
+        set(jemalloc_STATIC_LIBRARIES ${jemalloc_BUILD}/jemalloc.lib)
+    endif()
 else()
     set(jemalloc_STATIC_LIBRARIES ${jemalloc_BUILD}/Release/jemalloc.a)
 endif()
@@ -36,12 +39,12 @@ ExternalProject_Add(jemalloc
     URL_HASH ${jemalloc_HASH}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     BUILD_IN_SOURCE 1
-    CONFIGURE_COMMAND ${CMAKE_COMMAND}
+    BUILD_BYPRODUCTS ${jemalloc_STATIC_LIBRARIES}
+    BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release --target jemalloc
+    INSTALL_COMMAND ${CMAKE_COMMAND} -E echo "Skipping install step."
+    CMAKE_CACHE_ARGS
         -DCMAKE_BUILD_TYPE:STRING=Release
         -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
         -Dwith-jemalloc-prefix:STRING=jemalloc_
         -Dwithout-export:BOOL=ON
-        ${jemalloc_ADDITIONAL_CMAKE_OPTIONS}
-    BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release --target jemalloc
-    INSTALL_COMMAND ${CMAKE_COMMAND} -E echo "Skipping install step."
 )
diff --git a/tensorflow/contrib/cmake/external/jpeg.cmake b/tensorflow/contrib/cmake/external/jpeg.cmake
index d9a165e856c588880ebdf996666d70c9e7f53da8..c1c5842aa4454f1c95ec284392194a89d47ee8d5 100644
--- a/tensorflow/contrib/cmake/external/jpeg.cmake
+++ b/tensorflow/contrib/cmake/external/jpeg.cmake
@@ -46,6 +46,7 @@ if (WIN32)
         PREFIX jpeg
         URL ${jpeg_URL}
         URL_HASH ${jpeg_HASH}
+        BUILD_BYPRODUCTS ${jpeg_STATIC_LIBRARIES}
         PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/jpeg/CMakeLists.txt ${jpeg_BUILD}
         INSTALL_DIR ${jpeg_INSTALL}
         DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
diff --git a/tensorflow/contrib/cmake/external/jsoncpp.cmake b/tensorflow/contrib/cmake/external/jsoncpp.cmake
index d2ae4c76e8cd175cdc3ba41fdf4e4009f8237309..84c52e3652ff935c287d32c0c80fd407e1213f29 100644
--- a/tensorflow/contrib/cmake/external/jsoncpp.cmake
+++ b/tensorflow/contrib/cmake/external/jsoncpp.cmake
@@ -23,7 +23,11 @@ set(jsoncpp_LIBRARIES ${jsoncpp_BUILD}/obj/so/libjsoncpp.so)
 set(jsoncpp_INCLUDES ${jsoncpp_BUILD})
 
 if(WIN32)
-  set(jsoncpp_STATIC_LIBRARIES ${jsoncpp_BUILD}/$(Configuration)/jsoncpp.lib)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    set(jsoncpp_STATIC_LIBRARIES ${jsoncpp_BUILD}/$(Configuration)/jsoncpp.lib)
+  else()
+    set(jsoncpp_STATIC_LIBRARIES ${jsoncpp_BUILD}/jsoncpp.lib)
+  endif()
 else()
   set(jsoncpp_STATIC_LIBRARIES ${jsoncpp_BUILD}/libjsoncpp.a)
 endif()
@@ -40,13 +44,10 @@ ExternalProject_Add(jsoncpp
     GIT_TAG ${jsoncpp_TAG}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     BUILD_IN_SOURCE 1
+    BUILD_BYPRODUCTS ${jsoncpp_STATIC_LIBRARIES}
     INSTALL_COMMAND ""
     CMAKE_CACHE_ARGS
-  	  if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-  	      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-  	  else()
-   	    	-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
-   	 endif()
+        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
         -DCMAKE_BUILD_TYPE:STRING=Release
         -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
 )
diff --git a/tensorflow/contrib/cmake/external/lmdb.cmake b/tensorflow/contrib/cmake/external/lmdb.cmake
index e41384f023ca9fc4cba697917b491af5a9db92bc..ed5ab788acc5625b9c8020fce15f027d98433096 100644
--- a/tensorflow/contrib/cmake/external/lmdb.cmake
+++ b/tensorflow/contrib/cmake/external/lmdb.cmake
@@ -20,31 +20,28 @@ set(lmdb_HASH SHA256=108532fb94c6f227558d45be3f3347b52539f0f58290a7bb31ec06c462d
 set(lmdb_BUILD ${CMAKE_BINARY_DIR}/lmdb/src/lmdb)
 set(lmdb_INSTALL ${CMAKE_BINARY_DIR}/lmdb/install)
 
+if(WIN32)
+    set(lmdb_STATIC_LIBRARIES ${lmdb_INSTALL}/lib/lmdb.lib)
+else()
+    set(lmdb_STATIC_LIBRARIES ${lmdb_INSTALL}/lib/liblmdb.a)
+endif()
+
 ExternalProject_Add(lmdb
     PREFIX lmdb
     URL ${lmdb_URL}
     URL_HASH ${lmdb_HASH}
+    BUILD_BYPRODUCTS ${lmdb_STATIC_LIBRARIES}
     PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different
         ${CMAKE_CURRENT_SOURCE_DIR}/patches/lmdb/CMakeLists.txt ${lmdb_BUILD}
     INSTALL_DIR ${lmdb_INSTALL}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     CMAKE_CACHE_ARGS
-		if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-		else()
-			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
-		endif()
+        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
         -DCMAKE_BUILD_TYPE:STRING=Release
         -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
         -DCMAKE_INSTALL_PREFIX:STRING=${lmdb_INSTALL}
 )
 
-if(WIN32)
-    set(lmdb_STATIC_LIBRARIES ${lmdb_INSTALL}/lib/lmdb.lib)
-else()
-    set(lmdb_STATIC_LIBRARIES ${lmdb_INSTALL}/lib/liblmdb.a)
-endif()
-
 set(lmdb_HEADERS
     "${lmdb_INSTALL}/include/lmdb.h"
     "${lmdb_INSTALL}/include/midl.h"
diff --git a/tensorflow/contrib/cmake/external/nsync.cmake b/tensorflow/contrib/cmake/external/nsync.cmake
index 155c91cb97dbe5ef33c318efb5544a9fa22166c7..f3a37ff5088e3f9e54e38c0edb5777c27b26969f 100644
--- a/tensorflow/contrib/cmake/external/nsync.cmake
+++ b/tensorflow/contrib/cmake/external/nsync.cmake
@@ -16,7 +16,7 @@ include (ExternalProject)
 
 set(nsync_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/nsync/public)
 set(nsync_URL https://github.com/google/nsync)
-set(nsync_TAG 93815892dddafe9146a5f7e7042281d59d0f4323)
+set(nsync_TAG 8502189abfa44c249c01c2cad64e6ed660a9a668)
 set(nsync_BUILD ${CMAKE_CURRENT_BINARY_DIR}/nsync/src/nsync)
 set(nsync_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/nsync/install)
 
@@ -42,6 +42,7 @@ ExternalProject_Add(nsync
     GIT_TAG ${nsync_TAG}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     BUILD_IN_SOURCE 1
+    BUILD_BYPRODUCTS ${nsync_STATIC_LIBRARIES}
     PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/nsync/CMakeLists.txt ${nsync_BUILD}
     INSTALL_DIR ${nsync_INSTALL}
     CMAKE_CACHE_ARGS
diff --git a/tensorflow/contrib/cmake/external/png.cmake b/tensorflow/contrib/cmake/external/png.cmake
index aad6618f52f909096fd2388e867ef3a965d033cb..6cd66a65990e7a2b963b52b310061b551752cd4d 100644
--- a/tensorflow/contrib/cmake/external/png.cmake
+++ b/tensorflow/contrib/cmake/external/png.cmake
@@ -21,9 +21,19 @@ set(png_BUILD ${CMAKE_BINARY_DIR}/png/src/png)
 set(png_INSTALL ${CMAKE_BINARY_DIR}/png/install)
 
 if(WIN32)
-  set(png_STATIC_LIBRARIES 
-    debug ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_staticd.lib
-    optimized ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_static.lib)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    set(png_STATIC_LIBRARIES 
+      debug ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_staticd.lib
+      optimized ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_static.lib)
+  else()
+    if(CMAKE_BUILD_TYPE EQUAL Debug)
+      set(png_STATIC_LIBRARIES 
+        ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_staticd.lib)
+    else()
+      set(png_STATIC_LIBRARIES 
+        ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_static.lib)
+    endif()
+  endif()
 else()
   set(png_STATIC_LIBRARIES ${CMAKE_BINARY_DIR}/png/install/lib/libpng12.a)
 endif()
@@ -38,14 +48,11 @@ ExternalProject_Add(png
     DEPENDS zlib
     URL ${png_URL}
     URL_HASH ${png_HASH}
+    BUILD_BYPRODUCTS ${png_STATIC_LIBRARIES}
     INSTALL_DIR ${png_INSTALL}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     CMAKE_CACHE_ARGS
-		if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-		else()
-			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
-		endif()
+        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
         -DCMAKE_BUILD_TYPE:STRING=Release
         -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
         -DCMAKE_INSTALL_PREFIX:STRING=${png_INSTALL}
diff --git a/tensorflow/contrib/cmake/external/protobuf.cmake b/tensorflow/contrib/cmake/external/protobuf.cmake
index b53857a47bfbf797af02fe7f69474263119161cd..aba8a5244e17d717293deec6d9b6e8e725ef010e 100644
--- a/tensorflow/contrib/cmake/external/protobuf.cmake
+++ b/tensorflow/contrib/cmake/external/protobuf.cmake
@@ -16,14 +16,37 @@ include (ExternalProject)
 
 set(PROTOBUF_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/src)
 set(PROTOBUF_URL https://github.com/google/protobuf.git)
-set(PROTOBUF_TAG b04e5cba356212e4e8c66c61bbe0c3a20537c5b9)
+set(PROTOBUF_TAG 396336eb961b75f03b25824fe86cf6490fb75e3a)
 
 if(WIN32)
-  set(protobuf_STATIC_LIBRARIES 
-    debug ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/$(Configuration)/libprotobufd.lib
-    optimized ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/$(Configuration)/libprotobuf.lib)
-  set(PROTOBUF_PROTOC_EXECUTABLE ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/$(Configuration)/protoc.exe)
-  set(PROTOBUF_ADDITIONAL_CMAKE_OPTIONS	-Dprotobuf_MSVC_STATIC_RUNTIME:BOOL=OFF -A x64)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    set(protobuf_STATIC_LIBRARIES 
+      debug ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/$(Configuration)/libprotobufd.lib
+      optimized ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/$(Configuration)/libprotobuf.lib)
+    set(PROTOBUF_PROTOC_EXECUTABLE ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/$(Configuration)/protoc.exe)
+  else()
+    if(CMAKE_BUILD_TYPE EQUAL Debug)
+      set(protobuf_STATIC_LIBRARIES
+        ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/libprotobufd.lib)
+    else()
+      set(protobuf_STATIC_LIBRARIES
+        ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/libprotobuf.lib)
+    endif()
+    set(PROTOBUF_PROTOC_EXECUTABLE ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/protoc.exe)
+  endif()
+
+  # This section is to make sure CONFIGURE_COMMAND use the same generator settings
+  set(PROTOBUF_GENERATOR_PLATFORM)
+  if (CMAKE_GENERATOR_PLATFORM)
+    set(PROTOBUF_GENERATOR_PLATFORM -A ${CMAKE_GENERATOR_PLATFORM})
+  endif()
+  set(PROTOBUF_GENERATOR_TOOLSET)
+  if (CMAKE_GENERATOR_TOOLSET)
+  set(PROTOBUF_GENERATOR_TOOLSET -T ${CMAKE_GENERATOR_TOOLSET})
+  endif()
+  set(PROTOBUF_ADDITIONAL_CMAKE_OPTIONS	-Dprotobuf_MSVC_STATIC_RUNTIME:BOOL=OFF
+    -G${CMAKE_GENERATOR} ${PROTOBUF_GENERATOR_PLATFORM} ${PROTOBUF_GENERATOR_TOOLSET})
+  # End of section
 else()
   set(protobuf_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/libprotobuf.a)
   set(PROTOBUF_PROTOC_EXECUTABLE ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/protoc)
@@ -36,20 +59,23 @@ ExternalProject_Add(protobuf
     GIT_TAG ${PROTOBUF_TAG}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     BUILD_IN_SOURCE 1
+    BUILD_BYPRODUCTS ${PROTOBUF_PROTOC_EXECUTABLE} ${protobuf_STATIC_LIBRARIES}
     SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf
+    # SOURCE_SUBDIR cmake/ # Requires CMake 3.7, this will allow removal of CONFIGURE_COMMAND
+    # CONFIGURE_COMMAND resets some settings made in CMAKE_CACHE_ARGS and the generator used
     CONFIGURE_COMMAND ${CMAKE_COMMAND} cmake/
-        -Dprotobuf_BUILD_TESTS=OFF
-        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
+        -DCMAKE_BUILD_TYPE:STRING=Release
+        -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+        -Dprotobuf_BUILD_TESTS:BOOL=OFF
         -DZLIB_ROOT=${ZLIB_INSTALL}
         ${PROTOBUF_ADDITIONAL_CMAKE_OPTIONS}
     INSTALL_COMMAND ""
     CMAKE_CACHE_ARGS
-		if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-		else()
-			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
-		endif()
+        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
         -DCMAKE_BUILD_TYPE:STRING=Release
         -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+        -Dprotobuf_BUILD_TESTS:BOOL=OFF
+        -Dprotobuf_MSVC_STATIC_RUNTIME:BOOL=OFF
         -DZLIB_ROOT:STRING=${ZLIB_INSTALL}
 )
diff --git a/tensorflow/contrib/cmake/external/re2.cmake b/tensorflow/contrib/cmake/external/re2.cmake
index d10f5959f71dd350e6e2bcb81be8882b203fb231..c4bc0b1707bf9e86ea41234c8155fd6321c4c33b 100644
--- a/tensorflow/contrib/cmake/external/re2.cmake
+++ b/tensorflow/contrib/cmake/external/re2.cmake
@@ -21,7 +21,11 @@ set(re2_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/re2/install)
 set(re2_TAG e7efc48)
 
 if(WIN32)
-  set(re2_STATIC_LIBRARIES ${re2_BUILD}/$(Configuration)/re2.lib)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    set(re2_STATIC_LIBRARIES ${re2_BUILD}/$(Configuration)/re2.lib)
+  else()
+    set(re2_STATIC_LIBRARIES ${re2_BUILD}/re2.lib)
+  endif()
 else()
   set(re2_STATIC_LIBRARIES ${re2_BUILD}/libre2.a)
 endif()
@@ -36,13 +40,10 @@ ExternalProject_Add(re2
     GIT_TAG ${re2_TAG}
     INSTALL_DIR ${re2_INSTALL}
     BUILD_IN_SOURCE 1
+    BUILD_BYPRODUCTS ${re2_STATIC_LIBRARIES}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     CMAKE_CACHE_ARGS
-		if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-		else()
-			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
-		endif()
+        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
         -DCMAKE_BUILD_TYPE:STRING=Release
         -DCMAKE_INSTALL_PREFIX:STRING=${re2_INSTALL}
         -DRE2_BUILD_TESTING:BOOL=OFF
diff --git a/tensorflow/contrib/cmake/external/snappy.cmake b/tensorflow/contrib/cmake/external/snappy.cmake
index 926c271fd9ea6e2a30251aa408bd49859ae95070..f54197643b06781dad35b40f526f28d301047299 100644
--- a/tensorflow/contrib/cmake/external/snappy.cmake
+++ b/tensorflow/contrib/cmake/external/snappy.cmake
@@ -20,7 +20,11 @@ set(snappy_BUILD ${CMAKE_CURRENT_BINARY_DIR}/snappy/src/snappy)
 set(snappy_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/snappy/src/snappy)
 
 if(WIN32)
-    set(snappy_STATIC_LIBRARIES ${snappy_BUILD}/$(Configuration)/snappy.lib)
+    if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+        set(snappy_STATIC_LIBRARIES ${snappy_BUILD}/$(Configuration)/snappy.lib)
+    else()
+        set(snappy_STATIC_LIBRARIES ${snappy_BUILD}/snappy.lib)
+    endif()
 else()
     set(snappy_STATIC_LIBRARIES ${snappy_BUILD}/libsnappy.a)
 endif()
@@ -35,20 +39,17 @@ ExternalProject_Add(snappy
     GIT_TAG ${snappy_TAG}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     BUILD_IN_SOURCE 1
+    BUILD_BYPRODUCTS ${snappy_STATIC_LIBRARIES}
     INSTALL_COMMAND ""
     LOG_DOWNLOAD ON
     LOG_CONFIGURE ON
     LOG_BUILD ON
     CMAKE_CACHE_ARGS
-		if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-		else()
-			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
-		endif()
+        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
         -DCMAKE_BUILD_TYPE:STRING=Release
         -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
         -DSNAPPY_BUILD_TESTS:BOOL=OFF
 )
 
 # actually enables snappy in the source code
-add_definitions(-DTF_USE_SNAPPY)
\ No newline at end of file
+add_definitions(-DTF_USE_SNAPPY)
diff --git a/tensorflow/contrib/cmake/external/sqlite.cmake b/tensorflow/contrib/cmake/external/sqlite.cmake
index 785039a46983747557607562675349c150e064ad..57c4ae76517e4d7247093edd5e5bd95a83258d87 100644
--- a/tensorflow/contrib/cmake/external/sqlite.cmake
+++ b/tensorflow/contrib/cmake/external/sqlite.cmake
@@ -28,6 +28,7 @@ endif()
 
 set(sqlite_HEADERS
     "${sqlite_BUILD}/sqlite3.h"
+    "${sqlite_BUILD}/sqlite3ext.h"
 )
 
 if (WIN32)
@@ -35,6 +36,7 @@ if (WIN32)
         PREFIX sqlite
         URL ${sqlite_URL}
         URL_HASH ${sqlite_HASH}
+        BUILD_BYPRODUCTS ${sqlite_STATIC_LIBRARIES}
         PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/sqlite/CMakeLists.txt ${sqlite_BUILD}
         INSTALL_DIR ${sqlite_INSTALL}
         DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
@@ -53,11 +55,7 @@ else()
         INSTALL_DIR ${sqlite_INSTALL}
         DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
         CMAKE_CACHE_ARGS
-			if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-				-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-			else()
-				-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
-			endif()
+            -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
             -DCMAKE_BUILD_TYPE:STRING=Release
             -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
             -DCMAKE_INSTALL_PREFIX:STRING=${sqlite_INSTALL}
diff --git a/tensorflow/contrib/cmake/external/zlib.cmake b/tensorflow/contrib/cmake/external/zlib.cmake
index f10f84336e8b1c0a2c7de7ea1f8b8af7c21f8b51..116d42309394b92407cef79c9d3a975f494bc3ff 100644
--- a/tensorflow/contrib/cmake/external/zlib.cmake
+++ b/tensorflow/contrib/cmake/external/zlib.cmake
@@ -12,54 +12,75 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-include (ExternalProject)
+if (systemlib_ZLIB)
+  find_package(PkgConfig)
+  pkg_search_module(ZLIB REQUIRED zlib)
+  set(zlib_INCLUDE_DIR ${ZLIB_INCLUDE_DIRS})
+  set(ADD_LINK_DIRECTORY ${ADD_LINK_DIRECTORY} ${ZLIB_LIBRARY_DIRS})
+  set(ADD_CFLAGS ${ADD_CFLAGS} ${ZLIB_CFLAGS_OTHER})
 
-set(zlib_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/zlib_archive)
-set(ZLIB_URL https://github.com/madler/zlib)
-set(ZLIB_BUILD ${CMAKE_CURRENT_BINARY_DIR}/zlib/src/zlib)
-set(ZLIB_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/zlib/install)
-set(ZLIB_TAG 50893291621658f355bc5b4d450a8d06a563053d)
+  # To meet DEPENDS zlib from other projects.
+  # If we hit this line, zlib is already built and installed to the system.
+  add_custom_target(zlib)
+  add_custom_target(zlib_copy_headers_to_destination)
 
-if(WIN32)
-  set(zlib_STATIC_LIBRARIES
-      debug ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstaticd.lib
-      optimized ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
-else()
-  set(zlib_STATIC_LIBRARIES
-      ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/libz.a)
-endif()
+else (systemlib_ZLIB)
+  include (ExternalProject)
 
-set(ZLIB_HEADERS
-    "${ZLIB_INSTALL}/include/zconf.h"
-    "${ZLIB_INSTALL}/include/zlib.h"
-)
+  set(zlib_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/zlib_archive)
+  set(ZLIB_URL https://github.com/madler/zlib)
+  set(ZLIB_BUILD ${CMAKE_CURRENT_BINARY_DIR}/zlib/src/zlib)
+  set(ZLIB_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/zlib/install)
+  set(ZLIB_TAG 50893291621658f355bc5b4d450a8d06a563053d)
 
-ExternalProject_Add(zlib
-    PREFIX zlib
-    GIT_REPOSITORY ${ZLIB_URL}
-    GIT_TAG ${ZLIB_TAG}
-    INSTALL_DIR ${ZLIB_INSTALL}
-    BUILD_IN_SOURCE 1
-    DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
-    CMAKE_CACHE_ARGS
-		if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
-			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-		else()
-			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
-		endif()
-        -DCMAKE_BUILD_TYPE:STRING=Release
-        -DCMAKE_INSTALL_PREFIX:STRING=${ZLIB_INSTALL}
-)
+  if(WIN32)
+    if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+      set(zlib_STATIC_LIBRARIES
+          debug ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstaticd.lib
+          optimized ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
+    else()
+      if(CMAKE_BUILD_TYPE EQUAL Debug)
+        set(zlib_STATIC_LIBRARIES
+            ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstaticd.lib)
+      else()
+        set(zlib_STATIC_LIBRARIES
+            ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
+      endif()
+    endif()
+  else()
+    set(zlib_STATIC_LIBRARIES
+        ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/libz.a)
+  endif()
 
-# put zlib includes in the directory where they are expected
-add_custom_target(zlib_create_destination_dir
-    COMMAND ${CMAKE_COMMAND} -E make_directory ${zlib_INCLUDE_DIR}
-    DEPENDS zlib)
+  set(ZLIB_HEADERS
+      "${ZLIB_INSTALL}/include/zconf.h"
+      "${ZLIB_INSTALL}/include/zlib.h"
+  )
 
-add_custom_target(zlib_copy_headers_to_destination
-    DEPENDS zlib_create_destination_dir)
+  ExternalProject_Add(zlib
+      PREFIX zlib
+      GIT_REPOSITORY ${ZLIB_URL}
+      GIT_TAG ${ZLIB_TAG}
+      INSTALL_DIR ${ZLIB_INSTALL}
+      BUILD_IN_SOURCE 1
+      BUILD_BYPRODUCTS ${zlib_STATIC_LIBRARIES}
+      DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+      CMAKE_CACHE_ARGS
+          -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
+          -DCMAKE_BUILD_TYPE:STRING=Release
+          -DCMAKE_INSTALL_PREFIX:STRING=${ZLIB_INSTALL}
+  )
 
-foreach(header_file ${ZLIB_HEADERS})
-    add_custom_command(TARGET zlib_copy_headers_to_destination PRE_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${header_file} ${zlib_INCLUDE_DIR})
-endforeach()
+  # put zlib includes in the directory where they are expected
+  add_custom_target(zlib_create_destination_dir
+      COMMAND ${CMAKE_COMMAND} -E make_directory ${zlib_INCLUDE_DIR}
+      DEPENDS zlib)
+
+  add_custom_target(zlib_copy_headers_to_destination
+      DEPENDS zlib_create_destination_dir)
+
+  foreach(header_file ${ZLIB_HEADERS})
+      add_custom_command(TARGET zlib_copy_headers_to_destination PRE_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${header_file} ${zlib_INCLUDE_DIR})
+  endforeach()
+endif (systemlib_ZLIB)
diff --git a/tensorflow/tools/ci_build/install/install_cmake_for_clang.sh b/tensorflow/contrib/cmake/make.sh
similarity index 83%
rename from tensorflow/tools/ci_build/install/install_cmake_for_clang.sh
rename to tensorflow/contrib/cmake/make.sh
index 3e626a69ab5e6b7f8d1b4997b459301606501a8e..eed3c34aba1f0326ec741169a187eb2982f253a3 100755
--- a/tensorflow/tools/ci_build/install/install_cmake_for_clang.sh
+++ b/tensorflow/contrib/cmake/make.sh
@@ -14,6 +14,13 @@
 # limitations under the License.
 # ==============================================================================
 
-CMAKE_URL="https://cmake.org/files/v3.7/cmake-3.7.2-Linux-x86_64.tar.gz"
+(
+cd "$(dirname "$0")"
+mkdir -p _build
 
-wget -O - "${CMAKE_URL}" | tar xzf - -C /usr/local --strip-components=1
+(
+cd _build
+rm -rf -- *
+cmake ..
+)
+)
diff --git a/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt b/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt
index 594c2492d4fd68b50c8493321a2c4dcc2d41917e..aaae18a313dd082b428654091c9411600c981ec9 100644
--- a/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt
@@ -158,12 +158,21 @@ if (NOT "${NSYNC_LANGUAGE}X" STREQUAL "c++11X")
   elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "NetBSDX")
     include_directories ("${PROJECT_SOURCE_DIR}/platform/netbsd")
     set (NSYNC_POSIX ON)
+    set (NSYNC_OS_EXTRA_SRC
+      "platform/posix/src/nsync_semaphore_mutex.c"
+    )
   elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "FreeBSDX")
     include_directories ("${PROJECT_SOURCE_DIR}/platform/freebsd")
     set (NSYNC_POSIX ON)
+    set (NSYNC_OS_EXTRA_SRC
+      "platform/posix/src/nsync_semaphore_mutex.c"
+    )
   elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "OpenBSDX")
     include_directories ("${PROJECT_SOURCE_DIR}/platform/openbsd")
     set (NSYNC_POSIX ON)
+    set (NSYNC_OS_EXTRA_SRC
+      "platform/posix/src/nsync_semaphore_mutex.c"
+    )
   endif ()
 endif ()
 
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bfe53c01b3b5fb9db8a5d8fa280d1d7f98974882
--- /dev/null
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -0,0 +1,458 @@
+# python_sanity_test.py will complain about invalid or missing entries
+# problematic entries can be commented for temporary whitelisting
+tensorflow
+tensorflow/core
+tensorflow/core/example
+tensorflow/core/framework
+tensorflow/core/lib
+tensorflow/core/lib/core
+tensorflow/core/profiler
+tensorflow/core/protobuf
+tensorflow/core/util
+tensorflow/examples
+tensorflow/examples/tutorials
+tensorflow/examples/tutorials/mnist
+tensorflow/python
+tensorflow/python/client
+tensorflow/python/data
+tensorflow/python/data/ops
+tensorflow/python/data/util
+tensorflow/python/debug
+tensorflow/python/debug/cli
+tensorflow/python/debug/examples
+tensorflow/python/debug/lib
+tensorflow/python/debug/wrappers
+tensorflow/python/eager
+tensorflow/python/estimator
+tensorflow/python/estimator/canned
+tensorflow/python/estimator/export
+tensorflow/python/estimator/inputs
+tensorflow/python/estimator/inputs/queues
+tensorflow/python/feature_column
+tensorflow/python/framework
+tensorflow/python/grappler
+tensorflow/python/keras
+tensorflow/python/keras/activations
+tensorflow/python/keras/applications
+tensorflow/python/keras/applications/densenet
+tensorflow/python/keras/applications/inception_resnet_v2
+tensorflow/python/keras/applications/inception_v3
+tensorflow/python/keras/applications/mobilenet
+tensorflow/python/keras/applications/nasnet
+tensorflow/python/keras/applications/resnet50
+tensorflow/python/keras/applications/vgg16
+tensorflow/python/keras/applications/vgg19
+tensorflow/python/keras/applications/xception
+tensorflow/python/keras/backend
+tensorflow/python/keras/callbacks
+tensorflow/python/keras/constraints
+tensorflow/python/keras/datasets
+tensorflow/python/keras/datasets/boston_housing
+tensorflow/python/keras/datasets/cifar10
+tensorflow/python/keras/datasets/cifar100
+tensorflow/python/keras/datasets/fashion_mnist
+tensorflow/python/keras/datasets/imdb
+tensorflow/python/keras/datasets/mnist
+tensorflow/python/keras/datasets/reuters
+tensorflow/python/keras/estimator
+tensorflow/python/keras/initializers
+tensorflow/python/keras/layers
+tensorflow/python/keras/losses
+tensorflow/python/keras/metrics
+tensorflow/python/keras/models
+tensorflow/python/keras/optimizers
+tensorflow/python/keras/preprocessing
+tensorflow/python/keras/preprocessing/image
+tensorflow/python/keras/preprocessing/sequence
+tensorflow/python/keras/preprocessing/text
+tensorflow/python/keras/regularizers
+tensorflow/python/keras/utils
+tensorflow/python/keras/wrappers
+tensorflow/python/keras/wrappers/scikit_learn
+tensorflow/python/keras/_impl
+tensorflow/python/keras/_impl/keras
+tensorflow/python/keras/_impl/keras/applications
+tensorflow/python/keras/_impl/keras/datasets
+tensorflow/python/keras/_impl/keras/engine
+tensorflow/python/keras/_impl/keras/layers
+tensorflow/python/keras/_impl/keras/preprocessing
+tensorflow/python/keras/_impl/keras/utils
+tensorflow/python/keras/_impl/keras/wrappers
+tensorflow/python/kernel_tests
+tensorflow/python/kernel_tests/distributions
+tensorflow/python/kernel_tests/linalg
+tensorflow/python/kernel_tests/random
+tensorflow/python/layers
+tensorflow/python/lib
+tensorflow/python/lib/core
+tensorflow/python/lib/io
+tensorflow/python/ops
+tensorflow/python/ops/distributions
+tensorflow/python/ops/linalg
+tensorflow/python/ops/losses
+tensorflow/python/platform
+tensorflow/python/profiler
+tensorflow/python/profiler/internal
+tensorflow/python/saved_model
+tensorflow/python/summary
+tensorflow/python/summary/writer
+tensorflow/python/tools
+tensorflow/python/training
+tensorflow/python/user_ops
+tensorflow/python/util
+tensorflow/python/util/protobuf
+tensorflow/tools
+tensorflow/tools/graph_transforms
+tensorflow/contrib
+tensorflow/contrib/all_reduce
+tensorflow/contrib/all_reduce/python
+tensorflow/contrib/android
+tensorflow/contrib/android/java
+tensorflow/contrib/android/java/org
+tensorflow/contrib/android/java/org/tensorflow
+tensorflow/contrib/android/java/org/tensorflow/contrib
+tensorflow/contrib/android/java/org/tensorflow/contrib/android
+tensorflow/contrib/android/jni
+tensorflow/contrib/batching
+tensorflow/contrib/batching/python
+tensorflow/contrib/batching/python/ops
+tensorflow/contrib/bayesflow
+tensorflow/contrib/bayesflow/python
+tensorflow/contrib/bayesflow/python/ops
+tensorflow/contrib/boosted_trees
+tensorflow/contrib/boosted_trees/estimator_batch
+tensorflow/contrib/boosted_trees/kernels
+tensorflow/contrib/boosted_trees/ops
+tensorflow/contrib/boosted_trees/proto
+tensorflow/contrib/boosted_trees/python
+tensorflow/contrib/boosted_trees/python/ops
+tensorflow/contrib/cloud
+tensorflow/contrib/cloud/kernels
+tensorflow/contrib/cloud/ops
+tensorflow/contrib/cloud/python
+tensorflow/contrib/cloud/python/ops
+tensorflow/contrib/cluster_resolver
+tensorflow/contrib/cluster_resolver/python
+tensorflow/contrib/cluster_resolver/python/training
+tensorflow/contrib/coder
+tensorflow/contrib/coder/kernels
+tensorflow/contrib/coder/ops
+tensorflow/contrib/coder/python
+tensorflow/contrib/coder/python/ops
+tensorflow/contrib/compiler
+tensorflow/contrib/copy_graph
+tensorflow/contrib/copy_graph/python
+tensorflow/contrib/copy_graph/python/util
+tensorflow/contrib/crf
+tensorflow/contrib/crf/python
+tensorflow/contrib/crf/python/ops
+tensorflow/contrib/cudnn_rnn
+tensorflow/contrib/cudnn_rnn/kernels
+tensorflow/contrib/cudnn_rnn/ops
+tensorflow/contrib/cudnn_rnn/python
+tensorflow/contrib/cudnn_rnn/python/layers
+tensorflow/contrib/cudnn_rnn/python/ops
+tensorflow/contrib/data
+tensorflow/contrib/data/kernels
+tensorflow/contrib/data/python
+tensorflow/contrib/data/python/kernel_tests
+tensorflow/contrib/data/python/ops
+tensorflow/contrib/decision_trees
+tensorflow/contrib/decision_trees/proto
+tensorflow/contrib/deprecated
+tensorflow/contrib/distributions
+tensorflow/contrib/distributions/python
+tensorflow/contrib/distributions/python/ops
+tensorflow/contrib/distributions/python/ops/bijectors
+tensorflow/contrib/eager
+tensorflow/contrib/eager/python
+tensorflow/contrib/estimator
+tensorflow/contrib/estimator/python
+tensorflow/contrib/estimator/python/estimator
+tensorflow/contrib/factorization
+tensorflow/contrib/factorization/examples
+tensorflow/contrib/factorization/kernels
+tensorflow/contrib/factorization/ops
+tensorflow/contrib/factorization/python
+tensorflow/contrib/factorization/python/ops
+tensorflow/contrib/feature_column
+tensorflow/contrib/feature_column/python
+tensorflow/contrib/feature_column/python/feature_column
+tensorflow/contrib/ffmpeg
+tensorflow/contrib/ffmpeg/default
+tensorflow/contrib/framework
+tensorflow/contrib/framework/kernels
+tensorflow/contrib/framework/ops
+tensorflow/contrib/framework/python
+tensorflow/contrib/framework/python/framework
+tensorflow/contrib/framework/python/ops
+tensorflow/contrib/fused_conv
+tensorflow/contrib/fused_conv/kernels
+tensorflow/contrib/fused_conv/python
+tensorflow/contrib/fused_conv/python/ops
+tensorflow/contrib/gan
+tensorflow/contrib/gan/python
+tensorflow/contrib/gan/python/estimator
+tensorflow/contrib/gan/python/estimator/python
+tensorflow/contrib/gan/python/eval
+tensorflow/contrib/gan/python/eval/python
+tensorflow/contrib/gan/python/features
+tensorflow/contrib/gan/python/features/python
+tensorflow/contrib/gan/python/losses
+tensorflow/contrib/gan/python/losses/python
+tensorflow/contrib/graph_editor
+tensorflow/contrib/graph_editor/examples
+tensorflow/contrib/grid_rnn
+tensorflow/contrib/grid_rnn/python
+tensorflow/contrib/grid_rnn/python/ops
+tensorflow/contrib/hooks
+tensorflow/contrib/hooks/python
+tensorflow/contrib/image
+tensorflow/contrib/image/kernels
+tensorflow/contrib/image/ops
+tensorflow/contrib/image/python
+tensorflow/contrib/image/python/ops
+tensorflow/contrib/input_pipeline
+tensorflow/contrib/input_pipeline/kernels
+tensorflow/contrib/input_pipeline/ops
+tensorflow/contrib/input_pipeline/python
+tensorflow/contrib/input_pipeline/python/ops
+tensorflow/contrib/integrate
+tensorflow/contrib/integrate/python
+tensorflow/contrib/integrate/python/ops
+tensorflow/contrib/kafka/python
+tensorflow/contrib/kafka/python/ops
+tensorflow/contrib/keras
+tensorflow/contrib/keras/api
+tensorflow/contrib/keras/api/keras
+tensorflow/contrib/keras/api/keras/activations
+tensorflow/contrib/keras/api/keras/applications
+tensorflow/contrib/keras/api/keras/applications/inception_v3
+tensorflow/contrib/keras/api/keras/applications/mobilenet
+tensorflow/contrib/keras/api/keras/applications/resnet50
+tensorflow/contrib/keras/api/keras/applications/vgg16
+tensorflow/contrib/keras/api/keras/applications/vgg19
+tensorflow/contrib/keras/api/keras/applications/xception
+tensorflow/contrib/keras/api/keras/backend
+tensorflow/contrib/keras/api/keras/callbacks
+tensorflow/contrib/keras/api/keras/constraints
+tensorflow/contrib/keras/api/keras/datasets
+tensorflow/contrib/keras/api/keras/datasets/boston_housing
+tensorflow/contrib/keras/api/keras/datasets/cifar10
+tensorflow/contrib/keras/api/keras/datasets/cifar100
+tensorflow/contrib/keras/api/keras/datasets/imdb
+tensorflow/contrib/keras/api/keras/datasets/mnist
+tensorflow/contrib/keras/api/keras/datasets/reuters
+tensorflow/contrib/keras/api/keras/initializers
+tensorflow/contrib/keras/api/keras/layers
+tensorflow/contrib/keras/api/keras/losses
+tensorflow/contrib/keras/api/keras/metrics
+tensorflow/contrib/keras/api/keras/models
+tensorflow/contrib/keras/api/keras/optimizers
+tensorflow/contrib/keras/api/keras/preprocessing
+tensorflow/contrib/keras/api/keras/preprocessing/image
+tensorflow/contrib/keras/api/keras/preprocessing/sequence
+tensorflow/contrib/keras/api/keras/preprocessing/text
+tensorflow/contrib/keras/api/keras/regularizers
+tensorflow/contrib/keras/api/keras/utils
+tensorflow/contrib/keras/api/keras/wrappers
+tensorflow/contrib/keras/api/keras/wrappers/scikit_learn
+tensorflow/contrib/kernel_methods
+tensorflow/contrib/kernel_methods/python
+tensorflow/contrib/kernel_methods/python/mappers
+tensorflow/contrib/kfac
+tensorflow/contrib/kfac/examples
+tensorflow/contrib/kfac/python
+tensorflow/contrib/kfac/python/ops
+tensorflow/contrib/labeled_tensor
+tensorflow/contrib/labeled_tensor/python
+tensorflow/contrib/labeled_tensor/python/ops
+tensorflow/contrib/layers
+tensorflow/contrib/layers/kernels
+tensorflow/contrib/layers/ops
+tensorflow/contrib/layers/python
+tensorflow/contrib/layers/python/layers
+tensorflow/contrib/layers/python/ops
+tensorflow/contrib/learn
+tensorflow/contrib/learn/python
+tensorflow/contrib/learn/python/learn
+tensorflow/contrib/learn/python/learn/datasets
+tensorflow/contrib/learn/python/learn/datasets/data
+tensorflow/contrib/learn/python/learn/estimators
+tensorflow/contrib/learn/python/learn/learn_io
+tensorflow/contrib/learn/python/learn/ops
+tensorflow/contrib/learn/python/learn/preprocessing
+tensorflow/contrib/learn/python/learn/utils
+tensorflow/contrib/legacy_seq2seq
+tensorflow/contrib/legacy_seq2seq/python
+tensorflow/contrib/legacy_seq2seq/python/ops
+tensorflow/contrib/libsvm
+tensorflow/contrib/libsvm/python
+tensorflow/contrib/libsvm/python/kernel_tests
+tensorflow/contrib/libsvm/python/ops
+tensorflow/contrib/linalg
+tensorflow/contrib/linalg/python
+tensorflow/contrib/linalg/python/ops
+tensorflow/contrib/linear_optimizer
+tensorflow/contrib/linear_optimizer/kernels
+tensorflow/contrib/linear_optimizer/kernels/g3doc
+tensorflow/contrib/linear_optimizer/python
+tensorflow/contrib/linear_optimizer/python/ops
+# TODO(drpngx): Fix failing imports
+# tensorflow/contrib/lite
+# tensorflow/contrib/lite/python
+# tensorflow/contrib/lite/toco
+# tensorflow/contrib/lite/toco/python
+tensorflow/contrib/lookup
+tensorflow/contrib/losses
+tensorflow/contrib/losses/python
+tensorflow/contrib/losses/python/losses
+tensorflow/contrib/losses/python/metric_learning
+tensorflow/contrib/makefile
+tensorflow/contrib/memory_stats
+tensorflow/contrib/memory_stats/kernels
+tensorflow/contrib/memory_stats/ops
+tensorflow/contrib/memory_stats/python
+tensorflow/contrib/memory_stats/python/ops
+tensorflow/contrib/meta_graph_transform
+tensorflow/contrib/metrics
+tensorflow/contrib/metrics/python
+tensorflow/contrib/metrics/python/metrics
+tensorflow/contrib/metrics/python/ops
+tensorflow/contrib/mpi_collectives/python
+tensorflow/contrib/mpi_collectives/python/ops
+tensorflow/contrib/model_pruning
+tensorflow/contrib/model_pruning/examples
+tensorflow/contrib/model_pruning/examples/cifar10
+tensorflow/contrib/model_pruning/python
+tensorflow/contrib/model_pruning/python/layers
+tensorflow/contrib/nccl
+tensorflow/contrib/nccl/kernels
+tensorflow/contrib/nccl/ops
+tensorflow/contrib/nccl/python
+tensorflow/contrib/nccl/python/ops
+tensorflow/contrib/nearest_neighbor/kernels
+tensorflow/contrib/nearest_neighbor/ops
+tensorflow/contrib/nearest_neighbor/python
+tensorflow/contrib/nearest_neighbor/python/ops
+tensorflow/contrib/nn
+tensorflow/contrib/nn/python
+tensorflow/contrib/nn/python/ops
+tensorflow/contrib/opt
+tensorflow/contrib/opt/python
+tensorflow/contrib/opt/python/training
+tensorflow/contrib/pi_examples
+tensorflow/contrib/pi_examples/camera
+tensorflow/contrib/pi_examples/label_image
+tensorflow/contrib/pi_examples/label_image/data
+tensorflow/contrib/periodic_resample
+tensorflow/contrib/periodic_resample/python
+tensorflow/contrib/periodic_resample/python/ops
+tensorflow/contrib/predictor
+tensorflow/contrib/quantization
+tensorflow/contrib/quantization/python
+tensorflow/contrib/quantize
+tensorflow/contrib/quantize/python
+tensorflow/contrib/receptive_field
+tensorflow/contrib/receptive_field/python
+tensorflow/contrib/receptive_field/python/util
+tensorflow/contrib/receptive_field/python/util/examples
+tensorflow/contrib/reduce_slice_ops
+tensorflow/contrib/reduce_slice_ops/kernels
+tensorflow/contrib/reduce_slice_ops/ops
+tensorflow/contrib/reduce_slice_ops/python
+tensorflow/contrib/reduce_slice_ops/python/ops
+tensorflow/contrib/remote_fused_graph
+tensorflow/contrib/remote_fused_graph/pylib
+tensorflow/contrib/remote_fused_graph/pylib/python
+tensorflow/contrib/remote_fused_graph/pylib/python/ops
+tensorflow/contrib/resampler
+tensorflow/contrib/resampler/kernels
+tensorflow/contrib/resampler/ops
+tensorflow/contrib/resampler/python
+tensorflow/contrib/resampler/python/ops
+tensorflow/contrib/rnn
+tensorflow/contrib/rnn/kernels
+tensorflow/contrib/rnn/ops
+tensorflow/contrib/rnn/python
+tensorflow/contrib/rnn/python/kernel_tests
+tensorflow/contrib/rnn/python/ops
+tensorflow/contrib/saved_model
+tensorflow/contrib/saved_model/python
+tensorflow/contrib/saved_model/python/saved_model
+tensorflow/contrib/seq2seq
+tensorflow/contrib/seq2seq/kernels
+tensorflow/contrib/seq2seq/ops
+tensorflow/contrib/seq2seq/python
+tensorflow/contrib/seq2seq/python/ops
+tensorflow/contrib/session_bundle
+tensorflow/contrib/session_bundle/example
+tensorflow/contrib/signal
+tensorflow/contrib/signal/python
+tensorflow/contrib/signal/python/ops
+tensorflow/contrib/slim
+tensorflow/contrib/slim/python
+tensorflow/contrib/slim/python/slim
+tensorflow/contrib/slim/python/slim/data
+tensorflow/contrib/slim/python/slim/nets
+tensorflow/contrib/solvers
+tensorflow/contrib/solvers/python
+tensorflow/contrib/solvers/python/ops
+tensorflow/contrib/sparsemax
+tensorflow/contrib/sparsemax/python
+tensorflow/contrib/sparsemax/python/ops
+tensorflow/contrib/specs
+tensorflow/contrib/specs/python
+tensorflow/contrib/staging
+tensorflow/contrib/stat_summarizer
+tensorflow/contrib/stat_summarizer/python
+tensorflow/contrib/stateless
+tensorflow/contrib/stateless/python
+tensorflow/contrib/summary
+tensorflow/contrib/tensorboard
+tensorflow/contrib/tensorboard/plugins
+tensorflow/contrib/tensorboard/plugins/projector
+tensorflow/contrib/tensorboard/plugins/trace
+# TODO(sami): Add cmake implementations.
+# tensorflow/contrib/tensorrt/python
+# tensorflow/contrib/tensorrt/python/ops
+tensorflow/contrib/tensor_forest
+tensorflow/contrib/tensor_forest/client
+tensorflow/contrib/tensor_forest/hybrid
+tensorflow/contrib/tensor_forest/hybrid/core
+tensorflow/contrib/tensor_forest/hybrid/core/ops
+tensorflow/contrib/tensor_forest/hybrid/python
+tensorflow/contrib/tensor_forest/hybrid/python/layers
+tensorflow/contrib/tensor_forest/hybrid/python/models
+tensorflow/contrib/tensor_forest/hybrid/python/ops
+tensorflow/contrib/tensor_forest/kernels
+tensorflow/contrib/tensor_forest/proto
+tensorflow/contrib/tensor_forest/python
+tensorflow/contrib/tensor_forest/python/ops
+tensorflow/contrib/testing
+tensorflow/contrib/testing/python
+tensorflow/contrib/testing/python/framework
+tensorflow/contrib/text
+tensorflow/contrib/text/kernels
+tensorflow/contrib/text/ops
+tensorflow/contrib/text/python
+tensorflow/contrib/text/python/ops
+tensorflow/contrib/tfprof
+tensorflow/contrib/timeseries
+tensorflow/contrib/timeseries/examples
+tensorflow/contrib/timeseries/examples/data
+tensorflow/contrib/timeseries/python
+tensorflow/contrib/timeseries/python/timeseries
+tensorflow/contrib/timeseries/python/timeseries/state_space_models
+tensorflow/contrib/tpu
+tensorflow/contrib/tpu/ops
+tensorflow/contrib/tpu/profiler
+tensorflow/contrib/tpu/proto
+tensorflow/contrib/tpu/python
+tensorflow/contrib/tpu/python/ops
+tensorflow/contrib/tpu/python/profiler
+tensorflow/contrib/tpu/python/tpu
+tensorflow/contrib/training
+tensorflow/contrib/training/python
+tensorflow/contrib/training/python/training
+tensorflow/contrib/util
diff --git a/tensorflow/contrib/cmake/python_protos.txt b/tensorflow/contrib/cmake/python_protos.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8a9c406d8b118c10ddcaafb0e4fc242aa79cdb57
--- /dev/null
+++ b/tensorflow/contrib/cmake/python_protos.txt
@@ -0,0 +1,19 @@
+tensorflow/core
+tensorflow/core/profiler
+tensorflow/python
+tensorflow/contrib/boosted_trees/proto
+tensorflow/contrib/cloud/kernels
+tensorflow/contrib/decision_trees/proto
+tensorflow/contrib/gdr
+tensorflow/contrib/lite/toco
+tensorflow/contrib/mpi
+tensorflow/contrib/mpi_collectives
+tensorflow/contrib/session_bundle
+tensorflow/contrib/tensor_forest/proto
+tensorflow/contrib/tensorboard/graph_explorer/proto
+tensorflow/contrib/tensorboard/plugins/projector
+tensorflow/contrib/tensorboard/plugins/trace
+tensorflow/contrib/tpu/proto
+tensorflow/contrib/tpu/profiler
+tensorflow/contrib/training/python/training
+tensorflow/contrib/verbs
diff --git a/tensorflow/contrib/cmake/python_protos_cc.txt b/tensorflow/contrib/cmake/python_protos_cc.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d4a257b25c814a1464308d0e6ce3ce65d21f6a36
--- /dev/null
+++ b/tensorflow/contrib/cmake/python_protos_cc.txt
@@ -0,0 +1,5 @@
+tensorflow/core/profiler
+tensorflow/python
+tensorflow/contrib/session_bundle
+tensorflow/contrib/tensorboard
+tensorflow/contrib/training
diff --git a/tensorflow/contrib/cmake/python_sanity_test.py b/tensorflow/contrib/cmake/python_sanity_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0056823a80833329bcb1f275a3384a33127bb40
--- /dev/null
+++ b/tensorflow/contrib/cmake/python_sanity_test.py
@@ -0,0 +1,128 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Complain about invalid or missing entries in python_*.txt files.
+
+Problematic entries can be commented for temporary whitelisting.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import unittest
+
+
+def abs_path(path):
+  root = os.path.dirname(__file__)
+
+  for _ in range(3):
+    root = os.path.join(root, os.pardir)
+
+  path = os.path.join(root, path)
+  path = os.path.abspath(path)
+  return path
+
+
+def read_entries(test):
+  with open(abs_path(test.entries_file), "r") as f:
+    lines = f.readlines()
+
+  lines = [line.strip() for line in lines]
+  lines = [line for line in lines if line]
+
+  test.entries = []
+  test.whitelist = []
+
+  for line in lines:
+    # line is comment
+    if line.startswith("#"):
+      line = line[1:].strip()
+      # whitelist entry
+      if line.startswith("tensorflow/"):
+        test.whitelist.append(line)
+    # line has comment -> strip comment
+    elif line.find("#") != -1:
+      line = line[:line.find("#")].strip()
+      test.entries.append(line)
+    else:
+      test.entries.append(line)
+
+
+def test_invalid_directories(test):
+  for entry in test.entries:
+    if not os.path.isdir(abs_path(entry)):
+      problem = "'" + test.entries_file + "' contains invalid '" + entry + "'"
+      solution = ("Please remove the invalid entry (or add the missing "
+                  "directory).")
+      raise AssertionError(problem + "\n" + solution)
+
+
+def test_missing_directory(test, path):
+  if path in test.whitelist:
+    return
+
+  dir_exists = os.path.isdir(abs_path(path))
+  entry_exists = path in test.entries
+
+  if dir_exists and not entry_exists:
+    problem = "'" + test.entries_file + "' is missing '" + path + "'"
+    solution = "Please add the missing entry (comment to whitelist if needed)."
+    raise AssertionError(problem + "\n" + solution)
+
+
+class PythonModuleTest(unittest.TestCase):
+
+  def setUp(self):
+    self.entries_file = "tensorflow/contrib/cmake/python_modules.txt"
+    read_entries(self)
+
+  def testInvalidEntries(self):
+    test_invalid_directories(self)
+
+  def testMissingModules(self):
+    module_names = next(os.walk(abs_path("tensorflow/contrib")))[1]
+
+    for module_name in module_names:
+      path = "tensorflow/contrib/" + module_name
+
+      test_missing_directory(self, path + "/python")
+      test_missing_directory(self, path + "/python/ops")
+      test_missing_directory(self, path + "/python/kernels")
+      test_missing_directory(self, path + "/python/layers")
+
+
+class PythonProtoTest(unittest.TestCase):
+
+  def setUp(self):
+    self.entries_file = "tensorflow/contrib/cmake/python_protos.txt"
+    read_entries(self)
+
+  def testInvalidEntries(self):
+    test_invalid_directories(self)
+
+
+class PythonProtoCCTest(unittest.TestCase):
+
+  def setUp(self):
+    self.entries_file = "tensorflow/contrib/cmake/python_protos_cc.txt"
+    read_entries(self)
+
+  def testInvalidEntries(self):
+    test_invalid_directories(self)
+
+
+if __name__ == "__main__":
+  unittest.main()
diff --git a/tensorflow/contrib/cmake/tests/cuda/compatibility_test.c b/tensorflow/contrib/cmake/tests/cuda/compatibility_test.c
new file mode 100644
index 0000000000000000000000000000000000000000..968ab13a0c43793341431248713f81010c87f148
--- /dev/null
+++ b/tensorflow/contrib/cmake/tests/cuda/compatibility_test.c
@@ -0,0 +1,7 @@
+// This is a program to test if compiler is compatible with CUDA.
+#define __CUDACC__
+#include "crt/host_config.h"
+
+int main(void) {
+  return 0;
+}
diff --git a/tensorflow/contrib/cmake/tests/cuda/compatibility_test.cc b/tensorflow/contrib/cmake/tests/cuda/compatibility_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..968ab13a0c43793341431248713f81010c87f148
--- /dev/null
+++ b/tensorflow/contrib/cmake/tests/cuda/compatibility_test.cc
@@ -0,0 +1,7 @@
+// This is a program to test if compiler is compatible with CUDA.
+#define __CUDACC__
+#include "crt/host_config.h"
+
+int main(void) {
+  return 0;
+}
diff --git a/tensorflow/contrib/cmake/tf_cc_ops.cmake b/tensorflow/contrib/cmake/tf_cc_ops.cmake
index 6e2ac203f9a7f96cb14752a91483840a9eb6b451..f73da0b8ab18af1eca4c2bd577604595f8b8ec6d 100644
--- a/tensorflow/contrib/cmake/tf_cc_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_cc_ops.cmake
@@ -83,7 +83,7 @@ foreach(tf_cc_op_lib_name ${tf_cc_op_lib_names})
                ${cc_ops_target_dir}/${tf_cc_op_lib_name}.cc
                ${cc_ops_target_dir}/${tf_cc_op_lib_name}_internal.h
                ${cc_ops_target_dir}/${tf_cc_op_lib_name}_internal.cc
-        COMMAND ${tf_cc_op_lib_name}_gen_cc ${cc_ops_target_dir}/${tf_cc_op_lib_name}.h ${cc_ops_target_dir}/${tf_cc_op_lib_name}.cc ${tensorflow_source_dir}/tensorflow/cc/ops/op_gen_overrides.pbtxt ${cc_ops_include_internal} ${tensorflow_source_dir}/tensorflow/core/api_def/base_api
+        COMMAND ${tf_cc_op_lib_name}_gen_cc ${cc_ops_target_dir}/${tf_cc_op_lib_name}.h ${cc_ops_target_dir}/${tf_cc_op_lib_name}.cc ${cc_ops_include_internal} ${tensorflow_source_dir}/tensorflow/core/api_def/base_api
         DEPENDS ${tf_cc_op_lib_name}_gen_cc create_cc_ops_header_dir
     )
 
@@ -149,7 +149,11 @@ add_library(tf_cc OBJECT ${tf_cc_srcs})
 add_dependencies(tf_cc tf_cc_framework tf_cc_ops)
 
 if (WIN32)
-  set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib")
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib")
+  else()
+    set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib")
+  endif()
 else (WIN32)
   set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so")
 endif (WIN32)
diff --git a/tensorflow/contrib/cmake/tf_core_cpu.cmake b/tensorflow/contrib/cmake/tf_core_cpu.cmake
index 5c01ca382fb9cc7a01a6f2b60a510c59f0aa7119..96ac60d095dbc84470ff1be92f4bf52bb420fc52 100644
--- a/tensorflow/contrib/cmake/tf_core_cpu.cmake
+++ b/tensorflow/contrib/cmake/tf_core_cpu.cmake
@@ -50,6 +50,12 @@ file(GLOB_RECURSE tf_core_cpu_exclude_srcs
     "${tensorflow_source_dir}/tensorflow/core/graph/edgeset.cc"
     "${tensorflow_source_dir}/tensorflow/core/graph/graph.h"
     "${tensorflow_source_dir}/tensorflow/core/graph/graph.cc"
+    "${tensorflow_source_dir}/tensorflow/core/graph/graph_def_builder.h"
+    "${tensorflow_source_dir}/tensorflow/core/graph/graph_def_builder.cc"
+    "${tensorflow_source_dir}/tensorflow/core/graph/node_builder.h"
+    "${tensorflow_source_dir}/tensorflow/core/graph/node_builder.cc"
+    "${tensorflow_source_dir}/tensorflow/core/graph/tensor_id.h"
+    "${tensorflow_source_dir}/tensorflow/core/graph/tensor_id.cc"
     "${tensorflow_source_dir}/tensorflow/core/graph/while_context.h"
     "${tensorflow_source_dir}/tensorflow/core/graph/while_context.cc"
     "${tensorflow_source_dir}/tensorflow/core/grappler/clusters/single_machine.h"
@@ -63,7 +69,7 @@ if (tensorflow_ENABLE_GPU)
   file(GLOB_RECURSE tf_core_gpu_srcs
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/*.cc"
     "${tensorflow_source_dir}/tensorflow/core/platform/default/gpu/cupti_wrapper.cc"
-    "${tensorflow_source_dir}/tensorflow/core/platform/default/gpu_tracer.cc"
+    "${tensorflow_source_dir}/tensorflow/core/platform/default/device_tracer.cc"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu_device_factory.cc"
     "${tensorflow_source_dir}/tensorflow/core/grappler/devices.h"
     "${tensorflow_source_dir}/tensorflow/core/grappler/devices.cc"
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index c607546f4a5244fb6e7cd12db874f07a962f6f4d..a1c320347fe60f87806736befc677541a93e7e93 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -126,7 +126,9 @@ endfunction()
 file(GLOB_RECURSE tf_protos_cc_srcs RELATIVE ${tensorflow_source_dir}
     "${tensorflow_source_dir}/tensorflow/core/*.proto"
     "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/proto/*.proto"
+    "${tensorflow_source_dir}/tensorflow/contrib/tpu/proto/*.proto"
 )
+
 RELATIVE_PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS
     ${tensorflow_source_dir} ${tf_protos_cc_srcs}
 )
@@ -191,10 +193,6 @@ file(GLOB_RECURSE tf_core_lib_srcs
     "${tensorflow_source_dir}/tensorflow/core/lib/*.h"
     "${tensorflow_source_dir}/tensorflow/core/lib/*.cc"
     "${tensorflow_source_dir}/tensorflow/core/public/*.h"
-    # TODO(@jart): Move StatusOr into core.
-    "${tensorflow_source_dir}/tensorflow/compiler/xla/statusor.cc"
-    "${tensorflow_source_dir}/tensorflow/compiler/xla/statusor.h"
-    "${tensorflow_source_dir}/tensorflow/compiler/xla/statusor_internals.h"
 )
 
 file(GLOB tf_core_platform_srcs
@@ -211,7 +209,7 @@ if (NOT tensorflow_ENABLE_GPU)
   list(REMOVE_ITEM tf_core_platform_srcs ${tf_core_platform_gpu_srcs})
 else()
   file(GLOB tf_core_platform_srcs_exclude
-      "${tensorflow_source_dir}/tensorflow/core/platform/default/gpu_tracer.cc")
+      "${tensorflow_source_dir}/tensorflow/core/platform/default/device_tracer.cc")
   list(REMOVE_ITEM tf_core_platform_srcs ${tf_core_platform_srcs_exclude})
 endif()
 
@@ -294,6 +292,12 @@ file(GLOB_RECURSE tf_core_framework_srcs
     "${tensorflow_source_dir}/tensorflow/core/graph/edgeset.cc"
     "${tensorflow_source_dir}/tensorflow/core/graph/graph.h"
     "${tensorflow_source_dir}/tensorflow/core/graph/graph.cc"
+    "${tensorflow_source_dir}/tensorflow/core/graph/graph_def_builder.h"
+    "${tensorflow_source_dir}/tensorflow/core/graph/graph_def_builder.cc"
+    "${tensorflow_source_dir}/tensorflow/core/graph/node_builder.h"
+    "${tensorflow_source_dir}/tensorflow/core/graph/node_builder.cc"
+    "${tensorflow_source_dir}/tensorflow/core/graph/tensor_id.h"
+    "${tensorflow_source_dir}/tensorflow/core/graph/tensor_id.cc"
     "${tensorflow_source_dir}/tensorflow/core/graph/while_context.h"
     "${tensorflow_source_dir}/tensorflow/core/graph/while_context.cc"
     "${tensorflow_source_dir}/tensorflow/core/util/*.h"
@@ -317,8 +321,15 @@ file(GLOB_RECURSE tf_core_framework_exclude_srcs
     "${tensorflow_source_dir}/tensorflow/core/util/*test*.cc"
     "${tensorflow_source_dir}/tensorflow/core/util/*main.cc"
     "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/*test*.cc"
+    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/loader.cc"
+    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/vacuum.cc"
 )
 
+# TODO(jart): Why doesn't this work?
+# set_source_files_properties(
+#     ${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/snapfn.cc
+#     PROPERTIES COMPILE_FLAGS -DSQLITE_OMIT_LOAD_EXTENSION)
+
 list(REMOVE_ITEM tf_core_framework_srcs ${tf_core_framework_exclude_srcs})
 
 add_library(tf_core_framework OBJECT
diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake
index 2d015908a890fd7757bf212573f4ebce8ba8b30d..f219d5eb577afa9edaadca09aef9869c81d2bd87 100644
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@@ -63,10 +63,15 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
       "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/training_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops_util.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/coder/ops/coder_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/prefetching_kernels.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/data/ops/prefetching_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/data/ops/dataset_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/clustering_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/wals_solver_ops.cc"
@@ -79,12 +84,15 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
       "${tensorflow_source_dir}/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/image/kernels/bipartite_match_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/image/kernels/image_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/image/kernels/segmentation_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/image/ops/distort_image_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/image/ops/image_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/libsvm/kernels/decode_libsvm_op.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/libsvm/ops/libsvm_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_manager.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc"
@@ -150,9 +158,6 @@ list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_exclude_srcs})
 if(WIN32)
   file(GLOB_RECURSE tf_core_kernels_windows_exclude_srcs
       # not working on windows yet
-      "${tensorflow_source_dir}/tensorflow/core/kernels/meta_support.*"
-      "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.h"
-      "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.cc"
       "${tensorflow_source_dir}/tensorflow/core/kernels/neon/*"
       # not in core - those are loaded dynamically as dll
       "${tensorflow_source_dir}/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc"
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
index 4a61ed7a3548b1992ddc71acb8a7761e252296ea..59e094812aaf4da2549d96314fc550e5635f9de8 100644
--- a/tensorflow/contrib/cmake/tf_core_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_core_ops.cmake
@@ -15,6 +15,7 @@
 set(tf_op_lib_names
     "audio_ops"
     "array_ops"
+		"batch_ops"
     "bitwise_ops"
     "candidate_sampling_ops"
     "checkpoint_ops"
@@ -26,8 +27,10 @@ set(tf_op_lib_names
     "image_ops"
     "io_ops"
     "linalg_ops"
+		"list_ops"
     "lookup_ops"
     "logging_ops"
+    "manip_ops"
     "math_ops"
     "nn_ops"
     "no_op"
@@ -80,8 +83,9 @@ GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_training "${tensorflow_source_dir}/ten
 GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_prediction "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/prediction_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_quantiles "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_stats_accumulator "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc")
+GENERATE_CONTRIB_OP_LIBRARY(coder "${tensorflow_source_dir}/tensorflow/contrib/coder/ops/coder_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(cudnn_rnn "${tensorflow_source_dir}/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc")
-GENERATE_CONTRIB_OP_LIBRARY(data_prefetching "${tensorflow_source_dir}/tensorflow/contrib/data/ops/prefetching_ops.cc")
+GENERATE_CONTRIB_OP_LIBRARY(data_dataset "${tensorflow_source_dir}/tensorflow/contrib/data/ops/dataset_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(factorization_clustering "${tensorflow_source_dir}/tensorflow/contrib/factorization/ops/clustering_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(factorization_factorization "${tensorflow_source_dir}/tensorflow/contrib/factorization/ops/factorization_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(framework_variable "${tensorflow_source_dir}/tensorflow/contrib/framework/ops/variable_ops.cc")
@@ -92,6 +96,7 @@ GENERATE_CONTRIB_OP_LIBRARY(image_sirds "${tensorflow_source_dir}/tensorflow/con
 GENERATE_CONTRIB_OP_LIBRARY(layers_sparse_feature_cross "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc")
 GENERATE_CONTRIB_OP_LIBRARY(memory_stats "${tensorflow_source_dir}/tensorflow/contrib/memory_stats/ops/memory_stats_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(nccl "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc")
+GENERATE_CONTRIB_OP_LIBRARY(periodic_resample "${tensorflow_source_dir}/tensorflow/contrib/periodic_resample/ops/array_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(nearest_neighbor "${tensorflow_source_dir}/tensorflow/contrib/nearest_neighbor/ops/nearest_neighbor_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(resampler "${tensorflow_source_dir}/tensorflow/contrib/resampler/ops/resampler_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(rnn_gru "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/gru_ops.cc")
diff --git a/tensorflow/contrib/cmake/tf_core_profiler.cmake b/tensorflow/contrib/cmake/tf_core_profiler.cmake
index 61ed6a1e145299125d037b48b8b644cae1ce96e7..b91a7f43e5c03e933d10572e54e0c8c914c55f71 100644
--- a/tensorflow/contrib/cmake/tf_core_profiler.cmake
+++ b/tensorflow/contrib/cmake/tf_core_profiler.cmake
@@ -17,6 +17,8 @@
 ########################################################
 file(GLOB_RECURSE tf_core_profiler_srcs
     "${tensorflow_source_dir}/tensorflow/core/profiler/*.proto"
+    "${tensorflow_source_dir}/tensorflow/core/profiler/tfprof_options.h"
+    "${tensorflow_source_dir}/tensorflow/core/profiler/tfprof_options.cc"
     "${tensorflow_source_dir}/tensorflow/core/profiler/internal/*.h"
     "${tensorflow_source_dir}/tensorflow/core/profiler/internal/*.cc"
     "${tensorflow_source_dir}/tensorflow/core/profiler/internal/advisor/*.h"
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 0128946e45ea48f47a8be0df66e498bb0240de11..b730ebd3baacafe8ae401e8987104f3062372954 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -120,33 +120,46 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS HDRS ROOT_DIR)
   set(${HDRS} ${${HDRS}} PARENT_SCOPE)
 endfunction()
 
-file(GLOB_RECURSE tf_protos_python_srcs RELATIVE ${tensorflow_source_dir}
-    "${tensorflow_source_dir}/tensorflow/core/*.proto"
-    "${tensorflow_source_dir}/tensorflow/core/profiler/*.proto"
-    "${tensorflow_source_dir}/tensorflow/python/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/proto/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/decision_trees/proto/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/session_bundle/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/proto/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/tpu/proto/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/tpu/profiler/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/training/*.proto"
-)
+FILE(READ python_protos.txt python_protos)
+# Convert file contents into a CMake list (where each element in the list is one line of the file)
+STRING(REGEX REPLACE ";" "\\\\;" python_protos "${python_protos}")
+STRING(REGEX REPLACE "\n" ";" python_protos "${python_protos}")
+
+foreach(python_proto ${python_protos})
+  if(NOT python_proto MATCHES "^\#")
+    STRING(REGEX REPLACE " *\#.*" "" python_proto "${python_proto}")
+    if(NOT EXISTS "${tensorflow_source_dir}/${python_proto}")
+      message(SEND_ERROR "Python proto directory not found: ${python_proto}")
+    endif()
+    file(GLOB_RECURSE tf_python_protos_src RELATIVE ${tensorflow_source_dir}
+        "${tensorflow_source_dir}/${python_proto}/*.proto"
+    )
+    list(APPEND tf_python_protos_srcs ${tf_python_protos_src})
+  endif()
+endforeach(python_proto)
+
 RELATIVE_PROTOBUF_GENERATE_PYTHON(
-    ${tensorflow_source_dir} PYTHON_PROTO_GENFILES ${tf_protos_python_srcs}
+    ${tensorflow_source_dir} PYTHON_PROTO_GENFILES ${tf_python_protos_srcs}
 )
 
-# NOTE(mrry): Avoid regenerating the tensorflow/core protos because this
-# can cause benign-but-failing-on-Windows-due-to-file-locking conflicts
-# when two rules attempt to generate the same file.
-file(GLOB_RECURSE tf_python_protos_cc_srcs RELATIVE ${tensorflow_source_dir}
-    "${tensorflow_source_dir}/tensorflow/core/profiler/*.proto"
-    "${tensorflow_source_dir}/tensorflow/python/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/session_bundle/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/training/*.proto"
-)
+FILE(READ python_protos_cc.txt python_protos_cc)
+# Convert file contents into a CMake list (where each element in the list is one line of the file)
+STRING(REGEX REPLACE ";" "\\\\;" python_protos_cc "${python_protos_cc}")
+STRING(REGEX REPLACE "\n" ";" python_protos_cc "${python_protos_cc}")
+
+foreach(python_proto_cc ${python_protos_cc})
+  if(NOT python_proto_cc MATCHES "^\#")
+    STRING(REGEX REPLACE " *\#.*" "" python_proto_cc "${python_proto_cc}")
+    if(NOT EXISTS "${tensorflow_source_dir}/${python_proto_cc}")
+      message(SEND_ERROR "Python proto CC directory not found: ${python_proto_cc}")
+    endif()
+    file(GLOB_RECURSE tf_python_protos_cc_src RELATIVE ${tensorflow_source_dir}
+        "${tensorflow_source_dir}/${python_proto_cc}/*.proto"
+    )
+    list(APPEND tf_python_protos_cc_srcs ${tf_python_protos_cc_src})
+  endif()
+endforeach(python_proto_cc)
+
 RELATIVE_PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS
     ${tensorflow_source_dir} ${tf_python_protos_cc_srcs}
 )
@@ -192,315 +205,21 @@ function(add_python_module MODULE_NAME)
     endif()
 endfunction()
 
-add_python_module("tensorflow")
-add_python_module("tensorflow/core")
-add_python_module("tensorflow/core/example")
-add_python_module("tensorflow/core/framework")
-add_python_module("tensorflow/core/lib")
-add_python_module("tensorflow/core/lib/core")
-add_python_module("tensorflow/core/protobuf")
-add_python_module("tensorflow/core/util")
-add_python_module("tensorflow/examples")
-add_python_module("tensorflow/examples/tutorials")
-add_python_module("tensorflow/examples/tutorials/mnist")
-add_python_module("tensorflow/python")
-add_python_module("tensorflow/python/client")
-add_python_module("tensorflow/python/data")
-add_python_module("tensorflow/python/data/ops")
-add_python_module("tensorflow/python/data/util")
-add_python_module("tensorflow/python/debug")
-add_python_module("tensorflow/python/debug/cli")
-add_python_module("tensorflow/python/debug/examples")
-add_python_module("tensorflow/python/debug/lib")
-add_python_module("tensorflow/python/debug/wrappers")
-add_python_module("tensorflow/python/eager")
-add_python_module("tensorflow/python/estimator")
-add_python_module("tensorflow/python/estimator/canned")
-add_python_module("tensorflow/python/estimator/export")
-add_python_module("tensorflow/python/estimator/inputs")
-add_python_module("tensorflow/python/estimator/inputs/queues")
-add_python_module("tensorflow/python/feature_column")
-add_python_module("tensorflow/python/framework")
-add_python_module("tensorflow/python/grappler")
-add_python_module("tensorflow/python/keras")
-add_python_module("tensorflow/python/keras/activations")
-add_python_module("tensorflow/python/keras/applications")
-add_python_module("tensorflow/python/keras/applications/inception_resnet_v2")
-add_python_module("tensorflow/python/keras/applications/inception_v3")
-add_python_module("tensorflow/python/keras/applications/mobilenet")
-add_python_module("tensorflow/python/keras/applications/resnet50")
-add_python_module("tensorflow/python/keras/applications/vgg16")
-add_python_module("tensorflow/python/keras/applications/vgg19")
-add_python_module("tensorflow/python/keras/applications/xception")
-add_python_module("tensorflow/python/keras/backend")
-add_python_module("tensorflow/python/keras/callbacks")
-add_python_module("tensorflow/python/keras/constraints")
-add_python_module("tensorflow/python/keras/datasets")
-add_python_module("tensorflow/python/keras/datasets/boston_housing")
-add_python_module("tensorflow/python/keras/datasets/cifar10")
-add_python_module("tensorflow/python/keras/datasets/cifar100")
-add_python_module("tensorflow/python/keras/datasets/fashion_mnist")
-add_python_module("tensorflow/python/keras/datasets/imdb")
-add_python_module("tensorflow/python/keras/datasets/mnist")
-add_python_module("tensorflow/python/keras/datasets/reuters")
-add_python_module("tensorflow/python/keras/estimator")
-add_python_module("tensorflow/python/keras/initializers")
-add_python_module("tensorflow/python/keras/layers")
-add_python_module("tensorflow/python/keras/losses")
-add_python_module("tensorflow/python/keras/metrics")
-add_python_module("tensorflow/python/keras/models")
-add_python_module("tensorflow/python/keras/optimizers")
-add_python_module("tensorflow/python/keras/preprocessing")
-add_python_module("tensorflow/python/keras/preprocessing/image")
-add_python_module("tensorflow/python/keras/preprocessing/sequence")
-add_python_module("tensorflow/python/keras/preprocessing/text")
-add_python_module("tensorflow/python/keras/regularizers")
-add_python_module("tensorflow/python/keras/utils")
-add_python_module("tensorflow/python/keras/wrappers")
-add_python_module("tensorflow/python/keras/wrappers/scikit_learn")
-add_python_module("tensorflow/python/keras/_impl")
-add_python_module("tensorflow/python/keras/_impl/keras")
-add_python_module("tensorflow/python/keras/_impl/keras/applications")
-add_python_module("tensorflow/python/keras/_impl/keras/datasets")
-add_python_module("tensorflow/python/keras/_impl/keras/engine")
-add_python_module("tensorflow/python/keras/_impl/keras/layers")
-add_python_module("tensorflow/python/keras/_impl/keras/preprocessing")
-add_python_module("tensorflow/python/keras/_impl/keras/utils")
-add_python_module("tensorflow/python/keras/_impl/keras/wrappers")
-add_python_module("tensorflow/python/kernel_tests")
-add_python_module("tensorflow/python/kernel_tests/distributions")
-add_python_module("tensorflow/python/kernel_tests/linalg")
-add_python_module("tensorflow/python/layers")
-add_python_module("tensorflow/python/lib")
-add_python_module("tensorflow/python/lib/core")
-add_python_module("tensorflow/python/lib/io")
-add_python_module("tensorflow/python/ops")
-add_python_module("tensorflow/python/ops/distributions")
-add_python_module("tensorflow/python/ops/linalg")
-add_python_module("tensorflow/python/ops/losses")
-add_python_module("tensorflow/python/platform")
-add_python_module("tensorflow/python/platform/default")
-add_python_module("tensorflow/python/platform/summary")
-add_python_module("tensorflow/python/profiler/")
-add_python_module("tensorflow/python/profiler/internal")
-add_python_module("tensorflow/python/saved_model")
-add_python_module("tensorflow/python/summary")
-add_python_module("tensorflow/python/summary/writer")
-add_python_module("tensorflow/python/tools")
-add_python_module("tensorflow/python/training")
-add_python_module("tensorflow/python/user_ops")
-add_python_module("tensorflow/python/util")
-add_python_module("tensorflow/python/util/protobuf")
-add_python_module("tensorflow/tools")
-add_python_module("tensorflow/tools/graph_transforms")
-add_python_module("tensorflow/contrib")
-add_python_module("tensorflow/contrib/all_reduce")
-add_python_module("tensorflow/contrib/all_reduce/python")
-add_python_module("tensorflow/contrib/android")
-add_python_module("tensorflow/contrib/android/java")
-add_python_module("tensorflow/contrib/android/java/org")
-add_python_module("tensorflow/contrib/android/java/org/tensorflow")
-add_python_module("tensorflow/contrib/android/java/org/tensorflow/contrib")
-add_python_module("tensorflow/contrib/android/java/org/tensorflow/contrib/android")
-add_python_module("tensorflow/contrib/android/jni")
-add_python_module("tensorflow/contrib/bayesflow")
-add_python_module("tensorflow/contrib/bayesflow/examples")
-add_python_module("tensorflow/contrib/bayesflow/examples/reinforce_simple")
-add_python_module("tensorflow/contrib/bayesflow/python")
-add_python_module("tensorflow/contrib/bayesflow/python/kernel_tests")
-add_python_module("tensorflow/contrib/bayesflow/python/ops")
-add_python_module("tensorflow/contrib/boosted_trees")
-add_python_module("tensorflow/contrib/boosted_trees/estimator_batch")
-add_python_module("tensorflow/contrib/boosted_trees/ops")
-add_python_module("tensorflow/contrib/boosted_trees/proto")
-add_python_module("tensorflow/contrib/boosted_trees/python")
-add_python_module("tensorflow/contrib/boosted_trees/python/kernel_tests")
-add_python_module("tensorflow/contrib/boosted_trees/python/ops")
-add_python_module("tensorflow/contrib/cloud")
-add_python_module("tensorflow/contrib/cloud/kernels")
-add_python_module("tensorflow/contrib/cloud/ops")
-add_python_module("tensorflow/contrib/cloud/python")
-add_python_module("tensorflow/contrib/cloud/python/ops")
-add_python_module("tensorflow/contrib/cluster_resolver")
-add_python_module("tensorflow/contrib/cluster_resolver/python")
-add_python_module("tensorflow/contrib/cluster_resolver/python/training")
-add_python_module("tensorflow/contrib/compiler")
-add_python_module("tensorflow/contrib/copy_graph")
-add_python_module("tensorflow/contrib/copy_graph/python")
-add_python_module("tensorflow/contrib/copy_graph/python/util")
-add_python_module("tensorflow/contrib/crf")
-add_python_module("tensorflow/contrib/crf/python")
-add_python_module("tensorflow/contrib/crf/python/kernel_tests")
-add_python_module("tensorflow/contrib/crf/python/ops")
-add_python_module("tensorflow/contrib/cudnn_rnn")
-add_python_module("tensorflow/contrib/cudnn_rnn/kernels")
-add_python_module("tensorflow/contrib/cudnn_rnn/ops")
-add_python_module("tensorflow/contrib/cudnn_rnn/python")
-add_python_module("tensorflow/contrib/cudnn_rnn/python/kernel_tests")
-add_python_module("tensorflow/contrib/cudnn_rnn/python/layers")
-add_python_module("tensorflow/contrib/cudnn_rnn/python/ops")
-add_python_module("tensorflow/contrib/data")
-add_python_module("tensorflow/contrib/data/python")
-add_python_module("tensorflow/contrib/data/python/kernel_tests")
-add_python_module("tensorflow/contrib/data/python/ops")
-add_python_module("tensorflow/contrib/decision_trees")
-add_python_module("tensorflow/contrib/decision_trees/proto")
-add_python_module("tensorflow/contrib/deprecated")
-add_python_module("tensorflow/contrib/distributions")
-add_python_module("tensorflow/contrib/distributions/python")
-add_python_module("tensorflow/contrib/distributions/python/kernel_tests")
-add_python_module("tensorflow/contrib/distributions/python/ops")
-add_python_module("tensorflow/contrib/distributions/python/ops/bijectors")
-add_python_module("tensorflow/contrib/eager")
-add_python_module("tensorflow/contrib/eager/python")
-add_python_module("tensorflow/contrib/estimator")
-add_python_module("tensorflow/contrib/estimator/python")
-add_python_module("tensorflow/contrib/estimator/python/estimator")
-add_python_module("tensorflow/contrib/factorization")
-add_python_module("tensorflow/contrib/factorization/examples")
-add_python_module("tensorflow/contrib/factorization/kernels")
-add_python_module("tensorflow/contrib/factorization/ops")
-add_python_module("tensorflow/contrib/factorization/python")
-add_python_module("tensorflow/contrib/factorization/python/kernel_tests")
-add_python_module("tensorflow/contrib/factorization/python/ops")
-add_python_module("tensorflow/contrib/ffmpeg")
-add_python_module("tensorflow/contrib/ffmpeg/default")
-add_python_module("tensorflow/contrib/ffmpeg/testdata")
-add_python_module("tensorflow/contrib/framework")
-add_python_module("tensorflow/contrib/framework/kernels")
-add_python_module("tensorflow/contrib/framework/ops")
-add_python_module("tensorflow/contrib/framework/python")
-add_python_module("tensorflow/contrib/framework/python/framework")
-add_python_module("tensorflow/contrib/framework/python/ops")
-add_python_module("tensorflow/contrib/gan")
-add_python_module("tensorflow/contrib/gan/python")
-add_python_module("tensorflow/contrib/gan/python/eval")
-add_python_module("tensorflow/contrib/gan/python/eval/python")
-add_python_module("tensorflow/contrib/gan/python/features")
-add_python_module("tensorflow/contrib/gan/python/features/python")
-add_python_module("tensorflow/contrib/gan/python/estimator")
-add_python_module("tensorflow/contrib/gan/python/estimator/python")
-add_python_module("tensorflow/contrib/gan/python/losses")
-add_python_module("tensorflow/contrib/gan/python/losses/python")
-add_python_module("tensorflow/contrib/graph_editor")
-add_python_module("tensorflow/contrib/graph_editor/examples")
-add_python_module("tensorflow/contrib/graph_editor/tests")
-add_python_module("tensorflow/contrib/grid_rnn")
-add_python_module("tensorflow/contrib/grid_rnn/python")
-add_python_module("tensorflow/contrib/grid_rnn/python/kernel_tests")
-add_python_module("tensorflow/contrib/grid_rnn/python/ops")
-add_python_module("tensorflow/contrib/hooks")
-add_python_module("tensorflow/contrib/image")
-add_python_module("tensorflow/contrib/image/ops")
-add_python_module("tensorflow/contrib/image/python")
-add_python_module("tensorflow/contrib/image/python/ops")
-add_python_module("tensorflow/contrib/input_pipeline")
-add_python_module("tensorflow/contrib/input_pipeline/ops")
-add_python_module("tensorflow/contrib/input_pipeline/python")
-add_python_module("tensorflow/contrib/input_pipeline/python/ops")
-add_python_module("tensorflow/contrib/integrate")
-add_python_module("tensorflow/contrib/integrate/python")
-add_python_module("tensorflow/contrib/integrate/python/ops")
-add_python_module("tensorflow/contrib/ios_examples")
-add_python_module("tensorflow/contrib/ios_examples/benchmark")
-add_python_module("tensorflow/contrib/ios_examples/benchmark/benchmark.xcodeproj")
-add_python_module("tensorflow/contrib/ios_examples/benchmark/data")
-add_python_module("tensorflow/contrib/ios_examples/camera")
-add_python_module("tensorflow/contrib/ios_examples/camera/camera_example.xcodeproj")
-add_python_module("tensorflow/contrib/ios_examples/camera/en.lproj")
-add_python_module("tensorflow/contrib/ios_examples/simple")
-add_python_module("tensorflow/contrib/ios_examples/simple/data")
-add_python_module("tensorflow/contrib/ios_examples/simple/tf_ios_makefile_example.xcodeproj")
-add_python_module("tensorflow/contrib/keras")
-add_python_module("tensorflow/contrib/keras/api")
-add_python_module("tensorflow/contrib/keras/api/keras")
-add_python_module("tensorflow/contrib/keras/api/keras/activations")
-add_python_module("tensorflow/contrib/keras/api/keras/applications")
-add_python_module("tensorflow/contrib/keras/api/keras/applications/inception_v3")
-add_python_module("tensorflow/contrib/keras/api/keras/applications/mobilenet")
-add_python_module("tensorflow/contrib/keras/api/keras/applications/resnet50")
-add_python_module("tensorflow/contrib/keras/api/keras/applications/vgg16")
-add_python_module("tensorflow/contrib/keras/api/keras/applications/vgg19")
-add_python_module("tensorflow/contrib/keras/api/keras/applications/xception")
-add_python_module("tensorflow/contrib/keras/api/keras/backend")
-add_python_module("tensorflow/contrib/keras/api/keras/callbacks")
-add_python_module("tensorflow/contrib/keras/api/keras/constraints")
-add_python_module("tensorflow/contrib/keras/api/keras/datasets")
-add_python_module("tensorflow/contrib/keras/api/keras/datasets/boston_housing")
-add_python_module("tensorflow/contrib/keras/api/keras/datasets/cifar10")
-add_python_module("tensorflow/contrib/keras/api/keras/datasets/cifar100")
-add_python_module("tensorflow/contrib/keras/api/keras/datasets/imdb")
-add_python_module("tensorflow/contrib/keras/api/keras/datasets/mnist")
-add_python_module("tensorflow/contrib/keras/api/keras/datasets/reuters")
-add_python_module("tensorflow/contrib/keras/api/keras/initializers")
-add_python_module("tensorflow/contrib/keras/api/keras/layers")
-add_python_module("tensorflow/contrib/keras/api/keras/losses")
-add_python_module("tensorflow/contrib/keras/api/keras/metrics")
-add_python_module("tensorflow/contrib/keras/api/keras/models")
-add_python_module("tensorflow/contrib/keras/api/keras/optimizers")
-add_python_module("tensorflow/contrib/keras/api/keras/preprocessing")
-add_python_module("tensorflow/contrib/keras/api/keras/preprocessing/image")
-add_python_module("tensorflow/contrib/keras/api/keras/preprocessing/sequence")
-add_python_module("tensorflow/contrib/keras/api/keras/preprocessing/text")
-add_python_module("tensorflow/contrib/keras/api/keras/regularizers")
-add_python_module("tensorflow/contrib/keras/api/keras/utils")
-add_python_module("tensorflow/contrib/keras/api/keras/wrappers")
-add_python_module("tensorflow/contrib/keras/api/keras/wrappers/scikit_learn")
-add_python_module("tensorflow/contrib/keras/python")
-add_python_module("tensorflow/contrib/keras/python/keras")
-add_python_module("tensorflow/contrib/keras/python/keras/applications")
-add_python_module("tensorflow/contrib/keras/python/keras/datasets")
-add_python_module("tensorflow/contrib/keras/python/keras/engine")
-add_python_module("tensorflow/contrib/keras/python/keras/layers")
-add_python_module("tensorflow/contrib/keras/python/keras/preprocessing")
-add_python_module("tensorflow/contrib/keras/python/keras/utils")
-add_python_module("tensorflow/contrib/keras/python/keras/wrappers")
-add_python_module("tensorflow/contrib/kernel_methods")
-add_python_module("tensorflow/contrib/kernel_methods/python")
-add_python_module("tensorflow/contrib/kernel_methods/python/mappers")
-add_python_module("tensorflow/contrib/kfac")
-add_python_module("tensorflow/contrib/kfac/examples")
-add_python_module("tensorflow/contrib/kfac/python")
-add_python_module("tensorflow/contrib/kfac/python/ops")
-add_python_module("tensorflow/contrib/labeled_tensor")
-add_python_module("tensorflow/contrib/labeled_tensor/python")
-add_python_module("tensorflow/contrib/labeled_tensor/python/ops")
-add_python_module("tensorflow/contrib/layers")
-add_python_module("tensorflow/contrib/layers/kernels")
-add_python_module("tensorflow/contrib/layers/ops")
-add_python_module("tensorflow/contrib/layers/python")
-add_python_module("tensorflow/contrib/layers/python/kernel_tests")
-add_python_module("tensorflow/contrib/layers/python/layers")
-add_python_module("tensorflow/contrib/layers/python/ops")
-add_python_module("tensorflow/contrib/learn")
-add_python_module("tensorflow/contrib/learn/python")
-add_python_module("tensorflow/contrib/learn/python/learn")
-add_python_module("tensorflow/contrib/learn/python/learn/dataframe")
-add_python_module("tensorflow/contrib/learn/python/learn/dataframe/queues")
-add_python_module("tensorflow/contrib/learn/python/learn/dataframe/transforms")
-add_python_module("tensorflow/contrib/learn/python/learn/datasets")
-add_python_module("tensorflow/contrib/learn/python/learn/datasets/data")
-add_python_module("tensorflow/contrib/learn/python/learn/estimators")
-add_python_module("tensorflow/contrib/learn/python/learn/learn_io")
-add_python_module("tensorflow/contrib/learn/python/learn/ops")
-add_python_module("tensorflow/contrib/learn/python/learn/preprocessing")
-add_python_module("tensorflow/contrib/learn/python/learn/preprocessing/tests")
-add_python_module("tensorflow/contrib/learn/python/learn/tests")
-add_python_module("tensorflow/contrib/learn/python/learn/tests/dataframe")
-add_python_module("tensorflow/contrib/learn/python/learn/utils")
-add_python_module("tensorflow/contrib/legacy_seq2seq")
-add_python_module("tensorflow/contrib/legacy_seq2seq/python")
-add_python_module("tensorflow/contrib/legacy_seq2seq/python/ops")
-add_python_module("tensorflow/contrib/linalg")
-add_python_module("tensorflow/contrib/linalg/python")
-add_python_module("tensorflow/contrib/linalg/python/ops")
-add_python_module("tensorflow/contrib/linalg/python/kernel_tests")
-add_python_module("tensorflow/contrib/linear_optimizer")
-add_python_module("tensorflow/contrib/linear_optimizer/kernels")
-add_python_module("tensorflow/contrib/linear_optimizer/kernels/g3doc")
-add_python_module("tensorflow/contrib/linear_optimizer/python")
-add_python_module("tensorflow/contrib/linear_optimizer/python/kernel_tests")
-add_python_module("tensorflow/contrib/linear_optimizer/python/ops")
+FILE(READ python_modules.txt python_modules)
+# Convert file contents into a CMake list (where each element in the list is one line of the file)
+STRING(REGEX REPLACE ";" "\\\\;" python_modules "${python_modules}")
+STRING(REGEX REPLACE "\n" ";" python_modules "${python_modules}")
+
+foreach(python_module ${python_modules})
+  if(NOT python_module MATCHES "^\#")
+    STRING(REGEX REPLACE " *\#.*" "" python_module "${python_module}")
+    if(NOT EXISTS "${tensorflow_source_dir}/${python_module}")
+      message(SEND_ERROR "Python module not found: ${python_module}")
+    endif()
+    add_python_module(${python_module})
+  endif()
+endforeach(python_module)
+
 add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD
     COMMAND ${CMAKE_COMMAND} -E make_directory
     "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/lite")
@@ -514,157 +233,6 @@ add_custom_command(
     TARGET tf_python_copy_scripts_to_destination PRE_BUILD
     COMMAND ${CMAKE_COMMAND} -E touch
     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/lite/python/lite.py)
-add_python_module("tensorflow/contrib/lookup")
-add_python_module("tensorflow/contrib/losses")
-add_python_module("tensorflow/contrib/losses/python")
-add_python_module("tensorflow/contrib/losses/python/losses")
-add_python_module("tensorflow/contrib/losses/python/metric_learning")
-add_python_module("tensorflow/contrib/makefile")
-add_python_module("tensorflow/contrib/makefile/test")
-add_python_module("tensorflow/contrib/memory_stats")
-add_python_module("tensorflow/contrib/memory_stats/kernels")
-add_python_module("tensorflow/contrib/memory_stats/ops")
-add_python_module("tensorflow/contrib/memory_stats/python")
-add_python_module("tensorflow/contrib/memory_stats/python/kernel_tests")
-add_python_module("tensorflow/contrib/memory_stats/python/ops")
-add_python_module("tensorflow/contrib/meta_graph_transform")
-add_python_module("tensorflow/contrib/metrics")
-add_python_module("tensorflow/contrib/metrics/kernels")
-add_python_module("tensorflow/contrib/metrics/ops")
-add_python_module("tensorflow/contrib/metrics/python")
-add_python_module("tensorflow/contrib/metrics/python/kernel_tests")
-add_python_module("tensorflow/contrib/metrics/python/metrics")
-add_python_module("tensorflow/contrib/metrics/python/ops")
-add_python_module("tensorflow/contrib/model_pruning")
-add_python_module("tensorflow/contrib/model_pruning/examples")
-add_python_module("tensorflow/contrib/model_pruning/examples/cifar10")
-add_python_module("tensorflow/contrib/model_pruning/python")
-add_python_module("tensorflow/contrib/model_pruning/python/layers")
-add_python_module("tensorflow/contrib/ndlstm")
-add_python_module("tensorflow/contrib/ndlstm/python")
-add_python_module("tensorflow/contrib/nn")
-add_python_module("tensorflow/contrib/nn/python")
-add_python_module("tensorflow/contrib/nn/python/ops")
-add_python_module("tensorflow/contrib/nccl")
-add_python_module("tensorflow/contrib/nccl/kernels")
-add_python_module("tensorflow/contrib/nccl/ops")
-add_python_module("tensorflow/contrib/nccl/python")
-add_python_module("tensorflow/contrib/nccl/python/ops")
-add_python_module("tensorflow/contrib/nearest_neighbor/kernels")
-add_python_module("tensorflow/contrib/nearest_neighbor/ops")
-add_python_module("tensorflow/contrib/nearest_neighbor/python")
-add_python_module("tensorflow/contrib/nearest_neighbor/python/kernel_tests")
-add_python_module("tensorflow/contrib/nearest_neighbor/python/ops")
-add_python_module("tensorflow/contrib/opt")
-add_python_module("tensorflow/contrib/opt/python")
-add_python_module("tensorflow/contrib/opt/python/training")
-add_python_module("tensorflow/contrib/pi_examples")
-add_python_module("tensorflow/contrib/pi_examples/camera")
-add_python_module("tensorflow/contrib/pi_examples/label_image")
-add_python_module("tensorflow/contrib/pi_examples/label_image/data")
-add_python_module("tensorflow/contrib/predictor")
-add_python_module("tensorflow/contrib/quantization")
-add_python_module("tensorflow/contrib/quantization/python")
-add_python_module("tensorflow/contrib/quantize")
-add_python_module("tensorflow/contrib/quantize/python")
-add_python_module("tensorflow/contrib/remote_fused_graph/pylib")
-add_python_module("tensorflow/contrib/remote_fused_graph/pylib/python")
-add_python_module("tensorflow/contrib/remote_fused_graph/pylib/python/ops")
-add_python_module("tensorflow/contrib/resampler")
-add_python_module("tensorflow/contrib/resampler/kernels")
-add_python_module("tensorflow/contrib/resampler/ops")
-add_python_module("tensorflow/contrib/resampler/python")
-add_python_module("tensorflow/contrib/resampler/python/ops")
-add_python_module("tensorflow/contrib/rnn")
-add_python_module("tensorflow/contrib/rnn/kernels")
-add_python_module("tensorflow/contrib/rnn/ops")
-add_python_module("tensorflow/contrib/rnn/python")
-add_python_module("tensorflow/contrib/rnn/python/kernel_tests")
-add_python_module("tensorflow/contrib/rnn/python/ops")
-add_python_module("tensorflow/contrib/saved_model")
-add_python_module("tensorflow/contrib/saved_model/python")
-add_python_module("tensorflow/contrib/saved_model/python/saved_model")
-add_python_module("tensorflow/contrib/seq2seq")
-add_python_module("tensorflow/contrib/seq2seq/kernels")
-add_python_module("tensorflow/contrib/seq2seq/ops")
-add_python_module("tensorflow/contrib/seq2seq/python")
-add_python_module("tensorflow/contrib/seq2seq/python/kernel_tests")
-add_python_module("tensorflow/contrib/seq2seq/python/ops")
-add_python_module("tensorflow/contrib/session_bundle")
-add_python_module("tensorflow/contrib/session_bundle/example")
-add_python_module("tensorflow/contrib/session_bundle/testdata")
-add_python_module("tensorflow/contrib/signal")
-add_python_module("tensorflow/contrib/signal/python")
-add_python_module("tensorflow/contrib/signal/python/ops")
-add_python_module("tensorflow/contrib/slim")
-add_python_module("tensorflow/contrib/slim/python")
-add_python_module("tensorflow/contrib/slim/python/slim")
-add_python_module("tensorflow/contrib/slim/python/slim/data")
-add_python_module("tensorflow/contrib/slim/python/slim/nets")
-add_python_module("tensorflow/contrib/solvers")
-add_python_module("tensorflow/contrib/solvers/python")
-add_python_module("tensorflow/contrib/solvers/python/ops")
-add_python_module("tensorflow/contrib/sparsemax")
-add_python_module("tensorflow/contrib/sparsemax/python")
-add_python_module("tensorflow/contrib/sparsemax/python/ops")
-add_python_module("tensorflow/contrib/specs")
-add_python_module("tensorflow/contrib/specs/python")
-add_python_module("tensorflow/contrib/staging")
-add_python_module("tensorflow/contrib/stat_summarizer")
-add_python_module("tensorflow/contrib/stateless")
-add_python_module("tensorflow/contrib/tensorboard")
-add_python_module("tensorflow/contrib/tensorboard/plugins")
-add_python_module("tensorflow/contrib/tensorboard/plugins/projector")
-add_python_module("tensorflow/contrib/tensor_forest")
-add_python_module("tensorflow/contrib/tensor_forest/client")
-add_python_module("tensorflow/contrib/tensor_forest/core")
-add_python_module("tensorflow/contrib/tensor_forest/core/ops")
-add_python_module("tensorflow/contrib/tensor_forest/data")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/core")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/core/ops")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/ops")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/python")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/python/kernel_tests")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/python/layers")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/python/models")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/python/ops")
-add_python_module("tensorflow/contrib/tensor_forest/python")
-add_python_module("tensorflow/contrib/tensor_forest/python/kernel_tests")
-add_python_module("tensorflow/contrib/tensor_forest/python/ops")
-add_python_module("tensorflow/contrib/testing")
-add_python_module("tensorflow/contrib/testing/python")
-add_python_module("tensorflow/contrib/testing/python/framework")
-add_python_module("tensorflow/contrib/text")
-add_python_module("tensorflow/contrib/text/kernels")
-add_python_module("tensorflow/contrib/text/ops")
-add_python_module("tensorflow/contrib/text/python")
-add_python_module("tensorflow/contrib/text/python/ops")
-add_python_module("tensorflow/contrib/tfprof")
-add_python_module("tensorflow/contrib/timeseries")
-add_python_module("tensorflow/contrib/timeseries/examples")
-add_python_module("tensorflow/contrib/timeseries/examples/data")
-add_python_module("tensorflow/contrib/timeseries/python")
-add_python_module("tensorflow/contrib/timeseries/python/timeseries")
-add_python_module("tensorflow/contrib/timeseries/python/timeseries/state_space_models")
-add_python_module("tensorflow/contrib/tpu")
-add_python_module("tensorflow/contrib/tpu/ops")
-add_python_module("tensorflow/contrib/tpu/profiler")
-add_python_module("tensorflow/contrib/tpu/python")
-add_python_module("tensorflow/contrib/tpu/python/ops")
-add_python_module("tensorflow/contrib/tpu/python/profiler")
-add_python_module("tensorflow/contrib/tpu/python/tpu")
-add_python_module("tensorflow/contrib/training")
-add_python_module("tensorflow/contrib/training/python")
-add_python_module("tensorflow/contrib/training/python/training")
-add_python_module("tensorflow/contrib/util")
-add_python_module("tensorflow/contrib/reduce_slice_ops")
-add_python_module("tensorflow/contrib/reduce_slice_ops/kernels")
-add_python_module("tensorflow/contrib/reduce_slice_ops/ops")
-add_python_module("tensorflow/contrib/reduce_slice_ops/python")
-add_python_module("tensorflow/contrib/reduce_slice_ops/python/kernel_tests")
-add_python_module("tensorflow/contrib/reduce_slice_ops/python/ops")
-add_python_module("tensorflow/contrib/summary")
 
 # Generate the tensorflow.python.platform.build_info module.
 set(BUILD_INFO_PY "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/platform/build_info.py")
@@ -739,7 +307,7 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name)
     # containing the wrappers.
     add_custom_command(
       OUTPUT ${GENERATE_PYTHON_OP_LIB_DESTINATION}
-      COMMAND ${tf_python_op_lib_name}_gen_python @${tensorflow_source_dir}/tensorflow/python/ops/hidden_ops.txt ${require_shape_fn} > ${GENERATE_PYTHON_OP_LIB_DESTINATION}
+      COMMAND ${tf_python_op_lib_name}_gen_python ${tensorflow_source_dir}/tensorflow/core/api_def/base_api,${tensorflow_source_dir}/tensorflow/core/api_def/python_api ${require_shape_fn} > ${GENERATE_PYTHON_OP_LIB_DESTINATION}
       DEPENDS ${tf_python_op_lib_name}_gen_python
     )
 
@@ -749,6 +317,7 @@ endfunction()
 
 GENERATE_PYTHON_OP_LIB("audio_ops")
 GENERATE_PYTHON_OP_LIB("array_ops")
+GENERATE_PYTHON_OP_LIB("batch_ops")
 GENERATE_PYTHON_OP_LIB("bitwise_ops")
 GENERATE_PYTHON_OP_LIB("math_ops")
 GENERATE_PYTHON_OP_LIB("functional_ops")
@@ -762,9 +331,11 @@ GENERATE_PYTHON_OP_LIB("dataset_ops")
 GENERATE_PYTHON_OP_LIB("image_ops")
 GENERATE_PYTHON_OP_LIB("io_ops")
 GENERATE_PYTHON_OP_LIB("linalg_ops")
+GENERATE_PYTHON_OP_LIB("list_ops")
 GENERATE_PYTHON_OP_LIB("logging_ops")
 GENERATE_PYTHON_OP_LIB("lookup_ops")
 GENERATE_PYTHON_OP_LIB("nn_ops")
+GENERATE_PYTHON_OP_LIB("manip_ops")
 GENERATE_PYTHON_OP_LIB("parsing_ops")
 GENERATE_PYTHON_OP_LIB("random_ops")
 GENERATE_PYTHON_OP_LIB("remote_fused_graph_ops"
@@ -793,10 +364,12 @@ GENERATE_PYTHON_OP_LIB("contrib_boosted_trees_quantiles_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/boosted_trees/python/ops/gen_quantile_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_boosted_trees_stats_accumulator_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/boosted_trees/python/ops/gen_stats_accumulator_ops.py)
+GENERATE_PYTHON_OP_LIB("contrib_coder_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/coder/python/ops/gen_coder_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_cudnn_rnn_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/cudnn_rnn/ops/gen_cudnn_rnn_ops.py)
-GENERATE_PYTHON_OP_LIB("contrib_data_prefetching_ops"
-  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/data/python/ops/gen_prefetching_ops.py)
+GENERATE_PYTHON_OP_LIB("contrib_data_dataset_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/data/python/ops/gen_dataset_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_factorization_clustering_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/factorization/python/ops/gen_clustering_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_factorization_factorization_ops"
@@ -817,6 +390,9 @@ GENERATE_PYTHON_OP_LIB("contrib_memory_stats_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/memory_stats/ops/gen_memory_stats_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_nccl_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/nccl/ops/gen_nccl_ops.py)
+GENERATE_PYTHON_OP_LIB("contrib_periodic_resample_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/periodic_resample/python/ops/gen_periodic_resample_op.py)
+
 GENERATE_PYTHON_OP_LIB("contrib_nearest_neighbor_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/nearest_neighbor/ops/gen_nearest_neighbor_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_resampler_ops"
@@ -889,6 +465,8 @@ set (pywrap_tensorflow_internal_src
     "${tensorflow_source_dir}/tensorflow/python/framework/cpp_shape_inference.cc"
     "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen.h"
     "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen.cc"
+    "${tensorflow_source_dir}/tensorflow/python/lib/core/bfloat16.h"
+    "${tensorflow_source_dir}/tensorflow/python/lib/core/bfloat16.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/numpy.h"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/numpy.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/ndarray_tensor.h"
@@ -899,6 +477,8 @@ set (pywrap_tensorflow_internal_src
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_func.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_seq_tensor.h"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_seq_tensor.cc"
+    "${tensorflow_source_dir}/tensorflow/python/lib/core/py_util.h"
+    "${tensorflow_source_dir}/tensorflow/python/lib/core/py_util.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/safe_ptr.h"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/safe_ptr.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/io/py_record_reader.h"
@@ -961,7 +541,11 @@ if(WIN32)
 	${nsync_STATIC_LIBRARIES}
     )
 
-    set(pywrap_tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow.def")
+    if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+        set(pywrap_tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow.def")
+    else()
+        set(pywrap_tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow.def")
+    endif()
     set_source_files_properties(${pywrap_tensorflow_deffile} PROPERTIES GENERATED TRUE)
 
     add_custom_command(TARGET pywrap_tensorflow_internal_static POST_BUILD
@@ -969,6 +553,7 @@ if(WIN32)
             --input "${pywrap_tensorflow_internal_static_dependencies}"
             --output "${pywrap_tensorflow_deffile}"
             --target _pywrap_tensorflow_internal.pyd
+        BYPRODUCTS ${pywrap_tensorflow_deffile} # Required for Ninja
     )
 endif(WIN32)
 
@@ -1015,6 +600,20 @@ target_link_libraries(pywrap_tensorflow_internal PRIVATE
 )
 
 if(WIN32)
+
+    # include contrib/periodic_resample as .so
+    #
+    set(tf_periodic_resample_srcs
+       "${tensorflow_source_dir}/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc"
+       "${tensorflow_source_dir}/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h"
+       "${tensorflow_source_dir}/tensorflow/contrib/periodic_resample/ops/array_ops.cc"
+    )
+
+    AddUserOps(TARGET _periodic_resample_op
+        SOURCES "${tf_periodic_resample_srcs}"
+        DEPENDS pywrap_tensorflow_internal tf_python_ops
+        DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/periodic_resample/python/ops/)
+
     # include contrib/nearest_neighbor as .so
     #
     set(tf_nearest_neighbor_srcs
@@ -1108,11 +707,19 @@ add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/testing/python/framework/)
 
 if(WIN32)
-  add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.dll
-                                     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
-    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.lib
-                                     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.dll
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.lib
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
+  else()
+    add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.dll
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
+  endif()
 else()
   add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
     COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so
diff --git a/tensorflow/contrib/cmake/tf_shared_lib.cmake b/tensorflow/contrib/cmake/tf_shared_lib.cmake
index 571d2b0decb5e9afcec2314f9837546f0974e90d..6d36d5fc5c2854b2d7d2542a3cb12e033e193b88 100644
--- a/tensorflow/contrib/cmake/tf_shared_lib.cmake
+++ b/tensorflow/contrib/cmake/tf_shared_lib.cmake
@@ -46,7 +46,11 @@ if(WIN32)
       $<TARGET_FILE:tf_protos_cc>
   )
 
-  set(tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/tensorflow.def")
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    set(tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/tensorflow.def")
+  else()
+    set(tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/tensorflow.def")
+  endif()
   set_source_files_properties(${tensorflow_deffile} PROPERTIES GENERATED TRUE)
 
   add_custom_command(TARGET tensorflow_static POST_BUILD
diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake
index 18b71d1f9a47b717200a01ffc368f8c8daaa1519..1c4ebd7f0c1113bcd0857fb0858df2248499f920 100644
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@@ -139,21 +139,27 @@ if (tensorflow_BUILD_PYTHON_TESTS)
 
   file(GLOB_RECURSE tf_test_src_py
     ${tf_test_rnn_src_py}
+    "${tensorflow_source_dir}/tensorflow/python/data/kernel_tests/*.py"
     "${tensorflow_source_dir}/tensorflow/python/debug/cli/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/debug/lib/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/debug/wrappers/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/estimator/python/estimator/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/kernel_tests/*.py"
     "${tensorflow_source_dir}/tensorflow/python/meta_graph_transform/*_test.py"
+    "${tensorflow_source_dir}/tensorflow/python/ops/quantized_conv_ops_test.py"
+    "${tensorflow_source_dir}/tensorflow/python/ops/quantized_ops_test.py"
     "${tensorflow_source_dir}/tensorflow/python/platform/build_info_test.py"
     "${tensorflow_source_dir}/tensorflow/python/profiler/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/profiler/internal/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/saved_model/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/training/*_test.py"
+    "${tensorflow_source_dir}/tensorflow/contrib/coder/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/data/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/factorization/*_test.py"
+    "${tensorflow_source_dir}/tensorflow/contrib/feature_column/python/feature_column/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/image/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/keras/_impl/keras/*_test.py"
+    "${tensorflow_source_dir}/tensorflow/contrib/periodic_resample/python/kernel_tests/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/nearest_neighbor/python/kernel_tests/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/python/kernel_tests/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/stateless/python/kernel_tests/*_test.py"
@@ -186,6 +192,7 @@ if (tensorflow_BUILD_PYTHON_TESTS)
     "${tensorflow_source_dir}/tensorflow/python/profiler/pprof_profiler_test.py"
     # flaky test
     "${tensorflow_source_dir}/tensorflow/python/profiler/internal/run_metadata_test.py"
+    "${tensorflow_source_dir}/tensorflow/python/profiler/model_analyzer_test.py"
     # Fails because uses data dependencies with bazel
     "${tensorflow_source_dir}/tensorflow/python/saved_model/saved_model_test.py"
     # requires scipy
@@ -216,15 +223,20 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       # TFDBG grpc:// mode is not yet available on Windows.
       "${tensorflow_source_dir}/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py"
       "${tensorflow_source_dir}/tensorflow/python/debug/lib/session_debug_grpc_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/debug/lib/source_remote_test.py"
       # stl on windows handles overflows different
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/as_string_op_test.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/string_to_number_op_test.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/clip_ops_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/kernel_tests/list_ops_test.py"  # Needs portpicker.
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/tensor_array_ops_test.py"  # Needs portpicker.
       # Numerical issues, calculations off.
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/concat_op_test.py"
       "${tensorflow_source_dir}/tensorflow/contrib/factorization/python/ops/wals_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py"
       "${tensorflow_source_dir}/tensorflow/python/keras/_impl/keras/utils/data_utils_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/keras/_impl/keras/backend_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/keras/_impl/keras/preprocessing/image_test.py"
       # Float division by zero
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/benchmark_test.py"
       # Flaky, for unknown reasons. Cannot reproduce in terminal. Revisit once we can get stack traces.
@@ -233,11 +245,11 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/python/training/sync_replicas_optimizer_test.py"
       "${tensorflow_source_dir}/tensorflow/python/debug/lib/session_debug_grpc_test.py"
       "${tensorflow_source_dir}tensorflow/python/training/localhost_cluster_performance_test.py"
-      "${tensorflow_source_dir}/tensorflow/python/kernel_tests/iterator_ops_cluster_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/functional_ops_test.py"
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/iterator_ops_cluster_test.py"
       # Type error in testRemoteIteratorUsingRemoteCallOpDirectSessionGPUCPU.
-      "${tensorflow_source_dir}/tensorflow/python/kernel_tests/iterator_ops_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/data/kernel_tests/iterator_ops_test.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py"
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py"
       # IteratorGetMax OutOfRangeError
@@ -261,11 +273,10 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/linalg_grad_test.py"  # cudaSolver handle creation fails.
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/array_ops_test.py"  # depends on python/framework/test_ops
       # Dataset tests
-      "${tensorflow_source_dir}/tensorflow/python/kernel_tests/dataset_constructor_op_test.py"  # Segfaults on windows
+      "${tensorflow_source_dir}/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py"  # Segfaults on windows
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py"  # Segfaults on Windows.
-      "${tensorflow_source_dir}/tensorflow/python/kernel_tests/iterator_ops_cluster_test.py"
-      # Broken tensorboard test due to cmake issues.
-      "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/iterator_ops_cluster_test.py"  # Needs portpicker
+      "${tensorflow_source_dir}/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py"  # Deadlocks
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/sloppy_transformation_dataset_op_test.py"  # b/65430561
       # tensor_forest tests (also note that we exclude the hybrid tests for now)
       "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/python/kernel_tests/count_extremely_random_stats_op_test.py"  # Results in wrong order.
@@ -294,6 +305,13 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       # Test should only be run manually
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/reduction_ops_test_big.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/svd_op_test.py"
+      # Depends on python/framework/test_ops
+      "${tensorflow_source_dir}/tensorflow/python/kernel_tests/array_ops_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/kernel_tests/control_flow_util_test.py"
+      # Flaky replicate_model_fn_test
+      "${tensorflow_source_dir}/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py"  # b/71901810
+      # Broken io_utils_test
+      "${tensorflow_source_dir}/tensorflow/python/keras/_impl/keras/utils/io_utils_test.py"  # b/72894325
   )
   endif()
   list(REMOVE_ITEM tf_test_src_py ${tf_test_src_py_exclude})
@@ -361,7 +379,6 @@ if (tensorflow_BUILD_CC_TESTS)
     "${tensorflow_source_dir}/tensorflow/core/distributed_runtime/tensor_coding_test.cc"
     "${tensorflow_source_dir}/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc"
     "${tensorflow_source_dir}/tensorflow/core/kernels/hexagon/graph_transferer_test.cc"
-    "${tensorflow_source_dir}/tensorflow/core/kernels/hexagon/quantized_matmul_op_for_hexagon_test.cc"
   )
 
   if (NOT tensorflow_ENABLE_GPU)
diff --git a/tensorflow/contrib/cmake/tf_tools.cmake b/tensorflow/contrib/cmake/tf_tools.cmake
index cb58a2e7df85b2f214654eff5547c5788592f208..58c7df95c821b4d1aa2cc63c8aaf4039518b83ca 100644
--- a/tensorflow/contrib/cmake/tf_tools.cmake
+++ b/tensorflow/contrib/cmake/tf_tools.cmake
@@ -48,9 +48,6 @@ file(GLOB_RECURSE tf_tools_transform_graph_lib_exclude_srcs
     "${tensorflow_source_dir}/tensorflow/tools/graph_transforms/compare_graphs.cc"
     "${tensorflow_source_dir}/tensorflow/tools/graph_transforms/summarize_graph_main.cc"
     "${tensorflow_source_dir}/tensorflow/tools/graph_transforms/transform_graph_main.cc"
-    "${tensorflow_source_dir}/tensorflow/tools/graph_transforms/quantize_nodes.cc"
-    "${tensorflow_source_dir}/tensorflow/tools/graph_transforms/quantize_weights.cc"
-    "${tensorflow_source_dir}/tensorflow/tools/graph_transforms/round_weights.cc"
 )
 list(REMOVE_ITEM tf_tools_transform_graph_lib_srcs ${tf_tools_transform_graph_lib_exclude_srcs})
 
diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py
index f67698eb99a38eae307b52e55de748a67b798cbd..53c2285699a6ca94e1e6b147080338b507f4d768 100644
--- a/tensorflow/contrib/cmake/tools/create_def_file.py
+++ b/tensorflow/contrib/cmake/tools/create_def_file.py
@@ -31,7 +31,7 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
-import io
+import codecs
 import os
 import re
 import subprocess
@@ -103,7 +103,7 @@ def main():
   for lib_path in args.input:
     proc = subprocess.Popen([DUMPBIN, "/nologo", "/linkermember:1", lib_path],
                             stdout=subprocess.PIPE)
-    for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"):
+    for line in codecs.getreader("utf-8")(proc.stdout):
       cols = line.split()
       if len(cols) < 2:
         continue
@@ -131,7 +131,7 @@ def main():
     # We compare on undname but use the decorated name from candidates.
     dupes = 0
     proc = subprocess.Popen([UNDNAME, tmpfile.name], stdout=subprocess.PIPE)
-    for idx, line in enumerate(io.TextIOWrapper(proc.stdout, encoding="utf-8")):
+    for idx, line in enumerate(codecs.getreader("utf-8")(proc.stdout)):
       decorated = candidates[idx]
       if decorated in taken:
         # Symbol is already in output, done.
diff --git a/tensorflow/contrib/coder/BUILD b/tensorflow/contrib/coder/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..ec3d550b70d2aaa23b989c44f3d86fa87cffb335
--- /dev/null
+++ b/tensorflow/contrib/coder/BUILD
@@ -0,0 +1,167 @@
+# Description:
+#   Contains entropy coding related modules.
+
+package(default_visibility = [
+    "//learning/brain:__subpackages__",
+    "//tensorflow:__subpackages__",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+    "tf_custom_op_library",
+    "tf_custom_op_py_library",
+    "tf_gen_op_libs",
+    "tf_gen_op_wrapper_py",
+    "tf_kernel_library",
+    "tf_py_test",
+)
+
+cc_library(
+    name = "range_coder",
+    srcs = [
+        "kernels/range_coder.cc",
+    ],
+    hdrs = [
+        "kernels/range_coder.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "range_coder_test",
+    size = "small",
+    srcs = ["kernels/range_coder_test.cc"],
+    deps = [
+        ":range_coder",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["coder_ops"],
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_kernel_library(
+    name = "range_coder_ops",
+    srcs = [
+        "kernels/range_coder_ops.cc",
+        "kernels/range_coder_ops_util.cc",
+    ],
+    hdrs = [
+        "kernels/range_coder_ops_util.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":coder_ops_op_lib",
+        ":range_coder",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+    alwayslink = 1,
+)
+
+tf_cc_test(
+    name = "range_coder_ops_test",
+    size = "small",
+    srcs = ["kernels/range_coder_ops_test.cc"],
+    deps = [
+        ":range_coder",
+        ":range_coder_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:ops_testutil",
+    ],
+)
+
+cc_library(
+    name = "all_ops",
+    deps = [":coder_ops_op_lib"],
+)
+
+cc_library(
+    name = "all_kernels",
+    deps = [":range_coder_ops"],
+)
+
+tf_custom_op_library(
+    name = "python/ops/_coder_ops.so",
+    srcs = [
+        "kernels/range_coder.cc",
+        "kernels/range_coder.h",
+        "kernels/range_coder_ops.cc",
+        "kernels/range_coder_ops_util.cc",
+        "kernels/range_coder_ops_util.h",
+        "ops/coder_ops.cc",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_coder_ops",
+    out = "python/ops/gen_coder_ops.py",
+    deps = [":coder_ops_op_lib"],
+)
+
+tf_custom_op_py_library(
+    name = "coder_ops_py",
+    srcs = [
+        "__init__.py",
+        "python/ops/coder_ops.py",
+    ],
+    dso = [
+        ":python/ops/_coder_ops.so",
+    ],
+    kernels = [
+        ":all_kernels",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":gen_coder_ops",
+        "//tensorflow/contrib/util:util_py",
+    ],
+)
+
+tf_py_test(
+    name = "coder_ops_py_test",
+    srcs = [
+        "python/ops/coder_ops_test.py",
+    ],
+    additional_deps = [
+        ":coder_ops_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+    ],
+    main = "python/ops/coder_ops_test.py",
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+)
diff --git a/tensorflow/contrib/coder/README.md b/tensorflow/contrib/coder/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c6c379c458893551b765327c0c1cbfff7f24f9c3
--- /dev/null
+++ b/tensorflow/contrib/coder/README.md
@@ -0,0 +1,73 @@
+# Entropy coder
+
+This module contains range encoder and range decoder which can encode integer
+data into string with cumulative distribution functions (CDF).
+
+## Data and CDF values
+
+The data to be encoded should be non-negative integers in half-open interval
+`[0, m)`. Then a CDF is represented as an integral vector of length `m + 1`
+where `CDF(i) = f(Pr(X < i) * 2^precision)` for i = 0,1,...,m, and `precision`
+is an attribute in range `0 < precision <= 16`. The function `f` maps real
+values into integers, e.g., round or floor. It is important that to encode a
+number `i`, `CDF(i + 1) - CDF(i)` cannot be zero.
+
+Note that we used `Pr(X < i)` not `Pr(X <= i)`, and therefore CDF(0) = 0 always.
+
+## RangeEncode: data shapes and CDF shapes
+
+For each data element, its CDF has to be provided. Therefore if the shape of CDF
+should be `data.shape + (m + 1,)` in NumPy-like notation. For example, if `data`
+is a 2-D tensor of shape (10, 10) and its elements are in `[0, 64)`, then the
+CDF tensor should have shape (10, 10, 65).
+
+This may make CDF tensor too large, and in many applications all data elements
+may have the same probability distribution. To handle this, `RangeEncode`
+supports limited broadcasting CDF into data. Broadcasting is limited in the
+following sense:
+
+- All CDF axes but the last one is broadcasted into data but not the other way
+  around,
+- The number of CDF axes does not extend, i.e., `CDF.ndim == data.ndim + 1`.
+
+In the previous example where data has shape (10, 10), the following are
+acceptable CDF shapes:
+
+- (10, 10, 65)
+- (1, 10, 65)
+- (10, 1, 65)
+- (1, 1, 65)
+
+## RangeDecode
+
+`RangeEncode` encodes neither data shape nor termination character. Therefore
+the decoder should know how many characters are encoded into the string, and
+`RangeDecode` takes the encoded data shape as the second argument. The same
+shape restrictions as `RangeEncode` inputs apply here.
+
+## Example
+
+```python
+data = tf.random_uniform((128, 128), 0, 10, dtype=tf.int32)
+
+histogram = tf.bincount(data, minlength=10, maxlength=10)
+cdf = tf.cumsum(histogram, exclusive=False)
+# CDF should have length m + 1.
+cdf = tf.pad(cdf, [[1, 0]])
+# CDF axis count must be one more than data.
+cdf = tf.reshape(cdf, [1, 1, -1])
+
+# Note that data has 2^14 elements, and therefore the sum of CDF is 2^14.
+data = tf.cast(data, tf.int16)
+encoded = coder.range_encode(data, cdf, precision=14)
+decoded = coder.range_decode(encoded, tf.shape(data), cdf, precision=14)
+
+# data and decoded should be the same.
+sess = tf.Session()
+x, y = sess.run((data, decoded))
+assert np.all(x == y)
+```
+
+## Authors
+Sung Jin Hwang (github: [ssjhv](https://github.com/ssjhv)) and Nick Johnston
+(github: [nmjohn](https://github.com/nmjohn))
diff --git a/tensorflow/contrib/coder/__init__.py b/tensorflow/contrib/coder/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7e663e6f1359f399cdaa80e037635a8f7546b37
--- /dev/null
+++ b/tensorflow/contrib/coder/__init__.py
@@ -0,0 +1,26 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Entropy code operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import
+from tensorflow.contrib.coder.python.ops.coder_ops import *
+# pylint: enable=wildcard-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/coder/kernels/range_coder.cc b/tensorflow/contrib/coder/kernels/range_coder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..21b35155ff317c6afbb1b86745f05385726505b6
--- /dev/null
+++ b/tensorflow/contrib/coder/kernels/range_coder.cc
@@ -0,0 +1,374 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Range coder implementation, based on [1].
+//
+// [1] G. N. N. Martin, "Range coding: an algorithm for removing redundancy from
+// a digitised message", presented to the Video & Data Recording Conference,
+// held in Southampton, July 24-27, 1979.
+//
+#include "tensorflow/contrib/coder/kernels/range_coder.h"
+
+#include <limits>
+#include <string>
+
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+RangeEncoder::RangeEncoder(int precision) : precision_(precision) {
+  CHECK_GT(precision, 0);
+  CHECK_LE(precision, 16);
+}
+
+void RangeEncoder::Encode(int32 lower, int32 upper, string* sink) {
+  // Input requirement: 0 <= lower < upper <= 2^precision.
+  DCHECK_LE(0, lower);
+  DCHECK_LT(lower, upper);
+  DCHECK_LE(upper, 1 << precision_);
+
+  // `base` and `size` represent a half-open interval [base, base + size).
+  // Loop invariant: 2^16 <= size <= 2^32.
+  //
+  // Note that keeping size above 2^16 is important. Since the interval sizes
+  // are quantized to up to 16 bits, the smallest interval size the encode may
+  // handle is 2^-16. If size is smaller than 2^16, a small interval input may
+  // collapse the encoder range into an empty interval.
+  const uint64 size = static_cast<uint64>(size_minus1_) + 1;
+  DCHECK_NE(size >> 16, 0);
+
+  // For short notation, let u := lower and v := upper.
+  //
+  // The input u, v represents a half-open interval [u, v) / 2^precision.
+  // This narrows the current interval roughly to
+  // [base + (size * u) / 2^precision, base + (size * v) / 2^precision).
+  //
+  // TODO(sjhwang): Try rounding if it helps improve compression ratio, at the
+  // expense of more operations. In the test using Zipf distribution, the
+  // overhead over the theoretical compression ratio was ~0.01%.
+  // NOTE: The max value of `size` is 2^32 and size > 0. Therefore `size * u`
+  // can be rewritten as `(size - 1) * u + u` and all the computation can be
+  // done in 32-bit mode. If 32-bit multiply is faster, then rewrite.
+  const uint32 a = (size * static_cast<uint64>(lower)) >> precision_;
+  const uint32 b = ((size * static_cast<uint64>(upper)) >> precision_) - 1;
+  DCHECK_LE(a, b);
+
+  // Let's confirm the RHS of a, b fit in uint32 type.
+  // Recall that 0 <= u < 2^precision, and size <= 2^32. Therefore
+  //   (size * u) / 2^precision < size <= 2^32,
+  // and the value of a fits in uint32 type. Similarly, since v <= 2^precision,
+  //   (size * v) / 2^precision - 1 <= size - 1 < 2^32.
+  // For lower bound of b, note that 1 <= v, 2^16 <= size, and 16 <= precision.
+  // Therefore (size * v) / 2^precision - 1 >= 2^16 / 2^precision - 1 >= 0.
+
+  // The new interval is [base + a, base + b] = [base + a, base + b + 1).
+  base_ += a;  // May overflow.
+  size_minus1_ = b - a;
+  const bool base_overflow = (base_ < a);
+
+  // The encoder has two states. Let's call them state 0 and state 1.
+  // State 0 is when base < base + size <= 2^32.
+  // State 1 is when base < 2^32 < base + size.
+  //
+  // The encoder initially starts in state 0, with base = 0, size = 2^32.
+  //
+  // TODO(sjhwang): Requires some profiling, but the encoder stays in state 0
+  // most of the time. Should optimize code for state 0.
+  //
+  // Each Encode() has up to two places where the interval changes:
+  //   #1. Refine the interval. [base, base + size) -> [base + a, base + b + 1).
+  //   #2. Expand interval if the new size is too small,
+  // and each change may cause a state transition.
+  //
+  // First, consider when the current state is 0.
+  //
+  // In this case, the next state after #1 is always state 0, since refining
+  // interval only shrinks the interval, therefore new_base + new_size <= 2^32.
+  //
+  // Let us explain #2.
+  //
+  // Recall that at the beginning of each Encode(), the encoder requires
+  // 2^16 < size <= 2^32. As precision <= 16, the new interval size can be as
+  // small as 1, but never zero.
+  //
+  // To keep size above 2^16, if new size is smaller than or equal to 2^16, the
+  // encoder would left-shift base and size by 16 bits: size' <- size * 2^16.
+  // Note that new size' is now in the range [2^16, 2^32].
+  //
+  // Since size is left-shifted, the same should be applied to base as well.
+  // However, after the left-shift, base will then contain 48 bits instead of 32
+  // bits. Therefore prior to the shift, The upper 16 bits in base should be
+  // stored somewhere else.
+  //
+  // If the upper 16 bits of all values in the interval were the same, i.e., if
+  // base[32:16] == (base + size - 1)[32:16], then base[32:16] can be written
+  // out to `output` string, since any further Encode() only narrows down the
+  // interval and that 16 bits would never change.
+  //
+  // If the upper 16 bits were not all the same, since this happens only when
+  // size <= 2^16, the upper 16 bits may differ only by one, i.e.,
+  // base[32:16] + 1 == (base + size - 1)[32:16]. At this stage, it is not
+  // determined yet whether base[32:16] should be written to the output  or
+  // (base[32:16] + 1) should be written to the output. In this case,
+  // (base[32:16] + 1) is temporarily stored in `delay`, and base is
+  // left-shifted by 16 bits.
+  //
+  // In the latter case, the condition implies that (base // 2^16) and
+  // ((base + size - 1) // 2^16) were different. Therefore after left-shift by
+  // 16 bits, the new (base + size) is greater than 2^32, i.e., the encoder
+  // transition to state 1.
+  //
+  // ==== Summary ====
+  // To detect the current encoder state,
+  //   state 0: delay == 0 iff (base mod 2^32) < (base + size) mod 2^32,
+  //   state 1: delay != 0 iff (base + size) mod 2^32 <= base mod 2^32,
+  // because size <= 2^32.
+  //
+  // ==== Summary for state 0 ====
+  // 1. Interval refinement does not cause state transition.
+  // 2. Interval expansion may cause state transition, depending on the upper 16
+  // bits of base and base + size - 1.
+  //
+  // Now suppose the previous state was 1. This means that
+  // base <= 2^32 < base + size.
+  //
+  // When in state 1, an interval refinement may trigger state transition.
+  // After Encode() refines the interval, there are three possibilities:
+  //   #1. base <= 2^32 < base + size (unchanged),
+  //   #2. 2^32 <= base < base + size (base overflowed),
+  //   #3. base < base + size <= 2^32 (base + size - 1 underflowed).
+  //
+  // In case #1, the encoder remains in state 1.
+  // In case #2 or #3, the encoder state changes to state 0.
+  //
+  // ==== State transition for interval refinement ====
+  // 1. state 0 -> state 0,
+  // 2. state 1 -> state 0 or state 1.
+  //
+  // Therefore if the new state is 1, then the previous state must have been
+  // state 1.
+  if (base_ + size_minus1_ < base_) {
+    // If statement checked if 2^32 < base + size. The new state is 1, hence the
+    // previous state was also state 1.
+    DCHECK_NE(((base_ - a) + size) >> 32, 0);
+    DCHECK_NE(delay_ & 0xFFFF, 0);
+
+    // Like in state 0, if the new size is <= 2^16, then base and size should
+    // be left-shifted by 16 bits. Combine the conditions
+    // base <= 2^32 < base + size and size <= 2^16 to conclude that
+    // base[32:16] >= 0xFFFF and (base + size - 1)[32:16] = 0x0000.
+    //
+    // Note that 2^32 - base < size, and since base is at least 0xFFFF0000,
+    // 2^16 - base[16:0] < size. Let base' and size' be the new base and size
+    // after the bit-shift. Then 2^32 - base' < size' => 2^32 < base' + size'.
+    // Therefore the encoder remains in state 1.
+    //
+    // Lastly, `delay` is modified. Conceptually, delay has to be changed to
+    //   delay' <- delay * 2^16 + (base + size - 1)[32:16].
+    // Since we know above that (base + size - 1)[32:16] = 0x0000, there is no
+    // need to explicitly do the computation above, but rather store how many
+    // trailing zeros there were. For this reason, the lower 16 bits of
+    // `delay` stores the delayed value when state changed from 0 to 1, and
+    // delay[32:16] stores the # of trailing zeros (in bytes).
+    //
+    // ==== State transition for interval expansion ====
+    // 1. state 0 -> state 0 or state 1,
+    // 2. state 1 -> state 1.
+    if (size_minus1_ >> 16 == 0) {
+      DCHECK_EQ(base_ >> 16, 0xFFFF);
+      base_ <<= 16;
+      size_minus1_ <<= 16;
+      size_minus1_ |= 0xFFFF;
+      // TODO(sjhwang): It is possible that for very long input, delay
+      // overflow during below. If overflow is detected, this delay is too
+      // long the encoder should forcefully move to state 0. In such case,
+      // base can be raised to 2^32 (force case #2), or (base + size) can be
+      // lowered to 2^32 (force case #3), depending on which transition
+      // keeps size larger.
+      CHECK_LT(delay_, static_cast<uint64>(1) << 62);
+      delay_ += 0x20000;  // Two more bytes of zeros. Check overflow?
+    }
+    return;
+  }
+
+  // If reached here, the current state is 0.
+  // First handle the case when the previous state was state 1.
+  if (delay_ != 0) {
+    // In case #2 or #3, the encoder state changes to state 0. Recall that when
+    // the encoder state changed from state 0 to state 1, the top 16 bits of
+    // (base + size - 1) was temporarily stored in `delay`, because the output
+    // could be either (delay - 1) or (delay).
+    //
+    // And from above, the delayed value encoded in `delay` is
+    //   delay' <- delay[16:0] * 2^(8 * delay[MAX:16])
+    //
+    // In case #2, the interval moved below 2^32. So (delay' - 1) is the
+    // converged value after interval refinements. Write out
+    // (delay[16:0] - 1) and write (8 * delay[MAX:16]) bytes of 0xFF.
+    //
+    // In case #3, the interval moved above 2^32. So delay' is the converged
+    // value after interval refinement. Write out delay[16:0] and write
+    // (8 * delay[MAX:16]) bytes of 0x00.
+    if (base_overflow) {
+      // Case #2.
+      DCHECK_NE((static_cast<uint64>(base_ - a) + a) >> 32, 0);
+      sink->push_back(static_cast<char>(delay_ >> 8));
+      sink->push_back(static_cast<char>(delay_ >> 0));
+      sink->append(delay_ >> 16, static_cast<char>(0));
+    } else {
+      // Case #3.
+      DCHECK_EQ(static_cast<uint64>(base_ + size_minus1_) >> 32, 0);
+      --delay_;
+      sink->push_back(static_cast<char>(delay_ >> 8));
+      sink->push_back(static_cast<char>(delay_ >> 0));
+      sink->append(delay_ >> 16, static_cast<char>(0xFF));
+    }
+    // Reset to state 0.
+    delay_ = 0;
+  }
+
+  if (size_minus1_ >> 16 == 0) {
+    const uint32 top = base_ >> 16;
+
+    base_ <<= 16;
+    size_minus1_ <<= 16;
+    size_minus1_ |= 0xFFFF;
+
+    if (base_ <= base_ + size_minus1_) {
+      // Still in state 0. Write the top 16 bits.
+      sink->push_back(static_cast<char>(top >> 8));
+      sink->push_back(static_cast<char>(top));
+    } else {
+      // New state is 1.
+      DCHECK_LT(top, 0xFFFF);
+      delay_ = top + 1;
+    }
+  }
+}
+
+void RangeEncoder::Finalize(string* sink) {
+  // Finalize the encode by writing out any number in the interval
+  // [base, base + size).
+  //
+  // Trailing zeros are not explicitly written out as decoder can fill in zeros
+  // by default.
+  if (delay_ != 0) {
+    // The last state was state 1. Since base < 2^32 < base + size, pick 2^32
+    // (state 1, case #3).
+    // NOTE: It is a bit difficult to trigger this code path on purpose.
+    // TODO(sjhwang): Find a way to trigger this code path for test coverage.
+    sink->push_back(static_cast<char>(delay_ >> 8));
+    if ((delay_ & 0xFF) != 0) {
+      sink->push_back(static_cast<char>(delay_));
+    }
+  } else if (base_ != 0) {
+    // If base == 0, then pick 0 from [base, base + size) and no zeros are
+    // explicitly written.
+    //
+    // Otherwise, pick (base + (2^16 - base[16:0])), i.e., round up base to the
+    // next multiple of 2^16. As 2^16 < size, this value should be in the
+    // interval [base, base + size).
+    const uint32 mid = ((base_ - 1) >> 16) + 1;
+    DCHECK_EQ(mid & 0xFFFF, mid);
+    sink->push_back(static_cast<char>(mid >> 8));
+    if ((mid & 0xFF) != 0) {
+      sink->push_back(static_cast<char>(mid >> 0));
+    }
+  }
+
+  base_ = 0;
+  size_minus1_ = std::numeric_limits<uint32>::max();
+  delay_ = 0;
+}
+
+RangeDecoder::RangeDecoder(const string& source, int precision)
+    : current_(source.begin()),
+      begin_(source.begin()),
+      end_(source.end()),
+      precision_(precision) {
+  CHECK_LE(precision, 16);
+
+  Read16BitValue();
+  Read16BitValue();
+}
+
+int32 RangeDecoder::Decode(tensorflow::gtl::ArraySlice<int32> cdf) {
+  const uint64 size = static_cast<uint64>(size_minus1_) + 1;
+  const uint64 offset =
+      ((static_cast<uint64>(value_ - base_) + 1) << precision_) - 1;
+
+  // This is similar to std::lower_range() with std::less_equal as comparison.
+  // After the binary search, `pv` points to the smallest number v that
+  // satisfies offset < (size * v) / 2^precision.
+
+  // Assumes that cdf[0] == 0. Therefore (size * cdf[0]) / 2^precision is always
+  // less than or equal to offset.
+  const int32* pv = cdf.data() + 1;
+  // `len` can be cdf.size() - 2 if there is guarantee that the last element of
+  // cdf is 2^precision.
+  auto len = cdf.size() - 1;
+  DCHECK_GT(len, 0);
+
+  do {
+    const auto half = len / 2;
+    const int32* mid = pv + half;
+    DCHECK_GE(*mid, 0);
+    DCHECK_LE(*mid, 1 << precision_);
+    if (size * static_cast<uint64>(*mid) <= offset) {
+      pv = mid + 1;
+      len -= half + 1;
+    } else {
+      len = half;
+    }
+  } while (len > 0);
+
+  // If (size * v) / 2^precision <= offset for all v in cdf, then pv points to
+  // one after the last element of cdf. That is a decoding error.
+  //
+  // TODO(sjhwang): Consider returning -1 to indicate error. Or start len =
+  // cdf.size() - 2 instead and give up detecting this error.
+  CHECK_LT(pv, cdf.data() + cdf.size());
+
+  const uint32 a = (size * static_cast<uint64>(*(pv - 1))) >> precision_;
+  const uint32 b = ((size * static_cast<uint64>(*pv)) >> precision_) - 1;
+  DCHECK_LE(a, offset >> precision_);
+  DCHECK_LE(offset >> precision_, b);
+
+  base_ += a;
+  size_minus1_ = b - a;
+
+  if (size_minus1_ >> 16 == 0) {
+    base_ <<= 16;
+    size_minus1_ <<= 16;
+    size_minus1_ |= 0xFFFF;
+
+    Read16BitValue();
+  }
+
+  return pv - cdf.data() - 1;
+}
+
+void RangeDecoder::Read16BitValue() {
+  value_ <<= 8;
+  if (current_ != end_) {
+    value_ |= static_cast<uint8>(*current_++);
+  }
+  value_ <<= 8;
+  if (current_ != end_) {
+    value_ |= static_cast<uint8>(*current_++);
+  }
+}
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/coder/kernels/range_coder.h b/tensorflow/contrib/coder/kernels/range_coder.h
new file mode 100644
index 0000000000000000000000000000000000000000..f46413072e34a55128d7854b9c312dfdde457d85
--- /dev/null
+++ b/tensorflow/contrib/coder/kernels/range_coder.h
@@ -0,0 +1,109 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_CODER_KERNELS_RANGE_CODER_H_
+#define TENSORFLOW_CONTRIB_CODER_KERNELS_RANGE_CODER_H_
+
+#include <limits>
+#include <string>
+
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+class RangeEncoder {
+ public:
+  // `precision` determines the granularity of probability masses passed to
+  // Encode() function below.
+  //
+  // REQUIRES: 0 < precision <= 16.
+  explicit RangeEncoder(int precision);
+
+  // Encodes a half-open interval [lower / 2^precision, upper / 2^precision).
+  // Suppose each character to be encoded is from an integer-valued
+  // distribution. When encoding a random character x0, the arguments lower and
+  // upper represent
+  //   Pr(X < x0) = lower / 2^precision,
+  //   Pr(X < x0 + 1) = upper / 2^precision,
+  // where X is a random variable following the distribution.
+  //
+  // For example, assume that the distribution has possible outputs 0, 1, 2, ...
+  // To encode value 0, lower = 0 and upper = Pr(X = 0).
+  // To encode value 1, lower = Pr(X = 0) and upper = Pr(X = 0 or 1).
+  // To encode value 2, lower = Pr(X = 0 or 1) and upper = Pr(X = 0, 1, or 2).
+  // ...
+  //
+  // REQUIRES: 0 <= lower < upper <= 2^precision.
+  void Encode(int32 lower, int32 upper, string* sink);
+
+  // The encode may contain some under-determined values from previous encoding.
+  // After Encode() calls, Finalize() must be called. Otherwise the encoded
+  // string may not be decoded.
+  void Finalize(string* sink);
+
+ private:
+  uint32 base_ = 0;
+  uint32 size_minus1_ = std::numeric_limits<uint32>::max();
+  uint64 delay_ = 0;
+
+  const int precision_;
+};
+
+class RangeDecoder {
+ public:
+  // Holds a reference to `source`. The caller has to make sure that `source`
+  // outlives the decoder object.
+  //
+  // REQUIRES: `precision` must be the same as the encoder's precision.
+  // REQUIRES: 0 < precision <= 16.
+  RangeDecoder(const string& source, int precision);
+
+  // Decodes a character from `source` using CDF. The size of `cdf` should be
+  // one more than the number of the character in the alphabet.
+  //
+  // If x0, x1, x2, ... are the possible characters (in increasing order) from
+  // the distribution, then
+  //   cdf[0] = 0
+  //   cdf[1] = Pr(X <= x0),
+  //   cdf[2] = Pr(X <= x1),
+  //   cdf[3] = Pr(X <= x2),
+  //   ...
+  //
+  // The returned value is an index to `cdf` where the decoded character
+  // corresponds to.
+  //
+  // REQUIRES: cdf.size() > 1.
+  // REQUIRES: cdf[i] <= cdf[i + 1] for i = 0, 1, ..., cdf.size() - 2.
+  // REQUIRES: cdf[cdf.size() - 1] <= 2^precision.
+  //
+  // In practice the last element of `cdf` should equal to 2^precision.
+  int32 Decode(gtl::ArraySlice<int32> cdf);
+
+ private:
+  void Read16BitValue();
+
+  uint32 base_ = 0;
+  uint32 size_minus1_ = std::numeric_limits<uint32>::max();
+  uint32 value_ = 0;
+
+  string::const_iterator current_;
+  const string::const_iterator begin_;
+  const string::const_iterator end_;
+
+  const int precision_;
+};
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_CODER_KERNELS_RANGE_CODER_H_
diff --git a/tensorflow/contrib/coder/kernels/range_coder_ops.cc b/tensorflow/contrib/coder/kernels/range_coder_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cde7982530fea6407aaf074f7af4a22263d50da3
--- /dev/null
+++ b/tensorflow/contrib/coder/kernels/range_coder_ops.cc
@@ -0,0 +1,307 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include "tensorflow/contrib/coder/kernels/range_coder.h"
+#include "tensorflow/contrib/coder/kernels/range_coder_ops_util.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace {
+// A helper class to iterate over data and cdf simultaneously, while cdf is
+// broadcasted to data.
+// NOTE: Moving this class out of anonymous namespace impacts compiler
+// optimization and affects performance. When moving this code around (e.g.,
+// into a library header), be sure to check the benchmark tests.
+template <typename T, typename U, int N>
+class BroadcastRange {
+ public:
+  BroadcastRange(T* data_pointer, gtl::ArraySlice<int64> data_shape,
+                 const U* cdf_pointer, gtl::ArraySlice<int64> cdf_shape)
+      : data_pointer_(data_pointer), cdf_pointer_(cdf_pointer) {
+    CHECK(!data_shape.empty());
+    CHECK_EQ(data_shape.size(), N);
+    CHECK_EQ(cdf_shape.size(), N + 1);
+
+    std::copy(data_shape.begin(), data_shape.end(), &data_shape_[0]);
+    data_index_.fill(0);
+
+    const int64 innermost_stride = cdf_shape[N];
+    cdf_displace_.fill(innermost_stride);
+
+    // Pre-compute the pointer displacement for cdf.
+    int64 stride = innermost_stride;
+    for (int i = N - 1; i >= 0; --i) {
+      const bool broadcasting = (cdf_shape[i] <= 1);
+
+      // When the data linear index advances by one, the cdf linear index
+      // advances by `innermost_stride`.
+      //
+      // Suppose that the i-th axis coordinate of data increased by one, and
+      // that i-th axis is broadcasting. The cdf linear index should be wound
+      // back by i-th axis stride, so that i-th axis coordinate of cdf is
+      // effectively kept at 0.
+      if (broadcasting) {
+        cdf_displace_[i] -= stride;
+      }
+      stride *= cdf_shape[i];
+    }
+  }
+
+  // Returns the pointers to the current iterating locations to data and cdf
+  // tensors.
+  //
+  // Note that this function does not track whether data pointer is running past
+  // the end of data buffer. The caller has to make sure Next() is called no
+  // more than that.
+  std::pair<T*, const U*> Next() {
+    std::pair<T*, const U*> return_value = {data_pointer_, cdf_pointer_};
+
+    int i = N - 1;
+    for (; i > 0; --i) {
+      ++data_index_[i];
+      if (data_index_[i] < data_shape_[i]) {
+        break;
+      }
+      data_index_[i] = 0;
+    }
+
+    // Advance data pointer by one.
+    data_pointer_ += 1;
+
+    // For cdf pointer, it's more complicated because of broadcasting. When i-th
+    // coordinate increase by one, and if i-th axis is broadcasting, then we
+    // need to rewind back the pointer so that the effective i-th axis
+    // coordinate for cdf is always 0. This value is precomputed as
+    // cdf_displace_.
+    cdf_pointer_ += cdf_displace_[i];
+    return return_value;
+  }
+
+ private:
+  std::array<int64, N> data_shape_;
+  std::array<int64, N> cdf_displace_;
+  std::array<int64, N> data_index_;
+
+  T* data_pointer_;
+  const U* cdf_pointer_;
+};
+
+Status CheckCdfShape(const TensorShape& data_shape,
+                     const TensorShape& cdf_shape) {
+  if (TF_PREDICT_FALSE(cdf_shape.dims() != data_shape.dims() + 1)) {
+    return errors::InvalidArgument(
+        "`cdf` should have one more axis than `data`: data shape=",
+        data_shape.DebugString(), ", cdf shape=", cdf_shape.DebugString());
+  }
+
+  if (TF_PREDICT_FALSE(cdf_shape.dim_size(cdf_shape.dims() - 1) <= 1)) {
+    return errors::InvalidArgument(
+        "The last dimension of `cdf` should be > 1: ", cdf_shape.DebugString());
+  }
+
+  return Status::OK();
+}
+
+// Non-incremental encoder op -------------------------------------------------
+class RangeEncodeOp : public OpKernel {
+ public:
+  explicit RangeEncodeOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("precision", &precision_));
+    OP_REQUIRES(context, 0 < precision_ && precision_ <= 16,
+                errors::InvalidArgument("`precision` must be in [1, 16]: ",
+                                        precision_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& data = context->input(0);
+    const Tensor& cdf = context->input(1);
+
+    OP_REQUIRES_OK(context, CheckCdfShape(data.shape(), cdf.shape()));
+
+    std::vector<int64> data_shape, cdf_shape;
+    OP_REQUIRES_OK(
+        context, MergeAxes(data.shape(), cdf.shape(), &data_shape, &cdf_shape));
+
+    Tensor* output_tensor;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, TensorShape{}, &output_tensor));
+    string* output = &output_tensor->scalar<string>()();
+
+    switch (data_shape.size()) {
+#define RANGE_ENCODE_CASE(dims)                                                \
+  case dims: {                                                                 \
+    RangeEncodeImpl<dims>(data.flat<int16>(), data_shape,                      \
+                          cdf.flat_inner_dims<int32, 2>(), cdf_shape, output); \
+  } break
+      RANGE_ENCODE_CASE(1);
+      RANGE_ENCODE_CASE(2);
+      RANGE_ENCODE_CASE(3);
+      RANGE_ENCODE_CASE(4);
+      RANGE_ENCODE_CASE(5);
+      RANGE_ENCODE_CASE(6);
+#undef RANGE_ENCODE_CASE
+      default:
+        context->CtxFailure(errors::InvalidArgument(
+            "Irregular broadcast pattern: ", data.shape().DebugString(), ", ",
+            cdf.shape().DebugString()));
+        return;
+    }
+  }
+
+ private:
+  template <int N>
+  void RangeEncodeImpl(TTypes<int16>::ConstFlat data,
+                       gtl::ArraySlice<int64> data_shape,
+                       TTypes<int32>::ConstMatrix cdf,
+                       gtl::ArraySlice<int64> cdf_shape, string* output) const {
+    const int64 data_size = data.size();
+    const int64 cdf_size = cdf.size();
+    const int64 chip_size = cdf.dimension(1);
+
+    BroadcastRange<const int16, int32, N> view{data.data(), data_shape,
+                                               cdf.data(), cdf_shape};
+    RangeEncoder encoder{precision_};
+    for (int64 linear = 0; linear < data_size; ++linear) {
+      const auto pair = view.Next();
+
+      const int64 index = *pair.first;
+      DCHECK_GE(index, 0);
+      DCHECK_LT(index + 1, chip_size);
+
+      const int32* cdf_slice = pair.second;
+      DCHECK_LE(cdf_slice + chip_size, cdf.data() + cdf_size);
+
+      const int32 lower = cdf_slice[index];
+      const int32 upper = cdf_slice[index + 1];
+      encoder.Encode(lower, upper, output);
+    }
+
+    encoder.Finalize(output);
+  }
+
+  int precision_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("RangeEncode").Device(DEVICE_CPU), RangeEncodeOp);
+
+// Non-incremental decoder op -------------------------------------------------
+class RangeDecodeOp : public OpKernel {
+ public:
+  explicit RangeDecodeOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("precision", &precision_));
+    OP_REQUIRES(context, 0 < precision_ && precision_ <= 16,
+                errors::InvalidArgument("`precision` must be in [1, 16]: ",
+                                        precision_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& encoded_tensor = context->input(0);
+    const Tensor& shape = context->input(1);
+    const Tensor& cdf = context->input(2);
+
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(encoded_tensor.shape()),
+                errors::InvalidArgument("Invalid `encoded` shape: ",
+                                        encoded_tensor.shape().DebugString()));
+    OP_REQUIRES(context, TensorShapeUtils::IsVector(shape.shape()),
+                errors::InvalidArgument("Invalid `shape` shape: ",
+                                        shape.shape().DebugString()));
+    TensorShape output_shape;
+    OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(shape.vec<int32>(),
+                                                        &output_shape));
+    OP_REQUIRES_OK(context, CheckCdfShape(output_shape, cdf.shape()));
+
+    std::vector<int64> data_shape, cdf_shape;
+    OP_REQUIRES_OK(
+        context, MergeAxes(output_shape, cdf.shape(), &data_shape, &cdf_shape));
+
+    const string& encoded = encoded_tensor.scalar<string>()();
+
+    Tensor* output;
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+
+    switch (data_shape.size()) {
+#define RANGE_DECODE_CASE(dim)                                              \
+  case dim: {                                                               \
+    RangeDecodeImpl<dim>(output->flat<int16>(), data_shape,                 \
+                         cdf.flat_inner_dims<int32>(), cdf_shape, encoded); \
+  } break
+      RANGE_DECODE_CASE(1);
+      RANGE_DECODE_CASE(2);
+      RANGE_DECODE_CASE(3);
+      RANGE_DECODE_CASE(4);
+      RANGE_DECODE_CASE(5);
+      RANGE_DECODE_CASE(6);
+#undef RANGE_DECODE_CASE
+      default:
+        context->CtxFailure(errors::InvalidArgument(
+            "Irregular broadcast pattern: ", output_shape.DebugString(), ", ",
+            cdf.shape().DebugString()));
+        return;
+    }
+  }
+
+ private:
+  template <int N>
+  void RangeDecodeImpl(TTypes<int16>::Flat output,
+                       gtl::ArraySlice<int64> output_shape,
+                       TTypes<int32>::ConstMatrix cdf,
+                       gtl::ArraySlice<int64> cdf_shape,
+                       const string& encoded) const {
+    BroadcastRange<int16, int32, N> view{output.data(), output_shape,
+                                         cdf.data(), cdf_shape};
+
+    RangeDecoder decoder{encoded, precision_};
+
+    const int64 output_size = output.size();
+    const int64 cdf_size = cdf.size();
+    const auto chip_size =
+        static_cast<gtl::ArraySlice<int32>::size_type>(cdf.dimension(1));
+
+    for (int64 i = 0; i < output_size; ++i) {
+      const auto pair = view.Next();
+
+      int16* data = pair.first;
+      DCHECK_LT(data, output.data() + output_size);
+
+      const int32* cdf_slice = pair.second;
+      DCHECK_LE(cdf_slice + chip_size, cdf.data() + cdf_size);
+
+      *data = decoder.Decode(gtl::ArraySlice<int32>{cdf_slice, chip_size});
+    }
+  }
+
+  int precision_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("RangeDecode").Device(DEVICE_CPU), RangeDecodeOp);
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/coder/kernels/range_coder_ops_test.cc b/tensorflow/contrib/coder/kernels/range_coder_ops_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ae4d9d2836a0f89a9765004a85bc3c292b0e484f
--- /dev/null
+++ b/tensorflow/contrib/coder/kernels/range_coder_ops_test.cc
@@ -0,0 +1,521 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/contrib/coder/kernels/range_coder.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/common_runtime/shape_refiner.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/core/bits.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+int LogUniform(random::SimplePhilox* gen, uint32 n) {
+  CHECK_GT(n, 0);
+
+  // Split [0, n) into {0}, [1, 2), [2, 4), [4, 8), ..., [2^(m-1), n).
+  const int m = Log2Ceiling(n);
+
+  int outcome;
+  do {
+    // Uniform() consumes at least 32 bits per call, therefore this is somewhat
+    // wasteful implementation. Since this is used only for test, we do not
+    // refine this implementation further.
+    const int k = gen->Uniform(m + 1) - 1;
+    // If k == -1, then sample from {0}.
+    // If k == 0, then sample from [1, 2).
+    // If k == 1, then sample from [2, 4), ... and so on.
+    if (k < 1) {
+      outcome = k + 1;
+    } else {
+      outcome = (1 << k) + gen->Uniform(1 << k);
+    }
+  } while (n <= outcome);
+  return outcome;
+}
+
+std::vector<int64> ComputeStrides(const TensorShape& shape) {
+  std::vector<int64> stride(shape.dims());
+  int64 current = 1;
+  for (int i = shape.dims() - 1; i >= 0; --i) {
+    stride[i] = current;
+    current *= shape.dim_size(i);
+  }
+  return stride;
+}
+
+class RangeCoderOpsTest : public OpsTestBase {
+ protected:
+  Status RunEncodeOp(int precision, gtl::ArraySlice<Tensor> input,
+                     Tensor* output) {
+    TF_RETURN_IF_ERROR(NodeDefBuilder("encode", "RangeEncode")
+                           .Input(tensorflow::FakeInput(DT_INT16))
+                           .Input(tensorflow::FakeInput(DT_INT32))
+                           .Attr("precision", precision)
+                           .Finalize(node_def()));
+    TF_RETURN_IF_ERROR(InitOp());
+
+    inputs_.clear();
+    std::vector<Tensor> copies(input.size());
+    for (int i = 0; i < input.size(); ++i) {
+      copies[i] = input[i];
+      inputs_.emplace_back(&copies[i]);
+    }
+
+    TF_RETURN_IF_ERROR(RunOpKernel());
+
+    *output = *GetOutput(0);
+    inputs_.clear();
+
+    return Status::OK();
+  }
+
+  Status RunDecodeOp(int precision, gtl::ArraySlice<Tensor> input,
+                     Tensor* output) {
+    TF_RETURN_IF_ERROR(NodeDefBuilder("decode", "RangeDecode")
+                           .Input(tensorflow::FakeInput(DT_STRING))
+                           .Input(tensorflow::FakeInput(DT_INT32))
+                           .Input(tensorflow::FakeInput(DT_INT32))
+                           .Attr("precision", precision)
+                           .Finalize(node_def()));
+    TF_RETURN_IF_ERROR(InitOp());
+
+    inputs_.clear();
+    std::vector<Tensor> copies(input.size());
+    for (int i = 0; i < input.size(); ++i) {
+      copies[i] = input[i];
+      inputs_.emplace_back(&copies[i]);
+    }
+
+    TF_RETURN_IF_ERROR(RunOpKernel());
+
+    *output = *GetOutput(0);
+    inputs_.clear();
+
+    return Status::OK();
+  }
+
+  void TestEncodeAndDecode(int precision, const Tensor& data,
+                           const Tensor& cdf) {
+    Tensor encoded;
+    TF_ASSERT_OK(RunEncodeOp(precision, {data, cdf}, &encoded));
+
+    const TensorShape& data_shape = data.shape();
+    Tensor shape{DT_INT32, {data_shape.dims()}};
+    for (int i = 0; i < data_shape.dims(); ++i) {
+      shape.flat<int32>()(i) = data_shape.dim_size(i);
+    }
+
+    Tensor decoded;
+    TF_ASSERT_OK(RunDecodeOp(precision, {encoded, shape, cdf}, &decoded));
+
+    EXPECT_EQ(decoded.dtype(), data.dtype());
+    EXPECT_EQ(decoded.shape(), data.shape());
+    EXPECT_EQ(decoded.tensor_data(), data.tensor_data());
+  }
+
+  void PopulateMaxValues(random::SimplePhilox* gen, Tensor* maxvalue_tensor,
+                         int min_maxvalue, int max_maxvalue) {
+    const int range = max_maxvalue - min_maxvalue;
+    TTypes<int16>::Flat flat = maxvalue_tensor->flat<int16>();
+
+    for (int64 i = 0; i < flat.size(); ++i) {
+      flat(i) = min_maxvalue + gen->Uniform(range);
+    }
+  }
+
+  void BuildCdf(random::SimplePhilox* gen, Tensor* data_tensor,
+                Tensor* cdf_tensor, const Tensor& maxvalue_tensor) {
+    CHECK(TensorShapeUtils::StartsWith(cdf_tensor->shape(),
+                                       maxvalue_tensor.shape()));
+    CHECK_EQ(cdf_tensor->dims(), maxvalue_tensor.dims() + 1);
+    const int64 chip_size = cdf_tensor->dim_size(cdf_tensor->dims() - 1);
+
+    std::vector<int64> data_stride = ComputeStrides(data_tensor->shape());
+    std::vector<int64> cdf_stride = ComputeStrides(cdf_tensor->shape());
+
+    for (int i = 0; i < cdf_tensor->dims(); ++i) {
+      if (cdf_tensor->dim_size(i) == 1) {
+        cdf_stride[i] = 0;
+      }
+    }
+
+    Tensor histogram_tensor{DT_INT32, cdf_tensor->shape()};
+    TTypes<int16>::Flat data = data_tensor->flat<int16>();
+    TTypes<int32>::Flat histogram = histogram_tensor.flat<int32>();
+    TTypes<int16>::ConstFlat maxvalue = maxvalue_tensor.flat<int16>();
+    histogram.setZero();
+
+    for (int64 index = 0; index < data.size(); ++index) {
+      int64 temp = index;
+      int64 offset = 0;
+      for (int dim = 0; dim < data_stride.size(); ++dim) {
+        const int64 coord = temp / data_stride[dim];
+        offset += coord * cdf_stride[dim];
+        temp -= coord * data_stride[dim];
+      }
+      ASSERT_EQ(temp, 0);
+
+      const int64 maxvalue_offset = offset / chip_size;
+      CHECK_EQ(maxvalue_offset * chip_size, offset);
+      CHECK_LT(maxvalue(maxvalue_offset) + 1, chip_size);
+      const int value = LogUniform(gen, maxvalue(maxvalue_offset));
+      data(index) = value;
+      histogram(offset + value + 1) += 1;
+    }
+
+    cdf_tensor->flat_inner_dims<int32, 2>() =
+        histogram_tensor.flat_inner_dims<int32, 2>().cumsum(1);
+  }
+};
+
+TEST_F(RangeCoderOpsTest, NoBroadcast) {
+  constexpr int kPrecision = 14;
+  constexpr int kMaxValue = 10;
+
+  Tensor data{DT_INT16, {1, 32, 32, 16}};
+  Tensor temp{DT_INT32, {1, 1, 1, 1, kMaxValue + 2}};
+  Tensor maxvalue{DT_INT16, {1, 1, 1, 1}};
+  maxvalue.flat<int16>()(0) = kMaxValue;
+
+  ASSERT_LE(data.shape().num_elements(), 1 << kPrecision);
+
+  random::PhiloxRandom philox(random::New64(), random::New64());
+  random::SimplePhilox gen(&philox);
+  BuildCdf(&gen, &data, &temp, maxvalue);
+
+  const Eigen::array<int32, 5> broadcast = {1, 32, 32, 16, 1};
+
+  Tensor cdf{DT_INT32, {1, 32, 32, 16, kMaxValue + 2}};
+  cdf.tensor<int32, 5>() = temp.tensor<int32, 5>().broadcast(broadcast);
+
+  TestEncodeAndDecode(kPrecision, data, cdf);
+}
+
+TEST_F(RangeCoderOpsTest, Broadcast1Axis) {
+  constexpr int kPrecision = 9;
+  constexpr int kDimensionSize = 1 << kPrecision;
+  constexpr int kMinMaxValue = 10;
+  constexpr int kMaxMaxValue = 64;
+
+  random::PhiloxRandom philox(random::New64(), random::New64());
+  random::SimplePhilox gen(&philox);
+  Tensor data{DT_INT16, {1, kDimensionSize, kDimensionSize}};
+
+  Tensor maxvalue{DT_INT16, {kDimensionSize}};
+  PopulateMaxValues(&gen, &maxvalue, kMinMaxValue, kMaxMaxValue);
+
+  {
+    // Axis 1.
+    Tensor maxvalue1;
+    ASSERT_TRUE(maxvalue1.CopyFrom(maxvalue, {1, 1, kDimensionSize}));
+
+    Tensor cdf{DT_INT32, {1, 1, kDimensionSize, kMaxMaxValue + 2}};
+    BuildCdf(&gen, &data, &cdf, maxvalue1);
+    TestEncodeAndDecode(kPrecision, data, cdf);
+  }
+
+  {
+    // Axis 2.
+    Tensor maxvalue2;
+    ASSERT_TRUE(maxvalue2.CopyFrom(maxvalue, {1, kDimensionSize, 1}));
+
+    Tensor cdf{DT_INT32, {1, kDimensionSize, 1, kMaxMaxValue + 2}};
+    BuildCdf(&gen, &data, &cdf, maxvalue2);
+    TestEncodeAndDecode(kPrecision, data, cdf);
+  }
+}
+
+TEST_F(RangeCoderOpsTest, Broadcast2Axes) {
+  constexpr int kPrecision = 13;
+  constexpr int kDimensionSize1 = 1 << (kPrecision / 2);
+  constexpr int kDimensionSize2 = 1 << (kPrecision - kPrecision / 2);
+  constexpr int kMinMaxValue = 10;
+  constexpr int kMaxMaxValue = 64;
+
+  random::PhiloxRandom philox(random::New64(), random::New64());
+  random::SimplePhilox gen(&philox);
+  Tensor maxvalue{DT_INT16, {2, 1, 1, 7}};
+  PopulateMaxValues(&gen, &maxvalue, kMinMaxValue, kMaxMaxValue);
+
+  Tensor data{DT_INT16, {2, kDimensionSize1, kDimensionSize2, 7}};
+  Tensor cdf{DT_INT32, {2, 1, 1, 7, kMaxMaxValue + 2}};
+  BuildCdf(&gen, &data, &cdf, maxvalue);
+  TestEncodeAndDecode(kPrecision, data, cdf);
+}
+
+TEST_F(RangeCoderOpsTest, InvalidCdfShape) {
+  Tensor data{DT_INT16, {3, 3}};
+  Tensor cdf{DT_INT32, {3, 3}};
+
+  Tensor unused;
+  {
+    const Status status = RunEncodeOp(10, {data, cdf}, &unused);
+    EXPECT_FALSE(status.ok());
+    EXPECT_NE(status.error_message().find("`cdf` should have one more axis"),
+              string::npos);
+  }
+
+  Tensor empty{DT_STRING, {}};
+  Tensor shape{DT_INT32, {2}};
+  shape.vec<int32>().setValues({3, 3});
+  {
+    const Status status = RunDecodeOp(10, {empty, shape, cdf}, &unused);
+    EXPECT_FALSE(status.ok());
+    EXPECT_NE(status.error_message().find("`cdf` should have one more axis"),
+              string::npos);
+  }
+
+  cdf = Tensor{DT_INT32, {3, 3, 1}};
+  {
+    const Status status = RunEncodeOp(10, {data, cdf}, &unused);
+    EXPECT_FALSE(status.ok());
+    EXPECT_NE(
+        status.error_message().find("last dimension of `cdf` should be > 1"),
+        string::npos);
+  }
+  {
+    const Status status = RunDecodeOp(10, {empty, shape, cdf}, &unused);
+    EXPECT_FALSE(status.ok());
+    EXPECT_NE(
+        status.error_message().find("last dimension of `cdf` should be > 1"),
+        string::npos);
+  }
+}
+
+TEST_F(RangeCoderOpsTest, DecoderShapeFn) {
+  Tensor encoded_tensor{DT_STRING, {}};
+  Tensor shape_tensor{DT_INT32, {3}};
+  Tensor cdf_tensor{DT_INT32, {4, 6, 8, 2}};
+
+  shape_tensor.flat<int32>().setValues({4, 6, 8});
+
+  Graph g{OpRegistry::Global()};
+  Node* encoded = test::graph::Constant(&g, encoded_tensor);
+  Node* shape = test::graph::Constant(&g, shape_tensor);
+  Node* cdf = test::graph::Constant(&g, cdf_tensor);
+  Node* decode;
+  TF_ASSERT_OK(NodeBuilder("range_decode", "RangeDecode", g.op_registry())
+                   .Input(encoded)
+                   .Input(shape)
+                   .Input(cdf)
+                   .Attr("precision", 10)
+                   .Finalize(&g, &decode));
+
+  ShapeRefiner refiner{g.versions().producer(), g.op_registry()};
+  TF_ASSERT_OK(refiner.AddNode(encoded));
+  TF_ASSERT_OK(refiner.AddNode(shape));
+  TF_ASSERT_OK(refiner.AddNode(cdf));
+  TF_ASSERT_OK(refiner.AddNode(decode));
+
+  auto* context = refiner.GetContext(decode);
+  ASSERT_NE(context, nullptr);
+
+  ASSERT_EQ(context->num_outputs(), 1);
+  auto shape_handle = context->output(0);
+
+  ASSERT_EQ(context->Rank(shape_handle), 3);
+  EXPECT_EQ(context->Value(context->Dim(shape_handle, 0)), 4);
+  EXPECT_EQ(context->Value(context->Dim(shape_handle, 1)), 6);
+  EXPECT_EQ(context->Value(context->Dim(shape_handle, 2)), 8);
+}
+
+TEST_F(RangeCoderOpsTest, InvalidBroadcast) {
+  Tensor data{DT_INT16, {3, 3}};
+  Tensor cdf{DT_INT32, {3, 2, 2}};
+
+  Tensor unused;
+  {
+    const Status status = RunEncodeOp(10, {data, cdf}, &unused);
+    EXPECT_FALSE(status.ok());
+    EXPECT_NE(status.error_message().find("Cannot broadcast shape"),
+              string::npos);
+  }
+
+  data = Tensor{DT_INT16, {3, 1}};
+  cdf = Tensor{DT_INT32, {3, 3, 2}};
+  Tensor empty{DT_STRING, {}};
+  Tensor shape{DT_INT32, {2}};
+  shape.vec<int32>().setValues({3, 1});
+  {
+    const Status status = RunDecodeOp(10, {empty, shape, cdf}, &unused);
+    EXPECT_FALSE(status.ok());
+    EXPECT_NE(status.error_message().find("Cannot broadcast shape"),
+              string::npos);
+  }
+
+  std::vector<int64> shape_vector = {2, 2, 2, 2, 2, 2, 2, 2, 2};
+  data = Tensor{DT_INT16, TensorShape{shape_vector}};
+  cdf = Tensor{DT_INT32, {2, 1, 2, 1, 2, 1, 2, 1, 2, 2}};
+  {
+    const Status status = RunEncodeOp(10, {data, cdf}, &unused);
+    EXPECT_FALSE(status.ok());
+    EXPECT_NE(status.error_message().find("Irregular broadcast"), string::npos);
+  }
+
+  shape = Tensor{DT_INT32, {static_cast<int64>(shape_vector.size())}};
+  for (int i = 0; i < shape_vector.size(); ++i) {
+    shape.flat<int32>()(i) = shape_vector[i];
+  }
+  {
+    const Status status = RunDecodeOp(10, {empty, shape, cdf}, &unused);
+    EXPECT_FALSE(status.ok());
+    EXPECT_NE(status.error_message().find("Irregular broadcast"), string::npos);
+  }
+}
+
+// Benchmark -------------------------------------------------------------
+
+// This function creates RangeEncode graph with CDF built from a separate data
+// sample.
+Graph* CreateRangeEncodeFullBroadcastGraph(const TensorShape& shape,
+                                           int precision) {
+  CHECK_EQ(shape.dims(), 4);
+
+  constexpr int kAlphabetSize = 70;
+
+  Tensor histogram{DT_INT32, {kAlphabetSize + 1}};
+  TTypes<int32>::Vec h = histogram.vec<int32>();
+  h.setConstant(1);
+  h(0) = 0;
+
+  random::PhiloxRandom philox(random::New64(), random::New64());
+  random::SimplePhilox gen(&philox);
+  for (int i = 0; i < (1 << precision) - kAlphabetSize; ++i) {
+    const int value = LogUniform(&gen, kAlphabetSize - 1);
+    h(value + 1) += 1;
+  }
+
+  Tensor cdf{DT_INT32, {1, 1, 1, 1, kAlphabetSize + 1}};
+  cdf.flat<int32>() = h.cumsum(0);
+
+  Tensor data{DT_INT16, shape};
+  TTypes<int16>::Flat d = data.flat<int16>();
+  for (int64 i = 0; i < d.size(); ++i) {
+    d(i) = LogUniform(&gen, kAlphabetSize - 1);
+  }
+
+  Graph* g = new Graph(OpRegistry::Global());
+  TF_CHECK_OK(NodeBuilder("range_encode", "RangeEncode", g->op_registry())
+                  .Input(test::graph::Constant(g, data))
+                  .Input(test::graph::Constant(g, cdf))
+                  .Attr("precision", precision)
+                  .Finalize(g, nullptr));
+  return g;
+}
+
+// This function creates RangeDecode graph with CDF built from a separate data
+// sample.
+Graph* CreateRangeDecodeFullBroadcastGraph(const TensorShape& shape,
+                                           int precision) {
+  CHECK_EQ(shape.dims(), 4);
+
+  constexpr int kAlphabetSize = 200;
+  const int64 num_elements = shape.num_elements();
+
+  Tensor histogram{DT_INT32, {kAlphabetSize + 1}};
+  TTypes<int32>::Vec h = histogram.vec<int32>();
+  h.setConstant(1);
+  h(0) = 0;
+
+  random::PhiloxRandom philox(random::New64(), random::New64());
+  random::SimplePhilox gen(&philox);
+  for (int i = 0; i < (1 << precision) - kAlphabetSize; ++i) {
+    const int value = LogUniform(&gen, kAlphabetSize - 1);
+    h(value + 1) += 1;
+  }
+
+  Tensor cdf_tensor{DT_INT32, {1, 1, 1, 1, kAlphabetSize + 1}};
+  TTypes<int32>::Flat cdf = cdf_tensor.flat<int32>();
+  cdf = h.cumsum(0);
+
+  Tensor string_tensor{DT_STRING, TensorShape{}};
+  string& sink = string_tensor.scalar<string>()();
+
+  RangeEncoder encoder{precision};
+  for (int64 i = 0; i < num_elements; ++i) {
+    const int value = LogUniform(&gen, kAlphabetSize - 1);
+    encoder.Encode(cdf(value), cdf(value + 1), &sink);
+  }
+  encoder.Finalize(&sink);
+
+  Tensor shape_tensor{DT_INT32, {shape.dims()}};
+  for (int i = 0; i < shape.dims(); ++i) {
+    shape_tensor.flat<int32>()(i) = shape.dim_size(i);
+  }
+
+  Graph* g = new Graph(OpRegistry::Global());
+  TF_CHECK_OK(NodeBuilder("range_decode", "RangeDecode", g->op_registry())
+                  .Input(test::graph::Constant(g, string_tensor))
+                  .Input(test::graph::Constant(g, shape_tensor))
+                  .Input(test::graph::Constant(g, cdf_tensor))
+                  .Attr("precision", precision)
+                  .Finalize(g, nullptr));
+  return g;
+}
+
+void RunTensorFlowBenchmark(int iters, Graph* g, int64 num_elements) {
+  SessionOptions opts;
+  opts.config.set_intra_op_parallelism_threads(1);
+  opts.config.set_inter_op_parallelism_threads(1);
+
+  testing::UseRealTime();
+  test::Benchmark("cpu", g, &opts).Run(iters);
+
+  const int64 num_items = static_cast<int64>(iters) * num_elements;
+  testing::ItemsProcessed(num_items);
+}
+
+void BM_RangeEncodeFullBroadcast(int iters, int code_size) {
+  constexpr int kPrecision = 14;
+  const TensorShape shape = {1, code_size, code_size, 256};
+  Graph* g = CreateRangeEncodeFullBroadcastGraph(shape, kPrecision);
+  RunTensorFlowBenchmark(iters, g, shape.num_elements());
+}
+
+BENCHMARK(BM_RangeEncodeFullBroadcast)->Arg(32)->Arg(64);
+
+void BM_RangeDecodeFullBroadcast(int iters, int code_size) {
+  constexpr int kPrecision = 14;
+  const TensorShape shape = {1, code_size, code_size, 256};
+  Graph* g = CreateRangeDecodeFullBroadcastGraph(shape, kPrecision);
+  RunTensorFlowBenchmark(iters, g, shape.num_elements());
+}
+
+BENCHMARK(BM_RangeDecodeFullBroadcast)->Arg(32)->Arg(64);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/coder/kernels/range_coder_ops_util.cc b/tensorflow/contrib/coder/kernels/range_coder_ops_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d66730cb4881ea92b5477047c500291fa9c0c290
--- /dev/null
+++ b/tensorflow/contrib/coder/kernels/range_coder_ops_util.cc
@@ -0,0 +1,85 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/coder/kernels/range_coder_ops_util.h"
+
+#include <vector>
+
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
+
+using tensorflow::errors::InvalidArgument;
+
+namespace tensorflow {
+Status MergeAxes(const TensorShape& broadcast_shape,
+                 const TensorShape& storage_shape,
+                 std::vector<int64>* merged_broadcast_shape_pointer,
+                 std::vector<int64>* merged_storage_shape_pointer) {
+  CHECK_EQ(storage_shape.dims(), broadcast_shape.dims() + 1);
+
+  std::vector<int64>& merged_broadcast_shape = *merged_broadcast_shape_pointer;
+  std::vector<int64>& merged_storage_shape = *merged_storage_shape_pointer;
+
+  // The shapes are simplified so that the conversions between linear index
+  // and coordinates takes less CPU cycles. Two adjacent dimensions are
+  // merged if they both are broadcasting dimensions or if they both are
+  // non-broadcasting dimensions.
+  merged_broadcast_shape.resize(1);
+  merged_broadcast_shape[0] = 1;
+  merged_storage_shape.resize(1);
+  merged_storage_shape[0] = 1;
+
+  for (int i = 0, j = 0; j < broadcast_shape.dims(); ++j) {
+    if (TF_PREDICT_FALSE(
+            (broadcast_shape.dim_size(j) != storage_shape.dim_size(j)) &&
+            (storage_shape.dim_size(j) != 1))) {
+      return InvalidArgument("Cannot broadcast shape ",
+                             storage_shape.DebugString(), " to ",
+                             broadcast_shape.DebugString());
+    }
+
+    const bool was_broadcasting = (merged_storage_shape[i] == 1);
+    const bool is_broadcasting = (storage_shape.dim_size(j) == 1);
+
+    // Merge two adjacent axes if they both are broadcasting or both are
+    // non-broadcasting axes. The second and the third conditions in the if
+    // clause below are when the previously merged axis or the next j-th axis
+    // may be interpreted as either a broadcasting or a non-broadcasting axis.
+    const bool merge = (was_broadcasting == is_broadcasting) ||
+                       (broadcast_shape.dim_size(j) <= 1) ||
+                       (merged_broadcast_shape[i] <= 1);
+
+    if (merge) {
+      merged_broadcast_shape[i] *= broadcast_shape.dim_size(j);
+      merged_storage_shape[i] *= storage_shape.dim_size(j);
+    } else {
+      // Move to the next axis.
+      merged_broadcast_shape.push_back(broadcast_shape.dim_size(j));
+      merged_storage_shape.push_back(storage_shape.dim_size(j));
+      ++i;
+    }
+  }
+
+  int64 storage_stride = 1;
+  for (int i = broadcast_shape.dims(); i < storage_shape.dims(); ++i) {
+    storage_stride *= storage_shape.dim_size(i);
+  }
+  merged_storage_shape.push_back(storage_stride);
+
+  return Status::OK();
+}
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/coder/kernels/range_coder_ops_util.h b/tensorflow/contrib/coder/kernels/range_coder_ops_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..b8aabcef62e9de53810397960f871abc4adc0cf9
--- /dev/null
+++ b/tensorflow/contrib/coder/kernels/range_coder_ops_util.h
@@ -0,0 +1,33 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_CODER_KERNELS_RANGE_CODER_OPS_UTIL_H_
+#define TENSORFLOW_CONTRIB_CODER_KERNELS_RANGE_CODER_OPS_UTIL_H_
+
+#include <vector>
+
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+// The shapes are simplified to reduce indexing cost.
+Status MergeAxes(const TensorShape& broadcast_shape,
+                 const TensorShape& storage_shape,
+                 std::vector<int64>* merged_broadcast_shape_pointer,
+                 std::vector<int64>* merged_storage_shape_pointer);
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_CODER_KERNELS_RANGE_CODER_OPS_UTIL_H_
diff --git a/tensorflow/contrib/coder/kernels/range_coder_test.cc b/tensorflow/contrib/coder/kernels/range_coder_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..442994bf7c7566c1cbe1c439050a69e5b9a4208e
--- /dev/null
+++ b/tensorflow/contrib/coder/kernels/range_coder_test.cc
@@ -0,0 +1,116 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/coder/kernels/range_coder.h"
+
+#include <cmath>
+
+#include "tensorflow/core/lib/random/distribution_sampler.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+void RangeEncodeDecodeTest(int precision, random::SimplePhilox* gen) {
+  constexpr int kAlphabetSize = 256;
+
+  std::vector<float> distribution_weight;
+  distribution_weight.reserve(kAlphabetSize);
+  for (int i = 1; i <= kAlphabetSize; ++i) {
+    distribution_weight.push_back(std::pow(static_cast<float>(i), -2.0f));
+  }
+
+  random::DistributionSampler sampler(distribution_weight);
+
+  const int multiplier = (precision > 7) ? 32 : 1;
+  std::vector<int32> histogram(kAlphabetSize, multiplier - 1);
+
+  const int data_size =
+      (multiplier << precision) - histogram.size() * (multiplier - 1);
+  CHECK_GE(data_size, 0);
+  std::vector<uint8> data(data_size);
+  for (uint8& x : data) {
+    x = sampler.Sample(gen);
+    ++histogram[x];
+  }
+
+  std::vector<int32> cdf(histogram.size() + 1, 0);
+  int partial_sum = 0;
+  for (int i = 0; i < histogram.size(); ++i) {
+    partial_sum += histogram[i];
+    cdf[i + 1] = partial_sum / multiplier;
+  }
+
+  ASSERT_EQ(cdf.front(), 0);
+  ASSERT_EQ(cdf.back(), 1 << precision);
+
+  std::vector<double> ideal_code_length(histogram.size());
+  const double normalizer = static_cast<double>(1 << precision);
+  for (int i = 0; i < ideal_code_length.size(); ++i) {
+    ideal_code_length[i] = -std::log2((cdf[i + 1] - cdf[i]) / normalizer);
+  }
+
+  RangeEncoder encoder(precision);
+  string encoded;
+  double ideal_length = 0.0;
+  for (uint8 x : data) {
+    encoder.Encode(cdf[x], cdf[x + 1], &encoded);
+    ideal_length += ideal_code_length[x];
+  }
+  encoder.Finalize(&encoded);
+
+  LOG(INFO) << "Encoded string length (bits): " << 8 * encoded.size()
+            << ", whereas ideal " << ideal_length << " ("
+            << (8 * encoded.size()) / ideal_length << " of ideal) "
+            << " (ideal compression rate " << ideal_length / (8 * data.size())
+            << ")";
+
+  RangeDecoder decoder(encoded, precision);
+  for (int i = 0; i < data.size(); ++i) {
+    const int32 decoded = decoder.Decode(cdf);
+    ASSERT_EQ(decoded, static_cast<int32>(data[i])) << i;
+  }
+}
+
+TEST(RangeCoderTest, Precision1To11) {
+  random::PhiloxRandom gen(random::New64(), random::New64());
+  random::SimplePhilox rand(&gen);
+  const int precision = 1 + rand.Uniform(11);
+  RangeEncodeDecodeTest(precision, &rand);
+}
+
+TEST(RangeCoderTest, Precision12To16) {
+  random::PhiloxRandom gen(random::New64(), random::New64());
+  random::SimplePhilox rand(&gen);
+  for (int precision = 12; precision < 17; ++precision) {
+    RangeEncodeDecodeTest(precision, &rand);
+  }
+}
+
+TEST(RangeCoderTest, FinalizeState0) {
+  constexpr int kPrecision = 2;
+
+  string output;
+  RangeEncoder encoder(kPrecision);
+  encoder.Encode(0, 2, &output);
+  encoder.Finalize(&output);
+
+  RangeDecoder decoder(output, kPrecision);
+  EXPECT_EQ(decoder.Decode({0, 2, 4}), 0);
+}
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/coder/ops/coder_ops.cc b/tensorflow/contrib/coder/ops/coder_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9056d1a6963d7be92f499db31385fb6afe2dc515
--- /dev/null
+++ b/tensorflow/contrib/coder/ops/coder_ops.cc
@@ -0,0 +1,119 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+
+// clang-format off
+REGISTER_OP("RangeEncode")
+    .Input("data: int16")
+    .Input("cdf: int32")
+    .Output("encoded: string")
+    .Attr("precision: int >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Using the provided cumulative distribution functions (CDF) inside `cdf`, returns
+a range-code of `data`.
+
+The shape of `cdf` should have one more axis than the shape of `data`, and the
+prefix `cdf.shape[:-1]` should be broadcastable to `data.shape`. That is, for
+every `i = 0,...,rank(data) - 1`, the op requires that either
+`cdf.shape[i] == 1` or `cdf.shape[i] == data.shape[i]`. Note that this
+broadcasting is limited in the sense that the number of axes must match, and
+broadcasts only `cdf` but not `data`.
+
+`data` should have an upper bound `m > 0` such that each element is an integer
+in range `[0, m)`. Then the last dimension size of `cdf` must be `m + 1`. For
+each element of `data`, the innermost strip of `cdf` is a vector representing a
+CDF. For each k = 0,...,m, `cdf[..., k] / 2^precision` is the probability that
+an outcome is less than `k` (not less than or equal to).
+
+```
+   cdf[..., 0] / 2^precision = Pr(data[...] < 0)
+   cdf[..., 1] / 2^precision = Pr(data[...] < 1) = Pr(data[...] <= 0)
+   cdf[..., 2] / 2^precision = Pr(data[...] < 2) = Pr(data[...] <= 1)
+   ...
+   cdf[..., m] / 2^precision = Pr(data[...] < m) = 1
+```
+
+Therefore each element of `cdf` must be in `[0, 2^precision]`.
+
+Ideally `cdf[..., m]` should equal to `2^precision` but this is not a hard
+requirement as long as `cdf[..., m] <= 2^precision`.
+
+The encoded string neither contains the shape information of the encoded data
+nor a termination symbol. Therefore the shape of the encoded data must be
+explicitly provided to the decoder.
+
+Implementation notes:
+
+- Because of potential performance issues, the op does not check whether
+elements of `data` is in the correct range `[0, m)`, or if `cdf` satisfies
+monotonic increase property.
+
+- For the range coder to decode the encoded string correctly, the decoder should
+be able to reproduce the internal states of the encoder precisely. Otherwise,
+the decoding would fail and once an error occur, all subsequent decoded values
+are incorrect. For this reason, the range coder uses integer arithmetics and
+avoids using any floating point operations internally, and `cdf` should contain
+integers representing quantized probability mass rather than floating points. 
+
+data: An int32 tensor.
+cdf: An int32 tensor representing the CDF's of `data`. Each integer is divided
+  by `2^precision` to represent a fraction.
+encoded: A range-coded scalar string.
+precision: The number of bits for probability quantization. Must be <= 16.
+)doc");
+
+
+REGISTER_OP("RangeDecode")
+    .Input("encoded: string")
+    .Input("shape: int32")
+    .Input("cdf: int32")
+    .Output("decoded: int16")
+    .Attr("precision: int >= 1")
+    .SetShapeFn([] (InferenceContext* c) {
+      ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &out));
+      c->set_output(0, out);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Decodes a range-coded `code` into an int32 tensor of shape `shape`.
+
+This is the reverse op of RangeEncode. The shape of the tensor that was encoded
+should be known by the caller.
+
+Implementation notes:
+
+- If wrong input was given (e.g., corrupt `encoded` string, or `cdf` or
+`precision` do not match encoder), the decode is unsuccessful. Because of
+potential performance issues, the decoder does not return error status.
+
+encoded: A scalar string tensor from RangeEncode.
+shape: An int32 1-D tensor representing the shape of the data encoded by
+  RangeEncode.
+decoded: An int32 tensor with shape equal to `shape`.
+precision: The number of bits for probability quantization. Must be <= 16, and
+  must match the precision used by RangeEncode that produced `encoded`.
+)doc");
+// clang-format on
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/coder/python/ops/coder_ops.py b/tensorflow/contrib/coder/python/ops/coder_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb262e338baf1d9c3c043f03a02c2d2851e22b49
--- /dev/null
+++ b/tensorflow/contrib/coder/python/ops/coder_ops.py
@@ -0,0 +1,30 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Range coder operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.contrib.coder.python.ops import gen_coder_ops
+from tensorflow.contrib.coder.python.ops.gen_coder_ops import *
+# pylint: enable=wildcard-import,unused-import
+from tensorflow.contrib.util import loader
+from tensorflow.python.platform import resource_loader
+
+
+_coder_ops = loader.load_op_library(
+    resource_loader.get_path_to_datafile("_coder_ops.so"))
diff --git a/tensorflow/contrib/coder/python/ops/coder_ops_test.py b/tensorflow/contrib/coder/python/ops/coder_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5e14e7a641b5673e97882daf2b5a1796ee1bbef
--- /dev/null
+++ b/tensorflow/contrib/coder/python/ops/coder_ops_test.py
@@ -0,0 +1,53 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Coder operations tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.coder.python.ops import coder_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import test
+
+
+class CoderOpsTest(test.TestCase):
+  """Coder ops test.
+
+  Coder ops have C++ tests. Python test just ensures that Python binding is not
+  broken.
+  """
+
+  def testReadmeExample(self):
+    data = random_ops.random_uniform((128, 128), 0, 10, dtype=dtypes.int32)
+    histogram = math_ops.bincount(data, minlength=10, maxlength=10)
+    cdf = math_ops.cumsum(histogram, exclusive=False)
+    cdf = array_ops.pad(cdf, [[1, 0]])
+    cdf = array_ops.reshape(cdf, [1, 1, -1])
+
+    data = math_ops.cast(data, dtypes.int16)
+    encoded = coder_ops.range_encode(data, cdf, precision=14)
+    decoded = coder_ops.range_decode(
+        encoded, array_ops.shape(data), cdf, precision=14)
+
+    with self.test_session() as sess:
+      self.assertAllEqual(*sess.run((data, decoded)))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/compiler/jit_test.py b/tensorflow/contrib/compiler/jit_test.py
index 2108e42bce4eba1eed158fe85888f1699a69ba7e..29a593f6bcfa05dcafcdb2f94087380ad720dba1 100644
--- a/tensorflow/contrib/compiler/jit_test.py
+++ b/tensorflow/contrib/compiler/jit_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -169,6 +170,7 @@ class JITTest(test.TestCase):
       self.assertEqual(b"jit_scope_0", func_attrs["_XlaScope"].s)
 
 
+@test_util.with_c_api
 class CompilationEnabledInGradientTest(test.TestCase):
 
   def testCompilationInGradient(self):
@@ -188,7 +190,7 @@ class CompilationEnabledInGradientTest(test.TestCase):
       for cg in c_grad_ops:
         self.assertTrue(cg.get_attr("_XlaCompile"))
       for ncg in nc_grad_ops:
-        with self.assertRaisesRegexp(ValueError, "No attr named"):
+        with self.assertRaisesRegexp(ValueError, "[Nn]o attr named"):
           ncg.get_attr("_XlaCompile")
 
       # d/dx (x ** 4) = 4 * (x ** 3)
diff --git a/tensorflow/contrib/copy_graph/__init__.py b/tensorflow/contrib/copy_graph/__init__.py
index 30a0aac140b576c501595fd6c8767b7dddde8e58..61ee39e4be1f0471309bb2672476dd9100cbfd49 100644
--- a/tensorflow/contrib/copy_graph/__init__.py
+++ b/tensorflow/contrib/copy_graph/__init__.py
@@ -13,8 +13,6 @@
 # limitations under the License.
 # ==============================================================================
 """Functions to copy elements between graphs.
-
-See the @{$python/contrib.copy_graph} guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/copy_graph/python/util/copy_elements.py b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
index d060eda0a74010db10d9506b2a1c2345b2731709..b806799202bff4f2f6dbf717fbeea74a04b8cd6e 100644
--- a/tensorflow/contrib/copy_graph/python/util/copy_elements.py
+++ b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
@@ -35,10 +35,10 @@ from tensorflow.python.ops.variables import Variable
 from tensorflow.python.client.session import Session
 from tensorflow.python.framework import ops
 
-__all__ = ["copy_op_to_graph", "copy_variable_to_graph", "get_copied_op"]
+__all__ = ['copy_op_to_graph', 'copy_variable_to_graph', 'get_copied_op']
 
 
-def copy_variable_to_graph(org_instance, to_graph, scope=""):
+def copy_variable_to_graph(org_instance, to_graph, scope=''):
   """Given a `Variable` instance from one `Graph`, initializes and returns
   a copy of it from another `Graph`, under the specified scope
   (default `""`).
@@ -56,12 +56,11 @@ def copy_variable_to_graph(org_instance, to_graph, scope=""):
   """
 
   if not isinstance(org_instance, Variable):
-    raise TypeError(str(org_instance) + " is not a Variable")
+    raise TypeError(str(org_instance) + ' is not a Variable')
 
   #The name of the new variable
-  if scope != "":
-    new_name = (scope + '/' +
-                org_instance.name[:org_instance.name.index(':')])
+  if scope != '':
+    new_name = (scope + '/' + org_instance.name[:org_instance.name.index(':')])
   else:
     new_name = org_instance.name[:org_instance.name.index(':')]
 
@@ -73,15 +72,15 @@ def copy_variable_to_graph(org_instance, to_graph, scope=""):
   for name, collection in org_instance.graph._collections.items():
     if org_instance in collection:
       if (name == ops.GraphKeys.GLOBAL_VARIABLES or
-          name == ops.GraphKeys.TRAINABLE_VARIABLES or
-          scope == ''):
+          name == ops.GraphKeys.TRAINABLE_VARIABLES or scope == ''):
         collections.append(name)
       else:
         collections.append(scope + '/' + name)
 
   #See if its trainable.
-  trainable = (org_instance in org_instance.graph.get_collection(
-      ops.GraphKeys.TRAINABLE_VARIABLES))
+  trainable = (
+      org_instance in org_instance.graph.get_collection(
+          ops.GraphKeys.TRAINABLE_VARIABLES))
   #Get the initial value
   with org_instance.graph.as_default():
     temp_session = Session()
@@ -89,17 +88,17 @@ def copy_variable_to_graph(org_instance, to_graph, scope=""):
 
   #Initialize the new variable
   with to_graph.as_default():
-    new_var = Variable(init_value,
-                       trainable,
-                       name=new_name,
-                       collections=collections,
-                       validate_shape=False)
+    new_var = Variable(
+        init_value,
+        trainable,
+        name=new_name,
+        collections=collections,
+        validate_shape=False)
 
   return new_var
 
 
-def copy_op_to_graph(org_instance, to_graph, variables,
-                     scope=""):
+def copy_op_to_graph(org_instance, to_graph, variables, scope=''):
   """Returns a copy of an operation from another Graph under a specified scope.
 
   Given an `Operation` `org_instance` from one `Graph`,
@@ -139,14 +138,12 @@ def copy_op_to_graph(org_instance, to_graph, variables,
   #If a variable by the new name already exists, return the
   #correspondng tensor that will act as an input
   if new_name in copied_variables:
-    return to_graph.get_tensor_by_name(
-        copied_variables[new_name].name)
+    return to_graph.get_tensor_by_name(copied_variables[new_name].name)
 
   #If an instance of the same name exists, return appropriately
   try:
-    already_present = to_graph.as_graph_element(new_name,
-                                                allow_tensor=True,
-                                                allow_operation=True)
+    already_present = to_graph.as_graph_element(
+        new_name, allow_tensor=True, allow_operation=True)
     return already_present
   except:
     pass
@@ -184,20 +181,21 @@ def copy_op_to_graph(org_instance, to_graph, variables,
 
     #If it has an original_op parameter, copy it
     if op._original_op is not None:
-      new_original_op = copy_op_to_graph(op._original_op, to_graph,
-                                      variables, scope)
+      new_original_op = copy_op_to_graph(op._original_op, to_graph, variables,
+                                         scope)
     else:
       new_original_op = None
 
     #If it has control inputs, call this function recursively on each.
-    new_control_inputs = [copy_op_to_graph(x, to_graph, variables,
-                                        scope)
-                          for x in op.control_inputs]
+    new_control_inputs = [
+        copy_op_to_graph(x, to_graph, variables, scope)
+        for x in op.control_inputs
+    ]
 
     #If it has inputs, call this function recursively on each.
-    new_inputs = [copy_op_to_graph(x, to_graph, variables,
-                                scope)
-                  for x in op.inputs]
+    new_inputs = [
+        copy_op_to_graph(x, to_graph, variables, scope) for x in op.inputs
+    ]
 
     #Make a new node_def based on that of the original.
     #An instance of tensorflow.core.framework.node_def_pb2.NodeDef, it
@@ -216,15 +214,11 @@ def copy_op_to_graph(org_instance, to_graph, variables,
     op_def = deepcopy(op._op_def)
 
     #Initialize a new Operation instance
-    new_op = ops.Operation(new_node_def,
-                           to_graph,
-                           new_inputs,
-                           output_types,
-                           new_control_inputs,
-                           input_types,
-                           new_original_op,
+    new_op = ops.Operation(new_node_def, to_graph, new_inputs, output_types,
+                           new_control_inputs, input_types, new_original_op,
                            op_def)
     #Use Graph's hidden methods to add the op
+    to_graph._add_op(new_op)  # pylint: disable=protected-access
     to_graph._record_op_seen_by_control_dependencies(new_op)
     for device_function in reversed(to_graph._device_function_stack):
       new_op._set_device(device_function(new_op))
@@ -232,10 +226,10 @@ def copy_op_to_graph(org_instance, to_graph, variables,
     return new_op
 
   else:
-    raise TypeError("Could not copy instance: " + str(org_instance))
+    raise TypeError('Could not copy instance: ' + str(org_instance))
 
 
-def get_copied_op(org_instance, graph, scope=""):
+def get_copied_op(org_instance, graph, scope=''):
   """Given an `Operation` instance from some `Graph`, returns
   its namesake from `graph`, under the specified scope
   (default `""`).
@@ -258,5 +252,5 @@ def get_copied_op(org_instance, graph, scope=""):
   else:
     new_name = org_instance.name
 
-  return graph.as_graph_element(new_name, allow_tensor=True,
-                                allow_operation=True)
+  return graph.as_graph_element(
+      new_name, allow_tensor=True, allow_operation=True)
diff --git a/tensorflow/contrib/copy_graph/python/util/copy_test.py b/tensorflow/contrib/copy_graph/python/util/copy_test.py
index 2798d31229d048561f8ebd9b63d3df94a44c45c7..05744bec4e05405c04b5ec442e72e4495737ab5b 100644
--- a/tensorflow/contrib/copy_graph/python/util/copy_test.py
+++ b/tensorflow/contrib/copy_graph/python/util/copy_test.py
@@ -17,9 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
 from tensorflow.contrib.copy_graph.python.util import copy_elements
-from tensorflow.contrib.framework.python.framework import tensor_util
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/crf/__init__.py b/tensorflow/contrib/crf/__init__.py
index bc749339bd4d49c8372bc731da98732f8c19cbe1..046c509626bc2eb20a65c0b38495ff37c294e0e1 100644
--- a/tensorflow/contrib/crf/__init__.py
+++ b/tensorflow/contrib/crf/__init__.py
@@ -16,15 +16,15 @@
 
 See the @{$python/contrib.crf} guide.
 
-@@crf_sequence_score
-@@crf_log_norm
-@@crf_log_likelihood
-@@crf_unary_score
 @@crf_binary_score
 @@crf_decode
-@@CrfForwardRnnCell
-@@CrfDecodeForwardRnnCell
+@@crf_log_likelihood
+@@crf_log_norm
+@@crf_sequence_score
+@@crf_unary_score
 @@CrfDecodeBackwardRnnCell
+@@CrfDecodeForwardRnnCell
+@@CrfForwardRnnCell
 @@viterbi_decode
 """
 
@@ -32,16 +32,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.crf.python.ops.crf import _lengths_to_masks
 from tensorflow.contrib.crf.python.ops.crf import crf_binary_score
 from tensorflow.contrib.crf.python.ops.crf import crf_decode
 from tensorflow.contrib.crf.python.ops.crf import crf_log_likelihood
 from tensorflow.contrib.crf.python.ops.crf import crf_log_norm
 from tensorflow.contrib.crf.python.ops.crf import crf_sequence_score
 from tensorflow.contrib.crf.python.ops.crf import crf_unary_score
-from tensorflow.contrib.crf.python.ops.crf import CrfForwardRnnCell
-from tensorflow.contrib.crf.python.ops.crf import CrfDecodeForwardRnnCell
 from tensorflow.contrib.crf.python.ops.crf import CrfDecodeBackwardRnnCell
+from tensorflow.contrib.crf.python.ops.crf import CrfDecodeForwardRnnCell
+from tensorflow.contrib.crf.python.ops.crf import CrfForwardRnnCell
 from tensorflow.contrib.crf.python.ops.crf import viterbi_decode
 
 from tensorflow.python.util.all_util import remove_undocumented
diff --git a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
index b47fb426a193e0fcc075deafae3eaab698f18ec9..721dc4d0801d1f0e116921888e3851a95e0b72b0 100644
--- a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
+++ b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
@@ -179,17 +179,6 @@ class CrfTest(test.TestCase):
       tf_total_log_likelihood = sess.run(total_log_likelihood)
       self.assertAllClose(tf_total_log_likelihood, 0.0)
 
-  def testLengthsToMasks(self):
-    with self.test_session() as sess:
-      sequence_lengths = [4, 1, 8, 2]
-      max_sequence_length = max(sequence_lengths)
-      mask = crf._lengths_to_masks(sequence_lengths, max_sequence_length)
-      tf_mask = sess.run(mask)
-      self.assertEqual(len(tf_mask), len(sequence_lengths))
-      for m, l in zip(tf_mask, sequence_lengths):
-        self.assertAllEqual(m[:l], [1] * l)
-        self.assertAllEqual(m[l:], [0] * (len(m) - l))
-
   def testViterbiDecode(self):
     inputs = np.array(
         [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32)
diff --git a/tensorflow/contrib/crf/python/ops/crf.py b/tensorflow/contrib/crf/python/ops/crf.py
index 7f5ae937b26f465076c6976429697c35924432e5..faa78769b98699af59047aed2865771120110fc2 100644
--- a/tensorflow/contrib/crf/python/ops/crf.py
+++ b/tensorflow/contrib/crf/python/ops/crf.py
@@ -70,25 +70,6 @@ __all__ = [
 ]
 
 
-def _lengths_to_masks(lengths, max_length):
-  """Creates a binary matrix that can be used to mask away padding.
-
-  Args:
-    lengths: A vector of integers representing lengths.
-    max_length: An integer indicating the maximum length. All values in
-      lengths should be less than max_length.
-  Returns:
-    masks: Masks that can be used to get rid of padding.
-  """
-  tiled_ranges = array_ops.tile(
-      array_ops.expand_dims(math_ops.range(max_length), 0),
-      [array_ops.shape(lengths)[0], 1])
-  lengths = array_ops.expand_dims(lengths, 1)
-  masks = math_ops.to_float(
-      math_ops.to_int64(tiled_ranges) < math_ops.to_int64(lengths))
-  return masks
-
-
 def crf_sequence_score(inputs, tag_indices, sequence_lengths,
                        transition_params):
   """Computes the unnormalized score for a tag sequence.
@@ -185,8 +166,8 @@ def crf_log_likelihood(inputs,
     sequence_lengths: A [batch_size] vector of true sequence lengths.
     transition_params: A [num_tags, num_tags] transition matrix, if available.
   Returns:
-    log_likelihood: A scalar containing the log-likelihood of the given sequence
-        of tag indices.
+    log_likelihood: A [batch_size] `Tensor` containing the log-likelihood of
+      each example, given the sequence of tag indices.
     transition_params: A [num_tags, num_tags] transition matrix. This is either
         provided by the caller or created in this function.
   """
@@ -201,7 +182,7 @@ def crf_log_likelihood(inputs,
                                        transition_params)
   log_norm = crf_log_norm(inputs, sequence_lengths, transition_params)
 
-  # Normalize the scores to get the log-likelihood.
+  # Normalize the scores to get the log-likelihood per example.
   log_likelihood = sequence_scores - log_norm
   return log_likelihood, transition_params
 
@@ -234,7 +215,9 @@ def crf_unary_score(tag_indices, sequence_lengths, inputs):
       array_ops.gather(flattened_inputs, flattened_tag_indices),
       [batch_size, max_seq_len])
 
-  masks = _lengths_to_masks(sequence_lengths, array_ops.shape(tag_indices)[1])
+  masks = array_ops.sequence_mask(sequence_lengths,
+                                  maxlen=array_ops.shape(tag_indices)[1],
+                                  dtype=dtypes.float32)
 
   unary_scores = math_ops.reduce_sum(unary_scores * masks, 1)
   return unary_scores
@@ -268,7 +251,9 @@ def crf_binary_score(tag_indices, sequence_lengths, transition_params):
   binary_scores = array_ops.gather(flattened_transition_params,
                                    flattened_transition_indices)
 
-  masks = _lengths_to_masks(sequence_lengths, array_ops.shape(tag_indices)[1])
+  masks = array_ops.sequence_mask(sequence_lengths,
+                                  maxlen=array_ops.shape(tag_indices)[1],
+                                  dtype=dtypes.float32)
   truncated_masks = array_ops.slice(masks, [0, 1], [-1, -1])
   binary_scores = math_ops.reduce_sum(binary_scores * truncated_masks, 1)
   return binary_scores
diff --git a/tensorflow/contrib/cudnn_rnn/BUILD b/tensorflow/contrib/cudnn_rnn/BUILD
index fce2c03e69bc4b8b0ac46b8e081a33c43c9d41ab..fec358c4e1067dc8dc8173d1b9d05dc90b90ca05 100644
--- a/tensorflow/contrib/cudnn_rnn/BUILD
+++ b/tensorflow/contrib/cudnn_rnn/BUILD
@@ -25,6 +25,7 @@ tf_custom_op_library(
     ],
     deps = [
         "//tensorflow/core/kernels:bounds_check_lib",
+        "@farmhash_archive//:farmhash",
     ],
 )
 
@@ -39,6 +40,7 @@ tf_kernel_library(
         "//tensorflow/core:stream_executor",
         "//tensorflow/core/kernels:bounds_check_lib",
         "//third_party/eigen3",
+        "@farmhash_archive//:farmhash",
     ],
 )
 
@@ -146,10 +148,10 @@ cuda_py_test(
 
 cuda_py_test(
     name = "cudnn_rnn_ops_benchmark",
-    size = "large",
+    size = "small",
     srcs = ["python/kernel_tests/cudnn_rnn_ops_benchmark.py"],
     additional_deps = [
-        ":cudnn_rnn_ops_py",
+        ":cudnn_rnn_py",
         "//tensorflow/contrib/rnn:rnn_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
@@ -164,7 +166,6 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     tags = [
-        "manual",
         "noasan",  # http://b/62067814
         "nomsan",
         "notsan",
diff --git a/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc b/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
index 5d5f593d016a3bb9f7b5ea8f5cd40c29268dc4f5..ba9686e94ee7072cc485c955decb2287bd4a56f3 100644
--- a/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
@@ -39,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/fingerprint.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/env_var.h"
@@ -369,6 +370,27 @@ struct CudnnModelShapes {
   }
 };
 
+// Utility class for using CudnnModelShapes as a hash table key.
+struct CudnnModelShapesHasher {
+  uint64 operator()(const CudnnModelShapes& to_hash) const {
+    uint64 hash = static_cast<uint64>(to_hash.num_layers);
+    hash = tensorflow::FingerprintCat64(
+        hash, static_cast<uint64>(to_hash.input_size));
+    hash = tensorflow::FingerprintCat64(hash,
+                                        static_cast<uint64>(to_hash.num_units));
+    return tensorflow::FingerprintCat64(hash,
+                                        static_cast<uint64>(to_hash.dir_count));
+  }
+};
+
+// Utility class for using CudnnModelShapes as a hash table key.
+struct CudnnModelShapesComparator {
+  bool operator()(const CudnnModelShapes& first,
+                  const CudnnModelShapes& second) const {
+    return first.IsCompatibleWith(second);
+  }
+};
+
 // Extract and checks the forward input tensors, parameters, and shapes from the
 // OpKernelContext.
 Status ExtractForwardInput(OpKernelContext* context,
@@ -627,7 +649,7 @@ class CudnnRNNParamsToCanonical<GPUDevice, T> : public CudnnRNNKernelCommon {
     }
     const int num_params_per_layer = num_params_ / num_layers / num_dirs;
     // Number of params applied on inputs. The rest are applied on recurrent
-    // hiddden states.
+    // hidden states.
     const int num_params_input_state = num_params_per_layer / 2;
     CHECK(num_params_ % (num_layers * num_dirs) == 0)
         << "Number of params is not a multiple of num_layers * num_dirs.";
@@ -764,6 +786,13 @@ TF_CALL_float(REGISTER_GPU);
 TF_CALL_double(REGISTER_GPU);
 #undef REGISTER_GPU
 
+// Pointers to RNN scratch space for a specific set of shape parameters (used as
+// a hash table value in CudnnRNNForwardOp and CudnnRNNBackwardOp).
+struct RnnScratchSpace {
+  std::unique_ptr<RnnDescriptor> rnn_desc;
+  std::unique_ptr<CudnnRNNPersistentSpaceAllocator> dropout_state_allocator;
+};
+
 // Run the forward operation of the RNN model.
 template <typename T>
 class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
@@ -808,32 +837,7 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     OP_REQUIRES_OK(context,
                    ToRNNInputMode(rnn_input_mode(), model_shapes.num_units,
                                   model_shapes.input_size, &input_mode));
-    // TODO(zhengxq): cache the descriptor so we don't have to create them all
-    // the time.
     auto data_type = ToDataType<T>::value;
-    {
-      mutex_lock l(mu_);
-      if (model_shapes_ == nullptr) {
-        model_shapes_.reset(new CudnnModelShapes(model_shapes));
-      } else {
-        OP_REQUIRES(context, model_shapes_->IsCompatibleWith(model_shapes),
-                    errors::InvalidArgument(
-                        "Incompatible rnn model shapes inferred: expecting ",
-                        model_shapes_->RnnDescDebugString(), ", getting ",
-                        model_shapes.RnnDescDebugString(), "."));
-      }
-      if (rnn_desc_ == nullptr || ResetRndGenState()) {
-        dropout_state_allocator_.reset(
-            new CudnnRNNPersistentSpaceAllocator(context));
-        auto rnn_desc_s = executor->createRnnDescriptor(
-            model_shapes_->num_layers, model_shapes_->num_units,
-            model_shapes_->input_size, input_mode, rnn_direction_mode(),
-            rnn_mode(), data_type, dropout(), seed(),
-            dropout_state_allocator_.get());
-        OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
-        rnn_desc_ = std::move(rnn_desc_s.ConsumeValueOrDie());
-      }
-    }
 
     auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
         input_shape.dim_size(0), input_shape.dim_size(1),
@@ -882,14 +886,27 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     bool launch_status = false;
     {
       mutex_lock l(mu_);
+      RnnScratchSpace& rnn_state = rnn_state_cache_[model_shapes];
+      if (rnn_state.rnn_desc == nullptr || ResetRndGenState()) {
+        CudnnRNNPersistentSpaceAllocator* dropout_state_allocator =
+            new CudnnRNNPersistentSpaceAllocator(context);
+        rnn_state.dropout_state_allocator.reset(dropout_state_allocator);
+        auto rnn_desc_s = executor->createRnnDescriptor(
+            model_shapes.num_layers, model_shapes.num_units,
+            model_shapes.input_size, input_mode, rnn_direction_mode(),
+            rnn_mode(), data_type, dropout(), seed(), dropout_state_allocator);
+        OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
+        rnn_state.rnn_desc = std::move(rnn_desc_s.ConsumeValueOrDie());
+      }
       launch_status =
           stream
-              ->ThenRnnForward(
-                  *rnn_desc_, *input_desc, input_data, *hidden_state_desc,
-                  input_h_data, *hidden_state_desc, input_c_data, params_data,
-                  *output_desc, &output_data, *hidden_state_desc,
-                  &output_h_data, *hidden_state_desc, &output_c_data,
-                  is_training_, &reserve_space_allocator, &workspace_allocator)
+              ->ThenRnnForward(*rnn_state.rnn_desc, *input_desc, input_data,
+                               *hidden_state_desc, input_h_data,
+                               *hidden_state_desc, input_c_data, params_data,
+                               *output_desc, &output_data, *hidden_state_desc,
+                               &output_h_data, *hidden_state_desc,
+                               &output_c_data, is_training_,
+                               &reserve_space_allocator, &workspace_allocator)
               .ok();
     }
     OP_REQUIRES(context, launch_status,
@@ -899,10 +916,9 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
  private:
   mutex mu_;
   bool is_training_;
-  std::unique_ptr<CudnnModelShapes> model_shapes_ GUARDED_BY(mu_);
-  std::unique_ptr<RnnDescriptor> rnn_desc_ GUARDED_BY(mu_);
-  std::unique_ptr<CudnnRNNPersistentSpaceAllocator> dropout_state_allocator_
-      GUARDED_BY(mu_);
+  std::unordered_map<CudnnModelShapes, RnnScratchSpace, CudnnModelShapesHasher,
+                     CudnnModelShapesComparator>
+      rnn_state_cache_ GUARDED_BY(mu_);
 };
 
 #define REGISTER_GPU(T)                                           \
@@ -1022,32 +1038,6 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     OP_REQUIRES_OK(context,
                    ToRNNInputMode(rnn_input_mode(), model_shapes.num_units,
                                   model_shapes.input_size, &input_mode));
-    // TODO(zhengxq): cache the descriptor so we don't have to create them all
-    // the time.
-    {
-      mutex_lock l(mu_);
-      if (model_shapes_ == nullptr) {
-        model_shapes_.reset(new CudnnModelShapes(model_shapes));
-      } else {
-        OP_REQUIRES(context, model_shapes_->IsCompatibleWith(model_shapes),
-                    errors::InvalidArgument(
-                        "Incompatible rnn model shapes inferred: expecting ",
-                        model_shapes_->RnnDescDebugString(), ", getting ",
-                        model_shapes.RnnDescDebugString(), "."));
-      }
-
-      if (rnn_desc_ == nullptr || ResetRndGenState()) {
-        dropout_state_allocator_.reset(
-            new CudnnRNNPersistentSpaceAllocator(context));
-        auto rnn_desc_s = executor->createRnnDescriptor(
-            model_shapes.num_layers, model_shapes.num_units,
-            model_shapes.input_size, input_mode, rnn_direction_mode(),
-            rnn_mode(), data_type, dropout(), seed(),
-            dropout_state_allocator_.get());
-        OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
-        rnn_desc_ = std::move(rnn_desc_s.ConsumeValueOrDie());
-      }
-    }
 
     auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
         input_shape.dim_size(0), input_shape.dim_size(1),
@@ -1100,17 +1090,30 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     bool launch_status = false;
     {
       mutex_lock l(mu_);
+      RnnScratchSpace& rnn_state = rnn_state_cache_[model_shapes];
+      if (rnn_state.rnn_desc == nullptr || ResetRndGenState()) {
+        CudnnRNNPersistentSpaceAllocator* dropout_state_allocator =
+            new CudnnRNNPersistentSpaceAllocator(context);
+        rnn_state.dropout_state_allocator.reset(dropout_state_allocator);
+        auto rnn_desc_s = executor->createRnnDescriptor(
+            model_shapes.num_layers, model_shapes.num_units,
+            model_shapes.input_size, input_mode, rnn_direction_mode(),
+            rnn_mode(), data_type, dropout(), seed(), dropout_state_allocator);
+        OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
+        rnn_state.rnn_desc = std::move(rnn_desc_s.ConsumeValueOrDie());
+      }
       launch_status =
           stream
-              ->ThenRnnBackward(
-                  *rnn_desc_, *input_desc, input_data, *hidden_state_desc,
-                  input_h_data, *hidden_state_desc, input_c_data, params_data,
-                  *output_desc, output_data, *hidden_state_desc, output_h_data,
-                  *hidden_state_desc, output_c_data, output_backprop_data,
-                  output_h_backprop_data, output_c_backprop_data,
-                  &input_backprop_data, &input_h_backprop_data,
-                  &input_c_backprop_data, &params_backprop_data,
-                  &reserve_space_uint8, &workspace_allocator)
+              ->ThenRnnBackward(*rnn_state.rnn_desc, *input_desc, input_data,
+                                *hidden_state_desc, input_h_data,
+                                *hidden_state_desc, input_c_data, params_data,
+                                *output_desc, output_data, *hidden_state_desc,
+                                output_h_data, *hidden_state_desc,
+                                output_c_data, output_backprop_data,
+                                output_h_backprop_data, output_c_backprop_data,
+                                &input_backprop_data, &input_h_backprop_data,
+                                &input_c_backprop_data, &params_backprop_data,
+                                &reserve_space_uint8, &workspace_allocator)
               .ok();
     }
     OP_REQUIRES(context, launch_status,
@@ -1119,10 +1122,9 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
 
  private:
   mutex mu_;
-  std::unique_ptr<CudnnModelShapes> model_shapes_ GUARDED_BY(mu_);
-  std::unique_ptr<RnnDescriptor> rnn_desc_ GUARDED_BY(mu_);
-  std::unique_ptr<CudnnRNNPersistentSpaceAllocator> dropout_state_allocator_
-      GUARDED_BY(mu_);
+  std::unordered_map<CudnnModelShapes, RnnScratchSpace, CudnnModelShapesHasher,
+                     CudnnModelShapesComparator>
+      rnn_state_cache_ GUARDED_BY(mu_);
 };
 
 #define REGISTER_GPU(T)                                                   \
diff --git a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc b/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
index 9e41e67857101534e8bfef8d5d0b8a45ed8f1f76..1a79bf066c3a27e040099729fb079ee963f59270 100644
--- a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
+++ b/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
@@ -251,9 +251,8 @@ REGISTER_OP("CudnnRNNParamsToCanonical")
       TF_RETURN_IF_ERROR(c->GetAttr("num_params", &num_params));
       // Set shape for weight matrices
       for (int i = 0; i < num_params; i++) {
-        c->set_output(i,
-                      c->Matrix(InferenceContext::kUnknownDim,
-                                InferenceContext::kUnknownDim));
+        c->set_output(i, c->Matrix(InferenceContext::kUnknownDim,
+                                   InferenceContext::kUnknownDim));
       }
       // Set shape for bias vectors
       for (int i = 0; i < num_params; i++) {
@@ -300,6 +299,7 @@ upcoming training or inferences.
 num_params: number of parameter sets for all layers.
     Each layer may contain multiple parameter sets, with each set consisting of
     a weight matrix and a bias vector.
-)doc", kCudnnRNNCommonAttrs));
+)doc",
+                         kCudnnRNNCommonAttrs));
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py
index ff409ac71826f1f0f57e9133d768003f849abc09..933df6d71dd7c972efe63d54fa7344ecfc39b0a7 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_benchmark.py
@@ -20,8 +20,9 @@ from __future__ import print_function
 
 import time
 
+from six.moves import xrange  # pylint: disable=redefined-builtin
+from tensorflow.contrib import rnn as contrib_rnn
 from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
-from tensorflow.contrib.rnn.python.ops import core_rnn
 from tensorflow.contrib.rnn.python.ops import lstm_ops
 from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
@@ -29,8 +30,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import rnn
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
@@ -44,19 +44,19 @@ class CudnnRNNBenchmark(test.Benchmark):
         "large": {
             "num_layers": 4,
             "num_units": 1024,
-            "seq_length": 40,
+            "seq_length": 50,
             "batch_size": 64,
         },
         "medium": {
             "num_layers": 4,
             "num_units": 512,
-            "seq_length": 30,
+            "seq_length": 50,
             "batch_size": 64,
         },
         "small": {
             "num_layers": 4,
             "num_units": 128,
-            "seq_length": 20,
+            "seq_length": 50,
             "batch_size": 64,
         },
     }
@@ -71,7 +71,7 @@ class CudnnRNNBenchmark(test.Benchmark):
 
   def _BenchmarkOp(self, op, desc):
     burn_in_steps = 10
-    benchmark_steps = 40
+    benchmark_steps = 20
     with session.Session() as sess:
       sess.run(variables.global_variables_initializer())
       for i in xrange(burn_in_steps + benchmark_steps):
@@ -126,16 +126,12 @@ class CudnnRNNBenchmark(test.Benchmark):
       seq_length = config["seq_length"]
 
       with ops.Graph().as_default(), ops.device("/device:GPU:0"):
-        inputs = seq_length * [
-            array_ops.zeros([batch_size, num_units], dtypes.float32)
-        ]
-        initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=127)
-
-        cell = rnn_cell.LSTMCell(
-            num_units=num_units, initializer=initializer, state_is_tuple=True)
-        multi_cell = rnn_cell.MultiRNNCell(
-            [cell() for _ in range(num_layers)])
-        outputs, final_state = core_rnn.static_rnn(
+        inputs = array_ops.zeros([batch_size, seq_length, num_units],
+                                 dtypes.float32)
+
+        multi_cell = contrib_rnn.MultiRNNCell(
+            [contrib_rnn.BasicLSTMCell(num_units) for _ in range(num_layers)])
+        outputs, final_state = rnn.dynamic_rnn(
             multi_cell, inputs, dtype=dtypes.float32)
         trainable_variables = ops.get_collection(
             ops.GraphKeys.TRAINABLE_VARIABLES)
@@ -154,14 +150,12 @@ class CudnnRNNBenchmark(test.Benchmark):
       seq_length = config["seq_length"]
 
       with ops.Graph().as_default(), ops.device("/device:GPU:0"):
-        inputs = seq_length * [
-            array_ops.zeros([batch_size, num_units], dtypes.float32)
-        ]
-        cell = lambda: lstm_ops.LSTMBlockCell(num_units=num_units)  # pylint: disable=cell-var-from-loop
-
-        multi_cell = rnn_cell.MultiRNNCell(
-            [cell() for _ in range(num_layers)])
-        outputs, final_state = core_rnn.static_rnn(
+        inputs = array_ops.zeros([batch_size, seq_length, num_units],
+                                 dtypes.float32)
+
+        multi_cell = contrib_rnn.MultiRNNCell(
+            [lstm_ops.LSTMBlockCell(num_units) for _ in range(num_layers)])
+        outputs, final_state = rnn.dynamic_rnn(
             multi_cell, inputs, dtype=dtypes.float32)
         trainable_variables = ops.get_collection(
             ops.GraphKeys.TRAINABLE_VARIABLES)
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
index e65394cba07574ed49398981f1cbd8bcb402e24f..9897c31a98e0b335c18a84825fc518ed1fc310a2 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
@@ -29,6 +29,8 @@ import numpy as np
 from tensorflow.contrib.cudnn_rnn.python.layers import cudnn_rnn
 from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
 from tensorflow.contrib.rnn.python.ops import rnn as contrib_rnn_lib
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
@@ -49,7 +51,11 @@ from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import googletest
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import adagrad
+from tensorflow.python.training import adam
 from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import momentum
+from tensorflow.python.training import rmsprop
 from tensorflow.python.training import saver as saver_lib
 
 
@@ -314,6 +320,150 @@ class CudnnRNNTestBasic(TensorFlowTestCase):
       self.assertEqual(0, total_sum2_v)
       self.assertEqual(0, total_sum3_v)
 
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def testOptimizersSupport(self):
+    for opt in ("adagrad", "adam", "rmsprop", "momentum", "sgd"):
+      self._TestOptimizerSupportHelper(opt)
+
+  def _GetOptimizer(self, opt):
+    if opt == "adagrad":
+      return adagrad.AdagradOptimizer(learning_rate=1e-2)
+    elif opt == "adam":
+      return adam.AdamOptimizer(learning_rate=1e-2)
+    elif opt == "rmsprop":
+      return rmsprop.RMSPropOptimizer(learning_rate=1e-2)
+    elif opt == "momentum":
+      return momentum.MomentumOptimizer(learning_rate=1e-2, momentum=0.9)
+    elif opt == "sgd":
+      return gradient_descent.GradientDescentOptimizer(learning_rate=1e-2)
+    else:
+      raise ValueError("Unsupported optimizer: %s" % opt)
+
+  def _TestOptimizerSupportHelper(self, opt):
+    num_layers = 4
+    num_units = 2
+    batch_size = 8
+    direction = CUDNN_RNN_UNIDIRECTION
+    dir_count = 1
+
+    with ops.Graph().as_default() as g:
+      kernel_initializer = init_ops.constant_initializer(0.)
+      bias_initializer = init_ops.constant_initializer(0.)
+      inputs = random_ops.random_uniform([
+          num_layers * dir_count, batch_size, num_units], dtype=dtypes.float32)
+
+      lstm = cudnn_rnn.CudnnLSTM(num_layers, num_units,
+                                 direction=direction,
+                                 kernel_initializer=kernel_initializer,
+                                 bias_initializer=bias_initializer,
+                                 name="awesome_lstm")
+      outputs, _ = lstm(inputs)
+      loss = math_ops.reduce_sum(outputs)
+      optimizer = self._GetOptimizer(opt)
+      train_op = optimizer.minimize(loss)
+
+    with self.test_session(use_gpu=True, graph=g) as sess:
+      sess.run(variables.global_variables_initializer())
+      sess.run(train_op)
+
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def testSaveableGraphDeviceAssignment(self):
+    num_layers = 4
+    num_units = 2
+    batch_size = 8
+    direction = CUDNN_RNN_UNIDIRECTION
+    dir_count = 1
+
+    def DeviceFn(op):
+      if op.type in ("Variable", "VariableV2"):
+        return "/cpu:0"
+      else:
+        return "/gpu:0"
+
+    with ops.Graph().as_default() as g:
+      with ops.device(DeviceFn):
+        with vs.variable_scope("main"):
+          kernel_initializer = init_ops.constant_initializer(3.14)
+          bias_initializer = init_ops.constant_initializer(1.59)
+          inputs = random_ops.random_uniform(
+              [num_layers * dir_count, batch_size, num_units],
+              dtype=dtypes.float32)
+
+          lstm = cudnn_rnn.CudnnLSTM(num_layers, num_units,
+                                     direction=direction,
+                                     kernel_initializer=kernel_initializer,
+                                     bias_initializer=bias_initializer,
+                                     name="awesome_lstm")
+          outputs = lstm(inputs)
+
+        # saver is created in the scope of DeviceFn.
+        saver = saver_lib.Saver()
+
+    with self.test_session(use_gpu=True, graph=g) as sess:
+      save_path = os.path.join(self.get_temp_dir(),
+                               "test-saveable-device-assignment")
+      sess.run(variables.global_variables_initializer())
+
+      saver.save(sess, save_path)
+      saver.restore(sess, save_path)
+      sess.run(outputs)
+
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def testDifferentShapesEager(self):
+    # Checks that kernel caching does not cause sharing of temporary storage
+    # across different input shapes when executing eagerly.
+    with context.eager_mode():
+      with ops.device("gpu:0"):
+        first_output, _ = cudnn_rnn.CudnnGRU(1, 100)(
+            array_ops.zeros([28, 100, 28]))
+        second_output, _ = cudnn_rnn.CudnnGRU(1, 100)(
+            array_ops.zeros([28, 100, 100]))
+        self.assertAllEqual([28, 100, 100], first_output.shape)
+        self.assertAllEqual([28, 100, 100], second_output.shape)
+
+        def _LossFunc():
+          first_output, _ = cudnn_rnn.CudnnGRU(1, 100)(
+              array_ops.zeros([28, 100, 28]))
+          second_output, _ = cudnn_rnn.CudnnGRU(1, 100)(
+              array_ops.zeros([28, 100, 100]))
+          return (math_ops.reduce_sum(first_output) +
+                  math_ops.reduce_sum(second_output))
+
+        backprop.implicit_grad(_LossFunc)()
+
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def testDifferentShapesGraph(self):
+    # Tests that a single kernel instance presented with multiple input shapes
+    # does not crash with graph execution.
+    with ops.device("gpu:0"):
+      layer = cudnn_rnn.CudnnGRU(1, 100)
+      layer(array_ops.zeros([28, 100, 100]))
+
+      def _Cond(index, accumulation):
+        del accumulation  # unused
+        return math_ops.less(index, 4)
+
+      def _Body(index, accumulation):
+        layer_input = accumulation[:, :, 10 * (1 + index % 2):]
+        output, _ = layer(layer_input)
+        return index + 1, accumulation + output
+
+      original_input = array_ops.zeros([28, 100, 100])
+      _, accumulation = control_flow_ops.while_loop(_Cond, _Body,
+                                                    [0, original_input])
+      grad, = gradients.gradients(
+          math_ops.reduce_sum(accumulation), (original_input,))
+    init_op = variables.global_variables_initializer()
+    with self.test_session() as sess:
+      sess.run(init_op)
+      accumulation_eval, grad_eval = sess.run((accumulation, grad))
+      self.assertAllEqual([28, 100, 100], accumulation_eval.shape)
+      self.assertAllEqual([28, 100, 100], grad_eval.shape)
+
 
 # TODO(jamesqin): Transform to parameterized test after it is included in the
 # TF open source codebase.
diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
index 37c61a71a3bdac4fadef58ba8c24b853fb3638ef..36fba917a8f56c26fd5b4c3468d1d980a8ba2ba5 100644
--- a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
+++ b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
@@ -176,8 +176,9 @@ class _CudnnRNN(base_layer.Layer):
           otherwise, it implies 'linear_input'.
       direction: the direction model that the model operates. Can be either
           'unidirectional' or 'bidirectional'
-      dropout: dropout rate, a number between [0, 1]. Dropout is applied on
-          inputs of each layer. When set to 0, dropout is disabled.
+      dropout: dropout rate, a number between [0, 1]. Dropout is applied between
+          each layer (no dropout is applied for a model with a single layer).
+          When set to 0, dropout is disabled.
       seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
           for behavior.
       dtype: tf.float16, tf.float32 or tf.float64
@@ -358,7 +359,7 @@ class _CudnnRNN(base_layer.Layer):
     # Create saveable in the outer scope of the cudnn subgraph, such that
     # alternative subgraph with platform-independent rnn cells can load the
     # checkpoints directly.
-    if not (self.built or vs.get_variable_scope().reuse):
+    if not (self.built or vs.get_variable_scope().reuse is True):
       self._create_saveable()
     self.built = True
 
@@ -450,17 +451,18 @@ class _CudnnRNN(base_layer.Layer):
       raise RuntimeError(
           "%s._canonical_to_opaque invoked before input shape is known" %
           type(self).__name__)
-    return cudnn_rnn_ops.cudnn_rnn_canonical_to_opaque_params(
-        rnn_mode=self._rnn_mode,
-        num_layers=self._num_layers,
-        num_units=self._num_units,
-        input_size=self._input_size,
-        weights=cu_weights,
-        biases=cu_biases,
-        input_mode=self._input_mode,
-        seed=self._seed,
-        dropout=self._dropout,
-        direction=self._direction)
+    with ops.device("/gpu:0"):
+      return cudnn_rnn_ops.cudnn_rnn_canonical_to_opaque_params(
+          rnn_mode=self._rnn_mode,
+          num_layers=self._num_layers,
+          num_units=self._num_units,
+          input_size=self._input_size,
+          weights=cu_weights,
+          biases=cu_biases,
+          input_mode=self._input_mode,
+          seed=self._seed,
+          dropout=self._dropout,
+          direction=self._direction)
 
   def _forward(self, inputs, h, c, opaque_params, training):
     output, output_h, output_c = cudnn_rnn_ops._cudnn_rnn(  # pylint:disable=protected-access
@@ -489,14 +491,14 @@ class _CudnnRNN(base_layer.Layer):
     if self._saveable is not None:
       raise RuntimeError("Cudnn saveable already created.")
     self._saveable = self._saveable_cls(  # pylint:disable=not-callable
-        self.trainable_variables[0],
-        self.num_layers,
-        self.num_units,
-        self.input_size,
-        self.input_mode,
-        self.direction,
+        opaque_params=self.trainable_variables[0],
+        num_layers=self.num_layers,
+        num_units=self.num_units,
+        input_size=self.input_size,
+        input_mode=self.input_mode,
+        direction=self.direction,
         scope=vs.get_variable_scope(),
-        name="%s_saveable" % self.trainable_variables[0].op.name)
+        name="%s_saveable" % self.trainable_variables[0].name.split(":")[0])
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable)
 
 
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index dcd3d4732a27ae4bec579ac12ac568dc4a53baaa..e87162f0ee9cc4eed795555171f55a93639e83cf 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -72,7 +72,7 @@ class CudnnCompatibleLSTMCell(lstm_ops.LSTMBlockCell):
   def __init__(self, num_units, reuse=None):
     super(CudnnCompatibleLSTMCell, self).__init__(
         num_units, forget_bias=0, cell_clip=None, use_peephole=False,
-        reuse=reuse)
+        reuse=reuse, name="cudnn_compatible_lstm_cell")
     self._names.update({"scope": "cudnn_compatible_lstm_cell"})
 
 
@@ -303,16 +303,17 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
     Returns:
       2 list for weights and biases respectively.
     """
-    weights, biases = gen_cudnn_rnn_ops.cudnn_rnn_params_to_canonical(
-        num_layers=self._num_layers,
-        num_units=self._num_units,
-        input_size=self._input_size,
-        params=self._variables,
-        num_params=self._num_params,
-        rnn_mode=self._rnn_mode,
-        input_mode=self._input_mode,
-        direction=self._direction)
-    return (weights, biases)
+    with ops.device("/gpu:0"):
+      weights, biases = gen_cudnn_rnn_ops.cudnn_rnn_params_to_canonical(
+          num_layers=self._num_layers,
+          num_units=self._num_units,
+          input_size=self._input_size,
+          params=self._variables,
+          num_params=self._num_params,
+          rnn_mode=self._rnn_mode,
+          input_mode=self._input_mode,
+          direction=self._direction)
+      return (weights, biases)
 
   def _CanonicalToOpaqueParams(self, cu_weights, cu_biases):
     """Converts from Cudnn canonical format to opaque params.
@@ -323,15 +324,16 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
     Returns:
       a single opaque tensor.
     """
-    return gen_cudnn_rnn_ops.cudnn_rnn_canonical_to_params(
-        num_layers=self._num_layers,
-        num_units=self._num_units,
-        input_size=self._input_size,
-        weights=cu_weights,
-        biases=cu_biases,
-        rnn_mode=self._rnn_mode,
-        input_mode=self._input_mode,
-        direction=self._direction)
+    with ops.device("/gpu:0"):
+      return gen_cudnn_rnn_ops.cudnn_rnn_canonical_to_params(
+          num_layers=self._num_layers,
+          num_units=self._num_units,
+          input_size=self._input_size,
+          weights=cu_weights,
+          biases=cu_biases,
+          rnn_mode=self._rnn_mode,
+          input_mode=self._input_mode,
+          direction=self._direction)
 
   def _TransformCanonical(self, cu_weights, cu_biases):
     r"""Transform from Cudnn canonical to tf canonical.
@@ -1352,7 +1354,7 @@ class _CudnnRNN(object):
       params: the parameter buffer created for this model.
       is_training: whether this operation will be used in training or inference.
     Returns:
-      output: the output sequuence.
+      output: the output sequence.
       output_h: the final state for h.
       output_c: the final state for c. This is only relevant for LSTM.
     """
@@ -1470,7 +1472,7 @@ class CudnnLSTM(_CudnnRNN):
       params: the parameter buffer created for this model.
       is_training: whether this operation will be used in training or inference.
     Returns:
-      output: the output sequuence.
+      output: the output sequence.
       output_h: the final state for h.
       output_c: the final state for c.
     """
@@ -1540,7 +1542,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
       params: the parameter buffer created for this model.
       is_training: whether this operation will be used in training or inference.
     Returns:
-      output: the output sequuence.
+      output: the output sequence.
       output_h: the final state for h.
     """
     return _cudnn_rnn_no_input_c(
diff --git a/tensorflow/contrib/data/BUILD b/tensorflow/contrib/data/BUILD
index f7d8a084d9c12c05c411ae0751854d1823a818ec..0458199ff771bc45603106411550a39448e515b8 100644
--- a/tensorflow/contrib/data/BUILD
+++ b/tensorflow/contrib/data/BUILD
@@ -18,20 +18,22 @@ py_library(
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/contrib/data/python/ops:iterator_ops",
         "//tensorflow/contrib/data/python/ops:readers",
+        "//tensorflow/contrib/data/python/ops:shuffle_ops",
         "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/python:parsing_ops",
         "//tensorflow/python:util",
         "//tensorflow/python/data/ops:iterator_ops",
     ],
 )
 
 tf_custom_op_library(
-    name = "_prefetching_ops.so",
-    srcs = ["ops/prefetching_ops.cc"],
-    deps = ["//tensorflow/contrib/data/kernels:prefetching_kernels"],
+    name = "_dataset_ops.so",
+    srcs = ["ops/dataset_ops.cc"],
+    deps = ["//tensorflow/contrib/data/kernels:dataset_kernels"],
 )
 
 tf_gen_op_libs(
-    op_lib_names = ["prefetching_ops"],
+    op_lib_names = ["dataset_ops"],
 )
 
 filegroup(
diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index 7c6244f22b0f41656369595d3e3e6c23b7088bcb..fcdccdd26ca1824bf13f1fd0cfd80b20ca8a10c3 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -12,30 +12,33 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""`tf.contrib.data.Dataset` API for input pipelines.
+"""Experimental API for building input pipelines.
+
+This module contains experimental `Dataset` sources and transformations that can
+be used in conjunction with the @{tf.data.Dataset} API. Note that the
+`tf.contrib.data` API is not subject to the same backwards compatibility
+guarantees as `tf.data`, but we will provide deprecation advice in advance of
+removing existing functionality.
 
 See the @{$datasets$Importing Data} Programmer's Guide for an overview.
 
-@@Dataset
 @@Counter
-@@Iterator
-@@TFRecordDataset
-@@FixedLengthRecordDataset
-@@TextLineDataset
 
 @@batch_and_drop_remainder
-@@padded_batch_and_drop_remainder
 @@dense_to_sparse_batch
 @@enumerate_dataset
 @@group_by_window
 @@ignore_errors
 @@make_saveable_from_iterator
-@@read_batch_features
-@@unbatch
+@@map_and_batch
+@@padded_batch_and_drop_remainder
 @@parallel_interleave
+@@read_batch_features
 @@rejection_resample
 @@scan
+@@shuffle_and_repeat
 @@sloppy_interleave
+@@unbatch
 
 @@get_single_element
 """
@@ -48,25 +51,22 @@ from __future__ import print_function
 
 from tensorflow.contrib.data.python.ops.batching import batch_and_drop_remainder
 from tensorflow.contrib.data.python.ops.batching import dense_to_sparse_batch
+from tensorflow.contrib.data.python.ops.batching import map_and_batch
 from tensorflow.contrib.data.python.ops.batching import padded_batch_and_drop_remainder
 from tensorflow.contrib.data.python.ops.batching import unbatch
 from tensorflow.contrib.data.python.ops.counter import Counter
-from tensorflow.contrib.data.python.ops.dataset_ops import Dataset
-from tensorflow.contrib.data.python.ops.dataset_ops import get_single_element
 from tensorflow.contrib.data.python.ops.enumerate_ops import enumerate_dataset
 from tensorflow.contrib.data.python.ops.error_ops import ignore_errors
+from tensorflow.contrib.data.python.ops.get_single_element import get_single_element
 from tensorflow.contrib.data.python.ops.grouping import group_by_window
 from tensorflow.contrib.data.python.ops.interleave_ops import parallel_interleave
 from tensorflow.contrib.data.python.ops.interleave_ops import sloppy_interleave
 from tensorflow.contrib.data.python.ops.iterator_ops import make_saveable_from_iterator
-from tensorflow.contrib.data.python.ops.readers import FixedLengthRecordDataset
 from tensorflow.contrib.data.python.ops.readers import read_batch_features
 from tensorflow.contrib.data.python.ops.readers import SqlDataset
-from tensorflow.contrib.data.python.ops.readers import TextLineDataset
-from tensorflow.contrib.data.python.ops.readers import TFRecordDataset
 from tensorflow.contrib.data.python.ops.resampling import rejection_resample
 from tensorflow.contrib.data.python.ops.scan_ops import scan
-from tensorflow.python.data.ops.iterator_ops import Iterator
+from tensorflow.contrib.data.python.ops.shuffle_ops import shuffle_and_repeat
 # pylint: enable=unused-import
 
 from tensorflow.python.util.all_util import remove_undocumented
diff --git a/tensorflow/contrib/data/kernels/BUILD b/tensorflow/contrib/data/kernels/BUILD
index 4cb53741ebf8cd0db41b382c878bd2ccd1dcf7f1..56471911c5c0d1c1825955c67997b5bbc0786463 100644
--- a/tensorflow/contrib/data/kernels/BUILD
+++ b/tensorflow/contrib/data/kernels/BUILD
@@ -17,6 +17,28 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "ignore_errors_dataset_op",
+    srcs = ["ignore_errors_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "dataset_kernels",
+    deps = [
+        ":ignore_errors_dataset_op",
+        ":prefetching_kernels",
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/core/kernels/ignore_errors_dataset_op.cc b/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc
similarity index 97%
rename from tensorflow/core/kernels/ignore_errors_dataset_op.cc
rename to tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc
index 8cf263d87fed601ed987e5d13909dd433391e5bd..bb29df60e8f114aaa50f578c43e73874f72ab0a3 100644
--- a/tensorflow/core/kernels/ignore_errors_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc
@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/random/random.h"
@@ -109,7 +108,7 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         if (reader->Contains(full_name("input_impls_empty")))
diff --git a/tensorflow/contrib/data/kernels/prefetching_kernels.cc b/tensorflow/contrib/data/kernels/prefetching_kernels.cc
index c9a3537c70c711290fb1111a1594e6dea3bc07a9..d3df14bdd03476e9ee4015b374512e5bb9893a63 100644
--- a/tensorflow/contrib/data/kernels/prefetching_kernels.cc
+++ b/tensorflow/contrib/data/kernels/prefetching_kernels.cc
@@ -83,11 +83,10 @@ class FunctionBufferingResource : public ResourceBase {
       return Status::OK();
     }
     AttrValueMap attr_values = func_.attr();
-    AttrValue v;
-    v.set_s(target_device_);
-    AddAttr("_target", v, &attr_values);
-
-    return lib_->Instantiate(func_.name(), AttrSlice(&attr_values), &handle_);
+    FunctionLibraryRuntime::InstantiateOptions opts;
+    opts.target = target_device_;
+    return lib_->Instantiate(func_.name(), AttrSlice(&attr_values), opts,
+                             &handle_);
   }
 
   // Returns true if we've got to the end of the sequence and exhausted the
diff --git a/tensorflow/contrib/data/ops/prefetching_ops.cc b/tensorflow/contrib/data/ops/dataset_ops.cc
similarity index 86%
rename from tensorflow/contrib/data/ops/prefetching_ops.cc
rename to tensorflow/contrib/data/ops/dataset_ops.cc
index 23cb62b6f0dbfed15667dd00ae0039b33aa944d4..289ffa1d9c29092cdf434e86ed5553ff9644d43e 100644
--- a/tensorflow/contrib/data/ops/prefetching_ops.cc
+++ b/tensorflow/contrib/data/ops/dataset_ops.cc
@@ -17,6 +17,16 @@ limitations under the License.
 
 namespace tensorflow {
 
+REGISTER_OP("IgnoreErrorsDataset")
+    .Input("input_dataset: variant")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that contains the elements of `input_dataset` ignoring errors.
+)doc");
+
 REGISTER_OP("FunctionBufferingResource")
     .Input("string_arg: string")
     .Input("target_device: string")
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 1d4817fa2670317f4f4e9e63c724a79e18aa35bc..e51d57cc896dc32d8e11912cd89f34a04a858c78 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -4,7 +4,7 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "py_test", "tf_py_test")
 
 py_test(
     name = "batch_dataset_op_test",
@@ -36,6 +36,7 @@ py_test(
     srcs = ["bucketing_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/contrib/data/python/ops:transformation_ops",
         "//tensorflow/python:array_ops",
@@ -51,37 +52,17 @@ py_test(
     ],
 )
 
-py_test(
-    name = "cache_dataset_op_test",
-    size = "small",
-    srcs = ["cache_dataset_op_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/data/ops:iterator_ops",
-        "//third_party/py/numpy",
-    ],
-)
-
 py_test(
     name = "concatenate_dataset_op_test",
     size = "small",
     srcs = ["concatenate_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:iterator_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
         "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:training",
         "//tensorflow/python/data/util:nest",
         "//third_party/py/numpy",
     ],
@@ -89,7 +70,7 @@ py_test(
 
 py_test(
     name = "dataset_constructor_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["dataset_constructor_op_test.py"],
     srcs_version = "PY2AND3",
     tags = [
@@ -118,7 +99,6 @@ py_test(
 
 py_library(
     name = "dataset_serialization_test",
-    testonly = 1,
     srcs = [
         "dataset_serialization_test_base.py",
     ],
@@ -128,6 +108,7 @@ py_library(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:lookup_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
@@ -157,14 +138,13 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "flat_map_dataset_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["flat_map_dataset_op_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
+    additional_deps = [
         ":dataset_serialization_test",
+        "//third_party/py/numpy",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -177,17 +157,19 @@ py_test(
         "//tensorflow/python:session",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
-        "//third_party/py/numpy",
     ],
+    grpc_enabled = True,
+    tags = ["no_pip"],
 )
 
 py_test(
     name = "interleave_dataset_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["interleave_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     tags = [
-        "manual",  # b/67958761
+        "no_oss",
+        "no_pip",
     ],
     deps = [
         ":dataset_serialization_test",
@@ -207,77 +189,25 @@ py_test(
     ],
 )
 
-py_test(
-    name = "iterator_ops_cluster_test",
+tf_py_test(
+    name = "get_single_element_test",
     size = "small",
-    srcs = ["iterator_ops_cluster_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:function",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python/data/ops:iterator_ops",
-    ],
-)
-
-py_test(
-    name = "iterator_ops_test",
-    size = "small",
-    srcs = ["iterator_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    srcs = ["get_single_element_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:readers",
-        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
-        "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:function",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:script_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python:training",
-        "//tensorflow/python/data/ops:iterator_ops",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "list_files_dataset_op_test",
-    size = "small",
-    srcs = ["list_files_dataset_op_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:util",
     ],
 )
 
 py_test(
     name = "map_dataset_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["map_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
@@ -304,6 +234,7 @@ py_test(
         "//tensorflow/python:string_ops",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
 )
@@ -327,8 +258,8 @@ py_test(
     srcs = ["range_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:iterator_ops",
         "//tensorflow/contrib/data/python/ops:transformation_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -339,11 +270,8 @@ py_test(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:io_ops",
         "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:platform",
         "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:training",
         "//tensorflow/python:variables",
-        "//tensorflow/python/data/ops:iterator_ops",
     ],
 )
 
@@ -389,8 +317,27 @@ py_test(
 )
 
 py_test(
-    name = "sequence_dataset_op_test",
+    name = "scan_dataset_op_test",
     size = "small",
+    srcs = ["scan_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":dataset_serialization_test",
+        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "sequence_dataset_op_test",
+    size = "medium",
     srcs = ["sequence_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
@@ -406,33 +353,36 @@ py_test(
 )
 
 py_test(
-    name = "shard_dataset_op_test",
+    name = "serialization_integration_test",
     size = "small",
-    srcs = ["shard_dataset_op_test.py"],
+    srcs = ["serialization_integration_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_pip"],
     deps = [
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
+        "//tensorflow/contrib/data/python/ops:iterator_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
 
 py_test(
     name = "shuffle_dataset_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["shuffle_dataset_op_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_pip"],
     deps = [
+        ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:iterator_ops",
+        "//tensorflow/contrib/data/python/ops:shuffle_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:training",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:iterator_ops",
         "//third_party/py/numpy",
@@ -450,19 +400,41 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "@org_sqlite//:python",
     ],
 )
 
 py_test(
     name = "stats_dataset_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["stats_dataset_ops_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":dataset_serialization_test",
+        "//tensorflow/contrib/data/python/ops:dataset_ops",
+        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+    ],
+)
+
+py_test(
+    name = "unique_dataset_op_test",
+    size = "small",
+    srcs = ["unique_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
     deps = [
+        ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/stateless",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -493,7 +465,7 @@ py_test(
         "no_oss",  # b/68785503
     ],
     deps = [
-        "//tensorflow/contrib/data/python/ops:prefetching_py",
+        "//tensorflow/contrib/data/python/ops:prefetching_ops",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
index a939b3c841286a3b5786268dc3a9c82fd7359bfb..71dc1c1172c9d515d4c85f85257c952135098329 100644
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@@ -23,292 +23,33 @@ import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import batching
-from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
-from tensorflow.python.util import compat
 
 
 class BatchDatasetTest(test.TestCase):
 
-  def testBatchDataset(self):
-    """Test an dataset that maps a TF function across its input elements."""
-    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
-    # RepeatDataset(count) -> BatchDataset(batch_size).
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-
-    count = array_ops.placeholder(dtypes.int64, shape=[])
-    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-        .repeat(count).batch(batch_size).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([[None] + list(c.shape[1:]) for c in components],
-                     [t.shape.as_list() for t in get_next])
-
-    with self.test_session() as sess:
-      # Batch of a finite input, where the batch_size divides the
-      # total number of elements.
-      sess.run(init_op, feed_dict={count: 28, batch_size: 14})
-      num_batches = (28 * 7) // 14
-      for i in range(num_batches):
-        result = sess.run(get_next)
-        for component, result_component in zip(components, result):
-          for j in range(14):
-            self.assertAllEqual(component[(i * 14 + j) % 7]**2,
-                                result_component[j])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Batch of a finite input, where the batch_size does not
-      # divide the total number of elements.
-      sess.run(init_op, feed_dict={count: 14, batch_size: 8})
-
-      # We expect (num_batches - 1) full-sized batches.
-      num_batches = int(math.ceil((14 * 7) / 8))
-      for i in range(num_batches - 1):
-        result = sess.run(get_next)
-        for component, result_component in zip(components, result):
-          for j in range(8):
-            self.assertAllEqual(component[(i * 8 + j) % 7]**2,
-                                result_component[j])
-      result = sess.run(get_next)
-      for component, result_component in zip(components, result):
-        for j in range((14 * 7) % 8):
-          self.assertAllEqual(component[((num_batches - 1) * 8 + j) % 7]**2,
-                              result_component[j])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Batch of an empty input should fail straight away.
-      sess.run(init_op, feed_dict={count: 0, batch_size: 8})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Empty batch should be an initialization time error.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(init_op, feed_dict={count: 14, batch_size: 0})
-
   def assertSparseValuesEqual(self, a, b):
     self.assertAllEqual(a.indices, b.indices)
     self.assertAllEqual(a.values, b.values)
     self.assertAllEqual(a.dense_shape, b.dense_shape)
 
-  def testBatchSparse(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0]], values=(i * [1]), dense_shape=[1])
-
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).batch(
-        5).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(2):
-        actual = sess.run(get_next)
-        expected = sparse_tensor.SparseTensorValue(
-            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
-            values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
-            dense_shape=[5, 1])
-        self.assertTrue(sparse_tensor.is_sparse(actual))
-        self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testNestedBatchSparse(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0]], values=(i * [1]), dense_shape=[1])
-
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).batch(5).batch(
-        2).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      actual = sess.run(get_next)
-      expected = sparse_tensor.SparseTensorValue(
-          indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0], [0, 4, 0],
-                   [1, 0, 0], [1, 1, 0], [1, 2, 0], [1, 3, 0], [1, 4, 0]],
-          values=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
-          dense_shape=[2, 5, 1])
-      self.assertTrue(sparse_tensor.is_sparse(actual))
-      self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testPaddedBatchDataset(self):
-    seq_lens = array_ops.placeholder(dtypes.int32, shape=[None])
-    padded_shape = array_ops.placeholder(dtypes.int64, shape=[1])
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(seq_lens)
-        .map(lambda x: array_ops.fill([x], x)).padded_batch(
-            4, padded_shapes=padded_shape).make_initializable_iterator())
-
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      # Test with random sequence lengths, and max padding.
-      random_seq_lens = np.random.randint(20, size=(32,)).astype(np.int32)
-      sess.run(
-          init_op, feed_dict={
-              padded_shape: [-1],
-              seq_lens: random_seq_lens
-          })
-      for i in range(8):
-        result = sess.run(get_next)
-        padded_len = np.max(result)
-        self.assertEqual((4, padded_len), result.shape)
-        for j in range(4):
-          seq_len = random_seq_lens[(i * 4) + j]
-          self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
-          self.assertAllEqual(result[j, seq_len:], [0] * (padded_len - seq_len))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test with random sequence lengths, and constant padding.
-      sess.run(
-          init_op, feed_dict={
-              padded_shape: [25],
-              seq_lens: random_seq_lens
-          })
-      for i in range(8):
-        result = sess.run(get_next)
-        self.assertEqual((4, 25), result.shape)
-        for j in range(4):
-          seq_len = random_seq_lens[(i * 4) + j]
-          self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
-          self.assertAllEqual(result[j, seq_len:], [0] * (25 - seq_len))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test correct handling of empty tensors.
-      sess.run(init_op, feed_dict={padded_shape: [-1], seq_lens: [0, 0, 0, 0]})
-      result = sess.run(get_next)
-      self.assertAllEqual([[], [], [], []], result)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test error handling with constant sequence lengths, and
-      # too-short padding.
-      sess.run(init_op, feed_dict={padded_shape: [5], seq_lens: [6, 5, 5, 5]})
-      with self.assertRaises(errors.DataLossError):
-        result = sess.run(get_next)
-
-  def testPaddedBatchDatasetNonDefaultPadding(self):
-    seq_lens = array_ops.placeholder(dtypes.int32, shape=[None])
-    padded_shape = array_ops.placeholder(dtypes.int64, shape=[1])
-
-    def fill_tuple(x):
-      filled = array_ops.fill([x], x)
-      return (filled, string_ops.as_string(filled))
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(seq_lens).map(fill_tuple)
-        .padded_batch(
-            4,
-            padded_shapes=(padded_shape, padded_shape),
-            padding_values=(-1, "<end>")).make_initializable_iterator())
-
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      # Test with random sequence lengths, and max padding.
-      random_seq_lens = np.random.randint(20, size=(32,)).astype(np.int32)
-      sess.run(
-          init_op, feed_dict={
-              padded_shape: [-1],
-              seq_lens: random_seq_lens
-          })
-      for i in range(8):
-        result = sess.run(get_next)
-        padded_len = np.max(result[0])
-        self.assertEqual((4, padded_len), result[0].shape)
-        self.assertEqual((4, padded_len), result[1].shape)
-        for j in range(4):
-          seq_len = random_seq_lens[(i * 4) + j]
-          self.assertAllEqual(result[0][j, :seq_len], [seq_len] * seq_len)
-          self.assertAllEqual(result[0][j, seq_len:],
-                              [-1] * (padded_len - seq_len))
-          self.assertAllEqual(result[1][j, :seq_len],
-                              [compat.as_bytes(str(seq_len))] * seq_len)
-          self.assertAllEqual(result[1][j, seq_len:],
-                              [b"<end>"] * (padded_len - seq_len))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testPaddedBatchDatasetShapeSpecifications(self):
-    int_placeholder = array_ops.placeholder(dtypes.int32)
-    float_placeholder = array_ops.placeholder(dtypes.float32)
-    string_placeholder = array_ops.placeholder(dtypes.string)
-    input_dataset = dataset_ops.Dataset.from_tensors(
-        (int_placeholder, float_placeholder, string_placeholder))
-
-    # Test different ways of specifying the `padded_shapes` argument.
-    dynamic_padding_from_tensor_shapes = input_dataset.padded_batch(
-        32,
-        padded_shapes=(tensor_shape.TensorShape([None]),
-                       tensor_shape.TensorShape([None, None]),
-                       tensor_shape.TensorShape([37])))
-    dynamic_padding_from_lists = input_dataset.padded_batch(
-        32, padded_shapes=([None], [None, None], [37]))
-    dynamic_padding_from_lists_with_minus_one = input_dataset.padded_batch(
-        32, padded_shapes=([-1], [-1, -1], [37]))
-    dynamic_padding_from_tensors = input_dataset.padded_batch(
-        32,
-        padded_shapes=(constant_op.constant([-1], dtype=dtypes.int64),
-                       constant_op.constant([-1, -1], dtype=dtypes.int64),
-                       constant_op.constant([37], dtype=dtypes.int64)))
-
-    for dataset in [
-        dynamic_padding_from_tensor_shapes, dynamic_padding_from_lists,
-        dynamic_padding_from_lists_with_minus_one, dynamic_padding_from_tensors
-    ]:
-      self.assertEqual([None, None], dataset.output_shapes[0].as_list())
-      self.assertEqual([None, None, None], dataset.output_shapes[1].as_list())
-      self.assertEqual([None, 37], dataset.output_shapes[2].as_list())
-
-  def testPaddedBatchSparseError(self):
-
-    def _map_fn(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i
-
-    with self.assertRaises(TypeError):
-      _ = dataset_ops.Dataset.range(10).map(_map_fn).padded_batch(10)
-
   def testDenseToSparseBatchDataset(self):
     components = np.random.randint(12, size=(100,)).astype(np.int32)
     iterator = (
         dataset_ops.Dataset.from_tensor_slices(components)
         .map(lambda x: array_ops.fill([x], x)).apply(
-            batching.dense_to_sparse_batch(4,
-                                           [12])).make_initializable_iterator())
+            batching.dense_to_sparse_batch(4, [12]))
+        .make_initializable_iterator())
     init_op = iterator.initializer
-    get_next = sparse_tensor.SparseTensor(*iterator.get_next())
+    get_next = iterator.get_next()
 
     with self.test_session() as sess:
       sess.run(init_op)
@@ -334,9 +75,9 @@ class BatchDatasetTest(test.TestCase):
         dataset_ops.Dataset.from_tensor_slices(components)
         .map(lambda x: array_ops.fill([x, x], x)).apply(
             batching.dense_to_sparse_batch(
-                4, [5, -1])).make_initializable_iterator())
+                4, [5, None])).make_initializable_iterator())
     init_op = iterator.initializer
-    get_next = sparse_tensor.SparseTensor(*iterator.get_next())
+    get_next = iterator.get_next()
 
     with self.test_session() as sess:
       sess.run(init_op)
@@ -363,25 +104,18 @@ class BatchDatasetTest(test.TestCase):
 
   def testDenseToSparseBatchDatasetWithInvalidShape(self):
     input_tensor = array_ops.constant([[1]])
-    iterator = (
-        dataset_ops.Dataset.from_tensors(input_tensor).apply(
-            batching.dense_to_sparse_batch(4, [-2]))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-
-    with self.test_session() as sess:
-      with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   "Dimension -2 must be >= -1"):
-        sess.run(init_op)
+    with self.assertRaisesRegexp(ValueError, "Dimension -2 must be >= 0"):
+      dataset_ops.Dataset.from_tensors(input_tensor).apply(
+          batching.dense_to_sparse_batch(4, [-2])).make_initializable_iterator()
 
   def testDenseToSparseBatchDatasetShapeErrors(self):
     input_tensor = array_ops.placeholder(dtypes.int32)
     iterator = (
         dataset_ops.Dataset.from_tensors(input_tensor).apply(
-            batching.dense_to_sparse_batch(4,
-                                           [12])).make_initializable_iterator())
+            batching.dense_to_sparse_batch(4, [12]))
+        .make_initializable_iterator())
     init_op = iterator.initializer
-    get_next = sparse_tensor.SparseTensor(*iterator.get_next())
+    get_next = iterator.get_next()
 
     with self.test_session() as sess:
       # Initialize with an input tensor of incompatible rank.
@@ -577,7 +311,7 @@ class BatchDatasetTest(test.TestCase):
     self.assertEqual([None], dataset.output_shapes[1][0].as_list())
     self.assertEqual([None, 30], dataset.output_shapes[1][1].as_list())
 
-  def testBatchAndMapDataset(self):
+  def _testBatchAndMapDatasetHelper(self, num_parallel_batches=1):
     """Test a dataset that maps a TF function across its input elements."""
     # The pipeline is TensorSliceDataset ->
     # RepeatDataset(count) -> BatchAndMapDataset(square_3, batch_size).
@@ -593,7 +327,10 @@ class BatchDatasetTest(test.TestCase):
 
     iterator = (
         dataset_ops.Dataset.from_tensor_slices(components).repeat(count).apply(
-            batching.map_and_batch(_map_fn, batch_size))
+            batching.map_and_batch(
+                map_func=_map_fn,
+                batch_size=batch_size,
+                num_parallel_batches=num_parallel_batches))
         .make_initializable_iterator())
     init_op = iterator.initializer
     get_next = iterator.get_next()
@@ -627,7 +364,11 @@ class BatchDatasetTest(test.TestCase):
           for j in range(8):
             self.assertAllEqual(component[(i * 8 + j) % 7]**2,
                                 result_component[j])
-      # The last batch should fail with `OutOfRange`.
+      result = sess.run(get_next)
+      for component, result_component in zip(components, result):
+        for j in range((14 * 7) % 8):
+          self.assertAllEqual(component[((num_batches - 1) * 8 + j) % 7]**2,
+                              result_component[j])
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -640,6 +381,12 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(init_op, feed_dict={count: 14, batch_size: 0})
 
+  def testBatchAndMapDataset(self):
+    return self._testBatchAndMapDatasetHelper()
+
+  def testBatchAndMapDatasetWithParallelBatching(self):
+    return self._testBatchAndMapDatasetHelper(num_parallel_batches=10)
+
   def testMapAndBatchSparse(self):
 
     def _sparse(i):
@@ -722,6 +469,39 @@ class BatchDatasetSerializationTest(
         lambda: self.build_dataset(20.0, tensor_slice_len, batch_size),
         num_outputs)
 
+  def _build_dataset_dense_to_sparse(self, components):
+    return dataset_ops.Dataset.from_tensor_slices(components).map(
+        lambda x: array_ops.fill([x], x)).apply(
+            batching.dense_to_sparse_batch(4, [12]))
+
+  # TODO(b/70988345): Re-enable when sparse tensors are properly supported by
+  # the DatasetSerializationTestBase.
+  def _testDenseToSparseBatchDatasetCore(self):
+    components = np.random.randint(5, size=(40,)).astype(np.int32)
+    diff_comp = np.random.randint(2, size=(100,)).astype(np.int32)
+
+    num_outputs = len(components) // 4
+    self.run_core_tests(lambda: self._build_dataset_dense_to_sparse(components),
+                        lambda: self._build_dataset_dense_to_sparse(diff_comp),
+                        num_outputs)
+
+  def _sparse(self, i):
+    return sparse_tensor.SparseTensorValue(
+        indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+  def _build_dataset_sparse(self, batch_size=5):
+    return dataset_ops.Dataset.range(10).map(self._sparse).batch(batch_size)
+
+  def testSparseCore(self):
+    self.run_core_tests(self._build_dataset_sparse,
+                        lambda: self._build_dataset_sparse(2), 2)
+
+  def _build_dataset_nested_sparse(self):
+    return dataset_ops.Dataset.range(10).map(self._sparse).batch(5).batch(2)
+
+  def testNestedSparseCore(self):
+    self.run_core_tests(self._build_dataset_nested_sparse, None, 1)
+
 
 class PaddedBatchDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
diff --git a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
index 765ed53618958a8c49b26e416c57be28ea3bba73..f1b494e1a620992365ed75613b508e32f94b40a4 100644
--- a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
@@ -19,8 +19,9 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import grouping
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -40,8 +41,7 @@ class GroupByWindowTest(test.TestCase):
         dataset_ops.Dataset.from_tensor_slices(components).map(lambda x: x * x)
         .apply(
             grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4),
-                                     4))
-        .make_initializable_iterator())
+                                     4)).make_initializable_iterator())
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -52,7 +52,8 @@ class GroupByWindowTest(test.TestCase):
         while True:
           result = sess.run(get_next)
           self.assertTrue(
-              all(x % 2 == 0 for x in result) or all(x % 2 == 1)
+              all(x % 2 == 0
+                  for x in result) or all(x % 2 == 1)
               for x in result)
           counts.append(result.shape[0])
 
@@ -115,8 +116,8 @@ class GroupByWindowTest(test.TestCase):
     iterator = (
         dataset_ops.Dataset.from_tensor_slices(components)
         .map(lambda x: (x, ops.convert_to_tensor([x * x]))).apply(
-            grouping.group_by_window(lambda x, _: x % 2, reduce_func, 32))
-        .make_initializable_iterator())
+            grouping.group_by_window(lambda x, _: x % 2, reduce_func,
+                                     32)).make_initializable_iterator())
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -135,7 +136,8 @@ class GroupByWindowTest(test.TestCase):
           window.padded_batch(
               4, padded_shapes=tensor_shape.TensorShape([None])),
           window.padded_batch(
-              4, padded_shapes=ops.convert_to_tensor([(key + 1) * 10])),))
+              4, padded_shapes=ops.convert_to_tensor([(key + 1) * 10])),
+      ))
 
     iterator = (
         dataset_ops.Dataset.from_tensor_slices(components)
@@ -160,6 +162,34 @@ class GroupByWindowTest(test.TestCase):
       self.assertEqual(len(components), sum(counts))
 
 
+class GroupByWindowSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_dataset(self, components):
+    return dataset_ops.Dataset.from_tensor_slices(components).repeat(-1).apply(
+        grouping.group_by_window(lambda x: x % 3, lambda _, xs: xs.batch(4), 4))
+
+  def testCoreGroupByWindow(self):
+    components = np.array(
+        [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 2, 2, 0, 0], dtype=np.int64)
+    self.verify_unused_iterator(
+        lambda: self._build_dataset(components), 12, verify_exhausted=False)
+    self.verify_init_before_restore(
+        lambda: self._build_dataset(components), 12, verify_exhausted=False)
+    self.verify_multiple_breaks(
+        lambda: self._build_dataset(components), 12, verify_exhausted=False)
+    self.verify_reset_restored_iterator(
+        lambda: self._build_dataset(components), 12, verify_exhausted=False)
+    self.verify_restore_in_empty_graph(
+        lambda: self._build_dataset(components), 12, verify_exhausted=False)
+    diff_components = np.array([0, 0, 0, 1, 1, 1], dtype=np.int64)
+    self.verify_restore_in_modified_graph(
+        lambda: self._build_dataset(components),
+        lambda: self._build_dataset(diff_components),
+        12,
+        verify_exhausted=False)
+
+
 # NOTE(mrry): These tests are based on the tests in bucket_ops_test.py.
 # Currently, they use a constant batch size, though should be made to use a
 # different batch size per key.
@@ -171,9 +201,10 @@ class BucketTest(test.TestCase):
     # dynamically and does not rely on static shape information about
     # the arguments.
     return dataset_ops.Dataset.zip(
-        (dataset_ops.Dataset.from_tensors(bucket), window.padded_batch(
-            32, (tensor_shape.TensorShape([]), tensor_shape.TensorShape([None]),
-                 tensor_shape.TensorShape([3])))))
+        (dataset_ops.Dataset.from_tensors(bucket),
+         window.padded_batch(
+             32, (tensor_shape.TensorShape([]), tensor_shape.TensorShape(
+                 [None]), tensor_shape.TensorShape([3])))))
 
   def testSingleBucket(self):
 
@@ -278,12 +309,13 @@ class BucketTest(test.TestCase):
 
     def _dynamic_pad_fn(bucket, window, _):
       return dataset_ops.Dataset.zip(
-          (dataset_ops.Dataset.from_tensors(bucket), window.padded_batch(
-              32, {
-                  "x": tensor_shape.TensorShape([]),
-                  "y": tensor_shape.TensorShape([None]),
-                  "z": tensor_shape.TensorShape([3])
-              })))
+          (dataset_ops.Dataset.from_tensors(bucket),
+           window.padded_batch(
+               32, {
+                   "x": tensor_shape.TensorShape([]),
+                   "y": tensor_shape.TensorShape([None]),
+                   "z": tensor_shape.TensorShape([3])
+               })))
 
     input_dataset = (
         dataset_ops.Dataset.from_tensor_slices(math_ops.range(128)).map(_map_fn)
diff --git a/tensorflow/contrib/data/python/kernel_tests/cache_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/cache_dataset_op_test.py
deleted file mode 100644
index 9818020680afb9d0f0197d272ec5339c6358db36..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/data/python/kernel_tests/cache_dataset_op_test.py
+++ /dev/null
@@ -1,300 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from os import path
-import shutil
-import tempfile
-
-import numpy as np
-
-from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-
-
-class FilesystemCacheDatasetTest(test.TestCase):
-
-  def setUp(self):
-    self.tmp_dir = tempfile.mkdtemp()
-    self.cache_prefix = path.join(self.tmp_dir, "cache")
-
-  def tearDown(self):
-    if self.tmp_dir:
-      shutil.rmtree(self.tmp_dir, ignore_errors=True)
-
-  def testCacheDatasetPassthrough(self):
-    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
-                  np.array([9.0, 10.0, 11.0, 12.0]))
-    count_placeholder = array_ops.placeholder_with_default(
-        constant_op.constant(5, dtypes.int64), shape=[])
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-
-    repeat_dataset = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .repeat(count_placeholder))
-
-    cache_dataset = repeat_dataset.cache(filename_placeholder)
-
-    self.assertEqual(
-        tuple([c.shape[1:] for c in components]), cache_dataset.output_shapes)
-
-    # Create initialization ops for iterators without and with
-    # caching, respectively.
-    iterator = iterator_ops.Iterator.from_structure(cache_dataset.output_types,
-                                                    cache_dataset.output_shapes)
-    init_fifo_op = iterator.make_initializer(repeat_dataset)
-    init_cache_op = iterator.make_initializer(cache_dataset)
-
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      # First run without caching to collect the "ground truth".
-      sess.run(init_fifo_op)
-      elements = []
-      for _ in range(20):
-        elements.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Assert that the cached dataset has the same elements as the
-      # "ground truth".
-      sess.run(
-          init_cache_op, feed_dict={filename_placeholder: self.cache_prefix})
-      cached_elements = []
-      for _ in range(20):
-        cached_elements.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertAllEqual(elements, cached_elements)
-
-      # Re-initialize with an empty upstream (to throw errors.OutOfRangeError
-      # if we didn't use the cache).
-      sess.run(
-          init_cache_op,
-          feed_dict={
-              count_placeholder: 0,
-              filename_placeholder: self.cache_prefix
-          })
-      replayed_elements = []
-      for _ in range(20):
-        replayed_elements.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertEqual(cached_elements, replayed_elements)
-
-      # Re-initialize with an empty upstream and a missing cache file (should
-      # throw errors.OutOfRangeError immediately).
-      sess.run(
-          init_cache_op,
-          feed_dict={
-              count_placeholder: 0,
-              filename_placeholder: self.cache_prefix + "nonsense"
-          })
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testConcurrentWriters(self):
-    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
-                  np.array([9.0, 10.0, 11.0, 12.0]))
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-
-    cache_dataset1 = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .cache(filename_placeholder))
-    cache_dataset2 = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .cache(filename_placeholder))
-
-    iterator1 = cache_dataset1.make_initializable_iterator()
-    iterator2 = cache_dataset2.make_initializable_iterator()
-    init_cache_op1 = iterator1.initializer
-    init_cache_op2 = iterator2.initializer
-
-    get_next1 = iterator1.get_next()
-    get_next2 = iterator2.get_next()
-
-    with self.test_session() as sess:
-      sess.run(
-          init_cache_op1, feed_dict={filename_placeholder: self.cache_prefix})
-      sess.run(get_next1)  # this should succeed
-
-      sess.run(
-          init_cache_op2, feed_dict={filename_placeholder: self.cache_prefix})
-      with self.assertRaises(errors.AlreadyExistsError):
-        sess.run(get_next2)
-
-      sess.run(get_next1)  # this should continue to succeed
-
-  def testConcurrentReaders(self):
-    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
-                  np.array([9.0, 10.0, 11.0, 12.0]))
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-
-    cache_dataset1 = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .cache(filename_placeholder))
-    cache_dataset2 = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .cache(filename_placeholder))
-
-    iterator1 = cache_dataset1.make_initializable_iterator()
-    iterator2 = cache_dataset2.make_initializable_iterator()
-    init_cache_op1 = iterator1.initializer
-    init_cache_op2 = iterator2.initializer
-
-    get_next1 = iterator1.get_next()
-    get_next2 = iterator2.get_next()
-
-    with self.test_session() as sess:
-      sess.run(
-          init_cache_op1, feed_dict={filename_placeholder: self.cache_prefix})
-      elements = []
-      for _ in range(4):
-        elements.append(sess.run(get_next1))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next1)
-
-      # Re-initialize
-      sess.run(
-          init_cache_op1, feed_dict={filename_placeholder: self.cache_prefix})
-      sess.run(
-          init_cache_op2, feed_dict={filename_placeholder: self.cache_prefix})
-
-      # Reading concurrently should succeed.
-      elements_itr1 = []
-      elements_itr2 = []
-      elements_itr2.append(sess.run(get_next2))
-      elements_itr1.append(sess.run(get_next1))
-      elements_itr2.append(sess.run(get_next2))
-      elements_itr1.append(sess.run(get_next1))
-      # Intentionally reversing the order
-      elements_itr1.append(sess.run(get_next1))
-      elements_itr2.append(sess.run(get_next2))
-      elements_itr1.append(sess.run(get_next1))
-      elements_itr2.append(sess.run(get_next2))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next2)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next1)
-
-      self.assertAllEqual(elements, elements_itr1)
-      self.assertAllEqual(elements, elements_itr2)
-
-
-class MemoryCacheDatasetTest(test.TestCase):
-
-  def testCacheDatasetPassthrough(self):
-    repeat_count = variables.Variable(constant_op.constant(10, dtypes.int64))
-    dataset = dataset_ops.Dataset.range(3).flat_map(
-        lambda x: dataset_ops.Dataset.from_tensors(x).repeat(repeat_count))
-
-    cached_dataset = dataset.cache().repeat(2)
-    uncached_dataset = dataset.repeat(2)
-
-    # Needs to be initializable to capture the variable.
-    cached_iterator = cached_dataset.make_initializable_iterator()
-    cached_next = cached_iterator.get_next()
-    uncached_iterator = uncached_dataset.make_initializable_iterator()
-    uncached_next = uncached_iterator.get_next()
-
-    with self.test_session() as sess:
-
-      sess.run(repeat_count.initializer)
-      sess.run(cached_iterator.initializer)
-      sess.run(uncached_iterator.initializer)
-
-      for i in range(3):
-        for _ in range(10):
-          self.assertEqual(sess.run(cached_next), i)
-          self.assertEqual(sess.run(uncached_next), i)
-
-      sess.run(repeat_count.assign(0))
-
-      # The uncached iterator should now be empty.
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(uncached_next)
-
-      # The cached iterator replays from cache.
-      for i in range(3):
-        for _ in range(10):
-          self.assertEqual(sess.run(cached_next), i)
-
-      # The cached iterator should now be empty.
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(cached_next)
-
-  def testEmptyCacheReading(self):
-    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
-                  np.array([9.0, 10.0, 11.0, 12.0]))
-    count_placeholder = array_ops.placeholder_with_default(
-        constant_op.constant(5, dtypes.int64), shape=[])
-
-    repeat_dataset = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .repeat(count_placeholder))
-
-    cache_dataset = repeat_dataset.cache()
-
-    # Create initialization ops for iterators without and with
-    # caching, respectively.
-    iterator = cache_dataset.make_initializable_iterator()
-    init_cache_op = iterator.initializer
-
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      # Initialize with an empty upstream and a missing cache file (should
-      # throw errors.OutOfRangeError immediately).
-      sess.run(init_cache_op, feed_dict={count_placeholder: 0})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testConcurrentReaders(self):
-    count_placeholder = array_ops.placeholder_with_default(
-        constant_op.constant(5, dtypes.int64), shape=[])
-    dataset = dataset_ops.Dataset.range(count_placeholder).cache()
-    d1 = dataset.map(lambda x: x + 1)
-    d2 = dataset.map(lambda x: x + 6)
-
-    i1 = d1.make_initializable_iterator()
-    i2 = d2.make_initializable_iterator()
-
-    with self.test_session() as sess:
-      sess.run(i1.initializer)
-
-      self.assertEqual(1, sess.run(i1.get_next()))
-      self.assertEqual(2, sess.run(i1.get_next()))
-      self.assertEqual(3, sess.run(i1.get_next()))
-
-      sess.run(i2.initializer, feed_dict={count_placeholder: 3})
-
-      self.assertEqual(6, sess.run(i2.get_next()))
-      self.assertEqual(7, sess.run(i2.get_next()))
-      self.assertEqual(4, sess.run(i1.get_next()))  # interleave execution
-      self.assertEqual([8, 5], sess.run([i2.get_next(), i1.get_next()]))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(i1.get_next())
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(i2.get_next())
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/concatenate_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/concatenate_dataset_op_test.py
index 870352209a08e6bc08bcca227ba455ad1851e8bf..17f2980157ddd0350dafd1d745cbb9b64e65f7c5 100644
--- a/tensorflow/contrib/data/python/kernel_tests/concatenate_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/concatenate_dataset_op_test.py
@@ -17,255 +17,32 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.contrib.data.python.ops import iterator_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.platform import test
-from tensorflow.python.training import saver as saver_lib
 
 
-class ConcatenateDatasetTest(test.TestCase):
+class ConcatenateDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
 
-  def testConcatenateDataset(self):
-    input_components = (
-        np.tile(np.array([[1], [2], [3], [4]]), 20),
-        np.tile(np.array([[12], [13], [14], [15]]), 15),
-        np.array([37.0, 38.0, 39.0, 40.0]))
-    to_concatenate_components = (
-        np.tile(np.array([[1], [2], [3], [4], [5]]), 20),
-        np.tile(np.array([[12], [13], [14], [15], [16]]), 15),
-        np.array([37.0, 38.0, 39.0, 40.0, 41.0]))
-
-    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
-    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
-        to_concatenate_components)
-    concatenated = input_dataset.concatenate(dataset_to_concatenate)
-    self.assertEqual(concatenated.output_shapes, (tensor_shape.TensorShape(
-        [20]), tensor_shape.TensorShape([15]), tensor_shape.TensorShape([])))
-
-    iterator = concatenated.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(9):
-        result = sess.run(get_next)
-        if i < 4:
-          for component, result_component in zip(input_components, result):
-            self.assertAllEqual(component[i], result_component)
-        else:
-          for component, result_component in zip(to_concatenate_components,
-                                                 result):
-            self.assertAllEqual(component[i - 4], result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testConcatenateDatasetDifferentShape(self):
-    input_components = (
-        np.tile(np.array([[1], [2], [3], [4]]), 20),
-        np.tile(np.array([[12], [13], [14], [15]]), 4))
-    to_concatenate_components = (
-        np.tile(np.array([[1], [2], [3], [4], [5]]), 20),
-        np.tile(np.array([[12], [13], [14], [15], [16]]), 15))
-
-    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
-    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
-        to_concatenate_components)
-    concatenated = input_dataset.concatenate(dataset_to_concatenate)
-    self.assertEqual(
-        [ts.as_list()
-         for ts in nest.flatten(concatenated.output_shapes)], [[20], [None]])
-
-    iterator = concatenated.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(9):
-        result = sess.run(get_next)
-        if i < 4:
-          for component, result_component in zip(input_components, result):
-            self.assertAllEqual(component[i], result_component)
-        else:
-          for component, result_component in zip(to_concatenate_components,
-                                                 result):
-            self.assertAllEqual(component[i - 4], result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testConcatenateDatasetDifferentStructure(self):
-    input_components = (
-        np.tile(np.array([[1], [2], [3], [4]]), 5),
-        np.tile(np.array([[12], [13], [14], [15]]), 4))
-    to_concatenate_components = (
-        np.tile(np.array([[1], [2], [3], [4], [5]]), 20),
-        np.tile(np.array([[12], [13], [14], [15], [16]]), 15),
-        np.array([37.0, 38.0, 39.0, 40.0, 41.0]))
-
-    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
-    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
-        to_concatenate_components)
-
-    with self.assertRaisesRegexp(ValueError,
-                                 "don't have the same number of elements"):
-      input_dataset.concatenate(dataset_to_concatenate)
-
-  def testConcatenateDatasetDifferentType(self):
-    input_components = (
-        np.tile(np.array([[1], [2], [3], [4]]), 5),
-        np.tile(np.array([[12], [13], [14], [15]]), 4))
-    to_concatenate_components = (
-        np.tile(np.array([[1.0], [2.0], [3.0], [4.0]]), 5),
-        np.tile(np.array([[12], [13], [14], [15]]), 15))
-
-    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
-    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
-        to_concatenate_components)
-
-    with self.assertRaisesRegexp(TypeError, "have different types"):
-      input_dataset.concatenate(dataset_to_concatenate)
-
-  def _iterator_checkpoint_prefix(self):
-    return os.path.join(self.get_temp_dir(), "iterator")
-
-  def _build_graph(self, input_components, to_concatenate_components):
-    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
-    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
-        to_concatenate_components)
-    iterator = input_dataset.concatenate(
-        dataset_to_concatenate).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-    saveable = iterator_ops.make_saveable_from_iterator(iterator)
-    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
-    # TODO(shivaniagrawal) : non-intuitive way, add support in mata_graph
-    for t in nest.flatten(get_next):
-      ops.add_to_collection("get_next", t)
-    return init_op, get_next
-
-  def _testSaveRestoreUtility(self, start, break_range, stop):
-    path = self._iterator_checkpoint_prefix()
-    step = 0
-    meta_filename = path + "-%d.meta" % step
-
-    input_components = (np.tile(np.array([[1], [2], [3], [4]]), 20), np.tile(
-        np.array([[12], [13], [14], [15]]), 4))
-    to_concatenate_components = (np.tile(
-        np.array([[5], [6], [7], [8], [9]]), 20), np.tile(
-            np.array([[16], [17], [18], [19], [20]]), 15))
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next = self._build_graph(input_components,
-                                            to_concatenate_components)
-      saver = saver_lib.Saver()
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        for i in range(start, break_range):
-          result = sess.run(get_next)
-          if i < 4:
-            for component, result_component in zip(input_components, result):
-              self.assertAllEqual(component[i], result_component)
-          else:
-            for component, result_component in zip(to_concatenate_components,
-                                                   result):
-              self.assertAllEqual(component[i - 4], result_component)
-        saver.save(sess, path, step)
-
-    with ops.Graph().as_default() as g:
-      saver = saver_lib.import_meta_graph(meta_filename)
-      with self.test_session(graph=g) as sess:
-        get_next = nest.pack_sequence_as(("a", "b"),
-                                         ops.get_collection("get_next"))
-        saver.restore(sess, saver_lib.latest_checkpoint(self.get_temp_dir()))
-        for i in range(break_range, stop):
-          result = sess.run(get_next)
-          if i < 4:
-            for component, result_component in zip(input_components, result):
-              self.assertAllEqual(component[i], result_component)
-          else:
-            for component, result_component in zip(to_concatenate_components,
-                                                   result):
-              self.assertAllEqual(component[i - 4], result_component)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testRestoreAtFirstDataset(self):
-    start = 0
-    stop = 9
-    break_range = 3
-    self._testSaveRestoreUtility(start, break_range, stop)
-
-  def testRestoreAtSecondDataset(self):
-    start = 0
-    stop = 9
-    break_range = 6
-    self._testSaveRestoreUtility(start, break_range, stop)
-
-  def testRestoreAtBetweenDatasets(self):
-    start = 0
-    stop = 9
-    break_range = 4
-    self._testSaveRestoreUtility(start, break_range, stop)
-
-  def testRestoreExhaustedIterator(self):
-    start = 0
-    stop = 9
-    break_range = 9
-    self._testSaveRestoreUtility(start, break_range, stop)
-
-  def testRestoreInModifiedGraph(self):
-    start = 0
-    stop = 9
-    break_range = 6
-    path = self._iterator_checkpoint_prefix()
-    step = 0
-
-    input_components = (np.tile(np.array([[1], [2], [3], [4]]), 20), np.tile(
-        np.array([[12], [13], [14], [15]]), 4))
+  def _build_concatenate_dataset(self, var_array):
+    input_components = (np.tile(np.array([[1], [2], [3], [4]]), 20),
+                        np.tile(np.array([[12], [13], [14], [15]]), 4))
     to_concatenate_components = (np.tile(
-        np.array([[5], [6], [7], [8], [9]]), 20), np.tile(
-            np.array([[16], [17], [18], [19], [20]]), 15))
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next = self._build_graph(input_components,
-                                            to_concatenate_components)
-      saver = saver_lib.Saver(allow_empty=True)
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        for i in range(start, break_range):
-          result = sess.run(get_next)
-          if i < 4:
-            for component, result_component in zip(input_components, result):
-              self.assertAllEqual(component[i], result_component)
-          else:
-            for component, result_component in zip(to_concatenate_components,
-                                                   result):
-              self.assertAllEqual(component[i - 4], result_component)
-        saver.save(sess, path, step)
-
-    new_to_concatenate_components = (np.array([[5], [6], [7], [8], [9]]),
-                                     np.array([[16], [17], [18], [19], [20]]))
-    with ops.Graph().as_default() as g:
-      init_op, get_next = self._build_graph(input_components,
-                                            new_to_concatenate_components)
-      saver = saver_lib.Saver()
-      with self.test_session(graph=g) as sess:
-        saver.restore(sess, saver_lib.latest_checkpoint(self.get_temp_dir()))
-        for i in range(break_range, stop):
-          result = sess.run(get_next)
-          for component, result_component in zip(to_concatenate_components,
-                                                 result):
-            self.assertAllEqual(component[i - 4], result_component)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+        np.array([[5], [6], [7], [8], [9]]), 20), var_array)
+
+    return dataset_ops.Dataset.from_tensor_slices(input_components).concatenate(
+        dataset_ops.Dataset.from_tensor_slices(to_concatenate_components))
+
+  def testConcatenateCore(self):
+    num_outputs = 9
+    array = np.tile(np.array([[16], [17], [18], [19], [20]]), 15)
+    diff_array = np.array([[1], [2], [3], [4], [5]])
+    self.run_core_tests(lambda: self._build_concatenate_dataset(array),
+                        lambda: self._build_concatenate_dataset(diff_array),
+                        num_outputs)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py b/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py
index 55a1d3b95b212466b262ad3c26f1efd7ed0e067e..a842502cc6fe3605dde0be5f50cf46e3e37d7ed4 100644
--- a/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py
@@ -17,712 +17,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import threading
-
 import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import batching
-from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
 
 
 class DatasetConstructorTest(test.TestCase):
 
-  def testFromTensors(self):
-    """Test an dataset that represents a single tuple of tensors."""
-    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
-
-    iterator = (dataset_ops.Dataset.from_tensors(components)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape for c in components],
-                     [t.shape for t in get_next])
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      results = sess.run(get_next)
-      for component, result_component in zip(components, results):
-        self.assertAllEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def assertSparseValuesEqual(self, a, b):
-    self.assertAllEqual(a.indices, b.indices)
-    self.assertAllEqual(a.values, b.values)
-    self.assertAllEqual(a.dense_shape, b.dense_shape)
-
-  def testFromTensorsSparse(self):
-    """Test an dataset that represents a single tuple of tensors."""
-    components = (sparse_tensor.SparseTensorValue(
-        indices=np.array([[0]]),
-        values=np.array([0]),
-        dense_shape=np.array([1])),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0, 0], [1, 1]]),
-                      values=np.array([-1, 1]),
-                      dense_shape=np.array([2, 2])))
-
-    iterator = (
-        dataset_ops.Dataset.from_tensors(components)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual(
-        [tensor_shape.TensorShape(c.dense_shape) for c in components],
-        [shape for shape in iterator.output_shapes])
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      results = sess.run(get_next)
-      for component, result_component in zip(components, results):
-        self.assertSparseValuesEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromTensorsMixed(self):
-    """Test an dataset that represents a single tuple of tensors."""
-    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0]]),
-                      values=np.array([0]),
-                      dense_shape=np.array([1])),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0, 0], [1, 1]]),
-                      values=np.array([-1, 1]),
-                      dense_shape=np.array([2, 2])))
-
-    iterator = (
-        dataset_ops.Dataset.from_tensors(components)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([
-        tensor_shape.TensorShape(c.dense_shape)
-        if sparse_tensor.is_sparse(c) else c.shape for c in components
-    ], [shape for shape in iterator.output_shapes])
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      results = sess.run(get_next)
-      for component, result_component in zip(components, results):
-        if sparse_tensor.is_sparse(component):
-          self.assertSparseValuesEqual(component, result_component)
-        else:
-          self.assertAllEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromTensorSlices(self):
-    """Test an dataset that represents the slices from a tuple of tensors."""
-    components = (
-        np.tile(np.array([[1], [2], [3], [4]]), 20), np.tile(
-            np.array([[12], [13], [14], [15]]), 22),
-        np.array([37.0, 38.0, 39.0, 40.0])
-    )
-
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(4):
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component[i], result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromTensorSlicesSparse(self):
-    """Test an dataset that represents the slices from a tuple of tensors."""
-    components = (sparse_tensor.SparseTensorValue(
-        indices=np.array([[0, 0], [1, 0], [2, 0]]),
-        values=np.array([0, 0, 0]),
-        dense_shape=np.array([3, 1])),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0, 0], [1, 1], [2, 2]]),
-                      values=np.array([1, 2, 3]),
-                      dense_shape=np.array([3, 3])))
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual(
-        [tensor_shape.TensorShape(c.dense_shape[1:]) for c in components],
-        [shape for shape in iterator.output_shapes])
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      expected = [
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[0]]),
-               values=np.array([1]),
-               dense_shape=np.array([3]))),
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[1]]),
-               values=np.array([2]),
-               dense_shape=np.array([3]))),
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[2]]),
-               values=np.array([3]),
-               dense_shape=np.array([3]))),
-      ]
-      for i in range(3):
-        results = sess.run(get_next)
-        for component, result_component in zip(expected[i], results):
-          self.assertSparseValuesEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromTensorSlicesMixed(self):
-    """Test an dataset that represents the slices from a tuple of tensors."""
-    components = (np.tile(np.array([[1], [2], [3]]), 20),
-                  np.tile(np.array([[12], [13], [14]]), 22),
-                  np.array([37.0, 38.0, 39.0]),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0, 0], [1, 0], [2, 0]]),
-                      values=np.array([0, 0, 0]),
-                      dense_shape=np.array([3, 1])),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0, 0], [1, 1], [2, 2]]),
-                      values=np.array([1, 2, 3]),
-                      dense_shape=np.array([3, 3])))
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([
-        tensor_shape.TensorShape(c.dense_shape[1:])
-        if sparse_tensor.is_sparse(c) else c.shape[1:] for c in components
-    ], [shape for shape in iterator.output_shapes])
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      expected = [
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[0]]),
-               values=np.array([1]),
-               dense_shape=np.array([3]))),
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[1]]),
-               values=np.array([2]),
-               dense_shape=np.array([3]))),
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[2]]),
-               values=np.array([3]),
-               dense_shape=np.array([3]))),
-      ]
-      for i in range(3):
-        results = sess.run(get_next)
-        for component, result_component in zip(
-            (zip(*components[:3])[i] + expected[i]), results):
-          if sparse_tensor.is_sparse(component):
-            self.assertSparseValuesEqual(component, result_component)
-          else:
-            self.assertAllEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromTensorSlicesWithDict(self):
-    components = {"foo": [1, 2, 3], "bar": [[4.0], [5.0], [6.0]]}
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual(dtypes.int32, iterator.output_types["foo"])
-    self.assertEqual(dtypes.float32, iterator.output_types["bar"])
-    self.assertEqual((), iterator.output_shapes["foo"])
-    self.assertEqual((1,), iterator.output_shapes["bar"])
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(3):
-        results = sess.run(get_next)
-        self.assertEqual(components["foo"][i], results["foo"])
-        self.assertEqual(components["bar"][i], results["bar"])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromSparseTensorSlices(self):
-    """Test a dataset based on slices of a `tf.SparseTensor`."""
-    st = array_ops.sparse_placeholder(dtypes.float64)
-    iterator = (dataset_ops.Dataset.from_sparse_tensor_slices(st)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = sparse_tensor.SparseTensor(*iterator.get_next())
-
-    with self.test_session() as sess:
-      slices = [[1., 2., 3.], [1.], [1.], [1., 2.], [], [1., 2.], [], [], []]
-
-      # Test with sparse tensor in the appropriate order.
-      indices = np.array(
-          [[i, j] for i in range(len(slices)) for j in range(len(slices[i]))])
-      values = np.array([val for s in slices for val in s])
-      dense_shape = np.array([len(slices), max(len(s) for s in slices) + 1])
-      sparse_feed = sparse_tensor.SparseTensorValue(indices, values,
-                                                    dense_shape)
-      sess.run(init_op, feed_dict={st: sparse_feed})
-      for i, s in enumerate(slices):
-        results = sess.run(get_next)
-        self.assertAllEqual(s, results.values)
-        expected_indices = np.array(
-            [[j] for j in range(len(slices[i]))]).reshape([-1, 1])
-        self.assertAllEqual(expected_indices, results.indices)
-        self.assertAllEqual(dense_shape[1:], results.dense_shape)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test with sparse tensor in the reverse order, which is not
-      # currently supported.
-      reverse_order_indices = indices[::-1, :]
-      reverse_order_values = values[::-1]
-      sparse_feed = sparse_tensor.SparseTensorValue(
-          reverse_order_indices, reverse_order_values, dense_shape)
-      with self.assertRaises(errors.UnimplementedError):
-        sess.run(init_op, feed_dict={st: sparse_feed})
-
-      # Test with an empty sparse tensor.
-      empty_indices = np.empty((0, 4), dtype=np.int64)
-      empty_values = np.empty((0,), dtype=np.float64)
-      empty_dense_shape = [0, 4, 37, 9]
-      sparse_feed = sparse_tensor.SparseTensorValue(empty_indices, empty_values,
-                                                    empty_dense_shape)
-      sess.run(init_op, feed_dict={st: sparse_feed})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  # pylint: disable=g-long-lambda,unnecessary-lambda
-  def testNestedStructure(self):
-    components = (np.array([1, 2, 3]), (np.array([4., 5.]), np.array([6., 7.])),
-                  np.array([8, 9, 10]))
-
-    dataset = dataset_ops.Dataset.from_tensors(components)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
-
-    dataset = dataset.shuffle(10, 10)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
-
-    dataset = dataset.repeat(-1)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
-
-    dataset = dataset.filter(lambda x, y, z: True)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
-
-    dataset = dataset.take(5)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
-
-    dataset = dataset.map(lambda x, y, z: ((x, z), (y[0], y[1])))
-    self.assertEquals(((dtypes.int64, dtypes.int64),
-                       (dtypes.float64, dtypes.float64)), dataset.output_types)
-    self.assertEquals((([3], [3]), ([2], [2])), dataset.output_shapes)
-
-    dataset = dataset.flat_map(
-        lambda x, y: dataset_ops.Dataset.from_tensors(((x[0], x[1]),
-                                                       (y[0], y[1])))
-    )
-    self.assertEquals(((dtypes.int64, dtypes.int64),
-                       (dtypes.float64, dtypes.float64)), dataset.output_types)
-    self.assertEquals((([3], [3]), ([2], [2])), dataset.output_shapes)
-
-    dataset = dataset.batch(32)
-    self.assertEquals(((dtypes.int64, dtypes.int64),
-                       (dtypes.float64, dtypes.float64)), dataset.output_types)
-    self.assertEquals((([None, 3], [None, 3]), ([None, 2], [None, 2])),
-                      nest.pack_sequence_as(dataset.output_shapes, [
-                          s.as_list()
-                          for s in nest.flatten(dataset.output_shapes)
-                      ]))
-
-    iterator = dataset.make_one_shot_iterator()
-    (w, x), (y, z) = iterator.get_next()
-    self.assertEquals(dtypes.int64, w.dtype)
-    self.assertEquals(dtypes.int64, x.dtype)
-    self.assertEquals(dtypes.float64, y.dtype)
-    self.assertEquals(dtypes.float64, z.dtype)
-    self.assertEquals([None, 3], w.shape.as_list())
-    self.assertEquals([None, 3], x.shape.as_list())
-    self.assertEquals([None, 2], y.shape.as_list())
-    self.assertEquals([None, 2], z.shape.as_list())
-
-    iterator = dataset.make_initializable_iterator()
-    (w, x), (y, z) = iterator.get_next()
-    self.assertEquals(dtypes.int64, w.dtype)
-    self.assertEquals(dtypes.int64, x.dtype)
-    self.assertEquals(dtypes.float64, y.dtype)
-    self.assertEquals(dtypes.float64, z.dtype)
-    self.assertEquals([None, 3], w.shape.as_list())
-    self.assertEquals([None, 3], x.shape.as_list())
-    self.assertEquals([None, 2], y.shape.as_list())
-    self.assertEquals([None, 2], z.shape.as_list())
-
-    # Define a separate set of components with matching leading
-    # dimension for the from-slices constructor.
-    components_for_slices = (np.array([1, 2, 3]), (np.array(
-        [4., 5., 6.]), np.array([7., 8., 9.])), np.array([10, 11, 12]))
-
-    dataset = dataset_ops.Dataset.from_tensor_slices(components_for_slices)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([], ([], []), []), dataset.output_shapes)
-
-  def testNestedDict(self):
-    components = {"a": {"aa": 1, "ab": [2.0, 2.0]}, "b": [3, 3, 3]}
-    dataset = dataset_ops.Dataset.from_tensors(components)
-    self.assertEquals(dtypes.int32, dataset.output_types["a"]["aa"])
-    self.assertEquals(dtypes.float32, dataset.output_types["a"]["ab"])
-    self.assertEquals(dtypes.int32, dataset.output_types["b"])
-    self.assertEquals([], dataset.output_shapes["a"]["aa"])
-    self.assertEquals([2], dataset.output_shapes["a"]["ab"])
-    self.assertEquals([3], dataset.output_shapes["b"])
-
-  def testNonSequenceNestedStructure(self):
-    components = np.array([1, 2, 3])
-
-    dataset = dataset_ops.Dataset.from_tensors(components)
-    self.assertEquals(dtypes.int64, dataset.output_types)
-    self.assertEquals([3], dataset.output_shapes)
-
-    dataset = dataset.filter(
-        lambda x: math_ops.reduce_all(math_ops.equal(x, components)))
-    self.assertEquals(dtypes.int64, dataset.output_types)
-    self.assertEquals([3], dataset.output_shapes)
-
-    dataset = dataset.map(lambda x: array_ops.stack([x, x]))
-    self.assertEquals(dtypes.int64, dataset.output_types)
-    self.assertEquals([2, 3], dataset.output_shapes)
-
-    dataset = dataset.flat_map(
-        lambda x: dataset_ops.Dataset.from_tensor_slices(x))
-    self.assertEquals(dtypes.int64, dataset.output_types)
-    self.assertEquals([3], dataset.output_shapes)
-
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-    self.assertEquals(dtypes.int64, get_next.dtype)
-    self.assertEquals([3], get_next.shape)
-
-  def _testFromGenerator(self, generator, elem_sequence, num_repeats):
-    iterator = (
-        dataset_ops.Dataset.from_generator(generator, output_types=dtypes.int64)
-        .repeat(num_repeats)
-        .prefetch(5)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      for _ in range(2):  # Run twice to test reinitialization.
-        sess.run(init_op)
-        for _ in range(num_repeats):
-          for elem in elem_sequence:
-            self.assertAllEqual(elem, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def _testFromGeneratorOneShot(self, generator, elem_sequence, num_repeats):
-    iterator = (
-        dataset_ops.Dataset.from_generator(generator, output_types=dtypes.int64)
-        .repeat(num_repeats)
-        .prefetch(5)
-        .make_one_shot_iterator())
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      for _ in range(num_repeats):
-        for elem in elem_sequence:
-          self.assertAllEqual(elem, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromGeneratorUsingFunction(self):
-    def generator():
-      for i in range(1, 100):
-        yield [i] * i
-    elem_sequence = list(generator())
-    self._testFromGenerator(generator, elem_sequence, 1)
-    self._testFromGenerator(generator, elem_sequence, 5)
-    self._testFromGeneratorOneShot(generator, elem_sequence, 1)
-    self._testFromGeneratorOneShot(generator, elem_sequence, 5)
-
-  def testFromGeneratorUsingList(self):
-    generator = lambda: [[i] * i for i in range(1, 100)]
-    elem_sequence = list(generator())
-    self._testFromGenerator(generator, elem_sequence, 1)
-    self._testFromGenerator(generator, elem_sequence, 5)
-
-  def testFromGeneratorUsingNdarray(self):
-    generator = lambda: np.arange(100, dtype=np.int64)
-    elem_sequence = list(generator())
-    self._testFromGenerator(generator, elem_sequence, 1)
-    self._testFromGenerator(generator, elem_sequence, 5)
-
-  def testFromGeneratorUsingGeneratorExpression(self):
-    # NOTE(mrry): Generator *expressions* are not repeatable (or in
-    # general reusable), because they eagerly evaluate the `for`
-    # expression as `iter(range(1, 100))` and discard the means of
-    # reconstructing `range(1, 100)`. Wrapping the generator
-    # expression in a `lambda` makes it repeatable.
-    generator = lambda: ([i] * i for i in range(1, 100))
-    elem_sequence = list(generator())
-    self._testFromGenerator(generator, elem_sequence, 1)
-    self._testFromGenerator(generator, elem_sequence, 5)
-
-  def testFromMultipleConcurrentGenerators(self):
-    num_inner_repeats = 5
-    num_outer_repeats = 100
-
-    def generator():
-      for i in range(1, 10):
-        yield ([i] * i, [i, i ** 2, i ** 3])
-    input_list = list(generator())
-
-    # The interleave transformation is essentially a flat map that
-    # draws from multiple input datasets concurrently (in a cyclic
-    # fashion). By placing `Datsaet.from_generator()` inside an
-    # interleave, we test its behavior when multiple iterators are
-    # active at the same time; by additionally prefetching inside the
-    # interleave, we create the possibility of parallel (modulo GIL)
-    # invocations to several iterators created by the same dataset.
-    def interleave_fn(_):
-      return (dataset_ops.Dataset.from_generator(
-          generator, output_types=(dtypes.int64, dtypes.int64),
-          output_shapes=([None], [3]))
-              .repeat(num_inner_repeats).prefetch(5))
-
-    iterator = (
-        dataset_ops.Dataset.range(num_outer_repeats)
-        .interleave(interleave_fn, cycle_length=10,
-                    block_length=len(input_list))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for _ in range(num_inner_repeats * num_outer_repeats):
-        for elem in input_list:
-          val0, val1 = sess.run(get_next)
-          self.assertAllEqual(elem[0], val0)
-          self.assertAllEqual(elem[1], val1)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromGeneratorsRunningInParallel(self):
-    num_parallel_iterators = 3
-
-    # Define shared state that multiple iterator instances will access to
-    # demonstrate their concurrent activity.
-    lock = threading.Lock()
-    condition = threading.Condition(lock)
-    next_ticket = [0]  # GUARDED_BY(lock)
-
-    def generator():
-      # NOTE(mrry): We yield one element before the barrier, because
-      # the current implementation of `Dataset.interleave()` must
-      # fetch one element from each incoming dataset to start the
-      # prefetching.
-      yield 0
-
-      # Define a barrier that `num_parallel_iterators` iterators must enter
-      # before any can proceed. Demonstrates that multiple iterators may be
-      # active at the same time.
-      condition.acquire()
-      ticket = next_ticket[0]
-      next_ticket[0] += 1
-      if ticket == num_parallel_iterators - 1:
-        # The last iterator to join the barrier notifies the others.
-        condition.notify_all()
-      else:
-        # Wait until the last iterator enters the barrier.
-        while next_ticket[0] < num_parallel_iterators:
-          condition.wait()
-      condition.release()
-
-      yield 1
-
-    # As in `testFromMultipleConcurrentGenerators()`, we use a combination of
-    # `Dataset.interleave()` and `Dataset.prefetch()` to cause multiple
-    # iterators to be active concurrently.
-    def interleave_fn(_):
-      return dataset_ops.Dataset.from_generator(
-          generator, output_types=dtypes.int64, output_shapes=[]).prefetch(2)
-
-    iterator = (
-        dataset_ops.Dataset.range(num_parallel_iterators)
-        .interleave(
-            interleave_fn, cycle_length=num_parallel_iterators, block_length=1)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for elem in [0, 1]:
-        for _ in range(num_parallel_iterators):
-          self.assertAllEqual(elem, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromGeneratorImplicitConversion(self):
-    def generator():
-      yield [1]
-      yield [2]
-      yield [3]
-
-    for dtype in [dtypes.int8, dtypes.int32, dtypes.int64]:
-      iterator = (dataset_ops.Dataset.from_generator(
-          generator, output_types=dtype, output_shapes=[1])
-                  .make_initializable_iterator())
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-
-      self.assertEqual(dtype, get_next.dtype)
-
-      with self.test_session() as sess:
-        sess.run(init_op)
-        for expected in [[1], [2], [3]]:
-          next_val = sess.run(get_next)
-          self.assertEqual(dtype.as_numpy_dtype, next_val.dtype)
-          self.assertAllEqual(expected, next_val)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testFromGeneratorTypeError(self):
-    def generator():
-      yield np.array([1, 2, 3], dtype=np.int64)
-      yield np.array([4, 5, 6], dtype=np.int64)
-      yield "ERROR"
-      yield np.array([7, 8, 9], dtype=np.int64)
-
-    iterator = (dataset_ops.Dataset.from_generator(
-        generator, output_types=dtypes.int64, output_shapes=[3])
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual([1, 2, 3], sess.run(get_next))
-      self.assertAllEqual([4, 5, 6], sess.run(get_next))
-      with self.assertRaisesOpError(r"invalid literal for long\(\)"):
-        sess.run(get_next)
-      self.assertAllEqual([7, 8, 9], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromGeneratorShapeError(self):
-    def generator():
-      yield np.array([1, 2, 3], dtype=np.int64)
-      yield np.array([4, 5, 6], dtype=np.int64)
-      yield np.array([7, 8, 9, 10], dtype=np.int64)
-      yield np.array([11, 12, 13], dtype=np.int64)
-
-    iterator = (dataset_ops.Dataset.from_generator(
-        generator, output_types=dtypes.int64, output_shapes=[3])
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual([1, 2, 3], sess.run(get_next))
-      self.assertAllEqual([4, 5, 6], sess.run(get_next))
-      with self.assertRaisesOpError(r"element of shape \(3,\) was expected"):
-        sess.run(get_next)
-      self.assertAllEqual([11, 12, 13], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testSplitPipelineFailsWithPlacementError(self):
-    with session.Session(
-        target="",
-        config=config_pb2.ConfigProto(device_count={"CPU": 2})) as sess:
-
-      dataset = dataset_ops.Dataset.from_tensors(0)
-
-      # Define a pipeline that attempts to use variables on two
-      # different devices.
-      #
-      # Initialize the variables before creating to iterator, to avoid the
-      # placement algorithm overriding the DT_RESOURCE colocation constraints.
-      with ops.device("/cpu:0"):
-        var_0 = resource_variable_ops.ResourceVariable(initial_value=0)
-        dataset = dataset.map(lambda x: x + var_0.read_value())
-      sess.run(var_0.initializer)
-
-      with ops.device("/cpu:1"):
-        var_1 = resource_variable_ops.ResourceVariable(initial_value=0)
-        dataset = dataset.map(lambda x: x + var_1.read_value())
-      sess.run(var_1.initializer)
-
-      iterator = dataset.make_initializable_iterator()
-
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          "Trying to access resource located in device"):
-        sess.run(iterator.initializer)
-
   def testRestructureDataset(self):
     components = (array_ops.placeholder(dtypes.int32),
                   (array_ops.placeholder(dtypes.int32, shape=[None]),
diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
index bf25cc60a1c0efc09bed6501fd2d6f4ccb07764b..dbc35097ddda9f0375060d43aeb43efa8107f929 100644
--- a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
+++ b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
@@ -24,9 +24,11 @@ import numpy as np
 
 from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops
 from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
@@ -34,12 +36,29 @@ from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.util import nest
 
 
+def remove_variants(get_next_op):
+  # TODO(b/72408568): Remove this once session.run can get
+  # variant tensors.
+  """Remove variants from a nest structure, so sess.run will execute."""
+
+  def _remove_variant(x):
+    if isinstance(x, ops.Tensor) and x.dtype == dtypes.variant:
+      return ()
+    else:
+      return x
+
+  return nest.map_structure(_remove_variant, get_next_op)
+
+
 class DatasetSerializationTestBase(test.TestCase):
   """Base class for testing serializable datasets."""
 
   def tearDown(self):
     self._delete_ckpt()
 
+  # TODO(b/72657739): Remove sparse_tensor argument, which is to test the
+  # (deprecated) saveable `SparseTensorSliceDataset`, once the API
+  # `from_sparse_tensor_slices()`and related tests are deleted.
   def run_core_tests(self, ds_fn1, ds_fn2, num_outputs, sparse_tensors=False):
     """Runs the core tests.
 
@@ -231,10 +250,10 @@ class DatasetSerializationTestBase(test.TestCase):
       saver = self._import_meta_graph()
       init_op, get_next_op = self._get_iterator_ops_from_collection(
           ds_fn, sparse_tensors=sparse_tensors)
+      get_next_op = remove_variants(get_next_op)
       with self.test_session(graph=g) as sess:
         self._restore(saver, sess)
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
+        self._initialize(init_op, sess)
         for _ in range(num_outputs):
           actual.append(sess.run(get_next_op))
         if verify_exhausted:
@@ -294,6 +313,7 @@ class DatasetSerializationTestBase(test.TestCase):
     with ops.Graph().as_default() as g:
       _, get_next_op, saver = self._build_graph(
           ds_fn2, sparse_tensors=sparse_tensors)
+      get_next_op = remove_variants(get_next_op)
       with self.test_session(graph=g) as sess:
         self._restore(saver, sess)
         for _ in range(num_outputs - break_point):
@@ -354,6 +374,7 @@ class DatasetSerializationTestBase(test.TestCase):
     with ops.Graph().as_default() as g:
       get_next_op, saver = self._build_empty_graph(
           ds_fn, sparse_tensors=sparse_tensors)
+      get_next_op = remove_variants(get_next_op)
       with self.test_session(graph=g) as sess:
         self._restore(saver, sess)
         for _ in range(num_outputs - break_point):
@@ -387,9 +408,9 @@ class DatasetSerializationTestBase(test.TestCase):
     with ops.Graph().as_default() as g:
       init_op, get_next_op, saver = self._build_graph(
           ds_fn, sparse_tensors=sparse_tensors)
+      get_next_op = remove_variants(get_next_op)
       with self.test_session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
+        self._initialize(init_op, sess)
         for _ in range(break_point):
           sess.run(get_next_op)
         with self.assertRaises(error):
@@ -483,20 +504,20 @@ class DatasetSerializationTestBase(test.TestCase):
       else:
         init_op, get_next_op, saver = self._build_graph(
             ds_fn, sparse_tensors=sparse_tensors)
+      get_next_op = remove_variants(get_next_op)
       return init_op, get_next_op, saver
 
     for i in range(len(break_points) + 1):
       with ops.Graph().as_default() as g:
         init_op, get_next_op, saver = get_ops()
+        get_next_op = remove_variants(get_next_op)
         with self.test_session(graph=g) as sess:
           if ckpt_saved:
             if init_before_restore:
-              sess.run(variables.global_variables_initializer())
-              sess.run(init_op)
+              self._initialize(init_op, sess)
             self._restore(saver, sess)
           else:
-            sess.run(variables.global_variables_initializer())
-            sess.run(init_op)
+            self._initialize(init_op, sess)
           start = break_points[i - 1] if i > 0 else 0
           end = break_points[i] if i < len(break_points) else num_outputs
           num_iters = end - start
@@ -560,13 +581,16 @@ class DatasetSerializationTestBase(test.TestCase):
       get_next = sparse_tensor.SparseTensor(*iterator.get_next())
     else:
       get_next = iterator.get_next()
-    self._add_iterator_ops_to_collection(init_op, get_next, sparse_tensors)
+    self._add_iterator_ops_to_collection(init_op, get_next, ds_fn,
+                                         sparse_tensors)
     saver = saver_lib.Saver(allow_empty=True)
     return init_op, get_next, saver
 
   def _build_empty_graph(self, ds_fn, sparse_tensors=False):
     iterator = iterator_ops.Iterator.from_structure(
-        self._get_output_types(ds_fn), self._get_output_shapes(ds_fn))
+        self._get_output_types(ds_fn),
+        output_shapes=self._get_output_shapes(ds_fn),
+        output_classes=self._get_output_classes(ds_fn))
     saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
     if sparse_tensors:
@@ -579,12 +603,19 @@ class DatasetSerializationTestBase(test.TestCase):
   def _add_iterator_ops_to_collection(self,
                                       init_op,
                                       get_next,
+                                      ds_fn,
                                       sparse_tensors=False):
     ops.add_to_collection("iterator_ops", init_op)
     # `get_next` may be a tuple e.g. in TensorSliceDataset. Since Collections
     # do not support tuples we flatten the tensors and restore the shape in
     # `_get_iterator_ops_from_collection`.
-    if sparse_tensors:
+
+    # TODO(shivaniagrwal): `output_classes` is a nested structure of classes,
+    # this base class is specific to current test cases. Update when tests are
+    # added with `output_classes` as a nested structure with at least one of the
+    # component being `tf.SparseTensor`.
+    if (sparse_tensors or
+        self._get_output_classes(ds_fn) is sparse_tensor.SparseTensor):
       ops.add_to_collection("iterator_ops", get_next.indices)
       ops.add_to_collection("iterator_ops", get_next.values)
       ops.add_to_collection("iterator_ops", get_next.dense_shape)
@@ -594,7 +625,8 @@ class DatasetSerializationTestBase(test.TestCase):
 
   def _get_iterator_ops_from_collection(self, ds_fn, sparse_tensors=False):
     all_ops = ops.get_collection("iterator_ops")
-    if sparse_tensors:
+    if (sparse_tensors or
+        self._get_output_classes(ds_fn) is sparse_tensor.SparseTensor):
       init_op, indices, values, dense_shape = all_ops
       return init_op, sparse_tensor.SparseTensor(indices, values, dense_shape)
     else:
@@ -609,6 +641,10 @@ class DatasetSerializationTestBase(test.TestCase):
     with ops.Graph().as_default():
       return ds_fn().output_shapes
 
+  def _get_output_classes(self, ds_fn):
+    with ops.Graph().as_default():
+      return ds_fn().output_classes
+
   def _ckpt_path(self):
     return os.path.join(self.get_temp_dir(), "iterator")
 
@@ -619,8 +655,14 @@ class DatasetSerializationTestBase(test.TestCase):
     saver.save(sess, self._ckpt_path())
 
   def _restore(self, saver, sess):
+    sess.run(lookup_ops.tables_initializer())
     saver.restore(sess, self._latest_ckpt())
 
+  def _initialize(self, init_op, sess):
+    sess.run(variables.global_variables_initializer())
+    sess.run(lookup_ops.tables_initializer())
+    sess.run(init_op)
+
   def _import_meta_graph(self):
     meta_file_path = self._ckpt_path() + ".meta"
     return saver_lib.import_meta_graph(meta_file_path)
diff --git a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
index 5921be2ae89ba1bbbb8d6e3a509cf49c65949544..b572d6ed770fc0fe0f852359baf343c55966eddd 100644
--- a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
@@ -20,144 +20,12 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
-class FilterDatasetTest(test.TestCase):
-
-  def testFilterDataset(self):
-    components = (
-        np.arange(7, dtype=np.int64),
-        np.array([[1, 2, 3]], dtype=np.int64) * np.arange(
-            7, dtype=np.int64)[:, np.newaxis],
-        np.array(37.0, dtype=np.float64) * np.arange(7)
-    )
-    count = array_ops.placeholder(dtypes.int64, shape=[])
-    modulus = array_ops.placeholder(dtypes.int64)
-
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-        .repeat(count)
-        .filter(lambda x, _y, _z: math_ops.equal(math_ops.mod(x, modulus), 0))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.test_session() as sess:
-      # Test that we can dynamically feed a different modulus value for each
-      # iterator.
-      def do_test(count_val, modulus_val):
-        sess.run(init_op, feed_dict={count: count_val, modulus: modulus_val})
-        for _ in range(count_val):
-          for i in [x for x in range(7) if x**2 % modulus_val == 0]:
-            result = sess.run(get_next)
-            for component, result_component in zip(components, result):
-              self.assertAllEqual(component[i]**2, result_component)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-      do_test(14, 2)
-      do_test(4, 18)
-
-      # Test an empty dataset.
-      do_test(0, 1)
-
-  def testFilterRange(self):
-    dataset = dataset_ops.Dataset.range(100).filter(
-        lambda x: math_ops.not_equal(math_ops.mod(x, 3), 2))
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      self.assertEqual(0, sess.run(get_next))
-      self.assertEqual(1, sess.run(get_next))
-      self.assertEqual(3, sess.run(get_next))
-
-  def testFilterDict(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(lambda x: {"foo": x * 2, "bar": x ** 2})
-                .filter(lambda d: math_ops.equal(d["bar"] % 2, 0))
-                .map(lambda d: d["foo"] + d["bar"])
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        if (i ** 2) % 2 == 0:
-          self.assertEqual(i * 2 + i ** 2, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testUseStepContainerInFilter(self):
-    input_data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64)
-
-    # Define a predicate that returns true for the first element of
-    # the sequence and not the second, and uses `tf.map_fn()`.
-    def _predicate(xs):
-      squared_xs = functional_ops.map_fn(lambda x: x * x, xs)
-      summed = math_ops.reduce_sum(squared_xs)
-      return math_ops.equal(summed, 1 + 4 + 9)
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices([[1, 2, 3], [4, 5, 6]])
-        .filter(_predicate)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(input_data[0], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def assertSparseValuesEqual(self, a, b):
-    self.assertAllEqual(a.indices, b.indices)
-    self.assertAllEqual(a.values, b.values)
-    self.assertAllEqual(a.dense_shape, b.dense_shape)
-
-  def testSparse(self):
-
-    def _map_fn(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=np.array([[0, 0]]),
-          values=(i * np.array([1])),
-          dense_shape=np.array([1, 1])), i
-
-    def _filter_fn(_, i):
-      return math_ops.equal(i % 2, 0)
-
-    iterator = (
-        dataset_ops.Dataset.range(10).map(_map_fn).filter(_filter_fn).map(
-            lambda x, i: x).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(5):
-        actual = sess.run(get_next)
-        self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
-        self.assertSparseValuesEqual(actual, _map_fn(i * 2)[0])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-
 class FilterDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
@@ -194,6 +62,10 @@ class FilterDatasetSerializationTest(
     return dataset_ops.Dataset.range(10).map(_map_fn).filter(_filter_fn).map(
         lambda x, i: x)
 
+  def testSparseCore(self):
+    num_outputs = 5
+    self.run_core_tests(self._build_sparse_filter, None, num_outputs)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
index d4fbaa5cdcdd315aa0524134b48eb0515169722c..f3feecef32e587045be25056815315136a883ca7 100644
--- a/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
@@ -17,13 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import random
-
-import numpy as np
-
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -34,124 +29,6 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
-from tensorflow.python.training import server_lib
-
-
-class FlatMapDatasetTest(test.TestCase):
-
-  # pylint: disable=g-long-lambda
-  def testFlatMapDataset(self):
-    repeats = [1, 2, 3, 4, 5, 0, 1]
-    components = np.array(repeats, dtype=np.int64)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .flat_map(lambda x: dataset_ops.Dataset.from_tensors([x]).repeat(x))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in repeats:
-        for _ in range(i):
-          self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testNestedFlatMapDataset(self):
-    repeats = [[1, 2], [3, 4], [5, 0], [1, 7]]
-    components = np.array(repeats, dtype=np.int64)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .flat_map(lambda x: dataset_ops.Dataset.from_tensor_slices(x)
-                  .flat_map(lambda y: dataset_ops.Dataset.from_tensors(y)
-                            .repeat(y))).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for row in repeats:
-        for i in row:
-          for _ in range(i):
-            self.assertEqual(i, sess.run(get_next))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testSharedResourceNestedFlatMapDataset(self):
-    repeats = [[1, 2], [3, 4], [5, 0], [1, 7]]
-    components = np.array(repeats, dtype=np.int64)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .flat_map(lambda x: dataset_ops.Dataset.from_tensor_slices(x)
-                  .flat_map(lambda y: dataset_ops.Dataset.from_tensors(y)
-                            .repeat(y))).make_initializable_iterator(
-                                shared_name="shared_flat_map_iterator"))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    # Create two concurrent sessions that share the same iterator
-    # resource on the same server, and verify that a random
-    # interleaving of `Session.run(get_next)` calls on the two
-    # sessions yields the expected result.
-    server = server_lib.Server.create_local_server()
-    with session.Session(server.target) as sess1:
-      with session.Session(server.target) as sess2:
-        for _ in range(3):
-          sess = random.choice([sess1, sess2])
-          sess.run(init_op)
-          for row in repeats:
-            for i in row:
-              for _ in range(i):
-                sess = random.choice([sess1, sess2])
-                self.assertEqual(i, sess.run(get_next))
-
-        with self.assertRaises(errors.OutOfRangeError):
-          sess = random.choice([sess1, sess2])
-          sess.run(get_next)
-
-  def testMapDict(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(lambda x: {"foo": x * 2, "bar": x ** 2})
-                .flat_map(lambda d: dataset_ops.Dataset.from_tensors(d["foo"])
-                          .repeat(d["bar"]))
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        for _ in range(i ** 2):
-          self.assertEqual(i * 2, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-  # pylint: enable=g-long-lambda
-
-  def testSparse(self):
-    def _map_fn(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
-
-    def _flat_map_fn(x):
-      return dataset_ops.Dataset.from_tensor_slices(
-          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
-
-    iterator = (
-        dataset_ops.Dataset.range(10).map(_map_fn).flat_map(_flat_map_fn)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        for j in range(2):
-          expected = [i, 0] if j % 2 == 0 else [0, -i]
-          self.assertAllEqual(expected, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
 
 
 class FlatMapDatasetSerializationTest(
@@ -225,6 +102,21 @@ class FlatMapDatasetSerializationTest(
 
     self.verify_error_on_save(build_ds, 500, errors.InvalidArgumentError)
 
+  def testSparseCore(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
+
+    def _flat_map_fn(x):
+      return dataset_ops.Dataset.from_tensor_slices(
+          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
+
+    def _build_ds():
+      return dataset_ops.Dataset.range(10).map(_map_fn).flat_map(_flat_map_fn)
+
+    self.run_core_tests(_build_ds, None, 20)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py b/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..32ea44f7c7ba329dc253bb9fbbcac0a1ed16aec7
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py
@@ -0,0 +1,58 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.data.python.ops import get_single_element
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class GetSingleElementTest(test.TestCase):
+
+  def testGetSingleElement(self):
+    skip_value = array_ops.placeholder(dtypes.int64, shape=[])
+    take_value = array_ops.placeholder_with_default(
+        constant_op.constant(1, dtype=dtypes.int64), shape=[])
+
+    dataset = (dataset_ops.Dataset.range(100)
+               .skip(skip_value)
+               .map(lambda x: x * x)
+               .take(take_value))
+
+    element = get_single_element.get_single_element(dataset)
+
+    with self.test_session() as sess:
+      self.assertEqual(0, sess.run(element, feed_dict={skip_value: 0}))
+      self.assertEqual(25, sess.run(element, feed_dict={skip_value: 5}))
+      self.assertEqual(100, sess.run(element, feed_dict={skip_value: 10}))
+
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "Dataset was empty."):
+        sess.run(element, feed_dict={skip_value: 100})
+
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "Dataset had more than one element."):
+        sess.run(element, feed_dict={skip_value: 0, take_value: 2})
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
index e66ed3f7aa2a512813ef353d2d0744ae67005884..256ad8d94dc1a7c2b26df3f1ebf8e8e321882c15 100644
--- a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
@@ -26,8 +26,8 @@ import numpy as np
 from six.moves import zip_longest
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.contrib.data.python.ops import interleave_ops
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
@@ -38,181 +38,7 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
 
-class InterleaveDatasetTest(test.TestCase):
-
-  def _interleave(self, lists, cycle_length, block_length):
-    num_open = 0
-
-    # `all_iterators` acts as a queue of iterators over each element of `lists`.
-    all_iterators = [iter(l) for l in lists]
-
-    # `open_iterators` are the iterators whose elements are currently being
-    # interleaved.
-    open_iterators = []
-    for i in range(cycle_length):
-      if all_iterators:
-        open_iterators.append(all_iterators.pop(0))
-        num_open += 1
-      else:
-        open_iterators.append(None)
-
-    while num_open or all_iterators:
-      for i in range(cycle_length):
-        if open_iterators[i] is None:
-          if all_iterators:
-            open_iterators[i] = all_iterators.pop(0)
-            num_open += 1
-          else:
-            continue
-        for _ in range(block_length):
-          try:
-            yield next(open_iterators[i])
-          except StopIteration:
-            open_iterators[i] = None
-            num_open -= 1
-            break
-
-  def testPythonImplementation(self):
-    input_lists = [[4, 4, 4, 4], [5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6],
-                   [4, 4, 4, 4], [5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6]]
-
-    # Cycle length 1 acts like `Dataset.flat_map()`.
-    expected_elements = itertools.chain(*input_lists)
-    for expected, produced in zip(
-        expected_elements, self._interleave(input_lists, 1, 1)):
-      self.assertEqual(expected, produced)
-
-    # Cycle length > 1.
-    expected_elements = [4, 5, 4, 5, 4, 5, 4,
-                         5, 5, 6, 6,  # NOTE(mrry): When we cycle back
-                                      # to a list and are already at
-                                      # the end of that list, we move
-                                      # on to the next element.
-                         4, 6, 4, 6, 4, 6, 4, 6, 5, 6, 5, 6, 5, 6, 5, 6, 5]
-    for expected, produced in zip(
-        expected_elements, self._interleave(input_lists, 2, 1)):
-      self.assertEqual(expected, produced)
-
-    # Cycle length > 1 and block length > 1.
-    expected_elements = [4, 4, 4, 5, 5, 5, 4, 5, 5, 6, 6, 6, 4, 4, 4, 6, 6, 6,
-                         4, 5, 5, 5, 6, 6, 6, 5, 5, 6, 6, 6]
-    for expected, produced in zip(
-        expected_elements, self._interleave(input_lists, 2, 3)):
-      self.assertEqual(expected, produced)
-
-    # Cycle length > len(input_values).
-    expected_elements = [4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6,
-                         4, 4, 5, 5, 6, 6, 5, 6, 6, 5, 6, 6]
-    for expected, produced in zip(
-        expected_elements, self._interleave(input_lists, 7, 2)):
-      self.assertEqual(expected, produced)
-
-  def testInterleaveDataset(self):
-    input_values = array_ops.placeholder(dtypes.int64, shape=[None])
-    cycle_length = array_ops.placeholder(dtypes.int64, shape=[])
-    block_length = array_ops.placeholder(dtypes.int64, shape=[])
-
-    repeat_count = 2
-
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_values)
-        .repeat(repeat_count)
-        .interleave(lambda x: dataset_ops.Dataset.from_tensors(x).repeat(x),
-                    cycle_length, block_length))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    next_element = iterator.get_next()
-
-    with self.test_session() as sess:
-      # Cycle length 1 acts like `Dataset.flat_map()`.
-      sess.run(init_op, feed_dict={input_values: [4, 5, 6],
-                                   cycle_length: 1, block_length: 3})
-
-      for expected_element in self._interleave(
-          [[4] * 4, [5] * 5, [6] * 6] * repeat_count, 1, 3):
-        self.assertEqual(expected_element, sess.run(next_element))
-
-      # Cycle length > 1.
-      # expected: [4, 5, 4, 5, 4, 5, 4, 5, 5, 6, 6, 4, 6, 4, 6, 4, 6, 4, 6, 5,
-      #            6, 5, 6, 5, 6, 5, 6, 5]
-      sess.run(init_op, feed_dict={input_values: [4, 5, 6],
-                                   cycle_length: 2, block_length: 1})
-      for expected_element in self._interleave(
-          [[4] * 4, [5] * 5, [6] * 6] * repeat_count, 2, 1):
-        self.assertEqual(expected_element, sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-      # Cycle length > 1 and block length > 1.
-      # expected: [4, 4, 4, 5, 5, 5, 4, 5, 5, 6, 6, 6, 4, 4, 4, 6, 6, 6, 4, 5,
-      #            5, 5, 6, 6, 6, 5, 5, 6, 6, 6]
-      sess.run(init_op, feed_dict={input_values: [4, 5, 6],
-                                   cycle_length: 2, block_length: 3})
-      for expected_element in self._interleave(
-          [[4] * 4, [5] * 5, [6] * 6] * repeat_count, 2, 3):
-        self.assertEqual(expected_element, sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-      # Cycle length > len(input_values) * repeat_count.
-      # expected: [4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6, 4, 4,
-      #            5, 5, 6, 6, 5, 6, 6, 5, 6, 6]
-      sess.run(init_op, feed_dict={input_values: [4, 5, 6],
-                                   cycle_length: 7, block_length: 2})
-      for expected_element in self._interleave(
-          [[4] * 4, [5] * 5, [6] * 6] * repeat_count, 7, 2):
-        self.assertEqual(expected_element, sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-      # Empty input.
-      sess.run(init_op, feed_dict={input_values: [],
-                                   cycle_length: 2, block_length: 3})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-      # Non-empty input leading to empty output.
-      sess.run(init_op, feed_dict={input_values: [0, 0, 0],
-                                   cycle_length: 2, block_length: 3})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-      # Mixture of non-empty and empty interleaved datasets.
-      sess.run(init_op, feed_dict={input_values: [4, 0, 6],
-                                   cycle_length: 2, block_length: 3})
-      for expected_element in self._interleave(
-          [[4] * 4, [], [6] * 6] * repeat_count, 2, 3):
-        self.assertEqual(expected_element, sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testSparse(self):
-
-    def _map_fn(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
-
-    def _interleave_fn(x):
-      return dataset_ops.Dataset.from_tensor_slices(
-          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
-
-    iterator = (
-        dataset_ops.Dataset.range(10).map(_map_fn).interleave(
-            _interleave_fn, cycle_length=1).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        for j in range(2):
-          expected = [i, 0] if j % 2 == 0 else [0, -i]
-          self.assertAllEqual(expected, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-
-class InterleaveDatasetSeriazationTest(
+class InterleaveDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
   def _build_iterator_graph(self, input_values, cycle_length, block_length):
@@ -251,15 +77,35 @@ class InterleaveDatasetSeriazationTest(
         None, num_outputs)
     # pylint: enable=g-long-lambda
 
+  def testSparseCore(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
+
+    def _interleave_fn(x):
+      return dataset_ops.Dataset.from_tensor_slices(
+          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
+
+    def _build_dataset():
+      return dataset_ops.Dataset.range(10).map(_map_fn).interleave(
+          _interleave_fn, cycle_length=1)
+
+    self.run_core_tests(_build_dataset, None, 20)
+
 
 class ParallelInterleaveDatasetTest(test.TestCase):
 
   def setUp(self):
+
     self.input_values = array_ops.placeholder(dtypes.int64, shape=[None])
     self.cycle_length = array_ops.placeholder(dtypes.int64, shape=[])
     self.block_length = array_ops.placeholder(dtypes.int64, shape=[])
     self.sloppy = array_ops.placeholder(dtypes.bool, shape=[])
+    self.buffer_output_elements = array_ops.placeholder(dtypes.int64, shape=[])
+    self.prefetch_input_elements = array_ops.placeholder(dtypes.int64, shape=[])
 
+    self.error = None
     self.repeat_count = 2
 
     # Set up threading events used to sequence when items are produced that
@@ -276,6 +122,10 @@ class ParallelInterleaveDatasetTest(test.TestCase):
       self.write_coordination_events[x].wait()
       self.write_coordination_events[x].clear()
       self.read_coordination_events[x].release()
+      if self.error:
+        err = self.error
+        self.error = None
+        raise err  # pylint: disable=raising-bad-type
       return x * x
 
     def map_fn(x):
@@ -286,11 +136,13 @@ class ParallelInterleaveDatasetTest(test.TestCase):
       dataset = dataset.repeat(x)
       return dataset.map(map_fn)
 
-    self.dataset = (dataset_ops.Dataset.from_tensor_slices(self.input_values)
-                    .repeat(self.repeat_count).apply(
-                        interleave_ops.parallel_interleave(
-                            interleave_fn, self.cycle_length,
-                            self.block_length, self.sloppy)))
+    self.dataset = (
+        dataset_ops.Dataset.from_tensor_slices(self.input_values)
+        .repeat(self.repeat_count).apply(
+            interleave_ops.parallel_interleave(interleave_fn, self.cycle_length,
+                                               self.block_length, self.sloppy,
+                                               self.buffer_output_elements,
+                                               self.prefetch_input_elements)))
     self.iterator = self.dataset.make_initializable_iterator()
     self.init_op = self.iterator.initializer
     self.next_element = self.iterator.get_next()
@@ -380,7 +232,7 @@ class ParallelInterleaveDatasetTest(test.TestCase):
     for i in range(4, 7):
       self.write_coordination_events[i].set()
 
-  def _testSingleThreaded(self, sloppy=False):
+  def _testSingleThreaded(self, sloppy=False, prefetch_input_elements=0):
     # cycle_length=1,block_length=1 acts like `Dataset.interleave()` and
     # `Dataset.flat_map()` and is single-threaded. No synchronization required.
     with self.test_session() as sess:
@@ -391,7 +243,9 @@ class ParallelInterleaveDatasetTest(test.TestCase):
               self.input_values: [4, 5, 6],
               self.cycle_length: 1,
               self.block_length: 1,
-              self.sloppy: sloppy
+              self.sloppy: sloppy,
+              self.buffer_output_elements: 1,
+              self.prefetch_input_elements: prefetch_input_elements,
           })
 
       for expected_element in self._interleave(
@@ -408,6 +262,41 @@ class ParallelInterleaveDatasetTest(test.TestCase):
   def testSingleThreadedSloppy(self):
     self._testSingleThreaded(sloppy=True)
 
+  def testSingleThreadedPrefetch1Itr(self):
+    self._testSingleThreaded(prefetch_input_elements=1)
+
+  def testSingleThreadedPrefetch1ItrSloppy(self):
+    self._testSingleThreaded(prefetch_input_elements=1, sloppy=True)
+
+  def testSingleThreadedRagged(self):
+    # Tests a sequence with wildly different elements per iterator.
+    with self.test_session() as sess:
+      self._clear_coordination_events()
+      sess.run(
+          self.init_op,
+          feed_dict={
+              self.input_values: [3, 7, 4],
+              self.cycle_length: 2,
+              self.block_length: 1,
+              self.sloppy: False,
+              self.buffer_output_elements: 1,
+              self.prefetch_input_elements: 1,
+          })
+
+      # Add coordination values for 3 and 7
+      self.read_coordination_events[3] = threading.Semaphore(0)
+      self.write_coordination_events[3] = threading.Event()
+      self.read_coordination_events[7] = threading.Semaphore(0)
+      self.write_coordination_events[7] = threading.Event()
+
+      for expected_element in self._interleave(
+          [[3] * 3, [7] * 7, [4] * 4] * self.repeat_count, 2, 1):
+        self.write_coordination_events[expected_element].set()
+        output = sess.run(self.next_element)
+        self.assertEqual(expected_element * expected_element, output)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(self.next_element)
+
   def _testTwoThreadsNoContention(self, sloppy=False):
     # num_threads > 1.
     # Explicit coordination should result in `Dataset.interleave()` behavior
@@ -420,7 +309,9 @@ class ParallelInterleaveDatasetTest(test.TestCase):
               self.input_values: [4, 5, 6],
               self.cycle_length: 2,
               self.block_length: 1,
-              self.sloppy: sloppy
+              self.sloppy: sloppy,
+              self.buffer_output_elements: 1,
+              self.prefetch_input_elements: 1,
           })
       for i, expected_element in enumerate(
           self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
@@ -463,6 +354,8 @@ class ParallelInterleaveDatasetTest(test.TestCase):
               self.cycle_length: 2,
               self.block_length: 1,
               self.sloppy: sloppy,
+              self.buffer_output_elements: 1,
+              self.prefetch_input_elements: 1,
           })
       for i, expected_element in enumerate(
           self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
@@ -472,7 +365,7 @@ class ParallelInterleaveDatasetTest(test.TestCase):
           self.read_coordination_events[expected_element].acquire()
         else:
           self.write_coordination_events[expected_element].set()
-        time.sleep(0.1)  # Sleep to consistently "avoid" the race condition.
+        time.sleep(0.5)  # Sleep to consistently "avoid" the race condition.
         actual_element = sess.run(self.next_element)
         if not done_first_event:
           done_first_event = True
@@ -502,7 +395,9 @@ class ParallelInterleaveDatasetTest(test.TestCase):
               self.input_values: [4, 5, 6],
               self.cycle_length: 2,
               self.block_length: 2,
-              self.sloppy: sloppy
+              self.sloppy: sloppy,
+              self.buffer_output_elements: 1,
+              self.prefetch_input_elements: 1,
           })
       for i, expected_element in enumerate(
           self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
@@ -545,7 +440,9 @@ class ParallelInterleaveDatasetTest(test.TestCase):
               self.input_values: [4, 5, 6],
               self.cycle_length: 2,
               self.block_length: 2,
-              self.sloppy: sloppy
+              self.sloppy: sloppy,
+              self.buffer_output_elements: 1,
+              self.prefetch_input_elements: 1,
           })
       for i, expected_element in enumerate(
           self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
@@ -555,7 +452,7 @@ class ParallelInterleaveDatasetTest(test.TestCase):
           self.read_coordination_events[expected_element].acquire()
         else:
           self.write_coordination_events[expected_element].set()
-        time.sleep(0.1)  # Sleep to consistently "avoid" the race condition.
+        time.sleep(0.5)  # Sleep to consistently "avoid" the race condition.
         actual_element = sess.run(self.next_element)
         if not done_first_event:
           done_first_event = True
@@ -583,7 +480,9 @@ class ParallelInterleaveDatasetTest(test.TestCase):
               self.input_values: [],
               self.cycle_length: 2,
               self.block_length: 3,
-              self.sloppy: sloppy
+              self.sloppy: sloppy,
+              self.buffer_output_elements: 1,
+              self.prefetch_input_elements: 0,
           })
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(self.next_element)
@@ -604,7 +503,9 @@ class ParallelInterleaveDatasetTest(test.TestCase):
               self.input_values: [0, 0, 0],
               self.cycle_length: 2,
               self.block_length: 3,
-              self.sloppy: sloppy
+              self.sloppy: sloppy,
+              self.buffer_output_elements: 1,
+              self.prefetch_input_elements: 0,
           })
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(self.next_element)
@@ -615,7 +516,8 @@ class ParallelInterleaveDatasetTest(test.TestCase):
   def testNonEmptyInputIntoEmptyOutputsSloppy(self):
     self._testNonEmptyInputIntoEmptyOutputs(sloppy=True)
 
-  def _testPartiallyEmptyOutputs(self, sloppy=False):
+  def _testPartiallyEmptyOutputs(self, sloppy=False, prefetch_input_elements=1):
+    race_indices = {2, 8, 14}  # Sequence points when sloppy mode has race conds
     # Mixture of non-empty and empty interleaved datasets.
     with self.test_session() as sess:
       self._clear_coordination_events()
@@ -627,27 +529,31 @@ class ParallelInterleaveDatasetTest(test.TestCase):
               self.cycle_length: 2,
               self.block_length: 1,
               self.sloppy: sloppy,
+              self.buffer_output_elements: 1,
+              self.prefetch_input_elements: prefetch_input_elements,
           })
       for i, expected_element in enumerate(
           self._interleave([[4] * 4, [], [6] * 6] * self.repeat_count, 2, 1)):
         self.write_coordination_events[expected_element].set()
-        if done_first_event:  # First event starts the worker threads
+        # First event starts the worker threads. Additionally, when running the
+        # sloppy case with prefetch_input_elements=0, we get stuck if we wait
+        # for the read coordination event for certain event orderings in the
+        # presence of finishing iterators.
+        if done_first_event and not (sloppy and (i in race_indices)):
           self.read_coordination_events[expected_element].acquire()
         actual_element = sess.run(self.next_element)
-        if not done_first_event:
+        if not done_first_event or (sloppy and (i in race_indices)):
           done_first_event = True
           self.read_coordination_events[expected_element].acquire()
         self.assertEqual(expected_element * expected_element, actual_element,
                          "At index %s: %s expected, got: %s" %
                          (i, expected_element, actual_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
 
   def testPartiallyEmptyOutputs(self):
     self._testPartiallyEmptyOutputs()
 
   def testPartiallyEmptyOutputsSloppy(self):
-    self._testPartiallyEmptyOutputs(sloppy=True)
+    self._testPartiallyEmptyOutputs(sloppy=True, prefetch_input_elements=0)
 
   def testDelayedOutputSloppy(self):
     # Explicitly control the sequence of events to ensure we correctly avoid
@@ -661,6 +567,8 @@ class ParallelInterleaveDatasetTest(test.TestCase):
               self.cycle_length: 2,
               self.block_length: 1,
               self.sloppy: True,
+              self.buffer_output_elements: 1,
+              self.prefetch_input_elements: 0,
           })
 
       mis_ordering = [
@@ -683,8 +591,10 @@ class ParallelInterleaveDatasetTest(test.TestCase):
           feed_dict={
               self.input_values: [4, 5, 6],
               self.cycle_length: 2,
-              self.block_length: 3,
-              self.sloppy: True
+              self.block_length: 1,
+              self.sloppy: True,
+              self.buffer_output_elements: 1,
+              self.prefetch_input_elements: 1,
           })
       # Test against a generating sequence that differs from the uncontended
       # case, in order to prove sloppy correctness.
@@ -692,7 +602,7 @@ class ParallelInterleaveDatasetTest(test.TestCase):
           self._interleave(
               [[4] * 4, [5] * 5, [6] * 6] * self.repeat_count,
               cycle_length=2,
-              block_length=2)):
+              block_length=3)):
         self.write_coordination_events[expected_element].set()
         if done_first_event:  # First event starts the worker threads.
           self.read_coordination_events[expected_element].acquire()
@@ -716,7 +626,9 @@ class ParallelInterleaveDatasetTest(test.TestCase):
               self.input_values: [4, 5, 6],
               self.cycle_length: 3,
               self.block_length: 2,
-              self.sloppy: sloppy
+              self.sloppy: sloppy,
+              self.buffer_output_elements: 1,
+              self.prefetch_input_elements: 0,
           })
       for i in range(4, 7):
         self.write_coordination_events[i].set()
@@ -790,6 +702,139 @@ class ParallelInterleaveDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def testErrorsInOutputFn(self):
+    with self.test_session() as sess:
+      self._clear_coordination_events()
+      sess.run(
+          self.init_op,
+          feed_dict={
+              self.input_values: [4, 5, 6],
+              self.cycle_length: 2,
+              self.block_length: 1,
+              self.sloppy: False,
+              self.buffer_output_elements: 1,
+              self.prefetch_input_elements: 0,
+          })
+
+      except_on_element_indices = set([3])
+
+      for i, expected_element in enumerate(
+          self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
+                           1)):
+        if i in except_on_element_indices:
+          self.error = ValueError()
+          self.write_coordination_events[expected_element].set()
+          with self.assertRaises(errors.InvalidArgumentError):
+            sess.run(self.next_element)
+        else:
+          self.write_coordination_events[expected_element].set()
+          actual_element = sess.run(self.next_element)
+          self.assertEqual(expected_element * expected_element, actual_element,
+                           "At index %s: %s expected, got: %s" %
+                           (i, expected_element, actual_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(self.next_element)
+
+  def testErrorsInInputFn(self):
+
+    def map_py_fn(x):
+      if x == 5:
+        raise ValueError()
+      return x
+
+    def map_fn(x):
+      return script_ops.py_func(map_py_fn, [x], x.dtype)
+
+    def interleave_fn(x):
+      dataset = dataset_ops.Dataset.from_tensors(x)
+      dataset = dataset.repeat(x)
+      return dataset
+
+    self.dataset = (
+        dataset_ops.Dataset.from_tensor_slices(self.input_values).map(map_fn)
+        .repeat(self.repeat_count).apply(
+            interleave_ops.parallel_interleave(interleave_fn, self.cycle_length,
+                                               self.block_length, self.sloppy,
+                                               self.buffer_output_elements,
+                                               self.prefetch_input_elements)))
+
+    self.iterator = self.dataset.make_initializable_iterator()
+    self.init_op = self.iterator.initializer
+    self.next_element = self.iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(
+          self.init_op,
+          feed_dict={
+              self.input_values: [4, 5, 6],
+              self.cycle_length: 2,
+              self.block_length: 1,
+              self.sloppy: False,
+              self.buffer_output_elements: 1,
+              self.prefetch_input_elements: 0,
+          })
+      for i, expected_element in enumerate(
+          self._interleave([[4] * 4, [5], [6] * 6] * self.repeat_count, 2, 1)):
+        if expected_element == 5:
+          with self.assertRaises(errors.InvalidArgumentError):
+            sess.run(self.next_element)
+        else:
+          actual_element = sess.run(self.next_element)
+          self.assertEqual(expected_element, actual_element,
+                           "At index %s: %s expected, got: %s" %
+                           (i, expected_element, actual_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(self.next_element)
+
+  def testErrorsInInterleaveFn(self):
+
+    def map_py_fn(x):
+      if x == 5:
+        raise ValueError()
+      return x
+
+    def interleave_fn(x):
+      dataset = dataset_ops.Dataset.from_tensors(x)
+      y = script_ops.py_func(map_py_fn, [x], x.dtype)
+      dataset = dataset.repeat(y)
+      return dataset
+
+    self.dataset = (
+        dataset_ops.Dataset.from_tensor_slices(self.input_values)
+        .repeat(self.repeat_count).apply(
+            interleave_ops.parallel_interleave(interleave_fn, self.cycle_length,
+                                               self.block_length, self.sloppy,
+                                               self.buffer_output_elements,
+                                               self.prefetch_input_elements)))
+
+    self.iterator = self.dataset.make_initializable_iterator()
+    self.init_op = self.iterator.initializer
+    self.next_element = self.iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(
+          self.init_op,
+          feed_dict={
+              self.input_values: [4, 5, 6],
+              self.cycle_length: 2,
+              self.block_length: 1,
+              self.sloppy: False,
+              self.buffer_output_elements: 1,
+              self.prefetch_input_elements: 0,
+          })
+      for i, expected_element in enumerate(
+          self._interleave([[4] * 4, [5], [6] * 6] * self.repeat_count, 2, 1)):
+        if expected_element == 5:
+          with self.assertRaises(errors.InvalidArgumentError):
+            sess.run(self.next_element)
+        else:
+          actual_element = sess.run(self.next_element)
+          self.assertEqual(expected_element, actual_element,
+                           "At index %s: %s expected, got: %s" %
+                           (i, expected_element, actual_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(self.next_element)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_cluster_test.py b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_cluster_test.py
deleted file mode 100644
index 02379d064d4ab857ce9c7d13881a3ae37eea0980..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_cluster_test.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops that need test_util."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import function
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import functional_ops
-from tensorflow.python.platform import test
-
-
-class IteratorClusterTest(test.TestCase):
-
-  def testRemoteIteratorWithoutRemoteCallFail(self):
-    worker_config = config_pb2.ConfigProto()
-    worker_config.device_count["CPU"] = 2
-    worker, _ = test_util.create_local_cluster(
-        1, 1, worker_config=worker_config)
-
-    with ops.device("/job:worker/replica:0/task:0/cpu:1"):
-      dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-      iterator_3 = dataset_3.make_one_shot_iterator()
-      iterator_3_handle = iterator_3.string_handle()
-
-    with ops.device("/job:worker/replica:0/task:0/cpu:0"):
-      remote_it = iterator_ops.Iterator.from_string_handle(
-          iterator_3_handle, dataset_3.output_types, dataset_3.output_shapes)
-      get_next_op = remote_it.get_next()
-
-    with session.Session(worker[0].target) as sess:
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next_op)
-
-  def _testRemoteIteratorHelper(self, device0, device1, target):
-    with ops.device(device1):
-      dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-      iterator_3 = dataset_3.make_one_shot_iterator()
-      iterator_3_handle = iterator_3.string_handle()
-
-    @function.Defun(dtypes.string)
-    def _remote_fn(h):
-      remote_iterator = iterator_ops.Iterator.from_string_handle(
-          h, dataset_3.output_types, dataset_3.output_shapes)
-      return remote_iterator.get_next()
-
-    with ops.device(device0):
-      target_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-      remote_op = functional_ops.remote_call(
-          args=[iterator_3_handle],
-          Tout=[dtypes.int32],
-          f=_remote_fn,
-          target=target_placeholder)
-
-    with session.Session(target) as sess:
-      elem = sess.run(remote_op, feed_dict={target_placeholder: device1})
-      self.assertEqual(elem, [1])
-      # Fails when target is cpu:0 where the resource is not located.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(remote_op, feed_dict={target_placeholder: device0})
-      elem = sess.run(iterator_3.get_next())
-      self.assertEqual(elem, [2])
-      elem = sess.run(remote_op, feed_dict={target_placeholder: device1})
-      self.assertEqual(elem, [3])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(remote_op, feed_dict={target_placeholder: device1})
-
-  def testRemoteIteratorUsingRemoteCallOp(self):
-    worker_config = config_pb2.ConfigProto()
-    worker_config.device_count["CPU"] = 2
-    worker, _ = test_util.create_local_cluster(
-        1, 1, worker_config=worker_config)
-
-    self._testRemoteIteratorHelper("/job:worker/replica:0/task:0/cpu:0",
-                                   "/job:worker/replica:0/task:0/cpu:1",
-                                   worker[0].target)
-
-  def testRemoteIteratorUsingRemoteCallOpCrossProcess(self):
-    workers, _ = test_util.create_local_cluster(2, 1)
-
-    self._testRemoteIteratorHelper("/job:worker/replica:0/task:0/cpu:0",
-                                   "/job:worker/replica:0/task:1/cpu:0",
-                                   workers[0].target)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
deleted file mode 100644
index bda9a2a4a37e9c3d35ff99041d1150ffc43f4c43..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
+++ /dev/null
@@ -1,625 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import numpy as np
-
-from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.contrib.data.python.ops import readers
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import function
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import io_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import script_ops
-from tensorflow.python.platform import test
-from tensorflow.python.training import server_lib
-
-
-class IteratorTest(test.TestCase):
-
-  def testAttemptingGradientsRaiseExceptions(self):
-    component = constant_op.constant([1])
-    side = constant_op.constant(0)
-    add = lambda x: x + side
-    dataset = dataset_ops.Dataset.from_tensor_slices(component).map(add)
-    value = dataset.make_one_shot_iterator().get_next()
-    with self.assertRaisesRegexp(LookupError, "No gradient defined"):
-      gradients_impl.gradients(value, component)
-    with self.assertRaisesRegexp(LookupError, "No gradient defined"):
-      gradients_impl.gradients(value, side)
-    with self.assertRaisesRegexp(LookupError, "No gradient defined"):
-      gradients_impl.gradients(value, [component, side])
-
-  def testOneShotIterator(self):
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-                .repeat(14).make_one_shot_iterator())
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.test_session() as sess:
-      for _ in range(14):
-        for i in range(7):
-          result = sess.run(get_next)
-          for component, result_component in zip(components, result):
-            self.assertAllEqual(component[i]**2, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testOneShotIteratorCaptureByValue(self):
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-    tensor_components = tuple([ops.convert_to_tensor(c) for c in components])
-
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    iterator = (dataset_ops.Dataset.from_tensor_slices(tensor_components)
-                .map(_map_fn).repeat(14).make_one_shot_iterator())
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.test_session() as sess:
-      for _ in range(14):
-        for i in range(7):
-          result = sess.run(get_next)
-          for component, result_component in zip(components, result):
-            self.assertAllEqual(component[i]**2, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testOneShotIteratorInsideContainer(self):
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-
-    def within_container():
-      def _map_fn(x, y, z):
-        return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-      iterator = (dataset_ops.Dataset.from_tensor_slices(components)
-                  .map(_map_fn).repeat(14).make_one_shot_iterator())
-      return iterator.get_next()
-
-    server = server_lib.Server.create_local_server()
-
-    # Create two iterators within unique containers, and run them to
-    # make sure that the resources aren't shared.
-    #
-    # The test below would fail if cname were the same across both
-    # sessions.
-    for i in range(2):
-      with session.Session(server.target) as sess:
-        cname = "iteration%d" % i
-        with ops.container(cname):
-          get_next = within_container()
-
-        for _ in range(14):
-          for i in range(7):
-            result = sess.run(get_next)
-            for component, result_component in zip(components, result):
-              self.assertAllEqual(component[i]**2, result_component)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testOneShotIteratorNonBlocking(self):
-    dataset = dataset_ops.Dataset.from_tensors([1, 2, 3]).map(lambda x: x * x)
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
-
-    # Create a session with a single thread to ensure that the
-    # one-shot iterator initializer does not deadlock.
-    config = config_pb2.ConfigProto(inter_op_parallelism_threads=1,
-                                    use_per_session_threads=True)
-    with session.Session(config=config) as sess:
-      self.assertAllEqual([1, 4, 9], sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-    # Test with multiple threads invoking the one-shot iterator concurrently.
-    with session.Session(config=config) as sess:
-      results = []
-      def consumer_thread():
-        try:
-          results.append(sess.run(next_element))
-        except errors.OutOfRangeError:
-          results.append(None)
-
-      num_threads = 8
-      threads = [
-          self.checkedThread(consumer_thread) for _ in range(num_threads)]
-      for t in threads:
-        t.start()
-      for t in threads:
-        t.join()
-
-      self.assertEqual(num_threads, len(results))
-      self.assertEqual(num_threads - 1,
-                       len([None for r in results if r is None]))
-      self.assertAllEqual([[1, 4, 9]], [r for r in results if r is not None])
-
-  def testOneShotIteratorInitializerFails(self):
-    # Define a dataset whose initialization will always fail.
-    dataset = dataset_ops.Dataset.from_tensors(
-        array_ops.check_numerics(
-            constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
-
-    with self.test_session() as sess:
-      with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
-        sess.run(next_element)
-
-      # Test that subsequent attempts to use the iterator also fail.
-      with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
-        sess.run(next_element)
-
-    with self.test_session() as sess:
-      def consumer_thread():
-        with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
-          sess.run(next_element)
-
-      num_threads = 8
-      threads = [
-          self.checkedThread(consumer_thread) for _ in range(num_threads)]
-      for t in threads:
-        t.start()
-      for t in threads:
-        t.join()
-
-  def testSimpleSharedResource(self):
-    components = (
-        np.array(1, dtype=np.int64),
-        np.array([1, 2, 3], dtype=np.int64),
-        np.array(37.0, dtype=np.float64)
-    )
-
-    server = server_lib.Server.create_local_server()
-
-    # Create two non-overlapping sessions that share the same iterator
-    # resource on the same server, and verify that an action of the
-    # first session (initializing the iterator) is visible in the
-    # second session.
-    with ops.Graph().as_default():
-      iterator = (dataset_ops.Dataset.from_tensors(components)
-                  .map(lambda x, y, z: (x, y, z)).make_initializable_iterator(
-                      shared_name="shared_iterator"))
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-
-      with session.Session(server.target) as sess:
-        sess.run(init_op)
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component, result_component)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-        # Re-initialize the iterator in the first session.
-        sess.run(init_op)
-
-    with ops.Graph().as_default():
-      # Re-define the iterator manually, without defining any of the
-      # functions in this graph, to ensure that we are not
-      # accidentally redefining functions with the same names in the
-      # new graph.
-      iterator = iterator_ops.Iterator.from_structure(
-          shared_name="shared_iterator",
-          output_types=(dtypes.int64, dtypes.int64, dtypes.float64),
-          output_shapes=([], [3], []))
-      get_next = iterator.get_next()
-
-      with session.Session(server.target) as sess:
-        # Use the iterator without re-initializing in the second session.
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component, result_component)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testNotInitializedError(self):
-    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
-    iterator = (dataset_ops.Dataset.from_tensors(components)
-                .make_initializable_iterator())
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      with self.assertRaisesRegexp(errors.FailedPreconditionError,
-                                   "iterator has not been initialized"):
-        sess.run(get_next)
-
-  def testReinitializableIterator(self):
-    dataset_3 = dataset_ops.Dataset.from_tensors(
-        constant_op.constant([1, 2, 3]))
-    dataset_4 = dataset_ops.Dataset.from_tensors(
-        constant_op.constant([4, 5, 6, 7]))
-    iterator = iterator_ops.Iterator.from_structure(dataset_3.output_types,
-                                                    [None])
-
-    dataset_3_init_op = iterator.make_initializer(dataset_3)
-    dataset_4_init_op = iterator.make_initializer(dataset_4)
-    get_next = iterator.get_next()
-
-    self.assertEqual(dataset_3.output_types, iterator.output_types)
-    self.assertEqual(dataset_4.output_types, iterator.output_types)
-    self.assertEqual([None], iterator.output_shapes.as_list())
-
-    with self.test_session() as sess:
-      # The iterator is initially uninitialized.
-      with self.assertRaises(errors.FailedPreconditionError):
-        sess.run(get_next)
-
-      # Initialize with one dataset.
-      sess.run(dataset_3_init_op)
-      self.assertAllEqual([1, 2, 3], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Initialize with a different dataset.
-      sess.run(dataset_4_init_op)
-      self.assertAllEqual([4, 5, 6, 7], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Reinitialize with the first dataset.
-      sess.run(dataset_3_init_op)
-      self.assertAllEqual([1, 2, 3], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testReinitializableIteratorStaticErrors(self):
-    # Non-matching structure for types and shapes.
-    with self.assertRaises(TypeError):
-      iterator = iterator_ops.Iterator.from_structure((dtypes.int64,
-                                                       dtypes.float64), [None])
-
-    # Test validation of dataset argument.
-    iterator = iterator_ops.Iterator.from_structure((dtypes.int64,
-                                                     dtypes.float64))
-
-    # Incompatible structure.
-    with self.assertRaises(ValueError):
-      iterator.make_initializer(
-          dataset_ops.Dataset.from_tensors(((constant_op.constant(
-              [1, 2, 3], dtype=dtypes.int64),), (constant_op.constant(
-                  [4., 5., 6., 7.], dtype=dtypes.float64),))))
-
-    # Incompatible types.
-    with self.assertRaises(TypeError):
-      iterator.make_initializer(
-          dataset_ops.Dataset.from_tensors((constant_op.constant(
-              [1, 2, 3], dtype=dtypes.int32), constant_op.constant(
-                  [4., 5., 6., 7.], dtype=dtypes.float32))))
-
-    # Incompatible shapes.
-    iterator = iterator_ops.Iterator.from_structure(
-        (dtypes.int64, dtypes.float64), ([None], []))
-    with self.assertRaises(TypeError):
-      iterator.make_initializer(
-          dataset_ops.Dataset.from_tensors((constant_op.constant(
-              [1, 2, 3], dtype=dtypes.int64), constant_op.constant(
-                  [4., 5., 6., 7.], dtype=dtypes.float64))))
-
-  def testIteratorStringHandle(self):
-    dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-    dataset_4 = dataset_ops.Dataset.from_tensor_slices([10, 20, 30, 40])
-
-    iterator_3 = dataset_3.make_one_shot_iterator()
-    iterator_4 = dataset_4.make_one_shot_iterator()
-
-    handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-    feedable_iterator = iterator_ops.Iterator.from_string_handle(
-        handle_placeholder, dataset_3.output_types, dataset_3.output_shapes)
-    next_element = feedable_iterator.get_next()
-
-    self.assertEqual(dataset_3.output_types, feedable_iterator.output_types)
-    self.assertEqual(dataset_4.output_types, feedable_iterator.output_types)
-    self.assertEqual([], feedable_iterator.output_shapes)
-
-    with self.test_session() as sess:
-      iterator_3_handle = sess.run(iterator_3.string_handle())
-      iterator_4_handle = sess.run(iterator_4.string_handle())
-
-      self.assertEqual(
-          10, sess.run(next_element,
-                       feed_dict={handle_placeholder: iterator_4_handle}))
-      self.assertEqual(
-          1, sess.run(next_element,
-                      feed_dict={handle_placeholder: iterator_3_handle}))
-      self.assertEqual(
-          20, sess.run(next_element,
-                       feed_dict={handle_placeholder: iterator_4_handle}))
-      self.assertEqual(
-          2, sess.run(next_element,
-                      feed_dict={handle_placeholder: iterator_3_handle}))
-      self.assertEqual(
-          30, sess.run(next_element,
-                       feed_dict={handle_placeholder: iterator_4_handle}))
-      self.assertEqual(
-          3, sess.run(next_element,
-                      feed_dict={handle_placeholder: iterator_3_handle}))
-      self.assertEqual(
-          40, sess.run(next_element,
-                       feed_dict={handle_placeholder: iterator_4_handle}))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element,
-                 feed_dict={handle_placeholder: iterator_3_handle})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element,
-                 feed_dict={handle_placeholder: iterator_4_handle})
-
-  def testIteratorStringHandleError(self):
-    dataset_int_scalar = (dataset_ops.Dataset.from_tensor_slices([1, 2,
-                                                                  3]).repeat())
-    dataset_float_vector = (dataset_ops.Dataset.from_tensors([1.0, 2.0, 3.0]))
-
-    handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-
-    feedable_int_scalar = iterator_ops.Iterator.from_string_handle(
-        handle_placeholder, dtypes.int32, [])
-    feedable_int_vector = iterator_ops.Iterator.from_string_handle(
-        handle_placeholder, dtypes.int32, [None])
-    feedable_int_any = iterator_ops.Iterator.from_string_handle(
-        handle_placeholder, dtypes.int32)
-
-    with self.test_session() as sess:
-      handle_int_scalar = sess.run(
-          dataset_int_scalar.make_one_shot_iterator().string_handle())
-      handle_float_vector = sess.run(
-          dataset_float_vector.make_one_shot_iterator().string_handle())
-
-      self.assertEqual(1,
-                       sess.run(
-                           feedable_int_scalar.get_next(),
-                           feed_dict={handle_placeholder: handle_int_scalar}))
-
-      self.assertEqual(2,
-                       sess.run(
-                           feedable_int_any.get_next(),
-                           feed_dict={handle_placeholder: handle_int_scalar}))
-
-      with self.assertRaises(errors.InvalidArgumentError):
-        print(sess.run(
-            feedable_int_vector.get_next(),
-            feed_dict={handle_placeholder: handle_int_scalar}))
-
-      with self.assertRaises(errors.InvalidArgumentError):
-        print(sess.run(
-            feedable_int_vector.get_next(),
-            feed_dict={handle_placeholder: handle_float_vector}))
-
-  def testRemoteIteratorUsingRemoteCallOpDirectSession(self):
-    worker_config = config_pb2.ConfigProto()
-    worker_config.device_count["CPU"] = 3
-
-    with ops.device("/job:localhost/replica:0/task:0/cpu:1"):
-      dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-      iterator_3 = dataset_3.make_one_shot_iterator()
-      iterator_3_handle = iterator_3.string_handle()
-
-    @function.Defun(dtypes.string)
-    def _remote_fn(h):
-      remote_iterator = iterator_ops.Iterator.from_string_handle(
-          h, dataset_3.output_types, dataset_3.output_shapes)
-      return remote_iterator.get_next()
-
-    with ops.device("/job:localhost/replica:0/task:0/cpu:0"):
-      target_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-      remote_op = functional_ops.remote_call(
-          args=[iterator_3_handle],
-          Tout=[dtypes.int32],
-          f=_remote_fn,
-          target=target_placeholder)
-
-    with self.test_session(config=worker_config) as sess:
-      elem = sess.run(
-          remote_op,
-          feed_dict={
-              target_placeholder: "/job:localhost/replica:0/task:0/cpu:1"
-          })
-      self.assertEqual(elem, [1])
-      # Fails when target is cpu:2 where the resource is not located.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(
-            remote_op,
-            feed_dict={
-                target_placeholder: "/job:localhost/replica:0/task:0/cpu:2"
-            })
-      elem = sess.run(
-          remote_op,
-          feed_dict={
-              target_placeholder: "/job:localhost/replica:0/task:0/cpu:1"
-          })
-      self.assertEqual(elem, [2])
-      elem = sess.run(
-          remote_op,
-          feed_dict={
-              target_placeholder: "/job:localhost/replica:0/task:0/cpu:1"
-          })
-      self.assertEqual(elem, [3])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(
-            remote_op,
-            feed_dict={
-                target_placeholder: "/job:localhost/replica:0/task:0/cpu:1"
-            })
-
-  def testRemoteIteratorUsingRemoteCallOpDirectSessionGPUCPU(self):
-    if not test_util.is_gpu_available():
-      self.skipTest("No GPU available")
-
-    with ops.device("/job:localhost/replica:0/task:0/cpu:0"):
-      dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-      iterator_3 = dataset_3.make_one_shot_iterator()
-      iterator_3_handle = iterator_3.string_handle()
-
-    def _encode_raw(byte_array):
-      return bytes(bytearray(byte_array))
-
-    @function.Defun(dtypes.uint8)
-    def _remote_fn(h):
-      handle = script_ops.py_func(_encode_raw, [h], dtypes.string)
-      remote_iterator = iterator_ops.Iterator.from_string_handle(
-          handle, dataset_3.output_types, dataset_3.output_shapes)
-      return remote_iterator.get_next()
-
-    with ops.device("/job:localhost/replica:0/task:0/device:GPU:0"):
-      target_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-      iterator_3_handle_uint8 = parsing_ops.decode_raw(
-          bytes=iterator_3_handle, out_type=dtypes.uint8)
-      remote_op = functional_ops.remote_call(
-          args=[iterator_3_handle_uint8],
-          Tout=[dtypes.int32],
-          f=_remote_fn,
-          target=target_placeholder)
-
-    with self.test_session() as sess:
-      elem = sess.run(
-          remote_op,
-          feed_dict={
-              target_placeholder: "/job:localhost/replica:0/task:0/cpu:0"
-          })
-      self.assertEqual(elem, [1])
-      elem = sess.run(
-          remote_op,
-          feed_dict={
-              target_placeholder: "/job:localhost/replica:0/task:0/cpu:0"
-          })
-      self.assertEqual(elem, [2])
-      elem = sess.run(
-          remote_op,
-          feed_dict={
-              target_placeholder: "/job:localhost/replica:0/task:0/cpu:0"
-          })
-      self.assertEqual(elem, [3])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(
-            remote_op,
-            feed_dict={
-                target_placeholder: "/job:localhost/replica:0/task:0/cpu:0"
-            })
-
-  def testIncorrectIteratorRestore(self):
-
-    def _path():
-      return os.path.join(self.get_temp_dir(), "iterator")
-
-    def _save_op(iterator_resource):
-      iterator_state_variant = gen_dataset_ops.serialize_iterator(
-          iterator_resource)
-      save_op = io_ops.write_file(
-          _path(), parsing_ops.serialize_tensor(iterator_state_variant))
-      return save_op
-
-    def _restore_op(iterator_resource):
-      iterator_state_variant = parsing_ops.parse_tensor(
-          io_ops.read_file(_path()), dtypes.variant)
-      restore_op = gen_dataset_ops.deserialize_iterator(iterator_resource,
-                                                        iterator_state_variant)
-      return restore_op
-
-    def _build_range_dataset_graph():
-      start = 1
-      stop = 10
-      iterator = dataset_ops.Dataset.range(start,
-                                           stop).make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      save_op = _save_op(iterator._iterator_resource)
-      restore_op = _restore_op(iterator._iterator_resource)
-      return init_op, get_next, save_op, restore_op
-
-    def _build_reader_dataset_graph():
-      filenames = ["test"]  # Does not exist but we don't care in this test.
-      iterator = readers.FixedLengthRecordDataset(
-          filenames, 1, 0, 0).make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next_op = iterator.get_next()
-      save_op = _save_op(iterator._iterator_resource)
-      restore_op = _restore_op(iterator._iterator_resource)
-      return init_op, get_next_op, save_op, restore_op
-
-    # Saving iterator for RangeDataset graph.
-    with ops.Graph().as_default() as g:
-      init_op, _, save_op, _ = _build_range_dataset_graph()
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        sess.run(save_op)
-
-    # Attempt to restore the saved iterator into an IteratorResource of
-    # incompatible type. An iterator of RangeDataset has output type int64,
-    # while an iterator of FixedLengthRecordDataset has output type string.
-    # So an InvalidArgumentError should be raised by
-    # IteratorResource::set_iterator.
-    with ops.Graph().as_default() as g:
-      _, _, _, restore_op = _build_reader_dataset_graph()
-      with self.test_session(graph=g) as sess:
-        with self.assertRaises(errors.InvalidArgumentError):
-          sess.run(restore_op)
-
-  def testToSingleElement(self):
-    skip_value = array_ops.placeholder(dtypes.int64, shape=[])
-    take_value = array_ops.placeholder_with_default(
-        constant_op.constant(1, dtype=dtypes.int64), shape=[])
-
-    dataset = (dataset_ops.Dataset.range(100)
-               .skip(skip_value)
-               .map(lambda x: x * x)
-               .take(take_value))
-
-    element = dataset_ops.get_single_element(dataset)
-
-    with self.test_session() as sess:
-      self.assertEqual(0, sess.run(element, feed_dict={skip_value: 0}))
-      self.assertEqual(25, sess.run(element, feed_dict={skip_value: 5}))
-      self.assertEqual(100, sess.run(element, feed_dict={skip_value: 10}))
-
-      with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   "Dataset was empty."):
-        sess.run(element, feed_dict={skip_value: 100})
-
-      with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   "Dataset had more than one element."):
-        sess.run(element, feed_dict={skip_value: 0, take_value: 2})
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/list_files_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/list_files_dataset_op_test.py
deleted file mode 100644
index 27298de65f90c627e5eb638385bfe0478ef74fca..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/data/python/kernel_tests/list_files_dataset_op_test.py
+++ /dev/null
@@ -1,159 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from os import path
-import shutil
-import tempfile
-
-from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
-from tensorflow.python.util import compat
-
-
-class ListFilesDatasetOpTest(test.TestCase):
-
-  def setUp(self):
-    self.tmp_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    shutil.rmtree(self.tmp_dir, ignore_errors=True)
-
-  def _touchTempFiles(self, filenames):
-    for filename in filenames:
-      open(path.join(self.tmp_dir, filename), 'a').close()
-
-  def testEmptyDirectory(self):
-    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'))
-    with self.test_session() as sess:
-      itr = dataset.make_one_shot_iterator()
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-  def testSimpleDirectory(self):
-    filenames = ['a', 'b', 'c']
-    self._touchTempFiles(filenames)
-
-    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'))
-    with self.test_session() as sess:
-      itr = dataset.make_one_shot_iterator()
-
-      full_filenames = []
-      produced_filenames = []
-      for filename in filenames:
-        full_filenames.append(
-            compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(itr.get_next())))
-      self.assertItemsEqual(full_filenames, produced_filenames)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-  def testEmptyDirectoryInitializer(self):
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-    dataset = dataset_ops.Dataset.list_files(filename_placeholder)
-
-    with self.test_session() as sess:
-      itr = dataset.make_initializable_iterator()
-      sess.run(
-          itr.initializer,
-          feed_dict={filename_placeholder: path.join(self.tmp_dir, '*')})
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-  def testSimpleDirectoryInitializer(self):
-    filenames = ['a', 'b', 'c']
-    self._touchTempFiles(filenames)
-
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-    dataset = dataset_ops.Dataset.list_files(filename_placeholder)
-
-    with self.test_session() as sess:
-      itr = dataset.make_initializable_iterator()
-      sess.run(
-          itr.initializer,
-          feed_dict={filename_placeholder: path.join(self.tmp_dir, '*')})
-
-      full_filenames = []
-      produced_filenames = []
-      for filename in filenames:
-        full_filenames.append(
-            compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(itr.get_next())))
-
-      self.assertItemsEqual(full_filenames, produced_filenames)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-  def testFileSuffixes(self):
-    filenames = ['a.txt', 'b.py', 'c.py', 'd.pyc']
-    self._touchTempFiles(filenames)
-
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-    dataset = dataset_ops.Dataset.list_files(filename_placeholder)
-
-    with self.test_session() as sess:
-      itr = dataset.make_initializable_iterator()
-      sess.run(
-          itr.initializer,
-          feed_dict={filename_placeholder: path.join(self.tmp_dir, '*.py')})
-
-      full_filenames = []
-      produced_filenames = []
-      for filename in filenames[1:-1]:
-        full_filenames.append(
-            compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(itr.get_next())))
-      self.assertItemsEqual(full_filenames, produced_filenames)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-  def testFileMiddles(self):
-    filenames = ['a.txt', 'b.py', 'c.pyc']
-    self._touchTempFiles(filenames)
-
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-    dataset = dataset_ops.Dataset.list_files(filename_placeholder)
-
-    with self.test_session() as sess:
-      itr = dataset.make_initializable_iterator()
-      sess.run(
-          itr.initializer,
-          feed_dict={filename_placeholder: path.join(self.tmp_dir, '*.py*')})
-
-      full_filenames = []
-      produced_filenames = []
-      for filename in filenames[1:]:
-        full_filenames.append(
-            compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(itr.get_next())))
-
-      self.assertItemsEqual(full_filenames, produced_filenames)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
index e9a07da84a8c80c09ebd4dab0b1d69febe1c9790..8d4042927970cab2f5a518fc0da49b38444dbcdf 100644
--- a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
@@ -16,16 +16,14 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-from collections import namedtuple
 
 import os
-import threading
 
 import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.contrib.data.python.ops import error_ops
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -33,15 +31,9 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import data_flow_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import io_ops
-from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import script_ops
-from tensorflow.python.ops import sparse_ops
-from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
@@ -49,227 +41,13 @@ from tensorflow.python.util import compat
 
 class MapDatasetTest(test.TestCase):
 
-  def _buildMapDataset(self, components, count):
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-    return (dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-            .repeat(count))
-
-  def testMapDataset(self):
-    """Test an dataset that maps a TF function across its input elements."""
-    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
-    # RepeatDataset(count).
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-    count = array_ops.placeholder(dtypes.int64, shape=[])
-
-    dataset = self._buildMapDataset(components, count)
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.test_session() as sess:
-      # Test single-threaded access to the iterator.
-      sess.run(init_op, feed_dict={count: 14})
-      for _ in range(14):
-        for i in range(7):
-          result = sess.run(get_next)
-          for component, result_component in zip(components, result):
-            self.assertAllEqual(component[i]**2, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test multi-threaded access to the same iterator.
-      sess.run(init_op, feed_dict={count: 18})
-      results = []
-      def iterator_thread():
-        while True:
-          try:
-            results.append(sess.run(get_next))
-          except errors.OutOfRangeError:
-            return
-      threads = [self.checkedThread(target=iterator_thread) for _ in range(8)]
-      for t in threads:
-        t.start()
-      for t in threads:
-        t.join()
-
-      # `results` will contain the same elements components**2
-      # repeated 18 times, but in a non-deterministic order. Sort the
-      # results, and assert that each element of components**2 is
-      # produced 18 times.
-      results.sort(key=lambda x: x[0])
-      for i in range(7):
-        for j in range(18):
-          for component, result_component in zip(components,
-                                                 results[i * 18 + j]):
-            self.assertAllEqual(component[i]**2, result_component)
-
-  def _buildParallelMapDataset(self, components, count, num_threads,
-                               output_buffer_size):
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-    return (dataset_ops.Dataset.from_tensor_slices(components).map(
-        _map_fn, num_threads=num_threads, output_buffer_size=output_buffer_size)
-            .repeat(count))
-
-  def testParallelMapDataset(self):
-    """Test an dataset that maps a TF function across its input elements."""
-    # The pipeline is TensorSliceDataset -> ParallelMapDataset(square_3) ->
-    # RepeatDataset(count).
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-    count = array_ops.placeholder(dtypes.int64, shape=[])
-    num_threads = array_ops.placeholder(dtypes.int32, shape=[])
-    output_buffer_size = array_ops.placeholder(dtypes.int64, shape=[])
-
-    dataset = self._buildParallelMapDataset(components, count, num_threads,
-                                            output_buffer_size)
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.test_session() as sess:
-      def do_test(num_threads_val, output_buffer_size_val):
-        # Test single-threaded access to the iterator.
-        sess.run(init_op, feed_dict={
-            count: 14,
-            num_threads: num_threads_val,
-            output_buffer_size: output_buffer_size_val})
-        for _ in range(14):
-          for i in range(7):
-            result = sess.run(get_next)
-            for component, result_component in zip(components, result):
-              self.assertAllEqual(component[i]**2, result_component)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-        # Test multi-threaded access to the same iterator.
-        sess.run(init_op, feed_dict={
-            count: 18,
-            num_threads: num_threads_val,
-            output_buffer_size: output_buffer_size_val})
-        results = []
-        def iterator_thread():
-          while True:
-            try:
-              results.append(sess.run(get_next))
-            except errors.OutOfRangeError:
-              return
-        threads = [self.checkedThread(target=iterator_thread)
-                   for _ in range(64)]
-        for t in threads:
-          t.start()
-        for t in threads:
-          t.join()
-
-        # `results` will contain the same elements components**2
-        # repeated 18 times, but in a non-deterministic order. Sort the
-        # results, and assert that each element of components**2 is
-        # produced 18 times.
-        results.sort(key=lambda x: x[0])
-        for i in range(7):
-          for j in range(18):
-            for component, result_component in zip(components,
-                                                   results[i * 18 + j]):
-              self.assertAllEqual(component[i]**2, result_component)
-
-      for num_threads_val, output_buffer_size_val in [
-          (1, 1), (1, 2), (2, 2), (2, 4), (8, 8), (8, 16)]:
-        do_test(num_threads_val, output_buffer_size_val)
-
-  def testImplicitDisposeParallelMapDataset(self):
-    # Tests whether a parallel map dataset will be cleaned up correctly when
-    # the pipeline does not run it until exhaustion.
-    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
-    # RepeatDataset(1000).
-    components = (np.arange(1000),
-                  np.array([[1, 2, 3]]) * np.arange(1000)[:, np.newaxis],
-                  np.array(37.0) * np.arange(1000))
-
-    dataset = self._buildParallelMapDataset(components, 1000, 100, 100)
-    # NOTE(mrry): Also test that the prefetching thread is cancelled correctly.
-    dataset = dataset.prefetch(100)
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for _ in range(3):
-        sess.run(get_next)
-
-  def testParallelMapUnspecifiedOutputSize(self):
-    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
-
-    dataset = (dataset_ops.Dataset.from_tensor_slices(components)
-               .map(lambda x: array_ops.check_numerics(x, "message"),
-                    num_threads=2))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for _ in range(3):
-        sess.run(get_next)
-
-  def testParallelMapError(self):
-    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
-
-    dataset = (dataset_ops.Dataset.from_tensor_slices(components)
-               .map(lambda x: array_ops.check_numerics(x, "message"),
-                    num_threads=2, output_buffer_size=2))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for _ in range(3):
-        sess.run(get_next)
-      # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
-      sess.run(get_next)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testPrefetchError(self):
-    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
-
-    dataset = (dataset_ops.Dataset.from_tensor_slices(components)
-               .map(lambda x: array_ops.check_numerics(x, "message"))
-               .prefetch(2))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for _ in range(3):
-        sess.run(get_next)
-      # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
-      sess.run(get_next)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
   def testMapIgnoreError(self):
     components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
 
-    dataset = (dataset_ops.Dataset.from_tensor_slices(components)
-               .map(lambda x: array_ops.check_numerics(x, "message")).apply(
-                   error_ops.ignore_errors()))
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .map(lambda x: array_ops.check_numerics(x, "message")).apply(
+            error_ops.ignore_errors()))
     iterator = dataset.make_initializable_iterator()
     init_op = iterator.initializer
     get_next = iterator.get_next()
@@ -284,10 +62,10 @@ class MapDatasetTest(test.TestCase):
   def testParallelMapIgnoreError(self):
     components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
 
-    dataset = (dataset_ops.Dataset.from_tensor_slices(components).map(
-        lambda x: array_ops.check_numerics(x, "message"),
-        num_threads=2,
-        output_buffer_size=2).apply(error_ops.ignore_errors()))
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(components).map(
+            lambda x: array_ops.check_numerics(x, "message"),
+            num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors()))
     iterator = dataset.make_initializable_iterator()
     init_op = iterator.initializer
     get_next = iterator.get_next()
@@ -308,9 +86,10 @@ class MapDatasetTest(test.TestCase):
     for filename in filenames:
       write_string_to_file(filename, filename)
 
-    dataset = (dataset_ops.Dataset.from_tensor_slices(filenames).map(
-        io_ops.read_file, num_threads=2, output_buffer_size=2).apply(
-            error_ops.ignore_errors()))
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(filenames).map(
+            io_ops.read_file, num_parallel_calls=2).prefetch(2).apply(
+                error_ops.ignore_errors()))
     iterator = dataset.make_initializable_iterator()
     init_op = iterator.initializer
     get_next = iterator.get_next()
@@ -334,321 +113,125 @@ class MapDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  def testCaptureHashTable(self):
-    # NOTE(mrry): We must use the V2 variants of `HashTable`
-    # etc. because these produce a `tf.resource`-typed output that is
-    # compatible with the in-graph function implementation.
-    default_val = -1
-    keys = constant_op.constant(["brain", "salad", "surgery"])
-    values = constant_op.constant([0, 1, 2], dtypes.int64)
-    table = lookup_ops.HashTable(
-        lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-
-    input_sentences = dataset_ops.Dataset.from_tensor_slices(
-        ["brain brain tank salad surgery", "surgery brain"])
-
-    iterator = (input_sentences
-                .map(lambda x: string_ops.string_split([x]).values)
-                .map(table.lookup)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(table.init)
-      sess.run(init_op)
-
-      print(sess.run(get_next))
-      print(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testCaptureQueue(self):
-    elements = np.random.randint(100, size=[200])
-    queue = data_flow_ops.FIFOQueue(200, dtypes.int64, shapes=[])
-    enqueue_op = queue.enqueue_many(elements)
-    close_op = queue.close()
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(-1)
-                .map(lambda _: queue.dequeue()).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(enqueue_op)
-      sess.run(close_op)
-      sess.run(init_op)
-      for element in elements:
-        self.assertEqual(element, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testCaptureSameResourceMultipleTimes(self):
-    elements = np.random.randint(100, size=[200])
-    queue = data_flow_ops.FIFOQueue(
-        200, dtypes.int64, shapes=[], shared_name="shared_queue")
-    queue_2 = data_flow_ops.FIFOQueue(
-        200, dtypes.int64, shapes=[], shared_name="shared_queue")
-
-    enqueue_op = queue.enqueue_many(elements)
-    close_op = queue.close()
+  def testCaptureResourceInMapFn(self):
 
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(-1)
-                .map(lambda _: (queue.dequeue(), queue_2.dequeue()))
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    def _build_ds(iterator):
 
-    with self.test_session() as sess:
-      sess.run(enqueue_op)
-      sess.run(close_op)
-      sess.run(init_op)
-      for i in range(100):
-        self.assertEqual(sorted([elements[i * 2], elements[i * 2 + 1]]),
-                         sorted(sess.run(get_next)))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+      def _map_fn(x):
+        get_next = iterator.get_next()
+        return x * get_next
 
-  def testCaptureVariable(self):
-    counter_var = variable_scope.get_variable(
-        "counter", (), dtypes.int32, use_resource=True)
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10)
-                .map(lambda _: counter_var.assign_add(1))
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+      return dataset_ops.Dataset.range(10).map(_map_fn)
 
-    with self.test_session() as sess:
-      sess.run(counter_var.initializer)
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual(i, sess.run(counter_var))
-        self.assertEqual(i + 1, sess.run(get_next))
-      self.assertEqual(10, sess.run(counter_var))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertEqual(10, sess.run(counter_var))
-
-  def testCaptureUninitializedVariableError(self):
-    counter_var = variable_scope.get_variable(
-        "counter", (), dtypes.int32, use_resource=True)
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10)
-                .map(lambda _: counter_var.assign_add(1))
-                .make_initializable_iterator())
-    init_op = iterator.initializer
+    def _build_graph():
+      captured_iterator = dataset_ops.Dataset.range(
+          10).make_initializable_iterator()
+      ds = _build_ds(captured_iterator)
+      iterator = ds.make_initializable_iterator()
+      init_op = iterator.initializer
+      get_next = iterator.get_next()
+      return captured_iterator.initializer, init_op, get_next
 
-    with self.test_session() as sess:
-      with self.assertRaisesRegexp(errors.FailedPreconditionError,
-                                   "Failed to capture resource"):
+    with ops.Graph().as_default() as g:
+      captured_init_op, init_op, get_next = _build_graph()
+      with self.test_session(graph=g) as sess:
+        sess.run(captured_init_op)
         sess.run(init_op)
+        for i in range(10):
+          self.assertEquals(i * i, sess.run(get_next))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
 
-  def testSeededStatefulOperatorIsProperlyStateful(self):
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10)
-                .map(lambda _: random_ops.random_uniform((), seed=11)).batch(2)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
 
-    with self.test_session() as sess:
-      sess.run(init_op)
-      random_values = []
-      with self.assertRaises(errors.OutOfRangeError):
-        while True:
-          random_values.extend(sess.run(get_next))
-      self.assertEqual(10, len(random_values))
-      self.assertGreater(np.abs(np.diff(random_values)).max(), 1e-6)
-      sess.run(init_op)
-      random_values_2 = []
-      with self.assertRaises(errors.OutOfRangeError):
-        while True:
-          random_values_2.extend(sess.run(get_next))
+class MapDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
 
-      # Randomness is repeatable given same seed
-      self.assertAllClose(random_values, random_values_2)
+  def setUp(self):
+    self._tensor_slice_len = 7
+    self._num_epochs = 14
+    self._num_outputs = self._tensor_slice_len * self._num_epochs
 
-  def testMapDict(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(lambda x: {"foo": x * 2, "bar": x ** 2})
-                .map(lambda d: d["foo"] + d["bar"])
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+  def _build_ds(self, multiplier=37.0):
+    components = (np.arange(self._tensor_slice_len), np.array([[1, 2, 3]]) *
+                  np.arange(self._tensor_slice_len)[:, np.newaxis],
+                  np.array(multiplier) * np.arange(self._tensor_slice_len))
 
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual(i * 2 + i ** 2, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
 
-  def testMapNamedtuple(self, count=10):
-    # construct dataset of tuples
-    labels = dataset_ops.Dataset.range(count)
-    images = labels.map(lambda l: -l)
-    dataset_tuple = dataset_ops.Dataset.zip((labels, images))
+    return (
+        dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
+        .repeat(self._num_epochs))
 
-    # convert dataset of tuples to dataset of namedtuples
-    example = namedtuple("Example", ["label", "image"])
-    dataset_namedtuple = dataset_tuple.map(example)
+  def testSaveRestoreCore(self):
+    self.run_core_tests(
+        self._build_ds,
+        lambda: self._build_ds(multiplier=15.0),
+        self._num_outputs)
 
-    def preprocess_tuple(label, image):
-      image = 2 * image
-      return label, image
+  def testSaveStatefulFunction(self):
 
-    def preprocess_namedtuple(example):
-      return example._replace(image=2 * example.image)
+    def _build_ds():
 
-    # preprocess both datasets
-    dataset_tuple = dataset_tuple.map(preprocess_tuple)
-    dataset_namedtuple = dataset_namedtuple.map(preprocess_namedtuple)
+      def _map_fn(x):
+        return random_ops.random_uniform(
+            (), 0, 10, dtype=dtypes.int32) * math_ops.to_int32(x)
 
-    next_tuple = dataset_tuple.make_one_shot_iterator().get_next()
-    next_namedtuple = dataset_namedtuple.make_one_shot_iterator().get_next()
+      return dataset_ops.Dataset.range(100).map(_map_fn)
 
-    # make sure both datasets contain the same data
-    with self.test_session() as sess:
-      for i in range(count):
-        tuple_, namedtuple_ = sess.run([next_tuple, next_namedtuple])
-        self.assertEqual(tuple_, namedtuple_)
-        self.assertEqual(tuple_, (i, -2 * i))
+    self.verify_error_on_save(_build_ds, 15, errors.InvalidArgumentError)
 
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_namedtuple)
-
-  def testUseStepContainerInMap(self):
-    row = np.arange(6)
-    iterator = (
-        dataset_ops.Dataset.from_tensors(row)
-        .map(lambda elems: functional_ops.map_fn(lambda x: x * x, elems))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+  def testCaptureVariableInMapFn(self):
 
-    with self.test_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(row ** 2, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    def _build_ds():
+      counter_var = variable_scope.get_variable(
+          "counter", (), dtypes.int32, use_resource=True)
+      return (dataset_ops.Dataset.from_tensors(0).repeat(10).map(
+          lambda _: counter_var.assign_add(1)))
 
-  def testPrefetch(self):
-    # We will use this event to test that `_map_py_func()` has been
-    # invoked a certain number of times (6 times, to be exact) after
-    # consuming fewer elements from the iterator.
-    ev = threading.Event()
+    self.verify_error_on_save(_build_ds, 15, errors.InvalidArgumentError)
 
-    set_event_during_invocation = 5
+  def testCaptureConstantInMapFn(self):
 
-    def _map_py_func(x):
-      if x == set_event_during_invocation:
-        ev.set()
-      return x * x
+    def _build_ds():
+      constant_var = constant_op.constant(5)
+      return (dataset_ops.Dataset.from_tensors(0).repeat(10).map(
+          lambda x: x + constant_var))
 
-    def _map_fn(x):
-      return script_ops.py_func(_map_py_func, [x], x.dtype)
+    self.run_core_tests(_build_ds, None, 10)
 
-    buffer_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = (
-        dataset_ops.Dataset.range(100)
-        .map(_map_fn)
-        .prefetch(buffer_size_placeholder)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+  def testCaptureDefunInMapFn(self):
+    num_outputs = 100
 
-    with self.test_session() as sess:
-      # Simple test that prefetch yields the expected values in the
-      # expected order.
-      for buffer_size in [1, 10, 100, 1000]:
-        sess.run(init_op, feed_dict={buffer_size_placeholder: buffer_size})
-        for i in range(100):
-          self.assertEqual(i * i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+    def _build_ds():
 
-      # We can indirectly observe that varying the buffer size has the
-      # intended effect by observing when `ev` is set (on the 6th
-      # invocation of `_map_py_func()`).
-      # NOTE(mrry): We do not test with `buffer_size ==
-      # set_event_during_invocation`, because we must consume at least
-      # one element to start the prefetching.
-      for buffer_size in range(1, set_event_during_invocation):
-        event_will_be_set_after_consuming = (
-            set_event_during_invocation - buffer_size + 1)
-
-        ev.clear()
-        sess.run(init_op, feed_dict={buffer_size_placeholder: buffer_size})
-        for i in range(event_will_be_set_after_consuming):
-          self.assertFalse(ev.is_set())
-          self.assertEqual(i * i, sess.run(get_next))
-        ev.wait()
-        for i in range(event_will_be_set_after_consuming, 100):
-          self.assertEqual(i * i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+      @function.Defun(dtypes.int64)
+      def defun_fn(x):
+        return constant_op.constant(1000) + math_ops.to_int32(x)
 
-  def testReturnList(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(lambda x: [x, constant_op.constant(37.0)])
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+      return dataset_ops.Dataset.range(num_outputs).map(defun_fn)
 
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual((i, 37.0), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    self.run_core_tests(_build_ds, None, num_outputs)
 
-  def testMultiOutputPyFunc(self):
-    # The `tf.py_func()` op returns a list of tensors for its outputs.
-    def _map_fn(x_tensor):
-      def _map_py_func(x):
-        return x, np.array(37.0, dtype=np.float64)
-      return script_ops.py_func(
-          _map_py_func, [x_tensor], [dtypes.int64, dtypes.float64])
-
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(_map_fn)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+  def testBuildDefunInMapFn(self):
+    num_outputs = 100
 
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual((i, 37.0), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    def _build_ds():
 
-  def assertSparseValuesEqual(self, a, b):
-    self.assertAllEqual(a.indices, b.indices)
-    self.assertAllEqual(a.values, b.values)
-    self.assertAllEqual(a.dense_shape, b.dense_shape)
+      @function.Defun(dtypes.int64)
+      def defun_fn(x):
 
-  def testSparse(self):
+        @function.Defun(dtypes.int32)
+        def defun_fn_deep(x):
+          return constant_op.constant(1000) + math_ops.to_int32(x)
 
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=np.array([[0, 0]]),
-          values=(i * np.array([1])),
-          dense_shape=np.array([1, 1]))
+        return constant_op.constant(11000) + defun_fn_deep(math_ops.to_int32(x))
 
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(_sparse)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+      return dataset_ops.Dataset.range(num_outputs).map(defun_fn)
 
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        actual = sess.run(get_next)
-        self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
-        self.assertSparseValuesEqual(actual, _sparse(i))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    self.run_core_tests(_build_ds, None, num_outputs)
 
-  def testSparseChain(self):
+  def testSparseCore(self):
 
     def _sparse(i):
       return sparse_tensor.SparseTensorValue(
@@ -656,58 +239,20 @@ class MapDatasetTest(test.TestCase):
           values=(i * np.array([1])),
           dense_shape=np.array([1, 1]))
 
-    def _check(i):
-      self.assertTrue(sparse_tensor.is_sparse(i))
-      return sparse_ops.sparse_concat(0, [i, i])
-
-    iterator = (
-        dataset_ops.Dataset.range(10).map(_sparse).map(_check)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        actual = sess.run(get_next)
-        self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
-        self.assertSparseValuesEqual(actual, _check(_sparse(i)).eval())
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testCaptureResourceInMapFn(self):
-
-    def _build_ds(iterator):
-
-      def _map_fn(x):
-        get_next = iterator.get_next()
-        return x * get_next
-
-      return dataset_ops.Dataset.range(10).map(_map_fn)
-
-    def _build_graph():
-      captured_iterator = dataset_ops.Dataset.range(
-          10).make_initializable_iterator()
-      ds = _build_ds(captured_iterator)
-      iterator = ds.make_initializable_iterator()
-      init_op = iterator.initializer
-      return captured_iterator.initializer, init_op
+    def _build_ds(num_outputs):
+      return dataset_ops.Dataset.range(num_outputs).map(_sparse)
 
-    with ops.Graph().as_default() as g:
-      captured_init_op, init_op = _build_graph()
-      with self.test_session(graph=g) as sess:
-        sess.run(captured_init_op)
-        with self.assertRaises(errors.UnimplementedError):
-          # CapturedFunction does not support capturing IteratorResource.
-          sess.run(init_op)
+    num_outputs = 10
+    self.run_core_tests(lambda: _build_ds(num_outputs),
+                        lambda: _build_ds(int(num_outputs / 2)), num_outputs)
 
 
-class MapDatasetSerializationTest(
+class ParallelMapDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
   def setUp(self):
     self._tensor_slice_len = 7
-    self._num_epochs = 14
+    self._num_epochs = 1
     self._num_outputs = self._tensor_slice_len * self._num_epochs
 
   def _build_ds(self, multiplier=37.0):
@@ -718,14 +263,26 @@ class MapDatasetSerializationTest(
     def _map_fn(x, y, z):
       return math_ops.square(x), math_ops.square(y), math_ops.square(z)
 
-    return (dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-            .repeat(self._num_epochs))
+    return (dataset_ops.Dataset.from_tensor_slices(components).map(
+        _map_fn, num_parallel_calls=3).repeat(self._num_epochs))
+
+  def _build_ds_with_prefetch(self, multiplier=37.0):
+    components = (np.arange(self._tensor_slice_len), np.array([[1, 2, 3]]) *
+                  np.arange(self._tensor_slice_len)[:, np.newaxis],
+                  np.array(multiplier) * np.arange(self._tensor_slice_len))
+
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    return (dataset_ops.Dataset.from_tensor_slices(components).map(
+        _map_fn, num_parallel_calls=3).repeat(self._num_epochs).prefetch(5))
 
   def testSaveRestoreCore(self):
-    self.run_core_tests(
-        self._build_ds,
-        lambda: self._build_ds(multiplier=15.0),
-        self._num_outputs)
+    for ds_fn in [self._build_ds, self._build_ds_with_prefetch]:
+      self.run_core_tests(
+          ds_fn,
+          lambda: ds_fn(multiplier=15.0),
+          self._num_outputs)
 
   def testSaveStatefulFunction(self):
 
@@ -735,7 +292,8 @@ class MapDatasetSerializationTest(
         return random_ops.random_uniform(
             (), 0, 10, dtype=dtypes.int32) * math_ops.to_int32(x)
 
-      return dataset_ops.Dataset.range(100).map(_map_fn)
+      return dataset_ops.Dataset.range(100).map(
+          _map_fn, num_parallel_calls=2).prefetch(2)
 
     self.verify_error_on_save(_build_ds, 15, errors.InvalidArgumentError)
 
@@ -745,10 +303,20 @@ class MapDatasetSerializationTest(
       counter_var = variable_scope.get_variable(
           "counter", (), dtypes.int32, use_resource=True)
       return (dataset_ops.Dataset.from_tensors(0).repeat(10).map(
-          lambda _: counter_var.assign_add(1)))
+          lambda _: counter_var.assign_add(1),
+          num_parallel_calls=2).prefetch(2))
 
     self.verify_error_on_save(_build_ds, 15, errors.InvalidArgumentError)
 
+  def testCaptureConstantInMapFn(self):
+
+    def _build_ds():
+      constant_var = constant_op.constant(5)
+      return (dataset_ops.Dataset.from_tensors(0).repeat(10).map(
+          lambda x: x + constant_var, num_parallel_calls=2).prefetch(2))
+
+    self.run_core_tests(_build_ds, None, 10)
+
   def testCaptureDefunInMapFn(self):
     num_outputs = 100
 
@@ -758,7 +326,8 @@ class MapDatasetSerializationTest(
       def defun_fn(x):
         return constant_op.constant(1000) + math_ops.to_int32(x)
 
-      return dataset_ops.Dataset.range(num_outputs).map(defun_fn)
+      return dataset_ops.Dataset.range(num_outputs).map(
+          defun_fn, num_parallel_calls=2).prefetch(2)
 
     self.run_core_tests(_build_ds, None, num_outputs)
 
@@ -776,7 +345,8 @@ class MapDatasetSerializationTest(
 
         return constant_op.constant(11000) + defun_fn_deep(math_ops.to_int32(x))
 
-      return dataset_ops.Dataset.range(num_outputs).map(defun_fn)
+      return dataset_ops.Dataset.range(num_outputs).map(
+          defun_fn, num_parallel_calls=2).prefetch(2)
 
     self.run_core_tests(_build_ds, None, num_outputs)
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py
index 8e6ad061a11752ab7b1ffc13c90b4fa52f67d6aa..80e1cb0041024b68bd5268b5de5d69c88c839896 100644
--- a/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py
@@ -19,160 +19,24 @@ from __future__ import print_function
 
 import os
 
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import counter
-from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.contrib.data.python.ops import enumerate_ops
-from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops
-from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import io_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import variables
-from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
-from tensorflow.python.training import saver as saver_lib
 
 
 class RangeDatasetTest(test.TestCase):
 
-  def tearDown(self):
-    # Remove all checkpoint files.
-    prefix = self._iterator_checkpoint_prefix()
-    pattern = prefix + "*"
-    files = gfile.Glob(pattern)
-    map(gfile.Remove, files)
-
-  def testStop(self):
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(stop).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op, feed_dict={stop: 5})
-      for i in range(5):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testStartStop(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start,
-                                         stop).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op, feed_dict={start: 2, stop: 5})
-      for i in range(2, 5):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testStartStopStep(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    step = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start, stop,
-                                         step).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op, feed_dict={start: 2, stop: 10, step: 2})
-      for i in range(2, 10, 2):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testZeroStep(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    step = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start, stop,
-                                         step).make_initializable_iterator()
-    init_op = iterator.initializer
-
-    with self.test_session() as sess:
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(init_op, feed_dict={start: 2, stop: 10, step: 0})
-
-  def testNegativeStep(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    step = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start, stop,
-                                         step).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op, feed_dict={start: 2, stop: 10, step: -1})
-      # This for loop is a no-op but will ensure that the implementation is
-      # consistent with range if it ever changes.
-      for i in range(2, 10, -1):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testStopLessThanStart(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start,
-                                         stop).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op, feed_dict={start: 10, stop: 2})
-      # This for loop is a no-op but will ensure that the implementation is
-      # consistent with range if it ever changes.
-      for i in range(10, 2):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testStopLessThanStartWithPositiveStep(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    step = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start, stop,
-                                         step).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op, feed_dict={start: 10, stop: 2, step: 2})
-      # This for loop is a no-op but will ensure that the implementation is
-      # consistent with range if it ever changes.
-      for i in range(10, 2, 2):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testStopLessThanStartWithNegativeStep(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    step = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start, stop,
-                                         step).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op, feed_dict={start: 10, stop: 2, step: -1})
-      for i in range(10, 2, -1):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
   def testEnumerateDataset(self):
     components = (["a", "b"], [1, 2], [37.0, 38])
     start = constant_op.constant(20, dtype=dtypes.int64)
@@ -216,20 +80,25 @@ class RangeDatasetTest(test.TestCase):
       self.assertEqual(-1, sess.run(negative_get_next))
       self.assertEqual(-2, sess.run(negative_get_next))
 
-  def _iterator_checkpoint_prefix(self):
+
+class RangeDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _iterator_checkpoint_prefix_local(self):
     return os.path.join(self.get_temp_dir(), "iterator")
 
   def _save_op(self, iterator_resource):
     iterator_state_variant = gen_dataset_ops.serialize_iterator(
         iterator_resource)
     save_op = io_ops.write_file(
-        self._iterator_checkpoint_prefix(),
+        self._iterator_checkpoint_prefix_local(),
         parsing_ops.serialize_tensor(iterator_state_variant))
     return save_op
 
   def _restore_op(self, iterator_resource):
     iterator_state_variant = parsing_ops.parse_tensor(
-        io_ops.read_file(self._iterator_checkpoint_prefix()), dtypes.variant)
+        io_ops.read_file(self._iterator_checkpoint_prefix_local()),
+        dtypes.variant)
     restore_op = gen_dataset_ops.deserialize_iterator(iterator_resource,
                                                       iterator_state_variant)
     return restore_op
@@ -283,382 +152,16 @@ class RangeDatasetTest(test.TestCase):
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
-  def testSaveRestoreUsingSaverFromMetaGraph(self):
-
-    def _build_graph(start, stop):
-      iterator = dataset_ops.Dataset.range(start,
-                                           stop).make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      ops.add_to_collection("iterator_ops", init_op)
-      ops.add_to_collection("iterator_ops", get_next)
-      saveable_obj = contrib_iterator_ops.make_saveable_from_iterator(iterator)
-      # Add the SaveableObject to the `SAVEABLE_OBJECTS` collection
-      # so that it can be automatically picked up by the Saver.
-      ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable_obj)
-      saver = saver_lib.Saver()
-      return init_op, get_next, saver
-
-    start = 2
-    stop = 10
-    break_point = 5
-    path = self._iterator_checkpoint_prefix()
-    meta_filename = path + ".meta"
-
-    # Execute input pipeline for a few steps and save iterator state.
-    with ops.Graph().as_default() as g:
-      init_op, get_next, saver = _build_graph(start, stop)
-      with self.test_session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
-        for i in range(start, break_point):
-          self.assertEqual(i, sess.run(get_next))
-        saver.save(sess, path)
-
-    # Build the saver from the MetaGraph using import_meta_graph and
-    # check that the iterator state is restored.
-    with ops.Graph().as_default() as g:
-      saver = saver_lib.import_meta_graph(meta_filename)
-      init_op, get_next = ops.get_collection("iterator_ops")
-      with self.test_session(graph=g) as sess:
-        saver.restore(sess, saver_lib.latest_checkpoint(self.get_temp_dir()))
-        for i in range(break_point, stop):
-          self.assertEqual(i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testSaveRestoreUsingBuiltSaver(self):
-
-    def _build_graph(start, stop):
-      iterator = dataset_ops.Dataset.range(start,
-                                           stop).make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      ops.add_to_collection("iterator_ops", init_op)
-      ops.add_to_collection("iterator_ops", get_next)
-      # Add the SaveableObject to the `SAVEABLE_OBJECTS` collection
-      # so that it can be automatically picked up by the Saver.
-      saveable_obj = contrib_iterator_ops.make_saveable_from_iterator(iterator)
-      ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable_obj)
-      saver = saver_lib.Saver()
-      return init_op, get_next, saver
-
-    start = 2
-    stop = 10
-    stop_new = 15
-    break_point = 5
-    path = self._iterator_checkpoint_prefix()
-
-    # Execute input pipeline for a few steps and save iterator state.
-    with ops.Graph().as_default() as g:
-      init_op, get_next, saver = _build_graph(start, stop)
-      with self.test_session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
-        for i in range(start, break_point):
-          self.assertEqual(i, sess.run(get_next))
-        saver.save(sess, path)
-
-    # Manually build a modified Graph and Saver instead of importing
-    # MetaGraph and verify that original iterator state gets restored.
-    with ops.Graph().as_default() as g:
-      init_op, get_next, saver = _build_graph(start, stop_new)
-      with self.test_session(graph=g) as sess:
-        saver.restore(sess, saver_lib.latest_checkpoint(self.get_temp_dir()))
-        for i in range(break_point, stop):
-          self.assertEqual(i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testSaveRestoreUsingSaverThenInit(self):
-
-    def _build_graph(start, stop):
-      iterator = dataset_ops.Dataset.range(start,
-                                           stop).make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      ops.add_to_collection("iterator_ops", init_op)
-      ops.add_to_collection("iterator_ops", get_next)
-      # Add the SaveableObject to the `SAVEABLE_OBJECTS` collection
-      # so that it can be automatically picked up by the Saver.
-      saveable_obj = contrib_iterator_ops.make_saveable_from_iterator(iterator)
-      ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable_obj)
-      saver = saver_lib.Saver()
-      return init_op, get_next, saver
-
-    start = 2
-    stop = 10
-    stop_new = 15
-    break_point = 5
-    path = self._iterator_checkpoint_prefix()
-
-    # Execute input pipeline for a few steps and save iterator state.
-    with ops.Graph().as_default() as g:
-      init_op, get_next, saver = _build_graph(start, stop)
-      with self.test_session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
-        for i in range(start, break_point):
-          self.assertEqual(i, sess.run(get_next))
-        saver.save(sess, path)
-
-    # Restore iterator state call and then call init_op for the iterator and
-    # verify that the new iterator hides the restored iterator.
-    with ops.Graph().as_default() as g:
-      init_op, get_next, saver = _build_graph(start, stop_new)
-      with self.test_session(graph=g) as sess:
-        saver.restore(sess, saver_lib.latest_checkpoint(self.get_temp_dir()))
-        sess.run(init_op)
-        for i in range(start, stop_new):
-          self.assertEqual(i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testRestoreWithoutBuildingDatasetGraph(self):
-
-    def _build_graph(start, stop, num_epochs):
-      dataset = dataset_ops.Dataset.range(start, stop).repeat(num_epochs)
-      iterator = dataset.make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      save_op = self._save_op(iterator._iterator_resource)
-      restore_op = self._restore_op(iterator._iterator_resource)
-      return init_op, get_next, save_op, restore_op
-
-    # Saving and restoring in different sessions.
-    start = 2
-    stop = 10
-    num_epochs = 5
-    break_point = 5
-    break_epoch = 3
-    with ops.Graph().as_default() as g:
-      init_op, get_next, save_op, _ = _build_graph(start, stop, num_epochs)
-      with self.test_session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
-        for _ in range(break_epoch):
-          for i in range(start, stop):
-            self.assertEqual(i, sess.run(get_next))
-        for i in range(start, break_point):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
-
-    with ops.Graph().as_default() as g:
-      # Create an empty IteratorResource and restore the Iterator into it.
-      output_types = dtypes.int64
-      output_shapes = tensor_shape.scalar()
-      iterator = iterator_ops.Iterator.from_structure(output_types,
-                                                      output_shapes)
-      restore_op = self._restore_op(iterator._iterator_resource)
-      get_next = iterator.get_next()
-      with self.test_session(graph=g) as sess:
-        sess.run(restore_op)
-        for i in range(break_point, stop):
-          self.assertEqual(i, sess.run(get_next))
-        for _ in range(break_epoch + 1, num_epochs):
-          for i in range(start, stop):
-            self.assertEqual(i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testRestoreInModifiedGraph(self):
+  def _build_range_dataset(self, start, stop):
+    return dataset_ops.Dataset.range(start, stop)
 
-    def _build_graph(start, stop):
-      dataset = dataset_ops.Dataset.range(start, stop)
-      iterator = dataset.make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      save_op = self._save_op(iterator._iterator_resource)
-      restore_op = self._restore_op(iterator._iterator_resource)
-      return init_op, get_next, save_op, restore_op
-
-    # Saving and restoring in different sessions.
+  def testRangeCore(self):
     start = 2
     stop = 10
     stop_1 = 8
-    break_point = 5
-    with ops.Graph().as_default() as g:
-      init_op, get_next, save_op, _ = _build_graph(start, stop)
-      with self.test_session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
-        for i in range(start, break_point):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
-
-    with ops.Graph().as_default() as g:
-      # Intentionally build a graph with a different value for stop to make sure
-      # the original dataset graph is actually getting loaded.
-      init_op, get_next, _, restore_op = _build_graph(start, stop_1)
-      with self.test_session(graph=g) as sess:
-        sess.run(restore_op)
-        for i in range(break_point, stop):
-          self.assertEqual(i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testInitThenRestore(self):
-    # Note: Calling init_op before restore_op is redundant. This test just makes
-    # sure we do not fail if restore is called on an already initialized
-    # iterator resource.
-
-    def _build_graph(start, stop):
-      dataset = dataset_ops.Dataset.range(start, stop)
-      iterator = dataset.make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      save_op = self._save_op(iterator._iterator_resource)
-      restore_op = self._restore_op(iterator._iterator_resource)
-      return init_op, get_next, save_op, restore_op
-
-    # Saving and restoring in different sessions.
-    start = 2
-    stop = 10
-    break_point = 5
-    with ops.Graph().as_default() as g:
-      init_op, get_next, save_op, _ = _build_graph(start, stop)
-      with self.test_session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
-        for i in range(start, break_point):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next, _, restore_op = _build_graph(start, stop)
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        sess.run(restore_op)
-        for i in range(break_point, stop):
-          self.assertEqual(i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testMultipleSaves(self):
-
-    def _build_graph(start, stop):
-      iterator = dataset_ops.Dataset.range(start,
-                                           stop).make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      save_op = self._save_op(iterator._iterator_resource)
-      restore_op = self._restore_op(iterator._iterator_resource)
-      return init_op, get_next, save_op, restore_op
-
-    start = 2
-    stop = 10
-    break_point1 = 5
-    break_point2 = 7
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next, save_op, _ = _build_graph(start, stop)
-      with self.test_session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
-        for i in range(start, break_point1):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next, save_op, restore_op = _build_graph(start, stop)
-      with self.test_session(graph=g) as sess:
-        sess.run(restore_op)
-        for i in range(break_point1, break_point2):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
-
-    break_point2 = 7
-    with ops.Graph().as_default() as g:
-      init_op, get_next, save_op, restore_op = _build_graph(start, stop)
-      with self.test_session(graph=g) as sess:
-        sess.run(restore_op)
-        for i in range(break_point2, stop):
-          self.assertEqual(i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testSaveRestoreWithRepeat(self):
-
-    def _build_graph(start, stop, num_epochs):
-      iterator = dataset_ops.Dataset.range(
-          start, stop).repeat(num_epochs).make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      save_op = self._save_op(iterator._iterator_resource)
-      restore_op = self._restore_op(iterator._iterator_resource)
-      return init_op, get_next, save_op, restore_op
-
-    start = 2
-    stop = 10
-    num_epochs = 5
-    break_range = 5
-    break_epoch = 3
-    with ops.Graph().as_default() as g:
-      init_op, get_next, save_op, restore_op = _build_graph(
-          start, stop, num_epochs)
-      with self.test_session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
-        # Note: There is no checkpoint saved currently so a NotFoundError is
-        # raised.
-        with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
-        for _ in range(break_epoch - 1):
-          for i in range(start, stop):
-            self.assertEqual(i, sess.run(get_next))
-        for i in range(start, break_range):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next, _, restore_op = _build_graph(start, stop, num_epochs)
-      with self.test_session(graph=g) as sess:
-        sess.run(restore_op)
-        for i in range(break_range, stop):
-          self.assertEqual(i, sess.run(get_next))
-        for _ in range(break_epoch, num_epochs):
-          for i in range(start, stop):
-            self.assertEqual(i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testSaveRestoreExhaustedIterator(self):
-
-    def _build_graph(start, stop, num_epochs):
-      iterator = dataset_ops.Dataset.range(
-          start, stop).repeat(num_epochs).make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      save_op = self._save_op(iterator._iterator_resource)
-      restore_op = self._restore_op(iterator._iterator_resource)
-      return init_op, get_next, save_op, restore_op
-
-    start = 2
-    stop = 10
-    num_epochs = 5
-    with ops.Graph().as_default() as g:
-      init_op, get_next, save_op, restore_op = _build_graph(
-          start, stop, num_epochs)
-      with self.test_session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
-        # Note: There is no checkpoint saved currently so a NotFoundError is
-        # raised.
-        with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
-        for _ in range(num_epochs):
-          for i in range(start, stop):
-            self.assertEqual(i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-        sess.run(save_op)
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next, _, restore_op = _build_graph(start, stop, num_epochs)
-      with self.test_session(graph=g) as sess:
-        sess.run(restore_op)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+    self.run_core_tests(lambda: self._build_range_dataset(start, stop),
+                        lambda: self._build_range_dataset(start, stop_1),
+                        stop - start)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
index 1c42a3d855bc16c21e385d7108c3106884ae4f5e..6efe97444a375febc550ff3a3ea04bcd9330a3a5 100644
--- a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
@@ -26,6 +26,7 @@ from tensorflow.contrib.data.python.ops import readers
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
 from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -76,101 +77,12 @@ class TextLineDatasetTestBase(test.TestCase):
     return filenames
 
 
-class TextLineDatasetTest(TextLineDatasetTestBase):
-
-  def _testTextLineDataset(self, compression_type=None):
-    test_filenames = self._createFiles(
-        2, 5, crlf=True, compression_type=compression_type)
-    filenames = array_ops.placeholder(dtypes.string, shape=[None])
-    num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
-    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-
-    repeat_dataset = readers.TextLineDataset(
-        filenames, compression_type=compression_type).repeat(num_epochs)
-    batch_dataset = repeat_dataset.batch(batch_size)
-
-    iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
-    init_op = iterator.make_initializer(repeat_dataset)
-    init_batch_op = iterator.make_initializer(batch_dataset)
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      # Basic test: read from file 0.
-      sess.run(
-          init_op, feed_dict={filenames: [test_filenames[0]],
-                              num_epochs: 1})
-      for i in range(5):
-        self.assertEqual(self._lineText(0, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Basic test: read from file 1.
-      sess.run(
-          init_op, feed_dict={filenames: [test_filenames[1]],
-                              num_epochs: 1})
-      for i in range(5):
-        self.assertEqual(self._lineText(1, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Basic test: read from both files.
-      sess.run(init_op, feed_dict={filenames: test_filenames, num_epochs: 1})
-      for j in range(2):
-        for i in range(5):
-          self.assertEqual(self._lineText(j, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test repeated iteration through both files.
-      sess.run(init_op, feed_dict={filenames: test_filenames, num_epochs: 10})
-      for _ in range(10):
-        for j in range(2):
-          for i in range(5):
-            self.assertEqual(self._lineText(j, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test batched and repeated iteration through both files.
-      sess.run(
-          init_batch_op,
-          feed_dict={filenames: test_filenames,
-                     num_epochs: 10,
-                     batch_size: 5})
-      for _ in range(10):
-        self.assertAllEqual([self._lineText(0, i) for i in range(5)],
-                            sess.run(get_next))
-        self.assertAllEqual([self._lineText(1, i) for i in range(5)],
-                            sess.run(get_next))
-
-  def testTextLineDatasetNoCompression(self):
-    self._testTextLineDataset()
-
-  def testTextLineDatasetGzipCompression(self):
-    self._testTextLineDataset(compression_type="GZIP")
-
-  def testTextLineDatasetZlibCompression(self):
-    self._testTextLineDataset(compression_type="ZLIB")
-
-  def testTextLineDatasetBuffering(self):
-    test_filenames = self._createFiles(2, 5, crlf=True)
-
-    repeat_dataset = readers.TextLineDataset(test_filenames, buffer_size=10)
-    iterator = repeat_dataset.make_one_shot_iterator()
-
-    with self.test_session() as sess:
-      for j in range(2):
-        for i in range(5):
-          self.assertEqual(self._lineText(j, i), sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-
 class TextLineDatasetSerializationTest(
     TextLineDatasetTestBase,
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
   def _build_iterator_graph(self, test_filenames, compression_type=None):
-    return readers.TextLineDataset(
+    return core_readers.TextLineDataset(
         test_filenames, compression_type=compression_type, buffer_size=10)
 
   def testTextLineCore(self):
@@ -217,101 +129,13 @@ class FixedLengthRecordReaderTestBase(test.TestCase):
     return filenames
 
 
-class FixedLengthRecordReaderTest(FixedLengthRecordReaderTestBase):
-
-  def testFixedLengthRecordDataset(self):
-    test_filenames = self._createFiles()
-    filenames = array_ops.placeholder(dtypes.string, shape=[None])
-    num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
-    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-
-    repeat_dataset = (readers.FixedLengthRecordDataset(
-        filenames, self._record_bytes, self._header_bytes, self._footer_bytes)
-                      .repeat(num_epochs))
-    batch_dataset = repeat_dataset.batch(batch_size)
-
-    iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
-    init_op = iterator.make_initializer(repeat_dataset)
-    init_batch_op = iterator.make_initializer(batch_dataset)
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      # Basic test: read from file 0.
-      sess.run(
-          init_op, feed_dict={filenames: [test_filenames[0]],
-                              num_epochs: 1})
-      for i in range(self._num_records):
-        self.assertEqual(self._record(0, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Basic test: read from file 1.
-      sess.run(
-          init_op, feed_dict={filenames: [test_filenames[1]],
-                              num_epochs: 1})
-      for i in range(self._num_records):
-        self.assertEqual(self._record(1, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Basic test: read from both files.
-      sess.run(init_op, feed_dict={filenames: test_filenames, num_epochs: 1})
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertEqual(self._record(j, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test repeated iteration through both files.
-      sess.run(init_op, feed_dict={filenames: test_filenames, num_epochs: 10})
-      for _ in range(10):
-        for j in range(self._num_files):
-          for i in range(self._num_records):
-            self.assertEqual(self._record(j, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test batched and repeated iteration through both files.
-      sess.run(
-          init_batch_op,
-          feed_dict={
-              filenames: test_filenames,
-              num_epochs: 10,
-              batch_size: self._num_records
-          })
-      for _ in range(10):
-        for j in range(self._num_files):
-          self.assertAllEqual(
-              [self._record(j, i) for i in range(self._num_records)],
-              sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFixedLengthRecordDatasetBuffering(self):
-    test_filenames = self._createFiles()
-    dataset = readers.FixedLengthRecordDataset(
-        test_filenames,
-        self._record_bytes,
-        self._header_bytes,
-        self._footer_bytes,
-        buffer_size=10)
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.test_session() as sess:
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertEqual(self._record(j, i), sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-
 class FixedLengthRecordDatasetSerializationTest(
     FixedLengthRecordReaderTestBase,
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
   def _build_iterator_graph(self, num_epochs, compression_type=None):
     filenames = self._createFiles()
-    return readers.FixedLengthRecordDataset(
+    return core_readers.FixedLengthRecordDataset(
         filenames, self._record_bytes, self._header_bytes,
         self._footer_bytes).repeat(num_epochs)
 
@@ -338,9 +162,8 @@ class TFRecordDatasetTestBase(test.TestCase):
     self.compression_type = array_ops.placeholder_with_default("", shape=[])
     self.batch_size = array_ops.placeholder(dtypes.int64, shape=[])
 
-    repeat_dataset = readers.TFRecordDataset(self.filenames,
-                                             self.compression_type).repeat(
-                                                 self.num_epochs)
+    repeat_dataset = core_readers.TFRecordDataset(
+        self.filenames, self.compression_type).repeat(self.num_epochs)
     batch_dataset = repeat_dataset.batch(self.batch_size)
 
     iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
@@ -363,129 +186,6 @@ class TFRecordDatasetTestBase(test.TestCase):
     return filenames
 
 
-class TFRecordDatasetTest(TFRecordDatasetTestBase):
-
-  def testReadOneEpoch(self):
-    with self.test_session() as sess:
-      # Basic test: read from file 0.
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.filenames: [self.test_filenames[0]],
-              self.num_epochs: 1
-          })
-      for i in range(self._num_records):
-        self.assertAllEqual(self._record(0, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-      # Basic test: read from file 1.
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.filenames: [self.test_filenames[1]],
-              self.num_epochs: 1
-          })
-      for i in range(self._num_records):
-        self.assertAllEqual(self._record(1, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-      # Basic test: read from both files.
-      sess.run(
-          self.init_op,
-          feed_dict={self.filenames: self.test_filenames,
-                     self.num_epochs: 1})
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertAllEqual(self._record(j, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-  def testReadTenEpochs(self):
-    with self.test_session() as sess:
-      sess.run(
-          self.init_op,
-          feed_dict={self.filenames: self.test_filenames,
-                     self.num_epochs: 10})
-      for _ in range(10):
-        for j in range(self._num_files):
-          for i in range(self._num_records):
-            self.assertAllEqual(self._record(j, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-  def testReadTenEpochsOfBatches(self):
-    with self.test_session() as sess:
-      sess.run(
-          self.init_batch_op,
-          feed_dict={
-              self.filenames: self.test_filenames,
-              self.num_epochs: 10,
-              self.batch_size: self._num_records
-          })
-      for _ in range(10):
-        for j in range(self._num_files):
-          values = sess.run(self.get_next)
-          self.assertAllEqual(
-              [self._record(j, i) for i in range(self._num_records)], values)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-  def testReadZlibFiles(self):
-    zlib_files = []
-    for i, fn in enumerate(self.test_filenames):
-      with open(fn, "rb") as f:
-        cdata = zlib.compress(f.read())
-
-        zfn = os.path.join(self.get_temp_dir(), "tfrecord_%s.z" % i)
-        with open(zfn, "wb") as f:
-          f.write(cdata)
-        zlib_files.append(zfn)
-
-    with self.test_session() as sess:
-      sess.run(
-          self.init_op,
-          feed_dict={self.filenames: zlib_files,
-                     self.compression_type: "ZLIB"})
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertAllEqual(self._record(j, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-  def testReadGzipFiles(self):
-    gzip_files = []
-    for i, fn in enumerate(self.test_filenames):
-      with open(fn, "rb") as f:
-        gzfn = os.path.join(self.get_temp_dir(), "tfrecord_%s.gz" % i)
-        with gzip.GzipFile(gzfn, "wb") as gzf:
-          gzf.write(f.read())
-        gzip_files.append(gzfn)
-
-    with self.test_session() as sess:
-      sess.run(
-          self.init_op,
-          feed_dict={self.filenames: gzip_files,
-                     self.compression_type: "GZIP"})
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertAllEqual(self._record(j, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-  def testReadWithBuffer(self):
-    one_mebibyte = 2**20
-    d = readers.TFRecordDataset(self.test_filenames, buffer_size=one_mebibyte)
-    iterator = d.make_one_shot_iterator()
-    with self.test_session() as sess:
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertAllEqual(self._record(j, i), sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-
 class TFRecordDatasetSerializationTest(
     TFRecordDatasetTestBase,
     dataset_serialization_test_base.DatasetSerializationTestBase):
@@ -517,7 +217,7 @@ class TFRecordDatasetSerializationTest(
           gzip_files.append(gzfn)
       filenames = gzip_files
 
-    return readers.TFRecordDataset(
+    return core_readers.TFRecordDataset(
         filenames, compression_type,
         buffer_size=buffer_size).repeat(num_epochs).batch(batch_size)
 
@@ -575,7 +275,7 @@ class ReadBatchFeaturesTest(test.TestCase):
             "record": parsing_ops.FixedLenFeature([], dtypes.int64),
             "keywords": parsing_ops.VarLenFeature(dtypes.string)
         },
-        reader=readers.TFRecordDataset,
+        reader=core_readers.TFRecordDataset,
         randomize_input=False,
         num_epochs=self.num_epochs)
 
@@ -714,12 +414,11 @@ class ReadBatchFeaturesTest(test.TestCase):
               self._next_actual_batch(sess)
 
   def testReadWithEquivalentDataset(self):
-    # TODO(mrry): Add support for tf.SparseTensor as a Dataset component.
     features = {
         "file": parsing_ops.FixedLenFeature([], dtypes.int64),
         "record": parsing_ops.FixedLenFeature([], dtypes.int64),
     }
-    dataset = (readers.TFRecordDataset(self.test_filenames)
+    dataset = (core_readers.TFRecordDataset(self.test_filenames)
                .map(lambda x: parsing_ops.parse_single_example(x, features))
                .repeat(10).batch(2))
     iterator = dataset.make_initializable_iterator()
diff --git a/tensorflow/contrib/data/python/kernel_tests/resample_test.py b/tensorflow/contrib/data/python/kernel_tests/resample_test.py
index 0ac8d7359f7234d98167277724780bf31555e6fb..3c7b46629edb13459766b5ef3f392e8d00ad4db8 100644
--- a/tensorflow/contrib/data/python/kernel_tests/resample_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/resample_test.py
@@ -19,8 +19,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.contrib.data.python.ops import resampling
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py
index 5338ec56bf275e481a984964e39aa0c1ade3a752..e0494736b72ae52f586cb80d42a5c1e50ac17a61 100644
--- a/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py
@@ -21,6 +21,7 @@ import itertools
 
 import numpy as np
 
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import scan_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
@@ -124,5 +125,18 @@ class ScanDatasetTest(test.TestCase):
           scan_ops.scan(constant_op.constant(1, dtype=dtypes.int32), _scan_fn))
 
 
+class ScanDatasetSerialzationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_dataset(self, num_elements):
+    return dataset_ops.Dataset.from_tensors(1).repeat(num_elements).apply(
+        scan_ops.scan([0, 1], lambda a, _: ([a[1], a[0] + a[1]], a[1])))
+
+  def testScanCore(self):
+    num_output = 5
+    self.run_core_tests(lambda: self._build_dataset(num_output),
+                        lambda: self._build_dataset(2), num_output)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
index 1a26da82e533ec01106ea10525c1cd96627c34fb..36ddf3004237ed042f21d691d83eafbaa20621e6 100644
--- a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
@@ -20,194 +20,10 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.ops import array_ops
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.platform import test
 
 
-class SequenceDatasetTest(test.TestCase):
-
-  def testRepeatTensorDataset(self):
-    """Test a dataset that repeats its input multiple times."""
-    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
-    # This placeholder can be fed when dataset-definition subgraph
-    # runs (i.e. `init_op` below) to configure the number of
-    # repetitions used in a particular iterator.
-    count_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-
-    iterator = (dataset_ops.Dataset.from_tensors(components)
-                .repeat(count_placeholder).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape for c in components],
-                     [t.shape for t in get_next])
-
-    with self.test_session() as sess:
-      # Test a finite repetition.
-      sess.run(init_op, feed_dict={count_placeholder: 3})
-      for _ in range(3):
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component, result_component)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test a different finite repetition.
-      sess.run(init_op, feed_dict={count_placeholder: 7})
-      for _ in range(7):
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test an empty repetition.
-      sess.run(init_op, feed_dict={count_placeholder: 0})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test an infinite repetition.
-      # NOTE(mrry): There's not a good way to test that the sequence
-      # actually is infinite.
-      sess.run(init_op, feed_dict={count_placeholder: -1})
-      for _ in range(17):
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component, result_component)
-
-  def testTakeTensorDataset(self):
-    components = (np.arange(10),)
-    count_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
-                .take(count_placeholder).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.test_session() as sess:
-      # Take fewer than input size
-      sess.run(init_op, feed_dict={count_placeholder: 4})
-      for i in range(4):
-        results = sess.run(get_next)
-        self.assertAllEqual(results, components[0][i:i+1])
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Take more than input size
-      sess.run(init_op, feed_dict={count_placeholder: 25})
-      for i in range(10):
-        results = sess.run(get_next)
-        self.assertAllEqual(results, components[0][i:i+1])
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Take all of input
-      sess.run(init_op, feed_dict={count_placeholder: -1})
-      for i in range(10):
-        results = sess.run(get_next)
-        self.assertAllEqual(results, components[0][i:i+1])
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Take nothing
-      sess.run(init_op, feed_dict={count_placeholder: 0})
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testSkipTensorDataset(self):
-    components = (np.arange(10),)
-    count_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
-                .skip(count_placeholder).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.test_session() as sess:
-      # Skip fewer than input size, we should skip
-      # the first 4 elements and then read the rest.
-      sess.run(init_op, feed_dict={count_placeholder: 4})
-      for i in range(4, 10):
-        results = sess.run(get_next)
-        self.assertAllEqual(results, components[0][i:i+1])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Skip more than input size: get nothing.
-      sess.run(init_op, feed_dict={count_placeholder: 25})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Skip exactly input size.
-      sess.run(init_op, feed_dict={count_placeholder: 10})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Set -1 for 'count': skip the entire dataset.
-      sess.run(init_op, feed_dict={count_placeholder: -1})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Skip nothing
-      sess.run(init_op, feed_dict={count_placeholder: 0})
-      for i in range(0, 10):
-        results = sess.run(get_next)
-        self.assertAllEqual(results, components[0][i:i+1])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testRepeatRepeatTensorDataset(self):
-    """Test the composition of repeat datasets."""
-    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
-    inner_count = array_ops.placeholder(dtypes.int64, shape=[])
-    outer_count = array_ops.placeholder(dtypes.int64, shape=[])
-
-    iterator = (dataset_ops.Dataset.from_tensors(components).repeat(inner_count)
-                .repeat(outer_count).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape for c in components],
-                     [t.shape for t in get_next])
-
-    with self.test_session() as sess:
-      sess.run(init_op, feed_dict={inner_count: 7, outer_count: 14})
-      for _ in range(7 * 14):
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testRepeatEmptyDataset(self):
-    """Test that repeating an empty dataset does not hang."""
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10).skip(10)
-                .repeat(-1).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      with self.assertRaisesRegexp(
-          errors.OutOfRangeError,
-          "Attempted to repeat an empty dataset infinitely."):
-        sess.run(get_next)
-
-
 class SequenceDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization_integration_test.py b/tensorflow/contrib/data/python/kernel_tests/serialization_integration_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a6b74dc3eb80a6168117beed06935737198cecb
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/serialization_integration_test.py
@@ -0,0 +1,85 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Integration test for input pipeline serialization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import saver as saver_lib
+
+
+class MultipleInputPipelinesTest(test.TestCase):
+
+  def _build_input_pipeline(self, name, num_outputs):
+    with ops.name_scope(name):
+      ds = dataset_ops.Dataset.range(num_outputs).shuffle(
+          10, reshuffle_each_iteration=False).prefetch(10)
+      iterator = ds.make_initializable_iterator()
+      saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
+      ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+      return iterator.initializer, iterator.get_next()
+
+  def _build_graph(self, num_pipelines, num_outputs):
+    init_ops = []
+    get_next_ops = []
+    for i in range(num_pipelines):
+      name = "input_pipeline_%d" % i
+      init_op, get_next_op = self._build_input_pipeline(name, num_outputs)
+      init_ops.append(init_op)
+      get_next_ops.append(get_next_op)
+    saver = saver_lib.Saver()
+    return init_ops, get_next_ops, saver
+
+  def _ckpt_path(self):
+    return os.path.join(self.get_temp_dir(), "iterator")
+
+  def testConcurrentSaves(self):
+    num_pipelines = 100
+    num_outputs = 100
+    break_point = 10
+    all_outputs = [[] for _ in range(num_pipelines)]
+    with ops.Graph().as_default() as g:
+      init_ops, get_next_ops, saver = self._build_graph(num_pipelines,
+                                                        num_outputs)
+      with self.test_session(graph=g) as sess:
+        sess.run(init_ops)
+        for _ in range(break_point):
+          output = sess.run(get_next_ops)
+          for i in range(num_pipelines):
+            all_outputs[i].append(output[i])
+        saver.save(sess, self._ckpt_path())
+
+    with ops.Graph().as_default() as g:
+      init_ops, get_next_ops, saver = self._build_graph(num_pipelines,
+                                                        num_outputs)
+      with self.test_session(graph=g) as sess:
+        saver.restore(sess, self._ckpt_path())
+        for _ in range(num_outputs - break_point):
+          output = sess.run(get_next_ops)
+          for i in range(num_pipelines):
+            all_outputs[i].append(output[i])
+
+    for output in all_outputs:
+      self.assertSequenceEqual(sorted(output), range(num_outputs))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/shard_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/shard_dataset_op_test.py
deleted file mode 100644
index 0b3c32c06eb1d69244c9a02ca4ba571769f13f40..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/data/python/kernel_tests/shard_dataset_op_test.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.python.framework import errors
-from tensorflow.python.platform import test
-
-
-class ShardDatasetOpTest(test.TestCase):
-
-  def testSimpleCase(self):
-    dataset = dataset_ops.Dataset.range(10).shard(5, 2)
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.test_session() as sess:
-      self.assertEqual(2, sess.run(iterator.get_next()))
-      self.assertEqual(7, sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-  def testNestedData(self):
-    dataset_a = dataset_ops.Dataset.range(10)
-    dataset_b = dataset_ops.Dataset.range(10, 0, -1)
-    dataset = dataset_ops.Dataset.zip((dataset_a, dataset_b)).shard(5, 2)
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.test_session() as sess:
-      self.assertEqual((2, 8), sess.run(iterator.get_next()))
-      self.assertEqual((7, 3), sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-  def testOffsetZero(self):
-    dataset = dataset_ops.Dataset.range(10).shard(5, 0)
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.test_session() as sess:
-      self.assertEqual(0, sess.run(iterator.get_next()))
-      self.assertEqual(5, sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-  def testOffsetGreaterNumShards(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(5, 7)
-
-  def testNegativeOffset(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(5, -3)
-
-  def testNegativeNumShards(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(-3, 1)
-
-  def testZeroNumShards(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(0, 1)
-
-  def testIteratorEndsBeforeFirstElem(self):
-    dataset = dataset_ops.Dataset.range(1).shard(5, 2)
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.test_session() as sess:
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-  def testLargerWorkerPool(self):
-    dataset = dataset_ops.Dataset.range(10).shard(7, 5)
-    iterator = dataset.make_one_shot_iterator()
-    with self.test_session() as sess:
-      self.assertEqual(5, sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-  def testIndexEqualsNumShards(self):
-    dataset = dataset_ops.Dataset.range(10).shard(5, 4)
-    iterator = dataset.make_one_shot_iterator()
-    with self.test_session() as sess:
-      self.assertEqual(4, sess.run(iterator.get_next()))
-      self.assertEqual(9, sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-  def testIndexEqualsNumShards2(self):
-    dataset = dataset_ops.Dataset.range(10).shard(4, 3)
-    iterator = dataset.make_one_shot_iterator()
-    with self.test_session() as sess:
-      self.assertEqual(3, sess.run(iterator.get_next()))
-      self.assertEqual(7, sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
index 6b5b53cc0f8f2d1df5622a5bc5e2f8ef04c6342a..bcc644c0971854d948025009dc7add2fea214048 100644
--- a/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
@@ -17,461 +17,145 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import os
-
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import dataset_ops as contrib_dataset_ops
-from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
+from tensorflow.contrib.data.python.ops import shuffle_ops
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
-from tensorflow.python.training import saver as saver_lib
-
-
-class ShuffleDatasetTest(test.TestCase):
-
-  def testShuffleDataset(self):
-    components = (
-        np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
-        np.array([9.0, 10.0, 11.0, 12.0])
-    )
-    count_placeholder = array_ops.placeholder_with_default(
-        constant_op.constant(5, dtypes.int64), shape=[])
-    buffer_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-    seed_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-
-    repeat_dataset = (
-        contrib_dataset_ops.Dataset.from_tensor_slices(components)
-        .repeat(count_placeholder))
-
-    shuffle_dataset = repeat_dataset.shuffle(buffer_size_placeholder,
-                                             seed_placeholder)
-
-    self.assertEqual(tuple([c.shape[1:] for c in components]),
-                     shuffle_dataset.output_shapes)
-
-    # Create initialization ops for iterators without and with
-    # shuffling, respectively.
-    iterator = iterator_ops.Iterator.from_structure(
-        shuffle_dataset.output_types, shuffle_dataset.output_shapes)
-    init_fifo_op = iterator.make_initializer(repeat_dataset)
-    init_shuffle_op = iterator.make_initializer(shuffle_dataset)
-
-    get_next = iterator.get_next()
 
-    with self.test_session() as sess:
-      # First run without shuffling to collect the "ground truth".
-      sess.run(init_fifo_op)
-      unshuffled_elements = []
-      for _ in range(20):
-        unshuffled_elements.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
 
-      # Assert that the shuffled dataset has the same elements as the
-      # "ground truth".
-      sess.run(
-          init_shuffle_op,
-          feed_dict={buffer_size_placeholder: 100,
-                     seed_placeholder: 37})
-      shuffled_elements = []
-      for _ in range(20):
-        shuffled_elements.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertAllEqual(
-          sorted(unshuffled_elements), sorted(shuffled_elements))
-
-      # Assert that shuffling twice with the same seeds gives the same sequence.
-      sess.run(
-          init_shuffle_op,
-          feed_dict={buffer_size_placeholder: 100,
-                     seed_placeholder: 37})
-      reshuffled_elements_same_seed = []
-      for _ in range(20):
-        reshuffled_elements_same_seed.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertEqual(shuffled_elements, reshuffled_elements_same_seed)
-
-      # Assert that shuffling twice with a different seed gives a different
-      # permutation of the same elements.
-      sess.run(
-          init_shuffle_op,
-          feed_dict={buffer_size_placeholder: 100,
-                     seed_placeholder: 1037})
-      reshuffled_elements_different_seed = []
-      for _ in range(20):
-        reshuffled_elements_different_seed.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertNotEqual(shuffled_elements, reshuffled_elements_different_seed)
-      self.assertAllEqual(
-          sorted(shuffled_elements), sorted(reshuffled_elements_different_seed))
-
-      # Assert that the shuffled dataset has the same elements as the
-      # "ground truth" when the buffer size is smaller than the input
-      # dataset.
-      sess.run(
-          init_shuffle_op,
-          feed_dict={buffer_size_placeholder: 2,
-                     seed_placeholder: 37})
-      reshuffled_elements_small_buffer = []
-      for _ in range(20):
-        reshuffled_elements_small_buffer.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertAllEqual(
-          sorted(unshuffled_elements), sorted(reshuffled_elements_small_buffer))
-
-      # Test the case of shuffling an empty dataset.
-      sess.run(init_shuffle_op, feed_dict={buffer_size_placeholder: 2,
-                                           seed_placeholder: 37,
-                                           count_placeholder: 0})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testDefaultArguments(self):
-    components = [0, 1, 2, 3, 4]
-    iterator = (
-        contrib_dataset_ops.Dataset.from_tensor_slices(components).shuffle(5)
-        .repeat().make_one_shot_iterator())
-
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      counts = collections.defaultdict(lambda: 0)
-      for _ in range(10):
-        for _ in range(5):
-          counts[sess.run(get_next)] += 1
-
-    for i in range(5):
-      self.assertEqual(10, counts[i])
+class ShuffleDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
 
-
-class ShuffleDatasetSerializationTest(test.TestCase):
-
-  def tearDown(self):
-    # Remove all checkpoint files.
-    prefix = self._ckpt_path()
-    pattern = prefix + "*"
-    files = gfile.Glob(pattern)
-    map(gfile.Remove, files)
-
-  def _build_graph(self,
-                   range_limit=10,
-                   num_repeats=5,
-                   buffer_size=5,
-                   seed=None,
-                   reshuffle_each_iteration=None,
-                   build_saveable=True):
-    iterator = dataset_ops.Dataset.range(range_limit).shuffle(
+  def _build_shuffle_dataset(
+      self,
+      range_limit=10,
+      num_repeats=5,
+      buffer_size=5,
+      seed=None,
+      reshuffle_each_iteration=None,
+  ):
+    return dataset_ops.Dataset.range(range_limit).shuffle(
         buffer_size,
         seed=seed,
-        reshuffle_each_iteration=reshuffle_each_iteration).repeat(
-            num_repeats).make_initializable_iterator()
-    if build_saveable:
-      saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
-      ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-    ops.add_to_collection("iterator_ops", init_op)
-    ops.add_to_collection("iterator_ops", get_next)
-    saver = saver_lib.Saver(allow_empty=True)
-    return init_op, get_next, saver
-
-  def _ckpt_path(self):
-    return os.path.join(self.get_temp_dir(), "iterator")
-
-  def _latest_ckpt(self):
-    return saver_lib.latest_checkpoint(self.get_temp_dir())
-
-  def _save(self, sess, saver):
-    saver.save(sess, self._ckpt_path())
-
-  def _restore(self, saver, sess):
-    saver.restore(sess, self._latest_ckpt())
-
-  def _import_meta_graph(self):
-    meta_file_path = self._ckpt_path() + ".meta"
-    return saver_lib.import_meta_graph(meta_file_path)
-
-  def _testReadWithBreaks(self, break_points, init_before_restore=False):
-    seed = 55
-    range_limit = 10
-    num_repeats = 5
-    num_outputs = range_limit * num_repeats
-    buffer_sizes = [1, 3, 8, 10, 25, 50]
-    reshuffle_each_iteration = False
-    for buffer_size in buffer_sizes:
-      expected = []
-      actual = []
-      # Generate the ground truth.
-      with ops.Graph().as_default() as g:
-        g.seed = 10
-        init_op, get_next_op, _ = self._build_graph(
-            range_limit=range_limit,
-            num_repeats=num_repeats,
-            buffer_size=buffer_size,
-            seed=seed,
-            reshuffle_each_iteration=reshuffle_each_iteration)
-        with self.test_session(graph=g) as sess:
-          sess.run(init_op)
-          for _ in range(num_outputs):
-            expected.append(sess.run(get_next_op))
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
-
-      # Run and checkpoint after first break_point.
-      with ops.Graph().as_default() as g:
-        g.seed = 10
-        init_op, get_next_op, saver = self._build_graph(
-            range_limit=range_limit,
-            num_repeats=num_repeats,
-            buffer_size=buffer_size,
-            seed=seed,
-            reshuffle_each_iteration=reshuffle_each_iteration)
-        with self.test_session(graph=g) as sess:
-          sess.run(init_op)
-          for _ in range(break_points[0]):
-            actual.append(sess.run(get_next_op))
-          self._save(sess, saver)
+        reshuffle_each_iteration=reshuffle_each_iteration).repeat(num_repeats)
 
-      # Load from checkpoint and continue running while stopping at each
-      # subsequent checkpoint.
-      for i in range(len(break_points)):
-        with ops.Graph().as_default() as g:
-          saver = self._import_meta_graph()
-          init_op, get_next_op = ops.get_collection("iterator_ops")
-          with self.test_session(graph=g) as sess:
-            if init_before_restore:
-              sess.run(init_op)
-            self._restore(saver, sess)
-            start = break_points[i]
-            end = break_points[
-                i + 1] if i < len(break_points) - 1 else num_outputs
-            for _ in range(end - start):
-              actual.append(sess.run(get_next_op))
-            self._save(sess, saver)
-            if end == num_outputs:
-              with self.assertRaises(errors.OutOfRangeError):
-                sess.run(get_next_op)
-      self.assertEqual(expected, actual)
+  def testShuffleCore(self):
 
-  def testSaveRestore(self):
-    self._testReadWithBreaks([8])  # rng buffer_size: 0
-    self._testReadWithBreaks([13])  # rng buffer_size: 1
-    self._testReadWithBreaks([18])  # rng buffer_size: 2
-    self._testReadWithBreaks([23])  # rng buffer_size: 3
-
-  def testSaveUnusedIterator(self):
-    self._testReadWithBreaks([0])
-
-  def testSaveFullyUsedIterator(self):
-    self._testReadWithBreaks([50])
-
-  def testMultipleBreaks(self):
-    self._testReadWithBreaks([0, 5, 9, 15, 25, 32])
-
-  def testIdempotence(self):
-    # Attempt to save iterator immediately after restoring.
-    self._testReadWithBreaks([1, 1, 5, 5, 5, 25, 32])
-
-  def testInitThenRestore(self):
-    self._testReadWithBreaks([0, 5, 9, 15, 25, 32], init_before_restore=True)
-
-  def testRestoreExhaustedIterator(self):
-    seed = 55
-    range_limit = 10
-    num_repeats = 5
-    num_outputs = range_limit * num_repeats
-    buffer_sizes = [1, 3, 8, 10, 25, 50]
-    reshuffle_each_iteration = False
-    for buffer_size in buffer_sizes:
-      with ops.Graph().as_default() as g:
-        g.seed = 10
-        init_op, get_next_op, saver = self._build_graph(
-            range_limit=range_limit,
-            num_repeats=num_repeats,
-            buffer_size=buffer_size,
-            seed=seed,
-            reshuffle_each_iteration=reshuffle_each_iteration)
-        with self.test_session(graph=g) as sess:
-          sess.run(init_op)
-          for _ in range(num_outputs):
-            sess.run(get_next_op)
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
-          self._save(sess, saver)
-
-        with ops.Graph().as_default() as g:
-          saver = self._import_meta_graph()
-          init_op, get_next_op = ops.get_collection("iterator_ops")
-          with self.test_session(graph=g) as sess:
-            self._restore(saver, sess)
-            with self.assertRaises(errors.OutOfRangeError):
-              sess.run(get_next_op)
-
-  def testResetRestoredIterator(self):
     seed = 55
     range_limit = 10
     num_repeats = 5
     num_outputs = range_limit * num_repeats
     buffer_sizes = [1, 3, 8, 10, 25, 50]
     reshuffle_each_iteration = False
+    # pylint: disable=cell-var-from-loop
+    # pylint: disable=g-long-lambda
     for buffer_size in buffer_sizes:
-      with ops.Graph().as_default() as g:
-        g.seed = 10
-        init_op, get_next_op, saver = self._build_graph(
-            range_limit=range_limit,
-            num_repeats=num_repeats,
-            buffer_size=buffer_size,
-            seed=seed,
-            reshuffle_each_iteration=reshuffle_each_iteration)
-        with self.test_session(graph=g) as sess:
-          sess.run(init_op)
-          for _ in range(num_outputs // 2):
-            sess.run(get_next_op)
-          self._save(sess, saver)
-
-        outputs = []
-        with ops.Graph().as_default() as g:
-          saver = self._import_meta_graph()
-          init_op, get_next_op = ops.get_collection("iterator_ops")
-          with self.test_session(graph=g) as sess:
-            self._restore(saver, sess)
-            sess.run(init_op)
-            for _ in range(num_outputs):
-              outputs.append(sess.run(get_next_op))
-            with self.assertRaises(errors.OutOfRangeError):
-              sess.run(get_next_op)
-        expected_outputs_sorted = sorted(
-            np.array([range(range_limit)
-                      for _ in range(num_repeats)]).flatten())
-        self.assertEqual(expected_outputs_sorted, sorted(outputs))
-
-  def testRestoreInModifiedGraph(self):
-    seed = 55
-    break_point = 25
-    range_limit = 10
-    num_repeats = 5
-    num_outputs = range_limit * num_repeats
-    buffer_sizes = [3, 8, 10, 25, 50]
-    reshuffle_each_iteration = False
-    for buffer_size in buffer_sizes:
-      expected = []
-      actual_without_restore = []
-      actual = []
-      with ops.Graph().as_default() as g:
-        g.seed = 10
-        init_op, get_next_op, saver = self._build_graph(
-            range_limit=range_limit,
-            num_repeats=num_repeats,
-            buffer_size=buffer_size,
-            seed=seed,
-            reshuffle_each_iteration=reshuffle_each_iteration)
-        with self.test_session(graph=g) as sess:
-          sess.run(init_op)
-          for _ in range(break_point):
-            expected.append(sess.run(get_next_op))
-          actual.extend(expected)
-          self._save(sess, saver)
-          for _ in range(num_outputs - break_point):
-            expected.append(sess.run(get_next_op))
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
-
-      with ops.Graph().as_default() as g:
-        g.seed = 20  # Different seed than previous graph for shuffle rngs.
-        init_op, get_next_op, saver = self._build_graph(
-            range_limit=range_limit,
-            num_repeats=num_repeats,
-            buffer_size=buffer_size,
-            seed=seed,
-            reshuffle_each_iteration=reshuffle_each_iteration)
-        with self.test_session(graph=g) as sess:
-          sess.run(init_op)
-          for _ in range(num_outputs):
-            actual_without_restore.append(sess.run(get_next_op))
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
-
-      with ops.Graph().as_default() as g:
-        g.seed = 20  # Different seed than previous graph for shuffle rngs.
-        init_op, get_next_op, saver = self._build_graph(
-            range_limit=range_limit,
-            num_repeats=num_repeats,
-            buffer_size=buffer_size,
-            seed=seed,
-            reshuffle_each_iteration=reshuffle_each_iteration)
-        with self.test_session(graph=g) as sess:
-          self._restore(saver, sess)
-          for _ in range(num_outputs - break_point):
-            actual.append(sess.run(get_next_op))
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
-
-      # Since the modified graph has a different random seed it produces a
-      # different order of examples.
-      self.assertNotEqual(expected, actual_without_restore)
-      self.assertEqual(sorted(expected), sorted(actual_without_restore))
-      self.assertEqual(expected, actual)
-
-  def testDoNotBuildSaveable(self):
-    seed = 55
-    break_point = 25
-    range_limit = 10
-    num_repeats = 5
-    num_outputs = range_limit * num_repeats
-    buffer_sizes = [3, 8, 10, 25, 50]
-    reshuffle_each_iteration = False
-    for buffer_size in buffer_sizes:
-      actual = []
-      with ops.Graph().as_default() as g:
-        g.seed = 10
-        init_op, get_next_op, saver = self._build_graph(
-            range_limit=range_limit,
-            num_repeats=num_repeats,
-            buffer_size=buffer_size,
-            seed=seed,
-            reshuffle_each_iteration=reshuffle_each_iteration)
-        with self.test_session(graph=g) as sess:
-          sess.run(init_op)
-          for _ in range(break_point):
-            sess.run(get_next_op)
-          self._save(sess, saver)
-
-      with ops.Graph().as_default() as g:
-        g.seed = 20  # Different seed than previous graph for shuffle rngs.
-        init_op, get_next_op, saver = self._build_graph(
-            range_limit=range_limit,
-            num_repeats=num_repeats,
-            buffer_size=buffer_size,
-            seed=seed,
-            reshuffle_each_iteration=reshuffle_each_iteration,
-            build_saveable=False)
-        with self.test_session(graph=g) as sess:
-          # Since the SaveableObject was not added to Saver's list
-          # of saveables, iterator state is not restored by saver.restore().
-          self._restore(saver, sess)
-          with self.assertRaises(errors.FailedPreconditionError):
-            sess.run(get_next_op)
-          sess.run(init_op)
-          for _ in range(num_outputs):
-            actual.append(sess.run(get_next_op))
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
-      expected_outputs_sorted = sorted(
-          np.array([range(range_limit) for _ in range(num_repeats)]).flatten())
-      self.assertEqual(expected_outputs_sorted, sorted(actual))
+      self.run_core_tests(
+          lambda: self._build_shuffle_dataset(
+              range_limit=range_limit,
+              num_repeats=num_repeats,
+              buffer_size=buffer_size,
+              seed=seed,
+              reshuffle_each_iteration=reshuffle_each_iteration),
+          lambda: self._build_shuffle_dataset(
+              range_limit=range_limit,
+              num_repeats=num_repeats,
+              buffer_size=buffer_size,
+              seed=10,
+              reshuffle_each_iteration=reshuffle_each_iteration),
+          num_outputs)
+    # pylint: enable=cell-var-from-loop
+    # pylint: enable=g-long-lambda
+
+
+class ShuffleAndRepeatTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_ds(self, seed, count=5, num_elements=20):
+    return dataset_ops.Dataset.range(num_elements).apply(
+        shuffle_ops.shuffle_and_repeat(buffer_size=5, count=count, seed=seed))
+
+  def testCorrectOutput(self):
+    output = self.gen_outputs(lambda: self._build_ds(10), [], 100)
+    self.assertSequenceEqual(
+        sorted(output), sorted(
+            np.array([range(20) for _ in range(5)]).flatten()))
+    for i in range(5):
+      self.assertSequenceEqual(sorted(output[i * 20:(i + 1) * 20]), range(20))
+
+  def testReshuffling(self):
+    # Check that the output orders of different epochs are indeed different.
+    output = self.gen_outputs(lambda: self._build_ds(10), [], 100)
+    for i in range(4):
+      epoch1 = output[i * 20:(i + 1) * 20]
+      epoch2 = output[(i + 1) * 20:(i + 2) * 20]
+      self.assertNotEqual(epoch1, epoch2)
+
+  def testSameOrderForSameSeeds(self):
+    output1 = self.gen_outputs(lambda: self._build_ds(10), [], 100)
+    output2 = self.gen_outputs(lambda: self._build_ds(10), [], 100)
+    self.assertEqual(output1, output2)
+
+  def testDifferentOrderForDifferentSeeds(self):
+    output1 = self.gen_outputs(lambda: self._build_ds(10), [], 100)
+    output2 = self.gen_outputs(lambda: self._build_ds(20), [], 100)
+    self.assertNotEqual(output1, output2)
+    self.assertEqual(sorted(output1), sorted(output2))
+
+  def testCountNone(self):
+    output1 = self.gen_outputs(
+        lambda: self._build_ds(10, count=None), [], 100, verify_exhausted=False)
+    output2 = self.gen_outputs(
+        lambda: self._build_ds(20, count=None), [], 100, verify_exhausted=False)
+    self.assertNotEqual(output1, output2)
+    self.assertEqual(sorted(output1), sorted(output2))
+
+  def testCountMinusOne(self):
+    output1 = self.gen_outputs(
+        lambda: self._build_ds(10, count=-1), [], 100, verify_exhausted=False)
+    output2 = self.gen_outputs(
+        lambda: self._build_ds(20, count=-1), [], 100, verify_exhausted=False)
+    self.assertNotEqual(output1, output2)
+    self.assertEqual(sorted(output1), sorted(output2))
+
+  def testInfiniteOutputs(self):
+    # Asserting the iterator is exhausted after producing 100 items should fail.
+    with self.assertRaises(AssertionError):
+      self.gen_outputs(lambda: self._build_ds(10, count=None), [], 100)
+    with self.assertRaises(AssertionError):
+      self.gen_outputs(lambda: self._build_ds(10, count=-1), [], 100)
+
+  def testInfiniteEmpty(self):
+    with self.assertRaises(errors.OutOfRangeError):
+      self.gen_outputs(lambda: self._build_ds(10, count=None, num_elements=0),
+                       [], 100)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.gen_outputs(lambda: self._build_ds(10, count=-1, num_elements=0), [],
+                       100)
+
+  def testLargeBufferSize(self):
+    with ops.Graph().as_default() as g:
+      ds = dataset_ops.Dataset.range(20).apply(
+          shuffle_ops.shuffle_and_repeat(buffer_size=21))
+      get_next_op = ds.make_one_shot_iterator().get_next()
+      with self.test_session(graph=g) as sess:
+        sess.run(get_next_op)
+
+
+class ShuffleAndRepeatSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_ds(self, seed):
+    return dataset_ops.Dataset.range(20).apply(
+        shuffle_ops.shuffle_and_repeat(buffer_size=5, count=5, seed=seed))
+
+  def testCore(self):
+    self.run_core_tests(lambda: self._build_ds(10), lambda: self._build_ds(20),
+                        100)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/data/python/kernel_tests/sql_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/sql_dataset_op_test.py
index efd864f866611bfd3bac1edcf98d84be852410fd..e26cef8ec522c7e69a0c19b2b30a969bbfc0ad78 100644
--- a/tensorflow/contrib/data/python/kernel_tests/sql_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/sql_dataset_op_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+
 import sqlite3
 
 from tensorflow.contrib.data.python.ops import readers
diff --git a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
index 8f24d6b2f612cff662aa8a36085bc69a9ea1a290..07bdf920446e953c2a1abaf495d2e9e1256106fd 100644
--- a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import stats_ops
 from tensorflow.core.framework import summary_pb2
 from tensorflow.python.data.ops import dataset_ops
@@ -209,5 +210,48 @@ class StatsDatasetTest(test.TestCase):
         sess.run(stats_aggregator_1.subscribe(iterator))
 
 
+class StatsDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_dataset_bytes_stats(self, num_elements):
+    return dataset_ops.Dataset.range(num_elements).map(
+        lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).apply(
+            stats_ops.bytes_produced_stats("bytes_produced"))
+
+  def testBytesStatsDatasetSaveableCore(self):
+    num_outputs = 100
+    self.run_core_tests(
+        lambda: self._build_dataset_bytes_stats(num_outputs),
+        lambda: self._build_dataset_bytes_stats(num_outputs // 10), num_outputs)
+
+  def _build_dataset_latency_stats(self, num_elements, tag="record_latency"):
+    return dataset_ops.Dataset.range(num_elements).apply(
+        stats_ops.latency_stats(tag))
+
+  def _build_dataset_multiple_tags(self,
+                                   num_elements,
+                                   tag1="record_latency",
+                                   tag2="record_latency_2"):
+    return dataset_ops.Dataset.range(num_elements).apply(
+        stats_ops.latency_stats(tag1)).apply(stats_ops.latency_stats(tag2))
+
+  def testLatencyStatsDatasetSaveableCore(self):
+    num_outputs = 100
+
+    self.run_core_tests(
+        lambda: self._build_dataset_latency_stats(num_outputs),
+        lambda: self._build_dataset_latency_stats(num_outputs // 10),
+        num_outputs)
+
+    self.run_core_tests(lambda: self._build_dataset_multiple_tags(num_outputs),
+                        None, num_outputs)
+
+    tag1 = "record_latency"
+    tag2 = "record_latency"
+    self.run_core_tests(
+        lambda: self._build_dataset_multiple_tags(num_outputs, tag1, tag2),
+        None, num_outputs)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/unique_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/unique_dataset_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c436f7a0b45a13109960e87dd97ca56b10bb871
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/unique_dataset_op_test.py
@@ -0,0 +1,96 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
+from tensorflow.contrib.data.python.ops import unique
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+class UniqueDatasetTest(test.TestCase):
+
+  def _testSimpleHelper(self, dtype, test_cases):
+    """Test the `unique()` transformation on a list of test cases.
+
+    Args:
+      dtype: The `dtype` of the elements in each test case.
+      test_cases: A list of pairs of lists. The first component is the test
+        input that will be passed to the transformation; the second component
+        is the expected sequence of outputs from the transformation.
+    """
+
+    # The `current_test_case` will be updated when we loop over `test_cases`
+    # below; declare it here so that the generator can capture it once.
+    current_test_case = []
+    dataset = dataset_ops.Dataset.from_generator(lambda: current_test_case,
+                                                 dtype).apply(unique.unique())
+    iterator = dataset.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      for test_case, expected in test_cases:
+        current_test_case = test_case
+        sess.run(iterator.initializer)
+        for element in expected:
+          if dtype == dtypes.string:
+            element = compat.as_bytes(element)
+          self.assertAllEqual(element, sess.run(next_element))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(next_element)
+
+  def testSimpleInt(self):
+    for dtype in [dtypes.int32, dtypes.int64]:
+      self._testSimpleHelper(dtype, [
+          ([], []),
+          ([1], [1]),
+          ([1, 1, 1, 1, 1, 1, 1], [1]),
+          ([1, 2, 3, 4], [1, 2, 3, 4]),
+          ([1, 2, 4, 3, 2, 1, 2, 3, 4], [1, 2, 4, 3]),
+          ([[1], [1, 1], [1, 1, 1]], [[1], [1, 1], [1, 1, 1]]),
+          ([[1, 1], [1, 1], [2, 2], [3, 3], [1, 1]], [[1, 1], [2, 2], [3, 3]]),
+      ])
+
+  def testSimpleString(self):
+    self._testSimpleHelper(dtypes.string, [
+        ([], []),
+        (["hello"], ["hello"]),
+        (["hello", "hello", "hello"], ["hello"]),
+        (["hello", "world"], ["hello", "world"]),
+        (["foo", "bar", "baz", "baz", "bar", "foo"], ["foo", "bar", "baz"]),
+    ])
+
+
+class UniqueSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def testUnique(self):
+
+    def build_dataset(num_elements, unique_elem_range):
+      return dataset_ops.Dataset.range(num_elements).map(
+          lambda x: x % unique_elem_range).apply(unique.unique())
+
+    self.run_core_tests(lambda: build_dataset(200, 100),
+                        lambda: build_dataset(40, 100), 100)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/zip_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/zip_dataset_op_test.py
index 5d34b0024c472d0393544ff3dad8acea7964345f..e39fa957f0bbb9d3671274d5f58b993e8399814b 100644
--- a/tensorflow/contrib/data/python/kernel_tests/zip_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/zip_dataset_op_test.py
@@ -20,97 +20,10 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
-from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.ops import array_ops
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.platform import test
 
 
-class ZipDatasetTest(test.TestCase):
-
-  def testZipDataset(self):
-    component_placeholders = [
-        array_ops.placeholder(dtypes.int64),
-        array_ops.placeholder(dtypes.int64),
-        array_ops.placeholder(dtypes.float64)
-    ]
-
-    datasets = tuple([
-        dataset_ops.Dataset.from_tensor_slices(component_placeholder)
-        for component_placeholder in component_placeholders
-    ])
-    zipped = dataset_ops.Dataset.zip(datasets)
-
-    iterator = zipped.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      equal_length_components = [
-          np.tile(np.array([[1], [2], [3], [4]]), 20),
-          np.tile(np.array([[12], [13], [14], [15]]), 22),
-          np.array([37.0, 38.0, 39.0, 40.0])
-      ]
-      sess.run(init_op, feed_dict={ph: value for ph, value in zip(
-          component_placeholders, equal_length_components)})
-      for i in range(4):
-        results = sess.run(get_next)
-        for component, result_component in zip(
-            equal_length_components, results):
-          self.assertAllEqual(component[i], result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      variable_length_components = [[1, 2, 3, 4], [1, 2, 3, 4, 5], [1.0, 2.0]]
-      sess.run(init_op, feed_dict={ph: value for ph, value in zip(
-          component_placeholders, variable_length_components)})
-      for i in range(2):
-        results = sess.run(get_next)
-        for component, result_component in zip(
-            variable_length_components, results):
-          self.assertAllEqual(component[i], result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testNestedZipDataset(self):
-    component_placeholders = [
-        array_ops.placeholder(dtypes.int64, shape=[4, 20]),
-        array_ops.placeholder(dtypes.int64, shape=[4, 22]),
-        array_ops.placeholder(dtypes.float64, shape=[4])
-    ]
-
-    datasets = [
-        dataset_ops.Dataset.from_tensor_slices(component_placeholder)
-        for component_placeholder in component_placeholders
-    ]
-    zipped = dataset_ops.Dataset.zip((datasets[0], (datasets[1], datasets[2])))
-
-    iterator = zipped.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([20], get_next[0].shape)
-    self.assertEqual([22], get_next[1][0].shape)
-    self.assertEqual([], get_next[1][1].shape)
-
-    with self.test_session() as sess:
-      equal_length_components = [
-          np.tile(np.array([[1], [2], [3], [4]]), 20),
-          np.tile(np.array([[12], [13], [14], [15]]), 22),
-          np.array([37.0, 38.0, 39.0, 40.0])
-      ]
-      sess.run(init_op, feed_dict={ph: value for ph, value in zip(
-          component_placeholders, equal_length_components)})
-      for i in range(4):
-        result1, (result2, result3) = sess.run(get_next)
-        self.assertAllEqual(equal_length_components[0][i], result1)
-        self.assertAllEqual(equal_length_components[1][i], result2)
-        self.assertAllEqual(equal_length_components[2][i], result3)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-
 class ZipDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index 25ed58cdf5833cd041582046bc1a358625e321e0..b488357f226d0922bba3799cc1f4b5c75e2e8328 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -15,7 +15,7 @@ py_library(
     name = "dataset_ops",
     srcs = [
         "counter.py",
-        "dataset_ops.py",
+        "get_single_element.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
@@ -40,6 +40,25 @@ py_library(
     ],
 )
 
+py_library(
+    name = "random_ops",
+    srcs = [
+        "random_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
 py_library(
     name = "readers",
     srcs = [
@@ -62,6 +81,19 @@ py_library(
     ],
 )
 
+py_library(
+    name = "shuffle_ops",
+    srcs = [
+        "shuffle_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":random_ops",
+        ":transformation_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
 py_library(
     name = "transformation_ops",
     srcs = [
@@ -73,9 +105,12 @@ py_library(
         "resampling.py",
         "scan_ops.py",
         "stats_ops.py",
+        "unique.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":contrib_op_loader",
+        ":gen_dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dataset_ops_gen",
@@ -89,6 +124,7 @@ py_library(
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:convert",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:sparse",
         "//third_party/py/numpy",
@@ -96,36 +132,44 @@ py_library(
 )
 
 tf_gen_op_wrapper_py(
-    name = "prefetching_ops",
-    out = "gen_prefetching_ops.py",
-    deps = ["//tensorflow/contrib/data:prefetching_ops_op_lib"],
+    name = "gen_dataset_ops",
+    out = "gen_dataset_ops.py",
+    deps = ["//tensorflow/contrib/data:dataset_ops_op_lib"],
 )
 
 tf_kernel_library(
-    name = "prefetching_ops_kernels",
+    name = "dataset_ops_kernels",
     deps = [
-        "//tensorflow/contrib/data/kernels:prefetching_kernels",
+        "//tensorflow/contrib/data/kernels:dataset_kernels",
         "//tensorflow/core:framework",
     ],
     alwayslink = 1,
 )
 
 tf_custom_op_py_library(
-    name = "prefetching_py",
-    srcs = ["prefetching_ops.py"],
-    dso = ["//tensorflow/contrib/data:_prefetching_ops.so"],
+    name = "contrib_op_loader",
+    srcs = ["contrib_op_loader.py"],
+    dso = ["//tensorflow/contrib/data:_dataset_ops.so"],
     kernels = [
-        ":prefetching_ops_kernels",
-        "//tensorflow/contrib/data:prefetching_ops_op_lib",
+        ":dataset_ops_kernels",
+        "//tensorflow/contrib/data:dataset_ops_op_lib",
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":prefetching_ops",
+        ":gen_dataset_ops",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:platform",
     ],
 )
 
+py_library(
+    name = "prefetching_ops",
+    srcs = ["prefetching_ops.py"],
+    deps = [
+        ":contrib_op_loader",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py
index 63782d229e1535892686f202ca1f0833dee6ed80..6eb512dec67cb7b9c8c4518d03aee0b436205f9a 100644
--- a/tensorflow/contrib/data/python/ops/batching.py
+++ b/tensorflow/contrib/data/python/ops/batching.py
@@ -22,6 +22,7 @@ from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -231,32 +232,29 @@ class DenseToSparseBatchDataset(dataset_ops.Dataset):
                       input_dataset.output_types)
     self._input_dataset = input_dataset
     self._batch_size = batch_size
-    # pylint: disable=protected-access
-    self._row_shape = dataset_ops._partial_shape_to_tensor(row_shape)
-    # pylint: enable=protected-access
+    self._row_shape = row_shape
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.dense_to_sparse_batch_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         self._batch_size,
-        self._row_shape,
-        output_shapes=self.output_shapes,
-        output_types=self.output_types)
+        row_shape=dataset_ops._partial_shape_to_tensor(self._row_shape),  # pylint: disable=protected-access
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
 
   @property
   def output_classes(self):
-    return (ops.Tensor, ops.Tensor, ops.Tensor)
+    return sparse_tensor.SparseTensor
 
   @property
   def output_shapes(self):
-    num_elements = tensor_shape.Dimension(None)
-    return (tensor_shape.matrix(num_elements, self._row_shape.shape[0] + 1),
-            tensor_shape.vector(num_elements),
-            tensor_shape.vector(self._row_shape.shape[0] + 1))
+    return tensor_shape.vector(None).concatenate(self._row_shape)
 
   @property
   def output_types(self):
-    return (dtypes.int64, self._input_dataset.output_types, dtypes.int64)
+    return self._input_dataset.output_types
 
 
 class _RestructuredDataset(dataset_ops.Dataset):
@@ -390,17 +388,12 @@ def map_and_batch(map_func, batch_size, num_parallel_batches=1):
   """Fused implementation of `map` and `batch`.
 
   Maps `map_func` across `batch_size` consecutive elements of this dataset
-  and then combines them into a batch. Similarly to `batch_and_drop_remainder`,
-  if the batch size does not evenly divide the input dataset size, this
-  transformation will drop the final smaller element.
-
-
-  Functionally, it is equivalent to `map` followed by
-  `batch_and_drop_remainder`. However, by fusing the two transformations
-  together, the implementation can be more efficient. This transformation is a
-  stop gap solution for performance critical workloads. Once automatic input
-  pipeline optimization are implemented, the fusing of map and batch will not
-  need to be exposed at the API level and this method will be removed.
+  and then combines them into a batch. Functionally, it is equivalent to `map`
+  followed by `batch`. However, by fusing the two transformations together, the
+  implementation can be more efficient. Surfacing this transformation in the API
+  is temporary. Once automatic input pipeline optimization is implemented,
+  the fusing of `map` and `batch` will happen automatically and this API will be
+  deprecated.
 
   Args:
     map_func: A function mapping a nested structure of tensors to another
@@ -410,11 +403,11 @@ def map_and_batch(map_func, batch_size, num_parallel_batches=1):
     num_parallel_batches: A `tf.int64` scalar `tf.Tensor`, representing the
       number of batches to create in parallel. On one hand, higher values can
       help mitigate the effect of stragglers. On the other hand, higher values
-      can increasing contention if CPU is scarce.
+      can increase contention if CPU is scarce.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.contrib.data.Dataset.apply}.
+    @{tf.data.Dataset.apply}.
   """
 
   def _apply_fn(dataset):
diff --git a/tensorflow/contrib/data/python/ops/contrib_op_loader.py b/tensorflow/contrib/data/python/ops/contrib_op_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f495a9dc9c82311435e71d2ac9ed35fd9aea794
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/contrib_op_loader.py
@@ -0,0 +1,24 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python helper for loading contrib ops and kernels."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.util import loader
+from tensorflow.python.platform import resource_loader
+
+_dataset_ops = loader.load_op_library(
+    resource_loader.get_path_to_datafile("../../_dataset_ops.so"))
diff --git a/tensorflow/contrib/data/python/ops/dataset_ops.py b/tensorflow/contrib/data/python/ops/dataset_ops.py
index 626a9e0edcea5928b1636c1a2a86e83657c966a5..ff15c4451ad987bcd77dbdd022a1c070056c47e1 100644
--- a/tensorflow/contrib/data/python/ops/dataset_ops.py
+++ b/tensorflow/contrib/data/python/ops/dataset_ops.py
@@ -364,7 +364,7 @@ class Dataset(dataset_ops.Dataset):
     When reading a single input file, you can skip elements as follows:
 
     ```python
-    d = tf.contrib.data.TFRecordDataset(FLAGS.input_file)
+    d = tf.data.TFRecordDataset(FLAGS.input_file)
     d = d.shard(FLAGS.num_workers, FLAGS.worker_index)
     d = d.repeat(FLAGS.num_epochs)
     d = d.shuffle(FLAGS.shuffle_buffer_size)
@@ -382,12 +382,11 @@ class Dataset(dataset_ops.Dataset):
       sharding strategy within a complete pipeline:
 
     ```python
-    d = Dataset.list_files(FLAGS.pattern)
+    d = tf.data.Dataset.list_files(FLAGS.pattern)
     d = d.shard(FLAGS.num_workers, FLAGS.worker_index)
     d = d.repeat(FLAGS.num_epochs)
     d = d.shuffle(FLAGS.shuffle_buffer_size)
-    d = d.repeat()
-    d = d.interleave(tf.contrib.data.TFRecordDataset,
+    d = d.interleave(tf.data.TFRecordDataset,
                      cycle_length=FLAGS.num_readers, block_length=1)
     d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
     ```
@@ -484,7 +483,7 @@ class Dataset(dataset_ops.Dataset):
           num_threads=None,
           output_buffer_size=None,
           num_parallel_calls=None):
-    """Maps `map_func` across this datset.
+    """Maps `map_func` across this dataset.
 
     Args:
       map_func: A function mapping a nested structure of tensors (having
@@ -549,7 +548,7 @@ class Dataset(dataset_ops.Dataset):
     elements are produced. `cycle_length` controls the number of input elements
     that are processed concurrently. If you set `cycle_length` to 1, this
     transformation will handle one input element at a time, and will produce
-    identical results = to @{tf.contrib.data.Dataset.flat_map}. In general,
+    identical results = to @{tf.data.Dataset.flat_map}. In general,
     this transformation will apply `map_func` to `cycle_length` input elements,
     open iterators on the returned `Dataset` objects, and cycle through them
     producing `block_length` consecutive elements from each iterator, and
diff --git a/tensorflow/contrib/data/python/ops/error_ops.py b/tensorflow/contrib/data/python/ops/error_ops.py
index aa629cba479102ee4244884e7c546615b28cf4e5..6c21e489f7c35484ebacd465e3b46d6920df5933 100644
--- a/tensorflow/contrib/data/python/ops/error_ops.py
+++ b/tensorflow/contrib/data/python/ops/error_ops.py
@@ -17,10 +17,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
-from tensorflow.python.ops import gen_dataset_ops
 
 
 def ignore_errors():
diff --git a/tensorflow/contrib/data/python/ops/get_single_element.py b/tensorflow/contrib/data/python/ops/get_single_element.py
new file mode 100644
index 0000000000000000000000000000000000000000..a817b45b71b608810a9d7536ec123ab84f7cdc3b
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/get_single_element.py
@@ -0,0 +1,67 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python wrappers for Datasets and Iterators."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.ops import gen_dataset_ops
+
+
+def get_single_element(dataset):
+  """Returns the single element in `dataset` as a nested structure of tensors.
+
+  This function enables you to use a @{tf.data.Dataset} in a stateless
+  "tensor-in tensor-out" expression, without creating a @{tf.data.Iterator}.
+  This can be useful when your preprocessing transformations are expressed
+  as a `Dataset`, and you want to use the transformation at serving time.
+  For example:
+
+  ```python
+  input_batch = tf.placeholder(tf.string, shape=[BATCH_SIZE])
+
+  def preprocessing_fn(input_str):
+    # ...
+    return image, label
+
+  dataset = (tf.data.Dataset.from_tensor_slices(input_batch)
+             .map(preprocessing_fn, num_parallel_calls=BATCH_SIZE)
+             .batch(BATCH_SIZE))
+
+  image_batch, label_batch = tf.contrib.data.get_single_element(dataset)
+  ```
+
+  Args:
+    dataset: A @{tf.data.Dataset} object containing a single element.
+
+  Returns:
+    A nested structure of @{tf.Tensor} objects, corresponding to the single
+    element of `dataset`.
+
+  Raises:
+    TypeError: if `dataset` is not a `tf.data.Dataset` object.
+    InvalidArgumentError (at runtime): if `dataset` does not contain exactly
+      one element.
+  """
+  if not isinstance(dataset, dataset_ops.Dataset):
+    raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
+  return nest.pack_sequence_as(
+      dataset.output_types,
+      gen_dataset_ops.dataset_to_single_element(
+          dataset._as_variant_tensor(),  # pylint: disable=protected-access
+          output_types=nest.flatten(dataset.output_types),
+          output_shapes=nest.flatten(dataset.output_shapes)))
diff --git a/tensorflow/contrib/data/python/ops/grouping.py b/tensorflow/contrib/data/python/ops/grouping.py
index ef91c56726e969053fdad667dda3e89430045652..67b085002aa7797d858837fea4646fb968ad5d97 100644
--- a/tensorflow/contrib/data/python/ops/grouping.py
+++ b/tensorflow/contrib/data/python/ops/grouping.py
@@ -45,7 +45,7 @@ def group_by_window(key_func,
     key_func: A function mapping a nested structure of tensors
       (having shapes and types defined by `self.output_shapes` and
       `self.output_types`) to a scalar `tf.int64` tensor.
-    reduce_func: A function mapping a key and a dataset of up to `batch_size`
+    reduce_func: A function mapping a key and a dataset of up to `window_size`
       consecutive elements matching that key to another dataset.
     window_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
       consecutive elements matching the same key to combine in a single
diff --git a/tensorflow/contrib/data/python/ops/interleave_ops.py b/tensorflow/contrib/data/python/ops/interleave_ops.py
index 53324e06e7f1dc249388410f0e14e42336630cd1..3124ca1d1540e12d949dded88ce1c66181be3595 100644
--- a/tensorflow/contrib/data/python/ops/interleave_ops.py
+++ b/tensorflow/contrib/data/python/ops/interleave_ops.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import convert
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import dtypes
@@ -31,7 +32,7 @@ class ParallelInterleaveDataset(dataset_ops.Dataset):
   """A `Dataset` that maps a function over its input and flattens the result."""
 
   def __init__(self, input_dataset, map_func, cycle_length, block_length,
-               sloppy):
+               sloppy, buffer_output_elements, prefetch_input_elements):
     """See `tf.contrib.data.parallel_interleave()` for details."""
     super(ParallelInterleaveDataset, self).__init__()
     self._input_dataset = input_dataset
@@ -74,6 +75,14 @@ class ParallelInterleaveDataset(dataset_ops.Dataset):
         block_length, dtype=dtypes.int64, name="block_length")
     self._sloppy = ops.convert_to_tensor(
         sloppy, dtype=dtypes.bool, name="sloppy")
+    self._buffer_output_elements = convert.optional_param_to_tensor(
+        "buffer_output_elements",
+        buffer_output_elements,
+        argument_default=2 * block_length)
+    self._prefetch_input_elements = convert.optional_param_to_tensor(
+        "prefetch_input_elements",
+        prefetch_input_elements,
+        argument_default=2 * cycle_length)
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.parallel_interleave_dataset(
@@ -82,6 +91,8 @@ class ParallelInterleaveDataset(dataset_ops.Dataset):
         self._cycle_length,
         self._block_length,
         self._sloppy,
+        self._buffer_output_elements,
+        self._prefetch_input_elements,
         f=self._map_func,
         output_types=nest.flatten(
             sparse.as_dense_types(self.output_types, self.output_classes)),
@@ -101,7 +112,12 @@ class ParallelInterleaveDataset(dataset_ops.Dataset):
     return self._output_types
 
 
-def parallel_interleave(map_func, cycle_length, block_length=1, sloppy=False):
+def parallel_interleave(map_func,
+                        cycle_length,
+                        block_length=1,
+                        sloppy=False,
+                        buffer_output_elements=None,
+                        prefetch_input_elements=None):
   """A parallel version of the `Dataset.interleave()` transformation.
 
   `parallel_interleave()` maps `map_func` across its input to produce nested
@@ -129,12 +145,17 @@ def parallel_interleave(map_func, cycle_length, block_length=1, sloppy=False):
 
   Args:
     map_func: A function mapping a nested structure of tensors to a `Dataset`.
-    cycle_length: The number of threads to interleave from in parallel.
-    block_length: The number of consecutive elements to pull from a thread
-      before advancing to the next thread.
+    cycle_length: The number of input `Dataset`s to interleave from in parallel.
+    block_length: The number of consecutive elements to pull from an input
+      `Dataset` before advancing to the next input `Dataset`.
     sloppy: If false, elements are produced in deterministic order. Otherwise,
       the implementation is allowed, for the sake of expediency, to produce
       elements in a non-deterministic order.
+    buffer_output_elements: The number of elements each iterator being
+      interleaved should buffer (similar to the `.prefetch()` transformation for
+      each interleaved iterator).
+    prefetch_input_elements: The number of input elements to transform to
+      iterators before they are needed for interleaving.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
@@ -142,7 +163,9 @@ def parallel_interleave(map_func, cycle_length, block_length=1, sloppy=False):
   """
   def _apply_fn(dataset):
     return ParallelInterleaveDataset(
-        dataset, map_func, cycle_length, block_length, sloppy)
+        dataset, map_func, cycle_length, block_length, sloppy,
+        buffer_output_elements, prefetch_input_elements)
+
   return _apply_fn
 
 
@@ -187,11 +210,11 @@ def sloppy_interleave(map_func, cycle_length, block_length=1):
     map_func: A function mapping a nested structure of tensors (having shapes
       and types defined by `self.output_shapes` and `self.output_types`) to a
       `Dataset`.
-    cycle_length: The number of threads to interleave from in parallel.
-    block_length: The number of consecutive elements to pull from a thread
-      before advancing to the next thread. Note: sloppy_interleave will
-      skip the remainder of elements in the block_length in order to avoid
-      blocking.
+    cycle_length: The number of input `Dataset`s to interleave from in parallel.
+    block_length: The number of consecutive elements to pull from an input
+      `Dataset` before advancing to the next input `Dataset`. Note:
+      `sloppy_interleave` will skip the remainder of elements in the
+      `block_length` in order to avoid blocking.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
@@ -199,5 +222,12 @@ def sloppy_interleave(map_func, cycle_length, block_length=1):
   """
   def _apply_fn(dataset):
     return ParallelInterleaveDataset(
-        dataset, map_func, cycle_length, block_length, sloppy=True)
+        dataset,
+        map_func,
+        cycle_length,
+        block_length,
+        sloppy=True,
+        buffer_output_elements=None,
+        prefetch_input_elements=None)
+
   return _apply_fn
diff --git a/tensorflow/contrib/data/python/ops/prefetching_ops.py b/tensorflow/contrib/data/python/ops/prefetching_ops.py
index cfe8012b5657995b78d701528ea35cbb3748adb9..96a9e9ed6649444dac5e56d7dd2fcdb62fc56459 100644
--- a/tensorflow/contrib/data/python/ops/prefetching_ops.py
+++ b/tensorflow/contrib/data/python/ops/prefetching_ops.py
@@ -17,12 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.ops import gen_prefetching_ops
-from tensorflow.contrib.util import loader
-from tensorflow.python.platform import resource_loader
-
-_prefetching_ops = loader.load_op_library(
-    resource_loader.get_path_to_datafile("../../_prefetching_ops.so"))
+from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
 
 
 # TODO(rohanj): Add a python class that constructs resource in the __init__
@@ -35,7 +31,7 @@ def function_buffering_resource(string_arg,
                                 thread_pool_size=1,
                                 container="",
                                 name=None):
-  return gen_prefetching_ops.function_buffering_resource(
+  return gen_dataset_ops.function_buffering_resource(
       string_arg=string_arg,
       target_device=target_device,
       shared_name=shared_name,
@@ -49,7 +45,7 @@ def function_buffering_resource(string_arg,
 def function_buffering_resource_get_next(function_buffer_resource,
                                          output_types,
                                          name=None):
-  return gen_prefetching_ops.function_buffering_resource_get_next(
+  return gen_dataset_ops.function_buffering_resource_get_next(
       function_buffer_resource=function_buffer_resource,
       output_types=output_types,
       name=name)
diff --git a/tensorflow/contrib/data/python/ops/random_ops.py b/tensorflow/contrib/data/python/ops/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d727165feabb101549567f28a2dfa07083de244
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/random_ops.py
@@ -0,0 +1,67 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Datasets for random number generators."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import gen_dataset_ops
+
+
+class RandomDataset(dataset_ops.Dataset):
+  """A `Dataset` of pseudorandom values."""
+
+  def __init__(self, seed=None):
+    """A `Dataset` of pseudorandom values."""
+    super(RandomDataset, self).__init__()
+    seed, seed2 = random_seed.get_seed(seed)
+    if seed is None:
+      self._seed = constant_op.constant(0, dtype=dtypes.int64, name="seed")
+    else:
+      self._seed = ops.convert_to_tensor(seed, dtype=dtypes.int64, name="seed")
+    if seed2 is None:
+      self._seed2 = constant_op.constant(0, dtype=dtypes.int64, name="seed2")
+    else:
+      self._seed2 = ops.convert_to_tensor(
+          seed2, dtype=dtypes.int64, name="seed2")
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.random_dataset(
+        seed=self._seed,
+        seed2=self._seed2,
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return ops.Tensor
+
+  @property
+  def output_shapes(self):
+    return tensor_shape.scalar()
+
+  @property
+  def output_types(self):
+    return dtypes.int64
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index acb7a43211482f9cdeed66542abab5dbde78d60e..57f30102778f3bac47580f9bdf94e411dfe1b621 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -17,9 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.ops import dataset_ops as contrib_dataset_ops
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import readers
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -27,74 +25,6 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import gfile
-from tensorflow.python.util import deprecation
-
-
-class TextLineDataset(contrib_dataset_ops.Dataset):
-  """A `Dataset` comprising lines from one or more text files."""
-
-  @deprecation.deprecated(None, "Use `tf.data.TextLineDataset`.")
-  def __init__(self, filenames, compression_type=None, buffer_size=None):
-    """Creates a `TextLineDataset`.
-
-    Args:
-      filenames: A `tf.string` tensor containing one or more filenames.
-      compression_type: (Optional.) A `tf.string` scalar evaluating to one of
-        `""` (no compression), `"ZLIB"`, or `"GZIP"`.
-      buffer_size: (Optional.) A `tf.int64` scalar denoting the number of bytes
-        to buffer. A value of 0 results in the default buffering values chosen
-        based on the compression type.
-    """
-    dataset = readers.TextLineDataset(filenames, compression_type,
-                                      buffer_size)
-    super(TextLineDataset, self).__init__(dataset)
-
-
-class TFRecordDataset(contrib_dataset_ops.Dataset):
-  """A `Dataset` comprising records from one or more TFRecord files."""
-
-  @deprecation.deprecated(None, "Use `tf.data.TFRecordDataset`.")
-  def __init__(self, filenames, compression_type=None, buffer_size=None):
-    """Creates a `TFRecordDataset`.
-
-    Args:
-      filenames: A `tf.string` tensor containing one or more filenames.
-      compression_type: (Optional.) A `tf.string` scalar evaluating to one of
-        `""` (no compression), `"ZLIB"`, or `"GZIP"`.
-      buffer_size: (Optional.) A `tf.int64` scalar representing the number of
-        bytes in the read buffer. 0 means no buffering.
-    """
-    dataset = readers.TFRecordDataset(filenames, compression_type,
-                                      buffer_size)
-    super(TFRecordDataset, self).__init__(dataset)
-
-
-class FixedLengthRecordDataset(contrib_dataset_ops.Dataset):
-  """A `Dataset` of fixed-length records from one or more binary files."""
-
-  @deprecation.deprecated(None, "Use `tf.data.FixedLengthRecordDataset`.")
-  def __init__(self,
-               filenames,
-               record_bytes,
-               header_bytes=None,
-               footer_bytes=None,
-               buffer_size=None):
-    """Creates a `FixedLengthRecordDataset`.
-
-    Args:
-      filenames: A `tf.string` tensor containing one or more filenames.
-      record_bytes: A `tf.int64` scalar representing the number of bytes in
-        each record.
-      header_bytes: (Optional.) A `tf.int64` scalar representing the number of
-        bytes to skip at the start of a file.
-      footer_bytes: (Optional.) A `tf.int64` scalar representing the number of
-        bytes to ignore at the end of a file.
-      buffer_size: (Optional.) A `tf.int64` scalar representing the number of
-        bytes to buffer when reading.
-    """
-    dataset = readers.FixedLengthRecordDataset(
-        filenames, record_bytes, header_bytes, footer_bytes, buffer_size)
-    super(FixedLengthRecordDataset, self).__init__(dataset)
 
 
 def read_batch_features(file_pattern,
@@ -179,6 +109,7 @@ def read_batch_features(file_pattern,
     dataset = dataset.shuffle(capacity)
   dataset = dataset.batch(batch_size)
   dataset = dataset.map(lambda x: parsing_ops.parse_example(x, features))
+  dataset = dataset.prefetch(1)
   iterator = dataset.make_one_shot_iterator()
   outputs = iterator.get_next()
   return outputs
@@ -215,14 +146,7 @@ def _get_file_names(file_pattern, randomize_input):
   return file_names
 
 
-class SqlDataset(contrib_dataset_ops.Dataset):
-
-  def __init__(self, driver_name, data_source_name, query, output_types):
-    dataset = _SqlDataset(driver_name, data_source_name, query, output_types)
-    super(SqlDataset, self).__init__(dataset)
-
-
-class _SqlDataset(dataset_ops.Dataset):
+class SqlDataset(dataset_ops.Dataset):
   """A `Dataset` consisting of the results from a SQL query."""
 
   def __init__(self, driver_name, data_source_name, query, output_types):
@@ -254,7 +178,7 @@ class _SqlDataset(dataset_ops.Dataset):
       output_types: A tuple of `tf.DType` objects representing the types of the
         columns returned by `query`.
     """
-    super(_SqlDataset, self).__init__()
+    super(SqlDataset, self).__init__()
     self._driver_name = ops.convert_to_tensor(
         driver_name, dtype=dtypes.string, name="driver_name")
     self._data_source_name = ops.convert_to_tensor(
diff --git a/tensorflow/contrib/data/python/ops/scan_ops.py b/tensorflow/contrib/data/python/ops/scan_ops.py
index 2744786e9eec4c9268ba854df6ea761339bb0b4e..1c88366273f5d186509454188e02350d4ea9f66b 100644
--- a/tensorflow/contrib/data/python/ops/scan_ops.py
+++ b/tensorflow/contrib/data/python/ops/scan_ops.py
@@ -188,7 +188,7 @@ def scan(initial_state, scan_func):
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.contrib.data.Dataset.apply}.
+    @{tf.data.Dataset.apply}.
   """
   def _apply_fn(dataset):
     return _ScanDataset(dataset, initial_state, scan_func)
diff --git a/tensorflow/contrib/data/python/ops/shuffle_ops.py b/tensorflow/contrib/data/python/ops/shuffle_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..99bb79bc06a421f811869ca9169aaa11deaca2f3
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/shuffle_ops.py
@@ -0,0 +1,120 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental shuffle ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
+from tensorflow.python.ops import gen_dataset_ops
+
+
+class _ShuffleAndRepeatDataset(dataset_ops.Dataset):
+  """A `Dataset` that fuses `shuffle` and `repeat`."""
+
+  def __init__(self,
+               input_dataset,
+               buffer_size,
+               count=None,
+               seed=None):
+    """See `Dataset.map()` for details."""
+    super(_ShuffleAndRepeatDataset, self).__init__()
+    self._input_dataset = input_dataset
+    self._buffer_size = ops.convert_to_tensor(
+        buffer_size, dtype=dtypes.int64, name="buffer_size")
+    if count is None:
+      self._count = constant_op.constant(-1, dtype=dtypes.int64, name="count")
+    else:
+      self._count = ops.convert_to_tensor(
+          count, dtype=dtypes.int64, name="count")
+
+    seed, seed2 = random_seed.get_seed(seed)
+    if seed is None:
+      self._seed = constant_op.constant(0, dtype=dtypes.int64, name="seed")
+    else:
+      self._seed = ops.convert_to_tensor(seed, dtype=dtypes.int64, name="seed")
+    if seed2 is None:
+      self._seed2 = constant_op.constant(0, dtype=dtypes.int64, name="seed2")
+    else:
+      self._seed2 = ops.convert_to_tensor(
+          seed2, dtype=dtypes.int64, name="seed2")
+
+  def _as_variant_tensor(self):
+    # pylint: disable=protected-access
+    input_resource = self._input_dataset._as_variant_tensor()
+    return gen_dataset_ops.shuffle_and_repeat_dataset(
+        input_resource,
+        buffer_size=self._buffer_size,
+        count=self._count,
+        seed=self._seed,
+        seed2=self._seed2,
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+    # pylint: enable=protected-access
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+
+def shuffle_and_repeat(buffer_size, count=None, seed=None):
+  """Shuffles and repeats a Dataset returning a new permutation for each epoch.
+
+  `dataset.apply(tf.contrib.data.shuffle_and_repeat(buffer_size, count))`
+
+  is equivalent to
+
+  `dataset.shuffle(buffer_size, reshuffle_each_iteration=True).repeat(count)`
+
+  The difference is that the latter dataset is not serializable. So,
+  if you need to checkpoint an input pipeline with reshuffling you must use
+  this implementation.
+
+  Args:
+    buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the
+      maximum number elements that will be buffered when prefetching.
+    count: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
+      number of times the dataset should be repeated. The default behavior
+      (if `count` is `None` or `-1`) is for the dataset be repeated
+      indefinitely.
+    seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
+      random seed that will be used to create the distribution. See
+      @{tf.set_random_seed} for behavior.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}.
+  """
+
+  def _apply_fn(dataset):  # pylint: disable=missing-docstring
+    return _ShuffleAndRepeatDataset(dataset, buffer_size, count, seed)
+
+  return _apply_fn
diff --git a/tensorflow/contrib/data/python/ops/stats_ops.py b/tensorflow/contrib/data/python/ops/stats_ops.py
index b8875bd533ddc9e2c195646619dccf3aab5225e4..9cd1701c397b5a0bf5cc47c1bcab033704794d80 100644
--- a/tensorflow/contrib/data/python/ops/stats_ops.py
+++ b/tensorflow/contrib/data/python/ops/stats_ops.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_dataset_ops
@@ -117,7 +118,7 @@ def bytes_produced_stats(tag):
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.contrib.data.Dataset.apply}.
+    @{tf.data.Dataset.apply}.
   """
 
   def _apply_fn(dataset):
@@ -139,7 +140,7 @@ def latency_stats(tag):
 
   Returns:
     A `Dataset` transformation function, which can be passed to
-    @{tf.contrib.data.Dataset.apply}.
+    @{tf.data.Dataset.apply}.
   """
 
   def _apply_fn(dataset):
@@ -161,8 +162,10 @@ class _StatsDataset(dataset_ops.Dataset):
     return self._op_function(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         self._tag,
-        output_shapes=nest.flatten(self.output_shapes),
-        output_types=nest.flatten(self.output_types))
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
 
   @property
   def output_shapes(self):
diff --git a/tensorflow/contrib/data/python/ops/unique.py b/tensorflow/contrib/data/python/ops/unique.py
new file mode 100644
index 0000000000000000000000000000000000000000..133e17d20d0fc4c8d52cef3c95c132374e927a0b
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/unique.py
@@ -0,0 +1,82 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unique element dataset transformations."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import gen_dataset_ops
+
+
+def unique():
+  """Creates a `Dataset` from another `Dataset`, discarding duplicates.
+
+  Use this transformation to produce a dataset that contains one instance of
+  each unique element in the input. For example:
+
+  ```python
+  dataset = tf.data.Dataset.from_tensor_slices([1, 37, 2, 37, 2, 1])
+
+  # Using `unique()` will drop the duplicate elements.
+  dataset = dataset.apply(tf.contrib.data.unique())  # ==> { 1, 37, 2 }
+  ```
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}.
+  """
+
+  def _apply_fn(dataset):
+    return UniqueDataset(dataset)
+
+  return _apply_fn
+
+
+class UniqueDataset(dataset_ops.Dataset):
+  """A `Dataset` contains the unique elements from its input."""
+
+  def __init__(self, input_dataset):
+    """See `unique()` for details."""
+    super(UniqueDataset, self).__init__()
+    self._input_dataset = input_dataset
+    if input_dataset.output_types not in (dtypes.int32, dtypes.int64,
+                                          dtypes.string):
+      raise TypeError(
+          "`tf.contrib.data.unique()` only supports inputs with a single "
+          "`tf.int32`, `tf.int64`, or `tf.string` component.")
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.unique_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
diff --git a/tensorflow/contrib/decision_trees/proto/BUILD b/tensorflow/contrib/decision_trees/proto/BUILD
index 87c80740a8f0c0721394b5d832bc96e548e3a313..f6de5998d73a4869d2444cd90c9b64d1a2c889ac 100644
--- a/tensorflow/contrib/decision_trees/proto/BUILD
+++ b/tensorflow/contrib/decision_trees/proto/BUILD
@@ -7,7 +7,11 @@ exports_files([
     "generic_tree_model_proto.swig",
 ])
 
-load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_proto_library",
+    "tf_pyclif_proto_library",
+)
 
 filegroup(
     name = "all_files",
@@ -34,3 +38,10 @@ tf_proto_library(
     protodeps = [":generic_tree_model"],
     visibility = ["//visibility:public"],
 )
+
+tf_pyclif_proto_library(
+    name = "generic_tree_model_pyclif",
+    proto_lib = ":generic_tree_model",
+    proto_srcfile = "generic_tree_model.proto",
+    visibility = ["//visibility:public"],
+)
diff --git a/tensorflow/contrib/decision_trees/proto/generic_tree_model_proto.swig b/tensorflow/contrib/decision_trees/proto/generic_tree_model_proto.swig
index d3d201afd5761e7c5c136301c779222bedc68492..cafb9314caee1c4907786b8101e7c71bd7095306 100644
--- a/tensorflow/contrib/decision_trees/proto/generic_tree_model_proto.swig
+++ b/tensorflow/contrib/decision_trees/proto/generic_tree_model_proto.swig
@@ -2,7 +2,7 @@
 
 %include "net/proto/swig/protofunc.swig"
 
-#ifndef MUST_USE_RESULT
+#ifndef ABSL_MUST_USE_RESULT
 #error Use this file only as a %include or %import after google.swig.
 #endif
 
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index b2c641f8ab3ea23c5135042e4b1223d487ae8cbc..7f510c42215f48a9e795eb81bd9f66b0a2108335 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -60,6 +60,7 @@ py_library(
         "//tensorflow/python:nn",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:spectral_ops",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
@@ -127,6 +128,19 @@ cuda_py_test(
     tags = ["no_pip"],
 )
 
+cuda_py_test(
+    name = "autoregressive_test",
+    size = "small",
+    srcs = ["python/kernel_tests/autoregressive_test.py"],
+    additional_deps = [
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "binomial_test",
     size = "small",
@@ -437,6 +451,7 @@ cuda_py_test(
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:spectral_ops_test_util",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
@@ -916,6 +931,22 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "real_nvp_test",
+    size = "small",
+    srcs = ["python/kernel_tests/bijectors/real_nvp_test.py"],
+    additional_deps = [
+        ":bijectors_py",
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "permute_test",
     size = "small",
diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py
index 66827179e9fa1bea852f55246c263c4696cf3bdc..61c411271d0bb8d7b4cc3b14992b82ec1e5674ed 100644
--- a/tensorflow/contrib/distributions/__init__.py
+++ b/tensorflow/contrib/distributions/__init__.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 # pylint: disable=unused-import,wildcard-import,line-too-long,g-importing-member
 
 from tensorflow.contrib.distributions.python.ops import bijectors
+from tensorflow.contrib.distributions.python.ops.autoregressive import *
 from tensorflow.contrib.distributions.python.ops.binomial import *
 from tensorflow.contrib.distributions.python.ops.cauchy import *
 from tensorflow.contrib.distributions.python.ops.chi2 import *
@@ -39,6 +40,7 @@ from tensorflow.contrib.distributions.python.ops.geometric import *
 from tensorflow.contrib.distributions.python.ops.half_normal import *
 from tensorflow.contrib.distributions.python.ops.independent import *
 from tensorflow.contrib.distributions.python.ops.inverse_gamma import *
+from tensorflow.contrib.distributions.python.ops.kumaraswamy import *
 from tensorflow.contrib.distributions.python.ops.logistic import *
 from tensorflow.contrib.distributions.python.ops.mixture import *
 from tensorflow.contrib.distributions.python.ops.mixture_same_family import *
@@ -84,6 +86,7 @@ from tensorflow.python.ops.distributions.uniform import *
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
+    'auto_correlation',
     'bijectors',
     'Cauchy',
     'ConditionalDistribution',
@@ -92,9 +95,9 @@ _allowed_symbols = [
     'NOT_REPARAMETERIZED',
     'ReparameterizationType',
     'Distribution',
+    'Autoregressive',
     'Binomial',
     'Bernoulli',
-    'BernoulliWithSigmoidProbs',
     'Beta',
     'BetaWithSoftplusConcentration',
     'Categorical',
@@ -112,6 +115,7 @@ _allowed_symbols = [
     'Independent',
     'InverseGamma',
     'InverseGammaWithSoftplusConcentrationRate',
+    'Kumaraswamy',
     'Laplace',
     'LaplaceWithSoftplusScale',
     'Logistic',
@@ -159,6 +163,10 @@ _allowed_symbols = [
     'assign_log_moving_mean_exp',
     'moving_mean_variance',
     'estimator_head_distribution_regression',
+    'quadrature_scheme_softmaxnormal_gauss_hermite',
+    'quadrature_scheme_softmaxnormal_quantiles',
+    'quadrature_scheme_lognormal_gauss_hermite',
+    'quadrature_scheme_lognormal_quantiles',
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/autoregressive_test.py b/tensorflow/contrib/distributions/python/kernel_tests/autoregressive_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0928dc3f358ede693865a8d1ff9257a0ecbe9499
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/autoregressive_test.py
@@ -0,0 +1,94 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import autoregressive as autoregressive_lib
+from tensorflow.contrib.distributions.python.ops import independent as independent_lib
+from tensorflow.contrib.distributions.python.ops import test_util
+from tensorflow.contrib.distributions.python.ops.bijectors.affine import Affine
+from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive import MaskedAutoregressiveFlow
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.ops.distributions import transformed_distribution as transformed_distribution_lib
+from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.platform import test
+
+
+class AutogressiveTest(test_util.VectorDistributionTestHelpers, test.TestCase):
+  """Tests the Autoregressive distribution."""
+
+  def setUp(self):
+    self._rng = np.random.RandomState(42)
+
+  def _random_scale_tril(self, event_size):
+    n = np.int32(event_size * (event_size + 1) // 2)
+    p = 2. * self._rng.random_sample(n).astype(np.float32) - 1.
+    return distribution_util.fill_triangular(0.25 * p)
+
+  def _normal_fn(self, affine_bijector):
+    def _fn(samples):
+      scale = math_ops.exp(affine_bijector.forward(samples))
+      return independent_lib.Independent(
+          normal_lib.Normal(loc=0., scale=scale, validate_args=True),
+          reinterpreted_batch_ndims=1)
+    return _fn
+
+  def testSampleAndLogProbConsistency(self):
+    batch_shape = []
+    event_size = 2
+    with self.test_session() as sess:
+      batch_event_shape = np.concatenate([batch_shape, [event_size]], axis=0)
+      sample0 = array_ops.zeros(batch_event_shape)
+      affine = Affine(scale_tril=self._random_scale_tril(event_size))
+      ar = autoregressive_lib.Autoregressive(
+          self._normal_fn(affine), sample0, validate_args=True)
+      self.run_test_sample_consistent_log_prob(
+          sess.run, ar, radius=1., center=0., rtol=0.01)
+
+  def testCompareToBijector(self):
+    """Demonstrates equivalence between TD, Bijector approach and AR dist."""
+    sample_shape = np.int32([4, 5])
+    batch_shape = np.int32([])
+    event_size = np.int32(2)
+    with self.test_session() as sess:
+      batch_event_shape = np.concatenate([batch_shape, [event_size]], axis=0)
+      sample0 = array_ops.zeros(batch_event_shape)
+      affine = Affine(scale_tril=self._random_scale_tril(event_size))
+      ar = autoregressive_lib.Autoregressive(
+          self._normal_fn(affine), sample0, validate_args=True)
+      ar_flow = MaskedAutoregressiveFlow(
+          is_constant_jacobian=True,
+          shift_and_log_scale_fn=lambda x: [None, affine.forward(x)],
+          validate_args=True)
+      td = transformed_distribution_lib.TransformedDistribution(
+          distribution=normal_lib.Normal(loc=0., scale=1.),
+          bijector=ar_flow,
+          event_shape=[event_size],
+          batch_shape=batch_shape,
+          validate_args=True)
+      x_shape = np.concatenate(
+          [sample_shape, batch_shape, [event_size]], axis=0)
+      x = 2. * self._rng.random_sample(x_shape).astype(np.float32) - 1.
+      td_log_prob_, ar_log_prob_ = sess.run([td.log_prob(x), ar.log_prob(x)])
+      self.assertAllClose(td_log_prob_, ar_log_prob_, atol=0., rtol=1e-6)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py
index 25a9b6f5fe2ed6d218d6b44650fce17fa89c0664..dcfb0eb05185d36d96947905c2eb91b2201aece1 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py
@@ -22,9 +22,9 @@ import numpy as np
 
 from tensorflow.contrib.distributions.python.ops import test_util
 from tensorflow.contrib.distributions.python.ops.bijectors.invert import Invert
+from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive import _gen_mask
 from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive import masked_autoregressive_default_template
 from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive import MaskedAutoregressiveFlow
-from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive_impl import _gen_mask
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variables
@@ -149,5 +149,17 @@ class MaskedAutoregressiveFlowShiftOnlyTest(MaskedAutoregressiveFlowTest):
     }
 
 
+class MaskedAutoregressiveFlowUnrollLoopTest(MaskedAutoregressiveFlowTest):
+
+  @property
+  def _autoregressive_flow_kwargs(self):
+    return {
+        "shift_and_log_scale_fn": masked_autoregressive_default_template(
+            hidden_layers=[2], shift_only=False),
+        "is_constant_jacobian": False,
+        "unroll_loop": True,
+    }
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..46fe7797419a9906ecdad60dd0dfe1e9d7c743ed
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/real_nvp_test.py
@@ -0,0 +1,144 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for MaskedAutoregressiveFlow."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from tensorflow.contrib.distributions.python.ops import test_util
+from tensorflow.contrib.distributions.python.ops.bijectors.invert import Invert
+from tensorflow.contrib.distributions.python.ops.bijectors.real_nvp import real_nvp_default_template
+from tensorflow.contrib.distributions.python.ops.bijectors.real_nvp import RealNVP
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.ops.distributions import transformed_distribution as transformed_distribution_lib
+from tensorflow.python.platform import test
+
+
+class RealNVPTest(test_util.VectorDistributionTestHelpers, test.TestCase):
+
+  @property
+  def _real_nvp_kwargs(self):
+    return {
+        "shift_and_log_scale_fn": real_nvp_default_template(
+            hidden_layers=[3], shift_only=False),
+        "is_constant_jacobian": False,
+    }
+
+  def testBijector(self):
+    x_ = np.arange(3 * 4 * 2).astype(np.float32).reshape(3, 4 * 2)
+    with self.test_session() as sess:
+      nvp = RealNVP(
+          num_masked=4,
+          validate_args=True,
+          **self._real_nvp_kwargs)
+      x = constant_op.constant(x_)
+      forward_x = nvp.forward(x)
+      # Use identity to invalidate cache.
+      inverse_y = nvp.inverse(array_ops.identity(forward_x))
+      fldj = nvp.forward_log_det_jacobian(x)
+      # Use identity to invalidate cache.
+      ildj = nvp.inverse_log_det_jacobian(array_ops.identity(forward_x))
+      variables.global_variables_initializer().run()
+      [
+          forward_x_,
+          inverse_y_,
+          ildj_,
+          fldj_,
+      ] = sess.run([
+          forward_x,
+          inverse_y,
+          ildj,
+          fldj,
+      ])
+      self.assertEqual("real_nvp", nvp.name)
+      self.assertAllClose(forward_x_, forward_x_, rtol=1e-6, atol=0.)
+      self.assertAllClose(x_, inverse_y_, rtol=1e-5, atol=0.)
+      self.assertAllClose(ildj_, -fldj_, rtol=1e-6, atol=0.)
+
+  def testMutuallyConsistent(self):
+    dims = 4
+    with self.test_session() as sess:
+      nvp = RealNVP(
+          num_masked=3,
+          validate_args=True,
+          **self._real_nvp_kwargs)
+      dist = transformed_distribution_lib.TransformedDistribution(
+          distribution=normal_lib.Normal(loc=0., scale=1.),
+          bijector=nvp,
+          event_shape=[dims],
+          validate_args=True)
+      self.run_test_sample_consistent_log_prob(
+          sess_run_fn=sess.run,
+          dist=dist,
+          num_samples=int(1e5),
+          radius=1.,
+          center=0.,
+          rtol=0.02)
+
+  def testInvertMutuallyConsistent(self):
+    dims = 4
+    with self.test_session() as sess:
+      nvp = Invert(RealNVP(
+          num_masked=3,
+          validate_args=True,
+          **self._real_nvp_kwargs))
+      dist = transformed_distribution_lib.TransformedDistribution(
+          distribution=normal_lib.Normal(loc=0., scale=1.),
+          bijector=nvp,
+          event_shape=[dims],
+          validate_args=True)
+      self.run_test_sample_consistent_log_prob(
+          sess_run_fn=sess.run,
+          dist=dist,
+          num_samples=int(1e5),
+          radius=1.,
+          center=0.,
+          rtol=0.02)
+
+
+class NICETest(RealNVPTest):
+
+  @property
+  def _real_nvp_kwargs(self):
+    return {
+        "shift_and_log_scale_fn": real_nvp_default_template(
+            hidden_layers=[2], shift_only=True),
+        "is_constant_jacobian": True,
+    }
+
+
+class RealNVPConstantShiftScaleTest(RealNVPTest):
+
+  @property
+  def _real_nvp_kwargs(self):
+
+    def constant_shift_log_scale_fn(x0, output_units):
+      del x0, output_units
+      shift = constant_op.constant([0.1])
+      log_scale = constant_op.constant([0.5])
+      return shift, log_scale
+
+    return {
+        "shift_and_log_scale_fn": constant_shift_log_scale_fn,
+        "is_constant_jacobian": True,
+    }
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
index 38b3a23c2d684a6f89b7c4be4a763c649bf4de15..e216d88cb190dc16fc0056186f80817d6f2d7c67 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
@@ -22,14 +22,28 @@ import numpy as np
 
 from tensorflow.contrib.distributions.python.ops.bijectors.reshape import Reshape
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
 from tensorflow.python.platform import test
 
 
-class ReshapeBijectorTest(test.TestCase):
-  """Tests correctness of the reshape transformation."""
+@test_util.with_c_api
+class _ReshapeBijectorTest(object):
+  """Base class for testing the reshape transformation.
+
+  Methods defined in this class call a method self.build_shapes() that
+  is implemented by subclasses defined below, returning respectively
+   ReshapeBijectorTestStatic: static shapes,
+   ReshapeBijectorTestDynamic: shape placeholders of known ndims, and
+   ReshapeBijectorTestDynamicNdims: shape placeholders of unspecified ndims,
+  so that each test in this base class is automatically run over all
+  three cases. The subclasses also implement assertRaisesError to test
+  for either Python exceptions (in the case of static shapes) or
+  TensorFlow op errors (dynamic shapes).
+  """
 
   def setUp(self):
     self._rng = np.random.RandomState(42)
@@ -40,9 +54,10 @@ class ReshapeBijectorTest(test.TestCase):
     expected_y = np.reshape(expected_x, [4, 6])
 
     with self.test_session() as sess:
+      shape_in, shape_out, feed_dict = self.build_shapes([3, 2], [6,])
       bijector = Reshape(
-          event_shape_out=[6,],
-          event_shape_in=[3, 2],
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
           validate_args=True)
       (x_,
        y_,
@@ -52,66 +67,23 @@ class ReshapeBijectorTest(test.TestCase):
            bijector.forward(expected_x),
            bijector.forward_log_det_jacobian(expected_x),
            bijector.inverse_log_det_jacobian(expected_y),
-       ))
+       ), feed_dict=feed_dict)
       self.assertEqual("reshape", bijector.name)
       self.assertAllClose(expected_y, y_, rtol=1e-6, atol=0)
       self.assertAllClose(expected_x, x_, rtol=1e-6, atol=0)
       self.assertAllClose(0., fldj_, rtol=1e-6, atol=0)
       self.assertAllClose(0., ildj_, rtol=1e-6, atol=0)
 
-  def testEventShapeDynamicNdims(self):
-    """Check forward/inverse shape methods with dynamic ndims."""
-
-    shape_in = tensor_shape.TensorShape([6,])
-    shape_in_ph = array_ops.placeholder(dtype=dtypes.int32)
-
-    shape_out = tensor_shape.TensorShape([2, 3])
-    shape_out_ph = array_ops.placeholder(dtype=dtypes.int32)
+  def testEventShapeTensor(self):
+    """Test event_shape_tensor methods when even ndims may be dynamic."""
 
+    shape_in_static = [2, 3]
+    shape_out_static = [6,]
+    shape_in, shape_out, feed_dict = self.build_shapes(shape_in_static,
+                                                       shape_out_static)
     bijector = Reshape(
-        event_shape_out=shape_out_ph,
-        event_shape_in=shape_in_ph, validate_args=True)
-
-    # using the _tensor methods, we should always get a fully-specified
-    # result since these are evaluated at graph runtime.
-    with self.test_session() as sess:
-      (shape_out_,
-       shape_in_) = sess.run((
-           bijector.forward_event_shape_tensor(shape_in),
-           bijector.inverse_event_shape_tensor(shape_out),
-       ), feed_dict={
-           shape_in_ph: shape_in,
-           shape_out_ph: shape_out,
-       })
-      self.assertAllEqual(shape_out, shape_out_)
-      self.assertAllEqual(shape_in, shape_in_)
-
-  def testEventShapeDynamic(self):
-    """Check shape methods with static ndims but dynamic shape."""
-
-    shape_in = tensor_shape.TensorShape([6,])
-    shape_in_partial = tensor_shape.TensorShape([None,])
-    shape_in_ph = array_ops.placeholder(
-        shape=[1,], dtype=dtypes.int32)
-
-    shape_out = tensor_shape.TensorShape([2, 3])
-    shape_out_partial = tensor_shape.TensorShape([None, None])
-    shape_out_ph = array_ops.placeholder(
-        shape=[2,], dtype=dtypes.int32)
-
-    bijector = Reshape(
-        event_shape_out=shape_out_ph,
-        event_shape_in=shape_in_ph,
-        validate_args=True)
-
-    # if event shapes are not statically available, should
-    # return partially-specified TensorShapes.
-    self.assertAllEqual(
-        bijector.forward_event_shape(shape_in).as_list(),
-        shape_out_partial.as_list())
-    self.assertAllEqual(
-        bijector.inverse_event_shape(shape_out).as_list(),
-        shape_in_partial.as_list())
+        event_shape_out=shape_out,
+        event_shape_in=shape_in, validate_args=True)
 
     # using the _tensor methods, we should always get a fully-specified
     # result since these are evaluated at graph runtime.
@@ -120,42 +92,9 @@ class ReshapeBijectorTest(test.TestCase):
        shape_in_) = sess.run((
            bijector.forward_event_shape_tensor(shape_in),
            bijector.inverse_event_shape_tensor(shape_out),
-       ), feed_dict={
-           shape_in_ph: shape_in,
-           shape_out_ph: shape_out,
-       })
-      self.assertAllEqual(shape_out, shape_out_)
-      self.assertAllEqual(shape_in, shape_in_)
-
-  def testEventShapeStatic(self):
-    """Check shape methods when shape is statically known."""
-
-    shape_in = tensor_shape.TensorShape([6,])
-    shape_out = tensor_shape.TensorShape([2, 3])
-
-    bijector_static = Reshape(
-        event_shape_out=shape_out,
-        event_shape_in=shape_in,
-        validate_args=True)
-
-    # test that forward_ and inverse_event_shape do sensible things
-    # when shapes are statically known.
-    self.assertEqual(
-        bijector_static.forward_event_shape(shape_in),
-        shape_out)
-    self.assertEqual(
-        bijector_static.inverse_event_shape(shape_out),
-        shape_in)
-
-    with self.test_session() as sess:
-      (shape_out_static_,
-       shape_in_static_,
-      ) = sess.run((
-          bijector_static.forward_event_shape_tensor(shape_in),
-          bijector_static.inverse_event_shape_tensor(shape_out),
-      ))
-      self.assertAllEqual(shape_out, shape_out_static_)
-      self.assertAllEqual(shape_in, shape_in_static_)
+       ), feed_dict=feed_dict)
+      self.assertAllEqual(shape_out_static, shape_out_)
+      self.assertAllEqual(shape_in_static, shape_in_)
 
   def testScalarReshape(self):
     """Test reshaping to and from a scalar shape ()."""
@@ -166,11 +105,11 @@ class ReshapeBijectorTest(test.TestCase):
     expected_x_scalar = np.random.randn(1,)
     expected_y_scalar = expected_x_scalar[0]
 
+    shape_in, shape_out, feed_dict = self.build_shapes([], [1,])
     with self.test_session() as sess:
       bijector = Reshape(
-          event_shape_out=[],
-          event_shape_in=[1,], validate_args=True)
-
+          event_shape_out=shape_in,
+          event_shape_in=shape_out, validate_args=True)
       (x_,
        y_,
        x_scalar_,
@@ -180,53 +119,179 @@ class ReshapeBijectorTest(test.TestCase):
           bijector.forward(expected_x),
           bijector.inverse(expected_y_scalar),
           bijector.forward(expected_x_scalar),
-      ))
+      ), feed_dict=feed_dict)
       self.assertAllClose(expected_y, y_, rtol=1e-6, atol=0)
       self.assertAllClose(expected_x, x_, rtol=1e-6, atol=0)
       self.assertAllClose(expected_y_scalar, y_scalar_, rtol=1e-6, atol=0)
       self.assertAllClose(expected_x_scalar, x_scalar_, rtol=1e-6, atol=0)
 
-  def testRaisesOpError(self):
-    x1 = np.random.randn(4, 2, 3)
-    x2 = np.random.randn(4, 3, 2)
-    x3 = np.random.randn(4, 5, 1, 1)
+  def testMultipleUnspecifiedDimensionsOpError(self):
 
     with self.test_session() as sess:
-      shape_in_ph = array_ops.placeholder(shape=[2,], dtype=dtypes.int32)
-      shape_out_ph = array_ops.placeholder(shape=[3,], dtype=dtypes.int32)
+      shape_in, shape_out, feed_dict = self.build_shapes([2, 3], [4, -1, -1,])
       bijector = Reshape(
-          event_shape_out=shape_out_ph,
-          event_shape_in=shape_in_ph,
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
           validate_args=True)
 
-      with self.assertRaisesOpError(
+      with self.assertRaisesError(
+          "elements must have at most one `-1`."):
+        sess.run(bijector.forward_event_shape_tensor(shape_in),
+                 feed_dict=feed_dict)
+
+  # pylint: disable=invalid-name
+  def _testInvalidDimensionsOpError(self, expected_error_message):
+
+    with self.test_session() as sess:
+
+      shape_in, shape_out, feed_dict = self.build_shapes([2, 3], [1, 2, -2,])
+      bijector = Reshape(
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
+          validate_args=True)
+
+      with self.assertRaisesError(expected_error_message):
+        sess.run(bijector.forward_event_shape_tensor(shape_in),
+                 feed_dict=feed_dict)
+  # pylint: enable=invalid-name
+
+  def testValidButNonMatchingInputOpError(self):
+    x = np.random.randn(4, 3, 2)
+
+    with self.test_session() as sess:
+      shape_in, shape_out, feed_dict = self.build_shapes([2, 3], [1, 6, 1,])
+      bijector = Reshape(
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
+          validate_args=True)
+
+      # Here we pass in a tensor (x) whose shape is compatible with
+      # the output shape, so tf.reshape will throw no error, but
+      # doesn't match the expected input shape.
+      with self.assertRaisesError(
+          "Input `event_shape` does not match `event_shape_in`."):
+        sess.run(bijector.forward(x),
+                 feed_dict=feed_dict)
+
+  def testValidButNonMatchingInputPartiallySpecifiedOpError(self):
+    x = np.random.randn(4, 3, 2)
+
+    with self.test_session() as sess:
+      shape_in, shape_out, feed_dict = self.build_shapes([2, -1], [1, 6, 1,])
+      bijector = Reshape(
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
+          validate_args=True)
+
+      with self.assertRaisesError(
           "Input `event_shape` does not match `event_shape_in`."):
-        sess.run(bijector.forward(x2),
-                 feed_dict={shape_out_ph: [1, 6, 1],
-                            shape_in_ph: [2, 3]})
-
-      with self.assertRaisesOpError(
-          "event_shape_out entries must be positive."):
-        sess.run(bijector.forward(x1),
-                 feed_dict={shape_out_ph: [-1, -1, 6],
-                            shape_in_ph: [2, 3]})
-
-      # test that *all* methods check basic assertions
-      fd_mismatched = {shape_out_ph: [1, 1, 5], shape_in_ph: [2, 3]}
-      with self.assertRaisesOpError(
-          "Input/output `event_size`s do not match."):
+        sess.run(bijector.forward(x),
+                 feed_dict=feed_dict)
+
+  # pylint: disable=invalid-name
+  def _testInputOutputMismatchOpError(self, expected_error_message):
+    x1 = np.random.randn(4, 2, 3)
+    x2 = np.random.randn(4, 1, 1, 5)
+
+    with self.test_session() as sess:
+      shape_in, shape_out, fd_mismatched = self.build_shapes([2, 3],
+                                                             [1, 1, 5])
+      bijector = Reshape(
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
+          validate_args=True)
+
+      with self.assertRaisesError(expected_error_message):
         sess.run(bijector.forward(x1), feed_dict=fd_mismatched)
-      with self.assertRaisesOpError(
-          "Input/output `event_size`s do not match."):
-        sess.run(bijector.inverse(x3), feed_dict=fd_mismatched)
-      with self.assertRaisesOpError(
-          "Input/output `event_size`s do not match."):
-        sess.run(bijector.inverse_log_det_jacobian(x3),
-                 feed_dict=fd_mismatched)
-      with self.assertRaisesOpError(
-          "Input/output `event_size`s do not match."):
-        sess.run(bijector.forward_log_det_jacobian(x1),
-                 feed_dict=fd_mismatched)
+      with self.assertRaisesError(expected_error_message):
+        sess.run(bijector.inverse(x2), feed_dict=fd_mismatched)
+  # pylint: enable=invalid-name
+
+  def testOneShapePartiallySpecified(self):
+    expected_x = np.random.randn(4, 6)
+    expected_y = np.reshape(expected_x, [4, 2, 3])
+
+    with self.test_session() as sess:
+      # one of input/output shapes is partially specified
+      shape_in, shape_out, feed_dict = self.build_shapes([-1,], [2, 3])
+      bijector = Reshape(
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
+          validate_args=True)
+      (x_,
+       y_,
+      ) = sess.run((
+          bijector.inverse(expected_y),
+          bijector.forward(expected_x),
+      ), feed_dict=feed_dict)
+      self.assertAllClose(expected_y, y_, rtol=1e-6, atol=0)
+      self.assertAllClose(expected_x, x_, rtol=1e-6, atol=0)
+
+  def testBothShapesPartiallySpecified(self):
+    expected_x = np.random.randn(4, 2, 3)
+    expected_y = np.reshape(expected_x, [4, 3, 2])
+    with self.test_session() as sess:
+      shape_in, shape_out, feed_dict = self.build_shapes([-1, 3], [-1, 2])
+      bijector = Reshape(
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
+          validate_args=True)
+      (x_,
+       y_,
+      ) = sess.run((
+          bijector.inverse(expected_y),
+          bijector.forward(expected_x),
+      ), feed_dict=feed_dict)
+      self.assertAllClose(expected_y, y_, rtol=1e-6, atol=0)
+      self.assertAllClose(expected_x, x_, rtol=1e-6, atol=0)
+
+  def testDefaultVectorShape(self):
+    expected_x = np.random.randn(4, 4)
+    expected_y = np.reshape(expected_x, [4, 2, 2])
+    with self.test_session() as sess:
+      _, shape_out, feed_dict = self.build_shapes([-1,], [-1, 2])
+      bijector = Reshape(shape_out,
+                         validate_args=True)
+      (x_,
+       y_,
+      ) = sess.run((
+          bijector.inverse(expected_y),
+          bijector.forward(expected_x),
+      ), feed_dict=feed_dict)
+      self.assertAllClose(expected_y, y_, rtol=1e-6, atol=0)
+      self.assertAllClose(expected_x, x_, rtol=1e-6, atol=0)
+
+  def build_shapes(self, *args, **kwargs):
+    raise NotImplementedError("Subclass failed to implement `build_shapes`.")
+
+
+@test_util.with_c_api
+class ReshapeBijectorTestStatic(test.TestCase, _ReshapeBijectorTest):
+
+  def build_shapes(self, shape_in, shape_out):
+    shape_in_static = shape_in
+    shape_out_static = shape_out
+    feed_dict = {}
+    return shape_in_static, shape_out_static, feed_dict
+
+  def assertRaisesError(self, msg):
+    return self.assertRaisesRegexp(Exception, msg)
+
+  def testEventShape(self):
+    shape_in_static = tensor_shape.TensorShape([2, 3])
+    shape_out_static = tensor_shape.TensorShape([6,])
+    bijector = Reshape(
+        event_shape_out=shape_out_static,
+        event_shape_in=shape_in_static, validate_args=True)
+
+    # test that forward_ and inverse_event_shape do sensible things
+    # when shapes are statically known.
+    self.assertEqual(
+        bijector.forward_event_shape(shape_in_static),
+        shape_out_static)
+    self.assertEqual(
+        bijector.inverse_event_shape(shape_out_static),
+        shape_in_static)
 
   def testBijectiveAndFinite(self):
     x = np.random.randn(4, 2, 3)
@@ -238,5 +303,62 @@ class ReshapeBijectorTest(test.TestCase):
           validate_args=True)
       assert_bijective_and_finite(bijector, x, y, rtol=1e-6, atol=0)
 
+  def testInvalidDimensionsOpError(self):
+    if ops._USE_C_API:
+      error_message = "Invalid value in tensor used for shape: -2"
+    else:
+      error_message = "elements must be either positive integers or `-1`."
+    self._testInvalidDimensionsOpError(error_message)
+
+  def testInputOutputMismatchOpError(self):
+    if ops._USE_C_API:
+      error_message = "Cannot reshape a tensor with"
+    else:
+      error_message = "Input to reshape is a tensor with"
+    self._testInputOutputMismatchOpError(error_message)
+
+
+@test_util.with_c_api
+class ReshapeBijectorTestDynamic(test.TestCase, _ReshapeBijectorTest):
+
+  def build_shapes(self, shape_in, shape_out):
+    shape_in_ph = array_ops.placeholder(shape=(len(shape_in),),
+                                        dtype=dtypes.int32)
+    shape_out_ph = array_ops.placeholder(shape=(len(shape_out),),
+                                         dtype=dtypes.int32)
+    feed_dict = {shape_in_ph: shape_in, shape_out_ph: shape_out}
+    return shape_in_ph, shape_out_ph, feed_dict
+
+  def assertRaisesError(self, msg):
+    return self.assertRaisesOpError(msg)
+
+  def testInvalidDimensionsOpError(self):
+    self._testInvalidDimensionsOpError(
+        "elements must be either positive integers or `-1`.")
+
+  def testInputOutputMismatchOpError(self):
+    self._testInputOutputMismatchOpError("Input to reshape is a tensor with")
+
+
+@test_util.with_c_api
+class ReshapeBijectorTestDynamicNdims(test.TestCase, _ReshapeBijectorTest):
+
+  def build_shapes(self, shape_in, shape_out):
+    shape_in_ph = array_ops.placeholder(shape=None, dtype=dtypes.int32)
+    shape_out_ph = array_ops.placeholder(shape=None, dtype=dtypes.int32)
+    feed_dict = {shape_in_ph: shape_in, shape_out_ph: shape_out}
+    return shape_in_ph, shape_out_ph, feed_dict
+
+  def assertRaisesError(self, msg):
+    return self.assertRaisesOpError(msg)
+
+  def testInvalidDimensionsOpError(self):
+    self._testInvalidDimensionsOpError(
+        "elements must be either positive integers or `-1`.")
+
+  def testInputOutputMismatchOpError(self):
+    self._testInputOutputMismatchOpError("Input to reshape is a tensor with")
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
index 2d74aa1f320149d0f7ef9e9c52b8c7053c2f74d7..31d24aa9ea09007b8db40e4869371b1f62639ac7 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
@@ -23,10 +23,15 @@ import itertools
 import numpy as np
 
 from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.contrib.distributions.python.ops import mixture
+from tensorflow.contrib.distributions.python.ops import mixture_same_family
+from tensorflow.contrib.distributions.python.ops import mvn_diag
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions import categorical
+from tensorflow.python.ops.distributions import normal
 from tensorflow.python.ops.linalg import linear_operator_diag
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
@@ -395,5 +400,145 @@ class MixtureStddevTest(test.TestCase):
     self.assertAllClose(actual_devs, expected_devs)
 
 
+class PadMixtureDimensionsTest(test.TestCase):
+
+  def test_pad_mixture_dimensions_mixture(self):
+    with self.test_session() as sess:
+      gm = mixture.Mixture(
+          cat=categorical.Categorical(probs=[[0.3, 0.7]]),
+          components=[
+              normal.Normal(loc=[-1.0], scale=[1.0]),
+              normal.Normal(loc=[1.0], scale=[0.5])
+          ])
+
+      x = array_ops.constant([[1.0, 2.0], [3.0, 4.0]])
+      x_pad = distribution_util.pad_mixture_dimensions(
+          x, gm, gm.cat, gm.event_shape.ndims)
+      x_out, x_pad_out = sess.run([x, x_pad])
+
+    self.assertAllEqual(x_pad_out.shape, [2, 2])
+    self.assertAllEqual(x_out.reshape([-1]), x_pad_out.reshape([-1]))
+
+  def test_pad_mixture_dimensions_mixture_same_family(self):
+    with self.test_session() as sess:
+      gm = mixture_same_family.MixtureSameFamily(
+          mixture_distribution=categorical.Categorical(probs=[0.3, 0.7]),
+          components_distribution=mvn_diag.MultivariateNormalDiag(
+              loc=[[-1., 1], [1, -1]], scale_identity_multiplier=[1.0, 0.5]))
+
+      x = array_ops.constant([[1.0, 2.0], [3.0, 4.0]])
+      x_pad = distribution_util.pad_mixture_dimensions(
+          x, gm, gm.mixture_distribution, gm.event_shape.ndims)
+      x_out, x_pad_out = sess.run([x, x_pad])
+
+    self.assertAllEqual(x_pad_out.shape, [2, 2, 1])
+    self.assertAllEqual(x_out.reshape([-1]), x_pad_out.reshape([-1]))
+
+
+class _PadTest(object):
+
+  def testNegAxisCorrectness(self):
+    x_ = np.float32([[1., 2, 3],
+                     [4, 5, 6]])
+    value_ = np.float32(0.25)
+    count_ = np.int32(2)
+    with self.test_session() as sess:
+      x = array_ops.placeholder_with_default(
+          x_, shape=x_.shape if self.is_static_shape else None)
+      value = (constant_op.constant(value_) if self.is_static_shape
+               else array_ops.placeholder_with_default(value_, shape=None))
+      count = (constant_op.constant(count_) if self.is_static_shape
+               else array_ops.placeholder_with_default(count_, shape=None))
+
+      x0_front = distribution_util.pad(
+          x, axis=-2, value=value, count=count, front=True)
+      x0_back = distribution_util.pad(
+          x, axis=-2, count=count, back=True)
+      x0_both = distribution_util.pad(
+          x, axis=-2, value=value, front=True, back=True)
+
+      if self.is_static_shape:
+        self.assertAllEqual([4, 3], x0_front.shape)
+        self.assertAllEqual([4, 3], x0_back.shape)
+        self.assertAllEqual([4, 3], x0_both.shape)
+
+      [x0_front_, x0_back_, x0_both_] = sess.run([
+          x0_front, x0_back, x0_both])
+
+      self.assertAllClose(
+          np.float32([[value_]*3,
+                      [value_]*3,
+                      [1, 2, 3],
+                      [4, 5, 6]]),
+          x0_front_, atol=0., rtol=1e-6)
+      self.assertAllClose(
+          np.float32([[1, 2, 3],
+                      [4, 5, 6],
+                      [0.]*3,
+                      [0.]*3]),
+          x0_back_, atol=0., rtol=1e-6)
+      self.assertAllClose(
+          np.float32([[value_]*3,
+                      [1, 2, 3],
+                      [4, 5, 6],
+                      [value_]*3]),
+          x0_both_, atol=0., rtol=1e-6)
+
+  def testPosAxisCorrectness(self):
+    x_ = np.float32([[1., 2, 3],
+                     [4, 5, 6]])
+    value_ = np.float32(0.25)
+    count_ = np.int32(2)
+    with self.test_session() as sess:
+      x = array_ops.placeholder_with_default(
+          x_, shape=x_.shape if self.is_static_shape else None)
+      value = (constant_op.constant(value_) if self.is_static_shape
+               else array_ops.placeholder_with_default(value_, shape=None))
+      count = (constant_op.constant(count_) if self.is_static_shape
+               else array_ops.placeholder_with_default(count_, shape=None))
+
+      x1_front = distribution_util.pad(
+          x, axis=1, value=value, count=count, front=True)
+      x1_back = distribution_util.pad(
+          x, axis=1, count=count, back=True)
+      x1_both = distribution_util.pad(
+          x, axis=1, value=value, front=True, back=True)
+
+      if self.is_static_shape:
+        self.assertAllEqual([2, 5], x1_front.shape)
+        self.assertAllEqual([2, 5], x1_back.shape)
+        self.assertAllEqual([2, 5], x1_both.shape)
+
+      [x1_front_, x1_back_, x1_both_] = sess.run([
+          x1_front, x1_back, x1_both])
+
+      self.assertAllClose(
+          np.float32([[value_]*2 + [1, 2, 3],
+                      [value_]*2 + [4, 5, 6]]),
+          x1_front_, atol=0., rtol=1e-6)
+      self.assertAllClose(
+          np.float32([[1, 2, 3] + [0.]*2,
+                      [4, 5, 6] + [0.]*2]),
+          x1_back_, atol=0., rtol=1e-6)
+      self.assertAllClose(
+          np.float32([[value_, 1, 2, 3, value_],
+                      [value_, 4, 5, 6, value_]]),
+          x1_both_, atol=0., rtol=1e-6)
+
+
+class PadStaticTest(_PadTest, test.TestCase):
+
+  @property
+  def is_static_shape(self):
+    return True
+
+
+class PadDynamicTest(_PadTest, test.TestCase):
+
+  @property
+  def is_static_shape(self):
+    return False
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py b/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py
index a7571806f295af4566e57ac4a785bc8774fd31ab..a4e75660083dc2edd1759a3a54e221d9e8a268c3 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import importlib
 import numpy as np
 
+from tensorflow.contrib.distributions.python.ops import half_normal as hn_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -28,7 +29,6 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import variables
-from tensorflow.contrib.distributions.python.ops import half_normal as hn_lib
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 
@@ -200,7 +200,7 @@ class HalfNormalTest(test.TestCase):
     with self.test_session():
       scale = np.array([[1.0, 2.0, 3.0]])
       halfnorm = hn_lib.HalfNormal(scale=scale)
-      
+
       # See https://en.wikipedia.org/wiki/Half-normal_distribution for the
       # entropy formula used here.
       expected_entropy = 0.5 * np.log(np.pi * scale ** 2.0 / 2.0) + 0.5
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py b/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea3c86b5c0f42b64fc6e4e362cbcc162bccf74a2
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/kumaraswamy_test.py
@@ -0,0 +1,388 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import importlib
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import kumaraswamy as kumaraswamy_lib
+from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+special = try_import("scipy.special")
+stats = try_import("scipy.stats")
+
+
+def _kumaraswamy_mode(a, b):
+  a = np.asarray(a)
+  b = np.asarray(b)
+  return ((a - 1) / (a * b - 1))**(1 / a)
+
+
+def _kumaraswamy_moment(a, b, n):
+  a = np.asarray(a)
+  b = np.asarray(b)
+  return b * special.beta(1.0 + n / a, b)
+
+
+def _harmonic_number(b):
+  b = np.asarray(b)
+  return special.psi(b + 1) - special.psi(1)
+
+
+def _kumaraswamy_cdf(a, b, x):
+  a = np.asarray(a)
+  b = np.asarray(b)
+  x = np.asarray(x)
+  return 1 - (1 - x**a)**b
+
+
+def _kumaraswamy_pdf(a, b, x):
+  a = np.asarray(a)
+  b = np.asarray(b)
+  x = np.asarray(x)
+  return a * b * x ** (a - 1) * (1 - x ** a) ** (b - 1)
+
+
+class KumaraswamyTest(test.TestCase):
+
+  def testSimpleShapes(self):
+    with self.test_session():
+      a = np.random.rand(3)
+      b = np.random.rand(3)
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertAllEqual([], dist.event_shape_tensor().eval())
+      self.assertAllEqual([3], dist.batch_shape_tensor().eval())
+      self.assertEqual(tensor_shape.TensorShape([]), dist.event_shape)
+      self.assertEqual(tensor_shape.TensorShape([3]), dist.batch_shape)
+
+  def testComplexShapes(self):
+    with self.test_session():
+      a = np.random.rand(3, 2, 2)
+      b = np.random.rand(3, 2, 2)
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertAllEqual([], dist.event_shape_tensor().eval())
+      self.assertAllEqual([3, 2, 2], dist.batch_shape_tensor().eval())
+      self.assertEqual(tensor_shape.TensorShape([]), dist.event_shape)
+      self.assertEqual(tensor_shape.TensorShape([3, 2, 2]), dist.batch_shape)
+
+  def testComplexShapesBroadcast(self):
+    with self.test_session():
+      a = np.random.rand(3, 2, 2)
+      b = np.random.rand(2, 2)
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertAllEqual([], dist.event_shape_tensor().eval())
+      self.assertAllEqual([3, 2, 2], dist.batch_shape_tensor().eval())
+      self.assertEqual(tensor_shape.TensorShape([]), dist.event_shape)
+      self.assertEqual(tensor_shape.TensorShape([3, 2, 2]), dist.batch_shape)
+
+  def testAProperty(self):
+    a = [[1., 2, 3]]
+    b = [[2., 4, 3]]
+    with self.test_session():
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertEqual([1, 3], dist.concentration1.get_shape())
+      self.assertAllClose(a, dist.concentration1.eval())
+
+  def testBProperty(self):
+    a = [[1., 2, 3]]
+    b = [[2., 4, 3]]
+    with self.test_session():
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertEqual([1, 3], dist.concentration0.get_shape())
+      self.assertAllClose(b, dist.concentration0.eval())
+
+  def testPdfXProper(self):
+    a = [[1., 2, 3]]
+    b = [[2., 4, 3]]
+    with self.test_session():
+      dist = kumaraswamy_lib.Kumaraswamy(a, b, validate_args=True)
+      dist.prob([.1, .3, .6]).eval()
+      dist.prob([.2, .3, .5]).eval()
+      # Either condition can trigger.
+      with self.assertRaisesOpError("sample must be positive"):
+        dist.prob([-1., 0.1, 0.5]).eval()
+      with self.assertRaisesOpError("sample must be positive"):
+        dist.prob([0., 0.1, 0.5]).eval()
+      with self.assertRaisesOpError("sample must be no larger than `1`"):
+        dist.prob([.1, .2, 1.2]).eval()
+
+  def testPdfTwoBatches(self):
+    with self.test_session():
+      a = [1., 2]
+      b = [1., 2]
+      x = [.5, .5]
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      pdf = dist.prob(x)
+      expected_pdf = _kumaraswamy_pdf(a, b, x)
+      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertEqual((2,), pdf.get_shape())
+
+  def testPdfTwoBatchesNontrivialX(self):
+    with self.test_session():
+      a = [1., 2]
+      b = [1., 2]
+      x = [.3, .7]
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      pdf = dist.prob(x)
+      expected_pdf = _kumaraswamy_pdf(a, b, x)
+      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertEqual((2,), pdf.get_shape())
+
+  def testPdfUniformZeroBatch(self):
+    with self.test_session():
+      # This is equivalent to a uniform distribution
+      a = 1.
+      b = 1.
+      x = np.array([.1, .2, .3, .5, .8], dtype=np.float32)
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      pdf = dist.prob(x)
+      expected_pdf = _kumaraswamy_pdf(a, b, x)
+      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertEqual((5,), pdf.get_shape())
+
+  def testPdfAStretchedInBroadcastWhenSameRank(self):
+    with self.test_session():
+      a = [[1., 2]]
+      b = [[1., 2]]
+      x = [[.5, .5], [.3, .7]]
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      pdf = dist.prob(x)
+      expected_pdf = _kumaraswamy_pdf(a, b, x)
+      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertEqual((2, 2), pdf.get_shape())
+
+  def testPdfAStretchedInBroadcastWhenLowerRank(self):
+    with self.test_session():
+      a = [1., 2]
+      b = [1., 2]
+      x = [[.5, .5], [.2, .8]]
+      pdf = kumaraswamy_lib.Kumaraswamy(a, b).prob(x)
+      expected_pdf = _kumaraswamy_pdf(a, b, x)
+      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertEqual((2, 2), pdf.get_shape())
+
+  def testPdfXStretchedInBroadcastWhenSameRank(self):
+    with self.test_session():
+      a = [[1., 2], [2., 3]]
+      b = [[1., 2], [2., 3]]
+      x = [[.5, .5]]
+      pdf = kumaraswamy_lib.Kumaraswamy(a, b).prob(x)
+      expected_pdf = _kumaraswamy_pdf(a, b, x)
+      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertEqual((2, 2), pdf.get_shape())
+
+  def testPdfXStretchedInBroadcastWhenLowerRank(self):
+    with self.test_session():
+      a = [[1., 2], [2., 3]]
+      b = [[1., 2], [2., 3]]
+      x = [.5, .5]
+      pdf = kumaraswamy_lib.Kumaraswamy(a, b).prob(x)
+      expected_pdf = _kumaraswamy_pdf(a, b, x)
+      self.assertAllClose(expected_pdf, pdf.eval())
+      self.assertEqual((2, 2), pdf.get_shape())
+
+  def testKumaraswamyMean(self):
+    with session.Session():
+      a = [1., 2, 3]
+      b = [2., 4, 1.2]
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertEqual(dist.mean().get_shape(), (3,))
+      if not stats:
+        return
+      expected_mean = _kumaraswamy_moment(a, b, 1)
+      self.assertAllClose(expected_mean, dist.mean().eval())
+
+  def testKumaraswamyVariance(self):
+    with session.Session():
+      a = [1., 2, 3]
+      b = [2., 4, 1.2]
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertEqual(dist.variance().get_shape(), (3,))
+      if not stats:
+        return
+      expected_variance = _kumaraswamy_moment(a, b, 2) - _kumaraswamy_moment(
+          a, b, 1)**2
+      self.assertAllClose(expected_variance, dist.variance().eval())
+
+  def testKumaraswamyMode(self):
+    with session.Session():
+      a = np.array([1.1, 2, 3])
+      b = np.array([2., 4, 1.2])
+      expected_mode = _kumaraswamy_mode(a, b)
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertEqual(dist.mode().get_shape(), (3,))
+      self.assertAllClose(expected_mode, dist.mode().eval())
+
+  def testKumaraswamyModeInvalid(self):
+    with session.Session():
+      a = np.array([1., 2, 3])
+      b = np.array([2., 4, 1.2])
+      dist = kumaraswamy_lib.Kumaraswamy(a, b, allow_nan_stats=False)
+      with self.assertRaisesOpError("Condition x < y.*"):
+        dist.mode().eval()
+
+      a = np.array([2., 2, 3])
+      b = np.array([1., 4, 1.2])
+      dist = kumaraswamy_lib.Kumaraswamy(a, b, allow_nan_stats=False)
+      with self.assertRaisesOpError("Condition x < y.*"):
+        dist.mode().eval()
+
+  def testKumaraswamyModeEnableAllowNanStats(self):
+    with session.Session():
+      a = np.array([1., 2, 3])
+      b = np.array([2., 4, 1.2])
+      dist = kumaraswamy_lib.Kumaraswamy(a, b, allow_nan_stats=True)
+
+      expected_mode = _kumaraswamy_mode(a, b)
+      expected_mode[0] = np.nan
+      self.assertEqual((3,), dist.mode().get_shape())
+      self.assertAllClose(expected_mode, dist.mode().eval())
+
+      a = np.array([2., 2, 3])
+      b = np.array([1., 4, 1.2])
+      dist = kumaraswamy_lib.Kumaraswamy(a, b, allow_nan_stats=True)
+
+      expected_mode = _kumaraswamy_mode(a, b)
+      expected_mode[0] = np.nan
+      self.assertEqual((3,), dist.mode().get_shape())
+      self.assertAllClose(expected_mode, dist.mode().eval())
+
+  def testKumaraswamyEntropy(self):
+    with session.Session():
+      a = np.array([1., 2, 3])
+      b = np.array([2., 4, 1.2])
+      dist = kumaraswamy_lib.Kumaraswamy(a, b)
+      self.assertEqual(dist.entropy().get_shape(), (3,))
+      if not stats:
+        return
+      expected_entropy = (1 - 1. / a) + (
+          1 - 1. / b) * _harmonic_number(b) + np.log(a * b)
+      self.assertAllClose(expected_entropy, dist.entropy().eval())
+
+  def testKumaraswamySample(self):
+    with self.test_session():
+      a = 1.
+      b = 2.
+      kumaraswamy = kumaraswamy_lib.Kumaraswamy(a, b)
+      n = constant_op.constant(100000)
+      samples = kumaraswamy.sample(n)
+      sample_values = samples.eval()
+      self.assertEqual(sample_values.shape, (100000,))
+      self.assertFalse(np.any(sample_values < 0.0))
+      if not stats:
+        return
+      self.assertLess(
+          stats.kstest(
+              # Kumaraswamy is a univariate distribution.
+              sample_values,
+              lambda x: _kumaraswamy_cdf(1., 2., x))[0],
+          0.01)
+      # The standard error of the sample mean is 1 / (sqrt(18 * n))
+      expected_mean = _kumaraswamy_moment(a, b, 1)
+      self.assertAllClose(sample_values.mean(axis=0), expected_mean, atol=1e-2)
+      expected_variance = _kumaraswamy_moment(a, b, 2) - _kumaraswamy_moment(
+          a, b, 1)**2
+      self.assertAllClose(
+          np.cov(sample_values, rowvar=0), expected_variance, atol=1e-1)
+
+  # Test that sampling with the same seed twice gives the same results.
+  def testKumaraswamySampleMultipleTimes(self):
+    with self.test_session():
+      a_val = 1.
+      b_val = 2.
+      n_val = 100
+
+      random_seed.set_random_seed(654321)
+      kumaraswamy1 = kumaraswamy_lib.Kumaraswamy(
+          concentration1=a_val, concentration0=b_val, name="kumaraswamy1")
+      samples1 = kumaraswamy1.sample(n_val, seed=123456).eval()
+
+      random_seed.set_random_seed(654321)
+      kumaraswamy2 = kumaraswamy_lib.Kumaraswamy(
+          concentration1=a_val, concentration0=b_val, name="kumaraswamy2")
+      samples2 = kumaraswamy2.sample(n_val, seed=123456).eval()
+
+      self.assertAllClose(samples1, samples2)
+
+  def testKumaraswamySampleMultidimensional(self):
+    with self.test_session():
+      a = np.random.rand(3, 2, 2).astype(np.float32)
+      b = np.random.rand(3, 2, 2).astype(np.float32)
+      kumaraswamy = kumaraswamy_lib.Kumaraswamy(a, b)
+      n = constant_op.constant(100000)
+      samples = kumaraswamy.sample(n)
+      sample_values = samples.eval()
+      self.assertEqual(sample_values.shape, (100000, 3, 2, 2))
+      self.assertFalse(np.any(sample_values < 0.0))
+      if not stats:
+        return
+      self.assertAllClose(
+          sample_values[:, 1, :].mean(axis=0),
+          _kumaraswamy_moment(a, b, 1)[1, :],
+          atol=1e-1)
+
+  def testKumaraswamyCdf(self):
+    with self.test_session():
+      shape = (30, 40, 50)
+      for dt in (np.float32, np.float64):
+        a = 10. * np.random.random(shape).astype(dt)
+        b = 10. * np.random.random(shape).astype(dt)
+        x = np.random.random(shape).astype(dt)
+        actual = kumaraswamy_lib.Kumaraswamy(a, b).cdf(x).eval()
+        self.assertAllEqual(np.ones(shape, dtype=np.bool), 0. <= x)
+        self.assertAllEqual(np.ones(shape, dtype=np.bool), 1. >= x)
+        if not stats:
+          return
+        self.assertAllClose(
+            _kumaraswamy_cdf(a, b, x), actual, rtol=1e-4, atol=0)
+
+  def testKumaraswamyLogCdf(self):
+    with self.test_session():
+      shape = (30, 40, 50)
+      for dt in (np.float32, np.float64):
+        a = 10. * np.random.random(shape).astype(dt)
+        b = 10. * np.random.random(shape).astype(dt)
+        x = np.random.random(shape).astype(dt)
+        actual = math_ops.exp(kumaraswamy_lib.Kumaraswamy(a,
+                                                          b).log_cdf(x)).eval()
+        self.assertAllEqual(np.ones(shape, dtype=np.bool), 0. <= x)
+        self.assertAllEqual(np.ones(shape, dtype=np.bool), 1. >= x)
+        if not stats:
+          return
+        self.assertAllClose(
+            _kumaraswamy_cdf(a, b, x), actual, rtol=1e-4, atol=0)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mixture_same_family_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mixture_same_family_test.py
index ece6bc077d9e21502fdfd01300a9d3e9f2c9c380..ff6092fc260660b512e8123823c63e98a023af6d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mixture_same_family_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mixture_same_family_test.py
@@ -45,6 +45,17 @@ class MixtureSameFamilyTest(test_util.VectorDistributionTestHelpers,
       self.assertEqual([4, 5], x.shape)
       self.assertEqual([4, 5], log_prob_x.shape)
 
+  def testSampleAndLogProbBatch(self):
+    with self.test_session():
+      gm = mixture_same_family_lib.MixtureSameFamily(
+          mixture_distribution=categorical_lib.Categorical(probs=[[0.3, 0.7]]),
+          components_distribution=normal_lib.Normal(
+              loc=[[-1., 1]], scale=[[0.1, 0.5]]))
+      x = gm.sample([4, 5], seed=42)
+      log_prob_x = gm.log_prob(x)
+      self.assertEqual([4, 5, 1], x.shape)
+      self.assertEqual([4, 5, 1], log_prob_x.shape)
+
   def testSampleAndLogProbShapesBroadcastMix(self):
     mix_probs = np.float32([.3, .7])
     bern_probs = np.float32([[.4, .6], [.25, .75]])
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
index 1e514fe0ff21cd53c8c235da417890773db50c37..02064891758a86c5108e11da6a3666f2d5c56c64 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
@@ -107,7 +107,7 @@ def _test_capture_normal_sample_outputs():
   ds.Normal._call_sample_n = true_normal_call_sample_n
 
 
-def make_univariate_mixture(batch_shape, num_components):
+def make_univariate_mixture(batch_shape, num_components, use_static_graph):
   batch_shape = ops.convert_to_tensor(batch_shape, dtypes.int32)
   logits = random_ops.random_uniform(
       array_ops.concat((batch_shape, [num_components]), axis=0),
@@ -119,11 +119,11 @@ def make_univariate_mixture(batch_shape, num_components):
       for _ in range(num_components)
   ]
   cat = ds.Categorical(logits, dtype=dtypes.int32)
-  return ds.Mixture(cat, components)
+  return ds.Mixture(cat, components, use_static_graph=use_static_graph)
 
 
 def make_multivariate_mixture(batch_shape, num_components, event_shape,
-                              batch_shape_tensor=None):
+                              use_static_graph, batch_shape_tensor=None):
   if batch_shape_tensor is None:
     batch_shape_tensor = batch_shape
   batch_shape_tensor = ops.convert_to_tensor(batch_shape_tensor, dtypes.int32)
@@ -145,15 +145,17 @@ def make_multivariate_mixture(batch_shape, num_components, event_shape,
         loc=loc, scale_diag=scale_diag)
   components = [create_component() for _ in range(num_components)]
   cat = ds.Categorical(logits, dtype=dtypes.int32)
-  return ds.Mixture(cat, components)
+  return ds.Mixture(cat, components, use_static_graph=use_static_graph)
 
 
 class MixtureTest(test.TestCase):
+  use_static_graph = False
 
   def testShapes(self):
     with self.test_session():
       for batch_shape in ([], [1], [2, 3, 4]):
-        dist = make_univariate_mixture(batch_shape, num_components=10)
+        dist = make_univariate_mixture(batch_shape, num_components=10,
+                                       use_static_graph=self.use_static_graph)
         self.assertAllEqual(batch_shape, dist.batch_shape)
         self.assertAllEqual(batch_shape, dist.batch_shape_tensor().eval())
         self.assertAllEqual([], dist.event_shape)
@@ -161,7 +163,8 @@ class MixtureTest(test.TestCase):
 
         for event_shape in ([1], [2]):
           dist = make_multivariate_mixture(
-              batch_shape, num_components=10, event_shape=event_shape)
+              batch_shape, num_components=10, event_shape=event_shape,
+              use_static_graph=self.use_static_graph)
           self.assertAllEqual(batch_shape, dist.batch_shape)
           self.assertAllEqual(batch_shape, dist.batch_shape_tensor().eval())
           self.assertAllEqual(event_shape, dist.event_shape)
@@ -172,7 +175,8 @@ class MixtureTest(test.TestCase):
                                              r"cat.num_classes != len"):
       ds.Mixture(
           ds.Categorical([0.1, 0.5]),  # 2 classes
-          [ds.Normal(loc=1.0, scale=2.0)])
+          [ds.Normal(loc=1.0, scale=2.0)],
+          use_static_graph=self.use_static_graph)
     with self.assertRaisesWithPredicateMatch(
         ValueError, r"\(\) and \(2,\) are not compatible"):
       # The value error is raised because the batch shapes of the
@@ -185,13 +189,15 @@ class MixtureTest(test.TestCase):
                   loc=1.0, scale=2.0),  # scalar dist
               ds.Normal(
                   loc=[1.0, 1.0], scale=[2.0, 2.0])
-          ])
+          ],
+          use_static_graph=self.use_static_graph)
     with self.assertRaisesWithPredicateMatch(ValueError, r"Could not infer"):
       cat_logits = array_ops.placeholder(shape=[1, None], dtype=dtypes.float32)
       ds.Mixture(
           ds.Categorical(cat_logits),
           [ds.Normal(
-              loc=[1.0], scale=[2.0])])
+              loc=[1.0], scale=[2.0])],
+          use_static_graph=self.use_static_graph)
 
   def testBrokenShapesDynamic(self):
     with self.test_session():
@@ -203,29 +209,37 @@ class MixtureTest(test.TestCase):
                   loc=d0_param, scale=d0_param), ds.Normal(
                       loc=d1_param, scale=d1_param)
           ],
-          validate_args=True)
-      with self.assertRaisesOpError(r"batch shape must match"):
+          validate_args=True,
+          use_static_graph=self.use_static_graph)
+
+      if self.use_static_graph:
+        error_string = r"Shapes of all inputs must match"
+      else:
+        error_string = r"batch shape must match"
+
+      with self.assertRaisesOpError(error_string):
         d.sample().eval(feed_dict={d0_param: [2.0, 3.0], d1_param: [1.0]})
-      with self.assertRaisesOpError(r"batch shape must match"):
+      with self.assertRaisesOpError(error_string):
         d.sample().eval(feed_dict={d0_param: [2.0, 3.0], d1_param: 1.0})
 
   def testBrokenTypes(self):
     with self.assertRaisesWithPredicateMatch(TypeError, "Categorical"):
-      ds.Mixture(None, [])
+      ds.Mixture(None, [], use_static_graph=self.use_static_graph)
     cat = ds.Categorical([0.3, 0.2])
     # components must be a list of distributions
     with self.assertRaisesWithPredicateMatch(
         TypeError, "all .* must be Distribution instances"):
-      ds.Mixture(cat, [None])
+      ds.Mixture(cat, [None], use_static_graph=self.use_static_graph)
     with self.assertRaisesWithPredicateMatch(TypeError, "same dtype"):
       ds.Mixture(
           cat, [
               ds.Normal(loc=[1.0], scale=[2.0]),
               ds.Normal(loc=[np.float16(1.0)],
                         scale=[np.float16(2.0)]),
-          ])
+          ], use_static_graph=self.use_static_graph)
     with self.assertRaisesWithPredicateMatch(ValueError, "non-empty list"):
-      ds.Mixture(ds.Categorical([0.3, 0.2]), None)
+      ds.Mixture(ds.Categorical([0.3, 0.2]), None,
+                 use_static_graph=self.use_static_graph)
 
     # TODO(ebrevdo): once distribution Domains have been added, add a
     # test to ensure that the domains of the distributions in a
@@ -235,7 +249,8 @@ class MixtureTest(test.TestCase):
     with self.test_session() as sess:
       for batch_shape in ((), (2,), (2, 3)):
         dist = make_univariate_mixture(
-            batch_shape=batch_shape, num_components=2)
+            batch_shape=batch_shape, num_components=2,
+            use_static_graph=self.use_static_graph)
         mean = dist.mean()
         self.assertEqual(batch_shape, mean.get_shape())
 
@@ -256,7 +271,8 @@ class MixtureTest(test.TestCase):
     with self.test_session() as sess:
       for batch_shape in ((), (2,), (2, 3)):
         dist = make_multivariate_mixture(
-            batch_shape=batch_shape, num_components=2, event_shape=(4,))
+            batch_shape=batch_shape, num_components=2, event_shape=(4,),
+            use_static_graph=self.use_static_graph)
         mean = dist.mean()
         self.assertEqual(batch_shape + (4,), mean.get_shape())
 
@@ -283,7 +299,8 @@ class MixtureTest(test.TestCase):
     with self.test_session() as sess:
       for batch_shape in ((), (2,), (2, 3)):
         dist = make_univariate_mixture(
-            batch_shape=batch_shape, num_components=num_components)
+            batch_shape=batch_shape, num_components=num_components,
+            use_static_graph=self.use_static_graph)
         dev = dist.stddev()
         self.assertEqual(batch_shape, dev.get_shape())
 
@@ -325,7 +342,8 @@ class MixtureTest(test.TestCase):
         dist = make_multivariate_mixture(
             batch_shape=batch_shape,
             num_components=num_components,
-            event_shape=(4,))
+            event_shape=(4,),
+            use_static_graph=self.use_static_graph)
         dev = dist.stddev()
         self.assertEqual(batch_shape + (4,), dev.get_shape())
 
@@ -371,7 +389,8 @@ class MixtureTest(test.TestCase):
                       scale=component_devs[0]),
             ds.Normal(loc=component_means[1],
                       scale=component_devs[1]),
-        ])
+        ],
+        use_static_graph=self.use_static_graph)
     mix_dev = mixture_dist.stddev()
     with self.test_session() as sess:
       actual_stddev = sess.run(mix_dev)
@@ -379,7 +398,8 @@ class MixtureTest(test.TestCase):
 
   def testProbScalarUnivariate(self):
     with self.test_session() as sess:
-      dist = make_univariate_mixture(batch_shape=[], num_components=2)
+      dist = make_univariate_mixture(batch_shape=[], num_components=2,
+                                     use_static_graph=self.use_static_graph)
       for x in [
           np.array(
               [1.0, 2.0], dtype=np.float32), np.array(
@@ -405,7 +425,8 @@ class MixtureTest(test.TestCase):
   def testProbScalarMultivariate(self):
     with self.test_session() as sess:
       dist = make_multivariate_mixture(
-          batch_shape=[], num_components=2, event_shape=[3])
+          batch_shape=[], num_components=2, event_shape=[3],
+          use_static_graph=self.use_static_graph)
       for x in [
           np.array(
               [[-1.0, 0.0, 1.0], [0.5, 1.0, -0.3]], dtype=np.float32), np.array(
@@ -432,7 +453,8 @@ class MixtureTest(test.TestCase):
 
   def testProbBatchUnivariate(self):
     with self.test_session() as sess:
-      dist = make_univariate_mixture(batch_shape=[2, 3], num_components=2)
+      dist = make_univariate_mixture(batch_shape=[2, 3], num_components=2,
+                                     use_static_graph=self.use_static_graph)
 
       for x in [
           np.random.randn(2, 3).astype(np.float32),
@@ -459,7 +481,8 @@ class MixtureTest(test.TestCase):
   def testProbBatchMultivariate(self):
     with self.test_session() as sess:
       dist = make_multivariate_mixture(
-          batch_shape=[2, 3], num_components=2, event_shape=[4])
+          batch_shape=[2, 3], num_components=2, event_shape=[4],
+          use_static_graph=self.use_static_graph)
 
       for x in [
           np.random.randn(2, 3, 4).astype(np.float32),
@@ -487,7 +510,8 @@ class MixtureTest(test.TestCase):
       num_components = 3
       batch_shape = []
       dist = make_univariate_mixture(
-          batch_shape=batch_shape, num_components=num_components)
+          batch_shape=batch_shape, num_components=num_components,
+          use_static_graph=self.use_static_graph)
       n = 4
       with _test_capture_normal_sample_outputs() as component_samples:
         samples = dist.sample(n, seed=123)
@@ -502,7 +526,10 @@ class MixtureTest(test.TestCase):
         which_c = np.where(cat_sample_values == c)[0]
         size_c = which_c.size
         # Scalar Batch univariate case: batch_size == 1, rank 1
-        which_dist_samples = dist_sample_values[c][:size_c]
+        if self.use_static_graph:
+          which_dist_samples = dist_sample_values[c][which_c]
+        else:
+          which_dist_samples = dist_sample_values[c][:size_c]
         self.assertAllClose(which_dist_samples, sample_values[which_c])
 
   # Test that sampling with the same seed twice gives the same results.
@@ -522,7 +549,8 @@ class MixtureTest(test.TestCase):
       ]
       cat = ds.Categorical(
           logits, dtype=dtypes.int32, name="cat1")
-      dist1 = ds.Mixture(cat, components, name="mixture1")
+      dist1 = ds.Mixture(cat, components, name="mixture1",
+                         use_static_graph=self.use_static_graph)
       samples1 = dist1.sample(n, seed=123456).eval()
 
       random_seed.set_random_seed(654321)
@@ -532,7 +560,8 @@ class MixtureTest(test.TestCase):
       ]
       cat2 = ds.Categorical(
           logits, dtype=dtypes.int32, name="cat2")
-      dist2 = ds.Mixture(cat2, components2, name="mixture2")
+      dist2 = ds.Mixture(cat2, components2, name="mixture2",
+                         use_static_graph=self.use_static_graph)
       samples2 = dist2.sample(n, seed=123456).eval()
 
       self.assertAllClose(samples1, samples2)
@@ -541,7 +570,8 @@ class MixtureTest(test.TestCase):
     with self.test_session() as sess:
       num_components = 3
       dist = make_multivariate_mixture(
-          batch_shape=[], num_components=num_components, event_shape=[2])
+          batch_shape=[], num_components=num_components, event_shape=[2],
+          use_static_graph=self.use_static_graph)
       n = 4
       with _test_capture_mvndiag_sample_outputs() as component_samples:
         samples = dist.sample(n, seed=123)
@@ -555,14 +585,18 @@ class MixtureTest(test.TestCase):
         which_c = np.where(cat_sample_values == c)[0]
         size_c = which_c.size
         # Scalar Batch multivariate case: batch_size == 1, rank 2
-        which_dist_samples = dist_sample_values[c][:size_c, :]
+        if self.use_static_graph:
+          which_dist_samples = dist_sample_values[c][which_c, :]
+        else:
+          which_dist_samples = dist_sample_values[c][:size_c, :]
         self.assertAllClose(which_dist_samples, sample_values[which_c, :])
 
   def testSampleBatchUnivariate(self):
     with self.test_session() as sess:
       num_components = 3
       dist = make_univariate_mixture(
-          batch_shape=[2, 3], num_components=num_components)
+          batch_shape=[2, 3], num_components=num_components,
+          use_static_graph=self.use_static_graph)
       n = 4
       with _test_capture_normal_sample_outputs() as component_samples:
         samples = dist.sample(n, seed=123)
@@ -576,8 +610,12 @@ class MixtureTest(test.TestCase):
         which_c_s, which_c_b0, which_c_b1 = np.where(cat_sample_values == c)
         size_c = which_c_s.size
         # Batch univariate case: batch_size == [2, 3], rank 3
-        which_dist_samples = dist_sample_values[c][range(size_c), which_c_b0,
-                                                   which_c_b1]
+        if self.use_static_graph:
+          which_dist_samples = dist_sample_values[c][which_c_s, which_c_b0,
+                                                     which_c_b1]
+        else:
+          which_dist_samples = dist_sample_values[c][range(size_c), which_c_b0,
+                                                     which_c_b1]
         self.assertAllClose(which_dist_samples,
                             sample_values[which_c_s, which_c_b0, which_c_b1])
 
@@ -594,7 +632,8 @@ class MixtureTest(test.TestCase):
       dist = make_multivariate_mixture(
           batch_shape=batch_shape,
           num_components=num_components, event_shape=[4],
-          batch_shape_tensor=batch_shape_tensor)
+          batch_shape_tensor=batch_shape_tensor,
+          use_static_graph=self.use_static_graph)
       n = 5
       with _test_capture_mvndiag_sample_outputs() as component_samples:
         samples = dist.sample(n, seed=123)
@@ -617,8 +656,12 @@ class MixtureTest(test.TestCase):
         which_c_s, which_c_b0, which_c_b1 = np.where(cat_sample_values == c)
         size_c = which_c_s.size
         # Batch univariate case: batch_size == [2, 3], rank 4 (multivariate)
-        which_dist_samples = dist_sample_values[c][range(size_c), which_c_b0,
-                                                   which_c_b1, :]
+        if self.use_static_graph:
+          which_dist_samples = dist_sample_values[c][which_c_s, which_c_b0,
+                                                     which_c_b1, :]
+        else:
+          which_dist_samples = dist_sample_values[c][range(size_c), which_c_b0,
+                                                     which_c_b1, :]
         self.assertAllClose(which_dist_samples,
                             sample_values[which_c_s, which_c_b0, which_c_b1, :])
 
@@ -632,7 +675,8 @@ class MixtureTest(test.TestCase):
     with self.test_session() as sess:
       for batch_shape in ((), (2,), (2, 3)):
         dist = make_multivariate_mixture(
-            batch_shape=batch_shape, num_components=2, event_shape=(4,))
+            batch_shape=batch_shape, num_components=2, event_shape=(4,),
+            use_static_graph=self.use_static_graph)
         entropy_lower_bound = dist.entropy_lower_bound()
         self.assertEqual(batch_shape, entropy_lower_bound.get_shape())
 
@@ -673,7 +717,8 @@ class MixtureTest(test.TestCase):
     cat_tf = ds.Categorical(probs=mixture_weights)
     components_tf = [ds.Normal(loc=mu, scale=sigma)
                      for (mu, sigma) in zip(means, sigmas)]
-    mixture_tf = ds.Mixture(cat=cat_tf, components=components_tf)
+    mixture_tf = ds.Mixture(cat=cat_tf, components=components_tf,
+                            use_static_graph=self.use_static_graph)
 
     x_tensor = array_ops.placeholder(shape=(), dtype=dtypes.float32)
 
@@ -721,7 +766,8 @@ class MixtureTest(test.TestCase):
     cat_tf = ds.Categorical(probs=mixture_weights)
     components_tf = [ds.Normal(loc=mu, scale=sigma)
                      for (mu, sigma) in zip(means, sigmas)]
-    mixture_tf = ds.Mixture(cat=cat_tf, components=components_tf)
+    mixture_tf = ds.Mixture(cat=cat_tf, components=components_tf,
+                            use_static_graph=self.use_static_graph)
 
     x_tensor = array_ops.placeholder(shape=psize, dtype=dtypes.float32)
     xs_to_check = [
@@ -760,12 +806,18 @@ class MixtureTest(test.TestCase):
       gm = ds.Mixture(
           cat=ds.Categorical(probs=[.3, .7]),
           components=[ds.Gamma(1., 2.),
-                      ds.Gamma(2., 1.)])
+                      ds.Gamma(2., 1.)],
+          use_static_graph=self.use_static_graph)
       x_ = gm.sample().eval()
       self.assertAllEqual([], x_.shape)
 
 
+class MixtureStaticSampleTest(MixtureTest):
+  use_static_graph = True
+
+
 class MixtureBenchmark(test.Benchmark):
+  use_static_graph = False
 
   def _runSamplingBenchmark(self, name, create_distribution, use_gpu,
                             num_components, batch_size, num_features,
@@ -811,7 +863,7 @@ class MixtureBenchmark(test.Benchmark):
       components = list(
           ds.MultivariateNormalDiag(
               loc=mu, scale_diag=sigma) for (mu, sigma) in zip(mus, sigmas))
-      return ds.Mixture(cat, components)
+      return ds.Mixture(cat, components, use_static_graph=self.use_static_graph)
 
     for use_gpu in False, True:
       if use_gpu and not test.is_gpu_available():
@@ -853,7 +905,7 @@ class MixtureBenchmark(test.Benchmark):
           ds.MultivariateNormalTriL(
               loc=mu, scale_tril=linalg_ops.cholesky(sigma))
           for (mu, sigma) in zip(mus, sigmas))
-      return ds.Mixture(cat, components)
+      return ds.Mixture(cat, components, use_static_graph=self.use_static_graph)
 
     for use_gpu in False, True:
       if use_gpu and not test.is_gpu_available():
@@ -872,5 +924,9 @@ class MixtureBenchmark(test.Benchmark):
                   sample_size=sample_size)
 
 
+class MixtureStaticSampleBenchmark(MixtureBenchmark):
+  use_static_graph = True
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/poisson_lognormal_test.py b/tensorflow/contrib/distributions/python/kernel_tests/poisson_lognormal_test.py
index 3c0147b8cf6e1b6a2791e85c0c0997992445fa7e..1035cb00f76d95c7c52c3e812e8bb2868d34b890 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/poisson_lognormal_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/poisson_lognormal_test.py
@@ -18,37 +18,40 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
 from tensorflow.contrib.distributions.python.ops import poisson_lognormal
 from tensorflow.contrib.distributions.python.ops import test_util
-from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-class PoissonLogNormalQuadratureCompoundTest(
-    test_util.DiscreteScalarDistributionTestHelpers, test.TestCase):
+class _PoissonLogNormalQuadratureCompoundTest(
+    test_util.DiscreteScalarDistributionTestHelpers):
   """Tests the PoissonLogNormalQuadratureCompoundTest distribution."""
 
   def testSampleProbConsistent(self):
     with self.test_session() as sess:
       pln = poisson_lognormal.PoissonLogNormalQuadratureCompound(
-          loc=-2.,
-          scale=1.1,
-          quadrature_grid_and_probs=(
-              np.polynomial.hermite.hermgauss(deg=10)),
+          loc=array_ops.placeholder_with_default(
+              -2.,
+              shape=[] if self.static_shape else None),
+          scale=array_ops.placeholder_with_default(
+              1.1,
+              shape=[] if self.static_shape else None),
+          quadrature_size=10,
           validate_args=True)
       self.run_test_sample_consistent_log_prob(
-          sess.run, pln, rtol=0.1)
+          sess.run, pln, batch_size=1, rtol=0.1)
 
   def testMeanVariance(self):
     with self.test_session() as sess:
       pln = poisson_lognormal.PoissonLogNormalQuadratureCompound(
-          loc=0.,
-          scale=1.,
-          quadrature_grid_and_probs=(
-              np.polynomial.hermite.hermgauss(deg=10)),
+          loc=array_ops.placeholder_with_default(
+              0.,
+              shape=[] if self.static_shape else None),
+          scale=array_ops.placeholder_with_default(
+              1.,
+              shape=[] if self.static_shape else None),
+          quadrature_size=10,
           validate_args=True)
       self.run_test_sample_consistent_mean_variance(
           sess.run, pln, rtol=0.02)
@@ -56,21 +59,27 @@ class PoissonLogNormalQuadratureCompoundTest(
   def testSampleProbConsistentBroadcastScalar(self):
     with self.test_session() as sess:
       pln = poisson_lognormal.PoissonLogNormalQuadratureCompound(
-          loc=[0., -0.5],
-          scale=1.,
-          quadrature_grid_and_probs=(
-              np.polynomial.hermite.hermgauss(deg=10)),
+          loc=array_ops.placeholder_with_default(
+              [0., -0.5],
+              shape=[2] if self.static_shape else None),
+          scale=array_ops.placeholder_with_default(
+              1.,
+              shape=[] if self.static_shape else None),
+          quadrature_size=10,
           validate_args=True)
       self.run_test_sample_consistent_log_prob(
-          sess.run, pln, rtol=0.1, atol=0.01)
+          sess.run, pln, batch_size=2, rtol=0.1, atol=0.01)
 
   def testMeanVarianceBroadcastScalar(self):
     with self.test_session() as sess:
       pln = poisson_lognormal.PoissonLogNormalQuadratureCompound(
-          loc=[0., -0.5],
-          scale=1.,
-          quadrature_grid_and_probs=(
-              np.polynomial.hermite.hermgauss(deg=10)),
+          loc=array_ops.placeholder_with_default(
+              [0., -0.5],
+              shape=[2] if self.static_shape else None),
+          scale=array_ops.placeholder_with_default(
+              1.,
+              shape=[] if self.static_shape else None),
+          quadrature_size=10,
           validate_args=True)
       self.run_test_sample_consistent_mean_variance(
           sess.run, pln, rtol=0.1, atol=0.01)
@@ -78,38 +87,46 @@ class PoissonLogNormalQuadratureCompoundTest(
   def testSampleProbConsistentBroadcastBoth(self):
     with self.test_session() as sess:
       pln = poisson_lognormal.PoissonLogNormalQuadratureCompound(
-          loc=[[0.], [-0.5]],
-          scale=[[1., 0.9]],
-          quadrature_grid_and_probs=(
-              np.polynomial.hermite.hermgauss(deg=10)),
+          loc=array_ops.placeholder_with_default(
+              [[0.], [-0.5]],
+              shape=[2, 1] if self.static_shape else None),
+          scale=array_ops.placeholder_with_default(
+              [[1., 0.9]],
+              shape=[1, 2] if self.static_shape else None),
+          quadrature_size=10,
           validate_args=True)
       self.run_test_sample_consistent_log_prob(
-          sess.run, pln, rtol=0.1, atol=0.08)
+          sess.run, pln, batch_size=4, rtol=0.1, atol=0.08)
 
   def testMeanVarianceBroadcastBoth(self):
     with self.test_session() as sess:
       pln = poisson_lognormal.PoissonLogNormalQuadratureCompound(
-          loc=[[0.], [-0.5]],
-          scale=[[1., 0.9]],
-          quadrature_grid_and_probs=(
-              np.polynomial.hermite.hermgauss(deg=10)),
+          loc=array_ops.placeholder_with_default(
+              [[0.], [-0.5]],
+              shape=[2, 1] if self.static_shape else None),
+          scale=array_ops.placeholder_with_default(
+              [[1., 0.9]],
+              shape=[1, 2] if self.static_shape else None),
+          quadrature_size=10,
           validate_args=True)
       self.run_test_sample_consistent_mean_variance(
           sess.run, pln, rtol=0.1, atol=0.01)
 
-  def testSampleProbConsistentDynamicQuadrature(self):
-    with self.test_session() as sess:
-      qgrid = array_ops.placeholder(dtype=dtypes.float32)
-      qprobs = array_ops.placeholder(dtype=dtypes.float32)
-      g, p = np.polynomial.hermite.hermgauss(deg=10)
-      pln = poisson_lognormal.PoissonLogNormalQuadratureCompound(
-          loc=-2.,
-          scale=1.1,
-          quadrature_grid_and_probs=(g, p),
-          validate_args=True)
-      self.run_test_sample_consistent_log_prob(
-          lambda x: sess.run(x, feed_dict={qgrid: g, qprobs: p}),
-          pln, rtol=0.1)
+
+class PoissonLogNormalQuadratureCompoundStaticShapeTest(
+    _PoissonLogNormalQuadratureCompoundTest, test.TestCase):
+
+  @property
+  def static_shape(self):
+    return True
+
+
+class PoissonLogNormalQuadratureCompoundDynamicShapeTest(
+    _PoissonLogNormalQuadratureCompoundTest, test.TestCase):
+
+  @property
+  def static_shape(self):
+    return False
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py b/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py
index 595d9f5df755d7defa63d385039bafe4f87aa6ec..4186cf129dbf31724c84133734da3f226817c71a 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py
@@ -23,11 +23,244 @@ import numpy as np
 from tensorflow.contrib.distributions.python.ops import sample_stats
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import spectral_ops_test_util
 from tensorflow.python.platform import test
 
 rng = np.random.RandomState(0)
 
 
+class _AutoCorrelationTest(object):
+
+  @property
+  def use_static_shape(self):
+    raise NotImplementedError("Subclass failed to implement `use_static_shape`")
+
+  @property
+  def dtype(self):
+    raise NotImplementedError("Subclass failed to implement `dtype`.")
+
+  def test_constant_sequence_axis_0_max_lags_none_center_false(self):
+    x_ = np.array([[0., 0., 0.],
+                   [1., 1., 1.]]).astype(self.dtype)
+    x_ph = array_ops.placeholder_with_default(
+        input=x_,
+        shape=x_.shape if self.use_static_shape else None)
+    with spectral_ops_test_util.fft_kernel_label_map():
+      with self.test_session() as sess:
+        # Setting normalize = True means we divide by zero.
+        auto_corr = sample_stats.auto_correlation(
+            x_ph, axis=1, center=False, normalize=False)
+        if self.use_static_shape:
+          self.assertEqual((2, 3), auto_corr.shape)
+        auto_corr_ = sess.run(auto_corr)
+        self.assertAllClose(
+            [[0., 0., 0.],
+             [1., 1., 1.]], auto_corr_)
+
+  def test_constant_sequence_axis_0_max_lags_none_center_true(self):
+    x_ = np.array([[0., 0., 0.],
+                   [1., 1., 1.]]).astype(self.dtype)
+    x_ph = array_ops.placeholder_with_default(
+        input=x_,
+        shape=x_.shape if self.use_static_shape else None)
+    with spectral_ops_test_util.fft_kernel_label_map():
+      with self.test_session() as sess:
+        # Setting normalize = True means we divide by zero.
+        auto_corr = sample_stats.auto_correlation(
+            x_ph, axis=1, normalize=False, center=True)
+        if self.use_static_shape:
+          self.assertEqual((2, 3), auto_corr.shape)
+        auto_corr_ = sess.run(auto_corr)
+        self.assertAllClose(
+            [[0., 0., 0.],
+             [0., 0., 0.]], auto_corr_)
+
+  def check_results_versus_brute_force(
+      self, x, axis, max_lags, center, normalize):
+    """Compute auto-correlation by brute force, then compare to tf result."""
+    # Brute for auto-corr -- avoiding fft and transpositions.
+    axis_len = x.shape[axis]
+    if max_lags is None:
+      max_lags = axis_len - 1
+    else:
+      max_lags = min(axis_len - 1, max_lags)
+    auto_corr_at_lag = []
+    if center:
+      x -= x.mean(axis=axis, keepdims=True)
+    for m in range(max_lags + 1):
+      auto_corr_at_lag.append((
+          np.take(x, indices=range(0, axis_len - m), axis=axis) *
+          np.conj(np.take(x, indices=range(m, axis_len), axis=axis))
+      ).mean(axis=axis, keepdims=True))
+    rxx = np.concatenate(auto_corr_at_lag, axis=axis)
+    if normalize:
+      rxx /= np.take(rxx, [0], axis=axis)
+
+    x_ph = array_ops.placeholder_with_default(
+        x, shape=x.shape if self.use_static_shape else None)
+    with spectral_ops_test_util.fft_kernel_label_map():
+      with self.test_session():
+        auto_corr = sample_stats.auto_correlation(
+            x_ph, axis=axis, max_lags=max_lags, center=center,
+            normalize=normalize)
+        if self.use_static_shape:
+          output_shape = list(x.shape)
+          output_shape[axis] = max_lags + 1
+          self.assertAllEqual(output_shape, auto_corr.shape)
+        self.assertAllClose(rxx, auto_corr.eval(), rtol=1e-5, atol=1e-5)
+
+  def test_axis_n1_center_false_max_lags_none(self):
+    x = rng.randn(2, 3, 4).astype(self.dtype)
+    if self.dtype in [np.complex64]:
+      x = 1j * rng.randn(2, 3, 4).astype(self.dtype)
+    self.check_results_versus_brute_force(
+        x, axis=-1, max_lags=None, center=False, normalize=False)
+
+  def test_axis_n2_center_false_max_lags_none(self):
+    x = rng.randn(3, 4, 5).astype(self.dtype)
+    if self.dtype in [np.complex64]:
+      x = 1j * rng.randn(3, 4, 5).astype(self.dtype)
+    self.check_results_versus_brute_force(
+        x, axis=-2, max_lags=None, center=False, normalize=False)
+
+  def test_axis_n1_center_false_max_lags_none_normalize_true(self):
+    x = rng.randn(2, 3, 4).astype(self.dtype)
+    if self.dtype in [np.complex64]:
+      x = 1j * rng.randn(2, 3, 4).astype(self.dtype)
+    self.check_results_versus_brute_force(
+        x, axis=-1, max_lags=None, center=False, normalize=True)
+
+  def test_axis_n2_center_false_max_lags_none_normalize_true(self):
+    x = rng.randn(3, 4, 5).astype(self.dtype)
+    if self.dtype in [np.complex64]:
+      x = 1j * rng.randn(3, 4, 5).astype(self.dtype)
+    self.check_results_versus_brute_force(
+        x, axis=-2, max_lags=None, center=False, normalize=True)
+
+  def test_axis_0_center_true_max_lags_none(self):
+    x = rng.randn(3, 4, 5).astype(self.dtype)
+    if self.dtype in [np.complex64]:
+      x = 1j * rng.randn(3, 4, 5).astype(self.dtype)
+    self.check_results_versus_brute_force(
+        x, axis=0, max_lags=None, center=True, normalize=False)
+
+  def test_axis_2_center_true_max_lags_1(self):
+    x = rng.randn(3, 4, 5).astype(self.dtype)
+    if self.dtype in [np.complex64]:
+      x = 1j * rng.randn(3, 4, 5).astype(self.dtype)
+    self.check_results_versus_brute_force(
+        x, axis=2, max_lags=1, center=True, normalize=False)
+
+  def test_axis_2_center_true_max_lags_100(self):
+    # There are less than 100 elements in axis 2, so expect we get back an array
+    # the same size as x, despite having asked for 100 lags.
+    x = rng.randn(3, 4, 5).astype(self.dtype)
+    if self.dtype in [np.complex64]:
+      x = 1j * rng.randn(3, 4, 5).astype(self.dtype)
+    self.check_results_versus_brute_force(
+        x, axis=2, max_lags=100, center=True, normalize=False)
+
+  def test_long_orthonormal_sequence_has_corr_length_0(self):
+    l = 10000
+    x = rng.randn(l).astype(self.dtype)
+    x_ph = array_ops.placeholder_with_default(
+        x, shape=(l,) if self.use_static_shape else None)
+    with spectral_ops_test_util.fft_kernel_label_map():
+      with self.test_session():
+        rxx = sample_stats.auto_correlation(
+            x_ph, max_lags=l // 2, center=True, normalize=False)
+        if self.use_static_shape:
+          self.assertAllEqual((l // 2 + 1,), rxx.shape)
+        rxx_ = rxx.eval()
+        # OSS CPU FFT has some accuracy issues is not the most accurate.
+        # So this tolerance is a bit bad.
+        self.assertAllClose(1., rxx_[0], rtol=0.05)
+        # The maximal error in the rest of the sequence is not great.
+        self.assertAllClose(np.zeros(l // 2), rxx_[1:], atol=0.1)
+        # The mean error in the rest is ok, actually 0.008 when I tested it.
+        self.assertLess(np.abs(rxx_[1:]).mean(), 0.02)
+
+  def test_step_function_sequence(self):
+    # x jumps to new random value every 10 steps.  So correlation length = 10.
+    x = (rng.randint(-10, 10, size=(1000, 1))
+         * np.ones((1, 10))).ravel().astype(self.dtype)
+    x_ph = array_ops.placeholder_with_default(
+        x, shape=(1000 * 10,) if self.use_static_shape else None)
+    with spectral_ops_test_util.fft_kernel_label_map():
+      with self.test_session():
+        rxx = sample_stats.auto_correlation(
+            x_ph, max_lags=1000 * 10 // 2, center=True, normalize=False)
+        if self.use_static_shape:
+          self.assertAllEqual((1000 * 10 // 2 + 1,), rxx.shape)
+        rxx_ = rxx.eval()
+        rxx_ /= rxx_[0]
+        # Expect positive correlation for the first 10 lags, then significantly
+        # smaller negative.
+        self.assertGreater(rxx_[:10].min(), 0)
+        self.assertGreater(rxx_[9], 5 * rxx_[10:20].mean())
+        # RXX should be decreasing for the first 10 lags.
+        diff = np.diff(rxx_)
+        self.assertLess(diff[:10].max(), 0)
+
+  def test_normalization(self):
+    l = 10000
+    x = 3 * rng.randn(l).astype(self.dtype)
+    x_ph = array_ops.placeholder_with_default(
+        x, shape=(l,) if self.use_static_shape else None)
+    with spectral_ops_test_util.fft_kernel_label_map():
+      with self.test_session():
+        rxx = sample_stats.auto_correlation(
+            x_ph, max_lags=l // 2, center=True, normalize=True)
+        if self.use_static_shape:
+          self.assertAllEqual((l // 2 + 1,), rxx.shape)
+        rxx_ = rxx.eval()
+        # Note that RXX[0] = 1, despite the fact that E[X^2] = 9, and this is
+        # due to normalize=True.
+        # OSS CPU FFT has some accuracy issues is not the most accurate.
+        # So this tolerance is a bit bad.
+        self.assertAllClose(1., rxx_[0], rtol=0.05)
+        # The maximal error in the rest of the sequence is not great.
+        self.assertAllClose(np.zeros(l // 2), rxx_[1:], atol=0.1)
+        # The mean error in the rest is ok, actually 0.008 when I tested it.
+        self.assertLess(np.abs(rxx_[1:]).mean(), 0.02)
+
+
+class AutoCorrelationTestStaticShapeFloat32(test.TestCase,
+                                            _AutoCorrelationTest):
+
+  @property
+  def dtype(self):
+    return np.float32
+
+  @property
+  def use_static_shape(self):
+    return True
+
+
+class AutoCorrelationTestStaticShapeComplex64(test.TestCase,
+                                              _AutoCorrelationTest):
+
+  @property
+  def dtype(self):
+    return np.complex64
+
+  @property
+  def use_static_shape(self):
+    return True
+
+
+class AutoCorrelationTestDynamicShapeFloat32(test.TestCase,
+                                             _AutoCorrelationTest):
+
+  @property
+  def dtype(self):
+    return np.float32
+
+  @property
+  def use_static_shape(self):
+    return False
+
+
 class PercentileTestWithLowerInterpolation(test.TestCase):
 
   _interpolation = "lower"
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
index 103d8e186221e879d1734a097114708429f725bd..cbaf74d3f66253ae5727e1ba579e2d49235b748e 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
@@ -200,6 +200,27 @@ class TransformedDistributionTest(test.TestCase):
       self.assertAllEqual([2], multi_logit_normal.event_shape)
       self.assertAllEqual([2], multi_logit_normal.event_shape_tensor().eval())
 
+  def testCastLogDetJacobian(self):
+    """Test log_prob when Jacobian and log_prob dtypes do not match."""
+
+    with self.test_session():
+      # Create an identity bijector whose jacobians have dtype int32
+      int_identity = bs.Inline(
+          forward_fn=array_ops.identity,
+          inverse_fn=array_ops.identity,
+          inverse_log_det_jacobian_fn=lambda x: math_ops.cast(0, dtypes.int32),
+          forward_log_det_jacobian_fn=lambda x: math_ops.cast(0, dtypes.int32),
+          is_constant_jacobian=True)
+      normal = self._cls()(
+          distribution=ds.Normal(loc=0., scale=1.),
+          bijector=int_identity,
+          validate_args=True)
+
+      y = normal.sample()
+      normal.log_prob(y).eval()
+      normal.prob(y).eval()
+      normal.entropy().eval()
+
   def testEntropy(self):
     with self.test_session():
       shift = np.array([[-1, 0, 1], [-1, -2, -3]], dtype=np.float32)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/vector_diffeomixture_test.py b/tensorflow/contrib/distributions/python/kernel_tests/vector_diffeomixture_test.py
index de4a221f7badca8267a81d612a57137c676ff052..04f047aa0c81b3f59b97f14554fb59cb1b3dd8af 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/vector_diffeomixture_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/vector_diffeomixture_test.py
@@ -21,14 +21,14 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.distributions.python.ops import test_util
-from tensorflow.contrib.distributions.python.ops import vector_diffeomixture as vector_diffeomixture_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import array_ops
+from tensorflow.contrib.distributions.python.ops import vector_diffeomixture as vdm_lib
 from tensorflow.python.ops.distributions import normal as normal_lib
 from tensorflow.python.ops.linalg import linear_operator_diag as linop_diag_lib
 from tensorflow.python.ops.linalg import linear_operator_identity as linop_identity_lib
 from tensorflow.python.platform import test
 
+rng = np.random.RandomState(0)
+
 
 class VectorDiffeomixtureTest(
     test_util.VectorDistributionTestHelpers, test.TestCase):
@@ -37,9 +37,9 @@ class VectorDiffeomixtureTest(
   def testSampleProbConsistentBroadcastMixNoBatch(self):
     with self.test_session() as sess:
       dims = 4
-      vdm = vector_diffeomixture_lib.VectorDiffeomixture(
+      vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [1.]],
-          mix_scale=[1.],
+          temperature=[1.],
           distribution=normal_lib.Normal(0., 1.),
           loc=[
               None,
@@ -54,20 +54,21 @@ class VectorDiffeomixtureTest(
                   diag=np.linspace(2.5, 3.5, dims, dtype=np.float32),
                   is_positive_definite=True),
           ],
+          quadrature_size=8,
           validate_args=True)
       # Ball centered at component0's mean.
       self.run_test_sample_consistent_log_prob(
-          sess.run, vdm, radius=2., center=0., rtol=0.005)
+          sess.run, vdm, radius=2., center=0., rtol=0.015)
       # Larger ball centered at component1's mean.
       self.run_test_sample_consistent_log_prob(
-          sess.run, vdm, radius=4., center=2., rtol=0.005)
+          sess.run, vdm, radius=4., center=2., rtol=0.015)
 
   def testSampleProbConsistentBroadcastMixNonStandardBase(self):
     with self.test_session() as sess:
       dims = 4
-      vdm = vector_diffeomixture_lib.VectorDiffeomixture(
+      vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [1.]],
-          mix_scale=[1.],
+          temperature=[1.],
           distribution=normal_lib.Normal(1., 1.5),
           loc=[
               None,
@@ -82,20 +83,21 @@ class VectorDiffeomixtureTest(
                   diag=np.linspace(2.5, 3.5, dims, dtype=np.float32),
                   is_positive_definite=True),
           ],
+          quadrature_size=8,
           validate_args=True)
       # Ball centered at component0's mean.
       self.run_test_sample_consistent_log_prob(
-          sess.run, vdm, radius=2., center=1., rtol=0.006)
+          sess.run, vdm, radius=2., center=1., rtol=0.015)
       # Larger ball centered at component1's mean.
       self.run_test_sample_consistent_log_prob(
-          sess.run, vdm, radius=4., center=3., rtol=0.009)
+          sess.run, vdm, radius=4., center=3., rtol=0.01)
 
   def testSampleProbConsistentBroadcastMixBatch(self):
     with self.test_session() as sess:
       dims = 4
-      vdm = vector_diffeomixture_lib.VectorDiffeomixture(
+      vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [1.]],
-          mix_scale=[1.],
+          temperature=[1.],
           distribution=normal_lib.Normal(0., 1.),
           loc=[
               None,
@@ -113,20 +115,48 @@ class VectorDiffeomixtureTest(
                   ]),
                   is_positive_definite=True),
           ],
+          quadrature_size=8,
           validate_args=True)
       # Ball centered at component0's mean.
       self.run_test_sample_consistent_log_prob(
-          sess.run, vdm, radius=2., center=0., rtol=0.005)
+          sess.run, vdm, radius=2., center=0., rtol=0.01)
       # Larger ball centered at component1's mean.
       self.run_test_sample_consistent_log_prob(
-          sess.run, vdm, radius=4., center=2., rtol=0.005)
+          sess.run, vdm, radius=4., center=2., rtol=0.01)
+
+  def testSampleProbConsistentBroadcastMixTwoBatchDims(self):
+    dims = 4
+    loc_1 = rng.randn(2, 3, dims).astype(np.float32)
+
+    with self.test_session() as sess:
+      vdm = vdm_lib.VectorDiffeomixture(
+          mix_loc=(rng.rand(2, 3, 1) - 0.5).astype(np.float32),
+          temperature=[1.],
+          distribution=normal_lib.Normal(0., 1.),
+          loc=[
+              None,
+              loc_1,
+          ],
+          scale=[
+              linop_identity_lib.LinearOperatorScaledIdentity(
+                  num_rows=dims,
+                  multiplier=[np.float32(1.1)],
+                  is_positive_definite=True),
+          ] * 2,
+          validate_args=True)
+      # Ball centered at component0's mean.
+      self.run_test_sample_consistent_log_prob(
+          sess.run, vdm, radius=2., center=0., rtol=0.01)
+      # Larger ball centered at component1's mean.
+      self.run_test_sample_consistent_log_prob(
+          sess.run, vdm, radius=3., center=loc_1, rtol=0.02)
 
   def testMeanCovarianceNoBatch(self):
     with self.test_session() as sess:
       dims = 3
-      vdm = vector_diffeomixture_lib.VectorDiffeomixture(
+      vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [4.]],
-          mix_scale=[10.],
+          temperature=[1 / 10.],
           distribution=normal_lib.Normal(0., 1.),
           loc=[
               np.float32([-2.]),
@@ -141,16 +171,99 @@ class VectorDiffeomixtureTest(
                   diag=np.linspace(2.5, 3.5, dims, dtype=np.float32),
                   is_positive_definite=True),
           ],
+          quadrature_size=8,
           validate_args=True)
       self.run_test_sample_consistent_mean_covariance(
-          sess.run, vdm, rtol=0.02, cov_rtol=0.06)
+          sess.run, vdm, rtol=0.02, cov_rtol=0.08)
+
+  def testTemperatureControlsHowMuchThisLooksLikeDiscreteMixture(self):
+    # As temperature decreases, this should approach a mixture of normals, with
+    # components at -2, 2.
+    with self.test_session() as sess:
+      dims = 1
+      vdm = vdm_lib.VectorDiffeomixture(
+          mix_loc=[0.],
+          temperature=[[2.], [1.], [0.2]],
+          distribution=normal_lib.Normal(0., 1.),
+          loc=[
+              np.float32([-2.]),
+              np.float32([2.]),
+          ],
+          scale=[
+              linop_identity_lib.LinearOperatorScaledIdentity(
+                  num_rows=dims,
+                  multiplier=np.float32(0.5),
+                  is_positive_definite=True),
+          ] * 2,  # Use the same scale for each component.
+          quadrature_size=8,
+          validate_args=True)
+
+      samps = vdm.sample(10000)
+      self.assertAllEqual((10000, 3, 1), samps.shape)
+      samps_ = sess.run(samps).reshape(10000, 3)  # Make scalar event shape.
+
+      # One characteristic of a discrete mixture (as opposed to a "smear") is
+      # that more weight is put near the component centers at -2, 2, and thus
+      # less weight is put near the origin.
+      prob_of_being_near_origin = (np.abs(samps_) < 1).mean(axis=0)
+      self.assertGreater(
+          prob_of_being_near_origin[0], prob_of_being_near_origin[1])
+      self.assertGreater(
+          prob_of_being_near_origin[1], prob_of_being_near_origin[2])
+
+      # Run this test as well, just because we can.
+      self.run_test_sample_consistent_mean_covariance(
+          sess.run, vdm, rtol=0.02, cov_rtol=0.08)
+
+  def testConcentrationLocControlsHowMuchWeightIsOnEachComponent(self):
+    with self.test_session() as sess:
+      dims = 1
+      vdm = vdm_lib.VectorDiffeomixture(
+          mix_loc=[[-1.], [0.], [1.]],
+          temperature=[0.5],
+          distribution=normal_lib.Normal(0., 1.),
+          loc=[
+              np.float32([-2.]),
+              np.float32([2.]),
+          ],
+          scale=[
+              linop_identity_lib.LinearOperatorScaledIdentity(
+                  num_rows=dims,
+                  multiplier=np.float32(0.5),
+                  is_positive_definite=True),
+          ] * 2,  # Use the same scale for each component.
+          quadrature_size=8,
+          validate_args=True)
+
+      samps = vdm.sample(10000)
+      self.assertAllEqual((10000, 3, 1), samps.shape)
+      samps_ = sess.run(samps).reshape(10000, 3)  # Make scalar event shape.
+
+      # One characteristic of putting more weight on a component is that the
+      # mean is closer to that component's mean.
+      # Get the mean for each batch member, the names signify the value of
+      # concentration for that batch member.
+      mean_neg1, mean_0, mean_1 = samps_.mean(axis=0)
+
+      # Since concentration is the concentration for component 0,
+      # concentration = -1 ==> more weight on component 1, which has mean = 2
+      # concentration = 0 ==> equal weight
+      # concentration = 1 ==> more weight on component 0, which has mean = -2
+      self.assertLess(-2, mean_1)
+      self.assertLess(mean_1, mean_0)
+      self.assertLess(mean_0, mean_neg1)
+      self.assertLess(mean_neg1, 2)
+
+      # Run this test as well, just because we can.
+      self.run_test_sample_consistent_mean_covariance(
+          sess.run, vdm, rtol=0.02, cov_rtol=0.08)
 
   def testMeanCovarianceNoBatchUncenteredNonStandardBase(self):
     with self.test_session() as sess:
       dims = 3
-      vdm = vector_diffeomixture_lib.VectorDiffeomixture(
+      vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [4.]],
-          mix_scale=[10.],
+          temperature=[0.1],
           distribution=normal_lib.Normal(-1., 1.5),
           loc=[
               np.float32([-2.]),
@@ -165,6 +278,7 @@ class VectorDiffeomixtureTest(
                   diag=np.linspace(2.5, 3.5, dims, dtype=np.float32),
                   is_positive_definite=True),
           ],
+          quadrature_size=8,
           validate_args=True)
       self.run_test_sample_consistent_mean_covariance(
           sess.run, vdm, num_samples=int(1e6), rtol=0.01, cov_atol=0.025)
@@ -172,9 +286,9 @@ class VectorDiffeomixtureTest(
   def testMeanCovarianceBatch(self):
     with self.test_session() as sess:
       dims = 3
-      vdm = vector_diffeomixture_lib.VectorDiffeomixture(
+      vdm = vdm_lib.VectorDiffeomixture(
           mix_loc=[[0.], [4.]],
-          mix_scale=[10.],
+          temperature=[0.1],
           distribution=normal_lib.Normal(0., 1.),
           loc=[
               np.float32([[-2.]]),
@@ -192,19 +306,17 @@ class VectorDiffeomixtureTest(
                   ]),
                   is_positive_definite=True),
           ],
+          quadrature_size=8,
           validate_args=True)
       self.run_test_sample_consistent_mean_covariance(
-          sess.run, vdm, rtol=0.02, cov_rtol=0.06)
+          sess.run, vdm, rtol=0.02, cov_rtol=0.07)
 
-  def testSampleProbConsistentDynamicQuadrature(self):
+  def testSampleProbConsistentQuadrature(self):
     with self.test_session() as sess:
-      qgrid = array_ops.placeholder(dtype=dtypes.float32)
-      qprobs = array_ops.placeholder(dtype=dtypes.float32)
-      g, p = np.polynomial.hermite.hermgauss(deg=8)
       dims = 4
-      vdm = vector_diffeomixture_lib.VectorDiffeomixture(
-          mix_loc=[[0.], [1.]],
-          mix_scale=[1.],
+      vdm = vdm_lib.VectorDiffeomixture(
+          mix_loc=[0.],
+          temperature=[0.1],
           distribution=normal_lib.Normal(0., 1.),
           loc=[
               None,
@@ -219,38 +331,14 @@ class VectorDiffeomixtureTest(
                   diag=np.linspace(2.5, 3.5, dims, dtype=np.float32),
                   is_positive_definite=True),
           ],
-          quadrature_grid_and_probs=(g, p),
+          quadrature_size=3,
           validate_args=True)
       # Ball centered at component0's mean.
-      sess_run_fn = lambda x: sess.run(x, feed_dict={qgrid: g, qprobs: p})
       self.run_test_sample_consistent_log_prob(
-          sess_run_fn, vdm, radius=2., center=0., rtol=0.005)
+          sess.run, vdm, radius=2., center=0., rtol=0.015)
       # Larger ball centered at component1's mean.
       self.run_test_sample_consistent_log_prob(
-          sess_run_fn, vdm, radius=4., center=2., rtol=0.005)
-
-  # TODO(jvdillon): We've tested that (i) .sample and .log_prob are consistent,
-  # (ii) .mean, .stddev etc... and .sample are consistent. However, we haven't
-  # tested that the quadrature approach well-approximates the integral.
-  #
-  # To that end, consider adding these tests:
-  #
-  # Test1: In the limit of high mix_scale, this approximates a discrete mixture,
-  # and there are many discrete mixtures where we can explicitly compute
-  # mean/var, etc... So test1 would choose one of those discrete mixtures and
-  # show our mean/var/etc... is close to that.
-  #
-  # Test2:  In the limit of low mix_scale, the a diffeomixture of Normal(-5, 1),
-  # Normal(5, 1) should (I believe...must check) should look almost like
-  # Uniform(-5, 5), and thus (i) .prob(x) should be about 1/10 for x in (-5, 5),
-  # and (ii) the first few moments should approximately match that of
-  # Uniform(-5, 5)
-  #
-  # Test3:  If mix_loc is symmetric, then for any mix_scale, our
-  # quadrature-based diffeomixture of Normal(-1, 1), Normal(1, 1) should have
-  # mean zero, exactly.
-
-  # TODO(jvdillon): Add more tests which verify broadcasting.
+          sess.run, vdm, radius=4., center=2., rtol=0.005)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/ops/autoregressive.py b/tensorflow/contrib/distributions/python/ops/autoregressive.py
new file mode 100644
index 0000000000000000000000000000000000000000..852298bf334666db003353d5fc8e172ffb738668
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/autoregressive.py
@@ -0,0 +1,208 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The Autoregressive distribution."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops.distributions import distribution as distribution_lib
+from tensorflow.python.ops.distributions import util as distribution_util
+
+
+class Autoregressive(distribution_lib.Distribution):
+  """Autoregressive distributions.
+
+  The Autoregressive distribution enables learning (often) richer multivariate
+  distributions by repeatedly applying a [diffeomorphic](
+  https://en.wikipedia.org/wiki/Diffeomorphism) transformation (such as
+  implemented by `Bijector`s). Regarding terminology,
+
+    "Autoregressive models decompose the joint density as a product of
+    conditionals, and model each conditional in turn. Normalizing flows
+    transform a base density (e.g. a standard Gaussian) into the target density
+    by an invertible transformation with tractable Jacobian." [1]
+
+  In other words, the "autoregressive property" is equivalent to the
+  decomposition, `p(x) = prod{ p(x[i] | x[0:i]) : i=0, ..., d }`. The provided
+  `shift_and_log_scale_fn`, `masked_autoregressive_default_template`, achieves
+  this property by zeroing out weights in its `masked_dense` layers.
+
+  Practically speaking the autoregressive property means that there exists a
+  permutation of the event coordinates such that each coordinate is a
+  diffeomorphic function of only preceding coordinates. [2]
+
+  #### Mathematical Details
+
+  The probability function is,
+
+  ```none
+  prob(x; fn, n) = fn(x).prob(x)
+  ```
+
+  And a sample is generated by,
+
+  ```none
+  x = fn(...fn(fn(x0).sample()).sample()).sample()
+  ```
+
+  where the ellipses (`...`) represent `n-2` composed calls to `fn`, `fn`
+  constructs a `tf.distributions.Distribution`-like instance, and `x0` is a
+  fixed initializing `Tensor`.
+
+  #### Examples
+
+  ```python
+  tfd = tf.contrib.distributions
+
+  def normal_fn(self, event_size):
+    n = event_size * (event_size + 1) / 2
+    p = tf.Variable(tfd.Normal(loc=0., scale=1.).sample(n))
+    affine = tfd.bijectors.Affine(
+        scale_tril=tfd.fill_triangular(0.25 * p))
+    def _fn(samples):
+      scale = math_ops.exp(affine.forward(samples)).eval()
+      return independent_lib.Independent(
+          normal_lib.Normal(loc=0., scale=scale, validate_args=True),
+          reinterpreted_batch_ndims=1)
+    return _fn
+
+  batch_and_event_shape = [3, 2, 4]
+  sample0 = array_ops.zeros(batch_and_event_shape)
+  ar = autoregressive_lib.Autoregressive(
+      self._normal_fn(batch_and_event_shape[-1]), sample0)
+  x = ar.sample([6, 5])
+  # ==> x.shape = [6, 5, 3, 2, 4]
+  prob_x = ar.prob(x)
+  # ==> x.shape = [6, 5, 3, 2]
+
+  ```
+
+  [1]: "Masked Autoregressive Flow for Density Estimation."
+       George Papamakarios, Theo Pavlakou, Iain Murray. Arxiv. 2017.
+       https://arxiv.org/abs/1705.07057
+
+  [2]: "Conditional Image Generation with PixelCNN Decoders."
+       Aaron van den Oord, Nal Kalchbrenner, Oriol Vinyals, Lasse Espeholt, Alex
+       Graves, Koray Kavukcuoglu. Arxiv, 2016.
+       https://arxiv.org/abs/1606.05328
+  """
+
+  def __init__(self,
+               distribution_fn,
+               sample0=None,
+               num_steps=None,
+               validate_args=False,
+               allow_nan_stats=True,
+               name="Autoregressive"):
+    """Construct an `Autoregressive` distribution.
+
+    Args:
+      distribution_fn: Python `callable` which constructs a
+        `tf.distributions.Distribution`-like instance from a `Tensor` (e.g.,
+        `sample0`). The function must respect the "autoregressive property",
+        i.e., there exists a permutation of event such that each coordinate is a
+        diffeomorphic function of on preceding coordinates.
+      sample0: Initial input to `distribution_fn`; used to
+        build the distribution in `__init__` which in turn specifies this
+        distribution's properties, e.g., `event_shape`, `batch_shape`, `dtype`.
+        If unspecified, then `distribution_fn` should be default constructable.
+      num_steps: Number of times `distribution_fn` is composed from samples,
+        e.g., `num_steps=2` implies
+        `distribution_fn(distribution_fn(sample0).sample(n)).sample()`.
+      validate_args: Python `bool`.  Whether to validate input with asserts.
+        If `validate_args` is `False`, and the inputs are invalid,
+        correct behavior is not guaranteed.
+      allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
+        (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
+        result is undefined. When `False`, an exception is raised if one or
+        more of the statistic's batch members are undefined.
+      name: Python `str` name prefixed to Ops created by this class.
+        Default value: "Autoregressive".
+
+    Raises:
+      ValueError: if `num_steps` and
+        `distribution_fn(sample0).event_shape.num_elements()` are both `None`.
+      ValueError: if `num_steps < 1`.
+    """
+    parameters = locals()
+    with ops.name_scope(name):
+      self._distribution_fn = distribution_fn
+      self._sample0 = sample0
+      self._distribution0 = (distribution_fn() if sample0 is None
+                             else distribution_fn(sample0))
+      if num_steps is None:
+        num_steps = self._distribution0.event_shape.num_elements()
+        if num_steps is None:
+          raise ValueError("distribution_fn must generate a distribution "
+                           "with fully known `event_shape`.")
+      if num_steps < 1:
+        raise ValueError("num_steps ({}) must be at least 1.".format(num_steps))
+      self._num_steps = num_steps
+    super(Autoregressive, self).__init__(
+        dtype=self._distribution0.dtype,
+        reparameterization_type=self._distribution0.reparameterization_type,
+        validate_args=validate_args,
+        allow_nan_stats=allow_nan_stats,
+        parameters=parameters,
+        graph_parents=self._distribution0._graph_parents,  # pylint: disable=protected-access
+        name=name)
+
+  @property
+  def distribution_fn(self):
+    return self._distribution_fn
+
+  @property
+  def sample0(self):
+    return self._sample0
+
+  @property
+  def num_steps(self):
+    return self._num_steps
+
+  @property
+  def distribution0(self):
+    return self._distribution0
+
+  def _batch_shape(self):
+    return self.distribution0.batch_shape
+
+  def _batch_shape_tensor(self):
+    return self.distribution0.batch_shape_tensor()
+
+  def _event_shape(self):
+    return self.distribution0.event_shape
+
+  def _event_shape_tensor(self):
+    return self.distribution0.event_shape_tensor()
+
+  def _sample_n(self, n, seed=None):
+    if seed is None:
+      seed = distribution_util.gen_new_seed(
+          seed=np.random.randint(2**32 - 1),
+          salt="autoregressive")
+    samples = self.distribution0.sample(n, seed=seed)
+    for _ in range(self._num_steps):
+      samples = self.distribution_fn(samples).sample(seed=seed)
+    return samples
+
+  def _log_prob(self, value):
+    return self.distribution_fn(value).log_prob(value)
+
+  def _prob(self, value):
+    return self.distribution_fn(value).prob(value)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
index bc0ec7f195af009c87020ce8c4ea18f2e713759a..93923c3f083c7f5136b55e9021cbd6323684b976 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
@@ -29,6 +29,7 @@
 @@MaskedAutoregressiveFlow
 @@Permute
 @@PowerTransform
+@@RealNVP
 @@Reshape
 @@Sigmoid
 @@SigmoidCentered
@@ -39,6 +40,7 @@
 
 @@masked_autoregressive_default_template
 @@masked_dense
+@@real_nvp_default_template
 """
 
 from __future__ import absolute_import
@@ -60,6 +62,7 @@ from tensorflow.contrib.distributions.python.ops.bijectors.invert import *
 from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive import *
 from tensorflow.contrib.distributions.python.ops.bijectors.permute import *
 from tensorflow.contrib.distributions.python.ops.bijectors.power_transform import *
+from tensorflow.contrib.distributions.python.ops.bijectors.real_nvp import *
 from tensorflow.contrib.distributions.python.ops.bijectors.reshape import *
 from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid import *
 from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid_centered import *
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py b/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py
index 6049419818e18c54209f0be95d41fcecf6627b7e..0fe9f6aa78fbe845b99d0668f075b0162ec2a9f7 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py
@@ -18,12 +18,117 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.absolute_value_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 
-_allowed_symbols = ["AbsoluteValue"]
+__all__ = [
+    "AbsoluteValue",
+]
 
-remove_undocumented(__name__, _allowed_symbols)
+
+class AbsoluteValue(bijector.Bijector):
+  """Computes `Y = g(X) = Abs(X)`, element-wise.
+
+  This non-injective bijector allows for transformations of scalar distributions
+  with the absolute value function, which maps `(-inf, inf)` to `[0, inf)`.
+
+  * For `y in (0, inf)`, `AbsoluteValue.inverse(y)` returns the set inverse
+    `{x in (-inf, inf) : |x| = y}` as a tuple, `-y, y`.
+  * `AbsoluteValue.inverse(0)` returns `0, 0`, which is not the set inverse
+    (the set inverse is the singleton `{0}`), but "works" in conjunction with
+    `TransformedDistribution` to produce a left semi-continuous pdf.
+  * For `y < 0`, `AbsoluteValue.inverse(y)` happily returns the
+    wrong thing, `-y, y`.  This is done for efficiency.  If
+    `validate_args == True`, `y < 0` will raise an exception.
+
+
+  ```python
+  tfd = tf.contrib.distributions
+
+  abs = tfd.bijectors.AbsoluteValue()
+
+  abs.forward([-1., 0., 1.])
+  ==> [1., 0.,  1.]
+
+  abs.inverse(1.)
+  ==> [-1., 1.]
+
+  # The |dX/dY| is constant, == 1.  So Log|dX/dY| == 0.
+  abs.inverse_log_det_jacobian(1.)
+  ==> [0., 0.]
+
+  # Special case handling of 0.
+  abs.inverse(0.)
+  ==> [0., 0.]
+
+  abs.inverse_log_det_jacobian(0.)
+  ==> [0., 0.]
+  ```
+
+  """
+
+  def __init__(self, event_ndims=0, validate_args=False, name="absolute_value"):
+    """Instantiates the `AbsoluteValue` bijector.
+
+    Args:
+      event_ndims: Python scalar indicating the number of dimensions associated
+        with a particular draw from the distribution.  Currently only zero is
+        supported.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness, in particular whether inputs to `inverse` and
+        `inverse_log_det_jacobian` are non-negative.
+      name: Python `str` name given to ops managed by this object.
+
+    Raises:
+      ValueError:  If `event_ndims` is not zero.
+    """
+    self._graph_parents = []
+    self._name = name
+
+    event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
+    event_ndims_const = tensor_util.constant_value(event_ndims)
+    if event_ndims_const is not None and event_ndims_const not in (0,):
+      raise ValueError("event_ndims(%s) was not 0" % event_ndims_const)
+    else:
+      if validate_args:
+        event_ndims = control_flow_ops.with_dependencies(
+            [check_ops.assert_equal(
+                event_ndims, 0, message="event_ndims was not 0")],
+            event_ndims)
+
+    with self._name_scope("init"):
+      super(AbsoluteValue, self).__init__(
+          event_ndims=event_ndims,
+          validate_args=validate_args,
+          name=name)
+
+  def _forward(self, x):
+    return math_ops.abs(x)
+
+  def _inverse(self, y):
+    if self.validate_args:
+      y = control_flow_ops.with_dependencies(
+          [check_ops.assert_non_negative(y, message="Argument y was negative")],
+          y)
+    return -y, y
+
+  def _inverse_log_det_jacobian(self, y):
+    # If event_ndims = 2,
+    # F^{-1}(y) = (-y, y), so DF^{-1}(y) = (-1, 1),
+    # so Log|DF^{-1}(y)| = Log[1, 1] = [0, 0].
+    batch_shape = array_ops.shape(y)[:array_ops.rank(y) - self.event_ndims]
+    zeros = array_ops.zeros(batch_shape, dtype=y.dtype)
+    if self.validate_args:
+      zeros = control_flow_ops.with_dependencies(
+          [check_ops.assert_non_negative(y, message="Argument y was negative")],
+          zeros)
+    return zeros, zeros
+
+  @property
+  def _is_injective(self):
+    return False
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value_impl.py
deleted file mode 100644
index b84502003ab6c0c4ffdda21eea162f441509e1fa..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value_impl.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""AbsoluteValue bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector
-
-__all__ = [
-    "AbsoluteValue",
-]
-
-
-class AbsoluteValue(bijector.Bijector):
-  """Computes `Y = g(X) = Abs(X)`, element-wise.
-
-  This non-injective bijector allows for transformations of scalar distributions
-  with the absolute value function, which maps `(-inf, inf)` to `[0, inf)`.
-
-  * For `y in (0, inf)`, `AbsoluteValue.inverse(y)` returns the set inverse
-    `{x in (-inf, inf) : |x| = y}` as a tuple, `-y, y`.
-  * `AbsoluteValue.inverse(0)` returns `0, 0`, which is not the set inverse
-    (the set inverse is the singleton `{0}`), but "works" in conjunction with
-    `TransformedDistribution` to produce a left semi-continuous pdf.
-  * For `y < 0`, `AbsoluteValue.inverse(y)` happily returns the
-    wrong thing, `-y, y`.  This is done for efficiency.  If
-    `validate_args == True`, `y < 0` will raise an exception.
-
-
-  ```python
-  abs = ds.bijectors.AbsoluteValue()
-
-  abs.forward([-1., 0., 1.])
-  ==> [1., 0.,  1.]
-
-  abs.inverse(1.)
-  ==> [-1., 1.]
-
-  # The |dX/dY| is constant, == 1.  So Log|dX/dY| == 0.
-  abs.inverse_log_det_jacobian(1.)
-  ==> [0., 0.]
-
-  # Special case handling of 0.
-  abs.inverse(0.)
-  ==> [0., 0.]
-
-  abs.inverse_log_det_jacobian(0.)
-  ==> [0., 0.]
-  ```
-
-  """
-
-  def __init__(self, event_ndims=0, validate_args=False, name="absolute_value"):
-    """Instantiates the `AbsoluteValue` bijector.
-
-    Args:
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.  Currently only zero is
-        supported.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness, in particular whether inputs to `inverse` and
-        `inverse_log_det_jacobian` are non-negative.
-      name: Python `str` name given to ops managed by this object.
-
-    Raises:
-      ValueError:  If `event_ndims` is not zero.
-    """
-    self._graph_parents = []
-    self._name = name
-
-    event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-    event_ndims_const = tensor_util.constant_value(event_ndims)
-    if event_ndims_const is not None and event_ndims_const not in (0,):
-      raise ValueError("event_ndims(%s) was not 0" % event_ndims_const)
-    else:
-      if validate_args:
-        event_ndims = control_flow_ops.with_dependencies(
-            [check_ops.assert_equal(
-                event_ndims, 0, message="event_ndims was not 0")],
-            event_ndims)
-
-    with self._name_scope("init"):
-      super(AbsoluteValue, self).__init__(
-          event_ndims=event_ndims,
-          validate_args=validate_args,
-          name=name)
-
-  def _forward(self, x):
-    return math_ops.abs(x)
-
-  def _inverse(self, y):
-    if self.validate_args:
-      y = control_flow_ops.with_dependencies(
-          [check_ops.assert_non_negative(y, message="Argument y was negative")],
-          y)
-    return -y, y
-
-  def _inverse_log_det_jacobian(self, y):
-    # If event_ndims = 2,
-    # F^{-1}(y) = (-y, y), so DF^{-1}(y) = (-1, 1),
-    # so Log|DF^{-1}(y)| = Log[1, 1] = [0, 0].
-    batch_shape = array_ops.shape(y)[:array_ops.rank(y) - self.event_ndims]
-    zeros = array_ops.zeros(batch_shape, dtype=y.dtype)
-    if self.validate_args:
-      zeros = control_flow_ops.with_dependencies(
-          [check_ops.assert_non_negative(y, message="Argument y was negative")],
-          zeros)
-    return zeros, zeros
-
-  @property
-  def _is_injective(self):
-    return False
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine.py
index 940cceff04e77cfc2f7caae5a798d135f7601b95..05bb9c2f9bdf35e222c94db3491157893da64ebd 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine.py
@@ -18,12 +18,386 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.affine_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.contrib import linalg
+from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.contrib.distributions.python.ops.shape import _DistributionShape
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 
-_allowed_symbols = ["Affine"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "Affine",
+]
+
+
+def _as_tensor(x, name):
+  """Convenience to convert to `Tensor` or leave as `None`."""
+  return None if x is None else ops.convert_to_tensor(x, name=name)
+
+
+class Affine(bijector.Bijector):
+  """Compute `Y = g(X; shift, scale) = scale @ X + shift`.
+
+  Here `scale = c * I + diag(D1) + tril(L) + V @ diag(D2) @ V.T`.
+
+  In TF parlance, the `scale` term is logically equivalent to:
+
+  ```python
+  scale = (
+    scale_identity_multiplier * tf.diag(tf.ones(d)) +
+    tf.diag(scale_diag) +
+    scale_tril +
+    scale_perturb_factor @ diag(scale_perturb_diag) @
+      tf.transpose([scale_perturb_factor])
+  )
+  ```
+
+  The `scale` term is applied without necessarily materializing constituent
+  matrices, i.e., the matmul is [matrix-free](
+  https://en.wikipedia.org/wiki/Matrix-free_methods) when possible.
+
+  Examples:
+
+  ```python
+  # Y = X
+  b = Affine()
+
+  # Y = X + shift
+  b = Affine(shift=[1., 2, 3])
+
+  # Y = 2 * I @ X.T + shift
+  b = Affine(shift=[1., 2, 3],
+             scale_identity_multiplier=2.)
+
+  # Y = tf.diag(d1) @ X.T + shift
+  b = Affine(shift=[1., 2, 3],
+             scale_diag=[-1., 2, 1])         # Implicitly 3x3.
+
+  # Y = (I + v * v.T) @ X.T + shift
+  b = Affine(shift=[1., 2, 3],
+             scale_perturb_factor=[[1., 0],
+                                   [0, 1],
+                                   [1, 1]])
+
+  # Y = (diag(d1) + v * diag(d2) * v.T) @ X.T + shift
+  b = Affine(shift=[1., 2, 3],
+             scale_diag=[1., 3, 3],          # Implicitly 3x3.
+             scale_perturb_diag=[2., 1],     # Implicitly 2x2.
+             scale_perturb_factor=[[1., 0],
+                                   [0, 1],
+                                   [1, 1]])
+
+  ```
+
+  """
+
+  def __init__(self,
+               shift=None,
+               scale_identity_multiplier=None,
+               scale_diag=None,
+               scale_tril=None,
+               scale_perturb_factor=None,
+               scale_perturb_diag=None,
+               event_ndims=1,
+               validate_args=False,
+               name="affine"):
+    """Instantiates the `Affine` bijector.
+
+    This `Bijector` is initialized with `shift` `Tensor` and `scale` arguments,
+    giving the forward operation:
+
+    ```none
+    Y = g(X) = scale @ X + shift
+    ```
+
+    where the `scale` term is logically equivalent to:
+
+    ```python
+    scale = (
+      scale_identity_multiplier * tf.diag(tf.ones(d)) +
+      tf.diag(scale_diag) +
+      scale_tril +
+      scale_perturb_factor @ diag(scale_perturb_diag) @
+        tf.transpose([scale_perturb_factor])
+    )
+    ```
+
+    If none of `scale_identity_multiplier`, `scale_diag`, or `scale_tril` are
+    specified then `scale += IdentityMatrix`. Otherwise specifying a
+    `scale` argument has the semantics of `scale += Expand(arg)`, i.e.,
+    `scale_diag != None` means `scale += tf.diag(scale_diag)`.
+
+    Args:
+      shift: Floating-point `Tensor`. If this is set to `None`, no shift is
+        applied.
+      scale_identity_multiplier: floating point rank 0 `Tensor` representing a
+        scaling done to the identity matrix.
+        When `scale_identity_multiplier = scale_diag = scale_tril = None` then
+        `scale += IdentityMatrix`. Otherwise no scaled-identity-matrix is added
+        to `scale`.
+      scale_diag: Floating-point `Tensor` representing the diagonal matrix.
+        `scale_diag` has shape [N1, N2, ...  k], which represents a k x k
+        diagonal matrix.
+        When `None` no diagonal term is added to `scale`.
+      scale_tril: Floating-point `Tensor` representing the diagonal matrix.
+        `scale_diag` has shape [N1, N2, ...  k, k], which represents a k x k
+        lower triangular matrix.
+        When `None` no `scale_tril` term is added to `scale`.
+        The upper triangular elements above the diagonal are ignored.
+      scale_perturb_factor: Floating-point `Tensor` representing factor matrix
+        with last two dimensions of shape `(k, r)`. When `None`, no rank-r
+        update is added to `scale`.
+      scale_perturb_diag: Floating-point `Tensor` representing the diagonal
+        matrix. `scale_perturb_diag` has shape [N1, N2, ...  r], which
+        represents an `r x r` diagonal matrix. When `None` low rank updates will
+        take the form `scale_perturb_factor * scale_perturb_factor.T`.
+      event_ndims: Scalar `int` `Tensor` indicating the number of dimensions
+        associated with a particular draw from the distribution. Must be 0 or 1.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+
+    Raises:
+      ValueError: if `perturb_diag` is specified but not `perturb_factor`.
+      TypeError: if `shift` has different `dtype` from `scale` arguments.
+    """
+    self._graph_parents = []
+    self._name = name
+    self._validate_args = validate_args
+
+    # Ambiguous definition of low rank update.
+    if scale_perturb_diag is not None and scale_perturb_factor is None:
+      raise ValueError("When scale_perturb_diag is specified, "
+                       "scale_perturb_factor must be specified.")
+
+    # Special case, only handling a scaled identity matrix. We don't know its
+    # dimensions, so this is special cased.
+    # We don't check identity_multiplier, since below we set it to 1. if all
+    # other scale args are None.
+    self._is_only_identity_multiplier = (scale_tril is None and
+                                         scale_diag is None and
+                                         scale_perturb_factor is None)
+
+    with self._name_scope("init", values=[
+        shift, scale_identity_multiplier, scale_diag, scale_tril,
+        scale_perturb_diag, scale_perturb_factor]):
+      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
+      event_ndims_const = tensor_util.constant_value(event_ndims)
+      if event_ndims_const is not None and event_ndims_const not in (0, 1):
+        raise ValueError("event_ndims(%s) was not 0 or 1" % event_ndims_const)
+      else:
+        if validate_args:
+          # Shape tool will catch if event_ndims is negative.
+          event_ndims = control_flow_ops.with_dependencies(
+              [check_ops.assert_less(
+                  event_ndims, 2, message="event_ndims must be 0 or 1")],
+              event_ndims)
+
+      if event_ndims_const == 0 and not self._is_only_identity_multiplier:
+        raise ValueError(
+            "If event_ndims == 0, the only scale argument you can pass is "
+            "scale_identity_multiplier.  All others operate on vectors.")
+
+      # In the absence of `loc` and `scale`, we'll assume `dtype` is `float32`.
+      dtype = dtypes.float32
+
+      if shift is not None:
+        shift = ops.convert_to_tensor(shift, name="shift")
+        dtype = shift.dtype.base_dtype
+      self._shift = shift
+
+      # When no args are specified, pretend the scale matrix is the identity
+      # matrix.
+      if (self._is_only_identity_multiplier and
+          scale_identity_multiplier is None):
+        scale_identity_multiplier = ops.convert_to_tensor(1., dtype=dtype)
+
+      # self._create_scale_operator returns a LinearOperator in all cases
+      # except if self._is_only_identity_multiplier; in which case it
+      # returns a scalar Tensor.
+      scale = self._create_scale_operator(
+          identity_multiplier=scale_identity_multiplier,
+          diag=scale_diag,
+          tril=scale_tril,
+          perturb_diag=scale_perturb_diag,
+          perturb_factor=scale_perturb_factor,
+          shift=shift,
+          validate_args=validate_args)
+
+      if scale.dtype is not None:
+        dtype = scale.dtype.base_dtype
+
+      if scale is not None and not self._is_only_identity_multiplier:
+        if (shift is not None and
+            shift.dtype.base_dtype != scale.dtype.base_dtype):
+          raise TypeError(
+              "shift.dtype({}) is incompatible with scale.dtype({}).".format(
+                  shift.dtype, scale.dtype))
+
+        if scale.tensor_rank is not None:
+          batch_ndims = scale.tensor_rank - 2
+        else:
+          batch_ndims = scale.tensor_rank_tensor() - 2
+      else:
+        # We won't need shape inference when scale is None or when scale is a
+        # scalar.
+        batch_ndims = 0
+      self._scale = scale
+      self._shaper = _DistributionShape(
+          batch_ndims=batch_ndims,
+          event_ndims=event_ndims,
+          validate_args=validate_args)
+      super(Affine, self).__init__(
+          event_ndims=event_ndims,
+          graph_parents=(
+              [event_ndims] +
+              [self._scale] if tensor_util.is_tensor(self._scale)
+              else self._scale.graph_parents +
+              [self._shift] if self._shift is not None else []),
+          is_constant_jacobian=True,
+          dtype=dtype,
+          validate_args=validate_args,
+          name=name)
+
+  def _create_scale_operator(self, identity_multiplier, diag, tril,
+                             perturb_diag, perturb_factor, shift,
+                             validate_args):
+    """Construct `scale` from various components.
+
+    Args:
+      identity_multiplier: floating point rank 0 `Tensor` representing a scaling
+        done to the identity matrix.
+      diag: Floating-point `Tensor` representing the diagonal matrix.
+        `scale_diag` has shape [N1, N2, ...  k], which represents a k x k
+        diagonal matrix.
+      tril: Floating-point `Tensor` representing the diagonal matrix.
+        `scale_tril` has shape [N1, N2, ...  k], which represents a k x k lower
+        triangular matrix.
+      perturb_diag: Floating-point `Tensor` representing the diagonal matrix of
+        the low rank update.
+      perturb_factor: Floating-point `Tensor` representing factor matrix.
+      shift: Floating-point `Tensor` representing `shift in `scale @ X + shift`.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+
+    Returns:
+      scale. In the case of scaling by a constant, scale is a
+      floating point `Tensor`. Otherwise, scale is a `LinearOperator`.
+
+    Raises:
+      ValueError: if all of `tril`, `diag` and `identity_multiplier` are `None`.
+    """
+    identity_multiplier = _as_tensor(identity_multiplier, "identity_multiplier")
+    diag = _as_tensor(diag, "diag")
+    tril = _as_tensor(tril, "tril")
+    perturb_diag = _as_tensor(perturb_diag, "perturb_diag")
+    perturb_factor = _as_tensor(perturb_factor, "perturb_factor")
+
+    # If possible, use the low rank update to infer the shape of
+    # the identity matrix, when scale represents a scaled identity matrix
+    # with a low rank update.
+    shape_hint = None
+    if perturb_factor is not None:
+      shape_hint = distribution_util.dimension_size(perturb_factor, axis=-2)
+
+    if self._is_only_identity_multiplier:
+      if validate_args:
+        return control_flow_ops.with_dependencies(
+            [check_ops.assert_none_equal(
+                identity_multiplier,
+                array_ops.zeros([], identity_multiplier.dtype),
+                ["identity_multiplier should be non-zero."])],
+            identity_multiplier)
+      return identity_multiplier
+
+    scale = distribution_util.make_tril_scale(
+        loc=shift,
+        scale_tril=tril,
+        scale_diag=diag,
+        scale_identity_multiplier=identity_multiplier,
+        validate_args=validate_args,
+        assert_positive=False,
+        shape_hint=shape_hint)
+
+    if perturb_factor is not None:
+      return linalg.LinearOperatorLowRankUpdate(
+          scale,
+          u=perturb_factor,
+          diag_update=perturb_diag,
+          is_diag_update_positive=perturb_diag is None,
+          is_non_singular=True,  # Implied by is_positive_definite=True.
+          is_self_adjoint=True,
+          is_positive_definite=True,
+          is_square=True)
+
+    return scale
+
+  @property
+  def shift(self):
+    """The `shift` `Tensor` in `Y = scale @ X + shift`."""
+    return self._shift
+
+  @property
+  def scale(self):
+    """The `scale` `LinearOperator` in `Y = scale @ X + shift`."""
+    return self._scale
+
+  def _forward(self, x):
+    y = x
+    if self._is_only_identity_multiplier:
+      y *= self._scale
+      if self.shift is not None:
+        return y + self.shift
+      return y
+    y, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
+        y, expand_batch_dim=False)
+    with ops.control_dependencies(self._maybe_check_scale() if
+                                  self.validate_args else []):
+      y = self.scale.matmul(y)
+    y = self._shaper.undo_make_batch_of_event_sample_matrices(
+        y, sample_shape, expand_batch_dim=False)
+    if self.shift is not None:
+      y += self.shift
+    return y
+
+  def _inverse(self, y):
+    x = y
+    if self.shift is not None:
+      x -= self.shift
+    if self._is_only_identity_multiplier:
+      return x / self._scale
+
+    x, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
+        x, expand_batch_dim=False)
+    # Solve fails if the op is singular so we may safely skip this assertion.
+    x = self.scale.solve(x)
+    x = self._shaper.undo_make_batch_of_event_sample_matrices(
+        x, sample_shape, expand_batch_dim=False)
+    return x
+
+  def _inverse_log_det_jacobian(self, y):
+    return -self._forward_log_det_jacobian(y)
+
+  def _forward_log_det_jacobian(self, x):
+    if self._is_only_identity_multiplier:
+      # We don't pad in this case and instead let the fldj be applied
+      # via broadcast.
+      event_size = distribution_util.pick_vector(
+          math_ops.equal(self._shaper.event_ndims, 0),
+          [1], array_ops.shape(x))[-1]
+      event_size = math_ops.cast(event_size, dtype=self._scale.dtype)
+      return math_ops.log(math_ops.abs(self._scale)) * event_size
+    return self.scale.log_abs_determinant()
+
+  def _maybe_check_scale(self):
+    try:
+      return [self.scale.assert_non_singular()]
+    except NotImplementedError:
+      pass
+    return []
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
deleted file mode 100644
index 05bb9c2f9bdf35e222c94db3491157893da64ebd..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
+++ /dev/null
@@ -1,403 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Affine bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib import linalg
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops.shape import _DistributionShape
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector
-
-
-__all__ = [
-    "Affine",
-]
-
-
-def _as_tensor(x, name):
-  """Convenience to convert to `Tensor` or leave as `None`."""
-  return None if x is None else ops.convert_to_tensor(x, name=name)
-
-
-class Affine(bijector.Bijector):
-  """Compute `Y = g(X; shift, scale) = scale @ X + shift`.
-
-  Here `scale = c * I + diag(D1) + tril(L) + V @ diag(D2) @ V.T`.
-
-  In TF parlance, the `scale` term is logically equivalent to:
-
-  ```python
-  scale = (
-    scale_identity_multiplier * tf.diag(tf.ones(d)) +
-    tf.diag(scale_diag) +
-    scale_tril +
-    scale_perturb_factor @ diag(scale_perturb_diag) @
-      tf.transpose([scale_perturb_factor])
-  )
-  ```
-
-  The `scale` term is applied without necessarily materializing constituent
-  matrices, i.e., the matmul is [matrix-free](
-  https://en.wikipedia.org/wiki/Matrix-free_methods) when possible.
-
-  Examples:
-
-  ```python
-  # Y = X
-  b = Affine()
-
-  # Y = X + shift
-  b = Affine(shift=[1., 2, 3])
-
-  # Y = 2 * I @ X.T + shift
-  b = Affine(shift=[1., 2, 3],
-             scale_identity_multiplier=2.)
-
-  # Y = tf.diag(d1) @ X.T + shift
-  b = Affine(shift=[1., 2, 3],
-             scale_diag=[-1., 2, 1])         # Implicitly 3x3.
-
-  # Y = (I + v * v.T) @ X.T + shift
-  b = Affine(shift=[1., 2, 3],
-             scale_perturb_factor=[[1., 0],
-                                   [0, 1],
-                                   [1, 1]])
-
-  # Y = (diag(d1) + v * diag(d2) * v.T) @ X.T + shift
-  b = Affine(shift=[1., 2, 3],
-             scale_diag=[1., 3, 3],          # Implicitly 3x3.
-             scale_perturb_diag=[2., 1],     # Implicitly 2x2.
-             scale_perturb_factor=[[1., 0],
-                                   [0, 1],
-                                   [1, 1]])
-
-  ```
-
-  """
-
-  def __init__(self,
-               shift=None,
-               scale_identity_multiplier=None,
-               scale_diag=None,
-               scale_tril=None,
-               scale_perturb_factor=None,
-               scale_perturb_diag=None,
-               event_ndims=1,
-               validate_args=False,
-               name="affine"):
-    """Instantiates the `Affine` bijector.
-
-    This `Bijector` is initialized with `shift` `Tensor` and `scale` arguments,
-    giving the forward operation:
-
-    ```none
-    Y = g(X) = scale @ X + shift
-    ```
-
-    where the `scale` term is logically equivalent to:
-
-    ```python
-    scale = (
-      scale_identity_multiplier * tf.diag(tf.ones(d)) +
-      tf.diag(scale_diag) +
-      scale_tril +
-      scale_perturb_factor @ diag(scale_perturb_diag) @
-        tf.transpose([scale_perturb_factor])
-    )
-    ```
-
-    If none of `scale_identity_multiplier`, `scale_diag`, or `scale_tril` are
-    specified then `scale += IdentityMatrix`. Otherwise specifying a
-    `scale` argument has the semantics of `scale += Expand(arg)`, i.e.,
-    `scale_diag != None` means `scale += tf.diag(scale_diag)`.
-
-    Args:
-      shift: Floating-point `Tensor`. If this is set to `None`, no shift is
-        applied.
-      scale_identity_multiplier: floating point rank 0 `Tensor` representing a
-        scaling done to the identity matrix.
-        When `scale_identity_multiplier = scale_diag = scale_tril = None` then
-        `scale += IdentityMatrix`. Otherwise no scaled-identity-matrix is added
-        to `scale`.
-      scale_diag: Floating-point `Tensor` representing the diagonal matrix.
-        `scale_diag` has shape [N1, N2, ...  k], which represents a k x k
-        diagonal matrix.
-        When `None` no diagonal term is added to `scale`.
-      scale_tril: Floating-point `Tensor` representing the diagonal matrix.
-        `scale_diag` has shape [N1, N2, ...  k, k], which represents a k x k
-        lower triangular matrix.
-        When `None` no `scale_tril` term is added to `scale`.
-        The upper triangular elements above the diagonal are ignored.
-      scale_perturb_factor: Floating-point `Tensor` representing factor matrix
-        with last two dimensions of shape `(k, r)`. When `None`, no rank-r
-        update is added to `scale`.
-      scale_perturb_diag: Floating-point `Tensor` representing the diagonal
-        matrix. `scale_perturb_diag` has shape [N1, N2, ...  r], which
-        represents an `r x r` diagonal matrix. When `None` low rank updates will
-        take the form `scale_perturb_factor * scale_perturb_factor.T`.
-      event_ndims: Scalar `int` `Tensor` indicating the number of dimensions
-        associated with a particular draw from the distribution. Must be 0 or 1.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-
-    Raises:
-      ValueError: if `perturb_diag` is specified but not `perturb_factor`.
-      TypeError: if `shift` has different `dtype` from `scale` arguments.
-    """
-    self._graph_parents = []
-    self._name = name
-    self._validate_args = validate_args
-
-    # Ambiguous definition of low rank update.
-    if scale_perturb_diag is not None and scale_perturb_factor is None:
-      raise ValueError("When scale_perturb_diag is specified, "
-                       "scale_perturb_factor must be specified.")
-
-    # Special case, only handling a scaled identity matrix. We don't know its
-    # dimensions, so this is special cased.
-    # We don't check identity_multiplier, since below we set it to 1. if all
-    # other scale args are None.
-    self._is_only_identity_multiplier = (scale_tril is None and
-                                         scale_diag is None and
-                                         scale_perturb_factor is None)
-
-    with self._name_scope("init", values=[
-        shift, scale_identity_multiplier, scale_diag, scale_tril,
-        scale_perturb_diag, scale_perturb_factor]):
-      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-      event_ndims_const = tensor_util.constant_value(event_ndims)
-      if event_ndims_const is not None and event_ndims_const not in (0, 1):
-        raise ValueError("event_ndims(%s) was not 0 or 1" % event_ndims_const)
-      else:
-        if validate_args:
-          # Shape tool will catch if event_ndims is negative.
-          event_ndims = control_flow_ops.with_dependencies(
-              [check_ops.assert_less(
-                  event_ndims, 2, message="event_ndims must be 0 or 1")],
-              event_ndims)
-
-      if event_ndims_const == 0 and not self._is_only_identity_multiplier:
-        raise ValueError(
-            "If event_ndims == 0, the only scale argument you can pass is "
-            "scale_identity_multiplier.  All others operate on vectors.")
-
-      # In the absence of `loc` and `scale`, we'll assume `dtype` is `float32`.
-      dtype = dtypes.float32
-
-      if shift is not None:
-        shift = ops.convert_to_tensor(shift, name="shift")
-        dtype = shift.dtype.base_dtype
-      self._shift = shift
-
-      # When no args are specified, pretend the scale matrix is the identity
-      # matrix.
-      if (self._is_only_identity_multiplier and
-          scale_identity_multiplier is None):
-        scale_identity_multiplier = ops.convert_to_tensor(1., dtype=dtype)
-
-      # self._create_scale_operator returns a LinearOperator in all cases
-      # except if self._is_only_identity_multiplier; in which case it
-      # returns a scalar Tensor.
-      scale = self._create_scale_operator(
-          identity_multiplier=scale_identity_multiplier,
-          diag=scale_diag,
-          tril=scale_tril,
-          perturb_diag=scale_perturb_diag,
-          perturb_factor=scale_perturb_factor,
-          shift=shift,
-          validate_args=validate_args)
-
-      if scale.dtype is not None:
-        dtype = scale.dtype.base_dtype
-
-      if scale is not None and not self._is_only_identity_multiplier:
-        if (shift is not None and
-            shift.dtype.base_dtype != scale.dtype.base_dtype):
-          raise TypeError(
-              "shift.dtype({}) is incompatible with scale.dtype({}).".format(
-                  shift.dtype, scale.dtype))
-
-        if scale.tensor_rank is not None:
-          batch_ndims = scale.tensor_rank - 2
-        else:
-          batch_ndims = scale.tensor_rank_tensor() - 2
-      else:
-        # We won't need shape inference when scale is None or when scale is a
-        # scalar.
-        batch_ndims = 0
-      self._scale = scale
-      self._shaper = _DistributionShape(
-          batch_ndims=batch_ndims,
-          event_ndims=event_ndims,
-          validate_args=validate_args)
-      super(Affine, self).__init__(
-          event_ndims=event_ndims,
-          graph_parents=(
-              [event_ndims] +
-              [self._scale] if tensor_util.is_tensor(self._scale)
-              else self._scale.graph_parents +
-              [self._shift] if self._shift is not None else []),
-          is_constant_jacobian=True,
-          dtype=dtype,
-          validate_args=validate_args,
-          name=name)
-
-  def _create_scale_operator(self, identity_multiplier, diag, tril,
-                             perturb_diag, perturb_factor, shift,
-                             validate_args):
-    """Construct `scale` from various components.
-
-    Args:
-      identity_multiplier: floating point rank 0 `Tensor` representing a scaling
-        done to the identity matrix.
-      diag: Floating-point `Tensor` representing the diagonal matrix.
-        `scale_diag` has shape [N1, N2, ...  k], which represents a k x k
-        diagonal matrix.
-      tril: Floating-point `Tensor` representing the diagonal matrix.
-        `scale_tril` has shape [N1, N2, ...  k], which represents a k x k lower
-        triangular matrix.
-      perturb_diag: Floating-point `Tensor` representing the diagonal matrix of
-        the low rank update.
-      perturb_factor: Floating-point `Tensor` representing factor matrix.
-      shift: Floating-point `Tensor` representing `shift in `scale @ X + shift`.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-
-    Returns:
-      scale. In the case of scaling by a constant, scale is a
-      floating point `Tensor`. Otherwise, scale is a `LinearOperator`.
-
-    Raises:
-      ValueError: if all of `tril`, `diag` and `identity_multiplier` are `None`.
-    """
-    identity_multiplier = _as_tensor(identity_multiplier, "identity_multiplier")
-    diag = _as_tensor(diag, "diag")
-    tril = _as_tensor(tril, "tril")
-    perturb_diag = _as_tensor(perturb_diag, "perturb_diag")
-    perturb_factor = _as_tensor(perturb_factor, "perturb_factor")
-
-    # If possible, use the low rank update to infer the shape of
-    # the identity matrix, when scale represents a scaled identity matrix
-    # with a low rank update.
-    shape_hint = None
-    if perturb_factor is not None:
-      shape_hint = distribution_util.dimension_size(perturb_factor, axis=-2)
-
-    if self._is_only_identity_multiplier:
-      if validate_args:
-        return control_flow_ops.with_dependencies(
-            [check_ops.assert_none_equal(
-                identity_multiplier,
-                array_ops.zeros([], identity_multiplier.dtype),
-                ["identity_multiplier should be non-zero."])],
-            identity_multiplier)
-      return identity_multiplier
-
-    scale = distribution_util.make_tril_scale(
-        loc=shift,
-        scale_tril=tril,
-        scale_diag=diag,
-        scale_identity_multiplier=identity_multiplier,
-        validate_args=validate_args,
-        assert_positive=False,
-        shape_hint=shape_hint)
-
-    if perturb_factor is not None:
-      return linalg.LinearOperatorLowRankUpdate(
-          scale,
-          u=perturb_factor,
-          diag_update=perturb_diag,
-          is_diag_update_positive=perturb_diag is None,
-          is_non_singular=True,  # Implied by is_positive_definite=True.
-          is_self_adjoint=True,
-          is_positive_definite=True,
-          is_square=True)
-
-    return scale
-
-  @property
-  def shift(self):
-    """The `shift` `Tensor` in `Y = scale @ X + shift`."""
-    return self._shift
-
-  @property
-  def scale(self):
-    """The `scale` `LinearOperator` in `Y = scale @ X + shift`."""
-    return self._scale
-
-  def _forward(self, x):
-    y = x
-    if self._is_only_identity_multiplier:
-      y *= self._scale
-      if self.shift is not None:
-        return y + self.shift
-      return y
-    y, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
-        y, expand_batch_dim=False)
-    with ops.control_dependencies(self._maybe_check_scale() if
-                                  self.validate_args else []):
-      y = self.scale.matmul(y)
-    y = self._shaper.undo_make_batch_of_event_sample_matrices(
-        y, sample_shape, expand_batch_dim=False)
-    if self.shift is not None:
-      y += self.shift
-    return y
-
-  def _inverse(self, y):
-    x = y
-    if self.shift is not None:
-      x -= self.shift
-    if self._is_only_identity_multiplier:
-      return x / self._scale
-
-    x, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
-        x, expand_batch_dim=False)
-    # Solve fails if the op is singular so we may safely skip this assertion.
-    x = self.scale.solve(x)
-    x = self._shaper.undo_make_batch_of_event_sample_matrices(
-        x, sample_shape, expand_batch_dim=False)
-    return x
-
-  def _inverse_log_det_jacobian(self, y):
-    return -self._forward_log_det_jacobian(y)
-
-  def _forward_log_det_jacobian(self, x):
-    if self._is_only_identity_multiplier:
-      # We don't pad in this case and instead let the fldj be applied
-      # via broadcast.
-      event_size = distribution_util.pick_vector(
-          math_ops.equal(self._shaper.event_ndims, 0),
-          [1], array_ops.shape(x))[-1]
-      event_size = math_ops.cast(event_size, dtype=self._scale.dtype)
-      return math_ops.log(math_ops.abs(self._scale)) * event_size
-    return self.scale.log_abs_determinant()
-
-  def _maybe_check_scale(self):
-    try:
-      return [self.scale.assert_non_singular()]
-    except NotImplementedError:
-      pass
-    return []
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py
index aca04a89df7c3ee09d5f7cc10f6779e33fa7aa66..89043b1410370074f11f2cfa59b6b6663fa62521 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py
@@ -18,12 +18,214 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.affine_linear_operator_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.contrib.distributions.python.ops.shape import _DistributionShape
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops.distributions import bijector
+from tensorflow.python.ops.linalg import linear_operator
 
-_allowed_symbols = ["AffineLinearOperator"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "AffineLinearOperator",
+]
+
+
+class AffineLinearOperator(bijector.Bijector):
+  """Compute `Y = g(X; shift, scale) = scale @ X + shift`.
+
+  `shift` is a numeric `Tensor` and `scale` is a `LinearOperator`.
+
+  If `X` is a scalar then the forward transformation is: `scale * X + shift`
+  where `*` denotes the scalar product.
+
+  Note: we don't always simply transpose `X` (but write it this way for
+  brevity). Actually the input `X` undergoes the following transformation
+  before being premultiplied by `scale`:
+
+  1. If there are no sample dims, we call `X = tf.expand_dims(X, 0)`, i.e.,
+     `new_sample_shape = [1]`. Otherwise do nothing.
+  2. The sample shape is flattened to have one dimension, i.e.,
+     `new_sample_shape = [n]` where `n = tf.reduce_prod(old_sample_shape)`.
+  3. The sample dim is cyclically rotated left by 1, i.e.,
+     `new_shape = [B1,...,Bb, k, n]` where `n` is as above, `k` is the
+     event_shape, and `B1,...,Bb` are the batch shapes for each of `b` batch
+     dimensions.
+
+  (For more details see `shape.make_batch_of_event_sample_matrices`.)
+
+  The result of the above transformation is that `X` can be regarded as a batch
+  of matrices where each column is a draw from the distribution. After
+  premultiplying by `scale`, we take the inverse of this procedure. The input
+  `Y` also undergoes the same transformation before/after premultiplying by
+  `inv(scale)`.
+
+  Example Use:
+
+  ```python
+  linalg = tf.linalg
+
+  x = [1., 2, 3]
+
+  shift = [-1., 0., 1]
+  diag = [1., 2, 3]
+  scale = linalg.LinearOperatorDiag(diag)
+  affine = AffineLinearOperator(shift, scale)
+  # In this case, `forward` is equivalent to:
+  # y = scale @ x + shift
+  y = affine.forward(x)  # [0., 4, 10]
+
+  shift = [2., 3, 1]
+  tril = [[1., 0, 0],
+          [2, 1, 0],
+          [3, 2, 1]]
+  scale = linalg.LinearOperatorLowerTriangular(tril)
+  affine = AffineLinearOperator(shift, scale)
+  # In this case, `forward` is equivalent to:
+  # np.squeeze(np.matmul(tril, np.expand_dims(x, -1)), -1) + shift
+  y = affine.forward(x)  # [3., 7, 11]
+  ```
+
+  """
+
+  def __init__(self,
+               shift=None,
+               scale=None,
+               event_ndims=1,
+               validate_args=False,
+               name="affine_linear_operator"):
+    """Instantiates the `AffineLinearOperator` bijector.
+
+    Args:
+      shift: Floating-point `Tensor`.
+      scale:  Subclass of `LinearOperator`. Represents the (batch) positive
+        definite matrix `M` in `R^{k x k}`.
+      event_ndims: Scalar `integer` `Tensor` indicating the number of dimensions
+        associated with a particular draw from the distribution. Must be 0 or 1.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+
+    Raises:
+      ValueError: if `event_ndims` is not 0 or 1.
+      TypeError: if `scale` is not a `LinearOperator`.
+      TypeError: if `shift.dtype` does not match `scale.dtype`.
+      ValueError: if not `scale.is_non_singular`.
+    """
+    self._graph_parents = []
+    self._name = name
+    self._validate_args = validate_args
+    graph_parents = []
+    with self._name_scope("init", values=[shift]):
+      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
+      if tensor_util.constant_value(event_ndims) is not None:
+        event_ndims = tensor_util.constant_value(event_ndims)
+        if event_ndims not in (0, 1):
+          raise ValueError("event_ndims({}) was not 0 or 1".format(event_ndims))
+      else:
+        if validate_args:
+          # Shape tool will catch if event_ndims is negative.
+          event_ndims = control_flow_ops.with_dependencies(
+              [check_ops.assert_less(
+                  event_ndims, 2, message="event_ndims must be 0 or 1")],
+              event_ndims)
+        graph_parents += [event_ndims]
+
+      # In the absence of `loc` and `scale`, we'll assume `dtype` is `float32`.
+      dtype = dtypes.float32
+
+      if shift is not None:
+        shift = ops.convert_to_tensor(shift, name="shift")
+        graph_parents += [shift]
+        dtype = shift.dtype.base_dtype
+      self._shift = shift
+
+      if scale is not None:
+        if (shift is not None and
+            shift.dtype.base_dtype != scale.dtype.base_dtype):
+          raise TypeError(
+              "shift.dtype({}) is incompatible with scale.dtype({}).".format(
+                  shift.dtype, scale.dtype))
+        if not isinstance(scale, linear_operator.LinearOperator):
+          raise TypeError("scale is not an instance of tf.LinearOperator")
+        if validate_args and not scale.is_non_singular:
+          raise ValueError("Scale matrix must be non-singular.")
+        graph_parents += scale.graph_parents
+        if scale.tensor_rank is not None:
+          batch_ndims = scale.tensor_rank - 2
+        else:
+          batch_ndims = scale.tensor_rank_tensor() - 2
+          graph_parents += [batch_ndims]
+        if scale.dtype is not None:
+          dtype = scale.dtype.base_dtype
+      else:
+        batch_ndims = 0  # We won't need shape inference when scale is None.
+      self._scale = scale
+      self._shaper = _DistributionShape(
+          batch_ndims=batch_ndims,
+          event_ndims=event_ndims,
+          validate_args=validate_args)
+      super(AffineLinearOperator, self).__init__(
+          event_ndims=event_ndims,
+          graph_parents=graph_parents,
+          is_constant_jacobian=True,
+          dtype=dtype,
+          validate_args=validate_args,
+          name=name)
+
+  @property
+  def shift(self):
+    """The `shift` `Tensor` in `Y = scale @ X + shift`."""
+    return self._shift
+
+  @property
+  def scale(self):
+    """The `scale` `LinearOperator` in `Y = scale @ X + shift`."""
+    return self._scale
+
+  def _forward(self, x):
+    y = x
+    if self.scale is not None:
+      y, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
+          y, expand_batch_dim=False)
+      with ops.control_dependencies(self._maybe_collect_assertions() if
+                                    self.validate_args else []):
+        y = self.scale.matmul(y)
+      y = self._shaper.undo_make_batch_of_event_sample_matrices(
+          y, sample_shape, expand_batch_dim=False)
+    if self.shift is not None:
+      y += self.shift
+    return y
+
+  def _inverse(self, y):
+    x = y
+    if self.shift is not None:
+      x -= self.shift
+    if self.scale is not None:
+      x, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
+          x, expand_batch_dim=False)
+      # Solve fails if the op is singular so we may safely skip this assertion.
+      x = self.scale.solve(x)
+      x = self._shaper.undo_make_batch_of_event_sample_matrices(
+          x, sample_shape, expand_batch_dim=False)
+    return x
+
+  def _inverse_log_det_jacobian(self, y):
+    return -self._forward_log_det_jacobian(y)
+
+  def _forward_log_det_jacobian(self, x):  # pylint: disable=unused-argument
+    if self.scale is None:
+      return constant_op.constant(0, dtype=x.dtype.base_dtype)
+    with ops.control_dependencies(self._maybe_collect_assertions() if
+                                  self.validate_args else []):
+      return self.scale.log_abs_determinant()
+
+  def _maybe_collect_assertions(self):
+    try:
+      return [self.scale.assert_non_singular()]
+    except NotImplementedError:
+      pass
+    return []
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator_impl.py
deleted file mode 100644
index 89043b1410370074f11f2cfa59b6b6663fa62521..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator_impl.py
+++ /dev/null
@@ -1,231 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""AffineLinearOperator bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.distributions.python.ops.shape import _DistributionShape
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops.distributions import bijector
-from tensorflow.python.ops.linalg import linear_operator
-
-
-__all__ = [
-    "AffineLinearOperator",
-]
-
-
-class AffineLinearOperator(bijector.Bijector):
-  """Compute `Y = g(X; shift, scale) = scale @ X + shift`.
-
-  `shift` is a numeric `Tensor` and `scale` is a `LinearOperator`.
-
-  If `X` is a scalar then the forward transformation is: `scale * X + shift`
-  where `*` denotes the scalar product.
-
-  Note: we don't always simply transpose `X` (but write it this way for
-  brevity). Actually the input `X` undergoes the following transformation
-  before being premultiplied by `scale`:
-
-  1. If there are no sample dims, we call `X = tf.expand_dims(X, 0)`, i.e.,
-     `new_sample_shape = [1]`. Otherwise do nothing.
-  2. The sample shape is flattened to have one dimension, i.e.,
-     `new_sample_shape = [n]` where `n = tf.reduce_prod(old_sample_shape)`.
-  3. The sample dim is cyclically rotated left by 1, i.e.,
-     `new_shape = [B1,...,Bb, k, n]` where `n` is as above, `k` is the
-     event_shape, and `B1,...,Bb` are the batch shapes for each of `b` batch
-     dimensions.
-
-  (For more details see `shape.make_batch_of_event_sample_matrices`.)
-
-  The result of the above transformation is that `X` can be regarded as a batch
-  of matrices where each column is a draw from the distribution. After
-  premultiplying by `scale`, we take the inverse of this procedure. The input
-  `Y` also undergoes the same transformation before/after premultiplying by
-  `inv(scale)`.
-
-  Example Use:
-
-  ```python
-  linalg = tf.linalg
-
-  x = [1., 2, 3]
-
-  shift = [-1., 0., 1]
-  diag = [1., 2, 3]
-  scale = linalg.LinearOperatorDiag(diag)
-  affine = AffineLinearOperator(shift, scale)
-  # In this case, `forward` is equivalent to:
-  # y = scale @ x + shift
-  y = affine.forward(x)  # [0., 4, 10]
-
-  shift = [2., 3, 1]
-  tril = [[1., 0, 0],
-          [2, 1, 0],
-          [3, 2, 1]]
-  scale = linalg.LinearOperatorLowerTriangular(tril)
-  affine = AffineLinearOperator(shift, scale)
-  # In this case, `forward` is equivalent to:
-  # np.squeeze(np.matmul(tril, np.expand_dims(x, -1)), -1) + shift
-  y = affine.forward(x)  # [3., 7, 11]
-  ```
-
-  """
-
-  def __init__(self,
-               shift=None,
-               scale=None,
-               event_ndims=1,
-               validate_args=False,
-               name="affine_linear_operator"):
-    """Instantiates the `AffineLinearOperator` bijector.
-
-    Args:
-      shift: Floating-point `Tensor`.
-      scale:  Subclass of `LinearOperator`. Represents the (batch) positive
-        definite matrix `M` in `R^{k x k}`.
-      event_ndims: Scalar `integer` `Tensor` indicating the number of dimensions
-        associated with a particular draw from the distribution. Must be 0 or 1.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-
-    Raises:
-      ValueError: if `event_ndims` is not 0 or 1.
-      TypeError: if `scale` is not a `LinearOperator`.
-      TypeError: if `shift.dtype` does not match `scale.dtype`.
-      ValueError: if not `scale.is_non_singular`.
-    """
-    self._graph_parents = []
-    self._name = name
-    self._validate_args = validate_args
-    graph_parents = []
-    with self._name_scope("init", values=[shift]):
-      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-      if tensor_util.constant_value(event_ndims) is not None:
-        event_ndims = tensor_util.constant_value(event_ndims)
-        if event_ndims not in (0, 1):
-          raise ValueError("event_ndims({}) was not 0 or 1".format(event_ndims))
-      else:
-        if validate_args:
-          # Shape tool will catch if event_ndims is negative.
-          event_ndims = control_flow_ops.with_dependencies(
-              [check_ops.assert_less(
-                  event_ndims, 2, message="event_ndims must be 0 or 1")],
-              event_ndims)
-        graph_parents += [event_ndims]
-
-      # In the absence of `loc` and `scale`, we'll assume `dtype` is `float32`.
-      dtype = dtypes.float32
-
-      if shift is not None:
-        shift = ops.convert_to_tensor(shift, name="shift")
-        graph_parents += [shift]
-        dtype = shift.dtype.base_dtype
-      self._shift = shift
-
-      if scale is not None:
-        if (shift is not None and
-            shift.dtype.base_dtype != scale.dtype.base_dtype):
-          raise TypeError(
-              "shift.dtype({}) is incompatible with scale.dtype({}).".format(
-                  shift.dtype, scale.dtype))
-        if not isinstance(scale, linear_operator.LinearOperator):
-          raise TypeError("scale is not an instance of tf.LinearOperator")
-        if validate_args and not scale.is_non_singular:
-          raise ValueError("Scale matrix must be non-singular.")
-        graph_parents += scale.graph_parents
-        if scale.tensor_rank is not None:
-          batch_ndims = scale.tensor_rank - 2
-        else:
-          batch_ndims = scale.tensor_rank_tensor() - 2
-          graph_parents += [batch_ndims]
-        if scale.dtype is not None:
-          dtype = scale.dtype.base_dtype
-      else:
-        batch_ndims = 0  # We won't need shape inference when scale is None.
-      self._scale = scale
-      self._shaper = _DistributionShape(
-          batch_ndims=batch_ndims,
-          event_ndims=event_ndims,
-          validate_args=validate_args)
-      super(AffineLinearOperator, self).__init__(
-          event_ndims=event_ndims,
-          graph_parents=graph_parents,
-          is_constant_jacobian=True,
-          dtype=dtype,
-          validate_args=validate_args,
-          name=name)
-
-  @property
-  def shift(self):
-    """The `shift` `Tensor` in `Y = scale @ X + shift`."""
-    return self._shift
-
-  @property
-  def scale(self):
-    """The `scale` `LinearOperator` in `Y = scale @ X + shift`."""
-    return self._scale
-
-  def _forward(self, x):
-    y = x
-    if self.scale is not None:
-      y, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
-          y, expand_batch_dim=False)
-      with ops.control_dependencies(self._maybe_collect_assertions() if
-                                    self.validate_args else []):
-        y = self.scale.matmul(y)
-      y = self._shaper.undo_make_batch_of_event_sample_matrices(
-          y, sample_shape, expand_batch_dim=False)
-    if self.shift is not None:
-      y += self.shift
-    return y
-
-  def _inverse(self, y):
-    x = y
-    if self.shift is not None:
-      x -= self.shift
-    if self.scale is not None:
-      x, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
-          x, expand_batch_dim=False)
-      # Solve fails if the op is singular so we may safely skip this assertion.
-      x = self.scale.solve(x)
-      x = self._shaper.undo_make_batch_of_event_sample_matrices(
-          x, sample_shape, expand_batch_dim=False)
-    return x
-
-  def _inverse_log_det_jacobian(self, y):
-    return -self._forward_log_det_jacobian(y)
-
-  def _forward_log_det_jacobian(self, x):  # pylint: disable=unused-argument
-    if self.scale is None:
-      return constant_op.constant(0, dtype=x.dtype.base_dtype)
-    with ops.control_dependencies(self._maybe_collect_assertions() if
-                                  self.validate_args else []):
-      return self.scale.log_abs_determinant()
-
-  def _maybe_collect_assertions(self):
-    try:
-      return [self.scale.assert_non_singular()]
-    except NotImplementedError:
-      pass
-    return []
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/chain.py b/tensorflow/contrib/distributions/python/ops/bijectors/chain.py
index 0db10fb75c8483a8209f39370362b05a03d047ca..3ce7c26213034c7345a20faa803c94a1bfa8d579 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/chain.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/chain.py
@@ -18,12 +18,151 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.chain_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+import itertools
 
-_allowed_symbols = ["Chain"]
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops.distributions import bijector
 
-remove_undocumented(__name__, _allowed_symbols)
+
+__all__ = [
+    "Chain",
+]
+
+
+class Chain(bijector.Bijector):
+  """Bijector which applies a sequence of bijectors.
+
+  Example Use:
+
+  ```python
+  chain = Chain([Exp(), Softplus()], name="one_plus_exp")
+  ```
+
+  Results in:
+
+  * Forward:
+
+   ```python
+   exp = Exp()
+   softplus = Softplus()
+   Chain([exp, softplus]).forward(x)
+   = exp.forward(softplus.forward(x))
+   = tf.exp(tf.log(1. + tf.exp(x)))
+   = 1. + tf.exp(x)
+   ```
+
+  * Inverse:
+
+   ```python
+   exp = Exp()
+   softplus = Softplus()
+   Chain([exp, softplus]).inverse(y)
+   = softplus.inverse(exp.inverse(y))
+   = tf.log(tf.exp(tf.log(y)) - 1.)
+   = tf.log(y - 1.)
+   ```
+
+  """
+
+  def __init__(self, bijectors=None, validate_args=False, name=None):
+    """Instantiates `Chain` bijector.
+
+    Args:
+      bijectors: Python `list` of bijector instances. An empty list makes this
+        bijector equivalent to the `Identity` bijector.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str`, name given to ops managed by this object. Default:
+        E.g., `Chain([Exp(), Softplus()]).name == "chain_of_exp_of_softplus"`.
+
+    Raises:
+      ValueError: if bijectors have different dtypes.
+    """
+    if bijectors is None:
+      bijectors = ()
+    self._bijectors = bijectors
+
+    for a_bijector in bijectors:
+      if not a_bijector._is_injective:  # pylint: disable=protected-access
+        raise NotImplementedError(
+            "Invert is not implemented for non-injective bijector ({})".format(
+                a_bijector.name))
+
+    dtype = list(set([b.dtype for b in bijectors]))
+    if len(dtype) > 2:
+      raise ValueError("incompatible dtypes: %s" % dtype)
+    elif len(dtype) == 2:
+      dtype = dtype[1] if dtype[0] is None else dtype[0]
+      event_ndims = bijectors[0].event_ndims
+    elif len(dtype) == 1:
+      dtype = dtype[0]
+      event_ndims = bijectors[0].event_ndims
+    else:
+      dtype = None
+      event_ndims = None
+
+    super(Chain, self).__init__(
+        graph_parents=list(itertools.chain.from_iterable(
+            b.graph_parents for b in bijectors)),
+        is_constant_jacobian=all(b.is_constant_jacobian for b in bijectors),
+        validate_args=validate_args,
+        dtype=dtype,
+        event_ndims=event_ndims,
+        name=name or ("identity" if not bijectors else
+                      "_of_".join(["chain"] + [b.name for b in bijectors])))
+
+  @property
+  def bijectors(self):
+    return self._bijectors
+
+  def _shape_helper(self, func_name, input_shape, reverse):
+    new_shape = input_shape
+    for b in reversed(self.bijectors) if reverse else self.bijectors:
+      func = getattr(b, func_name, None)
+      if func is None:
+        raise ValueError("unable to call %s on bijector %s (%s)" %
+                         (func_name, b.name, func))
+      new_shape = func(new_shape)
+    return new_shape
+
+  def _forward_event_shape(self, input_shape):
+    return self._shape_helper("forward_event_shape", input_shape,
+                              reverse=True)
+
+  def _forward_event_shape_tensor(self, input_shape):
+    return self._shape_helper(
+        "forward_event_shape_tensor", input_shape, reverse=True)
+
+  def _inverse_event_shape(self, output_shape):
+    return self._shape_helper("inverse_event_shape", output_shape,
+                              reverse=False)
+
+  def _inverse_event_shape_tensor(self, output_shape):
+    return self._shape_helper("inverse_event_shape_tensor", output_shape,
+                              reverse=False)
+
+  def _inverse(self, y, **kwargs):
+    for b in self.bijectors:
+      y = b.inverse(y, **kwargs.get(b.name, {}))
+    return y
+
+  def _inverse_log_det_jacobian(self, y, **kwargs):
+    ildj = constant_op.constant(0., dtype=y.dtype,
+                                name="inverse_log_det_jacobian")
+    for b in self.bijectors:
+      ildj += b.inverse_log_det_jacobian(y, **kwargs.get(b.name, {}))
+      y = b.inverse(y, **kwargs.get(b.name, {}))
+    return ildj
+
+  def _forward(self, x, **kwargs):
+    for b in reversed(self.bijectors):
+      x = b.forward(x, **kwargs.get(b.name, {}))
+    return x
+
+  def _forward_log_det_jacobian(self, x, **kwargs):
+    fldj = constant_op.constant(0., dtype=x.dtype,
+                                name="forward_log_det_jacobian")
+    for b in reversed(self.bijectors):
+      fldj += b.forward_log_det_jacobian(x, **kwargs.get(b.name, {}))
+      x = b.forward(x, **kwargs.get(b.name, {}))
+    return fldj
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/chain_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/chain_impl.py
deleted file mode 100644
index 3ce7c26213034c7345a20faa803c94a1bfa8d579..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/chain_impl.py
+++ /dev/null
@@ -1,168 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Chain bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import itertools
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.ops.distributions import bijector
-
-
-__all__ = [
-    "Chain",
-]
-
-
-class Chain(bijector.Bijector):
-  """Bijector which applies a sequence of bijectors.
-
-  Example Use:
-
-  ```python
-  chain = Chain([Exp(), Softplus()], name="one_plus_exp")
-  ```
-
-  Results in:
-
-  * Forward:
-
-   ```python
-   exp = Exp()
-   softplus = Softplus()
-   Chain([exp, softplus]).forward(x)
-   = exp.forward(softplus.forward(x))
-   = tf.exp(tf.log(1. + tf.exp(x)))
-   = 1. + tf.exp(x)
-   ```
-
-  * Inverse:
-
-   ```python
-   exp = Exp()
-   softplus = Softplus()
-   Chain([exp, softplus]).inverse(y)
-   = softplus.inverse(exp.inverse(y))
-   = tf.log(tf.exp(tf.log(y)) - 1.)
-   = tf.log(y - 1.)
-   ```
-
-  """
-
-  def __init__(self, bijectors=None, validate_args=False, name=None):
-    """Instantiates `Chain` bijector.
-
-    Args:
-      bijectors: Python `list` of bijector instances. An empty list makes this
-        bijector equivalent to the `Identity` bijector.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str`, name given to ops managed by this object. Default:
-        E.g., `Chain([Exp(), Softplus()]).name == "chain_of_exp_of_softplus"`.
-
-    Raises:
-      ValueError: if bijectors have different dtypes.
-    """
-    if bijectors is None:
-      bijectors = ()
-    self._bijectors = bijectors
-
-    for a_bijector in bijectors:
-      if not a_bijector._is_injective:  # pylint: disable=protected-access
-        raise NotImplementedError(
-            "Invert is not implemented for non-injective bijector ({})".format(
-                a_bijector.name))
-
-    dtype = list(set([b.dtype for b in bijectors]))
-    if len(dtype) > 2:
-      raise ValueError("incompatible dtypes: %s" % dtype)
-    elif len(dtype) == 2:
-      dtype = dtype[1] if dtype[0] is None else dtype[0]
-      event_ndims = bijectors[0].event_ndims
-    elif len(dtype) == 1:
-      dtype = dtype[0]
-      event_ndims = bijectors[0].event_ndims
-    else:
-      dtype = None
-      event_ndims = None
-
-    super(Chain, self).__init__(
-        graph_parents=list(itertools.chain.from_iterable(
-            b.graph_parents for b in bijectors)),
-        is_constant_jacobian=all(b.is_constant_jacobian for b in bijectors),
-        validate_args=validate_args,
-        dtype=dtype,
-        event_ndims=event_ndims,
-        name=name or ("identity" if not bijectors else
-                      "_of_".join(["chain"] + [b.name for b in bijectors])))
-
-  @property
-  def bijectors(self):
-    return self._bijectors
-
-  def _shape_helper(self, func_name, input_shape, reverse):
-    new_shape = input_shape
-    for b in reversed(self.bijectors) if reverse else self.bijectors:
-      func = getattr(b, func_name, None)
-      if func is None:
-        raise ValueError("unable to call %s on bijector %s (%s)" %
-                         (func_name, b.name, func))
-      new_shape = func(new_shape)
-    return new_shape
-
-  def _forward_event_shape(self, input_shape):
-    return self._shape_helper("forward_event_shape", input_shape,
-                              reverse=True)
-
-  def _forward_event_shape_tensor(self, input_shape):
-    return self._shape_helper(
-        "forward_event_shape_tensor", input_shape, reverse=True)
-
-  def _inverse_event_shape(self, output_shape):
-    return self._shape_helper("inverse_event_shape", output_shape,
-                              reverse=False)
-
-  def _inverse_event_shape_tensor(self, output_shape):
-    return self._shape_helper("inverse_event_shape_tensor", output_shape,
-                              reverse=False)
-
-  def _inverse(self, y, **kwargs):
-    for b in self.bijectors:
-      y = b.inverse(y, **kwargs.get(b.name, {}))
-    return y
-
-  def _inverse_log_det_jacobian(self, y, **kwargs):
-    ildj = constant_op.constant(0., dtype=y.dtype,
-                                name="inverse_log_det_jacobian")
-    for b in self.bijectors:
-      ildj += b.inverse_log_det_jacobian(y, **kwargs.get(b.name, {}))
-      y = b.inverse(y, **kwargs.get(b.name, {}))
-    return ildj
-
-  def _forward(self, x, **kwargs):
-    for b in reversed(self.bijectors):
-      x = b.forward(x, **kwargs.get(b.name, {}))
-    return x
-
-  def _forward_log_det_jacobian(self, x, **kwargs):
-    fldj = constant_op.constant(0., dtype=x.dtype,
-                                name="forward_log_det_jacobian")
-    for b in reversed(self.bijectors):
-      fldj += b.forward_log_det_jacobian(x, **kwargs.get(b.name, {}))
-      x = b.forward(x, **kwargs.get(b.name, {}))
-    return fldj
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
index 4686af8bc42a3232cb3a34f2cfcce8323c5896dd..cbd60f92a60612c6cf791b2c7708a3310c6e2b6b 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
@@ -18,12 +18,219 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.cholesky_outer_product_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+import numpy as np
 
-_allowed_symbols = ["CholeskyOuterProduct"]
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
+from tensorflow.python.ops.distributions import util as distribution_util
 
-remove_undocumented(__name__, _allowed_symbols)
+
+__all__ = [
+    "CholeskyOuterProduct",
+]
+
+
+class CholeskyOuterProduct(bijector.Bijector):
+  """Compute `g(X) = X @ X.T`; X is lower-triangular, positive-diagonal matrix.
+
+  `event_ndims` must be 0 or 2, i.e., scalar or matrix.
+
+  Note: the upper-triangular part of X is ignored (whether or not its zero).
+
+  The surjectivity of g as a map from  the set of n x n positive-diagonal
+  lower-triangular matrices to the set of SPD matrices follows immediately from
+  executing the Cholesky factorization algorithm on an SPD matrix A to produce a
+  positive-diagonal lower-triangular matrix L such that `A = L @ L.T`.
+
+  To prove the injectivity of g, suppose that L_1 and L_2 are lower-triangular
+  with positive diagonals and satisfy `A = L_1 @ L_1.T = L_2 @ L_2.T`. Then
+    `inv(L_1) @ A @ inv(L_1).T = [inv(L_1) @ L_2] @ [inv(L_1) @ L_2].T = I`.
+  Setting `L_3 := inv(L_1) @ L_2`, that L_3 is a positive-diagonal
+  lower-triangular matrix follows from `inv(L_1)` being positive-diagonal
+  lower-triangular (which follows from the diagonal of a triangular matrix being
+  its spectrum), and that the product of two positive-diagonal lower-triangular
+  matrices is another positive-diagonal lower-triangular matrix.
+
+  A simple inductive argument (proceding one column of L_3 at a time) shows
+  that, if `I = L_3 @ L_3.T`, with L_3 being lower-triangular with positive-
+  diagonal, then `L_3 = I`. Thus, `L_1 = L_2`, proving injectivity of g.
+
+  Examples:
+
+  ```python
+  bijector.CholeskyOuterProduct(event_ndims=2).forward(x=[[1., 0], [2, 1]])
+  # Result: [[1., 2], [2, 5]], i.e., x @ x.T
+
+  bijector.CholeskyOuterProduct(event_ndims=2).inverse(y=[[1., 2], [2, 5]])
+  # Result: [[1., 0], [2, 1]], i.e., cholesky(y).
+  ```
+
+  """
+
+  def __init__(self, event_ndims=2, validate_args=False,
+               name="cholesky_outer_product"):
+    """Instantiates the `CholeskyOuterProduct` bijector.
+
+    Args:
+      event_ndims: `constant` `int32` scalar `Tensor` indicating the number of
+        dimensions associated with a particular draw from the distribution. Must
+        be 0 or 2.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+
+    Raises:
+      ValueError: if event_ndims is neither 0 or 2.
+    """
+    self._graph_parents = []
+    self._name = name
+    with self._name_scope("init", values=[event_ndims]):
+      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
+      event_ndims = tensor_util.constant_value(event_ndims)
+    if event_ndims is None or event_ndims not in [0, 2]:
+      raise ValueError("`event_ndims` must be a TF constant which is 0 or 2")
+    self._static_event_ndims = event_ndims
+    super(CholeskyOuterProduct, self).__init__(
+        event_ndims=event_ndims,
+        validate_args=validate_args,
+        name=name)
+
+  def _forward(self, x):
+    if self._static_event_ndims == 0:
+      return math_ops.square(x)
+    if self.validate_args:
+      is_matrix = check_ops.assert_rank_at_least(x, 2)
+      shape = array_ops.shape(x)
+      is_square = check_ops.assert_equal(shape[-2], shape[-1])
+      x = control_flow_ops.with_dependencies([is_matrix, is_square], x)
+    # For safety, explicitly zero-out the upper triangular part.
+    x = array_ops.matrix_band_part(x, -1, 0)
+    return math_ops.matmul(x, x, adjoint_b=True)
+
+  def _inverse(self, y):
+    return (math_ops.sqrt(y) if self._static_event_ndims == 0
+            else linalg_ops.cholesky(y))
+
+  def _inverse_log_det_jacobian(self, y):
+    return -self._forward_log_det_jacobian(x=self._inverse(y))
+
+  def _forward_log_det_jacobian(self, x):
+    # Let Y be a symmetric, positive definite matrix and write:
+    #   Y = X X.T
+    # where X is lower-triangular.
+    #
+    # Observe that,
+    #   dY[i,j]/dX[a,b]
+    #   = d/dX[a,b] { X[i,:] X[j,:] }
+    #   = sum_{d=1}^p { I[i=a] I[d=b] X[j,d] + I[j=a] I[d=b] X[i,d] }
+    #
+    # To compute the Jacobian dX/dY we must represent X,Y as vectors. Since Y is
+    # symmetric and X is lower-triangular, we need vectors of dimension:
+    #   d = p (p + 1) / 2
+    # where X, Y are p x p matrices, p > 0. We use a row-major mapping, i.e.,
+    #   k = { i (i + 1) / 2 + j   i>=j
+    #       { undef               i<j
+    # and assume zero-based indexes. When k is undef, the element is dropped.
+    # Example:
+    #           j      k
+    #        0 1 2 3  /
+    #    0 [ 0 . . . ]
+    # i  1 [ 1 2 . . ]
+    #    2 [ 3 4 5 . ]
+    #    3 [ 6 7 8 9 ]
+    # Write vec[.] to indicate transforming a matrix to vector via k(i,j). (With
+    # slight abuse: k(i,j)=undef means the element is dropped.)
+    #
+    # We now show d vec[Y] / d vec[X] is lower triangular. Assuming both are
+    # defined, observe that k(i,j) < k(a,b) iff (1) i<a or (2) i=a and j<b.
+    # In both cases dvec[Y]/dvec[X]@[k(i,j),k(a,b)] = 0 since:
+    # (1) j<=i<a thus i,j!=a.
+    # (2) i=a>j  thus i,j!=a.
+    #
+    # Since the Jacobian is lower-triangular, we need only compute the product
+    # of diagonal elements:
+    #   d vec[Y] / d vec[X] @[k(i,j), k(i,j)]
+    #   = X[j,j] + I[i=j] X[i,j]
+    #   = 2 X[j,j].
+    # Since there is a 2 X[j,j] term for every lower-triangular element of X we
+    # conclude:
+    #   |Jac(d vec[Y]/d vec[X])| = 2^p prod_{j=0}^{p-1} X[j,j]^{p-j}.
+    if self._static_event_ndims == 0:
+      if self.validate_args:
+        is_positive = check_ops.assert_positive(
+            x, message="All elements must be positive.")
+        x = control_flow_ops.with_dependencies([is_positive], x)
+      return np.log(2.) + math_ops.log(x)
+
+    diag = array_ops.matrix_diag_part(x)
+
+    # We now ensure diag is columnar. Eg, if `diag = [1, 2, 3]` then the output
+    # is `[[1], [2], [3]]` and if `diag = [[1, 2, 3], [4, 5, 6]]` then the
+    # output is unchanged.
+    diag = self._make_columnar(diag)
+
+    if self.validate_args:
+      is_matrix = check_ops.assert_rank_at_least(
+          x, 2, message="Input must be a (batch of) matrix.")
+      shape = array_ops.shape(x)
+      is_square = check_ops.assert_equal(
+          shape[-2], shape[-1],
+          message="Input must be a (batch of) square matrix.")
+      # Assuming lower-triangular means we only need check diag>0.
+      is_positive_definite = check_ops.assert_positive(
+          diag, message="Input must be positive definite.")
+      x = control_flow_ops.with_dependencies(
+          [is_matrix, is_square, is_positive_definite], x)
+
+    # Create a vector equal to: [p, p-1, ..., 2, 1].
+    if x.get_shape().ndims is None or x.get_shape()[-1].value is None:
+      p_int = array_ops.shape(x)[-1]
+      p_float = math_ops.cast(p_int, dtype=x.dtype)
+    else:
+      p_int = x.get_shape()[-1].value
+      p_float = np.array(p_int, dtype=x.dtype.as_numpy_dtype)
+    exponents = math_ops.linspace(p_float, 1., p_int)
+
+    sum_weighted_log_diag = array_ops.squeeze(
+        math_ops.matmul(math_ops.log(diag),
+                        exponents[..., array_ops.newaxis]),
+        squeeze_dims=-1)
+    fldj = p_float * np.log(2.) + sum_weighted_log_diag
+
+    return fldj
+
+  def _make_columnar(self, x):
+    """Ensures non-scalar input has at least one column.
+
+    Example:
+      If `x = [1, 2, 3]` then the output is `[[1], [2], [3]]`.
+
+      If `x = [[1, 2, 3], [4, 5, 6]]` then the output is unchanged.
+
+      If `x = 1` then the output is unchanged.
+
+    Args:
+      x: `Tensor`.
+
+    Returns:
+      columnar_x: `Tensor` with at least two dimensions.
+    """
+    if x.get_shape().ndims is not None:
+      if x.get_shape().ndims == 1:
+        x = x[array_ops.newaxis, :]
+      return x
+    shape = array_ops.shape(x)
+    maybe_expanded_shape = array_ops.concat([
+        shape[:-1],
+        distribution_util.pick_vector(
+            math_ops.equal(array_ops.rank(x), 1),
+            [1], np.array([], dtype=np.int32)),
+        shape[-1:],
+    ], 0)
+    return array_ops.reshape(x, maybe_expanded_shape)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product_impl.py
deleted file mode 100644
index cbd60f92a60612c6cf791b2c7708a3310c6e2b6b..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product_impl.py
+++ /dev/null
@@ -1,236 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""CholeskyOuterProduct bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import linalg_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector
-from tensorflow.python.ops.distributions import util as distribution_util
-
-
-__all__ = [
-    "CholeskyOuterProduct",
-]
-
-
-class CholeskyOuterProduct(bijector.Bijector):
-  """Compute `g(X) = X @ X.T`; X is lower-triangular, positive-diagonal matrix.
-
-  `event_ndims` must be 0 or 2, i.e., scalar or matrix.
-
-  Note: the upper-triangular part of X is ignored (whether or not its zero).
-
-  The surjectivity of g as a map from  the set of n x n positive-diagonal
-  lower-triangular matrices to the set of SPD matrices follows immediately from
-  executing the Cholesky factorization algorithm on an SPD matrix A to produce a
-  positive-diagonal lower-triangular matrix L such that `A = L @ L.T`.
-
-  To prove the injectivity of g, suppose that L_1 and L_2 are lower-triangular
-  with positive diagonals and satisfy `A = L_1 @ L_1.T = L_2 @ L_2.T`. Then
-    `inv(L_1) @ A @ inv(L_1).T = [inv(L_1) @ L_2] @ [inv(L_1) @ L_2].T = I`.
-  Setting `L_3 := inv(L_1) @ L_2`, that L_3 is a positive-diagonal
-  lower-triangular matrix follows from `inv(L_1)` being positive-diagonal
-  lower-triangular (which follows from the diagonal of a triangular matrix being
-  its spectrum), and that the product of two positive-diagonal lower-triangular
-  matrices is another positive-diagonal lower-triangular matrix.
-
-  A simple inductive argument (proceding one column of L_3 at a time) shows
-  that, if `I = L_3 @ L_3.T`, with L_3 being lower-triangular with positive-
-  diagonal, then `L_3 = I`. Thus, `L_1 = L_2`, proving injectivity of g.
-
-  Examples:
-
-  ```python
-  bijector.CholeskyOuterProduct(event_ndims=2).forward(x=[[1., 0], [2, 1]])
-  # Result: [[1., 2], [2, 5]], i.e., x @ x.T
-
-  bijector.CholeskyOuterProduct(event_ndims=2).inverse(y=[[1., 2], [2, 5]])
-  # Result: [[1., 0], [2, 1]], i.e., cholesky(y).
-  ```
-
-  """
-
-  def __init__(self, event_ndims=2, validate_args=False,
-               name="cholesky_outer_product"):
-    """Instantiates the `CholeskyOuterProduct` bijector.
-
-    Args:
-      event_ndims: `constant` `int32` scalar `Tensor` indicating the number of
-        dimensions associated with a particular draw from the distribution. Must
-        be 0 or 2.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-
-    Raises:
-      ValueError: if event_ndims is neither 0 or 2.
-    """
-    self._graph_parents = []
-    self._name = name
-    with self._name_scope("init", values=[event_ndims]):
-      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-      event_ndims = tensor_util.constant_value(event_ndims)
-    if event_ndims is None or event_ndims not in [0, 2]:
-      raise ValueError("`event_ndims` must be a TF constant which is 0 or 2")
-    self._static_event_ndims = event_ndims
-    super(CholeskyOuterProduct, self).__init__(
-        event_ndims=event_ndims,
-        validate_args=validate_args,
-        name=name)
-
-  def _forward(self, x):
-    if self._static_event_ndims == 0:
-      return math_ops.square(x)
-    if self.validate_args:
-      is_matrix = check_ops.assert_rank_at_least(x, 2)
-      shape = array_ops.shape(x)
-      is_square = check_ops.assert_equal(shape[-2], shape[-1])
-      x = control_flow_ops.with_dependencies([is_matrix, is_square], x)
-    # For safety, explicitly zero-out the upper triangular part.
-    x = array_ops.matrix_band_part(x, -1, 0)
-    return math_ops.matmul(x, x, adjoint_b=True)
-
-  def _inverse(self, y):
-    return (math_ops.sqrt(y) if self._static_event_ndims == 0
-            else linalg_ops.cholesky(y))
-
-  def _inverse_log_det_jacobian(self, y):
-    return -self._forward_log_det_jacobian(x=self._inverse(y))
-
-  def _forward_log_det_jacobian(self, x):
-    # Let Y be a symmetric, positive definite matrix and write:
-    #   Y = X X.T
-    # where X is lower-triangular.
-    #
-    # Observe that,
-    #   dY[i,j]/dX[a,b]
-    #   = d/dX[a,b] { X[i,:] X[j,:] }
-    #   = sum_{d=1}^p { I[i=a] I[d=b] X[j,d] + I[j=a] I[d=b] X[i,d] }
-    #
-    # To compute the Jacobian dX/dY we must represent X,Y as vectors. Since Y is
-    # symmetric and X is lower-triangular, we need vectors of dimension:
-    #   d = p (p + 1) / 2
-    # where X, Y are p x p matrices, p > 0. We use a row-major mapping, i.e.,
-    #   k = { i (i + 1) / 2 + j   i>=j
-    #       { undef               i<j
-    # and assume zero-based indexes. When k is undef, the element is dropped.
-    # Example:
-    #           j      k
-    #        0 1 2 3  /
-    #    0 [ 0 . . . ]
-    # i  1 [ 1 2 . . ]
-    #    2 [ 3 4 5 . ]
-    #    3 [ 6 7 8 9 ]
-    # Write vec[.] to indicate transforming a matrix to vector via k(i,j). (With
-    # slight abuse: k(i,j)=undef means the element is dropped.)
-    #
-    # We now show d vec[Y] / d vec[X] is lower triangular. Assuming both are
-    # defined, observe that k(i,j) < k(a,b) iff (1) i<a or (2) i=a and j<b.
-    # In both cases dvec[Y]/dvec[X]@[k(i,j),k(a,b)] = 0 since:
-    # (1) j<=i<a thus i,j!=a.
-    # (2) i=a>j  thus i,j!=a.
-    #
-    # Since the Jacobian is lower-triangular, we need only compute the product
-    # of diagonal elements:
-    #   d vec[Y] / d vec[X] @[k(i,j), k(i,j)]
-    #   = X[j,j] + I[i=j] X[i,j]
-    #   = 2 X[j,j].
-    # Since there is a 2 X[j,j] term for every lower-triangular element of X we
-    # conclude:
-    #   |Jac(d vec[Y]/d vec[X])| = 2^p prod_{j=0}^{p-1} X[j,j]^{p-j}.
-    if self._static_event_ndims == 0:
-      if self.validate_args:
-        is_positive = check_ops.assert_positive(
-            x, message="All elements must be positive.")
-        x = control_flow_ops.with_dependencies([is_positive], x)
-      return np.log(2.) + math_ops.log(x)
-
-    diag = array_ops.matrix_diag_part(x)
-
-    # We now ensure diag is columnar. Eg, if `diag = [1, 2, 3]` then the output
-    # is `[[1], [2], [3]]` and if `diag = [[1, 2, 3], [4, 5, 6]]` then the
-    # output is unchanged.
-    diag = self._make_columnar(diag)
-
-    if self.validate_args:
-      is_matrix = check_ops.assert_rank_at_least(
-          x, 2, message="Input must be a (batch of) matrix.")
-      shape = array_ops.shape(x)
-      is_square = check_ops.assert_equal(
-          shape[-2], shape[-1],
-          message="Input must be a (batch of) square matrix.")
-      # Assuming lower-triangular means we only need check diag>0.
-      is_positive_definite = check_ops.assert_positive(
-          diag, message="Input must be positive definite.")
-      x = control_flow_ops.with_dependencies(
-          [is_matrix, is_square, is_positive_definite], x)
-
-    # Create a vector equal to: [p, p-1, ..., 2, 1].
-    if x.get_shape().ndims is None or x.get_shape()[-1].value is None:
-      p_int = array_ops.shape(x)[-1]
-      p_float = math_ops.cast(p_int, dtype=x.dtype)
-    else:
-      p_int = x.get_shape()[-1].value
-      p_float = np.array(p_int, dtype=x.dtype.as_numpy_dtype)
-    exponents = math_ops.linspace(p_float, 1., p_int)
-
-    sum_weighted_log_diag = array_ops.squeeze(
-        math_ops.matmul(math_ops.log(diag),
-                        exponents[..., array_ops.newaxis]),
-        squeeze_dims=-1)
-    fldj = p_float * np.log(2.) + sum_weighted_log_diag
-
-    return fldj
-
-  def _make_columnar(self, x):
-    """Ensures non-scalar input has at least one column.
-
-    Example:
-      If `x = [1, 2, 3]` then the output is `[[1], [2], [3]]`.
-
-      If `x = [[1, 2, 3], [4, 5, 6]]` then the output is unchanged.
-
-      If `x = 1` then the output is unchanged.
-
-    Args:
-      x: `Tensor`.
-
-    Returns:
-      columnar_x: `Tensor` with at least two dimensions.
-    """
-    if x.get_shape().ndims is not None:
-      if x.get_shape().ndims == 1:
-        x = x[array_ops.newaxis, :]
-      return x
-    shape = array_ops.shape(x)
-    maybe_expanded_shape = array_ops.concat([
-        shape[:-1],
-        distribution_util.pick_vector(
-            math_ops.equal(array_ops.rank(x), 1),
-            [1], np.array([], dtype=np.int32)),
-        shape[-1:],
-    ], 0)
-    return array_ops.reshape(x, maybe_expanded_shape)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py b/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py
index d254b635d28099a09a2054536f04ffee3a355b2f..ccb1f029277bc07011df7be047a075274f2b3a27 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py
@@ -18,12 +18,38 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.conditional_bijector_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.ops.distributions import bijector
+from tensorflow.python.ops.distributions import util as distribution_util
 
-_allowed_symbols = ["ConditionalBijector"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = ["ConditionalBijector"]
+
+
+class ConditionalBijector(bijector.Bijector):
+  """Conditional Bijector is a Bijector that allows intrinsic conditioning."""
+
+  @distribution_util.AppendDocstring(kwargs_dict={
+      "**condition_kwargs":
+      "Named arguments forwarded to subclass implementation."})
+  def forward(self, x, name="forward", **condition_kwargs):
+    return self._call_forward(x, name, **condition_kwargs)
+
+  @distribution_util.AppendDocstring(kwargs_dict={
+      "**condition_kwargs":
+      "Named arguments forwarded to subclass implementation."})
+  def inverse(self, y, name="inverse", **condition_kwargs):
+    return self._call_inverse(y, name, **condition_kwargs)
+
+  @distribution_util.AppendDocstring(kwargs_dict={
+      "**condition_kwargs":
+      "Named arguments forwarded to subclass implementation."})
+  def inverse_log_det_jacobian(
+      self, y, name="inverse_log_det_jacobian", **condition_kwargs):
+    return self._call_inverse_log_det_jacobian(y, name, **condition_kwargs)
+
+  @distribution_util.AppendDocstring(kwargs_dict={
+      "**condition_kwargs":
+      "Named arguments forwarded to subclass implementation."})
+  def forward_log_det_jacobian(
+      self, x, name="forward_log_det_jacobian", **condition_kwargs):
+    return self._call_forward_log_det_jacobian(x, name, **condition_kwargs)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector_impl.py
deleted file mode 100644
index ccb1f029277bc07011df7be047a075274f2b3a27..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector_impl.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""ConditionalBijector base."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.ops.distributions import bijector
-from tensorflow.python.ops.distributions import util as distribution_util
-
-
-__all__ = ["ConditionalBijector"]
-
-
-class ConditionalBijector(bijector.Bijector):
-  """Conditional Bijector is a Bijector that allows intrinsic conditioning."""
-
-  @distribution_util.AppendDocstring(kwargs_dict={
-      "**condition_kwargs":
-      "Named arguments forwarded to subclass implementation."})
-  def forward(self, x, name="forward", **condition_kwargs):
-    return self._call_forward(x, name, **condition_kwargs)
-
-  @distribution_util.AppendDocstring(kwargs_dict={
-      "**condition_kwargs":
-      "Named arguments forwarded to subclass implementation."})
-  def inverse(self, y, name="inverse", **condition_kwargs):
-    return self._call_inverse(y, name, **condition_kwargs)
-
-  @distribution_util.AppendDocstring(kwargs_dict={
-      "**condition_kwargs":
-      "Named arguments forwarded to subclass implementation."})
-  def inverse_log_det_jacobian(
-      self, y, name="inverse_log_det_jacobian", **condition_kwargs):
-    return self._call_inverse_log_det_jacobian(y, name, **condition_kwargs)
-
-  @distribution_util.AppendDocstring(kwargs_dict={
-      "**condition_kwargs":
-      "Named arguments forwarded to subclass implementation."})
-  def forward_log_det_jacobian(
-      self, x, name="forward_log_det_jacobian", **condition_kwargs):
-    return self._call_forward_log_det_jacobian(x, name, **condition_kwargs)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/exp.py b/tensorflow/contrib/distributions/python/ops/bijectors/exp.py
index 399d713098eb7223601beb9518dc51dd6160ad64..b1ff840d62a73c941a4d67dec73b5c9f4d5353f9 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/exp.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/exp.py
@@ -18,12 +18,49 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.exp_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.contrib.distributions.python.ops.bijectors import power_transform
 
-_allowed_symbols = ["Exp"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "Exp",
+]
+
+
+class Exp(power_transform.PowerTransform):
+  """Compute `Y = g(X) = exp(X)`.
+
+    Example Use:
+
+    ```python
+    # Create the Y=g(X)=exp(X) transform which works only on Tensors with 1
+    # batch ndim and 2 event ndims (i.e., vector of matrices).
+    exp = Exp(event_ndims=2)
+    x = [[[1., 2],
+           [3, 4]],
+          [[5, 6],
+           [7, 8]]]
+    exp(x) == exp.forward(x)
+    log(x) == exp.inverse(x)
+    ```
+
+    Note: the exp(.) is applied element-wise but the Jacobian is a reduction
+    over the event space.
+  """
+
+  def __init__(self,
+               event_ndims=0,
+               validate_args=False,
+               name="exp"):
+    """Instantiates the `Exp` bijector.
+
+    Args:
+      event_ndims: Scalar `int32` `Tensor` indicating the number of dimensions
+        associated with a particular draw from the distribution.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+    """
+    super(Exp, self).__init__(
+        event_ndims=event_ndims,
+        validate_args=validate_args,
+        name=name)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/exp_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/exp_impl.py
deleted file mode 100644
index b1ff840d62a73c941a4d67dec73b5c9f4d5353f9..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/exp_impl.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Exp bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.distributions.python.ops.bijectors import power_transform
-
-
-__all__ = [
-    "Exp",
-]
-
-
-class Exp(power_transform.PowerTransform):
-  """Compute `Y = g(X) = exp(X)`.
-
-    Example Use:
-
-    ```python
-    # Create the Y=g(X)=exp(X) transform which works only on Tensors with 1
-    # batch ndim and 2 event ndims (i.e., vector of matrices).
-    exp = Exp(event_ndims=2)
-    x = [[[1., 2],
-           [3, 4]],
-          [[5, 6],
-           [7, 8]]]
-    exp(x) == exp.forward(x)
-    log(x) == exp.inverse(x)
-    ```
-
-    Note: the exp(.) is applied element-wise but the Jacobian is a reduction
-    over the event space.
-  """
-
-  def __init__(self,
-               event_ndims=0,
-               validate_args=False,
-               name="exp"):
-    """Instantiates the `Exp` bijector.
-
-    Args:
-      event_ndims: Scalar `int32` `Tensor` indicating the number of dimensions
-        associated with a particular draw from the distribution.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-    """
-    super(Exp, self).__init__(
-        event_ndims=event_ndims,
-        validate_args=validate_args,
-        name=name)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py b/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py
index cf37aa51115ed98ab263bc03bcb297a03432a7ae..67f39785563255be0fe154aca3cbcf01c6a01e73 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py
@@ -18,12 +18,107 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.gumbel_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 
-_allowed_symbols = ["Gumbel"]
+__all__ = [
+    "Gumbel",
+]
 
-remove_undocumented(__name__, _allowed_symbols)
+
+class Gumbel(bijector.Bijector):
+  """Compute `Y = g(X) = exp(-exp(-(X - loc) / scale))`.
+
+  This bijector maps inputs from `[-inf, inf]` to [0, 1]`. The inverse of the
+  bijector applied to a uniform random variable `X ~ U(0, 1) gives back a
+  random variable with the
+  [Gumbel distribution](https://en.wikipedia.org/wiki/Gumbel_distribution):
+
+  ```none
+  Y ~ Gumbel(loc, scale)
+  pdf(y; loc, scale) = exp(
+    -( (y - loc) / scale + exp(- (y - loc) / scale) ) ) / scale
+  ```
+  """
+
+  def __init__(self,
+               loc=0.,
+               scale=1.,
+               event_ndims=0,
+               validate_args=False,
+               name="gumbel"):
+    """Instantiates the `Gumbel` bijector.
+
+    Args:
+      loc: Float-like `Tensor` that is the same dtype and is
+        broadcastable with `scale`.
+        This is `loc` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`.
+      scale: Positive Float-like `Tensor` that is the same dtype and is
+        broadcastable with `loc`.
+        This is `scale` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`.
+      event_ndims: Python scalar indicating the number of dimensions associated
+        with a particular draw from the distribution.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+    """
+    self._graph_parents = []
+    self._name = name
+    self._validate_args = validate_args
+    with self._name_scope("init", values=[loc, scale]):
+      self._loc = ops.convert_to_tensor(loc, name="loc")
+      self._scale = ops.convert_to_tensor(scale, name="scale")
+      check_ops.assert_same_float_dtype([self._loc, self._scale])
+      if validate_args:
+        self._scale = control_flow_ops.with_dependencies([
+            check_ops.assert_positive(
+                self._scale, message="Argument scale was not positive")
+        ], self._scale)
+
+    super(Gumbel, self).__init__(
+        event_ndims=event_ndims, validate_args=validate_args, name=name)
+
+  @property
+  def loc(self):
+    """The `loc` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`."""
+    return self._loc
+
+  @property
+  def scale(self):
+    """This is `scale` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`."""
+    return self._scale
+
+  def _forward(self, x):
+    z = (x - self.loc) / self.scale
+    return math_ops.exp(-math_ops.exp(-z))
+
+  def _inverse(self, y):
+    y = self._maybe_assert_valid_y(y)
+    return self.loc - self.scale * math_ops.log(-math_ops.log(y))
+
+  def _inverse_log_det_jacobian(self, y):
+    y = self._maybe_assert_valid_y(y)
+    event_dims = self._event_dims_tensor(y)
+    return math_ops.reduce_sum(
+        math_ops.log(self.scale / (-math_ops.log(y) * y)), axis=event_dims)
+
+  def _forward_log_det_jacobian(self, x):
+    event_dims = self._event_dims_tensor(x)
+    z = (x - self.loc) / self.scale
+    return math_ops.reduce_sum(
+        -z - math_ops.exp(-z) - math_ops.log(self.scale), axis=event_dims)
+
+  def _maybe_assert_valid_y(self, y):
+    if not self.validate_args:
+      return y
+    is_positive = check_ops.assert_non_negative(
+        y, message="Inverse transformation input must be greater than 0.")
+    less_than_one = check_ops.assert_less_equal(
+        y,
+        constant_op.constant(1., y.dtype),
+        message="Inverse transformation input must be less than or equal to 1.")
+    return control_flow_ops.with_dependencies([is_positive, less_than_one], y)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/gumbel_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/gumbel_impl.py
deleted file mode 100644
index 67f39785563255be0fe154aca3cbcf01c6a01e73..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/gumbel_impl.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Gumbel bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector
-
-__all__ = [
-    "Gumbel",
-]
-
-
-class Gumbel(bijector.Bijector):
-  """Compute `Y = g(X) = exp(-exp(-(X - loc) / scale))`.
-
-  This bijector maps inputs from `[-inf, inf]` to [0, 1]`. The inverse of the
-  bijector applied to a uniform random variable `X ~ U(0, 1) gives back a
-  random variable with the
-  [Gumbel distribution](https://en.wikipedia.org/wiki/Gumbel_distribution):
-
-  ```none
-  Y ~ Gumbel(loc, scale)
-  pdf(y; loc, scale) = exp(
-    -( (y - loc) / scale + exp(- (y - loc) / scale) ) ) / scale
-  ```
-  """
-
-  def __init__(self,
-               loc=0.,
-               scale=1.,
-               event_ndims=0,
-               validate_args=False,
-               name="gumbel"):
-    """Instantiates the `Gumbel` bijector.
-
-    Args:
-      loc: Float-like `Tensor` that is the same dtype and is
-        broadcastable with `scale`.
-        This is `loc` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`.
-      scale: Positive Float-like `Tensor` that is the same dtype and is
-        broadcastable with `loc`.
-        This is `scale` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-    """
-    self._graph_parents = []
-    self._name = name
-    self._validate_args = validate_args
-    with self._name_scope("init", values=[loc, scale]):
-      self._loc = ops.convert_to_tensor(loc, name="loc")
-      self._scale = ops.convert_to_tensor(scale, name="scale")
-      check_ops.assert_same_float_dtype([self._loc, self._scale])
-      if validate_args:
-        self._scale = control_flow_ops.with_dependencies([
-            check_ops.assert_positive(
-                self._scale, message="Argument scale was not positive")
-        ], self._scale)
-
-    super(Gumbel, self).__init__(
-        event_ndims=event_ndims, validate_args=validate_args, name=name)
-
-  @property
-  def loc(self):
-    """The `loc` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`."""
-    return self._loc
-
-  @property
-  def scale(self):
-    """This is `scale` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`."""
-    return self._scale
-
-  def _forward(self, x):
-    z = (x - self.loc) / self.scale
-    return math_ops.exp(-math_ops.exp(-z))
-
-  def _inverse(self, y):
-    y = self._maybe_assert_valid_y(y)
-    return self.loc - self.scale * math_ops.log(-math_ops.log(y))
-
-  def _inverse_log_det_jacobian(self, y):
-    y = self._maybe_assert_valid_y(y)
-    event_dims = self._event_dims_tensor(y)
-    return math_ops.reduce_sum(
-        math_ops.log(self.scale / (-math_ops.log(y) * y)), axis=event_dims)
-
-  def _forward_log_det_jacobian(self, x):
-    event_dims = self._event_dims_tensor(x)
-    z = (x - self.loc) / self.scale
-    return math_ops.reduce_sum(
-        -z - math_ops.exp(-z) - math_ops.log(self.scale), axis=event_dims)
-
-  def _maybe_assert_valid_y(self, y):
-    if not self.validate_args:
-      return y
-    is_positive = check_ops.assert_non_negative(
-        y, message="Inverse transformation input must be greater than 0.")
-    less_than_one = check_ops.assert_less_equal(
-        y,
-        constant_op.constant(1., y.dtype),
-        message="Inverse transformation input must be less than or equal to 1.")
-    return control_flow_ops.with_dependencies([is_positive, less_than_one], y)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/inline.py b/tensorflow/contrib/distributions/python/ops/bijectors/inline.py
index db10c3fc3a9135b4c408ada74622ba9b360f9ec1..fab1b22fbf92e7b92a5ec86ec62d66bec71a8c94 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/inline.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/inline.py
@@ -18,12 +18,124 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.inline_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.ops.distributions import bijector
 
-_allowed_symbols = ["Inline"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "Inline",
+]
+
+
+class Inline(bijector.Bijector):
+  """Bijector constructed from custom callables.
+
+  Example Use:
+
+  ```python
+  exp = Inline(
+    forward_fn=tf.exp,
+    inverse_fn=tf.log,
+    inverse_log_det_jacobian_fn=(
+      lambda y: -tf.reduce_sum(tf.log(y), axis=-1)),
+    name="exp")
+  ```
+
+  The above example is equivalent to the `Bijector` `Exp(event_ndims=1)`.
+  """
+
+  def __init__(self,
+               forward_fn=None,
+               inverse_fn=None,
+               inverse_log_det_jacobian_fn=None,
+               forward_log_det_jacobian_fn=None,
+               forward_event_shape_fn=None,
+               forward_event_shape_tensor_fn=None,
+               inverse_event_shape_fn=None,
+               inverse_event_shape_tensor_fn=None,
+               is_constant_jacobian=False,
+               validate_args=False,
+               name="inline"):
+    """Creates a `Bijector` from callables.
+
+    Args:
+      forward_fn: Python callable implementing the forward transformation.
+      inverse_fn: Python callable implementing the inverse transformation.
+      inverse_log_det_jacobian_fn: Python callable implementing the
+        log o det o jacobian of the inverse transformation.
+      forward_log_det_jacobian_fn: Python callable implementing the
+        log o det o jacobian of the forward transformation.
+      forward_event_shape_fn: Python callable implementing non-identical
+        static event shape changes. Default: shape is assumed unchanged.
+      forward_event_shape_tensor_fn: Python callable implementing non-identical
+        event shape changes. Default: shape is assumed unchanged.
+      inverse_event_shape_fn: Python callable implementing non-identical
+        static event shape changes. Default: shape is assumed unchanged.
+      inverse_event_shape_tensor_fn: Python callable implementing non-identical
+        event shape changes. Default: shape is assumed unchanged.
+      is_constant_jacobian: Python `bool` indicating that the Jacobian is
+        constant for all input arguments.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str`, name given to ops managed by this object.
+    """
+    super(Inline, self).__init__(
+        event_ndims=0,
+        is_constant_jacobian=is_constant_jacobian,
+        validate_args=validate_args,
+        name=name)
+    self._forward_fn = forward_fn
+    self._inverse_fn = inverse_fn
+    self._inverse_log_det_jacobian_fn = inverse_log_det_jacobian_fn
+    self._forward_log_det_jacobian_fn = forward_log_det_jacobian_fn
+    self._forward_event_shape_fn = forward_event_shape_fn
+    self._forward_event_shape_tensor_fn = forward_event_shape_tensor_fn
+    self._inverse_event_shape_fn = inverse_event_shape_fn
+    self._inverse_event_shape_tensor_fn = inverse_event_shape_tensor_fn
+
+  def _forward_event_shape(self, input_shape):
+    if self._forward_event_shape_fn is None:
+      # By default assume shape doesn't change.
+      return input_shape
+    return self._forward_event_shape_fn(input_shape)
+
+  def _forward_event_shape_tensor(self, input_shape):
+    if self._forward_event_shape_tensor_fn is None:
+      # By default assume shape doesn't change.
+      return input_shape
+    return self._forward_event_shape_tensor_fn(input_shape)
+
+  def _inverse_event_shape(self, output_shape):
+    if self._inverse_event_shape_fn is None:
+      # By default assume shape doesn't change.
+      return output_shape
+    return self._inverse_event_shape_fn(output_shape)
+
+  def _inverse_event_shape_tensor(self, output_shape):
+    if self._inverse_event_shape_tensor_fn is None:
+      # By default assume shape doesn't change.
+      return output_shape
+    return self._inverse_event_shape_tensor_fn(output_shape)
+
+  def _forward(self, x, **kwargs):
+    if not callable(self._forward_fn):
+      raise NotImplementedError(
+          "forward_fn is not a callable function.")
+    return self._forward_fn(x, **kwargs)
+
+  def _inverse(self, y, **kwargs):
+    if not callable(self._inverse_fn):
+      raise NotImplementedError(
+          "inverse_fn is not a callable function.")
+    return self._inverse_fn(y, **kwargs)
+
+  def _inverse_log_det_jacobian(self, y, **kwargs):
+    if not callable(self._inverse_log_det_jacobian_fn):
+      raise NotImplementedError(
+          "inverse_log_det_jacobian_fn is not a callable function.")
+    return self._inverse_log_det_jacobian_fn(y, **kwargs)
+
+  def _forward_log_det_jacobian(self, y, **kwargs):
+    if not callable(self._forward_log_det_jacobian_fn):
+      raise NotImplementedError(
+          "forward_log_det_jacobian_fn is not a callable function.")
+    return self._forward_log_det_jacobian_fn(y, **kwargs)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/inline_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/inline_impl.py
deleted file mode 100644
index fab1b22fbf92e7b92a5ec86ec62d66bec71a8c94..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/inline_impl.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Inline bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.ops.distributions import bijector
-
-
-__all__ = [
-    "Inline",
-]
-
-
-class Inline(bijector.Bijector):
-  """Bijector constructed from custom callables.
-
-  Example Use:
-
-  ```python
-  exp = Inline(
-    forward_fn=tf.exp,
-    inverse_fn=tf.log,
-    inverse_log_det_jacobian_fn=(
-      lambda y: -tf.reduce_sum(tf.log(y), axis=-1)),
-    name="exp")
-  ```
-
-  The above example is equivalent to the `Bijector` `Exp(event_ndims=1)`.
-  """
-
-  def __init__(self,
-               forward_fn=None,
-               inverse_fn=None,
-               inverse_log_det_jacobian_fn=None,
-               forward_log_det_jacobian_fn=None,
-               forward_event_shape_fn=None,
-               forward_event_shape_tensor_fn=None,
-               inverse_event_shape_fn=None,
-               inverse_event_shape_tensor_fn=None,
-               is_constant_jacobian=False,
-               validate_args=False,
-               name="inline"):
-    """Creates a `Bijector` from callables.
-
-    Args:
-      forward_fn: Python callable implementing the forward transformation.
-      inverse_fn: Python callable implementing the inverse transformation.
-      inverse_log_det_jacobian_fn: Python callable implementing the
-        log o det o jacobian of the inverse transformation.
-      forward_log_det_jacobian_fn: Python callable implementing the
-        log o det o jacobian of the forward transformation.
-      forward_event_shape_fn: Python callable implementing non-identical
-        static event shape changes. Default: shape is assumed unchanged.
-      forward_event_shape_tensor_fn: Python callable implementing non-identical
-        event shape changes. Default: shape is assumed unchanged.
-      inverse_event_shape_fn: Python callable implementing non-identical
-        static event shape changes. Default: shape is assumed unchanged.
-      inverse_event_shape_tensor_fn: Python callable implementing non-identical
-        event shape changes. Default: shape is assumed unchanged.
-      is_constant_jacobian: Python `bool` indicating that the Jacobian is
-        constant for all input arguments.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str`, name given to ops managed by this object.
-    """
-    super(Inline, self).__init__(
-        event_ndims=0,
-        is_constant_jacobian=is_constant_jacobian,
-        validate_args=validate_args,
-        name=name)
-    self._forward_fn = forward_fn
-    self._inverse_fn = inverse_fn
-    self._inverse_log_det_jacobian_fn = inverse_log_det_jacobian_fn
-    self._forward_log_det_jacobian_fn = forward_log_det_jacobian_fn
-    self._forward_event_shape_fn = forward_event_shape_fn
-    self._forward_event_shape_tensor_fn = forward_event_shape_tensor_fn
-    self._inverse_event_shape_fn = inverse_event_shape_fn
-    self._inverse_event_shape_tensor_fn = inverse_event_shape_tensor_fn
-
-  def _forward_event_shape(self, input_shape):
-    if self._forward_event_shape_fn is None:
-      # By default assume shape doesn't change.
-      return input_shape
-    return self._forward_event_shape_fn(input_shape)
-
-  def _forward_event_shape_tensor(self, input_shape):
-    if self._forward_event_shape_tensor_fn is None:
-      # By default assume shape doesn't change.
-      return input_shape
-    return self._forward_event_shape_tensor_fn(input_shape)
-
-  def _inverse_event_shape(self, output_shape):
-    if self._inverse_event_shape_fn is None:
-      # By default assume shape doesn't change.
-      return output_shape
-    return self._inverse_event_shape_fn(output_shape)
-
-  def _inverse_event_shape_tensor(self, output_shape):
-    if self._inverse_event_shape_tensor_fn is None:
-      # By default assume shape doesn't change.
-      return output_shape
-    return self._inverse_event_shape_tensor_fn(output_shape)
-
-  def _forward(self, x, **kwargs):
-    if not callable(self._forward_fn):
-      raise NotImplementedError(
-          "forward_fn is not a callable function.")
-    return self._forward_fn(x, **kwargs)
-
-  def _inverse(self, y, **kwargs):
-    if not callable(self._inverse_fn):
-      raise NotImplementedError(
-          "inverse_fn is not a callable function.")
-    return self._inverse_fn(y, **kwargs)
-
-  def _inverse_log_det_jacobian(self, y, **kwargs):
-    if not callable(self._inverse_log_det_jacobian_fn):
-      raise NotImplementedError(
-          "inverse_log_det_jacobian_fn is not a callable function.")
-    return self._inverse_log_det_jacobian_fn(y, **kwargs)
-
-  def _forward_log_det_jacobian(self, y, **kwargs):
-    if not callable(self._forward_log_det_jacobian_fn):
-      raise NotImplementedError(
-          "forward_log_det_jacobian_fn is not a callable function.")
-    return self._forward_log_det_jacobian_fn(y, **kwargs)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/invert.py b/tensorflow/contrib/distributions/python/ops/bijectors/invert.py
index c134e10109ce5065eb58de1d847e3c487258954c..2c603fe61f36dd27f4984fe6c13c11f2fb534321 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/invert.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/invert.py
@@ -18,12 +18,85 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.invert_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.ops.distributions import bijector as bijector_lib
 
-_allowed_symbols = ["Invert"]
+__all__ = [
+    "Invert",
+]
 
-remove_undocumented(__name__, _allowed_symbols)
+
+class Invert(bijector_lib.Bijector):
+  """Bijector which inverts another Bijector.
+
+  Example Use: [ExpGammaDistribution (see Background & Context)](
+  https://reference.wolfram.com/language/ref/ExpGammaDistribution.html)
+  models `Y=log(X)` where `X ~ Gamma`.
+
+  ```python
+  exp_gamma_distribution = TransformedDistribution(
+    distribution=Gamma(concentration=1., rate=2.),
+    bijector=bijector.Invert(bijector.Exp())
+  ```
+
+  """
+
+  def __init__(self, bijector, validate_args=False, name=None):
+    """Creates a `Bijector` which swaps the meaning of `inverse` and `forward`.
+
+    Note: An inverted bijector's `inverse_log_det_jacobian` is often more
+    efficient if the base bijector implements `_forward_log_det_jacobian`. If
+    `_forward_log_det_jacobian` is not implemented then the following code is
+    used:
+
+    ```python
+    y = self.inverse(x, **kwargs)
+    return -self.inverse_log_det_jacobian(y, **kwargs)
+    ```
+
+    Args:
+      bijector: Bijector instance.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str`, name given to ops managed by this object.
+    """
+
+    if not bijector._is_injective:  # pylint: disable=protected-access
+      raise NotImplementedError(
+          "Invert is not implemented for non-injective bijectors.")
+
+    self._bijector = bijector
+    super(Invert, self).__init__(
+        event_ndims=bijector.event_ndims,
+        graph_parents=bijector.graph_parents,
+        is_constant_jacobian=bijector.is_constant_jacobian,
+        validate_args=validate_args,
+        dtype=bijector.dtype,
+        name=name or "_".join(["invert", bijector.name]))
+
+  def _forward_event_shape(self, input_shape):
+    return self.bijector._inverse_event_shape(input_shape)  # pylint: disable=protected-access
+
+  def _forward_event_shape_tensor(self, input_shape):
+    return self.bijector._inverse_event_shape_tensor(input_shape)  # pylint: disable=protected-access
+
+  def _inverse_event_shape(self, output_shape):
+    return self.bijector._forward_event_shape(output_shape)  # pylint: disable=protected-access
+
+  def _inverse_event_shape_tensor(self, output_shape):
+    return self.bijector._forward_event_shape_tensor(output_shape)  # pylint: disable=protected-access
+
+  @property
+  def bijector(self):
+    return self._bijector
+
+  def _forward(self, x, **kwargs):
+    return self.bijector._inverse(x, **kwargs)  # pylint: disable=protected-access
+
+  def _inverse(self, y, **kwargs):
+    return self.bijector._forward(y, **kwargs)  # pylint: disable=protected-access
+
+  def _inverse_log_det_jacobian(self, y, **kwargs):
+    return self.bijector._forward_log_det_jacobian(y, **kwargs)  # pylint: disable=protected-access
+
+  def _forward_log_det_jacobian(self, x, **kwargs):
+    return self.bijector._inverse_log_det_jacobian(x, **kwargs)  # pylint: disable=protected-access
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/invert_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/invert_impl.py
deleted file mode 100644
index 2c603fe61f36dd27f4984fe6c13c11f2fb534321..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/invert_impl.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Invert bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.ops.distributions import bijector as bijector_lib
-
-__all__ = [
-    "Invert",
-]
-
-
-class Invert(bijector_lib.Bijector):
-  """Bijector which inverts another Bijector.
-
-  Example Use: [ExpGammaDistribution (see Background & Context)](
-  https://reference.wolfram.com/language/ref/ExpGammaDistribution.html)
-  models `Y=log(X)` where `X ~ Gamma`.
-
-  ```python
-  exp_gamma_distribution = TransformedDistribution(
-    distribution=Gamma(concentration=1., rate=2.),
-    bijector=bijector.Invert(bijector.Exp())
-  ```
-
-  """
-
-  def __init__(self, bijector, validate_args=False, name=None):
-    """Creates a `Bijector` which swaps the meaning of `inverse` and `forward`.
-
-    Note: An inverted bijector's `inverse_log_det_jacobian` is often more
-    efficient if the base bijector implements `_forward_log_det_jacobian`. If
-    `_forward_log_det_jacobian` is not implemented then the following code is
-    used:
-
-    ```python
-    y = self.inverse(x, **kwargs)
-    return -self.inverse_log_det_jacobian(y, **kwargs)
-    ```
-
-    Args:
-      bijector: Bijector instance.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str`, name given to ops managed by this object.
-    """
-
-    if not bijector._is_injective:  # pylint: disable=protected-access
-      raise NotImplementedError(
-          "Invert is not implemented for non-injective bijectors.")
-
-    self._bijector = bijector
-    super(Invert, self).__init__(
-        event_ndims=bijector.event_ndims,
-        graph_parents=bijector.graph_parents,
-        is_constant_jacobian=bijector.is_constant_jacobian,
-        validate_args=validate_args,
-        dtype=bijector.dtype,
-        name=name or "_".join(["invert", bijector.name]))
-
-  def _forward_event_shape(self, input_shape):
-    return self.bijector._inverse_event_shape(input_shape)  # pylint: disable=protected-access
-
-  def _forward_event_shape_tensor(self, input_shape):
-    return self.bijector._inverse_event_shape_tensor(input_shape)  # pylint: disable=protected-access
-
-  def _inverse_event_shape(self, output_shape):
-    return self.bijector._forward_event_shape(output_shape)  # pylint: disable=protected-access
-
-  def _inverse_event_shape_tensor(self, output_shape):
-    return self.bijector._forward_event_shape_tensor(output_shape)  # pylint: disable=protected-access
-
-  @property
-  def bijector(self):
-    return self._bijector
-
-  def _forward(self, x, **kwargs):
-    return self.bijector._inverse(x, **kwargs)  # pylint: disable=protected-access
-
-  def _inverse(self, y, **kwargs):
-    return self.bijector._forward(y, **kwargs)  # pylint: disable=protected-access
-
-  def _inverse_log_det_jacobian(self, y, **kwargs):
-    return self.bijector._forward_log_det_jacobian(y, **kwargs)  # pylint: disable=protected-access
-
-  def _forward_log_det_jacobian(self, x, **kwargs):
-    return self.bijector._inverse_log_det_jacobian(x, **kwargs)  # pylint: disable=protected-access
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
index 132dc570f94719b6c71fb269866c943774481b7e..5251dbcb5748f75688aa43ce6e4e9dbd76be78bb 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
@@ -18,16 +18,490 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+import numpy as np
 
-_allowed_symbols = [
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.layers import core as layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import template as template_ops
+from tensorflow.python.ops import variable_scope as variable_scope_lib
+from tensorflow.python.ops.distributions import bijector as bijector_lib
+
+
+__all__ = [
     "MaskedAutoregressiveFlow",
-    "masked_dense",
     "masked_autoregressive_default_template",
+    "masked_dense",
 ]
 
-remove_undocumented(__name__, _allowed_symbols)
+
+class MaskedAutoregressiveFlow(bijector_lib.Bijector):
+  """Affine MaskedAutoregressiveFlow bijector for vector-valued events.
+
+  The affine autoregressive flow [1] provides a relatively simple framework for
+  user-specified (deep) architectures to learn a distribution over vector-valued
+  events. Regarding terminology,
+
+    "Autoregressive models decompose the joint density as a product of
+    conditionals, and model each conditional in turn. Normalizing flows
+    transform a base density (e.g. a standard Gaussian) into the target density
+    by an invertible transformation with tractable Jacobian." [1]
+
+  In other words, the "autoregressive property" is equivalent to the
+  decomposition, `p(x) = prod{ p(x[i] | x[0:i]) : i=0, ..., d }`. The provided
+  `shift_and_log_scale_fn`, `masked_autoregressive_default_template`, achieves
+  this property by zeroing out weights in its `masked_dense` layers.
+
+  In the `tf.distributions` framework, a "normalizing flow" is implemented as a
+  `tf.distributions.bijectors.Bijector`. The `forward` "autoregression"
+  is implemented using a `tf.while_loop` and a deep neural network (DNN) with
+  masked weights such that the autoregressive property is automatically met in
+  the `inverse`.
+
+  A `TransformedDistribution` using `MaskedAutoregressiveFlow(...)` uses the
+  (expensive) forward-mode calculation to draw samples and the (cheap)
+  reverse-mode calculation to compute log-probabilities. Conversely, a
+  `TransformedDistribution` using `Invert(MaskedAutoregressiveFlow(...))` uses
+  the (expensive) forward-mode calculation to compute log-probabilities and the
+  (cheap) reverse-mode calculation to compute samples.  See "Example Use"
+  [below] for more details.
+
+  Given a `shift_and_log_scale_fn`, the forward and inverse transformations are
+  (a sequence of) affine transformations. A "valid" `shift_and_log_scale_fn`
+  must compute each `shift` (aka `loc` or "mu" [2]) and `log(scale)` (aka
+  "alpha" [2]) such that each are broadcastable with the arguments to `forward`
+  and `inverse`, i.e., such that the calculations in `forward`, `inverse`
+  [below] are possible.
+
+  For convenience, `masked_autoregressive_default_template` is offered as a
+  possible `shift_and_log_scale_fn` function. It implements the MADE
+  architecture [2]. MADE is a feed-forward network that computes a `shift` and
+  `log(scale)` using `masked_dense` layers in a deep neural network. Weights are
+  masked to ensure the autoregressive property. It is possible that this
+  architecture is suboptimal for your task. To build alternative networks,
+  either change the arguments to `masked_autoregressive_default_template`, use
+  the `masked_dense` function to roll-out your own, or use some other
+  architecture, e.g., using `tf.layers`.
+
+  Warning: no attempt is made to validate that the `shift_and_log_scale_fn`
+  enforces the "autoregressive property".
+
+  Assuming `shift_and_log_scale_fn` has valid shape and autoregressive
+  semantics, the forward transformation is,
+
+  ```python
+  def forward(x):
+    y = zeros_like(x)
+    event_size = x.shape[-1]
+    for _ in range(event_size):
+      shift, log_scale = shift_and_log_scale_fn(y)
+      y = x * math_ops.exp(log_scale) + shift
+    return y
+  ```
+
+  and the inverse transformation is,
+
+  ```python
+  def inverse(y):
+    shift, log_scale = shift_and_log_scale_fn(y)
+    return (y - shift) / math_ops.exp(log_scale)
+  ```
+
+  Notice that the `inverse` does not need a for-loop. This is because in the
+  forward pass each calculation of `shift` and `log_scale` is based on the `y`
+  calculated so far (not `x`). In the `inverse`, the `y` is fully known, thus is
+  equivalent to the scaling used in `forward` after `event_size` passes, i.e.,
+  the "last" `y` used to compute `shift`, `log_scale`. (Roughly speaking, this
+  also proves the transform is bijective.)
+
+  #### Example Use
+
+  ```python
+  tfd = tf.contrib.distributions
+  tfb = tfd.bijectors
+
+  dims = 5
+
+  # A common choice for a normalizing flow is to use a Gaussian for the base
+  # distribution. (However, any continuous distribution would work.) E.g.,
+  maf = tfd.TransformedDistribution(
+      distribution=tfd.Normal(loc=0., scale=1.),
+      bijector=tfb.MaskedAutoregressiveFlow(
+          shift_and_log_scale_fn=tfb.masked_autoregressive_default_template(
+              hidden_layers=[512, 512])),
+      event_shape=[dims])
+
+  x = maf.sample()  # Expensive; uses `tf.while_loop`, no Bijector caching.
+  maf.log_prob(x)   # Almost free; uses Bijector caching.
+  maf.log_prob(0.)  # Cheap; no `tf.while_loop` despite no Bijector caching.
+
+  # [1] also describes an "Inverse Autoregressive Flow", e.g.,
+  iaf = tfd.TransformedDistribution(
+      distribution=tfd.Normal(loc=0., scale=1.),
+      bijector=tfb.Invert(tfb.MaskedAutoregressiveFlow(
+          shift_and_log_scale_fn=tfb.masked_autoregressive_default_template(
+              hidden_layers=[512, 512]))),
+      event_shape=[dims])
+
+  x = iaf.sample()  # Cheap; no `tf.while_loop` despite no Bijector caching.
+  iaf.log_prob(x)   # Almost free; uses Bijector caching.
+  iaf.log_prob(0.)  # Expensive; uses `tf.while_loop`, no Bijector caching.
+
+  # In many (if not most) cases the default `shift_and_log_scale_fn` will be a
+  # poor choice. Here's an example of using a "shift only" version and with a
+  # different number/depth of hidden layers.
+  shift_only = True
+  maf_no_scale_hidden2 = tfd.TransformedDistribution(
+      distribution=tfd.Normal(loc=0., scale=1.),
+      bijector=tfb.MaskedAutoregressiveFlow(
+          tfb.masked_autoregressive_default_template(
+              hidden_layers=[32],
+              shift_only=shift_only),
+          is_constant_jacobian=shift_only),
+      event_shape=[dims])
+  ```
+
+  [1]: "Masked Autoregressive Flow for Density Estimation."
+       George Papamakarios, Theo Pavlakou, Iain Murray. Arxiv. 2017.
+       https://arxiv.org/abs/1705.07057
+
+  [2]: "MADE: Masked Autoencoder for Distribution Estimation."
+       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
+       https://arxiv.org/abs/1502.03509
+
+  """
+
+  def __init__(self,
+               shift_and_log_scale_fn,
+               is_constant_jacobian=False,
+               validate_args=False,
+               unroll_loop=False,
+               name=None):
+    """Creates the MaskedAutoregressiveFlow bijector.
+
+    Args:
+      shift_and_log_scale_fn: Python `callable` which computes `shift` and
+        `log_scale` from both the forward domain (`x`) and the inverse domain
+        (`y`). Calculation must respect the "autoregressive property" (see class
+        docstring). Suggested default
+        `masked_autoregressive_default_template(hidden_layers=...)`.
+        Typically the function contains `tf.Variables` and is wrapped using
+        `tf.make_template`. Returning `None` for either (both) `shift`,
+        `log_scale` is equivalent to (but more efficient than) returning zero.
+      is_constant_jacobian: Python `bool`. Default: `False`. When `True` the
+        implementation assumes `log_scale` does not depend on the forward domain
+        (`x`) or inverse domain (`y`) values. (No validation is made;
+        `is_constant_jacobian=False` is always safe but possibly computationally
+        inefficient.)
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      unroll_loop: Python `bool` indicating whether the `tf.while_loop` in
+        `_forward` should be replaced with a static for loop. Requires that
+        the final dimension of `x` be known at graph construction time. Defaults
+        to `False`.
+      name: Python `str`, name given to ops managed by this object.
+    """
+    name = name or "masked_autoregressive_flow"
+    self._shift_and_log_scale_fn = shift_and_log_scale_fn
+    self._unroll_loop = unroll_loop
+    super(MaskedAutoregressiveFlow, self).__init__(
+        is_constant_jacobian=is_constant_jacobian,
+        validate_args=validate_args,
+        name=name)
+
+  def _forward(self, x):
+    if self._unroll_loop:
+      event_size = x.shape.with_rank_at_least(1)[-1].value
+      if event_size is None:
+        raise ValueError(
+            "The final dimension of `x` must be known at graph construction "
+            "time if `unroll_loop=True`. `x.shape: %r`" % x.shape)
+      y = array_ops.zeros_like(x, name="y0")
+
+      for _ in range(event_size):
+        shift, log_scale = self._shift_and_log_scale_fn(y)
+        # next_y = scale * x + shift
+        next_y = x
+        if log_scale is not None:
+          next_y *= math_ops.exp(log_scale)
+        if shift is not None:
+          next_y += shift
+        y = next_y
+      return y
+
+    event_size = array_ops.shape(x)[-1]
+    # If the event size is available at graph construction time, we can inform
+    # the graph compiler of the maximum number of steps. If not,
+    # static_event_size will be None, and the maximum_iterations argument will
+    # have no effect.
+    static_event_size = x.shape.with_rank_at_least(1)[-1].value
+    y0 = array_ops.zeros_like(x, name="y0")
+    # call the template once to ensure creation
+    _ = self._shift_and_log_scale_fn(y0)
+    def _loop_body(index, y0):
+      """While-loop body for autoregression calculation."""
+      # Set caching device to avoid re-getting the tf.Variable for every while
+      # loop iteration.
+      with variable_scope_lib.variable_scope(
+          variable_scope_lib.get_variable_scope()) as vs:
+        if vs.caching_device is None:
+          vs.set_caching_device(lambda op: op.device)
+        shift, log_scale = self._shift_and_log_scale_fn(y0)
+      y = x
+      if log_scale is not None:
+        y *= math_ops.exp(log_scale)
+      if shift is not None:
+        y += shift
+      return index + 1, y
+    _, y = control_flow_ops.while_loop(
+        cond=lambda index, _: index < event_size,
+        body=_loop_body,
+        loop_vars=(0, y0),
+        maximum_iterations=static_event_size)
+    return y
+
+  def _inverse(self, y):
+    shift, log_scale = self._shift_and_log_scale_fn(y)
+    x = y
+    if shift is not None:
+      x -= shift
+    if log_scale is not None:
+      x *= math_ops.exp(-log_scale)
+    return x
+
+  def _inverse_log_det_jacobian(self, y):
+    _, log_scale = self._shift_and_log_scale_fn(y)
+    if log_scale is None:
+      return constant_op.constant(0., dtype=y.dtype, name="ildj")
+    return -math_ops.reduce_sum(log_scale, axis=-1)
+
+
+MASK_INCLUSIVE = "inclusive"
+MASK_EXCLUSIVE = "exclusive"
+
+
+def _gen_slices(num_blocks, n_in, n_out, mask_type=MASK_EXCLUSIVE):
+  """Generate the slices for building an autoregressive mask."""
+  # TODO(b/67594795): Better support of dynamic shape.
+  slices = []
+  col = 0
+  d_in = n_in // num_blocks
+  d_out = n_out // num_blocks
+  row = d_out if mask_type == MASK_EXCLUSIVE else 0
+  for _ in range(num_blocks):
+    row_slice = slice(row, None)
+    col_slice = slice(col, col + d_in)
+    slices.append([row_slice, col_slice])
+    col += d_in
+    row += d_out
+  return slices
+
+
+def _gen_mask(num_blocks,
+              n_in,
+              n_out,
+              mask_type=MASK_EXCLUSIVE,
+              dtype=dtypes.float32):
+  """Generate the mask for building an autoregressive dense layer."""
+  # TODO(b/67594795): Better support of dynamic shape.
+  mask = np.zeros([n_out, n_in], dtype=dtype.as_numpy_dtype())
+  slices = _gen_slices(num_blocks, n_in, n_out, mask_type=mask_type)
+  for [row_slice, col_slice] in slices:
+    mask[row_slice, col_slice] = 1
+  return mask
+
+
+def masked_dense(inputs,
+                 units,
+                 num_blocks=None,
+                 exclusive=False,
+                 kernel_initializer=None,
+                 reuse=None,
+                 name=None,
+                 *args,
+                 **kwargs):
+  """A autoregressively masked dense layer. Analogous to `tf.layers.dense`.
+
+  See [1] for detailed explanation.
+
+  [1]: "MADE: Masked Autoencoder for Distribution Estimation."
+       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
+       https://arxiv.org/abs/1502.03509
+
+  Arguments:
+    inputs: Tensor input.
+    units: Python `int` scalar representing the dimensionality of the output
+      space.
+    num_blocks: Python `int` scalar representing the number of blocks for the
+      MADE masks.
+    exclusive: Python `bool` scalar representing whether to zero the diagonal of
+      the mask, used for the first layer of a MADE.
+    kernel_initializer: Initializer function for the weight matrix.
+      If `None` (default), weights are initialized using the
+      `tf.glorot_random_initializer`.
+    reuse: Python `bool` scalar representing whether to reuse the weights of a
+      previous layer by the same name.
+    name: Python `str` used to describe ops managed by this function.
+    *args: `tf.layers.dense` arguments.
+    **kwargs: `tf.layers.dense` keyword arguments.
+
+  Returns:
+    Output tensor.
+
+  Raises:
+    NotImplementedError: if rightmost dimension of `inputs` is unknown prior to
+      graph execution.
+  """
+  # TODO(b/67594795): Better support of dynamic shape.
+  input_depth = inputs.shape.with_rank_at_least(1)[-1].value
+  if input_depth is None:
+    raise NotImplementedError(
+        "Rightmost dimension must be known prior to graph execution.")
+
+  mask = _gen_mask(num_blocks, input_depth, units,
+                   MASK_EXCLUSIVE if exclusive else MASK_INCLUSIVE).T
+
+  if kernel_initializer is None:
+    kernel_initializer = init_ops.glorot_normal_initializer()
+
+  def masked_initializer(shape, dtype=None, partition_info=None):
+    return mask * kernel_initializer(shape, dtype, partition_info)
+
+  with ops.name_scope(name, "masked_dense", [inputs, units, num_blocks]):
+    layer = layers.Dense(
+        units,
+        kernel_initializer=masked_initializer,
+        kernel_constraint=lambda x: mask * x,
+        name=name,
+        dtype=inputs.dtype.base_dtype,
+        _scope=name,
+        _reuse=reuse,
+        *args,
+        **kwargs)
+    return layer.apply(inputs)
+
+
+def masked_autoregressive_default_template(
+    hidden_layers,
+    shift_only=False,
+    activation=nn_ops.relu,
+    log_scale_min_clip=-5.,
+    log_scale_max_clip=3.,
+    log_scale_clip_gradient=False,
+    name=None,
+    *args,
+    **kwargs):
+  """Build the MADE Model [1].
+
+  This will be wrapped in a make_template to ensure the variables are only
+  created once. It takes the input and returns the `loc` ("mu" [1]) and
+  `log_scale` ("alpha" [1]) from the MADE network.
+
+  Warning: This function uses `masked_dense` to create randomly initialized
+  `tf.Variables`. It is presumed that these will be fit, just as you would any
+  other neural architecture which uses `tf.layers.dense`.
+
+  #### About Hidden Layers:
+
+  Each element of `hidden_layers` should be greater than the `input_depth`
+  (i.e., `input_depth = tf.shape(input)[-1]` where `input` is the input to the
+  neural network). This is necessary to ensure the autoregressivity property.
+
+  #### About Clipping:
+
+  This function also optionally clips the `log_scale` (but possibly not its
+  gradient). This is useful because if `log_scale` is too small/large it might
+  underflow/overflow making it impossible for the `MaskedAutoregressiveFlow`
+  bijector to implement a bijection. Additionally, the `log_scale_clip_gradient`
+  `bool` indicates whether the gradient should also be clipped. The default does
+  not clip the gradient; this is useful because it still provides gradient
+  information (for fitting) yet solves the numerical stability problem. I.e.,
+  `log_scale_clip_gradient = False` means
+  `grad[exp(clip(x))] = grad[x] exp(clip(x))` rather than the usual
+  `grad[clip(x)] exp(clip(x))`.
+
+  [1]: "MADE: Masked Autoencoder for Distribution Estimation."
+       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
+       https://arxiv.org/abs/1502.03509
+
+  Arguments:
+    hidden_layers: Python `list`-like of non-negative integer, scalars
+      indicating the number of units in each hidden layer. Default: `[512, 512].
+    shift_only: Python `bool` indicating if only the `shift` term shall be
+      computed. Default: `False`.
+    activation: Activation function (callable). Explicitly setting to `None`
+      implies a linear activation.
+    log_scale_min_clip: `float`-like scalar `Tensor`, or a `Tensor` with the
+      same shape as `log_scale`. The minimum value to clip by. Default: -5.
+    log_scale_max_clip: `float`-like scalar `Tensor`, or a `Tensor` with the
+      same shape as `log_scale`. The maximum value to clip by. Default: 3.
+    log_scale_clip_gradient: Python `bool` indicating that the gradient of
+      `tf.clip_by_value` should be preserved. Default: `False`.
+    name: A name for ops managed by this function. Default:
+      "masked_autoregressive_default_template".
+    *args: `tf.layers.dense` arguments.
+    **kwargs: `tf.layers.dense` keyword arguments.
+
+  Returns:
+    shift: `Float`-like `Tensor` of shift terms (the "mu" in [2]).
+    log_scale: `Float`-like `Tensor` of log(scale) terms (the "alpha" in [2]).
+
+  Raises:
+    NotImplementedError: if rightmost dimension of `inputs` is unknown prior to
+      graph execution.
+  """
+
+  with ops.name_scope(name, "masked_autoregressive_default_template",
+                      values=[log_scale_min_clip, log_scale_max_clip]):
+    def _fn(x):
+      """MADE parameterized via `masked_autoregressive_default_template`."""
+      # TODO(b/67594795): Better support of dynamic shape.
+      input_depth = x.shape.with_rank_at_least(1)[-1].value
+      if input_depth is None:
+        raise NotImplementedError(
+            "Rightmost dimension must be known prior to graph execution.")
+      input_shape = (np.int32(x.shape.as_list()) if x.shape.is_fully_defined()
+                     else array_ops.shape(x))
+      for i, units in enumerate(hidden_layers):
+        x = masked_dense(
+            inputs=x,
+            units=units,
+            num_blocks=input_depth,
+            exclusive=True if i == 0 else False,
+            activation=activation,
+            *args,
+            **kwargs)
+      x = masked_dense(
+          inputs=x,
+          units=(1 if shift_only else 2) * input_depth,
+          num_blocks=input_depth,
+          activation=None,
+          *args,
+          **kwargs)
+      if shift_only:
+        x = array_ops.reshape(x, shape=input_shape)
+        return x, None
+      x = array_ops.reshape(
+          x, shape=array_ops.concat([input_shape, [2]], axis=0))
+      shift, log_scale = array_ops.unstack(x, num=2, axis=-1)
+      which_clip = (math_ops.clip_by_value if log_scale_clip_gradient
+                    else _clip_by_value_preserve_grad)
+      log_scale = which_clip(log_scale, log_scale_min_clip, log_scale_max_clip)
+      return shift, log_scale
+    return template_ops.make_template(
+        "masked_autoregressive_default_template", _fn)
+
+
+def _clip_by_value_preserve_grad(x, clip_value_min, clip_value_max, name=None):
+  """Clips input while leaving gradient unaltered."""
+  with ops.name_scope(name, "clip_by_value_preserve_grad",
+                      [x, clip_value_min, clip_value_max]):
+    clip_x = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+    return x + array_ops.stop_gradient(clip_x - x)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive_impl.py
deleted file mode 100644
index ae142883931274b594dbbafbe86bd71e75c621bc..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive_impl.py
+++ /dev/null
@@ -1,473 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""MaskedAutoregressiveFlow bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.layers import core as layers
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import clip_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import template as template_ops
-from tensorflow.python.ops import variable_scope as variable_scope_lib
-from tensorflow.python.ops.distributions import bijector as bijector_lib
-
-
-__all__ = [
-    "MaskedAutoregressiveFlow",
-    "masked_autoregressive_default_template",
-    "masked_dense",
-]
-
-
-class MaskedAutoregressiveFlow(bijector_lib.Bijector):
-  """Affine MaskedAutoregressiveFlow bijector for vector-valued events.
-
-  The affine autoregressive flow [1] provides a relatively simple framework for
-  user-specified (deep) architectures to learn a distribution over vector-valued
-  events. Regarding terminology,
-
-    "Autoregressive models decompose the joint density as a product of
-    conditionals, and model each conditional in turn. Normalizing flows
-    transform a base density (e.g. a standard Gaussian) into the target density
-    by an invertible transformation with tractable Jacobian." [1]
-
-  In other words, the "autoregressive property" is equivalent to the
-  decomposition, `p(x) = prod{ p(x[i] | x[0:i]) : i=0, ..., d }`. The provided
-  `shift_and_log_scale_fn`, `masked_autoregressive_default_template`, achieves
-  this property by zeroing out weights in its `masked_dense` layers.
-
-  In the `tf.distributions` framework, a "normalizing flow" is implemented as a
-  `tf.distributions.bijectors.Bijector`. The `forward` "autoregression"
-  is implemented using a `tf.while_loop` and a deep neural network (DNN) with
-  masked weights such that the autoregressive property is automatically met in
-  the `inverse`.
-
-  A `TransformedDistribution` using `MaskedAutoregressiveFlow(...)` uses the
-  (expensive) forward-mode calculation to draw samples and the (cheap)
-  reverse-mode calculation to compute log-probabilities. Conversely, a
-  `TransformedDistribution` using `Invert(MaskedAutoregressiveFlow(...))` uses
-  the (expensive) forward-mode calculation to compute log-probabilities and the
-  (cheap) reverse-mode calculation to compute samples.  See "Example Use"
-  [below] for more details.
-
-  Given a `shift_and_log_scale_fn`, the forward and inverse transformations are
-  (a sequence of) affine transformations. A "valid" `shift_and_log_scale_fn`
-  must compute each `shift` (aka `loc` or "mu" [2]) and `log(scale)` (aka
-  "alpha" [2]) such that each are broadcastable with the arguments to `forward`
-  and `inverse`, i.e., such that the calculations in `forward`, `inverse`
-  [below] are possible.
-
-  For convenience, `masked_autoregressive_default_template` is offered as a
-  possible `shift_and_log_scale_fn` function. It implements the MADE
-  architecture [2]. MADE is a feed-forward network that computes a `shift` and
-  `log(scale)` using `masked_dense` layers in a deep neural network. Weights are
-  masked to ensure the autoregressive property. It is possible that this
-  architecture is suboptimal for your task. To build alternative networks,
-  either change the arguments to `masked_autoregressive_default_template`, use
-  the `masked_dense` function to roll-out your own, or use some other
-  architecture, e.g., using `tf.layers`.
-
-  Warning: no attempt is made to validate that the `shift_and_log_scale_fn`
-  enforces the "autoregressive property".
-
-  Assuming `shift_and_log_scale_fn` has valid shape and autoregressive
-  semantics, the forward transformation is,
-
-  ```python
-  def forward(x):
-    y = zeros_like(x)
-    event_size = x.shape[-1]
-    for _ in range(event_size):
-      shift, log_scale = shift_and_log_scale_fn(y)
-      y = x * math_ops.exp(log_scale) + shift
-    return y
-  ```
-
-  and the inverse transformation is,
-
-  ```python
-  def inverse(y):
-    shift, log_scale = shift_and_log_scale_fn(y)
-    return (y - shift) / math_ops.exp(log_scale)
-  ```
-
-  Notice that the `inverse` does not need a for-loop. This is because in the
-  forward pass each calculation of `shift` and `log_scale` is based on the `y`
-  calculated so far (not `x`). In the `inverse`, the `y` is fully known, thus is
-  equivalent to the scaling used in `forward` after `event_size` passes, i.e.,
-  the "last" `y` used to compute `shift`, `log_scale`. (Roughly speaking, this
-  also proves the transform is bijective.)
-
-  #### Example Use
-
-  ```python
-  ds = tf.contrib.distributions
-  bs = tf.contrib.distributions.bijectors
-
-  dims = 5
-
-  # A common choice for a normalizing flow is to use a Gaussian for the base
-  # distribution. (However, any continuous distribution would work.) E.g.,
-  maf = ds.TransformedDistribution(
-      distribution=ds.Normal(loc=0., scale=1.),
-      bijector=bs.MaskedAutoregressiveFlow(
-          shift_and_log_scale_fn=bs.masked_autoregressive_default_template(
-              hidden_layers=[512, 512])),
-      event_shape=[dims])
-
-  x = maf.sample()  # Expensive; uses `tf.while_loop`, no Bijector caching.
-  maf.log_prob(x)   # Almost free; uses Bijector caching.
-  maf.log_prob(0.)  # Cheap; no `tf.while_loop` despite no Bijector caching.
-
-  # [1] also describes an "Inverse Autoregressive Flow", e.g.,
-  iaf = ds.TransformedDistribution(
-      distribution=ds.Normal(loc=0., scale=1.),
-      bijector=bs.Invert(bs.MaskedAutoregressiveFlow(
-          shift_and_log_scale_fn=bs.masked_autoregressive_default_template(
-              hidden_layers=[512, 512]))),
-      event_shape=[dims])
-
-  x = iaf.sample()  # Cheap; no `tf.while_loop` despite no Bijector caching.
-  iaf.log_prob(x)   # Almost free; uses Bijector caching.
-  iaf.log_prob(0.)  # Expensive; uses `tf.while_loop`, no Bijector caching.
-
-  # In many (if not most) cases the default `shift_and_log_scale_fn` will be a
-  # poor choice. Here's an example of using a "shift only" version and with a
-  # different number/depth of hidden layers.
-  shift_only = True
-  maf_no_scale_hidden2 = ds.TransformedDistribution(
-      distribution=ds.Normal(loc=0., scale=1.),
-      bijector=bs.MaskedAutoregressiveFlow(
-          bs.masked_autoregressive_default_template(
-              hidden_layers=[32],
-              shift_only=shift_only),
-          is_constant_jacobian=shift_only),
-      event_shape=[dims])
-  ```
-
-  [1]: "Masked Autoregressive Flow for Density Estimation."
-       George Papamakarios, Theo Pavlakou, Iain Murray. Arxiv. 2017.
-       https://arxiv.org/abs/1705.07057
-
-  [2]: "MADE: Masked Autoencoder for Distribution Estimation."
-       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
-       https://arxiv.org/abs/1502.03509
-
-  """
-
-  def __init__(self,
-               shift_and_log_scale_fn,
-               is_constant_jacobian=False,
-               validate_args=False,
-               name=None):
-    """Creates the MaskedAutoregressiveFlow bijector.
-
-    Args:
-      shift_and_log_scale_fn: Python `callable` which computes `shift` and
-        `log_scale` from both the forward domain (`x`) and the inverse domain
-        (`y`). Calculation must respect the "autoregressive property" (see class
-        docstring). Suggested default
-        `masked_autoregressive_default_template(hidden_layers=...)`.
-        Typically the function contains `tf.Variables` and is wrapped using
-        `tf.make_template`. Returning `None` for either (both) `shift`,
-        `log_scale` is equivalent to (but more efficient than) returning zero.
-      is_constant_jacobian: Python `bool`. Default: `False`. When `True` the
-        implementation assumes `log_scale` does not depend on the forward domain
-        (`x`) or inverse domain (`y`) values. (No validation is made;
-        `is_constant_jacobian=False` is always safe but possibly computationally
-        inefficient.)
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str`, name given to ops managed by this object.
-    """
-    name = name or "masked_autoregressive_flow"
-    self._shift_and_log_scale_fn = shift_and_log_scale_fn
-    super(MaskedAutoregressiveFlow, self).__init__(
-        is_constant_jacobian=is_constant_jacobian,
-        validate_args=validate_args,
-        name=name)
-
-  def _forward(self, x):
-    event_size = array_ops.shape(x)[-1]
-    def _loop_body(index, y0):
-      """While-loop body for autoregression calculation."""
-      # Set caching device to avoid re-getting the tf.Variable for every while
-      # loop iteration.
-      with variable_scope_lib.variable_scope(
-          variable_scope_lib.get_variable_scope()) as vs:
-        if vs.caching_device is None:
-          vs.set_caching_device(lambda op: op.device)
-        shift, log_scale = self._shift_and_log_scale_fn(y0)
-      y = x
-      if log_scale is not None:
-        y *= math_ops.exp(log_scale)
-      if shift is not None:
-        y += shift
-      return index + 1, y
-    _, y = control_flow_ops.while_loop(
-        cond=lambda index, _: index < event_size,
-        body=_loop_body,
-        loop_vars=[0, array_ops.zeros_like(x, name="y0")])
-    return y
-
-  def _inverse(self, y):
-    shift, log_scale = self._shift_and_log_scale_fn(y)
-    x = y
-    if shift is not None:
-      x -= shift
-    if log_scale is not None:
-      x *= math_ops.exp(-log_scale)
-    return x
-
-  def _inverse_log_det_jacobian(self, y):
-    _, log_scale = self._shift_and_log_scale_fn(y)
-    if log_scale is None:
-      return constant_op.constant(0., dtype=y.dtype, name="ildj")
-    return -math_ops.reduce_sum(log_scale, axis=-1)
-
-
-MASK_INCLUSIVE = "inclusive"
-MASK_EXCLUSIVE = "exclusive"
-
-
-def _gen_slices(num_blocks, n_in, n_out, mask_type=MASK_EXCLUSIVE):
-  """Generate the slices for building an autoregressive mask."""
-  # TODO(b/67594795): Better support of dynamic shape.
-  slices = []
-  col = 0
-  d_in = n_in // num_blocks
-  d_out = n_out // num_blocks
-  row = d_out if mask_type == MASK_EXCLUSIVE else 0
-  for _ in range(num_blocks):
-    row_slice = slice(row, None)
-    col_slice = slice(col, col + d_in)
-    slices.append([row_slice, col_slice])
-    col += d_in
-    row += d_out
-  return slices
-
-
-def _gen_mask(num_blocks,
-              n_in,
-              n_out,
-              mask_type=MASK_EXCLUSIVE,
-              dtype=dtypes.float32):
-  """Generate the mask for building an autoregressive dense layer."""
-  # TODO(b/67594795): Better support of dynamic shape.
-  mask = np.zeros([n_out, n_in], dtype=dtype.as_numpy_dtype())
-  slices = _gen_slices(num_blocks, n_in, n_out, mask_type=mask_type)
-  for [row_slice, col_slice] in slices:
-    mask[row_slice, col_slice] = 1
-  return mask
-
-
-def masked_dense(inputs,
-                 units,
-                 num_blocks=None,
-                 exclusive=False,
-                 kernel_initializer=None,
-                 reuse=None,
-                 name=None,
-                 *args,
-                 **kwargs):
-  """A autoregressively masked dense layer. Analogous to `tf.layers.dense`.
-
-  See [1] for detailed explanation.
-
-  [1]: "MADE: Masked Autoencoder for Distribution Estimation."
-       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
-       https://arxiv.org/abs/1502.03509
-
-  Arguments:
-    inputs: Tensor input.
-    units: Python `int` scalar representing the dimensionality of the output
-      space.
-    num_blocks: Python `int` scalar representing the number of blocks for the
-      MADE masks.
-    exclusive: Python `bool` scalar representing whether to zero the diagonal of
-      the mask, used for the first layer of a MADE.
-    kernel_initializer: Initializer function for the weight matrix.
-      If `None` (default), weights are initialized using the
-      `tf.glorot_random_initializer`.
-    reuse: Python `bool` scalar representing whether to reuse the weights of a
-      previous layer by the same name.
-    name: Python `str` used to describe ops managed by this function.
-    *args: `tf.layers.dense` arguments.
-    **kwargs: `tf.layers.dense` keyword arguments.
-
-  Returns:
-    Output tensor.
-
-  Raises:
-    NotImplementedError: if rightmost dimension of `inputs` is unknown prior to
-      graph execution.
-  """
-  # TODO(b/67594795): Better support of dynamic shape.
-  input_depth = inputs.shape.with_rank_at_least(1)[-1].value
-  if input_depth is None:
-    raise NotImplementedError(
-        "Rightmost dimension must be known prior to graph execution.")
-
-  mask = _gen_mask(num_blocks, input_depth, units,
-                   MASK_EXCLUSIVE if exclusive else MASK_INCLUSIVE).T
-
-  if kernel_initializer is None:
-    kernel_initializer = init_ops.glorot_normal_initializer()
-
-  def masked_initializer(shape, dtype=None, partition_info=None):
-    return mask * kernel_initializer(shape, dtype, partition_info)
-
-  with ops.name_scope(name, "masked_dense", [inputs, units, num_blocks]):
-    layer = layers.Dense(
-        units,
-        kernel_initializer=masked_initializer,
-        kernel_constraint=lambda x: mask * x,
-        name=name,
-        dtype=inputs.dtype.base_dtype,
-        _scope=name,
-        _reuse=reuse,
-        *args,
-        **kwargs)
-    return layer.apply(inputs)
-
-
-def masked_autoregressive_default_template(
-    hidden_layers,
-    shift_only=False,
-    activation=nn_ops.relu,
-    log_scale_min_clip=-5.,
-    log_scale_max_clip=3.,
-    log_scale_clip_gradient=False,
-    name=None,
-    *args,
-    **kwargs):
-  """Build the MADE Model [1].
-
-  This will be wrapped in a make_template to ensure the variables are only
-  created once. It takes the input and returns the `loc` ("mu" [1]) and
-  `log_scale` ("alpha" [1]) from the MADE network.
-
-  Warning: This function uses `masked_dense` to create randomly initialized
-  `tf.Variables`. It is presumed that these will be fit, just as you would any
-  other neural architecture which uses `tf.layers.dense`.
-
-  #### About Hidden Layers:
-
-  Each element of `hidden_layers` should be greater than the `input_depth`
-  (i.e., `input_depth = tf.shape(input)[-1]` where `input` is the input to the
-  neural network). This is necessary to ensure the autoregressivity property.
-
-  #### About Clipping:
-
-  This function also optionally clips the `log_scale` (but possibly not its
-  gradient). This is useful because if `log_scale` is too small/large it might
-  underflow/overflow making it impossible for the `MaskedAutoregressiveFlow`
-  bijector to implement a bijection. Additionally, the `log_scale_clip_gradient`
-  `bool` indicates whether the gradient should also be clipped. The default does
-  not clip the gradient; this is useful because it still provides gradient
-  information (for fitting) yet solves the numerical stability problem. I.e.,
-  `log_scale_clip_gradient = False` means
-  `grad[exp(clip(x))] = grad[x] exp(clip(x))` rather than the usual
-  `grad[clip(x)] exp(clip(x))`.
-
-  [1]: "MADE: Masked Autoencoder for Distribution Estimation."
-       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
-       https://arxiv.org/abs/1502.03509
-
-  Arguments:
-    hidden_layers: Python `list`-like of non-negative integer, scalars
-      indicating the number of units in each hidden layer. Default: `[512, 512].
-    shift_only: Python `bool` indicating if only the `shift` term shall be
-      computed. Default: `False`.
-    activation: Activation function (callable). Explicitly setting to `None`
-      implies a linear activation.
-    log_scale_min_clip: `float`-like scalar `Tensor`, or a `Tensor` with the
-      same shape as `log_scale`. The minimum value to clip by. Default: -5.
-    log_scale_max_clip: `float`-like scalar `Tensor`, or a `Tensor` with the
-      same shape as `log_scale`. The maximum value to clip by. Default: 3.
-    log_scale_clip_gradient: Python `bool` indicating that the gradient of
-      `tf.clip_by_value` should be preserved. Default: `False`.
-    name: A name for ops managed by this function. Default:
-      "masked_autoregressive_default_template".
-    *args: `tf.layers.dense` arguments.
-    **kwargs: `tf.layers.dense` keyword arguments.
-
-  Returns:
-    shift: `Float`-like `Tensor` of shift terms (the "mu" in [2]).
-    log_scale: `Float`-like `Tensor` of log(scale) terms (the "alpha" in [2]).
-
-  Raises:
-    NotImplementedError: if rightmost dimension of `inputs` is unknown prior to
-      graph execution.
-  """
-
-  with ops.name_scope(name, "masked_autoregressive_default_template",
-                      values=[log_scale_min_clip, log_scale_max_clip]):
-    def _fn(x):
-      """MADE parameterized via `masked_autoregressive_default_template`."""
-      # TODO(b/67594795): Better support of dynamic shape.
-      input_depth = x.shape.with_rank_at_least(1)[-1].value
-      if input_depth is None:
-        raise NotImplementedError(
-            "Rightmost dimension must be known prior to graph execution.")
-      input_shape = (np.int32(x.shape.as_list()) if x.shape.is_fully_defined()
-                     else array_ops.shape(x))
-      for i, units in enumerate(hidden_layers):
-        x = masked_dense(
-            inputs=x,
-            units=units,
-            num_blocks=input_depth,
-            exclusive=True if i == 0 else False,
-            activation=activation,
-            *args,
-            **kwargs)
-      x = masked_dense(
-          inputs=x,
-          units=(1 if shift_only else 2) * input_depth,
-          num_blocks=input_depth,
-          activation=None,
-          *args,
-          **kwargs)
-      if shift_only:
-        x = array_ops.reshape(x, shape=input_shape)
-        return x, None
-      x = array_ops.reshape(
-          x, shape=array_ops.concat([input_shape, [2]], axis=0))
-      shift, log_scale = array_ops.unstack(x, num=2, axis=-1)
-      which_clip = (math_ops.clip_by_value if log_scale_clip_gradient
-                    else _clip_by_value_preserve_grad)
-      log_scale = which_clip(log_scale, log_scale_min_clip, log_scale_max_clip)
-      return shift, log_scale
-    return template_ops.make_template(
-        "masked_autoregressive_default_template", _fn)
-
-
-def _clip_by_value_preserve_grad(x, clip_value_min, clip_value_max, name=None):
-  """Clips input while leaving gradient unaltered."""
-  with ops.name_scope(name, "clip_by_value_preserve_grad",
-                      [x, clip_value_min, clip_value_max]):
-    clip_x = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
-    return x + array_ops.stop_gradient(clip_x - x)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/permute.py b/tensorflow/contrib/distributions/python/ops/bijectors/permute.py
index a187ce22d686ee1203802ae2bfe64b0e1a3ea850..8654cc39d0c41ec4f1b85cd5fc4366ceaf4b224d 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/permute.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/permute.py
@@ -12,18 +12,127 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Permute bijector."""
+"""Permutation bijectors."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.permute_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+import numpy as np
 
-_allowed_symbols = ["Permute"]
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import bijector as bijector_lib
 
-remove_undocumented(__name__, _allowed_symbols)
+
+__all__ = [
+    "Permute",
+]
+
+
+class Permute(bijector_lib.Bijector):
+  """Permutes the rightmost dimension of a `Tensor`.
+
+  ```python
+  tfd = tf.contrib.distributions
+
+  reverse = tfd.bijectors.Permute(permutation=[2, 1, 0])
+
+  reverse.forward([-1., 0., 1.])
+  # ==> [1., 0., -1]
+
+  reverse.inverse([1., 0., -1])
+  # ==> [-1., 0., 1.]
+
+  reverse.forward_log_det_jacobian(any_value)
+  # ==> 0.
+
+  reverse.inverse_log_det_jacobian(any_value)
+  # ==> 0.
+  ```
+
+  Warning: `tf.estimator` may repeatedly build the graph thus
+  `Permute(np.random.permutation(event_size)).astype("int32"))` is not a
+  reliable parameterization (nor would it be even if using `tf.constant`). A
+  safe alternative is to use `tf.get_variable` to achieve "init once" behavior,
+  i.e.,
+
+  ```python
+  def init_once(x, name):
+    return tf.get_variable(name, initializer=x, trainable=False)
+
+  Permute(permutation=init_once(
+      np.random.permutation(event_size).astype("int32"),
+      name="permutation"))
+  ```
+
+  """
+
+  def __init__(self, permutation, validate_args=False, name=None):
+    """Creates the `Permute` bijector.
+
+    Args:
+      permutation: An `int`-like vector-shaped `Tensor` representing the
+        permutation to apply to the rightmost dimension of the transformed
+        `Tensor`.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str`, name given to ops managed by this object.
+
+    Raises:
+      TypeError: if `not permutation.dtype.is_integer`.
+      ValueError: if `permutation` does not contain exactly one of each of
+        `{0, 1, ..., d}`.
+    """
+    with ops.name_scope(name, "permute", values=[permutation]):
+      permutation = ops.convert_to_tensor(
+          permutation,
+          name="permutation")
+      if not permutation.dtype.is_integer:
+        raise TypeError("permutation.dtype ({}) should be `int`-like.".format(
+            permutation.dtype.name))
+      p = tensor_util.constant_value(permutation)
+      if p is not None:
+        if set(p) != set(np.arange(p.size)):
+          raise ValueError("Permutation over `d` must contain exactly one of "
+                           "each of `{0, 1, ..., d}`.")
+      elif validate_args:
+        p, _ = nn_ops.top_k(-permutation,
+                            k=array_ops.shape(permutation)[-1],
+                            sorted=True)
+        permutation = control_flow_ops.with_dependencies([
+            check_ops.assert_equal(
+                -p, math_ops.range(array_ops.size(p)),
+                message=("Permutation over `d` must contain exactly one of "
+                         "each of `{0, 1, ..., d}`.")),
+        ], permutation)
+      self._permutation = permutation
+      super(Permute, self).__init__(
+          is_constant_jacobian=True,
+          validate_args=validate_args,
+          name=name or "permute")
+
+  @property
+  def permutation(self):
+    return self._permutation
+
+  def _forward(self, x):
+    return array_ops.gather(x, self.permutation, axis=-1)
+
+  def _inverse(self, y):
+    return array_ops.gather(
+        y,
+        array_ops.invert_permutation(self.permutation),
+        axis=-1)
+
+  def _inverse_log_det_jacobian(self, y):
+    return constant_op.constant(0., dtype=y.dtype)
+
+  def _forward_log_det_jacobian(self, x):
+    return constant_op.constant(0., dtype=x.dtype)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/permute_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/permute_impl.py
deleted file mode 100644
index b1d8f2f41b28a88208a19824377f93882b767f03..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/permute_impl.py
+++ /dev/null
@@ -1,138 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Permutation bijectors."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops.distributions import bijector as bijector_lib
-
-
-__all__ = [
-    "Permute",
-]
-
-
-class Permute(bijector_lib.Bijector):
-  """Permutes the rightmost dimension of a `Tensor`.
-
-  ```python
-  bs = tf.contrib.distributions.bijectors
-
-  reverse = bs.Permute(permutation=[2, 1, 0])
-
-  reverse.forward([-1., 0., 1.])
-  # ==> [1., 0., -1]
-
-  reverse.inverse([1., 0., -1])
-  # ==> [-1., 0., 1.]
-
-  reverse.forward_log_det_jacobian(any_value)
-  # ==> 0.
-
-  reverse.inverse_log_det_jacobian(any_value)
-  # ==> 0.
-  ```
-
-  Warning: `tf.estimator` may repeatedly build the graph thus
-  `Permute(np.random.permutation(event_size)).astype("int32"))` is not a
-  reliable parameterization (nor would it be even if using `tf.constant`). A
-  safe alternative is to use `tf.get_variable` to achieve "init once" behavior,
-  i.e.,
-
-  ```python
-  def init_once(x, name):
-    return tf.get_variable(name, initializer=x, trainable=False)
-
-  Permute(permutation=init_once(
-      np.random.permutation(event_size).astype("int32"),
-      name="permutation"))
-  ```
-
-  """
-
-  def __init__(self, permutation, validate_args=False, name=None):
-    """Creates the `Permute` bijector.
-
-    Args:
-      permutation: An `int`-like vector-shaped `Tensor` representing the
-        permutation to apply to the rightmost dimension of the transformed
-        `Tensor`.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str`, name given to ops managed by this object.
-
-    Raises:
-      TypeError: if `not permutation.dtype.is_integer`.
-      ValueError: if `permutation` does not contain exactly one of each of
-        `{0, 1, ..., d}`.
-    """
-    with ops.name_scope(name, "permute", values=[permutation]):
-      permutation = ops.convert_to_tensor(
-          permutation,
-          name="permutation")
-      if not permutation.dtype.is_integer:
-        raise TypeError("permutation.dtype ({}) should be `int`-like.".format(
-            permutation.dtype.name))
-      p = tensor_util.constant_value(permutation)
-      if p is not None:
-        if set(p) != set(np.arange(p.size)):
-          raise ValueError("Permutation over `d` must contain exactly one of "
-                           "each of `{0, 1, ..., d}`.")
-      elif validate_args:
-        p, _ = nn_ops.top_k(-permutation,
-                            k=array_ops.shape(permutation)[-1],
-                            sorted=True)
-        permutation = control_flow_ops.with_dependencies([
-            check_ops.assert_equal(
-                -p, math_ops.range(array_ops.size(p)),
-                message=("Permutation over `d` must contain exactly one of "
-                         "each of `{0, 1, ..., d}`.")),
-        ], permutation)
-      self._permutation = permutation
-      super(Permute, self).__init__(
-          is_constant_jacobian=True,
-          validate_args=validate_args,
-          name=name or "permute")
-
-  @property
-  def permutation(self):
-    return self._permutation
-
-  def _forward(self, x):
-    return array_ops.gather(x, self.permutation, axis=-1)
-
-  def _inverse(self, y):
-    return array_ops.gather(
-        y,
-        array_ops.invert_permutation(self.permutation),
-        axis=-1)
-
-  def _inverse_log_det_jacobian(self, y):
-    return constant_op.constant(0., dtype=y.dtype)
-
-  def _forward_log_det_jacobian(self, x):
-    return constant_op.constant(0., dtype=x.dtype)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py b/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py
index a83199549cd16101ab7b39b43d19a17bc66f03df..c37db61720d10949f294ff7b2e9778ba6efa57f0 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py
@@ -18,12 +18,110 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.power_transform_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 
-_allowed_symbols = ["PowerTransform"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "PowerTransform",
+]
+
+
+class PowerTransform(bijector.Bijector):
+  """Compute `Y = g(X) = (1 + X * c)**(1 / c), X >= -1 / c`.
+
+  The [power transform](https://en.wikipedia.org/wiki/Power_transform) maps
+  inputs from `[0, inf]` to `[-1/c, inf]`; this is equivalent to the `inverse`
+  of this bijector.
+
+  This bijector is equivalent to the `Exp` bijector when `c=0`.
+  """
+
+  def __init__(self,
+               power=0.,
+               event_ndims=0,
+               validate_args=False,
+               name="power_transform"):
+    """Instantiates the `PowerTransform` bijector.
+
+    Args:
+      power: Python `float` scalar indicating the transform power, i.e.,
+        `Y = g(X) = (1 + X * c)**(1 / c)` where `c` is the `power`.
+      event_ndims: Python scalar indicating the number of dimensions associated
+        with a particular draw from the distribution.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+
+    Raises:
+      ValueError: if `power < 0` or is not known statically.
+    """
+    self._graph_parents = []
+    self._name = name
+    self._validate_args = validate_args
+    with self._name_scope("init", values=[power]):
+      power = tensor_util.constant_value(
+          ops.convert_to_tensor(power, name="power"))
+    if power is None or power < 0:
+      raise ValueError("`power` must be a non-negative TF constant.")
+    self._power = power
+    super(PowerTransform, self).__init__(
+        event_ndims=event_ndims,
+        validate_args=validate_args,
+        name=name)
+
+  @property
+  def power(self):
+    """The `c` in: `Y = g(X) = (1 + X * c)**(1 / c)`."""
+    return self._power
+
+  def _forward(self, x):
+    x = self._maybe_assert_valid_x(x)
+    if self.power == 0.:
+      return math_ops.exp(x)
+    # If large x accuracy is an issue, consider using:
+    # (1. + x * self.power)**(1. / self.power) when x >> 1.
+    return math_ops.exp(math_ops.log1p(x * self.power) / self.power)
+
+  def _inverse(self, y):
+    y = self._maybe_assert_valid_y(y)
+    if self.power == 0.:
+      return math_ops.log(y)
+    # If large y accuracy is an issue, consider using:
+    # (y**self.power - 1.) / self.power when y >> 1.
+    return math_ops.expm1(math_ops.log(y) * self.power) / self.power
+
+  def _inverse_log_det_jacobian(self, y):
+    y = self._maybe_assert_valid_y(y)
+    event_dims = self._event_dims_tensor(y)
+    return (self.power - 1.) * math_ops.reduce_sum(
+        math_ops.log(y), axis=event_dims)
+
+  def _forward_log_det_jacobian(self, x):
+    x = self._maybe_assert_valid_x(x)
+    event_dims = self._event_dims_tensor(x)
+    if self.power == 0.:
+      return math_ops.reduce_sum(x, axis=event_dims)
+    return (1. / self.power - 1.) * math_ops.reduce_sum(
+        math_ops.log1p(x * self.power),
+        axis=event_dims)
+
+  def _maybe_assert_valid_x(self, x):
+    if not self.validate_args or self.power == 0.:
+      return x
+    is_valid = check_ops.assert_non_negative(
+        1. + self.power * x,
+        message="Forward transformation input must be at least {}.".format(
+            -1. / self.power))
+    return control_flow_ops.with_dependencies([is_valid], x)
+
+  def _maybe_assert_valid_y(self, y):
+    if not self.validate_args:
+      return y
+    is_valid = check_ops.assert_positive(
+        y, message="Inverse transformation input must be greater than 0.")
+    return control_flow_ops.with_dependencies([is_valid], y)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/power_transform_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/power_transform_impl.py
deleted file mode 100644
index c37db61720d10949f294ff7b2e9778ba6efa57f0..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/power_transform_impl.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""PowerTransform bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector
-
-
-__all__ = [
-    "PowerTransform",
-]
-
-
-class PowerTransform(bijector.Bijector):
-  """Compute `Y = g(X) = (1 + X * c)**(1 / c), X >= -1 / c`.
-
-  The [power transform](https://en.wikipedia.org/wiki/Power_transform) maps
-  inputs from `[0, inf]` to `[-1/c, inf]`; this is equivalent to the `inverse`
-  of this bijector.
-
-  This bijector is equivalent to the `Exp` bijector when `c=0`.
-  """
-
-  def __init__(self,
-               power=0.,
-               event_ndims=0,
-               validate_args=False,
-               name="power_transform"):
-    """Instantiates the `PowerTransform` bijector.
-
-    Args:
-      power: Python `float` scalar indicating the transform power, i.e.,
-        `Y = g(X) = (1 + X * c)**(1 / c)` where `c` is the `power`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-
-    Raises:
-      ValueError: if `power < 0` or is not known statically.
-    """
-    self._graph_parents = []
-    self._name = name
-    self._validate_args = validate_args
-    with self._name_scope("init", values=[power]):
-      power = tensor_util.constant_value(
-          ops.convert_to_tensor(power, name="power"))
-    if power is None or power < 0:
-      raise ValueError("`power` must be a non-negative TF constant.")
-    self._power = power
-    super(PowerTransform, self).__init__(
-        event_ndims=event_ndims,
-        validate_args=validate_args,
-        name=name)
-
-  @property
-  def power(self):
-    """The `c` in: `Y = g(X) = (1 + X * c)**(1 / c)`."""
-    return self._power
-
-  def _forward(self, x):
-    x = self._maybe_assert_valid_x(x)
-    if self.power == 0.:
-      return math_ops.exp(x)
-    # If large x accuracy is an issue, consider using:
-    # (1. + x * self.power)**(1. / self.power) when x >> 1.
-    return math_ops.exp(math_ops.log1p(x * self.power) / self.power)
-
-  def _inverse(self, y):
-    y = self._maybe_assert_valid_y(y)
-    if self.power == 0.:
-      return math_ops.log(y)
-    # If large y accuracy is an issue, consider using:
-    # (y**self.power - 1.) / self.power when y >> 1.
-    return math_ops.expm1(math_ops.log(y) * self.power) / self.power
-
-  def _inverse_log_det_jacobian(self, y):
-    y = self._maybe_assert_valid_y(y)
-    event_dims = self._event_dims_tensor(y)
-    return (self.power - 1.) * math_ops.reduce_sum(
-        math_ops.log(y), axis=event_dims)
-
-  def _forward_log_det_jacobian(self, x):
-    x = self._maybe_assert_valid_x(x)
-    event_dims = self._event_dims_tensor(x)
-    if self.power == 0.:
-      return math_ops.reduce_sum(x, axis=event_dims)
-    return (1. / self.power - 1.) * math_ops.reduce_sum(
-        math_ops.log1p(x * self.power),
-        axis=event_dims)
-
-  def _maybe_assert_valid_x(self, x):
-    if not self.validate_args or self.power == 0.:
-      return x
-    is_valid = check_ops.assert_non_negative(
-        1. + self.power * x,
-        message="Forward transformation input must be at least {}.".format(
-            -1. / self.power))
-    return control_flow_ops.with_dependencies([is_valid], x)
-
-  def _maybe_assert_valid_y(self, y):
-    if not self.validate_args:
-      return y
-    is_valid = check_ops.assert_positive(
-        y, message="Inverse transformation input must be greater than 0.")
-    return control_flow_ops.with_dependencies([is_valid], y)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py b/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py
new file mode 100644
index 0000000000000000000000000000000000000000..2840f52e742eac5e9e37a576bf7f6d6f05a07a35
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py
@@ -0,0 +1,282 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Real NVP bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.layers import core as layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import template as template_ops
+from tensorflow.python.ops.distributions import bijector as bijector_lib
+
+
+__all__ = [
+    "RealNVP",
+    "real_nvp_default_template"
+]
+
+
+class RealNVP(bijector_lib.Bijector):
+  """RealNVP "affine coupling layer" for vector-valued events.
+
+  Real NVP models a normalizing flow on a `D`-dimensional distribution via a
+  single `D-d`-dimensional conditional distribution [1]:
+
+  `y[d:D] = y[d:D] * math_ops.exp(log_scale_fn(y[d:D])) + shift_fn(y[d:D])`
+  `y[0:d] = x[0:d]`
+
+  The last `D-d` units are scaled and shifted based on the first `d` units only,
+  while the first `d` units are 'masked' and left unchanged. Real NVP's
+  `shift_and_log_scale_fn` computes vector-valued quantities. For
+  scale-and-shift transforms that do not depend on any masked units, i.e.
+  `d=0`, use the `tfb.Affine` bijector with learned parameters instead.
+
+  Masking is currently only supported for base distributions with
+  `event_ndims=1`. For more sophisticated masking schemes like checkerboard or
+  channel-wise masking [2], use the `tfb.Permute` bijector to re-order desired
+  masked units into the first `d` units. For base distributions with
+  `event_ndims > 1`, use the `tfb.Reshape` bijector to flatten the event shape.
+
+  Recall that the MAF bijector [2] implements a normalizing flow via an
+  autoregressive transformation. MAF and IAF have opposite computational
+  tradeoffs - MAF can train all units in parallel but must sample units
+  sequentially, while IAF must train units sequentially but can sample in
+  parallel. In contrast, Real NVP can compute both forward and inverse
+  computations in parallel. However, the lack of an autoregressive
+  transformations makes it less expressive on a per-bijector basis.
+
+  A "valid" `shift_and_log_scale_fn` must compute each `shift` (aka `loc` or
+  "mu" [2]) and `log(scale)` (aka "alpha" [2]) such that each are broadcastable
+  with the arguments to `forward` and `inverse`, i.e., such that the
+  calculations in `forward`, `inverse` [below] are possible. For convenience,
+  `real_nvp_default_nvp` is offered as a possible `shift_and_log_scale_fn`
+  function.
+
+  NICE [3] is a special case of the Real NVP bijector which discards the scale
+  transformation, resulting in a constant-time inverse-log-determinant-Jacobian.
+  To use a NICE bijector instead of Real NVP, `shift_and_log_scale_fn` should
+  return `(shift, None)`, and `is_constant_jacobian` should be set to `True` in
+  the `RealNVP` constructor. Calling `real_nvp_default_template` with
+  `shift_only=True` returns one such NICE-compatible `shift_and_log_scale_fn`.
+
+  Caching: the scalar input depth `D` of the base distribution is not known at
+  construction time. The first call to any of `forward(x)`, `inverse(x)`,
+  `inverse_log_det_jacobian(x)`, or `forward_log_det_jacobian(x)` memoizes
+  `D`, which is re-used in subsequent calls. This shape must be known prior to
+  graph execution (which is the case if using tf.layers).
+
+  #### Example Use
+
+  ```python
+  tfd = tf.contrib.distributions
+  tfb = tfd.bijectors
+
+  # A common choice for a normalizing flow is to use a Gaussian for the base
+  # distribution. (However, any continuous distribution would work.) E.g.,
+  nvp = tfd.TransformedDistribution(
+      distribution=tfd.MultivariateNormalDiag(loc=[0., 0., 0.])),
+      bijector=tfb.RealNVP(
+          num_masked=2,
+          shift_and_log_scale_fn=tfb.real_nvp_default_template(
+              hidden_layers=[512, 512])))
+
+  x = nvp.sample()
+  nvp.log_prob(x)
+  nvp.log_prob(0.)
+  ```
+
+  For more examples, see [4].
+
+  [1]: "Density Estimation using Real NVP."
+       Laurent Dinh, Jascha Sohl-Dickstein, Samy Bengio. ICLR. 2017.
+       https://arxiv.org/abs/1605.08803
+
+  [2]: "Masked Autoregressive Flow for Density Estimation."
+       George Papamakarios, Theo Pavlakou, Iain Murray. Arxiv. 2017.
+       https://arxiv.org/abs/1705.07057
+
+  [3]: "NICE: Non-linear Independent Components Estimation."
+       Laurent Dinh, David Krueger, Yoshua Bengio. ICLR. 2015.
+       https://arxiv.org/abs/1410.8516
+
+  [4]: "Normalizing Flows Tutorial, Part 2: Modern Normalizing Flows."
+       Eric Jang. Blog post. January 2018.
+       http://blog.evjang.com/2018/01/nf2.html
+  """
+
+  def __init__(self,
+               num_masked,
+               shift_and_log_scale_fn,
+               is_constant_jacobian=False,
+               validate_args=False,
+               name=None):
+    """Creates the Real NVP or NICE bijector.
+
+    Args:
+      num_masked: Python `int` indicating that the first `d` units of the event
+        should be masked. Must be in the closed interval `[1, D-1]`, where `D`
+        is the event size of the base distribution.
+      shift_and_log_scale_fn: Python `callable` which computes `shift` and
+        `log_scale` from both the forward domain (`x`) and the inverse domain
+        (`y`). Calculation must respect the "autoregressive property" (see class
+        docstring). Suggested default
+        `masked_autoregressive_default_template(hidden_layers=...)`.
+        Typically the function contains `tf.Variables` and is wrapped using
+        `tf.make_template`. Returning `None` for either (both) `shift`,
+        `log_scale` is equivalent to (but more efficient than) returning zero.
+      is_constant_jacobian: Python `bool`. Default: `False`. When `True` the
+        implementation assumes `log_scale` does not depend on the forward domain
+        (`x`) or inverse domain (`y`) values. (No validation is made;
+        `is_constant_jacobian=False` is always safe but possibly computationally
+        inefficient.)
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str`, name given to ops managed by this object.
+
+    Raises:
+      ValueError: If num_masked < 1.
+    """
+    name = name or "real_nvp"
+    if num_masked <= 0:
+      raise ValueError("num_masked must be a positive integer.")
+    self._num_masked = num_masked
+    # At construction time, we don't know input_depth.
+    self._input_depth = None
+    self._shift_and_log_scale_fn = shift_and_log_scale_fn
+    super(RealNVP, self).__init__(
+        event_ndims=1,
+        is_constant_jacobian=is_constant_jacobian,
+        validate_args=validate_args,
+        name=name)
+
+  def _cache_input_depth(self, x):
+    if self._input_depth is None:
+      self._input_depth = x.shape.with_rank_at_least(1)[-1].value
+      if self._input_depth is None:
+        raise NotImplementedError(
+            "Rightmost dimension must be known prior to graph execution.")
+      if self._num_masked >= self._input_depth:
+        raise ValueError(
+            "Number of masked units must be smaller than the event size.")
+
+  def _forward(self, x):
+    self._cache_input_depth(x)
+    # Performs scale and shift.
+    x0, x1 = x[:, :self._num_masked], x[:, self._num_masked:]
+    shift, log_scale = self._shift_and_log_scale_fn(
+        x0, self._input_depth - self._num_masked)
+    y1 = x1
+    if log_scale is not None:
+      y1 *= math_ops.exp(log_scale)
+    if shift is not None:
+      y1 += shift
+    y = array_ops.concat([x0, y1], axis=-1)
+    return y
+
+  def _inverse(self, y):
+    self._cache_input_depth(y)
+    # Performs un-shift and un-scale.
+    y0, y1 = y[:, :self._num_masked], y[:, self._num_masked:]
+    shift, log_scale = self._shift_and_log_scale_fn(
+        y0, self._input_depth - self._num_masked)
+    x1 = y1
+    if shift is not None:
+      x1 -= shift
+    if log_scale is not None:
+      x1 *= math_ops.exp(-log_scale)
+    x = array_ops.concat([y0, x1], axis=-1)
+    return x
+
+  def _inverse_log_det_jacobian(self, y):
+    self._cache_input_depth(y)
+    y0 = y[:, :self._num_masked]
+    _, log_scale = self._shift_and_log_scale_fn(
+        y0, self._input_depth - self._num_masked)
+    if log_scale is None:
+      return constant_op.constant(0., dtype=y.dtype, name="ildj")
+    return -math_ops.reduce_sum(log_scale, axis=-1)
+
+  def _forward_log_det_jacobian(self, x):
+    self._cache_input_depth(x)
+    x0 = x[:, :self._num_masked]
+    _, log_scale = self._shift_and_log_scale_fn(
+        x0, self._input_depth - self._num_masked)
+    if log_scale is None:
+      return constant_op.constant(0., dtype=x.dtype, name="ildj")
+    return math_ops.reduce_sum(log_scale, axis=-1)
+
+
+def real_nvp_default_template(
+    hidden_layers,
+    shift_only=False,
+    activation=nn_ops.relu,
+    name=None,
+    *args,
+    **kwargs):
+  """Build a scale-and-shift function using a multi-layer neural network.
+
+  This will be wrapped in a make_template to ensure the variables are only
+  created once. It takes the `d`-dimensional input x[0:d] and returns the `D-d`
+  dimensional outputs `loc` ("mu") and `log_scale` ("alpha").
+
+  Arguments:
+    hidden_layers: Python `list`-like of non-negative integer, scalars
+      indicating the number of units in each hidden layer. Default: `[512, 512].
+    shift_only: Python `bool` indicating if only the `shift` term shall be
+      computed (i.e. NICE bijector). Default: `False`.
+    activation: Activation function (callable). Explicitly setting to `None`
+      implies a linear activation.
+    name: A name for ops managed by this function. Default:
+      "real_nvp_default_template".
+    *args: `tf.layers.dense` arguments.
+    **kwargs: `tf.layers.dense` keyword arguments.
+
+  Returns:
+    shift: `Float`-like `Tensor` of shift terms (the "mu" in [2]).
+    log_scale: `Float`-like `Tensor` of log(scale) terms (the "alpha" in [2]).
+
+  Raises:
+    NotImplementedError: if rightmost dimension of `inputs` is unknown prior to
+      graph execution.
+  """
+
+  with ops.name_scope(name, "real_nvp_default_template"):
+    def _fn(x, output_units):
+      """Fully connected MLP parameterized via `real_nvp_template`."""
+      for units in hidden_layers:
+        x = layers.dense(
+            inputs=x,
+            units=units,
+            activation=activation,
+            *args,
+            **kwargs)
+      x = layers.dense(
+          inputs=x,
+          units=(1 if shift_only else 2) * output_units,
+          activation=None,
+          *args,
+          **kwargs)
+      if shift_only:
+        return x, None
+      shift, log_scale = array_ops.split(x, 2, axis=-1)
+      return shift, log_scale
+    return template_ops.make_template(
+        "real_nvp_default_template", _fn)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py b/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
index 8997f7ab6929745275edb38712a5bbb0a9b25ddb..55eca063126797d577653f0d6bcdfddf8192bdb5 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
@@ -12,18 +12,303 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Reshape bijector."""
+"""Reshape bijectors."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.reshape_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+import numpy as np
 
-_allowed_symbols = ["Reshape"]
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector as bijector_lib
 
-remove_undocumented(__name__, _allowed_symbols)
+
+__all__ = [
+    "Reshape",
+]
+
+
+def _static_ndims_from_shape(shape):
+  return shape.shape.with_rank_at_least(1)[0].value
+
+
+def _ndims_from_shape(shape):
+  return array_ops.shape(shape)[0]
+
+
+class Reshape(bijector_lib.Bijector):
+  """Reshapes the `event_shape` of a `Tensor`.
+
+  The semantics generally follow that of `tf.reshape()`, with
+  a few differences:
+
+  * The user must provide both the input and output shape, so that
+    the transformation can be inverted. If an input shape is not
+    specified, the default assumes a vector-shaped input, i.e.,
+    event_shape_in = (-1,).
+  * The `Reshape` bijector automatically broadcasts over the leftmost
+    dimensions of its input (`sample_shape` and `batch_shape`); only
+    the rightmost `event_ndims_in` dimensions are reshaped. The
+    number of dimensions to reshape is inferred from the provided
+    `event_shape_in` (`event_ndims_in = len(event_shape_in)`).
+
+  Example usage:
+  ```python
+
+  tfd = tf.contrib.distributions
+
+  r = tfd.bijectors.Reshape(event_shape_out=[1, -1])
+
+  r.forward([3., 4.])    # shape [2]
+  # ==> [[3., 4.]]       # shape [1, 2]
+
+  r.forward([[1., 2.], [3., 4.]])  # shape [2, 2]
+  # ==> [[[1., 2.]],
+  #      [[3., 4.]]]   # shape [2, 1, 2]
+
+  r.inverse([[3., 4.]])  # shape [1,2]
+  # ==> [3., 4.]         # shape [2]
+
+  r.forward_log_det_jacobian(any_value)
+  # ==> 0.
+
+  r.inverse_log_det_jacobian(any_value)
+  # ==> 0.
+  ```
+
+  """
+
+  def __init__(self, event_shape_out, event_shape_in=(-1,),
+               validate_args=False, name=None):
+    """Creates a `Reshape` bijector.
+
+    Args:
+      event_shape_out: An `int`-like vector-shaped `Tensor`
+        representing the event shape of the transformed output.
+      event_shape_in: An optional `int`-like vector-shape `Tensor`
+        representing the event shape of the input. This is required in
+        order to define inverse operations; the default of (-1,)
+        assumes a vector-shaped input.
+      validate_args: Python `bool` indicating whether arguments should
+        be checked for correctness.
+      name: Python `str`, name given to ops managed by this object.
+
+    Raises:
+      TypeError: if either `event_shape_in` or `event_shape_out` has
+        non-integer `dtype`.
+      ValueError: if either of `event_shape_in` or `event_shape_out`
+       has non-vector shape (`rank > 1`), or if their sizes do not
+       match.
+    """
+    with ops.name_scope(name, "reshape",
+                        values=[event_shape_out, event_shape_in]):
+
+      event_shape_out = ops.convert_to_tensor(event_shape_out,
+                                              name="event_shape_out",
+                                              preferred_dtype=dtypes.int32)
+      event_shape_in = ops.convert_to_tensor(event_shape_in,
+                                             name="event_shape_in",
+                                             preferred_dtype=dtypes.int32)
+
+      assertions = []
+      assertions.extend(self._maybe_check_valid_shape(
+          event_shape_out, validate_args))
+      assertions.extend(self._maybe_check_valid_shape(
+          event_shape_in, validate_args))
+
+      self._assertions = assertions
+      self._event_shape_in = event_shape_in
+      self._event_shape_out = event_shape_out
+
+      super(Reshape, self).__init__(is_constant_jacobian=True,
+                                    validate_args=validate_args,
+                                    name=name or "reshape")
+
+  def _maybe_check_valid_shape(self, shape, validate_args):
+    """Check that a shape Tensor is int-type and otherwise sane."""
+    if not shape.dtype.is_integer:
+      raise TypeError("{} dtype ({}) should be `int`-like.".format(
+          shape.op.name, shape.dtype.name))
+
+    assertions = []
+
+    ndims = array_ops.rank(shape)
+    ndims_ = tensor_util.constant_value(ndims)
+    if ndims_ is not None and ndims_ > 1:
+      raise ValueError("`{}` rank ({}) should be <= 1.".format(
+          shape.op.name, ndims_))
+    elif validate_args:
+      assertions.append(check_ops.assert_less_equal(
+          ndims, 1, message="`{}` rank should be <= 1.".format(shape.op.name)))
+
+    shape_ = tensor_util.constant_value_as_shape(shape)
+    if shape_.is_fully_defined():
+      es = np.int32(shape_.as_list())
+      if sum(es == -1) > 1:
+        raise ValueError(
+            "`{}` must have at most one `-1` (given {})"
+            .format(shape.op.name, es))
+      if np.any(es < -1):
+        raise ValueError(
+            "`{}` elements must be either positive integers or `-1`"
+            "(given {})."
+            .format(shape.op.name, es))
+    elif validate_args:
+      assertions.extend([
+          check_ops.assert_less_equal(
+              math_ops.reduce_sum(
+                  math_ops.cast(math_ops.equal(shape, -1), dtypes.int32)),
+              1,
+              message="`{}` elements must have at most one `-1`."
+              .format(shape.op.name)),
+          check_ops.assert_greater_equal(
+              shape, -1,
+              message="`{}` elements must be either positive integers or `-1`."
+              .format(shape.op.name)),
+      ])
+    return assertions
+
+  def _reshape_helper(self, x, event_shape_in, event_shape_out):
+    """Reshape only the event_shape of an input `Tensor`."""
+
+    event_ndims_in_ = _static_ndims_from_shape(event_shape_in)
+    event_ndims_in = _ndims_from_shape(event_shape_in)
+    x_ndims_, x_ndims = x.shape.ndims, array_ops.rank(x)
+
+    assertions = []
+
+    # Ensure x.event_shape is compatible with event_shape_in.
+    if (event_ndims_in_ is not None
+        and x_ndims_ is not None
+        and x.shape.with_rank_at_least(event_ndims_in_)[
+            x_ndims_-event_ndims_in_:].is_fully_defined()):
+      x_event_shape_, x_event_shape = [  # pylint: disable=unbalanced-tuple-unpacking
+          np.int32(x.shape[x_ndims_-event_ndims_in_:])]*2
+    else:
+      x_event_shape_, x_event_shape = (
+          None, array_ops.shape(x)[x_ndims-event_ndims_in:])
+
+    event_shape_in_ = tensor_util.constant_value(event_shape_in)
+
+    if x_event_shape_ is not None and event_shape_in_ is not None:
+      # Compare the shape dimensions that are fully specified in the
+      # input (i.e., for which event_shape_in is not -1). If x_event_shape
+      # matches along all of these dimensions, it is compatible with
+      # the desired input shape and any further mismatches (i.e.,
+      # imcompatibility with the desired *output* shape) will be
+      # caught inside of array_ops.reshape() below.
+      x_event_shape_specified_ = x_event_shape_[event_shape_in_ >= 0]
+      event_shape_in_specified_ = event_shape_in_[event_shape_in_ >= 0]
+      if not np.equal(x_event_shape_specified_,
+                      event_shape_in_specified_).all():
+        raise ValueError(
+            "Input `event_shape` does not match `event_shape_in` ({} vs {}).".
+            format(x_event_shape_, event_shape_in_))
+    elif self.validate_args:
+      # Similarly to the static case, we compare the shape dimensions
+      # that are fully specified in the input. We extract these
+      # dimensions using boolean_mask(), which requires that the mask
+      # have known ndims. We can assume that shape Tensors always have
+      # ndims==1 (this assumption is verified inside of
+      # _maybe_check_valid_shape), so the reshape operation is just a
+      # no-op that formally encodes this fact to make boolean_mask()
+      # happy.
+      event_shape_mask = array_ops.reshape(event_shape_in >= 0, [-1])
+      x_event_shape_specified = array_ops.boolean_mask(x_event_shape,
+                                                       event_shape_mask)
+      event_shape_in_specified = array_ops.boolean_mask(event_shape_in,
+                                                        event_shape_mask)
+      assertions.append(check_ops.assert_equal(
+          x_event_shape_specified, event_shape_in_specified,
+          message="Input `event_shape` does not match `event_shape_in`."))
+
+    if assertions:
+      x = control_flow_ops.with_dependencies(assertions, x)
+
+    # get the parts of shape(x) that will not change
+    sample_and_batch_shape = array_ops.shape(x)
+
+    ndims = (x.shape.ndims if x.shape.ndims is not None
+             else array_ops.rank(x))
+    sample_and_batch_shape = sample_and_batch_shape[
+        :(ndims - math_ops.abs(event_ndims_in))]
+
+    if (event_ndims_in_ is not None
+        and x_ndims_ is not None
+        and event_ndims_in_ == x_ndims_):
+      # Hack to allow forward/inverse_event_shape to do shape
+      # inference by calling this helper method with a dummy Tensor of
+      # shape event_shape_in. In this special case,
+      # sample_and_batch_shape will be empty so we can preserve static
+      # shape information by avoiding the concat operation below
+      # (which would be a no-op).
+      new_shape = event_shape_out
+    else:
+      new_shape = array_ops.concat(
+          [sample_and_batch_shape, event_shape_out], axis=0)
+
+    return array_ops.reshape(x, new_shape)
+
+  def _forward(self, x):
+    with ops.control_dependencies(self._assertions):
+      return self._reshape_helper(x,
+                                  self._event_shape_in,
+                                  self._event_shape_out)
+
+  def _inverse(self, y):
+    with ops.control_dependencies(self._assertions):
+      return self._reshape_helper(y,
+                                  self._event_shape_out,
+                                  self._event_shape_in)
+
+  def _inverse_log_det_jacobian(self, y):
+    with ops.control_dependencies(self._assertions):
+      return constant_op.constant(0., dtype=y.dtype)
+
+  def _forward_log_det_jacobian(self, x):
+    with ops.control_dependencies(self._assertions):
+      return constant_op.constant(0., dtype=x.dtype)
+
+  def _forward_event_shape(self, input_shape):
+    # NOTE: this method and the other *_event_shape* methods
+    # compute shape by explicit transformation of a dummy
+    # variable. This approach is not generally recommended because it
+    # bloats the graph and could in general trigger side effects.
+    #
+    # In this particular case of the Reshape bijector, the
+    # forward and inverse transforms have no side effects, and we
+    # believe the reduction in code complexity from delegating the
+    # heavy lifting to tf.reshape() is worth the added graph ops.
+    # However, you should think hard before implementing this approach
+    # in other Bijectors; it is strongly preferred to compute
+    # shapes explicitly whenever it's feasible to do so.
+    with ops.control_dependencies(self._assertions):
+      dummy = array_ops.zeros(dtype=dtypes.float32, shape=input_shape)
+      dummy_reshaped = self.forward(dummy)
+      return dummy_reshaped.shape
+
+  def _inverse_event_shape(self, output_shape):
+    with ops.control_dependencies(self._assertions):
+      dummy = array_ops.zeros(dtype=dtypes.float32, shape=output_shape)
+      dummy_reshaped = self.inverse(dummy)
+      return dummy_reshaped.shape
+
+  def _forward_event_shape_tensor(self, input_shape):
+    with ops.control_dependencies(self._assertions):
+      dummy = array_ops.zeros(dtype=dtypes.float32, shape=input_shape)
+      dummy_reshaped = self.forward(dummy)
+      return array_ops.shape(dummy_reshaped)
+
+  def _inverse_event_shape_tensor(self, output_shape):
+    with ops.control_dependencies(self._assertions):
+      dummy = array_ops.zeros(dtype=dtypes.float32, shape=output_shape)
+      dummy_reshaped = self.inverse(dummy)
+      return array_ops.shape(dummy_reshaped)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/reshape_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/reshape_impl.py
deleted file mode 100644
index 93682639aa3be3b8f59a369dedb6ee773c468130..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/reshape_impl.py
+++ /dev/null
@@ -1,297 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Reshape bijectors."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector as bijector_lib
-
-
-__all__ = [
-    "Reshape",
-]
-
-
-class Reshape(bijector_lib.Bijector):
-  """Reshapes the `event_shape` of a `Tensor`.
-
-  The semantics generally follow that of `tf.reshape()`, with
-  a few differences:
-   * The user must provide both the input and output shape, so that
-     the transformation can be inverted.
-   * The `Reshape` bijector automatically broadcasts over the leftmost
-     dimensions of its input (`sample_shape` and `batch_shape`); only
-     the rightmost `event_ndims_in` dimensions are reshaped. The
-     number of dimensions to reshape is inferred from the provided
-     `event_shape_in` (`event_ndims_in = len(event_shape_in)`).
-   * The `Reshape` bijector does not currently support
-     partially-specified shapes, i.e., those with a dimension
-     implicitly specified by `-1`.
-
-  Example usage:
-  ```python
-
-  bs = tf.contrib.distributions.bijectors
-
-  reverse = bs.Reshape(event_shape_out=[1,2],
-                       event_shape_in=[2,])
-
-  reverse.forward([1., 2.])    # shape [2,]
-  # ==> [[1., 2.]]             # shape [1,2]
-
-  reverse.forward([[1., 2.], [3., 4.]])  # shape [2, 2]
-  # ==> [[[1., 2.]], [[3., 4.]]]         # shape [2, 1, 2]
-
-  reverse.inverse([[1., 2.]])  # shape [1,2]
-  # ==> [1., 2.]               # shape [2,]
-
-  reverse.forward_log_det_jacobian(any_value)
-  # ==> 0.
-
-  reverse.inverse_log_det_jacobian(any_value)
-  # ==> 0.
-  ```
-
-  """
-
-  def __init__(self, event_shape_out, event_shape_in,
-               validate_args=False, name=None):
-    """Creates a `Reshape` bijector.
-
-    Args:
-      event_shape_out: An `int`-like vector-shaped `Tensor`
-        representing the fully specified (no -1's) event shape of the
-        transformed output.
-      event_shape_in: An `int`-like vector-shaped `Tensor`
-        representing the fully specified (no -1's) event shape of the
-        input.
-      validate_args: Python `bool` indicating whether arguments should
-        be checked for correctness.
-      name: Python `str`, name given to ops managed by this object.
-
-    Raises:
-      TypeError: if either `event_shape_in` or `event_shape_out` has
-       non-vector shape (`rank > 1`), or non-integer `dtype`.
-      ValueError: if either `event_shape_in` or `event_shape_out`
-       contains non-positive entries, or if their sizes do not match
-       (`prod(event_shape_in)` != `prod(event_shape_out)`), or if
-       their dimensionality(s) cannot be statically inferred.
-    """
-    with ops.name_scope(name, "reshape",
-                        values=[event_shape_out, event_shape_in]):
-
-      event_shape_out = ops.convert_to_tensor(event_shape_out,
-                                              name="event_shape_out",
-                                              preferred_dtype=dtypes.int32)
-      event_shape_in = ops.convert_to_tensor(event_shape_in,
-                                             name="event_shape_in",
-                                             preferred_dtype=dtypes.int32)
-
-      # check that input shapes are positive integers
-      assertions = []
-      assertions += self._maybe_check_valid_shape(
-          event_shape_out, "event_shape_out",
-          validate_args=validate_args)
-      assertions += self._maybe_check_valid_shape(
-          event_shape_in, "event_shape_in", validate_args=validate_args)
-
-      # check that prod(event_shape_in) = prod(event_shape_out)
-      assertions += self._maybe_check_matching_sizes(
-          event_shape_in, event_shape_out, validate_args=validate_args)
-
-      self._assertions = assertions
-      self._event_shape_in = event_shape_in
-      self._event_shape_out = event_shape_out
-      self._event_shape_in_static = tensor_util.constant_value_as_shape(
-          event_shape_in)
-      self._event_shape_out_static = tensor_util.constant_value_as_shape(
-          event_shape_out)
-
-      super(Reshape, self).__init__(is_constant_jacobian=True,
-                                    validate_args=validate_args,
-                                    name=name or "reshape")
-
-  def _maybe_check_valid_shape(self, shape_tensor, label,
-                               validate_args=False):
-    """Check that a shape Tensor is int-type and positive."""
-
-    assertions = []
-
-    if not shape_tensor.dtype.is_integer:
-      raise TypeError("{} dtype ({}) should be `int`-like.".format(
-          label, shape_tensor.dtype.name))
-
-    shape_rank = tensor_util.constant_value(array_ops.rank(shape_tensor))
-    if shape_rank is not None and shape_rank > 1:
-      raise ValueError("{} rank should be <= 1.".format(label))
-
-    s = tensor_util.constant_value(shape_tensor)
-    if s is not None:
-      if (s <= 0).any():
-        raise ValueError("{} entries must be positive, but found {}".format(
-            label, s))
-    elif validate_args:
-      assertions.append(check_ops.assert_positive(
-          shape_tensor, message="{} entries must be positive".format(label)))
-
-    return assertions
-
-  def _maybe_check_matching_sizes(self, event_shape_in, event_shape_out,
-                                  validate_args=False):
-    """Check that prod(event_shape_in)==prod(event_shape_out)."""
-
-    def _get_size_from_shape(shape):
-      """Computes size from a shape `Tensor`, statically if possible."""
-      s = tensor_util.constant_value(shape)
-      if s is not None:
-        return [np.int32(np.prod(s))]*2
-      return None, math_ops.reduce_prod(shape, name="size")
-
-    # Ensure `event_shape_in` is compatible with `event_shape_out`.
-    event_size_in_, event_size_in = _get_size_from_shape(  # pylint: disable=unbalanced-tuple-unpacking
-        event_shape_in)
-    event_size_out_, event_size_out = _get_size_from_shape(  # pylint: disable=unbalanced-tuple-unpacking
-        event_shape_out)
-
-    assertions = []
-    if event_size_in_ is not None and event_size_out_ is not None:
-      if event_size_in_ != event_size_out_:
-        raise ValueError(
-            "Input `event_size` ({}) does not match output `event_size` ({}).".
-            format(event_size_in, event_size_out_))
-    elif validate_args:
-      assertions.append(check_ops.assert_equal(
-          event_size_in, event_size_out,
-          message="Input/output `event_size`s do not match."))
-
-    return assertions
-
-  def _reshape_helper(self, x, event_shape_in, event_shape_out):
-    """Reshape only the event_shape of an input `Tensor`."""
-
-    def _get_rank_from_shape(shape):
-      """Computes rank from a shape `Tensor`, statically if possible."""
-      # Uses fact that rank is "shape of shape".
-      ndims = shape.shape.with_rank_at_least(1)[0].value
-      if ndims is not None:
-        return ndims, ndims
-      return None, array_ops.shape(shape)[0]
-
-    event_ndims_in_, event_ndims_in = _get_rank_from_shape(event_shape_in)
-
-    assertions = []
-    # Ensure x.event_shape is compatible with event_shape_in.
-    if x.shape.ndims is not None:
-      x_ndims_, x_ndims = [x.shape.ndims]*2
-    else:
-      x_ndims_, x_ndims = None, array_ops.rank(x)
-
-    if (event_ndims_in_ is not None
-        and x_ndims_ is not None
-        and x.shape.with_rank_at_least(event_ndims_in_)[
-            x_ndims_-event_ndims_in_:].is_fully_defined()):
-      x_event_shape_, x_event_shape = [  # pylint: disable=unbalanced-tuple-unpacking
-          np.int32(x.shape[x_ndims_-event_ndims_in_:])]*2
-    else:
-      x_event_shape_, x_event_shape = (
-          None, array_ops.shape(x)[x_ndims-event_ndims_in:])
-
-    event_shape_in_ = tensor_util.constant_value(event_shape_in)
-
-    if x_event_shape_ is not None and event_shape_in_ is not None:
-      if not np.equal(x_event_shape_, event_shape_in_).all():
-        raise ValueError(
-            "Input `event_shape` ({}) does not match `event_shape_in` ({}).".
-            format(x_event_shape_, event_shape_in_))
-    elif self.validate_args:
-      assertions.append(check_ops.assert_equal(
-          x_event_shape, event_shape_in,
-          message="Input `event_shape` does not match `event_shape_in`."))
-
-    if assertions:
-      x = control_flow_ops.with_dependencies(assertions, x)
-
-    # get the parts of shape(x) that will not change
-    sample_and_batch_shape = array_ops.shape(x)
-
-    ndims = (x.shape.ndims if x.shape.ndims is not None
-             else array_ops.rank(x))
-    sample_and_batch_shape = sample_and_batch_shape[
-        :(ndims - math_ops.abs(event_ndims_in))]
-
-    new_shape = array_ops.concat(
-        [sample_and_batch_shape, event_shape_out], axis=0)
-
-    return array_ops.reshape(x, new_shape)
-
-  def _forward(self, x):
-    with ops.control_dependencies(self._assertions):
-      return self._reshape_helper(x,
-                                  self._event_shape_in,
-                                  self._event_shape_out)
-
-  def _inverse(self, y):
-    with ops.control_dependencies(self._assertions):
-      return self._reshape_helper(y,
-                                  self._event_shape_out,
-                                  self._event_shape_in)
-
-  def _inverse_log_det_jacobian(self, y):
-    with ops.control_dependencies(self._assertions):
-      return constant_op.constant(0., dtype=y.dtype)
-
-  def _forward_log_det_jacobian(self, x):
-    with ops.control_dependencies(self._assertions):
-      return constant_op.constant(0., dtype=x.dtype)
-
-  def _forward_event_shape(self, input_shape):
-    self._event_shape_in_static.assert_is_compatible_with(input_shape)
-    return self._event_shape_out_static
-
-  def _inverse_event_shape(self, output_shape):
-    self._event_shape_out_static.assert_is_compatible_with(output_shape)
-    return self._event_shape_in_static
-
-  def _forward_event_shape_tensor(self, input_shape):
-    input_assertions = self._maybe_check_valid_shape(
-        input_shape, "input event shape", validate_args=self.validate_args)
-    input_assertions += self._maybe_check_matching_sizes(
-        input_shape, self._event_shape_out,
-        validate_args=self.validate_args)
-
-    return control_flow_ops.with_dependencies(
-        input_assertions + self._assertions, self._event_shape_out)
-
-  def _inverse_event_shape_tensor(self, output_shape):
-
-    output_assertions = self._maybe_check_valid_shape(
-        output_shape, "output event shape", validate_args=self.validate_args)
-    output_assertions += self._maybe_check_matching_sizes(
-        output_shape, self._event_shape_in, validate_args=self.validate_args)
-
-    return control_flow_ops.with_dependencies(
-        output_assertions + self._assertions, self._event_shape_in)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py
index c20e76c0b7367369865faf973377201c8b8b17e6..a640dfe7dfbcce96261589c7fc49107deaefdd54 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py
@@ -18,12 +18,31 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import bijector
 
-_allowed_symbols = ["Sigmoid"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "Sigmoid",
+]
+
+
+class Sigmoid(bijector.Bijector):
+  """Bijector which computes `Y = g(X) = 1 / (1 + exp(-X))`."""
+
+  def __init__(self, validate_args=False, name="sigmoid"):
+    super(Sigmoid, self).__init__(
+        event_ndims=0, validate_args=validate_args, name=name)
+
+  def _forward(self, x):
+    return math_ops.sigmoid(x)
+
+  def _inverse(self, y):
+    return math_ops.log(y) - math_ops.log1p(-y)
+
+  def _inverse_log_det_jacobian(self, y):
+    return -math_ops.log(y) - math_ops.log1p(-y)
+
+  def _forward_log_det_jacobian(self, x):
+    return -nn_ops.softplus(-x) - nn_ops.softplus(x)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered.py b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered.py
index 448125230d24066697624bce03fed71a2c2f00b1..223bc9d042c69be05b0e578835a31ed6e83c0c97 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered.py
@@ -18,12 +18,22 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid_centered_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.contrib.distributions.python.ops.bijectors import softmax_centered
 
-_allowed_symbols = ["SigmoidCentered"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "SigmoidCentered",
+]
+
+
+class SigmoidCentered(softmax_centered.SoftmaxCentered):
+  """Bijector which computes Y = g(X) = exp([X 0]) / (1 + exp(-X)).
+
+  Equivalent to: `bijector.SoftmaxCentered(event_ndims=0)`.
+
+  See `bijector.SoftmaxCentered` for more details.
+  """
+
+  def __init__(self, validate_args=False, name="sigmoid_centered"):
+    super(SigmoidCentered, self).__init__(
+        event_ndims=0, validate_args=validate_args, name=name)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py
index b3cf03c24612f5c618c71c0a8615f272acdf2d10..3a75e4ae9495793901b0da91a5aa3982aab35852 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py
@@ -18,12 +18,162 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.sinh_arcsinh_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+import numpy as np
 
-_allowed_symbols = ["SinhArcsinh"]
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "SinhArcsinh",
+]
+
+
+def _sqrtx2p1(x):
+  """Implementation of `sqrt(1 + x**2)` which is stable despite large `x`."""
+  return array_ops.where(
+      math_ops.abs(x) * np.sqrt(np.finfo(x.dtype.as_numpy_dtype).eps) <= 1.,
+      math_ops.sqrt(x**2. + 1.),
+      # For large x, calculating x**2 can overflow. This can be alleviated by
+      # considering:
+      # sqrt(1 + x**2)
+      # = exp(0.5 log(1 + x**2))
+      # = exp(0.5 log(x**2 * (1 + x**-2)))
+      # = exp(log(x) + 0.5 * log(1 + x**-2))
+      # = |x| * exp(0.5 log(1 + x**-2))
+      # = |x| * sqrt(1 + x**-2)
+      # We omit the last term in this approximation.
+      # When |x| > 1 / sqrt(machineepsilon), the second term will be 1,
+      # due to sqrt(1 + x**-2) = 1. This is also true with the gradient term,
+      # and higher order gradients, since the first order derivative of
+      # sqrt(1 + x**-2) is -2 * x**-3 / (1 + x**-2) = -2 / (x**3 + x),
+      # and all nth-order derivatives will be O(x**-(n + 2)). This makes any
+      # gradient terms that contain any derivatives of sqrt(1 + x**-2) vanish.
+      math_ops.abs(x))
+
+
+class SinhArcsinh(bijector.Bijector):
+  """Compute `Y = g(X) = Sinh( (Arcsinh(X) + skewness) * tailweight )`.
+
+  For `skewness in (-inf, inf)` and `tailweight in (0, inf)`, this
+  transformation is a
+  diffeomorphism of the real line `(-inf, inf)`.  The inverse transform is
+  `X = g^{-1}(Y) = Sinh( ArcSinh(Y) / tailweight - skewness )`.
+
+  The `SinhArcsinh` transformation of the Normal is described in
+  [Sinh-arcsinh distributions](https://www.jstor.org/stable/27798865)
+  This Bijector allows a similar transformation of any distribution supported on
+  `(-inf, inf)`.
+
+  #### Meaning of the parameters
+
+  * If `skewness = 0` and `tailweight = 1`, this transform is the identity.
+  * Positive (negative) `skewness` leads to positive (negative) skew.
+    * positive skew means, for unimodal `X` centered at zero, the mode of `Y` is
+      "tilted" to the right.
+    * positive skew means positive values of `Y` become more likely, and
+      negative values become less likely.
+  * Larger (smaller) `tailweight` leads to fatter (thinner) tails.
+    * Fatter tails mean larger values of `|Y|` become more likely.
+    * If `X` is a unit Normal, `tailweight < 1` leads to a distribution that is
+      "flat" around `Y = 0`, and a very steep drop-off in the tails.
+    * If `X` is a unit Normal, `tailweight > 1` leads to a distribution more
+      peaked at the mode with heavier tails.
+
+  To see the argument about the tails, note that for `|X| >> 1` and
+  `|X| >> (|skewness| * tailweight)**tailweight`, we have
+  `Y approx 0.5 X**tailweight e**(sign(X) skewness * tailweight)`.
+  """
+
+  def __init__(self,
+               skewness=None,
+               tailweight=None,
+               event_ndims=0,
+               validate_args=False,
+               name="SinhArcsinh"):
+    """Instantiates the `SinhArcsinh` bijector.
+
+    Args:
+      skewness:  Skewness parameter.  Float-type `Tensor`.  Default is `0`
+        of type `float32`.
+      tailweight:  Tailweight parameter.  Positive `Tensor` of same `dtype` as
+        `skewness` and broadcastable `shape`.  Default is `1` of type `float32`.
+      event_ndims: Python scalar indicating the number of dimensions associated
+        with a particular draw from the distribution.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+    """
+    self._graph_parents = []
+    self._name = name
+    self._validate_args = validate_args
+    with self._name_scope("init", values=[skewness, tailweight]):
+      tailweight = 1. if tailweight is None else tailweight
+      skewness = 0. if skewness is None else skewness
+      self._skewness = ops.convert_to_tensor(
+          skewness, name="skewness")
+      self._tailweight = ops.convert_to_tensor(
+          tailweight, name="tailweight", dtype=self._skewness.dtype)
+      check_ops.assert_same_float_dtype([self._skewness, self._tailweight])
+      if validate_args:
+        self._tailweight = control_flow_ops.with_dependencies([
+            check_ops.assert_positive(
+                self._tailweight,
+                message="Argument tailweight was not positive")
+        ], self._tailweight)
+    super(SinhArcsinh, self).__init__(
+        event_ndims=event_ndims, validate_args=validate_args, name=name)
+
+  @property
+  def skewness(self):
+    """The `skewness` in: `Y  = Sinh((Arcsinh(X) + skewness) * tailweight)`."""
+    return self._skewness
+
+  @property
+  def tailweight(self):
+    """The `tailweight` in: `Y = Sinh((Arcsinh(X) + skewness) * tailweight)`."""
+    return self._tailweight
+
+  def _forward(self, x):
+    return math_ops.sinh((math_ops.asinh(x) + self.skewness) * self.tailweight)
+
+  def _inverse(self, y):
+    return math_ops.sinh(math_ops.asinh(y) / self.tailweight - self.skewness)
+
+  def _inverse_log_det_jacobian(self, y):
+    # x = sinh(arcsinh(y) / tailweight - skewness)
+    # Using sinh' = cosh, arcsinh'(y) = 1 / sqrt(y**2 + 1),
+    # dx/dy
+    # = cosh(arcsinh(y) / tailweight - skewness)
+    #     / (tailweight * sqrt(y**2 + 1))
+    event_dims = self._event_dims_tensor(y)
+    return math_ops.reduce_sum(
+        # This is computed inside the log to avoid catastrophic cancellations
+        # from cosh((arcsinh(y) / tailweight) - skewness) and sqrt(x**2 + 1).
+        math_ops.log(math_ops.cosh(
+            math_ops.asinh(y) / self.tailweight - self.skewness)
+                     # TODO(srvasude): Consider using cosh(arcsinh(x)) in cases
+                     # where (arcsinh(x) / tailweight) - skewness ~= arcsinh(x).
+                     / _sqrtx2p1(y))
+        - math_ops.log(self.tailweight),
+        axis=event_dims)
+
+  def _forward_log_det_jacobian(self, x):
+    # y = sinh((arcsinh(x) + skewness) * tailweight)
+    # Using sinh' = cosh, arcsinh'(x) = 1 / sqrt(x**2 + 1),
+    # dy/dx
+    # = cosh((arcsinh(x) + skewness) * tailweight) * tailweight / sqrt(x**2 + 1)
+    event_dims = self._event_dims_tensor(x)
+    return math_ops.reduce_sum(
+        # This is computed inside the log to avoid catastrophic cancellations
+        # from cosh((arcsinh(x) + skewness) * tailweight) and sqrt(x**2 + 1).
+        math_ops.log(math_ops.cosh(
+            (math_ops.asinh(x) + self.skewness) * self.tailweight)
+                     # TODO(srvasude): Consider using cosh(arcsinh(x)) in cases
+                     # where (arcsinh(x) + skewness) * tailweight ~= arcsinh(x).
+                     / _sqrtx2p1(x))
+        + math_ops.log(self.tailweight),
+        axis=event_dims)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh_impl.py
deleted file mode 100644
index 3a75e4ae9495793901b0da91a5aa3982aab35852..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh_impl.py
+++ /dev/null
@@ -1,179 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""SinhArcsinh bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector
-
-__all__ = [
-    "SinhArcsinh",
-]
-
-
-def _sqrtx2p1(x):
-  """Implementation of `sqrt(1 + x**2)` which is stable despite large `x`."""
-  return array_ops.where(
-      math_ops.abs(x) * np.sqrt(np.finfo(x.dtype.as_numpy_dtype).eps) <= 1.,
-      math_ops.sqrt(x**2. + 1.),
-      # For large x, calculating x**2 can overflow. This can be alleviated by
-      # considering:
-      # sqrt(1 + x**2)
-      # = exp(0.5 log(1 + x**2))
-      # = exp(0.5 log(x**2 * (1 + x**-2)))
-      # = exp(log(x) + 0.5 * log(1 + x**-2))
-      # = |x| * exp(0.5 log(1 + x**-2))
-      # = |x| * sqrt(1 + x**-2)
-      # We omit the last term in this approximation.
-      # When |x| > 1 / sqrt(machineepsilon), the second term will be 1,
-      # due to sqrt(1 + x**-2) = 1. This is also true with the gradient term,
-      # and higher order gradients, since the first order derivative of
-      # sqrt(1 + x**-2) is -2 * x**-3 / (1 + x**-2) = -2 / (x**3 + x),
-      # and all nth-order derivatives will be O(x**-(n + 2)). This makes any
-      # gradient terms that contain any derivatives of sqrt(1 + x**-2) vanish.
-      math_ops.abs(x))
-
-
-class SinhArcsinh(bijector.Bijector):
-  """Compute `Y = g(X) = Sinh( (Arcsinh(X) + skewness) * tailweight )`.
-
-  For `skewness in (-inf, inf)` and `tailweight in (0, inf)`, this
-  transformation is a
-  diffeomorphism of the real line `(-inf, inf)`.  The inverse transform is
-  `X = g^{-1}(Y) = Sinh( ArcSinh(Y) / tailweight - skewness )`.
-
-  The `SinhArcsinh` transformation of the Normal is described in
-  [Sinh-arcsinh distributions](https://www.jstor.org/stable/27798865)
-  This Bijector allows a similar transformation of any distribution supported on
-  `(-inf, inf)`.
-
-  #### Meaning of the parameters
-
-  * If `skewness = 0` and `tailweight = 1`, this transform is the identity.
-  * Positive (negative) `skewness` leads to positive (negative) skew.
-    * positive skew means, for unimodal `X` centered at zero, the mode of `Y` is
-      "tilted" to the right.
-    * positive skew means positive values of `Y` become more likely, and
-      negative values become less likely.
-  * Larger (smaller) `tailweight` leads to fatter (thinner) tails.
-    * Fatter tails mean larger values of `|Y|` become more likely.
-    * If `X` is a unit Normal, `tailweight < 1` leads to a distribution that is
-      "flat" around `Y = 0`, and a very steep drop-off in the tails.
-    * If `X` is a unit Normal, `tailweight > 1` leads to a distribution more
-      peaked at the mode with heavier tails.
-
-  To see the argument about the tails, note that for `|X| >> 1` and
-  `|X| >> (|skewness| * tailweight)**tailweight`, we have
-  `Y approx 0.5 X**tailweight e**(sign(X) skewness * tailweight)`.
-  """
-
-  def __init__(self,
-               skewness=None,
-               tailweight=None,
-               event_ndims=0,
-               validate_args=False,
-               name="SinhArcsinh"):
-    """Instantiates the `SinhArcsinh` bijector.
-
-    Args:
-      skewness:  Skewness parameter.  Float-type `Tensor`.  Default is `0`
-        of type `float32`.
-      tailweight:  Tailweight parameter.  Positive `Tensor` of same `dtype` as
-        `skewness` and broadcastable `shape`.  Default is `1` of type `float32`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-    """
-    self._graph_parents = []
-    self._name = name
-    self._validate_args = validate_args
-    with self._name_scope("init", values=[skewness, tailweight]):
-      tailweight = 1. if tailweight is None else tailweight
-      skewness = 0. if skewness is None else skewness
-      self._skewness = ops.convert_to_tensor(
-          skewness, name="skewness")
-      self._tailweight = ops.convert_to_tensor(
-          tailweight, name="tailweight", dtype=self._skewness.dtype)
-      check_ops.assert_same_float_dtype([self._skewness, self._tailweight])
-      if validate_args:
-        self._tailweight = control_flow_ops.with_dependencies([
-            check_ops.assert_positive(
-                self._tailweight,
-                message="Argument tailweight was not positive")
-        ], self._tailweight)
-    super(SinhArcsinh, self).__init__(
-        event_ndims=event_ndims, validate_args=validate_args, name=name)
-
-  @property
-  def skewness(self):
-    """The `skewness` in: `Y  = Sinh((Arcsinh(X) + skewness) * tailweight)`."""
-    return self._skewness
-
-  @property
-  def tailweight(self):
-    """The `tailweight` in: `Y = Sinh((Arcsinh(X) + skewness) * tailweight)`."""
-    return self._tailweight
-
-  def _forward(self, x):
-    return math_ops.sinh((math_ops.asinh(x) + self.skewness) * self.tailweight)
-
-  def _inverse(self, y):
-    return math_ops.sinh(math_ops.asinh(y) / self.tailweight - self.skewness)
-
-  def _inverse_log_det_jacobian(self, y):
-    # x = sinh(arcsinh(y) / tailweight - skewness)
-    # Using sinh' = cosh, arcsinh'(y) = 1 / sqrt(y**2 + 1),
-    # dx/dy
-    # = cosh(arcsinh(y) / tailweight - skewness)
-    #     / (tailweight * sqrt(y**2 + 1))
-    event_dims = self._event_dims_tensor(y)
-    return math_ops.reduce_sum(
-        # This is computed inside the log to avoid catastrophic cancellations
-        # from cosh((arcsinh(y) / tailweight) - skewness) and sqrt(x**2 + 1).
-        math_ops.log(math_ops.cosh(
-            math_ops.asinh(y) / self.tailweight - self.skewness)
-                     # TODO(srvasude): Consider using cosh(arcsinh(x)) in cases
-                     # where (arcsinh(x) / tailweight) - skewness ~= arcsinh(x).
-                     / _sqrtx2p1(y))
-        - math_ops.log(self.tailweight),
-        axis=event_dims)
-
-  def _forward_log_det_jacobian(self, x):
-    # y = sinh((arcsinh(x) + skewness) * tailweight)
-    # Using sinh' = cosh, arcsinh'(x) = 1 / sqrt(x**2 + 1),
-    # dy/dx
-    # = cosh((arcsinh(x) + skewness) * tailweight) * tailweight / sqrt(x**2 + 1)
-    event_dims = self._event_dims_tensor(x)
-    return math_ops.reduce_sum(
-        # This is computed inside the log to avoid catastrophic cancellations
-        # from cosh((arcsinh(x) + skewness) * tailweight) and sqrt(x**2 + 1).
-        math_ops.log(math_ops.cosh(
-            (math_ops.asinh(x) + self.skewness) * self.tailweight)
-                     # TODO(srvasude): Consider using cosh(arcsinh(x)) in cases
-                     # where (arcsinh(x) + skewness) * tailweight ~= arcsinh(x).
-                     / _sqrtx2p1(x))
-        + math_ops.log(self.tailweight),
-        axis=event_dims)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
index be6608f97880ae68e10b17c815bf2d8438293261..a9dcce6c526600f3b26c6bceb730417000917ce7 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
@@ -18,12 +18,223 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+import numpy as np
 
-_allowed_symbols = ["SoftmaxCentered"]
+from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import bijector
 
-remove_undocumented(__name__, _allowed_symbols)
+
+__all__ = [
+    "SoftmaxCentered",
+]
+
+
+class SoftmaxCentered(bijector.Bijector):
+  """Bijector which computes `Y = g(X) = exp([X 0]) / sum(exp([X 0]))`.
+
+  To implement [softmax](https://en.wikipedia.org/wiki/Softmax_function) as a
+  bijection, the forward transformation appends a value to the input and the
+  inverse removes this coordinate. The appended coordinate represents a pivot,
+  e.g., `softmax(x) = exp(x-c) / sum(exp(x-c))` where `c` is the implicit last
+  coordinate.
+
+  Because we append a coordinate, this bijector only supports `event_ndim in [0,
+  1]`, i.e., scalars and vectors.
+
+  Example Use:
+
+  ```python
+  bijector.SoftmaxCentered(event_ndims=1).forward(tf.log([2, 3, 4]))
+  # Result: [0.2, 0.3, 0.4, 0.1]
+  # Extra result: 0.1
+
+  bijector.SoftmaxCentered(event_ndims=1).inverse([0.2, 0.3, 0.4, 0.1])
+  # Result: tf.log([2, 3, 4])
+  # Extra coordinate removed.
+  ```
+
+  At first blush it may seem like the [Invariance of domain](
+  https://en.wikipedia.org/wiki/Invariance_of_domain) theorem implies this
+  implementation is not a bijection. However, the appended dimension
+  makes the (forward) image non-open and the theorem does not directly apply.
+  """
+
+  def __init__(self,
+               event_ndims=0,
+               validate_args=False,
+               name="softmax_centered"):
+    self._graph_parents = []
+    self._name = name
+    with self._name_scope("init", values=[event_ndims]):
+      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
+      event_ndims = tensor_util.constant_value(event_ndims)
+      if event_ndims is None or event_ndims not in [0, 1]:
+        raise ValueError("`event_ndims` must be a TF constant which is 0 or 1")
+    self._static_event_ndims = event_ndims
+    super(SoftmaxCentered, self).__init__(
+        event_ndims=event_ndims,
+        validate_args=validate_args,
+        name=name)
+
+  def _forward_event_shape(self, input_shape):
+    if input_shape.ndims is None:
+      return input_shape
+    if input_shape.ndims != self._static_event_ndims:
+      raise ValueError("input_shape.dims = %d != %d" %
+                       (input_shape.ndims, self._static_event_ndims))
+    if input_shape.ndims == 0:
+      return tensor_shape.TensorShape([2])
+    if input_shape.ndims == 1:
+      return tensor_shape.TensorShape(input_shape[0] + 1)
+    # Unreachable code:
+    raise ValueError("event_ndims = %d must be 0 or 1" % input_shape.ndims)
+
+  def _forward_event_shape_tensor(self, input_shape):
+    ndims = array_ops.shape(input_shape)
+    if self.validate_args:
+      # It is not possible for a negative shape so we need only check <= 1.
+      is_zero_or_one = check_ops.assert_equal(
+          ndims, 0 if self._static_event_ndims == 0 else 1,
+          message="event_ndims must be 0 or 1")
+      ndims = control_flow_ops.with_dependencies([is_zero_or_one], ndims)
+    if self._static_event_ndims == 0:
+      return ops.convert_to_tensor(
+          [2], dtype=dtypes.int32, name="output_shape")
+    return input_shape + 1
+
+  def _inverse_event_shape(self, output_shape):
+    if output_shape.ndims is None:
+      return output_shape
+    if output_shape.ndims != 1:
+      raise ValueError("output_shape.ndims = %d != 1" % output_shape.ndims)
+    if self._static_event_ndims == 0:
+      return tensor_shape.TensorShape([])
+    return tensor_shape.TensorShape(output_shape[0] - 1)
+
+  def _inverse_event_shape_tensor(self, output_shape):
+    ndims = array_ops.shape(output_shape)[0]
+    if self.validate_args:
+      # It is not possible for a negative shape so we need only check <= 1.
+      is_one = check_ops.assert_equal(
+          ndims, 1, message="event_ndims must be 1")
+      ndims = control_flow_ops.with_dependencies([is_one], ndims)
+    if self._static_event_ndims == 0:
+      return ops.convert_to_tensor([], dtype=dtypes.int32, name="output_shape")
+    return array_ops.expand_dims(output_shape[0] - 1, dim=0)
+
+  def _forward(self, x):
+    # Pad the last dim with a zeros vector. We need this because it lets us
+    # infer the scale in the inverse function.
+    y = array_ops.expand_dims(x, dim=-1) if self._static_event_ndims == 0 else x
+    y = distribution_util.pad(y, axis=-1, back=True)
+
+    # Set shape hints.
+    if x.shape.ndims is not None:
+      shape = x.shape.as_list()
+      if self._static_event_ndims == 0:
+        shape += [2]
+      elif shape[-1] is not None:
+        shape[-1] += 1
+      shape = tensor_shape.TensorShape(shape)
+      y.shape.assert_is_compatible_with(shape)
+      y.set_shape(shape)
+
+    # Since we only support event_ndims in [0, 1] and we do padding, we always
+    # reduce over the last dimension, i.e., dim=-1 (which is the default).
+    return nn_ops.softmax(y)
+
+  def _inverse(self, y):
+    # To derive the inverse mapping note that:
+    #   y[i] = exp(x[i]) / normalization
+    # and
+    #   y[end] = 1 / normalization.
+    # Thus:
+    # x[i] = log(exp(x[i])) - log(y[end]) - log(normalization)
+    #      = log(exp(x[i])/normalization) - log(y[end])
+    #      = log(y[i]) - log(y[end])
+    shape = (np.asarray(y.shape.as_list(), dtype=np.int32)
+             if y.shape.is_fully_defined()
+             else array_ops.shape(y, name="shape"))
+    ndims = distribution_util.prefer_static_rank(y)
+
+    # Do this first to make sure CSE catches that it'll happen again in
+    # _inverse_log_det_jacobian.
+    x = math_ops.log(y)
+
+    # We now extract the last coordinate of the rightmost dimension.
+    # Our trick is to slice from [0,0,...,shape[-1]-1] to shape[:-1]+[1].
+    begin = array_ops.one_hot(indices=ndims-1,
+                              depth=ndims,
+                              on_value=shape[-1]-np.array(1, dtype=shape.dtype),
+                              dtype=shape.dtype)
+    size = array_ops.concat([shape[:-1], np.asarray([1], dtype=shape.dtype)], 0)
+    log_normalization = -array_ops.strided_slice(x, begin, begin + size)
+
+    # Here we slice out all but the last coordinate; see above for idea.
+    begin = array_ops.zeros_like(shape)
+    size = array_ops.concat([shape[:-1], [shape[-1] - 1]], 0)
+    x = array_ops.strided_slice(x, begin, begin + size)
+
+    x += log_normalization
+
+    if self._static_event_ndims == 0:
+      x = array_ops.squeeze(x, squeeze_dims=[ndims-1])
+
+    # Set shape hints.
+    if y.shape.ndims is not None:
+      shape = y.shape.as_list()
+      if self._static_event_ndims == 0:
+        shape = shape[:-1]
+      elif shape[-1] is not None:
+        shape[-1] -= 1
+      shape = tensor_shape.TensorShape(shape)
+      x.shape.assert_is_compatible_with(shape)
+      x.set_shape(shape)
+
+    return x
+
+  def _inverse_log_det_jacobian(self, y):
+    # WLOG, consider the vector case:
+    #   x = log(y[:-1]) - log(y[-1])
+    # where,
+    #   y[-1] = 1 - sum(y[:-1]).
+    # We have:
+    #   det{ dX/dY } = det{ diag(1 ./ y[:-1]) + 1 / y[-1] }
+    #                = det{ inv{ diag(y[:-1]) - y[:-1]' y[:-1] } }   (1)
+    #                = 1 / det{ diag(y[:-1]) - y[:-1]' y[:-1] }
+    #                = 1 / { (1 + y[:-1]' inv(diag(y[:-1])) y[:-1]) *
+    #                        det(diag(y[:-1])) }                     (2)
+    #                = 1 / { y[-1] prod(y[:-1]) }
+    #                = 1 / prod(y)
+    # (1) - https://en.wikipedia.org/wiki/Sherman%E2%80%93Morrison_formula
+    #       or by noting that det{ dX/dY } = 1 / det{ dY/dX } from Bijector
+    #       docstring "Tip".
+    # (2) - https://en.wikipedia.org/wiki/Matrix_determinant_lemma
+    return -math_ops.reduce_sum(math_ops.log(y), axis=-1)
+
+  def _forward_log_det_jacobian(self, x):
+    if self._static_event_ndims == 0:
+      return x - 2. * nn_ops.softplus(x)
+    else:
+      # This code is similar to nn_ops.log_softmax but different because we have
+      # an implicit zero column to handle. I.e., instead of:
+      #   reduce_sum(logits - reduce_sum(exp(logits), dim))
+      # we must do:
+      #   log_normalization = 1 + reduce_sum(exp(logits))
+      #   -log_normalization + reduce_sum(logits - log_normalization)
+      log_normalization = nn_ops.softplus(
+          math_ops.reduce_logsumexp(x, axis=-1, keep_dims=True))
+      fldj = (-log_normalization +
+              math_ops.reduce_sum(x - log_normalization,
+                                  axis=-1,
+                                  keep_dims=True))
+      return array_ops.squeeze(fldj, squeeze_dims=-1)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered_impl.py
deleted file mode 100644
index 8645cc1b6b04be75a419342591272f07a4a1711c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered_impl.py
+++ /dev/null
@@ -1,245 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""SoftmaxCentered bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops.distributions import bijector
-
-
-__all__ = [
-    "SoftmaxCentered",
-]
-
-
-class SoftmaxCentered(bijector.Bijector):
-  """Bijector which computes `Y = g(X) = exp([X 0]) / sum(exp([X 0]))`.
-
-  To implement [softmax](https://en.wikipedia.org/wiki/Softmax_function) as a
-  bijection, the forward transformation appends a value to the input and the
-  inverse removes this coordinate. The appended coordinate represents a pivot,
-  e.g., `softmax(x) = exp(x-c) / sum(exp(x-c))` where `c` is the implicit last
-  coordinate.
-
-  Because we append a coordinate, this bijector only supports `event_ndim in [0,
-  1]`, i.e., scalars and vectors.
-
-  Example Use:
-
-  ```python
-  bijector.SoftmaxCentered(event_ndims=1).forward(tf.log([2, 3, 4]))
-  # Result: [0.2, 0.3, 0.4, 0.1]
-  # Extra result: 0.1
-
-  bijector.SoftmaxCentered(event_ndims=1).inverse([0.2, 0.3, 0.4, 0.1])
-  # Result: tf.log([2, 3, 4])
-  # Extra coordinate removed.
-  ```
-
-  At first blush it may seem like the [Invariance of domain](
-  https://en.wikipedia.org/wiki/Invariance_of_domain) theorem implies this
-  implementation is not a bijection. However, the appended dimension
-  makes the (forward) image non-open and the theorem does not directly apply.
-  """
-
-  def __init__(self,
-               event_ndims=0,
-               validate_args=False,
-               name="softmax_centered"):
-    self._graph_parents = []
-    self._name = name
-    with self._name_scope("init", values=[event_ndims]):
-      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-      event_ndims = tensor_util.constant_value(event_ndims)
-      if event_ndims is None or event_ndims not in [0, 1]:
-        raise ValueError("`event_ndims` must be a TF constant which is 0 or 1")
-    self._static_event_ndims = event_ndims
-    super(SoftmaxCentered, self).__init__(
-        event_ndims=event_ndims,
-        validate_args=validate_args,
-        name=name)
-
-  def _forward_event_shape(self, input_shape):
-    if input_shape.ndims is None:
-      return input_shape
-    if input_shape.ndims != self._static_event_ndims:
-      raise ValueError("input_shape.dims = %d != %d" %
-                       (input_shape.ndims, self._static_event_ndims))
-    if input_shape.ndims == 0:
-      return tensor_shape.TensorShape([2])
-    if input_shape.ndims == 1:
-      return tensor_shape.TensorShape(input_shape[0] + 1)
-    # Unreachable code:
-    raise ValueError("event_ndims = %d must be 0 or 1" % input_shape.ndims)
-
-  def _forward_event_shape_tensor(self, input_shape):
-    ndims = array_ops.shape(input_shape)
-    if self.validate_args:
-      # It is not possible for a negative shape so we need only check <= 1.
-      is_zero_or_one = check_ops.assert_equal(
-          ndims, 0 if self._static_event_ndims == 0 else 1,
-          message="event_ndims must be 0 or 1")
-      ndims = control_flow_ops.with_dependencies([is_zero_or_one], ndims)
-    if self._static_event_ndims == 0:
-      return ops.convert_to_tensor(
-          [2], dtype=dtypes.int32, name="output_shape")
-    return input_shape + 1
-
-  def _inverse_event_shape(self, output_shape):
-    if output_shape.ndims is None:
-      return output_shape
-    if output_shape.ndims != 1:
-      raise ValueError("output_shape.ndims = %d != 1" % output_shape.ndims)
-    if self._static_event_ndims == 0:
-      return tensor_shape.TensorShape([])
-    return tensor_shape.TensorShape(output_shape[0] - 1)
-
-  def _inverse_event_shape_tensor(self, output_shape):
-    ndims = array_ops.shape(output_shape)[0]
-    if self.validate_args:
-      # It is not possible for a negative shape so we need only check <= 1.
-      is_one = check_ops.assert_equal(
-          ndims, 1, message="event_ndims must be 1")
-      ndims = control_flow_ops.with_dependencies([is_one], ndims)
-    if self._static_event_ndims == 0:
-      return ops.convert_to_tensor([], dtype=dtypes.int32, name="output_shape")
-    return array_ops.expand_dims(output_shape[0] - 1, dim=0)
-
-  def _forward(self, x):
-    # Pad the last dim with a zeros vector. We need this because it lets us
-    # infer the scale in the inverse function.
-    y = array_ops.expand_dims(x, dim=-1) if self._static_event_ndims == 0 else x
-    ndims = (y.get_shape().ndims if y.get_shape().ndims is not None
-             else array_ops.rank(y))
-    y = array_ops.pad(y,
-                      paddings=array_ops.concat(
-                          (array_ops.zeros(
-                              (ndims - 1, 2), dtype=dtypes.int32), [[0, 1]]),
-                          0))
-
-    # Set shape hints.
-    if x.get_shape().ndims is not None:
-      shape = x.get_shape().as_list()
-      if self._static_event_ndims == 0:
-        shape += [2]
-      elif shape[-1] is not None:
-        shape[-1] += 1
-      shape = tensor_shape.TensorShape(shape)
-      y.get_shape().assert_is_compatible_with(shape)
-      y.set_shape(shape)
-
-    # Since we only support event_ndims in [0, 1] and we do padding, we always
-    # reduce over the last dimension, i.e., dim=-1 (which is the default).
-    return nn_ops.softmax(y)
-
-  def _inverse(self, y):
-    # To derive the inverse mapping note that:
-    #   y[i] = exp(x[i]) / normalization
-    # and
-    #   y[end] = 1 / normalization.
-    # Thus:
-    # x[i] = log(exp(x[i])) - log(y[end]) - log(normalization)
-    #      = log(exp(x[i])/normalization) - log(y[end])
-    #      = log(y[i]) - log(y[end])
-    shape = (np.asarray(y.get_shape().as_list(), dtype=np.int32)
-             if y.get_shape().is_fully_defined()
-             else array_ops.shape(y, name="shape"))
-    ndims = y.get_shape().ndims or math_ops.rank(y, name="ndims")
-
-    # Do this first to make sure CSE catches that it'll happen again in
-    # _inverse_log_det_jacobian.
-    x = math_ops.log(y)
-
-    # We now extract the last coordinate of the rightmost dimension.
-    # Our trick is to slice from [0,0,...,shape[-1]-1] to shape[:-1]+[1].
-    begin = array_ops.one_hot(indices=ndims-1,
-                              depth=ndims,
-                              on_value=shape[-1]-np.array(1, dtype=shape.dtype),
-                              dtype=shape.dtype)
-    size = array_ops.concat([shape[:-1], np.asarray([1], dtype=shape.dtype)], 0)
-    log_normalization = -array_ops.strided_slice(x, begin, begin + size)
-
-    # Here we slice out all but the last coordinate; see above for idea.
-    begin = array_ops.zeros_like(shape)
-    size = array_ops.concat([shape[:-1], [shape[-1] - 1]], 0)
-    x = array_ops.strided_slice(x, begin, begin + size)
-
-    x += log_normalization
-
-    if self._static_event_ndims == 0:
-      x = array_ops.squeeze(x, squeeze_dims=[ndims-1])
-
-    # Set shape hints.
-    if y.get_shape().ndims is not None:
-      shape = y.get_shape().as_list()
-      if self._static_event_ndims == 0:
-        shape = shape[:-1]
-      elif shape[-1] is not None:
-        shape[-1] -= 1
-      shape = tensor_shape.TensorShape(shape)
-      x.get_shape().assert_is_compatible_with(shape)
-      x.set_shape(shape)
-
-    return x
-
-  def _inverse_log_det_jacobian(self, y):
-    # WLOG, consider the vector case:
-    #   x = log(y[:-1]) - log(y[-1])
-    # where,
-    #   y[-1] = 1 - sum(y[:-1]).
-    # We have:
-    #   det{ dX/dY } = det{ diag(1 ./ y[:-1]) + 1 / y[-1] }
-    #                = det{ inv{ diag(y[:-1]) - y[:-1]' y[:-1] } }   (1)
-    #                = 1 / det{ diag(y[:-1]) - y[:-1]' y[:-1] }
-    #                = 1 / { (1 + y[:-1]' inv(diag(y[:-1])) y[:-1]) *
-    #                        det(diag(y[:-1])) }                     (2)
-    #                = 1 / { y[-1] prod(y[:-1]) }
-    #                = 1 / prod(y)
-    # (1) - https://en.wikipedia.org/wiki/Sherman%E2%80%93Morrison_formula
-    #       or by noting that det{ dX/dY } = 1 / det{ dY/dX } from Bijector
-    #       docstring "Tip".
-    # (2) - https://en.wikipedia.org/wiki/Matrix_determinant_lemma
-    return -math_ops.reduce_sum(math_ops.log(y), axis=-1)
-
-  def _forward_log_det_jacobian(self, x):
-    if self._static_event_ndims == 0:
-      return x - 2. * nn_ops.softplus(x)
-    else:
-      # This code is similar to nn_ops.log_softmax but different because we have
-      # an implicit zero column to handle. I.e., instead of:
-      #   reduce_sum(logits - reduce_sum(exp(logits), dim))
-      # we must do:
-      #   log_normalization = 1 + reduce_sum(exp(logits))
-      #   -log_normalization + reduce_sum(logits - log_normalization)
-      log_normalization = nn_ops.softplus(
-          math_ops.reduce_logsumexp(x, axis=-1, keep_dims=True))
-      fldj = (-log_normalization +
-              math_ops.reduce_sum(x - log_normalization,
-                                  axis=-1,
-                                  keep_dims=True))
-      return array_ops.squeeze(fldj, squeeze_dims=-1)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py b/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py
index 250a1144b53bb43271ff7ee494604d9bae6feda8..81957fcf78922fa15fd20a25d144071f431161ae 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py
@@ -18,12 +18,127 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.softplus_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import bijector
+from tensorflow.python.ops.distributions import util as distribution_util
 
-_allowed_symbols = ["Softplus"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "Softplus",
+]
+
+
+class Softplus(bijector.Bijector):
+  """Bijector which computes `Y = g(X) = Log[1 + exp(X)]`.
+
+  The softplus `Bijector` has the following two useful properties:
+
+  * The domain is the positive real numbers
+  * `softplus(x) approx x`, for large `x`, so it does not overflow as easily as
+    the `Exp` `Bijector`.
+
+  The optional nonzero `hinge_softness` parameter changes the transition at
+  zero.  With `hinge_softness = c`, the bijector is:
+
+    ```f_c(x) := c * g(x / c) = c * Log[1 + exp(x / c)].```
+
+  For large `x >> 1`, `c * Log[1 + exp(x / c)] approx c * Log[exp(x / c)] = x`,
+  so the behavior for large `x` is the same as the standard softplus.
+
+  As `c > 0` approaches 0 from the right, `f_c(x)` becomes less and less soft,
+  approaching `max(0, x)`.
+
+  * `c = 1` is the default.
+  * `c > 0` but small means `f(x) approx ReLu(x) = max(0, x)`.
+  * `c < 0` flips sign and reflects around the `y-axis`: `f_{-c}(x) = -f_c(-x)`.
+  * `c = 0` results in a non-bijective transformation and triggers an exception.
+
+    Example Use:
+
+    ```python
+    # Create the Y=g(X)=softplus(X) transform which works only on Tensors with 1
+    # batch ndim and 2 event ndims (i.e., vector of matrices).
+    softplus = Softplus(event_ndims=2)
+    x = [[[1., 2],
+          [3, 4]],
+         [[5, 6],
+          [7, 8]]]
+    log(1 + exp(x)) == softplus.forward(x)
+    log(exp(x) - 1) == softplus.inverse(x)
+    ```
+
+    Note: log(.) and exp(.) are applied element-wise but the Jacobian is a
+    reduction over the event space.
+  """
+
+  @distribution_util.AppendDocstring(
+      kwargs_dict={
+          "hinge_softness": (
+              "Nonzero floating point `Tensor`.  Controls the softness of what "
+              "would otherwise be a kink at the origin.  Default is 1.0")})
+  def __init__(self,
+               event_ndims=0,
+               hinge_softness=None,
+               validate_args=False,
+               name="softplus"):
+    with ops.name_scope(name, values=[hinge_softness]):
+      if hinge_softness is not None:
+        self._hinge_softness = ops.convert_to_tensor(
+            hinge_softness, name="hinge_softness")
+      else:
+        self._hinge_softness = None
+      if validate_args:
+        nonzero_check = check_ops.assert_none_equal(
+            ops.convert_to_tensor(
+                0, dtype=self.hinge_softness.dtype),
+            self.hinge_softness,
+            message="hinge_softness must be non-zero")
+        self._hinge_softness = control_flow_ops.with_dependencies(
+            [nonzero_check], self.hinge_softness)
+
+    super(Softplus, self).__init__(
+        event_ndims=event_ndims,
+        validate_args=validate_args,
+        name=name)
+
+  def _forward(self, x):
+    if self.hinge_softness is None:
+      return nn_ops.softplus(x)
+    hinge_softness = math_ops.cast(self.hinge_softness, x.dtype)
+    return hinge_softness * nn_ops.softplus(x / hinge_softness)
+
+  def _inverse(self, y):
+    if self.hinge_softness is None:
+      return distribution_util.softplus_inverse(y)
+    hinge_softness = math_ops.cast(self.hinge_softness, y.dtype)
+    return hinge_softness * distribution_util.softplus_inverse(
+        y / hinge_softness)
+
+  def _inverse_log_det_jacobian(self, y):
+    # Could also do:
+    #   ildj = math_ops.reduce_sum(y - distribution_util.softplus_inverse(y),
+    #                              axis=event_dims)
+    # but the following is more numerically stable. Ie,
+    # Y = Log[1 + exp{X}] ==> X = Log[exp{Y} - 1]
+    # ==> dX/dY = exp{Y} / (exp{Y} - 1)
+    #           = 1 / (1 - exp{-Y}),
+    # which is the most stable for large Y > 0. For small Y, we use
+    # 1 - exp{-Y} approx Y.
+    if self.hinge_softness is not None:
+      y /= math_ops.cast(self.hinge_softness, y.dtype)
+    return -math_ops.reduce_sum(math_ops.log(-math_ops.expm1(-y)),
+                                axis=self._event_dims_tensor(y))
+
+  def _forward_log_det_jacobian(self, x):
+    if self.hinge_softness is not None:
+      x /= math_ops.cast(self.hinge_softness, x.dtype)
+    return -math_ops.reduce_sum(nn_ops.softplus(-x),
+                                axis=self._event_dims_tensor(x))
+
+  @property
+  def hinge_softness(self):
+    return self._hinge_softness
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softplus_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/softplus_impl.py
deleted file mode 100644
index 81957fcf78922fa15fd20a25d144071f431161ae..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softplus_impl.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Softplus bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops.distributions import bijector
-from tensorflow.python.ops.distributions import util as distribution_util
-
-
-__all__ = [
-    "Softplus",
-]
-
-
-class Softplus(bijector.Bijector):
-  """Bijector which computes `Y = g(X) = Log[1 + exp(X)]`.
-
-  The softplus `Bijector` has the following two useful properties:
-
-  * The domain is the positive real numbers
-  * `softplus(x) approx x`, for large `x`, so it does not overflow as easily as
-    the `Exp` `Bijector`.
-
-  The optional nonzero `hinge_softness` parameter changes the transition at
-  zero.  With `hinge_softness = c`, the bijector is:
-
-    ```f_c(x) := c * g(x / c) = c * Log[1 + exp(x / c)].```
-
-  For large `x >> 1`, `c * Log[1 + exp(x / c)] approx c * Log[exp(x / c)] = x`,
-  so the behavior for large `x` is the same as the standard softplus.
-
-  As `c > 0` approaches 0 from the right, `f_c(x)` becomes less and less soft,
-  approaching `max(0, x)`.
-
-  * `c = 1` is the default.
-  * `c > 0` but small means `f(x) approx ReLu(x) = max(0, x)`.
-  * `c < 0` flips sign and reflects around the `y-axis`: `f_{-c}(x) = -f_c(-x)`.
-  * `c = 0` results in a non-bijective transformation and triggers an exception.
-
-    Example Use:
-
-    ```python
-    # Create the Y=g(X)=softplus(X) transform which works only on Tensors with 1
-    # batch ndim and 2 event ndims (i.e., vector of matrices).
-    softplus = Softplus(event_ndims=2)
-    x = [[[1., 2],
-          [3, 4]],
-         [[5, 6],
-          [7, 8]]]
-    log(1 + exp(x)) == softplus.forward(x)
-    log(exp(x) - 1) == softplus.inverse(x)
-    ```
-
-    Note: log(.) and exp(.) are applied element-wise but the Jacobian is a
-    reduction over the event space.
-  """
-
-  @distribution_util.AppendDocstring(
-      kwargs_dict={
-          "hinge_softness": (
-              "Nonzero floating point `Tensor`.  Controls the softness of what "
-              "would otherwise be a kink at the origin.  Default is 1.0")})
-  def __init__(self,
-               event_ndims=0,
-               hinge_softness=None,
-               validate_args=False,
-               name="softplus"):
-    with ops.name_scope(name, values=[hinge_softness]):
-      if hinge_softness is not None:
-        self._hinge_softness = ops.convert_to_tensor(
-            hinge_softness, name="hinge_softness")
-      else:
-        self._hinge_softness = None
-      if validate_args:
-        nonzero_check = check_ops.assert_none_equal(
-            ops.convert_to_tensor(
-                0, dtype=self.hinge_softness.dtype),
-            self.hinge_softness,
-            message="hinge_softness must be non-zero")
-        self._hinge_softness = control_flow_ops.with_dependencies(
-            [nonzero_check], self.hinge_softness)
-
-    super(Softplus, self).__init__(
-        event_ndims=event_ndims,
-        validate_args=validate_args,
-        name=name)
-
-  def _forward(self, x):
-    if self.hinge_softness is None:
-      return nn_ops.softplus(x)
-    hinge_softness = math_ops.cast(self.hinge_softness, x.dtype)
-    return hinge_softness * nn_ops.softplus(x / hinge_softness)
-
-  def _inverse(self, y):
-    if self.hinge_softness is None:
-      return distribution_util.softplus_inverse(y)
-    hinge_softness = math_ops.cast(self.hinge_softness, y.dtype)
-    return hinge_softness * distribution_util.softplus_inverse(
-        y / hinge_softness)
-
-  def _inverse_log_det_jacobian(self, y):
-    # Could also do:
-    #   ildj = math_ops.reduce_sum(y - distribution_util.softplus_inverse(y),
-    #                              axis=event_dims)
-    # but the following is more numerically stable. Ie,
-    # Y = Log[1 + exp{X}] ==> X = Log[exp{Y} - 1]
-    # ==> dX/dY = exp{Y} / (exp{Y} - 1)
-    #           = 1 / (1 - exp{-Y}),
-    # which is the most stable for large Y > 0. For small Y, we use
-    # 1 - exp{-Y} approx Y.
-    if self.hinge_softness is not None:
-      y /= math_ops.cast(self.hinge_softness, y.dtype)
-    return -math_ops.reduce_sum(math_ops.log(-math_ops.expm1(-y)),
-                                axis=self._event_dims_tensor(y))
-
-  def _forward_log_det_jacobian(self, x):
-    if self.hinge_softness is not None:
-      x /= math_ops.cast(self.hinge_softness, x.dtype)
-    return -math_ops.reduce_sum(nn_ops.softplus(-x),
-                                axis=self._event_dims_tensor(x))
-
-  @property
-  def hinge_softness(self):
-    return self._hinge_softness
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py b/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py
index d439f28884d8bd7f2b808317e10c5b5e44bfcfa2..00520bcda85e9527767e6342bf75f10667c264a8 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py
@@ -18,12 +18,132 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.weibull_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 
-_allowed_symbols = ["Weibull"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "Weibull",
+]
+
+
+class Weibull(bijector.Bijector):
+  """Compute `Y = g(X) = 1 - exp((-X / scale) ** concentration), X >= 0`.
+
+  This bijector maps inputs from `[0, inf]` to [0, 1]`. The inverse of the
+  bijector applied to a uniform random variable `X ~ U(0, 1) gives back a
+  random variable with the
+  [Weibull distribution](https://en.wikipedia.org/wiki/Weibull_distribution):
+
+  ```none
+  Y ~ Weibull(scale, concentration)
+  pdf(y; scale, concentration, y >= 0) = (scale / concentration) * (
+    scale / concentration) ** (concentration - 1) * exp(
+      -(y / scale) ** concentration)
+  ```
+  """
+
+  def __init__(self,
+               scale=1.,
+               concentration=1.,
+               event_ndims=0,
+               validate_args=False,
+               name="weibull"):
+    """Instantiates the `Weibull` bijector.
+
+    Args:
+      scale: Positive Float-type `Tensor` that is the same dtype and is
+        broadcastable with `concentration`.
+        This is `l` in `Y = g(X) = 1 - exp((-x / l) ** k)`.
+      concentration: Positive Float-type `Tensor` that is the same dtype and is
+        broadcastable with `scale`.
+        This is `k` in `Y = g(X) = 1 - exp((-x / l) ** k)`.
+      event_ndims: Python scalar indicating the number of dimensions associated
+        with a particular draw from the distribution.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+    """
+    self._graph_parents = []
+    self._name = name
+    self._validate_args = validate_args
+    with self._name_scope("init", values=[scale, concentration]):
+      self._scale = ops.convert_to_tensor(scale, name="scale")
+      self._concentration = ops.convert_to_tensor(
+          concentration, name="concentration")
+      check_ops.assert_same_float_dtype([self._scale, self._concentration])
+      if validate_args:
+        self._scale = control_flow_ops.with_dependencies([
+            check_ops.assert_positive(
+                self._scale,
+                message="Argument scale was not positive")
+        ], self._scale)
+        self._concentration = control_flow_ops.with_dependencies([
+            check_ops.assert_positive(
+                self._concentration,
+                message="Argument concentration was not positive")
+        ], self._concentration)
+
+    super(Weibull, self).__init__(
+        event_ndims=event_ndims,
+        validate_args=validate_args,
+        name=name)
+
+  @property
+  def scale(self):
+    """The `l` in `Y = g(X) = 1 - exp((-x / l) ** k)`."""
+    return self._scale
+
+  @property
+  def concentration(self):
+    """The `k` in `Y = g(X) = 1 - exp((-x / l) ** k)`."""
+    return self._concentration
+
+  def _forward(self, x):
+    x = self._maybe_assert_valid_x(x)
+    return -math_ops.expm1(-((x / self.scale) ** self.concentration))
+
+  def _inverse(self, y):
+    y = self._maybe_assert_valid_y(y)
+    return self.scale * (-math_ops.log1p(-y)) ** (1 / self.concentration)
+
+  def _inverse_log_det_jacobian(self, y):
+    y = self._maybe_assert_valid_y(y)
+    event_dims = self._event_dims_tensor(y)
+    return math_ops.reduce_sum(
+        -math_ops.log1p(-y) +
+        (1 / self.concentration - 1) * math_ops.log(-math_ops.log1p(-y)) +
+        math_ops.log(self.scale / self.concentration),
+        axis=event_dims)
+
+  def _forward_log_det_jacobian(self, x):
+    x = self._maybe_assert_valid_x(x)
+    event_dims = self._event_dims_tensor(x)
+    return math_ops.reduce_sum(
+        -(x / self.scale) ** self.concentration +
+        (self.concentration - 1) * math_ops.log(x) +
+        math_ops.log(self.concentration) +
+        -self.concentration * math_ops.log(self.scale),
+        axis=event_dims)
+
+  def _maybe_assert_valid_x(self, x):
+    if not self.validate_args:
+      return x
+    is_valid = check_ops.assert_non_negative(
+        x,
+        message="Forward transformation input must be at least {}.".format(0))
+    return control_flow_ops.with_dependencies([is_valid], x)
+
+  def _maybe_assert_valid_y(self, y):
+    if not self.validate_args:
+      return y
+    is_positive = check_ops.assert_non_negative(
+        y, message="Inverse transformation input must be greater than 0.")
+    less_than_one = check_ops.assert_less_equal(
+        y, constant_op.constant(1., y.dtype),
+        message="Inverse transformation input must be less than or equal to 1.")
+    return control_flow_ops.with_dependencies([is_positive, less_than_one], y)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/weibull_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/weibull_impl.py
deleted file mode 100644
index 00520bcda85e9527767e6342bf75f10667c264a8..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/weibull_impl.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Weibull bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector
-
-
-__all__ = [
-    "Weibull",
-]
-
-
-class Weibull(bijector.Bijector):
-  """Compute `Y = g(X) = 1 - exp((-X / scale) ** concentration), X >= 0`.
-
-  This bijector maps inputs from `[0, inf]` to [0, 1]`. The inverse of the
-  bijector applied to a uniform random variable `X ~ U(0, 1) gives back a
-  random variable with the
-  [Weibull distribution](https://en.wikipedia.org/wiki/Weibull_distribution):
-
-  ```none
-  Y ~ Weibull(scale, concentration)
-  pdf(y; scale, concentration, y >= 0) = (scale / concentration) * (
-    scale / concentration) ** (concentration - 1) * exp(
-      -(y / scale) ** concentration)
-  ```
-  """
-
-  def __init__(self,
-               scale=1.,
-               concentration=1.,
-               event_ndims=0,
-               validate_args=False,
-               name="weibull"):
-    """Instantiates the `Weibull` bijector.
-
-    Args:
-      scale: Positive Float-type `Tensor` that is the same dtype and is
-        broadcastable with `concentration`.
-        This is `l` in `Y = g(X) = 1 - exp((-x / l) ** k)`.
-      concentration: Positive Float-type `Tensor` that is the same dtype and is
-        broadcastable with `scale`.
-        This is `k` in `Y = g(X) = 1 - exp((-x / l) ** k)`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-    """
-    self._graph_parents = []
-    self._name = name
-    self._validate_args = validate_args
-    with self._name_scope("init", values=[scale, concentration]):
-      self._scale = ops.convert_to_tensor(scale, name="scale")
-      self._concentration = ops.convert_to_tensor(
-          concentration, name="concentration")
-      check_ops.assert_same_float_dtype([self._scale, self._concentration])
-      if validate_args:
-        self._scale = control_flow_ops.with_dependencies([
-            check_ops.assert_positive(
-                self._scale,
-                message="Argument scale was not positive")
-        ], self._scale)
-        self._concentration = control_flow_ops.with_dependencies([
-            check_ops.assert_positive(
-                self._concentration,
-                message="Argument concentration was not positive")
-        ], self._concentration)
-
-    super(Weibull, self).__init__(
-        event_ndims=event_ndims,
-        validate_args=validate_args,
-        name=name)
-
-  @property
-  def scale(self):
-    """The `l` in `Y = g(X) = 1 - exp((-x / l) ** k)`."""
-    return self._scale
-
-  @property
-  def concentration(self):
-    """The `k` in `Y = g(X) = 1 - exp((-x / l) ** k)`."""
-    return self._concentration
-
-  def _forward(self, x):
-    x = self._maybe_assert_valid_x(x)
-    return -math_ops.expm1(-((x / self.scale) ** self.concentration))
-
-  def _inverse(self, y):
-    y = self._maybe_assert_valid_y(y)
-    return self.scale * (-math_ops.log1p(-y)) ** (1 / self.concentration)
-
-  def _inverse_log_det_jacobian(self, y):
-    y = self._maybe_assert_valid_y(y)
-    event_dims = self._event_dims_tensor(y)
-    return math_ops.reduce_sum(
-        -math_ops.log1p(-y) +
-        (1 / self.concentration - 1) * math_ops.log(-math_ops.log1p(-y)) +
-        math_ops.log(self.scale / self.concentration),
-        axis=event_dims)
-
-  def _forward_log_det_jacobian(self, x):
-    x = self._maybe_assert_valid_x(x)
-    event_dims = self._event_dims_tensor(x)
-    return math_ops.reduce_sum(
-        -(x / self.scale) ** self.concentration +
-        (self.concentration - 1) * math_ops.log(x) +
-        math_ops.log(self.concentration) +
-        -self.concentration * math_ops.log(self.scale),
-        axis=event_dims)
-
-  def _maybe_assert_valid_x(self, x):
-    if not self.validate_args:
-      return x
-    is_valid = check_ops.assert_non_negative(
-        x,
-        message="Forward transformation input must be at least {}.".format(0))
-    return control_flow_ops.with_dependencies([is_valid], x)
-
-  def _maybe_assert_valid_y(self, y):
-    if not self.validate_args:
-      return y
-    is_positive = check_ops.assert_non_negative(
-        y, message="Inverse transformation input must be greater than 0.")
-    less_than_one = check_ops.assert_less_equal(
-        y, constant_op.constant(1., y.dtype),
-        message="Inverse transformation input must be less than or equal to 1.")
-    return control_flow_ops.with_dependencies([is_positive, less_than_one], y)
diff --git a/tensorflow/contrib/distributions/python/ops/cauchy.py b/tensorflow/contrib/distributions/python/ops/cauchy.py
index 8d59c1abfbc607c67b2bbca21f880743a43e5b2a..6f5d724a2a945ed8f9c159d8314327c6f994d1db 100644
--- a/tensorflow/contrib/distributions/python/ops/cauchy.py
+++ b/tensorflow/contrib/distributions/python/ops/cauchy.py
@@ -43,16 +43,17 @@ class Cauchy(distribution.Distribution):
   The probability density function (pdf) is,
 
   ```none
-  pdf(x; loc, scale) = 1 / (pi * scale * (1 + ((x - loc) / scale)**2))
+  pdf(x; loc, scale) = 1 / (pi scale (1 + z**2))
+  z = (x - loc) / scale
   ```
   where `loc` is the location, and `scale` is the scale.
 
   The Cauchy distribution is a member of the [location-scale family](
   https://en.wikipedia.org/wiki/Location-scale_family), i.e.
+  `Y ~ Cauchy(loc, scale)` is equivalent to,
 
   ```none
   X ~ Cauchy(loc=0, scale=1)
-  Y ~ Cauchy(loc=loc, scale=scale)
   Y = loc + scale * X
   ```
 
@@ -61,14 +62,16 @@ class Cauchy(distribution.Distribution):
   Examples of initialization of one or a batch of distributions.
 
   ```python
+  tfd = tf.contrib.distributions
+
   # Define a single scalar Cauchy distribution.
-  dist = Cauchy(loc=0., scale=3.)
+  dist = tfd.Cauchy(loc=0., scale=3.)
 
   # Evaluate the cdf at 1, returning a scalar.
   dist.cdf(1.)
 
   # Define a batch of two scalar valued Cauchy distributions.
-  dist = Cauchy(loc=[1, 2.], scale=[11, 22.])
+  dist = tfd.Cauchy(loc=[1, 2.], scale=[11, 22.])
 
   # Evaluate the pdf of the first distribution on 0, and the second on 1.5,
   # returning a length two tensor.
@@ -76,18 +79,17 @@ class Cauchy(distribution.Distribution):
 
   # Get 3 samples, returning a 3 x 2 tensor.
   dist.sample([3])
-  ```
-
-  Arguments are broadcast when possible.
 
-  ```python
+  # Arguments are broadcast when possible.
   # Define a batch of two scalar valued Cauchy distributions.
   # Both have median 1, but different scales.
-  dist = tf.contrib.distributions.Cauchy(loc=1., scale=[11, 22.])
+  dist = tfd.Cauchy(loc=1., scale=[11, 22.])
+
   # Evaluate the pdf of both distributions on the same point, 3.0,
   # returning a length 2 tensor.
-  dist.prob(3.0)
+  dist.prob(3.)
   ```
+
   """
 
   def __init__(self,
diff --git a/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py b/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
index 599c855cda434d9249187d5d154d50a8a8c49a6c..1d4c5660d8d73b7b6a7e758fc834ccfddeb5c8ea 100644
--- a/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
+++ b/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
@@ -121,7 +121,7 @@ class ConditionalTransformedDistribution(
     log_prob = self.distribution.log_prob(x, **distribution_kwargs)
     if self._is_maybe_event_override:
       log_prob = math_ops.reduce_sum(log_prob, self._reduce_event_indices)
-    return ildj + log_prob
+    return math_ops.cast(ildj, log_prob.dtype) + log_prob
 
   @distribution_util.AppendDocstring(kwargs_dict=_condition_kwargs_dict)
   def _prob(self, y, bijector_kwargs=None, distribution_kwargs=None):
@@ -143,7 +143,7 @@ class ConditionalTransformedDistribution(
     prob = self.distribution.prob(x, **distribution_kwargs)
     if self._is_maybe_event_override:
       prob = math_ops.reduce_prod(prob, self._reduce_event_indices)
-    return math_ops.exp(ildj) * prob
+    return math_ops.exp(math_ops.cast(ildj, prob.dtype)) * prob
 
   @distribution_util.AppendDocstring(kwargs_dict=_condition_kwargs_dict)
   def _log_cdf(self, y, bijector_kwargs=None, distribution_kwargs=None):
diff --git a/tensorflow/contrib/distributions/python/ops/deterministic.py b/tensorflow/contrib/distributions/python/ops/deterministic.py
index 850d08d1bd69ebc7661557d648e2bffe77e6a908..8049522e9f5dc26b244b7e710a9ae8b981efd6b6 100644
--- a/tensorflow/contrib/distributions/python/ops/deterministic.py
+++ b/tensorflow/contrib/distributions/python/ops/deterministic.py
@@ -290,8 +290,10 @@ class VectorDeterministic(_BaseDeterministic):
   #### Examples
 
   ```python
+  tfd = tf.contrib.distributions
+
   # Initialize a single VectorDeterministic supported at [0., 2.] in R^2.
-  constant = tf.contrib.distributions.Deterministic([0., 2.])
+  constant = tfd.Deterministic([0., 2.])
   constant.prob([0., 2.])
   ==> 1.
   constant.prob([0., 3.])
@@ -299,7 +301,7 @@ class VectorDeterministic(_BaseDeterministic):
 
   # Initialize a [3] batch of constants on R^2.
   loc = [[0., 1.], [2., 3.], [4., 5.]]
-  constant = constant_lib.VectorDeterministic(loc)
+  constant = tfd.VectorDeterministic(loc)
   constant.prob([[0., 1.], [1.9, 3.], [3.99, 5.]])
   ==> [1., 0., 0.]
   ```
diff --git a/tensorflow/contrib/distributions/python/ops/distribution_util.py b/tensorflow/contrib/distributions/python/ops/distribution_util.py
index 869b5698e57d199755ce1686a74a1eafe3b73e7d..289e1d50e1146a641c0cc433ece3465aed73b1c2 100644
--- a/tensorflow/contrib/distributions/python/ops/distribution_util.py
+++ b/tensorflow/contrib/distributions/python/ops/distribution_util.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 from tensorflow.contrib import linalg
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
@@ -330,54 +329,14 @@ def shapes_from_loc_and_scale(loc, scale, name="shapes_from_loc_and_scale"):
       else:
         loc_batch_shape = ops.convert_to_tensor(loc_batch_shape,
                                                 name="loc_batch_shape")
+      # This is defined in the core util module.
+      # pylint: disable=undefined-variable
       batch_shape = prefer_static_broadcast_shape(batch_shape, loc_batch_shape)
+      # pylint: enable=undefined-variable
 
   return batch_shape, event_shape
 
 
-def prefer_static_broadcast_shape(
-    shape1, shape2, name="prefer_static_broadcast_shape"):
-  """Convenience function which statically broadcasts shape when possible.
-
-  Args:
-    shape1:  `1-D` integer `Tensor`.  Already converted to tensor!
-    shape2:  `1-D` integer `Tensor`.  Already converted to tensor!
-    name:  A string name to prepend to created ops.
-
-  Returns:
-    The broadcast shape, either as `TensorShape` (if broadcast can be done
-      statically), or as a `Tensor`.
-  """
-  with ops.name_scope(name, values=[shape1, shape2]):
-    def make_shape_tensor(x):
-      return ops.convert_to_tensor(x, name="shape", dtype=dtypes.int32)
-
-    def get_tensor_shape(s):
-      if isinstance(s, tensor_shape.TensorShape):
-        return s
-      s_ = tensor_util.constant_value(make_shape_tensor(s))
-      if s_ is not None:
-        return tensor_shape.TensorShape(s_)
-      return None
-
-    def get_shape_tensor(s):
-      if not isinstance(s, tensor_shape.TensorShape):
-        return make_shape_tensor(s)
-      if s.is_fully_defined():
-        return make_shape_tensor(s.as_list())
-      raise ValueError("Cannot broadcast from partially "
-                       "defined `TensorShape`.")
-
-    shape1_ = get_tensor_shape(shape1)
-    shape2_ = get_tensor_shape(shape2)
-    if shape1_ is not None and shape2_ is not None:
-      return array_ops.broadcast_static_shape(shape1_, shape2_)
-
-    shape1_ = get_shape_tensor(shape1)
-    shape2_ = get_shape_tensor(shape2)
-    return array_ops.broadcast_dynamic_shape(shape1_, shape2_)
-
-
 def get_broadcast_shape(*tensors):
   """Get broadcast shape as a Python list of integers (preferred) or `Tensor`.
 
@@ -484,6 +443,44 @@ def maybe_check_scalar_distribution(
     return assertions
 
 
+def pad_mixture_dimensions(x, mixture_distribution, categorical_distribution,
+                           event_ndims):
+  """Pad dimensions of event tensors for mixture distributions.
+
+  See `Mixture._sample_n` and `MixtureSameFamily._sample_n` for usage examples.
+
+  Args:
+    x: event tensor to pad.
+    mixture_distribution: Base distribution of the mixture.
+    categorical_distribution: `Categorical` distribution that mixes the base
+      distribution.
+    event_ndims: Integer specifying the number of event dimensions in the event
+      tensor.
+
+  Returns:
+    A padded version of `x` that can broadcast with `categorical_distribution`.
+  """
+  with ops.name_scope("pad_mix_dims", values=[x]):
+    def _get_ndims(d):
+      if d.batch_shape.ndims is not None:
+        return d.batch_shape.ndims
+      return array_ops.shape(d.batch_shape_tensor())[0]
+    dist_batch_ndims = _get_ndims(mixture_distribution)
+    cat_batch_ndims = _get_ndims(categorical_distribution)
+    pad_ndims = array_ops.where(
+        categorical_distribution.is_scalar_batch(),
+        dist_batch_ndims,
+        dist_batch_ndims - cat_batch_ndims)
+    s = array_ops.shape(x)
+    x = array_ops.reshape(x, shape=array_ops.concat([
+        s[:-1],
+        array_ops.ones([pad_ndims], dtype=dtypes.int32),
+        s[-1:],
+        array_ops.ones([event_ndims], dtype=dtypes.int32),
+    ], axis=0))
+    return x
+
+
 def static_value(x):
   """Returns the static value of a `Tensor` or `None`."""
   return tensor_util.constant_value(ops.convert_to_tensor(x))
diff --git a/tensorflow/contrib/distributions/python/ops/gumbel.py b/tensorflow/contrib/distributions/python/ops/gumbel.py
index ba8d3c639b397422f0f6210ba9f48650f0da1e3e..d0efaefb8e78ddf4436e9e5a112d2c1cdddaf3b5 100644
--- a/tensorflow/contrib/distributions/python/ops/gumbel.py
+++ b/tensorflow/contrib/distributions/python/ops/gumbel.py
@@ -62,15 +62,17 @@ class _Gumbel(distribution.Distribution):
   Examples of initialization of one or a batch of distributions.
 
   ```python
+  tfd = tf.contrib.distributions
+
   # Define a single scalar Gumbel distribution.
-  dist = tf.contrib.distributions.Gumbel(loc=0., scale=3.)
+  dist = tfd.Gumbel(loc=0., scale=3.)
 
   # Evaluate the cdf at 1, returning a scalar.
   dist.cdf(1.)
 
   # Define a batch of two scalar valued Gumbels.
   # The first has mean 1 and scale 11, the second 2 and 22.
-  dist = tf.contrib.distributions.Gumbel(loc=[1, 2.], scale=[11, 22.])
+  dist = tfd.Gumbel(loc=[1, 2.], scale=[11, 22.])
 
   # Evaluate the pdf of the first distribution on 0, and the second on 1.5,
   # returning a length two tensor.
@@ -85,7 +87,7 @@ class _Gumbel(distribution.Distribution):
   ```python
   # Define a batch of two scalar valued Logistics.
   # Both have mean 1, but different scales.
-  dist = tf.contrib.distributions.Gumbel(loc=1., scale=[11, 22.])
+  dist = tfd.Gumbel(loc=1., scale=[11, 22.])
 
   # Evaluate the pdf of both distributions on the same point, 3.0,
   # returning a length 2 tensor.
diff --git a/tensorflow/contrib/distributions/python/ops/half_normal.py b/tensorflow/contrib/distributions/python/ops/half_normal.py
index 12059b6a9e199dc3ae00ac47a62ece9c9a147000..fc0751a6e0b78cb3d79bd3478e740bb05cd26428 100644
--- a/tensorflow/contrib/distributions/python/ops/half_normal.py
+++ b/tensorflow/contrib/distributions/python/ops/half_normal.py
@@ -84,6 +84,7 @@ class HalfNormal(distribution.Distribution):
   ```
 
   """
+
   def __init__(self,
                scale,
                validate_args=False,
@@ -120,7 +121,7 @@ class HalfNormal(distribution.Distribution):
 
   @staticmethod
   def _param_shapes(sample_shape):
-    return {'scale': ops.convert_to_tensor(sample_shape, dtype=dtypes.int32)}
+    return {"scale": ops.convert_to_tensor(sample_shape, dtype=dtypes.int32)}
 
   @property
   def scale(self):
diff --git a/tensorflow/contrib/distributions/python/ops/independent.py b/tensorflow/contrib/distributions/python/ops/independent.py
index 6a74ca9a0ae1ad30081d21cc15a65be052a99e2a..cbce005013281ff3c58c94d525d5ce7a865d725a 100644
--- a/tensorflow/contrib/distributions/python/ops/independent.py
+++ b/tensorflow/contrib/distributions/python/ops/independent.py
@@ -68,11 +68,11 @@ class Independent(distribution_lib.Distribution):
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   # Make independent distribution from a 2-batch Normal.
-  ind = ds.Independent(
-      distribution=ds.Normal(loc=[-1., 1], scale=[0.1, 0.5]),
+  ind = tfd.Independent(
+      distribution=tfd.Normal(loc=[-1., 1], scale=[0.1, 0.5]),
       reinterpreted_batch_ndims=1)
 
   # All batch dims have been "absorbed" into event dims.
@@ -80,8 +80,8 @@ class Independent(distribution_lib.Distribution):
   ind.event_shape  # ==> [2]
 
   # Make independent distribution from a 2-batch bivariate Normal.
-  ind = ds.Independent(
-      distribution=ds.MultivariateNormalDiag(
+  ind = tfd.Independent(
+      distribution=tfd.MultivariateNormalDiag(
           loc=[[-1., 1], [1, -1]],
           scale_identity_multiplier=[1., 0.5]),
       reinterpreted_batch_ndims=1)
diff --git a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
index 956dee38a378813434656a28a69c89b6ec1e8b72..ee4d86867d48b20e97757bcec57d452085814b80 100644
--- a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
+++ b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
@@ -88,8 +88,9 @@ class InverseGamma(distribution.Distribution):
   #### Examples
 
   ```python
-  dist = InverseGamma(concentration=3.0, rate=2.0)
-  dist2 = InverseGamma(concentration=[3.0, 4.0], rate=[2.0, 3.0])
+  tfd = tf.contrib.distributions
+  dist = tfd.InverseGamma(concentration=3.0, rate=2.0)
+  dist2 = tfd.InverseGamma(concentration=[3.0, 4.0], rate=[2.0, 3.0])
   ```
 
   """
diff --git a/tensorflow/contrib/distributions/python/ops/kumaraswamy.py b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d5d8773cf3e69a52554c87d656fea2835c8354
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py
@@ -0,0 +1,258 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The Kumaraswamy distribution class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import special_math_ops
+from tensorflow.python.ops.distributions import beta
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.util.tf_export import tf_export
+
+__all__ = [
+    "Kumaraswamy",
+]
+
+_kumaraswamy_sample_note = """Note: `x` must have dtype `self.dtype` and be in
+`[0, 1].` It must have a shape compatible with `self.batch_shape()`."""
+
+
+def _harmonic_number(x):
+  """Compute the harmonic number from its analytic continuation.
+
+  Derivation from [1] and Euler's constant [2].
+  [1] -
+  https://en.wikipedia.org/wiki/Digamma_function#Relation_to_harmonic_numbers
+  [2] - https://en.wikipedia.org/wiki/Euler%E2%80%93Mascheroni_constant
+
+
+  Args:
+    x: input float.
+
+  Returns:
+    z: The analytic continuation of the harmonic number for the input.
+
+  """
+  one = array_ops.ones([], dtype=x.dtype)
+  return math_ops.digamma(x + one) - math_ops.digamma(one)
+
+
+@tf_export("distributions.Kumaraswamy")
+class Kumaraswamy(beta.Beta):
+  """Kumaraswamy distribution.
+
+  The Kumaraswamy distribution is defined over the `(0, 1)` interval using
+  parameters
+  `concentration1` (aka "alpha") and `concentration0` (aka "beta").  It has a
+  shape similar to the Beta distribution, but is reparameterizeable.
+
+  #### Mathematical Details
+
+  The probability density function (pdf) is,
+
+  ```none
+  pdf(x; alpha, beta) = alpha * beta * x**(alpha - 1) * (1 - x**alpha)**(beta -
+  1)
+  ```
+
+  where:
+
+  * `concentration1 = alpha`,
+  * `concentration0 = beta`,
+
+  Distribution parameters are automatically broadcast in all functions; see
+  examples for details.
+
+  #### Examples
+
+  ```python
+  # Create a batch of three Kumaraswamy distributions.
+  alpha = [1, 2, 3]
+  beta = [1, 2, 3]
+  dist = Kumaraswamy(alpha, beta)
+
+  dist.sample([4, 5])  # Shape [4, 5, 3]
+
+  # `x` has three batch entries, each with two samples.
+  x = [[.1, .4, .5],
+       [.2, .3, .5]]
+  # Calculate the probability of each pair of samples under the corresponding
+  # distribution in `dist`.
+  dist.prob(x)         # Shape [2, 3]
+  ```
+
+  ```python
+  # Create batch_shape=[2, 3] via parameter broadcast:
+  alpha = [[1.], [2]]      # Shape [2, 1]
+  beta = [3., 4, 5]        # Shape [3]
+  dist = Kumaraswamy(alpha, beta)
+
+  # alpha broadcast as: [[1., 1, 1,],
+  #                      [2, 2, 2]]
+  # beta broadcast as:  [[3., 4, 5],
+  #                      [3, 4, 5]]
+  # batch_Shape [2, 3]
+  dist.sample([4, 5])  # Shape [4, 5, 2, 3]
+
+  x = [.2, .3, .5]
+  # x will be broadcast as [[.2, .3, .5],
+  #                         [.2, .3, .5]],
+  # thus matching batch_shape [2, 3].
+  dist.prob(x)         # Shape [2, 3]
+  ```
+
+  """
+
+  def __init__(self,
+               concentration1=None,
+               concentration0=None,
+               validate_args=False,
+               allow_nan_stats=True,
+               name="Kumaraswamy"):
+    """Initialize a batch of Kumaraswamy distributions.
+
+    Args:
+      concentration1: Positive floating-point `Tensor` indicating mean
+        number of successes; aka "alpha". Implies `self.dtype` and
+        `self.batch_shape`, i.e.,
+        `concentration1.shape = [N1, N2, ..., Nm] = self.batch_shape`.
+      concentration0: Positive floating-point `Tensor` indicating mean
+        number of failures; aka "beta". Otherwise has same semantics as
+        `concentration1`.
+      validate_args: Python `bool`, default `False`. When `True` distribution
+        parameters are checked for validity despite possibly degrading runtime
+        performance. When `False` invalid inputs may silently render incorrect
+        outputs.
+      allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
+        (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
+        result is undefined. When `False`, an exception is raised if one or
+        more of the statistic's batch members are undefined.
+      name: Python `str` name prefixed to Ops created by this class.
+    """
+    super(Kumaraswamy, self).__init__(
+        concentration1=concentration1,
+        concentration0=concentration0,
+        validate_args=validate_args,
+        allow_nan_stats=allow_nan_stats,
+        name=name)
+    self._reparameterization_type = distribution.FULLY_REPARAMETERIZED
+
+  def _sample_n(self, n, seed=None):
+    expanded_concentration1 = array_ops.ones_like(
+        self.total_concentration, dtype=self.dtype) * self.concentration1
+    expanded_concentration0 = array_ops.ones_like(
+        self.total_concentration, dtype=self.dtype) * self.concentration0
+    shape = array_ops.concat([[n], self.batch_shape_tensor()], 0)
+    uniform_sample = random_ops.random_uniform(
+        shape=shape, minval=0.0, maxval=1.0, dtype=self.dtype, seed=seed)
+
+    kumaraswamy_sample = (1 - uniform_sample**(1. / expanded_concentration0))**(
+        1. / expanded_concentration1)
+    return kumaraswamy_sample
+
+  @distribution_util.AppendDocstring(_kumaraswamy_sample_note)
+  def _log_cdf(self, x):
+    a = self.concentration1
+    b = self.concentration0
+    return math_ops.log1p(-(1 - x**a)**b)
+
+  @distribution_util.AppendDocstring(_kumaraswamy_sample_note)
+  def _cdf(self, x):
+    a = self.concentration1
+    b = self.concentration0
+    return 1 - (1 - x**a)**b
+
+  def _survival_function(self, x):
+    a = self.concentration1
+    b = self.concentration0
+    return (1 - x**a)**b
+
+  def _log_survival_function(self, x):
+    a = self.concentration1
+    b = self.concentration0
+    return b * math_ops.log1p(-x**a)
+
+  def _log_unnormalized_prob(self, x):
+    x = self._maybe_assert_valid_sample(x)
+    a = self.concentration1
+    b = self.concentration0
+    return (a - 1) * math_ops.log(x) + (b - 1) * math_ops.log1p(-x**a)
+
+  def _log_normalization(self):
+    a = self.concentration1
+    b = self.concentration0
+    return -(math_ops.log(a) + math_ops.log(b))
+
+  def _entropy(self):
+    a = self.concentration1
+    b = self.concentration0
+    return (1 - 1. / a) + (
+        1 - 1. / b) * _harmonic_number(b) + math_ops.log(a) + math_ops.log(b)
+
+  def _moment(self, n):
+    """Compute the n'th (uncentered) moment."""
+    expanded_concentration1 = array_ops.ones_like(
+        self.total_concentration, dtype=self.dtype) * self.concentration1
+    expanded_concentration0 = array_ops.ones_like(
+        self.total_concentration, dtype=self.dtype) * self.concentration0
+    beta_arg0 = 1 + n / expanded_concentration1
+    beta_arg = array_ops.stack([beta_arg0, expanded_concentration0], -1)
+    log_moment = math_ops.log(expanded_concentration0) + special_math_ops.lbeta(
+        beta_arg)
+    return math_ops.exp(log_moment)
+
+  def _mean(self):
+    return self._moment(1)
+
+  def _variance(self):
+    # TODO(b/72696533): Investigate a more numerically stable version.
+    return self._moment(2) - math_ops.square(self._moment(1))
+
+  @distribution_util.AppendDocstring(
+      """Note: The mode is undefined when `concentration1 <= 1` or
+      `concentration0 <= 1`. If `self.allow_nan_stats` is `True`, `NaN`
+      is used for undefined modes. If `self.allow_nan_stats` is `False` an
+      exception is raised when one or more modes are undefined.""")
+  def _mode(self):
+    a = self.concentration1
+    b = self.concentration0
+    mode = ((a - 1) / (a * b - 1))**(1. / a)
+    if self.allow_nan_stats:
+      nan = array_ops.fill(
+          self.batch_shape_tensor(),
+          np.array(np.nan, dtype=self.dtype.as_numpy_dtype),
+          name="nan")
+      is_defined = (self.concentration1 > 1.) & (self.concentration0 > 1.)
+      return array_ops.where(is_defined, mode, nan)
+    return control_flow_ops.with_dependencies([
+        check_ops.assert_less(
+            array_ops.ones([], dtype=self.dtype),
+            self.concentration1,
+            message="Mode undefined for concentration1 <= 1."),
+        check_ops.assert_less(
+            array_ops.ones([], dtype=self.dtype),
+            self.concentration0,
+            message="Mode undefined for concentration0 <= 1.")
+    ], mode)
diff --git a/tensorflow/contrib/distributions/python/ops/logistic.py b/tensorflow/contrib/distributions/python/ops/logistic.py
index 48794a48828fe796e233e968d8c755136ce166ad..473677f8d91b184e029f345bb05f5c5d63df7a40 100644
--- a/tensorflow/contrib/distributions/python/ops/logistic.py
+++ b/tensorflow/contrib/distributions/python/ops/logistic.py
@@ -60,15 +60,17 @@ class Logistic(distribution.Distribution):
   Examples of initialization of one or a batch of distributions.
 
   ```python
+  tfd = tf.contrib.distributions
+
   # Define a single scalar Logistic distribution.
-  dist = tf.contrib.distributions.Logistic(loc=0., scale=3.)
+  dist = tfd.Logistic(loc=0., scale=3.)
 
   # Evaluate the cdf at 1, returning a scalar.
   dist.cdf(1.)
 
   # Define a batch of two scalar valued Logistics.
   # The first has mean 1 and scale 11, the second 2 and 22.
-  dist = tf.contrib.distributions.Logistic(loc=[1, 2.], scale=[11, 22.])
+  dist = tfd.Logistic(loc=[1, 2.], scale=[11, 22.])
 
   # Evaluate the pdf of the first distribution on 0, and the second on 1.5,
   # returning a length two tensor.
@@ -76,14 +78,11 @@ class Logistic(distribution.Distribution):
 
   # Get 3 samples, returning a 3 x 2 tensor.
   dist.sample([3])
-  ```
 
-  Arguments are broadcast when possible.
-
-  ```python
+  # Arguments are broadcast when possible.
   # Define a batch of two scalar valued Logistics.
   # Both have mean 1, but different scales.
-  dist = tf.contrib.distributions.Logistic(loc=1., scale=[11, 22.])
+  dist = tfd.Logistic(loc=1., scale=[11, 22.])
 
   # Evaluate the pdf of both distributions on the same point, 3.0,
   # returning a length 2 tensor.
diff --git a/tensorflow/contrib/distributions/python/ops/mixture.py b/tensorflow/contrib/distributions/python/ops/mixture.py
index e676931d9145e72907d990148ee2d180e0da0258..cef6a143fc615901315a3780bf4ed53b8c7cd177 100644
--- a/tensorflow/contrib/distributions/python/ops/mixture.py
+++ b/tensorflow/contrib/distributions/python/ops/mixture.py
@@ -49,13 +49,13 @@ class Mixture(distribution.Distribution):
 
   ```python
   # Create a mixture of two Gaussians:
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
   mix = 0.3
-  bimix_gauss = ds.Mixture(
-    cat=ds.Categorical(probs=[mix, 1.-mix]),
+  bimix_gauss = tfd.Mixture(
+    cat=tfd.Categorical(probs=[mix, 1.-mix]),
     components=[
-      ds.Normal(loc=-1., scale=0.1),
-      ds.Normal(loc=+1., scale=0.5),
+      tfd.Normal(loc=-1., scale=0.1),
+      tfd.Normal(loc=+1., scale=0.5),
   ])
 
   # Plot the PDF.
@@ -71,6 +71,7 @@ class Mixture(distribution.Distribution):
                components,
                validate_args=False,
                allow_nan_stats=True,
+               use_static_graph=False,
                name="Mixture"):
     """Initialize a Mixture distribution.
 
@@ -96,6 +97,11 @@ class Mixture(distribution.Distribution):
        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
         batch member. If `True`, batch members with valid parameters leading to
         undefined statistics will return NaN for this statistic.
+      use_static_graph: Calls to `sample` will not rely on dynamic tensor
+        indexing, allowing for some static graph compilation optimizations, but
+        at the expense of sampling all underlying distributions in the mixture.
+        (Possibly useful when running on TPUs).
+        Default value: `False` (i.e., use dynamic indexing).
       name: A name for this distribution (optional).
 
     Raises:
@@ -178,6 +184,10 @@ class Mixture(distribution.Distribution):
       self._static_event_shape = static_event_shape
       self._static_batch_shape = static_batch_shape
 
+      self._use_static_graph = use_static_graph
+      if use_static_graph and static_num_components is None:
+        raise ValueError("Number of categories must be known statically when "
+                         "`static_sample=True`.")
     # We let the Mixture distribution access _graph_parents since its arguably
     # more like a baseclass.
     graph_parents = self._cat._graph_parents  # pylint: disable=protected-access
@@ -292,6 +302,31 @@ class Mixture(distribution.Distribution):
       return mixture_log_cdf
 
   def _sample_n(self, n, seed=None):
+    if self._use_static_graph:
+      # This sampling approach is almost the same as the approach used by
+      # `MixtureSameFamily`. The differences are due to having a list of
+      # `Distribution` objects rather than a single object, and maintaining
+      # random seed management that is consistent with the non-static code path.
+      samples = []
+      cat_samples = self.cat.sample(n, seed=seed)
+      for c in range(self.num_components):
+        seed = distribution_util.gen_new_seed(seed, "mixture")
+        samples.append(self.components[c].sample(n, seed=seed))
+      x = array_ops.stack(
+          samples, -self._static_event_shape.ndims - 1)     # [n, B, k, E]
+      npdt = x.dtype.as_numpy_dtype
+      mask = array_ops.one_hot(
+          indices=cat_samples,                              # [n, B]
+          depth=self._num_components,                       # == k
+          on_value=np.ones([], dtype=npdt),
+          off_value=np.zeros([], dtype=npdt))               # [n, B, k]
+      mask = distribution_utils.pad_mixture_dimensions(
+          mask, self, self._cat,
+          self._static_event_shape.ndims)                   # [n, B, k, [1]*e]
+      return math_ops.reduce_sum(
+          x * mask,
+          axis=-1 - self._static_event_shape.ndims)         # [n, B, E]
+
     with ops.control_dependencies(self._assertions):
       n = ops.convert_to_tensor(n, name="n")
       static_n = tensor_util.constant_value(n)
diff --git a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
index 5558ef0f255db684b229d129666634e50c625887..b93bdc5ab4010663baddda1410b302644853648b 100644
--- a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
+++ b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.framework import dtypes
+from tensorflow.contrib.distributions.python.ops import distribution_util as distribution_utils
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -43,15 +43,14 @@ class MixtureSameFamily(distribution.Distribution):
   #### Examples
 
   ```python
-  import matplotlib.pyplot as plt
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   ### Create a mixture of two scalar Gaussians:
 
-  gm = ds.MixtureSameFamily(
-      mixture_distribution=ds.Categorical(
+  gm = tfd.MixtureSameFamily(
+      mixture_distribution=tfd.Categorical(
           probs=[0.3, 0.7]),
-      components_distribution=ds.Normal(
+      components_distribution=tfd.Normal(
         loc=[-1., 1],       # One for each component.
         scale=[0.1, 0.5]))  # And same here.
 
@@ -63,14 +62,15 @@ class MixtureSameFamily(distribution.Distribution):
 
   # Plot PDF.
   x = np.linspace(-2., 3., int(1e4), dtype=np.float32)
+  import matplotlib.pyplot as plt
   plt.plot(x, gm.prob(x).eval());
 
   ### Create a mixture of two Bivariate Gaussians:
 
-  gm = ds.MixtureSameFamily(
-      mixture_distribution=ds.Categorical(
+  gm = tfd.MixtureSameFamily(
+      mixture_distribution=tfd.Categorical(
           probs=[0.3, 0.7]),
-      components_distribution=ds.MultivariateNormalDiag(
+      components_distribution=tfd.MultivariateNormalDiag(
           loc=[[-1., 1],  # component 1
                [1, -1]],  # component 2
           scale_identity_multiplier=[.3, .6]))
@@ -239,7 +239,9 @@ class MixtureSameFamily(distribution.Distribution):
           depth=self._num_components,                        # == k
           on_value=np.ones([], dtype=npdt),
           off_value=np.zeros([], dtype=npdt))                # [n, B, k]
-      mask = self._pad_mix_dims(mask)                        # [n, B, k, [1]*e]
+      mask = distribution_utils.pad_mixture_dimensions(
+          mask, self, self.mixture_distribution,
+          self._event_shape().ndims)                         # [n, B, k, [1]*e]
       return math_ops.reduce_sum(
           x * mask, axis=-1 - self._event_ndims)             # [n, B, E]
 
@@ -248,14 +250,15 @@ class MixtureSameFamily(distribution.Distribution):
       x = self._pad_sample_dims(x)
       log_prob_x = self.components_distribution.log_prob(x)  # [S, B, k]
       log_mix_prob = nn_ops.log_softmax(
-          self.mixture_distribution.logits, dim=-1)          # [B, k]
+          self.mixture_distribution.logits, axis=-1)         # [B, k]
       return math_ops.reduce_logsumexp(
           log_prob_x + log_mix_prob, axis=-1)                # [S, B]
 
   def _mean(self):
     with ops.control_dependencies(self._runtime_assertions):
-      probs = self._pad_mix_dims(
-          self.mixture_distribution.probs)                   # [B, k, [1]*e]
+      probs = distribution_utils.pad_mixture_dimensions(
+          self.mixture_distribution.probs, self, self.mixture_distribution,
+          self._event_shape().ndims)                         # [B, k, [1]*e]
       return math_ops.reduce_sum(
           probs * self.components_distribution.mean(),
           axis=-1 - self._event_ndims)                       # [B, E]
@@ -264,15 +267,16 @@ class MixtureSameFamily(distribution.Distribution):
     x = self._pad_sample_dims(x)
     log_cdf_x = self.components_distribution.log_cdf(x)      # [S, B, k]
     log_mix_prob = nn_ops.log_softmax(
-        self.mixture_distribution.logits, dim=-1)            # [B, k]
+        self.mixture_distribution.logits, axis=-1)           # [B, k]
     return math_ops.reduce_logsumexp(
         log_cdf_x + log_mix_prob, axis=-1)                   # [S, B]
 
   def _variance(self):
     with ops.control_dependencies(self._runtime_assertions):
       # Law of total variance: Var(Y) = E[Var(Y|X)] + Var(E[Y|X])
-      probs = self._pad_mix_dims(
-          self.mixture_distribution.probs)                   # [B, k, [1]*e]
+      probs = distribution_utils.pad_mixture_dimensions(
+          self.mixture_distribution.probs, self, self.mixture_distribution,
+          self._event_shape().ndims)                         # [B, k, [1]*e]
       mean_cond_var = math_ops.reduce_sum(
           probs * self.components_distribution.variance(),
           axis=-1 - self._event_ndims)                       # [B, E]
@@ -291,8 +295,12 @@ class MixtureSameFamily(distribution.Distribution):
 
     with ops.control_dependencies(self._runtime_assertions):
       # Law of total variance: Var(Y) = E[Var(Y|X)] + Var(E[Y|X])
-      probs = self._pad_mix_dims(self._pad_mix_dims(
-          self.mixture_distribution.probs))                  # [B, k, 1, 1]
+      probs = distribution_utils.pad_mixture_dimensions(
+          distribution_utils.pad_mixture_dimensions(
+              self.mixture_distribution.probs, self, self.mixture_distribution,
+              self._event_shape().ndims),
+          self, self.mixture_distribution,
+          self._event_shape().ndims)                         # [B, k, 1, 1]
       mean_cond_var = math_ops.reduce_sum(
           probs * self.components_distribution.covariance(),
           axis=-3)                                           # [B, e, e]
@@ -312,26 +320,6 @@ class MixtureSameFamily(distribution.Distribution):
           shape[:d], [1], shape[d:]], axis=0))
       return x
 
-  def _pad_mix_dims(self, x):
-    with ops.name_scope("pad_mix_dims", values=[x]):
-      def _get_ndims(d):
-        if d.batch_shape.ndims is not None:
-          return d.batch_shape.ndims
-        return array_ops.shape(d.batch_shape_tensor())[0]
-      dist_batch_ndims = _get_ndims(self)
-      cat_batch_ndims = _get_ndims(self.mixture_distribution)
-      bnd = distribution_util.pick_vector(
-          self.mixture_distribution.is_scalar_batch(),
-          [dist_batch_ndims], [cat_batch_ndims])[0]
-      s = array_ops.shape(x)
-      x = array_ops.reshape(x, shape=array_ops.concat([
-          s[:-1],
-          array_ops.ones([bnd], dtype=dtypes.int32),
-          s[-1:],
-          array_ops.ones([self._event_ndims], dtype=dtypes.int32),
-      ], axis=0))
-      return x
-
 
 def _outer_squared_difference(x, y):
   """Convenience function analogous to tf.squared_difference."""
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_diag.py b/tensorflow/contrib/distributions/python/ops/mvn_diag.py
index 163cf75d990d5fe7ec1e3aaf0040fc71f61774a7..e862552880f4073c8fa8e90134d0633e7484b0bf 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_diag.py
@@ -84,10 +84,10 @@ class MultivariateNormalDiag(
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   # Initialize a single 2-variate Gaussian.
-  mvn = ds.MultivariateNormalDiag(
+  mvn = tfd.MultivariateNormalDiag(
       loc=[1., -1],
       scale_diag=[1, 2.])
 
@@ -101,7 +101,7 @@ class MultivariateNormalDiag(
   mvn.prob([-1., 0]).eval()  # shape: []
 
   # Initialize a 3-batch, 2-variate scaled-identity Gaussian.
-  mvn = ds.MultivariateNormalDiag(
+  mvn = tfd.MultivariateNormalDiag(
       loc=[1., -1],
       scale_identity_multiplier=[1, 2., 3])
 
@@ -119,7 +119,7 @@ class MultivariateNormalDiag(
   mvn.prob([-1., 0]).eval()  # shape: [3]
 
   # Initialize a 2-batch of 3-variate Gaussians.
-  mvn = ds.MultivariateNormalDiag(
+  mvn = tfd.MultivariateNormalDiag(
       loc=[[1., 2, 3],
            [11, 22, 33]]           # shape: [2, 3]
       scale_diag=[[1., 2, 3],
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py b/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
index 040bc230722194316b8a74627344e315a2578281..413e88f03ae0286c294f3404549a73e1a47dcff7 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
@@ -86,7 +86,7 @@ class MultivariateNormalDiagPlusLowRank(
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   # Initialize a single 3-variate Gaussian with covariance `cov = S @ S.T`,
   # `S = diag(d) + U @ diag(m) @ U.T`. The perturbation, `U @ diag(m) @ U.T`, is
@@ -97,7 +97,7 @@ class MultivariateNormalDiagPlusLowRank(
        [-1, 1],
        [2, -0.5]]        # shape: [3, 2]
   m = [4., 5]            # shape: [2]
-  mvn = ds.MultivariateNormalDiagPlusLowRank(
+  mvn = tfd.MultivariateNormalDiagPlusLowRank(
       loc=mu
       scale_diag=d
       scale_perturb_factor=U,
@@ -118,7 +118,7 @@ class MultivariateNormalDiagPlusLowRank(
   m = [[0.1, 0.2],
        [0.4, 0.5]]         # shape: [b, r] = [2, 2]
 
-  mvn = ds.MultivariateNormalDiagPlusLowRank(
+  mvn = tfd.MultivariateNormalDiagPlusLowRank(
       loc=mu,
       scale_perturb_factor=U,
       scale_perturb_diag=m)
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py b/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
index f9952b2069d6dfd2593e6bd71ede0badf44cdf98..4bea99fbb75349f97fde473cb5716fe6c426ce90 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
@@ -73,14 +73,14 @@ class MultivariateNormalFullCovariance(mvn_tril.MultivariateNormalTriL):
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   # Initialize a single 3-variate Gaussian.
   mu = [1., 2, 3]
   cov = [[ 0.36,  0.12,  0.06],
          [ 0.12,  0.29, -0.13],
          [ 0.06, -0.13,  0.26]]
-  mvn = ds.MultivariateNormalFullCovariance(
+  mvn = tfd.MultivariateNormalFullCovariance(
       loc=mu,
       covariance_matrix=cov)
 
@@ -100,7 +100,7 @@ class MultivariateNormalFullCovariance(mvn_tril.MultivariateNormalTriL):
   mu = [[1., 2, 3],
         [11, 22, 33]]              # shape: [2, 3]
   covariance_matrix = ...  # shape: [2, 3, 3], symmetric, positive definite.
-  mvn = ds.MultivariateNormalFullCovariance(
+  mvn = tfd.MultivariateNormalFullCovariance(
       loc=mu,
       covariance=covariance_matrix)
 
@@ -167,12 +167,11 @@ class MultivariateNormalFullCovariance(mvn_tril.MultivariateNormalTriL):
           covariance_matrix = ops.convert_to_tensor(
               covariance_matrix, name="covariance_matrix")
           if validate_args:
-            assert_symmetric = check_ops.assert_equal(
-                covariance_matrix,
-                array_ops.matrix_transpose(covariance_matrix),
-                message="Matrix was not symmetric.")
-            covariance_matrix = control_flow_ops.with_dependencies(
-                [assert_symmetric], covariance_matrix)
+            covariance_matrix = control_flow_ops.with_dependencies([
+                check_ops.assert_near(
+                    covariance_matrix,
+                    array_ops.matrix_transpose(covariance_matrix),
+                    message="Matrix was not symmetric")], covariance_matrix)
           # No need to validate that covariance_matrix is non-singular.
           # LinearOperatorLowerTriangular has an assert_non_singular method that
           # is called by the Bijector.
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
index 300bdd5f6064a1cc9c336689ac4fae04338edb30..a7399792892f4c179c05168184d76ec95c168b51 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
@@ -90,8 +90,7 @@ class MultivariateNormalLinearOperator(
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
-  la = tf.linalg
+  tfd = tf.contrib.distributions
 
   # Initialize a single 3-variate Gaussian.
   mu = [1., 2, 3]
@@ -103,9 +102,9 @@ class MultivariateNormalLinearOperator(
   #      [ 0.2,  0.5,  0. ],
   #      [ 0.1, -0.3,  0.4]])
 
-  mvn = ds.MultivariateNormalLinearOperator(
+  mvn = tfd.MultivariateNormalLinearOperator(
       loc=mu,
-      scale=la.LinearOperatorLowerTriangular(scale))
+      scale=tf.linalg.LinearOperatorLowerTriangular(scale))
 
   # Covariance agrees with cholesky(cov) parameterization.
   mvn.covariance().eval()
@@ -122,9 +121,9 @@ class MultivariateNormalLinearOperator(
   scale_diag = [[1., 2, 3],
                 [0.5, 1, 1.5]]     # shape: [2, 3]
 
-  mvn = ds.MultivariateNormalLinearOperator(
+  mvn = tfd.MultivariateNormalLinearOperator(
       loc=mu,
-      scale=la.LinearOperatorDiag(scale_diag))
+      scale=tf.linalg.LinearOperatorDiag(scale_diag))
 
   # Compute the pdf of two `R^3` observations; return a length-2 vector.
   x = [[-0.9, 0, 0.1],
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_tril.py b/tensorflow/contrib/distributions/python/ops/mvn_tril.py
index 260dcc18f513d5440d3d39368539274c03faa72a..6c7dc4ca7aaf5b3a20b072e9360d15528ad10556 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_tril.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_tril.py
@@ -76,12 +76,13 @@ class MultivariateNormalTriL(
   ```
 
   Trainable (batch) lower-triangular matrices can be created with
-  `ds.matrix_diag_transform()` and/or `ds.fill_triangular()`
+  `tf.contrib.distributions.matrix_diag_transform()` and/or
+  `tf.contrib.distributions.fill_triangular()`
 
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   # Initialize a single 3-variate Gaussian.
   mu = [1., 2, 3]
@@ -92,7 +93,7 @@ class MultivariateNormalTriL(
   # ==> [[ 0.6,  0. ,  0. ],
   #      [ 0.2,  0.5,  0. ],
   #      [ 0.1, -0.3,  0.4]])
-  mvn = ds.MultivariateNormalTriL(
+  mvn = tfd.MultivariateNormalTriL(
       loc=mu,
       scale_tril=scale)
 
@@ -112,7 +113,7 @@ class MultivariateNormalTriL(
   mu = [[1., 2, 3],
         [11, 22, 33]]              # shape: [2, 3]
   tril = ...  # shape: [2, 3, 3], lower triangular, non-zero diagonal.
-  mvn = ds.MultivariateNormalTriL(
+  mvn = tfd.MultivariateNormalTriL(
       loc=mu,
       scale_tril=tril)
 
@@ -124,9 +125,9 @@ class MultivariateNormalTriL(
   # Instantiate a "learnable" MVN.
   dims = 4
   with tf.variable_scope("model"):
-    mvn = ds.MultivariateNormalTriL(
+    mvn = tfd.MultivariateNormalTriL(
         loc=tf.get_variable(shape=[dims], dtype=tf.float32, name="mu"),
-        scale_tril=ds.fill_triangular(
+        scale_tril=tfd.fill_triangular(
             tf.get_variable(shape=[dims * (dims + 1) / 2],
                             dtype=tf.float32, name="chol_Sigma")))
   ```
diff --git a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
index e1118ed4312ca2ed678a05a298110e2669d0a27e..92f2bba1828696248c9d9460566a08ba372c3358 100644
--- a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
+++ b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
@@ -22,21 +22,135 @@ import numpy as np
 
 from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.contrib.distributions.python.ops import poisson as poisson_lib
+from tensorflow.contrib.distributions.python.ops.bijectors.exp import Exp
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import categorical as categorical_lib
 from tensorflow.python.ops.distributions import distribution as distribution_lib
+from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.ops.distributions import transformed_distribution as transformed_lib
 
 
 __all__ = [
     "PoissonLogNormalQuadratureCompound",
+    "quadrature_scheme_lognormal_gauss_hermite",
+    "quadrature_scheme_lognormal_quantiles",
 ]
 
 
+def quadrature_scheme_lognormal_gauss_hermite(
+    loc, scale, quadrature_size,
+    validate_args=False, name=None):  # pylint: disable=unused-argument
+  """Use Gauss-Hermite quadrature to form quadrature on positive-reals.
+
+  Note: for a given `quadrature_size`, this method is generally less accurate
+  than `quadrature_scheme_lognormal_quantiles`.
+
+  Args:
+    loc: `float`-like (batch of) scalar `Tensor`; the location parameter of
+      the LogNormal prior.
+    scale: `float`-like (batch of) scalar `Tensor`; the scale parameter of
+      the LogNormal prior.
+    quadrature_size: Python `int` scalar representing the number of quadrature
+      points.
+    validate_args: Python `bool`, default `False`. When `True` distribution
+      parameters are checked for validity despite possibly degrading runtime
+      performance. When `False` invalid inputs may silently render incorrect
+      outputs.
+    name: Python `str` name prefixed to Ops created by this class.
+
+  Returns:
+    grid: (Batch of) length-`quadrature_size` vectors representing the
+      `log_rate` parameters of a `Poisson`.
+    probs: (Batch of) length-`quadrature_size` vectors representing the
+      weight associate with each `grid` value.
+  """
+  with ops.name_scope(name, "vector_diffeomixture_quadrature_gauss_hermite",
+                      [loc, scale]):
+    grid, probs = np.polynomial.hermite.hermgauss(deg=quadrature_size)
+    grid = grid.astype(loc.dtype.as_numpy_dtype)
+    probs = probs.astype(loc.dtype.as_numpy_dtype)
+    probs /= np.linalg.norm(probs, ord=1, keepdims=True)
+    probs = ops.convert_to_tensor(probs, name="probs", dtype=loc.dtype)
+    # The following maps the broadcast of `loc` and `scale` to each grid
+    # point, i.e., we are creating several log-rates that correspond to the
+    # different Gauss-Hermite quadrature points and (possible) batches of
+    # `loc` and `scale`.
+    grid = (loc[..., array_ops.newaxis]
+            + np.sqrt(2.) * scale[..., array_ops.newaxis] * grid)
+    return grid, probs
+
+
+def quadrature_scheme_lognormal_quantiles(
+    loc, scale, quadrature_size,
+    validate_args=False, name=None):
+  """Use LogNormal quantiles to form quadrature on positive-reals.
+
+  Args:
+    loc: `float`-like (batch of) scalar `Tensor`; the location parameter of
+      the LogNormal prior.
+    scale: `float`-like (batch of) scalar `Tensor`; the scale parameter of
+      the LogNormal prior.
+    quadrature_size: Python `int` scalar representing the number of quadrature
+      points.
+    validate_args: Python `bool`, default `False`. When `True` distribution
+      parameters are checked for validity despite possibly degrading runtime
+      performance. When `False` invalid inputs may silently render incorrect
+      outputs.
+    name: Python `str` name prefixed to Ops created by this class.
+
+  Returns:
+    grid: (Batch of) length-`quadrature_size` vectors representing the
+      `log_rate` parameters of a `Poisson`.
+    probs: (Batch of) length-`quadrature_size` vectors representing the
+      weight associate with each `grid` value.
+  """
+  with ops.name_scope(name, "quadrature_scheme_lognormal_quantiles",
+                      [loc, scale]):
+    # Create a LogNormal distribution.
+    dist = transformed_lib.TransformedDistribution(
+        distribution=normal_lib.Normal(loc=loc, scale=scale),
+        bijector=Exp(event_ndims=0),
+        validate_args=validate_args)
+    batch_ndims = dist.batch_shape.ndims
+    if batch_ndims is None:
+      batch_ndims = array_ops.shape(dist.batch_shape_tensor())[0]
+
+    def _compute_quantiles():
+      """Helper to build quantiles."""
+      # Omit {0, 1} since they might lead to Inf/NaN.
+      zero = array_ops.zeros([], dtype=dist.dtype)
+      edges = math_ops.linspace(zero, 1., quadrature_size + 3)[1:-1]
+      # Expand edges so its broadcast across batch dims.
+      edges = array_ops.reshape(edges, shape=array_ops.concat([
+          [-1], array_ops.ones([batch_ndims], dtype=dtypes.int32)], axis=0))
+      quantiles = dist.quantile(edges)
+      # Cyclically permute left by one.
+      perm = array_ops.concat([
+          math_ops.range(1, 1 + batch_ndims), [0]], axis=0)
+      quantiles = array_ops.transpose(quantiles, perm)
+      return quantiles
+    quantiles = _compute_quantiles()
+
+    # Compute grid as quantile midpoints.
+    grid = (quantiles[..., :-1] + quantiles[..., 1:]) / 2.
+    # Set shape hints.
+    grid.set_shape(dist.batch_shape.concatenate([quadrature_size]))
+
+    # By construction probs is constant, i.e., `1 / quadrature_size`. This is
+    # important, because non-constant probs leads to non-reparameterizable
+    # samples.
+    probs = array_ops.fill(
+        dims=[quadrature_size],
+        value=1. / math_ops.cast(quadrature_size, dist.dtype))
+
+    return grid, probs
+
+
 class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
   """`PoissonLogNormalQuadratureCompound` distribution.
 
@@ -47,30 +161,18 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
   ```none
   p(k|loc, scale)
   = int_{R_+} dl LogNormal(l | loc, scale) Poisson(k | l)
-  = int_{R} dz ((lambda(z) sqrt(2) scale)
-                * exp(-z**2) / (lambda(z) sqrt(2 pi) sigma)
-                * Poisson(k | lambda(z)))
-  = int_{R} dz exp(-z**2) / sqrt(pi) Poisson(k | lambda(z))
   approx= sum{ prob[d] Poisson(k | lambda(grid[d])) : d=0, ..., deg-1 }
   ```
 
-  where `lambda(z) = exp(sqrt(2) scale z + loc)` and the `prob,grid` terms
-  are from [numerical quadrature](
-  https://en.wikipedia.org/wiki/Numerical_integration) (default:
-  [Gauss--Hermite quadrature](
-  https://en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature)). Note that
-  the second line made the substitution:
-  `z(l) = (log(l) - loc) / (sqrt(2) scale)` which implies `lambda(z)` [above]
-  and `dl = sqrt(2) scale lambda(z) dz`
+  By default, the `grid` is chosen as quantiles of the `LogNormal` distribution
+  parameterized by `loc`, `scale` and the `prob` vector is
+  `[1. / quadrature_size]*quadrature_size`.
 
   In the non-approximation case, a draw from the LogNormal prior represents the
   Poisson rate parameter. Unfortunately, the non-approximate distribution lacks
   an analytical probability density function (pdf). Therefore the
   `PoissonLogNormalQuadratureCompound` class implements an approximation based
-  on [numerical quadrature](
-  https://en.wikipedia.org/wiki/Numerical_integration) (default:
-  [Gauss--Hermite quadrature](
-  https://en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature)).
+  on [quadrature](https://en.wikipedia.org/wiki/Numerical_integration).
 
   Note: although the `PoissonLogNormalQuadratureCompound` is approximately the
   Poisson-LogNormal compound distribution, it is itself a valid distribution.
@@ -84,10 +186,8 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
   https://en.wikipedia.org/wiki/Compound_probability_distribution). Using
   variable-substitution and [numerical quadrature](
   https://en.wikipedia.org/wiki/Numerical_integration) (default:
-  [Gauss--Hermite quadrature](
-  https://en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature)) we can
-  redefine the distribution to be a parameter-less convex combination of `deg`
-  different Poisson samples.
+  based on `LogNormal` quantiles) we can redefine the distribution to be a
+  parameter-less convex combination of `deg` different Poisson samples.
 
   That is, defined over positive integers, this distribution is parameterized
   by a (batch of) `loc` and `scale` scalars.
@@ -96,46 +196,51 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
 
   ```none
   pdf(k | loc, scale, deg)
-    = sum{ prob[d] Poisson(k | lambda=exp(sqrt(2) scale grid[d] + loc))
+    = sum{ prob[d] Poisson(k | lambda=exp(grid[d]))
           : d=0, ..., deg-1 }
   ```
 
-  where, [e.g., `grid, w = numpy.polynomial.hermite.hermgauss(deg)`](
-  https://docs.scipy.org/doc/numpy-1.10.0/reference/generated/numpy.polynomial.hermite.hermgauss.html)
-  and `prob = w / sqrt(pi)`.
-
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
+
   # Create two batches of PoissonLogNormalQuadratureCompounds, one with
   # prior `loc = 0.` and another with `loc = 1.` In both cases `scale = 1.`
-  pln = ds.PoissonLogNormalQuadratureCompound(
+  pln = tfd.PoissonLogNormalQuadratureCompound(
       loc=[0., -0.5],
       scale=1.,
-      quadrature_grid_and_probs=(
-        np.polynomial.hermite.hermgauss(deg=10)),
+      quadrature_size=10,
       validate_args=True)
   """
 
   def __init__(self,
                loc,
                scale,
-               quadrature_grid_and_probs=None,
+               quadrature_size=8,
+               quadrature_fn=quadrature_scheme_lognormal_quantiles,
                validate_args=False,
                allow_nan_stats=True,
                name="PoissonLogNormalQuadratureCompound"):
-    """Constructs the PoissonLogNormalQuadratureCompound on `R**k`.
+    """Constructs the PoissonLogNormalQuadratureCompound`.
+
+    Note: `probs` returned by (optional) `quadrature_fn` are presumed to be
+    either a length-`quadrature_size` vector or a batch of vectors in 1-to-1
+    correspondence with the returned `grid`. (I.e., broadcasting is only
+    partially supported.)
 
     Args:
       loc: `float`-like (batch of) scalar `Tensor`; the location parameter of
         the LogNormal prior.
       scale: `float`-like (batch of) scalar `Tensor`; the scale parameter of
         the LogNormal prior.
-      quadrature_grid_and_probs: Python pair of `float`-like `Tensor`s
-        representing the sample points and the corresponding (possibly
-        normalized) weight.  When `None`, defaults to:
-        `np.polynomial.hermite.hermgauss(deg=8)`.
+      quadrature_size: Python `int` scalar representing the number of quadrature
+        points.
+      quadrature_fn: Python callable taking `loc`, `scale`,
+        `quadrature_size`, `validate_args` and returning `tuple(grid, probs)`
+        representing the LogNormal grid and corresponding normalized weight.
+        normalized) weight.
+        Default value: `quadrature_scheme_lognormal_quantiles`.
       validate_args: Python `bool`, default `False`. When `True` distribution
         parameters are checked for validity despite possibly degrading runtime
         performance. When `False` invalid inputs may silently render incorrect
@@ -147,47 +252,41 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
       name: Python `str` name prefixed to Ops created by this class.
 
     Raises:
-      TypeError: if `loc.dtype != scale[0].dtype`.
+      TypeError: if `quadrature_grid` and `quadrature_probs` have different base
+        `dtype`.
     """
     parameters = locals()
     with ops.name_scope(name, values=[loc, scale]):
-      loc = ops.convert_to_tensor(loc, name="loc")
-      self._loc = loc
+      if loc is not None:
+        loc = ops.convert_to_tensor(loc, name="loc")
+      if scale is not None:
+        scale = ops.convert_to_tensor(
+            scale, dtype=None if loc is None else loc.dtype, name="scale")
+      self._quadrature_grid, self._quadrature_probs = tuple(quadrature_fn(
+          loc, scale, quadrature_size, validate_args))
+
+      dt = self._quadrature_grid.dtype
+      if dt.base_dtype != self._quadrature_probs.dtype.base_dtype:
+        raise TypeError("Quadrature grid dtype ({}) does not match quadrature "
+                        "probs dtype ({}).".format(
+                            dt.name, self._quadrature_probs.dtype.name))
 
-      scale = ops.convert_to_tensor(scale, name="scale")
-      self._scale = scale
-
-      dtype = loc.dtype.base_dtype
-      if dtype != scale.dtype.base_dtype:
-        raise TypeError(
-            "loc.dtype(\"{}\") does not match scale.dtype(\"{}\")".format(
-                loc.dtype.name, scale.dtype.name))
-
-      grid, probs = distribution_util.process_quadrature_grid_and_probs(
-          quadrature_grid_and_probs, dtype, validate_args)
-      self._quadrature_grid = grid
-      self._quadrature_probs = probs
-      self._quadrature_size = distribution_util.dimension_size(probs, axis=0)
+      self._distribution = poisson_lib.Poisson(
+          log_rate=self._quadrature_grid,
+          validate_args=validate_args,
+          allow_nan_stats=allow_nan_stats)
 
       self._mixture_distribution = categorical_lib.Categorical(
           logits=math_ops.log(self._quadrature_probs),
           validate_args=validate_args,
           allow_nan_stats=allow_nan_stats)
 
-      # The following maps the broadcast of `loc` and `scale` to each grid
-      # point, i.e., we are creating several log-rates that correspond to the
-      # different Gauss-Hermite quadrature points and (possible) batches of
-      # `loc` and `scale`.
-      self._log_rate = (loc[..., array_ops.newaxis]
-                        + np.sqrt(2.) * scale[..., array_ops.newaxis] * grid)
-
-      self._distribution = poisson_lib.Poisson(
-          log_rate=self._log_rate,
-          validate_args=validate_args,
-          allow_nan_stats=allow_nan_stats)
+      self._loc = loc
+      self._scale = scale
+      self._quadrature_size = quadrature_size
 
       super(PoissonLogNormalQuadratureCompound, self).__init__(
-          dtype=dtype,
+          dtype=dt,
           reparameterization_type=distribution_lib.NOT_REPARAMETERIZED,
           validate_args=validate_args,
           allow_nan_stats=allow_nan_stats,
@@ -197,12 +296,12 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
 
   @property
   def mixture_distribution(self):
-    """Distribution which randomly selects a Poisson with Gauss-Hermite rate."""
+    """Distribution which randomly selects a Poisson with quadrature param."""
     return self._mixture_distribution
 
   @property
   def distribution(self):
-    """Base Poisson parameterized by a Gauss-Hermite grid of rates."""
+    """Base Poisson parameterized by a quadrature grid."""
     return self._distribution
 
   @property
@@ -216,24 +315,18 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
     return self._scale
 
   @property
-  def quadrature_grid(self):
-    """Quadrature grid points."""
-    return self._quadrature_grid
-
-  @property
-  def quadrature_probs(self):
-    """Quadrature normalized weights."""
-    return self._quadrature_probs
+  def quadrature_size(self):
+    return self._quadrature_size
 
   def _batch_shape_tensor(self):
     return array_ops.broadcast_dynamic_shape(
-        array_ops.shape(self.loc),
-        array_ops.shape(self.scale))
+        self.distribution.batch_shape_tensor(),
+        array_ops.shape(self.mixture_distribution.logits))[:-1]
 
   def _batch_shape(self):
     return array_ops.broadcast_static_shape(
-        self.loc.shape,
-        self.scale.shape)
+        self.distribution.batch_shape,
+        self.mixture_distribution.logits.shape)[:-1]
 
   def _event_shape(self):
     return tensor_shape.scalar()
@@ -241,18 +334,31 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
   def _sample_n(self, n, seed=None):
     # Get ids as a [n, batch_size]-shaped matrix, unless batch_shape=[] then get
     # ids as a [n]-shaped vector.
-    batch_size = (np.prod(self.batch_shape.as_list(), dtype=np.int32)
-                  if self.batch_shape.is_fully_defined()
-                  else math_ops.reduce_prod(self.batch_shape_tensor()))
+    batch_size = self.batch_shape.num_elements()
+    if batch_size is None:
+      batch_size = math_ops.reduce_prod(self.batch_shape_tensor())
+    # We need to "sample extra" from the mixture distribution if it doesn't
+    # already specify a probs vector for each batch coordinate.
+    # We only support this kind of reduced broadcasting, i.e., there is exactly
+    # one probs vector for all batch dims or one for each.
     ids = self._mixture_distribution.sample(
         sample_shape=concat_vectors(
             [n],
             distribution_util.pick_vector(
-                self.is_scalar_batch(),
-                np.int32([]),
-                [batch_size])),
+                self.mixture_distribution.is_scalar_batch(),
+                [batch_size],
+                np.int32([]))),
         seed=distribution_util.gen_new_seed(
             seed, "poisson_lognormal_quadrature_compound"))
+    # We need to flatten batch dims in case mixture_distribution has its own
+    # batch dims.
+    ids = array_ops.reshape(ids, shape=concat_vectors(
+        [n],
+        distribution_util.pick_vector(
+            self.is_scalar_batch(),
+            np.int32([]),
+            np.int32([-1]))))
+
     # Stride `quadrature_size` for `batch_size` number of times.
     offset = math_ops.range(start=0,
                             limit=batch_size * self._quadrature_size,
@@ -275,7 +381,7 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
   def _mean(self):
     return math_ops.exp(
         math_ops.reduce_logsumexp(
-            self.mixture_distribution.logits + self._log_rate,
+            self.mixture_distribution.logits + self.distribution.log_rate,
             axis=-1))
 
   def _variance(self):
@@ -300,7 +406,7 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
     # Var[E[Z | V]] = sum{ prob[d] (Mean[d] - Mean)**2 : d=0, ..., deg-1 }
     v = array_ops.stack([
         # log(self.distribution.variance()) = log(Var[d]) = log(rate[d])
-        self._log_rate,
+        self.distribution.log_rate,
         # log((Mean[d] - Mean)**2)
         2. * math_ops.log(
             math_ops.abs(self.distribution.mean()
@@ -311,14 +417,9 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
         axis=[-2, -1])
 
 
-def static_value(x):
-  """Returns the static value of a `Tensor` or `None`."""
-  return tensor_util.constant_value(ops.convert_to_tensor(x))
-
-
 def concat_vectors(*args):
   """Concatenates input vectors, statically if possible."""
-  args_ = [static_value(x) for x in args]
+  args_ = [distribution_util.static_value(x) for x in args]
   if any(vec is None for vec in args_):
     return array_ops.concat(args, axis=0)
   return [val for vec in args_ for val in vec]
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
index b6becfa9fc93f189a1a7bf7b2a7af8dc1f2e9720..2aa771a71efe52c8d86d459f090ea8ee137c4487 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
@@ -278,7 +278,7 @@ class ExpRelaxedOneHotCategorical(distribution.Distribution):
                       * math_ops.log(self.temperature))
     # compute the unnormalized density
     log_softmax = nn_ops.log_softmax(logits_2d - x_2d * self._temperature_2d)
-    log_unnorm_prob = math_ops.reduce_sum(log_softmax, [-1], keep_dims=False)
+    log_unnorm_prob = math_ops.reduce_sum(log_softmax, [-1], keepdims=False)
     # combine unnormalized density with normalization constant
     log_prob = log_norm_const + log_unnorm_prob
     # Reshapes log_prob to be consistent with shape of user-supplied logits
diff --git a/tensorflow/contrib/distributions/python/ops/sample_stats.py b/tensorflow/contrib/distributions/python/ops/sample_stats.py
index 2a4b92c72900f79785e7e34b77179d3decbace5b..dfc813361977c159d8d48f9d5b9ff03db5b4acdc 100644
--- a/tensorflow/contrib/distributions/python/ops/sample_stats.py
+++ b/tensorflow/contrib/distributions/python/ops/sample_stats.py
@@ -28,12 +28,190 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import spectral_ops
+from tensorflow.python.ops.distributions import util
 
 __all__ = [
+    "auto_correlation",
     "percentile",
 ]
 
 
+# TODO(langmore) Write separate versions of this for real/complex dtype, taking
+# advantage of optimized real-fft ops.
+def auto_correlation(
+    x,
+    axis=-1,
+    max_lags=None,
+    center=True,
+    normalize=True,
+    name="auto_correlation"):
+  """Auto correlation along one axis.
+
+  Given a `1-D` wide sense stationary (WSS) sequence `X`, the auto correlation
+  `RXX` may be defined as  (with `E` expectation and `Conj` complex conjugate)
+
+  ```
+  RXX[m] := E{ W[m] Conj(W[0]) } = E{ W[0] Conj(W[-m]) },
+  W[n]   := (X[n] - MU) / S,
+  MU     := E{ X[0] },
+  S**2   := E{ (X[0] - MU) Conj(X[0] - MU) }.
+  ```
+
+  This function takes the viewpoint that `x` is (along one axis) a finite
+  sub-sequence of a realization of (WSS) `X`, and then uses `x` to produce an
+  estimate of `RXX[m]` as follows:
+
+  After extending `x` from length `L` to `inf` by zero padding, the auto
+  correlation estimate `rxx[m]` is computed for `m = 0, 1, ..., max_lags` as
+
+  ```
+  rxx[m] := (L - m)**-1 sum_n w[n + m] Conj(w[n]),
+  w[n]   := (x[n] - mu) / s,
+  mu     := L**-1 sum_n x[n],
+  s**2   := L**-1 sum_n (x[n] - mu) Conj(x[n] - mu)
+  ```
+
+  The error in this estimate is proportional to `1 / sqrt(len(x) - m)`, so users
+  often set `max_lags` small enough so that the entire output is meaningful.
+
+  Note that since `mu` is an imperfect estimate of `E{ X[0] }`, and we divide by
+  `len(x) - m` rather than `len(x) - m - 1`, our estimate of auto correlation
+  contains a slight bias, which goes to zero as `len(x) - m --> infinity`.
+
+  Args:
+    x:  `float32` or `complex64` `Tensor`.
+    axis:  Python `int`. The axis number along which to compute correlation.
+      Other dimensions index different batch members.
+    max_lags:  Positive `int` tensor.  The maximum value of `m` to consider
+      (in equation above).  If `max_lags >= x.shape[axis]`, we effectively
+      re-set `max_lags` to `x.shape[axis] - 1`.
+    center:  Python `bool`.  If `False`, do not subtract the mean estimate `mu`
+      from `x[n]` when forming `w[n]`.
+    normalize:  Python `bool`.  If `False`, do not divide by the variance
+      estimate `s**2` when forming `w[n]`.
+    name:  `String` name to prepend to created ops.
+
+  Returns:
+    `rxx`: `Tensor` of same `dtype` as `x`.  `rxx.shape[i] = x.shape[i]` for
+      `i != axis`, and `rxx.shape[axis] = max_lags + 1`.
+
+  Raises:
+    TypeError:  If `x` is not a supported type.
+  """
+  # Implementation details:
+  # Extend length N / 2 1-D array x to length N by zero padding onto the end.
+  # Then, set
+  #   F[x]_k := sum_n x_n exp{-i 2 pi k n / N }.
+  # It is not hard to see that
+  #   F[x]_k Conj(F[x]_k) = F[R]_k, where
+  #   R_m := sum_n x_n Conj(x_{(n - m) mod N}).
+  # One can also check that R_m / (N / 2 - m) is an unbiased estimate of RXX[m].
+
+  # Since F[x] is the DFT of x, this leads us to a zero-padding and FFT/IFFT
+  # based version of estimating RXX.
+  # Note that this is a special case of the Wiener-Khinchin Theorem.
+  with ops.name_scope(name, values=[x]):
+    x = ops.convert_to_tensor(x, name="x")
+
+    # Rotate dimensions of x in order to put axis at the rightmost dim.
+    # FFT op requires this.
+    rank = util.prefer_static_rank(x)
+    if axis < 0:
+      axis = rank + axis
+    shift = rank - 1 - axis
+    # Suppose x.shape[axis] = T, so there are T "time" steps.
+    #   ==> x_rotated.shape = B + [T],
+    # where B is x_rotated's batch shape.
+    x_rotated = util.rotate_transpose(x, shift)
+
+    if center:
+      x_rotated -= math_ops.reduce_mean(x_rotated, axis=-1, keepdims=True)
+
+    # x_len = N / 2 from above explanation.  The length of x along axis.
+    # Get a value for x_len that works in all cases.
+    x_len = util.prefer_static_shape(x_rotated)[-1]
+
+    # TODO(langmore) Investigate whether this zero padding helps or hurts.  At
+    # the moment is is necessary so that all FFT implementations work.
+    # Zero pad to the next power of 2 greater than 2 * x_len, which equals
+    # 2**(ceil(Log_2(2 * x_len))).  Note: Log_2(X) = Log_e(X) / Log_e(2).
+    x_len_float64 = math_ops.cast(x_len, np.float64)
+    target_length = math_ops.pow(
+        np.float64(2.),
+        math_ops.ceil(math_ops.log(x_len_float64 * 2) / np.log(2.)))
+    pad_length = math_ops.cast(target_length - x_len_float64, np.int32)
+
+    # We should have:
+    # x_rotated_pad.shape = x_rotated.shape[:-1] + [T + pad_length]
+    #                     = B + [T + pad_length]
+    x_rotated_pad = util.pad(x_rotated, axis=-1, back=True, count=pad_length)
+
+    dtype = x.dtype
+    if not dtype.is_complex:
+      if not dtype.is_floating:
+        raise TypeError("Argument x must have either float or complex dtype"
+                        " found: {}".format(dtype))
+      x_rotated_pad = math_ops.complex(x_rotated_pad,
+                                       dtype.real_dtype.as_numpy_dtype(0.))
+
+    # Autocorrelation is IFFT of power-spectral density (up to some scaling).
+    fft_x_rotated_pad = spectral_ops.fft(x_rotated_pad)
+    spectral_density = fft_x_rotated_pad * math_ops.conj(fft_x_rotated_pad)
+    # shifted_product is R[m] from above detailed explanation.
+    # It is the inner product sum_n X[n] * Conj(X[n - m]).
+    shifted_product = spectral_ops.ifft(spectral_density)
+
+    # Cast back to real-valued if x was real to begin with.
+    shifted_product = math_ops.cast(shifted_product, dtype)
+
+    # Figure out if we can deduce the final static shape, and set max_lags.
+    # Use x_rotated as a reference, because it has the time dimension in the far
+    # right, and was created before we performed all sorts of crazy shape
+    # manipulations.
+    know_static_shape = True
+    if not x_rotated.shape.is_fully_defined():
+      know_static_shape = False
+    if max_lags is None:
+      max_lags = x_len - 1
+    else:
+      max_lags = ops.convert_to_tensor(max_lags, name="max_lags")
+      max_lags_ = tensor_util.constant_value(max_lags)
+      if max_lags_ is None or not know_static_shape:
+        know_static_shape = False
+        max_lags = math_ops.minimum(x_len - 1, max_lags)
+      else:
+        max_lags = min(x_len - 1, max_lags_)
+
+    # Chop off the padding.
+    # We allow users to provide a huge max_lags, but cut it off here.
+    # shifted_product_chopped.shape = x_rotated.shape[:-1] + [max_lags]
+    shifted_product_chopped = shifted_product[..., :max_lags + 1]
+
+    # If possible, set shape.
+    if know_static_shape:
+      chopped_shape = x_rotated.shape.as_list()
+      chopped_shape[-1] = min(x_len, max_lags + 1)
+      shifted_product_chopped.set_shape(chopped_shape)
+
+    # Recall R[m] is a sum of N / 2 - m nonzero terms x[n] Conj(x[n - m]).  The
+    # other terms were zeros arising only due to zero padding.
+    # `denominator = (N / 2 - m)` (defined below) is the proper term to
+    # divide by by to make this an unbiased estimate of the expectation
+    # E[X[n] Conj(X[n - m])].
+    x_len = math_ops.cast(x_len, dtype.real_dtype)
+    max_lags = math_ops.cast(max_lags, dtype.real_dtype)
+    denominator = x_len - math_ops.range(0., max_lags + 1.)
+    denominator = math_ops.cast(denominator, dtype)
+    shifted_product_rotated = shifted_product_chopped / denominator
+
+    if normalize:
+      shifted_product_rotated /= shifted_product_rotated[..., :1]
+
+    # Transpose dimensions back to those of x.
+    return util.rotate_transpose(shifted_product_rotated, -shift)
+
+
 # TODO(langmore) To make equivalent to numpy.percentile:
 #  Make work with a sequence of floats or single float for 'q'.
 #  Make work with "linear", "midpoint" interpolation. (linear should be default)
diff --git a/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py b/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
index b05f15771a3a94779ffddea8f16ad2fa4ea2fdd1..c4b8f055b7fbc3f0835b503eddd7617610326d8c 100644
--- a/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
+++ b/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
@@ -115,7 +115,7 @@ class SinhArcsinh(transformed_distribution.TransformedDistribution):
       tailweight:  Tailweight parameter. Default is `1.0` (unchanged tailweight)
       distribution: `tf.Distribution`-like instance. Distribution that is
         transformed to produce this distribution.
-        Default is `ds.Normal(0., 1.)`.
+        Default is `tf.distributions.Normal(0., 1.)`.
         Must be a scalar-batch, scalar-event distribution.  Typically
         `distribution.reparameterization_type = FULLY_REPARAMETERIZED` or it is
         a function of non-trainable parameters. WARNING: If you backprop through
diff --git a/tensorflow/contrib/distributions/python/ops/test_util.py b/tensorflow/contrib/distributions/python/ops/test_util.py
index 77f2a39273dc365a4ac202d846dd2bc364655c86..15b0820cbdf560e04a304c40a47e541006523b6d 100644
--- a/tensorflow/contrib/distributions/python/ops/test_util.py
+++ b/tensorflow/contrib/distributions/python/ops/test_util.py
@@ -40,6 +40,7 @@ class DiscreteScalarDistributionTestHelpers(object):
   def run_test_sample_consistent_log_prob(
       self, sess_run_fn, dist,
       num_samples=int(1e5), num_threshold=int(1e3), seed=42,
+      batch_size=None,
       rtol=1e-2, atol=0.):
     """Tests that sample/log_prob are consistent with each other.
 
@@ -66,6 +67,8 @@ class DiscreteScalarDistributionTestHelpers(object):
       seed: Python `int` indicating the seed to use when sampling from `dist`.
         In general it is not recommended to use `None` during a test as this
         increases the likelihood of spurious test failure.
+      batch_size: Hint for unpacking result of samples. Default: `None` means
+        batch_size is inferred.
       rtol: Python `float`-type indicating the admissible relative error between
         analytical and sample statistics.
       atol: Python `float`-type indicating the admissible absolute error between
@@ -80,10 +83,11 @@ class DiscreteScalarDistributionTestHelpers(object):
     # Histogram only supports vectors so we call it once per batch coordinate.
     y = dist.sample(num_samples, seed=seed)
     y = array_ops.reshape(y, shape=[num_samples, -1])
-    batch_size = math_ops.reduce_prod(dist.batch_shape_tensor())
+    if batch_size is None:
+      batch_size = math_ops.reduce_prod(dist.batch_shape_tensor())
     batch_dims = array_ops.shape(dist.batch_shape_tensor())[0]
     edges_expanded_shape = 1 + array_ops.pad([-2], paddings=[[0, batch_dims]])
-    for b, x in enumerate(array_ops.unstack(y, axis=1)):
+    for b, x in enumerate(array_ops.unstack(y, num=batch_size, axis=1)):
       counts, edges = self.histogram(x)
       edges = array_ops.reshape(edges, edges_expanded_shape)
       probs = math_ops.exp(dist.log_prob(edges))
@@ -323,7 +327,7 @@ class VectorDistributionTestHelpers(object):
       num_samples=int(1e5),
       seed=24,
       rtol=1e-2,
-      atol=0.,
+      atol=0.1,
       cov_rtol=None,
       cov_atol=None):
     """Tests that sample/mean/covariance are consistent with each other.
diff --git a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
index 92043d6a08833888c36009261addca0d14949ea8..0c747f8e68529484ae6f695b8500cde74857bb11 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
@@ -22,141 +22,237 @@ import numpy as np
 
 from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.contrib.distributions.python.ops.bijectors.affine_linear_operator import AffineLinearOperator
+from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered import SoftmaxCentered
 from tensorflow.contrib.linalg.python.ops import linear_operator_addition as linop_add_lib
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops.distributions import categorical as categorical_lib
 from tensorflow.python.ops.distributions import distribution as distribution_lib
+from tensorflow.python.ops.distributions import normal as normal_lib
 from tensorflow.python.ops.linalg import linear_operator_diag as linop_diag_lib
 from tensorflow.python.ops.linalg import linear_operator_full_matrix as linop_full_lib
 from tensorflow.python.ops.linalg import linear_operator_identity as linop_identity_lib
 from tensorflow.python.ops.linalg import linear_operator_lower_triangular as linop_tril_lib
 
-static_value = distribution_util.static_value
-
 
 __all__ = [
     "VectorDiffeomixture",
+    "quadrature_scheme_softmaxnormal_gauss_hermite",
+    "quadrature_scheme_softmaxnormal_quantiles",
 ]
 
 
-class VectorDiffeomixture(distribution_lib.Distribution):
-  """VectorDiffeomixture distribution.
+def quadrature_scheme_softmaxnormal_gauss_hermite(
+    normal_loc, normal_scale, quadrature_size,
+    validate_args=False, name=None):
+  """Use Gauss-Hermite quadrature to form quadrature on `K - 1` simplex.
 
-  The VectorDiffeomixture is an approximation to a [compound distribution](
-  https://en.wikipedia.org/wiki/Compound_probability_distribution), i.e.,
+  A `SoftmaxNormal` random variable `Y` may be generated via
 
-  ```none
-  p(x) = int_{X} q(x | v) p(v) dv
-       = lim_{Q->infty} sum{ prob[i] q(x | loc=sum_k^K lambda[k;i] loc[k],
-                                            scale=sum_k^K lambda[k;i] scale[k])
-                            : i=0, ..., Q-1 }
+  ```
+  Y = SoftmaxCentered(X),
+  X = Normal(normal_loc, normal_scale)
   ```
 
-  where `q(x | v)` is a vector version of the `distribution` argument and `p(v)`
-  is a SoftmaxNormal parameterized by `mix_loc` and `mix_scale`. The
-  vector-ization of `distribution` entails an affine transformation of iid
-  samples from `distribution`.  The `prob` term is from quadrature and
-  `lambda[k] = sigmoid(mix_loc[k] + sqrt(2) mix_scale[k] grid[k])` where the
-  `grid` points correspond to the `prob`s.
-
-  In the non-approximation case, a draw from the mixture distribution (the
-  "prior") represents the convex weights for different affine transformations.
-  I.e., draw a mixing vector `v` (from the `K-1`-simplex) and let the final
-  sample be: `y = (sum_k^K v[k] scale[k]) @ x + (sum_k^K v[k] loc[k])` where `@`
-  denotes matrix multiplication.  However, the non-approximate distribution does
-  not have an analytical probability density function (pdf). Therefore the
-  `VectorDiffeomixture` class implements an approximation based on
-  [numerical quadrature](
-  https://en.wikipedia.org/wiki/Numerical_integration) (default:
-  [Gauss--Hermite quadrature](
-  https://en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature)). I.e., in
-  Note: although the `VectorDiffeomixture` is approximately the
-  `SoftmaxNormal-Distribution` compound distribution, it is itself a valid
-  distribution. It possesses a `sample`, `log_prob`, `mean`, `covariance` which
-  are all mutually consistent.
-
-  #### Intended Use
-
-  This distribution is noteworthy because it implements a mixture of
-  `Vector`-ized distributions yet has samples differentiable in the
-  distribution's parameters (aka "reparameterized"). It has an analytical
-  density function with `O(dKQ)` complexity. `d` is the vector dimensionality,
-  `K` is the number of components, and `Q` is the number of quadrature points.
-  These properties make it well-suited for Bayesian Variational Inference, i.e.,
-  as a surrogate family for the posterior.
-
-  For large values of `mix_scale`, the `VectorDistribution` behaves increasingly
-  like a discrete mixture. (In most cases this limit is only achievable by also
-  increasing the quadrature polynomial degree, `Q`.)
-
-  The term `Vector` is consistent with similar named Tensorflow `Distribution`s.
-  For more details, see the "About `Vector` distributions in Tensorflow."
-  section.
-
-  The term `Diffeomixture` is a portmanteau of
-  [diffeomorphism](https://en.wikipedia.org/wiki/Diffeomorphism) and [compound
-  mixture](https://en.wikipedia.org/wiki/Compound_probability_distribution). For
-  more details, see the "About `Diffeomixture`s and reparametrization.`"
-  section.
-
-  #### Mathematical Details
-
-  The `VectorDiffeomixture` approximates a SoftmaxNormal-mixed ("prior")
-  [compound distribution](
-  https://en.wikipedia.org/wiki/Compound_probability_distribution).
-  Using variable-substitution and [numerical quadrature](
-  https://en.wikipedia.org/wiki/Numerical_integration) (default:
-  [Gauss--Hermite quadrature](
-  https://en.wikipedia.org/wiki/Gauss%E2%80%93Hermite_quadrature)) we can
-  redefine the distribution to be a parameter-less convex combination of `K`
-  different affine combinations of a `d` iid samples from `distribution`.
-
-  That is, defined over `R**d` this distribution is parameterized by a
-  (batch of) length-`K` `mix_loc` and `mix_scale` vectors, a length-`K` list of
-  (a batch of) length-`d` `loc` vectors, and a length-`K` list of `scale`
-  `LinearOperator`s each operating on a (batch of) length-`d` vector space.
-  Finally, a `distribution` parameter specifies the underlying base distribution
-  which is "lifted" to become multivariate ("lifting" is the same concept as in
-  `TransformedDistribution`).
-
-  The probability density function (pdf) is,
+  Note: for a given `quadrature_size`, this method is generally less accurate
+  than `quadrature_scheme_softmaxnormal_quantiles`.
+
+  Args:
+    normal_loc: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`, B>=0.
+      The location parameter of the Normal used to construct the SoftmaxNormal.
+    normal_scale: `float`-like `Tensor`. Broadcastable with `normal_loc`.
+      The scale parameter of the Normal used to construct the SoftmaxNormal.
+    quadrature_size: Python `int` scalar representing the number of quadrature
+      points.
+    validate_args: Python `bool`, default `False`. When `True` distribution
+      parameters are checked for validity despite possibly degrading runtime
+      performance. When `False` invalid inputs may silently render incorrect
+      outputs.
+    name: Python `str` name prefixed to Ops created by this class.
+
+  Returns:
+    grid: Shape `[b1, ..., bB, K, quadrature_size]` `Tensor` representing the
+      convex combination of affine parameters for `K` components.
+      `grid[..., :, n]` is the `n`-th grid point, living in the `K - 1` simplex.
+    probs:  Shape `[b1, ..., bB, K, quadrature_size]` `Tensor` representing the
+      associated with each grid point.
+  """
+  with ops.name_scope(name, "quadrature_scheme_softmaxnormal_gauss_hermite",
+                      [normal_loc, normal_scale]):
+    normal_loc = ops.convert_to_tensor(normal_loc, name="normal_loc")
+    dt = normal_loc.dtype.base_dtype
+    normal_scale = ops.convert_to_tensor(
+        normal_scale, dtype=dt, name="normal_scale")
 
-  ```none
-  pdf(y; mix_loc, mix_scale, loc, scale, phi)
-    = sum{ prob[i] phi(f_inverse(x; i)) / abs(det(interp_scale[i]))
-          : i=0, ..., Q-1 }
+    normal_scale = maybe_check_quadrature_param(
+        normal_scale, "normal_scale", validate_args)
+
+    grid, probs = np.polynomial.hermite.hermgauss(deg=quadrature_size)
+    grid = grid.astype(dt.dtype.as_numpy_dtype)
+    probs = probs.astype(dt.dtype.as_numpy_dtype)
+    probs /= np.linalg.norm(probs, ord=1, keepdims=True)
+    probs = ops.convert_to_tensor(probs, name="probs", dtype=dt)
+
+    grid = softmax(
+        -distribution_util.pad(
+            (normal_loc[..., array_ops.newaxis] +
+             np.sqrt(2.) * normal_scale[..., array_ops.newaxis] * grid),
+            axis=-2,
+            front=True),
+        axis=-2)  # shape: [B, components, deg]
+
+    return grid, probs
+
+
+def quadrature_scheme_softmaxnormal_quantiles(
+    normal_loc, normal_scale, quadrature_size,
+    validate_args=False, name=None):
+  """Use SoftmaxNormal quantiles to form quadrature on `K - 1` simplex.
+
+  A `SoftmaxNormal` random variable `Y` may be generated via
+
+  ```
+  Y = SoftmaxCentered(X),
+  X = Normal(normal_loc, normal_scale)
   ```
 
-  where, `phi` is the base distribution pdf, and,
+  Args:
+    normal_loc: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`, B>=0.
+      The location parameter of the Normal used to construct the SoftmaxNormal.
+    normal_scale: `float`-like `Tensor`. Broadcastable with `normal_loc`.
+      The scale parameter of the Normal used to construct the SoftmaxNormal.
+    quadrature_size: Python `int` scalar representing the number of quadrature
+      points.
+    validate_args: Python `bool`, default `False`. When `True` distribution
+      parameters are checked for validity despite possibly degrading runtime
+      performance. When `False` invalid inputs may silently render incorrect
+      outputs.
+    name: Python `str` name prefixed to Ops created by this class.
+
+  Returns:
+    grid: Shape `[b1, ..., bB, K, quadrature_size]` `Tensor` representing the
+      convex combination of affine parameters for `K` components.
+      `grid[..., :, n]` is the `n`-th grid point, living in the `K - 1` simplex.
+    probs:  Shape `[b1, ..., bB, K, quadrature_size]` `Tensor` representing the
+      associated with each grid point.
+  """
+  with ops.name_scope(name, "softmax_normal_grid_and_probs",
+                      [normal_loc, normal_scale]):
+    normal_loc = ops.convert_to_tensor(normal_loc, name="normal_loc")
+    dt = normal_loc.dtype.base_dtype
+    normal_scale = ops.convert_to_tensor(
+        normal_scale, dtype=dt, name="normal_scale")
+
+    normal_scale = maybe_check_quadrature_param(
+        normal_scale, "normal_scale", validate_args)
+
+    dist = normal_lib.Normal(loc=normal_loc, scale=normal_scale)
+
+    def _get_batch_ndims():
+      """Helper to get dist.batch_shape.ndims, statically if possible."""
+      ndims = dist.batch_shape.ndims
+      if ndims is None:
+        ndims = array_ops.shape(dist.batch_shape_tensor())[0]
+      return ndims
+    batch_ndims = _get_batch_ndims()
+
+    def _get_final_shape(qs):
+      """Helper to build `TensorShape`."""
+      bs = dist.batch_shape.with_rank_at_least(1)
+      num_components = bs[-1].value
+      if num_components is not None:
+        num_components += 1
+      tail = tensor_shape.TensorShape([num_components, qs])
+      return bs[:-1].concatenate(tail)
+
+    def _compute_quantiles():
+      """Helper to build quantiles."""
+      # Omit {0, 1} since they might lead to Inf/NaN.
+      zero = array_ops.zeros([], dtype=dist.dtype)
+      edges = math_ops.linspace(zero, 1., quadrature_size + 3)[1:-1]
+      # Expand edges so its broadcast across batch dims.
+      edges = array_ops.reshape(edges, shape=array_ops.concat([
+          [-1], array_ops.ones([batch_ndims], dtype=dtypes.int32)], axis=0))
+      quantiles = dist.quantile(edges)
+      quantiles = SoftmaxCentered(event_ndims=1).forward(quantiles)
+      # Cyclically permute left by one.
+      perm = array_ops.concat([
+          math_ops.range(1, 1 + batch_ndims), [0]], axis=0)
+      quantiles = array_ops.transpose(quantiles, perm)
+      quantiles.set_shape(_get_final_shape(quadrature_size + 1))
+      return quantiles
+    quantiles = _compute_quantiles()
+
+    # Compute grid as quantile midpoints.
+    grid = (quantiles[..., :-1] + quantiles[..., 1:]) / 2.
+    # Set shape hints.
+    grid.set_shape(_get_final_shape(quadrature_size))
+
+    # By construction probs is constant, i.e., `1 / quadrature_size`. This is
+    # important, because non-constant probs leads to non-reparameterizable
+    # samples.
+    probs = array_ops.fill(
+        dims=[quadrature_size],
+        value=1. / math_ops.cast(quadrature_size, dist.dtype))
+
+    return grid, probs
+
+
+class VectorDiffeomixture(distribution_lib.Distribution):
+  """VectorDiffeomixture distribution.
+
+  A vector diffeomixture (VDM) is a distribution parameterized by a convex
+  combination of `K` component `loc` vectors, `loc[k], k = 0,...,K-1`, and `K`
+  `scale` matrices `scale[k], k = 0,..., K-1`.  It approximates the following
+  [compound distribution]
+  (https://en.wikipedia.org/wiki/Compound_probability_distribution)
 
   ```none
-  f_inverse(x; i) = inv(interp_scale[i]) @ (x - interp_loc[i])
-  interp_loc[i]   = sum{ lambda[k; i] loc[k]   : k=0, ..., K-1 }
-  interp_scale[i] = sum{ lambda[k; i] scale[k] : k=0, ..., K-1 }
+  p(x) = int p(x | z) p(z) dz,
+  where z is in the K-simplex, and
+  p(x | z) := p(x | loc=sum_k z[k] loc[k], scale=sum_k z[k] scale[k])
   ```
 
-  and,
+  The integral `int p(x | z) p(z) dz` is approximated with a quadrature scheme
+  adapted to the mixture density `p(z)`.  The `N` quadrature points `z_{N, n}`
+  and weights `w_{N, n}` (which are non-negative and sum to 1) are chosen
+  such that
 
-  ```none
-  grid, weight = np.polynomial.hermite.hermgauss(quadrature_size)
-  prob[k]   = weight[k] / sqrt(pi)
-  lambda[k; i] = sigmoid(mix_loc[k] + sqrt(2) mix_scale[k] grid[i])
+  ```q_N(x) := sum_{n=1}^N w_{n, N} p(x | z_{N, n}) --> p(x)```
+
+  as `N --> infinity`.
+
+  Since `q_N(x)` is in fact a mixture (of `N` points), we may sample from
+  `q_N` exactly.  It is important to note that the VDM is *defined* as `q_N`
+  above, and *not* `p(x)`.  Therefore, sampling and pdf may be implemented as
+  exact (up to floating point error) methods.
+
+  A common choice for the conditional `p(x | z)` is a multivariate Normal.
+
+  The implemented marginal `p(z)` is the `SoftmaxNormal`, which is a
+  `K-1` dimensional Normal transformed by a `SoftmaxCentered` bijector, making
+  it a density on the `K`-simplex.  That is,
+
+  ```
+  Z = SoftmaxCentered(X),
+  X = Normal(mix_loc / temperature, 1 / temperature)
   ```
 
-  The distribution corresponding to `phi` must be a scalar-batch, scalar-event
-  distribution. Typically it is reparameterized. If not, it must be a function
-  of non-trainable parameters.
+  The default quadrature scheme chooses `z_{N, n}` as `N` midpoints of
+  the quantiles of `p(z)` (generalized quantiles if `K > 2`).
 
-  WARNING: If you backprop through a VectorDiffeomixture sample and the "base"
-  distribution is both: not `FULLY_REPARAMETERIZED` and a function of trainable
-  variables, then the gradient is not guaranteed correct!
+  See [1] for more details.
+
+  [1]. "Quadrature Compound: An approximating family of distributions"
+       Joshua Dillon, Ian Langmore, arXiv preprints
+       https://arxiv.org/abs/1801.03080
 
   #### About `Vector` distributions in TensorFlow.
 
@@ -164,12 +260,11 @@ class VectorDiffeomixture(distribution_lib.Distribution):
   particularly useful in [variational Bayesian
   methods](https://en.wikipedia.org/wiki/Variational_Bayesian_methods).
 
-  Conditioned on a draw from the SoftmaxNormal, `Y|v` is a vector whose
+  Conditioned on a draw from the SoftmaxNormal, `X|z` is a vector whose
   components are linear combinations of affine transformations, thus is itself
-  an affine transformation. Therefore `Y|v` lives in the vector space generated
-  by vectors of affine-transformed distributions.
+  an affine transformation.
 
-  Note: The marginals `Y_1|v, ..., Y_d|v` are *not* generally identical to some
+  Note: The marginals `X_1|v, ..., X_d|v` are *not* generally identical to some
   parameterization of `distribution`.  This is due to the fact that the sum of
   draws from `distribution` are not generally itself the same `distribution`.
 
@@ -185,32 +280,35 @@ class VectorDiffeomixture(distribution_lib.Distribution):
   optimize Monte-Carlo objectives. Such objectives are a finite-sample
   approximation of an expectation and arise throughout scientific computing.
 
+  WARNING: If you backprop through a VectorDiffeomixture sample and the "base"
+  distribution is both: not `FULLY_REPARAMETERIZED` and a function of trainable
+  variables, then the gradient is not guaranteed correct!
+
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
-  la = tf.linalg
+  tfd = tf.contrib.distributions
 
-  # Create two batches of VectorDiffeomixtures, one with mix_loc=[0.] and
+  # Create two batches of VectorDiffeomixtures, one with mix_loc=[0.],
   # another with mix_loc=[1]. In both cases, `K=2` and the affine
   # transformations involve:
   # k=0: loc=zeros(dims)  scale=LinearOperatorScaledIdentity
   # k=1: loc=[2.]*dims    scale=LinOpDiag
   dims = 5
-  vdm = ds.VectorDiffeomixture(
+  vdm = tfd.VectorDiffeomixture(
       mix_loc=[[0.], [1]],
-      mix_scale=[1.],
-      distribution=ds.Normal(loc=0., scale=1.),
+      temperature=[1.],
+      distribution=tfd.Normal(loc=0., scale=1.),
       loc=[
           None,  # Equivalent to `np.zeros(dims, dtype=np.float32)`.
           np.float32([2.]*dims),
       ],
       scale=[
-          la.LinearOperatorScaledIdentity(
+          tf.linalg.LinearOperatorScaledIdentity(
             num_rows=dims,
             multiplier=np.float32(1.1),
             is_positive_definite=True),
-          la.LinearOperatorDiag(
+          tf.linalg.LinearOperatorDiag(
             diag=np.linspace(2.5, 3.5, dims, dtype=np.float32),
             is_positive_definite=True),
       ],
@@ -219,21 +317,33 @@ class VectorDiffeomixture(distribution_lib.Distribution):
 
   def __init__(self,
                mix_loc,
-               mix_scale,
+               temperature,
                distribution,
                loc=None,
                scale=None,
-               quadrature_grid_and_probs=None,
+               quadrature_size=8,
+               quadrature_fn=quadrature_scheme_softmaxnormal_quantiles,
                validate_args=False,
                allow_nan_stats=True,
                name="VectorDiffeomixture"):
-    """Constructs the VectorDiffeomixture on `R**k`.
+    """Constructs the VectorDiffeomixture on `R^d`.
+
+    The vector diffeomixture (VDM) approximates the compound distribution
+
+    ```none
+    p(x) = int p(x | z) p(z) dz,
+    where z is in the K-simplex, and
+    p(x | z) := p(x | loc=sum_k z[k] loc[k], scale=sum_k z[k] scale[k])
+    ```
 
     Args:
-      mix_loc: `float`-like `Tensor`. Represents the `location` parameter of the
-        SoftmaxNormal used for selecting one of the `K` affine transformations.
-      mix_scale: `float`-like `Tensor`. Represents the `scale` parameter of the
-        SoftmaxNormal used for selecting one of the `K` affine transformations.
+      mix_loc: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`.
+        In terms of samples, larger `mix_loc[..., k]` ==>
+        `Z` is more likely to put more weight on its `kth` component.
+      temperature: `float`-like `Tensor`. Broadcastable with `mix_loc`.
+        In terms of samples, smaller `temperature` means one component is more
+        likely to dominate.  I.e., smaller `temperature` makes the VDM look more
+        like a standard mixture of `K` components.
       distribution: `tf.Distribution`-like instance. Distribution from which `d`
         iid samples are used as input to the selected affine transformation.
         Must be a scalar-batch, scalar-event distribution.  Typically
@@ -252,10 +362,14 @@ class VectorDiffeomixture(distribution_lib.Distribution):
         `k`-th element represents the `scale` used for the `k`-th affine
         transformation. `LinearOperator`s must have shape `[B1, ..., Bb, d, d]`,
         `b >= 0`, i.e., characterizes `b`-batches of `d x d` matrices
-      quadrature_grid_and_probs: Python pair of `float`-like `Tensor`s
-        representing the sample points and the corresponding (possibly
-        normalized) weight.  When `None`, defaults to:
-        `np.polynomial.hermite.hermgauss(deg=8)`.
+      quadrature_size: Python `int` scalar representing number of
+        quadrature points.  Larger `quadrature_size` means `q_N(x)` better
+        approximates `p(x)`.
+      quadrature_fn: Python callable taking `normal_loc`, `normal_scale`,
+        `quadrature_size`, `validate_args` and returning `tuple(grid, probs)`
+        representing the SoftmaxNormal grid and corresponding normalized weight.
+        normalized) weight.
+        Default value: `quadrature_scheme_softmaxnormal_quantiles`.
       validate_args: Python `bool`, default `False`. When `True` distribution
         parameters are checked for validity despite possibly degrading runtime
         performance. When `False` invalid inputs may silently render incorrect
@@ -279,7 +393,7 @@ class VectorDiffeomixture(distribution_lib.Distribution):
       ValueError: if `not distribution.is_scalar_event`.
     """
     parameters = locals()
-    with ops.name_scope(name, values=[mix_loc, mix_scale]):
+    with ops.name_scope(name, values=[mix_loc, temperature]):
       if not scale or len(scale) < 2:
         raise ValueError("Must specify list (or list-like object) of scale "
                          "LinearOperators, one for each component with "
@@ -322,11 +436,15 @@ class VectorDiffeomixture(distribution_lib.Distribution):
         raise NotImplementedError("Currently only bimixtures are supported; "
                                   "len(scale)={} is not 2.".format(len(scale)))
 
-      grid, probs = distribution_util.process_quadrature_grid_and_probs(
-          quadrature_grid_and_probs, dtype, validate_args)
-      self._quadrature_grid = grid
-      self._quadrature_probs = probs
-      self._quadrature_size = distribution_util.dimension_size(probs, axis=0)
+      mix_loc = ops.convert_to_tensor(
+          mix_loc, dtype=dtype, name="mix_loc")
+      temperature = ops.convert_to_tensor(
+          temperature, dtype=dtype, name="temperature")
+      self._grid, probs = tuple(quadrature_fn(
+          mix_loc / temperature,
+          1. / temperature,
+          quadrature_size,
+          validate_args))
 
       # Note: by creating the logits as `log(prob)` we ensure that
       # `self.mixture_distribution.logits` is equivalent to
@@ -336,22 +454,13 @@ class VectorDiffeomixture(distribution_lib.Distribution):
           validate_args=validate_args,
           allow_nan_stats=allow_nan_stats)
 
-      mix_loc = maybe_check_mix_param(
-          mix_loc, "mix_loc", dtype, validate_args)
-      mix_scale = maybe_check_mix_param(
-          mix_scale, "mix_scale", dtype, validate_args)
-
       asserts = distribution_util.maybe_check_scalar_distribution(
           distribution, dtype, validate_args)
       if asserts:
-        mix_loc = control_flow_ops.with_dependencies(asserts, mix_loc)
+        self._grid = control_flow_ops.with_dependencies(
+            asserts, self._grid)
       self._distribution = distribution
 
-      # shape: [B, deg]
-      self._interpolate_weight = math_ops.sigmoid(
-          mix_loc
-          + np.sqrt(2.) * mix_scale * grid)
-
       self._interpolated_affine = [
           AffineLinearOperator(shift=loc_,
                                scale=scale_,
@@ -359,15 +468,16 @@ class VectorDiffeomixture(distribution_lib.Distribution):
                                validate_args=validate_args,
                                name="interpolated_affine_{}".format(k))
           for k, (loc_, scale_) in enumerate(zip(
-              interpolate_loc(self._quadrature_size,
-                              self._interpolate_weight,
-                              loc),
-              interpolate_scale(self._quadrature_size,
-                                self._interpolate_weight,
-                                scale)))]
+              interpolate_loc(self._grid, loc),
+              interpolate_scale(self._grid, scale)))]
 
-      self._batch_shape_, self._event_shape_ = determine_batch_event_shapes(
-          mix_loc, mix_scale, self._endpoint_affine)
+      [
+          self._batch_shape_,
+          self._batch_shape_tensor_,
+          self._event_shape_,
+          self._event_shape_tensor_,
+      ] = determine_batch_event_shapes(self._grid,
+                                       self._endpoint_affine)
 
       super(VectorDiffeomixture, self).__init__(
           dtype=dtype,
@@ -386,8 +496,7 @@ class VectorDiffeomixture(distribution_lib.Distribution):
           allow_nan_stats=allow_nan_stats,
           parameters=parameters,
           graph_parents=(
-              [mix_loc, mix_scale]
-              + distribution._graph_parents  # pylint: disable=protected-access
+              distribution._graph_parents  # pylint: disable=protected-access
               + [loc_ for loc_ in loc if loc_ is not None]
               + [p for scale_ in scale for p in scale_.graph_parents]),
           name=name)
@@ -403,9 +512,9 @@ class VectorDiffeomixture(distribution_lib.Distribution):
     return self._distribution
 
   @property
-  def interpolate_weight(self):
+  def grid(self):
     """Grid of mixing probabilities, one for each grid point."""
-    return self._interpolate_weight
+    return self._grid
 
   @property
   def endpoint_affine(self):
@@ -417,27 +526,17 @@ class VectorDiffeomixture(distribution_lib.Distribution):
     """Affine transformation for each convex combination of `K` components."""
     return self._interpolated_affine
 
-  @property
-  def quadrature_grid(self):
-    """Quadrature grid points."""
-    return self._quadrature_grid
-
-  @property
-  def quadrature_probs(self):
-    """Quadrature normalized weights."""
-    return self._quadrature_probs
-
   def _batch_shape_tensor(self):
-    return self._batch_shape_
+    return self._batch_shape_tensor_
 
   def _batch_shape(self):
-    return tensor_shape.TensorShape(static_value(self._batch_shape_))
+    return self._batch_shape_
 
   def _event_shape_tensor(self):
-    return self._event_shape_
+    return self._event_shape_tensor_
 
   def _event_shape(self):
-    return tensor_shape.TensorShape(static_value(self._event_shape_))
+    return self._event_shape_
 
   def _sample_n(self, n, seed=None):
     x = self.distribution.sample(
@@ -450,27 +549,53 @@ class VectorDiffeomixture(distribution_lib.Distribution):
 
     # Get ids as a [n, batch_size]-shaped matrix, unless batch_shape=[] then get
     # ids as a [n]-shaped vector.
-    batch_size = reduce_prod(self.batch_shape_tensor())
-    ids = self._mixture_distribution.sample(
+    batch_size = self.batch_shape.num_elements()
+    if batch_size is None:
+      batch_size = array_ops.reduce_prod(self.batch_shape_tensor())
+    mix_batch_size = self.mixture_distribution.batch_shape.num_elements()
+    if mix_batch_size is None:
+      mix_batch_size = math_ops.reduce_prod(
+          self.mixture_distribution.batch_shape_tensor())
+    ids = self.mixture_distribution.sample(
         sample_shape=concat_vectors(
             [n],
             distribution_util.pick_vector(
                 self.is_scalar_batch(),
                 np.int32([]),
-                [batch_size])),
+                [batch_size // mix_batch_size])),
         seed=distribution_util.gen_new_seed(
             seed, "vector_diffeomixture"))
-
-    # Stride `quadrature_size` for `batch_size` number of times.
+    # We need to flatten batch dims in case mixture_distribution has its own
+    # batch dims.
+    ids = array_ops.reshape(ids, shape=concat_vectors(
+        [n],
+        distribution_util.pick_vector(
+            self.is_scalar_batch(),
+            np.int32([]),
+            np.int32([-1]))))
+
+    # Stride `components * quadrature_size` for `batch_size` number of times.
+    stride = self.grid.shape.with_rank_at_least(
+        2)[-2:].num_elements()
+    if stride is None:
+      stride = array_ops.reduce_prod(
+          array_ops.shape(self.grid)[-2:])
     offset = math_ops.range(start=0,
-                            limit=batch_size * self._quadrature_size,
-                            delta=self._quadrature_size,
+                            limit=batch_size * stride,
+                            delta=stride,
                             dtype=ids.dtype)
 
     weight = array_ops.gather(
-        array_ops.reshape(self.interpolate_weight, shape=[-1]),
+        array_ops.reshape(self.grid, shape=[-1]),
         ids + offset)
-    weight = weight[..., array_ops.newaxis]
+    # At this point, weight flattened all batch dims into one.
+    # We also need to append a singleton to broadcast with event dims.
+    if self.batch_shape.is_fully_defined():
+      new_shape = [-1] + self.batch_shape.as_list() + [1]
+    else:
+      new_shape = array_ops.concat(
+          ([-1], self.batch_shape_tensor(), [1]), axis=0)
+    weight = array_ops.reshape(weight, shape=new_shape)
 
     if len(x) != 2:
       # We actually should have already triggered this exception. However as a
@@ -500,10 +625,7 @@ class VectorDiffeomixture(distribution_lib.Distribution):
         self.mixture_distribution.logits - fldj + log_prob, axis=-1)
 
   def _mean(self):
-    # Since we created logits to already be scaled, we can use exp which is
-    # slightly cheaper than `self.mixture_distribution.probs`.
-    p = math_ops.exp(self.mixture_distribution.logits)
-
+    p = self._expand_mix_distribution_probs()
     m = self._expand_base_distribution_mean()
     mean = None
     for k, aff in enumerate(self.interpolated_affine):
@@ -537,13 +659,11 @@ class VectorDiffeomixture(distribution_lib.Distribution):
         self._covariance_of_mean_given_quadrature_component(diag_only=True))
 
   def _mean_of_covariance_given_quadrature_component(self, diag_only):
-    # Since we created logits to already be scaled, we can use exp which is
-    # slightly cheaper than `self.mixture_distribution.probs`.
-    p = math_ops.exp(self.mixture_distribution.logits)
+    p = self.mixture_distribution.probs
 
     # To compute E[Cov(Z|V)], we'll add matrices within three categories:
     # scaled-identity, diagonal, and full. Then we'll combine these at the end.
-    scaled_identity = None
+    scale_identity_multiplier = None
     diag = None
     full = None
 
@@ -551,10 +671,12 @@ class VectorDiffeomixture(distribution_lib.Distribution):
       s = aff.scale  # Just in case aff.scale has side-effects, we'll call once.
       if (s is None
           or isinstance(s, linop_identity_lib.LinearOperatorIdentity)):
-        scaled_identity = add(scaled_identity, p[..., k, array_ops.newaxis])
+        scale_identity_multiplier = add(scale_identity_multiplier,
+                                        p[..., k, array_ops.newaxis])
       elif isinstance(s, linop_identity_lib.LinearOperatorScaledIdentity):
-        scaled_identity = add(scaled_identity, (p[..., k, array_ops.newaxis] *
-                                                math_ops.square(s.multiplier)))
+        scale_identity_multiplier = add(
+            scale_identity_multiplier,
+            (p[..., k, array_ops.newaxis] * math_ops.square(s.multiplier)))
       elif isinstance(s, linop_diag_lib.LinearOperatorDiag):
         diag = add(diag, (p[..., k, array_ops.newaxis] *
                           math_ops.square(s.diag_part())))
@@ -566,12 +688,13 @@ class VectorDiffeomixture(distribution_lib.Distribution):
         full = add(full, x)
 
     # We must now account for the fact that the base distribution might have a
-    # non-unity variance. Recall that `Cov(SX+m) = S.T Cov(X) S = S.T S Var(X)`.
+    # non-unity variance. Recall that, since X ~ iid Law(X_0),
+    #   `Cov(SX+m) = S Cov(X) S.T = S S.T Diag(Var(X_0))`.
     # We can scale by `Var(X)` (vs `Cov(X)`) since X corresponds to `d` iid
     # samples from a scalar-event distribution.
     v = self.distribution.variance()
-    if scaled_identity is not None:
-      scaled_identity *= v
+    if scale_identity_multiplier is not None:
+      scale_identity_multiplier *= v
     if diag is not None:
       diag *= v[..., array_ops.newaxis]
     if full is not None:
@@ -580,10 +703,10 @@ class VectorDiffeomixture(distribution_lib.Distribution):
     if diag_only:
       # Apparently we don't need the full matrix, just the diagonal.
       r = add(diag, full)
-      if r is None and scaled_identity is not None:
+      if r is None and scale_identity_multiplier is not None:
         ones = array_ops.ones(self.event_shape_tensor(), dtype=self.dtype)
-        return scaled_identity * ones
-      return add(r, scaled_identity)
+        return scale_identity_multiplier[..., array_ops.newaxis] * ones
+      return add(r, scale_identity_multiplier)
 
     # `None` indicates we don't know if the result is positive-definite.
     is_positive_definite = (True if all(aff.scale.is_positive_definite
@@ -599,10 +722,10 @@ class VectorDiffeomixture(distribution_lib.Distribution):
       to_add.append(linop_full_lib.LinearOperatorFullMatrix(
           matrix=full,
           is_positive_definite=is_positive_definite))
-    if scaled_identity is not None:
+    if scale_identity_multiplier is not None:
       to_add.append(linop_identity_lib.LinearOperatorScaledIdentity(
           num_rows=self.event_shape_tensor()[0],
-          multiplier=scaled_identity,
+          multiplier=scale_identity_multiplier,
           is_positive_definite=is_positive_definite))
 
     return (linop_add_lib.add_operators(to_add)[0].to_dense()
@@ -611,10 +734,9 @@ class VectorDiffeomixture(distribution_lib.Distribution):
   def _covariance_of_mean_given_quadrature_component(self, diag_only):
     square = math_ops.square if diag_only else vec_osquare
 
-    # Since we created logits to already be scaled, we can use exp which is
-    # slightly cheaper than `self.mixture_distribution.probs`.
-    p = math_ops.exp(self.mixture_distribution.logits)
-
+    p = self._expand_mix_distribution_probs()
+    if not diag_only:
+      p = p[..., array_ops.newaxis, :]  # Assuming event.ndims=1.
     m = self._expand_base_distribution_mean()
 
     cov_e_z_given_v = None
@@ -638,17 +760,25 @@ class VectorDiffeomixture(distribution_lib.Distribution):
     m.set_shape(self.batch_shape.concatenate(self.event_shape))
     return m
 
-
-def maybe_check_mix_param(param, name, expected_base_dtype, validate_args):
-  """Helper which checks validity of `mix_loc` and `mix_scale` init args."""
+  def _expand_mix_distribution_probs(self):
+    p = self.mixture_distribution.probs  # [B, deg]
+    deg = p.shape.with_rank_at_least(1)[-1].value
+    if deg is None:
+      deg = array_ops.shape(p)[-1]
+    event_ndims = self.event_shape.ndims
+    if event_ndims is None:
+      event_ndims = array_ops.shape(self.event_shape_tensor())[0]
+    expand_shape = array_ops.concat([
+        self.mixture_distribution.batch_shape_tensor(),
+        array_ops.ones([event_ndims], dtype=dtypes.int32),
+        [deg],
+    ], axis=0)
+    return array_ops.reshape(p, shape=expand_shape)
+
+
+def maybe_check_quadrature_param(param, name, validate_args):
+  """Helper which checks validity of `loc` and `scale` init args."""
   with ops.name_scope(name="check_" + name, values=[param]):
-    param = ops.convert_to_tensor(param, dtype=expected_base_dtype, name=name)
-
-    if param.dtype.base_dtype != expected_base_dtype:
-      raise TypeError(
-          "dtype mismatch; {}.base_dtype=\"{}\" is not \"{}\".".format(
-              name, param.dtype.base_dtype.name, expected_base_dtype.name))
-
     assertions = []
     if param.shape.ndims is not None:
       if param.shape.ndims == 0:
@@ -679,79 +809,84 @@ def maybe_check_mix_param(param, name, expected_base_dtype, validate_args):
     return param
 
 
-def determine_batch_event_shapes(mix_loc, mix_scale, endpoint_affine):
+def determine_batch_event_shapes(grid, endpoint_affine):
   """Helper to infer batch_shape and event_shape."""
   with ops.name_scope(name="determine_batch_event_shapes"):
-    mix_batch_shape = distribution_util.prefer_static_broadcast_shape(
-        array_ops.shape(mix_loc, name="mix_loc_shape"),
-        array_ops.shape(mix_scale, name="mix_scale_shape"))
-    if isinstance(mix_batch_shape, tensor_shape.TensorShape):
-      mix_batch_shape = mix_batch_shape.with_rank_at_least(1)[:-1]
-    else:
-      s = static_value(mix_batch_shape)
-      if s is not None:
-        mix_batch_shape = ops.convert_to_tensor(
-            s[:-1], dtype=dtypes.int32, name="mix_batch_shape")
-      else:
-        mix_batch_shape = mix_batch_shape[:-1]
-
-    # We broadcast with a 1D constant to automatically make the result a
-    # TensorShape if possible.
-    batch_shape = distribution_util.prefer_static_broadcast_shape(
-        mix_batch_shape,
-        constant_op.constant([], dtype=dtypes.int32, name="batch_shape"))
-    event_shape = constant_op.constant(
-        [], dtype=dtypes.int32, name="event_shape")
+    # grid  # shape: [B, k, q]
+    # endpoint_affine     # len=k, shape: [B, d, d]
+    batch_shape = grid.shape[:-2]
+    batch_shape_tensor = array_ops.shape(grid)[:-2]
+    event_shape = None
+    event_shape_tensor = None
+
+    def _set_event_shape(shape, shape_tensor):
+      if event_shape is None:
+        return shape, shape_tensor
+      return (array_ops.broadcast_static_shape(event_shape, shape),
+              array_ops.broadcast_dynamic_shape(
+                  event_shape_tensor, shape_tensor))
+
     for aff in endpoint_affine:
-      b, e = distribution_util.shapes_from_loc_and_scale(aff.shift, aff.scale)
-      if batch_shape is None:
-        batch_shape = distribution_util.prefer_static_broadcast_shape(
-            mix_batch_shape, b)
-      else:
-        batch_shape = distribution_util.prefer_static_broadcast_shape(
-            batch_shape, b)
-      event_shape = distribution_util.prefer_static_broadcast_shape(
-          event_shape, e)
-    if isinstance(batch_shape, tensor_shape.TensorShape):
-      batch_shape = ops.convert_to_tensor(
-          batch_shape.as_list(), dtype=dtypes.int32, name="batch_shape")
-    if isinstance(event_shape, tensor_shape.TensorShape):
-      event_shape = ops.convert_to_tensor(
-          event_shape.as_list(), dtype=dtypes.int32, name="event_shape")
-    return batch_shape, event_shape
-
-
-def interpolate_loc(deg, interpolate_weight, loc):
+      if aff.shift is not None:
+        batch_shape = array_ops.broadcast_static_shape(
+            batch_shape, aff.shift.shape[:-1])
+        batch_shape_tensor = array_ops.broadcast_dynamic_shape(
+            batch_shape_tensor, array_ops.shape(aff.shift)[:-1])
+        event_shape, event_shape_tensor = _set_event_shape(
+            aff.shift.shape[-1:], array_ops.shape(aff.shift)[-1:])
+
+      if aff.scale is not None:
+        batch_shape = array_ops.broadcast_static_shape(
+            batch_shape, aff.scale.batch_shape)
+        batch_shape_tensor = array_ops.broadcast_dynamic_shape(
+            batch_shape_tensor, aff.scale.batch_shape_tensor())
+        event_shape, event_shape_tensor = _set_event_shape(
+            tensor_shape.TensorShape([aff.scale.range_dimension]),
+            aff.scale.range_dimension_tensor()[array_ops.newaxis])
+
+    return batch_shape, batch_shape_tensor, event_shape, event_shape_tensor
+
+
+def interpolate_loc(grid, loc):
   """Helper which interpolates between two locs."""
   if len(loc) != 2:
     raise NotImplementedError("Currently only bimixtures are supported; "
                               "len(scale)={} is not 2.".format(len(loc)))
-  with ops.name_scope("interpolate_loc", values=[interpolate_weight, loc]):
+  deg = grid.shape.with_rank_at_least(1)[-1].value
+  if deg is None:
+    raise ValueError("Num quadrature grid points must be known prior "
+                     "to graph execution.")
+  with ops.name_scope("interpolate_loc", values=[grid, loc]):
     if loc is None or loc[0] is None and loc[1] is None:
       return [None]*deg
-    w = interpolate_weight[..., array_ops.newaxis, :]  # shape: [B, 1, deg]
+    # shape: [B, 1, k, deg]
+    w = grid[..., array_ops.newaxis, :, :]
     loc = [x[..., array_ops.newaxis]                   # shape: [B, e, 1]
            if x is not None else None for x in loc]
     if loc[0] is None:
-      x = (1. - w) * loc[1]                            # shape: [B, e, deg]
+      x = w[..., 1, :] * loc[1]                        # shape: [B, e, deg]
     elif loc[1] is None:
-      x = w * loc[0]                                   # shape: [B, e, deg]
+      x = w[..., 0, :] * loc[0]                        # shape: [B, e, deg]
     else:
       delta = loc[0] - loc[1]
-      x = w * delta + loc[1]                           # shape: [B, e, deg]
+      x = w[..., 0, :] * delta + loc[1]                # shape: [B, e, deg]
     return [x[..., k] for k in range(deg)]             # list(shape:[B, e])
 
 
-def interpolate_scale(deg, interpolate_weight, scale):
+def interpolate_scale(grid, scale):
   """Helper which interpolates between two scales."""
   if len(scale) != 2:
     raise NotImplementedError("Currently only bimixtures are supported; "
                               "len(scale)={} is not 2.".format(len(scale)))
-  with ops.name_scope("interpolate_scale", values=[interpolate_weight]):
+  deg = grid.shape.with_rank_at_least(1)[-1].value
+  if deg is None:
+    raise ValueError("Num quadrature grid points must be known prior "
+                     "to graph execution.")
+  with ops.name_scope("interpolate_scale", values=[grid]):
     return [linop_add_lib.add_operators([
-        linop_scale(interpolate_weight[..., k], scale[0]),
-        linop_scale(1. - interpolate_weight[..., k], scale[1]),
-    ])[0] for k in range(deg)]
+        linop_scale(grid[..., k, q], s)
+        for k, s in enumerate(scale)
+    ])[0] for q in range(deg)]
 
 
 def linop_scale(w, op):
@@ -791,39 +926,12 @@ def linop_scale(w, op):
 
 def concat_vectors(*args):
   """Concatenates input vectors, statically if possible."""
-  args_ = [static_value(x) for x in args]
+  args_ = [distribution_util.static_value(x) for x in args]
   if any(vec is None for vec in args_):
     return array_ops.concat(args, axis=0)
   return [val for vec in args_ for val in vec]
 
 
-def reduce_prod(x):
-  """Same as `math_ops.reduce_prod` but statically if possible."""
-  x_ = static_value(x)
-  if x_ is not None:
-    return np.prod(x_, dtype=x.dtype.as_numpy_dtype)
-  return array_ops.reduce_prod(x)
-
-
-def ndims_from_shape(shape):
-  """Returns `Tensor`'s `rank` implied by a `Tensor` shape."""
-  if shape.shape.ndims not in (None, 1):
-    raise ValueError("input is not a valid shape: not 1D")
-  if not shape.dtype.is_integer:
-    raise TypeError("input is not a valid shape: wrong dtype")
-  if shape.shape.is_fully_defined():
-    return shape.shape.as_list()[0]
-  return array_ops.shape(shape)[0]
-
-
-def ndims(x):
-  """Returns rank, statically if possible."""
-  x = ops.convert_to_tensor(x)
-  if x.shape.ndims is not None:
-    return x.shape.ndims
-  return array_ops.rank(x)
-
-
 def add(x, y):
   """Adds inputs; interprets `None` as zero."""
   if x is None:
@@ -836,3 +944,18 @@ def add(x, y):
 def vec_osquare(x):
   """Computes the outer-product of a (batch of) vector, i.e., x.T x."""
   return x[..., :, array_ops.newaxis] * x[..., array_ops.newaxis, :]
+
+
+def softmax(x, axis, name=None):
+  """Equivalent to tf.nn.softmax but works around b/70297725."""
+  with ops.name_scope(name, "softmax", [x, axis]):
+    x = ops.convert_to_tensor(x, name="x")
+    ndims = (x.shape.ndims if x.shape.ndims is not None
+             else array_ops.rank(x, name="ndims"))
+    axis = ops.convert_to_tensor(axis, dtype=dtypes.int32, name="axis")
+    axis_ = tensor_util.constant_value(axis)
+    if axis_ is not None:
+      axis = np.int(ndims + axis_ if axis_ < 0 else axis_)
+    else:
+      axis = array_ops.where(axis < 0, ndims + axis, axis)
+  return nn_ops.softmax(x, axis=axis)
diff --git a/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py b/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
index 356d78b67a8107750f68f7f84d73d1231f5b2b03..526fe2d39aef9aed833b889de80e849c469435e7 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
@@ -89,14 +89,13 @@ class VectorExponentialDiag(
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
-  la = tf.linalg
+  tfd = tf.contrib.distributions
 
   # Initialize a single 2-variate VectorExponential, supported on
   # {(x, y) in R^2 : x > 0, y > 0}.
 
   # The first component has pdf exp{-x}, the second 0.5 exp{-x / 2}
-  vex = ds.VectorExponentialDiag(scale_diag=[1., 2.])
+  vex = tfd.VectorExponentialDiag(scale_diag=[1., 2.])
 
   # Compute the pdf of an`R^2` observation; return a scalar.
   vex.prob([3., 4.]).eval()  # shape: []
@@ -107,7 +106,7 @@ class VectorExponentialDiag(
   scale_diag = [[1., 2, 3],
                 [0.5, 1, 1.5]]     # shape: [2, 3]
 
-  vex = ds.VectorExponentialDiag(loc, scale_diag)
+  vex = tfd.VectorExponentialDiag(loc, scale_diag)
 
   # Compute the pdf of two `R^3` observations; return a length-2 vector.
   x = [[1.9, 2.2, 3.1],
diff --git a/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py b/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
index b313a851b381e5b3a057fd17e6c2ef4eb0fc34f1..9d5fd9ac4178a1ae29b1ce32f304b22fd3d234dc 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
@@ -107,16 +107,15 @@ class VectorExponentialLinearOperator(
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
-  la = tf.linalg
+  tfd = tf.contrib.distributions
 
   # Initialize a single 2-variate VectorExponential, supported on
   # {(x, y) in R^2 : x > 0, y > 0}.
   mat = [[1.0, 0.1],
          [0.1, 1.0]]
 
-  vex = ds.VectorExponentialLinearOperator(
-      scale=la.LinearOperatorFullMatrix(mat))
+  vex = tfd.VectorExponentialLinearOperator(
+      scale=tf.linalg.LinearOperatorFullMatrix(mat))
 
   # Compute the pdf of an`R^2` observation; return a scalar.
   vex.prob([1., 2.]).eval()  # shape: []
@@ -127,9 +126,9 @@ class VectorExponentialLinearOperator(
   scale_diag = [[1., 2, 3],
                 [0.5, 1, 1.5]]     # shape: [2, 3]
 
-  vex = ds.VectorExponentialLinearOperator(
+  vex = tfd.VectorExponentialLinearOperator(
       loc=mu,
-      scale=la.LinearOperatorDiag(scale_diag))
+      scale=tf.linalg.LinearOperatorDiag(scale_diag))
 
   # Compute the pdf of two `R^3` observations; return a length-2 vector.
   x = [[1.9, 2.2, 3.1],
diff --git a/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py b/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py
index 0e3867809a820f49cfa7f5282c47f786626481a6..8dd983b750d9b39775e570800006011f4968f7f3 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py
@@ -101,10 +101,10 @@ class VectorLaplaceDiag(
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   # Initialize a single 2-variate VectorLaplace.
-  vla = ds.VectorLaplaceDiag(
+  vla = tfd.VectorLaplaceDiag(
       loc=[1., -1],
       scale_diag=[1, 2.])
 
@@ -118,7 +118,7 @@ class VectorLaplaceDiag(
   vla.prob([-1., 0]).eval()  # shape: []
 
   # Initialize a 3-batch, 2-variate scaled-identity VectorLaplace.
-  vla = ds.VectorLaplaceDiag(
+  vla = tfd.VectorLaplaceDiag(
       loc=[1., -1],
       scale_identity_multiplier=[1, 2., 3])
 
@@ -136,7 +136,7 @@ class VectorLaplaceDiag(
   vla.prob([-1., 0]).eval()  # shape: [3]
 
   # Initialize a 2-batch of 3-variate VectorLaplace's.
-  vla = ds.VectorLaplaceDiag(
+  vla = tfd.VectorLaplaceDiag(
       loc=[[1., 2, 3],
            [11, 22, 33]]           # shape: [2, 3]
       scale_diag=[[1., 2, 3],
diff --git a/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py b/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
index c7abdbb4caf9bee4cbd5991eb5d652f20dd0f8d1..ec485c95c15da2794b67d2699d2bdd9db97bb6c4 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
@@ -109,8 +109,7 @@ class VectorLaplaceLinearOperator(
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
-  la = tf.linalg
+  tfd = tf.contrib.distributions
 
   # Initialize a single 3-variate VectorLaplace with some desired covariance.
   mu = [1., 2, 3]
@@ -124,9 +123,9 @@ class VectorLaplaceLinearOperator(
   #      [ 0.1, -0.3,  0.4]])
 
   # Divide scale by sqrt(2) so that the final covariance will be what we want.
-  vla = ds.VectorLaplaceLinearOperator(
+  vla = tfd.VectorLaplaceLinearOperator(
       loc=mu,
-      scale=la.LinearOperatorLowerTriangular(scale / tf.sqrt(2)))
+      scale=tf.linalg.LinearOperatorLowerTriangular(scale / tf.sqrt(2.)))
 
   # Covariance agrees with cholesky(cov) parameterization.
   vla.covariance().eval()
@@ -143,9 +142,9 @@ class VectorLaplaceLinearOperator(
   scale_diag = [[1., 2, 3],
                 [0.5, 1, 1.5]]     # shape: [2, 3]
 
-  vla = ds.VectorLaplaceLinearOperator(
+  vla = tfd.VectorLaplaceLinearOperator(
       loc=mu,
-      scale=la.LinearOperatorDiag(scale_diag))
+      scale=tf.linalg.LinearOperatorDiag(scale_diag))
 
   # Compute the pdf of two `R^3` observations; return a length-2 vector.
   x = [[-0.9, 0, 0.1],
diff --git a/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py b/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
index 544a8710709a0afb56c6ae6f36d35de892e8e420..e1ccf116457a97261b9ce3965552764771d3bdd2 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
@@ -143,7 +143,7 @@ class VectorSinhArcsinhDiag(transformed_distribution.TransformedDistribution):
         broadcastable with `event_shape`.
       distribution: `tf.Distribution`-like instance. Distribution from which `k`
         iid samples are used as input to transformation `F`.  Default is
-        `ds.Normal(0., 1.)`.
+        `tf.distributions.Normal(loc=0., scale=1.)`.
         Must be a scalar-batch, scalar-event distribution.  Typically
         `distribution.reparameterization_type = FULLY_REPARAMETERIZED` or it is
         a function of non-trainable parameters. WARNING: If you backprop through
diff --git a/tensorflow/contrib/distributions/python/ops/vector_student_t.py b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
index 29d41ab81c62d621c3c3533e1449341e9a085645..8c67647a618d22a58428d78865c4ebf7d98bdf9e 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_student_t.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
@@ -91,14 +91,14 @@ class _VectorStudentT(transformed_distribution.TransformedDistribution):
   Extra leading dimensions, if provided, allow for batches.
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   # Initialize a single 3-variate vector Student's t-distribution.
   mu = [1., 2, 3]
   chol = [[1., 0, 0.],
           [1, 3, 0],
           [1, 2, 3]]
-  vt = ds.VectorStudentT(df=2, loc=mu, scale_tril=chol)
+  vt = tfd.VectorStudentT(df=2, loc=mu, scale_tril=chol)
 
   # Evaluate this on an observation in R^3, returning a scalar.
   vt.prob([-1., 0, 1])
@@ -107,7 +107,7 @@ class _VectorStudentT(transformed_distribution.TransformedDistribution):
   mu = [[1., 2, 3],
         [11, 22, 33]]
   chol = ...  # shape 2 x 3 x 3, lower triangular, positive diagonal.
-  vt = ds.VectorStudentT(loc=mu, scale_tril=chol)
+  vt = tfd.VectorStudentT(loc=mu, scale_tril=chol)
 
   # Evaluate this on a two observations, each in R^3, returning a length two
   # tensor.
diff --git a/tensorflow/contrib/eager/README.md b/tensorflow/contrib/eager/README.md
index dcc370cd00d5f93cd5b145a31fd58ef5041a86a8..9d2ca07c3a25fa7acb9b0f5806b763d9a57b51fa 100644
--- a/tensorflow/contrib/eager/README.md
+++ b/tensorflow/contrib/eager/README.md
@@ -41,28 +41,8 @@ support for distributed and multi-GPU training and CPU performance.
 
 ## Installation
 
-Since eager execution is not yet part of a TensorFlow release, using it requires
-either [building from source](https://www.tensorflow.org/install/install_sources)
-or the latest nightly builds. The nightly builds are available as:
-
-- [`pip` packages](https://github.com/tensorflow/tensorflow/blob/master/README.md#installation) and
-
-- [docker](https://hub.docker.com/r/tensorflow/tensorflow/) images.
-
-For example, to run the latest nightly docker image:
-
-```sh
-# If you have a GPU, use https://github.com/NVIDIA/nvidia-docker
-nvidia-docker pull tensorflow/tensorflow:nightly-gpu
-nvidia-docker run -it -p 8888:8888 tensorflow/tensorflow:nightly-gpu
-
-# If you do not have a GPU, use the CPU-only image
-docker pull tensorflow/tensorflow:nightly
-docker run -it -p 8888:8888 tensorflow/tensorflow:nightly
-```
-
-And then visit http://localhost:8888 in your browser for a Jupyter notebook
-environment. Try out the notebooks below.
+Eager execution is included in TensorFlow versions 1.5 and above.
+Installation instructions at https://www.tensorflow.org/install/
 
 ## Documentation
 
@@ -76,3 +56,6 @@ For an introduction to eager execution in TensorFlow, see:
 ## Changelog
 
 - 2017/10/31: Initial preview release.
+- 2017/12/01: Example of dynamic neural network:
+  [SPINN: Stack-augmented Parser-Interpreter Neural Network](https://arxiv.org/abs/1603.06021).
+  See [README.md](python/examples/spinn/README.md) for details.
diff --git a/tensorflow/contrib/eager/proto/BUILD b/tensorflow/contrib/eager/proto/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..aedfec8924e7314addd22349c0576a84a58d9aa3
--- /dev/null
+++ b/tensorflow/contrib/eager/proto/BUILD
@@ -0,0 +1,24 @@
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+tf_proto_library(
+    name = "checkpointable_object_graph_proto",
+    srcs = [
+        "checkpointable_object_graph.proto",
+    ],
+    visibility = ["//tensorflow/contrib/eager/python:__subpackages__"],
+)
diff --git a/tensorflow/contrib/eager/proto/checkpointable_object_graph.proto b/tensorflow/contrib/eager/proto/checkpointable_object_graph.proto
new file mode 100644
index 0000000000000000000000000000000000000000..4f71aec96a2c3edee8a32b4e14584bd56ef3d439
--- /dev/null
+++ b/tensorflow/contrib/eager/proto/checkpointable_object_graph.proto
@@ -0,0 +1,57 @@
+syntax = "proto3";
+
+option cc_enable_arenas = true;
+
+package tensorflow.contrib.eager;
+
+// Prototype for an addition to BundleHeaderProto which saves extra information
+// about the objects which own variables, allowing for more robust checkpoint
+// loading into modified programs.
+
+message CheckpointableObjectGraph {
+  message Object {
+    message ObjectReference {
+      // An index into `CheckpointableObjectGraph.nodes`, indicating the object
+      // being referenced.
+      int32 node_id = 1;
+      // A user-provided name for the edge.
+      string local_name = 2;
+    }
+
+    message VariableReference {
+      // A name for the variable which is unique within the object which owns
+      // it. Does not include a name_scope or variable_scope prefix.
+      string local_name = 1;
+      // The full name of the variable. Used to allow name-based loading of
+      // checkpoints which were saved using an object-based API.
+      string full_name = 2;
+      // The generated name of the variable in the checkpoint.
+      string checkpoint_key = 3;
+    }
+
+    message SlotVariableReference {
+      // An index into `CheckpointableObjectGraph.nodes`, indicating the object
+      // which created the variable that this variable is slotting for.
+      int32 original_variable_node_id = 1;
+      // The local name of the variable being slotted for within the object that
+      // owns it.
+      string original_variable_local_name = 2;
+      // The name of the slot (e.g. "m"/"v").
+      string slot_name = 3;
+      // The full name of the slot variable. Used to allow name-based loading of
+      // checkpoints which were saved using an object-based API.
+      string full_name = 4;
+      // The generated name of the variable in the checkpoint.
+      string checkpoint_key = 5;
+    }
+
+    // Objects which this object depends on.
+    repeated ObjectReference children = 1;
+    // Non-slot variables owned by this object.
+    repeated VariableReference variables = 2;
+    // Slot variables owned by this object.
+    repeated SlotVariableReference slot_variables = 3;
+  }
+
+  repeated Object nodes = 1;
+}
diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index bf2e883bc53c3281ef89d1200f5a089305ef3e72..cfb38a1d26c41a3923da7c989244a3d53b6a496b 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -19,6 +19,8 @@ py_library(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:numerics",
         "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:template",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:backprop",
@@ -50,7 +52,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/contrib/data/python/ops:prefetching_py",
+        "//tensorflow/contrib/data/python/ops:prefetching_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:errors",
@@ -67,6 +69,7 @@ cuda_py_test(
     srcs = ["datasets_test.py"],
     additional_deps = [
         ":datasets",
+        "//tensorflow/contrib/lookup:lookup_py",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
@@ -103,37 +106,6 @@ cuda_py_test(
     ],
 )
 
-py_library(
-    name = "summary_writer",
-    srcs = ["summary_writer.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/summary:gen_summary_ops",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:summary_op_util",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/eager:context",
-    ],
-)
-
-cuda_py_test(
-    name = "summary_writer_test",
-    srcs = ["summary_writer_test.py"],
-    additional_deps = [
-        ":summary_writer",
-        "//third_party/py/numpy",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:test",
-    ],
-)
-
 py_library(
     name = "metrics",
     srcs = [
@@ -232,6 +204,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":network",
+        "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
@@ -246,6 +219,51 @@ py_test(
     ],
 )
 
+py_library(
+    name = "checkpointable",
+    srcs = ["checkpointable.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/contrib/eager/proto:checkpointable_object_graph_proto_py",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+py_test(
+    name = "checkpointable_test",
+    srcs = ["checkpointable_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":checkpointable",
+        ":network",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:layers_base",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+        "@six_archive//:six",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/eager/python/checkpointable.py b/tensorflow/contrib/eager/python/checkpointable.py
new file mode 100644
index 0000000000000000000000000000000000000000..896b38a7348e1fdd5a13b197e3ee34f5c4c5a22c
--- /dev/null
+++ b/tensorflow/contrib/eager/python/checkpointable.py
@@ -0,0 +1,773 @@
+"""An object-local variable management scheme."""
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import re
+import weakref
+
+from tensorflow.contrib.eager.proto import checkpointable_object_graph_pb2
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import io_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import optimizer as optimizer_lib
+from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.training import slot_creator
+from tensorflow.python.training import training
+
+_CheckpointableReference = collections.namedtuple(
+    "_CheckpointableReference",
+    [
+        # The local name if explicitly specified, else None.
+        "name",
+        # The Checkpointable object being referenced.
+        "ref"
+    ])
+
+# Validation regular expression for the local names of Checkpointable
+# objects. In particular, disallows "/" in names, and reserves dash-prefixed
+# names (which are not valid Python identifiers, so we're not restricting the
+# __setattr__ syntax that way).
+_VALID_LOCAL_NAME = re.compile(r"^[A-Za-z0-9_.][A-Za-z0-9_.-]*$")
+
+# Keyword for identifying that the next bit of a checkpoint variable name is a
+# slot name. May not be the local name of a checkpointable. Checkpoint names for
+# slot variables look like:
+#
+#   <path to variable>/<_OPTIMIZER_SLOTS_NAME>/<path to optimizer>/<slot name>
+#
+# Where <path to variable> is a full path from the checkpoint root to the
+# variable being slotted for.
+_OPTIMIZER_SLOTS_NAME = "-OPTIMIZER_SLOT"
+
+
+def _assign_existing_variable(variable_to_restore, value_pointer):
+  """Set a variable from a _ValuePointer object."""
+  base_type = variable_to_restore.dtype.base_dtype
+  with ops.colocate_with(variable_to_restore):
+    # TODO(allenl): Handle partitioned variables
+    value_to_restore, = io_ops.restore_v2(
+        prefix=value_pointer.save_path,
+        tensor_names=[value_pointer.checkpoint_key],
+        shape_and_slices=[""],
+        dtypes=[base_type],
+        name="checkpoint_initializer")
+    initializer_op = state_ops.assign(variable_to_restore, value_to_restore)
+    variable_to_restore._initializer_op = initializer_op  # pylint:disable=protected-access
+    if value_pointer.session is not None:
+      value_pointer.session.run(initializer_op)
+
+
+def _default_getter(name, shape, dtype, initializer=None,
+                    partition_info=None, **kwargs):
+  """A pared-down version of get_variable which does not reuse variables."""
+  dtype = dtypes.as_dtype(dtype)
+  shape_object = tensor_shape.as_shape(shape)
+  with ops.init_scope():
+    if initializer is None:
+      initializer, initializing_from_value = (
+          variable_scope._get_default_variable_store()._get_default_initializer(  # pylint: disable=protected-access
+              name=name, shape=shape_object, dtype=dtype))
+    else:
+      initializing_from_value = not callable(initializer)
+    # Same logic as get_variable
+    if initializing_from_value:
+      if shape is not None:
+        raise ValueError("If initializer is a constant, do not specify shape.")
+      initial_value = initializer
+      variable_dtype = None
+    else:
+      # Instantiate initializer if provided initializer is a type object.
+      if isinstance(initializer, type(init_ops.Initializer)):
+        initializer = initializer(dtype=dtype)
+      def initial_value():
+        return initializer(
+            shape_object.as_list(), dtype=dtype, partition_info=partition_info)
+      variable_dtype = dtype.base_dtype
+    return resource_variable_ops.ResourceVariable(
+        initial_value=initial_value,
+        name=name,
+        dtype=variable_dtype,
+        **kwargs
+    )
+
+
+class Checkpointable(object):
+  """Manages variables and dependencies on other objects.
+
+  To make reliable checkpoints, all `Checkpointable`s on which this object
+  depends must be registered in the constructor using `track_checkpointable` in
+  a deterministic order, and if possible they should be named. Variables may be
+  created using `add_variable` outside of the constructor and in any order, but
+  only these variables will be saved.
+  """
+
+  def __init__(self):
+    # A list of _CheckpointableReference objects.
+    self._checkpoint_dependencies = []
+    # Maps names -> Checkpointable objects for named dependencies
+    self._dependency_names = {}
+    # Set of all tracked Checkpointables
+    self._already_tracked = set()
+    self._owned_variables = {}  # local name -> variable object
+    self._deferred_restorations = {}  # local name -> _VariableRestoration
+                                      # object
+
+  def __setattr__(self, name, value):
+    """Support self.foo = checkpointable syntax.
+
+    `self.foo = checkpointable` is equivalent to
+    `self.foo = self.track_checkpointable(checkpointable, name='foo')`.
+
+    No new tracking if `value` is not a `Checkpointable`, or if `value` is
+    already being tracked (either because of an explicit `track_checkpointable`
+    or a previous `__setattr__`).
+
+    Args:
+      name: The name of the property being set.
+      value: The new value for the property.
+    """
+    # Give child classes (e.g. Network) priority, then track only if the object
+    # hasn't been added to _already_tracked.
+    super(Checkpointable, self).__setattr__(name, value)
+    if (isinstance(value, Checkpointable)
+        and value not in self._already_tracked):
+      self.track_checkpointable(value, name=name)
+
+  def add_variable(self, name, shape=None, dtype=dtypes.float32,
+                   initializer=None, **kwargs):
+    """Create a new variable object to be saved with this `Checkpointable`.
+
+    If the user has requested that this object or another `Checkpointable` which
+    depends on this object be restored from a checkpoint (deferred loading
+    before variable object creation), `initializer` may be ignored and the value
+    from the checkpoint used instead.
+
+    Args:
+      name: A name for the variable. Must be unique within this object.
+      shape: The shape of the variable.
+      dtype: The data type of the variable.
+      initializer: The initializer to use. Ignored if deferred loading has been
+        requested.
+      **kwargs: Passed to the ResourceVariable constructor.
+
+    Returns:
+      The new variable object.
+
+    Raises:
+      ValueError: If the variable name is not unique.
+      RuntimeError: If __init__ has not been called.
+    """
+    if not hasattr(self, "_owned_variables"):
+      raise RuntimeError("Need to call Checkpointable.__init__ before adding "
+                         "variables.")
+    if name in self._owned_variables:
+      raise ValueError(
+          ("A variable named '%s' already exists in this Checkpointable, but "
+           "Checkpointable.add_variable called to create another with "
+           "that name. Variable names must be unique within a Checkpointable "
+           "object.") % (name,))
+    if "getter" in kwargs:
+      # Allow the getter to be overridden, typically because there is a need for
+      # compatibility with some other variable creation mechanism. This should
+      # be relatively uncommon in user code.
+      getter = kwargs.pop("getter")
+    else:
+      getter = _default_getter
+    deferred_restoration = self._deferred_restorations.pop(name, None)
+    if deferred_restoration is not None:
+      dtype = deferred_restoration.value_pointer.dtype
+      base_type = dtype.base_dtype
+      # TODO(allenl): Handle partitioned variables here too
+      with ops.init_scope():
+        initializer, = io_ops.restore_v2(
+            prefix=deferred_restoration.value_pointer.save_path,
+            tensor_names=[deferred_restoration.value_pointer.checkpoint_key],
+            shape_and_slices=[""],
+            dtypes=[base_type],
+            name="checkpoint_initializer")
+      # We need to un-set the shape so get_variable doesn't complain, but we
+      # also need to set the static shape information on the initializer if
+      # possible so we don't get a variable with an unknown shape.
+      initializer.set_shape(shape)
+      # Un-set shape since we're using a constant initializer
+      shape = None
+
+    new_variable = getter(
+        name=name, shape=shape, dtype=dtype, initializer=initializer, **kwargs)
+    if deferred_restoration is not None:
+      if deferred_restoration.value_pointer.session is not None:
+        deferred_restoration.value_pointer.session.run(new_variable.initializer)
+      for slot_restoration in deferred_restoration.slot_restorations:
+        strong_ref = slot_restoration.optimizer_ref()
+        if strong_ref is None:
+          # If the optimizer object has been garbage collected, there's no need
+          # to create the slot variable.
+          continue
+        strong_ref._process_slot_restoration(  # pylint: disable=protected-access
+            slot_restoration, new_variable)
+    self._owned_variables[name] = new_variable
+    return new_variable
+
+  def track_checkpointable(self, checkpointable, name):
+    """Declare a dependency on another `Checkpointable` object.
+
+    Indicates that checkpoints for this object should include variables from
+    `checkpointable`.
+
+    Variables in a checkpoint are mapped to `Checkpointable`s based on names. To
+    avoid breaking existing checkpoints when modifying a class, neither variable
+    names nor dependency names (the names passed to `track_checkpointable`) may
+    change.
+
+    Args:
+      checkpointable: A `Checkpointable` which this object depends on.
+      name: A local name for `checkpointable`, used for loading checkpoints into
+        the correct objects. Python 2 identifiers are valid names, with the
+        addition of leading numerals, periods anywhere, and non-leading dashes.
+        Specifically names must match the regular expression
+        `^[A-Za-z0-9_.][A-Za-z0-9_.-]*$`.
+
+    Returns:
+      `checkpointable`, for convenience when declaring a dependency and
+      assigning to a member variable in one statement.
+
+    Raises:
+      RuntimeError: If __init__ was not called.
+      TypeError: If `checkpointable` does not inherit from `Checkpointable`.
+      ValueError: For invalid names.
+    """
+    if not hasattr(self, "_checkpoint_dependencies"):
+      raise RuntimeError("Need to call Checkpointable.__init__ before calling "
+                         "Checkpointable.track_checkpointable().")
+    if not isinstance(checkpointable, Checkpointable):
+      raise TypeError(
+          ("Checkpointable.track_checkpointable() passed type %s, not a "
+           "Checkpointable.") % (type(checkpointable),))
+    if not _VALID_LOCAL_NAME.match(name):
+      raise ValueError(
+          ("Checkpointable names must match the regular expression '%s', but "
+           "got an invalid name '%s' instead.") % (_VALID_LOCAL_NAME.pattern,
+                                                   name))
+    if (name in self._dependency_names
+        and self._dependency_names[name] is not checkpointable):
+      raise ValueError(
+          ("Called Checkpointable.track_checkpointable() with name='%s', but "
+           "a Checkpointable with this name is already declared as a "
+           "dependency. Names must be unique.") % (name,))
+    self._dependency_names[name] = checkpointable
+    self._checkpoint_dependencies.append(
+        _CheckpointableReference(name=name, ref=checkpointable))
+    self._already_tracked.add(checkpointable)
+    return checkpointable
+
+  def _process_restoration(self, restoration):
+    """Restore a variable and its slot variables (may be deferred)."""
+    variable_to_restore = self._owned_variables.get(restoration.name, None)
+    if variable_to_restore is not None:
+      # This variable already exists, so just do an assignment for this and any
+      # slot variables which depend on it.
+      _assign_existing_variable(
+          variable_to_restore, value_pointer=restoration.value_pointer)
+      for slot_restoration in restoration.slot_restorations:
+        strong_ref = slot_restoration.optimizer_ref()
+        if strong_ref is None:
+          continue
+        strong_ref._process_slot_restoration(  # pylint: disable=protected-access
+            slot_restoration, variable_to_restore)
+    else:
+      # Save this restoration for later. This intentionally overwrites any
+      # previous deferred restorations, since that gives the same semantics as
+      # direct assignment.
+      self._deferred_restorations[restoration.name] = restoration
+
+  def _process_slot_restoration(self, slot_restoration, variable):
+    """Restore a slot variable's value (creating it if necessary)."""
+    # TODO(allenl): Move this to Optimizer
+    assert isinstance(self, optimizer_lib.Optimizer)
+    named_slots = self._slot_dict(slot_restoration.slot_name)
+    variable_key = optimizer_lib._var_key(variable)  # pylint: disable=protected-access
+    existing_slot_variable = named_slots.get(variable_key, None)
+    if existing_slot_variable is None:
+      base_dtype = slot_restoration.value_pointer.dtype.base_dtype
+      initializer, = io_ops.restore_v2(
+          prefix=slot_restoration.value_pointer.save_path,
+          tensor_names=[slot_restoration.value_pointer.checkpoint_key],
+          shape_and_slices=[""],
+          dtypes=[base_dtype],
+          name="checkpoint_initializer")
+      new_slot_variable = slot_creator.create_slot(variable, initializer,
+                                                   slot_restoration.slot_name)
+      if slot_restoration.value_pointer.session is not None:
+        slot_restoration.value_pointer.session.run(
+            new_slot_variable.initializer)
+      named_slots[variable_key] = new_slot_variable
+    else:
+      _assign_existing_variable(
+          existing_slot_variable, value_pointer=slot_restoration.value_pointer)
+
+  @property
+  def checkpoint_dependencies(self):
+    """Other `Checkpointable` objects on which this object depends."""
+    return self._checkpoint_dependencies
+
+
+def _breadth_first_checkpointable_traversal(root_checkpointable):
+  """Find shortest paths to all variables owned by dependencies of root."""
+  bfs_sorted = []
+  root_checkpointable_reference = _CheckpointableReference(
+      name=None, ref=root_checkpointable)
+  to_visit = collections.deque([root_checkpointable_reference])
+  path_to_root = {root_checkpointable_reference: ()}
+  while to_visit:
+    current_checkpointable = to_visit.popleft()
+    bfs_sorted.append(current_checkpointable)
+    for child_checkpointable in (
+        current_checkpointable.ref.checkpoint_dependencies):
+      if child_checkpointable not in path_to_root:
+        path_to_root[child_checkpointable] = (
+            path_to_root[current_checkpointable] + (child_checkpointable,))
+        to_visit.append(child_checkpointable)
+  return bfs_sorted, path_to_root
+
+
+def _object_prefix_from_path(path_to_root):
+  return "/".join(
+      (checkpointable.name for checkpointable in path_to_root))
+
+
+def _escape_variable_name(variable_name):
+  # We need to support slashes in variable names for compatibility, since this
+  # naming scheme is being patched in to things like Layer.add_variable where
+  # slashes were previously accepted. We also want to use slashes to indicate
+  # edges traversed to reach the variable, so we escape forward slashes in
+  # variable names.
+  return variable_name.replace("_S_", "_S_.").replace(r"/", r"_S__")
+
+
+def _variable_naming_for_object(path_to_root):
+  """Make a function for naming variables in an object."""
+  # Name non-slot variables:
+  #
+  #   <path to node>/<local variable name>
+  #
+  # <path to node> is not necessarily unique, but this is fine since we also
+  # save the graph of `Checkpointable`s with the checkpoint. Even if this path
+  # no longer exists because of a change in the Python program, we can look up
+  # the `Checkpointable` which owns the variable in the checkpoint's graph and
+  # use another path if one still exists.
+
+  object_prefix = _object_prefix_from_path(path_to_root)
+  if object_prefix:
+    object_prefix += "/"
+
+  def _name_single_variable(local_name):
+    """Names a variable within an object."""
+    return object_prefix + _escape_variable_name(local_name)
+
+  return _name_single_variable
+
+
+def _slot_variable_naming_for_optimizer(optimizer, path_to_root):
+  """Make a function for naming slot variables in an optimizer."""
+  # Name slot variables:
+  #
+  #   <variable name>/<_OPTIMIZER_SLOTS_NAME>/<optimizer path>/<slot name>
+  #
+  # where <variable name> is exactly the checkpoint name used for the original
+  # variable, including the path from the checkpoint root and the local name in
+  # the object which owns it. Note that we only save slot variables if the
+  # variable it's slotting for is also being saved.
+
+  optimizer_identifier = "/%s/%s/" % (_OPTIMIZER_SLOTS_NAME,
+                                      _object_prefix_from_path(path_to_root))
+
+  def _name_slot_variable(variable_path, slot_name):
+    """With an optimizer specified, name a slot variable."""
+
+    if not _VALID_LOCAL_NAME.match(slot_name):
+      # Slot variable names include the name of the slot. We need to
+      # validate that part of the name to be sure that the checkpoint name
+      # is a valid name scope name.
+      raise ValueError(
+          ("Could not save slot variables for optimizer %s, because its "
+           "slot name has invalid characters (got '%s', was expecting it "
+           "to match the regular expression '%s').") %
+          (optimizer, slot_name, _VALID_LOCAL_NAME.pattern))
+
+    return variable_path + optimizer_identifier + slot_name
+
+  return _name_slot_variable
+
+
+def _serialize_non_slot_variables(checkpointable_objects, path_to_root,
+                                  object_graph_proto):
+  """Name non-slot variables and add them to `object_graph_proto`."""
+  named_variables = {}
+  non_slot_variables = []
+  checkpoint_node_ids = {}
+
+  for checkpoint_id, checkpointable in enumerate(checkpointable_objects):
+    checkpoint_node_ids[checkpointable] = checkpoint_id
+
+  for checkpoint_id, checkpointable in enumerate(checkpointable_objects):
+    naming_scheme = _variable_naming_for_object(path_to_root[checkpointable])
+    object_proto = object_graph_proto.nodes.add()
+    for (local_name, owned_variable) in sorted(
+        checkpointable.ref._owned_variables.items(),  # pylint: disable=protected-access
+        key=lambda x: x[0]):
+      variable_name = naming_scheme(local_name)
+      named_variables[variable_name] = owned_variable
+      non_slot_variables.append((
+          variable_name,  # The variable's full checkpoint name
+          owned_variable,  # The variable object
+          local_name,  # The variable's local name
+          checkpoint_id))  # The checkpoint ID of the node which owns this
+      # variable.
+      variable_proto = object_proto.variables.add()
+      variable_proto.local_name = local_name
+      variable_proto.checkpoint_key = variable_name
+      # Figure out the name-based Saver's name for this variable.
+      saver_dict = saver_lib.BaseSaverBuilder.OpListToDict(
+          [owned_variable], convert_variable_to_tensor=False)
+      variable_full_name, = saver_dict.keys()
+      variable_proto.full_name = variable_full_name
+
+    for child in checkpointable.ref.checkpoint_dependencies:
+      child_proto = object_proto.children.add()
+      child_proto.node_id = checkpoint_node_ids[child]
+      child_proto.local_name = child.name
+  return named_variables, non_slot_variables
+
+
+def _serialize_slot_variables(checkpointable_objects, path_to_root,
+                              non_slot_variables, object_graph_proto):
+  """Name slot variables and add them to `object_graph_proto`."""
+  named_slot_variables = {}
+  for optimizer_checkpoint_id, checkpointable_ref in enumerate(
+      checkpointable_objects):
+    if isinstance(checkpointable_ref.ref, optimizer_lib.Optimizer):
+      optimizer_object_proto = object_graph_proto.nodes[optimizer_checkpoint_id]
+      naming_scheme = _slot_variable_naming_for_optimizer(
+          optimizer=checkpointable_ref.ref,
+          path_to_root=path_to_root[checkpointable_ref])
+      slot_names = checkpointable_ref.ref.get_slot_names()
+      for (variable_path, original_variable, original_variable_local_name,
+           original_node_checkpoint_id) in non_slot_variables:
+        for slot_name in slot_names:
+          slot_variable = checkpointable_ref.ref.get_slot(
+              original_variable, slot_name)
+          if slot_variable is not None:
+            checkpoint_name = naming_scheme(
+                variable_path=variable_path, slot_name=slot_name)
+            named_slot_variables[checkpoint_name] = slot_variable
+            slot_variable_proto = optimizer_object_proto.slot_variables.add()
+            slot_variable_proto.slot_name = slot_name
+            slot_variable_proto.checkpoint_key = checkpoint_name
+            # Figure out the name-based Saver's name for this variable.
+            saver_dict = saver_lib.BaseSaverBuilder.OpListToDict(
+                [slot_variable], convert_variable_to_tensor=False)
+            slot_variable_full_name, = saver_dict.keys()
+            slot_variable_proto.full_name = slot_variable_full_name
+            slot_variable_proto.original_variable_local_name = (
+                original_variable_local_name)
+            slot_variable_proto.original_variable_node_id = (
+                original_node_checkpoint_id)
+  return named_slot_variables
+
+
+# TODO(allenl): Convenience utility for saving multiple objects (i.e. construct
+# a root Checkpointable if passed a list of Checkpointables).
+def _serialize_object_graph(root_checkpointable):
+  """Determine checkpoint keys for variables and build a serialized graph.
+
+  Non-slot variables are keyed based on a shortest path from the root saveable
+  to the object which owns the variable (i.e. the one which called
+  `Checkpointable.add_variable` to create it).
+
+  Slot variables are keyed based on a shortest path to the variable being
+  slotted for, a shortest path to their optimizer, and the slot name.
+
+  Args:
+    root_checkpointable: A `Checkpointable` object whose variables (including
+      the variables of dependencies, recursively) should be saved.
+
+  Returns:
+    A tuple of (named_variables, object_graph_proto):
+      named_variables: A dictionary mapping names to variable objects.
+      object_graph_proto: A CheckpointableObjectGraph protocol buffer containing
+        the serialized object graph and variable references.
+
+  Raises:
+    ValueError: If there are invalid characters in an optimizer's slot names.
+  """
+  checkpointable_objects, path_to_root = (
+      _breadth_first_checkpointable_traversal(root_checkpointable))
+  object_graph_proto = (
+      checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+
+  # Gather non-slot variables.
+  named_variables, non_slot_variables = _serialize_non_slot_variables(
+      checkpointable_objects, path_to_root, object_graph_proto)
+
+  # Gather slot variables which are associated with variables gathered above.
+  named_slot_variables = _serialize_slot_variables(
+      checkpointable_objects, path_to_root, non_slot_variables,
+      object_graph_proto)
+
+  named_variables.update(named_slot_variables)
+  return named_variables, object_graph_proto
+
+
+def _set_reference(reference_proto_table, key, checkpointable, parent,
+                   object_id_map):
+  """Record a checkpoint<->object correspondence, with error checking.
+
+  Args:
+    reference_proto_table: Map from names or numbers to `ObjectReference` protos
+      within the parent object.
+    key: Either a numeric or string identifier for the reference.
+    checkpointable: The object to record a correspondence for.
+    parent: The parent Python object, for creating a useful error message.
+    object_id_map: The map from `node_id` to Python object in which to record
+      the reference.
+  Returns:
+    The `node_id` of the Object proto corresponding to the specified Python
+    object.
+  Raises:
+    AssertionError: If another object is already bound to the `Object` proto.
+  """
+  reference_proto = reference_proto_table[key]
+  set_reference = object_id_map.setdefault(reference_proto.node_id,
+                                           checkpointable)
+  if set_reference is not checkpointable:
+    raise AssertionError(
+        ("Unable to load the checkpoint into this object graph. Either "
+         "the Checkpointable object references in the Python program "
+         "have changed in an incompatible way, or the checkpoint was "
+         "generated in an incompatible program.\n\nTwo checkpoint "
+         "references (one being '%s' in %s) resolved to different "
+         "objects (%s and %s).") % (key, parent, set_reference,
+                                    checkpointable))
+  return reference_proto.node_id
+
+
+def _checkpoint_object_id_map(root_checkpointable, object_graph_proto):
+  """Match a checkpointed object graph to a Python object graph.
+
+  Args:
+    root_checkpointable: A Checkpointable object.
+    object_graph_proto: A CheckpointableObjectGraph protocol buffer representing
+      a serialized object graph.
+  Returns:
+    A dictionary mapping from checkpoint node ids (indices into
+    `object_graph_proto.nodes`) to `Checkpointable` objects which are
+    dependencies of `root_checkpointable`.
+  """
+  node_list = object_graph_proto.nodes
+  # Queue of (checkpointable object, node id)
+  to_visit = collections.deque([(root_checkpointable, 0)])
+  object_id_map = {0: root_checkpointable}
+  seen = set()
+  while to_visit:
+    checkpointable, node_id = to_visit.popleft()
+    object_proto = node_list[node_id]
+    named_children = {}
+    for child_reference in object_proto.children:
+      if child_reference.local_name:
+        named_children[child_reference.local_name] = child_reference
+      else:
+        raise AssertionError(
+            ("The checkpointed object graph contains a reference without "
+             "a name (corrupted?). The reference was from the node %s.")
+            % (object_proto,))
+
+    for checkpointable_reference in checkpointable._checkpoint_dependencies:  # pylint: disable=protected-access
+      child_node_id = _set_reference(
+          reference_proto_table=named_children,
+          key=checkpointable_reference.name,
+          checkpointable=checkpointable_reference.ref,
+          parent=checkpointable,
+          object_id_map=object_id_map)
+      if child_node_id not in seen:
+        seen.add(child_node_id)
+        to_visit.append((checkpointable_reference.ref, child_node_id))
+
+  return object_id_map
+
+
+_ValuePointer = collections.namedtuple(
+    "_ValuePointer",
+    [
+        # Information needed to look up the value to restore.
+        "save_path",
+        "checkpoint_key",
+        "dtype",
+        # The session to use when restoring (None when executing eagerly)
+        "session",
+    ])
+
+_SlotVariableRestoration = collections.namedtuple(
+    "_SlotVariableRestoration",
+    [
+        # A weak reference to the Optimizer object
+        "optimizer_ref",
+        # The slot name
+        "slot_name",
+        # The _ValuePointer to use when restoring
+        "value_pointer",
+    ])
+
+_VariableRestoration = collections.namedtuple(
+    "_VariableRestoration",
+    [
+        # The variable's (local) name.
+        "name",
+        # _SlotVariableRestoration objects indicating slot variables which
+        # should be created once this variable has been restored.
+        "slot_restorations",
+        # The _ValuePointer to use when restoring
+        "value_pointer",
+    ])
+
+
+def _gather_restorations(object_graph_proto, save_path, object_id_map,
+                         dtype_map, session):
+  """Iterate over variables to restore, matching with Checkpointable objects."""
+  variable_to_slot_restorations = {}
+  for node_id, node in enumerate(object_graph_proto.nodes):
+    for slot_variable in node.slot_variables:
+      original_variable_key = (slot_variable.original_variable_node_id,
+                               slot_variable.original_variable_local_name)
+      variable_to_slot_restorations.setdefault(
+          original_variable_key, []).append(
+              _SlotVariableRestoration(
+                  optimizer_ref=weakref.ref(object_id_map[node_id]),
+                  slot_name=slot_variable.slot_name,
+                  value_pointer=_ValuePointer(
+                      save_path=save_path,
+                      checkpoint_key=slot_variable.checkpoint_key,
+                      dtype=dtype_map[slot_variable.checkpoint_key],
+                      session=session)))
+
+  for node_id, node in enumerate(object_graph_proto.nodes):
+    for variable in node.variables:
+      slots_key = (node_id, variable.local_name)
+      variable_restore = _VariableRestoration(
+          name=variable.local_name,
+          slot_restorations=variable_to_slot_restorations.get(slots_key, []),
+          value_pointer=_ValuePointer(
+              save_path=save_path,
+              checkpoint_key=variable.checkpoint_key,
+              dtype=dtype_map[variable.checkpoint_key],
+              session=session))
+      yield variable_restore, object_id_map[node_id]
+
+
+def save(file_prefix, root_checkpointable, global_step=None, session=None):
+  """Save a training checkpoint.
+
+  Args:
+    file_prefix: A prefix to use for the checkpoint filenames
+      (/path/to/directory/and_a_prefix). Names are generated based on this
+      prefix and the global step, if provided.
+    root_checkpointable: A Checkpointable object to save. The checkpoint
+      includes variables created by this object and any Checkpointable objects
+      it depends on.
+    global_step: An integer variable or Tensor, used to number
+      checkpoints. Typically this value is saved along with other variables in
+      training checkpoints, which will happen automatically if it was created by
+      `root_checkpointable` or one of its dependencies (via
+      `Checkpointable.add_variable`).
+    session: The session to evaluate variables in. Ignored when executing
+      eagerly. If not provided when graph building, the default session is used.
+
+  Returns:
+    The full path to the checkpoint.
+
+    Currently also returns the serialized object graph proto, but that will go
+    away once it's saved with the checkpoint.
+  """
+  named_variables, serialized_graph = _serialize_object_graph(
+      root_checkpointable)
+  if context.in_graph_mode():
+    if session is None:
+      session = ops.get_default_session()
+  else:
+    session = None
+  with ops.device("/device:CPU:0"):
+    save_path = saver_lib.Saver(var_list=named_variables).save(
+        sess=session,
+        save_path=file_prefix,
+        write_meta_graph=False,
+        global_step=global_step)
+  # TODO(allenl): Save the graph with the checkpoint, then returning it and
+  # taking it as an argument to restore won't be necessary.
+  return serialized_graph, save_path
+
+
+# NOTE: Will be restore(file_prefix, root_checkpointable) once the object graph
+# is saved with the checkpoint.
+def restore(save_path, root_checkpointable, object_graph_proto, session=None):
+  """Restore a training checkpoint.
+
+  Restores the values of variables created with `Checkpointable.add_variable` in
+  the dependency graph of `root_checkpointable`. Either assigns values
+  immediately (if variables to restore have been created already), or defers
+  restoration until the variables are created.
+
+  When building a graph, restorations are executed in the default session if
+  `session` is `None`. Variable initializers read checkpointed values.
+
+  Args:
+    save_path: The path to the checkpoint, as returned by `save` or
+      `tf.train.latest_checkpoint`. If None (as when there is no latest
+      checkpoint for `tf.train.latest_checkpoint` to return), does nothing.
+    root_checkpointable: The root of the object graph to restore. Variables to
+      restore need not have been created yet, but all dependencies on other
+      Checkpointable objects should already be declared. Objects in the
+      dependency graph are matched to objects in the checkpointed graph, and
+      matching objects have their variables restored (or the checkpointed values
+      saved for eventual restoration when the variable is created).
+    object_graph_proto: (Temporary) the checkpointed object graph. This will
+      eventually be saved with the checkpoint, and will not be part of the final
+      API.
+    session: The session to evaluate assignment ops in. Ignored when executing
+      eagerly. If not provided when graph building, the default session is used.
+  """
+  if save_path is None:
+    return
+  object_id_map = _checkpoint_object_id_map(root_checkpointable,
+                                            object_graph_proto)
+  reader = training.NewCheckpointReader(save_path)
+  dtype_map = reader.get_variable_to_dtype_map()
+  if context.in_graph_mode():
+    if session is None:
+      session = ops.get_default_session()
+  else:
+    session = None
+  for restoration, checkpointable in _gather_restorations(
+      object_graph_proto, save_path, object_id_map, dtype_map, session=session):
+    checkpointable._process_restoration(restoration)  # pylint: disable=protected-access
+
diff --git a/tensorflow/contrib/eager/python/checkpointable_test.py b/tensorflow/contrib/eager/python/checkpointable_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7bc155decbb574ddd4b53190da3c3b3ee9b6a4e
--- /dev/null
+++ b/tensorflow/contrib/eager/python/checkpointable_test.py
@@ -0,0 +1,497 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import os
+
+import six
+
+from tensorflow.contrib.eager.python import checkpointable
+from tensorflow.contrib.eager.python import network as network_lib
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.layers import base
+from tensorflow.python.layers import core
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.training import adam
+from tensorflow.python.training import saver as core_saver
+from tensorflow.python.training import training_util
+
+
+class CheckpointableDenseLayer(core.Dense, checkpointable.Checkpointable):
+
+  def __init__(self, *args, **kwargs):
+    checkpointable.Checkpointable.__init__(self)
+    core.Dense.__init__(self, *args, **kwargs)
+
+  def add_variable(self, name, shape, **kwargs):
+    # Calls both Checkpointable.add_variable and Layer.add_variable. Eventually
+    # Layer.add_variable should inherit from Checkpointable and simply call
+    # super and then do post-processing.
+    return checkpointable.Checkpointable.add_variable(
+        self,
+        name=name,
+        shape=shape,
+        getter=functools.partial(core.Dense.add_variable, self),
+        **kwargs)
+
+
+# pylint: disable=not-callable
+class CheckpointableNetwork(network_lib.Network, checkpointable.Checkpointable):
+
+  def __init__(self):
+    network_lib.Network.__init__(self)
+    checkpointable.Checkpointable.__init__(self)
+
+  def __setattr__(self, name, value):
+    if isinstance(value, base.Layer) and value not in self._already_tracked:
+      self.track_layer(value, name=name)
+    # Checkpointable is next in the method resolution order, so this will catch
+    # Checkpointable objects which aren't Layers.
+    super(CheckpointableNetwork, self).__setattr__(name, value)
+
+  def track_layer(self, layer, name):
+    self.track_checkpointable(layer, name=name)
+    return super(CheckpointableNetwork, self).track_layer(layer)
+
+
+class CheckpointableAdam(adam.AdamOptimizer, checkpointable.Checkpointable):
+
+  def __init__(self, *args, **kwargs):
+    checkpointable.Checkpointable.__init__(self)
+    adam.AdamOptimizer.__init__(self, *args, **kwargs)
+
+  # NOTE: Copied from Optimizer with modifications to use add_variable
+  # for non-slot variables. These contortions are necessary to maintain
+  # checkpoint compatibility with variable.name based saving.
+  # TODO(allenl): Make this cleaner.
+  def _create_non_slot_variable(self, initial_value, name, colocate_with):
+    """Add an extra variable, not associated with a slot."""
+    if context.in_graph_mode():
+      graph = colocate_with.graph
+    else:
+      graph = None
+
+    key = (name, graph)
+    v = self._non_slot_dict.get(key, None)
+    if v is None:
+      with ops.colocate_with(colocate_with):
+        def _variable_getter(name, shape, dtype, initializer):
+          del shape, dtype  # not used, but there for compatibility
+          return variable_scope.variable(
+              name=name, initial_value=initializer, trainable=False)
+
+        initial_value = ops.convert_to_tensor(initial_value)
+        v = self.add_variable(
+            name=name,
+            shape=initial_value.get_shape(),
+            initializer=initial_value,
+            getter=_variable_getter)
+
+      self._non_slot_dict[key] = v
+
+    return v
+
+
+class NonLayerCheckpointable(checkpointable.Checkpointable):
+
+  def __init__(self):
+    super(NonLayerCheckpointable, self).__init__()
+    self.a_variable = self.add_variable(name="a_variable", shape=[])
+
+
+class MyNetwork(CheckpointableNetwork):
+  """A concrete Network for testing."""
+
+  def __init__(self):
+    super(MyNetwork, self).__init__()
+    self._named_dense = CheckpointableDenseLayer(1, use_bias=True)
+    self._via_track_layer = self.track_layer(
+        CheckpointableDenseLayer(1, use_bias=False), name="via_track_layer")
+    # We can still track Checkpointables which aren't Layers.
+    self._non_layer = NonLayerCheckpointable()
+
+  def call(self, values):
+    return self._via_track_layer(self._named_dense(values))
+
+
+class Root(checkpointable.Checkpointable):
+  """A stand-in for a Trainer class."""
+
+  def __init__(self, optimizer, network):
+    super(Root, self).__init__()
+    self._optimizer = optimizer
+    self._network = self.track_checkpointable(network, "network")
+    self._global_step = None
+
+  @property
+  def global_step(self):
+    if self._global_step is None:
+      # Get the default create_global_step utility to actually call
+      # self.add_variable, by setting a custom creator.
+      def _owned_variable_as_creator(
+          next_creator, initial_value, **kwargs):
+        def _creator_as_getter(initializer, **kwargs):
+          return next_creator(initial_value=initializer, **kwargs)
+        return self.add_variable(
+            getter=_creator_as_getter, initializer=initial_value, shape=[],
+            **kwargs)
+
+      with variable_scope.variable_creator_scope(
+          _owned_variable_as_creator):
+        self._global_step = training_util.create_global_step()
+    return self._global_step
+
+
+class InterfaceTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testAddVariable(self):
+    obj = NonLayerCheckpointable()
+    with self.assertRaisesRegexp(ValueError, "do not specify shape"):
+      obj.add_variable(
+          name="shape_specified_twice", shape=[], initializer=1)
+    constant_initializer = obj.add_variable(
+        name="constant_initializer", initializer=1)
+    with variable_scope.variable_scope("some_variable_scope"):
+      ones_initializer = obj.add_variable(
+          name="ones_initializer",
+          shape=[2],
+          initializer=init_ops.ones_initializer(dtype=dtypes.float32))
+    bare_initializer = obj.add_variable(
+        name="bare_initializer",
+        shape=[2, 2],
+        dtype=dtypes.float64,
+        initializer=init_ops.zeros_initializer)
+
+    # Even in graph mode, there are no naming conflicts between objects, only
+    # naming conflicts within an object.
+    other_duplicate = resource_variable_ops.ResourceVariable(
+        name="duplicate", initial_value=1.)
+    duplicate = obj.add_variable(name="duplicate", shape=[])
+    with self.assertRaisesRegexp(ValueError, "'duplicate' already exists"):
+      obj.add_variable(name="duplicate", shape=[])
+
+    if context.in_graph_mode():
+      self.evaluate(variables.global_variables_initializer())
+    self.assertEqual("constant_initializer:0", constant_initializer.name)
+    self.assertEqual(1, self.evaluate(constant_initializer))
+    self.assertEqual("some_variable_scope/ones_initializer:0",
+                     ones_initializer.name)
+    self.assertAllEqual([1, 1], self.evaluate(ones_initializer))
+    self.assertAllEqual([[0., 0.],
+                         [0., 0.]], self.evaluate(bare_initializer))
+    self.assertEqual("a_variable:0", obj.a_variable.name)
+    self.assertEqual("duplicate:0", other_duplicate.name)
+    if context.in_graph_mode():
+      # The .name attribute may be globally influenced, but the checkpoint name
+      # won't be (tested below).
+      self.assertEqual("duplicate_1:0", duplicate.name)
+    else:
+      # When executing eagerly, there's no uniquification of variable names. The
+      # checkpoint name will be the same.
+      self.assertEqual("duplicate:0", duplicate.name)
+    named_variables, _ = checkpointable._serialize_object_graph(obj)
+    expected_checkpoint_names = (
+        "a_variable",
+        "bare_initializer",
+        "constant_initializer",
+        "duplicate",
+        "ones_initializer",
+    )
+    six.assertCountEqual(
+        self, expected_checkpoint_names, named_variables.keys())
+
+  def testInitNotCalled(self):
+
+    class NoInit(checkpointable.Checkpointable):
+
+      def __init__(self):
+        pass
+
+    with self.assertRaisesRegexp(RuntimeError, "__init__"):
+      NoInit().add_variable("var", shape=[])
+
+
+class CheckpointingTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testNamingWithOptimizer(self):
+    input_value = constant_op.constant([[3.]])
+    network = MyNetwork()
+    # A nuisance Network using the same optimizer. Its slot variables should not
+    # go in the checkpoint, since it is never depended on.
+    other_network = MyNetwork()
+    optimizer = CheckpointableAdam(0.001)
+    root_checkpointable = Root(optimizer=optimizer, network=network)
+    if context.in_eager_mode():
+      optimizer.minimize(
+          lambda: network(input_value),
+          global_step=root_checkpointable.global_step)
+      optimizer.minimize(
+          lambda: other_network(input_value),
+          global_step=root_checkpointable.global_step)
+    else:
+      train_op = optimizer.minimize(
+          network(input_value), global_step=root_checkpointable.global_step)
+      optimizer.minimize(
+          other_network(input_value),
+          global_step=root_checkpointable.global_step)
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate(train_op)
+    named_variables, serialized_graph = checkpointable._serialize_object_graph(
+        root_checkpointable)
+    expected_checkpoint_names = (
+        # Created in the root node, so no prefix.
+        "global_step",
+        # No name provided to track_checkpointable(), so the position is used
+        # instead (one-based).
+        "network/via_track_layer/kernel",
+        # track_checkpointable() with a name provided, so that's used
+        "network/_named_dense/kernel",
+        "network/_named_dense/bias",
+        # non-Layer dependency of the network
+        "network/_non_layer/a_variable",
+        # The optimizer creates two non-slot variables
+        "_optimizer/beta1_power",
+        "_optimizer/beta2_power",
+        # Slot variables
+        "network/via_track_layer/kernel/-OPTIMIZER_SLOT/_optimizer/m",
+        "network/via_track_layer/kernel/-OPTIMIZER_SLOT/_optimizer/v",
+        "network/_named_dense/kernel/-OPTIMIZER_SLOT/_optimizer/m",
+        "network/_named_dense/kernel/-OPTIMIZER_SLOT/_optimizer/v",
+        "network/_named_dense/bias/-OPTIMIZER_SLOT/_optimizer/m",
+        "network/_named_dense/bias/-OPTIMIZER_SLOT/_optimizer/v",
+    )
+    six.assertCountEqual(self, expected_checkpoint_names,
+                         named_variables.keys())
+    # Check that we've mapped to the right variable objects (not exhaustive)
+    self.assertEqual("global_step:0", named_variables["global_step"].name)
+    self.assertEqual("my_network/checkpointable_dense_layer_1/kernel:0",
+                     named_variables["network/via_track_layer/kernel"].name)
+    self.assertEqual("my_network/checkpointable_dense_layer/kernel:0",
+                     named_variables["network/_named_dense/kernel"].name)
+    self.assertEqual("beta1_power:0",
+                     named_variables["_optimizer/beta1_power"].name)
+    self.assertEqual("beta2_power:0",
+                     named_variables["_optimizer/beta2_power"].name)
+    # Spot check the generated protocol buffers.
+    self.assertEqual("_optimizer",
+                     serialized_graph.nodes[0].children[0].local_name)
+    optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[
+        0].node_id]
+    self.assertEqual("beta1_power", optimizer_node.variables[0].local_name)
+    self.assertEqual("beta1_power", optimizer_node.variables[0].full_name)
+    # Variable ordering is arbitrary but deterministic (alphabetized)
+    self.assertEqual(
+        "bias", optimizer_node.slot_variables[0].original_variable_local_name)
+    original_variable_owner = serialized_graph.nodes[
+        optimizer_node.slot_variables[0].original_variable_node_id]
+    self.assertEqual("network/_named_dense/bias",
+                     original_variable_owner.variables[0].checkpoint_key)
+    self.assertEqual("bias", original_variable_owner.variables[0].local_name)
+    self.assertEqual("m", optimizer_node.slot_variables[0].slot_name)
+    self.assertEqual("network/_named_dense/bias/-OPTIMIZER_SLOT/_optimizer/m",
+                     optimizer_node.slot_variables[0].checkpoint_key)
+    # We strip off the :0 suffix, as variable.name-based saving does.
+    self.assertEqual("my_network/checkpointable_dense_layer/bias/Adam",
+                     optimizer_node.slot_variables[0].full_name)
+    self.assertEqual("my_network/checkpointable_dense_layer/bias/Adam:0",
+                     optimizer.get_slot(
+                         var=named_variables["network/_named_dense/bias"],
+                         name="m").name)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testSaveRestore(self):
+    network = MyNetwork()
+    optimizer = CheckpointableAdam(0.001)
+    root_checkpointable = Root(optimizer=optimizer, network=network)
+    input_value = constant_op.constant([[3.]])
+    if context.in_eager_mode():
+      optimizer.minimize(
+          lambda: network(input_value),
+          global_step=root_checkpointable.global_step)
+    else:
+      train_op = optimizer.minimize(
+          network(input_value), global_step=root_checkpointable.global_step)
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate(train_op)
+    prefix = os.path.join(self.get_temp_dir(), "ckpt")
+    self.evaluate(state_ops.assign(network._named_dense.variables[1], [42.]))
+    m_bias_slot = optimizer.get_slot(network._named_dense.variables[1], "m")
+    self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
+    serialized_graph, save_path = checkpointable.save(
+        file_prefix=prefix,
+        root_checkpointable=root_checkpointable,
+        global_step=root_checkpointable.global_step)
+    self.evaluate(state_ops.assign(network._named_dense.variables[1], [43.]))
+    self.evaluate(state_ops.assign(root_checkpointable.global_step, 3))
+    optimizer_variables = self.evaluate(optimizer.variables())
+    self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
+    # Immediate restoration
+    checkpointable.restore(
+        save_path=save_path,
+        root_checkpointable=root_checkpointable,
+        object_graph_proto=serialized_graph)
+    self.assertAllEqual([42.], self.evaluate(network._named_dense.variables[1]))
+    self.assertAllEqual(1, self.evaluate(root_checkpointable.global_step))
+    self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
+    with ops.Graph().as_default():
+      on_create_network = MyNetwork()
+      on_create_optimizer = CheckpointableAdam(0.001)
+      on_create_root = Root(
+          optimizer=on_create_optimizer, network=on_create_network)
+      with self.test_session(graph=ops.get_default_graph()):
+        # Deferred restoration
+        checkpointable.restore(
+            save_path=save_path,
+            root_checkpointable=on_create_root,
+            object_graph_proto=serialized_graph)
+        on_create_network(constant_op.constant([[3.]]))  # create variables
+        self.assertAllEqual(1, self.evaluate(on_create_root.global_step))
+        self.assertAllEqual([42.],
+                            self.evaluate(
+                                on_create_network._named_dense.variables[1]))
+        on_create_m_bias_slot = on_create_optimizer.get_slot(
+            on_create_network._named_dense.variables[1], "m")
+        # Optimizer slot variables are created when the original variable is
+        # restored.
+        self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot))
+        # beta1_power and beta2_power haven't been created yet, but everything
+        # else matches.
+        self.assertAllEqual(optimizer_variables[2:],
+                            self.evaluate(on_create_optimizer.variables()))
+        on_create_optimizer._create_slots(
+            [resource_variable_ops.ResourceVariable([1.])])
+        beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators()
+        self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power))
+        self.assertAllEqual(optimizer_variables[1], self.evaluate(beta2_power))
+
+  def testDeferredRestorationUsageEager(self):
+    """An idiomatic eager execution example."""
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    latest_object_graph = None  # Will be saved with the checkpoint eventually.
+    for training_continuation in range(3):
+      with ops.Graph().as_default():
+        network = MyNetwork()
+        optimizer = CheckpointableAdam(0.001)
+        root = Root(optimizer=optimizer, network=network)
+        checkpointable.restore(
+            save_path=core_saver.latest_checkpoint(checkpoint_directory),
+            root_checkpointable=root,
+            object_graph_proto=latest_object_graph)
+        for _ in range(num_training_steps):
+          # TODO(allenl): Use a Dataset and serialize/checkpoint it.
+          input_value = constant_op.constant([[3.]])
+          optimizer.minimize(
+              lambda: network(input_value),  # pylint: disable=cell-var-from-loop
+              global_step=root.global_step)
+        latest_object_graph, _ = checkpointable.save(
+            file_prefix=checkpoint_prefix,
+            root_checkpointable=root)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         root.global_step.numpy())
+
+  def testUsageGraph(self):
+    """Expected usage when graph building."""
+    with context.graph_mode():
+      num_training_steps = 10
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      latest_object_graph = None
+      for training_continuation in range(3):
+        with ops.Graph().as_default():
+          network = MyNetwork()
+          optimizer = CheckpointableAdam(0.001)
+          root = Root(optimizer=optimizer, network=network)
+          input_value = constant_op.constant([[3.]])
+          train_op = optimizer.minimize(
+              network(input_value),
+              global_step=root.global_step)
+          init_op = variables.global_variables_initializer()
+          checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
+          with self.test_session(graph=ops.get_default_graph()) as session:
+            if checkpoint_path is None:
+              self.assertEqual(0, training_continuation)
+              session.run(init_op)
+              # Another alternative would be to run initializers automatically
+              # if no checkpoint is being loaded. This would make deferred
+              # loading a bit more useful with graph execution.
+            else:
+              checkpointable.restore(
+                  save_path=checkpoint_path,
+                  root_checkpointable=root,
+                  object_graph_proto=latest_object_graph,
+                  session=session)
+            for _ in range(num_training_steps):
+              session.run(train_op)
+            latest_object_graph, _ = checkpointable.save(
+                file_prefix=checkpoint_prefix,
+                root_checkpointable=root,
+                session=session)
+            self.assertEqual((training_continuation + 1) * num_training_steps,
+                             session.run(root.global_step))
+
+  def _get_checkpoint_name(self, name):
+    root = checkpointable.Checkpointable()
+    root.add_variable(name=name, shape=[1, 2], dtype=dtypes.float64)
+    named_variables, _ = checkpointable._serialize_object_graph(root)
+    checkpoint_name, = named_variables.keys()
+    with ops.name_scope("root/" + checkpoint_name):
+      pass  # Make sure we can use this as an op name if we prefix it.
+    return checkpoint_name
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testVariableNameEscaping(self):
+    self.assertEqual(r"a_S__b_S__c", self._get_checkpoint_name(r"a/b/c"))
+    self.assertEqual(r"b", self._get_checkpoint_name(r"b"))
+    self.assertEqual(r"c_S__", self._get_checkpoint_name(r"c/"))
+    self.assertEqual(r"d_S___S_._", self._get_checkpoint_name(r"d/_S__"))
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testNumberedPath(self):
+    root = checkpointable.Checkpointable()
+    leaf = checkpointable.Checkpointable()
+    root.track_checkpointable(leaf, name="leaf")
+    leaf.add_variable(name="v", shape=[])
+    named_variables, _ = checkpointable._serialize_object_graph(root)
+    variable_name, = named_variables.keys()
+    self.assertEqual(r"leaf/v", variable_name)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testLocalNameValidation(self):
+    root = checkpointable.Checkpointable()
+    leaf = checkpointable.Checkpointable()
+    with self.assertRaisesRegexp(ValueError, "invalid name"):
+      # Leading dashes are reserved, which avoids conflicts with un-named edges
+      # in paths and the optimizer slots identifier.
+      root.track_checkpointable(leaf, name="-unnamed-12")
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py
index b559cce6b12a809d671ce7855680063f02a4ac22..d177bfeab2d1fdc05d7ced54df8723fae2c77fdb 100644
--- a/tensorflow/contrib/eager/python/datasets.py
+++ b/tensorflow/contrib/eager/python/datasets.py
@@ -23,6 +23,7 @@ import threading
 from tensorflow.contrib.data.python.ops import prefetching_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -41,7 +42,7 @@ def _generate_shared_name(prefix):
     global _uid_counter
     uid = _uid_counter
     _uid_counter += 1
-  return "{}_{}".format(prefix, uid)
+  return "{}{}".format(prefix, uid)
 
 
 class Iterator(object):
@@ -75,13 +76,16 @@ class Iterator(object):
           format(type(self)))
     with ops.device("/device:CPU:0"):
       ds_variant = dataset._as_variant_tensor()  # pylint: disable=protected-access
+      self._output_classes = dataset.output_classes
       self._output_types = dataset.output_types
       self._output_shapes = dataset.output_shapes
-      self._flat_output_types = nest.flatten(dataset.output_types)
-      self._flat_output_shapes = nest.flatten(dataset.output_shapes)
+      self._flat_output_types = nest.flatten(
+          sparse.as_dense_types(self._output_types, self._output_classes))
+      self._flat_output_shapes = nest.flatten(
+          sparse.as_dense_shapes(self._output_shapes, self._output_classes))
       self._resource = gen_dataset_ops.iterator(
-          container="",
-          shared_name=_generate_shared_name("eager_iterator"),
+          shared_name="",
+          container=_generate_shared_name("eageriterator"),
           output_types=self._flat_output_types,
           output_shapes=self._flat_output_shapes)
       gen_dataset_ops.make_iterator(ds_variant, self._resource)
@@ -108,7 +112,7 @@ class Iterator(object):
         remote_fn.add_to_graph(None)
         target = constant_op.constant("/device:CPU:0")
       with ops.device(self._device):
-        self._buffer_resource_handle = prefetching_ops.function_buffering_resource(
+        self._buffer_resource_handle = prefetching_ops.function_buffering_resource(  # pylint: disable=line-too-long
             string_arg=iter_string_handle,
             f=remote_fn,
             target_device=target,
@@ -116,8 +120,9 @@ class Iterator(object):
             thread_pool_size=1,
             container="",
             shared_name=_generate_shared_name("function_buffer_resource"))
-        self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter(
-            handle=self._buffer_resource_handle, handle_device=self._device)
+        self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter(  # pylint: disable=line-too-long
+            handle=self._buffer_resource_handle,
+            handle_device=self._device)
 
   def __iter__(self):
     return self
@@ -125,22 +130,83 @@ class Iterator(object):
   def __next__(self):  # For Python 3 compatibility
     return self.next()
 
-  def next(self):
-    """Return the next tf.Tensor from the dataset."""
+  def _next_internal(self):
+    """Returns a nested structure of `tf.Tensor`s containing the next element.
+    """
     with ops.device(self._device):
-      try:
-        if self._buffer_resource_handle is not None:
-          ret = prefetching_ops.function_buffering_resource_get_next(
-              function_buffer_resource=self._buffer_resource_handle,
-              output_types=self._flat_output_types)
-        else:
-          # TODO(ashankar): Consider removing this ops.device() contextmanager
-          # and instead mimic ops placement in graphs: Operations on resource
-          # handles execute on the same device as where the resource is placed.
-          ret = gen_dataset_ops.iterator_get_next(
-              self._resource,
-              output_types=self._flat_output_types,
-              output_shapes=self._flat_output_shapes)
-      except errors.OutOfRangeError:
-        raise StopIteration
-      return nest.pack_sequence_as(self._output_types, ret)
+      if self._buffer_resource_handle is not None:
+        ret = prefetching_ops.function_buffering_resource_get_next(
+            function_buffer_resource=self._buffer_resource_handle,
+            output_types=self._flat_output_types)
+      else:
+        # TODO(ashankar): Consider removing this ops.device() contextmanager
+        # and instead mimic ops placement in graphs: Operations on resource
+        # handles execute on the same device as where the resource is placed.
+        # NOTE(mrry): Here we use the "_sync" variant of `iterator_get_next`
+        # because in eager mode this code will run synchronously on the calling
+        # thread. Therefore we do not need to make a defensive context switch
+        # to a background thread, and can achieve a small constant performance
+        # boost by invoking the iterator synchronously.
+        ret = gen_dataset_ops.iterator_get_next_sync(
+            self._resource,
+            output_types=self._flat_output_types,
+            output_shapes=self._flat_output_shapes)
+
+    return sparse.deserialize_sparse_tensors(
+        nest.pack_sequence_as(self._output_types, ret), self._output_types,
+        self._output_shapes, self._output_classes)
+
+  def next(self):
+    """Returns a nested structure of `tf.Tensor`s containing the next element.
+    """
+    try:
+      return self._next_internal()
+    except errors.OutOfRangeError:
+      raise StopIteration
+
+  @property
+  def output_classes(self):
+    """Returns the class of each component of an element of this iterator.
+
+    The expected values are `tf.Tensor` and `tf.SparseTensor`.
+
+    Returns:
+      A nested structure of Python `type` objects corresponding to each
+      component of an element of this dataset.
+    """
+    return self._output_classes
+
+  @property
+  def output_shapes(self):
+    """Returns the shape of each component of an element of this iterator.
+
+    Returns:
+      A nested structure of `tf.TensorShape` objects corresponding to each
+      component of an element of this dataset.
+    """
+    return self._output_shapes
+
+  @property
+  def output_types(self):
+    """Returns the type of each component of an element of this iterator.
+
+    Returns:
+      A nested structure of `tf.DType` objects corresponding to each component
+      of an element of this dataset.
+    """
+    return self._output_types
+
+  def get_next(self, name=None):
+    """Returns a nested structure of `tf.Tensor`s containing the next element.
+
+    Args:
+      name: (Optional.) A name for the created operation. Currently unused.
+
+    Returns:
+      A nested structure of `tf.Tensor` objects.
+
+    Raises:
+      `tf.errors.OutOfRangeError`: If the end of the dataset has been reached.
+    """
+    del name
+    return self._next_internal()
diff --git a/tensorflow/contrib/eager/python/datasets_test.py b/tensorflow/contrib/eager/python/datasets_test.py
index c924d81c9d85e638e4f35f260664c0ee7d03257e..a1611e92b113839c2dd2a3b2560b0ba90c0a7ef0 100644
--- a/tensorflow/contrib/eager/python/datasets_test.py
+++ b/tensorflow/contrib/eager/python/datasets_test.py
@@ -16,11 +16,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import time
+
+import numpy as np
+
+from tensorflow.contrib import lookup
 from tensorflow.contrib.eager.python import datasets
 from tensorflow.python.data import Dataset
 from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
 
@@ -33,6 +41,15 @@ class IteratorTest(test.TestCase):
       got.append(t.numpy())
     self.assertAllEqual([0, 1, 2, 3], got)
 
+  def testGetNext(self):
+    iterator = datasets.Iterator(Dataset.range(4))
+    self.assertEqual(0, iterator.get_next().numpy())
+    self.assertEqual(1, iterator.get_next().numpy())
+    self.assertEqual(2, iterator.get_next().numpy())
+    self.assertEqual(3, iterator.get_next().numpy())
+    with self.assertRaises(errors.OutOfRangeError):
+      iterator.get_next()
+
   def testMultipleIteratorsOnTheSameDataset(self):
     ds = Dataset.range(4)
     it1 = datasets.Iterator(ds)
@@ -64,6 +81,18 @@ class IteratorTest(test.TestCase):
     got = [x.numpy() for x in it]
     self.assertAllEqual([0, 4, 16, 36], got)
 
+  def testMapCaptureLookupTable(self):
+    default_val = -1
+    keys = constant_op.constant(['brain', 'salad', 'surgery'])
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    table = lookup.HashTable(
+        lookup.KeyValueTensorInitializer(keys, values), default_val)
+    dataset = Dataset.from_tensor_slices(['brain', 'salad', 'surgery'])
+    dataset = dataset.map(table.lookup)
+    it = datasets.Iterator(dataset)
+    got = [x.numpy() for x in it]
+    self.assertAllEqual([0, 1, 2], got)
+
   def testMultipleIteratorsOnADatasetThatUsesFunctions(self):
     ds = Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6]).map(math_ops.square)
 
@@ -72,6 +101,53 @@ class IteratorTest(test.TestCase):
     got2 = [x.numpy() for x in datasets.Iterator(ds)]
     self.assertAllEqual(got1, got2)
 
+  def assertSparseValuesEqual(self, a, b):
+    self.assertAllEqual(a.indices, b.indices)
+    self.assertAllEqual(a.values, b.values)
+    self.assertAllEqual(a.dense_shape, b.dense_shape)
+
+  def testSparseTensorElements(self):
+    components = (sparse_tensor.SparseTensorValue(
+        indices=np.array([[0, 0], [1, 0], [2, 0]]),
+        values=np.array([0, 0, 0]),
+        dense_shape=np.array([3, 1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1], [2, 2]]),
+                      values=np.array([1, 2, 3]),
+                      dense_shape=np.array([3, 3])))
+
+    expected = [
+        (sparse_tensor.SparseTensorValue(
+            indices=np.array([[0]]),
+            values=np.array([0]),
+            dense_shape=np.array([1])),
+         sparse_tensor.SparseTensorValue(
+             indices=np.array([[0]]),
+             values=np.array([1]),
+             dense_shape=np.array([3]))),
+        (sparse_tensor.SparseTensorValue(
+            indices=np.array([[0]]),
+            values=np.array([0]),
+            dense_shape=np.array([1])),
+         sparse_tensor.SparseTensorValue(
+             indices=np.array([[1]]),
+             values=np.array([2]),
+             dense_shape=np.array([3]))),
+        (sparse_tensor.SparseTensorValue(
+            indices=np.array([[0]]),
+            values=np.array([0]),
+            dense_shape=np.array([1])),
+         sparse_tensor.SparseTensorValue(
+             indices=np.array([[2]]),
+             values=np.array([3]),
+             dense_shape=np.array([3]))),
+    ]
+
+    for i, result in enumerate(
+        datasets.Iterator(Dataset.from_tensor_slices(components))):
+      self.assertSparseValuesEqual(expected[i][0], result[0])
+      self.assertSparseValuesEqual(expected[i][1], result[1])
+
   def testPyFunc(self):
 
     def my_map(inp):
@@ -90,5 +166,64 @@ class IteratorTest(test.TestCase):
     self.assertAllEqual([0., 2.], x.numpy())
 
 
+class DatasetConstructorBenchmark(test.Benchmark):
+
+  def benchmarkSliceRepeatBatchEager(self):
+    input_size = 10000
+    batch_size = 100
+    num_epochs = 100
+
+    input_data = np.random.randn(input_size)
+
+    dataset = (
+        Dataset.from_tensor_slices(input_data).repeat(num_epochs)
+        .batch(batch_size))
+    iterator = datasets.Iterator(dataset)
+
+    ends = [time.time()]
+    for _ in iterator:
+      ends.append(time.time())
+
+    deltas = np.ediff1d(ends)
+    median_wall_time = np.median(deltas)
+    print(
+        'Slice/repeat/batch eager input size: %d batch size: %d Median wall '
+        'time per element: %f'
+        % (input_size, batch_size, median_wall_time))
+    self.report_benchmark(
+        iters=len(deltas),
+        wall_time=median_wall_time,
+        name='benchmark_slice_repeat_batch_eager_input_%d_batch_%d' %
+        (input_size, batch_size))
+
+  def benchmarkSliceBatchCacheRepeatCallable(self):
+    input_size = 10000
+    batch_size = 100
+    num_epochs = 100
+
+    input_data = np.random.randn(input_size)
+
+    dataset = (
+        Dataset.from_tensor_slices(input_data).batch(batch_size).cache()
+        .repeat(num_epochs))
+    iterator = datasets.Iterator(dataset)
+
+    ends = [time.time()]
+    for _ in iterator:
+      ends.append(time.time())
+
+    deltas = np.ediff1d(ends)
+    median_wall_time = np.median(deltas)
+    print(
+        'Slice/batch/cache/repeat eager input size: %d batch size: %d Median '
+        'wall time per element: %f'
+        % (input_size, batch_size, median_wall_time))
+    self.report_benchmark(
+        iters=len(deltas),
+        wall_time=median_wall_time,
+        name='benchmark_slice_batch_cache_repeat_eager_input_%d_batch_%d' %
+        (input_size, batch_size))
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/eager/python/evaluator.py b/tensorflow/contrib/eager/python/evaluator.py
index bd0ab02ecf7ae6025e08dde1c3ddc634db9255c1..68e7b5421fec7f73f10e381ca45f9d900de299d7 100644
--- a/tensorflow/contrib/eager/python/evaluator.py
+++ b/tensorflow/contrib/eager/python/evaluator.py
@@ -110,7 +110,7 @@ class Evaluator(object):
         return self._all_metric_results()
     else:
       def f():
-        with summary_ops.create_summary_file_writer(
+        with summary_ops.create_file_writer(
             summary_logdir).as_default(), summary_ops.always_record_summaries():
           return self._all_metric_results()
       if context.in_eager_mode():
@@ -178,7 +178,7 @@ class Evaluator(object):
       call_op: An op that updates evaluation state on a mini-batch of examples.
         Must generate an tf.errors.OutOfRangeError when done.
       results_op: A dictionary of tensors that compute the final evaluation
-        results from the evaulation state.
+        results from the evaluation state.
       sess: The Session to run the evaluation in. Defaults to the default
         Session.
 
diff --git a/tensorflow/contrib/eager/python/examples/BUILD b/tensorflow/contrib/eager/python/examples/BUILD
index aa21a6ab994acf929890ecebc07a86cf7ebf97db..15a21885f66eface291a39fa0ee1ff28bc297548 100644
--- a/tensorflow/contrib/eager/python/examples/BUILD
+++ b/tensorflow/contrib/eager/python/examples/BUILD
@@ -6,10 +6,12 @@ package(default_visibility = ["//tensorflow:internal"])
 py_library(
     name = "examples_pip",
     deps = [
+        "//tensorflow/contrib/eager/python/examples/gan:mnist",
         "//tensorflow/contrib/eager/python/examples/linear_regression",
         "//tensorflow/contrib/eager/python/examples/mnist",
         "//tensorflow/contrib/eager/python/examples/resnet50",
         "//tensorflow/contrib/eager/python/examples/rnn_colorbot",
         "//tensorflow/contrib/eager/python/examples/rnn_ptb",
+        "//tensorflow/contrib/eager/python/examples/spinn:data",
     ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/gan/BUILD b/tensorflow/contrib/eager/python/examples/gan/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..c61ec2dbae60a782c0e6589701554b045dcb92ae
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/gan/BUILD
@@ -0,0 +1,36 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
+py_binary(
+    name = "mnist",
+    srcs = ["mnist.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/contrib/eager/python:tfe",
+        "//tensorflow/examples/tutorials/mnist:input_data",
+    ],
+)
+
+cuda_py_test(
+    name = "mnist_test",
+    srcs = ["mnist_test.py"],
+    additional_deps = [
+        ":mnist",
+        "//tensorflow/contrib/eager/python:tfe",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+cuda_py_test(
+    name = "mnist_graph_test",
+    srcs = ["mnist_graph_test.py"],
+    additional_deps = [
+        ":mnist",
+        "//third_party/py/numpy",
+        "//tensorflow:tensorflow_py",
+    ],
+)
diff --git a/tensorflow/contrib/eager/python/examples/gan/README.md b/tensorflow/contrib/eager/python/examples/gan/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..208a64b05d47eea10b49a1bf967a5453677bfd21
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/gan/README.md
@@ -0,0 +1,38 @@
+# GAN with TensorFlow eager execution
+
+A simple Generative Adversarial Network (GAN) example using eager execution.
+The discriminator and generator networks each contain a few convolution and
+fully connected layers.
+
+Other eager execution examples can be found under the parent directory.
+
+##  Content
+
+- `mnist.py`: Model definitions and training routines.
+- `mnist_test.py`: Benchmarks for training and using the models using eager
+execution.
+- `mnist_graph_test.py`: Benchmarks for training and using the models using
+graph execution. The same model definitions and loss functions are used in
+all benchmarks.
+
+
+## To run
+
+- Make sure you have installed TensorFlow 1.5+ or the latest `tf-nightly`
+or `tf-nightly-gpu` pip package in order to access the eager execution feature.
+
+- Train model. E.g.,
+
+  ```bash
+  python mnist.py
+  ```
+  
+  Use `--output_dir=<DIR>` to direct the script to save TensorBoard summaries
+  during training. Disabled by default.
+  
+  Use `--checkpoint_dir=<DIR>` to direct the script to save checkpoints to
+  `<DIR>` during training. DIR defaults to /tmp/tensorflow/mnist/checkpoints/.
+  The script will load the   latest saved checkpoint from this directory if
+  one exists.
+  
+  Use `-h` for other options.
diff --git a/tensorflow/contrib/eager/python/examples/gan/mnist.py b/tensorflow/contrib/eager/python/examples/gan/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9ac79f46c83bb709918e3b72830b90ddcfd71b4
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/gan/mnist.py
@@ -0,0 +1,368 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A deep MNIST classifier using convolutional layers.
+
+Sample usage:
+  python mnist.py --help
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import os
+import sys
+import time
+
+import tensorflow as tf
+
+import tensorflow.contrib.eager as tfe
+from tensorflow.examples.tutorials.mnist import input_data
+
+FLAGS = None
+
+
+class Discriminator(tfe.Network):
+  """GAN Discriminator.
+
+  A network to differentiate between generated and real handwritten digits.
+  """
+
+  def __init__(self, data_format):
+    """Creates a model for discriminating between real and generated digits.
+
+    Args:
+      data_format: Either 'channels_first' or 'channels_last'.
+        'channels_first' is typically faster on GPUs while 'channels_last' is
+        typically faster on CPUs. See
+        https://www.tensorflow.org/performance/performance_guide#data_formats
+    """
+    super(Discriminator, self).__init__(name='')
+    if data_format == 'channels_first':
+      self._input_shape = [-1, 1, 28, 28]
+    else:
+      assert data_format == 'channels_last'
+      self._input_shape = [-1, 28, 28, 1]
+    self.conv1 = self.track_layer(tf.layers.Conv2D(64, 5, padding='SAME',
+                                                   data_format=data_format,
+                                                   activation=tf.tanh))
+    self.pool1 = self.track_layer(
+        tf.layers.AveragePooling2D(2, 2, data_format=data_format))
+    self.conv2 = self.track_layer(tf.layers.Conv2D(128, 5,
+                                                   data_format=data_format,
+                                                   activation=tf.tanh))
+    self.pool2 = self.track_layer(
+        tf.layers.AveragePooling2D(2, 2, data_format=data_format))
+    self.flatten = self.track_layer(tf.layers.Flatten())
+    self.fc1 = self.track_layer(tf.layers.Dense(1024, activation=tf.tanh))
+    self.fc2 = self.track_layer(tf.layers.Dense(1, activation=None))
+
+  def call(self, inputs):
+    """Return two logits per image estimating input authenticity.
+
+    Users should invoke __call__ to run the network, which delegates to this
+    method (and not call this method directly).
+
+    Args:
+      inputs: A batch of images as a Tensor with shape [batch_size, 28, 28, 1]
+        or [batch_size, 1, 28, 28]
+
+    Returns:
+      A Tensor with shape [batch_size] containing logits estimating
+      the probability that corresponding digit is real.
+    """
+    x = tf.reshape(inputs, self._input_shape)
+    x = self.conv1(x)
+    x = self.pool1(x)
+    x = self.conv2(x)
+    x = self.pool2(x)
+    x = self.flatten(x)
+    x = self.fc1(x)
+    x = self.fc2(x)
+    return x
+
+
+class Generator(tfe.Network):
+  """Generator of handwritten digits similar to the ones in the MNIST dataset.
+  """
+
+  def __init__(self, data_format):
+    """Creates a model for discriminating between real and generated digits.
+
+    Args:
+      data_format: Either 'channels_first' or 'channels_last'.
+        'channels_first' is typically faster on GPUs while 'channels_last' is
+        typically faster on CPUs. See
+        https://www.tensorflow.org/performance/performance_guide#data_formats
+    """
+    super(Generator, self).__init__(name='')
+    self.data_format = data_format
+    # We are using 128 6x6 channels as input to the first deconvolution layer
+    if data_format == 'channels_first':
+      self._pre_conv_shape = [-1, 128, 6, 6]
+    else:
+      assert data_format == 'channels_last'
+      self._pre_conv_shape = [-1, 6, 6, 128]
+    self.fc1 = self.track_layer(tf.layers.Dense(6 * 6 * 128,
+                                                activation=tf.tanh))
+
+    # In call(), we reshape the output of fc1 to _pre_conv_shape
+
+    # Deconvolution layer. Resulting image shape: (batch, 14, 14, 64)
+    self.conv1 = self.track_layer(tf.layers.Conv2DTranspose(
+        64, 4, strides=2, activation=None, data_format=data_format))
+
+    # Deconvolution layer. Resulting image shape: (batch, 28, 28, 1)
+    self.conv2 = self.track_layer(tf.layers.Conv2DTranspose(
+        1, 2, strides=2, activation=tf.nn.sigmoid, data_format=data_format))
+
+  def call(self, inputs):
+    """Return a batch of generated images.
+
+    Users should invoke __call__ to run the network, which delegates to this
+    method (and not call this method directly).
+
+    Args:
+      inputs: A batch of noise vectors as a Tensor with shape
+        [batch_size, length of noise vectors].
+
+    Returns:
+      A Tensor containing generated images. If data_format is 'channels_last',
+      the shape of returned images is [batch_size, 28, 28, 1], else
+      [batch_size, 1, 28, 28]
+    """
+
+    x = self.fc1(inputs)
+    x = tf.reshape(x, shape=self._pre_conv_shape)
+    x = self.conv1(x)
+    x = self.conv2(x)
+    return x
+
+
+def discriminator_loss(discriminator_real_outputs, discriminator_gen_outputs):
+  """Original discriminator loss for GANs, with label smoothing.
+
+  See `Generative Adversarial Nets` (https://arxiv.org/abs/1406.2661) for more
+  details.
+
+  Args:
+    discriminator_real_outputs: Discriminator output on real data.
+    discriminator_gen_outputs: Discriminator output on generated data. Expected
+      to be in the range of (-inf, inf).
+
+  Returns:
+    A scalar loss Tensor.
+  """
+
+  loss_on_real = tf.losses.sigmoid_cross_entropy(
+      tf.ones_like(discriminator_real_outputs), discriminator_real_outputs,
+      label_smoothing=0.25)
+  loss_on_generated = tf.losses.sigmoid_cross_entropy(
+      tf.zeros_like(discriminator_gen_outputs), discriminator_gen_outputs)
+  loss = loss_on_real + loss_on_generated
+  tf.contrib.summary.scalar('discriminator_loss', loss)
+  return loss
+
+
+def generator_loss(discriminator_gen_outputs):
+  """Original generator loss for GANs.
+
+  L = -log(sigmoid(D(G(z))))
+
+  See `Generative Adversarial Nets` (https://arxiv.org/abs/1406.2661)
+  for more details.
+
+  Args:
+    discriminator_gen_outputs: Discriminator output on generated data. Expected
+      to be in the range of (-inf, inf).
+
+  Returns:
+    A scalar loss Tensor.
+  """
+  loss = tf.losses.sigmoid_cross_entropy(
+      tf.ones_like(discriminator_gen_outputs), discriminator_gen_outputs)
+  tf.contrib.summary.scalar('generator_loss', loss)
+  return loss
+
+
+def train_one_epoch(generator, discriminator,
+                    generator_optimizer, discriminator_optimizer,
+                    dataset, log_interval, noise_dim):
+  """Trains `generator` and `discriminator` models on `dataset`.
+
+  Args:
+    generator: Generator model.
+    discriminator: Discriminator model.
+    generator_optimizer: Optimizer to use for generator.
+    discriminator_optimizer: Optimizer to use for discriminator.
+    dataset: Dataset of images to train on.
+    log_interval: How many global steps to wait between logging and collecting
+      summaries.
+    noise_dim: Dimension of noise vector to use.
+  """
+
+  total_generator_loss = 0.0
+  total_discriminator_loss = 0.0
+  for (batch_index, images) in enumerate(tfe.Iterator(dataset)):
+    with tf.device('/cpu:0'):
+      tf.assign_add(tf.train.get_global_step(), 1)
+
+    with tf.contrib.summary.record_summaries_every_n_global_steps(log_interval):
+      current_batch_size = images.shape[0]
+      noise = tf.random_uniform(shape=[current_batch_size, noise_dim],
+                                minval=-1., maxval=1., seed=batch_index)
+
+      with tfe.GradientTape(persistent=True) as g:
+        generated_images = generator(noise)
+        tf.contrib.summary.image('generated_images',
+                                 tf.reshape(generated_images, [-1, 28, 28, 1]),
+                                 max_images=10)
+
+        discriminator_gen_outputs = discriminator(generated_images)
+        discriminator_real_outputs = discriminator(images)
+        discriminator_loss_val = discriminator_loss(discriminator_real_outputs,
+                                                    discriminator_gen_outputs)
+        total_discriminator_loss += discriminator_loss_val
+
+        generator_loss_val = generator_loss(discriminator_gen_outputs)
+        total_generator_loss += generator_loss_val
+
+      generator_grad = g.gradient(generator_loss_val, generator.variables)
+      discriminator_grad = g.gradient(discriminator_loss_val,
+                                      discriminator.variables)
+
+      with tf.variable_scope('generator'):
+        generator_optimizer.apply_gradients(zip(generator_grad,
+                                                generator.variables))
+      with tf.variable_scope('discriminator'):
+        discriminator_optimizer.apply_gradients(zip(discriminator_grad,
+                                                    discriminator.variables))
+
+      if log_interval and batch_index > 0 and batch_index % log_interval == 0:
+        print('Batch #%d\tAverage Generator Loss: %.6f\t'
+              'Average Discriminator Loss: %.6f' % (
+                  batch_index, total_generator_loss/batch_index,
+                  total_discriminator_loss/batch_index))
+
+
+def main(_):
+  (device, data_format) = ('/gpu:0', 'channels_first')
+  if FLAGS.no_gpu or tfe.num_gpus() <= 0:
+    (device, data_format) = ('/cpu:0', 'channels_last')
+  print('Using device %s, and data format %s.' % (device, data_format))
+
+  # Load the datasets
+  data = input_data.read_data_sets(FLAGS.data_dir)
+  dataset = (tf.data.Dataset
+             .from_tensor_slices(data.train.images)
+             .shuffle(60000)
+             .batch(FLAGS.batch_size))
+
+  # Create the models and optimizers
+  generator = Generator(data_format)
+  discriminator = Discriminator(data_format)
+  with tf.variable_scope('generator'):
+    generator_optimizer = tf.train.AdamOptimizer(FLAGS.lr)
+  with tf.variable_scope('discriminator'):
+    discriminator_optimizer = tf.train.AdamOptimizer(FLAGS.lr)
+
+  # Prepare summary writer and checkpoint info
+  summary_writer = tf.contrib.summary.create_summary_file_writer(
+      FLAGS.output_dir, flush_millis=1000)
+  checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')
+  latest_cpkt = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
+  if latest_cpkt:
+    print('Using latest checkpoint at ' + latest_cpkt)
+
+  with tf.device(device):
+    for epoch in range(1, 101):
+      with tfe.restore_variables_on_create(latest_cpkt):
+        global_step = tf.train.get_or_create_global_step()
+        start = time.time()
+        with summary_writer.as_default():
+          train_one_epoch(generator, discriminator, generator_optimizer,
+                          discriminator_optimizer,
+                          dataset, FLAGS.log_interval, FLAGS.noise)
+        end = time.time()
+        print('\nTrain time for epoch #%d (global step %d): %f' % (
+            epoch, global_step.numpy(), end - start))
+
+      all_variables = (
+          generator.variables
+          + discriminator.variables
+          + generator_optimizer.variables()
+          + discriminator_optimizer.variables()
+          + [global_step])
+      tfe.Saver(all_variables).save(
+          checkpoint_prefix, global_step=global_step)
+
+
+if __name__ == '__main__':
+  tfe.enable_eager_execution()
+
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      '--data-dir',
+      type=str,
+      default='/tmp/tensorflow/mnist/input_data',
+      help=('Directory for storing input data (default '
+            '/tmp/tensorflow/mnist/input_data)'))
+  parser.add_argument(
+      '--batch-size',
+      type=int,
+      default=128,
+      metavar='N',
+      help='input batch size for training (default: 128)')
+  parser.add_argument(
+      '--log-interval',
+      type=int,
+      default=100,
+      metavar='N',
+      help=('number of batches between logging and writing summaries '
+            '(default: 100)'))
+  parser.add_argument(
+      '--output_dir',
+      type=str,
+      default=None,
+      metavar='DIR',
+      help='Directory to write TensorBoard summaries (defaults to none)')
+  parser.add_argument(
+      '--checkpoint_dir',
+      type=str,
+      default='/tmp/tensorflow/mnist/checkpoints/',
+      metavar='DIR',
+      help=('Directory to save checkpoints in (once per epoch) (default '
+            '/tmp/tensorflow/mnist/checkpoints/)'))
+  parser.add_argument(
+      '--lr',
+      type=float,
+      default=0.001,
+      metavar='LR',
+      help='learning rate (default: 0.001)')
+  parser.add_argument(
+      '--noise',
+      type=int,
+      default=100,
+      metavar='N',
+      help='Length of noise vector for generator input (default: 100)')
+  parser.add_argument(
+      '--no-gpu',
+      action='store_true',
+      default=False,
+      help='disables GPU usage even if a GPU is available')
+
+  FLAGS, unparsed = parser.parse_known_args()
+  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/contrib/eager/python/examples/gan/mnist_graph_test.py b/tensorflow/contrib/eager/python/examples/gan/mnist_graph_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..12b39b0cde49d4c017acfa74572c725036c54eff
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/gan/mnist_graph_test.py
@@ -0,0 +1,151 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tempfile
+import time
+
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.contrib.eager.python.examples.gan import mnist
+
+NOISE_DIM = 100
+# Big enough so that summaries are never recorded.
+# Lower this value if would like to benchmark with some summaries.
+SUMMARY_INTERVAL = 10000
+SUMMARY_FLUSH_MS = 100  # Flush summaries every 100ms
+
+
+def data_format():
+  return 'channels_first' if tf.test.is_gpu_available() else 'channels_last'
+
+
+class MnistGraphGanBenchmark(tf.test.Benchmark):
+
+  def _create_graph(self, batch_size):
+    # Generate some random data.
+    images_data = np.random.randn(batch_size, 784).astype(np.float32)
+    dataset = tf.data.Dataset.from_tensors(images_data)
+    images = dataset.repeat().make_one_shot_iterator().get_next()
+
+    # Create the models and optimizers
+    generator = mnist.Generator(data_format())
+    discriminator = mnist.Discriminator(data_format())
+    with tf.variable_scope('generator'):
+      generator_optimizer = tf.train.AdamOptimizer(0.001)
+    with tf.variable_scope('discriminator'):
+      discriminator_optimizer = tf.train.AdamOptimizer(0.001)
+
+    # Run models and compute loss
+    noise_placeholder = tf.placeholder(tf.float32,
+                                       shape=[batch_size, NOISE_DIM])
+    generated_images = generator(noise_placeholder)
+    tf.contrib.summary.image('generated_images',
+                             tf.reshape(generated_images, [-1, 28, 28, 1]),
+                             max_images=10)
+    discriminator_gen_outputs = discriminator(generated_images)
+    discriminator_real_outputs = discriminator(images)
+    generator_loss = mnist.generator_loss(discriminator_gen_outputs)
+    discriminator_loss = mnist.discriminator_loss(discriminator_real_outputs,
+                                                  discriminator_gen_outputs)
+    # Get train ops
+    with tf.variable_scope('generator'):
+      generator_train = generator_optimizer.minimize(
+          generator_loss, var_list=generator.variables)
+    with tf.variable_scope('discriminator'):
+      discriminator_train = discriminator_optimizer.minimize(
+          discriminator_loss, var_list=discriminator.variables)
+
+    return (generator_train, discriminator_train, noise_placeholder)
+
+  def _report(self, test_name, start, num_iters, batch_size):
+    avg_time = (time.time() - start) / num_iters
+    dev = 'gpu' if tf.test.is_gpu_available() else 'cpu'
+    name = 'graph_%s_%s_batch_%d_%s' % (test_name, dev, batch_size,
+                                        data_format())
+    extras = {'examples_per_sec': batch_size / avg_time}
+    self.report_benchmark(
+        iters=num_iters, wall_time=avg_time, name=name, extras=extras)
+
+  def benchmark_train(self):
+    for batch_size in [64, 128, 256]:
+      with tf.Graph().as_default():
+        global_step = tf.train.get_or_create_global_step()
+        increment_global_step = tf.assign_add(global_step, 1)
+        with tf.contrib.summary.create_file_writer(
+            tempfile.mkdtemp(), flush_millis=SUMMARY_FLUSH_MS).as_default(), (
+                tf.contrib.summary.record_summaries_every_n_global_steps(
+                    SUMMARY_INTERVAL)):
+          (generator_train, discriminator_train, noise_placeholder
+          ) = self._create_graph(batch_size)
+
+          with tf.Session() as sess:
+            tf.contrib.summary.initialize(graph=tf.get_default_graph(),
+                                          session=sess)
+
+            sess.run(tf.global_variables_initializer())
+
+            num_burn, num_iters = (3, 100)
+            for _ in range(num_burn):
+              noise = np.random.uniform(-1.0, 1.0, size=[batch_size, NOISE_DIM])
+              # Increment global step before evaluating summary ops to avoid
+              # race condition.
+              sess.run(increment_global_step)
+              sess.run([generator_train, discriminator_train,
+                        tf.contrib.summary.all_summary_ops()],
+                       feed_dict={noise_placeholder: noise})
+
+            # Run and benchmark 2 epochs
+            start = time.time()
+            for _ in range(num_iters):
+              noise = np.random.uniform(-1.0, 1.0, size=[batch_size, NOISE_DIM])
+              sess.run(increment_global_step)
+              sess.run([generator_train, discriminator_train,
+                        tf.contrib.summary.all_summary_ops()],
+                       feed_dict={noise_placeholder: noise})
+            self._report('train', start, num_iters, batch_size)
+
+  def benchmark_generate(self):
+    for batch_size in [64, 128, 256]:
+      with tf.Graph().as_default():
+        # Using random weights. This will generate garbage.
+        generator = mnist.Generator(data_format())
+        noise_placeholder = tf.placeholder(tf.float32,
+                                           shape=[batch_size, NOISE_DIM])
+        generated_images = generator(noise_placeholder)
+
+        init = tf.global_variables_initializer()
+        with tf.Session() as sess:
+          sess.run(init)
+          noise = np.random.uniform(-1.0, 1.0, size=[batch_size, NOISE_DIM])
+          num_burn, num_iters = (30, 1000)
+          for _ in range(num_burn):
+            sess.run(generated_images, feed_dict={noise_placeholder: noise})
+
+          start = time.time()
+          for _ in range(num_iters):
+            # Comparison with the eager execution benchmark in mnist_test.py
+            # isn't entirely fair as the time here includes the cost of copying
+            # the feeds from CPU memory to GPU.
+            sess.run(generated_images, feed_dict={noise_placeholder: noise})
+          self._report('generate', start, num_iters, batch_size)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/tensorflow/contrib/eager/python/examples/gan/mnist_test.py b/tensorflow/contrib/eager/python/examples/gan/mnist_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a3ca8d82bc2619b05a734f6d2e58431c1a45995
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/gan/mnist_test.py
@@ -0,0 +1,113 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tempfile
+import time
+
+import tensorflow as tf
+
+import tensorflow.contrib.eager as tfe
+from tensorflow.contrib.eager.python.examples.gan import mnist
+
+NOISE_DIM = 100
+# Big enough so that summaries are never recorded.
+# Lower this value if would like to benchmark with some summaries.
+SUMMARY_INTERVAL = 10000
+SUMMARY_FLUSH_MS = 100  # Flush summaries every 100ms
+
+
+def data_format():
+  return 'channels_first' if tf.test.is_gpu_available() else 'channels_last'
+
+
+def device():
+  return '/gpu:0' if tfe.num_gpus() else '/cpu:0'
+
+
+class MnistEagerGanBenchmark(tf.test.Benchmark):
+
+  def _report(self, test_name, start, num_iters, batch_size):
+    avg_time = (time.time() - start) / num_iters
+    dev = 'gpu' if tfe.num_gpus() else 'cpu'
+    name = 'eager_%s_%s_batch_%d_%s' % (test_name, dev, batch_size,
+                                        data_format())
+    extras = {'examples_per_sec': batch_size / avg_time}
+    self.report_benchmark(
+        iters=num_iters, wall_time=avg_time, name=name, extras=extras)
+
+  def benchmark_train(self):
+    for batch_size in [64, 128, 256]:
+      # Generate some random data.
+      burn_batches, measure_batches = (3, 100)
+      burn_images = [tf.random_normal([batch_size, 784])
+                     for _ in range(burn_batches)]
+      burn_dataset = tf.data.Dataset.from_tensor_slices(burn_images)
+      measure_images = [tf.random_normal([batch_size, 784])
+                        for _ in range(measure_batches)]
+      measure_dataset = tf.data.Dataset.from_tensor_slices(measure_images)
+
+      tf.train.get_or_create_global_step()
+      with tf.device(device()):
+        # Create the models and optimizers
+        generator = mnist.Generator(data_format())
+        discriminator = mnist.Discriminator(data_format())
+        with tf.variable_scope('generator'):
+          generator_optimizer = tf.train.AdamOptimizer(0.001)
+        with tf.variable_scope('discriminator'):
+          discriminator_optimizer = tf.train.AdamOptimizer(0.001)
+
+        with tf.contrib.summary.create_file_writer(
+            tempfile.mkdtemp(), flush_millis=SUMMARY_FLUSH_MS).as_default():
+
+          # warm up
+          mnist.train_one_epoch(generator, discriminator, generator_optimizer,
+                                discriminator_optimizer,
+                                burn_dataset, log_interval=SUMMARY_INTERVAL,
+                                noise_dim=NOISE_DIM)
+          # measure
+          start = time.time()
+          mnist.train_one_epoch(generator, discriminator, generator_optimizer,
+                                discriminator_optimizer,
+                                measure_dataset, log_interval=SUMMARY_INTERVAL,
+                                noise_dim=NOISE_DIM)
+          self._report('train', start, measure_batches, batch_size)
+
+  def benchmark_generate(self):
+    for batch_size in [64, 128, 256]:
+      with tf.device(device()):
+        # Using random weights. This will generate garbage.
+        generator = mnist.Generator(data_format())
+
+        num_burn, num_iters = (30, 1000)
+        for _ in range(num_burn):
+          noise = tf.random_uniform(shape=[batch_size, NOISE_DIM],
+                                    minval=-1., maxval=1.)
+          generator(noise)
+
+        start = time.time()
+        for _ in range(num_iters):
+          noise = tf.random_uniform(shape=[batch_size, NOISE_DIM],
+                                    minval=-1., maxval=1.)
+          generator(noise)
+        self._report('generate', start, num_iters, batch_size)
+
+
+if __name__ == '__main__':
+  tfe.enable_eager_execution()
+  tf.test.main()
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/BUILD b/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
index bab7ad0c701b2110fda9a8d27792fd361a5fc1c0..f86331af6f7928f0f86c888e22706c6e0a5978b2 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
@@ -23,3 +23,13 @@ cuda_py_test(
         "//tensorflow:tensorflow_py",
     ],
 )
+
+cuda_py_test(
+    name = "linear_regression_graph_test",
+    size = "small",
+    srcs = ["linear_regression_graph_test.py"],
+    additional_deps = [
+        ":linear_regression",
+        "//tensorflow:tensorflow_py",
+    ],
+)
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
index d0130ebd118dbaff4f0161c8b2528764c6103e02..6ce4de6ee0bf50400eff339ac04e132252a2b53e 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
@@ -41,7 +41,7 @@ class LinearModel(tfe.Network):
   For those familiar with TensorFlow graphs, notice the absence of
   `tf.Session`. The `forward()` method here immediately executes and
   returns output values. The `loss()` method immediately compares the
-  output of `forward()` with the target adn returns the MSE loss value.
+  output of `forward()` with the target and returns the MSE loss value.
   The `fit()` performs gradient-descent training on the model's weights
   and bias.
   """
@@ -63,6 +63,10 @@ class LinearModel(tfe.Network):
     return self._hidden_layer(xs)
 
 
+def mean_square_loss(model, xs, ys):
+  return tf.reduce_mean(tf.square(model(xs) - ys))
+
+
 def fit(model, dataset, optimizer, verbose=False, logdir=None):
   """Fit the linear-regression model.
 
@@ -76,16 +80,14 @@ def fit(model, dataset, optimizer, verbose=False, logdir=None):
   """
 
   # The loss function to optimize.
-  def mean_square_loss(xs, ys):
-    return tf.reduce_mean(tf.square(model(xs) - ys))
-
-  loss_and_grads = tfe.implicit_value_and_gradients(mean_square_loss)
+  mse = lambda xs, ys: mean_square_loss(model, xs, ys)
+  loss_and_grads = tfe.implicit_value_and_gradients(mse)
 
   tf.train.get_or_create_global_step()
   if logdir:
     # Support for TensorBoard summaries. Once training has started, use:
     #   tensorboard --logdir=<logdir>
-    summary_writer = tf.contrib.summary.create_summary_file_writer(logdir)
+    summary_writer = tf.contrib.summary.create_file_writer(logdir)
 
   # Training loop.
   for i, (xs, ys) in enumerate(tfe.Iterator(dataset)):
@@ -103,14 +105,20 @@ def fit(model, dataset, optimizer, verbose=False, logdir=None):
 
 def synthetic_dataset(w, b, noise_level, batch_size, num_batches):
   """tf.data.Dataset that yields synthetic data for linear regression."""
+  return synthetic_dataset_helper(w, b,
+                                  tf.shape(w)[0], noise_level, batch_size,
+                                  num_batches)
+
 
+def synthetic_dataset_helper(w, b, num_features, noise_level, batch_size,
+                             num_batches):
   # w is a matrix with shape [N, M]
   # b is a vector with shape [M]
   # So:
   # - Generate x's as vectors with shape [batch_size N]
   # - y = tf.matmul(x, W) + b + noise
   def batch(_):
-    x = tf.random_normal([batch_size, tf.shape(w)[0]])
+    x = tf.random_normal([batch_size, num_features])
     y = tf.matmul(x, w) + b + noise_level * tf.random_normal([])
     return x, y
 
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..557ad42752144243ae3da61b955b31398cba846e
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py
@@ -0,0 +1,85 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Graph benchmark for linear regression, to contrast with eager execution."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import tensorflow as tf
+from tensorflow.contrib.eager.python.examples.linear_regression import linear_regression
+
+
+class GraphLinearRegressionBenchmark(tf.test.Benchmark):
+
+  def benchmarkGraphLinearRegression(self):
+    num_epochs = 10
+    num_batches = 200
+    batch_size = 64
+    dataset = linear_regression.synthetic_dataset_helper(
+        w=tf.random_uniform([3, 1]),
+        b=tf.random_uniform([1]),
+        num_features=3,
+        noise_level=0.01,
+        batch_size=batch_size,
+        num_batches=num_batches)
+    iterator = dataset.make_initializable_iterator()
+    x, y = iterator.get_next()
+
+    model = linear_regression.LinearModel()
+
+    if tf.test.is_gpu_available():
+      use_gpu = True
+      device = "/device:GPU:0"
+    else:
+      use_gpu = False
+      device = "/device:CPU:0"
+
+    with tf.device(device):
+      loss = linear_regression.mean_square_loss(model, x, y)
+      optimization_step = tf.train.GradientDescentOptimizer(
+          learning_rate=0.1).minimize(loss)
+
+    with tf.Session() as sess:
+      sess.run(tf.global_variables_initializer())
+
+      def train(num_epochs):
+        for _ in range(num_epochs):
+          sess.run(iterator.initializer)
+          try:
+            while True:
+              _, _ = sess.run([optimization_step, loss])
+          except tf.errors.OutOfRangeError:
+            pass
+
+      # Warmup: a single epoch.
+      train(1)
+
+      start_time = time.time()
+      train(num_epochs)
+      wall_time = time.time() - start_time
+
+      examples_per_sec = num_epochs * num_batches * batch_size / wall_time
+      self.report_benchmark(
+          name="graph_train_%s" %
+          ("gpu" if use_gpu else "cpu"),
+          iters=num_epochs * num_batches,
+          extras={"examples_per_sec": examples_per_sec},
+          wall_time=wall_time)
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_test.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_test.py
index 39e7aabd7be04ba36a786a4c08d0df6c2ce916d0..e53234b51a7dccc11e548ac81a7ef070c628aa52 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_test.py
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_test.py
@@ -83,6 +83,7 @@ class LinearRegressionTest(tf.test.TestCase):
 class EagerLinearRegressionBenchmark(tf.test.Benchmark):
 
   def benchmarkEagerLinearRegression(self):
+    num_epochs = 10
     num_batches = 200
     batch_size = 64
     dataset = linear_regression.synthetic_dataset(
@@ -102,14 +103,15 @@ class EagerLinearRegressionBenchmark(tf.test.Benchmark):
       linear_regression.fit(model, burn_in_dataset, optimizer)
 
       start_time = time.time()
-      linear_regression.fit(model, dataset, optimizer)
+      for _ in range(num_epochs):
+        linear_regression.fit(model, dataset, optimizer)
       wall_time = time.time() - start_time
 
-      examples_per_sec = num_batches * batch_size / wall_time
+      examples_per_sec = num_epochs * num_batches * batch_size / wall_time
       self.report_benchmark(
           name="eager_train_%s" %
           ("gpu" if tfe.num_gpus() > 0 else "cpu"),
-          iters=num_batches,
+          iters=num_epochs * num_batches,
           extras={"examples_per_sec": examples_per_sec},
           wall_time=wall_time)
 
diff --git a/tensorflow/contrib/eager/python/examples/mnist/mnist.py b/tensorflow/contrib/eager/python/examples/mnist/mnist.py
index bfb7d5a9002787f6544d383de58150661ac2bde3..772f59562ba27cce510c82681f491d005298f44c 100644
--- a/tensorflow/contrib/eager/python/examples/mnist/mnist.py
+++ b/tensorflow/contrib/eager/python/examples/mnist/mnist.py
@@ -23,7 +23,6 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
-import functools
 import os
 import sys
 import time
@@ -40,7 +39,7 @@ class MNISTModel(tfe.Network):
   """MNIST Network.
 
   Network structure is equivalent to:
-  https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/examples/tutorials/mnist/mnist_deep.py
+  https://github.com/tensorflow/tensorflow/blob/r1.6/tensorflow/examples/tutorials/mnist/mnist_deep.py
   and
   https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py
 
@@ -96,8 +95,7 @@ class MNISTModel(tfe.Network):
     x = self.max_pool2d(x)
     x = tf.layers.flatten(x)
     x = self.fc1(x)
-    if training:
-      x = self.dropout(x)
+    x = self.dropout(x, training=training)
     x = self.fc2(x)
     return x
 
@@ -124,21 +122,18 @@ def train_one_epoch(model, optimizer, dataset, log_interval=None):
 
   tf.train.get_or_create_global_step()
 
-  def model_loss(labels, images):
-    prediction = model(images, training=True)
-    loss_value = loss(prediction, labels)
-    tf.contrib.summary.scalar('loss', loss_value)
-    tf.contrib.summary.scalar('accuracy',
-                              compute_accuracy(prediction, labels))
-    return loss_value
-
   for (batch, (images, labels)) in enumerate(tfe.Iterator(dataset)):
     with tf.contrib.summary.record_summaries_every_n_global_steps(10):
-      batch_model_loss = functools.partial(model_loss, labels, images)
-      optimizer.minimize(
-          batch_model_loss, global_step=tf.train.get_global_step())
+      with tfe.GradientTape() as tape:
+        prediction = model(images, training=True)
+        loss_value = loss(prediction, labels)
+        tf.contrib.summary.scalar('loss', loss_value)
+        tf.contrib.summary.scalar('accuracy',
+                                  compute_accuracy(prediction, labels))
+      grads = tape.gradient(loss_value, model.variables)
+      optimizer.apply_gradients(zip(grads, model.variables))
       if log_interval and batch % log_interval == 0:
-        print('Batch #%d\tLoss: %.6f' % (batch, batch_model_loss()))
+        print('Batch #%d\tLoss: %.6f' % (batch, loss_value))
 
 
 def test(model, dataset):
@@ -190,9 +185,9 @@ def main(_):
   else:
     train_dir = None
     test_dir = None
-  summary_writer = tf.contrib.summary.create_summary_file_writer(
+  summary_writer = tf.contrib.summary.create_file_writer(
       train_dir, flush_millis=10000)
-  test_summary_writer = tf.contrib.summary.create_summary_file_writer(
+  test_summary_writer = tf.contrib.summary.create_file_writer(
       test_dir, flush_millis=10000, name='test')
   checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')
 
diff --git a/tensorflow/contrib/eager/python/examples/mnist/mnist_test.py b/tensorflow/contrib/eager/python/examples/mnist/mnist_test.py
index 205709fe2edd3c260c30a84b624e322e120edf8e..136085eba21284a42282395e54f32c33bf63b5c3 100644
--- a/tensorflow/contrib/eager/python/examples/mnist/mnist_test.py
+++ b/tensorflow/contrib/eager/python/examples/mnist/mnist_test.py
@@ -39,22 +39,40 @@ def random_dataset():
   return tf.data.Dataset.from_tensors((images, labels))
 
 
+def train_one_epoch(defun=False):
+  model = mnist.MNISTModel(data_format())
+  if defun:
+    model.call = tfe.defun(model.call)
+  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
+  dataset = random_dataset()
+  with tf.device(device()):
+    tf.train.get_or_create_global_step()
+    mnist.train_one_epoch(model, optimizer, dataset)
+
+
+def evaluate(defun=False):
+  model = mnist.MNISTModel(data_format())
+  dataset = random_dataset()
+  if defun:
+    model.call = tfe.defun(model.call)
+  with tf.device(device()):
+    tf.train.get_or_create_global_step()
+    mnist.test(model, dataset)
+
+
 class MNISTTest(tf.test.TestCase):
 
   def testTrainOneEpoch(self):
-    model = mnist.MNISTModel(data_format())
-    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
-    dataset = random_dataset()
-    with tf.device(device()):
-      tf.train.get_or_create_global_step()
-      mnist.train_one_epoch(model, optimizer, dataset)
+    train_one_epoch(defun=False)
 
   def testTest(self):
-    model = mnist.MNISTModel(data_format())
-    dataset = random_dataset()
-    with tf.device(device()):
-      tf.train.get_or_create_global_step()
-      mnist.test(model, dataset)
+    evaluate(defun=False)
+
+  def testTrainOneEpochWithDefunCall(self):
+    train_one_epoch(defun=True)
+
+  def testTestWithDefunCall(self):
+    evaluate(defun=True)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/README.md b/tensorflow/contrib/eager/python/examples/resnet50/README.md
index db023e6c976c8eda09ef0dee7eecb144678773c4..79e460052945718eac194653015d60d900998e2d 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/README.md
+++ b/tensorflow/contrib/eager/python/examples/resnet50/README.md
@@ -34,7 +34,7 @@ bazel run -c opt --config=cuda :resnet50_graph_test -- --benchmarks=.
 
 (Or remove the `--config=cuda` flag for running on CPU instead of GPU).
 
-On October 31, 2017, the benchmarks demostrated comparable performance
+On October 31, 2017, the benchmarks demonstrated comparable performance
 for eager and graph execution of this particular model when using
 a single NVIDIA Titan X (Pascal) GPU on a host with an
 Intel Xeon E5-1650 CPU @ 3.50GHz and a batch size of 32.
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py
index b302a87e0e8a61d2456db1eba847f31bd70f552e..9982fdb07eefa665379e7be095f4f8017d92cf97 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50.py
@@ -97,7 +97,7 @@ class _ConvBlock(tfe.Network):
 
   Args:
       kernel_size: the kernel size of middle conv layer at main path
-      filters: list of integers, the filterss of 3 conv layer at main path
+      filters: list of integers, the filters of 3 conv layer at main path
       stage: integer, current stage label, used for generating layer names
       block: 'a','b'..., current block label, used for generating layer names
       data_format: data_format for the input ('channels_first' or
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
index 14c82c87a72457d414c4a1d3c53d4d1a68a400e6..23317886e712323f4b520000e0fd372734fc53a1 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
@@ -73,7 +73,7 @@ class ResNet50GraphTest(tf.test.TestCase):
       tf.train.get_or_create_global_step()
       logdir = tempfile.mkdtemp()
       with tf.contrib.summary.always_record_summaries():
-        with tf.contrib.summary.create_summary_file_writer(
+        with tf.contrib.summary.create_file_writer(
             logdir, max_queue=0,
             name='t0').as_default():
           model = resnet50.ResNet50(data_format())
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
index 582f4837c6f3197081cb558063e963866d173f29..0ff8746884c288f824f5f22ab4c550370d0e0302 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
@@ -22,6 +22,7 @@ import gc
 import tempfile
 import time
 
+from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf
 
 import tensorflow.contrib.eager as tfe
@@ -52,26 +53,33 @@ def random_batch(batch_size):
 
 def train_one_step(model, images, labels, optimizer):
 
-  def model_loss():
+  with tfe.GradientTape() as tape:
     logits = model(images, training=True)
     loss = tf.losses.softmax_cross_entropy(
         logits=logits, onehot_labels=labels)
     tf.contrib.summary.scalar(name='loss', tensor=loss)
-    return loss
-
-  optimizer.minimize(model_loss)
+  grads = tape.gradient(loss, model.variables)
+  optimizer.apply_gradients(zip(grads, model.variables))
 
 
 class ResNet50Test(tf.test.TestCase):
 
-  def test_apply(self):
+  def _apply(self, defun=False):
     device, data_format = device_and_data_format()
     model = resnet50.ResNet50(data_format)
+    if defun:
+      model.call = tfe.defun(model.call)
     with tf.device(device):
       images, _ = random_batch(2)
       output = model(images)
     self.assertEqual((2, 1000), output.shape)
 
+  def test_apply(self):
+    self._apply(defun=False)
+
+  def test_apply_with_defun(self):
+    self._apply(defun=True)
+
   def test_apply_no_top(self):
     device, data_format = device_and_data_format()
     model = resnet50.ResNet50(data_format, include_top=False)
@@ -95,7 +103,7 @@ class ResNet50Test(tf.test.TestCase):
     model = resnet50.ResNet50(data_format)
     tf.train.get_or_create_global_step()
     logdir = tempfile.mkdtemp()
-    with tf.contrib.summary.create_summary_file_writer(
+    with tf.contrib.summary.create_file_writer(
         logdir, max_queue=0,
         name='t0').as_default(), tf.contrib.summary.always_record_summaries():
       with tf.device(device):
@@ -175,9 +183,11 @@ class ResNet50Benchmarks(tf.test.Benchmark):
     # a sync. This is a roundabout way, yes.
     tf.constant(1.).cpu()
 
-  def benchmark_eager_apply(self):
+  def _benchmark_eager_apply(self, label, defun=False):
     device, data_format = device_and_data_format()
     model = resnet50.ResNet50(data_format)
+    if defun:
+      model.call = tfe.defun(model.call)
     batch_size = 64
     num_burn = 5
     num_iters = 30
@@ -189,16 +199,23 @@ class ResNet50Benchmarks(tf.test.Benchmark):
       start = time.time()
       for _ in xrange(num_iters):
         model(images).cpu()
-      self._report('eager_apply', start, num_iters, device, batch_size,
-                   data_format)
+      self._report(label, start, num_iters, device, batch_size, data_format)
+
+  def benchmark_eager_apply(self):
+    self._benchmark_eager_apply('eager_apply', defun=False)
+
+  def benchmark_eager_apply_with_defun(self):
+    self._benchmark_eager_apply('eager_apply_with_defun', defun=True)
 
-  def _benchmark_eager_train(self, label, make_iterator):
+  def _benchmark_eager_train(self, label, make_iterator, defun=False):
     device, data_format = device_and_data_format()
     for batch_size in self._train_batch_sizes():
       (images, labels) = random_batch(batch_size)
       num_burn = 3
       num_iters = 10
       model = resnet50.ResNet50(data_format)
+      if defun:
+        model.call = tfe.defun(model.call)
       optimizer = tf.train.GradientDescentOptimizer(0.1)
 
       with tf.device(device):
@@ -217,7 +234,11 @@ class ResNet50Benchmarks(tf.test.Benchmark):
         self._report(label, start, num_iters, device, batch_size, data_format)
 
   def benchmark_eager_train(self):
-    self._benchmark_eager_train('eager_train', MockIterator)
+    self._benchmark_eager_train('eager_train', MockIterator, defun=False)
+
+  def benchmark_eager_train_with_defun(self):
+    self._benchmark_eager_train(
+        'eager_train_with_defun', MockIterator, defun=True)
 
   def benchmark_eager_train_datasets(self):
 
@@ -226,7 +247,18 @@ class ResNet50Benchmarks(tf.test.Benchmark):
         ds = tf.data.Dataset.from_tensors(tensors).repeat()
       return tfe.Iterator(ds)
 
-    self._benchmark_eager_train('eager_train_dataset', make_iterator)
+    self._benchmark_eager_train(
+        'eager_train_dataset', make_iterator, defun=False)
+
+  def benchmark_eager_train_datasets_with_defun(self):
+
+    def make_iterator(tensors):
+      with tf.device('/device:CPU:0'):
+        ds = tf.data.Dataset.from_tensors(tensors).repeat()
+      return tfe.Iterator(ds)
+
+    self._benchmark_eager_train(
+        'eager_train_dataset_with_defun', make_iterator, defun=True)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
index 609cbd28772c3ae8da70648ca5b1b264a8a255e2..aa87b94e7b0876e65405f6bcb2d6aabde36582bf 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
@@ -65,7 +65,6 @@ import six
 import tensorflow as tf
 
 from tensorflow.contrib.eager.python import tfe
-from tensorflow.python.eager import context
 
 try:
   import matplotlib.pyplot as plt  # pylint: disable=g-import-not-at-top
@@ -247,9 +246,9 @@ def main(_):
 
   log_dir = os.path.join(FLAGS.dir, "summaries")
   tf.gfile.MakeDirs(log_dir)
-  train_summary_writer = tf.contrib.summary.create_summary_file_writer(
+  train_summary_writer = tf.contrib.summary.create_file_writer(
       os.path.join(log_dir, "train"), flush_millis=10000)
-  test_summary_writer = tf.contrib.summary.create_summary_file_writer(
+  test_summary_writer = tf.contrib.summary.create_file_writer(
       os.path.join(log_dir, "eval"), flush_millis=10000, name="eval")
 
   with tf.device(device):
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/README.md b/tensorflow/contrib/eager/python/examples/rnn_ptb/README.md
index 743ebb68ee5bba5635899267cc4839828f7e4e2f..966177e91c212c1aa132fe3af6f7dc9a50fb984e 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/README.md
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/README.md
@@ -40,7 +40,7 @@ bazel run -c opt --config=cuda :rnn_ptb_graph_test -- --benchmarks=.
 
 (Or remove the `--config=cuda` flag for running on CPU instead of GPU).
 
-On October 31, 2017, the benchmarks demostrated slightly better performance
+On October 31, 2017, the benchmarks demonstrated slightly better performance
 (3-6%) for graph execution over eager execution for this particular model when
 using a single NVIDIA Titan X (Pascal) GPU on a host with an Intel Xeon E5-1650
 CPU @ 3.50GHz and a batch size of 32.
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
index 30bb3c8ad33d38453bd96a76c7770071e24bb034..5c5c59c87744f4ffa6db90e5d8d3aa3bc8132756 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
@@ -22,6 +22,11 @@ Usage: python ./rnn_ptb.py --data-path=<path_to_dataset>
 Penn Treebank (PTB) dataset from:
 http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
 """
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
 import argparse
 import os
 import sys
@@ -83,7 +88,7 @@ class Embedding(tf.layers.Layer):
 
 
 class PTBModel(tfe.Network):
-  """LSTM for word language modelling.
+  """LSTM for word language modeling.
 
   Model described in:
   (Zaremba, et. al.) Recurrent Neural Network Regularization
@@ -209,7 +214,7 @@ class Datasets(object):
     """Load the Penn Treebank dataset.
 
     Args:
-      path: Path to the data/ directory of the dataset from from Tomas Mikolov's
+      path: Path to the data/ directory of the dataset from Tomas Mikolov's
         webpage - http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
     """
 
@@ -334,8 +339,7 @@ if __name__ == "__main__":
       "http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz")
   parser.add_argument(
       "--logdir", type=str, default="", help="Directory for checkpoint.")
-  parser.add_argument(
-      "--epoch", type=int, default=20, help="Number of epoches.")
+  parser.add_argument("--epoch", type=int, default=20, help="Number of epochs.")
   parser.add_argument("--batch-size", type=int, default=20, help="Batch size.")
   parser.add_argument(
       "--seq-len", type=int, default=35, help="Sequence length.")
diff --git a/tensorflow/contrib/eager/python/examples/spinn/BUILD b/tensorflow/contrib/eager/python/examples/spinn/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a1f8a759e2a556bc219f0aa13942f293c4f34cfa
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/spinn/BUILD
@@ -0,0 +1,42 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_library(
+    name = "data",
+    srcs = ["data.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = ["//third_party/py/numpy"],
+)
+
+py_test(
+    name = "data_test",
+    size = "small",
+    srcs = ["data_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":data",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+cuda_py_test(
+    name = "spinn_test",
+    size = "medium",
+    srcs = ["spinn_test.py"],
+    additional_deps = [
+        ":data",
+        "//third_party/examples/eager/spinn",
+        "//third_party/py/numpy",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/contrib/summary:summary_test_util",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
+    tags = ["no_pip"],  # because spinn.py is under third_party/.
+)
diff --git a/tensorflow/contrib/eager/python/examples/spinn/README.md b/tensorflow/contrib/eager/python/examples/spinn/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..eb0637df473e22e5d39ca1b0816464cb2b7c6435
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/spinn/README.md
@@ -0,0 +1,13 @@
+# SPINN: Dynamic neural network with TensorFlow eager execution
+
+This directory contains files supporting the
+[spinn.py model in third_party/examples/eager/spinn/](../../../../../../third_party/examples/eager/spinn/spinn.py),
+including
+
+- `data.py`: Utility library for loading and preprocessing the SNLI and GloVe
+  data.
+- `data_test.py` and `spinn_test.py`: Unit tests for the data and model modules.
+
+See the [README.md in third_party/examples/eager/spinn/](../../../../../../third_party/examples/eager/spinn/README.md)
+for detailed background, license and usage information regarding the SPINN code.
+
diff --git a/tensorflow/contrib/eager/python/examples/spinn/data.py b/tensorflow/contrib/eager/python/examples/spinn/data.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bc3bb49bcbbc26f7a3134a8bfc385ec080dde1e
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/spinn/data.py
@@ -0,0 +1,373 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities of SNLI data and GloVe word vectors for SPINN model.
+
+See more details about the SNLI data set at:
+  https://nlp.stanford.edu/projects/snli/
+
+See more details about the GloVe pretrained word embeddings at:
+  https://nlp.stanford.edu/projects/glove/
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import glob
+import math
+import os
+import random
+
+import numpy as np
+
+POSSIBLE_LABELS = ("entailment", "contradiction", "neutral")
+
+UNK_CODE = 0   # Code for unknown word tokens.
+PAD_CODE = 1   # Code for padding tokens.
+
+SHIFT_CODE = 3
+REDUCE_CODE = 2
+
+WORD_VECTOR_LEN = 300  # Embedding dimensions.
+
+LEFT_PAREN = "("
+RIGHT_PAREN = ")"
+PARENTHESES = (LEFT_PAREN, RIGHT_PAREN)
+
+
+def get_non_parenthesis_words(items):
+  """Get the non-parenthesis items from a SNLI parsed sentence.
+
+  Args:
+    items: Data items from a parsed SNLI sentence, with parentheses. E.g.,
+      ["(", "Man", "(", "(", "(", "(", "(", "wearing", "pass", ")", ...
+
+  Returns:
+    A list of non-parentheses word items, all converted to lower case. E.g.,
+      ["man", "wearing", "pass", ...
+  """
+  return [x.lower() for x in items if x not in PARENTHESES and x]
+
+
+def get_shift_reduce(items):
+  """Obtain shift-reduce vector from a list of items from the SNLI data.
+
+  Args:
+    items: Data items as a list of str, e.g.,
+       ["(", "Man", "(", "(", "(", "(", "(", "wearing", "pass", ")", ...
+
+  Returns:
+    A list of shift-reduce transitions, encoded as `SHIFT_CODE` for shift and
+      `REDUCE_CODE` for reduce. See code above for the values of `SHIFT_CODE`
+      and `REDUCE_CODE`.
+  """
+  trans = []
+  for item in items:
+    if item == LEFT_PAREN:
+      continue
+    elif item == RIGHT_PAREN:
+      trans.append(REDUCE_CODE)
+    else:
+      trans.append(SHIFT_CODE)
+  return trans
+
+
+def pad_and_reverse_word_ids(sentences):
+  """Pad a list of sentences to the common maximum length + 1.
+
+  Args:
+    sentences: A list of sentences as a list of list of integers. Each integer
+      is a word ID. Each list of integer corresponds to one sentence.
+
+  Returns:
+    A numpy.ndarray of shape (num_sentences, max_length + 1), wherein max_length
+      is the maximum sentence length (in # of words). Each sentence is reversed
+      and then padded with an extra one at head, as required by the model.
+  """
+  max_len = max(len(sent) for sent in sentences)
+  for sent in sentences:
+    if len(sent) < max_len:
+      sent.extend([PAD_CODE] * (max_len - len(sent)))
+  # Reverse in time order and pad an extra one.
+  sentences = np.fliplr(np.array(sentences, dtype=np.int64))
+  sentences = np.concatenate(
+      [np.ones([sentences.shape[0], 1], dtype=np.int64), sentences], axis=1)
+  return sentences
+
+
+def pad_transitions(sentences_transitions):
+  """Pad a list of shift-reduce transitions to the maximum length."""
+  max_len = max(len(transitions) for transitions in sentences_transitions)
+  for transitions in sentences_transitions:
+    if len(transitions) < max_len:
+      transitions.extend([PAD_CODE] * (max_len - len(transitions)))
+  return np.array(sentences_transitions, dtype=np.int64)
+
+
+def load_vocabulary(data_root):
+  """Load vocabulary from SNLI data files.
+
+  Args:
+    data_root: Root directory of the data. It is assumed that the SNLI data
+      files have been downloaded and extracted to the "snli/snli_1.0"
+      subdirectory of it.
+
+  Returns:
+    Vocabulary as a set of strings.
+
+  Raises:
+    ValueError: If SNLI data files cannot be found.
+  """
+  snli_path = os.path.join(data_root, "snli")
+  snli_glob_pattern = os.path.join(snli_path, "snli_1.0/snli_1.0_*.txt")
+  file_names = glob.glob(snli_glob_pattern)
+  if not file_names:
+    raise ValueError(
+        "Cannot find SNLI data files at %s. "
+        "Please download and extract SNLI data first." % snli_glob_pattern)
+
+  print("Loading vocabulary...")
+  vocab = set()
+  for file_name in file_names:
+    with open(os.path.join(snli_path, file_name), "rt") as f:
+      for i, line in enumerate(f):
+        if i == 0:
+          continue
+        items = line.split("\t")
+        premise_words = get_non_parenthesis_words(items[1].split(" "))
+        hypothesis_words = get_non_parenthesis_words(items[2].split(" "))
+        vocab.update(premise_words)
+        vocab.update(hypothesis_words)
+  return vocab
+
+
+def load_word_vectors(data_root, vocab):
+  """Load GloVe word vectors for words present in the vocabulary.
+
+  Args:
+    data_root: Data root directory. It is assumed that the GloVe file
+     has been downloaded and extracted at the "glove/" subdirectory of it.
+    vocab: A `set` of words, representing the vocabulary.
+
+  Returns:
+    1. word2index: A dict from lower-case word to row index in the embedding
+       matrix, i.e, `embed` below.
+    2. embed: The embedding matrix as a float32 numpy array. Its shape is
+       [vocabulary_size, WORD_VECTOR_LEN]. vocabulary_size is len(vocab).
+       WORD_VECTOR_LEN is the embedding dimension (300).
+
+  Raises:
+    ValueError: If GloVe embedding file cannot be found.
+  """
+  glove_path = os.path.join(data_root, "glove/glove.42B.300d.txt")
+  if not os.path.isfile(glove_path):
+    raise ValueError(
+        "Cannot find GloVe embedding file at %s. "
+        "Please download and extract GloVe embeddings first." % glove_path)
+
+  print("Loading word vectors...")
+
+  word2index = dict()
+  embed = []
+
+  embed.append([0] * WORD_VECTOR_LEN)  # <unk>
+  embed.append([0] * WORD_VECTOR_LEN)  # <pad>
+  word2index["<unk>"] = UNK_CODE
+  word2index["<pad>"] = PAD_CODE
+
+  with open(glove_path, "rt") as f:
+    for line in f:
+      items = line.split(" ")
+      word = items[0]
+      if word in vocab and word not in word2index:
+        word2index[word] = len(embed)
+        vector = np.array([float(item) for item in items[1:]])
+        assert (WORD_VECTOR_LEN,) == vector.shape
+        embed.append(vector)
+  embed = np.array(embed, dtype=np.float32)
+  return word2index, embed
+
+
+def calculate_bins(length2count, min_bin_size):
+  """Calculate bin boundaries given a histogram of lengths and minimum bin size.
+
+  Args:
+    length2count: A `dict` mapping length to sentence count.
+    min_bin_size: Minimum bin size in terms of total number of sentence pairs
+      in the bin.
+
+  Returns:
+    A `list` representing the right bin boundaries, starting from the inclusive
+    right boundary of the first bin. For example, if the output is
+      [10, 20, 35],
+    it means there are three bins: [1, 10], [11, 20] and [21, 35].
+  """
+  bounds = []
+  lengths = sorted(length2count.keys())
+  cum_count = 0
+  for length in lengths:
+    cum_count += length2count[length]
+    if cum_count >= min_bin_size:
+      bounds.append(length)
+      cum_count = 0
+  if bounds[-1] != lengths[-1]:
+    bounds.append(lengths[-1])
+  return bounds
+
+
+def encode_sentence(sentence, word2index):
+  """Encode a single sentence as word indices and shift-reduce code.
+
+  Args:
+    sentence: The sentence with added binary parse information, represented as
+      a string, with all the word items and parentheses separated by spaces.
+      E.g., '( ( The dog ) ( ( is ( playing toys ) ) . ) )'.
+    word2index: A `dict` mapping words to their word indices.
+
+  Returns:
+     1. Word indices as a numpy array, with shape `(sequence_len, 1)`.
+     2. Shift-reduce sequence as a numpy array, with shape
+       `(sequence_len * 2 - 3, 1)`.
+  """
+  items = [w for w in sentence.split(" ") if w]
+  words = get_non_parenthesis_words(items)
+  shift_reduce = get_shift_reduce(items)
+  word_indices = pad_and_reverse_word_ids(
+      [[word2index.get(word, UNK_CODE) for word in words]]).T
+  return (word_indices,
+          np.expand_dims(np.array(shift_reduce, dtype=np.int64), -1))
+
+
+class SnliData(object):
+  """A split of SNLI data."""
+
+  def __init__(self, data_file, word2index, sentence_len_limit=-1):
+    """SnliData constructor.
+
+    Args:
+      data_file: Full path to the data file, e.g.,
+        "/tmp/spinn-data/snli/snli_1.0/snli_1.0.train.txt"
+      word2index: A dict from lower-case word to row index in the embedding
+        matrix (see `load_word_vectors()` for details).
+      sentence_len_limit: Maximum allowed sentence length (# of words).
+        A value of <= 0 means unlimited. Sentences longer than this limit
+        are currently discarded, not truncated.
+    """
+
+    self._labels = []
+    self._premises = []
+    self._premise_transitions = []
+    self._hypotheses = []
+    self._hypothesis_transitions = []
+
+    with open(data_file, "rt") as f:
+      for i, line in enumerate(f):
+        if i == 0:
+          # Skip header line.
+          continue
+        items = line.split("\t")
+        if items[0] not in POSSIBLE_LABELS:
+          continue
+
+        premise_items = items[1].split(" ")
+        hypothesis_items = items[2].split(" ")
+        premise_words = get_non_parenthesis_words(premise_items)
+        hypothesis_words = get_non_parenthesis_words(hypothesis_items)
+
+        if (sentence_len_limit > 0 and
+            (len(premise_words) > sentence_len_limit or
+             len(hypothesis_words) > sentence_len_limit)):
+          # TODO(cais): Maybe truncate; do not discard.
+          continue
+
+        premise_ids = [
+            word2index.get(word, UNK_CODE) for word in premise_words]
+        hypothesis_ids = [
+            word2index.get(word, UNK_CODE) for word in hypothesis_words]
+
+        self._premises.append(premise_ids)
+        self._hypotheses.append(hypothesis_ids)
+        self._premise_transitions.append(get_shift_reduce(premise_items))
+        self._hypothesis_transitions.append(get_shift_reduce(hypothesis_items))
+        assert (len(self._premise_transitions[-1]) ==
+                2 * len(premise_words) - 1)
+        assert (len(self._hypothesis_transitions[-1]) ==
+                2 * len(hypothesis_words) - 1)
+
+        self._labels.append(POSSIBLE_LABELS.index(items[0]) + 1)
+
+    assert len(self._labels) == len(self._premises)
+    assert len(self._labels) == len(self._hypotheses)
+    assert len(self._labels) == len(self._premise_transitions)
+    assert len(self._labels) == len(self._hypothesis_transitions)
+
+  def num_batches(self, batch_size):
+    """Calculate number of batches given batch size."""
+    return int(math.ceil(len(self._labels) / batch_size))
+
+  def get_generator(self, batch_size):
+    """Obtain a generator for batched data.
+
+    All examples of this SnliData object are randomly shuffled, sorted
+    according to the maximum sentence length of the premise and hypothesis
+    sentences in the pair, and batched.
+
+    Args:
+      batch_size: Desired batch size.
+
+    Returns:
+      A generator for data batches. The generator yields a 5-tuple:
+        label: An array of the shape (batch_size,).
+        premise: An array of the shape (max_premise_len, batch_size), wherein
+          max_premise_len is the maximum length of the (padded) premise
+          sentence in the batch.
+        premise_transitions: An array of the shape (2 * max_premise_len -3,
+          batch_size).
+        hypothesis: Same as `premise`, but for hypothesis sentences.
+        hypothesis_transitions: Same as `premise_transitions`, but for
+          hypothesis sentences.
+      All the elements of the 5-tuple have dtype `int64`.
+    """
+    # Randomly shuffle examples.
+    zipped = list(zip(
+        self._labels, self._premises, self._premise_transitions,
+        self._hypotheses, self._hypothesis_transitions))
+    random.shuffle(zipped)
+    # Then sort the examples by maximum of the premise and hypothesis sentence
+    # lengths in the pair. During training, the batches are expected to be
+    # shuffled. So it is okay to leave them sorted by max length here.
+    (labels, premises, premise_transitions, hypotheses,
+     hypothesis_transitions) = zip(
+         *sorted(zipped, key=lambda x: max(len(x[1]), len(x[3]))))
+
+    def _generator():
+      begin = 0
+      while begin < len(labels):
+        # The sorting above and the batching here makes sure that sentences of
+        # similar max lengths are batched together, minimizing the inefficiency
+        # due to uneven max lengths. The sentences are batched differently in
+        # each call to get_generator() due to the shuffling before sorting
+        # above. The pad_and_reverse_word_ids() and pad_transitions() functions
+        # take care of any remaining unevenness of the max sentence lengths.
+        end = min(begin + batch_size, len(labels))
+        # Transpose, because the SPINN model requires time-major, instead of
+        # batch-major.
+        yield (labels[begin:end],
+               pad_and_reverse_word_ids(premises[begin:end]).T,
+               pad_transitions(premise_transitions[begin:end]).T,
+               pad_and_reverse_word_ids(hypotheses[begin:end]).T,
+               pad_transitions(hypothesis_transitions[begin:end]).T)
+        begin = end
+    return _generator
diff --git a/tensorflow/contrib/eager/python/examples/spinn/data_test.py b/tensorflow/contrib/eager/python/examples/spinn/data_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..54fef2c3fe4111cd2d93ac109a5b8fffad0c2fad
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/spinn/data_test.py
@@ -0,0 +1,270 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit tests for SPINN data module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import tempfile
+
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.contrib.eager.python.examples.spinn import data
+
+
+class DataTest(tf.test.TestCase):
+
+  def setUp(self):
+    super(DataTest, self).setUp()
+    self._temp_data_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    shutil.rmtree(self._temp_data_dir)
+    super(DataTest, self).tearDown()
+
+  def testGenNonParenthesisWords(self):
+    seq_with_parse = (
+        "( Man ( ( ( ( ( wearing pass ) ( on ( a lanyard ) ) ) and "
+        ") ( standing ( in ( ( a crowd ) ( of people ) ) ) ) ) . ) )")
+    self.assertEqual(
+        ["man", "wearing", "pass", "on", "a", "lanyard", "and", "standing",
+         "in", "a", "crowd", "of", "people", "."],
+        data.get_non_parenthesis_words(seq_with_parse.split(" ")))
+
+  def testGetShiftReduce(self):
+    seq_with_parse = (
+        "( Man ( ( ( ( ( wearing pass ) ( on ( a lanyard ) ) ) and "
+        ") ( standing ( in ( ( a crowd ) ( of people ) ) ) ) ) . ) )")
+    self.assertEqual(
+        [3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 3, 2, 3, 3, 3, 3, 2, 3, 3, 2, 2, 2, 2, 2,
+         3, 2, 2], data.get_shift_reduce(seq_with_parse.split(" ")))
+
+  def testPadAndReverseWordIds(self):
+    id_sequences = [[0, 2, 3, 4, 5],
+                    [6, 7, 8],
+                    [9, 10, 11, 12, 13, 14, 15, 16]]
+    self.assertAllClose(
+        [[1, 1, 1, 1, 5, 4, 3, 2, 0],
+         [1, 1, 1, 1, 1, 1, 8, 7, 6],
+         [1, 16, 15, 14, 13, 12, 11, 10, 9]],
+        data.pad_and_reverse_word_ids(id_sequences))
+
+  def testPadTransitions(self):
+    unpadded = [[3, 3, 3, 2, 2, 2, 2],
+                [3, 3, 2, 2, 2]]
+    self.assertAllClose(
+        [[3, 3, 3, 2, 2, 2, 2],
+         [3, 3, 2, 2, 2, 1, 1]],
+        data.pad_transitions(unpadded))
+
+  def testCalculateBins(self):
+    length2count = {
+        1: 10,
+        2: 15,
+        3: 25,
+        4: 40,
+        5: 35,
+        6: 10}
+    self.assertEqual([2, 3, 4, 5, 6],
+                     data.calculate_bins(length2count, 20))
+    self.assertEqual([3, 4, 6], data.calculate_bins(length2count, 40))
+    self.assertEqual([4, 6], data.calculate_bins(length2count, 60))
+
+  def testLoadVoacbulary(self):
+    snli_1_0_dir = os.path.join(self._temp_data_dir, "snli/snli_1.0")
+    fake_train_file = os.path.join(snli_1_0_dir, "snli_1.0_train.txt")
+    fake_dev_file = os.path.join(snli_1_0_dir, "snli_1.0_dev.txt")
+    os.makedirs(snli_1_0_dir)
+
+    with open(fake_train_file, "wt") as f:
+      f.write("gold_label\tsentence1_binary_parse\tsentence2_binary_parse\t"
+              "sentence1_parse\tsentence2_parse\tsentence1\tsentence2\t"
+              "captionID\tpairID\tlabel1\tlabel2\tlabel3\tlabel4\tlabel5\n")
+      f.write("neutral\t( ( Foo bar ) . )\t( ( foo baz ) . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+    with open(fake_dev_file, "wt") as f:
+      f.write("gold_label\tsentence1_binary_parse\tsentence2_binary_parse\t"
+              "sentence1_parse\tsentence2_parse\tsentence1\tsentence2\t"
+              "captionID\tpairID\tlabel1\tlabel2\tlabel3\tlabel4\tlabel5\n")
+      f.write("neutral\t( ( Quux quuz ) ? )\t( ( Corge grault ) ! )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Quux quuz?\t.Corge grault!\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+
+    vocab = data.load_vocabulary(self._temp_data_dir)
+    self.assertSetEqual(
+        {".", "?", "!", "foo", "bar", "baz", "quux", "quuz", "corge", "grault"},
+        vocab)
+
+  def testLoadVoacbularyWithoutFileRaisesError(self):
+    with self.assertRaisesRegexp(ValueError, "Cannot find SNLI data files at"):
+      data.load_vocabulary(self._temp_data_dir)
+
+    os.makedirs(os.path.join(self._temp_data_dir, "snli"))
+    with self.assertRaisesRegexp(ValueError, "Cannot find SNLI data files at"):
+      data.load_vocabulary(self._temp_data_dir)
+
+    os.makedirs(os.path.join(self._temp_data_dir, "snli/snli_1.0"))
+    with self.assertRaisesRegexp(ValueError, "Cannot find SNLI data files at"):
+      data.load_vocabulary(self._temp_data_dir)
+
+  def testLoadWordVectors(self):
+    glove_dir = os.path.join(self._temp_data_dir, "glove")
+    os.makedirs(glove_dir)
+    glove_file = os.path.join(glove_dir, "glove.42B.300d.txt")
+
+    words = [".", ",", "foo", "bar", "baz"]
+    with open(glove_file, "wt") as f:
+      for i, word in enumerate(words):
+        f.write("%s " % word)
+        for j in range(data.WORD_VECTOR_LEN):
+          f.write("%.5f" % (i * 0.1))
+          if j < data.WORD_VECTOR_LEN - 1:
+            f.write(" ")
+          else:
+            f.write("\n")
+
+    vocab = {"foo", "bar", "baz", "qux", "."}
+    # Notice that "qux" is not present in `words`.
+    word2index, embed = data.load_word_vectors(self._temp_data_dir, vocab)
+
+    self.assertEqual(6, len(word2index))
+    self.assertEqual(0, word2index["<unk>"])
+    self.assertEqual(1, word2index["<pad>"])
+    self.assertEqual(2, word2index["."])
+    self.assertEqual(3, word2index["foo"])
+    self.assertEqual(4, word2index["bar"])
+    self.assertEqual(5, word2index["baz"])
+    self.assertEqual((6, data.WORD_VECTOR_LEN), embed.shape)
+    self.assertAllClose([0.0] * data.WORD_VECTOR_LEN, embed[0, :])
+    self.assertAllClose([0.0] * data.WORD_VECTOR_LEN, embed[1, :])
+    self.assertAllClose([0.0] * data.WORD_VECTOR_LEN, embed[2, :])
+    self.assertAllClose([0.2] * data.WORD_VECTOR_LEN, embed[3, :])
+    self.assertAllClose([0.3] * data.WORD_VECTOR_LEN, embed[4, :])
+    self.assertAllClose([0.4] * data.WORD_VECTOR_LEN, embed[5, :])
+
+  def testLoadWordVectorsWithoutFileRaisesError(self):
+    vocab = {"foo", "bar", "baz", "qux", "."}
+    with self.assertRaisesRegexp(
+        ValueError, "Cannot find GloVe embedding file at"):
+      data.load_word_vectors(self._temp_data_dir, vocab)
+
+    os.makedirs(os.path.join(self._temp_data_dir, "glove"))
+    with self.assertRaisesRegexp(
+        ValueError, "Cannot find GloVe embedding file at"):
+      data.load_word_vectors(self._temp_data_dir, vocab)
+
+  def _createFakeSnliData(self, fake_snli_file):
+    # Four sentences in total.
+    with open(fake_snli_file, "wt") as f:
+      f.write("gold_label\tsentence1_binary_parse\tsentence2_binary_parse\t"
+              "sentence1_parse\tsentence2_parse\tsentence1\tsentence2\t"
+              "captionID\tpairID\tlabel1\tlabel2\tlabel3\tlabel4\tlabel5\n")
+      f.write("neutral\t( ( Foo bar ) . )\t( ( foo . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+      f.write("contradiction\t( ( Bar foo ) . )\t( ( baz . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+      f.write("entailment\t( ( Quux quuz ) . )\t( ( grault . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+      f.write("entailment\t( ( Quuz quux ) . )\t( ( garply . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+
+  def _createFakeGloveData(self, glove_file):
+    words = [".", "foo", "bar", "baz", "quux", "quuz", "grault", "garply"]
+    with open(glove_file, "wt") as f:
+      for i, word in enumerate(words):
+        f.write("%s " % word)
+        for j in range(data.WORD_VECTOR_LEN):
+          f.write("%.5f" % (i * 0.1))
+          if j < data.WORD_VECTOR_LEN - 1:
+            f.write(" ")
+          else:
+            f.write("\n")
+
+  def testEncodeSingleSentence(self):
+    snli_1_0_dir = os.path.join(self._temp_data_dir, "snli/snli_1.0")
+    fake_train_file = os.path.join(snli_1_0_dir, "snli_1.0_train.txt")
+    os.makedirs(snli_1_0_dir)
+    self._createFakeSnliData(fake_train_file)
+    vocab = data.load_vocabulary(self._temp_data_dir)
+    glove_dir = os.path.join(self._temp_data_dir, "glove")
+    os.makedirs(glove_dir)
+    glove_file = os.path.join(glove_dir, "glove.42B.300d.txt")
+    self._createFakeGloveData(glove_file)
+    word2index, _ = data.load_word_vectors(self._temp_data_dir, vocab)
+
+    sentence_variants = [
+        "( Foo ( ( bar baz ) . ) )",
+        " ( Foo ( ( bar baz ) . ) ) ",
+        "( Foo ( ( bar baz ) . )  )"]
+    for sentence in sentence_variants:
+      word_indices, shift_reduce = data.encode_sentence(sentence, word2index)
+      self.assertEqual(np.int64, word_indices.dtype)
+      self.assertEqual((5, 1), word_indices.shape)
+      self.assertAllClose(
+          np.array([[3, 3, 3, 2, 3, 2, 2]], dtype=np.int64).T, shift_reduce)
+
+  def testSnliData(self):
+    snli_1_0_dir = os.path.join(self._temp_data_dir, "snli/snli_1.0")
+    fake_train_file = os.path.join(snli_1_0_dir, "snli_1.0_train.txt")
+    os.makedirs(snli_1_0_dir)
+    self._createFakeSnliData(fake_train_file)
+
+    glove_dir = os.path.join(self._temp_data_dir, "glove")
+    os.makedirs(glove_dir)
+    glove_file = os.path.join(glove_dir, "glove.42B.300d.txt")
+    self._createFakeGloveData(glove_file)
+
+    vocab = data.load_vocabulary(self._temp_data_dir)
+    word2index, _ = data.load_word_vectors(self._temp_data_dir, vocab)
+
+    train_data = data.SnliData(fake_train_file, word2index)
+    self.assertEqual(4, train_data.num_batches(1))
+    self.assertEqual(2, train_data.num_batches(2))
+    self.assertEqual(2, train_data.num_batches(3))
+    self.assertEqual(1, train_data.num_batches(4))
+
+    generator = train_data.get_generator(2)()
+    for _ in range(2):
+      label, prem, prem_trans, hypo, hypo_trans = next(generator)
+      self.assertEqual(2, len(label))
+      self.assertEqual((4, 2), prem.shape)
+      self.assertEqual((5, 2), prem_trans.shape)
+      self.assertEqual((3, 2), hypo.shape)
+      self.assertEqual((3, 2), hypo_trans.shape)
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..eefc06d90d83b61d07a613643c913d3833a5f2c1
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
@@ -0,0 +1,475 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import gc
+import glob
+import os
+import shutil
+import tempfile
+import time
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+
+# pylint: disable=g-bad-import-order
+import tensorflow.contrib.eager as tfe
+from tensorflow.contrib.eager.python.examples.spinn import data
+from third_party.examples.eager.spinn import spinn
+from tensorflow.contrib.summary import summary_test_util
+from tensorflow.python.eager import test
+from tensorflow.python.framework import test_util
+from tensorflow.python.training import checkpoint_utils
+# pylint: enable=g-bad-import-order
+
+
+def _generate_synthetic_snli_data_batch(sequence_length,
+                                        batch_size,
+                                        vocab_size):
+  """Generate a fake batch of SNLI data for testing."""
+  with tf.device("cpu:0"):
+    labels = tf.random_uniform([batch_size], minval=1, maxval=4, dtype=tf.int64)
+    prem = tf.random_uniform(
+        (sequence_length, batch_size), maxval=vocab_size, dtype=tf.int64)
+    prem_trans = tf.constant(np.array(
+        [[3, 3, 2, 3, 3, 3, 2, 2, 2, 3, 3, 3,
+          2, 3, 3, 2, 2, 3, 3, 3, 2, 2, 2, 2,
+          3, 2, 2]] * batch_size, dtype=np.int64).T)
+    hypo = tf.random_uniform(
+        (sequence_length, batch_size), maxval=vocab_size, dtype=tf.int64)
+    hypo_trans = tf.constant(np.array(
+        [[3, 3, 2, 3, 3, 3, 2, 2, 2, 3, 3, 3,
+          2, 3, 3, 2, 2, 3, 3, 3, 2, 2, 2, 2,
+          3, 2, 2]] * batch_size, dtype=np.int64).T)
+  if tfe.num_gpus():
+    labels = labels.gpu()
+    prem = prem.gpu()
+    prem_trans = prem_trans.gpu()
+    hypo = hypo.gpu()
+    hypo_trans = hypo_trans.gpu()
+  return labels, prem, prem_trans, hypo, hypo_trans
+
+
+def _test_spinn_config(d_embed, d_out, logdir=None, inference_sentences=None):
+  """Generate a config tuple for testing.
+
+  Args:
+    d_embed: Embedding dimensions.
+    d_out: Model output dimensions.
+    logdir: Optional logdir.
+    inference_sentences: A 2-tuple of strings representing the sentences (with
+      binary parsing result), e.g.,
+      ("( ( The dog ) ( ( is running ) . ) )", "( ( The dog ) ( moves . ) )").
+
+  Returns:
+    A config tuple.
+  """
+  config_tuple = collections.namedtuple(
+      "Config", ["d_hidden", "d_proj", "d_tracker", "predict",
+                 "embed_dropout", "mlp_dropout", "n_mlp_layers", "d_mlp",
+                 "d_out", "projection", "lr", "batch_size", "epochs",
+                 "force_cpu", "logdir", "log_every", "dev_every", "save_every",
+                 "lr_decay_every", "lr_decay_by", "inference_premise",
+                 "inference_hypothesis"])
+
+  inference_premise = inference_sentences[0] if inference_sentences else None
+  inference_hypothesis = inference_sentences[1] if inference_sentences else None
+  return config_tuple(
+      d_hidden=d_embed,
+      d_proj=d_embed * 2,
+      d_tracker=8,
+      predict=False,
+      embed_dropout=0.1,
+      mlp_dropout=0.1,
+      n_mlp_layers=2,
+      d_mlp=32,
+      d_out=d_out,
+      projection=True,
+      lr=2e-2,
+      batch_size=2,
+      epochs=20,
+      force_cpu=False,
+      logdir=logdir,
+      log_every=1,
+      dev_every=2,
+      save_every=2,
+      lr_decay_every=1,
+      lr_decay_by=0.75,
+      inference_premise=inference_premise,
+      inference_hypothesis=inference_hypothesis)
+
+
+class SpinnTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    super(SpinnTest, self).setUp()
+    self._test_device = "gpu:0" if tfe.num_gpus() else "cpu:0"
+    self._temp_data_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    shutil.rmtree(self._temp_data_dir)
+    super(SpinnTest, self).tearDown()
+
+  def testBundle(self):
+    with tf.device(self._test_device):
+      lstm_iter = [np.array([[0, 1], [2, 3]], dtype=np.float32),
+                   np.array([[0, -1], [-2, -3]], dtype=np.float32),
+                   np.array([[0, 2], [4, 6]], dtype=np.float32),
+                   np.array([[0, -2], [-4, -6]], dtype=np.float32)]
+      out = spinn._bundle(lstm_iter)
+
+      self.assertEqual(2, len(out))
+      self.assertEqual(tf.float32, out[0].dtype)
+      self.assertEqual(tf.float32, out[1].dtype)
+      self.assertAllEqual(np.array([[0, 2, 0, -2, 0, 4, 0, -4]]).T,
+                          out[0].numpy())
+      self.assertAllEqual(np.array([[1, 3, -1, -3, 2, 6, -2, -6]]).T,
+                          out[1].numpy())
+
+  def testUnbunbdle(self):
+    with tf.device(self._test_device):
+      state = [np.array([[0, 1, 2], [3, 4, 5]], dtype=np.float32),
+               np.array([[0, -1, -2], [-3, -4, -5]], dtype=np.float32)]
+      out = spinn._unbundle(state)
+
+      self.assertEqual(2, len(out))
+      self.assertEqual(tf.float32, out[0].dtype)
+      self.assertEqual(tf.float32, out[1].dtype)
+      self.assertAllEqual(np.array([[0, 1, 2, 0, -1, -2]]),
+                          out[0].numpy())
+      self.assertAllEqual(np.array([[3, 4, 5, -3, -4, -5]]),
+                          out[1].numpy())
+
+  def testReducer(self):
+    with tf.device(self._test_device):
+      batch_size = 3
+      size = 10
+      tracker_size = 8
+      reducer = spinn.Reducer(size, tracker_size=tracker_size)
+
+      left_in = []
+      right_in = []
+      tracking = []
+      for _ in range(batch_size):
+        left_in.append(tf.random_normal((1, size * 2)))
+        right_in.append(tf.random_normal((1, size * 2)))
+        tracking.append(tf.random_normal((1, tracker_size * 2)))
+
+      out = reducer(left_in, right_in, tracking=tracking)
+      self.assertEqual(batch_size, len(out))
+      self.assertEqual(tf.float32, out[0].dtype)
+      self.assertEqual((1, size * 2), out[0].shape)
+
+  def testReduceTreeLSTM(self):
+    with tf.device(self._test_device):
+      size = 10
+      tracker_size = 8
+      reducer = spinn.Reducer(size, tracker_size=tracker_size)
+
+      lstm_in = np.array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+                          [0, -1, -2, -3, -4, -5, -6, -7, -8, -9]],
+                         dtype=np.float32)
+      c1 = np.array([[0, 1], [2, 3]], dtype=np.float32)
+      c2 = np.array([[0, -1], [-2, -3]], dtype=np.float32)
+
+      h, c = reducer._tree_lstm(c1, c2, lstm_in)
+      self.assertEqual(tf.float32, h.dtype)
+      self.assertEqual(tf.float32, c.dtype)
+      self.assertEqual((2, 2), h.shape)
+      self.assertEqual((2, 2), c.shape)
+
+  def testTracker(self):
+    with tf.device(self._test_device):
+      batch_size = 2
+      size = 10
+      tracker_size = 8
+      buffer_length = 18
+      stack_size = 3
+
+      tracker = spinn.Tracker(tracker_size, False)
+      tracker.reset_state()
+
+      # Create dummy inputs for testing.
+      bufs = []
+      buf = []
+      for _ in range(buffer_length):
+        buf.append(tf.random_normal((batch_size, size * 2)))
+      bufs.append(buf)
+      self.assertEqual(1, len(bufs))
+      self.assertEqual(buffer_length, len(bufs[0]))
+      self.assertEqual((batch_size, size * 2), bufs[0][0].shape)
+
+      stacks = []
+      stack = []
+      for _ in range(stack_size):
+        stack.append(tf.random_normal((batch_size, size * 2)))
+      stacks.append(stack)
+      self.assertEqual(1, len(stacks))
+      self.assertEqual(3, len(stacks[0]))
+      self.assertEqual((batch_size, size * 2), stacks[0][0].shape)
+
+      for _ in range(2):
+        out1, out2 = tracker(bufs, stacks)
+        self.assertIsNone(out2)
+        self.assertEqual(batch_size, len(out1))
+        self.assertEqual(tf.float32, out1[0].dtype)
+        self.assertEqual((1, tracker_size * 2), out1[0].shape)
+
+        self.assertEqual(tf.float32, tracker.state.c.dtype)
+        self.assertEqual((batch_size, tracker_size), tracker.state.c.shape)
+        self.assertEqual(tf.float32, tracker.state.h.dtype)
+        self.assertEqual((batch_size, tracker_size), tracker.state.h.shape)
+
+  def testSPINN(self):
+    with tf.device(self._test_device):
+      embedding_dims = 10
+      d_tracker = 8
+      sequence_length = 15
+      num_transitions = 27
+
+      config_tuple = collections.namedtuple(
+          "Config", ["d_hidden", "d_proj", "d_tracker", "predict"])
+      config = config_tuple(
+          embedding_dims, embedding_dims * 2, d_tracker, False)
+      s = spinn.SPINN(config)
+
+      # Create some fake data.
+      buffers = tf.random_normal((sequence_length, 1, config.d_proj))
+      transitions = tf.constant(
+          [[3], [3], [2], [3], [3], [3], [2], [2], [2], [3], [3], [3],
+           [2], [3], [3], [2], [2], [3], [3], [3], [2], [2], [2], [2],
+           [3], [2], [2]], dtype=tf.int64)
+      self.assertEqual(tf.int64, transitions.dtype)
+      self.assertEqual((num_transitions, 1), transitions.shape)
+
+      out = s(buffers, transitions, training=True)
+      self.assertEqual(tf.float32, out.dtype)
+      self.assertEqual((1, embedding_dims), out.shape)
+
+  def testSNLIClassifierAndTrainer(self):
+    with tf.device(self._test_device):
+      vocab_size = 40
+      batch_size = 2
+      d_embed = 10
+      sequence_length = 15
+      d_out = 4
+
+      config = _test_spinn_config(d_embed, d_out)
+
+      # Create fake embedding matrix.
+      embed = tf.random_normal((vocab_size, d_embed))
+
+      model = spinn.SNLIClassifier(config, embed)
+      trainer = spinn.SNLIClassifierTrainer(model, config.lr)
+
+      (labels, prem, prem_trans, hypo,
+       hypo_trans) = _generate_synthetic_snli_data_batch(sequence_length,
+                                                         batch_size,
+                                                         vocab_size)
+
+      # Invoke model under non-training mode.
+      logits = model(prem, prem_trans, hypo, hypo_trans, training=False)
+      self.assertEqual(tf.float32, logits.dtype)
+      self.assertEqual((batch_size, d_out), logits.shape)
+
+      # Invoke model under training model.
+      logits = model(prem, prem_trans, hypo, hypo_trans, training=True)
+      self.assertEqual(tf.float32, logits.dtype)
+      self.assertEqual((batch_size, d_out), logits.shape)
+
+      # Calculate loss.
+      loss1 = trainer.loss(labels, logits)
+      self.assertEqual(tf.float32, loss1.dtype)
+      self.assertEqual((), loss1.shape)
+
+      loss2, logits = trainer.train_batch(
+          labels, prem, prem_trans, hypo, hypo_trans)
+      self.assertEqual(tf.float32, loss2.dtype)
+      self.assertEqual((), loss2.shape)
+      self.assertEqual(tf.float32, logits.dtype)
+      self.assertEqual((batch_size, d_out), logits.shape)
+      # Training on the batch should have led to a change in the loss value.
+      self.assertNotEqual(loss1.numpy(), loss2.numpy())
+
+  def _create_test_data(self, snli_1_0_dir):
+    fake_train_file = os.path.join(snli_1_0_dir, "snli_1.0_train.txt")
+    os.makedirs(snli_1_0_dir)
+
+    # Four sentences in total.
+    with open(fake_train_file, "wt") as f:
+      f.write("gold_label\tsentence1_binary_parse\tsentence2_binary_parse\t"
+              "sentence1_parse\tsentence2_parse\tsentence1\tsentence2\t"
+              "captionID\tpairID\tlabel1\tlabel2\tlabel3\tlabel4\tlabel5\n")
+      f.write("neutral\t( ( Foo bar ) . )\t( ( foo . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+      f.write("contradiction\t( ( Bar foo ) . )\t( ( baz . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+      f.write("entailment\t( ( Quux quuz ) . )\t( ( grault . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+      f.write("entailment\t( ( Quuz quux ) . )\t( ( garply . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+
+    glove_dir = os.path.join(self._temp_data_dir, "glove")
+    os.makedirs(glove_dir)
+    glove_file = os.path.join(glove_dir, "glove.42B.300d.txt")
+
+    words = [".", "foo", "bar", "baz", "quux", "quuz", "grault", "garply"]
+    with open(glove_file, "wt") as f:
+      for i, word in enumerate(words):
+        f.write("%s " % word)
+        for j in range(data.WORD_VECTOR_LEN):
+          f.write("%.5f" % (i * 0.1))
+          if j < data.WORD_VECTOR_LEN - 1:
+            f.write(" ")
+          else:
+            f.write("\n")
+
+    return fake_train_file
+
+  def testInferSpinnWorks(self):
+    """Test inference with the spinn model."""
+    snli_1_0_dir = os.path.join(self._temp_data_dir, "snli/snli_1.0")
+    self._create_test_data(snli_1_0_dir)
+
+    vocab = data.load_vocabulary(self._temp_data_dir)
+    word2index, embed = data.load_word_vectors(self._temp_data_dir, vocab)
+
+    config = _test_spinn_config(
+        data.WORD_VECTOR_LEN, 4,
+        logdir=os.path.join(self._temp_data_dir, "logdir"),
+        inference_sentences=("( foo ( bar . ) )", "( bar ( foo . ) )"))
+    logits = spinn.train_or_infer_spinn(
+        embed, word2index, None, None, None, config)
+    self.assertEqual(np.float32, logits.dtype)
+    self.assertEqual((3,), logits.shape)
+
+  def testInferSpinnThrowsErrorIfOnlyOneSentenceIsSpecified(self):
+    snli_1_0_dir = os.path.join(self._temp_data_dir, "snli/snli_1.0")
+    self._create_test_data(snli_1_0_dir)
+
+    vocab = data.load_vocabulary(self._temp_data_dir)
+    word2index, embed = data.load_word_vectors(self._temp_data_dir, vocab)
+
+    config = _test_spinn_config(
+        data.WORD_VECTOR_LEN, 4,
+        logdir=os.path.join(self._temp_data_dir, "logdir"),
+        inference_sentences=("( foo ( bar . ) )", None))
+    with self.assertRaises(ValueError):
+      spinn.train_or_infer_spinn(embed, word2index, None, None, None, config)
+
+  def testTrainSpinn(self):
+    """Test with fake toy SNLI data and GloVe vectors."""
+
+    # 1. Create and load a fake SNLI data file and a fake GloVe embedding file.
+    snli_1_0_dir = os.path.join(self._temp_data_dir, "snli/snli_1.0")
+    fake_train_file = self._create_test_data(snli_1_0_dir)
+
+    vocab = data.load_vocabulary(self._temp_data_dir)
+    word2index, embed = data.load_word_vectors(self._temp_data_dir, vocab)
+
+    train_data = data.SnliData(fake_train_file, word2index)
+    dev_data = data.SnliData(fake_train_file, word2index)
+    test_data = data.SnliData(fake_train_file, word2index)
+
+    # 2. Create a fake config.
+    config = _test_spinn_config(
+        data.WORD_VECTOR_LEN, 4,
+        logdir=os.path.join(self._temp_data_dir, "logdir"))
+
+    # 3. Test training of a SPINN model.
+    trainer = spinn.train_or_infer_spinn(
+        embed, word2index, train_data, dev_data, test_data, config)
+
+    # 4. Load train loss values from the summary files and verify that they
+    #    decrease with training.
+    summary_file = glob.glob(os.path.join(config.logdir, "events.out.*"))[0]
+    events = summary_test_util.events_from_file(summary_file)
+    train_losses = [event.summary.value[0].simple_value for event in events
+                    if event.summary.value
+                    and event.summary.value[0].tag == "train/loss"]
+    self.assertEqual(config.epochs, len(train_losses))
+    self.assertLess(train_losses[-1], train_losses[0])
+
+    # 5. Verify that checkpoints exist and contains all the expected variables.
+    self.assertTrue(glob.glob(os.path.join(config.logdir, "ckpt*")))
+    ckpt_variable_names = [
+        item[0] for item in checkpoint_utils.list_variables(config.logdir)]
+    self.assertIn("global_step", ckpt_variable_names)
+    for v in trainer.variables:
+      variable_name = v.name[:v.name.index(":")] if ":" in v.name else v.name
+      self.assertIn(variable_name, ckpt_variable_names)
+
+
+class EagerSpinnSNLIClassifierBenchmark(test.Benchmark):
+
+  def benchmarkEagerSpinnSNLIClassifier(self):
+    test_device = "gpu:0" if tfe.num_gpus() else "cpu:0"
+    with tf.device(test_device):
+      burn_in_iterations = 2
+      benchmark_iterations = 10
+
+      vocab_size = 1000
+      batch_size = 128
+      sequence_length = 15
+      d_embed = 200
+      d_out = 4
+
+      embed = tf.random_normal((vocab_size, d_embed))
+
+      config = _test_spinn_config(d_embed, d_out)
+      model = spinn.SNLIClassifier(config, embed)
+      trainer = spinn.SNLIClassifierTrainer(model, config.lr)
+
+      (labels, prem, prem_trans, hypo,
+       hypo_trans) = _generate_synthetic_snli_data_batch(sequence_length,
+                                                         batch_size,
+                                                         vocab_size)
+
+      for _ in range(burn_in_iterations):
+        trainer.train_batch(labels, prem, prem_trans, hypo, hypo_trans)
+
+      gc.collect()
+      start_time = time.time()
+      for _ in xrange(benchmark_iterations):
+        trainer.train_batch(labels, prem, prem_trans, hypo, hypo_trans)
+      wall_time = time.time() - start_time
+      # Named "examples"_per_sec to conform with other benchmarks.
+      extras = {"examples_per_sec": benchmark_iterations / wall_time}
+      self.report_benchmark(
+          name="Eager_SPINN_SNLIClassifier_Benchmark",
+          iters=benchmark_iterations,
+          wall_time=wall_time,
+          extras=extras)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/eager/python/g3doc/guide.md b/tensorflow/contrib/eager/python/g3doc/guide.md
index 147b7047f42b7ccba5829b61370e82e217ce5838..ffc1d0332eae605ce0444a225e53baa68954cae0 100644
--- a/tensorflow/contrib/eager/python/g3doc/guide.md
+++ b/tensorflow/contrib/eager/python/g3doc/guide.md
@@ -19,29 +19,34 @@ to models defined without using eager execution.
 
 ## Installation
 
-Eager execution is **not** included in the latest release (version 1.4) of
-TensorFlow. To use it, you will need to [build TensorFlow from
-source](https://www.tensorflow.org/install/install_sources) or install the
-nightly builds.
+Eager execution is included in TensorFlow versions 1.5 and above.
+Installation instructions at https://www.tensorflow.org/install/
 
-For example, the nightly builds can be installed using `pip`:
+The contents of this guide are compatible with TensorFlow 1.5.
+However, if you run into bugs that are fixed in source but not the
+release, you may want to either either [building from
+source](https://www.tensorflow.org/install/install_sources)
+or the try latest nightly builds. The nightly builds are available as:
 
--   `pip install tf-nightly` (for CPU-only TensorFlow)
--   `pip install tf-nightly-gpu` (for GPU-enabled TensorFlow)
+- [`pip` packages](https://github.com/tensorflow/tensorflow/blob/master/README.md#installation) and
 
-Or using `docker`, with [Jupyter Notebook](http://jupyter.org/) support:
+- [docker](https://hub.docker.com/r/tensorflow/tensorflow/) images.
+
+For example, to run the latest nightly docker image:
 
 ```sh
-# For CPU-only TensorFlow
+# If you have a GPU, use https://github.com/NVIDIA/nvidia-docker
+docker pull tensorflow/tensorflow:nightly-gpu
+docker run --runtime=nvidia -it -p 8888:8888 tensorflow/tensorflow:nightly-gpu
+
+# If you do not have a GPU, use the CPU-only image
 docker pull tensorflow/tensorflow:nightly
 docker run -it -p 8888:8888 tensorflow/tensorflow:nightly
-
-# For GPU-enabled TensorFlow:
-# (Requires https://github.com/NVIDIA/nvidia-docker)
-nvidia-docker pull tensorflow/tensorflow:nightly-gpu
-nvidia-docker run -it -p 8888:8888 tensorflow/tensorflow:nightly-gpu
 ```
 
+And then visit http://localhost:8888 in your browser for a Jupyter notebook
+environment.
+
 ## Getting Started
 
 With TensorFlow installed, eager execution is enabled via a single call:
@@ -292,7 +297,7 @@ def loss(weight, bias):
   error = prediction(training_inputs, weight, bias) - training_outputs
   return tf.reduce_mean(tf.square(error))
 
-# Function that returns the the derivative of loss with respect to
+# Function that returns the derivative of loss with respect to
 # weight and bias
 grad = tfe.gradients_function(loss)
 
@@ -757,7 +762,7 @@ For example, to record summaries once every 100 global steps, use:
 
 ```python
 tf.train.get_or_create_global_step()  # Ensuring the global step variable exists
-writer = tf.contrib.summary.create_summary_file_writer(logdir)
+writer = tf.contrib.summary.create_file_writer(logdir)
 
 for _ in range(iterations):
   with writer.as_default():
diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index 2f8016ede3caee6dbb6fd8f5226f1464b5c3976b..ea8dbf2b46ea4bd0e33645ae3c590c4dd13f7a52 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -49,6 +49,20 @@ class Metric(object):
 
   Example use with graph execution:
 
+  ```python
+  m = SomeMetric(...)
+  inputs = ... # Some tensors to compute the metric on.
+  m_update = m(inputs)
+  # Variables defined in first call, so get the initialization op afterwards.
+  m_init = m.init_variables()  # or tf.global_variables_initializer()
+  m_result = m.result()
+  with tf.Session() as sess:
+    sess.run(m_init)
+    for input in ...:
+      sess.run(m_update)
+    print(sess.run(m_result))
+  ```
+  Example use with graph execution with placeholders and feed_dict:
   ```python
   m = SomeMetric(...)
   m_placeholder = tf.placeholder(...)
@@ -107,6 +121,7 @@ class Metric(object):
     """Returns op to execute to update this metric for these inputs.
 
     Returns None if eager execution is enabled.
+    Returns a graph-mode function if graph execution is enabled.
 
     Args:
       *args:
@@ -183,6 +198,13 @@ class Metric(object):
     """Computes and returns a final value for the metric."""
     raise NotImplementedError("Metrics must define a result() member function")
 
+  def value(self):
+    """In graph mode returns the result Tensor while in eager the callable."""
+    if context.in_graph_mode():
+      return self.result()
+    else:
+      return self.result
+
   # We can support two different strategies of for doing data-parallel
   # distributed metric computations:
   # * Put metric variables on the first device and rely on small
@@ -269,6 +291,9 @@ class Mean(Metric):
     Args:
       values: Tensor with the per-example value.
       weights: Optional weighting of each example. Defaults to 1.
+
+    Returns:
+      The arguments, for easy chaining.
     """
     if weights is None:
       self.denom.assign_add(
@@ -280,6 +305,9 @@ class Mean(Metric):
       self.denom.assign_add(math_ops.reduce_sum(weights))
       values = math_ops.cast(values, self.dtype) * weights
       self.numer.assign_add(math_ops.reduce_sum(values))
+    if weights is None:
+      return values
+    return values, weights
 
   def result(self):
     t = self.numer / self.denom
@@ -307,7 +335,13 @@ class Accuracy(Mean):
         per element of the Tensor.
       predictions: Tensor with the predicted label for each example.
       weights: Optional weighting of each example. Defaults to 1.
+
+    Returns:
+      The arguments, for easy chaining.
     """
     matches = math_ops.equal(labels, predictions)
     matches = math_ops.cast(matches, dtypes.float64)
     super(Accuracy, self).call(matches, weights=weights)
+    if weights is None:
+      return labels, predictions
+    return labels, predictions, weights
diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py
index 96eb1b4f2a0e4c4af1f3310a2801b1b6aee285d6..a9ecaa3f8bced3043ea0eb0ac3aa8bfa65e9e1ff 100644
--- a/tensorflow/contrib/eager/python/metrics_test.py
+++ b/tensorflow/contrib/eager/python/metrics_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.training import training_util
 
@@ -67,7 +68,7 @@ class MetricsTest(test.TestCase):
     m([1, 10, 100])
     training_util.get_or_create_global_step()
     logdir = tempfile.mkdtemp()
-    with summary_ops.create_summary_file_writer(
+    with summary_ops.create_file_writer(
         logdir, max_queue=0,
         name="t0").as_default(), summary_ops.always_record_summaries():
       m.result()  # As a side-effect will write summaries.
@@ -137,7 +138,7 @@ class MetricsTest(test.TestCase):
     self.assertEqual(m1.name, "has space")
     self.assertEqual(m1.numer.name, "has_space/numer:0")
 
-  def testGraph(self):
+  def testGraphWithPlaceholder(self):
     with context.graph_mode(), self.test_session() as sess:
       m = metrics.Mean()
       p = array_ops.placeholder(dtypes.float32)
@@ -153,6 +154,22 @@ class MetricsTest(test.TestCase):
       sess.run(accumulate, feed_dict={p: 7})
       self.assertAllEqual(m.result().eval(), 7)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testGraphAndEagerTensor(self):
+    m = metrics.Mean()
+    inputs = ops.convert_to_tensor([1.0, 2.0])
+    accumulate = m(inputs)
+    result = m.result()
+    self.evaluate(m.init_variables())
+    self.evaluate(accumulate)
+    self.assertEqual(self.evaluate(result), 1.5)
+    # Second init resets all the variables.
+    self.evaluate(m.init_variables())
+    inputs = ops.convert_to_tensor([2.0, 3.0])
+    self.evaluate(m(inputs))
+    value = m.value()
+    self.assertEqual(self.evaluate(value), 2.5)
+
   def testTwoMeansGraph(self):
     # Verify two metrics with the same class and name don't
     # accidentally share state.
@@ -163,6 +180,19 @@ class MetricsTest(test.TestCase):
         m2 = metrics.Mean()
         m2(2)
 
+  def testMetricsChain(self):
+    with context.graph_mode(), self.test_session():
+      m1 = metrics.Mean()
+      m2 = metrics.Mean(name="m2")
+      update_m2 = m2(3.0)
+      update_m2_2 = m2(m1(1.0))
+      m1.init_variables().run()
+      m2.init_variables().run()
+      update_m2.eval()
+      update_m2_2.eval()
+      self.assertAllEqual(m2.result().eval(), 2.0)
+      self.assertAllEqual(m1.result().eval(), 1.0)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/eager/python/network.py b/tensorflow/contrib/eager/python/network.py
index 0388aaa8495f380595b2635529bc2e33e808b06f..e3c13cbd2e8ccd2ab79da74e0e97905c6ed5c02d 100644
--- a/tensorflow/contrib/eager/python/network.py
+++ b/tensorflow/contrib/eager/python/network.py
@@ -451,8 +451,30 @@ class Network(base.Layer):
         "at https://github.com/tensorflow/tensorflow/issues/new if this is "
         "important to you")
 
+  def add_loss(self, losses, inputs=None):
+    raise RuntimeError(
+        "add_loss is not supported in Network class yet. Please file an issue "
+        "at https://github.com/tensorflow/tensorflow/issues/new if this is "
+        "important to you")
+
+  @property
+  def losses(self):
+    """Gather losses from `Layer`s in the `Network`.
+
+    Note that when executing eagerly, `Layer.losses` evaluates
+    regularizers. When using graph execution, variable regularization ops have
+    already been created and are simply returned here.
+
+    Returns:
+      A list of tensors.
+    """
+    layer_losses = []
+    for layer in self.layers:
+      layer_losses.extend(layer.losses)
+    return layer_losses
+
   # TODO(allenl): Support other Layer methods needed for graph mode, such as for
-  # losses and updates
+  # updates
 
 
 class Sequential(Network):
diff --git a/tensorflow/contrib/eager/python/network_test.py b/tensorflow/contrib/eager/python/network_test.py
index e7835a63e6db926aa2d4b6c76c681c8a301757bd..3329fc6c513265deff41a368f5688dd605209c14 100644
--- a/tensorflow/contrib/eager/python/network_test.py
+++ b/tensorflow/contrib/eager/python/network_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 import gc
 
 from tensorflow.contrib.eager.python import network
+from tensorflow.contrib.layers.python.layers import regularizers
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.eager import test
@@ -45,6 +46,22 @@ class MyNetwork(network.Network):
     return self.l1(x)
 
 
+class RegularizedNetwork(network.Network):
+
+  def __init__(self):
+    super(RegularizedNetwork, self).__init__()
+    self.l1 = self.track_layer(core.Dense(
+        1,
+        bias_regularizer=regularizers.l1_regularizer(2.0),
+        kernel_regularizer=regularizers.l1_regularizer(2.0)))
+    self.l2 = self.track_layer(core.Dense(
+        1,
+        bias_regularizer=regularizers.l1_regularizer(2.0)))
+
+  def call(self, values):
+    return self.l2(self.l1(values))
+
+
 class NetworkTest(test.TestCase):
 
   def _save_modify_load_network_built(self, net, global_step=None):
@@ -88,15 +105,13 @@ class NetworkTest(test.TestCase):
     result = net(constant_op.constant([[2.0]]))
     self.assertEqual(34.0, self.evaluate(result))
 
-  # TODO(akshayka): This test should be changed once an API for compiling
-  # `call` into a defun is implemented.
   def testReplacingNetworkCallWithDefun(self):
     net = MyNetwork(name="abcd")
+    net.call = function.defun(net.call)
     x = constant_op.constant([[2.0]])
     net(x)  # Force variables to be created.
     self.evaluate(net.trainable_variables[0].assign([[17.0]]))
 
-    net.call = function.defun(net.call)
     result = net(x)  # Build and execute the TensorFlow function
     self.assertEqual(34.0, self.evaluate(result))
 
@@ -484,6 +499,18 @@ class NetworkTest(test.TestCase):
       _check_op_prefixes(expected_prefix="my_network_1/dense/",
                          checked_ops=checked_ops)
 
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testVariableRegularizers(self):
+    net = RegularizedNetwork()
+    net(constant_op.constant([[1.]]))
+    self.evaluate(net.variables[0].assign([[2.]]))
+    self.evaluate(net.variables[1].assign([3.]))
+    self.evaluate(net.variables[2].assign([[-2.]]))
+    self.evaluate(net.variables[3].assign([4.]))
+    self.assertAllEqual([4., 6., 8.], self.evaluate(net.losses))
+    self.evaluate(net.variables[3].assign([5.]))
+    self.assertAllEqual([4., 6., 10.], self.evaluate(net.losses))
+
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testDuplicateNameError(self):
     one = constant_op.constant([[1.]])
@@ -512,7 +539,7 @@ class NetworkTest(test.TestCase):
         # No issue here since the name is unique within its scope.
         name_conflict3 = MyNetwork(name="name_conflict")
       net2 = MyNetwork()  # name=outside_scope/my_network_2 to avoid the
-                          # variable_scope my_network_1 below.
+      # variable_scope my_network_1 below.
       vs_name_conflict = MyNetwork(name="vs_name_conflict")  # conflict below
     with variable_scope.variable_scope("intervening_scope"):
       with variable_scope.variable_scope(captured_scope):
@@ -661,7 +688,7 @@ class NetworkTest(test.TestCase):
     net2(one)
     # Layer names typically are globally unique rather than being unique within
     # the scope of their first use. However, within a Network they must be named
-    # locally so that previous Layer consutrciton does not interfere with
+    # locally so that previous Layer construction does not interfere with
     # variable naming (e.g. add a Layer construction before the Network,
     # suddenly your previously saved checkpoint is incompatible).
     self.assertEqual("dense", net1.l1.name)
diff --git a/tensorflow/contrib/eager/python/saver.py b/tensorflow/contrib/eager/python/saver.py
index 57b070ec6eeac00c77f199a846639d64c4957cd8..62421849c766a1124c726812428985c913c653a3 100644
--- a/tensorflow/contrib/eager/python/saver.py
+++ b/tensorflow/contrib/eager/python/saver.py
@@ -82,7 +82,7 @@ def restore_variables_on_create(save_path, map_func=None):
       map_func_wrapper = lambda self, x: x
     else:
       if not callable(map_func):
-        raise ValueError("map_func must be callaled.")
+        raise ValueError("map_func must be callable.")
       map_func_wrapper = lambda self, x: map_func(x)
 
     ckpt_var_cache = dict()
diff --git a/tensorflow/contrib/eager/python/saver_test.py b/tensorflow/contrib/eager/python/saver_test.py
index abc7e3690c76c4446bce6b945325f1ca15ef1c8b..1a7f7b85e688e80e3cf482f2754462888187d311 100644
--- a/tensorflow/contrib/eager/python/saver_test.py
+++ b/tensorflow/contrib/eager/python/saver_test.py
@@ -73,16 +73,6 @@ class SaverTest(test.TestCase):
       with self.assertRaisesRegexp(ValueError, 'v1'):
         saver.save(ckpt_prefix)
 
-  def testDifferentGraphError(self):
-    with ops.device(self._dev()):
-      with ops.Graph().as_default():
-        v1 = resource_variable_ops.ResourceVariable(1.0, name='v1')
-      with ops.Graph().as_default():
-        saver = _saver.Saver([v1])
-        ckpt_prefix = os.path.join(test.get_temp_dir(), 'ckpt')
-        with self.assertRaisesRegexp(ValueError, 'Graph'):
-          saver.save(ckpt_prefix)
-
   def testSameObjectOK(self):
     with ops.device(self._dev()):
       v1 = resource_variable_ops.ResourceVariable(1.0, name='v1')
diff --git a/tensorflow/contrib/eager/python/summary_writer.py b/tensorflow/contrib/eager/python/summary_writer.py
deleted file mode 100644
index 5d8c41b545b3c9fd03af85f302ba05a394f085a4..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/eager/python/summary_writer.py
+++ /dev/null
@@ -1,242 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""TensorBoard Summary Writer for TensorFlow Eager Execution."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import uuid
-
-from tensorflow.contrib.summary import gen_summary_ops
-from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import summary_op_util
-from tensorflow.python.ops import variable_scope
-
-
-def _maybe_cpu(v):
-  if isinstance(v, (ops.EagerTensor, ops.Tensor)):
-    return v.cpu()
-  else:
-    return v
-
-
-def _summary_writer_function(name, tensor, function, family=None):
-  def record():
-    with summary_op_util.summary_scope(
-        name, family, values=[tensor]) as (tag, scope):
-      function(tag, scope)
-      return True
-  return record
-
-
-class SummaryWriter(object):
-  """Writes summaries for TensorBoard, compatible with eager execution.
-
-  This class is the supported way of writing TensorBoard summaries under
-  eager execution.
-  """
-
-  _CPU_DEVICE = "cpu:0"
-
-  def __init__(self,
-               logdir,
-               max_queue=10,
-               flush_secs=120,
-               filename_suffix=""):
-    """Summary writer for TensorBoard, compatible with eager execution.
-
-    If necessary, multiple instances of `SummaryWriter` can be created, with
-    distinct `logdir`s and `name`s. Each `SummaryWriter` instance will retain
-    its independent `global_step` counter and data writing destination.
-
-    Example:
-    ```python
-    writer = tfe.SummaryWriter("my_model")
-
-    # ... Code that sets up the model and data batches ...
-
-    for _ in xrange(train_iters):
-      loss = model.train_batch(batch)
-      writer.scalar("loss", loss)
-      writer.step()
-    ```
-
-    Args:
-      logdir: Directory in which summary files will be written.
-      max_queue: Number of summary items to buffer before flushing to
-        filesystem. If 0, summaries will be flushed immediately.
-      flush_secs: Number of secondsbetween forced commits to disk.
-      filename_suffix: Suffix of the event protobuf files in which the summary
-        data are stored.
-
-    Raises:
-      ValueError: If this constructor is called not under eager execution.
-    """
-    # TODO(apassos, ashankar): Make this class and the underlying
-    # contrib.summary_ops compatible with graph model and remove this check.
-    if not context.in_eager_mode():
-      raise ValueError(
-          "Use of SummaryWriter is currently supported only with eager "
-          "execution enabled. File an issue at "
-          "https://github.com/tensorflow/tensorflow/issues/new to express "
-          "interest in fixing this.")
-
-    # TODO(cais): Consider adding name keyword argument, which if None or empty,
-    # will register the global global_step that training_util.get_global_step()
-    # can find.
-    with context.device(self._CPU_DEVICE):
-      self._name = uuid.uuid4().hex
-      self._global_step = 0
-      self._global_step_tensor = variable_scope.get_variable(
-          "global_step/summary_writer/" + self._name,
-          shape=[], dtype=dtypes.int64,
-          initializer=init_ops.zeros_initializer())
-      self._global_step_dirty = False
-      self._resource = gen_summary_ops.summary_writer(shared_name=self._name)
-      gen_summary_ops.create_summary_file_writer(
-          self._resource, logdir, max_queue, flush_secs, filename_suffix)
-      # Delete the resource when this object is deleted
-      self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
-          handle=self._resource, handle_device=self._CPU_DEVICE)
-
-  def step(self):
-    """Increment the global step counter of this SummaryWriter instance."""
-    self._global_step += 1
-    self._global_step_dirty = True
-
-  @property
-  def global_step(self):
-    """Obtain the current global_step value of this SummaryWriter instance.
-
-    Returns:
-      An `int` representing the current value of the global_step of this
-       `SummaryWriter` instance.
-    """
-    return self._global_step
-
-  def _update_global_step_tensor(self):
-    with context.device(self._CPU_DEVICE):
-      if self._global_step_dirty:
-        self._global_step_dirty = False
-        return state_ops.assign(self._global_step_tensor, self._global_step)
-      else:
-        return self._global_step_tensor
-
-  def generic(self, name, tensor, metadata, family=None):
-    """Write a generic-type summary.
-
-    Args:
-      name: A name for the generated node. Will also serve as the series name in
-        TensorBoard.
-      tensor: A `Tensor` or compatible value type containing the value of the
-        summary.
-      metadata: Metadata about the summary.
-      family: Optional; if provided, used as the prefix of the summary tag name,
-        which controls the tab name used for display on Tensorboard.
-    """
-    with context.device(self._CPU_DEVICE):
-      with summary_op_util.summary_scope(
-          name, family, values=[tensor]) as (tag, scope):
-        gen_summary_ops.write_summary(
-            self._resource,
-            self._update_global_step_tensor(),
-            _maybe_cpu(tensor),
-            tag,
-            _maybe_cpu(metadata),
-            name=scope)
-
-  def scalar(self, name, tensor, family=None):
-    """Write a scalar summary.
-
-    Args:
-      name: A name for the generated node. Will also serve as the series name in
-        TensorBoard.
-      tensor: A real numeric `Tensor` or compatible value type containing a
-        single value.
-      family: Optional; if provided, used as the prefix of the summary tag name,
-        which controls the tab name used for display on Tensorboard.
-
-    Returns:
-      A summary writer function for scalars.
-    """
-    with context.device(self._CPU_DEVICE):
-      with summary_op_util.summary_scope(
-          name, family, values=[tensor]) as (tag, scope):
-        gen_summary_ops.write_scalar_summary(
-            self._resource, self._update_global_step_tensor(),
-            tag, _maybe_cpu(tensor), name=scope)
-
-  def histogram(self, name, tensor, family=None):
-    """Write a histogram summary.
-
-    Args:
-      name: A name for the generated node. Will also serve as a series name in
-        TensorBoard.
-      tensor: A real numeric `Tensor` or compatible value type. Any shape.
-        Values to use to build the histogram.
-      family: Optional; if provided, used as the prefix of the summary tag name,
-        which controls the tab name used for display on Tensorboard.
-    """
-    with context.device(self._CPU_DEVICE):
-      with summary_op_util.summary_scope(
-          name, family, values=[tensor]) as (tag, scope):
-        gen_summary_ops.write_histogram_summary(
-            self._resource, self._update_global_step_tensor(),
-            tag, _maybe_cpu(tensor), name=scope)
-
-  def image(self, name, tensor, bad_color=None, max_images=3, family=None):
-    """Write an image summary."""
-    with context.device(self._CPU_DEVICE):
-      if bad_color is None:
-        bad_color_ = constant_op.constant([255, 0, 0, 255], dtype=dtypes.uint8)
-      with summary_op_util.summary_scope(
-          name, family, values=[tensor]) as (tag, scope):
-        gen_summary_ops.write_image_summary(
-            self._resource, self._update_global_step_tensor(),
-            tag, _maybe_cpu(tensor), bad_color_, max_images,
-            name=scope)
-
-  def audio(self, name, tensor, sample_rate, max_outputs, family=None):
-    """Write an audio summary.
-
-    Args:
-      name: A name for the generated node. Will also serve as a series name in
-        TensorBoard.
-      tensor: A 3-D `float32` `Tensor` of shape `[batch_size, frames, channels]`
-        or a 2-D `float32` `Tensor` of shape `[batch_size, frames]`, or
-        compatible value type.
-      sample_rate: A Scalar `float32` `Tensor` indicating the sample rate of the
-        signal in hertz.
-      max_outputs: Max number of batch elements to generate audio for.
-      family: Optional; if provided, used as the prefix of the summary tag name,
-        which controls the tab name used for display on Tensorboard.
-    """
-    with context.device(self._CPU_DEVICE):
-      with summary_op_util.summary_scope(
-          name, family, values=[tensor]) as (tag, scope):
-        gen_summary_ops.write_audio_summary(
-            self._resource, self._update_global_step_tensor(),
-            tag,
-            _maybe_cpu(tensor),
-            sample_rate=_maybe_cpu(sample_rate),
-            max_outputs=max_outputs,
-            name=scope)
diff --git a/tensorflow/contrib/eager/python/summary_writer_test.py b/tensorflow/contrib/eager/python/summary_writer_test.py
deleted file mode 100644
index 5ebb36d04fcba8f4558fa1c09716314af42f559f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/eager/python/summary_writer_test.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Unit tests for eager execution SummaryWriter."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import shutil
-import tempfile
-
-import numpy as np
-
-from tensorflow.contrib.eager.python import summary_writer
-from tensorflow.core.util import event_pb2
-from tensorflow.python.eager import context
-from tensorflow.python.eager import test
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.lib.io import tf_record
-from tensorflow.python.platform import gfile
-
-
-class SummaryWriterTest(test.TestCase):
-
-  def setUp(self):
-    super(SummaryWriterTest, self).setUp()
-    self._test_device = "gpu:0" if context.num_gpus() else "cpu:0"
-    self._tmp_logdir = tempfile.mkdtemp()
-    with context.device(self._test_device):
-      # Use max_queue=0 so that summaries are immediately flushed to filesystem,
-      # making testing easier.
-      self._writer = summary_writer.SummaryWriter(self._tmp_logdir, max_queue=0)
-
-  def tearDown(self):
-    if os.path.isdir(self._tmp_logdir):
-      shutil.rmtree(self._tmp_logdir)
-    super(SummaryWriterTest, self).tearDown()
-
-  def _readLastEvent(self, logdir=None):
-    if not logdir:
-      logdir = self._tmp_logdir
-    files = [f for f in gfile.ListDirectory(logdir)
-             if not gfile.IsDirectory(os.path.join(logdir, f))]
-    file_path = os.path.join(logdir, files[0])
-    records = list(tf_record.tf_record_iterator(file_path))
-    event = event_pb2.Event()
-    event.ParseFromString(records[-1])
-    return event
-
-  def testGlobalStep(self):
-    with context.device(self._test_device):
-      orig_step = self._writer.global_step
-      self._writer.step()
-      self.assertEqual(orig_step + 1, self._writer.global_step)
-      self.assertEqual(orig_step + 1, self._writer.global_step)
-      self._writer.step()
-      self._writer.step()
-      self.assertEqual(orig_step + 3, self._writer.global_step)
-
-  def testGenericSummary(self):
-    with context.device(self._test_device):
-      x = constant_op.constant(1337.0)
-      with context.device("cpu:0"):
-        metadata = constant_op.constant("foo")
-      self._writer.generic("x", x, metadata)
-      event = self._readLastEvent()
-      self.assertEqual("x", event.summary.value[0].tag)
-
-  def testScalarSummary(self):
-    with context.device(self._test_device):
-      x = constant_op.constant(1337.0)
-      self._writer.scalar("x", x)
-      event = self._readLastEvent()
-      self.assertTrue("x", event.summary.value[0].tag)
-      self.assertEqual(1337.0, event.summary.value[0].simple_value)
-
-  def testHistogramSummary(self):
-    with context.device(self._test_device):
-      y = constant_op.constant([1.0, 3.0, 3.0, 7.0])
-      self._writer.histogram("y", y)
-      event = self._readLastEvent()
-      self.assertEqual("y", event.summary.value[0].tag)
-      self.assertTrue(event.summary.value[0].histo)
-
-  def testImageSummary(self):
-    with context.device(self._test_device):
-      a = constant_op.constant([[10.0, 20.0], [-20.0, -10.0]])
-      self._writer.histogram("image1", a)
-      event = self._readLastEvent()
-      self.assertEqual("image1", event.summary.value[0].tag)
-      self.assertTrue(event.summary.value[0].image)
-
-  def testAudioSummary(self):
-    with context.device(self._test_device):
-      w = constant_op.constant(np.random.rand(3, 10, 2), dtype=dtypes.float32)
-      fs = constant_op.constant(44100.0, dtype=dtypes.float32)
-      max_outputs = 1
-      self._writer.audio("audio1", w, fs, max_outputs)
-      event = self._readLastEvent()
-      self.assertTrue(event.summary.value[0].audio)
-
-  def testTwoSummaryWritersGlobalStepsWorkWithoutCrosstalk(self):
-    tmp_logdir2 = os.path.join(self._tmp_logdir, "_writer2_")
-    writer2 = summary_writer.SummaryWriter(tmp_logdir2, max_queue=0)
-
-    self.assertEqual(0, writer2.global_step)
-    self._writer.step()
-    self.assertEqual(0, writer2.global_step)
-    writer2.step()
-    writer2.step()
-    writer2.step()
-    self.assertEqual(3, writer2.global_step)
-
-    x = constant_op.constant(1337.0)
-    writer_orig_step = self._writer.global_step
-    self._writer.step()
-    self._writer.scalar("x", x)
-
-    event = self._readLastEvent()
-    self.assertEqual(writer_orig_step + 1, event.step)
-
-    writer2.scalar("x", x)
-    event = self._readLastEvent(tmp_logdir2)
-    self.assertEqual(3, event.step)
-
-    self._writer.step()
-    self._writer.scalar("x", x)
-
-    event = self._readLastEvent()
-    self.assertEqual(writer_orig_step + 2, event.step)
-
-
-# TODO(cais): Add performance benchmark for SummaryWriter.
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py
index 1697c879def8af5c05f3c9b11d318d570785d6de..d32bebf90c1e768d1efec26b3b78bf1a522a8f00 100644
--- a/tensorflow/contrib/eager/python/tfe.py
+++ b/tensorflow/contrib/eager/python/tfe.py
@@ -23,7 +23,9 @@ To use, at program startup, call `tfe.enable_eager_execution()`.
 @@list_devices
 @@num_gpus
 
+@@py_func
 @@defun
+@@make_template
 @@implicit_gradients
 @@implicit_value_and_gradients
 @@gradients_function
@@ -50,13 +52,13 @@ To use, at program startup, call `tfe.enable_eager_execution()`.
 @@EagerVariableStore
 
 @@Network
+@@Sequential
 @@save_network_checkpoint
 @@restore_network_checkpoint
 
 @@in_eager_mode
 @@in_graph_mode
 
-@@IsolateTest
 @@run_test_in_graph_and_eager_modes
 
 @@DEVICE_PLACEMENT_EXPLICIT
@@ -74,6 +76,7 @@ from __future__ import print_function
 from tensorflow.contrib.eager.python import metrics
 from tensorflow.contrib.eager.python.datasets import Iterator
 from tensorflow.contrib.eager.python.network import Network
+from tensorflow.contrib.eager.python.network import Sequential
 from tensorflow.contrib.eager.python.network import save_network_checkpoint
 from tensorflow.contrib.eager.python.network import restore_network_checkpoint
 from tensorflow.contrib.eager.python.saver import get_optimizer_variables
@@ -97,13 +100,16 @@ from tensorflow.python.eager.execution_callbacks import nan_callback
 from tensorflow.python.eager.execution_callbacks import seterr
 from tensorflow.python.framework.ops import enable_eager_execution
 from tensorflow.python.framework.ops import eager_run as run
-from tensorflow.python.framework.test_util import IsolateTest
 from tensorflow.python.framework.test_util import run_in_graph_and_eager_modes as run_test_in_graph_and_eager_modes
 from tensorflow.python.ops.resource_variable_ops import ResourceVariable as Variable
 from tensorflow.python.ops.variable_scope import EagerVariableStore
+from tensorflow.python.ops import script_ops
+from tensorflow.python.ops import template
 from tensorflow.python.util.all_util import remove_undocumented
 
+py_func = script_ops.eager_py_func
 defun = function.defun
+make_template = template.make_template_internal
 implicit_gradients = backprop.implicit_grad
 implicit_value_and_gradients = backprop.implicit_val_and_grad
 gradients_function = backprop.gradients_function
diff --git a/tensorflow/contrib/eager/python/tfe_test.py b/tensorflow/contrib/eager/python/tfe_test.py
index 0dedb2fd7c0905801cd87c239ff2ee09eecb6080..b6659c2a1797feab261d756e78b45231dbea5a02 100644
--- a/tensorflow/contrib/eager/python/tfe_test.py
+++ b/tensorflow/contrib/eager/python/tfe_test.py
@@ -102,10 +102,6 @@ class TFETest(test_util.TensorFlowTestCase):
     # Expect at least one device.
     self.assertTrue(tfe.list_devices())
 
-  def testNumGPUs(self):
-    devices = tfe.list_devices()
-    self.assertEqual(len(devices) - 1, tfe.num_gpus())
-
   def testAddCheckNumericsOpsRaisesError(self):
     with self.assertRaisesRegexp(
         RuntimeError,
diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index 8395e2db5ec0ce6f4adae5fa2467159549e70143..6cdbed5b896577f5622b1bd0123c289c798bc0a5 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -88,8 +88,9 @@ py_library(
 
 py_test(
     name = "dnn_linear_combined_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/estimator/dnn_linear_combined_test.py"],
+    shard_count = 3,
     srcs_version = "PY2AND3",
     tags = [
         "no_pip",
@@ -162,7 +163,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:check_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:lookup_ops",
@@ -176,7 +177,6 @@ py_library(
         "//tensorflow/python/estimator:metric_keys",
         "//tensorflow/python/estimator:model_fn",
         "//tensorflow/python/estimator:prediction_keys",
-        "//tensorflow/python/estimator:util",
         "//tensorflow/python/ops/losses",
         "//tensorflow/python/saved_model:signature_constants",
     ],
@@ -204,6 +204,7 @@ py_test(
         "//tensorflow/python/estimator:metric_keys",
         "//tensorflow/python/estimator:model_fn",
         "//tensorflow/python/estimator:prediction_keys",
+        "//tensorflow/python/ops/losses",
         "//tensorflow/python/saved_model:signature_constants",
         "//third_party/py/numpy",
         "@six_archive//:six",
@@ -330,23 +331,24 @@ py_library(
         "//tensorflow/python:device",
         "//tensorflow/python:device_lib",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:gradients",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
         "//tensorflow/python/estimator:export_output",
         "//tensorflow/python/estimator:model_fn",
         "//tensorflow/python/estimator:util",
+        "//tensorflow/python/ops/losses",
         "@six_archive//:six",
     ],
 )
 
 cuda_py_test(
     name = "replicate_model_fn_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/estimator/replicate_model_fn_test.py"],
     additional_deps = [
         "//tensorflow/python/estimator",
@@ -374,5 +376,9 @@ cuda_py_test(
         "//tensorflow/python:variables",
         ":replicate_model_fn",
     ],
-    tags = ["multi_gpu"],
+    tags = [
+        "manual",
+        "multi_gpu",
+        "notap",
+    ],
 )
diff --git a/tensorflow/contrib/estimator/__init__.py b/tensorflow/contrib/estimator/__init__.py
index 8191e06faed004df6927708ea04a67b90bd464de..0f75b77050b0ba4c752a6a74fdc7024170b6f318 100644
--- a/tensorflow/contrib/estimator/__init__.py
+++ b/tensorflow/contrib/estimator/__init__.py
@@ -26,6 +26,7 @@ from tensorflow.contrib.estimator.python.estimator.head import *
 from tensorflow.contrib.estimator.python.estimator.linear import *
 from tensorflow.contrib.estimator.python.estimator.logit_fns import *
 from tensorflow.contrib.estimator.python.estimator.multi_head import *
+from tensorflow.contrib.estimator.python.estimator.replicate_model_fn import *
 
 from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,line-too-long,wildcard-import
@@ -45,6 +46,8 @@ _allowed_symbols = [
     'call_logit_fn',
     'dnn_logit_fn_builder',
     'linear_logit_fn_builder',
+    'replicate_model_fn',
+    'TowerOptimizer',
 ]
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/estimator/python/estimator/extenders.py b/tensorflow/contrib/estimator/python/estimator/extenders.py
index 29c3c7358534f6e8ebbd31cbfcd7e34086d9b506..c99bf8badb35e6fffb7cae8761db9d402b8b3a8f 100644
--- a/tensorflow/contrib/estimator/python/estimator/extenders.py
+++ b/tensorflow/contrib/estimator/python/estimator/extenders.py
@@ -100,7 +100,7 @@ def add_metrics(estimator, metric_fn):
 
 
 def clip_gradients_by_norm(optimizer, clip_norm):
-  """Returns an optimizer which clips gradients before appliying them.
+  """Returns an optimizer which clips gradients before applying them.
 
   Example:
 
diff --git a/tensorflow/contrib/estimator/python/estimator/extenders_test.py b/tensorflow/contrib/estimator/python/estimator/extenders_test.py
index 5f4a3cc902c9cc07c0688ad41dab7391a641c133..ad1a8ef152b07ecbab33d9eb3184a2ae89def27d 100644
--- a/tensorflow/contrib/estimator/python/estimator/extenders_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/extenders_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.contrib.estimator.python.estimator import extenders
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import estimator_lib
 from tensorflow.python.estimator.canned import linear
 from tensorflow.python.feature_column import feature_column as fc
diff --git a/tensorflow/contrib/estimator/python/estimator/head.py b/tensorflow/contrib/estimator/python/estimator/head.py
index a9311a20f127d92f02a95b8b48082fc90850635a..238cf287b768eee28b20202084eb244c085c8b75 100644
--- a/tensorflow/contrib/estimator/python/estimator/head.py
+++ b/tensorflow/contrib/estimator/python/estimator/head.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.estimator import model_fn
-from tensorflow.python.estimator import util
 from tensorflow.python.estimator.canned import head as head_lib
 from tensorflow.python.estimator.canned import metric_keys
 from tensorflow.python.estimator.canned import prediction_keys
@@ -29,7 +28,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_lib
@@ -44,6 +42,8 @@ _DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
 def multi_class_head(n_classes,
                      weight_column=None,
                      label_vocabulary=None,
+                     loss_reduction=losses.Reduction.SUM,
+                     loss_fn=None,
                      name=None):
   """Creates a `_Head` for multi class classification.
 
@@ -64,6 +64,12 @@ def multi_class_head(n_classes,
   labels have shape `[batch_size, 1]`, the loss is the weighted sum over
   `batch_size`.
 
+  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
+  `(labels, logits, features)` as arguments and returns unreduced loss with
+  shape `[D0, D1, ... DN, 1]`. `loss_fn` must support integer `labels` with
+  shape `[D0, D1, ... DN, 1]`. Namely, the head applies `label_vocabulary` to
+  the input labels before passing them to `loss_fn`.
+
   Args:
     n_classes: Number of classes, must be greater than 2 (for 2 classes, use
       `binary_classification_head`).
@@ -76,6 +82,9 @@ def multi_class_head(n_classes,
       integer within [0, n_classes). If given, labels must be of string type and
       have any value in `label_vocabulary`. Note that errors will be raised if
       `label_vocabulary` is not provided but labels are strings.
+    loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
+      reduce training loss over batch. Defaults to `SUM`.
+    loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
@@ -83,17 +92,25 @@ def multi_class_head(n_classes,
     An instance of `_Head` for multi class classification.
 
   Raises:
-    ValueError: if `n_classes`, `metric_class_ids` or `label_keys` is invalid.
+    ValueError: if `n_classes`, `label_vocabulary` or `loss_reduction` is
+      invalid.
   """
   return head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint:disable=protected-access
       n_classes=n_classes,
       weight_column=weight_column,
       label_vocabulary=label_vocabulary,
+      loss_reduction=loss_reduction,
+      loss_fn=loss_fn,
       name=name)
 
 
 def binary_classification_head(
-    weight_column=None, thresholds=None, label_vocabulary=None, name=None):
+    weight_column=None,
+    thresholds=None,
+    label_vocabulary=None,
+    loss_reduction=losses.Reduction.SUM,
+    loss_fn=None,
+    name=None):
   """Creates a `_Head` for single label binary classification.
 
   This head uses `sigmoid_cross_entropy_with_logits` loss.
@@ -113,6 +130,12 @@ def binary_classification_head(
   labels have shape `[batch_size, 1]`, the loss is the weighted sum over
   `batch_size`.
 
+  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
+  `(labels, logits, features)` as arguments and returns unreduced loss with
+  shape `[D0, D1, ... DN, 1]`. `loss_fn` must support float `labels` with
+  shape `[D0, D1, ... DN, 1]`. Namely, the head applies `label_vocabulary` to
+  the input labels before passing them to `loss_fn`.
+
   Args:
     weight_column: A string or a `_NumericColumn` created by
       `tf.feature_column.numeric_column` defining feature column representing
@@ -128,6 +151,9 @@ def binary_classification_head(
       [0, 1]. If given, labels must be string type and have any value in
       `label_vocabulary`. Note that errors will be raised if `label_vocabulary`
       is not provided but labels are strings.
+    loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
+      reduce training loss over batch. Defaults to `SUM`.
+    loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
@@ -135,17 +161,22 @@ def binary_classification_head(
     An instance of `_Head` for binary classification.
 
   Raises:
-    ValueError: if `thresholds` contains a value outside of `(0, 1)`.
+    ValueError: If `thresholds` contains a value outside of `(0, 1)`.
+    ValueError: If `loss_reduction` is invalid.
   """
   return head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint:disable=protected-access
       weight_column=weight_column,
       thresholds=thresholds,
       label_vocabulary=label_vocabulary,
+      loss_reduction=loss_reduction,
+      loss_fn=loss_fn,
       name=name)
 
 
 def regression_head(weight_column=None,
                     label_dimension=1,
+                    loss_reduction=losses.Reduction.SUM,
+                    loss_fn=None,
                     name=None):
   """Creates a `_Head` for regression using the `mean_squared_error` loss.
 
@@ -164,6 +195,10 @@ def regression_head(weight_column=None,
   `[D0, D1, ... DN]`, `[D0, D1, ... DN, 1]` or
   `[D0, D1, ... DN, label_dimension]`.
 
+  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
+  `(labels, logits, features)` as arguments and returns unreduced loss with
+  shape `[D0, D1, ... DN, label_dimension]`.
+
   Args:
     weight_column: A string or a `_NumericColumn` created by
       `tf.feature_column.numeric_column` defining feature column representing
@@ -172,15 +207,23 @@ def regression_head(weight_column=None,
     label_dimension: Number of regression labels per example. This is the size
       of the last dimension of the labels `Tensor` (typically, this has shape
       `[batch_size, label_dimension]`).
+    loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
+      reduce training loss over batch. Defaults to `SUM`.
+    loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
   Returns:
     An instance of `_Head` for linear regression.
+
+  Raises:
+    ValueError: If `label_dimension` or `loss_reduction` is invalid.
   """
   return head_lib._regression_head_with_mean_squared_error_loss(  # pylint:disable=protected-access
       weight_column=weight_column,
       label_dimension=label_dimension,
+      loss_reduction=loss_reduction,
+      loss_fn=loss_fn,
       name=name)
 
 
@@ -188,6 +231,7 @@ def multi_label_head(n_classes,
                      weight_column=None,
                      thresholds=None,
                      label_vocabulary=None,
+                     loss_reduction=losses.Reduction.SUM,
                      loss_fn=None,
                      name=None):
   """Creates a `_Head` for multi-label classification.
@@ -202,7 +246,7 @@ def multi_label_head(n_classes,
   `batch_size`.
 
   The head expects `logits` with shape `[D0, D1, ... DN, n_classes]`. In many
-  applications, the shape is `[batch_size, label_n_classes]`.
+  applications, the shape is `[batch_size, n_classes]`.
 
   Labels can be:
   * A multi-hot tensor of shape `[D0, D1, ... DN, n_classes]`
@@ -237,6 +281,8 @@ def multi_label_head(n_classes,
       [0, n_classes) or multi-hot Tensor. If given, labels must be SparseTensor
       string type and have any value in `label_vocabulary`. Also there will be
       errors if vocabulary is not provided and labels are string.
+    loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
+      reduce training loss over batch. Defaults to `SUM`.
     loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
@@ -245,7 +291,8 @@ def multi_label_head(n_classes,
     An instance of `_Head` for multi-label classification.
 
   Raises:
-    ValueError: if `n_classes`, `thresholds`, or `loss_fn` is invalid.
+    ValueError: if `n_classes`, `thresholds`, `loss_reduction` or `loss_fn` is
+    invalid.
   """
   thresholds = tuple(thresholds) if thresholds else tuple()
   if n_classes is None or n_classes < 2:
@@ -266,10 +313,14 @@ def multi_label_head(n_classes,
           'Length of label_vocabulary must be n_classes ({}). '
           'Given: {}'.format(n_classes, len(label_vocabulary)))
   if loss_fn:
-    _validate_loss_fn_args(loss_fn)
+    head_lib._validate_loss_fn_args(loss_fn)  # pylint:disable=protected-access
+  if (loss_reduction not in losses.Reduction.all() or
+      loss_reduction == losses.Reduction.NONE):
+    raise ValueError('Invalid loss_reduction: {}'.format(loss_reduction))
   return _MultiLabelHead(
       n_classes=n_classes, weight_column=weight_column, thresholds=thresholds,
-      label_vocabulary=label_vocabulary, loss_fn=loss_fn, name=name)
+      label_vocabulary=label_vocabulary, loss_reduction=loss_reduction,
+      loss_fn=loss_fn, name=name)
 
 
 class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
@@ -280,12 +331,14 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
                weight_column=None,
                thresholds=None,
                label_vocabulary=None,
+               loss_reduction=losses.Reduction.SUM,
                loss_fn=None,
                name=None):
     self._n_classes = n_classes
     self._weight_column = weight_column
     self._thresholds = thresholds
     self._label_vocabulary = label_vocabulary
+    self._loss_reduction = loss_reduction
     self._loss_fn = loss_fn
     self._name = name
 
@@ -344,9 +397,9 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
         labels=processed_labels, logits=logits,
         expected_labels_dimension=self.logits_dimension)
     if self._loss_fn:
-      unweighted_loss = _call_loss_fn(
+      unweighted_loss = head_lib._call_loss_fn(  # pylint:disable=protected-access
           loss_fn=self._loss_fn, labels=processed_labels, logits=logits,
-          features=features)
+          features=features, expected_loss_dim=1)
     else:
       unweighted_loss = losses.sigmoid_cross_entropy(
           multi_class_labels=processed_labels, logits=logits,
@@ -356,19 +409,41 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
           unweighted_loss, axis=-1, keep_dims=True)
     weights = head_lib._get_weights_and_check_match_logits(  # pylint:disable=protected-access,
         features=features, weight_column=self._weight_column, logits=logits)
-    weighted_sum_loss = losses.compute_weighted_loss(
-        unweighted_loss, weights=weights, reduction=losses.Reduction.SUM)
-    # _weights() can return 1.
-    example_weight_sum = math_ops.reduce_sum(
-        weights * array_ops.ones_like(unweighted_loss))
+    training_loss = losses.compute_weighted_loss(
+        unweighted_loss, weights=weights, reduction=self._loss_reduction)
     return head_lib.LossSpec(
-        weighted_sum_loss=weighted_sum_loss,
-        example_weight_sum=example_weight_sum,
+        training_loss=training_loss,
+        unreduced_loss=unweighted_loss,
+        weights=weights,
         processed_labels=processed_labels)
 
   def create_estimator_spec(
-      self, features, mode, logits, labels=None, train_op_fn=None):
-    """See `Head`."""
+      self, features, mode, logits, labels=None, train_op_fn=None,
+      regularization_losses=None):
+    """Returns an `EstimatorSpec`.
+
+    Args:
+      features: Input `dict` of `Tensor` or `SparseTensor` objects.
+      mode: Estimator's `ModeKeys`.
+      logits: logits `Tensor` with shape `[D0, D1, ... DN, n_classes]`.
+        For many applications, the shape is `[batch_size, n_classes]`.
+      labels: Labels with shape matching `logits`. Can be multi-hot `Tensor`
+        with shape `[D0, D1, ... DN, n_classes]` or `SparseTensor` with
+        `dense_shape` `[D0, D1, ... DN, ?]`. `labels` is required argument when
+        `mode` equals `TRAIN` or `EVAL`.
+      train_op_fn: Function that takes a scalar loss `Tensor` and returns
+        `train_op`. Required in TRAIN mode.
+      regularization_losses: A list of additional scalar losses to be added to
+        the training loss, such as regularization losses. These losses are
+        usually expressed as a batch average, so for best results users need to
+        set `loss_reduction=SUM_OVER_BATCH_SIZE` or
+        `loss_reduction=SUM_OVER_NONZERO_WEIGHTS` when creating the head to
+        avoid scaling errors.
+    Returns:
+      `EstimatorSpec`.
+    Raises:
+      ValueError: If `train_op_fn` is `None` in TRAIN mode.
+    """
     with ops.name_scope(self._name, 'head'):
       logits = head_lib._check_logits_final_dim(logits, self.logits_dimension)  # pylint:disable=protected-access
 
@@ -394,60 +469,74 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
                     export_output.PredictOutput(predictions))
             })
 
-      (weighted_sum_loss, example_weight_sum,
+      (training_loss, unreduced_loss, weights,
        processed_labels) = self.create_loss(
            features=features, mode=mode, logits=logits, labels=labels)
+      if regularization_losses:
+        regularization_loss = math_ops.add_n(regularization_losses)
+        regularized_training_loss = math_ops.add_n(
+            [training_loss, regularization_loss])
+      else:
+        regularization_loss = None
+        regularized_training_loss = training_loss
 
       # Eval.
       if mode == model_fn.ModeKeys.EVAL:
-        weights = head_lib._get_weights_and_check_match_logits(  # pylint:disable=protected-access,
-            features=features, weight_column=self._weight_column, logits=logits)
         return model_fn.EstimatorSpec(
             mode=model_fn.ModeKeys.EVAL,
             predictions=predictions,
-            loss=weighted_sum_loss,
+            loss=regularized_training_loss,
             eval_metric_ops=self._eval_metric_ops(
                 labels=processed_labels,
                 probabilities=probabilities,
                 weights=weights,
-                weighted_sum_loss=weighted_sum_loss,
-                example_weight_sum=example_weight_sum))
+                unreduced_loss=unreduced_loss,
+                regularization_loss=regularization_loss))
 
       # Train.
       if train_op_fn is None:
         raise ValueError('train_op_fn can not be None.')
+      # Only summarize mean_loss for SUM reduction to preserve backwards
+      # compatibility. Otherwise skip it to avoid unnecessary computation.
+      if self._loss_reduction == losses.Reduction.SUM:
+        example_weight_sum = math_ops.reduce_sum(
+            weights * array_ops.ones_like(unreduced_loss))
+        mean_loss = training_loss / example_weight_sum
+      else:
+        mean_loss = None
     with ops.name_scope(''):
+      keys = metric_keys.MetricKeys
       summary.scalar(
-          head_lib._summary_key(self._name, metric_keys.MetricKeys.LOSS),  # pylint:disable=protected-access
-          weighted_sum_loss)
-      summary.scalar(
-          head_lib._summary_key(  # pylint:disable=protected-access
-              self._name, metric_keys.MetricKeys.LOSS_MEAN),
-          weighted_sum_loss / example_weight_sum)
+          head_lib._summary_key(self._name, keys.LOSS),  # pylint:disable=protected-access
+          regularized_training_loss)
+      if mean_loss is not None:
+        summary.scalar(
+            head_lib._summary_key(self._name, keys.LOSS_MEAN),  # pylint:disable=protected-access
+            mean_loss)
+      if regularization_loss is not None:
+        summary.scalar(
+            head_lib._summary_key(self._name, keys.LOSS_REGULARIZATION),  # pylint:disable=protected-access
+            regularization_loss)
     return model_fn.EstimatorSpec(
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
-        loss=weighted_sum_loss,
-        train_op=train_op_fn(weighted_sum_loss))
+        loss=regularized_training_loss,
+        train_op=train_op_fn(regularized_training_loss))
 
-  def _eval_metric_ops(self, labels, probabilities, weights, weighted_sum_loss,
-                       example_weight_sum):
+  def _eval_metric_ops(
+      self, labels, probabilities, weights, unreduced_loss,
+      regularization_loss):
     """Returns a dict of metrics for eval_metric_ops."""
     with ops.name_scope(
         None, 'metrics',
-        [labels, probabilities, weights, weighted_sum_loss, example_weight_sum
-        ]):
+        [labels, probabilities, weights, unreduced_loss, regularization_loss]):
       keys = metric_keys.MetricKeys
       metric_ops = {
           # Estimator already adds a metric for loss.
           head_lib._summary_key(self._name, keys.LOSS_MEAN):  # pylint:disable=protected-access
               metrics_lib.mean(
-                  # Both values and weights here are reduced, scalar Tensors.
-                  # values is the actual mean we want, but we pass the scalar
-                  # example_weight_sum in order to return the correct update_op
-                  # alongside the value_op for streaming metrics.
-                  values=(weighted_sum_loss / example_weight_sum),
-                  weights=example_weight_sum,
+                  values=unreduced_loss,
+                  weights=weights,
                   name=keys.LOSS_MEAN),
           head_lib._summary_key(self._name, keys.AUC):  # pylint:disable=protected-access
               metrics_lib.auc(labels=labels, predictions=probabilities,
@@ -457,6 +546,13 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
                               weights=weights, curve='PR',
                               name=keys.AUC_PR),
       }
+      if regularization_loss is not None:
+        loss_regularization_key = head_lib._summary_key(  # pylint:disable=protected-access
+            self._name, keys.LOSS_REGULARIZATION)
+        metric_ops[loss_regularization_key] = (
+            metrics_lib.mean(
+                values=regularization_loss,
+                name=keys.LOSS_REGULARIZATION))
       for threshold in self._thresholds:
         accuracy_key = keys.ACCURACY_AT_THRESHOLD % threshold
         metric_ops[head_lib._summary_key(self._name, accuracy_key)] = (  # pylint:disable=protected-access
@@ -485,52 +581,3 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
                 threshold=threshold,
                 name=recall_key))
     return metric_ops
-
-
-def _validate_loss_fn_args(loss_fn):
-  """Validates loss_fn arguments.
-
-  Required arguments: labels, logits.
-  Optional arguments: features.
-
-  Args:
-    loss_fn: The loss function.
-  Raises:
-    ValueError: If the signature is unexpected.
-  """
-  loss_fn_args = util.fn_args(loss_fn)
-  for required_arg in ['labels', 'logits']:
-    if required_arg not in loss_fn_args:
-      raise ValueError(
-          'loss_fn must contain argument: {}. '
-          'Given arguments: {}'.format(required_arg, loss_fn_args))
-  invalid_args = list(set(loss_fn_args) - set(['labels', 'logits', 'features']))
-  if invalid_args:
-    raise ValueError('loss_fn has unexpected args: {}'.format(invalid_args))
-
-
-def _call_loss_fn(loss_fn, labels, logits, features):
-  """Calls loss_fn and checks the returned shape.
-
-  Args:
-    loss_fn: The loss function.
-    labels: Processed labels Tensor.
-    logits: Logits Tensor of shape [batch_size, logits_dimension].
-    features: Features dict.
-  Returns:
-    Loss Tensor with shape [batch_size, 1].
-  """
-  loss_fn_args = util.fn_args(loss_fn)
-  kwargs = {}
-  if 'features' in loss_fn_args:
-    kwargs['features'] = features
-  unweighted_loss = loss_fn(labels=labels, logits=logits, **kwargs)
-  batch_size = array_ops.shape(logits)[0]
-  loss_shape = array_ops.shape(unweighted_loss)
-  check_shape_op = control_flow_ops.Assert(
-      math_ops.reduce_all(math_ops.equal(loss_shape, [batch_size, 1])),
-      data=[
-          'loss_fn must return Tensor of shape [batch_size, 1]. Given: ',
-          loss_shape])
-  with ops.control_dependencies([check_shape_op]):
-    return array_ops.identity(unweighted_loss)
diff --git a/tensorflow/contrib/estimator/python/estimator/head_test.py b/tensorflow/contrib/estimator/python/estimator/head_test.py
index d1cf9090048470181818c573647923c9f5824dfa..43cdfec9689879201305385499b3b784e1593d60 100644
--- a/tensorflow/contrib/estimator/python/estimator/head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/head_test.py
@@ -35,6 +35,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import string_ops
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.training import monitored_session
@@ -132,6 +133,16 @@ class MultiLabelHead(test.TestCase):
         r'Length of label_vocabulary must be n_classes \(3\). Given: 2'):
       head_lib.multi_label_head(n_classes=3, label_vocabulary=['foo', 'bar'])
 
+  def test_invalid_loss_reduction(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'Invalid loss_reduction: invalid_loss_reduction'):
+      head_lib.multi_label_head(
+          n_classes=3, loss_reduction='invalid_loss_reduction')
+    with self.assertRaisesRegexp(
+        ValueError, r'Invalid loss_reduction: none'):
+      head_lib.multi_label_head(
+          n_classes=3, loss_reduction=losses.Reduction.NONE)
+
   def test_loss_fn_arg_labels_missing(self):
     def _loss_fn(logits):
       del logits  # Unused
@@ -262,17 +273,17 @@ class MultiLabelHead(test.TestCase):
     labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
     # loss = labels * -log(sigmoid(logits)) +
     #        (1 - labels) * -log(1 - sigmoid(logits))
-    expected_weighted_sum_loss = np.sum(
+    expected_training_loss = np.sum(
         _sigmoid_cross_entropy(labels=labels, logits=logits))
-    actual_weighted_sum_loss = head.create_loss(
+    actual_training_loss = head.create_loss(
         features={'x': np.array(((42,),), dtype=np.int32)},
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
         labels=labels)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(expected_weighted_sum_loss,
-                          actual_weighted_sum_loss.eval())
+      self.assertAllClose(expected_training_loss,
+                          actual_training_loss.eval())
 
   def test_eval_create_loss_large_logits(self):
     """Tests head.create_loss for eval mode and large logits."""
@@ -286,9 +297,9 @@ class MultiLabelHead(test.TestCase):
     # For large logits, this is approximated as:
     # loss = labels * (logits < 0) * (-logits) +
     #        (1 - labels) * (logits > 0) * logits
-    expected_weighted_sum_loss = np.sum(
+    expected_training_loss = np.sum(
         np.array([[(10. + 10.) / 2.], [(15. + 0.) / 2.]], dtype=np.float32))
-    actual_weighted_sum_loss = head.create_loss(
+    actual_training_loss = head.create_loss(
         features={'x': np.array(((42,),), dtype=np.int32)},
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
@@ -296,9 +307,7 @@ class MultiLabelHead(test.TestCase):
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_weighted_sum_loss,
-          actual_weighted_sum_loss.eval(),
-          atol=1e-4)
+          expected_training_loss, actual_training_loss.eval(), atol=1e-4)
 
   def test_eval_create_loss_labels_wrong_shape(self):
     """Tests head.create_loss for eval mode when labels has the wrong shape."""
@@ -307,7 +316,7 @@ class MultiLabelHead(test.TestCase):
 
     logits = np.array([[-1., 1.], [-1.5, 1.]], dtype=np.float32)
     labels_placeholder = array_ops.placeholder(dtype=dtypes.int64)
-    actual_weighted_sum_loss = head.create_loss(
+    actual_training_loss = head.create_loss(
         features={'x': np.array(((42,),), dtype=np.int32)},
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
@@ -317,14 +326,14 @@ class MultiLabelHead(test.TestCase):
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
           r'\[expected_labels_shape: \] \[2 2\] \[labels_shape: \] \[2 1\]'):
-        actual_weighted_sum_loss.eval({
+        actual_training_loss.eval({
             labels_placeholder: np.array([[1], [1]], dtype=np.int64)
         })
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
           r'labels shape must be \[D0, D1, ... DN, 2\]\..*'
           r'\[Received shape: \] \[2\]'):
-        actual_weighted_sum_loss.eval({
+        actual_training_loss.eval({
             labels_placeholder: np.array([1, 1], dtype=np.int64)
         })
 
@@ -344,14 +353,14 @@ class MultiLabelHead(test.TestCase):
         return constant_op.constant(loss)
     head = head_lib.multi_label_head(n_classes=2, loss_fn=_loss_fn)
 
-    actual_weighted_sum_loss = head.create_loss(
+    actual_training_loss = head.create_loss(
         features={'x': np.array(((42,),), dtype=np.int32)},
         mode=model_fn.ModeKeys.EVAL,
         logits=logits_input,
         labels=labels_input)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(np.sum(loss), actual_weighted_sum_loss.eval())
+      self.assertAllClose(np.sum(loss), actual_training_loss.eval())
 
   def test_eval_create_loss_loss_fn_wrong_shape(self):
     """Tests custom loss_fn that returns Tensor of unexpected shape."""
@@ -363,7 +372,7 @@ class MultiLabelHead(test.TestCase):
 
     logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
     labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    actual_weighted_sum_loss = head.create_loss(
+    actual_training_loss = head.create_loss(
         features={'x': np.array(((42,),), dtype=np.int32)},
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
@@ -372,9 +381,9 @@ class MultiLabelHead(test.TestCase):
       _initialize_variables(self, monitored_session.Scaffold())
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
-          r'loss_fn must return Tensor of shape \[batch_size, 1\]\. '
-          r'Given: \] \[2\]'):
-        actual_weighted_sum_loss.eval()
+          r'\[loss_fn must return Tensor of shape \[D0, D1, ... DN, 1\]\. \] '
+          r'\[logits_shape: \] \[2 2\] \[loss_shape: \] \[2\]'):
+        actual_training_loss.eval()
 
   def test_eval_labels_none(self):
     """Tests that error is raised when labels is None."""
@@ -390,12 +399,13 @@ class MultiLabelHead(test.TestCase):
 
   def _test_eval(
       self, head, logits, labels, expected_loss, expected_metrics,
-      features=None):
+      features=None, regularization_losses=None):
     spec = head.create_estimator_spec(
         features=features or {},
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
-        labels=labels)
+        labels=labels,
+        regularization_losses=regularization_losses)
 
     # Assert spec contains expected tensors.
     self.assertIsNotNone(spec.loss)
@@ -477,6 +487,38 @@ class MultiLabelHead(test.TestCase):
         expected_loss=expected_loss,
         expected_metrics=expected_metrics)
 
+  def test_eval_with_regularization_losses(self):
+    n_classes = 2
+    head = head_lib.multi_label_head(
+        n_classes, loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+    logits = np.array([[-1., 1.], [-1.5, 1.5]], dtype=np.float32)
+    labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
+    regularization_losses = [1.5, 0.5]
+    expected_regularization_loss = 2.
+    # unregularized_loss = sum(
+    #     labels * -log(sigmoid(logits)) +
+    #     (1 - labels) * -log(1 - sigmoid(logits))) / batch_size
+    expected_unregularized_loss = np.sum(
+        _sigmoid_cross_entropy(labels=labels, logits=logits)) / 2.
+    expected_regularized_loss = (
+        expected_unregularized_loss + expected_regularization_loss)
+    keys = metric_keys.MetricKeys
+    expected_metrics = {
+        keys.LOSS_MEAN: expected_unregularized_loss,
+        keys.LOSS_REGULARIZATION: expected_regularization_loss,
+        # auc and auc_pr cannot be reliably calculated for only 4 samples, but
+        # this assert tests that the algorithm remains consistent.
+        keys.AUC: 0.3333,
+        keys.AUC_PR: 0.7639,
+    }
+    self._test_eval(
+        head=head,
+        logits=logits,
+        labels=labels,
+        expected_loss=expected_regularized_loss,
+        expected_metrics=expected_metrics,
+        regularization_losses=regularization_losses)
+
   def test_eval_with_label_vocabulary(self):
     n_classes = 2
     head = head_lib.multi_label_head(
@@ -618,12 +660,44 @@ class MultiLabelHead(test.TestCase):
     # For large logits, this is approximated as:
     # loss = labels * (logits < 0) * (-logits) +
     #        (1 - labels) * (logits > 0) * logits
-    expected_weighted_sum_loss = np.sum(
-        np.array(
-            [[1. * (10. + 10.) / 2.], [2. * (15. + 0.) / 2.]],
-            dtype=np.float32))
-    expected_example_weight_sum = 1. + 2.
-    actual_weighted_sum_loss, actual_example_weight_sum, _ = head.create_loss(
+    expected_unreduced_loss = [[(10. + 10.) / 2.], [(15. + 0.) / 2.]]
+    expected_weights = [[1.], [2.]]
+    expected_training_loss = 1. * (10. + 10.) / 2. + 2. * (15. + 0.) / 2.
+    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
+        features={
+            'x': np.array(((42,),), dtype=np.int32),
+            'example_weights': weights
+        },
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels)
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      self.assertAllClose(
+          expected_training_loss, training_loss.eval(), atol=1e-4)
+      self.assertAllClose(
+          expected_unreduced_loss, unreduced_loss.eval(), atol=1e-4)
+      self.assertAllClose(expected_weights, actual_weights.eval())
+
+  def test_train_create_loss_loss_reduction(self):
+    """Tests head.create_loss with loss_reduction."""
+    n_classes = 2
+    head = head_lib.multi_label_head(
+        n_classes, weight_column='example_weights',
+        loss_reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+
+    logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
+    labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
+    weights = np.array([[1.], [2.]], dtype=np.float32)
+    # loss = labels * -log(sigmoid(logits)) +
+    #        (1 - labels) * -log(1 - sigmoid(logits))
+    # For large logits, this is approximated as:
+    # loss = labels * (logits < 0) * (-logits) +
+    #        (1 - labels) * (logits > 0) * logits
+    expected_unreduced_loss = [[(10. + 10.) / 2.], [(15. + 0.) / 2.]]
+    expected_weights = [[1.], [2.]]
+    expected_training_loss = (1. * (10. + 10.) / 2. + 2. * (15. + 0.) / 2.) / 2.
+    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
         features={
             'x': np.array(((42,),), dtype=np.int32),
             'example_weights': weights
@@ -634,13 +708,10 @@ class MultiLabelHead(test.TestCase):
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_weighted_sum_loss,
-          actual_weighted_sum_loss.eval(),
-          atol=1e-4)
+          expected_training_loss, training_loss.eval(), atol=1e-4)
       self.assertAllClose(
-          expected_example_weight_sum,
-          actual_example_weight_sum.eval(),
-          atol=1e-4)
+          expected_unreduced_loss, unreduced_loss.eval(), atol=1e-4)
+      self.assertAllClose(expected_weights, actual_weights.eval())
 
   def test_train_labels_none(self):
     """Tests that error is raised when labels is None."""
@@ -791,6 +862,49 @@ class MultiLabelHead(test.TestCase):
     self._test_train(
         head=head, logits=logits, labels=labels, expected_loss=expected_loss)
 
+  def test_train_with_regularization_losses(self):
+    head = head_lib.multi_label_head(
+        n_classes=2, loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+    logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
+    labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
+    regularization_losses = [1.5, 0.5]
+    # For large logits, sigmoid cross entropy loss is approximated as:
+    # loss = labels * (logits < 0) * (-logits) +
+    #        (1 - labels) * (logits > 0) * logits =>
+    # expected_unweighted_loss = [[10., 10.], [15., 0.]]
+    # Average over classes and over batch and add regularization loss.
+    expected_loss = 35. / 4. + 2.
+    expected_summaries = {
+        metric_keys.MetricKeys.LOSS: expected_loss,
+        metric_keys.MetricKeys.LOSS_REGULARIZATION: 2.,
+    }
+    expected_train_result = 'my_train_op'
+    def _train_op_fn(loss):
+      return string_ops.string_join(
+          [constant_op.constant(expected_train_result),
+           string_ops.as_string(loss, precision=3)])
+
+    spec = head.create_estimator_spec(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_train_op_fn,
+        regularization_losses=regularization_losses)
+
+    # Assert predictions, loss, train_op, and summaries.
+    tol = 1e-3
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNotNone(spec.scaffold.summary_op)
+      loss, train_result, summary_str = sess.run((spec.loss, spec.train_op,
+                                                  spec.scaffold.summary_op))
+      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
+      self.assertEqual(
+          six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
+          train_result)
+      _assert_simple_summaries(self, expected_summaries, summary_str, tol)
+
   def test_train_with_weights(self):
     n_classes = 2
     head = head_lib.multi_label_head(n_classes, weight_column='example_weights')
@@ -851,12 +965,15 @@ class MultiLabelHead(test.TestCase):
     labels = np.array([[[1, 0, 0], [1, 0, 0]],
                        [[0, 1, 1], [0, 1, 1]]], dtype=np.int64)
     weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
-    # loss = [[10 + 10 + 0, 0 + 0 + 10], [0 + 0 + 12, 12 + 12 + 0]] / 3
-    #      = [[20/3, 10/3], [4, 8]]
+    # unreduced_loss =
+    #     [[10 + 10 + 0, 0 + 0 + 10], [0 + 0 + 12, 12 + 12 + 0]] / 3
+    #   = [[20/3, 10/3], [4, 8]]
+    expected_unreduced_loss = [[[20./3.], [10./3.]], [[4.], [8.]]]
+    # weights are reshaped to [2, 2, 1] to match logits.
+    expected_weights = [[[1.], [1.5]], [[2.], [2.5]]]
     # weighted_sum_loss = 1*20/3 + 1.5*10/3 + 2*4 + 2.5*8 = 39.6667
-    expected_weighted_sum_loss = 39.6667
-    expected_example_weight_sum = np.sum(weights)
-    actual_weighted_sum_loss, actual_example_weight_sum, _ = head.create_loss(
+    expected_training_loss = 39.6667
+    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
         features={'weights': weights},
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
@@ -865,11 +982,10 @@ class MultiLabelHead(test.TestCase):
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_weighted_sum_loss, actual_weighted_sum_loss.eval(),
-          atol=atol)
+          expected_training_loss, training_loss.eval(), atol=atol)
       self.assertAllClose(
-          expected_example_weight_sum, actual_example_weight_sum.eval(),
-          atol=atol)
+          expected_unreduced_loss, unreduced_loss.eval(), atol=atol)
+      self.assertAllClose(expected_weights, actual_weights.eval())
 
   def test_multi_dim_weighted_train(self):
     """Logits and labels of shape [2, 2, 3], weights [2, 2]."""
diff --git a/tensorflow/contrib/estimator/python/estimator/multi_head.py b/tensorflow/contrib/estimator/python/estimator/multi_head.py
index f2a6eae03ec021e5c28d48b3887870d8a057e077..0346ddc24bffd61068177f4622bd03be4acd53d9 100644
--- a/tensorflow/contrib/estimator/python/estimator/multi_head.py
+++ b/tensorflow/contrib/estimator/python/estimator/multi_head.py
@@ -186,40 +186,44 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
       logits_dict = logits
     else:
       logits_dict = self._split_logits(logits)
-    weighted_sum_losses = []
-    example_weight_sums = []
+    training_losses = []
     labels_by_head = {}
-    for head in self._heads:
-      (weighted_sum_loss,
-       example_weight_sum, processed_labels) = head.create_loss(
+    unreduced_losses_by_head = {}
+    example_weights_by_head = {}
+    for i, head in enumerate(self._heads):
+      (training_loss, unreduced_loss,
+       weights, processed_labels) = head.create_loss(
            features, mode, logits_dict[head.name], labels[head.name])
-      weighted_sum_losses.append(weighted_sum_loss)
-      example_weight_sums.append(example_weight_sum)
+      training_losses.append(training_loss)
       labels_by_head[head.name] = processed_labels
+      if self._head_weights:
+        head_weight = self._head_weights[i]
+        unreduced_losses_by_head[head.name] = math_ops.multiply(
+            unreduced_loss, head_weight)
+        example_weights_by_head[head.name] = math_ops.multiply(
+            weights, head_weight)
+      else:
+        unreduced_losses_by_head[head.name] = unreduced_loss
+        example_weights_by_head[head.name] = weights
 
-    weighted_sum_losses = tuple(weighted_sum_losses)
-    with ops.name_scope('merge_losses',
-                        values=weighted_sum_losses + (self._head_weights or
-                                                      tuple())):
+    training_losses = tuple(training_losses)
+    with ops.name_scope(
+        'merge_losses',
+        values=training_losses + (self._head_weights or tuple())):
       if self._head_weights:
-        head_weighted_losses = []
-        head_weighted_example_weight_sums = []
-        for loss, example_weight_sum, weight in zip(weighted_sum_losses,
-                                                    example_weight_sums,
-                                                    self._head_weights):
-          head_weighted_losses.append(math_ops.multiply(loss, weight))
-          head_weighted_example_weight_sums.append(math_ops.multiply(
-              example_weight_sum, weight))
-        merged_weighted_sum_loss = math_ops.add_n(head_weighted_losses)
-        merged_example_weight_sum = math_ops.add_n(
-            head_weighted_example_weight_sums)
+        head_weighted_training_losses = []
+        for training_loss, head_weight in zip(
+            training_losses, self._head_weights):
+          head_weighted_training_losses.append(
+              math_ops.multiply(training_loss, head_weight))
+        merged_training_loss = math_ops.add_n(head_weighted_training_losses)
       else:
-        merged_weighted_sum_loss = math_ops.add_n(weighted_sum_losses)
-        merged_example_weight_sum = math_ops.add_n(example_weight_sums)
+        merged_training_loss = math_ops.add_n(training_losses)
 
     return head_lib.LossSpec(
-        weighted_sum_loss=merged_weighted_sum_loss,
-        example_weight_sum=merged_example_weight_sum,
+        training_loss=merged_training_loss,
+        unreduced_loss=unreduced_losses_by_head,
+        weights=example_weights_by_head,
         processed_labels=labels_by_head)
 
   def create_estimator_spec(
diff --git a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
index 68f2d5d1cd53456f7dd82222e171b3619052321a..65ea89ba1b9236d0bf4d2de430fab168ef50bf97 100644
--- a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
@@ -370,7 +370,7 @@ class MultiHeadTest(test.TestCase):
         'head1': np.array([[1, 0], [1, 1]], dtype=np.int64),
         'head2': np.array([[0, 1, 0], [1, 1, 0]], dtype=np.int64),
     }
-    weighted_sum_loss, example_weight_sum, _ = multi_head.create_loss(
+    training_loss, unreduced_losses, weights, _ = multi_head.create_loss(
         features={
             'x': np.array(((42,),), dtype=np.int32),
             'weights1': weights1,
@@ -383,14 +383,23 @@ class MultiHeadTest(test.TestCase):
     with self.test_session():
       # loss of the first head is [[(10 + 10) / 2], [(15 + 0) / 2]]
       # = [10, 7.5]
-      # weighted_sum_loss = 1 * 10 + 2 * 7.5 = 25
+      # training_loss = 1 * 10 + 2 * 7.5 = 25
+      # head-weighted unreduced_loss = 1 * [10, 7.5]
+      self.assertAllClose(
+          [[10.], [7.5]], unreduced_losses['head1'].eval(), rtol=tol, atol=tol)
       # loss of the second head is [[(20 + 20 + 20) / 3], [(30 + 0 + 0) / 3]]
       # = [20, 10]
-      # weighted_sum_loss = 2 * 20 + 3 * 10 = 70
-      # head-weighted merge = 1 * 25 + 2 * 70 = 165
-      self.assertAllClose(165, weighted_sum_loss.eval(), rtol=tol, atol=tol)
-      # example_weight_sum = 1 * (1 + 2) + 2 * (2 + 3) = 13
-      self.assertAllClose(13., example_weight_sum.eval(), rtol=tol, atol=tol)
+      # training_loss = 2 * 20 + 3 * 10 = 70
+      # head-weighted unreduced_loss = 2 * [20, 10]
+      self.assertAllClose(
+          [[40.], [20.]], unreduced_losses['head2'].eval(), rtol=tol, atol=tol)
+      # head-weighted training_loss = 1 * 25 + 2 * 70 = 165
+      self.assertAllClose(165, training_loss.eval(), rtol=tol, atol=tol)
+      # head-weighted example weights
+      self.assertAllClose(
+          [[1.], [2.]], weights['head1'].eval(), rtol=tol, atol=tol)
+      self.assertAllClose(
+          [[4.], [6.]], weights['head2'].eval(), rtol=tol, atol=tol)
 
   def test_train_create_loss_logits_tensor(self):
     """Tests create_loss with logits Tensor."""
@@ -409,7 +418,7 @@ class MultiHeadTest(test.TestCase):
         'head1': np.array([[1, 0], [1, 1]], dtype=np.int64),
         'head2': np.array([[0, 1, 0], [1, 1, 0]], dtype=np.int64),
     }
-    weighted_sum_loss, example_weight_sum, _ = multi_head.create_loss(
+    training_loss, unreduced_losses, weights, _ = multi_head.create_loss(
         features={
             'x': np.array(((42,),), dtype=np.int32),
             'weights1': weights1,
@@ -422,14 +431,23 @@ class MultiHeadTest(test.TestCase):
     with self.test_session():
       # loss of the first head is [[(10 + 10) / 2], [(15 + 0) / 2]]
       # = [10, 7.5]
-      # weighted_sum_loss = 1 * 10 + 2 * 7.5 = 25
+      # training_loss = 1 * 10 + 2 * 7.5 = 25
+      # head-weighted unreduced_loss = 1 * [10, 7.5]
+      self.assertAllClose(
+          [[10.], [7.5]], unreduced_losses['head1'].eval(), rtol=tol, atol=tol)
       # loss of the second head is [[(20 + 20 + 20) / 3], [(30 + 0 + 0) / 3]]
       # = [20, 10]
-      # weighted_sum_loss = 2 * 20 + 3 * 10 = 70
-      # head-weighted merge = 1 * 25 + 2 * 70 = 165
-      self.assertAllClose(165, weighted_sum_loss.eval(), rtol=tol, atol=tol)
-      # example_weight_sum = 1 * (1 + 2) + 2 * (2 + 3) = 13
-      self.assertAllClose(13., example_weight_sum.eval(), rtol=tol, atol=tol)
+      # training_loss = 2 * 20 + 3 * 10 = 70
+      # head-weighted unreduced_loss = 2 * [20, 10]
+      self.assertAllClose(
+          [[40.], [20.]], unreduced_losses['head2'].eval(), rtol=tol, atol=tol)
+      # head-weighted training_loss = 1 * 25 + 2 * 70 = 165
+      self.assertAllClose(165, training_loss.eval(), rtol=tol, atol=tol)
+      # head-weighted example weights
+      self.assertAllClose(
+          [[1.], [2.]], weights['head1'].eval(), rtol=tol, atol=tol)
+      self.assertAllClose(
+          [[4.], [6.]], weights['head2'].eval(), rtol=tol, atol=tol)
 
   def test_train_create_loss_logits_tensor_multi_dim(self):
     """Tests create_loss with multi-dimensional logits of shape [2, 2, 5]."""
@@ -455,20 +473,17 @@ class MultiHeadTest(test.TestCase):
     # loss2 = (0-2)^2 + (1+2)^2 + (0-2)^2 + (0-2)^2 + (1+2)^2 + (0-2)^2 +
     #         (2+2)^2 + (2-2)^2 + (0+2)^2 + (2+2)^2 + (2-2)^2 + (0+2)^2
     #       = 74
-    expected_weighted_sum_loss = 28. + 74.
+    expected_training_loss = 28. + 74.
 
-    weighted_sum_loss, example_weight_sum, _ = multi_head.create_loss(
+    training_loss = multi_head.create_loss(
         features={},
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
-        labels=labels)
+        labels=labels)[0]
     tol = 1e-3
     with self.test_session():
       self.assertAllClose(
-          expected_weighted_sum_loss, weighted_sum_loss.eval(),
-          rtol=tol, atol=tol)
-      self.assertAllClose(
-          2. * 2. * 5., example_weight_sum.eval(), rtol=tol, atol=tol)
+          expected_training_loss, training_loss.eval(), rtol=tol, atol=tol)
 
   def test_train_one_head(self):
     head1 = head_lib.multi_label_head(n_classes=2, name='head1')
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
index d9c83aa86577aa129458c56887ff4668c103d0db..7134cd3f5a457a322f51066eb791133c3181d3fb 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
@@ -23,6 +23,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from collections import defaultdict
+from contextlib import contextmanager
 import copy
 
 import six
@@ -41,20 +43,24 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import tf_logging
-from tensorflow.python.training import training_util
+from tensorflow.python.training import device_setter as device_setter_lib
+from tensorflow.python.training import optimizer as optimizer_lib
 
 
-def replicate_model_fn(model_fn, optimizer_fn, devices=None):
-  """Replicate `Estimator.model_fn` over GPUs within a single host.
+def replicate_model_fn(model_fn,
+                       loss_reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS,
+                       devices=None):
+  """Replicate `Estimator.model_fn` over GPUs.
 
   The given `model_fn` specifies a single forward pass of a model.  To replicate
   such a model over GPUs, each GPU gets its own instance of the forward pass
   (a.k.a. a tower).  The input features and labels get sharded into the chunks
-  that correspond to the number of GPUs.  Each tower computes its own loss based
+  that correspond to the number of GPUs.  Each tower computes a loss based
   on its input.  For each such loss, gradients are computed.  After that, the
-  available losses are summed to form aggregated loss.  The available
-  gradients are summed too.  Then, they update weights using the specified
+  available losses are aggregated to form aggregated loss.  Available
+  gradients are summed.  Then, they update weights using the specified
   optimizer.
 
   If `devices` are `None`, then all available GPUs are going to be used for
@@ -63,36 +69,38 @@ def replicate_model_fn(model_fn, optimizer_fn, devices=None):
 
   Two modes of local replication over available GPUs are supported:
     1)  If exactly 1 GPU is detected, then variables and operations are placed
-        onto GPU.
+        onto the GPU.
     2)  If more than 1 GPU is detected, then variables are going to be placed on
         the CPU.  Replicas of operations are placed on each individual GPU.
 
   Here is an example of how one might use their `model_fn` to run over GPUs:
     ```python
-       def optimizer_fn():
-         return tf.train.GradientDescentOptimizer(learning_rate=0.001)
        ...
        def model_fn(...):  # See `model_fn` in `Estimator`.
          loss = ...
+         optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
+         optimizer = tf.contrib.estimator.TowerOptimizer(optimizer)
          if mode == tf.estimator.ModeKeys.TRAIN:
            #  See the section below on `EstimatorSpec.train_op`.
-           return EstimatorSpec(mode=mode, loss=loss, train_op=tf.noop())
+           return EstimatorSpec(mode=mode, loss=loss,
+                                train_op=optimizer.minimize(loss))
 
          #  No change for `ModeKeys.EVAL` or `ModeKeys.PREDICT`.
          return EstimatorSpec(...)
        ...
        classifier = tf.estimator.Estimator(
-         model_fn=replicate_model_fn.replicate_model_fn(model_fn, optimizer_fn))
+         model_fn=tf.contrib.estimator.replicate_model_fn(model_fn))
     ```
 
+  Please see `DNNClassifierIntegrationTest` for an example with a canned
+  Estimator.
+
   On `EstimatorSpec.train_op`:
   `model_fn` returns `EstimatorSpec.train_op` for
   `tf.estimator.GraphKeys.TRAIN`. It is typically derived using an optimizer.
-  `replicate_model_fn` ignores the returned `EstimatorSpec.train_op`, so there
-  is no need to use an optimizer inside the user's `model_fn`.  The
-  `EstimatorSpec.loss` subgraph is going to be executed, while
-  `EstimatorSpec.train_op` isn't going to be executed. One could pass
-  `train_op=tf.noop()` to `EstimatorSpec`.
+  Towers are expected to populate it in the same way.  Gradients from all towers
+  are reduced and applied in the last tower.  To achieve that in the case of
+  multiple towers, `TowerOptimizer` needs to be used.  See `TowerOptimizer`.
 
   On sharding input features and labels:
   Input features and labels are split for consumption by each tower. They are
@@ -101,7 +109,7 @@ def replicate_model_fn(model_fn, optimizer_fn, devices=None):
   On reduction algorithms:
   Certain algorithms were chosen for aggregating results of computations on
   multiple towers:
-    - Losses from all towers are reduced using sum.
+    - Losses from all towers are reduced according to `loss_reduction`.
     - Gradients are reduced using sum for each trainable variable.
     - `eval_metrics_ops` are reduced per metric using `reduce_mean`.
     - `EstimatorSpec.predictions` and `EstimatorSpec.export_outputs` are
@@ -109,65 +117,332 @@ def replicate_model_fn(model_fn, optimizer_fn, devices=None):
     - For all other fields of `EstimatorSpec` the values of the first tower
       are taken.
 
-  On replication of variables:
+  On distribution of variables:
   Variables are not duplicated between towers.  Instead, they are placed on a
   single device as defined above and shared across towers.
 
-  Other current limitations:
-    - `predictions` are not supported for `ModeKeys.EVAL`.  That is required for
-      `tf.contrib.estimator.add_metrics`.
+  On overhead:
+  If only one device is specified, then aggregation of loss and gradients
+  doesn't happen. Replication consists of placing `model_fn` onto the
+  specified device.
+
+  On current limitations:
+    - `predictions` are not supported for `ModeKeys.EVAL`.  They are required
+       for `tf.contrib.estimator.add_metrics`.
 
   Args:
     model_fn: `model_fn` as defined in `Estimator`.  See the section above about
       the train_op argument of `EstimatorSpec`.
-    optimizer_fn: a function that returns an optimizer instance.  The function
-      may accept one `params` argument.  This is the `params` argument as
-      defined by `Estimator`.  See  the `Estimator` documentation for details.
+    loss_reduction: controls whether losses are summed or averaged.
     devices: Optional list of devices to replicate the model across.  This
       argument can be used to replice only on the subset of available GPUs.
       If `None`, then all available GPUs are going to be used for replication.
       If no GPUs are available, then the model is going to be placed on the CPU.
 
+  Raises:
+    ValueError: if there is no `loss_reduction` or if TowerOptimizer is
+      mis-used.
+
   Returns:
     A replicated version of the supplied `model_fn`. Returned function that
       conforms to the requirements of `Estimator`'s `model_fn` and can be used
       instead of the supplied `model_fn`.
   """
+  return _replicate_model_fn_with_mode(
+      model_fn,
+      loss_reduction,
+      devices,
+      # TODO(isaprykin): Query the system configuration to choose modes other
+      # than `SHARED_LOCAL_PARAMETER_SERVER`, even though it is often
+      # appropriate.
+      mode=_VariableDistributionMode.SHARED_LOCAL_PARAMETER_SERVER)
+
+
+class _VariableDistributionMode(object):
+  """Modes for variable distribution used for forcing a particular one.
+
+  Forcing a mode is meant for performance experimentation purposes rather than
+  for general use cases.
+  """
+
+  SHARED_LOCAL_PARAMETER_SERVER = 1
+  """Variables are placed on a single device and shared across all devices.
+
+  Two ways to achieve this distribution over available GPUs are supported:
+    1)  If exactly 1 GPU is detected, then variables and operations are placed
+        onto GPU.
+    2)  If more than 1 GPU is detected, then variables are going to be placed on
+        the CPU.  Replicas of operations are placed on each individual GPU.
+  """
+
+  SHARED_ROUND_ROBIN = 2
+  """Variables are placed on all devices in a round-robin fashion.
+
+  Every subsequent variable is placed on the next device.  There is only one
+  copy of each variable that is shared across all devices.
+  """
+
+
+def _replicate_model_fn_with_mode(
+    model_fn,
+    loss_reduction,
+    devices=None,
+    mode=_VariableDistributionMode.SHARED_LOCAL_PARAMETER_SERVER):
+  """A version of `replicate_model_fn` that allows to specify a `mode`."""
+  if loss_reduction == losses.Reduction.NONE:
+    raise ValueError('Tower losses need to be reduced in some way, yet {} '
+                     'reduction is specified.'.format(loss_reduction))
   if not devices:
     devices = _get_local_devices('GPU') or _get_local_devices('CPU')
 
-  is_a_single_gpu_case = len(devices) == 1 and 'GPU' in devices[0]
-  local_ps_device = '/{}:0'.format('GPU' if is_a_single_gpu_case else 'CPU')
+  is_a_single_gpu_case = len(devices) == 1 and 'GPU' in devices[0].upper()
+  consolidation_device = devices[0] if is_a_single_gpu_case else '/CPU:0'
 
-  tf_logging.info('Replicating the `model_fn` across {}.  Local parameter '
-                  'server device is going to be {}.'.format(
-                      devices, local_ps_device))
+  ps_devices = [consolidation_device]
+  if mode == _VariableDistributionMode.SHARED_ROUND_ROBIN:
+    ps_devices = devices
+
+  tf_logging.info('Replicating the `model_fn` across {}.  Variables are going '
+                  'to be placed on {}.  Consolidation device is going to be {}.'
+                  .format(devices, ps_devices, consolidation_device))
+
+  def single_device_model_fn(features, labels, mode, params=None, config=None):
+    """`model_fn` on a single device without reduction overhead."""
+    return _get_loss_towers(
+        model_fn=model_fn,
+        mode=mode,
+        features=[features],
+        labels=[labels],
+        params=params,
+        loss_reduction=loss_reduction,
+        config=config,
+        devices=devices,
+        local_ps_devices=ps_devices)[0]  # One device, so one spec is out.
 
   def replicated_model_fn(features, labels, mode, params=None, config=None):
     """Replicated version of `model_fn` to be used instead."""
     feature_shards, label_shards = _split_batch(
-        features, labels, len(devices), device=local_ps_device)
+        features, labels, len(devices), device=consolidation_device)
     tower_specs = _get_loss_towers(
         model_fn=model_fn,
         mode=mode,
         features=feature_shards,
         labels=label_shards,
         params=params,
+        loss_reduction=loss_reduction,
         config=config,
         devices=devices,
-        local_ps_device=local_ps_device)
+        local_ps_devices=ps_devices)
 
     if mode == model_fn_lib.ModeKeys.TRAIN:
-      train_op = _minimize_towers(tower_specs,
-                                  _call_optimizer_fn(optimizer_fn, params))
+      train_op = _minimize_towers(tower_specs)
       return _train_spec(
-          tower_specs, train_op, aggregation_device=local_ps_device)
+          tower_specs, train_op, aggregation_device=consolidation_device)
     elif mode == model_fn_lib.ModeKeys.EVAL:
-      return _eval_spec(tower_specs, aggregation_device=local_ps_device)
+      return _eval_spec(tower_specs, aggregation_device=consolidation_device)
     elif mode == model_fn_lib.ModeKeys.PREDICT:
-      return _predict_spec(tower_specs, aggregation_device=local_ps_device)
+      return _predict_spec(tower_specs, aggregation_device=consolidation_device)
+
+  if len(devices) == 1:
+    return single_device_model_fn
+  else:
+    return replicated_model_fn
+
+
+class TowerOptimizer(optimizer_lib.Optimizer):
+  """Gathers gradients from all towers and reduces them in the last one."""
+
+  COLLECTION_FOR_GRAPH_STATES = 'replicate_model_fn_graph_states'
 
-  return replicated_model_fn
+  def __init__(self, optimizer_or_optimizer_fn):
+    """Wrap an existing optimizer for gathering gradients across towers.
+
+    Each invocation of model_fn has to call the same optimizers in the same
+    order.
+
+    Multiple optimizers that use the same or different losses are supported.
+
+    If TowerOptimizer is used but `replicate_model_fn` isn't, then no
+    aggregation will happen.  All calls will simply be forwarded to the
+    underlying optimizer. The behavior is similar if there is only one tower.
+
+    If TowerOptimizer is used together with SyncReplicasOptimizer that wraps
+    the user's optimizer, then it's the SyncReplicasOptimizer that needs to be
+    wrapped with TowerOptimizer.
+
+    Args:
+      optimizer_or_optimizer_fn: an instance of optimizer to wrap.  That
+        instance is going to be used for optimizer-specific logic.  This can
+        also be a no-argument function that returns such an optimizer instance.
+    """
+    self._optimizer_or_optimizer_fn = optimizer_or_optimizer_fn
+
+  @staticmethod
+  def has_been_used():
+    return TowerOptimizer._graph_state().has_tower_optimizer_been_used
+
+  def get_slot(self, *args, **kwargs):
+    return self._get_optimizer().get_slot(*args, **kwargs)
+
+  def get_slot_names(self, *args, **kwargs):
+    return self._get_optimizer().get_slot_names(*args, **kwargs)
+
+  def get_name(self, *args, **kwargs):
+    return self._get_optimizer().get_name(*args, **kwargs)
+
+  def variables(self, *args, **kwargs):
+    return self._get_optimizer().variables(*args, **kwargs)
+
+  def compute_gradients(self, loss, *args, **kwargs):
+    """Compute gradients, but first, if needed, scale the loss."""
+    loss = _scale_loss(loss,
+                       self._graph_state().loss_reduction,
+                       self._graph_state().number_of_towers)
+    return self._get_optimizer().compute_gradients(loss, *args, **kwargs)
+
+  def apply_gradients(self, grads_and_vars, global_step=None, **kwargs):
+    """Collect gradients updates to apply them with the last tower."""
+    if self._graph_state().number_of_towers == 1:
+      # Avoid the overhead of reduction if there's only one tower.
+      #
+      # There assumed to be only one tower if aggregation-related methods were
+      # not called by `_get_loss_towers`, for example if the model_fn uses
+      # TowerEstimator, but `replicate_model_fn` isn't used.
+      return self._get_optimizer().apply_gradients(grads_and_vars, global_step,
+                                                   **kwargs)
+
+    self._graph_state().collect_gradients(grads_and_vars)
+
+    if not self._graph_state().is_the_last_tower:
+      with ops_lib.control_dependencies(_extract_tensors(grads_and_vars)):
+        return self._construct_no_op_train_op()
+    else:
+      # Gradients need to be gathered and applied in the scope of the first
+      # tower, so that the tensors are accessible via names without prefixes.
+      var_scope, name_scope = self._graph_state().scopes_of_the_first_tower
+      with variable_scope.variable_scope(var_scope):
+        with ops_lib.name_scope(name_scope):
+          return self._apply_gathered_gradients(global_step, **kwargs)
+
+  def _apply_gathered_gradients(self, global_step, **kwargs):
+    graph_state = self._graph_state()
+    optimizer = self._get_optimizer()
+
+    grad_lists = {}
+    for grad, var in graph_state.get_latest_gradients_from_all_towers():
+      if grad is not None:
+        grad_lists.setdefault(var, []).append(grad)
+
+    aggregated_grads = []
+    with ops_lib.name_scope('gradient_aggregating'):
+      for var, grads in six.iteritems(grad_lists):
+        grad = _compute_sum_on_device(grads, var.device)
+        aggregated_grads.append((grad, var))
+    return optimizer.apply_gradients(
+        aggregated_grads, global_step=global_step, **kwargs)
+
+  def _get_optimizer(self):
+    if callable(self._optimizer_or_optimizer_fn):
+      # If optimizer is given as a function then we need to wait till we are
+      # under the right graph context before constructing it.  That's why the
+      # optimizer is constructed in _get_optimizer() rather than __init__().
+      self._optimizer_or_optimizer_fn = self._optimizer_or_optimizer_fn()
+    self._graph_state().has_tower_optimizer_been_used = True
+    return self._optimizer_or_optimizer_fn
+
+  def _construct_no_op_train_op(self):
+    return control_flow_ops.no_op(name='train_op_placeholder')
+
+  @staticmethod
+  def _graph_state():
+    graph_states = ops_lib.get_default_graph().get_collection_ref(
+        TowerOptimizer.COLLECTION_FOR_GRAPH_STATES)
+    if not graph_states:
+      graph_states.append(TowerOptimizer._PerGraphState())
+    return graph_states[-1]
+
+  @staticmethod
+  def _did_towers_have_same_optimizer_calls():
+    graph_state = TowerOptimizer._graph_state()
+    return graph_state.did_towers_have_same_optimizer_calls()
+
+  @staticmethod
+  def _clear_graph_state():
+    # Clearing the Graph collection will prevent _PerGraphState from being
+    # serialized.
+    ops_lib.get_default_graph().clear_collection(
+        TowerOptimizer.COLLECTION_FOR_GRAPH_STATES)
+
+  class _PerGraphState(object):
+    """Gradient reduction related state of a Tensorflow graph."""
+
+    def __init__(self):
+      self._collected_grads_and_vars = defaultdict(list)
+      self._current_tower_index = 0
+      self._number_of_towers = 1
+      self._loss_reduction = None
+      # Scopes of the first tower that don't have a prefix:
+      self._variable_scope = None
+      self._name_scope = None
+      # If needed, alert that TowerOptimizer needs to be used with model_fn.
+      self._has_tower_optimizer_been_used = False
+
+    def collect_gradients(self, grads_and_vars):
+      self._collected_grads_and_vars[self._current_tower_index].append(
+          grads_and_vars)
+
+    def get_latest_gradients_from_all_towers(self):
+      """Get gradients across towers for the last called optimizer."""
+      grads_and_vars = []
+      index_of_last_gradients = len(
+          self._collected_grads_and_vars[self._current_tower_index]) - 1
+      for tower_id in range(self._current_tower_index + 1):
+        grads_and_vars.extend(
+            self._collected_grads_and_vars[tower_id][index_of_last_gradients])
+      return grads_and_vars
+
+    def set_reduction_across_towers(self, loss_reduction, number_of_towers):
+      self._loss_reduction = loss_reduction
+      self._number_of_towers = number_of_towers
+
+    @contextmanager
+    def tower(self, tower_id, var_scope, name_scope):
+      if tower_id == 0:
+        self._variable_scope = var_scope
+        self._name_scope = name_scope
+      self._current_tower_index = tower_id
+      yield
+
+    @property
+    def scopes_of_the_first_tower(self):
+      return self._variable_scope, self._name_scope
+
+    @property
+    def is_the_last_tower(self):
+      return self._current_tower_index == (self._number_of_towers - 1)
+
+    @property
+    def number_of_towers(self):
+      return self._number_of_towers
+
+    @property
+    def loss_reduction(self):
+      return self._loss_reduction
+
+    @property
+    def has_tower_optimizer_been_used(self):
+      return self._has_tower_optimizer_been_used
+
+    @has_tower_optimizer_been_used.setter
+    def has_tower_optimizer_been_used(self, value):
+      self._has_tower_optimizer_been_used = value
+
+    def did_towers_have_same_optimizer_calls(self):
+      total_number_of_grads = sum([
+          len(grads)
+          for _, grads in six.iteritems(self._collected_grads_and_vars)
+      ])
+      return total_number_of_grads % self._number_of_towers == 0
 
 
 def _get_local_devices(device_type):
@@ -182,6 +457,13 @@ def _get_local_devices(device_type):
 def _split_batch(features, labels, number_of_shards, device):
   """Split input features and labes into batches."""
 
+  def ensure_divisible_by_shards(sequence):
+    batch_size = ops_lib.convert_to_tensor(sequence).get_shape()[0]
+    if batch_size % number_of_shards != 0:
+      raise ValueError(
+          'Batch size {} needs to be divisible by the number of GPUs, which '
+          'is {}.'.format(batch_size, number_of_shards))
+
   def split_dictionary(dictionary):
     """Split a dictionary into shards."""
     shards = [{} for _ in range(number_of_shards)]
@@ -192,6 +474,7 @@ def _split_batch(features, labels, number_of_shards, device):
                 sp_input=tensor, num_split=number_of_shards, axis=0)):
           shards[i][name] = shard
       else:
+        ensure_divisible_by_shards(tensor)
         for i, shard in enumerate(array_ops.split(tensor, number_of_shards)):
           shards[i][name] = shard
     return shards
@@ -201,6 +484,7 @@ def _split_batch(features, labels, number_of_shards, device):
       if isinstance(features, dict):
         feature_shards = split_dictionary(features)
       else:
+        ensure_divisible_by_shards(features)
         feature_shards = array_ops.split(features, number_of_shards)
 
       if labels is None:
@@ -208,6 +492,7 @@ def _split_batch(features, labels, number_of_shards, device):
       elif isinstance(labels, dict):
         label_shards = split_dictionary(labels)
       else:
+        ensure_divisible_by_shards(labels)
         label_shards = array_ops.split(labels, number_of_shards)
   return feature_shards, label_shards
 
@@ -222,7 +507,8 @@ def _get_loss_towers(model_fn,
                      params,
                      config,
                      devices,
-                     local_ps_device,
+                     local_ps_devices,
+                     loss_reduction,
                      name_scope_pattern=_DEFAULT_NAME_SCOPE_PATTERN):
   """Replicate the loss computation across devices."""
   tower_specs = []
@@ -234,36 +520,64 @@ def _get_loss_towers(model_fn,
   if 'config' in model_fn_args:
     optional_params['config'] = copy.deepcopy(config)
 
+  # pylint: disable=protected-access
+  round_robin_strategy = device_setter_lib._RoundRobinStrategy(
+      num_tasks=len(local_ps_devices))
+  TowerOptimizer._graph_state().set_reduction_across_towers(
+      loss_reduction, len(devices))
+
   for i, device in enumerate(devices):
     is_the_first_tower = (i == 0)
 
     device_setter = _local_device_setter(
-        worker_device=device, ps_device=local_ps_device)
+        worker_device=device,
+        ps_devices=local_ps_devices,
+        ps_strategy=round_robin_strategy)
 
-    # We would like to preserve the names of the variables and ops that a user
-    # might be relying on. Names with prefix are going to resolve to variables
-    # and ops of the first tower.
+    # We would like to preserve the names of the variables and ops that the user
+    # might be relying on. Names without a prefix are going to resolve to
+    # variables and ops of the first tower.
     name_scope = name_scope_pattern
     if is_the_first_tower:
       name_scope = ''
 
-    with variable_scope.variable_scope('', reuse=not is_the_first_tower):
-      with ops_lib.name_scope(name_scope.format(i)):
-        with ops_lib.device(device_setter):
-          labels_shard = None
-          if labels:
-            labels_shard = labels[i]
-
-          tower_specs.append(
-              model_fn(
-                  mode=mode,
-                  features=features[i],
-                  labels=labels_shard,
-                  **optional_params))
+    with variable_scope.variable_scope(
+        '', reuse=not is_the_first_tower) as var_scope:
+      with ops_lib.name_scope(name_scope.format(i)) as name_scope:
+        with TowerOptimizer._graph_state().tower(
+            tower_id=i, var_scope=var_scope, name_scope=name_scope):
+          with ops_lib.device(device_setter):
+            labels_shard = None
+            if labels:
+              labels_shard = labels[i]
+
+            tower_spec = model_fn(
+                mode=mode,
+                features=features[i],
+                labels=labels_shard,
+                **optional_params)
+
+            if (tower_spec.train_op is not None and len(devices) > 1 and
+                not TowerOptimizer.has_been_used()):
+              raise ValueError('Please wrap optimizers with TowerOptimizer'
+                               ' in order to use replicate_model_fn with'
+                               ' multiple `devices`.')
+
+            # Scaling the loss here doesn't actually affect gradients.  Another
+            # instance of scaling happens inside the TowerOptimizer.
+            tower_spec = _scale_tower_loss(
+                tower_spec, loss_reduction, number_of_towers=len(devices))
+            tower_specs.append(tower_spec)
+
+  if not TowerOptimizer._did_towers_have_same_optimizer_calls():
+    raise ValueError('Each invocation of model_fn was supposed to make the same'
+                     ' optimizer calls.')
+  TowerOptimizer._clear_graph_state()
+  # pylint: enable=protected-access
   return tower_specs
 
 
-def _local_device_setter(ps_device, worker_device):
+def _local_device_setter(worker_device, ps_devices, ps_strategy):
   """A device setter that puts distributes Var/Ops to PS/workers."""
   ps_ops = ['Variable', 'VariableV2', 'VarHandleOp']
 
@@ -273,7 +587,7 @@ def _local_device_setter(ps_device, worker_device):
     node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def
     if node_def.op in ps_ops:
       ps_device_spec = framework_device.DeviceSpec.from_string(
-          '{}'.format(ps_device))
+          '{}'.format(ps_devices[ps_strategy(op)]))
 
       ps_device_spec.merge_from(current_device)
       return ps_device_spec.to_string()
@@ -286,33 +600,33 @@ def _local_device_setter(ps_device, worker_device):
   return local_device_chooser
 
 
-def _minimize_towers(tower_specs, optimizer):
-  """Aggregate and apply gradients for computed losses."""
-  grad_lists = {}
-  for tower_spec in tower_specs:
-    with ops_lib.device(tower_spec.loss.device):
-      for grad, var in optimizer.compute_gradients(tower_spec.loss):
-        if grad is not None:
-          grad_lists.setdefault(var, []).append(grad)
+def _scale_tower_loss(tower_spec, loss_reduction, number_of_towers):
+  """Produce an EstimatorSpec with approproriately scaled loss."""
+  if tower_spec.loss is None:
+    return tower_spec
+
+  estimator_spec = _asdict(tower_spec)
+  estimator_spec['loss'] = _scale_loss(tower_spec.loss, loss_reduction,
+                                       number_of_towers)
+  return model_fn_lib.EstimatorSpec(**estimator_spec)
 
-  aggregated_grads = []
-  with ops_lib.name_scope('gradient_aggregating'):
-    for var, grads in six.iteritems(grad_lists):
-      grad = _compute_sum_on_device(grads, var.device)
-      aggregated_grads.append((grad, var))
 
-  train_op = optimizer.apply_gradients(
-      aggregated_grads, global_step=training_util.get_global_step())
+def _scale_loss(loss, loss_reduction, number_of_towers):
+  """If needed, scale down the loss for averaging loss by summing."""
+  if loss is None:
+    return None
+  if number_of_towers == 1:
+    return loss
 
-  return train_op
+  if loss_reduction != losses.Reduction.SUM:
+    return math_ops.div(loss, 1.0 * number_of_towers, name='averaged_loss')
+  else:
+    return loss
 
 
-def _call_optimizer_fn(optimizer_fn, params):
-  arguments = {}
-  optimizer_fn_arguments = util.fn_args(optimizer_fn)
-  if 'params' in optimizer_fn_arguments:
-    arguments['params'] = params
-  return optimizer_fn(**arguments)
+def _minimize_towers(tower_specs):
+  """`train_op` of the last tower applies aggregated gradients."""
+  return tower_specs[-1].train_op
 
 
 def _compute_sum_on_device(values, device, name=None):
@@ -335,7 +649,12 @@ def _train_spec(tower_specs,
                 aggregation_device,
                 aggregated_loss_name='loss'):
   """Populate replicated EstimatorSpec for `GraphKeys.TRAIN`."""
-  estimator_spec = tower_specs[0]._asdict()
+  # Spec of the last tower is used as the template for the final spec, because
+  # some `EstimatorSpec.training_hooks` rely on calls made in model_fn.  For
+  # example, `SyncReplicasOptimizerHook` validates the
+  # `SyncReplicasOptimizer.apply_gradients` call. `TowerEstimator` makes that
+  # call only in the last tower.
+  estimator_spec = _asdict(tower_specs[-1])
   estimator_spec['mode'] = model_fn_lib.ModeKeys.TRAIN
   estimator_spec['train_op'] = train_op
   estimator_spec['loss'] = _compute_sum_on_device(
@@ -346,7 +665,7 @@ def _train_spec(tower_specs,
 
 def _eval_spec(tower_specs, aggregation_device, aggregated_loss_name='loss'):
   """Populate replicated EstimatorSpec for `GraphKeys.EVAL`."""
-  estimator_spec = tower_specs[0]._asdict()
+  estimator_spec = _asdict(tower_specs[0])
   estimator_spec['mode'] = model_fn_lib.ModeKeys.EVAL
   estimator_spec['loss'] = _compute_sum_on_device(
       [spec.loss for spec in tower_specs], aggregation_device,
@@ -370,7 +689,7 @@ def _eval_spec(tower_specs, aggregation_device, aggregated_loss_name='loss'):
 def _reduce_metric_variables(number_of_towers):
   """Aggregate local variables used in metrics into the first tower."""
   if number_of_towers == 1:
-    return control_flow_ops.no_op()
+    return control_flow_ops.no_op(name='no_eval_metric_reduction')
 
   metric_variables = ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES)
   variables_per_tower = len(metric_variables) // number_of_towers
@@ -414,7 +733,7 @@ def _reduce_metric_variables(number_of_towers):
 
 def _predict_spec(tower_specs, aggregation_device):
   """Populate replicated EstimatorSpec for `GraphKeys.PREDICT`."""
-  estimator_spec = tower_specs[0]._asdict()
+  estimator_spec = _asdict(tower_specs[0])
   estimator_spec['mode'] = model_fn_lib.ModeKeys.PREDICT
 
   with ops_lib.device(aggregation_device):
@@ -465,6 +784,17 @@ def _concat_tensor_dicts(*tensor_dicts):
   }
 
 
+def _extract_tensors(tensors_and_vars):
+  tensors = []
+  for tensor_and_var in tensors_and_vars:
+    tensor, _ = tensor_and_var
+    if isinstance(tensor, ops_lib.IndexedSlices):
+      tensors.append(tensor.values)
+    elif tensor is not None:
+      tensors.append(tensor)
+  return tensors
+
+
 def _dict_concat(*dicts):
   list_dict = {}
   for d in dicts:
@@ -474,3 +804,19 @@ def _dict_concat(*dicts):
     for k, v in six.iteritems(d):
       list_dict.setdefault(k, []).append(v)
   return list_dict
+
+
+def _asdict(namedtuple):
+  """Returns a namedtuple as a dictionary.
+
+  This is required because `_asdict()` in Python 3.x.x is broken in classes
+  that inherit from `collections.namedtuple`. See
+  https://bugs.python.org/issue24931 for more details.
+
+  Args:
+    namedtuple: An object that inherits from `collections.namedtuple`.
+
+  Returns:
+    A dictionary version of the tuple.
+  """
+  return {k: getattr(namedtuple, k) for k in namedtuple._fields}
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
index ffe69f89b4c4d48d329a1aef3aa3cad2b17b3fdf..d46a18aacfcd911c56a9f22dc9581060c7b458a6 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
@@ -37,9 +37,11 @@ from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops as ops_lib
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import losses
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_lib
 from tensorflow.python.ops import variable_scope
@@ -49,15 +51,32 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.summary.writer import writer_cache
+from tensorflow.python.training import adam
+from tensorflow.python.training import device_setter
 from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import training
 
 
+# TODO(isaprykin):  Parametrize all the tests on
+#   replicate_model_fn._VariableDistributionMode when it's supported.
 class DNNClassifierIntegrationTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
     self._model_dir = tempfile.mkdtemp()
 
-  def test_complete_flow(self):
+  def test_complete_flow_with_public_version(self):
+    return self._complete_flow_with_mode(mode=None)
+
+  def test_complete_flow_with_mode_local_ps_server(self):
+    return self._complete_flow_with_mode(
+        replicate_model_fn._VariableDistributionMode.
+        SHARED_LOCAL_PARAMETER_SERVER)
+
+  def test_complete_flow_with_mode_round_robin(self):
+    return self._complete_flow_with_mode(
+        replicate_model_fn._VariableDistributionMode.SHARED_ROUND_ROBIN)
+
+  def _complete_flow_with_mode(self, mode):
     n_classes = 3
     input_dimension = 2
     batch_size = 12
@@ -96,20 +115,30 @@ class DNNClassifierIntegrationTest(test_util.TensorFlowTestCase):
                     0., len(x_data), len(x_data), dtype=np.int64)), 1)
     ]
 
+    def optimizer_fn():
+      return optimizers.get_optimizer_instance('Adagrad', learning_rate=0.05)
+
     estimator = dnn.DNNClassifier(
         hidden_units=(2, 2),
+        # Adagrad is configured with `get_optimizer_instance`, so the function
+        # form of `TowerOptimizer.__init__` is used.
+        optimizer=replicate_model_fn.TowerOptimizer(optimizer_fn),
         feature_columns=feature_columns,
         n_classes=n_classes,
         model_dir=self._model_dir)
 
-    def optimizer_fn():
-      return optimizers.get_optimizer_instance('Adagrad', learning_rate=0.05)
+    if not mode:  # Use the public `replicate_model_fn`.
+      model_fn = replicate_model_fn.replicate_model_fn(
+          estimator.model_fn, devices=['/gpu:0', '/gpu:1', '/gpu:2'])
+    else:
+      model_fn = replicate_model_fn._replicate_model_fn_with_mode(
+          estimator.model_fn,
+          devices=['/gpu:0', '/gpu:1', '/gpu:2'],
+          loss_reduction=losses.Reduction.SUM,
+          mode=mode)
 
     estimator = estimator_lib.Estimator(
-        model_fn=replicate_model_fn.replicate_model_fn(
-            estimator.model_fn,
-            optimizer_fn,
-            devices=['/gpu:0', '/gpu:1', '/gpu:2']),
+        model_fn=model_fn,
         model_dir=estimator.model_dir,
         config=estimator.config,
         params=estimator.params)
@@ -134,6 +163,10 @@ class DNNClassifierIntegrationTest(test_util.TensorFlowTestCase):
                                              serving_input_receiver_fn)
     self.assertTrue(gfile.Exists(export_dir))
 
+    # Nothing should be left in the graph so that it doesn't get serialized.
+    self.assertFalse(ops_lib.get_default_graph().get_collection_ref(
+        replicate_model_fn.TowerOptimizer.COLLECTION_FOR_GRAPH_STATES))
+
   def _as_label(self, data_in_float):
     return np.rint(data_in_float).astype(np.int64)
 
@@ -153,28 +186,24 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
 
     predictions = math_ops.multiply(features, c)
 
-    loss = None
-    if mode is not model_fn_lib.ModeKeys.PREDICT:
-      loss = losses.absolute_difference(
-          labels=labels,
-          predictions=predictions,
-          reduction=losses.Reduction.SUM)
-      loss = math_ops.reduce_sum(loss)
+    loss = losses.absolute_difference(
+        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
+    loss = math_ops.reduce_sum(loss)
 
     metrics = {
         'accuracy': metrics_lib.accuracy(labels, predictions),
         'auc': metrics_lib.auc(labels, predictions)
     }
 
+    optimizer = replicate_model_fn.TowerOptimizer(
+        gradient_descent.GradientDescentOptimizer(params['learning_rate']))
+
     return model_fn_lib.EstimatorSpec(
         mode=mode,
         loss=loss,
         eval_metric_ops=metrics,
         predictions={'probabilities': predictions},
-        train_op=control_flow_ops.no_op())  # This train_op isn't actually used.
-
-  def optimizer_fn(self, params):
-    return gradient_descent.GradientDescentOptimizer(params['learning_rate'])
+        train_op=optimizer.minimize(loss))
 
   @property
   def params(self):
@@ -188,7 +217,9 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
 
     with self.test_session() as session:
       replicated_model_fn = replicate_model_fn.replicate_model_fn(
-          self.model_fn, self.optimizer_fn, devices=['/gpu:0', '/gpu:1'])
+          self.model_fn,
+          loss_reduction=losses.Reduction.SUM,
+          devices=['/gpu:0', '/gpu:1'])
       estimator_spec = replicated_model_fn(
           features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
       session.run(variables.global_variables_initializer())
@@ -197,31 +228,78 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
       total_loss = (1.0 * 10 - 1.0) + (2.0 * 10 - 2.0)
       self.assertEqual(total_loss, session.run(estimator_spec.loss))
 
-      # loss' of c is 3.
+      # derivative of loss = (1*c - 1) + (2*c - 2) is 3.
       # new value of c = 10 - learning rate * 3 = 7.0.
       session.run(estimator_spec.train_op)
       with variable_scope.variable_scope('', reuse=True):
         c = variable_scope.get_variable('c', dtype=dtypes.float64)
         self.assertEqual(7.0, session.run(c))
 
-  def test_train_spec_with_optimizer_without_params(self):
-
-    def optimizer_fn_without_params():
-      return gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
-
+  def test_train_with_mean_reduction(self):
     features = np.array([[1.0], [2.0]])
     labels = np.array([[1.0], [2.0]])
 
-    with self.test_session() as session:  # pylint: disable=unused-variable
+    with self.test_session() as session:
+      # Add another trainable variable that doesn't produce a gradient to
+      # verify that None gradients are supported.
+      _ = variable_scope.get_variable(
+          'another_variable',
+          initializer=constant_op.constant(1, dtype=dtypes.float64),
+          dtype=dtypes.float64)
+
       replicated_model_fn = replicate_model_fn.replicate_model_fn(
-          self.model_fn,
-          optimizer_fn_without_params,
-          devices=['/gpu:0', '/gpu:1'])
-      # This call is going to fail if `replicated_model_fn` is still passing
-      # `params` inside `optimizer_fn`, even though the latter doesn't take any:
+          self.model_fn, losses.Reduction.MEAN, devices=['/gpu:0', '/gpu:1'])
       estimator_spec = replicated_model_fn(
           features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
-      del estimator_spec
+      session.run(variables.global_variables_initializer())
+
+      # loss = feature * c - label
+      total_loss = ((1.0 * 10 - 1.0) + (2.0 * 10 - 2.0)) / 2.0
+      self.assertEqual(total_loss, session.run(estimator_spec.loss))
+
+      # derivative of loss = (1*c - 1)/2 + (2*c - 2)/2 is 1.5.
+      # It's the same computation as without mean reduction, but the
+      # loss from every tower is scaled by 1/<number of towers>.
+      # new value of c = 10 - learning rate * 1.5 = 8.5
+      session.run(estimator_spec.train_op)
+      with variable_scope.variable_scope('', reuse=True):
+        c = variable_scope.get_variable('c', dtype=dtypes.float64)
+        self.assertEqual(8.5, session.run(c))
+
+  def test_train_two_steps_collected_gradients_are_reset_between_steps(self):
+    with ops_lib.Graph().as_default():
+      features = array_ops.placeholder(dtypes.float64)
+      labels = array_ops.placeholder(dtypes.float64)
+
+      feature_inputs = np.array([[1.0], [2.0]]), np.array([[1.5], [2.5]])
+      label_inputs = np.array([[1.0], [2.0]]), np.array([[1.5], [2.5]])
+
+      # loss = feature * c - label
+      expected_losses = ((1.0 * 10 - 1.0) + (2.0 * 10 - 2.0),
+                         (1.5 * 7.0 - 1.5) + (2.5 * 7.0 - 2.5))
+      # Derivative of the loss is 1.0 + 2.0 for the first step and 1.5 + 2.5
+      # for the second.
+      expected_c = 10.0 - 3.0, 7.0 - 4.0
+
+      with self.test_session() as session, variable_scope.variable_scope(
+          '', reuse=variable_scope.AUTO_REUSE):
+        replicated_model_fn = replicate_model_fn.replicate_model_fn(
+            self.model_fn,
+            loss_reduction=losses.Reduction.SUM,
+            devices=['/gpu:0', '/gpu:1'])
+        estimator_spec = replicated_model_fn(
+            features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
+        session.run(variables.global_variables_initializer())
+
+        for feature_input, label_input, loss, weight in zip(
+            feature_inputs, label_inputs, expected_losses, expected_c):
+          feeds = {features: feature_input, labels: label_input}
+
+          self.assertEqual(loss, session.run(estimator_spec.loss, feeds))
+
+          session.run(estimator_spec.train_op, feeds)
+          c = variable_scope.get_variable('c', dtype=dtypes.float64)
+          self.assertEqual(weight, session.run(c, feeds))
 
   def test_eval(self):
     features = np.array([[0.01], [0.002]])
@@ -229,7 +307,9 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
 
     with self.test_session() as session:
       replicated_model_fn = replicate_model_fn.replicate_model_fn(
-          self.model_fn, self.optimizer_fn, devices=['/gpu:0', '/gpu:1'])
+          self.model_fn,
+          loss_reduction=losses.Reduction.SUM,
+          devices=['/gpu:0', '/gpu:1'])
       estimator_spec = replicated_model_fn(
           features, labels, model_fn_lib.ModeKeys.EVAL, self.params)
       session.run(variables.local_variables_initializer())
@@ -252,13 +332,42 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
       self.assertEqual(0, auc)
       self.assertNear(total_loss, session.run(estimator_spec.loss), 0.01)
 
+  def test_eval_with_mean_reduction(self):
+    features = np.array([[0.01], [0.002]])
+    labels = np.array([[0.01], [0.02]])
+
+    with self.test_session() as session:
+      replicated_model_fn = replicate_model_fn.replicate_model_fn(
+          self.model_fn, losses.Reduction.MEAN, devices=['/gpu:0', '/gpu:1'])
+      estimator_spec = replicated_model_fn(
+          features, labels, model_fn_lib.ModeKeys.EVAL, self.params)
+      session.run(variables.local_variables_initializer())
+      session.run(variables.global_variables_initializer())
+
+      accuracy, a = estimator_spec.eval_metric_ops['accuracy']
+      auc, b = estimator_spec.eval_metric_ops['auc']
+
+      session.run([a, b])
+      accuracy = session.run(accuracy)
+      auc = session.run(auc)
+
+      # loss[i] = features[i] * 10 - labels[i].
+      # Accuracy is 0.0 (no match) in the first tower.
+      # Accuracy is 1.0 (match) in the second tower, since the feature
+      # times weight "c" happened to be equal to the label.
+      total_loss = ((0.01 * 10 - 0.01) + (0.002 * 10 - 0.02)) / 2.0
+
+      self.assertNear((0.0 + 1.0) / 2.0, accuracy, 0.01)
+      self.assertEqual(0, auc)
+      self.assertNear(total_loss, session.run(estimator_spec.loss), 0.01)
+
   def test_predict(self):
     features = np.array([[0.01], [0.002]])
     labels = np.array([[0.01], [0.02]])
 
     with self.test_session() as session:
       replicated_model_fn = replicate_model_fn.replicate_model_fn(
-          self.model_fn, self.optimizer_fn, devices=['/gpu:0', '/gpu:1'])
+          self.model_fn, devices=['/gpu:0', '/gpu:1'])
       estimator_spec = replicated_model_fn(
           features, labels, model_fn_lib.ModeKeys.PREDICT, self.params)
       session.run(variables.global_variables_initializer())
@@ -273,7 +382,7 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
 
     with self.test_session() as session:
       replicated_model_fn = replicate_model_fn.replicate_model_fn(
-          self.model_fn, self.optimizer_fn)
+          self.model_fn, devices=['/gpu:0'])
       estimator_spec = replicated_model_fn(
           features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
       session.run(variables.global_variables_initializer())
@@ -295,7 +404,7 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
 
     with self.test_session() as session:
       replicated_model_fn = replicate_model_fn.replicate_model_fn(
-          self.model_fn, self.optimizer_fn, devices=['/gpu:0'])
+          self.model_fn, devices=['/gpu:0'])
       estimator_spec = replicated_model_fn(
           features, labels, model_fn_lib.ModeKeys.EVAL, self.params)
       session.run(variables.local_variables_initializer())
@@ -323,7 +432,7 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
 
     with self.test_session() as session:
       replicated_model_fn = replicate_model_fn.replicate_model_fn(
-          self.model_fn, self.optimizer_fn, devices=['/gpu:0'])
+          self.model_fn, devices=['/gpu:0'])
       estimator_spec = replicated_model_fn(
           features, labels, model_fn_lib.ModeKeys.PREDICT, self.params)
       session.run(variables.global_variables_initializer())
@@ -332,6 +441,451 @@ class ReplicateModelTest(test_util.TensorFlowTestCase):
           'probabilities': np.array([[0.1], [0.02]])
       }, session.run(estimator_spec.predictions))
 
+  def test_batch_size_that_is_not_divisible_by_the_number_of_gpus(self):
+    features = np.array([[1.0], [2.0], [3.0]])
+    labels = np.array([[1.0], [2.0], [3.0]])
+
+    with self.assertRaisesRegexp(
+        ValueError, '.*Batch.+size.+needs.+to.+be.+divisible.+by.+GPUs.+'):
+      replicated_model_fn = replicate_model_fn.replicate_model_fn(
+          self.model_fn, devices=['/gpu:0', '/gpu:1'])
+      _ = replicated_model_fn(
+          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
+
+  def test_unsupported_loss_reduction(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 '.+none.+reduction.+is.+specified.+'):
+      _ = replicate_model_fn.replicate_model_fn(self.model_fn,
+                                                losses.Reduction.NONE)
+
+  def test_places_on_gpu_with_upper_case_spelling(self):
+    features = np.array([[0.01], [0.002]])
+    labels = np.array([[0.01], [0.02]])
+
+    with self.test_session():
+      replicated_model_fn = replicate_model_fn.replicate_model_fn(
+          self.model_fn, devices=['/GPU:0'])
+      _ = replicated_model_fn(
+          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
+
+      with variable_scope.variable_scope('', reuse=True):
+        c = variable_scope.get_variable('c', dtype=dtypes.float64)
+        self.assertEqual('/device:GPU:0', c.device)
+
+  def test_places_on_gpu_with_lower_case_spelling(self):
+    features = np.array([[0.01], [0.002]])
+    labels = np.array([[0.01], [0.02]])
+
+    with self.test_session():
+      replicated_model_fn = replicate_model_fn.replicate_model_fn(
+          self.model_fn, devices=['/gpu:0'])
+      _ = replicated_model_fn(
+          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
+
+      with variable_scope.variable_scope('', reuse=True):
+        c = variable_scope.get_variable('c', dtype=dtypes.float64)
+        self.assertEqual('/device:GPU:0', c.device)
+
+
+class ReplicateAcrossASingleDeviceWithoutTowerOptimizer(
+    test_util.TensorFlowTestCase):
+
+  def model_fn(self, mode, features, labels, params):
+    c = variable_scope.get_variable(
+        'c',
+        initializer=constant_op.constant(10, dtype=dtypes.float64),
+        dtype=dtypes.float64)
+
+    predictions = math_ops.multiply(features, c)
+
+    loss = losses.absolute_difference(
+        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
+    loss = math_ops.reduce_sum(loss)
+
+    metrics = {
+        'accuracy': metrics_lib.accuracy(labels, predictions),
+        'auc': metrics_lib.auc(labels, predictions)
+    }
+
+    optimizer = gradient_descent.GradientDescentOptimizer(
+        params['learning_rate'])
+
+    return model_fn_lib.EstimatorSpec(
+        mode=mode,
+        loss=loss,
+        eval_metric_ops=metrics,
+        predictions={'probabilities': predictions},
+        train_op=optimizer.minimize(loss))
+
+  @property
+  def params(self):
+    params = {}
+    params['learning_rate'] = 1.0
+    return params
+
+  def test_train_single_tower(self):
+    features = np.array([[1.0], [2.0]])
+    labels = np.array([[1.0], [2.0]])
+
+    with self.test_session() as session:
+      replicated_model_fn = replicate_model_fn.replicate_model_fn(
+          self.model_fn, devices=['/gpu:0'])
+      estimator_spec = replicated_model_fn(
+          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
+      session.run(variables.global_variables_initializer())
+
+      # loss = feature * c - label
+      total_loss = (1.0 * 10 - 1.0) + (2.0 * 10 - 2.0)
+      self.assertEqual(total_loss, session.run(estimator_spec.loss))
+
+      # loss' of c is 3.
+      # new value of c = 10 - learning rate * 3 = 7.0.
+      session.run(estimator_spec.train_op)
+      with variable_scope.variable_scope('', reuse=True):
+        c = variable_scope.get_variable('c', dtype=dtypes.float64)
+        self.assertEqual(7.0, session.run(c))
+
+
+class UseTowerEstimatorWithoutReplication(test_util.TensorFlowTestCase):
+
+  def model_fn(self, mode, features, labels, params):
+    c = variable_scope.get_variable(
+        'c',
+        initializer=constant_op.constant(10, dtype=dtypes.float64),
+        dtype=dtypes.float64)
+
+    features = features['features']
+    predictions = math_ops.multiply(features, c)
+
+    loss = losses.absolute_difference(
+        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
+    loss = math_ops.reduce_sum(loss)
+
+    metrics = {
+        'accuracy': metrics_lib.accuracy(labels, predictions),
+        'auc': metrics_lib.auc(labels, predictions)
+    }
+
+    optimizer = replicate_model_fn.TowerOptimizer(
+        gradient_descent.GradientDescentOptimizer(params['learning_rate']))
+
+    return model_fn_lib.EstimatorSpec(
+        mode=mode,
+        loss=loss,
+        eval_metric_ops=metrics,
+        predictions={'probabilities': predictions},
+        train_op=optimizer.minimize(loss))
+
+  @property
+  def params(self):
+    params = {}
+    params['learning_rate'] = 1.0
+    return params
+
+  def test_train_single_tower(self):
+    features = np.array([[1.0], [2.0]])
+    labels = np.array([[1.0], [2.0]])
+
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'features': features}, y=labels, batch_size=2, shuffle=False)
+
+    with self.test_session():
+      estimator = estimator_lib.Estimator(
+          model_fn=self.model_fn,
+          model_dir=tempfile.mkdtemp(),
+          params=self.params)
+      estimator.train(train_input_fn, steps=1)
+
+      self.assertEqual(7.0, estimator.get_variable_value('c'))
+
+
+class MakeSureSyncReplicasOptimizerWorks(test_util.TensorFlowTestCase):
+
+  def model_fn(self, mode, features, labels, params):
+    c = variable_scope.get_variable(
+        'c',
+        initializer=constant_op.constant(10, dtype=dtypes.float64),
+        dtype=dtypes.float64)
+
+    features = features['features']
+    predictions = math_ops.multiply(features, c)
+
+    loss = losses.absolute_difference(
+        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
+    loss = math_ops.reduce_sum(loss)
+
+    metrics = {
+        'accuracy': metrics_lib.accuracy(labels, predictions),
+        'auc': metrics_lib.auc(labels, predictions)
+    }
+
+    optimizer = gradient_descent.GradientDescentOptimizer(
+        params['learning_rate'])
+    optimizer = training.SyncReplicasOptimizer(
+        optimizer, replicas_to_aggregate=1)
+    sync_hook = optimizer.make_session_run_hook(True)
+    optimizer = replicate_model_fn.TowerOptimizer(optimizer)
+
+    return model_fn_lib.EstimatorSpec(
+        mode=mode,
+        loss=loss,
+        eval_metric_ops=metrics,
+        training_hooks=[sync_hook],
+        predictions={'probabilities': predictions},
+        train_op=optimizer.minimize(
+            loss, global_step=training.get_global_step()))
+
+  @property
+  def params(self):
+    params = {}
+    params['learning_rate'] = 1.0
+    return params
+
+  def test_train_multiple_towers(self):
+    features = np.array([[1.0], [2.0]])
+    labels = np.array([[1.0], [2.0]])
+
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'features': features}, y=labels, batch_size=2, shuffle=False)
+
+    model_fn = replicate_model_fn.replicate_model_fn(
+        self.model_fn,
+        loss_reduction=losses.Reduction.SUM,
+        devices=['/gpu:0', '/gpu:1'])
+
+    estimator = estimator_lib.Estimator(
+        model_fn=model_fn, model_dir=tempfile.mkdtemp(), params=self.params)
+    estimator.train(train_input_fn, steps=1)
+
+    self.assertEqual(7.0, estimator.get_variable_value('c'))
+
+
+class ReplicateWithTwoOptimizersTest(test_util.TensorFlowTestCase):
+
+  def model_fn(self, mode, features, labels, params):
+    c = variable_scope.get_variable(
+        'c',
+        initializer=constant_op.constant(10, dtype=dtypes.float64),
+        dtype=dtypes.float64)
+
+    side_effects = variable_scope.get_variable(
+        'side_effects',
+        initializer=constant_op.constant(0, dtype=dtypes.float64),
+        dtype=dtypes.float64,
+        trainable=False)
+
+    predictions = math_ops.multiply(features, c)
+
+    loss = losses.absolute_difference(
+        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
+    loss = math_ops.reduce_sum(loss)
+
+    metrics = {
+        'accuracy': metrics_lib.accuracy(labels, predictions),
+        'auc': metrics_lib.auc(labels, predictions)
+    }
+
+    first_optimizer = replicate_model_fn.TowerOptimizer(
+        gradient_descent.GradientDescentOptimizer(1.0))
+    second_optimizer = replicate_model_fn.TowerOptimizer(
+        adam.AdamOptimizer(1.0))
+
+    with ops_lib.control_dependencies([side_effects.assign_add(1.0)]):
+      first_grads_and_vars = first_optimizer.compute_gradients(loss)
+
+    train_op = control_flow_ops.group(
+        [first_optimizer.apply_gradients(first_grads_and_vars),
+         second_optimizer.minimize(loss)])
+
+    return model_fn_lib.EstimatorSpec(
+        mode=mode,
+        loss=loss,
+        eval_metric_ops=metrics,
+        predictions={'probabilities': predictions},
+        train_op=train_op)
+
+  def test_train(self):
+    features = np.array([[1.0], [2.0]])
+    labels = np.array([[1.0], [2.0]])
+
+    with self.test_session() as session:
+      replicated_model_fn = replicate_model_fn.replicate_model_fn(
+          self.model_fn,
+          loss_reduction=losses.Reduction.SUM,
+          devices=['/gpu:0', '/gpu:1'])
+      estimator_spec = replicated_model_fn(features, labels,
+                                           model_fn_lib.ModeKeys.TRAIN, {})
+      session.run(variables.global_variables_initializer())
+
+      # loss = feature * c - label
+      total_loss = (1.0 * 10 - 1.0) + (2.0 * 10 - 2.0)
+      self.assertEqual(total_loss, session.run(estimator_spec.loss))
+
+      # loss' of c is 3.
+      # new value of c = 10 - learning rate * 3 = 7.0.
+      # Adam subtracts another ~1.
+      session.run(estimator_spec.train_op)
+      with variable_scope.variable_scope('', reuse=True):
+        c = variable_scope.get_variable('c', dtype=dtypes.float64)
+        self.assertNear(6.0, session.run(c), 0.000001)
+
+        side_effects = variable_scope.get_variable(
+            'side_effects', dtype=dtypes.float64)
+        self.assertNear(2.0, session.run(side_effects), 0.000001)
+
+
+class ReplicateWithTwoLossesAndOneOptimizer(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    self._should_skip_optimizer = False
+    self._towers_left_before_skipping_optimizer = -1
+
+  def incorrectly_skip_optimizer_for_tower(self, tower_number):
+    self._should_skip_optimizer = True
+    self._towers_left_before_skipping_optimizer = tower_number
+
+  def should_skip_optimizer(self):
+    if not self._should_skip_optimizer:
+      return False
+    if self._towers_left_before_skipping_optimizer == 0:
+      return True
+    else:
+      self._towers_left_before_skipping_optimizer -= 1
+      return False
+
+  def model_fn(self, mode, features, labels, params):
+    c = variable_scope.get_variable(
+        'c',
+        initializer=constant_op.constant(10, dtype=dtypes.float64),
+        dtype=dtypes.float64)
+    d = variable_scope.get_variable(
+        'd',
+        initializer=constant_op.constant(2, dtype=dtypes.float64),
+        dtype=dtypes.float64)
+
+    predictions = math_ops.multiply(features, c)
+
+    loss = losses.absolute_difference(
+        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
+    loss = math_ops.reduce_sum(loss)
+
+    another_predictions = math_ops.multiply(features, d)
+    another_loss = losses.absolute_difference(
+        labels=labels,
+        predictions=another_predictions,
+        reduction=losses.Reduction.SUM)
+    another_loss = math_ops.reduce_sum(another_loss)
+
+    total_loss = math_ops.add(loss, another_loss)
+
+    metrics = {
+        'accuracy': metrics_lib.accuracy(labels, predictions),
+        'auc': metrics_lib.auc(labels, predictions)
+    }
+
+    train_ops = []
+
+    optimizer = replicate_model_fn.TowerOptimizer(
+        gradient_descent.GradientDescentOptimizer(1.0))
+    train_ops.append(optimizer.minimize(loss, var_list=[c]))
+    if not self.should_skip_optimizer():
+      another_optimizer = replicate_model_fn.TowerOptimizer(
+          gradient_descent.GradientDescentOptimizer(1.0))
+      train_ops.append(another_optimizer.minimize(another_loss, var_list=[d]))
+
+    train_op = control_flow_ops.group(train_ops)
+    return model_fn_lib.EstimatorSpec(
+        mode=mode,
+        loss=total_loss,
+        eval_metric_ops=metrics,
+        predictions={'probabilities': predictions},
+        train_op=train_op)
+
+  def test_train(self):
+    features = np.array([[1.0], [2.0]])
+    labels = np.array([[1.0], [2.0]])
+
+    with self.test_session() as session:
+      replicated_model_fn = replicate_model_fn.replicate_model_fn(
+          self.model_fn,
+          loss_reduction=losses.Reduction.SUM,
+          devices=['/gpu:0', '/gpu:1'])
+      estimator_spec = replicated_model_fn(features, labels,
+                                           model_fn_lib.ModeKeys.TRAIN, {})
+      session.run(variables.global_variables_initializer())
+
+      # For each tower, loss = (feature * c - label) + (feature * d - label).
+      total_loss = (1.0 * 10 - 1.0 + 1.0 * 2.0 - 1.0) + (
+          2.0 * 10 - 2.0 + 2.0 * 2.0 - 2.0)
+      self.assertEqual(total_loss, session.run(estimator_spec.loss))
+
+      session.run(estimator_spec.train_op)
+
+      # loss' of c or loss' of d is 3.
+      # new value of c = 10 - learning rate * 3 = 7.0.
+      # new value of d = 2  - learning rate * 3 = -1.0.
+      with variable_scope.variable_scope('', reuse=True):
+        c = variable_scope.get_variable('c', dtype=dtypes.float64)
+        self.assertNear(7.0, session.run(c), 0.000001)
+        d = variable_scope.get_variable('d', dtype=dtypes.float64)
+        self.assertNear(-1.0, session.run(d), 0.000001)
+
+  def test_different_optimizer_calls_within_towers(self):
+    self.incorrectly_skip_optimizer_for_tower(1)
+
+    features = np.array([[1.0], [2.0]])
+    labels = np.array([[1.0], [2.0]])
+
+    with self.test_session(), ops_lib.Graph().as_default():
+      with self.assertRaisesRegexp(
+          ValueError, '.+was.+supposed.+to.+make.+same.+optimizer.+calls.+'):
+        replicated_model_fn = replicate_model_fn.replicate_model_fn(
+            self.model_fn, devices=['/gpu:0', '/gpu:1'])
+        _ = replicated_model_fn(features, labels, model_fn_lib.ModeKeys.TRAIN,
+                                {})
+
+
+class FailToWrapOptimizerInTheModelFn(test_util.TensorFlowTestCase):
+
+  def model_fn(self, mode, features, labels, params):
+    c = variable_scope.get_variable(
+        'c',
+        initializer=constant_op.constant(10, dtype=dtypes.float64),
+        dtype=dtypes.float64)
+
+    predictions = math_ops.multiply(features, c)
+
+    loss = losses.absolute_difference(
+        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
+    loss = math_ops.reduce_sum(loss)
+
+    metrics = {
+        'accuracy': metrics_lib.accuracy(labels, predictions),
+        'auc': metrics_lib.auc(labels, predictions)
+    }
+
+    optimizer = gradient_descent.GradientDescentOptimizer(1.0)
+    train_op = optimizer.minimize(loss)
+
+    return model_fn_lib.EstimatorSpec(
+        mode=mode,
+        loss=loss,
+        eval_metric_ops=metrics,
+        predictions={'probabilities': predictions},
+        train_op=train_op)
+
+  def test_train(self):
+    features = np.array([[1.0], [2.0]])
+    labels = np.array([[1.0], [2.0]])
+
+    with self.test_session():
+      with self.assertRaisesRegexp(ValueError,
+                                   'Please.+wrap.+with.+TowerOptimizer'):
+        replicated_model_fn = replicate_model_fn.replicate_model_fn(
+            self.model_fn, devices=['/gpu:0', '/gpu:1'])
+        _ = replicated_model_fn(features, labels, model_fn_lib.ModeKeys.TRAIN,
+                                {})
+
 
 class GetLossTowersTest(test_util.TensorFlowTestCase):
 
@@ -358,8 +912,9 @@ class GetLossTowersTest(test_util.TensorFlowTestCase):
           labels=[[0.6], [0.6]],
           params=None,
           config=None,
+          loss_reduction=losses.Reduction.SUM,
           devices=['/gpu:0', '/gpu:1'],
-          local_ps_device='/gpu:0',
+          local_ps_devices=['/gpu:0'],
           name_scope_pattern='test_tower_{}')
       session.run(variables.global_variables_initializer())
 
@@ -382,6 +937,89 @@ class GetLossTowersTest(test_util.TensorFlowTestCase):
         c = variable_scope.get_variable('c', dtype=dtypes.float64)
         self.assertEqual(0.25, session.run(c))
 
+  def test_gradients_are_computed_with_mean_reduction(self):
+    with self.test_session() as session:
+      tower_specs = replicate_model_fn._get_loss_towers(
+          self.model_fn,
+          mode=model_fn_lib.ModeKeys.EVAL,
+          features=[[0.6], [1.6]],
+          labels=[[0.6], [0.6]],
+          params=None,
+          loss_reduction=losses.Reduction.MEAN,
+          config=None,
+          devices=['/gpu:0', '/gpu:1'],
+          local_ps_devices=['/gpu:0'],
+          name_scope_pattern='test_tower_{}')
+      session.run(variables.global_variables_initializer())
+
+      self.assertEqual(len(tower_specs), 2)
+
+      self.assertEqual('/device:GPU:0', tower_specs[0].loss.device)
+      self.assertEqual('averaged_loss:0', tower_specs[0].loss.name)
+      self.assertEqual(0.5, session.run(tower_specs[0].loss))
+
+      self.assertEqual('/device:GPU:1', tower_specs[1].loss.device)
+      self.assertEqual('test_tower_1/averaged_loss:0', tower_specs[1].loss.name)
+      # The input batch for the second tower had a loss that is 1.0
+      # bigger: 0.6 vs 1.6.
+      self.assertEqual(1.0, session.run(tower_specs[1].loss))
+
+      self.assertEqual(1, len(variables.global_variables()))
+      self.assertEqual(1, len(variables.trainable_variables()))
+
+      with variable_scope.variable_scope('', reuse=True):
+        c = variable_scope.get_variable('c', dtype=dtypes.float64)
+        self.assertEqual(0.25, session.run(c))
+
+  def test_variables_are_round_robined_correctly(self):
+    """Test that creates multiple variables and tests round-robin placement."""
+
+    def model_fn(mode, features, labels, params):
+      del params
+      for variable_name in ['a', 'b', 'c', 'd']:
+        c = variable_scope.get_variable(
+            variable_name,
+            initializer=constant_op.constant(0.25, dtype=dtypes.float64),
+            dtype=dtypes.float64)
+
+      predictions = math_ops.add(np.array([0.1, 0.2, 0.3, features[0]]), c)
+      labels = np.array([0.1, 0.2, 0.3, labels[0]])
+      loss = losses.absolute_difference(
+          labels=labels,
+          predictions=predictions,
+          reduction=losses.Reduction.SUM)
+      return model_fn_lib.EstimatorSpec(
+          mode=mode, loss=math_ops.reduce_sum(loss))
+
+    with self.test_session() as session:
+      tower_specs = replicate_model_fn._get_loss_towers(
+          model_fn,
+          mode=None,
+          features=[[0.6], [1.6], [2.6]],
+          labels=[[0.6], [0.6], [2.6]],
+          params=None,
+          loss_reduction=losses.Reduction.SUM,
+          config=None,
+          devices=['/gpu:0', '/gpu:1', '/gpu:3'],
+          local_ps_devices=['/gpu:0', '/gpu:1', '/gpu:3'],
+          name_scope_pattern='test_tower_{}')
+      session.run(variables.global_variables_initializer())
+
+      self.assertEqual(len(tower_specs), 3)
+      self.assertEqual('/device:GPU:0', tower_specs[0].loss.device)
+      self.assertEqual('/device:GPU:1', tower_specs[1].loss.device)
+      self.assertEqual('/device:GPU:3', tower_specs[2].loss.device)
+
+      with variable_scope.variable_scope('', reuse=True):
+        a = variable_scope.get_variable('a', dtype=dtypes.float64)
+        self.assertEqual('/device:GPU:0', a.device)
+        b = variable_scope.get_variable('b', dtype=dtypes.float64)
+        self.assertEqual('/device:GPU:1', b.device)
+        c = variable_scope.get_variable('c', dtype=dtypes.float64)
+        self.assertEqual('/device:GPU:3', c.device)
+        d = variable_scope.get_variable('d', dtype=dtypes.float64)
+        self.assertEqual('/device:GPU:0', d.device)
+
 
 class SplitBatchTest(test_util.TensorFlowTestCase):
 
@@ -390,8 +1028,13 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
     return list(map(evaluate_items, first_list)), list(
         map(evaluate_items, second_list))
 
+  def assertSparseValuesEqual(self, a, b):
+    self.assertAllEqual(a.indices, b.indices)
+    self.assertAllEqual(a.values, b.values)
+    self.assertAllEqual(a.dense_shape, b.dense_shape)
+
   def test_simple_half_split(self):
-    with self.test_session() as session:  # pylint: disable=unused-variable
+    with self.test_session():
       features = [0.0, 1.0, 2.0, 3.0]
       labels = [10.0, 11.0, 12.0, 13.0]
       feature_shards, label_shards = replicate_model_fn._split_batch(
@@ -404,7 +1047,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[10.0, 11.0], [12.0, 13.0]], label_shards)
 
   def test_to_each_their_own(self):
-    with self.test_session() as session:  # pylint: disable=unused-variable
+    with self.test_session():
       features = [0.0, 1.0, 2.0, 3.0]
       labels = [10.0, 11.0, 12.0, 13.0]
       feature_shards, label_shards = replicate_model_fn._split_batch(
@@ -417,7 +1060,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[10.0], [11.0], [12.0], [13.0]], label_shards)
 
   def test_one_batch(self):
-    with self.test_session() as session:  # pylint: disable=unused-variable
+    with self.test_session():
       features = [0.0, 1.0, 2.0, 3.0]
       labels = [10.0, 11.0, 12.0, 13.0]
       feature_shards, label_shards = replicate_model_fn._split_batch(
@@ -430,7 +1073,7 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[10.0, 11.0, 12.0, 13.0]], label_shards)
 
   def test_half_split_in_dictionary(self):
-    with self.test_session() as session:  # pylint: disable=unused-variable
+    with self.test_session():
       features = {'first': [0.0, 1.0, 2.0, 3.0], 'second': [4.0, 5.0, 6.0, 7.0]}
       labels = [10.0, 11.0, 12.0, 13.0]
 
@@ -444,6 +1087,58 @@ class SplitBatchTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([10.0, 11.0], label_shards[0].eval())
       self.assertAllEqual([12.0, 13.0], label_shards[1].eval())
 
+  def test_sparse_tensor_can_be_split_unevenly(self):
+    with self.test_session():
+      features = {
+          'x':
+              sparse_tensor.SparseTensor(
+                  indices=[[0, 0], [1, 2], [2, 2]],
+                  values=[1.0, 2.0, 3.0],
+                  dense_shape=[3, 4])
+      }
+      labels = np.array([[1.0], [2.0]])
+
+      feature_shards, label_shards = replicate_model_fn._split_batch(
+          features, labels, 2, device='/gpu:0')
+
+      self.assertSparseValuesEqual(
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [1, 2]], values=[1., 2.], dense_shape=[2, 4]),
+          feature_shards[0]['x'].eval())
+      self.assertSparseValuesEqual(
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 2]], values=[3.], dense_shape=[1, 4]),
+          feature_shards[1]['x'].eval())
+      self.assertAllEqual([[1.0]], label_shards[0].eval())
+      self.assertAllEqual([[2.0]], label_shards[1].eval())
+
+  def test_sparse_tensor_can_be_split_unevenly_repeated_row(self):
+    with self.test_session():
+      features = {
+          'x':
+              sparse_tensor.SparseTensor(
+                  indices=[[0, 0], [1, 0], [1, 1]],
+                  values=[1.0, 2.0, 3.0],
+                  dense_shape=[3, 4])
+      }
+      labels = np.array([[1.0], [2.0]])
+
+      feature_shards, label_shards = replicate_model_fn._split_batch(
+          features, labels, 2, device='/gpu:0')
+
+      self.assertSparseValuesEqual(
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [1, 0], [1, 1]],
+              values=[1., 2., 3.],
+              dense_shape=[2, 4]), feature_shards[0]['x'].eval())
+
+      second_batch = feature_shards[1]['x'].eval()
+      self.assertFalse(len(second_batch.indices))
+      self.assertFalse(len(second_batch.values))
+      self.assertAllEqual([1, 4], second_batch.dense_shape)
+      self.assertAllEqual([[1.0]], label_shards[0].eval())
+      self.assertAllEqual([[2.0]], label_shards[1].eval())
+
   def test_one_batch_in_dictionary(self):
     with self.test_session() as session:  # pylint: disable=unused-variable
       features = {'first': [0.0, 1.0, 2.0, 3.0], 'second': [4.0, 5.0, 6.0, 7.0]}
@@ -600,11 +1295,12 @@ class PredictSpecTest(test_util.TensorFlowTestCase):
           self.model_fn,
           mode=None,
           features=[[0.1], [0.2]],
+          loss_reduction=losses.Reduction.SUM,
           labels=[[], []],
           params=None,
           config=None,
           devices=['/gpu:0', '/gpu:1'],
-          local_ps_device='/gpu:0',
+          local_ps_devices=['/gpu:0'],
       )
       session.run(variables.global_variables_initializer())
 
@@ -718,16 +1414,14 @@ class ReduceMetricVariablesTest(test_util.TensorFlowTestCase):
           variables.variables_initializer(
               ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES)))
 
-      with self.assertRaisesRegexp(ValueError, ''):
+      with self.assertRaisesRegexp(
+          ValueError, '.+Expected.+local.+variables.+but.+got.+instead.+'):
         session.run(
             replicate_model_fn._reduce_metric_variables(number_of_towers=3))
 
 
 class MergeExportOutputsTest(test_util.TensorFlowTestCase):
 
-  def optimizer_fn(self):
-    return gradient_descent.GradientDescentOptimizer(1.0)
-
   def model_fn(self, mode, features, labels, params):
     c = variable_scope.get_variable(
         'c',
@@ -769,7 +1463,6 @@ class MergeExportOutputsTest(test_util.TensorFlowTestCase):
         loss=math_ops.reduce_sum(loss),
         eval_metric_ops=metrics,
         predictions=predictions,
-        train_op=loss,  # This train_op isn't actually used.
         export_outputs=export_outputs)
 
   def replicate_estimator_spec(self, session):
@@ -777,13 +1470,13 @@ class MergeExportOutputsTest(test_util.TensorFlowTestCase):
     labels = np.array([0.01, 0.02])
 
     replicated_model_fn = replicate_model_fn.replicate_model_fn(
-        self.model_fn, self.optimizer_fn, devices=['/gpu:0', '/gpu:1'])
+        self.model_fn, devices=['/gpu:0', '/gpu:1'])
     estimator_spec = replicated_model_fn(features, labels,
                                          model_fn_lib.ModeKeys.PREDICT, {})
     session.run(variables.global_variables_initializer())
     return estimator_spec
 
-  def test_merde_predict_output(self):
+  def test_merge_predict_output(self):
     with self.test_session() as session:
       estimator_spec = self.replicate_estimator_spec(session)
       self.assertAllClose(
@@ -850,25 +1543,66 @@ class GetLocalDevicesTest(test_util.TensorFlowTestCase):
 class LocalDeviceSetterTest(test_util.TensorFlowTestCase):
 
   def test_vars_are_on_ps_but_ops_are_on_workers(self):
+    ps_devices = ['/device:GPU:3']
+    round_robin = device_setter._RoundRobinStrategy(num_tasks=len(ps_devices))
+
     local_device_setter = replicate_model_fn._local_device_setter(
-        ps_device='/device:GPU:3', worker_device='/device:GPU:2')
+        ps_devices=ps_devices,
+        ps_strategy=round_robin,
+        worker_device='/device:GPU:2')
 
     with ops_lib.device(local_device_setter):
-      c = variables.Variable(0.01)
+      a = variables.Variable(0.01)
+      self.assertEqual('/device:GPU:3', a.device)
+
+      b = variables.Variable(0.02)
+      self.assertEqual('/device:GPU:3', b.device)
+
+      c = variables.Variable(0.03)
       self.assertEqual('/device:GPU:3', c.device)
 
-      cc = variables.Variable(0.02)
-      self.assertEqual('/device:GPU:3', cc.device)
+      a_op = array_ops.concat(a, axis=0)
+      self.assertEqual('/device:GPU:2', a_op.device)
 
-      ccc = variables.Variable(0.03)
-      self.assertEqual('/device:GPU:3', ccc.device)
+      b_op = array_ops.concat(b, axis=0)
+      self.assertEqual('/device:GPU:2', b_op.device)
+
+  def test_round_robin_placement(self):
+    ps_devices = [
+        '/device:GPU:0', '/device:GPU:1', '/device:GPU:3', '/device:GPU:4'
+    ]
+    round_robin = device_setter._RoundRobinStrategy(num_tasks=len(ps_devices))
+
+    local_device_setter = replicate_model_fn._local_device_setter(
+        ps_devices=ps_devices,
+        ps_strategy=round_robin,
+        worker_device='/device:GPU:2')
+
+    with ops_lib.device(local_device_setter):
+      a = variables.Variable(0.01)
+      self.assertEqual('/device:GPU:0', a.device)
+
+      b = variables.Variable(0.02)
+      self.assertEqual('/device:GPU:1', b.device)
+
+      c = variables.Variable(0.03)
+      self.assertEqual('/device:GPU:3', c.device)
+
+      a_op = array_ops.concat(a, axis=0)
+      self.assertEqual('/device:GPU:2', a_op.device)
+
+      b_op = array_ops.concat(b, axis=0)
+      self.assertEqual('/device:GPU:2', b_op.device)
+
+      c = variables.Variable(0.03)
+      self.assertEqual('/device:GPU:4', c.device)
+
+      d = variables.Variable(0.03)
+      self.assertEqual('/device:GPU:0', d.device)
 
       c_op = array_ops.concat(c, axis=0)
       self.assertEqual('/device:GPU:2', c_op.device)
 
-      cc_op = array_ops.concat(cc, axis=0)
-      self.assertEqual('/device:GPU:2', cc_op.device)
-
 
 class ComputeSumWithDevicePlacementTest(test_util.TensorFlowTestCase):
 
@@ -939,7 +1673,7 @@ class ComputeSumWithDevicePlacementTest(test_util.TensorFlowTestCase):
         dense_shape=constant_op.constant([2]))
     b = ops_lib.IndexedSlices(constant_op.constant([3.0, 4.0]), [0, 1])
 
-    with self.assertRaisesRegexp(ValueError, ''):
+    with self.assertRaisesRegexp(ValueError, '.+name.+not.+expected.+'):
       _ = replicate_model_fn._compute_sum_on_device(
           [a, b], device='/device:GPU:0', name='cant_name_indexslices')
 
diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD
index fe86a20ab1f69a0eaf9d7486142451dac6337274..180f1b68f3b56113dfbbfc100bd04efc3bb8b31f 100644
--- a/tensorflow/contrib/factorization/BUILD
+++ b/tensorflow/contrib/factorization/BUILD
@@ -221,6 +221,7 @@ py_test(
     name = "kmeans_test",
     size = "medium",
     srcs = ["python/ops/kmeans_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     tags = ["notsan"],  # b/67512932
     deps = [
diff --git a/tensorflow/contrib/factorization/examples/BUILD b/tensorflow/contrib/factorization/examples/BUILD
index 363baa121ab3854a802ca3606e35597d31b35a57..bbe842bd5ccc7357805adda1df42ba8799fcd8f2 100644
--- a/tensorflow/contrib/factorization/examples/BUILD
+++ b/tensorflow/contrib/factorization/examples/BUILD
@@ -21,3 +21,14 @@ tf_py_test(
     ],
     tags = ["notsan"],
 )
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+)
diff --git a/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc b/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc
index 31d08bfb65ea49e1378ffba480771d38ce16abec..a8c5d0763c28ba2b54f217405f0da65533f26b91 100644
--- a/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc
+++ b/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc
@@ -57,11 +57,11 @@ typedef Eigen::Map<
 
 class MaskedMatmulOp : public OpKernel {
  public:
-  explicit MaskedMatmulOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->MatchSignature(
-        {DT_FLOAT, DT_FLOAT, DT_INT64, DT_BOOL, DT_BOOL},
-        {DT_FLOAT}));
+  explicit MaskedMatmulOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(
+        context,
+        context->MatchSignature(
+            {DT_FLOAT, DT_FLOAT, DT_INT64, DT_BOOL, DT_BOOL}, {DT_FLOAT}));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -110,12 +110,11 @@ class MaskedMatmulOp : public OpKernel {
                                       num_nonzero_elements, 2);
 
     Tensor* prod_values_tensor;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(
-                       0, TensorShape({num_nonzero_elements}),
-                       &prod_values_tensor));
-    EigenMatFloatMap prod_values(prod_values_tensor->vec<float>().data(),
-                                 1, num_nonzero_elements);
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                0, TensorShape({num_nonzero_elements}),
+                                &prod_values_tensor));
+    EigenMatFloatMap prod_values(prod_values_tensor->vec<float>().data(), 1,
+                                 num_nonzero_elements);
 
     auto get_a_index = [&indices_mat, &a_dim_0](int64 i) {
       int64 a_index = internal::SubtleMustCopy(indices_mat(i, 0));
@@ -182,8 +181,8 @@ class MaskedMatmulOp : public OpKernel {
       }
     };
     // Shard the work.
-    worker_threads.workers->ParallelFor(
-        num_nonzero_elements, cost_per_unit, work);
+    worker_threads.workers->ParallelFor(num_nonzero_elements, cost_per_unit,
+                                        work);
   }
 };
 REGISTER_KERNEL_BUILDER(Name("MaskedMatmul").Device(DEVICE_CPU),
diff --git a/tensorflow/contrib/factorization/python/ops/clustering_ops.py b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
index 96cc80ce241347ebca5b68140f1b1c8b9898ae72..23137e0a973c0bdd2cdbd97159f7fd310178bf54 100644
--- a/tensorflow/contrib/factorization/python/ops/clustering_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
@@ -192,11 +192,11 @@ class KMeans(object):
         # Computes Euclidean distance. Note the first and third terms are
         # broadcast additions.
         squared_distance = (
-            math_ops.reduce_sum(math_ops.square(inp), 1, keep_dims=True) -
+            math_ops.reduce_sum(math_ops.square(inp), 1, keepdims=True) -
             2 * math_ops.matmul(inp, clusters, transpose_b=True) +
             array_ops.transpose(
                 math_ops.reduce_sum(
-                    math_ops.square(clusters), 1, keep_dims=True)))
+                    math_ops.square(clusters), 1, keepdims=True)))
         output.append(squared_distance)
 
     return output
@@ -261,8 +261,8 @@ class KMeans(object):
             inp, clusters, 1)
         if self._distance_metric == COSINE_DISTANCE:
           distances *= 0.5
-        output.append((score, array_ops.squeeze(distances),
-                       array_ops.squeeze(indices)))
+        output.append((score, array_ops.squeeze(distances, [-1]),
+                       array_ops.squeeze(indices, [-1])))
     return zip(*output)
 
   def _clusters_l2_normalized(self):
diff --git a/tensorflow/contrib/factorization/python/ops/gmm.py b/tensorflow/contrib/factorization/python/ops/gmm.py
index 0d67e09f8151b48c97094b6b48f26e63443707ef..b2dfe48b2dbe0ec0975f865bba95a7ceba0f590c 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm.py
@@ -24,17 +24,16 @@ import numpy as np
 from tensorflow.contrib import framework
 from tensorflow.contrib.factorization.python.ops import gmm_ops
 from tensorflow.contrib.framework.python.framework import checkpoint_utils
-from tensorflow.contrib.framework.python.ops import variables
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import logging_ops as logging
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops.control_flow_ops import with_dependencies
 from tensorflow.python.training import session_run_hook
+from tensorflow.python.training import training_util
 
 
 def _streaming_sum(scalar_tensor):
@@ -70,8 +69,8 @@ class _InitializeClustersHook(session_run_hook.SessionRunHook):
 class GMM(estimator.Estimator):
   """An estimator for GMM clustering."""
   SCORES = 'scores'
+  LOG_LIKELIHOOD = 'loss'
   ASSIGNMENTS = 'assignments'
-  ALL_SCORES = 'all_scores'
 
   def __init__(self,
                num_clusters,
@@ -113,10 +112,7 @@ class GMM(estimator.Estimator):
       yield result[GMM.ASSIGNMENTS]
 
   def score(self, input_fn=None, batch_size=None, steps=None):
-    """Predict total sum of distances to nearest clusters.
-
-    Note that this function is different from the corresponding one in sklearn
-    which returns the negative of the sum of distances.
+    """Predict total log-likelihood.
 
     Args:
       input_fn: see predict.
@@ -124,11 +120,11 @@ class GMM(estimator.Estimator):
       steps: see predict.
 
     Returns:
-      Total sum of distances to nearest clusters.
+      Total log-likelihood.
     """
     results = self.evaluate(input_fn=input_fn, batch_size=batch_size,
                             steps=steps)
-    return np.sum(results[GMM.SCORES])
+    return np.log(np.sum(np.exp(results[GMM.SCORES])))
 
   def weights(self):
     """Returns the cluster weights."""
@@ -158,26 +154,26 @@ class GMM(estimator.Estimator):
     def _model_fn(features, labels, mode, config):
       """Model function."""
       assert labels is None, labels
-      (all_scores,
+      (loss,
+       scores,
        model_predictions,
-       losses, training_op,
+       training_op,
        init_op,
        is_initialized) = gmm_ops.gmm(self._parse_tensor_or_dict(features),
                                      self._training_initial_clusters,
                                      self._num_clusters, self._random_seed,
                                      self._covariance_type,
                                      self._params)
-      incr_step = state_ops.assign_add(variables.get_global_step(), 1)
-      loss = math_ops.reduce_sum(losses)
+      incr_step = state_ops.assign_add(training_util.get_global_step(), 1)
       training_op = with_dependencies([training_op, incr_step], loss)
       training_hooks = [_InitializeClustersHook(
           init_op, is_initialized, config.is_chief)]
       predictions = {
-          GMM.ALL_SCORES: all_scores[0],
           GMM.ASSIGNMENTS: model_predictions[0][0],
       }
       eval_metric_ops = {
-          GMM.SCORES: _streaming_sum(loss),
+          GMM.SCORES: scores,
+          GMM.LOG_LIKELIHOOD: _streaming_sum(loss),
       }
       return model_fn_lib.ModelFnOps(mode=mode, predictions=predictions,
                                      eval_metric_ops=eval_metric_ops,
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
index a61681c7f5a69a0fff1089404fc80b95c1c3106e..98d6434f4752b224201e38bed05ccd14428a758b 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
@@ -21,7 +21,6 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -36,7 +35,6 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.embedding_ops import embedding_lookup
-from tensorflow.python.summary import summary
 
 # Machine epsilon.
 MEPS = np.finfo(float).eps
@@ -253,14 +251,16 @@ class GmmAlgorithm(object):
     return ret
 
   def scores(self):
-    """Returns the distances to each class.
+    """Returns the per-sample likelihood fo the data.
 
     Returns:
-      A tuple with two Tensors. The first contains the distance to
-    each class. The second contains the distance to the assigned
-    class.
+      Log probabilities of each data point.
     """
-    return (self._all_scores, self._scores)
+    return self._scores
+
+  def log_likelihood_op(self):
+    """Returns the log-likelihood operation."""
+    return self._log_likelihood_op
 
   def _define_graph(self, data):
     """Define graph for a single iteration.
@@ -276,7 +276,8 @@ class GmmAlgorithm(object):
       self._define_expectation_operation(shard_id)
       self._define_partial_maximization_operation(shard_id, shard)
     self._define_maximization_operation(len(data))
-    self._define_distance_to_clusters(data)
+    self._define_loglikelihood_operation()
+    self._define_score_samples()
 
   def _define_full_covariance_probs(self, shard_id, shard):
     """Defines the full covariance probabilties per example in a class.
@@ -440,50 +441,20 @@ class GmmAlgorithm(object):
                 state_ops.assign(
                     self._covs, new_covs, validate_shape=False))
 
-  def _define_distance_to_clusters(self, data):
-    """Defines the Mahalanobis distance to the assigned Gaussian."""
-    # TODO(xavigonzalvo): reuse (input - mean) * cov^-1 * (input -
-    # mean) from log probability function.
-    self._all_scores = []
-    for shard in data:
-      all_scores = []
-      shard = array_ops.expand_dims(shard, 0)
-      for c in xrange(self._num_classes):
-        if self._covariance_type == FULL_COVARIANCE:
-          cov = self._covs[c, :, :]
-        elif self._covariance_type == DIAG_COVARIANCE:
-          cov = array_ops.diag(self._covs[c, :])
-        inverse = linalg_ops.matrix_inverse(cov + self._min_var)
-        inv_cov = array_ops.tile(
-            array_ops.expand_dims(inverse, 0),
-            array_ops.stack([self._num_examples, 1, 1]))
-        diff = array_ops.transpose(shard - self._means[c, :, :], perm=[1, 0, 2])
-        m_left = math_ops.matmul(diff, inv_cov)
-        all_scores.append(
-            math_ops.sqrt(
-                math_ops.matmul(
-                    m_left, array_ops.transpose(
-                        diff, perm=[0, 2, 1]))))
-      self._all_scores.append(
-          array_ops.reshape(
-              array_ops.concat(all_scores, 1),
-              array_ops.stack([self._num_examples, self._num_classes])))
-
-    # Distance to the associated class.
-    self._all_scores = array_ops.concat(self._all_scores, 0)
-    assignments = array_ops.concat(self.assignments(), 0)
-    rows = math_ops.to_int64(math_ops.range(0, self._num_examples))
-    indices = array_ops.concat(
-        [array_ops.expand_dims(rows, 1), array_ops.expand_dims(assignments, 1)],
-        1)
-    self._scores = array_ops.gather_nd(self._all_scores, indices)
-
   def _define_loglikelihood_operation(self):
     """Defines the total log-likelihood of current iteration."""
-    self._ll_op = []
+    op = []
     for prior_probs in self._prior_probs:
-      self._ll_op.append(math_ops.reduce_sum(math_ops.log(prior_probs)))
-    summary.scalar('ll', math_ops.reduce_sum(self._ll_op))
+      op.append(math_ops.reduce_logsumexp(prior_probs))
+    self._log_likelihood_op = math_ops.reduce_logsumexp(op)
+
+  def _define_score_samples(self):
+    """Defines the likelihood of each data sample."""
+    op = []
+    for shard_id, prior_probs in enumerate(self._prior_probs):
+      op.append(prior_probs + math_ops.log(self._w[shard_id]))
+    self._scores = array_ops.squeeze(
+        math_ops.reduce_logsumexp(op, axis=2, keep_dims=True), axis=0)
 
 
 def gmm(inp,
@@ -511,14 +482,9 @@ def gmm(inp,
   Returns:
     Note: tuple of lists returned to be consistent with skflow
     A tuple consisting of:
-    all_scores: A matrix (or list of matrices) of dimensions (num_input,
-      num_clusters) where the value is the distance of an input vector and a
-      cluster center.
     assignments: A vector (or list of vectors). Each element in the vector
       corresponds to an input row in 'inp' and specifies the cluster id
       corresponding to the input.
-    scores: Similar to assignments but specifies the distance to the
-      assigned cluster instead.
     training_op: an op that runs an iteration of training.
     init_op: an op that runs the initialization.
   """
@@ -532,6 +498,7 @@ def gmm(inp,
   gmm_tool = GmmAlgorithm(inp, num_clusters, initial_means, params,
                           covariance_type, random_seed)
   assignments = gmm_tool.assignments()
-  all_scores, scores = gmm_tool.scores()
-  return ([all_scores], [assignments], [scores], gmm_tool.training_ops(),
+  scores = gmm_tool.scores()
+  loss = gmm_tool.log_likelihood_op()
+  return (loss, scores, [assignments], gmm_tool.training_ops(),
           gmm_tool.init_ops(), gmm_tool.is_initialized())
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops_test.py b/tensorflow/contrib/factorization/python/ops/gmm_ops_test.py
index c50e82db8a230012ba13c1d7ad7e28c23bd27355..888c3c238c2654ea11ea3bf8270d6c3fcd951a03 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_ops_test.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops_test.py
@@ -122,17 +122,23 @@ class GmmOpsTest(test.TestCase):
       g.seed = 5
       with self.test_session() as sess:
         data = constant_op.constant(self.data, dtype=dtypes.float32)
-        _, assignments, _, training_op, init_op, _ = gmm_ops.gmm(
+        loss_op, scores, assignments, training_op, init_op, _ = gmm_ops.gmm(
             data, 'random', num_classes, random_seed=self.seed)
 
         variables.global_variables_initializer().run()
         sess.run(init_op)
+        first_loss = sess.run(loss_op)
         for _ in xrange(self.iterations):
           sess.run(training_op)
         assignments = sess.run(assignments)
+        end_loss = sess.run(loss_op)
+        scores = sess.run(scores)
+        self.assertEqual((self.num_examples, 1), scores.shape)
         accuracy = np.mean(
             np.asarray(self.true_assignments) == np.squeeze(assignments))
         logging.info('Accuracy: %f', accuracy)
+        logging.info('First loss: %f, end loss: %f', first_loss, end_loss)
+        self.assertGreater(end_loss, first_loss)
         self.assertGreater(accuracy, 0.98)
 
   def testParams(self):
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_test.py b/tensorflow/contrib/factorization/python/ops/gmm_test.py
index 7717b47daefce9ff65b1f1e84f671a463cf2e826..00a4734eb6d89cd02484f1c5161366377cc71208 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_test.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.factorization.python.ops import gmm as gmm_lib
 from tensorflow.contrib.learn.python.learn.estimators import kmeans
@@ -30,12 +29,9 @@ from tensorflow.python.framework import random_seed as random_seed_lib
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.platform import flags
 from tensorflow.python.platform import test
 from tensorflow.python.training import queue_runner
 
-FLAGS = flags.FLAGS
-
 
 class GMMTest(test.TestCase):
 
@@ -64,9 +60,8 @@ class GMMTest(test.TestCase):
     self.batch_size = self.num_points
     self.true_centers = self.make_random_centers(self.num_centers,
                                                  self.num_dims)
-    self.points, self.assignments, self.scores = self.make_random_points(
+    self.points, self.assignments = self.make_random_points(
         self.true_centers, self.num_points)
-    self.true_score = np.add.reduce(self.scores)
 
     # Use initial means from kmeans (just like scikit-learn does).
     clusterer = kmeans.KMeansClustering(num_clusters=self.num_centers)
@@ -86,24 +81,7 @@ class GMMTest(test.TestCase):
     offsets = np.round(
         np.random.randn(num_points, num_dims).astype(np.float32) * 20)
     points = centers[assignments] + offsets
-    means = [
-        np.mean(
-            points[assignments == center], axis=0)
-        for center in xrange(num_centers)
-    ]
-    covs = [
-        np.cov(points[assignments == center].T)
-        for center in xrange(num_centers)
-    ]
-    scores = []
-    for r in xrange(num_points):
-      scores.append(
-          np.sqrt(
-              np.dot(
-                  np.dot(points[r, :] - means[assignments[r]],
-                         np.linalg.inv(covs[assignments[r]])), points[r, :] -
-                  means[assignments[r]])))
-    return (points, assignments, scores)
+    return (points, assignments)
 
   def test_weights(self):
     """Tests the shape of the weights."""
@@ -136,8 +114,7 @@ class GMMTest(test.TestCase):
     gmm.fit(input_fn=self.input_fn(), steps=10)
     score2 = gmm.score(input_fn=self.input_fn(batch_size=self.num_points),
                        steps=1)
-    self.assertGreater(score1, score2)
-    self.assertNear(self.true_score, score2, self.true_score * 0.15)
+    self.assertLess(score1, score2)
 
   def test_infer(self):
     gmm = gmm_lib.GMM(self.num_centers,
@@ -149,8 +126,7 @@ class GMMTest(test.TestCase):
 
     # Make a small test set
     num_points = 40
-    points, true_assignments, true_offsets = (
-        self.make_random_points(clusters, num_points))
+    points, true_assignments = self.make_random_points(clusters, num_points)
 
     assignments = []
     for item in gmm.predict_assignments(
@@ -159,11 +135,6 @@ class GMMTest(test.TestCase):
     assignments = np.ravel(assignments)
     self.assertAllEqual(true_assignments, assignments)
 
-    # Test score
-    score = gmm.score(input_fn=self.input_fn(points=points,
-                                             batch_size=num_points), steps=1)
-    self.assertNear(score, np.sum(true_offsets), 4.05)
-
   def _compare_with_sklearn(self, cov_type):
     # sklearn version.
     iterations = 40
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py
index 9a5413fc3f2642443621b33d325e3d8c893fd6ac..c861cfff544a78617aa1ace730b50c094cf16330 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans.py
@@ -25,6 +25,7 @@ import time
 from tensorflow.contrib.factorization.python.ops import clustering_ops
 from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator.export import export_output
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -32,6 +33,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.summary import summary
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training_util
@@ -141,7 +143,7 @@ class _ModelFn(object):
   def model_fn(self, features, mode, config):
     """Model function for the estimator.
 
-    Note that this does not take a `1abels` arg. This works, but `input_fn` must
+    Note that this does not take a `labels` arg. This works, but `input_fn` must
     return either `features` or, equivalently, `(features, None)`.
 
     Args:
@@ -207,6 +209,15 @@ class _ModelFn(object):
       training_hooks.append(
           _LossRelativeChangeHook(loss, self._relative_tolerance))
 
+    export_outputs = {
+        KMeansClustering.ALL_DISTANCES:
+            export_output.PredictOutput(all_distances[0]),
+        KMeansClustering.CLUSTER_INDEX:
+            export_output.PredictOutput(model_predictions[0]),
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+            export_output.PredictOutput(model_predictions[0])
+    }
+
     return model_fn_lib.EstimatorSpec(
         mode=mode,
         predictions={
@@ -216,7 +227,8 @@ class _ModelFn(object):
         loss=loss,
         train_op=training_op,
         eval_metric_ops={KMeansClustering.SCORE: metrics.mean(loss)},
-        training_hooks=training_hooks)
+        training_hooks=training_hooks,
+        export_outputs=export_outputs)
 
 
 # TODO(agarwal,ands): support sharded input.
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans_test.py b/tensorflow/contrib/factorization/python/ops/kmeans_test.py
index 4709d7942583f1406a3fa0ff3a078d0283872ea6..f9598bfc08c05ea3bba88b3135da0cf2e6bb0c95 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans_test.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans_test.py
@@ -194,15 +194,7 @@ class KMeansTest(KMeansTestBase):
     score = kmeans.score(input_fn=self.input_fn(batch_size=self.num_points))
     self.assertNear(self.true_score, score, self.true_score * 0.01)
 
-  def test_infer(self):
-    kmeans = self._kmeans()
-    # Make a call to fit to initialize the cluster centers.
-    max_steps = 1
-    kmeans.train(input_fn=self.input_fn(), max_steps=max_steps)
-    clusters = kmeans.cluster_centers()
-
-    # Make a small test set
-    num_points = 10
+  def _infer_helper(self, kmeans, clusters, num_points):
     points, true_assignments, true_offsets = make_random_points(
         clusters, num_points)
     input_fn = self.input_fn(batch_size=num_points, points=points, num_epochs=1)
@@ -223,6 +215,17 @@ class KMeansTest(KMeansTestBase):
             np.sum(np.square(clusters), axis=1, keepdims=True)))
     self.assertAllClose(transform, true_transform, rtol=0.05, atol=10)
 
+  def test_infer(self):
+    kmeans = self._kmeans()
+    # Make a call to fit to initialize the cluster centers.
+    max_steps = 1
+    kmeans.train(input_fn=self.input_fn(), max_steps=max_steps)
+    clusters = kmeans.cluster_centers()
+
+    # Run inference on small datasets.
+    self._infer_helper(kmeans, clusters, 10)
+    self._infer_helper(kmeans, clusters, 1)
+
 
 class KMeansTestMultiStageInit(KMeansTestBase):
 
diff --git a/tensorflow/contrib/feature_column/BUILD b/tensorflow/contrib/feature_column/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..6fc053759c58d30c24657dd22e7d12be46fc7a7e
--- /dev/null
+++ b/tensorflow/contrib/feature_column/BUILD
@@ -0,0 +1,37 @@
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "feature_column_py",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":sequential_feature_column",
+    ],
+)
+
+py_library(
+    name = "sequential_feature_column",
+    srcs = ["python/feature_column/sequential_feature_column.py"],
+    srcs_version = "PY2AND3",
+    deps = [],
+)
diff --git a/tensorflow/contrib/feature_column/__init__.py b/tensorflow/contrib/feature_column/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6da7b126931effae9cc97091a27070d7013450d4
--- /dev/null
+++ b/tensorflow/contrib/feature_column/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental utilities for tf.feature_column."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import,line-too-long,wildcard-import
+from tensorflow.contrib.feature_column.python.feature_column.sequential_feature_column import *
+
+from tensorflow.python.util.all_util import remove_undocumented
+# pylint: enable=unused-import,line-too-long,wildcard-import
+
+_allowed_symbols = [
+]
+
+remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column.py
new file mode 100644
index 0000000000000000000000000000000000000000..690a44ff4368663306733300a1ea70397fb93e1e
--- /dev/null
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequential_feature_column.py
@@ -0,0 +1,19 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental methods for tf.feature_column sequential input."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/ffmpeg/BUILD b/tensorflow/contrib/ffmpeg/BUILD
index dc5a04a0b15870babbc98cf104e109caf829901c..eccce99071dc1477cf4f3bb152f3304b3b0fc35a 100644
--- a/tensorflow/contrib/ffmpeg/BUILD
+++ b/tensorflow/contrib/ffmpeg/BUILD
@@ -155,7 +155,10 @@ tf_py_test(
     data = [
         ":test_data",
     ],
-    tags = ["manual"],
+    tags = [
+        "manual",
+        "notap",
+    ],
 )
 
 py_library(
diff --git a/tensorflow/contrib/ffmpeg/__init__.py b/tensorflow/contrib/ffmpeg/__init__.py
index 871dff7bbe4912f0daf2bc184d6b0f12510abee7..daba965a98893b992abdc598ec713f13020d6e91 100644
--- a/tensorflow/contrib/ffmpeg/__init__.py
+++ b/tensorflow/contrib/ffmpeg/__init__.py
@@ -26,6 +26,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_audio
+from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video
 from tensorflow.contrib.ffmpeg.ffmpeg_ops import encode_audio
 from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video
 
diff --git a/tensorflow/contrib/ffmpeg/decode_audio_op.cc b/tensorflow/contrib/ffmpeg/decode_audio_op.cc
index 92fad70b1f9cc55e0690a3fbb35abcf56aa68f16..5ab57ca4cd413bd92f1576278b22d2602c905309 100644
--- a/tensorflow/contrib/ffmpeg/decode_audio_op.cc
+++ b/tensorflow/contrib/ffmpeg/decode_audio_op.cc
@@ -44,7 +44,7 @@ const char* kValidFileFormats[] = {"mp3", "mp4", "ogg", "wav"};
 void Decode(OpKernelContext* context,
             const tensorflow::StringPiece& file_contents,
             const string& file_format, const int32 samples_per_second,
-            const int32 channel_count) {
+            const int32 channel_count, const string& stream) {
   // Write the input data to a temp file.
   const string temp_filename = io::GetTempFilename(file_format);
   OP_REQUIRES_OK(context, WriteFile(temp_filename, file_contents));
@@ -54,7 +54,7 @@ void Decode(OpKernelContext* context,
   std::vector<float> output_samples;
   Status result =
       ffmpeg::ReadAudioFile(temp_filename, file_format, samples_per_second,
-                            channel_count, &output_samples);
+                            channel_count, stream, &output_samples);
   if (result.code() == error::Code::NOT_FOUND) {
     OP_REQUIRES(
         context, result.ok(),
@@ -99,7 +99,12 @@ void Decode(OpKernelContext* context,
  */
 class DecodeAudioOpV2 : public OpKernel {
  public:
-  explicit DecodeAudioOpV2(OpKernelConstruction* context) : OpKernel(context) {}
+  explicit DecodeAudioOpV2(OpKernelConstruction* context) : OpKernel(context) {
+    string stream;
+    if (context->GetAttr("stream", &stream).ok()) {
+      stream_ = stream;
+    }
+  }
 
   void Compute(OpKernelContext* context) override {
     OP_REQUIRES(
@@ -153,8 +158,12 @@ class DecodeAudioOpV2 : public OpKernel {
         errors::InvalidArgument("channel_count must be positive, but got: ",
                                 channel_count));
 
-    Decode(context, contents, file_format, samples_per_second, channel_count);
+    Decode(context, contents, file_format, samples_per_second, channel_count,
+           stream_);
   }
+
+ private:
+  string stream_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("DecodeAudioV2").Device(DEVICE_CPU),
@@ -166,6 +175,7 @@ REGISTER_OP("DecodeAudioV2")
     .Input("samples_per_second: int32")
     .Input("channel_count: int32")
     .Output("sampled_audio: float")
+    .Attr("stream: string = ''")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       const Tensor* channels_tensor = c->input_tensor(3);
       if (channels_tensor == nullptr) {
@@ -237,7 +247,7 @@ class DecodeAudioOp : public OpKernel {
 
     const tensorflow::StringPiece file_contents = contents.scalar<string>()();
     Decode(context, file_contents, file_format_, samples_per_second_,
-           channel_count_);
+           channel_count_, "");
   }
 
  private:
diff --git a/tensorflow/contrib/ffmpeg/decode_audio_op_test.py b/tensorflow/contrib/ffmpeg/decode_audio_op_test.py
index 0d7c9cb99e8a5fad4a7ccf86d7253170ace91fd7..3dc663bb6f589d09ed067eae09d7d7dd0c40ec95 100644
--- a/tensorflow/contrib/ffmpeg/decode_audio_op_test.py
+++ b/tensorflow/contrib/ffmpeg/decode_audio_op_test.py
@@ -33,7 +33,8 @@ class DecodeAudioOpTest(test.TestCase):
 
   def _loadFileAndTest(self, filename, file_format, duration_sec,
                        samples_per_second, channel_count,
-                       samples_per_second_tensor=None, feed_dict=None):
+                       samples_per_second_tensor=None, feed_dict=None,
+                       stream=None):
     """Loads an audio file and validates the output tensor.
 
     Args:
@@ -49,6 +50,9 @@ class DecodeAudioOpTest(test.TestCase):
       feed_dict: Used when evaluating the `decode_audio` op. If not
         provided, will be empty. Useful when providing a placeholder for
         `samples_per_second_tensor`.
+      stream: A string specifying which stream from the content file
+        should be decoded. The default value is '' which leaves the
+        decision to ffmpeg.
     """
     if samples_per_second_tensor is None:
       samples_per_second_tensor = samples_per_second
@@ -62,7 +66,7 @@ class DecodeAudioOpTest(test.TestCase):
           contents,
           file_format=file_format,
           samples_per_second=samples_per_second_tensor,
-          channel_count=channel_count)
+          channel_count=channel_count, stream=stream)
       audio = audio_op.eval(feed_dict=feed_dict or {})
       self.assertEqual(len(audio.shape), 2)
       self.assertNear(
@@ -72,6 +76,17 @@ class DecodeAudioOpTest(test.TestCase):
           0.1 * audio.shape[0])
       self.assertEqual(audio.shape[1], channel_count)
 
+  def testStreamIdentifier(self):
+    # mono_16khz_mp3_32khz_aac.mp4 was generated from:
+    # ffmpeg -i tensorflow/contrib/ffmpeg/testdata/mono_16khz_mp3.mp4 \
+    #        -i tensorflow/contrib/ffmpeg/testdata/mono_32khz_aac.mp4 \
+    #        -strict -2 -map 0:a -map 1:a \
+    #        tensorflow/contrib/ffmpeg/testdata/mono_16khz_mp3_32khz_aac.mp4
+    self._loadFileAndTest('mono_16khz_mp3_32khz_aac.mp4', 'mp4', 2.77, 20000,
+                          1, stream='0')
+    self._loadFileAndTest('mono_16khz_mp3_32khz_aac.mp4', 'mp4', 2.77, 20000,
+                          1, stream='1')
+
   def testMonoMp3(self):
     self._loadFileAndTest('mono_16khz.mp3', 'mp3', 0.57, 20000, 1)
     self._loadFileAndTest('mono_16khz.mp3', 'mp3', 0.57, 20000, 2)
diff --git a/tensorflow/contrib/ffmpeg/decode_video_op.cc b/tensorflow/contrib/ffmpeg/decode_video_op.cc
index d44032968d559bec14722902a4d47d22c46ea4aa..6f8ad486d10a825a277749157d68fa671b9f8d3a 100644
--- a/tensorflow/contrib/ffmpeg/decode_video_op.cc
+++ b/tensorflow/contrib/ffmpeg/decode_video_op.cc
@@ -102,16 +102,12 @@ REGISTER_OP("DecodeVideo")
       return Status::OK();
     })
     .Doc(R"doc(
-Processes the contents of an audio file into a tensor using FFmpeg to decode
+Processes the contents of an video file into a tensor using FFmpeg to decode
 the file.
 
-One row of the tensor is created for each channel in the audio file. Each
-channel contains audio samples starting at the beginning of the audio and
-having `1/samples_per_second` time between them. If the `channel_count` is
-different from the contents of the file, channels will be merged or created.
-
-contents: The binary audio file contents, as a string or rank-0 string
-    tensor.
+contents: The binary contents of the video file to decode. This is a
+    scalar.
+output: A rank-4 `Tensor` that has `[frames, height, width, 3]` RGB as output.
 )doc");
 
 }  // namespace ffmpeg
diff --git a/tensorflow/contrib/ffmpeg/decode_video_op_test.py b/tensorflow/contrib/ffmpeg/decode_video_op_test.py
index 4d1fac4ef8afbf44cd45bae065f8a95b0527079a..b43b6b8919223bd7731209d5423b142601396ea5 100644
--- a/tensorflow/contrib/ffmpeg/decode_video_op_test.py
+++ b/tensorflow/contrib/ffmpeg/decode_video_op_test.py
@@ -20,11 +20,9 @@ from __future__ import print_function
 
 import os.path
 
-import six
+import six  # pylint: disable=unused-import
 
 from tensorflow.contrib import ffmpeg
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import image_ops
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.platform import test
@@ -32,7 +30,8 @@ from tensorflow.python.platform import test
 
 class DecodeVideoOpTest(test.TestCase):
 
-  def _loadFileAndTest(self, filename, width, height, frames, bmp_filename, index):
+  def _loadFileAndTest(self, filename, width, height, frames, bmp_filename,
+                       index):
     """Loads an video file and validates the output tensor.
 
     Args:
@@ -40,6 +39,8 @@ class DecodeVideoOpTest(test.TestCase):
       width: The width of the video.
       height: The height of the video.
       frames: The frames of the video.
+      bmp_filename: The filename for the bmp file.
+      index: Index location inside the video.
     """
     with self.test_session():
       path = os.path.join(resource_loader.get_data_files_path(), 'testdata',
@@ -48,7 +49,7 @@ class DecodeVideoOpTest(test.TestCase):
         contents = f.read()
 
       bmp_path = os.path.join(resource_loader.get_data_files_path(), 'testdata',
-                          bmp_filename)
+                              bmp_filename)
       with open(bmp_path, 'rb') as f:
         bmp_contents = f.read()
 
@@ -58,7 +59,7 @@ class DecodeVideoOpTest(test.TestCase):
       video_op = ffmpeg.decode_video(contents)
       video = video_op.eval()
       self.assertEqual(video.shape, (frames, height, width, 3))
-      self.assertAllEqual(video[index,:,:,:], image)
+      self.assertAllEqual(video[index, :, :, :], image)
 
   def testMp4(self):
     self._loadFileAndTest('small.mp4', 560, 320, 166, 'small_100.bmp', 99)
diff --git a/tensorflow/contrib/ffmpeg/default/BUILD b/tensorflow/contrib/ffmpeg/default/BUILD
index 949ae9ad9e4b045ee1b5cc82d49c0e7468c2005d..6b455567d766dbe6d380a498bd7f521db27e077b 100644
--- a/tensorflow/contrib/ffmpeg/default/BUILD
+++ b/tensorflow/contrib/ffmpeg/default/BUILD
@@ -19,6 +19,7 @@ cc_library(
     ],
     deps = [
         "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
         "@protobuf_archive//:protobuf_headers",
     ],
 )
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
index 201774e1d011f35df9c3803f2ed8818cc9b1c1c2..e61221a6b0d34373279a379f356c99c379488182 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
@@ -44,39 +44,43 @@ std::vector<string> FfmpegAudioCommandLine(const string& input_filename,
                                            const string& output_filename,
                                            const string& input_format_id,
                                            int32 samples_per_second,
-                                           int32 channel_count) {
-  return {"-nostats",             // No additional progress display.
-          "-nostdin",             // No interactive commands accepted.
-          "-f", input_format_id,  // eg: "mp3"
-          "-probesize", StrCat(kDefaultProbeSize), "-i", input_filename,
-          "-loglevel", "info",  // Enable verbose logging to support debugging.
-          "-map_metadata", "-1",  // Copy global metadata from input to output.
-          "-vn",                  // No video recording.
-          "-ac:a:0", StrCat(channel_count), "-ar:a:0",
-          StrCat(samples_per_second),
-          // Output set (in several ways) to signed 16-bit little-endian ints.
-          "-codec:a:0", "pcm_s16le", "-sample_fmt", "s16", "-f", "s16le",
-          "-sn",  // No subtitle recording.
-          "-y",   // Overwrite output file.
-          StrCat(output_filename)};
+                                           int32 channel_count,
+                                           const string& stream) {
+  std::vector<string> command({
+      "-nostats",             // No additional progress display.
+      "-nostdin",             // No interactive commands accepted.
+      "-f", input_format_id,  // eg: "mp3"
+      "-probesize", StrCat(kDefaultProbeSize), "-i", input_filename,
+      "-loglevel", "error",   // Print errors only.
+      "-hide_banner",         // Skip printing build options, version, etc.
+      "-map_metadata", "-1",  // Copy global metadata from input to output.
+      "-vn",                  // No video recording.
+      "-ac:a:0", StrCat(channel_count), "-ar:a:0", StrCat(samples_per_second),
+      // Output set (in several ways) to signed 16-bit little-endian ints.
+      "-codec:a:0", "pcm_s16le", "-sample_fmt", "s16", "-f", "s16le",
+      "-sn",  // No subtitle recording.
+      "-y"    // Overwrite output file.
+  });
+  if (!stream.empty()) {
+    command.emplace_back("-map");
+    command.emplace_back(StrCat("0:", stream));
+  }
+  command.emplace_back(StrCat(output_filename));
+
+  return command;
 }
 
 std::vector<string> FfmpegVideoCommandLine(const string& input_filename,
                                            const string& output_filename) {
   return {"-nostats",  // No additional progress display.
           "-nostdin",  // No interactive commands accepted.
-          "-i",
-          input_filename,
-          "-f",
-          "image2pipe",
-          "-probesize",
-          StrCat(kDefaultProbeSize),
-          "-loglevel",
-          "info",  // Enable verbose logging to support debugging.
-          "-vcodec",
-          "rawvideo",
-          "-pix_fmt",
-          "rgb24",
+          "-i", input_filename, "-f", "image2pipe", "-probesize",
+          StrCat(kDefaultProbeSize), "-loglevel",
+          // Info is needed to get the information about stream, etc.
+          // It is generated to a separate file, not stdout/stderr.
+          "info",
+          "-hide_banner",  // Skip printing build options, version, etc.
+          "-vcodec", "rawvideo", "-pix_fmt", "rgb24",
           "-y",  // Overwrite output file.
           StrCat(output_filename)};
 }
@@ -121,7 +125,6 @@ bool IsBinaryInstalled(const string& binary_name) {
   std::transform(args.begin(), args.end(), std::back_inserter(args_chars),
                  [](const string& s) { return const_cast<char*>(s.c_str()); });
   args_chars.push_back(nullptr);
-
   ::execvp(kFfmpegExecutable, args_chars.data());
   // exec only returns on error.
   const int error = errno;
@@ -220,7 +223,8 @@ string BuildWavFile(int32 samples_per_second, int32 channel_count,
 Status ReadInfoFile(const string& filename, uint32* width, uint32* height,
                     uint32* frames) {
   string data;
-  ReadFileToString(Env::Default(), filename, &data);
+  TF_QCHECK_OK(ReadFileToString(Env::Default(), filename, &data))
+      << "Could not read FFmpeg file: " << filename;
   bool in_output = false;
   bool in_mapping = false;
   uint32 frames_value = 0;
@@ -305,13 +309,12 @@ Status WriteFile(const string& filename, StringPiece contents) {
 
 Status ReadAudioFile(const string& filename, const string& audio_format_id,
                      int32 samples_per_second, int32 channel_count,
-                     std::vector<float>* output_samples) {
+                     const string& stream, std::vector<float>* output_samples) {
   // Create an argument list.
   string output_filename = io::GetTempFilename("raw");
   const std::vector<string> args =
       FfmpegAudioCommandLine(filename, output_filename, audio_format_id,
-                             samples_per_second, channel_count);
-
+                             samples_per_second, channel_count, stream);
   // Unfortunately, it's impossible to differentiate an exec failure due to the
   // binary being missing and an error from the binary's execution. Therefore,
   // check to see if the binary *should* be available. If not, return an error
@@ -365,7 +368,6 @@ Status ReadVideoFile(const string& filename, std::vector<uint8>* output_data,
   // Create an argument list.
   const std::vector<string> args =
       FfmpegVideoCommandLine(filename, output_filename);
-
   // Execute ffmpeg and report errors.
   pid_t child_pid = ::fork();
   if (child_pid < 0) {
@@ -377,7 +379,7 @@ Status ReadVideoFile(const string& filename, std::vector<uint8>* output_data,
         open(stderr_filename.c_str(), O_RDWR | O_CREAT | O_APPEND, 0600);
     if (fd < 0) {
       const int error = errno;
-      LOG(ERROR) << "FFmpeg stderr file coule not be created: "
+      LOG(ERROR) << "FFmpeg stderr file could not be created: "
                  << strerror(error);
       ::_exit(error);
     }
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc
index 85b61b26163d87a10d4e316720b4f633e038bbec..05728b3d37570d06f2f8af67e3b0612d21d07601 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc
@@ -32,10 +32,8 @@ namespace tensorflow {
 namespace ffmpeg {
 namespace {
 
-const char kTestWavFilename[] =
-    "contrib/ffmpeg/testdata/mono_10khz.wav";
-const char kTestMp3Filename[] =
-    "contrib/ffmpeg/testdata/test_sound1.mp3";
+const char kTestWavFilename[] = "contrib/ffmpeg/testdata/mono_10khz.wav";
+const char kTestMp3Filename[] = "contrib/ffmpeg/testdata/test_sound1.mp3";
 
 // Set to true via a command line flag iff the test is expected to have FFmpeg
 // installed.
@@ -139,7 +137,7 @@ TEST(FfmpegLibTest, TestRoundTripWav) {
 }  // namespace ffmpeg
 }  // namespace tensorflow
 
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   tensorflow::string usage = tensorflow::ffmpeg::ParseTestFlags(&argc, argv);
   testing::InitGoogleTest(&argc, argv);
   if (argc != 1) {
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc
index 39e7e90cccf1012eb42261bde55d0dc3b7f278ef..d6c885a32424334bfc28c830e3701f219aa244ee 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc
@@ -20,9 +20,8 @@
 #include <string>
 #include <vector>
 
-
-#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/test.h"
diff --git a/tensorflow/contrib/ffmpeg/ffmpeg_lib.h b/tensorflow/contrib/ffmpeg/ffmpeg_lib.h
index c5ea1432bf8b61c87615074a93a45325371c4c87..a8d5a0dd83fb504b5e6671c3e82dc7d2dd3e6a9b 100644
--- a/tensorflow/contrib/ffmpeg/ffmpeg_lib.h
+++ b/tensorflow/contrib/ffmpeg/ffmpeg_lib.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 // =============================================================================
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_FFMPEG_FFMPEG_LIB_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_FFMPEG_FFMPEG_LIB_H_
+#ifndef TENSORFLOW_CONTRIB_FFMPEG_FFMPEG_LIB_H_
+#define TENSORFLOW_CONTRIB_FFMPEG_FFMPEG_LIB_H_
 
 #include <string>
 #include <vector>
@@ -42,7 +42,7 @@ Status WriteFile(const string& filename, tensorflow::StringPiece contents);
 // contain a separate sample for each channel. Frames are ordered by time.
 Status ReadAudioFile(const string& filename, const string& audio_format_id,
                      int32 samples_per_second, int32 channel_count,
-                     std::vector<float>* output_samples);
+                     const string& stream, std::vector<float>* output_samples);
 
 // Creates an audio file using ffmpeg in a specific format. The samples are in
 // [-1.0, 1.0]. If there are multiple channels in the audio then each frame will
@@ -61,4 +61,4 @@ Status ReadVideoFile(const string& filename, std::vector<uint8>* output_data,
 }  // namespace ffmpeg
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_FFMPEG_DEFAULT_FFMPEG_LIB_H_
+#endif  // TENSORFLOW_CONTRIB_FFMPEG_DEFAULT_FFMPEG_LIB_H_
diff --git a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
index 78ead471d2cf9f0654a06dc022d7cc592d14c710..020b5c99c61019254bef0b1dff6bc5901c92758a 100644
--- a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
+++ b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.ffmpeg.ops import gen_decode_audio_op_py
+from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py
 from tensorflow.contrib.ffmpeg.ops import gen_encode_audio_op_py
 from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py
 from tensorflow.contrib.util import loader
@@ -30,7 +31,7 @@ _ffmpeg_so = loader.load_op_library(
 
 
 def decode_audio(contents, file_format=None, samples_per_second=None,
-                 channel_count=None):
+                 channel_count=None, stream=None):
   """Create an op that decodes the contents of an audio file.
 
   Note that ffmpeg is free to select the "best" audio track from an mp4.
@@ -50,6 +51,9 @@ def decode_audio(contents, file_format=None, samples_per_second=None,
         `contents` have more than this number, then some channels will
         be merged or dropped. If `contents` has fewer than this, then
         additional channels will be created from the existing ones.
+    stream: A string specifying which stream from the content file
+        should be decoded, e.g., '0' means the 0-th stream.
+        The default value is '' which leaves the decision to ffmpeg.
 
   Returns:
     A rank-2 tensor that has time along dimension 0 and channels along
@@ -60,7 +64,7 @@ def decode_audio(contents, file_format=None, samples_per_second=None,
   """
   return gen_decode_audio_op_py.decode_audio_v2(
       contents, file_format=file_format, samples_per_second=samples_per_second,
-      channel_count=channel_count)
+      channel_count=channel_count, stream=stream)
 
 
 ops.NotDifferentiable('DecodeAudio')
diff --git a/tensorflow/contrib/ffmpeg/testdata/mono_16khz_mp3_32khz_aac.mp4 b/tensorflow/contrib/ffmpeg/testdata/mono_16khz_mp3_32khz_aac.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..2485da86d60837800fbb0b390c440e674de25993
Binary files /dev/null and b/tensorflow/contrib/ffmpeg/testdata/mono_16khz_mp3_32khz_aac.mp4 differ
diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD
index 5b659ddaa1386736eb8cc05a203ed1827ccd160e..9e5f54f0973eae899ca65e4098358107053cb7d4 100644
--- a/tensorflow/contrib/framework/BUILD
+++ b/tensorflow/contrib/framework/BUILD
@@ -11,11 +11,12 @@ package(default_visibility = [
 ])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
-load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")
 load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
 tf_custom_op_py_library(
     name = "framework_py",
@@ -31,8 +32,10 @@ tf_custom_op_py_library(
         "python/ops/arg_scope.py",
         "python/ops/audio_ops.py",
         "python/ops/checkpoint_ops.py",
+        "python/ops/critical_section_ops.py",
         "python/ops/ops.py",
         "python/ops/prettyprint_ops.py",
+        "python/ops/script_ops.py",
         "python/ops/sort_ops.py",
         "python/ops/variables.py",
     ],
@@ -60,6 +63,7 @@ tf_custom_op_py_library(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:pywrap_tensorflow",
+        "//tensorflow/python:script_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:state_ops_gen",
@@ -70,6 +74,7 @@ tf_custom_op_py_library(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:function",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -173,6 +178,21 @@ py_test(
     ],
 )
 
+cuda_py_test(
+    name = "critical_section_test",
+    size = "medium",
+    srcs = ["python/ops/critical_section_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        ":framework_py",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:resource_variable_ops",
+    ],
+)
+
 py_test(
     name = "accumulate_n_v2_eager_test",
     size = "small",
diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index 4edc77f86ba786ca547b8d3842e2cf02833fbbac..a49d42cd525434d4ffd4a6bb0d8854dc707b9280 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -53,6 +53,7 @@ See the @{$python/contrib.framework} guide.
 @@assign_from_values_fn
 @@create_global_step
 @@filter_variables
+@@fuse_op
 @@get_global_step
 @@get_or_create_global_step
 @@get_local_variables
@@ -81,7 +82,15 @@ See the @{$python/contrib.framework} guide.
 @@load_linear_multiclass_bias_initializer
 @@load_variable_slot_initializer
 
+@@py_func
 @@sort
+
+@@get_placeholders
+
+@@CriticalSection
+
+@@BoundedTensorSpec
+@@TensorSpec
 """
 
 from __future__ import absolute_import
@@ -96,6 +105,9 @@ from tensorflow.contrib.framework.python.ops import *
 from tensorflow.python.framework.ops import prepend_name_scope
 from tensorflow.python.framework.ops import strip_name_scope
 
+from tensorflow.python.framework.tensor_spec import BoundedTensorSpec
+from tensorflow.python.framework.tensor_spec import TensorSpec
+
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = ['nest']
diff --git a/tensorflow/contrib/framework/kernels/zero_initializer_op.cc b/tensorflow/contrib/framework/kernels/zero_initializer_op.cc
index 6677dca752f84fc1ba7548b7739df04b7aaf14f7..5bf6b67529579e71a615c27e035111a58d5c02e0 100644
--- a/tensorflow/contrib/framework/kernels/zero_initializer_op.cc
+++ b/tensorflow/contrib/framework/kernels/zero_initializer_op.cc
@@ -21,8 +21,8 @@ limitations under the License.
 
 #include "tensorflow/contrib/framework/kernels/zero_initializer_op.h"
 
-#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
 
 namespace tensorflow {
 
@@ -81,8 +81,8 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
 #define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 
 #undef REGISTER_KERNELS
 
-} // namespace tensorflow
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/framework/kernels/zero_initializer_op.h b/tensorflow/contrib/framework/kernels/zero_initializer_op.h
index 14c9268efa869ffd48b01dd2add44990ef7a43f8..99389a5ab6aa73c2ab0e522dd0f9fbc7093c8f4a 100644
--- a/tensorflow/contrib/framework/kernels/zero_initializer_op.h
+++ b/tensorflow/contrib/framework/kernels/zero_initializer_op.h
@@ -29,5 +29,5 @@ struct TensorSetZero {
 };
 }  // namespace functor
 
-} // end namespace tensorflow
-#endif // TENSORFLOW_CONTRIB_FRAMEWORK_KERNELS_ZERO_INITIALIZER_OP_H_
+}  // end namespace tensorflow
+#endif  // TENSORFLOW_CONTRIB_FRAMEWORK_KERNELS_ZERO_INITIALIZER_OP_H_
diff --git a/tensorflow/contrib/framework/ops/variable_ops.cc b/tensorflow/contrib/framework/ops/variable_ops.cc
index 1ee8e1498cf07559fe3db78ef832e2cdf26bea1c..706134ba9a51de6253ba7463b17ff662ea740ed0 100644
--- a/tensorflow/contrib/framework/ops/variable_ops.cc
+++ b/tensorflow/contrib/framework/ops/variable_ops.cc
@@ -26,8 +26,8 @@ REGISTER_OP("ZeroInitializer")
     .Attr("T: realnumbertype")
     .SetAllowsUninitializedInput()
     .SetShapeFn([](InferenceContext* c) {
-        c->set_output(0, c->input(0));
-        return Status::OK();
+      c->set_output(0, c->input(0));
+      return Status::OK();
     })
     .Doc(R"doc(
 Initialize 'ref' with all zeros. This op requires that the tensor is not
diff --git a/tensorflow/contrib/framework/python/framework/graph_util.py b/tensorflow/contrib/framework/python/framework/graph_util.py
index 6d5cde5c9e118d372a6532bfc593bd08b9e18a7b..49eec3a3f1a0f357ea3adfade51e71cb0f89942d 100644
--- a/tensorflow/contrib/framework/python/framework/graph_util.py
+++ b/tensorflow/contrib/framework/python/framework/graph_util.py
@@ -133,6 +133,18 @@ def fuse_op(graph_def, input_nodes, output_nodes, output_dtypes,
 def get_placeholders(graph):
   """Get placeholders of a graph.
 
+  For example:
+
+  ```python
+  a = tf.placeholder(dtype=tf.float32, shape=[2, 2], name='a')
+  a = tf.placeholder(dtype=tf.int32, shape=[3, 2], name='b')
+
+  tf.contrib.framework.get_placeholders(tf.get_default_graph())
+  # Returns:
+  #  [<tf.Tensor 'a:0' shape=(2, 2) dtype=float32>,
+  #   <tf.Tensor 'b:0' shape=(3, 2) dtype=int32>]
+  ```
+
   Args:
     graph: A tf.Graph.
   Returns:
@@ -150,5 +162,5 @@ def get_placeholders(graph):
   # The return value (a Tensor) of placeholder() is the
   # first output of this operation in fact.
   operations = graph.get_operations()
-  result = [i.outputs[0] for i in operations if i.type == 'Placeholder']
+  result = [i.outputs[0] for i in operations if i.type == "Placeholder"]
   return result
diff --git a/tensorflow/contrib/framework/python/framework/graph_util_test.py b/tensorflow/contrib/framework/python/framework/graph_util_test.py
index 0722fafc132c0db2ad621f6f9345185f34c643f5..b8a6d109e19211d271c2b15bac66ddacd38fe395 100644
--- a/tensorflow/contrib/framework/python/framework/graph_util_test.py
+++ b/tensorflow/contrib/framework/python/framework/graph_util_test.py
@@ -90,8 +90,9 @@ class GetPlaceholdersTest(test.TestCase):
     with ops.Graph().as_default() as g:
       placeholders = [array_ops.placeholder(dtypes.float32) for _ in range(5)]
       results = graph_util.get_placeholders(g)
-      self.assertEqual(sorted(placeholders, key=lambda x: x._id),  # pylint: disable=protected-access
-                       sorted(results, key=lambda x: x._id))  # pylint: disable=protected-access
+      self.assertEqual(
+          sorted(placeholders, key=lambda x: x._id),  # pylint: disable=protected-access
+          sorted(results, key=lambda x: x._id))  # pylint: disable=protected-access
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/framework/python/framework/tensor_util_test.py b/tensorflow/contrib/framework/python/framework/tensor_util_test.py
index 2effe8eb26e98caa2707315d5f2e0e530ead31d3..8cdb340f2ddd9b3a7f55c1937ef045f4627e99be 100644
--- a/tensorflow/contrib/framework/python/framework/tensor_util_test.py
+++ b/tensorflow/contrib/framework/python/framework/tensor_util_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
@@ -77,6 +78,7 @@ class AssertScalarIntTest(test.TestCase):
               [3, 4], dtype=dtypes.int32))
 
 
+@test_util.with_c_api
 class WithShapeTest(test.TestCase):
 
   def _assert_with_shape(self, tensor, expected_value, expected_shape,
@@ -213,16 +215,25 @@ class WithShapeTest(test.TestCase):
       tensor_partial_shape.set_shape([None, 2])
 
       for incompatible_shape in [[0], [1]]:
+        if ops._USE_C_API:
+          error_message = "Shapes must be equal rank, but are 2 and 1"
+        else:
+          error_message = r"Shapes \(\?, 2\) and \([01],\) are not compatible"
         self.assertRaisesRegexp(
-            ValueError, r"Shapes \(\?, 2\) and \([01],\) are not compatible",
+            ValueError, error_message,
             tensor_util.with_shape, incompatible_shape, tensor_partial_shape)
       for incompatible_shape in [[1, 2, 1]]:
         self.assertRaisesRegexp(ValueError, "Dimensions must be equal",
                                 tensor_util.with_shape, incompatible_shape,
                                 tensor_partial_shape)
       for incompatible_shape in [[2, 1]]:
+        if ops._USE_C_API:
+          error_message = (r"Dimension 1 in both shapes must be equal, but are "
+                           r"2 and 1. Shapes are \[\?,2\] and \[2,1\].")
+        else:
+          error_message = r"Shapes \(\?, 2\) and \(2, 1\) are not compatible"
         self.assertRaisesRegexp(
-            ValueError, r"Shapes \(\?, 2\) and \(2, 1\) are not compatible",
+            ValueError, error_message,
             tensor_util.with_shape, incompatible_shape, tensor_partial_shape)
 
       compatible_shape = [2, 2]
diff --git a/tensorflow/contrib/framework/python/ops/__init__.py b/tensorflow/contrib/framework/python/ops/__init__.py
index 685bb94779762ce46ee342e7e0a182c54be64743..c4976497f5fa95d82e492153b117681f693eaa13 100644
--- a/tensorflow/contrib/framework/python/ops/__init__.py
+++ b/tensorflow/contrib/framework/python/ops/__init__.py
@@ -22,8 +22,10 @@ from __future__ import print_function
 # pylint: disable=wildcard-import
 from tensorflow.contrib.framework.python.ops.arg_scope import *
 from tensorflow.contrib.framework.python.ops.checkpoint_ops import *
+from tensorflow.contrib.framework.python.ops.critical_section_ops import *
 from tensorflow.contrib.framework.python.ops.ops import *
 from tensorflow.contrib.framework.python.ops.prettyprint_ops import *
+from tensorflow.contrib.framework.python.ops.script_ops import *
 from tensorflow.contrib.framework.python.ops.sort_ops import *
 from tensorflow.contrib.framework.python.ops.variables import *
 # pylint: enable=wildcard-import
diff --git a/tensorflow/contrib/framework/python/ops/accumulate_n_v2.py b/tensorflow/contrib/framework/python/ops/accumulate_n_v2.py
index 2375ee4f550616ff60d20b87b5773704d8fbbe1e..476528b0dd3df05239d5dc402b466e06dd789985 100644
--- a/tensorflow/contrib/framework/python/ops/accumulate_n_v2.py
+++ b/tensorflow/contrib/framework/python/ops/accumulate_n_v2.py
@@ -22,6 +22,7 @@ from __future__ import print_function
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
 
@@ -108,4 +109,3 @@ def _AddNGrad(op, grad):
   """Same as gradient for AddN. Copies the gradient to all inputs."""
   # Not broadcasting.
   return [grad] * len(op.inputs)
-
diff --git a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_eager_test.py b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_eager_test.py
index 8f44698da851b48abf831e957c80fa1643a58bda..35974b9e21d2d7423777a95a99f51c9cb4b453b2 100644
--- a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_eager_test.py
+++ b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_eager_test.py
@@ -27,16 +27,11 @@ import numpy as np
 from tensorflow.contrib.framework.python.ops import accumulate_n_v2 as av2
 
 from tensorflow.python.eager import backprop
-from tensorflow.python.eager import context as eager_context
-from tensorflow.python.eager import tape
 
 
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import gradients
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
 
diff --git a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py
index b5e9f8df79262635bf579a6bf2260bc40c140c6f..45962098e93acfac414396ddbeaa847701ff2b4b 100644
--- a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py
+++ b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py
@@ -22,7 +22,6 @@ import numpy as np
 
 from tensorflow.contrib.framework.python.ops import accumulate_n_v2 as av2
 
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
@@ -31,7 +30,6 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
-
 class AccumulateNV2Test(test_util.TensorFlowTestCase):
   """Tests of the new, differentiable version of accumulate_n"""
 
@@ -62,8 +60,9 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase):
         accum_n = av2.accumulate_n_v2(input_vars)
         sess.run(variables.global_variables_initializer())
         accum_n_grad = gradients.gradients(accum_n, input_vars)
-        self.assertAllEqual(np.repeat(1.0, num_inputs), # d/dx (x + y + ...) = 1
-                            [g.eval() for g in accum_n_grad])
+        self.assertAllEqual(
+            np.repeat(1.0, num_inputs),  # d/dx (x + y + ...) = 1
+            [g.eval() for g in accum_n_grad])
 
   # The tests below used to be in a separate class under cwise_ops_test.py,
   # which did not run in the default test target.
@@ -75,8 +74,8 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase):
           np.random.rand(16, 16, 16, 16).astype(np.float32) for _ in range(20)
       ]
       random_tensors = [
-          ops.convert_to_tensor(
-              x, dtype=dtypes_lib.float32) for x in random_arrays
+          ops.convert_to_tensor(x, dtype=dtypes_lib.float32)
+          for x in random_arrays
       ]
       tf_val = av2.accumulate_n_v2(random_tensors)
       np_val = random_arrays[0]
@@ -95,21 +94,21 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase):
       with self.assertRaises(ValueError):
         a = variables.Variable(0.2)
         b = variables.Variable(0.1)
-        tf_val = av2.accumulate_n_v2([a,b], shape=[2,2]) # Should be shape=[]
+        tf_val = av2.accumulate_n_v2([a, b], shape=[2, 2])  # Should be shape=[]
 
   def testIncompatibleShapes(self):
     with self.test_session():
       with self.assertRaises(ValueError):
-        a = variables.Variable(np.array([0.1,0.2]))
-        b = variables.Variable(np.array([[0.3],[0.4]]))
-        tf_val = av2.accumulate_n_v2([a,b])
+        a = variables.Variable(np.array([0.1, 0.2]))
+        b = variables.Variable(np.array([[0.3], [0.4]]))
+        tf_val = av2.accumulate_n_v2([a, b])
 
   def testWrongType(self):
     with self.test_session():
       with self.assertRaises(TypeError):
         a = variables.Variable(0.2, dtype=np.float32)
         b = variables.Variable(0.1, dtype=np.float32)
-        tf_val = av2.accumulate_n_v2([a,b], tensor_dtype=np.int32)
+        tf_val = av2.accumulate_n_v2([a, b], tensor_dtype=np.int32)
 
   def testWrongTypeOneInput(self):
     # Scenario that used to trigger a bug, even when testWrongType() worked
diff --git a/tensorflow/contrib/framework/python/ops/arg_scope.py b/tensorflow/contrib/framework/python/ops/arg_scope.py
index 2bce00fde2459878a12027bb4d98bd3818bc92a2..409657fe1da0e5540cd2ad6070d86737c039e91f 100644
--- a/tensorflow/contrib/framework/python/ops/arg_scope.py
+++ b/tensorflow/contrib/framework/python/ops/arg_scope.py
@@ -53,7 +53,8 @@
     net = layers.conv2d(net, 256, [5, 5], scope='conv2')
   ```
 
-  Example of how to use tf.contrib.framework.add_arg_scope to enable your function to be called within an arg_scope later:
+  Example of how to use tf.contrib.framework.add_arg_scope to enable your
+  function to be called within an arg_scope later:
 
   @tf.contrib.framework.add_arg_scope
   def conv2d(*args, **kwargs)
@@ -65,11 +66,10 @@ from __future__ import print_function
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_decorator
 
-__all__ = ['arg_scope',
-           'add_arg_scope',
-           'current_arg_scope',
-           'has_arg_scope',
-           'arg_scoped_arguments']
+__all__ = [
+    'arg_scope', 'add_arg_scope', 'current_arg_scope', 'has_arg_scope',
+    'arg_scoped_arguments'
+]
 
 _ARGSTACK = [{}]
 
@@ -172,6 +172,7 @@ def add_arg_scope(func):
   Returns:
     A tuple with the decorated function func_with_args().
   """
+
   def func_with_args(*args, **kwargs):
     current_scope = current_arg_scope()
     current_args = kwargs
@@ -180,6 +181,7 @@ def add_arg_scope(func):
       current_args = current_scope[key_func].copy()
       current_args.update(kwargs)
     return func(*args, **current_args)
+
   _add_op(func)
   setattr(func_with_args, '_key_op', _key_op(func))
   return tf_decorator.make_decorator(func, func_with_args)
diff --git a/tensorflow/contrib/framework/python/ops/critical_section_ops.py b/tensorflow/contrib/framework/python/ops/critical_section_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..182fec924febb74a23b82b1664d137f033f3b1b4
--- /dev/null
+++ b/tensorflow/contrib/framework/python/ops/critical_section_ops.py
@@ -0,0 +1,324 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Critical Section object and execution logic."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+# TODO(ebrevdo): Re-enable once CriticalSection is in core.
+# from tensorflow.core.protobuf import critical_section_pb2
+
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_resource_variable_ops
+from tensorflow.python.util import nest
+
+
+# Graph Keys
+CRITICAL_SECTIONS = "critical_sections"
+CRITICAL_SECTION_EXECUTIONS = "critical_section_executions"
+
+
+class _ExecutionSignature(
+    collections.namedtuple("_ExecutionSignature",
+                           ("op", "exclusive_resource_access"))):
+  """A class storing an `ExecuteInCriticalResource` op and associated attrs."""
+  pass
+
+
+class CriticalSection(object):
+  """Critical section.
+
+  A `CriticalSection` object is a resource in the graph which executes subgraphs
+  in **serial** order.  A common example of a subgraph one may wish to run
+  exclusively is the one given by the following function:
+
+  ```python
+  v = resource_variable_ops.ResourceVariable(0.0, name="v")
+
+  def count():
+    value = v.read_value()
+    with tf.control_dependencies([value]):
+      with tf.control_dependencies([v.assign_add(1)]):
+        return tf.identity(value)
+  ```
+
+  Here, a snapshot of `v` is captured in `value`; and then `v` is updated.
+  The snapshot value is returned.
+
+  If multiple workers or threads all execute `count` in parallel, there is no
+  guarantee that access to the variable `v` is atomic at any point within
+  any thread's calculation of `count`.  In fact, even implementing an atomic
+  counter that guarantees that the user will see each value `0, 1, ...,` is
+  currently impossible.
+
+  The solution is to ensure any access to the underlying resource `v` is
+  only processed through a critical section:
+
+  ```python
+  cs = CriticalSection()
+  f1 = cs.execute(count)
+  f2 = cs.execute(count)
+  output = f1 + f2
+  session.run(output)
+  ```
+  The functions `f1` and `f2` will be executed serially, and updates to `v`
+  will be atomic.
+
+  **NOTES**
+
+  All resource objects, including the critical section and any captured
+  variables of functions executed on that critical section, will be
+  colocated to the same device (host and cpu/gpu).
+
+  When using multiple critical sections on the same resources, there is no
+  guarantee of exclusive access to those resources.  This behavior is disallowed
+  by default (but see the kwarg `exclusive_resource_access`).
+
+  For example, running the same function in two separate critical sections
+  will not ensure serial execution:
+
+  ```python
+  v = tf.get_variable("v", initializer=0.0, use_resource=True)
+  def accumulate(up):
+    x = v.read_value()
+    with tf.control_dependencies([x]):
+      with tf.control_dependencies([v.assign_add(up)]):
+        return tf.identity(x)
+  ex1 = CriticalSection().execute(
+    accumulate, 1.0, exclusive_resource_access=False)
+  ex2 = CriticalSection().execute(
+    accumulate, 1.0, exclusive_resource_access=False)
+  bad_sum = ex1 + ex2
+  sess.run(v.initializer)
+  sess.run(bad_sum)  # May return 0.0
+  ```
+  """
+
+  def __init__(self, name=None, critical_section_def=None, import_scope=None):
+    """Creates a critical section."""
+    if critical_section_def and name is not None:
+      raise ValueError("critical_section_def and name are mutually exclusive.")
+    if critical_section_def:
+      self._init_from_proto(critical_section_def, import_scope=import_scope)
+    else:
+      self._init_from_args(name)
+
+  def _init_from_proto(self, critical_section_def, import_scope):
+    raise NotImplementedError("Not yet implemented")
+    # TODO(ebrevdo): Re-enable once CriticalSection is in core.
+    # assert isinstance(
+    #     critical_section_def, critical_section_pb2.CriticalSectionDef)
+    # # Create from critical_section_def.
+    # g = ops.get_default_graph()
+    # self._handle = g.as_graph_element(
+    #     ops.prepend_name_scope(
+    #         critical_section_def.critical_section_name,
+    #         import_scope=import_scope))
+
+  def _init_from_args(self, name):
+    """Initialize the CriticalSection from constructor arguments."""
+    with ops.name_scope(name, "CriticalSection", []) as name:
+      with ops.control_dependencies(None):
+        # pylint: disable=protected-access
+        handle_name = ops._name_from_scope_name(name)
+        container = ops.get_default_graph()._container
+        # pylint: enable=protected-access
+        if container is None:
+          container = ""
+        self._handle = gen_resource_variable_ops.critical_section_op(
+            shared_name=handle_name, name=name)
+    if context.in_graph_mode():
+      ops.add_to_collections(CRITICAL_SECTIONS, self)
+
+  @property
+  def name(self):
+    return self._handle.op.name
+
+  def execute(self, fn, *args, **kwargs):
+    """Execute function `fn(*args, **kwargs)` inside the CriticalSection.
+
+    Args:
+      fn: The function to execute.  Must return at least one tensor.
+      *args: Additional positional arguments to `fn`.
+      **kwargs: Additional keyword arguments to `fn`.
+        Several keywords are reserved for `execute`.  These are:
+
+        - name; The name to use when creating the execute operation.
+        - exclusive_resource_access; Whether the resources required by
+          `fn` should be exclusive to this `CriticalSection`.  Default: `True`.
+          You may want to set this to `False` if you will be accessing a
+          resource in read-only mode in two different CriticalSections.
+
+    Returns:
+      The tensors returned from `fn(*args, **kwargs)`.
+
+    Raises:
+      ValueError: If `fn` attempts to use this `CriticalSection` in any nested
+        way.
+      ValueError: If `exclusive_resource_access` is not provided (is `True`) and
+        another `CriticalSection` has an execution requesting the same
+        resources as in `*args`, `**kwargs`, and any additionaly captured
+        inputs in `fn`.  Note, even if `exclusive_resource_access` is `True`,
+        if another execution in another `CriticalSection` was created without
+        `exclusive_resource_access=True`, a `ValueError` will be raised.
+    """
+    name = kwargs.pop("name", None)
+    exclusive_resource_access = kwargs.pop("exclusive_resource_access", True)
+
+    args = nest.map_structure(ops.convert_to_tensor, args)
+    with ops.name_scope(name, "critical_section_execute", []):
+      fn_op = function.make_defun_op(fn, *args, **kwargs)
+      flat_dtypes = nest.flatten(fn_op.output_dtypes)
+      flat_shapes = nest.flatten(fn_op.output_shapes)
+      all_inputs = nest.flatten(args) + fn_op.captured_inputs
+      if self._handle in all_inputs:
+        raise ValueError("The function fn attempts to access the "
+                         "CriticalSection in which it would be running.  This "
+                         "is illegal and would cause deadlocks.  "
+                         "CriticalSection: %s." % self._handle)
+
+      if context.in_graph_mode():
+        # Collections and op introspection does not work in eager
+        # mode.  This is generally ok; since eager mode (as of
+        # writing) executes sequentially anyway.
+        all_input_resources = [
+            x for x in all_inputs if x.dtype == dtypes.resource]
+        for sg in ops.get_collection(CRITICAL_SECTION_EXECUTIONS):
+          if sg.op.inputs[0].name == self._handle.name:
+            # Other executions in the same critical section are allowed.
+            continue
+          if not (exclusive_resource_access or sg.exclusive_resource_access):
+            # Neither execution requested exclusive access.
+            continue
+          sg_input_names = [y.name for y in sg.op.inputs[1:]]
+          for res in all_input_resources:
+            if res.name in sg_input_names:
+              raise ValueError(
+                  "This execution would access resource %s; but either this "
+                  "execution (CriticalSection: %s) or Execution '%s' "
+                  "(CriticalSection: %s) requested exclusive resource access "
+                  "of this resource for their critical section.  Did you mean "
+                  "to call execute with keyword argument "
+                  "exclusive_resource_access=False?"
+                  % (res.name,
+                     self.name,
+                     sg.op.name,
+                     sg.op.inputs[0].op.name))
+
+      flat_outputs = gen_resource_variable_ops.execute_in_critical_section(
+          critical_section=self._handle,
+          arguments=all_inputs,
+          f=fn_op,
+          output_types=flat_dtypes,
+          output_shapes=flat_shapes)
+
+      if context.in_graph_mode():
+        if isinstance(flat_outputs, ops.Operation):
+          flat_outputs = [flat_outputs]
+        op = (flat_outputs[0].op if isinstance(flat_outputs[0], ops.Tensor)
+              else flat_outputs[0])
+        signature = _ExecutionSignature(
+            op=op,
+            exclusive_resource_access=exclusive_resource_access)
+        ops.add_to_collections(
+            CRITICAL_SECTION_EXECUTIONS, signature)
+
+      return (flat_outputs[0]
+              if (len(flat_outputs) == 1
+                  and isinstance(flat_outputs[0], ops.Operation))
+              else nest.pack_sequence_as(fn_op.output_dtypes, flat_outputs))
+
+  # TODO(ebrevdo): Re-enable once CriticalSection is in core.
+
+  # def to_proto(self, export_scope=None):
+  #   """Converts a `CriticalSection` to a `CriticalSectoinDef` protocol buffer.
+
+  #   Args:
+  #     export_scope: Optional `string`. Name scope to remove.
+
+  #   Returns:
+  #     A `CriticalSectionDef` protocol buffer, or `None` if the
+  #     `CriticalSection` is not in the specified name scope.
+  #   """
+  #   if export_scope is None or self.handle.name.startswith(export_scope):
+  #     cs_def = critical_section_pb2.CriticalSectionDef()
+  #     cs_def.critical_section_name = ops.strip_name_scope(
+  #         self._handle.name, export_scope)
+  #     return cs_def
+  #   else:
+  #     return None
+
+  # @staticmethod
+  # def from_proto(critical_section_def, import_scope=None):
+  #   return CriticalSection(
+  #       critical_section_def=critical_section_def, import_scope=import_scope)
+
+
+# TODO(ebrevdo): Re-enable once CriticalSection is in core.
+
+# def _execution_to_proto_fn(execution_signature, export_scope=None):
+#   """Converts `_ExecutionSignature` to a `CriticalSectionExecutionDef`.
+
+#   Args:
+#     execution_signature: Instance of `_ExecutionSignature`.
+#     export_scope: The export scope, if any.
+
+#   Returns:
+#     An instance of `CriticalSectionExecutionDef`.
+#   """
+#   if (export_scope is None
+#       or execution_signature.op.name.startswith(export_scope)):
+#     op_def = critical_section_pb2.CriticalSectionExecutionDef()
+#     op_def.execute_in_critical_section_name = ops.strip_name_scope(
+#         execution_signature.op.name, export_scope)
+#     op_def.exclusive_resource_access = (
+#         execution_signature.exclusive_resource_access)
+#     return op_def
+#   else:
+#     return None
+
+
+# def _execution_from_proto_fn(op_def, import_scope=None):
+#   """Converts a `CriticalSectionExecutionDef` to a `_ExecutionSignature`."""
+#   assert isinstance(
+#       op_def, critical_section_pb2.CriticalSectionExecutionDef)
+
+#   # Create from op_def.
+#   g = ops.get_default_graph()
+#   execution_op = g.as_graph_element(
+#       ops.prepend_name_scope(
+#           op_def.execute_in_critical_section_name,
+#           import_scope=import_scope))
+#   return _ExecutionSignature(
+#       op=execution_op,
+#       exclusive_resource_access=op_def.exclusive_resource_access)
+
+# ops.register_proto_function(
+#     CRITICAL_SECTIONS,
+#     proto_type=critical_section_pb2.CriticalSectionDef,
+#     to_proto=CriticalSection.to_proto,
+#     from_proto=CriticalSection.from_proto)
+
+# ops.register_proto_function(
+#     CRITICAL_SECTION_EXECUTIONS,
+#     proto_type=critical_section_pb2.CriticalSectionExecutionDef,
+#     to_proto=_execution_to_proto_fn,
+#     from_proto=_execution_from_proto_fn)
diff --git a/tensorflow/contrib/framework/python/ops/critical_section_test.py b/tensorflow/contrib/framework/python/ops/critical_section_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a416724d3ba1719471d70667e140f9cd2daf86c7
--- /dev/null
+++ b/tensorflow/contrib/framework/python/ops/critical_section_test.py
@@ -0,0 +1,178 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""critical section tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.framework.python.ops import critical_section_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.platform import test
+# TODO(ebrevdo): Re-enable once CriticalSection is in core.
+# from tensorflow.python.training import saver as saver_lib
+
+
+class CriticalSectionTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testCreateCriticalSection(self):
+    cs = critical_section_ops.CriticalSection(name="cs")
+    v = resource_variable_ops.ResourceVariable(0.0, name="v")
+
+    def fn(a, b):
+      c = v.read_value()
+      with ops.control_dependencies([c]):
+        nv = v.assign_add(a * b)
+        with ops.control_dependencies([nv]):
+          return array_ops.identity(c)
+
+    num_concurrent = 1000
+    r = [cs.execute(fn, 1.0, 2.0) for _ in range(num_concurrent)]
+    self.evaluate(v.initializer)
+    r_value = self.evaluate(r)
+    self.assertAllClose([2.0 * i for i in range(num_concurrent)],
+                        sorted(r_value))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testCreateCriticalSectionFnReturnsOp(self):
+    cs = critical_section_ops.CriticalSection(name="cs")
+    v = resource_variable_ops.ResourceVariable(0.0, name="v")
+
+    def fn_return_op(a, b):
+      c = v.read_value()
+      with ops.control_dependencies([c]):
+        nv = v.assign_add(a * b)
+        with ops.control_dependencies([nv]):
+          return ()
+
+    num_concurrent = 100
+    r = [cs.execute(fn_return_op, 1.0, 2.0) for _ in range(num_concurrent)]
+    self.evaluate(v.initializer)
+    self.evaluate(r)
+    final_v = self.evaluate(v)
+    self.assertAllClose(2.0 * num_concurrent, final_v)
+
+  def testCreateCriticalSectionRaw(self):
+    cs = critical_section_ops.CriticalSection(name="cs")
+    v = resource_variable_ops.ResourceVariable(0.0, name="v")
+
+    @function.Defun(dtypes.float32, dtypes.float32)
+    def fn(a, b):
+      c = v.read_value()
+      with ops.control_dependencies([c]):
+        nv = v.assign_add(a * b)
+        with ops.control_dependencies([nv]):
+          return array_ops.identity(c)
+
+    def execute(fn, *args):
+      output_args = fn.definition.signature.output_arg
+      return resource_variable_ops.execute_in_critical_section(
+          critical_section=cs._handle,
+          arguments=list(args) + fn.captured_inputs,
+          f=fn,
+          output_types=[out.type for out in output_args],
+          output_shapes=[tensor_shape.TensorShape(None) for _ in output_args])
+
+    num_concurrent = 1000
+    r = [execute(fn, 1.0, 2.0)[0] for _ in range(num_concurrent)]
+    self.evaluate(v.initializer)
+    r_value = self.evaluate(r)
+    self.assertAllClose([2.0 * i for i in range(num_concurrent)],
+                        sorted(r_value))
+
+  def testCollection(self):
+    cs = critical_section_ops.CriticalSection(name="cs")
+    self.assertIn(
+        cs, ops.get_collection(critical_section_ops.CRITICAL_SECTIONS))
+    execute_op = cs.execute(lambda x: x + 1, 1.0).op
+    self.assertIn(
+        execute_op,
+        [signature.op for signature in
+         ops.get_collection(critical_section_ops.CRITICAL_SECTION_EXECUTIONS)])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testRecursiveCriticalSectionAccessIsIllegal(self):
+    cs = critical_section_ops.CriticalSection(name="cs")
+    def fn(x):
+      return cs.execute(lambda x: x+1, x)
+    with self.assertRaisesRegexp(
+        ValueError,
+        r"attempts to access the CriticalSection in which it would be running"):
+      cs.execute(fn, 1.0)
+
+  def testMultipleCSExecutionsRequestSameResource(self):
+    cs0 = critical_section_ops.CriticalSection()
+    cs1 = critical_section_ops.CriticalSection()
+    v = resource_variable_ops.ResourceVariable(0.0, name="v")
+    cs0.execute(lambda: v + 1)
+    # It's OK for the same CriticalSection to access this resource.
+    cs0.execute(lambda: v - 1)
+    # It's *not* OK for a different CriticalSection to access it by
+    # default.
+    with self.assertRaisesRegexp(
+        ValueError, "requested exclusive resource access"):
+      cs1.execute(lambda: v + 1)
+    # It's not even OK if the second call doesn't request exclusive access.
+    with self.assertRaisesRegexp(
+        ValueError, "requested exclusive resource access"):
+      cs1.execute(lambda: v + 1, exclusive_resource_access=False)
+
+    v2 = resource_variable_ops.ResourceVariable(0.0, name="v2")
+    cs0.execute(lambda: v2 + 1, exclusive_resource_access=False)
+    # It's OK if neither requests exclusive resource access.
+    cs1.execute(lambda: v2 + 1, exclusive_resource_access=False)
+
+    # It's not OK if the second request requires exlusive resource
+    # access.
+    with self.assertRaisesRegexp(
+        ValueError, "requested exclusive resource access"):
+      cs1.execute(lambda: v2 + 1)
+
+  # TODO(ebrevdo): Re-enable once CriticalSection is in core.
+  #
+  # def testCriticalSectionAndExecuteOpSaverRoundTrip(self):
+  #   cs = critical_section_ops.CriticalSection()
+  #   r = cs.execute(lambda x: x + 1, 1.0)
+  #   graph = ops.get_default_graph()
+  #   meta_graph = saver_lib.export_meta_graph(
+  #       graph=graph, collection_list=graph.get_all_collection_keys())
+  #   graph_copy = ops.Graph()
+  #   with graph_copy.as_default():
+  #     _ = saver_lib.import_meta_graph(meta_graph, import_scope="imported")
+  #     restored_cs = ops.get_collection(critical_section_ops.CRITICAL_SECTIONS)
+  #     restored_exec = ops.get_collection(
+  #         critical_section_ops.CRITICAL_SECTION_EXECUTIONS)
+  #     self.assertEqual(1, len(restored_cs))
+  #     self.assertEqual(1, len(restored_exec))
+  #     self.assertEqual(restored_cs[0].name, "imported/%s" % cs.name)
+  #     self.assertEqual(restored_exec[0].op.name, "imported/%s" % r.op.name)
+
+  # def testToProto(self):
+  #   cs = critical_section_ops.CriticalSection(name="cs")
+  #   proto = cs.to_proto()
+  #   self.assertEqual(proto.critical_section_name, cs._handle.name)
+  #   cs_copy = critical_section_ops.CriticalSection.from_proto(proto)
+  #   self.assertEqual(cs_copy._handle, cs._handle)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/framework/python/ops/script_ops.py b/tensorflow/contrib/framework/python/ops/script_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d269fefdcfae7902b35e0f29f8cd12fcc58b882
--- /dev/null
+++ b/tensorflow/contrib/framework/python/ops/script_ops.py
@@ -0,0 +1,143 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Script Language Operators. See the @{$python/script_ops} guide.
+
+@@py_func
+"""
+
+# pylint: disable=g-bad-name
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops.script_ops import py_func as _py_func
+from tensorflow.python.util import nest
+
+__all__ = ['py_func']
+
+
+def py_func(func,
+            args=(),
+            kwargs=None,
+            output_types=None,
+            output_shapes=None,
+            stateful=True,
+            name=None):
+  """Wraps a python function and uses it as a TensorFlow op.
+
+  This function is a wrapper around `tf.py_func` and improve it with kwargs
+  and output_shapes. Further it changed some argument names.
+
+  Given a python function `func`, which takes numpy arrays as its
+  inputs and returns numpy arrays as its outputs, wrap this function as an
+  operation in a TensorFlow graph. The following snippet constructs a simple
+  TensorFlow graph that invokes the `np.sinh()` NumPy function as a operation
+  in the graph:
+
+  ```python
+  def my_func(x):
+    # x will be a numpy array with the contents of the placeholder below
+    return np.sinh(x)
+  inp = tf.placeholder(tf.float32)
+  y = tf.py_func(my_func, [inp], tf.float32)
+  ```
+
+
+  **N.B.** The `tf.py_func()` operation has the following known limitations:
+
+  * The body of the function (i.e. `func`) will not be serialized in a
+    `GraphDef`. Therefore, you should not use this function if you need to
+    serialize your model and restore it in a different environment.
+
+  * The operation must run in the same address space as the Python program
+    that calls `tf.py_func()`. If you are using distributed TensorFlow, you
+    must run a `tf.train.Server` in the same process as the program that calls
+    `tf.py_func()` and you must pin the created operation to a device in that
+    server (e.g. using `with tf.device():`).
+
+  Args:
+    func: A Python function, which accepts a list of NumPy `ndarray` objects
+      having element types that match the corresponding `tf.Tensor` objects
+      in `inp`, and returns a list of `ndarray` objects (or a single `ndarray`)
+      having element types that match the corresponding values in `Tout`.
+    args: A list of `Tensor` objects.
+    kwargs: A dict with `Tensor` objects as values.
+    output_types: A nested structure of tensorflow data types or a single
+      tensorflow data type if there is only one, indicating what `func` returns.
+    output_shapes: Same as output_types, except the types are replaces with
+      shapes (optional).
+    stateful: (Boolean.) If True, the function should be considered stateful.
+      If a function is stateless, when given the same input it will return the
+      same output and have no observable side effects. Optimizations such as
+      common subexpression elimination are only performed on stateless
+      operations.
+    name: A name for the operation (optional).
+
+  Returns:
+    Tensorflow op that wraps the input python function.
+  """
+
+  if kwargs is None:
+    kwargs = {}
+
+  if not isinstance(args, (list, tuple)):
+    raise TypeError('args must be list and not {}. args: {}'.format(
+        type(args), args))
+
+  if not isinstance(kwargs, dict):
+    raise TypeError('kwargs must be dict and not {}. args: {}'.format(
+        type(kwargs), kwargs))
+
+  # For dynamic type inference use callable output_types and output_shapes
+  if callable(output_types):
+    # If callable assume same signature and call with tensors and get the types
+    output_types = output_types(*args, **kwargs)
+  if callable(output_shapes):
+    # If callable assume same signature and call with tensors and get the shapes
+    output_shapes = output_shapes(*args, **kwargs)
+
+  flat_output_types = nest.flatten(output_types)
+  args = (args, kwargs)
+  flat_args = nest.flatten(args)
+
+  def python_function_wrapper(*py_args):
+    py_args, py_kwargs = nest.pack_sequence_as(args, py_args)
+
+    ret = func(*py_args, **py_kwargs)
+    # TODO(alextp): Catch Exceptions and improve msg, because tensorflow
+    # ist not able to preserve the traceback, i.e. the Exceptions does not
+    # contain any information where the Exception was raised.
+    nest.assert_shallow_structure(output_types, ret)
+    return nest.flatten(ret)
+
+  flat_values = _py_func(
+      python_function_wrapper,
+      flat_args,
+      flat_output_types,
+      stateful=stateful,
+      name=name)
+
+  if output_shapes is not None:
+    # I am not sure if this is nessesary
+    output_shapes = nest.map_structure_up_to(
+        output_types, tensor_shape.as_shape, output_shapes)
+
+    flattened_shapes = nest.flatten(output_shapes)
+    for ret_t, shape in zip(flat_values, flattened_shapes):
+      ret_t.set_shape(shape)
+
+  return nest.pack_sequence_as(output_types, flat_values)
diff --git a/tensorflow/contrib/framework/python/ops/variables.py b/tensorflow/contrib/framework/python/ops/variables.py
index 07b7857e7b2114d251ebb5c14eda9dff0d55bbef..0754c3e0e30a340910a43a3ce86f6ca10afe848e 100644
--- a/tensorflow/contrib/framework/python/ops/variables.py
+++ b/tensorflow/contrib/framework/python/ops/variables.py
@@ -25,6 +25,7 @@ import re
 from tensorflow.contrib.framework.python.ops import add_arg_scope as contrib_add_arg_scope
 from tensorflow.contrib.framework.python.ops import gen_variable_ops
 from tensorflow.contrib.util import loader
+from tensorflow.core.protobuf import saver_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import device as tf_device
 from tensorflow.python.framework import dtypes
@@ -32,9 +33,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import gen_state_ops
-from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.platform import resource_loader
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import saver as tf_saver
 from tensorflow.python.training import training_util
 from tensorflow.python.util.deprecation import deprecated
@@ -441,7 +441,7 @@ def get_unique_variable(var_op_name):
   """
   candidates = get_variables(scope=var_op_name)
   if not candidates:
-    raise ValueError('Couldnt find variable %s' % var_op_name)
+    raise ValueError('Couldn\'t find variable %s' % var_op_name)
 
   for candidate in candidates:
     if candidate.op.name == var_op_name:
@@ -685,7 +685,8 @@ def assign_from_checkpoint_fn(model_path, var_list, ignore_missing_vars=False,
             'Variable %s missing in checkpoint %s', var, model_path)
     var_list = available_vars
   if var_list:
-    saver = tf_saver.Saver(var_list, reshape=reshape_variables)
+    saver = tf_saver.Saver(var_list, reshape=reshape_variables,
+                           write_version=saver_pb2.SaverDef.V1)
     def callback(session):
       saver.restore(session, model_path)
     return callback
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
index 88306094ab9947c9c78b03c0013f6afc88316803..0e06575d96f9b9538f0245b12d48cfd7c0e8d981 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 
 #if GOOGLE_CUDA
+#include "cuda/include/cudnn.h"
 #include "tensorflow/core/kernels/conv_ops_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/util/activation_mode.h"
@@ -278,6 +279,28 @@ Status TransformNHWCToNCHW(OpKernelContext* ctx, const Tensor& nhwc_tensor,
   return Status::OK();
 }
 
+// Adjusts padding so cudnn supports it. Sets `adjusted_padding` to be the
+// adjusted padding, and `extra_padding_before` and `extra_padding_after` to be
+// the extra padding that FusedConv needs to apply before calling cudnn.
+void AdjustPaddingForCudnn(int padding, bool is_int8x4, int filter_size,
+                           int* adjusted_padding, int* extra_padding_before,
+                           int* extra_padding_after) {
+#if CUDNN_VERSION < 7000
+  if (is_int8x4 && filter_size >= 6) {
+    // TODO(b/70795525): Remove after NVIDIA fixes this bug with int8 fused
+    // convolution. I don't know cuDNN7 still has the bug, so enable this
+    // workaround for cuDNN6 or older.
+    *adjusted_padding = 0;
+    *extra_padding_before = padding / 2;
+    *extra_padding_after = padding - *extra_padding_before;
+    return;
+  }
+#endif
+  *adjusted_padding = padding / 2 * 2;
+  *extra_padding_before = 0;
+  *extra_padding_after = padding % 2;
+}
+
 template <typename T, typename BiasType, typename ScaleType>
 void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
     launch(OpKernelContext* ctx, bool cudnn_use_autotune,
@@ -303,7 +326,7 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
     stream->parent()->GetDeviceDescription().cuda_compute_capability(&cc_major,
                                                                      &cc_minor);
     OP_REQUIRES(
-        ctx, cc_major >= 6 && cc_minor >= 1,
+        ctx, ((cc_major == 6 && cc_minor >= 1) || cc_major > 6),
         errors::Unimplemented(
             "FusedConv2DBiasActivation for int8 is only supported on GPUs with "
             "compute capability 6.1 or later."));
@@ -338,12 +361,21 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
         0, (output_rows - 1) * row_stride + filter_rows - conv_input_rows);
     padding_cols = std::max<int>(
         0, (output_cols - 1) * col_stride + filter_cols - conv_input_cols);
-    const int padding_rows_parity = padding_rows & 1;
-    const int padding_cols_parity = padding_cols & 1;
-    if ((padding_rows_parity | padding_cols_parity) != 0) {
+    int extra_top_padding = 0;
+    int extra_bottom_padding = 0;
+    int extra_left_padding = 0;
+    int extra_right_padding = 0;
+    AdjustPaddingForCudnn(padding_rows, is_int8x4, filter_rows, &padding_rows,
+                          &extra_top_padding, &extra_bottom_padding);
+    AdjustPaddingForCudnn(padding_cols, is_int8x4, filter_cols, &padding_cols,
+                          &extra_left_padding, &extra_right_padding);
+    if (extra_top_padding != 0 || extra_bottom_padding != 0 ||
+        extra_left_padding != 0 || extra_right_padding != 0) {
       Tensor transformed_input;
-      const int new_conv_input_rows = conv_input_rows + padding_rows_parity;
-      const int new_conv_input_cols = conv_input_cols + padding_cols_parity;
+      const int new_conv_input_rows =
+          conv_input_rows + extra_top_padding + extra_bottom_padding;
+      const int new_conv_input_cols =
+          conv_input_cols + extra_left_padding + extra_right_padding;
 
       using VectT = typename Int8x4ToInt32<typename RawType<T>::type>::type;
       auto pad_data_format = is_int8x4 ? FORMAT_NCHW : data_format;
@@ -361,8 +393,9 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
           maybe_padded_conv_input.reinterpret_last_dimension<VectT, 4>());
 
       functor::PadInput<GPUDevice, VectT, int, 4>()(
-          ctx->eigen_device<GPUDevice>(), conv_input_eigen_tensor, {{0, 0}},
-          {{padding_rows_parity, padding_cols_parity}},
+          ctx->eigen_device<GPUDevice>(), conv_input_eigen_tensor,
+          {{extra_top_padding, extra_left_padding}},
+          {{extra_bottom_padding, extra_right_padding}},
           padded_conv_input_eigen_tensor, pad_data_format);
 
       conv_input = &maybe_padded_conv_input;
@@ -439,6 +472,8 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
       .set_feature_map_count(output_depth)
       .set_layout(data_layout);
   dnn::ConvolutionDescriptor conv_desc;
+  CHECK_EQ(0, padding_rows % 2);
+  CHECK_EQ(0, padding_cols % 2);
   conv_desc.set_vertical_filter_stride(row_stride)
       .set_horizontal_filter_stride(col_stride)
       .set_zero_padding_height(padding_rows / 2)
@@ -493,6 +528,8 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
       {{conv_input_rows, conv_input_cols}},
       output_depth,
       {{filter_rows, filter_cols}},
+      // TODO(yangzihao): Add support for arbitrary dilations for fused conv.
+      {{1, 1}},  // dilation_rows, dilation_cols
       {{row_stride, col_stride}},
       {{padding_rows, padding_cols}},
       conv_input->dtype(),
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h b/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h
index dc43af11580ce5fda74ee25da6c151a5b89c7aee..ba52697679dafc239b1dac5562573b3589877a8c 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV_OPS_GPU_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV_OPS_GPU_H_
+#ifndef TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV_OPS_GPU_H_
+#define TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV_OPS_GPU_H_
 
 #if GOOGLE_CUDA
 
@@ -30,11 +30,12 @@ class FusedConvParameters : public ConvParameters {
  public:
   FusedConvParameters(int64 batch, int64 in_depths, const SpatialArray& in,
                       int64 out_depths, const SpatialArray& filter,
-                      const SpatialArray& stride, const SpatialArray& padding,
-                      DataType dtype, int device_id, bool has_side_input,
+                      const SpatialArray& dilation, const SpatialArray& stride,
+                      const SpatialArray& padding, DataType dtype,
+                      int device_id, bool has_side_input,
                       ActivationMode activation_mode)
-      : ConvParameters(batch, in_depths, in, out_depths, filter, stride,
-                       padding, dtype, device_id),
+      : ConvParameters(batch, in_depths, in, out_depths, filter, dilation,
+                       stride, padding, dtype, device_id),
         activation_mode_(activation_mode),
         has_side_input_(has_side_input) {
     hash_code_ = Hash64Combine(hash_code_, has_side_input);
@@ -71,4 +72,4 @@ class FusedConvParameters : public ConvParameters {
 
 #endif  // GOOGLE_CUDA
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV_OPS_GPU_H_
+#endif  // TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV_OPS_GPU_H_
diff --git a/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc
index 887ebc5a6c35379476fa1a643c866d38e2b25699..bafd1d59418f0ba47ebbdaabbf06f8e5471fc1a1 100644
--- a/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc
@@ -25,13 +25,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-namespace {
-// Return the string containing the list of valid activation modes, that can be
-// used as an Attr() in REGISTER_OP.
-string GetAllActivationModeAttrString() { return "activation_mode: {'Relu'}"; }
-
-}  // namespace
-
 // --------------------------------------------------------------------------
 
 // TODO(pauldonnelly): Add support for double inputs and scales to this Op,
@@ -52,6 +45,7 @@ REGISTER_OP("FusedConv2DBiasActivation")
     .Attr("data_format: {'NHWC', 'NCHW', 'NCHW_VECT_C'} = 'NHWC'")
     .Attr("filter_format: {'HWIO', 'OIHW', 'OIHW_VECT_I'} = 'HWIO'")
     .Attr("activation_mode: {'Relu'} = 'Relu'")
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       using shape_inference::ShapeHandle;
       using shape_inference::DimensionHandle;
@@ -151,6 +145,11 @@ REGISTER_OP("FusedConv2DBiasActivation")
                      kernel_height, kernel_width, input_channels % 4 ]`
     activation_mode: The activation applied to the output.
         Currently must be "Relu".
+    dilations: 1-D tensor of length 4.  The dilation factor for each dimension
+        of `input`. If set to k > 1, there will be k-1 skipped cells between
+        each filter element on that dimension. The dimension order is determined
+        by the value of `data_format`, see above for details. Dilations in the
+        batch and depth dimensions must be 1.
 )doc");
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py
index a65d4bc50ff796977e8ea7f652b7cbe3fe37f673..96cdd8b1ca4d56d12d38ea961ae73f3a3aa28968 100644
--- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_benchmark.py
@@ -116,7 +116,7 @@ def build_fused_conv_bias_relu_graph(device, input_shape, filter_shape, strides,
     for _ in range(1, num_iters):
       with ops.control_dependencies([fused_out]):
         # pylint: disable=g-line-too-long
-        fused_out = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
+        fused_out = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(  # pylint: disable=line-too-long
             inp,
             filt,
             bias,
@@ -166,10 +166,10 @@ class FusedConv2DBiasActivationBenchmark(test.Benchmark):
         duration = (time.time() - start_time) / num_iters
 
         print("%s inputshape:%s filtershape:%s strides:%s padding:%s "
-              "%d iters: %.8f sec" %
-              (device, str(input_shape).replace(" ", ""),
-               str(filter_shape).replace(" ", ""),
-               str(strides).replace(" ", ""), padding, num_iters, duration))
+              "%d iters: %.8f sec" % (device, str(input_shape).replace(" ", ""),
+                                      str(filter_shape).replace(" ", ""),
+                                      str(strides).replace(" ", ""), padding,
+                                      num_iters, duration))
     name_template = (
         "conv2d_{device}_input_shape_{inputshape}_filter_shape_{filtershape}_"
         "strides_{strides}_padding_{padding}")
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
index 2a18f3eeecc7e0e69c54b219886a263136f01b2c..bb155aa2496cbafd9f0630d3dffb2ba69395186c 100644
--- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
@@ -658,6 +658,36 @@ def SimulateFusedConv2dBiasActivationInt8(conv_input_scale, conv_input, kernel,
 
 class FusedConvInt8Tests(test.TestCase):
   _test_params = [
+      {
+          "batch_size": 1,
+          "input_channels": 4,
+          "output_channels": 4,
+          "input_height": 8,
+          "input_width": 8,
+          "filter_height": 6,
+          "filter_width": 6,
+          "vertical_stride": 2,
+          "horizontal_stride": 2,
+          "conv_input_scale": 0.002,
+          "side_input_scale": 0.0,
+          "bias_scale": 1,
+          "padding_type": "SAME"
+      },
+      {
+          "batch_size": 1,
+          "input_channels": 4,
+          "output_channels": 4,
+          "input_height": 6,
+          "input_width": 6,
+          "filter_height": 6,
+          "filter_width": 6,
+          "vertical_stride": 2,
+          "horizontal_stride": 2,
+          "conv_input_scale": 0.002,
+          "side_input_scale": 0.0,
+          "bias_scale": 1,
+          "padding_type": "SAME"
+      },
       {
           "batch_size": 2,
           "input_channels": 8,
diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD
index abe4665caa9b23b5663df48487c6c77d33d15c59..5db34f0f8db93620b8b4a6b71f63b66ac718ee30 100644
--- a/tensorflow/contrib/gan/BUILD
+++ b/tensorflow/contrib/gan/BUILD
@@ -56,6 +56,7 @@ py_test(
     srcs = ["python/train_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":features",
         ":namedtuples",
         ":train",
         "//tensorflow/contrib/framework:framework_py",
@@ -82,6 +83,7 @@ py_library(
     deps = [
         ":classifier_metrics",
         ":eval_utils",
+        ":sliced_wasserstein",
         ":summaries",
         "//tensorflow/python:util",
     ],
@@ -116,7 +118,7 @@ py_library(
     deps = [
         ":clip_weights",
         ":conditioning_utils",
-        ":tensor_pool",
+        ":random_tensor_pool",
         ":virtual_batchnorm",
         "//tensorflow/python:util",
     ],
@@ -175,6 +177,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":losses_impl",
+        ":namedtuples",
         "//tensorflow/python:util",
     ],
 )
@@ -186,6 +189,9 @@ py_test(
     deps = [
         ":tuple_losses",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:variables",
         "//third_party/py/numpy",
     ],
 )
@@ -221,10 +227,10 @@ py_test(
 )
 
 py_library(
-    name = "tensor_pool",
+    name = "random_tensor_pool",
     srcs = [
-        "python/features/python/tensor_pool.py",
-        "python/features/python/tensor_pool_impl.py",
+        "python/features/python/random_tensor_pool.py",
+        "python/features/python/random_tensor_pool_impl.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
@@ -239,11 +245,11 @@ py_library(
 )
 
 py_test(
-    name = "tensor_pool_test",
-    srcs = ["python/features/python/tensor_pool_test.py"],
+    name = "random_tensor_pool_test",
+    srcs = ["python/features/python/random_tensor_pool_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":tensor_pool",
+        ":random_tensor_pool",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
@@ -393,6 +399,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":eval_utils",
+        ":namedtuples",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
@@ -502,6 +509,41 @@ py_test(
     ],
 )
 
+py_library(
+    name = "sliced_wasserstein",
+    srcs = [
+        "python/eval/python/sliced_wasserstein.py",
+        "python/eval/python/sliced_wasserstein_impl.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:util",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "sliced_wasserstein_test",
+    srcs = ["python/eval/python/sliced_wasserstein_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":sliced_wasserstein",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:random_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/gan/README.md b/tensorflow/contrib/gan/README.md
index 4bca0a1d62a2b404c6783c7cfe3b5c67cfc58221..4ead66ca13e74bacc0e4679a8d5c4e0f23d04b69 100644
--- a/tensorflow/contrib/gan/README.md
+++ b/tensorflow/contrib/gan/README.md
@@ -99,8 +99,8 @@ gan_model = tfgan.gan_model(
 # Build the GAN loss.
 gan_loss = tfgan.gan_loss(
     gan_model,
-    generator_loss_fn=tfgan_losses.wasserstein_generator_loss,
-    discriminator_loss_fn=tfgan_losses.wasserstein_discriminator_loss)
+    generator_loss_fn=tfgan.losses.wasserstein_generator_loss,
+    discriminator_loss_fn=tfgan.losses.wasserstein_discriminator_loss)
 
 # Create the train ops, which calculate gradients and apply updates to weights.
 train_ops = tfgan.gan_train_ops(
@@ -161,8 +161,8 @@ gan_model = tfgan.gan_model(
 # Build the GAN loss and standard pixel loss.
 gan_loss = tfgan.gan_loss(
     gan_model,
-    generator_loss_fn=tfgan_losses.wasserstein_generator_loss,
-    discriminator_loss_fn=tfgan_losses.wasserstein_discriminator_loss,
+    generator_loss_fn=tfgan.losses.wasserstein_generator_loss,
+    discriminator_loss_fn=tfgan.losses.wasserstein_discriminator_loss,
     gradient_penalty=1.0)
 l1_pixel_loss = tf.norm(gan_model.real_data - gan_model.generated_data, ord=1)
 
@@ -193,8 +193,8 @@ gan_model = tfgan.gan_model(
 # Build the GAN loss and standard pixel loss.
 gan_loss = tfgan.gan_loss(
     gan_model,
-    generator_loss_fn=tfgan_losses.least_squares_generator_loss,
-    discriminator_loss_fn=tfgan_losses.least_squares_discriminator_loss)
+    generator_loss_fn=tfgan.losses.least_squares_generator_loss,
+    discriminator_loss_fn=tfgan.losses.least_squares_discriminator_loss)
 l1_pixel_loss = tf.norm(gan_model.real_data - gan_model.generated_data, ord=1)
 
 # Modify the loss tuple to include the pixel loss.
@@ -223,8 +223,8 @@ gan_model = tfgan.infogan_model(
 # Build the GAN loss with mutual information penalty.
 gan_loss = tfgan.gan_loss(
     gan_model,
-    generator_loss_fn=tfgan_losses.wasserstein_generator_loss,
-    discriminator_loss_fn=tfgan_losses.wasserstein_discriminator_loss,
+    generator_loss_fn=tfgan.losses.wasserstein_generator_loss,
+    discriminator_loss_fn=tfgan.losses.wasserstein_discriminator_loss,
     gradient_penalty=1.0,
     mutual_information_penalty_weight=1.0)
 
diff --git a/tensorflow/contrib/gan/__init__.py b/tensorflow/contrib/gan/__init__.py
index dff361fdc42708ea69999c2def4721f9d49fcf14..f1946c7f925660eae3aaa650c437e03da1f33d6c 100644
--- a/tensorflow/contrib/gan/__init__.py
+++ b/tensorflow/contrib/gan/__init__.py
@@ -12,7 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN grouped API. Please see README.md for details and usage."""
+"""TFGAN is a lightweight library for training and evaluating GANs.
+
+In addition to providing the infrastructure for easily training and evaluating
+GANS, this library contains modules for a TFGAN-backed Estimator,
+evaluation metrics, features (such as virtual batch normalization), and losses.
+Please see README.md for details and usage.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/estimator/__init__.py b/tensorflow/contrib/gan/python/estimator/__init__.py
index 8c4a18228039cb4f2c06e0333f4b8408f1f631e9..c9f7bc61b25230e4159cf8cbc7c9cceead0aa706 100644
--- a/tensorflow/contrib/gan/python/estimator/__init__.py
+++ b/tensorflow/contrib/gan/python/estimator/__init__.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN grouped API. Please see README.md for details and usage."""
+"""TFGAN estimator module.
+
+GANEstimator provides all the infrastructure support of a TensorFlow Estimator
+with the feature support of TFGAN.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
index 058dc1d1f8cc176dcdb81268da2c4704d7eddc99..082c42eba180917e732bb7890129dfa94bf00fec 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
@@ -59,7 +59,11 @@ _summary_type_map = {
 class GANEstimator(estimator.Estimator):
   """An estimator for Generative Adversarial Networks (GANs).
 
-  This Estimator is backed by TFGAN.
+  This Estimator is backed by TFGAN. The network functions follow the TFGAN API
+  except for one exception: if either `generator_fn` or `discriminator_fn` have
+  an argument called `mode`, then the tf.Estimator mode is passed in for that
+  argument. This helps with operations like batch normalization, which have
+  different train and evaluation behavior.
 
   Example:
 
@@ -96,7 +100,7 @@ class GANEstimator(estimator.Estimator):
       # Generate samples from generator.
       predictions = np.array([
           x for x in gan_estimator.predict(predict_input_fn)])
-    ```
+  ```
   """
 
   def __init__(self,
@@ -107,6 +111,7 @@ class GANEstimator(estimator.Estimator):
                discriminator_loss_fn=None,
                generator_optimizer=None,
                discriminator_optimizer=None,
+               get_hooks_fn=None,
                add_summaries=None,
                use_loss_summaries=True,
                config=None):
@@ -137,6 +142,10 @@ class GANEstimator(estimator.Estimator):
         work.
       discriminator_optimizer: Same as `generator_optimizer`, but for the
         discriminator updates.
+      get_hooks_fn: A function that takes a `GANTrainOps` tuple and returns a
+        list of hooks. These hooks are run on the generator and discriminator
+        train ops, and can be used to implement the GAN training scheme.
+        Defaults to `train.get_sequential_train_hooks()`.
       add_summaries: `None`, a single `SummaryType`, or a list of `SummaryType`.
       use_loss_summaries: If `True`, add loss summaries. If `False`, does not.
         If `None`, uses defaults.
@@ -151,7 +160,7 @@ class GANEstimator(estimator.Estimator):
               else discriminator_optimizer)
       gan_head = head_lib.gan_head(
           generator_loss_fn, discriminator_loss_fn, gopt, dopt,
-          use_loss_summaries)
+          use_loss_summaries, get_hooks_fn=get_hooks_fn)
       return _gan_model_fn(
           features, labels, mode, generator_fn, discriminator_fn, gan_head,
           add_summaries)
@@ -160,11 +169,6 @@ class GANEstimator(estimator.Estimator):
         model_fn=_model_fn, model_dir=model_dir, config=config)
 
 
-def _use_check_shapes(real_data):
-  """Determines whether TFGAN should check Tensor shapes."""
-  return isinstance(real_data, ops.Tensor)
-
-
 def _gan_model_fn(
     features,
     labels,
@@ -233,16 +237,18 @@ def _gan_model_fn(
 def _make_gan_model(generator_fn, discriminator_fn, real_data,
                     generator_inputs, generator_scope, add_summaries, mode):
   """Make a `GANModel`, and optionally pass in `mode`."""
-  # If `generator_fn` has an argument `mode`, pass mode to it.
+  # If network functions have an argument `mode`, pass mode to it.
   if 'mode' in inspect.getargspec(generator_fn).args:
     generator_fn = functools.partial(generator_fn, mode=mode)
+  if 'mode' in inspect.getargspec(discriminator_fn).args:
+    discriminator_fn = functools.partial(discriminator_fn, mode=mode)
   gan_model = tfgan_train.gan_model(
       generator_fn,
       discriminator_fn,
       real_data,
       generator_inputs,
       generator_scope=generator_scope,
-      check_shapes=_use_check_shapes(real_data))
+      check_shapes=False)
   if add_summaries:
     if not isinstance(add_summaries, (tuple, list)):
       add_summaries = [add_summaries]
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
index e752f0bcccda418b79d4fdabb27807394cbbb425..387a62bd741bd42c03dc1bf70592060c29ccd7a8 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
@@ -54,7 +54,8 @@ def generator_fn(noise_dict, mode):
   return layers.fully_connected(noise, noise.shape[1].value)
 
 
-def discriminator_fn(data, _):
+def discriminator_fn(data, unused_conditioning, mode):
+  del unused_conditioning, mode
   return layers.fully_connected(data, 1)
 
 
@@ -99,7 +100,6 @@ def mock_head(testcase, expected_generator_inputs, expected_real_data,
     else:
       testcase.assertEqual(discriminator_scope_name,
                            gan_model.discriminator_scope.name)
-    testcase.assertEqual(_or_none(discriminator_fn), gan_model.discriminator_fn)
 
     with ops.control_dependencies(assertions):
       if mode == model_fn_lib.ModeKeys.TRAIN:
diff --git a/tensorflow/contrib/gan/python/estimator/python/head_impl.py b/tensorflow/contrib/gan/python/estimator/python/head_impl.py
index 204c646e194319c0e63599da0b2a4909ef270ef3..a21358c50bbdb4a1a929b0c5bc322cec4c9923b5 100644
--- a/tensorflow/contrib/gan/python/estimator/python/head_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/head_impl.py
@@ -71,7 +71,7 @@ class GANHead(head._Head):  # pylint: disable=protected-access
   def __init__(self, generator_loss_fn, discriminator_loss_fn,
                generator_optimizer, discriminator_optimizer,
                use_loss_summaries=True,
-               get_hooks_fn=tfgan_train.get_sequential_train_hooks(),
+               get_hooks_fn=None,
                name=None):
     """`Head` for GAN training.
 
@@ -86,10 +86,12 @@ class GANHead(head._Head):  # pylint: disable=protected-access
       use_loss_summaries: If `True`, add loss summaries. If `False`, does not.
         If `None`, uses defaults.
       get_hooks_fn: A function that takes a GANTrainOps tuple and returns a list
-        of hooks.
+        of hooks. Defaults to `train.get_sequential_train_hooks()`
       name: name of the head. If provided, summary and metrics keys will be
         suffixed by `"/" + name`.
     """
+    if get_hooks_fn is None:
+      get_hooks_fn = tfgan_train.get_sequential_train_hooks()
     # TODO(joelshor): Validate inputs.
 
     if use_loss_summaries in [True, False]:
diff --git a/tensorflow/contrib/gan/python/eval/__init__.py b/tensorflow/contrib/gan/python/eval/__init__.py
index bb8046187807d0cc584f7174eb9aac578855c110..f86b8513053a45f9830411f7df2c32d1f36a97b2 100644
--- a/tensorflow/contrib/gan/python/eval/__init__.py
+++ b/tensorflow/contrib/gan/python/eval/__init__.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN grouped API. Please see README.md for details and usage."""
+"""TFGAN evaluation module.
+
+This module supports techniques such as Inception Score, Frechet Inception
+distance, and Sliced Wasserstein distance.
+"""
 # pylint: disable=,wildcard-import,unused-import
 
 from __future__ import absolute_import
@@ -22,10 +26,12 @@ from __future__ import print_function
 # Collapse eval into a single namespace.
 from tensorflow.contrib.gan.python.eval.python import classifier_metrics
 from tensorflow.contrib.gan.python.eval.python import eval_utils
+from tensorflow.contrib.gan.python.eval.python import sliced_wasserstein
 from tensorflow.contrib.gan.python.eval.python import summaries
 
 from tensorflow.contrib.gan.python.eval.python.classifier_metrics import *
 from tensorflow.contrib.gan.python.eval.python.eval_utils import *
+from tensorflow.contrib.gan.python.eval.python.sliced_wasserstein import *
 from tensorflow.contrib.gan.python.eval.python.summaries import *
 # pylint: enable=wildcard-import,unused-import
 
@@ -33,7 +39,10 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
     'classifier_metrics',
+    'sliced_wasserstein_distance',
     'summaries',
     'eval_utils',
-] + classifier_metrics.__all__ + summaries.__all__ + eval_utils.__all__
+] + (
+    classifier_metrics.__all__ + sliced_wasserstein.__all__ +
+    summaries.__all__ + eval_utils.__all__)
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
index bb65f05b5a17e9a872e41d1dcb05aeb3cd6f6f40..fdfabd07c13f689d075ecbb8786d725fa8a62d01 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
@@ -28,6 +28,7 @@ from __future__ import division
 from __future__ import print_function
 
 import functools
+import os
 import sys
 import tarfile
 
@@ -57,8 +58,10 @@ __all__ = [
     'run_inception',
     'inception_score',
     'classifier_score',
+    'classifier_score_from_logits',
     'frechet_inception_distance',
     'frechet_classifier_distance',
+    'frechet_classifier_distance_from_activations',
     'INCEPTION_DEFAULT_IMAGE_SIZE',
 ]
 
@@ -187,20 +190,34 @@ def get_graph_def_from_resource(filename):
   return graph_pb2.GraphDef.FromString(resource_loader.load_resource(filename))
 
 
-def get_graph_def_from_url_tarball(url, filename):
-  """Get a GraphDef proto from a tarball on the web."""
-  def _progress(count, block_size, total_size):
-    sys.stdout.write('\r>> Downloading %s %.1f%%' % (
-        url, float(count * block_size) / float(total_size) * 100.0))
-    sys.stdout.flush()
-  tar_filename, _ = urllib.request.urlretrieve(url, reporthook=_progress)
+def get_graph_def_from_url_tarball(url, filename, tar_filename=None):
+  """Get a GraphDef proto from a tarball on the web.
+
+  Args:
+    url: Web address of tarball
+    filename: Filename of graph definition within tarball
+    tar_filename: Temporary download filename (None = always download)
+
+  Returns:
+    A GraphDef loaded from a file in the downloaded tarball.
+  """
+  if not (tar_filename and os.path.exists(tar_filename)):
+
+    def _progress(count, block_size, total_size):
+      sys.stdout.write('\r>> Downloading %s %.1f%%' %
+                       (url,
+                        float(count * block_size) / float(total_size) * 100.0))
+      sys.stdout.flush()
+
+    tar_filename, _ = urllib.request.urlretrieve(url, tar_filename, _progress)
   with tarfile.open(tar_filename, 'r:gz') as tar:
     proto_str = tar.extractfile(filename).read()
   return graph_pb2.GraphDef.FromString(proto_str)
 
 
 def _default_graph_def_fn():
-  return get_graph_def_from_url_tarball(INCEPTION_URL, INCEPTION_FROZEN_GRAPH)
+  return get_graph_def_from_url_tarball(INCEPTION_URL, INCEPTION_FROZEN_GRAPH,
+                                        os.path.basename(INCEPTION_URL))
 
 
 def run_inception(images,
@@ -222,13 +239,13 @@ def run_inception(images,
     image_size: Required image width and height. See unit tests for the default
       values.
     input_tensor: Name of input Tensor.
-    output_tensor: Name of output Tensor. This function will compute activations
-      at the specified layer. Examples include INCEPTION_V3_OUTPUT and
-      INCEPTION_V3_FINAL_POOL which would result in this function computing
+    output_tensor: Name or list of output Tensors. This function will compute
+      activations at the specified layer. Examples include INCEPTION_V3_OUTPUT
+      and INCEPTION_V3_FINAL_POOL which would result in this function computing
       the final logits or the penultimate pooling layer.
 
   Returns:
-    Logits.
+    Tensor or Tensors corresponding to computed `output_tensor`.
 
   Raises:
     ValueError: If images are not the correct size.
@@ -244,8 +261,14 @@ def run_inception(images,
 
   activations = run_image_classifier(images, graph_def, input_tensor,
                                      output_tensor)
-  if array_ops.rank(activations) != 2:
-    activations = layers.flatten(activations)
+  if isinstance(activations, list):
+    for i, activation in enumerate(activations):
+      if array_ops.rank(activation) != 2:
+        activations[i] = layers.flatten(activation)
+  else:
+    if array_ops.rank(activations) != 2:
+      activations = layers.flatten(activations)
+
   return activations
 
 
@@ -257,23 +280,26 @@ def run_image_classifier(tensor, graph_def, input_tensor,
     tensor: An Input tensor.
     graph_def: A GraphDef proto.
     input_tensor: Name of input tensor in graph def.
-    output_tensor: Name of output tensor in graph def.
+    output_tensor: A tensor name or list of tensor names in graph def.
     scope: Name scope for classifier.
 
   Returns:
-    Classifier output. Shape depends on the classifier used, but is often
-    [batch, classes].
+    Classifier output if `output_tensor` is a string, or a list of outputs if
+    `output_tensor` is a list.
 
   Raises:
-    ValueError: If `image_size` is not `None`, and `tensor` are not the correct
-      size.
+    ValueError: If `input_tensor` or `output_tensor` aren't in the graph_def.
   """
   input_map = {input_tensor: tensor}
-  return_elements = [output_tensor]
-  classifier_output = importer.import_graph_def(
-      graph_def, input_map, return_elements, name=scope)[0]
+  is_singleton = isinstance(output_tensor, str)
+  if is_singleton:
+    output_tensor = [output_tensor]
+  classifier_outputs = importer.import_graph_def(
+      graph_def, input_map, output_tensor, name=scope)
+  if is_singleton:
+    classifier_outputs = classifier_outputs[0]
 
-  return classifier_output
+  return classifier_outputs
 
 
 def classifier_score(images, classifier_fn, num_batches=1):
@@ -289,6 +315,11 @@ def classifier_score(images, classifier_fn, num_batches=1):
   which captures how different the network's classification prediction is from
   the prior distribution over classes.
 
+  NOTE: This function consumes images, computes their logits, and then
+  computes the classifier score. If you would like to precompute many logits for
+  large batches, use clasifier_score_from_logits(), which this method also
+  uses.
+
   Args:
     images: Images to calculate the classifier score for.
     classifier_fn: A function that takes images and produces logits based on a
@@ -312,6 +343,34 @@ def classifier_score(images, classifier_fn, num_batches=1):
       swap_memory=True,
       name='RunClassifier')
   logits = array_ops.concat(array_ops.unstack(logits), 0)
+
+  return classifier_score_from_logits(logits)
+
+
+def classifier_score_from_logits(logits):
+  """Classifier score for evaluating a generative model from logits.
+
+  This method computes the classifier score for a set of logits. This can be
+  used independently of the classifier_score() method, especially in the case
+  of using large batches during evaluation where we would like precompute all
+  of the logits before computing the classifier score.
+
+  This technique is described in detail in https://arxiv.org/abs/1606.03498. In
+  summary, this function calculates:
+
+  exp( E[ KL(p(y|x) || p(y)) ] )
+
+  which captures how different the network's classification prediction is from
+  the prior distribution over classes.
+
+  Args:
+    logits: Precomputed 2D tensor of logits that will be used to
+      compute the classifier score.
+
+  Returns:
+    The classifier score. A floating-point scalar of the same type as the output
+    of `logits`.
+  """
   logits.shape.assert_has_rank(2)
 
   # Use maximum precision for best results.
@@ -328,6 +387,7 @@ def classifier_score(images, classifier_fn, num_batches=1):
 
   if logits_dtype != dtypes.float64:
     final_score = math_ops.cast(final_score, logits_dtype)
+
   return final_score
 
 
@@ -406,6 +466,11 @@ def frechet_classifier_distance(real_images,
   sample size to compute frechet classifier distance when comparing two
   generative models.
 
+  NOTE: This function consumes images, computes their activations, and then
+  computes the classifier score. If you would like to precompute many
+  activations for real and generated images for large batches, please use
+  frechet_clasifier_distance_from_activations(), which this method also uses.
+
   Args:
     real_images: Real images to use to compute Frechet Inception distance.
     generated_images: Generated images to use to compute Frechet Inception
@@ -417,7 +482,7 @@ def frechet_classifier_distance(real_images,
 
   Returns:
     The Frechet Inception distance. A floating-point scalar of the same type
-    as the output of `classifier_fn`
+    as the output of `classifier_fn`.
   """
 
   real_images_list = array_ops.split(
@@ -436,31 +501,69 @@ def frechet_classifier_distance(real_images,
       swap_memory=True,
       name='RunClassifier')
 
-  activations_dtype = activations.dtype
   # Split the activations by the real and generated images.
   real_a, gen_a = array_ops.split(activations, [num_batches, num_batches], 0)
 
   # Ensure the activations have the right shapes.
   real_a = array_ops.concat(array_ops.unstack(real_a), 0)
   gen_a = array_ops.concat(array_ops.unstack(gen_a), 0)
-  if activations_dtype != dtypes.float64:
-    real_a = math_ops.to_double(real_a)
-    gen_a = math_ops.to_double(gen_a)
 
-  real_a.shape.assert_has_rank(2)
-  gen_a.shape.assert_has_rank(2)
+  return frechet_classifier_distance_from_activations(real_a, gen_a)
+
+
+def frechet_classifier_distance_from_activations(
+    real_activations, generated_activations):
+  """Classifier distance for evaluating a generative model from activations.
+
+  This methods computes the Frechet classifier distance from activations of
+  real images and generated images. This can be used independently of the
+  frechet_classifier_distance() method, especially in the case of using large
+  batches during evaluation where we would like precompute all of the
+  activations before computing the classifier distance.
+
+  This technique is described in detail in https://arxiv.org/abs/1706.08500.
+  Given two Gaussian distribution with means m and m_w and covariance matrices
+  C and C_w, this function calcuates
+
+  |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))
+
+  which captures how different the distributions of real images and generated
+  images (or more accurately, their visual features) are. Note that unlike the
+  Inception score, this is a true distance and utilizes information about real
+  world images.
+
+  Args:
+    real_activations: 2D Tensor containing activations of real data. Shape is
+      [batch_size, activation_size].
+    generated_activations: 2D Tensor containing activations of generated data.
+      Shape is [batch_size, activation_size].
+
+  Returns:
+   The Frechet Inception distance. A floating-point scalar of the same type
+   as the output of the activations.
+
+  """
+  real_activations.shape.assert_has_rank(2)
+  generated_activations.shape.assert_has_rank(2)
+
+  activations_dtype = real_activations.dtype
+  if activations_dtype != dtypes.float64:
+    real_activations = math_ops.to_double(real_activations)
+    generated_activations = math_ops.to_double(generated_activations)
 
   # Compute mean and covariance matrices of activations.
-  m = math_ops.reduce_mean(real_a, 0)
-  m_v = math_ops.reduce_mean(gen_a, 0)
-  num_examples = math_ops.to_double(array_ops.shape(real_a)[0])
+  m = math_ops.reduce_mean(real_activations, 0)
+  m_v = math_ops.reduce_mean(generated_activations, 0)
+  num_examples = math_ops.to_double(array_ops.shape(real_activations)[0])
 
   # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T
+  real_centered = real_activations - m
   sigma = math_ops.matmul(
-      real_a - m, real_a - m, transpose_a=True) / (num_examples - 1)
+      real_centered, real_centered, transpose_a=True) / (num_examples - 1)
 
+  gen_centered = generated_activations - m_v
   sigma_v = math_ops.matmul(
-      gen_a - m_v, gen_a - m_v, transpose_a=True) / (num_examples - 1)
+      gen_centered, gen_centered, transpose_a=True) / (num_examples - 1)
 
   # Find the Tr(sqrt(sigma sigma_v)) component of FID
   sqrt_trace_component = trace_sqrt_product(sigma, sigma_v)
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
index 92e0a995748c1c4c2ddfff0daae59be5a6eaefb4..61dc8646ddc10605561ae6b19e90f4739c346608 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
@@ -181,7 +181,8 @@ class ClassifierMetricsTest(test.TestCase):
     batch_size = 3
     img = array_ops.ones([batch_size, 299, 299, 3])
     pool = _run_with_mock(
-        classifier_metrics.run_inception, img,
+        classifier_metrics.run_inception,
+        img,
         output_tensor=classifier_metrics.INCEPTION_FINAL_POOL)
 
     self.assertTrue(isinstance(pool, ops.Tensor))
@@ -190,10 +191,32 @@ class ClassifierMetricsTest(test.TestCase):
     # Check that none of the model variables are trainable.
     self.assertListEqual([], variables.trainable_variables())
 
+  def test_run_inception_multiple_outputs(self):
+    """Test `run_inception` graph construction with multiple outputs."""
+    batch_size = 3
+    img = array_ops.ones([batch_size, 299, 299, 3])
+    logits, pool = _run_with_mock(
+        classifier_metrics.run_inception,
+        img,
+        output_tensor=[
+            classifier_metrics.INCEPTION_OUTPUT,
+            classifier_metrics.INCEPTION_FINAL_POOL
+        ])
+
+    self.assertTrue(isinstance(logits, ops.Tensor))
+    self.assertTrue(isinstance(pool, ops.Tensor))
+    logits.shape.assert_is_compatible_with([batch_size, 1001])
+    pool.shape.assert_is_compatible_with([batch_size, 2048])
+
+    # Check that none of the model variables are trainable.
+    self.assertListEqual([], variables.trainable_variables())
+
   def test_inception_score_graph(self):
     """Test `inception_score` graph construction."""
-    score = _run_with_mock(classifier_metrics.inception_score,
-                           array_ops.zeros([6, 299, 299, 3]), num_batches=3)
+    score = _run_with_mock(
+        classifier_metrics.inception_score,
+        array_ops.zeros([6, 299, 299, 3]),
+        num_batches=3)
     self.assertTrue(isinstance(score, ops.Tensor))
     score.shape.assert_has_rank(0)
 
@@ -231,12 +254,14 @@ class ClassifierMetricsTest(test.TestCase):
           array_ops.zeros([8, 10], dtype=dtypes.int32), p_logits, q)
 
     with self.assertRaisesRegexp(ValueError, 'must be floating type'):
-      classifier_metrics._kl_divergence(
-          p, array_ops.zeros([8, 10], dtype=dtypes.int32), q)
+      classifier_metrics._kl_divergence(p,
+                                        array_ops.zeros(
+                                            [8, 10], dtype=dtypes.int32), q)
 
     with self.assertRaisesRegexp(ValueError, 'must be floating type'):
-      classifier_metrics._kl_divergence(
-          p, p_logits, array_ops.zeros([10], dtype=dtypes.int32))
+      classifier_metrics._kl_divergence(p, p_logits,
+                                        array_ops.zeros(
+                                            [10], dtype=dtypes.int32))
 
     with self.assertRaisesRegexp(ValueError, 'must have rank 2'):
       classifier_metrics._kl_divergence(array_ops.zeros([8]), p_logits, q)
@@ -249,8 +274,9 @@ class ClassifierMetricsTest(test.TestCase):
 
   def test_inception_score_value(self):
     """Test that `inception_score` gives the correct value."""
-    logits = np.array([np.array([1, 2] * 500 + [4]),
-                       np.array([4, 5] * 500 + [6])])
+    logits = np.array(
+        [np.array([1, 2] * 500 + [4]),
+         np.array([4, 5] * 500 + [6])])
     unused_image = array_ops.zeros([2, 299, 299, 3])
     incscore = _run_with_mock(classifier_metrics.inception_score, unused_image)
 
@@ -268,9 +294,11 @@ class ClassifierMetricsTest(test.TestCase):
     test_pool_real_a = np.float32(np.random.randn(512, 256))
     test_pool_gen_a = np.float32(np.random.randn(512, 256))
 
-    fid_op = _run_with_mock(classifier_metrics.frechet_classifier_distance,
-                            test_pool_real_a, test_pool_gen_a,
-                            classifier_fn=lambda x: x)
+    fid_op = _run_with_mock(
+        classifier_metrics.frechet_classifier_distance,
+        test_pool_real_a,
+        test_pool_gen_a,
+        classifier_fn=lambda x: x)
 
     with self.test_session() as sess:
       actual_fid = sess.run(fid_op)
@@ -279,6 +307,33 @@ class ClassifierMetricsTest(test.TestCase):
 
     self.assertAllClose(expected_fid, actual_fid, 0.0001)
 
+  def test_frechet_classifier_distance_covariance(self):
+    """Test that `frechet_classifier_distance` takes covariance into account."""
+    np.random.seed(0)
+
+    # Make num_examples > num_features to ensure scipy's sqrtm function
+    # doesn't return a complex matrix.
+    test_pool_reals, test_pool_gens = [], []
+    for i in range(1, 11, 2):
+      test_pool_reals.append(np.float32(np.random.randn(2048, 256) * i))
+      test_pool_gens.append(np.float32(np.random.randn(2048, 256) * i))
+
+    fid_ops = []
+    for i in range(len(test_pool_reals)):
+      fid_ops.append(_run_with_mock(
+          classifier_metrics.frechet_classifier_distance,
+          test_pool_reals[i],
+          test_pool_gens[i],
+          classifier_fn=lambda x: x))
+
+    fids = []
+    with self.test_session() as sess:
+      for fid_op in fid_ops:
+        fids.append(sess.run(fid_op))
+
+    # Check that the FIDs increase monotonically.
+    self.assertTrue(all(fid_a < fid_b for fid_a, fid_b in zip(fids, fids[1:])))
+
   def test_trace_sqrt_product_value(self):
     """Test that `trace_sqrt_product` gives the correct value."""
     np.random.seed(0)
diff --git a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein.py b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein.py
new file mode 100644
index 0000000000000000000000000000000000000000..523968bed91f1021ae629bf52c405cf5c2d7b917
--- /dev/null
+++ b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein.py
@@ -0,0 +1,28 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model evaluation tools for TFGAN."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.gan.python.eval.python import sliced_wasserstein_impl
+# pylint: disable=wildcard-import
+from tensorflow.contrib.gan.python.eval.python.sliced_wasserstein_impl import *
+# pylint: enable=wildcard-import
+from tensorflow.python.util.all_util import remove_undocumented
+
+__all__ = sliced_wasserstein_impl.__all__
+remove_undocumented(__name__, __all__)
diff --git a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bebcacbe46d85fc4226c4275b71b3ecbde57a97
--- /dev/null
+++ b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_impl.py
@@ -0,0 +1,282 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of Sliced Wasserstein Distance.
+
+Proposed in https://arxiv.org/abs/1710.10196 and the official Theano
+implementation that we used as reference can be found here:
+https://github.com/tkarras/progressive_growing_of_gans
+
+Note: this is not an exact distance but an approximation through random
+projections.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import script_ops
+
+__all__ = ['sliced_wasserstein_distance']
+_GAUSSIAN_FILTER = np.float32([[1, 4, 6, 4, 1], [4, 16, 24, 16, 4], [
+    6, 24, 36, 24, 6
+], [4, 16, 24, 16, 4], [1, 4, 6, 4, 1]]).reshape([5, 5, 1, 1]) / 256.0
+
+
+def _laplacian_pyramid(batch, num_levels):
+  """Compute a Laplacian pyramid.
+
+  Args:
+      batch: (tensor) The batch of images (batch, height, width, channels).
+      num_levels: (int) Desired number of hierarchical levels.
+  Returns:
+      List of tensors from the highest to lowest resolution.
+  """
+  gaussian_filter = constant_op.constant(_GAUSSIAN_FILTER)
+
+  def spatial_conv(batch, gain):
+    s = array_ops.shape(batch)
+    padded = array_ops.pad(batch, [[0, 0], [2, 2], [2, 2], [0, 0]], 'REFLECT')
+    xt = array_ops.transpose(padded, [0, 3, 1, 2])
+    xt = array_ops.reshape(xt, [s[0] * s[3], s[1] + 4, s[2] + 4, 1])
+    conv_out = nn_ops.conv2d(xt, gaussian_filter * gain, [1] * 4, 'VALID')
+    conv_xt = array_ops.reshape(conv_out, [s[0], s[3], s[1], s[2]])
+    conv_xt = array_ops.transpose(conv_xt, [0, 2, 3, 1])
+    return conv_xt
+
+  def pyr_down(batch):  # matches cv2.pyrDown()
+    return spatial_conv(batch, 1)[:, ::2, ::2]
+
+  def pyr_up(batch):  # matches cv2.pyrUp()
+    s = array_ops.shape(batch)
+    zeros = array_ops.zeros([3 * s[0], s[1], s[2], s[3]])
+    res = array_ops.concat([batch, zeros], 0)
+    res = array_ops.batch_to_space(res, crops=[[0, 0], [0, 0]], block_size=2)
+    res = spatial_conv(res, 4)
+    return res
+
+  pyramid = [math_ops.to_float(batch)]
+  for _ in range(1, num_levels):
+    pyramid.append(pyr_down(pyramid[-1]))
+    pyramid[-2] -= pyr_up(pyramid[-1])
+  return pyramid
+
+
+def _batch_to_patches(batch, patches_per_image, patch_size):
+  """Extract patches from a batch.
+
+  Args:
+      batch: (tensor) The batch of images (batch, height, width, channels).
+      patches_per_image: (int) Number of patches to extract per image.
+      patch_size: (int) Size of the patches (size, size, channels) to extract.
+  Returns:
+      Tensor (batch*patches_per_image, patch_size, patch_size, channels) of
+      patches.
+  """
+
+  def py_func_random_patches(batch):
+    """Numpy wrapper."""
+    batch_size, height, width, channels = batch.shape
+    patch_count = patches_per_image * batch_size
+    hs = patch_size // 2
+    # Randomly pick patches.
+    patch_id, y, x, chan = np.ogrid[0:patch_count, -hs:hs + 1, -hs:hs + 1, 0:3]
+    img_id = patch_id // patches_per_image
+    # pylint: disable=g-no-augmented-assignment
+    # Need explicit addition for broadcast to work properly.
+    y = y + np.random.randint(hs, height - hs, size=(patch_count, 1, 1, 1))
+    x = x + np.random.randint(hs, width - hs, size=(patch_count, 1, 1, 1))
+    # pylint: enable=g-no-augmented-assignment
+    idx = ((img_id * height + y) * width + x) * channels + chan
+    patches = batch.flat[idx]
+    return patches
+
+  patches = script_ops.py_func(
+      py_func_random_patches, [batch], batch.dtype, stateful=False)
+  return patches
+
+
+def _normalize_patches(patches):
+  """Normalize patches by their mean and standard deviation.
+
+  Args:
+      patches: (tensor) The batch of patches (batch, size, size, channels).
+  Returns:
+      Tensor (batch, size, size, channels) of the normalized patches.
+  """
+  patches = array_ops.concat(patches, 0)
+  mean, variance = nn.moments(patches, [1, 2, 3], keep_dims=True)
+  patches = (patches - mean) / math_ops.sqrt(variance)
+  return array_ops.reshape(patches, [array_ops.shape(patches)[0], -1])
+
+
+def _sort_rows(matrix, num_rows):
+  """Sort matrix rows by the last column.
+
+  Args:
+      matrix: a matrix of values (row,col).
+      num_rows: (int) number of sorted rows to return from the matrix.
+  Returns:
+      Tensor (num_rows, col) of the sorted matrix top K rows.
+  """
+  tmatrix = array_ops.transpose(matrix, [1, 0])
+  sorted_tmatrix = nn_ops.top_k(tmatrix, num_rows)[0]
+  return array_ops.transpose(sorted_tmatrix, [1, 0])
+
+
+def _sliced_wasserstein(a, b, random_sampling_count, random_projection_dim):
+  """Compute the approximate sliced Wasserstein distance.
+
+  Args:
+      a: (matrix) Distribution "a" of samples (row, col).
+      b: (matrix) Distribution "b" of samples (row, col).
+      random_sampling_count: (int) Number of random projections to average.
+      random_projection_dim: (int) Dimension of the random projection space.
+  Returns:
+      Float containing the approximate distance between "a" and "b".
+  """
+  s = array_ops.shape(a)
+  means = []
+  for _ in range(random_sampling_count):
+    # Random projection matrix.
+    proj = random_ops.random_normal(
+        [array_ops.shape(a)[1], random_projection_dim])
+    proj *= math_ops.rsqrt(
+        math_ops.reduce_sum(math_ops.square(proj), 0, keep_dims=True))
+    # Project both distributions and sort them.
+    proj_a = math_ops.matmul(a, proj)
+    proj_b = math_ops.matmul(b, proj)
+    proj_a = _sort_rows(proj_a, s[0])
+    proj_b = _sort_rows(proj_b, s[0])
+    # Pairwise Wasserstein distance.
+    wdist = math_ops.reduce_mean(math_ops.abs(proj_a - proj_b))
+    means.append(wdist)
+  return math_ops.reduce_mean(means)
+
+
+def _sliced_wasserstein_svd(a, b):
+  """Compute the approximate sliced Wasserstein distance using an SVD.
+
+  This is not part of the paper, it's a variant with possibly more accurate
+  measure.
+
+  Args:
+      a: (matrix) Distribution "a" of samples (row, col).
+      b: (matrix) Distribution "b" of samples (row, col).
+  Returns:
+      Float containing the approximate distance between "a" and "b".
+  """
+  s = array_ops.shape(a)
+  # Random projection matrix.
+  sig, u = linalg_ops.svd(array_ops.concat([a, b], 0))[:2]
+  proj_a, proj_b = array_ops.split(u * sig, 2, axis=0)
+  proj_a = _sort_rows(proj_a[:, ::-1], s[0])
+  proj_b = _sort_rows(proj_b[:, ::-1], s[0])
+  # Pairwise Wasserstein distance.
+  wdist = math_ops.reduce_mean(math_ops.abs(proj_a - proj_b))
+  return wdist
+
+
+def sliced_wasserstein_distance(real_images,
+                                fake_images,
+                                resolution_min=16,
+                                patches_per_image=64,
+                                patch_size=7,
+                                random_sampling_count=1,
+                                random_projection_dim=7 * 7 * 3,
+                                use_svd=False):
+  """Compute the Wasserstein distance between two distributions of images.
+
+  Note that measure vary with the number of images. Use 8192 images to get
+  numbers comparable to the ones in the original paper.
+
+  Args:
+      real_images: (tensor) Real images (batch, height, width, channels).
+      fake_images: (tensor) Fake images (batch, height, width, channels).
+      resolution_min: (int) Minimum resolution for the Laplacion pyramid.
+      patches_per_image: (int) Number of patches to extract per image per
+        Laplacian level.
+      patch_size: (int) Width of a square patch.
+      random_sampling_count: (int) Number of random projections to average.
+      random_projection_dim: (int) Dimension of the random projection space.
+      use_svd: experimental method to compute a more accurate distance.
+  Returns:
+      List of tuples (distance_real, distance_fake) for each level of the
+      Laplacian pyramid from the highest resoluion to the lowest.
+        distance_real is the Wasserstein distance between real images
+        distance_fake is the Wasserstein distance between real and fake images.
+  Raises:
+      ValueError: If the inputs shapes are incorrect. Input tensor dimensions
+      (batch, height, width, channels) are expected to be known at graph
+      construction time. In addition height and width must be the same and the
+      number of colors should be exactly 3. Real and fake images must have the
+      same size.
+  """
+  height = real_images.shape[1]
+  real_images.shape.assert_is_compatible_with([None, None, height, 3])
+  fake_images.shape.assert_is_compatible_with(real_images.shape)
+
+  # Select resolutions.
+  resolution_full = int(height)
+  resolution_min = min(resolution_min, resolution_full)
+  resolution_max = resolution_full
+  # Base loss of detail.
+  resolutions = [
+      2**i
+      for i in range(
+          int(np.log2(resolution_max)),
+          int(np.log2(resolution_min)) - 1, -1)
+  ]
+
+  # Gather patches for each level of the Laplacian pyramids.
+  patches_real, patches_fake, patches_test = (
+      [[] for _ in resolutions] for _ in range(3))
+  for lod, level in enumerate(
+      _laplacian_pyramid(real_images, len(resolutions))):
+    patches_real[lod].append(
+        _batch_to_patches(level, patches_per_image, patch_size))
+    patches_test[lod].append(
+        _batch_to_patches(level, patches_per_image, patch_size))
+
+  for lod, level in enumerate(
+      _laplacian_pyramid(fake_images, len(resolutions))):
+    patches_fake[lod].append(
+        _batch_to_patches(level, patches_per_image, patch_size))
+
+  for lod in range(len(resolutions)):
+    for patches in [patches_real, patches_test, patches_fake]:
+      patches[lod] = _normalize_patches(patches[lod])
+
+  # Evaluate scores.
+  scores = []
+  for lod in range(len(resolutions)):
+    if not use_svd:
+      scores.append(
+          (_sliced_wasserstein(patches_real[lod], patches_test[lod],
+                               random_sampling_count, random_projection_dim),
+           _sliced_wasserstein(patches_real[lod], patches_fake[lod],
+                               random_sampling_count, random_projection_dim)))
+    else:
+      scores.append(
+          (_sliced_wasserstein_svd(patches_real[lod], patches_test[lod]),
+           _sliced_wasserstein_svd(patches_real[lod], patches_fake[lod])))
+  return scores
diff --git a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..871f1ad54e2559f5df28efa78f99997a866f7087
--- /dev/null
+++ b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein_test.py
@@ -0,0 +1,131 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Sliced Wasserstein Distance."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from scipy import ndimage
+from tensorflow.contrib.gan.python.eval.python import sliced_wasserstein_impl as swd
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import test
+
+
+class ClassifierMetricsTest(test.TestCase):
+
+  def test_laplacian_pyramid(self):
+    # The numpy/scipy code for reference estimation comes from:
+    # https://github.com/tkarras/progressive_growing_of_gans
+    gaussian_filter = np.float32([[1, 4, 6, 4, 1], [4, 16, 24, 16, 4], [
+        6, 24, 36, 24, 6
+    ], [4, 16, 24, 16, 4], [1, 4, 6, 4, 1]]) / 256.0
+
+    def np_pyr_down(minibatch):  # matches cv2.pyrDown()
+      assert minibatch.ndim == 4
+      return ndimage.convolve(
+          minibatch,
+          gaussian_filter[np.newaxis, np.newaxis, :, :],
+          mode='mirror')[:, :, ::2, ::2]
+
+    def np_pyr_up(minibatch):  # matches cv2.pyrUp()
+      assert minibatch.ndim == 4
+      s = minibatch.shape
+      res = np.zeros((s[0], s[1], s[2] * 2, s[3] * 2), minibatch.dtype)
+      res[:, :, ::2, ::2] = minibatch
+      return ndimage.convolve(
+          res,
+          gaussian_filter[np.newaxis, np.newaxis, :, :] * 4.0,
+          mode='mirror')
+
+    def np_laplacian_pyramid(minibatch, num_levels):
+      # Note: there's a bug in the original SWD, fixed repeatability.
+      pyramid = [minibatch.astype('f').copy()]
+      for _ in range(1, num_levels):
+        pyramid.append(np_pyr_down(pyramid[-1]))
+        pyramid[-2] -= np_pyr_up(pyramid[-1])
+      return pyramid
+
+    data = np.random.normal(size=[256, 3, 32, 32]).astype('f')
+    pyramid = np_laplacian_pyramid(data, 3)
+    data_tf = array_ops.placeholder(dtypes.float32, [256, 32, 32, 3])
+    pyramid_tf = swd._laplacian_pyramid(data_tf, 3)
+    with self.test_session() as sess:
+      pyramid_tf = sess.run(
+          pyramid_tf, feed_dict={
+              data_tf: data.transpose(0, 2, 3, 1)
+          })
+    for x in range(3):
+      self.assertAllClose(
+          pyramid[x].transpose(0, 2, 3, 1), pyramid_tf[x], atol=1e-6)
+
+  def test_sliced_wasserstein_distance(self):
+    """Test the distance."""
+    d1 = random_ops.random_uniform([256, 32, 32, 3])
+    d2 = random_ops.random_normal([256, 32, 32, 3])
+    wfunc = swd.sliced_wasserstein_distance(d1, d2)
+    with self.test_session() as sess:
+      wscores = [sess.run(x) for x in wfunc]
+    self.assertAllClose(
+        np.array([0.014, 0.014], 'f'),
+        np.array([x[0] for x in wscores], 'f'),
+        rtol=0.15)
+    self.assertAllClose(
+        np.array([0.014, 0.020], 'f'),
+        np.array([x[1] for x in wscores], 'f'),
+        rtol=0.15)
+
+  def test_sliced_wasserstein_distance_svd(self):
+    """Test the distance."""
+    d1 = random_ops.random_uniform([256, 32, 32, 3])
+    d2 = random_ops.random_normal([256, 32, 32, 3])
+    wfunc = swd.sliced_wasserstein_distance(d1, d2, use_svd=True)
+    with self.test_session() as sess:
+      wscores = [sess.run(x) for x in wfunc]
+    self.assertAllClose(
+        np.array([0.013, 0.013], 'f'),
+        np.array([x[0] for x in wscores], 'f'),
+        rtol=0.15)
+    self.assertAllClose(
+        np.array([0.014, 0.019], 'f'),
+        np.array([x[1] for x in wscores], 'f'),
+        rtol=0.15)
+
+  def test_swd_mismatched(self):
+    """Test the inputs mismatched shapes are detected."""
+    d1 = random_ops.random_uniform([256, 32, 32, 3])
+    d2 = random_ops.random_normal([256, 32, 31, 3])
+    d3 = random_ops.random_normal([256, 31, 32, 3])
+    d4 = random_ops.random_normal([255, 32, 32, 3])
+    with self.assertRaises(ValueError):
+      swd.sliced_wasserstein_distance(d1, d2)
+    with self.assertRaises(ValueError):
+      swd.sliced_wasserstein_distance(d1, d3)
+    with self.assertRaises(ValueError):
+      swd.sliced_wasserstein_distance(d1, d4)
+
+  def test_swd_not_rgb(self):
+    """Test that only RGB is supported."""
+    d1 = random_ops.random_uniform([256, 32, 32, 1])
+    d2 = random_ops.random_normal([256, 32, 32, 1])
+    with self.assertRaises(ValueError):
+      swd.sliced_wasserstein_distance(d1, d2)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
index 508b4d20d8767f42246a0d0c87f911b7ac612f45..0d1afad72da8a8e087239868e25ddebe23490d1e 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.gan.python import namedtuples
 from tensorflow.contrib.gan.python.eval.python import eval_utils
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -38,16 +39,26 @@ def _assert_is_image(data):
   data.shape[1:].assert_is_fully_defined()
 
 
-def add_gan_model_image_summaries(gan_model, grid_size=4):
+def add_gan_model_image_summaries(gan_model, grid_size=4, model_summaries=True):
   """Adds image summaries for real and fake images.
 
   Args:
     gan_model: A GANModel tuple.
     grid_size: The size of an image grid.
+    model_summaries: Also add summaries of the model.
 
   Raises:
     ValueError: If real and generated data aren't images.
   """
+  if isinstance(gan_model, namedtuples.CycleGANModel):
+    saved_params = locals()
+    saved_params.pop('gan_model', None)
+    with ops.name_scope('cyclegan_x2y_image_summaries'):
+      add_gan_model_image_summaries(gan_model.model_x2y, **saved_params)
+    with ops.name_scope('cyclegan_y2x_image_summaries'):
+      add_gan_model_image_summaries(gan_model.model_y2x, **saved_params)
+    return
+
   _assert_is_image(gan_model.real_data)
   _assert_is_image(gan_model.generated_data)
 
@@ -73,7 +84,9 @@ def add_gan_model_image_summaries(gan_model, grid_size=4):
           image_shape=generated_image_shape,
           num_channels=generated_channels),
       max_outputs=1)
-  add_gan_model_summaries(gan_model)
+
+  if model_summaries:
+    add_gan_model_summaries(gan_model)
 
 
 def add_image_comparison_summaries(gan_model, num_comparisons=2,
@@ -96,6 +109,15 @@ def add_image_comparison_summaries(gan_model, num_comparisons=2,
     ValueError: If the generator input, real, and generated data aren't all the
       same size.
   """
+  if isinstance(gan_model, namedtuples.CycleGANModel):
+    saved_params = locals()
+    saved_params.pop('gan_model', None)
+    with ops.name_scope('cyclegan_x2y_image_comparison_summaries'):
+      add_image_comparison_summaries(gan_model.model_x2y, **saved_params)
+    with ops.name_scope('cyclegan_y2x_image_comparison_summaries'):
+      add_image_comparison_summaries(gan_model.model_y2x, **saved_params)
+    return
+
   _assert_is_image(gan_model.generator_inputs)
   _assert_is_image(gan_model.generated_data)
   _assert_is_image(gan_model.real_data)
@@ -133,6 +155,13 @@ def add_gan_model_summaries(gan_model):
   Args:
     gan_model: A GANModel tuple.
   """
+  if isinstance(gan_model, namedtuples.CycleGANModel):
+    with ops.name_scope('cyclegan_x2y_summaries'):
+      add_gan_model_summaries(gan_model.model_x2y)
+    with ops.name_scope('cyclegan_y2x_summaries'):
+      add_gan_model_summaries(gan_model.model_y2x)
+    return
+
   with ops.name_scope('generator_variables'):
     for var in gan_model.generator_variables:
       summary.histogram(var.name, var)
@@ -147,6 +176,13 @@ def add_regularization_loss_summaries(gan_model):
   Args:
     gan_model: A GANModel tuple.
   """
+  if isinstance(gan_model, namedtuples.CycleGANModel):
+    with ops.name_scope('cyclegan_x2y_regularization_loss_summaries'):
+      add_regularization_loss_summaries(gan_model.model_x2y)
+    with ops.name_scope('cyclegan_y2x_regularization_loss_summaries'):
+      add_regularization_loss_summaries(gan_model.model_y2x)
+    return
+
   if gan_model.generator_scope:
     summary.scalar(
         'generator_regularization_loss',
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_test.py b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
index a3b02bcefc6cbaa6e24131b336b5c9c072bde52c..7956db43348c0cc0f3d372e92a2e343f5aa62013 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
@@ -57,40 +57,89 @@ def get_gan_model():
       discriminator_fn=discriminator_model)
 
 
+def get_cyclegan_model():
+  with variable_scope.variable_scope('x2y'):
+    model_x2y = get_gan_model()
+  with variable_scope.variable_scope('y2x'):
+    model_y2x = get_gan_model()
+  return namedtuples.CycleGANModel(
+      model_x2y=model_x2y,
+      model_y2x=model_y2x,
+      reconstructed_x=array_ops.zeros([3, 30, 35, 6]),
+      reconstructed_y=array_ops.zeros([3, 30, 35, 6]))
+
+
 class SummariesTest(test.TestCase):
 
-  def testAddGanModelImageSummaries(self):
-    summaries.add_gan_model_image_summaries(get_gan_model(), grid_size=2)
+  def _test_add_gan_model_image_summaries_impl(self, get_model_fn,
+                                               expected_num_summary_ops,
+                                               model_summaries):
+    summaries.add_gan_model_image_summaries(get_model_fn(), grid_size=2,
+                                            model_summaries=model_summaries)
 
-    self.assertEquals(5, len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
+    self.assertEquals(expected_num_summary_ops,
+                      len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
     with self.test_session(use_gpu=True):
       variables.global_variables_initializer().run()
       summary.merge_all().eval()
 
-  def testAddGanModelSummaries(self):
-    summaries.add_gan_model_summaries(get_gan_model())
+  def test_add_gan_model_image_summaries(self):
+    self._test_add_gan_model_image_summaries_impl(get_gan_model, 5, True)
+
+  def test_add_gan_model_image_summaries_no_model(self):
+    self._test_add_gan_model_image_summaries_impl(get_gan_model, 2, False)
 
-    self.assertEquals(3, len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
+  def test_add_gan_model_image_summaries_for_cyclegan(self):
+    self._test_add_gan_model_image_summaries_impl(get_cyclegan_model, 10,
+                                                  True)
+
+  def _test_add_gan_model_summaries_impl(self, get_model_fn,
+                                         expected_num_summary_ops):
+    summaries.add_gan_model_summaries(get_model_fn())
+
+    self.assertEquals(expected_num_summary_ops,
+                      len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
     with self.test_session(use_gpu=True):
       variables.global_variables_initializer().run()
       summary.merge_all().eval()
 
-  def testAddRegularizationLossSummaries(self):
-    summaries.add_regularization_loss_summaries(get_gan_model())
+  def test_add_gan_model_summaries(self):
+    self._test_add_gan_model_summaries_impl(get_gan_model, 3)
+
+  def test_add_gan_model_summaries_for_cyclegan(self):
+    self._test_add_gan_model_summaries_impl(get_cyclegan_model, 6)
 
-    self.assertEquals(2, len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
+  def _test_add_regularization_loss_summaries_impl(self, get_model_fn,
+                                                   expected_num_summary_ops):
+    summaries.add_regularization_loss_summaries(get_model_fn())
+
+    self.assertEquals(expected_num_summary_ops,
+                      len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
     with self.test_session(use_gpu=True):
       summary.merge_all().eval()
 
+  def test_add_regularization_loss_summaries(self):
+    self._test_add_regularization_loss_summaries_impl(get_gan_model, 2)
+
+  def test_add_regularization_loss_summaries_for_cyclegan(self):
+    self._test_add_regularization_loss_summaries_impl(get_cyclegan_model, 4)
+
   # TODO(joelshor): Add correctness test.
-  def testAddImageComparisonSummaries(self):
-    summaries.add_image_comparison_summaries(
-        get_gan_model(), display_diffs=True)
+  def _test_add_image_comparison_summaries_impl(self, get_model_fn,
+                                                expected_num_summary_ops):
+    summaries.add_image_comparison_summaries(get_model_fn(), display_diffs=True)
 
-    self.assertEquals(1, len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
+    self.assertEquals(expected_num_summary_ops,
+                      len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
     with self.test_session(use_gpu=True):
       summary.merge_all().eval()
 
+  def test_add_image_comparison_summaries(self):
+    self._test_add_image_comparison_summaries_impl(get_gan_model, 1)
+
+  def test_add_image_comparison_summaries_for_cyclegan(self):
+    self._test_add_image_comparison_summaries_impl(get_cyclegan_model, 2)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/gan/python/features/__init__.py b/tensorflow/contrib/gan/python/features/__init__.py
index 6d0972f8db418d6fcf517cc6f7e96093ae08a9e4..4816daf760143af9f1502873b123ffad8e5ec8ce 100644
--- a/tensorflow/contrib/gan/python/features/__init__.py
+++ b/tensorflow/contrib/gan/python/features/__init__.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN grouped API. Please see README.md for details and usage."""
+"""TFGAN features module.
+
+This module includes support for virtual batch normalization, buffer replay,
+conditioning, etc.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -22,10 +26,12 @@ from __future__ import print_function
 # pylint: disable=unused-import,wildcard-import
 from tensorflow.contrib.gan.python.features.python import clip_weights
 from tensorflow.contrib.gan.python.features.python import conditioning_utils
+from tensorflow.contrib.gan.python.features.python import random_tensor_pool
 from tensorflow.contrib.gan.python.features.python import virtual_batchnorm
 
 from tensorflow.contrib.gan.python.features.python.clip_weights import *
 from tensorflow.contrib.gan.python.features.python.conditioning_utils import *
+from tensorflow.contrib.gan.python.features.python.random_tensor_pool import *
 from tensorflow.contrib.gan.python.features.python.virtual_batchnorm import *
 # pylint: enable=unused-import,wildcard-import
 
@@ -33,5 +39,6 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = clip_weights.__all__
 _allowed_symbols += conditioning_utils.__all__
+_allowed_symbols += random_tensor_pool.__all__
 _allowed_symbols += virtual_batchnorm.__all__
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/gan/python/features/python/clip_weights_test.py b/tensorflow/contrib/gan/python/features/python/clip_weights_test.py
index 030e37ec679ec58e3b534fd3644ffe1d23173404..2b7bb5f14e7f3d1b3f913d3426efaaae19079ffb 100644
--- a/tensorflow/contrib/gan/python/features/python/clip_weights_test.py
+++ b/tensorflow/contrib/gan/python/features/python/clip_weights_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for tfgan.python.features.clip_weights."""
+"""Tests for features.clip_weights."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -31,17 +31,18 @@ class ClipWeightsTest(test.TestCase):
   """Tests for `discriminator_weight_clip`."""
 
   def setUp(self):
+    super(ClipWeightsTest, self).setUp()
     self.variables = [variables.Variable(2.0)]
     self.tuple = collections.namedtuple(
         'VarTuple', ['discriminator_variables'])(self.variables)
 
   def _test_weight_clipping_helper(self, use_tuple):
-    loss = self.variables[0] * 2.0
+    loss = self.variables[0]
     opt = training.GradientDescentOptimizer(1.0)
     if use_tuple:
-      opt_clip = clip_weights.weight_clip(opt, self.variables, 0.1)
+      opt_clip = clip_weights.clip_variables(opt, self.variables, 0.1)
     else:
-      opt_clip = clip_weights.discriminator_weight_clip(opt, self.tuple, 0.1)
+      opt_clip = clip_weights.clip_discriminator_weights(opt, self.tuple, 0.1)
 
     train_op1 = opt.minimize(loss, var_list=self.variables)
     train_op2 = opt_clip.minimize(loss, var_list=self.variables)
@@ -72,10 +73,14 @@ class ClipWeightsTest(test.TestCase):
         clip_weights.clip_discriminator_weights(opt, self.tuple, weight_clip=-1)
     else:
       with self.assertRaisesRegexp(ValueError, 'must be positive'):
-        clip_weights.clip_weights(opt, self.variables, weight_clip=-1)
+        clip_weights.clip_variables(opt, self.variables, weight_clip=-1)
 
   def test_incorrect_weight_clip_value_argsonly(self):
     self._test_incorrect_weight_clip_value_helper(False)
 
   def test_incorrect_weight_clip_value_tuple(self):
     self._test_incorrect_weight_clip_value_helper(True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/gan/python/features/python/tensor_pool.py b/tensorflow/contrib/gan/python/features/python/random_tensor_pool.py
similarity index 86%
rename from tensorflow/contrib/gan/python/features/python/tensor_pool.py
rename to tensorflow/contrib/gan/python/features/python/random_tensor_pool.py
index 0bd2fa3db9427315ed623bc4d47d74683777bb94..ca904971fa8cb0440d3e0c9060f13cc214c9eaad 100644
--- a/tensorflow/contrib/gan/python/features/python/tensor_pool.py
+++ b/tensorflow/contrib/gan/python/features/python/random_tensor_pool.py
@@ -25,11 +25,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.gan.python.features.python import tensor_pool_impl
+from tensorflow.contrib.gan.python.features.python import random_tensor_pool_impl
 # pylint: disable=wildcard-import
-from tensorflow.contrib.gan.python.features.python.tensor_pool_impl import *
+from tensorflow.contrib.gan.python.features.python.random_tensor_pool_impl import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util.all_util import remove_undocumented
 
-__all__ = tensor_pool_impl.__all__
+__all__ = random_tensor_pool_impl.__all__
 remove_undocumented(__name__, __all__)
diff --git a/tensorflow/contrib/gan/python/features/python/tensor_pool_impl.py b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py
similarity index 67%
rename from tensorflow/contrib/gan/python/features/python/tensor_pool_impl.py
rename to tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py
index 79318a69d291f11b7978e898423f1dd3e757466f..4cfae0de4451880cf8229903b0eb74b1c6e2e04d 100644
--- a/tensorflow/contrib/gan/python/features/python/tensor_pool_impl.py
+++ b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py
@@ -42,8 +42,14 @@ __all__ = [
 ]
 
 
-def tensor_pool(input_value,
-                pool_size,
+def _to_tuple(x):
+  if isinstance(x, (list, tuple)):
+    return tuple(x)
+  return (x,)
+
+
+def tensor_pool(input_values,
+                pool_size=50,
                 pooling_probability=0.5,
                 name='tensor_pool'):
   """Queue storing input values and returning random previously stored ones.
@@ -57,15 +63,18 @@ def tensor_pool(input_value,
   `pool_size` = 0 or `pooling_probability` = 0.
 
   Args:
-    input_value: A `Tensor` from which to read values to be pooled.
-    pool_size: An integer specifying the maximum size of the pool.
+    input_values: A `Tensor`, or a list or tuple of `Tensor`s from which to read
+      values to be pooled.
+    pool_size: An integer specifying the maximum size of the pool. Defaults to
+      50.
     pooling_probability: A float `Tensor` specifying the probability of getting
       a value from the pool, as opposed to just the current input.
     name: A string prefix for the name scope for all tensorflow ops.
 
   Returns:
-    A `Tensor` which is with given probability either the `input_value` or a
-    randomly chosen sample that was previously inserted in the pool.
+    A `Tensor`, or a list or tuple of `Tensor`s (according to the type ofx
+    `input_values`) which is with given probability either the `input_values` or
+    a randomly chosen sample that was previously inserted in the pool.
 
   Raises:
     ValueError: If `pool_size` is negative.
@@ -74,45 +83,57 @@ def tensor_pool(input_value,
   if pool_size < 0:
     raise ValueError('`pool_size` is negative.')
   elif pool_size == 0:
-    return input_value
+    return input_values
 
-  with ops.name_scope('{}_pool_queue'.format(name),
-                      values=[input_value, pooling_probability]):
+  original_input_values = input_values
+  input_values = _to_tuple(input_values)
+
+  with ops.name_scope(
+      '{}_pool_queue'.format(name),
+      values=input_values + (pooling_probability,)):
     pool_queue = data_flow_ops.RandomShuffleQueue(
         capacity=pool_size,
         min_after_dequeue=0,
-        dtypes=[input_value.dtype],
+        dtypes=[v.dtype for v in input_values],
         shapes=None)
 
     # In pseudeo code this code does the following:
     # if not pool_full:
-    #   enqueue(input_value)
-    #   return input_value
+    #   enqueue(input_values)
+    #   return input_values
     # else
-    #   dequeue_value = dequeue_random_sample()
-    #   enqueue(input_value)
+    #   dequeue_values = dequeue_random_sample()
+    #   enqueue(input_values)
     #   if rand() < pooling_probability:
-    #     return dequeue_value
+    #     return dequeue_values
     #   else
-    #     return input_value
+    #     return input_values
 
     def _get_input_value_pooled():
-      enqueue_op = pool_queue.enqueue(input_value)
+      enqueue_op = pool_queue.enqueue(input_values)
       with ops.control_dependencies([enqueue_op]):
-        return array_ops.identity(input_value)
+        return tuple(array_ops.identity(v) for v in input_values)
 
     def _get_random_pool_value_and_enqueue_input():
-      dequeue_value = pool_queue.dequeue()
-      with ops.control_dependencies([dequeue_value]):
-        enqueue_op = pool_queue.enqueue(input_value)
+      dequeue_values = _to_tuple(pool_queue.dequeue())
+      with ops.control_dependencies(dequeue_values):
+        enqueue_op = pool_queue.enqueue(input_values)
         with ops.control_dependencies([enqueue_op]):
           prob = random_ops.random_uniform(
               (), dtype=dtypes.float32) < pooling_probability
-          return control_flow_ops.cond(prob, lambda: dequeue_value,
-                                       lambda: input_value)
+          return control_flow_ops.cond(prob, lambda: dequeue_values,
+                                       lambda: input_values)
 
-    output_value = control_flow_ops.cond(
+    output_values = _to_tuple(control_flow_ops.cond(
         pool_queue.size() < pool_size, _get_input_value_pooled,
-        _get_random_pool_value_and_enqueue_input)
+        _get_random_pool_value_and_enqueue_input))
+
+    # Make sure that the shape of `output_value` is set.
+    for input_value, output_value in zip(input_values, output_values):
+      output_value.set_shape(input_value.shape)
 
-  return output_value
+  if isinstance(original_input_values, list):
+    return list(output_values)
+  elif isinstance(original_input_values, tuple):
+    return output_values
+  return output_values[0]
diff --git a/tensorflow/contrib/gan/python/features/python/tensor_pool_test.py b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py
similarity index 70%
rename from tensorflow/contrib/gan/python/features/python/tensor_pool_test.py
rename to tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py
index 49b77bb3fc56b91cd419f76b6eea920df7efe4a7..d8cf549cf71838178c9da01df462d41d81595fe5 100644
--- a/tensorflow/contrib/gan/python/features/python/tensor_pool_test.py
+++ b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for tf.contrib.gan.python.features.tensor_pool."""
+"""Tests for tf.contrib.gan.python.features.random_tensor_pool."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.gan.python.features.python import tensor_pool_impl as tensor_pool
+from tensorflow.contrib.gan.python.features.python.random_tensor_pool_impl import tensor_pool
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
@@ -32,7 +32,8 @@ class TensorPoolTest(test.TestCase):
     """Checks that `input_value` can have unknown shape."""
     input_value = array_ops.placeholder(
         dtype=dtypes.int32, shape=[None, None, 3])
-    output_value = tensor_pool.tensor_pool(input_value, pool_size=10)
+    output_value = tensor_pool(input_value, pool_size=10)
+    self.assertEqual(output_value.shape.as_list(), [None, None, 3])
 
     with self.test_session(use_gpu=True) as session:
       for i in range(10):
@@ -43,7 +44,8 @@ class TensorPoolTest(test.TestCase):
   def test_pool_sequence(self):
     """Checks that values are pooled and returned maximally twice."""
     input_value = array_ops.placeholder(dtype=dtypes.int32, shape=[])
-    output_value = tensor_pool.tensor_pool(input_value, pool_size=10)
+    output_value = tensor_pool(input_value, pool_size=10)
+    self.assertEqual(output_value.shape.as_list(), [])
 
     with self.test_session(use_gpu=True) as session:
       outs = []
@@ -59,8 +61,9 @@ class TensorPoolTest(test.TestCase):
   def test_never_pool(self):
     """Checks that setting `pooling_probability` to zero works."""
     input_value = array_ops.placeholder(dtype=dtypes.int32, shape=[])
-    output_value = tensor_pool.tensor_pool(
+    output_value = tensor_pool(
         input_value, pool_size=10, pooling_probability=0.0)
+    self.assertEqual(output_value.shape.as_list(), [])
 
     with self.test_session(use_gpu=True) as session:
       for i in range(50):
@@ -72,10 +75,11 @@ class TensorPoolTest(test.TestCase):
     input_value = array_ops.placeholder(dtype=dtypes.int32, shape=[])
     pool_size = 10
     pooling_probability = 0.2
-    output_value = tensor_pool.tensor_pool(
+    output_value = tensor_pool(
         input_value,
         pool_size=pool_size,
         pooling_probability=pooling_probability)
+    self.assertEqual(output_value.shape.as_list(), [])
 
     with self.test_session(use_gpu=True) as session:
       not_pooled = 0
@@ -89,6 +93,24 @@ class TensorPoolTest(test.TestCase):
           1 - pooling_probability,
           atol=0.03)
 
+  def test_input_values_tuple(self):
+    """Checks that `input_values` can be a tuple."""
+    input_values = (array_ops.placeholder(dtype=dtypes.int32, shape=[]),
+                    array_ops.placeholder(dtype=dtypes.int32, shape=[]))
+    output_values = tensor_pool(input_values, pool_size=3)
+    self.assertEqual(len(output_values), len(input_values))
+    for output_value in output_values:
+      self.assertEqual(output_value.shape.as_list(), [])
+
+    with self.test_session(use_gpu=True) as session:
+      for i in range(10):
+        outs = session.run(output_values, {
+            input_values[0]: i,
+            input_values[1]: i + 1
+        })
+        self.assertEqual(len(outs), len(input_values))
+        self.assertEqual(outs[1] - outs[0], 1)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/gan/python/losses/__init__.py b/tensorflow/contrib/gan/python/losses/__init__.py
index 290ff867a1e443f20a63e27fd97f53fed8a6cc11..d9bf8ebfdf65dfc76e4569dcaf26e0e51c7fc107 100644
--- a/tensorflow/contrib/gan/python/losses/__init__.py
+++ b/tensorflow/contrib/gan/python/losses/__init__.py
@@ -12,7 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN grouped API. Please see README.md for details and usage."""
+"""TFGAN losses and penalties.
+
+Losses can be used with individual arguments or with GANModel tuples.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl.py b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
index 940762cf2aa0f473cd41d9d543e2773b565a5248..39588b7219ebac1cc4855532be3fcc38e6381134 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
@@ -67,6 +67,7 @@ __all__ = [
     'wasserstein_gradient_penalty',
     'mutual_information_penalty',
     'combine_adversarial_loss',
+    'cycle_consistency_loss',
 ]
 
 
@@ -304,6 +305,7 @@ def wasserstein_gradient_penalty(
     discriminator_fn,
     discriminator_scope,
     epsilon=1e-10,
+    target=1.0,
     weights=1.0,
     scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -323,6 +325,8 @@ def wasserstein_gradient_penalty(
     discriminator_scope: If not `None`, reuse discriminators from this scope.
     epsilon: A small positive number added for numerical stability when
       computing the gradient norm.
+    target: Optional Python number or `Tensor` indicating the target value of
+      gradient norm. Defaults to 1.0.
     weights: Optional `Tensor` whose rank is either 0, or the same rank as
       `real_data` and `generated_data`, and must be broadcastable to
       them (i.e., all dimensions must be either `1`, or the same as the
@@ -373,7 +377,7 @@ def wasserstein_gradient_penalty(
     # For numerical stability, add epsilon to the sum before taking the square
     # root. Note tf.norm does not add epsilon.
     slopes = math_ops.sqrt(gradient_squares + epsilon)
-    penalties = math_ops.square(slopes - 1.0)
+    penalties = math_ops.square(slopes / target - 1.0)
     penalty = losses.compute_weighted_loss(
         penalties, weights, scope=scope, loss_collection=loss_collection,
         reduction=reduction)
@@ -915,3 +919,63 @@ def combine_adversarial_loss(main_loss,
                     array_ops.stop_gradient(adv_coeff) * adversarial_loss)
 
   return final_loss
+
+
+def cycle_consistency_loss(data_x,
+                           reconstructed_data_x,
+                           data_y,
+                           reconstructed_data_y,
+                           scope=None,
+                           add_summaries=False):
+  """Defines the cycle consistency loss.
+
+  The cyclegan model has two partial models where `model_x2y` generator F maps
+  data set X to Y, `model_y2x` generator G maps data set Y to X. For a `data_x`
+  in data set X, we could reconstruct it by
+  * reconstructed_data_x = G(F(data_x))
+  Similarly
+  * reconstructed_data_y = F(G(data_y))
+
+  The cycle consistency loss is about the difference between data and
+  reconstructed data, namely
+  * loss_x2x = |data_x - G(F(data_x))| (L1-norm)
+  * loss_y2y = |data_y - F(G(data_y))| (L1-norm)
+  * loss = (loss_x2x + loss_y2y) / 2
+  where `loss` is the final result.
+
+  See https://arxiv.org/abs/1703.10593 for more details.
+
+  Args:
+    data_x: A `Tensor` of data X.
+    reconstructed_data_x: A `Tensor` of reconstructed data X.
+    data_y: A `Tensor` of data Y.
+    reconstructed_data_y: A `Tensor` of reconstructed data Y.
+    scope: The scope for the operations performed in computing the loss.
+      Defaults to None.
+    add_summaries: Whether or not to add detailed summaries for the loss.
+      Defaults to False.
+
+  Returns:
+    A scalar `Tensor` of cycle consistency loss.
+  """
+
+  def _partial_cycle_consistency_loss(data, reconstructed_data):
+    # Following the original implementation
+    # https://github.com/junyanz/CycleGAN/blob/master/models/cycle_gan_model.lua
+    # use L1-norm of pixel-wise error normalized by data size so that
+    # `cycle_loss_weight` can be specified independent of image size.
+    return math_ops.reduce_mean(math_ops.abs(data - reconstructed_data))
+
+  with ops.name_scope(
+      scope,
+      'cycle_consistency_loss',
+      values=[data_x, reconstructed_data_x, data_y, reconstructed_data_y]):
+    loss_x2x = _partial_cycle_consistency_loss(data_x, reconstructed_data_x)
+    loss_y2y = _partial_cycle_consistency_loss(data_y, reconstructed_data_y)
+    loss = (loss_x2x + loss_y2y) / 2.0
+    if add_summaries:
+      summary.scalar('cycle_consistency_loss_x2x', loss_x2x)
+      summary.scalar('cycle_consistency_loss_y2y', loss_y2y)
+      summary.scalar('cycle_consistency_loss', loss)
+
+  return loss
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
index b5cd8c92ba180e981e0faf877021cb6d69dc34b4..dbaa624ae9d6a5a5949db692e52c0c1deb18b8df 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
@@ -481,6 +481,29 @@ class GradientPenaltyTest(test.TestCase, _PenaltyTest):
                       })
       self.assertAlmostEqual(self._expected_loss, loss, 5)
 
+  def test_loss_with_gradient_norm_target(self):
+    """Test loss value with non default gradient norm target."""
+    generated_data = array_ops.placeholder(dtypes.float32, shape=(None, None))
+    real_data = array_ops.placeholder(dtypes.float32, shape=(None, None))
+
+    loss = tfgan_losses.wasserstein_gradient_penalty(
+        generated_data,
+        real_data,
+        self._kwargs['generator_inputs'],
+        self._kwargs['discriminator_fn'],
+        self._kwargs['discriminator_scope'],
+        target=2.0)
+
+    with self.test_session() as sess:
+      variables.global_variables_initializer().run()
+      loss = sess.run(
+          loss,
+          feed_dict={
+              generated_data: self._generated_data_np,
+              real_data: self._real_data_np,
+          })
+      self.assertAlmostEqual(1.0, loss, 5)
+
   def test_reuses_scope(self):
     """Test that gradient penalty reuses discriminator scope."""
     num_vars = len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
@@ -620,7 +643,34 @@ class CombineAdversarialLossTest(test.TestCase):
     with self.test_session(use_gpu=True) as sess:
       for _ in range(10):  # spot check closeness on more than one sample.
         gnorm_np, precond_gnorm_np = sess.run([gnorm, precond_gnorm])
-        self.assertNear(gnorm_np, precond_gnorm_np, 1e-5)
+        self.assertNear(gnorm_np, precond_gnorm_np, 1e-4)
+
+
+class CycleConsistencyLossTest(test.TestCase):
+  """Tests for cycle_consistency_loss."""
+
+  def setUp(self):
+    super(CycleConsistencyLossTest, self).setUp()
+
+    self._data_x_np = [[1.0, 2, 3], [4, 5, 6]]
+    self._reconstructed_data_x_np = [[7.0, 8, 9], [10, 11, 12]]
+    self._data_y_np = [1.0, 9]
+    self._reconstructed_data_y_np = [-2.0, 3]
+
+    self._data_x = constant_op.constant(self._data_x_np, dtype=dtypes.float32)
+    self._reconstructed_data_x = constant_op.constant(
+        self._reconstructed_data_x_np, dtype=dtypes.float32)
+    self._data_y = constant_op.constant(self._data_y_np, dtype=dtypes.float32)
+    self._reconstructed_data_y = constant_op.constant(
+        self._reconstructed_data_y_np, dtype=dtypes.float32)
+
+  def test_correct_loss(self):
+    loss = tfgan_losses.cycle_consistency_loss(
+        self._data_x, self._reconstructed_data_x, self._data_y,
+        self._reconstructed_data_y)
+    with self.test_session(use_gpu=True):
+      variables.global_variables_initializer().run()
+      self.assertNear(5.25, loss.eval(), 1e-5)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py b/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py
index b341f03a0ddaacca8b036189516c71908bee50eb..dcc3f94c2d6b9e5e44036e7cc1a9d1bb39104fb5 100644
--- a/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py
+++ b/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py
@@ -60,6 +60,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.gan.python import namedtuples
 from tensorflow.contrib.gan.python.losses.python import losses_impl
 from tensorflow.python.util import tf_inspect
 
@@ -78,6 +79,7 @@ __all__ = [
     'wasserstein_gradient_penalty',
     'mutual_information_penalty',
     'combine_adversarial_loss',
+    'cycle_consistency_loss',
 ]
 
 
@@ -246,3 +248,32 @@ def combine_adversarial_loss(gan_loss,
       scalar_summaries,
       gradient_summaries)
   return gan_loss._replace(generator_loss=combined_loss)
+
+
+def cycle_consistency_loss(cyclegan_model, scope=None, add_summaries=False):
+  """Defines the cycle consistency loss.
+
+  Uses `cycle_consistency_loss` to compute the cycle consistency loss for a
+  `cyclegan_model`.
+
+  Args:
+    cyclegan_model: A `CycleGANModel` namedtuple.
+    scope: The scope for the operations performed in computing the loss.
+      Defaults to None.
+    add_summaries: Whether or not to add detailed summaries for the loss.
+      Defaults to False.
+
+  Returns:
+    A scalar `Tensor` of cycle consistency loss.
+
+  Raises:
+    ValueError: If `cyclegan_model` is not a `CycleGANModel` namedtuple.
+  """
+  if not isinstance(cyclegan_model, namedtuples.CycleGANModel):
+    raise ValueError(
+        '`cyclegan_model` must be a `CycleGANModel`. Instead, was %s.' %
+        type(cyclegan_model))
+  return losses_impl.cycle_consistency_loss(
+      cyclegan_model.model_x2y.generator_inputs, cyclegan_model.reconstructed_x,
+      cyclegan_model.model_y2x.generator_inputs, cyclegan_model.reconstructed_y,
+      scope, add_summaries)
diff --git a/tensorflow/contrib/gan/python/losses/python/tuple_losses_test.py b/tensorflow/contrib/gan/python/losses/python/tuple_losses_test.py
index 215b15ef6915d0b8113def35987ed6ab85617bcc..aa1ef11172dee6799994b87f70a3883cd67fd15b 100644
--- a/tensorflow/contrib/gan/python/losses/python/tuple_losses_test.py
+++ b/tensorflow/contrib/gan/python/losses/python/tuple_losses_test.py
@@ -22,8 +22,11 @@ import collections
 
 import numpy as np
 
+from tensorflow.contrib.gan.python import namedtuples
 from tensorflow.contrib.gan.python.losses.python import tuple_losses_impl as tfgan_losses
-
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -125,6 +128,7 @@ manual_tests = [
     'combine_adversarial_loss',
     'mutual_information_penalty',
     'wasserstein_gradient_penalty',
+    'cycle_consistency_loss',
 ]
 
 discriminator_keyword_args = {
@@ -139,6 +143,38 @@ generator_keyword_args = {
 }
 
 
+class CycleConsistencyLossTest(test.TestCase):
+
+  def setUp(self):
+    super(CycleConsistencyLossTest, self).setUp()
+
+    def _partial_model(generator_inputs_np):
+      model = namedtuples.GANModel(*[None] * 11)
+      return model._replace(
+          generator_inputs=constant_op.constant(
+              generator_inputs_np, dtype=dtypes.float32))
+
+    self._model_x2y = _partial_model([1, 2])
+    self._model_y2x = _partial_model([5, 6])
+
+  def test_model_type(self):
+    """Test the input model type for `cycle_consistency_loss`."""
+    with self.assertRaises(ValueError):
+      tfgan_losses.cycle_consistency_loss(self._model_x2y)
+
+  def test_correct_loss(self):
+    """Test the output of `cycle_consistency_loss`."""
+    loss = tfgan_losses.cycle_consistency_loss(
+        namedtuples.CycleGANModel(
+            model_x2y=self._model_x2y,
+            model_y2x=self._model_y2x,
+            reconstructed_x=constant_op.constant([9, 8], dtype=dtypes.float32),
+            reconstructed_y=constant_op.constant([7, 2], dtype=dtypes.float32)))
+    with self.test_session(use_gpu=True):
+      variables.global_variables_initializer().run()
+      self.assertNear(5.0, loss.eval(), 1e-5)
+
+
 if __name__ == '__main__':
   for loss_name in tfgan_losses.__all__:
     if loss_name in manual_tests: continue
diff --git a/tensorflow/contrib/gan/python/namedtuples.py b/tensorflow/contrib/gan/python/namedtuples.py
index 48f5e8e47dbcd5d32c23806b967a0d1e7403d2f7..25cfeafeec9000b0dc3849ebe646e59c1b4d1cc3 100644
--- a/tensorflow/contrib/gan/python/namedtuples.py
+++ b/tensorflow/contrib/gan/python/namedtuples.py
@@ -30,7 +30,9 @@ __all__ = [
     'GANModel',
     'InfoGANModel',
     'ACGANModel',
+    'CycleGANModel',
     'GANLoss',
+    'CycleGANLoss',
     'GANTrainOps',
     'GANTrainSteps',
 ]
@@ -79,6 +81,7 @@ class InfoGANModel(
     collections.namedtuple('InfoGANModel', GANModel._fields + (
         'structured_generator_inputs',
         'predicted_distributions',
+        'discriminator_and_aux_fn',
     ))):
   """An InfoGANModel contains all the pieces needed for InfoGAN training.
 
@@ -91,6 +94,8 @@ class InfoGANModel(
     predicted_distributions: A list of tf.Distributions. Predicted by the
       recognizer, and used to evaluate the likelihood of the structured noise.
       List length should match `structured_generator_inputs`.
+    discriminator_and_aux_fn: The original discriminator function that returns
+      a tuple of (logits, `predicted_distributions`).
   """
 
 
@@ -112,6 +117,25 @@ class ACGANModel(
   """
 
 
+class CycleGANModel(
+    collections.namedtuple(
+        'CycleGANModel',
+        ('model_x2y', 'model_y2x', 'reconstructed_x', 'reconstructed_y'))):
+  """An CycleGANModel contains all the pieces needed for CycleGAN training.
+
+  The model `model_x2y` generator F maps data set X to Y, while the model
+  `model_y2x` generator G maps data set Y to X.
+
+  See https://arxiv.org/abs/1703.10593 for more details.
+
+  Args:
+    model_x2y: A `GANModel` namedtuple whose generator maps data set X to Y.
+    model_y2x: A `GANModel` namedtuple whose generator maps data set Y to X.
+    reconstructed_x: A `Tensor` of reconstructed data X which is G(F(X)).
+    reconstructed_y: A `Tensor` of reconstructed data Y which is F(G(Y)).
+  """
+
+
 class GANLoss(
     collections.namedtuple('GANLoss', (
         'generator_loss',
@@ -125,6 +149,18 @@ class GANLoss(
   """
 
 
+class CycleGANLoss(
+    collections.namedtuple('CycleGANLoss', ('loss_x2y', 'loss_y2x'))):
+  """CycleGANLoss contains the losses for `CycleGANModel`.
+
+  See https://arxiv.org/abs/1703.10593 for more details.
+
+  Args:
+    loss_x2y: A `GANLoss` namedtuple representing the loss of `model_x2y`.
+    loss_y2x: A `GANLoss` namedtuple representing the loss of `model_y2x`.
+  """
+
+
 class GANTrainOps(
     collections.namedtuple('GANTrainOps', (
         'generator_train_op',
diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py
index e9443f766bdc59cf45513c93e14390cd6126c295..776eb11ecb1624544d24611d8fe6ca19768b8313 100644
--- a/tensorflow/contrib/gan/python/train.py
+++ b/tensorflow/contrib/gan/python/train.py
@@ -52,7 +52,9 @@ __all__ = [
     'gan_model',
     'infogan_model',
     'acgan_model',
+    'cyclegan_model',
     'gan_loss',
+    'cyclegan_loss',
     'gan_train_ops',
     'gan_train',
     'get_sequential_train_hooks',
@@ -215,7 +217,8 @@ def infogan_model(
       disc_scope,
       lambda x, y: discriminator_fn(x, y)[0],  # conform to non-InfoGAN API
       structured_generator_inputs,
-      predicted_distributions)
+      predicted_distributions,
+      discriminator_fn)
 
 
 def acgan_model(
@@ -276,14 +279,16 @@ def acgan_model(
     generator_inputs = _convert_tensor_or_l_or_d(generator_inputs)
     generated_data = generator_fn(generator_inputs)
   with variable_scope.variable_scope(discriminator_scope) as dis_scope:
-    (discriminator_gen_outputs, discriminator_gen_classification_logits
-    ) = _validate_acgan_discriminator_outputs(
-        discriminator_fn(generated_data, generator_inputs))
+    with ops.name_scope(dis_scope.name+'/generated/'):
+      (discriminator_gen_outputs, discriminator_gen_classification_logits
+      ) = _validate_acgan_discriminator_outputs(
+          discriminator_fn(generated_data, generator_inputs))
   with variable_scope.variable_scope(dis_scope, reuse=True):
-    real_data = ops.convert_to_tensor(real_data)
-    (discriminator_real_outputs, discriminator_real_classification_logits
-    ) = _validate_acgan_discriminator_outputs(
-        discriminator_fn(real_data, generator_inputs))
+    with ops.name_scope(dis_scope.name+'/real/'):
+      real_data = ops.convert_to_tensor(real_data)
+      (discriminator_real_outputs, discriminator_real_classification_logits
+      ) = _validate_acgan_discriminator_outputs(
+          discriminator_fn(real_data, generator_inputs))
   if check_shapes:
     if not generated_data.shape.is_compatible_with(real_data.shape):
       raise ValueError(
@@ -304,6 +309,76 @@ def acgan_model(
       discriminator_gen_classification_logits)
 
 
+def cyclegan_model(
+    # Lambdas defining models.
+    generator_fn,
+    discriminator_fn,
+    # data X and Y.
+    data_x,
+    data_y,
+    # Optional scopes.
+    generator_scope='Generator',
+    discriminator_scope='Discriminator',
+    model_x2y_scope='ModelX2Y',
+    model_y2x_scope='ModelY2X',
+    # Options.
+    check_shapes=True):
+  """Returns a CycleGAN model outputs and variables.
+
+  See https://arxiv.org/abs/1703.10593 for more details.
+
+  Args:
+    generator_fn: A python lambda that takes `data_x` or `data_y` as inputs and
+      returns the outputs of the GAN generator.
+    discriminator_fn: A python lambda that takes `real_data`/`generated data`
+      and `generator_inputs`. Outputs a Tensor in the range [-inf, inf].
+    data_x: A `Tensor` of dataset X. Must be the same shape as `data_y`.
+    data_y: A `Tensor` of dataset Y. Must be the same shape as `data_x`.
+    generator_scope: Optional generator variable scope. Useful if you want to
+      reuse a subgraph that has already been created. Defaults to 'Generator'.
+    discriminator_scope: Optional discriminator variable scope. Useful if you
+      want to reuse a subgraph that has already been created. Defaults to
+      'Discriminator'.
+    model_x2y_scope: Optional variable scope for model x2y variables. Defaults
+      to 'ModelX2Y'.
+    model_y2x_scope: Optional variable scope for model y2x variables. Defaults
+      to 'ModelY2X'.
+    check_shapes: If `True`, check that generator produces Tensors that are the
+      same shape as `data_x` (`data_y`). Otherwise, skip this check.
+
+  Returns:
+    A `CycleGANModel` namedtuple.
+
+  Raises:
+    ValueError: If `check_shapes` is True and `data_x` or the generator output
+      does not have the same shape as `data_y`.
+  """
+
+  # Create models.
+  def _define_partial_model(input_data, output_data):
+    return gan_model(
+        generator_fn=generator_fn,
+        discriminator_fn=discriminator_fn,
+        real_data=output_data,
+        generator_inputs=input_data,
+        generator_scope=generator_scope,
+        discriminator_scope=discriminator_scope,
+        check_shapes=check_shapes)
+
+  with variable_scope.variable_scope(model_x2y_scope):
+    model_x2y = _define_partial_model(data_x, data_y)
+  with variable_scope.variable_scope(model_y2x_scope):
+    model_y2x = _define_partial_model(data_y, data_x)
+
+  with variable_scope.variable_scope(model_y2x.generator_scope, reuse=True):
+    reconstructed_x = model_y2x.generator_fn(model_x2y.generated_data)
+  with variable_scope.variable_scope(model_x2y.generator_scope, reuse=True):
+    reconstructed_y = model_x2y.generator_fn(model_y2x.generated_data)
+
+  return namedtuples.CycleGANModel(model_x2y, model_y2x, reconstructed_x,
+                                   reconstructed_y)
+
+
 def _validate_aux_loss_weight(aux_loss_weight, name='aux_loss_weight'):
   if isinstance(aux_loss_weight, ops.Tensor):
     aux_loss_weight.shape.assert_is_compatible_with([])
@@ -326,6 +401,56 @@ def _use_aux_loss(aux_loss_weight):
     return False
 
 
+def _tensor_pool_adjusted_model(model, tensor_pool_fn):
+  """Adjusts model using `tensor_pool_fn`.
+
+  Args:
+    model: A GANModel tuple.
+    tensor_pool_fn: A function that takes (generated_data, generator_inputs),
+      stores them in an internal pool and returns a previously stored
+      (generated_data, generator_inputs) with some probability. For example
+      tfgan.features.tensor_pool.
+
+  Returns:
+    A new GANModel tuple where discriminator outputs are adjusted by taking
+    pooled generator outputs as inputs. Returns the original model if
+    `tensor_pool_fn` is None.
+
+  Raises:
+    ValueError: If tensor pool does not support the `model`.
+  """
+  if tensor_pool_fn is None:
+    return model
+
+  pooled_generated_data, pooled_generator_inputs = tensor_pool_fn(
+      (model.generated_data, model.generator_inputs))
+
+  if isinstance(model, namedtuples.GANModel):
+    with variable_scope.variable_scope(model.discriminator_scope, reuse=True):
+      dis_gen_outputs = model.discriminator_fn(pooled_generated_data,
+                                               pooled_generator_inputs)
+    return model._replace(discriminator_gen_outputs=dis_gen_outputs)
+  elif isinstance(model, namedtuples.ACGANModel):
+    with variable_scope.variable_scope(model.discriminator_scope, reuse=True):
+      (dis_pooled_gen_outputs,
+       dis_pooled_gen_classification_logits) = model.discriminator_fn(
+           pooled_generated_data, pooled_generator_inputs)
+    return model._replace(
+        discriminator_gen_outputs=dis_pooled_gen_outputs,
+        discriminator_gen_classification_logits=
+        dis_pooled_gen_classification_logits)
+  elif isinstance(model, namedtuples.InfoGANModel):
+    with variable_scope.variable_scope(model.discriminator_scope, reuse=True):
+      (dis_pooled_gen_outputs,
+       pooled_predicted_distributions) = model.discriminator_and_aux_fn(
+           pooled_generated_data, pooled_generator_inputs)
+    return model._replace(
+        discriminator_gen_outputs=dis_pooled_gen_outputs,
+        predicted_distributions=pooled_predicted_distributions)
+  else:
+    raise ValueError('Tensor pool does not support `model`: %s.' % type(model))
+
+
 def gan_loss(
     # GANModel.
     model,
@@ -335,9 +460,11 @@ def gan_loss(
     # Auxiliary losses.
     gradient_penalty_weight=None,
     gradient_penalty_epsilon=1e-10,
+    gradient_penalty_target=1.0,
     mutual_information_penalty_weight=None,
     aux_cond_generator_weight=None,
     aux_cond_discriminator_weight=None,
+    tensor_pool_fn=None,
     # Options.
     add_summaries=True):
   """Returns losses necessary to train generator and discriminator.
@@ -355,6 +482,9 @@ def gan_loss(
       small positive value used by the gradient penalty function for numerical
       stability. Note some applications will need to increase this value to
       avoid NaNs.
+    gradient_penalty_target: If `gradient_penalty_weight` is not None, a Python
+      number or `Tensor` indicating the target value of gradient norm. See the
+      CIFAR10 section of https://arxiv.org/abs/1710.10196. Defaults to 1.0.
     mutual_information_penalty_weight: If not `None`, must be a non-negative
       Python number or Tensor indicating how much to weight the mutual
       information penalty. See https://arxiv.org/abs/1606.03657 for more
@@ -363,6 +493,10 @@ def gan_loss(
       https://arxiv.org/abs/1610.09585
     aux_cond_discriminator_weight: If not None: add a classification loss as in
       https://arxiv.org/abs/1610.09585
+    tensor_pool_fn: A function that takes (generated_data, generator_inputs),
+      stores them in an internal pool and returns previous stored
+      (generated_data, generator_inputs). For example
+      `tf.gan.features.tensor_pool`. Defaults to None (not using tensor pool).
     add_summaries: Whether or not to add summaries for the losses.
 
   Returns:
@@ -402,12 +536,17 @@ def gan_loss(
 
   # Create standard losses.
   gen_loss = generator_loss_fn(model, add_summaries=add_summaries)
-  dis_loss = discriminator_loss_fn(model, add_summaries=add_summaries)
+  dis_loss = discriminator_loss_fn(
+      _tensor_pool_adjusted_model(model, tensor_pool_fn),
+      add_summaries=add_summaries)
 
   # Add optional extra losses.
   if _use_aux_loss(gradient_penalty_weight):
     gp_loss = tfgan_losses.wasserstein_gradient_penalty(
-        model, epsilon=gradient_penalty_epsilon, add_summaries=add_summaries)
+        model,
+        epsilon=gradient_penalty_epsilon,
+        target=gradient_penalty_target,
+        add_summaries=add_summaries)
     dis_loss += gradient_penalty_weight * gp_loss
   if _use_aux_loss(mutual_information_penalty_weight):
     info_loss = tfgan_losses.mutual_information_penalty(
@@ -436,6 +575,69 @@ def gan_loss(
   return namedtuples.GANLoss(gen_loss + gen_reg_loss, dis_loss + dis_reg_loss)
 
 
+def cyclegan_loss(
+    model,
+    # Loss functions.
+    generator_loss_fn=tfgan_losses.least_squares_generator_loss,
+    discriminator_loss_fn=tfgan_losses.least_squares_discriminator_loss,
+    # Auxiliary losses.
+    cycle_consistency_loss_fn=tfgan_losses.cycle_consistency_loss,
+    cycle_consistency_loss_weight=10.0,
+    # Options
+    **kwargs):
+  """Returns the losses for a `CycleGANModel`.
+
+  See https://arxiv.org/abs/1703.10593 for more details.
+
+  Args:
+    model: A `CycleGANModel` namedtuple.
+    generator_loss_fn: The loss function on the generator. Takes a `GANModel`
+      named tuple.
+    discriminator_loss_fn: The loss function on the discriminator. Takes a
+      `GANModel` namedtuple.
+    cycle_consistency_loss_fn: The cycle consistency loss function. Takes a
+      `CycleGANModel` namedtuple.
+    cycle_consistency_loss_weight: A non-negative Python number or a scalar
+      `Tensor` indicating how much to weigh the cycle consistency loss.
+    **kwargs: Keyword args to pass directly to `gan_loss` to construct the loss
+      for each partial model of `model`.
+
+  Returns:
+    A `CycleGANLoss` namedtuple.
+
+  Raises:
+    ValueError: If `model` is not a `CycleGANModel` namedtuple.
+  """
+  # Sanity checks.
+  if not isinstance(model, namedtuples.CycleGANModel):
+    raise ValueError(
+        '`model` must be a `CycleGANModel`. Instead, was %s.' % type(model))
+
+  # Defines cycle consistency loss.
+  cycle_consistency_loss = cycle_consistency_loss_fn(
+      model, add_summaries=kwargs.get('add_summaries', True))
+  cycle_consistency_loss_weight = _validate_aux_loss_weight(
+      cycle_consistency_loss_weight, 'cycle_consistency_loss_weight')
+  aux_loss = cycle_consistency_loss_weight * cycle_consistency_loss
+
+  # Defines losses for each partial model.
+  def _partial_loss(partial_model):
+    partial_loss = gan_loss(
+        partial_model,
+        generator_loss_fn=generator_loss_fn,
+        discriminator_loss_fn=discriminator_loss_fn,
+        **kwargs)
+    return partial_loss._replace(
+        generator_loss=partial_loss.generator_loss + aux_loss)
+
+  with ops.name_scope('cyclegan_loss_x2y'):
+    loss_x2y = _partial_loss(model.model_x2y)
+  with ops.name_scope('cyclegan_loss_y2x'):
+    loss_y2x = _partial_loss(model.model_y2x)
+
+  return namedtuples.CycleGANLoss(loss_x2y, loss_y2x)
+
+
 def _get_update_ops(kwargs, gen_scope, dis_scope, check_for_unused_ops=True):
   """Gets generator and discriminator update ops.
 
@@ -503,6 +705,24 @@ def gan_train_ops(
     A GANTrainOps tuple of (generator_train_op, discriminator_train_op) that can
     be used to train a generator/discriminator pair.
   """
+  if isinstance(model, namedtuples.CycleGANModel):
+    saved_params = locals()
+    saved_params.pop('model', None)
+    saved_params.pop('loss', None)
+    kwargs = saved_params.pop('kwargs', {})
+    saved_params.update(kwargs)
+    with ops.name_scope('cyclegan_x2y_train'):
+      train_ops_x2y = gan_train_ops(model.model_x2y, loss.loss_x2y,
+                                    **saved_params)
+    with ops.name_scope('cyclegan_y2x_train'):
+      train_ops_y2x = gan_train_ops(model.model_y2x, loss.loss_y2x,
+                                    **saved_params)
+    return namedtuples.GANTrainOps(
+        (train_ops_x2y.generator_train_op, train_ops_y2x.generator_train_op),
+        (train_ops_x2y.discriminator_train_op,
+         train_ops_y2x.discriminator_train_op),
+        training_util.get_or_create_global_step().assign_add(1))
+
   # Create global step increment op.
   global_step = training_util.get_or_create_global_step()
   global_step_inc = global_step.assign_add(1)
diff --git a/tensorflow/contrib/gan/python/train_test.py b/tensorflow/contrib/gan/python/train_test.py
index 6b27b6926102b6e5a7ff134ceed75c23459a6534..f9bdaa74c948ecee11d5cfd89f06087924f8dace 100644
--- a/tensorflow/contrib/gan/python/train_test.py
+++ b/tensorflow/contrib/gan/python/train_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.contrib.framework.python.ops import variables as variables_lib
 from tensorflow.contrib.gan.python import namedtuples
 from tensorflow.contrib.gan.python import train
+from tensorflow.contrib.gan.python.features.python import random_tensor_pool
 from tensorflow.contrib.slim.python.slim import learning as slim_learning
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -145,14 +146,16 @@ def get_infogan_model():
   return namedtuples.InfoGANModel(
       *get_gan_model(),
       structured_generator_inputs=[constant_op.constant(0)],
-      predicted_distributions=[categorical.Categorical([1.0])])
+      predicted_distributions=[categorical.Categorical([1.0])],
+      discriminator_and_aux_fn=infogan_discriminator_model)
 
 
 def get_callable_infogan_model():
   return namedtuples.InfoGANModel(
       *get_callable_gan_model(),
       structured_generator_inputs=[constant_op.constant(0)],
-      predicted_distributions=[categorical.Categorical([1.0])])
+      predicted_distributions=[categorical.Categorical([1.0])],
+      discriminator_and_aux_fn=infogan_discriminator_model)
 
 
 def create_infogan_model():
@@ -207,12 +210,63 @@ def create_callable_acgan_model():
       one_hot_labels=array_ops.one_hot([0, 1, 2], 10))
 
 
+def get_cyclegan_model():
+  return namedtuples.CycleGANModel(
+      model_x2y=get_gan_model(),
+      model_y2x=get_gan_model(),
+      reconstructed_x=array_ops.ones([1, 2, 3]),
+      reconstructed_y=array_ops.zeros([1, 2, 3]))
+
+
+def get_callable_cyclegan_model():
+  return namedtuples.CycleGANModel(
+      model_x2y=get_callable_gan_model(),
+      model_y2x=get_callable_gan_model(),
+      reconstructed_x=array_ops.ones([1, 2, 3]),
+      reconstructed_y=array_ops.zeros([1, 2, 3]))
+
+
+def create_cyclegan_model():
+  return train.cyclegan_model(
+      generator_model,
+      discriminator_model,
+      data_x=array_ops.zeros([1, 2]),
+      data_y=array_ops.ones([1, 2]))
+
+
+def create_callable_cyclegan_model():
+  return train.cyclegan_model(
+      Generator(),
+      Discriminator(),
+      data_x=array_ops.zeros([1, 2]),
+      data_y=array_ops.ones([1, 2]))
+
+
 def get_sync_optimizer():
   return sync_replicas_optimizer.SyncReplicasOptimizer(
       gradient_descent.GradientDescentOptimizer(learning_rate=1.0),
       replicas_to_aggregate=1)
 
 
+def get_tensor_pool_fn(pool_size):
+
+  def tensor_pool_fn_impl(input_values):
+    return random_tensor_pool.tensor_pool(input_values, pool_size=pool_size)
+
+  return tensor_pool_fn_impl
+
+
+def get_tensor_pool_fn_for_infogan(pool_size):
+
+  def tensor_pool_fn_impl(input_values):
+    generated_data, generator_inputs = input_values
+    output_values = random_tensor_pool.tensor_pool(
+        [generated_data] + generator_inputs, pool_size=pool_size)
+    return output_values[0], output_values[1:]
+
+  return tensor_pool_fn_impl
+
+
 class GANModelTest(test.TestCase):
   """Tests for `gan_model`."""
 
@@ -239,6 +293,13 @@ class GANModelTest(test.TestCase):
     self._test_output_type_helper(
         get_callable_acgan_model, namedtuples.ACGANModel)
 
+  def test_output_type_cyclegan(self):
+    self._test_output_type_helper(get_cyclegan_model, namedtuples.CycleGANModel)
+
+  def test_output_type_callable_cyclegan(self):
+    self._test_output_type_helper(get_callable_cyclegan_model,
+                                  namedtuples.CycleGANModel)
+
   def test_no_shape_check(self):
     def dummy_generator_model(_):
       return (None, None)
@@ -286,6 +347,17 @@ class GANLossTest(test.TestCase):
   def test_output_type_callable_acgan(self):
     self._test_output_type_helper(get_callable_acgan_model)
 
+  def test_output_type_cyclegan(self):
+    loss = train.cyclegan_loss(create_cyclegan_model(), add_summaries=True)
+    self.assertIsInstance(loss, namedtuples.CycleGANLoss)
+    self.assertGreater(len(ops.get_collection(ops.GraphKeys.SUMMARIES)), 0)
+
+  def test_output_type_callable_cyclegan(self):
+    loss = train.cyclegan_loss(
+        create_callable_cyclegan_model(), add_summaries=True)
+    self.assertIsInstance(loss, namedtuples.CycleGANLoss)
+    self.assertGreater(len(ops.get_collection(ops.GraphKeys.SUMMARIES)), 0)
+
   # Test gradient penalty option.
   def _test_grad_penalty_helper(self, create_gan_model_fn):
     model = create_gan_model_fn()
@@ -409,6 +481,142 @@ class GANLossTest(test.TestCase):
   def test_callable_acgan(self):
     self._test_acgan_helper(create_callable_acgan_model)
 
+  # Test that CycleGan models work.
+  def _test_cyclegan_helper(self, create_gan_model_fn):
+    model = create_gan_model_fn()
+    loss = train.cyclegan_loss(model)
+    self.assertIsInstance(loss, namedtuples.CycleGANLoss)
+
+    # Check values.
+    with self.test_session(use_gpu=True) as sess:
+      variables.global_variables_initializer().run()
+      (loss_x2y_gen_np, loss_x2y_dis_np, loss_y2x_gen_np,
+       loss_y2x_dis_np) = sess.run([
+           loss.loss_x2y.generator_loss, loss.loss_x2y.discriminator_loss,
+           loss.loss_y2x.generator_loss, loss.loss_y2x.discriminator_loss
+       ])
+
+    self.assertGreater(loss_x2y_gen_np, loss_x2y_dis_np)
+    self.assertGreater(loss_y2x_gen_np, loss_y2x_dis_np)
+    self.assertTrue(np.isscalar(loss_x2y_gen_np))
+    self.assertTrue(np.isscalar(loss_x2y_dis_np))
+    self.assertTrue(np.isscalar(loss_y2x_gen_np))
+    self.assertTrue(np.isscalar(loss_y2x_dis_np))
+
+  def test_cyclegan(self):
+    self._test_cyclegan_helper(create_cyclegan_model)
+
+  def test_callable_cyclegan(self):
+    self._test_cyclegan_helper(create_callable_cyclegan_model)
+
+  def _check_tensor_pool_adjusted_model_outputs(self, tensor1, tensor2,
+                                                pool_size):
+    history_values = []
+    with self.test_session(use_gpu=True) as sess:
+      variables.global_variables_initializer().run()
+      for i in range(2 * pool_size):
+        t1, t2 = sess.run([tensor1, tensor2])
+        history_values.append(t1)
+        if i < pool_size:
+          # For [0, pool_size), the pool is not full, tensor1 should be equal
+          # to tensor2 as the pool.
+          self.assertAllEqual(t1, t2)
+        else:
+          # For [pool_size, ?), the pool is full, tensor2 must be equal to some
+          # historical values of tensor1 (which is previously stored in the
+          # pool).
+          self.assertTrue(any([(v == t2).all() for v in history_values]))
+
+  # Test `_tensor_pool_adjusted_model` for gan model.
+  def test_tensor_pool_adjusted_model_gan(self):
+    model = create_gan_model()
+
+    new_model = train._tensor_pool_adjusted_model(model, None)
+    # 'Generator/dummy_g:0' and 'Discriminator/dummy_d:0'
+    self.assertEqual(2, len(ops.get_collection(ops.GraphKeys.VARIABLES)))
+    self.assertIs(new_model.discriminator_gen_outputs,
+                  model.discriminator_gen_outputs)
+
+    pool_size = 5
+    new_model = train._tensor_pool_adjusted_model(
+        model, get_tensor_pool_fn(pool_size=pool_size))
+    self.assertIsNot(new_model.discriminator_gen_outputs,
+                     model.discriminator_gen_outputs)
+    # Check values.
+    self._check_tensor_pool_adjusted_model_outputs(
+        model.discriminator_gen_outputs, new_model.discriminator_gen_outputs,
+        pool_size)
+
+  # Test _tensor_pool_adjusted_model for infogan model.
+  def test_tensor_pool_adjusted_model_infogan(self):
+    model = create_infogan_model()
+
+    pool_size = 5
+    new_model = train._tensor_pool_adjusted_model(
+        model, get_tensor_pool_fn_for_infogan(pool_size=pool_size))
+    # 'Generator/dummy_g:0' and 'Discriminator/dummy_d:0'
+    self.assertEqual(2, len(ops.get_collection(ops.GraphKeys.VARIABLES)))
+    self.assertIsNot(new_model.discriminator_gen_outputs,
+                     model.discriminator_gen_outputs)
+    self.assertIsNot(new_model.predicted_distributions,
+                     model.predicted_distributions)
+    # Check values.
+    self._check_tensor_pool_adjusted_model_outputs(
+        model.discriminator_gen_outputs, new_model.discriminator_gen_outputs,
+        pool_size)
+
+  # Test _tensor_pool_adjusted_model for acgan model.
+  def test_tensor_pool_adjusted_model_acgan(self):
+    model = create_acgan_model()
+
+    pool_size = 5
+    new_model = train._tensor_pool_adjusted_model(
+        model, get_tensor_pool_fn(pool_size=pool_size))
+    # 'Generator/dummy_g:0' and 'Discriminator/dummy_d:0'
+    self.assertEqual(2, len(ops.get_collection(ops.GraphKeys.VARIABLES)))
+    self.assertIsNot(new_model.discriminator_gen_outputs,
+                     model.discriminator_gen_outputs)
+    self.assertIsNot(new_model.discriminator_gen_classification_logits,
+                     model.discriminator_gen_classification_logits)
+    # Check values.
+    self._check_tensor_pool_adjusted_model_outputs(
+        model.discriminator_gen_outputs, new_model.discriminator_gen_outputs,
+        pool_size)
+
+  # Test tensor pool.
+  def _test_tensor_pool_helper(self, create_gan_model_fn):
+    model = create_gan_model_fn()
+    if isinstance(model, namedtuples.InfoGANModel):
+      tensor_pool_fn = get_tensor_pool_fn_for_infogan(pool_size=5)
+    else:
+      tensor_pool_fn = get_tensor_pool_fn(pool_size=5)
+    loss = train.gan_loss(model, tensor_pool_fn=tensor_pool_fn)
+    self.assertTrue(isinstance(loss, namedtuples.GANLoss))
+
+    # Check values.
+    with self.test_session(use_gpu=True) as sess:
+      variables.global_variables_initializer().run()
+      for _ in range(10):
+        sess.run([loss.generator_loss, loss.discriminator_loss])
+
+  def test_tensor_pool_gan(self):
+    self._test_tensor_pool_helper(create_gan_model)
+
+  def test_tensor_pool_callable_gan(self):
+    self._test_tensor_pool_helper(create_callable_gan_model)
+
+  def test_tensor_pool_infogan(self):
+    self._test_tensor_pool_helper(create_infogan_model)
+
+  def test_tensor_pool_callable_infogan(self):
+    self._test_tensor_pool_helper(create_callable_infogan_model)
+
+  def test_tensor_pool_acgan(self):
+    self._test_tensor_pool_helper(create_acgan_model)
+
+  def test_tensor_pool_callable_acgan(self):
+    self._test_tensor_pool_helper(create_callable_acgan_model)
+
   def test_doesnt_crash_when_in_nested_scope(self):
     with variable_scope.variable_scope('outer_scope'):
       gan_model = train.gan_model(
diff --git a/tensorflow/contrib/gdr/BUILD b/tensorflow/contrib/gdr/BUILD
index bdbe6f0a72621e59562fe113da101ff5a2b8c06d..707ae25d485c64f15694ee0e357f32b619d3cd33 100644
--- a/tensorflow/contrib/gdr/BUILD
+++ b/tensorflow/contrib/gdr/BUILD
@@ -82,6 +82,7 @@ tf_cuda_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core/distributed_runtime:graph_mgr",
+        "//tensorflow/core/distributed_runtime:recent_request_ids",
         "//tensorflow/core/distributed_runtime:rendezvous_mgr_interface",
         "//tensorflow/core/distributed_runtime:worker",
         "//tensorflow/core/distributed_runtime:worker_cache",
@@ -103,6 +104,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/distributed_runtime:base_rendezvous_mgr",
+        "//tensorflow/core/distributed_runtime:request_id",
         "//tensorflow/core/distributed_runtime:tensor_coding",
         "//tensorflow/core/distributed_runtime:worker_cache",
         "//tensorflow/core/distributed_runtime:worker_env",
diff --git a/tensorflow/contrib/gdr/README.md b/tensorflow/contrib/gdr/README.md
index 34ce60b360822888aa6223c89362ae1b0d9d991f..8242d93f129904828a11b61d48f2df8fb0f88bc3 100644
--- a/tensorflow/contrib/gdr/README.md
+++ b/tensorflow/contrib/gdr/README.md
@@ -119,4 +119,4 @@ In the original design (as in the reference), tensor buffers are only registered
 Reference
 ===
 
-Bairen Yi, Jiacheng Xia, Li Chen, and Kai Chen. 2017. Towards Zero Copy Dataflows using RDMA. In Proceedings of SIGCOMM Posters and Demos'17, Los Angeles, CA, USA, August 22-24, 2017, 3 pages. https://doi.org/10.1145/3123878.3123907
+Bairen Yi, Jiacheng Xia, Li Chen, and Kai Chen. 2017. Towards Zero Copy Dataflows using RDMA. In Proceedings of SIGCOMM Posters and Demos'17, Los Angeles, CA, USA, August 22-24, 2017, 3 pages. https://doi.org/10.1145/3123878.3131975
diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc
index 5c7ac744289ab7729b4cc43ab9bedc9342284e65..81e70ae30a4c72dbcedd1aabfe758ecca4c8b366 100644
--- a/tensorflow/contrib/gdr/gdr_memory_manager.cc
+++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc
@@ -86,8 +86,9 @@ int TryToReadNumaNode(ibv_device* device) {
   if (strings::safe_strto32(content, &value)) {
     if (value < 0) {
       LOG(INFO) << "Successful NUMA node read from SysFS had negative value ("
-                << value << "), but there must be at least one NUMA node"
-                            ", so returning NUMA node zero";
+                << value
+                << "), but there must be at least one NUMA node"
+                   ", so returning NUMA node zero";
       return 0;
     }
     LOG(INFO) << "NUMA node for device: " << device->name << " is " << value;
@@ -290,8 +291,8 @@ Status GdrMemoryManager::Init() {
   // Host memory allocators
   for (Allocator* allocator : allocators) {
     auto* visitable_allocator = dynamic_cast<VisitableAllocator*>(allocator);
-    CHECK(visitable_allocator) << "is not visitable for instrumentation"
-                               << allocator->Name();
+    CHECK(visitable_allocator)
+        << "is not visitable for instrumentation" << allocator->Name();
     // Make sure we don't instrument the same allocator twice
     if (instrumented_.find(allocator) == std::end(instrumented_)) {
       visitable_allocator->AddAllocVisitor(alloc_visitor);
@@ -635,8 +636,8 @@ void GdrMemoryManager::TensorFromTransportOptions(
     } else {
       checksum = GPUUtil::Checksum(*tensor);
     }
-    CHECK(checksum == remote_mr.checksum()) << "Checksum mismatch: " << checksum
-                                            << "!=" << remote_mr.checksum();
+    CHECK(checksum == remote_mr.checksum())
+        << "Checksum mismatch: " << checksum << "!=" << remote_mr.checksum();
 #endif
   }
   done(Status::OK());
diff --git a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
index adef2aac33e3e0839a268eabe2496e58861535c5..28f68cec8cce126f1b177a73e197ccd7ab749f4a 100644
--- a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
+++ b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/distributed_runtime/request_id.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
 #include "tensorflow/core/distributed_runtime/worker_cache.h"
 #include "tensorflow/core/distributed_runtime/worker_interface.h"
@@ -47,6 +48,7 @@ class GdrRecvTensorCall : public BaseRecvTensorCall {
         recv_args_(recv_args) {
     req_.set_step_id(step_id);
     req_.set_rendezvous_key(key.data(), key.size());
+    req_.set_request_id(GetUniqueRequestId());
   }
 
   ~GdrRecvTensorCall() override {}
diff --git a/tensorflow/contrib/gdr/gdr_worker.cc b/tensorflow/contrib/gdr/gdr_worker.cc
index 568641234731a458a05886d12066ee9f55fa58aa..ce1d8d2d73000559f03046aceacb169890ecc1b6 100644
--- a/tensorflow/contrib/gdr/gdr_worker.cc
+++ b/tensorflow/contrib/gdr/gdr_worker.cc
@@ -41,17 +41,26 @@ namespace tensorflow {
 
 GdrWorker::GdrWorker(WorkerEnv* worker_env,
                      RemoteMemoryManager* remote_memory_manager)
-    : GrpcWorker(worker_env), remote_memory_manager_(remote_memory_manager) {}
+    : GrpcWorker(worker_env),
+      remote_memory_manager_(remote_memory_manager),
+      recv_tensor_recent_request_ids_(100000) {}
 
 void GdrWorker::GrpcRecvTensorAsync(CallOptions* opts,
                                     const RecvTensorRequest* request,
                                     ::grpc::ByteBuffer* response,
                                     StatusCallback done) {
+  Status s = recv_tensor_recent_request_ids_.TrackUnique(
+      request->request_id(), "RecvTensor (GdrWorker)", *request);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
+
   const int64 step_id = request->step_id();
   const string& key = request->rendezvous_key();
   TRACEPRINTF("RecvTensor: %lld %s", step_id, key.c_str());
   Rendezvous::ParsedKey parsed;
-  Status s = Rendezvous::ParseKey(key, &parsed);
+  s = Rendezvous::ParseKey(key, &parsed);
   Device* src_dev = nullptr;
   if (s.ok()) {
     s = PrepareRecvTensor(parsed, &src_dev);
diff --git a/tensorflow/contrib/gdr/gdr_worker.h b/tensorflow/contrib/gdr/gdr_worker.h
index a30b7baaedcbc80d93d7f37756732c37d2435935..54081f655ec087d78ac07974656257dcf478bcef 100644
--- a/tensorflow/contrib/gdr/gdr_worker.h
+++ b/tensorflow/contrib/gdr/gdr_worker.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/contrib/gdr/gdr_memory_manager.h"
 
+#include "tensorflow/core/distributed_runtime/recent_request_ids.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h"
 
 namespace tensorflow {
@@ -38,6 +39,7 @@ class GdrWorker : public GrpcWorker {
 
  private:
   RemoteMemoryManager* remote_memory_manager_;  // Not owned
+  RecentRequestIds recv_tensor_recent_request_ids_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/graph_editor/transform.py b/tensorflow/contrib/graph_editor/transform.py
index 2a97a79070ea3a0e634d76c5877e2307b6e2e577..14ac5296657d48c7f9e94d220c9e7e28af4d4353 100644
--- a/tensorflow/contrib/graph_editor/transform.py
+++ b/tensorflow/contrib/graph_editor/transform.py
@@ -173,6 +173,9 @@ def copy_op_handler(info, op, copy_shape=True):
   if op._original_op:
     op_._original_op = op._original_op
 
+  # Add op to the graph
+  info.graph_._add_op(op_)
+
   return op_, op_.outputs
 
 
diff --git a/tensorflow/contrib/hvx/README.md b/tensorflow/contrib/hvx/README.md
index 5a6f2f3086d708e5264b0483c211902ac8dce5f6..163993a3f6bb1bedcdffb32944a98c7cc846878e 100644
--- a/tensorflow/contrib/hvx/README.md
+++ b/tensorflow/contrib/hvx/README.md
@@ -1,60 +1,67 @@
 # TensorFlow Runtime with HVX Acceleration
 
-## Description
+This README explain how to build and use the TensorFlow runtime with HVX Acceleration. HVX is an extension of Hexagon, a DSP provided by Qualcomm, which can compute vector calculations faster using less energy than ARM processors.
 
-This README explain how to build and use the TensorFlow Runtime with HVX Acceleration. HVX is an extension of Hexagon which is a DSP provided by qualcomm which can compute vector calculations faster using lower energy than ARM processors.
+## Dependencies
+
+* [Android SDK](https://developer.android.com/studio/index.html).
+* [Android NDK](https://developer.android.com/ndk/index.html). Save the path in `${NDK_ROOT}`.
+* A rooted Qualcomm-based Android device connected to the computer (preferably, a [Snapdragon Development Board](https://developer.qualcomm.com/hardware/additional-snapdragon), but it could be a rooted phone with a Qualcomm SoC, albeit this guide may not work with it). The device needs to be rooted for development and testing purposes, and shouldn't be needed in production. See [Behold, The Snapdragon MDP](https://developer.qualcomm.com/blog/behold-snapdragon-mdp) for more information.
+* [Hexagon SDK v3.0](https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools). Save the path in `${QUALCOMM_SDK}`.
+* The current directory should be TensorFlow source code (`git clone https://github.com/tensorflow/tensorflow.git && cd tensorflow`), and saved into `${TF_ROOT_DIR}`.
+
+You may also need to add a test signature in the device to run HVX-based binaries. Follow the instructions in `${QUALCOMM_SDK}/docs/Tools_Signing.html`, using Python 2.
+
+Note that if the device is not rooted, you may not be able to get the serial number, push the test signature and/or run binary files that call HVX libraries.
 
 ## Quick Start Guide
 
-We provides several tools to build and run inference with this runtime quickly.
+We provide several tools to build and run inference with this runtime quickly.
 
-#### All-in-one script to run inception model with prebuild hexagon library
-If you don’t need to build your own implementation of hexagon HVX, we provide a shortcut to execute graphs by using pre-compiled binaries.
+### Run inception model with a prebuilt Hexagon library
 
+If you don’t need to build your own implementation of Hexagon HVX, we provide a shortcut to execute graphs by using pre-compiled binaries.
+
+```shell
+./tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh -p
 ```
-git clone https://github.com/tensorflow/tensorflow.git
-cd tensorflow
-NDK_ROOT="/path/to/ndk" ./tensorflow/contrib/makefile/build_all_android.sh -X
-```
-(-X downloads dependencies to hexagon HVX and graphs, and copy all dependencies to android and execute a test)
 
-#### All-in-one script to run inception model by building entire libraries from source code
- If you want to build your own implementation of hexagon HVX, we provide a sample all-in-one script to execute graphs which downloads source and build everything for hexagon.
+The `-p` option makes the script download dependencies (i.e., Hexagon HVX binaries and graphs models), copy them to the Android device and execute a test.
 
-```
-git clone https://github.com/tensorflow/tensorflow.git
-cd tensorflow
-QUALCOMM_SDK="/path/to/qualcomm/sdk" NDK_ROOT="/path/to/ndk" ./tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
+### Run inception model by building all from the source code
+
+If you want to build your own implementation of Hexagon HVX, we provide a sample all-in-one script to execute graphs which downloads the source and builds everything that's necessary.
+
+```shell
+./tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
 ```
 
 ## Building libraries
 
 If you've finished walking through the quick start guide, you may want to try building each binary manually.
 
-#### Build libhexagon_nn_skel.so
-Download hexagon nn library from codeaurora.org and build it.
+### Build libhexagon\_nn\_skel.so
 
-```
+Download Hexagon NN library from codeaurora.org and build it.
+
+```shell
 git clone https://source.codeaurora.org/quic/hexagon_nn/nnlib
 cd nnlib
 ```
 
-(Just follow instructions in README.HOW_TO_BUILD. You can find libhexagon_nn_skel.so in hexagon_Release_dynamic_toolv72_v60/ship)
-Then copy the generated binary to GEN_LIBS_DIR
+Just follow the instructions in `README.HOW_TO_BUILD`. You can find the file `libhexagon_nn_skel.so` in `hexagon_Release_dynamic_toolv72_v60/ship`.
+Then copy the generated binary to `${GEN_LIBS_DIR}`.
 
-```
+```shell
 GEN_LIBS_DIR="/path/to/a/dir/to/store/hexagon/libraries"
 cp -v "hexagon_Release_dynamic_toolv72_v60/ship/libhexagon_nn_skel.so" "${GEN_LIBS_DIR}"
 ```
 
-#### Build libhexagon_controller.so
+### Build libhexagon\_controller.so
+
 Download tensorflow and build hexagon controller.
 
-```
-git clone https://github.com/tensorflow/tensorflow.git
-cd tensorflow
-TF_ROOT_DIR="$(pwd)"
-QUALCOMM_SDK="/path/to/qualcomm/sdk"
+```shell
 GENERATED_NNLIB_DIRECTORY="/path/to/nnlib"
 GENERATED_HEXAGON_CONTROLLER_DIRECTORY="${QUALCOMM_SDK}/examples/common/generated_hexagon_controller"
 rm -rf "${GENERATED_HEXAGON_CONTROLLER_DIRECTORY}"
@@ -70,12 +77,12 @@ make tree VERBOSE=1 V=android_Release
 cp -v "${GENERATED_HEXAGON_CONTROLLER_DIRECTORY}/android_Release/ship/libhexagon_controller.so" "${GEN_LIBS_DIR}"
 ```
 
-#### Build tensorflow linking hexagon library
-Build tensorflow with the build_all_android.sh with specifying -x option.
+### Build TensorFlow linking Hexagon library
 
-```
+Build TensorFlow with `build_all_android.sh` specifying the `-x` option.
+
+```shell
 BUILD_ALL_ANDROID_PATH="${TF_ROOT_DIR}/tensorflow/contrib/makefile/build_all_android.sh"
-NDK_ROOT="/path/to/ndk/root"
 
 CC_PREFIX=${CC_PREFIX} NDK_ROOT=${NDK_ROOT} "${BUILD_ALL_ANDROID_PATH}" \
 -x "${GEN_LIBS_DIR}" \
@@ -83,11 +90,11 @@ CC_PREFIX=${CC_PREFIX} NDK_ROOT=${NDK_ROOT} "${BUILD_ALL_ANDROID_PATH}" \
 -t hexagon_graph_execution
 ```
 
-#### Push binaries to your Android device
+### Push binaries to your Android device
 
 Before running tests on your Android device, you need to push several binaries to it.
 
-```
+```shell
 adb push "${GEN_LIBS_DIR}/libhexagon_controller.so" "/data/local/tmp"
 adb push "${GEN_LIBS_DIR}/libhexagon_nn_skel.so" "/vendor/lib/rfsa/adsp"
 adb push -p \
@@ -100,40 +107,54 @@ adb shell chmod "${ANDROID_EXEC_FILE_MODE}" \
 adb wait-for-device
 ```
 
-#### Run tests on the device
+### Run tests on the device
 
 Finally, you can run the inference tests on your device.
 
-```
+```shell
 adb shell 'LD_LIBRARY_PATH=/data/local/tmp:$LD_LIBRARY_PATH' \
 "/data/local/tmp/hexagon_graph_execution"
 ```
 
-#### Troubleshooting
-If you're using the Open-Q 820 Snapdragon development kit, you may run into an issue with running the executable due to a missing testsig library. From the Hexagon SDK documentation: *Dynamic shared objects are required to be digitally signed and then authenticated at runtime before they are allowed to be loaded and executed.* Generating a testsig library is necessary to run the unsigned sample library built from this project.
+### Troubleshooting
+
+#### Testsig issue
+
+If you're using the Open-Q 820 Snapdragon Development Kit, you may run into an issue with running the executable due to a missing `testsig` library. From the Hexagon SDK documentation: *Dynamic shared objects are required to be digitally signed and then authenticated at runtime before they are allowed to be loaded and executed.* Generating a testsig library is necessary to run the unsigned sample library built from this project.
 
-If the lack of a testsig library is your problem, you will see errors of the type:
+If the lack of a `testsig` library is your problem, you will see errors of the type:
 `vendor/qcom/proprietary/adsprpc/src/fastrpc_apps_user.c:169::error: -1: 0 == (nErr = remotectl_open(name, (int*)ph, dlerrstr, sizeof(dlerrstr), &dlerr))`
-appearing in adb logcat.
-
-There are several ways to create the testsig library, the only prerequisite is Python and the correct version of the Hexagon-SDK. The following steps is one way to create this library:
-1. Run adb as root: `adb root`
-2. Run the command `adb shell cat /sys/devices/soc0/serial_number`
-3. Convert the decimal number you get as output to hex
-4. Run the python script: `python ${QUALCOMM_SDK}/tools/elfsigner/elfsigner.py -t $(SERIAL_NUMBER_HEX_VALUE)`
-5. The output of the python script is a shared library stored in ${QUALCOMM_SDK}/tools/elfsigner/output/testsig-$(SERIAL_NUMBER_HEX_VALUE).so
-6. Push the shared library to your device:
+appearing in `adb logcat` or ["Expected: (version) >= (1), actual: 0 vs 1" while running a binary from adb](https://github.com/tensorflow/tensorflow/issues/11210).
+
+You need to add a test signature, as described at the beginning of this README. After rebooting your device, you should be able to run the sample application.
+
+#### Qualcomm SDK Linux installation fails with "Malformed \uxxxx encoding"
+
+The installation file is based on LaunchAnywhere, which fails in Linux if the `PS1` env variable contains non-common Unicode chars:
+
 ```
-adb root
-adb wait-for-device
-adb remount
-adb wait-for-device
-adb shell mkdir /system/lib/rfsa
-adb shell mkdir /system/lib/rfsa/adsp
-adb push ${QUALCOMM_SDK}/tools/elfsigner/output/testsig-$(SERIAL_NUMBER_HEX_VALUE).so /system/lib/rfsa/adsp/
+Preparing to install...
+Extracting the JRE from the installer archive...
+Unpacking the JRE...
+Extracting the installation resources from the installer archive...
+Configuring the installer for this system's environment...
+
+Launching installer...
+
+An internal LaunchAnywhere application error has occurred and this application cannot proceed. (LAX)
+
+Stack Trace:
+java.lang.IllegalArgumentException: Malformed \uxxxx encoding.
+  at java.util.Properties.loadConvert(Properties.java:574)
+  at java.util.Properties.load0(Properties.java:391)
+  at java.util.Properties.load(Properties.java:317)
+  at com.zerog.common.java.util.PropertiesUtil.loadProperties(Unknown Source)
+  at com.zerog.lax.LAX.<init>(Unknown Source)
+  at com.zerog.lax.LAX.main(Unknown Source)
 ```
 
-After rebooting your device, you should be able to run the sample application.
+It can be solved by temporarily assigning the `PS1` environment variable to something simple, such as '$'.
+
+## Maintainers
 
-Maintainers:
-- Satoshi Kataoka (satok@google.com, github.com/satok16)
+* Satoshi Kataoka (satok@google.com, github.com/satok16)
diff --git a/tensorflow/contrib/image/BUILD b/tensorflow/contrib/image/BUILD
index 157e97d237021d95c935a6be66aa57842b97125c..3ff02e085ee63fabf42b3cc4389f4605455f3800 100755
--- a/tensorflow/contrib/image/BUILD
+++ b/tensorflow/contrib/image/BUILD
@@ -9,10 +9,12 @@ package(default_visibility = ["//visibility:public"])
 
 load(
     "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
     "tf_custom_op_library",
     "tf_gen_op_libs",
     "tf_gen_op_wrapper_py",
     "tf_kernel_library",
+    "tf_py_test",
 )
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
@@ -23,6 +25,8 @@ tf_custom_op_library(
         "kernels/bipartite_match_op.cc",
         "kernels/image_ops.cc",
         "kernels/image_ops.h",
+        "kernels/segmentation_ops.cc",
+        "kernels/segmentation_ops.h",
         "ops/image_ops.cc",
     ],
     gpu_srcs = [
@@ -37,6 +41,8 @@ tf_kernel_library(
         "kernels/bipartite_match_op.cc",
         "kernels/image_ops.cc",
         "kernels/image_ops.h",
+        "kernels/segmentation_ops.cc",
+        "kernels/segmentation_ops.h",
     ],
     gpu_srcs = [
         "kernels/image_ops_gpu.cu.cc",
@@ -77,6 +83,7 @@ tf_custom_op_py_library(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:common_shapes",
         "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:linalg_ops",
         "//tensorflow/python:math_ops",
@@ -106,10 +113,33 @@ tf_custom_op_library(
     name = "python/ops/_distort_image_ops.so",
     srcs = [
         "kernels/adjust_hsv_in_yiq_op.cc",
+        "kernels/adjust_hsv_in_yiq_op.h",
         "ops/distort_image_ops.cc",
     ],
+    gpu_srcs = [
+        "kernels/adjust_hsv_in_yiq_op_gpu.cu.cc",
+        "kernels/adjust_hsv_in_yiq_op.h",
+    ],
     deps = [
-        "@protobuf_archive//:protobuf",
+        "//tensorflow/core/kernels:gpu_util_hdrs",
+    ],
+)
+
+tf_cc_test(
+    name = "adjust_hsv_in_yiq_op_test",
+    size = "small",
+    srcs = [
+        "kernels/adjust_hsv_in_yiq_op.h",
+        "kernels/adjust_hsv_in_yiq_op_test.cc",
+    ],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:ops_testutil",
+        "//tensorflow/core/kernels:ops_util",
+        "//third_party/eigen3",
     ],
 )
 
@@ -122,19 +152,6 @@ tf_gen_op_wrapper_py(
     deps = [":distort_image_ops_op_lib"],
 )
 
-cc_library(
-    name = "distort_image_ops_cc",
-    srcs = [
-        "kernels/adjust_hsv_in_yiq_op.cc",
-    ],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//third_party/eigen3",
-    ],
-    alwayslink = 1,
-)
-
 py_library(
     name = "distort_image_py",
     srcs = [
@@ -177,6 +194,21 @@ cuda_py_test(
     ],
 )
 
+tf_py_test(
+    name = "segmentation_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/segmentation_test.py"],
+    additional_deps = [
+        ":distort_image_py",
+        ":image_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_custom_op_library(
     name = "python/ops/_single_image_random_dot_stereograms.so",
     srcs = [
@@ -222,6 +254,23 @@ py_library(
     ],
 )
 
+cuda_py_test(
+    name = "single_image_random_dot_stereograms_ops_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/single_image_random_dot_stereograms_ops_test.py"],
+    additional_deps = [
+        ":distort_image_py",
+        ":image_py",
+        ":single_image_random_dot_stereograms_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/image/__init__.py b/tensorflow/contrib/image/__init__.py
index d030dffadeb9d67f7ffcbc197a2a3feb9b3b122d..cc8ed117ba2edcc7a53e609381166f17a2fbb45e 100755
--- a/tensorflow/contrib/image/__init__.py
+++ b/tensorflow/contrib/image/__init__.py
@@ -20,6 +20,8 @@ This module provides functions for image manipulation; currently, chrominance
 transformas (including changing saturation and hue) in YIQ space and
 projective transforms (including rotation) are supported.
 
+## Image Transformation `Ops`
+
 @@angles_to_projective_transforms
 @@compose_transforms
 @@adjust_yiq_hsv
@@ -28,19 +30,29 @@ projective transforms (including rotation) are supported.
 @@transform
 @@translate
 @@translations_to_projective_transforms
+
+## Image Segmentation `Ops`
+
+@@connected_components
+
+## Matching `Ops`
+
 @@bipartite_match
+
+## Random Dot Stereogram `Ops`
+
 @@single_image_random_dot_stereograms
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=line-too-long
 from tensorflow.contrib.image.python.ops.distort_image_ops import adjust_hsv_in_yiq
 from tensorflow.contrib.image.python.ops.distort_image_ops import random_hsv_in_yiq
 
 from tensorflow.contrib.image.python.ops.image_ops import angles_to_projective_transforms
 from tensorflow.contrib.image.python.ops.image_ops import compose_transforms
+from tensorflow.contrib.image.python.ops.image_ops import connected_components
 from tensorflow.contrib.image.python.ops.image_ops import rotate
 from tensorflow.contrib.image.python.ops.image_ops import transform
 from tensorflow.contrib.image.python.ops.image_ops import translate
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc
index f4962ed69dc68d4bad06ef29d7a167e0ba8ae044..478b716d88321101c971789f36c0ff8ecd3f418e 100644
--- a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc
@@ -12,14 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include <cmath>
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif
+
+#include "tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h"
 #include <memory>
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/work_sharder.h"
@@ -36,10 +37,10 @@ class AdjustHsvInYiqOpBase : public OpKernel {
 
   struct ComputeOptions {
     const Tensor* input = nullptr;
+    Tensor* output = nullptr;
     const Tensor* delta_h = nullptr;
     const Tensor* scale_s = nullptr;
     const Tensor* scale_v = nullptr;
-    Tensor* output = nullptr;
     int64 channel_count = 0;
   };
 
@@ -65,7 +66,7 @@ class AdjustHsvInYiqOpBase : public OpKernel {
                                         scale_v.shape().DebugString()));
     auto channels = input.dim_size(input.dims() - 1);
     OP_REQUIRES(
-        context, channels == 3,
+        context, channels == kChannelSize,
         errors::InvalidArgument("input must have 3 channels but instead has ",
                                 channels, " channels."));
 
@@ -101,53 +102,21 @@ class AdjustHsvInYiqOp<CPUDevice> : public AdjustHsvInYiqOpBase {
     const Tensor* input = options.input;
     Tensor* output = options.output;
     const int64 channel_count = options.channel_count;
-    static const int kChannelSize = 3;
     auto input_data = input->shaped<float, 2>({channel_count, kChannelSize});
     const float delta_h = options.delta_h->scalar<float>()();
     const float scale_s = options.scale_s->scalar<float>()();
     const float scale_v = options.scale_v->scalar<float>()();
     auto output_data = output->shaped<float, 2>({channel_count, kChannelSize});
+    float tranformation_matrix[kChannelSize * kChannelSize] = {0};
+    internal::compute_tranformation_matrix<kChannelSize * kChannelSize>(
+        delta_h, scale_s, scale_v, tranformation_matrix);
     const int kCostPerChannel = 10;
     const DeviceBase::CpuWorkerThreads& worker_threads =
         *context->device()->tensorflow_cpu_worker_threads();
     Shard(worker_threads.num_threads, worker_threads.workers, channel_count,
           kCostPerChannel,
-          [channel_count, &input_data, &output_data, delta_h, scale_s, scale_v](
+          [channel_count, &input_data, &output_data, &tranformation_matrix](
               int64 start_channel, int64 end_channel) {
-            // Using approximate linear transfomation described in:
-            // https://beesbuzz.biz/code/hsv_color_transforms.php
-            /** Get the constants from sympy
-             from sympy import Matrix
-             from sympy.abc import u, w
-             # Projection matrix to YIQ. http://en.wikipedia.org/wiki/YIQ
-             tyiq = Matrix([[0.299, 0.587, 0.114],
-                            [0.596, -0.274, -0.322],
-                            [0.211, -0.523, 0.312]])
-             # Hue rotation matrix in YIQ space.
-             hue_proj = Matrix(3,3, [v, 0, 0, 0, vsu, -vsw, 0, vsw, vsu])
-             m = tyiq.inv() * hue_proj * tyiq
-             **/
-            // TODO(huangyp): directly compute the projection matrix from tyiq.
-            static const float t[kChannelSize][kChannelSize][kChannelSize] = {
-                {{.299, .701, .16862179492229},
-                 {.587, -.587, .329804745287403},
-                 {.114, -.114, -0.498426540209694}},
-                {{.299, -.299, -.327963394172371},
-                 {.587, .413, .0346106879248821},
-                 {.114, -.114, .293352706247489}},
-                {{.299, -.299, 1.24646136576682},
-                 {.587, -.587, -1.04322888291964},
-                 {.114, .886, -.203232482847173}}};
-            float m[kChannelSize][kChannelSize] = {{0.}};
-            float su = scale_s * std::cos(delta_h);
-            float sw = scale_s * std::sin(delta_h);
-            for (int q_index = 0; q_index < kChannelSize; q_index++) {
-              for (int p_index = 0; p_index < kChannelSize; p_index++) {
-                m[q_index][p_index] = scale_v * (t[q_index][p_index][0] +
-                                                 t[q_index][p_index][1] * su +
-                                                 t[q_index][p_index][2] * sw);
-              }
-            }
             // Applying projection matrix to input RGB vectors.
             const float* p = input_data.data() + start_channel * kChannelSize;
             float* q = output_data.data() + start_channel * kChannelSize;
@@ -155,7 +124,9 @@ class AdjustHsvInYiqOp<CPUDevice> : public AdjustHsvInYiqOpBase {
               for (int q_index = 0; q_index < kChannelSize; q_index++) {
                 q[q_index] = 0;
                 for (int p_index = 0; p_index < kChannelSize; p_index++) {
-                  q[q_index] += m[q_index][p_index] * p[p_index];
+                  q[q_index] +=
+                      p[p_index] *
+                      tranformation_matrix[q_index + kChannelSize * p_index];
                 }
               }
               p += kChannelSize;
@@ -165,8 +136,33 @@ class AdjustHsvInYiqOp<CPUDevice> : public AdjustHsvInYiqOpBase {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("AdjustHsvInYiq").Device(DEVICE_CPU),
-                        AdjustHsvInYiqOp<CPUDevice>);
+REGISTER_KERNEL_BUILDER(
+    Name("AdjustHsvInYiq").Device(DEVICE_CPU).TypeConstraint<float>("T"),
+    AdjustHsvInYiqOp<CPUDevice>);
+
+#if GOOGLE_CUDA
+template <>
+class AdjustHsvInYiqOp<GPUDevice> : public AdjustHsvInYiqOpBase {
+ public:
+  explicit AdjustHsvInYiqOp(OpKernelConstruction* context)
+      : AdjustHsvInYiqOpBase(context) {}
+
+  void DoCompute(OpKernelContext* ctx, const ComputeOptions& options) override {
+    const int64 number_of_elements = options.input->NumElements();
+    if (number_of_elements <= 0) {
+      return;
+    }
+    const float* delta_h = options.delta_h->flat<float>().data();
+    const float* scale_s = options.scale_s->flat<float>().data();
+    const float* scale_v = options.scale_v->flat<float>().data();
+    functor::AdjustHsvInYiqGPU()(ctx, options.channel_count, options.input,
+                                 delta_h, scale_s, scale_v, options.output);
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("AdjustHsvInYiq").Device(DEVICE_GPU).TypeConstraint<float>("T"),
+    AdjustHsvInYiqOp<GPUDevice>);
+#endif
 
-// TODO(huangyp): add the GPU kernel
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..8968da6d8241ca7cd548910a024a618913c3ed70
--- /dev/null
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h
@@ -0,0 +1,87 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_IMAGE_KERNELS_ADJUST_HSV_IN_YIQ_OP_H_
+#define TENSORFLOW_CONTRIB_IMAGE_KERNELS_ADJUST_HSV_IN_YIQ_OP_H_
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif  // GOOGLE_CUDA
+
+#include <cmath>
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+
+namespace tensorflow {
+
+static constexpr int kChannelSize = 3;
+
+namespace internal {
+
+template <int MATRIX_SIZE>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void compute_tranformation_matrix(
+    const float delta_h, const float scale_s, const float scale_v,
+    float* matrix) {
+  static_assert(MATRIX_SIZE == kChannelSize * kChannelSize,
+                "Size of matrix should be 9.");
+  // Projection matrix from RGB to YIQ. Numbers from wikipedia
+  // https://en.wikipedia.org/wiki/YIQ
+  Eigen::Matrix3f yiq;
+  /* clang-format off */
+  yiq << 0.299, 0.587, 0.114,
+         0.596, -0.274, -0.322,
+         0.211, -0.523, 0.312;
+  Eigen::Matrix3f yiq_inverse;
+  yiq_inverse << 1, 0.95617069, 0.62143257,
+                 1, -0.2726886, -0.64681324,
+                 1, -1.103744, 1.70062309;
+  /* clang-format on */
+  // Construct hsv linear transformation matrix in YIQ space.
+  // https://beesbuzz.biz/code/hsv_color_transforms.php
+  float vsu = scale_v * scale_s * std::cos(delta_h);
+  float vsw = scale_v * scale_s * std::sin(delta_h);
+  Eigen::Matrix3f hsv_transform;
+  /* clang-format off */
+  hsv_transform << scale_v, 0, 0,
+                   0, vsu, -vsw,
+                   0, vsw, vsu;
+  /* clang-format on */
+  // Compute final transformation matrix = inverse_yiq * hsv_transform * yiq
+  Eigen::Map<Eigen::Matrix<float, 3, 3, Eigen::ColMajor>> eigen_matrix(matrix);
+  eigen_matrix = yiq_inverse * hsv_transform * yiq;
+}
+}  // namespace internal
+
+#if GOOGLE_CUDA
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
+struct AdjustHsvInYiqGPU {
+  void operator()(OpKernelContext* ctx, int channel_count,
+                  const Tensor* const input, const float* const delta_h,
+                  const float* const scale_s, const float* const scale_v,
+                  Tensor* const output);
+};
+
+}  // namespace functor
+
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IMAGE_KERNELS_ADJUST_HSV_IN_YIQ_OP_H_
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b71ff9cd507faac66b3a33d3c02ec9b5901d814a
--- /dev/null
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
@@ -0,0 +1,84 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h"
+#include "tensorflow/core/kernels/gpu_utils.h"
+#include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+
+namespace internal {
+
+__global__ void compute_tranformation_matrix_cuda(const float* const delta_h,
+                                                  const float* const scale_s,
+                                                  const float* const scale_v,
+                                                  float* const matrix,
+                                                  const int matrix_size) {
+  if (matrix_size == kChannelSize * kChannelSize) {
+    compute_tranformation_matrix<kChannelSize * kChannelSize>(
+        *delta_h, *scale_s, *scale_v, matrix);
+  }
+}
+}  // namespace internal
+
+namespace functor {
+
+void AdjustHsvInYiqGPU::operator()(OpKernelContext* ctx, int channel_count,
+                                   const Tensor* const input,
+                                   const float* const delta_h,
+                                   const float* const scale_s,
+                                   const float* const scale_v,
+                                   Tensor* const output) {
+  const uint64 m = channel_count;
+  const uint64 k = kChannelSize;
+  const uint64 n = kChannelSize;
+  auto* cu_stream = ctx->eigen_device<GPUDevice>().stream();
+  OP_REQUIRES(ctx, cu_stream, errors::Internal("No GPU stream available."));
+  Tensor tranformation_matrix;
+  OP_REQUIRES_OK(ctx, ctx->allocate_temp(
+                          DT_FLOAT, TensorShape({kChannelSize * kChannelSize}),
+                          &tranformation_matrix));
+  // TODO(huangyp): It takes about 3.5 us to comute tranformation_matrix
+  // with one thread. Improve its performance if necessary.
+  internal::compute_tranformation_matrix_cuda<<<1, 1, 0, cu_stream>>>(
+      delta_h, scale_s, scale_v, tranformation_matrix.flat<float>().data(),
+      tranformation_matrix.flat<float>().size());
+  // Call cuBlas C = A * B directly.
+  auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+  auto a_ptr =
+      AsDeviceMemory(input->flat<float>().data(), input->flat<float>().size());
+  auto b_ptr = AsDeviceMemory(tranformation_matrix.flat<float>().data(),
+                              tranformation_matrix.flat<float>().size());
+  auto c_ptr = AsDeviceMemory(output->flat<float>().data(),
+                              output->flat<float>().size());
+  auto* stream = ctx->op_device_context()->stream();
+  OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
+  // TODO(huangyp): share/use autotune cublas algorithms in Matmul.op.
+  bool blas_launch_status =
+      stream
+          ->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr, n,
+                         a_ptr, k, 0.0f, &c_ptr, n)
+          .ok();
+  if (!blas_launch_status) {
+    ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
+                                    ", n=", n, ", k=", k));
+  }
+}
+}  // namespace functor
+}  // namespace tensorflow
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_test.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4cbbd277840133c9419f9ce3d945b7d099679dc0
--- /dev/null
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_test.cc
@@ -0,0 +1,48 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+class AdjustHsvInYiqOpTest : public OpsTestBase {
+ protected:
+};
+
+TEST_F(AdjustHsvInYiqOpTest, IdentiyTransformMatrix) {
+  Tensor matrix(allocator(), DT_FLOAT, TensorShape({9}));
+  internal::compute_tranformation_matrix<9>(0.0, 1.0, 1.0,
+                                            matrix.flat<float>().data());
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({9}));
+  test::FillValues<float>(&expected, {1, 0, 0, 0, 1, 0, 0, 0, 1});
+  test::ExpectClose(matrix, expected);
+}
+
+TEST_F(AdjustHsvInYiqOpTest, ScaleValueTransformMatrix) {
+  float scale_v = 2.3;
+  Tensor matrix(allocator(), DT_FLOAT, TensorShape({9}));
+  internal::compute_tranformation_matrix<9>(0.0, 1.0, scale_v,
+                                            matrix.flat<float>().data());
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({9}));
+  test::FillValues<float>(&expected,
+                          {scale_v, 0, 0, 0, scale_v, 0, 0, 0, scale_v});
+  test::ExpectClose(matrix, expected);
+}
+
+}  // end namespace tensorflow
diff --git a/tensorflow/contrib/image/kernels/image_ops.cc b/tensorflow/contrib/image/kernels/image_ops.cc
index 6adf837ca0ab506bd18f5e2e1fc1847e31d782bf..c2e32da133b32c8fe169302668031af8bace2c22 100644
--- a/tensorflow/contrib/image/kernels/image_ops.cc
+++ b/tensorflow/contrib/image/kernels/image_ops.cc
@@ -43,9 +43,9 @@ template struct FillProjectiveTransform<CPUDevice, double>;
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
 using functor::FillProjectiveTransform;
+using generator::Interpolation;
 using generator::INTERPOLATION_BILINEAR;
 using generator::INTERPOLATION_NEAREST;
-using generator::Interpolation;
 using generator::ProjectiveGenerator;
 
 template <typename Device, typename T>
@@ -72,11 +72,12 @@ class ImageProjectiveTransform : public OpKernel {
     const Tensor& transform_t = ctx->input(1);
     OP_REQUIRES(ctx, images_t.shape().dims() == 4,
                 errors::InvalidArgument("Input images must have rank 4"));
-    OP_REQUIRES(ctx, (TensorShapeUtils::IsMatrix(transform_t.shape()) &&
-                      (transform_t.dim_size(0) == images_t.dim_size(0) ||
-                       transform_t.dim_size(0) == 1) &&
-                      transform_t.dim_size(1) ==
-                          ProjectiveGenerator<Device, T>::kNumParameters),
+    OP_REQUIRES(ctx,
+                (TensorShapeUtils::IsMatrix(transform_t.shape()) &&
+                 (transform_t.dim_size(0) == images_t.dim_size(0) ||
+                  transform_t.dim_size(0) == 1) &&
+                 transform_t.dim_size(1) ==
+                     ProjectiveGenerator<Device, T>::kNumParameters),
                 errors::InvalidArgument(
                     "Input transform should be num_images x 8 or 1 x 8"));
     auto images = images_t.tensor<T, 4>();
diff --git a/tensorflow/contrib/image/kernels/segmentation_ops.cc b/tensorflow/contrib/image/kernels/segmentation_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fe8bf6e21c7b7310527668324571774e8bc50893
--- /dev/null
+++ b/tensorflow/contrib/image/kernels/segmentation_ops.cc
@@ -0,0 +1,139 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs for ImageConnectedComponents in ../ops/image_ops.cc, and description
+// of the algorithm in segmentation_ops.h.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/contrib/image/kernels/segmentation_ops.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+using tensorflow::functor::BlockedImageUnionFindFunctor;
+using tensorflow::functor::FindRootFunctor;
+using tensorflow::functor::ImageConnectedComponentsFunctor;
+using tensorflow::functor::TensorRangeFunctor;
+
+using OutputType = typename BlockedImageUnionFindFunctor<bool>::OutputType;
+
+// Computes connected components on batches of 2D images.
+template <typename Device, typename T>
+class ImageConnectedComponents : public OpKernel {
+ public:
+  explicit ImageConnectedComponents(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& images_t = ctx->input(0);
+    OP_REQUIRES(ctx, images_t.shape().dims() == 3,
+                errors::InvalidArgument("Input images must have rank 3"));
+    Tensor forest_t, rank_t;
+    OP_REQUIRES_OK(ctx, ctx->allocate_temp(tensorflow::DT_INT64,
+                                           images_t.shape(), &forest_t));
+    OP_REQUIRES_OK(ctx, ctx->allocate_temp(tensorflow::DT_INT64,
+                                           images_t.shape(), &rank_t));
+    Tensor* output_t;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, images_t.shape(), &output_t));
+
+    // Fill forest with values from 0 to n - 1, so that each node points to
+    // itself.
+    TensorRangeFunctor<Device>()(ctx->eigen_device<Device>(),
+                                 forest_t.flat<OutputType>());
+    auto rank = rank_t.tensor<OutputType, 3>();
+    rank.device(ctx->eigen_device<Device>()) = rank.constant(OutputType(0));
+
+    const auto images = images_t.tensor<T, 3>();
+    auto forest = forest_t.tensor<OutputType, 3>();
+    ImageConnectedComponentsFunctor<Device, T>()(
+        ctx, output_t->flat<OutputType>(), images, forest, rank);
+  }
+};
+
+using CPUDevice = Eigen::ThreadPoolDevice;
+
+namespace functor {
+
+// Connected components CPU implementation. See `segmentation_ops.h` for a
+// description of the algorithm.
+template <typename T>
+struct ImageConnectedComponentsFunctor<CPUDevice, T> {
+  void operator()(OpKernelContext* ctx,
+                  typename TTypes<OutputType>::Flat output,
+                  typename TTypes<T, 3>::ConstTensor images,
+                  typename TTypes<OutputType, 3>::Tensor forest,
+                  typename TTypes<OutputType, 3>::Tensor rank) {
+    const int64 num_images = images.dimension(0),
+                num_rows = images.dimension(1), num_cols = images.dimension(2),
+                num_elements = images.size();
+    // Bail out early for an empty image--no work to do.
+    if (num_elements == 0) {
+      return;
+    }
+    auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
+    BlockedImageUnionFindFunctor<T> union_find(
+        images.data(), num_rows, num_cols, forest.data(), rank.data());
+    while (union_find.can_merge()) {
+      union_find.merge_blocks();
+      int64 num_blocks_vertically = union_find.num_blocks_vertically();
+      int64 num_blocks_horizontally = union_find.num_blocks_horizontally();
+      // Merging each block calls union_down for each pixel in a row of the
+      // block, and union_right for each pixel in a column of the block. Assume
+      // 20 instructions for each call to union_down or union_right. find() may
+      // loop more while searching for the root, but this should not be very
+      // significant.
+      int cost = (union_find.block_height() + union_find.block_width()) * 20;
+      Shard(worker_threads->num_threads, worker_threads->workers,
+            num_images * num_blocks_vertically * num_blocks_horizontally, cost,
+            [&union_find, num_images, num_blocks_vertically,
+             num_blocks_horizontally](int64 start_block, int64 limit_block) {
+              for (int64 i = start_block; i < limit_block; i++) {
+                int64 block_x = i % num_blocks_horizontally;
+                int64 block_y =
+                    (i / num_blocks_horizontally) % num_blocks_vertically;
+                int64 image =
+                    i / (num_blocks_horizontally * num_blocks_vertically);
+                union_find.merge_internal_block_edges(image, block_y, block_x);
+              }
+            });
+    }
+    FindRootFunctor<CPUDevice, T>()(ctx->eigen_device<CPUDevice>(), output,
+                                    images.data(), union_find);
+  }
+};
+
+}  // end namespace functor
+
+#define REGISTER_IMAGE_CONNECTED_COMPONENTS(TYPE)             \
+  REGISTER_KERNEL_BUILDER(Name("ImageConnectedComponents")    \
+                              .Device(DEVICE_CPU)             \
+                              .TypeConstraint<TYPE>("dtype"), \
+                          ImageConnectedComponents<CPUDevice, TYPE>)
+// Connected components (arguably) make sense for number, bool, and string types
+TF_CALL_NUMBER_TYPES(REGISTER_IMAGE_CONNECTED_COMPONENTS);
+TF_CALL_bool(REGISTER_IMAGE_CONNECTED_COMPONENTS);
+TF_CALL_string(REGISTER_IMAGE_CONNECTED_COMPONENTS);
+#undef REGISTER_IMAGE_CONNECTED_COMPONENTS
+
+// TODO(ringwalt): Implement on GPU. We probably want to stick to the original
+// algorithm by Stava and Benes there for efficiency (computing small blocks in
+// shared memory in CUDA thread blocks, instead of starting with single-pixel
+// blocks).
+
+}  // end namespace tensorflow
diff --git a/tensorflow/contrib/image/kernels/segmentation_ops.h b/tensorflow/contrib/image/kernels/segmentation_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..0957d5fd10f02daad3d8d51aadec9ce9da2660b5
--- /dev/null
+++ b/tensorflow/contrib/image/kernels/segmentation_ops.h
@@ -0,0 +1,303 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IMAGE_KERNELS_SEGMENTATION_OPS_H_
+#define TENSORFLOW_CONTRIB_IMAGE_KERNELS_SEGMENTATION_OPS_H_
+
+// Connected component analysis. The op is described in ../ops/image_ops.cc. A
+// description of the algorithm appears below.
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template <typename T>
+bool is_nonzero(T value) {
+  return value != T(0);
+}
+
+template <>
+bool is_nonzero(string value) {
+  return value.size() != 0;
+}
+
+// Processes each pixel of an image for union-find, in parallel blocks. This is
+// loosely based on the algorithm in "GPU Computing Gems" by Ondrej Stava and
+// Bedrich Benes, available here:
+// http://hpcg.purdue.edu/bbenes/papers/Stava2011CCL.pdf
+// The bulk of the process uses blocks of each image, which have each been
+// processed separately. As long as there are multiple blocks in the image, we
+// double the height and width of the blocks, creating new blocks which each
+// consist of 2x2 previous sub-blocks. On each new block, we process adjacent
+// pixels from the previous sub-blocks serially. However, the new blocks are not
+// connected, so we can process each block in parallel.
+// The GPU algorithm first processes blocks of a fixed size in GPU shared
+// memory, with one image block per CUDA thread block. On the CPU, we just start
+// with a block size of a single pixel, and borrow the rest of the algorithm
+// unchanged.
+template <typename T>
+class BlockedImageUnionFindFunctor {
+ public:
+  using OutputType = int64;
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockedImageUnionFindFunctor(
+      const T* images, const int64 num_rows, const int64 num_cols,
+      OutputType* forest, OutputType* rank)
+      : images_(images),
+        num_rows_(num_rows),
+        num_cols_(num_cols),
+        block_height_(1),
+        block_width_(1),
+        forest_(forest),
+        rank_(rank) {}
+
+  // Returns the root of the tree that the pixel at the given index belongs to.
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE OutputType
+  find(OutputType index) const {
+    while (forest_[index] != index) {
+      index = forest_[index];
+    }
+    return index;
+  }
+
+  // Returns the number of blocks along the y axis.
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int64 num_blocks_vertically() const {
+    return (num_rows_ + block_height_ - 1) / block_height_;
+  }
+
+  // Returns the number of blocks along the x axis.
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int64 num_blocks_horizontally() const {
+    return (num_cols_ + block_width_ - 1) / block_width_;
+  }
+
+  // Returns the total number of blocks in each image.
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int64 num_blocks() const {
+    return num_blocks_vertically() * num_blocks_horizontally();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int64 block_height() const {
+    return block_height_;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int64 block_width() const {
+    return block_width_;
+  }
+
+  // Returns whether we may merge again (the image contains more than one
+  // block).
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool can_merge() const {
+    return block_height_ < num_rows_ || block_width_ < num_cols_;
+  }
+
+  // Doubles the block size. After this method, you must call
+  // `merge_internal_block_edges` for each image and each *new* block's xy
+  // coordinates (typically in parallel).
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void merge_blocks() {
+    block_height_ *= 2;
+    block_width_ *= 2;
+  }
+
+  // Processes pairs of pixels within the block which were adjacent in the four
+  // sub-blocks. This must be done at each stage so that the connected
+  // components in each block are joined correctly.
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void merge_internal_block_edges(
+      int64 image_index, int64 block_vertical_index,
+      int64 block_horizontal_index) const {
+    int64 block_start_y = block_vertical_index * block_height_;
+    int64 block_start_x = block_horizontal_index * block_width_;
+    // Merge the 4 sub-blocks horizontally (fixing the vertical seam).
+    int64 block_center_x = block_start_x + block_width_ / 2 - 1;
+    if (0 <= block_center_x && block_center_x + 1 < num_cols_) {
+      int64 merge_blocks_limit_y =
+          std::min(num_rows_, block_start_y + block_height_);
+      for (int64 y = block_start_y; y < merge_blocks_limit_y; y++) {
+        union_right(image_index, y, block_center_x);
+      }
+    }
+    // Merge the 4 sub-blocks vertically (fixing the horizontal seam).
+    int64 block_center_y = block_start_y + block_height_ / 2 - 1;
+    if (0 <= block_center_y && block_center_y + 1 < num_rows_) {
+      int64 merge_blocks_limit_x =
+          std::min(num_cols_, block_start_x + block_width_);
+      for (int64 x = block_start_x; x < merge_blocks_limit_x; x++) {
+        union_down(image_index, block_center_y, x);
+      }
+    }
+  }
+
+ private:
+  // The input image(s).
+  const T* const images_;
+  const int64 num_rows_;
+  const int64 num_cols_;
+  // Current height of each sub-block of the image.
+  int64 block_height_;
+  // Current width of each sub-block of the image.
+  int64 block_width_;
+  // Union-find forest. This has the same size as `images_`, and each entry
+  // holds the index of its parent in `images_` (roots hold their own index).
+  // Cycles should not occur.
+  OutputType* const forest_;
+  // Union-find rank of each pixel.
+  OutputType* const rank_;
+
+  // Unions the pixel with the pixel below it if applicable (both pixels are
+  // true, and the pixel is not in the last row).
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void union_down(OutputType batch,
+                                                        OutputType row,
+                                                        OutputType col) const {
+    T pixel = read_pixel(batch, row, col);
+    if (is_nonzero<T>(pixel)) {
+      const int64 index_a = col + num_cols_ * (row + num_rows_ * batch);
+      if (row + 1 < num_rows_ && read_pixel(batch, row + 1, col) == pixel) {
+        const int64 index_b = col + num_cols_ * (row + 1 + num_rows_ * batch);
+        do_union(index_a, index_b);
+      }
+    }
+  }
+
+  // Unions the pixel with the pixel to the right of it if applicable.
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void union_right(OutputType batch,
+                                                         OutputType row,
+                                                         OutputType col) const {
+    T pixel = read_pixel(batch, row, col);
+    if (is_nonzero<T>(pixel)) {
+      const int64 index_a = col + num_cols_ * (row + num_rows_ * batch);
+      if (col + 1 < num_cols_ && read_pixel(batch, row, col + 1) == pixel) {
+        const int64 index_b = col + 1 + num_cols_ * (row + num_rows_ * batch);
+        do_union(index_a, index_b);
+      }
+    }
+  }
+
+  // Reads a pixel value in the images.
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T
+  read_pixel(const OutputType batch, const OutputType row,
+             const OutputType col) const {
+    return images_[col + num_cols_ * (row + num_rows_ * batch)];
+  }
+
+  // Unions the trees that the two pixels belong to, using their index in the
+  // `images_` array.
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void do_union(
+      OutputType index_a, OutputType index_b) const {
+    // Find the roots of index_a and index_b in the forest, and make one the
+    // child of the other.
+    index_a = find(index_a);
+    index_b = find(index_b);
+    const OutputType rank_a = rank_[index_a];
+    const OutputType rank_b = rank_[index_b];
+    OutputType parent, child;
+    if (index_a == index_b) {
+      return;
+    } else if (rank_a < rank_b) {
+      parent = index_a;
+      child = index_b;
+    } else {
+      parent = index_b;
+      child = index_a;
+      rank_[parent]++;
+    }
+    forest_[child] = parent;
+  }
+};
+
+// Runs the ImageUnionFindFunctor on all pixels. Will require different CPU and
+// GPU implementations.
+template <typename Device, typename T>
+class ImageConnectedComponentsFunctor {
+ public:
+  using OutputType = typename BlockedImageUnionFindFunctor<T>::OutputType;
+
+  void operator()(OpKernelContext* ctx,
+                  typename TTypes<T, 3>::ConstTensor images,
+                  typename TTypes<OutputType, 3>::Tensor forest,
+                  typename TTypes<OutputType, 3>::Tensor rank);
+};
+
+// Fills a flat Tensor with indices from 0 to n - 1.
+template <typename Device>
+class TensorRangeFunctor {
+ public:
+  using OutputType = typename BlockedImageUnionFindFunctor<bool>::OutputType;
+
+  void operator()(const Device& device,
+                  typename TTypes<OutputType>::Flat tensor) {
+    tensor.device(device) = tensor.generate(TensorRangeGenerator());
+  }
+
+ private:
+  class TensorRangeGenerator {
+   public:
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE OutputType
+    operator()(const Eigen::array<Eigen::DenseIndex, 1>& coords) const {
+      return coords[0];
+    }
+  };
+};
+
+// Given the union-find forest, generates the root index for each node. This
+// gives us arbitrary, usually non-consecutive ids for each connected component.
+// The ids are massaged in Python to get deterministic, consecutive ids.
+template <typename Device, typename T>
+class FindRootFunctor {
+ public:
+  using OutputType = typename BlockedImageUnionFindFunctor<T>::OutputType;
+
+  void operator()(const Device& device,
+                  typename TTypes<OutputType>::Flat component_ids,
+                  const T* images,
+                  const BlockedImageUnionFindFunctor<T>& union_find) {
+    component_ids.device(device) =
+        component_ids.generate(FindRootGenerator(images, union_find));
+  }
+
+ private:
+  class FindRootGenerator {
+    const T* const images_;
+    const BlockedImageUnionFindFunctor<T> union_find_;
+
+   public:
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE FindRootGenerator(
+        const T* images, BlockedImageUnionFindFunctor<T> union_find)
+        : images_(images), union_find_(union_find) {}
+
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE OutputType
+    operator()(const Eigen::array<Eigen::DenseIndex, 1>& coords) const {
+      if (is_nonzero<T>(images_[coords[0]])) {
+        // True pixels have an arbitrary segment id > 0. The segment ids will be
+        // made contiguous later.
+        return union_find_.find(coords[0]) + 1;
+      } else {
+        // False pixels have a segment of 0.
+        return 0;
+      }
+    }
+  };
+};
+
+}  // end namespace functor
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IMAGE_KERNELS_SEGMENTATION_OPS_H_
diff --git a/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc b/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc
index 9f0bf37aed3fc9aeefb7602ef3fda4cfd76f1917..8f9a5c28039b74a874028826ca8a6d5a36ab7cf4 100755
--- a/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc
+++ b/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc
@@ -143,8 +143,8 @@ class SingleImageRandomDotStereogramsOp : public OpKernel {
     }
 
     data_box_left = deltaX_border_image / 2;  // Center DATA in X dimension
-    data_box_width = data_Xwindow;             // width of scan line
-    data_box_height = data_Ywindow;            // hight of image
+    data_box_width = data_Xwindow;            // width of scan line
+    data_box_height = data_Ywindow;           // hight of image
 
     const T* inputZ = input_tensor.flat<T>().data();  // Flatten input Z buffer
 
diff --git a/tensorflow/contrib/image/ops/image_ops.cc b/tensorflow/contrib/image/ops/image_ops.cc
index 4527fdd87a8be3390fb0840410218ab74a27f0d2..68771b3d054a64ba94141c092e20df1ed6b2339b 100644
--- a/tensorflow/contrib/image/ops/image_ops.cc
+++ b/tensorflow/contrib/image/ops/image_ops.cc
@@ -98,4 +98,34 @@ col_to_row_match_indices: A vector of length num_columns, which is the number
   `col_to_row_match_indices[j]`.
 )doc");
 
+REGISTER_OP("ImageConnectedComponents")
+    .Input("image: dtype")
+    .Output("components: int64")
+    .Attr(
+        "dtype: {int64, int32, uint16, int16, uint8, int8, half, float, "
+        "double, bool, string}")
+    .SetShapeFn([](InferenceContext* c) {
+      return shape_inference::UnchangedShape(c);
+    })
+    .Doc(R"doc(
+Find the connected components of image(s).
+
+For each image (along the 0th axis), all connected components of adjacent pixels
+with the same non-zero value are detected and given unique ids.
+
+The returned `components` tensor has 0s for the zero pixels of `images`, and
+arbitrary nonzero ids for the connected components of nonzero values. Ids are
+unique across all of the images, and are in row-major order by the first pixel
+in the component.
+
+Uses union-find with union by rank but not path compression, giving a runtime of
+`O(n log n)`. See:
+    https://en.wikipedia.org/wiki/Disjoint-set_data_structure#Time_Complexity
+
+image: Image(s) with shape (N, H, W).
+components: Component ids for each pixel in "image". Same shape as "image". Zero
+    pixels all have an output of 0, and all components of adjacent pixels with
+    the same value are given consecutive ids, starting from 1.
+)doc");
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
index f8b56ab1c5400694b3aa8d4a0c19c7769aa8cbce..8139d4272d6950815bd39a64e86e0f7422e6f799 100755
--- a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
+++ b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
@@ -19,6 +19,10 @@ limitations under the License.
 
 namespace tensorflow {
 
+using shape_inference::DimensionHandle;
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+
 REGISTER_OP("SingleImageRandomDotStereograms")
     .Attr("T: {double,float,int64,int32}")
     .Input("depth_values: T")
@@ -37,6 +41,28 @@ REGISTER_OP("SingleImageRandomDotStereograms")
         "output_image_shape: shape = { dim {size:1024} dim {size: 768} dim "
         "{size: 1}}")
     .Attr("output_data_window: shape = { dim {size:1022} dim {size: 757}}")
+    .SetShapeFn([](InferenceContext* c) {
+      // Validate that the output_image_shape attr is correct.
+      // NOTE: The output_image_shape is [X, Y, C]
+      // while the output data is [Y, X, C] (or [H, W, C]).
+      // As a result, by default the output_image_shape has the value
+      // of [1024, 768, 1] but the output data will be [768, 1024, 1].
+      PartialTensorShape shape;
+      TF_RETURN_IF_ERROR(c->GetAttr("output_image_shape", &shape));
+      ShapeHandle output_image_shape;
+      TF_RETURN_IF_ERROR(
+          c->MakeShapeFromPartialTensorShape(shape, &output_image_shape));
+      DimensionHandle x_dim = c->Dim(output_image_shape, 0);
+      DimensionHandle y_dim = c->Dim(output_image_shape, 1);
+
+      int colors;
+      TF_RETURN_IF_ERROR(c->GetAttr("number_colors", &colors));
+
+      c->set_output(
+          0, c->MakeShape(
+                 {y_dim, x_dim, colors > 256 ? c->MakeDim(3) : c->MakeDim(1)}));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Outputs a single image random dot stereogram for export via encode_PNG/JPG OP.
 
diff --git a/tensorflow/contrib/image/python/kernel_tests/distort_image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/distort_image_ops_test.py
index b85f19d29b79defa10493bdbaa4a1b237cb2a9ee..a495b58b7f6481d4cdedf73f23615d0390eb6a45 100644
--- a/tensorflow/contrib/image/python/kernel_tests/distort_image_ops_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/distort_image_ops_test.py
@@ -172,7 +172,7 @@ class AdjustValueInYiqTest(test_util.TensorFlowTestCase):
           raise AssertionError('Invalid test style: %s' % (test_style))
         y_np = self._adjust_value_in_yiq_np(x_np, scale)
         y_tf = self._adjust_value_in_yiq_tf(x_np, scale)
-        self.assertAllClose(y_tf, y_np, rtol=2e-5, atol=1e-5)
+        self.assertAllClose(y_tf, y_np, rtol=2e-4, atol=1e-4)
 
   def test_invalid_shapes(self):
     x_np = np.random.rand(2, 3) * 255.
@@ -237,7 +237,7 @@ class AdjustSaturationInYiqTest(test_util.TensorFlowTestCase):
             raise AssertionError('Invalid test style: %s' % (test_style))
           y_baseline = self._adjust_saturation_in_yiq_np(x_np, scale)
           y_tf = self._adjust_saturation_in_yiq_tf(x_np, scale)
-          self.assertAllClose(y_tf, y_baseline, rtol=2e-5, atol=1e-5)
+          self.assertAllClose(y_tf, y_baseline, rtol=2e-4, atol=1e-4)
 
   def test_invalid_shapes(self):
     x_np = np.random.rand(2, 3) * 255.
@@ -291,6 +291,9 @@ class AdjustHueInYiqBenchmark(test.Benchmark):
   def benchmark_adjust_hue_in_yiqCpuAll(self):
     self._benchmark_adjust_hue_in_yiq('/cpu:0', None)
 
+  def benchmark_adjust_hue_in_yiq_gpu_all(self):
+    self._benchmark_adjust_hue_in_yiq(test.gpu_device_name(), None)
+
 
 class AdjustSaturationInYiqBenchmark(test.Benchmark):
 
@@ -333,6 +336,9 @@ class AdjustSaturationInYiqBenchmark(test.Benchmark):
   def benchmark_adjust_saturation_in_yiq_cpu_all(self):
     self._benchmark_adjust_saturation_in_yiq('/cpu:0', None)
 
+  def benchmark_adjust_saturation_in_yiq_gpu_all(self):
+    self._benchmark_adjust_saturation_in_yiq(test.gpu_device_name(), None)
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/contrib/image/python/kernel_tests/segmentation_test.py b/tensorflow/contrib/image/python/kernel_tests/segmentation_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..48066cbacefe6b229a1f485486f11e8b8af7704f
--- /dev/null
+++ b/tensorflow/contrib/image/python/kernel_tests/segmentation_test.py
@@ -0,0 +1,189 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for connected component analysis."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import logging
+
+import numpy as np
+
+from tensorflow.contrib.image.python.ops import image_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import googletest
+
+# Image for testing connected_components, with a single, winding component.
+SNAKE = np.asarray(
+    [[0, 0, 0, 0, 0, 0, 0, 0, 0],
+     [0, 1, 1, 1, 1, 0, 0, 0, 0],
+     [0, 0, 0, 0, 1, 1, 1, 1, 0],
+     [0, 0, 0, 0, 0, 0, 0, 1, 0],
+     [0, 1, 1, 1, 1, 1, 1, 1, 0],
+     [0, 1, 0, 0, 0, 0, 0, 0, 0],
+     [0, 1, 0, 1, 1, 1, 1, 1, 0],
+     [0, 1, 0, 0, 0, 0, 0, 1, 0],
+     [0, 1, 1, 1, 1, 1, 1, 1, 0],
+     [0, 0, 0, 0, 0, 0, 0, 0, 0]])  # pyformat: disable
+
+
+class SegmentationTest(test_util.TensorFlowTestCase):
+
+  def testDisconnected(self):
+    arr = math_ops.cast(
+        [[1, 0, 0, 1, 0, 0, 0, 0, 1],
+         [0, 1, 0, 0, 0, 1, 0, 1, 0],
+         [1, 0, 1, 0, 0, 0, 1, 0, 0],
+         [0, 0, 0, 0, 1, 0, 0, 0, 0],
+         [0, 0, 1, 0, 0, 0, 0, 0, 0]],
+        dtypes.bool)  # pyformat: disable
+    expected = (
+        [[1, 0, 0, 2, 0, 0, 0, 0, 3],
+         [0, 4, 0, 0, 0, 5, 0, 6, 0],
+         [7, 0, 8, 0, 0, 0, 9, 0, 0],
+         [0, 0, 0, 0, 10, 0, 0, 0, 0],
+         [0, 0, 11, 0, 0, 0, 0, 0, 0]])  # pyformat: disable
+    with self.test_session():
+      self.assertAllEqual(image_ops.connected_components(arr).eval(), expected)
+
+  def testSimple(self):
+    arr = [[0, 1, 0], [1, 1, 1], [0, 1, 0]]
+    with self.test_session():
+      # Single component with id 1.
+      self.assertAllEqual(
+          image_ops.connected_components(math_ops.cast(
+              arr, dtypes.bool)).eval(), arr)
+
+  def testSnake(self):
+    with self.test_session():
+      # Single component with id 1.
+      self.assertAllEqual(
+          image_ops.connected_components(math_ops.cast(
+              SNAKE, dtypes.bool)).eval(), SNAKE)
+
+  def testSnake_disconnected(self):
+    for i in range(SNAKE.shape[0]):
+      for j in range(SNAKE.shape[1]):
+        with self.test_session():
+          # If we disconnect any part of the snake except for the endpoints,
+          # there will be 2 components.
+          if SNAKE[i, j] and (i, j) not in [(1, 1), (6, 3)]:
+            disconnected_snake = SNAKE.copy()
+            disconnected_snake[i, j] = 0
+            components = image_ops.connected_components(
+                math_ops.cast(disconnected_snake, dtypes.bool)).eval()
+            self.assertEqual(components.max(), 2, 'disconnect (%d, %d)' % (i,
+                                                                           j))
+            bins = np.bincount(components.ravel())
+            # Nonzero number of pixels labeled 0, 1, or 2.
+            self.assertGreater(bins[0], 0)
+            self.assertGreater(bins[1], 0)
+            self.assertGreater(bins[2], 0)
+
+  def testMultipleImages(self):
+    images = [[[1, 1, 1, 1],
+               [1, 0, 0, 1],
+               [1, 0, 0, 1],
+               [1, 1, 1, 1]],
+              [[1, 0, 0, 1],
+               [0, 0, 0, 0],
+               [0, 0, 0, 0],
+               [1, 0, 0, 1]],
+              [[1, 1, 0, 1],
+               [0, 1, 1, 0],
+               [1, 0, 1, 0],
+               [0, 0, 1, 1]]]  # pyformat: disable
+    expected = [[[1, 1, 1, 1],
+                 [1, 0, 0, 1],
+                 [1, 0, 0, 1],
+                 [1, 1, 1, 1]],
+                [[2, 0, 0, 3],
+                 [0, 0, 0, 0],
+                 [0, 0, 0, 0],
+                 [4, 0, 0, 5]],
+                [[6, 6, 0, 7],
+                 [0, 6, 6, 0],
+                 [8, 0, 6, 0],
+                 [0, 0, 6, 6]]]  # pyformat: disable
+    with self.test_session():
+      self.assertAllEqual(
+          image_ops.connected_components(math_ops.cast(
+              images, dtypes.bool)).eval(), expected)
+
+  def testZeros(self):
+    with self.test_session():
+      self.assertAllEqual(
+          image_ops.connected_components(
+              array_ops.zeros((100, 20, 50), dtypes.bool)).eval(),
+          np.zeros((100, 20, 50)))
+
+  def testOnes(self):
+    with self.test_session():
+      self.assertAllEqual(
+          image_ops.connected_components(
+              array_ops.ones((100, 20, 50), dtypes.bool)).eval(),
+          np.tile(np.arange(100)[:, None, None] + 1, [1, 20, 50]))
+
+  def testOnes_small(self):
+    with self.test_session():
+      self.assertAllEqual(
+          image_ops.connected_components(array_ops.ones((3, 5),
+                                                        dtypes.bool)).eval(),
+          np.ones((3, 5)))
+
+  def testRandom_scipy(self):
+    np.random.seed(42)
+    images = np.random.randint(0, 2, size=(10, 100, 200)).astype(np.bool)
+    expected = connected_components_reference_implementation(images)
+    if expected is None:
+      return
+    with self.test_session():
+      self.assertAllEqual(
+          image_ops.connected_components(images).eval(), expected)
+
+
+def connected_components_reference_implementation(images):
+  try:
+    # pylint: disable=g-import-not-at-top
+    from scipy.ndimage import measurements
+  except ImportError:
+    logging.exception('Skipping test method because scipy could not be loaded')
+    return
+  image_or_images = np.asarray(images)
+  if len(image_or_images.shape) == 2:
+    images = image_or_images[None, :, :]
+  elif len(image_or_images.shape) == 3:
+    images = image_or_images
+  components = np.asarray([measurements.label(image)[0] for image in images])
+  # Get the count of nonzero ids for each image, and offset each image's nonzero
+  # ids using the cumulative sum.
+  num_ids_per_image = components.reshape(
+      [-1, components.shape[1] * components.shape[2]]).max(axis=-1)
+  positive_id_start_per_image = np.cumsum(num_ids_per_image)
+  for i in range(components.shape[0]):
+    new_id_start = positive_id_start_per_image[i - 1] if i > 0 else 0
+    components[i, components[i] > 0] += new_id_start
+  if len(image_or_images.shape) == 2:
+    return components[0, :, :]
+  else:
+    return components
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/contrib/image/python/kernel_tests/single_image_random_dot_stereograms_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/single_image_random_dot_stereograms_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f4029e558d92a2b6539456bf9cf49ec2d21c9f3
--- /dev/null
+++ b/tensorflow/contrib/image/python/kernel_tests/single_image_random_dot_stereograms_ops_test.py
@@ -0,0 +1,82 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for python single_image_random_dot_stereograms_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.image.python.ops.single_image_random_dot_stereograms \
+    import single_image_random_dot_stereograms
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import googletest
+
+class SingleImageRandomDotStereogramsTest(test_util.TensorFlowTestCase):
+
+  def test_shape_function_default(self):
+    """
+    NOTE: The output_image_shape is [X, Y, C]
+    while the output data is [Y, X, C] (or [H, W, C]).
+    As a result, by default the output_image_shape has the value
+    of [1024, 768, 1], but the output data will be [768, 1024, 1].
+    """
+    x_np = [[1, 2, 3, 3, 2, 1],
+            [1, 2, 3, 4, 5, 2],
+            [1, 2, 3, 4, 5, 3],
+            [1, 2, 3, 4, 5, 4],
+            [6, 5, 4, 4, 5, 5]]
+    x_tf = constant_op.constant(x_np)
+    # By default [1024, 768, 1] => [768, 1024, 1].
+    sirds_1 = single_image_random_dot_stereograms(
+        x_tf,
+        convergence_dots_size=8,
+        number_colors=256,
+        normalize=True)
+    shape_1 = sirds_1.get_shape().as_list()
+    self.assertEqual(shape_1, [768, 1024, 1])
+    with self.test_session():
+      r_tf_1 = sirds_1.eval()
+      self.assertAllEqual(shape_1, r_tf_1.shape)
+
+    # If color > 256 then [1024, 768, 3] => [768, 1024, 3].
+    sirds_2 = single_image_random_dot_stereograms(
+        x_tf,
+        convergence_dots_size=8,
+        number_colors=512,
+        normalize=True)
+    shape_2 = sirds_2.get_shape().as_list()
+    self.assertEqual(shape_2, [768, 1024, 3])
+    with self.test_session():
+      r_tf_2 = sirds_2.eval()
+      self.assertAllEqual(shape_2, r_tf_2.shape)
+
+    # If explicitly set output_image_shape to [1200, 800, 1],
+    # then the output data should be [800, 1200, 1].
+    sirds_3 = single_image_random_dot_stereograms(
+        x_tf,
+        convergence_dots_size=8,
+        number_colors=256,
+        normalize=True,
+        output_image_shape=[1200, 800, 1])
+    shape_3 = sirds_3.get_shape().as_list()
+    self.assertEqual(shape_3, [800, 1200, 1])
+    with self.test_session():
+      r_tf_3 = sirds_3.eval()
+      self.assertAllEqual(shape_3, r_tf_3.shape)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index faedee6f87772016561671bacd87f88657eafffb..c139ae89d8d682d6b87813c3a21703ffa762f28e 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import resource_loader
@@ -34,11 +35,12 @@ _image_ops_so = loader.load_op_library(
 _IMAGE_DTYPES = set(
     [dtypes.uint8, dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64])
 
+ops.RegisterShape("ImageConnectedComponents")(common_shapes.call_cpp_shape_fn)
 ops.RegisterShape("ImageProjectiveTransform")(common_shapes.call_cpp_shape_fn)
 
 
 def rotate(images, angles, interpolation="NEAREST", name=None):
-  """Rotate image(s) by the passed angle(s) in radians.
+  """Rotate image(s) counterclockwise by the passed angle(s) in radians.
 
   Args:
     images: A tensor of shape (num_images, num_rows, num_columns, num_channels)
@@ -288,31 +290,76 @@ def compose_transforms(*transforms):
   """
   assert transforms, "transforms cannot be empty"
   with ops.name_scope("compose_transforms"):
-    composed = _flat_transforms_to_matrices(transforms[0])
+    composed = flat_transforms_to_matrices(transforms[0])
     for tr in transforms[1:]:
       # Multiply batches of matrices.
-      composed = math_ops.matmul(composed, _flat_transforms_to_matrices(tr))
-    return _transform_matrices_to_flat(composed)
+      composed = math_ops.matmul(composed, flat_transforms_to_matrices(tr))
+    return matrices_to_flat_transforms(composed)
 
 
-def _flat_transforms_to_matrices(transforms):
-  # Make the transform(s) 2D in case the input is a single transform.
-  transforms = array_ops.reshape(transforms, constant_op.constant([-1, 8]))
-  num_transforms = array_ops.shape(transforms)[0]
-  # Add a column of ones for the implicit last entry in the matrix.
-  return array_ops.reshape(
-      array_ops.concat(
-          [transforms, array_ops.ones([num_transforms, 1])], axis=1),
-      constant_op.constant([-1, 3, 3]))
+def flat_transforms_to_matrices(transforms):
+  """Converts `tf.contrib.image` projective transforms to affine matrices.
 
+  Note that the output matrices map output coordinates to input coordinates. For
+  the forward transformation matrix, call `tf.linalg.inv` on the result.
 
-def _transform_matrices_to_flat(transform_matrices):
-  # Flatten each matrix.
-  transforms = array_ops.reshape(transform_matrices,
-                                 constant_op.constant([-1, 9]))
-  # Divide each matrix by the last entry (normally 1).
-  transforms /= transforms[:, 8:9]
-  return transforms[:, :8]
+  Args:
+    transforms: Vector of length 8, or batches of transforms with shape
+      `(N, 8)`.
+
+  Returns:
+    3D tensor of matrices with shape `(N, 3, 3)`. The output matrices map the
+      *output coordinates* (in homogeneous coordinates) of each transform to the
+      corresponding *input coordinates*.
+
+  Raises:
+    ValueError: If `transforms` have an invalid shape.
+  """
+  with ops.name_scope("flat_transforms_to_matrices"):
+    transforms = ops.convert_to_tensor(transforms, name="transforms")
+    if transforms.shape.ndims not in (1, 2):
+      raise ValueError("Transforms should be 1D or 2D, got: %s" % transforms)
+    # Make the transform(s) 2D in case the input is a single transform.
+    transforms = array_ops.reshape(transforms, constant_op.constant([-1, 8]))
+    num_transforms = array_ops.shape(transforms)[0]
+    # Add a column of ones for the implicit last entry in the matrix.
+    return array_ops.reshape(
+        array_ops.concat(
+            [transforms, array_ops.ones([num_transforms, 1])], axis=1),
+        constant_op.constant([-1, 3, 3]))
+
+
+def matrices_to_flat_transforms(transform_matrices):
+  """Converts affine matrices to `tf.contrib.image` projective transforms.
+
+  Note that we expect matrices that map output coordinates to input coordinates.
+  To convert forward transformation matrices, call `tf.linalg.inv` on the
+  matrices and use the result here.
+
+  Args:
+    transform_matrices: One or more affine transformation matrices, for the
+      reverse transformation in homogeneous coordinates. Shape `(3, 3)` or
+      `(N, 3, 3)`.
+
+  Returns:
+    2D tensor of flat transforms with shape `(N, 8)`, which may be passed into
+      `tf.contrib.image.transform`.
+
+  Raises:
+    ValueError: If `transform_matrices` have an invalid shape.
+  """
+  with ops.name_scope("matrices_to_flat_transforms"):
+    transform_matrices = ops.convert_to_tensor(
+        transform_matrices, name="transform_matrices")
+    if transform_matrices.shape.ndims not in (2, 3):
+      raise ValueError(
+          "Matrices should be 2D or 3D, got: %s" % transform_matrices)
+    # Flatten each matrix.
+    transforms = array_ops.reshape(transform_matrices,
+                                   constant_op.constant([-1, 9]))
+    # Divide each matrix by the last entry (normally 1).
+    transforms /= transforms[:, 8:9]
+    return transforms[:, :8]
 
 
 @ops.RegisterGradient("ImageProjectiveTransform")
@@ -344,9 +391,9 @@ def _image_projective_transform_grad(op, grad):
     raise TypeError("Transforms should have rank 1 or 2.")
 
   # Invert transformations
-  transforms = _flat_transforms_to_matrices(transforms=transforms)
+  transforms = flat_transforms_to_matrices(transforms=transforms)
   inverse = linalg_ops.matrix_inverse(transforms)
-  transforms = _transform_matrices_to_flat(inverse)
+  transforms = matrices_to_flat_transforms(inverse)
   output = gen_image_ops.image_projective_transform(
       grad, transforms, interpolation=interpolation)
   if len(image_or_images.get_shape()) == 2:
@@ -395,4 +442,72 @@ def bipartite_match(distance_mat,
   return result
 
 
+def connected_components(images):
+  """Labels the connected components in a batch of images.
+
+  A component is a set of pixels in a single input image, which are all adjacent
+  and all have the same non-zero value. The components using a squared
+  connectivity of one (all True entries are joined with their neighbors above,
+  below, left, and right). Components across all images have consecutive ids 1
+  through n. Components are labeled according to the first pixel of the
+  component appearing in row-major order (lexicographic order by
+  image_index_in_batch, row, col). Zero entries all have an output id of 0.
+
+  This op is equivalent with `scipy.ndimage.measurements.label` on a 2D array
+  with the default structuring element (which is the connectivity used here).
+
+  Args:
+    images: A 2D (H, W) or 3D (N, H, W) Tensor of boolean image(s).
+
+  Returns:
+    Components with the same shape as `images`. False entries in `images` have
+    value 0, and all True entries map to a component id > 0.
+
+  Raises:
+    TypeError: if `images` is not 2D or 3D.
+  """
+  with ops.name_scope("connected_components"):
+    image_or_images = ops.convert_to_tensor(images, name="images")
+    if len(image_or_images.get_shape()) == 2:
+      images = image_or_images[None, :, :]
+    elif len(image_or_images.get_shape()) == 3:
+      images = image_or_images
+    else:
+      raise TypeError(
+          "images should have rank 2 (HW) or 3 (NHW). Static shape is %s" %
+          image_or_images.get_shape())
+    components = gen_image_ops.image_connected_components(images)
+
+    # TODO(ringwalt): Component id renaming should be done in the op, to avoid
+    # constructing multiple additional large tensors.
+    components_flat = array_ops.reshape(components, [-1])
+    unique_ids, id_index = array_ops.unique(components_flat)
+    id_is_zero = array_ops.where(math_ops.equal(unique_ids, 0))[:, 0]
+    # Map each nonzero id to consecutive values.
+    nonzero_consecutive_ids = math_ops.range(
+        array_ops.shape(unique_ids)[0] - array_ops.shape(id_is_zero)[0]) + 1
+
+    def no_zero():
+      # No need to insert a zero into the ids.
+      return nonzero_consecutive_ids
+
+    def has_zero():
+      # Insert a zero in the consecutive ids where zero appears in unique_ids.
+      # id_is_zero has length 1.
+      zero_id_ind = math_ops.to_int32(id_is_zero[0])
+      ids_before = nonzero_consecutive_ids[:zero_id_ind]
+      ids_after = nonzero_consecutive_ids[zero_id_ind:]
+      return array_ops.concat([ids_before, [0], ids_after], axis=0)
+
+    new_ids = control_flow_ops.cond(
+        math_ops.equal(array_ops.shape(id_is_zero)[0], 0), no_zero, has_zero)
+    components = array_ops.reshape(
+        array_ops.gather(new_ids, id_index), array_ops.shape(components))
+    if len(image_or_images.get_shape()) == 2:
+      return components[0, :, :]
+    else:
+      return components
+
+
 ops.NotDifferentiable("BipartiteMatch")
+ops.NotDifferentiable("ImageConnectedComponents")
diff --git a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
index bb766e59d2cee648042cc08be466796d9233ad66..d4a6a5bcbb52511d4093587814100b2a0e8b2420 100755
--- a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
+++ b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
@@ -26,18 +26,20 @@ _sirds_ops = loader.load_op_library(
     resource_loader.get_path_to_datafile(
         "_single_image_random_dot_stereograms.so"))
 
-def single_image_random_dot_stereograms(
-    depth_values,
-    hidden_surface_removal=None,
-    convergence_dots_size=None,
-    dots_per_inch=None,
-    eye_separation=None, mu=None,
-    normalize=None, normalize_max=None,
-    normalize_min=None,
-    border_level=None,
-    number_colors=None,
-    output_image_shape=None,
-    output_data_window=None):
+
+def single_image_random_dot_stereograms(depth_values,
+                                        hidden_surface_removal=None,
+                                        convergence_dots_size=None,
+                                        dots_per_inch=None,
+                                        eye_separation=None,
+                                        mu=None,
+                                        normalize=None,
+                                        normalize_max=None,
+                                        normalize_min=None,
+                                        border_level=None,
+                                        number_colors=None,
+                                        output_image_shape=None,
+                                        output_data_window=None):
   """Output a RandomDotStereogram Tensor for export via encode_PNG/JPG OP.
 
   Given the 2-D tensor 'depth_values' with encoded Z values, this operation
@@ -45,7 +47,8 @@ def single_image_random_dot_stereograms(
   for the encode_PNG/JPG ops.  Be careful with image compression as this may
   corrupt the encode 3-D data witin the image.
 
-  Based upon [this paper](http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper).
+  Based upon [this
+  paper](http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper).
 
   This outputs a SIRDS image as picture_out.png:
 
@@ -113,7 +116,8 @@ def single_image_random_dot_stereograms(
       hidden_surface_removal=hidden_surface_removal,
       convergence_dots_size=convergence_dots_size,
       dots_per_inch=dots_per_inch,
-      eye_separation=eye_separation, mu=mu,
+      eye_separation=eye_separation,
+      mu=mu,
       normalize=normalize,
       normalize_max=normalize_max,
       normalize_min=normalize_min,
@@ -123,4 +127,5 @@ def single_image_random_dot_stereograms(
       output_data_window=output_data_window)
   return result
 
+
 ops.NotDifferentiable("SingleImageRandomDotStereograms")
diff --git a/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc b/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc
index ca288c1f737d25faac678f5c199d5c1e49f721cb..886f6798150c57d8066546b0919481d3878882fc 100644
--- a/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc
+++ b/tensorflow/contrib/input_pipeline/kernels/input_pipeline_kernels.cc
@@ -34,9 +34,8 @@ class ObtainNextOp : public OpKernel {
 
     // Allocate output.
     Tensor* output_tensor = nullptr;
-    OP_REQUIRES_OK(
-        ctx,
-        ctx->allocate_output("out_element", TensorShape({}), &output_tensor));
+    OP_REQUIRES_OK(ctx, ctx->allocate_output("out_element", TensorShape({}),
+                                             &output_tensor));
 
     // Obtain mutex for the "counter" tensor.
     mutex* mu;
diff --git a/tensorflow/contrib/kafka/BUILD b/tensorflow/contrib/kafka/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..efb403462a6e5df5b69ac0735ffc03f40d4a252c
--- /dev/null
+++ b/tensorflow/contrib/kafka/BUILD
@@ -0,0 +1,105 @@
+package(
+    default_visibility = ["//visibility:private"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+
+tf_kernel_library(
+    name = "kafka_kernels",
+    srcs = ["kernels/kafka_dataset_ops.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:bounds_check_lib",
+        "//tensorflow/core/kernels:dataset",
+        "//third_party/eigen3",
+        "@kafka",
+    ],
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["kafka_ops"],
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_kafka_ops",
+    out = "python/ops/gen_kafka_ops.py",
+    require_shape_functions = True,
+    deps = [":kafka_ops_op_lib"],
+)
+
+py_library(
+    name = "kafka",
+    srcs = [
+        "__init__.py",
+        "python/ops/kafka_dataset_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":gen_kafka_ops",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/ops:readers",
+    ],
+)
+
+# The Kafka server has to be setup before running the test.
+# The Kafka server is setup through Docker so the Docker engine
+# has to be installed.
+#
+# Once the Docker engine is ready:
+# To setup the Kafka server:
+# $ bash tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh start kafka
+#
+# After the test is complete:
+# To team down the Kafka server:
+# $ bash tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh stop kafka
+tf_py_test(
+    name = "kafka_test",
+    srcs = ["python/kernel_tests/kafka_test.py"],
+    additional_deps = [
+        ":kafka",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+    tags = [
+        "manual",
+        "notap",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/ndlstm/python/__init__.py b/tensorflow/contrib/kafka/__init__.py
similarity index 68%
rename from tensorflow/contrib/ndlstm/python/__init__.py
rename to tensorflow/contrib/kafka/__init__.py
index 1aa51a6ec40c042ca3c26c6b08e5bdb8a42a12bd..4d755c40568dfa2f7f6f617cf3180268837a5ca0 100644
--- a/tensorflow/contrib/ndlstm/python/__init__.py
+++ b/tensorflow/contrib/kafka/__init__.py
@@ -4,7 +4,7 @@
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-# http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
@@ -12,14 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Init file, giving convenient access to all ndlstm ops."""
+"""Kafka Dataset.
+
+@@KafkaDataset
+"""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=wildcard-import,g-importing-member
-from tensorflow.contrib.ndlstm.python.lstm1d import *
-from tensorflow.contrib.ndlstm.python.lstm2d import *
-from tensorflow.contrib.ndlstm.python.misc import *
-# pylint: enable=wildcard-import
+from tensorflow.contrib.kafka.python.ops.kafka_dataset_ops import KafkaDataset
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    "KafkaDataset",
+]
+
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..88ef5f357113372b0a2d0cb13382ac980a61252d
--- /dev/null
+++ b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
@@ -0,0 +1,321 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/framework/tensor.h"
+
+#include "src-cpp/rdkafkacpp.h"
+
+namespace tensorflow {
+
+class KafkaDatasetOp : public DatasetOpKernel {
+ public:
+  using DatasetOpKernel::DatasetOpKernel;
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
+    const Tensor* topics_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("topics", &topics_tensor));
+    OP_REQUIRES(
+        ctx, topics_tensor->dims() <= 1,
+        errors::InvalidArgument("`topics` must be a scalar or a vector."));
+
+    std::vector<string> topics;
+    topics.reserve(topics_tensor->NumElements());
+    for (int i = 0; i < topics_tensor->NumElements(); ++i) {
+      topics.push_back(topics_tensor->flat<string>()(i));
+    }
+
+    std::string servers = "";
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument<std::string>(ctx, "servers", &servers));
+    std::string group = "";
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<std::string>(ctx, "group", &group));
+    bool eof = false;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<bool>(ctx, "eof", &eof));
+    int64 timeout = -1;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "timeout", &timeout));
+    OP_REQUIRES(ctx, (timeout > 0),
+                errors::InvalidArgument(
+                    "Timeout value should be large than 0, got ", timeout));
+    *output = new Dataset(ctx, std::move(topics), servers, group, eof, timeout);
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, std::vector<string> topics,
+            const string& servers, const string& group, const bool eof,
+            const int64 timeout)
+        : GraphDatasetBase(ctx),
+          topics_(std::move(topics)),
+          servers_(servers),
+          group_(group),
+          eof_(eof),
+          timeout_(timeout) {}
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::Kafka")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      static DataTypeVector* dtypes = new DataTypeVector({DT_STRING});
+      return *dtypes;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      static std::vector<PartialTensorShape>* shapes =
+          new std::vector<PartialTensorShape>({{}});
+      return *shapes;
+    }
+
+    string DebugString() override { return "KafkaDatasetOp::Dataset"; }
+
+   protected:
+    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* topics = nullptr;
+      TF_RETURN_IF_ERROR(b->AddVector(topics_, &topics));
+      Node* servers = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(servers_, &servers));
+      Node* group = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(group_, &group));
+      Node* eof = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(eof_, &eof));
+      Node* timeout = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(timeout_, &timeout));
+      TF_RETURN_IF_ERROR(
+          b->AddDataset(this, {topics, servers, group, eof, timeout}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        do {
+          // We are currently processing a topic, so try to read the next line.
+          if (consumer_.get()) {
+            while (true) {
+              if (limit_ >= 0 &&
+                  (topic_partition_->offset() >= limit_ || offset_ >= limit_)) {
+                // EOF current topic
+                break;
+              }
+              std::unique_ptr<RdKafka::Message> message(
+                  consumer_->consume(dataset()->timeout_));
+              if (message->err() == RdKafka::ERR_NO_ERROR) {
+                // Produce the line as output.
+                Tensor line_tensor(cpu_allocator(), DT_STRING, {});
+                line_tensor.scalar<string>()() =
+                    std::string(static_cast<const char*>(message->payload()),
+                                message->len());
+                out_tensors->emplace_back(std::move(line_tensor));
+                *end_of_sequence = false;
+                // Sync offset
+                offset_ = message->offset();
+                return Status::OK();
+              }
+
+              if (message->err() == RdKafka::ERR__PARTITION_EOF &&
+                  dataset()->eof_) {
+                // EOF current topic
+                break;
+              }
+              if (message->err() != RdKafka::ERR__TIMED_OUT) {
+                return errors::Internal("Failed to consume:",
+                                        message->errstr());
+              }
+              message.reset(nullptr);
+              consumer_->poll(0);
+            }
+
+            // We have reached the end of the current topic, so maybe
+            // move on to next topic.
+            ResetStreamsLocked();
+            ++current_topic_index_;
+          }
+
+          // Iteration ends when there are no more topic to process.
+          if (current_topic_index_ == dataset()->topics_.size()) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+
+          TF_RETURN_IF_ERROR(SetupStreamsLocked(ctx->env()));
+        } while (true);
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("current_topic_index"),
+                                               current_topic_index_));
+
+        // `consumer_` is empty if
+        // 1. GetNext has not been called even once.
+        // 2. All topics have been read and iterator has been exhausted.
+        if (consumer_.get()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("current_pos"), offset_));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        ResetStreamsLocked();
+        int64 current_topic_index;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("current_topic_index"),
+                                              &current_topic_index));
+        current_topic_index_ = size_t(current_topic_index);
+        // The key "current_pos" is written only if the iterator was saved
+        // with an open topic.
+        if (reader->Contains(full_name("current_pos"))) {
+          int64 current_pos;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("current_pos"), &current_pos));
+
+          TF_RETURN_IF_ERROR(SetupStreamsLocked(ctx->env()));
+          topic_partition_->set_offset(current_pos);
+          if (topic_partition_->offset() != current_pos) {
+            return errors::Internal("Failed to restore to offset ",
+                                    current_pos);
+          }
+          offset_ = current_pos;
+        }
+        return Status::OK();
+      }
+
+     private:
+      // Sets up Kafka streams to read from the topic at
+      // `current_topic_index_`.
+      Status SetupStreamsLocked(Env* env) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        if (current_topic_index_ >= dataset()->topics_.size()) {
+          return errors::InvalidArgument(
+              "current_topic_index_:", current_topic_index_,
+              " >= topics_.size():", dataset()->topics_.size());
+        }
+
+        // Actually move on to next topic.
+        string entry = dataset()->topics_[current_topic_index_];
+
+        std::vector<string> parts = str_util::Split(entry, ":");
+        if (parts.size() < 1) {
+          return errors::InvalidArgument("Invalid parameters: ", entry);
+        }
+        string topic = parts[0];
+        int32 partition = 0;
+        if (parts.size() > 1) {
+          if (!strings::safe_strto32(parts[1], &partition)) {
+            return errors::InvalidArgument("Invalid parameters: ", entry);
+          }
+        }
+        int64 offset = 0;
+        if (parts.size() > 2) {
+          if (!strings::safe_strto64(parts[2], &offset)) {
+            return errors::InvalidArgument("Invalid parameters: ", entry);
+          }
+        }
+
+        topic_partition_.reset(
+            RdKafka::TopicPartition::create(topic, partition, offset));
+
+        offset_ = topic_partition_->offset();
+        limit_ = -1;
+        if (parts.size() > 3) {
+          if (!strings::safe_strto64(parts[3], &limit_)) {
+            return errors::InvalidArgument("Invalid parameters: ", entry);
+          }
+        }
+
+        std::unique_ptr<RdKafka::Conf> conf(
+            RdKafka::Conf::create(RdKafka::Conf::CONF_GLOBAL));
+        std::unique_ptr<RdKafka::Conf> topic_conf(
+            RdKafka::Conf::create(RdKafka::Conf::CONF_TOPIC));
+
+        std::string errstr;
+
+        RdKafka::Conf::ConfResult result =
+            conf->set("default_topic_conf", topic_conf.get(), errstr);
+        if (result != RdKafka::Conf::CONF_OK) {
+          return errors::Internal("Failed to set default_topic_conf:", errstr);
+        }
+
+        result = conf->set("bootstrap.servers", dataset()->servers_, errstr);
+        if (result != RdKafka::Conf::CONF_OK) {
+          return errors::Internal("Failed to set bootstrap.servers ",
+                                  dataset()->servers_, ":", errstr);
+        }
+        result = conf->set("group.id", dataset()->group_, errstr);
+        if (result != RdKafka::Conf::CONF_OK) {
+          return errors::Internal("Failed to set group.id ", dataset()->group_,
+                                  ":", errstr);
+        }
+
+        consumer_.reset(RdKafka::KafkaConsumer::create(conf.get(), errstr));
+        if (!consumer_.get()) {
+          return errors::Internal("Failed to create consumer:", errstr);
+        }
+
+        std::vector<RdKafka::TopicPartition*> partitions;
+        partitions.emplace_back(topic_partition_.get());
+        RdKafka::ErrorCode err = consumer_->assign(partitions);
+        if (err != RdKafka::ERR_NO_ERROR) {
+          return errors::Internal(
+              "Failed to assign partition [", topic_partition_->topic(), ", ",
+              topic_partition_->partition(), ", ", topic_partition_->offset(),
+              "]:", RdKafka::err2str(err));
+        }
+
+        return Status::OK();
+      }
+
+      // Resets all Kafka streams.
+      void ResetStreamsLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        consumer_->unassign();
+        consumer_->close();
+        consumer_.reset(nullptr);
+      }
+
+      mutex mu_;
+      size_t current_topic_index_ GUARDED_BY(mu_) = 0;
+      int64 offset_ GUARDED_BY(mu_) = 0;
+      int64 limit_ GUARDED_BY(mu_) = -1;
+      std::unique_ptr<RdKafka::TopicPartition> topic_partition_ GUARDED_BY(mu_);
+      std::unique_ptr<RdKafka::KafkaConsumer> consumer_ GUARDED_BY(mu_);
+    };
+
+    const std::vector<string> topics_;
+    const std::string servers_;
+    const std::string group_;
+    const bool eof_;
+    const int64 timeout_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("KafkaDataset").Device(DEVICE_CPU),
+                        KafkaDatasetOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/kafka/ops/kafka_ops.cc b/tensorflow/contrib/kafka/ops/kafka_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8cdf16103bab2b22d51c144d21a589e1e39f2f0b
--- /dev/null
+++ b/tensorflow/contrib/kafka/ops/kafka_ops.cc
@@ -0,0 +1,44 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("KafkaDataset")
+    .Input("topics: string")
+    .Input("servers: string")
+    .Input("group: string")
+    .Input("eof: bool")
+    .Input("timeout: int64")
+    .Output("handle: variant")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that emits the messages of one or more Kafka topics.
+
+topics: A `tf.string` tensor containing one or more subscriptions,
+  in the format of [topic:partition:offset:length],
+  by default length is -1 for unlimited.
+servers: A list of bootstrap servers.
+group: The consumer group id.
+eof: If True, the kafka reader will stop on EOF.
+timeout: The timeout value for the Kafka Consumer to wait
+  (in millisecond).
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..621911876fc502ece76b08eb6c28697b3c12c863
--- /dev/null
+++ b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.py
@@ -0,0 +1,115 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License.  You may obtain a copy of
+# the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations under
+# the License.
+# ==============================================================================
+"""Tests for KafkaDataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.kafka.python.ops import kafka_dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class KafkaDatasetTest(test.TestCase):
+
+  def setUp(self):
+    # The Kafka server has to be setup before the test
+    # and tear down after the test manually.
+    # The docker engine has to be installed.
+    #
+    # To setup the Kafka server:
+    # $ bash kafka_test.sh start kafka
+    #
+    # To team down the Kafka server:
+    # $ bash kafka_test.sh stop kafka
+    pass
+
+  def testKafkaDataset(self):
+    topics = array_ops.placeholder(dtypes.string, shape=[None])
+    num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
+    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
+
+    repeat_dataset = kafka_dataset_ops.KafkaDataset(
+        topics, group="test", eof=True).repeat(num_epochs)
+    batch_dataset = repeat_dataset.batch(batch_size)
+
+    iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
+    init_op = iterator.make_initializer(repeat_dataset)
+    init_batch_op = iterator.make_initializer(batch_dataset)
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      # Basic test: read from topic 0.
+      sess.run(init_op, feed_dict={topics: ["test:0:0:4"], num_epochs: 1})
+      for i in range(5):
+        self.assertEqual("D" + str(i), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Basic test: read from topic 1.
+      sess.run(init_op, feed_dict={topics: ["test:0:5:-1"], num_epochs: 1})
+      for i in range(5):
+        self.assertEqual("D" + str(i + 5), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Basic test: read from both topics.
+      sess.run(
+          init_op,
+          feed_dict={
+              topics: ["test:0:0:4", "test:0:5:-1"],
+              num_epochs: 1
+          })
+      for j in range(2):
+        for i in range(5):
+          self.assertEqual("D" + str(i + j * 5), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test repeated iteration through both files.
+      sess.run(
+          init_op,
+          feed_dict={
+              topics: ["test:0:0:4", "test:0:5:-1"],
+              num_epochs: 10
+          })
+      for _ in range(10):
+        for j in range(2):
+          for i in range(5):
+            self.assertEqual("D" + str(i + j * 5), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test batched and repeated iteration through both files.
+      sess.run(
+          init_batch_op,
+          feed_dict={
+              topics: ["test:0:0:4", "test:0:5:-1"],
+              num_epochs: 10,
+              batch_size: 5
+          })
+      for _ in range(10):
+        self.assertAllEqual(["D" + str(i) for i in range(5)],
+                            sess.run(get_next))
+        self.assertAllEqual(["D" + str(i + 5) for i in range(5)],
+                            sess.run(get_next))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..adf027b8e714124cde2b4618546e20c6b7162e1f
--- /dev/null
+++ b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -e
+set -o pipefail
+
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 start|stop <kafka container name>" >&2
+  exit 1
+fi
+
+container=$2
+if [ "$1" == "start" ]; then
+    docker run -d --rm --net=host --name=$container spotify/kafka
+    echo Wait 5 secs until kafka is up and running
+    sleep 5
+    echo Create test topic
+    docker exec $container bash -c '/opt/kafka_2.11-0.10.1.0/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic test'
+    echo Create test message
+    docker exec $container bash -c 'echo -e "D0\nD1\nD2\nD3\nD4\nD5\nD6\nD7\nD8\nD9" > /test'
+    echo Produce test message
+    docker exec $container bash -c '/opt/kafka_2.11-0.10.1.0/bin/kafka-console-producer.sh --topic test --broker-list 127.0.0.1:9092 < /test'
+
+    echo Container $container started successfully
+elif [ "$1" == "stop" ]; then
+    docker rm -f $container
+
+    echo Container $container stopped successfully
+else
+  echo "Usage: $0 start|stop <kafka container name>" >&2
+  exit 1
+fi
+
+
+
diff --git a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e51d27a342359881de072c3979a2b5a7fc034ea
--- /dev/null
+++ b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
@@ -0,0 +1,74 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Kafka Dataset."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.kafka.python.ops import gen_kafka_ops
+from tensorflow.python.data.ops.readers import Dataset
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+
+
+class KafkaDataset(Dataset):
+  """A Kafka Dataset that consumes the message.
+  """
+
+  def __init__(self,
+               topics,
+               servers="localhost",
+               group="",
+               eof=False,
+               timeout=1000):
+    """Create a KafkaReader.
+
+    Args:
+      topics: A `tf.string` tensor containing one or more subscriptions,
+              in the format of [topic:partition:offset:length],
+              by default length is -1 for unlimited.
+      servers: A list of bootstrap servers.
+      group: The consumer group id.
+      eof: If True, the kafka reader will stop on EOF.
+      timeout: The timeout value for the Kafka Consumer to wait
+               (in millisecond).
+    """
+    super(KafkaDataset, self).__init__()
+    self._topics = ops.convert_to_tensor(
+        topics, dtype=dtypes.string, name="topics")
+    self._servers = ops.convert_to_tensor(
+        servers, dtype=dtypes.string, name="servers")
+    self._group = ops.convert_to_tensor(
+        group, dtype=dtypes.string, name="group")
+    self._eof = ops.convert_to_tensor(eof, dtype=dtypes.bool, name="eof")
+    self._timeout = ops.convert_to_tensor(
+        timeout, dtype=dtypes.int64, name="timeout")
+
+  def _as_variant_tensor(self):
+    return gen_kafka_ops.kafka_dataset(self._topics, self._servers, self._group,
+                                       self._eof, self._timeout)
+
+  @property
+  def output_classes(self):
+    return ops.Tensor
+
+  @property
+  def output_shapes(self):
+    return tensor_shape.scalar()
+
+  @property
+  def output_types(self):
+    return dtypes.string
diff --git a/tensorflow/contrib/keras/api/__init__.py b/tensorflow/contrib/keras/api/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..52e83069cb0c68b510da46149248369dce376647 100644
--- a/tensorflow/contrib/keras/api/__init__.py
+++ b/tensorflow/contrib/keras/api/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/kernel_methods/BUILD b/tensorflow/contrib/kernel_methods/BUILD
index a2f320ab11291e4049c8367e1f133a4fbcb72a62..eff7dfeb4c1117e40f4faf43c5e92a52cffd6528 100644
--- a/tensorflow/contrib/kernel_methods/BUILD
+++ b/tensorflow/contrib/kernel_methods/BUILD
@@ -83,9 +83,11 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":kernel_methods",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//third_party/py/numpy",
     ],
 )
 
diff --git a/tensorflow/contrib/kernel_methods/python/losses.py b/tensorflow/contrib/kernel_methods/python/losses.py
index 208b0e1c9dbe93fb99e17e7be5ed5b6e30f4e201..f182fef067b7f523bc5ca63227265be40528b171 100644
--- a/tensorflow/contrib/kernel_methods/python/losses.py
+++ b/tensorflow/contrib/kernel_methods/python/losses.py
@@ -73,13 +73,13 @@ def sparse_multiclass_hinge_loss(
                                                               labels)) as scope:
 
     # Check logits Tensor has valid rank.
-    logits_shape = logits.get_shape()
-    logits_rank = logits_shape.ndims
+    logits_rank = logits.get_shape().ndims
     if logits_rank != 2:
       raise ValueError(
           'logits should have rank 2 ([batch_size, num_classes]). Given rank is'
           ' {}'.format(logits_rank))
-    batch_size, num_classes = logits_shape[0].value, logits_shape[1].value
+    logits_shape = array_ops.shape(logits)
+    batch_size, num_classes = logits_shape[0], logits_shape[1]
     logits = math_ops.to_float(logits)
 
     # Check labels have valid type.
diff --git a/tensorflow/contrib/kernel_methods/python/losses_test.py b/tensorflow/contrib/kernel_methods/python/losses_test.py
index 8a1a5ffe56ba283bfae514738fa87e4055f8934e..72507539f813d14064bc58f03b6db4781abc9438 100644
--- a/tensorflow/contrib/kernel_methods/python/losses_test.py
+++ b/tensorflow/contrib/kernel_methods/python/losses_test.py
@@ -18,10 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.contrib.kernel_methods.python import losses
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
@@ -114,6 +117,27 @@ class SparseMulticlassHingeLossTest(test.TestCase):
       loss = losses.sparse_multiclass_hinge_loss(labels, logits)
       self.assertAlmostEqual(loss.eval(), 0.0, 3)
 
+  def testUnknownShape(self):
+    """Result keeps same with `testZeroLossInt32Labels`"""
+    logits_np = np.array([[1.2, -1.4, -1.0], [1.4, 1.8, 4.0], [0.5, 1.8, -1.0]])
+    labels_np = np.array([0, 2, 1], dtype=np.int32)
+
+    logits_shapes = [
+        [3, 3],  # batch_size, num_classes
+        [None, 3],
+        [3, None],
+        [None, None]
+    ]
+
+    for batch_size, num_classes in logits_shapes:
+      with self.test_session():
+        logits = array_ops.placeholder(
+            dtypes.float32, shape=(batch_size, num_classes))
+        labels = array_ops.placeholder(dtypes.int32, shape=(batch_size,))
+        loss = losses.sparse_multiclass_hinge_loss(labels, logits)
+        result = loss.eval(feed_dict={logits: logits_np, labels: labels_np})
+        self.assertAlmostEqual(result, 0.0, 3)
+
   def testCorrectPredictionsSomeClassesInsideMargin(self):
     """Loss is > 0 even if true class logits are higher than other classes."""
     with self.test_session():
diff --git a/tensorflow/contrib/kfac/examples/convnet.py b/tensorflow/contrib/kfac/examples/convnet.py
index 558bc294bc8ac129b3055ed46623c78a0d5a33e3..39d80addaac1fe855a37255b32bf4412b99df46a 100644
--- a/tensorflow/contrib/kfac/examples/convnet.py
+++ b/tensorflow/contrib/kfac/examples/convnet.py
@@ -286,7 +286,7 @@ def minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks, master,
         damping=0.001,
         layer_collection=layer_collection,
         momentum=0.9)
-    inv_update_queue = oq.OpQueue(optimizer.inv_updates_dict.values())
+    inv_update_queue = oq.OpQueue(optimizer.inv_update_ops)
     sync_optimizer = tf.train.SyncReplicasOptimizer(
         opt=optimizer,
         replicas_to_aggregate=_num_gradient_tasks(num_worker_tasks))
diff --git a/tensorflow/contrib/kfac/examples/mlp.py b/tensorflow/contrib/kfac/examples/mlp.py
index 4275ceadc210ff471109b596e1c9aa260ce31ab5..87eed03888c894a04c0521d1ce5ee8975b60776b 100644
--- a/tensorflow/contrib/kfac/examples/mlp.py
+++ b/tensorflow/contrib/kfac/examples/mlp.py
@@ -239,3 +239,88 @@ def train_mnist_multitower(data_dir,
       })
   return minimize(
       loss, accuracy, layer_collection, session_config=session_config)
+
+
+def train_mnist_estimator(data_dir, num_epochs, use_fake_data=False):
+  """Train an MLP on MNIST using tf.estimator.
+
+  Args:
+    data_dir: string. Directory to read MNIST examples from.
+    num_epochs: int. Number of passes to make over the training set.
+    use_fake_data: bool. If True, generate a synthetic dataset.
+
+  Returns:
+    accuracy of model on the final minibatch of training data.
+  """
+
+  # Load a dataset.
+  def input_fn():
+    tf.logging.info("Loading MNIST into memory.")
+    return mnist.load_mnist(
+        data_dir,
+        num_epochs=num_epochs,
+        batch_size=64,
+        flatten_images=True,
+        use_fake_data=use_fake_data)
+
+  def model_fn(features, labels, mode, params):
+    """Model function for MLP trained with K-FAC.
+
+    Args:
+      features: Tensor of shape [batch_size, input_size]. Input features.
+      labels: Tensor of shape [batch_size]. Target labels for training.
+      mode: tf.estimator.ModeKey. Must be TRAIN.
+      params: ignored.
+
+    Returns:
+      EstimatorSpec for training.
+
+    Raises:
+      ValueError: If 'mode' is anything other than TRAIN.
+    """
+    del params
+
+    if mode != tf.estimator.ModeKeys.TRAIN:
+      raise ValueError("Only training is supposed with this API.")
+
+    # Build a ConvNet.
+    layer_collection = lc.LayerCollection()
+    loss, accuracy = build_model(
+        features, labels, num_labels=10, layer_collection=layer_collection)
+
+    # Train with K-FAC.
+    global_step = tf.train.get_or_create_global_step()
+    optimizer = opt.KfacOptimizer(
+        learning_rate=tf.train.exponential_decay(
+            0.00002, global_step, 10000, 0.5, staircase=True),
+        cov_ema_decay=0.95,
+        damping=0.0001,
+        layer_collection=layer_collection,
+        momentum=0.99)
+
+    # Run cov_update_op every step. Run 1 inv_update_ops per step.
+    cov_update_op = optimizer.cov_update_op
+    inv_update_op = tf.group(
+        tf.contrib.kfac.utils.batch_execute(
+            global_step, optimizer.inv_update_thunks, batch_size=1))
+    with tf.control_dependencies([cov_update_op, inv_update_op]):
+      train_op = optimizer.minimize(loss, global_step=global_step)
+
+    # Print metrics every 5 sec.
+    hooks = [
+        tf.train.LoggingTensorHook(
+            {
+                "loss": loss,
+                "accuracy": accuracy
+            }, every_n_secs=5),
+    ]
+    return tf.estimator.EstimatorSpec(
+        mode=mode, loss=loss, train_op=train_op, training_hooks=hooks)
+
+  run_config = tf.estimator.RunConfig(
+      model_dir="/tmp/mnist", save_checkpoints_steps=1, keep_checkpoint_max=100)
+
+  # Train until input_fn() is empty with Estimator. This is a prerequisite for
+  # TPU compatibility.
+  estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)
+  estimator.train(input_fn=input_fn)
diff --git a/tensorflow/contrib/kfac/examples/mlp_mnist_main.py b/tensorflow/contrib/kfac/examples/mlp_mnist_main.py
index b318c71a568be2d717745579df24134ceb3b6a0b..9c34ade1d2018135b3636fddb9dcc65839cd59de 100644
--- a/tensorflow/contrib/kfac/examples/mlp_mnist_main.py
+++ b/tensorflow/contrib/kfac/examples/mlp_mnist_main.py
@@ -33,7 +33,11 @@ FLAGS = None
 
 def main(argv):
   _ = argv
-  if FLAGS.num_towers > 1:
+  if FLAGS.use_estimator:
+    if FLAGS.num_towers != 1:
+      raise ValueError("Only 1 device supported in tf.estimator example.")
+    mlp.train_mnist_estimator(FLAGS.data_dir, num_epochs=200)
+  elif FLAGS.num_towers > 1:
     mlp.train_mnist_multitower(
         FLAGS.data_dir, num_epochs=200, num_towers=FLAGS.num_towers)
   else:
@@ -52,5 +56,9 @@ if __name__ == "__main__":
       type=int,
       default=1,
       help="Number of CPUs to split minibatch across.")
+  parser.add_argument(
+      "--use_estimator",
+      action="store_true",
+      help="Use tf.estimator API to train.")
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/contrib/kfac/examples/mnist.py b/tensorflow/contrib/kfac/examples/mnist.py
index cf92c909f4b5201bc0ffda5703136f46c7058ec6..547c4ab25d589192f2a5b65987be3b05128fe298 100644
--- a/tensorflow/contrib/kfac/examples/mnist.py
+++ b/tensorflow/contrib/kfac/examples/mnist.py
@@ -63,7 +63,7 @@ def load_mnist(data_dir,
     images = mnist_data.train.images
     labels = mnist_data.train.labels
 
-  dataset = tf.contrib.data.Dataset.from_tensor_slices((np.asarray(
+  dataset = tf.data.Dataset.from_tensor_slices((np.asarray(
       images, dtype=np.float32), np.asarray(labels, dtype=np.int64)))
   return (dataset.repeat(num_epochs).shuffle(num_examples).batch(batch_size)
           .make_one_shot_iterator().get_next())
diff --git a/tensorflow/contrib/kfac/examples/tests/convnet_test.py b/tensorflow/contrib/kfac/examples/tests/convnet_test.py
index 3c98c54ef6cbd527aa0035e0b6f40be961c6308d..8d86c2bb5150cd4bc8a2b21ba050e904929e0fe9 100644
--- a/tensorflow/contrib/kfac/examples/tests/convnet_test.py
+++ b/tensorflow/contrib/kfac/examples/tests/convnet_test.py
@@ -96,7 +96,7 @@ class ConvNetTest(tf.test.TestCase):
     """
     x = np.asarray([[1.], [2.]]).astype(np.float32)
     y = np.asarray([1., 2.]).astype(np.float32)
-    x, y = (tf.contrib.data.Dataset.from_tensor_slices((x, y))
+    x, y = (tf.data.Dataset.from_tensor_slices((x, y))
             .repeat(100).batch(2).make_one_shot_iterator().get_next())
     w = tf.get_variable("w", shape=[1, 1], initializer=tf.zeros_initializer())
     y_hat = tf.matmul(x, w)
diff --git a/tensorflow/contrib/kfac/examples/tests/mlp_test.py b/tensorflow/contrib/kfac/examples/tests/mlp_test.py
index 34a942d27f64e2583c686c2ba3240bc636ed918b..22da6c29f1b364d94432315988d844db9b95ec28 100644
--- a/tensorflow/contrib/kfac/examples/tests/mlp_test.py
+++ b/tensorflow/contrib/kfac/examples/tests/mlp_test.py
@@ -53,6 +53,11 @@ class MlpTest(tf.test.TestCase):
       mlp.train_mnist_multitower(
           data_dir=None, num_epochs=1, num_towers=2, use_fake_data=True)
 
+  def testTrainMnistEstimator(self):
+    with tf.Graph().as_default():
+      # Ensure model training doesn't crash.
+      mlp.train_mnist_estimator(data_dir=None, num_epochs=1, use_fake_data=True)
+
 
 if __name__ == "__main__":
   tf.test.main()
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/BUILD b/tensorflow/contrib/kfac/python/kernel_tests/BUILD
index 95fba59e3c96ae3c69e0b154740785b0d2bcb3c9..f4ed978174a9ddd8b54a88e60bfb48a67a2e76d2 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/kfac/python/kernel_tests/BUILD
@@ -17,12 +17,17 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:linalg_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -110,12 +115,15 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/contrib/kfac/python/ops:utils",
+        "//tensorflow/contrib/tpu",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:linalg_ops",
         "//tensorflow/python:random_seed",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py b/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py
index 9b28c45c7263208d21b1514ae5f05b7e81e315a3..bfdb69ad02caaa57827e0ae6b3c9fc0d0ed03754 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.contrib.kfac.python.ops import estimator
 from tensorflow.contrib.kfac.python.ops import layer_collection as lc
 from tensorflow.contrib.kfac.python.ops import utils
@@ -25,11 +27,15 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import training_util
 
 _ALL_ESTIMATION_MODES = ["gradients", "empirical", "curvature_prop", "exact"]
 
@@ -119,6 +125,114 @@ class EstimatorTest(test.TestCase):
         estimator.FisherEstimator([self.weights], 0.1, 0.2,
                                   self.layer_collection, mode)
 
+  def test_cov_update_thunks(self):
+    """Ensures covariance update ops run once per global_step."""
+    with self._graph.as_default(), self.test_session() as sess:
+      fisher_estimator = estimator.FisherEstimator(
+          variables=[self.weights],
+          layer_collection=self.layer_collection,
+          cov_ema_decay=0.0,
+          damping=0.0)
+
+      # Construct an op that executes one covariance update per step.
+      global_step = training_util.get_or_create_global_step()
+      cov_matrices = [
+          fisher_factor.get_cov()
+          for fisher_factor in self.layer_collection.get_factors()
+      ]
+      cov_update_op_thunks = fisher_estimator.cov_update_thunks
+      cov_update_op = control_flow_ops.case(
+          [(math_ops.equal(global_step, i), thunk)
+           for i, thunk in enumerate(cov_update_op_thunks)])
+      increment_global_step = global_step.assign_add(1)
+
+      sess.run(variables.global_variables_initializer())
+      initial_cov_values = sess.run(cov_matrices)
+
+      # Ensure there's one update per covariance matrix.
+      self.assertEqual(len(cov_matrices), len(cov_update_op_thunks))
+
+      # Test is no-op if only 1 covariance matrix.
+      assert len(cov_matrices) > 1
+
+      for i in range(len(cov_matrices)):
+        # Compare new and old covariance values
+        new_cov_values = sess.run(cov_matrices)
+        is_cov_equal = [
+            np.allclose(initial_cov_value, new_cov_value)
+            for (initial_cov_value,
+                 new_cov_value) in zip(initial_cov_values, new_cov_values)
+        ]
+        num_cov_equal = sum(is_cov_equal)
+
+        # Ensure exactly one covariance matrix changes per step.
+        self.assertEqual(num_cov_equal, len(cov_matrices) - i)
+
+        # Run all covariance update ops.
+        sess.run(cov_update_op)
+        sess.run(increment_global_step)
+
+  def test_inv_update_thunks(self):
+    """Ensures inverse update ops run once per global_step."""
+    with self._graph.as_default(), self.test_session() as sess:
+      fisher_estimator = estimator.FisherEstimator(
+          variables=[self.weights],
+          layer_collection=self.layer_collection,
+          cov_ema_decay=0.0,
+          damping=0.0)
+
+      # Construct op that updates one inverse per global step.
+      global_step = training_util.get_or_create_global_step()
+      inv_matrices = [
+          matrix
+          for fisher_factor in self.layer_collection.get_factors()
+          for matrix in fisher_factor._inverses_by_damping.values()
+      ]
+      inv_update_op_thunks = fisher_estimator.inv_update_thunks
+      inv_update_op = control_flow_ops.case(
+          [(math_ops.equal(global_step, i), thunk)
+           for i, thunk in enumerate(inv_update_op_thunks)])
+      increment_global_step = global_step.assign_add(1)
+
+      sess.run(variables.global_variables_initializer())
+      initial_inv_values = sess.run(inv_matrices)
+
+      # Ensure there's one update per inverse matrix. This is true as long as
+      # there's no fan-in/fan-out or parameter re-use.
+      self.assertEqual(len(inv_matrices), len(inv_update_op_thunks))
+
+      # Test is no-op if only 1 invariance matrix.
+      assert len(inv_matrices) > 1
+
+      # Assign each covariance matrix a value other than the identity. This
+      # ensures that the inverse matrices are updated to something different as
+      # well.
+      cov_matrices = [
+          fisher_factor.get_cov()
+          for fisher_factor in self.layer_collection.get_factors()
+      ]
+      sess.run([
+          cov_matrix.assign(2 * linalg_ops.eye(int(cov_matrix.shape[0])))
+          for cov_matrix in cov_matrices
+      ])
+
+      for i in range(len(inv_matrices)):
+        # Compare new and old inverse values
+        new_inv_values = sess.run(inv_matrices)
+        is_inv_equal = [
+            np.allclose(initial_inv_value, new_inv_value)
+            for (initial_inv_value,
+                 new_inv_value) in zip(initial_inv_values, new_inv_values)
+        ]
+        num_inv_equal = sum(is_inv_equal)
+
+        # Ensure exactly one inverse matrix changes per step.
+        self.assertEqual(num_inv_equal, len(inv_matrices) - i)
+
+        # Run all inverse update ops.
+        sess.run(inv_update_op)
+        sess.run(increment_global_step)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py b/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py
index 5f2b5c6cace9cd18f4cc5590ff55a9b39680a381..82accd57f0c37d140238f1884fce956654d14227 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py
@@ -40,6 +40,21 @@ def _make_psd(dim):
   return array_ops.constant(mat)
 
 
+class UtilsTest(test.TestCase):
+
+  def testComputePiTracenorm(self):
+    with ops.Graph().as_default(), self.test_session() as sess:
+      random_seed.set_random_seed(200)
+      left_factor = array_ops.diag([1., 2., 0., 1.])
+      right_factor = array_ops.ones([2., 2.])
+
+      # pi is the sqrt of the left trace norm divided by the right trace norm
+      pi = fb.compute_pi_tracenorm(left_factor, right_factor)
+
+      pi_val = sess.run(pi)
+      self.assertEqual(1., pi_val)
+
+
 class FullFBTest(test.TestCase):
 
   def testFullFBInitSingleTensor(self):
@@ -301,8 +316,7 @@ class FullyConnectedDiagonalFB(test.TestCase):
     multiply_result_big, multiply_inverse_result_big = self.runFisherBlockOps(
         self.w, [self.inputs], [self.outputs], [self.output_grads])
     multiply_result_small, multiply_inverse_result_small = (
-        self.runFisherBlockOps(self.w,
-                               np.split(self.inputs, 2),
+        self.runFisherBlockOps(self.w, np.split(self.inputs, 2),
                                np.split(self.outputs, 2),
                                np.split(self.output_grads, 2)))
 
@@ -584,8 +598,7 @@ class ConvDiagonalFBTest(test.TestCase):
     multiply_result_big, multiply_inverse_result_big = self.runFisherBlockOps(
         self.w, [self.inputs], [self.outputs], [self.output_grads])
     multiply_result_small, multiply_inverse_result_small = (
-        self.runFisherBlockOps(self.w,
-                               np.split(self.inputs, 2),
+        self.runFisherBlockOps(self.w, np.split(self.inputs, 2),
                                np.split(self.outputs, 2),
                                np.split(self.output_grads, 2)))
 
@@ -608,8 +621,9 @@ class ConvDiagonalFBTest(test.TestCase):
         self.kernel_size, self.kernel_size, self.input_channels + 1,
         self.output_channels
     ])
-    expected_result = (expected_result[:, :, 0:-1, :], np.reshape(
-        expected_result[:, :, -1, :], [self.output_channels]))
+    expected_result = (expected_result[:, :, 0:-1, :],
+                       np.reshape(expected_result[:, :, -1, :],
+                                  [self.output_channels]))
 
     self.assertEqual(len(result), 2)
     self.assertAllClose(expected_result[0], result[0])
@@ -692,8 +706,8 @@ class ConvKFCBasicFBTest(test.TestCase):
       sess.run(block._input_factor.make_inverse_update_ops())
       sess.run(block._output_factor.make_inverse_update_ops())
 
-      vector = (np.arange(1, 15).reshape(7, 2).astype(np.float32), np.arange(
-          2, 4).reshape(2, 1).astype(np.float32))
+      vector = (np.arange(1, 15).reshape(7, 2).astype(np.float32),
+                np.arange(2, 4).reshape(2, 1).astype(np.float32))
       output = block.multiply_inverse((array_ops.constant(vector[0]),
                                        array_ops.constant(vector[1])))
 
@@ -776,11 +790,50 @@ class ConvKFCBasicFBTest(test.TestCase):
       self.assertAllClose(output_flat, explicit)
 
 
+class FullyConnectedSeriesFBTest(test.TestCase):
+
+  def testFullyConnectedSeriesFBInit(self):
+    with ops.Graph().as_default():
+      random_seed.set_random_seed(200)
+      inputs = array_ops.constant([1., 2.])
+      outputs = array_ops.constant([3., 4.])
+      block = fb.FullyConnectedSeriesFB(
+          lc.LayerCollection(), inputs=[inputs], outputs=[outputs])
+      self.assertAllEqual([outputs], block.tensors_to_compute_grads())
+
+  def testInstantiateFactorsHasBias(self):
+    with ops.Graph().as_default():
+      random_seed.set_random_seed(200)
+      inputs = array_ops.constant([[1., 2.], [3., 4.]])
+      outputs = array_ops.constant([[3., 4.], [5., 6.]])
+      block = fb.FullyConnectedSeriesFB(
+          lc.LayerCollection(),
+          inputs=[inputs],
+          outputs=[outputs],
+          has_bias=True)
+      grads = outputs**2
+      block.instantiate_factors(((grads,),), 0.5)
+
+  def testInstantiateFactorsNoBias(self):
+    with ops.Graph().as_default():
+      random_seed.set_random_seed(200)
+      inputs = array_ops.constant([[1., 2.], [3., 4.]])
+      outputs = array_ops.constant([[3., 4.], [5., 6.]])
+      block = fb.FullyConnectedSeriesFB(
+          lc.LayerCollection(),
+          inputs=[inputs],
+          outputs=[outputs],
+          has_bias=False)
+      grads = outputs**2
+      block.instantiate_factors(((grads,),), 0.5)
+
+
 def as_tensors(tensor_or_tuple):
   """Converts a potentially nested tuple of np.array to Tensors."""
   if isinstance(tensor_or_tuple, (tuple, list)):
     return tuple(as_tensors(t) for t in tensor_or_tuple)
   return ops.convert_to_tensor(tensor_or_tuple)
 
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py b/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py
index 5e2ce5a3096f5b523fafad56be742154d79e4803..753378d9f4a0d8762bafbee2ec27d6c71783dda1 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py
@@ -35,18 +35,27 @@ from tensorflow.python.platform import test
 
 class MaybeColocateTest(test.TestCase):
 
+  def setUp(self):
+    self._colocate_cov_ops_with_inputs = ff.COLOCATE_COV_OPS_WITH_INPUTS
+
+  def tearDown(self):
+    ff.set_global_constants(
+        colocate_cov_ops_with_inputs=self._colocate_cov_ops_with_inputs)
+
   def testFalse(self):
+    ff.set_global_constants(colocate_cov_ops_with_inputs=False)
     with tf_ops.Graph().as_default():
       a = constant_op.constant([2.0], name='a')
-      with ff._maybe_colocate_with(a, False):
+      with ff.maybe_colocate_with(a):
         b = constant_op.constant(3.0, name='b')
       self.assertEqual([b'loc:@a'], a.op.colocation_groups())
       self.assertEqual([b'loc:@b'], b.op.colocation_groups())
 
   def testTrue(self):
+    ff.set_global_constants(colocate_cov_ops_with_inputs=True)
     with tf_ops.Graph().as_default():
       a = constant_op.constant([2.0], name='a')
-      with ff._maybe_colocate_with(a, True):
+      with ff.maybe_colocate_with(a):
         b = constant_op.constant(3.0, name='b')
       self.assertEqual([b'loc:@a'], a.op.colocation_groups())
       self.assertEqual([b'loc:@a'], b.op.colocation_groups())
@@ -67,12 +76,19 @@ class FisherFactorTestingDummy(ff.FisherFactor):
   def _num_sources(self):
     return 1
 
+  @property
+  def _dtype(self):
+    return dtypes.float32
+
   def _compute_new_cov(self):
     raise NotImplementedError
 
   def instantiate_covariance(self):
     pass
 
+  def make_inverse_update_ops(self):
+    return []
+
 
 class InverseProvidingFactorTestingDummy(ff.InverseProvidingFactor):
   """Dummy class to test the non-abstract methods on ff.InverseProvidingFactor.
@@ -94,6 +110,10 @@ class InverseProvidingFactorTestingDummy(ff.InverseProvidingFactor):
   def _num_sources(self):
     return 1
 
+  @property
+  def _dtype(self):
+    return dtypes.float32
+
   def _compute_new_cov(self):
     raise NotImplementedError
 
@@ -109,7 +129,7 @@ class NumericalUtilsTest(test.TestCase):
       random_seed.set_random_seed(200)
 
       x = npr.randn(100, 3)
-      cov = ff._compute_cov(array_ops.constant(x))
+      cov = ff.compute_cov(array_ops.constant(x))
       np_cov = np.dot(x.T, x) / x.shape[0]
 
       self.assertAllClose(sess.run(cov), np_cov)
@@ -121,7 +141,7 @@ class NumericalUtilsTest(test.TestCase):
 
       normalizer = 10.
       x = npr.randn(100, 3)
-      cov = ff._compute_cov(array_ops.constant(x), normalizer)
+      cov = ff.compute_cov(array_ops.constant(x), normalizer=normalizer)
       np_cov = np.dot(x.T, x) / normalizer
 
       self.assertAllClose(sess.run(cov), np_cov)
@@ -132,7 +152,7 @@ class NumericalUtilsTest(test.TestCase):
 
       m, n = 3, 4
       a = npr.randn(m, n)
-      a_homog = ff._append_homog(array_ops.constant(a))
+      a_homog = ff.append_homog(array_ops.constant(a))
       np_result = np.hstack([a, np.ones((m, 1))])
 
       self.assertAllClose(sess.run(a_homog), np_result)
@@ -267,13 +287,13 @@ class InverseProvidingFactorTest(test.TestCase):
       for i in range(1, ff.EIGENVALUE_DECOMPOSITION_THRESHOLD + 1):
         factor.register_damped_inverse(1. / i)
       ops = factor.make_inverse_update_ops()
-      self.assertEqual(ff.EIGENVALUE_DECOMPOSITION_THRESHOLD, len(ops))
+      self.assertEqual(1, len(ops))
 
       sess.run(tf_variables.global_variables_initializer())
       new_invs = []
+      sess.run(ops)
       for i in range(1, ff.EIGENVALUE_DECOMPOSITION_THRESHOLD + 1):
         # The inverse op will assign the damped inverse of cov to the inv var.
-        sess.run(ops[i - 1])
         new_invs.append(sess.run(factor._inverses_by_damping[1. / i]))
       # We want to see that the new invs are all different from each other.
       for i in range(len(new_invs)):
@@ -331,6 +351,16 @@ class FullFactorTest(test.TestCase):
       factor = ff.FullFactor((tensor,), 32)
       self.assertEqual([6, 6], factor.get_cov().get_shape().as_list())
 
+  def testFullFactorInitFloat64(self):
+    with tf_ops.Graph().as_default():
+      dtype = dtypes.float64_ref
+      random_seed.set_random_seed(200)
+      tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
+      factor = ff.FullFactor((tensor,), 32)
+      cov = factor.get_cov()
+      self.assertEqual(cov.dtype, dtype)
+      self.assertEqual([6, 6], cov.get_shape().as_list())
+
   def testMakeCovarianceUpdateOp(self):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
@@ -351,6 +381,16 @@ class NaiveDiagonalFactorTest(test.TestCase):
       factor = ff.NaiveDiagonalFactor((tensor,), 32)
       self.assertEqual([6, 1], factor.get_cov().get_shape().as_list())
 
+  def testNaiveDiagonalFactorInitFloat64(self):
+    with tf_ops.Graph().as_default():
+      dtype = dtypes.float64_ref
+      random_seed.set_random_seed(200)
+      tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
+      factor = ff.NaiveDiagonalFactor((tensor,), 32)
+      cov = factor.get_cov()
+      self.assertEqual(cov.dtype, dtype)
+      self.assertEqual([6, 1], cov.get_shape().as_list())
+
   def testMakeCovarianceUpdateOp(self):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
@@ -364,18 +404,25 @@ class NaiveDiagonalFactorTest(test.TestCase):
 
 class FullyConnectedKroneckerFactorTest(test.TestCase):
 
-  def _testFullyConnectedKroneckerFactorInit(self, has_bias, final_shape):
+  def _testFullyConnectedKroneckerFactorInit(self,
+                                             has_bias,
+                                             final_shape,
+                                             dtype=dtypes.float32_ref):
     with tf_ops.Graph().as_default():
       random_seed.set_random_seed(200)
-      tensor = array_ops.ones((2, 3), name='a/b/c')
+      tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
       factor = ff.FullyConnectedKroneckerFactor((tensor,), has_bias=has_bias)
-      self.assertEqual(final_shape, factor.get_cov().get_shape().as_list())
+      cov = factor.get_cov()
+      self.assertEqual(cov.dtype, dtype)
+      self.assertEqual(final_shape, cov.get_shape().as_list())
 
   def testFullyConnectedKroneckerFactorInitNoBias(self):
-    self._testFullyConnectedKroneckerFactorInit(False, [3, 3])
+    for dtype in (dtypes.float32_ref, dtypes.float64_ref):
+      self._testFullyConnectedKroneckerFactorInit(False, [3, 3], dtype=dtype)
 
   def testFullyConnectedKroneckerFactorInitWithBias(self):
-    self._testFullyConnectedKroneckerFactorInit(True, [4, 4])
+    for dtype in (dtypes.float32_ref, dtypes.float64_ref):
+      self._testFullyConnectedKroneckerFactorInit(True, [4, 4], dtype=dtype)
 
   def testMakeCovarianceUpdateOpWithBias(self):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
@@ -418,6 +465,18 @@ class ConvInputKroneckerFactorTest(test.TestCase):
       self.assertEqual([1 * 2 * 3 + 1, 1 * 2 * 3 + 1],
                        factor.get_cov().get_shape().as_list())
 
+  def testConvInputKroneckerFactorInitFloat64(self):
+    with tf_ops.Graph().as_default():
+      dtype = dtypes.float64_ref
+      random_seed.set_random_seed(200)
+      tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
+      factor = ff.ConvInputKroneckerFactor(
+          tensor, (1, 2, 3, 4), 3, 2, has_bias=True)
+      cov = factor.get_cov()
+      self.assertEqual(cov.dtype, dtype)
+      self.assertEqual([1 * 2 * 3 + 1, 1 * 2 * 3 + 1],
+                       cov.get_shape().as_list())
+
   def testMakeCovarianceUpdateOpWithBias(self):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
@@ -453,6 +512,16 @@ class ConvOutputKroneckerFactorTest(test.TestCase):
       factor = ff.ConvOutputKroneckerFactor((tensor,))
       self.assertEqual([5, 5], factor.get_cov().get_shape().as_list())
 
+  def testConvOutputKroneckerFactorInitFloat64(self):
+    with tf_ops.Graph().as_default():
+      dtype = dtypes.float64_ref
+      random_seed.set_random_seed(200)
+      tensor = array_ops.ones((2, 3, 4, 5), dtype=dtype, name='a/b/c')
+      factor = ff.ConvOutputKroneckerFactor((tensor,))
+      cov = factor.get_cov()
+      self.assertEqual(cov.dtype, dtype)
+      self.assertEqual([5, 5], cov.get_shape().as_list())
+
   def testConvOutputKroneckerFactorInitNotEnoughDims(self):
     with tf_ops.Graph().as_default():
       random_seed.set_random_seed(200)
@@ -471,5 +540,49 @@ class ConvOutputKroneckerFactorTest(test.TestCase):
       self.assertAllClose([[43, 46.5], [46.5, 51.5]], new_cov)
 
 
+class FullyConnectedMultiKFTest(test.TestCase):
+
+  def testFullyConnectedMultiKFInit(self):
+    with tf_ops.Graph().as_default():
+      random_seed.set_random_seed(200)
+      tensor = array_ops.ones((2, 3), name='a/b/c')
+      tensor_list = [tensor]
+      factor = ff.FullyConnectedMultiKF((tensor_list,), has_bias=False)
+      self.assertEqual([3, 3], factor.get_cov().get_shape().as_list())
+
+  def testFullyConnectedMultiKFInitFloat64(self):
+    with tf_ops.Graph().as_default():
+      dtype = dtypes.float64_ref
+      random_seed.set_random_seed(200)
+      tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
+      tensor_list = [tensor]
+      factor = ff.FullyConnectedMultiKF((tensor_list,), has_bias=False)
+      cov = factor.get_cov()
+      self.assertEqual(cov.dtype, dtype)
+      self.assertEqual([3, 3], cov.get_shape().as_list())
+
+  def testMakeCovarianceUpdateOpWithBias(self):
+    with tf_ops.Graph().as_default(), self.test_session() as sess:
+      random_seed.set_random_seed(200)
+      tensor = array_ops.constant([[1., 2.], [3., 4.]], name='a/b/c')
+      tensor_list = [tensor]
+      factor = ff.FullyConnectedMultiKF((tensor_list,), has_bias=True)
+
+      sess.run(tf_variables.global_variables_initializer())
+      new_cov = sess.run(factor.make_covariance_update_op(.5))
+      self.assertAllClose([[3, 3.5, 1], [3.5, 5.5, 1.5], [1, 1.5, 1]], new_cov)
+
+  def testMakeCovarianceUpdateOpNoBias(self):
+    with tf_ops.Graph().as_default(), self.test_session() as sess:
+      random_seed.set_random_seed(200)
+      tensor = array_ops.constant([[1., 2.], [3., 4.]], name='a/b/c')
+      tensor_list = [tensor]
+      factor = ff.FullyConnectedMultiKF((tensor_list,))
+
+      sess.run(tf_variables.global_variables_initializer())
+      new_cov = sess.run(factor.make_covariance_update_op(.5))
+      self.assertAllClose([[3, 3.5], [3.5, 5.5]], new_cov)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/loss_functions_test.py b/tensorflow/contrib/kfac/python/kernel_tests/loss_functions_test.py
index 39ce3e9337157c8206107bc40c489e44019743ab..ae787b6f1ac90218f2ac73d37fb270df0b822de2 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/loss_functions_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/loss_functions_test.py
@@ -113,6 +113,113 @@ class CategoricalLogitsNegativeLogProbLossTest(test.TestCase):
       self.assertListEqual(loss.input_minibatches, tower_logits)
       self.assertEqual(loss.num_registered_minibatches, num_towers)
 
+  def testMultiplyFisherSingleVector(self):
+    with ops.Graph().as_default(), self.test_session() as sess:
+      logits = np.array([1., 2., 3.])
+      loss = loss_functions.CategoricalLogitsNegativeLogProbLoss(logits)
+
+      # the LossFunction.multiply_fisher docstring only says it supports the
+      # case where the vector is the same shape as the input natural parameters
+      # (i.e. the logits here), but here we also test leading dimensions
+      vector = np.array([1., 2., 3.])
+      vectors = [vector, vector.reshape(1, -1), np.stack([vector] * 4)]
+
+      probs = np.exp(logits - np.logaddexp.reduce(logits))
+      fisher = np.diag(probs) - np.outer(probs, probs)
+
+      for vector in vectors:
+        result = loss.multiply_fisher(vector)
+        expected_result = np.dot(vector, fisher)
+        self.assertAllClose(expected_result, sess.run(result))
+
+  def testMultiplyFisherBatch(self):
+    with ops.Graph().as_default(), self.test_session() as sess:
+      logits = np.array([[1., 2., 3.], [4., 6., 8.]])
+      loss = loss_functions.CategoricalLogitsNegativeLogProbLoss(logits)
+
+      vector = np.array([[1., 2., 3.], [5., 3., 1.]])
+
+      na = np.newaxis
+      probs = np.exp(logits - np.logaddexp.reduce(logits, axis=-1,
+                                                  keepdims=True))
+      fishers = probs[..., na] * np.eye(3) - probs[..., na] * probs[..., na, :]
+
+      result = loss.multiply_fisher(vector)
+      expected_result = np.matmul(vector[..., na, :], fishers)[..., 0, :]
+      self.assertEqual(sess.run(result).shape, logits.shape)
+      self.assertAllClose(expected_result, sess.run(result))
+
+
+class OnehotCategoricalLogitsNegativeLogProbLossTest(test.TestCase):
+
+  def testSample(self):
+    """Ensure samples can be drawn."""
+    with ops.Graph().as_default(), self.test_session() as sess:
+      logits = np.asarray([
+          [0., 0., 0.],  #
+          [1., -1., 0.]
+      ]).astype(np.float32)
+      loss = loss_functions.OnehotCategoricalLogitsNegativeLogProbLoss(
+          array_ops.constant(logits))
+      sample = loss.sample(42)
+      sample = sess.run(sample)
+      self.assertEqual(sample.shape, (2, 3))
+
+  def testEvaluateOnTargets(self):
+    """Ensure log probability can be evaluated correctly."""
+    with ops.Graph().as_default(), self.test_session() as sess:
+      logits = np.asarray([
+          [0., 0., 0.],  #
+          [1., -1., 0.]
+      ]).astype(np.float32)
+      targets = np.asarray([2, 1]).astype(np.int32)
+      loss = loss_functions.OnehotCategoricalLogitsNegativeLogProbLoss(
+          array_ops.constant(logits), targets=array_ops.one_hot(targets, 3))
+      neg_log_prob = loss.evaluate()
+      neg_log_prob = sess.run(neg_log_prob)
+
+      # Calculate explicit log probability of targets.
+      probs = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
+      log_probs = np.log([
+          probs[0, targets[0]],  #
+          probs[1, targets[1]]
+      ])
+      expected_log_prob = np.sum(log_probs)
+
+      self.assertAllClose(neg_log_prob, -expected_log_prob)
+
+  def testEvaluateOnSample(self):
+    """Ensure log probability of a sample can be drawn."""
+    with ops.Graph().as_default(), self.test_session() as sess:
+      logits = np.asarray([
+          [0., 0., 0.],  #
+          [1., -1., 0.]
+      ]).astype(np.float32)
+      loss = loss_functions.OnehotCategoricalLogitsNegativeLogProbLoss(
+          array_ops.constant(logits))
+      neg_log_prob = loss.evaluate_on_sample(42)
+
+      # Simply ensure this doesn't crash. As the output is random, it's
+      # difficult to say if the output is correct or not...
+      neg_log_prob = sess.run(neg_log_prob)
+
+  def testMultiMinibatchRegistration(self):
+    """Ensure this loss function supports registering multiple minibatches."""
+    with ops.Graph().as_default():
+      tower_logits = []
+      loss = None
+      num_towers = 5
+      for _ in range(num_towers):
+        logits = random_ops.random_uniform(shape=[2, 3])
+        tower_logits.append(logits)
+        if loss is None:
+          loss = loss_functions.OnehotCategoricalLogitsNegativeLogProbLoss(
+              logits)
+        else:
+          loss.register_additional_minibatch(logits)
+      self.assertListEqual(loss.input_minibatches, tower_logits)
+      self.assertEqual(loss.num_registered_minibatches, num_towers)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/utils_test.py b/tensorflow/contrib/kfac/python/kernel_tests/utils_test.py
index 55fe38e3e9aab2dbd70a45cdc8fa0c208b036db0..97a97adbf5577cd2694d3055acaa59258ad27964 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/utils_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/utils_test.py
@@ -22,11 +22,15 @@ import numpy as np
 import numpy.random as npr
 
 from tensorflow.contrib.kfac.python.ops import utils
+from tensorflow.contrib.tpu.python.tpu import tpu_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -95,6 +99,18 @@ class SubGraphTest(test.TestCase):
     filtered_list = sub_graph.filter_list(input_list)
     self.assertEqual(filtered_list, [b])
 
+  def testVariableUses(self):
+    with ops.Graph().as_default():
+      var = variable_scope.get_variable('var', shape=[10, 10])
+      resource_var = variable_scope.get_variable(
+          'resource_var', shape=[10, 10], use_resource=True)
+      x = array_ops.zeros([3, 10])
+      z0 = math_ops.matmul(x, var) + math_ops.matmul(x, var)
+      z1 = math_ops.matmul(x, resource_var)
+      sub_graph = utils.SubGraph((z0, z1))
+      self.assertEqual(2, sub_graph.variable_uses(var))
+      self.assertEqual(1, sub_graph.variable_uses(resource_var))
+
 
 class UtilsTest(test.TestCase):
 
@@ -222,18 +238,6 @@ class UtilsTest(test.TestCase):
       self.assertAllClose(b, np.array([4., 5.]))
       self.assertAllClose(c, np.array([[6.], [7.], [8.], [9.]]))
 
-  def testComputePi(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      left_factor = array_ops.diag([1., 2., 0., 1.])
-      right_factor = array_ops.ones([2., 2.])
-
-      # pi is the sqrt of the left trace norm divided by the right trace norm
-      pi = utils.compute_pi(left_factor, right_factor)
-
-      pi_val = sess.run(pi)
-      self.assertEqual(1., pi_val)
-
   def testPosDefInvCholesky(self):
     with ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
@@ -265,6 +269,62 @@ class UtilsTest(test.TestCase):
       np_inv = np.linalg.inv(x + damp * np.eye(size))
       self.assertAllClose(sess.run(tf_inv), np_inv)
 
+  def testCrossReplicaMean(self):
+    """Ensures that cross_replica_mean() executes only when num_shards > 1."""
+    with ops.Graph().as_default():
+      with tpu_function.tpu_shard_context(4):
+        tensor = array_ops.zeros([], dtype=dtypes.float32)
+        mean = utils.cross_replica_mean(tensor)
+      self.assertNotEqual(mean, tensor)
+
+    with ops.Graph().as_default():
+      with tpu_function.tpu_shard_context(1):
+        tensor = array_ops.zeros([], dtype=dtypes.float32)
+        mean = utils.cross_replica_mean(tensor)
+      self.assertEqual(mean, tensor)
+
+    with ops.Graph().as_default():
+      with self.assertRaises(ValueError):  # Outside of TPU context.
+        tensor = array_ops.zeros([], dtype=dtypes.float32)
+        mean = utils.cross_replica_mean(tensor)
+
+  def testBatchExecute(self):
+    """Ensure batch_execute runs in a round-robin fashion."""
+
+    def increment_var(var):
+      return lambda: var.assign_add(1)
+
+    with ops.Graph().as_default(), self.test_session() as sess:
+      i = variable_scope.get_variable('i', initializer=0)
+      accumulators = [
+          variable_scope.get_variable('var%d' % j, initializer=0)
+          for j in range(3)
+      ]
+      thunks = [increment_var(var) for var in accumulators]
+      increment_accumulators = utils.batch_execute(i, thunks, 2)
+      increment_i = i.assign_add(1)
+
+      sess.run(variables.global_variables_initializer())
+
+      # Ensure one op per thunk.
+      self.assertEqual(3, len(increment_accumulators))
+
+      # Ensure round-robin execution.
+      values = []
+      for _ in range(5):
+        sess.run(increment_accumulators)
+        sess.run(increment_i)
+        values.append(sess.run(accumulators))
+      self.assertAllClose(
+          [
+              [1, 1, 0],  #
+              [2, 1, 1],  #
+              [2, 2, 2],  #
+              [3, 3, 2],  #
+              [4, 3, 3]
+          ],
+          values)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/kfac/python/ops/BUILD b/tensorflow/contrib/kfac/python/ops/BUILD
index b2272a4cee09b35ff672514077b4b128b870b772..ee6549b109399766579b6ea18a987ae2c8275983 100644
--- a/tensorflow/contrib/kfac/python/ops/BUILD
+++ b/tensorflow/contrib/kfac/python/ops/BUILD
@@ -38,6 +38,7 @@ py_library(
         ":utils",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
         "//tensorflow/python:linalg_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:special_math_ops",
@@ -64,6 +65,7 @@ py_library(
     srcs = ["loss_functions.py"],
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/contrib/distributions:distributions_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:tensor_shape",
@@ -195,7 +197,9 @@ py_library(
     srcs = ["utils.py"],
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/contrib/tpu",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:gradients",
diff --git a/tensorflow/contrib/kfac/python/ops/estimator.py b/tensorflow/contrib/kfac/python/ops/estimator.py
index 27ff951f16112e09b82ac6885072d966de09983f..a7b1f9d35c931fc44408be804479e758f28f7110 100644
--- a/tensorflow/contrib/kfac/python/ops/estimator.py
+++ b/tensorflow/contrib/kfac/python/ops/estimator.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import contextlib
 import itertools
-import math
 
 import numpy as np
 
@@ -67,7 +66,21 @@ class _DeviceContextGenerator(object):
 
 
 class FisherEstimator(object):
-  """Fisher estimator class supporting various approximations of the Fisher."""
+  """Fisher estimator class supporting various approximations of the Fisher.
+
+  Attributes:
+    cov_update_thunks: list of no-arg functions. Executing a function adds
+      covariance update ops for a single FisherFactor to the graph.
+    cov_update_ops: List of Ops. Running an op updates covariance matrices for a
+      single FisherFactor.
+    cov_update_op: Op. Running updates covariance matrices for all
+      FisherFactors.
+    inv_update_thunks: list of no-arg functions.  Executing a function adds
+      inverse update ops for a single FisherFactor to the graph.
+    inv_update_ops: List of Ops. Running an op updates inverse matrices for a
+      single FisherFactor.
+    inv_update_op: Op. Running updates inverse matrices for all FisherFactors.
+  """
 
   def __init__(self,
                variables,
@@ -75,7 +88,7 @@ class FisherEstimator(object):
                damping,
                layer_collection,
                estimation_mode="gradients",
-               colocate_gradients_with_ops=False,
+               colocate_gradients_with_ops=True,
                cov_devices=None,
                inv_devices=None):
     """Create a FisherEstimator object.
@@ -111,7 +124,7 @@ class FisherEstimator(object):
           is more expensive to compute than the other three options by a factor
           equal to the output dimension, roughly speaking.
       colocate_gradients_with_ops: Whether we should request gradients be
-          colocated with their respective ops.
+          colocated with their respective ops. (Default: True)
       cov_devices: Iterable of device strings (e.g. '/gpu:0'). Covariance
           computations will be placed on these devices in a round-robin fashion.
           Can be None, which means that no devices are specified.
@@ -123,12 +136,13 @@ class FisherEstimator(object):
       ValueError: If no losses have been registered with layer_collection.
     """
 
+    self._cov_ema_decay = cov_ema_decay
     self._variables = variables
     self._damping = damping
     self._estimation_mode = estimation_mode
     self._layers = layer_collection
     self._layers.create_subgraph()
-    self._check_registration(variables)
+    self._layers.check_registration(variables)
     self._gradient_fns = {
         "gradients": self._get_grads_lists_gradients,
         "empirical": self._get_grads_lists_empirical,
@@ -136,13 +150,31 @@ class FisherEstimator(object):
         "exact": self._get_grads_lists_exact
     }
     self._colocate_gradients_with_ops = colocate_gradients_with_ops
+
+    # TODO(b/70674513): Factor device placement outside of this class.
     self._cov_device_context_generator = _DeviceContextGenerator(cov_devices)
     if inv_devices == cov_devices:
       self._inv_device_context_generator = self._cov_device_context_generator
     else:
       self._inv_device_context_generator = _DeviceContextGenerator(inv_devices)
-    setup = self._setup(cov_ema_decay)
-    self.cov_update_op, self.inv_update_op, self.inv_updates_dict = setup
+
+    self._instantiate_factors()
+
+    self.cov_update_thunks = [
+        self._create_cov_update_thunk(factor)
+        for factor in self._layers.get_factors()
+    ]
+    self.cov_update_ops = [thunk() for thunk in self.cov_update_thunks]
+    self.cov_update_op = control_flow_ops.group(
+        self.cov_update_ops, name="cov_update_op")
+
+    self.inv_update_thunks = [
+        self._create_inv_update_thunk(factor)
+        for factor in self._layers.get_factors()
+    ]
+    self.inv_update_ops = [thunk() for thunk in self.inv_update_thunks]
+    self.inv_update_op = control_flow_ops.group(
+        self.inv_update_ops, name="inv_update_op")
 
   @property
   def variables(self):
@@ -203,61 +235,8 @@ class FisherEstimator(object):
     return self._apply_transformation(vecs_and_vars,
                                       lambda fb, vec: fb.multiply(vec))
 
-  def _check_registration(self, variables):
-    """Checks that all variable uses have been registered properly.
-
-    Args:
-      variables: List of variables.
-
-    Raises:
-      ValueError: If any registered variables are not included in the list.
-      ValueError: If any variable in the list is not registered.
-      ValueError: If any variable in the list is registered with the wrong
-          number of "uses" in the subgraph recorded (vs the number of times that
-          variable is actually used in the subgraph).
-    """
-    # Note that overlapping parameters (i.e. those that share variables) will
-    # be caught by layer_collection.LayerParametersDict during registration.
-
-    reg_use_map = self._layers.get_use_count_map()
-
-    error_messages = []
-
-    for var in variables:
-      total_uses = self._layers.subgraph.variable_uses(var)
-      reg_uses = reg_use_map[var]
-
-      if reg_uses == 0:
-        error_messages.append("Variable {} not registered.".format(var))
-      elif (not math.isinf(reg_uses)) and reg_uses != total_uses:
-        error_messages.append(
-            "Variable {} registered with wrong number of uses ({} "
-            "registrations vs {} uses).".format(var, reg_uses, total_uses))
-
-    num_get_vars = len(reg_use_map)
-
-    if num_get_vars > len(variables):
-      error_messages.append("{} registered variables were not included in list."
-                            .format(num_get_vars - len(variables)))
-
-    if error_messages:
-      error_messages = [
-          "Found the following errors with variable registration:"
-      ] + error_messages
-      raise ValueError("\n\t".join(error_messages))
-
-  def _setup(self, cov_ema_decay):
-    """Sets up the various operations.
-
-    Args:
-      cov_ema_decay: The decay factor used when calculating the covariance
-          estimate moving averages.
-
-    Returns:
-      A triple (covs_update_op, invs_update_op, inv_updates_dict), where
-      covs_update_op is the grouped Op to update all the covariance estimates,
-      invs_update_op is the grouped Op to update all the inverses, and
-      inv_updates_dict is a dict mapping Op names to individual inverse updates.
+  def _instantiate_factors(self):
+    """Instantiates FisherFactors' variables.
 
     Raises:
       ValueError: If estimation_mode was improperly specified at construction.
@@ -282,20 +261,25 @@ class FisherEstimator(object):
       with self._cov_device_context_generator():
         fb.instantiate_factors(grads_list, self.damping)
 
-    cov_updates = [
-        factor.make_covariance_update_op(cov_ema_decay)
-        for factor in self._layers.get_factors()
-    ]
-    inv_updates = {op.name: op for op in self._get_all_inverse_update_ops()}
+  def _create_cov_update_thunk(self, factor):
+    """Constructs a covariance update thunk for a single FisherFactor."""
+
+    def thunk():
+      with tf_ops.name_scope(
+          "create_cov_update_thunk", values=[self._cov_ema_decay]):
+        return factor.make_covariance_update_op(self._cov_ema_decay)
+
+    return thunk
 
-    return control_flow_ops.group(*cov_updates), control_flow_ops.group(
-        *inv_updates.values()), inv_updates
+  def _create_inv_update_thunk(self, factor):
+    """Constructs an inverse update thunk for a single FisherFactor."""
 
-  def _get_all_inverse_update_ops(self):
-    for factor in self._layers.get_factors():
-      with self._inv_device_context_generator():
-        for op in factor.make_inverse_update_ops():
-          yield op
+    def thunk():
+      with tf_ops.name_scope("create_inv_update_thunk"):
+        with self._inv_device_context_generator():
+          return control_flow_ops.group(factor.make_inverse_update_ops())
+
+    return thunk
 
   def _get_grads_lists_gradients(self, tensors):
     grads_flat = gradients_impl.gradients(
@@ -333,11 +317,7 @@ class FisherEstimator(object):
     return tuple((grad,) for grad in grads_all)
 
   def _get_grads_lists_exact(self, tensors):
-    """Returns a list of all gradients, computing them exactly.
-
-    Args:
-      tensors: Tensors for which to compute gradients.
-    """
+    """No docstring required."""
     # Loop over all coordinates of all losses.
     grads_all = []
     for loss in self._layers.losses:
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
index e822a1213a4132522be8031401609c78572cb1a6..0d2fa706f5853570bb8c04a9b9ac3378e2f2386e 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
@@ -38,6 +38,7 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
+import enum  # pylint: disable=g-bad-import-order
 
 import six
 
@@ -52,14 +53,61 @@ from tensorflow.python.ops import math_ops
 #   damping /= num_replications ** NORMALIZE_DAMPING_POWER
 NORMALIZE_DAMPING_POWER = 1.0
 
+# Methods for adjusting damping for FisherBlocks. See
+# compute_pi_adjusted_damping() for details.
+PI_OFF_NAME = "off"
+PI_TRACENORM_NAME = "tracenorm"
+PI_TYPE = PI_TRACENORM_NAME
 
-def set_global_constants(normalize_damping_power=None):
+
+def set_global_constants(normalize_damping_power=None, pi_type=None):
   """Sets various global constants used by the classes in this module."""
   global NORMALIZE_DAMPING_POWER
+  global PI_TYPE
 
   if normalize_damping_power is not None:
     NORMALIZE_DAMPING_POWER = normalize_damping_power
 
+  if pi_type is not None:
+    PI_TYPE = pi_type
+
+
+def normalize_damping(damping, num_replications):
+  """Normalize damping after adjusting scale by NORMALIZE_DAMPING_POWER."""
+  if NORMALIZE_DAMPING_POWER:
+    return damping / (num_replications ** NORMALIZE_DAMPING_POWER)
+  return damping
+
+
+def compute_pi_tracenorm(left_cov, right_cov):
+  """Computes the scalar constant pi for Tikhonov regularization/damping.
+
+  pi = sqrt( (trace(A) / dim(A)) / (trace(B) / dim(B)) )
+  See section 6.3 of https://arxiv.org/pdf/1503.05671.pdf for details.
+
+  Args:
+    left_cov: The left Kronecker factor "covariance".
+    right_cov: The right Kronecker factor "covariance".
+
+  Returns:
+    The computed scalar constant pi for these Kronecker Factors (as a Tensor).
+  """
+  # Instead of dividing by the dim of the norm, we multiply by the dim of the
+  # other norm. This works out the same in the ratio.
+  left_norm = math_ops.trace(left_cov) * right_cov.shape.as_list()[0]
+  right_norm = math_ops.trace(right_cov) * left_cov.shape.as_list()[0]
+  return math_ops.sqrt(left_norm / right_norm)
+
+
+def compute_pi_adjusted_damping(left_cov, right_cov, damping):
+
+  if PI_TYPE == PI_TRACENORM_NAME:
+    pi = compute_pi_tracenorm(left_cov, right_cov)
+    return (damping * pi, damping / pi)
+
+  elif PI_TYPE == PI_OFF_NAME:
+    return (damping, damping)
+
 
 @six.add_metaclass(abc.ABCMeta)
 class FisherBlock(object):
@@ -153,7 +201,7 @@ class FullFB(FisherBlock):
     self._factor.register_damped_inverse(damping)
 
   def multiply_inverse(self, vector):
-    inverse = self._factor.get_inverse(self._damping)
+    inverse = self._factor.get_damped_inverse(self._damping)
     out_flat = math_ops.matmul(inverse, utils.tensors_to_column(vector))
     return utils.column_to_tensors(vector, out_flat)
 
@@ -410,9 +458,8 @@ class ConvDiagonalFB(FisherBlock):
         inputs_shape[1] * inputs_shape[2] //
         (self._strides[1] * self._strides[2]))
 
-    if NORMALIZE_DAMPING_POWER:
-      damping /= self._num_locations ** NORMALIZE_DAMPING_POWER
-    self._damping = damping
+    self._damping = (self._num_locations
+                     * normalize_damping(damping, self._num_locations))
 
     self._factor = self._layer_collection.make_or_get_factor(
         fisher_factors.ConvDiagonalFactor,
@@ -465,11 +512,10 @@ class KroneckerProductFB(FisherBlock):
     Args:
       damping: The base damping factor (float or Tensor) for the damped inverse.
     """
-    pi = utils.compute_pi(self._input_factor.get_cov(),
-                          self._output_factor.get_cov())
-
-    self._input_damping = (damping**0.5) * pi
-    self._output_damping = (damping**0.5) / pi
+    self._input_damping, self._output_damping = compute_pi_adjusted_damping(
+        self._input_factor.get_cov(),
+        self._output_factor.get_cov(),
+        damping**0.5)
 
     self._input_factor.register_damped_inverse(self._input_damping)
     self._output_factor.register_damped_inverse(self._output_damping)
@@ -487,8 +533,9 @@ class KroneckerProductFB(FisherBlock):
     return 1.0
 
   def multiply_inverse(self, vector):
-    left_factor_inv = self._input_factor.get_inverse(self._input_damping)
-    right_factor_inv = self._output_factor.get_inverse(self._output_damping)
+    left_factor_inv = self._input_factor.get_damped_inverse(self._input_damping)
+    right_factor_inv = self._output_factor.get_damped_inverse(
+        self._output_damping)
     reshaped_vector = utils.layer_params_to_mat2d(vector)
     reshaped_out = math_ops.matmul(left_factor_inv,
                                    math_ops.matmul(reshaped_vector,
@@ -650,8 +697,8 @@ class ConvKFCBasicFB(KroneckerProductFB):
     grads_list = tuple(_concat_along_batch_dim(grads) for grads in grads_list)
 
     # Infer number of locations upon which convolution is applied.
-    self._num_locations = _num_conv_locations(inputs.shape.as_list(),
-                                              self._strides)
+    self._num_locations = num_conv_locations(inputs.shape.as_list(),
+                                             self._strides)
 
     self._input_factor = self._layer_collection.make_or_get_factor(
         fisher_factors.ConvInputKroneckerFactor,
@@ -660,11 +707,9 @@ class ConvKFCBasicFB(KroneckerProductFB):
     self._output_factor = self._layer_collection.make_or_get_factor(
         fisher_factors.ConvOutputKroneckerFactor, (grads_list,))
 
-    if NORMALIZE_DAMPING_POWER:
-      damping /= self._num_locations**NORMALIZE_DAMPING_POWER
-    self._damping = damping
-
+    damping = normalize_damping(damping, self._num_locations)
     self._register_damped_input_and_output_inverses(damping)
+    self._damping = damping
 
   @property
   def _renorm_coeff(self):
@@ -717,6 +762,267 @@ def _concat_along_batch_dim(tensor_list):
     return array_ops.concat(tensor_list, axis=0)
 
 
-def _num_conv_locations(input_shape, strides):
-  """Returns the number of locations a Conv kernel is applied to."""
+def num_conv_locations(input_shape, strides):
+  """Returns the number of spatial locations a 2D Conv kernel is applied to.
+
+  Args:
+    input_shape: list representing shape of inputs to the Conv layer.
+    strides: list representing strides for the Conv kernel.
+
+  Returns:
+    A scalar |T| denoting the number of spatial locations for the Conv layer.
+  """
   return input_shape[1] * input_shape[2] // (strides[1] * strides[2])
+
+
+class FullyConnectedMultiIndepFB(KroneckerProductFB):
+  """FisherBlock for fully-connected layers that share parameters.
+  """
+
+  def __init__(self, layer_collection, inputs, outputs, has_bias=False):
+    """Creates a FullyConnectedMultiIndepFB block.
+
+    Args:
+      layer_collection: LayerCollection instance.
+      inputs: list or tuple of Tensors. Each Tensor has shape [batch_size,
+        inputs_size].
+      outputs: list or tuple of Tensors. Each Tensor has shape [batch_size,
+        outputs_size].
+      has_bias: bool. If True, estimates Fisher with respect to a bias
+        parameter as well as the layer's parameters.
+    """
+
+    assert len(inputs) == len(outputs)
+    # We need to make sure inputs and outputs are tuples and not lists so that
+    # they get hashed by layer_collection.make_or_get_factor properly.
+    self._inputs = tuple(inputs)
+    self._outputs = tuple(outputs)
+    self._has_bias = has_bias
+    self._num_uses = len(inputs)
+
+    super(FullyConnectedMultiIndepFB, self).__init__(layer_collection)
+
+  @property
+  def num_registered_minibatches(self):
+    # TODO(b/69411207): Add support for registering additional minibatches.
+    return 1
+
+  def instantiate_factors(self, grads_list, damping):
+
+    self._input_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.FullyConnectedMultiKF,
+        ((self._inputs,), self._has_bias))
+
+    self._output_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.FullyConnectedMultiKF, (grads_list,))
+
+    damping = normalize_damping(damping, self._num_uses)
+    self._register_damped_input_and_output_inverses(damping)
+
+  @property
+  def _renorm_coeff(self):
+    return self._num_uses
+
+  def tensors_to_compute_grads(self):
+    return self._outputs
+
+  def num_inputs(self):
+    return len(self._inputs)
+
+
+class SeriesFBApproximation(enum.IntEnum):
+  """See FullyConnectedSeriesFB.__init__ for description and usage."""
+  option1 = 1
+  option2 = 2
+
+
+class FullyConnectedSeriesFB(FisherBlock):
+  """FisherBlock for fully-connected layers that share parameters across time.
+
+  See the following preprint for details:
+    https://openreview.net/pdf?id=HyMTkQZAb
+
+  See the end of the appendix of the paper for a pseudo-code of the
+  algorithm being implemented by multiply_inverse here.  Note that we are
+  using pre-computed versions of certain matrix-matrix products to speed
+  things up.  This is explicitly explained wherever it is done.
+  """
+
+  def __init__(self,
+               layer_collection,
+               inputs,
+               outputs,
+               has_bias=False,
+               option=SeriesFBApproximation.option2):
+    """Constructs a new `FullyConnectedSeriesFB`.
+
+    Args:
+      layer_collection: The collection of all layers in the K-FAC approximate
+        Fisher information matrix to which this FisherBlock belongs.
+      inputs: List of tensors of shape [batch_size, input_size].
+        Inputs to the layer.
+      outputs: List of tensors of shape [batch_size, input_size].
+        Outputs of the layer (before activations).
+      has_bias: Whether the layer includes a bias parameter.
+      option: A `SeriesFBApproximation` specifying the simplifying assumption
+        to be used in this block. `option1` approximates the cross-covariance
+        over time as a symmetric matrix, while `option2` makes
+        the assumption that training sequences are infinitely long. See section
+        3.5 of the paper for more details.
+    """
+
+    assert len(inputs) == len(outputs)
+    # We need to make sure inputs and outputs are tuples and not lists so that
+    # they get hashed by layer_collection.make_or_get_factor properly.
+    self._inputs = tuple(inputs)
+    self._outputs = tuple(outputs)
+    self._has_bias = has_bias
+    self._num_timesteps = len(inputs)
+    self._option = option
+
+    super(FullyConnectedSeriesFB, self).__init__(layer_collection)
+
+  @property
+  def num_registered_minibatches(self):
+    # TODO(b/69411207): Add support for registering additional minibatches.
+    return 1
+
+  def instantiate_factors(self, grads_list, damping):
+
+    self._input_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.FullyConnectedMultiKF, ((self._inputs,), self._has_bias))
+
+    self._output_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.FullyConnectedMultiKF, (grads_list,))
+
+    damping = normalize_damping(damping, self._num_timesteps)
+    self._damping_input, self._damping_output = compute_pi_adjusted_damping(
+        self._input_factor.get_cov(),
+        self._output_factor.get_cov(),
+        damping**0.5)
+
+    if self._option == SeriesFBApproximation.option1:
+      self._input_factor.register_option1quants(self._damping_input)
+      self._output_factor.register_option1quants(self._damping_output)
+    elif self._option == SeriesFBApproximation.option2:
+      self._input_factor.register_option2quants(self._damping_input)
+      self._output_factor.register_option2quants(self._damping_output)
+    else:
+      raise ValueError(
+          "Unrecognized FullyConnectedSeriesFB approximation: {}".format(
+              self._option))
+
+  def multiply_inverse(self, vector):
+    # pylint: disable=invalid-name
+
+    Z = utils.layer_params_to_mat2d(vector)
+
+    # Derivations were done for "batch_dim==1" case so we need to convert to
+    # that orientation:
+    Z = array_ops.transpose(Z)
+
+    if self._option == SeriesFBApproximation.option1:
+
+      # Note that L_A = A0^(-1/2) * U_A and L_G = G0^(-1/2) * U_G.
+      L_A, psi_A = self._input_factor.get_option1quants(self._damping_input)
+      L_G, psi_G = self._output_factor.get_option1quants(self._damping_output)
+
+      def gamma(x):
+        # We are assuming that each case has the same number of time-steps.
+        # If this stops being the case one shouldn't simply replace this T
+        # with its average value.  Instead, one needs to go back to the
+        # definition of the gamma function from the paper.
+        T = self._num_timesteps
+        return (1 - x)**2 / (T * (1 - x**2) - 2 * x * (1 - x**T))
+
+      # Y = gamma( psi_G*psi_A^T ) (computed element-wise)
+      # Even though Y is Z-independent we are recomputing it from the psi's
+      # each since Y depends on both A and G quantities, and it is relatively
+      # cheap to compute.
+      Y = gamma(array_ops.reshape(psi_G, [int(psi_G.shape[0]), -1]) * psi_A)
+
+      # Z = L_G^T * Z * L_A
+      # This is equivalent to the following computation from the original
+      # pseudo-code:
+      # Z = G0^(-1/2) * Z * A0^(-1/2)
+      # Z = U_G^T * Z * U_A
+      Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A), transpose_a=True)
+
+      # Z = Z .* Y
+      Z *= Y
+
+      # Z = L_G * Z * L_A^T
+      # This is equivalent to the following computation from the original
+      # pseudo-code:
+      # Z = U_G * Z * U_A^T
+      # Z = G0^(-1/2) * Z * A0^(-1/2)
+      Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A, transpose_b=True))
+
+    elif self._option == SeriesFBApproximation.option2:
+
+      # Note that P_A = A_1^T * A_0^(-1) and P_G = G_1^T * G_0^(-1),
+      # and K_A = A_0^(-1/2) * E_A and K_G = G_0^(-1/2) * E_G.
+      P_A, K_A, mu_A = self._input_factor.get_option2quants(self._damping_input)
+      P_G, K_G, mu_G = self._output_factor.get_option2quants(
+          self._damping_output)
+
+      # Our approach differs superficially from the pseudo-code in the paper
+      # in order to reduce the total number of matrix-matrix multiplies.
+      # In particular, the first three computations in the pseudo code are
+      # Z = G0^(-1/2) * Z * A0^(-1/2)
+      # Z = Z - hPsi_G^T * Z * hPsi_A
+      # Z = E_G^T * Z * E_A
+      # Noting that hPsi = C0^(-1/2) * C1 * C0^(-1/2), so that
+      # C0^(-1/2) * hPsi = C0^(-1) * C1 * C0^(-1/2) = P^T * C0^(-1/2)
+      # the entire computation can be written as
+      # Z = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
+      #     - hPsi_G^T * G0^(-1/2) * Z * A0^(-1/2) * hPsi_A) * E_A
+      #   = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
+      #     - G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2)) * E_A
+      #   = E_G^T * G0^(-1/2) * Z * A0^(-1/2) * E_A
+      #     -  E_G^T* G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2) * E_A
+      #   = K_G^T * Z * K_A  -  K_G^T * P_G * Z * P_A^T * K_A
+      # This final expression is computed by the following two lines:
+      # Z = Z - P_G * Z * P_A^T
+      Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A, transpose_b=True))
+      # Z = K_G^T * Z * K_A
+      Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A), transpose_a=True)
+
+      # Z = Z ./ (1*1^T - mu_G*mu_A^T)
+      # Be careful with the outer product.  We don't want to accidentally
+      # make it an inner-product instead.
+      tmp = 1.0 - array_ops.reshape(mu_G, [int(mu_G.shape[0]), -1]) * mu_A
+      # Prevent some numerical issues by setting any 0.0 eigs to 1.0
+      tmp += 1.0 * math_ops.cast(math_ops.equal(tmp, 0.0), dtype=tmp.dtype)
+      Z /= tmp
+
+      # We now perform the transpose/reverse version of the operations
+      # derived above, whose derivation from the original pseudo-code is
+      # analgous.
+      # Z = K_G * Z * K_A^T
+      Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A, transpose_b=True))
+
+      # Z = Z - P_G^T * Z * P_A
+      Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A), transpose_a=True)
+
+      # Z = normalize (1/E[T]) * Z
+      # Note that this normalization is done because we compute the statistics
+      # by averaging, not summing, over time. (And the gradient is presumably
+      # summed over time, not averaged, and thus their scales are different.)
+      Z /= math_ops.cast(self._num_timesteps, Z.dtype)
+
+    # Convert back to the "batch_dim==0" orientation.
+    Z = array_ops.transpose(Z)
+
+    return utils.mat2d_to_layer_params(vector, Z)
+
+    # pylint: enable=invalid-name
+
+  def multiply(self, vector):
+    raise NotImplementedError
+
+  def tensors_to_compute_grads(self):
+    return self._outputs
+
+  def num_inputs(self):
+    return len(self._inputs)
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_blocks_lib.py b/tensorflow/contrib/kfac/python/ops/fisher_blocks_lib.py
index 59389f8d385c18f50914d690cfaa2825ef807ed3..ac396309206fe09af65c2b70840a513fb25b579b 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_blocks_lib.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_blocks_lib.py
@@ -33,6 +33,10 @@ _allowed_symbols = [
     'ConvKFCBasicFB',
     'ConvDiagonalFB',
     'set_global_constants',
+    'compute_pi_tracenorm',
+    'compute_pi_adjusted_damping',
+    'num_conv_locations',
+    'normalize_damping'
 ]
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_factors.py b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
index fbc192f1dcfa0b384e2cb31c43af3651436321ea..bcba18ae147c6ceca50bc9a2a17e01fc201d88c1 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_factors.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
@@ -27,8 +27,11 @@ import six
 from tensorflow.contrib.kfac.python.ops import utils
 from tensorflow.python.framework import ops as tf_ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.ops import special_math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
@@ -50,11 +53,15 @@ EIGENVALUE_DECOMPOSITION_THRESHOLD = 2
 # matrix powers. Must be nonnegative.
 EIGENVALUE_CLIPPING_THRESHOLD = 0.0
 
+# Colocate the covariance ops and variables with the input tensors for each
+# factor.
+COLOCATE_COV_OPS_WITH_INPUTS = True
+
 
 @contextlib.contextmanager
-def _maybe_colocate_with(op, colocate_cov_ops_with_inputs):
-  """Context to colocate with `op` if `colocate_cov_ops_with_inputs`."""
-  if colocate_cov_ops_with_inputs:
+def maybe_colocate_with(op):
+  """Context to colocate with `op` if `COLOCATE_COV_OPS_WITH_INPUTS`."""
+  if COLOCATE_COV_OPS_WITH_INPUTS:
     if isinstance(op, (list, tuple)):
       with tf_ops.colocate_with(op[0]):
         yield
@@ -68,12 +75,14 @@ def _maybe_colocate_with(op, colocate_cov_ops_with_inputs):
 def set_global_constants(init_covariances_at_zero=None,
                          zero_debias=None,
                          eigenvalue_decomposition_threshold=None,
-                         eigenvalue_clipping_threshold=None):
+                         eigenvalue_clipping_threshold=None,
+                         colocate_cov_ops_with_inputs=None):
   """Sets various global constants used by the classes in this module."""
   global INIT_COVARIANCES_AT_ZERO
   global ZERO_DEBIAS
   global EIGENVALUE_DECOMPOSITION_THRESHOLD
   global EIGENVALUE_CLIPPING_THRESHOLD
+  global COLOCATE_COV_OPS_WITH_INPUTS
 
   if init_covariances_at_zero is not None:
     INIT_COVARIANCES_AT_ZERO = init_covariances_at_zero
@@ -83,6 +92,8 @@ def set_global_constants(init_covariances_at_zero=None,
     EIGENVALUE_DECOMPOSITION_THRESHOLD = eigenvalue_decomposition_threshold
   if eigenvalue_clipping_threshold is not None:
     EIGENVALUE_CLIPPING_THRESHOLD = eigenvalue_clipping_threshold
+  if colocate_cov_ops_with_inputs is not None:
+    COLOCATE_COV_OPS_WITH_INPUTS = colocate_cov_ops_with_inputs
 
 
 def inverse_initializer(shape, dtype, partition_info=None):  # pylint: disable=unused-argument
@@ -101,7 +112,55 @@ def diagonal_covariance_initializer(shape, dtype, partition_info):  # pylint: di
   return array_ops.ones(shape, dtype)
 
 
-def _compute_cov(tensor, normalizer=None):
+def extract_image_patches(image, ksizes, strides, padding, name=None):
+  """Extracts image patches for an N-dimensional convolution.
+
+  This function is a compatibility wrapper over tf.extract_image_patches(), as
+  ExtractImagePatches isn't yet implemented in XLA.
+
+  Args:
+    image: Tensor of shape [batch, in_x, in_y, ..., in_channels]. Input images.
+      All dimensions except 'batch' must be defined.
+    ksizes: [filter_x, filter_y, ...]. Spatial shape of filter in each
+      dimension.
+    strides: [stride_x, stride_y, ...]. Spatial stride for filter in each
+      dimension.
+    padding: str. "VALID" or "SAME".
+    name: str or None. name of Op.
+
+  Returns:
+    result: [batch, out_x, out_y, ..., filter_x, filter_y, ..., in_channels].
+      Contains image patches to which conv kernel would be applied for each
+      output location. [out_x, out_y, ...] depends on padding.
+  """
+  if not utils.on_tpu():
+    return array_ops.extract_image_patches(
+        image,
+        ksizes=([1] + list(ksizes) + [1]),
+        strides=([1] + list(strides) + [1]),
+        rates=[1, 1, 1, 1],
+        padding=padding,
+        name=name)
+
+  with tf_ops.name_scope(name, "extract_image_patches",
+                         [image, ksizes, strides, padding]):
+    batch = image.shape.as_list()[0]
+    in_channels = image.shape.as_list()[-1]
+
+    # Map each input feature to a location in the output.
+    out_channels = np.prod(ksizes) * in_channels
+    filters = linalg_ops.eye(out_channels),
+    filters = array_ops.reshape(filters, ksizes + [in_channels, out_channels])
+
+    result = nn.convolution(image, filters, padding, strides=strides)
+    out_spatial = result.shape.as_list()[1:-1]
+    result = array_ops.reshape(
+        result, [batch or -1] + out_spatial + ksizes + [in_channels])
+
+    return result
+
+
+def compute_cov(tensor, tensor_right=None, normalizer=None):
   """Compute the empirical second moment of the rows of a 2D Tensor.
 
   This function is meant to be applied to random matrices for which the true row
@@ -109,6 +168,8 @@ def _compute_cov(tensor, normalizer=None):
 
   Args:
     tensor: A 2D Tensor.
+    tensor_right: An optional 2D Tensor. If provided, this function computes
+      the matrix product tensor^T * tensor_right instead of tensor^T * tensor.
     normalizer: optional scalar for the estimator (by default, the normalizer is
         the number of rows of tensor).
 
@@ -117,12 +178,17 @@ def _compute_cov(tensor, normalizer=None):
   """
   if normalizer is None:
     normalizer = array_ops.shape(tensor)[0]
-  cov = (math_ops.matmul(tensor, tensor, transpose_a=True) / math_ops.cast(
-      normalizer, tensor.dtype))
-  return (cov + array_ops.transpose(cov)) / math_ops.cast(2, cov.dtype)
+  if tensor_right is None:
+    cov = (
+        math_ops.matmul(tensor, tensor, transpose_a=True) / math_ops.cast(
+            normalizer, tensor.dtype))
+    return (cov + array_ops.transpose(cov)) / math_ops.cast(2.0, cov.dtype)
+  else:
+    return (math_ops.matmul(tensor, tensor_right, transpose_a=True) /
+            math_ops.cast(normalizer, tensor.dtype))
 
 
-def _append_homog(tensor):
+def append_homog(tensor):
   """Appends a homogeneous coordinate to the last dimension of a Tensor.
 
   Args:
@@ -135,7 +201,7 @@ def _append_homog(tensor):
   rank = len(tensor.shape.as_list())
   shape = array_ops.concat([array_ops.shape(tensor)[:-1], [1]], axis=0)
   ones = array_ops.ones(shape, dtype=tensor.dtype)
-  return array_ops.concat([tensor, ones], axis=rank-1)
+  return array_ops.concat([tensor, ones], axis=rank - 1)
 
 
 def scope_string_from_params(params):
@@ -173,8 +239,8 @@ def scope_string_from_params(params):
     elif isinstance(param, (tf_ops.Tensor, variables.Variable)):
       name_parts.append(scope_string_from_name(param))
     else:
-      raise ValueError(
-          "Encountered an unsupported param type {}".format(type(param)))
+      raise ValueError("Encountered an unsupported param type {}".format(
+          type(param)))
   return "_".join(name_parts)
 
 
@@ -225,6 +291,10 @@ class FisherFactor(object):
     """
     pass
 
+  @abc.abstractproperty
+  def _dtype(self):
+    pass
+
   @property
   def _cov_initializer(self):
     return covariance_initializer
@@ -236,7 +306,8 @@ class FisherFactor(object):
           "cov",
           initializer=self._cov_initializer,
           shape=self._cov_shape,
-          trainable=False)
+          trainable=False,
+          dtype=self._dtype)
 
   @abc.abstractmethod
   def _compute_new_cov(self, idx=0):
@@ -250,15 +321,27 @@ class FisherFactor(object):
     Returns:
       An Op for updating the covariance Variable referenced by _cov.
     """
-    new_cov = math_ops.add_n(
-        tuple(self._compute_new_cov(idx) for idx in range(self._num_sources)))
-
-    return moving_averages.assign_moving_average(
-        self._cov, new_cov, ema_decay, zero_debias=ZERO_DEBIAS)
+    new_cov_contribs = tuple(self._compute_new_cov(idx)
+                             for idx in range(self._num_sources))
+    # This gets the job done but we might want a better solution in the future.
+    # In particular, we could have a separate way of specifying where the
+    # the cov variables finally end up, independent of where their various
+    # contributions are computed.  Right now these are the same thing, but in
+    # the future we might want to perform the cov computations on each tower,
+    # so that each tower will be considered a "source" (allowing us to reuse
+    # the existing "source" code for this).
+    with maybe_colocate_with(new_cov_contribs[0]):
+      new_cov = math_ops.add_n(new_cov_contribs)
+      # Synchronize value across all TPU cores.
+      if utils.on_tpu():
+        new_cov = utils.cross_replica_mean(new_cov)
+      return moving_averages.assign_moving_average(
+          self._cov, new_cov, ema_decay, zero_debias=ZERO_DEBIAS)
 
+  @abc.abstractmethod
   def make_inverse_update_ops(self):
     """Create and return update ops corresponding to registered computations."""
-    return []
+    pass
 
   def get_cov(self):
     return self._cov
@@ -273,6 +356,13 @@ class InverseProvidingFactor(FisherFactor):
   _cov_shape properties.
   """
 
+  # TODO(b/69108481): This class (and its subclasses) should be refactored to
+  # serve the matrix quantities it computes as both (potentially stale)
+  # variables, updated by the inverse update ops, and fresh values stored in
+  # tensors that recomputed once every session.run() call.  Currently matpower
+  # and damp_inverse have the former behavior, while eigendecomposition has
+  # the latter.
+
   def __init__(self):
     self._inverses_by_damping = {}
     self._matpower_by_exp_and_damping = {}
@@ -283,6 +373,10 @@ class InverseProvidingFactor(FisherFactor):
   def register_damped_inverse(self, damping):
     """Registers a damped inverse needed by a FisherBlock.
 
+    This creates a variable and signals make_inverse_update_ops to make the
+    corresponding update op.  The variable can be read via the method
+    get_inverse.
+
     Args:
       damping: The damping value (float or Tensor) for this factor.
     """
@@ -293,12 +387,17 @@ class InverseProvidingFactor(FisherFactor):
             "inv_damp{}".format(damping_string),
             initializer=inverse_initializer,
             shape=self._cov_shape,
-            trainable=False)
+            trainable=False,
+            dtype=self._dtype)
       self._inverses_by_damping[damping] = inv
 
   def register_matpower(self, exp, damping):
     """Registers a matrix power needed by a FisherBlock.
 
+    This creates a variable and signals make_inverse_update_ops to make the
+    corresponding update op.  The variable can be read via the method
+    get_matpower.
+
     Args:
       exp: The exponent (float or Tensor) to raise the matrix to.
       damping: The damping value (float or Tensor).
@@ -311,59 +410,81 @@ class InverseProvidingFactor(FisherFactor):
             "matpower_exp{}_damp{}".format(exp_string, damping_string),
             initializer=inverse_initializer,
             shape=self._cov_shape,
-            trainable=False)
+            trainable=False,
+            dtype=self._dtype)
       self._matpower_by_exp_and_damping[(exp, damping)] = matpower
 
-  def register_eigendecomp(self):
-    """Registers that an eigendecomposition is needed by a FisherBlock."""
-    if not self._eigendecomp:
-      self._eigendecomp = linalg_ops.self_adjoint_eig(self._cov)
-
   def make_inverse_update_ops(self):
     """Create and return update ops corresponding to registered computations."""
-    ops = super(InverseProvidingFactor, self).make_inverse_update_ops()
+    ops = []
+
+    # We do this to ensure that we don't reuse the eigendecomp from old calls
+    # to make_inverse_update_ops that may be placed on different devices.  This
+    # can happen is the user has both a permanent and lazily constructed
+    # version of the inverse ops (and only uses one of them).
+    self.reset_eigendecomp()
 
     num_inverses = len(self._inverses_by_damping)
     matrix_power_registered = bool(self._matpower_by_exp_and_damping)
-    use_eig = (self._eigendecomp or matrix_power_registered or
-               num_inverses >= EIGENVALUE_DECOMPOSITION_THRESHOLD)
+    use_eig = (
+        self._eigendecomp or matrix_power_registered or
+        num_inverses >= EIGENVALUE_DECOMPOSITION_THRESHOLD)
 
     if use_eig:
-      self.register_eigendecomp()  # ensures self._eigendecomp is set
-      eigenvalues, eigenvectors = self._eigendecomp  # pylint: disable=unpacking-non-sequence
-
-      # The matrix self._cov is positive semidefinite by construction, but the
-      # numerical eigenvalues could be negative due to numerical errors, so here
-      # we clip them to be at least EIGENVALUE_CLIPPING_THRESHOLD.
-      clipped_eigenvalues = math_ops.maximum(eigenvalues,
-                                             EIGENVALUE_CLIPPING_THRESHOLD)
+      eigenvalues, eigenvectors = self.get_eigendecomp()  # pylint: disable=unpacking-non-sequence
 
       for damping, inv in self._inverses_by_damping.items():
         ops.append(
             inv.assign(
-                math_ops.matmul(eigenvectors / (clipped_eigenvalues + damping),
+                math_ops.matmul(eigenvectors / (eigenvalues + damping),
                                 array_ops.transpose(eigenvectors))))
 
       for (exp, damping), matpower in self._matpower_by_exp_and_damping.items():
         ops.append(
             matpower.assign(
-                math_ops.matmul(eigenvectors * (clipped_eigenvalues + damping)**
-                                exp, array_ops.transpose(eigenvectors))))
+                math_ops.matmul(eigenvectors *
+                                (eigenvalues + damping)**exp,
+                                array_ops.transpose(eigenvectors))))
+      # These ops share computation and should be run on a single device.
+      ops = [control_flow_ops.group(*ops)]
     else:
       for damping, inv in self._inverses_by_damping.items():
         ops.append(inv.assign(utils.posdef_inv(self._cov, damping)))
 
     return ops
 
-  def get_inverse(self, damping):
+  def get_damped_inverse(self, damping):
+    # Note that this function returns a variable which gets updated by the
+    # inverse ops.  It may be stale / inconsistent with the latest value of
+    # get_cov().
     return self._inverses_by_damping[damping]
 
   def get_matpower(self, exp, damping):
+    # Note that this function returns a variable which gets updated by the
+    # inverse ops.  It may be stale / inconsistent with the latest value of
+    # get_cov().
     return self._matpower_by_exp_and_damping[(exp, damping)]
 
   def get_eigendecomp(self):
+    """Creates or retrieves eigendecomposition of self._cov."""
+    # Unlike get_inverse and get_matpower this doesn't retrieve a stored
+    # variable, but instead always computes a fresh version from the current
+    # value of get_cov().
+    if not self._eigendecomp:
+      eigenvalues, eigenvectors = linalg_ops.self_adjoint_eig(self._cov)
+
+      # The matrix self._cov is positive semidefinite by construction, but the
+      # numerical eigenvalues could be negative due to numerical errors, so here
+      # we clip them to be at least FLAGS.eigenvalue_clipping_threshold
+      clipped_eigenvalues = math_ops.maximum(eigenvalues,
+                                             EIGENVALUE_CLIPPING_THRESHOLD)
+      self._eigendecomp = (clipped_eigenvalues, eigenvectors)
+
     return self._eigendecomp
 
+  def reset_eigendecomp(self):
+    self._eigendecomp = None
+
 
 class FullFactor(InverseProvidingFactor):
   """FisherFactor for a full matrix representation of the Fisher of a parameter.
@@ -374,41 +495,38 @@ class FullFactor(InverseProvidingFactor):
 
   def __init__(self,
                params_grads,
-               batch_size,
-               colocate_cov_ops_with_inputs=False):
+               batch_size):
     self._batch_size = batch_size
-    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
-    self._orig_params_grads_name = scope_string_from_params(
-        [params_grads, self._batch_size])
-    params_grads_flat = []
-    for params_grad in params_grads:
-      with _maybe_colocate_with(params_grad,
-                                self._colocate_cov_ops_with_inputs):
-        col = utils.tensors_to_column(params_grad)
-        params_grads_flat.append(col)
-    self._params_grads_flat = tuple(params_grads_flat)
+    self._params_grads = tuple(utils.ensure_sequence(params_grad)
+                               for params_grad in params_grads)
     super(FullFactor, self).__init__()
 
   @property
   def _var_scope(self):
-    return "ff_full/" + self._orig_params_grads_name
+    return "ff_full/" + scope_string_from_params(
+        [self._params_grads, self._batch_size])
 
   @property
   def _cov_shape(self):
-    size = self._params_grads_flat[0].shape[0]
-    return [size, size]
+    size = sum(param_grad.shape.num_elements()
+               for param_grad in self._params_grads[0])
+    return (size, size)
 
   @property
   def _num_sources(self):
-    return len(self._params_grads_flat)
+    return len(self._params_grads)
+
+  @property
+  def _dtype(self):
+    return self._params_grads[0][0].dtype
 
   def _compute_new_cov(self, idx=0):
     # This will be a very basic rank 1 estimate
-    with _maybe_colocate_with(self._params_grads_flat[idx],
-                              self._colocate_cov_ops_with_inputs):
-      return ((self._params_grads_flat[idx] * array_ops.transpose(
-          self._params_grads_flat[idx])) / math_ops.cast(
-              self._batch_size, self._params_grads_flat[idx].dtype))
+    with maybe_colocate_with(self._params_grads[idx]):
+      params_grads_flat = utils.tensors_to_column(self._params_grads[idx])
+      return ((params_grads_flat * array_ops.transpose(
+          params_grads_flat)) / math_ops.cast(self._batch_size,
+                                              params_grads_flat.dtype))
 
 
 class DiagonalFactor(FisherFactor):
@@ -421,6 +539,9 @@ class DiagonalFactor(FisherFactor):
   def _cov_initializer(self):
     return diagonal_covariance_initializer
 
+  def make_inverse_update_ops(self):
+    return []
+
 
 class NaiveDiagonalFactor(DiagonalFactor):
   """FisherFactor for a diagonal approximation of any type of param's Fisher.
@@ -431,38 +552,36 @@ class NaiveDiagonalFactor(DiagonalFactor):
 
   def __init__(self,
                params_grads,
-               batch_size,
-               colocate_cov_ops_with_inputs=False):
+               batch_size):
+    self._params_grads = tuple(utils.ensure_sequence(params_grad)
+                               for params_grad in params_grads)
     self._batch_size = batch_size
-    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
-    params_grads_flat = []
-    for params_grad in params_grads:
-      with _maybe_colocate_with(params_grad,
-                                self._colocate_cov_ops_with_inputs):
-        col = utils.tensors_to_column(params_grad)
-        params_grads_flat.append(col)
-    self._params_grads = tuple(params_grads_flat)
-    self._orig_params_grads_name = scope_string_from_params(
-        [self._params_grads, self._batch_size])
     super(NaiveDiagonalFactor, self).__init__()
 
   @property
   def _var_scope(self):
-    return "ff_naivediag/" + self._orig_params_grads_name
+    return "ff_naivediag/" + scope_string_from_params(
+        [self._params_grads, self._batch_size])
 
   @property
   def _cov_shape(self):
-    return self._params_grads[0].shape
+    size = sum(param_grad.shape.num_elements()
+               for param_grad in self._params_grads[0])
+    return (size, 1)
 
   @property
   def _num_sources(self):
     return len(self._params_grads)
 
+  @property
+  def _dtype(self):
+    return self._params_grads[0][0].dtype
+
   def _compute_new_cov(self, idx=0):
-    with _maybe_colocate_with(self._params_grads[idx],
-                              self._colocate_cov_ops_with_inputs):
-      return (math_ops.square(self._params_grads[idx]) / math_ops.cast(
-          self._batch_size, self._params_grads[idx].dtype))
+    with maybe_colocate_with(self._params_grads[idx]):
+      params_grads_flat = utils.tensors_to_column(self._params_grads[idx])
+      return (math_ops.square(params_grads_flat) / math_ops.cast(
+          self._batch_size, params_grads_flat.dtype))
 
 
 class FullyConnectedDiagonalFactor(DiagonalFactor):
@@ -471,18 +590,15 @@ class FullyConnectedDiagonalFactor(DiagonalFactor):
   Given in = [batch_size, input_size] and out_grad = [batch_size, output_size],
   approximates the covariance as,
 
-    Cov(in, out) = (1/batch_size) \sum_{i} outer(in[i], out_grad[i]) ** 2.0
+    Cov(in, out) = (1/batch_size) sum_{i} outer(in[i], out_grad[i]) ** 2.0
 
   where the square is taken element-wise.
   """
 
-  # TODO(jamesmartens): add units tests for this class
-
   def __init__(self,
                inputs,
                outputs_grads,
-               has_bias=False,
-               colocate_cov_ops_with_inputs=False):
+               has_bias=False):
     """Instantiate FullyConnectedDiagonalFactor.
 
     Args:
@@ -491,44 +607,46 @@ class FullyConnectedDiagonalFactor(DiagonalFactor):
       outputs_grads: List of Tensors of shape [batch_size, output_size].
         Gradient of loss with respect to layer's preactivations.
       has_bias: bool. If True, append '1' to each input.
-      colocate_cov_ops_with_inputs: Whether to colocate cov_update ops with
-          their inputs.
     """
+    self._inputs = inputs
+    self._has_bias = has_bias
     self._outputs_grads = outputs_grads
-    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
     self._batch_size = array_ops.shape(inputs)[0]
-    self._orig_tensors_name = scope_string_from_params((inputs,) +
-                                                       tuple(outputs_grads))
-
-    # Note that we precompute the required operations on the inputs since the
-    # inputs don't change with the 'idx' argument to _compute_new_cov.  (Only
-    # the target entry of _outputs_grads changes with idx.)
-    with _maybe_colocate_with(inputs, self._colocate_cov_ops_with_inputs):
-      if has_bias:
-        inputs = _append_homog(inputs)
-      self._squared_inputs = math_ops.square(inputs)
+    self._squared_inputs = None
 
     super(FullyConnectedDiagonalFactor, self).__init__()
 
   @property
   def _var_scope(self):
-    return "ff_diagfc/" + self._orig_tensors_name
+    return "ff_diagfc/" + scope_string_from_params(
+        (self._inputs,) + tuple(self._outputs_grads))
 
   @property
   def _cov_shape(self):
-    return [self._squared_inputs.shape[1], self._outputs_grads[0].shape[1]]
+    return [self._inputs.shape[1] + self._has_bias,
+            self._outputs_grads[0].shape[1]]
 
   @property
   def _num_sources(self):
     return len(self._outputs_grads)
 
+  @property
+  def _dtype(self):
+    return self._outputs_grads[0].dtype
+
   def _compute_new_cov(self, idx=0):
     # The well-known special formula that uses the fact that the entry-wise
     # square of an outer product is the outer-product of the entry-wise squares.
     # The gradient is the outer product of the input and the output gradients,
     # so we just square both and then take their outer-product.
-    with _maybe_colocate_with(self._squared_inputs,
-                              self._colocate_cov_ops_with_inputs):
+    with maybe_colocate_with(self._outputs_grads[idx]):
+      # We only need to compute squared_inputs once
+      if self._squared_inputs is None:
+        inputs = self._inputs
+        if self._has_bias:
+          inputs = append_homog(self._inputs)
+        self._squared_inputs = math_ops.square(inputs)
+
       new_cov = math_ops.matmul(
           self._squared_inputs,
           math_ops.square(self._outputs_grads[idx]),
@@ -540,16 +658,13 @@ class FullyConnectedDiagonalFactor(DiagonalFactor):
 class ConvDiagonalFactor(DiagonalFactor):
   """FisherFactor for a diagonal approx of a convolutional layer's Fisher."""
 
-  # TODO(jamesmartens): add units tests for this class
-
   def __init__(self,
                inputs,
                outputs_grads,
                filter_shape,
                strides,
                padding,
-               has_bias=False,
-               colocate_cov_ops_with_inputs=False):
+               has_bias=False):
     """Creates a ConvDiagonalFactor object.
 
     Args:
@@ -564,53 +679,63 @@ class ConvDiagonalFactor(DiagonalFactor):
       padding: The padding in this layer (1-D of Tensor length 4).
       has_bias: Python bool. If True, the layer is assumed to have a bias
         parameter in addition to its filter parameter.
-      colocate_cov_ops_with_inputs: Whether to colocate cov_update ops with
-          their inputs.
     """
+    self._inputs = inputs
     self._filter_shape = filter_shape
+    self._strides = strides
+    self._padding = padding
     self._has_bias = has_bias
     self._outputs_grads = outputs_grads
-    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
-
-    self._orig_tensors_name = scope_string_from_name((inputs,)
-                                                     + tuple(outputs_grads))
-
-    # Note that we precompute the required operations on the inputs since the
-    # inputs don't change with the 'idx' argument to _compute_new_cov.  (Only
-    # the target entry of _outputs_grads changes with idx.)
-    with _maybe_colocate_with(inputs, self._colocate_cov_ops_with_inputs):
-      filter_height, filter_width, _, _ = self._filter_shape
-      patches = array_ops.extract_image_patches(
-          inputs,
-          ksizes=[1, filter_height, filter_width, 1],
-          strides=strides,
-          rates=[1, 1, 1, 1],
-          padding=padding)
-
-      if has_bias:
-        patches = _append_homog(patches)
-
-      self._patches = patches
+    self._patches = None
 
     super(ConvDiagonalFactor, self).__init__()
 
   @property
   def _var_scope(self):
-    return "ff_convdiag/" + self._orig_tensors_name
+    return "ff_convdiag/" + scope_string_from_name(
+        (self._inputs,) + tuple(self._outputs_grads))
 
   @property
   def _cov_shape(self):
     filter_height, filter_width, in_channels, out_channels = self._filter_shape
-    return [filter_height * filter_width * in_channels + self._has_bias,
-            out_channels]
+    return [
+        filter_height * filter_width * in_channels + self._has_bias,
+        out_channels
+    ]
 
   @property
   def _num_sources(self):
     return len(self._outputs_grads)
 
+  @property
+  def _dtype(self):
+    return self._outputs_grads[0].dtype
+
+  def make_covariance_update_op(self, ema_decay):
+    with maybe_colocate_with(self._inputs):
+      filter_height, filter_width, _, _ = self._filter_shape
+
+      # TODO(b/64144716): there is potential here for a big savings in terms
+      # of memory use.
+      patches = extract_image_patches(
+          self._inputs,
+          ksizes=[filter_height, filter_width],
+          strides=self._strides[1:-1],
+          padding=self._padding)
+
+      if self._has_bias:
+        patches = append_homog(patches)
+
+      self._patches = patches
+
+    op = super(ConvDiagonalFactor, self).make_covariance_update_op(ema_decay)
+
+    self._patches = None
+
+    return op
+
   def _compute_new_cov(self, idx=0):
-    with _maybe_colocate_with(self._outputs_grads[idx],
-                              self._colocate_cov_ops_with_inputs):
+    with maybe_colocate_with(self._outputs_grads[idx]):
       outputs_grad = self._outputs_grads[idx]
       batch_size = array_ops.shape(self._patches)[0]
 
@@ -634,23 +759,18 @@ class FullyConnectedKroneckerFactor(InverseProvidingFactor):
 
   def __init__(self,
                tensors,
-               has_bias=False,
-               colocate_cov_ops_with_inputs=False):
+               has_bias=False):
     """Instantiate FullyConnectedKroneckerFactor.
 
     Args:
       tensors: List of Tensors of shape [batch_size, n]. Represents either a
         layer's inputs or its output's gradients.
-      has_bias: bool. If True, assume this factor is for the layer's inputs and
-        append '1' to each row.
-      colocate_cov_ops_with_inputs: Whether to colocate cov_update ops with
-          their inputs.
+      has_bias: bool. If True, append '1' to each row.
     """
     # The tensor argument is either a tensor of input activations or a tensor of
     # output pre-activation gradients.
     self._has_bias = has_bias
     self._tensors = tensors
-    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
     super(FullyConnectedKroneckerFactor, self).__init__()
 
   @property
@@ -667,13 +787,16 @@ class FullyConnectedKroneckerFactor(InverseProvidingFactor):
   def _num_sources(self):
     return len(self._tensors)
 
+  @property
+  def _dtype(self):
+    return self._tensors[0].dtype
+
   def _compute_new_cov(self, idx=0):
-    with _maybe_colocate_with(self._tensors[idx],
-                              self._colocate_cov_ops_with_inputs):
+    with maybe_colocate_with(self._tensors[idx]):
       tensor = self._tensors[idx]
       if self._has_bias:
-        tensor = _append_homog(tensor)
-      return _compute_cov(tensor)
+        tensor = append_homog(tensor)
+      return compute_cov(tensor)
 
 
 class ConvInputKroneckerFactor(InverseProvidingFactor):
@@ -682,7 +805,7 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
   Estimates E[ a a^T ] where a is the inputs to a convolutional layer given
   example x. Expectation is taken over all examples and locations.
 
-  Equivalent to \Omega in https://arxiv.org/abs/1602.01407 for details. See
+  Equivalent to Omega in https://arxiv.org/abs/1602.01407 for details. See
   Section 3.1 Estimating the factors.
   """
 
@@ -691,8 +814,7 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
                filter_shape,
                strides,
                padding,
-               has_bias=False,
-               colocate_cov_ops_with_inputs=False):
+               has_bias=False):
     """Initializes ConvInputKroneckerFactor.
 
     Args:
@@ -704,15 +826,12 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
         width_stride, in_channel_stride].
       padding: str. Padding method for layer. "SAME" or "VALID".
       has_bias: bool. If True, append 1 to in_channel.
-      colocate_cov_ops_with_inputs: Whether to colocate cov_update ops with
-          their inputs.
     """
     self._filter_shape = filter_shape
     self._strides = strides
     self._padding = padding
     self._has_bias = has_bias
     self._inputs = inputs
-    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
     super(ConvInputKroneckerFactor, self).__init__()
 
   @property
@@ -732,27 +851,44 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
   def _num_sources(self):
     return 1
 
+  @property
+  def _dtype(self):
+    return self._inputs.dtype
+
   def _compute_new_cov(self, idx=0):
     if idx != 0:
       raise ValueError("ConvInputKroneckerFactor only supports idx = 0")
 
-    # TODO(jamesmartens): factor this patches stuff out into a utility function
-    with _maybe_colocate_with(self._inputs, self._colocate_cov_ops_with_inputs):
+    with maybe_colocate_with(self._inputs):
       filter_height, filter_width, in_channels, _ = self._filter_shape
-      patches = array_ops.extract_image_patches(
+
+      # TODO(b/64144716): there is potential here for a big savings in terms of
+      # memory use.
+      patches = extract_image_patches(
           self._inputs,
-          ksizes=[1, filter_height, filter_width, 1],
-          strides=self._strides,
-          rates=[1, 1, 1, 1],
+          ksizes=[filter_height, filter_width],
+          strides=self._strides[1:-1],
           padding=self._padding)
 
       flatten_size = (filter_height * filter_width * in_channels)
+      # patches_flat below is the matrix [[A_l]] from the KFC paper (tilde
+      # omitted over A for clarity). It has shape M|T| x J|Delta| (eq. 14),
+      # where M = minibatch size, |T| = number of spatial locations,
+      # |Delta| = number of spatial offsets, and J = number of input maps
+      # for convolutional layer l.
       patches_flat = array_ops.reshape(patches, [-1, flatten_size])
-
+      # We append a homogenous coordinate to patches_flat if the layer has
+      # bias parameters. This gives us [[A_l]]_H from the paper.
       if self._has_bias:
-        patches_flat = _append_homog(patches_flat)
-
-      return _compute_cov(patches_flat)
+        patches_flat = append_homog(patches_flat)
+      # We call compute_cov without passing in a normalizer. compute_cov uses
+      # the first dimension of patches_flat i.e. M|T| as the normalizer by
+      # default. Hence we end up computing 1/M|T| * [[A_l]]^T [[A_l]], with
+      # shape J|Delta| x J|Delta|. This is related to hat{Omega}_l from
+      # the paper but has a different scale here for consistency with
+      # ConvOutputKroneckerFactor.
+      # (Tilde omitted over A for clarity.)
+      return compute_cov(patches_flat)
 
 
 class ConvOutputKroneckerFactor(InverseProvidingFactor):
@@ -762,22 +898,19 @@ class ConvOutputKroneckerFactor(InverseProvidingFactor):
   given example x and ds = (d / d s) log(p(y|x, w)). Expectation is taken over
   all examples and locations.
 
-  Equivalent to \Gamma in https://arxiv.org/abs/1602.01407 for details. See
+  Equivalent to Gamma in https://arxiv.org/abs/1602.01407 for details. See
   Section 3.1 Estimating the factors.
   """
 
-  def __init__(self, outputs_grads, colocate_cov_ops_with_inputs=False):
+  def __init__(self, outputs_grads):
     """Initializes ConvOutputKroneckerFactor.
 
     Args:
       outputs_grads: list of Tensors. Each Tensor is of shape
           [batch_size, height, width, out_channels].
-      colocate_cov_ops_with_inputs: Whether to colocate cov_update ops with
-          their inputs.
     """
     self._out_channels = outputs_grads[0].shape.as_list()[3]
     self._outputs_grads = outputs_grads
-    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
     super(ConvOutputKroneckerFactor, self).__init__()
 
   @property
@@ -793,9 +926,292 @@ class ConvOutputKroneckerFactor(InverseProvidingFactor):
   def _num_sources(self):
     return len(self._outputs_grads)
 
+  @property
+  def _dtype(self):
+    return self._outputs_grads[0].dtype
+
   def _compute_new_cov(self, idx=0):
-    with _maybe_colocate_with(self._outputs_grads[idx],
-                              self._colocate_cov_ops_with_inputs):
+    with maybe_colocate_with(self._outputs_grads[idx]):
+      # reshaped_tensor below is the matrix DS_l defined in the KFC paper
+      # (tilde omitted over S for clarity). It has shape M|T| x I, where
+      # M = minibatch size, |T| = number of spatial locations, and
+      # I = number of output maps for convolutional layer l.
       reshaped_tensor = array_ops.reshape(self._outputs_grads[idx],
                                           [-1, self._out_channels])
-      return _compute_cov(reshaped_tensor)
+      # Following the reasoning in ConvInputKroneckerFactor._compute_new_cov,
+      # compute_cov here returns 1/M|T| * DS_l^T DS_l = hat{Gamma}_l
+      # as defined in the paper, with shape I x I.
+      # (Tilde omitted over S for clarity.)
+      return compute_cov(reshaped_tensor)
+
+
+class FullyConnectedMultiKF(InverseProvidingFactor):
+  """Kronecker factor for a fully connected recurrent layer."""
+
+  def __init__(self,
+               tensor_lists,
+               has_bias=False):
+    """Constructs a new `FullyConnectedMultiKF`.
+
+    Args:
+      tensor_lists: List of lists of Tensors of shape [batch_size, n].
+      has_bias: bool. If True, '1' is appended to each row.
+    """
+
+    self._tensor_lists = tensor_lists
+    self._has_bias = has_bias
+    self._batch_size = array_ops.shape(tensor_lists[0][0])[0]
+    self._num_timesteps = len(tensor_lists[0])
+    self._tensors = [None] * len(tensor_lists)
+
+    self._cov_dt1 = None
+    self._option1quants_by_damping = {}
+    self._option2quants_by_damping = {}
+
+    super(FullyConnectedMultiKF, self).__init__()
+
+  @property
+  def _var_scope(self):
+    return "ff_fc_multi/" + scope_string_from_params(self._tensor_lists)
+
+  @property
+  def _num_sources(self):
+    return len(self._tensor_lists)
+
+  @property
+  def _dtype(self):
+    return self._tensor_lists[0][0].dtype
+
+  def make_covariance_update_op(self, ema_decay):
+
+    op = super(FullyConnectedMultiKF, self).make_covariance_update_op(ema_decay)
+
+    if self._cov_dt1 is not None:
+      new_cov_dt1_contribs = tuple(self._compute_new_cov_dt1(idx)
+                                   for idx in range(self._num_sources))
+
+      with maybe_colocate_with(new_cov_dt1_contribs[0]):
+        new_cov_dt1 = math_ops.add_n(new_cov_dt1_contribs)
+
+        op2 = moving_averages.assign_moving_average(
+            self._cov_dt1, new_cov_dt1, ema_decay, zero_debias=ZERO_DEBIAS)
+
+        # TODO(b/69112164):
+        # It's important that _cov and _cov_dt1 remain consistent with each
+        # other while the inverse ops are happening. How can we ensure this?
+        # We will need to add explicit synchronization for this to
+        # work with asynchronous training.
+        op = control_flow_ops.group(op, op2)
+
+    return op
+
+  def _compute_new_cov(self, idx=0):
+    with maybe_colocate_with(self._tensor_lists[idx]):
+      tensor = array_ops.concat(self._tensor_lists[idx], 0)
+      if self._has_bias:
+        tensor = append_homog(tensor)
+      # We save these so they can be used by _compute_new_cov_dt1
+      self._tensors[idx] = tensor
+      return compute_cov(tensor)
+
+  def _compute_new_cov_dt1(self, idx=0):
+    tensor = self._tensors[idx]
+    with maybe_colocate_with(tensor):
+      # Is there a more elegant way to do this computation?
+      tensor_present = tensor[:-self._batch_size, :]
+      tensor_future = tensor[self._batch_size:, :]
+      # We specify a normalizer for this computation to ensure a PSD Fisher
+      # block estimate.  This is equivalent to padding with zeros, as was done
+      # in Section B.2 of the appendix.
+      normalizer = self._num_timesteps * self._batch_size
+      return compute_cov(
+          tensor_future, tensor_right=tensor_present, normalizer=normalizer)
+
+  @property
+  def _cov_shape(self):
+    size = self._tensor_lists[0][0].shape[1] + self._has_bias
+    return [size, size]
+
+  @property
+  def _vec_shape(self):
+    size = self._tensor_lists[0][0].shape[1] + self._has_bias
+    return [size]
+
+  def get_option1quants(self, damping):
+    return self._option1quants_by_damping[damping]
+
+  def get_option2quants(self, damping):
+    return self._option2quants_by_damping[damping]
+
+  def get_cov_dt1(self):
+    assert self._cov_dt1 is not None
+    return self._cov_dt1
+
+  def register_cov_dt1(self):
+    """Create a variable representing temporal cross-covariance.
+
+    (This is technically the second moment, not covariance, since it's
+    not mean subtracted.)
+    """
+    if self._cov_dt1 is None:
+      with variable_scope.variable_scope(self._var_scope):
+        self._cov_dt1 = variable_scope.get_variable(
+            "cov_dt1",
+            initializer=init_ops.zeros_initializer,
+            shape=self._cov_shape,
+            trainable=False,
+            dtype=self._dtype)
+
+  def register_option1quants(self, damping):
+
+    self.register_cov_dt1()
+
+    if damping not in self._option1quants_by_damping:
+      # It's questionable as to whether we should initialize with stuff like
+      # this at all.  Ideally these values should never be used until they are
+      # updated at least once.
+      damping_string = scalar_or_tensor_to_string(damping)
+      with variable_scope.variable_scope(self._var_scope):
+        Lmat = variable_scope.get_variable(  # pylint: disable=invalid-name
+            "Lmat_damp{}".format(damping_string),
+            initializer=inverse_initializer,
+            shape=self._cov_shape,
+            trainable=False,
+            dtype=self._dtype)
+        psi = variable_scope.get_variable(
+            "psi_damp{}".format(damping_string),
+            initializer=init_ops.ones_initializer,
+            shape=self._vec_shape,
+            trainable=False,
+            dtype=self._dtype)
+
+      self._option1quants_by_damping[damping] = (Lmat, psi)
+
+  def register_option2quants(self, damping):
+
+    self.register_cov_dt1()
+
+    if damping not in self._option2quants_by_damping:
+      # It's questionable as to whether we should initialize with stuff like
+      # this at all.  Ideally these values should never be used until they are
+      # updated at least once.
+      damping_string = scalar_or_tensor_to_string(damping)
+      with variable_scope.variable_scope(self._var_scope):
+        Pmat = variable_scope.get_variable(  # pylint: disable=invalid-name
+            "Lmat_damp{}".format(damping_string),
+            initializer=inverse_initializer,
+            shape=self._cov_shape,
+            trainable=False,
+            dtype=self._dtype)
+        Kmat = variable_scope.get_variable(  # pylint: disable=invalid-name
+            "Kmat_damp{}".format(damping_string),
+            initializer=inverse_initializer,
+            shape=self._cov_shape,
+            trainable=False,
+            dtype=self._dtype)
+        mu = variable_scope.get_variable(
+            "mu_damp{}".format(damping_string),
+            initializer=init_ops.ones_initializer,
+            shape=self._vec_shape,
+            trainable=False,
+            dtype=self._dtype)
+
+      self._option2quants_by_damping[damping] = (Pmat, Kmat, mu)
+
+  def make_inverse_update_ops(self):
+    """Create and return update ops corresponding to registered computations."""
+    # TODO(b/69918258): Add correctness tests for this method.
+    # pylint: disable=invalid-name
+
+    ops = super(FullyConnectedMultiKF, self).make_inverse_update_ops()
+
+    if (len(self._option1quants_by_damping) +
+        len(self._option2quants_by_damping)):
+
+      # Note that C0 and C1 are stand-ins for A0 and A1, or G0 and G1, from
+      # the pseudo-code in the original paper.  Because the computations for
+      # the A and G case are essentially the same they can both be performed by
+      # the same class (this one).
+
+      C1 = self.get_cov_dt1()
+
+      # Get the eigendecomposition of C0  (= self.get_cov())
+      eigen_e, eigen_V = self.get_eigendecomp()
+
+      # TODO(b/69678661): Note, there is an implicit assumption here that C1
+      # and C0 (as represented here by its eigen-decomp) are consistent.  This
+      # could fail to be the case if self._cov and self._cov_dt1 are not updated
+      # consistently, or are somehow read between or during the cov updates.
+      # Can this possibly happen?  Is there a way to prevent it?
+
+      for damping, (Lmat_var,
+                    psi_var) in self._option1quants_by_damping.items():
+
+        invsqrtC0 = math_ops.matmul(
+            eigen_V * (eigen_e + damping)**(-0.5), eigen_V, transpose_b=True)
+
+        # Might need to enforce symmetry lost due to numerical issues.
+        invsqrtC0 = (invsqrtC0 + array_ops.transpose(invsqrtC0)) / 2.0
+
+        # The following line imposses the symmetry assumed by "Option 1" on C1.
+        # Stangely the code can work okay with this line commented out,
+        # depending on how psd_eig is defined.  I'm not sure why.
+        C1 = (C1 + array_ops.transpose(C1)) / 2.0
+
+        # hPsi = C0^(-1/2) * C1 * C0^(-1/2)  (hPsi means hat{Psi})
+        hPsi = math_ops.matmul(math_ops.matmul(invsqrtC0, C1), invsqrtC0)
+
+        # Compute the decomposition U*diag(psi)*U^T = hPsi
+        psi, U = utils.posdef_eig(hPsi)
+
+        # L = C0^(-1/2) * U
+        Lmat = math_ops.matmul(invsqrtC0, U)
+
+        ops.append(Lmat_var.assign(Lmat))
+        ops.append(psi_var.assign(psi))
+
+      for damping, (Pmat_var, Kmat_var,
+                    mu_var) in self._option2quants_by_damping.items():
+
+        # compute C0^(-1/2)
+        invsqrtC0 = math_ops.matmul(
+            eigen_V * (eigen_e + damping)**(-0.5), eigen_V, transpose_b=True)
+
+        # Might need to enforce symmetry lost due to numerical issues.
+        invsqrtC0 = (invsqrtC0 + array_ops.transpose(invsqrtC0)) / 2.0
+
+        # Compute the product C0^(-1/2) * C1
+        invsqrtC0C1 = math_ops.matmul(invsqrtC0, C1)
+
+        # hPsi = C0^(-1/2) * C1 * C0^(-1/2)  (hPsi means hat{Psi})
+        hPsi = math_ops.matmul(invsqrtC0C1, invsqrtC0)
+
+        # Compute the decomposition E*diag(mu)*E^T = hPsi^T * hPsi
+        # Note that we using the notation mu instead of "m" for the eigenvalues.
+        # Instead of computing the product hPsi^T * hPsi and then doing an
+        # eigen-decomposition of this we just compute the SVD of hPsi and then
+        # square the singular values to get the eigenvalues. For a justification
+        # of this approach, see:
+        # https://en.wikipedia.org/wiki/Singular-value_decomposition#Relation_to_eigenvalue_decomposition
+        sqrtmu, _, E = linalg_ops.svd(hPsi)
+        mu = math_ops.square(sqrtmu)
+
+        # Mathematically, the eigenvalues should not should not exceed 1.0, but
+        # due to numerical issues, or possible issues with inconsistent
+        # values of C1 and (the eigen-decomposition of) C0 they might. So
+        # we enforce this condition.
+        mu = math_ops.minimum(mu, 1.0)
+
+        # P = (C0^(-1/2) * C1)^T * C0^(-1/2) = C_1^T * C_0^(-1)
+        Pmat = math_ops.matmul(invsqrtC0C1, invsqrtC0, transpose_a=True)
+
+        # K = C_0^(-1/2) * E
+        Kmat = math_ops.matmul(invsqrtC0, E)
+
+        ops.append(Pmat_var.assign(Pmat))
+        ops.append(Kmat_var.assign(Kmat))
+        ops.append(mu_var.assign(mu))
+
+    return [control_flow_ops.group(*ops)]
+
+    # pylint: enable=invalid-name
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_factors_lib.py b/tensorflow/contrib/kfac/python/ops/fisher_factors_lib.py
index 23ee93cd405bbf719939df89d525c812ee061f8b..ad93919149c287b1932dd2b6bd772c0dab26192d 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_factors_lib.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_factors_lib.py
@@ -41,6 +41,9 @@ _allowed_symbols = [
     "ConvOutputKroneckerFactor",
     "ConvDiagonalFactor",
     "set_global_constants",
+    "maybe_colocate_with",
+    "compute_cov",
+    "append_homog"
 ]
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/kfac/python/ops/layer_collection.py b/tensorflow/contrib/kfac/python/ops/layer_collection.py
index 3a005ee39dd9400c21ae6c41fad5351d7fff2aac..8d450f04f379701e46a18b2e34bbbd6fcfcce2bb 100644
--- a/tensorflow/contrib/kfac/python/ops/layer_collection.py
+++ b/tensorflow/contrib/kfac/python/ops/layer_collection.py
@@ -26,7 +26,9 @@ from __future__ import print_function
 
 from collections import defaultdict
 from collections import OrderedDict
+from functools import partial
 
+import math
 import six
 
 from tensorflow.contrib.kfac.python.ops import fisher_blocks as fb
@@ -57,20 +59,22 @@ _CONV2D_APPROX_TO_BLOCK_TYPES = {
     APPROX_DIAGONAL_NAME: fb.ConvDiagonalFB,
 }
 
+APPROX_KRONECKER_INDEP_NAME = "kron_indep"
+APPROX_KRONECKER_SERIES_1_NAME = "kron_series_1"
+APPROX_KRONECKER_SERIES_2_NAME = "kron_series_2"
+
+_FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES = {
+    APPROX_KRONECKER_INDEP_NAME: fb.FullyConnectedMultiIndepFB,
+    APPROX_KRONECKER_SERIES_1_NAME: partial(fb.FullyConnectedSeriesFB,
+                                            option=1),
+    APPROX_KRONECKER_SERIES_2_NAME: partial(fb.FullyConnectedSeriesFB,
+                                            option=2)
+}
+
 # Possible value for 'reuse' keyword argument. Sets 'reuse' to
 # tf.get_variable_scope().reuse.
 VARIABLE_SCOPE = "VARIABLE_SCOPE"
 
-# TODO(jamesmartens): need to add find_canonical_output back into this somewhere
-
-
-def ensure_sequence(obj):
-  """If `obj` isn't a tuple or list, return a tuple containing `obj`."""
-  if isinstance(obj, (tuple, list)):
-    return obj
-  else:
-    return (obj,)
-
 
 class LayerParametersDict(OrderedDict):
   """An OrderedDict where keys are Tensors or tuples of Tensors.
@@ -130,7 +134,6 @@ class LayerCollection(object):
 
   def __init__(self,
                graph=None,
-               colocate_cov_ops_with_inputs=False,
                name="LayerCollection"):
     self.fisher_blocks = LayerParametersDict()
     self.fisher_factors = OrderedDict()
@@ -142,7 +145,8 @@ class LayerCollection(object):
     self._default_generic_approximation = APPROX_FULL_NAME
     self._default_fully_connected_approximation = APPROX_KRONECKER_NAME
     self._default_convolution_2d_approximation = APPROX_KRONECKER_NAME
-    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
+    self._default_fully_connected_multi_approximation = (
+        APPROX_KRONECKER_SERIES_2_NAME)
 
     with variable_scope.variable_scope(None, default_name=name) as scope:
       self._var_scope = scope.name
@@ -152,19 +156,13 @@ class LayerCollection(object):
     """LossFunctions registered with this LayerCollection."""
     return list(self._loss_dict.values())
 
-  def is_variable_registered(self, variable):
-    """Checks whether the variable has already been registered.
-
-    Args:
-      variable: A single variable or tensor.
-    Returns:
-      True if the variable has been registered either by itself or as part of a
-      tuple.
-    """
-    return any([
-        variable in key if isinstance(key, (tuple, list)) else variable == key
-        for key in self.fisher_blocks.keys()
-    ])
+  @property
+  def registered_variables(self):
+    """A tuple of all of the variables currently registered."""
+    tuple_of_tuples = (utils.ensure_sequence(key) for key, block
+                       in six.iteritems(self.fisher_blocks))
+    flat_tuple = tuple(item for tuple_ in tuple_of_tuples for item in tuple_)
+    return flat_tuple
 
   @property
   def linked_parameters(self):
@@ -213,6 +211,16 @@ class LayerCollection(object):
               value))
     self._default_convolution_2d_approximation = value
 
+  @property
+  def default_fully_connected_multi_approximation(self):
+    return self._default_fully_connected_multi_approximation
+
+  def set_default_fully_connected_multi_approximation(self, value):
+    if value not in _FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES:
+      raise ValueError("{} is not a valid approximation for a fully-connected "
+                       "multi layer.".format(value))
+    self._default_fully_connected_multi_approximation = value
+
   def register_block(self, layer_key, fisher_block, reuse=VARIABLE_SCOPE):
     """Validates and registers the layer_key associated with the fisher_block.
 
@@ -221,7 +229,7 @@ class LayerCollection(object):
           existing registrations and to register if valid.
       fisher_block: The associated `FisherBlock`.
       reuse: Method to use for inserting new `FisherBlock`s. One of True, False,
-        or VARIABLE_SCOPE.
+        or 'VARIABLE_SCOPE'.
 
     Raises:
       ValueError: If `layer_key` was already registered and reuse is `False`,
@@ -258,9 +266,9 @@ class LayerCollection(object):
     variable_to_block = {
         var: (params, block)
         for (params, block) in self.fisher_blocks.items()
-        for var in ensure_sequence(params)
+        for var in utils.ensure_sequence(params)
     }
-    for variable in ensure_sequence(layer_key):
+    for variable in utils.ensure_sequence(layer_key):
       if variable in variable_to_block:
         prev_key, prev_block = variable_to_block[variable]
         raise ValueError(
@@ -272,13 +280,65 @@ class LayerCollection(object):
 
   def get_use_count_map(self):
     """Returns a dict of variables to their number of registrations."""
+    # TODO(b/70283403): Reimplement this in the old way, where each
+    # registration function would be responsible for incrementing the count.
+    # Also, this version has a bug: it won't do the right thing for generic
+    # registration for parameters that are shared.  i.e. it won't set the use
+    # count to infinity.
     vars_to_uses = defaultdict(int)
     for key, block in six.iteritems(self.fisher_blocks):
-      key = key if isinstance(key, (tuple, list)) else (key,)
+      n = (
+          block.num_inputs()*block.num_registered_minibatches if isinstance(
+              block, (fb.FullyConnectedSeriesFB, fb.FullyConnectedMultiIndepFB))
+          else block.num_registered_minibatches)
+      key = utils.ensure_sequence(key)
       for k in key:
-        vars_to_uses[k] += block.num_registered_minibatches
+        vars_to_uses[k] += n
     return vars_to_uses
 
+  def check_registration(self, variables):
+    """Checks that all variable uses have been registered properly.
+
+    Args:
+      variables: List of variables.
+
+    Raises:
+      ValueError: If any registered variables are not included in the list.
+      ValueError: If any variable in the list is not registered.
+      ValueError: If any variable in the list is registered with the wrong
+          number of "uses" in the subgraph recorded (vs the number of times that
+          variable is actually used in the subgraph).
+    """
+    # Note that overlapping parameters (i.e. those that share variables) will
+    # be caught by layer_collection.LayerParametersDict during registration.
+
+    reg_use_map = self.get_use_count_map()
+
+    error_messages = []
+
+    for var in variables:
+      total_uses = self.subgraph.variable_uses(var)
+      reg_uses = reg_use_map[var]
+
+      if reg_uses == 0:
+        error_messages.append("Variable {} not registered.".format(var))
+      elif (not math.isinf(reg_uses)) and reg_uses != total_uses:
+        error_messages.append(
+            "Variable {} registered with wrong number of uses ({} "
+            "registrations vs {} uses).".format(var, reg_uses, total_uses))
+
+    num_get_vars = len(reg_use_map)
+
+    if num_get_vars > len(variables):
+      error_messages.append("{} registered variables were not included in list."
+                            .format(num_get_vars - len(variables)))
+
+    if error_messages:
+      error_messages = [
+          "Found the following errors with variable registration:"
+      ] + error_messages
+      raise ValueError("\n\t".join(error_messages))
+
   def get_blocks(self):
     return self.fisher_blocks.values()
 
@@ -312,12 +372,12 @@ class LayerCollection(object):
       ValueError: If the parameters were already registered in a layer or
         identified as part of an incompatible group.
     """
-    params = frozenset(ensure_sequence(params))
+    params = frozenset(utils.ensure_sequence(params))
 
     # Check if any of the variables in 'params' is already in
     # 'self.fisher_blocks.keys()'.
     for registered_params, fisher_block in self.fisher_blocks.items():
-      registered_params_set = set(ensure_sequence(registered_params))
+      registered_params_set = set(utils.ensure_sequence(registered_params))
       for variable in params:
         if (variable in registered_params_set and
             params != registered_params_set):
@@ -351,7 +411,7 @@ class LayerCollection(object):
 
   def _get_linked_approx(self, params):
     """If params were linked, return their specified approximation."""
-    params_set = frozenset(ensure_sequence(params))
+    params_set = frozenset(utils.ensure_sequence(params))
     if params_set in self.linked_parameters:
       return self.linked_parameters[params_set]
     else:
@@ -370,11 +430,11 @@ class LayerCollection(object):
         this layer. Weight matrix should have shape [input_size, output_size].
         Bias should have shape [output_size].
       inputs: Tensor of shape [batch_size, input_size]. Inputs to layer.
-      outputs: Tensor of shape [batch_size, output_size]. Preactivations
+      outputs: Tensor of shape [batch_size, output_size]. Outputs
         produced by layer.
-      approx: str. One of APPROX_KRONECKER_NAME or APPROX_DIAGONAL_NAME.
+      approx: str. One of "kron" or "diagonal".
       reuse: bool or str.  If True, reuse an existing FisherBlock. If False,
-        create a new FisherBlock.  If VARIABLE_SCOPE, use
+        create a new FisherBlock.  If "VARIABLE_SCOPE", use
         tf.get_variable_scope().reuse.
 
     Raises:
@@ -416,10 +476,10 @@ class LayerCollection(object):
       inputs: Tensor of shape [batch_size, height, width, in_channels]. Inputs
         to layer.
       outputs: Tensor of shape [batch_size, height, width, out_channels].
-        Preactivations produced by layer.
-      approx: str. One of APPROX_KRONECKER_NAME or APPROX_DIAGONAL_NAME.
+        Output produced by layer.
+      approx: str. One of "kron" or "diagonal".
       reuse: bool or str.  If True, reuse an existing FisherBlock. If False,
-        create a new FisherBlock.  If VARIABLE_SCOPE, use
+        create a new FisherBlock.  If "VARIABLE_SCOPE", use
         tf.get_variable_scope().reuse.
 
     Raises:
@@ -449,14 +509,11 @@ class LayerCollection(object):
     """Registers a generic layer.
 
     Args:
-      params: Tensor or 2-tuple of Tensors corresponding to weight and bias of
-        this layer. Weight matrix should have shape [kernel_height,
-        kernel_width, in_channels, out_channels].  Bias should have shape
-        [out_channels].
+      params: Tensor or tuple of Tensors corresponding to the parameters.
       batch_size: 0-D Tensor. Size of the minibatch.
-      approx: str. One of APPROX_KRONECKER_NAME or APPROX_DIAGONAL_NAME.
+      approx: str. One of "full" or "diagonal".
       reuse: bool or str.  If True, reuse an existing FisherBlock. If False,
-        create a new FisherBlock.  If VARIABLE_SCOPE, use
+        create a new FisherBlock.  If "VARIABLE_SCOPE", use
         tf.get_variable_scope().reuse.
 
     Raises:
@@ -477,6 +534,47 @@ class LayerCollection(object):
     block = self.register_block(params, block_type(self, params), reuse=reuse)
     block.register_additional_minibatch(batch_size)
 
+  def register_fully_connected_multi(self, params, inputs, outputs,
+                                     approx=None):
+    """Register fully connected layers with shared parameters.
+
+    This can handle general fully-connected layers with shared parameters, but
+    has specialized approximations to deal with the case where there is a
+    meaningful linear order to the share instances (such as in an RNN).
+
+    Args:
+      params: Tensor or 2-tuple of Tensors corresponding to weight and bias of
+        this layer. Weight matrix should have shape [input_size, output_size].
+        Bias should have shape [output_size].
+      inputs: A list of tensors, each of shape [batch_size, input_size]. Inputs
+        to layer. In the case of RNNs, one Tensor per time step.
+      outputs: A list of tensors, the same length as 'inputs', each of shape
+        [batch_size, output_size]. Outputs produced by layer. In the case of
+        RNNs, one Tensor per time step.
+      approx: str. One of "kron_indep", "kron_series_1", or "kron_series_2".
+
+    Raises:
+      ValueError: For improper value to 'approx'.
+    """
+    if approx is None:
+      approx = self._get_linked_approx(params)
+      if approx is None:
+        approx = self.default_fully_connected_multi_approximation
+    has_bias = isinstance(params, (tuple, list))
+
+    # TODO(b/70283649): something along the lines of find_canonical_output
+    # should be added back in here (and for the other block types, arguably).
+
+    if approx not in _FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES:
+      raise ValueError("Bad value {} for approx.".format(approx))
+    block_type = _FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES[approx]
+
+    # For now we don't support multiple minibatches for this type of layer, so
+    # we set reuse=False
+    self.register_block(params,
+                        block_type(self, inputs, outputs, has_bias=has_bias),
+                        reuse=False)
+
   def register_categorical_predictive_distribution(self,
                                                    logits,
                                                    seed=None,
@@ -619,7 +717,6 @@ class LayerCollection(object):
 
     key = cls, args
     if key not in self.fisher_factors:
-      colo = self._colocate_cov_ops_with_inputs
       with variable_scope.variable_scope(self._var_scope):
-        self.fisher_factors[key] = cls(*args, colocate_cov_ops_with_inputs=colo)
+        self.fisher_factors[key] = cls(*args)
     return self.fisher_factors[key]
diff --git a/tensorflow/contrib/kfac/python/ops/layer_collection_lib.py b/tensorflow/contrib/kfac/python/ops/layer_collection_lib.py
index d6bf61a210203dd74d4e93b65005f660b1fab4ff..f8aa230d9ca1f542950f56b1e6cf1ab7ccd3d05f 100644
--- a/tensorflow/contrib/kfac/python/ops/layer_collection_lib.py
+++ b/tensorflow/contrib/kfac/python/ops/layer_collection_lib.py
@@ -36,6 +36,9 @@ _allowed_symbols = [
     "APPROX_DIAGONAL_NAME",
     "APPROX_FULL_NAME",
     "VARIABLE_SCOPE",
+    "APPROX_KRONECKER_INDEP_NAME",
+    "APPROX_KRONECKER_SERIES_1_NAME",
+    "APPROX_KRONECKER_SERIES_2_NAME"
 ]
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/kfac/python/ops/loss_functions.py b/tensorflow/contrib/kfac/python/ops/loss_functions.py
index e2e5bc3ffea3e52087c24802948bc8260e3b199a..cb3e698b9ceab920785adf735f88bd8e535a628f 100644
--- a/tensorflow/contrib/kfac/python/ops/loss_functions.py
+++ b/tensorflow/contrib/kfac/python/ops/loss_functions.py
@@ -22,6 +22,7 @@ import abc
 
 import six
 
+from tensorflow.contrib.distributions.python.ops import onehot_categorical
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -91,13 +92,13 @@ class LossFunction(object):
 
   @abc.abstractmethod
   def _evaluate(self, targets):
-    """Evaluates the log probability of the targets.
+    """Evaluates the negative log probability of the targets.
 
     Args:
       targets: Tensor that distribution can calculate log_prob() of.
 
     Returns:
-      log probability of each target, summed across all targets.
+      negative log probability of each target, summed across all targets.
     """
     pass
 
@@ -659,19 +660,20 @@ class CategoricalLogitsNegativeLogProbLoss(DistributionNegativeLogProbLoss,
 
   def multiply_fisher(self, vector):
     probs = self._probs
-    return vector * probs - math_ops.reduce_sum(vector * probs, axis=1) * probs
+    return vector * probs - probs * math_ops.reduce_sum(
+        vector * probs, axis=-1, keep_dims=True)
 
   def multiply_fisher_factor(self, vector):
     probs = self._probs
     sqrt_probs = self._sqrt_probs
     return sqrt_probs * vector - probs * math_ops.reduce_sum(
-        sqrt_probs * vector, axis=1, keep_dims=True)
+        sqrt_probs * vector, axis=-1, keep_dims=True)
 
   def multiply_fisher_factor_transpose(self, vector):
     probs = self._probs
     sqrt_probs = self._sqrt_probs
     return sqrt_probs * vector - sqrt_probs * math_ops.reduce_sum(
-        probs * vector, axis=1, keep_dims=True)
+        probs * vector, axis=-1, keep_dims=True)
 
   def multiply_fisher_factor_replicated_one_hot(self, index):
     assert len(index) == 1, "Length of index was {}".format(len(index))
@@ -785,3 +787,16 @@ def insert_slice_in_zeros(slice_to_insert, dim, dim_size, position):
   after[dim] = dim_size - position - 1
 
   return array_ops.pad(slice_to_insert, list(zip(before, after)))
+
+
+class OnehotCategoricalLogitsNegativeLogProbLoss(
+    CategoricalLogitsNegativeLogProbLoss):
+  """Neg log prob loss for a categorical distribution with onehot targets.
+
+  Identical to CategoricalLogitsNegativeLogProbLoss except that the underlying
+  distribution is OneHotCategorical as opposed to Categorical.
+  """
+
+  @property
+  def dist(self):
+    return onehot_categorical.OneHotCategorical(logits=self._logits)
diff --git a/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py b/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py
index e9bb4f14e9e24128382832fcdaccdc9b24017046..705a871d482565897e7ac850327729a6186f1746 100644
--- a/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py
+++ b/tensorflow/contrib/kfac/python/ops/loss_functions_lib.py
@@ -31,6 +31,7 @@ _allowed_symbols = [
     "NormalMeanNegativeLogProbLoss",
     "NormalMeanVarianceNegativeLogProbLoss",
     "CategoricalLogitsNegativeLogProbLoss",
+    "OnehotCategoricalLogitsNegativeLogProbLoss",
     "MultiBernoulliNegativeLogProbLoss",
     "MultiBernoulliNegativeLogProbLoss",
     "insert_slice_in_zeros",
diff --git a/tensorflow/contrib/kfac/python/ops/op_queue.py b/tensorflow/contrib/kfac/python/ops/op_queue.py
index 831870fca451c585cb1a1dc6b24aad757e2bbaa8..b6d9d37a31a949b154b79e6f3677289a0d167373 100644
--- a/tensorflow/contrib/kfac/python/ops/op_queue.py
+++ b/tensorflow/contrib/kfac/python/ops/op_queue.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import ops as tf_ops
 
 
diff --git a/tensorflow/contrib/kfac/python/ops/optimizer.py b/tensorflow/contrib/kfac/python/ops/optimizer.py
index ecf7f3e4e5ab7d9c151f760fdab733bc3830e37b..1974b07acfc879dc4bc844db9af88fd1043d6698 100644
--- a/tensorflow/contrib/kfac/python/ops/optimizer.py
+++ b/tensorflow/contrib/kfac/python/ops/optimizer.py
@@ -41,12 +41,12 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
                damping,
                layer_collection,
                var_list=None,
-               momentum=0.,
+               momentum=0.9,
                momentum_type="regular",
                norm_constraint=None,
                name="KFAC",
                estimation_mode="gradients",
-               colocate_gradients_with_ops=False,
+               colocate_gradients_with_ops=True,
                cov_devices=None,
                inv_devices=None):
     """Initializes the KFAC optimizer with the given settings.
@@ -70,8 +70,8 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
       var_list: Optional list or tuple of variables to train. Defaults to the
           list of variables collected in the graph under the key
           `GraphKeys.TRAINABLE_VARIABLES`.
-      momentum: The momentum value for this optimizer. Only applies when
-          momentum_type is 'regular' or 'adam'. (Default: 0)
+      momentum: The momentum decay constant to use. Only applies when
+          momentum_type is 'regular' or 'adam'. (Default: 0.9)
       momentum_type: The type of momentum to use in this optimizer, one of
           'regular', 'adam', or 'qmodel'. (Default: 'regular')
       norm_constraint: float or Tensor. If specified, the update is scaled down
@@ -85,6 +85,7 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
           more a more detailed description of these options.
       colocate_gradients_with_ops: Whether we should request gradients we
           compute in the estimator be colocated with their respective ops.
+          (Default: True)
       cov_devices: Iterable of device strings (e.g. '/gpu:0'). Covariance
           computations will be placed on these devices in a round-robin fashion.
           Can be None, which means that no devices are specified.
@@ -136,12 +137,32 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
     self._batch_size = array_ops.shape(layer_collection.losses[0].inputs)[0]
     self._losses = layer_collection.losses
 
-    self.cov_update_op = self._fisher_est.cov_update_op
-    self.inv_update_op = self._fisher_est.inv_update_op
-    self.inv_updates_dict = self._fisher_est.inv_updates_dict
-
     super(KfacOptimizer, self).__init__(learning_rate, name=name)
 
+  @property
+  def cov_update_thunks(self):
+    return self._fisher_est.cov_update_thunks
+
+  @property
+  def cov_update_ops(self):
+    return self._fisher_est.cov_update_ops
+
+  @property
+  def cov_update_op(self):
+    return self._fisher_est.cov_update_op
+
+  @property
+  def inv_update_thunks(self):
+    return self._fisher_est.inv_update_thunks
+
+  @property
+  def inv_update_ops(self):
+    return self._fisher_est.inv_update_ops
+
+  @property
+  def inv_update_op(self):
+    return self._fisher_est.inv_update_op
+
   @property
   def variables(self):
     return self._fisher_est.variables
diff --git a/tensorflow/contrib/kfac/python/ops/utils.py b/tensorflow/contrib/kfac/python/ops/utils.py
index d5461c9f2ea0512ad7c4f2d393ac8e7f441d1b77..e89508fa46b6e2ce278e5373e6c9d17203ad1ef2 100644
--- a/tensorflow/contrib/kfac/python/ops/utils.py
+++ b/tensorflow/contrib/kfac/python/ops/utils.py
@@ -20,16 +20,22 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.contrib.tpu.python.ops import tpu_ops
+from tensorflow.contrib.tpu.python.tpu import tpu_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
 
 # Method used for inverting matrices.
 POSDEF_INV_METHOD = "cholesky"
+POSDEF_EIG_METHOD = "self_adjoint"
 
 
 def set_global_constants(posdef_inv_method=None):
@@ -161,33 +167,11 @@ def mat2d_to_layer_params(vector_template, mat2d):
     return array_ops.reshape(mat2d, vector_template.shape)
 
 
-def compute_pi(left_factor, right_factor):
-  """Computes the scalar constant pi for Tikhonov regularization/damping.
-
-  pi = sqrt( (trace(A) / dim(A)) / (trace(B) / dim(B)) )
-  See section 6.3 of https://arxiv.org/pdf/1503.05671.pdf for details.
-
-  Args:
-    left_factor: The left Kronecker factor Tensor.
-    right_factor: The right Kronecker factor Tensor.
-
-  Returns:
-    The computed scalar constant pi for these Kronecker Factors (as a Tensor).
-  """
-  # Instead of dividing by the dim of the norm, we multiply by the dim of the
-  # other norm. This works out the same in the ratio.
-  left_norm = math_ops.trace(left_factor) * right_factor.get_shape().as_list()[
-      0]
-  right_norm = math_ops.trace(right_factor) * left_factor.get_shape().as_list()[
-      0]
-  return math_ops.sqrt(left_norm / right_norm)
-
-
 def posdef_inv(tensor, damping):
   """Computes the inverse of tensor + damping * identity."""
   identity = linalg_ops.eye(tensor.shape.as_list()[0], dtype=tensor.dtype)
   damping = math_ops.cast(damping, dtype=tensor.dtype)
-  return posdef_inv_funcs[POSDEF_INV_METHOD](tensor, identity, damping)
+  return posdef_inv_functions[POSDEF_INV_METHOD](tensor, identity, damping)
 
 
 def posdef_inv_matrix_inverse(tensor, identity, damping):
@@ -209,23 +193,51 @@ def posdef_inv_eig(tensor, identity, damping):
       eigenvectors / eigenvalues, eigenvectors, transpose_b=True)
 
 
-posdef_inv_funcs = {
+posdef_inv_functions = {
     "matrix_inverse": posdef_inv_matrix_inverse,
     "cholesky": posdef_inv_cholesky,
     "eig": posdef_inv_eig,
 }
 
 
+def posdef_eig(mat):
+  """Computes the eigendecomposition of a positive semidefinite matrix."""
+  return posdef_eig_functions[POSDEF_EIG_METHOD](mat)
+
+
+def posdef_eig_svd(mat):
+  """Computes the singular values and left singular vectors of a matrix."""
+  evals, evecs, _ = linalg_ops.svd(mat)
+
+  return evals, evecs
+
+
+def posdef_eig_self_adjoint(mat):
+  """Computes eigendecomposition using self_adjoint_eig."""
+  evals, evecs = linalg_ops.self_adjoint_eig(mat)
+  evals = math_ops.abs(evals)  # Should be equivalent to svd approach.
+
+  return evals, evecs
+
+
+posdef_eig_functions = {
+    "self_adjoint": posdef_eig_self_adjoint,
+    "svd": posdef_eig_svd,
+}
+
+
 class SubGraph(object):
   """Defines a subgraph given by all the dependencies of a given set of outputs.
   """
 
   def __init__(self, outputs):
+    # Set of all ancestor Tensors, Ops to 'outputs'.
     self._members = set()
 
     self._recurse_add(outputs)
 
   def _recurse_add(self, nodes):
+    """Recursively adds all of nodes' ancestors."""
     for node in nodes:
       if node in self._members:
         continue
@@ -241,8 +253,25 @@ class SubGraph(object):
     return node in self._members
 
   def variable_uses(self, var):
-    """Computes number of times a variable is used."""
-    return len(self._members.intersection(set(var.value().consumers())))
+    """Computes number of times a variable is used.
+
+    Args:
+      var: Variable or ResourceVariable instance.
+
+    Returns:
+      Number of times a variable is used within this subgraph.
+
+    Raises:
+      ValueError: If 'var' is not a variable type.
+    """
+    if isinstance(var, resource_variable_ops.ResourceVariable):
+      var = var.handle
+    elif isinstance(var, variables.Variable):
+      var = var.value()
+    else:
+      raise ValueError("%s does not appear to be a variable." % str(var))
+
+    return len(self._members.intersection(set(var.consumers())))
 
   def filter_list(self, node_list):
     """Filters 'node_list' to nodes in this subgraph."""
@@ -287,5 +316,109 @@ def fwd_gradients(ys, xs, grad_xs=None, stop_gradients=None):
 
   return dysdx
 
+
+def on_tpu():
+  """Returns True when building a TPU computation."""
+  return tpu_function.get_tpu_context().number_of_shards is not None
+
+
+def cross_replica_mean(tensor, name=None):
+  """Takes mean value of a Tensor across all TPU cores.
+
+  Args:
+    tensor: Tensor to be synchronized.
+    name: None or string. Name of Op.
+
+  Returns:
+    Average of Tensor across all TPU cores.
+
+  Raises:
+    ValueError: If called outside of TPU context.
+  """
+  with ops.name_scope(name, "cross_replica_mean", [tensor]):
+    num_shards = tpu_function.get_tpu_context().number_of_shards
+    if num_shards is None:
+      raise ValueError(
+          "Cannot take cross_replica_mean() outside of TPU Context.")
+    if num_shards == 1:
+      return tensor
+    return tpu_ops.cross_replica_sum(tensor / num_shards)
+
+
+def ensure_sequence(obj):
+  """If `obj` isn't a tuple or list, return a tuple containing `obj`."""
+  if isinstance(obj, (tuple, list)):
+    return obj
+  else:
+    return (obj,)
+
+
+def batch_execute(global_step, thunks, batch_size, name=None):
+  """Executes a subset of ops per global step.
+
+  Given a list of thunks, each of which produces a single stateful op,
+  ensures that exactly 'batch_size' ops are run per global step. Ops are
+  scheduled in a round-robin fashion. For example, with 3 ops
+
+    global_step | op0 | op1 | op2
+    ------------+-----+-----+-----
+        0       |  x  |  x  |
+    ------------+-----+-----+-----
+        1       |  x  |     |  x
+    ------------+-----+-----+-----
+        2       |     |  x  |  x
+    ------------+-----+-----+-----
+        3       |  x  |  x  |
+    ------------+-----+-----+-----
+        4       |  x  |     |  x
+
+  Does not guarantee order of op execution within a single global step.
+
+  Args:
+    global_step: Tensor indicating time. Determines which ops run.
+    thunks: List of thunks. Each thunk encapsulates one op. Return values are
+      ignored.
+    batch_size: int. Number of ops to execute per global_step.
+    name: string or None. Name scope for newly added ops.
+
+  Returns:
+    List of ops. Exactly 'batch_size' ops are guaranteed to have an effect
+    every global step.
+  """
+
+  def true_fn(thunk):
+    """Ensures thunk is executed and returns an Op (not a Tensor)."""
+
+    def result():
+      with ops.control_dependencies([thunk()]):
+        return control_flow_ops.no_op()
+
+    return result
+
+  def false_fn(_):
+    """Executes a no-op."""
+
+    def result():
+      return control_flow_ops.no_op()
+
+    return result
+
+  with ops.name_scope(name, "batch_execute"):
+    true_fns = [true_fn(thunk) for thunk in thunks]
+    false_fns = [false_fn(thunk) for thunk in thunks]
+    num_thunks = len(thunks)
+    conditions = [
+        math_ops.less(
+            math_ops.mod(batch_size - 1 + global_step * batch_size - j,
+                         num_thunks), batch_size) for j in range(num_thunks)
+    ]
+    result = [
+        control_flow_ops.cond(condition, true_fn, false_fn)
+        for (condition, true_fn,
+             false_fn) in zip(conditions, true_fns, false_fns)
+    ]
+    return result
+
+
 # TODO(b/69623235): Add a function for finding tensors that share gradients
 # to eliminate redundant fisher factor computations.
diff --git a/tensorflow/contrib/kfac/python/ops/utils_lib.py b/tensorflow/contrib/kfac/python/ops/utils_lib.py
index 9df07d69aad5e61f9cfb994c9a63fdec04f025fe..fe8e39c212c2c3381f9aa6fdb9fdf423ff958481 100644
--- a/tensorflow/contrib/kfac/python/ops/utils_lib.py
+++ b/tensorflow/contrib/kfac/python/ops/utils_lib.py
@@ -24,13 +24,13 @@ from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,line-too-long,wildcard-import
 
 _allowed_symbols = [
+    "set_global_constants",
     "SequenceDict",
     "tensors_to_column",
     "column_to_tensors",
     "kronecker_product",
     "layer_params_to_mat2d",
     "mat2d_to_layer_params",
-    "compute_pi",
     "posdef_inv",
     "posdef_inv_matrix_inverse",
     "posdef_inv_cholesky",
@@ -38,6 +38,8 @@ _allowed_symbols = [
     "SubGraph",
     "generate_random_signs",
     "fwd_gradients",
+    "ensure_sequence",
+    "batch_execute",
 ]
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/labeled_tensor/python/ops/core_test.py b/tensorflow/contrib/labeled_tensor/python/ops/core_test.py
index 1f4a3ef568efc459d4a36fcb0d5de7e0bce8335c..e70b4923749d89aba1bd0187857d762305daeb07 100644
--- a/tensorflow/contrib/labeled_tensor/python/ops/core_test.py
+++ b/tensorflow/contrib/labeled_tensor/python/ops/core_test.py
@@ -225,7 +225,7 @@ class LabeledTensorTest(test_util.Base):
     tensor = array_ops.placeholder(dtypes.string, [None])
     actual = core.LabeledTensor(tensor, ['x'])
     self.assertIsNone(actual.axes['x'].size)
-    self.assertIs(actual.axes['x'].value, tensor.get_shape()[0])
+    self.assertIsNone(actual.axes['x'].value.value)
 
   def test_eq(self):
     self.assertEqual(self.lt, self.lt)
diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py
index 6c624929f20503054e0258aad8a843f4a201be64..337c9e06b870b2cca53fcdbf3d94225660e193c4 100644
--- a/tensorflow/contrib/layers/__init__.py
+++ b/tensorflow/contrib/layers/__init__.py
@@ -27,6 +27,7 @@ See the @{$python/contrib.layers} guide.
 @@convolution2d_transpose
 @@conv3d_transpose
 @@convolution3d_transpose
+@@dense_to_sparse
 @@dropout
 @@elu
 @@embedding_lookup_unique
@@ -34,6 +35,7 @@ See the @{$python/contrib.layers} guide.
 @@fully_connected
 @@GDN
 @@gdn
+@@images_to_sequence
 @@layer_norm
 @@linear
 @@max_pool2d
@@ -49,6 +51,7 @@ See the @{$python/contrib.layers} guide.
 @@scale_gradient
 @@separable_conv2d
 @@separable_convolution2d
+@@sequence_to_images
 @@softmax
 @@spatial_softmax
 @@stack
diff --git a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc
index 932c5ab99249feda1e3a7f2d707ce4237fe7177f..01893d60615a9b4ded2afc88c6de0168d4be0921 100644
--- a/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc
+++ b/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc
@@ -423,8 +423,9 @@ class SparseFeatureCrossOp : public OpKernel {
               "Input values should be a std::vector but received shape ",
               values_list_in[i].shape().DebugString(), " at position ", i));
       OP_REQUIRES(
-          context, indices_list_in[i].shape().dim_size(0) ==
-                       values_list_in[i].shape().dim_size(0),
+          context,
+          indices_list_in[i].shape().dim_size(0) ==
+              values_list_in[i].shape().dim_size(0),
           errors::InvalidArgument(
               "Expected size of values to be ",
               indices_list_in[i].shape().dim_size(0), " got ",
diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py
index 226d933d85d91600e36ffb84212703e10455bfbb..b7d34d6435789e54403926a342481971e854b449 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column.py
@@ -156,6 +156,10 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import deprecation
 
 
+# Imports the core `InputLayer` symbol in contrib during development.
+InputLayer = fc_core.InputLayer  # pylint: disable=invalid-name
+
+
 class _LinearEmbeddingLookupArguments(
     collections.namedtuple("_LinearEmbeddingLookupArguments",
                            ["input_tensor",
@@ -521,7 +525,7 @@ def sparse_column_with_integerized_feature(column_name,
 
   Args:
     column_name: A string defining sparse column name.
-    bucket_size: An int that is > 1. The number of buckets. It should be bigger
+    bucket_size: An int that is >= 1. The number of buckets. It should be bigger
       than maximum feature. In other words features in this column should be an
       int64 in range [0, bucket_size)
     combiner: A string specifying how to reduce if the sparse column is
@@ -539,7 +543,7 @@ def sparse_column_with_integerized_feature(column_name,
     An integerized _SparseColumn definition.
 
   Raises:
-    ValueError: bucket_size is not greater than 1.
+    ValueError: bucket_size is less than 1.
     ValueError: dtype is not integer.
   """
   return _SparseColumnIntegerized(
@@ -748,6 +752,10 @@ class _WeightedSparseColumn(
         {self.weight_column_name: parsing_ops.VarLenFeature(self.dtype)})
     return config
 
+  @property
+  def lookup_config(self):
+    return self.sparse_id_column.lookup_config
+
   @property
   def key(self):
     """Returns a string which will be used as a key when we do sorting."""
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops.py b/tensorflow/contrib/layers/python/layers/feature_column_ops.py
index fa0047f05d893f6543ddb1680824a32469e13293..78affea44cbfb92523063968dbc1be98841854db 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops.py
@@ -97,10 +97,13 @@ def _input_from_feature_columns(columns_to_tensors,
                                 trainable,
                                 scope,
                                 output_rank,
-                                default_name):
+                                default_name,
+                                cols_to_outs=None):
   """Implementation of `input_from(_sequence)_feature_columns`."""
   columns_to_tensors = columns_to_tensors.copy()
   check_feature_columns(feature_columns)
+  if cols_to_outs is not None and not isinstance(cols_to_outs, dict):
+    raise ValueError('cols_to_outs must be a dict unless None')
   with variable_scope.variable_scope(scope,
                                      default_name=default_name,
                                      values=columns_to_tensors.values()):
@@ -144,6 +147,8 @@ def _input_from_feature_columns(columns_to_tensors,
           except ValueError as e:
             raise ValueError('Error creating input layer for column: {}.\n'
                              '{}, {}'.format(column.name, e, ee))
+        if cols_to_outs is not None:
+          cols_to_outs[column] = output_tensors[-1]
     return array_ops.concat(output_tensors, output_rank - 1)
 
 
@@ -151,7 +156,8 @@ def input_from_feature_columns(columns_to_tensors,
                                feature_columns,
                                weight_collections=None,
                                trainable=True,
-                               scope=None):
+                               scope=None,
+                               cols_to_outs=None):
   """A tf.contrib.layers style input layer builder based on FeatureColumns.
 
   Generally a single example in training data is described with feature columns.
@@ -196,6 +202,8 @@ def input_from_feature_columns(columns_to_tensors,
     trainable: If `True` also add variables to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
     scope: Optional scope for variable_scope.
+    cols_to_outs: Optional dict from feature column to output tensor,
+      which is concatenated into the returned tensor.
 
   Returns:
     A Tensor which can be consumed by hidden layers in the neural network.
@@ -209,7 +217,8 @@ def input_from_feature_columns(columns_to_tensors,
                                      trainable,
                                      scope,
                                      output_rank=2,
-                                     default_name='input_from_feature_columns')
+                                     default_name='input_from_feature_columns',
+                                     cols_to_outs=cols_to_outs)
 
 
 @experimental
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
index fbfa0e32de55edab3c90189ddfe05ab826ac9167..e6bbd86ab722c4e853a59f816bed8a8ac1fe9ede 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
@@ -607,6 +607,31 @@ class CreateInputLayersForDNNsTest(test.TestCase):
       # Verify cross compatibility: Core builder output should equal to contrib.
       self.assertAllEqual(output.eval().shape, output_core.eval().shape)
 
+  def testAllDNNColumnsWithColumnwiseOutputs(self):
+    sparse_column = feature_column.sparse_column_with_keys(
+        "ids", ["a", "b", "c", "unseen"])
+    real_valued_column = feature_column.real_valued_column("income", 2)
+    one_hot_column = feature_column.one_hot_column(sparse_column)
+    embedding_column = feature_column.embedding_column(sparse_column, 10)
+    features = {
+        "ids":
+            sparse_tensor.SparseTensor(
+                values=["c", "b", "a"],
+                indices=[[0, 0], [1, 0], [2, 0]],
+                dense_shape=[3, 1]),
+        "income":
+            constant_op.constant([[20.3, 10], [110.3, 0.4], [-3.0, 30.4]]),
+    }
+    columns = [one_hot_column, embedding_column, real_valued_column]
+    cols_to_outs = {}
+    feature_column_ops.input_from_feature_columns(
+        features, columns, cols_to_outs=cols_to_outs)
+    with self.test_session():
+      variables_lib.global_variables_initializer().run()
+      lookup_ops.tables_initializer().run()
+      for column in columns:
+        self.assertTrue(column in cols_to_outs)
+
   def testRealValuedColumn(self):
     real_valued = feature_column.real_valued_column("price")
     features = {"price": constant_op.constant([[20.], [110], [-3]])}
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_test.py b/tensorflow/contrib/layers/python/layers/feature_column_test.py
index 5ae885b7202357326bd8494d382adb57fa636d20..fc8f153fe3abdc83aca5abfa9a4bb5f5d5531480 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_test.py
@@ -102,6 +102,16 @@ class FeatureColumnTest(test.TestCase):
     weighted_ids = fc.weighted_sparse_column(ids, "weights")
     self.assertEqual(weighted_ids.name, "ids_weighted_by_weights")
 
+  def testWeightedSparseColumnWithVocabularyFile(self):
+    ids = fc.sparse_column_with_vocabulary_file(
+        "ids", "a_file", num_oov_buckets=7, vocab_size=3)
+    weighted_ids = fc.weighted_sparse_column(ids, "weights")
+    self.assertEqual(weighted_ids.name, "ids_weighted_by_weights")
+    self.assertEqual(weighted_ids.lookup_config, ids.lookup_config)
+    self.assertEqual(weighted_ids.lookup_config.vocab_size, 3)
+    self.assertEqual(weighted_ids.lookup_config.num_oov_buckets, 7)
+    self.assertEqual(weighted_ids.lookup_config.vocabulary_file, "a_file")
+
   def testWeightedSparseColumnDeepCopy(self):
     ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
     weighted = fc.weighted_sparse_column(ids, "weights")
@@ -211,8 +221,8 @@ class FeatureColumnTest(test.TestCase):
     weighted_sparse_col = fc.weighted_sparse_column(ids, "weights")
     self.assertEqual(weighted_sparse_col.name, "ids_weighted_by_weights")
 
-    b = fc.shared_embedding_columns([sparse_col, weighted_sparse_col],
-                                    dimension=4, combiner="mean")
+    b = fc.shared_embedding_columns(
+        [sparse_col, weighted_sparse_col], dimension=4, combiner="mean")
     self.assertEqual(len(b), 2)
     self.assertEqual(b[0].shared_embedding_name,
                      "a1_ids_weighted_by_weights_shared_embedding")
@@ -220,8 +230,8 @@ class FeatureColumnTest(test.TestCase):
                      "a1_ids_weighted_by_weights_shared_embedding")
 
     # Tries reversing order to check compatibility condition.
-    b = fc.shared_embedding_columns([weighted_sparse_col, sparse_col],
-                                    dimension=4, combiner="mean")
+    b = fc.shared_embedding_columns(
+        [weighted_sparse_col, sparse_col], dimension=4, combiner="mean")
     self.assertEqual(len(b), 2)
     self.assertEqual(b[0].shared_embedding_name,
                      "a1_ids_weighted_by_weights_shared_embedding")
@@ -230,18 +240,17 @@ class FeatureColumnTest(test.TestCase):
 
     # Tries adding two weighted columns to check compatibility between them.
     weighted_sparse_col_2 = fc.weighted_sparse_column(ids, "weights_2")
-    b = fc.shared_embedding_columns([weighted_sparse_col,
-                                     weighted_sparse_col_2],
-                                    dimension=4, combiner="mean")
+    b = fc.shared_embedding_columns(
+        [weighted_sparse_col, weighted_sparse_col_2],
+        dimension=4,
+        combiner="mean")
     self.assertEqual(len(b), 2)
     self.assertEqual(
         b[0].shared_embedding_name,
-        "ids_weighted_by_weights_ids_weighted_by_weights_2_shared_embedding"
-    )
+        "ids_weighted_by_weights_ids_weighted_by_weights_2_shared_embedding")
     self.assertEqual(
         b[1].shared_embedding_name,
-        "ids_weighted_by_weights_ids_weighted_by_weights_2_shared_embedding"
-    )
+        "ids_weighted_by_weights_ids_weighted_by_weights_2_shared_embedding")
 
   def testSharedEmbeddingColumnDeterminism(self):
     # Tests determinism in auto-generated shared_embedding_name.
@@ -276,10 +285,10 @@ class FeatureColumnTest(test.TestCase):
     columns = fc.shared_embedding_columns(
         [a1, a2], dimension=4, combiner="mean")
     columns_copy = copy.deepcopy(columns)
-    self.assertEqual(
-        columns_copy[0].shared_embedding_name, "a1_a2_shared_embedding")
-    self.assertEqual(
-        columns_copy[1].shared_embedding_name, "a1_a2_shared_embedding")
+    self.assertEqual(columns_copy[0].shared_embedding_name,
+                     "a1_a2_shared_embedding")
+    self.assertEqual(columns_copy[1].shared_embedding_name,
+                     "a1_a2_shared_embedding")
 
   def testOneHotColumn(self):
     a = fc.sparse_column_with_keys("a", ["a", "b", "c", "d"])
@@ -326,11 +335,11 @@ class FeatureColumnTest(test.TestCase):
     weighted_ids = fc.weighted_sparse_column(ids, "weights")
     one_hot = fc.one_hot_column(weighted_ids)
     features = {
-        'ids': constant_op.constant([['marlo', 'unknown', 'omar']]),
-        'weights': constant_op.constant([[2., 4., 6.]])
+        "ids": constant_op.constant([["marlo", "unknown", "omar"]]),
+        "weights": constant_op.constant([[2., 4., 6.]])
     }
     one_hot_tensor = feature_column_ops.input_from_feature_columns(
-      features, [one_hot])
+        features, [one_hot])
     with self.test_session() as sess:
       sess.run(variables.global_variables_initializer())
       sess.run(lookup_ops.tables_initializer())
@@ -339,11 +348,9 @@ class FeatureColumnTest(test.TestCase):
   def testMissingValueInOneHotColumnForSparseColumnWithKeys(self):
     ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
     one_hot = fc.one_hot_column(ids)
-    features = {
-      'ids': constant_op.constant([['marlo', 'unknown', 'omar']])
-    }
+    features = {"ids": constant_op.constant([["marlo", "unknown", "omar"]])}
     one_hot_tensor = feature_column_ops.input_from_feature_columns(
-      features, [one_hot])
+        features, [one_hot])
     with self.test_session() as sess:
       sess.run(variables.global_variables_initializer())
       sess.run(lookup_ops.tables_initializer())
@@ -369,8 +376,7 @@ class FeatureColumnTest(test.TestCase):
     self.assertEqual(d4.default_value, None)
     self.assertEqual(d4.is_sparse, True)
     # Default value is a list but dimension is None.
-    with self.assertRaisesRegexp(ValueError,
-                                 "Only scalar default value.*"):
+    with self.assertRaisesRegexp(ValueError, "Only scalar default value.*"):
       fc._real_valued_var_len_column("g5", default_value=[2., 3.])
 
   def testRealValuedVarLenColumnDtypes(self):
@@ -380,18 +386,19 @@ class FeatureColumnTest(test.TestCase):
             "rvc": parsing_ops.VarLenFeature(dtype=dtypes.float32)
         }, rvc.config)
 
-    rvc = fc._real_valued_var_len_column("rvc", default_value=0,
-                                         is_sparse=False)
-    self.assertDictEqual(
-        {
-            "rvc": parsing_ops.FixedLenSequenceFeature(shape=[],
-                                                       dtype=dtypes.float32,
-                                                       allow_missing=True,
-                                                       default_value=0.0)
-        }, rvc.config)
-
-    rvc = fc._real_valued_var_len_column("rvc", dtype=dtypes.int32,
-                                         default_value=0, is_sparse=True)
+    rvc = fc._real_valued_var_len_column(
+        "rvc", default_value=0, is_sparse=False)
+    self.assertDictEqual({
+        "rvc":
+            parsing_ops.FixedLenSequenceFeature(
+                shape=[],
+                dtype=dtypes.float32,
+                allow_missing=True,
+                default_value=0.0)
+    }, rvc.config)
+
+    rvc = fc._real_valued_var_len_column(
+        "rvc", dtype=dtypes.int32, default_value=0, is_sparse=True)
     self.assertDictEqual(
         {
             "rvc": parsing_ops.VarLenFeature(dtype=dtypes.int32)
@@ -399,8 +406,8 @@ class FeatureColumnTest(test.TestCase):
 
     with self.assertRaisesRegexp(TypeError,
                                  "dtype must be convertible to float"):
-      fc._real_valued_var_len_column("rvc", dtype=dtypes.string,
-                                     default_value="", is_sparse=True)
+      fc._real_valued_var_len_column(
+          "rvc", dtype=dtypes.string, default_value="", is_sparse=True)
 
   def testRealValuedColumn(self):
     a = fc.real_valued_column("aaa")
@@ -494,13 +501,13 @@ class FeatureColumnTest(test.TestCase):
     for output_rank in range(1, 3 + len(dimensions)):
       with variable_scope.variable_scope("output_rank_{}".format(output_rank)):
         real_valued_output = real_valued_column._to_dnn_input_layer(
-            constant_op.constant(
-                real_valued_input, dtype=dtypes.float32),
+            constant_op.constant(real_valued_input, dtype=dtypes.float32),
             output_rank=output_rank)
       with self.test_session() as sess:
         real_valued_eval = sess.run(real_valued_output)
-      expected_shape = (input_shape[:output_rank - 1] +
-                        [np.prod(input_shape[output_rank - 1:])])
+      expected_shape = (
+          input_shape[:output_rank - 1] +
+          [np.prod(input_shape[output_rank - 1:])])
       self.assertEquals(expected_shape, list(real_valued_eval.shape))
 
   def testRealValuedColumnDensification(self):
@@ -510,8 +517,7 @@ class FeatureColumnTest(test.TestCase):
         "sparse_real_valued1", is_sparse=True)
     sparse_tensor = sparse_tensor_lib.SparseTensor(
         values=[2.0, 5.0], indices=[[0, 0], [2, 0]], dense_shape=[3, 1])
-    with self.assertRaisesRegexp(
-        ValueError, "Set is_sparse to False"):
+    with self.assertRaisesRegexp(ValueError, "Set is_sparse to False"):
       real_valued_column._to_dnn_input_layer(sparse_tensor)
 
   def testRealValuedColumnDeepCopy(self):
@@ -539,9 +545,8 @@ class FeatureColumnTest(test.TestCase):
   def testBucketizedColumnRequiresRealValuedColumnDimension(self):
     with self.assertRaisesRegexp(
         TypeError, "source_column must be an instance of _RealValuedColumn.*"):
-      fc.bucketized_column(fc._real_valued_var_len_column("bbb",
-                                                          is_sparse=True),
-                           [0])
+      fc.bucketized_column(
+          fc._real_valued_var_len_column("bbb", is_sparse=True), [0])
 
   def testBucketizedColumnRequiresSortedBuckets(self):
     with self.assertRaisesRegexp(ValueError,
@@ -644,20 +649,14 @@ class FeatureColumnTest(test.TestCase):
 
   def testRealValuedColumnDtypes(self):
     rvc = fc.real_valued_column("rvc")
-    self.assertDictEqual(
-        {
-            "rvc": parsing_ops.FixedLenFeature(
-                [1], dtype=dtypes.float32)
-        },
-        rvc.config)
+    self.assertDictEqual({
+        "rvc": parsing_ops.FixedLenFeature([1], dtype=dtypes.float32)
+    }, rvc.config)
 
     rvc = fc.real_valued_column("rvc", dtype=dtypes.int32)
-    self.assertDictEqual(
-        {
-            "rvc": parsing_ops.FixedLenFeature(
-                [1], dtype=dtypes.int32)
-        },
-        rvc.config)
+    self.assertDictEqual({
+        "rvc": parsing_ops.FixedLenFeature([1], dtype=dtypes.int32)
+    }, rvc.config)
 
     with self.assertRaisesRegexp(ValueError,
                                  "dtype must be convertible to float"):
@@ -692,8 +691,9 @@ class FeatureColumnTest(test.TestCase):
     batch_size = 4
     dense_scalar_input = [1, 2, 3, 4]
     sparse_column = fc.sparse_column_with_integerized_feature("values", 10)
-    features = {"values":
-                constant_op.constant(dense_scalar_input, dtype=dtypes.int64)}
+    features = {
+        "values": constant_op.constant(dense_scalar_input, dtype=dtypes.int64)
+    }
     sparse_column.insert_transformed_feature(features)
     sparse_output = features[sparse_column]
     expected_shape = [batch_size, 1]
@@ -721,8 +721,7 @@ class FeatureColumnTest(test.TestCase):
 
   def testSparseColumnKeysDeepCopy(self):
     """Tests deepcopy of sparse_column_with_keys."""
-    column = fc.sparse_column_with_keys(
-        "a", keys=["key0", "key1", "key2"])
+    column = fc.sparse_column_with_keys("a", keys=["key0", "key1", "key2"])
     self.assertEqual("a", column.name)
     column_copy = copy.deepcopy(column)
     self.assertEqual("a", column_copy.name)
@@ -775,8 +774,9 @@ class FeatureColumnTest(test.TestCase):
     a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100)
     b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100)
     cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000)
-    one_hot_col = fc.one_hot_column(fc.sparse_column_with_hash_bucket(
-        "sparse_column_for_one_hot", hash_bucket_size=100))
+    one_hot_col = fc.one_hot_column(
+        fc.sparse_column_with_hash_bucket(
+            "sparse_column_for_one_hot", hash_bucket_size=100))
     scattered_embedding_col = fc.scattered_embedding_column(
         "scattered_embedding_column", size=100, dimension=10, hash_key=1)
     feature_columns = set([
@@ -799,17 +799,13 @@ class FeatureColumnTest(test.TestCase):
         "str_id_weights_column":
             parsing_ops.VarLenFeature(dtypes.float32),
         "real_valued_column1":
-            parsing_ops.FixedLenFeature(
-                [1], dtype=dtypes.float32),
+            parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
         "real_valued_column2":
-            parsing_ops.FixedLenFeature(
-                [5], dtype=dtypes.float32),
+            parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
         "real_valued_column_for_bucketization1":
-            parsing_ops.FixedLenFeature(
-                [1], dtype=dtypes.float32),
+            parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
         "real_valued_column_for_bucketization2":
-            parsing_ops.FixedLenFeature(
-                [4], dtype=dtypes.float32),
+            parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
         "cross_aaa":
             parsing_ops.VarLenFeature(dtypes.string),
         "cross_bbb":
@@ -839,11 +835,14 @@ class FeatureColumnTest(test.TestCase):
     real_valued_col0 = fc._real_valued_var_len_column(
         "real_valued_column0", is_sparse=True)
     real_valued_col1 = fc._real_valued_var_len_column(
-        "real_valued_column1", dtype=dtypes.int64, default_value=0,
+        "real_valued_column1",
+        dtype=dtypes.int64,
+        default_value=0,
         is_sparse=False)
     feature_columns = set([real_valued_col0, real_valued_col1])
     expected_config = {
-        "real_valued_column0": parsing_ops.VarLenFeature(dtype=dtypes.float32),
+        "real_valued_column0":
+            parsing_ops.VarLenFeature(dtype=dtypes.float32),
         "real_valued_column1":
             parsing_ops.FixedLenSequenceFeature(
                 [], dtype=dtypes.int64, allow_missing=True, default_value=0),
@@ -864,7 +863,9 @@ class FeatureColumnTest(test.TestCase):
     real_valued_col5 = fc._real_valued_var_len_column(
         "real_valued_column5", default_value=2, is_sparse=True)
     real_valued_col6 = fc._real_valued_var_len_column(
-        "real_valued_column6", dtype=dtypes.int64, default_value=1,
+        "real_valued_column6",
+        dtype=dtypes.int64,
+        default_value=1,
         is_sparse=False)
     feature_columns = [
         real_valued_col1, real_valued_col2, real_valued_col3, real_valued_col4,
@@ -892,8 +893,7 @@ class FeatureColumnTest(test.TestCase):
                 parsing_ops.VarLenFeature(dtype=dtypes.float32),
             "real_valued_column6":
                 parsing_ops.FixedLenSequenceFeature(
-                    [], dtype=dtypes.int64, allow_missing=True,
-                    default_value=1)
+                    [], dtype=dtypes.int64, allow_missing=True, default_value=1)
         },
         config)
 
@@ -1094,8 +1094,8 @@ class FeatureColumnTest(test.TestCase):
       # This will initialize the crossed column weights from provided checkpoint
       # and return a [4, 1] tensor which is same as weights variable. Since we
       # won't modify weights, this should be same as 'saved_col_weights'.
-      _, col_weights, _ = (feature_column_ops.weighted_sum_from_feature_columns(
-          {
+      _, col_weights, _ = (
+          feature_column_ops.weighted_sum_from_feature_columns({
               sparse_col_1.name: input_tensor,
               sparse_col_2.name: input_tensor
           }, [crossed_col_initialized], 1))
diff --git a/tensorflow/contrib/layers/python/layers/initializers.py b/tensorflow/contrib/layers/python/layers/initializers.py
index b12a882d9ae88f7cf4f920cfa5872e5de1c67290..51610f21b24f1d40f26630cc1e69ca723d130639 100644
--- a/tensorflow/contrib/layers/python/layers/initializers.py
+++ b/tensorflow/contrib/layers/python/layers/initializers.py
@@ -79,7 +79,8 @@ def variance_scaling_initializer(factor=2.0, mode='FAN_IN', uniform=False,
   ```
 
   * To get [Delving Deep into Rectifiers](
-     http://arxiv.org/pdf/1502.01852v1.pdf), use (Default):<br/>
+     http://arxiv.org/pdf/1502.01852v1.pdf) (also know as the "MSRA 
+     initialization"), use (Default):<br/>
     `factor=2.0 mode='FAN_IN' uniform=False`
   * To get [Convolutional Architecture for Fast Feature Embedding](
      http://arxiv.org/abs/1408.5093), use:<br/>
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 6cd586a5f016c76cc52b340bfd0d32fa08f23748..5c1ff9ec267f1bccd9bee44a4b19e7ed3ec24cf0 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -29,6 +29,7 @@ from tensorflow.contrib.framework.python.ops import variables
 from tensorflow.contrib.layers.python.layers import initializers
 from tensorflow.contrib.layers.python.layers import utils
 from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
@@ -54,47 +55,18 @@ from tensorflow.python.layers.maxout import maxout
 
 # TODO(b/28426988): Replace legacy_* fns migrated from slim.
 # TODO(b/28426988): Remove legacy_* when all uses have migrated to new API.
-__all__ = ['avg_pool2d',
-           'avg_pool3d',
-           'batch_norm',
-           'bias_add',
-           'conv2d',
-           'conv3d',
-           'conv2d_in_plane',
-           'conv2d_transpose',
-           'conv3d_transpose',
-           'convolution',
-           'convolution2d',
-           'convolution2d_in_plane',
-           'convolution2d_transpose',
-           'convolution3d',
-           'convolution3d_transpose',
-           'dropout',
-           'elu',
-           'flatten',
-           'fully_connected',
-           'GDN',
-           'gdn',
-           'layer_norm',
-           'linear',
-           'pool',
-           'max_pool2d',
-           'max_pool3d',
-           'one_hot_encoding',
-           'relu',
-           'relu6',
-           'repeat',
-           'scale_gradient',
-           'separable_conv2d',
-           'separable_convolution2d',
-           'softmax',
-           'spatial_softmax',
-           'stack',
-           'unit_norm',
-           'legacy_fully_connected',
-           'legacy_linear',
-           'legacy_relu',
-           'maxout']
+__all__ = [
+    'avg_pool2d', 'avg_pool3d', 'batch_norm', 'bias_add', 'conv2d', 'conv3d',
+    'conv2d_in_plane', 'conv2d_transpose', 'conv3d_transpose', 'convolution',
+    'convolution2d', 'convolution2d_in_plane', 'convolution2d_transpose',
+    'convolution3d', 'convolution3d_transpose', 'dense_to_sparse',
+    'dropout', 'elu', 'flatten', 'fully_connected', 'GDN', 'gdn',
+    'images_to_sequence', 'layer_norm', 'linear', 'pool', 'max_pool2d',
+    'max_pool3d', 'one_hot_encoding', 'relu', 'relu6', 'repeat',
+    'scale_gradient', 'separable_conv2d', 'separable_convolution2d',
+    'sequence_to_images', 'softmax', 'spatial_softmax', 'stack', 'unit_norm',
+    'legacy_fully_connected', 'legacy_linear', 'legacy_relu', 'maxout'
+]
 
 DATA_FORMAT_NCHW = 'NCHW'
 DATA_FORMAT_NHWC = 'NHWC'
@@ -139,13 +111,14 @@ def avg_pool2d(inputs,
     raise ValueError('data_format has to be either NCHW or NHWC.')
   with ops.name_scope(scope, 'AvgPool2D', [inputs]) as sc:
     inputs = ops.convert_to_tensor(inputs)
-    df = ('channels_first' if data_format and data_format.startswith('NC')
-          else 'channels_last')
-    layer = pooling_layers.AveragePooling2D(pool_size=kernel_size,
-                                            strides=stride,
-                                            padding=padding,
-                                            data_format=df,
-                                            _scope=sc)
+    df = ('channels_first'
+          if data_format and data_format.startswith('NC') else 'channels_last')
+    layer = pooling_layers.AveragePooling2D(
+        pool_size=kernel_size,
+        strides=stride,
+        padding=padding,
+        data_format=df,
+        _scope=sc)
     outputs = layer.apply(inputs)
     return utils.collect_named_outputs(outputs_collections, sc, outputs)
 
@@ -187,13 +160,14 @@ def avg_pool3d(inputs,
     raise ValueError('data_format has to be either NCDHW or NDHWC.')
   with ops.name_scope(scope, 'AvgPool3D', [inputs]) as sc:
     inputs = ops.convert_to_tensor(inputs)
-    df = ('channels_first' if data_format and data_format.startswith('NC')
-          else 'channels_last')
-    layer = pooling_layers.AveragePooling3D(pool_size=kernel_size,
-                                            strides=stride,
-                                            padding=padding,
-                                            data_format=df,
-                                            _scope=sc)
+    df = ('channels_first'
+          if data_format and data_format.startswith('NC') else 'channels_last')
+    layer = pooling_layers.AveragePooling3D(
+        pool_size=kernel_size,
+        strides=stride,
+        padding=padding,
+        data_format=df,
+        _scope=sc)
     outputs = layer.apply(inputs)
     return utils.collect_named_outputs(outputs_collections, sc, outputs)
 
@@ -298,8 +272,8 @@ def _fused_batch_norm(inputs,
       raise ValueError('Inputs %s has undefined rank' % inputs.name)
     elif original_rank not in [2, 4]:
       raise ValueError('Inputs %s has unsupported rank.'
-                       ' Expected 2 or 4 but got %d' % (
-                           inputs.name, original_rank))
+                       ' Expected 2 or 4 but got %d' % (inputs.name,
+                                                        original_rank))
     if original_rank == 2:
       channels = inputs.get_shape()[-1].value
       if channels is None:
@@ -393,6 +367,7 @@ def _fused_batch_norm(inputs,
     def _fused_batch_norm_training():
       return nn.fused_batch_norm(
           inputs, gamma, beta, epsilon=epsilon, data_format=data_format)
+
     def _fused_batch_norm_inference():
       return nn.fused_batch_norm(
           inputs,
@@ -403,9 +378,9 @@ def _fused_batch_norm(inputs,
           epsilon=epsilon,
           is_training=False,
           data_format=data_format)
-    outputs, mean, variance = utils.smart_cond(is_training,
-                                               _fused_batch_norm_training,
-                                               _fused_batch_norm_inference)
+
+    outputs, mean, variance = utils.smart_cond(
+        is_training, _fused_batch_norm_training, _fused_batch_norm_inference)
 
     # If `is_training` doesn't have a constant value, because it is a `Tensor`,
     # a `Variable` or `Placeholder` then is_training_value will be None and
@@ -415,6 +390,7 @@ def _fused_batch_norm(inputs,
     if need_updates:
       if updates_collections is None:
         no_updates = lambda: outputs
+
         def _force_updates():
           """Internal function forces updates moving_vars if is_training."""
           update_moving_mean = moving_averages.assign_moving_average(
@@ -424,9 +400,11 @@ def _fused_batch_norm(inputs,
           with ops.control_dependencies(
               [update_moving_mean, update_moving_variance]):
             return array_ops.identity(outputs)
+
         outputs = utils.smart_cond(is_training, _force_updates, no_updates)
       else:
         moving_vars_fn = lambda: (moving_mean, moving_variance)
+
         def _delay_updates():
           """Internal function that delay updates moving_vars if is_training."""
           update_moving_mean = moving_averages.assign_moving_average(
@@ -434,9 +412,9 @@ def _fused_batch_norm(inputs,
           update_moving_variance = moving_averages.assign_moving_average(
               moving_variance, variance, decay, zero_debias=False)
           return update_moving_mean, update_moving_variance
-        update_mean, update_variance = utils.smart_cond(is_training,
-                                                        _delay_updates,
-                                                        moving_vars_fn)
+
+        update_mean, update_variance = utils.smart_cond(
+            is_training, _delay_updates, moving_vars_fn)
         ops.add_to_collections(updates_collections, update_mean)
         ops.add_to_collections(updates_collections, update_variance)
 
@@ -479,7 +457,12 @@ def batch_norm(inputs,
 
     Sergey Ioffe, Christian Szegedy
 
-  Can be used as a normalizer function for conv2d and fully_connected.
+  Can be used as a normalizer function for conv2d and fully_connected. The
+  normalization is over all but the last dimension if `data_format` is `NHWC`
+  and all but the second dimension if `data_format` is `NCHW`.  In case of a 2D
+  tensor this corresponds to the batch dimension, while in case of a 4D tensor
+  this
+  corresponds to the batch and space dimensions.
 
   Note: when training, the moving_mean and moving_variance need to be updated.
   By default the update ops are placed in `tf.GraphKeys.UPDATE_OPS`, so they
@@ -535,8 +518,8 @@ def batch_norm(inputs,
       then the batch normalization uses weighted mean and
       variance. (This can be used to correct for bias in training
       example selection.)
-    fused: if `True`, use a faster, fused implementation if possible.
-      If `None`, use the system recommended implementation.
+    fused: if `None` or `True`, use a faster, fused implementation if possible.
+      If `False`, use the system recommended implementation.
     data_format: A string. `NHWC` (default) and `NCHW` are supported.
     zero_debias_moving_mean: Use zero_debias for moving_mean. It creates a new
       pair of variables 'moving_mean/biased' and 'moving_mean/local_step'.
@@ -588,10 +571,9 @@ def batch_norm(inputs,
   #   implementation in normalization_layers.BatchNormalization.
   inputs = ops.convert_to_tensor(inputs)
   rank = inputs.get_shape().ndims
-  possible_to_fuse = (batch_weights is None and
-                      not renorm and
-                      rank in [2, 4] and
-                      adjustment is None)
+  possible_to_fuse = (
+      batch_weights is None and not renorm and rank in [2, 4] and
+      adjustment is None)
   if fused and possible_to_fuse and (
       zero_debias_moving_mean or rank == 2 or
       updates_collections is not ops.GraphKeys.UPDATE_OPS):
@@ -619,7 +601,9 @@ def batch_norm(inputs,
 
   layer_variable_getter = _build_variable_getter()
   with variable_scope.variable_scope(
-      scope, 'BatchNorm', [inputs], reuse=reuse,
+      scope,
+      'BatchNorm', [inputs],
+      reuse=reuse,
       custom_getter=layer_variable_getter) as sc:
     inputs = ops.convert_to_tensor(inputs)
 
@@ -667,15 +651,15 @@ def batch_norm(inputs,
       outputs = layer.apply(inputs, training=is_training)
 
       # Add variables to collections.
-      _add_variable_to_collections(
-          layer.moving_mean, variables_collections, 'moving_mean')
-      _add_variable_to_collections(
-          layer.moving_variance, variables_collections, 'moving_variance')
+      _add_variable_to_collections(layer.moving_mean, variables_collections,
+                                   'moving_mean')
+      _add_variable_to_collections(layer.moving_variance, variables_collections,
+                                   'moving_variance')
       if layer.beta is not None:
         _add_variable_to_collections(layer.beta, variables_collections, 'beta')
       if layer.gamma is not None:
-        _add_variable_to_collections(
-            layer.gamma, variables_collections, 'gamma')
+        _add_variable_to_collections(layer.gamma, variables_collections,
+                                     'gamma')
 
       if activation_fn is not None:
         outputs = activation_fn(outputs)
@@ -715,8 +699,8 @@ def batch_norm(inputs,
       params_shape = inputs_shape[-1:]
       params_shape_broadcast = None
     if not params_shape.is_fully_defined():
-      raise ValueError('Inputs %s has undefined channels dimension %s.' % (
-          inputs.name, params_shape))
+      raise ValueError('Inputs %s has undefined channels dimension %s.' %
+                       (inputs.name, params_shape))
 
     # Allocate parameters for the beta and gamma of the normalization.
     beta, gamma = None, None
@@ -727,23 +711,25 @@ def batch_norm(inputs,
                                                         'beta')
       beta_initializer = param_initializers.get('beta',
                                                 init_ops.zeros_initializer())
-      beta = variables.model_variable('beta',
-                                      shape=params_shape,
-                                      dtype=dtype,
-                                      initializer=beta_initializer,
-                                      collections=beta_collections,
-                                      trainable=trainable)
+      beta = variables.model_variable(
+          'beta',
+          shape=params_shape,
+          dtype=dtype,
+          initializer=beta_initializer,
+          collections=beta_collections,
+          trainable=trainable)
     if scale:
-      gamma_collections = utils.get_variable_collections(variables_collections,
-                                                         'gamma')
+      gamma_collections = utils.get_variable_collections(
+          variables_collections, 'gamma')
       gamma_initializer = param_initializers.get('gamma',
                                                  init_ops.ones_initializer())
-      gamma = variables.model_variable('gamma',
-                                       shape=params_shape,
-                                       dtype=dtype,
-                                       initializer=gamma_initializer,
-                                       collections=gamma_collections,
-                                       trainable=trainable)
+      gamma = variables.model_variable(
+          'gamma',
+          shape=params_shape,
+          dtype=dtype,
+          initializer=gamma_initializer,
+          collections=gamma_collections,
+          trainable=trainable)
 
     # Create moving_mean and moving_variance variables and add them to the
     # appropriate collections. We disable variable partitioning while creating
@@ -792,8 +778,8 @@ def batch_norm(inputs,
           mean, variance = nn.moments(inputs, moments_axes)
       else:
         if data_format == DATA_FORMAT_NCHW:
-          mean, variance = nn.weighted_moments(inputs, moments_axes,
-                                               batch_weights, keep_dims=True)
+          mean, variance = nn.weighted_moments(
+              inputs, moments_axes, batch_weights, keepdims=True)
           mean = array_ops.reshape(mean, [-1])
           variance = array_ops.reshape(variance, [-1])
         else:
@@ -802,19 +788,21 @@ def batch_norm(inputs,
 
       moving_vars_fn = lambda: (moving_mean, moving_variance)
       if updates_collections is None:
+
         def _force_updates():
           """Internal function forces updates moving_vars if is_training."""
           update_moving_mean = moving_averages.assign_moving_average(
               moving_mean, mean, decay, zero_debias=zero_debias_moving_mean)
           update_moving_variance = moving_averages.assign_moving_average(
               moving_variance, variance, decay, zero_debias=False)
-          with ops.control_dependencies([update_moving_mean,
-                                         update_moving_variance]):
+          with ops.control_dependencies(
+              [update_moving_mean, update_moving_variance]):
             return array_ops.identity(mean), array_ops.identity(variance)
-        mean, variance = utils.smart_cond(is_training,
-                                          _force_updates,
+
+        mean, variance = utils.smart_cond(is_training, _force_updates,
                                           moving_vars_fn)
       else:
+
         def _delay_updates():
           """Internal function that delay updates moving_vars if is_training."""
           update_moving_mean = moving_averages.assign_moving_average(
@@ -823,9 +811,8 @@ def batch_norm(inputs,
               moving_variance, variance, decay, zero_debias=False)
           return update_moving_mean, update_moving_variance
 
-        update_mean, update_variance = utils.smart_cond(is_training,
-                                                        _delay_updates,
-                                                        moving_vars_fn)
+        update_mean, update_variance = utils.smart_cond(
+            is_training, _delay_updates, moving_vars_fn)
         ops.add_to_collections(updates_collections, update_mean)
         ops.add_to_collections(updates_collections, update_variance)
         # Use computed moments during training and moving_vars otherwise.
@@ -893,8 +880,8 @@ def bias_add(inputs,
   """
   if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
     raise ValueError('data_format has to be either NCHW or NHWC.')
-  with variable_scope.variable_scope(scope, 'BiasAdd', [inputs],
-                                     reuse=reuse) as sc:
+  with variable_scope.variable_scope(
+      scope, 'BiasAdd', [inputs], reuse=reuse) as sc:
     inputs = ops.convert_to_tensor(inputs)
     dtype = inputs.dtype.base_dtype
     inputs_shape = inputs.get_shape()
@@ -909,13 +896,16 @@ def bias_add(inputs,
       raise ValueError('`C` dimension must be known but is None')
     biases_collections = utils.get_variable_collections(variables_collections,
                                                         'biases')
-    biases = variables.model_variable('biases',
-                                      shape=[num_features,],
-                                      dtype=dtype,
-                                      initializer=initializer,
-                                      regularizer=regularizer,
-                                      collections=biases_collections,
-                                      trainable=trainable)
+    biases = variables.model_variable(
+        'biases',
+        shape=[
+            num_features,
+        ],
+        dtype=dtype,
+        initializer=initializer,
+        regularizer=regularizer,
+        collections=biases_collections,
+        trainable=trainable)
     outputs = nn.bias_add(inputs, biases, data_format=data_format)
     if activation_fn is not None:
       outputs = activation_fn(outputs)
@@ -1015,8 +1005,10 @@ def convolution(inputs,
   if data_format not in [None, 'NWC', 'NCW', 'NHWC', 'NCHW', 'NDHWC', 'NCDHW']:
     raise ValueError('Invalid data_format: %r' % (data_format,))
 
-  layer_variable_getter = _build_variable_getter(
-      {'bias': 'biases', 'kernel': 'weights'})
+  layer_variable_getter = _build_variable_getter({
+      'bias': 'biases',
+      'kernel': 'weights'
+  })
 
   with variable_scope.variable_scope(
       scope, 'Conv', [inputs], reuse=reuse,
@@ -1034,26 +1026,27 @@ def convolution(inputs,
       raise ValueError('Convolution not supported for input with rank',
                        input_rank)
 
-    df = ('channels_first' if data_format and data_format.startswith('NC')
-          else 'channels_last')
-    layer = layer_class(filters=num_outputs,
-                        kernel_size=kernel_size,
-                        strides=stride,
-                        padding=padding,
-                        data_format=df,
-                        dilation_rate=rate,
-                        activation=None,
-                        use_bias=not normalizer_fn and biases_initializer,
-                        kernel_initializer=weights_initializer,
-                        bias_initializer=biases_initializer,
-                        kernel_regularizer=weights_regularizer,
-                        bias_regularizer=biases_regularizer,
-                        activity_regularizer=None,
-                        trainable=trainable,
-                        name=sc.name,
-                        dtype=inputs.dtype.base_dtype,
-                        _scope=sc,
-                        _reuse=reuse)
+    df = ('channels_first'
+          if data_format and data_format.startswith('NC') else 'channels_last')
+    layer = layer_class(
+        filters=num_outputs,
+        kernel_size=kernel_size,
+        strides=stride,
+        padding=padding,
+        data_format=df,
+        dilation_rate=rate,
+        activation=None,
+        use_bias=not normalizer_fn and biases_initializer,
+        kernel_initializer=weights_initializer,
+        bias_initializer=biases_initializer,
+        kernel_regularizer=weights_regularizer,
+        bias_regularizer=biases_regularizer,
+        activity_regularizer=None,
+        trainable=trainable,
+        name=sc.name,
+        dtype=inputs.dtype.base_dtype,
+        _scope=sc,
+        _reuse=reuse)
     outputs = layer.apply(inputs)
 
     # Add variables to collections.
@@ -1069,6 +1062,7 @@ def convolution(inputs,
       outputs = activation_fn(outputs)
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
 
+
 convolution2d = convolution
 convolution3d = convolution
 
@@ -1144,13 +1138,14 @@ def convolution2d_in_plane(
     weights_shape = [kernel_h, kernel_w, 1, 1]
     weights_collections = utils.get_variable_collections(
         variables_collections, 'weights')
-    weights = variables.model_variable('weights',
-                                       shape=weights_shape,
-                                       dtype=dtype,
-                                       initializer=weights_initializer,
-                                       regularizer=weights_regularizer,
-                                       collections=weights_collections,
-                                       trainable=trainable)
+    weights = variables.model_variable(
+        'weights',
+        shape=weights_shape,
+        dtype=dtype,
+        initializer=weights_initializer,
+        regularizer=weights_regularizer,
+        collections=weights_collections,
+        trainable=trainable)
     depthwise_weights = array_ops.tile(weights, [1, 1, num_filters_in, 1])
     outputs = nn.depthwise_conv2d(inputs, depthwise_weights,
                                   [1, stride_h, stride_w, 1], padding)
@@ -1161,13 +1156,16 @@ def convolution2d_in_plane(
       if biases_initializer is not None:
         biases_collections = utils.get_variable_collections(
             variables_collections, 'biases')
-        biases = variables.model_variable('biases',
-                                          shape=[num_filters_in,],
-                                          dtype=dtype,
-                                          initializer=biases_initializer,
-                                          regularizer=biases_regularizer,
-                                          collections=biases_collections,
-                                          trainable=trainable)
+        biases = variables.model_variable(
+            'biases',
+            shape=[
+                num_filters_in,
+            ],
+            dtype=dtype,
+            initializer=biases_initializer,
+            regularizer=biases_regularizer,
+            collections=biases_collections,
+            trainable=trainable)
         outputs = nn.bias_add(outputs, biases)
 
     if activation_fn is not None:
@@ -1240,19 +1238,23 @@ def convolution2d_transpose(
     ValueError: If `data_format` is neither `NHWC` nor `NCHW`.
     ValueError: If `C` dimension of `inputs` is None.
   """
-  layer_variable_getter = _build_variable_getter(
-      {'bias': 'biases', 'kernel': 'weights'})
+  layer_variable_getter = _build_variable_getter({
+      'bias': 'biases',
+      'kernel': 'weights'
+  })
 
   with variable_scope.variable_scope(
-      scope, 'Conv2d_transpose', [inputs], reuse=reuse,
+      scope,
+      'Conv2d_transpose', [inputs],
+      reuse=reuse,
       custom_getter=layer_variable_getter) as sc:
     if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
       raise ValueError('data_format has to be either NCHW or NHWC.')
 
     inputs = ops.convert_to_tensor(inputs)
 
-    df = ('channels_first' if data_format and data_format.startswith('NC')
-          else 'channels_last')
+    df = ('channels_first'
+          if data_format and data_format.startswith('NC') else 'channels_last')
     layer = convolutional_layers.Convolution2DTranspose(
         filters=num_outputs,
         kernel_size=kernel_size,
@@ -1349,19 +1351,23 @@ def convolution3d_transpose(
     ValueError: If `data_format` is neither `NDHWC` nor `NCDHW`.
     ValueError: If `C` dimension of `inputs` is None.
   """
-  layer_variable_getter = _build_variable_getter(
-      {'bias': 'biases', 'kernel': 'weights'})
+  layer_variable_getter = _build_variable_getter({
+      'bias': 'biases',
+      'kernel': 'weights'
+  })
 
   with variable_scope.variable_scope(
-      scope, 'Conv3d_transpose', [inputs], reuse=reuse,
+      scope,
+      'Conv3d_transpose', [inputs],
+      reuse=reuse,
       custom_getter=layer_variable_getter) as sc:
     if data_format not in (DATA_FORMAT_NCDHW, DATA_FORMAT_NDHWC):
       raise ValueError('data_format has to be either NCDHW or NDHWC.')
 
     inputs = ops.convert_to_tensor(inputs)
 
-    df = ('channels_first' if data_format and data_format.startswith('NC')
-          else 'channels_last')
+    df = ('channels_first'
+          if data_format and data_format.startswith('NC') else 'channels_last')
     layer = convolutional_layers.Convolution3DTranspose(
         filters=num_outputs,
         kernel_size=kernel_size,
@@ -1396,6 +1402,30 @@ def convolution3d_transpose(
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
 
 
+@add_arg_scope
+def dense_to_sparse(tensor, eos_token=0, outputs_collections=None, scope=None):
+  """Converts a dense tensor into a sparse tensor.
+  An example use would be to convert dense labels to sparse ones
+  so that they can be fed to the ctc_loss.
+
+  Args:
+     tensor: An `int` `Tensor` to be converted to a `Sparse`.
+     eos_token: An integer.
+       It is part of the target label that signfies the end of a sentence.
+     outputs_collections: Collection to add the outputs.
+     scope: Optional scope for name_scope.
+  """
+  with variable_scope.variable_scope(scope, 'dense_to_sparse', [tensor]) as sc:
+    tensor = ops.convert_to_tensor(tensor)
+    indices = array_ops.where(
+        math_ops.not_equal(tensor, constant_op.constant(eos_token,
+                                                        tensor.dtype)))
+    values = array_ops.gather_nd(tensor, indices)
+    shape = array_ops.shape(tensor, out_type=dtypes.int64)
+    outputs = sparse_tensor.SparseTensor(indices, values, shape)
+    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
+
+
 @add_arg_scope
 def dropout(inputs,
             keep_prob=0.5,
@@ -1430,19 +1460,18 @@ def dropout(inputs,
   with variable_scope.variable_scope(
       scope, 'Dropout', [inputs], custom_getter=_model_variable_getter) as sc:
     inputs = ops.convert_to_tensor(inputs)
-    layer = core_layers.Dropout(rate=1 - keep_prob,
-                                noise_shape=noise_shape,
-                                seed=seed,
-                                name=sc.name,
-                                _scope=sc)
+    layer = core_layers.Dropout(
+        rate=1 - keep_prob,
+        noise_shape=noise_shape,
+        seed=seed,
+        name=sc.name,
+        _scope=sc)
     outputs = layer.apply(inputs, training=is_training)
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
 
 
 @add_arg_scope
-def flatten(inputs,
-            outputs_collections=None,
-            scope=None):
+def flatten(inputs, outputs_collections=None, scope=None):
   """Flattens the input while maintaining the batch_size.
 
     Assumes that the first dimension represents the batch.
@@ -1474,8 +1503,8 @@ def _sparse_inner_flatten(inputs, new_rank):
 
   outer_dimensions = inputs.dense_shape[:new_rank - 1]
   inner_dimensions = inputs.dense_shape[new_rank - 1:]
-  new_shape = array_ops.concat((outer_dimensions,
-                                [math_ops.reduce_prod(inner_dimensions)]), 0)
+  new_shape = array_ops.concat(
+      (outer_dimensions, [math_ops.reduce_prod(inner_dimensions)]), 0)
   flattened = sparse_ops.sparse_reshape(inputs, new_shape)
   return flattened
 
@@ -1541,10 +1570,18 @@ def _inner_flatten(inputs, new_rank, output_collections=None, scope=None):
   return utils.collect_named_outputs(output_collections, sc, flattened)
 
 
-def _model_variable_getter(getter, name, shape=None, dtype=None,
-                           initializer=None, regularizer=None, trainable=True,
-                           collections=None, caching_device=None,
-                           partitioner=None, rename=None, use_resource=None,
+def _model_variable_getter(getter,
+                           name,
+                           shape=None,
+                           dtype=None,
+                           initializer=None,
+                           regularizer=None,
+                           trainable=True,
+                           collections=None,
+                           caching_device=None,
+                           partitioner=None,
+                           rename=None,
+                           use_resource=None,
                            **_):
   """Getter that uses model_variable for compatibility with core layers."""
   short_name = name.split('/')[-1]
@@ -1553,25 +1590,34 @@ def _model_variable_getter(getter, name, shape=None, dtype=None,
     name_components[-1] = rename[short_name]
     name = '/'.join(name_components)
   return variables.model_variable(
-      name, shape=shape, dtype=dtype, initializer=initializer,
-      regularizer=regularizer, collections=collections, trainable=trainable,
-      caching_device=caching_device, partitioner=partitioner,
-      custom_getter=getter, use_resource=use_resource)
+      name,
+      shape=shape,
+      dtype=dtype,
+      initializer=initializer,
+      regularizer=regularizer,
+      collections=collections,
+      trainable=trainable,
+      caching_device=caching_device,
+      partitioner=partitioner,
+      custom_getter=getter,
+      use_resource=use_resource)
 
 
 def _build_variable_getter(rename=None):
   """Build a model variable getter that respects scope getter and renames."""
+
   # VariableScope will nest the getters
   def layer_variable_getter(getter, *args, **kwargs):
     kwargs['rename'] = rename
     return _model_variable_getter(getter, *args, **kwargs)
+
   return layer_variable_getter
 
 
 def _add_variable_to_collections(variable, collections_set, collections_name):
   """Adds variable (or all its parts) to all collections with that name."""
-  collections = utils.get_variable_collections(
-      collections_set, collections_name) or []
+  collections = utils.get_variable_collections(collections_set,
+                                               collections_name) or []
   variables_list = [variable]
   if isinstance(variable, tf_variables.PartitionedVariable):
     variables_list = [v for v in variable]
@@ -1640,15 +1686,19 @@ def fully_connected(inputs,
     ValueError: If x has rank less than 2 or if its last dimension is not set.
   """
   if not isinstance(num_outputs, six.integer_types):
-    raise ValueError(
-        'num_outputs should be int or long, got %s.' % (num_outputs,))
+    raise ValueError('num_outputs should be int or long, got %s.' %
+                     (num_outputs,))
 
-  layer_variable_getter = _build_variable_getter({'bias': 'biases',
-                                                  'kernel': 'weights'})
+  layer_variable_getter = _build_variable_getter({
+      'bias': 'biases',
+      'kernel': 'weights'
+  })
 
   with variable_scope.variable_scope(
-      scope, 'fully_connected', [inputs],
-      reuse=reuse, custom_getter=layer_variable_getter) as sc:
+      scope,
+      'fully_connected', [inputs],
+      reuse=reuse,
+      custom_getter=layer_variable_getter) as sc:
     inputs = ops.convert_to_tensor(inputs)
     layer = core_layers.Dense(
         units=num_outputs,
@@ -1754,15 +1804,17 @@ class GDN(base.Layer):
                inverse=False,
                beta_min=1e-6,
                gamma_init=.1,
-               reparam_offset=2 ** -18,
+               reparam_offset=2**-18,
                data_format='channels_last',
                activity_regularizer=None,
                trainable=True,
                name=None,
                **kwargs):
-    super(GDN, self).__init__(trainable=trainable, name=name,
-                              activity_regularizer=activity_regularizer,
-                              **kwargs)
+    super(GDN, self).__init__(
+        trainable=trainable,
+        name=name,
+        activity_regularizer=activity_regularizer,
+        **kwargs)
     self.inverse = inverse
     self._beta_min = beta_min
     self._gamma_init = gamma_init
@@ -1797,8 +1849,9 @@ class GDN(base.Layer):
     with ops.name_scope(name, 'GDNLowerBound', [inputs, bound]) as scope:
       inputs = ops.convert_to_tensor(inputs, name='inputs')
       bound = ops.convert_to_tensor(bound, name='bound')
-      with ops.get_default_graph().gradient_override_map(
-          {'Maximum': 'GDNLowerBound'}):
+      with ops.get_default_graph().gradient_override_map({
+          'Maximum': 'GDNLowerBound'
+      }):
         return math_ops.maximum(inputs, bound, name=scope)
 
   @staticmethod
@@ -1825,12 +1878,14 @@ class GDN(base.Layer):
       raise ValueError('The channel dimension of the inputs to `GDN` '
                        'must be defined.')
     self._input_rank = input_shape.ndims
-    self.input_spec = base.InputSpec(ndim=input_shape.ndims,
-                                     axes={channel_axis: num_channels})
+    self.input_spec = base.InputSpec(
+        ndim=input_shape.ndims, axes={
+            channel_axis: num_channels
+        })
 
-    pedestal = array_ops.constant(self._reparam_offset ** 2, dtype=self.dtype)
+    pedestal = array_ops.constant(self._reparam_offset**2, dtype=self.dtype)
     beta_bound = array_ops.constant(
-        (self._beta_min + self._reparam_offset ** 2) ** .5, dtype=self.dtype)
+        (self._beta_min + self._reparam_offset**2)**.5, dtype=self.dtype)
     gamma_bound = array_ops.constant(self._reparam_offset, dtype=self.dtype)
 
     def beta_initializer(shape, dtype=None, partition_info=None):
@@ -1844,19 +1899,21 @@ class GDN(base.Layer):
       eye = linalg_ops.eye(shape[0], dtype=dtype)
       return math_ops.sqrt(self._gamma_init * eye + pedestal)
 
-    beta = self.add_variable('reparam_beta',
-                             shape=[num_channels],
-                             initializer=beta_initializer,
-                             dtype=self.dtype,
-                             trainable=True)
+    beta = self.add_variable(
+        'reparam_beta',
+        shape=[num_channels],
+        initializer=beta_initializer,
+        dtype=self.dtype,
+        trainable=True)
     beta = self._lower_bound(beta, beta_bound)
     self.beta = math_ops.square(beta) - pedestal
 
-    gamma = self.add_variable('reparam_gamma',
-                              shape=[num_channels, num_channels],
-                              initializer=gamma_initializer,
-                              dtype=self.dtype,
-                              trainable=True)
+    gamma = self.add_variable(
+        'reparam_gamma',
+        shape=[num_channels, num_channels],
+        initializer=gamma_initializer,
+        dtype=self.dtype,
+        trainable=True)
     gamma = self._lower_bound(gamma, gamma_bound)
     self.gamma = math_ops.square(gamma) - pedestal
 
@@ -1871,8 +1928,11 @@ class GDN(base.Layer):
 
     # Compute normalization pool.
     if self.data_format == 'channels_first':
-      norm_pool = nn.convolution(math_ops.square(inputs), gamma, 'VALID',
-                                 data_format='NC' + 'DHW'[-(ndim - 2):])
+      norm_pool = nn.convolution(
+          math_ops.square(inputs),
+          gamma,
+          'VALID',
+          data_format='NC' + 'DHW' [-(ndim - 2):])
       if ndim == 3:
         norm_pool = array_ops.expand_dims(norm_pool, 2)
         norm_pool = nn.bias_add(norm_pool, self.beta, data_format='NCHW')
@@ -1896,7 +1956,7 @@ class GDN(base.Layer):
     outputs.set_shape(inputs.get_shape())
     return outputs
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     channel_axis = self._channel_axis()
     input_shape = tensor_shape.TensorShape(input_shape)
     if not 3 <= input_shape.ndim <= 5:
@@ -1914,7 +1974,7 @@ def gdn(inputs,
         inverse=False,
         beta_min=1e-6,
         gamma_init=.1,
-        reparam_offset=2 ** -18,
+        reparam_offset=2**-18,
         data_format='channels_last',
         activity_regularizer=None,
         trainable=True,
@@ -1980,17 +2040,18 @@ def gdn(inputs,
   Returns:
     Output tensor.
   """
-  layer = GDN(inverse=inverse,
-              beta_min=beta_min,
-              gamma_init=gamma_init,
-              reparam_offset=reparam_offset,
-              data_format=data_format,
-              activity_regularizer=activity_regularizer,
-              trainable=trainable,
-              name=name,
-              dtype=inputs.dtype.base_dtype,
-              _scope=name,
-              _reuse=reuse)
+  layer = GDN(
+      inverse=inverse,
+      beta_min=beta_min,
+      gamma_init=gamma_init,
+      reparam_offset=reparam_offset,
+      data_format=data_format,
+      activity_regularizer=activity_regularizer,
+      trainable=trainable,
+      name=name,
+      dtype=inputs.dtype.base_dtype,
+      _scope=name,
+      _reuse=reuse)
   return layer.apply(inputs)
 
 
@@ -2066,8 +2127,8 @@ def layer_norm(inputs,
       or if `inputs.shape[begin_params_axis:]` is not fully defined at
       graph build time.
   """
-  with variable_scope.variable_scope(scope, 'LayerNorm', [inputs],
-                                     reuse=reuse) as sc:
+  with variable_scope.variable_scope(
+      scope, 'LayerNorm', [inputs], reuse=reuse) as sc:
     inputs = ops.convert_to_tensor(inputs)
     inputs_shape = inputs.shape
     inputs_rank = inputs_shape.ndims
@@ -2077,15 +2138,14 @@ def layer_norm(inputs,
     if begin_norm_axis < 0:
       begin_norm_axis = inputs_rank + begin_norm_axis
     if begin_params_axis >= inputs_rank or begin_norm_axis >= inputs_rank:
-      raise ValueError(
-          'begin_params_axis (%d) and begin_norm_axis (%d) '
-          'must be < rank(inputs) (%d)'
-          % (begin_params_axis, begin_norm_axis, inputs_rank))
+      raise ValueError('begin_params_axis (%d) and begin_norm_axis (%d) '
+                       'must be < rank(inputs) (%d)' %
+                       (begin_params_axis, begin_norm_axis, inputs_rank))
     params_shape = inputs_shape[begin_params_axis:]
     if not params_shape.is_fully_defined():
       raise ValueError(
-          'Inputs %s: shape(inputs)[%s:] is not fully defined: %s' % (
-              inputs.name, begin_params_axis, inputs_shape))
+          'Inputs %s: shape(inputs)[%s:] is not fully defined: %s' %
+          (inputs.name, begin_params_axis, inputs_shape))
     # Allocate parameters for the beta and gamma of the normalization.
     beta, gamma = None, None
     if center:
@@ -2099,8 +2159,8 @@ def layer_norm(inputs,
           collections=beta_collections,
           trainable=trainable)
     if scale:
-      gamma_collections = utils.get_variable_collections(variables_collections,
-                                                         'gamma')
+      gamma_collections = utils.get_variable_collections(
+          variables_collections, 'gamma')
       gamma = variables.model_variable(
           'gamma',
           shape=params_shape,
@@ -2114,7 +2174,11 @@ def layer_norm(inputs,
     # Compute layer normalization using the batch_normalization function.
     variance_epsilon = 1e-12
     outputs = nn.batch_normalization(
-        inputs, mean, variance, offset=beta, scale=gamma,
+        inputs,
+        mean,
+        variance,
+        offset=beta,
+        scale=gamma,
         variance_epsilon=variance_epsilon)
     outputs.set_shape(inputs_shape)
     if activation_fn is not None:
@@ -2122,6 +2186,34 @@ def layer_norm(inputs,
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
 
 
+@add_arg_scope
+def images_to_sequence(inputs, data_format=DATA_FORMAT_NHWC,
+                       outputs_collections=None, scope=None):
+  """Convert a batch of images into a batch of sequences.
+  Args:
+    inputs: a (num_images, height, width, depth) tensor
+    data_format: A string. `NHWC` (default) and `NCHW` are supported.
+    outputs_collections: The collections to which the outputs are added.
+    scope: Optional scope for name_scope.
+  Returns:
+    (width, num_images*height, depth) sequence tensor
+  """
+  if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
+    raise ValueError('data_format has to be either NCHW or NHWC.')
+  with ops.name_scope(scope, 'ImagesToSequence', [inputs]) as sc:
+    inputs = ops.convert_to_tensor(inputs)
+    df = ('channels_first'
+          if data_format and data_format.startswith('NC') else 'channels_last')
+    if df == 'channels_first':
+      inputs = array_ops.transpose(inputs, [0, 2, 3, 1])
+    _, _, width, depth = inputs.get_shape().as_list()
+    s = array_ops.shape(inputs)
+    batch_size, height = s[0], s[1]
+    transposed = array_ops.transpose(inputs, [2, 0, 1, 3])
+    outputs = array_ops.reshape(transposed, [width, batch_size * height, depth])
+    return utils.collect_named_outputs(outputs_collections, sc, outputs)
+
+
 @add_arg_scope
 def max_pool2d(inputs,
                kernel_size,
@@ -2160,13 +2252,14 @@ def max_pool2d(inputs,
     raise ValueError('data_format has to be either NCHW or NHWC.')
   with ops.name_scope(scope, 'MaxPool2D', [inputs]) as sc:
     inputs = ops.convert_to_tensor(inputs)
-    df = ('channels_first' if data_format and data_format.startswith('NC')
-          else 'channels_last')
-    layer = pooling_layers.MaxPooling2D(pool_size=kernel_size,
-                                        strides=stride,
-                                        padding=padding,
-                                        data_format=df,
-                                        _scope=sc)
+    df = ('channels_first'
+          if data_format and data_format.startswith('NC') else 'channels_last')
+    layer = pooling_layers.MaxPooling2D(
+        pool_size=kernel_size,
+        strides=stride,
+        padding=padding,
+        data_format=df,
+        _scope=sc)
     outputs = layer.apply(inputs)
     return utils.collect_named_outputs(outputs_collections, sc, outputs)
 
@@ -2209,13 +2302,14 @@ def max_pool3d(inputs,
     raise ValueError('data_format has to be either NCDHW or NDHWC.')
   with ops.name_scope(scope, 'MaxPool3D', [inputs]) as sc:
     inputs = ops.convert_to_tensor(inputs)
-    df = ('channels_first' if data_format and data_format.startswith('NC')
-          else 'channels_last')
-    layer = pooling_layers.MaxPooling3D(pool_size=kernel_size,
-                                        strides=stride,
-                                        padding=padding,
-                                        data_format=df,
-                                        _scope=sc)
+    df = ('channels_first'
+          if data_format and data_format.startswith('NC') else 'channels_last')
+    layer = pooling_layers.MaxPooling3D(
+        pool_size=kernel_size,
+        strides=stride,
+        padding=padding,
+        data_format=df,
+        _scope=sc)
     outputs = layer.apply(inputs)
     return utils.collect_named_outputs(outputs_collections, sc, outputs)
 
@@ -2268,8 +2362,8 @@ def pool(inputs,
 
   """
   # pylint: enable=line-too-long
-  with ops.name_scope(scope, '%s_pool' %
-                      (pooling_type.lower()), [inputs]) as sc:
+  with ops.name_scope(scope, '%s_pool' % (pooling_type.lower()),
+                      [inputs]) as sc:
     inputs = ops.convert_to_tensor(inputs)
     input_rank = inputs.get_shape().ndims
     if input_rank is None:
@@ -2314,18 +2408,16 @@ def one_hot_encoding(labels,
     labels = ops.convert_to_tensor(labels)
     if labels.dtype == dtypes.int32:
       labels = standard_ops.to_int64(labels)
-    outputs = standard_ops.one_hot(labels,
-                                   num_classes,
-                                   on_value=on_value,
-                                   off_value=off_value)
+    outputs = standard_ops.one_hot(
+        labels, num_classes, on_value=on_value, off_value=off_value)
     return utils.collect_named_outputs(outputs_collections, sc, outputs)
 
 
 def _apply_activation(y, activation_fn, output_collections):
   if activation_fn is not None:
     y = activation_fn(y)
-  ops.add_to_collections(list(output_collections or []) +
-                         [ops.GraphKeys.ACTIVATIONS], y)
+  ops.add_to_collections(
+      list(output_collections or []) + [ops.GraphKeys.ACTIVATIONS], y)
   return y
 
 
@@ -2370,7 +2462,7 @@ def repeat(inputs, repetitions, layer, *args, **kwargs):
         scope = 'repeat'
     outputs = inputs
     for i in range(repetitions):
-      kwargs['scope'] = scope + '_' + str(i+1)
+      kwargs['scope'] = scope + '_' + str(i + 1)
       outputs = layer(outputs, *args, **kwargs)
     return outputs
 
@@ -2385,8 +2477,8 @@ def _scale_gradient_grad(op, grad):
   return [grad * op.inputs[1], None]
 
 
-@function.Defun(python_grad_func=_scale_gradient_grad,
-                shape_func=_scale_gradient_shape)
+@function.Defun(
+    python_grad_func=_scale_gradient_grad, shape_func=_scale_gradient_shape)
 def scale_gradient(inputs, gradient_multiplier):
   """Identity operation, but with the gradient multiplied by a tensor.
 
@@ -2491,18 +2583,21 @@ def separable_convolution2d(
   """
   if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
     raise ValueError('data_format has to be either NCHW or NHWC.')
-  layer_variable_getter = _build_variable_getter(
-      {'bias': 'biases',
-       'depthwise_kernel': 'depthwise_weights',
-       'pointwise_kernel': 'pointwise_weights'})
+  layer_variable_getter = _build_variable_getter({
+      'bias': 'biases',
+      'depthwise_kernel': 'depthwise_weights',
+      'pointwise_kernel': 'pointwise_weights'
+  })
 
   with variable_scope.variable_scope(
-      scope, 'SeparableConv2d', [inputs], reuse=reuse,
+      scope,
+      'SeparableConv2d', [inputs],
+      reuse=reuse,
       custom_getter=layer_variable_getter) as sc:
     inputs = ops.convert_to_tensor(inputs)
 
-    df = ('channels_first' if data_format and data_format.startswith('NC')
-          else 'channels_last')
+    df = ('channels_first'
+          if data_format and data_format.startswith('NC') else 'channels_last')
     if num_outputs is not None:
       # Apply separable conv using the SeparableConvolution2D layer.
       layer = convolutional_layers.SeparableConvolution2D(
@@ -2535,8 +2630,8 @@ def separable_convolution2d(
       _add_variable_to_collections(layer.pointwise_kernel,
                                    variables_collections, 'weights')
       if layer.bias is not None:
-        _add_variable_to_collections(layer.bias,
-                                     variables_collections, 'biases')
+        _add_variable_to_collections(layer.bias, variables_collections,
+                                     'biases')
 
       if normalizer_fn is not None:
         normalizer_params = normalizer_params or {}
@@ -2551,8 +2646,7 @@ def separable_convolution2d(
       weights_collections = utils.get_variable_collections(
           variables_collections, 'weights')
 
-      depthwise_shape = [kernel_h, kernel_w,
-                         num_filters_in, depth_multiplier]
+      depthwise_shape = [kernel_h, kernel_w, num_filters_in, depth_multiplier]
       depthwise_weights = variables.model_variable(
           'depthwise_weights',
           shape=depthwise_shape,
@@ -2561,11 +2655,18 @@ def separable_convolution2d(
           regularizer=weights_regularizer,
           trainable=trainable,
           collections=weights_collections)
-      strides = [1, 1, stride_h, stride_w] if data_format.startswith('NC') else [1, stride_h, stride_w, 1]
+      strides = [1, 1, stride_h,
+                 stride_w] if data_format.startswith('NC') else [
+                     1, stride_h, stride_w, 1
+                 ]
 
-      outputs = nn.depthwise_conv2d(inputs, depthwise_weights, strides, padding,
-                                    rate=utils.two_element_tuple(rate),
-                                    data_format=data_format)
+      outputs = nn.depthwise_conv2d(
+          inputs,
+          depthwise_weights,
+          strides,
+          padding,
+          rate=utils.two_element_tuple(rate),
+          data_format=data_format)
       num_outputs = depth_multiplier * num_filters_in
 
       if normalizer_fn is not None:
@@ -2575,13 +2676,16 @@ def separable_convolution2d(
         if biases_initializer is not None:
           biases_collections = utils.get_variable_collections(
               variables_collections, 'biases')
-          biases = variables.model_variable('biases',
-                                            shape=[num_outputs,],
-                                            dtype=dtype,
-                                            initializer=biases_initializer,
-                                            regularizer=biases_regularizer,
-                                            trainable=trainable,
-                                            collections=biases_collections)
+          biases = variables.model_variable(
+              'biases',
+              shape=[
+                  num_outputs,
+              ],
+              dtype=dtype,
+              initializer=biases_initializer,
+              regularizer=biases_regularizer,
+              trainable=trainable,
+              collections=biases_collections)
           outputs = nn.bias_add(outputs, biases, data_format=data_format)
 
     if activation_fn is not None:
@@ -2589,6 +2693,36 @@ def separable_convolution2d(
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
 
 
+@add_arg_scope
+def sequence_to_images(inputs, height, output_data_format='channels_last',
+                       outputs_collections=None, scope=None):
+  """Convert a batch of sequences into a batch of images.
+  Args:
+    inputs: (num_steps, num_batches, depth) sequence tensor
+    height: the height of the images
+    output_data_format: Format of output tensor.
+      Currently supports `'channels_first'` and `'channels_last'`.
+    outputs_collections: The collections to which the outputs are added.
+    scope: Optional scope for name_scope.
+  Returns:
+    A tensor representing the output of the operation.
+  """
+  with ops.name_scope(scope, 'SequenceToImages', [inputs]) as sc:
+    inputs = ops.convert_to_tensor(inputs)
+    width, num_batches, depth = inputs.get_shape().as_list()
+    if num_batches is None:
+      num_batches = -1
+    else:
+      num_batches = num_batches // height
+    reshaped = array_ops.reshape(inputs,
+                                 [width, num_batches, height, depth])
+    if output_data_format == 'channels_first':
+      outputs = array_ops.transpose(reshaped, [1, 3, 2, 0])
+    else:
+      outputs = array_ops.transpose(reshaped, [1, 2, 0, 3])
+    return utils.collect_named_outputs(outputs_collections, sc, outputs)
+
+
 @add_arg_scope
 def softmax(logits, scope=None):
   """Performs softmax on Nth dimension of N-dimensional logit tensor.
@@ -2651,7 +2785,7 @@ def spatial_softmax(features,
     ValueError: If unexpected data_format specified.
     ValueError: If num_channels dimension is unspecified.
   """
-  with variable_scope.variable_scope(name, 'spatial_softmax'):  
+  with variable_scope.variable_scope(name, 'spatial_softmax'):
     shape = array_ops.shape(features)
     static_shape = features.shape
     if data_format == DATA_FORMAT_NHWC:
@@ -2663,44 +2797,52 @@ def spatial_softmax(features,
     if num_channels.value is None:
       raise ValueError('The num_channels dimension of the inputs to '
                        '`spatial_softmax` should be defined. Found `None`.')
-  
-    with ops.name_scope('spatial_softmax_op', 'spatial_softmax_op', [features]):  
+
+    with ops.name_scope('spatial_softmax_op', 'spatial_softmax_op', [features]):
       # Create tensors for x and y coordinate values, scaled to range [-1, 1].
-      pos_x, pos_y = array_ops.meshgrid(math_ops.lin_space(-1., 1., num=height),
-                                        math_ops.lin_space(-1., 1., num=width),
-                                        indexing='ij')
+      pos_x, pos_y = array_ops.meshgrid(
+          math_ops.lin_space(-1., 1., num=height),
+          math_ops.lin_space(-1., 1., num=width),
+          indexing='ij')
       pos_x = array_ops.reshape(pos_x, [height * width])
       pos_y = array_ops.reshape(pos_y, [height * width])
+
       if temperature is None:
-        temperature_collections = utils.get_variable_collections(
+        temp_initializer = init_ops.ones_initializer()
+      else:
+        temp_initializer = init_ops.constant_initializer(temperature)
+
+      if not trainable:
+        temp_collections = None
+      else:
+        temp_collections = utils.get_variable_collections(
             variables_collections, 'temperature')
-        temperature = variables.model_variable(
-            'temperature',
-            shape=(),
-            dtype=dtypes.float32,
-            initializer=init_ops.ones_initializer(),
-            collections=temperature_collections,
-            trainable=trainable)
+
+      temperature = variables.model_variable(
+          'temperature',
+          shape=(),
+          dtype=dtypes.float32,
+          initializer=temp_initializer,
+          collections=temp_collections,
+          trainable=trainable)
       if data_format == 'NCHW':
         features = array_ops.reshape(features, [-1, height * width])
       else:
         features = array_ops.reshape(
             array_ops.transpose(features, [0, 3, 1, 2]), [-1, height * width])
-  
-      softmax_attention = nn.softmax(features/temperature)
+
+      softmax_attention = nn.softmax(features / temperature)
       expected_x = math_ops.reduce_sum(
-          pos_x * softmax_attention, [1], keep_dims=True)
+          pos_x * softmax_attention, [1], keepdims=True)
       expected_y = math_ops.reduce_sum(
-          pos_y * softmax_attention, [1], keep_dims=True)
+          pos_y * softmax_attention, [1], keepdims=True)
       expected_xy = array_ops.concat([expected_x, expected_y], 1)
-      feature_keypoints = array_ops.reshape(
-          expected_xy, [-1, num_channels.value * 2])
+      feature_keypoints = array_ops.reshape(expected_xy,
+                                            [-1, num_channels.value * 2])
       feature_keypoints.set_shape([None, num_channels.value * 2])
   return feature_keypoints
 
 
-
-
 def stack(inputs, layer, stack_args, **kwargs):
   """Builds a stack of layers by applying layer repeatedly using stack_args.
 
@@ -2748,7 +2890,7 @@ def stack(inputs, layer, stack_args, **kwargs):
         scope = 'stack'
     outputs = inputs
     for i in range(len(stack_args)):
-      kwargs['scope'] = scope + '_' + str(i+1)
+      kwargs['scope'] = scope + '_' + str(i + 1)
       layer_args = stack_args[i]
       if not isinstance(layer_args, (list, tuple)):
         layer_args = [layer_args]
@@ -2779,11 +2921,10 @@ def unit_norm(inputs, dim, epsilon=1e-7, scope=None):
       raise ValueError('The input rank must be known.')
     input_rank = len(inputs.get_shape().as_list())
     if dim < 0 or dim >= input_rank:
-      raise ValueError(
-          'dim must be positive but smaller than the input rank.')
+      raise ValueError('dim must be positive but smaller than the input rank.')
 
-    lengths = math_ops.sqrt(epsilon + math_ops.reduce_sum(
-        math_ops.square(inputs), dim, True))
+    lengths = math_ops.sqrt(
+        epsilon + math_ops.reduce_sum(math_ops.square(inputs), dim, True))
     multiples = []
     if dim > 0:
       multiples.append(array_ops.ones([dim], dtypes.int32))
@@ -2827,7 +2968,7 @@ def poincare_normalize(x, axis=1, epsilon=1e-5, name=None):
   """
   with ops.name_scope(name, 'poincare_normalize', [x]) as name:
     x = ops.convert_to_tensor(x, name='x')
-    square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keep_dims=True)
+    square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keepdims=True)
     x_inv_norm = math_ops.rsqrt(square_sum)
     x_inv_norm = math_ops.minimum((1. - epsilon) * x_inv_norm, 1.)
     return math_ops.multiply(x, x_inv_norm, name=name)
@@ -2924,29 +3065,31 @@ def legacy_fully_connected(x,
       raise ValueError('last dimension of x must be known but is None')
     dtype = x.dtype.base_dtype
 
-    weight_collections = set(list(weight_collections or []) +
-                             [ops.GraphKeys.GLOBAL_VARIABLES])
-    w = variable_scope.get_variable('weights',
-                                    shape=[num_input_units, num_output_units],
-                                    dtype=dtype,
-                                    initializer=weight_init,
-                                    collections=weight_collections,
-                                    regularizer=weight_regularizer,
-                                    trainable=trainable)
-    x_2_dim = x if len(dims) <= 2 else array_ops.reshape(x,
-                                                         [-1, num_input_units])
+    weight_collections = set(
+        list(weight_collections or []) + [ops.GraphKeys.GLOBAL_VARIABLES])
+    w = variable_scope.get_variable(
+        'weights',
+        shape=[num_input_units, num_output_units],
+        dtype=dtype,
+        initializer=weight_init,
+        collections=weight_collections,
+        regularizer=weight_regularizer,
+        trainable=trainable)
+    x_2_dim = x if len(dims) <= 2 else array_ops.reshape(
+        x, [-1, num_input_units])
     y = standard_ops.matmul(x_2_dim, w)
 
     if bias_init is not None:
-      bias_collections = set(list(bias_collections or []) +
-                             [ops.GraphKeys.GLOBAL_VARIABLES])
-      b = variable_scope.get_variable('bias',
-                                      shape=[num_output_units],
-                                      dtype=dtype,
-                                      initializer=bias_init,
-                                      collections=bias_collections,
-                                      regularizer=bias_regularizer,
-                                      trainable=trainable)
+      bias_collections = set(
+          list(bias_collections or []) + [ops.GraphKeys.GLOBAL_VARIABLES])
+      b = variable_scope.get_variable(
+          'bias',
+          shape=[num_output_units],
+          dtype=dtype,
+          initializer=bias_init,
+          collections=bias_collections,
+          regularizer=bias_regularizer,
+          trainable=trainable)
 
       y = nn.bias_add(y, b)
 
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index a05e464a26d8167707ce6d6455aca50b0416aa1f..0f062adbab3ca9acfb89543b69c7c957bbdf5dd8 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -44,6 +44,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import template
 from tensorflow.python.ops import variable_scope
@@ -126,8 +127,8 @@ class AvgPool3DTest(test.TestCase):
   def testInvalidDataFormat(self):
     depth, height, width = 3, 6, 9
     images = np.random.uniform(size=(5, depth, height, width, 3))
-    with self.assertRaisesRegexp(ValueError,
-                                 'data_format has to be either NCDHW or NDHWC.'):
+    with self.assertRaisesRegexp(
+        ValueError, 'data_format has to be either NCDHW or NDHWC.'):
       _layers.avg_pool3d(images, [3, 3, 3], data_format='CDHWN')
 
   def testCreateAvgPool(self):
@@ -147,7 +148,8 @@ class AvgPool3DTest(test.TestCase):
   def testCollectOutputs(self):
     depth, height, width = 3, 6, 9
     images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
-    output = _layers.avg_pool3d(images, [3, 3, 3], outputs_collections='outputs')
+    output = _layers.avg_pool3d(
+        images, [3, 3, 3], outputs_collections='outputs')
     output_collected = ops.get_collection('outputs')[0]
     self.assertEqual(output_collected.aliases, ['AvgPool3D'])
     self.assertEqual(output_collected, output)
@@ -182,7 +184,8 @@ class AvgPool3DTest(test.TestCase):
     depth, height, width = 3, 6, 9
     images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
     output = _layers.avg_pool3d(images, [3, 3, 3], stride=1, padding='SAME')
-    self.assertListEqual(output.get_shape().as_list(), [5, depth, height, width, 3])
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, depth, height, width, 3])
 
   def testGlobalAvgPool(self):
     depth, height, width = 3, 6, 9
@@ -514,7 +517,9 @@ class ConvolutionTest(test.TestCase):
       with arg_scope(
           [layers_lib.convolution2d],
           normalizer_fn=_layers.batch_norm,
-          normalizer_params={'decay': 0.9}):
+          normalizer_params={
+              'decay': 0.9
+          }):
         net = layers_lib.convolution2d(images, 32, [3, 3])
         net = layers_lib.convolution2d(net, 32, [3, 3])
       self.assertEqual(len(variables.get_variables()), 8)
@@ -528,7 +533,9 @@ class ConvolutionTest(test.TestCase):
       with arg_scope(
           [layers_lib.convolution2d],
           normalizer_fn=_layers.batch_norm,
-          normalizer_params={'decay': 0.9}):
+          normalizer_params={
+              'decay': 0.9
+          }):
         net = layers_lib.convolution2d(images, 32, [3, 3], scope='Conv')
         net = layers_lib.convolution2d(
             net, 32, [3, 3], scope='Conv', reuse=True)
@@ -701,7 +708,7 @@ class Convolution2dTransposeTests(test.TestCase):
         _layers.convolution2d_transpose(images, 32, 3, data_format='CHWN')
 
   def testOutputSizeWithStrideOneSamePaddingNCHW(self):
-    # `NCHW` data fomat is only supported for `GPU` device.
+    # `NCHW` data format is only supported for `GPU` device.
     if test.is_gpu_available(cuda_only=True):
       with self.test_session(use_gpu=True) as sess:
         num_filters = 32
@@ -1030,7 +1037,8 @@ class Convolution2dTransposeTests(test.TestCase):
     for _ in range(10):
       num_filters = 1
       input_size = [
-          1, np.random.randint(1, max_image_size),
+          1,
+          np.random.randint(1, max_image_size),
           np.random.randint(1, max_image_size), 1
       ]
       filter_size = [
@@ -1184,8 +1192,10 @@ class ConvolutionInPlaneTest(test.TestCase):
 
     with self.test_session() as sess:
       sess.run(init_op)
-      result = sess.run(horz_gradients,
-                        feed_dict={image: np.ones((1, 10, 10, 1))})
+      result = sess.run(
+          horz_gradients, feed_dict={
+              image: np.ones((1, 10, 10, 1))
+          })
       expected = np.zeros((1, 10, 9, 1))
 
       self.assertAllEqual(result, expected)
@@ -1292,6 +1302,19 @@ class ConvolutionInPlaneTest(test.TestCase):
       self.assertAllClose(result, expected, rtol=1e-5, atol=1e-5)
 
 
+class DenseToSparseTest(test.TestCase):
+
+  def testDenseFromConstantToSparse(self):
+    expected_constant = np.reshape(np.arange(24, dtype=np.int64), (3, 4, 2))
+    tensor = constant_op.constant(expected_constant)
+    sparse = _layers.dense_to_sparse(tensor)
+    dense = sparse_ops.sparse_to_dense(sparse.indices, sparse.dense_shape,
+                                       sparse.values)
+    with self.test_session() as sess:
+      constant = sess.run(dense)
+      self.assertAllEqual(expected_constant, constant)
+
+
 class DropoutTest(test.TestCase):
 
   def testCreateDropout(self):
@@ -1406,8 +1429,7 @@ class FlattenTest(test.TestCase):
     with ops.Graph().as_default() as g, self.test_session(g):
       inputs = array_ops.placeholder(dtype=dtypes.float32)
       inputs.set_shape(tensor_shape.TensorShape((5,)))
-      with self.assertRaisesRegexp(ValueError,
-                                   'incompatible with the layer'):
+      with self.assertRaisesRegexp(ValueError, 'incompatible with the layer'):
         _layers.flatten(inputs)
 
   def testUnknownLastDim(self):
@@ -1717,7 +1739,9 @@ class FCTest(test.TestCase):
       with arg_scope(
           [_layers.fully_connected],
           normalizer_fn=_layers.batch_norm,
-          normalizer_params={'decay': 0.9}):
+          normalizer_params={
+              'decay': 0.9
+          }):
         net = _layers.fully_connected(images, 27)
         net = _layers.fully_connected(net, 27)
       self.assertEqual(len(variables.get_variables()), 8)
@@ -1733,7 +1757,9 @@ class FCTest(test.TestCase):
       with arg_scope(
           [_layers.fully_connected],
           normalizer_fn=_layers.batch_norm,
-          normalizer_params={'decay': 0.9}):
+          normalizer_params={
+              'decay': 0.9
+          }):
         net = _layers.fully_connected(images, 27, scope='fc1')
         net = _layers.fully_connected(net, 27, scope='fc1', reuse=True)
       self.assertEqual(len(variables.get_variables()), 4)
@@ -1747,6 +1773,12 @@ class BatchNormTest(test.TestCase):
     expected_var *= correction_factor
     return expected_var, correction_factor
 
+  def testBatchNormCenterFalse(self):
+    a = array_ops.placeholder(dtype=dtypes.float32, shape=(10, 10, 10, 10))
+    # Test that center=False builds a valid graph.
+    _layers.batch_norm(
+        a, center=False, data_format='NCHW', zero_debias_moving_mean=True)
+
   def testUnknownShape(self):
     with ops.Graph().as_default() as g, self.test_session(g):
       inputs = array_ops.placeholder(dtype=dtypes.float32)
@@ -1782,8 +1814,8 @@ class BatchNormTest(test.TestCase):
       images = np.random.uniform(size=(5, height, width, 3)).astype(
           dtype.as_numpy_dtype)
       output = _layers.batch_norm(images, fused=fused)
-      expected_name = ('BatchNorm/FusedBatchNorm' if fused else
-                       'BatchNorm/batchnorm')
+      expected_name = ('BatchNorm/FusedBatchNorm'
+                       if fused else 'BatchNorm/batchnorm')
       self.assertTrue(output.op.name.startswith(expected_name))
       self.assertListEqual(output.get_shape().as_list(), [5, height, width, 3])
       self.assertEqual(
@@ -2002,8 +2034,8 @@ class BatchNormTest(test.TestCase):
       expected_var = np.var(image_values, axis=axis)
       if fused:
         # Add Bessel's correction
-        expected_var, _ = self._addBesselsCorrection(batch_size * height *
-                                                     width, expected_var)
+        expected_var, _ = self._addBesselsCorrection(
+            batch_size * height * width, expected_var)
       images = constant_op.constant(
           image_values, shape=image_shape, dtype=dtypes.float32)
       output = _layers.batch_norm(
@@ -2164,7 +2196,7 @@ class BatchNormTest(test.TestCase):
       # After initialization moving_mean == 0 and moving_variance == 1.
       self.assertAllClose(mean, [0] * 3)
       self.assertAllClose(variance, [1] * 3)
-      # Simulate assigment from saver restore.
+      # Simulate assignment from saver restore.
       init_assigns = [
           state_ops.assign(moving_mean, expected_mean),
           state_ops.assign(moving_variance, expected_var)
@@ -2522,8 +2554,8 @@ class BatchNormTest(test.TestCase):
       expected_var = np.var(image_values, axis=axis)
       if fused:
         # Add Bessel's correction
-        expected_var, _ = self._addBesselsCorrection(batch_size * height *
-                                                     width, expected_var)
+        expected_var, _ = self._addBesselsCorrection(
+            batch_size * height * width, expected_var)
       images = constant_op.constant(
           image_values, shape=image_shape, dtype=dtypes.float32)
       output = _layers.batch_norm(
@@ -2553,8 +2585,9 @@ class BatchNormTest(test.TestCase):
         np_output, new_images_gradients = sess.run([output, images_gradients])
         # The outputs should be close to 0.0 mean and 1.0 variance
         self.assertAllClose(
-            np.mean(
-                np_output, axis=axis), [0] * channels, rtol=0.001, atol=0.001)
+            np.mean(np_output, axis=axis), [0] * channels,
+            rtol=0.001,
+            atol=0.001)
         self.assertAllClose(
             np.var(np_output, axis=axis), [1] * channels, rtol=0.01, atol=0.01)
         # The gradients should change slowly while updating moving_mean.
@@ -2582,14 +2615,14 @@ class BatchNormTest(test.TestCase):
     channels = 3
     with self.test_session() as sess:
       images = (np.ones((5, height, width, channels)) * 9.0).astype('f')
-      beta = init_ops.constant_initializer((np.ones(channels) * 5.0).astype(
-          'f'))
-      gamma = init_ops.constant_initializer((np.ones(channels) * 2.0).astype(
-          'f'))
-      mean = init_ops.constant_initializer((np.ones(channels) * 5.0).astype(
-          'f'))
-      variance = init_ops.constant_initializer((np.ones(channels) * 4.0).astype(
-          'f'))
+      beta = init_ops.constant_initializer(
+          (np.ones(channels) * 5.0).astype('f'))
+      gamma = init_ops.constant_initializer(
+          (np.ones(channels) * 2.0).astype('f'))
+      mean = init_ops.constant_initializer(
+          (np.ones(channels) * 5.0).astype('f'))
+      variance = init_ops.constant_initializer(
+          (np.ones(channels) * 4.0).astype('f'))
       output = _layers.batch_norm(
           images,
           is_training=False,
@@ -2610,21 +2643,18 @@ class BatchNormTest(test.TestCase):
     with self.test_session(use_gpu=True) as sess:
       images = np.arange(np.product(shape), dtype=np.float32).reshape(shape)
       beta = init_ops.constant_initializer(
-          np.arange(
-              2, channels + 2, dtype=np.float32))
+          np.arange(2, channels + 2, dtype=np.float32))
       gamma = init_ops.constant_initializer(
-          np.arange(
-              10, channels + 10, dtype=np.float32) * 2.0)
+          np.arange(10, channels + 10, dtype=np.float32) * 2.0)
       mean = init_ops.constant_initializer(
-          np.arange(
-              3, channels + 3, dtype=np.float32) * 5.0)
+          np.arange(3, channels + 3, dtype=np.float32) * 5.0)
       variance = init_ops.constant_initializer(
-          np.arange(
-              1, channels + 1, dtype=np.float32) * 4.0)
+          np.arange(1, channels + 1, dtype=np.float32) * 4.0)
       if data_format == 'NCHW':
         # Reshape inputs from NHWC to NCHW format.
         images = array_ops.transpose(
-            images, [0, len(shape) - 1] + list(range(1, len(shape) - 1)))
+            images, [0, len(shape) - 1] + list(range(1,
+                                                     len(shape) - 1)))
       output = _layers.batch_norm(
           images,
           is_training=is_training,
@@ -2727,16 +2757,16 @@ class BatchNormTest(test.TestCase):
     # Tests that the adjustment is appropriately passed to and used by the core
     # BN layer.
     all_adjustments = []
+
     def _create_adjustment(shape):
       adjustments = [array_ops.ones(shape[-1:]), array_ops.zeros(shape[-1:])]
       all_adjustments.extend(adjustments)
       return adjustments
+
     depth = 8
     images = array_ops.zeros([10, 5, 5, depth])
     output = _layers.batch_norm(
-        images,
-        is_training=True,
-        adjustment=_create_adjustment)
+        images, is_training=True, adjustment=_create_adjustment)
     self.assertListEqual(output.shape.as_list(), images.shape.as_list())
     self.assertEqual(len(all_adjustments), 2)
     self.assertListEqual(all_adjustments[0].shape.as_list(), [depth])
@@ -2801,7 +2831,10 @@ class LayerNormTest(test.TestCase):
       # output_train and output_eval should be the same.
       self.assertAllClose(sess.run([output_train]), sess.run([output_eval]))
 
-  def doOutputTest(self, input_shape, tol=1e-5, begin_norm_axis=1,
+  def doOutputTest(self,
+                   input_shape,
+                   tol=1e-5,
+                   begin_norm_axis=1,
                    dtype=dtypes.float64):
     expected_mean = np.zeros(input_shape[:begin_norm_axis])
     expected_var = np.ones(input_shape[:begin_norm_axis])
@@ -2832,13 +2865,10 @@ class LayerNormTest(test.TestCase):
             # Layer-norm implemented in numpy
             eps = 1e-12
             expected_out = (
-                (gamma * (
-                    input_values
-                    - np.mean(input_values, axis=moments_axis, keepdims=True))
-                 / np.sqrt(
-                     eps
-                     + np.var(input_values, axis=moments_axis, keepdims=True)))
-                + beta)
+                (gamma * (input_values - np.mean(
+                    input_values, axis=moments_axis, keepdims=True)) /
+                 np.sqrt(eps + np.var(
+                     input_values, axis=moments_axis, keepdims=True))) + beta)
             self.assertAllClose(expected_mean, mean, atol=tol, rtol=tol)
             self.assertAllClose(expected_var, var, atol=tol)
             # The full computation gets a bigger tolerance
@@ -2856,10 +2886,10 @@ class LayerNormTest(test.TestCase):
 
   def testOutput4DInputNormOnInnermostAxis(self):
     # Equivalent tests
-    self.doOutputTest((100, 10, 10, 3), begin_norm_axis=3, tol=1e-4,
-                      dtype=dtypes.float64)
-    self.doOutputTest((100, 10, 10, 3), begin_norm_axis=-1, tol=1e-4,
-                      dtype=dtypes.float64)
+    self.doOutputTest(
+        (100, 10, 10, 3), begin_norm_axis=3, tol=1e-4, dtype=dtypes.float64)
+    self.doOutputTest(
+        (100, 10, 10, 3), begin_norm_axis=-1, tol=1e-4, dtype=dtypes.float64)
 
   def testOutputSmallInput(self):
     self.doOutputTest((10, 10, 10, 30))
@@ -2896,7 +2926,7 @@ class GDNTest(test.TestCase):
       x = np.random.uniform(size=(1, 2, 3, 4)[:ndim])
       y = self._runGDN(x, x.shape, False, 'channels_last')
       self.assertEqual(x.shape, y.shape)
-      self.assertAllClose(y, x / np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6)
+      self.assertAllClose(y, x / np.sqrt(1 + .1 * (x**2)), rtol=0, atol=1e-6)
 
   def testChannelsFirst(self):
     # `bias_add` doesn't support NCHW on CPU.
@@ -2905,8 +2935,7 @@ class GDNTest(test.TestCase):
         x = np.random.uniform(size=(4, 3, 2, 1)[:ndim])
         y = self._runGDN(x, x.shape, False, 'channels_first')
         self.assertEqual(x.shape, y.shape)
-        self.assertAllClose(
-            y, x / np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6)
+        self.assertAllClose(y, x / np.sqrt(1 + .1 * (x**2)), rtol=0, atol=1e-6)
 
   def testWrongDims(self):
     for ndim in [1, 2, 6]:
@@ -2918,7 +2947,29 @@ class GDNTest(test.TestCase):
     x = np.random.uniform(size=(1, 2, 3, 4))
     y = self._runGDN(x, x.shape, True, 'channels_last')
     self.assertEqual(x.shape, y.shape)
-    self.assertAllClose(y, x * np.sqrt(1 + .1 * (x ** 2)), rtol=0, atol=1e-6)
+    self.assertAllClose(y, x * np.sqrt(1 + .1 * (x**2)), rtol=0, atol=1e-6)
+
+
+class ImagesToSequenceTest(test.TestCase):
+
+  def testInvalidDataFormat(self):
+    height, width = 7, 11
+    images = np.random.uniform(size=(5, height, width, 2))
+    with self.assertRaisesRegexp(ValueError,
+                                 'data_format has to be either NCHW or NHWC.'):
+      _layers.images_to_sequence(images, data_format='CHWN')
+
+  def testImagesToSequenceDims(self):
+    height, width = 7, 11
+    images = np.random.uniform(size=(2, height, width, 5)).astype(np.float32)
+    output = _layers.images_to_sequence(images)
+    self.assertListEqual(output.get_shape().as_list(), [11, 14, 5])
+
+  def testImagesToSequenceNCHW(self):
+    height, width = 7, 11
+    images = np.random.uniform(size=(2, 5, height, width)).astype(np.float32)
+    output = _layers.images_to_sequence(images, data_format='NCHW')
+    self.assertListEqual(output.get_shape().as_list(), [11, 14, 5])
 
 
 class MaxPool2DTest(test.TestCase):
@@ -2995,20 +3046,22 @@ class MaxPool3DTest(test.TestCase):
   def testInvalidDataFormat(self):
     depth, height, width = 3, 6, 9
     images = np.random.uniform(size=(5, depth, height, width, 3))
-    with self.assertRaisesRegexp(ValueError,
-                                 'data_format has to be either NCDHW or NDHWC.'):
+    with self.assertRaisesRegexp(
+        ValueError, 'data_format has to be either NCDHW or NDHWC.'):
       _layers.max_pool3d(images, [3, 3, 3], data_format='CDHWN')
 
   def testCreateMaxPool(self):
     depth, height, width = 3, 6, 9
-    images = np.random.uniform(size=(5, depth, height, width, 3)).astype(np.float32)
+    images = np.random.uniform(size=(5, depth, height, width, 3)).astype(
+        np.float32)
     output = _layers.max_pool3d(images, [3, 3, 3])
     self.assertEqual(output.op.name, 'MaxPool3D/MaxPool3D')
     self.assertListEqual(output.get_shape().as_list(), [5, 1, 2, 4, 3])
 
   def testCreateMaxPoolNCDHW(self):
     depth, height, width = 3, 6, 9
-    images = np.random.uniform(size=(5, 3, depth, height, width)).astype(np.float32)
+    images = np.random.uniform(size=(5, 3, depth, height, width)).astype(
+        np.float32)
     output = _layers.max_pool3d(images, [3, 3, 3], data_format='NCDHW')
     self.assertEquals(output.op.name, 'MaxPool3D/transpose_1')
     self.assertListEqual(output.get_shape().as_list(), [5, 3, 1, 2, 4])
@@ -3016,7 +3069,8 @@ class MaxPool3DTest(test.TestCase):
   def testCollectOutputs(self):
     depth, height, width = 3, 6, 9
     images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
-    output = _layers.max_pool3d(images, [3, 3, 3], outputs_collections='outputs')
+    output = _layers.max_pool3d(
+        images, [3, 3, 3], outputs_collections='outputs')
     output_collected = ops.get_collection('outputs')[0]
     self.assertEqual(output_collected.aliases, ['MaxPool3D'])
     self.assertEqual(output_collected, output)
@@ -3051,7 +3105,8 @@ class MaxPool3DTest(test.TestCase):
     depth, height, width = 3, 6, 9
     images = random_ops.random_uniform((5, depth, height, width, 3), seed=1)
     output = _layers.max_pool3d(images, [3, 3, 3], stride=1, padding='SAME')
-    self.assertListEqual(output.get_shape().as_list(), [5, depth, height, width, 3])
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, depth, height, width, 3])
 
   def testGlobalMaxPool(self):
     depth, height, width = 3, 6, 9
@@ -3231,7 +3286,11 @@ class SeparableConv2dTest(test.TestCase):
       images = random_ops.random_uniform((5, height, width, 3), seed=1)
       regularizer = regularizers.l2_regularizer(0.01)
       layers_lib.separable_conv2d(
-          images, 32, [3, 3], 2, weights_regularizer=regularizer)
+          images,
+          32, [3, 3],
+          2,
+          weights_regularizer=regularizer,
+          weights_initializer=init_ops.ones_initializer())
       self.assertEqual(
           len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 2)
       weight_decay = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)[0]
@@ -3239,12 +3298,31 @@ class SeparableConv2dTest(test.TestCase):
           weight_decay.op.name,
           'SeparableConv2d/depthwise_kernel/Regularizer/l2_regularizer')
       sess.run(variables_lib.global_variables_initializer())
-      self.assertLessEqual(sess.run(weight_decay), 0.05)
+      depth_weight_one = sess.run(weight_decay)
       weight_decay = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)[1]
       self.assertEqual(
           weight_decay.op.name,
           'SeparableConv2d/pointwise_kernel/Regularizer/l2_regularizer')
-      self.assertLessEqual(sess.run(weight_decay), 0.05)
+      pointwise_weight_one = sess.run(weight_decay)
+
+      regularizer = regularizers.l2_regularizer(1.0)
+      layers_lib.separable_conv2d(
+          images,
+          32, [3, 3],
+          2,
+          weights_regularizer=regularizer,
+          weights_initializer=init_ops.ones_initializer())
+      self.assertEqual(
+          len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 4)
+      weight_decay = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)[2]
+      sess.run(variables_lib.global_variables_initializer())
+      depth_weight_two = sess.run(weight_decay)
+      weight_decay = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)[3]
+      pointwise_weight_two = sess.run(weight_decay)
+
+      self.assertAllClose(
+          [100.0 * depth_weight_one, 100.0 * pointwise_weight_one],
+          [depth_weight_two, pointwise_weight_two])
 
   def testReuseConvWithWeightDecay(self):
     height, width = 3, 3
@@ -3332,11 +3410,18 @@ class SeparableConv2dTest(test.TestCase):
         batch, height, width = 4, 10, 12
         kernel_dim, stride = 3, 2
         images = random_ops.random_uniform((batch, 3, height, width), seed=1)
-        output = layers_lib.separable_conv2d(images, num_outputs=num_filters, kernel_size=[kernel_dim, kernel_dim],
-                                             depth_multiplier=2, stride=stride, padding='VALID', data_format='NCHW')
-        self.assertListEqual(
-            output.get_shape().as_list(), [batch, correct_output_filters,
-                                           (height - kernel_dim + 1) // stride, (width - kernel_dim + 1) // stride])
+        output = layers_lib.separable_conv2d(
+            images,
+            num_outputs=num_filters,
+            kernel_size=[kernel_dim, kernel_dim],
+            depth_multiplier=2,
+            stride=stride,
+            padding='VALID',
+            data_format='NCHW')
+        self.assertListEqual(output.get_shape().as_list(), [
+            batch, correct_output_filters, (height - kernel_dim + 1) // stride,
+            (width - kernel_dim + 1) // stride
+        ])
 
 
 class ScaleGradientTests(test.TestCase):
@@ -3355,6 +3440,33 @@ class ScaleGradientTests(test.TestCase):
       np.testing.assert_array_equal([3 * 2], g_x.eval())
 
 
+class SequenceToImagesTest(test.TestCase):
+
+  def testImagesToSequenceDims(self):
+    num_batches = 14
+    num_time_steps = 11
+    num_channels = 5
+    desired_height = 7
+    sequence = np.random.uniform(size=(num_time_steps,
+                                       num_batches,
+                                       num_channels)).astype(np.float32)
+    output = _layers.sequence_to_images(sequence, desired_height)
+    self.assertListEqual(output.get_shape().as_list(), [2, 7, 11, 5])
+
+  def testImagesToSequenceNCHW(self):
+    num_batches = 14
+    num_time_steps = 11
+    num_channels = 5
+    desired_height = 7
+    sequence = np.random.uniform(size=(num_time_steps,
+                                       num_batches,
+                                       num_channels)).astype(np.float32)
+    output = _layers.sequence_to_images(sequence,
+                                        desired_height,
+                                        output_data_format='channels_first')
+    self.assertListEqual(output.get_shape().as_list(), [2, 5, 7, 11])
+
+
 class SoftmaxTests(test.TestCase):
 
   def setUp(self):
@@ -3433,8 +3545,7 @@ class SpatialSoftmaxTests(test.TestCase):
       sess.run(variables_lib.global_variables_initializer())
       feed_dict = {features: np_features}
       keypoints = sess.run(spatial_softmax, feed_dict)
-      self.assertAllEqual(keypoints.shape,
-                          (batch_shape[0], batch_shape[3] * 2))
+      self.assertAllEqual(keypoints.shape, (batch_shape[0], batch_shape[3] * 2))
 
   def testSpatialSoftmaxShapeNCHW(self):
     batch_shape = (2, 2, 35, 35)
@@ -3445,8 +3556,7 @@ class SpatialSoftmaxTests(test.TestCase):
       sess.run(variables_lib.global_variables_initializer())
       feed_dict = {features: np_features}
       keypoints = sess.run(spatial_softmax, feed_dict)
-      self.assertAllEqual(keypoints.shape,
-                          (batch_shape[0], batch_shape[1] * 2))
+      self.assertAllEqual(keypoints.shape, (batch_shape[0], batch_shape[1] * 2))
 
   def testTwoMaxActivationsSameChannel(self):
     batch_size, height, width, nchannels = (2, 35, 35, 1)
@@ -3465,8 +3575,8 @@ class SpatialSoftmaxTests(test.TestCase):
     x_loc = [avg_x]
     y_loc = [avg_y]
 
-    np_keypoints = self._SpatialSoftmax(
-        x_loc, y_loc, height, width, batch_size, nchannels)
+    np_keypoints = self._SpatialSoftmax(x_loc, y_loc, height, width, batch_size,
+                                        nchannels)
 
     # Make sure expected location keypoints matches actual location keypoints.
     with self.test_session() as sess:
@@ -3484,13 +3594,13 @@ class SpatialSoftmaxTests(test.TestCase):
     spatial_softmax = _layers.spatial_softmax(features)
     np_features = np.zeros(batch_shape, dtype=np.float32)
 
-    edges = [(0, 0), (0, width-1), (height-1, 0), (height-1, width-1)]
+    edges = [(0, 0), (0, width - 1), (height - 1, 0), (height - 1, width - 1)]
     x_loc, y_loc = zip(*edges)
     for c in range(nchannels):
       np_features[:, x_loc[c], y_loc[c], c] = 100.
 
-    np_keypoints = self._SpatialSoftmax(
-        x_loc, y_loc, height, width, batch_size, nchannels)
+    np_keypoints = self._SpatialSoftmax(x_loc, y_loc, height, width, batch_size,
+                                        nchannels)
 
     # Make sure expected location keypoints matches actual location keypoints.
     with self.test_session() as sess:
@@ -3519,10 +3629,10 @@ class SpatialSoftmaxTests(test.TestCase):
       np_features1[:, x_loc[c], y_loc[c], c] = 100.
       np_features2[:, x_loc[c], y_loc[c], c] = 100.
 
-    np_keypoints1 = self._SpatialSoftmax(
-        x_loc, y_loc, height1, width1, batch_size, nchannels)
-    np_keypoints2 = self._SpatialSoftmax(
-        x_loc, y_loc, height2, width2, batch_size, nchannels)
+    np_keypoints1 = self._SpatialSoftmax(x_loc, y_loc, height1, width1,
+                                         batch_size, nchannels)
+    np_keypoints2 = self._SpatialSoftmax(x_loc, y_loc, height2, width2,
+                                         batch_size, nchannels)
 
     # Make sure expected location keypoints matches actual location keypoints.
     with self.test_session() as sess:
@@ -3548,8 +3658,8 @@ class SpatialSoftmaxTests(test.TestCase):
     for c in range(nchannels):
       np_features[:, x_loc[c], y_loc[c], c] = 100.
 
-    np_keypoints = self._SpatialSoftmax(
-        x_loc, y_loc, height, width, batch_size, nchannels)
+    np_keypoints = self._SpatialSoftmax(x_loc, y_loc, height, width, batch_size,
+                                        nchannels)
 
     # Make sure expected location keypoints matches actual location keypoints.
     with self.test_session() as sess:
@@ -3571,8 +3681,8 @@ class SpatialSoftmaxTests(test.TestCase):
     for c in range(nchannels):
       np_features[:, c, x_loc[c], y_loc[c]] = 100.
 
-    np_keypoints = self._SpatialSoftmax(
-        x_loc, y_loc, height, width, batch_size, nchannels)
+    np_keypoints = self._SpatialSoftmax(x_loc, y_loc, height, width, batch_size,
+                                        nchannels)
 
     # Make sure expected location keypoints matches actual location keypoints.
     with self.test_session() as sess:
@@ -3667,8 +3777,7 @@ class UnitNormTests(test.TestCase):
       image = random_ops.random_uniform((height, width, 3))
       output = _layers.unit_norm(image, dim=dim, epsilon=1e-6)
       norms = math_ops.sqrt(
-          math_ops.reduce_sum(
-              math_ops.square(output), reduction_indices=dim))
+          math_ops.reduce_sum(math_ops.square(output), reduction_indices=dim))
 
       shape = [height, width, 3]
       del shape[dim]
@@ -3704,8 +3813,7 @@ class UnitNormTests(test.TestCase):
       image = array_ops.placeholder(dtypes.float32, (None, None, 3))
       output = _layers.unit_norm(image, dim=dim, epsilon=1e-6)
       norms = math_ops.sqrt(
-          math_ops.reduce_sum(
-              math_ops.square(output), reduction_indices=dim))
+          math_ops.reduce_sum(math_ops.square(output), reduction_indices=dim))
 
       with self.test_session():
         actual = norms.eval({image: placeholder_value})
@@ -3769,8 +3877,8 @@ class PoincareNormalizeTest(test.TestCase):
       with self.test_session():
         x_tf = constant_op.constant(x_np, name='x')
         y_tf = _layers.poincare_normalize(x_tf, dim)
-        err = gradient_checker.compute_gradient_error(x_tf, x_shape,
-                                                      y_tf, x_shape)
+        err = gradient_checker.compute_gradient_error(x_tf, x_shape, y_tf,
+                                                      x_shape)
       print('PoinCareNormalize gradient err = %g ' % err)
       self.assertLess(err, 1e-4)
 
@@ -3782,14 +3890,9 @@ class LegacyFullyConnectedTest(test.TestCase):
     test.TestCase.setUp(self)
     random_seed.set_random_seed(1234)
     self.input = constant_op.constant([[1., 2., 3.], [-4., 15., -6.]])
-    self.input_3_dim_arr = [[[1., 1.1, 1.2],
-                             [2., 2.1, 2.2],
-                             [3., 3.1, 3.2],
-                             [4., 4.1, 4.2]],
-                            [[5., 5.1, 5.2],
-                             [6., 6.1, 6.2],
-                             [7., 7.1, 7.2],
-                             [8., 8.1, 8.2]]]
+    self.input_3_dim_arr = [[[1., 1.1, 1.2], [2., 2.1, 2.2], [3., 3.1, 3.2],
+                             [4., 4.1, 4.2]], [[5., 5.1, 5.2], [6., 6.1, 6.2],
+                                               [7., 7.1, 7.2], [8., 8.1, 8.2]]]
     self.input_3_dim = constant_op.constant(self.input_3_dim_arr)
 
     assert not ops.get_collection(ops.GraphKeys.SUMMARIES)
@@ -3884,15 +3987,10 @@ class LegacyFullyConnectedTest(test.TestCase):
     self._custom_initializers(self.input, 2, [[13.0, 13.0], [11.0, 11.0]])
 
   def test_custom_initializers_multi_dim(self):
-    self._custom_initializers(self.input_3_dim, 2,
-                              [[[7.6, 7.6],
-                                [13.6, 13.6],
-                                [19.6, 19.6],
-                                [25.6, 25.6]],
-                               [[31.6, 31.6],
-                                [37.6, 37.6],
-                                [43.6, 43.6],
-                                [49.6, 49.6]]])
+    self._custom_initializers(
+        self.input_3_dim, 2,
+        [[[7.6, 7.6], [13.6, 13.6], [19.6, 19.6], [25.6, 25.6]],
+         [[31.6, 31.6], [37.6, 37.6], [43.6, 43.6], [49.6, 49.6]]])
 
   def test_custom_collections(self):
     layers_lib.legacy_relu(
@@ -4002,12 +4100,16 @@ class LegacyFullyConnectedTest(test.TestCase):
     with self.test_session() as sess:
       variables_lib.global_variables_initializer().run()
       # we can feed in input with first dimension 2
-      shape_value = sess.run(array_ops.shape(y),
-                             feed_dict={x: self.input_3_dim_arr})
+      shape_value = sess.run(
+          array_ops.shape(y), feed_dict={
+              x: self.input_3_dim_arr
+          })
       self.assertAllClose(shape_value, [2, 4, 1])
       # we can feed in input with first dimension 1
-      shape_value = sess.run(array_ops.shape(y),
-                             feed_dict={x: [self.input_3_dim_arr[0]]})
+      shape_value = sess.run(
+          array_ops.shape(y), feed_dict={
+              x: [self.input_3_dim_arr[0]]
+          })
       self.assertAllClose(shape_value, [1, 4, 1])
       # we cannot feed in input with inconsistent dimensions
       with self.assertRaises(ValueError):
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib.py b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
index 31a1b38bd4832c5816136cab3297aa22e843b0f3..123275e1fde047cd3772528641b2e3b09742fbdc 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
@@ -34,12 +34,13 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.contrib.framework.python import ops as contrib_framework_ops
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops as framework_ops
+from tensorflow.python.layers import base
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import template
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
 __all__ = ["rev_block", "RevBlock", "recompute_grad"]
@@ -137,7 +138,17 @@ def _rev_block_forward(x1,
   return y1, y2
 
 
-class RevBlock(object):
+def _scope_wrap(fn, scope):
+
+  @functools.wraps(fn)
+  def wrap(*args, **kwargs):
+    with variable_scope.variable_scope(scope):
+      return fn(*args, **kwargs)
+
+  return wrap
+
+
+class RevBlock(base.Layer):
   """Block of reversible layers. See rev_block."""
 
   def __init__(self,
@@ -146,7 +157,10 @@ class RevBlock(object):
                num_layers=1,
                f_side_input=None,
                g_side_input=None,
-               use_efficient_backprop=True):
+               use_efficient_backprop=True,
+               name="revblock",
+               **kwargs):
+    super(RevBlock, self).__init__(name=name, **kwargs)
 
     if isinstance(f, list):
       assert len(f) == num_layers
@@ -158,18 +172,8 @@ class RevBlock(object):
     else:
       g = [g] * num_layers
 
-    scope_prefix = "revblock/revlayer_%d/"
-    f_scope = scope_prefix + "f"
-    g_scope = scope_prefix + "g"
-
-    f = [
-        template.make_template(f_scope % i, fn, create_scope_now_=True)
-        for i, fn in enumerate(f)
-    ]
-    g = [
-        template.make_template(g_scope % i, fn, create_scope_now_=True)
-        for i, fn in enumerate(g)
-    ]
+    f = [_scope_wrap(fn, "revlayer_%d/f" % i) for i, fn in enumerate(f)]
+    g = [_scope_wrap(fn, "revlayer_%d/g" % i) for i, fn in enumerate(g)]
 
     self.f = f
     self.g = g
@@ -180,6 +184,39 @@ class RevBlock(object):
 
     self._use_efficient_backprop = use_efficient_backprop
 
+  def call(self, inputs, forward=True):
+    vs = variable_scope.get_variable_scope()
+    vars_before = vs.global_variables()
+
+    if forward:
+      x1, x2 = inputs
+      out = self._forward(x1, x2)
+    else:
+      y1, y2 = inputs
+      out = self._backward(y1, y2)
+
+    # Add any created variables to the Layer's variable stores
+    new_vars = vs.global_variables()[len(vars_before):]
+    train_vars = vs.trainable_variables()
+    for new_var in new_vars:
+      if new_var in train_vars:
+        self._trainable_weights.append(new_var)
+      else:
+        self._non_trainable_weights.append(new_var)
+
+    return out
+
+  def forward(self, x1, x2):
+    return self.apply([x1, x2])
+
+  def backward(self, y1, y2):
+    return self.apply([y1, y2], forward=False)
+
+  def build(self, _):
+    logging.warn("RevBlock constructs its variables on first call, not on "
+                 "build.")
+    self.built = True
+
   def _efficient_grad_fn(self, inputs, variables, ys, grad_ys):
     """Custom gradient fn for a block of reversible residual layers."""
     side_inputs = inputs[2:]
@@ -228,17 +265,18 @@ class RevBlock(object):
     f.reverse()
     g.reverse()
 
-    for i in xrange(self.num_layers):
-      ys, grad_ys, f_ret, g_ret = _rev_layer_backward(
-          ys, grad_ys, f[i], g[i], f_vars[i], self.f_side_input, g_vars[i],
-          self.g_side_input)
+    with variable_scope.variable_scope(self.scope_name, reuse=True):
+      for i in xrange(self.num_layers):
+        ys, grad_ys, f_ret, g_ret = _rev_layer_backward(
+            ys, grad_ys, f[i], g[i], f_vars[i], self.f_side_input, g_vars[i],
+            self.g_side_input)
 
-      grad_f_vars, grad_f_side = f_ret
-      grad_g_vars, grad_g_side = g_ret
-      f_var_grads.append(grad_f_vars)
-      g_var_grads.append(grad_g_vars)
-      f_side_grads.append(grad_f_side)
-      g_side_grads.append(grad_g_side)
+        grad_f_vars, grad_f_side = f_ret
+        grad_g_vars, grad_g_side = g_ret
+        f_var_grads.append(grad_f_vars)
+        g_var_grads.append(grad_g_vars)
+        f_side_grads.append(grad_f_side)
+        g_side_grads.append(grad_g_side)
 
     # Accumulate layer gradients for f_side_input and g_side_input
     acc_f_side_grads = _acc_grads(*f_side_grads)
@@ -265,7 +303,7 @@ class RevBlock(object):
     grad_x1, grad_x2 = grad_ys
     return [grad_x1, grad_x2] + side_input_grads, variable_grads
 
-  def forward(self, x1, x2):
+  def _forward(self, x1, x2):
     """Run forward through the reversible layers."""
 
     side_inputs = [self.f_side_input, self.g_side_input]
@@ -275,7 +313,7 @@ class RevBlock(object):
         self._efficient_grad_fn if self._use_efficient_backprop else None)
 
     @_fn_with_custom_grad(custom_grad_fn)
-    def _forward(x1_, x2_, *flat_side_inputs):
+    def _forward_wrap(x1_, x2_, *flat_side_inputs):
       f_side, g_side = nest.pack_sequence_as(side_inputs, flat_side_inputs)
       return _rev_block_forward(
           x1_,
@@ -287,9 +325,9 @@ class RevBlock(object):
           g_side_input=g_side,
           gate_outputs=self._use_efficient_backprop)
 
-    return _forward(x1, x2, *flat_side_inputs)
+    return _forward_wrap(x1, x2, *flat_side_inputs)
 
-  def backward(self, y1, y2):
+  def _backward(self, y1, y2):
     """Run backward through the reversible layers."""
 
     f = list(self.f)
@@ -356,7 +394,14 @@ def rev_block(x1,
   Returns:
     y1, y2: tuple of float Tensors.
   """
-  block = RevBlock(f, g, num_layers, f_side_input, g_side_input, is_training)
+  block = RevBlock(
+      f=f,
+      g=g,
+      num_layers=num_layers,
+      f_side_input=f_side_input,
+      g_side_input=g_side_input,
+      use_efficient_backprop=is_training,
+      _reuse=variable_scope.get_variable_scope().reuse)
   return block.forward(x1, x2)
 
 
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
index a420753fd5728e7eef4f135d4943d25e8e05d5c2..cbcbcd75114a522b95631e4e7e95c1641b0a9987 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
@@ -188,13 +188,46 @@ class RevBlockTest(test.TestCase):
 
     def f(x):
       x = convolutional.conv1d(x, self.CHANNELS // 2, 3, padding="same")
-      x = core_layers.batch_normalization(x, training=True)
+      x = layers.batch_norm(x, is_training=True)
       x = convolutional.conv1d(x, self.CHANNELS // 2, 3, padding="same")
-      x = core_layers.batch_normalization(x, training=True)
+      x = layers.batch_norm(x, is_training=True)
       return x
 
     self._testRevBlock(x=x, f=f)
 
+  def testReuse(self):
+
+    def f(x):
+      return core_layers.dense(x, self.CHANNELS // 2)
+
+    def g(x):
+      return core_layers.dense(x, self.CHANNELS // 2)
+
+    x = random_ops.random_uniform(
+        [self.BATCH_SIZE, self.CHANNELS], dtype=dtypes.float32)
+    x1, x2 = array_ops.split(x, 2, axis=-1)
+
+    with variable_scope.variable_scope("test"):
+      y1, y2 = rev_block_lib.rev_block(x1, x2, f, g, num_layers=self.NUM_LAYERS)
+
+    num_vars_before = len(variables.global_variables())
+
+    with variable_scope.variable_scope("test", reuse=True):
+      y1, y2 = rev_block_lib.rev_block(x1, x2, f, g, num_layers=self.NUM_LAYERS)
+
+    num_vars_after = len(variables.global_variables())
+    self.assertEqual(num_vars_before, num_vars_after)
+
+    loss = math_ops.reduce_mean(y1 + y2)
+    _ = gradients_impl.gradients(loss,
+                                 [x] + variables.trainable_variables())
+
+    with variable_scope.variable_scope("test", reuse=True):
+      y1, y2 = rev_block_lib.rev_block(x1, x2, f, g, num_layers=self.NUM_LAYERS)
+
+    num_vars_after = len(variables.global_variables())
+    self.assertEqual(num_vars_before, num_vars_after)
+
 
 class RecomputeTest(test.TestCase):
 
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index 94920db574e07529c28313a78e0128676fcc7970..abf6e393bb0fbbce4e43f6d209e9b30517df36c3 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -10,7 +10,7 @@ package(default_visibility = [
     "//tensorflow:internal",
 ])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "py_test", "tf_py_test")
 
 py_library(
     name = "learn",
@@ -154,12 +154,11 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "experiment_test",
     size = "medium",
     srcs = ["python/learn/experiment_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":learn",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/core:protos_all_py",
@@ -173,6 +172,17 @@ py_test(
     ],
 )
 
+py_test(
+    name = "export_strategy_test",
+    size = "small",
+    srcs = ["python/learn/export_strategy_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":learn",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_test(
     name = "graph_actions_test",
     size = "small",
@@ -346,6 +356,7 @@ py_test(
     srcs = ["python/learn/estimators/dnn_linear_combined_test.py"],
     shard_count = 4,
     srcs_version = "PY2AND3",
+    tags = ["no_oss"],  # flaky b/70524820
     deps = [
         ":learn",
         "//tensorflow/contrib/layers:layers_py",
@@ -377,6 +388,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:lookup_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:session",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:variables",
@@ -461,6 +473,7 @@ py_test(
     size = "medium",
     srcs = ["python/learn/estimators/state_saving_rnn_estimator_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["noasan"],
     deps = [
         ":learn",
         "//tensorflow/contrib/layers:layers_py",
@@ -482,7 +495,7 @@ py_test(
     name = "linear_test",
     size = "medium",
     srcs = ["python/learn/estimators/linear_test.py"],
-    shard_count = 4,
+    shard_count = 20,
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [
@@ -715,12 +728,11 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "graph_io_test",
     size = "small",
     srcs = ["python/learn/learn_io/graph_io_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":learn",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
@@ -736,20 +748,7 @@ py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
-)
-
-py_test(
-    name = "numpy_io_test",
-    size = "small",
-    srcs = ["python/learn/learn_io/numpy_io_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":learn",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:training",
-        "//third_party/py/numpy",
-    ],
+    grpc_enabled = True,
 )
 
 py_test(
diff --git a/tensorflow/contrib/learn/python/learn/datasets/__init__.py b/tensorflow/contrib/learn/python/learn/datasets/__init__.py
index a3521b4109ab40d8478f20afc317cf5154da2b43..7240b0de149051afa045a8113f9e9b212840c311 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/__init__.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Dataset utilities and synthetic/reference datasets."""
 
 from __future__ import absolute_import
@@ -46,11 +45,12 @@ DATASETS = {
 
 # List of all synthetic datasets
 SYNTHETIC = {
-  # All of these will return ['data', 'target'] -> base.Dataset
-  'circles': synthetic.circles,
-  'spirals': synthetic.spirals
+    # All of these will return ['data', 'target'] -> base.Dataset
+    'circles': synthetic.circles,
+    'spirals': synthetic.spirals
 }
 
+
 def load_dataset(name, size='small', test_with_fake_data=False):
   """Loads dataset by name.
 
@@ -83,23 +83,28 @@ def make_dataset(name, n_samples=100, noise=None, seed=42, *args, **kwargs):
     seed: int or None, seed for noise
 
   Returns:
-    Shuffled features and labels for given synthetic dataset of type `base.Dataset`
+    Shuffled features and labels for given synthetic dataset of type
+    `base.Dataset`
 
   Raises:
     ValueError: Raised if `name` not found
 
   Note:
-    - This is a generic synthetic data generator - individual generators might have more parameters!
+    - This is a generic synthetic data generator - individual generators might
+    have more parameters!
       See documentation for individual parameters
-    - Note that the `noise` parameter uses `numpy.random.normal` and depends on `numpy`'s seed
+    - Note that the `noise` parameter uses `numpy.random.normal` and depends on
+    `numpy`'s seed
 
   TODO:
     - Support multiclass datasets
-    - Need shuffling routine. Currently synthetic datasets are reshuffled to avoid train/test correlation,
+    - Need shuffling routine. Currently synthetic datasets are reshuffled to
+    avoid train/test correlation,
       but that hurts reprodusability
   """
   # seed = kwargs.pop('seed', None)
   if name not in SYNTHETIC:
     raise ValueError('Synthetic dataset not found or not implemeted: %s' % name)
   else:
-    return SYNTHETIC[name](n_samples=n_samples, noise=noise, seed=seed, *args, **kwargs)
+    return SYNTHETIC[name](
+        n_samples=n_samples, noise=noise, seed=seed, *args, **kwargs)
diff --git a/tensorflow/contrib/learn/python/learn/datasets/base.py b/tensorflow/contrib/learn/python/learn/datasets/base.py
index 71978d439449e29c7cb907b18bab5d6659a972b6..ca720ae5ed26e74da12bd6c5a37231b41442f76f 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/base.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/base.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Base utilities for loading datasets."""
 
 from __future__ import absolute_import
@@ -24,13 +23,11 @@ import csv
 import os
 from os import path
 import random
-import tempfile
 import time
 
 import numpy as np
 from six.moves import urllib
 
-from tensorflow.contrib.framework import deprecated
 from tensorflow.python.platform import gfile
 
 Dataset = collections.namedtuple('Dataset', ['data', 'target'])
@@ -100,9 +97,7 @@ def load_iris(data_path=None):
     module_path = path.dirname(__file__)
     data_path = path.join(module_path, 'data', 'iris.csv')
   return load_csv_with_header(
-      data_path,
-      target_dtype=np.int,
-      features_dtype=np.float)
+      data_path, target_dtype=np.int, features_dtype=np.float)
 
 
 def load_boston(data_path=None):
@@ -118,16 +113,10 @@ def load_boston(data_path=None):
     module_path = path.dirname(__file__)
     data_path = path.join(module_path, 'data', 'boston_house_prices.csv')
   return load_csv_with_header(
-      data_path,
-      target_dtype=np.float,
-      features_dtype=np.float)
+      data_path, target_dtype=np.float, features_dtype=np.float)
 
 
-def retry(initial_delay,
-          max_delay,
-          factor=2.0,
-          jitter=0.25,
-          is_retriable=None):
+def retry(initial_delay, max_delay, factor=2.0, jitter=0.25, is_retriable=None):
   """Simple decorator for wrapping retriable functions.
 
   Args:
@@ -152,7 +141,7 @@ def retry(initial_delay,
   def delays():
     delay = initial_delay
     while delay <= max_delay:
-      yield delay * random.uniform(1 - jitter,  1 + jitter)
+      yield delay * random.uniform(1 - jitter, 1 + jitter)
       delay *= factor
 
   def wrap(fn):
@@ -172,7 +161,9 @@ def retry(initial_delay,
           else:
             raise
       return fn(*args, **kwargs)
+
     return wrapped_fn
+
   return wrap
 
 
diff --git a/tensorflow/contrib/learn/python/learn/datasets/mnist.py b/tensorflow/contrib/learn/python/learn/datasets/mnist.py
index 1f3295747e141760445b021bf4f59cc47b88b8b2..37f9175015a239f763c7721cf36ab8063c0a3e32 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/mnist.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/mnist.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Functions for downloading and reading MNIST data."""
 
 from __future__ import absolute_import
@@ -123,8 +122,8 @@ class DataSet(object):
     numpy.random.seed(seed1 if seed is None else seed2)
     dtype = dtypes.as_dtype(dtype).base_dtype
     if dtype not in (dtypes.uint8, dtypes.float32):
-      raise TypeError('Invalid image dtype %r, expected uint8 or float32' %
-                      dtype)
+      raise TypeError(
+          'Invalid image dtype %r, expected uint8 or float32' % dtype)
     if fake_data:
       self._num_examples = 10000
       self.one_hot = one_hot
@@ -202,7 +201,9 @@ class DataSet(object):
       end = self._index_in_epoch
       images_new_part = self._images[start:end]
       labels_new_part = self._labels[start:end]
-      return numpy.concatenate((images_rest_part, images_new_part), axis=0) , numpy.concatenate((labels_rest_part, labels_new_part), axis=0)
+      return numpy.concatenate(
+          (images_rest_part, images_new_part), axis=0), numpy.concatenate(
+              (labels_rest_part, labels_new_part), axis=0)
     else:
       self._index_in_epoch += batch_size
       end = self._index_in_epoch
@@ -257,16 +258,14 @@ def read_data_sets(train_dir,
     test_labels = extract_labels(f, one_hot=one_hot)
 
   if not 0 <= validation_size <= len(train_images):
-    raise ValueError(
-        'Validation size should be between 0 and {}. Received: {}.'
-        .format(len(train_images), validation_size))
+    raise ValueError('Validation size should be between 0 and {}. Received: {}.'
+                     .format(len(train_images), validation_size))
 
   validation_images = train_images[:validation_size]
   validation_labels = train_labels[:validation_size]
   train_images = train_images[validation_size:]
   train_labels = train_labels[validation_size:]
 
-
   options = dict(dtype=dtype, reshape=reshape, seed=seed)
 
   train = DataSet(train_images, train_labels, **options)
diff --git a/tensorflow/contrib/learn/python/learn/datasets/synthetic.py b/tensorflow/contrib/learn/python/learn/datasets/synthetic.py
index 907dc0f3dfced7e55c5f46711fbe93f6400e1de7..9a843168c27d9cae3f55efe4fe4c688d86c745f3 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/synthetic.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/synthetic.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Synthetic dataset generators."""
 
 from __future__ import absolute_import
@@ -23,18 +22,27 @@ import numpy as np
 
 from tensorflow.contrib.learn.python.learn.datasets.base import Dataset
 
-def circles(n_samples=100, noise=None, seed=None, factor=0.8, n_classes=2, *args, **kwargs):
+
+def circles(n_samples=100,
+            noise=None,
+            seed=None,
+            factor=0.8,
+            n_classes=2,
+            *args,
+            **kwargs):
   """Create circles separated by some value
 
   Args:
     n_samples: int, number of datapoints to generate
     noise: float or None, standard deviation of the Gaussian noise added
     seed: int or None, seed for the noise
-    factor: float, size factor of the inner circles with respect to the outer ones
+    factor: float, size factor of the inner circles with respect to the outer
+      ones
     n_classes: int, number of classes to generate
 
   Returns:
-    Shuffled features and labels for 'circles' synthetic dataset of type `base.Dataset`
+    Shuffled features and labels for 'circles' synthetic dataset of type
+    `base.Dataset`
 
   Note:
     The multi-class support might not work as expected if `noise` is enabled
@@ -54,7 +62,7 @@ def circles(n_samples=100, noise=None, seed=None, factor=0.8, n_classes=2, *args
   if seed is not None:
     np.random.seed(seed)
   # Algo: 1) Generate initial circle, 2) For ever class generate a smaller radius circle
-  linspace = np.linspace(0, 2*np.pi, n_samples // n_classes)
+  linspace = np.linspace(0, 2 * np.pi, n_samples // n_classes)
   circ_x = np.empty(0, dtype=np.int32)
   circ_y = np.empty(0, dtype=np.int32)
   base_cos = np.cos(linspace)
@@ -66,12 +74,12 @@ def circles(n_samples=100, noise=None, seed=None, factor=0.8, n_classes=2, *args
     circ_y = np.append(circ_y, base_sin)
     base_cos *= factor
     base_sin *= factor
-    y = np.append(y, label*np.ones(n_samples // n_classes, dtype=np.int32))
+    y = np.append(y, label * np.ones(n_samples // n_classes, dtype=np.int32))
 
   # Add more points if n_samples is not divisible by n_classes (unbalanced!)
   extras = n_samples % n_classes
-  circ_x = np.append(circ_x, np.cos(np.random.rand(extras)*2*np.pi))
-  circ_y = np.append(circ_y, np.sin(np.random.rand(extras)*2*np.pi))
+  circ_x = np.append(circ_x, np.cos(np.random.rand(extras) * 2 * np.pi))
+  circ_y = np.append(circ_y, np.sin(np.random.rand(extras) * 2 * np.pi))
   y = np.append(y, np.zeros(extras, dtype=np.int32))
 
   # Reshape the features/labels
@@ -85,10 +93,13 @@ def circles(n_samples=100, noise=None, seed=None, factor=0.8, n_classes=2, *args
   return Dataset(data=X[indices], target=y[indices])
 
 
-def spirals(n_samples=100, noise=None, seed=None,
-            mode = 'archimedes',
-            n_loops = 2,
-            *args, **kwargs):
+def spirals(n_samples=100,
+            noise=None,
+            seed=None,
+            mode='archimedes',
+            n_loops=2,
+            *args,
+            **kwargs):
   """Create spirals
 
   Currently only binary classification is supported for spiral generation
@@ -104,7 +115,8 @@ def spirals(n_samples=100, noise=None, seed=None,
       'fermat': a spiral with branch distances decreasing (sqrt)
 
   Returns:
-    Shuffled features and labels for 'spirals' synthetic dataset of type `base.Dataset`
+    Shuffled features and labels for 'spirals' synthetic dataset of type
+    `base.Dataset`
 
   Raises:
     ValueError: If the generation `mode` is not valid
@@ -112,34 +124,35 @@ def spirals(n_samples=100, noise=None, seed=None,
   TODO:
     - Generation of unbalanced data
   """
-  n_classes = 2 # I am not sure how to make it multiclass
+  n_classes = 2  # I am not sure how to make it multiclass
 
   _modes = {
-    'archimedes': _archimedes_spiral,
-    'bernoulli': _bernoulli_spiral,
-    'fermat': _fermat_spiral
+      'archimedes': _archimedes_spiral,
+      'bernoulli': _bernoulli_spiral,
+      'fermat': _fermat_spiral
   }
 
   if mode is None or mode not in _modes:
-    raise ValueError("Cannot generate spiral with mode %s"%mode)
+    raise ValueError('Cannot generate spiral with mode %s' % mode)
 
   if seed is not None:
     np.random.seed(seed)
-  linspace = np.linspace(0, 2*n_loops*np.pi, n_samples // n_classes)
+  linspace = np.linspace(0, 2 * n_loops * np.pi, n_samples // n_classes)
   spir_x = np.empty(0, dtype=np.int32)
   spir_y = np.empty(0, dtype=np.int32)
 
   y = np.empty(0, dtype=np.int32)
   for label in range(n_classes):
-    base_cos, base_sin = _modes[mode](linspace, label*np.pi, *args, **kwargs)
+    base_cos, base_sin = _modes[mode](linspace, label * np.pi, *args, **kwargs)
     spir_x = np.append(spir_x, base_cos)
     spir_y = np.append(spir_y, base_sin)
-    y = np.append(y, label*np.ones(n_samples // n_classes, dtype=np.int32))
+    y = np.append(y, label * np.ones(n_samples // n_classes, dtype=np.int32))
 
   # Add more points if n_samples is not divisible by n_classes (unbalanced!)
   extras = n_samples % n_classes
   if extras > 0:
-    x_exrta, y_extra = _modes[mode](np.random.rand(extras)*2*np.pi, *args, **kwargs)
+    x_extra, y_extra = _modes[mode](np.random.rand(extras) * 2 * np.pi, *args,
+                                    **kwargs)
     spir_x = np.append(spir_x, x_extra)
     spir_y = np.append(spir_y, y_extra)
     y = np.append(y, np.zeros(extras, dtype=np.int32))
@@ -162,7 +175,8 @@ def _archimedes_spiral(theta, theta_offset=0., *args, **kwargs):
     theta: array-like, angles from polar coordinates to be converted
     theta_offset: float, angle offset in radians (2*pi = 0)
   """
-  x, y = theta*np.cos(theta + theta_offset), theta*np.sin(theta + theta_offset)
+  x, y = theta * np.cos(theta + theta_offset), theta * np.sin(
+      theta + theta_offset)
   x_norm = np.max(np.abs(x))
   y_norm = np.max(np.abs(y))
   x, y = x / x_norm, y / y_norm
@@ -181,7 +195,8 @@ def _bernoulli_spiral(theta, theta_offset=0., *args, **kwargs):
   """
   exp_scale = kwargs.pop('exp_scale', 0.1)
 
-  x, y = np.exp(exp_scale*theta)*np.cos(theta + theta_offset), np.exp(exp_scale*theta)*np.sin(theta + theta_offset)
+  x, y = np.exp(exp_scale * theta) * np.cos(theta + theta_offset), np.exp(
+      exp_scale * theta) * np.sin(theta + theta_offset)
   x_norm = np.max(np.abs(x))
   y_norm = np.max(np.abs(y))
   x, y = x / x_norm, y / y_norm
@@ -195,7 +210,8 @@ def _fermat_spiral(theta, theta_offset=0., *args, **kwargs):
     theta: array-like, angles from polar coordinates to be converted
     theta_offset: float, angle offset in radians (2*pi = 0)
   """
-  x, y = np.sqrt(theta)*np.cos(theta + theta_offset), np.sqrt(theta)*np.sin(theta + theta_offset)
+  x, y = np.sqrt(theta) * np.cos(theta + theta_offset), np.sqrt(theta) * np.sin(
+      theta + theta_offset)
   x_norm = np.max(np.abs(x))
   y_norm = np.max(np.abs(y))
   x, y = x / x_norm, y / y_norm
diff --git a/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py b/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py
index 5340afab46eba957d6d612bb583983b627537547..5809995c8c7d8e72eb47ee88a72547bae7fd3594 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/synthetic_test.py
@@ -24,12 +24,14 @@ from tensorflow.python.platform import test
 from tensorflow.contrib.learn.python.learn import datasets
 from tensorflow.contrib.learn.python.learn.datasets import synthetic
 
+
 class SyntheticTest(test.TestCase):
   """Test synthetic dataset generation"""
 
   def test_make_dataset(self):
     """Test if the synthetic routine wrapper complains about the name"""
-    self.assertRaises(ValueError, datasets.make_dataset, name='_non_existing_name')
+    self.assertRaises(
+        ValueError, datasets.make_dataset, name='_non_existing_name')
 
   def test_all_datasets_callable(self):
     """Test if all methods inside the `SYNTHETIC` are callable"""
@@ -52,9 +54,10 @@ class SyntheticTest(test.TestCase):
     """
     n_samples = 100
     n_classes = 2
-    circ = synthetic.circles(n_samples = n_samples, noise = None, n_classes = n_classes)
+    circ = synthetic.circles(
+        n_samples=n_samples, noise=None, n_classes=n_classes)
     self.assertIsInstance(circ, datasets.base.Dataset)
-    self.assertTupleEqual(circ.data.shape, (n_samples,2))
+    self.assertTupleEqual(circ.data.shape, (n_samples, 2))
     self.assertTupleEqual(circ.target.shape, (n_samples,))
     self.assertSetEqual(set(circ.target), set(range(n_classes)))
 
@@ -67,17 +70,24 @@ class SyntheticTest(test.TestCase):
     """
     seed = 42
     noise = 0.1
-    circ0 = synthetic.circles(n_samples = 100, noise = noise, n_classes = 2, seed = seed)
-    circ1 = synthetic.circles(n_samples = 100, noise = noise, n_classes = 2, seed = seed)
+    circ0 = synthetic.circles(
+        n_samples=100, noise=noise, n_classes=2, seed=seed)
+    circ1 = synthetic.circles(
+        n_samples=100, noise=noise, n_classes=2, seed=seed)
     np.testing.assert_array_equal(circ0.data, circ1.data)
     np.testing.assert_array_equal(circ0.target, circ1.target)
 
-    circ1 = synthetic.circles(n_samples = 100, noise = noise, n_classes = 2, seed = seed+1)
-    self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.data, circ1.data)
-    self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.target, circ1.target)
+    circ1 = synthetic.circles(
+        n_samples=100, noise=noise, n_classes=2, seed=seed + 1)
+    self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.data,
+                      circ1.data)
+    self.assertRaises(AssertionError, np.testing.assert_array_equal,
+                      circ0.target, circ1.target)
 
-    circ1 = synthetic.circles(n_samples = 100, noise = noise/2., n_classes = 2, seed = seed)
-    self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.data, circ1.data)
+    circ1 = synthetic.circles(
+        n_samples=100, noise=noise / 2., n_classes=2, seed=seed)
+    self.assertRaises(AssertionError, np.testing.assert_array_equal, circ0.data,
+                      circ1.data)
 
   def test_spirals(self):
     """Test if the circles are generated correctly
@@ -89,13 +99,14 @@ class SyntheticTest(test.TestCase):
       - returned `target` shape is (n_samples,)
       - set of unique classes range is [0, n_classes)
     """
-    self.assertRaises(ValueError, synthetic.spirals, mode='_unknown_mode_spiral_')
+    self.assertRaises(
+        ValueError, synthetic.spirals, mode='_unknown_mode_spiral_')
     n_samples = 100
     modes = ('archimedes', 'bernoulli', 'fermat')
     for mode in modes:
-      spir = synthetic.spirals(n_samples = n_samples, noise = None, mode = mode)
+      spir = synthetic.spirals(n_samples=n_samples, noise=None, mode=mode)
       self.assertIsInstance(spir, datasets.base.Dataset)
-      self.assertTupleEqual(spir.data.shape, (n_samples,2))
+      self.assertTupleEqual(spir.data.shape, (n_samples, 2))
       self.assertTupleEqual(spir.target.shape, (n_samples,))
       self.assertSetEqual(set(spir.target), set(range(2)))
 
@@ -110,18 +121,24 @@ class SyntheticTest(test.TestCase):
     noise = 0.1
     modes = ('archimedes', 'bernoulli', 'fermat')
     for mode in modes:
-      spir0 = synthetic.spirals(n_samples = 1000, noise = noise, seed = seed)
-      spir1 = synthetic.spirals(n_samples = 1000, noise = noise, seed = seed)
+      spir0 = synthetic.spirals(n_samples=1000, noise=noise, seed=seed)
+      spir1 = synthetic.spirals(n_samples=1000, noise=noise, seed=seed)
       np.testing.assert_array_equal(spir0.data, spir1.data)
       np.testing.assert_array_equal(spir0.target, spir1.target)
 
-      spir1 = synthetic.spirals(n_samples = 1000, noise = noise, seed = seed+1)
-      self.assertRaises(AssertionError, np.testing.assert_array_equal, spir0.data, spir1.data)
-      self.assertRaises(AssertionError, np.testing.assert_array_equal, spir0.target, spir1.target)
+      spir1 = synthetic.spirals(n_samples=1000, noise=noise, seed=seed + 1)
+      self.assertRaises(AssertionError, np.testing.assert_array_equal,
+                        spir0.data, spir1.data)
+      self.assertRaises(AssertionError, np.testing.assert_array_equal,
+                        spir0.target, spir1.target)
+
+      spir1 = synthetic.spirals(n_samples=1000, noise=noise / 2., seed=seed)
+      self.assertRaises(AssertionError, np.testing.assert_array_equal,
+                        spir0.data, spir1.data)
 
-      spir1 = synthetic.spirals(n_samples = 1000, noise = noise/2., seed = seed)
-      self.assertRaises(AssertionError, np.testing.assert_array_equal, spir0.data, spir1.data)
+  def test_spirals_synthetic(self):
+    synthetic.spirals(3)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py b/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py
index 14750961efa30128708430fac038498de0a42118..ef5e620e8f08cffa7c2b945089aa5d150baefefc 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.learn.python.learn.datasets import base
 from tensorflow.contrib.learn.python.learn.estimators import composable_model
@@ -55,7 +55,7 @@ def _base_model_fn(features, labels, mode, params):
     raise NotImplementedError
 
   def _train_op_fn(loss):
-    global_step = contrib_variables.get_global_step()
+    global_step = training_util.get_global_step()
     assert global_step
     train_step = model.get_train_step(loss)
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/debug_test.py b/tensorflow/contrib/learn/python/learn/estimators/debug_test.py
index 6b125534a42c5cdde69773d99cefd6e7b2d60c9c..b968aeed1b7a11d522b531783f04f0104b37904f 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/debug_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/debug_test.py
@@ -44,7 +44,6 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import input as input_lib
 
-
 NUM_EXAMPLES = 100
 N_CLASSES = 5  #  Cardinality of multiclass labels.
 LABEL_DIMENSION = 3  #  Dimensionality of regression labels.
@@ -52,8 +51,10 @@ LABEL_DIMENSION = 3  #  Dimensionality of regression labels.
 
 def _train_test_split(features_and_labels):
   features, labels = features_and_labels
-  train_set = (features[:int(len(features) / 2)], labels[:int(len(features) / 2)])
-  test_set = (features[int(len(features) / 2):], labels[int(len(features) / 2):])
+  train_set = (features[:int(len(features) / 2)],
+               labels[:int(len(features) / 2)])
+  test_set = (features[int(len(features) / 2):],
+              labels[int(len(features) / 2):])
   return train_set, test_set
 
 
@@ -86,17 +87,17 @@ class DebugClassifierTest(test.TestCase):
     (train_features, train_labels), (test_features,
                                      test_labels) = _train_test_split(
                                          [self.features, self.labels])
-    majority_class, _ = max(collections.Counter(train_labels).items(),
-                            key=operator.itemgetter(1))
+    majority_class, _ = max(
+        collections.Counter(train_labels).items(), key=operator.itemgetter(1))
     expected_prediction = np.vstack(
         [[majority_class] for _ in range(test_labels.shape[0])])
 
     classifier = debug.DebugClassifier(n_classes=N_CLASSES)
-    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
-                   steps=50)
+    classifier.fit(
+        input_fn=_input_fn_builder(train_features, train_labels), steps=50)
 
-    pred = classifier.predict_classes(input_fn=_input_fn_builder(test_features,
-                                                                 None))
+    pred = classifier.predict_classes(
+        input_fn=_input_fn_builder(test_features, None))
     self.assertAllEqual(expected_prediction, np.vstack(pred))
 
   def testPredictBinary(self):
@@ -105,34 +106,34 @@ class DebugClassifierTest(test.TestCase):
                                      test_labels) = _train_test_split(
                                          [self.features, self.binary_labels])
 
-    majority_class, _ = max(collections.Counter(train_labels).items(),
-                            key=operator.itemgetter(1))
+    majority_class, _ = max(
+        collections.Counter(train_labels).items(), key=operator.itemgetter(1))
     expected_prediction = np.vstack(
         [[majority_class] for _ in range(test_labels.shape[0])])
 
     classifier = debug.DebugClassifier(n_classes=2)
-    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
-                   steps=50)
+    classifier.fit(
+        input_fn=_input_fn_builder(train_features, train_labels), steps=50)
 
-    pred = classifier.predict_classes(input_fn=_input_fn_builder(test_features,
-                                                                 None))
+    pred = classifier.predict_classes(
+        input_fn=_input_fn_builder(test_features, None))
     self.assertAllEqual(expected_prediction, np.vstack(pred))
 
-    (train_features, train_labels), (
-        test_features, test_labels) = _train_test_split(
-            [self.features, self.binary_float_labels])
+    (train_features,
+     train_labels), (test_features, test_labels) = _train_test_split(
+         [self.features, self.binary_float_labels])
 
-    majority_class, _ = max(collections.Counter(train_labels).items(),
-                            key=operator.itemgetter(1))
+    majority_class, _ = max(
+        collections.Counter(train_labels).items(), key=operator.itemgetter(1))
     expected_prediction = np.vstack(
         [[majority_class] for _ in range(test_labels.shape[0])])
 
     classifier = debug.DebugClassifier(n_classes=2)
-    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
-                   steps=50)
+    classifier.fit(
+        input_fn=_input_fn_builder(train_features, train_labels), steps=50)
 
-    pred = classifier.predict_classes(input_fn=_input_fn_builder(test_features,
-                                                                 None))
+    pred = classifier.predict_classes(
+        input_fn=_input_fn_builder(test_features, None))
     self.assertAllEqual(expected_prediction, np.vstack(pred))
 
   def testPredictProba(self):
@@ -150,8 +151,8 @@ class DebugClassifierTest(test.TestCase):
         [class_distribution for _ in range(test_labels.shape[0])])
 
     classifier = debug.DebugClassifier(n_classes=N_CLASSES)
-    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
-                   steps=50)
+    classifier.fit(
+        input_fn=_input_fn_builder(train_features, train_labels), steps=50)
 
     pred = classifier.predict_proba(
         input_fn=_input_fn_builder(test_features, None))
@@ -173,17 +174,17 @@ class DebugClassifierTest(test.TestCase):
         [class_distribution for _ in range(test_labels.shape[0])])
 
     classifier = debug.DebugClassifier(n_classes=2)
-    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
-                   steps=50)
+    classifier.fit(
+        input_fn=_input_fn_builder(train_features, train_labels), steps=50)
 
     pred = classifier.predict_proba(
         input_fn=_input_fn_builder(test_features, None))
 
     self.assertAllClose(expected_prediction, np.vstack(pred), atol=0.1)
 
-    (train_features, train_labels), (
-        test_features, test_labels) = _train_test_split(
-            [self.features, self.binary_float_labels])
+    (train_features,
+     train_labels), (test_features, test_labels) = _train_test_split(
+         [self.features, self.binary_float_labels])
 
     class_distribution = np.zeros((1, 2))
     for label in train_labels:
@@ -194,8 +195,8 @@ class DebugClassifierTest(test.TestCase):
         [class_distribution for _ in range(test_labels.shape[0])])
 
     classifier = debug.DebugClassifier(n_classes=2)
-    classifier.fit(input_fn=_input_fn_builder(train_features, train_labels),
-                   steps=50)
+    classifier.fit(
+        input_fn=_input_fn_builder(train_features, train_labels), steps=50)
 
     pred = classifier.predict_proba(
         input_fn=_input_fn_builder(test_features, None))
@@ -232,13 +233,12 @@ class DebugClassifierTest(test.TestCase):
     def _input_fn():
       iris = test_data.prepare_iris_data_for_logistic_regression()
       return {
-          'feature': constant_op.constant(
-              iris.data, dtype=dtypes.float32)
+          'feature': constant_op.constant(iris.data, dtype=dtypes.float32)
       }, constant_op.constant(
           iris.target, shape=[100], dtype=dtypes.int32)
 
-    classifier = debug.DebugClassifier(config=run_config.RunConfig(
-        tf_random_seed=1))
+    classifier = debug.DebugClassifier(
+        config=run_config.RunConfig(tf_random_seed=1))
     classifier.fit(input_fn=_input_fn, steps=5)
     scores = classifier.evaluate(input_fn=_input_fn, steps=1)
     self.assertIn('loss', scores)
@@ -342,8 +342,7 @@ class DebugClassifierTest(test.TestCase):
     def _input_fn():
       iris = base.load_iris()
       return {
-          'feature': constant_op.constant(
-              iris.data, dtype=dtypes.float32)
+          'feature': constant_op.constant(iris.data, dtype=dtypes.float32)
       }, constant_op.constant(
           iris.target, shape=[150], dtype=dtypes.int32)
 
@@ -387,7 +386,9 @@ class DebugClassifierTest(test.TestCase):
       # Create 4 rows, one of them (y = x), three of them (y=Not(x))
       # The logistic prediction should be (y = 0.25).
       labels = constant_op.constant([[1], [0], [0], [0]])
-      features = {'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),}
+      features = {
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
+      }
       return features, labels
 
     classifier = debug.DebugClassifier(n_classes=2)
@@ -404,8 +405,7 @@ class DebugClassifierTest(test.TestCase):
       # The logistic prediction should be (y = 0.25).
       labels = constant_op.constant([[1.], [0.], [0.], [0.]])
       features = {
-          'x': array_ops.ones(
-              shape=[4, 1], dtype=dtypes.float32),
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
           'w': constant_op.constant([[1.], [1.], [1.], [1.]])
       }
       return features, labels
@@ -414,8 +414,7 @@ class DebugClassifierTest(test.TestCase):
       # 4 rows, with different weights.
       labels = constant_op.constant([[1.], [0.], [0.], [0.]])
       features = {
-          'x': array_ops.ones(
-              shape=[4, 1], dtype=dtypes.float32),
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
           'w': constant_op.constant([[7.], [1.], [1.], [1.]])
       }
       return features, labels
@@ -438,8 +437,7 @@ class DebugClassifierTest(test.TestCase):
       # than (y=Not(x)) due to the relative higher weight of the first row.
       labels = constant_op.constant([[1], [0], [0], [0]])
       features = {
-          'x': array_ops.ones(
-              shape=[4, 1], dtype=dtypes.float32),
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
           'w': constant_op.constant([[100.], [3.], [2.], [2.]])
       }
       return features, labels
@@ -448,8 +446,7 @@ class DebugClassifierTest(test.TestCase):
       # Create 4 rows (y = x)
       labels = constant_op.constant([[1], [1], [1], [1]])
       features = {
-          'x': array_ops.ones(
-              shape=[4, 1], dtype=dtypes.float32),
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
           'w': constant_op.constant([[1.], [1.], [1.], [1.]])
       }
       return features, labels
@@ -469,8 +466,7 @@ class DebugClassifierTest(test.TestCase):
       features = {
           'x':
               input_lib.limit_epochs(
-                  array_ops.ones(
-                      shape=[4, 1], dtype=dtypes.float32),
+                  array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
                   num_epochs=num_epochs),
       }
       return features, labels
@@ -578,12 +574,11 @@ class DebugClassifierTest(test.TestCase):
     language = feature_column.sparse_column_with_hash_bucket('language', 100)
     feature_columns = [
         feature_column.real_valued_column('age'),
-        feature_column.embedding_column(
-            language, dimension=1)
+        feature_column.embedding_column(language, dimension=1)
     ]
 
-    classifier = debug.DebugClassifier(config=run_config.RunConfig(
-        tf_random_seed=1))
+    classifier = debug.DebugClassifier(
+        config=run_config.RunConfig(tf_random_seed=1))
     classifier.fit(input_fn=input_fn, steps=5)
 
     def default_input_fn(unused_estimator, examples):
@@ -614,8 +609,8 @@ class DebugRegressorTest(test.TestCase):
     classifier.fit(
         input_fn=_input_fn_builder(train_features, train_labels), steps=50)
 
-    pred = classifier.predict_scores(input_fn=_input_fn_builder(test_features,
-                                                                None))
+    pred = classifier.predict_scores(
+        input_fn=_input_fn_builder(test_features, None))
     self.assertAllClose(expected_prediction, np.vstack(pred), atol=0.1)
 
   def testExperimentIntegration(self):
@@ -698,7 +693,9 @@ class DebugRegressorTest(test.TestCase):
       # Create 4 rows, one of them (y = x), three of them (y=Not(x))
       # The algorithm should learn (y = 0.25).
       labels = constant_op.constant([[1.], [0.], [0.], [0.]])
-      features = {'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),}
+      features = {
+          'x': array_ops.ones(shape=[4, 1], dtype=dtypes.float32),
+      }
       return features, labels
 
     regressor = debug.DebugRegressor(
@@ -853,5 +850,6 @@ class DebugRegressorTest(test.TestCase):
     predictions2 = list(regressor2.predict_scores(input_fn=predict_input_fn))
     self.assertAllClose(predictions, predictions2)
 
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn.py b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
index cb15ef23e95d27c737d8ae08065b804bafd39a07..c17b41c0f767e19d9c3635a8f60347a49b297cfb 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
@@ -23,7 +23,7 @@ import six
 from tensorflow.contrib import layers
 from tensorflow.contrib.framework import deprecated
 from tensorflow.contrib.framework import deprecated_arg_values
-from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn import metric_spec
@@ -189,7 +189,7 @@ def _dnn_model_fn(features, labels, mode, params, config=None):
       """Returns the op to optimize the loss."""
       return optimizers.optimize_loss(
           loss=loss,
-          global_step=contrib_variables.get_global_step(),
+          global_step=training_util.get_global_step(),
           learning_rate=_LEARNING_RATE,
           optimizer=_get_optimizer(optimizer),
           gradient_multipliers=(
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
index 57e70e169ca9d6fb2adc4e50bf387cc7cf330aed..4e65c180d8bee9ab8fe9b1fbf32edc229c31af09 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
@@ -1046,11 +1046,14 @@ class DNNLinearCombinedClassifierTest(test.TestCase):
 
     if global_step == 100:
       # Expected is 100, but because of the global step increment bug, is 50.
-      self.assertEqual(50, step_counter.steps)
+      # Occasionally, step increments one more time due to a race condition,
+      # reaching 51 steps.
+      self.assertIn(step_counter.steps, [50, 51])
     else:
-      # Occasionally, training stops when global_step == 101, due to a race
-      # condition.
-      self.assertEqual(51, step_counter.steps)
+      # Occasionally, training stops when global_step == 102, due to a race
+      # condition. In addition, occasionally step increments one more time due
+      # to a race condition reaching 52 steps.
+      self.assertIn(step_counter.steps, [51, 52])
 
   def testGlobalStepDNNLinearCombinedBugFixed(self):
     """Tests global step update for dnn-linear combined model."""
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
index 12f9bba531a296a00d17956b8ce32e5d7dead380..2bd57597c2e9444b51b1dacfbe4180b443c95a3d 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
@@ -1224,7 +1224,7 @@ class DNNRegressorTest(test.TestCase):
       self, predictions, expected_shape):
     predictions_nparray = np.array(predictions)
     self.assertAllEqual(expected_shape, predictions_nparray.shape)
-    self.assertTrue(np.issubdtype(predictions_nparray.dtype, np.float))
+    self.assertTrue(np.issubdtype(predictions_nparray.dtype, np.floating))
 
   def testPredict_AsIterableFalse(self):
     """Tests predict method with as_iterable=False."""
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index 788d2d0b1a58fad16712c968593b40de0d3979f0..4b63e08ab3372849309ee5d28d754de82e9632f4 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Base Estimator class."""
 
 from __future__ import absolute_import
@@ -30,7 +29,6 @@ import six
 
 from google.protobuf import message
 from tensorflow.contrib import layers
-from tensorflow.contrib import metrics as metrics_lib
 from tensorflow.contrib.framework import deprecated
 from tensorflow.contrib.framework import deprecated_args
 from tensorflow.contrib.framework import list_variables
@@ -60,6 +58,7 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import metrics as metrics_lib
 from tensorflow.python.ops import resources
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
@@ -76,7 +75,6 @@ from tensorflow.python.util import compat
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 
-
 AS_ITERABLE_DATE = '2016-09-15'
 AS_ITERABLE_INSTRUCTIONS = (
     'The default behavior of predict() is changing. The default value for\n'
@@ -213,7 +211,7 @@ def _get_replica_device_setter(config):
       'Variable', 'VariableV2', 'AutoReloadVariable', 'MutableHashTable',
       'MutableHashTableV2', 'MutableHashTableOfTensors',
       'MutableHashTableOfTensorsV2', 'MutableDenseHashTable',
-      'MutableDenseHashTableV2'
+      'MutableDenseHashTableV2', 'VarHandleOp'
   ]
 
   if config.task_type:
@@ -223,8 +221,11 @@ def _get_replica_device_setter(config):
 
   if config.num_ps_replicas > 0:
     return device_setter.replica_device_setter(
-        ps_tasks=config.num_ps_replicas, worker_device=worker_device,
-        merge_devices=True, ps_ops=ps_ops, cluster=config.cluster_spec)
+        ps_tasks=config.num_ps_replicas,
+        worker_device=worker_device,
+        merge_devices=True,
+        ps_ops=ps_ops,
+        cluster=config.cluster_spec)
   else:
     return None
 
@@ -284,10 +285,10 @@ def _make_metrics_ops(metrics, features, labels, predictions):
         raise ValueError('Invalid metric for {}. It returned a tuple with '
                          'len {}, expected 2.'.format(name, len(name)))
       if not isinstance(predictions, dict):
-        raise ValueError(
-            'Metrics passed provide (name, prediction), '
-            'but predictions are not dict. '
-            'Metrics: %s, Predictions: %s.' % (metrics, predictions))
+        raise ValueError('Metrics passed provide (name, prediction), '
+                         'but predictions are not dict. '
+                         'Metrics: %s, Predictions: %s.' % (metrics,
+                                                            predictions))
       # Here are two options: labels are single Tensor or a dict.
       if isinstance(labels, dict) and name[1] in labels:
         # If labels are dict and the prediction name is in it, apply metric.
@@ -298,10 +299,10 @@ def _make_metrics_ops(metrics, features, labels, predictions):
     else:
       # Single head metrics.
       if isinstance(predictions, dict):
-        raise ValueError(
-            'Metrics passed provide only name, no prediction, '
-            'but predictions are dict. '
-            'Metrics: %s, Labels: %s.' % (metrics, labels_tensor_or_dict))
+        raise ValueError('Metrics passed provide only name, no prediction, '
+                         'but predictions are dict. '
+                         'Metrics: %s, Labels: %s.' % (metrics,
+                                                       labels_tensor_or_dict))
       result[name] = metric(predictions, labels_tensor_or_dict)
   return result
 
@@ -360,10 +361,22 @@ def _write_dict_to_summary(output_dir, dictionary, current_global_step):
         logging.warn('Skipping summary for %s, cannot parse string to Summary.',
                      key)
         continue
+    elif isinstance(dictionary[key], np.ndarray):
+      value = summary_proto.value.add()
+      value.tag = key
+      value.node_name = key
+      tensor_proto = tensor_util.make_tensor_proto(dictionary[key])
+      value.tensor.CopyFrom(tensor_proto)
+      logging.info(
+          'Summary for np.ndarray is not visible in Tensorboard by default. '
+          'Consider using a Tensorboard plugin for visualization (see '
+          'https://github.com/tensorflow/tensorboard-plugin-example/blob/master/README.md'
+          ' for more information).')
     else:
       logging.warn(
           'Skipping summary for %s, must be a float, np.float32, np.int64, '
-          'np.int32 or int or a serialized string of Summary.', key)
+          'np.int32 or int or np.ndarray or a serialized string of Summary.',
+          key)
   summary_writer.add_summary(summary_proto, current_global_step)
   summary_writer.flush()
 
@@ -372,8 +385,8 @@ GraphRewriteSpec = collections.namedtuple('GraphRewriteSpec',
                                           ['tags', 'transforms'])
 
 
-class BaseEstimator(
-    sklearn.BaseEstimator, evaluable.Evaluable, trainable.Trainable):
+class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable,
+                    trainable.Trainable):
   """Abstract BaseEstimator class to train and evaluate TensorFlow models.
 
   Users should not instantiate or subclass this class. Instead, use an
@@ -415,7 +428,7 @@ class BaseEstimator(
         #                  necessary.
         # pylint: disable=g-doc-exception
         raise ValueError(
-            "model_dir are set both in constructor and RunConfig, but with "
+            'model_dir are set both in constructor and RunConfig, but with '
             "different values. In constructor: '{}', in RunConfig: "
             "'{}' ".format(model_dir, self._config.model_dir))
         # pylint: enable=g-doc-exception
@@ -444,12 +457,16 @@ class BaseEstimator(
     # TODO(wicke): make RunConfig immutable, and then return it without a copy.
     return copy.deepcopy(self._config)
 
-  @deprecated_args(
-      SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS, ('x', None),
-      ('y', None), ('batch_size', None)
-  )
-  def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None,
-          monitors=None, max_steps=None):
+  @deprecated_args(SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS,
+                   ('x', None), ('y', None), ('batch_size', None))
+  def fit(self,
+          x=None,
+          y=None,
+          input_fn=None,
+          steps=None,
+          batch_size=None,
+          monitors=None,
+          max_steps=None):
     # pylint: disable=g-doc-args,g-doc-return-or-yield
     """See `Trainable`.
 
@@ -481,13 +498,15 @@ class BaseEstimator(
     logging.info('Loss for final step: %s.', loss)
     return self
 
-  @deprecated_args(
-      SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS, ('x', None),
-      ('y', None), ('batch_size', None)
-  )
-  def partial_fit(
-      self, x=None, y=None, input_fn=None, steps=1, batch_size=None,
-      monitors=None):
+  @deprecated_args(SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS,
+                   ('x', None), ('y', None), ('batch_size', None))
+  def partial_fit(self,
+                  x=None,
+                  y=None,
+                  input_fn=None,
+                  steps=1,
+                  batch_size=None,
+                  monitors=None):
     """Incremental fit on a batch of samples.
 
     This method is expected to be called several times consecutively
@@ -523,13 +542,16 @@ class BaseEstimator(
     """
     logging.warning('The current implementation of partial_fit is not optimized'
                     ' for use in a loop. Consider using fit() instead.')
-    return self.fit(x=x, y=y, input_fn=input_fn, steps=steps,
-                    batch_size=batch_size, monitors=monitors)
+    return self.fit(
+        x=x,
+        y=y,
+        input_fn=input_fn,
+        steps=steps,
+        batch_size=batch_size,
+        monitors=monitors)
 
-  @deprecated_args(
-      SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS, ('x', None),
-      ('y', None), ('batch_size', None)
-  )
+  @deprecated_args(SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS,
+                   ('x', None), ('y', None), ('batch_size', None))
   def evaluate(self,
                x=None,
                y=None,
@@ -571,13 +593,15 @@ class BaseEstimator(
       eval_results.update({'global_step': global_step})
     return eval_results
 
-  @deprecated_args(
-      SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS, ('x', None),
-      ('batch_size', None), ('as_iterable', True)
-  )
-  def predict(
-      self, x=None, input_fn=None, batch_size=None, outputs=None,
-      as_iterable=True):
+  @deprecated_args(SCIKIT_DECOUPLE_DATE, SCIKIT_DECOUPLE_INSTRUCTIONS,
+                   ('x', None), ('batch_size', None), ('as_iterable', True))
+  def predict(self,
+              x=None,
+              input_fn=None,
+              batch_size=None,
+              outputs=None,
+              as_iterable=True,
+              iterate_batches=False):
     """Returns predictions for given features.
 
     Args:
@@ -593,6 +617,9 @@ class BaseEstimator(
         for each example until inputs are exhausted. Note: The inputs must
         terminate if you want the iterable to terminate (e.g. be sure to pass
         num_epochs=1 if you are using something like read_batch_features).
+      iterate_batches: If True, yield the whole batch at once instead of
+        decomposing the batch into individual samples. Only relevant when
+        as_iterable is True.
 
     Returns:
       A numpy array of predicted classes or regression values if the
@@ -612,7 +639,8 @@ class BaseEstimator(
         input_fn=input_fn,
         feed_fn=feed_fn,
         outputs=outputs,
-        as_iterable=as_iterable)
+        as_iterable=as_iterable,
+        iterate_batches=iterate_batches)
 
   def get_variable_value(self, name):
     """Returns value of the variable given by name.
@@ -638,16 +666,17 @@ class BaseEstimator(
     return self._model_dir
 
   @deprecated('2017-03-25', 'Please use Estimator.export_savedmodel() instead.')
-  def export(self,
-             export_dir,
-             input_fn=export._default_input_fn,  # pylint: disable=protected-access
-             input_feature_key=None,
-             use_deprecated_input_fn=True,
-             signature_fn=None,
-             prediction_key=None,
-             default_batch_size=1,
-             exports_to_keep=None,
-             checkpoint_path=None):
+  def export(
+      self,
+      export_dir,
+      input_fn=export._default_input_fn,  # pylint: disable=protected-access
+      input_feature_key=None,
+      use_deprecated_input_fn=True,
+      signature_fn=None,
+      prediction_key=None,
+      default_batch_size=1,
+      exports_to_keep=None,
+      checkpoint_path=None):
     """Exports inference graph into given dir.
 
     Args:
@@ -785,8 +814,8 @@ class BaseEstimator(
       logging.debug('Setting feature info to %s.', str(self._features_info))
     if labels is not None:
       if self._labels_info is not None:
-        logging.debug('Given labels: %s, required signatures: %s.',
-                      str(labels), str(self._labels_info))
+        logging.debug('Given labels: %s, required signatures: %s.', str(labels),
+                      str(self._labels_info))
         if not tensor_signature.tensors_compatible(labels, self._labels_info):
           raise ValueError('Labels are incompatible with given information. '
                            'Given labels: %s, required signatures: %s.' %
@@ -837,13 +866,13 @@ class BaseEstimator(
     if not checkpoint_path:
       latest_path = saver.latest_checkpoint(self._model_dir)
       if not latest_path:
-        raise NotFittedError("Couldn't find trained model at %s."
-                             % self._model_dir)
+        raise NotFittedError(
+            "Couldn't find trained model at %s." % self._model_dir)
       checkpoint_path = latest_path
 
     # Setup output directory.
-    eval_dir = os.path.join(self._model_dir, 'eval' if not name else
-                            'eval_' + name)
+    eval_dir = os.path.join(self._model_dir, 'eval'
+                            if not name else 'eval_' + name)
 
     with ops.Graph().as_default() as g:
       random_seed.set_random_seed(self._config.tf_random_seed)
@@ -866,8 +895,7 @@ class BaseEstimator(
                         'Use steps=None if intended.')
       if steps:
         hooks.append(
-            evaluation.StopAfterNEvalsHook(
-                steps, log_progress=log_progress))
+            evaluation.StopAfterNEvalsHook(steps, log_progress=log_progress))
 
       global_step_key = 'global_step'
       while global_step_key in eval_dict:
@@ -903,8 +931,8 @@ class BaseEstimator(
     # Check that model has been trained.
     checkpoint_path = saver.latest_checkpoint(self._model_dir)
     if not checkpoint_path:
-      raise NotFittedError("Couldn't find trained model at %s."
-                           % self._model_dir)
+      raise NotFittedError(
+          "Couldn't find trained model at %s." % self._model_dir)
 
     with ops.Graph().as_default() as g:
       random_seed.set_random_seed(self._config.tf_random_seed)
@@ -966,7 +994,8 @@ class BaseEstimator(
     existing_keys = predictions.keys()
     predictions = {
         key: value
-        for key, value in six.iteritems(predictions) if key in outputs
+        for key, value in six.iteritems(predictions)
+        if key in outputs
     }
     if not predictions:
       raise ValueError('Expected to run at least one output from %s, '
@@ -1032,8 +1061,7 @@ class BaseEstimator(
           chief_only_hooks=chief_hooks + model_fn_ops.training_chief_hooks,
           save_checkpoint_secs=0,  # Saving is handled by a hook.
           save_summaries_steps=self._config.save_summary_steps,
-          config=self._session_config
-      ) as mon_sess:
+          config=self._session_config) as mon_sess:
         loss = None
         while not mon_sess.should_stop():
           _, loss = mon_sess.run([model_fn_ops.train_op, model_fn_ops.loss])
@@ -1124,8 +1152,7 @@ class Estimator(BaseEstimator):
       if params is not None and 'params' not in model_fn_args:
         raise ValueError('Estimator\'s model_fn (%s) does not have a params '
                          'argument, but params (%s) were passed to the '
-                         'Estimator\'s constructor.' %
-                         (model_fn, params))
+                         'Estimator\'s constructor.' % (model_fn, params))
       if params is None and 'params' in model_fn_args:
         logging.warning('Estimator\'s model_fn (%s) includes params '
                         'argument, but params are not passed to Estimator.',
@@ -1179,8 +1206,9 @@ class Estimator(BaseEstimator):
 
     # Custom metrics should overwrite defaults.
     if metrics:
-      model_fn_ops.eval_metric_ops.update(_make_metrics_ops(
-        metrics, features, labels, model_fn_ops.predictions))
+      model_fn_ops.eval_metric_ops.update(
+          _make_metrics_ops(metrics, features, labels,
+                            model_fn_ops.predictions))
 
     return model_fn_ops
 
@@ -1225,12 +1253,12 @@ class Estimator(BaseEstimator):
     Raises:
       ValueError: if `metrics` don't match `labels`.
     """
-    model_fn_ops = self._call_model_fn(
-        features, labels, model_fn_lib.ModeKeys.EVAL, metrics)
+    model_fn_ops = self._call_model_fn(features, labels,
+                                       model_fn_lib.ModeKeys.EVAL, metrics)
 
     if metric_key.MetricKey.LOSS not in model_fn_ops.eval_metric_ops:
       model_fn_ops.eval_metric_ops[metric_key.MetricKey.LOSS] = (
-          metrics_lib.streaming_mean(model_fn_ops.loss))
+          metrics_lib.mean(model_fn_ops.loss))
     return model_fn_ops
 
   def _get_predict_ops(self, features):
@@ -1250,13 +1278,17 @@ class Estimator(BaseEstimator):
         self._labels_info)
     return self._call_model_fn(features, labels, model_fn_lib.ModeKeys.INFER)
 
-  def export_savedmodel(
-      self, export_dir_base, serving_input_fn,
-      default_output_alternative_key=None,
-      assets_extra=None,
-      as_text=False,
-      checkpoint_path=None,
-      graph_rewrite_specs=(GraphRewriteSpec((tag_constants.SERVING,), ()),)):
+  def export_savedmodel(self,
+                        export_dir_base,
+                        serving_input_fn,
+                        default_output_alternative_key=None,
+                        assets_extra=None,
+                        as_text=False,
+                        checkpoint_path=None,
+                        graph_rewrite_specs=(GraphRewriteSpec(
+                            (tag_constants.SERVING,), ()),),
+                        strip_default_attrs=False):
+    # pylint: disable=line-too-long
     """Exports inference graph as a SavedModel into given dir.
 
     Args:
@@ -1280,6 +1312,10 @@ class Estimator(BaseEstimator):
         produce a separate MetaGraphDef within the exported SavedModel, tagged
         and rewritten as specified.  Defaults to a single entry using the
         default serving tag ("serve") and no rewriting.
+      strip_default_attrs: Boolean. If `True`, default-valued attributes will be
+        removed from the NodeDefs. For a detailed guide, see
+        [Stripping Default-Valued
+          Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
 
     Returns:
       The string path to the exported directory.
@@ -1287,6 +1323,7 @@ class Estimator(BaseEstimator):
     Raises:
       ValueError: if an unrecognized export_type is requested.
     """
+    # pylint: enable=line-too-long
     if serving_input_fn is None:
       raise ValueError('serving_input_fn must be defined.')
 
@@ -1294,8 +1331,8 @@ class Estimator(BaseEstimator):
       # Locate the latest checkpoint
       checkpoint_path = saver.latest_checkpoint(self._model_dir)
     if not checkpoint_path:
-      raise NotFittedError("Couldn't find trained model at %s."
-                           % self._model_dir)
+      raise NotFittedError(
+          "Couldn't find trained model at %s." % self._model_dir)
 
     export_dir = saved_model_export_utils.get_timestamped_export_dir(
         export_dir_base)
@@ -1329,10 +1366,10 @@ class Estimator(BaseEstimator):
           saved_model_export_utils.get_output_alternatives(
               model_fn_ops, default_output_alternative_key))
 
-      init_op = control_flow_ops.group(
-          variables.local_variables_initializer(),
-          resources.initialize_resources(resources.shared_resources()),
-          lookup_ops.tables_initializer())
+      init_op = control_flow_ops.group(variables.local_variables_initializer(),
+                                       resources.initialize_resources(
+                                           resources.shared_resources()),
+                                       lookup_ops.tables_initializer())
 
       # Build the SignatureDefs from all pairs of input and output alternatives
       signature_def_map = saved_model_export_utils.build_all_signature_defs(
@@ -1362,11 +1399,12 @@ class Estimator(BaseEstimator):
 
         # TODO(soergel): switch to main_op or otherwise update when dust settles
         builder.add_meta_graph_and_variables(
-            session, untransformed_tags,
+            session,
+            untransformed_tags,
             signature_def_map=signature_def_map,
-            assets_collection=ops.get_collection(
-                ops.GraphKeys.ASSET_FILEPATHS),
-            legacy_init_op=init_op)
+            assets_collection=ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS),
+            legacy_init_op=init_op,
+            strip_default_attrs=strip_default_attrs)
 
     # pylint: disable=protected-access
     base_meta_graph_def = builder._saved_model.meta_graphs[0]
@@ -1375,12 +1413,16 @@ class Estimator(BaseEstimator):
     if graph_rewrite_specs[1:]:
       # Prepare the input_names and output_names needed for the
       # meta_graph_transform call below.
-      input_names = [tensor.name
-                     for input_dict in input_alternatives.values()
-                     for tensor in input_dict.values()]
-      output_names = [tensor.name
-                      for output_alternative in output_alternatives.values()
-                      for tensor in output_alternative[1].values()]
+      input_names = [
+          tensor.name
+          for input_dict in input_alternatives.values()
+          for tensor in input_dict.values()
+      ]
+      output_names = [
+          tensor.name
+          for output_alternative in output_alternatives.values()
+          for tensor in output_alternative[1].values()
+      ]
 
     # Write the additional MetaGraphDefs
     for graph_rewrite_spec in graph_rewrite_specs[1:]:
@@ -1399,11 +1441,11 @@ class Estimator(BaseEstimator):
 
     # Add the extra assets
     if assets_extra:
-      assets_extra_path = os.path.join(compat.as_bytes(temp_export_dir),
-                                       compat.as_bytes('assets.extra'))
+      assets_extra_path = os.path.join(
+          compat.as_bytes(temp_export_dir), compat.as_bytes('assets.extra'))
       for dest_relative, source in assets_extra.items():
-        dest_absolute = os.path.join(compat.as_bytes(assets_extra_path),
-                                     compat.as_bytes(dest_relative))
+        dest_absolute = os.path.join(
+            compat.as_bytes(assets_extra_path), compat.as_bytes(dest_relative))
         dest_path = os.path.dirname(dest_absolute)
         gfile.MakeDirs(dest_path)
         gfile.Copy(source, dest_absolute)
@@ -1423,25 +1465,36 @@ class SKCompat(sklearn.BaseEstimator):
 
   def fit(self, x, y, batch_size=128, steps=None, max_steps=None,
           monitors=None):
-    input_fn, feed_fn = _get_input_fn(x, y, input_fn=None, feed_fn=None,
-                                      batch_size=batch_size, shuffle=True,
-                                      epochs=None)
+    input_fn, feed_fn = _get_input_fn(
+        x,
+        y,
+        input_fn=None,
+        feed_fn=None,
+        batch_size=batch_size,
+        shuffle=True,
+        epochs=None)
     all_monitors = []
     if feed_fn:
       all_monitors = [basic_session_run_hooks.FeedFnHook(feed_fn)]
     if monitors:
       all_monitors.extend(monitors)
 
-    self._estimator.fit(input_fn=input_fn,
-                        steps=steps,
-                        max_steps=max_steps,
-                        monitors=all_monitors)
+    self._estimator.fit(
+        input_fn=input_fn,
+        steps=steps,
+        max_steps=max_steps,
+        monitors=all_monitors)
     return self
 
   def score(self, x, y, batch_size=128, steps=None, metrics=None, name=None):
-    input_fn, feed_fn = _get_input_fn(x, y, input_fn=None,
-                                      feed_fn=None, batch_size=batch_size,
-                                      shuffle=False, epochs=1)
+    input_fn, feed_fn = _get_input_fn(
+        x,
+        y,
+        input_fn=None,
+        feed_fn=None,
+        batch_size=batch_size,
+        shuffle=False,
+        epochs=1)
     if metrics is not None and not isinstance(metrics, dict):
       raise ValueError('Metrics argument should be None or dict. '
                        'Got %s.' % metrics)
@@ -1457,8 +1510,13 @@ class SKCompat(sklearn.BaseEstimator):
 
   def predict(self, x, batch_size=128, outputs=None):
     input_fn, feed_fn = _get_input_fn(
-        x, None, input_fn=None, feed_fn=None, batch_size=batch_size,
-        shuffle=False, epochs=1)
+        x,
+        None,
+        input_fn=None,
+        feed_fn=None,
+        batch_size=batch_size,
+        shuffle=False,
+        epochs=1)
     results = list(
         self._estimator._infer_model(
             input_fn=input_fn,
@@ -1469,7 +1527,6 @@ class SKCompat(sklearn.BaseEstimator):
     if not isinstance(results[0], dict):
       return np.concatenate([output for output in results], axis=0)
     return {
-        key: np.concatenate(
-            [output[key] for output in results], axis=0)
+        key: np.concatenate([output[key] for output in results], axis=0)
         for key in results[0]
     }
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py
index 248c6c733ffca351c848ba07110ba89928634a23..d4a46b41d0c93ef58d5db8c433cbf348fec10f5e 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py
@@ -23,7 +23,7 @@ import tempfile
 
 import numpy as np
 
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn import metric_spec
 from tensorflow.contrib.learn.python.learn import models
@@ -41,7 +41,6 @@ from tensorflow.python.platform import test
 from tensorflow.python.training import input as input_lib
 from tensorflow.python.training import queue_runner_impl
 
-
 _BOSTON_INPUT_DIM = 13
 _IRIS_INPUT_DIM = 4
 
@@ -93,8 +92,8 @@ def boston_eval_fn():
       constant_op.constant(boston.data), [n_examples, _BOSTON_INPUT_DIM])
   labels = array_ops.reshape(
       constant_op.constant(boston.target), [n_examples, 1])
-  return array_ops.concat([features, features], 0), array_ops.concat(
-      [labels, labels], 0)
+  return array_ops.concat([features, features],
+                          0), array_ops.concat([labels, labels], 0)
 
 
 def extract(data, key):
@@ -114,7 +113,7 @@ def linear_model_params_fn(features, labels, mode, params):
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
       loss,
-      variables.get_global_step(),
+      training_util.get_global_step(),
       optimizer='Adagrad',
       learning_rate=params['learning_rate'])
   return prediction, loss, train_op
@@ -129,7 +128,10 @@ def linear_model_fn(features, labels, mode):
     (_, features), = features.items()
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss,
+      training_util.get_global_step(),
+      optimizer='Adagrad',
+      learning_rate=0.1)
   return prediction, loss, train_op
 
 
@@ -139,7 +141,10 @@ def linear_model_fn_with_model_fn_ops(features, labels, mode):
                   model_fn.ModeKeys.INFER)
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss,
+      training_util.get_global_step(),
+      optimizer='Adagrad',
+      learning_rate=0.1)
   return model_fn.ModelFnOps(
       mode=mode, predictions=prediction, loss=loss, train_op=train_op)
 
@@ -150,7 +155,10 @@ def logistic_model_no_mode_fn(features, labels):
   labels = array_ops.one_hot(labels, 3, 1, 0)
   prediction, loss = (models.logistic_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss,
+      training_util.get_global_step(),
+      optimizer='Adagrad',
+      learning_rate=0.1)
   return {
       'class': math_ops.argmax(prediction, 1),
       'prob': prediction
@@ -173,7 +181,9 @@ class EstimatorInputTest(test.TestCase):
     scores = est.evaluate(
         x=boston_input,
         y=float64_target,
-        metrics={'MSE': metric_ops.streaming_mean_squared_error})
+        metrics={
+            'MSE': metric_ops.streaming_mean_squared_error
+        })
     del est
     # Create another estimator object with the same output dir.
     est2 = estimator.Estimator(model_fn=linear_model_fn, model_dir=output_dir)
@@ -182,7 +192,9 @@ class EstimatorInputTest(test.TestCase):
     scores2 = est2.evaluate(
         x=boston_input,
         y=float64_target,
-        metrics={'MSE': metric_ops.streaming_mean_squared_error})
+        metrics={
+            'MSE': metric_ops.streaming_mean_squared_error
+        })
     self.assertAllClose(scores2['MSE'], scores['MSE'])
     predictions = np.array(list(est2.predict(x=boston_input)))
     other_score = _sklearn.mean_squared_error(predictions,
@@ -197,7 +209,9 @@ class EstimatorInputTest(test.TestCase):
     scores = est.score(
         x=boston.data,
         y=float64_labels,
-        metrics={'MSE': metric_ops.streaming_mean_squared_error})
+        metrics={
+            'MSE': metric_ops.streaming_mean_squared_error
+        })
     predictions = np.array(list(est.predict(x=boston.data)))
     other_score = _sklearn.mean_squared_error(predictions, boston.target)
     self.assertAllClose(scores['MSE'], other_score)
@@ -213,7 +227,9 @@ class EstimatorInputTest(test.TestCase):
     scores = est.evaluate(
         x=boston_input,
         y=float64_target,
-        metrics={'MSE': metric_ops.streaming_mean_squared_error})
+        metrics={
+            'MSE': metric_ops.streaming_mean_squared_error
+        })
     predictions = np.array(list(est.predict(x=boston_input)))
     other_score = _sklearn.mean_squared_error(predictions, boston.target)
     self.assertAllClose(other_score, scores['MSE'])
@@ -228,14 +244,15 @@ class EstimatorInputTest(test.TestCase):
     scores = est.score(
         x=iris.data,
         y=iris.target,
-        metrics={('accuracy', 'class'): metric_ops.streaming_accuracy})
+        metrics={
+            ('accuracy', 'class'): metric_ops.streaming_accuracy
+        })
     predictions = est.predict(x=iris.data)
     predictions_class = est.predict(x=iris.data, outputs=['class'])['class']
     self.assertEqual(predictions['prob'].shape[0], iris.target.shape[0])
     self.assertAllClose(predictions['class'], predictions_class)
-    self.assertAllClose(
-        predictions['class'], np.argmax(
-            predictions['prob'], axis=1))
+    self.assertAllClose(predictions['class'],
+                        np.argmax(predictions['prob'], axis=1))
     other_score = _sklearn.accuracy_score(iris.target, predictions['class'])
     self.assertAllClose(scores['accuracy'], other_score)
     self.assertTrue('global_step' in scores)
@@ -250,17 +267,18 @@ class EstimatorInputTest(test.TestCase):
     scores = est.evaluate(
         x=iris_data,
         y=iris_target,
-        metrics={('accuracy', 'class'): metric_ops.streaming_accuracy})
+        metrics={
+            ('accuracy', 'class'): metric_ops.streaming_accuracy
+        })
     predictions = list(est.predict(x=iris_data))
     predictions_class = list(est.predict(x=iris_data, outputs=['class']))
     self.assertEqual(len(predictions), iris.target.shape[0])
     classes_batch = np.array([p['class'] for p in predictions])
     self.assertAllClose(classes_batch,
                         np.array([p['class'] for p in predictions_class]))
-    self.assertAllClose(
-        classes_batch,
-        np.argmax(
-            np.array([p['prob'] for p in predictions]), axis=1))
+    self.assertAllClose(classes_batch,
+                        np.argmax(
+                            np.array([p['prob'] for p in predictions]), axis=1))
     other_score = _sklearn.accuracy_score(iris.target, classes_batch)
     self.assertAllClose(other_score, scores['accuracy'])
     self.assertTrue('global_step' in scores)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
index be2b0cb3ca959323b4de095ca072278f028be301..d81a534b79bc90fe91ffd3cb97a7865a7cb4c2a9 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
@@ -32,7 +32,7 @@ from google.protobuf import text_format
 
 from tensorflow.contrib import learn
 from tensorflow.contrib import lookup
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import feature_column as feature_column_lib
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn import experiment
@@ -111,8 +111,8 @@ def boston_eval_fn():
       constant_op.constant(boston.data), [n_examples, _BOSTON_INPUT_DIM])
   labels = array_ops.reshape(
       constant_op.constant(boston.target), [n_examples, 1])
-  return array_ops.concat([features, features], 0), array_ops.concat(
-      [labels, labels], 0)
+  return array_ops.concat([features, features],
+                          0), array_ops.concat([labels, labels], 0)
 
 
 def extract(data, key):
@@ -132,7 +132,7 @@ def linear_model_params_fn(features, labels, mode, params):
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
       loss,
-      variables.get_global_step(),
+      training_util.get_global_step(),
       optimizer='Adagrad',
       learning_rate=params['learning_rate'])
   return prediction, loss, train_op
@@ -147,7 +147,10 @@ def linear_model_fn(features, labels, mode):
     (_, features), = features.items()
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss,
+      training_util.get_global_step(),
+      optimizer='Adagrad',
+      learning_rate=0.1)
   return prediction, loss, train_op
 
 
@@ -157,7 +160,10 @@ def linear_model_fn_with_model_fn_ops(features, labels, mode):
                   model_fn.ModeKeys.INFER)
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss,
+      training_util.get_global_step(),
+      optimizer='Adagrad',
+      learning_rate=0.1)
   return model_fn.ModelFnOps(
       mode=mode, predictions=prediction, loss=loss, train_op=train_op)
 
@@ -168,7 +174,10 @@ def logistic_model_no_mode_fn(features, labels):
   labels = array_ops.one_hot(labels, 3, 1, 0)
   prediction, loss = (models.logistic_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss,
+      training_util.get_global_step(),
+      optimizer='Adagrad',
+      learning_rate=0.1)
   return {
       'class': math_ops.argmax(prediction, 1),
       'prob': prediction
@@ -184,14 +193,12 @@ def _build_estimator_for_export_tests(tmpdir):
   def _input_fn():
     iris = base.load_iris()
     return {
-        'feature': constant_op.constant(
-            iris.data, dtype=dtypes.float32)
+        'feature': constant_op.constant(iris.data, dtype=dtypes.float32)
     }, constant_op.constant(
         iris.target, shape=[150], dtype=dtypes.int32)
 
   feature_columns = [
-      feature_column_lib.real_valued_column(
-          'feature', dimension=4)
+      feature_column_lib.real_valued_column('feature', dimension=4)
   ]
 
   est = linear.LinearRegressor(feature_columns)
@@ -241,7 +248,7 @@ def _build_estimator_for_resource_export_test():
     const = constant_op.constant(-1, dtype=dtypes.int64)
     table = lookup.MutableHashTable(
         dtypes.string, dtypes.int64, const, name='LookupTableModel')
-    update_global_step = variables.get_global_step().assign_add(1)
+    update_global_step = training_util.get_global_step().assign_add(1)
     if mode in (model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL):
       key = constant_op.constant(['key'])
       value = constant_op.constant([42], dtype=dtypes.int64)
@@ -291,8 +298,8 @@ class CheckCallsMonitor(monitors_lib.BaseMonitor):
             self.begin_calls == self.expect_calls)
 
 
-def _model_fn_ops(
-    expected_features, expected_labels, actual_features, actual_labels, mode):
+def _model_fn_ops(expected_features, expected_labels, actual_features,
+                  actual_labels, mode):
   assert_ops = tuple([
       check_ops.assert_equal(
           expected_features[k], actual_features[k], name='assert_%s' % k)
@@ -306,15 +313,15 @@ def _model_fn_ops(
         mode=mode,
         predictions=constant_op.constant(0.),
         loss=constant_op.constant(0.),
-        train_op=variables.get_global_step().assign_add(1))
+        train_op=training_util.get_global_step().assign_add(1))
 
 
 def _make_input_fn(features, labels):
+
   def _input_fn():
-    return {
-        k: constant_op.constant(v)
-        for k, v in six.iteritems(features)
-    }, constant_op.constant(labels)
+    return {k: constant_op.constant(v)
+            for k, v in six.iteritems(features)}, constant_op.constant(labels)
+
   return _input_fn
 
 
@@ -369,11 +376,13 @@ class EstimatorModelFnTest(test.TestCase):
       self.assertEqual(expected_params, params)
       self.assertTrue(config.i_am_test)
       return _model_fn_ops(features, labels, arg0, arg1, mode)
+
     partial_model_fn = functools.partial(
         _model_fn, foo=expected_foo, bar=expected_bar)
 
     est = estimator.Estimator(
-        model_fn=partial_model_fn, params=expected_params,
+        model_fn=partial_model_fn,
+        params=expected_params,
         config=expected_config)
     self.assertEqual(0, model_fn_call_count[0])
     est.fit(input_fn=_make_input_fn(features, labels), steps=1)
@@ -382,17 +391,24 @@ class EstimatorModelFnTest(test.TestCase):
   def testModelFnWithModelDir(self):
     expected_param = {'some_param': 'some_value'}
     expected_model_dir = tempfile.mkdtemp()
-    def _argument_checker(features, labels, mode, params, config=None,
+
+    def _argument_checker(features,
+                          labels,
+                          mode,
+                          params,
+                          config=None,
                           model_dir=None):
       _, _, _ = features, labels, config
       self.assertEqual(model_fn.ModeKeys.TRAIN, mode)
       self.assertEqual(expected_param, params)
       self.assertEqual(model_dir, expected_model_dir)
       return (constant_op.constant(0.), constant_op.constant(0.),
-              variables.get_global_step().assign_add(1))
-    est = estimator.Estimator(model_fn=_argument_checker,
-                              params=expected_param,
-                              model_dir=expected_model_dir)
+              training_util.get_global_step().assign_add(1))
+
+    est = estimator.Estimator(
+        model_fn=_argument_checker,
+        params=expected_param,
+        model_dir=expected_model_dir)
     est.fit(input_fn=boston_input_fn, steps=1)
 
   def testInvalidModelFn_no_train_op(self):
@@ -400,7 +416,7 @@ class EstimatorModelFnTest(test.TestCase):
     def _invalid_model_fn(features, labels):
       # pylint: disable=unused-argument
       w = variables_lib.Variable(42.0, 'weight')
-      update_global_step = variables.get_global_step().assign_add(1)
+      update_global_step = training_util.get_global_step().assign_add(1)
       with ops.control_dependencies([update_global_step]):
         loss = 100.0 - w
       return None, loss, None
@@ -415,7 +431,7 @@ class EstimatorModelFnTest(test.TestCase):
       # pylint: disable=unused-argument
       w = variables_lib.Variable(42.0, 'weight')
       loss = 100.0 - w
-      update_global_step = variables.get_global_step().assign_add(1)
+      update_global_step = training_util.get_global_step().assign_add(1)
       with ops.control_dependencies([update_global_step]):
         train_op = w.assign_add(loss / 100.0)
       predictions = loss
@@ -434,7 +450,7 @@ class EstimatorModelFnTest(test.TestCase):
       # pylint: disable=unused-argument
       w = variables_lib.Variable(42.0, 'weight')
       loss = 100.0 - w
-      update_global_step = variables.get_global_step().assign_add(1)
+      update_global_step = training_util.get_global_step().assign_add(1)
       with ops.control_dependencies([update_global_step]):
         train_op = w.assign_add(loss / 100.0)
       return None, loss, train_op
@@ -447,8 +463,7 @@ class EstimatorModelFnTest(test.TestCase):
       est.predict(input_fn=boston_input_fn)
     with self.assertRaisesRegexp(ValueError, 'Missing prediction'):
       est.predict(
-          input_fn=functools.partial(
-              boston_input_fn, num_epochs=1),
+          input_fn=functools.partial(boston_input_fn, num_epochs=1),
           as_iterable=True)
 
   def testModelFnScaffoldInTraining(self):
@@ -464,7 +479,7 @@ class EstimatorModelFnTest(test.TestCase):
           mode=mode,
           predictions=constant_op.constant(0.),
           loss=constant_op.constant(0.),
-          train_op=variables.get_global_step().assign_add(1),
+          train_op=training_util.get_global_step().assign_add(1),
           scaffold=monitored_session.Scaffold(init_fn=_init_fn))
 
     est = estimator.Estimator(model_fn=_model_fn_scaffold)
@@ -483,7 +498,7 @@ class EstimatorModelFnTest(test.TestCase):
           mode=mode,
           predictions=constant_op.constant([[1.]]),
           loss=constant_op.constant(0.),
-          train_op=variables.get_global_step().assign_add(1),
+          train_op=training_util.get_global_step().assign_add(1),
           scaffold=monitored_session.Scaffold(saver=self.mock_saver))
 
     def input_fn():
@@ -498,15 +513,17 @@ class EstimatorModelFnTest(test.TestCase):
     self.assertTrue(self.mock_saver.restore.called)
     est.predict(input_fn=input_fn)
     self.assertTrue(self.mock_saver.restore.called)
+
     def serving_input_fn():
-      serialized_tf_example = array_ops.placeholder(dtype=dtypes.string,
-                                                    shape=[None],
-                                                    name='input_example_tensor')
+      serialized_tf_example = array_ops.placeholder(
+          dtype=dtypes.string, shape=[None], name='input_example_tensor')
       features, labels = input_fn()
-      return input_fn_utils.InputFnOps(
-          features, labels, {'examples': serialized_tf_example})
+      return input_fn_utils.InputFnOps(features, labels, {
+          'examples': serialized_tf_example
+      })
 
-    est.export_savedmodel(os.path.join(est.model_dir, 'export'), serving_input_fn)
+    est.export_savedmodel(
+        os.path.join(est.model_dir, 'export'), serving_input_fn)
     self.assertTrue(self.mock_saver.restore.called)
 
 
@@ -550,33 +567,28 @@ class EstimatorTest(test.TestCase):
 
   def testRunConfigModelDir(self):
     config = run_config.RunConfig(model_dir='test_dir')
-    est = estimator.Estimator(model_fn=linear_model_fn,
-                              config=config)
+    est = estimator.Estimator(model_fn=linear_model_fn, config=config)
     self.assertEqual('test_dir', est.config.model_dir)
     self.assertEqual('test_dir', est.model_dir)
 
   def testModelDirAndRunConfigModelDir(self):
     config = run_config.RunConfig(model_dir='test_dir')
-    est = estimator.Estimator(model_fn=linear_model_fn,
-                              config=config,
-                              model_dir='test_dir')
+    est = estimator.Estimator(
+        model_fn=linear_model_fn, config=config, model_dir='test_dir')
     self.assertEqual('test_dir', est.config.model_dir)
 
     with self.assertRaisesRegexp(
-        ValueError,
-        'model_dir are set both in constructor and RunConfig, '
+        ValueError, 'model_dir are set both in constructor and RunConfig, '
         'but with different'):
-      estimator.Estimator(model_fn=linear_model_fn,
-                          config=config,
-                          model_dir='different_dir')
+      estimator.Estimator(
+          model_fn=linear_model_fn, config=config, model_dir='different_dir')
 
   def testModelDirIsCopiedToRunConfig(self):
     config = run_config.RunConfig()
     self.assertIsNone(config.model_dir)
 
-    est = estimator.Estimator(model_fn=linear_model_fn,
-                              model_dir='test_dir',
-                              config=config)
+    est = estimator.Estimator(
+        model_fn=linear_model_fn, model_dir='test_dir', config=config)
     self.assertEqual('test_dir', est.config.model_dir)
     self.assertEqual('test_dir', est.model_dir)
 
@@ -656,25 +668,27 @@ class EstimatorTest(test.TestCase):
     boston = base.load_boston()
     output_dir = tempfile.mkdtemp()
     est = estimator.SKCompat(
-        estimator.Estimator(
-            model_fn=linear_model_fn, model_dir=output_dir))
+        estimator.Estimator(model_fn=linear_model_fn, model_dir=output_dir))
     float64_labels = boston.target.astype(np.float64)
     est.fit(x=boston.data, y=float64_labels, steps=50)
     scores = est.score(
         x=boston.data,
         y=float64_labels,
-        metrics={'MSE': metric_ops.streaming_mean_squared_error})
+        metrics={
+            'MSE': metric_ops.streaming_mean_squared_error
+        })
     del est
     # Create another estimator object with the same output dir.
     est2 = estimator.SKCompat(
-        estimator.Estimator(
-            model_fn=linear_model_fn, model_dir=output_dir))
+        estimator.Estimator(model_fn=linear_model_fn, model_dir=output_dir))
 
     # Check we can evaluate and predict.
     scores2 = est2.score(
         x=boston.data,
         y=float64_labels,
-        metrics={'MSE': metric_ops.streaming_mean_squared_error})
+        metrics={
+            'MSE': metric_ops.streaming_mean_squared_error
+        })
     self.assertAllClose(scores['MSE'], scores2['MSE'])
     predictions = np.array(list(est2.predict(x=boston.data)))
     other_score = _sklearn.mean_squared_error(predictions, float64_labels)
@@ -685,14 +699,15 @@ class EstimatorTest(test.TestCase):
     scores3 = est2.score(
         x=boston.data,
         y=float64_labels,
-        metrics={'MSE': metric_ops.streaming_mean_squared_error})
+        metrics={
+            'MSE': metric_ops.streaming_mean_squared_error
+        })
     self.assertLess(scores3['MSE'], scores['MSE'])
 
   def test_checkpoint_contains_relative_paths(self):
     tmpdir = tempfile.mkdtemp()
     est = estimator.Estimator(
-        model_dir=tmpdir,
-        model_fn=linear_model_fn_with_model_fn_ops)
+        model_dir=tmpdir, model_fn=linear_model_fn_with_model_fn_ops)
     est.fit(input_fn=boston_input_fn, steps=5)
 
     checkpoint_file_content = file_io.read_file_to_string(
@@ -700,22 +715,20 @@ class EstimatorTest(test.TestCase):
     ckpt = checkpoint_state_pb2.CheckpointState()
     text_format.Merge(checkpoint_file_content, ckpt)
     self.assertEqual(ckpt.model_checkpoint_path, 'model.ckpt-5')
-    self.assertAllEqual(
-        ['model.ckpt-1', 'model.ckpt-5'], ckpt.all_model_checkpoint_paths)
+    self.assertAllEqual(['model.ckpt-1', 'model.ckpt-5'],
+                        ckpt.all_model_checkpoint_paths)
 
   def test_train_save_copy_reload(self):
     tmpdir = tempfile.mkdtemp()
     model_dir1 = os.path.join(tmpdir, 'model_dir1')
     est1 = estimator.Estimator(
-        model_dir=model_dir1,
-        model_fn=linear_model_fn_with_model_fn_ops)
+        model_dir=model_dir1, model_fn=linear_model_fn_with_model_fn_ops)
     est1.fit(input_fn=boston_input_fn, steps=5)
 
     model_dir2 = os.path.join(tmpdir, 'model_dir2')
     os.renames(model_dir1, model_dir2)
     est2 = estimator.Estimator(
-        model_dir=model_dir2,
-        model_fn=linear_model_fn_with_model_fn_ops)
+        model_dir=model_dir2, model_fn=linear_model_fn_with_model_fn_ops)
     self.assertEqual(5, est2.get_variable_value('global_step'))
     est2.fit(input_fn=boston_input_fn, steps=5)
     self.assertEqual(10, est2.get_variable_value('global_step'))
@@ -724,7 +737,9 @@ class EstimatorTest(test.TestCase):
     boston = base.load_boston()
     est = estimator.SKCompat(
         estimator.Estimator(
-            model_fn=linear_model_params_fn, params={'learning_rate': 0.01}))
+            model_fn=linear_model_params_fn, params={
+                'learning_rate': 0.01
+            }))
     est.fit(x=boston.data, y=boston.target, steps=100)
 
   def testHooksNotChanged(self):
@@ -824,11 +839,13 @@ class EstimatorTest(test.TestCase):
 
   def testMonitorsForFit(self):
     est = estimator.Estimator(model_fn=linear_model_fn)
-    est.fit(input_fn=boston_input_fn,
-            steps=21,
-            monitors=[CheckCallsMonitor(expect_calls=21)])
+    est.fit(
+        input_fn=boston_input_fn,
+        steps=21,
+        monitors=[CheckCallsMonitor(expect_calls=21)])
 
   def testHooksForEvaluate(self):
+
     class CheckCallHook(session_run_hook.SessionRunHook):
 
       def __init__(self):
@@ -874,7 +891,9 @@ class EstimatorTest(test.TestCase):
     est.evaluate(
         input_fn=boston_input_fn,
         steps=200,
-        metrics={'MSE': _streaming_mean_squared_error_histogram})
+        metrics={
+            'MSE': _streaming_mean_squared_error_histogram
+        })
     events = util_test.latest_events(est.model_dir + '/eval')
     output_values = {}
     for e in events:
@@ -884,6 +903,37 @@ class EstimatorTest(test.TestCase):
     self.assertTrue('MSE' in output_values)
     self.assertTrue(output_values['MSE'].HasField('histo'))
 
+  def testSummaryWritingWithTensor(self):
+
+    def _streaming_precition_mean_tensor(predictions,
+                                         weights=None,
+                                         metrics_collections=None,
+                                         updates_collections=None,
+                                         name=None):
+      return metric_ops.streaming_mean_tensor(
+          predictions,
+          weights=weights,
+          metrics_collections=metrics_collections,
+          updates_collections=updates_collections,
+          name=name)
+
+    est = estimator.Estimator(model_fn=linear_model_fn)
+    est.fit(input_fn=boston_input_fn, steps=200)
+    est.evaluate(
+        input_fn=boston_input_fn,
+        steps=200,
+        metrics={
+            'PMT': _streaming_precition_mean_tensor
+        })
+    events = util_test.latest_events(est.model_dir + '/eval')
+    output_values = {}
+    for e in events:
+      if e.HasField('summary'):
+        for v in e.summary.value:
+          output_values[v.tag] = v
+    self.assertTrue('PMT' in output_values)
+    self.assertTrue(output_values['PMT'].HasField('tensor'))
+
   def testLossInGraphCollection(self):
 
     class _LossCheckerHook(session_run_hook.SessionRunHook):
@@ -927,8 +977,8 @@ class EstimatorTest(test.TestCase):
     self.assertTrue(
         gfile.Exists(
             os.path.join(
-                compat.as_bytes(export_dir), compat.as_bytes(
-                    'saved_model.pb'))))
+                compat.as_bytes(export_dir),
+                compat.as_bytes('saved_model.pb'))))
     self.assertTrue(
         gfile.Exists(
             os.path.join(
@@ -988,11 +1038,11 @@ class EstimatorTest(test.TestCase):
         self.assertTrue('input_example_tensor' in graph_ops)
         self.assertTrue('ParseExample/ParseExample' in graph_ops)
         self.assertTrue('linear/linear/feature/matmul' in graph_ops)
-        self.assertItemsEqual(
-          ['bogus_lookup', 'feature'],
-          [compat.as_str_any(x) for x in graph.get_collection(
-            constants.COLLECTION_DEF_KEY_FOR_INPUT_FEATURE_KEYS)])
-
+        self.assertItemsEqual(['bogus_lookup', 'feature'], [
+            compat.as_str_any(x)
+            for x in graph.get_collection(
+                constants.COLLECTION_DEF_KEY_FOR_INPUT_FEATURE_KEYS)
+        ])
 
     # cleanup
     gfile.DeleteRecursively(tmpdir)
@@ -1010,8 +1060,8 @@ class EstimatorTest(test.TestCase):
     self.assertTrue(
         gfile.Exists(
             os.path.join(
-                compat.as_bytes(export_dir), compat.as_bytes(
-                    'saved_model.pb'))))
+                compat.as_bytes(export_dir),
+                compat.as_bytes('saved_model.pb'))))
     self.assertTrue(
         gfile.Exists(
             os.path.join(
@@ -1054,19 +1104,22 @@ class EstimatorTest(test.TestCase):
     export_dir_base = os.path.join(
         compat.as_bytes(tmpdir), compat.as_bytes('export'))
     export_dir = est.export_savedmodel(
-        export_dir_base, serving_input_fn, assets_extra=assets_extra,
+        export_dir_base,
+        serving_input_fn,
+        assets_extra=assets_extra,
         graph_rewrite_specs=[
             estimator.GraphRewriteSpec(['tag_1'], []),
             estimator.GraphRewriteSpec(['tag_2', 'tag_3'],
-                                       ['strip_unused_nodes'])])
+                                       ['strip_unused_nodes'])
+        ])
 
     self.assertTrue(gfile.Exists(export_dir_base))
     self.assertTrue(gfile.Exists(export_dir))
     self.assertTrue(
         gfile.Exists(
             os.path.join(
-                compat.as_bytes(export_dir), compat.as_bytes(
-                    'saved_model.pb'))))
+                compat.as_bytes(export_dir),
+                compat.as_bytes('saved_model.pb'))))
     self.assertTrue(
         gfile.Exists(
             os.path.join(
@@ -1179,18 +1232,15 @@ class InferRealValuedColumnsTest(test.TestCase):
     self.assertEqual(1, len(feature_columns))
     feature_column = feature_columns[0]
     self.assertEqual('', feature_column.name)
-    self.assertEqual(
-        {
-            '':
-                parsing_ops.FixedLenFeature(
-                    shape=expected_shape, dtype=expected_dtype)
-        },
-        feature_column.config)
+    self.assertEqual({
+        '':
+            parsing_ops.FixedLenFeature(
+                shape=expected_shape, dtype=expected_dtype)
+    }, feature_column.config)
 
   def testInt32Input(self):
     feature_columns = estimator.infer_real_valued_columns_from_input(
-        np.ones(
-            shape=[7, 8], dtype=np.int32))
+        np.ones(shape=[7, 8], dtype=np.int32))
     self._assert_single_feature_column([8], dtypes.int32, feature_columns)
 
   def testInt32InputFn(self):
@@ -1200,8 +1250,7 @@ class InferRealValuedColumnsTest(test.TestCase):
 
   def testInt64Input(self):
     feature_columns = estimator.infer_real_valued_columns_from_input(
-        np.ones(
-            shape=[7, 8], dtype=np.int64))
+        np.ones(shape=[7, 8], dtype=np.int64))
     self._assert_single_feature_column([8], dtypes.int64, feature_columns)
 
   def testInt64InputFn(self):
@@ -1211,8 +1260,7 @@ class InferRealValuedColumnsTest(test.TestCase):
 
   def testFloat32Input(self):
     feature_columns = estimator.infer_real_valued_columns_from_input(
-        np.ones(
-            shape=[7, 8], dtype=np.float32))
+        np.ones(shape=[7, 8], dtype=np.float32))
     self._assert_single_feature_column([8], dtypes.float32, feature_columns)
 
   def testFloat32InputFn(self):
@@ -1222,8 +1270,7 @@ class InferRealValuedColumnsTest(test.TestCase):
 
   def testFloat64Input(self):
     feature_columns = estimator.infer_real_valued_columns_from_input(
-        np.ones(
-            shape=[7, 8], dtype=np.float64))
+        np.ones(shape=[7, 8], dtype=np.float64))
     self._assert_single_feature_column([8], dtypes.float64, feature_columns)
 
   def testFloat64InputFn(self):
@@ -1242,8 +1289,8 @@ class InferRealValuedColumnsTest(test.TestCase):
         ValueError, 'on integer or non floating types are not supported'):
       # pylint: disable=g-long-lambda
       estimator.infer_real_valued_columns_from_input_fn(
-          lambda: (constant_op.constant(False, shape=[7, 8], dtype=dtypes.bool),
-                   None))
+          lambda: (constant_op.constant(False, shape=[7, 8], dtype=dtypes.bool), None)
+      )
 
   def testStringInput(self):
     with self.assertRaisesRegexp(
@@ -1280,8 +1327,9 @@ class ReplicaDeviceSetterTest(test.TestCase):
 
   def testVariablesAreOnPs(self):
     tf_config = {'cluster': {run_config.TaskType.PS: ['fake_ps_0']}}
-    with test.mock.patch.dict('os.environ',
-                              {'TF_CONFIG': json.dumps(tf_config)}):
+    with test.mock.patch.dict('os.environ', {
+        'TF_CONFIG': json.dumps(tf_config)
+    }):
       config = run_config.RunConfig()
 
     with ops.device(estimator._get_replica_device_setter(config)):
@@ -1308,14 +1356,14 @@ class ReplicaDeviceSetterTest(test.TestCase):
 
   def testMutableHashTableIsOnPs(self):
     tf_config = {'cluster': {run_config.TaskType.PS: ['fake_ps_0']}}
-    with test.mock.patch.dict('os.environ',
-                              {'TF_CONFIG': json.dumps(tf_config)}):
+    with test.mock.patch.dict('os.environ', {
+        'TF_CONFIG': json.dumps(tf_config)
+    }):
       config = run_config.RunConfig()
 
     with ops.device(estimator._get_replica_device_setter(config)):
       default_val = constant_op.constant([-1, -1], dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
+      table = lookup.MutableHashTable(dtypes.string, dtypes.int64, default_val)
       input_string = constant_op.constant(['brain', 'salad', 'tank'])
       output = table.lookup(input_string)
     self.assertDeviceEqual('/job:ps/task:0', table._table_ref.device)
@@ -1325,8 +1373,7 @@ class ReplicaDeviceSetterTest(test.TestCase):
     with ops.device(
         estimator._get_replica_device_setter(run_config.RunConfig())):
       default_val = constant_op.constant([-1, -1], dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
+      table = lookup.MutableHashTable(dtypes.string, dtypes.int64, default_val)
       input_string = constant_op.constant(['brain', 'salad', 'tank'])
       output = table.lookup(input_string)
     self.assertDeviceEqual('', table._table_ref.device)
@@ -1342,8 +1389,9 @@ class ReplicaDeviceSetterTest(test.TestCase):
             'index': 3
         }
     }
-    with test.mock.patch.dict('os.environ',
-                              {'TF_CONFIG': json.dumps(tf_config)}):
+    with test.mock.patch.dict('os.environ', {
+        'TF_CONFIG': json.dumps(tf_config)
+    }):
       config = run_config.RunConfig()
 
     with ops.device(estimator._get_replica_device_setter(config)):
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py
index 1d89dfb55b10b032cab7dcf434d396404d4eb83b..2113fae3940f14c8ca07e5f76986408ae8a33831 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py
@@ -22,7 +22,7 @@ import random
 
 import numpy as np
 
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.learn.python import learn
 from tensorflow.contrib.learn.python.learn import datasets
 from tensorflow.contrib.learn.python.learn import metric_spec
@@ -62,7 +62,7 @@ class FeatureEngineeringFunctionTest(test.TestCase):
       _ = labels
       predictions = features["transformed_x"]
       loss = constant_op.constant([2.])
-      update_global_step = variables.get_global_step().assign_add(1)
+      update_global_step = training_util.get_global_step().assign_add(1)
       return predictions, loss, update_global_step
 
     estimator = estimator_lib.Estimator(
@@ -72,9 +72,11 @@ class FeatureEngineeringFunctionTest(test.TestCase):
     # predictions = transformed_x (9)
     self.assertEqual(9., prediction)
     metrics = estimator.evaluate(
-        input_fn=input_fn, steps=1,
-        metrics={"label":
-                 metric_spec.MetricSpec(lambda predictions, labels: labels)})
+        input_fn=input_fn,
+        steps=1,
+        metrics={
+            "label": metric_spec.MetricSpec(lambda predictions, labels: labels)
+        })
     # labels = transformed_y (99)
     self.assertEqual(99., metrics["label"])
 
@@ -82,10 +84,10 @@ class FeatureEngineeringFunctionTest(test.TestCase):
 
     def input_fn():
       return {
-               "x": constant_op.constant(["9."])
-             }, {
-               "y": constant_op.constant(["99."])
-             }
+          "x": constant_op.constant(["9."])
+      }, {
+          "y": constant_op.constant(["99."])
+      }
 
     def feature_engineering_fn(features, labels):
       # Github #12205: raise a TypeError if called twice.
@@ -100,19 +102,21 @@ class FeatureEngineeringFunctionTest(test.TestCase):
       _ = labels
       predictions = features["x"]
       loss = constant_op.constant([2.])
-      update_global_step = variables.get_global_step().assign_add(1)
+      update_global_step = training_util.get_global_step().assign_add(1)
       return predictions, loss, update_global_step
 
     estimator = estimator_lib.Estimator(
-      model_fn=model_fn, feature_engineering_fn=feature_engineering_fn)
+        model_fn=model_fn, feature_engineering_fn=feature_engineering_fn)
     estimator.fit(input_fn=input_fn, steps=1)
     prediction = next(estimator.predict(input_fn=input_fn, as_iterable=True))
     # predictions = transformed_x (9)
     self.assertEqual(9., prediction)
     metrics = estimator.evaluate(
-      input_fn=input_fn, steps=1,
-      metrics={"label":
-                 metric_spec.MetricSpec(lambda predictions, labels: labels)})
+        input_fn=input_fn,
+        steps=1,
+        metrics={
+            "label": metric_spec.MetricSpec(lambda predictions, labels: labels)
+        })
     # labels = transformed_y (99)
     self.assertEqual(99., metrics["label"])
 
@@ -139,7 +143,7 @@ class FeatureEngineeringFunctionTest(test.TestCase):
       _ = labels
       predictions = features["x"]
       loss = constant_op.constant([2.])
-      update_global_step = variables.get_global_step().assign_add(1)
+      update_global_step = training_util.get_global_step().assign_add(1)
       return predictions, loss, update_global_step
 
     estimator_with_fe_fn = estimator_lib.Estimator(
@@ -150,12 +154,10 @@ class FeatureEngineeringFunctionTest(test.TestCase):
 
     # predictions = x
     prediction_with_fe_fn = next(
-        estimator_with_fe_fn.predict(
-            input_fn=input_fn, as_iterable=True))
+        estimator_with_fe_fn.predict(input_fn=input_fn, as_iterable=True))
     self.assertEqual(9., prediction_with_fe_fn)
     prediction_without_fe_fn = next(
-        estimator_without_fe_fn.predict(
-            input_fn=input_fn, as_iterable=True))
+        estimator_without_fe_fn.predict(input_fn=input_fn, as_iterable=True))
     self.assertEqual(1., prediction_without_fe_fn)
 
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py
index bc0e6fc0091c9b5419ab526855b404eb4a927e97..9b124b2c19f16bbc9b2afeadb82a32006e1a0ae9 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head.py
@@ -181,7 +181,8 @@ def regression_head(label_name=None,
                     weight_column_name=None,
                     label_dimension=1,
                     enable_centered_bias=False,
-                    head_name=None):
+                    head_name=None,
+                    link_fn=None):
   """Creates a `Head` for linear regression.
 
   Args:
@@ -199,6 +200,8 @@ def regression_head(label_name=None,
     head_name: name of the head. If provided, predictions, summary and metrics
       keys will be suffixed by `"/" + head_name` and the default variable scope
       will be `head_name`.
+    link_fn: link function to convert logits to predictions. If provided,
+      this link function will be used instead of identity.
 
   Returns:
     An instance of `Head` for linear regression.
@@ -210,7 +213,7 @@ def regression_head(label_name=None,
       enable_centered_bias=enable_centered_bias,
       head_name=head_name,
       loss_fn=_mean_squared_loss,
-      link_fn=array_ops.identity)
+      link_fn=(link_fn if link_fn is not None else array_ops.identity))
 
 
 def poisson_regression_head(label_name=None,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head_test.py b/tensorflow/contrib/learn/python/learn/estimators/head_test.py
index 3881bf533d642bef68fa9ab4ba908bbb8f7f8091..7c2d9bb0767cb979dae9c84b5342d129225677ed 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.losses import losses as losses_lib
 from tensorflow.python.platform import test
@@ -153,6 +154,25 @@ class RegressionHeadTest(test.TestCase):
       _assert_no_variables(self)
       _assert_metrics(self, 5. / 3, {"loss": 5. / 3}, model_fn_ops)
 
+  def testRegressionWithLogitFn(self):
+    head = head_lib.regression_head(link_fn=math_ops.square)
+    def _assert_preditions(test_case, expected_predictions, model_fn_ops):
+      variables.initialize_local_variables().run()
+      test_case.assertAllClose(expected_predictions,
+                               model_fn_ops.predictions["scores"].eval())
+    with ops.Graph().as_default(), session.Session():
+      model_fn_ops = head.create_model_fn_ops(
+          {},
+          labels=((0.,), (1.,), (1.,)),
+          mode=model_fn.ModeKeys.TRAIN,
+          train_op_fn=head_lib.no_op_train_fn,
+          logits=((1.,), (1.,), (3.,)))
+      self._assert_output_alternatives(model_fn_ops)
+      _assert_summary_tags(self, ["loss"])
+      _assert_no_variables(self)
+      _assert_metrics(self, 5. / 3, {"loss": 5. / 3}, model_fn_ops)
+      _assert_preditions(self, ([1.0, 1.0, 9.0]), model_fn_ops)
+
   def testRegressionWithInvalidLogits(self):
     head = head_lib.regression_head()
     with ops.Graph().as_default(), session.Session():
diff --git a/tensorflow/contrib/learn/python/learn/estimators/kmeans.py b/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
index 992b804f59ecd88fedc2fba10d3079f93c4fe83d..8f9d6fc318a357853bdb8e3264f6691b410006b1 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
@@ -28,7 +28,7 @@ import time
 import numpy as np
 
 from tensorflow.contrib.factorization.python.ops import clustering_ops
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators.model_fn import ModelFnOps
 from tensorflow.python.framework import ops
@@ -128,7 +128,7 @@ def _kmeans_clustering_model_fn(features, labels, mode, params, config):
        random_seed=params.get('random_seed'),
        kmeans_plus_plus_num_retries=params.get(
            'kmeans_plus_plus_num_retries')).training_graph()
-  incr_step = state_ops.assign_add(variables.get_global_step(), 1)
+  incr_step = state_ops.assign_add(training_util.get_global_step(), 1)
   loss = math_ops.reduce_sum(losses, name=KMeansClustering.LOSS_OP_NAME)
   summary.scalar('loss/raw', loss)
   training_op = with_dependencies([training_op, incr_step], loss)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py b/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py
index ce87b4723d436495e5fb149f0ab8f2eea44d82b8..b28835a809736a099ad2f08d127dc68d7977a3c1 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/kmeans_test.py
@@ -199,15 +199,7 @@ class KMeansTest(KMeansTestBase):
         input_fn=self.input_fn(batch_size=self.num_points), steps=1)
     self.assertNear(self.true_score, score, self.true_score * 0.01)
 
-  def test_infer(self):
-    kmeans = self._kmeans()
-    # Make a call to fit to initialize the cluster centers.
-    max_steps = 1
-    kmeans.fit(input_fn=self.input_fn(), max_steps=max_steps)
-    clusters = kmeans.clusters()
-
-    # Make a small test set
-    num_points = 10
+  def _infer_helper(self, kmeans, clusters, num_points):
     points, true_assignments, true_offsets = make_random_points(
         clusters, num_points)
     # Test predict
@@ -231,6 +223,17 @@ class KMeansTest(KMeansTestBase):
         np.transpose(np.sum(np.square(clusters), axis=1, keepdims=True)))
     self.assertAllClose(transform, true_transform, rtol=0.05, atol=10)
 
+  def test_infer(self):
+    kmeans = self._kmeans()
+    # Make a call to fit to initialize the cluster centers.
+    max_steps = 1
+    kmeans.fit(input_fn=self.input_fn(), max_steps=max_steps)
+    clusters = kmeans.clusters()
+
+    # Run inference on small datasets.
+    self._infer_helper(kmeans, clusters, num_points=10)
+    self._infer_helper(kmeans, clusters, num_points=1)
+
 
 class KMeansTestMultiStageInit(KMeansTestBase):
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py
index f5445ad4e728dbd3904279573771de9454b5d17c..37aa8b339622415d082933cdf66d2472a4119b48 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py
@@ -26,7 +26,7 @@ import six
 from tensorflow.contrib import layers
 from tensorflow.contrib.framework import deprecated
 from tensorflow.contrib.framework import deprecated_arg_values
-from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
@@ -170,7 +170,7 @@ def _linear_model_fn(features, labels, mode, params, config=None):
           weight_collections=[parent_scope])
 
     def _train_op_fn(loss):
-      global_step = contrib_variables.get_global_step()
+      global_step = training_util.get_global_step()
       my_vars = ops.get_collection(parent_scope)
       grads = gradients.gradients(loss, my_vars)
       if gradient_clip_norm:
@@ -252,7 +252,7 @@ def sdca_model_fn(features, labels, mode, params):
     _add_bias_column(feature_columns, features, bias, columns_to_variables)
 
   def _train_op_fn(unused_loss):
-    global_step = contrib_variables.get_global_step()
+    global_step = training_util.get_global_step()
     sdca_model, train_op = optimizer.get_train_step(columns_to_variables,
                                                     weight_column_name,
                                                     loss_type, features,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py
index 93c62f87e8495f299a8c456574c7b40534186304..ac2d10011e222eb9c534d7fbae3c0cb5f4820945 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py
@@ -21,7 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib import layers
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn.datasets import base
 from tensorflow.contrib.learn.python.learn.estimators import logistic_regressor
@@ -57,7 +57,10 @@ def _logistic_regression_model_fn(features, labels, mode):
   predictions = math_ops.sigmoid(logits)
   loss = losses.sigmoid_cross_entropy(labels, logits)
   train_op = optimizers.optimize_loss(
-      loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss,
+      training_util.get_global_step(),
+      optimizer='Adagrad',
+      learning_rate=0.1)
   return predictions, loss, train_op
 
 
diff --git a/tensorflow/contrib/learn/python/learn/evaluable.py b/tensorflow/contrib/learn/python/learn/evaluable.py
index 66e15265171679dcd710fdf05bed3105de6bab99..8f6cd39864b437f163dd7c1140dc88755ce98529 100644
--- a/tensorflow/contrib/learn/python/learn/evaluable.py
+++ b/tensorflow/contrib/learn/python/learn/evaluable.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """`Evaluable` interface."""
 
 from __future__ import absolute_import
@@ -59,9 +58,12 @@ class Evaluable(object):
     for which this evaluation was performed.
 
     Args:
-      x: Matrix of shape [n_samples, n_features...] or dictionary of many matrices
-        containing the input samples for fitting the model. Can be iterator that returns
-        arrays of features or dictionary of array of features. If set, `input_fn` must
+      x: Matrix of shape [n_samples, n_features...] or dictionary of many
+        matrices
+        containing the input samples for fitting the model. Can be iterator that
+          returns
+        arrays of features or dictionary of array of features. If set,
+          `input_fn` must
         be `None`.
       y: Vector or matrix [n_samples] or [n_samples, n_outputs] containing the
         label values (class labels in classification, real numbers in
diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py
index fc4bd1f461d7bfbfcfb78201d527959055342f0a..bec976afd2719138117976381669ca3292360480 100644
--- a/tensorflow/contrib/learn/python/learn/experiment.py
+++ b/tensorflow/contrib/learn/python/learn/experiment.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Experiment class collecting information needed for a single training run."""
 
 from __future__ import absolute_import
@@ -35,6 +34,7 @@ from tensorflow.contrib.learn.python.learn import trainable
 from tensorflow.contrib.learn.python.learn.estimators import run_config
 from tensorflow.contrib.tpu.python.tpu import tpu_estimator
 from tensorflow.python.estimator import estimator as core_estimator
+from tensorflow.python.estimator import util as estimator_util
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import basic_session_run_hooks
@@ -42,10 +42,21 @@ from tensorflow.python.training import saver
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
 
-
 __all__ = ["Experiment"]
 
 
+def _get_standardized_predicate_fn(predicate_fn):
+  pred_fn_args = estimator_util.fn_args(predicate_fn)
+  if "checkpoint_path" not in pred_fn_args:
+    # pylint: disable=unused-argument
+    def _pred_fn_wrapper(eval_results, checkpoint_path):
+      return predicate_fn(eval_results)
+
+    return _pred_fn_wrapper
+  else:
+    return predicate_fn
+
+
 class _EvalAndExportListener(basic_session_run_hooks.CheckpointSaverListener):
   """Listener that evaluates and exports a model after creating a checkpoint.
 
@@ -265,8 +276,7 @@ class Experiment(object):
     self._train_steps_per_iteration = train_steps_per_iteration
     if (self._train_steps_per_iteration is not None and
         not isinstance(self._train_steps_per_iteration, int)):
-      raise ValueError(
-          "`train_steps_per_iteration` must be an integer.")
+      raise ValueError("`train_steps_per_iteration` must be an integer.")
 
   @property
   def estimator(self):
@@ -346,9 +356,10 @@ class Experiment(object):
           config.cluster_spec and config.master):
         self._start_server()
     elif config.cluster_spec and config.master:
-      raise ValueError('For distributed runtime, Experiment class only works with'
-                       'tf.contrib.learn.RunConfig for now, but provided {}'
-                       .format(type(config)))
+      raise ValueError(
+          "For distributed runtime, Experiment class only works with"
+          "tf.contrib.learn.RunConfig for now, but provided {}".format(
+              type(config)))
 
     extra_hooks = []
     if delay_secs is None:
@@ -401,11 +412,12 @@ class Experiment(object):
       logging.info("Waiting %d secs before starting eval.", delay_secs)
       time.sleep(delay_secs)
 
-    return self._call_evaluate(input_fn=self._eval_input_fn,
-                               steps=self._eval_steps,
-                               metrics=self._eval_metrics,
-                               name=(name or "one_pass"),
-                               hooks=self._eval_hooks)
+    return self._call_evaluate(
+        input_fn=self._eval_input_fn,
+        steps=self._eval_steps,
+        metrics=self._eval_metrics,
+        name=(name or "one_pass"),
+        hooks=self._eval_hooks)
 
   @deprecated(
       "2016-10-23",
@@ -446,22 +458,33 @@ class Experiment(object):
       evaluate_checkpoint_only_once: Whether to skip evaluation of checkpoints
         that have already been evaluated. Default is `True`.
       continuous_eval_predicate_fn: A predicate function determining whether to
-        continue eval after each iteration. `predicate_fn` takes the evaluation
-        results as arguments. At the beginning of evaluation, the passed eval
-        results will be None so it's expected that the predicate function
-        handles that gracefully. When `predicate_fn` is not specified,
-        continuous eval will run in an infinite loop (if `train_steps` is None)
-        or exit once global step reaches `train_steps`.
+        continue eval after each iteration. A `predicate_fn` has one of the
+        following signatures:
+          * (eval_results) -> boolean
+          * (eval_results, checkpoint_path) -> boolean
+        Where `eval_results` is the dictionary of metric evaluations and
+        checkpoint_path is the path to the checkpoint containing the parameters
+        on which that evaluation was based.
+        At the beginning of evaluation, the passed `eval_results` will be None
+        so it's expected that the predicate function handles that gracefully.
+        When `predicate_fn` is not specified, continuous eval will run in an
+        infinite loop (if `train_steps` is None). or exit once global step
+        reaches `train_steps`.
+
       export: Whether to export from this step. Default is 'True'.
 
     Raises:
       ValueError: if `continuous_eval_predicate_fn` is neither None nor
         callable.
     """
-    if (continuous_eval_predicate_fn is not None and
-        not callable(continuous_eval_predicate_fn)):
-      raise ValueError(
-          "`continuous_eval_predicate_fn` must be a callable, or None.")
+    if continuous_eval_predicate_fn is not None:
+      if not callable(continuous_eval_predicate_fn):
+        raise ValueError(
+            "`continuous_eval_predicate_fn` must be a callable, or None.")
+      predicate_fn = _get_standardized_predicate_fn(
+          continuous_eval_predicate_fn)
+    else:
+      predicate_fn = None
 
     if delay_secs is None:
       delay_secs = self._eval_delay_secs
@@ -475,13 +498,12 @@ class Experiment(object):
     previous_path = None
     eval_result = None
     last_warning_time = 0
-    while (not continuous_eval_predicate_fn or
-           continuous_eval_predicate_fn(eval_result)):
+    while (not predicate_fn or predicate_fn(
+        eval_result, checkpoint_path=previous_path if eval_result else None)):
       # Exit if we have already reached number of steps to train.
       if self._has_training_stopped(eval_result):
         logging.info("Exiting continuous eval, global_step=%s >= "
-                     "train_step=%s",
-                     eval_result[ops.GraphKeys.GLOBAL_STEP],
+                     "train_step=%s", eval_result[ops.GraphKeys.GLOBAL_STEP],
                      self._train_steps)
         return
 
@@ -502,12 +524,13 @@ class Experiment(object):
           logging.warning(error_msg)
           last_warning_time = time.time()
       else:
-        eval_result = self._call_evaluate(input_fn=input_fn,
-                                          steps=self._eval_steps,
-                                          metrics=self._eval_metrics,
-                                          name=name,
-                                          checkpoint_path=latest_path,
-                                          hooks=self._eval_hooks)
+        eval_result = self._call_evaluate(
+            input_fn=input_fn,
+            steps=self._eval_steps,
+            metrics=self._eval_metrics,
+            name=name,
+            checkpoint_path=latest_path,
+            hooks=self._eval_hooks)
         # Ensure eval result is not None for next round of evaluation.
         if not eval_result:
           eval_result = {}
@@ -532,8 +555,8 @@ class Experiment(object):
       return False
 
     global_step = eval_result.get(ops.GraphKeys.GLOBAL_STEP)
-    return global_step and self._train_steps and (
-        global_step >= self._train_steps)
+    return global_step and self._train_steps and (global_step >=
+                                                  self._train_steps)
 
   def continuous_eval(self,
                       delay_secs=None,
@@ -652,8 +675,7 @@ class Experiment(object):
       return eval_result, export_results
 
   @experimental
-  def continuous_train_and_eval(self,
-                                continuous_eval_predicate_fn=None):
+  def continuous_train_and_eval(self, continuous_eval_predicate_fn=None):
     """Interleaves training and evaluation.
 
     The frequency of evaluation is controlled by the `train_steps_per_iteration`
@@ -682,11 +704,19 @@ class Experiment(object):
 
     Args:
       continuous_eval_predicate_fn: A predicate function determining whether to
-        continue after each iteration. `predicate_fn` takes the evaluation
-        results as its arguments. At the beginning of evaluation, the passed
-        eval results will be None so it's expected that the predicate function
-        handles that gracefully. When `predicate_fn` is not specified, this will
-        run in an infinite loop or exit when global_step reaches `train_steps`.
+        continue eval after each iteration. A `predicate_fn` has one of the
+        following signatures:
+          * (eval_results) -> boolean
+          * (eval_results, checkpoint_path) -> boolean
+        Where `eval_results` is the dictionary of metric evaluations and
+        checkpoint_path is the path to the checkpoint containing the parameters
+        on which that evaluation was based.
+        At the beginning of evaluation, the passed `eval_results` and
+        `checkpoint_path` will be None so it's expected that the predicate
+        function handles that gracefully.
+        When `predicate_fn` is not specified, continuous eval will run in an
+        infinite loop (if `train_steps` is None). or exit once global step
+        reaches `train_steps`.
 
     Returns:
       A tuple of the result of the `evaluate` call to the `Estimator` and the
@@ -697,13 +727,18 @@ class Experiment(object):
         callable.
     """
 
-    if (continuous_eval_predicate_fn is not None and
-        not callable(continuous_eval_predicate_fn)):
-      raise ValueError(
-          "`continuous_eval_predicate_fn` must be a callable, or None.")
+    if continuous_eval_predicate_fn is not None:
+      if not callable(continuous_eval_predicate_fn):
+        raise ValueError(
+            "`continuous_eval_predicate_fn` must be a callable, or None.")
+      predicate_fn = _get_standardized_predicate_fn(
+          continuous_eval_predicate_fn)
+    else:
+      predicate_fn = None
 
-    eval_result = None
     export_results = None
+    latest_checkpoint = None
+    eval_result = None
 
     # Set the default value for train_steps_per_iteration, which will be
     # overridden by other settings.
@@ -713,8 +748,9 @@ class Experiment(object):
     elif self._train_steps is not None:
       train_steps_per_iteration = int(self._train_steps / 10)
 
-    while (not continuous_eval_predicate_fn or
-           continuous_eval_predicate_fn(eval_result)):
+    while (not predicate_fn or predicate_fn(
+        eval_result, checkpoint_path=latest_checkpoint
+        if eval_result else None)):
 
       if self._has_training_stopped(eval_result):
         # Exits once max steps of training is satisfied.
@@ -729,11 +765,14 @@ class Experiment(object):
           saving_listeners=self._saving_listeners)
 
       logging.info("Evaluating model now.")
-      eval_result = self._call_evaluate(input_fn=self._eval_input_fn,
-                                        steps=self._eval_steps,
-                                        metrics=self._eval_metrics,
-                                        name="one_pass",
-                                        hooks=self._eval_hooks)
+      latest_checkpoint = saver.latest_checkpoint(self._estimator.model_dir)
+      eval_result = self._call_evaluate(
+          input_fn=self._eval_input_fn,
+          steps=self._eval_steps,
+          metrics=self._eval_metrics,
+          name="one_pass",
+          checkpoint_path=latest_checkpoint,
+          hooks=self._eval_hooks)
       export_results = self._maybe_export(eval_result)
 
     return eval_result, export_results
@@ -741,8 +780,7 @@ class Experiment(object):
   def _maybe_export(self, eval_result, checkpoint_path=None):
     """Export the Estimator using export_fn, if defined."""
     export_dir_base = os.path.join(
-        compat.as_bytes(self._estimator.model_dir),
-        compat.as_bytes("export"))
+        compat.as_bytes(self._estimator.model_dir), compat.as_bytes("export"))
 
     export_results = []
     for strategy in self._export_strategies:
@@ -780,10 +818,11 @@ class Experiment(object):
         hooks=self._train_monitors,
         saving_listeners=self._saving_listeners)
 
-    eval_result = self._call_evaluate(input_fn=self._eval_input_fn,
-                                      steps=1,
-                                      metrics=self._eval_metrics,
-                                      name="one_pass")
+    eval_result = self._call_evaluate(
+        input_fn=self._eval_input_fn,
+        steps=1,
+        metrics=self._eval_metrics,
+        name="one_pass")
     _ = self._maybe_export(eval_result)
 
     return eval_result
@@ -805,9 +844,14 @@ class Experiment(object):
     server.start()
     return server
 
-  def _call_train(self, _sentinel=None,  # pylint: disable=invalid-name,
-                  input_fn=None, steps=None, hooks=None, max_steps=None,
-                  saving_listeners=None):
+  def _call_train(
+      self,
+      _sentinel=None,  # pylint: disable=invalid-name,
+      input_fn=None,
+      steps=None,
+      hooks=None,
+      max_steps=None,
+      saving_listeners=None):
     if _sentinel is not None:
       raise ValueError("_call_train should be called with keyword args only")
 
@@ -823,14 +867,18 @@ class Experiment(object):
           hooks=hooks,
           saving_listeners=saving_listeners)
     else:
-      return self._estimator.fit(input_fn=input_fn,
-                                 steps=steps,
-                                 max_steps=max_steps,
-                                 monitors=hooks)
-
-  def _call_evaluate(self, _sentinel=None,  # pylint: disable=invalid-name,
-                     input_fn=None, steps=None, metrics=None, name=None,
-                     checkpoint_path=None, hooks=None):
+      return self._estimator.fit(
+          input_fn=input_fn, steps=steps, max_steps=max_steps, monitors=hooks)
+
+  def _call_evaluate(
+      self,
+      _sentinel=None,  # pylint: disable=invalid-name,
+      input_fn=None,
+      steps=None,
+      metrics=None,
+      name=None,
+      checkpoint_path=None,
+      hooks=None):
     if _sentinel is not None:
       raise ValueError("_call_evaluate should be called with keyword args only")
 
@@ -838,18 +886,20 @@ class Experiment(object):
       if metrics is not None:
         raise ValueError(
             "`eval_metrics` must be `None` with `tf.estimator.Estimator`")
-      return self._estimator.evaluate(input_fn=input_fn,
-                                      steps=steps,
-                                      name=name,
-                                      checkpoint_path=checkpoint_path,
-                                      hooks=hooks)
+      return self._estimator.evaluate(
+          input_fn=input_fn,
+          steps=steps,
+          name=name,
+          checkpoint_path=checkpoint_path,
+          hooks=hooks)
     else:
-      return self._estimator.evaluate(input_fn=input_fn,
-                                      steps=steps,
-                                      metrics=metrics,
-                                      name=name,
-                                      checkpoint_path=checkpoint_path,
-                                      hooks=hooks)
+      return self._estimator.evaluate(
+          input_fn=input_fn,
+          steps=steps,
+          metrics=metrics,
+          name=name,
+          checkpoint_path=checkpoint_path,
+          hooks=hooks)
 
 
 @contextlib.contextmanager
diff --git a/tensorflow/contrib/learn/python/learn/experiment_test.py b/tensorflow/contrib/learn/python/learn/experiment_test.py
index c29c198d094090a59c8c7dd2949c3f069adf49d0..545d7d8924c0c10544e6113e2968b7ae3d2090fc 100644
--- a/tensorflow/contrib/learn/python/learn/experiment_test.py
+++ b/tensorflow/contrib/learn/python/learn/experiment_test.py
@@ -492,6 +492,33 @@ class ExperimentTest(test.TestCase):
       self.assertEqual(3, est.eval_count)
       self.assertEqual([noop_hook], est.eval_hooks)
 
+  def test_continuous_eval_predicate_fn_with_checkpoint(self):
+    for est in self._estimators_for_tests():
+      eval_metrics = 'eval_metrics' if not isinstance(
+          est, core_estimator.Estimator) else None
+      est.fake_checkpoint()
+      noop_hook = _NoopHook()
+
+      def _predicate_fn(eval_result, checkpoint_path):
+        self.assertEqual(not eval_result,
+                         checkpoint_path is None)
+        return est.eval_count < 3  # pylint: disable=cell-var-from-loop
+
+      ex = experiment.Experiment(
+          est,
+          train_input_fn='train_input',
+          eval_input_fn='eval_input',
+          eval_metrics=eval_metrics,
+          eval_hooks=[noop_hook],
+          eval_delay_secs=0,
+          continuous_eval_throttle_secs=0)
+      ex.continuous_eval(
+          evaluate_checkpoint_only_once=False,
+          continuous_eval_predicate_fn=_predicate_fn)
+      self.assertEqual(0, est.fit_count)
+      self.assertEqual(3, est.eval_count)
+      self.assertEqual([noop_hook], est.eval_hooks)
+
   def test_run_local(self):
     for est in self._estimators_for_tests():
       eval_metrics = 'eval_metrics' if not isinstance(
diff --git a/tensorflow/contrib/learn/python/learn/export_strategy.py b/tensorflow/contrib/learn/python/learn/export_strategy.py
index f276aab0e6beb011a21c20fa194dd5212db796d1..55a8b824312b89e0ac66513242191f4201ac212a 100644
--- a/tensorflow/contrib/learn/python/learn/export_strategy.py
+++ b/tensorflow/contrib/learn/python/learn/export_strategy.py
@@ -26,13 +26,14 @@ __all__ = ['ExportStrategy']
 
 
 class ExportStrategy(
-    collections.namedtuple('ExportStrategy', ['name', 'export_fn'])):
+    collections.namedtuple('ExportStrategy',
+                           ['name', 'export_fn', 'strip_default_attrs'])):
   """A class representing a type of model export.
 
   Typically constructed by a utility function specific to the exporter, such as
   `saved_model_export_utils.make_export_strategy()`.
 
-  The fields are:
+  Attributes:
     name: The directory name under the export base directory where exports of
       this type will be written.
     export_fn: A function that writes an export, given an estimator, a
@@ -45,11 +46,20 @@ class ExportStrategy(
 
     The signature of this function must be one of:
 
-    * `(estimator, export_path) -> export_path`
-    * `(estimator, export_path, checkpoint_path) -> export_path`
-    * `(estimator, export_path, checkpoint_path, eval_result) -> export_path`
+      * `(estimator, export_path) -> export_path`
+      * `(estimator, export_path, checkpoint_path) -> export_path`
+      * `(estimator, export_path, checkpoint_path, eval_result) -> export_path`
+      * `(estimator, export_path, checkpoint_path, eval_result,
+          strip_default_attrs) -> export_path`
+    strip_default_attrs: (Optional) Boolean. If set as True, default attrs in
+        the `GraphDef` will be stripped on write. This is recommended for better
+        forward compatibility of the resulting `SavedModel`.
   """
 
+  def __new__(cls, name, export_fn, strip_default_attrs=None):
+    return super(ExportStrategy, cls).__new__(
+        cls, name, export_fn, strip_default_attrs)
+
   def export(self,
              estimator,
              export_path,
@@ -83,5 +93,6 @@ class ExportStrategy(
         raise ValueError('An export_fn accepting eval_result must also accept '
                          'checkpoint_path.')
       kwargs['eval_result'] = eval_result
-
+    if 'strip_default_attrs' in export_fn_args:
+      kwargs['strip_default_attrs'] = self.strip_default_attrs
     return self.export_fn(estimator, export_path, **kwargs)
diff --git a/tensorflow/contrib/learn/python/learn/export_strategy_test.py b/tensorflow/contrib/learn/python/learn/export_strategy_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..43c3551cccc3b8e6b66bd2b36839a3dfc5fe8eea
--- /dev/null
+++ b/tensorflow/contrib/learn/python/learn/export_strategy_test.py
@@ -0,0 +1,89 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ExportStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.learn.python.learn import export_strategy
+from tensorflow.python.platform import test
+
+
+class ExportStrategyTest(test.TestCase):
+
+  def test_no_optional_args_export(self):
+    model_path = '/path/to/model'
+    def _export_fn(estimator, export_path):
+      self.assertTupleEqual((estimator, export_path), (None, None))
+      return model_path
+
+    strategy = export_strategy.ExportStrategy('foo', _export_fn)
+    self.assertTupleEqual(strategy, ('foo', _export_fn, None))
+    self.assertIs(strategy.export(None, None), model_path)
+
+  def test_checkpoint_export(self):
+    ckpt_model_path = '/path/to/checkpoint_model'
+    def _ckpt_export_fn(estimator, export_path, checkpoint_path):
+      self.assertTupleEqual((estimator, export_path), (None, None))
+      self.assertEqual(checkpoint_path, 'checkpoint')
+      return ckpt_model_path
+
+    strategy = export_strategy.ExportStrategy('foo', _ckpt_export_fn)
+    self.assertTupleEqual(strategy, ('foo', _ckpt_export_fn, None))
+    self.assertIs(strategy.export(None, None, 'checkpoint'), ckpt_model_path)
+
+  def test_checkpoint_eval_export(self):
+    ckpt_eval_model_path = '/path/to/checkpoint_eval_model'
+    def _ckpt_eval_export_fn(estimator, export_path, checkpoint_path,
+                             eval_result):
+      self.assertTupleEqual((estimator, export_path), (None, None))
+      self.assertEqual(checkpoint_path, 'checkpoint')
+      self.assertEqual(eval_result, 'eval')
+      return ckpt_eval_model_path
+
+    strategy = export_strategy.ExportStrategy('foo', _ckpt_eval_export_fn)
+    self.assertTupleEqual(strategy, ('foo', _ckpt_eval_export_fn, None))
+    self.assertIs(strategy.export(None, None, 'checkpoint', 'eval'),
+                  ckpt_eval_model_path)
+
+  def test_eval_only_export(self):
+    def _eval_export_fn(estimator, export_path, eval_result):
+      del estimator, export_path, eval_result
+
+    strategy = export_strategy.ExportStrategy('foo', _eval_export_fn)
+    self.assertTupleEqual(strategy, ('foo', _eval_export_fn, None))
+    with self.assertRaisesRegexp(ValueError, 'An export_fn accepting '
+                                 'eval_result must also accept '
+                                 'checkpoint_path'):
+      strategy.export(None, None, eval_result='eval')
+
+  def test_strip_default_attr_export(self):
+    strip_default_attrs_model_path = '/path/to/strip_default_attrs_model'
+    def _strip_default_attrs_export_fn(estimator, export_path,
+                                       strip_default_attrs):
+      self.assertTupleEqual((estimator, export_path), (None, None))
+      self.assertTrue(strip_default_attrs)
+      return strip_default_attrs_model_path
+
+    strategy = export_strategy.ExportStrategy('foo',
+                                              _strip_default_attrs_export_fn,
+                                              True)
+    self.assertTupleEqual(strategy,
+                          ('foo', _strip_default_attrs_export_fn, True))
+    self.assertIs(strategy.export(None, None), strip_default_attrs_model_path)
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
index 86fad4c5535a918d87e0741687cfebe3afaf9ddf..96be8b1bc402479d5611965f27abb197363cb939 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
@@ -35,6 +35,7 @@ from tensorflow.python.platform import tf_logging as logging
 # pylint: disable=g-multiple-import,g-bad-import-order
 from .pandas_io import HAS_PANDAS, extract_pandas_data, extract_pandas_matrix, extract_pandas_labels
 from .dask_io import HAS_DASK, extract_dask_data, extract_dask_labels
+
 # pylint: enable=g-multiple-import,g-bad-import-order
 
 
@@ -74,11 +75,11 @@ def _get_in_out_shape(x_shape, y_shape, n_classes, batch_size=None):
   if not y_is_dict:
     output_shape = out_el_shape(y_shape, n_classes)
   else:
-    output_shape = dict([
-        (k, out_el_shape(v, n_classes[k]
-                         if n_classes is not None and k in n_classes else None))
-        for k, v in list(y_shape.items())
-    ])
+    output_shape = dict([(k,
+                          out_el_shape(v, n_classes[k]
+                                       if n_classes is not None and
+                                       k in n_classes else None))
+                         for k, v in list(y_shape.items())])
 
   return input_shape, output_shape, batch_size
 
@@ -314,23 +315,23 @@ class DataFeeder(object):
       input_dtype: DType of input (or dictionary of shapes).
       output_dtype: DType of output (or dictionary of shapes.
     """
-    x_is_dict, y_is_dict = isinstance(x, dict), y is not None and isinstance(
-        y, dict)
+    x_is_dict, y_is_dict = isinstance(
+        x, dict), y is not None and isinstance(y, dict)
     if isinstance(y, list):
       y = np.array(y)
 
     self._x = dict([(k, check_array(v, v.dtype)) for k, v in list(x.items())
                    ]) if x_is_dict else check_array(x, x.dtype)
-    self._y = None if y is None else (
-        dict([(k, check_array(v, v.dtype)) for k, v in list(y.items())])
-        if y_is_dict else check_array(y, y.dtype))
+    self._y = None if y is None else (dict(
+        [(k, check_array(v, v.dtype)) for k, v in list(y.items())])
+                                      if y_is_dict else check_array(y, y.dtype))
 
     # self.n_classes is not None means we're converting raw target indices
     # to one-hot.
     if n_classes is not None:
       if not y_is_dict:
-        y_dtype = (np.int64
-                   if n_classes is not None and n_classes > 1 else np.float32)
+        y_dtype = (
+            np.int64 if n_classes is not None and n_classes > 1 else np.float32)
         self._y = (None if y is None else check_array(y, dtype=y_dtype))
 
     self.n_classes = n_classes
@@ -352,8 +353,8 @@ class DataFeeder(object):
     # self._output_dtype == np.float32 when y is None
     self._output_dtype = (
         dict([(k, _check_dtype(v.dtype)) for k, v in list(self._y.items())])
-        if y_is_dict else (
-            _check_dtype(self._y.dtype) if y is not None else np.float32))
+        if y_is_dict else (_check_dtype(self._y.dtype)
+                           if y is not None else np.float32))
 
     # self.n_classes is None means we're passing in raw target indices
     if n_classes is not None and y_is_dict:
@@ -478,8 +479,8 @@ class DataFeeder(object):
 
     # Assign input features from random indices.
     def extract(data, indices):
-      return (np.array(_access(data, indices)).reshape((indices.shape[0], 1)) if
-              len(data.shape) == 1 else _access(data, indices))
+      return (np.array(_access(data, indices)).reshape((indices.shape[0], 1))
+              if len(data.shape) == 1 else _access(data, indices))
 
     # assign labels from random indices
     def assign_label(data, shape, dtype, n_classes, indices):
@@ -511,16 +512,18 @@ class DataFeeder(object):
         feed_dict[self._epoch_placeholder.name] = [self.epoch]
 
       # Take next batch of indices.
-      x_len = list(self._x.values())[0].shape[
-          0] if x_is_dict else self._x.shape[0]
+      x_len = list(
+          self._x.values())[0].shape[0] if x_is_dict else self._x.shape[0]
       end = min(x_len, self.offset + self._batch_size)
       batch_indices = self.indices[self.offset:end]
 
       # adding input placeholder
       feed_dict.update(
           dict([(self._input_placeholder[k].name, extract(v, batch_indices))
-                for k, v in list(self._x.items())]) if x_is_dict else
-          {self._input_placeholder.name: extract(self._x, batch_indices)})
+                for k, v in list(self._x.items())]) if x_is_dict else {
+                    self._input_placeholder.name:
+                        extract(self._x, batch_indices)
+                })
 
       # move offset and reset it if necessary
       self.offset += self._batch_size
@@ -545,7 +548,8 @@ class DataFeeder(object):
                   assign_label(v, shape, dtype, n_classes, batch_indices)
           })
       else:
-        shape, dtype, n_classes = self.output_shape, self._output_dtype, self.n_classes
+        shape, dtype, n_classes = (self.output_shape, self._output_dtype,
+                                   self.n_classes)
         feed_dict.update({
             self._output_placeholder.name:
                 assign_label(self._y, shape, dtype, n_classes, batch_indices)
@@ -621,8 +625,9 @@ class StreamingDataFeeder(DataFeeder):
     elif y is None:
       y_first_el_shape = None
     else:
-      y_first_el_shape = ([1] + list(y_first_el[0].shape if isinstance(
-          y_first_el, list) else y_first_el.shape))
+      y_first_el_shape = (
+          [1] + list(y_first_el[0].shape
+                     if isinstance(y_first_el, list) else y_first_el.shape))
 
     self.input_shape, self.output_shape, self._batch_size = _get_in_out_shape(
         x_first_el_shape, y_first_el_shape, n_classes, batch_size)
@@ -683,8 +688,8 @@ class StreamingDataFeeder(DataFeeder):
         if shape is None:
           return None
         elif isinstance(shape, dict):
-          return dict([(k, np.zeros(shape[k], dtype[k]))
-                       for k in list(shape.keys())])
+          return dict(
+              [(k, np.zeros(shape[k], dtype[k])) for k in list(shape.keys())])
         else:
           return np.zeros(shape, dtype=dtype)
 
@@ -857,8 +862,8 @@ class DaskDataFeeder(object):
     """Returns a function, that will sample data and provide it to placeholders.
 
     Args:
-      input_placeholder: tf.Placeholder for input features mini batch.
-      output_placeholder: tf.Placeholder for output labels.
+      input_placeholder: tf.placeholder for input features mini batch.
+      output_placeholder: tf.placeholder for output labels.
 
     Returns:
       A function that when called samples a random subset of batch size
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py b/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
index 4b34fc62849766370979bb2002d42ee03ea7161a..3a46c239688017f9204d2c6182a6f81cd325a417 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.layers import utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import io_ops
@@ -280,14 +281,33 @@ def _get_file_names(file_pattern, randomize_input):
 
 def _get_examples(file_name_queue, reader, num_threads, read_batch_size,
                   filter_fn, parse_fn):
+  """Get example filenames matching.
+
+  Args:
+    file_name_queue: A queue implementation that dequeues elements in
+      first-in first-out order.
+    reader: A function or class that returns an object with
+      `read` method, (filename tensor) -> (example tensor).
+    num_threads: The number of threads enqueuing examples.
+    read_batch_size: An int or scalar `Tensor` specifying the number of
+      records to read at once.
+    filter_fn: Filtering function, takes both keys as well as an `Example`
+      Tensors and returns a boolean mask of the same shape as the input Tensors
+      to be applied for filtering. If `None`, no filtering is done.
+    parse_fn: Parsing function, takes `Example` Tensor returns parsed
+      representation. If `None`, no parsing is done.
+
+  Returns:
+    List of example file names matching `file_name_queue`.
+  """
   with ops.name_scope('read'):
     example_list = []
     for _ in range(num_threads):
-      if read_batch_size > 1:
-        keys, examples_proto = reader().read_up_to(file_name_queue,
-                                                   read_batch_size)
-      else:
-        keys, examples_proto = reader().read(file_name_queue)
+      keys, examples_proto = utils.smart_cond(
+          read_batch_size > 1,
+          lambda: reader().read_up_to(file_name_queue, read_batch_size),
+          lambda: reader().read(file_name_queue))
+
       if filter_fn:
         mask = filter_fn(keys, examples_proto)
         keys = array_ops.boolean_mask(keys, mask)
@@ -379,14 +399,15 @@ def _read_keyed_batch_examples_helper(file_pattern,
             capacity=1, dtypes=[dtypes.string], shapes=[[]])
         enqueue_op = file_name_queue.enqueue(
             input_pipeline_ops.seek_next(
-                file_names, shuffle=randomize_input, num_epochs=num_epochs,
+                file_names,
+                shuffle=randomize_input,
+                num_epochs=num_epochs,
                 seed=seed))
         queue_runner.add_queue_runner(
             queue_runner.QueueRunner(file_name_queue, [enqueue_op]))
       else:
         file_name_queue = input_ops.string_input_producer(
-            constant_op.constant(
-                file_names, name='input'),
+            constant_op.constant(file_names, name='input'),
             shuffle=randomize_input,
             num_epochs=num_epochs,
             name=file_name_queue_scope,
@@ -496,7 +517,8 @@ def read_keyed_batch_features(file_pattern,
   """
 
   with ops.name_scope(name, 'read_batch_features', [file_pattern]) as scope:
-    if read_batch_size is None: read_batch_size = batch_size
+    if read_batch_size is None:
+      read_batch_size = batch_size
     keys, examples = read_keyed_batch_examples(
         file_pattern,
         batch_size,
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py b/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
index 6f0fd9a2976d37d1c701a96f50c2b987562cb191..e11e8b698adc113486bbb45572c8129e964cc931 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
@@ -204,8 +204,7 @@ class GraphIOTest(test.TestCase):
     shape = (0,)
     features = {
         "feature":
-            parsing_ops.FixedLenFeature(
-                shape=shape, dtype=dtypes_lib.float32)
+            parsing_ops.FixedLenFeature(shape=shape, dtype=dtypes_lib.float32)
     }
 
     with ops.Graph().as_default() as g, self.test_session(graph=g) as sess:
@@ -255,8 +254,8 @@ class GraphIOTest(test.TestCase):
       self.assertAllEqual((None,), inputs.get_shape().as_list())
       self.assertEqual("%s:1" % name, inputs.name)
       file_name_queue_name = "%s/file_name_queue" % name
-      file_name_queue_limit_name = ("%s/limit_epochs/epochs" %
-                                    file_name_queue_name)
+      file_name_queue_limit_name = (
+          "%s/limit_epochs/epochs" % file_name_queue_name)
       file_names_name = "%s/input" % file_name_queue_name
       example_queue_name = "%s/random_shuffle_queue" % name
       op_nodes = test_util.assert_ops_in_graph({
@@ -354,8 +353,8 @@ class GraphIOTest(test.TestCase):
     json_lines = [
         "".join([
             '{"features": { "feature": { "sequence": {',
-            '"bytes_list": { "value": ["', base64.b64encode(l).decode("ascii"),
-            '"]}}}}}\n'
+            '"bytes_list": { "value": ["',
+            base64.b64encode(l).decode("ascii"), '"]}}}}}\n'
         ]) for l in lines
     ]
     return self._create_temp_file("".join(json_lines))
@@ -823,6 +822,31 @@ class GraphIOTest(test.TestCase):
       coord.request_stop()
       coord.join(threads)
 
+  def test_read_keyed_batch_features_shared_queue(self):
+    batch_size = 17
+    shape = (0,)
+    fixed_feature = parsing_ops.FixedLenFeature(
+        shape=shape, dtype=dtypes_lib.float32)
+    feature = {"feature": fixed_feature}
+    reader = io_ops.TFRecordReader
+
+    _, queued_feature = graph_io.read_keyed_batch_features_shared_queue(
+        _VALID_FILE_PATTERN, batch_size, feature, reader)
+
+    with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
+      features_result = graph_io.read_batch_features(
+          _VALID_FILE_PATTERN, batch_size, feature, reader)
+      session.run(variables.local_variables_initializer())
+
+    self.assertAllEqual(
+        queued_feature.get("feature").get_shape().as_list(),
+        features_result.get("feature").get_shape().as_list())
+
+  def test_get_file_names_errors(self):
+    # Raise bad file_pattern.
+    with self.assertRaises(ValueError):
+      graph_io._get_file_names([], True)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/numpy_io_test.py b/tensorflow/contrib/learn/python/learn/learn_io/numpy_io_test.py
deleted file mode 100644
index 6fe8de8705b8854e5861879d2a505fe03fddc7e5..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/learn/python/learn/learn_io/numpy_io_test.py
+++ /dev/null
@@ -1,280 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for numpy_io."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.learn.python.learn.learn_io import numpy_io
-from tensorflow.python.framework import errors
-from tensorflow.python.platform import test
-from tensorflow.python.training import coordinator
-from tensorflow.python.training import queue_runner_impl
-
-
-class NumpyIoTest(test.TestCase):
-
-  def testNumpyInputFn(self):
-    a = np.arange(4) * 1.0
-    b = np.arange(32, 36)
-    x = {'a': a, 'b': b}
-    y = np.arange(-32, -28)
-
-    with self.test_session() as session:
-      input_fn = numpy_io.numpy_input_fn(
-          x, y, batch_size=2, shuffle=False, num_epochs=1)
-      features, target = input_fn()
-
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(session, coord=coord)
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [0, 1])
-      self.assertAllEqual(res[0]['b'], [32, 33])
-      self.assertAllEqual(res[1], [-32, -31])
-
-      session.run([features, target])
-      with self.assertRaises(errors.OutOfRangeError):
-        session.run([features, target])
-
-      coord.request_stop()
-      coord.join(threads)
-
-  def testNumpyInputFnWithVeryLargeBatchSizeAndMultipleEpochs(self):
-    a = np.arange(2) * 1.0
-    b = np.arange(32, 34)
-    x = {'a': a, 'b': b}
-    y = np.arange(-32, -30)
-
-    with self.test_session() as session:
-      input_fn = numpy_io.numpy_input_fn(
-          x, y, batch_size=128, shuffle=False, num_epochs=2)
-      features, target = input_fn()
-
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(session, coord=coord)
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [0, 1, 0, 1])
-      self.assertAllEqual(res[0]['b'], [32, 33, 32, 33])
-      self.assertAllEqual(res[1], [-32, -31, -32, -31])
-
-      with self.assertRaises(errors.OutOfRangeError):
-        session.run([features, target])
-
-      coord.request_stop()
-      coord.join(threads)
-
-  def testNumpyInputFnWithZeroEpochs(self):
-    a = np.arange(4) * 1.0
-    b = np.arange(32, 36)
-    x = {'a': a, 'b': b}
-    y = np.arange(-32, -28)
-
-    with self.test_session() as session:
-      input_fn = numpy_io.numpy_input_fn(
-          x, y, batch_size=2, shuffle=False, num_epochs=0)
-      features, target = input_fn()
-
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(session, coord=coord)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        session.run([features, target])
-
-      coord.request_stop()
-      coord.join(threads)
-
-  def testNumpyInputFnWithBatchSizeNotDividedByDataSize(self):
-    batch_size = 2
-    a = np.arange(5) * 1.0
-    b = np.arange(32, 37)
-    x = {'a': a, 'b': b}
-    y = np.arange(-32, -27)
-
-    with self.test_session() as session:
-      input_fn = numpy_io.numpy_input_fn(
-          x, y, batch_size=batch_size, shuffle=False, num_epochs=1)
-      features, target = input_fn()
-
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(session, coord=coord)
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [0, 1])
-      self.assertAllEqual(res[0]['b'], [32, 33])
-      self.assertAllEqual(res[1], [-32, -31])
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [2, 3])
-      self.assertAllEqual(res[0]['b'], [34, 35])
-      self.assertAllEqual(res[1], [-30, -29])
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [4])
-      self.assertAllEqual(res[0]['b'], [36])
-      self.assertAllEqual(res[1], [-28])
-
-      with self.assertRaises(errors.OutOfRangeError):
-        session.run([features, target])
-
-      coord.request_stop()
-      coord.join(threads)
-
-  def testNumpyInputFnWithBatchSizeNotDividedByDataSizeAndMultipleEpochs(self):
-    batch_size = 2
-    a = np.arange(3) * 1.0
-    b = np.arange(32, 35)
-    x = {'a': a, 'b': b}
-    y = np.arange(-32, -29)
-
-    with self.test_session() as session:
-      input_fn = numpy_io.numpy_input_fn(
-          x, y, batch_size=batch_size, shuffle=False, num_epochs=3)
-      features, target = input_fn()
-
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(session, coord=coord)
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [0, 1])
-      self.assertAllEqual(res[0]['b'], [32, 33])
-      self.assertAllEqual(res[1], [-32, -31])
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [2, 0])
-      self.assertAllEqual(res[0]['b'], [34, 32])
-      self.assertAllEqual(res[1], [-30, -32])
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [1, 2])
-      self.assertAllEqual(res[0]['b'], [33, 34])
-      self.assertAllEqual(res[1], [-31, -30])
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [0, 1])
-      self.assertAllEqual(res[0]['b'], [32, 33])
-      self.assertAllEqual(res[1], [-32, -31])
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [2])
-      self.assertAllEqual(res[0]['b'], [34])
-      self.assertAllEqual(res[1], [-30])
-
-      with self.assertRaises(errors.OutOfRangeError):
-        session.run([features, target])
-
-      coord.request_stop()
-      coord.join(threads)
-
-  def testNumpyInputFnWithBatchSizeLargerThanDataSize(self):
-    batch_size = 10
-    a = np.arange(4) * 1.0
-    b = np.arange(32, 36)
-    x = {'a': a, 'b': b}
-    y = np.arange(-32, -28)
-
-    with self.test_session() as session:
-      input_fn = numpy_io.numpy_input_fn(
-          x, y, batch_size=batch_size, shuffle=False, num_epochs=1)
-      features, target = input_fn()
-
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(session, coord=coord)
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [0, 1, 2, 3])
-      self.assertAllEqual(res[0]['b'], [32, 33, 34, 35])
-      self.assertAllEqual(res[1], [-32, -31, -30, -29])
-
-      with self.assertRaises(errors.OutOfRangeError):
-        session.run([features, target])
-
-      coord.request_stop()
-      coord.join(threads)
-
-  def testNumpyInputFnWithDifferentDimensionsOfFeatures(self):
-    a = np.array([[1, 2], [3, 4]])
-    b = np.array([5, 6])
-    x = {'a': a, 'b': b}
-    y = np.arange(-32, -30)
-
-    with self.test_session() as session:
-      input_fn = numpy_io.numpy_input_fn(
-          x, y, batch_size=2, shuffle=False, num_epochs=1)
-      features, target = input_fn()
-
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(session, coord=coord)
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [[1, 2], [3, 4]])
-      self.assertAllEqual(res[0]['b'], [5, 6])
-      self.assertAllEqual(res[1], [-32, -31])
-
-      coord.request_stop()
-      coord.join(threads)
-
-  def testNumpyInputFnWithXAsNonDict(self):
-    x = np.arange(32, 36)
-    y = np.arange(4)
-    with self.test_session():
-      with self.assertRaisesRegexp(TypeError, 'x must be dict'):
-        failing_input_fn = numpy_io.numpy_input_fn(
-            x, y, batch_size=2, shuffle=False, num_epochs=1)
-        failing_input_fn()
-
-  def testNumpyInputFnWithTargetKeyAlreadyInX(self):
-    array = np.arange(32, 36)
-    x = {'__target_key__': array}
-    y = np.arange(4)
-
-    with self.test_session():
-      input_fn = numpy_io.numpy_input_fn(
-          x, y, batch_size=2, shuffle=False, num_epochs=1)
-      input_fn()
-      self.assertAllEqual(x['__target_key__'], array)
-      self.assertItemsEqual(x.keys(), ['__target_key__'])
-
-  def testNumpyInputFnWithMismatchLengthOfInputs(self):
-    a = np.arange(4) * 1.0
-    b = np.arange(32, 36)
-    x = {'a': a, 'b': b}
-    x_mismatch_length = {'a': np.arange(1), 'b': b}
-    y_longer_length = np.arange(10)
-
-    with self.test_session():
-      with self.assertRaisesRegexp(
-          ValueError, 'Length of tensors in x and y is mismatched.'):
-        failing_input_fn = numpy_io.numpy_input_fn(
-            x, y_longer_length, batch_size=2, shuffle=False, num_epochs=1)
-        failing_input_fn()
-
-      with self.assertRaisesRegexp(
-          ValueError, 'Length of tensors in x and y is mismatched.'):
-        failing_input_fn = numpy_io.numpy_input_fn(
-            x=x_mismatch_length,
-            y=None,
-            batch_size=2,
-            shuffle=False,
-            num_epochs=1)
-        failing_input_fn()
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/learn/python/learn/metric_spec.py b/tensorflow/contrib/learn/python/learn/metric_spec.py
index ed6683abedbb8ae76ba364405158eb52cbb6d762..6440bc204b8e339ff51311dcc87b36f556b94092 100644
--- a/tensorflow/contrib/learn/python/learn/metric_spec.py
+++ b/tensorflow/contrib/learn/python/learn/metric_spec.py
@@ -42,10 +42,8 @@ def _args(fn):
   """
   if hasattr(fn, 'func') and hasattr(fn, 'keywords'):
     # Handle functools.partial and similar objects.
-    return tuple([
-        arg for arg in tf_inspect.getargspec(fn.func).args
-        if arg not in set(fn.keywords.keys())
-    ])
+    return tuple(
+        [arg for arg in _args(fn.func) if arg not in set(fn.keywords.keys())])
   # Handle function.
   return tuple(tf_inspect.getargspec(fn).args)
 
diff --git a/tensorflow/contrib/learn/python/learn/monitors.py b/tensorflow/contrib/learn/python/learn/monitors.py
index 3e0b1ad21a9a4a08fa94c8e9796f2b0dd5f8d622..51381a7427c919592b8e818c4b46dba974992610 100644
--- a/tensorflow/contrib/learn/python/learn/monitors.py
+++ b/tensorflow/contrib/learn/python/learn/monitors.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Monitors instrument the training process.
 
 @@get_default_monitors
@@ -151,8 +150,8 @@ class BaseMonitor(object):
       ValueError: if we've not begun an epoch, or `epoch` number does not match.
     """
     if self._current_epoch != epoch:
-      raise ValueError(
-          "epoch_end expected %s but got %s.", self._current_epoch, epoch)
+      raise ValueError("epoch_end expected %s but got %s.", self._current_epoch,
+                       epoch)
     self._current_epoch = None
 
   def step_begin(self, step):
@@ -171,8 +170,8 @@ class BaseMonitor(object):
       ValueError: if we've already begun a step, or `step` < 0, or
           `step` > `max_steps`.
     """
-    if (step < 0) or (
-        (self._max_steps is not None) and (step > self._max_steps)):
+    if (step < 0) or ((self._max_steps is not None) and
+                      (step > self._max_steps)):
       raise ValueError("Invalid step %s." % step)
     self._current_step = step
     return []
@@ -203,8 +202,8 @@ class BaseMonitor(object):
       ValueError: if we've not begun a step, or `step` number does not match.
     """
     if self._current_step != step:
-      raise ValueError(
-          "step_end expected %s but got %s.", self._current_step, step)
+      raise ValueError("step_end expected %s but got %s.", self._current_step,
+                       step)
     self._current_step = None
     return False
 
@@ -253,6 +252,7 @@ class EveryN(BaseMonitor):
   treatment.
 
   """
+
   # TODO(ipolosukhin): Add also every n seconds.
 
   def __init__(self, every_n_steps=100, first_n_steps=1):
@@ -475,8 +475,8 @@ class LoggingTrainable(EveryN):
     super(LoggingTrainable, self).every_n_step_begin(step)
     # Get a list of trainable variables at the beginning of every N steps.
     # We cannot get this in __init__ because train_op has not been generated.
-    trainables = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES,
-                                    scope=self._scope)
+    trainables = ops.get_collection(
+        ops.GraphKeys.TRAINABLE_VARIABLES, scope=self._scope)
     self._names = {}
     for var in trainables:
       self._names[var.name] = var.value().name
@@ -561,12 +561,19 @@ class ValidationMonitor(EveryN):
   provided.
   """
 
-  def __init__(self, x=None, y=None, input_fn=None, batch_size=None,
+  def __init__(self,
+               x=None,
+               y=None,
+               input_fn=None,
+               batch_size=None,
                eval_steps=None,
-               every_n_steps=100, metrics=None, hooks=None,
+               every_n_steps=100,
+               metrics=None,
+               hooks=None,
                early_stopping_rounds=None,
                early_stopping_metric="loss",
-               early_stopping_metric_minimize=True, name=None):
+               early_stopping_metric_minimize=True,
+               name=None):
     """Initializes a ValidationMonitor.
 
     Args:
@@ -597,8 +604,8 @@ class ValidationMonitor(EveryN):
     Raises:
       ValueError: If both x and input_fn are provided.
     """
-    super(ValidationMonitor, self).__init__(every_n_steps=every_n_steps,
-                                            first_n_steps=-1)
+    super(ValidationMonitor, self).__init__(
+        every_n_steps=every_n_steps, first_n_steps=-1)
     # TODO(mdan): Checks like this are already done by evaluate.
     if x is None and input_fn is None:
       raise ValueError("Either x or input_fn should be provided.")
@@ -654,20 +661,27 @@ class ValidationMonitor(EveryN):
 
   def _evaluate_estimator(self):
     if isinstance(self._estimator, core_estimator.Estimator):
-      if any((x is not None for x in
-              [self.x, self.y, self.batch_size, self.metrics])):
+      if any((x is not None
+              for x in [self.x, self.y, self.batch_size, self.metrics])):
         raise ValueError(
             "tf.estimator.Estimator does not support following "
             "arguments: x, y, batch_size, metrics. Should set as `None` "
             "in ValidationMonitor")
       return self._estimator.evaluate(
-          input_fn=self.input_fn, steps=self.eval_steps, hooks=self.hooks,
+          input_fn=self.input_fn,
+          steps=self.eval_steps,
+          hooks=self.hooks,
           name=self.name)
     else:
       return self._estimator.evaluate(
-          x=self.x, y=self.y, input_fn=self.input_fn,
-          batch_size=self.batch_size, steps=self.eval_steps,
-          metrics=self.metrics, hooks=self.hooks, name=self.name)
+          x=self.x,
+          y=self.y,
+          input_fn=self.input_fn,
+          batch_size=self.batch_size,
+          steps=self.eval_steps,
+          metrics=self.metrics,
+          hooks=self.hooks,
+          name=self.name)
 
   def every_n_step_end(self, step, outputs):
     super(ValidationMonitor, self).every_n_step_end(step, outputs)
@@ -700,8 +714,9 @@ class ValidationMonitor(EveryN):
     # Early stopping logic.
     if self.early_stopping_rounds is not None:
       if self.early_stopping_metric not in validation_outputs:
-        raise ValueError("Metric %s missing from outputs %s." % (
-            self.early_stopping_metric, set(validation_outputs.keys())))
+        raise ValueError("Metric %s missing from outputs %s." %
+                         (self.early_stopping_metric,
+                          set(validation_outputs.keys())))
       current_value = validation_outputs[self.early_stopping_metric]
       if (self._best_value is None or (self.early_stopping_metric_minimize and
                                        (current_value < self._best_value)) or
@@ -712,9 +727,9 @@ class ValidationMonitor(EveryN):
         self._best_value_step = step
       stop_now = (step - self._best_value_step >= self.early_stopping_rounds)
       if stop_now:
-        logging.info("Stopping. Best step: {} with {} = {}."
-                     .format(self._best_value_step,
-                             self.early_stopping_metric, self._best_value))
+        logging.info("Stopping. Best step: {} with {} = {}.".format(
+            self._best_value_step, self.early_stopping_metric,
+            self._best_value))
         self._early_stopped = True
         return True
     return False
@@ -763,8 +778,11 @@ class CaptureVariable(EveryN):
     self._var_values[step] = _extract_output(outputs, self._var_name)
 
 
-def get_default_monitors(loss_op=None, summary_op=None, save_summary_steps=100,
-                         output_dir=None, summary_writer=None):
+def get_default_monitors(loss_op=None,
+                         summary_op=None,
+                         save_summary_steps=100,
+                         output_dir=None,
+                         summary_writer=None):
   """Returns a default set of typically-used monitors.
 
   Args:
@@ -782,9 +800,12 @@ def get_default_monitors(loss_op=None, summary_op=None, save_summary_steps=100,
   if loss_op is not None:
     monitors.append(PrintTensor(tensor_names={"loss": loss_op.name}))
   if summary_op is not None:
-    monitors.append(SummarySaver(summary_op, save_steps=save_summary_steps,
-                                 output_dir=output_dir,
-                                 summary_writer=summary_writer))
+    monitors.append(
+        SummarySaver(
+            summary_op,
+            save_steps=save_summary_steps,
+            output_dir=output_dir,
+            summary_writer=summary_writer))
   return monitors
 
 
@@ -794,8 +815,10 @@ class GraphDump(BaseMonitor):
   Note, this is very expensive, prefer `PrintTensor` in production.
   """
 
-  IGNORE_OPS = ["Const", "Assign", "Identity", "Placeholder",
-                "RandomUniform", "Cast", "RestoreSlice"]
+  IGNORE_OPS = [
+      "Const", "Assign", "Identity", "Placeholder", "RandomUniform", "Cast",
+      "RestoreSlice"
+  ]
 
   def __init__(self, ignore_ops=None):
     """Initializes GraphDump monitor.
@@ -856,7 +879,7 @@ class GraphDump(BaseMonitor):
     this_output = self.data[step] if step in self.data else {}
     other_output = other_dump.data[step] if step in other_dump.data else {}
     for key in this_output:
-      if not isinstance(key, str) and not isinstance(key, unicode):
+      if not isinstance(key, six.string_types):
         continue
       if key not in other_output:
         raise ValueError("%s missing at step %s.", (key, step))
@@ -881,8 +904,8 @@ class ExportMonitor(EveryN):
   """Monitor that exports Estimator every N steps."""
 
   @deprecation.deprecated("2017-03-25",
-              "ExportMonitor is deprecated. Please pass an "
-              "ExportStrategy to Experiment instead.")
+                          "ExportMonitor is deprecated. Please pass an "
+                          "ExportStrategy to Experiment instead.")
   def __init__(self,
                every_n_steps,
                export_dir,
@@ -1088,8 +1111,7 @@ class CheckpointSaver(BaseMonitor):
 class StepCounter(EveryN):
   """Steps per second monitor."""
 
-  def __init__(self, every_n_steps=100, output_dir=None,
-               summary_writer=None):
+  def __init__(self, every_n_steps=100, output_dir=None, summary_writer=None):
     super(StepCounter, self).__init__(every_n_steps=every_n_steps)
     self._summary_tag = "global_step/sec"
     self._last_reported_step = None
@@ -1101,7 +1123,8 @@ class StepCounter(EveryN):
   def set_estimator(self, estimator):
     super(StepCounter, self).set_estimator(estimator)
     if self._summary_writer is None:
-      self._summary_writer = core_summary.FileWriterCache.get(estimator.model_dir)
+      self._summary_writer = core_summary.FileWriterCache.get(
+          estimator.model_dir)
 
   def every_n_step_end(self, current_step, outputs):
     current_time = time.time()
@@ -1109,8 +1132,9 @@ class StepCounter(EveryN):
       added_steps = current_step - self._last_reported_step
       elapsed_time = current_time - self._last_reported_time
       steps_per_sec = added_steps / elapsed_time
-      summary = Summary(value=[Summary.Value(tag=self._summary_tag,
-                                             simple_value=steps_per_sec)])
+      summary = Summary(value=[
+          Summary.Value(tag=self._summary_tag, simple_value=steps_per_sec)
+      ])
       self._summary_writer.add_summary(summary, current_step)
     self._last_reported_step = current_step
     self._last_reported_time = current_time
diff --git a/tensorflow/contrib/learn/python/learn/ops/ops_test.py b/tensorflow/contrib/learn/python/learn/ops/ops_test.py
index d0b9eb8abcbee187b6c53b7b419882f0a1e7da51..80d4923db37feb2a1304218f501ab51f9e0d9a14 100644
--- a/tensorflow/contrib/learn/python/learn/ops/ops_test.py
+++ b/tensorflow/contrib/learn/python/learn/ops/ops_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.layers import conv2d
 from tensorflow.contrib.learn.python.learn import ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
diff --git a/tensorflow/contrib/learn/python/learn/trainable.py b/tensorflow/contrib/learn/python/learn/trainable.py
index 972fec026f25d39dca75e8c5bafffb57fcd323fa..429b6040be21d8cbe1f2bba58090366552fdfbe7 100644
--- a/tensorflow/contrib/learn/python/learn/trainable.py
+++ b/tensorflow/contrib/learn/python/learn/trainable.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """`Trainable` interface."""
 
 from __future__ import absolute_import
@@ -28,18 +27,31 @@ class Trainable(object):
   __metaclass__ = abc.ABCMeta
 
   @abc.abstractmethod
-  def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None,
-          monitors=None, max_steps=None):
+  def fit(self,
+          x=None,
+          y=None,
+          input_fn=None,
+          steps=None,
+          batch_size=None,
+          monitors=None,
+          max_steps=None):
     """Trains a model given training data `x` predictions and `y` labels.
 
     Args:
-      x: Matrix of shape [n_samples, n_features...] or the dictionary of Matrices.
-         Can be iterator that returns arrays of features or dictionary of arrays of features.
-         The training input samples for fitting the model. If set, `input_fn` must be `None`.
-      y: Vector or matrix [n_samples] or [n_samples, n_outputs] or the dictionary of same.
-         Can be iterator that returns array of labels or dictionary of array of labels.
-         The training label values (class labels in classification, real numbers in regression).
-         If set, `input_fn` must be `None`. Note: For classification, label values must
+      x: Matrix of shape [n_samples, n_features...] or the dictionary of
+        Matrices.
+         Can be iterator that returns arrays of features or dictionary of arrays
+           of features.
+         The training input samples for fitting the model. If set, `input_fn`
+           must be `None`.
+      y: Vector or matrix [n_samples] or [n_samples, n_outputs] or the
+        dictionary of same.
+         Can be iterator that returns array of labels or dictionary of array of
+           labels.
+         The training label values (class labels in classification, real numbers
+           in regression).
+         If set, `input_fn` must be `None`. Note: For classification, label
+           values must
          be integers representing the class index (i.e. values from 0 to
          n_classes-1).
       input_fn: Input function returning a tuple of:
diff --git a/tensorflow/contrib/learn/python/learn/utils/export.py b/tensorflow/contrib/learn/python/learn/utils/export.py
index 6af2287761299f6725f9547917101c18b0cc0164..cb34cb1d26b6812c7f3f39e9f965615de5a8ef07 100644
--- a/tensorflow/contrib/learn/python/learn/utils/export.py
+++ b/tensorflow/contrib/learn/python/learn/utils/export.py
@@ -20,7 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.framework import deprecated
-from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.session_bundle import exporter
 from tensorflow.contrib.session_bundle import gc
 from tensorflow.python.client import session as tf_session
@@ -78,7 +78,7 @@ def _export_graph(graph, saver, checkpoint_path, export_dir,
           default_graph_signature=default_graph_signature,
           named_graph_signatures=named_graph_signatures,
           assets_collection=ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS))
-      return export.export(export_dir, contrib_variables.get_global_step(),
+      return export.export(export_dir, training_util.get_global_step(),
                            session, exports_to_keep=exports_to_keep)
 
 
@@ -295,7 +295,7 @@ def _export_estimator(estimator,
   checkpoint_path = (checkpoint_path or
                      tf_saver.latest_checkpoint(estimator._model_dir))
   with ops.Graph().as_default() as g:
-    contrib_variables.create_global_step(g)
+    training_util.create_global_step(g)
 
     if use_deprecated_input_fn:
       examples = array_ops.placeholder(dtype=dtypes.string,
diff --git a/tensorflow/contrib/learn/python/learn/utils/export_test.py b/tensorflow/contrib/learn/python/learn/utils/export_test.py
index 95070ada3b9d3ccb00009bd9b885e8163d7fbed4..9bfb1fc952c07bd6c09d1f1074e8dc5539dc0529 100644
--- a/tensorflow/contrib/learn/python/learn/utils/export_test.py
+++ b/tensorflow/contrib/learn/python/learn/utils/export_test.py
@@ -50,6 +50,7 @@ def _training_input_fn():
 
 
 class ExportTest(test.TestCase):
+
   def _get_default_signature(self, export_meta_filename):
     """ Gets the default signature from the export.meta file. """
     with session.Session():
@@ -69,18 +70,18 @@ class ExportTest(test.TestCase):
     # Only the written checkpoints are exported.
     self.assertTrue(
         saver.checkpoint_exists(os.path.join(export_dir, '00000001', 'export')),
-        'Exported checkpoint expected but not found: %s' %
-        os.path.join(export_dir, '00000001', 'export'))
+        'Exported checkpoint expected but not found: %s' % os.path.join(
+            export_dir, '00000001', 'export'))
     self.assertTrue(
         saver.checkpoint_exists(os.path.join(export_dir, '00000010', 'export')),
-        'Exported checkpoint expected but not found: %s' %
-        os.path.join(export_dir, '00000010', 'export'))
+        'Exported checkpoint expected but not found: %s' % os.path.join(
+            export_dir, '00000010', 'export'))
     self.assertEquals(
         six.b(os.path.join(export_dir, '00000010')),
         export_monitor.last_export_dir)
     # Validate the signature
     signature = self._get_default_signature(
-      os.path.join(export_dir, '00000010', 'export.meta'))
+        os.path.join(export_dir, '00000010', 'export.meta'))
     self.assertTrue(signature.HasField(expected_signature))
 
   def testExportMonitor_EstimatorProvidesSignature(self):
@@ -116,8 +117,7 @@ class ExportTest(test.TestCase):
     def _serving_input_fn():
       return {
           _X_KEY:
-              random_ops.random_uniform(
-                  shape=(1,), minval=0.0, maxval=1000.0)
+              random_ops.random_uniform(shape=(1,), minval=0.0, maxval=1000.0)
       }, None
 
     input_feature_key = 'my_example_key'
@@ -160,8 +160,7 @@ class ExportTest(test.TestCase):
           input_feature_key:
               None,
           _X_KEY:
-              random_ops.random_uniform(
-                  shape=(1,), minval=0.0, maxval=1000.0)
+              random_ops.random_uniform(shape=(1,), minval=0.0, maxval=1000.0)
       }, None
 
     monitor = learn.monitors.ExportMonitor(
@@ -182,8 +181,7 @@ class ExportTest(test.TestCase):
     def _serving_input_fn():
       return {
           input_feature_key:
-              array_ops.placeholder(
-                  dtype=dtypes.string, shape=(1,))
+              array_ops.placeholder(dtype=dtypes.string, shape=(1,))
       }, None
 
     monitor = learn.monitors.ExportMonitor(
@@ -204,11 +202,9 @@ class ExportTest(test.TestCase):
     def _serving_input_fn():
       return {
           input_feature_key:
-              array_ops.placeholder(
-                  dtype=dtypes.string, shape=(1,)),
+              array_ops.placeholder(dtype=dtypes.string, shape=(1,)),
           _X_KEY:
-              random_ops.random_uniform(
-                  shape=(1,), minval=0.0, maxval=1000.0)
+              random_ops.random_uniform(shape=(1,), minval=0.0, maxval=1000.0)
       }, None
 
     export_dir = os.path.join(tempfile.mkdtemp(), 'export')
@@ -227,8 +223,8 @@ class ExportTest(test.TestCase):
 
     def _regression_signature(examples, unused_features, predictions):
       signatures = {}
-      signatures['regression'] = (exporter.regression_signature(examples,
-                                                                predictions))
+      signatures['regression'] = (
+          exporter.regression_signature(examples, predictions))
       return signatures['regression'], signatures
 
     random.seed(42)
@@ -248,10 +244,10 @@ class ExportTest(test.TestCase):
     with self.assertRaises(errors.NotFoundError):
       saver.checkpoint_exists(os.path.join(export_dir, '00000000', 'export'))
     self.assertTrue(
-      saver.checkpoint_exists(os.path.join(export_dir, '00000010', 'export')))
+        saver.checkpoint_exists(os.path.join(export_dir, '00000010', 'export')))
     # Validate the signature
     signature = self._get_default_signature(
-      os.path.join(export_dir, '00000010', 'export.meta'))
+        os.path.join(export_dir, '00000010', 'export.meta'))
     self.assertTrue(signature.HasField('regression_signature'))
 
 
diff --git a/tensorflow/contrib/learn/python/learn/utils/gc_test.py b/tensorflow/contrib/learn/python/learn/utils/gc_test.py
index 76cfd88e1d68856907131f7e2bae65d4c9fcc4b1..e7d091e18a8f186f89f5217442c24fb106c5cdab 100644
--- a/tensorflow/contrib/learn/python/learn/utils/gc_test.py
+++ b/tensorflow/contrib/learn/python/learn/utils/gc_test.py
@@ -34,12 +34,13 @@ def _create_parser(base_dir):
   # create a simple parser that pulls the export_version from the directory.
   def parser(path):
     # Modify the path object for RegEx match for Windows Paths
-    if os.name == 'nt':
-      match = re.match("^" + compat.as_str_any(base_dir).replace('\\','/') + "/(\\d+)$",
-                      compat.as_str_any(path.path).replace('\\','/'))
+    if os.name == "nt":
+      match = re.match(
+          "^" + compat.as_str_any(base_dir).replace("\\", "/") + "/(\\d+)$",
+          compat.as_str_any(path.path).replace("\\", "/"))
     else:
       match = re.match("^" + compat.as_str_any(base_dir) + "/(\\d+)$",
-                      compat.as_str_any(path.path))
+                       compat.as_str_any(path.path))
     if not match:
       return None
     return path._replace(export_version=int(match.group(1)))
@@ -63,7 +64,9 @@ class GcTest(test_util.TensorFlowTestCase):
 
   def testModExportVersion(self):
     paths = [
-        gc.Path("/foo", 4), gc.Path("/foo", 5), gc.Path("/foo", 6),
+        gc.Path("/foo", 4),
+        gc.Path("/foo", 5),
+        gc.Path("/foo", 6),
         gc.Path("/foo", 9)
     ]
     mod = gc.mod_export_version(2)
@@ -73,14 +76,21 @@ class GcTest(test_util.TensorFlowTestCase):
 
   def testOneOfEveryNExportVersions(self):
     paths = [
-        gc.Path("/foo", 0), gc.Path("/foo", 1), gc.Path("/foo", 3),
-        gc.Path("/foo", 5), gc.Path("/foo", 6), gc.Path("/foo", 7),
-        gc.Path("/foo", 8), gc.Path("/foo", 33)
+        gc.Path("/foo", 0),
+        gc.Path("/foo", 1),
+        gc.Path("/foo", 3),
+        gc.Path("/foo", 5),
+        gc.Path("/foo", 6),
+        gc.Path("/foo", 7),
+        gc.Path("/foo", 8),
+        gc.Path("/foo", 33)
     ]
     one_of = gc.one_of_every_n_export_versions(3)
     self.assertEqual(
         one_of(paths), [
-            gc.Path("/foo", 3), gc.Path("/foo", 6), gc.Path("/foo", 8),
+            gc.Path("/foo", 3),
+            gc.Path("/foo", 6),
+            gc.Path("/foo", 8),
             gc.Path("/foo", 33)
         ])
 
@@ -98,13 +108,19 @@ class GcTest(test_util.TensorFlowTestCase):
     f = gc.union(gc.largest_export_versions(3), gc.mod_export_version(3))
     self.assertEqual(
         f(paths), [
-            gc.Path("/foo", 0), gc.Path("/foo", 3), gc.Path("/foo", 6),
-            gc.Path("/foo", 7), gc.Path("/foo", 8), gc.Path("/foo", 9)
+            gc.Path("/foo", 0),
+            gc.Path("/foo", 3),
+            gc.Path("/foo", 6),
+            gc.Path("/foo", 7),
+            gc.Path("/foo", 8),
+            gc.Path("/foo", 9)
         ])
 
   def testNegation(self):
     paths = [
-        gc.Path("/foo", 4), gc.Path("/foo", 5), gc.Path("/foo", 6),
+        gc.Path("/foo", 4),
+        gc.Path("/foo", 5),
+        gc.Path("/foo", 6),
         gc.Path("/foo", 9)
     ]
     mod = gc.negation(gc.mod_export_version(2))
@@ -121,8 +137,7 @@ class GcTest(test_util.TensorFlowTestCase):
     gfile.MakeDirs(os.path.join(base_dir, "ignore"))
 
     self.assertEqual(
-        gc.get_paths(base_dir, _create_parser(base_dir)),
-        [
+        gc.get_paths(base_dir, _create_parser(base_dir)), [
             gc.Path(os.path.join(base_dir, "0"), 0),
             gc.Path(os.path.join(base_dir, "1"), 1),
             gc.Path(os.path.join(base_dir, "2"), 2)
@@ -131,10 +146,10 @@ class GcTest(test_util.TensorFlowTestCase):
   def testMixedStrTypes(self):
     temp_dir = compat.as_bytes(test.get_temp_dir())
 
-    for sub_dir in ['str', b'bytes', u'unicode']:
+    for sub_dir in ["str", b"bytes", u"unicode"]:
       base_dir = os.path.join(
-          (temp_dir if isinstance(sub_dir, bytes) else temp_dir.decode()),
-          sub_dir)
+          (temp_dir
+           if isinstance(sub_dir, bytes) else temp_dir.decode()), sub_dir)
       self.assertFalse(gfile.Exists(base_dir))
       gfile.MakeDirs(os.path.join(compat.as_str_any(base_dir), "42"))
       gc.get_paths(base_dir, _create_parser(base_dir))
diff --git a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
index 6ffd2a133995a6ff8b35540221fb5676bf5de19f..1593380007b2799fb1d17e92408ab19a7b47fe1e 100644
--- a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
+++ b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
@@ -33,7 +33,6 @@ from __future__ import division
 from __future__ import print_function
 
 import os
-import tempfile
 import time
 
 from tensorflow.contrib.layers.python.layers import feature_column
@@ -51,6 +50,7 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
+from tensorflow.python.summary import summary_iterator
 from tensorflow.python.training import saver
 
 from tensorflow.python.util import compat
@@ -391,7 +391,8 @@ def make_export_strategy(serving_input_fn,
                          default_output_alternative_key=None,
                          assets_extra=None,
                          as_text=False,
-                         exports_to_keep=5):
+                         exports_to_keep=5,
+                         strip_default_attrs=None):
   """Create an ExportStrategy for use with Experiment.
 
   Args:
@@ -412,12 +413,16 @@ def make_export_strategy(serving_input_fn,
     exports_to_keep: Number of exports to keep.  Older exports will be
       garbage-collected.  Defaults to 5.  Set to None to disable garbage
       collection.
+    strip_default_attrs: Boolean. If True, default attrs in the
+      `GraphDef` will be stripped on write. This is recommended for better
+      forward compatibility of the resulting `SavedModel`.
 
   Returns:
     An ExportStrategy that can be passed to the Experiment constructor.
   """
 
-  def export_fn(estimator, export_dir_base, checkpoint_path=None):
+  def export_fn(estimator, export_dir_base, checkpoint_path=None,
+                strip_default_attrs=False):
     """Exports the given Estimator as a SavedModel.
 
     Args:
@@ -426,6 +431,8 @@ def make_export_strategy(serving_input_fn,
         graph and checkpoints.
       checkpoint_path: The checkpoint path to export.  If None (the default),
         the most recent checkpoint found within the model directory is chosen.
+      strip_default_attrs: Boolean. If `True`, default-valued attributes will
+        be removed from the NodeDefs.
 
     Returns:
       The string path to the exported directory.
@@ -444,7 +451,8 @@ def make_export_strategy(serving_input_fn,
           serving_input_fn,
           assets_extra=assets_extra,
           as_text=as_text,
-          checkpoint_path=checkpoint_path)
+          checkpoint_path=checkpoint_path,
+          strip_default_attrs=strip_default_attrs)
     else:
       export_result = estimator.export_savedmodel(
           export_dir_base,
@@ -452,12 +460,13 @@ def make_export_strategy(serving_input_fn,
           default_output_alternative_key=default_output_alternative_key,
           assets_extra=assets_extra,
           as_text=as_text,
-          checkpoint_path=checkpoint_path)
+          checkpoint_path=checkpoint_path,
+          strip_default_attrs=strip_default_attrs)
 
     garbage_collect_exports(export_dir_base, exports_to_keep)
     return export_result
 
-  return export_strategy.ExportStrategy('Servo', export_fn)
+  return export_strategy.ExportStrategy('Servo', export_fn, strip_default_attrs)
 
 
 def make_parsing_export_strategy(feature_columns,
@@ -465,7 +474,8 @@ def make_parsing_export_strategy(feature_columns,
                                  assets_extra=None,
                                  as_text=False,
                                  exports_to_keep=5,
-                                 target_core=False):
+                                 target_core=False,
+                                 strip_default_attrs=None):
   """Create an ExportStrategy for use with Experiment, using `FeatureColumn`s.
 
   Creates a SavedModel export that expects to be fed with a single string
@@ -493,6 +503,9 @@ def make_parsing_export_strategy(feature_columns,
     target_core: If True, prepare an ExportStrategy for use with
       tensorflow.python.estimator.*.  If False (default), prepare an
       ExportStrategy for use with tensorflow.contrib.learn.python.learn.*.
+    strip_default_attrs: Boolean. If True, default attrs in the
+      `GraphDef` will be stripped on write. This is recommended for better
+      forward compatibility of the resulting `SavedModel`.
 
   Returns:
     An ExportStrategy that can be passed to the Experiment constructor.
@@ -509,7 +522,8 @@ def make_parsing_export_strategy(feature_columns,
       default_output_alternative_key=default_output_alternative_key,
       assets_extra=assets_extra,
       as_text=as_text,
-      exports_to_keep=exports_to_keep)
+      exports_to_keep=exports_to_keep,
+      strip_default_attrs=strip_default_attrs)
 
 
 def _default_compare_fn(curr_best_eval_result, cand_eval_result):
@@ -543,15 +557,16 @@ def _default_compare_fn(curr_best_eval_result, cand_eval_result):
 class BestModelSelector(object):
   """A helper that keeps track of export selection candidates."""
 
-  def __init__(self, compare_fn=None):
+  def __init__(self, event_file_pattern=None, compare_fn=None):
     """Constructor of this class.
 
     Args:
+      event_file_pattern: absolute event file name pattern.
       compare_fn: a function that returns true if the candidate is better than
         the current best model.
     """
-    self._best_eval_result = None
     self._compare_fn = compare_fn or _default_compare_fn
+    self._best_eval_result = self._get_best_eval_result(event_file_pattern)
 
   def update(self, checkpoint_path, eval_result):
     """Records a given checkpoint and exports if this is the best model.
@@ -581,11 +596,40 @@ class BestModelSelector(object):
     else:
       return '', None
 
+  def _get_best_eval_result(self, event_files):
+    """Get the best eval result from event files.
 
-def make_best_model_export_strategy(serving_input_fn,
-                                    exports_to_keep=1,
-                                    compare_fn=None,
-                                    default_output_alternative_key=None):
+    Args:
+      event_files: Absolute pattern of event files.
+
+    Returns:
+      The best eval result.
+    """
+    if not event_files:
+      return None
+
+    best_eval_result = None
+    for event_file in gfile.Glob(os.path.join(event_files)):
+      for event in summary_iterator.summary_iterator(event_file):
+        if event.HasField('summary'):
+          event_eval_result = {}
+          for value in event.summary.value:
+            if value.HasField('simple_value'):
+              event_eval_result[value.tag] = value.simple_value
+          if best_eval_result is None or self._compare_fn(
+              best_eval_result, event_eval_result):
+            best_eval_result = event_eval_result
+    return best_eval_result
+
+
+def make_best_model_export_strategy(
+    serving_input_fn,
+    exports_to_keep=1,
+    model_dir=None,
+    event_file_pattern=None,
+    compare_fn=None,
+    default_output_alternative_key=None,
+    strip_default_attrs=None):
   """Creates an custom ExportStrategy for use with tf.contrib.learn.Experiment.
 
   Args:
@@ -593,10 +637,24 @@ def make_best_model_export_strategy(serving_input_fn,
       `InputFnOps`.
     exports_to_keep: an integer indicating how many historical best models need
       to be preserved.
+    model_dir: Directory where model parameters, graph etc. are saved. This will
+        be used to load eval metrics from the directory when the export strategy
+        is created. So the best metrics would not be lost even if the export
+        strategy got preempted, which guarantees that only the best model would
+        be exported regardless of preemption. If None, however, the export
+        strategy would not be preemption-safe. To be preemption-safe, both
+        model_dir and event_file_pattern would be needed.
+    event_file_pattern: event file name pattern relative to model_dir, e.g.
+        "eval_continuous/*.tfevents.*". If None, however, the export strategy
+        would not be preemption-safe. To be preemption-safe, both
+        model_dir and event_file_pattern would be needed.
     compare_fn: a function that select the 'best' candidate from a dictionary
         of evaluation result keyed by corresponding checkpoint path.
     default_output_alternative_key: the key for default serving signature for
         multi-headed inference graphs.
+    strip_default_attrs: Boolean. If True, default attrs in the
+      `GraphDef` will be stripped on write. This is recommended for better
+      forward compatibility of the resulting `SavedModel`.
 
   Returns:
     An ExportStrategy that can be passed to the Experiment constructor.
@@ -604,9 +662,13 @@ def make_best_model_export_strategy(serving_input_fn,
   best_model_export_strategy = make_export_strategy(
       serving_input_fn,
       exports_to_keep=exports_to_keep,
-      default_output_alternative_key=default_output_alternative_key)
+      default_output_alternative_key=default_output_alternative_key,
+      strip_default_attrs=strip_default_attrs)
 
-  best_model_selector = BestModelSelector(compare_fn)
+  full_event_file_pattern = os.path.join(
+      model_dir,
+      event_file_pattern) if model_dir and event_file_pattern else None
+  best_model_selector = BestModelSelector(full_event_file_pattern, compare_fn)
 
   def export_fn(estimator, export_dir_base, checkpoint_path, eval_result=None):
     """Exports the given Estimator as a SavedModel.
@@ -682,22 +744,36 @@ def extend_export_strategy(base_export_strategy,
       ValueError: If `estimator` is a ${tf.estimator.Estimator} instance
         and `default_output_alternative_key` was specified or if post_export_fn
         does not return a valid directory.
+      RuntimeError: If unable to create temporary or final export directory.
     """
-    tmp_base_export_dir = tempfile.mkdtemp()
+    tmp_base_export_folder = 'temp-base-export-' + str(int(time.time()))
+    tmp_base_export_dir = os.path.join(export_dir_base, tmp_base_export_folder)
+    if gfile.Exists(tmp_base_export_dir):
+      raise RuntimeError('Failed to obtain base export directory')
+    gfile.MakeDirs(tmp_base_export_dir)
     tmp_base_export = base_export_strategy.export(
         estimator, tmp_base_export_dir, checkpoint_path)
-    tmp_post_export_dir = tempfile.mkdtemp()
+
+    tmp_post_export_folder = 'temp-post-export-' + str(int(time.time()))
+    tmp_post_export_dir = os.path.join(export_dir_base, tmp_post_export_folder)
+    if gfile.Exists(tmp_post_export_dir):
+      raise RuntimeError('Failed to obtain temp export directory')
+
+    gfile.MakeDirs(tmp_post_export_dir)
     tmp_post_export = post_export_fn(tmp_base_export, tmp_post_export_dir)
 
     if not tmp_post_export.startswith(tmp_post_export_dir):
       raise ValueError('post_export_fn must return a sub-directory of {}'
                        .format(tmp_post_export_dir))
-    export_relpath = os.path.relpath(tmp_post_export, tmp_post_export_dir)
-
-    gfile.Rename(
-        os.path.join(tmp_post_export_dir, export_relpath),
-        os.path.join(export_dir_base, export_relpath))
-    return os.path.join(export_dir_base, export_relpath)
+    post_export_relpath = os.path.relpath(tmp_post_export, tmp_post_export_dir)
+    post_export = os.path.join(export_dir_base, post_export_relpath)
+    if gfile.Exists(post_export):
+      raise RuntimeError('Failed to obtain final export directory')
+    gfile.Rename(tmp_post_export, post_export)
+
+    gfile.DeleteRecursively(tmp_base_export_dir)
+    gfile.DeleteRecursively(tmp_post_export_dir)
+    return post_export
 
   name = post_export_name if post_export_name else base_export_strategy.name
   return export_strategy.ExportStrategy(name, export_fn)
diff --git a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py
index ec3a88003f01b3b62591c13472029601b11ba491..14bf1136e8e9ab1488c4850d458382028ec5583d 100644
--- a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py
+++ b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py
@@ -24,13 +24,14 @@ import time
 from tensorflow.contrib.layers.python.layers import feature_column as fc
 from tensorflow.contrib.learn.python.learn import export_strategy as export_strategy_lib
 from tensorflow.contrib.learn.python.learn.estimators import constants
-from tensorflow.contrib.learn.python.learn.estimators import estimator as core_estimator
+from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
 from tensorflow.contrib.learn.python.learn.utils import input_fn_utils
 from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils
 from tensorflow.core.framework import tensor_shape_pb2
 from tensorflow.core.framework import types_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
+from tensorflow.python.estimator import estimator as core_estimator
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
@@ -41,7 +42,7 @@ from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.util import compat
 
 
-class TestEstimator(core_estimator.Estimator):
+class TestEstimator(estimator.Estimator):
 
   def __init__(self, *args, **kwargs):
     super(TestEstimator, self).__init__(*args, **kwargs)
@@ -55,7 +56,8 @@ class TestEstimator(core_estimator.Estimator):
                         default_output_alternative_key=None,
                         assets_extra=None,
                         as_text=False,
-                        checkpoint_path=None):
+                        checkpoint_path=None,
+                        strip_default_attrs=False):
 
     if not os.path.exists(export_dir):
       os.makedirs(export_dir)
@@ -93,9 +95,9 @@ class SavedModelExportUtilsTest(test.TestCase):
             name="input-tensor-1:0", dtype=dtype_string, tensor_shape=shape))
     expected_signature_def.outputs[
         signature_constants.REGRESS_OUTPUTS].CopyFrom(
-            meta_graph_pb2.TensorInfo(name="output-tensor-1:0",
-                                      dtype=dtype_float,
-                                      tensor_shape=shape))
+            meta_graph_pb2.TensorInfo(
+                name="output-tensor-1:0", dtype=dtype_float,
+                tensor_shape=shape))
 
     expected_signature_def.method_name = signature_constants.REGRESS_METHOD_NAME
     self.assertEqual(actual_signature_def, expected_signature_def)
@@ -506,7 +508,9 @@ class SavedModelExportUtilsTest(test.TestCase):
     input_example = constant_op.constant(["input string"])
     input_ops = input_fn_utils.InputFnOps({
         "features": input_features
-    }, None, {"default input": input_example})
+    }, None, {
+        "default input": input_example
+    })
     input_alternatives, _ = (
         saved_model_export_utils.get_input_alternatives(input_ops))
     output_1 = constant_op.constant([1.0])
@@ -527,8 +531,9 @@ class SavedModelExportUtilsTest(test.TestCase):
         model_fn.ModeKeys.INFER,
         predictions={"some_output": constant_op.constant(["4"])},
         output_alternatives=provided_output_alternatives)
-    output_alternatives, _ = (saved_model_export_utils.get_output_alternatives(
-        model_fn_ops, "head-1"))
+    output_alternatives, _ = (
+        saved_model_export_utils.get_output_alternatives(
+            model_fn_ops, "head-1"))
 
     signature_defs = saved_model_export_utils.build_all_signature_defs(
         input_alternatives, output_alternatives, "head-1")
@@ -546,7 +551,9 @@ class SavedModelExportUtilsTest(test.TestCase):
         "default_input_alternative:head-3":
             signature_def_utils.predict_signature_def({
                 "default input": input_example
-            }, {"some_output_3": output_3}),
+            }, {
+                "some_output_3": output_3
+            }),
         # "features_input_alternative:head-1":
         #     signature_def_utils.regression_signature_def(input_features,
         #                                                  output_1),
@@ -589,8 +596,9 @@ class SavedModelExportUtilsTest(test.TestCase):
         model_fn.ModeKeys.INFER,
         predictions={"some_output": constant_op.constant(["4"])},
         output_alternatives=provided_output_alternatives)
-    output_alternatives, _ = (saved_model_export_utils.get_output_alternatives(
-        model_fn_ops, "head-1"))
+    output_alternatives, _ = (
+        saved_model_export_utils.get_output_alternatives(
+            model_fn_ops, "head-1"))
 
     with self.assertRaisesRegexp(
         ValueError, "A default input_alternative must be provided"):
@@ -706,25 +714,72 @@ class SavedModelExportUtilsTest(test.TestCase):
 
     self.assertNotEqual("",
                         export_strategy.export(test_estimator, export_dir_base,
-                                               "fake_ckpt_0", {"loss": 100}))
+                                               "fake_ckpt_0", {
+                                                   "loss": 100
+                                               }))
     self.assertNotEqual("", test_estimator.last_exported_dir)
     self.assertNotEqual("", test_estimator.last_exported_checkpoint)
 
     self.assertEqual("",
                      export_strategy.export(test_estimator, export_dir_base,
-                                            "fake_ckpt_1", {"loss": 101}))
+                                            "fake_ckpt_1", {
+                                                "loss": 101
+                                            }))
     self.assertEqual(test_estimator.last_exported_dir,
                      os.path.join(export_dir_base, "fake_ckpt_0"))
 
     self.assertNotEqual("",
                         export_strategy.export(test_estimator, export_dir_base,
-                                               "fake_ckpt_2", {"loss": 10}))
+                                               "fake_ckpt_2", {
+                                                   "loss": 10
+                                               }))
+    self.assertEqual(test_estimator.last_exported_dir,
+                     os.path.join(export_dir_base, "fake_ckpt_2"))
+
+    self.assertEqual("",
+                     export_strategy.export(test_estimator, export_dir_base,
+                                            "fake_ckpt_3", {
+                                                "loss": 20
+                                            }))
+    self.assertEqual(test_estimator.last_exported_dir,
+                     os.path.join(export_dir_base, "fake_ckpt_2"))
+
+  def test_make_best_model_export_strategy_with_preemption(self):
+    model_dir = self.get_temp_dir()
+    eval_dir_base = os.path.join(model_dir, "eval_continuous")
+    core_estimator._write_dict_to_summary(eval_dir_base, {"loss": 50}, 1)
+    core_estimator._write_dict_to_summary(eval_dir_base, {"loss": 60}, 2)
+
+    test_estimator = TestEstimator()
+    export_strategy = saved_model_export_utils.make_best_model_export_strategy(
+        serving_input_fn=None,
+        exports_to_keep=3,
+        model_dir=model_dir,
+        event_file_pattern="eval_continuous/*.tfevents.*",
+        compare_fn=None)
+
+    export_dir_base = os.path.join(self.get_temp_dir(), "export")
+    self.assertEqual("",
+                     export_strategy.export(test_estimator, export_dir_base,
+                                            "fake_ckpt_0", {
+                                                "loss": 100
+                                            }))
+    self.assertEqual("", test_estimator.last_exported_dir)
+    self.assertEqual("", test_estimator.last_exported_checkpoint)
+
+    self.assertNotEqual("",
+                        export_strategy.export(test_estimator, export_dir_base,
+                                               "fake_ckpt_2", {
+                                                   "loss": 10
+                                               }))
     self.assertEqual(test_estimator.last_exported_dir,
                      os.path.join(export_dir_base, "fake_ckpt_2"))
 
     self.assertEqual("",
                      export_strategy.export(test_estimator, export_dir_base,
-                                            "fake_ckpt_3", {"loss": 20}))
+                                            "fake_ckpt_3", {
+                                                "loss": 20
+                                            }))
     self.assertEqual(test_estimator.last_exported_dir,
                      os.path.join(export_dir_base, "fake_ckpt_2"))
 
@@ -766,10 +821,11 @@ class SavedModelExportUtilsTest(test.TestCase):
 
     test_estimator = TestEstimator()
     tmpdir = tempfile.mkdtemp()
-    final_path = final_export_strategy.export(test_estimator, tmpdir,
-                                              os.path.join(
-                                                  tmpdir, "checkpoint"))
-    self.assertEqual(os.path.join(tmpdir, "rewrite"), final_path)
+    export_model_dir = os.path.join(tmpdir, "model")
+    checkpoint_path = os.path.join(tmpdir, "checkpoint")
+    final_path = final_export_strategy.export(test_estimator, export_model_dir,
+                                              checkpoint_path)
+    self.assertEqual(os.path.join(export_model_dir, "rewrite"), final_path)
 
   def test_extend_export_strategy_same_name(self):
 
@@ -795,10 +851,11 @@ class SavedModelExportUtilsTest(test.TestCase):
 
     test_estimator = TestEstimator()
     tmpdir = tempfile.mkdtemp()
-    final_path = final_export_strategy.export(test_estimator, tmpdir,
-                                              os.path.join(
-                                                  tmpdir, "checkpoint"))
-    self.assertEqual(os.path.join(tmpdir, "rewrite"), final_path)
+    export_model_dir = os.path.join(tmpdir, "model")
+    checkpoint_path = os.path.join(tmpdir, "checkpoint")
+    final_path = final_export_strategy.export(test_estimator, export_model_dir,
+                                              checkpoint_path)
+    self.assertEqual(os.path.join(export_model_dir, "rewrite"), final_path)
 
   def test_extend_export_strategy_raises_error(self):
 
diff --git a/tensorflow/contrib/legacy_seq2seq/python/__init__.py b/tensorflow/contrib/legacy_seq2seq/python/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..52e83069cb0c68b510da46149248369dce376647 100644
--- a/tensorflow/contrib/legacy_seq2seq/python/__init__.py
+++ b/tensorflow/contrib/legacy_seq2seq/python/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/legacy_seq2seq/python/kernel_tests/__init__.py b/tensorflow/contrib/legacy_seq2seq/python/kernel_tests/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..52e83069cb0c68b510da46149248369dce376647 100644
--- a/tensorflow/contrib/legacy_seq2seq/python/kernel_tests/__init__.py
+++ b/tensorflow/contrib/legacy_seq2seq/python/kernel_tests/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/libsvm/BUILD b/tensorflow/contrib/libsvm/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..df96402a4ffd51840f77d58d8066487030362340
--- /dev/null
+++ b/tensorflow/contrib/libsvm/BUILD
@@ -0,0 +1,102 @@
+package(
+    default_visibility = ["//visibility:private"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+
+tf_custom_op_library(
+    name = "python/ops/_libsvm_ops.so",
+    srcs = [
+        "kernels/decode_libsvm_op.cc",
+        "ops/libsvm_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/core/kernels:bounds_check_lib",
+    ],
+)
+
+tf_kernel_library(
+    name = "libsvm_kernels",
+    srcs = ["kernels/decode_libsvm_op.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:bounds_check_lib",
+    ],
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["libsvm_ops"],
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "libsvm_ops",
+    deps = [":libsvm_ops_op_lib"],
+)
+
+tf_custom_op_py_library(
+    name = "libsvm",
+    srcs = [
+        "__init__.py",
+        "python/ops/libsvm_ops.py",
+    ],
+    dso = [
+        ":python/ops/_libsvm_ops.so",
+    ],
+    kernels = [
+        ":libsvm_kernels",
+        ":libsvm_ops_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":libsvm_ops",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+    ],
+)
+
+tf_py_test(
+    name = "decode_libsvm_op_test",
+    srcs = ["python/kernel_tests/decode_libsvm_op_test.py"],
+    additional_deps = [
+        ":libsvm",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/libsvm/__init__.py b/tensorflow/contrib/libsvm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a875863caab29eb59a1834ca9184a5e272cb6656
--- /dev/null
+++ b/tensorflow/contrib/libsvm/__init__.py
@@ -0,0 +1,32 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Libsvm decoder.
+
+@@decode_libsvm
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.libsvm.python.ops.libsvm_ops import decode_libsvm
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    "decode_libsvm",
+]
+
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/libsvm/kernels/decode_libsvm_op.cc b/tensorflow/contrib/libsvm/kernels/decode_libsvm_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..720c74e3de5907fa006227d1278c45fd2175fe5f
--- /dev/null
+++ b/tensorflow/contrib/libsvm/kernels/decode_libsvm_op.cc
@@ -0,0 +1,168 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace tensorflow {
+
+template <typename T, typename Tlabel>
+class DecodeLibsvmOp : public OpKernel {
+ public:
+  explicit DecodeLibsvmOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("num_features", &num_features_));
+    OP_REQUIRES(ctx, (num_features_ >= 1),
+                errors::InvalidArgument("Invalid number of features \"",
+                                        num_features_, "\""));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* input_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
+    const auto& input_flat = input_tensor->flat<string>();
+
+    Tensor* label_tensor;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_output(0, input_tensor->shape(), &label_tensor));
+    auto label = label_tensor->flat<Tlabel>();
+
+    std::vector<T> out_values;
+    std::vector<std::pair<int64, int64>> out_indices;
+    for (int i = 0; i < input_flat.size(); ++i) {
+      StringPiece line(input_flat(i));
+      str_util::RemoveWhitespaceContext(&line);
+
+      StringPiece piece;
+      OP_REQUIRES(ctx, str_util::ConsumeNonWhitespace(&line, &piece),
+                  errors::InvalidArgument("No label found for input[", i,
+                                          "]: \"", input_flat(i), "\""));
+
+      Tlabel label_value;
+      OP_REQUIRES(ctx,
+                  strings::SafeStringToNumeric<Tlabel>(piece, &label_value),
+                  errors::InvalidArgument("Label format incorrect: ", piece));
+
+      label(i) = label_value;
+
+      str_util::RemoveLeadingWhitespace(&line);
+      while (str_util::ConsumeNonWhitespace(&line, &piece)) {
+        size_t p = piece.find(':');
+        OP_REQUIRES(ctx, (p != StringPiece::npos),
+                    errors::InvalidArgument("Invalid feature \"", piece, "\""));
+
+        int64 feature_index;
+        OP_REQUIRES(
+            ctx, strings::safe_strto64(piece.substr(0, p), &feature_index),
+            errors::InvalidArgument("Feature format incorrect: ", piece));
+        OP_REQUIRES(ctx, (feature_index >= 0),
+                    errors::InvalidArgument(
+                        "Feature index should be >= 0, got ", feature_index));
+
+        T feature_value;
+        OP_REQUIRES(
+
+            ctx,
+            strings::SafeStringToNumeric<T>(piece.substr(p + 1),
+                                            &feature_value),
+            errors::InvalidArgument("Feature format incorrect: ", piece));
+
+        out_values.emplace_back(feature_value);
+        out_indices.emplace_back(std::pair<int64, int64>(i, feature_index));
+
+        str_util::RemoveLeadingWhitespace(&line);
+      }
+    }
+
+    Tensor* indices_tensor;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(
+                            1,
+                            TensorShape({static_cast<int64>(out_indices.size()),
+                                         input_tensor->shape().dims() + 1}),
+                            &indices_tensor));
+    auto indices = indices_tensor->matrix<int64>();
+    // Translate flat index to shaped index like np.unravel_index
+    // Calculate factors for each dimension
+    std::vector<int64> factors(input_tensor->shape().dims());
+    factors[input_tensor->shape().dims() - 1] = 1;
+    for (int j = input_tensor->shape().dims() - 2; j >= 0; j--) {
+      factors[j] = factors[j + 1] * input_tensor->shape().dim_size(j + 1);
+    }
+    for (int i = 0; i < out_indices.size(); i++) {
+      indices(i, 0) = out_indices[i].first;
+      int64 value = out_indices[i].first;
+      for (int j = 0; j < input_tensor->shape().dims(); j++) {
+        indices(i, j) = value / factors[j];
+        value = value % factors[j];
+      }
+      indices(i, input_tensor->shape().dims()) = out_indices[i].second;
+    }
+
+    Tensor* values_tensor;
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_output(
+                       2, TensorShape({static_cast<int64>(out_values.size())}),
+                       &values_tensor));
+    auto values = values_tensor->vec<T>();
+    std::copy_n(out_values.begin(), out_values.size(), &values(0));
+
+    Tensor* shape_tensor;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(
+                            3, TensorShape({input_tensor->shape().dims() + 1}),
+                            &shape_tensor));
+    auto shape = shape_tensor->flat<int64>();
+    for (int i = 0; i < input_tensor->shape().dims(); i++) {
+      shape(i) = input_tensor->shape().dim_size(i);
+    }
+    shape(input_tensor->shape().dims()) = num_features_;
+  }
+
+ private:
+  int64 num_features_;
+};
+
+#define REGISTER_KERNEL(type)                                         \
+  REGISTER_KERNEL_BUILDER(Name("DecodeLibsvm")                        \
+                              .Device(DEVICE_CPU)                     \
+                              .TypeConstraint<type>("dtype")          \
+                              .TypeConstraint<int32>("label_dtype"),  \
+                          DecodeLibsvmOp<type, int32>);               \
+  REGISTER_KERNEL_BUILDER(Name("DecodeLibsvm")                        \
+                              .Device(DEVICE_CPU)                     \
+                              .TypeConstraint<type>("dtype")          \
+                              .TypeConstraint<int64>("label_dtype"),  \
+                          DecodeLibsvmOp<type, int64>);               \
+  REGISTER_KERNEL_BUILDER(Name("DecodeLibsvm")                        \
+                              .Device(DEVICE_CPU)                     \
+                              .TypeConstraint<type>("dtype")          \
+                              .TypeConstraint<float>("label_dtype"),  \
+                          DecodeLibsvmOp<type, float>);               \
+  REGISTER_KERNEL_BUILDER(Name("DecodeLibsvm")                        \
+                              .Device(DEVICE_CPU)                     \
+                              .TypeConstraint<type>("dtype")          \
+                              .TypeConstraint<double>("label_dtype"), \
+                          DecodeLibsvmOp<type, double>);
+
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+REGISTER_KERNEL(int32);
+REGISTER_KERNEL(int64);
+#undef REGISTER_KERNEL
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/libsvm/ops/libsvm_ops.cc b/tensorflow/contrib/libsvm/ops/libsvm_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dec946189e3cd67e2557b83806c0db79a46e5f82
--- /dev/null
+++ b/tensorflow/contrib/libsvm/ops/libsvm_ops.cc
@@ -0,0 +1,58 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using shape_inference::InferenceContext;
+
+REGISTER_OP("DecodeLibsvm")
+    .Input("input: string")
+    .Output("label: label_dtype")
+    .Output("feature_indices: int64")
+    .Output("feature_values: dtype")
+    .Output("feature_shape: int64")
+    .Attr("dtype: {float, double, int32, int64} = DT_FLOAT")
+    .Attr("label_dtype: {float, double, int32, int64} = DT_INT64")
+    .Attr("num_features: int >= 1")
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->input(0));
+
+      c->set_output(1, c->Matrix(InferenceContext::kUnknownDim,
+                                 InferenceContext::kUnknownDim));
+      c->set_output(2, c->Vector(InferenceContext::kUnknownDim));
+      c->set_output(3, c->Vector(InferenceContext::kUnknownDim));
+
+      return Status::OK();
+    })
+
+    .Doc(R"doc(
+Convert LibSVM input to tensors. The output consists of
+a label and a feature tensor. The shape of the label tensor
+is the same as input and the shape of the feature tensor is
+`[input_shape, num_features]`.
+
+input: Each string is a record in the LibSVM.
+label: A tensor of the same shape as input.
+feature_indices: A 2-D int64 tensor of dense_shape [N, ndims].
+feature_values: A 1-D tensor of any type and dense_shape [N].
+feature_shape: A 1-D int64 tensor of dense_shape [ndims].
+num_features: The number of features.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/libsvm/python/kernel_tests/decode_libsvm_op_test.py b/tensorflow/contrib/libsvm/python/kernel_tests/decode_libsvm_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..423dcce8de9b9c77fcfdc8c90c909e2918852905
--- /dev/null
+++ b/tensorflow/contrib/libsvm/python/kernel_tests/decode_libsvm_op_test.py
@@ -0,0 +1,71 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for DecodeLibsvm op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.libsvm.python.ops import libsvm_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.platform import test
+
+
+class DecodeLibsvmOpTest(test.TestCase):
+
+  def testBasic(self):
+    with self.test_session() as sess:
+      content = [
+          "1 1:3.4 2:0.5 4:0.231", "1 2:2.5 3:inf 5:0.503",
+          "2 3:2.5 2:nan 1:0.105"
+      ]
+      sparse_features, labels = libsvm_ops.decode_libsvm(
+          content, num_features=6)
+      features = sparse_ops.sparse_tensor_to_dense(
+          sparse_features, validate_indices=False)
+
+      self.assertAllEqual(labels.get_shape().as_list(), [3])
+
+      features, labels = sess.run([features, labels])
+      self.assertAllEqual(labels, [1, 1, 2])
+      self.assertAllClose(
+          features, [[0, 3.4, 0.5, 0, 0.231, 0], [0, 0, 2.5, np.inf, 0, 0.503],
+                     [0, 0.105, np.nan, 2.5, 0, 0]])
+
+  def testNDimension(self):
+    with self.test_session() as sess:
+      content = [["1 1:3.4 2:0.5 4:0.231", "1 1:3.4 2:0.5 4:0.231"],
+                 ["1 2:2.5 3:inf 5:0.503", "1 2:2.5 3:inf 5:0.503"],
+                 ["2 3:2.5 2:nan 1:0.105", "2 3:2.5 2:nan 1:0.105"]]
+      sparse_features, labels = libsvm_ops.decode_libsvm(
+          content, num_features=6, label_dtype=dtypes.float64)
+      features = sparse_ops.sparse_tensor_to_dense(
+          sparse_features, validate_indices=False)
+
+      self.assertAllEqual(labels.get_shape().as_list(), [3, 2])
+
+      features, labels = sess.run([features, labels])
+      self.assertAllEqual(labels, [[1, 1], [1, 1], [2, 2]])
+      self.assertAllClose(
+          features, [[[0, 3.4, 0.5, 0, 0.231, 0], [0, 3.4, 0.5, 0, 0.231, 0]], [
+              [0, 0, 2.5, np.inf, 0, 0.503], [0, 0, 2.5, np.inf, 0, 0.503]
+          ], [[0, 0.105, np.nan, 2.5, 0, 0], [0, 0.105, np.nan, 2.5, 0, 0]]])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/libsvm/python/ops/libsvm_ops.py b/tensorflow/contrib/libsvm/python/ops/libsvm_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3022505635bca81625cf7abd2be5628a4760970
--- /dev/null
+++ b/tensorflow/contrib/libsvm/python/ops/libsvm_ops.py
@@ -0,0 +1,50 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Libsvm decoder."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.libsvm.ops import gen_libsvm_ops
+from tensorflow.contrib.util import loader
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.platform import resource_loader
+
+
+_libsvm_ops_so = loader.load_op_library(
+    resource_loader.get_path_to_datafile("_libsvm_ops.so"))
+
+
+def decode_libsvm(content, num_features, dtype=None, label_dtype=None):
+  """Convert Libsvm records to a tensor of label and a tensor of feature.
+
+  Args:
+    content: A `Tensor` of type `string`. Each string is a record/row in
+      the Libsvm format.
+    num_features: The number of features.
+    dtype: The type of the output feature tensor. Default to tf.float32.
+    label_dtype: The type of the output label tensor. Default to tf.int64.
+
+  Returns:
+    features: A `SparseTensor` of the shape `[input_shape, num_features]`.
+    labels: A `Tensor` of the same shape as content.
+  """
+  labels, indices, values, shape = gen_libsvm_ops.decode_libsvm(
+      content, num_features, dtype=dtype, label_dtype=label_dtype)
+  return sparse_tensor.SparseTensor(indices, values, shape), labels
+
+
+ops.NotDifferentiable("DecodeLibSVM")
diff --git a/tensorflow/contrib/linear_optimizer/BUILD b/tensorflow/contrib/linear_optimizer/BUILD
index fe2f183ac970cef4ebf6ca1a927b5a48eefb7d7b..cea3627ed565f0de86d8d9bb6b45c4b19c5b5558 100644
--- a/tensorflow/contrib/linear_optimizer/BUILD
+++ b/tensorflow/contrib/linear_optimizer/BUILD
@@ -126,6 +126,7 @@ py_library(
 py_test(
     name = "sdca_estimator_test",
     srcs = ["python/sdca_estimator_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
         ":sdca_estimator_py",
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
index 7526f3ae0dbdb3d6827e9d7f690090b8438e4f6e..3f5fdc18bb8f47cceee8f81dd5ded02059344b8b 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
@@ -211,9 +211,8 @@ class SdcaModel(object):
             sums.append(
                 math_ops.reduce_sum(
                     math_ops.abs(math_ops.cast(weights, dtypes.float64))))
-      sum = math_ops.add_n(sums)
       # SDCA L1 regularization cost is: l1 * sum(|weights|)
-      return self._options['symmetric_l1_regularization'] * sum
+      return self._options['symmetric_l1_regularization'] * math_ops.add_n(sums)
 
   def _l2_loss(self, l2):
     """Computes the (un-normalized) l2 loss of the model."""
@@ -225,9 +224,8 @@ class SdcaModel(object):
             sums.append(
                 math_ops.reduce_sum(
                     math_ops.square(math_ops.cast(weights, dtypes.float64))))
-      sum = math_ops.add_n(sums)
       # SDCA L2 regularization cost is: l2 * sum(weights^2) / 2
-      return l2 * sum / 2.0
+      return l2 * math_ops.add_n(sums) / 2.0
 
   def _convert_n_to_tensor(self, input_list, as_ref=False):
     """Converts input list to a set of tensors."""
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
index 7e214905b13db6a7e2f54f15873f5a9aedb4f44f..ec726bbed41a86eb314e3591ecaedaa6bf0e5e9b 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
@@ -102,7 +102,7 @@ class ShardedMutableDenseHashTable(lookup.LookupInterface):
                        keys.get_shape())
 
   def lookup(self, keys, name=None):
-    if keys.dtype != self._key_dtype:
+    if keys.dtype.base_dtype != self._key_dtype:
       raise TypeError('Signature mismatch. Keys must be dtype %s, got %s.' %
                       (self._key_dtype, keys.dtype))
     self._check_keys(keys)
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
index 701fc1c0597d1de0b0189e86feafbd1c5bbdc818..05794a42c5f2d0eece6adab36fb5610078cece31 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
@@ -19,7 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib import layers
-from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
@@ -154,7 +154,7 @@ def sdca_model_fn(features, labels, mode, params, config=None):
     _add_bias_column(feature_columns, features, bias, columns_to_variables)
 
   def _train_op_fn(unused_loss):
-    global_step = contrib_variables.get_global_step()
+    global_step = training_util.get_global_step()
     sdca_model, train_op = optimizer.get_train_step(
         columns_to_variables, weight_column_name, loss_type, features, labels,
         global_step)
diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
index 52460123cc10ec9b2ee13043fd43f84508b05000..44c4a7e2ca8d019ca602c7f2b492cd1e70b17561 100644
--- a/tensorflow/contrib/lite/BUILD
+++ b/tensorflow/contrib/lite/BUILD
@@ -6,8 +6,11 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts", "gen_selected_ops")
 
+exports_files(["LICENSE"])
+
 exports_files(glob([
     "testdata/*.bin",
+    "testdata/*.pb",
     "models/testdata/*",
 ]))
 
@@ -25,16 +28,35 @@ config_setting(
     },
 )
 
-load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_cc_test",
-)
-
 cc_library(
     name = "schema_fbs_version",
     hdrs = ["version.h"],
 )
 
+cc_library(
+    name = "arena_planner",
+    srcs = ["arena_planner.cc"],
+    hdrs = ["arena_planner.h"],
+    deps = [
+        ":context",
+        ":graph_info",
+        ":memory_planner",
+        ":simple_memory_arena",
+    ],
+)
+
+cc_test(
+    name = "arena_planner_test",
+    size = "small",
+    srcs = ["arena_planner_test.cc"],
+    deps = [
+        ":arena_planner",
+        "//tensorflow/contrib/lite/testing:util",
+        "//tensorflow/core:lib",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 # Main library. No ops are included here.
 # TODO(aselle): Resolve problems preventing C99 usage.
 cc_library(
@@ -43,6 +65,25 @@ cc_library(
     hdrs = ["context.h"],
 )
 
+cc_library(
+    name = "graph_info",
+    hdrs = ["graph_info.h"],
+    deps = [":context"],
+)
+
+cc_library(
+    name = "memory_planner",
+    hdrs = ["memory_planner.h"],
+    deps = [":context"],
+)
+
+cc_library(
+    name = "simple_memory_arena",
+    srcs = ["simple_memory_arena.cc"],
+    hdrs = ["simple_memory_arena.h"],
+    deps = [":context"],
+)
+
 cc_library(
     name = "builtin_op_data",
     hdrs = [
@@ -66,27 +107,31 @@ cc_library(
     srcs = [
         "allocation.cc",
         "error_reporter.cc",
+        "graph_info.cc",
         "interpreter.cc",
         "model.cc",
         "nnapi_delegate.cc",
         "optional_debug_tools.cc",
-        "simple_memory_arena.cc",
     ],
     hdrs = [
         "allocation.h",
         "context.h",
         "error_reporter.h",
+        "graph_info.h",
         "interpreter.h",
         "model.h",
         "nnapi_delegate.h",
         "optional_debug_tools.h",
-        "simple_memory_arena.h",
     ],
     copts = tflite_copts(),
     deps = [
+        ":arena_planner",
         ":builtin_op_data",
         ":context",
+        ":graph_info",
+        ":memory_planner",
         ":schema_fbs_version",
+        ":simple_memory_arena",
         "//tensorflow/contrib/lite/kernels:gemm_support",
         "//tensorflow/contrib/lite/nnapi:nnapi_lib",
         "//tensorflow/contrib/lite/schema:schema_fbs",
@@ -111,6 +156,7 @@ cc_test(
     deps = [
         ":framework",
         ":string_util",
+        "//tensorflow/contrib/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -123,6 +169,22 @@ cc_test(
     deps = [
         ":framework",
         ":string_util",
+        "//tensorflow/contrib/lite/kernels/internal:tensor_utils",
+        "//tensorflow/contrib/lite/schema:schema_fbs",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+# Test graph utils
+cc_test(
+    name = "graph_info_test",
+    size = "small",
+    srcs = ["graph_info_test.cc"],
+    deps = [
+        ":framework",
+        ":string_util",
+        "//tensorflow/contrib/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -133,7 +195,8 @@ cc_test(
     size = "small",
     srcs = ["simple_memory_arena_test.cc"],
     deps = [
-        ":framework",
+        ":simple_memory_arena",
+        "//tensorflow/contrib/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -152,6 +215,7 @@ cc_test(
     ],
     deps = [
         ":framework",
+        "//tensorflow/contrib/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -163,6 +227,7 @@ cc_test(
     srcs = ["context_test.cc"],
     deps = [
         ":framework",
+        "//tensorflow/contrib/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -171,18 +236,18 @@ cc_test(
 
 # Model tests
 
-cc_library(
-    name = "models_test_utils",
-    testonly = 1,
-    hdrs = ["models/test_utils.h"],
-    deps = select({
-        "//tensorflow:android": [],
-        "//conditions:default": [
-            "@com_google_absl//absl/strings",
-            "//tensorflow/core:test",
-        ],
-    }),
-)
+#cc_library(
+#    name = "models_test_utils",
+#    testonly = 1,
+#    hdrs = ["models/test_utils.h"],
+#    deps = select({
+#        "//tensorflow:android": [],
+#        "//conditions:default": [
+#            "@com_google_absl//absl/strings",
+#            "//tensorflow/core:test",
+#        ],
+#    }),
+#)
 
 filegroup(
     name = "all_files",
diff --git a/tensorflow/contrib/lite/Makefile b/tensorflow/contrib/lite/Makefile
index 78402727abdd2742ffff54bf59ca076d8b97b042..7f316292724ea0baaf034d4e914773ad97a957d4 100644
--- a/tensorflow/contrib/lite/Makefile
+++ b/tensorflow/contrib/lite/Makefile
@@ -56,7 +56,7 @@ LIBS := \
 -lz
 
 # If we're on Linux, also link in the dl library.
-ifeq ($(OS),LINUX)
+ifeq ($(HOST_OS),LINUX)
 	LIBS += -ldl -lpthread
 endif
 
diff --git a/tensorflow/contrib/lite/README.md b/tensorflow/contrib/lite/README.md
index c7464bcc9d39b0e884e76f5a3ffa152e98bb0f47..3e55d2a496c1d83ec0501df27deee4e19a5012a7 100644
--- a/tensorflow/contrib/lite/README.md
+++ b/tensorflow/contrib/lite/README.md
@@ -4,7 +4,7 @@ TensorFlow Lite is TensorFlow's lightweight solution for mobile and embedded dev
 TensorFlow Lite uses many techniques for achieving low latency like optimizing the kernels for specific mobile apps, pre-fused activations, quantized kernels that allow smaller and faster (fixed-point math) models, and in the future, leverage specialized machine learning hardware to get the best possible performance for a particular model on a particular device.
 
 ![image](g3doc/TFLite-Architecture.jpg)
-# Getting Started with a Demo App
+# Getting Started with an Android Demo App
 
 This section contains an example application using TensorFlow Lite for Android devices. The demo is a sample camera app that classifies images continuously using a quantized Mobilenet model. A device running Android 5.0 ( API 21) or higher is required to run the demo.
 
@@ -17,7 +17,7 @@ There are 3 ways to get the demo app to your device
 In the demo app, inference is done using the TensorFlow Lite Java API. The demo app classifies frames in real-time, displaying the top most probable classifications. It also displays the time taken to detect the object.
 
 ## Downloading the pre-built binary
-The  fastest path to trying the demo, is to download the pre-built binary
+The fastest path to trying the demo, is to download the pre-built binary
 [TfLiteCameraDemo.apk](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk)
 
 Once the apk is installed, click the app icon to start the app. The first-time the app is opened, the app asks for runtime permissions to access the device camera. The demo app opens the back-camera of the device and recognizes the objects in the camera's field of view. At the bottom of the image (or at the left of the image if the device is in landscape mode), it shows the latency of classification and the top three objects classified.
@@ -69,7 +69,7 @@ android_ndk_repository(
 
 Additional details on building with Android can be found [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/README.md).
 
-### Build the  source code
+### Build the source code
 Run bazel with the following command to build the demo.
 
 Build the demo app:
@@ -86,6 +86,17 @@ environment (due to a Bazel bug).
 ### More about the demo
 The demo is resizing each camera image frame to (224 width * 224 height) to match the  quantized Mobilenet model being used. The resized image is converted into a ByteBuffer row by row of size 1 * 224 * 224 * 3 bytes, where 1 is the number of images in a batch 224 * 224 is the width and height of the image 3 bytes represents three colors of a pixel. This demo uses the TensorFlow Lite Java inference API for models which take a single input and provide a single output. This outputs a two-dimensional array, with the first dimension being the category index and the second dimension being the confidence of classification. The Mobilenet model has 1001 unique categories and the app sorts the probabilities of all the categories and displays the top three. The Mobilenet quantized model is bundled within the assets directory of the app.
 
+# iOS Demo App
+
+Similar to the Android demo app, there's an iOS camera app that uses exactly the same model (224 * 224 quantized Mobilenet).
+
+This demo app requires a camera so it doesn't work with simulators. It need to be executed on a real iOS device. Follow the instructions to build and run the demo app:
+
+1.   Run `third_party/tensorflow/contrib/lite/examples/ios/download_models.sh` to download the model files used by the demo app.
+1.   Install [CocoaPods](https://cocoapods.org/) if it wasn't installed yet: `sudo gem install cocoapods`.
+1.   Run `pod install` in `tensorflow/contrib/lite/examples/ios/camera` to generate the workspace file.
+1.   Open the project by running `open tflite_camera_example.xcworkspace`, and build the app in XCode.
+
 # TensorFlow Lite Quick Start
 
 ## Step 1. Decide which GraphDef to use
@@ -131,7 +142,7 @@ Since we employ several formats, the following definitions may be useful:
 
  - SavedModel - A collection of GraphDef and CheckPoint together with a signature that labels input and output arguments to a model. A GraphDef and Checkpoint can be extracted from a saved model.
 
- - TensorFlow lite model (.lite) - a serialized flatbuffer, containing TensorFlow lite operators and Tensors for the TensorFlow lite interpreter. This is most analogous to TensorFlow frozen GraphDefs.
+ - TensorFlow lite model (.tflite) - a serialized flatbuffer, containing TensorFlow lite operators and Tensors for the TensorFlow lite interpreter. This is most analogous to TensorFlow frozen GraphDefs.
 
 ### Freeze Graph
 To use this .pb GraphDef file within TensorFlow Lite, the application developer will need checkpoints containing trained weight parameters. The .pb contains only the structure of the graph. The process of merging the checkpoint values with the graph structure is known as "freezing" the graph.
@@ -153,17 +164,18 @@ bazel-bin/tensorflow/python/tools/freeze_graph\
 The user has to first build the freeze_graph script using bazel and then run the script.  The input_binary flag has to be enabled to ensure that the protobuf is read and written in binary format.  The user has to input the .pb and the .ckpt files to freeze the graph The output_node_names may not be obvious outside of the code that built the model. The easiest way to find them is to visualize the graph, either with
 graphviz, or [in tensorboard](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2/#3).
 
-This frozen Graphdef is now ready to be converted to flatbuffer format (.lite) for use on Android or iOS.  On Android users have the flexibility to use either the float or quantized versions of the frozen graphdef, if available, using the Tensorflow Optimizing Converter tool.
+This frozen Graphdef is now ready to be converted to flatbuffer format (.tflite) for use on Android or iOS.  On Android users have the flexibility to use either the float or quantized versions of the frozen graphdef, if available, using the Tensorflow Optimizing Converter tool.
 
-Here is a sample command line to convert the frozen Graphdef to '.lite' format for  The Tensorflow Optimizing Converter supports both float and quantized models, however, different configuration parameters are needed depending on whether a FLOAT or QUANTIZED mode is being used.
+Here is a sample command line to convert the frozen Graphdef to '.tflite' format for  The Tensorflow Optimizing Converter supports both float and quantized models, however, different configuration parameters are needed depending on whether a FLOAT or QUANTIZED mode is being used.
+(Here is a link to the pb [file](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz)).
 
 ```
 bazel build tensorflow/contrib/lite/toco:toco
 
-bazel-bin/tensorflow/contrib/lite/toco/toco -- \
+bazel-bin/tensorflow/contrib/lite/toco/toco \
   --input_file=$(pwd)/mobilenet_v1_1.0_224/frozen_graph.pb \
   --input_format=TENSORFLOW_GRAPHDEF  --output_format=TFLITE \
-  --output_file=/tmp/mobilenet_v1_1.0_224.lite --inference_type=FLOAT \
+  --output_file=/tmp/mobilenet_v1_1.0_224.tflite --inference_type=FLOAT \
   --input_type=FLOAT --input_arrays=input \
   --output_arrays=MobilenetV1/Predictions/Reshape_1 --input_shapes=1,224,224,3
 ```
@@ -174,9 +186,9 @@ bazel-bin/tensorflow/contrib/lite/toco/toco -- \
 - Setting the input_array, output_array and input_shape arguments are a bit trickier. The easiest way to find these values is to explore the graph in tensorboard .  The user should reuse the arguments that were used for specifying the output nodes for inference in the `freeze_graph`step.
 
 Note, it is also possible to use the Tensorflow Optimizing Converter through protos either from Python or from the command line see the
-documentation [here](https://github.com/tensorflow/tensorflow/tree/mastertensorflow/contrib/lite/python:toco_from_protos target) A developer can then integrate the conversion step into their model design workflow to ensure that a model will be easily convertible to a mobile inference graph. For example,
+documentation [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/python/toco_from_protos.py). A developer can then integrate the conversion step into their model design workflow to ensure that a model will be easily convertible to a mobile inference graph. For example,
 
-```
+```python
 import tensorflow as tf
 
 img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
@@ -191,9 +203,15 @@ For detailed instructions on how to use the Tensorflow Optimizing Converter, ple
 
 You may refer to the [Ops compatibility guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md) for troubleshooting help. If that doesn't help, please file an [issue](https://github.com/tensorflow/tensorflow/issues).
 
+If you would like to see a visual description of your TensorFlow Lite model after conversion, you can use tensorflow/contrib/lite/tools/visualize.py by running
+```sh
+bazel run tensorflow/contrib/lite/tools:visualize -- model.tflite model_viz.html
+```
+and then visualize the resulting HTML file in a browser.
+
 ## Step 3. Use the TensorFlow Lite model for inference in a mobile app
 
-After completion of Step 2 the developer should have a .lite model.
+After completion of Step 2 the developer should have a .tflite model.
 
 ### For Android
 Because Android apps need to be written in Java, and core TensorFlow is in C++, a JNI library is provided to interface between the two. Its interface is aimed only at inference, so it provides the ability to load a graph, set up inputs, and run the model to calculate particular outputs. The full documentation for the set of methods can be seen [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/). The demo app is also open sourced on [github](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app).
@@ -204,3 +222,7 @@ Note that you'd need to follow instructions for installing TensorFlow on Android
 
 ### For iOS
 Follow the documentation [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/ios.md) to get integrate a TFLite model into your app.
+
+## Core ML support
+
+Core ML is a machine learning framework used across Apple products. In addition to using Tensorflow Lite models directly in their applications, developers have the option to convert their trained Tensorflow models to the [CoreML](https://developer.apple.com/machine-learning/) format for use on Apple devices. For information on how to use the converter please refer to the [Tensorflow-CoreML converter documentation](https://github.com/tf-coreml/tf-coreml).
diff --git a/tensorflow/contrib/lite/allocation.h b/tensorflow/contrib/lite/allocation.h
index ee8a7ccd0b232f9e48095567fd4aefe94f595bc3..68aee2e64473320c461ec8b3f194904e7b8da43c 100644
--- a/tensorflow/contrib/lite/allocation.h
+++ b/tensorflow/contrib/lite/allocation.h
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 // Main abstraction controlling the tflite interpreter.
 // See context.h for the API for defining operations (TfLiteRegistration).
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ALLOCATION_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ALLOCATION_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_ALLOCATION_H_
+#define TENSORFLOW_CONTRIB_LITE_ALLOCATION_H_
 
 #include <cstdio>
 #include <cstdlib>
@@ -91,4 +91,4 @@ class MemoryAllocation : public Allocation {
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ALLOCATION_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_ALLOCATION_H_
diff --git a/tensorflow/contrib/lite/arena_planner.cc b/tensorflow/contrib/lite/arena_planner.cc
new file mode 100644
index 0000000000000000000000000000000000000000..87b17c338e7afc33d32dd9688cc0825ac319dd19
--- /dev/null
+++ b/tensorflow/contrib/lite/arena_planner.cc
@@ -0,0 +1,251 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/arena_planner.h"
+
+namespace tflite {
+
+namespace {
+
+// Memory allocation tuning
+constexpr const int kDefaultArenaAlignment = 64;
+constexpr const int kDefaultTensorAlignment = 4;
+
+}  // namespace
+
+struct AllocationInfo {
+  // The node index requesting this allocation.
+  int node;
+  // The tensor index to be allocated or deallocated.
+  int tensor;
+  // Whether to allocate or deallocate
+  enum { ALLOC, DEALLOC } type;
+};
+
+ArenaPlanner::ArenaPlanner(TfLiteContext* context,
+                           std::unique_ptr<GraphInfo> graph_info)
+    : context_(context),
+      graph_info_(std::move(graph_info)),
+      arena_(kDefaultArenaAlignment),
+      persistent_arena_(kDefaultArenaAlignment) {}
+
+ArenaPlanner::~ArenaPlanner() {}
+
+int64_t ArenaPlanner::BasePointer(TfLiteAllocationType type) {
+  if (type == kTfLiteArenaRwPersistent) {
+    return persistent_arena_.BasePointer();
+  }
+  if (type == kTfLiteArenaRw) {
+    return arena_.BasePointer();
+  }
+  return 0;
+}
+
+TfLiteStatus ArenaPlanner::ResetAllocations() {
+  TF_LITE_ENSURE_STATUS(arena_.Clear());
+  TF_LITE_ENSURE_STATUS(persistent_arena_.Clear());
+  allocs_.clear();
+  allocs_.resize(graph_info_->num_tensors());
+  return kTfLiteOk;
+}
+
+TfLiteStatus ArenaPlanner::PlanAllocations() {
+  // Invalidate any existing data.
+  TF_LITE_ENSURE_STATUS(ResetAllocations());
+
+  // Keeps track of references to each tensor.
+  std::vector<int> refcounts(graph_info_->num_tensors(), 0);
+
+  // There will be an entry in alloc_queue_ for the allocation of each tensor
+  // and another for their deallocation.
+  alloc_queue_.reserve(2 * graph_info_->num_tensors());
+
+  // We must make sure the output tensors are never overwritten. We do that by
+  // artificially adding one to their ref-counts so they are never selected
+  // for deallocation.
+  for (int tensor_index : graph_info_->outputs()) {
+    refcounts[tensor_index]++;
+  }
+
+  // Count references to node input tensors.
+  for (int i = 0; i < graph_info_->num_nodes(); ++i) {
+    const TfLiteNode& node = graph_info_->node(i);
+    TfLiteIntArray* node_inputs = node.inputs;
+    for (int j = 0; j < node_inputs->size; ++j) {
+      int tensor_index = node_inputs->data[j];
+      if (tensor_index != kOptionalTensor) {
+        refcounts[tensor_index]++;
+      }
+    }
+  }
+
+  // Queue all graph inputs for allocation.
+  for (int tensor_index : graph_info_->inputs()) {
+    if (tensor_index != kOptionalTensor) {
+      alloc_queue_.push_back({0, tensor_index, AllocationInfo::ALLOC});
+    }
+  }
+
+  // Go through the graph in execution order.
+  for (int i = 0; i < graph_info_->num_nodes(); ++i) {
+    const TfLiteNode& node = graph_info_->node(i);
+
+    // First queue output tensors for allocation.
+    TfLiteIntArray* node_outputs = node.outputs;
+    for (int j = 0; j < node_outputs->size; ++j) {
+      int tensor_index = node_outputs->data[j];
+      alloc_queue_.push_back({i, tensor_index, AllocationInfo::ALLOC});
+    }
+
+    // Then update the ref-counts of the node's inputs, and if necessary queue
+    // them for deallocation.
+    TfLiteIntArray* node_inputs = node.inputs;
+    for (int j = 0; j < node_inputs->size; ++j) {
+      int tensor_index = node_inputs->data[j];
+      if (tensor_index != kOptionalTensor) {
+        refcounts[tensor_index]--;
+        if (refcounts[tensor_index] == 0) {
+          alloc_queue_.push_back({i, tensor_index, AllocationInfo::DEALLOC});
+        }
+      }
+    }
+  }
+
+  // Note that graph outputs will never be scheduled for deallocation. We
+  // could do that here for completeness, but it won't have any effect.
+  return kTfLiteOk;
+}
+
+TfLiteStatus ArenaPlanner::ExecuteAllocations(int first_node, int last_node) {
+  TF_LITE_ENSURE_STATUS(CalculateAllocations(first_node, last_node));
+  TF_LITE_ENSURE_STATUS(Commit());
+
+  for (int i = 0; i < graph_info_->num_tensors(); ++i) {
+    // TODO(ahentz): we could do this only for the tensors that were modified
+    // in CalculateAllocations(), instead of redoing it for tensors that
+    // already had proper pointers. However we must be very careful, because
+    // SimpleMemoryArena::Commit() could move the base pointer.
+    TF_LITE_ENSURE_STATUS(ResolveTensorAllocation(i));
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus ArenaPlanner::Commit() {
+  TF_LITE_ENSURE_STATUS(arena_.Commit(context_));
+  TF_LITE_ENSURE_STATUS(persistent_arena_.Commit(context_));
+  return kTfLiteOk;
+}
+
+TfLiteStatus ArenaPlanner::CalculateAllocations(int first_node, int last_node) {
+  int active_node = first_node;
+  // When dynamic tensors are present this method is called multiple times.
+  // The items in the alloc_queue_ referring to nodes before first_node were
+  // processed previously and should be skipped. Entries after last_node are
+  // not yet ready to be handled.
+  for (const auto& alloc_info : alloc_queue_) {
+    if (alloc_info.node < first_node) continue;
+    if (alloc_info.node > last_node) break;
+    if (alloc_info.node == active_node) {
+      // This is the first allocation/deallocation for a given node.  It is
+      // time to deallocate the previous temporaries and allocate new ones.
+      if (active_node != first_node) {
+        TF_LITE_ENSURE_STATUS(
+            CalculateDeallocationOfInternalTensors(active_node - 1));
+      }
+      TF_LITE_ENSURE_STATUS(CalculateAllocationOfInternalTensors(active_node));
+      ++active_node;
+    }
+    // Handle the current item.
+    if (alloc_info.type == AllocationInfo::ALLOC) {
+      TF_LITE_ENSURE_STATUS(CalculateTensorAllocation(alloc_info.tensor));
+    } else {
+      TF_LITE_ENSURE_STATUS(CalculateTensorDeallocation(alloc_info.tensor));
+    }
+  }
+
+  // Don't forget to deallocate temporaries of last node.
+  TF_LITE_ENSURE_STATUS(
+      CalculateDeallocationOfInternalTensors(active_node - 1));
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus ArenaPlanner::ResolveTensorAllocation(int tensor_index) {
+  TfLiteTensor& tensor = *graph_info_->tensor(tensor_index);
+  if (tensor.allocation_type == kTfLiteArenaRw) {
+    // Skip resolution if the size of the tensor is zero, leaving it as a
+    // nullptr.
+    if (allocs_[tensor_index].size != 0) {
+      TF_LITE_ENSURE_STATUS(arena_.ResolveAlloc(context_, allocs_[tensor_index],
+                                                &tensor.data.raw));
+    }
+  }
+  if (tensor.allocation_type == kTfLiteArenaRwPersistent) {
+    TF_LITE_ENSURE_STATUS(persistent_arena_.ResolveAlloc(
+        context_, allocs_[tensor_index], &tensor.data.raw));
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus ArenaPlanner::CalculateTensorAllocation(int tensor_index) {
+  TfLiteTensor& tensor = *graph_info_->tensor(tensor_index);
+  if (tensor.allocation_type == kTfLiteArenaRw) {
+    TF_LITE_ENSURE_STATUS(arena_.Allocate(context_, kDefaultTensorAlignment,
+                                          tensor.bytes,
+                                          &allocs_[tensor_index]));
+  }
+  if (tensor.allocation_type == kTfLiteArenaRwPersistent) {
+    TF_LITE_ENSURE_STATUS(
+        persistent_arena_.Allocate(context_, kDefaultTensorAlignment,
+                                   tensor.bytes, &allocs_[tensor_index]));
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus ArenaPlanner::CalculateTensorDeallocation(int tensor_index) {
+  TfLiteTensor& tensor = *graph_info_->tensor(tensor_index);
+  if (tensor.allocation_type == kTfLiteArenaRw) {
+    TF_LITE_ENSURE_STATUS(arena_.Deallocate(context_, allocs_[tensor_index]));
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus ArenaPlanner::CalculateAllocationOfInternalTensors(
+    int node_index) {
+  if (node_index < graph_info_->num_nodes()) {
+    const TfLiteNode& node = graph_info_->node(node_index);
+    TfLiteIntArray* node_temporaries = node.temporaries;
+    for (int i = 0; i < node_temporaries->size; ++i) {
+      int tensor_index = node_temporaries->data[i];
+      TF_LITE_ENSURE_STATUS(CalculateTensorAllocation(tensor_index));
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus ArenaPlanner::CalculateDeallocationOfInternalTensors(
+    int node_index) {
+  if (node_index < graph_info_->num_nodes()) {
+    const TfLiteNode& node = graph_info_->node(node_index);
+    TfLiteIntArray* node_temporaries = node.temporaries;
+    for (int i = 0; i < node_temporaries->size; ++i) {
+      int tensor_index = node_temporaries->data[i];
+      TF_LITE_ENSURE_STATUS(CalculateTensorDeallocation(tensor_index));
+    }
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/arena_planner.h b/tensorflow/contrib/lite/arena_planner.h
new file mode 100644
index 0000000000000000000000000000000000000000..58bc164619c2c053b9492e9a0e5de2da30e199af
--- /dev/null
+++ b/tensorflow/contrib/lite/arena_planner.h
@@ -0,0 +1,107 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_ARENA_PLANNER_H_
+#define TENSORFLOW_CONTRIB_LITE_ARENA_PLANNER_H_
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/graph_info.h"
+#include "tensorflow/contrib/lite/memory_planner.h"
+#include "tensorflow/contrib/lite/simple_memory_arena.h"
+
+namespace tflite {
+
+class AllocationInfo;
+
+// A memory planner that makes all the allocations using arenas.
+//
+// Before a model is executed by the interpreter, this class determines when
+// each tensor needs to be allocated and deallocated, and preallocates all the
+// necessary memory (the PlanAllocations phase). It then assigns portions of
+// this memory buffer to each tensor (the ExecuteAllocations phase). Tensors may
+// share some of the bufer if a tensor B is to be allocated after another tensor
+// A has been deallocated.
+//
+// If dynamic tensors are used the planning steps can be repeated during model
+// execution. Since dynamic tensors don't have sizes until after the
+// corresponding operation is executed, this class supports incremental
+// planning.
+class ArenaPlanner : public MemoryPlanner {
+ public:
+  // Ownership of 'context' is not taken and it must remain util the
+  // ArenaPlanner is destroyed.
+  ArenaPlanner(TfLiteContext* context, std::unique_ptr<GraphInfo> graph_info);
+  ~ArenaPlanner() override;
+  ArenaPlanner(const ArenaPlanner&) = delete;
+  ArenaPlanner& operator=(const ArenaPlanner&) = delete;
+
+  TfLiteStatus ResetAllocations() override;
+  TfLiteStatus PlanAllocations() override;
+  TfLiteStatus ExecuteAllocations(int first_node, int last_node) override;
+
+  // Returns the base arena location for a given allocation type.
+  int64_t BasePointer(TfLiteAllocationType type);
+
+ private:
+  // Make sure all the arenas have reserved enough memory to store all their
+  // tensors.
+  TfLiteStatus Commit();
+
+  // Traverse the allocation queue and reserve space in the appropriate arena
+  // for all tensors affected by ops in the interval [first_node, last_node].
+  TfLiteStatus CalculateAllocations(int first_node, int last_node);
+
+  // Assign absolute memory location to a tensor, based on its relative
+  // position inside the corresponding arena buffer.
+  TfLiteStatus ResolveTensorAllocation(int tensor_index);
+
+  // Register an allocation for the given tensor.
+  TfLiteStatus CalculateTensorAllocation(int tensor_index);
+
+  // Register a deallocation for the given tensor.
+  TfLiteStatus CalculateTensorDeallocation(int tensor_index);
+
+  // Register an allocation for all internal (temporary) tensors of
+  // 'node_index'.
+  TfLiteStatus CalculateAllocationOfInternalTensors(int node_index);
+
+  // Register a deallocation for all internal (temporary) tensors of
+  // 'node_index'.
+  TfLiteStatus CalculateDeallocationOfInternalTensors(int node_index);
+
+  TfLiteContext* context_;
+  std::unique_ptr<GraphInfo> graph_info_;
+
+  // Stores allocation data for all tensors.
+  std::vector<ArenaAlloc> allocs_;
+
+  // A chronological list of instructions to allocated and deallocate tensors,
+  // reflecting the way they are used in the graph.
+  std::vector<AllocationInfo> alloc_queue_;
+
+  // Raw memory buffer that is allocated for all temporary and graph outputs.
+  // that are declared kTfLiteArenaRw.
+  SimpleMemoryArena arena_;
+
+  // Raw memory buffer that is allocated for persistent tensors that are
+  // declared as kTfLiteArenaRwPersistent.
+  SimpleMemoryArena persistent_arena_;
+};
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_ARENA_PLANNER_H_
diff --git a/tensorflow/contrib/lite/arena_planner_test.cc b/tensorflow/contrib/lite/arena_planner_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a8a8755e2c9e81474f2ff9cd2b85c0eb3d5c3441
--- /dev/null
+++ b/tensorflow/contrib/lite/arena_planner_test.cc
@@ -0,0 +1,468 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/arena_planner.h"
+
+#include <cstdarg>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tflite {
+namespace {
+
+// A simple op to be used in tests, as syntactic sugar.
+class TestOp {
+ public:
+  TestOp(std::initializer_list<int> inputs, std::initializer_list<int> outputs,
+         std::initializer_list<int> temporaries)
+      : inputs_(inputs), outputs_(outputs), temporaries_(temporaries) {}
+
+  const std::vector<int>& inputs() const { return inputs_; }
+  const std::vector<int>& outputs() const { return outputs_; }
+  const std::vector<int>& temporaries() const { return temporaries_; }
+
+ private:
+  std::vector<int> inputs_;
+  std::vector<int> outputs_;
+  std::vector<int> temporaries_;
+};
+
+// A test graph where inputs are processed by the given nodes to produce
+// outputs.
+class TestGraph {
+ public:
+  TestGraph(std::initializer_list<int> inputs,
+            std::initializer_list<TestOp> nodes,
+            std::initializer_list<int> outputs)
+      : inputs_(inputs), outputs_(outputs) {
+    int max_tensor_index = 0;
+
+    for (int t : inputs) {
+      max_tensor_index = std::max(max_tensor_index, t);
+    }
+    for (int t : outputs) {
+      max_tensor_index = std::max(max_tensor_index, t);
+    }
+    for (const auto& node : nodes) {
+      auto int_array = [](const std::vector<int>& x) {
+        TfLiteIntArray* lite = TfLiteIntArrayCreate(x.size());
+        for (size_t i = 0; i < x.size(); i++) lite->data[i] = x[i];
+        return lite;
+      };
+
+      nodes_.push_back(TfLiteNode());
+      nodes_.back().inputs = int_array(node.inputs());
+      for (int t : node.inputs()) {
+        max_tensor_index = std::max(max_tensor_index, t);
+      }
+      nodes_.back().outputs = int_array(node.outputs());
+      for (int t : node.outputs()) {
+        max_tensor_index = std::max(max_tensor_index, t);
+      }
+      nodes_.back().temporaries = int_array(node.temporaries());
+      for (int t : node.temporaries()) {
+        max_tensor_index = std::max(max_tensor_index, t);
+      }
+    }
+
+    for (int i = 0; i <= max_tensor_index; ++i) {
+      tensors_.push_back(TfLiteTensor());
+      // Set some default values for allocation_type and bytes, which are the
+      // only fields used by the arena planner.
+      tensors_.back().allocation_type = kTfLiteArenaRw;
+      tensors_.back().bytes = (i + 1) * 3;
+    }
+  }
+
+  ~TestGraph() {
+    for (auto node : nodes_) {
+      TfLiteIntArrayFree(node.inputs);
+      TfLiteIntArrayFree(node.outputs);
+      TfLiteIntArrayFree(node.temporaries);
+    }
+  }
+
+  const std::vector<TfLiteNode>& nodes() { return nodes_; }
+  std::vector<TfLiteTensor>* tensors() { return &tensors_; }
+  const std::vector<int>& inputs() { return inputs_; }
+  const std::vector<int>& outputs() { return outputs_; }
+
+ private:
+  std::vector<TfLiteNode> nodes_;
+  std::vector<TfLiteTensor> tensors_;
+  std::vector<int> inputs_;
+  std::vector<int> outputs_;
+};
+
+// The GraphInfo for a TestGraph.
+class TestGraphInfo : public GraphInfo {
+ public:
+  explicit TestGraphInfo(TestGraph* graph) : graph_(graph) {}
+
+  size_t num_tensors() const override { return graph_->tensors()->size(); }
+  TfLiteTensor* tensor(size_t index) override {
+    return &graph_->tensors()->at(index);
+  }
+  size_t num_nodes() const override { return graph_->nodes().size(); }
+  const TfLiteNode& node(size_t index) const override {
+    return graph_->nodes()[index];
+  }
+  const std::vector<int>& inputs() const override { return graph_->inputs(); }
+  const std::vector<int>& outputs() const override { return graph_->outputs(); }
+
+ private:
+  TestGraph* graph_;
+};
+
+void ReportError(TfLiteContext* context, const char* format, ...) {
+  const size_t kBufferSize = 1024;
+  char temp_buffer[kBufferSize];
+
+  va_list args;
+  va_start(args, format);
+  vsnprintf(temp_buffer, kBufferSize, format, args);
+  va_end(args);
+
+  LOG(INFO) << temp_buffer;
+}
+
+class ArenaPlannerTest : public ::testing::Test {
+ protected:
+  void SetGraph(TestGraph* graph) {
+    graph_ = graph;
+    context_.ReportError = ReportError;
+    planner_.reset(new ArenaPlanner(
+        &context_, std::unique_ptr<GraphInfo>(new TestGraphInfo(graph))));
+    CHECK(planner_->ResetAllocations() == kTfLiteOk);
+    CHECK(planner_->PlanAllocations() == kTfLiteOk);
+  }
+
+  void Execute(int start, int end) {
+    CHECK(planner_->ExecuteAllocations(start, end) == kTfLiteOk);
+  }
+
+  // Returns the actual offset of a given tensor, relative to the start of its
+  // arena.
+  int64_t GetOffset(int tensor_index) {
+    const TfLiteTensor& tensor = (*graph_->tensors())[tensor_index];
+    return reinterpret_cast<int64_t>(tensor.data.raw) -
+           planner_->BasePointer(tensor.allocation_type);
+  }
+
+  // Returns the first aligned offset after a given tensor.
+  int64_t GetOffsetAfter(int tensor_index) {
+    const TfLiteTensor& tensor = (*graph_->tensors())[tensor_index];
+    int64_t offset = GetOffset(tensor_index) + tensor.bytes;
+    // We must make sure the offset is aligned to kDefaultArenaAlignment.
+    if (offset % 4 != 0) {
+      offset += 4 - offset % 4;
+    }
+    return offset;
+  };
+
+  TfLiteContext context_;
+  TestGraph* graph_;
+  std::unique_ptr<ArenaPlanner> planner_;
+};
+
+TEST_F(ArenaPlannerTest, EmptyGraph) {
+  TestGraph graph({}, {}, {});
+  SetGraph(&graph);
+  Execute(0, 10);
+}
+
+TEST_F(ArenaPlannerTest, GraphWithNoOps) {
+  TestGraph graph({0, 10}, {}, {5, 11});
+  SetGraph(&graph);
+  Execute(0, 10);
+  EXPECT_EQ(GetOffset(0), 0);
+  EXPECT_EQ(GetOffset(10), GetOffsetAfter(0));
+  // The outputs are never allocated because they are not connected to any
+  // inputs.
+  EXPECT_TRUE((*graph.tensors())[5].data.raw == nullptr);
+  EXPECT_TRUE((*graph.tensors())[11].data.raw == nullptr);
+}
+
+TEST_F(ArenaPlannerTest, GraphWithOneOp) {
+  TestGraph graph({1}, {{{1}, {2}, {}}}, {2});
+  SetGraph(&graph);
+  Execute(0, 10);
+  EXPECT_EQ(GetOffset(1), 0);
+  EXPECT_EQ(GetOffset(2), GetOffsetAfter(1));
+}
+
+TEST_F(ArenaPlannerTest, ZeroSizedTensors) {
+  TestGraph graph({1}, {{{1}, {2}, {}}}, {2});
+  (*graph.tensors())[1].bytes = 0;
+  SetGraph(&graph);
+  // TODO(ahentz): this is currently broken because the arena finds two
+  // allocations with the same offset and returns an error.
+  ASSERT_FALSE(planner_->ExecuteAllocations(0, 10) == kTfLiteOk);
+  // EXPECT_EQ(GetOffset(1), 0);
+  // EXPECT_EQ(GetOffset(2), GetOffsetAfter(1));
+}
+
+TEST_F(ArenaPlannerTest, SimpleGraph) {
+  TestGraph graph({0, 1},
+                  {
+                      /* in, out, tmp */
+                      {{0, 1}, {2}, {}},     // First op
+                      {{2, 0}, {4, 5}, {}},  // Second op
+                      {{4, 5}, {3}, {}}      // Third op
+                  },
+                  {3});
+  SetGraph(&graph);
+  Execute(0, 10);
+
+  // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +4 +5 -2 -0 +3 -4 -5
+  EXPECT_EQ(GetOffset(0), 0);
+  EXPECT_EQ(GetOffset(1), GetOffsetAfter(0));
+  EXPECT_EQ(GetOffset(2), GetOffsetAfter(1));
+  EXPECT_EQ(GetOffset(4), GetOffsetAfter(2));
+  EXPECT_EQ(GetOffset(5), GetOffsetAfter(4));
+  EXPECT_EQ(GetOffset(3), 0);
+}
+
+TEST_F(ArenaPlannerTest, SimpleGraphWithTemporary) {
+  TestGraph graph({0, 1},
+                  {
+                      /* in, out, tmp */
+                      {{0, 1}, {2}, {}},   // First op
+                      {{2, 0}, {4}, {5}},  // Second op, with temporary
+                      {{4}, {3}, {}}       // Third op
+                  },
+                  {3});
+  SetGraph(&graph);
+  Execute(0, 10);
+
+  // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +5 +4 -2 -0 -5 +3 -4
+  EXPECT_EQ(GetOffset(0), 0);
+  EXPECT_EQ(GetOffset(1), GetOffsetAfter(0));
+  EXPECT_EQ(GetOffset(2), GetOffsetAfter(1));
+  EXPECT_EQ(GetOffset(5), GetOffsetAfter(2));
+  EXPECT_EQ(GetOffset(4), GetOffsetAfter(5));
+  EXPECT_EQ(GetOffset(3), 0);
+}
+
+TEST_F(ArenaPlannerTest, SimpleGraphWithOptionals) {
+  TestGraph graph({0, -1, 1},
+                  {
+                      /* in, out, tmp */
+                      {{0, 1}, {2}, {}},     // First op
+                      {{2, 0}, {4, 5}, {}},  // Second op
+                      {{4, -1, 5}, {3}, {}}  // Third op, with optional
+                  },
+                  {3});
+  SetGraph(&graph);
+  Execute(0, 10);
+
+  // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +4 +5 -2 -0 +3 -4 -5
+  EXPECT_EQ(GetOffset(0), 0);
+  EXPECT_EQ(GetOffset(1), GetOffsetAfter(0));
+  EXPECT_EQ(GetOffset(2), GetOffsetAfter(1));
+  EXPECT_EQ(GetOffset(4), GetOffsetAfter(2));
+  EXPECT_EQ(GetOffset(5), GetOffsetAfter(4));
+  EXPECT_EQ(GetOffset(3), 0);
+}
+
+TEST_F(ArenaPlannerTest, SimpleGraphWithLargeTensor) {
+  TestGraph graph({0, -1, 1},
+                  {
+                      /* in, out, tmp */
+                      {{0, 1}, {2}, {}},   // First op
+                      {{2, 0}, {4}, {5}},  // Second op, with temporary
+                      {{4, -1}, {3}, {}}   // Third op, with optional
+                  },
+                  {3});
+
+  // Make #1 very large so its vacancy can be filled with #5 and #4.
+  (*graph.tensors())[1].bytes = 40;
+
+  SetGraph(&graph);
+  Execute(0, 10);
+
+  // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +5 +4 -2 -0 -5 +3 -4
+  EXPECT_EQ(GetOffset(0), 0);
+  EXPECT_EQ(GetOffset(1), GetOffsetAfter(0));
+  EXPECT_EQ(GetOffset(2), GetOffsetAfter(1));
+  EXPECT_EQ(GetOffset(5), GetOffsetAfter(0));
+  EXPECT_EQ(GetOffset(4), GetOffsetAfter(5));
+  EXPECT_EQ(GetOffset(3), 0);
+}
+
+TEST_F(ArenaPlannerTest, SimpleGraphWithPersistentTensor) {
+  TestGraph graph({0, -1, 1},
+                  {
+                      /* in, out, tmp */
+                      {{0, 1}, {2}, {}},   // First op
+                      {{2, 0}, {4}, {5}},  // Second op, with temporary
+                      {{4, -1}, {3}, {}}   // Third op, with optional
+                  },
+                  {3});
+
+  // Make #1 persistent so it goes into its own arena.
+  (*graph.tensors())[1].allocation_type = kTfLiteArenaRwPersistent;
+
+  SetGraph(&graph);
+  Execute(0, 10);
+
+  // Make sure #0 and #1 were given different memory locations (because they
+  // will both have offset=0, in different arenas.)
+  EXPECT_NE((*graph.tensors())[0].data.raw, (*graph.tensors())[1].data.raw);
+
+  // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +5 +4 -2 -0 -5 +3 -4
+  EXPECT_EQ(GetOffset(0), 0);
+  EXPECT_EQ(GetOffset(1), 0);
+  EXPECT_EQ(GetOffset(2), GetOffsetAfter(0));
+  EXPECT_EQ(GetOffset(5), GetOffsetAfter(2));
+  EXPECT_EQ(GetOffset(4), GetOffsetAfter(5));
+  EXPECT_EQ(GetOffset(3), 0);
+}
+
+TEST_F(ArenaPlannerTest, SimpleGraphWithDynamicTensor) {
+  TestGraph graph({0, -1, 1},
+                  {
+                      /* in, out, tmp */
+                      {{0, 1}, {2}, {}},   // First op
+                      {{2, 0}, {4}, {5}},  // Second op, with temporary
+                      {{4, -1}, {3}, {}}   // Third op, with optional
+                  },
+                  {3});
+
+  // Make #1 dynaic so it does not get allocated.
+  (*graph.tensors())[1].allocation_type = kTfLiteDynamic;
+
+  SetGraph(&graph);
+  Execute(0, 10);
+
+  EXPECT_EQ((*graph.tensors())[1].data.raw, nullptr);
+
+  // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +5 +4 -2 -0 -5 +3 -4
+  EXPECT_EQ(GetOffset(0), 0);
+  EXPECT_EQ(GetOffset(2), GetOffsetAfter(0));
+  EXPECT_EQ(GetOffset(5), GetOffsetAfter(2));
+  EXPECT_EQ(GetOffset(4), GetOffsetAfter(5));
+  EXPECT_EQ(GetOffset(3), 0);
+}
+
+TEST_F(ArenaPlannerTest, LargerGraphAndStepwiseAllocation) {
+  TestGraph graph({0, 1},
+                  {
+                      /* in, out, tmp */
+                      {{0, 1}, {2, 3}, {}},
+                      {{2, 0}, {4, 5}, {6}},
+                      {{1, -1}, {7}, {}},
+                      {{7, 3}, {8}, {9}},
+                      {{4, 5, 8}, {10}, {}},
+                  },
+                  {10});
+  SetGraph(&graph);
+
+  auto is_unallocated = [&](int tensor_index) {
+    return (*graph.tensors())[tensor_index].data.raw == nullptr;
+  };
+
+  // The allocation plan is made at the beginning and is independent of
+  // the execution steps. Here's the allocation order:
+  //   Op0: +0 +1 +2 +3
+  //   Op1: +6 +4 +5 -6 -0 -2
+  //   Op2: +7 -1
+  //   Op3: +9 +8 -9 -3 -7
+  //   Op4: +10 -4 -5 -8
+
+  Execute(0, 0);
+  EXPECT_EQ(GetOffset(0), 0);
+  EXPECT_EQ(GetOffset(1), GetOffsetAfter(0));
+  EXPECT_EQ(GetOffset(2), GetOffsetAfter(1));
+  EXPECT_EQ(GetOffset(3), GetOffsetAfter(2));
+  EXPECT_TRUE(is_unallocated(6));
+  EXPECT_TRUE(is_unallocated(4));
+  EXPECT_TRUE(is_unallocated(5));
+  EXPECT_TRUE(is_unallocated(7));
+  EXPECT_TRUE(is_unallocated(9));
+  EXPECT_TRUE(is_unallocated(8));
+  EXPECT_TRUE(is_unallocated(10));
+
+  Execute(1, 1);
+  EXPECT_EQ(GetOffset(0), 0);
+  EXPECT_EQ(GetOffset(1), GetOffsetAfter(0));
+  EXPECT_EQ(GetOffset(2), GetOffsetAfter(1));
+  EXPECT_EQ(GetOffset(3), GetOffsetAfter(2));
+  EXPECT_EQ(GetOffset(6), GetOffsetAfter(3));
+  EXPECT_EQ(GetOffset(4), GetOffsetAfter(6));
+  EXPECT_EQ(GetOffset(5), GetOffsetAfter(4));
+  EXPECT_TRUE(is_unallocated(7));
+  EXPECT_TRUE(is_unallocated(9));
+  EXPECT_TRUE(is_unallocated(8));
+  EXPECT_TRUE(is_unallocated(10));
+
+  Execute(2, 2);
+  EXPECT_EQ(GetOffset(0), 0);
+  EXPECT_EQ(GetOffset(1), GetOffsetAfter(0));
+  EXPECT_EQ(GetOffset(2), GetOffsetAfter(1));
+  EXPECT_EQ(GetOffset(3), GetOffsetAfter(2));
+  EXPECT_EQ(GetOffset(6), GetOffsetAfter(3));
+  EXPECT_EQ(GetOffset(4), GetOffsetAfter(6));
+  EXPECT_EQ(GetOffset(5), GetOffsetAfter(4));
+  // Here's an interesting allocation. Even though #6 requires only 21 bytes,
+  // its deallocation freed up 24 bytes due to the alignment requirements in
+  // the arena. That means we can fit #7 in the same space!
+  EXPECT_EQ(GetOffset(7), GetOffsetAfter(3));
+  EXPECT_TRUE(is_unallocated(9));
+  EXPECT_TRUE(is_unallocated(8));
+  EXPECT_TRUE(is_unallocated(10));
+
+  Execute(3, 3);
+  EXPECT_EQ(GetOffset(0), 0);
+  EXPECT_EQ(GetOffset(1), GetOffsetAfter(0));
+  EXPECT_EQ(GetOffset(2), GetOffsetAfter(1));
+  EXPECT_EQ(GetOffset(3), GetOffsetAfter(2));
+  EXPECT_EQ(GetOffset(6), GetOffsetAfter(3));
+  EXPECT_EQ(GetOffset(4), GetOffsetAfter(6));
+  EXPECT_EQ(GetOffset(5), GetOffsetAfter(4));
+  EXPECT_EQ(GetOffset(7), GetOffsetAfter(3));
+  // The deallocation of #0, #1 and #2 freed up 24 bytes but that's not enough
+  // for #9, so it goes at the end.
+  EXPECT_EQ(GetOffset(9), GetOffsetAfter(5));
+  EXPECT_EQ(GetOffset(8), GetOffsetAfter(9));
+  EXPECT_TRUE(is_unallocated(10));
+
+  Execute(4, 4);
+  EXPECT_EQ(GetOffset(0), 0);
+  EXPECT_EQ(GetOffset(1), GetOffsetAfter(0));
+  EXPECT_EQ(GetOffset(2), GetOffsetAfter(1));
+  EXPECT_EQ(GetOffset(3), GetOffsetAfter(2));
+  EXPECT_EQ(GetOffset(6), GetOffsetAfter(3));
+  EXPECT_EQ(GetOffset(4), GetOffsetAfter(6));
+  EXPECT_EQ(GetOffset(5), GetOffsetAfter(4));
+  EXPECT_EQ(GetOffset(7), GetOffsetAfter(3));
+  EXPECT_EQ(GetOffset(9), GetOffsetAfter(5));
+  EXPECT_EQ(GetOffset(8), GetOffsetAfter(9));
+  // There's just enough space at the beginning for #10 due to the
+  // deallocation of #0, #1, #2 and #3 (total 36 bytes, #10 needs
+  // only 33.)
+  EXPECT_EQ(GetOffset(10), 0);
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index e3c9cdd99beb93e356c148298dcbe6498fbe0306..19829e4991651111e13fc1805f97daef8bc016a7 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -5,25 +5,25 @@ def tflite_copts():
   copts = [
       "-DFARMHASH_NO_CXX_STRING",
   ] + select({
-          "//tensorflow:android_arm64": [
+          str(Label("//tensorflow:android_arm64")): [
               "-std=c++11",
               "-O3",
           ],
-          "//tensorflow:android_arm": [
+          str(Label("//tensorflow:android_arm")): [
               "-mfpu=neon",
               "-mfloat-abi=softfp",
               "-std=c++11",
               "-O3",
           ],
-          "//tensorflow:android_x86": [
+          str(Label("//tensorflow:android_x86")): [
               "-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK",
           ],
-          "//tensorflow:ios_x86_64": [
+          str(Label("//tensorflow:ios_x86_64")): [
               "-msse4.1",
           ],
           "//conditions:default": [],
   }) + select({
-      "//tensorflow:with_default_optimizations": [],
+      str(Label("//tensorflow:with_default_optimizations")): [],
       "//conditions:default": ["-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK"],
   })
 
@@ -89,11 +89,11 @@ def tflite_jni_linkopts():
   return tflite_jni_linkopts_unstripped() + select({
       "//tensorflow:android": [
           "-s",  # Omit symbol table.
+          "-latomic",  # Required for some uses of ISO C++11 <atomic> in x86.
       ],
       "//conditions:default": [],
   })
 
-
 def tflite_jni_binary(name,
                       copts=tflite_copts(),
                       linkopts=tflite_jni_linkopts(),
@@ -223,11 +223,12 @@ def gen_selected_ops(name, model):
   """
   out = name + "_registration.cc"
   tool = "//tensorflow/contrib/lite/tools:generate_op_registrations"
+  tflite_path = "//tensorflow/contrib/lite"
   native.genrule(
       name = name,
       srcs = [model],
       outs = [out],
-      cmd = ("$(location %s) --input_model=$(location %s) --output_registration=$(location %s)")
-      % (tool, model, out),
+      cmd = ("$(location %s) --input_model=$(location %s) --output_registration=$(location %s) --tflite_path=%s")
+      % (tool, model, out, tflite_path[2:]),
       tools = [tool],
   )
diff --git a/tensorflow/contrib/lite/build_ios_universal_lib.sh b/tensorflow/contrib/lite/build_ios_universal_lib.sh
index e0f2ef768bfed544ed8acd6c0e3a5823e61a1e8c..4a9023ff33de15dd384531d51e39de4ffeecdb8b 100755
--- a/tensorflow/contrib/lite/build_ios_universal_lib.sh
+++ b/tensorflow/contrib/lite/build_ios_universal_lib.sh
@@ -1,5 +1,24 @@
 #!/bin/bash -x
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
 set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR/../../.."
+
 make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=x86_64 -j 8
 make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=i386 -j 8
 make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7 -j 8
diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index 93072bf90bd8a18d9011a74c2eec95d86dbdce8a..5dbeadd16582ec586adab100b8a46e10182bd5ee 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_BUILTIN_OP_DATA_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_BUILTIN_OP_DATA_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_BUILTIN_OP_DATA_H_
+#define TENSORFLOW_CONTRIB_LITE_BUILTIN_OP_DATA_H_
 
 #include <stdint.h>
 
@@ -83,7 +83,14 @@ typedef struct {
   TfLiteFusedActivation activation;
 } TfLiteRNNParams;
 
-typedef struct { TfLiteFusedActivation activation; } TfLiteFullyConnectedParams;
+typedef struct {
+  bool time_major;
+  TfLiteFusedActivation activation;
+} TfLiteSequenceRNNParams;
+
+typedef struct {
+  TfLiteFusedActivation activation;
+} TfLiteFullyConnectedParams;
 
 typedef enum {
   kTfLiteLshProjectionUnknown = 0,
@@ -91,9 +98,13 @@ typedef enum {
   kTfLiteLshProjectionDense = 2,
 } TfLiteLSHProjectionType;
 
-typedef struct { TfLiteLSHProjectionType type; } TfLiteLSHProjectionParams;
+typedef struct {
+  TfLiteLSHProjectionType type;
+} TfLiteLSHProjectionParams;
 
-typedef struct { float beta; } TfLiteSoftmaxParams;
+typedef struct {
+  float beta;
+} TfLiteSoftmaxParams;
 
 typedef struct {
   int axis;
@@ -104,10 +115,24 @@ typedef struct {
   TfLiteFusedActivation activation;
 } TfLiteAddParams;
 
+typedef struct {
+} TfLiteSpaceToBatchNDParams;
+
+typedef struct {
+} TfLiteBatchToSpaceNDParams;
+
 typedef struct {
   TfLiteFusedActivation activation;
 } TfLiteMulParams;
 
+typedef struct {
+  TfLiteFusedActivation activation;
+} TfLiteSubParams;
+
+typedef struct {
+  TfLiteFusedActivation activation;
+} TfLiteDivParams;
+
 typedef struct {
   TfLiteFusedActivation activation;
 } TfLiteL2NormParams;
@@ -126,10 +151,12 @@ typedef struct {
 } TfLiteLSTMParams;
 
 typedef struct {
-  int new_height;
-  int new_width;
+  bool align_corners;
 } TfLiteResizeBilinearParams;
 
+typedef struct {
+} TfLitePadParams;
+
 typedef struct {
   // TODO(ahentz): We can't have dynamic data in this struct, at least not yet.
   // For now we will fix the maximum possible number of dimensions.
@@ -157,8 +184,34 @@ typedef struct {
   TfLiteCombinerType combiner;
 } TfLiteEmbeddingLookupSparseParams;
 
+typedef struct {
+  int axis;
+} TfLiteGatherParams;
+
+typedef struct {
+} TfLiteTransposeParams;
+
+typedef struct {
+  bool keep_dims;
+} TfLiteMeanParams;
+
+typedef struct {
+  // TODO(ahentz): We can't have dynamic data in this struct, at least not yet.
+  // For now we will fix the maximum possible number of dimensions.
+  int squeeze_dims[8];
+  int num_squeeze_dims;
+} TfLiteSqueezeParams;
+
+typedef struct {
+  int begin_mask;
+  int end_mask;
+  int ellipsis_mask;
+  int new_axis_mask;
+  int shrink_axis_mask;
+} TfLiteStridedSliceParams;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_BUILTIN_OP_DATA_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_BUILTIN_OP_DATA_H_
diff --git a/tensorflow/contrib/lite/context.h b/tensorflow/contrib/lite/context.h
index 41257a53b145cbe7e252c9d4de6ea7ef654431b5..b0c4d3431f9a67bc87d51ada91ed73f1661023a2 100644
--- a/tensorflow/contrib/lite/context.h
+++ b/tensorflow/contrib/lite/context.h
@@ -26,8 +26,8 @@ limitations under the License.
 // TfLiteRegistration - the implementation of a conceptual operation.
 //
 // Some abstractions in this file are created and managed by Interpreter.
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
+#define TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
 
 #include <stdint.h>
 #include <stdlib.h>
@@ -38,6 +38,9 @@ extern "C" {
 
 typedef enum { kTfLiteOk = 0, kTfLiteError = 1 } TfLiteStatus;
 
+// Forward declare so GetNode can use this is in Context.
+typedef struct _TfLiteRegistration TfLiteRegistration;
+
 #define kOptionalTensor (-1)
 
 // Fixed size list of integers. Used for dimensions and inputs/outputs tensor
@@ -141,6 +144,7 @@ typedef struct {
 // A union of points that points to memory for a given tensor.
 typedef union {
   int* i32;
+  int64_t* i64;
   float* f;
   char* raw;
   const char* raw_const;
@@ -204,9 +208,56 @@ void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims,
 // Resize the allocated data of a (dynamic) tensor.
 void TfLiteTensorRealloc(size_t num_bytes, TfLiteTensor* tensor);
 
+// A structure representing an instance of a node.
+// This structure only exhibits the inputs, outputs and user defined data, not
+// other features like the type.
+typedef struct {
+  // Inputs to this node expressed as indices into the simulator's tensors.
+  TfLiteIntArray* inputs;
+
+  // Outputs to this node expressed as indices into the simulator's tensors.
+  TfLiteIntArray* outputs;
+
+  // Temporary tensors uses during the computations. This usually contains no
+  // tensors, but ops are allowed to change that if they need scratch space of
+  // any sort.
+  TfLiteIntArray* temporaries;
+
+  // Opaque data provided by the node implementer through `Registration.init`.
+  void* user_data;
+
+  // Opaque data provided to the node if the node is a builtin. This is usually
+  // a structure defined in builtin_op_data.h
+  void* builtin_data;
+
+  // Custom initial data. This is the opaque data provided in the flatbuffer.
+  // WARNING: This is an experimental interface that is subject to change.
+  const void* custom_initial_data;
+  int custom_initial_data_size;
+} TfLiteNode;
+
 typedef struct TfLiteContext {
   // Number of tensors in the context.
   int tensors_size;
+
+  // The execution plan contains a list of the node indices in execution
+  // order. execution_plan->size is the current number of nodes. And,
+  // execution_plan->data[0] is the first node that needs to be run.
+  // TfLiteDelegates can traverse the current execution plan by iterating
+  // through each member of this array and using GetNodeAndRegistration() to
+  // access details about a node. i.e.
+  // TfLiteIntArray* execution_plan;
+  // TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &execution_plan));
+  // for (int exec_index = 0; exec_index < execution_plan->size; exec_index++) {
+  //    int node_index = execution_plan->data[exec_index];
+  //    TfLiteNode* node;
+  //    TfLiteRegistration* reg;
+  //    context->GetNodeAndRegistration(context, node_index, &node, &reg);
+  // }
+  // WARNING: This is an experimental interface that is subject to change.
+  TfLiteStatus (*GetExecutionPlan)(struct TfLiteContext* context,
+                                   TfLiteIntArray** execution_plan);
+
   // An tensor of tensors in the interpreter context (of length `tensors_size`)
   TfLiteTensor* tensors;
 
@@ -226,34 +277,23 @@ typedef struct TfLiteContext {
   TfLiteStatus (*AddTensors)(struct TfLiteContext*, int tensors_to_add,
                              int* first_new_tensor_index);
 
+  // Get a Tensor node by node_index.
+  // WARNING: This is an experimental interface that is subject to change.
+  TfLiteStatus (*GetNodeAndRegistration)(struct TfLiteContext*, int node_index,
+                                         TfLiteNode** node,
+                                         TfLiteRegistration** registration);
+
+  // Replace ops with delegate.
+  TfLiteStatus (*ReplaceSubgraphsWithDelegateKernels)(
+      struct TfLiteContext*, TfLiteRegistration registration,
+      const TfLiteIntArray* nodes_to_replace);
+
   // TODO(ahentz): we should create a more general mechanism for this sort of
   // library-global objects.
   void* gemm_context;
 } TfLiteContext;
 
-// A structure representing an instance of a node.
-// This structure only exhibits the inputs, outputs and user defined data, not
-// other features like the type.
-typedef struct {
-  // Inputs to this node expressed as indices into the simulator's tensors.
-  TfLiteIntArray* inputs;
-
-  // Outputs to this node expressed as indices into the simulator's tensors.
-  TfLiteIntArray* outputs;
-
-  // Temporary tensors uses during the computations. This usually contains no
-  // tensors, but ops are allowed to change that if they need scratch space of
-  // any sort.
-  TfLiteIntArray* temporaries;
-
-  // Opaque data provided by the node implementer through `Registration.init`.
-  void* user_data;
-
-  // Opaque data provided to the node if the node is a builtin.
-  void* builtin_data;
-} TfLiteNode;
-
-typedef struct {
+typedef struct _TfLiteRegistration {
   // Initializes the op from serialized data.
   // If a built-in op:
   //   `buffer` is the op's params data (TfLiteLSTMParams*).
@@ -290,9 +330,27 @@ typedef struct {
   // NN API. Note, it is the responsibility of the registration binder to
   // set this properly.
   int32_t builtin_code;
+
+  // Custom op name. If the op is a builtin, this will be null.
+  // WARNING: This is an experimental interface that is subject to change.
+  const char* custom_name;
 } TfLiteRegistration;
 
+// WARNING: This is an experimental interface that is subject to change.
+typedef struct {
+  // Data that delegate needs to identify itself. This data is owned by the
+  // delegate. The delegate is owned in the user code, so the delegate is
+  // responsible for doing this when it is destroyed.
+  void* data_;
+  // Invoked by ModifyGraphWithDelegate. This prepare is called, giving the
+  // delegate a view of the current graph through TfLiteContext*. It typically
+  // will look at the nodes and call ReplaceSubgraphsWithDelegateKernels()
+  // to ask the TensorFlow lite runtime to create macro-nodes to represent
+  // delegated subgraphs of the original graph.
+  TfLiteStatus (*Prepare)(TfLiteContext* context, void* data);
+} TfLiteDelegate;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
diff --git a/tensorflow/contrib/lite/context_test.cc b/tensorflow/contrib/lite/context_test.cc
index d0a104f43d9b9d148d80ce26b8ecf732d51ef110..20d6f69a25e9f0bb4323cf5d067b8ebd37bb3c23 100644
--- a/tensorflow/contrib/lite/context_test.cc
+++ b/tensorflow/contrib/lite/context_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/contrib/lite/context.h"
 #include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/testing/util.h"
 
 namespace tflite {
 
@@ -68,7 +69,7 @@ TEST(IntArray, TestIntArrayEqual) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/download_dependencies.sh b/tensorflow/contrib/lite/download_dependencies.sh
index 571d857be7292998996a4fb8101f0070064aa6be..a93ed201d647ddf2359a57254a959871c13fb94f 100755
--- a/tensorflow/contrib/lite/download_dependencies.sh
+++ b/tensorflow/contrib/lite/download_dependencies.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,6 +16,9 @@
 
 set -e
 
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR/../../.."
+
 DOWNLOADS_DIR=tensorflow/contrib/lite/downloads
 BZL_FILE_PATH=tensorflow/workspace.bzl
 
@@ -26,15 +29,13 @@ if [ ! -f $BZL_FILE_PATH ]; then
   exit 1;
 fi
 
-EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
+EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
 GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
 ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)"
 NEON_2_SSE_URL="https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip"
 FARMHASH_URL="https://mirror.bazel.build/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz"
 FLATBUFFERS_URL="https://github.com/google/flatbuffers/archive/master.zip"
-MODELS_URL="https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_1.0_224_ios_lite_float_2017_11_08.zip"
-QUANTIZED_MODELS_URL="https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip"
 
 # TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64,
 #                   so work around it by patching the source.
@@ -90,8 +91,6 @@ download_and_extract "${ABSL_URL}" "${DOWNLOADS_DIR}/absl"
 download_and_extract "${NEON_2_SSE_URL}" "${DOWNLOADS_DIR}/neon_2_sse"
 download_and_extract "${FARMHASH_URL}" "${DOWNLOADS_DIR}/farmhash"
 download_and_extract "${FLATBUFFERS_URL}" "${DOWNLOADS_DIR}/flatbuffers"
-download_and_extract "${MODELS_URL}" "${DOWNLOADS_DIR}/models"
-download_and_extract "${QUANTIZED_MODELS_URL}" "${DOWNLOADS_DIR}/quantized_models"
 
 replace_by_sed 's#static uint32x4_t p4ui_CONJ_XOR = vld1q_u32( conj_XOR_DATA );#static uint32x4_t p4ui_CONJ_XOR; // = vld1q_u32( conj_XOR_DATA ); - Removed by script#' \
   "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
@@ -100,7 +99,4 @@ replace_by_sed 's#static uint32x2_t p2ui_CONJ_XOR = vld1_u32( conj_XOR_DATA );#s
 replace_by_sed 's#static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );#static uint64x2_t p2ul_CONJ_XOR;// = vld1q_u64( p2ul_conj_XOR_DATA ); - Removed by script#' \
   "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
 
-cp ${DOWNLOADS_DIR}/models/models/* tensorflow/contrib/lite/examples/ios/simple/data/
-cp ${DOWNLOADS_DIR}/quantized_models/* tensorflow/contrib/lite/examples/ios/camera/data/
-
 echo "download_dependencies.sh completed successfully." >&2
diff --git a/tensorflow/contrib/lite/error_reporter.cc b/tensorflow/contrib/lite/error_reporter.cc
index 6ba5384a94dbf9de03fb2e4e2f63074525eafa2d..03fcd5409ceab1895cea3b9e0e4fcb5a127e6a45 100644
--- a/tensorflow/contrib/lite/error_reporter.cc
+++ b/tensorflow/contrib/lite/error_reporter.cc
@@ -39,7 +39,9 @@ int ErrorReporter::ReportError(void*, const char* format, ...) {
 }
 
 int StderrReporter::Report(const char* format, va_list args) {
-  return vfprintf(stderr, format, args);
+  const int result = vfprintf(stderr, format, args);
+  fputc('\n', stderr);
+  return result;
 }
 
 ErrorReporter* DefaultErrorReporter() {
diff --git a/tensorflow/contrib/lite/error_reporter.h b/tensorflow/contrib/lite/error_reporter.h
index 637d456ce7a754c7da34e551869e49b4efd18e3b..da193d2586e9123341b9a41be049ee2a4382017a 100644
--- a/tensorflow/contrib/lite/error_reporter.h
+++ b/tensorflow/contrib/lite/error_reporter.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ERROR_REPORTER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ERROR_REPORTER_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_ERROR_REPORTER_H_
+#define TENSORFLOW_CONTRIB_LITE_ERROR_REPORTER_H_
 
 #include <cstdarg>
 #include "tensorflow/contrib/lite/context.h"
@@ -25,10 +25,10 @@ namespace tflite {
 //
 // Usage:
 //  ErrorReporter foo;
-//  foo.Report("test %d\n", 5);
+//  foo.Report("test %d", 5);
 // or
 //  va_list args;
-//  foo.Report("test %d\n", args); // where args is va_list
+//  foo.Report("test %d", args); // where args is va_list
 //
 // Sublclass ErrorReporter to provide another reporting destination.
 // For example, if you have a GUI program, you might redirect to a buffer
@@ -51,4 +51,4 @@ ErrorReporter* DefaultErrorReporter();
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ERROR_REPORTER_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_ERROR_REPORTER_H_
diff --git a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm
index ea398ad14e8be4c5a0021befc7cc076549b47e23..d74e275f0439b1ce56b29e0eadff5f211f6a4faa 100644
--- a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm
+++ b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm
@@ -123,7 +123,11 @@ static void GetTopN(const uint8_t* prediction, const int prediction_size, const
   AVCaptureDevice* device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
   AVCaptureDeviceInput* deviceInput =
       [AVCaptureDeviceInput deviceInputWithDevice:device error:&error];
-  assert(error == nil);
+
+  if (error != nil) {
+    NSLog(@"Failed to initialize AVCaptureDeviceInput. Note: This app doesn't work with simulator");
+    assert(NO);
+  }
 
   if ([session canAddInput:deviceInput]) [session addInput:deviceInput];
 
@@ -221,14 +225,8 @@ static void GetTopN(const uint8_t* prediction, const int prediction_size, const
   assert(pixelBuffer != NULL);
 
   OSType sourcePixelFormat = CVPixelBufferGetPixelFormatType(pixelBuffer);
-  int doReverseChannels;
-  if (kCVPixelFormatType_32ARGB == sourcePixelFormat) {
-    doReverseChannels = 1;
-  } else if (kCVPixelFormatType_32BGRA == sourcePixelFormat) {
-    doReverseChannels = 0;
-  } else {
-    assert(false);  // Unknown source format
-  }
+  assert(sourcePixelFormat == kCVPixelFormatType_32ARGB ||
+         sourcePixelFormat == kCVPixelFormatType_32BGRA);
 
   const int sourceRowBytes = (int)CVPixelBufferGetBytesPerRow(pixelBuffer);
   const int image_width = (int)CVPixelBufferGetWidth(pixelBuffer);
diff --git a/tensorflow/contrib/lite/examples/ios/camera/Podfile b/tensorflow/contrib/lite/examples/ios/camera/Podfile
index 4ae6fb6b94e4489f63506b05a2f348b7daafd3b7..c7d3b1c966eaa0de71f5c37a6a77b3881e30ddd7 100644
--- a/tensorflow/contrib/lite/examples/ios/camera/Podfile
+++ b/tensorflow/contrib/lite/examples/ios/camera/Podfile
@@ -2,4 +2,4 @@ platform :ios, '8.0'
 inhibit_all_warnings!
 
 target 'tflite_camera_example'
-       pod 'TensorFlow-experimental'
+       pod 'TensorFlowLite'
diff --git a/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj b/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
index c98183276bd60d2a0ad023ba26aad12572a02786..b0236e9c608ec35437bcfe79c51149a76f9f416e 100644
--- a/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
+++ b/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
@@ -16,7 +16,6 @@
 		1CDB2D4E1ED3AA35007929E9 /* Info.plist in Resources */ = {isa = PBXBuildFile; fileRef = 1CDB2D4D1ED3AA35007929E9 /* Info.plist */; };
 		54DC6C3C5F734F3A58069F0C /* libPods-tflite_camera_example.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 3BA8BF92C84895BFE59D8236 /* libPods-tflite_camera_example.a */; };
 		AC1F82661FBA3CBD0052BA77 /* labels.txt in Resources */ = {isa = PBXBuildFile; fileRef = AC1F82641FBA3CBD0052BA77 /* labels.txt */; };
-		AC1F82691FBA3F930052BA77 /* libtensorflow-lite.a in Frameworks */ = {isa = PBXBuildFile; fileRef = AC1F82681FBA3F930052BA77 /* libtensorflow-lite.a */; };
 		ACA1A4CA1FBB6C28009B8D86 /* mobilenet_quant_v1_224.tflite in Resources */ = {isa = PBXBuildFile; fileRef = ACA1A4C91FBB6C28009B8D86 /* mobilenet_quant_v1_224.tflite */; };
 /* End PBXBuildFile section */
 
@@ -38,7 +37,6 @@
 		3BC5BE4BBD09374D3E98F082 /* Pods-tflite_camera_example.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-tflite_camera_example.debug.xcconfig"; path = "Pods/Target Support Files/Pods-tflite_camera_example/Pods-tflite_camera_example.debug.xcconfig"; sourceTree = "<group>"; };
 		55ED318E8D29C8AFEF03DF1E /* Pods-tflite_camera_example.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-tflite_camera_example.release.xcconfig"; path = "Pods/Target Support Files/Pods-tflite_camera_example/Pods-tflite_camera_example.release.xcconfig"; sourceTree = "<group>"; };
 		AC1F82641FBA3CBD0052BA77 /* labels.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = labels.txt; sourceTree = "<group>"; };
-		AC1F82681FBA3F930052BA77 /* libtensorflow-lite.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = "libtensorflow-lite.a"; path = "../../../gen/lib/libtensorflow-lite.a"; sourceTree = "<group>"; };
 		ACA1A4C91FBB6C28009B8D86 /* mobilenet_quant_v1_224.tflite */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_quant_v1_224.tflite; sourceTree = "<group>"; };
 /* End PBXFileReference section */
 
@@ -47,7 +45,6 @@
 			isa = PBXFrameworksBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
-				AC1F82691FBA3F930052BA77 /* libtensorflow-lite.a in Frameworks */,
 				1CB47D491ED3AD1700DF7666 /* AVFoundation.framework in Frameworks */,
 				1CA5EB931ED3ABFB00247A34 /* CoreMedia.framework in Frameworks */,
 				54DC6C3C5F734F3A58069F0C /* libPods-tflite_camera_example.a in Frameworks */,
@@ -60,7 +57,6 @@
 		24D7686C331131624F4454A0 /* Frameworks */ = {
 			isa = PBXGroup;
 			children = (
-				AC1F82681FBA3F930052BA77 /* libtensorflow-lite.a */,
 				1CB47D481ED3AD1700DF7666 /* AVFoundation.framework */,
 				1CA5EB921ED3ABFB00247A34 /* CoreMedia.framework */,
 				1C0D734A1ECCC460008C1DAB /* CoreGraphics.framework */,
@@ -336,7 +332,6 @@
 					../../../downloads/,
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
-				LIBRARY_SEARCH_PATHS = ../../../gen/lib/;
 				MTL_ENABLE_DEBUG_INFO = YES;
 				ONLY_ACTIVE_ARCH = YES;
 				SDKROOT = iphoneos;
@@ -384,7 +379,6 @@
 					../../../downloads/,
 				);
 				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
-				LIBRARY_SEARCH_PATHS = ../../../gen/lib/;
 				MTL_ENABLE_DEBUG_INFO = NO;
 				SDKROOT = iphoneos;
 				TARGETED_DEVICE_FAMILY = "1,2";
diff --git a/tensorflow/contrib/lite/examples/ios/download_models.sh b/tensorflow/contrib/lite/examples/ios/download_models.sh
new file mode 100755
index 0000000000000000000000000000000000000000..ccd163758c5830dc9367e023dcb3a604e07ca5db
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/download_models.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -ex
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+MODELS_URL="https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_1.0_224_ios_lite_float_2017_11_08.zip"
+QUANTIZED_MODELS_URL="https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip"
+DOWNLOADS_DIR=$(mktemp -d)
+
+cd $SCRIPT_DIR
+
+download_and_extract() {
+  local usage="Usage: download_and_extract URL DIR"
+  local url="${1:?${usage}}"
+  local dir="${2:?${usage}}"
+  echo "downloading ${url}" >&2
+  mkdir -p "${dir}"
+  tempdir=$(mktemp -d)
+  tempdir2=$(mktemp -d)
+
+  curl -L ${url} > ${tempdir}/zipped.zip
+  unzip ${tempdir}/zipped.zip -d ${tempdir2}
+
+  # If the zip file contains nested directories, extract the files from the
+  # inner directory.
+  if ls ${tempdir2}/*/* 1> /dev/null 2>&1; then
+    # unzip has no strip components, so unzip to a temp dir, and move the
+    # files we want from the tempdir to destination.
+    cp -R ${tempdir2}/*/* ${dir}/
+  else
+    cp -R ${tempdir2}/* ${dir}/
+  fi
+  rm -rf ${tempdir2} ${tempdir}
+}
+
+download_and_extract "${MODELS_URL}" "${DOWNLOADS_DIR}/models"
+download_and_extract "${QUANTIZED_MODELS_URL}" "${DOWNLOADS_DIR}/quantized_models"
+
+file ${DOWNLOADS_DIR}/models
+
+cp ${DOWNLOADS_DIR}/models/models/* simple/data/
+cp ${DOWNLOADS_DIR}/quantized_models/* camera/data/
+
diff --git a/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.h b/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.h
index 75b1f1da384b527e8332dfba08fec87c65eff8b1..94046d9728258901091f018fd0d081651145f400 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.h
+++ b/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.h
@@ -14,8 +14,8 @@
 
 #import <UIKit/UIKit.h>
 
-@interface AppDelegate : UIResponder <UIApplicationDelegate>
+@interface AppDelegate : UIResponder<UIApplicationDelegate>
 
-@property (strong, nonatomic) UIWindow *window;
+@property(strong, nonatomic) UIWindow *window;
 
 @end
diff --git a/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.mm b/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.mm
index 1e808eb976ff3eeda4cf6f81b3c1794c6a037dc8..d1215fa0bffd978b4aaadbd8bc13b07723703c9a 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.mm
+++ b/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.mm
@@ -22,8 +22,7 @@
     didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
 
   UITabBarController *bar = [[UITabBarController alloc] init];
-  [bar setViewControllers:
-      @[[[RunModelViewController alloc] init]]];
+  [bar setViewControllers:@[ [[RunModelViewController alloc] init] ]];
   bar.selectedIndex = 0;
   self.window = [[UIWindow alloc] initWithFrame:[[UIScreen mainScreen] bounds]];
   self.window.rootViewController = bar;
@@ -31,14 +30,19 @@
   return YES;
 }
 
-- (void)applicationWillResignActive:(UIApplication *)application {}
+- (void)applicationWillResignActive:(UIApplication *)application {
+}
 
-- (void)applicationDidEnterBackground:(UIApplication *)application {}
+- (void)applicationDidEnterBackground:(UIApplication *)application {
+}
 
-- (void)applicationWillEnterForeground:(UIApplication *)application {}
+- (void)applicationWillEnterForeground:(UIApplication *)application {
+}
 
-- (void)applicationDidBecomeActive:(UIApplication *)application {}
+- (void)applicationDidBecomeActive:(UIApplication *)application {
+}
 
-- (void)applicationWillTerminate:(UIApplication *)application {}
+- (void)applicationWillTerminate:(UIApplication *)application {
+}
 
 @end
diff --git a/tensorflow/contrib/lite/examples/ios/simple/Podfile b/tensorflow/contrib/lite/examples/ios/simple/Podfile
index 1740ad64573a84fae6de0fcf284eb06afec67e25..e4aca2be82d437a0225d2c15d3e486b0344aa978 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/Podfile
+++ b/tensorflow/contrib/lite/examples/ios/simple/Podfile
@@ -1,5 +1,5 @@
 platform :ios, '8.0'
 inhibit_all_warnings!
 
-target 'tf_simple_example'
-       pod 'TensorFlow-experimental'
+target 'tflite_simple_example'
+       pod 'TensorFlowLite'
diff --git a/tensorflow/contrib/lite/examples/ios/simple/RunModel-Info.plist b/tensorflow/contrib/lite/examples/ios/simple/RunModel-Info.plist
index 1a3eaa8a2c18d1cd24dfd475d396b00ec4d86c9d..a19a43a7541e3d751116e868dbcbdd607d15ab4a 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/RunModel-Info.plist
+++ b/tensorflow/contrib/lite/examples/ios/simple/RunModel-Info.plist
@@ -7,7 +7,7 @@
 	<key>CFBundleDisplayName</key>
 	<string>tflite-simple-example</string>
 	<key>CFBundleExecutable</key>
-	<string>tf_simple_example</string>
+	<string>tflite_simple_example</string>
 	<key>CFBundleIdentifier</key>
 	<string>$(PRODUCT_BUNDLE_IDENTIFIER)</string>
 	<key>CFBundleInfoDictionaryVersion</key>
diff --git a/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.h b/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.h
index 4e1a83ccf5a12c609baadab7359c55ec4f464ed8..a4b358b4eb7f6ba109638405091b798d30bd1768 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.h
+++ b/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.h
@@ -18,7 +18,7 @@
 
 - (IBAction)getUrl:(id)sender;
 
-@property (weak, nonatomic) IBOutlet UITextView *urlContentTextView;
-@property (weak, nonatomic) IBOutlet UITextField *urlTextField;
+@property(weak, nonatomic) IBOutlet UITextView *urlContentTextView;
+@property(weak, nonatomic) IBOutlet UITextField *urlTextField;
 
 @end
diff --git a/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.mm b/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.mm
index 965d83010516c6db72c9e8b1c33079b3eda204de..0ab7aa25d0b4e6d2c02e61ec1d82b85258b3dfbc 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.mm
+++ b/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.mm
@@ -14,10 +14,10 @@
 
 #import "RunModelViewController.h"
 
-#include <fstream>
-#include <iostream>
 #include <pthread.h>
 #include <unistd.h>
+#include <fstream>
+#include <iostream>
 #include <queue>
 #include <sstream>
 #include <string>
@@ -29,9 +29,6 @@
 
 #include "ios_image_load.h"
 
-#define LOG(x) std::cerr
-#define CHECK(x) if (!(x)) { LOG(ERROR) << #x << "failed"; exit(1); }
-
 NSString* RunInferenceOnImage();
 
 @interface RunModelViewController ()
@@ -49,15 +46,12 @@ NSString* RunInferenceOnImage();
 
 // Returns the top N confidence values over threshold in the provided vector,
 // sorted by confidence in descending order.
-static void GetTopN(
-    const float* prediction,
-    const int prediction_size,
-    const int num_results, const float threshold,
-    std::vector<std::pair<float, int> >* top_results) {
+static void GetTopN(const float* prediction, const int prediction_size, const int num_results,
+                    const float threshold, std::vector<std::pair<float, int> >* top_results) {
   // Will contain top N results in ascending order.
-  std::priority_queue<std::pair<float, int>,
-      std::vector<std::pair<float, int> >,
-      std::greater<std::pair<float, int> > > top_result_pq;
+  std::priority_queue<std::pair<float, int>, std::vector<std::pair<float, int> >,
+                      std::greater<std::pair<float, int> > >
+      top_result_pq;
 
   const long count = prediction_size;
   for (int i = 0; i < count; ++i) {
@@ -88,27 +82,29 @@ static void GetTopN(
 NSString* FilePathForResourceName(NSString* name, NSString* extension) {
   NSString* file_path = [[NSBundle mainBundle] pathForResource:name ofType:extension];
   if (file_path == NULL) {
-    LOG(FATAL) << "Couldn't find '" << [name UTF8String] << "."
-	       << [extension UTF8String] << "' in bundle.";
+    NSLog(@"Couldn't find '%@.%@' in bundle.", name, extension);
+    exit(-1);
   }
   return file_path;
 }
 
 NSString* RunInferenceOnImage() {
-  std::string graph;
+  NSString* graph = @"mobilenet_v1_1.0_224";
   const int num_threads = 1;
   std::string input_layer_type = "float";
   std::vector<int> sizes = {1, 224, 224, 3};
 
-  NSString* graph_path = FilePathForResourceName(@"mobilenet_v1_1.0_224", @"tflite");
+  const NSString* graph_path = FilePathForResourceName(graph, @"tflite");
 
-  std::unique_ptr<tflite::FlatBufferModel> model(tflite::FlatBufferModel::BuildFromFile([graph_path UTF8String]));
+  std::unique_ptr<tflite::FlatBufferModel> model(
+      tflite::FlatBufferModel::BuildFromFile([graph_path UTF8String]));
   if (!model) {
-    LOG(FATAL) << "Failed to mmap model " << graph;
+    NSLog(@"Failed to mmap model %@.", graph);
+    exit(-1);
   }
-  LOG(INFO) << "Loaded model " << graph;
+  NSLog(@"Loaded model %@.", graph);
   model->error_reporter();
-  LOG(INFO) << "resolved reporter";
+  NSLog(@"Resolved reporter.");
 
 #ifdef TFLITE_CUSTOM_OPS_HEADER
   tflite::MutableOpResolver resolver;
@@ -120,7 +116,8 @@ NSString* RunInferenceOnImage() {
   std::unique_ptr<tflite::Interpreter> interpreter;
   tflite::InterpreterBuilder(*model, resolver)(&interpreter);
   if (!interpreter) {
-    LOG(FATAL) << "Failed to construct interpreter";
+    NSLog(@"Failed to construct interpreter.");
+    exit(-1);
   }
 
   if (num_threads != -1) {
@@ -134,7 +131,8 @@ NSString* RunInferenceOnImage() {
   }
 
   if (interpreter->AllocateTensors() != kTfLiteOk) {
-    LOG(FATAL) << "Failed to allocate tensors!";
+    NSLog(@"Failed to allocate tensors.");
+    exit(-1);
   }
 
   // Read the label list
@@ -143,7 +141,7 @@ NSString* RunInferenceOnImage() {
   std::ifstream t;
   t.open([labels_path UTF8String]);
   std::string line;
-  while(t){
+  while (t) {
     std::getline(t, line);
     label_strings.push_back(line);
   }
@@ -154,7 +152,8 @@ NSString* RunInferenceOnImage() {
   int image_width;
   int image_height;
   int image_channels;
-  std::vector<uint8_t> image_data = LoadImageFromFile([image_path UTF8String], &image_width, &image_height, &image_channels);
+  std::vector<uint8_t> image_data =
+      LoadImageFromFile([image_path UTF8String], &image_width, &image_height, &image_channels);
   const int wanted_width = 224;
   const int wanted_height = 224;
   const int wanted_channels = 3;
@@ -178,7 +177,8 @@ NSString* RunInferenceOnImage() {
   }
 
   if (interpreter->Invoke() != kTfLiteOk) {
-    LOG(FATAL) << "Failed to invoke!";
+    NSLog(@"Failed to invoke!");
+    exit(-1);
   }
 
   float* output = interpreter->typed_output_tensor<float>(0);
@@ -208,12 +208,9 @@ NSString* RunInferenceOnImage() {
     ss << "\n";
   }
 
-  LOG(INFO) << "Predictions: " << ss.str();
-
   std::string predictions = ss.str();
   NSString* result = @"";
-  result = [NSString stringWithFormat: @"%@ - %s", result,
-            predictions.c_str()];
-  
+  result = [NSString stringWithFormat:@"%@ - %s", result, predictions.c_str()];
+  NSLog(@"Predictions: %@", result);
   return result;
 }
diff --git a/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.h b/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.h
index 7287d0d63d5b4c0b9c9a528578b6341cdb9c9954..98934ce41d349b33d4fc010a39a956e52f3d5721 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.h
+++ b/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.h
@@ -17,9 +17,7 @@
 
 #include <vector>
 
-std::vector<uint8_t> LoadImageFromFile(const char* file_name,
-						 int* out_width,
-						 int* out_height,
-						 int* out_channels);
+std::vector<uint8_t> LoadImageFromFile(const char* file_name, int* out_width,
+                                       int* out_height, int* out_channels);
 
 #endif  // TENSORFLOW_EXAMPLES_IOS_IOS_IMAGE_LOAD_H_
diff --git a/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.mm b/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.mm
index 789522d2a9900b136f91f77c4ada682f1a316848..cb0fe1a7650c572d3745066431f2759daa94ffc9 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.mm
+++ b/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.mm
@@ -14,17 +14,16 @@
 
 #include "ios_image_load.h"
 
-#include <stdlib.h>
-#include <string.h>
 #include <assert.h>
 #include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
 #import <CoreImage/CoreImage.h>
 #import <ImageIO/ImageIO.h>
 
-std::vector<uint8_t> LoadImageFromFile(const char* file_name,
-				     int* out_width, int* out_height,
-				     int* out_channels) {
+std::vector<uint8_t> LoadImageFromFile(const char* file_name, int* out_width, int* out_height,
+                                       int* out_channels) {
   FILE* file_handle = fopen(file_name, "rb");
   fseek(file_handle, 0, SEEK_END);
   const size_t bytes_in_file = ftell(file_handle);
@@ -32,11 +31,10 @@ std::vector<uint8_t> LoadImageFromFile(const char* file_name,
   std::vector<uint8_t> file_data(bytes_in_file);
   fread(file_data.data(), 1, bytes_in_file, file_handle);
   fclose(file_handle);
-  CFDataRef file_data_ref = CFDataCreateWithBytesNoCopy(NULL, file_data.data(),
-						      bytes_in_file,
-						      kCFAllocatorNull);
-  CGDataProviderRef image_provider =
-    CGDataProviderCreateWithCFData(file_data_ref);
+
+  CFDataRef file_data_ref =
+      CFDataCreateWithBytesNoCopy(NULL, file_data.data(), bytes_in_file, kCFAllocatorNull);
+  CGDataProviderRef image_provider = CGDataProviderCreateWithCFData(file_data_ref);
 
   const char* suffix = strrchr(file_name, '.');
   if (!suffix || suffix == file_name) {
@@ -44,12 +42,10 @@ std::vector<uint8_t> LoadImageFromFile(const char* file_name,
   }
   CGImageRef image;
   if (strcasecmp(suffix, ".png") == 0) {
-    image = CGImageCreateWithPNGDataProvider(image_provider, NULL, true,
-					     kCGRenderingIntentDefault);
-  } else if ((strcasecmp(suffix, ".jpg") == 0) ||
-    (strcasecmp(suffix, ".jpeg") == 0)) {
-    image = CGImageCreateWithJPEGDataProvider(image_provider, NULL, true,
-					      kCGRenderingIntentDefault);
+    image = CGImageCreateWithPNGDataProvider(image_provider, NULL, true, kCGRenderingIntentDefault);
+  } else if ((strcasecmp(suffix, ".jpg") == 0) || (strcasecmp(suffix, ".jpeg") == 0)) {
+    image =
+        CGImageCreateWithJPEGDataProvider(image_provider, NULL, true, kCGRenderingIntentDefault);
   } else {
     CFRelease(image_provider);
     CFRelease(file_data_ref);
@@ -68,9 +64,10 @@ std::vector<uint8_t> LoadImageFromFile(const char* file_name,
   const int bytes_in_image = (bytes_per_row * height);
   std::vector<uint8_t> result(bytes_in_image);
   const int bits_per_component = 8;
-  CGContextRef context = CGBitmapContextCreate(result.data(), width, height,
-    bits_per_component, bytes_per_row, color_space,
-    kCGImageAlphaPremultipliedLast | kCGBitmapByteOrder32Big);
+
+  CGContextRef context =
+      CGBitmapContextCreate(result.data(), width, height, bits_per_component, bytes_per_row,
+                            color_space, kCGImageAlphaPremultipliedLast | kCGBitmapByteOrder32Big);
   CGColorSpaceRelease(color_space);
   CGContextDrawImage(context, CGRectMake(0, 0, width, height), image);
   CGContextRelease(context);
diff --git a/tensorflow/contrib/lite/examples/ios/simple/main.mm b/tensorflow/contrib/lite/examples/ios/simple/main.mm
index d70550a730720e5d6799a186c1beb3cfa04b0b9d..05cb55ddd7a230593863e64b351f6aac31a1b4d7 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/main.mm
+++ b/tensorflow/contrib/lite/examples/ios/simple/main.mm
@@ -14,7 +14,7 @@
 
 #import <UIKit/UIKit.h>
 
-int main(int argc, char * argv[]) {
+int main(int argc, char *argv[]) {
   @autoreleasepool {
     NSString *delegateClassName = @"AppDelegate";
     return UIApplicationMain(argc, argv, nil, delegateClassName);
diff --git a/tensorflow/contrib/lite/examples/ios/simple/simple.xcodeproj/project.pbxproj b/tensorflow/contrib/lite/examples/ios/simple/simple.xcodeproj/project.pbxproj
index 9277c230b8cce1b5673a50d32d7640d52e2e8f9d..f5b8382d5ae4ac80a7edb52c34ebaf12ad65f4db 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/simple.xcodeproj/project.pbxproj
+++ b/tensorflow/contrib/lite/examples/ios/simple/simple.xcodeproj/project.pbxproj
@@ -9,7 +9,7 @@
 /* Begin PBXBuildFile section */
 		1C0D734B1ECCC460008C1DAB /* CoreGraphics.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 1C0D734A1ECCC460008C1DAB /* CoreGraphics.framework */; };
 		1CA45FFF1ECCC356002FA6A4 /* UIKit.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 1CA45FFE1ECCC356002FA6A4 /* UIKit.framework */; };
-		594C14AE1FB8F9B500EE8BFE /* libtensorflow-lite.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 594C14AD1FB8F9B500EE8BFE /* libtensorflow-lite.a */; };
+		1E6F42DBB39A4A3871D4F848 /* libPods-tflite_simple_example.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 73DBC33C5DD9A526EE6D1EF2 /* libPods-tflite_simple_example.a */; };
 		594C14B11FB9037100EE8BFE /* labels.txt in Resources */ = {isa = PBXBuildFile; fileRef = 594C14AF1FB9037100EE8BFE /* labels.txt */; };
 		594C14B21FB9037100EE8BFE /* mobilenet_v1_1.0_224.tflite in Resources */ = {isa = PBXBuildFile; fileRef = 594C14B01FB9037100EE8BFE /* mobilenet_v1_1.0_224.tflite */; };
 		59A3D0011CF4E68100C4259F /* AppDelegate.mm in Sources */ = {isa = PBXBuildFile; fileRef = 59A3CFF21CF4E68100C4259F /* AppDelegate.mm */; };
@@ -24,8 +24,7 @@
 		1C0D73481ECCC41B008C1DAB /* CoreImage.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreImage.framework; path = System/Library/Frameworks/CoreImage.framework; sourceTree = SDKROOT; };
 		1C0D734A1ECCC460008C1DAB /* CoreGraphics.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreGraphics.framework; path = System/Library/Frameworks/CoreGraphics.framework; sourceTree = SDKROOT; };
 		1CA45FFE1ECCC356002FA6A4 /* UIKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = UIKit.framework; path = System/Library/Frameworks/UIKit.framework; sourceTree = SDKROOT; };
-		5911579B1CF4011C00C31E3A /* tf_simple_example.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = tf_simple_example.app; sourceTree = BUILT_PRODUCTS_DIR; };
-		594C14AD1FB8F9B500EE8BFE /* libtensorflow-lite.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = "libtensorflow-lite.a"; path = "../../../gen/lib/libtensorflow-lite.a"; sourceTree = "<group>"; };
+		5911579B1CF4011C00C31E3A /* tflite_simple_example.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = tflite_simple_example.app; sourceTree = BUILT_PRODUCTS_DIR; };
 		594C14AF1FB9037100EE8BFE /* labels.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = labels.txt; sourceTree = "<group>"; };
 		594C14B01FB9037100EE8BFE /* mobilenet_v1_1.0_224.tflite */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_v1_1.0_224.tflite; sourceTree = "<group>"; };
 		59A3CFF11CF4E68100C4259F /* AppDelegate.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
@@ -38,7 +37,9 @@
 		59A3CFFE1CF4E68100C4259F /* RunModelViewController.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RunModelViewController.h; sourceTree = "<group>"; };
 		59A3CFFF1CF4E68100C4259F /* RunModelViewController.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = RunModelViewController.mm; sourceTree = "<group>"; };
 		59A3D0001CF4E68100C4259F /* RunModelViewController.xib */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = file.xib; path = RunModelViewController.xib; sourceTree = "<group>"; };
-		73DBC33C5DD9A526EE6D1EF2 /* libPods-tf_simple_example.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-tf_simple_example.a"; sourceTree = BUILT_PRODUCTS_DIR; };
+		5D6203B9FAEEB9824194DBE8 /* Pods-tflite_simple_example.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-tflite_simple_example.release.xcconfig"; path = "Pods/Target Support Files/Pods-tflite_simple_example/Pods-tflite_simple_example.release.xcconfig"; sourceTree = "<group>"; };
+		73DBC33C5DD9A526EE6D1EF2 /* libPods-tflite_simple_example.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-tflite_simple_example.a"; sourceTree = BUILT_PRODUCTS_DIR; };
+		987DD5BCAB2DD8B682674E20 /* Pods-tflite_simple_example.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-tflite_simple_example.debug.xcconfig"; path = "Pods/Target Support Files/Pods-tflite_simple_example/Pods-tflite_simple_example.debug.xcconfig"; sourceTree = "<group>"; };
 /* End PBXFileReference section */
 
 /* Begin PBXFrameworksBuildPhase section */
@@ -46,9 +47,9 @@
 			isa = PBXFrameworksBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
-				594C14AE1FB8F9B500EE8BFE /* libtensorflow-lite.a in Frameworks */,
 				1C0D734B1ECCC460008C1DAB /* CoreGraphics.framework in Frameworks */,
 				1CA45FFF1ECCC356002FA6A4 /* UIKit.framework in Frameworks */,
+				1E6F42DBB39A4A3871D4F848 /* libPods-tflite_simple_example.a in Frameworks */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
@@ -58,11 +59,10 @@
 		24D7686C331131624F4454A0 /* Frameworks */ = {
 			isa = PBXGroup;
 			children = (
-				594C14AD1FB8F9B500EE8BFE /* libtensorflow-lite.a */,
 				1C0D734A1ECCC460008C1DAB /* CoreGraphics.framework */,
 				1C0D73481ECCC41B008C1DAB /* CoreImage.framework */,
 				1CA45FFE1ECCC356002FA6A4 /* UIKit.framework */,
-				73DBC33C5DD9A526EE6D1EF2 /* libPods-tf_simple_example.a */,
+				73DBC33C5DD9A526EE6D1EF2 /* libPods-tflite_simple_example.a */,
 			);
 			name = Frameworks;
 			sourceTree = "<group>";
@@ -82,13 +82,14 @@
 				59A3D0001CF4E68100C4259F /* RunModelViewController.xib */,
 				5911579C1CF4011C00C31E3A /* Products */,
 				24D7686C331131624F4454A0 /* Frameworks */,
+				5CE7E4179B26BF77944D8637 /* Pods */,
 			);
 			sourceTree = "<group>";
 		};
 		5911579C1CF4011C00C31E3A /* Products */ = {
 			isa = PBXGroup;
 			children = (
-				5911579B1CF4011C00C31E3A /* tf_simple_example.app */,
+				5911579B1CF4011C00C31E3A /* tflite_simple_example.app */,
 			);
 			name = Products;
 			sourceTree = "<group>";
@@ -103,24 +104,36 @@
 			path = data;
 			sourceTree = "<group>";
 		};
+		5CE7E4179B26BF77944D8637 /* Pods */ = {
+			isa = PBXGroup;
+			children = (
+				987DD5BCAB2DD8B682674E20 /* Pods-tflite_simple_example.debug.xcconfig */,
+				5D6203B9FAEEB9824194DBE8 /* Pods-tflite_simple_example.release.xcconfig */,
+			);
+			name = Pods;
+			sourceTree = "<group>";
+		};
 /* End PBXGroup section */
 
 /* Begin PBXNativeTarget section */
-		5911579A1CF4011C00C31E3A /* tf_simple_example */ = {
+		5911579A1CF4011C00C31E3A /* tflite_simple_example */ = {
 			isa = PBXNativeTarget;
-			buildConfigurationList = 591157B21CF4011D00C31E3A /* Build configuration list for PBXNativeTarget "tf_simple_example" */;
+			buildConfigurationList = 591157B21CF4011D00C31E3A /* Build configuration list for PBXNativeTarget "tflite_simple_example" */;
 			buildPhases = (
+				A507411BCC70190B9ABD2721 /* [CP] Check Pods Manifest.lock */,
 				591157971CF4011C00C31E3A /* Sources */,
 				591157981CF4011C00C31E3A /* Frameworks */,
 				591157991CF4011C00C31E3A /* Resources */,
+				25E1671BDC7334C678FB5DFB /* [CP] Embed Pods Frameworks */,
+				10976C49D86B7F8A59157601 /* [CP] Copy Pods Resources */,
 			);
 			buildRules = (
 			);
 			dependencies = (
 			);
-			name = tf_simple_example;
+			name = tflite_simple_example;
 			productName = tf_ios_makefile_example;
-			productReference = 5911579B1CF4011C00C31E3A /* tf_simple_example.app */;
+			productReference = 5911579B1CF4011C00C31E3A /* tflite_simple_example.app */;
 			productType = "com.apple.product-type.application";
 		};
 /* End PBXNativeTarget section */
@@ -152,7 +165,7 @@
 			projectDirPath = "";
 			projectRoot = "";
 			targets = (
-				5911579A1CF4011C00C31E3A /* tf_simple_example */,
+				5911579A1CF4011C00C31E3A /* tflite_simple_example */,
 			);
 		};
 /* End PBXProject section */
@@ -171,6 +184,57 @@
 		};
 /* End PBXResourcesBuildPhase section */
 
+/* Begin PBXShellScriptBuildPhase section */
+		10976C49D86B7F8A59157601 /* [CP] Copy Pods Resources */ = {
+			isa = PBXShellScriptBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			inputPaths = (
+			);
+			name = "[CP] Copy Pods Resources";
+			outputPaths = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+			shellPath = /bin/sh;
+			shellScript = "\"${SRCROOT}/Pods/Target Support Files/Pods-tflite_simple_example/Pods-tflite_simple_example-resources.sh\"\n";
+			showEnvVarsInLog = 0;
+		};
+		25E1671BDC7334C678FB5DFB /* [CP] Embed Pods Frameworks */ = {
+			isa = PBXShellScriptBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			inputPaths = (
+			);
+			name = "[CP] Embed Pods Frameworks";
+			outputPaths = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+			shellPath = /bin/sh;
+			shellScript = "\"${SRCROOT}/Pods/Target Support Files/Pods-tflite_simple_example/Pods-tflite_simple_example-frameworks.sh\"\n";
+			showEnvVarsInLog = 0;
+		};
+		A507411BCC70190B9ABD2721 /* [CP] Check Pods Manifest.lock */ = {
+			isa = PBXShellScriptBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			inputPaths = (
+				"${PODS_PODFILE_DIR_PATH}/Podfile.lock",
+				"${PODS_ROOT}/Manifest.lock",
+			);
+			name = "[CP] Check Pods Manifest.lock";
+			outputPaths = (
+				"$(DERIVED_FILE_DIR)/Pods-tflite_simple_example-checkManifestLockResult.txt",
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+			shellPath = /bin/sh;
+			shellScript = "diff \"${PODS_PODFILE_DIR_PATH}/Podfile.lock\" \"${PODS_ROOT}/Manifest.lock\" > /dev/null\nif [ $? != 0 ] ; then\n    # print error to STDERR\n    echo \"error: The sandbox is not in sync with the Podfile.lock. Run 'pod install' or update your CocoaPods installation.\" >&2\n    exit 1\nfi\n# This output is used by Xcode 'outputs' to avoid re-running this script phase.\necho \"SUCCESS\" > \"${SCRIPT_OUTPUT_FILE_0}\"\n";
+			showEnvVarsInLog = 0;
+		};
+/* End PBXShellScriptBuildPhase section */
+
 /* Begin PBXSourcesBuildPhase section */
 		591157971CF4011C00C31E3A /* Sources */ = {
 			isa = PBXSourcesBuildPhase;
@@ -274,6 +338,7 @@
 		};
 		591157B31CF4011D00C31E3A /* Debug */ = {
 			isa = XCBuildConfiguration;
+			baseConfigurationReference = 987DD5BCAB2DD8B682674E20 /* Pods-tflite_simple_example.debug.xcconfig */;
 			buildSettings = {
 				CLANG_DEBUG_INFORMATION_LEVEL = default;
 				CODE_SIGN_IDENTITY = "iPhone Developer";
@@ -283,15 +348,10 @@
 				GCC_ENABLE_CPP_RTTI = YES;
 				HEADER_SEARCH_PATHS = (
 					"$(inherited)",
-					../../../../../../,
-					../../../downloads/flatbuffers/include/,
-					../../../downloads/eigen/,
-					../../../downloads/,
 				);
 				INFOPLIST_FILE = "$(SRCROOT)/RunModel-Info.plist";
 				IPHONEOS_DEPLOYMENT_TARGET = 9.2;
 				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
-				LIBRARY_SEARCH_PATHS = ../../../gen/lib/;
 				OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
 				OTHER_LDFLAGS = "$(inherited)";
 				PRODUCT_BUNDLE_IDENTIFIER = "com.google.tflite-simple-example";
@@ -304,6 +364,7 @@
 		};
 		591157B41CF4011D00C31E3A /* Release */ = {
 			isa = XCBuildConfiguration;
+			baseConfigurationReference = 5D6203B9FAEEB9824194DBE8 /* Pods-tflite_simple_example.release.xcconfig */;
 			buildSettings = {
 				CLANG_DEBUG_INFORMATION_LEVEL = default;
 				CODE_SIGN_IDENTITY = "iPhone Developer";
@@ -313,15 +374,10 @@
 				GCC_ENABLE_CPP_RTTI = YES;
 				HEADER_SEARCH_PATHS = (
 					"$(inherited)",
-					../../../../../../,
-					../../../downloads/flatbuffers/include/,
-					../../../downloads/eigen/,
-					../../../downloads/,
 				);
 				INFOPLIST_FILE = "$(SRCROOT)/RunModel-Info.plist";
 				IPHONEOS_DEPLOYMENT_TARGET = 9.2;
 				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
-				LIBRARY_SEARCH_PATHS = ../../../gen/lib/;
 				ONLY_ACTIVE_ARCH = YES;
 				OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
 				OTHER_LDFLAGS = "$(inherited)";
@@ -344,7 +400,7 @@
 			defaultConfigurationIsVisible = 0;
 			defaultConfigurationName = Release;
 		};
-		591157B21CF4011D00C31E3A /* Build configuration list for PBXNativeTarget "tf_simple_example" */ = {
+		591157B21CF4011D00C31E3A /* Build configuration list for PBXNativeTarget "tflite_simple_example" */ = {
 			isa = XCConfigurationList;
 			buildConfigurations = (
 				591157B31CF4011D00C31E3A /* Debug */,
diff --git a/tensorflow/contrib/lite/examples/label_image/BUILD b/tensorflow/contrib/lite/examples/label_image/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..959347b5491514ddc13af57ea6f7385a0d39e418
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/label_image/BUILD
@@ -0,0 +1,83 @@
+# Description:
+# TensorFlow Lite Example Label Image.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
+load("//tensorflow/contrib/lite:build_def.bzl", "tflite_linkopts")
+
+exports_files(glob([
+    "testdata/*.bmp",
+]))
+
+tf_cc_binary(
+    name = "label_image",
+    srcs = [
+        "get_top_n.h",
+        "get_top_n_impl.h",
+        "label_image.cc",
+    ],
+    linkopts = tflite_linkopts() + select({
+        "//tensorflow:android": [
+            "-pie",  # Android 5.0 and later supports only PIE
+            "-lm",  # some builtin ops, e.g., tanh, need -lm
+        ],
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":bitmap_helpers",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+    ],
+)
+
+cc_library(
+    name = "bitmap_helpers",
+    srcs = ["bitmap_helpers.cc"],
+    hdrs = [
+        "bitmap_helpers.h",
+        "bitmap_helpers_impl.h",
+        "label_image.h",
+    ],
+    deps = [
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:schema_fbs_version",
+        "//tensorflow/contrib/lite:string",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/schema:schema_fbs",
+    ],
+)
+
+# TODO(ahentz): Test disabled as it has a memory leek from read_bmp
+# cc_test(
+#     name = "label_image_test",
+#     srcs = [
+#         "get_top_n.h",
+#         "get_top_n_impl.h",
+#         "label_image_test.cc",
+#     ],
+#     data = [
+#         "testdata/grace_hopper.bmp",
+#     ],
+#     deps = [
+#         ":bitmap_helpers",
+#         "//testing/base/public:gunit",
+#     ],
+# )
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.cc b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0b38cd38c83927c65d251b9356301b6bef7521f2
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.cc
@@ -0,0 +1,120 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+
+#include <unistd.h>  // NOLINT(build/include_order)
+
+#include "tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h"
+
+#define LOG(x) std::cerr
+
+namespace tflite {
+namespace label_image {
+
+uint8_t* decode_bmp(const uint8_t* input, int row_size, uint8_t* const output,
+                    int width, int height, int channels, bool top_down) {
+  for (int i = 0; i < height; i++) {
+    int src_pos;
+    int dst_pos;
+
+    for (int j = 0; j < width; j++) {
+      if (!top_down) {
+        src_pos = ((height - 1 - i) * row_size) + j * channels;
+      } else {
+        src_pos = i * row_size + j * channels;
+      }
+
+      dst_pos = (i * width + j) * channels;
+
+      switch (channels) {
+        case 1:
+          output[dst_pos] = input[src_pos];
+          break;
+        case 3:
+          // BGR -> RGB
+          output[dst_pos] = input[src_pos + 2];
+          output[dst_pos + 1] = input[src_pos + 1];
+          output[dst_pos + 2] = input[src_pos];
+          break;
+        case 4:
+          // BGRA -> RGBA
+          output[dst_pos] = input[src_pos + 2];
+          output[dst_pos + 1] = input[src_pos + 1];
+          output[dst_pos + 2] = input[src_pos];
+          output[dst_pos + 3] = input[src_pos + 3];
+          break;
+        default:
+          LOG(FATAL) << "Unexpected number of channels: " << channels;
+          break;
+      }
+    }
+  }
+
+  return output;
+}
+
+uint8_t* read_bmp(const std::string& input_bmp_name, int* width, int* height,
+                  int* channels, Settings* s) {
+  int begin, end;
+
+  std::ifstream file(input_bmp_name, std::ios::in | std::ios::binary);
+  if (!file) {
+    LOG(FATAL) << "input file " << input_bmp_name << " not found\n";
+    exit(-1);
+  }
+
+  begin = file.tellg();
+  file.seekg(0, std::ios::end);
+  end = file.tellg();
+  size_t len = end - begin;
+
+  if (s->verbose) LOG(INFO) << "len: " << len << "\n";
+
+  const uint8_t* img_bytes = new uint8_t[len];
+  file.seekg(0, std::ios::beg);
+  file.read((char*)img_bytes, len);
+  const int32_t header_size =
+      *(reinterpret_cast<const int32_t*>(img_bytes + 10));
+  *width = *(reinterpret_cast<const int32_t*>(img_bytes + 18));
+  *height = *(reinterpret_cast<const int32_t*>(img_bytes + 22));
+  const int32_t bpp = *(reinterpret_cast<const int32_t*>(img_bytes + 28));
+  *channels = bpp / 8;
+
+  if (s->verbose)
+    LOG(INFO) << "width, height, channels: " << *width << ", " << *height
+              << ", " << *channels << "\n";
+
+  // there may be padding bytes when the width is not a multiple of 4 bytes
+  // 8 * channels == bits per pixel
+  const int row_size = (8 * *channels * *width + 31) / 32 * 4;
+
+  // if height is negative, data layout is top down
+  // otherwise, it's bottom up
+  bool top_down = (*height < 0);
+
+  // Decode image, allocating tensor once the image size is known
+  uint8_t* output = new uint8_t[abs(*height) * *width * *channels];
+  const uint8_t* bmp_pixels = &img_bytes[header_size];
+  return decode_bmp(bmp_pixels, row_size, output, *width, abs(*height),
+                    *channels, top_down);
+}
+
+}  // namespace label_image
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..97343dde6b31694e5b2de20b35a7083fb8fe4a0e
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h
@@ -0,0 +1,42 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_H_
+#define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_H_
+
+#include "tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h"
+#include "tensorflow/contrib/lite/examples/label_image/label_image.h"
+
+namespace tflite {
+namespace label_image {
+
+uint8_t* read_bmp(const std::string& input_bmp_name, int* width, int* height,
+                  int* channels, Settings* s);
+
+template <class T>
+void resize(T* out, uint8_t* in, int image_height, int image_width,
+            int image_channels, int wanted_height, int wanted_width,
+            int wanted_channels, Settings* s);
+
+// explicit instantiation
+template void resize<uint8_t>(uint8_t*, unsigned char*, int, int, int, int, int,
+                              int, Settings*);
+template void resize<float>(float*, unsigned char*, int, int, int, int, int,
+                            int, Settings*);
+
+}  // namespace label_image
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_H
diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a64c1de725b601e9b6e9325d9faacb37df0e626
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h
@@ -0,0 +1,103 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H_
+#define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H_
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/contrib/lite/version.h"
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/contrib/lite/version.h"
+
+#include "tensorflow/contrib/lite/examples/label_image/label_image.h"
+
+namespace tflite {
+namespace label_image {
+
+template <class T>
+void resize(T* out, uint8_t* in, int image_height, int image_width,
+            int image_channels, int wanted_height, int wanted_width,
+            int wanted_channels, Settings* s) {
+  int number_of_pixels = image_height * image_width * image_channels;
+  std::unique_ptr<Interpreter> interpreter(new Interpreter);
+
+  int base_index = 0;
+
+  // two inputs: input and new_sizes
+  interpreter->AddTensors(2, &base_index);
+  // one output
+  interpreter->AddTensors(1, &base_index);
+  // set input and output tensors
+  interpreter->SetInputs({0, 1});
+  interpreter->SetOutputs({2});
+
+  // set parameters of tensors
+  TfLiteQuantizationParams quant;
+  interpreter->SetTensorParametersReadWrite(
+      0, kTfLiteFloat32, "input",
+      {1, image_height, image_width, image_channels}, quant);
+  interpreter->SetTensorParametersReadWrite(1, kTfLiteInt32, "new_size", {2},
+                                            quant);
+  interpreter->SetTensorParametersReadWrite(
+      2, kTfLiteFloat32, "output",
+      {1, wanted_height, wanted_width, wanted_channels}, quant);
+
+  ops::builtin::BuiltinOpResolver resolver;
+  TfLiteRegistration* resize_op =
+      resolver.FindOp(BuiltinOperator_RESIZE_BILINEAR);
+  auto* params = reinterpret_cast<TfLiteResizeBilinearParams*>(
+      malloc(sizeof(TfLiteResizeBilinearParams)));
+  params->align_corners = false;
+  interpreter->AddNodeWithParameters({0, 1}, {2}, nullptr, 0, params, resize_op,
+                                     nullptr);
+
+  interpreter->AllocateTensors();
+
+  // fill input image
+  // in[] are integers, cannot do memcpy() directly
+  auto input = interpreter->typed_tensor<float>(0);
+  for (int i = 0; i < number_of_pixels; i++) {
+    input[i] = in[i];
+  }
+
+  // fill new_sizes
+  interpreter->typed_tensor<int>(1)[0] = wanted_height;
+  interpreter->typed_tensor<int>(1)[1] = wanted_width;
+
+  interpreter->Invoke();
+
+  auto output = interpreter->typed_tensor<float>(2);
+  auto output_number_of_pixels =
+      wanted_height * wanted_height * wanted_channels;
+
+  for (int i = 0; i < output_number_of_pixels; i++) {
+    if (s->input_floating)
+      out[i] = (output[i] - s->input_mean) / s->input_std;
+    else
+      out[i] = (uint8_t)output[i];
+  }
+}
+
+}  // namespace label_image
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H_
diff --git a/tensorflow/contrib/lite/examples/label_image/get_top_n.h b/tensorflow/contrib/lite/examples/label_image/get_top_n.h
new file mode 100644
index 0000000000000000000000000000000000000000..70a7586fe6a008f0da20a7bac928ca676e5914ab
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/label_image/get_top_n.h
@@ -0,0 +1,38 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_H
+#define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_H
+
+#include "tensorflow/contrib/lite/examples/label_image/get_top_n_impl.h"
+
+namespace tflite {
+namespace label_image {
+
+template <class T>
+void get_top_n(T* prediction, int prediction_size, size_t num_results,
+               float threshold, std::vector<std::pair<float, int>>* top_results,
+               bool input_floating);
+
+// explicit instantiation so that we can use them otherwhere
+template void get_top_n<uint8_t>(uint8_t*, int, size_t, float,
+                                 std::vector<std::pair<float, int>>*, bool);
+template void get_top_n<float>(float*, int, size_t, float,
+                               std::vector<std::pair<float, int>>*, bool);
+
+}  // namespace label_image
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_H
diff --git a/tensorflow/contrib/lite/examples/label_image/get_top_n_impl.h b/tensorflow/contrib/lite/examples/label_image/get_top_n_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..e416fbd39b125ea65d1155b19ab0967a9062e71a
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/label_image/get_top_n_impl.h
@@ -0,0 +1,70 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_IMPL_H
+#define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_IMPL_H
+
+#include <algorithm>
+#include <queue>
+
+namespace tflite {
+namespace label_image {
+
+extern bool input_floating;
+
+// Returns the top N confidence values over threshold in the provided vector,
+// sorted by confidence in descending order.
+template <class T>
+void get_top_n(T* prediction, int prediction_size, size_t num_results,
+               float threshold, std::vector<std::pair<float, int>>* top_results,
+               bool input_floating) {
+  // Will contain top N results in ascending order.
+  std::priority_queue<std::pair<float, int>, std::vector<std::pair<float, int>>,
+                      std::greater<std::pair<float, int>>>
+      top_result_pq;
+
+  const long count = prediction_size;  // NOLINT(runtime/int)
+  for (int i = 0; i < count; ++i) {
+    float value;
+    if (input_floating)
+      value = prediction[i];
+    else
+      value = prediction[i] / 255.0;
+    // Only add it if it beats the threshold and has a chance at being in
+    // the top N.
+    if (value < threshold) {
+      continue;
+    }
+
+    top_result_pq.push(std::pair<float, int>(value, i));
+
+    // If at capacity, kick the smallest value out.
+    if (top_result_pq.size() > num_results) {
+      top_result_pq.pop();
+    }
+  }
+
+  // Copy to output vector and reverse into descending order.
+  while (!top_result_pq.empty()) {
+    top_results->push_back(top_result_pq.top());
+    top_result_pq.pop();
+  }
+  std::reverse(top_results->begin(), top_results->end());
+}
+
+}  // namespace label_image
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_IMPL_H
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.cc b/tensorflow/contrib/lite/examples/label_image/label_image.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a91467d345fdce1268635a69a96939921dc170e8
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/label_image/label_image.cc
@@ -0,0 +1,308 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cstdarg>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include <fcntl.h>      // NOLINT(build/include_order)
+#include <getopt.h>     // NOLINT(build/include_order)
+#include <sys/time.h>   // NOLINT(build/include_order)
+#include <sys/types.h>  // NOLINT(build/include_order)
+#include <sys/uio.h>    // NOLINT(build/include_order)
+#include <unistd.h>     // NOLINT(build/include_order)
+
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/optional_debug_tools.h"
+#include "tensorflow/contrib/lite/string_util.h"
+
+#include "tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h"
+#include "tensorflow/contrib/lite/examples/label_image/get_top_n.h"
+
+#define LOG(x) std::cerr
+
+namespace tflite {
+namespace label_image {
+
+double get_us(struct timeval t) { return (t.tv_sec * 1000000 + t.tv_usec); }
+
+// Takes a file name, and loads a list of labels from it, one per line, and
+// returns a vector of the strings. It pads with empty strings so the length
+// of the result is a multiple of 16, because our model expects that.
+TfLiteStatus ReadLabelsFile(const string& file_name,
+                            std::vector<string>* result,
+                            size_t* found_label_count) {
+  std::ifstream file(file_name);
+  if (!file) {
+    LOG(FATAL) << "Labels file " << file_name << " not found\n";
+    return kTfLiteError;
+  }
+  result->clear();
+  string line;
+  while (std::getline(file, line)) {
+    result->push_back(line);
+  }
+  *found_label_count = result->size();
+  const int padding = 16;
+  while (result->size() % padding) {
+    result->emplace_back();
+  }
+  return kTfLiteOk;
+}
+
+void RunInference(Settings* s) {
+  if (!s->model_name.c_str()) {
+    LOG(ERROR) << "no model file name\n";
+    exit(-1);
+  }
+
+  std::unique_ptr<tflite::FlatBufferModel> model;
+  std::unique_ptr<tflite::Interpreter> interpreter;
+  model = tflite::FlatBufferModel::BuildFromFile(s->model_name.c_str());
+  if (!model) {
+    LOG(FATAL) << "\nFailed to mmap model " << s->model_name << "\n";
+    exit(-1);
+  }
+  LOG(INFO) << "Loaded model " << s->model_name << "\n";
+  model->error_reporter();
+  LOG(INFO) << "resolved reporter\n";
+
+  tflite::ops::builtin::BuiltinOpResolver resolver;
+
+  tflite::InterpreterBuilder(*model, resolver)(&interpreter);
+  if (!interpreter) {
+    LOG(FATAL) << "Failed to construct interpreter\n";
+    exit(-1);
+  }
+
+  interpreter->UseNNAPI(s->accel);
+
+  if (s->verbose) {
+    LOG(INFO) << "tensors size: " << interpreter->tensors_size() << "\n";
+    LOG(INFO) << "nodes size: " << interpreter->nodes_size() << "\n";
+    LOG(INFO) << "inputs: " << interpreter->inputs().size() << "\n";
+    LOG(INFO) << "input(0) name: " << interpreter->GetInputName(0) << "\n";
+
+    int t_size = interpreter->tensors_size();
+    for (int i = 0; i < t_size; i++) {
+      if (interpreter->tensor(i)->name)
+        LOG(INFO) << i << ": " << interpreter->tensor(i)->name << ", "
+                  << interpreter->tensor(i)->bytes << ", "
+                  << interpreter->tensor(i)->type << ", "
+                  << interpreter->tensor(i)->params.scale << ", "
+                  << interpreter->tensor(i)->params.zero_point << "\n";
+    }
+  }
+
+  if (s->number_of_threads != -1) {
+    interpreter->SetNumThreads(s->number_of_threads);
+  }
+
+  int image_width = 224;
+  int image_height = 224;
+  int image_channels = 3;
+  uint8_t* in = read_bmp(s->input_bmp_name, &image_width, &image_height,
+                         &image_channels, s);
+
+  int input = interpreter->inputs()[0];
+  if (s->verbose) LOG(INFO) << "input: " << input << "\n";
+
+  const std::vector<int> inputs = interpreter->inputs();
+  const std::vector<int> outputs = interpreter->outputs();
+
+  if (s->verbose) {
+    LOG(INFO) << "number of inputs: " << inputs.size() << "\n";
+    LOG(INFO) << "number of outputs: " << outputs.size() << "\n";
+  }
+
+  if (interpreter->AllocateTensors() != kTfLiteOk) {
+    LOG(FATAL) << "Failed to allocate tensors!";
+  }
+
+  if (s->verbose) PrintInterpreterState(interpreter.get());
+
+  // get input dimension from the input tensor metadata
+  // assuming one input only
+  TfLiteIntArray* dims = interpreter->tensor(input)->dims;
+  int wanted_height = dims->data[1];
+  int wanted_width = dims->data[2];
+  int wanted_channels = dims->data[3];
+
+  switch (interpreter->tensor(input)->type) {
+    case kTfLiteFloat32:
+      s->input_floating = true;
+      resize<float>(interpreter->typed_tensor<float>(input), in, image_height,
+                    image_width, image_channels, wanted_height, wanted_width,
+                    wanted_channels, s);
+      break;
+    case kTfLiteUInt8:
+      resize<uint8_t>(interpreter->typed_tensor<uint8_t>(input), in,
+                      image_height, image_width, image_channels, wanted_height,
+                      wanted_width, wanted_channels, s);
+      break;
+    default:
+      LOG(FATAL) << "cannot handle input type "
+                 << interpreter->tensor(input)->type << " yet";
+      exit(-1);
+  }
+
+  struct timeval start_time, stop_time;
+  gettimeofday(&start_time, NULL);
+  for (int i = 0; i < s->loop_count; i++) {
+    if (interpreter->Invoke() != kTfLiteOk) {
+      LOG(FATAL) << "Failed to invoke tflite!\n";
+    }
+  }
+  gettimeofday(&stop_time, NULL);
+  LOG(INFO) << "invoked \n";
+  LOG(INFO) << "average time: "
+            << (get_us(stop_time) - get_us(start_time)) / (s->loop_count * 1000)
+            << " ms \n";
+
+  const int output_size = 1000;
+  const size_t num_results = 5;
+  const float threshold = 0.001f;
+
+  std::vector<std::pair<float, int>> top_results;
+
+  int output = interpreter->outputs()[0];
+  switch (interpreter->tensor(output)->type) {
+    case kTfLiteFloat32:
+      get_top_n<float>(interpreter->typed_output_tensor<float>(0), output_size,
+                       num_results, threshold, &top_results, true);
+      break;
+    case kTfLiteUInt8:
+      get_top_n<uint8_t>(interpreter->typed_output_tensor<uint8_t>(0),
+                         output_size, num_results, threshold, &top_results,
+                         false);
+      break;
+    default:
+      LOG(FATAL) << "cannot handle output type "
+                 << interpreter->tensor(input)->type << " yet";
+      exit(-1);
+  }
+
+  std::vector<string> labels;
+  size_t label_count;
+
+  if (ReadLabelsFile(s->labels_file_name, &labels, &label_count) != kTfLiteOk)
+    exit(-1);
+
+  for (const auto& result : top_results) {
+    const float confidence = result.first;
+    const int index = result.second;
+    LOG(INFO) << confidence << ": " << index << " " << labels[index] << "\n";
+  }
+}
+
+void display_usage() {
+  LOG(INFO) << "label_image\n"
+            << "--accelerated, -a: [0|1], use Android NNAPI or note\n"
+            << "--count, -c: loop interpreter->Invoke() for certain times\n"
+            << "--input_mean, -b: input mean\n"
+            << "--input_std, -s: input standard deviation\n"
+            << "--image, -i: image_name.bmp\n"
+            << "--labels, -l: labels for the model\n"
+            << "--tflite_model, -m: model_name.tflite\n"
+            << "--threads, -t: number of threads\n"
+            << "--verbose, -v: [0|1] print more information\n"
+            << "\n";
+}
+
+int Main(int argc, char** argv) {
+  Settings s;
+
+  int c;
+  while (1) {
+    static struct option long_options[] = {
+        {"accelerated", required_argument, 0, 'a'},
+        {"count", required_argument, 0, 'c'},
+        {"verbose", required_argument, 0, 'v'},
+        {"image", required_argument, 0, 'i'},
+        {"labels", required_argument, 0, 'l'},
+        {"tflite_model", required_argument, 0, 'm'},
+        {"threads", required_argument, 0, 't'},
+        {"input_mean", required_argument, 0, 'b'},
+        {"input_std", required_argument, 0, 's'},
+        {0, 0, 0, 0}};
+
+    /* getopt_long stores the option index here. */
+    int option_index = 0;
+
+    c = getopt_long(argc, argv, "a:b:c:f:i:l:m:s:t:v:", long_options,
+                    &option_index);
+
+    /* Detect the end of the options. */
+    if (c == -1) break;
+
+    switch (c) {
+      case 'a':
+        s.accel = strtol(  // NOLINT(runtime/deprecated_fn)
+            optarg, (char**)NULL, 10);
+        break;
+      case 'b':
+        s.input_mean = strtod(optarg, NULL);
+        break;
+      case 'c':
+        s.loop_count = strtol(  // NOLINT(runtime/deprecated_fn)
+            optarg, (char**)NULL, 10);
+        break;
+      case 'i':
+        s.input_bmp_name = optarg;
+        break;
+      case 'l':
+        s.labels_file_name = optarg;
+        break;
+      case 'm':
+        s.model_name = optarg;
+        break;
+      case 's':
+        s.input_std = strtod(optarg, NULL);
+        break;
+      case 't':
+        s.number_of_threads = strtol(  // NOLINT(runtime/deprecated_fn)
+            optarg, (char**)NULL, 10);
+        break;
+      case 'v':
+        s.verbose = strtol(  // NOLINT(runtime/deprecated_fn)
+            optarg, (char**)NULL, 10);
+        break;
+      case 'h':
+      case '?':
+        /* getopt_long already printed an error message. */
+        display_usage();
+        exit(-1);
+      default:
+        exit(-1);
+    }
+  }
+  RunInference(&s);
+  return 0;
+}
+
+}  // namespace label_image
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  return tflite::label_image::Main(argc, argv);
+}
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.h b/tensorflow/contrib/lite/examples/label_image/label_image.h
new file mode 100644
index 0000000000000000000000000000000000000000..4de32e33fb4ef2ab5d0e111886cdc737398147e9
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/label_image/label_image.h
@@ -0,0 +1,41 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H
+#define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H
+
+#include "tensorflow/contrib/lite/string.h"
+
+namespace tflite {
+namespace label_image {
+
+struct Settings {
+  bool verbose = false;
+  bool accel = false;
+  bool input_floating = false;
+  int loop_count = 1;
+  float input_mean = 127.5f;
+  float input_std = 127.5f;
+  string model_name = "./mobilenet_quant_v1_224.tflite";
+  string input_bmp_name = "./grace_hopper.bmp";
+  string labels_file_name = "./labels.txt";
+  string input_layer_type = "uint8_t";
+  int number_of_threads = 4;
+};
+
+}  // namespace label_image
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.md b/tensorflow/contrib/lite/examples/label_image/label_image.md
new file mode 100644
index 0000000000000000000000000000000000000000..9ce32cf101897f2d41cd14a485aeb432344928a0
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/label_image/label_image.md
@@ -0,0 +1,78 @@
+label_image for TensorFlow Lite inspired by TensorFlow's label_image.
+
+To build label_image for Android, run $TENSORFLOW_ROOT/configure 
+and set Android NDK or configure NDK setting in 
+$TENSORFLOW_ROOT/WORKSPACE first.
+ 
+To build it for android ARMv8:
+```
+> bazel build --config monolithic --cxxopt=-std=c++11 \
+  --crosstool_top=//external:android/crosstool \
+  --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
+  --cpu=arm64-v8a \
+  //tensorflow/contrib/lite/examples/label_image:label_image
+```
+or
+```
+> bazel build --config android_arm64 --config monolithic --cxxopt=-std=c++11 \
+  //tensorflow/contrib/lite/examples/label_image:label_image
+```
+
+To build it for android arm-v7a:
+```
+> bazel build --config monolithic --cxxopt=-std=c++11 \
+  --crosstool_top=//external:android/crosstool \
+  --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
+  --cpu=armeabi-v7a \
+  //tensorflow/contrib/lite/examples/label_image:label_image
+```
+or
+```
+> bazel build --config android_arm --config monolithic --cxxopt=-std=c++11 \
+  //tensorflow/contrib/lite/examples/label_image:label_image
+```
+
+Build it for desktop machines (tested on Ubuntu and OS X)
+```
+> bazel build --config opt --cxxopt=-std=c++11 //tensorflow/contrib/lite/examples/label_image:label_image
+```
+To run it. Prepare `./mobilenet_quant_v1_224.tflite`, `./grace_hopper.bmp`, and `./labels.txt`.
+
+Run it:
+```
+> ./label_image                                        
+Loaded model ./mobilenet_quant_v1_224.tflite
+resolved reporter
+invoked
+average time: 100.986 ms 
+0.439216: 653 military uniform
+0.372549: 458 bow tie
+0.0705882: 466 bulletproof vest
+0.0235294: 514 cornet
+0.0196078: 835 suit
+```
+Run `interpreter->Invoker()` 100 times:
+```
+> ./label_image   -c 100                               
+Loaded model ./mobilenet_quant_v1_224.tflite
+resolved reporter
+invoked
+average time: 33.4694 ms
+...
+```
+
+Run a floating point (`mobilenet_v1_1.0_224.tflite`) model,
+```
+> ./label_image -f 1 -m mobilenet_v1_1.0_224.tflite
+Loaded model mobilenet_v1_1.0_224.tflite
+resolved reporter
+invoked
+average time: 263.493 ms 
+0.88615: 653 military uniform
+0.0422316: 440 bearskin
+0.0109948: 466 bulletproof vest
+0.0105327: 401 academic gown
+0.00947104: 723 ping-pong bal
+```
+
+See the source code for other command line options.
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image_test.cc b/tensorflow/contrib/lite/examples/label_image/label_image_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ce35483f76e8f40ced79e1ee30774c62d0eba94e
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/label_image/label_image_test.cc
@@ -0,0 +1,61 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h"
+#include "tensorflow/contrib/lite/examples/label_image/get_top_n.h"
+#include "tensorflow/contrib/lite/examples/label_image/label_image.h"
+
+using ::testing::ElementsAreArray;
+
+namespace tflite {
+namespace label_image {
+
+TEST(LabelImageTest, GraceHopper) {
+  std::string lena_file =
+      "tensorflow/contrib/lite/examples/label_image/testdata/grace_hopper.bmp";
+  int height, width, channels;
+  Settings s;
+  uint8_t *data;
+
+  data = read_bmp(lena_file, &width, &height, &channels, &s);
+  ASSERT_EQ(height, 606);
+  ASSERT_EQ(width, 517);
+  ASSERT_EQ(channels, 3);
+
+  uint8_t *out = new uint8_t[606 * 517 * 3];
+  downsize<uint8_t>(out, data, 606, 517, 3, 214, 214, 3, &s);
+  ASSERT_EQ(out[0], 0x15);
+  ASSERT_EQ(out[214 * 214 * 3 - 1], 0x12);
+}
+
+TEST(LabelImageTest, GetTopN) {
+  uint8_t in[] = {1, 1, 2, 2, 4, 4, 16, 32, 128, 64};
+
+  std::vector<std::pair<float, int>> top_results;
+  get_top_n<uint8_t>(in, 10, 5, 0.025, &top_results, false);
+  ASSERT_EQ(top_results.size(), 4);
+  ASSERT_EQ(top_results[0].second, 8);
+}
+
+}  // namespace label_image
+}  // namespace tflite
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/examples/label_image/testdata/grace_hopper.bmp b/tensorflow/contrib/lite/examples/label_image/testdata/grace_hopper.bmp
new file mode 100644
index 0000000000000000000000000000000000000000..0d94cd3e930a138b7c20308f5ba375576484d48b
Binary files /dev/null and b/tensorflow/contrib/lite/examples/label_image/testdata/grace_hopper.bmp differ
diff --git a/tensorflow/contrib/lite/g3doc/custom_operators.md b/tensorflow/contrib/lite/g3doc/custom_operators.md
index 204a489a93519309bb09238f1b2c8bbd4f1f19e4..d7cc854ebac08e79d346df0aca6e1fa56b490156 100644
--- a/tensorflow/contrib/lite/g3doc/custom_operators.md
+++ b/tensorflow/contrib/lite/g3doc/custom_operators.md
@@ -73,7 +73,7 @@ TfLiteStatus SinEval(TfLiteContext* context, TfLiteNode* node) {
 }
 
 TfLiteRegistration* Register_SIN() {
-  static TfLiteRegistration r = {nullptr, nullptr, SinResize, SinEval};
+  static TfLiteRegistration r = {nullptr, nullptr, SinPrepare, SinEval};
   return &r;
 }
 ```
diff --git a/tensorflow/contrib/lite/g3doc/ios.md b/tensorflow/contrib/lite/g3doc/ios.md
index ce8b37fbf9b0db5dee60784e85a3cbf0326fddb6..a359b8d4b481dbc15cc86db14eabda5433722b8b 100644
--- a/tensorflow/contrib/lite/g3doc/ios.md
+++ b/tensorflow/contrib/lite/g3doc/ios.md
@@ -45,6 +45,10 @@ into a universal file containing armv7, armv7s, arm64, i386, and x86_64
 architectures. The resulting library is in
 `tensorflow/contrib/lite/gen/lib/libtensorflow-lite.a`.
 
+If you get an error such as `no such file or directory: 'x86_64'` when running 
+`build_ios_universal_lib.sh`: open Xcode > Preferences > Locations, and ensure 
+a value is selected in the "Command Line Tools" dropdown.
+
 ## Using in your own application
 
 You'll need to update various settings in your app to link against TensorFlow
diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
index 9ade04eb8c696d7e0e39a8104e02b6e5feec95eb..b1bbb7c67013acfb575cc1e9f9390ba191cbd08e 100644
--- a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
+++ b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
@@ -1,4 +1,4 @@
-# TensorFlow Compatibility Guide
+# TensorFlow Lite & TensorFlow Compatibility Guide
 
 TensorFlow Lite supports a number of TensorFlow operations used in common
 inference models. As they are processed by the TensorFlow Lite Optimizing
@@ -329,18 +329,18 @@ Inputs {
   0: a tensor
 }
 Outputs {
-  0: a tensor equivalent to max(0, min(input, 1)
+  0: a tensor equivalent to max(0, input)
 }
 ```
 
-**RELU1**
+**RELU_N1_TO_1**
 
 ```
 Inputs {
   0: a tensor
 }
 Outputs {
-  0: a tensor equivalent to max(-1, min(input, 6)
+  0: a tensor equivalent to max(-1, min(input, 1)
 }
 ```
 
diff --git a/tensorflow/contrib/lite/graph_info.cc b/tensorflow/contrib/lite/graph_info.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e60ed2c2463cb621015ba725ca030e8d8c02f3c7
--- /dev/null
+++ b/tensorflow/contrib/lite/graph_info.cc
@@ -0,0 +1,224 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/graph_info.h"
+#include <algorithm>
+
+namespace tflite {
+
+namespace {
+
+// Provide a range iterable wrapper for TfLiteIntArray* (C lists that TfLite
+// C api uses. Can't use the google array_view, since we can't depend on even
+// absl for embedded device reasons.
+// TODO(aselle): Move this into central utilities.
+class TfLiteIntArrayView {
+ public:
+  // Construct a view of a TfLiteIntArray*. Note, `int_array` should be non-null
+  // and this view does not take ownership of it.
+  explicit TfLiteIntArrayView(const TfLiteIntArray* int_array)
+      : int_array_(int_array) {}
+
+  typedef const int* const_iterator;
+  const_iterator begin() const { return int_array_->data; }
+  const_iterator end() const { return &int_array_->data[int_array_->size]; }
+
+  TfLiteIntArrayView(const TfLiteIntArrayView&) = default;
+  TfLiteIntArrayView& operator=(const TfLiteIntArrayView& rhs) = default;
+
+ private:
+  const TfLiteIntArray* int_array_;
+};
+
+// Helper class that actually performs partitioning by subgraph.
+// Outputs to a provided `subgraphs` structure.
+//
+// Example usage:
+// PartitionGraphIntoIndependentSubgraphsImpl partitioner(
+//     info, nodes_to_part, subgraphs);
+// partitioner.Partition();
+class PartitionGraphIntoIndependentSubgraphsImpl {
+ public:
+  PartitionGraphIntoIndependentSubgraphsImpl(
+      const GraphInfo* info, const TfLiteIntArray* nodes_to_partition,
+      std::vector<Subgraph>* subgraphs)
+      : info_(info),
+        subgraphs_(subgraphs),
+        node_type_(info->num_nodes(), Subgraph::kTfNonPartition) {
+    // Populate the node_type_ map.
+    for (auto node_index : TfLiteIntArrayView(nodes_to_partition)) {
+      node_type_[node_index] = Subgraph::kTfPartition;
+    }
+  }
+
+  // Actually partition the graph.
+  void Partition() {
+    // Initialize here to make Partition() re-entrant.
+    subgraphs_->clear();
+    tensor_epochs_.clear();
+    tensor_epochs_.resize(info_->num_tensors(), kEpochAlwaysReady);
+    node_epochs_.clear();
+    node_epochs_.resize(info_->num_nodes(), kEpochNotReady);
+    // Set computed tensors to be kEpochNotReady (initializer set everything to
+    // AlwaysReady).
+    for (int node_index = 0; node_index < info_->num_nodes(); node_index++) {
+      const TfLiteNode& node = info_->node(node_index);
+      for (int output_tensor_index : TfLiteIntArrayView(node.outputs)) {
+        tensor_epochs_[output_tensor_index] = kEpochNotReady;
+      }
+    }
+
+    // Do a graph traversal where each iteration in the loop is an epoch
+    // that corresponds to a subgraph that only contains nodes that are of
+    // the same node_type_.
+    while (true) {
+      BuildSubgraph();
+      if (subgraphs_->back().nodes.empty()) {
+        subgraphs_->pop_back();
+        break;
+      }
+    }
+
+    // Mark model outputs as subgraph outputs. All the rest have already been
+    // identified.
+    for (int output_index : info_->outputs()) {
+      int output_epoch = tensor_epochs_[output_index];
+      Subgraph& output_subgraph = (*subgraphs_)[output_epoch];
+      output_subgraph.output_tensors.push_back(output_index);
+    }
+    // Make sure every subgraph's inputs and outputs are unique. Since the
+    // list of inputs and outputs is generated in a way that produces
+    // duplicates.
+    for (Subgraph& subgraph : *subgraphs_) {
+      // Sort and uniquefy using standard library algorithms.
+      auto uniquefy = [](std::vector<int>* items) {
+        std::sort(items->begin(), items->end());
+        auto last = std::unique(items->begin(), items->end());
+        items->erase(last, items->end());
+      };
+      uniquefy(&subgraph.input_tensors);
+      uniquefy(&subgraph.output_tensors);
+    }
+  }
+
+ private:
+  // Special integer values needed for tensor_epochs_ and node_epochs_.
+  enum {
+    // The node or tensor is not ready to be assigned an epoch. e.g. a node's
+    // inputs have not all been assigned epochs.
+    kEpochNotReady = -1,
+    // Used for tensor_epochs_. This means that the tensor is always ready.
+    // e.g. an input to the whole model or a constant that has no dependencies.
+    kEpochAlwaysReady = -2
+  };
+
+  // Updates the  node `node_index` and returns true if it is assigned to an
+  // epoch. False is returned if the node is already set to an epoch, its inputs
+  // are not all assigned to epochs, or if it cannot be assigned to the current
+  // epoch since the epoch's node_type doesn't match.
+  bool UpdateNode(int node_index) {
+    const TfLiteNode& node = info_->node(node_index);
+    Subgraph& current_subgraph = subgraphs_->back();
+    int current_epoch = subgraphs_->size() - 1;
+    // Check if node is already done.
+    if (node_epochs_[node_index] != kEpochNotReady) {
+      return false;
+    }
+    // See if all dependencies of this node are already assigned to a
+    // subgraph.
+    for (int input_tensor_index : TfLiteIntArrayView(node.inputs)) {
+      if (tensor_epochs_[input_tensor_index] == kEpochNotReady) {
+        return false;
+      }
+    }
+    // When we are starting a new epoch, the first ready node defines
+    // the type of that epoch.
+    if (current_subgraph.type == Subgraph::kTfUnexplored) {
+      current_subgraph.type = node_type_[node_index];
+    }
+    // The node gets assigned to this epoch if it is the same type as
+    // the epoch's assigned type. Note, if this is the current ready
+    // node encountered during this epoch, this condition will be
+    // automatically true.
+    if (current_subgraph.type == node_type_[node_index]) {
+      node_epochs_[node_index] = current_epoch;
+      current_subgraph.nodes.push_back(node_index);
+      // All outputs of this node now are assigned to this epoch as
+      // well.
+      for (int output_tensor_index : TfLiteIntArrayView(node.outputs)) {
+        tensor_epochs_[output_tensor_index] = current_epoch;
+      }
+      // Look at our inputs one more time to update that tensor's
+      // epochs' outputs
+      for (int input_tensor_index : TfLiteIntArrayView(node.inputs)) {
+        int input_epoch = tensor_epochs_[input_tensor_index];
+        int node_epoch = current_epoch;
+        if (input_epoch != node_epoch) {
+          current_subgraph.input_tensors.push_back(input_tensor_index);
+          // Set inputs to be outputs of the subgraph where they reside.
+          // the if condition makes sure inputs to the whole computation
+          // are not included (i.e. those initialized to -2 above).
+          if (input_epoch >= 0) {
+            Subgraph& input_subgraph = (*subgraphs_)[input_epoch];
+            input_subgraph.output_tensors.push_back(input_tensor_index);
+          }
+        }
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  // Completely populates the current subgraph by doing graph traversal
+  void BuildSubgraph() {
+    subgraphs_->emplace_back(Subgraph());
+    // loop until no more nodes can be updated.
+    while (true) {
+      bool did_something = false;
+      for (int node_index = 0; node_index < info_->num_nodes(); node_index++) {
+        if (UpdateNode(node_index)) {
+          did_something = true;
+        }
+      }
+      if (!did_something) return;
+    }
+  }
+
+  // Temporary data needed for partitioning.
+  const GraphInfo* info_;
+  // List of subgraphs to populate
+  std::vector<Subgraph>* subgraphs_;
+  std::vector<Subgraph::Type> node_type_;
+  // Maps from tensor index to the epoch in which it is assigned. Also special
+  // negative values of kEpochNotAssigned if not assigned, kEpochNotReady if it
+  // is an input or constant.
+  std::vector<int> tensor_epochs_;
+  // Maps from tensor index to the epoch in which it is assigned. Also special
+  // negative values of kEpochNotAssigned if not assigned.
+  std::vector<int> node_epochs_;
+};
+
+}  // namespace
+
+TfLiteStatus PartitionGraphIntoIndependentSubgraphs(
+    const GraphInfo* info, const TfLiteIntArray* nodes_to_partition,
+    std::vector<Subgraph>* subgraphs) {
+  PartitionGraphIntoIndependentSubgraphsImpl(info, nodes_to_partition,
+                                             subgraphs)
+      .Partition();
+  return kTfLiteOk;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/graph_info.h b/tensorflow/contrib/lite/graph_info.h
new file mode 100644
index 0000000000000000000000000000000000000000..313af5fb7574b42bcdd53b4baad06e4ccfb34053
--- /dev/null
+++ b/tensorflow/contrib/lite/graph_info.h
@@ -0,0 +1,79 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_GRAPH_INFO_H_
+#define TENSORFLOW_CONTRIB_LITE_GRAPH_INFO_H_
+
+#include <vector>
+
+#include "tensorflow/contrib/lite/context.h"
+
+namespace tflite {
+
+// Basic information about an inference graph, where execution nodes
+// are connected via tensors.
+class GraphInfo {
+ public:
+  virtual ~GraphInfo() {}
+
+  // Total number of tensors in the graph.
+  virtual size_t num_tensors() const = 0;
+
+  // Returns a tensor given its index which is expected to be between 0 and
+  // num_tensors().
+  virtual TfLiteTensor* tensor(size_t index) = 0;
+
+  // Total number of nodes in the graph.
+  virtual size_t num_nodes() const = 0;
+
+  // Returns a node given its index which is expected to be between 0 and
+  // num_nodes().
+  virtual const TfLiteNode& node(size_t index) const = 0;
+
+  // Returns the indices of the input tensors.
+  virtual const std::vector<int>& inputs() const = 0;
+
+  // Returns the indices of the output tensors.
+  virtual const std::vector<int>& outputs() const = 0;
+};
+
+// Represents a subgraph of a TensorFlow Lite graph.
+struct Subgraph {
+  enum Type {
+    kTfUnexplored = 0,  // temporarily used during creation
+    kTfPartition,
+    kTfNonPartition
+  };
+  Type type = kTfUnexplored;
+  // Nodes within the subgraph
+  std::vector<int> nodes;
+  // Tensors that stride output from another subgraph that this depends on,
+  // or global inputs to the TensorFlow Lite full graph.
+  std::vector<int> input_tensors;
+  // Outputs that are consumed by other subgraphs or are global output tensors.
+  // All output tensors of the nodes in the subgraph that do not appear in this
+  // list are intermediate results that can be potentially elided.
+  std::vector<int> output_tensors;
+};
+
+// Partitions a list of node indices `nodes_to_partition` into subgraphs.
+// Each subgraph is in dependency order (i.e. all members of the subgraph).
+// `subgraphs` is assumed to be empty.
+TfLiteStatus PartitionGraphIntoIndependentSubgraphs(
+    const GraphInfo* info, const TfLiteIntArray* nodes_to_partition,
+    std::vector<Subgraph>* subgraphs);
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_GRAPH_INFO_H_
diff --git a/tensorflow/contrib/lite/graph_info_test.cc b/tensorflow/contrib/lite/graph_info_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ea38b43993fef71c6820c7a978351d92d5420287
--- /dev/null
+++ b/tensorflow/contrib/lite/graph_info_test.cc
@@ -0,0 +1,270 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "tensorflow/contrib/lite/graph_info.h"
+#include "tensorflow/contrib/lite/testing/util.h"
+
+namespace tflite {
+namespace {
+
+// Makes a TfLiteIntArray* from std::vector, must free with TfLiteIntFree().
+TfLiteIntArray* ConvertVector(const std::vector<int>& x) {
+  TfLiteIntArray* lite = TfLiteIntArrayCreate(x.size());
+  for (size_t i = 0; i < x.size(); i++) lite->data[i] = x[i];
+  return lite;
+}
+
+// A very simple test graph that supports setting in/out tensors on nodes.
+class SimpleTestGraph : public GraphInfo {
+ public:
+  ~SimpleTestGraph() override {
+    for (auto& node : nodes_) {
+      TfLiteIntArrayFree(node.inputs);
+      TfLiteIntArrayFree(node.outputs);
+    }
+  }
+
+  size_t num_tensors() const override { return tensors_.size(); }
+  size_t num_nodes() const override { return nodes_.size(); }
+  const TfLiteNode& node(size_t index) const override { return nodes_[index]; }
+  TfLiteTensor* tensor(size_t index) override { return &tensors_[index]; }
+  const std::vector<int>& inputs() const override { return inputs_; }
+  const std::vector<int>& outputs() const override { return outputs_; }
+
+  void AddNode(const std::vector<int>& inputs,
+               const std::vector<int>& outputs) {
+    nodes_.push_back(TfLiteNode());
+    TfLiteNode& node = nodes_.back();
+    node.inputs = ConvertVector(inputs);
+    node.outputs = ConvertVector(outputs);
+  }
+
+  void AddTensors(int count) { tensors_.resize(count + tensors_.size()); }
+
+  void SetInputsAndOutputs(const std::vector<int>& inputs,
+                           const std::vector<int>& outputs) {
+    inputs_ = inputs;
+    outputs_ = outputs;
+  }
+
+ private:
+  std::vector<TfLiteNode> nodes_;
+  std::vector<TfLiteTensor> tensors_;
+  std::vector<int> inputs_;
+  std::vector<int> outputs_;
+};
+
+// Partition a graph to generate a list of subgraphs. This wraps the API call
+// we are testing and handles memory management and conversion to
+// TfLiteIntArray. Populates `subgraphs` with resulting generated subgraphs.
+void PartitionGraph(const SimpleTestGraph& graph,
+                    const std::vector<int>& nodes_to_partition,
+                    std::vector<Subgraph>* subgraphs) {
+  TfLiteIntArray* nodes_to_partition_int_array =
+      ConvertVector(nodes_to_partition);
+  PartitionGraphIntoIndependentSubgraphs(&graph, nodes_to_partition_int_array,
+                                         subgraphs);
+  TfLiteIntArrayFree(nodes_to_partition_int_array);
+}
+
+// Check a generated list of subgraphs against the expected list of subgraphs.
+void CheckPartitionSubgraphs(const std::vector<Subgraph>& generated_subgraphs,
+                             const std::vector<Subgraph>& expected_subgraphs) {
+  ASSERT_EQ(generated_subgraphs.size(), expected_subgraphs.size());
+  for (int subgraph_index = 0; subgraph_index < generated_subgraphs.size();
+       subgraph_index++) {
+    EXPECT_EQ(generated_subgraphs[subgraph_index].nodes,
+              expected_subgraphs[subgraph_index].nodes);
+    EXPECT_EQ(generated_subgraphs[subgraph_index].input_tensors,
+              expected_subgraphs[subgraph_index].input_tensors);
+    EXPECT_EQ(generated_subgraphs[subgraph_index].output_tensors,
+              expected_subgraphs[subgraph_index].output_tensors);
+  }
+}
+
+// Test an empty trivial graph with no partitions.
+TEST(PartitionTest, Nodes0_PartitionNodes0) {
+  SimpleTestGraph graph;
+  std::vector<int> nodes_to_partition = {};
+  std::vector<Subgraph> generated_subgraphs;
+  PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
+  CheckPartitionSubgraphs(generated_subgraphs, {});
+}
+
+// Test a 1 node graph with no partitions.
+// Input: tensor(0) -> node(0) -> tensor(1), nodes_to_partition=[]
+// Output: [kTfNoPartition, tensor(0) -> node(0) -> tensor(1)]
+TEST(PartitionTest, Nodes1PartitionNodes0) {
+  SimpleTestGraph graph;
+  graph.AddTensors(2);
+  graph.AddNode({0}, {1});
+  graph.SetInputsAndOutputs({0}, {1});
+  std::vector<int> nodes_to_partition = {};
+  std::vector<Subgraph> generated_subgraphs;
+  PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
+
+  Subgraph expected_subgraph;
+  expected_subgraph.type = Subgraph::kTfNonPartition;
+  expected_subgraph.nodes = {0};
+  expected_subgraph.input_tensors = {0};
+  expected_subgraph.output_tensors = {1};
+  CheckPartitionSubgraphs(generated_subgraphs, {expected_subgraph});
+}
+
+// Test a 1 node graph with no inputs that is fully partitioned.
+// Input: node(0) -> tensor(1), nodes_to_partition=[node0]
+// Output: [kTfPartition, node(0) -> tensor(1)]
+TEST(PartitionTest, Nodes1PartitionNodes0Inputs0) {
+  SimpleTestGraph graph;
+  graph.AddTensors(1);
+  graph.AddNode({}, {0});
+  graph.SetInputsAndOutputs({}, {0});
+  std::vector<Subgraph> generated_subgraphs;
+  std::vector<int> nodes_to_partition = {0};
+  PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
+
+  Subgraph expected_subgraph;
+  expected_subgraph.type = Subgraph::kTfPartition;
+  expected_subgraph.nodes = {0};
+  expected_subgraph.input_tensors = {};
+  expected_subgraph.output_tensors = {0};
+  CheckPartitionSubgraphs(generated_subgraphs, {expected_subgraph});
+}
+
+// Test a 1 node graph that is partitioned completely.
+// Input: tensor(0) -> node(0) -> tensor(1), nodes_to_partition=[node0]
+// Output: [kTfPartition, tensor(0) -> node(0) -> tensor(1)]
+TEST(PartitionTest, Nodes1PartitionNodes1) {
+  SimpleTestGraph graph;
+  graph.AddTensors(2);
+  graph.AddNode({0}, {1});
+  graph.SetInputsAndOutputs({0}, {1});
+  std::vector<int> nodes_to_partition = {0};
+  std::vector<Subgraph> generated_subgraphs;
+  PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
+
+  Subgraph expected_subgraph;
+  expected_subgraph.type = Subgraph::kTfPartition;
+  expected_subgraph.nodes = {0};
+  expected_subgraph.input_tensors = {0};
+  expected_subgraph.output_tensors = {1};
+  CheckPartitionSubgraphs(generated_subgraphs, {expected_subgraph});
+}
+
+// Test a 2 node graph where 1 node is partitioned and the other is not.
+// Input: tensor(0) -> node(0) -> tensor(1) -> node(1) -> tensor(2),
+//    nodes_to_partition = [1]
+// Output: [kTfNonPartition, tensor(0) -> node(0) -> tensor(1),
+//          kTfPartition, tensor(1) -> node(1), tensor(2)]
+TEST(PartitionTest, Nodes2PartitionNodes1) {
+  SimpleTestGraph graph;
+  graph.AddTensors(3);
+  graph.AddNode({0}, {1});
+  graph.AddNode({1}, {2});
+  graph.SetInputsAndOutputs({0}, {2});
+  std::vector<int> nodes_to_partition = {1};
+  std::vector<Subgraph> generated_subgraphs;
+  PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
+
+  Subgraph expected_subgraph0;
+  expected_subgraph0.type = Subgraph::kTfPartition;
+  expected_subgraph0.nodes = {0};
+  expected_subgraph0.input_tensors = {0};
+  expected_subgraph0.output_tensors = {1};
+  Subgraph expected_subgraph1;
+  expected_subgraph1.type = Subgraph::kTfPartition;
+  expected_subgraph1.nodes = {1};
+  expected_subgraph1.input_tensors = {1};
+  expected_subgraph1.output_tensors = {2};
+  CheckPartitionSubgraphs(generated_subgraphs,
+                          {expected_subgraph0, expected_subgraph1});
+}
+
+// Test a 2 node graph where both nodes are fully partitioned.
+// Input: tensor(0) -> node(0) -> tensor(1) -> node(1) -> tensor(2),
+//    nodes_to_partition = [0, 1]
+// Output: [kTfPartition, tensor(0) -> node(0) -> node(1) -> tensor(1)]
+TEST(PartitionTest, Nodes2PartitionNodes2) {
+  SimpleTestGraph graph;
+  graph.AddTensors(3);
+  graph.AddNode({0}, {1});
+  graph.AddNode({1}, {2});
+  graph.SetInputsAndOutputs({0}, {2});
+  std::vector<int> nodes_to_partition = {0, 1};
+  std::vector<Subgraph> generated_subgraphs;
+  PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
+
+  Subgraph expected_subgraph0;
+  expected_subgraph0.type = Subgraph::kTfPartition;
+  expected_subgraph0.nodes = {0, 1};
+  expected_subgraph0.input_tensors = {0};
+  expected_subgraph0.output_tensors = {2};
+  CheckPartitionSubgraphs(generated_subgraphs, {expected_subgraph0});
+}
+
+// Test a three node model where we want to partition nodes 0 and nodes
+// 2, but nodes 0 and nodes 2 cannot be in the same subgraph since node 2
+// depends on node 1 which depends on node 0. Thus, we need to produce three
+// subgraphs.
+//
+// Input: tensor(0) -> node(0) -> tensor(1)
+//        tensor(1) -> node(1) -> tensor(2)
+//        [tensor(2), tensor(1)] -> node(2) -> tensor(3)
+//    nodes_to_partition = [0, 2]
+// Output: [[kTfPartition, tensor(0) -> node(0) -> tensor(1),
+//          [kTfNonPartition, tensor(1) -> node(1) -> tensor(2)],
+//          [kTfPartition, [tensor(2), tensor(1)] -> node(2) -> node(3)]
+TEST(PartitionTest, Nodes3PartitionNodes2) {
+  SimpleTestGraph graph;
+  graph.AddTensors(4);
+  graph.AddNode({0}, {1});
+  graph.AddNode({1}, {2});
+  graph.AddNode({1, 2}, {3});
+  graph.SetInputsAndOutputs({0}, {3});
+  std::vector<int> nodes_to_partition = {0, 2};
+  std::vector<Subgraph> generated_subgraphs;
+  PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
+
+  Subgraph expected_subgraph0;
+  expected_subgraph0.type = Subgraph::kTfPartition;
+  expected_subgraph0.nodes = {0};
+  expected_subgraph0.input_tensors = {0};
+  expected_subgraph0.output_tensors = {1};
+  Subgraph expected_subgraph1;
+  expected_subgraph1.type = Subgraph::kTfNonPartition;
+  expected_subgraph1.nodes = {1};
+  expected_subgraph1.input_tensors = {1};
+  expected_subgraph1.output_tensors = {2};
+  Subgraph expected_subgraph2;
+  expected_subgraph2.type = Subgraph::kTfPartition;
+  expected_subgraph2.nodes = {2};
+  expected_subgraph2.input_tensors = {1, 2};
+  expected_subgraph2.output_tensors = {3};
+  CheckPartitionSubgraphs(
+      generated_subgraphs,
+      {expected_subgraph0, expected_subgraph1, expected_subgraph2});
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
index 954e236ac8f0c8c59a9d20d62e66b3aa1164ecc1..028449211b8108d004df4d1cd8a58b4a08df6604 100644
--- a/tensorflow/contrib/lite/interpreter.cc
+++ b/tensorflow/contrib/lite/interpreter.cc
@@ -18,16 +18,16 @@ limitations under the License.
 #include <cstdarg>
 #include <cstdint>
 #include <cstring>
+#include "tensorflow/contrib/lite/arena_planner.h"
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/graph_info.h"
 #include "tensorflow/contrib/lite/kernels/gemm_support.h"
+#include "tensorflow/contrib/lite/memory_planner.h"
 #include "tensorflow/contrib/lite/nnapi_delegate.h"
 
 namespace {
 
-// Memory allocation tuning
-constexpr const int kDefaultArenaAlignment = 64;
-constexpr const int kDefaultTensorAlignment = 4;
 // std::vector preallocation tuning.
 constexpr const int kSlotsToReserve = 128;
 
@@ -35,10 +35,40 @@ constexpr const int kSlotsToReserve = 128;
 
 namespace tflite {
 
+// A trivial implementation of GraphInfo around the Interpreter.
+// NOTE: this interpreter info represents the subset of the
+// graph that is executed according to execution plan. Thus,
+// the indices are execution plan indices rather than raw node
+// indices.
+class InterpreterInfo : public GraphInfo {
+ public:
+  explicit InterpreterInfo(Interpreter* interpreter)
+      : interpreter_(interpreter) {}
+
+  size_t num_tensors() const override { return interpreter_->tensors_size(); }
+  TfLiteTensor* tensor(size_t index) override {
+    return interpreter_->tensor(index);
+  }
+  size_t num_nodes() const override {
+    return interpreter_->execution_plan().size();
+  }
+  const TfLiteNode& node(size_t index) const override {
+    int node_index = interpreter_->execution_plan()[index];
+    return interpreter_->node_and_registration(node_index)->first;
+  }
+  const std::vector<int>& inputs() const override {
+    return interpreter_->inputs();
+  }
+  const std::vector<int>& outputs() const override {
+    return interpreter_->outputs();
+  }
+
+ public:
+  Interpreter* interpreter_;
+};
+
 Interpreter::Interpreter(ErrorReporter* error_reporter)
-    : arena_(kDefaultArenaAlignment),
-      persistent_arena_(kDefaultArenaAlignment),
-      error_reporter_(error_reporter ? error_reporter
+    : error_reporter_(error_reporter ? error_reporter
                                      : DefaultErrorReporter()) {
   context_.impl_ = static_cast<void*>(this);
   context_.ResizeTensor = ResizeTensor;
@@ -47,10 +77,16 @@ Interpreter::Interpreter(ErrorReporter* error_reporter)
   context_.tensors = nullptr;
   context_.tensors_size = 0;
   context_.gemm_context = nullptr;
+
+  // Invalid to call these these except from TfLiteDelegate
+  context_.GetNodeAndRegistration = nullptr;
+  context_.ReplaceSubgraphsWithDelegateKernels = nullptr;
+  context_.GetExecutionPlan = nullptr;
+
   // Reserve some space for the tensors to avoid excessive resizing.
   tensors_.reserve(kSlotsToReserve);
   nodes_and_registration_.reserve(kSlotsToReserve);
-  next_allocate_node_id_ = 0;
+  next_execution_plan_index_to_prepare_ = 0;
   UseNNAPI(false);
 }
 
@@ -70,6 +106,78 @@ Interpreter::~Interpreter() {
   }
 }
 
+TfLiteStatus Interpreter::ReplaceSubgraphsWithDelegateKernels(
+    TfLiteContext* context, TfLiteRegistration registration,
+    const TfLiteIntArray* nodes_to_replace) {
+  return static_cast<Interpreter*>(context->impl_)
+      ->ReplaceSubgraphsWithDelegateKernels(registration, nodes_to_replace);
+}
+
+TfLiteStatus Interpreter::ReplaceSubgraphsWithDelegateKernels(
+    TfLiteRegistration registration, const TfLiteIntArray* nodes_to_replace) {
+  // Analyze the graph to find all independent subgraphs that are either
+  // fully not-this-delegate or this-delegate computation.
+  InterpreterInfo info(this);
+  std::vector<Subgraph> subgraphs;
+  PartitionGraphIntoIndependentSubgraphs(&info, nodes_to_replace, &subgraphs);
+
+  execution_plan_.clear();
+  for (auto& subgraph : subgraphs) {
+    // Turn subgraph.nodes into a TfLiteIntArray compatible data structure.
+    // TODO(aselle): Avoid this copy by constructing subgraph.nodes that way
+    // in the first place
+    subgraph.nodes.insert(subgraph.nodes.begin(),
+                          static_cast<int>(subgraph.nodes.size()));
+    // Subgraphs calimed by the delegate should have a "macro" op created, the
+    // other subgraphs (kTfNonPartition) just have their nodes added back to
+    // the execution plan.
+    switch (subgraph.type) {
+      case Subgraph::kTfNonPartition:
+        for (auto it = subgraph.nodes.begin() + 1; it != subgraph.nodes.end();
+             ++it) {
+          execution_plan_.push_back(*it);
+        }
+        break;
+      case Subgraph::kTfPartition: {
+        void* builtin_data = nullptr;
+        int node_index;
+        // Create a node that represents computation of this subgraph.
+        AddNodeWithParameters(
+            subgraph.input_tensors, subgraph.output_tensors,
+            reinterpret_cast<const char*>(subgraph.nodes.data()),
+            subgraph.nodes.size() * sizeof(subgraph.nodes[0]), builtin_data,
+            &registration, &node_index);
+      } break;
+      case Subgraph::kTfUnexplored:
+        return kTfLiteError;
+        break;
+    }
+  }
+  return kTfLiteOk;
+}
+
+// Gets an TfLiteIntArray* representing the execution plan. The interpreter owns
+// this memory and it is only guaranteed to exist during the invocation of the
+// delegate prepare.
+TfLiteStatus Interpreter::GetExecutionPlan(TfLiteIntArray** execution_plan) {
+  // TODO(aselle): Do not make a copy here
+  plan_cache_.reset(TfLiteIntArrayCreate(execution_plan_.size()));
+  *execution_plan = plan_cache_.get();
+  static_assert(sizeof(plan_cache_->data[0]) == sizeof(execution_plan_[0]),
+                "TfLiteIntArray and execution_plan do not contain same type.");
+  memcpy(plan_cache_->data, execution_plan_.data(),
+         sizeof(plan_cache_->data[0]) * execution_plan_.size());
+  return kTfLiteOk;
+}
+
+// WARNING: This is an experimental interface that is subject to change.
+// Entry point for C node plugin API to get the execution plan
+TfLiteStatus Interpreter::GetExecutionPlan(struct TfLiteContext* context,
+                                           TfLiteIntArray** execution_plan) {
+  return static_cast<Interpreter*>(context->impl_)
+      ->GetExecutionPlan(execution_plan);
+}
+
 TfLiteStatus Interpreter::SetInputs(std::vector<int> inputs) {
   TF_LITE_ENSURE_OK(&context_,
                     CheckTensorIndices("inputs", inputs.data(), inputs.size()));
@@ -128,181 +236,6 @@ TfLiteStatus Interpreter::BytesRequired(TfLiteType type, const int* dims,
   return kTfLiteOk;
 }
 
-TfLiteStatus Interpreter::AllocateTensorsWhoseSizesAreKnown() {
-  if (!consistent_) {
-    ReportError(&context_, "AllocateTensors() called on inconsistent model.");
-    return kTfLiteError;
-  }
-  if (next_allocate_node_id_ == nodes_and_registration_.size() && invokable_) {
-    return kTfLiteOk;
-  }
-  allocs_and_refcounts_.resize(context_.tensors_size);
-
-  int new_next_allocate_node_id = next_allocate_node_id_;
-  invokable_ = false;
-
-  // Allocate graph input nodes.
-  if (next_allocate_node_id_ == 0) {
-    for (int i = 0; i < inputs_.size(); ++i) {
-      int tensor_index = inputs_[i];
-      if (tensor_index == kOptionalTensor) {
-        continue;
-      }
-      TfLiteTensor& tensor = context_.tensors[tensor_index];
-      if (tensor.allocation_type == kTfLiteArenaRw) {
-        TF_LITE_ENSURE_OK(
-            &context_,
-            arena_.Allocate(&context_, kDefaultTensorAlignment, tensor.bytes,
-                            &allocs_and_refcounts_[tensor_index].alloc));
-      }
-    }
-    // Add 1 to output tensors, so they will not get overwritten.
-    for (int i = 0; i < outputs_.size(); ++i) {
-      allocs_and_refcounts_[outputs_[i]].count++;
-    }
-  }
-
-  // Count references to node input tensors, and resize node-referenced tensors
-  // until we encounter a node that has a dynamic output tensor.
-  for (int k = next_allocate_node_id_; k < nodes_and_registration_.size();
-       k++) {
-    new_next_allocate_node_id++;
-    TfLiteNode& node = nodes_and_registration_[k].first;
-    const TfLiteRegistration& registration = nodes_and_registration_[k].second;
-    if (OpPrepare(registration, &node) == kTfLiteError) {
-      return kTfLiteError;
-    }
-
-    TfLiteIntArray* node_inputs = node.inputs;
-    for (int i = 0; i < node_inputs->size; ++i) {
-      int tensor_index = node_inputs->data[i];
-      if (tensor_index != kOptionalTensor) {
-        allocs_and_refcounts_[node_inputs->data[i]].count++;
-      }
-    }
-
-    // Discontinue if the node has dynamic outputs.
-    bool has_unallocated_dynamic_tensor = false;
-    TfLiteIntArray* node_outputs = node.outputs;
-    for (int i = 0; i < node_outputs->size; ++i) {
-      TfLiteTensor& tensor = context_.tensors[node_outputs->data[i]];
-      if (tensor.allocation_type == kTfLiteDynamic) {
-        has_unallocated_dynamic_tensor = true;
-        break;
-      }
-    }
-    if (has_unallocated_dynamic_tensor) {
-      break;
-    }
-  }
-
-  // Allocate graph persistent outputs, e.g. RNN cell states, etc.
-  for (int k = next_allocate_node_id_; k < new_next_allocate_node_id; k++) {
-    TfLiteNode& node = nodes_and_registration_[k].first;
-
-    // Go through output tensors and allocate the persistent ones first.
-    TfLiteIntArray* node_outputs = node.outputs;
-    for (int i = 0; i < node_outputs->size; ++i) {
-      int tensor_index = node_outputs->data[i];
-      TfLiteTensor& tensor = context_.tensors[tensor_index];
-      if (tensor.allocation_type == kTfLiteArenaRwPersistent) {
-        TF_LITE_ENSURE_OK(&context_,
-                          persistent_arena_.Allocate(
-                              &context_, kDefaultTensorAlignment, tensor.bytes,
-                              &allocs_and_refcounts_[tensor_index].alloc));
-      }
-    }
-  }
-
-  // Go through the graph in execution order.
-  for (int k = next_allocate_node_id_; k < new_next_allocate_node_id; k++) {
-    TfLiteNode& node = nodes_and_registration_[k].first;
-
-    // First allocate output tensors.
-    TfLiteIntArray* node_outputs = node.outputs;
-    for (int i = 0; i < node_outputs->size; ++i) {
-      int tensor_index = node_outputs->data[i];
-      TfLiteTensor& tensor = context_.tensors[tensor_index];
-      if (tensor.allocation_type == kTfLiteArenaRw) {
-        TF_LITE_ENSURE_OK(
-            &context_,
-            arena_.Allocate(&context_, kDefaultTensorAlignment, tensor.bytes,
-                            &allocs_and_refcounts_[tensor_index].alloc));
-      }
-    }
-    // Then the temporaries, in two passes. First allocate them all, them
-    // deallocate them.
-    TfLiteIntArray* node_temporaries = node.temporaries;
-    for (int i = 0; i < node_temporaries->size; ++i) {
-      int tensor_index = node_temporaries->data[i];
-      TfLiteTensor& tensor = context_.tensors[tensor_index];
-      if (tensor.allocation_type == kTfLiteArenaRw) {
-        TF_LITE_ENSURE_OK(
-            &context_,
-            arena_.Allocate(&context_, kDefaultTensorAlignment, tensor.bytes,
-                            &allocs_and_refcounts_[tensor_index].alloc));
-      }
-    }
-    for (int i = 0; i < node_temporaries->size; ++i) {
-      int tensor_index = node_temporaries->data[i];
-      TfLiteTensor& tensor = context_.tensors[tensor_index];
-      allocs_and_refcounts_[tensor_index].count--;
-      if (tensor.allocation_type == kTfLiteArenaRw &&
-          allocs_and_refcounts_[tensor_index].count == 0) {
-        TF_LITE_ENSURE_OK(
-            &context_,
-            arena_.Deallocate(&context_,
-                              allocs_and_refcounts_[tensor_index].alloc));
-      }
-    }
-
-    // Then process the node's inputs.
-    TfLiteIntArray* node_inputs = node.inputs;
-    for (int i = 0; i < node_inputs->size; ++i) {
-      int tensor_index = node_inputs->data[i];
-      if (tensor_index == kOptionalTensor) {
-        continue;
-      }
-      TfLiteTensor& tensor = context_.tensors[tensor_index];
-
-      // Decrease reference count and deallocate if not needed anymore.
-      allocs_and_refcounts_[tensor_index].count--;
-      if (tensor.allocation_type == kTfLiteArenaRw &&
-          allocs_and_refcounts_[tensor_index].count == 0) {
-        TF_LITE_ENSURE_OK(
-            &context_,
-            arena_.Deallocate(&context_,
-                              allocs_and_refcounts_[tensor_index].alloc));
-      }
-    }
-  }
-
-  // Resize the buffer and commit the arena.
-  TF_LITE_ENSURE_OK(&context_, arena_.Commit(&context_));
-  TF_LITE_ENSURE_OK(&context_, persistent_arena_.Commit(&context_));
-
-  // Rewire the tensors to use the underlying arena buffer.
-  for (int i = 0; i < context_.tensors_size; ++i) {
-    TfLiteTensor& tensor = context_.tensors[i];
-    if (tensor.allocation_type == kTfLiteArenaRw) {
-      TF_LITE_ENSURE_OK(
-          &context_,
-          arena_.ResolveAlloc(&context_, allocs_and_refcounts_[i].alloc,
-                              &tensor.data.raw));
-    }
-    if (tensor.allocation_type == kTfLiteArenaRwPersistent) {
-      TF_LITE_ENSURE_OK(
-          &context_,
-          persistent_arena_.ResolveAlloc(
-              &context_, allocs_and_refcounts_[i].alloc, &tensor.data.raw));
-    }
-  }
-
-  invokable_ = true;
-  next_allocate_node_id_ = new_next_allocate_node_id;
-  return kTfLiteOk;
-}
-
 namespace {
 TfLiteIntArray* convertVectorToTfLiteIntArray(const std::vector<int>& x) {
   TfLiteIntArray* lite = TfLiteIntArrayCreate(x.size());
@@ -312,11 +245,19 @@ TfLiteIntArray* convertVectorToTfLiteIntArray(const std::vector<int>& x) {
 }  // namespace
 
 TfLiteStatus Interpreter::AllocateTensors() {
-  next_allocate_node_id_ = 0;
-  TF_LITE_ENSURE_OK(&context_, arena_.Clear());
-  TF_LITE_ENSURE_OK(&context_, persistent_arena_.Clear());
-  allocs_and_refcounts_.clear();
-  return AllocateTensorsWhoseSizesAreKnown();
+  next_execution_plan_index_to_prepare_ = 0;
+  if (memory_planner_) {
+    TF_LITE_ENSURE_STATUS(memory_planner_->ResetAllocations());
+  }
+
+  if (!consistent_) {
+    ReportError(&context_, "AllocateTensors() called on inconsistent model.");
+    return kTfLiteError;
+  }
+
+  TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors());
+  invokable_ = true;
+  return kTfLiteOk;
 }
 
 TfLiteStatus Interpreter::AddNodeWithParameters(
@@ -334,8 +275,10 @@ TfLiteStatus Interpreter::AddNodeWithParameters(
       &context_,
       CheckTensorIndices("node outputs", outputs.data(), outputs.size()));
 
-  if (node_index) *node_index = nodes_and_registration_.size();
+  int new_node_index = nodes_and_registration_.size();
+  if (node_index) *node_index = new_node_index;
   nodes_and_registration_.resize(nodes_and_registration_.size() + 1);
+
   auto& node_and_reg = nodes_and_registration_.back();
   TfLiteNode& node = node_and_reg.first;
   if (node.inputs) TfLiteIntArrayFree(node.inputs);
@@ -357,6 +300,7 @@ TfLiteStatus Interpreter::AddNodeWithParameters(
   }
   node.builtin_data = builtin_data_deleter.release();
   node_and_reg.second = *registration;
+  execution_plan_.push_back(new_node_index);
   return kTfLiteOk;
 }
 
@@ -372,6 +316,60 @@ TfLiteStatus Interpreter::ResizeInputTensor(int tensor_index,
   return ResizeTensorImpl(&context_.tensors[tensor_index], dims_lite);
 }
 
+// Returns true if at least one tensor in the given list is kTfLiteDynamic.
+bool HasDynamicTensor(const TfLiteContext& context,
+                      const TfLiteIntArray* tensors) {
+  for (int i = 0; i < tensors->size; ++i) {
+    const TfLiteTensor& tensor = context.tensors[tensors->data[i]];
+    if (tensor.allocation_type == kTfLiteDynamic) {
+      return true;
+    }
+  }
+  return false;
+}
+
+TfLiteStatus Interpreter::PrepareOpsStartingAt(
+    int first_execution_plan_index, int* last_execution_plan_index_prepared) {
+  for (int execution_plan_index = first_execution_plan_index;
+       execution_plan_index < execution_plan_.size(); execution_plan_index++) {
+    int node_index = execution_plan_[execution_plan_index];
+    TfLiteNode& node = nodes_and_registration_[node_index].first;
+    const TfLiteRegistration& registration =
+        nodes_and_registration_[node_index].second;
+    if (OpPrepare(registration, &node) == kTfLiteError) {
+      return kTfLiteError;
+    }
+
+    *last_execution_plan_index_prepared = execution_plan_index;
+
+    // Discontinue if the node has dynamic outputs. Note that we don't
+    // stop for dynamic temporary tensors since they won't affect the
+    // sizes of other tensors in the graph.
+    if (HasDynamicTensor(context_, node.outputs)) {
+      break;
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Interpreter::PrepareOpsAndTensors() {
+  if (!memory_planner_) {
+    memory_planner_.reset(new ArenaPlanner(
+        &context_, std::unique_ptr<GraphInfo>(new InterpreterInfo(this))));
+    memory_planner_->PlanAllocations();
+  }
+
+  int last_exec_plan_index_prepared = 0;
+
+  TF_LITE_ENSURE_STATUS(PrepareOpsStartingAt(
+      next_execution_plan_index_to_prepare_, &last_exec_plan_index_prepared));
+  TF_LITE_ENSURE_STATUS(memory_planner_->ExecuteAllocations(
+      next_execution_plan_index_to_prepare_, last_exec_plan_index_prepared));
+
+  next_execution_plan_index_to_prepare_ = last_exec_plan_index_prepared + 1;
+  return kTfLiteOk;
+}
+
 TfLiteStatus Interpreter::Invoke() {
   if (!consistent_) {
     ReportError(&context_, "Invoke called on model that is not consistent.");
@@ -384,10 +382,7 @@ TfLiteStatus Interpreter::Invoke() {
 
   TfLiteStatus status = kTfLiteOk;
   if (nnapi_delegate_) {
-    if (AllocateTensorsWhoseSizesAreKnown() == kTfLiteError) {
-      return kTfLiteError;
-    }
-    if (next_allocate_node_id_ == nodes_and_registration_.size()) {
+    if (next_execution_plan_index_to_prepare_ == execution_plan_.size()) {
       TF_LITE_ENSURE_OK(&context_, nnapi_delegate_->Invoke(this));
       return kTfLiteOk;
     } else {
@@ -400,17 +395,24 @@ TfLiteStatus Interpreter::Invoke() {
     }
   }
 
-  for (int i = 0; i < nodes_and_registration_.size(); i++) {
-    // Ensure we have allocated up to this node. The point of this is to
-    // allocate as much as possible before running any evaluation, but
-    // dynamic shapes can prevent this from being possible.
-    if (i >= next_allocate_node_id_) {
-      if (AllocateTensorsWhoseSizesAreKnown() == kTfLiteError) {
-        return kTfLiteError;
-      }
+  // Invocations are always done in node order.
+  // Note that calling Invoke repeatedly will cause the original memory plan to
+  // be reused, unless either ResizeInputTensor() or AllocateTensors() has been
+  // called.
+  // TODO(b/71913981): we should force recalculation in the presence of dynamic
+  // tensors, because they may have new value which in turn may affect shapes
+  // and allocations.
+  for (int execution_plan_index = 0;
+       execution_plan_index < execution_plan_.size(); execution_plan_index++) {
+    if (execution_plan_index == next_execution_plan_index_to_prepare_) {
+      TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors());
+      TF_LITE_ENSURE(&context_, next_execution_plan_index_to_prepare_ >=
+                                    execution_plan_index);
     }
-    TfLiteNode& node = nodes_and_registration_[i].first;
-    const TfLiteRegistration& registration = nodes_and_registration_[i].second;
+    int node_index = execution_plan_[execution_plan_index];
+    TfLiteNode& node = nodes_and_registration_[node_index].first;
+    const TfLiteRegistration& registration =
+        nodes_and_registration_[node_index].second;
     if (OpInvoke(registration, &node) == kTfLiteError) {
       status = kTfLiteError;
     }
@@ -465,6 +467,22 @@ TfLiteStatus Interpreter::AddTensors(TfLiteContext* context, int tensors_to_add,
       ->AddTensors(tensors_to_add, first_new_tensor_index);
 }
 
+TfLiteStatus Interpreter::GetNodeAndRegistration(
+    int node_index, TfLiteNode** node, TfLiteRegistration** registration) {
+  TF_LITE_ENSURE(&context_, node_index < nodes_size() && node_index >= 0);
+  TF_LITE_ENSURE(&context_, node != nullptr && registration != nullptr);
+  *node = &nodes_and_registration_[node_index].first;
+  *registration = &nodes_and_registration_[node_index].second;
+  return kTfLiteOk;
+}
+
+TfLiteStatus Interpreter::GetNodeAndRegistration(
+    struct TfLiteContext* context, int node_index, TfLiteNode** node,
+    TfLiteRegistration** registration) {
+  return static_cast<Interpreter*>(context->impl_)
+      ->GetNodeAndRegistration(node_index, node, registration);
+}
+
 TfLiteStatus Interpreter::SetTensorParametersReadOnly(
     int tensor_index, TfLiteType type, const char* name,
     const std::vector<int>& dims, TfLiteQuantizationParams quantization,
@@ -514,6 +532,14 @@ TfLiteStatus Interpreter::SetTensorParametersReadWrite(
   return kTfLiteOk;
 }
 
+TfLiteStatus Interpreter::SetExecutionPlan(const std::vector<int>& new_plan) {
+  for (int node_index : new_plan) {
+    TF_LITE_ENSURE(&context_, node_index >= 0 && node_index < nodes_size());
+  }
+  execution_plan_ = new_plan;
+  return kTfLiteOk;
+}
+
 TfLiteStatus Interpreter::ResizeTensorImpl(TfLiteTensor* tensor,
                                            TfLiteIntArray* new_size) {
   // Note that in theory we could resize kTfLiteArenaRwPersistent tensors too.
@@ -527,6 +553,9 @@ TfLiteStatus Interpreter::ResizeTensorImpl(TfLiteTensor* tensor,
         TfLiteIntArrayFree(new_size);
         return kTfLiteError;
       }
+
+      // Realloc space for kTfLiteDynamic tensors.
+      TfLiteTensorRealloc(bytesRequired, tensor);
       tensor->bytes = bytesRequired;
     }
     if (tensor->dims) TfLiteIntArrayFree(tensor->dims);
@@ -564,4 +593,20 @@ void Interpreter::SetNumThreads(int num_threads) {
   tflite::gemm_support::SetMaxNumThreads(&context_, num_threads);
 }
 
+TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate) {
+  // TODO(aselle): Consider if it is worth storing pointers to delegates.
+  // Setup additional context interface
+  context_.GetNodeAndRegistration = GetNodeAndRegistration;
+  context_.ReplaceSubgraphsWithDelegateKernels =
+      ReplaceSubgraphsWithDelegateKernels;
+  context_.GetExecutionPlan = GetExecutionPlan;
+
+  TfLiteStatus status = delegate->Prepare(&context_, delegate->data_);
+  // Remove additional context info.
+  context_.GetNodeAndRegistration = nullptr;
+  context_.ReplaceSubgraphsWithDelegateKernels = nullptr;
+  context_.GetExecutionPlan = nullptr;
+  return status;
+}
+
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index 65c61e44bee48535f884a3afaddc691972f5e04b..bab56a9d72f8992a9d8af23f92133c7c918fd46d 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 // Main abstraction controlling the tflite interpreter.
 // See context.h for the API for defining operations (TfLiteRegistration).
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_INTERPRETER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_INTERPRETER_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_INTERPRETER_H_
+#define TENSORFLOW_CONTRIB_LITE_INTERPRETER_H_
 
 #include <cstdio>
 #include <cstdlib>
@@ -23,7 +23,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/allocation.h"
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/error_reporter.h"
-#include "tensorflow/contrib/lite/simple_memory_arena.h"
+#include "tensorflow/contrib/lite/memory_planner.h"
 
 namespace tflite {
 
@@ -49,13 +49,6 @@ constexpr TfLiteType typeToTfLiteType<unsigned char>() {
   return kTfLiteUInt8;
 }
 
-struct ArenaAllocRefCount {
-  ArenaAllocRefCount() : alloc(), count(0) {}
-
-  ArenaAlloc alloc;
-  int count;
-};
-
 // Forward declare since NNAPIDelegate uses Interpreter.
 class NNAPIDelegate;
 
@@ -87,6 +80,12 @@ class NNAPIDelegate;
 // foo.Invoke();
 //
 
+struct TfLiteIntArrayDeleter {
+  void operator()(TfLiteIntArray* a) {
+    if (a) TfLiteIntArrayFree(a);
+  }
+};
+
 class Interpreter {
  public:
   // Instantiate an interpreter. All errors associated with reading and
@@ -115,7 +114,7 @@ class Interpreter {
 
   // Adds a node with the given parameters and returns the index of the new
   // node in `node_index` (optionally). Interpreter will take ownership of
-  // `builtin_data` and destroy it with `delete`. Ownership of 'init_data'
+  // `builtin_data` and destroy it with `free`. Ownership of 'init_data'
   // remains with the caller.
   TfLiteStatus AddNodeWithParameters(const std::vector<int>& inputs,
                                      const std::vector<int>& outputs,
@@ -173,12 +172,19 @@ class Interpreter {
   // Return the number of ops in the model.
   int nodes_size() const { return nodes_and_registration_.size(); }
 
+  // WARNING: Experimental interface, subject to change
+  const std::vector<int>& execution_plan() const { return execution_plan_; }
+
+  // WARNING: Experimental interface, subject to change
+  // Overrides execution plan. This bounds checks indices sent in.
+  TfLiteStatus SetExecutionPlan(const std::vector<int>& new_plan);
+
   // Get a tensor data structure.
   // TODO(aselle): Create a safe ArrayHandle interface to avoid exposing this
   // read/write access to structure
   TfLiteTensor* tensor(int tensor_index) {
     if (tensor_index >= context_.tensors_size || tensor_index < 0)
-        return nullptr;
+      return nullptr;
     return &context_.tensors[tensor_index];
   }
 
@@ -247,6 +253,11 @@ class Interpreter {
   // Set the number of threads available to the interpreter.
   void SetNumThreads(int num_threads);
 
+  // Allow a delegate to look at the graph and modify the graph to handle
+  // parts of the graph themselves. After this is called, the graph may
+  // contain new nodes that replace 1 more nodes.
+  TfLiteStatus ModifyGraphWithDelegate(TfLiteDelegate* delegate);
+
  private:
   // Give 'op_reg' a chance to initialize itself using the contents of
   // 'buffer'.
@@ -276,9 +287,18 @@ class Interpreter {
     return op_reg.invoke(&context_, node);
   }
 
-  // Allocate tensors whose sizes are known in order of nodes. Discontinue when
-  // we encounter a node that has a dynamic output tensor.
-  TfLiteStatus AllocateTensorsWhoseSizesAreKnown();
+  // Call OpPrepare() for as many ops as possible, allocating memory for their
+  // tensors. If an op containing dynamic tensors is found, preparation will be
+  // postponed until this function is called again. This allows the interpreter
+  // to wait until Invoke() to resolve the sizes of dynamic tensors.
+  TfLiteStatus PrepareOpsAndTensors();
+
+  // Call OpPrepare() for all ops starting at 'first_node'. Stop when a
+  // dynamic tensors is found or all ops have been prepared. Fill
+  // 'last_node_prepared' with the id of the op containing dynamic tensors, or
+  // the last in the graph.
+  TfLiteStatus PrepareOpsStartingAt(int first_execution_plan_index,
+                                    int* last_execution_plan_index_prepared);
 
   // Tensors needed by the interpreter. Use `AddTensors` to add more blank
   // tensor entries. Note, `tensors_.data()` needs to be synchronized to the
@@ -298,7 +318,8 @@ class Interpreter {
   TfLiteStatus BytesRequired(TfLiteType type, const int* dims, int dims_size,
                              size_t* bytes);
 
-  // Request an tensor be resized implementation.
+  // Request an tensor be resized implementation. If the given tensor is of
+  // type kTfLiteDynamic it will also be allocated new memory.
   TfLiteStatus ResizeTensorImpl(TfLiteTensor* tensor, TfLiteIntArray* new_size);
 
   // Report a detailed error string (will be printed to stderr).
@@ -315,6 +336,40 @@ class Interpreter {
   static TfLiteStatus AddTensors(TfLiteContext* context, int tensors_to_add,
                                  int* first_new_tensor_index);
 
+  // WARNING: This is an experimental API and subject to change.
+  // Entry point for C API ReplaceSubgraphsWithDelegateKernels
+  static TfLiteStatus ReplaceSubgraphsWithDelegateKernels(
+      TfLiteContext* context, TfLiteRegistration registration,
+      const TfLiteIntArray* nodes_to_replace);
+
+  // Update the execution graph to replace some of the nodes with stub
+  // nodes. Specifically any node index that has `nodes[index]==1` will be
+  // slated for replacement with a delegate kernel specified by registration.
+  // WARNING: This is an experimental interface that is subject to change.
+  TfLiteStatus ReplaceSubgraphsWithDelegateKernels(
+      TfLiteRegistration registration, const TfLiteIntArray* nodes_to_replace);
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // Gets the internal pointer to a TensorFlow lite node by node_index.
+  TfLiteStatus GetNodeAndRegistration(int node_index, TfLiteNode** node,
+                                      TfLiteRegistration** registration);
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // Entry point for C node plugin API to get a node by index.
+  static TfLiteStatus GetNodeAndRegistration(struct TfLiteContext*,
+                                             int node_index, TfLiteNode** node,
+                                             TfLiteRegistration** registration);
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // Gets an TfLiteIntArray* representing the execution plan. The caller owns
+  // this memory and must free it with TfLiteIntArrayFree().
+  TfLiteStatus GetExecutionPlan(TfLiteIntArray** execution_plan);
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // Entry point for C node plugin API to get the execution plan
+  static TfLiteStatus GetExecutionPlan(struct TfLiteContext* context,
+                                       TfLiteIntArray** execution_plan);
+
   // A pure C data structure used to communicate with the pure C plugin
   // interface. To avoid copying tensor metadata, this is also the definitive
   // structure to store tensors.
@@ -325,17 +380,6 @@ class Interpreter {
   std::vector<std::pair<TfLiteNode, TfLiteRegistration>>
       nodes_and_registration_;
 
-  // Raw memory buffer that is allocated for all temporary and graph outputs.
-  // that are declared kTfLiteArenaRw.
-  SimpleMemoryArena arena_;
-
-  // Raw memory buffer that is allocated for persistent tensors that are
-  // declared as kTfLiteArenaRwPersistent.
-  SimpleMemoryArena persistent_arena_;
-
-  // Stores allocation and reference counts of all tensors.
-  std::vector<ArenaAllocRefCount> allocs_and_refcounts_;
-
   // Whether the model is consistent. That is to say if the inputs and outputs
   // of every node and the global inputs and outputs are valid indexes into
   // the tensor array.
@@ -356,7 +400,7 @@ class Interpreter {
   // The error reporter delegate that tflite will forward queries errors to.
   ErrorReporter* error_reporter_;
 
-  // Next node to allocate output tensors.
+  // Index of the next node to prepare.
   // During Invoke(), Interpreter will allocate input tensors first, which are
   // known to be fixed size. Then it will allocate outputs from nodes as many
   // as possible. When there is a node that produces dynamic sized tensor.
@@ -364,11 +408,24 @@ class Interpreter {
   // node id, and execute the node to generate the output tensor before continue
   // to allocate successors. This process repeats until all nodes are executed.
   // NOTE: this relies on the order of nodes that is in topological order.
-  int next_allocate_node_id_;
+  int next_execution_plan_index_to_prepare_;
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // This is a list of node indices (to index into nodes_and_registration).
+  // This represents a valid topological sort (dependency ordered) execution
+  // plan. In particular, it is valid for this ordering to contain only a
+  // subset of the node indices.
+  std::vector<int> execution_plan_;
+
+  // In the future, we'd like a TfLiteIntArray compatible representation.
+  // TODO(aselle): replace execution_plan_ with this.
+  std::unique_ptr<TfLiteIntArray, TfLiteIntArrayDeleter> plan_cache_;
 
   // Whether to delegate to NN API
   std::unique_ptr<NNAPIDelegate> nnapi_delegate_;
+
+  std::unique_ptr<MemoryPlanner> memory_planner_;
 };
 
 }  // namespace tflite
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_INTERPRETER_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_INTERPRETER_H_
diff --git a/tensorflow/contrib/lite/interpreter_test.cc b/tensorflow/contrib/lite/interpreter_test.cc
index edff2109430c6e1ec6c481619ed7772237a3301d..28c96e5dde6ffa62bb073db9716a00f91c6e0bdf 100644
--- a/tensorflow/contrib/lite/interpreter_test.cc
+++ b/tensorflow/contrib/lite/interpreter_test.cc
@@ -16,8 +16,10 @@ limitations under the License.
 #include "tensorflow/contrib/lite/interpreter.h"
 #include <gtest/gtest.h>
 #include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
 #include "tensorflow/contrib/lite/string_util.h"
-
+#include "tensorflow/contrib/lite/testing/util.h"
 namespace tflite {
 namespace {
 
@@ -282,6 +284,51 @@ TEST(BasicInterpreter, NoOpInterpreter) {
   ASSERT_EQ(interpreter.Invoke(), kTfLiteOk);
 }
 
+TEST(BasicInterpreter, ResizingTensors) {
+  Interpreter interpreter;
+  ASSERT_EQ(interpreter.AddTensors(1), kTfLiteOk);
+  ASSERT_EQ(interpreter.SetInputs({0}), kTfLiteOk);
+  ASSERT_EQ(interpreter.SetOutputs({0}), kTfLiteOk);
+
+  ASSERT_EQ(interpreter.SetTensorParametersReadWrite(
+                0, kTfLiteFloat32, "", {3}, TfLiteQuantizationParams()),
+            kTfLiteOk);
+
+  int t = interpreter.inputs()[0];
+  TfLiteTensor* tensor = interpreter.tensor(t);
+
+  ASSERT_EQ(interpreter.ResizeInputTensor(t, {1, 2, 3}), kTfLiteOk);
+  EXPECT_EQ(tensor->bytes, 6 * sizeof(float));
+  ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+
+  tensor->data.f[5] = 0.123f;
+
+  // Changing from kTfLiteArenaRw to kTfLiteDynamic is quite complicate: we need
+  // to unset data.raw, otherwise Realloc will try to free that memory.
+  tensor->data.raw = nullptr;
+  tensor->allocation_type = kTfLiteDynamic;
+
+  ASSERT_EQ(interpreter.ResizeInputTensor(t, {1, 2, 4}), kTfLiteOk);
+  EXPECT_EQ(tensor->bytes, 8 * sizeof(float));
+  ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+
+  // TODO(ahentz): We shouldn't have to force reallocation, but
+  // ResizeInputTensor doesn't realloc dynamic tensors. Also note that
+  // TfLiteTensorRealloc(tensor->bytes, tensor) is a no-op.
+  TfLiteTensorRealloc(9 * sizeof(float), tensor);
+  tensor->data.f[7] = 0.123f;
+
+  ASSERT_EQ(interpreter.ResizeInputTensor(t, {2, 2, 4}), kTfLiteOk);
+  EXPECT_EQ(tensor->bytes, 16 * sizeof(float));
+  ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+
+  // TODO(ahentz): We shouldn't have to force reallocation, but
+  // ResizeInputTensor doesn't realloc dynamic tensors. Also note that
+  // TfLiteTensorRealloc(tensor->bytes, tensor) is a no-op.
+  TfLiteTensorRealloc(17 * sizeof(float), tensor);
+  tensor->data.f[15] = 0.123f;
+}
+
 TEST(BasicInterpreter, OneOpInterpreter) {
   Interpreter interpreter;
   ASSERT_EQ(interpreter.AddTensors(2), kTfLiteOk);
@@ -514,13 +561,283 @@ TEST(BasicInterpreter, TestCustomErrorReporter) {
   ASSERT_EQ(reporter.calls, 1);
 }
 
+// Test fixture that allows playing with execution plans. It creates a two
+// node graph that can be executed in either [0,1] order or [1,0] order.
+// The CopyOp records when it is invoked in the class member run_order_
+// so we can test whether the execution plan was honored.
+class TestExecutionPlan : public ::testing::Test {
+  // Encapsulates the node ids and provides them to a C primitive data type
+  // Allocatable with placement new, but never destructed, so make sure this
+  // doesn't own any heap allocated data. This is then is used as op local
+  // data to allow access to the test fixture data.
+  class CallReporting {
+   public:
+    CallReporting(int node_id, std::vector<int>* run_order)
+        : node_id_(node_id), run_order_(run_order) {}
+
+    void Record() { run_order_->push_back(node_id_); }
+
+   private:
+    // The node id for this particular node
+    int node_id_;
+    // A pointer to the global run-order
+    std::vector<int>* run_order_;
+  };
+
+  // Build a kernel registration for an op that copies its one input
+  // to an output
+  TfLiteRegistration CopyOpRegistration() {
+    TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
+
+    reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
+      // Set output size to input size
+      TfLiteTensor* tensor0 = &context->tensors[node->inputs->data[0]];
+      TfLiteTensor* tensor1 = &context->tensors[node->outputs->data[0]];
+      TfLiteIntArray* newSize = TfLiteIntArrayCopy(tensor0->dims);
+      return context->ResizeTensor(context, tensor1, newSize);
+    };
+
+    reg.invoke = [](TfLiteContext* context, TfLiteNode* node) {
+      CallReporting* call_reporting =
+          reinterpret_cast<CallReporting*>(node->builtin_data);
+      // Copy input data to output data.
+      TfLiteTensor* a0 = &context->tensors[node->inputs->data[0]];
+      TfLiteTensor* a1 = &context->tensors[node->outputs->data[0]];
+      int num = a0->dims->data[0];
+      for (int i = 0; i < num; i++) {
+        a1->data.f[i] = a0->data.f[i];
+      }
+      call_reporting->Record();
+      return kTfLiteOk;
+    };
+    return reg;
+  }
+
+  // Adds a copy node going from tensor `input` to output tensor `output`.
+  // Note, input is used as the node_id. Inject run_order as op accessible
+  // data. Note: this is a little strange of a way to do this, but it is
+  // using op functionality to avoid static global variables.
+  void MakeCopyNode(int input, int output) {
+    // Ownership of call_reporting is taken by interpreter (malloc is used due
+    // to nodes being a C99 interface so free() is used).
+    TfLiteRegistration copy_op = CopyOpRegistration();
+    CallReporting* call_reporting_1 =
+        reinterpret_cast<CallReporting*>(malloc(sizeof(CallReporting)));
+    new (call_reporting_1) CallReporting(input, &run_order_);
+    ASSERT_EQ(interpreter_.AddNodeWithParameters(
+                  {0}, {2}, nullptr, 0,
+                  reinterpret_cast<void*>(call_reporting_1), &copy_op),
+              kTfLiteOk);
+    ASSERT_EQ(interpreter_.ResizeInputTensor(input, {3}), kTfLiteOk);
+  }
+
+  void SetUp() final {
+    // Add two inputs and two outputs that don't depend on each other
+    ASSERT_EQ(interpreter_.AddTensors(4), kTfLiteOk);
+    interpreter_.SetInputs({0, 1});
+    interpreter_.SetOutputs({2, 3});
+    TfLiteQuantizationParams quantized;
+    for (int tensor_index = 0; tensor_index < 4; tensor_index++) {
+      ASSERT_EQ(interpreter_.SetTensorParametersReadWrite(
+                    tensor_index, kTfLiteFloat32, "", {3}, quantized),
+                kTfLiteOk);
+    }
+
+    // Define two copy functions that also use the user_data to report that
+    // they were called.
+    // i.e. tensor[2] = copy(tensor[0]); tensor[3] = copy(tensor[1]);
+    // thus we can reorder the two nodes arbitrary and still satisfy dependency
+    // order.
+    MakeCopyNode(0, 2);
+    MakeCopyNode(1, 3);
+
+    ASSERT_EQ(interpreter_.AllocateTensors(), kTfLiteOk);
+  }
+
+ protected:
+  Interpreter interpreter_;
+
+  // list of node_ids that were run
+  std::vector<int> run_order_;
+};
+
+TEST_F(TestExecutionPlan, DefaultExecutionPlan) {
+  // Check default order
+  ASSERT_EQ(interpreter_.Invoke(), kTfLiteOk);
+  ASSERT_EQ(run_order_, std::vector<int>({0, 1}));
+}
+
+TEST_F(TestExecutionPlan, ReversedExecutionPlan) {
+  // Check reversed order
+  interpreter_.SetExecutionPlan({1, 0});
+  ASSERT_EQ(interpreter_.Invoke(), kTfLiteOk);
+  ASSERT_EQ(run_order_, std::vector<int>({1, 0}));
+}
+
+TEST_F(TestExecutionPlan, SubsetExecutionPlan) {
+  // Check running only node index 1
+  interpreter_.SetExecutionPlan({1});
+  ASSERT_EQ(interpreter_.Invoke(), kTfLiteOk);
+  ASSERT_EQ(run_order_, std::vector<int>({1}));
+}
+
+TEST_F(TestExecutionPlan, NullExecutionPlan) {
+  // Check nothing executed.
+  interpreter_.SetExecutionPlan({});
+  ASSERT_EQ(interpreter_.Invoke(), kTfLiteOk);
+  ASSERT_EQ(run_order_, std::vector<int>());
+}
+
+// Build a kernel registration for an op that copies its one input
+// to an output
+TfLiteRegistration AddOpRegistration() {
+  TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
+
+  reg.custom_name = "my_add";
+  reg.builtin_code = tflite::BuiltinOperator_CUSTOM;
+
+  reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
+    // Set output size to input size
+    TfLiteTensor* tensor0 = &context->tensors[node->inputs->data[0]];
+    TfLiteTensor* tensor1 = &context->tensors[node->inputs->data[1]];
+    TfLiteTensor* tensor2 = &context->tensors[node->outputs->data[0]];
+    TfLiteIntArray* newSize = TfLiteIntArrayCopy(tensor0->dims);
+    TfLiteIntArray* newSizeOther = TfLiteIntArrayCopy(tensor1->dims);
+    TF_LITE_ENSURE_EQ(context, newSize->size, newSizeOther->size);
+    TF_LITE_ENSURE_STATUS(context->ResizeTensor(context, tensor2, newSize));
+    return kTfLiteOk;
+  };
+
+  reg.invoke = [](TfLiteContext* context, TfLiteNode* node) {
+    // Copy input data to output data.
+    TfLiteTensor* a0 = &context->tensors[node->inputs->data[0]];
+    TfLiteTensor* a1 = &context->tensors[node->inputs->data[1]];
+    TfLiteTensor* out = &context->tensors[node->outputs->data[0]];
+    int num = a0->dims->data[0];
+    for (int i = 0; i < num; i++) {
+      out->data.f[i] = a0->data.f[i] + a1->data.f[i];
+    }
+    return kTfLiteOk;
+  };
+  return reg;
+}
+
+class TestDelegate : public ::testing::Test {
+ public:
+  TestDelegate() {
+    interpreter_.AddTensors(5);
+    interpreter_.SetInputs({0, 1});
+    interpreter_.SetOutputs({3, 4});
+    TfLiteQuantizationParams quant;
+    interpreter_.SetTensorParametersReadWrite(0, kTfLiteFloat32, "", {3},
+                                              quant);
+    interpreter_.SetTensorParametersReadWrite(1, kTfLiteFloat32, "", {3},
+                                              quant);
+    interpreter_.SetTensorParametersReadWrite(2, kTfLiteFloat32, "", {3},
+                                              quant);
+    interpreter_.SetTensorParametersReadWrite(3, kTfLiteFloat32, "", {3},
+                                              quant);
+    TfLiteRegistration reg = AddOpRegistration();
+    interpreter_.AddNodeWithParameters({0, 0}, {2}, nullptr, 0, nullptr, &reg);
+    interpreter_.AddNodeWithParameters({1, 1}, {3}, nullptr, 0, nullptr, &reg);
+    interpreter_.AddNodeWithParameters({2, 1}, {4}, nullptr, 0, nullptr, &reg);
+  }
+
+ protected:
+  class SimpleDelegate {
+   public:
+    // Create a simple implementation of a TfLiteDelegate. We use the C++ class
+    // SimpleDelegate and it can produce a handle TfLiteDelegate that is
+    // value-copyable and compatible with TfLite.
+    explicit SimpleDelegate(const std::vector<int>& nodes) : nodes_(nodes) {
+      delegate_.Prepare = [](TfLiteContext* context,
+                             void* data) -> TfLiteStatus {
+        auto* simple = reinterpret_cast<SimpleDelegate*>(data);
+        TfLiteIntArray* nodes_to_separate =
+            TfLiteIntArrayCreate(simple->nodes_.size());
+        // Mark nodes that we want in TfLiteIntArray* structure.
+        int index = 0;
+        for (auto node_index : simple->nodes_) {
+          nodes_to_separate->data[index++] = node_index;
+          // make sure node is add
+          TfLiteNode* node;
+          TfLiteRegistration* reg;
+          context->GetNodeAndRegistration(context, node_index, &node, &reg);
+          TFLITE_CHECK_EQ(reg->builtin_code, tflite::BuiltinOperator_CUSTOM);
+          TFLITE_CHECK_EQ(strcmp(reg->custom_name, "my_add"), 0);
+        }
+        // Check that all nodes are available
+        TfLiteIntArray* execution_plan;
+        TF_LITE_ENSURE_STATUS(
+            context->GetExecutionPlan(context, &execution_plan));
+        for (int exec_index = 0; exec_index < execution_plan->size;
+             exec_index++) {
+          int node_index = execution_plan->data[exec_index];
+          // Check that we are an identity map to start.
+          TFLITE_CHECK_EQ(exec_index, node_index);
+          TfLiteNode* node;
+          TfLiteRegistration* reg;
+          context->GetNodeAndRegistration(context, node_index, &node, &reg);
+          TFLITE_CHECK_EQ(reg->builtin_code, tflite::BuiltinOperator_CUSTOM);
+          TFLITE_CHECK_EQ(strcmp(reg->custom_name, "my_add"), 0);
+        }
+
+        context->ReplaceSubgraphsWithDelegateKernels(
+            context, FakeFusedRegistration(), nodes_to_separate);
+        TfLiteIntArrayFree(nodes_to_separate);
+        return kTfLiteOk;
+      };
+      // Store type-punned data SimpleDelegate structure.
+      delegate_.data_ = reinterpret_cast<void*>(this);
+    }
+
+    static TfLiteRegistration FakeFusedRegistration() {
+      TfLiteRegistration reg = {nullptr};
+      reg.custom_name = "fake_fused_op";
+      return reg;
+    }
+
+    TfLiteDelegate* get_tf_lite_delegate() { return &delegate_; }
+
+   private:
+    std::vector<int> nodes_;
+    TfLiteDelegate delegate_;
+  };
+  Interpreter interpreter_;
+};
+
+TEST_F(TestDelegate, BasicDelegate) {
+  interpreter_.Invoke();
+  SimpleDelegate simple({0, 1, 2});
+  interpreter_.ModifyGraphWithDelegate(simple.get_tf_lite_delegate());
+
+  ASSERT_EQ(interpreter_.execution_plan().size(), 1);
+  int node = interpreter_.execution_plan()[0];
+  const auto* node_and_reg = interpreter_.node_and_registration(node);
+  ASSERT_EQ(node_and_reg->second.custom_name,
+            SimpleDelegate::FakeFusedRegistration().custom_name);
+}
+
+TEST_F(TestDelegate, ComplexDeligate) {
+  interpreter_.Invoke();
+  SimpleDelegate simple({1, 2});
+  interpreter_.ModifyGraphWithDelegate(simple.get_tf_lite_delegate());
+
+  ASSERT_EQ(interpreter_.execution_plan().size(), 2);
+  // 0th should be a non-delegated original op
+  ASSERT_EQ(interpreter_.execution_plan()[0], 0);
+  // 1st should be a new macro op (3) which didn't exist)
+  ASSERT_EQ(interpreter_.execution_plan()[1], 3);
+  const auto* node_and_reg = interpreter_.node_and_registration(3);
+  ASSERT_EQ(node_and_reg->second.custom_name,
+            SimpleDelegate::FakeFusedRegistration().custom_name);
+}
+
 }  // namespace
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-#ifdef OS_LINUX
-  FLAGS_logtostderr = true;
-#endif
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/ios_makefile.inc b/tensorflow/contrib/lite/ios_makefile.inc
index bcff7ed9889e95c13294b6cf0d0f4788991a04df..fc6594c3a04ba6aabba99bb631f85737baf389f1 100644
--- a/tensorflow/contrib/lite/ios_makefile.inc
+++ b/tensorflow/contrib/lite/ios_makefile.inc
@@ -22,6 +22,7 @@ ifeq ($(TARGET), IOS)
 	IOS_ARCH := x86_64
 	CXXFLAGS += -miphoneos-version-min=$(MIN_SDK_VERSION) \
 		-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \
+		-DTFLITE_USE_APPLE_ACCELERATE_FOR_CONV \
 		-fembed-bitcode \
 		-Wno-c++11-narrowing \
 		-mno-thumb \
@@ -30,6 +31,9 @@ ifeq ($(TARGET), IOS)
 		${IPHONEOS_SYSROOT} \
 		-arch $(IOS_ARCH) \
 		-O3
+	ifeq ($(IOS_ARCH), x86_64)
+		CXXFLAGS += -msse4.1
+	endif
 	CCFLAGS += -miphoneos-version-min=$(MIN_SDK_VERSION) \
 		-fembed-bitcode \
 		-mno-thumb \
@@ -39,6 +43,7 @@ ifeq ($(TARGET), IOS)
 		-O3
 	LDFLAGS := -fembed-bitcode \
 		-miphoneos-version-min=${MIN_SDK_VERSION} \
+		-framework Accelerate \
 		-arch $(IOS_ARCH)
 	OBJDIR := $(OBJDIR)ios_$(IOS_ARCH)/
 	LIBDIR := $(LIBDIR)ios_$(IOS_ARCH)/
diff --git a/tensorflow/contrib/lite/java/AndroidManifest.xml b/tensorflow/contrib/lite/java/AndroidManifest.xml
new file mode 100644
index 0000000000000000000000000000000000000000..f705feacbec38ab5152ce52b701320d8f1cd8d3d
--- /dev/null
+++ b/tensorflow/contrib/lite/java/AndroidManifest.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+          package="org.tensorflow.lite">
+    <application>
+    </application>
+</manifest>
+
diff --git a/tensorflow/contrib/lite/java/BUILD b/tensorflow/contrib/lite/java/BUILD
index 1de28eb52ddb458df0be0a8f9ef453f7caf68654..35aacb70002d1d454f675484e4398bcdffc4acf1 100644
--- a/tensorflow/contrib/lite/java/BUILD
+++ b/tensorflow/contrib/lite/java/BUILD
@@ -7,6 +7,16 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow/java:build_defs.bzl", "JAVACOPTS")
 load("//tensorflow/contrib/lite:build_def.bzl", "tflite_jni_binary")
+load("//tensorflow/contrib/lite/java:aar_with_jni.bzl", "aar_with_jni")
+
+# Building tensorflow-lite.aar including 4 variants of .so
+# To build an aar for release, run below command:
+# bazel build --cxxopt='--std=c++11' -c opt --fat_apk_cpu=x86,x86_64,arm64-v8a,armeabi-v7a \
+# tensorflow/contrib/lite/java:tensorflow-lite
+aar_with_jni(
+    name = "tensorflow-lite",
+    android_library = ":tensorflowlite",
+)
 
 android_library(
     name = "tensorflowlite",
@@ -15,6 +25,7 @@ android_library(
             "src/main/java/org/tensorflow/lite/*.java",
         ],
     ),
+    manifest = "AndroidManifest.xml",
     visibility = ["//visibility:public"],
     deps = [
         ":tflite_runtime",
@@ -100,6 +111,26 @@ java_test(
     ],
 )
 
+# TODO: generate large models at runtime, instead of storing them.
+java_test(
+    name = "InterpreterTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/lite/InterpreterTest.java"],
+    data = [
+        "src/testdata/add.bin",
+        "src/testdata/mobilenet.tflite.bin",
+    ],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.lite.InterpreterTest",
+    visibility = ["//visibility:private"],
+    deps = [
+        ":libtensorflowlite_jni.so",
+        ":tensorflowlitelib",
+        "@com_google_truth",
+        "@junit",
+    ],
+)
+
 java_test(
     name = "TensorTest",
     size = "small",
diff --git a/tensorflow/contrib/lite/java/aar_with_jni.bzl b/tensorflow/contrib/lite/java/aar_with_jni.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..4450bc9085555b3416f51bac07ea94a1240e919c
--- /dev/null
+++ b/tensorflow/contrib/lite/java/aar_with_jni.bzl
@@ -0,0 +1,47 @@
+"""Generate zipped aar file including different variants of .so in jni folder."""
+
+def aar_with_jni(name, android_library):
+  # Generate dummy AndroidManifest.xml for dummy apk usage
+  # (dummy apk is generated by <name>_dummy_app_for_so target below)
+  native.genrule(
+      name = name + "_binary_manifest_generator",
+      outs = [name + "_generated_AndroidManifest.xml"],
+      cmd = """
+cat > $(OUTS) <<EOF
+<manifest
+  xmlns:android="http://schemas.android.com/apk/res/android"
+  package="dummy.package.for.so">
+  <uses-sdk android:minSdkVersion="999"/>
+</manifest>
+EOF
+""",
+  )
+
+  # Generate dummy apk including .so files and later we extract out
+  # .so files and throw away the apk.
+  native.android_binary(
+      name = name + "_dummy_app_for_so",
+      manifest = name + "_generated_AndroidManifest.xml",
+      custom_package = "dummy.package.for.so",
+      deps = [android_library],
+      # In some platforms we don't have an Android SDK/NDK and this target
+      # can't be built. We need to prevent the build system from trying to
+      # use the target in that case.
+      tags = ["manual"],
+  )
+
+  native.genrule(
+      name = name,
+      srcs = [android_library + ".aar", name + "_dummy_app_for_so_unsigned.apk"],
+      outs = [name + ".aar"],
+      tags = ["manual"],
+      cmd = """
+cp $(location {}.aar) $(location :{}.aar)
+chmod +w $(location :{}.aar)
+origdir=$$PWD
+cd $$(mktemp -d)
+unzip $$origdir/$(location :{}_dummy_app_for_so_unsigned.apk) "lib/*"
+cp -r lib jni
+zip -r $$origdir/$(location :{}.aar) jni/*/*.so
+""".format(android_library, name, name, name, name),
+  )
diff --git a/tensorflow/contrib/lite/java/build_aar_for_release.sh b/tensorflow/contrib/lite/java/build_aar_for_release.sh
new file mode 100755
index 0000000000000000000000000000000000000000..fbcb1e7db9a3f9b885505e989b7ff7224f2d2b15
--- /dev/null
+++ b/tensorflow/contrib/lite/java/build_aar_for_release.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+set -e
+set -x
+
+TMPDIR=`mktemp -d`
+trap "rm -rf $TMPDIR" EXIT
+
+VERSION=1.0
+
+BUILDER=bazel
+BASEDIR=tensorflow/contrib/lite
+CROSSTOOL="//external:android/crosstool"
+HOST_CROSSTOOL="@bazel_tools//tools/cpp:toolchain"
+
+BUILD_OPTS="--cxxopt=--std=c++11 -c opt"
+CROSSTOOL_OPTS="--crosstool_top=$CROSSTOOL --host_crosstool_top=$HOST_CROSSTOOL"
+
+test -d $BASEDIR || (echo "Aborting: not at top-level build directory"; exit 1)
+
+function build_basic_aar() {
+  local OUTDIR=$1
+  $BUILDER build $BUILD_OPTS $BASEDIR/java:tensorflowlite.aar
+  unzip -d $OUTDIR $BUILDER-bin/$BASEDIR/java/tensorflowlite.aar
+  # targetSdkVersion is here to prevent the app from requesting spurious
+  # permissions, such as permission to make phone calls. It worked for v1.0,
+  # but minSdkVersion might be the preferred way to handle this.
+  sed -i -e 's/<application>/<uses-sdk android:targetSdkVersion="25"\/><application>/' $OUTDIR/AndroidManifest.xml
+}
+
+function build_arch() {
+  local ARCH=$1
+  local CONFIG=$2
+  local OUTDIR=$3
+  mkdir -p $OUTDIR/jni/$ARCH/
+  $BUILDER build $BUILD_OPTS $CROSSTOOL_OPTS --cpu=$CONFIG \
+    $BASEDIR/java:libtensorflowlite_jni.so
+  cp $BUILDER-bin/$BASEDIR/java/libtensorflowlite_jni.so $OUTDIR/jni/$ARCH/
+}
+
+rm -rf $TMPDIR
+mkdir -p $TMPDIR/jni
+
+build_basic_aar $TMPDIR
+build_arch arm64-v8a arm64-v8a $TMPDIR
+build_arch armeabi-v7a armeabi-v7a $TMPDIR
+build_arch x86 x86 $TMPDIR
+build_arch x86_64 x86_64 $TMPDIR
+
+AAR_FILE=`realpath tflite-${VERSION}.aar`
+(cd $TMPDIR && zip $AAR_FILE -r *)
+echo "New AAR file is $AAR_FILE"
+
diff --git a/tensorflow/contrib/lite/java/demo/README.md b/tensorflow/contrib/lite/java/demo/README.md
index 71b633c5774d93684f651821adad13c378a8243c..2e818f728ef208d30b0eeb27ffd7e3fa0c7c1a2d 100644
--- a/tensorflow/contrib/lite/java/demo/README.md
+++ b/tensorflow/contrib/lite/java/demo/README.md
@@ -8,7 +8,12 @@
      It's easiest with Android Studio.
 
       - You'll need at least SDK version 23.
+      - Make sure to install the latest version of Bazel. Some distributions
+        ship with Bazel 0.5.4, which is too old.
       - Bazel requires Android Build Tools `26.0.1` or higher.
+      - **Bazel is incompatible with NDK revisions 15 and above,** with revision
+        16 being a compile-breaking change. [Download an older version manually
+        instead of using the SDK Manager.](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#install-bazel-and-android-prerequisites)
       - You also need to install the Android Support Repository, available
         through Android Studio under `Android SDK Manager -> SDK Tools ->
         Android Support Repository`.
@@ -16,10 +21,15 @@
   2. [Edit your `WORKSPACE`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#edit-workspace)
      to add SDK and NDK targets.
 
+     NOTE: As long as you have the SDK and NDK installed, the `./configure`
+     script will create these rules for you. Answer "Yes" when the script asks
+     to automatically configure the `./WORKSPACE`.
+
       - Make sure the `api_level` in `WORKSPACE` is set to an SDK version that
         you have installed.
       - By default, Android Studio will install the SDK to `~/Android/Sdk` and
-        the NDK to `~/Android/Sdk/ndk-bundle`.
+        the NDK to `~/Android/Sdk/ndk-bundle` (but the NDK should be a manual
+        download until Bazel supports NDK 16. See bullet points under (1)).
 
 2. Build the app with Bazel. The demo needs C++11:
 
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
index e7bad4637041d003c1e507d81c0c30404c587653..e44c5ae6b48eda187079dd3a0a1bc563276d816e 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
@@ -73,6 +73,11 @@ public class ImageClassifier {
 
   /** An array to hold inference results, to be feed into Tensorflow Lite as outputs. */
   private byte[][] labelProbArray = null;
+  /** multi-stage low pass filter * */
+  private float[][] filterLabelProbArray = null;
+
+  private static final int FILTER_STAGES = 3;
+  private static final float FILTER_FACTOR = 0.4f;
 
   private PriorityQueue<Map.Entry<String, Float>> sortedLabels =
       new PriorityQueue<>(
@@ -93,6 +98,7 @@ public class ImageClassifier {
             DIM_BATCH_SIZE * DIM_IMG_SIZE_X * DIM_IMG_SIZE_Y * DIM_PIXEL_SIZE);
     imgData.order(ByteOrder.nativeOrder());
     labelProbArray = new byte[1][labelList.size()];
+    filterLabelProbArray = new float[FILTER_STAGES][labelList.size()];
     Log.d(TAG, "Created a Tensorflow Lite Image Classifier.");
   }
 
@@ -108,11 +114,38 @@ public class ImageClassifier {
     tflite.run(imgData, labelProbArray);
     long endTime = SystemClock.uptimeMillis();
     Log.d(TAG, "Timecost to run model inference: " + Long.toString(endTime - startTime));
+
+    // Smooth the results across frames.
+    applyFilter();
+
+    // Print the results.
     String textToShow = printTopKLabels();
     textToShow = Long.toString(endTime - startTime) + "ms" + textToShow;
     return textToShow;
   }
 
+  void applyFilter() {
+    int numLabels = labelList.size();
+
+    // Low pass filter `labelProbArray` into the first stage of the filter.
+    for (int j = 0; j < numLabels; ++j) {
+      filterLabelProbArray[0][j] +=
+          FILTER_FACTOR * (labelProbArray[0][j] - filterLabelProbArray[0][j]);
+    }
+    // Low pass filter each stage into the next.
+    for (int i = 1; i < FILTER_STAGES; ++i) {
+      for (int j = 0; j < numLabels; ++j) {
+        filterLabelProbArray[i][j] +=
+            FILTER_FACTOR * (filterLabelProbArray[i - 1][j] - filterLabelProbArray[i][j]);
+      }
+    }
+
+    // Copy the last stage filter output back to `labelProbArray`.
+    for (int j = 0; j < numLabels; ++j) {
+      labelProbArray[0][j] = (byte)filterLabelProbArray[FILTER_STAGES - 1][j];
+    }
+  }
+
   /** Closes tflite to release resources. */
   public void close() {
     tflite.close();
@@ -177,7 +210,7 @@ public class ImageClassifier {
     final int size = sortedLabels.size();
     for (int i = 0; i < size; ++i) {
       Map.Entry<String, Float> label = sortedLabels.poll();
-      textToShow = "\n" + label.getKey() + ":" + Float.toString(label.getValue()) + textToShow;
+      textToShow = String.format("\n%s: %4.2f", label.getKey(), label.getValue()) + textToShow;
     }
     return textToShow;
   }
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
index 1939a078ad8031b99620773c9b91335c4e8f7b22..5ee594dec492ad2fee22e603a6de311b3fed4cac 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
@@ -34,7 +34,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
   NativeInterpreterWrapper(String modelPath) {
     errorHandle = createErrorReporter(ERROR_BUFFER_SIZE);
     modelHandle = createModel(modelPath, errorHandle);
-    interpreterHandle = createInterpreter(modelHandle);
+    interpreterHandle = createInterpreter(modelHandle, errorHandle);
   }
 
   /**
@@ -46,7 +46,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     modelByteBuffer = mappedByteBuffer;
     errorHandle = createErrorReporter(ERROR_BUFFER_SIZE);
     modelHandle = createModelWithBuffer(modelByteBuffer, errorHandle);
-    interpreterHandle = createInterpreter(modelHandle);
+    interpreterHandle = createInterpreter(modelHandle, errorHandle);
   }
 
   /** Releases resources associated with this {@code NativeInterpreterWrapper}. */
@@ -103,11 +103,22 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     return outputs;
   }
 
+  private static native long[] run(
+      long interpreterHandle,
+      long errorHandle,
+      Object[] sizes,
+      int[] dtypes,
+      int[] numsOfBytes,
+      Object[] values);
+
   /** Resizes dimensions of a specific input. */
   void resizeInput(int idx, int[] dims) {
     resizeInput(interpreterHandle, errorHandle, idx, dims);
   }
 
+  private static native void resizeInput(
+      long interpreterHandle, long errorHandle, int inputIdx, int[] dims);
+
   void setUseNNAPI(boolean useNNAPI) {
     useNNAPI(interpreterHandle, useNNAPI);
   }
@@ -245,9 +256,6 @@ final class NativeInterpreterWrapper implements AutoCloseable {
 
   private static native String[] getOutputNames(long interpreterHandle);
 
-  private static native void resizeInput(
-      long interpreterHandle, long errorHandle, int inputIdx, int[] dims);
-
   private static native void useNNAPI(long interpreterHandle, boolean state);
 
   private static native long createErrorReporter(int size);
@@ -256,15 +264,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
 
   private static native long createModelWithBuffer(MappedByteBuffer modelBuffer, long errorHandle);
 
-  private static native long createInterpreter(long modelHandle);
-
-  private static native long[] run(
-      long interpreterHandle,
-      long errorHandle,
-      Object[] sizes,
-      int[] dtypes,
-      int[] numsOfBytes,
-      Object[] values);
+  private static native long createInterpreter(long modelHandle, long errorHandle);
 
   private static native void delete(long errorHandle, long modelHandle, long interpreterHandle);
 
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
index bc6462eb5466e14769f94c5103984f5201b4b8dc..c346f9f92e360c0722ebac440d790da6441ceecf 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
@@ -200,6 +200,12 @@ TfLiteStatus setInputs(JNIEnv* env, tflite::Interpreter* interpreter,
   return kTfLiteOk;
 }
 
+// TODO(yichengfan): evaluate the benefit to use tflite verifier.
+bool VerifyModel(const void* buf, size_t len) {
+  flatbuffers::Verifier verifier(static_cast<const uint8_t*>(buf), len);
+  return tflite::VerifyModelBuffer(verifier);
+}
+
 }  // namespace
 
 JNIEXPORT jobjectArray JNICALL
@@ -271,6 +277,17 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModel(
       convertLongToErrorReporter(env, error_handle);
   if (error_reporter == nullptr) return 0;
   const char* path = env->GetStringUTFChars(model_file, nullptr);
+
+  {
+    tflite::FileCopyAllocation allocation(path, nullptr);
+    if (!VerifyModel(allocation.base(), allocation.bytes())) {
+      throwException(env, kIllegalArgumentException,
+                     "Contents of %s is not a valid flatbuffer model", path);
+      env->ReleaseStringUTFChars(model_file, path);
+      return 0;
+    }
+  }
+
   auto model = tflite::FlatBufferModel::BuildFromFile(path, error_reporter);
   if (!model) {
     throwException(env, kIllegalArgumentException,
@@ -293,6 +310,12 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModelWithBuffer(
   const char* buf =
       static_cast<char*>(env->GetDirectBufferAddress(model_buffer));
   jlong capacity = env->GetDirectBufferCapacity(model_buffer);
+  if (!VerifyModel(buf, capacity)) {
+    throwException(env, kIllegalArgumentException,
+                   "MappedByteBuffer is not a valid flatbuffer model");
+    return 0;
+  }
+
   auto model = tflite::FlatBufferModel::BuildFromBuffer(
       buf, static_cast<size_t>(capacity), error_reporter);
   if (!model) {
@@ -307,12 +330,21 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModelWithBuffer(
 
 JNIEXPORT jlong JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter(
-    JNIEnv* env, jclass clazz, jlong model_handle) {
+    JNIEnv* env, jclass clazz, jlong model_handle, jlong error_handle) {
   tflite::FlatBufferModel* model = convertLongToModel(env, model_handle);
   if (model == nullptr) return 0;
+  BufferErrorReporter* error_reporter =
+      convertLongToErrorReporter(env, error_handle);
+  if (error_reporter == nullptr) return 0;
   auto resolver = ::tflite::CreateOpResolver();
   std::unique_ptr<tflite::Interpreter> interpreter;
-  tflite::InterpreterBuilder(*model, *(resolver.get()))(&interpreter);
+  TfLiteStatus status =
+      tflite::InterpreterBuilder(*model, *(resolver.get()))(&interpreter);
+  if (status != kTfLiteOk) {
+    throwException(env, kIllegalArgumentException,
+                   "Cannot create interpreter: %s",
+                   error_reporter->CachedErrorMessage());
+  }
   return reinterpret_cast<jlong>(interpreter.release());
 }
 
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
index 430886b7cc04a356d1826843acc1bbebf4189bf7..c52a7e4e439936344be26d5761fb5747db64794a 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
@@ -95,11 +95,11 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModelWithBuffer(
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
  *  Method:
- *  Signature: (J)J
+ *  Signature: (JJ)J
  */
 JNIEXPORT jlong JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter(
-    JNIEnv* env, jclass clazz, jlong model_handle);
+    JNIEnv* env, jclass clazz, jlong model_handle, jlong error_handle);
 
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
index 9a6894f49c0b7278511717d2671648c6d1763e00..90323555d88419d837a76bca7de6d9998e388fca 100644
--- a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
+++ b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
@@ -25,6 +25,7 @@ import org.junit.runner.RunWith;
 import org.junit.runners.JUnit4;
 
 /** Unit tests for {@link org.tensorflow.lite.NativeInterpreterWrapper}. */
+// TODO(b/71818425): Generates model files dynamically.
 @RunWith(JUnit4.class)
 public final class NativeInterpreterWrapperTest {
 
@@ -43,6 +44,9 @@ public final class NativeInterpreterWrapperTest {
   private static final String INVALID_MODEL_PATH =
       "tensorflow/contrib/lite/java/src/testdata/invalid_model.bin";
 
+  private static final String MODEL_WITH_CUSTOM_OP_PATH =
+      "tensorflow/contrib/lite/java/src/testdata/with_custom_op.lite";
+
   @Test
   public void testConstructor() {
     NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(FLOAT_MODEL_PATH);
@@ -55,10 +59,20 @@ public final class NativeInterpreterWrapperTest {
     try {
       NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(INVALID_MODEL_PATH);
       fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e).hasMessageThat().contains("is not a valid flatbuffer model");
+    }
+  }
+
+  @Test
+  public void testConstructorWithUnresolableCustomOp() {
+    try {
+      NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(MODEL_WITH_CUSTOM_OP_PATH);
+      fail();
     } catch (IllegalArgumentException e) {
       assertThat(e)
           .hasMessageThat()
-          .contains("Model provided has model identifier ' is ', should be 'TFL3'");
+          .contains("Cannot create interpreter: Didn't find custom op for name 'Assign'");
     }
   }
 
diff --git a/tensorflow/contrib/lite/java/src/testdata/with_custom_op.lite b/tensorflow/contrib/lite/java/src/testdata/with_custom_op.lite
new file mode 100644
index 0000000000000000000000000000000000000000..e775d56d88854ecdf70233262ff5884d224f4373
Binary files /dev/null and b/tensorflow/contrib/lite/java/src/testdata/with_custom_op.lite differ
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index bbbfa3e7415bfd7a34dfc7d764da55cac22e7d42..5d553def0a213da2350cdedc159de43b4d8cff04 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -32,6 +32,7 @@ cc_library(
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:schema_fbs_version",
         "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/testing:util",
         "//tensorflow/core:lib",
         "@com_google_googletest//:gtest",
     ],
@@ -49,7 +50,7 @@ cc_library(
     deps = [
         ":op_macros",
         "//tensorflow/contrib/lite:context",
-        "@gemmlowp//:gemmlowp",
+        "@gemmlowp",
     ],
 )
 
@@ -70,35 +71,73 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "kernel_util",
+    srcs = [
+        "kernel_util.cc",
+    ],
+    hdrs = [
+        "kernel_util.h",
+    ],
+    deps = [
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite:context",
+        "//tensorflow/contrib/lite/kernels/internal:round",
+    ],
+)
+
+tf_cc_test(
+    name = "kernel_util_test",
+    size = "small",
+    srcs = ["kernel_util_test.cc"],
+    deps = [
+        ":kernel_util",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_library(
     name = "builtin_ops",
     srcs = [
         "activations.cc",
         "add.cc",
         "basic_rnn.cc",
+        "batch_to_space_nd.cc",
+        "bidirectional_sequence_rnn.cc",
         "concatenation.cc",
         "conv.cc",
         "depthwise_conv.cc",
+        "div.cc",
         "embedding_lookup.cc",
         "embedding_lookup_sparse.cc",
+        "exp.cc",
         "fully_connected.cc",
+        "gather.cc",
         "hashtable_lookup.cc",
-        "kernel_util.cc",
         "l2norm.cc",
         "local_response_norm.cc",
         "lsh_projection.cc",
         "lstm.cc",
+        "mean.cc",
         "mul.cc",
+        "pad.cc",
         "pooling.cc",
         "register.cc",
         "reshape.cc",
         "resize_bilinear.cc",
         "skip_gram.cc",
+        "space_to_batch_nd.cc",
         "space_to_depth.cc",
+        "squeeze.cc",
+        "strided_slice.cc",
+        "sub.cc",
         "svdf.cc",
+        "transpose.cc",
+        "unidirectional_sequence_lstm.cc",
+        "unidirectional_sequence_rnn.cc",
     ],
     hdrs = [
-        "kernel_util.h",
         "padding.h",
         "register.h",
     ],
@@ -112,11 +151,13 @@ cc_library(
     }),
     deps = [
         ":activation_functor",
+        ":kernel_util",
         ":op_macros",
         "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/kernels:gemm_support",
+        "//tensorflow/contrib/lite/kernels/internal:kernel_utils",
         "//tensorflow/contrib/lite/kernels/internal:optimized",
         "//tensorflow/contrib/lite/kernels/internal:optimized_base",
         "//tensorflow/contrib/lite/kernels/internal:quantization_util",
@@ -152,6 +193,44 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "transpose_test",
+    size = "small",
+    srcs = ["transpose_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "//tensorflow/contrib/lite/kernels/internal:reference",
+        "//tensorflow/contrib/lite/kernels/internal:reference_base",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "space_to_batch_nd_test",
+    size = "small",
+    srcs = ["space_to_batch_nd_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "batch_to_space_nd_test",
+    size = "small",
+    srcs = ["batch_to_space_nd_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "concatenation_test",
     size = "small",
@@ -172,6 +251,7 @@ tf_cc_test(
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_absl//absl/memory",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -200,6 +280,42 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "unidirectional_sequence_lstm_test",
+    size = "small",
+    srcs = ["unidirectional_sequence_lstm_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "bidirectional_sequence_rnn_test",
+    size = "small",
+    srcs = ["bidirectional_sequence_rnn_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "unidirectional_sequence_rnn_test",
+    size = "small",
+    srcs = ["unidirectional_sequence_rnn_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "l2norm_test",
     size = "small",
@@ -212,6 +328,30 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "exp_test",
+    size = "small",
+    srcs = ["exp_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "mean_test",
+    size = "small",
+    srcs = ["mean_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "mul_test",
     size = "small",
@@ -224,6 +364,18 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "pad_test",
+    size = "small",
+    srcs = ["pad_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "reshape_test",
     size = "small",
@@ -236,6 +388,19 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "gather_test",
+    size = "small",
+    srcs = ["gather_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "resize_bilinear_test",
     size = "small",
@@ -395,6 +560,30 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "squeeze_test",
+    size = "small",
+    srcs = ["squeeze_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "strided_slice_test",
+    size = "small",
+    srcs = ["strided_slice_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/lite/kernels/activation_functor.h b/tensorflow/contrib/lite/kernels/activation_functor.h
index cfb3369e991a474315424423fe655ba214edabbc..41ec3cca33ae1c6bb3f7c43dd1923f104c2ab6a2 100644
--- a/tensorflow/contrib/lite/kernels/activation_functor.h
+++ b/tensorflow/contrib/lite/kernels/activation_functor.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
 
 #include <algorithm>
 #include <cmath>
@@ -55,4 +55,4 @@ class ActivationFunctor {
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
diff --git a/tensorflow/contrib/lite/kernels/activations.cc b/tensorflow/contrib/lite/kernels/activations.cc
index 7ab60a33e5e2ff61bae5f4c6db85ab9c47a391bc..3c5c77815d0f2592ab549152b4d77f45b967a660 100644
--- a/tensorflow/contrib/lite/kernels/activations.cc
+++ b/tensorflow/contrib/lite/kernels/activations.cc
@@ -15,8 +15,8 @@ limitations under the License.
 #include <unistd.h>
 #include <cassert>
 #include <cmath>
-#include <cstdlib>
 #include <cstdio>
+#include <cstdlib>
 #include <iostream>
 #include <limits>
 
@@ -134,8 +134,7 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
       float* out = output->data.f;
       for (; in < in_end; in++, out++) *out = std::max(0.f, *in);
       return kTfLiteOk;
-    }
-    break;
+    } break;
     default:
       context->ReportError(context, "Only float32 supported currently.");
       return kTfLiteError;
@@ -173,8 +172,7 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
       float* out = output->data.f;
       for (; in < in_end; in++, out++) *out = std::min(std::max(0.f, *in), 6.f);
       return kTfLiteOk;
-    }
-    break;
+    } break;
     default:
       context->ReportError(context, "Only float32 supported currently.");
       return kTfLiteError;
@@ -192,8 +190,7 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
       float* out = output->data.f;
       for (; in < in_end; in++, out++) *out = std::tanh(*in);
       return kTfLiteOk;
-    }
-    break;
+    } break;
     default:
       context->ReportError(context, "Only float32 supported currently.");
       return kTfLiteError;
@@ -349,7 +346,7 @@ TfLiteRegistration* Register_RELU() {
   return &r;
 }
 
-TfLiteRegistration* Register_RELU1() {
+TfLiteRegistration* Register_RELU_N1_TO_1() {
   static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
                                  activations::GenericPrepare,
                                  activations::Relu1Eval};
diff --git a/tensorflow/contrib/lite/kernels/activations_test.cc b/tensorflow/contrib/lite/kernels/activations_test.cc
index f10aee70170d4a94ed54376fa410b22a60f109af..68d49944e51b043b6b82aa1589d22f6ebed37574 100644
--- a/tensorflow/contrib/lite/kernels/activations_test.cc
+++ b/tensorflow/contrib/lite/kernels/activations_test.cc
@@ -102,7 +102,7 @@ TEST(FloatActivationsOpTest, Relu) {
 }
 
 TEST(FloatActivationsOpTest, Relu1) {
-  FloatActivationsOpModel m(BuiltinOperator_RELU1,
+  FloatActivationsOpModel m(BuiltinOperator_RELU_N1_TO_1,
                             /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
   m.SetInput({
       0.0, -0.6, 0.2, -0.4,  //
@@ -317,7 +317,7 @@ TEST(QuantizedActivationsOpTest, Softmax2D) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/add.cc b/tensorflow/contrib/lite/kernels/add.cc
index 0e10a249abac3ba19cf107e055aa71d1eee00122..63ea89df56bafa995950afec3a58267681af304f 100644
--- a/tensorflow/contrib/lite/kernels/add.cc
+++ b/tensorflow/contrib/lite/kernels/add.cc
@@ -37,7 +37,23 @@ constexpr int kInputTensor1 = 0;
 constexpr int kInputTensor2 = 1;
 constexpr int kOutputTensor = 0;
 
+struct OpData {
+  bool requires_broadcast;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new OpData;
+  data->requires_broadcast = false;
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
@@ -45,43 +61,56 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
-  TF_LITE_ENSURE_EQ(context, NumDimensions(input1), NumDimensions(input2));
-  for (int i = 0; i < NumDimensions(input1); ++i) {
-    TF_LITE_ENSURE_EQ(context, SizeOfDimension(input1, i),
-                      SizeOfDimension(input2, i));
-  }
+  TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
+  output->type = input2->type;
 
-  TF_LITE_ENSURE_EQ(context, input1->type, output->type);
-  TF_LITE_ENSURE_EQ(context, input2->type, output->type);
+  data->requires_broadcast = !HaveSameShapes(input1, input2);
+
+  TfLiteIntArray* output_size = nullptr;
+  if (data->requires_broadcast) {
+    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
+                                   context, input1, input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(input1->dims);
+  }
 
-  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input1->dims);
   return context->ResizeTensor(context, output, output_size);
 }
 
 template <KernelType kernel_type>
 void EvalAddFloat(TfLiteContext* context, TfLiteNode* node,
-                  TfLiteAddParams* params, TfLiteTensor* input1,
-                  TfLiteTensor* input2, TfLiteTensor* output) {
+                  TfLiteAddParams* params, const OpData* data,
+                  TfLiteTensor* input1, TfLiteTensor* input2,
+                  TfLiteTensor* output) {
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                 &output_activation_max);
-#define TF_LITE_ADD(type)                                        \
-  type::Add(GetTensorData<float>(input1), GetTensorDims(input1), \
-            GetTensorData<float>(input2), GetTensorDims(input2), \
-            output_activation_min, output_activation_max,        \
-            GetTensorData<float>(output), GetTensorDims(output))
-    if (kernel_type == kReference) {
-      TF_LITE_ADD(reference_ops);
+#define TF_LITE_ADD(type, opname)                                   \
+  type::opname(GetTensorData<float>(input1), GetTensorDims(input1), \
+               GetTensorData<float>(input2), GetTensorDims(input2), \
+               output_activation_min, output_activation_max,        \
+               GetTensorData<float>(output), GetTensorDims(output))
+  if (kernel_type == kReference) {
+    if (data->requires_broadcast) {
+      TF_LITE_ADD(reference_ops, BroadcastAdd);
     } else {
-      TF_LITE_ADD(optimized_ops);
+      TF_LITE_ADD(reference_ops, Add);
+    }
+  } else {
+    if (data->requires_broadcast) {
+      TF_LITE_ADD(optimized_ops, BroadcastAdd);
+    } else {
+      TF_LITE_ADD(optimized_ops, Add);
+    }
   }
 #undef TF_LITE_ADD
 }
 
 template <KernelType kernel_type>
 void EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
-                      TfLiteAddParams* params, TfLiteTensor* input1,
-                      TfLiteTensor* input2, TfLiteTensor* output) {
+                      TfLiteAddParams* params, const OpData* data,
+                      TfLiteTensor* input1, TfLiteTensor* input2,
+                      TfLiteTensor* output) {
   auto input1_offset = -input1->params.zero_point;
   auto input2_offset = -input2->params.zero_point;
   auto output_offset = output->params.zero_point;
@@ -112,19 +141,20 @@ void EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
   CalculateActivationRangeUint8(params->activation, output,
                                 &output_activation_min, &output_activation_max);
 
-#define TF_LITE_ADD(type)                                                   \
-  type::BroadcastAdd(                                                       \
-      left_shift, GetTensorData<uint8_t>(input1), GetTensorDims(input1),    \
-      input1_offset, input1_multiplier, input1_shift,                       \
-      GetTensorData<uint8_t>(input2), GetTensorDims(input2), input2_offset, \
-      input2_multiplier, input2_shift, output_offset, output_multiplier,    \
-      output_shift, output_activation_min, output_activation_max,           \
-      GetTensorData<uint8_t>(output), GetTensorDims(output));
-
+#define TF_LITE_ADD(type, opname)                                            \
+  type::opname(left_shift, GetTensorData<uint8_t>(input1),                   \
+               GetTensorDims(input1), input1_offset, input1_multiplier,      \
+               input1_shift, GetTensorData<uint8_t>(input2),                 \
+               GetTensorDims(input2), input2_offset, input2_multiplier,      \
+               input2_shift, output_offset, output_multiplier, output_shift, \
+               output_activation_min, output_activation_max,                 \
+               GetTensorData<uint8_t>(output), GetTensorDims(output));
+  // The quantized version of Add doesn't support activations, so we
+  // always use BroadcastAdd.
   if (kernel_type == kReference) {
-    TF_LITE_ADD(reference_ops);
+    TF_LITE_ADD(reference_ops, BroadcastAdd);
   } else {
-    TF_LITE_ADD(optimized_ops);
+    TF_LITE_ADD(optimized_ops, BroadcastAdd);
   }
 #undef TF_LITE_ADD
 }
@@ -132,15 +162,17 @@ void EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
   TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   if (output->type == kTfLiteFloat32) {
-    EvalAddFloat<kernel_type>(context, node, params, input1, input2, output);
+    EvalAddFloat<kernel_type>(context, node, params, data, input1, input2,
+                              output);
   } else if (output->type == kTfLiteUInt8) {
-    EvalAddQuantized<kernel_type>(context, node, params, input1, input2,
+    EvalAddQuantized<kernel_type>(context, node, params, data, input1, input2,
                                   output);
   } else {
     context->ReportError(context,
@@ -154,19 +186,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace add
 
 TfLiteRegistration* Register_ADD_REF() {
-  static TfLiteRegistration r = {nullptr, nullptr, add::Prepare,
+  static TfLiteRegistration r = {add::Init, add::Free, add::Prepare,
                                  add::Eval<add::kReference>};
   return &r;
 }
 
 TfLiteRegistration* Register_ADD_GENERIC_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, add::Prepare,
+  static TfLiteRegistration r = {add::Init, add::Free, add::Prepare,
                                  add::Eval<add::kGenericOptimized>};
   return &r;
 }
 
 TfLiteRegistration* Register_ADD_NEON_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, add::Prepare,
+  static TfLiteRegistration r = {add::Init, add::Free, add::Prepare,
                                  add::Eval<add::kNeonOptimized>};
   return &r;
 }
diff --git a/tensorflow/contrib/lite/kernels/add_test.cc b/tensorflow/contrib/lite/kernels/add_test.cc
index 8e12a837c4954832ff37a6d1ab377bee9e8d5763..956d05bed5162f6ce59705d59aad77ff056dda77 100644
--- a/tensorflow/contrib/lite/kernels/add_test.cc
+++ b/tensorflow/contrib/lite/kernels/add_test.cc
@@ -25,10 +25,11 @@ using ::testing::ElementsAreArray;
 
 class BaseAddOpModel : public SingleOpModel {
  public:
-  BaseAddOpModel(const TensorData& input, const TensorData& output,
+  BaseAddOpModel(const TensorData& input1, const TensorData& input2,
+                 const TensorData& output,
                  ActivationFunctionType activation_type) {
-    input1_ = AddInput(input);
-    input2_ = AddInput(input);
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
     output_ = AddOutput(output);
     SetBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions,
                  CreateAddOptions(builder_, activation_type).Union());
@@ -70,6 +71,7 @@ float GetTolerance(int min, int max) {
 
 TEST(FloatAddOpModel, NoActivation) {
   FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {1, 2, 2, 1}},
                     {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
   m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
   m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
@@ -77,9 +79,10 @@ TEST(FloatAddOpModel, NoActivation) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1.9, 0.4, 1.0, 1.3}));
 }
 
-TEST(FloatAddOpModel, ActivationRELU1) {
-  FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
-                    {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU1);
+TEST(FloatAddOpModel, ActivationRELU_N1_TO_1) {
+  FloatAddOpModel m(
+      {TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}},
+      {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU_N1_TO_1);
   m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
   m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
   m.Invoke();
@@ -91,6 +94,7 @@ TEST(FloatAddOpModel, VariousInputShapes) {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     FloatAddOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, test_shapes[i]},
                       {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
     m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
     m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5, 1.1, 0.1});
@@ -101,6 +105,23 @@ TEST(FloatAddOpModel, VariousInputShapes) {
   }
 }
 
+TEST(FloatAddOpModel, WithBroadcast) {
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatAddOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, {}},  // always a scalar
+                      {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+    m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.PopulateTensor<float>(m.input2(), {0.1});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({-1.9, 0.3, 0.8, 0.9, 1.2, 2.1})))
+        << "With shape number " << i;
+  }
+}
+
 TEST(QuantizedAddOpModel, QuantizedTestsNoActivation) {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
   std::vector<std::initializer_list<float>> inputs1 = {
@@ -111,6 +132,7 @@ TEST(QuantizedAddOpModel, QuantizedTestsNoActivation) {
       {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}};
   for (int i = 0; i < inputs1.size(); ++i) {
     QuantizedAddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
                           {TensorType_UINT8, {}, -1.0, 1.0},
                           ActivationFunctionType_NONE);
     m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
@@ -122,7 +144,7 @@ TEST(QuantizedAddOpModel, QuantizedTestsNoActivation) {
   }
 }
 
-TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU1) {
+TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
   std::vector<std::initializer_list<float>> inputs1 = {{-0.8, 0.2, 0.9, 0.7},
                                                        {-0.8, 0.2, 0.7, 0.3}};
@@ -132,8 +154,9 @@ TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU1) {
                                                        {-0.2, 0.6, -0.1, 0.8}};
   for (int i = 0; i < inputs1.size(); ++i) {
     QuantizedAddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
                           {TensorType_UINT8, {}, -1.0, 1.0},
-                          ActivationFunctionType_RELU1);
+                          ActivationFunctionType_RELU_N1_TO_1);
     m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
     m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
     m.Invoke();
@@ -149,6 +172,7 @@ TEST(QuantizedAddOpModel, QuantizedVariousInputShapes) {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     QuantizedAddOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
+                          {TensorType_UINT8, test_shapes[i], -3.0, 3.0},
                           {TensorType_UINT8, {}, -3.0, 3.0},
                           ActivationFunctionType_NONE);
     m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
@@ -161,11 +185,29 @@ TEST(QuantizedAddOpModel, QuantizedVariousInputShapes) {
   }
 }
 
+TEST(QuantizedAddOpModel, QuantizedWithBroadcast) {
+  float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedAddOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
+                          {TensorType_UINT8, {}, -3.0, 3.0},
+                          {TensorType_UINT8, {}, -3.0, 3.0},
+                          ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1});
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutput(),
+                ElementsAreArray(ArrayFloatNear({-1.9, 0.3, 0.8, 0.9, 1.2, 2.1},
+                                                kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+}
+
 }  // namespace
 }  // namespace tflite
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
-  tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/basic_rnn.cc b/tensorflow/contrib/lite/kernels/basic_rnn.cc
index 3cee43c68b2a0af5a3fd84b33a980b74bb8f0cb4..2c5074eca3176c7f33a6f051b492dc41333257ed 100644
--- a/tensorflow/contrib/lite/kernels/basic_rnn.cc
+++ b/tensorflow/contrib/lite/kernels/basic_rnn.cc
@@ -15,14 +15,15 @@ limitations under the License.
 #include <unistd.h>
 #include <cassert>
 #include <cmath>
-#include <cstdlib>
 #include <cstdio>
+#include <cstdlib>
 #include <iostream>
 #include <limits>
 
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
 #include "tensorflow/contrib/lite/kernels/op_macros.h"
 
 namespace tflite {
@@ -76,8 +77,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(2);
   output_size_array->data[0] = batch_size;
   output_size_array->data[1] = num_units;
-  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, output,
-                                                   output_size_array));
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, output, output_size_array));
 
   return kTfLiteOk;
 }
@@ -101,50 +102,20 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const int batch_size = input->dims->data[0];
   const int num_units = input_weights->dims->data[0];
   const int input_size = input->dims->data[1];
-  const int input_weights_stride = input_weights->dims->data[1];
-  const int recurrent_weights_stride = recurrent_weights->dims->data[1];
-
-  // For each batch
-  for (int b = 0; b < batch_size; b++) {
-    // Initialize the pointer to input, output and bias.
-    const float* input_ptr_batch = input->data.f + b * input_size;
-    float* output_ptr_batch = output->data.f + b * num_units;
-    float* hidden_state_ptr_batch = hidden_state->data.f + b * num_units;
-
-    // Initialize input_weights and recurrent_weights.
-    const float* input_weights_ptr = input_weights->data.f;
-    const float* recurrent_weights_ptr = recurrent_weights->data.f;
-
-    // Output = bias
-    for (int o = 0; o < num_units; o++) {
-      output_ptr_batch[o] = bias_ptr[o];
-    }
-
-    // Output += input * input_weights
-    for (int o = 0; o < num_units; o++) {
-      for (int i = 0; i < input_size; i++) {
-        output_ptr_batch[o] += input_ptr_batch[i] * input_weights_ptr[i];
-      }
-      input_weights_ptr += input_weights_stride;
-    }
-
-    // Output += recurrent_weights * hidden_state
-    for (int o = 0; o < num_units; o++) {
-      for (int h = 0; h < num_units; h++) {
-        output_ptr_batch[o] +=
-            hidden_state_ptr_batch[h] * recurrent_weights_ptr[h];
-      }
-      recurrent_weights_ptr += recurrent_weights_stride;
-    }
-
-    // Output = activation(Output) and update hidden_state
-    for (int o = 0; o < num_units; o++) {
-      output_ptr_batch[o] =
-          (ActivationFunctor(params->activation))(output_ptr_batch[o]);
-      hidden_state_ptr_batch[o] = output_ptr_batch[o];
-    }
-  }
 
+  // Initialize the pointer to hidden state.
+  float* hidden_state_ptr_batch = hidden_state->data.f;
+  // Initialize the pointer to input and output.
+  const float* input_ptr_batch = input->data.f;
+  float* output_ptr_batch = output->data.f;
+  // Initialize input_weights and recurrent_weights.
+  const float* input_weights_ptr = input_weights->data.f;
+  const float* recurrent_weights_ptr = recurrent_weights->data.f;
+
+  kernel_utils::RnnBatchStep(input_ptr_batch, input_weights_ptr,
+                             recurrent_weights_ptr, bias_ptr, input_size,
+                             num_units, batch_size, params->activation,
+                             hidden_state_ptr_batch, output_ptr_batch);
   return kTfLiteOk;
 }
 
diff --git a/tensorflow/contrib/lite/kernels/basic_rnn_test.cc b/tensorflow/contrib/lite/kernels/basic_rnn_test.cc
index dfa75655bcfe7762c6cc4c9a98a71d529028c03a..fa7ef525db47c93f98951604cd04da66196422d7 100644
--- a/tensorflow/contrib/lite/kernels/basic_rnn_test.cc
+++ b/tensorflow/contrib/lite/kernels/basic_rnn_test.cc
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 // Unit test for TFLite RNN op.
 
-#include <vector>
 #include <iomanip>
+#include <vector>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -120,8 +120,7 @@ static float rnn_golden_output[] = {
 
     0.415153,   0.210318,   0,         0,         0,         0,
     0,          2.02616,    0,         0.728256,  0.84183,   0.0907453,
-    0.628881,   3.58099,    1.49974,   0
-};
+    0.628881,   3.58099,    1.49974,   0};
 
 class RNNOpModel : public SingleOpModel {
  public:
@@ -261,7 +260,7 @@ TEST(FullyConnectedOpTest, BlackBoxTest) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bc438f99c6a72fdbc2794dee03524db6a7523834
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
@@ -0,0 +1,188 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include <vector>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace batch_to_space_nd {
+
+// This file has two implementations of BatchToSpaceND.
+enum KernelType {
+  kReference,
+  kGenericOptimized,
+};
+
+struct BatchToSpaceNDContext {
+  BatchToSpaceNDContext(TfLiteContext* context, TfLiteNode* node) {
+    input = GetInput(context, node, 0);
+    block_shape = GetInput(context, node, 1);
+    crops = GetInput(context, node, 2);
+    output = GetOutput(context, node, 0);
+  }
+  TfLiteTensor* input;
+  TfLiteTensor* block_shape;
+  TfLiteTensor* crops;
+  TfLiteTensor* output;
+};
+
+// Currently, only 4D NHWC input/output op_context are supported.
+// The 4D array need to have exactly 2 spatial dimensions.
+// TODO(ycling): Support arbitrary dimension in BatchToSpaceND.
+const int kInputDimensionNum = 4;
+const int kBlockSizeDimensionNum = 1;
+const int kSpatialDimensionNum = 2;
+
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                BatchToSpaceNDContext* op_context) {
+  TfLiteIntArray* input_size = op_context->input->dims;
+  const int* block_shape = GetTensorData<int32>(op_context->block_shape);
+  const int* crops = GetTensorData<int32>(op_context->crops);
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->block_shape),
+                    kBlockSizeDimensionNum);
+  TF_LITE_ENSURE_EQ(context, op_context->block_shape->dims->data[0],
+                    kSpatialDimensionNum);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->crops),
+                    kSpatialDimensionNum);
+
+  // TODO(ycling): Add crops as part of calculation. Remove check for a crops
+  // containing all zeroes.
+  TF_LITE_ENSURE_EQ(context, crops[0], 0);
+  TF_LITE_ENSURE_EQ(context, crops[1], 0);
+  TF_LITE_ENSURE_EQ(context, crops[2], 0);
+  TF_LITE_ENSURE_EQ(context, crops[3], 0);
+
+  // Number of batch must be multiple of (block_shape[0] * block_shape[1]).
+  TF_LITE_ENSURE_EQ(context,
+                    input_size->data[0] % (block_shape[0] * block_shape[1]), 0);
+
+  const int output_batch_size =
+      input_size->data[0] / (block_shape[0] * block_shape[1]);
+  const int output_height = input_size->data[1] * block_shape[0];
+  const int output_width = input_size->data[2] * block_shape[1];
+  const int output_channel_size = input_size->data[3];
+
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input_size);
+  output_size->data[0] = output_batch_size;
+  output_size->data[1] = output_height;
+  output_size->data[2] = output_width;
+  output_size->data[3] = output_channel_size;
+
+  return context->ResizeTensor(context, op_context->output, output_size);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  BatchToSpaceNDContext op_context(context, node);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.input),
+                    kInputDimensionNum);
+  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
+
+  if (!IsConstantTensor(op_context.block_shape) ||
+      !IsConstantTensor(op_context.crops)) {
+    SetTensorToDynamic(op_context.output);
+    return kTfLiteOk;
+  }
+  return ResizeOutputTensor(context, &op_context);
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  BatchToSpaceNDContext op_context(context, node);
+
+  // Resize the output tensor if the output tensor is dynamic.
+  if (IsDynamicTensor(op_context.output)) {
+    TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+  }
+
+#define TF_LITE_BATCH_TO_SPACE_ND(type, scalar)                        \
+  type::BatchToSpaceND(GetTensorData<scalar>(op_context.input),        \
+                       GetTensorDims(op_context.input),                \
+                       GetTensorData<int32_t>(op_context.block_shape), \
+                       GetTensorDims(op_context.block_shape),          \
+                       GetTensorData<scalar>(op_context.output),       \
+                       GetTensorDims(op_context.output))
+  switch (op_context.input->type) {  // Already know in/out types are same.
+    case kTfLiteFloat32:
+      if (kernel_type == kReference) {
+        TF_LITE_BATCH_TO_SPACE_ND(reference_ops, float);
+      } else {
+        TF_LITE_BATCH_TO_SPACE_ND(optimized_ops, float);
+      }
+      break;
+    case kTfLiteUInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_BATCH_TO_SPACE_ND(reference_ops, uint8_t);
+      } else {
+        TF_LITE_BATCH_TO_SPACE_ND(optimized_ops, uint8_t);
+      }
+      break;
+    case kTfLiteInt32:
+      if (kernel_type == kReference) {
+        TF_LITE_BATCH_TO_SPACE_ND(reference_ops, int32_t);
+      } else {
+        TF_LITE_BATCH_TO_SPACE_ND(optimized_ops, int32_t);
+      }
+      break;
+    case kTfLiteInt64:
+      if (kernel_type == kReference) {
+        TF_LITE_BATCH_TO_SPACE_ND(reference_ops, int64_t);
+      } else {
+        TF_LITE_BATCH_TO_SPACE_ND(optimized_ops, int64_t);
+      }
+      break;
+    default:
+      context->ReportError(context,
+                           "Type is currently not supported by BatchToSpace.");
+      return kTfLiteError;
+  }
+#undef TF_LITE_BATCH_TO_SPACE_ND
+  return kTfLiteOk;
+}
+
+}  // namespace batch_to_space_nd
+
+TfLiteRegistration* Register_BATCH_TO_SPACE_ND_REF() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, batch_to_space_nd::Prepare,
+      batch_to_space_nd::Eval<batch_to_space_nd::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_BATCH_TO_SPACE_ND_GENERIC_OPT() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, batch_to_space_nd::Prepare,
+      batch_to_space_nd::Eval<batch_to_space_nd::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_BATCH_TO_SPACE_ND() {
+  return Register_BATCH_TO_SPACE_ND_GENERIC_OPT();
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc b/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8485cde1b40066f2070855bca91ea78a9f80e83c
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc
@@ -0,0 +1,142 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BatchToSpaceNDOpModel : public SingleOpModel {
+ public:
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor<float>(input_, data);
+  }
+
+  void SetBlockShape(std::initializer_list<int> data) {
+    PopulateTensor<int>(block_shape_, data);
+  }
+
+  void SetCrops(std::initializer_list<int> data) {
+    PopulateTensor<int>(crops_, data);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input_;
+  int block_shape_;
+  int crops_;
+  int output_;
+};
+
+// Tests case where block_shape and crops are const tensors.
+//
+// Example usage is as follows:
+//    BatchToSpaceNDOpConstModel m(input_shape, block_shape, crops);
+//    m.SetInput(input_data);
+//    m.Invoke();
+class BatchToSpaceNDOpConstModel : public BatchToSpaceNDOpModel {
+ public:
+  BatchToSpaceNDOpConstModel(std::initializer_list<int> input_shape,
+                             std::initializer_list<int> block_shape,
+                             std::initializer_list<int> crops) {
+    input_ = AddInput(TensorType_FLOAT32);
+    block_shape_ = AddConstInput(TensorType_INT32, block_shape, {2});
+    crops_ = AddConstInput(TensorType_INT32, crops, {2, 2});
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND,
+                 BuiltinOptions_BatchToSpaceNDOptions,
+                 CreateBatchToSpaceNDOptions(builder_).Union());
+    BuildInterpreter({input_shape});
+  }
+};
+
+// Tests case where block_shape and crops are non-const tensors.
+//
+// Example usage is as follows:
+//    BatchToSpaceNDOpDynamicModel m(input_shape);
+//    m.SetInput(input_data);
+//    m.SetBlockShape(block_shape);
+//    m.SetPaddings(crops);
+//    m.Invoke();
+class BatchToSpaceNDOpDynamicModel : public BatchToSpaceNDOpModel {
+ public:
+  BatchToSpaceNDOpDynamicModel(std::initializer_list<int> input_shape) {
+    input_ = AddInput(TensorType_FLOAT32);
+    block_shape_ = AddInput(TensorType_INT32);
+    crops_ = AddInput(TensorType_INT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND,
+                 BuiltinOptions_BatchToSpaceNDOptions,
+                 CreateBatchToSpaceNDOptions(builder_).Union());
+    BuildInterpreter({input_shape, {2}, {2, 2}});
+  }
+};
+
+TEST(BatchToSpaceNDOpTest, SimpleConstTest) {
+  BatchToSpaceNDOpConstModel m({4, 2, 2, 1}, {2, 2}, {0, 0, 0, 0});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5, 2, 6, 9, 13, 10, 14, 3, 7,
+                                               4, 8, 11, 15, 12, 16}));
+}
+
+TEST(BatchToSpaceNDOpTest, SimpleDynamicTest) {
+  BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetBlockShape({2, 2});
+  m.SetCrops({0, 0, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5, 2, 6, 9, 13, 10, 14, 3, 7,
+                                               4, 8, 11, 15, 12, 16}));
+}
+
+TEST(BatchToSpaceNDOpTest, InvalidShapeTest) {
+  EXPECT_DEATH(BatchToSpaceNDOpConstModel({3, 2, 2, 1}, {2, 2}, {0, 0, 0, 0}),
+               "Cannot allocate tensors");
+}
+
+TEST(BatchToSpaceNDOpTest, InvalidCropsConstTest) {
+  EXPECT_DEATH(BatchToSpaceNDOpConstModel({3, 2, 2, 1}, {2, 2}, {0, 0, 0, 1}),
+               "1 != 0");
+}
+
+TEST(BatchToSpaceNDOpTest, InvalidCropsDynamicTest) {
+  BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetBlockShape({2, 2});
+  m.SetCrops({0, 0, 1, 0});
+  EXPECT_DEATH(m.Invoke(), "1 != 0");
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
new file mode 100644
index 0000000000000000000000000000000000000000..aa24c1f34cd1e8c02a6a75b62fbe5f3c629498ca
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
@@ -0,0 +1,205 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <cstdio>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace bidirectional_sequence_rnn {
+
+constexpr int kInputTensor = 0;
+// Forward and backward cell tensors.
+constexpr int kFwWeightsTensor = 1;
+constexpr int kFwRecurrentWeightsTensor = 2;
+constexpr int kFwBiasTensor = 3;
+constexpr int kBwWeightsTensor = 4;
+constexpr int kBwRecurrentWeightsTensor = 5;
+constexpr int kBwBiasTensor = 6;
+// State and output tensors.
+constexpr int kFwHiddenStateTensor = 0;
+constexpr int kFwOutputTensor = 1;
+constexpr int kBwHiddenStateTensor = 2;
+constexpr int kBwOutputTensor = 3;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  // Check we have all the inputs and outputs we need.
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 7);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 4);
+
+  TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
+  TfLiteTensor* fw_input_weights =
+      &context->tensors[node->inputs->data[kFwWeightsTensor]];
+  TfLiteTensor* fw_recurrent_weights =
+      &context->tensors[node->inputs->data[kFwRecurrentWeightsTensor]];
+  TfLiteTensor* fw_bias = &context->tensors[node->inputs->data[kFwBiasTensor]];
+  TfLiteTensor* bw_input_weights =
+      &context->tensors[node->inputs->data[kBwWeightsTensor]];
+  TfLiteTensor* bw_recurrent_weights =
+      &context->tensors[node->inputs->data[kBwRecurrentWeightsTensor]];
+  TfLiteTensor* bw_bias = &context->tensors[node->inputs->data[kBwBiasTensor]];
+
+  // Check all the parameters of tensor match within themselves and match the
+  // input configuration.
+  const int batch_size = input->dims->data[0];
+  const int max_time = input->dims->data[1];
+  const int fw_num_units = fw_input_weights->dims->data[0];
+  const int bw_num_units = bw_input_weights->dims->data[0];
+  TF_LITE_ASSERT_EQ(input->dims->data[2], fw_input_weights->dims->data[1]);
+  TF_LITE_ASSERT_EQ(input->dims->data[2], bw_input_weights->dims->data[1]);
+  TF_LITE_ASSERT_EQ(fw_input_weights->dims->data[0], fw_bias->dims->data[0]);
+  TF_LITE_ASSERT_EQ(bw_input_weights->dims->data[0], bw_bias->dims->data[0]);
+  TF_LITE_ASSERT_EQ(fw_recurrent_weights->dims->data[0],
+                    fw_bias->dims->data[0]);
+  TF_LITE_ASSERT_EQ(bw_recurrent_weights->dims->data[1],
+                    bw_bias->dims->data[0]);
+
+  TfLiteTensor* fw_output =
+      &context->tensors[node->outputs->data[kFwOutputTensor]];
+  TfLiteTensor* bw_output =
+      &context->tensors[node->outputs->data[kBwOutputTensor]];
+
+  // Resize hidden states.
+  TfLiteIntArray* fw_hidden_state_size_array = TfLiteIntArrayCreate(2);
+  fw_hidden_state_size_array->data[0] = batch_size;
+  fw_hidden_state_size_array->data[1] = fw_num_units;
+  TfLiteTensor* fw_hidden_state =
+      &context->tensors[node->outputs->data[kFwHiddenStateTensor]];
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, fw_hidden_state,
+                                                   fw_hidden_state_size_array));
+
+  TfLiteIntArray* bw_hidden_state_size_array = TfLiteIntArrayCreate(2);
+  bw_hidden_state_size_array->data[0] = batch_size;
+  bw_hidden_state_size_array->data[1] = fw_num_units;
+  TfLiteTensor* bw_hidden_state =
+      &context->tensors[node->outputs->data[kBwHiddenStateTensor]];
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, bw_hidden_state,
+                                                   bw_hidden_state_size_array));
+
+  // Mark hidden states as a persistent tensor.
+  fw_hidden_state->allocation_type = kTfLiteArenaRwPersistent;
+  bw_hidden_state->allocation_type = kTfLiteArenaRwPersistent;
+
+  // Resize outputs.
+  TfLiteIntArray* fw_output_size_array = TfLiteIntArrayCreate(3);
+  fw_output_size_array->data[0] = batch_size;
+  fw_output_size_array->data[1] = max_time;
+  fw_output_size_array->data[2] = fw_num_units;
+  TF_LITE_ENSURE_OK(
+      context, context->ResizeTensor(context, fw_output, fw_output_size_array));
+  TfLiteIntArray* bw_output_size_array = TfLiteIntArrayCreate(3);
+  bw_output_size_array->data[0] = batch_size;
+  bw_output_size_array->data[1] = max_time;
+  bw_output_size_array->data[2] = bw_num_units;
+  TF_LITE_ENSURE_OK(
+      context, context->ResizeTensor(context, bw_output, bw_output_size_array));
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteSequenceRNNParams*>(node->builtin_data);
+
+  TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
+  TfLiteTensor* fw_input_weights =
+      &context->tensors[node->inputs->data[kFwWeightsTensor]];
+  TfLiteTensor* fw_recurrent_weights =
+      &context->tensors[node->inputs->data[kFwRecurrentWeightsTensor]];
+  TfLiteTensor* fw_bias = &context->tensors[node->inputs->data[kFwBiasTensor]];
+  TfLiteTensor* fw_hidden_state =
+      &context->tensors[node->outputs->data[kFwHiddenStateTensor]];
+  TfLiteTensor* fw_output =
+      &context->tensors[node->outputs->data[kFwOutputTensor]];
+
+  TfLiteTensor* bw_input_weights =
+      &context->tensors[node->inputs->data[kBwWeightsTensor]];
+  TfLiteTensor* bw_recurrent_weights =
+      &context->tensors[node->inputs->data[kBwRecurrentWeightsTensor]];
+  TfLiteTensor* bw_bias = &context->tensors[node->inputs->data[kBwBiasTensor]];
+  TfLiteTensor* bw_hidden_state =
+      &context->tensors[node->outputs->data[kBwHiddenStateTensor]];
+  TfLiteTensor* bw_output =
+      &context->tensors[node->outputs->data[kBwOutputTensor]];
+
+  const int batch_size = input->dims->data[0];
+  const int max_time = input->dims->data[1];
+  const int input_size = input->dims->data[2];
+
+  const int fw_num_units = fw_input_weights->dims->data[0];
+  const float* fw_bias_ptr = fw_bias->data.f;
+  const float* fw_input_weights_ptr = fw_input_weights->data.f;
+  const float* fw_recurrent_weights_ptr = fw_recurrent_weights->data.f;
+
+  const int bw_num_units = bw_input_weights->dims->data[0];
+  const float* bw_bias_ptr = bw_bias->data.f;
+  const float* bw_input_weights_ptr = bw_input_weights->data.f;
+  const float* bw_recurrent_weights_ptr = bw_recurrent_weights->data.f;
+
+  for (int b = 0; b < batch_size; b++) {
+    // Forward cell.
+    float* fw_hidden_state_ptr_batch =
+        fw_hidden_state->data.f + b * fw_num_units;
+    for (int s = 0; s < max_time; s++) {
+      const float* input_ptr_batch =
+          input->data.f + b * input_size * max_time + s * input_size;
+      float* output_ptr_batch =
+          fw_output->data.f + b * fw_num_units * max_time + s * fw_num_units;
+
+      kernel_utils::RnnBatchStep(
+          input_ptr_batch, fw_input_weights_ptr, fw_recurrent_weights_ptr,
+          fw_bias_ptr, input_size, fw_num_units, /*batch_size=*/1,
+          params->activation, fw_hidden_state_ptr_batch, output_ptr_batch);
+    }
+    // Backward cell.
+    float* bw_hidden_state_ptr_batch =
+        bw_hidden_state->data.f + b * bw_num_units;
+    for (int s = max_time - 1; s >= 0; s--) {
+      const float* input_ptr_batch =
+          input->data.f + b * input_size * max_time + s * input_size;
+      float* output_ptr_batch =
+          bw_output->data.f + b * bw_num_units * max_time + s * bw_num_units;
+
+      kernel_utils::RnnBatchStep(
+          input_ptr_batch, bw_input_weights_ptr, bw_recurrent_weights_ptr,
+          bw_bias_ptr, input_size, bw_num_units, /*batch_size=*/1,
+          params->activation, bw_hidden_state_ptr_batch, output_ptr_batch);
+    }
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace bidirectional_sequence_rnn
+
+TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_RNN() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 bidirectional_sequence_rnn::Prepare,
+                                 bidirectional_sequence_rnn::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..12f4ff97cfd90e3a6894a24d15fcbc356f96cde2
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc
@@ -0,0 +1,931 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite Bidirectional RNN op.
+
+#include <vector>
+#include <iomanip>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+static float rnn_input[] = {
+    0.23689353,   0.285385,     0.037029743, -0.19858193,  -0.27569133,
+    0.43773448,   0.60379338,   0.35562468,  -0.69424844,  -0.93421471,
+    -0.87287879,  0.37144363,   -0.62476718, 0.23791671,   0.40060222,
+    0.1356622,    -0.99774903,  -0.98858172, -0.38952237,  -0.47685933,
+    0.31073618,   0.71511042,   -0.63767755, -0.31729108,  0.33468103,
+    0.75801885,   0.30660987,   -0.37354088, 0.77002847,   -0.62747043,
+    -0.68572164,  0.0069220066, 0.65791464,  0.35130811,   0.80834007,
+    -0.61777675,  -0.21095741,  0.41213346,  0.73784804,   0.094794154,
+    0.47791874,   0.86496925,   -0.53376222, 0.85315156,   0.10288584,
+    0.86684,      -0.011186242, 0.10513687,  0.87825835,   0.59929144,
+    0.62827742,   0.18899453,   0.31440187,  0.99059987,   0.87170351,
+    -0.35091716,  0.74861872,   0.17831337,  0.2755419,    0.51864719,
+    0.55084288,   0.58982027,   -0.47443086, 0.20875752,   -0.058871567,
+    -0.66609079,  0.59098077,   0.73017097,  0.74604273,   0.32882881,
+    -0.17503482,  0.22396147,   0.19379807,  0.29120302,   0.077113032,
+    -0.70331609,  0.15804303,   -0.93407321, 0.40182066,   0.036301374,
+    0.66521823,   0.0300982,    -0.7747041,  -0.02038002,  0.020698071,
+    -0.90300065,  0.62870288,   -0.23068321, 0.27531278,   -0.095755219,
+    -0.712036,    -0.17384434,  -0.50593495, -0.18646687,  -0.96508682,
+    0.43519354,   0.14744234,   0.62589407,  0.1653645,    -0.10651493,
+    -0.045277178, 0.99032974,   -0.88255352, -0.85147917,  0.28153265,
+    0.19455957,   -0.55479527,  -0.56042433, 0.26048636,   0.84702539,
+    0.47587705,   -0.074295521, -0.12287641, 0.70117295,   0.90532446,
+    0.89782166,   0.79817224,   0.53402734,  -0.33286154,  0.073485017,
+    -0.56172788,  -0.044897556, 0.89964068,  -0.067662835, 0.76863563,
+    0.93455386,   -0.6324693,   -0.083922029};
+
+static float rnn_golden_fw_output[] = {
+    0.496726,   0,          0.965996,  0,         0.0584254, 0,
+    0,          0.12315,    0,         0,         0.612266,  0.456601,
+    0,          0.52286,    1.16099,   0.0291232,
+
+    0,          0,          0.524901,  0,         0,         0,
+    0,          1.02116,    0,         1.35762,   0,         0.356909,
+    0.436415,   0.0355727,  0,         0,
+
+    0,          0,          0,         0.262335,  0,         0,
+    0,          1.33992,    0,         2.9739,    0,         0,
+    1.31914,    2.66147,    0,         0,
+
+    0.942568,   0,          0,         0,         0.025507,  0,
+    0,          0,          0.321429,  0.569141,  1.25274,   1.57719,
+    0.8158,     1.21805,    0.586239,  0.25427,
+
+    1.04436,    0,          0.630725,  0,         0.133801,  0.210693,
+    0.363026,   0,          0.533426,  0,         1.25926,   0.722707,
+    0,          1.22031,    1.30117,   0.495867,
+
+    0.222187,   0,          0.72725,   0,         0.767003,  0,
+    0,          0.147835,   0,         0,         0,         0.608758,
+    0.469394,   0.00720298, 0.927537,  0,
+
+    0.856974,   0.424257,   0,         0,         0.937329,  0,
+    0,          0,          0.476425,  0,         0.566017,  0.418462,
+    0.141911,   0.996214,   1.13063,   0,
+
+    0.967899,   0,          0,         0,         0.0831304, 0,
+    0,          1.00378,    0,         0,         0,         1.44818,
+    1.01768,    0.943891,   0.502745,  0,
+
+    0.940135,   0,          0,         0,         0,         0,
+    0,          2.13243,    0,         0.71208,   0.123918,  1.53907,
+    1.30225,    1.59644,    0.70222,   0,
+
+    0.804329,   0,          0.430576,  0,         0.505872,  0.509603,
+    0.343448,   0,          0.107756,  0.614544,  1.44549,   1.52311,
+    0.0454298,  0.300267,   0.562784,  0.395095,
+
+    0.228154,   0,          0.675323,  0,         1.70536,   0.766217,
+    0,          0,          0,         0.735363,  0.0759267, 1.91017,
+    0.941888,   0,          0,         0,
+
+    0,          0,          1.5909,    0,         0,         0,
+    0,          0.5755,     0,         0.184687,  0,         1.56296,
+    0.625285,   0,          0,         0,
+
+    0,          0,          0.0857888, 0,         0,         0,
+    0,          0.488383,   0.252786,  0,         0,         0,
+    1.02817,    1.85665,    0,         0,
+
+    0.00981836, 0,          1.06371,   0,         0,         0,
+    0,          0,          0,         0.290445,  0.316406,  0,
+    0.304161,   1.25079,    0.0707152, 0,
+
+    0.986264,   0.309201,   0,         0,         0,         0,
+    0,          1.64896,    0.346248,  0,         0.918175,  0.78884,
+    0.524981,   1.92076,    2.07013,   0.333244,
+
+    0.415153,   0.210318,   0,         0,         0,         0,
+    0,          2.02616,    0,         0.728256,  0.84183,   0.0907453,
+    0.628881,   3.58099,    1.49974,   0};
+
+static float rnn_golden_bw_output[] = {
+    0.496726, 0,          1.00883,   0,         0.0584256, 0,         0,
+    0.236412, 0,          0,         0.612267,  0.487726,  0,         0.54883,
+    1.16099,  0.0291233,  0,         0,         0.428302,  0,         0,
+    0,        0,          1.13262,   0,         1.64415,   0,         0.311249,
+    0.570804, 0.259696,   0,         0,         0,         0,         0,
+    0.262334, 0,          0,         0,         1.23781,   0,         2.86532,
+    0,        0,          1.34389,   2.76409,   0,         0,         1.03969,
+    0,        0.00410865, 0,         0.0470295, 0,         0,         0,
+    0.371556, 0.27175,    1.36614,   1.63956,   0.683887,  1.06176,   0.719552,
+    0.301314, 0.971195,   0,         0.697143,  0,         0.215219,  0.210693,
+    0.363027, 0,          0.501283,  0,         1.13399,   0.623774,  0,
+    1.09851,  1.33313,    0.470441,  0.210965,  0,         0.664178,  0,
+    0.839686, 0,          0,         0.147834,  0,         0,         0,
+    0.58786,  0.490128,   0,         0.905806,  0,         0.932134,  0.424257,
+    0,        0,          0.860629,  0,         0,         0,         0.476425,
+    0,        0.566017,   0.513721,  0.207341,  1.09508,   1.08385,   0,
+    0.973787, 0,          0,         0,         0,         0,         0,
+    1.20698,  0,          0,         0,         1.56135,   1.12369,   0.99588,
+    0.459803, 0,          0.915854,  0,         0,         0,         0,
+    0,        0,          2.03206,   0,         0.773264,  0.267228,  1.55012,
+    1.202,    1.51611,    0.701202,  0,         0.725088,  0,         0.509069,
+    0,        0.671349,   0.581129,  0.343447,  0,         0.107755,  0.611838,
+    1.4331,   1.55871,    0.015242,  0.140624,  0.492562,  0.395095,  0.147722,
+    0,        0.784925,   0,         1.65477,   0.715257,  0,         0,
+    0,        0.685024,   0,         1.89505,   1.00037,   0,         0,
+    0,        0,          0,         1.52659,   0,         0,         0,
+    0,        0.618583,   0,         0.11115,   0,         1.37194,   0.630225,
+    0,        0,          0,         0,         0,         0.0322124, 0,
+    0,        0,          0,         0.430834,  0.252786,  0,         0,
+    0,        0.991297,   1.98451,   0,         0,         0.111511,  0,
+    1.05513,  0,          0,         0,         0,         0,         0,
+    0.290445, 0.412559,   0.0429958, 0.256564,  1.27858,   0.289948,  0,
+    1.01693,  0.327141,   0,         0,         0,         0,         0,
+    1.83508,  0.346248,   0,         0.961535,  0.790026,  0.552203,  2.13457,
+    2.19233,  0.333244,   0.316526,  0.179398,  0,         0,         0,
+    0,        0,          1.86126,   0,         0.728256,  0.750013,  0.011861,
+    0.576383, 3.38891,    1.29273,   0};
+
+constexpr std::initializer_list<float> weights = {
+    0.461459,    0.153381,   0.529743,    -0.00371218, 0.676267,   -0.211346,
+    0.317493,    0.969689,   -0.343251,   0.186423,    0.398151,   0.152399,
+    0.448504,    0.317662,   0.523556,    -0.323514,   0.480877,   0.333113,
+    -0.757714,   -0.674487,  -0.643585,   0.217766,    -0.0251462, 0.79512,
+    -0.595574,   -0.422444,  0.371572,    -0.452178,   -0.556069,  -0.482188,
+    -0.685456,   -0.727851,  0.841829,    0.551535,    -0.232336,  0.729158,
+    -0.00294906, -0.69754,   0.766073,    -0.178424,   0.369513,   -0.423241,
+    0.548547,    -0.0152023, -0.757482,   -0.85491,    0.251331,   -0.989183,
+    0.306261,    -0.340716,  0.886103,    -0.0726757,  -0.723523,  -0.784303,
+    0.0354295,   0.566564,   -0.485469,   -0.620498,   0.832546,   0.697884,
+    -0.279115,   0.294415,   -0.584313,   0.548772,    0.0648819,  0.968726,
+    0.723834,    -0.0080452, -0.350386,   -0.272803,   0.115121,   -0.412644,
+    -0.824713,   -0.992843,  -0.592904,   -0.417893,   0.863791,   -0.423461,
+    -0.147601,   -0.770664,  -0.479006,   0.654782,    0.587314,   -0.639158,
+    0.816969,    -0.337228,  0.659878,    0.73107,     0.754768,   -0.337042,
+    0.0960841,   0.368357,   0.244191,    -0.817703,   -0.211223,  0.442012,
+    0.37225,     -0.623598,  -0.405423,   0.455101,    0.673656,   -0.145345,
+    -0.511346,   -0.901675,  -0.81252,    -0.127006,   0.809865,   -0.721884,
+    0.636255,    0.868989,   -0.347973,   -0.10179,    -0.777449,  0.917274,
+    0.819286,    0.206218,   -0.00785118, 0.167141,    0.45872,    0.972934,
+    -0.276798,   0.837861,   0.747958,    -0.0151566,  -0.330057,  -0.469077,
+    0.277308,    0.415818};
+
+static float endtoend_input[] = {
+    0.996808, 0.060710, 0.981855, 0.570017, 0.525164, 0.796859, 0.696547,
+    0.505925, 0.991844, 0.461208, 0.949371, 0.027624, 0.539236, 0.841854,
+    0.915222, 0.538569, 0.069375, 0.237905, 0.903700, 0.441703, 0.536196,
+    0.402724, 0.761635, 0.025063, 0.082592, 0.688245, 0.239310, 0.256931,
+    0.658900, 0.105695, 0.301983, 0.655708, 0.166405, 0.283837, 0.225725,
+    0.691569, 0.080696, 0.922272, 0.197494, 0.072540, 0.383481, 0.146865,
+    0.100163, 0.922717, 0.988720, 0.015386, 0.461286, 0.058095, 0.253290,
+    0.364986, 0.499797, 0.789487, 0.767709, 0.261433, 0.814549, 0.850302,
+    0.949678, 0.053859, 0.107233, 0.608577, 0.159554, 0.409215, 0.264285,
+    0.325960, 0.693053, 0.490011, 0.017529, 0.773749, 0.412283, 0.215023,
+    0.846288, 0.795764, 0.361889, 0.946452, 0.718481, 0.350608, 0.961837,
+    0.179767, 0.408703, 0.215128, 0.544753, 0.908500, 0.004614, 0.312462,
+    0.169933, 0.819163, 0.162764, 0.119611, 0.873022, 0.269997, 0.728188,
+    0.032576, 0.679212, 0.992474, 0.358536, 0.372265, 0.482484, 0.376065,
+    0.146014, 0.894767, 0.591088, 0.992302, 0.690531, 0.952977, 0.938754,
+    0.409012, 0.303585, 0.900591, 0.588780, 0.712287, 0.115719, 0.133533,
+    0.620788, 0.120334, 0.445995, 0.790720, 0.939497, 0.608759, 0.910331,
+    0.812519, 0.878756, 0.638519, 0.845096, 0.557968, 0.630993, 0.203632,
+    0.930233, 0.113477, 0.579697, 0.076247, 0.008244, 0.170785, 0.068549,
+    0.698776, 0.123761, 0.007303, 0.107788, 0.427346, 0.907894, 0.696568,
+    0.139633, 0.023613, 0.830100, 0.760421, 0.143947, 0.276096, 0.551141,
+    0.083444, 0.884855, 0.461472, 0.895963, 0.763611, 0.099992, 0.741059,
+    0.321579, 0.730984, 0.944691, 0.251812, 0.844461, 0.524388, 0.328059,
+    0.852706, 0.695172, 0.396607, 0.551482, 0.818934, 0.403910, 0.659270,
+    0.246280, 0.311804, 0.355838, 0.385913, 0.335418, 0.185938, 0.146334,
+    0.479364, 0.462034, 0.697475, 0.562808, 0.346888, 0.158948, 0.458771,
+    0.110499, 0.258939, 0.199830, 0.432078, 0.989924, 0.144521, 0.683890,
+    0.834385, 0.668908, 0.011949, 0.687091, 0.364081, 0.408556, 0.238572,
+    0.183015, 0.812466, 0.897842, 0.429294, 0.124271, 0.253680, 0.815207,
+    0.459688, 0.439618, 0.961541, 0.939053, 0.901651, 0.659016, 0.501861,
+    0.248539, 0.817964, 0.960632, 0.359038, 0.076903, 0.160462, 0.791117,
+    0.066826, 0.304983, 0.475007, 0.901211, 0.973891, 0.486955, 0.588302,
+    0.337972, 0.895512, 0.826874, 0.520987, 0.707978, 0.724716, 0.950281,
+    0.832249, 0.978396, 0.765488, 0.291937, 0.418014, 0.727029, 0.230990,
+    0.319665, 0.386045, 0.732850, 0.568204, 0.204009, 0.693482, 0.927242,
+    0.280912, 0.853944, 0.718359, 0.347738, 0.158927, 0.193366, 0.248950,
+    0.132818, 0.680321, 0.837252, 0.470790, 0.575833, 0.664126, 0.991777,
+    0.283811, 0.388843, 0.942058, 0.116060, 0.367239, 0.707546, 0.407997,
+    0.785253, 0.434575, 0.638986, 0.104917, 0.820620, 0.371837, 0.673121,
+    0.024629, 0.065319, 0.600363, 0.305541, 0.919263, 0.318722, 0.653279,
+    0.078190, 0.512088, 0.902229, 0.211009, 0.192409, 0.739480, 0.681799,
+    0.768242, 0.403607, 0.673576, 0.052052, 0.792450, 0.615634, 0.168112,
+    0.159689, 0.323180, 0.576109, 0.944941, 0.757755, 0.215095, 0.049858,
+    0.578375, 0.586932, 0.722979, 0.603003, 0.652251, 0.323343, 0.908544,
+    0.571514, 0.642065, 0.561823, 0.649704, 0.154153, 0.464051, 0.860713,
+    0.346562, 0.203532, 0.542512, 0.114804, 0.607139, 0.216088, 0.166856,
+    0.399588, 0.831722, 0.334968, 0.559277, 0.154902, 0.911077, 0.504218,
+    0.912656, 0.126172, 0.554076, 0.491031, 0.713104, 0.277055, 0.094034,
+    0.365355, 0.600398, 0.002578, 0.936869, 0.242463, 0.564401, 0.586574,
+    0.396616, 0.028452, 0.447287, 0.743178, 0.231984, 0.989799, 0.857982,
+    0.839122, 0.205887, 0.024838, 0.238711, 0.037608, 0.359806, 0.797987,
+    0.192510, 0.270883, 0.302205, 0.105166, 0.397055, 0.856281, 0.596197,
+    0.110160, 0.133336, 0.690231, 0.475515, 0.733734, 0.692809, 0.412384,
+    0.976196, 0.257209, 0.998958, 0.372812, 0.285661, 0.446245, 0.115990,
+    0.517645, 0.436044, 0.973972, 0.356767, 0.641930, 0.998810, 0.595478,
+    0.679539, 0.358617, 0.393465, 0.872049, 0.629500, 0.695670, 0.977215,
+    0.026555, 0.551951, 0.573412, 0.136715, 0.685287, 0.263643, 0.612229,
+    0.419020, 0.956451, 0.024613, 0.395216, 0.213661, 0.023572, 0.768029,
+    0.499322, 0.469816, 0.884019, 0.016967, 0.905860, 0.857991, 0.373734,
+    0.547791, 0.856802, 0.969211, 0.227330, 0.215418, 0.362676, 0.099378,
+    0.844918, 0.058346, 0.076594, 0.871473, 0.610297, 0.650006, 0.008188,
+    0.295583, 0.913648, 0.620417, 0.714603, 0.870100, 0.645031, 0.109820,
+    0.083760, 0.668602, 0.877849, 0.583082, 0.138419, 0.761868, 0.600049,
+    0.044279, 0.619859, 0.973783, 0.592069, 0.476661, 0.942994, 0.819399,
+    0.692079, 0.305670, 0.918778, 0.536997, 0.364016, 0.995371, 0.408470,
+    0.974313, 0.645377, 0.416658, 0.269896, 0.559025, 0.037075, 0.984499,
+    0.429125, 0.682105, 0.094319, 0.512885, 0.350707, 0.972168, 0.095967,
+    0.489126, 0.734035, 0.696016, 0.533405, 0.353894, 0.669799, 0.125474,
+    0.830555, 0.612793, 0.944873, 0.522634, 0.918463, 0.863651, 0.059631,
+    0.282479, 0.859022, 0.468101, 0.256791, 0.504398, 0.884758, 0.526687,
+    0.063423, 0.921833, 0.511186, 0.492548, 0.603939, 0.605505, 0.005433,
+    0.954646, 0.577673, 0.101400, 0.443772, 0.311708, 0.797417, 0.977176,
+    0.665602, 0.467216, 0.102650, 0.496157, 0.080009, 0.047524, 0.018791,
+    0.998471, 0.911174, 0.078422, 0.280950, 0.770196, 0.546523, 0.537741,
+    0.274594, 0.431281, 0.064428, 0.338017, 0.353115, 0.575615, 0.830565,
+    0.957053, 0.181120, 0.835998, 0.911699, 0.758793, 0.937398, 0.355471,
+    0.070501, 0.734815, 0.332647, 0.736103, 0.202031, 0.435297, 0.232261,
+    0.282039, 0.482821, 0.251052, 0.280511, 0.393995, 0.329474, 0.561460,
+    0.164191, 0.875997, 0.099202, 0.438785, 0.307278, 0.163630, 0.776802,
+    0.660393, 0.739244, 0.607367, 0.617446, 0.920364, 0.443365, 0.529145,
+    0.679157, 0.380763, 0.884616, 0.749658, 0.115578, 0.217263, 0.485761,
+    0.317609, 0.652560, 0.718021, 0.599648, 0.135381, 0.969073, 0.880159,
+    0.529376, 0.298547, 0.441619, 0.693567, 0.174544, 0.540821, 0.132351,
+    0.481822, 0.704450, 0.909153, 0.142215, 0.443695, 0.516520, 0.759661,
+    0.364059, 0.959885, 0.288806, 0.043216, 0.340648, 0.173422, 0.792874,
+    0.456226, 0.390685, 0.278634, 0.773834, 0.043245, 0.996656, 0.373483,
+    0.178625, 0.965729, 0.253641, 0.708001, 0.264276, 0.695260, 0.401568,
+    0.438820, 0.236081, 0.533919, 0.920642, 0.940531, 0.443072, 0.062857,
+    0.384226, 0.959592, 0.822518, 0.748285, 0.919477, 0.111325, 0.791501,
+    0.260124, 0.284747, 0.584375, 0.716350, 0.675431, 0.863009, 0.490184,
+    0.718676, 0.859665, 0.863666, 0.897301, 0.825393, 0.117308, 0.605302,
+    0.089669, 0.812568, 0.006870, 0.528489, 0.048649, 0.540788, 0.449131,
+    0.989180, 0.983860, 0.511988, 0.373407, 0.943452, 0.334506, 0.121692,
+    0.862929, 0.445831, 0.913193, 0.123053, 0.730578, 0.497568, 0.839402,
+    0.406009, 0.360577, 0.329586, 0.124685, 0.220241, 0.193253, 0.021986,
+    0.045634, 0.310560, 0.627288, 0.135303, 0.123128, 0.634158, 0.663792,
+    0.171777, 0.174946, 0.112923, 0.160958, 0.158806, 0.624911, 0.534364,
+    0.102259, 0.959418, 0.656056, 0.965187, 0.405249, 0.569249, 0.088240,
+    0.135827, 0.066817, 0.927642, 0.541836, 0.427393, 0.257229, 0.666520,
+    0.647634, 0.450481, 0.688506, 0.693269, 0.761042, 0.315794, 0.828572,
+    0.884170, 0.949952, 0.492364, 0.055947, 0.124898, 0.605288, 0.216905,
+    0.283705, 0.230199, 0.751269, 0.385963, 0.189616, 0.407326, 0.351151,
+    0.594865, 0.976575, 0.439391, 0.730692, 0.043392, 0.367033, 0.272527,
+    0.470785, 0.624261, 0.939048, 0.118419, 0.074743, 0.627554, 0.811688,
+    0.835784, 0.943348, 0.640260, 0.719954, 0.893300, 0.132625, 0.775901,
+    0.018199, 0.737913, 0.992806, 0.301903, 0.968111, 0.744076, 0.687867,
+    0.157728, 0.151401, 0.039017, 0.752593, 0.127976, 0.478408, 0.483284,
+    0.171368, 0.845441, 0.755811, 0.642153, 0.469702, 0.694859, 0.760572,
+    0.544445, 0.322413, 0.572260, 0.380229, 0.265761, 0.212521, 0.100183,
+    0.159062, 0.345146, 0.876084, 0.177261, 0.083058, 0.868891, 0.479164,
+    0.051169, 0.612966, 0.167030, 0.208897, 0.764367, 0.206048, 0.961490,
+    0.892343, 0.684456, 0.444774, 0.063711, 0.529896, 0.200585, 0.705863,
+    0.999598, 0.895444, 0.466435, 0.544043, 0.217857, 0.038696, 0.924272,
+    0.483618, 0.251217, 0.024455, 0.642680, 0.596362, 0.900539, 0.819941,
+    0.679420, 0.769430, 0.299105, 0.730590, 0.382396, 0.466135, 0.939487,
+    0.146763, 0.672183, 0.900977, 0.039106, 0.356638, 0.345750, 0.102817,
+    0.886535, 0.546336, 0.808681, 0.886133, 0.441780, 0.275116, 0.430176,
+    0.659637, 0.313812, 0.354448, 0.143255, 0.565028, 0.378903, 0.785935,
+    0.161391, 0.279443, 0.605876, 0.840811, 0.048873, 0.904980, 0.571401,
+    0.431269, 0.371115, 0.510887, 0.578032, 0.043298, 0.411864, 0.617138,
+    0.399936, 0.757614, 0.719955, 0.286471, 0.303950, 0.528636, 0.172604,
+    0.745730, 0.803752, 0.602780, 0.405367, 0.117564, 0.957228, 0.548622,
+    0.682592, 0.336131, 0.334557, 0.843983, 0.615574, 0.940433, 0.684794,
+    0.664447, 0.845413, 0.256194, 0.095715, 0.216529, 0.767082, 0.673747,
+    0.259827, 0.178946, 0.290885, 0.659763, 0.936560, 0.010840, 0.946234,
+    0.240510, 0.539476, 0.118838, 0.986240, 0.343228, 0.721618, 0.391606,
+    0.460792, 0.678846, 0.940228, 0.143384, 0.014977, 0.274785, 0.987367,
+    0.630551, 0.215218, 0.672161, 0.294998, 0.060631, 0.928355, 0.390713,
+    0.277160, 0.695436, 0.064460, 0.536987, 0.874382, 0.355345, 0.196751,
+    0.810942, 0.366185, 0.142985, 0.051452, 0.905661, 0.261823, 0.037691,
+    0.248889, 0.983441, 0.429297, 0.709681, 0.662286, 0.369525, 0.853066,
+    0.677263, 0.644310, 0.840433, 0.307814, 0.859528, 0.512593, 0.602812,
+    0.920160, 0.440948, 0.993525, 0.197320, 0.136384, 0.057984, 0.734307,
+    0.010766, 0.413329, 0.931058, 0.821707, 0.779514, 0.074043, 0.873159,
+    0.685175, 0.335865, 0.910850, 0.934065, 0.319306, 0.340147, 0.643746,
+    0.981592, 0.709673, 0.496812, 0.658856, 0.353983, 0.337245, 0.966670,
+    0.213511, 0.849838, 0.569482, 0.133671, 0.290786, 0.563007, 0.330991,
+    0.427170, 0.620991, 0.065299, 0.437936, 0.034320, 0.996356, 0.259643,
+    0.813834, 0.070399, 0.132802, 0.499009, 0.406265, 0.043652, 0.433074,
+    0.725570, 0.383800, 0.076820, 0.707163, 0.093473, 0.573632, 0.366018,
+    0.447456, 0.910877, 0.332688, 0.660967, 0.760714, 0.902170, 0.794638,
+    0.051500, 0.465177, 0.125630, 0.478670, 0.086168, 0.190928, 0.916605,
+    0.120488, 0.187285, 0.176248, 0.934322, 0.257684, 0.309050, 0.433331,
+    0.663949, 0.352703, 0.866405, 0.389519, 0.736502, 0.943226, 0.096682,
+    0.829975, 0.516858, 0.462700, 0.277430, 0.427734, 0.795388, 0.938398,
+    0.188449, 0.697558, 0.733036, 0.239948, 0.162735, 0.858666, 0.718618,
+    0.248903, 0.049594, 0.635223, 0.369391, 0.236879, 0.811472, 0.303713,
+    0.494563, 0.120522, 0.737044, 0.158511, 0.473225, 0.603450, 0.548030,
+    0.209727, 0.546675, 0.644712, 0.039702, 0.063533, 0.107412, 0.317132,
+    0.491267, 0.902800, 0.255530, 0.679716, 0.600359, 0.988566, 0.919664,
+    0.763094, 0.847232, 0.638283, 0.011997, 0.896825, 0.273506, 0.381388,
+    0.133704, 0.084978, 0.685101, 0.628267, 0.205500, 0.422145, 0.786778,
+    0.678725, 0.025595, 0.334808, 0.888452, 0.572271, 0.979520, 0.928154,
+    0.635804, 0.086932, 0.245286, 0.127071, 0.989732, 0.500816, 0.806787,
+    0.590091, 0.489382, 0.726451, 0.353185, 0.336614, 0.364734, 0.365182,
+    0.233439, 0.638240, 0.746570, 0.367143, 0.723218, 0.431671, 0.995410,
+    0.928718, 0.853816, 0.782188, 0.607442, 0.879411, 0.116995, 0.495894,
+    0.451682, 0.096515, 0.424048, 0.087485, 0.183447, 0.669334, 0.214556,
+    0.173179, 0.170151, 0.021343, 0.763269, 0.659533, 0.747794, 0.116454,
+    0.996147, 0.112528, 0.481635, 0.229586, 0.750768, 0.228205, 0.596730,
+    0.473985, 0.659876, 0.592139, 0.402703, 0.513692, 0.374327, 0.010145,
+    0.393103, 0.491322, 0.506039, 0.844785, 0.587837, 0.930088, 0.932270,
+    0.771284, 0.599422, 0.146826, 0.944463, 0.769573, 0.168169, 0.707732,
+    0.429106, 0.915964, 0.824186, 0.425253, 0.028492, 0.305821, 0.654839,
+    0.779259, 0.534026, 0.251569, 0.253245, 0.193901, 0.843708, 0.655947,
+    0.707593, 0.218035, 0.666093, 0.100696, 0.709357, 0.172132, 0.945481,
+    0.297195, 0.102220, 0.877751, 0.068479, 0.701642, 0.024577, 0.012941,
+    0.471215, 0.192747, 0.720673, 0.900321, 0.108710, 0.544859, 0.325574,
+    0.137202, 0.850679, 0.980413, 0.916462, 0.384705, 0.231982, 0.169706,
+    0.578607, 0.075690, 0.825654, 0.286200, 0.293725, 0.491746, 0.386896,
+    0.003083, 0.663878, 0.332377, 0.300278, 0.766098, 0.210128, 0.368756,
+    0.467740, 0.234705, 0.381697, 0.938955, 0.427451, 0.102370, 0.839275,
+    0.536162, 0.647229, 0.164849, 0.673364, 0.497908, 0.145262, 0.589825,
+    0.882613, 0.377244, 0.759532, 0.461220, 0.452934, 0.585185, 0.747420,
+    0.746660, 0.076932, 0.134316, 0.749743, 0.740810, 0.466692, 0.050020,
+    0.506908, 0.676820, 0.418776, 0.974648, 0.911525, 0.800474, 0.913602,
+    0.338976, 0.902844, 0.752878, 0.875138, 0.550072, 0.917727, 0.548502,
+    0.047981, 0.062989, 0.138327, 0.930594, 0.440233, 0.897859, 0.391814,
+    0.893168, 0.483044, 0.139234, 0.639828, 0.559975, 0.273549, 0.389570,
+    0.300785, 0.740242, 0.439590, 0.807693, 0.417062, 0.858367, 0.782341,
+    0.328586, 0.658840, 0.695943, 0.667562, 0.561684, 0.448821, 0.542700,
+    0.111756, 0.366548, 0.091202, 0.159737, 0.429537, 0.229529, 0.090331,
+    0.869770, 0.127388, 0.482145, 0.762938, 0.610432, 0.621379, 0.402765,
+    0.170407, 0.894928, 0.792336, 0.471192, 0.635170, 0.231926, 0.278886,
+    0.052232, 0.090293, 0.061226, 0.380818, 0.749133, 0.757170, 0.048380,
+    0.310817, 0.205990, 0.591080, 0.422573, 0.572538, 0.682282, 0.582310,
+    0.002075, 0.911812, 0.672641, 0.871845, 0.039199, 0.154786, 0.634783,
+    0.649631, 0.776165, 0.037548, 0.820038, 0.671093, 0.829884, 0.291231,
+    0.306263, 0.061810, 0.570116, 0.358495, 0.152103, 0.631343, 0.739313,
+    0.901236, 0.388512, 0.787693, 0.212053, 0.594503, 0.378773, 0.634626,
+    0.167040, 0.061056, 0.216937, 0.169115, 0.972867, 0.889578, 0.040960,
+    0.012067, 0.044364, 0.675743, 0.661698, 0.820529, 0.713291, 0.481736,
+    0.491623, 0.543175, 0.772966, 0.797886, 0.604985, 0.343083, 0.156380,
+    0.757088, 0.974425, 0.895693, 0.658324, 0.362938, 0.683386, 0.870376,
+    0.957440, 0.062159, 0.505002, 0.124481, 0.123215, 0.721939, 0.293596,
+    0.096082, 0.611517, 0.334556, 0.108149, 0.655881, 0.010299, 0.769846,
+    0.476411, 0.723590, 0.251582, 0.968033, 0.266765, 0.024548, 0.765919,
+    0.871750, 0.367631, 0.922299, 0.628838, 0.342056, 0.817992, 0.287162,
+    0.704994, 0.501378, 0.157538, 0.662434, 0.563537, 0.662541, 0.786915,
+    0.686752, 0.384480, 0.080511, 0.782834, 0.995997, 0.415067, 0.890983,
+    0.651878, 0.425365, 0.660829, 0.128289, 0.148956, 0.912411, 0.096322,
+    0.415721, 0.936959, 0.862241, 0.287471, 0.304590, 0.784540, 0.916309,
+    0.646646, 0.602533, 0.203471, 0.351640, 0.103911, 0.361009, 0.014074,
+    0.667448, 0.023550, 0.800989, 0.354200, 0.408030, 0.881500, 0.137034,
+    0.404026, 0.296566, 0.028017, 0.055904, 0.721932, 0.688846, 0.184193,
+    0.870887, 0.601257, 0.280515, 0.286608, 0.538216, 0.142755, 0.574079,
+    0.842806, 0.927296, 0.490388, 0.489452, 0.529828, 0.693859, 0.841092,
+    0.633739, 0.054869, 0.855167, 0.301187, 0.078419, 0.656156, 0.655388,
+    0.486448, 0.537656, 0.792422, 0.890475, 0.834222, 0.820439, 0.946379,
+    0.556153, 0.509285, 0.130571, 0.427041, 0.110542, 0.411086, 0.713648,
+    0.648758, 0.553842, 0.287727, 0.491563, 0.481137, 0.778116, 0.981015,
+    0.010966, 0.471975, 0.822107, 0.644705, 0.526844, 0.677274, 0.945892,
+    0.605263, 0.333430, 0.601280, 0.091711, 0.871086, 0.393702, 0.982186,
+    0.705307, 0.214141, 0.928564, 0.261461, 0.723426, 0.059136, 0.688501,
+    0.833968, 0.470222, 0.402150, 0.482725, 0.024063, 0.689877, 0.974289,
+    0.505201, 0.467993, 0.955304, 0.516166, 0.939968, 0.777411, 0.160871,
+    0.466812, 0.454685, 0.106763, 0.072075, 0.788115, 0.708043, 0.163786,
+    0.659201, 0.101744, 0.145971, 0.364508, 0.315885, 0.074536, 0.625969,
+    0.039311, 0.133672, 0.314471, 0.873279, 0.603893, 0.716620, 0.356004,
+    0.627957, 0.406498, 0.330292, 0.133157, 0.874490, 0.285596, 0.649324,
+    0.814458, 0.063007, 0.810195, 0.281270, 0.517693, 0.916958, 0.353345,
+    0.305808, 0.625000, 0.517131, 0.965009, 0.726745, 0.663102, 0.329518,
+    0.042630, 0.737638, 0.955487, 0.081940, 0.871310, 0.269957, 0.955219,
+    0.475203, 0.986578, 0.311223, 0.103160, 0.393075, 0.641515, 0.236317,
+    0.267566, 0.927112, 0.885641, 0.082024, 0.990119, 0.695835, 0.363295,
+    0.507812, 0.612793, 0.716640, 0.813620, 0.237793, 0.233770, 0.778629,
+    0.964538, 0.896872, 0.108147, 0.007167, 0.634510, 0.063633, 0.089108,
+    0.505820, 0.333591, 0.044327, 0.981023, 0.320168, 0.355550, 0.084182,
+    0.713244, 0.997065, 0.320499, 0.980810, 0.924177, 0.206140, 0.062834,
+    0.914296, 0.901975, 0.426129, 0.422107, 0.514768, 0.142768, 0.235727,
+    0.752561, 0.376539, 0.014356, 0.717099, 0.273411, 0.122502, 0.724266,
+    0.907921, 0.186136, 0.813374, 0.413741, 0.519726, 0.857701, 0.394764,
+    0.839895, 0.213251, 0.478946, 0.553139, 0.210317, 0.799446, 0.533948,
+    0.134493, 0.005586, 0.596782, 0.048789, 0.907561, 0.022911, 0.470896,
+    0.422329, 0.165679, 0.706623, 0.174890, 0.542218, 0.720979, 0.891989,
+    0.815629, 0.843481, 0.616255, 0.723551, 0.029617, 0.429630, 0.137292,
+    0.549343, 0.287331, 0.532056, 0.389238, 0.500583, 0.011002, 0.942377,
+    0.710899, 0.810448, 0.476326, 0.845392, 0.816033, 0.073108, 0.894181,
+    0.723594, 0.096019, 0.365077, 0.145923, 0.261699, 0.071700, 0.320813,
+    0.803917, 0.792679, 0.212802, 0.619546, 0.636160, 0.829057, 0.343096,
+    0.665777, 0.258687, 0.480388, 0.215121, 0.546018, 0.012444, 0.604359,
+    0.046601, 0.023446, 0.546736, 0.757500, 0.833893, 0.023062, 0.602892,
+    0.649927, 0.096170, 0.497074, 0.373521, 0.192189, 0.862151, 0.519444,
+    0.453887, 0.933851, 0.840257, 0.257804, 0.726531, 0.053058, 0.877350,
+    0.362691, 0.882115, 0.220446, 0.028468, 0.140802, 0.700834, 0.243589,
+    0.686821, 0.713278, 0.847948, 0.733421, 0.736723, 0.394684, 0.490921,
+    0.570617, 0.417746, 0.093813, 0.220543, 0.513916, 0.590887, 0.594064,
+    0.706105, 0.453038, 0.113508, 0.159992, 0.386889, 0.953765, 0.417796,
+    0.113420, 0.006823, 0.295146, 0.476111, 0.888938, 0.515592, 0.504579,
+    0.029741, 0.216426, 0.748168, 0.716561, 0.929703, 0.596117, 0.449982,
+    0.666427, 0.990801, 0.940903, 0.237043, 0.408547, 0.034717, 0.457587,
+    0.922463, 0.625603, 0.051651, 0.628568, 0.078641, 0.165159, 0.788560,
+    0.465530, 0.118923, 0.206356, 0.578950, 0.125746, 0.501502, 0.055060,
+    0.014685, 0.017094, 0.559640, 0.044425, 0.233519, 0.307808, 0.760986,
+    0.163223, 0.903925, 0.210969, 0.829650, 0.894726, 0.151872, 0.066693,
+    0.303273, 0.186589, 0.524279, 0.225736, 0.812192, 0.575930, 0.854304,
+    0.890833, 0.741089, 0.642864, 0.356363, 0.860012, 0.849220, 0.935313,
+    0.985758, 0.350722, 0.990373, 0.000443, 0.367815, 0.550013, 0.044868,
+    0.601335, 0.857820, 0.805855, 0.764557, 0.761745, 0.016823, 0.594207,
+    0.656471, 0.168696, 0.660900, 0.959744, 0.355284, 0.185179, 0.185480,
+    0.167477, 0.761110, 0.039784, 0.058310, 0.502199, 0.682648, 0.414673,
+    0.362211, 0.531868, 0.349985, 0.347969, 0.882589, 0.340358, 0.348412,
+    0.250404, 0.890371, 0.393280, 0.851739, 0.748191, 0.199135, 0.616297,
+    0.509936, 0.215958, 0.210504, 0.166407, 0.384654, 0.871404, 0.126151,
+    0.739938, 0.056583, 0.311631, 0.907415, 0.817693, 0.351415, 0.965724,
+    0.319891, 0.034062, 0.380397, 0.682102, 0.565930, 0.730382, 0.030072,
+    0.448519, 0.070741, 0.378484, 0.698924, 0.961112, 0.771764, 0.550663,
+    0.709303, 0.970899, 0.166959, 0.219239, 0.186857, 0.377463, 0.385647,
+    0.571511, 0.248867, 0.511798, 0.311449, 0.305450, 0.823429, 0.218864,
+    0.123142, 0.174844, 0.184588, 0.443034, 0.208906, 0.564986, 0.125136,
+    0.774836, 0.295368, 0.155207, 0.223355, 0.366109, 0.533691, 0.922279,
+    0.327221, 0.305455, 0.472942, 0.036524, 0.276354, 0.639901, 0.255763,
+    0.463211, 0.017364, 0.641410, 0.034722, 0.266231, 0.153207, 0.346171,
+    0.571680, 0.976636, 0.565036, 0.694822, 0.151480, 0.749624, 0.137856,
+    0.360386, 0.314610, 0.262992, 0.135222, 0.609978, 0.418200, 0.358578,
+    0.976087, 0.951891, 0.280856, 0.303307, 0.257346, 0.753798, 0.339831,
+    0.533700, 0.393699, 0.595594, 0.996911, 0.411063, 0.237003, 0.031634,
+    0.677294, 0.390211, 0.377805, 0.248974, 0.366847, 0.942841, 0.943796,
+    0.518327, 0.692465, 0.081653, 0.878713, 0.007074, 0.344645, 0.013936,
+    0.617052, 0.762845, 0.372513, 0.593138, 0.714736, 0.653370, 0.896446,
+    0.972082, 0.407168, 0.236276, 0.505782, 0.800867, 0.831870, 0.502693,
+    0.211930, 0.068873, 0.534327, 0.889224, 0.459084, 0.912132, 0.138197,
+    0.825931, 0.854972, 0.081994, 0.344259, 0.547437, 0.163646, 0.222972,
+    0.554511, 0.508291, 0.236908, 0.171563, 0.271135, 0.609421, 0.764701,
+    0.985871, 0.262790, 0.661147, 0.957953, 0.669958, 0.897423, 0.463734,
+    0.470825, 0.729293, 0.966427, 0.682755, 0.798166, 0.500754, 0.571978,
+    0.257251, 0.412886, 0.710176, 0.083182, 0.267858, 0.792169, 0.427441,
+    0.815295, 0.955815, 0.650413, 0.369805, 0.464106, 0.887320, 0.541368,
+    0.735242, 0.496741, 0.306069, 0.721113, 0.759531, 0.967216, 0.679065,
+    0.429489, 0.864639, 0.142799, 0.900314, 0.593932, 0.109227, 0.583069,
+    0.392098, 0.609981, 0.155047, 0.649349, 0.022867, 0.865222, 0.732531,
+    0.290725, 0.657392, 0.159972, 0.106019, 0.613207, 0.810384, 0.475824,
+    0.077313, 0.697704, 0.017192, 0.812555};
+
+static float golden_endtoend_output[] = {
+    -1.881211, -0.028385, -3.585066, 1.939770,  -3.461155, 1.280415,  -4.408978,
+    0.608663,  -2.704937, 1.859742,  -5.777429, 2.691839,  -1.049012, 1.640870,
+    -4.856245, 1.604236,  0.992707,  0.422858,  -4.307465, 1.887332,  -0.884831,
+    -0.154277, -2.634801, 0.586827,  -1.849960, 1.399608,  -4.531559, 1.943591,
+    0.271676,  -2.893054, -2.066826, 0.235467,  -1.248263, -1.164534, -2.640174,
+    -0.112878, -4.386484, 1.253024,  -4.135623, 1.068984,  -0.043579, -0.832957,
+    -3.257258, -0.514396, -1.651174, 0.638630,  -4.364372, 1.548441,  -0.289455,
+    0.539845,  -4.097627, 0.635001,  -0.465071, -0.927701, -2.481498, 0.356616,
+    -2.355012, 0.728806,  -3.340283, 1.609038,  -4.786268, -0.532272, -1.886150,
+    0.254797,  0.746620,  -1.657134, -3.264265, 0.525551,  -1.756837, 0.845446,
+    -5.572190, 1.715797,  -2.856942, 3.394245,  -5.803662, 2.281806,  -3.014739,
+    2.616136,  -4.728482, 1.659984,  -2.106307, 2.711709,  -6.173832, 1.352869,
+    -0.038035, 0.107619,  -4.279774, 2.341930,  -0.980413, -0.119538, -4.049717,
+    1.172128,  -3.477744, 2.602274,  -6.231380, 2.537300,  -0.862214, 0.568722,
+    -3.858362, 0.197867,  -1.725885, 3.687312,  -7.067363, 2.403544,  -0.944963,
+    0.235639,  -3.250094, 0.659117,  -1.459576, 0.426128,  -3.637207, 1.030386,
+    -4.224351, 3.516220,  -6.053367, 0.993473,  -2.182416, -0.762625, -1.884405,
+    -0.113736, -2.572602, 0.329290,  -1.913233, 0.517418,  -0.019757, 0.203176,
+    -3.715881, 0.482136,  -1.912823, 1.357907,  -5.473043, 1.714658,  -3.177160,
+    0.089285,  -3.127669, 1.268076,  0.772498,  -1.622712, -3.850314, 0.436124,
+    -1.495983, 3.439982,  -7.623405, 1.726721,  -0.423979, 0.180201,  -2.902406,
+    0.986457,  -1.845638, 0.460903,  -5.359343, -1.133931, -1.074456, 0.717304,
+    -3.519856, 1.012126,  -0.562301, 1.881967,  -6.716627, 2.525036,  0.945480,
+    0.337081,  -5.210562, 2.572035,  -0.943370, 0.442026,  -2.666313, 0.411296,
+    0.002787,  -0.000735, -2.498933, 0.771719,  -3.568153, 3.833721,  -6.617026,
+    2.813922,  -0.573970, 1.025208,  -3.909923, 1.722648,  -1.406849, 0.719783,
+    -5.207438, 1.819442,  -0.530895, -0.010887, -2.939614, 0.971225,  -1.660297,
+    1.345243,  -4.454571, 2.244876,  -2.021213, 1.756090,  -4.880947, 0.364597,
+    -2.380270, 2.763117,  -5.613013, 2.137534,  0.289101,  -2.279400, -3.365582,
+    0.170028,  -1.142254, -0.709604, -3.656223, 1.804870,  -0.854690, 0.592102,
+    -5.010415, 2.462687,  -1.474710, 0.566002,  -3.621819, -0.391946, -0.423524,
+    -0.631428, -3.513310, 0.962825,  -1.480262, 0.319791,  -3.610137, 1.842339,
+    -0.250073, 1.182022,  -6.249267, 1.604172,  1.153759,  -0.734054, -4.620415,
+    -0.030858, 0.050911,  1.524406,  -4.724010, 1.451846,  -3.277104, 2.414182,
+    -4.605285, 1.846092,  -1.503047, -0.618200, -2.746546, -0.459332, -0.980326,
+    -1.199977, -2.043865, -0.165793, -2.214698, 3.108281,  -7.127830, -0.123065,
+    1.244948,  -3.039923, -4.660061, -0.225957, -0.307210, -1.513205, -2.456005,
+    0.840048,  -0.741445, 2.328635,  -6.015267, 2.723240,  -1.381171, -0.728878,
+    -5.114925, -0.362034, -0.574923, 0.518080,  -3.892457, 1.798948,  0.435119,
+    -0.371696, -2.807571, 1.302864,  -2.063052, 1.036388,  -4.232038, 1.397059,
+    -1.615668, -1.511019, -3.095508, 1.290955,  -3.428723, 2.000287,  -4.196487,
+    1.566983,  0.196957,  0.224343,  -4.926359, -0.691975, -0.214941, 1.546821,
+    -5.384868, 2.290820,  -1.878865, 0.493692,  -4.129823, 2.112036,  0.516558,
+    -2.553077, -2.717338, 0.017146,  -2.016057, 1.628995,  -4.240602, 1.189533,
+    -5.460220, 1.254738,  -4.214903, 0.755659,  -2.893235, 2.937762,  -6.169453,
+    2.035456,  -5.613212, -0.122254, -1.973646, -0.060619, -2.119598, 1.413512,
+    -4.938738, 1.890244,  0.544169,  -2.062413, -3.329637, -0.062515, -1.855805,
+    -0.791297, -2.570353, 0.607615,  0.305812,  0.338930,  -4.150270, 2.274937,
+    0.042653,  0.133825,  -3.538155, 1.523639,  -3.173690, -1.496599, -2.414655,
+    0.464687,  -1.448998, -0.368907, -3.520129, 0.203382,  -2.443626, 1.266233,
+    -3.393848, 0.605911,  -0.015353, 1.402006,  -4.441003, 1.419281,  0.603587,
+    0.434146,  -4.966566, 2.171872,  -0.688264, -0.009981, -4.461103, 1.538354,
+    -5.029816, -0.264424, -1.713510, -0.315258, -1.891606, 0.252074,  -2.419428,
+    0.043970,  -1.291143, 2.048704,  -4.590105, 0.524734,  -1.889576, 0.134836,
+    -3.462745, 1.390663,  -0.112773, 0.402735,  -4.203784, 1.381043,  -1.201634,
+    -1.968277, -1.425637, -0.181725, -1.250742, -2.102041, -3.925464, -1.256797,
+    -3.701354, -1.754610, -1.917231, -1.455910, -1.838006, 2.041781,  -5.666212,
+    2.752957,  -2.659553, 2.553637,  -4.872212, 1.443437,  -2.081846, 3.311263,
+    -5.912457, 1.871049,  0.196148,  -0.307044, -4.024967, 2.149149,  0.361809,
+    0.620415,  -5.939984, 0.180672,  -1.209180, -0.269122, -3.240285, 1.460315,
+    -1.040803, 1.125700,  -6.060366, 0.887767,  -3.214111, 1.314368,  -3.026808,
+    1.023640,  -3.815175, 1.795642,  -4.355603, 1.064454,  -0.046472, 0.618463,
+    -5.941646, 2.861891,  -2.852155, -0.990457, -2.624445, 1.794494,  -1.176747,
+    -0.358159, -3.206776, 1.138721,  -2.819523, -1.825522, -1.450902, -0.187312,
+    -0.808727, 0.636872,  -4.120567, 1.192623,  0.810731,  -1.768519, -3.699450,
+    1.527116,  -2.772720, 3.012835,  -5.912736, 1.599365,  -4.696381, 2.234591,
+    -4.139552, 1.061768,  -1.880089, 3.596274,  -7.006379, 2.382152,  -3.158115,
+    3.844430,  -7.044156, 2.307596,  -2.473970, 1.312644,  -5.467269, 0.197154,
+    -1.530040, 1.762275,  -5.550757, 0.630276,  -3.048947, 1.043777,  -3.096658,
+    1.345893,  -1.329494, 2.065748,  -4.711032, 2.227600,  -0.413321, -0.032428,
+    -4.599650, 1.668734,  -4.351490, -0.200022, -2.359903, 0.021997,  0.116028,
+    1.159718,  -5.093972, -0.142951, -2.409895, 0.906133,  -2.728812, 0.809932,
+    -2.597363, 0.494130,  -2.357861, 0.369825,  -2.165235, 1.148522,  -3.130562,
+    0.759034,  0.646335,  -1.463660, -3.508299, 1.059679,  -1.485465, 1.007319,
+    -4.340716, 1.789864,  -1.590654, 1.612324,  -4.452007, 2.389805,  -5.200148,
+    -1.068398, -1.306923, -0.472408, -0.392165, -0.524996, -2.933478, 1.518430,
+    -1.287781, 0.113422,  -3.020525, 1.338359,  -0.105982, 0.936014,  -4.132197,
+    1.836807,  -0.616589, -1.029716, -3.271347, 0.284889,  -2.653359, 2.135829,
+    -4.643613, 1.627981,  0.287733,  -2.017263, -2.776574, 1.184792,  1.004161,
+    -1.483019, -4.339290, -0.787322, 0.582420,  1.137839,  -5.673941, -0.001862,
+    -1.219142, 0.532561,  -4.457245, 1.826807,  -3.343291, 3.034610,  -6.179855,
+    2.235917,  -4.369989, 4.018128,  -6.632714, 0.926585,  -0.485469, 0.536073,
+    -4.179557, 1.489637,  -0.521762, 1.636089,  -6.137912, 1.500867,  -4.086009,
+    1.961372,  -3.688977, 1.358220,  -1.544034, 1.763837,  -4.357567, 1.852201,
+    -2.018725, 1.046264,  -6.211127, 1.609419,  -0.118441, 1.602284,  -6.242423,
+    1.518578,  -0.604078, 1.106613,  -5.393445, 2.595629,  0.142712,  -1.903953,
+    -2.821177, 0.032758,  -0.009152, 0.184628,  -4.227636, 2.046843,  -2.240138,
+    1.256176,  -5.108516, -0.308447, -2.998571, 4.657396,  -7.582112, 2.510951,
+    -3.535784, 1.704560,  -5.068484, 1.318466,  -3.058265, 3.073172,  -6.998089,
+    3.178849,  -2.420286, 2.277806,  -4.999528, 1.423890,  -1.672914, 0.447460,
+    -4.088940, 1.351087,  -1.051546, -0.417955, -4.042147, 1.604102,  -1.700931,
+    2.796663,  -6.497579, 2.857974,  -0.240828, 0.858001,  -5.778933, 2.778508,
+    -0.406211, 1.300766,  -5.073671, 2.089362,  -0.201673, 1.588396,  -6.000150,
+    2.185055,  -2.332125, 0.768216,  -2.609184, 0.327277,  -3.358943, -1.020736,
+    -2.389984, 0.315512,  -0.561905, 1.948740,  -6.408485, 2.231985,  -0.603652,
+    0.661829,  -5.070386, -1.063058, -0.624796, 1.375772,  -4.379606, 1.929358,
+    -1.047263, 0.739100,  -5.217857, 2.127625,  -5.025338, 0.650344,  -2.068460,
+    0.076936,  -0.457505, -1.050984, -1.917765, 1.150908,  0.782625,  0.855595,
+    -5.321719, 0.787209,  -0.460232, 1.106736,  -5.552326, 2.801043,  -0.360217,
+    -0.434432, -4.273378, 0.967556,  -0.972652, 0.874811,  -5.429918, -0.331039,
+    0.115477,  0.111883,  -5.418786, 1.240546,  -1.842794, 0.505880,  -3.676064,
+    -0.682369, 1.858984,  -0.742566, -5.784060, 0.673239,  -1.280398, 0.280842,
+    -4.848077, 2.214860,  -0.785100, -0.588488, -2.438206, 0.786651,  -1.568752,
+    1.935400,  -6.320256, 2.125338,  -1.476457, -1.651941, -2.695734, 0.007338,
+    -3.280860, 2.310385,  -5.319578, 1.890123,  -0.775723, 0.630606,  -4.321582,
+    1.085521,  -1.847371, 1.188521,  -4.596577, 2.056443,  -2.340172, -0.108501,
+    -3.156392, 0.933279,  -0.495331, 0.122405,  -5.171133, 1.763245,  -0.796913,
+    2.310487,  -7.247197, 2.401678,  -1.908860, 0.043798,  -2.393796, 0.573806,
+    -0.608531, 0.154710,  -4.669001, 0.750680,  0.468380,  0.392591,  -4.755001,
+    2.615217,  -1.957774, 1.153513,  -4.530099, 1.124362,  -3.569415, 1.697154,
+    -3.536335, 0.910758,  -2.976264, 1.833129,  -4.287203, -0.547050, -2.409768,
+    0.061585,  -1.324116, 0.268497,  -2.962222, -1.524245, -2.063413, 0.442058,
+    -4.292337, 3.538863,  -6.699603, 1.718664,  -2.290363, 1.994596,  -6.245037,
+    -0.433084, -0.367059, 1.020297,  -4.940721, 2.902264,  -0.577056, -0.709887,
+    -5.001413, -0.268316, -1.112048, -1.083307, -1.753492, 0.209973,  0.139540,
+    0.917602,  -5.232745, 2.538467,  -2.139234, -0.187388, -1.837249, -0.478582,
+    -0.731653, -0.481550, -2.531261, 1.044770,  0.707750,  0.279971,  -3.221119,
+    1.552074,  -2.373144, 0.859518,  -3.665156, 1.620278,  -1.440871, -0.525581,
+    -2.758271, 1.491873,  -2.302013, 1.119935,  -5.257080, 2.627170,  -3.174739,
+    1.363282,  -4.831639, 1.101076,  -4.337008, 2.689639,  -5.165915, 1.069201,
+    -1.882078, -0.120370, -2.287967, 1.147619,  -1.403616, 1.077150,  -5.084296,
+    1.658236,  -0.919642, 0.487423,  -3.001075, 0.741268,  0.107300,  0.943556,
+    -3.544311, 1.000239,  -1.627171, 2.871253,  -5.179172, 1.429893,  -0.826040,
+    0.188670,  -4.499894, 1.013447,  -2.101299, 0.317516,  -3.452141, -0.833776,
+    -1.362144, 1.272437,  -4.449355, 1.613591,  -2.039873, 2.613175,  -6.229640,
+    1.659790,  -1.595520, -0.237462, -2.744997, 0.337841,  0.148981,  -1.703771,
+    -2.388023, 1.276469,  1.058508,  -0.401642, -4.680769, 0.861881,  -1.336381,
+    1.153080,  -2.834378, 0.721075,  0.900115,  1.360511,  -5.573611, 0.949182,
+    -2.970844, 2.017563,  -5.186108, -0.201038, -1.192824, 0.610142,  -4.450919,
+    -0.897114, -1.812093, 0.422310,  -5.245487, 0.256549,  0.320275,  -2.324150,
+    -2.967040, -0.260536, -0.721467, 0.454148,  -5.058031, 0.526370,  -0.895656,
+    0.732240,  -3.327363, 1.353953,  -1.277912, -0.483171, -1.926713, 0.065044,
+    -2.167506, -0.196606, -1.923437, 0.604962,  -2.088319, 1.406834,  -5.227296,
+    2.247351,  -4.421744, 1.729791,  -5.007922, 1.264769,  -0.897019, 0.922902,
+    -3.887108, 2.087432,  -1.310226, -0.101938, -3.359082, -0.079662, -0.514988,
+    -0.963179, -4.038209, 2.223278,  -0.590083, -2.310458, -1.748338, 0.363406,
+    -0.540731, -0.885913, -4.179595, 2.216781,  -3.044339, -0.447100, -2.446098,
+    0.931101,  -1.676190, 2.096175,  -4.980755, 2.262151,  -1.095047, 1.897516,
+    -5.996138, 2.191038,  0.297128,  -0.780974, -2.884299, 1.195408,  -0.521065,
+    -1.955837, -3.091064, -0.404183, -1.961519, 4.076096,  -7.521851, 2.242064,
+    -1.988043, 0.303300,  -2.422585, 0.322230,  -3.377634, 3.499955,  -7.084434,
+    2.375587,  -0.718851, 2.150076,  -5.412241, 2.374280,  -2.006088, 2.229828,
+    -5.848188, 2.543077,  -2.171042, 2.096026,  -5.300007, 0.141405,  -1.187745,
+    0.105340,  -4.003816, 1.034281,  -3.980804, 1.856709,  -5.103042, 0.623737,
+    -2.080307, 0.896140,  -3.104050, 0.983158,  -0.424898, -1.154270, -3.805728,
+    1.978917,  -1.314387, 1.235096,  -3.148906, 1.113173,  0.111713,  2.055213,
+    -7.565283, 2.100342};
+constexpr std::initializer_list<float> biases = {
+    0.065691948, -0.69055247, 0.1107955,  -0.97084129, -0.23957068, -0.23566568,
+    -0.389184,   0.47481549,  -0.4791103, 0.29931796,  0.10463274,  0.83918178,
+    0.37197268,  0.61957061,  0.3956964,  -0.37609905};
+
+constexpr std::initializer_list<float> recurrent_weights = {
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0.1};
+
+class BidirectionalRNNOpModel : public SingleOpModel {
+ public:
+  BidirectionalRNNOpModel(int batches, int sequence_len, int fw_units,
+                          int bw_units, int input_size)
+      : batches_(batches),
+        sequence_len_(sequence_len),
+        fw_units_(fw_units),
+        bw_units_(bw_units),
+        input_size_(input_size) {
+    input_ = AddInput(TensorType_FLOAT32);
+    fw_weights_ = AddInput(TensorType_FLOAT32);
+    fw_recurrent_weights_ = AddInput(TensorType_FLOAT32);
+    fw_bias_ = AddInput(TensorType_FLOAT32);
+    fw_hidden_state_ = AddOutput(TensorType_FLOAT32);
+    fw_output_ = AddOutput(TensorType_FLOAT32);
+    bw_weights_ = AddInput(TensorType_FLOAT32);
+    bw_recurrent_weights_ = AddInput(TensorType_FLOAT32);
+    bw_bias_ = AddInput(TensorType_FLOAT32);
+    bw_hidden_state_ = AddOutput(TensorType_FLOAT32);
+    bw_output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN,
+                 BuiltinOptions_SequenceRNNOptions,
+                 CreateSequenceRNNOptions(builder_, /*time_major=*/false,
+                                          ActivationFunctionType_RELU)
+                     .Union());
+    BuildInterpreter({
+        {batches_, sequence_len_, input_size_},  // input
+        {fw_units_, input_size_},                // fw_weights
+        {fw_units_, fw_units_},                  // fw_recurrent_weights
+        {fw_units_},                             // fw_bias
+        {bw_units_, input_size_},                // bw_weights
+        {bw_units_, bw_units_},                  // bw_recurrent_weights
+        {bw_units_}                              // bw_bias
+    });
+  }
+
+  void SetFwBias(std::initializer_list<float> f) {
+    PopulateTensor(fw_bias_, f);
+  }
+
+  void SetBwBias(std::initializer_list<float> f) {
+    PopulateTensor(bw_bias_, f);
+  }
+
+  void SetFwWeights(std::initializer_list<float> f) {
+    PopulateTensor(fw_weights_, f);
+  }
+
+  void SetBwWeights(std::initializer_list<float> f) {
+    PopulateTensor(bw_weights_, f);
+  }
+
+  void SetFwRecurrentWeights(std::initializer_list<float> f) {
+    PopulateTensor(fw_recurrent_weights_, f);
+  }
+
+  void SetBwRecurrentWeights(std::initializer_list<float> f) {
+    PopulateTensor(bw_recurrent_weights_, f);
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+
+  void SetInput(int offset, float* begin, float* end) {
+    PopulateTensor(input_, offset, begin, end);
+  }
+
+  void ResetHiddenStates() {
+    const int fw_zero_buffer_size = fw_units_ * batches_;
+    std::unique_ptr<float[]> fw_zero_buffer(new float[fw_zero_buffer_size]);
+    memset(fw_zero_buffer.get(), 0, fw_zero_buffer_size * sizeof(float));
+    PopulateTensor(fw_hidden_state_, 0, fw_zero_buffer.get(),
+                   fw_zero_buffer.get() + fw_zero_buffer_size);
+    const int bw_zero_buffer_size = bw_units_ * batches_;
+    std::unique_ptr<float[]> bw_zero_buffer(new float[bw_zero_buffer_size]);
+    memset(bw_zero_buffer.get(), 0, bw_zero_buffer_size * sizeof(float));
+    PopulateTensor(bw_hidden_state_, 0, bw_zero_buffer.get(),
+                   bw_zero_buffer.get() + bw_zero_buffer_size);
+  }
+
+  std::vector<float> GetFwOutput() { return ExtractVector<float>(fw_output_); }
+  std::vector<float> GetBwOutput() { return ExtractVector<float>(bw_output_); }
+
+  int input_size() { return input_size_; }
+  int num_fw_units() { return fw_units_; }
+  int num_bw_units() { return bw_units_; }
+  int num_batches() { return batches_; }
+  int sequence_len() { return sequence_len_; }
+
+ private:
+  int input_;
+  int fw_weights_;
+  int fw_recurrent_weights_;
+  int fw_bias_;
+  int fw_hidden_state_;
+  int fw_output_;
+  int bw_weights_;
+  int bw_recurrent_weights_;
+  int bw_bias_;
+  int bw_hidden_state_;
+  int bw_output_;
+
+  int batches_;
+  int sequence_len_;
+  int fw_units_;
+  int bw_units_;
+  int input_size_;
+};
+
+// TODO(mirkov): add another test which directly compares to TF once TOCO
+// supports the conversion from dynamic_rnn with BasicRNNCell.
+TEST(BidirectionalRNNOpTest, BlackBoxTest) {
+  BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                              /*fw_units=*/16, /*bw_units=*/16,
+                              /*input_size=*/8);
+  rnn.SetFwWeights(weights);
+  rnn.SetBwWeights(weights);
+  rnn.SetFwBias(biases);
+  rnn.SetBwBias(biases);
+  rnn.SetFwRecurrentWeights(recurrent_weights);
+  rnn.SetBwRecurrentWeights(recurrent_weights);
+
+  rnn.ResetHiddenStates();
+  const int input_sequence_size = rnn.input_size() * rnn.sequence_len();
+  float* batch_start = rnn_input;
+  float* batch_end = batch_start + input_sequence_size;
+  rnn.SetInput(0, batch_start, batch_end);
+  rnn.SetInput(input_sequence_size, batch_start, batch_end);
+
+  rnn.Invoke();
+
+  float* golden_fw_start = rnn_golden_fw_output;
+  float* golden_fw_end =
+      golden_fw_start + rnn.num_fw_units() * rnn.sequence_len();
+  std::vector<float> fw_expected;
+  fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end);
+  fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end);
+  EXPECT_THAT(rnn.GetFwOutput(), ElementsAreArray(ArrayFloatNear(fw_expected)));
+
+  float* golden_bw_start = rnn_golden_bw_output;
+  float* golden_bw_end =
+      golden_bw_start + rnn.num_bw_units() * rnn.sequence_len();
+  std::vector<float> bw_expected;
+  bw_expected.insert(bw_expected.end(), golden_bw_start, golden_bw_end);
+  bw_expected.insert(bw_expected.end(), golden_bw_start, golden_bw_end);
+  EXPECT_THAT(rnn.GetBwOutput(), ElementsAreArray(ArrayFloatNear(bw_expected)));
+}
+
+// Check that if the input sequence is reversed the outputs are the same just
+// forward and backward are swapped (and reversed).
+TEST(BidirectionalRNNOpTest, BlackBoxTestReverseInputs) {
+  BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                              /*fw_units=*/16, /*bw_units=*/16,
+                              /*input_size=*/8);
+  rnn.SetFwWeights(weights);
+  rnn.SetBwWeights(weights);
+  rnn.SetFwBias(biases);
+  rnn.SetBwBias(biases);
+  rnn.SetFwRecurrentWeights(recurrent_weights);
+  rnn.SetBwRecurrentWeights(recurrent_weights);
+
+  rnn.ResetHiddenStates();
+
+  // Reverse inputs in each batch: in_1, in_2,..., in_k is inserted in the
+  // following order: [in_k,..., in_2, in_1, in_k,...,in_2, in_1].
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    const int reverse_idx = rnn.sequence_len() - i - 1;
+    rnn.SetInput(reverse_idx * rnn.input_size(), batch_start, batch_end);
+    rnn.SetInput((rnn.sequence_len() + reverse_idx) * rnn.input_size(),
+                 batch_start, batch_end);
+  }
+
+  rnn.Invoke();
+
+  // The forward and backward outputs are swapped.
+  std::vector<float> fw_expected;  // consider using std::deque instead.
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* golden_fw_start = rnn_golden_bw_output + i * rnn.num_fw_units();
+    float* golden_fw_end = golden_fw_start + rnn.num_fw_units();
+    fw_expected.insert(fw_expected.begin(), golden_fw_start, golden_fw_end);
+  }
+  fw_expected.insert(fw_expected.end(), fw_expected.begin(), fw_expected.end());
+  EXPECT_THAT(rnn.GetFwOutput(), ElementsAreArray(ArrayFloatNear(fw_expected)));
+
+  std::vector<float> bw_expected;
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* golden_bw_start = rnn_golden_fw_output + i * rnn.num_bw_units();
+    float* golden_bw_end = golden_bw_start + rnn.num_bw_units();
+    bw_expected.insert(bw_expected.begin(), golden_bw_start, golden_bw_end);
+  }
+  bw_expected.insert(bw_expected.end(), bw_expected.begin(), bw_expected.end());
+  EXPECT_THAT(rnn.GetBwOutput(), ElementsAreArray(ArrayFloatNear(bw_expected)));
+}
+
+// Tests an end-to-end neural network with a Bidirectional RNN followed by a
+// DNN that aggregates the outputs from the two sequences.
+TEST(BidirectionalRNNOpTest, EndToEndTest) {
+  BidirectionalRNNOpModel rnn(/*batches=*/1, /*sequence_len=*/4,
+                              /*fw_units=*/16, /*bw_units=*/16,
+                              /*input_size=*/8);
+  const int output_size = 4;
+  float dnn_weights[] = {
+      -0.5782342,  -0.052212059, 0.73036242,  -0.81216097, -0.80088139,
+      -0.23420811, -0.39647382,  0.31423986,  0.61819065,  -0.73659575,
+      -0.89698344, -0.8931554,   -0.0845688,  0.5617367,   0.38415289,
+      -0.11487955, -0.7617774,   0.17927337,  0.15726972,  0.059798479,
+      0.19009054,  -0.27616632,  -0.39142907, 0.77744663,  -0.046830714,
+      -0.6603595,  0.21945822,   0.051494241, 0.23785079,  0.19239247,
+      -0.53268754, 0.65961659,   -0.85981959, -0.80232513, 0.84745562,
+      -0.66070104, -0.036533296, -0.54901814, 0.65353882,  -0.41834265,
+      -0.28561389, 0.75655544,   -0.31149811, 0.62981737,  0.31829214,
+      -0.92734522, -0.48506218,  0.55651462,  0.25192821,  0.67220747,
+      -0.3836869,  -0.55798125,  -0.60395885, 0.22488403,  -0.78053463,
+      0.3492105,   0.56452453,   0.4389236,   -0.59929526, -0.19762468,
+      -0.36868393, -0.13198286,  -0.53800809, -0.22850353};
+
+  std::initializer_list<float> dnn_biases = {
+    0.29177809, -0.98799044, 0.065919638, 0.68781924};
+
+  rnn.SetFwWeights(weights);
+  rnn.SetBwWeights(weights);
+  rnn.SetFwBias(biases);
+  rnn.SetBwBias(biases);
+  rnn.SetFwRecurrentWeights(recurrent_weights);
+  rnn.SetBwRecurrentWeights(recurrent_weights);
+
+  rnn.ResetHiddenStates();
+
+  const int input_sequence_size = rnn.input_size() * rnn.sequence_len();
+  const int output_sequence_size = output_size * rnn.sequence_len();
+  const int num_examples = 64;
+  for (int k = 0; k < num_examples; k++) {
+    float* batch_start = endtoend_input + k * input_sequence_size;
+    float* batch_end = batch_start + input_sequence_size;
+    rnn.SetInput(0, batch_start, batch_end);
+
+    rnn.Invoke();
+
+    std::vector<float> fw_output = rnn.GetFwOutput();
+    std::vector<float> bw_output = rnn.GetBwOutput();
+    EXPECT_EQ(fw_output.size(), bw_output.size());
+
+    std::transform(fw_output.begin(), fw_output.end(), bw_output.begin(),
+                   fw_output.begin(), std::plus<float>());
+
+    std::vector<float> sequence_result;
+    for (int s = 0; s < rnn.sequence_len(); s++) {
+      const float* rnn_output = fw_output.data() + s * rnn.num_fw_units();
+      std::vector<float> results(dnn_biases);
+      for (int i = 0; i < output_size; i++) {
+        for (int j = 0; j < rnn.num_fw_units(); j++) {
+          results[i] += *(rnn_output + j) * dnn_weights[output_size * j + i];
+        }
+      }
+      sequence_result.insert(sequence_result.end(), results.begin(),
+                             results.end());
+    }
+
+    float* golden_start = golden_endtoend_output + k * output_sequence_size;
+    float* golden_end = golden_start + output_sequence_size;
+
+    std::vector<float> expected;
+    expected.insert(expected.end(), golden_start, golden_end);
+    EXPECT_THAT(sequence_result, ElementsAreArray(ArrayFloatNear(expected)));
+  }
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  // On Linux, add: tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/concatenation.cc b/tensorflow/contrib/lite/kernels/concatenation.cc
index 9e7a1233dac0f3cd02dc386f9d194597f38ca3b8..a619ada86af64c299f8e518a7493db20f1011a50 100644
--- a/tensorflow/contrib/lite/kernels/concatenation.cc
+++ b/tensorflow/contrib/lite/kernels/concatenation.cc
@@ -49,6 +49,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // dimensions except 'axis' must be equal.
   TfLiteTensor* t0 = &context->tensors[node->inputs->data[0]];
   TfLiteType input_type = t0->type;
+  if (axis < 0) axis += t0->dims->size;
   TF_LITE_ENSURE(context, axis >= 0);
   TF_LITE_ENSURE(context, axis < t0->dims->size);
 
@@ -95,53 +96,22 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   return context->ResizeTensor(context, output, output_size);
 }
 
-template <typename T>
-class VectorOfInputs {
- public:
-  VectorOfInputs(const TfLiteContext& context, const TfLiteIntArray& inputs) {
-    int num_inputs = inputs.size;
-
-    all_data_.reserve(num_inputs);
-    all_dims_.reserve(num_inputs);
-    all_dims_ptr_.reserve(num_inputs);
-
-    for (int i = 0; i < num_inputs; ++i) {
-      TfLiteTensor* input = &context.tensors[inputs.data[i]];
-      all_data_.push_back(GetTensorData<T>(input));
-      all_dims_.push_back(GetTensorDims(input));
-    }
-
-    // Taking the pointer from inside a std::vector is only OK if the vector is
-    // never modified, so we populate all_dims in the previous loop and then we
-    // are free to grab iterators here.
-    for (int i = 0; i < num_inputs; ++i) {
-      all_dims_ptr_.push_back(&all_dims_[i]);
-    }
-  }
-  const T* const* data() const { return all_data_.data(); }
-  const Dims<4>* const* dims() const { return all_dims_ptr_.data(); }
-
- private:
-  std::vector<T*> all_data_;
-  std::vector<Dims<4>> all_dims_;
-  std::vector<Dims<4>*> all_dims_ptr_;
-};
-
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params =
       reinterpret_cast<TfLiteConcatenationParams*>(node->builtin_data);
-
+  int axis = params->axis;
   TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
+  if (axis < 0) axis += output->dims->size;
 
 // TODO(ahentz): Creating 'all_inputs' below is not very efficient. We should
 // allocate and populate these during Prepare().
 // TODO(ycling): Activation function parameter is ignored. For now we dont have
 // a model with a Concatenation with fused activation function.
 #define TF_LITE_CONCATENATION(type, scalar)                                 \
-  VectorOfInputs<scalar> all_inputs(*context, *node->inputs);               \
+  VectorOfTensors<scalar> all_inputs(*context, *node->inputs);              \
   type::Concatenation<FusedActivationFunctionType::kNone, scalar>(          \
-      RemapDim(NumDimensions(output), params->axis), all_inputs.data(),     \
+      RemapDim(NumDimensions(output), axis), all_inputs.data(),             \
       all_inputs.dims(), node->inputs->size, GetTensorData<scalar>(output), \
       GetTensorDims(output))
 
diff --git a/tensorflow/contrib/lite/kernels/concatenation_test.cc b/tensorflow/contrib/lite/kernels/concatenation_test.cc
index 94e5b2acdcabeedb4652baa1a008b22bf6bc8433..ba1ffc5f8423b9626c9c8e2a1086ea0dcca43f50 100644
--- a/tensorflow/contrib/lite/kernels/concatenation_test.cc
+++ b/tensorflow/contrib/lite/kernels/concatenation_test.cc
@@ -94,7 +94,7 @@ TEST(ConcatenationOpTest, TwoDimensionalOneInput) {
   EXPECT_THAT(m0.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
 }
 
-TEST(ConcatenationOpTest, TwoInputsTwoAxis) {
+TEST(ConcatenationOpTest, TwoInputsTwoAxesNegativeAxes) {
   // We will concatenate two tensors along different dimensions.
   auto tensor0 = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
   auto tensor1 = {7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f};
@@ -107,6 +107,14 @@ TEST(ConcatenationOpTest, TwoInputsTwoAxis) {
   EXPECT_THAT(m0.GetOutput(),
               ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}));
 
+  ConcatenationOpModel m0_negative({TensorType_FLOAT32, {2, 3}}, /*axis=*/-2,
+                                   /*num_inputs=*/2);
+  m0_negative.SetInput(0, tensor0);
+  m0_negative.SetInput(1, tensor1);
+  m0_negative.Invoke();
+  EXPECT_THAT(m0_negative.GetOutput(),
+              ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}));
+
   ConcatenationOpModel m1({TensorType_FLOAT32, {2, 3}}, /*axis=*/1,
                           /*num_inputs=*/2);
   m1.SetInput(0, tensor0);
@@ -114,6 +122,14 @@ TEST(ConcatenationOpTest, TwoInputsTwoAxis) {
   m1.Invoke();
   EXPECT_THAT(m1.GetOutput(),
               ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
+
+  ConcatenationOpModel m1_negative({TensorType_FLOAT32, {2, 3}}, /*axis=*/-1,
+                                   /*num_inputs=*/2);
+  m1_negative.SetInput(0, tensor0);
+  m1_negative.SetInput(1, tensor1);
+  m1_negative.Invoke();
+  EXPECT_THAT(m1_negative.GetOutput(),
+              ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
 }
 
 TEST(ConcatenationOpTest, FourInputs) {
@@ -156,7 +172,7 @@ TEST(ConcatenationOpTest, FourInputsQuantized) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
index c75c04baeac2ce53c6261d677dca8d72fafa0da5..66d2c04bba4a164bbcdcf4b1a097d9aac0b3aeeb 100644
--- a/tensorflow/contrib/lite/kernels/conv.cc
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/kernels/gemm_support.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h"
 #include "tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h"
 #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
@@ -38,11 +39,16 @@ namespace ops {
 namespace builtin {
 namespace conv {
 
-// This file has three implementation of Conv.
+// This file has 4 implementation of Conv.
 enum KernelType {
   kReference,
   kGenericOptimized,  // Neon-free
-  kNeonOptimized,
+  kMultithreadOptimized,
+  // The kernel uses use CBLAS interface for matrix multiplication.
+  // It's fast when an optimized CBLAS implementation is available (e.g. Apple
+  // Accelerate Framework), and it's slow when falling back to naive
+  // implementation.
+  kCblasOptimized,
 };
 
 struct OpData {
@@ -265,10 +271,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
       free(hwcn_weights->data.raw);
       hwcn_weights->data.raw = nullptr;
     }
+
+    // Note that hwcn_weights_status is a kTfLiteDynamic tensor, and
+    // ResizeTensor will actually allocate space for it. The would be more
+    // efficient if we placed hwcn_weights_status in the persistent arena.
     auto hwcn_weights_status =
         context->ResizeTensor(context, hwcn_weights, hwcn_weights_size);
     if (hwcn_weights_status != kTfLiteOk) return hwcn_weights_status;
-    hwcn_weights->data.raw = static_cast<char*>(malloc(hwcn_weights->bytes));
 
     // TODO(petewarden): If Resize() is called when the size hasn't actually
     // changed, this will do extra redundant work.
@@ -290,26 +299,34 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   auto filter_offset = -filter->params.zero_point;
   auto output_offset = output->params.zero_point;
 
-  if (kernel_type == kReference) {
-    reference_ops::Conv(
-        GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
-        GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
-        GetTensorData<int32_t>(bias), GetTensorDims(bias), params->stride_width,
-        params->stride_height, data->padding.width, data->padding.height,
-        output_offset, data->output_multiplier, data->output_shift,
-        data->output_activation_min, data->output_activation_max,
-        GetTensorData<uint8_t>(output), GetTensorDims(output),
-        GetTensorData<uint8_t>(im2col), GetTensorDims(im2col), gemm_context);
-  } else {
-    optimized_ops::Conv(
-        GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
-        GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
-        GetTensorData<int32_t>(bias), GetTensorDims(bias), params->stride_width,
-        params->stride_height, data->padding.width, data->padding.height,
-        output_offset, data->output_multiplier, data->output_shift,
-        data->output_activation_min, data->output_activation_max,
-        GetTensorData<uint8_t>(output), GetTensorDims(output),
-        GetTensorData<uint8_t>(im2col), GetTensorDims(im2col), gemm_context);
+  switch (kernel_type) {
+    case kReference:
+      reference_ops::Conv(
+          GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
+          GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
+          GetTensorData<int32_t>(bias), GetTensorDims(bias),
+          params->stride_width, params->stride_height, data->padding.width,
+          data->padding.height, output_offset, data->output_multiplier,
+          data->output_shift, data->output_activation_min,
+          data->output_activation_max, GetTensorData<uint8_t>(output),
+          GetTensorDims(output), GetTensorData<uint8_t>(im2col),
+          GetTensorDims(im2col), gemm_context);
+      break;
+    case kGenericOptimized:
+    case kMultithreadOptimized:
+    case kCblasOptimized:
+      // There is only one optimized implementation for Quantized Conv.
+      optimized_ops::Conv(
+          GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
+          GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
+          GetTensorData<int32_t>(bias), GetTensorDims(bias),
+          params->stride_width, params->stride_height, data->padding.width,
+          data->padding.height, output_offset, data->output_multiplier,
+          data->output_shift, data->output_activation_min,
+          data->output_activation_max, GetTensorData<uint8_t>(output),
+          GetTensorDims(output), GetTensorData<uint8_t>(im2col),
+          GetTensorDims(im2col), gemm_context);
+      break;
   }
 }
 
@@ -322,30 +339,57 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                 &output_activation_max);
 
-  const float* filter_data;
-  if (data->need_hwcn_weights) {
-    filter_data = GetTensorData<float>(hwcn_weights);
-  } else {
-    filter_data = GetTensorData<float>(filter);
-  }
-
-  if (kernel_type == kReference) {
-    reference_ops::Conv(
-        GetTensorData<float>(input), GetTensorDims(input), filter_data,
-        GetTensorDims(filter), GetTensorData<float>(bias), GetTensorDims(bias),
-        params->stride_width, params->stride_height, data->padding.width,
-        data->padding.height, output_activation_min, output_activation_max,
-        GetTensorData<float>(output), GetTensorDims(output),
-        GetTensorData<float>(im2col), GetTensorDims(im2col));
-  } else {
-    multithreaded_ops::Conv(
-        GetTensorData<float>(input), GetTensorDims(input), filter_data,
-        GetTensorDims(filter), GetTensorData<float>(bias), GetTensorDims(bias),
-        params->stride_width, params->stride_height, data->padding.width,
-        data->padding.height, params->padding, output_activation_min,
-        output_activation_max, GetTensorData<float>(output),
-        GetTensorDims(output), GetTensorData<float>(im2col),
-        GetTensorDims(im2col));
+  switch (kernel_type) {
+    case kReference: {
+      reference_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
+                          GetTensorData<float>(filter), GetTensorDims(filter),
+                          GetTensorData<float>(bias), GetTensorDims(bias),
+                          params->stride_width, params->stride_height,
+                          data->padding.width, data->padding.height,
+                          output_activation_min, output_activation_max,
+                          GetTensorData<float>(output), GetTensorDims(output),
+                          GetTensorData<float>(im2col), GetTensorDims(im2col));
+      break;
+    }
+    case kGenericOptimized: {
+      optimized_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
+                          GetTensorData<float>(filter), GetTensorDims(filter),
+                          GetTensorData<float>(bias), GetTensorDims(bias),
+                          params->stride_width, params->stride_height,
+                          data->padding.width, data->padding.height,
+                          output_activation_min, output_activation_max,
+                          GetTensorData<float>(output), GetTensorDims(output),
+                          GetTensorData<float>(im2col), GetTensorDims(im2col));
+      break;
+    }
+    case kMultithreadOptimized: {
+      const float* filter_data;
+      if (data->need_hwcn_weights) {
+        filter_data = GetTensorData<float>(hwcn_weights);
+      } else {
+        filter_data = GetTensorData<float>(filter);
+      }
+      multithreaded_ops::Conv(
+          GetTensorData<float>(input), GetTensorDims(input), filter_data,
+          GetTensorDims(filter), GetTensorData<float>(bias),
+          GetTensorDims(bias), params->stride_width, params->stride_height,
+          data->padding.width, data->padding.height, params->padding,
+          output_activation_min, output_activation_max,
+          GetTensorData<float>(output), GetTensorDims(output),
+          GetTensorData<float>(im2col), GetTensorDims(im2col));
+      break;
+    }
+    case kCblasOptimized: {
+      cblas_ops::Conv(GetTensorData<float>(input), GetTensorDims(input),
+                      GetTensorData<float>(filter), GetTensorDims(filter),
+                      GetTensorData<float>(bias), GetTensorDims(bias),
+                      params->stride_width, params->stride_height,
+                      data->padding.width, data->padding.height,
+                      output_activation_min, output_activation_max,
+                      GetTensorData<float>(output), GetTensorDims(output),
+                      GetTensorData<float>(im2col), GetTensorDims(im2col));
+      break;
+    }
   }
 }
 
@@ -406,17 +450,23 @@ TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT() {
   return &r;
 }
 
-TfLiteRegistration* Register_CONVOLUTION_NEON_OPT() {
+TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT() {
+  static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
+                                 conv::Eval<conv::kMultithreadOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_CONVOLUTION_CBLAS_OPT() {
   static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
-                                 conv::Eval<conv::kNeonOptimized>};
+                                 conv::Eval<conv::kCblasOptimized>};
   return &r;
 }
 
 TfLiteRegistration* Register_CONV_2D() {
-#ifdef USE_NEON
-  return Register_CONVOLUTION_NEON_OPT();
+#ifdef TFLITE_USE_APPLE_ACCELERATE_FOR_CONV
+  return Register_CONVOLUTION_CBLAS_OPT();
 #else
-  return Register_CONVOLUTION_GENERIC_OPT();
+  return Register_CONVOLUTION_MULTITHREADED_OPT();
 #endif
 }
 
diff --git a/tensorflow/contrib/lite/kernels/conv_test.cc b/tensorflow/contrib/lite/kernels/conv_test.cc
index 18d7a31d594efb6a05fe7292a0194ea17599a65b..d2393c3c97bb9516e2b8a6c8ae037dc0dfdfe64b 100644
--- a/tensorflow/contrib/lite/kernels/conv_test.cc
+++ b/tensorflow/contrib/lite/kernels/conv_test.cc
@@ -15,12 +15,25 @@ limitations under the License.
 #include <cstdarg>
 
 #include <gtest/gtest.h>
+#include "absl/memory/memory.h"
 #include "tensorflow/contrib/lite/interpreter.h"
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/kernels/test_util.h"
 #include "tensorflow/contrib/lite/model.h"
 
 namespace tflite {
+
+namespace ops {
+namespace builtin {
+
+TfLiteRegistration* Register_CONVOLUTION_REF();
+TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT();
+TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT();
+TfLiteRegistration* Register_CONVOLUTION_CBLAS_OPT();
+
+}  // namespace builtin
+}  // namespace ops
+
 namespace {
 
 using ::testing::ElementsAreArray;
@@ -30,9 +43,9 @@ class BaseConvolutionOpModel : public SingleOpModel {
   // TODO(ahentz): Also test different activation types, bias, padding types,
   // stride values.
   BaseConvolutionOpModel(
-      const TensorData& input, const TensorData& filter,
-      const TensorData& output, int stride_width = 2, int stride_height = 2,
-      enum Padding padding = Padding_VALID,
+      TfLiteRegistration* registration, const TensorData& input,
+      const TensorData& filter, const TensorData& output, int stride_width = 2,
+      int stride_height = 2, enum Padding padding = Padding_VALID,
       enum ActivationFunctionType activation = ActivationFunctionType_NONE) {
     input_ = AddInput(input);
     filter_ = AddInput(filter);
@@ -62,6 +75,8 @@ class BaseConvolutionOpModel : public SingleOpModel {
                                      stride_height, activation)
                      .Union());
 
+    resolver_ = absl::make_unique<SingleOpResolver>(BuiltinOperator_CONV_2D,
+                                                    registration);
     BuildInterpreter({GetShape(input_), GetShape(filter_), GetShape(bias_)});
   }
 
@@ -83,12 +98,26 @@ class ConvolutionOpModel : public BaseConvolutionOpModel {
   void SetInput(std::initializer_list<float> data) {
     PopulateTensor(input_, data);
   }
-
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
 };
 
-TEST(ConvolutionOpTest, SimpleTestFloat32) {
-  ConvolutionOpModel m({TensorType_FLOAT32, {2, 2, 4, 1}},
+const auto kKernelMap = new std::map<string, TfLiteRegistration*>({
+    {"Reference", ops::builtin::Register_CONVOLUTION_REF()},
+    {"GenericOptimized", ops::builtin::Register_CONVOLUTION_GENERIC_OPT()},
+    {"MultithreadedOptimized",
+     ops::builtin::Register_CONVOLUTION_MULTITHREADED_OPT()},
+    {"CblasOptimized", ops::builtin::Register_CONVOLUTION_CBLAS_OPT()},
+});
+
+class ConvolutionOpTest : public SingleOpTest {
+ protected:
+  const std::map<string, TfLiteRegistration*>& GetKernelMap() override {
+    return *kKernelMap;
+  }
+};
+
+TEST_P(ConvolutionOpTest, SimpleTestFloat32) {
+  ConvolutionOpModel m(GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 1}},
                        {TensorType_FLOAT32, {3, 2, 2, 1}},
                        {TensorType_FLOAT32, {}});
 
@@ -117,8 +146,8 @@ TEST(ConvolutionOpTest, SimpleTestFloat32) {
                              }));
 }
 
-TEST(ConvolutionOpTest, SimpleTestFloat32WithAnisotropicStrides) {
-  ConvolutionOpModel m({TensorType_FLOAT32, {1, 3, 6, 1}},
+TEST_P(ConvolutionOpTest, SimpleTestFloat32WithAnisotropicStrides) {
+  ConvolutionOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 3, 6, 1}},
                        {TensorType_FLOAT32, {1, 2, 2, 1}},
                        {TensorType_FLOAT32, {}},
                        /*stride_width=*/3, /*stride_height=*/1);
@@ -139,7 +168,7 @@ TEST(ConvolutionOpTest, SimpleTestFloat32WithAnisotropicStrides) {
                              }));
 }
 
-TEST(ConvolutionOpTest, HandCalculatedFloat32) {
+TEST_P(ConvolutionOpTest, HandCalculatedFloat32) {
   const int depth = 1;
   const int image_width = 4;
   const int image_height = 3;
@@ -150,6 +179,7 @@ TEST(ConvolutionOpTest, HandCalculatedFloat32) {
   const int stride_height = 1;
   const Padding padding = Padding_SAME;
   ConvolutionOpModel m(
+      GetRegistration(),
       {TensorType_FLOAT32,
        {image_batch_count, image_height, image_width, depth}},
       {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
@@ -192,7 +222,7 @@ TEST(ConvolutionOpTest, HandCalculatedFloat32) {
                                                178, 187, 234, 261, 121}));
 }
 
-TEST(ConvolutionOpTest, HandCalculatedWithBiasFloat32) {
+TEST_P(ConvolutionOpTest, HandCalculatedWithBiasFloat32) {
   const int depth = 1;
   const int image_width = 4;
   const int image_height = 3;
@@ -203,6 +233,7 @@ TEST(ConvolutionOpTest, HandCalculatedWithBiasFloat32) {
   const int stride_height = 1;
   const Padding padding = Padding_SAME;
   ConvolutionOpModel m(
+      GetRegistration(),
       {TensorType_FLOAT32,
        {image_batch_count, image_height, image_width, depth}},
       {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
@@ -245,7 +276,7 @@ TEST(ConvolutionOpTest, HandCalculatedWithBiasFloat32) {
                                                367, 188, 197, 244, 271, 131}));
 }
 
-TEST(ConvolutionOpTest, HandCalculatedWithReluFloat32) {
+TEST_P(ConvolutionOpTest, HandCalculatedWithReluFloat32) {
   const int depth = 1;
   const int image_width = 4;
   const int image_height = 3;
@@ -256,6 +287,7 @@ TEST(ConvolutionOpTest, HandCalculatedWithReluFloat32) {
   const int stride_height = 1;
   const Padding padding = Padding_SAME;
   ConvolutionOpModel m(
+      GetRegistration(),
       {TensorType_FLOAT32,
        {image_batch_count, image_height, image_width, depth}},
       {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
@@ -300,7 +332,7 @@ TEST(ConvolutionOpTest, HandCalculatedWithReluFloat32) {
               ElementsAreArray({0, 0, 0, 0, 35, 112, 157, 0, 0, 34, 61, 0}));
 }
 
-TEST(ConvolutionOpTest, HandCalculatedValidFloat32) {
+TEST_P(ConvolutionOpTest, HandCalculatedValidFloat32) {
   const int depth = 1;
   const int image_width = 4;
   const int image_height = 3;
@@ -311,6 +343,7 @@ TEST(ConvolutionOpTest, HandCalculatedValidFloat32) {
   const int stride_height = 1;
   const Padding padding = Padding_VALID;
   ConvolutionOpModel m(
+      GetRegistration(),
       {TensorType_FLOAT32,
        {image_batch_count, image_height, image_width, depth}},
       {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
@@ -366,8 +399,9 @@ class QuantizedConvolutionOpModel : public BaseConvolutionOpModel {
 
 // In this tests we set the input and output scales so that the results
 // match exactly the 'non-quantized' version.
-TEST(ConvolutionOpTest, SimpleTestQuantized) {
-  QuantizedConvolutionOpModel m({TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64},
+TEST_P(ConvolutionOpTest, SimpleTestQuantized) {
+  QuantizedConvolutionOpModel m(GetRegistration(),
+                                {TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64},
                                 {TensorType_UINT8, {3, 2, 2, 1}, -63.5, 64},
                                 {TensorType_UINT8, {}, -127, 128});
   m.SetInput({
@@ -405,8 +439,9 @@ TEST(ConvolutionOpTest, SimpleTestQuantized) {
                              }));
 }
 
-TEST(ConvolutionOpTest, SimpleTestQuantizedWithAnisotropicStrides) {
-  QuantizedConvolutionOpModel m({TensorType_UINT8, {1, 3, 6, 1}, -63.5, 64},
+TEST_P(ConvolutionOpTest, SimpleTestQuantizedWithAnisotropicStrides) {
+  QuantizedConvolutionOpModel m(GetRegistration(),
+                                {TensorType_UINT8, {1, 3, 6, 1}, -63.5, 64},
                                 {TensorType_UINT8, {1, 2, 2, 1}, -63.5, 64},
                                 {TensorType_UINT8, {}, -127, 128},
                                 /*stride_width=*/3, /*stride_height=*/1);
@@ -430,11 +465,16 @@ TEST(ConvolutionOpTest, SimpleTestQuantizedWithAnisotropicStrides) {
                                  167, 93,   //
                              }));
 }
+
+INSTANTIATE_TEST_CASE_P(
+    ConvolutionOpTest, ConvolutionOpTest,
+    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
+
 }  // namespace
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc b/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc
index 39227b2811e2be719a0be77f89793bcf9366d513..1439c8bce14ad127ed68dc54991aed8b8bb39383 100644
--- a/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc
+++ b/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc
@@ -180,7 +180,7 @@ TEST(QuantizedDepthwiseConvolutionOpTest, SimpleTestQuantized) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/div.cc b/tensorflow/contrib/lite/kernels/div.cc
new file mode 100644
index 0000000000000000000000000000000000000000..44bd0dc85d50c98ec6b6888e05064a8f2e2731c0
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/div.cc
@@ -0,0 +1,129 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace div {
+
+// This file has three implementation of Div.
+enum KernelType {
+  kReference,
+  kGenericOptimized,  // Neon-free
+  kNeonOptimized,
+};
+
+constexpr int kInputTensor1 = 0;
+constexpr int kInputTensor2 = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input1), NumDimensions(input2));
+  for (int i = 0; i < NumDimensions(input1); ++i) {
+    TF_LITE_ENSURE_EQ(context, SizeOfDimension(input1, i),
+                      SizeOfDimension(input2, i));
+  }
+
+  TF_LITE_ENSURE_EQ(context, input1->type, output->type);
+  TF_LITE_ENSURE_EQ(context, input2->type, output->type);
+
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input1->dims);
+  return context->ResizeTensor(context, output, output_size);
+}
+
+template <KernelType kernel_type>
+void EvalDivFloat(TfLiteContext* context, TfLiteNode* node,
+                  TfLiteDivParams* params, TfLiteTensor* input1,
+                  TfLiteTensor* input2, TfLiteTensor* output) {
+  float output_activation_min, output_activation_max;
+  CalculateActivationRangeFloat(params->activation, &output_activation_min,
+                                &output_activation_max);
+#define TF_LITE_DIV(type)                                        \
+  type::Div(GetTensorData<float>(input1), GetTensorDims(input1), \
+            GetTensorData<float>(input2), GetTensorDims(input2), \
+            output_activation_min, output_activation_max,        \
+            GetTensorData<float>(output), GetTensorDims(output))
+  if (kernel_type == kReference) {
+    TF_LITE_DIV(reference_ops);
+  } else {
+    TF_LITE_DIV(optimized_ops);
+  }
+#undef TF_LITE_DIV
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteDivParams*>(node->builtin_data);
+
+  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (output->type == kTfLiteFloat32) {
+    EvalDivFloat<kernel_type>(context, node, params, input1, input2, output);
+  } else {
+    context->ReportError(context, "Inputs and outputs not all float types.");
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace div
+
+TfLiteRegistration* Register_DIV_REF() {
+  static TfLiteRegistration r = {nullptr, nullptr, div::Prepare,
+                                 div::Eval<div::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_DIV_GENERIC_OPT() {
+  static TfLiteRegistration r = {nullptr, nullptr, div::Prepare,
+                                 div::Eval<div::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_DIV_NEON_OPT() {
+  static TfLiteRegistration r = {nullptr, nullptr, div::Prepare,
+                                 div::Eval<div::kNeonOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_DIV() {
+#ifdef USE_NEON
+  return Register_DIV_NEON_OPT();
+#else
+  return Register_DIV_GENERIC_OPT();
+#endif
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc
index 69d9c5cc7dec13a65f1c5050f2f1c56812ad5aa1..ef2b5422253ea880a9ded4d3c0efc5cec07178a9 100644
--- a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc
+++ b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc
@@ -123,18 +123,16 @@ TEST(EmbeddingLookupOpTest, SimpleTestSqrtn) {
       [](int i, int j, int k) { return i + j / 10.0f + k / 100.0f; });
   m.Invoke();
 
-  EXPECT_THAT(
-      m.GetOutput(),
-      ElementsAreArray(ArrayFloatNear({
-          1.00, 1.01, 1.10, 1.11, 1.20, 1.21,  // Row 1
-          0.00, 0.00, 0.00, 0.00, 0.00, 0.00,  // -
-          6.00f / std::sqrt(20.0f), 6.06f / std::sqrt(20.0f),
-          6.60f / std::sqrt(20.0f), 6.66f / std::sqrt(20.0f),
-          7.20f / std::sqrt(20.0f),
-          7.26f /
-              std::sqrt(
-                  20.0f),  // 2 * Row 3 + 4 * Row 0,  // 2 * Row 3 + 4 * Row 0
-      })));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({
+                  1.00, 1.01, 1.10, 1.11, 1.20, 1.21,  // Row 1
+                  0.00, 0.00, 0.00, 0.00, 0.00, 0.00,  // -
+                  6.00f / std::sqrt(20.0f), 6.06f / std::sqrt(20.0f),
+                  6.60f / std::sqrt(20.0f), 6.66f / std::sqrt(20.0f),
+                  7.20f / std::sqrt(20.0f),
+                  7.26f / std::sqrt(20.0f),  // 2 * Row 3 + 4 * Row 0,  // 2 *
+                                             // Row 3 + 4 * Row 0
+              })));
 }
 
 TEST(EmbeddingLookupOpTest, Indices3DTest) {
@@ -158,9 +156,7 @@ TEST(EmbeddingLookupOpTest, Indices3DTest) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-#ifdef OS_LINUX
-  tflite::LogToStderr();
-#endif
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc b/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc
index 8c030b06772ac0c6af34a45897f03ebc4637d4de..9b501878f196216a61568bfa36e6615f4dd07478 100644
--- a/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc
+++ b/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc
@@ -88,7 +88,7 @@ TEST(EmbeddingLookupOpTest, SimpleTest) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/exp.cc b/tensorflow/contrib/lite/kernels/exp.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a9e79b742dc2c80ce4ed9a3aa786814265dcb660
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/exp.cc
@@ -0,0 +1,92 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include <vector>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace exp {
+
+// This file has reference implementation of Exp.
+enum KernelType {
+  kReference,
+};
+
+struct ExpContext {
+  ExpContext(TfLiteContext* context, TfLiteNode* node) {
+    input = GetInput(context, node, 0);
+    output = GetOutput(context, node, 0);
+  }
+  TfLiteTensor* input;
+  TfLiteTensor* output;
+};
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  ExpContext op_context(context, node);
+  TfLiteIntArray* output_dims = TfLiteIntArrayCopy(op_context.input->dims);
+  op_context.output->type = op_context.input->type;
+  return context->ResizeTensor(context, op_context.output, output_dims);
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  ExpContext op_context(context, node);
+
+#define TF_LITE_EXP(kernel_type, data_type)                               \
+  kernel_type::Exp<data_type>(GetTensorData<data_type>(op_context.input), \
+                              NumElements(op_context.input),              \
+                              GetTensorData<data_type>(op_context.output))
+
+  // TODO(kanlig): supports half, bfloat16, float64, complex64, and complex128.
+  if (kernel_type == kReference) {
+    switch (op_context.input->type) {
+      case kTfLiteFloat32:
+        TF_LITE_EXP(reference_ops, float);
+        break;
+      default:
+        context->ReportError(context,
+                             "Type %d is currently not supported by Exp.",
+                             op_context.input->type);
+        return kTfLiteError;
+    }
+  }
+#undef TF_LITE_EXP
+  return kTfLiteOk;
+}
+
+}  // namespace exp
+
+TfLiteRegistration* Register_EXP_REF() {
+  static TfLiteRegistration r = {nullptr, nullptr, exp::Prepare,
+                                 exp::Eval<exp::kReference>};
+  return &r;
+}
+
+// TODO(kanlig): add optimized implementation of Exp.
+TfLiteRegistration* Register_EXP() { return Register_EXP_REF(); }
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/exp_test.cc b/tensorflow/contrib/lite/kernels/exp_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eed67369a1f30e57cd29a3975a899db41938def0
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/exp_test.cc
@@ -0,0 +1,70 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class ExpOpModel : public SingleOpModel {
+ public:
+  ExpOpModel(const TensorData& input, const TensorType& output) {
+    input_ = AddInput(input);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_EXP, BuiltinOptions_ExpOptions,
+                 CreateExpOptions(builder_).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+
+  template <class T>
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor(input_, data);
+  }
+
+  template <class T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input_;
+  int output_;
+};
+
+TEST(ExpOpTest, FloatTest) {
+  std::initializer_list<float> data = {1.0, 0.0, -1.0, 1.0, 1.0, -1.0};
+  ExpOpModel m({TensorType_FLOAT32, {3, 1, 2}}, TensorType_FLOAT32);
+  m.SetInput<float>(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1, 2}));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {2.71828, 1, 0.367879, 2.71828, 2.71828, 0.367879})));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/fully_connected_test.cc b/tensorflow/contrib/lite/kernels/fully_connected_test.cc
index 112e3f1ba01a428023eea5ee8410fb76c1d67de6..a0f766c4f4580d7679275c0b63aa200410fcb5ad 100644
--- a/tensorflow/contrib/lite/kernels/fully_connected_test.cc
+++ b/tensorflow/contrib/lite/kernels/fully_connected_test.cc
@@ -370,8 +370,7 @@ TEST(FullyConnectedOpTest, BlackBoxTest) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
-  tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/gather.cc b/tensorflow/contrib/lite/kernels/gather.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0e4187d1eac64636a2e2b25e9a1cc45c3a4da557
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/gather.cc
@@ -0,0 +1,131 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/contrib/lite/string_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace gather {
+constexpr int kInputTensor = 0;
+constexpr int kInputPositions = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const auto* params =
+      reinterpret_cast<const TfLiteGatherParams*>(node->builtin_data);
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* positions = GetInput(context, node, kInputPositions);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  // Only INT32 positions are supported.
+  TF_LITE_ENSURE_EQ(context, positions->type, kTfLiteInt32);
+  // Check that input and output types match.
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+  // TODO(mgubin): only 0D or 1D positions are currently supported.
+  TF_LITE_ENSURE(context, NumDimensions(positions) <= 1);
+  // TODO(mgubin): Only default axis == 0 is supported.
+  TF_LITE_ENSURE_EQ(context, params->axis, 0);
+  // Check conditions for different types.
+  switch (input->type) {
+    case kTfLiteFloat32:
+    case kTfLiteUInt8:
+    case kTfLiteInt32: {
+      // Fully supported by reference_ops::Gather.
+    } break;
+
+    case kTfLiteString: {
+      // Only 1D input is supported.
+      TF_LITE_ENSURE_EQ(context, NumDimensions(input), 1);
+    } break;
+    default:
+      context->ReportError(context,
+                           "Only float32 and string types are supported");
+      return kTfLiteError;
+  }
+  const int num_dimensions =
+      NumDimensions(input) + NumDimensions(positions) - 1;
+  TF_LITE_ENSURE(context, params->axis <= num_dimensions);
+  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(num_dimensions);
+  int output_index = 0;
+  for (int i = 0; i < params->axis; ++i) {
+    output_shape->data[output_index++] = input->dims->data[i];
+  }
+  for (int i = 0; i < positions->dims->size; ++i) {
+    output_shape->data[output_index++] = positions->dims->data[i];
+  }
+  for (int i = params->axis + 1; i < input->dims->size; ++i) {
+    output_shape->data[output_index++] = input->dims->data[i];
+  }
+  return context->ResizeTensor(context, output, output_shape);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* positions = GetInput(context, node, kInputPositions);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  const int input_rank = NumDimensions(input);
+#define TF_LITE_GATHER(data_type, index_type)                            \
+  optimized_ops::Gather(                                                 \
+      GetTensorData<data_type>(input), GetTensorDims(input), input_rank, \
+      GetTensorData<index_type>(positions), GetTensorDims(positions),    \
+      GetTensorData<data_type>(output), GetTensorDims(output));
+  switch (input->type) {
+    case kTfLiteFloat32:
+      TF_LITE_GATHER(float, int32_t);
+      break;
+    case kTfLiteUInt8:
+      TF_LITE_GATHER(uint8_t, int32_t);
+      break;
+    case kTfLiteInt32:
+      TF_LITE_GATHER(int32_t, int32_t);
+      break;
+    case kTfLiteString: {
+      DynamicBuffer buffer;
+      const int32* indexes = positions->data.i32;
+      const int num_strings = GetStringCount(input);
+      for (int i = 0; i < positions->dims->data[0]; ++i) {
+        const int pos = indexes[i];
+        TF_LITE_ENSURE(context, pos < num_strings);
+        const auto string_ref = GetString(input, pos);
+        buffer.AddString(string_ref.str, string_ref.len);
+      }
+      buffer.WriteToTensor(output);
+    } break;
+    default:
+      return kTfLiteError;
+  }
+#undef TF_LITE_GATHER
+  return kTfLiteOk;
+}
+}  // namespace gather
+
+TfLiteRegistration* Register_GATHER() {
+  static TfLiteRegistration r = {nullptr, nullptr, gather::Prepare,
+                                 gather::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/gather_test.cc b/tensorflow/contrib/lite/kernels/gather_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cdadbeda1884ba0186846826dd16be6ff69878d9
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/gather_test.cc
@@ -0,0 +1,141 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class GatherOpModel : public SingleOpModel {
+ public:
+  GatherOpModel(std::initializer_list<int> input_shape, TensorType input_type,
+                std::initializer_list<int> positions_shape) {
+    input_ = AddInput(input_type);
+    positions_ = AddInput(TensorType_INT32);
+    output_ = AddOutput(input_type);
+    SetBuiltinOp(BuiltinOperator_GATHER, BuiltinOptions_GatherOptions,
+                 CreateGatherOptions(builder_, 0).Union());
+    BuildInterpreter({input_shape, positions_shape});
+  }
+
+  void SetInputFloat(std::initializer_list<float> data) {
+    PopulateTensor<float>(input_, data);
+  }
+
+  void SetInputUint8(std::initializer_list<uint8_t> data) {
+    PopulateTensor<uint8_t>(input_, data);
+  }
+
+  void SetInput(std::initializer_list<string> data) {
+    PopulateStringTensor(input_, data);
+  }
+
+  void SetPositions(std::initializer_list<int> data) {
+    PopulateTensor<int>(positions_, data);
+  }
+
+  std::vector<float> GetOutputFloat() { return ExtractVector<float>(output_); }
+  std::vector<uint8_t> GetOutputUint8() {
+    return ExtractVector<uint8_t>(output_);
+  }
+  std::vector<string> GetOutputString() {
+    return ExtractVector<string>(output_);
+  }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input_;
+  int positions_;
+  int output_;
+};
+
+TEST(GatherOpTest, Shuffle) {
+  GatherOpModel m({2, 2}, TensorType_FLOAT32, {2});
+  m.SetInputFloat({-2.0, 0.2, 0.7, 0.8});
+  m.SetPositions({1, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputFloat(),
+              ElementsAreArray(ArrayFloatNear({0.7, 0.8, -2, 0.2})));
+}
+
+TEST(GatherOpTest, Test0DIndex) {
+  GatherOpModel m({2, 2}, TensorType_FLOAT32, {});
+  m.SetInputFloat({-2.0, 0.2, 0.7, 0.8});
+  m.SetPositions({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray(ArrayFloatNear({0.7, 0.8})));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+}
+
+TEST(GatherOpTest, Test0DIndexWith0DResult) {
+  // 0D tensor is special case in current TFLite. Test it once to make sure
+  // existing workarounds are fine with it.
+  GatherOpModel m({3}, TensorType_FLOAT32, {});
+  m.SetInputFloat({1.0, 2.0, 3.0});
+  m.SetPositions({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray(ArrayFloatNear({2.0})));
+  EXPECT_TRUE(m.GetOutputShape().empty());
+}
+
+TEST(FloatGatherOpTest, Duplicate) {
+  GatherOpModel m({1, 2, 2}, TensorType_FLOAT32, {2});
+  m.SetInputFloat({-2.0, 0.2, 0.7, 0.8});
+  m.SetPositions({0, 0});
+  m.Invoke();
+  EXPECT_THAT(
+      m.GetOutputFloat(),
+      ElementsAreArray(ArrayFloatNear({-2, 0.2, 0.7, 0.8, -2, 0.2, 0.7, 0.8})));
+}
+
+TEST(FloatGatherOpTest, Slice) {
+  GatherOpModel m({4, 1}, TensorType_FLOAT32, {2});
+  m.SetInputFloat({-2.0, 0.2, 0.7, 0.8});
+  m.SetPositions({1, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray(ArrayFloatNear({0.2, 0.8})));
+}
+
+TEST(Uint8tGatherOpTest, Shuffle) {
+  GatherOpModel m({2, 2}, TensorType_UINT8, {2});
+  m.SetInputUint8({133, 134, 14, 15});
+  m.SetPositions({1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutputUint8(), ElementsAreArray({14, 15, 133, 134}));
+}
+
+TEST(GatherOpTest, SimpleString) {
+  GatherOpModel m({3}, TensorType_STRING, {2});
+  m.SetInput({"A", "B", "C"});
+  m.SetPositions({0, 2});
+  m.Invoke();
+  ASSERT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutputString(), ElementsAreArray({"A", "C"}));
+}
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/gemm_support.h b/tensorflow/contrib/lite/kernels/gemm_support.h
index b531959ffb143c774ee715743480b03ebfbdc114..466781cbcecc7fb851d9078c450cc6c12364d2bb 100644
--- a/tensorflow/contrib/lite/kernels/gemm_support.h
+++ b/tensorflow/contrib/lite/kernels/gemm_support.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_GEMM_SUPPORT_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_GEMM_SUPPORT_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_GEMM_SUPPORT_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_GEMM_SUPPORT_H_
 
 #include "public/gemmlowp.h"
 #include "tensorflow/contrib/lite/context.h"
@@ -51,4 +51,4 @@ void SetMaxNumThreads(TfLiteContext* context, int num_threads);
 }  // namespace gemm_support
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_GEMM_SUPPORT_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_GEMM_SUPPORT_H_
diff --git a/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc b/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc
index 916a23225e2ad3c5645a7809169677a7a8880535..ba0ed5ce06392613238b757308dddc2b22e7eb30 100644
--- a/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc
+++ b/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc
@@ -116,7 +116,10 @@ TEST(HashtableLookupOpTest, Test2DInput) {
                                  1.0, 1.1,  // 1-st item
                              })));
   EXPECT_THAT(m.GetHit(), ElementsAreArray({
-                              1, 0, 1, 1,
+                              1,
+                              0,
+                              1,
+                              1,
                           }));
 }
 
@@ -170,7 +173,7 @@ TEST(HashtableLookupOpTest, TestString) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index 288534099b9e090ce0c223a401b4152ca6ffb61f..f47fb04cbaa688b75e763ff9d3cb7df44ac3f166 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -124,6 +124,20 @@ config_setting(
     },
 )
 
+config_setting(
+    name = "darwin_x86_64",
+    values = {
+        "cpu": "darwin_x86_64",
+    },
+)
+
+config_setting(
+    name = "freebsd",
+    values = {
+        "cpu": "freebsd",
+    },
+)
+
 cc_library(
     name = "optimized_base",
     srcs = [],
@@ -138,7 +152,7 @@ cc_library(
         ":types",
         ":round",
         "//third_party/eigen3",
-        "@gemmlowp//:gemmlowp",
+        "@gemmlowp",
         "//tensorflow/contrib/lite:builtin_op_data",
     ] + select({
         ":haswell": tflite_deps_intel,
@@ -147,6 +161,8 @@ cc_library(
         ":x86": tflite_deps_intel,
         ":x86_64": tflite_deps_intel,
         ":darwin": tflite_deps_intel,
+        ":darwin_x86_64": tflite_deps_intel,
+        ":freebsd": tflite_deps_intel,
         "//conditions:default": [],
     }),
 )
@@ -154,6 +170,8 @@ cc_library(
 cc_library(
     name = "optimized",
     hdrs = [
+        "optimized/cblas_conv.h",
+        "optimized/cblas_reference.h",
         "optimized/eigen_spatial_convolutions.h",
         "optimized/eigen_tensor_reduced_instantiations_oss.h",
         "optimized/multithreaded_conv.h",
@@ -215,7 +233,7 @@ cc_library(
         ":round",
         ":types",
         "//third_party/eigen3",
-        "@gemmlowp//:gemmlowp",
+        "@gemmlowp",
         "//tensorflow/contrib/lite:builtin_op_data",
     ] + select({
         ":haswell": tflite_deps_intel,
@@ -224,6 +242,8 @@ cc_library(
         ":x86": tflite_deps_intel,
         ":x86_64": tflite_deps_intel,
         ":darwin": tflite_deps_intel,
+        ":darwin_x86_64": tflite_deps_intel,
+        ":freebsd": tflite_deps_intel,
         "//conditions:default": [],
     }),
 )
@@ -258,6 +278,8 @@ cc_library(
         "optimized/neon_tensor_utils.cc",
     ],
     hdrs = [
+        "common.h",
+        "optimized/cpu_check.h",
         "optimized/neon_tensor_utils.h",
         "optimized/tensor_utils_impl.h",
     ],
@@ -265,8 +287,21 @@ cc_library(
     deps = [
         ":cpu_check",
         ":portable_tensor_utils",
+        ":types",
         "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite/kernels:activation_functor",
+        "@arm_neon_2_x86_sse",
+        "@gemmlowp",
+    ],
+)
+
+cc_library(
+    name = "kernel_utils",
+    srcs = ["kernel_utils.cc"],
+    hdrs = ["kernel_utils.h"],
+    deps = [
+        ":tensor_utils",
+        "//tensorflow/contrib/lite:builtin_op_data",
     ],
 )
 
@@ -276,14 +311,21 @@ cc_library(
         "tensor_utils.cc",
     ],
     hdrs = [
+        "common.h",
+        "compatibility.h",
+        "optimized/cpu_check.h",
+        "optimized/neon_tensor_utils.h",
         "optimized/tensor_utils_impl.h",
         "reference/portable_tensor_utils.h",
         "tensor_utils.h",
+        "types.h",
     ],
     copts = NEON_FLAGS_IF_APPLICABLE,
     deps = [
         "//tensorflow/contrib/lite/kernels:activation_functor",
         "//tensorflow/contrib/lite:builtin_op_data",
+        "@arm_neon_2_x86_sse",
+        "@gemmlowp",
     ] + select({
         ":arm": [
             ":neon_tensor_utils",
@@ -303,6 +345,21 @@ cc_library(
         ":ios_arm64": [
             ":neon_tensor_utils",
         ],
+        ":ios_x86_64": [
+            ":neon_tensor_utils",
+        ],
+        ":x86_64": [
+            ":neon_tensor_utils",
+        ],
+        ":x86": [
+            ":neon_tensor_utils",
+        ],
+        ":k8": [
+            ":neon_tensor_utils",
+        ],
+        ":darwin": [
+            ":neon_tensor_utils",
+        ],
         "//conditions:default": [
             ":portable_tensor_utils",
         ],
diff --git a/tensorflow/contrib/lite/kernels/internal/common.h b/tensorflow/contrib/lite/kernels/internal/common.h
index 28f19a250629aec4d03aa71df57d31d8a5014e9f..18601df22c1894dea6ce51f46ba815cd12dab095 100644
--- a/tensorflow/contrib/lite/kernels/internal/common.h
+++ b/tensorflow/contrib/lite/kernels/internal/common.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
 
 #ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
 #ifdef GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
@@ -102,6 +102,17 @@ inline int32 MultiplyByQuantizedMultiplierGreaterThanOne(
                                            quantized_multiplier);
 }
 
+inline int32 MultiplyByQuantizedMultiplier(int32 x, int32 quantized_multiplier,
+                                           int shift) {
+  using gemmlowp::RoundingDivideByPOT;
+  using gemmlowp::SaturatingRoundingDoublingHighMul;
+  int left_shift = shift > 0 ? shift : 0;
+  int right_shift = shift > 0 ? 0 : -shift;
+  return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
+                                 x * (1 << left_shift), quantized_multiplier),
+                             right_shift);
+}
+
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/compatibility.h b/tensorflow/contrib/lite/kernels/internal/compatibility.h
index 796a03566a4bf971294dd2375f590dfd20d600f7..51426bb1c584b82af7b1a2ffaf5a675a1dd9a6fd 100644
--- a/tensorflow/contrib/lite/kernels/internal/compatibility.h
+++ b/tensorflow/contrib/lite/kernels/internal/compatibility.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
 
 #include <cassert>
 #include <cstdint>
@@ -27,6 +27,10 @@ limitations under the License.
 #define TFLITE_DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : assert(false)
 #endif
 
+#ifndef TFLITE_DCHECK_NE
+#define TFLITE_DCHECK_NE(x, y) ((x) != (y)) ? (void)0 : assert(false)
+#endif
+
 #ifndef TFLITE_DCHECK_GE
 #define TFLITE_DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : assert(false)
 #endif
@@ -52,6 +56,10 @@ limitations under the License.
 #define TFLITE_CHECK_EQ(x, y) ((x) == (y)) ? (void)0 : abort()
 #endif
 
+#ifndef TFLITE_CHECK_NE
+#define TFLITE_CHECK_NE(x, y) ((x) != (y)) ? (void)0 : abort()
+#endif
+
 #ifndef TFLITE_CHECK_GE
 #define TFLITE_CHECK_GE(x, y) ((x) >= (y)) ? (void)0 : abort()
 #endif
@@ -75,4 +83,4 @@ using uint16 = std::uint16_t;
 using int32 = std::int32_t;
 using uint32 = std::uint32_t;
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..510395126ce3785b1d44fec1e0eb994c29ff0db7
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
@@ -0,0 +1,44 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+
+namespace tflite {
+namespace kernel_utils {
+
+void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
+                  const float* recurrent_weights_ptr, const float* bias_ptr,
+                  int input_size, int num_units, int batch_size,
+                  TfLiteFusedActivation activation,
+                  float* hidden_state_ptr_batch, float* output_ptr_batch) {
+  // Output = bias
+  tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size,
+                                        output_ptr_batch);
+  // Output += input * input_weights
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      input_weights_ptr, num_units, input_size, input_ptr_batch, batch_size,
+      output_ptr_batch, /*result_stride=*/1);
+  // Output += recurrent_weights * hidden_state
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      recurrent_weights_ptr, num_units, num_units, hidden_state_ptr_batch,
+      batch_size, output_ptr_batch, /*result_stride=*/1);
+  // Output = activation(Output) and update hidden_state
+  tensor_utils::ApplyActivationToVector(
+      output_ptr_batch, num_units * batch_size, activation, output_ptr_batch);
+  tensor_utils::VectorBatchVectorAssign(output_ptr_batch, num_units, batch_size,
+                                        hidden_state_ptr_batch);
+}
+
+}  // namespace kernel_utils
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.h b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..9872d4500b862388ed4b96c97e3755f548e35d35
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+
+namespace tflite {
+namespace kernel_utils {
+
+// Performs an RNN batch inference step for inputs specified by input_ptr_batch.
+// The RNN cell is specified by the pointers to its input and recurrent weights,
+// and biases, along with the input size, number of units, activation.
+//
+// The pointers to the hidden state and the output are updated as a result.
+//
+// The pointers with the suffix "_batch" point to data aligned in batch_major
+// order, and each step processes batch_size many inputs from input_ptr_batch,
+// and updates batch_size many outputs and hidden states.
+void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
+                  const float* recurrent_weights_ptr, const float* bias_ptr,
+                  int input_size, int num_units, int batch_size,
+                  TfLiteFusedActivation activation,
+                  float* hidden_state_ptr_batch, float* output_ptr_batch);
+
+}  // namespace kernel_utils
+}  // namespace tflite
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a90e7e640ef29b675c236d8bbb479aa16560761
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h
@@ -0,0 +1,92 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_CONV_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_CONV_H_
+
+// The Conv implementation based on CBLAS interface. This is only used on iOS
+// for now, utilizing Apple's Accelerate framework.
+
+#if TFLITE_USE_APPLE_ACCELERATE_FOR_CONV
+#include <Accelerate/Accelerate.h>
+#else
+#include "tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h"
+#endif
+
+#include "tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+
+namespace tflite {
+namespace cblas_ops {
+
+inline void Conv(const float* input_data, const Dims<4>& input_dims,
+                 const float* filter_data, const Dims<4>& filter_dims,
+                 const float* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, float output_activation_min,
+                 float output_activation_max, float* output_data,
+                 const Dims<4>& output_dims, float* im2col_data,
+                 const Dims<4>& im2col_dims) {
+  gemmlowp::ScopedProfilingLabel label("Conv/cblas");
+
+  const float* gemm_input_data = nullptr;
+  const Dims<4>* gemm_input_dims = nullptr;
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const bool need_im2col = stride_width != 1 || stride_height != 1 ||
+                           filter_width != 1 || filter_height != 1;
+  if (need_im2col) {
+    TFLITE_DCHECK(im2col_data);
+    optimized_ops::Im2col(input_data, input_dims, stride_width, stride_height,
+                          pad_width, pad_height, filter_height, filter_width, 0,
+                          im2col_data, im2col_dims);
+    gemm_input_data = im2col_data;
+    gemm_input_dims = &im2col_dims;
+  } else {
+    TFLITE_DCHECK(!im2col_data);
+    gemm_input_data = input_data;
+    gemm_input_dims = &input_dims;
+  }
+
+  // The following code computes matrix multiplication c = a * transponse(b)
+  // with CBLAS, where:
+  // * `a` is a matrix with dimensions (m, k).
+  // * `b` is a matrix with dimensions (n, k), so transpose(b) is (k, n).
+  // * `c` is a matrix with dimensions (m, n).
+  // The naming of variables are aligned with CBLAS specification here.
+  const float* a = gemm_input_data;
+  const float* b = filter_data;
+  float* c = output_data;
+  int m = gemm_input_dims->sizes[1] * gemm_input_dims->sizes[2] *
+          gemm_input_dims->sizes[3];
+  int n = output_dims.sizes[0];
+  int k = gemm_input_dims->sizes[0];
+  // The stride of matrix a, b and c respectively.
+  int stride_a = k;
+  int stride_b = k;
+  int stride_c = n;
+
+  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, 1.0f, a,
+              stride_a, b, stride_b, 0.0f, c, stride_c);
+
+  optimized_ops::AddBiasAndEvalActivationFunction(
+      bias_data, bias_dims, output_data, output_dims, output_activation_min,
+      output_activation_max);
+}
+
+}  // namespace cblas_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_CONV_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h
new file mode 100644
index 0000000000000000000000000000000000000000..6acc513805c9398c304f3e24175d3bd6c96938f6
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h
@@ -0,0 +1,69 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_REFERENCE_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_REFERENCE_H_
+
+#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
+
+// The reference implementation for a small subset of CBLAS interface.
+// This is only used for testing CBLAS implementation, and should never be used
+// in production code.
+
+namespace tflite {
+namespace cblas_ops {
+
+// The following code follows the original CBLAS specification, and it might
+// conflict with the TensorFlow naming convention.
+// TODO(ycling): Find another way to test CBLAS with bazel, without writing
+// a reference implementation by ourselves.
+enum CBLAS_ORDER { CblasRowMajor = 0, CblasColMajor = 1 };
+
+enum CBLAS_TRANSPOSE { CblasNoTrans = 0, CblasTrans = 1, CblasConjTrans = 2 };
+
+// A reference implementation for matrix multiplication.
+// The following code computes, c = a * transponse(b) matrix multiplication
+// with CBLAS, where:
+// * `a` is a matrix with dimensions (m, k).
+// * `b` is a matrix with dimensions (n, k), so transpose(b) is (k, n).
+// * `c` is a matrix with dimensions (m, n).
+// The naming of variables is aligned with CBLAS specification here.
+void cblas_sgemm(const enum CBLAS_ORDER order,
+                 const enum CBLAS_TRANSPOSE trans_a,
+                 const enum CBLAS_TRANSPOSE trans_b, const int m, const int n,
+                 const int k, const float alpha, const float *a,
+                 const int stride_a, const float *b, const int stride_b,
+                 const float beta, float *c, const int stride_c) {
+  TFLITE_DCHECK(order == CblasRowMajor);
+  TFLITE_DCHECK(trans_a == CblasNoTrans);
+  TFLITE_DCHECK(trans_b == CblasTrans);
+  TFLITE_DCHECK(beta == 0.0f);
+  for (int row = 0; row < m; ++row) {
+    for (int col = 0; col < n; ++col) {
+      // If `beta` non-zero, multiple it with the original values in output.
+      // Otherwise, ignore the original value in output completely.
+      float value = 0.0f;
+      for (int idx = 0; idx < k; ++idx) {
+        value += alpha * a[stride_a * row + idx] * b[stride_b * col + idx];
+      }
+      c[stride_c * row + col] = value;
+    }
+  }
+}
+
+}  // namespace cblas_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_REFERENCE_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h b/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
index dea46cc12065ed34cf681916a46a55bd7a86f463..3a53d3ab07faf63250fc18fc846e0b8f5a39d9c4 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
@@ -34,17 +34,13 @@ inline bool TestCPUFeatureNeon() {
 #endif  // __aarch64__
 }
 
-#elif __ARM_NEON
+#elif defined USE_NEON || defined __ARM_NEON
 
-inline bool TestCPUFeatureNeon() {
-  return true;
-}
+inline bool TestCPUFeatureNeon() { return true; }
 
 #else
 
-inline bool TestCPUFeatureNeon() {
-  return false;
-}
+inline bool TestCPUFeatureNeon() { return false; }
 
 #endif
 
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
index 974611f52ac74cec275f978c5af5bd561688db78..7f6eea2d5d1cfd6f4e2a569760ecbe0d96f754c8 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
 
 #include "public/gemmlowp.h"
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
@@ -311,6 +311,9 @@ struct FloatDepthwiseConvKernel<true, 0, 8> {
   }
 };
 
+// Note this implementation is very slow for input_depths < 8
+// (e.g. comparable to reference implementation) see, specializations for
+// input_depth=3 below.
 template <>
 struct FloatDepthwiseConvKernel<true, 0, 2> {
   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
@@ -417,6 +420,74 @@ struct FloatDepthwiseConvKernel<true, 0, 2> {
   }
 };
 
+template <>
+struct FloatDepthwiseConvKernel<true, 3, 2> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Load the filters
+    float32x2_t filter[3];
+    for (int i = 0; i < 3; i++) {
+      filter[i] = vld1_f32(filter_ptr + 2 * i);
+    }
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      const float32x2_t input01 = vld1_f32(input_ptr);
+      const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
+      // Load the accumulators from acc_buffer
+      float32x2_t acc[3];
+      for (int i = 0; i < 3; i++) {
+        acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
+      }
+      // Multiply-accumulate for each input channel there 2 outputs
+      acc[0] = vmla_lane_f32(acc[0], filter[0], input01, 0);
+      acc[1] = vmla_lane_f32(acc[1], filter[1], input01, 1);
+      acc[2] = vmla_lane_f32(acc[2], filter[2], input2, 0);
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 3; i++) {
+        vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
+      }
+      acc_buffer_ptr += 6;
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 3, 4> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Load the filters
+    float32x4_t filter[3];
+    for (int i = 0; i < 3; i++) {
+      filter[i] = vld1q_f32(filter_ptr + 4 * i);
+    }
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      // NOTE: we only want 3 values, so we read it as two ops where
+      // the second op just duplicates the lane
+      const float32x2_t input01 = vld1_f32(input_ptr);
+      const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
+      // Load the accumulators from acc_buffer
+      float32x4_t acc[3];
+      for (int i = 0; i < 3; i++) {
+        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate all outputs.
+      acc[0] = vmlaq_lane_f32(acc[0], filter[0], input01, 0);
+      acc[1] = vmlaq_lane_f32(acc[1], filter[1], input01, 1);
+      acc[2] = vmlaq_lane_f32(acc[2], filter[2], input2, 0);
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 3; i++) {
+        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 12;
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
 template <>
 struct FloatDepthwiseConvKernel<true, 1, 8> {
   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
@@ -502,6 +573,46 @@ struct FloatDepthwiseConvKernel<true, 1, 32> {
   }
 };
 
+template <>
+struct FloatDepthwiseConvKernel<true, 1, 20> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Load the filters
+    float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
+    float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
+    float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
+    float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
+    float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);
+
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      // Load the inputs
+      const float input_val = *input_ptr;
+      input_ptr += input_ptr_increment;
+      // Load the accumulators from acc_buffer
+      float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
+      float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
+      float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
+      float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
+      float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
+      // Multiply-accumulate
+      acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
+      acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
+      acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
+      acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
+      acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
+      // Store the accumulators back to acc_buffer
+      vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
+      vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
+      vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
+      vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
+      vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
+      acc_buffer_ptr += 20;
+    }
+  }
+};
+
 template <>
 struct FloatDepthwiseConvKernel<true, 0, 16> {
   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
@@ -855,8 +966,11 @@ inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
 
   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 2)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 4)
   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)
 
   // Finally, the kernels allowing a variable input depth,
@@ -919,11 +1033,11 @@ inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
           for (int k = 0; k < 4; k++) {
             acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
           }
-            for (int k = 0; k < 4; k++) {
-              acc[k] = vmaxq_f32(
-                  vdupq_n_f32(output_activation_min),
-                  vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
-            }
+          for (int k = 0; k < 4; k++) {
+            acc[k] = vmaxq_f32(
+                vdupq_n_f32(output_activation_min),
+                vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
+          }
           for (int k = 0; k < 4; k++) {
             vst1q_f32(output_ptr + 4 * k, acc[k]);
           }
@@ -984,4 +1098,4 @@ void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
 }  // namespace optimized_ops
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
index 051ed2a2c44a04f0473dfd26637e53865a5a51ac..dbc4f0d6fdca8279072d6ea225334722d6a89eb2 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
 
 #include "fixedpoint/fixedpoint.h"
 #include "public/gemmlowp.h"
@@ -1205,6 +1205,55 @@ struct QuantizedDepthwiseConvKernel<true, 1, 32> {
   }
 };
 
+template <>
+struct QuantizedDepthwiseConvKernel<true, 1, 20> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8.
+    // We load the first 16 bytes into filter_u8_{0,1} as usual.
+    // Then we load the 8 last bytes into filter_u8_x  (x for 'extra').
+    // This is redundant: the first 4 bytes of filter_u8_x are the same
+    // as the last 4 bytes of filter_u8_x.
+    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
+    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
+    uint8x8_t filter_u8_x = vld1_u8(filter_ptr + 8 * 1 + 4);
+    int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
+    int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
+    int16x8_t filter_x = vreinterpretq_s16_u16(vmovl_u8(filter_u8_x));
+    filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
+    filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
+    filter_x = vaddq_s16(filter_x, vdupq_n_s16(filter_offset));
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      uint8 input_u8 = *input_ptr;
+      input_ptr += input_ptr_increment;
+      int16 input = static_cast<int16>(input_u8 + input_offset);
+      // Load the accumulators from acc_buffer
+      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
+      // Multiply-accumulate
+      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
+      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
+      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
+      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
+      acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input);
+      // Store the accumulators back to acc_buffer
+      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
+      acc_buffer_ptr += 20;
+    }
+  }
+};
+
 template <>
 struct QuantizedDepthwiseConvKernel<true, 1, 8> {
   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
@@ -1504,7 +1553,7 @@ inline void QuantizedDepthwiseConvAccumRowGeneric(
       << "*\n"
       << "* If you would like to carry on with the slow code, compile\n"
       << "* with this preprocessor token defined:\n"
-      << "* TFLITE_ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK.\n"
+      << "* ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK.\n"
       << "*\n"
       << "* The right thing to do, if you care about performance, is to add\n"
       << "* a new DepthwiseConv kernel to tfmini to cover your case.\n"
@@ -1691,6 +1740,7 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)
   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)
   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
@@ -1913,4 +1963,4 @@ void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
 }  // namespace optimized_ops
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
index 8004c24a9914e216974539930853d0aadf61e324..ce3cde76999c77e1f9bf1eaccdba7e84ed508dda 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
@@ -16,8 +16,8 @@ limitations under the License.
 // Copied from tensorflow/core/kernels/eigen_spatial_convolutions.h.
 // TODO(petewarden) - move this to a common location in Eigen itself.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
 
 #define EIGEN_USE_CUSTOM_THREAD_POOL
 #define EIGEN_USE_THREADS
@@ -39,7 +39,6 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #endif
 
-
 namespace Eigen {
 
 /** SpatialConvolution
@@ -215,17 +214,16 @@ EIGEN_DEVICE_FUNC
   }
   // TODO(yangke): choose() is defined in TensorContraction.h -- consider
   // moving it to somewhere more "common".
-  return
-      input
-          .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
-                                 row_in_stride, col_in_stride, padding_type)
-          .reshape(pre_contract_dims)
-          .contract(kernel.reshape(kernel_dims), contract_dims)
-          .reshape(post_contract_dims);
+  return input
+      .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
+                             row_in_stride, col_in_stride, padding_type)
+      .reshape(pre_contract_dims)
+      .contract(kernel.reshape(kernel_dims), contract_dims)
+      .reshape(post_contract_dims);
 }
 
 }  // end namespace Eigen
 
 // clang-format on
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h
index 7f78f69360b1ebbfb08600c8bc427f1ba9d5244d..d85e06a5d5af8d23235a08592d49754e4f493d34 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
 
 #define EIGEN_USE_CUSTOM_THREAD_POOL
 #define EIGEN_USE_THREADS
@@ -140,4 +140,4 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h"
 
 #include "Eigen/src/Core/util/ReenableStupidWarnings.h"
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_H
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_H
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h
index 1d5c316194df0b87ee7eecbdd04bd5ce9e2e40b5..d34708b8fd0c0732c13ddbd8d70c87a278c40ff8 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h
@@ -19,8 +19,8 @@ limitations under the License.
 // clang-format off
 
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
 
 
 #include "Eigen/Core"
@@ -164,4 +164,4 @@ typedef unsigned __int64 uint64_t;
 #include "Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h b/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h
index b3615f4658a1a70284cc9d386a868a87aa09819b..0bfb4e9b1f8ee4167cfb629645a38538be1d73d4 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
 
 #include <assert.h>
 #include <stdint.h>
@@ -192,4 +192,4 @@ inline void Conv(const float* input_data, const Dims<4>& input_dims,
 }  // namespace multithreaded_ops
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
index bf0bdfb1fb875c4b54c55e25d4a17541507ecd4c..780401e052733cccae0cc34f495df090c1530624 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
@@ -15,12 +15,13 @@ limitations under the License.
 #include <string.h>
 
 #include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
 #include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
 #include "tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h"
 
 #ifdef USE_NEON
 
-#include <arm_neon.h>
 #define kFloatWeightsPerNeonLane 4
 
 namespace tflite {
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h
index 3a4af87304eaf33489b38bd9b15ad9789e091d24..b7e317dc60e2c68e9e993ff45c9090a01bd13b94 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
 
 // TODO(ghodrat): Remove this header file and the dependency to internal data
 // structure.
@@ -110,4 +110,4 @@ void ReductionSumVector(const float* input_vector, float* output_vector,
 }  // namespace tensor_utils
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index cd565c16a1ee7226f83c19f0020beed75e401497..cd52385f417b469a24b6aa2b15f54ddad5fa9731 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_
 
 #include <assert.h>
 #include <stdint.h>
@@ -1538,9 +1538,10 @@ void Add(const int32* input1_data, const Dims<4>& input1_dims,
 // reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
 // is no longer referenced in this file, move NdArrayDesc<T> from types.h to
 // reference_ops.h.
-template <FusedActivationFunctionType Ac, typename T>
+template <typename T>
 void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
                   const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
                   T* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("BroadcastAdd");
 
@@ -1563,15 +1564,30 @@ void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
     for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
       for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
         for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
-              input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
         }
       }
     }
   }
 }
 
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T* output_data, const Dims<4>& output_dims) {
+  T output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims,
+               output_activation_min, output_activation_max, output_data,
+               output_dims);
+}
+
 inline void BroadcastAdd(int left_shift, const uint8* input1_data,
                          const Dims<4>& input1_dims, int32 input1_offset,
                          int32 input1_multiplier, int input1_shift,
@@ -1772,9 +1788,10 @@ void Mul(const int32* input1_data, const Dims<4>& input1_dims,
 // reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
 // is no longer referenced in this file, move NdArrayDesc<T> from types.h to
 // reference_ops.h.
-template <FusedActivationFunctionType Ac, typename T>
+template <typename T>
 void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
                   const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
                   T* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("BroadcastMul");
 
@@ -1797,15 +1814,30 @@ void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
     for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
       for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
         for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
-              input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
         }
       }
     }
   }
 }
 
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T* output_data, const Dims<4>& output_dims) {
+  T output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  BroadcastMul(input1_data, input1_dims, input2_data, input2_dims,
+               output_activation_min, output_activation_max, output_data,
+               output_dims);
+}
+
 inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                          int32 input1_offset, const uint8* input2_data,
                          const Dims<4>& input2_dims, int32 input2_offset,
@@ -1868,6 +1900,61 @@ inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                output_data, output_dims);
 }
 
+// TODO(aselle): This is not actually optimized yet.
+inline void Div(const float* input1_data, const Dims<4>& input1_dims,
+                const float* input2_data, const Dims<4>& input2_dims,
+                float output_activation_min, float output_activation_max,
+                float* output_data, const Dims<4>& output_dims) {
+  const int batches =
+      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
+  const int height =
+      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
+  const int width =
+      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
+  const int depth =
+      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[Offset(input1_dims, c, x, y, b)] /
+                      input2_data[Offset(input2_dims, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+// TODO(aselle): This is not actually optimized yet.
+inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
+                const float* input2_data, const Dims<4>& input2_dims,
+                float output_activation_min, float output_activation_max,
+                float* output_data, const Dims<4>& output_dims) {
+  const int batches =
+      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
+  const int height =
+      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
+  const int width =
+      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
+  const int depth =
+      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[Offset(input1_dims, c, x, y, b)] -
+                      input2_data[Offset(input2_dims, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
 template <FusedActivationFunctionType Ac, typename Scalar>
 void Concatenation(int concat_dim, const Scalar* const* input_data,
                    const Dims<4>* const* input_dims, int inputs_count,
@@ -1994,6 +2081,166 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
       output_state_map.tanh();
 }
 
+// Quantized LSTM cell. Currently just a copy of the reference impl in
+// reference_ops.h. See the big function comment there, not replicating it
+// here.
+template <int StateIntegerBits>
+void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
+              const uint8* prev_activ_data_uint8,
+              const Dims<4>& prev_activ_dims, const uint8* weights_data_uint8,
+              const Dims<4>& weights_dims, const int32* bias_data_int32,
+              const Dims<4>& bias_dims, const int16* prev_state_data_int16,
+              const Dims<4>& prev_state_dims, int16* output_state_data_int16,
+              const Dims<4>& output_state_dims, uint8* output_activ_data_uint8,
+              const Dims<4>& output_activ_dims, uint8* concat_temp_data_uint8,
+              const Dims<4>& concat_temp_dims, int16* activ_temp_data_int16,
+              const Dims<4>& activ_temp_dims, int32 weights_zero_point,
+              int32 accum_multiplier, int accum_shift) {
+  gemmlowp::ScopedProfilingLabel label(
+      "LstmCell/quantized (8bit external, 16bit internal)");
+  // Gather dimensions information, and perform consistency checks.
+  const int batches =
+      MatchingArraySize(input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3,
+                        output_state_dims, 3, output_activ_dims, 3);
+  const int height =
+      MatchingArraySize(input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2,
+                        output_state_dims, 2, output_activ_dims, 2);
+  const int width =
+      MatchingArraySize(input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1,
+                        output_state_dims, 1, output_activ_dims, 1);
+  TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
+  TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
+  const int input_depth = ArraySize(input_dims, 0);
+  const int prev_activ_depth = ArraySize(prev_activ_dims, 0);
+  const int total_input_depth = prev_activ_depth + input_depth;
+  TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth);
+  TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3),
+                  1);
+  const int intern_activ_depth =
+      MatchingArraySize(weights_dims, 1, bias_dims, 0);
+  TFLITE_CHECK_EQ(intern_activ_depth % 4, 0);
+  const int output_depth =
+      MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
+                        output_state_dims, 0, output_activ_dims, 0);
+  TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);
+  const int fc_batches = ArraySize(activ_temp_dims, 1) *
+                         ArraySize(activ_temp_dims, 2) *
+                         ArraySize(activ_temp_dims, 3);
+  const int fc_output_depth =
+      MatchingArraySize(weights_dims, 1, activ_temp_dims, 0);
+  const int fc_accum_depth = ArraySize(weights_dims, 0);
+  TFLITE_CHECK_EQ(fc_output_depth, 4 * output_depth);
+
+  // Depth-concatenate prev_activ and input data together.
+  uint8 const* concat_input_arrays_data[2] = {input_data_uint8,
+                                              prev_activ_data_uint8};
+  Dims<4> const* concat_input_arrays_dims[2] = {&input_dims, &prev_activ_dims};
+  Concatenation<FusedActivationFunctionType::kNone, uint8>(
+      0, concat_input_arrays_data, concat_input_arrays_dims, 2,
+      concat_temp_data_uint8, concat_temp_dims);
+
+  // Implementation of the fully connected node inside the LSTM cell.
+  // The operands are 8-bit integers, the accumulators are internally 32bit
+  // integers, and the output is 16-bit fixed-point with 3 integer bits so
+  // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that
+  // is explained in the function comment above.
+  for (int b = 0; b < fc_batches; ++b) {
+    for (int out_c = 0; out_c < fc_output_depth; ++out_c) {
+      // Internal accumulation.
+      // Initialize accumulator with the bias-value.
+      int32 accum = bias_data_int32[out_c];
+      // Accumulation loop.
+      for (int d = 0; d < fc_accum_depth; ++d) {
+        int16 input_val = concat_temp_data_uint8[b * fc_accum_depth + d] - 128;
+        int16 weights_val =
+            weights_data_uint8[out_c * fc_accum_depth + d] - weights_zero_point;
+        accum += input_val * weights_val;
+      }
+      // Down-scale the final int32 accumulator to the scale used by our
+      // (16-bit, using 3 integer bits) fixed-point format. The quantized
+      // multiplier and shift here have been pre-computed offline
+      // (e.g. by toco).
+      // Note that the implicit assumption here, that this multiplier is smaller
+      // than one, is equivalent to the assumption that the fully-connected
+      // weights min-max is enclosed within [-4, 4] (it may be narrower).
+      // If that eventually fails, offline tools (e.g. toco) will fail early
+      // and that will be easy to support as needed. For now, assuming that
+      // this multiplier is less than one allows us to use a simpler, more
+      // accurate implementation.
+      accum =
+          MultiplyByQuantizedMultiplier(accum, accum_multiplier, accum_shift);
+      // Saturate, cast to int16, and store to the temporary activations array.
+      accum = std::max(-32768, std::min(32767, accum));
+      activ_temp_data_int16[out_c + fc_output_depth * b] = accum;
+    }
+  }
+
+  // Rest of the LSTM cell: tanh and logistic math functions, and some adds
+  // and muls, all done in 16-bit fixed-point.
+  const int outer_size = batches * width * height;
+  for (int b = 0; b < outer_size; ++b) {
+    for (int c = 0; c < output_depth; ++c) {
+      // Define the fixed-point data types that we will use here. All use
+      // int16 as the underlying integer type i.e. all are 16-bit fixed-point.
+      // They only differ by the number of integral vs. fractional bits,
+      // determining the range of values that they can represent.
+      //
+      // F0 uses 0 integer bits, range [-1, 1].
+      // This is the return type of math functions such as tanh, logistic,
+      // whose range is in [-1, 1].
+      using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+      // F3 uses 3 integer bits, range [-8, 8].
+      // This is the range of the previous fully-connected node's output,
+      // which is our input here.
+      using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+      // FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits,
+      // 2^StateIntegerBits]. It's used to represent the internal state, whose
+      // number of integer bits is currently dictated by the model. See comment
+      // on the StateIntegerBits template parameter above.
+      using FS = gemmlowp::FixedPoint<std::int16_t, StateIntegerBits>;
+      // Implementation of input gate, using fixed-point logistic function.
+      F3 input_gate_input = F3::FromRaw(
+          activ_temp_data_int16[b * fc_output_depth + 0 * output_depth + c]);
+      F0 input_gate_output = gemmlowp::logistic(input_gate_input);
+      // Implementation of input modulation gate, using fixed-point tanh
+      // function.
+      F3 input_modulation_gate_input = F3::FromRaw(
+          activ_temp_data_int16[b * fc_output_depth + 1 * output_depth + c]);
+      F0 input_modulation_gate_output =
+          gemmlowp::tanh(input_modulation_gate_input);
+      // Implementation of forget gate, using fixed-point logistic function.
+      F3 forget_gate_input = F3::FromRaw(
+          activ_temp_data_int16[b * fc_output_depth + 2 * output_depth + c]);
+      F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
+      // Implementation of output gate, using fixed-point logistic function.
+      F3 output_gate_input = F3::FromRaw(
+          activ_temp_data_int16[b * fc_output_depth + 3 * output_depth + c]);
+      F0 output_gate_output = gemmlowp::logistic(output_gate_input);
+      // Implementation of internal multiplication nodes, still in fixed-point.
+      F0 input_times_input_modulation =
+          input_gate_output * input_modulation_gate_output;
+      FS prev_state = FS::FromRaw(prev_state_data_int16[b * output_depth + c]);
+      FS prev_state_times_forget_state = forget_gate_output * prev_state;
+      // Implementation of internal addition node, saturating.
+      FS new_state = gemmlowp::SaturatingAdd(
+          gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
+          prev_state_times_forget_state);
+      // Implementation of last internal tanh node, still in fixed-point.
+      F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state);
+      // Store the new internal state back to memory, as 16-bit integers.
+      output_state_data_int16[b * output_depth + c] = new_state.raw();
+      // Down-scale the output activations to 8-bit integers, saturating,
+      // and store back to memory.
+      int16 rescaled_output_activ =
+          gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8);
+      int16 clamped_output_activ =
+          std::max<int16>(-128, std::min<int16>(127, rescaled_output_activ));
+      output_activ_data_uint8[b * output_depth + c] =
+          128 + clamped_output_activ;
+    }
+  }
+}
+
 template <FusedActivationFunctionType Ac, typename Scalar>
 void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims,
                      int outputs_count, Scalar* const* output_data,
@@ -2851,6 +3098,156 @@ inline void Tanh(const float* input_data, const Dims<4>& input_dims,
   output_map.array() = input_map.array().tanh();
 }
 
+inline void Tanh(const uint8* input_data, const Dims<4>& input_dims,
+                 int32 input_zero_point, int32 input_range_radius,
+                 int32 input_multiplier, int input_left_shift,
+                 uint8* output_data, const Dims<4>& output_dims) {
+  // Note that this is almost the exact same code as in Logistic().
+  gemmlowp::ScopedProfilingLabel label("Tanh");
+  /* batches */ MatchingArraySize(input_dims, 3, output_dims, 3);
+  /* height */ MatchingArraySize(input_dims, 2, output_dims, 2);
+  /* width */ MatchingArraySize(input_dims, 1, output_dims, 1);
+  /* depth */ MatchingArraySize(input_dims, 0, output_dims, 0);
+  const int size = RequiredBufferSizeForDims(input_dims);
+
+  int c = 0;
+  int32_t output_zero_point = 128;
+#ifdef USE_NEON
+  // Handle 16 values at a time
+  for (; c <= size - 16; c += 16) {
+    // Read input uint8 values, cast to int16 and subtract input_zero_point
+    uint8x16_t input_val_u8 = vld1q_u8(input_data + c);
+    int16x8_t input_val_centered_0 =
+        vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(input_val_u8))),
+                  vdupq_n_s16(input_zero_point));
+    int16x8_t input_val_centered_1 =
+        vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(input_val_u8))),
+                  vdupq_n_s16(input_zero_point));
+
+    // Prepare the bit masks that we will use at the end to implement the logic
+    // that was expressed in the scalar code with branching:
+    //   if (input_val_centered < -input_range_radius) {
+    //     output_val = 0;
+    //   } else if (input_val_centered > input_range_radius) {
+    //     output_val = 255;
+    //   } else {
+    //     ...
+    uint16x8_t mask_rightclamp_0 =
+        vcgtq_s16(input_val_centered_0, vdupq_n_s16(input_range_radius));
+    uint16x8_t mask_rightclamp_1 =
+        vcgtq_s16(input_val_centered_1, vdupq_n_s16(input_range_radius));
+    uint16x8_t mask_leftclamp_0 =
+        vcgeq_s16(input_val_centered_0, vdupq_n_s16(-input_range_radius));
+    uint16x8_t mask_leftclamp_1 =
+        vcgeq_s16(input_val_centered_1, vdupq_n_s16(-input_range_radius));
+    uint8x16_t mask_rightclamp = vcombine_u8(vshrn_n_u16(mask_rightclamp_0, 8),
+                                             vshrn_n_u16(mask_rightclamp_1, 8));
+    uint8x16_t mask_leftclamp = vcombine_u8(vshrn_n_u16(mask_leftclamp_0, 8),
+                                            vshrn_n_u16(mask_leftclamp_1, 8));
+
+    // This performs what is expressed in the scalar code as
+    // const int32 input_val_rescaled =
+    //     MultiplyByQuantizedMultiplierGreaterThanOne(
+    //         input_val_centered, input_multiplier, input_left_shift);
+    int32x4_t input_val_rescaled_0 =
+        vshlq_s32(vmovl_s16(vget_low_s16(input_val_centered_0)),
+                  vdupq_n_s32(input_left_shift));
+    int32x4_t input_val_rescaled_1 =
+        vshlq_s32(vmovl_s16(vget_high_s16(input_val_centered_0)),
+                  vdupq_n_s32(input_left_shift));
+    int32x4_t input_val_rescaled_2 =
+        vshlq_s32(vmovl_s16(vget_low_s16(input_val_centered_1)),
+                  vdupq_n_s32(input_left_shift));
+    int32x4_t input_val_rescaled_3 =
+        vshlq_s32(vmovl_s16(vget_high_s16(input_val_centered_1)),
+                  vdupq_n_s32(input_left_shift));
+    input_val_rescaled_0 =
+        vqrdmulhq_n_s32(input_val_rescaled_0, input_multiplier);
+    input_val_rescaled_1 =
+        vqrdmulhq_n_s32(input_val_rescaled_1, input_multiplier);
+    input_val_rescaled_2 =
+        vqrdmulhq_n_s32(input_val_rescaled_2, input_multiplier);
+    input_val_rescaled_3 =
+        vqrdmulhq_n_s32(input_val_rescaled_3, input_multiplier);
+
+    // Invoke gemmlowp::tanh on FixedPoint wrapping int32x4_t
+    using FixedPoint4 = gemmlowp::FixedPoint<int32x4_t, 4>;
+    using FixedPoint0 = gemmlowp::FixedPoint<int32x4_t, 0>;
+    const FixedPoint4 input_val_f4_0 =
+        FixedPoint4::FromRaw(input_val_rescaled_0);
+    const FixedPoint4 input_val_f4_1 =
+        FixedPoint4::FromRaw(input_val_rescaled_1);
+    const FixedPoint4 input_val_f4_2 =
+        FixedPoint4::FromRaw(input_val_rescaled_2);
+    const FixedPoint4 input_val_f4_3 =
+        FixedPoint4::FromRaw(input_val_rescaled_3);
+    const FixedPoint0 output_val_f0_0 = gemmlowp::tanh(input_val_f4_0);
+    const FixedPoint0 output_val_f0_1 = gemmlowp::tanh(input_val_f4_1);
+    const FixedPoint0 output_val_f0_2 = gemmlowp::tanh(input_val_f4_2);
+    const FixedPoint0 output_val_f0_3 = gemmlowp::tanh(input_val_f4_3);
+
+    // Divide by 2^24 as in the scalar code
+    using gemmlowp::RoundingDivideByPOT;
+    int32x4_t output_val_s32_0 = RoundingDivideByPOT(output_val_f0_0.raw(), 24);
+    int32x4_t output_val_s32_1 = RoundingDivideByPOT(output_val_f0_1.raw(), 24);
+    int32x4_t output_val_s32_2 = RoundingDivideByPOT(output_val_f0_2.raw(), 24);
+    int32x4_t output_val_s32_3 = RoundingDivideByPOT(output_val_f0_3.raw(), 24);
+
+    // Add the output zero point
+    int32x4_t output_zero_point_s32 = vdupq_n_s32(output_zero_point);
+    output_val_s32_0 = vaddq_s32(output_val_s32_0, output_zero_point_s32);
+    output_val_s32_1 = vaddq_s32(output_val_s32_1, output_zero_point_s32);
+    output_val_s32_2 = vaddq_s32(output_val_s32_2, output_zero_point_s32);
+    output_val_s32_3 = vaddq_s32(output_val_s32_3, output_zero_point_s32);
+
+    // Cast output values to uint8, saturating
+    int16x8_t output_val_s16_0 = vcombine_s16(vqmovn_s32(output_val_s32_0),
+                                              vqmovn_s32(output_val_s32_1));
+    int16x8_t output_val_s16_1 = vcombine_s16(vqmovn_s32(output_val_s32_2),
+                                              vqmovn_s32(output_val_s32_3));
+    uint8x16_t output_val_u8 = vcombine_u8(vqmovun_s16(output_val_s16_0),
+                                           vqmovun_s16(output_val_s16_1));
+
+    // Perform the bit-masking with the bit masks computed at the beginning,
+    // see the comment there.
+    output_val_u8 = vorrq_u8(output_val_u8, mask_rightclamp);
+    output_val_u8 = vandq_u8(output_val_u8, mask_leftclamp);
+
+    // Store back to memory
+    vst1q_u8(output_data + c, output_val_u8);
+  }
+#endif
+  // Leftover loop: handle one value at a time with scalar code.
+  for (; c < size; ++c) {
+    const uint8 input_val_u8 = input_data[c];
+    const int32 input_val_centered =
+        static_cast<int32>(input_val_u8) - input_zero_point;
+    uint8 output_val;
+    if (input_val_centered < -input_range_radius) {
+      output_val = 0;
+    } else if (input_val_centered > input_range_radius) {
+      output_val = 255;
+    } else {
+      const int32 input_val_rescaled =
+          MultiplyByQuantizedMultiplierGreaterThanOne(
+              input_val_centered, input_multiplier, input_left_shift);
+      using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
+      using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+      const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
+      const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
+      using gemmlowp::RoundingDivideByPOT;
+      int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
+      output_val_s32 += output_zero_point;
+      if (output_val_s32 == 256) {
+        output_val_s32 = 255;
+      }
+      TFLITE_DCHECK_GE(output_val_s32, 0);
+      TFLITE_DCHECK_LE(output_val_s32, 255);
+      output_val = static_cast<uint8>(output_val_s32);
+    }
+    output_data[c] = output_val;
+  }
+}
 inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
                        int32 zero_point, double scale, float* output_data,
                        const Dims<4>& output_dims) {
@@ -3323,7 +3720,7 @@ inline void ResizeBilinearGeneric(const float* input_data,
 inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
                            const int32* output_size_data,
                            const Dims<4>& output_size_dims, float* output_data,
-                           const Dims<4>& output_dims) {
+                           const Dims<4>& output_dims, bool align_corners) {
   gemmlowp::ScopedProfilingLabel label("ResizeBilinear");
   int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   int32 input_height = ArraySize(input_dims, 2);
@@ -3338,13 +3735,20 @@ inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
   int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)];
 
   // Specialize for 2x2 upsample.
-  if (output_height == 2 * input_height && output_width == 2 * input_width) {
+  if (!align_corners && output_height == 2 * input_height &&
+      output_width == 2 * input_width) {
     ResizeBilinear2x2(input_data, input_dims, output_data, output_dims, batches,
                       input_height, input_width, depth, output_height,
                       output_width);
   } else {
     float height_scale = static_cast<float>(input_height) / output_height;
     float width_scale = static_cast<float>(input_width) / output_width;
+    if (align_corners && output_height > 1) {
+      height_scale = static_cast<float>(input_height - 1) / (output_height - 1);
+    }
+    if (align_corners && output_width > 1) {
+      width_scale = static_cast<float>(input_width - 1) / (output_width - 1);
+    }
 
     ResizeBilinearGeneric(input_data, input_dims, output_data, output_dims,
                           batches, input_height, input_width, depth,
@@ -3353,6 +3757,15 @@ inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
   }
 }
 
+// legacy, for compatibility with old checked-in code
+inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
+                           const int32* output_size_data,
+                           const Dims<4>& output_size_dims, float* output_data,
+                           const Dims<4>& output_dims) {
+  ResizeBilinear(input_data, input_dims, output_size_data, output_size_dims,
+                 output_data, output_dims, /*align_corners=*/false);
+}
+
 template <typename T>
 inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
                            const int32* block_shape_data,
@@ -3381,10 +3794,11 @@ inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
     for (int out_h = 0; out_h < output_height; ++out_h) {
       for (int out_w = 0; out_w < output_width; ++out_w) {
         T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_b);
-        if (out_h * block_shape_height < padding_top ||
-            out_h * block_shape_height >= padding_top + input_height ||
-            out_w * block_shape_width < padding_left ||
-            out_w * block_shape_width >= padding_left + input_width) {
+        if (out_h * block_shape_height + shift_h < padding_top ||
+            out_h * block_shape_height + shift_h >=
+                padding_top + input_height ||
+            out_w * block_shape_width + shift_w < padding_left ||
+            out_w * block_shape_width + shift_w >= padding_left + input_width) {
           memset(out, 0, depth * sizeof(T));
         } else {
           const T* in =
@@ -3704,6 +4118,43 @@ void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
   auto max_value = input2_data[0];
   output_map.array() = input1_map.array().max(max_value);
 }
+
+template <typename T1, typename T2, typename T3>
+void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
+            T2* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("ArgMax");
+
+  // The current ArgMax implemention can only determine the index of the maximum
+  // value in the last dimension. So the axis argument is ignored.
+  TFLITE_DCHECK_EQ(axis[0], 3);
+
+  // For ArgMax, the number of output dimensions = (number of input dimensions -
+  // 1). For the sake of simplicity, the output dimensions are equal to the
+  // input dimensions here. We enforce the constraint that the last dimension
+  // must always be 1.
+  TFLITE_DCHECK_EQ(ArraySize(output_dims, 0), 1);
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = ArraySize(input_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        auto max_value = input_data[Offset(input_dims, 0, x, y, b)];
+        int max_index = 0;
+        for (int d = 1; d < depth; ++d) {
+          const auto& curr_value = input_data[Offset(input_dims, d, x, y, b)];
+          if (curr_value > max_value) {
+            max_value = curr_value;
+            max_index = d;
+          }
+        }
+        output_data[Offset(output_dims, 0, x, y, b)] = max_index;
+      }
+    }
+  }
+}
+
 }  // namespace optimized_ops
 }  // namespace tflite
 
@@ -3712,4 +4163,4 @@ void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
 #pragma GCC diagnostic pop
 #endif
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h b/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
index f8be99e82fb8721ced7a3e5da686b20ce241ea2d..4e324a5e107cf5a90c0042331899edab831c8e51 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
@@ -15,7 +15,7 @@ limitations under the License.
 #ifndef TF_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
 #define TF_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
 
-// TDOD(ghodrat): Remove this header file and the dependency to internal data
+// TODO(ghodrat): Remove this header file and the dependency to internal data
 // structure.
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
index 98f2e365c5249a6c28673fc185ebec34cc2105b2..18be6777a5caeb45a4ffabd8b7f1793de7b053f8 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
@@ -22,27 +22,20 @@ limitations under the License.
 
 namespace tflite {
 
-void QuantizeMultiplierSmallerThanOne(double double_multiplier,
-                                      int32_t* quantized_multiplier,
-                                      int* right_shift) {
-  TFLITE_CHECK(double_multiplier >= 0.);
-  TFLITE_CHECK(double_multiplier < 1.);
+void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
+                        int* shift) {
   if (double_multiplier == 0.) {
     *quantized_multiplier = 0;
-    *right_shift = 0;
+    *shift = 0;
     return;
   }
-  TFLITE_CHECK(double_multiplier > 0.);
-  const double q = std::frexp(double_multiplier, right_shift);
-  *right_shift *= -1;
-
+  const double q = std::frexp(double_multiplier, shift);
   auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1ll << 31)));
   TFLITE_CHECK(q_fixed <= (1ll << 31));
   if (q_fixed == (1ll << 31)) {
     q_fixed /= 2;
-    --*right_shift;
+    ++*shift;
   }
-  TFLITE_CHECK_GE(*right_shift, 0);
   TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
   *quantized_multiplier = static_cast<int32_t>(q_fixed);
 }
@@ -50,17 +43,20 @@ void QuantizeMultiplierSmallerThanOne(double double_multiplier,
 void QuantizeMultiplierGreaterThanOne(double double_multiplier,
                                       int32_t* quantized_multiplier,
                                       int* left_shift) {
-  TFLITE_CHECK(double_multiplier > 1.);
-  const double q = std::frexp(double_multiplier, left_shift);
-  auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1ll << 31)));
-  TFLITE_CHECK(q_fixed <= (1ll << 31));
-  if (q_fixed == (1ll << 31)) {
-    q_fixed /= 2;
-    ++*left_shift;
-  }
+  TFLITE_CHECK_GT(double_multiplier, 1.);
+  QuantizeMultiplier(double_multiplier, quantized_multiplier, left_shift);
   TFLITE_CHECK_GE(*left_shift, 0);
-  TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
-  *quantized_multiplier = static_cast<int32_t>(q_fixed);
+}
+
+void QuantizeMultiplierSmallerThanOne(double double_multiplier,
+                                      int32_t* quantized_multiplier,
+                                      int* right_shift) {
+  TFLITE_CHECK_LT(double_multiplier, 1.);
+  TFLITE_CHECK_GT(double_multiplier, 0.);
+  int shift;
+  QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
+  TFLITE_CHECK_LE(shift, 0);
+  *right_shift = -shift;
 }
 
 void PreprocessSoftmaxScaling(double beta, double input_scale,
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.h b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
index efb7191c8deb2a23ea5473ab131d2b6537202765..ba06bc0975b6847b24592daa60efe99983d03707 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util.h
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
@@ -20,7 +20,8 @@ limitations under the License.
 namespace tflite {
 
 // Decompose a double multiplier into a Q0.31 int32 representation of its
-// significand, and shift representation of its exponent.
+// significand, and shift representation of NEGATIVE its exponent ---
+// this is intended as a RIGHT-shift.
 //
 // Restricted to the case where the multiplier < 1 (and non-negative).
 void QuantizeMultiplierSmallerThanOne(double double_multiplier,
@@ -35,6 +36,16 @@ void QuantizeMultiplierGreaterThanOne(double double_multiplier,
                                       int32_t* quantized_multiplier,
                                       int* left_shift);
 
+// Decompose a double multiplier into a Q0.31 int32 representation of its
+// significand, and shift representation of its exponent.
+//
+// Handles an arbitrary positive multiplier. The 'shift' output-value is
+// basically the 'floating-point exponent' of the multiplier:
+// Negative for a right-shift (when the multiplier is <1), positive for a
+// left-shift (when the multiplier is >1)
+void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
+                        int* shift);
+
 // This first creates a multiplier in a double equivalent of
 // Q(input_integer_bits).(31-input_integer_bits) representation, with extra
 // precision in the double's fractional bits.  It then splits the result into
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
index d6f306e2cbae3c780b3d773638ba46cd2abf02f5..19b1b408ec74b0939065b0ad10b91ecfc2cd4765 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
@@ -31,7 +31,7 @@ TEST(QuantizationUtilTest, QuantizeMultiplierSmallerThanOne) {
   };
 
   EXPECT_DEATH(quantize(-0.1), "");
-  EXPECT_THAT(quantize(0.0), Pair(0, 0));
+  EXPECT_DEATH(quantize(0.0), "");
   EXPECT_THAT(quantize(0.25), Pair(1073741824, 1));
 
   // Around 0.5 we can see the change in exponent and how we try hard to
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h
index 8e0f234545e43dd8b2412e065aaecad8325a1182..9aabee5000c29ed97fcf7e874d661e72fd768f84 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
 
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
 #include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
@@ -112,4 +112,4 @@ void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
 }  // end namespace reference_ops
 }  // end namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
index 8a80558b32f2858778460956cd9f57617674e21e..e9b6baeaee87d22aef238410bc9f447509a81c47 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
 
 #include <algorithm>
 
@@ -135,4 +135,4 @@ void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
 }  // end namespace reference_ops
 }  // end namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
index c2ab78000b81485f037c507933cd024e70f39850..c05c21b472b05f2cbe133adf94d91ab0c6d9ef40 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
 
-// TDOD(ghodrat): Remove this header file and the dependency to internal data
+// TODO(ghodrat): Remove this header file and the dependency to internal data
 // structure.
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 
 namespace tflite {
 namespace tensor_utils {
 
-// Limit a float input f betweeen +abs_limit and -abs_limit.
+// Limit a float input f between +abs_limit and -abs_limit.
 float PortableClip(float f, float abs_limit);
 
 // Multiply a matrix by a batch vector, and store results in a batch-size
@@ -186,4 +186,4 @@ void ReductionSumVector(const float* input_vector, float* output_vector,
 }  // namespace tensor_utils
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index b9ca3d5c626dff4ea8ba52949e8fea8e9b43689f..2e0376656ac286585ce967c37cbbeb66a7e29172 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
 
 #include <stdint.h>
 #include <sys/types.h>
@@ -889,10 +889,11 @@ inline void Add(int left_shift, const uint8* input1_data,
 // dimensionality if the runtime code does a single loop over one dimension
 // that handles broadcasting as the base case. The code generator would then
 // generate max(D1, D2) nested for loops.
-template <FusedActivationFunctionType Ac>
-void BroadcastAdd(const float* input1_data, const Dims<4>& input1_dims,
-                  const float* input2_data, const Dims<4>& input2_dims,
-                  float* output_data, const Dims<4>& output_dims) {
+template <typename T>
+void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("BroadcastAdd");
 
   NdArrayDesc<4> desc1;
@@ -914,15 +915,30 @@ void BroadcastAdd(const float* input1_data, const Dims<4>& input1_dims,
     for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
       for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
         for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
-              input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
         }
       }
     }
   }
 }
 
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T* output_data, const Dims<4>& output_dims) {
+  T output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims,
+               output_activation_min, output_activation_max, output_data,
+               output_dims);
+}
+
 inline void BroadcastAdd(int left_shift, const uint8* input1_data,
                          const Dims<4>& input1_dims, int32 input1_offset,
                          int32 input1_multiplier, int input1_shift,
@@ -1053,10 +1069,11 @@ void Mul(const float* input1_data, const Dims<4>& input1_dims,
 // dimensionality if the runtime code does a single loop over one dimension
 // that handles broadcasting as the base case. The code generator would then
 // generate max(D1, D2) nested for loops.
-template <FusedActivationFunctionType Ac>
-void BroadcastMul(const float* input1_data, const Dims<4>& input1_dims,
-                  const float* input2_data, const Dims<4>& input2_dims,
-                  float* output_data, const Dims<4>& output_dims) {
+template <typename T>
+void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("BroadcastMul");
 
   NdArrayDesc<4> desc1;
@@ -1078,15 +1095,30 @@ void BroadcastMul(const float* input1_data, const Dims<4>& input1_dims,
     for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
       for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
         for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
-              input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
         }
       }
     }
   }
 }
 
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T* output_data, const Dims<4>& output_dims) {
+  T output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  BroadcastMul(input1_data, input1_dims, input2_data, input2_dims,
+               output_activation_min, output_activation_max, output_data,
+               output_dims);
+}
+
 inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                          int32 input1_offset, const uint8* input2_data,
                          const Dims<4>& input2_dims, int32 input2_offset,
@@ -1149,6 +1181,60 @@ inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                output_data, output_dims);
 }
 
+inline void Div(const float* input1_data, const Dims<4>& input1_dims,
+                const float* input2_data, const Dims<4>& input2_dims,
+                float output_activation_min, float output_activation_max,
+                float* output_data, const Dims<4>& output_dims) {
+  const int batches =
+      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
+  const int height =
+      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
+  const int width =
+      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
+  const int depth =
+      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[Offset(input1_dims, c, x, y, b)] /
+                      input2_data[Offset(input2_dims, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
+                const float* input2_data, const Dims<4>& input2_dims,
+                float output_activation_min, float output_activation_max,
+                float* output_data, const Dims<4>& output_dims) {
+  const int batches =
+      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
+  const int height =
+      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
+  const int width =
+      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
+  const int depth =
+      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[Offset(input1_dims, c, x, y, b)] -
+                      input2_data[Offset(input2_dims, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
 template <FusedActivationFunctionType Ac, typename Scalar>
 void Concatenation(int concat_dim, const Scalar* const* input_data,
                    const Dims<4>* const* input_dims, int inputs_count,
@@ -1272,6 +1358,238 @@ inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
   }
 }
 
+// Quantized LSTM cell implementation.
+// The quantization of the input, output arrays is as follows:
+//  - The input activations are quantized as uint8 on the interval
+//    [-1, 127/128].
+//    The rationale for that is that that is the natural interval for output
+//    activations (see next point) and these need to be concatenated together.
+//    We could accommodate different ranges by re-scaling, but we empirically
+//    found that setting the input activations range to be [-1, 127/128] in the
+//    first place, removing the need for re-scaling, greatly improves accuracy.
+//  - The output activations are quantized as uint8 on the interval
+//    [-1, 127/128].
+//    The rationale for that is that the definition of a LSTM cell makes them
+//    intrinsically constrained in [-1, 1]; tweaking that to [-1, 127/128]
+//    makes for simpler, more accurate fixed-point arithmetic.
+//  - The output-at-previous-timestep state array is obviously quantized as
+//    the output activations.
+//  - The internal LSTM memory (not the output-at-previous-timestep, the other
+//    internal state array) is int16-quantized and may use any power-of-two,
+//    symmetric range i.e. [-2^N, 2^N * 32767/32768] for any N, which we call
+//    StateIntegerBits below, see the below discussion of that template
+//    parameter ("The StateIntegerBits template parameter").
+//  - The output of the internal fully-connected node is int16-quantized
+//    on the interval [-8, 8 * 32767/32768], the rationale for which is
+//    explained just below ("Why [-8, 8] for fully-connected output?").
+//
+//
+// === The StateIntegerBits template parameter ===
+//
+// The StateIntegerBits template parameter controls the fixed-point format used
+// to represent the internal memory of the LSTM cell (not the
+// output-at-previous-timestep, the other internal state array). It's currently
+// a template parameter so that the model can control that. The most typical
+// value for StateIntegerBits is 4. Other plausible values are anywhere between
+// 3 and 5. We might eventually standardize on a single supported value, e.g. 4,
+// and drop that template parameter. The reason why it can't be a runtime
+// parameter is that this controls the fixed-point format used, i.e. we need to
+// generate actually different code based on it. In particular, we generate code
+// for a fixed-point tanh() implementation for that format, which internally
+// uses a fixed-point exp() implementation, which internally uses a
+// barrel-shifter with a number of steps that depends on StateIntegerBits.
+// Another consequence of that is that a higher value of StateIntegerBits
+// results in a more expensive implementation (more barrel shifter steps
+// needed).
+//
+//
+// === Why [-8, 8] for fully-connected output? ===
+//
+// This array is only fed to Logistic and Tanh functions, for which
+// the quantized implementation will want to use fixed-point arithmetic,
+// requiring a power-of-two representation interval. Thus, we should right
+// away quantize this array to a power-of-two interval; otherwise,
+// implementation will need to rescale that, losing any benefit that a tighter
+// representation interval might otherwise yield, while introducting some
+// numerical error and computational overhead.
+//
+// Now, Logistic and Tanh
+// are nearly constant (nearly equal to their horizontal asymptotes)
+// outside of a small bounded interval around 0:
+//
+//   Logistic(4) = 1 - 1.8e-2     Tanh(4) = 1 - 6.7e-4
+//   Logistic(8) = 1 - 3.4e-4     Tanh(8) = 1 - 2.3e-7
+//   Logistic(16) = 1 - 1.1e-7    Tanh(16) = 1 - 2.5e-14
+//
+// From this, we see that clamping to [-4, 4] would be too inaccurate
+// (the error of 1.8e-2 on Logistic would be felt even in 8bit precision)
+// while clamping to [-16, 16] would make no difference even in float32.
+// However, for a fixed-point implementation in 16-bit integers, using 5
+// integer bits to represent the [-16, 16] range would leave only 11
+// fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive
+// representable values. Notice that that is higher than the
+// worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic.
+// Using [-8, 8] thus seems like the better compromise overall, enjoying
+// an increment of 2.4e-4 between representable values and a worst-case
+// clamping error of 3.4e-4, both better than the increment of 4.9e-4 with
+// [-16, 16].
+//
+// Moreover, all other things being equal, it is nice to choose the narrower
+// representation range, as that makes the implementation of fixed-point
+// math functions a little cheaper (each integer bit requires an additional
+// barrel-shifter atep in the implementation of exp(-x)). That is further
+// reason to prefer [-8, 8] over [-16, 16]. The choice of [-16, 16] would make
+// sense for 32-bit float or 32-bit fixed-point quantization, but we are
+// aiming for 16-bit fixed-point quantization of these internal nodes here.
+//
+template <int StateIntegerBits>
+void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
+              const uint8* prev_activ_data_uint8,
+              const Dims<4>& prev_activ_dims, const uint8* weights_data_uint8,
+              const Dims<4>& weights_dims, const int32* bias_data_int32,
+              const Dims<4>& bias_dims, const int16* prev_state_data_int16,
+              const Dims<4>& prev_state_dims, int16* output_state_data_int16,
+              const Dims<4>& output_state_dims, uint8* output_activ_data_uint8,
+              const Dims<4>& output_activ_dims, uint8* concat_temp_data_uint8,
+              const Dims<4>& concat_temp_dims, int16* activ_temp_data_int16,
+              const Dims<4>& activ_temp_dims, int32 weights_zero_point,
+              int32 accum_multiplier, int accum_shift) {
+  // Gather dimensions information, and perform consistency checks.
+  const int batches =
+      MatchingArraySize(input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3,
+                        output_state_dims, 3, output_activ_dims, 3);
+  const int height =
+      MatchingArraySize(input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2,
+                        output_state_dims, 2, output_activ_dims, 2);
+  const int width =
+      MatchingArraySize(input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1,
+                        output_state_dims, 1, output_activ_dims, 1);
+  TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
+  TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
+  const int input_depth = ArraySize(input_dims, 0);
+  const int prev_activ_depth = ArraySize(prev_activ_dims, 0);
+  const int total_input_depth = prev_activ_depth + input_depth;
+  TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth);
+  TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3),
+                  1);
+  const int intern_activ_depth =
+      MatchingArraySize(weights_dims, 1, bias_dims, 0);
+  TFLITE_CHECK_EQ(intern_activ_depth % 4, 0);
+  const int output_depth =
+      MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
+                        output_state_dims, 0, output_activ_dims, 0);
+  TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);
+  const int fc_batches = ArraySize(activ_temp_dims, 1) *
+                         ArraySize(activ_temp_dims, 2) *
+                         ArraySize(activ_temp_dims, 3);
+  const int fc_output_depth =
+      MatchingArraySize(weights_dims, 1, activ_temp_dims, 0);
+  const int fc_accum_depth = ArraySize(weights_dims, 0);
+  TFLITE_CHECK_EQ(fc_output_depth, 4 * output_depth);
+
+  // Depth-concatenate prev_activ and input data together.
+  uint8 const* concat_input_arrays_data[2] = {input_data_uint8,
+                                              prev_activ_data_uint8};
+  Dims<4> const* concat_input_arrays_dims[2] = {&input_dims, &prev_activ_dims};
+  Concatenation<FusedActivationFunctionType::kNone, uint8>(
+      0, concat_input_arrays_data, concat_input_arrays_dims, 2,
+      concat_temp_data_uint8, concat_temp_dims);
+
+  // Implementation of the fully connected node inside the LSTM cell.
+  // The operands are 8-bit integers, the accumulators are internally 32bit
+  // integers, and the output is 16-bit fixed-point with 3 integer bits so
+  // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that
+  // is explained in the function comment above.
+  for (int b = 0; b < fc_batches; ++b) {
+    for (int out_c = 0; out_c < fc_output_depth; ++out_c) {
+      // Internal accumulation.
+      // Initialize accumulator with the bias-value.
+      int32 accum = bias_data_int32[out_c];
+      // Accumulation loop.
+      for (int d = 0; d < fc_accum_depth; ++d) {
+        int16 input_val = concat_temp_data_uint8[b * fc_accum_depth + d] - 128;
+        int16 weights_val =
+            weights_data_uint8[out_c * fc_accum_depth + d] - weights_zero_point;
+        accum += input_val * weights_val;
+      }
+      // Down-scale the final int32 accumulator to the scale used by our
+      // (16-bit, using 3 integer bits) fixed-point format. The quantized
+      // multiplier and shift here have been pre-computed offline
+      // (e.g. by toco).
+      accum =
+          MultiplyByQuantizedMultiplier(accum, accum_multiplier, accum_shift);
+      // Saturate, cast to int16, and store to the temporary activations array.
+      accum = std::max(-32768, std::min(32767, accum));
+      activ_temp_data_int16[out_c + fc_output_depth * b] = accum;
+    }
+  }
+
+  // Rest of the LSTM cell: tanh and logistic math functions, and some adds
+  // and muls, all done in 16-bit fixed-point.
+  const int outer_size = batches * width * height;
+  for (int b = 0; b < outer_size; ++b) {
+    for (int c = 0; c < output_depth; ++c) {
+      // Define the fixed-point data types that we will use here. All use
+      // int16 as the underlying integer type i.e. all are 16-bit fixed-point.
+      // They only differ by the number of integral vs. fractional bits,
+      // determining the range of values that they can represent.
+      //
+      // F0 uses 0 integer bits, range [-1, 1].
+      // This is the return type of math functions such as tanh, logistic,
+      // whose range is in [-1, 1].
+      using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+      // F3 uses 3 integer bits, range [-8, 8].
+      // This is the range of the previous fully-connected node's output,
+      // which is our input here.
+      using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+      // FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits,
+      // 2^StateIntegerBits]. It's used to represent the internal state, whose
+      // number of integer bits is currently dictated by the model. See comment
+      // on the StateIntegerBits template parameter above.
+      using FS = gemmlowp::FixedPoint<std::int16_t, StateIntegerBits>;
+      // Implementation of input gate, using fixed-point logistic function.
+      F3 input_gate_input = F3::FromRaw(
+          activ_temp_data_int16[b * fc_output_depth + 0 * output_depth + c]);
+      F0 input_gate_output = gemmlowp::logistic(input_gate_input);
+      // Implementation of input modulation gate, using fixed-point tanh
+      // function.
+      F3 input_modulation_gate_input = F3::FromRaw(
+          activ_temp_data_int16[b * fc_output_depth + 1 * output_depth + c]);
+      F0 input_modulation_gate_output =
+          gemmlowp::tanh(input_modulation_gate_input);
+      // Implementation of forget gate, using fixed-point logistic function.
+      F3 forget_gate_input = F3::FromRaw(
+          activ_temp_data_int16[b * fc_output_depth + 2 * output_depth + c]);
+      F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
+      // Implementation of output gate, using fixed-point logistic function.
+      F3 output_gate_input = F3::FromRaw(
+          activ_temp_data_int16[b * fc_output_depth + 3 * output_depth + c]);
+      F0 output_gate_output = gemmlowp::logistic(output_gate_input);
+      // Implementation of internal multiplication nodes, still in fixed-point.
+      F0 input_times_input_modulation =
+          input_gate_output * input_modulation_gate_output;
+      FS prev_state = FS::FromRaw(prev_state_data_int16[b * output_depth + c]);
+      FS prev_state_times_forget_state = forget_gate_output * prev_state;
+      // Implementation of internal addition node, saturating.
+      FS new_state = gemmlowp::SaturatingAdd(
+          gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
+          prev_state_times_forget_state);
+      // Implementation of last internal tanh node, still in fixed-point.
+      F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state);
+      // Store the new internal state back to memory, as 16-bit integers.
+      output_state_data_int16[b * output_depth + c] = new_state.raw();
+      // Down-scale the output activations to 8-bit integers, saturating,
+      // and store back to memory.
+      int16 rescaled_output_activ =
+          gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8);
+      int16 clamped_output_activ =
+          std::max<int16>(-128, std::min<int16>(127, rescaled_output_activ));
+      output_activ_data_uint8[b * output_depth + c] =
+          128 + clamped_output_activ;
+    }
+  }
+}
+
 template <FusedActivationFunctionType Ac, typename Scalar>
 void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims,
                      int outputs_count, Scalar* const* output_data,
@@ -1957,6 +2275,54 @@ inline void Tanh(const float* input_data, const Dims<4>& input_dims,
   }
 }
 
+inline void Tanh(const uint8* input_data, const Dims<4>& input_dims,
+                 int32 input_zero_point, int32 input_range_radius,
+                 int32 input_multiplier, int input_left_shift,
+                 uint8* output_data, const Dims<4>& output_dims) {
+  const int32 output_zero_point = 128;
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)];
+          const int32 input_val_centered =
+              static_cast<int32>(input_val_u8) - input_zero_point;
+          uint8 output_val;
+          if (input_val_centered <= -input_range_radius) {
+            output_val = 0;
+          } else if (input_val_centered >= input_range_radius) {
+            output_val = 255;
+          } else {
+            const int32 input_val_rescaled =
+                MultiplyByQuantizedMultiplierGreaterThanOne(
+                    input_val_centered, input_multiplier, input_left_shift);
+            using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
+            using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+            const FixedPoint4 input_val_f4 =
+                FixedPoint4::FromRaw(input_val_rescaled);
+            const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
+
+            using gemmlowp::RoundingDivideByPOT;
+            int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
+            output_val_s32 += output_zero_point;
+            if (output_val_s32 == 256) {
+              output_val_s32 = 255;
+            }
+            TFLITE_DCHECK_GE(output_val_s32, 0);
+            TFLITE_DCHECK_LE(output_val_s32, 255);
+            output_val = static_cast<uint8>(output_val_s32);
+          }
+          output_data[Offset(output_dims, c, x, y, b)] = output_val;
+        }
+      }
+    }
+  }
+}
+
 inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
                        int32 zero_point, double scale, float* output_data,
                        const Dims<4>& output_dims) {
@@ -2116,7 +2482,7 @@ inline void Gather(const T* input_data, const Dims<4>& input_dims,
 inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
                            const int32* output_size_data,
                            const Dims<4>& output_size_dims, float* output_data,
-                           const Dims<4>& output_dims) {
+                           const Dims<4>& output_dims, bool align_corners) {
   int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   int32 input_height = ArraySize(input_dims, 2);
   int32 input_width = ArraySize(input_dims, 1);
@@ -2130,6 +2496,12 @@ inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
   int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)];
   float height_scale = static_cast<float>(input_height) / output_height;
   float width_scale = static_cast<float>(input_width) / output_width;
+  if (align_corners && output_height > 1) {
+    height_scale = static_cast<float>(input_height - 1) / (output_height - 1);
+  }
+  if (align_corners && output_width > 1) {
+    width_scale = static_cast<float>(input_width - 1) / (output_width - 1);
+  }
 
   for (int b = 0; b < batches; ++b) {
     for (int y = 0; y < output_height; ++y) {
@@ -2157,6 +2529,15 @@ inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
   }
 }
 
+// legacy, for compatibility with old checked-in code
+inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
+                           const int32* output_size_data,
+                           const Dims<4>& output_size_dims, float* output_data,
+                           const Dims<4>& output_dims) {
+  ResizeBilinear(input_data, input_dims, output_size_data, output_size_dims,
+                 output_data, output_dims, /*align_corners=*/false);
+}
+
 template <typename T>
 inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
                            const int32* block_shape_data,
@@ -2183,10 +2564,11 @@ inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
     for (int out_h = 0; out_h < output_height; ++out_h) {
       for (int out_w = 0; out_w < output_width; ++out_w) {
         T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_b);
-        if (out_h * block_shape_height < padding_top ||
-            out_h * block_shape_height >= padding_top + input_height ||
-            out_w * block_shape_width < padding_left ||
-            out_w * block_shape_width >= padding_left + input_width) {
+        if (out_h * block_shape_height + shift_h < padding_top ||
+            out_h * block_shape_height + shift_h >=
+                padding_top + input_height ||
+            out_w * block_shape_width + shift_w < padding_left ||
+            out_w * block_shape_width + shift_w >= padding_left + input_width) {
           memset(out, 0, depth * sizeof(T));
         } else {
           const T* in =
@@ -2275,27 +2657,60 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+inline bool LoopCondition(int index, int stop, int stride) {
+  return stride > 0 ? index < stop : index > stop;
+}
+
+inline int StartIndex(int start, int stride, int dim, bool masked) {
+  return masked ? (stride > 0 ? 0 : dim - 1) : start;
+}
+
+inline int StopIndex(int start, int stop, int stride, int dim, bool masked,
+                     bool shrink_axis_masked) {
+  return shrink_axis_masked ? stride > 0 ? start + 1 : start - 1
+                            : masked ? (stride > 0 ? dim : -1) : stop;
+}
+
 template <typename T>
 inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
-                         int begin_mask, int end_mask,
+                         int begin_mask, int end_mask, int shrink_axis_mask,
                          const std::vector<int>& starts,
                          const std::vector<int>& stops,
                          const std::vector<int>& strides, T* output_data,
                          const Dims<4>& output_dims) {
-  const int start_b = (begin_mask & 8) ? 0 : starts[3];
-  const int stop_b = (end_mask & 8) ? input_dims.sizes[3] : stops[3];
-  const int start_h = (begin_mask & 4) ? 0 : starts[2];
-  const int stop_h = (end_mask & 4) ? input_dims.sizes[2] : stops[2];
-  const int start_w = (begin_mask & 2) ? 0 : starts[1];
-  const int stop_w = (end_mask & 2) ? input_dims.sizes[1] : stops[1];
-  const int start_d = (begin_mask & 1) ? 0 : starts[0];
-  const int stop_d = (end_mask & 1) ? input_dims.sizes[0] : stops[0];
+  TFLITE_DCHECK_EQ(starts.size(), 4);
+  TFLITE_DCHECK_EQ(stops.size(), 4);
+  TFLITE_DCHECK_EQ(strides.size(), 4);
+  const int start_b =
+      StartIndex(starts[3], strides[3], input_dims.sizes[3], begin_mask & 8);
+  const int stop_b =
+      StopIndex(start_b, stops[3], strides[3], input_dims.sizes[3],
+                end_mask & 8, shrink_axis_mask & 8);
+  const int start_h =
+      StartIndex(starts[2], strides[2], input_dims.sizes[2], begin_mask & 4);
+  const int stop_h =
+      StopIndex(start_h, stops[2], strides[2], input_dims.sizes[2],
+                end_mask & 4, shrink_axis_mask & 4);
+  const int start_w =
+      StartIndex(starts[1], strides[1], input_dims.sizes[1], begin_mask & 2);
+  const int stop_w =
+      StopIndex(start_w, stops[1], strides[1], input_dims.sizes[1],
+                end_mask & 2, shrink_axis_mask & 2);
+  const int start_d =
+      StartIndex(starts[0], strides[0], input_dims.sizes[0], begin_mask & 1);
+  const int stop_d =
+      StopIndex(start_d, stops[0], strides[0], input_dims.sizes[0],
+                end_mask & 1, shrink_axis_mask & 1);
 
   T* out_ptr = output_data;
-  for (int in_b = start_b; in_b < stop_b; in_b += strides[3]) {
-    for (int in_h = start_h; in_h < stop_h; in_h += strides[2]) {
-      for (int in_w = start_w; in_w < stop_w; in_w += strides[1]) {
-        for (int in_d = start_d; in_d < stop_d; in_d += strides[0]) {
+  for (int in_b = start_b; LoopCondition(in_b, stop_b, strides[3]);
+       in_b += strides[3]) {
+    for (int in_h = start_h; LoopCondition(in_h, stop_h, strides[2]);
+         in_h += strides[2]) {
+      for (int in_w = start_w; LoopCondition(in_w, stop_w, strides[1]);
+           in_w += strides[1]) {
+        for (int in_d = start_d; LoopCondition(in_d, stop_d, strides[0]);
+             in_d += strides[0]) {
           *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
         }
       }
@@ -2303,6 +2718,18 @@ inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+template <typename T>
+inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
+                         int begin_mask, int end_mask,
+                         const std::vector<int>& starts,
+                         const std::vector<int>& stops,
+                         const std::vector<int>& strides, T* output_data,
+                         const Dims<4>& output_dims) {
+  StridedSlice(input_data, input_dims, begin_mask, end_mask,
+               /*shrink_axis_mask=*/0, starts, stops, strides, output_data,
+               output_dims);
+}
+
 template <typename T>
 inline void Slice(const T* input_data, const Dims<4>& input_dims,
                   const std::vector<int>& begin, const std::vector<int>& size,
@@ -2335,6 +2762,72 @@ inline void Slice(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+template <typename T>
+inline void Exp(const T* input_data, const size_t num_elements,
+                T* output_data) {
+  for (size_t idx = 0; idx < num_elements; ++idx) {
+    output_data[idx] = exp(input_data[idx]);
+  }
+}
+
+template <typename T>
+inline void Mean(T* input_data, const int* input_dims, const int input_num_dims,
+                 T* output_data, const int* output_dims,
+                 const int output_num_dims, const int* axis,
+                 const int num_axis_dimensions, bool keep_dims, int* temp_index,
+                 int* resolved_axis) {
+  // resets output data.
+  size_t num_outputs = 1;
+  for (int idx = 0; idx < output_num_dims; ++idx) {
+    num_outputs *= static_cast<size_t>(output_dims[idx]);
+  }
+  for (size_t idx = 0; idx < num_outputs; ++idx) {
+    output_data[idx] = 0;
+  }
+  // resets temp index.
+  for (int idx = 0; idx < input_num_dims; ++idx) {
+    temp_index[idx] = 0;
+  }
+  // resolves axis.
+  int num_resolved_axis = 0;
+  for (int idx = 0; idx < num_axis_dimensions; ++idx) {
+    int current = axis[idx];
+    TFLITE_DCHECK(current < input_num_dims && current + input_num_dims >= 0);
+    if (current < 0) {
+      current += input_num_dims;
+    }
+    bool is_dup = false;
+    for (int j = 0; j < num_resolved_axis; ++j) {
+      if (resolved_axis[j] == current) {
+        is_dup = true;
+        break;
+      }
+    }
+    if (!is_dup) {
+      resolved_axis[num_resolved_axis++] = current;
+    }
+  }
+  // iterates through input_data.
+  for (bool has_next = true; has_next;
+       has_next = NextIndex(input_num_dims, input_dims, temp_index)) {
+    size_t input_offset =
+        ReducedOutputOffset(input_num_dims, input_dims, temp_index, 0, nullptr);
+    size_t output_offset =
+        ReducedOutputOffset(input_num_dims, input_dims, temp_index,
+                            num_resolved_axis, resolved_axis);
+    output_data[output_offset] += input_data[input_offset];
+  }
+  // takes average by num of elements added to get mean.
+  size_t num_elements_in_axis = 1;
+  for (int idx = 0; idx < num_resolved_axis; ++idx) {
+    num_elements_in_axis *= static_cast<size_t>(input_dims[resolved_axis[idx]]);
+  }
+  for (size_t idx = 0; idx < num_outputs; ++idx) {
+    output_data[idx] = static_cast<T>(static_cast<float>(output_data[idx]) /
+                                      num_elements_in_axis);
+  }
+}
+
 template <typename T>
 inline void Mean(const T* input_data, const Dims<4>& input_dims,
                  const std::vector<int>& reduction_indices, T* output_data,
@@ -2449,7 +2942,70 @@ void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
   }
 }
 
+template <typename T1, typename T2, typename T3>
+void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
+            T2* output_data, const Dims<4>& output_dims) {
+  // The current ArgMax implemention can only determine the index of the maximum
+  // value in the last dimension. So the axis argument is ignored.
+  TFLITE_DCHECK_EQ(axis[0], 3);
+
+  // For ArgMax, the number of output dimensions = (number of input dimensions -
+  // 1). For the sake of simplicity, the output dimensions are equal to the
+  // input dimensions here. We enforce the constraint that the last dimension
+  // must always be 1.
+  TFLITE_DCHECK_EQ(ArraySize(output_dims, 0), 1);
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = ArraySize(input_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        auto max_value = input_data[Offset(input_dims, 0, x, y, b)];
+        int max_index = 0;
+        for (int d = 1; d < depth; ++d) {
+          const auto& curr_value = input_data[Offset(input_dims, d, x, y, b)];
+          if (curr_value > max_value) {
+            max_value = curr_value;
+            max_index = d;
+          }
+        }
+        output_data[Offset(output_dims, 0, x, y, b)] = max_index;
+      }
+    }
+  }
+}
+
+template <typename T>
+void Transpose(const T* input, const Dims<4>& input_dims, T* output,
+               const Dims<4>& output_dims, int* permuted_axes) {
+  int out_sizes[4];
+  // Compute the inverse permutation array so we can do an output centered
+  // transpose. Also, check to make sure output_dims is matching input_dims.
+  for (int k = 0; k < 4; k++) {
+    out_sizes[k] =
+        MatchingArraySize(input_dims, permuted_axes[k], output_dims, k);
+  }
+
+  // Naive transpose loop (iterate on output index and compute input index).
+  int o[4];  // loop index (on output).
+  int i[4];
+  for (o[3] = 0; o[3] < out_sizes[3]; o[3]++) {
+    i[permuted_axes[3]] = o[3];
+    for (o[2] = 0; o[2] < out_sizes[2]; o[2]++) {
+      i[permuted_axes[2]] = o[2];
+      for (o[1] = 0; o[1] < out_sizes[1]; o[1]++) {
+        i[permuted_axes[1]] = o[1];
+        for (o[0] = 0; o[0] < out_sizes[0]; o[0]++) {
+          i[permuted_axes[0]] = o[0];
+          output[Offset(output_dims, o)] = input[Offset(input_dims, i)];
+        }
+      }
+    }
+  }
+}
+
 }  // namespace reference_ops
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/round.h b/tensorflow/contrib/lite/kernels/internal/round.h
index 38525b0e208b852343849096ac68cbfc9ef3e389..f299d0bd8733dc603c4950091c8ac3d7890548a7 100644
--- a/tensorflow/contrib/lite/kernels/internal/round.h
+++ b/tensorflow/contrib/lite/kernels/internal/round.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_ROUND_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_ROUND_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_ROUND_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_ROUND_H_
 
 #include <cmath>
 
@@ -36,4 +36,4 @@ inline T TfLiteRound(const T x) {
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_ROUND_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_ROUND_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor.h b/tensorflow/contrib/lite/kernels/internal/tensor.h
index ee4111e0416560d94d513c528971bdf3bf819662..62e38e0d4c3e023d0ed2242fc9438b096b86dc59 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor.h
+++ b/tensorflow/contrib/lite/kernels/internal/tensor.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
 
 #include <vector>
 #include "tensorflow/contrib/lite/context.h"
@@ -41,8 +41,7 @@ inline int32_t* GetTensorData(TfLiteTensor* tensor) {
 
 template <>
 inline int64_t* GetTensorData(TfLiteTensor* tensor) {
-  return tensor != nullptr ? reinterpret_cast<int64_t*>(tensor->data.raw)
-                           : nullptr;
+  return tensor != nullptr ? tensor->data.i64 : nullptr;
 }
 
 inline int RemapDim(int max_dimensions, int d) {
@@ -82,6 +81,51 @@ inline Dims<4> GetTensorDims(const TfLiteTensor* tensor) {
   return GetTensorDims(dims->data, dims->size);
 }
 
+// A list of tensors in a format that can be used by kernels like split and
+// concatenation.
+template <typename T>
+class VectorOfTensors {
+ public:
+  // Build with the tensors in 'tensor_list'.
+  VectorOfTensors(const TfLiteContext& context,
+                  const TfLiteIntArray& tensor_list) {
+    int num_tensors = tensor_list.size;
+
+    all_data_.reserve(num_tensors);
+    all_dims_.reserve(num_tensors);
+    all_dims_ptr_.reserve(num_tensors);
+
+    for (int i = 0; i < num_tensors; ++i) {
+      TfLiteTensor* t = &context.tensors[tensor_list.data[i]];
+      all_data_.push_back(GetTensorData<T>(t));
+      all_dims_.push_back(GetTensorDims(t));
+    }
+
+    // Taking the pointer from inside a std::vector is only OK if the vector is
+    // never modified, so we populate all_dims in the previous loop and then we
+    // are free to grab iterators here.
+    for (int i = 0; i < num_tensors; ++i) {
+      all_dims_ptr_.push_back(&all_dims_[i]);
+    }
+  }
+  // Return a pointer to the data pointers of all tensors in the list. For
+  // example:
+  //   float* const* f = v.data();
+  //   f[0][1] is the second element of the first tensor.
+  T* const* data() const { return all_data_.data(); }
+
+  // Return a pointer the dim pointers of all tensors in the list. For
+  // example:
+  //   const Dims<4>* const* d = v.dims();
+  //   dims[1] are the dimensions of the second tensor in the list.
+  const Dims<4>* const* dims() const { return all_dims_ptr_.data(); }
+
+ private:
+  std::vector<T*> all_data_;
+  std::vector<Dims<4>> all_dims_;
+  std::vector<Dims<4>*> all_dims_ptr_;
+};
+
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc
index 904a97803a6a9ba369c1e64c711b12d19ffc10c4..f4181b18a8f46fd9bef4b81a210a6b8134a4e9d0 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
 
 #ifndef USE_NEON
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
index 0e69ef5982f01e364d865684652d1dfecab6fee3..40d144979b2f965725db86ff311e90f39438802f 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
 
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 
 namespace tflite {
 namespace tensor_utils {
 
-// Limit a float input f betweeen +abs_limit and -abs_limit.
+// Limit a float input f between +abs_limit and -abs_limit.
 float Clip(float f, float abs_limit);
 
 // Multiply a matrix by a batch vector, and store results in a batch-size
@@ -113,4 +113,4 @@ void ReductionSumVector(const float* input_vector, float* output_vector,
 }  // namespace tensor_utils
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
index 07f1cb40045fff3ae47ed4efa6ec43b0cb88a0a7..afe131b06ec41201395e80aa5415fd7db990f8d4 100644
--- a/tensorflow/contrib/lite/kernels/internal/types.h
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
 
 #include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
 
@@ -27,6 +27,58 @@ struct Dims {
   int strides[N];
 };
 
+// Gets next index to iterate through a multidimensional array.
+inline bool NextIndex(const int num_dims, const int* dims, int* current) {
+  TFLITE_DCHECK_GT(num_dims, 0);
+  TFLITE_DCHECK(dims != nullptr);
+  TFLITE_DCHECK(current != nullptr);
+  int carry = 1;
+  for (int idx = num_dims - 1; idx >= 0; --idx) {
+    int current_val = current[idx] + carry;
+    TFLITE_DCHECK_GE(dims[idx], current_val);
+    if (dims[idx] == current_val) {
+      current[idx] = 0;
+    } else {
+      current[idx] = current_val;
+      carry = 0;
+      break;
+    }
+  }
+  return (carry == 0);
+}
+
+// Gets offset of index if reducing on axis. When reducing, the flattened offset
+// will not change, if the input index changes on the given axis. For example,
+// if you have a 3D tensor and you are reducing to 2D by eliminating axis 0,
+// then index (0, 1, 2) and index (1, 1, 2) will map to the same flattened
+// offset.
+// TODO(kanlig): uses Dims to represent dimensions.
+inline size_t ReducedOutputOffset(const int num_dims, const int* dims,
+                                  const int* index, const int num_axis,
+                                  const int* axis) {
+  TFLITE_DCHECK_GT(num_dims, 0);
+  TFLITE_DCHECK(dims != nullptr);
+  TFLITE_DCHECK(index != nullptr);
+  size_t offset = 0;
+  for (int idx = 0; idx < num_dims; ++idx) {
+    // if we need to skip this axis
+    bool is_axis = false;
+    if (axis != nullptr) {
+      for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) {
+        if (idx == axis[axis_idx]) {
+          is_axis = true;
+          break;
+        }
+      }
+    }
+    if (!is_axis) {
+      offset = offset * static_cast<size_t>(dims[idx]) +
+               static_cast<size_t>(index[idx]);
+    }
+  }
+  return offset;
+}
+
 inline int Offset(const Dims<4>& dims, int i0, int i1, int i2, int i3) {
   TFLITE_DCHECK(i0 >= 0 && i0 < dims.sizes[0]);
   TFLITE_DCHECK(i1 >= 0 && i1 < dims.sizes[1]);
@@ -36,6 +88,10 @@ inline int Offset(const Dims<4>& dims, int i0, int i1, int i2, int i3) {
          i3 * dims.strides[3];
 }
 
+inline int Offset(const Dims<4>& dims, int* index) {
+  return Offset(dims, index[0], index[1], index[2], index[3]);
+}
+
 // Get array size, DCHECKing that the dim index is in range.
 template <int N>
 int ArraySize(const Dims<N>& array, int index) {
@@ -78,4 +134,4 @@ bool IsPackedWithoutStrides(const Dims<N>& dims) {
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.cc b/tensorflow/contrib/lite/kernels/kernel_util.cc
index b0546c00cf977af5f722a802866448b0cb293b8d..955e8c5764c6adad37a0009f4ddf8accb437b174 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util.cc
+++ b/tensorflow/contrib/lite/kernels/kernel_util.cc
@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/kernels/kernel_util.h"
+
 #include <algorithm>
 #include <cmath>
+#include <memory>
+
 #include "tensorflow/contrib/lite/kernels/internal/round.h"
 
 namespace tflite {
@@ -84,4 +87,27 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
   }
 }
 
+bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2) {
+  return TfLiteIntArrayEqual(input1->dims, input2->dims);
+}
+
+TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
+                                        TfLiteTensor* input1,
+                                        TfLiteTensor* input2,
+                                        TfLiteIntArray** output_shape) {
+  int64_t dims1 = NumDimensions(input1);
+  int64_t dims2 = NumDimensions(input2);
+  int64_t out_dims = std::max(dims1, dims2);
+  std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> shape(
+      TfLiteIntArrayCreate(out_dims), TfLiteIntArrayFree);
+  for (int i = 0; i < out_dims; ++i) {
+    int64_t d1 = i >= dims1 ? 1 : SizeOfDimension(input1, dims1 - i - 1);
+    int64_t d2 = i >= dims2 ? 1 : SizeOfDimension(input2, dims2 - i - 1);
+    TF_LITE_ENSURE(context, d1 == d2 || d1 == 1 || d2 == 1);
+    shape->data[out_dims - i - 1] = std::max(d1, d2);
+  }
+  *output_shape = shape.release();
+  return kTfLiteOk;
+}
+
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.h b/tensorflow/contrib/lite/kernels/kernel_util.h
index 25556ae4567aca45b3bfe4ba02b1cb58331d239d..28f53b9fbbc5620f2fab5c73e40bed8af4af5f1e 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util.h
+++ b/tensorflow/contrib/lite/kernels/kernel_util.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
 
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/context.h"
@@ -35,6 +35,14 @@ inline TfLiteTensor* GetOutput(TfLiteContext* context, TfLiteNode* node,
 inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; }
 inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; }
 
+inline int64_t NumElements(const TfLiteTensor* t) {
+  int64_t count = 1;
+  for (int i = 0; i < NumDimensions(t); ++i) {
+    count *= SizeOfDimension(t, i);
+  }
+  return count;
+}
+
 inline TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context,
                                             const TfLiteNode* node, int index) {
   const bool use_tensor = node->inputs->data[index] != kOptionalTensor;
@@ -44,6 +52,25 @@ inline TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context,
   return nullptr;
 }
 
+// Determines whether tensor is constant.
+inline bool IsConstantTensor(TfLiteTensor* tensor) {
+  return tensor->allocation_type == kTfLiteMmapRo;
+}
+
+// Determines whether tensor is dynamic. Note that a tensor can be non-const and
+// not dynamic. This function specificially checks for a dynamic tensor.
+inline bool IsDynamicTensor(TfLiteTensor* tensor) {
+  return tensor->allocation_type == kTfLiteDynamic;
+}
+
+// Sets tensor to dynamic.
+inline void SetTensorToDynamic(TfLiteTensor* tensor) {
+  if (tensor->allocation_type != kTfLiteDynamic) {
+    tensor->allocation_type = kTfLiteDynamic;
+    tensor->data.raw = nullptr;
+  }
+}
+
 // Calculates the multiplication factor for a quantized convolution (or
 // quantized depthwise convolution) involving the given tensors. Returns an
 // error if the scales of the tensors are not compatible.
@@ -60,6 +87,15 @@ void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
                                    float* activation_min,
                                    float* activation_max);
 
+// Return true if the given tensors have the same shape.
+bool HaveSameShapes(TfLiteTensor* input1, TfLiteTensor* input2);
+
+// Calculate the output_shape that is necessary for element-wise operations
+// with broadcasting involving the two input tensors.
+TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
+                                        TfLiteTensor* input1,
+                                        TfLiteTensor* input2,
+                                        TfLiteIntArray** output_shape);
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/kernel_util_test.cc b/tensorflow/contrib/lite/kernels/kernel_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c65b68970f6853e17af3a70aad7a2bc982a1ee60
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/kernel_util_test.cc
@@ -0,0 +1,152 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/testing/util.h"
+
+namespace tflite {
+namespace {
+
+void ReportError(TfLiteContext* context, const char* format, ...) {}
+
+class KernelUtilTest : public ::testing::Test {
+ public:
+  KernelUtilTest() {
+    context_.ReportError = ReportError;
+
+    tensor1_.dims = nullptr;
+    tensor2_.dims = nullptr;
+    tensor1_.allocation_type = kTfLiteMmapRo;
+    tensor2_.allocation_type = kTfLiteMmapRo;
+  }
+  ~KernelUtilTest() {
+    TfLiteTensorFree(&tensor1_);
+    TfLiteTensorFree(&tensor2_);
+  }
+
+  void SetShape(TfLiteTensor* tensor, std::initializer_list<int> dims) {
+    TfLiteTensorFree(tensor);
+    tensor->dims = TfLiteIntArrayCreate(dims.size());
+    int i = 0;
+    for (int d : dims) {
+      tensor->dims->data[i] = d;
+      ++i;
+    }
+  }
+
+  std::vector<int> GetShape(TfLiteIntArray* dims) {
+    std::vector<int> result;
+    for (int i = 0; i < dims->size; ++i) {
+      result.push_back(dims->data[i]);
+    }
+    return result;
+  }
+
+ protected:
+  TfLiteContext context_;
+  TfLiteTensor tensor1_;
+  TfLiteTensor tensor2_;
+};
+
+TEST_F(KernelUtilTest, SameShapeEmpty) {
+  EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor1_, {1, 2, 3});
+  EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor2_, {1, 2});
+  EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor2_, {1, 2, 3, 4});
+  EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor2_, {1, 2, 3});
+  EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor2_, {});
+  EXPECT_FALSE(HaveSameShapes(&tensor1_, &tensor2_));
+
+  SetShape(&tensor1_, {});
+  EXPECT_TRUE(HaveSameShapes(&tensor1_, &tensor2_));
+}
+
+TEST_F(KernelUtilTest, BroadcastShapeIncompatibleDim) {
+  TfLiteIntArray* output = nullptr;
+  SetShape(&tensor1_, {1, 2});
+  SetShape(&tensor2_, {1, 3});
+  EXPECT_NE(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  EXPECT_EQ(output, nullptr);
+}
+
+TEST_F(KernelUtilTest, BroadcastShapeOnes) {
+  TfLiteIntArray* output = nullptr;
+  SetShape(&tensor1_, {1, 1});
+  SetShape(&tensor2_, {1, 3});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  TfLiteIntArrayFree(output);
+
+  SetShape(&tensor1_, {1, 2});
+  SetShape(&tensor2_, {1, 1});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  TfLiteIntArrayFree(output);
+}
+
+TEST_F(KernelUtilTest, BroadcastShapeScalars) {
+  TfLiteIntArray* output = nullptr;
+  SetShape(&tensor1_, {1, 2});
+  SetShape(&tensor2_, {});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2));
+  TfLiteIntArrayFree(output);
+
+  SetShape(&tensor1_, {});
+  SetShape(&tensor2_, {2});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  EXPECT_THAT(GetShape(output), ::testing::ElementsAre(2));
+  TfLiteIntArrayFree(output);
+}
+
+TEST_F(KernelUtilTest, BroadcastShapeDifferentSizes) {
+  TfLiteIntArray* output = nullptr;
+  SetShape(&tensor1_, {1, 2});
+  SetShape(&tensor2_, {3, 1, 1});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  EXPECT_THAT(GetShape(output), ::testing::ElementsAre(3, 1, 2));
+  TfLiteIntArrayFree(output);
+
+  SetShape(&tensor1_, {1, 2, 3, 4});
+  SetShape(&tensor2_, {1, 3, 1});
+  EXPECT_EQ(kTfLiteOk, CalculateShapeForBroadcast(&context_, &tensor1_,
+                                                  &tensor2_, &output));
+  EXPECT_THAT(GetShape(output), ::testing::ElementsAre(1, 2, 3, 4));
+  TfLiteIntArrayFree(output);
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/l2norm.cc b/tensorflow/contrib/lite/kernels/l2norm.cc
index f43aa372b6398a38e57dd38f3d7c7db2bd3aefc1..ee8bfe56d95e9f383ef49b40b8f58b63d61da3e1 100644
--- a/tensorflow/contrib/lite/kernels/l2norm.cc
+++ b/tensorflow/contrib/lite/kernels/l2norm.cc
@@ -43,8 +43,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
-  // TODO(ahentz): Our current implementations rely on the inputs being 4D.
-  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
+  TF_LITE_ENSURE(context, NumDimensions(input) <= 4);
 
   // TODO(ahentz): Our current implementations only support float32.
   TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
@@ -54,12 +53,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // activations.
   TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActNone);
 
-  TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
-  output_size->data[0] = input->dims->data[0];
-  output_size->data[1] = input->dims->data[1];
-  output_size->data[2] = input->dims->data[2];
-  output_size->data[3] = input->dims->data[3];
-
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input->dims);
   return context->ResizeTensor(context, output, output_size);
 }
 
diff --git a/tensorflow/contrib/lite/kernels/l2norm_test.cc b/tensorflow/contrib/lite/kernels/l2norm_test.cc
index b1db89b8bd3474ac868d7215e4a0de12088c48ef..30e103f3303484c339ef98e6a68e0438291c102f 100644
--- a/tensorflow/contrib/lite/kernels/l2norm_test.cc
+++ b/tensorflow/contrib/lite/kernels/l2norm_test.cc
@@ -57,7 +57,7 @@ TEST(L2NormOpTest, SimpleTest) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/local_response_norm_test.cc b/tensorflow/contrib/lite/kernels/local_response_norm_test.cc
index 63a8b0a3d0186def7da2c9f31481721f1a55281c..d75ce258a04c820d8f82735988c01d0154ef36f2 100644
--- a/tensorflow/contrib/lite/kernels/local_response_norm_test.cc
+++ b/tensorflow/contrib/lite/kernels/local_response_norm_test.cc
@@ -95,7 +95,7 @@ TEST(LocalResponseNormOpTest, SmallRadius) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/lsh_projection_test.cc b/tensorflow/contrib/lite/kernels/lsh_projection_test.cc
index 1011927848d586c8541fb694914b5eee123cb8dc..414d728dfc153058ec878d3c766f58e86815cd3f 100644
--- a/tensorflow/contrib/lite/kernels/lsh_projection_test.cc
+++ b/tensorflow/contrib/lite/kernels/lsh_projection_test.cc
@@ -117,7 +117,7 @@ TEST(LSHProjectionOpTest2, Sparse3DInputs) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/lstm_test.cc b/tensorflow/contrib/lite/kernels/lstm_test.cc
index be4c7ddbf88fc902368cda13aff72f5aecb9dac4..c068286b0d84bcb51ebb0e239350a42863de6523 100644
--- a/tensorflow/contrib/lite/kernels/lstm_test.cc
+++ b/tensorflow/contrib/lite/kernels/lstm_test.cc
@@ -1081,8 +1081,7 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
-  tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/mean.cc b/tensorflow/contrib/lite/kernels/mean.cc
new file mode 100644
index 0000000000000000000000000000000000000000..aff19581ea56f94c08638b7b388ae181f566cf4f
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/mean.cc
@@ -0,0 +1,233 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include <vector>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace mean {
+
+// This file has reference implementation of Mean.
+enum KernelType {
+  kReference,
+};
+
+struct MeanContext {
+  MeanContext(TfLiteContext* context, TfLiteNode* node) {
+    params = reinterpret_cast<TfLiteMeanParams*>(node->builtin_data);
+    input = GetInput(context, node, 0);
+    axis = GetInput(context, node, 1);
+    output = GetOutput(context, node, 0);
+  }
+  TfLiteMeanParams* params;
+  TfLiteTensor* input;
+  TfLiteTensor* axis;
+  TfLiteTensor* output;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  // Creates two temp tensors to store index and axis for internal
+  // implementation only.
+  auto* scratch_tensor_index = new int;
+  context->AddTensors(context, 2, scratch_tensor_index);
+  return scratch_tensor_index;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<int*>(buffer);
+}
+
+// Resizes the temp tensor that stores resolved axis.
+TfLiteStatus ResizeTempAxis(TfLiteContext* context, MeanContext* op_context,
+                            TfLiteTensor* resolved_axis) {
+  TfLiteIntArray* axis_size = TfLiteIntArrayCreate(1);
+  axis_size->data[0] = static_cast<int>(NumElements(op_context->axis));
+  return context->ResizeTensor(context, resolved_axis, axis_size);
+}
+
+// Resizes output array based on the input size and resolved axis.
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                MeanContext* op_context) {
+  size_t num_axis = NumElements(op_context->axis);
+  const TfLiteIntArray* input_dims = op_context->input->dims;
+  int input_num_dims = NumDimensions(op_context->input);
+  const int* axis = GetTensorData<int>(op_context->axis);
+  if (op_context->params->keep_dims) {
+    TfLiteIntArray* output_dims = TfLiteIntArrayCreate(input_num_dims);
+    for (int idx = 0; idx < input_num_dims; ++idx) {
+      bool is_axis = false;
+      for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) {
+        if (axis[axis_idx] == idx || axis[axis_idx] + input_num_dims == idx) {
+          is_axis = true;
+          break;
+        }
+      }
+      if (is_axis) {
+        output_dims->data[idx] = 1;
+      } else {
+        output_dims->data[idx] = input_dims->data[idx];
+      }
+    }
+    return context->ResizeTensor(context, op_context->output, output_dims);
+  } else {
+    // Calculates size of reducing axis.
+    int num_reduce_axis = num_axis;
+    for (int i = 0; i < num_axis; ++i) {
+      int current = axis[i];
+      if (current < 0) {
+        current += input_num_dims;
+      }
+      TF_LITE_ENSURE(context, current >= 0 && current < input_num_dims);
+      for (int j = 0; j < i; ++j) {
+        int previous = axis[j];
+        if (previous < 0) {
+          previous += input_num_dims;
+        }
+        if (current == previous) {
+          --num_reduce_axis;
+          break;
+        }
+      }
+    }
+    // Determines output dimensions.
+    TfLiteIntArray* output_dims =
+        TfLiteIntArrayCreate(input_num_dims - num_reduce_axis);
+    int num_skip_axis = 0;
+    for (int idx = 0; idx < input_num_dims; ++idx) {
+      bool is_axis = false;
+      for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) {
+        if (axis[axis_idx] == idx || axis[axis_idx] + input_num_dims == idx) {
+          ++num_skip_axis;
+          is_axis = true;
+          break;
+        }
+      }
+      if (!is_axis) {
+        output_dims->data[idx - num_skip_axis] = input_dims->data[idx];
+      }
+    }
+    return context->ResizeTensor(context, op_context->output, output_dims);
+  }
+}
+
+// Initializes temp tensors to store index and resolved axis.
+TfLiteStatus InitializeTemporaries(TfLiteContext* context, TfLiteNode* node,
+                                   MeanContext* op_context) {
+  // Creates a temp index to iterate through input data.
+  int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
+  TfLiteIntArrayFree(node->temporaries);
+  node->temporaries = TfLiteIntArrayCreate(2);
+  node->temporaries->data[0] = *scratch_tensor_index;
+  TfLiteTensor* scratch_tensor = &context->tensors[node->temporaries->data[0]];
+  scratch_tensor->type = kTfLiteInt32;
+  scratch_tensor->allocation_type = kTfLiteArenaRw;
+  TfLiteIntArray* index_size = TfLiteIntArrayCreate(1);
+  index_size->data[0] = NumDimensions(op_context->input);
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, scratch_tensor, index_size));
+
+  // Creates a temp tensor to store resolved axis given input data.
+  node->temporaries->data[1] = *scratch_tensor_index + 1;
+  TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]];
+  resolved_axis->type = kTfLiteInt32;
+  return kTfLiteOk;
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  MeanContext op_context(context, node);
+  TF_LITE_ENSURE_OK(context, InitializeTemporaries(context, node, &op_context));
+
+  TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]];
+  // Leaves work to Eval if axis is not constant; else resizes output.
+  if (!IsConstantTensor(op_context.axis)) {
+    SetTensorToDynamic(op_context.output);
+    SetTensorToDynamic(resolved_axis);
+    return kTfLiteOk;
+  }
+  resolved_axis->allocation_type = kTfLiteArenaRw;
+  TF_LITE_ENSURE_OK(context,
+                    ResizeTempAxis(context, &op_context, resolved_axis));
+  return ResizeOutputTensor(context, &op_context);
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  MeanContext op_context(context, node);
+  int num_axis = static_cast<int>(NumElements(op_context.axis));
+  TfLiteTensor* temp_index = &context->tensors[node->temporaries->data[0]];
+  TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]];
+  // Resize the output tensor if the output tensor is dynamic.
+  if (IsDynamicTensor(op_context.output)) {
+    TF_LITE_ENSURE_OK(context,
+                      ResizeTempAxis(context, &op_context, resolved_axis));
+    TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+  }
+
+#define TF_LITE_MEAN(kernel_type, data_type)                        \
+  kernel_type::Mean<>(                                              \
+      GetTensorData<data_type>(op_context.input),                   \
+      op_context.input->dims->data, op_context.input->dims->size,   \
+      GetTensorData<data_type>(op_context.output),                  \
+      op_context.output->dims->data, op_context.output->dims->size, \
+      GetTensorData<int>(op_context.axis), num_axis,                \
+      op_context.params->keep_dims, GetTensorData<int>(temp_index), \
+      GetTensorData<int>(resolved_axis))
+
+  if (kernel_type == kReference) {
+    switch (op_context.input->type) {
+      case kTfLiteFloat32:
+        TF_LITE_MEAN(reference_ops, float);
+        break;
+      case kTfLiteInt32:
+        TF_LITE_MEAN(reference_ops, int);
+        break;
+      case kTfLiteUInt8:
+        TF_LITE_MEAN(reference_ops, uint8_t);
+        break;
+      case kTfLiteInt64:
+        TF_LITE_MEAN(reference_ops, int64_t);
+        break;
+      default:
+        return kTfLiteError;
+    }
+  }
+#undef TF_LITE_MEAN
+  return kTfLiteOk;
+}
+
+}  // namespace mean
+
+TfLiteRegistration* Register_MEAN_REF() {
+  static TfLiteRegistration r = {mean::Init, mean::Free, mean::Prepare,
+                                 mean::Eval<mean::kReference>};
+  return &r;
+}
+
+// TODO(kanlig): add optimized implementation of Mean.
+TfLiteRegistration* Register_MEAN() { return Register_MEAN_REF(); }
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/mean_test.cc b/tensorflow/contrib/lite/kernels/mean_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c4c53c2ded351849e7c458fc754c36395a25ebd0
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/mean_test.cc
@@ -0,0 +1,140 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseMeanOpModel : public SingleOpModel {
+ public:
+  void SetAxis(std::initializer_list<int> data) { PopulateTensor(axis_, data); }
+
+  template <class T>
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor(input_, data);
+  }
+
+  template <class T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input_;
+  int axis_;
+  int output_;
+};
+
+// Model for the tests case where axis is a const tensor.
+class MeanOpConstModel : public BaseMeanOpModel {
+ public:
+  MeanOpConstModel(const TensorData& input, const TensorData& output,
+                   std::initializer_list<int> axis_shape,
+                   std::initializer_list<int> axis, bool keep_dims) {
+    input_ = AddInput(input);
+    axis_ = AddConstInput(TensorType_INT32, axis, axis_shape);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_MEAN, BuiltinOptions_MeanOptions,
+                 CreateMeanOptions(builder_, keep_dims).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+};
+
+// Model for the tests case where axis is a dynamic tensor.
+class MeanOpDynamicModel : public BaseMeanOpModel {
+ public:
+  MeanOpDynamicModel(const TensorData& input, const TensorData& output,
+                     const TensorData& axis, bool keep_dims) {
+    input_ = AddInput(input);
+    axis_ = AddInput(axis);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_MEAN, BuiltinOptions_MeanOptions,
+                 CreateMeanOptions(builder_, keep_dims).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+};
+
+TEST(ConstMeanOpTest, NotKeepDims) {
+  std::initializer_list<float> data = {
+      1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0,
+      13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+  MeanOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {2}},
+                     {4}, {1, 0, -3, -3}, false);
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({12, 13})));
+}
+
+TEST(ConstMeanOpTest, KeepDims) {
+  std::initializer_list<float> data = {
+      1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0,
+      13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+  MeanOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {3}},
+                     {2}, {0, 2}, true);
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1}));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({10.5, 12.5, 14.5})));
+}
+
+TEST(DynamicMeanOpTest, NotKeepDims) {
+  std::initializer_list<float> data = {
+      1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0,
+      13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+  MeanOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}},
+                       {TensorType_FLOAT32, {2}}, {TensorType_INT32, {4}},
+                       false);
+  std::initializer_list<int> axis = {1, 0, -3, -3};
+  m.SetAxis(axis);
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({12, 13})));
+}
+
+TEST(DynamicMeanOpTest, KeepDims) {
+  std::initializer_list<float> data = {
+      1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0,
+      13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+  MeanOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}},
+                       {TensorType_FLOAT32, {3}}, {TensorType_INT32, {2}},
+                       true);
+  std::initializer_list<int> axis = {0, 2};
+  m.SetAxis(axis);
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1}));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({10.5, 12.5, 14.5})));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/mul.cc b/tensorflow/contrib/lite/kernels/mul.cc
index 81c73f2523186c2d4072d56bdc8980fcdbb588a3..54575019de4c678ce25561cf2ac8dc80c9973363 100644
--- a/tensorflow/contrib/lite/kernels/mul.cc
+++ b/tensorflow/contrib/lite/kernels/mul.cc
@@ -37,7 +37,23 @@ constexpr int kInputTensor1 = 0;
 constexpr int kInputTensor2 = 1;
 constexpr int kOutputTensor = 0;
 
+struct OpData {
+  bool requires_broadcast;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new OpData;
+  data->requires_broadcast = false;
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
@@ -45,43 +61,56 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
-  TF_LITE_ENSURE_EQ(context, NumDimensions(input1), NumDimensions(input2));
-  for (int i = 0; i < NumDimensions(input1); ++i) {
-    TF_LITE_ENSURE_EQ(context, SizeOfDimension(input1, i),
-                      SizeOfDimension(input2, i));
-  }
+  TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
+  output->type = input2->type;
+
+  data->requires_broadcast = !HaveSameShapes(input1, input2);
 
-  TF_LITE_ENSURE_EQ(context, input1->type, output->type);
-  TF_LITE_ENSURE_EQ(context, input2->type, output->type);
+  TfLiteIntArray* output_size = nullptr;
+  if (data->requires_broadcast) {
+    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
+                                   context, input1, input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(input1->dims);
+  }
 
-  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input1->dims);
   return context->ResizeTensor(context, output, output_size);
 }
 
 template <KernelType kernel_type>
 void EvalFloat(TfLiteContext* context, TfLiteNode* node,
-               TfLiteMulParams* params, TfLiteTensor* input1,
-               TfLiteTensor* input2, TfLiteTensor* output) {
+               TfLiteMulParams* params, const OpData* data,
+               TfLiteTensor* input1, TfLiteTensor* input2,
+               TfLiteTensor* output) {
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(params->activation, &output_activation_min,
                                 &output_activation_max);
-#define TF_LITE_MUL(type)                                        \
-  type::Mul(GetTensorData<float>(input1), GetTensorDims(input1), \
-            GetTensorData<float>(input2), GetTensorDims(input2), \
-            output_activation_min, output_activation_max,        \
-            GetTensorData<float>(output), GetTensorDims(output))
+#define TF_LITE_MUL(type, opname)                                   \
+  type::opname(GetTensorData<float>(input1), GetTensorDims(input1), \
+               GetTensorData<float>(input2), GetTensorDims(input2), \
+               output_activation_min, output_activation_max,        \
+               GetTensorData<float>(output), GetTensorDims(output))
   if (kernel_type == kReference) {
-    TF_LITE_MUL(reference_ops);
+    if (data->requires_broadcast) {
+      TF_LITE_MUL(reference_ops, BroadcastMul);
+    } else {
+      TF_LITE_MUL(reference_ops, Mul);
+    }
   } else {
-    TF_LITE_MUL(optimized_ops);
+    if (data->requires_broadcast) {
+      TF_LITE_MUL(optimized_ops, BroadcastMul);
+    } else {
+      TF_LITE_MUL(optimized_ops, Mul);
+    }
   }
 #undef TF_LITE_MUL
 }
 
 template <KernelType kernel_type>
 void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                   TfLiteMulParams* params, TfLiteTensor* input1,
-                   TfLiteTensor* input2, TfLiteTensor* output) {
+                   TfLiteMulParams* params, const OpData* data,
+                   TfLiteTensor* input1, TfLiteTensor* input2,
+                   TfLiteTensor* output) {
   auto input1_offset = -input1->params.zero_point;
   auto input2_offset = -input2->params.zero_point;
   auto output_offset = output->params.zero_point;
@@ -98,17 +127,19 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   CalculateActivationRangeUint8(params->activation, output,
                                 &output_activation_min, &output_activation_max);
 
-#define TF_LITE_MUL(type)                                                    \
-  type::BroadcastMul(GetTensorData<uint8_t>(input1), GetTensorDims(input1),  \
-                     input1_offset, GetTensorData<uint8_t>(input2),          \
-                     GetTensorDims(input2), input2_offset, output_offset,    \
-                     output_multiplier, output_shift, output_activation_min, \
-                     output_activation_max, GetTensorData<uint8_t>(output),  \
-                     GetTensorDims(output));
+#define TF_LITE_MUL(type, opname)                                      \
+  type::opname(GetTensorData<uint8_t>(input1), GetTensorDims(input1),  \
+               input1_offset, GetTensorData<uint8_t>(input2),          \
+               GetTensorDims(input2), input2_offset, output_offset,    \
+               output_multiplier, output_shift, output_activation_min, \
+               output_activation_max, GetTensorData<uint8_t>(output),  \
+               GetTensorDims(output));
+  // The quantized version of Mul doesn't support activations, so we
+  // always use BroadcastMul.
   if (kernel_type == kReference) {
-    TF_LITE_MUL(reference_ops);
+    TF_LITE_MUL(reference_ops, BroadcastMul);
   } else {
-    TF_LITE_MUL(optimized_ops);
+    TF_LITE_MUL(optimized_ops, BroadcastMul);
   }
 #undef TF_LITE_MUL
 }
@@ -116,15 +147,17 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteMulParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
   TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
   TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   if (output->type == kTfLiteFloat32) {
-    EvalFloat<kernel_type>(context, node, params, input1, input2, output);
+    EvalFloat<kernel_type>(context, node, params, data, input1, input2, output);
   } else if (output->type == kTfLiteUInt8) {
-    EvalQuantized<kernel_type>(context, node, params, input1, input2, output);
+    EvalQuantized<kernel_type>(context, node, params, data, input1, input2,
+                               output);
   } else {
     context->ReportError(context,
                          "Mul only supports FLOAT32 and quantized UINT8 now.");
@@ -137,19 +170,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace mul
 
 TfLiteRegistration* Register_MUL_REF() {
-  static TfLiteRegistration r = {nullptr, nullptr, mul::Prepare,
+  static TfLiteRegistration r = {mul::Init, mul::Free, mul::Prepare,
                                  mul::Eval<mul::kReference>};
   return &r;
 }
 
 TfLiteRegistration* Register_MUL_GENERIC_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, mul::Prepare,
+  static TfLiteRegistration r = {mul::Init, mul::Free, mul::Prepare,
                                  mul::Eval<mul::kGenericOptimized>};
   return &r;
 }
 
 TfLiteRegistration* Register_MUL_NEON_OPT() {
-  static TfLiteRegistration r = {nullptr, nullptr, mul::Prepare,
+  static TfLiteRegistration r = {mul::Init, mul::Free, mul::Prepare,
                                  mul::Eval<mul::kNeonOptimized>};
   return &r;
 }
diff --git a/tensorflow/contrib/lite/kernels/mul_test.cc b/tensorflow/contrib/lite/kernels/mul_test.cc
index 4b858e1f396252e7f7bdc231bc1e00f47277f08a..f1a30f82634631ba8320421d5b36ffe446f443fa 100644
--- a/tensorflow/contrib/lite/kernels/mul_test.cc
+++ b/tensorflow/contrib/lite/kernels/mul_test.cc
@@ -25,10 +25,11 @@ using ::testing::ElementsAreArray;
 
 class BaseMulOpModel : public SingleOpModel {
  public:
-  BaseMulOpModel(TensorData input, TensorData output,
+  BaseMulOpModel(const TensorData& input1, const TensorData& input2,
+                 const TensorData& output,
                  ActivationFunctionType activation_type) {
-    input1_ = AddInput(input);
-    input2_ = AddInput(input);
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
     output_ = AddOutput(output);
     SetBuiltinOp(BuiltinOperator_MUL, BuiltinOptions_MulOptions,
                  CreateMulOptions(builder_, activation_type).Union());
@@ -70,6 +71,7 @@ class QuantizedMulOpModel : public BaseMulOpModel {
 
 TEST(FloatMulOpTest, NoActivation) {
   FloatMulOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {1, 2, 2, 1}},
                     {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
   m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
   m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
@@ -78,9 +80,10 @@ TEST(FloatMulOpTest, NoActivation) {
               ElementsAreArray(ArrayFloatNear({-0.2, 0.04, 0.21, 0.4})));
 }
 
-TEST(FloatMulOpTest, ActivationRELU1) {
-  FloatMulOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
-                    {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU1);
+TEST(FloatMulOpTest, ActivationRELU_N1_TO_1) {
+  FloatMulOpModel m(
+      {TensorType_FLOAT32, {1, 2, 2, 1}}, {TensorType_FLOAT32, {1, 2, 2, 1}},
+      {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU_N1_TO_1);
   m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
   m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 5});
   m.Invoke();
@@ -93,6 +96,7 @@ TEST(FloatMulOpTest, VariousInputShapes) {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     FloatMulOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, test_shapes[i]},
                       {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
     m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
     m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5, 1.1, 0.1});
@@ -104,8 +108,26 @@ TEST(FloatMulOpTest, VariousInputShapes) {
   }
 }
 
+TEST(FloatMulOpTest, WithBroadcast) {
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatMulOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, {}},  // always a scalar
+                      {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+    m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.PopulateTensor<float>(m.input2(), {0.1});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({-0.2, 0.02, 0.07, 0.08, 0.11, 0.2})))
+        << "With shape number " << i;
+  }
+}
+
 TEST(QuantizedMulOpTest, NoActivation) {
   QuantizedMulOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                        {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
                         {TensorType_UINT8, {}, -1.0, 1.0},
                         ActivationFunctionType_NONE);
   m.QuantizeAndPopulate<uint8_t>(m.input1(), {-0.8, 0.2, 0.9, 0.7});
@@ -116,12 +138,37 @@ TEST(QuantizedMulOpTest, NoActivation) {
                                               kQuantizedTolerance)));
 }
 
+// for quantized Mul, the error shouldn't exceed 2*step
+float GetTolerance(int min, int max) {
+  float kQuantizedStep = (max - min) / 255.0;
+  float kQuantizedTolerance = 2.0 * kQuantizedStep;
+  return kQuantizedTolerance;
+}
+
+TEST(QuantizedMulOpTest, WithBroadcast) {
+  float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedMulOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
+                          {TensorType_UINT8, {}, -3.0, 3.0},  // always a scalar
+                          {TensorType_UINT8, {}, -3.0, 3.0},
+                          ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1});
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutput(),
+                ElementsAreArray(ArrayFloatNear(
+                    {-0.2, 0.02, 0.07, 0.08, 0.11, 0.2}, kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+}
+
 }  // namespace
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
-  tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/op_macros.h b/tensorflow/contrib/lite/kernels/op_macros.h
index 7535afaf8ea52d855e2e4773e56ce2118a16447c..7568eaa88edfa3260964e16f03299aecb97da6be 100644
--- a/tensorflow/contrib/lite/kernels/op_macros.h
+++ b/tensorflow/contrib/lite/kernels/op_macros.h
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_
+
+#include <cstdio>
 
 #define TF_LITE_FATAL(msg)          \
   do {                              \
@@ -29,4 +31,4 @@ limitations under the License.
     if ((x) != (y)) TF_LITE_FATAL(#x " didn't equal " #y); \
   } while (0)
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/optional_tensor_test.cc b/tensorflow/contrib/lite/kernels/optional_tensor_test.cc
index 8e9cc07656c8bea83f7cb78ca0b6cc5de7ad1b73..cee3ec6197c698a11004d42dccdfe2bcca088015 100644
--- a/tensorflow/contrib/lite/kernels/optional_tensor_test.cc
+++ b/tensorflow/contrib/lite/kernels/optional_tensor_test.cc
@@ -243,7 +243,6 @@ class LSTMOpModel : public SingleOpModel {
   int n_output_;
 };
 
-
 TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
   const int n_batch = 1;
   const int n_input = 2;
@@ -282,7 +281,6 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
                        {0},     // projection_bias tensor
                    });
 
-
   lstm.SetInputToCellWeights({-0.49770179, -0.27711356, -0.09624726, 0.05100781,
                               0.04717243, 0.48944736, -0.38535351,
                               -0.17212132});
@@ -334,8 +332,7 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
-  tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/pad.cc b/tensorflow/contrib/lite/kernels/pad.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c29da3862e84d6756bf5ef34b2ca06307b0a065d
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/pad.cc
@@ -0,0 +1,183 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include <vector>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace pad {
+
+// This file has two implementations of Pad.
+enum KernelType {
+  kReference,
+  kGenericOptimized,
+};
+
+struct PadContext {
+  PadContext(TfLiteContext* context, TfLiteNode* node) {
+    input = GetInput(context, node, 0);
+    paddings = GetInput(context, node, 1);
+    output = GetOutput(context, node, 0);
+    dims = NumDimensions(input);
+  }
+  TfLiteTensor* input;
+  TfLiteTensor* paddings;
+  TfLiteTensor* output;
+  int dims;
+};
+
+// Resizes output array based on the input size and padding size. This function
+// is callable from both Prepare() and Eval() as long as the caller ensures the
+// paddings data is present.
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                PadContext* op_context) {
+  // Ensures the paddings array is dims x 2.
+  TF_LITE_ENSURE_EQ(context, SizeOfDimension(op_context->paddings, 0),
+                    op_context->dims);
+  TF_LITE_ENSURE_EQ(context, SizeOfDimension(op_context->paddings, 1), 2);
+
+  // Determines the size of the output tensor.
+  TfLiteIntArray* input_size = op_context->input->dims;
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input_size);
+  const int32* paddings_data = GetTensorData<int32>(op_context->paddings);
+
+  for (int idx = 0; idx < op_context->dims; ++idx) {
+    int before_padding = *paddings_data++;
+    int after_padding = *paddings_data++;
+
+    TF_LITE_ENSURE_MSG(context, (before_padding >= 0 && after_padding >= 0),
+                       "Pad value has to be greater than equal to 0.");
+
+    output_size->data[idx] =
+        (input_size->data[idx] + before_padding + after_padding);
+  }
+
+  return context->ResizeTensor(context, op_context->output, output_size);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  PadContext op_context(context, node);
+  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
+
+  // TODO(nupurgarg): Our current implementations rely on the inputs being 4D.
+  TF_LITE_ENSURE_EQ(context, op_context.dims, 4);
+
+  // Exit early if paddings is a non-const tensor. Set output tensor to
+  // dynamic so output size can be determined in Eval.
+  if (!IsConstantTensor(op_context.paddings)) {
+    SetTensorToDynamic(op_context.output);
+    return kTfLiteOk;
+  }
+  return ResizeOutputTensor(context, &op_context);
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  PadContext op_context(context, node);
+
+  // Resize the output tensor if the output tensor is dynamic.
+  if (IsDynamicTensor(op_context.output)) {
+    TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+  }
+
+  // TODO(nupurgarg): Change kernel implementation to take in int* instead of
+  // vector<int> to remove malloc from Eval().
+  // Create before and after padding arrays that are accepted by the kernel.
+  std::vector<int> before_padding;
+  std::vector<int> after_padding;
+  const int32* paddings_data = GetTensorData<int32>(op_context.paddings);
+
+  // TODO(nupurgarg): Change kernel implementation to use padding arrays in
+  // forward order (depth, width, height, batch).
+  // Build paddings in order of int[] = {batch, height, width, depth} to match
+  // kernel implementation of Pad in referenced_ops.h and optimized_ops.h.
+  for (int idx = op_context.dims - 1; idx >= 0; --idx) {
+    before_padding.push_back(paddings_data[idx * 2]);
+    after_padding.push_back(paddings_data[idx * 2 + 1]);
+  }
+
+#define TF_LITE_PAD(type, scalar)                                           \
+  type::Pad(GetTensorData<scalar>(op_context.input),                        \
+            GetTensorDims(op_context.input), before_padding, after_padding, \
+            GetTensorData<scalar>(op_context.output),                       \
+            GetTensorDims(op_context.output))
+
+  switch (op_context.input->type) {
+    case kTfLiteFloat32:
+      if (kernel_type == kReference) {
+        TF_LITE_PAD(reference_ops, float);
+      } else if (kernel_type == kGenericOptimized) {
+        TF_LITE_PAD(optimized_ops, float);
+      }
+      break;
+    case kTfLiteUInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_PAD(reference_ops, uint8_t);
+      } else if (kernel_type == kGenericOptimized) {
+        TF_LITE_PAD(optimized_ops, uint8_t);
+      }
+      break;
+    case kTfLiteInt32:
+      if (kernel_type == kReference) {
+        TF_LITE_PAD(reference_ops, int32_t);
+      } else if (kernel_type == kGenericOptimized) {
+        TF_LITE_PAD(optimized_ops, int32_t);
+      }
+      break;
+    case kTfLiteInt64:
+      if (kernel_type == kReference) {
+        TF_LITE_PAD(reference_ops, int64_t);
+      } else if (kernel_type == kGenericOptimized) {
+        TF_LITE_PAD(optimized_ops, int64_t);
+      }
+      break;
+    default:
+      context->ReportError(context, "Type is currently not supported by Pad.");
+      return kTfLiteError;
+  }
+#undef TF_LITE_PAD
+  return kTfLiteOk;
+}
+
+}  // namespace pad
+
+TfLiteRegistration* Register_PAD_REF() {
+  static TfLiteRegistration r = {nullptr, nullptr, pad::Prepare,
+                                 pad::Eval<pad::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_PAD_GENERIC_OPT() {
+  static TfLiteRegistration r = {nullptr, nullptr, pad::Prepare,
+                                 pad::Eval<pad::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_PAD() { return Register_PAD_GENERIC_OPT(); }
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/pad_test.cc b/tensorflow/contrib/lite/kernels/pad_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..28834ad0719291b2e868bca2d86a6685e6eb9962
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/pad_test.cc
@@ -0,0 +1,154 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class PadOpModel : public SingleOpModel {
+ public:
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor<float>(input_, data);
+  }
+
+  void SetPaddings(std::initializer_list<int> paddings) {
+    PopulateTensor<int>(paddings_, paddings);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input_;
+  int output_;
+  int paddings_;
+};
+
+// Tests case where paddings is a const tensor.
+//
+// Example usage is as follows:
+//    PadOpDynamicModel m(input_shape, paddings_shape, paddings_data);
+//    m.SetInput(input_data);
+//    m.Invoke();
+class PadOpConstModel : public PadOpModel {
+ public:
+  PadOpConstModel(std::initializer_list<int> input_shape,
+                  std::initializer_list<int> paddings_shape,
+                  std::initializer_list<int> paddings) {
+    input_ = AddInput(TensorType_FLOAT32);
+    paddings_ = AddConstInput(TensorType_INT32, paddings, paddings_shape);
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_PAD, BuiltinOptions_PadOptions,
+                 CreatePadOptions(builder_).Union());
+    BuildInterpreter({input_shape});
+  }
+};
+
+// Test case where paddings is a non-const tensor.
+//
+// Example usage is as follows:
+//    PadOpDynamicModel m(input_shape, paddings_shape);
+//    m.SetInput(input_data);
+//    m.SetPaddings(paddings_data);
+//    m.Invoke();
+class PadOpDynamicModel : public PadOpModel {
+ public:
+  PadOpDynamicModel(std::initializer_list<int> input_shape,
+                    std::initializer_list<int> paddings_shape) {
+    input_ = AddInput(TensorType_FLOAT32);
+    paddings_ = AddInput(TensorType_INT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_PAD, BuiltinOptions_PadOptions,
+                 CreatePadOptions(builder_).Union());
+    BuildInterpreter({input_shape, paddings_shape});
+  }
+};
+
+TEST(PadOpTest, TooManyDimensions) {
+  EXPECT_DEATH(
+      PadOpConstModel({1, 2, 3, 4, 5, 6, 7, 8, 9}, {9, 2},
+                      {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9}),
+      "dims != 4");
+}
+
+TEST(PadOpTest, UnequalDimensions) {
+  EXPECT_DEATH(PadOpConstModel({1, 1, 2, 1}, {3, 2}, {1, 1, 2, 2, 3, 3}),
+               "3 != 4");
+}
+
+TEST(PadOpTest, InvalidPadValue) {
+  EXPECT_DEATH(
+      PadOpConstModel({1, 1, 2, 1}, {4, 2}, {0, 0, 1, -1, 2, -1, 0, 0}),
+      "Pad value has to be greater than equal to 0.");
+}
+
+TEST(PadOpTest, SimpleConstTest) {
+  // Padding is represented as four 2-D lists representing above padding and
+  // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
+  PadOpConstModel m({1, 2, 2, 1}, {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0});
+  m.SetInput({1, 2, 3, 4});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4,
+                                               0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST(PadOpTest, SimpleDynamicTest) {
+  PadOpDynamicModel m({1, 2, 2, 1}, {4, 2});
+  m.SetInput({1, 2, 3, 4});
+  m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4,
+                                               0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST(PadOpTest, AdvancedConstTest) {
+  PadOpConstModel m({1, 2, 3, 1}, {4, 2}, {0, 0, 0, 2, 1, 3, 0, 0});
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0,
+                                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
+}
+
+TEST(PadOpTest, AdvancedDynamicTest) {
+  PadOpDynamicModel m({1, 2, 3, 1}, {4, 2});
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0,
+                                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/padding.h b/tensorflow/contrib/lite/kernels/padding.h
index 3a60274524c468ef29e522de5569e0d8354974c2..40b8476b3779c66e31a04856bce8aebd378f1e5f 100644
--- a/tensorflow/contrib/lite/kernels/padding.h
+++ b/tensorflow/contrib/lite/kernels/padding.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
 
 namespace tflite {
 
@@ -25,4 +25,4 @@ inline int ComputePadding(int stride, int in_size, int filter_size,
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
diff --git a/tensorflow/contrib/lite/kernels/pooling_test.cc b/tensorflow/contrib/lite/kernels/pooling_test.cc
index e1b51ec7d5141bf2a41e7ede3e90ff20ec523819..01c91b2ba905e249c36af19f175c68a7e7f17f6d 100644
--- a/tensorflow/contrib/lite/kernels/pooling_test.cc
+++ b/tensorflow/contrib/lite/kernels/pooling_test.cc
@@ -155,7 +155,7 @@ TEST(FloatPoolingOpTest, L2Pool) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index ca7a0dd1949a3a31d26be770a7df781cc5fe7533..0f365078cdf4f43d545a69cf5b4ac4d353615106 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -20,7 +20,7 @@ namespace ops {
 namespace builtin {
 
 TfLiteRegistration* Register_RELU();
-TfLiteRegistration* Register_RELU1();
+TfLiteRegistration* Register_RELU_N1_TO_1();
 TfLiteRegistration* Register_RELU6();
 TfLiteRegistration* Register_TANH();
 TfLiteRegistration* Register_LOGISTIC();
@@ -31,6 +31,8 @@ TfLiteRegistration* Register_CONV_2D();
 TfLiteRegistration* Register_DEPTHWISE_CONV_2D();
 TfLiteRegistration* Register_SVDF();
 TfLiteRegistration* Register_RNN();
+TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_RNN();
+TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_RNN();
 TfLiteRegistration* Register_EMBEDDING_LOOKUP();
 TfLiteRegistration* Register_EMBEDDING_LOOKUP_SPARSE();
 TfLiteRegistration* Register_FULLY_CONNECTED();
@@ -39,18 +41,30 @@ TfLiteRegistration* Register_HASHTABLE_LOOKUP();
 TfLiteRegistration* Register_SOFTMAX();
 TfLiteRegistration* Register_CONCATENATION();
 TfLiteRegistration* Register_ADD();
+TfLiteRegistration* Register_SPACE_TO_BATCH_ND();
+TfLiteRegistration* Register_DIV();
+TfLiteRegistration* Register_SUB();
+TfLiteRegistration* Register_BATCH_TO_SPACE_ND();
 TfLiteRegistration* Register_MUL();
 TfLiteRegistration* Register_L2_NORMALIZATION();
 TfLiteRegistration* Register_LOCAL_RESPONSE_NORMALIZATION();
 TfLiteRegistration* Register_LSTM();
+TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_LSTM();
+TfLiteRegistration* Register_PAD();
 TfLiteRegistration* Register_RESHAPE();
 TfLiteRegistration* Register_RESIZE_BILINEAR();
 TfLiteRegistration* Register_SKIP_GRAM();
 TfLiteRegistration* Register_SPACE_TO_DEPTH();
+TfLiteRegistration* Register_GATHER();
+TfLiteRegistration* Register_TRANSPOSE();
+TfLiteRegistration* Register_MEAN();
+TfLiteRegistration* Register_SQUEEZE();
+TfLiteRegistration* Register_STRIDED_SLICE();
+TfLiteRegistration* Register_EXP();
 
 BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_RELU, Register_RELU());
-  AddBuiltin(BuiltinOperator_RELU1, Register_RELU1());
+  AddBuiltin(BuiltinOperator_RELU_N1_TO_1, Register_RELU_N1_TO_1());
   AddBuiltin(BuiltinOperator_RELU6, Register_RELU6());
   AddBuiltin(BuiltinOperator_TANH, Register_TANH());
   AddBuiltin(BuiltinOperator_LOGISTIC, Register_LOGISTIC());
@@ -61,6 +75,10 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, Register_DEPTHWISE_CONV_2D());
   AddBuiltin(BuiltinOperator_SVDF, Register_SVDF());
   AddBuiltin(BuiltinOperator_RNN, Register_RNN());
+  AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN,
+             Register_BIDIRECTIONAL_SEQUENCE_RNN());
+  AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN,
+             Register_UNIDIRECTIONAL_SEQUENCE_RNN());
   AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP, Register_EMBEDDING_LOOKUP());
   AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP_SPARSE,
              Register_EMBEDDING_LOOKUP_SPARSE());
@@ -70,15 +88,28 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX());
   AddBuiltin(BuiltinOperator_CONCATENATION, Register_CONCATENATION());
   AddBuiltin(BuiltinOperator_ADD, Register_ADD());
+  AddBuiltin(BuiltinOperator_SPACE_TO_BATCH_ND, Register_SPACE_TO_BATCH_ND());
+  AddBuiltin(BuiltinOperator_BATCH_TO_SPACE_ND, Register_BATCH_TO_SPACE_ND());
   AddBuiltin(BuiltinOperator_MUL, Register_MUL());
   AddBuiltin(BuiltinOperator_L2_NORMALIZATION, Register_L2_NORMALIZATION());
   AddBuiltin(BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
              Register_LOCAL_RESPONSE_NORMALIZATION());
   AddBuiltin(BuiltinOperator_LSTM, Register_LSTM());
+  AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
+             Register_UNIDIRECTIONAL_SEQUENCE_LSTM());
+  AddBuiltin(BuiltinOperator_PAD, Register_PAD());
   AddBuiltin(BuiltinOperator_RESHAPE, Register_RESHAPE());
   AddBuiltin(BuiltinOperator_RESIZE_BILINEAR, Register_RESIZE_BILINEAR());
   AddBuiltin(BuiltinOperator_SKIP_GRAM, Register_SKIP_GRAM());
   AddBuiltin(BuiltinOperator_SPACE_TO_DEPTH, Register_SPACE_TO_DEPTH());
+  AddBuiltin(BuiltinOperator_GATHER, Register_GATHER());
+  AddBuiltin(BuiltinOperator_TRANSPOSE, Register_TRANSPOSE());
+  AddBuiltin(BuiltinOperator_MEAN, Register_MEAN());
+  AddBuiltin(BuiltinOperator_DIV, Register_DIV());
+  AddBuiltin(BuiltinOperator_SUB, Register_SUB());
+  AddBuiltin(BuiltinOperator_SQUEEZE, Register_SQUEEZE());
+  AddBuiltin(BuiltinOperator_STRIDED_SLICE, Register_STRIDED_SLICE());
+  AddBuiltin(BuiltinOperator_EXP, Register_EXP());
 }
 
 TfLiteRegistration* BuiltinOpResolver::FindOp(
diff --git a/tensorflow/contrib/lite/kernels/register.h b/tensorflow/contrib/lite/kernels/register.h
index 28f5e0fcc80a14cf9fb6fb19b795d0c0d55e0df9..b9cff0ae21086b44e0c920095d5f6c9668346f38 100644
--- a/tensorflow/contrib/lite/kernels/register.h
+++ b/tensorflow/contrib/lite/kernels/register.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_REGISTER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_REGISTER_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_REGISTER_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_REGISTER_H_
 
 #include <unordered_map>
 #include "tensorflow/contrib/lite/context.h"
@@ -47,4 +47,4 @@ class BuiltinOpResolver : public OpResolver {
 }  // namespace ops
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_BUILTIN_KERNELS_H
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_BUILTIN_KERNELS_H
diff --git a/tensorflow/contrib/lite/kernels/reshape_test.cc b/tensorflow/contrib/lite/kernels/reshape_test.cc
index 59ce7d5648c04f78123b16a195d3a4928d28394b..0fbcf6e6aa311d2cac491336ee54ccf58bbda8fd 100644
--- a/tensorflow/contrib/lite/kernels/reshape_test.cc
+++ b/tensorflow/contrib/lite/kernels/reshape_test.cc
@@ -83,8 +83,7 @@ TEST(ReshapeOpTest, WithStretchDimension) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
-  tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear.cc b/tensorflow/contrib/lite/kernels/resize_bilinear.cc
index 1613c9a89faa3579b913408cc09cdad7f942cb99..9e3e19c09a4012ebdadbc2a7c2ba06c4bfefd206 100644
--- a/tensorflow/contrib/lite/kernels/resize_bilinear.cc
+++ b/tensorflow/contrib/lite/kernels/resize_bilinear.cc
@@ -33,32 +33,44 @@ enum KernelType {
 };
 
 constexpr int kInputTensor = 0;
+constexpr int kSizeTensor = 1;
 constexpr int kOutputTensor = 0;
 
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  auto* params =
-      reinterpret_cast<TfLiteResizeBilinearParams*>(node->builtin_data);
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context, TfLiteTensor* input,
+                                TfLiteTensor* size, TfLiteTensor* output) {
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
+  output_size->data[0] = input->dims->data[0];
+  const int32* size_data = GetTensorData<int32>(size);
+  output_size->data[1] = size_data[0];
+  output_size->data[2] = size_data[1];
+  output_size->data[3] = input->dims->data[3];
+  return context->ResizeTensor(context, output, output_size);
+}
 
-  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
   TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* size = GetInput(context, node, kSizeTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   // TODO(ahentz): Our current implementations rely on the inputs being 4D.
   TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(size), 1);
 
   // TODO(ahentz): Our current implementations only support float32.
-  TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
-  TF_LITE_ENSURE_EQ(context, input->type, output->type);
-
-  TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
-  output_size->data[0] = input->dims->data[0];
-  output_size->data[1] = params->new_height;
-  output_size->data[2] = params->new_width;
-  output_size->data[3] = input->dims->data[3];
-
-  return context->ResizeTensor(context, output, output_size);
+  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, size->type, kTfLiteInt32);
+  // ResizeBilinear creates a float tensor even when the input is made of
+  // integers.
+  output->type = kTfLiteFloat32;
+
+  if (!IsConstantTensor(size)) {
+    SetTensorToDynamic(output);
+    return kTfLiteOk;
+  }
+  return ResizeOutputTensor(context, input, size, output);
 }
 
 template <KernelType kernel_type>
@@ -68,15 +80,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TfLiteTensor* size = GetInput(context, node, kSizeTensor);
 
-  // We have to fake a tensor here, to satisfy ResizeBilinear().
-  int32 output_size_data[2] = {params->new_height, params->new_width};
+  if (IsDynamicTensor(output)) {
+    TF_LITE_ENSURE_OK(context,
+                      ResizeOutputTensor(context, input, size, output));
+  }
 
   if (output->type == kTfLiteFloat32) {
-#define TF_LITE_RESIZE_BILINEAR(type)                                     \
-  type::ResizeBilinear(GetTensorData<float>(input), GetTensorDims(input), \
-                       output_size_data, GetTensorDims({1, 1, 1, 2}),     \
-                       GetTensorData<float>(output), GetTensorDims(output))
+#define TF_LITE_RESIZE_BILINEAR(type)                                       \
+  type::ResizeBilinear(GetTensorData<float>(input), GetTensorDims(input),   \
+                       GetTensorData<int32>(size), GetTensorDims(size),     \
+                       GetTensorData<float>(output), GetTensorDims(output), \
+                       params->align_corners)
 
     if (kernel_type == kReference) {
       TF_LITE_RESIZE_BILINEAR(reference_ops);
diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
index 0257c0b557feb352413bcc33cb4e2ecdb32c5111..4e03f3820a5c14ee1692c553db61e385716b1723 100644
--- a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
+++ b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
@@ -25,63 +25,101 @@ using ::testing::ElementsAreArray;
 
 class ResizeBilinearOpModel : public SingleOpModel {
  public:
-  ResizeBilinearOpModel(std::initializer_list<int> input_shape, int new_height,
-                        int new_width) {
-    input_ = AddInput(TensorType_FLOAT32);
-    output_ = AddOutput(TensorType_FLOAT32);
-    SetBuiltinOp(
-        BuiltinOperator_RESIZE_BILINEAR, BuiltinOptions_ResizeBilinearOptions,
-        CreateResizeBilinearOptions(builder_, new_height, new_width).Union());
-    BuildInterpreter({input_shape});
+  ResizeBilinearOpModel(const TensorData& input,
+                        std::initializer_list<int> size_data = {}) {
+    bool const_size = size_data.size() != 0;
+    input_ = AddInput(input);
+    if (const_size) {
+      size_ = AddConstInput(TensorType_INT32, size_data, {2});
+    } else {
+      size_ = AddInput({TensorType_INT32, {2}});
+    }
+    output_ = AddOutput(TensorType_FLOAT32);  // Always float.
+    SetBuiltinOp(BuiltinOperator_RESIZE_BILINEAR,
+                 BuiltinOptions_ResizeBilinearOptions,
+                 CreateResizeBilinearOptions(builder_).Union());
+    if (const_size) {
+      BuildInterpreter({GetShape(input_)});
+    } else {
+      BuildInterpreter({GetShape(input_), GetShape(size_)});
+    }
   }
 
   void SetInput(std::initializer_list<float> data) {
     PopulateTensor(input_, data);
   }
+  void SetSize(std::initializer_list<int> data) { PopulateTensor(size_, data); }
 
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
 
  private:
   int input_;
+  int size_;
   int output_;
 };
 
 TEST(ResizeBilinearOpTest, HorizontalResize) {
-  ResizeBilinearOpModel m({1, 1, 2, 1}, 1, 3);
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 1, 2, 1}});
   m.SetInput({3, 6});
+  m.SetSize({1, 3});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 5, 6})));
+
+  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 1, 2, 1}}, {1, 3});
+  const_m.SetInput({3, 6});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 5, 6})));
 }
 
 TEST(ResizeBilinearOpTest, VerticalResize) {
-  ResizeBilinearOpModel m({1, 2, 1, 1}, 3, 1);
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 1, 1}});
   m.SetInput({3, 9});
+  m.SetSize({3, 1});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 7, 9})));
+
+  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 1, 1}}, {3, 1});
+  const_m.SetInput({3, 9});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 7, 9})));
 }
 
 TEST(ResizeBilinearOpTest, TwoDimensionalResize) {
-  ResizeBilinearOpModel m({1, 2, 2, 1}, 3, 3);
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}});
   m.SetInput({
       3, 6,  //
       9, 12  //
   });
+  m.SetSize({3, 3});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
                                  3, 5, 6,    //
                                  7, 9, 10,   //
                                  9, 11, 12,  //
                              })));
+
+  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 2, 1}}, {3, 3});
+  const_m.SetInput({
+      3, 6,  //
+      9, 12  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                       3, 5, 6,    //
+                                       7, 9, 10,   //
+                                       9, 11, 12,  //
+                                   })));
 }
 
 TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) {
-  ResizeBilinearOpModel m({2, 2, 2, 1}, 3, 3);
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {2, 2, 2, 1}});
   m.SetInput({
       3, 6,   //
       9, 12,  //
       4, 10,  //
       10, 16  //
   });
+  m.SetSize({3, 3});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
                                  3, 5, 6,     //
@@ -91,27 +129,57 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) {
                                  8, 12, 14,   //
                                  10, 14, 16,  //
                              })));
+
+  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {2, 2, 2, 1}}, {3, 3});
+  const_m.SetInput({
+      3, 6,   //
+      9, 12,  //
+      4, 10,  //
+      10, 16  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                       3, 5, 6,     //
+                                       7, 9, 10,    //
+                                       9, 11, 12,   //
+                                       4, 8, 10,    //
+                                       8, 12, 14,   //
+                                       10, 14, 16,  //
+                                   })));
 }
 
 TEST(ResizeBilinearOpTest, ThreeDimensionalResize) {
-  ResizeBilinearOpModel m({1, 2, 2, 2}, 3, 3);
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 2}});
   m.SetInput({
       3, 4, 6, 10,    //
       9, 10, 12, 16,  //
   });
+  m.SetSize({3, 3});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
                                  3, 4, 5, 8, 6, 10,      //
                                  7, 8, 9, 12, 10, 14,    //
                                  9, 10, 11, 14, 12, 16,  //
                              })));
+
+  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 2, 2}}, {3, 3});
+  const_m.SetInput({
+      3, 4, 6, 10,    //
+      9, 10, 12, 16,  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                       3, 4, 5, 8, 6, 10,      //
+                                       7, 8, 9, 12, 10, 14,    //
+                                       9, 10, 11, 14, 12, 16,  //
+                                   })));
 }
 
 }  // namespace
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/skip_gram_test.cc b/tensorflow/contrib/lite/kernels/skip_gram_test.cc
index e7f6bc904be5e4c23a88f5b4ae7e199346c78ab2..185b64cb44969b57588ea5d0b40f55b6ddf8e11f 100644
--- a/tensorflow/contrib/lite/kernels/skip_gram_test.cc
+++ b/tensorflow/contrib/lite/kernels/skip_gram_test.cc
@@ -251,7 +251,7 @@ TEST(SkipGramTest, TestInputWithExtraSpace) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/softmax_test.cc b/tensorflow/contrib/lite/kernels/softmax_test.cc
index ec8ec03b0d0279cad8543352b1dbaf34c88a7957..6c5338ff0fd26337c9adc8e0b94a0a88edfde37f 100644
--- a/tensorflow/contrib/lite/kernels/softmax_test.cc
+++ b/tensorflow/contrib/lite/kernels/softmax_test.cc
@@ -136,8 +136,7 @@ TEST(SoftmaxOpTest, CompareWithTFminiBetaNotEq1) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
-  tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc b/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d8c9e352f00627eee45ae836b720f2af77140538
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc
@@ -0,0 +1,186 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include <vector>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace space_to_batch_nd {
+
+// This file has two implementations of SpaceToBatchND.
+enum KernelType {
+  kReference,
+  kGenericOptimized,
+};
+
+struct SpaceToBatchNDContext {
+  SpaceToBatchNDContext(TfLiteContext* context, TfLiteNode* node) {
+    input = GetInput(context, node, 0);
+    block_shape = GetInput(context, node, 1);
+    paddings = GetInput(context, node, 2);
+    output = GetOutput(context, node, 0);
+  }
+  TfLiteTensor* input;
+  TfLiteTensor* block_shape;
+  TfLiteTensor* paddings;
+  TfLiteTensor* output;
+};
+
+// Currently, only 4D NHWC input/output op_context are supported.
+// The 4D array need to have exactly 2 spatial dimensions.
+// TODO(nupurgarg): Support arbitrary dimension in SpaceToBatchND.
+const int kInputDimensionNum = 4;
+const int kBlockSizeDimensionNum = 1;
+const int kSpatialDimensionNum = 2;
+
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                SpaceToBatchNDContext* op_context) {
+  TfLiteIntArray* input_size = op_context->input->dims;
+  const int32* block_shape = GetTensorData<int32>(op_context->block_shape);
+  const int32* paddings_data = GetTensorData<int32>(op_context->paddings);
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->block_shape),
+                    kBlockSizeDimensionNum);
+  TF_LITE_ENSURE_EQ(context, op_context->block_shape->dims->data[0],
+                    kSpatialDimensionNum);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->paddings),
+                    kSpatialDimensionNum);
+
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input_size);
+
+  // Ensures the input height and width (with padding) is a multiple of block
+  // shape height and width.
+  for (int dim = 0; dim < kSpatialDimensionNum; ++dim) {
+    int final_dim_size = (input_size->data[dim + 1] + paddings_data[dim * 2] +
+                          paddings_data[dim * 2 + 1]);
+    TF_LITE_ENSURE_EQ(context, final_dim_size % block_shape[dim], 0);
+    output_size->data[dim + 1] = final_dim_size / block_shape[dim];
+  }
+
+  const int output_batch_size =
+      input_size->data[0] * block_shape[0] * block_shape[1];
+  const int output_channel_size = input_size->data[3];
+
+  output_size->data[0] = output_batch_size;
+  output_size->data[3] = output_channel_size;
+
+  return context->ResizeTensor(context, op_context->output, output_size);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  SpaceToBatchNDContext op_context(context, node);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.input),
+                    kInputDimensionNum);
+  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
+
+  if (!IsConstantTensor(op_context.block_shape) ||
+      !IsConstantTensor(op_context.paddings)) {
+    SetTensorToDynamic(op_context.output);
+    return kTfLiteOk;
+  }
+  return ResizeOutputTensor(context, &op_context);
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  SpaceToBatchNDContext op_context(context, node);
+
+  // Resize the output tensor if the output tensor is dynamic.
+  if (IsDynamicTensor(op_context.output)) {
+    TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+  }
+
+#define TF_LITE_SPACE_TO_BATCH_ND(type, scalar)                        \
+  type::SpaceToBatchND(GetTensorData<scalar>(op_context.input),        \
+                       GetTensorDims(op_context.input),                \
+                       GetTensorData<int32_t>(op_context.block_shape), \
+                       GetTensorDims(op_context.block_shape),          \
+                       GetTensorData<int32_t>(op_context.paddings),    \
+                       GetTensorDims(op_context.paddings),             \
+                       GetTensorData<scalar>(op_context.output),       \
+                       GetTensorDims(op_context.output))
+  switch (op_context.input->type) {  // Already know in/out types are same.
+    case kTfLiteFloat32:
+      if (kernel_type == kReference) {
+        TF_LITE_SPACE_TO_BATCH_ND(reference_ops, float);
+      } else {
+        TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, float);
+      }
+      break;
+    case kTfLiteUInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_SPACE_TO_BATCH_ND(reference_ops, uint8_t);
+      } else {
+        TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, uint8_t);
+      }
+      break;
+    case kTfLiteInt32:
+      if (kernel_type == kReference) {
+        TF_LITE_SPACE_TO_BATCH_ND(reference_ops, int32_t);
+      } else {
+        TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, int32_t);
+      }
+      break;
+    case kTfLiteInt64:
+      if (kernel_type == kReference) {
+        TF_LITE_SPACE_TO_BATCH_ND(reference_ops, int64_t);
+      } else {
+        TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, int64_t);
+      }
+      break;
+    default:
+      context->ReportError(context,
+                           "Type is currently not supported by SpaceToBatch.");
+      return kTfLiteError;
+  }
+#undef TF_LITE_SPACE_TO_BATCH_ND
+  return kTfLiteOk;
+}
+
+}  // namespace space_to_batch_nd
+
+TfLiteRegistration* Register_SPACE_TO_BATCH_ND_REF() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, space_to_batch_nd::Prepare,
+      space_to_batch_nd::Eval<space_to_batch_nd::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_SPACE_TO_BATCH_ND_GENERIC_OPT() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, space_to_batch_nd::Prepare,
+      space_to_batch_nd::Eval<space_to_batch_nd::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_SPACE_TO_BATCH_ND() {
+  // return Register_SPACE_TO_BATCH_ND_REF();
+  return Register_SPACE_TO_BATCH_ND_GENERIC_OPT();
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc b/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..92a4a037d5873e608ee7bdbdfc5eaa5e9b62bc8c
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc
@@ -0,0 +1,199 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class SpaceToBatchNDOpModel : public SingleOpModel {
+ public:
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor<float>(input_, data);
+  }
+
+  void SetBlockShape(std::initializer_list<int> data) {
+    PopulateTensor<int>(block_shape_, data);
+  }
+
+  void SetPaddings(std::initializer_list<int> data) {
+    PopulateTensor<int>(paddings_, data);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input_;
+  int block_shape_;
+  int paddings_;
+  int output_;
+};
+
+// Tests case where block_shape and paddings are const tensors.
+//
+// Example usage is as follows:
+//    SpaceToBatchNDOpConstModel m(input_shape, block_shape, paddings);
+//    m.SetInput(input_data);
+//    m.Invoke();
+class SpaceToBatchNDOpConstModel : public SpaceToBatchNDOpModel {
+ public:
+  SpaceToBatchNDOpConstModel(std::initializer_list<int> input_shape,
+                             std::initializer_list<int> block_shape,
+                             std::initializer_list<int> paddings) {
+    input_ = AddInput(TensorType_FLOAT32);
+    block_shape_ = AddConstInput(TensorType_INT32, block_shape, {2});
+    paddings_ = AddConstInput(TensorType_INT32, paddings, {2, 2});
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_SPACE_TO_BATCH_ND,
+                 BuiltinOptions_SpaceToBatchNDOptions,
+                 CreateSpaceToBatchNDOptions(builder_).Union());
+    BuildInterpreter({input_shape});
+  }
+};
+
+// Tests case where block_shape and paddings are non-const tensors.
+//
+// Example usage is as follows:
+//    SpaceToBatchNDOpDynamicModel m(input_shape);
+//    m.SetInput(input_data);
+//    m.SetBlockShape(block_shape);
+//    m.SetPaddings(paddings);
+//    m.Invoke();
+class SpaceToBatchNDOpDynamicModel : public SpaceToBatchNDOpModel {
+ public:
+  SpaceToBatchNDOpDynamicModel(std::initializer_list<int> input_shape) {
+    input_ = AddInput(TensorType_FLOAT32);
+    block_shape_ = AddInput(TensorType_INT32);
+    paddings_ = AddInput(TensorType_INT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_SPACE_TO_BATCH_ND,
+                 BuiltinOptions_SpaceToBatchNDOptions,
+                 CreateSpaceToBatchNDOptions(builder_).Union());
+    BuildInterpreter({input_shape, {2}, {2, 2}});
+  }
+};
+
+TEST(SpaceToBatchNDOpTest, InvalidShapeTest) {
+  EXPECT_DEATH(SpaceToBatchNDOpConstModel({1, 3, 3, 1}, {2, 2}, {0, 0, 0, 0}),
+               "Cannot allocate tensors");
+}
+
+TEST(SpaceToBatchNDOpTest, SimpleConstTest) {
+  SpaceToBatchNDOpConstModel m({1, 4, 4, 1}, {2, 2}, {0, 0, 0, 0});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 2, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 9, 11, 2, 4, 10, 12, 5, 7,
+                                               13, 15, 6, 8, 14, 16}));
+}
+
+TEST(SpaceToBatchNDOpTest, SimpleDynamicTest) {
+  SpaceToBatchNDOpDynamicModel m({1, 4, 4, 1});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetBlockShape({2, 2});
+  m.SetPaddings({0, 0, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 2, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 9, 11, 2, 4, 10, 12, 5, 7,
+                                               13, 15, 6, 8, 14, 16}));
+}
+
+TEST(SpaceToBatchNDOpTest, MultipleInputBatchesConstTest) {
+  SpaceToBatchNDOpConstModel m({2, 2, 4, 1}, {2, 2}, {0, 0, 0, 0});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({8, 1, 2, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 9, 11, 2, 4, 10, 12, 5, 7,
+                                               13, 15, 6, 8, 14, 16}));
+}
+
+TEST(SpaceToBatchNDOpTest, MultipleInputBatchesDynamicTest) {
+  SpaceToBatchNDOpDynamicModel m({2, 2, 4, 1});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetBlockShape({2, 2});
+  m.SetPaddings({0, 0, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({8, 1, 2, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 9, 11, 2, 4, 10, 12, 5, 7,
+                                               13, 15, 6, 8, 14, 16}));
+}
+
+TEST(SpaceToBatchNDOpTest, SimplePaddingConstTest) {
+  SpaceToBatchNDOpConstModel m({1, 5, 2, 1}, {3, 2}, {1, 0, 2, 0});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 0, 0, 0, 5, 0, 0, 0, 6, 0, 1, 0, 7,
+                                 0, 2, 0, 8, 0, 3, 0, 9, 0, 4, 0, 10,
+                             }));
+}
+
+TEST(SpaceToBatchNDOpTest, SimplePaddingDynamicTest) {
+  SpaceToBatchNDOpDynamicModel m({1, 5, 2, 1});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+  m.SetBlockShape({3, 2});
+  m.SetPaddings({1, 0, 2, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 0, 0, 0, 5, 0, 0, 0, 6, 0, 1, 0, 7,
+                                 0, 2, 0, 8, 0, 3, 0, 9, 0, 4, 0, 10,
+                             }));
+}
+
+TEST(SpaceToBatchNDOpTest, ComplexPaddingConstTest) {
+  SpaceToBatchNDOpConstModel m({1, 4, 2, 1}, {3, 2}, {1, 1, 2, 4});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0,
+                                 0, 1, 0, 0, 0, 7, 0, 0, 0, 2, 0, 0, 0, 8, 0, 0,
+                                 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0,
+                             }));
+}
+
+TEST(SpaceToBatchNDOpTest, ComplexPaddingDynamicTest) {
+  SpaceToBatchNDOpDynamicModel m({1, 4, 2, 1});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
+  m.SetBlockShape({3, 2});
+  m.SetPaddings({1, 1, 2, 4});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0,
+                                 0, 1, 0, 0, 0, 7, 0, 0, 0, 2, 0, 0, 0, 8, 0, 0,
+                                 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0,
+                             }));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/space_to_depth_test.cc b/tensorflow/contrib/lite/kernels/space_to_depth_test.cc
index 911f08a92ccd6a97bee414c87bd79091808f0ed1..997f354861a235fb511235e4d64544dc8c3ddb34 100644
--- a/tensorflow/contrib/lite/kernels/space_to_depth_test.cc
+++ b/tensorflow/contrib/lite/kernels/space_to_depth_test.cc
@@ -95,8 +95,7 @@ TEST(SpaceToDepthOpModel, Int64) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
-  tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/squeeze.cc b/tensorflow/contrib/lite/kernels/squeeze.cc
new file mode 100644
index 0000000000000000000000000000000000000000..29447ab021c7b68ff51070d35262402e08dc7ab9
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/squeeze.cc
@@ -0,0 +1,99 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include <vector>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace squeeze {
+
+struct SqueezeContext {
+  SqueezeContext(TfLiteContext* context, TfLiteNode* node) {
+    params = reinterpret_cast<TfLiteSqueezeParams*>(node->builtin_data);
+    input = GetInput(context, node, 0);
+    output = GetOutput(context, node, 0);
+  }
+  TfLiteSqueezeParams* params;
+  TfLiteTensor* input;
+  TfLiteTensor* output;
+};
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  SqueezeContext op_context(context, node);
+  int input_num_dims = NumDimensions(op_context.input);
+  int num_squeeze_dims = op_context.params->num_squeeze_dims;
+
+  // Determines number of dimensions of output tensor after squeeze.
+  const TfLiteIntArray* input_dims = op_context.input->dims;
+  const int* squeeze_dims = op_context.params->squeeze_dims;
+  TF_LITE_ENSURE(context, input_num_dims <= 8);
+  bool should_squeeze[8] = {false};
+  int num_squeezed_dims = 0;
+  if (num_squeeze_dims == 0) {
+    for (int idx = 0; idx < input_num_dims; ++idx) {
+      if (input_dims->data[idx] == 1) {
+        should_squeeze[idx] = true;
+        ++num_squeezed_dims;
+      }
+    }
+  } else {
+    for (int idx = 0; idx < num_squeeze_dims; ++idx) {
+      int current = squeeze_dims[idx] < 0 ? squeeze_dims[idx] + input_num_dims
+                                          : squeeze_dims[idx];
+      TF_LITE_ENSURE(context, current >= 0 && current < input_num_dims &&
+                                  input_dims->data[current] == 1);
+      if (!should_squeeze[current]) ++num_squeezed_dims;
+      should_squeeze[current] = true;
+    }
+  }
+  // Sets output dimensions.
+  TfLiteIntArray* output_dims =
+      TfLiteIntArrayCreate(input_num_dims - num_squeezed_dims);
+  for (int in_idx = 0, out_idx = 0; in_idx < input_num_dims; ++in_idx) {
+    if (!should_squeeze[in_idx]) {
+      output_dims->data[out_idx++] = input_dims->data[in_idx];
+    }
+  }
+  return context->ResizeTensor(context, op_context.output, output_dims);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  SqueezeContext op_context(context, node);
+  TF_LITE_ENSURE_EQ(context, op_context.input->bytes, op_context.output->bytes);
+  memcpy(op_context.output->data.raw, op_context.input->data.raw,
+         op_context.input->bytes);
+  return kTfLiteOk;
+}
+
+}  // namespace squeeze
+
+TfLiteRegistration* Register_SQUEEZE() {
+  static TfLiteRegistration r = {nullptr, nullptr, squeeze::Prepare,
+                                 squeeze::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/squeeze_test.cc b/tensorflow/contrib/lite/kernels/squeeze_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a8aab88357cacbb72784a4bc6e860aeb47783eb3
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/squeeze_test.cc
@@ -0,0 +1,124 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+using ::testing::IsEmpty;
+
+class BaseSqueezeOpModel : public SingleOpModel {
+ public:
+  BaseSqueezeOpModel(const TensorData& input, const TensorData& output,
+                     std::initializer_list<int> axis) {
+    input_ = AddInput(input);
+    output_ = AddOutput(output);
+    SetBuiltinOp(
+        BuiltinOperator_SQUEEZE, BuiltinOptions_SqueezeOptions,
+        CreateSqueezeOptions(builder_, builder_.CreateVector<int>(axis))
+            .Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+
+  int input() { return input_; }
+
+ protected:
+  int input_;
+  int output_;
+};
+
+class FloatSqueezeOpModel : public BaseSqueezeOpModel {
+ public:
+  using BaseSqueezeOpModel::BaseSqueezeOpModel;
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+};
+
+TEST(FloatSqueezeOpTest, SqueezeAll) {
+  std::initializer_list<float> data = {
+      1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0,
+      13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+  FloatSqueezeOpModel m({TensorType_FLOAT32, {1, 24, 1}},
+                        {TensorType_FLOAT32, {24}}, {});
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({24}));
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray({1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
+                        9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                        17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}));
+}
+
+TEST(FloatSqueezeOpTest, SqueezeSelectedAxis) {
+  std::initializer_list<float> data = {
+      1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0,
+      13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+  FloatSqueezeOpModel m({TensorType_FLOAT32, {1, 24, 1}},
+                        {TensorType_FLOAT32, {24}}, {2});
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 24}));
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray({1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
+                        9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                        17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}));
+}
+
+TEST(FloatSqueezeOpTest, SqueezeNegativeAxis) {
+  std::initializer_list<float> data = {
+      1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,  9.0,  10.0, 11.0, 12.0,
+      13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+  FloatSqueezeOpModel m({TensorType_FLOAT32, {1, 24, 1}},
+                        {TensorType_FLOAT32, {24}}, {-1, 0});
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({24}));
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray({1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
+                        9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                        17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}));
+}
+
+TEST(FloatSqueezeOpTest, SqueezeAllDims) {
+  std::initializer_list<float> data = {3.85};
+  FloatSqueezeOpModel m({TensorType_FLOAT32, {1, 1, 1, 1, 1, 1, 1}},
+                        {TensorType_FLOAT32, {1}}, {});
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), IsEmpty());
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3.85}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/strided_slice.cc b/tensorflow/contrib/lite/kernels/strided_slice.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fb1e11e0ca00abb36d7f29d562711a7bbcbeca1c
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/strided_slice.cc
@@ -0,0 +1,259 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include <cmath>
+#include <vector>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace strided_slice {
+
+enum KernelType {
+  kReference,
+  // TODO(soroosh): add kGenericOptimized
+};
+
+constexpr int kInputTensor = 0;
+constexpr int kBeginTensor = 1;
+constexpr int kEndTensor = 2;
+constexpr int kStridesTensor = 3;
+constexpr int kOutputTensor = 0;
+
+struct StridedSliceContext {
+  StridedSliceContext(TfLiteContext* context, TfLiteNode* node) {
+    params = reinterpret_cast<TfLiteStridedSliceParams*>(node->builtin_data);
+    input = GetInput(context, node, kInputTensor);
+    begin = GetInput(context, node, kBeginTensor);
+    end = GetInput(context, node, kEndTensor);
+    strides = GetInput(context, node, kStridesTensor);
+    output = GetOutput(context, node, kOutputTensor);
+    dims = NumDimensions(input);
+  }
+  TfLiteStridedSliceParams* params;
+  TfLiteTensor* input;
+  TfLiteTensor* begin;
+  TfLiteTensor* end;
+  TfLiteTensor* strides;
+  TfLiteTensor* output;
+  int dims;
+};
+
+// Reverse order of bits in the mask to match the expected order in kernel
+inline int ReverseMaskBits(int mask, int num_dimensions) {
+  int out = 0;
+  for (int dim = 0; dim < num_dimensions; dim++) {
+    out <<= 1;
+    out += (mask & 1);
+    mask >>= 1;
+  }
+  return out;
+}
+
+// This Op only supports 1-4D cases and since we use the reference 4D
+// implementation, the 1-3D tensors are mapped to 4D.
+const int kMaxDim = 4;
+
+inline int32_t PositiveRemainder(int32_t dividend, int32_t divisor) {
+  return (divisor + (dividend % divisor)) % divisor;
+}
+
+inline int32_t ClampedIndex(int32_t index, int dim, bool pos_stride) {
+  return pos_stride
+             ? (index >= dim ? dim
+                             : PositiveRemainder(
+                                   std::min(std::max(index, -dim), dim), dim))
+             : (index < -dim
+                    ? -1
+                    : PositiveRemainder(
+                          std::min(std::max(index, -dim), dim - 1), dim));
+}
+
+inline int32_t GetBeginValueAtIndex(StridedSliceContext* op_context, int idx) {
+  const int dim = op_context->input->dims->data[idx];
+  const bool pos_stride = GetTensorData<int32_t>(op_context->strides)[idx] > 0;
+  return op_context->params->begin_mask & (1 << idx)
+             ? pos_stride ? 0 : dim - 1
+             : ClampedIndex(GetTensorData<int32_t>(op_context->begin)[idx], dim,
+                            pos_stride);
+}
+
+inline int32_t GetEndValueAtIndex(StridedSliceContext* op_context, int idx) {
+  const int dim = op_context->input->dims->data[idx];
+  const bool pos_stride = GetTensorData<int32_t>(op_context->strides)[idx] > 0;
+  return op_context->params->end_mask & (1 << idx)
+             ? pos_stride ? dim : -1
+             : ClampedIndex(GetTensorData<int32_t>(op_context->end)[idx], dim,
+                            pos_stride);
+}
+
+// Processes the indexing tensors (begin, end and strides) to resize the
+// output tensor. This function is callable from both Prepare() and Eval() as
+// long as the caller ensures the indexing tensors are present.
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                StridedSliceContext* op_context) {
+  std::vector<int> output_shape_vector;
+
+  for (int idx = op_context->dims - 1; idx >= 0; --idx) {
+    int32_t stride = GetTensorData<int32_t>(op_context->strides)[idx];
+    TF_LITE_ENSURE_MSG(context, stride != 0, "stride value has to be non-zero");
+
+    int32_t begin = GetBeginValueAtIndex(op_context, idx);
+    int32_t end = GetEndValueAtIndex(op_context, idx);
+
+    // This is valid for both positive and negative strides
+    int32_t dim_shape = ceil((end - begin) / static_cast<float>(stride));
+    dim_shape = dim_shape < 0 ? 0 : dim_shape;
+    if (!(op_context->params->shrink_axis_mask & (1 << idx))) {
+      output_shape_vector.push_back(dim_shape);
+    }
+  }
+
+  TfLiteIntArray* output_shape =
+      TfLiteIntArrayCreate(output_shape_vector.size());
+
+  std::reverse_copy(output_shape_vector.begin(), output_shape_vector.end(),
+                    output_shape->data);
+
+  TF_LITE_ENSURE_STATUS(
+      context->ResizeTensor(context, op_context->output, output_shape));
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 4);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  StridedSliceContext op_context(context, node);
+
+  // Ensure validity of input tensor and its dimension
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.begin), 1);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.end), 1);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context.strides), 1);
+  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
+  // Only INT32 begin/end/strides are supported
+  // TODO(soroosh) add support for INT64
+  TF_LITE_ENSURE_EQ(context, op_context.begin->type, kTfLiteInt32);
+  TF_LITE_ENSURE_EQ(context, op_context.end->type, kTfLiteInt32);
+  TF_LITE_ENSURE_EQ(context, op_context.strides->type, kTfLiteInt32);
+  TF_LITE_ENSURE_MSG(context, op_context.dims <= 4,
+                     "StridedSlice op only supports 1D-4D input arrays.");
+
+  // TODO(soroosh): add the following missing functionalities
+  TF_LITE_ENSURE_MSG(context, op_context.params->ellipsis_mask == 0,
+                     "ellipsis_mask is not implemented yet.");
+  TF_LITE_ENSURE_MSG(context, op_context.params->new_axis_mask == 0,
+                     "new_axis_mask is not implemented yet.");
+
+  // Postpone allocation of output if any of the indexing tensors is not
+  // constant
+  if (!(IsConstantTensor(op_context.begin) &&
+        IsConstantTensor(op_context.end) &&
+        IsConstantTensor(op_context.strides))) {
+    SetTensorToDynamic(op_context.output);
+    return kTfLiteOk;
+  }
+  return ResizeOutputTensor(context, &op_context);
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  StridedSliceContext op_context(context, node);
+
+  if (IsDynamicTensor(op_context.output)) {
+    TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+  }
+
+  std::vector<int32_t> starts;
+  std::vector<int32_t> stops;
+  std::vector<int32_t> strides;
+
+  for (int idx = op_context.dims - 1; idx >= 0; --idx) {
+    starts.emplace_back(GetBeginValueAtIndex(&op_context, idx));
+    stops.emplace_back(GetEndValueAtIndex(&op_context, idx));
+    strides.emplace_back(GetTensorData<int32_t>(op_context.strides)[idx]);
+  }
+
+  for (int i = op_context.dims; i < kMaxDim; i++) {
+    starts.emplace_back(0);
+    stops.emplace_back(1);
+    strides.emplace_back(1);
+  }
+
+  op_context.params->begin_mask =
+      ReverseMaskBits(op_context.params->begin_mask, op_context.dims);
+  op_context.params->end_mask =
+      ReverseMaskBits(op_context.params->end_mask, op_context.dims);
+  op_context.params->shrink_axis_mask =
+      ReverseMaskBits(op_context.params->shrink_axis_mask, op_context.dims);
+
+#define TF_LITE_STRIDED_SLICE(kernel_type, data_type)                      \
+  kernel_type::StridedSlice(                                               \
+      GetTensorData<data_type>(op_context.input),                          \
+      GetTensorDims(op_context.input), op_context.params->begin_mask,      \
+      op_context.params->end_mask, op_context.params->shrink_axis_mask,    \
+      starts, stops, strides, GetTensorData<data_type>(op_context.output), \
+      GetTensorDims(op_context.output))
+
+  switch (op_context.input->type) {
+    case kTfLiteFloat32:
+      if (kernel_type == kReference) {
+        TF_LITE_STRIDED_SLICE(reference_ops, float);
+      }
+      break;
+    case kTfLiteInt32:
+      if (kernel_type == kReference) {
+        TF_LITE_STRIDED_SLICE(reference_ops, int32_t);
+      }
+      break;
+    case kTfLiteInt64:
+      if (kernel_type == kReference) {
+        TF_LITE_STRIDED_SLICE(reference_ops, int64_t);
+      }
+      break;
+    default:
+      context->ReportError(context,
+                           "Type is currently not supported "
+                           "by StridedSlice.");
+      return kTfLiteError;
+  }
+#undef TF_LITE_STRIDED_SLICE
+  return kTfLiteOk;
+}
+
+}  // namespace strided_slice
+
+TfLiteRegistration* Register_STRIDED_SLICE_REF() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, strided_slice::Prepare,
+      strided_slice::Eval<strided_slice::kReference>};
+  return &r;
+}
+
+// TODO(soroosh): add optimized
+TfLiteRegistration* Register_STRIDED_SLICE() {
+  return Register_STRIDED_SLICE_REF();
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/strided_slice_test.cc b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5cac04b38364958c5b0794c21742e8b592372ae9
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
@@ -0,0 +1,532 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::int32;
+using ::testing::ElementsAreArray;
+
+class StridedSliceOpModel : public SingleOpModel {
+ public:
+  StridedSliceOpModel(std::initializer_list<int> input_shape,
+                      std::initializer_list<int> begin_shape,
+                      std::initializer_list<int> end_shape,
+                      std::initializer_list<int> strides_shape, int begin_mask,
+                      int end_mask, int ellipsis_mask, int new_axis_mask,
+                      int shrink_axis_mask) {
+    input_ = AddInput(TensorType_FLOAT32);
+    begin_ = AddInput(TensorType_INT32);
+    end_ = AddInput(TensorType_INT32);
+    strides_ = AddInput(TensorType_INT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(
+        BuiltinOperator_STRIDED_SLICE, BuiltinOptions_StridedSliceOptions,
+        CreateStridedSliceOptions(builder_, begin_mask, end_mask, ellipsis_mask,
+                                  new_axis_mask, shrink_axis_mask)
+            .Union());
+    BuildInterpreter({input_shape, begin_shape, end_shape, strides_shape});
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor<float>(input_, data);
+  }
+  void SetBegin(std::initializer_list<int32> data) {
+    PopulateTensor<int32>(begin_, data);
+  }
+  void SetEnd(std::initializer_list<int32> data) {
+    PopulateTensor<int32>(end_, data);
+  }
+  void SetStrides(std::initializer_list<int32> data) {
+    PopulateTensor<int32>(strides_, data);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int begin_;
+  int end_;
+  int strides_;
+  int output_;
+};
+
+TEST(StridedSliceOpTest, UnsupportedInputSize) {
+  EXPECT_DEATH(
+      StridedSliceOpModel({2, 2, 2, 2, 2}, {5}, {5}, {5}, 0, 0, 0, 0, 0),
+      "StridedSlice op only supports 1D-4D input arrays.");
+}
+
+TEST(StridedSliceOpTest, UnssupportedArgs) {
+  EXPECT_DEATH(StridedSliceOpModel({3, 2}, {2}, {2}, {2}, 0, 0, 1, 0, 0),
+               "ellipsis_mask is not implemented yet.");
+  EXPECT_DEATH(StridedSliceOpModel({3, 2}, {2}, {2}, {2}, 0, 0, 0, 1, 0),
+               "new_axis_mask is not implemented yet.");
+}
+
+TEST(StridedSliceOpTest, In1D) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({1});
+  m.SetEnd({3});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2, 3}));
+}
+
+TEST(StridedSliceOpTest, In1D_EmptyOutput) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({10});
+  m.SetEnd({3});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({0}));
+}
+
+TEST(StridedSliceOpTest, In1D_NegativeBegin) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({-3});
+  m.SetEnd({3});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2, 3}));
+}
+
+TEST(StridedSliceOpTest, In1D_OutOfRangeBegin) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({-5});
+  m.SetEnd({3});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3}));
+}
+
+TEST(StridedSliceOpTest, In1D_NegativeEnd) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({1});
+  m.SetEnd({-2});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2}));
+}
+
+TEST(StridedSliceOpTest, In1D_OutOfRangeEnd) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({-3});
+  m.SetEnd({5});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2, 3, 4}));
+}
+
+TEST(StridedSliceOpTest, In1D_BeginMask) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({1});
+  m.SetEnd({3});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3}));
+}
+
+TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStride) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({-2});
+  m.SetEnd({-3});
+  m.SetStrides({-1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3}));
+}
+
+TEST(StridedSliceOpTest, In1D_OutOfRangeBeginNegativeStride) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({5});
+  m.SetEnd({2});
+  m.SetStrides({-1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({4}));
+}
+
+TEST(StridedSliceOpTest, In1D_NegativeEndNegativeStride) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({2});
+  m.SetEnd({-4});
+  m.SetStrides({-1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 2}));
+}
+
+TEST(StridedSliceOpTest, In1D_OutOfRangeEndNegativeStride) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({-3});
+  m.SetEnd({-5});
+  m.SetStrides({-1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2, 1}));
+}
+
+TEST(StridedSliceOpTest, In1D_EndMask) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 1, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({1});
+  m.SetEnd({3});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2, 3, 4}));
+}
+
+TEST(StridedSliceOpTest, In1D_NegStride) {
+  StridedSliceOpModel m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3});
+  m.SetBegin({-1});
+  m.SetEnd({-4});
+  m.SetStrides({-1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 2, 1}));
+}
+
+TEST(StridedSliceOpTest, In1D_EvenLenStride2) {
+  StridedSliceOpModel m({2}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2});
+  m.SetBegin({0});
+  m.SetEnd({2});
+  m.SetStrides({2});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1}));
+}
+
+TEST(StridedSliceOpTest, In1D_OddLenStride2) {
+  StridedSliceOpModel m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3});
+  m.SetBegin({0});
+  m.SetEnd({3});
+  m.SetStrides({2});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3}));
+}
+
+TEST(StridedSliceOpTest, In2D_Identity) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({0, 0});
+  m.SetEnd({2, 3});
+  m.SetStrides({1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST(StridedSliceOpTest, In2D) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({1, 0});
+  m.SetEnd({2, 2});
+  m.SetStrides({1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({4, 5}));
+}
+
+TEST(StridedSliceOpTest, In2D_Stride2) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({0, 0});
+  m.SetEnd({2, 3});
+  m.SetStrides({2, 2});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3}));
+}
+
+TEST(StridedSliceOpTest, In2D_NegStride) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({1, -1});
+  m.SetEnd({2, -4});
+  m.SetStrides({2, -1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({6, 5, 4}));
+}
+
+TEST(StridedSliceOpTest, In2D_BeginMask) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 1, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({1, 0});
+  m.SetEnd({2, 2});
+  m.SetStrides({1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 4, 5}));
+}
+
+TEST(StridedSliceOpTest, In2D_EndMask) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({1, 0});
+  m.SetEnd({2, 2});
+  m.SetStrides({1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({4, 5, 6}));
+}
+
+TEST(StridedSliceOpTest, In2D_NegStrideBeginMask) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 2, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({1, -2});
+  m.SetEnd({2, -4});
+  m.SetStrides({1, -1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({6, 5, 4}));
+}
+
+TEST(StridedSliceOpTest, In2D_NegStrideEndMask) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({1, -2});
+  m.SetEnd({2, -3});
+  m.SetStrides({1, -1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({5, 4}));
+}
+
+TEST(StridedSliceOpTest, In3D_Identity) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3, 2}));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}));
+}
+
+TEST(StridedSliceOpTest, In3D_NegStride) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({-1, -1, -1});
+  m.SetEnd({-3, -4, -3});
+  m.SetStrides({-1, -1, -1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3, 2}));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}));
+}
+
+TEST(StridedSliceOpTest, In3D_Strided2) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({2, 2, 2});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5}));
+}
+
+TEST(StridedSliceOpTest, In1D_ShrinkAxisMask1) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({1});
+  m.SetEnd({3});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_TRUE(m.GetOutputShape().empty());
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2}));
+}
+
+TEST(StridedSliceOpTest, In1D_EmptyOutputShrinkAxisMask1) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({2});
+  m.SetEnd({1});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_TRUE(m.GetOutputShape().empty());
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3}));
+}
+
+TEST(StridedSliceOpTest, In1D_BeginMaskShrinkAxisMask1) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 1);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({1});
+  m.SetEnd({3});
+  m.SetStrides({1});
+  m.Invoke();
+  EXPECT_TRUE(m.GetOutputShape().empty());
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1}));
+}
+
+TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStrideShrinkAxisMask1) {
+  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
+  m.SetInput({1, 2, 3, 4});
+  m.SetBegin({-2});
+  m.SetEnd({-3});
+  m.SetStrides({-1});
+  m.Invoke();
+  EXPECT_TRUE(m.GetOutputShape().empty());
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3}));
+}
+
+TEST(StridedSliceOpTest, In2D_ShrinkAxisMask1) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 1);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({0, 0});
+  m.SetEnd({2, 3});
+  m.SetStrides({1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3}));
+}
+
+TEST(StridedSliceOpTest, In2D_ShrinkAxisMask2) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 2);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({0, 0});
+  m.SetEnd({2, 3});
+  m.SetStrides({1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 4}));
+}
+
+TEST(StridedSliceOpTest, In2D_ShrinkAxisMask3) {
+  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({0, 0});
+  m.SetEnd({2, 3});
+  m.SetStrides({1, 1});
+  m.Invoke();
+  EXPECT_TRUE(m.GetOutputShape().empty());
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 1);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis2) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 2);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 7, 8}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis3) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 3);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis4) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 4);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 5, 7, 9, 11}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis5) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 5);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 5}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis6) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 6);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 7}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis7) {
+  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 7);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_TRUE(m.GetOutputShape().empty());
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1}));
+}
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/sub.cc b/tensorflow/contrib/lite/kernels/sub.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ddaf498d5bac0109429224e7cf66cb3debcabc22
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/sub.cc
@@ -0,0 +1,129 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace sub {
+
+// This file has three implementation of Div.
+enum KernelType {
+  kReference,
+  kGenericOptimized,  // Neon-free
+  kNeonOptimized,
+};
+
+constexpr int kInputTensor1 = 0;
+constexpr int kInputTensor2 = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input1), NumDimensions(input2));
+  for (int i = 0; i < NumDimensions(input1); ++i) {
+    TF_LITE_ENSURE_EQ(context, SizeOfDimension(input1, i),
+                      SizeOfDimension(input2, i));
+  }
+
+  TF_LITE_ENSURE_EQ(context, input1->type, output->type);
+  TF_LITE_ENSURE_EQ(context, input2->type, output->type);
+
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input1->dims);
+  return context->ResizeTensor(context, output, output_size);
+}
+
+template <KernelType kernel_type>
+void EvalSubFloat(TfLiteContext* context, TfLiteNode* node,
+                  TfLiteSubParams* params, TfLiteTensor* input1,
+                  TfLiteTensor* input2, TfLiteTensor* output) {
+  float output_activation_min, output_activation_max;
+  CalculateActivationRangeFloat(params->activation, &output_activation_min,
+                                &output_activation_max);
+#define TF_LITE_Sub(type)                                        \
+  type::Sub(GetTensorData<float>(input1), GetTensorDims(input1), \
+            GetTensorData<float>(input2), GetTensorDims(input2), \
+            output_activation_min, output_activation_max,        \
+            GetTensorData<float>(output), GetTensorDims(output))
+  if (kernel_type == kReference) {
+    TF_LITE_Sub(reference_ops);
+  } else {
+    TF_LITE_Sub(optimized_ops);
+  }
+#undef TF_LITE_Sub
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteSubParams*>(node->builtin_data);
+
+  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (output->type == kTfLiteFloat32) {
+    EvalSubFloat<kernel_type>(context, node, params, input1, input2, output);
+  } else {
+    context->ReportError(context, "Inputs and outputs not all float types.");
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace sub
+
+TfLiteRegistration* Register_SUB_REF() {
+  static TfLiteRegistration r = {nullptr, nullptr, sub::Prepare,
+                                 sub::Eval<sub::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_SUB_GENERIC_OPT() {
+  static TfLiteRegistration r = {nullptr, nullptr, sub::Prepare,
+                                 sub::Eval<sub::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_SUB_NEON_OPT() {
+  static TfLiteRegistration r = {nullptr, nullptr, sub::Prepare,
+                                 sub::Eval<sub::kNeonOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_SUB() {
+#ifdef USE_NEON
+  return Register_SUB_NEON_OPT();
+#else
+  return Register_SUB_GENERIC_OPT();
+#endif
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/svdf.cc b/tensorflow/contrib/lite/kernels/svdf.cc
index 72f705fe4242b01c1516c99d3500484e8729fd9a..c69755447d5093e25d408eb6dea80750937465e7 100644
--- a/tensorflow/contrib/lite/kernels/svdf.cc
+++ b/tensorflow/contrib/lite/kernels/svdf.cc
@@ -15,8 +15,8 @@ limitations under the License.
 #include <unistd.h>
 #include <cassert>
 #include <cmath>
-#include <cstdlib>
 #include <cstdio>
+#include <cstdlib>
 #include <iostream>
 #include <limits>
 
diff --git a/tensorflow/contrib/lite/kernels/svdf_test.cc b/tensorflow/contrib/lite/kernels/svdf_test.cc
index d956025e9dfc9b6c03e55657023fb042c8ac485d..0f166dc69b95f3459388135b3a6c4d9b73a31cb4 100644
--- a/tensorflow/contrib/lite/kernels/svdf_test.cc
+++ b/tensorflow/contrib/lite/kernels/svdf_test.cc
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 // Unit test for TFLite SVDF op.
 
-#include <vector>
 #include <iomanip>
+#include <vector>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -306,7 +306,7 @@ TEST(SVDFOpTest, BlackBoxTestRank2) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/kernels/test_util.cc b/tensorflow/contrib/lite/kernels/test_util.cc
index f716ba8741fd469e7ee405ac300924b53c5c48e5..6f56aa6bf38781e860e33e8ac3b6a0bb8b50bb01 100644
--- a/tensorflow/contrib/lite/kernels/test_util.cc
+++ b/tensorflow/contrib/lite/kernels/test_util.cc
@@ -49,7 +49,7 @@ std::vector<Matcher<float>> ArrayFloatNear(const std::vector<float>& values,
   return matchers;
 }
 
-int SingleOpModel::AddTensor(TensorData t) {
+int SingleOpModel::AddTensor(TensorData t, std::initializer_list<int> data) {
   int id = tensors_.size();
 
   // This is slightly different depending on whether we are adding a
@@ -78,8 +78,23 @@ int SingleOpModel::AddTensor(TensorData t) {
         builder_.CreateVector<int64_t>({t.zero_point}));
   }
 
-  tensors_.push_back(CreateTensor(builder_, builder_.CreateVector<int>({}),
-                                  t.type, /*buffer=*/0,
+  int buffer_id = 0;
+  if (data.size()) {
+    // Initialize buffers list with empty buffer to allow for non-const tensors.
+    if (buffers_.empty()) {
+      buffers_.push_back(CreateBuffer(builder_, builder_.CreateVector({})));
+    }
+
+    // Add data as a Buffer to buffers list.
+    buffer_id = buffers_.size();
+    auto data_buffer =
+        builder_.CreateVector(reinterpret_cast<const uint8_t*>(data.begin()),
+                              sizeof(int) * data.size());
+    buffers_.push_back(CreateBuffer(builder_, data_buffer));
+  }
+
+  tensors_.push_back(CreateTensor(builder_, builder_.CreateVector<int>(t.shape),
+                                  t.type, /*buffer=*/buffer_id,
                                   /*name=*/0, q_params));
 
   tensor_data_[id] = t;
@@ -88,7 +103,15 @@ int SingleOpModel::AddTensor(TensorData t) {
 }
 
 int SingleOpModel::AddInput(const TensorData& t) {
-  int id = AddTensor(t);
+  int id = AddTensor(t, {});
+  inputs_.push_back(id);
+  return id;
+}
+
+int SingleOpModel::AddConstInput(TensorType type,
+                                 std::initializer_list<int> data,
+                                 std::initializer_list<int> shape) {
+  int id = AddTensor(TensorData{type, shape}, data);
   inputs_.push_back(id);
   return id;
 }
@@ -100,7 +123,7 @@ int SingleOpModel::AddNullInput() {
 }
 
 int SingleOpModel::AddOutput(const TensorData& t) {
-  int id = AddTensor(t);
+  int id = AddTensor(t, {});
   outputs_.push_back(id);
   return id;
 }
@@ -142,19 +165,21 @@ void SingleOpModel::BuildInterpreter(
   subgraphs.push_back(subgraph);
   auto subgraphs_flatbuffer = builder_.CreateVector(subgraphs);
 
-  std::vector<flatbuffers::Offset<Buffer>> buffers_vec;
-  auto buffers = builder_.CreateVector(buffers_vec);
+  auto buffers = builder_.CreateVector(buffers_);
   auto description = builder_.CreateString("programmatic model");
   builder_.Finish(CreateModel(builder_, TFLITE_SCHEMA_VERSION, opcodes,
                               subgraphs_flatbuffer, description, buffers));
 
   auto* model = GetModel(builder_.GetBufferPointer());
 
-  ops::builtin::BuiltinOpResolver builtins;
-  for (const auto& reg : custom_registrations_) {
-    builtins.AddCustom(reg.first.data(), reg.second());
+  if (!resolver_) {
+    auto resolver = new ops::builtin::BuiltinOpResolver();
+    for (const auto& reg : custom_registrations_) {
+      resolver->AddCustom(reg.first.data(), reg.second());
+    }
+    resolver_ = std::unique_ptr<OpResolver>(resolver);
   }
-  InterpreterBuilder(model, builtins)(&interpreter_);
+  InterpreterBuilder(model, *resolver_)(&interpreter_);
 
   CHECK(interpreter_ != nullptr);
 
@@ -180,4 +205,17 @@ int32_t SingleOpModel::GetTensorSize(int index) const {
   return total_size;
 }
 
+template <>
+std::vector<string> SingleOpModel::ExtractVector(int index) {
+  TfLiteTensor* tensor_ptr = interpreter_->tensor(index);
+  CHECK(tensor_ptr != nullptr);
+  const int num_strings = GetStringCount(tensor_ptr);
+  std::vector<string> result;
+  result.reserve(num_strings);
+  for (int i = 0; i < num_strings; ++i) {
+    const auto str = GetString(tensor_ptr, i);
+    result.emplace_back(str.str, str.len);
+  }
+  return result;
+}
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/test_util.h b/tensorflow/contrib/lite/kernels/test_util.h
index e68e49466119c50ec123edb84f1b1b6390a15a60..7d476ba1eaffbb24fb77390c0e71c32d60b6411e 100644
--- a/tensorflow/contrib/lite/kernels/test_util.h
+++ b/tensorflow/contrib/lite/kernels/test_util.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_TEST_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_TEST_UTIL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_TEST_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_TEST_UTIL_H_
 
 #include <vector>
 
@@ -24,16 +24,11 @@ limitations under the License.
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/contrib/lite/testing/util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tflite {
 
-inline void LogToStderr() {
-#ifdef PLATFORM_GOOGLE
-  FLAGS_logtostderr = true;
-#endif
-}
-
 // A gmock matcher that check that elements of a float vector match to a given
 // tolerance.
 std::vector<::testing::Matcher<float>> ArrayFloatNear(
@@ -90,6 +85,23 @@ struct TensorData {
   int32_t zero_point;
 };
 
+class SingleOpResolver : public OpResolver {
+ public:
+  SingleOpResolver(const BuiltinOperator op, TfLiteRegistration* registration)
+      : op_(op), registration_(registration) {}
+  TfLiteRegistration* FindOp(BuiltinOperator op) const override {
+    if (op == op_) {
+      return registration_;
+    }
+    return nullptr;
+  }
+  TfLiteRegistration* FindOp(const char* op) const override { return nullptr; }
+
+ private:
+  const BuiltinOperator op_;
+  TfLiteRegistration* registration_;
+};
+
 class SingleOpModel {
  public:
   SingleOpModel() {}
@@ -103,6 +115,10 @@ class SingleOpModel {
   int AddInput(TensorType type) { return AddInput(TensorData{type}); }
   int AddInput(const TensorData& t);
 
+  // Add a Tensor containing const data and return the tensor id.
+  int AddConstInput(TensorType type, std::initializer_list<int> data,
+                    std::initializer_list<int> shape);
+
   // Add a null input tensor (optional input) and return kOptionalTensor.
   int AddNullInput();
 
@@ -179,14 +195,19 @@ class SingleOpModel {
     return result;
   }
 
+  void SetResolver(std::unique_ptr<OpResolver> resolver) {
+    resolver_ = std::move(resolver);
+  }
+
  protected:
   int32_t GetTensorSize(int index) const;
 
   flatbuffers::FlatBufferBuilder builder_;
   std::unique_ptr<tflite::Interpreter> interpreter_;
+  std::unique_ptr<OpResolver> resolver_;
 
  private:
-  int AddTensor(TensorData t);
+  int AddTensor(TensorData t, std::initializer_list<int> data);
 
   std::map<int, TensorData> tensor_data_;
   std::vector<int32_t> inputs_;
@@ -194,9 +215,43 @@ class SingleOpModel {
   std::vector<flatbuffers::Offset<Tensor>> tensors_;
   std::vector<flatbuffers::Offset<OperatorCode>> opcodes_;
   std::vector<flatbuffers::Offset<Operator>> operators_;
+  std::vector<flatbuffers::Offset<Buffer>> buffers_;
   std::map<string, std::function<TfLiteRegistration*()>> custom_registrations_;
 };
 
+// Base class for single op unit tests.
+// The tests are parameterized to test multiple kernels for a single op.
+// The parameters are strings like "optimized" and "reference" to have better
+// readability in test reports.
+//
+// To use this class:
+// * Define a constant map from strings to TfLiteRegistration.
+// * Implement a test class that inherits SingleOpTest.
+// * Instantiate the test cases with SingleOpTest::GetKernelTags helper
+//   function.
+// * Call GetRegistration to get the TfLiteRegistration to be used before
+//   building the interpreter.
+class SingleOpTest : public ::testing::TestWithParam<string> {
+ public:
+  static std::vector<string> GetKernelTags(
+      const std::map<string, TfLiteRegistration*>& kernel_map) {
+    std::vector<string> tags;
+    for (auto it : kernel_map) {
+      tags.push_back(it.first);
+    }
+    return tags;
+  }
+
+ protected:
+  virtual const std::map<string, TfLiteRegistration*>& GetKernelMap() = 0;
+  TfLiteRegistration* GetRegistration() {
+    return GetKernelMap().at(GetParam());
+  }
+};
+
+// Strings have a special implementation that is in test_util.cc
+template <>
+std::vector<string> SingleOpModel::ExtractVector(int index);
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_TEST_UTIL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_TEST_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/transpose.cc b/tensorflow/contrib/lite/kernels/transpose.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d3c10a9bb7b07404ccd8cfe2636473a622b91787
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/transpose.cc
@@ -0,0 +1,159 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include <vector>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace transpose {
+
+// This file has two implementations of Transpose.
+enum KernelType {
+  kReference,
+};
+
+struct TransposeContext {
+  TransposeContext(TfLiteContext* context, TfLiteNode* node) {
+    input = GetInput(context, node, 0);
+    perm = GetInput(context, node, 1);
+    output = GetOutput(context, node, 0);
+  }
+  TfLiteTensor* input;
+  TfLiteTensor* perm;
+  TfLiteTensor* output;
+};
+
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                TransposeContext* op_context) {
+  int dims = NumDimensions(op_context->input);
+  const int* perm_data = GetTensorData<int32_t>(op_context->perm);
+
+  // Ensure validity of the permutations tensor as a 1D tensor.
+  TF_LITE_ENSURE_EQ(context, NumDimensions(op_context->perm), 1);
+  TF_LITE_ENSURE_EQ(context, op_context->perm->dims->data[0], dims);
+  for (int idx = 0; idx < dims; ++idx) {
+    TF_LITE_ENSURE_MSG(context, (perm_data[idx] >= 0 && perm_data[idx] < dims),
+                       "Transpose op permutations array is out of bounds.");
+  }
+
+  // Determine size of output tensor.
+  TfLiteIntArray* input_size = op_context->input->dims;
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input_size);
+  for (int idx = 0; idx < dims; ++idx) {
+    output_size->data[idx] = input_size->data[perm_data[idx]];
+  }
+
+  return context->ResizeTensor(context, op_context->output, output_size);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TransposeContext op_context(context, node);
+
+  // Ensure validity of input tensor.
+  TF_LITE_ENSURE_MSG(context, NumDimensions(op_context.input) <= 4,
+                     "Transpose op only supports 1D-4D input arrays.");
+  TF_LITE_ENSURE_EQ(context, op_context.input->type, op_context.output->type);
+
+  if (!IsConstantTensor(op_context.perm)) {
+    SetTensorToDynamic(op_context.output);
+    return kTfLiteOk;
+  }
+  return ResizeOutputTensor(context, &op_context);
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TransposeContext op_context(context, node);
+
+  // Resize the output tensor if the output tensor is dynamic.
+  if (IsDynamicTensor(op_context.output)) {
+    TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+  }
+
+  // Reverse the permuted axes and convert to 4D due to the way Dims are
+  // constructed in GetTensorDims.
+  const int* perm_data = GetTensorData<int32_t>(op_context.perm);
+  const int size = op_context.perm->dims->data[0];
+  const int kOutputDimensionNum = 4;
+  int reversed_perm[kOutputDimensionNum];
+
+  for (int output_k = 0, input_k = size - 1; output_k < size;
+       ++output_k, --input_k) {
+    reversed_perm[output_k] = size - perm_data[input_k] - 1;
+  }
+  for (int k = size; k < kOutputDimensionNum; ++k) {
+    reversed_perm[k] = k;
+  }
+
+#define TF_LITE_TRANSPOSE(type, scalar)                     \
+  type::Transpose(GetTensorData<scalar>(op_context.input),  \
+                  GetTensorDims(op_context.input),          \
+                  GetTensorData<scalar>(op_context.output), \
+                  GetTensorDims(op_context.output), reversed_perm)
+
+  switch (op_context.input->type) {
+    case kTfLiteFloat32:
+      if (kernel_type == kReference) {
+        TF_LITE_TRANSPOSE(reference_ops, float);
+      }
+      break;
+    case kTfLiteUInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_TRANSPOSE(reference_ops, uint8_t);
+      }
+      break;
+    case kTfLiteInt32:
+      if (kernel_type == kReference) {
+        TF_LITE_TRANSPOSE(reference_ops, int32_t);
+      }
+      break;
+    case kTfLiteInt64:
+      if (kernel_type == kReference) {
+        TF_LITE_TRANSPOSE(reference_ops, int64_t);
+      }
+      break;
+    default:
+      context->ReportError(context,
+                           "Type is currently not supported by Transpose.");
+      return kTfLiteError;
+  }
+#undef TF_LITE_TRANSPOSE
+
+  return kTfLiteOk;
+}
+
+}  // namespace transpose
+
+TfLiteRegistration* Register_TRANSPOSE_REF() {
+  static TfLiteRegistration r = {nullptr, nullptr, transpose::Prepare,
+                                 transpose::Eval<transpose::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_TRANSPOSE() { return Register_TRANSPOSE_REF(); }
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/transpose_test.cc b/tensorflow/contrib/lite/kernels/transpose_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..337bc144b967392523bf784603cca4c1b968cdf2
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/transpose_test.cc
@@ -0,0 +1,347 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+void RunTestPermutation(const std::vector<int>& shape,
+                        const std::vector<int>& perms,
+                        std::vector<float>* input_transposed) {
+  // Count elements and allocate output.
+  int count = 1;
+  for (auto factor : shape) count *= factor;
+  input_transposed->resize(count);
+
+  // Create the dummy data
+  std::vector<float> input(count);
+  for (int i = 0; i < input.size(); i++) {
+    input[i] = i;
+  }
+
+  // Create reversed and padded perms.
+  int reversed_perms[4];
+  for (int output_k = 0, input_k = shape.size() - 1; output_k < shape.size();
+       output_k++, input_k--) {
+    reversed_perms[output_k] = shape.size() - perms[input_k] - 1;
+  }
+  // Unused dimensions should not be permuted so pad with identity transform
+  // subset.
+  for (int k = shape.size(); k < 4; k++) {
+    reversed_perms[k] = k;
+  }
+
+  // Make input and output dims (i.e. reversed shape and dest_shape).
+  Dims<4> input_dims = GetTensorDims(shape);
+  Dims<4> output_dims;
+  for (int i = 0; i < 4; i++) {
+    output_dims.sizes[i] = input_dims.sizes[reversed_perms[i]];
+  }
+  output_dims.strides[0] = 1;
+  for (int k = 1; k < 4; k++) {
+    output_dims.strides[k] =
+        output_dims.strides[k - 1] * output_dims.sizes[k - 1];
+  }
+
+  reference_ops::Transpose<float>(input.data(), input_dims,
+                                  input_transposed->data(), output_dims,
+                                  reversed_perms);
+}
+
+TEST(TransposeTest, TestRefOps1D) {
+  // Basic 1D identity.
+  std::vector<float> out;
+  RunTestPermutation({3}, {0}, &out);
+  ASSERT_EQ(out, std::vector<float>({0, 1, 2}));
+}
+
+TEST(TransposeTest, TestRefOps2D) {
+  std::vector<float> out;
+  // Basic 2D.
+  RunTestPermutation({3, 2}, {1, 0}, &out);
+  ASSERT_EQ(out, std::vector<float>({0, 2, 4, 1, 3, 5}));
+  // Identity.
+  RunTestPermutation({3, 2}, {0, 1}, &out);
+  ASSERT_EQ(out, std::vector<float>({0, 1, 2, 3, 4, 5}));
+}
+
+TEST(TransposeTest, TestRefOps3D) {
+  std::vector<float> out;
+  // Test 3 dimensional
+  {
+    std::vector<float> ref({0, 4, 8,  12, 16, 20, 1, 5, 9,  13, 17, 21,
+                            2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23});
+    RunTestPermutation({2, 3, 4}, {2, 0, 1}, &out);
+    ASSERT_EQ(out, ref);
+  }
+  // Test 3 dimensional identity transform
+  {
+    RunTestPermutation({2, 3, 4}, {0, 1, 2}, &out);
+    std::vector<float> ref(out.size());
+    for (int k = 0; k < ref.size(); k++) ref[k] = k;
+    ASSERT_EQ(out, ref);
+  }
+}
+
+TEST(TransposeTest, TestRefOps4D) {
+  std::vector<float> out;
+  // Basic 4d.
+  RunTestPermutation({2, 3, 4, 5}, {2, 0, 1, 3}, &out);
+  ASSERT_EQ(
+      out,
+      std::vector<float>(
+          {0,  1,  2,  3,  4,  20, 21, 22, 23, 24, 40,  41,  42,  43,  44,
+           60, 61, 62, 63, 64, 80, 81, 82, 83, 84, 100, 101, 102, 103, 104,
+           5,  6,  7,  8,  9,  25, 26, 27, 28, 29, 45,  46,  47,  48,  49,
+           65, 66, 67, 68, 69, 85, 86, 87, 88, 89, 105, 106, 107, 108, 109,
+           10, 11, 12, 13, 14, 30, 31, 32, 33, 34, 50,  51,  52,  53,  54,
+           70, 71, 72, 73, 74, 90, 91, 92, 93, 94, 110, 111, 112, 113, 114,
+           15, 16, 17, 18, 19, 35, 36, 37, 38, 39, 55,  56,  57,  58,  59,
+           75, 76, 77, 78, 79, 95, 96, 97, 98, 99, 115, 116, 117, 118, 119}));
+  RunTestPermutation({2, 3, 4, 5}, {0, 1, 2, 3}, &out);
+  // Basic identity.
+  std::vector<float> ref(out.size());
+  for (int k = 0; k < ref.size(); k++) ref[k] = k;
+  ASSERT_EQ(out, ref);
+}
+
+class TransposeOpModel : public SingleOpModel {
+ public:
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor<float>(input_, data);
+  }
+
+  void SetPerm(std::initializer_list<int> data) {
+    PopulateTensor<int>(perm_, data);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input_;
+  int perm_;
+  int output_;
+};
+
+// Tests case where perm is a const tensor.
+//
+// Example usage is as follows:
+//    SpaceToBatchNDOpConstModel m(input_shape, perm_shape, perm_data);
+//    m.SetInput(input_data);
+//    m.Invoke();
+class TransposeOpConstModel : public TransposeOpModel {
+ public:
+  TransposeOpConstModel(std::initializer_list<int> input_shape,
+                        std::initializer_list<int> perm_shape,
+                        std::initializer_list<int> perm) {
+    input_ = AddInput(TensorType_FLOAT32);
+    perm_ = AddConstInput(TensorType_INT32, perm, perm_shape);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_TRANSPOSE, BuiltinOptions_TransposeOptions,
+                 CreateTransposeOptions(builder_).Union());
+    BuildInterpreter({input_shape});
+  }
+};
+
+// Tests case where perm is a non-const tensor.
+//
+// Example usage is as follows:
+//    TransposeOpDynamicModel m(input_shape, perm_shape);
+//    m.SetInput(input_data);
+//    m.SetPerm(perm_data);
+//    m.Invoke();
+class TransposeOpDynamicModel : public TransposeOpModel {
+ public:
+  TransposeOpDynamicModel(std::initializer_list<int> input_shape,
+                          std::initializer_list<int> perm_shape) {
+    input_ = AddInput(TensorType_FLOAT32);
+    perm_ = AddInput(TensorType_INT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_TRANSPOSE, BuiltinOptions_TransposeOptions,
+                 CreateTransposeOptions(builder_).Union());
+    BuildInterpreter({input_shape, perm_shape});
+  }
+};
+
+TEST(TransposeTest, TestUnequalPermSize) {
+  EXPECT_DEATH(TransposeOpConstModel({1, 3, 3, 1}, {2}, {2, 2}), "2 != 4");
+}
+
+TEST(TransposeTest, TestPermOutOfBounds) {
+  EXPECT_DEATH(TransposeOpConstModel({1, 3, 3, 1}, {4}, {0, -1, -2, -3}),
+               "Transpose op permutations array is out of bounds.");
+  EXPECT_DEATH(TransposeOpConstModel({1, 3, 3, 1}, {4}, {0, 1, 2, 4}),
+               "Transpose op permutations array is out of bounds.");
+}
+
+TEST(TransposeTest, Test1DInputConstTensor) {
+  TransposeOpConstModel m({3}, {1}, {0});
+  m.SetInput({1, 2, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3}));
+}
+
+TEST(TransposeTest, Test1DInputDynamicTensor) {
+  TransposeOpDynamicModel m({3}, {1});
+  m.SetInput({1, 2, 3});
+  m.SetPerm({0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3}));
+}
+
+TEST(TransposeTest, Test2DInputConstTensor) {
+  TransposeOpConstModel m({3, 2}, {2}, {1, 0});
+  m.SetInput({0, 1, 2, 3, 4, 5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 2, 4, 1, 3, 5}));
+}
+
+TEST(TransposeTest, Test2DInputDynamicTensor) {
+  TransposeOpDynamicModel m({3, 2}, {2});
+  m.SetInput({0, 1, 2, 3, 4, 5});
+  m.SetPerm({1, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 2, 4, 1, 3, 5}));
+}
+
+TEST(TransposeTest, Test3DInputConstTensor) {
+  TransposeOpConstModel m({2, 3, 4}, {3}, {2, 0, 1});
+  m.SetInput({0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+              12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 3}));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({0, 4, 8,  12, 16, 20, 1, 5, 9,  13, 17, 21,
+                                2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23}));
+}
+
+TEST(TransposeTest, Test3DInputDynamicTensor) {
+  TransposeOpDynamicModel m({2, 3, 4}, {3});
+  m.SetInput({0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+              12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23});
+  m.SetPerm({2, 0, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 3}));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({0, 4, 8,  12, 16, 20, 1, 5, 9,  13, 17, 21,
+                                2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23}));
+}
+
+TEST(TransposeTest, Test5DInputTensor) {
+  EXPECT_DEATH(TransposeOpConstModel({1, 2, 3, 4, 5}, {5}, {0, 1, 2, 3, 4}),
+               "Transpose op only supports 1D-4D input arrays.");
+}
+
+TEST(TransposeTest, SimpleTestNoReorderConstTensor) {
+  TransposeOpConstModel m({1, 2, 3, 1}, {4}, {0, 1, 2, 3});
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 3, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST(TransposeTest, SimpleTestNoReorderDynamicTensor) {
+  TransposeOpDynamicModel m({1, 2, 3, 1}, {4});
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetPerm({0, 1, 2, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 3, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST(TransposeTest, SimpleTestWithReorderConstTensor) {
+  TransposeOpConstModel m({1, 2, 3, 1}, {4}, {2, 1, 3, 0});
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2, 1, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 4, 2, 5, 3, 6}));
+}
+
+TEST(TransposeTest, ComplexTestWithReorderConstTensor) {
+  TransposeOpConstModel m({2, 3, 4, 5}, {4}, {2, 0, 1, 3});
+  m.SetInput({0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,
+              12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,
+              24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,
+              36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
+              48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
+              60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,
+              72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
+              84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
+              96,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107,
+              108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 3, 5}));
+  auto result = ElementsAreArray(
+      {0,  1,  2,  3,  4,  20, 21, 22, 23, 24, 40,  41,  42,  43,  44,
+       60, 61, 62, 63, 64, 80, 81, 82, 83, 84, 100, 101, 102, 103, 104,
+       5,  6,  7,  8,  9,  25, 26, 27, 28, 29, 45,  46,  47,  48,  49,
+       65, 66, 67, 68, 69, 85, 86, 87, 88, 89, 105, 106, 107, 108, 109,
+       10, 11, 12, 13, 14, 30, 31, 32, 33, 34, 50,  51,  52,  53,  54,
+       70, 71, 72, 73, 74, 90, 91, 92, 93, 94, 110, 111, 112, 113, 114,
+       15, 16, 17, 18, 19, 35, 36, 37, 38, 39, 55,  56,  57,  58,  59,
+       75, 76, 77, 78, 79, 95, 96, 97, 98, 99, 115, 116, 117, 118, 119});
+  EXPECT_THAT(m.GetOutput(), result);
+}
+
+TEST(TransposeTest, ComplexTestWithReorderDynamicTensor) {
+  TransposeOpDynamicModel m({2, 3, 4, 5}, {4});
+  m.SetInput({0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,
+              12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,
+              24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,
+              36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
+              48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
+              60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,
+              72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
+              84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
+              96,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106, 107,
+              108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119});
+  m.SetPerm({2, 0, 1, 3});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 3, 5}));
+  auto result = ElementsAreArray(
+      {0,  1,  2,  3,  4,  20, 21, 22, 23, 24, 40,  41,  42,  43,  44,
+       60, 61, 62, 63, 64, 80, 81, 82, 83, 84, 100, 101, 102, 103, 104,
+       5,  6,  7,  8,  9,  25, 26, 27, 28, 29, 45,  46,  47,  48,  49,
+       65, 66, 67, 68, 69, 85, 86, 87, 88, 89, 105, 106, 107, 108, 109,
+       10, 11, 12, 13, 14, 30, 31, 32, 33, 34, 50,  51,  52,  53,  54,
+       70, 71, 72, 73, 74, 90, 91, 92, 93, 94, 110, 111, 112, 113, 114,
+       15, 16, 17, 18, 19, 35, 36, 37, 38, 39, 55,  56,  57,  58,  59,
+       75, 76, 77, 78, 79, 95, 96, 97, 98, 99, 115, 116, 117, 118, 119});
+  EXPECT_THAT(m.GetOutput(), result);
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9cdb58714edb5fee771fc45f3c53a570f8fb28d1
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
@@ -0,0 +1,527 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace unidirectional_sequence_lstm {
+
+// Input Tensors of size {max_time, n_batch, n_input}
+constexpr int kInputTensor = 0;
+
+// Input weight tensors of size: {n_cell, n_input}
+constexpr int kInputToInputWeightsTensor = 1;  // Optional
+constexpr int kInputToForgetWeightsTensor = 2;
+constexpr int kInputToCellWeightsTensor = 3;
+constexpr int kInputToOutputWeightsTensor = 4;
+
+// Recurrent weight tensors of size {n_cell, n_output}
+constexpr int kRecurrentToInputWeightsTensor = 5;  // Optional
+constexpr int kRecurrentToForgetWeightsTensor = 6;
+constexpr int kRecurrentToCellWeightsTensor = 7;
+constexpr int kRecurrentToOutputWeightsTensor = 8;
+
+// Peephole weights tensors of size {n_cell}, representing a diagonal matrix.
+constexpr int kCellToInputWeightsTensor = 9;    // Optional
+constexpr int kCellToForgetWeightsTensor = 10;  // Optional
+constexpr int kCellToOutputWeightsTensor = 11;  // Optional
+
+// Gates bias tensors of size {n_cell}
+constexpr int kInputGateBiasTensor = 12;  // Optional
+constexpr int kForgetGateBiasTensor = 13;
+constexpr int kCellGateBiasTensor = 14;
+constexpr int kOutputGateBiasTensor = 15;
+
+// Projection weight tensor of size {n_output, n_cell}
+constexpr int kProjectionWeightsTensor = 16;  // Optional
+// Projection bias tensor of size {n_output}
+constexpr int kProjectionBiasTensor = 17;  // Optional
+
+// Output tensors.
+constexpr int kScratchBufferTensor = 0;
+constexpr int kOutputStateTensor = 1;
+constexpr int kCellStateTensor = 2;
+constexpr int kOutputTensor = 3;
+
+// Check that input tensor dimensions matches with each other.
+TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
+                                        TfLiteNode* node, int n_input,
+                                        int n_output, int n_cell) {
+  auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
+
+  // Making sure clipping parameters have valid values.
+  // == 0 means no clipping
+  //  > 0 means clipping
+  TF_LITE_ENSURE(context, params->cell_clip >= 0);
+  TF_LITE_ENSURE(context, params->proj_clip >= 0);
+
+  TfLiteTensor* input_to_input_weights =
+      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
+  if (input_to_input_weights) {
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell);
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
+  }
+
+  TfLiteTensor* input_to_forget_weights =
+      GetInput(context, node, kInputToForgetWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[1], n_input);
+
+  TfLiteTensor* input_to_cell_weights =
+      GetInput(context, node, kInputToCellWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[1], n_input);
+
+  TfLiteTensor* recurrent_to_input_weights =
+      GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
+  if (recurrent_to_input_weights) {
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2);
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0],
+                      n_cell);
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[1],
+                      n_output);
+  }
+
+  TfLiteTensor* recurrent_to_forget_weights =
+      GetInput(context, node, kRecurrentToForgetWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[0],
+                    n_cell);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[1],
+                    n_output);
+
+  TfLiteTensor* recurrent_to_cell_weights =
+      GetInput(context, node, kRecurrentToCellWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[1],
+                    n_output);
+
+  // We make sure the input-gate's parameters are either both present (regular
+  // LSTM) or not at all (CIFG-LSTM).
+  const bool cifg_weights_all_or_none =
+      ((input_to_input_weights != nullptr) &&
+       (recurrent_to_input_weights != nullptr)) ||
+      ((input_to_input_weights == nullptr) &&
+       (recurrent_to_input_weights == nullptr));
+  TF_LITE_ENSURE(context, cifg_weights_all_or_none == true);
+
+  TfLiteTensor* cell_to_input_weights =
+      GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
+  if (cell_to_input_weights) {
+    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell);
+  }
+
+  TfLiteTensor* cell_to_forget_weights =
+      GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
+  if (cell_to_forget_weights) {
+    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell);
+  }
+
+  TfLiteTensor* cell_to_output_weights =
+      GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
+  if (cell_to_output_weights) {
+    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->data[0], n_cell);
+  }
+
+  // Making sure the peephole weights are there all or none.
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  const bool peephole_weights_all_or_none =
+      ((cell_to_input_weights != nullptr || use_cifg) &&
+       (cell_to_forget_weights != nullptr) &&
+       (cell_to_output_weights != nullptr)) ||
+      ((cell_to_input_weights == nullptr) &&
+       (cell_to_forget_weights == nullptr) &&
+       (cell_to_output_weights == nullptr));
+  TF_LITE_ENSURE(context, peephole_weights_all_or_none == true);
+
+  // Make sure the input gate bias is present only when not a CIFG-LSTM.
+  TfLiteTensor* input_gate_bias =
+      GetOptionalInputTensor(context, node, kInputGateBiasTensor);
+  if (use_cifg) {
+    TF_LITE_ENSURE_EQ(context, input_gate_bias, nullptr);
+  } else {
+    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->data[0], n_cell);
+  }
+
+  TfLiteTensor* forget_gate_bias =
+      GetInput(context, node, kForgetGateBiasTensor);
+  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell);
+
+  TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
+  TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell);
+
+  TfLiteTensor* output_gate_bias =
+      GetInput(context, node, kOutputGateBiasTensor);
+  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->data[0], n_cell);
+
+  TfLiteTensor* projection_weights =
+      GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
+  if (projection_weights) {
+    TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2);
+    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[0], n_output);
+    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell);
+  }
+
+  TfLiteTensor* projection_bias =
+      GetOptionalInputTensor(context, node, kProjectionBiasTensor);
+  if (projection_bias) {
+    TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output);
+  }
+
+  // Making sure the projection tensors are consistent:
+  // 1) If projection weight is not present, then projection bias should not be
+  // present.
+  // 2) If projection weight is present, then projection bias is optional.
+  // TODO(ghodrat): make sure this is correct.
+  const bool projecton_tensors_consistent =
+      ((projection_weights != nullptr) || (projection_bias == nullptr));
+  TF_LITE_ENSURE(context, projecton_tensors_consistent == true);
+
+  return kTfLiteOk;
+}
+
+// Resize the output, state and scratch tensors based on the sizes of the input
+// tensors. Also check that the size of the input tensors match each other.
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  // Check we have all the inputs and outputs we need.
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 18);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 4);
+
+  // Inferring batch size, number of outputs and sequence length and
+  // number of cells from the input tensors.
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TF_LITE_ENSURE(context, input->dims->size > 1);
+  const int max_time = input->dims->data[0];
+  const int n_batch = input->dims->data[1];
+  const int n_input = input->dims->data[2];
+
+  TfLiteTensor* input_to_output_weights =
+      GetInput(context, node, kInputToOutputWeightsTensor);
+  const int n_cell = input_to_output_weights->dims->data[0];
+  TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->data[1], n_input);
+
+  TfLiteTensor* recurrent_to_output_weights =
+      GetInput(context, node, kRecurrentToOutputWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->data[0],
+                    n_cell);
+  const int n_output = recurrent_to_output_weights->dims->data[1];
+
+  // Check that input tensor dimensions matches with each other.
+  CheckInputTensorDimensions(context, node, n_input, n_output, n_cell);
+
+  // Get the pointer to output, state and scratch buffer tensors.
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TfLiteTensor* output_state = GetOutput(context, node, kOutputStateTensor);
+  TfLiteTensor* cell_state = GetOutput(context, node, kCellStateTensor);
+  // TODO(ghodrat): Modify this as soon as we have a finalized method for
+  // scratch buffers.
+  TfLiteTensor* scratch_buffer = GetOutput(context, node, kScratchBufferTensor);
+
+  // Resize the output and output_state tensors.
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(3);
+  output_size->data[0] = max_time;
+  output_size->data[1] = n_batch;
+  output_size->data[2] = n_output;
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, output, output_size));
+
+  TfLiteIntArray* output_state_size = TfLiteIntArrayCreate(2);
+  output_state_size->data[0] = n_batch;
+  output_state_size->data[1] = n_output;
+  TF_LITE_ENSURE_OK(
+      context, context->ResizeTensor(context, output_state, output_state_size));
+
+  // Resize the scratch buffer tensor.
+  TfLiteIntArray* cell_size = TfLiteIntArrayCreate(2);
+  cell_size->data[0] = n_batch;
+  cell_size->data[1] = n_cell;
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, cell_state, cell_size));
+
+  // Mark state tensors as persistent tensors.
+  output_state->allocation_type = kTfLiteArenaRwPersistent;
+  cell_state->allocation_type = kTfLiteArenaRwPersistent;
+
+  TfLiteTensor* input_to_input_weights =
+      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  if (use_cifg) {
+    TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2);
+    scratch_buffer_size->data[0] = n_batch;
+    // Reserving space for Cell, Forget, Output gates
+    scratch_buffer_size->data[1] = n_cell * 3;
+    TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer,
+                                                     scratch_buffer_size));
+  } else {
+    TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2);
+    scratch_buffer_size->data[0] = n_batch;
+    // Reserving space for Input, Cell, Forget, Output gates
+    scratch_buffer_size->data[1] = n_cell * 4;
+    TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer,
+                                                     scratch_buffer_size));
+  }
+  return kTfLiteOk;
+}
+
+// The LSTM Op engine.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+
+  TfLiteTensor* input_to_input_weights =
+      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
+  TfLiteTensor* input_to_forget_weights =
+      GetInput(context, node, kInputToForgetWeightsTensor);
+  TfLiteTensor* input_to_cell_weights =
+      GetInput(context, node, kInputToCellWeightsTensor);
+  TfLiteTensor* input_to_output_weights =
+      GetInput(context, node, kInputToOutputWeightsTensor);
+
+  TfLiteTensor* recurrent_to_input_weights =
+      GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
+  TfLiteTensor* recurrent_to_forget_weights =
+      GetInput(context, node, kRecurrentToForgetWeightsTensor);
+  TfLiteTensor* recurrent_to_cell_weights =
+      GetInput(context, node, kRecurrentToCellWeightsTensor);
+  TfLiteTensor* recurrent_to_output_weights =
+      GetInput(context, node, kRecurrentToOutputWeightsTensor);
+
+  TfLiteTensor* cell_to_input_weights =
+      GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
+  TfLiteTensor* cell_to_forget_weights =
+      GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
+  TfLiteTensor* cell_to_output_weights =
+      GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
+
+  TfLiteTensor* input_gate_bias =
+      GetOptionalInputTensor(context, node, kInputGateBiasTensor);
+  TfLiteTensor* forget_gate_bias =
+      GetInput(context, node, kForgetGateBiasTensor);
+  TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
+  TfLiteTensor* output_gate_bias =
+      GetInput(context, node, kOutputGateBiasTensor);
+
+  TfLiteTensor* projection_weights =
+      GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
+  TfLiteTensor* projection_bias =
+      GetOptionalInputTensor(context, node, kProjectionBiasTensor);
+
+  TfLiteTensor* output_state = GetOutput(context, node, kOutputStateTensor);
+  TfLiteTensor* cell_state = GetOutput(context, node, kCellStateTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  const int max_time = input->dims->data[0];
+  const int n_batch = input->dims->data[1];
+  const int n_input = input->dims->data[2];
+  // n_cell and n_output will be the same size when there is no projection.
+  const int n_cell = input_to_output_weights->dims->data[0];
+  const int n_output = recurrent_to_output_weights->dims->data[1];
+
+  // Since we have already checked that weights are all there or none, we can
+  // check the existense of only one to the get the condition.
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  const bool use_peephole = (cell_to_output_weights != nullptr);
+
+  // Index the scratch buffers pointers to the global scratch buffer.
+  TfLiteTensor* scratch_buffer = GetOutput(context, node, kScratchBufferTensor);
+  float* input_gate_scratch = nullptr;
+  float* cell_scratch = nullptr;
+  float* forget_gate_scratch = nullptr;
+  float* output_gate_scratch = nullptr;
+  if (use_cifg) {
+    cell_scratch = scratch_buffer->data.f;
+    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
+    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
+  } else {
+    input_gate_scratch = scratch_buffer->data.f;
+    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
+    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
+    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
+  }
+
+  for (int t = 0; t < max_time; t++) {
+    const float* input_ptr_time = input->data.f + t * n_batch * n_input;
+    // Initialize scratch buffers with bias.
+    if (!use_cifg) {
+      tensor_utils::VectorBatchVectorAssign(input_gate_bias->data.f, n_cell,
+                                            n_batch, input_gate_scratch);
+    }
+    tensor_utils::VectorBatchVectorAssign(forget_gate_bias->data.f, n_cell,
+                                          n_batch, forget_gate_scratch);
+    tensor_utils::VectorBatchVectorAssign(cell_bias->data.f, n_cell, n_batch,
+                                          cell_scratch);
+    tensor_utils::VectorBatchVectorAssign(output_gate_bias->data.f, n_cell,
+                                          n_batch, output_gate_scratch);
+
+    // For each batch and cell: compute input_weight * input.
+    if (!use_cifg) {
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          input_to_input_weights->data.f, n_cell, n_input, input_ptr_time,
+          n_batch, input_gate_scratch, /*result_stride=*/1);
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        input_to_forget_weights->data.f, n_cell, n_input, input_ptr_time,
+        n_batch, forget_gate_scratch, /*result_stride=*/1);
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        input_to_cell_weights->data.f, n_cell, n_input, input_ptr_time, n_batch,
+        cell_scratch, /*result_stride=*/1);
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        input_to_output_weights->data.f, n_cell, n_input, input_ptr_time,
+        n_batch, output_gate_scratch, /*result_stride=*/1);
+
+    // For each batch and cell: compute recurrent_weight * output_state.
+    if (!use_cifg) {
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          recurrent_to_input_weights->data.f, n_cell, n_output,
+          output_state->data.f, n_batch, input_gate_scratch,
+          /*result_stride=*/1);
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        recurrent_to_forget_weights->data.f, n_cell, n_output,
+        output_state->data.f, n_batch, forget_gate_scratch,
+        /*result_stride=*/1);
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        recurrent_to_cell_weights->data.f, n_cell, n_output,
+        output_state->data.f, n_batch, cell_scratch, /*result_stride=*/1);
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        recurrent_to_output_weights->data.f, n_cell, n_output,
+        output_state->data.f, n_batch, output_gate_scratch,
+        /*result_stride=*/1);
+
+    // For each batch and cell: update input gate.
+    if (!use_cifg) {
+      if (use_peephole) {
+        tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+            cell_to_input_weights->data.f, n_cell, cell_state->data.f, n_batch,
+            input_gate_scratch);
+      }
+      tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
+                                         input_gate_scratch);
+    }
+
+    // For each batch and cell: update forget gate.
+    if (use_peephole) {
+      tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+          cell_to_forget_weights->data.f, n_cell, cell_state->data.f, n_batch,
+          forget_gate_scratch);
+    }
+    tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
+                                       forget_gate_scratch);
+
+    // For each batch and cell: update the cell.
+    tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch,
+                                           cell_state->data.f, n_batch * n_cell,
+                                           cell_state->data.f);
+    tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
+                                          params->activation, cell_scratch);
+    if (use_cifg) {
+      tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
+                               forget_gate_scratch);
+      tensor_utils::VectorVectorCwiseProductAccumulate(
+          cell_scratch, forget_gate_scratch, n_batch * n_cell,
+          cell_state->data.f);
+    } else {
+      tensor_utils::VectorVectorCwiseProductAccumulate(
+          cell_scratch, input_gate_scratch, n_batch * n_cell,
+          cell_state->data.f);
+    }
+    if (params->cell_clip > 0.0) {
+      tensor_utils::ClipVector(cell_state->data.f, n_batch * n_cell,
+                               params->cell_clip, cell_state->data.f);
+    }
+
+    // For each batch and cell: update the output gate.
+    if (use_peephole) {
+      tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+          cell_to_output_weights->data.f, n_cell, cell_state->data.f, n_batch,
+          output_gate_scratch);
+    }
+    tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
+                                       output_gate_scratch);
+    tensor_utils::ApplyActivationToVector(cell_state->data.f, n_batch * n_cell,
+                                          params->activation, cell_scratch);
+    tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
+                                           n_batch * n_cell,
+                                           output_gate_scratch);
+
+    // For each batch: update the projection and output_state.
+    const bool use_projection_weight = (projection_weights != nullptr);
+    const bool use_projection_bias = (projection_bias != nullptr);
+    float* output_ptr_time = output->data.f + t * n_batch * n_output;
+    if (use_projection_weight) {
+      if (use_projection_bias) {
+        tensor_utils::VectorBatchVectorAssign(projection_bias->data.f, n_output,
+                                              n_batch, output_ptr_time);
+      } else {
+        tensor_utils::ZeroVector(output_ptr_time, n_batch * n_output);
+      }
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          projection_weights->data.f, n_output, n_cell, output_gate_scratch,
+          n_batch, output_ptr_time, /*result_stride=*/1);
+      if (params->proj_clip > 0.0) {
+        tensor_utils::ClipVector(output_ptr_time, n_batch * n_output,
+                                 params->proj_clip, output_ptr_time);
+      }
+    } else {
+      tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output,
+                               output_ptr_time);
+    }
+    tensor_utils::CopyVector(output_ptr_time, n_batch * n_output,
+                             output_state->data.f);
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace unidirectional_sequence_lstm
+
+TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_LSTM() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 unidirectional_sequence_lstm::Prepare,
+                                 unidirectional_sequence_lstm::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm_test.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..93b635ae576e99854796d9fa997e5bf355b20534
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm_test.cc
@@ -0,0 +1,1089 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite Sequential LSTM op.
+
+#include <iomanip>
+#include <memory>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class UnidirectionalLSTMOpModel : public SingleOpModel {
+ public:
+  UnidirectionalLSTMOpModel(int n_batch, int n_input, int n_cell, int n_output,
+                            int sequence_length, bool use_cifg,
+                            bool use_peephole, bool use_projection_weights,
+                            bool use_projection_bias, float cell_clip,
+                            float proj_clip,
+                            const std::vector<std::vector<int>>& input_shapes)
+      : n_batch_(n_batch),
+        n_input_(n_input),
+        n_cell_(n_cell),
+        n_output_(n_output),
+        sequence_length_(sequence_length) {
+    input_ = AddInput(TensorType_FLOAT32);
+
+    if (use_cifg) {
+      input_to_input_weights_ = AddNullInput();
+    } else {
+      input_to_input_weights_ = AddInput(TensorType_FLOAT32);
+    }
+
+    input_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+    input_to_cell_weights_ = AddInput(TensorType_FLOAT32);
+    input_to_output_weights_ = AddInput(TensorType_FLOAT32);
+
+    if (use_cifg) {
+      recurrent_to_input_weights_ = AddNullInput();
+    } else {
+      recurrent_to_input_weights_ = AddInput(TensorType_FLOAT32);
+    }
+
+    recurrent_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+    recurrent_to_cell_weights_ = AddInput(TensorType_FLOAT32);
+    recurrent_to_output_weights_ = AddInput(TensorType_FLOAT32);
+
+    if (use_peephole) {
+      if (use_cifg) {
+        cell_to_input_weights_ = AddNullInput();
+      } else {
+        cell_to_input_weights_ = AddInput(TensorType_FLOAT32);
+      }
+      cell_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+      cell_to_output_weights_ = AddInput(TensorType_FLOAT32);
+    } else {
+      cell_to_input_weights_ = AddNullInput();
+      cell_to_forget_weights_ = AddNullInput();
+      cell_to_output_weights_ = AddNullInput();
+    }
+
+    if (use_cifg) {
+      input_gate_bias_ = AddNullInput();
+    } else {
+      input_gate_bias_ = AddInput(TensorType_FLOAT32);
+    }
+    forget_gate_bias_ = AddInput(TensorType_FLOAT32);
+    cell_bias_ = AddInput(TensorType_FLOAT32);
+    output_gate_bias_ = AddInput(TensorType_FLOAT32);
+
+    if (use_projection_weights) {
+      projection_weights_ = AddInput(TensorType_FLOAT32);
+      if (use_projection_bias) {
+        projection_bias_ = AddInput(TensorType_FLOAT32);
+      } else {
+        projection_bias_ = AddNullInput();
+      }
+    } else {
+      projection_weights_ = AddNullInput();
+      projection_bias_ = AddNullInput();
+    }
+
+    scratch_buffer_ = AddOutput(TensorType_FLOAT32);
+    // TODO(ghodrat): Modify these states when we have a permanent solution for
+    // persistent buffer.
+    output_state_ = AddOutput(TensorType_FLOAT32);
+    cell_state_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
+                 BuiltinOptions_LSTMOptions,
+                 CreateLSTMOptions(builder_, ActivationFunctionType_TANH,
+                                   cell_clip, proj_clip)
+                     .Union());
+    BuildInterpreter(input_shapes);
+  }
+
+  void SetInputToInputWeights(std::initializer_list<float> f) {
+    PopulateTensor(input_to_input_weights_, f);
+  }
+
+  void SetInputToForgetWeights(std::initializer_list<float> f) {
+    PopulateTensor(input_to_forget_weights_, f);
+  }
+
+  void SetInputToCellWeights(std::initializer_list<float> f) {
+    PopulateTensor(input_to_cell_weights_, f);
+  }
+
+  void SetInputToOutputWeights(std::initializer_list<float> f) {
+    PopulateTensor(input_to_output_weights_, f);
+  }
+
+  void SetRecurrentToInputWeights(std::initializer_list<float> f) {
+    PopulateTensor(recurrent_to_input_weights_, f);
+  }
+
+  void SetRecurrentToForgetWeights(std::initializer_list<float> f) {
+    PopulateTensor(recurrent_to_forget_weights_, f);
+  }
+
+  void SetRecurrentToCellWeights(std::initializer_list<float> f) {
+    PopulateTensor(recurrent_to_cell_weights_, f);
+  }
+
+  void SetRecurrentToOutputWeights(std::initializer_list<float> f) {
+    PopulateTensor(recurrent_to_output_weights_, f);
+  }
+
+  void SetCellToInputWeights(std::initializer_list<float> f) {
+    PopulateTensor(cell_to_input_weights_, f);
+  }
+
+  void SetCellToForgetWeights(std::initializer_list<float> f) {
+    PopulateTensor(cell_to_forget_weights_, f);
+  }
+
+  void SetCellToOutputWeights(std::initializer_list<float> f) {
+    PopulateTensor(cell_to_output_weights_, f);
+  }
+
+  void SetInputGateBias(std::initializer_list<float> f) {
+    PopulateTensor(input_gate_bias_, f);
+  }
+
+  void SetForgetGateBias(std::initializer_list<float> f) {
+    PopulateTensor(forget_gate_bias_, f);
+  }
+
+  void SetCellBias(std::initializer_list<float> f) {
+    PopulateTensor(cell_bias_, f);
+  }
+
+  void SetOutputGateBias(std::initializer_list<float> f) {
+    PopulateTensor(output_gate_bias_, f);
+  }
+
+  void SetProjectionWeights(std::initializer_list<float> f) {
+    PopulateTensor(projection_weights_, f);
+  }
+
+  void SetProjectionBias(std::initializer_list<float> f) {
+    PopulateTensor(projection_bias_, f);
+  }
+
+  void ResetOutputState() {
+    const int zero_buffer_size = n_cell_ * n_batch_;
+    std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
+    memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
+    PopulateTensor(output_state_, 0, zero_buffer.get(),
+                   zero_buffer.get() + zero_buffer_size);
+  }
+
+  void ResetCellState() {
+    const int zero_buffer_size = n_cell_ * n_batch_;
+    std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
+    memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
+    PopulateTensor(cell_state_, 0, zero_buffer.get(),
+                   zero_buffer.get() + zero_buffer_size);
+  }
+
+  void SetInput(int offset, float* begin, float* end) {
+    PopulateTensor(input_, offset, begin, end);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+  int num_inputs() { return n_input_; }
+  int num_outputs() { return n_output_; }
+  int num_cells() { return n_cell_; }
+  int num_batches() { return n_batch_; }
+  int sequence_length() { return sequence_length_; }
+
+ private:
+  int input_;
+  int input_to_input_weights_;
+  int input_to_forget_weights_;
+  int input_to_cell_weights_;
+  int input_to_output_weights_;
+
+  int recurrent_to_input_weights_;
+  int recurrent_to_forget_weights_;
+  int recurrent_to_cell_weights_;
+  int recurrent_to_output_weights_;
+
+  int cell_to_input_weights_;
+  int cell_to_forget_weights_;
+  int cell_to_output_weights_;
+
+  int input_gate_bias_;
+  int forget_gate_bias_;
+  int cell_bias_;
+  int output_gate_bias_;
+
+  int projection_weights_;
+  int projection_bias_;
+
+  int output_;
+  int output_state_;
+  int cell_state_;
+  int scratch_buffer_;
+
+  int n_batch_;
+  int n_input_;
+  int n_cell_;
+  int n_output_;
+  int sequence_length_;
+};
+
+TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+  const int sequence_length = 3;
+
+  UnidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
+      /*use_peephole=*/false, /*use_projection_weights=*/false,
+      /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {sequence_length, n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},  // cell_to_input_weight tensor
+          {0},  // cell_to_forget_weight tensor
+          {0},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+      });
+
+  lstm.SetInputToInputWeights({-0.45018822, -0.02338299, -0.0870589,
+                               -0.34550029, 0.04266912, -0.15680569,
+                               -0.34856534, 0.43890524});
+
+  lstm.SetInputToCellWeights({-0.50013041, 0.1370284, 0.11810488, 0.2013163,
+                              -0.20583314, 0.44344562, 0.22077113,
+                              -0.29909778});
+
+  lstm.SetInputToForgetWeights({0.09701663, 0.20334584, -0.50592935,
+                                -0.31343272, -0.40032279, 0.44781327,
+                                0.01387155, -0.35593212});
+
+  lstm.SetInputToOutputWeights({-0.25065863, -0.28290087, 0.04613829,
+                                0.40525138, 0.44272184, 0.03897077, -0.1556896,
+                                0.19487578});
+
+  lstm.SetInputGateBias({0., 0., 0., 0.});
+
+  lstm.SetCellBias({0., 0., 0., 0.});
+
+  lstm.SetForgetGateBias({1., 1., 1., 1.});
+
+  lstm.SetOutputGateBias({0., 0., 0., 0.});
+
+  lstm.SetRecurrentToInputWeights(
+      {-0.0063535, -0.2042388, 0.31454784, -0.35746509, 0.28902304, 0.08183324,
+       -0.16555229, 0.02286911, -0.13566875, 0.03034258, 0.48091322,
+       -0.12528998, 0.24077177, -0.51332325, -0.33502164, 0.10629296});
+
+  lstm.SetRecurrentToCellWeights(
+      {-0.3407414, 0.24443203, -0.2078532, 0.26320225, 0.05695659, -0.00123841,
+       -0.4744786, -0.35869038, -0.06418842, -0.13502428, -0.501764, 0.22830659,
+       -0.46367589, 0.26016325, -0.03894562, -0.16368064});
+
+  lstm.SetRecurrentToForgetWeights(
+      {-0.48684245, -0.06655136, 0.42224967, 0.2112639, 0.27654213, 0.20864892,
+       -0.07646349, 0.45877004, 0.00141793, -0.14609534, 0.36447752, 0.09196436,
+       0.28053468, 0.01560611, -0.20127171, -0.01140004});
+
+  lstm.SetRecurrentToOutputWeights(
+      {0.43385774, -0.17194885, 0.2718237, 0.09215671, 0.24107647, -0.39835793,
+       0.18212086, 0.01301402, 0.48572797, -0.50656658, 0.20047462, -0.20607421,
+       -0.51818722, -0.15390486, 0.0468148, 0.39922136});
+
+  // Input should have n_input * sequence_length many values.
+  static float lstm_input[] = {2., 3., 3., 4., 1., 1.};
+  static float lstm_golden_output[] = {-0.02973187, 0.1229473,   0.20885126,
+                                       -0.15358765, -0.03716109, 0.12507336,
+                                       0.41193449,  -0.20860538, -0.15053082,
+                                       0.09120187,  0.24278517,  -0.12222792};
+
+  // Resetting cell_state and output_state
+  lstm.ResetCellState();
+  lstm.ResetOutputState();
+
+  float* batch0_start = lstm_input;
+  float* batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length();
+
+  lstm.SetInput(0, batch0_start, batch0_end);
+
+  lstm.Invoke();
+
+  float* golden_start = lstm_golden_output;
+  float* golden_end =
+      golden_start + lstm.num_outputs() * lstm.sequence_length();
+  std::vector<float> expected;
+  expected.insert(expected.end(), golden_start, golden_end);
+  EXPECT_THAT(lstm.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+}
+
+TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+  const int sequence_length = 3;
+
+  UnidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/true,
+      /*use_peephole=*/true, /*use_projection_weights=*/false,
+      /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {sequence_length, n_batch, n_input},  // input tensor
+
+          {0, 0},             // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {0, 0},              // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},       // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {0},       // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+      });
+
+  lstm.SetInputToCellWeights({-0.49770179, -0.27711356, -0.09624726, 0.05100781,
+                              0.04717243, 0.48944736, -0.38535351,
+                              -0.17212132});
+
+  lstm.SetInputToForgetWeights({-0.55291498, -0.42866567, 0.13056988,
+                                -0.3633365, -0.22755712, 0.28253698, 0.24407166,
+                                0.33826375});
+
+  lstm.SetInputToOutputWeights({0.10725588, -0.02335852, -0.55932593,
+                                -0.09426838, -0.44257352, 0.54939759,
+                                0.01533556, 0.42751634});
+
+  lstm.SetCellBias({0., 0., 0., 0.});
+
+  lstm.SetForgetGateBias({1., 1., 1., 1.});
+
+  lstm.SetOutputGateBias({0., 0., 0., 0.});
+
+  lstm.SetRecurrentToCellWeights(
+      {0.54066205, -0.32668582, -0.43562764, -0.56094903, 0.42957711,
+       0.01841056, -0.32764608, -0.33027974, -0.10826075, 0.20675004,
+       0.19069612, -0.03026325, -0.54532051, 0.33003211, 0.44901288,
+       0.21193194});
+
+  lstm.SetRecurrentToForgetWeights(
+      {-0.13832897, -0.0515101, -0.2359007, -0.16661474, -0.14340827,
+       0.36986142, 0.23414481, 0.55899, 0.10798943, -0.41174671, 0.17751795,
+       -0.34484994, -0.35874045, -0.11352962, 0.27268326, 0.54058349});
+
+  lstm.SetRecurrentToOutputWeights(
+      {0.41613156, 0.42610586, -0.16495961, -0.5663873, 0.30579174, -0.05115908,
+       -0.33941799, 0.23364776, 0.11178309, 0.09481031, -0.26424935, 0.46261835,
+       0.50248802, 0.26114327, -0.43736315, 0.33149987});
+
+  lstm.SetCellToForgetWeights(
+      {0.47485286, -0.51955009, -0.24458408, 0.31544167});
+  lstm.SetCellToOutputWeights(
+      {-0.17135078, 0.82760304, 0.85573703, -0.77109635});
+
+  static float lstm_input[] = {2., 3., 3., 4., 1., 1.};
+  static float lstm_golden_output[] = {-0.36444446, -0.00352185, 0.12886585,
+                                       -0.05163646, -0.42312205, -0.01218222,
+                                       0.24201041,  -0.08124574, -0.358325,
+                                       -0.04621704, 0.21641694,  -0.06471302};
+
+  // Resetting cell_state and output_state
+  lstm.ResetCellState();
+  lstm.ResetOutputState();
+
+  float* batch0_start = lstm_input;
+  float* batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length();
+
+  lstm.SetInput(0, batch0_start, batch0_end);
+
+  lstm.Invoke();
+
+  float* golden_start = lstm_golden_output;
+  float* golden_end =
+      golden_start + lstm.num_outputs() * lstm.sequence_length();
+  std::vector<float> expected;
+  expected.insert(expected.end(), golden_start, golden_end);
+  EXPECT_THAT(lstm.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+}
+
+TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 20;
+  const int n_output = 16;
+  const int sequence_length = 4;
+
+  UnidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
+      /*use_peephole=*/true, /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false,
+      /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {sequence_length, n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {n_cell},  // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+      });
+
+  lstm.SetInputToInputWeights(
+      {0.021393683,  0.06124551,    0.046905167,  -0.014657677,  -0.03149463,
+       0.09171803,   0.14647801,    0.10797193,   -0.0057968358, 0.0019193048,
+       -0.2726754,   0.10154029,    -0.018539885, 0.080349885,   -0.10262385,
+       -0.022599787, -0.09121155,   -0.008675967, -0.045206103,  -0.0821282,
+       -0.008045952, 0.015478081,   0.055217247,  0.038719587,   0.044153627,
+       -0.06453243,  0.05031825,    -0.046935108, -0.008164439,  0.014574226,
+       -0.1671009,   -0.15519552,   -0.16819797,  -0.13971269,   -0.11953059,
+       0.25005487,   -0.22790983,   0.009855087,  -0.028140958,  -0.11200698,
+       0.11295408,   -0.0035217577, 0.054485075,  0.05184695,    0.064711206,
+       0.10989193,   0.11674786,    0.03490607,   0.07727357,    0.11390585,
+       -0.1863375,   -0.1034451,    -0.13945189,  -0.049401227,  -0.18767063,
+       0.042483903,  0.14233552,    0.13832581,   0.18350165,    0.14545603,
+       -0.028545704, 0.024939531,   0.050929718,  0.0076203286,  -0.0029723682,
+       -0.042484224, -0.11827596,   -0.09171104,  -0.10808628,   -0.16327988,
+       -0.2273378,   -0.0993647,    -0.017155107, 0.0023917493,  0.049272764,
+       0.0038534778, 0.054764505,   0.089753784,  0.06947234,    0.08014476,
+       -0.04544234,  -0.0497073,    -0.07135631,  -0.048929106,  -0.004042012,
+       -0.009284026, 0.018042054,   0.0036860977, -0.07427302,   -0.11434604,
+       -0.018995456, 0.031487543,   0.012834908,  0.019977754,   0.044256654,
+       -0.39292613,  -0.18519334,   -0.11651281,  -0.06809892,   0.011373677});
+
+  lstm.SetInputToForgetWeights(
+      {-0.0018401089, -0.004852237,  0.03698424,   0.014181704,   0.028273236,
+       -0.016726194,  -0.05249759,   -0.10204261,  0.00861066,    -0.040979505,
+       -0.009899187,  0.01923892,    -0.028177269, -0.08535103,   -0.14585495,
+       0.10662567,    -0.01909731,   -0.017883534, -0.0047269356, -0.045103323,
+       0.0030784295,  0.076784775,   0.07463696,   0.094531395,   0.0814421,
+       -0.12257899,   -0.033945758,  -0.031303465, 0.045630626,   0.06843887,
+       -0.13492945,   -0.012480007,  -0.0811829,   -0.07224499,   -0.09628791,
+       0.045100946,   0.0012300825,  0.013964662,  0.099372394,   0.02543059,
+       0.06958324,    0.034257296,   0.0482646,    0.06267997,    0.052625068,
+       0.12784666,    0.07077897,    0.025725935,  0.04165009,    0.07241905,
+       0.018668644,   -0.037377294,  -0.06277783,  -0.08833636,   -0.040120605,
+       -0.011405586,  -0.007808335,  -0.010301386, -0.005102167,  0.027717464,
+       0.05483423,    0.11449111,    0.11289652,   0.10939839,    0.13396506,
+       -0.08402166,   -0.01901462,   -0.044678304, -0.07720565,   0.014350063,
+       -0.11757958,   -0.0652038,    -0.08185733,  -0.076754324,  -0.092614375,
+       0.10405491,    0.052960336,   0.035755895,  0.035839386,   -0.012540553,
+       0.036881298,   0.02913376,    0.03420159,   0.05448447,    -0.054523353,
+       0.02582715,    0.02327355,    -0.011857179, -0.0011980024, -0.034641717,
+       -0.026125094,  -0.17582615,   -0.15923657,  -0.27486774,   -0.0006143371,
+       0.0001771948,  -8.470171e-05, 0.02651807,   0.045790765,   0.06956496});
+
+  lstm.SetInputToCellWeights(
+      {-0.04580283,   -0.09549462,   -0.032418985,  -0.06454633,
+       -0.043528453,  0.043018587,   -0.049152344,  -0.12418144,
+       -0.078985475,  -0.07596889,   0.019484362,   -0.11434962,
+       -0.0074034138, -0.06314844,   -0.092981495,  0.0062155537,
+       -0.025034338,  -0.0028890965, 0.048929527,   0.06235075,
+       0.10665918,    -0.032036792,  -0.08505916,   -0.10843358,
+       -0.13002433,   -0.036816437,  -0.02130134,   -0.016518239,
+       0.0047691227,  -0.0025825808, 0.066017866,   0.029991534,
+       -0.10652836,   -0.1037554,    -0.13056071,   -0.03266643,
+       -0.033702414,  -0.006473424,  -0.04611692,   0.014419339,
+       -0.025174323,  0.0396852,     0.081777506,   0.06157468,
+       0.10210095,    -0.009658194,  0.046511717,   0.03603906,
+       0.0069369148,  0.015960095,   -0.06507666,   0.09551598,
+       0.053568836,   0.06408714,    0.12835667,    -0.008714329,
+       -0.20211966,   -0.12093674,   0.029450472,   0.2849013,
+       -0.029227901,  0.1164364,     -0.08560263,   0.09941786,
+       -0.036999565,  -0.028842626,  -0.0033637602, -0.017012902,
+       -0.09720865,   -0.11193351,   -0.029155117,  -0.017936034,
+       -0.009768936,  -0.04223324,   -0.036159635,  0.06505112,
+       -0.021742892,  -0.023377212,  -0.07221364,   -0.06430552,
+       0.05453865,    0.091149814,   0.06387331,    0.007518393,
+       0.055960953,   0.069779344,   0.046411168,   0.10509911,
+       0.07463894,    0.0075130584,  0.012850982,   0.04555431,
+       0.056955688,   0.06555285,    0.050801456,   -0.009862683,
+       0.00826772,    -0.026555609,  -0.0073611983, -0.0014897042});
+
+  lstm.SetInputToOutputWeights(
+      {-0.0998932,   -0.07201956,  -0.052803773,  -0.15629593,  -0.15001918,
+       -0.07650751,  0.02359855,   -0.075155355,  -0.08037709,  -0.15093534,
+       0.029517552,  -0.04751393,  0.010350531,   -0.02664851,  -0.016839722,
+       -0.023121163, 0.0077019283, 0.012851257,   -0.05040649,  -0.0129761,
+       -0.021737747, -0.038305793, -0.06870586,   -0.01481247,  -0.001285394,
+       0.10124236,   0.083122835,  0.053313006,   -0.062235646, -0.075637154,
+       -0.027833903, 0.029774971,  0.1130802,     0.09218906,   0.09506135,
+       -0.086665764, -0.037162706, -0.038880914,  -0.035832845, -0.014481564,
+       -0.09825003,  -0.12048569,  -0.097665586,  -0.05287633,  -0.0964047,
+       -0.11366429,  0.035777505,  0.13568819,    0.052451383,  0.050649304,
+       0.05798951,   -0.021852335, -0.099848844,  0.014740475,  -0.078897946,
+       0.04974699,   0.014160473,  0.06973932,    0.04964942,   0.033364646,
+       0.08190124,   0.025535367,  0.050893165,   0.048514254,  0.06945813,
+       -0.078907564, -0.06707616,  -0.11844508,   -0.09986688,  -0.07509403,
+       0.06263226,   0.14925587,   0.20188436,    0.12098451,   0.14639415,
+       0.0015017595, -0.014267382, -0.03417257,   0.012711468,  0.0028300495,
+       -0.024758482, -0.05098548,  -0.0821182,    0.014225672,  0.021544158,
+       0.08949725,   0.07505268,   -0.0020780868, 0.04908258,   0.06476295,
+       -0.022907063, 0.027562456,  0.040185735,   0.019567577,  -0.015598739,
+       -0.049097303, -0.017121866, -0.083368234,  -0.02332002,  -0.0840956});
+
+  lstm.SetInputGateBias(
+      {0.02234832,  0.14757581,   0.18176508,  0.10380666,  0.053110216,
+       -0.06928846, -0.13942584,  -0.11816189, 0.19483899,  0.03652339,
+       -0.10250295, 0.036714908,  -0.18426876, 0.036065217, 0.21810818,
+       0.02383196,  -0.043370757, 0.08690144,  -0.04444982, 0.00030581196});
+
+  lstm.SetForgetGateBias({0.035185695, -0.042891346, -0.03032477, 0.23027696,
+                          0.11098921,  0.15378423,   0.09263801,  0.09790885,
+                          0.09508917,  0.061199076,  0.07665568,  -0.015443159,
+                          -0.03499149, 0.046190713,  0.08895977,  0.10899629,
+                          0.40694186,  0.06030037,   0.012413437, -0.06108739});
+
+  lstm.SetCellBias({-0.024379363, 0.0055531194, 0.23377132,   0.033463873,
+                    -0.1483596,   -0.10639995,  -0.091433935, 0.058573797,
+                    -0.06809782,  -0.07889636,  -0.043246906, -0.09829136,
+                    -0.4279842,   0.034901652,  0.18797937,   0.0075234566,
+                    0.016178843,  0.1749513,    0.13975595,   0.92058027});
+
+  lstm.SetOutputGateBias(
+      {0.046159424,  -0.0012809046, 0.03563469,   0.12648113, 0.027195795,
+       0.35373217,   -0.018957434,  0.008907322,  -0.0762701, 0.12018895,
+       0.04216877,   0.0022856654,  0.040952638,  0.3147856,  0.08225149,
+       -0.057416286, -0.14995944,   -0.008040261, 0.13208859, 0.029760877});
+
+  lstm.SetRecurrentToInputWeights(
+      {-0.001374326,   -0.078856036,   0.10672688,    0.029162422,
+       -0.11585556,    0.02557986,     -0.13446963,   -0.035785314,
+       -0.01244275,    0.025961924,    -0.02337298,   -0.044228926,
+       -0.055839065,   -0.046598054,   -0.010546039,  -0.06900766,
+       0.027239809,    0.022582639,    -0.013296484,  -0.05459212,
+       0.08981,        -0.045407712,   0.08682226,    -0.06867011,
+       -0.14390695,    -0.02916037,    0.000996957,   0.091420636,
+       0.14283475,     -0.07390571,    -0.06402044,   0.062524505,
+       -0.093129106,   0.04860203,     -0.08364217,   -0.08119002,
+       0.009352075,    0.22920375,     0.0016303885,  0.11583097,
+       -0.13732095,    0.012405723,    -0.07551853,   0.06343048,
+       0.12162708,     -0.031923793,   -0.014335606,  0.01790974,
+       -0.10650317,    -0.0724401,     0.08554849,    -0.05727212,
+       0.06556731,     -0.042729504,   -0.043227166,  0.011683251,
+       -0.013082158,   -0.029302018,   -0.010899579,  -0.062036745,
+       -0.022509435,   -0.00964907,    -0.01567329,   0.04260106,
+       -0.07787477,    -0.11576462,    0.017356863,   0.048673786,
+       -0.017577527,   -0.05527947,    -0.082487635,  -0.040137455,
+       -0.10820036,    -0.04666372,    0.022746278,   -0.07851417,
+       0.01068115,     0.032956902,    0.022433773,   0.0026891115,
+       0.08944216,     -0.0685835,     0.010513544,   0.07228705,
+       0.02032331,     -0.059686817,   -0.0005566496, -0.086984694,
+       0.040414046,    -0.1380399,     0.094208956,   -0.05722982,
+       0.012092817,    -0.04989123,    -0.086576,     -0.003399834,
+       -0.04696032,    -0.045747425,   0.10091314,    0.048676282,
+       -0.029037097,   0.031399418,    -0.0040285117, 0.047237843,
+       0.09504992,     0.041799378,    -0.049185462,  -0.031518843,
+       -0.10516937,    0.026374253,    0.10058866,    -0.0033195973,
+       -0.041975245,   0.0073591834,   0.0033782164,  -0.004325073,
+       -0.10167381,    0.042500053,    -0.01447153,   0.06464186,
+       -0.017142897,   0.03312627,     0.009205989,   0.024138335,
+       -0.011337001,   0.035530265,    -0.010912711,  0.0706555,
+       -0.005894094,   0.051841937,    -0.1401738,    -0.02351249,
+       0.0365468,      0.07590991,     0.08838724,    0.021681072,
+       -0.10086113,    0.019608743,    -0.06195883,   0.077335775,
+       0.023646897,    -0.095322326,   0.02233014,    0.09756986,
+       -0.048691444,   -0.009579111,   0.07595467,    0.11480546,
+       -0.09801813,    0.019894179,    0.08502348,    0.004032281,
+       0.037211012,    0.068537936,    -0.048005626,  -0.091520436,
+       -0.028379958,   -0.01556313,    0.06554592,    -0.045599163,
+       -0.01672207,    -0.020169014,   -0.011877351,  -0.20212261,
+       0.010889619,    0.0047078193,   0.038385306,   0.08540671,
+       -0.017140968,   -0.0035865551,  0.016678626,   0.005633034,
+       0.015963363,    0.00871737,     0.060130805,   0.028611384,
+       0.10109069,     -0.015060172,   -0.07894427,   0.06401885,
+       0.011584063,    -0.024466386,   0.0047652307,  -0.09041358,
+       0.030737216,    -0.0046374933,  0.14215417,    -0.11823516,
+       0.019899689,    0.006106124,    -0.027092824,  0.0786356,
+       0.05052217,     -0.058925,      -0.011402121,  -0.024987547,
+       -0.0013661642,  -0.06832946,    -0.015667673,  -0.1083353,
+       -0.00096863037, -0.06988685,    -0.053350925,  -0.027275559,
+       -0.033664223,   -0.07978348,    -0.025200296,  -0.017207067,
+       -0.058403496,   -0.055697463,   0.005798788,   0.12965427,
+       -0.062582195,   0.0013350133,   -0.10482091,   0.0379771,
+       0.072521195,    -0.0029455067,  -0.13797039,   -0.03628521,
+       0.013806405,    -0.017858358,   -0.01008298,   -0.07700066,
+       -0.017081132,   0.019358726,    0.0027079724,  0.004635139,
+       0.062634714,    -0.02338735,    -0.039547626,  -0.02050681,
+       0.03385117,     -0.083611414,   0.002862572,   -0.09421313,
+       0.058618143,    -0.08598433,    0.00972939,    0.023867095,
+       -0.053934585,   -0.023203006,   0.07452513,    -0.048767887,
+       -0.07314807,    -0.056307215,   -0.10433547,   -0.06440842,
+       0.04328182,     0.04389765,     -0.020006588,  -0.09076438,
+       -0.11652589,    -0.021705797,   0.03345259,    -0.010329105,
+       -0.025767034,   0.013057034,    -0.07316461,   -0.10145612,
+       0.06358255,     0.18531723,     0.07759293,    0.12006465,
+       0.1305557,      0.058638252,    -0.03393652,   0.09622831,
+       -0.16253184,    -2.4580743e-06, 0.079869635,   -0.070196845,
+       -0.005644518,   0.06857898,     -0.12598175,   -0.035084512,
+       0.03156317,     -0.12794146,    -0.031963028,  0.04692781,
+       0.030070418,    0.0071660685,   -0.095516115,  -0.004643372,
+       0.040170413,    -0.062104587,   -0.0037324072, 0.0554317,
+       0.08184801,     -0.019164372,   0.06791302,    0.034257166,
+       -0.10307039,    0.021943003,    0.046745934,   0.0790918,
+       -0.0265588,     -0.007824208,   0.042546265,   -0.00977924,
+       -0.0002440307,  -0.017384544,   -0.017990116,  0.12252321,
+       -0.014512694,   -0.08251313,    0.08861942,    0.13589665,
+       0.026351685,    0.012641483,    0.07466548,    0.044301085,
+       -0.045414884,   -0.051112458,   0.03444247,    -0.08502782,
+       -0.04106223,    -0.028126027,   0.028473156,   0.10467447});
+
+  lstm.SetRecurrentToForgetWeights(
+      {-0.057784554,  -0.026057621,  -0.068447545,   -0.022581743,
+       0.14811787,    0.10826372,    0.09471067,     0.03987225,
+       -0.0039523416, 0.00030638507, 0.053185795,    0.10572994,
+       0.08414449,    -0.022036452,  -0.00066928595, -0.09203576,
+       0.032950465,   -0.10985798,   -0.023809856,   0.0021431844,
+       -0.02196096,   -0.00326074,   0.00058621005,  -0.074678116,
+       -0.06193199,   0.055729095,   0.03736828,     0.020123724,
+       0.061878487,   -0.04729229,   0.034919553,    -0.07585433,
+       -0.04421272,   -0.044019096,  0.085488975,    0.04058006,
+       -0.06890133,   -0.030951202,  -0.024628663,   -0.07672815,
+       0.034293607,   0.08556707,    -0.05293577,    -0.033561368,
+       -0.04899627,   0.0241671,     0.015736353,    -0.095442444,
+       -0.029564252,  0.016493602,   -0.035026584,   0.022337519,
+       -0.026871363,  0.004780428,   0.0077918363,   -0.03601621,
+       0.016435321,   -0.03263031,   -0.09543275,    -0.047392778,
+       0.013454138,   0.028934088,   0.01685226,     -0.086110644,
+       -0.046250615,  -0.01847454,   0.047608484,    0.07339695,
+       0.034546845,   -0.04881143,   0.009128804,    -0.08802852,
+       0.03761666,    0.008096139,   -0.014454086,   0.014361001,
+       -0.023502491,  -0.0011840804, -0.07607001,    0.001856849,
+       -0.06509276,   -0.006021153,  -0.08570962,    -0.1451793,
+       0.060212336,   0.055259194,   0.06974018,     0.049454916,
+       -0.027794661,  -0.08077226,   -0.016179763,   0.1169753,
+       0.17213494,    -0.0056326236, -0.053934924,   -0.0124349,
+       -0.11520337,   0.05409887,    0.088759385,    0.0019655675,
+       0.0042065294,  0.03881498,    0.019844765,    0.041858196,
+       -0.05695512,   0.047233116,   0.038937137,    -0.06542224,
+       0.014429736,   -0.09719407,   0.13908425,     -0.05379757,
+       0.012321099,   0.082840554,   -0.029899208,   0.044217527,
+       0.059855383,   0.07711018,    -0.045319796,   0.0948846,
+       -0.011724666,  -0.0033288454, -0.033542685,   -0.04764985,
+       -0.13873616,   0.040668588,   0.034832682,    -0.015319203,
+       -0.018715994,  0.046002675,   0.0599172,      -0.043107376,
+       0.0294216,     -0.002314414,  -0.022424703,   0.0030315618,
+       0.0014641669,  0.0029166266,  -0.11878115,    0.013738511,
+       0.12375372,    -0.0006038222, 0.029104086,    0.087442465,
+       0.052958444,   0.07558703,    0.04817258,     0.044462286,
+       -0.015213451,  -0.08783778,   -0.0561384,     -0.003008196,
+       0.047060397,   -0.002058388,  0.03429439,     -0.018839769,
+       0.024734668,   0.024614193,   -0.042046934,   0.09597743,
+       -0.0043254104, 0.04320769,    0.0064070094,   -0.0019131786,
+       -0.02558259,   -0.022822596,  -0.023273505,   -0.02464396,
+       -0.10991725,   -0.006240552,  0.0074488563,   0.024044557,
+       0.04383914,    -0.046476185,  0.028658995,    0.060410924,
+       0.050786525,   0.009452605,   -0.0073054377,  -0.024810238,
+       0.0052906186,  0.0066939713,  -0.0020913032,  0.014515517,
+       0.015898481,   0.021362653,   -0.030262267,   0.016587038,
+       -0.011442813,  0.041154444,   -0.007631438,   -0.03423484,
+       -0.010977775,  0.036152758,   0.0066366293,   0.11915515,
+       0.02318443,    -0.041350313,  0.021485701,    -0.10906167,
+       -0.028218046,  -0.00954771,   0.020531068,    -0.11995105,
+       -0.03672871,   0.024019798,   0.014255957,    -0.05221243,
+       -0.00661567,   -0.04630967,   0.033188973,    0.10107534,
+       -0.014027541,  0.030796422,   -0.10270911,    -0.035999842,
+       0.15443139,    0.07684145,    0.036571592,    -0.035900835,
+       -0.0034699554, 0.06209149,    0.015920248,    -0.031122351,
+       -0.03858649,   0.01849943,    0.13872518,     0.01503974,
+       0.069941424,   -0.06948533,   -0.0088794185,  0.061282158,
+       -0.047401894,  0.03100163,    -0.041533746,   -0.10430945,
+       0.044574402,   -0.01425562,   -0.024290353,   0.034563623,
+       0.05866852,    0.023947537,   -0.09445152,    0.035450947,
+       0.02247216,    -0.0042998926, 0.061146557,    -0.10250651,
+       0.020881841,   -0.06747029,   0.10062043,     -0.0023941975,
+       0.03532124,    -0.016341697,  0.09685456,     -0.016764693,
+       0.051808182,   0.05875331,    -0.04536488,    0.001626336,
+       -0.028892258,  -0.01048663,   -0.009793449,   -0.017093895,
+       0.010987891,   0.02357273,    -0.00010856845, 0.0099760275,
+       -0.001845119,  -0.03551521,   0.0018358806,   0.05763657,
+       -0.01769146,   0.040995963,   0.02235177,     -0.060430344,
+       0.11475477,    -0.023854522,  0.10071741,     0.0686208,
+       -0.014250481,  0.034261297,   0.047418304,    0.08562733,
+       -0.030519066,  0.0060542435,  0.014653856,    -0.038836084,
+       0.04096551,    0.032249358,   -0.08355519,    -0.026823482,
+       0.056386515,   -0.010401743,  -0.028396193,   0.08507674,
+       0.014410365,   0.020995233,   0.17040324,     0.11511526,
+       0.02459721,    0.0066619175,  0.025853224,    -0.023133837,
+       -0.081302024,  0.017264642,   -0.009585969,   0.09491168,
+       -0.051313367,  0.054532815,   -0.014298593,   0.10657464,
+       0.007076659,   0.10964551,    0.0409152,      0.008275321,
+       -0.07283536,   0.07937492,    0.04192024,     -0.1075027});
+
+  lstm.SetRecurrentToCellWeights(
+      {-0.037322544,   0.018592842,   0.0056175636,  -0.06253426,
+       0.055647098,    -0.05713207,   -0.05626563,   0.005559383,
+       0.03375411,     -0.025757805,  -0.088049285,  0.06017052,
+       -0.06570978,    0.007384076,   0.035123326,   -0.07920549,
+       0.053676967,    0.044480428,   -0.07663568,   0.0071805613,
+       0.08089997,     0.05143358,    0.038261272,   0.03339287,
+       -0.027673481,   0.044746667,   0.028349208,   0.020090483,
+       -0.019443132,   -0.030755889,  -0.0040000007, 0.04465846,
+       -0.021585021,   0.0031670958,  0.0053199246,  -0.056117613,
+       -0.10893326,    0.076739706,   -0.08509834,   -0.027997585,
+       0.037871376,    0.01449768,    -0.09002357,   -0.06111149,
+       -0.046195522,   0.0422062,     -0.005683705,  -0.1253618,
+       -0.012925729,   -0.04890792,   0.06985068,    0.037654128,
+       0.03398274,     -0.004781977,  0.007032333,   -0.031787455,
+       0.010868644,    -0.031489216,  0.09525667,    0.013939797,
+       0.0058680447,   0.0167067,     0.02668468,    -0.04797466,
+       -0.048885044,   -0.12722108,   0.035304096,   0.06554885,
+       0.00972396,     -0.039238118,  -0.05159735,   -0.11329045,
+       0.1613692,      -0.03750952,   0.06529313,    -0.071974665,
+       -0.11769596,    0.015524369,   -0.0013754242, -0.12446318,
+       0.02786344,     -0.014179351,  0.005264273,   0.14376344,
+       0.015983658,    0.03406988,    -0.06939408,   0.040699873,
+       0.02111075,     0.09669095,    0.041345075,   -0.08316494,
+       -0.07684199,    -0.045768797,  0.032298047,   -0.041805092,
+       0.0119405,      0.0061010392,  0.12652606,    0.0064572375,
+       -0.024950314,   0.11574242,    0.04508852,    -0.04335324,
+       0.06760663,     -0.027437469,  0.07216407,    0.06977076,
+       -0.05438599,    0.034033038,   -0.028602652,  0.05346137,
+       0.043184172,    -0.037189785,  0.10420091,    0.00882477,
+       -0.054019816,   -0.074273005,  -0.030617684,  -0.0028467078,
+       0.024302477,    -0.0038869337, 0.005332455,   0.0013399826,
+       0.04361412,     -0.007001822,  0.09631092,    -0.06702025,
+       -0.042049985,   -0.035070654,  -0.04103342,   -0.10273396,
+       0.0544271,      0.037184782,   -0.13150354,   -0.0058036847,
+       -0.008264958,   0.042035464,   0.05891794,    0.029673764,
+       0.0063542654,   0.044788733,   0.054816857,   0.062257513,
+       -0.00093483756, 0.048938446,   -0.004952862,  -0.007730018,
+       -0.04043371,    -0.017094059,  0.07229206,    -0.023670016,
+       -0.052195564,   -0.025616996,  -0.01520939,   0.045104615,
+       -0.007376126,   0.003533447,   0.006570588,   0.056037236,
+       0.12436656,     0.051817212,   0.028532185,   -0.08686856,
+       0.11868599,     0.07663395,    -0.07323171,   0.03463402,
+       -0.050708205,   -0.04458982,   -0.11590894,   0.021273347,
+       0.1251325,      -0.15313013,   -0.12224372,   0.17228661,
+       0.023029093,    0.086124025,   0.006445803,   -0.03496501,
+       0.028332196,    0.04449512,    -0.042436164,  -0.026587414,
+       -0.006041347,   -0.09292539,   -0.05678812,   0.03897832,
+       0.09465633,     0.008115513,   -0.02171956,   0.08304309,
+       0.071401566,    0.019622514,   0.032163795,   -0.004167056,
+       0.02295182,     0.030739572,   0.056506045,   0.004612461,
+       0.06524936,     0.059999723,   0.046395954,   -0.0045512207,
+       -0.1335546,     -0.030136576,  0.11584653,    -0.014678886,
+       0.0020118146,   -0.09688814,   -0.0790206,    0.039770417,
+       -0.0329582,     0.07922767,    0.029322514,   0.026405897,
+       0.04207835,     -0.07073373,   0.063781224,   0.0859677,
+       -0.10925287,    -0.07011058,   0.048005477,   0.03438226,
+       -0.09606514,    -0.006669445,  -0.043381985,  0.04240257,
+       -0.06955775,    -0.06769346,   0.043903265,   -0.026784198,
+       -0.017840602,   0.024307009,   -0.040079936,  -0.019946516,
+       0.045318738,    -0.12233574,   0.026170589,   0.0074471775,
+       0.15978073,     0.10185836,    0.10298046,    -0.015476589,
+       -0.039390966,   -0.072174534,  0.0739445,     -0.1211869,
+       -0.0347889,     -0.07943156,   0.014809798,   -0.12412325,
+       -0.0030663363,  0.039695457,   0.0647603,     -0.08291318,
+       -0.018529687,   -0.004423833,  0.0037507233,  0.084633216,
+       -0.01514876,    -0.056505352,  -0.012800942,  -0.06994386,
+       0.012962922,    -0.031234352,  0.07029052,    0.016418684,
+       0.03618972,     0.055686004,   -0.08663945,   -0.017404709,
+       -0.054761406,   0.029065743,   0.052404847,   0.020238016,
+       0.0048197987,   -0.0214882,    0.07078733,    0.013016777,
+       0.06262858,     0.009184685,   0.020785125,   -0.043904778,
+       -0.0270329,     -0.03299152,   -0.060088247,  -0.015162964,
+       -0.001828936,   0.12642565,    -0.056757294,  0.013586685,
+       0.09232601,     -0.035886683,  0.06000002,    0.05229691,
+       -0.052580316,   -0.082029596,  -0.010794592,  0.012947712,
+       -0.036429964,   -0.085508935,  -0.13127148,   -0.017744139,
+       0.031502828,    0.036232427,   -0.031581745,  0.023051167,
+       -0.05325106,    -0.03421577,   0.028793324,   -0.034633752,
+       -0.009881397,   -0.043551125,  -0.018609839,  0.0019097115,
+       -0.008799762,   0.056595087,   0.0022273948,  0.055752404});
+
+  lstm.SetRecurrentToOutputWeights({
+      0.025825322,   -0.05813119,  0.09495884,   -0.045984812,   -0.01255415,
+      -0.0026479573, -0.08196161,  -0.054914974, -0.0046604523,  -0.029587349,
+      -0.044576716,  -0.07480124,  -0.082868785, 0.023254942,    0.027502948,
+      -0.0039728214, -0.08683098,  -0.08116779,  -0.014675607,   -0.037924774,
+      -0.023314456,  -0.007401714, -0.09255757,  0.029460307,    -0.08829125,
+      -0.005139627,  -0.08989442,  -0.0555066,   0.13596267,     -0.025062224,
+      -0.048351806,  -0.03850004,  0.07266485,   -0.022414139,   0.05940088,
+      0.075114764,   0.09597592,   -0.010211725, -0.0049794707,  -0.011523867,
+      -0.025980417,  0.072999895,  0.11091378,   -0.081685916,   0.014416728,
+      0.043229222,   0.034178585,  -0.07530371,  0.035837382,    -0.085607,
+      -0.007721233,  -0.03287832,  -0.043848954, -0.06404588,    -0.06632928,
+      -0.073643476,  0.008214239,  -0.045984086, 0.039764922,    0.03474462,
+      0.060612556,   -0.080590084, 0.049127717,  0.04151091,     -0.030063879,
+      0.008801774,   -0.023021035, -0.019558564, 0.05158114,     -0.010947698,
+      -0.011825728,  0.0075720972, 0.0699727,    -0.0039981045,  0.069350146,
+      0.08799282,    0.016156472,  0.035502106,  0.11695009,     0.006217345,
+      0.13392477,    -0.037875112, 0.025745004,  0.08940699,     -0.00924166,
+      0.0046702605,  -0.036598757, -0.08811812,  0.10522024,     -0.032441203,
+      0.008176899,   -0.04454919,  0.07058152,   0.0067963637,   0.039206743,
+      0.03259838,    0.03725492,   -0.09515802,  0.013326398,    -0.052055415,
+      -0.025676316,  0.03198509,   -0.015951829, -0.058556724,   0.036879618,
+      0.043357447,   0.028362012,  -0.05908629,  0.0059240665,   -0.04995891,
+      -0.019187413,  0.0276265,    -0.01628143,  0.0025863599,   0.08800015,
+      0.035250366,   -0.022165963, -0.07328642,  -0.009415526,   -0.07455109,
+      0.11690406,    0.0363299,    0.07411125,   0.042103454,    -0.009660886,
+      0.019076364,   0.018299393,  -0.046004917, 0.08891175,     0.0431396,
+      -0.026327137,  -0.051502608, 0.08979574,   -0.051670972,   0.04940282,
+      -0.07491107,   -0.021240504, 0.022596184,  -0.034280192,   0.060163025,
+      -0.058211457,  -0.051837247, -0.01349775,  -0.04639988,    -0.035936575,
+      -0.011681591,  0.064818054,  0.0073146066, -0.021745546,   -0.043124277,
+      -0.06471268,   -0.07053354,  -0.029321948, -0.05330136,    0.016933719,
+      -0.053782392,  0.13747959,   -0.1361751,   -0.11569455,    0.0033329215,
+      0.05693899,    -0.053219706, 0.063698,     0.07977434,     -0.07924483,
+      0.06936997,    0.0034815092, -0.007305279, -0.037325785,   -0.07251102,
+      -0.033633437,  -0.08677009,  0.091591336,  -0.14165086,    0.021752775,
+      0.019683983,   0.0011612234, -0.058154266, 0.049996935,    0.0288841,
+      -0.0024567875, -0.14345716,  0.010955264,  -0.10234828,    0.1183656,
+      -0.0010731248, -0.023590032, -0.072285876, -0.0724771,     -0.026382286,
+      -0.0014920527, 0.042667855,  0.0018776858, 0.02986552,     0.009814309,
+      0.0733756,     0.12289186,   0.018043943,  -0.0458958,     0.049412545,
+      0.033632483,   0.05495232,   0.036686596,  -0.013781798,   -0.010036754,
+      0.02576849,    -0.08307328,  0.010112348,  0.042521734,    -0.05869831,
+      -0.071689695,  0.03876447,   -0.13275425,  -0.0352966,     -0.023077697,
+      0.10285965,    0.084736146,  0.15568255,   -0.00040734606, 0.027835453,
+      -0.10292561,   -0.032401145, 0.10053256,   -0.026142767,   -0.08271222,
+      -0.0030240538, -0.016368777, 0.1070414,    0.042672627,    0.013456989,
+      -0.0437609,    -0.022309763, 0.11576483,   0.04108048,     0.061026827,
+      -0.0190714,    -0.0869359,   0.037901703,  0.0610107,      0.07202949,
+      0.01675338,    0.086139716,  -0.08795751,  -0.014898893,   -0.023771819,
+      -0.01965048,   0.007955471,  -0.043740474, 0.03346837,     -0.10549954,
+      0.090567775,   0.042013682,  -0.03176985,  0.12569028,     -0.02421228,
+      -0.029526481,  0.023851605,  0.031539805,  0.05292009,     -0.02344001,
+      -0.07811758,   -0.08834428,  0.10094801,   0.16594367,     -0.06861939,
+      -0.021256343,  -0.041093912, -0.06669611,  0.035498552,    0.021757556,
+      -0.09302526,   -0.015403468, -0.06614931,  -0.051798206,   -0.013874718,
+      0.03630673,    0.010412845,  -0.08077351,  0.046185967,    0.0035662893,
+      0.03541868,    -0.094149634, -0.034814864, 0.003128424,    -0.020674974,
+      -0.03944324,   -0.008110165, -0.11113267,  0.08484226,     0.043586485,
+      0.040582247,   0.0968012,    -0.065249965, -0.028036479,   0.0050708856,
+      0.0017462453,  0.0326779,    0.041296225,  0.09164146,     -0.047743853,
+      -0.015952192,  -0.034451712, 0.084197424,  -0.05347844,    -0.11768019,
+      0.085926116,   -0.08251791,  -0.045081906, 0.0948852,      0.068401024,
+      0.024856757,   0.06978981,   -0.057309967, -0.012775832,   -0.0032452994,
+      0.01977615,    -0.041040014, -0.024264973, 0.063464895,    0.05431621,
+  });
+
+  lstm.SetCellToInputWeights(
+      {0.040369894, 0.030746894,  0.24704495,  0.018586371,  -0.037586458,
+       -0.15312155, -0.11812848,  -0.11465643, 0.20259799,   0.11418174,
+       -0.10116027, -0.011334949, 0.12411352,  -0.076769054, -0.052169047,
+       0.21198851,  -0.38871562,  -0.09061183, -0.09683246,  -0.21929175});
+
+  lstm.SetCellToForgetWeights(
+      {-0.01998659,  -0.15568835,  -0.24248174,   -0.012770197, 0.041331276,
+       -0.072311886, -0.052123554, -0.0066330447, -0.043891653, 0.036225766,
+       -0.047248036, 0.021479502,  0.033189066,   0.11952997,   -0.020432774,
+       0.64658105,   -0.06650122,  -0.03467612,   0.095340036,  0.23647355});
+
+  lstm.SetCellToOutputWeights(
+      {0.08286371,  -0.08261836, -0.51210177, 0.002913762, 0.17764764,
+       -0.5495371,  -0.08460716, -0.24552552, 0.030037103, 0.04123544,
+       -0.11940523, 0.007358328, 0.1890978,   0.4833202,   -0.34441817,
+       0.36312827,  -0.26375428, 0.1457655,   -0.19724406, 0.15548733});
+
+  lstm.SetProjectionWeights(
+      {-0.009802181,  0.09401916,    0.0717386,     -0.13895074,  0.09641832,
+       0.060420845,   0.08539281,    0.054285463,   0.061395317,  0.034448683,
+       -0.042991187,  0.019801661,   -0.16840284,   -0.015726732, -0.23041931,
+       -0.024478018,  -0.10959692,   -0.013875541,  0.18600968,   -0.061274476,
+       0.0138165,     -0.08160894,   -0.07661644,   0.032372914,  0.16169067,
+       0.22465782,    -0.03993472,   -0.004017731,  0.08633481,   -0.28869787,
+       0.08682067,    0.17240396,    0.014975425,   0.056431185,  0.031037588,
+       0.16702051,    0.0077946745,  0.15140012,    0.29405436,   0.120285,
+       -0.188994,     -0.027265169,  0.043389652,   -0.022061434, 0.014777949,
+       -0.20203483,   0.094781205,   0.19100232,    0.13987629,   -0.036132768,
+       -0.06426278,   -0.05108664,   0.13221376,    0.009441198,  -0.16715929,
+       0.15859416,    -0.040437475,  0.050779544,   -0.022187516, 0.012166504,
+       0.027685808,   -0.07675938,   -0.0055694645, -0.09444123,  0.0046453946,
+       0.050794356,   0.10770313,    -0.20790008,   -0.07149004,  -0.11425117,
+       0.008225835,   -0.035802525,  0.14374903,    0.15262283,   0.048710253,
+       0.1847461,     -0.007487823,  0.11000021,    -0.09542012,  0.22619456,
+       -0.029149994,  0.08527916,    0.009043713,   0.0042746216, 0.016261552,
+       0.022461696,   0.12689082,    -0.043589946,  -0.12035478,  -0.08361797,
+       -0.050666027,  -0.1248618,    -0.1275799,    -0.071875185, 0.07377272,
+       0.09944291,    -0.18897448,   -0.1593054,    -0.06526116,  -0.040107165,
+       -0.004618631,  -0.067624845,  -0.007576253,  0.10727444,   0.041546922,
+       -0.20424393,   0.06907816,    0.050412357,   0.00724631,   0.039827548,
+       0.12449835,    0.10747581,    0.13708383,    0.09134148,   -0.12617786,
+       -0.06428341,   0.09956831,    0.1208086,     -0.14676677,  -0.0727722,
+       0.1126304,     0.010139365,   0.015571211,   -0.038128063, 0.022913318,
+       -0.042050496,  0.16842307,    -0.060597885,  0.10531834,   -0.06411776,
+       -0.07451711,   -0.03410368,   -0.13393489,   0.06534304,   0.003620307,
+       0.04490757,    0.05970546,    0.05197996,    0.02839995,   0.10434969,
+       -0.013699693,  -0.028353551,  -0.07260381,   0.047201227,  -0.024575593,
+       -0.036445823,  0.07155557,    0.009672501,   -0.02328883,  0.009533515,
+       -0.03606021,   -0.07421458,   -0.028082801,  -0.2678904,   -0.13221288,
+       0.18419984,    -0.13012612,   -0.014588381,  -0.035059117, -0.04824723,
+       0.07830115,    -0.056184657,  0.03277091,    0.025466874,  0.14494097,
+       -0.12522776,   -0.098633975,  -0.10766018,   -0.08317623,  0.08594209,
+       0.07749552,    0.039474737,   0.1776665,     -0.07409566,  -0.0477268,
+       0.29323658,    0.10801441,    0.1154011,     0.013952499,  0.10739139,
+       0.10708251,    -0.051456142,  0.0074137426,  -0.10430189,  0.10034707,
+       0.045594677,   0.0635285,     -0.0715442,    -0.089667566, -0.10811871,
+       0.00026344223, 0.08298446,    -0.009525053,  0.006585689,  -0.24567553,
+       -0.09450807,   0.09648481,    0.026996298,   -0.06419476,  -0.04752702,
+       -0.11063944,   -0.23441927,   -0.17608605,   -0.052156363, 0.067035615,
+       0.19271925,    -0.0032889997, -0.043264326,  0.09663576,   -0.057112187,
+       -0.10100678,   0.0628376,     0.04447668,    0.017961001,  -0.10094388,
+       -0.10190601,   0.18335468,    0.10494553,    -0.052095775, -0.0026118709,
+       0.10539724,    -0.04383912,   -0.042349473,  0.08438151,   -0.1947263,
+       0.02251204,    0.11216432,    -0.10307853,   0.17351969,   -0.039091777,
+       0.08066188,    -0.00561982,   0.12633002,    0.11335965,   -0.0088127935,
+       -0.019777594,  0.06864014,    -0.059751723,  0.016233567,  -0.06894641,
+       -0.28651384,   -0.004228674,  0.019708522,   -0.16305895,  -0.07468996,
+       -0.0855457,    0.099339016,   -0.07580735,   -0.13775392,  0.08434318,
+       0.08330512,    -0.12131499,   0.031935584,   0.09180414,   -0.08876437,
+       -0.08049874,   0.008753825,   0.03498998,    0.030215185,  0.03907079,
+       0.089751154,   0.029194152,   -0.03337423,   -0.019092513, 0.04331237,
+       0.04299654,    -0.036394123,  -0.12915532,   0.09793732,   0.07512415,
+       -0.11319543,   -0.032502122,  0.15661901,    0.07671967,   -0.005491124,
+       -0.19379048,   -0.218606,     0.21448623,    0.017840758,  0.1416943,
+       -0.07051762,   0.19488361,    0.02664691,    -0.18104725,  -0.09334311,
+       0.15026465,    -0.15493552,   -0.057762887,  -0.11604192,  -0.262013,
+       -0.01391798,   0.012185008,   0.11156489,    -0.07483202,  0.06693364,
+       -0.26151478,   0.046425626,   0.036540434,   -0.16435726,  0.17338543,
+       -0.21401681,   -0.11385144,   -0.08283257,   -0.069031075, 0.030635102,
+       0.010969227,   0.11109743,    0.010919218,   0.027526086,  0.13519906,
+       0.01891392,    -0.046839405,  -0.040167913,  0.017953383,  -0.09700955,
+       0.0061885654,  -0.07000971,   0.026893595,   -0.038844477, 0.14543656});
+
+  static float lstm_input[][20] = {
+      {// Batch0: 4 (input_sequence_size) * 5 (n_input)
+       0.787926, 0.151646, 0.071352, 0.118426, 0.458058, 0.596268, 0.998386,
+       0.568695, 0.864524, 0.571277, 0.073204, 0.296072, 0.743333, 0.069199,
+       0.045348, 0.867394, 0.291279, 0.013714, 0.482521, 0.626339},
+
+      {// Batch1: 4 (input_sequence_size) * 5 (n_input)
+       0.295743, 0.544053, 0.690064, 0.858138, 0.497181, 0.642421, 0.524260,
+       0.134799, 0.003639, 0.162482, 0.640394, 0.930399, 0.050782, 0.432485,
+       0.988078, 0.082922, 0.563329, 0.865614, 0.333232, 0.259916}};
+
+  static float lstm_golden_output[][64] = {
+      {// Batch0: 4 (input_sequence_size) * 16 (n_output)
+       -0.00396806, 0.029352,     -0.00279226, 0.0159977,   -0.00835576,
+       -0.0211779,  0.0283512,    -0.0114597,  0.00907307,  -0.0244004,
+       -0.0152191,  -0.0259063,   0.00914318,  0.00415118,  0.017147,
+       0.0134203,   -0.0166936,   0.0381209,   0.000889694, 0.0143363,
+       -0.0328911,  -0.0234288,   0.0333051,   -0.012229,   0.0110322,
+       -0.0457725,  -0.000832209, -0.0202817,  0.0327257,   0.0121308,
+       0.0155969,   0.0312091,    -0.0213783,  0.0350169,   0.000324794,
+       0.0276012,   -0.0263374,   -0.0371449,  0.0446149,   -0.0205474,
+       0.0103729,   -0.0576349,   -0.0150052,  -0.0292043,  0.0376827,
+       0.0136115,   0.0243435,    0.0354492,   -0.0189322,  0.0464512,
+       -0.00251373, 0.0225745,    -0.0308346,  -0.0317124,  0.0460407,
+       -0.0189395,  0.0149363,    -0.0530162,  -0.0150767,  -0.0340193,
+       0.0286833,   0.00824207,   0.0264887,   0.0305169},
+      {// Batch1: 4 (input_sequence_size) * 16 (n_output)
+       -0.013869,    0.0287268,   -0.00334693, 0.00733398,  -0.0287926,
+       -0.0186926,   0.0193662,   -0.0115437,  0.00422612,  -0.0345232,
+       0.00223253,   -0.00957321, 0.0210624,   0.013331,    0.0150954,
+       0.02168,      -0.0141913,  0.0322082,   0.00227024,  0.0260507,
+       -0.0188721,   -0.0296489,  0.0399134,   -0.0160509,  0.0116039,
+       -0.0447318,   -0.0150515,  -0.0277406,  0.0316596,   0.0118233,
+       0.0214762,    0.0293641,   -0.0204549,  0.0450315,   -0.00117378,
+       0.0167673,    -0.0375007,  -0.0238314,  0.038784,    -0.0174034,
+       0.0131743,    -0.0506589,  -0.0048447,  -0.0240239,  0.0325789,
+       0.00790065,   0.0220157,   0.0333314,   -0.0264787,  0.0387855,
+       -0.000764675, 0.0217599,   -0.037537,   -0.0335206,  0.0431679,
+       -0.0211424,   0.010203,    -0.062785,   -0.00832363, -0.025181,
+       0.0412031,    0.0118723,   0.0239643,   0.0394009}};
+
+  // Resetting cell_state and output_state
+  lstm.ResetCellState();
+  lstm.ResetOutputState();
+
+  for (int i = 0; i < lstm.sequence_length(); i++) {
+    float* batch0_start = lstm_input[0] + i * lstm.num_inputs();
+    float* batch0_end = batch0_start + lstm.num_inputs();
+
+    lstm.SetInput(2 * i * lstm.num_inputs(), batch0_start, batch0_end);
+
+    float* batch1_start = lstm_input[1] + i * lstm.num_inputs();
+    float* batch1_end = batch1_start + lstm.num_inputs();
+    lstm.SetInput((2 * i + 1) * lstm.num_inputs(), batch1_start, batch1_end);
+  }
+
+  lstm.Invoke();
+
+  std::vector<float> expected;
+  for (int i = 0; i < lstm.sequence_length(); i++) {
+    float* golden_start_batch0 = lstm_golden_output[0] + i * lstm.num_outputs();
+    float* golden_end_batch0 = golden_start_batch0 + lstm.num_outputs();
+    float* golden_start_batch1 = lstm_golden_output[1] + i * lstm.num_outputs();
+    float* golden_end_batch1 = golden_start_batch1 + lstm.num_outputs();
+    expected.insert(expected.end(), golden_start_batch0, golden_end_batch0);
+    expected.insert(expected.end(), golden_start_batch1, golden_end_batch1);
+  }
+  EXPECT_THAT(lstm.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ac00c37b67dcbe77023a2495a698967ca555b1d5
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
@@ -0,0 +1,168 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace unidirectional_sequence_rnn {
+
+constexpr int kInputTensor = 0;
+constexpr int kWeightsTensor = 1;
+constexpr int kRecurrentWeightsTensor = 2;
+constexpr int kBiasTensor = 3;
+constexpr int kHiddenStateTensor = 0;
+constexpr int kOutputTensor = 1;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  // Check we have all the inputs and outputs we need.
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 4);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 2);
+
+  TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
+  TfLiteTensor* input_weights =
+      &context->tensors[node->inputs->data[kWeightsTensor]];
+  TfLiteTensor* recurrent_weights =
+      &context->tensors[node->inputs->data[kRecurrentWeightsTensor]];
+  TfLiteTensor* bias = &context->tensors[node->inputs->data[kBiasTensor]];
+
+  // Check all the parameters of tensor match within themselves and match the
+  // input configuration.
+  auto* params = reinterpret_cast<TfLiteSequenceRNNParams*>(node->builtin_data);
+  const bool time_major = params->time_major;
+  const int batch_size =
+      (time_major) ? input->dims->data[1] : input->dims->data[0];
+  const int max_time =
+      (time_major) ? input->dims->data[0] : input->dims->data[1];
+  const int num_units = input_weights->dims->data[0];
+  TF_LITE_ASSERT_EQ(input->dims->data[2], input_weights->dims->data[1]);
+  TF_LITE_ASSERT_EQ(input_weights->dims->data[0], bias->dims->data[0]);
+  TF_LITE_ASSERT_EQ(recurrent_weights->dims->data[0], bias->dims->data[0]);
+  TF_LITE_ASSERT_EQ(recurrent_weights->dims->data[1], bias->dims->data[0]);
+
+  TfLiteTensor* hidden_state =
+      &context->tensors[node->outputs->data[kHiddenStateTensor]];
+  TfLiteTensor* output = &context->tensors[node->outputs->data[kOutputTensor]];
+
+  // Resize state.
+  TfLiteIntArray* hidden_state_size_array = TfLiteIntArrayCreate(2);
+  hidden_state_size_array->data[0] = batch_size;
+  hidden_state_size_array->data[1] = num_units;
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, hidden_state,
+                                                   hidden_state_size_array));
+
+  // Mark hidden state as a persistent tensor.
+  hidden_state->allocation_type = kTfLiteArenaRwPersistent;
+
+  // Resize output.
+  TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(3);
+  output_size_array->data[0] = (time_major) ? max_time : batch_size;
+  output_size_array->data[1] = (time_major) ? batch_size : max_time;
+  output_size_array->data[2] = num_units;
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, output, output_size_array));
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteSequenceRNNParams*>(node->builtin_data);
+
+  TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
+  TfLiteTensor* input_weights =
+      &context->tensors[node->inputs->data[kWeightsTensor]];
+  TfLiteTensor* recurrent_weights =
+      &context->tensors[node->inputs->data[kRecurrentWeightsTensor]];
+  TfLiteTensor* bias = &context->tensors[node->inputs->data[kBiasTensor]];
+  TfLiteTensor* hidden_state =
+      &context->tensors[node->outputs->data[kHiddenStateTensor]];
+  TfLiteTensor* output = &context->tensors[node->outputs->data[kOutputTensor]];
+
+  // Initialize the pointer bias.
+  const float* bias_ptr = bias->data.f;
+
+  const bool time_major = params->time_major;
+  const int batch_size =
+      (time_major) ? input->dims->data[1] : input->dims->data[0];
+  const int max_time =
+      (time_major) ? input->dims->data[0] : input->dims->data[1];
+  const int num_units = input_weights->dims->data[0];
+  const int input_size = input->dims->data[2];
+
+  // Initialize input_weights and recurrent_weights.
+  const float* input_weights_ptr = input_weights->data.f;
+  const float* recurrent_weights_ptr = recurrent_weights->data.f;
+
+  if (time_major) {
+    // Initialize the pointer to hidden state.
+    float* hidden_state_ptr_batch = hidden_state->data.f;
+    // Unroll the sequence and use batch batch operations for efficiency.
+    for (int s = 0; s < max_time; s++) {
+      // Initialize the pointer to input and output.
+      const float* input_ptr_batch =
+          input->data.f + s * input_size * batch_size;
+      float* output_ptr_batch = output->data.f + s * num_units * batch_size;
+
+      kernel_utils::RnnBatchStep(input_ptr_batch, input_weights_ptr,
+                                 recurrent_weights_ptr, bias_ptr, input_size,
+                                 num_units, batch_size, params->activation,
+                                 hidden_state_ptr_batch, output_ptr_batch);
+    }
+  } else {
+    // For each batch
+    for (int b = 0; b < batch_size; b++) {
+      // Initialize the pointer to hidden state.
+      float* hidden_state_ptr_batch = hidden_state->data.f + b * num_units;
+      for (int s = 0; s < max_time; s++) {
+        // Initialize the pointer to input and output.
+        const float* input_ptr_batch =
+            input->data.f + b * input_size * max_time + s * input_size;
+        float* output_ptr_batch =
+            output->data.f + b * num_units * max_time + s * num_units;
+
+        kernel_utils::RnnBatchStep(
+            input_ptr_batch, input_weights_ptr, recurrent_weights_ptr, bias_ptr,
+            input_size, num_units, /*batch_size=*/1, params->activation,
+            hidden_state_ptr_batch, output_ptr_batch);
+      }
+    }
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace unidirectional_sequence_rnn
+
+TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_RNN() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 unidirectional_sequence_rnn::Prepare,
+                                 unidirectional_sequence_rnn::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7e32969763b59620dc3534708f965750680002d2
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc
@@ -0,0 +1,351 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite Sequential RNN op.
+
+#include <iomanip>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+static float rnn_input[] = {
+    0.23689353,   0.285385,     0.037029743, -0.19858193,  -0.27569133,
+    0.43773448,   0.60379338,   0.35562468,  -0.69424844,  -0.93421471,
+    -0.87287879,  0.37144363,   -0.62476718, 0.23791671,   0.40060222,
+    0.1356622,    -0.99774903,  -0.98858172, -0.38952237,  -0.47685933,
+    0.31073618,   0.71511042,   -0.63767755, -0.31729108,  0.33468103,
+    0.75801885,   0.30660987,   -0.37354088, 0.77002847,   -0.62747043,
+    -0.68572164,  0.0069220066, 0.65791464,  0.35130811,   0.80834007,
+    -0.61777675,  -0.21095741,  0.41213346,  0.73784804,   0.094794154,
+    0.47791874,   0.86496925,   -0.53376222, 0.85315156,   0.10288584,
+    0.86684,      -0.011186242, 0.10513687,  0.87825835,   0.59929144,
+    0.62827742,   0.18899453,   0.31440187,  0.99059987,   0.87170351,
+    -0.35091716,  0.74861872,   0.17831337,  0.2755419,    0.51864719,
+    0.55084288,   0.58982027,   -0.47443086, 0.20875752,   -0.058871567,
+    -0.66609079,  0.59098077,   0.73017097,  0.74604273,   0.32882881,
+    -0.17503482,  0.22396147,   0.19379807,  0.29120302,   0.077113032,
+    -0.70331609,  0.15804303,   -0.93407321, 0.40182066,   0.036301374,
+    0.66521823,   0.0300982,    -0.7747041,  -0.02038002,  0.020698071,
+    -0.90300065,  0.62870288,   -0.23068321, 0.27531278,   -0.095755219,
+    -0.712036,    -0.17384434,  -0.50593495, -0.18646687,  -0.96508682,
+    0.43519354,   0.14744234,   0.62589407,  0.1653645,    -0.10651493,
+    -0.045277178, 0.99032974,   -0.88255352, -0.85147917,  0.28153265,
+    0.19455957,   -0.55479527,  -0.56042433, 0.26048636,   0.84702539,
+    0.47587705,   -0.074295521, -0.12287641, 0.70117295,   0.90532446,
+    0.89782166,   0.79817224,   0.53402734,  -0.33286154,  0.073485017,
+    -0.56172788,  -0.044897556, 0.89964068,  -0.067662835, 0.76863563,
+    0.93455386,   -0.6324693,   -0.083922029};
+
+static float rnn_golden_output[] = {
+    0.496726,   0,          0.965996,  0,         0.0584254, 0,
+    0,          0.12315,    0,         0,         0.612266,  0.456601,
+    0,          0.52286,    1.16099,   0.0291232,
+
+    0,          0,          0.524901,  0,         0,         0,
+    0,          1.02116,    0,         1.35762,   0,         0.356909,
+    0.436415,   0.0355727,  0,         0,
+
+    0,          0,          0,         0.262335,  0,         0,
+    0,          1.33992,    0,         2.9739,    0,         0,
+    1.31914,    2.66147,    0,         0,
+
+    0.942568,   0,          0,         0,         0.025507,  0,
+    0,          0,          0.321429,  0.569141,  1.25274,   1.57719,
+    0.8158,     1.21805,    0.586239,  0.25427,
+
+    1.04436,    0,          0.630725,  0,         0.133801,  0.210693,
+    0.363026,   0,          0.533426,  0,         1.25926,   0.722707,
+    0,          1.22031,    1.30117,   0.495867,
+
+    0.222187,   0,          0.72725,   0,         0.767003,  0,
+    0,          0.147835,   0,         0,         0,         0.608758,
+    0.469394,   0.00720298, 0.927537,  0,
+
+    0.856974,   0.424257,   0,         0,         0.937329,  0,
+    0,          0,          0.476425,  0,         0.566017,  0.418462,
+    0.141911,   0.996214,   1.13063,   0,
+
+    0.967899,   0,          0,         0,         0.0831304, 0,
+    0,          1.00378,    0,         0,         0,         1.44818,
+    1.01768,    0.943891,   0.502745,  0,
+
+    0.940135,   0,          0,         0,         0,         0,
+    0,          2.13243,    0,         0.71208,   0.123918,  1.53907,
+    1.30225,    1.59644,    0.70222,   0,
+
+    0.804329,   0,          0.430576,  0,         0.505872,  0.509603,
+    0.343448,   0,          0.107756,  0.614544,  1.44549,   1.52311,
+    0.0454298,  0.300267,   0.562784,  0.395095,
+
+    0.228154,   0,          0.675323,  0,         1.70536,   0.766217,
+    0,          0,          0,         0.735363,  0.0759267, 1.91017,
+    0.941888,   0,          0,         0,
+
+    0,          0,          1.5909,    0,         0,         0,
+    0,          0.5755,     0,         0.184687,  0,         1.56296,
+    0.625285,   0,          0,         0,
+
+    0,          0,          0.0857888, 0,         0,         0,
+    0,          0.488383,   0.252786,  0,         0,         0,
+    1.02817,    1.85665,    0,         0,
+
+    0.00981836, 0,          1.06371,   0,         0,         0,
+    0,          0,          0,         0.290445,  0.316406,  0,
+    0.304161,   1.25079,    0.0707152, 0,
+
+    0.986264,   0.309201,   0,         0,         0,         0,
+    0,          1.64896,    0.346248,  0,         0.918175,  0.78884,
+    0.524981,   1.92076,    2.07013,   0.333244,
+
+    0.415153,   0.210318,   0,         0,         0,         0,
+    0,          2.02616,    0,         0.728256,  0.84183,   0.0907453,
+    0.628881,   3.58099,    1.49974,   0};
+
+class UnidirectionalRNNOpModel : public SingleOpModel {
+ public:
+  UnidirectionalRNNOpModel(int batches, int sequence_len, int units, int size,
+                           bool time_major)
+      : batches_(batches),
+        sequence_len_(sequence_len),
+        units_(units),
+        input_size_(size) {
+    input_ = AddInput(TensorType_FLOAT32);
+    weights_ = AddInput(TensorType_FLOAT32);
+    recurrent_weights_ = AddInput(TensorType_FLOAT32);
+    bias_ = AddInput(TensorType_FLOAT32);
+    hidden_state_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN,
+                 BuiltinOptions_SequenceRNNOptions,
+                 CreateSequenceRNNOptions(builder_, time_major,
+                                          ActivationFunctionType_RELU)
+                     .Union());
+    if (time_major) {
+      BuildInterpreter({{sequence_len_, batches_, input_size_},
+                        {units_, input_size_},
+                        {units_, units_},
+                        {units_}});
+    } else {
+      BuildInterpreter({{batches_, sequence_len_, input_size_},
+                        {units_, input_size_},
+                        {units_, units_},
+                        {units_}});
+    }
+  }
+
+  void SetBias(std::initializer_list<float> f) { PopulateTensor(bias_, f); }
+
+  void SetWeights(std::initializer_list<float> f) {
+    PopulateTensor(weights_, f);
+  }
+
+  void SetRecurrentWeights(std::initializer_list<float> f) {
+    PopulateTensor(recurrent_weights_, f);
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+
+  void SetInput(int offset, float* begin, float* end) {
+    PopulateTensor(input_, offset, begin, end);
+  }
+
+  void ResetHiddenState() {
+    const int zero_buffer_size = units_ * batches_;
+    std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
+    memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
+    PopulateTensor(hidden_state_, 0, zero_buffer.get(),
+                   zero_buffer.get() + zero_buffer_size);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+  int input_size() { return input_size_; }
+  int num_units() { return units_; }
+  int num_batches() { return batches_; }
+  int sequence_len() { return sequence_len_; }
+
+ private:
+  int input_;
+  int weights_;
+  int recurrent_weights_;
+  int bias_;
+  int hidden_state_;
+  int output_;
+
+  int batches_;
+  int sequence_len_;
+  int units_;
+  int input_size_;
+};
+
+// TODO(mirkov): add another test which directly compares to TF once TOCO
+// supports the conversion from dynamic_rnn with BasicRNNCell.
+TEST(FullyConnectedOpTest, BlackBoxTest) {
+  UnidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                               /*units=*/16, /*size=*/8, /*time_major=*/false);
+  rnn.SetWeights(
+      {0.461459,    0.153381,   0.529743,    -0.00371218, 0.676267,   -0.211346,
+       0.317493,    0.969689,   -0.343251,   0.186423,    0.398151,   0.152399,
+       0.448504,    0.317662,   0.523556,    -0.323514,   0.480877,   0.333113,
+       -0.757714,   -0.674487,  -0.643585,   0.217766,    -0.0251462, 0.79512,
+       -0.595574,   -0.422444,  0.371572,    -0.452178,   -0.556069,  -0.482188,
+       -0.685456,   -0.727851,  0.841829,    0.551535,    -0.232336,  0.729158,
+       -0.00294906, -0.69754,   0.766073,    -0.178424,   0.369513,   -0.423241,
+       0.548547,    -0.0152023, -0.757482,   -0.85491,    0.251331,   -0.989183,
+       0.306261,    -0.340716,  0.886103,    -0.0726757,  -0.723523,  -0.784303,
+       0.0354295,   0.566564,   -0.485469,   -0.620498,   0.832546,   0.697884,
+       -0.279115,   0.294415,   -0.584313,   0.548772,    0.0648819,  0.968726,
+       0.723834,    -0.0080452, -0.350386,   -0.272803,   0.115121,   -0.412644,
+       -0.824713,   -0.992843,  -0.592904,   -0.417893,   0.863791,   -0.423461,
+       -0.147601,   -0.770664,  -0.479006,   0.654782,    0.587314,   -0.639158,
+       0.816969,    -0.337228,  0.659878,    0.73107,     0.754768,   -0.337042,
+       0.0960841,   0.368357,   0.244191,    -0.817703,   -0.211223,  0.442012,
+       0.37225,     -0.623598,  -0.405423,   0.455101,    0.673656,   -0.145345,
+       -0.511346,   -0.901675,  -0.81252,    -0.127006,   0.809865,   -0.721884,
+       0.636255,    0.868989,   -0.347973,   -0.10179,    -0.777449,  0.917274,
+       0.819286,    0.206218,   -0.00785118, 0.167141,    0.45872,    0.972934,
+       -0.276798,   0.837861,   0.747958,    -0.0151566,  -0.330057,  -0.469077,
+       0.277308,    0.415818});
+
+  rnn.SetBias({0.065691948, -0.69055247, 0.1107955, -0.97084129, -0.23957068,
+               -0.23566568, -0.389184, 0.47481549, -0.4791103, 0.29931796,
+               0.10463274, 0.83918178, 0.37197268, 0.61957061, 0.3956964,
+               -0.37609905});
+
+  rnn.SetRecurrentWeights({0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1});
+
+  rnn.ResetHiddenState();
+  const int input_sequence_size = rnn.input_size() * rnn.sequence_len();
+  float* batch_start = rnn_input;
+  float* batch_end = batch_start + input_sequence_size;
+  rnn.SetInput(0, batch_start, batch_end);
+  rnn.SetInput(input_sequence_size, batch_start, batch_end);
+
+  rnn.Invoke();
+
+  float* golden_start = rnn_golden_output;
+  float* golden_end = golden_start + rnn.num_units() * rnn.sequence_len();
+  std::vector<float> expected;
+  expected.insert(expected.end(), golden_start, golden_end);
+  expected.insert(expected.end(), golden_start, golden_end);
+
+  EXPECT_THAT(rnn.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+}
+
+TEST(FullyConnectedOpTest, TimeMajorBlackBoxTest) {
+  UnidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                               /*units=*/16, /*size=*/8, /*time_major=*/true);
+  rnn.SetWeights(
+      {0.461459,    0.153381,   0.529743,    -0.00371218, 0.676267,   -0.211346,
+       0.317493,    0.969689,   -0.343251,   0.186423,    0.398151,   0.152399,
+       0.448504,    0.317662,   0.523556,    -0.323514,   0.480877,   0.333113,
+       -0.757714,   -0.674487,  -0.643585,   0.217766,    -0.0251462, 0.79512,
+       -0.595574,   -0.422444,  0.371572,    -0.452178,   -0.556069,  -0.482188,
+       -0.685456,   -0.727851,  0.841829,    0.551535,    -0.232336,  0.729158,
+       -0.00294906, -0.69754,   0.766073,    -0.178424,   0.369513,   -0.423241,
+       0.548547,    -0.0152023, -0.757482,   -0.85491,    0.251331,   -0.989183,
+       0.306261,    -0.340716,  0.886103,    -0.0726757,  -0.723523,  -0.784303,
+       0.0354295,   0.566564,   -0.485469,   -0.620498,   0.832546,   0.697884,
+       -0.279115,   0.294415,   -0.584313,   0.548772,    0.0648819,  0.968726,
+       0.723834,    -0.0080452, -0.350386,   -0.272803,   0.115121,   -0.412644,
+       -0.824713,   -0.992843,  -0.592904,   -0.417893,   0.863791,   -0.423461,
+       -0.147601,   -0.770664,  -0.479006,   0.654782,    0.587314,   -0.639158,
+       0.816969,    -0.337228,  0.659878,    0.73107,     0.754768,   -0.337042,
+       0.0960841,   0.368357,   0.244191,    -0.817703,   -0.211223,  0.442012,
+       0.37225,     -0.623598,  -0.405423,   0.455101,    0.673656,   -0.145345,
+       -0.511346,   -0.901675,  -0.81252,    -0.127006,   0.809865,   -0.721884,
+       0.636255,    0.868989,   -0.347973,   -0.10179,    -0.777449,  0.917274,
+       0.819286,    0.206218,   -0.00785118, 0.167141,    0.45872,    0.972934,
+       -0.276798,   0.837861,   0.747958,    -0.0151566,  -0.330057,  -0.469077,
+       0.277308,    0.415818});
+
+  rnn.SetBias({0.065691948, -0.69055247, 0.1107955, -0.97084129, -0.23957068,
+               -0.23566568, -0.389184, 0.47481549, -0.4791103, 0.29931796,
+               0.10463274, 0.83918178, 0.37197268, 0.61957061, 0.3956964,
+               -0.37609905});
+
+  rnn.SetRecurrentWeights({0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1});
+
+  rnn.ResetHiddenState();
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    // The two batches are identical.
+    rnn.SetInput(2 * i * rnn.input_size(), batch_start, batch_end);
+    rnn.SetInput((2 * i + 1) * rnn.input_size(), batch_start, batch_end);
+  }
+
+  rnn.Invoke();
+
+  std::vector<float> expected;
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* golden_batch_start = rnn_golden_output + i * rnn.num_units();
+    float* golden_batch_end = golden_batch_start + rnn.num_units();
+    expected.insert(expected.end(), golden_batch_start, golden_batch_end);
+    expected.insert(expected.end(), golden_batch_start, golden_batch_end);
+  }
+
+  EXPECT_THAT(rnn.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  // On Linux, add: tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/lib_package/BUILD b/tensorflow/contrib/lite/lib_package/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..3c1b8d3d45f2bb382bbe6b789ec6ac7ec89ebc66
--- /dev/null
+++ b/tensorflow/contrib/lite/lib_package/BUILD
@@ -0,0 +1,16 @@
+package(default_visibility = ["//visibility:private"])
+
+# Create the LICENSE file for libraries that are used by TensorFlow Lite
+# C library.
+genrule(
+    name = "clicenses_generate",
+    srcs = [
+        "//third_party/eigen3:LICENSE",
+        "@arm_neon_2_x86_sse//:LICENSE",
+        "@farmhash_archive//:COPYING",
+        "@gemmlowp//:LICENSE",
+    ],
+    outs = ["LICENSE"],
+    cmd = "$(location :concat_licenses.sh) $(SRCS) >$@",
+    tools = [":concat_licenses.sh"],
+)
diff --git a/tensorflow/contrib/lite/lib_package/concat_licenses.sh b/tensorflow/contrib/lite/lib_package/concat_licenses.sh
new file mode 100755
index 0000000000000000000000000000000000000000..2070f64e9fa4384234361556da0ed6f5089319b3
--- /dev/null
+++ b/tensorflow/contrib/lite/lib_package/concat_licenses.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Script aimed to combining multiple license files into a single one.
+
+for f in $@
+do
+  echo "--------------------------------------------------------------------------------"
+  echo "BEGIN LICENSE FOR $f"
+  echo "--------------------------------------------------------------------------------"
+  cat $f
+  echo "--------------------------------------------------------------------------------"
+  echo "END LICENSE FOR $f"
+  echo "--------------------------------------------------------------------------------"
+done
diff --git a/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh b/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b58ae266017caf8781c28331f49a8f5bc1550767
--- /dev/null
+++ b/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh
@@ -0,0 +1,81 @@
+#!/bin/bash -x
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -e
+
+echo "Starting"
+TFLITE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/.."
+
+TMP_DIR=$(mktemp -d)
+echo "Package dir: " $TMP_DIR
+FW_DIR=$TMP_DIR/tensorflow_lite_ios_frameworks
+FW_DIR_TFLITE=$FW_DIR/tensorflow_lite.framework
+FW_DIR_TFLITE_HDRS=$FW_DIR_TFLITE/Headers
+
+echo "Creating target Headers directories"
+mkdir -p $FW_DIR_TFLITE_HDRS
+
+echo "Headers, populating: TensorFlow Lite"
+cd $TFLITE_DIR/../../..
+
+find tensorflow/contrib/lite -name '*.h' \
+    -not -path 'tensorflow/contrib/lite/downloads/*' \
+    -not -path 'tensorflow/contrib/lite/examples/*' \
+    -not -path 'tensorflow/contrib/lite/gen/*' \
+    -not -path 'tensorflow/contrib/lite/toco/*' \
+    -not -path 'tensorflow/contrib/lite/nnapi/*' \
+    -not -path 'tensorflow/contrib/lite/java/*' \
+    | tar -cf $FW_DIR_TFLITE_HDRS/tmp.tar -T -
+cd $FW_DIR_TFLITE_HDRS
+tar xf tmp.tar
+rm -f tmp.tar
+
+echo "Headers, populating: Flatbuffer"
+cd $TFLITE_DIR/downloads/flatbuffers/include/
+find . -name '*.h' | tar -cf $FW_DIR_TFLITE_HDRS/tmp.tar -T -
+cd $FW_DIR_TFLITE_HDRS
+tar xf tmp.tar
+rm -f tmp.tar
+
+cd $TFLITE_DIR/../../..
+echo "Generate master LICENSE file and copy to target"
+bazel build //tensorflow/tools/lib_package:clicenses_generate
+cp $TFLITE_DIR/../../../bazel-genfiles/tensorflow/tools/lib_package/include/tensorflow/c/LICENSE \
+   $FW_DIR_TFLITE
+
+echo "Copying static libraries"
+cp $TFLITE_DIR/gen/lib/libtensorflow-lite.a \
+   $FW_DIR_TFLITE/tensorflow_lite
+
+# This is required, otherwise they interfere with the documentation of the
+# pod at cocoapods.org.
+echo "Remove all README files"
+cd $FW_DIR_TFLITE_HDRS
+find . -type f -name README\* -exec rm -f {} \;
+find . -type f -name readme\* -exec rm -f {} \;
+
+TARGET_GEN_LOCATION="$TFLITE_DIR/gen/ios_frameworks"
+echo "Moving results to target: " $TARGET_GEN_LOCATION
+cd $FW_DIR
+zip -q -r tensorflow_lite.framework.zip tensorflow_lite.framework -x .DS_Store
+rm -rf $TARGET_GEN_LOCATION
+mkdir -p $TARGET_GEN_LOCATION
+cp -r tensorflow_lite.framework.zip $TARGET_GEN_LOCATION
+
+echo "Cleaning up"
+rm -rf $TMP_DIR
+
+echo "Finished"
diff --git a/tensorflow/contrib/lite/memory_planner.h b/tensorflow/contrib/lite/memory_planner.h
new file mode 100644
index 0000000000000000000000000000000000000000..5cd6c208500f3ea84ab8146f7f136e8b7851ff03
--- /dev/null
+++ b/tensorflow/contrib/lite/memory_planner.h
@@ -0,0 +1,45 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_MEMORY_PLANNER_H_
+#define TENSORFLOW_CONTRIB_LITE_MEMORY_PLANNER_H_
+
+#include "tensorflow/contrib/lite/context.h"
+
+namespace tflite {
+
+// A MemoryPlanner is responsible for planning and executing a number of
+// memory-related operations that are necessary in TF Lite.
+class MemoryPlanner {
+ public:
+  virtual ~MemoryPlanner() {}
+
+  // Plans the necessary memory allocations. This is the MemoryPlanner's
+  // pre-processing step and is called when the graph structure is known but
+  // actual size of the tensors is not.
+  virtual TfLiteStatus PlanAllocations() = 0;
+
+  // Allocates the necessary memory to execute all nodes in the interval
+  // [first_node, last_node].
+  virtual TfLiteStatus ExecuteAllocations(int first_node, int last_node) = 0;
+
+  // Invalidates allocations made earliers. This is called when tensors sizes
+  // have change. All planned allocations remain, but can't be used until
+  // ExecuteAllocations() is called.
+  virtual TfLiteStatus ResetAllocations() = 0;
+};
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_MEMORY_PLANNER_H_
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index e2f3560e61baae88a4afaafaa202cde784063efc..2ee0cac11ca8b5c964e04f9baa2471ab27b6972d 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -30,17 +30,6 @@ limitations under the License.
 
 namespace tflite {
 
-namespace {
-inline const tflite::Model* VerifyAndGetModel(const void* buf, size_t len) {
-  ::flatbuffers::Verifier verifier(static_cast<const uint8_t*>(buf), len);
-  if (VerifyModelBuffer(verifier)) {
-    return ::tflite::GetModel(buf);
-  } else {
-    return nullptr;
-  }
-}
-}  // namespace
-
 const char* kEmptyTensorName = "";
 
 std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromFile(
@@ -60,6 +49,14 @@ std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromBuffer(
   return model;
 }
 
+std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromModel(
+    const tflite::Model* model_spec, ErrorReporter* error_reporter) {
+  std::unique_ptr<FlatBufferModel> model;
+  model.reset(new FlatBufferModel(model_spec, error_reporter));
+  if (!model->initialized()) model.reset();
+  return model;
+}
+
 FlatBufferModel::FlatBufferModel(const char* filename, bool mmap_file,
                                  ErrorReporter* error_reporter, bool use_nnapi)
     : error_reporter_(error_reporter ? error_reporter
@@ -72,10 +69,9 @@ FlatBufferModel::FlatBufferModel(const char* filename, bool mmap_file,
   } else {
     allocation_ = new FileCopyAllocation(filename, error_reporter);
   }
-  if (!allocation_->valid()) return;
-  if (!CheckModelIdentifier()) return;
+  if (!allocation_->valid() || !CheckModelIdentifier()) return;
 
-  model_ = VerifyAndGetModel(allocation_->base(), allocation_->bytes());
+  model_ = ::tflite::GetModel(allocation_->base());
 }
 
 bool FlatBufferModel::CheckModelIdentifier() const {
@@ -96,7 +92,14 @@ FlatBufferModel::FlatBufferModel(const char* ptr, size_t num_bytes,
   allocation_ = new MemoryAllocation(ptr, num_bytes, error_reporter);
   if (!allocation_->valid()) return;
 
-  model_ = VerifyAndGetModel(allocation_->base(), allocation_->bytes());
+  model_ = ::tflite::GetModel(allocation_->base());
+}
+
+FlatBufferModel::FlatBufferModel(const Model* model,
+                                 ErrorReporter* error_reporter)
+    : error_reporter_(error_reporter ? error_reporter
+                                     : DefaultErrorReporter()) {
+  model_ = model;
 }
 
 FlatBufferModel::~FlatBufferModel() { delete allocation_; }
@@ -160,6 +163,27 @@ std::vector<int> FlatBufferIntArrayToVector(T* flat_array) {
   return ret;
 }
 
+// Copies the contents from the flatbuffer int vector `flatbuffer` into the
+// int array `buffer`. `flat_vector` and `buffer` represent the same
+// configuration operation for a given operation.
+void FlatBufferIntVectorToArray(int max_size_of_buffer,
+                                const flatbuffers::Vector<int32_t>* flat_vector,
+                                int* buffer, ErrorReporter* error_reporter) {
+  if (!flat_vector) {
+    error_reporter->Report("Input array not provided for operation.\n");
+  } else {
+    int num_dimensions = flat_vector->Length();
+    if (num_dimensions > max_size_of_buffer / sizeof(int)) {
+      error_reporter->Report(
+          "Found too many dimensions in the operation's input array.\n");
+    } else {
+      for (int i = 0; i < num_dimensions; ++i) {
+        buffer[i] = flat_vector->Get(i);
+      }
+    }
+  }
+}
+
 // Allocate a structure using C malloc, but make sure the structure is a
 // POD structure that doesn't require constructors to run. The reason we do
 // this, is that Interpreter's C extension part will take ownership and wants
@@ -175,6 +199,9 @@ T* MallocPOD() {
 // This handles builtin data explicitly as there are flatbuffer schemas.
 //
 // Returns memory that must be feed.
+//
+// TODO(nupurgarg): Pass in void ** and return TfLiteStatus to ensure program
+// crashes if error reporter is called.
 void* ParseOpData(const Operator* op, BuiltinOperator op_type,
                   ErrorReporter* error_reporter) {
   auto parse_padding = [](Padding padding) {
@@ -192,7 +219,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         return kTfLiteActNone;
       case ActivationFunctionType_RELU:
         return kTfLiteActRelu;
-      case ActivationFunctionType_RELU1:
+      case ActivationFunctionType_RELU_N1_TO_1:
         return kTfLiteActRelu1;
       case ActivationFunctionType_RELU6:
         return kTfLiteActRelu6;
@@ -248,9 +275,10 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_TANH:
     case BuiltinOperator_LOGISTIC:
     case BuiltinOperator_RELU:
-    case BuiltinOperator_RELU1:
+    case BuiltinOperator_RELU_N1_TO_1:
     case BuiltinOperator_RELU6:
     case BuiltinOperator_CONCAT_EMBEDDINGS:
+    case BuiltinOperator_EXP:
       break;
     case BuiltinOperator_LSH_PROJECTION: {
       TfLiteLSHProjectionParams* params =
@@ -301,6 +329,18 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN:
+    case BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN: {
+      TfLiteSequenceRNNParams* params = MallocPOD<TfLiteSequenceRNNParams>();
+      if (auto* sequence_rnn_params =
+              op->builtin_options_as_SequenceRNNOptions()) {
+        params->activation =
+            parse_activation(sequence_rnn_params->fused_activation_function());
+        params->time_major = sequence_rnn_params->time_major();
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
     case BuiltinOperator_RNN: {
       TfLiteRNNParams* params = MallocPOD<TfLiteRNNParams>();
       if (auto* rnn_params = op->builtin_options_as_RNNOptions()) {
@@ -375,6 +415,24 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_DIV: {
+      auto* params = MallocPOD<TfLiteDivParams>();
+      if (auto* schema_params = op->builtin_options_as_DivOptions()) {
+        params->activation =
+            parse_activation(schema_params->fused_activation_function());
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_SUB: {
+      auto* params = MallocPOD<TfLiteSubParams>();
+      if (auto* schema_params = op->builtin_options_as_SubOptions()) {
+        params->activation =
+            parse_activation(schema_params->fused_activation_function());
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
     case BuiltinOperator_L2_NORMALIZATION: {
       auto* params = MallocPOD<TfLiteL2NormParams>();
       if (auto* schema_params = op->builtin_options_as_L2NormOptions()) {
@@ -396,6 +454,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM:
     case BuiltinOperator_LSTM: {
       TfLiteLSTMParams* params = MallocPOD<TfLiteLSTMParams>();
       if (auto* lstm_params = op->builtin_options_as_LSTMOptions()) {
@@ -411,29 +470,21 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       auto* params = MallocPOD<TfLiteResizeBilinearParams>();
       if (auto* schema_params =
               op->builtin_options_as_ResizeBilinearOptions()) {
-        params->new_height = schema_params->new_height();
-        params->new_width = schema_params->new_width();
+        params->align_corners = schema_params->align_corners();
       }
       builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_PAD: {
+      break;
+    }
     case BuiltinOperator_RESHAPE: {
       auto* params = MallocPOD<TfLiteReshapeParams>();
       if (auto* schema_params = op->builtin_options_as_ReshapeOptions()) {
         auto* new_shape = schema_params->new_shape();
-        if (!new_shape) {
-          error_reporter->Report("No new_shape provided for Reshape\n");
-        } else {
-          params->num_dimensions = new_shape->Length();
-          if (params->num_dimensions > sizeof(params->shape) / sizeof(int)) {
-            error_reporter->Report(
-                "Found too many dimensions in Reshape's new_shape\n");
-          } else {
-            for (int i = 0; i < params->num_dimensions; ++i) {
-              params->shape[i] = new_shape->Get(i);
-            }
-          }
-        }
+        FlatBufferIntVectorToArray(sizeof(params->shape), new_shape,
+                                   params->shape, error_reporter);
+        params->num_dimensions = new_shape->Length();
       }
       builtin_data = reinterpret_cast<void*>(params);
       break;
@@ -456,6 +507,56 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_GATHER: {
+      TfLiteGatherParams* params = MallocPOD<TfLiteGatherParams>();
+      params->axis = 0;
+      if (auto* gather_params = op->builtin_options_as_GatherOptions()) {
+        params->axis = gather_params->axis();
+      }
+
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_SPACE_TO_BATCH_ND: {
+      break;
+    }
+    case BuiltinOperator_BATCH_TO_SPACE_ND: {
+      break;
+    }
+    case BuiltinOperator_TRANSPOSE: {
+      break;
+    }
+    case BuiltinOperator_MEAN: {
+      auto* params = MallocPOD<TfLiteMeanParams>();
+      if (auto* schema_params = op->builtin_options_as_MeanOptions()) {
+        params->keep_dims = schema_params->keep_dims();
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_SQUEEZE: {
+      auto* params = MallocPOD<TfLiteSqueezeParams>();
+      if (auto* schema_params = op->builtin_options_as_SqueezeOptions()) {
+        const auto& squeeze_dims = schema_params->squeeze_dims();
+        FlatBufferIntVectorToArray(sizeof(params->squeeze_dims), squeeze_dims,
+                                   params->squeeze_dims, error_reporter);
+        params->num_squeeze_dims = squeeze_dims->Length();
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_STRIDED_SLICE: {
+      auto* params = MallocPOD<TfLiteStridedSliceParams>();
+      if (auto* schema_params = op->builtin_options_as_StridedSliceOptions()) {
+        params->begin_mask = schema_params->begin_mask();
+        params->end_mask = schema_params->end_mask();
+        params->ellipsis_mask = schema_params->ellipsis_mask();
+        params->new_axis_mask = schema_params->new_axis_mask();
+        params->shrink_axis_mask = schema_params->shrink_axis_mask();
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
   }
   return builtin_data;
 }
diff --git a/tensorflow/contrib/lite/model.h b/tensorflow/contrib/lite/model.h
index 15659d33f37dfb2f119480ed88d2e1b81f34c145..a467df5bb4eee3f6ce814512cb8b74bf09a6a4e7 100644
--- a/tensorflow/contrib/lite/model.h
+++ b/tensorflow/contrib/lite/model.h
@@ -31,8 +31,8 @@ limitations under the License.
 // OpResolver must be defined to provide your kernel implementations to the
 // interpreter. This is environment specific and may consist of just the builtin
 // ops, or some custom operators you defined to extend tflite.
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODEL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODEL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_MODEL_H_
+#define TENSORFLOW_CONTRIB_LITE_MODEL_H_
 
 #include <memory>
 #include "tensorflow/contrib/lite/error_reporter.h"
@@ -45,18 +45,25 @@ namespace tflite {
 // or mmapped. This uses flatbuffers as the serialization format.
 class FlatBufferModel {
  public:
-  // Build a model based on a file. Return a nullptr in case of failure.
+  // Builds a model based on a file. Returns a nullptr in case of failure.
   static std::unique_ptr<FlatBufferModel> BuildFromFile(
       const char* filename,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
-  // Build a model based on a pre-loaded flatbuffer. The caller retains
+  // Builds a model based on a pre-loaded flatbuffer. The caller retains
   // ownership of the buffer and should keep it alive until the returned object
-  // is destroyed. Return a nullptr in case of failure.
+  // is destroyed. Returns a nullptr in case of failure.
   static std::unique_ptr<FlatBufferModel> BuildFromBuffer(
       const char* buffer, size_t buffer_size,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
+  // Builds a model directly from a flatbuffer pointer. The caller retains
+  // ownership of the buffer and should keep it alive until the returned object
+  // is destroyed. Returns a nullptr in case of failure.
+  static std::unique_ptr<FlatBufferModel> BuildFromModel(
+      const tflite::Model* model_spec,
+      ErrorReporter* error_reporter = DefaultErrorReporter());
+
   // Releases memory or unmaps mmaped meory.
   ~FlatBufferModel();
 
@@ -75,7 +82,7 @@ class FlatBufferModel {
   bool CheckModelIdentifier() const;
 
  private:
-  // Load a model from `filename`. If `mmap_file` is true then use mmap,
+  // Loads a model from `filename`. If `mmap_file` is true then use mmap,
   // otherwise make a copy of the model in a buffer.
   //
   // Note, if `error_reporter` is null, then a DefaultErrorReporter() will be
@@ -85,8 +92,8 @@ class FlatBufferModel {
       ErrorReporter* error_reporter = DefaultErrorReporter(),
       bool use_nnapi = false);
 
-  // Load a model from `ptr` and `num_bytes` of the model file. The `ptr` has to
-  // remain alive and unchanged until the end of this flatbuffermodel's
+  // Loads a model from `ptr` and `num_bytes` of the model file. The `ptr` has
+  // to remain alive and unchanged until the end of this flatbuffermodel's
   // lifetime.
   //
   // Note, if `error_reporter` is null, then a DefaultErrorReporter() will be
@@ -94,6 +101,10 @@ class FlatBufferModel {
   FlatBufferModel(const char* ptr, size_t num_bytes,
                   ErrorReporter* error_reporter = DefaultErrorReporter());
 
+  // Loads a model from Model flatbuffer. The `model` has to remain alive and
+  // unchanged until the end of this flatbuffermodel's lifetime.
+  FlatBufferModel(const Model* model, ErrorReporter* error_reporter);
+
   // Flatbuffer traverser pointer. (Model* is a pointer that is within the
   // allocated memory of the data allocated by allocation's internals.
   const tflite::Model* model_ = nullptr;
@@ -106,9 +117,9 @@ class FlatBufferModel {
 // model are mapped to executable function pointers (TfLiteRegistrations).
 class OpResolver {
  public:
-  // Find the op registration for a builtin operator by enum code.
+  // Finds the op registration for a builtin operator by enum code.
   virtual TfLiteRegistration* FindOp(tflite::BuiltinOperator op) const = 0;
-  // Find the op registration of a custom operator by op name.
+  // Finds the op registration of a custom operator by op name.
   virtual TfLiteRegistration* FindOp(const char* op) const = 0;
   virtual ~OpResolver() {}
 };
@@ -131,7 +142,7 @@ class InterpreterBuilder {
  public:
   InterpreterBuilder(const FlatBufferModel& model,
                      const OpResolver& op_resolver);
-  // Build an interpreter given only the raw flatbuffer Model object (instead
+  // Builds an interpreter given only the raw flatbuffer Model object (instead
   // of a FlatBufferModel). Mostly used for testing.
   // If `error_reporter` is null, then DefaultErrorReporter() is used.
   InterpreterBuilder(const ::tflite::Model* model,
@@ -162,4 +173,4 @@ class InterpreterBuilder {
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODEL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_MODEL_H_
diff --git a/tensorflow/contrib/lite/model_test.cc b/tensorflow/contrib/lite/model_test.cc
index 61043866420752b552281e353be9a2b41a6aadc8..66f22fd66a9ae0d35553a1f780ef73a5c5994c99 100644
--- a/tensorflow/contrib/lite/model_test.cc
+++ b/tensorflow/contrib/lite/model_test.cc
@@ -20,12 +20,12 @@ limitations under the License.
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
-#include <string>
 
 #include "tensorflow/contrib/lite/model.h"
 
 #include <gtest/gtest.h>
 #include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/testing/util.h"
 
 // Comparison for TfLiteRegistration. Since TfLiteRegistration is a C object,
 // we must declare this in global namespace, so argument-dependent operator
@@ -246,12 +246,26 @@ TEST(BasicFlatBufferModel, TestNullErrorReporter) {
   ASSERT_NE(interpreter->Invoke(), kTfLiteOk);
 }
 
-// Test what happens if we cannot bind any of the ops.
-TEST(BasicFlatBufferModel, TestBuildModelFromCorruptedData) {
-  std::string corrupted_data = "123";
-  auto model = FlatBufferModel::BuildFromBuffer(corrupted_data.c_str(),
-                                                corrupted_data.length());
-  ASSERT_FALSE(model);
+// Test that loading model directly from a Model flatbuffer works.
+TEST(BasicFlatBufferModel, TestBuildFromModel) {
+  TestErrorReporter reporter;
+  FileCopyAllocation model_allocation(
+      "tensorflow/contrib/lite/testdata/test_model.bin", &reporter);
+  ASSERT_TRUE(model_allocation.valid());
+  ::flatbuffers::Verifier verifier(
+      reinterpret_cast<const uint8_t*>(model_allocation.base()),
+      model_allocation.bytes());
+  ASSERT_TRUE(VerifyModelBuffer(verifier));
+  const Model* model_fb = ::tflite::GetModel(model_allocation.base());
+
+  auto model = FlatBufferModel::BuildFromModel(model_fb);
+  ASSERT_TRUE(model);
+
+  std::unique_ptr<Interpreter> interpreter;
+  ASSERT_EQ(
+      InterpreterBuilder(*model, TrivialResolver(&dummy_reg))(&interpreter),
+      kTfLiteOk);
+  ASSERT_NE(interpreter, nullptr);
 }
 
 // TODO(aselle): Add tests for serialization of builtin op data types.
@@ -261,7 +275,7 @@ TEST(BasicFlatBufferModel, TestBuildModelFromCorruptedData) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/models/BUILD b/tensorflow/contrib/lite/models/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..6a1255b586ef04b80159156a78f0c4569a4661c5
--- /dev/null
+++ b/tensorflow/contrib/lite/models/BUILD
@@ -0,0 +1,26 @@
+# Model tests
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
+
+exports_files(glob([
+    "testdata/*",
+]))
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/models/smartreply/BUILD b/tensorflow/contrib/lite/models/smartreply/BUILD
index fbdf19f2054cf01aec44e3fcb13d0d0a2ff6f914..733c3f4c7fa0605f24a1e6b4c458e34310c079c4 100644
--- a/tensorflow/contrib/lite/models/smartreply/BUILD
+++ b/tensorflow/contrib/lite/models/smartreply/BUILD
@@ -1,7 +1,92 @@
 package(default_visibility = ["//visibility:public"])
 
+load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts", "gen_selected_ops")
+
 licenses(["notice"])  # Apache 2.0
 
+gen_selected_ops(
+    name = "smartreply_ops",
+    model = "@tflite_smartreply//:smartreply.tflite",
+)
+
+cc_library(
+    name = "custom_ops",
+    srcs = [
+        "ops/extract_feature.cc",
+        "ops/normalize.cc",
+        "ops/predict.cc",
+        ":smartreply_ops",
+    ],
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/tools:mutable_op_resolver",
+        "@com_google_absl//absl/strings",
+        "@com_googlesource_code_re2//:re2",
+        "@farmhash_archive//:farmhash",
+    ],
+)
+
+cc_library(
+    name = "predictor_lib",
+    srcs = ["predictor.cc"],
+    hdrs = ["predictor.h"],
+    copts = tflite_copts(),
+    deps = [
+        ":custom_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/tools:mutable_op_resolver",
+        "@com_google_absl//absl/strings",
+        "@com_googlesource_code_re2//:re2",
+    ],
+)
+
+cc_test(
+    name = "extract_feature_op_test",
+    size = "small",
+    srcs = ["ops/extract_feature_test.cc"],
+    deps = [
+        ":custom_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+        "@farmhash_archive//:farmhash",
+    ],
+)
+
+cc_test(
+    name = "normalize_op_test",
+    size = "small",
+    srcs = ["ops/normalize_test.cc"],
+    deps = [
+        ":custom_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_test(
+    name = "predict_op_test",
+    size = "small",
+    srcs = ["ops/predict_test.cc"],
+    deps = [
+        ":custom_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/AndroidManifest.xml b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/AndroidManifest.xml
new file mode 100644
index 0000000000000000000000000000000000000000..75ed9432c8fcdfd77a64d3c659e6336c977cdda2
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/AndroidManifest.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2017 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+  package="com.example.android.smartreply" >
+
+  <uses-sdk
+      android:minSdkVersion="15"
+      android:targetSdkVersion="24" />
+
+  <application android:label="TfLite SmartReply Demo">
+    <activity
+        android:name="com.example.android.smartreply.MainActivity"
+        android:configChanges="orientation|keyboardHidden|screenSize"
+        android:windowSoftInputMode="stateUnchanged|adjustPan"
+        android:label="TfLite SmartReply Demo"
+        android:screenOrientation="portrait" >
+      <intent-filter>
+        <action android:name="android.intent.action.MAIN" />
+        <category android:name="android.intent.category.LAUNCHER" />
+      </intent-filter>
+    </activity>
+  </application>
+
+</manifest>
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/BUILD b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f8767b443a2aa64b666c3b6bfb7db30cc0be62ea
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/BUILD
@@ -0,0 +1,65 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow/contrib/lite:build_def.bzl",
+    "tflite_copts",
+    "tflite_jni_binary",
+)
+
+filegroup(
+    name = "assets",
+    srcs = [
+        "@tflite_smartreply//:model_files",
+    ],
+)
+
+android_binary(
+    name = "SmartReplyDemo",
+    srcs = glob(["java/**/*.java"]),
+    assets = [":assets"],
+    assets_dir = "",
+    custom_package = "com.example.android.smartreply",
+    manifest = "AndroidManifest.xml",
+    nocompress_extensions = [
+        ".tflite",
+    ],
+    resource_files = glob(["res/**"]),
+    tags = ["manual"],
+    deps = [
+        ":smartreply_runtime",
+        "@androidsdk//com.android.support:support-v13-25.2.0",
+        "@androidsdk//com.android.support:support-v4-25.2.0",
+    ],
+)
+
+cc_library(
+    name = "smartreply_runtime",
+    srcs = ["libsmartreply_jni.so"],
+    visibility = ["//visibility:public"],
+)
+
+tflite_jni_binary(
+    name = "libsmartreply_jni.so",
+    deps = [
+        ":smartreply_jni_lib",
+    ],
+)
+
+cc_library(
+    name = "smartreply_jni_lib",
+    srcs = [
+        "smartreply_jni.cc",
+    ],
+    copts = tflite_copts(),
+    linkopts = [
+        "-lm",
+        "-ldl",
+    ],
+    deps = [
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/models/smartreply:predictor_lib",
+    ],
+    alwayslink = 1,
+)
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/assets/BUILD b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/assets/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..3c882ffc43fde577801428151a43b592e8faaed1
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/assets/BUILD
@@ -0,0 +1,15 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(glob(["*"]))
+
+filegroup(
+    name = "assets_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "BUILD",
+        ],
+    ),
+)
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/assets/backoff_response.txt b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/assets/backoff_response.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a0a5b46b5f8d5fd6a0297c8056bb2fb9b6ad9ada
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/assets/backoff_response.txt
@@ -0,0 +1,16 @@
+Ok
+Yes
+No
+👍
+☺
+😟
+❤️
+Lol
+Thanks
+Got it
+Done
+Nice
+I don't know
+What?
+Why?
+What's up?
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/MainActivity.java b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/MainActivity.java
new file mode 100644
index 0000000000000000000000000000000000000000..02fec9ae5e971ad756ae6c2b0149a6aacfa27cad
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/MainActivity.java
@@ -0,0 +1,99 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package com.example.android.smartreply;
+
+import android.app.Activity;
+import android.os.Bundle;
+import android.os.Handler;
+import android.util.Log;
+import android.view.View;
+import android.widget.Button;
+import android.widget.EditText;
+import android.widget.TextView;
+
+/**
+ * The main (and only) activity of this demo app. Displays a text box which updates as messages are
+ * received.
+ */
+public class MainActivity extends Activity {
+  private static final String TAG = "SmartReplyDemo";
+  private SmartReplyClient client;
+
+  private Button sendButton;
+  private TextView messageTextView;
+  private EditText messageInput;
+
+  private Handler handler;
+
+  @Override
+  protected void onCreate(Bundle savedInstanceState) {
+    super.onCreate(savedInstanceState);
+    Log.v(TAG, "onCreate");
+    setContentView(R.layout.main_activity);
+
+    client = new SmartReplyClient(getApplicationContext());
+    handler = new Handler();
+
+    sendButton = (Button) findViewById(R.id.send_button);
+    sendButton.setOnClickListener(
+        (View v) -> {
+          send(messageInput.getText().toString());
+        });
+
+    messageTextView = (TextView) findViewById(R.id.message_text);
+    messageInput = (EditText) findViewById(R.id.message_input);
+  }
+
+  @Override
+  protected void onStart() {
+    super.onStart();
+    Log.v(TAG, "onStart");
+    handler.post(
+        () -> {
+          client.loadModel();
+        });
+  }
+
+  @Override
+  protected void onStop() {
+    super.onStop();
+    Log.v(TAG, "onStop");
+    handler.post(
+        () -> {
+          client.unloadModel();
+        });
+  }
+
+  private void send(final String message) {
+    handler.post(
+        () -> {
+          messageTextView.append("Input: " + message + "\n");
+
+          SmartReply[] ans = client.predict(new String[] {message});
+          for (SmartReply reply : ans) {
+            appendMessage("Reply: " + reply.getText());
+          }
+          appendMessage("------");
+        });
+  }
+
+  private void appendMessage(final String message) {
+    handler.post(
+        () -> {
+          messageTextView.append(message + "\n");
+        });
+  }
+}
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReply.java b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReply.java
new file mode 100644
index 0000000000000000000000000000000000000000..3357fd17c11f870d1b0998bb26ffa9abf149686b
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReply.java
@@ -0,0 +1,44 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package com.example.android.smartreply;
+
+import android.support.annotation.Keep;
+
+/**
+ * SmartReply contains predicted message, and confidence.
+ *
+ * <p>NOTE: this class used by JNI, class name and constructor should not be obfuscated.
+ */
+@Keep
+public class SmartReply {
+
+  private final String text;
+  private final float score;
+
+  @Keep
+  public SmartReply(String text, float score) {
+    this.text = text;
+    this.score = score;
+  }
+
+  public String getText() {
+    return text;
+  }
+
+  public float getScore() {
+    return score;
+  }
+}
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java
new file mode 100644
index 0000000000000000000000000000000000000000..d5b1ac0ffbc47283aa0c1bf68c0a85ad6228cdcc
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java
@@ -0,0 +1,129 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package com.example.android.smartreply;
+
+import android.content.Context;
+import android.content.res.AssetFileDescriptor;
+import android.support.annotation.Keep;
+import android.support.annotation.WorkerThread;
+import android.util.Log;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.ArrayList;
+import java.util.List;
+
+/** Interface to load TfLite model and provide predictions. */
+public class SmartReplyClient implements AutoCloseable {
+  private static final String TAG = "SmartReplyDemo";
+  private static final String MODEL_PATH = "smartreply.tflite";
+  private static final String BACKOFF_PATH = "backoff_response.txt";
+  private static final String JNI_LIB = "smartreply_jni";
+
+  private final Context context;
+  private long storage;
+  private MappedByteBuffer model;
+
+  private volatile boolean isLibraryLoaded;
+
+  public SmartReplyClient(Context context) {
+    this.context = context;
+  }
+
+  public boolean isLoaded() {
+    return storage != 0;
+  }
+
+  @WorkerThread
+  public synchronized void loadModel() {
+    if (!isLibraryLoaded) {
+      System.loadLibrary(JNI_LIB);
+      isLibraryLoaded = true;
+    }
+
+    try {
+      model = loadModelFile();
+      String[] backoff = loadBackoffList();
+      storage = loadJNI(model, backoff);
+    } catch (IOException e) {
+      Log.e(TAG, "Fail to load model", e);
+      return;
+    }
+  }
+
+  @WorkerThread
+  public synchronized SmartReply[] predict(String[] input) {
+    if (storage != 0) {
+      return predictJNI(storage, input);
+    } else {
+      return new SmartReply[] {};
+    }
+  }
+
+  @WorkerThread
+  public synchronized void unloadModel() {
+    close();
+  }
+
+  @Override
+  public synchronized void close() {
+    if (storage != 0) {
+      unloadJNI(storage);
+      storage = 0;
+    }
+  }
+
+  private MappedByteBuffer loadModelFile() throws IOException {
+    AssetFileDescriptor fileDescriptor = context.getAssets().openFd(MODEL_PATH);
+    FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
+    try {
+      FileChannel fileChannel = inputStream.getChannel();
+      long startOffset = fileDescriptor.getStartOffset();
+      long declaredLength = fileDescriptor.getDeclaredLength();
+      return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
+    } finally {
+      inputStream.close();
+    }
+  }
+
+  private String[] loadBackoffList() throws IOException {
+    List<String> labelList = new ArrayList<String>();
+    BufferedReader reader =
+        new BufferedReader(new InputStreamReader(context.getAssets().open(BACKOFF_PATH)));
+    String line;
+    while ((line = reader.readLine()) != null) {
+      if (!line.isEmpty()) {
+        labelList.add(line);
+      }
+    }
+    reader.close();
+    String[] ans = new String[labelList.size()];
+    labelList.toArray(ans);
+    return ans;
+  }
+
+  @Keep
+  private native long loadJNI(MappedByteBuffer buffer, String[] backoff);
+
+  @Keep
+  private native SmartReply[] predictJNI(long storage, String[] text);
+
+  @Keep
+  private native void unloadJNI(long storage);
+}
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/res/layout/main_activity.xml b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/res/layout/main_activity.xml
new file mode 100644
index 0000000000000000000000000000000000000000..23b4cadc007a4457d33b8c8fecf9b1e7b7436320
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/res/layout/main_activity.xml
@@ -0,0 +1,44 @@
+<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:orientation="vertical">
+
+    <LinearLayout
+        android:layout_width="fill_parent"
+        android:layout_height="0dp"
+        android:padding="5dip"
+        android:layout_weight="3">
+
+        <TextView
+            android:id="@+id/message_text"
+            android:layout_width="fill_parent"
+            android:layout_height="fill_parent"
+            android:scrollbars="vertical"
+            android:gravity="bottom"/>
+    </LinearLayout>
+
+    <LinearLayout
+        android:layout_width="fill_parent"
+        android:layout_height="0dp"
+        android:padding="5dip"
+        android:layout_weight="1">
+
+        <EditText
+            android:id="@+id/message_input"
+            android:layout_width="0dp"
+            android:layout_height="fill_parent"
+            android:layout_weight="6"
+            android:scrollbars="vertical"
+            android:hint="Enter Text"
+            android:gravity="top"
+            android:inputType="text"/>
+        <Button
+            android:id="@+id/send_button"
+            android:layout_width="0dp"
+            android:layout_height="fill_parent"
+            android:layout_weight="2"
+            android:text="Send" />
+    </LinearLayout>
+
+</LinearLayout>
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/smartreply_jni.cc b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/smartreply_jni.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f158cc511a9bee0710aee13cd04f77b6f95fb868
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/smartreply_jni.cc
@@ -0,0 +1,129 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <jni.h>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/models/smartreply/predictor.h"
+
+const char kIllegalStateException[] = "java/lang/IllegalStateException";
+
+using tflite::custom::smartreply::GetSegmentPredictions;
+using tflite::custom::smartreply::PredictorResponse;
+
+template <typename T>
+T CheckNotNull(JNIEnv* env, T&& t) {
+  if (t == nullptr) {
+    env->ThrowNew(env->FindClass(kIllegalStateException), "");
+    return nullptr;
+  }
+  return std::forward<T>(t);
+}
+
+std::vector<std::string> jniStringArrayToVector(JNIEnv* env,
+                                                jobjectArray string_array) {
+  int count = env->GetArrayLength(string_array);
+  std::vector<std::string> result;
+  for (int i = 0; i < count; i++) {
+    auto jstr =
+        reinterpret_cast<jstring>(env->GetObjectArrayElement(string_array, i));
+    const char* raw_str = env->GetStringUTFChars(jstr, JNI_FALSE);
+    result.emplace_back(std::string(raw_str));
+    env->ReleaseStringUTFChars(jstr, raw_str);
+  }
+  return result;
+}
+
+struct JNIStorage {
+  std::vector<std::string> backoff_list;
+  std::unique_ptr<::tflite::FlatBufferModel> model;
+};
+
+extern "C" JNIEXPORT jlong JNICALL
+Java_com_example_android_smartreply_SmartReplyClient_loadJNI(
+    JNIEnv* env, jobject thiz, jobject model_buffer,
+    jobjectArray backoff_list) {
+  const char* buf =
+      static_cast<char*>(env->GetDirectBufferAddress(model_buffer));
+  jlong capacity = env->GetDirectBufferCapacity(model_buffer);
+
+  JNIStorage* storage = new JNIStorage;
+  storage->model = tflite::FlatBufferModel::BuildFromBuffer(
+      buf, static_cast<size_t>(capacity));
+  storage->backoff_list = jniStringArrayToVector(env, backoff_list);
+
+  if (!storage->model) {
+    delete storage;
+    env->ThrowNew(env->FindClass(kIllegalStateException), "");
+    return 0;
+  }
+  return reinterpret_cast<jlong>(storage);
+}
+
+extern "C" JNIEXPORT jobjectArray JNICALL
+Java_com_example_android_smartreply_SmartReplyClient_predictJNI(
+    JNIEnv* env, jobject /*thiz*/, jlong storage_ptr, jobjectArray input_text) {
+  // Predict
+  if (storage_ptr == 0) {
+    return nullptr;
+  }
+  JNIStorage* storage = reinterpret_cast<JNIStorage*>(storage_ptr);
+  if (storage == nullptr) {
+    return nullptr;
+  }
+  std::vector<PredictorResponse> responses;
+  GetSegmentPredictions(jniStringArrayToVector(env, input_text),
+                        *storage->model, {storage->backoff_list}, &responses);
+
+  // Create a SmartReply[] to return back to Java
+  jclass smart_reply_class = CheckNotNull(
+      env, env->FindClass("com/example/android/smartreply/SmartReply"));
+  if (env->ExceptionCheck()) {
+    return nullptr;
+  }
+  jmethodID smart_reply_ctor = CheckNotNull(
+      env,
+      env->GetMethodID(smart_reply_class, "<init>", "(Ljava/lang/String;F)V"));
+  if (env->ExceptionCheck()) {
+    return nullptr;
+  }
+  jobjectArray array = CheckNotNull(
+      env, env->NewObjectArray(responses.size(), smart_reply_class, nullptr));
+  if (env->ExceptionCheck()) {
+    return nullptr;
+  }
+  for (int i = 0; i < responses.size(); i++) {
+    jstring text =
+        CheckNotNull(env, env->NewStringUTF(responses[i].GetText().data()));
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+    jobject reply = env->NewObject(smart_reply_class, smart_reply_ctor, text,
+                                   responses[i].GetScore());
+    env->SetObjectArrayElement(array, i, reply);
+  }
+  return array;
+}
+
+extern "C" JNIEXPORT void JNICALL
+Java_com_example_android_smartreply_SmartReplyClient_unloadJNI(
+    JNIEnv* env, jobject thiz, jlong storage_ptr) {
+  if (storage_ptr != 0) {
+    JNIStorage* storage = reinterpret_cast<JNIStorage*>(storage_ptr);
+    delete storage;
+  }
+}
diff --git a/tensorflow/contrib/lite/models/smartreply/g3doc/README.md b/tensorflow/contrib/lite/models/smartreply/g3doc/README.md
index cab5dcca43a31ec3cf824f00d6794ea9e66d9bf8..a6d75648b3f3da98afd85daad6c2234e73a802e8 100644
--- a/tensorflow/contrib/lite/models/smartreply/g3doc/README.md
+++ b/tensorflow/contrib/lite/models/smartreply/g3doc/README.md
@@ -137,8 +137,8 @@ Following are the ops supported for using On-Device Smart Reply model:
 
 *   **HASHTABLE_LOOKUP**
 
-    This is a custom op that uses label id from predict op and looks up the
-    response text from the given label id.
+    This is an op inside TensorFlow Lite that uses label id from predict op and
+    looks up the response text from the given label id.
 
 ## Further Information
 
diff --git a/tensorflow/contrib/lite/models/smartreply/ops/extract_feature.cc b/tensorflow/contrib/lite/models/smartreply/ops/extract_feature.cc
index 1c422b659abc0871a346b8cffc260df4b22a4f9d..f97a6486d6c11cf0184622f515fe5b1e096c6257 100644
--- a/tensorflow/contrib/lite/models/smartreply/ops/extract_feature.cc
+++ b/tensorflow/contrib/lite/models/smartreply/ops/extract_feature.cc
@@ -23,7 +23,7 @@ limitations under the License.
 
 #include <algorithm>
 #include <map>
-#include "re2/re2.h"
+
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/kernels/kernel_util.h"
 #include "tensorflow/contrib/lite/string_util.h"
@@ -81,7 +81,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* label = GetOutput(context, node, 0);
   TfLiteTensor* weight = GetOutput(context, node, 1);
 
-  std::map<int64, int> feature_id_counts;
+  std::map<int64_t, int> feature_id_counts;
   for (int i = 0; i < num_strings; i++) {
     // Use fingerprint of feature name as id.
     auto strref = tflite::GetString(input, i);
@@ -91,10 +91,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       continue;
     }
 
-    int64 feature_id =
+    int64_t feature_id =
         ::util::Fingerprint64(strref.str, strref.len) % kMaxDimension;
-
-    label->data.i32[i] = static_cast<int32>(feature_id);
+    label->data.i32[i] = static_cast<int32_t>(feature_id);
     weight->data.f[i] =
         std::count(strref.str, strref.str + strref.len, ' ') + 1;
   }
diff --git a/tensorflow/contrib/lite/models/smartreply/ops/normalize.cc b/tensorflow/contrib/lite/models/smartreply/ops/normalize.cc
index d0dc2a35a7cc527bef0b24508f207da8eec17fc0..c55ac9f52f7293a8ba5baf17f2052e11a7422074 100644
--- a/tensorflow/contrib/lite/models/smartreply/ops/normalize.cc
+++ b/tensorflow/contrib/lite/models/smartreply/ops/normalize.cc
@@ -21,7 +21,10 @@ limitations under the License.
 // Output:
 //     Output[0]: Normalized sentence. string[1]
 //
-#include "absl/strings/ascii.h"
+
+#include <algorithm>
+#include <string>
+
 #include "absl/strings/str_cat.h"
 #include "absl/strings/strip.h"
 #include "re2/re2.h"
@@ -50,7 +53,7 @@ const std::map<string, string>* kRegexTransforms =
 
 static const char kStartToken[] = "<S>";
 static const char kEndToken[] = "<E>";
-static const int32 kMaxInputChars = 300;
+static const int32_t kMaxInputChars = 300;
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   tflite::StringRef input = tflite::GetString(GetInput(context, node, 0), 0);
diff --git a/tensorflow/contrib/lite/models/smartreply/predictor.cc b/tensorflow/contrib/lite/models/smartreply/predictor.cc
index a28222213ea8c66a1e9288ba9ae06aea7653f108..6da5cc8eecc0920850f666b0992c4d9598c55b6c 100644
--- a/tensorflow/contrib/lite/models/smartreply/predictor.cc
+++ b/tensorflow/contrib/lite/models/smartreply/predictor.cc
@@ -30,7 +30,7 @@ namespace custom {
 namespace smartreply {
 
 // Split sentence into segments (using punctuation).
-std::vector<string> SplitSentence(const string& input) {
+std::vector<std::string> SplitSentence(const std::string& input) {
   string result(input);
 
   RE2::GlobalReplace(&result, "([?.!,])+", " \\1");
@@ -38,12 +38,13 @@ std::vector<string> SplitSentence(const string& input) {
   RE2::GlobalReplace(&result, "[ ]+", " ");
   RE2::GlobalReplace(&result, "\t+$", "");
 
-  return strings::Split(result, '\t');
+  return absl::StrSplit(result, '\t');
 }
 
 // Predict with TfLite model.
-void ExecuteTfLite(const string& sentence, ::tflite::Interpreter* interpreter,
-                   std::map<string, float>* response_map) {
+void ExecuteTfLite(const std::string& sentence,
+                   ::tflite::Interpreter* interpreter,
+                   std::map<std::string, float>* response_map) {
   {
     TfLiteTensor* input = interpreter->tensor(interpreter->inputs()[0]);
     tflite::DynamicBuffer buf;
@@ -67,8 +68,8 @@ void ExecuteTfLite(const string& sentence, ::tflite::Interpreter* interpreter,
 }
 
 void GetSegmentPredictions(
-    const std::vector<string>& input, const ::tflite::FlatBufferModel& model,
-    const SmartReplyConfig& config,
+    const std::vector<std::string>& input,
+    const ::tflite::FlatBufferModel& model, const SmartReplyConfig& config,
     std::vector<PredictorResponse>* predictor_responses) {
   // Initialize interpreter
   std::unique_ptr<::tflite::Interpreter> interpreter;
@@ -82,10 +83,10 @@ void GetSegmentPredictions(
   }
 
   // Execute Tflite Model
-  std::map<string, float> response_map;
-  std::vector<string> sentences;
-  for (const string& str : input) {
-    std::vector<string> splitted_str = SplitSentence(str);
+  std::map<std::string, float> response_map;
+  std::vector<std::string> sentences;
+  for (const std::string& str : input) {
+    std::vector<std::string> splitted_str = SplitSentence(str);
     sentences.insert(sentences.end(), splitted_str.begin(), splitted_str.end());
   }
   for (const auto& sentence : sentences) {
diff --git a/tensorflow/contrib/lite/models/smartreply/predictor.h b/tensorflow/contrib/lite/models/smartreply/predictor.h
index 3b9a2b32e17f93f7ebbf35e77ec1e238fe14b020..90260c8d620b0e756f72089d3f4d8d9f92d44fbe 100644
--- a/tensorflow/contrib/lite/models/smartreply/predictor.h
+++ b/tensorflow/contrib/lite/models/smartreply/predictor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODELS_SMARTREPLY_PREDICTOR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODELS_SMARTREPLY_PREDICTOR_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_MODELS_SMARTREPLY_PREDICTOR_H_
+#define TENSORFLOW_CONTRIB_LITE_MODELS_SMARTREPLY_PREDICTOR_H_
 
 #include <string>
 #include <vector>
@@ -34,7 +34,7 @@ struct SmartReplyConfig;
 // With a given string as input, predict the response with a Tflite model.
 // When config.backoff_response is not empty, predictor_responses will be filled
 // with messagees from backoff response.
-void GetSegmentPredictions(const std::vector<string>& input,
+void GetSegmentPredictions(const std::vector<std::string>& input,
                            const ::tflite::FlatBufferModel& model,
                            const SmartReplyConfig& config,
                            std::vector<PredictorResponse>* predictor_responses);
@@ -43,17 +43,17 @@ void GetSegmentPredictions(const std::vector<string>& input,
 // It includes messages, and confidence.
 class PredictorResponse {
  public:
-  PredictorResponse(const string& response_text, float score) {
+  PredictorResponse(const std::string& response_text, float score) {
     response_text_ = response_text;
     prediction_score_ = score;
   }
 
   // Accessor methods.
-  const string& GetText() const { return response_text_; }
+  const std::string& GetText() const { return response_text_; }
   float GetScore() const { return prediction_score_; }
 
  private:
-  string response_text_ = "";
+  std::string response_text_ = "";
   float prediction_score_ = 0.0;
 };
 
@@ -65,9 +65,9 @@ struct SmartReplyConfig {
   float backoff_confidence;
   // Backoff responses are used when predicted responses cannot fulfill the
   // list.
-  const std::vector<string>& backoff_responses;
+  const std::vector<std::string>& backoff_responses;
 
-  SmartReplyConfig(std::vector<string> backoff_responses)
+  SmartReplyConfig(std::vector<std::string> backoff_responses)
       : num_response(kDefaultNumResponse),
         backoff_confidence(kDefaultBackoffConfidence),
         backoff_responses(backoff_responses) {}
@@ -77,4 +77,4 @@ struct SmartReplyConfig {
 }  // namespace custom
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODELS_SMARTREPLY_PREDICTOR_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_MODELS_SMARTREPLY_PREDICTOR_H_
diff --git a/tensorflow/contrib/lite/models/smartreply/predictor_test.cc b/tensorflow/contrib/lite/models/smartreply/predictor_test.cc
index 2fa9923bc93d7e559884b6880187637b78f4b217..e6c8d966f1aff5a867f9469f8fcdec526df84763 100644
--- a/tensorflow/contrib/lite/models/smartreply/predictor_test.cc
+++ b/tensorflow/contrib/lite/models/smartreply/predictor_test.cc
@@ -18,12 +18,13 @@ limitations under the License.
 #include <fstream>
 #include <unordered_set>
 
-#include "base/logging.h"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_split.h"
-#include "tensorflow/contrib/lite/models/test_utils.h"
+//#include "tensorflow/contrib/lite/models/test_utils.h"
+#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/core/platform/test.h"
 
 namespace tflite {
 namespace custom {
@@ -33,6 +34,11 @@ namespace {
 const char kModelName[] = "smartreply_ondevice_model.bin";
 const char kSamples[] = "smartreply_samples.tsv";
 
+string TestDataPath() {
+  return string(StrCat(tensorflow::testing::TensorFlowSrcRoot(), "/",
+                       "contrib/lite/models/testdata/"));
+}
+
 MATCHER_P(IncludeAnyResponesIn, expected_response, "contains the response") {
   bool has_expected_response = false;
   for (const auto &item : *arg) {
@@ -65,7 +71,6 @@ TEST_F(PredictorTest, GetSegmentPredictions) {
 
   float max = 0;
   for (const auto &item : predictions) {
-    LOG(INFO) << "Response: " << item.GetText();
     if (item.GetScore() > max) {
       max = item.GetScore();
     }
@@ -86,7 +91,6 @@ TEST_F(PredictorTest, TestTwoSentences) {
 
   float max = 0;
   for (const auto &item : predictions) {
-    LOG(INFO) << "Response: " << item.GetText();
     if (item.GetScore() > max) {
       max = item.GetScore();
     }
@@ -119,7 +123,7 @@ TEST_F(PredictorTest, BatchTest) {
   string line;
   std::ifstream fin(StrCat(TestDataPath(), "/", kSamples));
   while (std::getline(fin, line)) {
-    const std::vector<string> &fields = strings::Split(line, '\t');
+    const std::vector<string> fields = absl::StrSplit(line, '\t');
     if (fields.empty()) {
       continue;
     }
@@ -139,9 +143,8 @@ TEST_F(PredictorTest, BatchTest) {
                                   fields.begin() + 1, fields.end())));
   }
 
-  LOG(INFO) << "Responses: " << total_responses << " / " << total_items;
-  LOG(INFO) << "Triggers: " << total_triggers << " / " << total_items;
   EXPECT_EQ(total_triggers, total_items);
+  EXPECT_GE(total_responses, total_triggers);
 }
 
 }  // namespace
diff --git a/tensorflow/contrib/lite/models/speech_hotword_model_test.cc b/tensorflow/contrib/lite/models/speech_hotword_model_test.cc
deleted file mode 100644
index 0b8266447adf758184fe3b1ad6a77f1ac6045193..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/models/speech_hotword_model_test.cc
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Unit test for speech Hotword model using TFLite Ops.
-
-#include <string.h>
-
-#include <memory>
-#include <string>
-
-#include "base/logging.h"
-#include "testing/base/public/googletest.h"
-#include <gtest/gtest.h>
-#include "absl/strings/str_cat.h"
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/models/test_utils.h"
-
-namespace tflite {
-namespace models {
-
-void RunTest(int model_input_tensor, int svdf_layer_state_tensor,
-             int model_output_tensor, const string& model_name,
-             const string& golden_in_name, const string& golden_out_name) {
-  // Read the model.
-  string tflite_file_path = StrCat(TestDataPath(), "/", model_name);
-  auto model = FlatBufferModel::BuildFromFile(tflite_file_path.c_str());
-  CHECK(model) << "Failed to read model from file " << tflite_file_path;
-
-  // Initialize the interpreter.
-  ops::builtin::BuiltinOpResolver builtins;
-  std::unique_ptr<Interpreter> interpreter;
-  InterpreterBuilder(*model, builtins)(&interpreter);
-  CHECK(interpreter != nullptr);
-  interpreter->AllocateTensors();
-
-  // Reset the SVDF layer state.
-  memset(interpreter->tensor(svdf_layer_state_tensor)->data.raw, 0,
-         interpreter->tensor(svdf_layer_state_tensor)->bytes);
-
-  // Load the input frames.
-  Frames input_frames;
-  const string input_file_path = StrCat(TestDataPath(), "/", golden_in_name);
-  ReadFrames(input_file_path, &input_frames);
-
-  // Load the golden output results.
-  Frames output_frames;
-  const string output_file_path = StrCat(TestDataPath(), "/", golden_out_name);
-  ReadFrames(output_file_path, &output_frames);
-
-  const int speech_batch_size =
-      interpreter->tensor(model_input_tensor)->dims->data[0];
-  const int speech_input_size =
-      interpreter->tensor(model_input_tensor)->dims->data[1];
-  const int speech_output_size =
-      interpreter->tensor(model_output_tensor)->dims->data[1];
-  const int input_sequence_size =
-      input_frames[0].size() / (speech_input_size * speech_batch_size);
-  float* input_ptr = interpreter->tensor(model_input_tensor)->data.f;
-  float* output_ptr = interpreter->tensor(model_output_tensor)->data.f;
-
-  // The first layer (SVDF) input size is 40 (speech_input_size). Each speech
-  // input frames for this model is 1280 floats, which can be fed to input in a
-  // sequence of size 32 (input_sequence_size).
-  for (int i = 0; i < TestInputSize(input_frames); i++) {
-    int frame_ptr = 0;
-    for (int s = 0; s < input_sequence_size; s++) {
-      for (int k = 0; k < speech_input_size * speech_batch_size; k++) {
-        input_ptr[k] = input_frames[i][frame_ptr++];
-      }
-      interpreter->Invoke();
-    }
-    // After the whole frame (1280 floats) is fed, we can check the output frame
-    // matches with the golden output frame.
-    for (int k = 0; k < speech_output_size; k++) {
-      ASSERT_NEAR(output_ptr[k], output_frames[i][k], 1e-5);
-    }
-  }
-}
-
-TEST(SpeechHotword, OkGoogleTestRank1) {
-  constexpr int kModelInputTensor = 0;
-  constexpr int kSvdfLayerStateTensor = 4;
-  constexpr int kModelOutputTensor = 18;
-
-  RunTest(kModelInputTensor, kSvdfLayerStateTensor, kModelOutputTensor,
-          "speech_hotword_model_rank1.tflite", "speech_hotword_model_in.csv",
-          "speech_hotword_model_out_rank1.csv");
-}
-
-TEST(SpeechHotword, OkGoogleTestRank2) {
-  constexpr int kModelInputTensor = 17;
-  constexpr int kSvdfLayerStateTensor = 1;
-  constexpr int kModelOutputTensor = 18;
-  RunTest(kModelInputTensor, kSvdfLayerStateTensor, kModelOutputTensor,
-          "speech_hotword_model_rank2.tflite", "speech_hotword_model_in.csv",
-          "speech_hotword_model_out_rank2.csv");
-}
-
-}  // namespace models
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/speech_speakerid_model_test.cc b/tensorflow/contrib/lite/models/speech_speakerid_model_test.cc
deleted file mode 100644
index 9da0fb1fc62360dcf584c4a08f99b0cef9964a0d..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/models/speech_speakerid_model_test.cc
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Unit test for speech SpeakerId model using TFLite Ops.
-
-#include <string.h>
-
-#include <memory>
-#include <string>
-
-#include "base/logging.h"
-#include "testing/base/public/googletest.h"
-#include <gtest/gtest.h>
-#include "absl/strings/str_cat.h"
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/models/test_utils.h"
-#include "tensorflow/contrib/lite/tools/mutable_op_resolver.h"
-
-void RegisterSelectedOps(::tflite::MutableOpResolver* resolver);
-
-namespace tflite {
-namespace models {
-
-constexpr int kModelInputTensor = 0;
-constexpr int kLstmLayer1OutputStateTensor = 19;
-constexpr int kLstmLayer1CellStateTensor = 20;
-constexpr int kLstmLayer2OutputStateTensor = 40;
-constexpr int kLstmLayer2CellStateTensor = 41;
-constexpr int kLstmLayer3OutputStateTensor = 61;
-constexpr int kLstmLayer3CellStateTensor = 62;
-constexpr int kModelOutputTensor = 66;
-
-TEST(SpeechSpeakerId, OkGoogleTest) {
-  // Read the model.
-  string tflite_file_path =
-      StrCat(TestDataPath(), "/", "speech_speakerid_model.tflite");
-  auto model = FlatBufferModel::BuildFromFile(tflite_file_path.c_str());
-  CHECK(model) << "Failed to read model from file " << tflite_file_path;
-
-  // Initialize the interpreter.
-  ::tflite::MutableOpResolver resolver;
-  RegisterSelectedOps(&resolver);
-  std::unique_ptr<Interpreter> interpreter;
-  InterpreterBuilder(*model, resolver)(&interpreter);
-  CHECK(interpreter != nullptr);
-  interpreter->AllocateTensors();
-
-  // Load the input frames.
-  Frames input_frames;
-  const string input_file_path =
-      StrCat(TestDataPath(), "/", "speech_speakerid_model_in.csv");
-  ReadFrames(input_file_path, &input_frames);
-
-  // Load the golden output results.
-  Frames output_frames;
-  const string output_file_path =
-      StrCat(TestDataPath(), "/", "speech_speakerid_model_out.csv");
-  ReadFrames(output_file_path, &output_frames);
-
-  const int speech_batch_size =
-      interpreter->tensor(kModelInputTensor)->dims->data[0];
-  const int speech_input_size =
-      interpreter->tensor(kModelInputTensor)->dims->data[1];
-  const int speech_output_size =
-      interpreter->tensor(kModelOutputTensor)->dims->data[1];
-
-  float* input_ptr = interpreter->tensor(kModelInputTensor)->data.f;
-  float* output_ptr = interpreter->tensor(kModelOutputTensor)->data.f;
-
-  // Clear the LSTM state for layers.
-  memset(interpreter->tensor(kLstmLayer1OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer1OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer1CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer1CellStateTensor)->bytes);
-
-  memset(interpreter->tensor(kLstmLayer2OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer2OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer2CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer2CellStateTensor)->bytes);
-
-  memset(interpreter->tensor(kLstmLayer3OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer3OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer3CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer3CellStateTensor)->bytes);
-  for (int i = 0; i < input_frames.size(); i++) {
-    // Feed the input to model.
-    int frame_ptr = 0;
-    for (int k = 0; k < speech_input_size * speech_batch_size; k++) {
-      input_ptr[k] = input_frames[i][frame_ptr++];
-    }
-    // Run the model.
-    interpreter->Invoke();
-    // Validate the output.
-    for (int k = 0; k < speech_output_size; k++) {
-      ASSERT_NEAR(output_ptr[k], output_frames[i][k], 1e-5);
-    }
-  }
-}
-
-}  // namespace models
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/speech_terse_am_model_test.cc b/tensorflow/contrib/lite/models/speech_terse_am_model_test.cc
deleted file mode 100644
index 30d89a135403db2ef6e4533ddcc321206bf8bd5e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/models/speech_terse_am_model_test.cc
+++ /dev/null
@@ -1,127 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Unit test for speech TERSE AM model using TFLite Ops.
-
-#include <string.h>
-
-#include <memory>
-#include <string>
-
-#include "base/logging.h"
-#include "file/base/path.h"
-#include "testing/base/public/googletest.h"
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/models/test_utils.h"
-
-namespace tflite {
-namespace models {
-
-constexpr int kModelInputTensor = 0;
-constexpr int kLstmLayer1OutputStateTensor = 19;
-constexpr int kLstmLayer1CellStateTensor = 20;
-constexpr int kLstmLayer2OutputStateTensor = 40;
-constexpr int kLstmLayer2CellStateTensor = 41;
-constexpr int kLstmLayer3OutputStateTensor = 61;
-constexpr int kLstmLayer3CellStateTensor = 62;
-constexpr int kLstmLayer4OutputStateTensor = 82;
-constexpr int kLstmLayer4CellStateTensor = 83;
-constexpr int kLstmLayer5OutputStateTensor = 103;
-constexpr int kLstmLayer5CellStateTensor = 104;
-constexpr int kModelOutputTensor = 109;
-
-TEST(SpeechTerseAm, RandomIOTest) {
-  // Read the model.
-  string tflite_file_path =
-      file::JoinPath(TestDataPath(), "speech_terse_am_model.tflite");
-  auto model = FlatBufferModel::BuildFromFile(tflite_file_path.c_str());
-  CHECK(model) << "Failed to mmap model " << tflite_file_path;
-
-  // Initialize the interpreter.
-  ops::builtin::BuiltinOpResolver builtins;
-  std::unique_ptr<Interpreter> interpreter;
-  InterpreterBuilder(*model, builtins)(&interpreter);
-  CHECK(interpreter != nullptr);
-  interpreter->AllocateTensors();
-
-  // Load the input frames.
-  Frames input_frames;
-  const string input_file_path =
-      file::JoinPath(TestDataPath(), "speech_terse_am_model_in.csv");
-  ReadFrames(input_file_path, &input_frames);
-
-  // Load the golden output results.
-  Frames output_frames;
-  const string output_file_path =
-      file::JoinPath(TestDataPath(), "speech_terse_am_model_out.csv");
-  ReadFrames(output_file_path, &output_frames);
-
-  const int speech_batch_size =
-      interpreter->tensor(kModelInputTensor)->dims->data[0];
-  const int speech_input_size =
-      interpreter->tensor(kModelInputTensor)->dims->data[1];
-  const int speech_output_size =
-      interpreter->tensor(kModelOutputTensor)->dims->data[1];
-
-  float* input_ptr = interpreter->tensor(kModelInputTensor)->data.f;
-  float* output_ptr = interpreter->tensor(kModelOutputTensor)->data.f;
-
-  // Clear the LSTM state for layers.
-  memset(interpreter->tensor(kLstmLayer1OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer1OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer1CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer1CellStateTensor)->bytes);
-
-  memset(interpreter->tensor(kLstmLayer2OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer2OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer2CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer2CellStateTensor)->bytes);
-
-  memset(interpreter->tensor(kLstmLayer3OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer3OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer3CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer3CellStateTensor)->bytes);
-
-  memset(interpreter->tensor(kLstmLayer4OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer4OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer4CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer4CellStateTensor)->bytes);
-
-  memset(interpreter->tensor(kLstmLayer5OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer5OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer5CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer5CellStateTensor)->bytes);
-
-
-  for (int i = 0; i < input_frames.size(); i++) {
-    // Feed the input to model.
-    int frame_ptr = 0;
-    for (int k = 0; k < speech_input_size * speech_batch_size; k++) {
-      input_ptr[k] = input_frames[i][frame_ptr++];
-    }
-    // Run the model.
-    interpreter->Invoke();
-    // Validate the output.
-    for (int k = 0; k < speech_output_size; k++) {
-      ASSERT_NEAR(output_ptr[k], output_frames[i][k], 5.2e-4);
-    }
-  }
-}
-
-}  // namespace models
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/speech_terse_lm_model_test.cc b/tensorflow/contrib/lite/models/speech_terse_lm_model_test.cc
deleted file mode 100644
index 04c54ffb2201acaac069e01707e10194f78789fd..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/models/speech_terse_lm_model_test.cc
+++ /dev/null
@@ -1,122 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Unit test for speech ASR LM model using TFLite Ops.
-
-#include <string.h>
-
-#include <memory>
-#include <string>
-
-#include "base/logging.h"
-#include "file/base/path.h"
-#include "testing/base/public/googletest.h"
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/models/test_utils.h"
-
-namespace tflite {
-namespace models {
-
-constexpr int kModelInput1Tensor = 0;
-constexpr int kModelInput2Tensor = 66;
-constexpr int kLstmLayer1OutputStateTensor = 21;
-constexpr int kLstmLayer1CellStateTensor = 22;
-constexpr int kLstmLayer2OutputStateTensor = 42;
-constexpr int kLstmLayer2CellStateTensor = 43;
-constexpr int kLstmLayer3OutputStateTensor = 63;
-constexpr int kLstmLayer3CellStateTensor = 64;
-constexpr int kModelOutputTensor = 75;
-
-static void ClearLstmStates(Interpreter* interpreter) {
-  memset(interpreter->tensor(kLstmLayer1OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer1OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer1CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer1CellStateTensor)->bytes);
-
-  memset(interpreter->tensor(kLstmLayer2OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer2OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer2CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer2CellStateTensor)->bytes);
-
-  memset(interpreter->tensor(kLstmLayer3OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer3OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer3CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer3CellStateTensor)->bytes);
-}
-
-TEST(SpeechTerseLm, EndToEndTest) {
-  // Read the model.
-  string tflite_file_path =
-      file::JoinPath(TestDataPath(), "speech_terse_lm_model.tflite");
-  auto model = FlatBufferModel::BuildFromFile(tflite_file_path.c_str());
-  CHECK(model) << "Failed to mmap model " << tflite_file_path;
-
-  // Initialize the interpreter.
-  ops::builtin::BuiltinOpResolver builtins;
-  std::unique_ptr<Interpreter> interpreter;
-  InterpreterBuilder(*model, builtins)(&interpreter);
-  CHECK(interpreter != nullptr);
-  interpreter->AllocateTensors();
-
-  // Load the input frames.
-  Frames input_frames;
-  const string input_file_path =
-      file::JoinPath(TestDataPath(), "speech_terse_lm_model_in.csv");
-  ReadFrames(input_file_path, &input_frames);
-
-  // Load the golden output results.
-  Frames output_frames;
-  const string output_file_path =
-      file::JoinPath(TestDataPath(), "speech_terse_lm_model_out.csv");
-  ReadFrames(output_file_path, &output_frames);
-
-  CHECK_EQ(interpreter->tensor(kModelInput1Tensor)->dims->size, 1);
-  const int input1_size =
-      interpreter->tensor(kModelInput1Tensor)->dims->data[0];
-  CHECK_EQ(input1_size, 1);
-  CHECK_EQ(interpreter->tensor(kModelInput2Tensor)->dims->size, 1);
-  const int output_size =
-      interpreter->tensor(kModelOutputTensor)->dims->data[0];
-  CHECK_EQ(output_size, 1);
-
-  int* input_lookup_ptr = interpreter->tensor(kModelInput1Tensor)->data.i32;
-  int* output_lookup_ptr = interpreter->tensor(kModelInput2Tensor)->data.i32;
-  float* output_ptr = interpreter->tensor(kModelOutputTensor)->data.f;
-
-
-  for (int i = 0; i < input_frames.size(); i++) {
-    float output_score = 0.0f;
-    // Reset LSTM states for each sequence.
-    ClearLstmStates(interpreter.get());
-    // For subsequent inputs feed them sequentially, one-by-one.
-    for (int k = 1; k < input_frames[i].size(); k++) {
-      // Feed the inputs to model.
-      input_lookup_ptr[0] = static_cast<int32>(input_frames[i][k - 1]);
-      output_lookup_ptr[0] = static_cast<int32>(input_frames[i][k]);
-      // Run the model.
-      interpreter->Invoke();
-      // Sum up the outputs.
-      output_score += output_ptr[0];
-    }
-    // Validate the output.
-    ASSERT_NEAR(output_score, output_frames[i][0], 1.4e-5);
-  }
-}
-
-}  // namespace models
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/speech_test.cc b/tensorflow/contrib/lite/models/speech_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..daa8c3100b64e9290256aa14a6ab641f19174a0a
--- /dev/null
+++ b/tensorflow/contrib/lite/models/speech_test.cc
@@ -0,0 +1,189 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for speech models (Hotword, SpeakerId) using TFLite Ops.
+
+#include <memory>
+#include <string>
+
+#include <fstream>
+
+#include "testing/base/public/googletest.h"
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/testing/parse_testdata.h"
+#include "tensorflow/contrib/lite/testing/split.h"
+#include "tensorflow/contrib/lite/testing/tflite_driver.h"
+
+namespace tflite {
+namespace {
+
+const char kDataPath[] = "third_party/tensorflow/contrib/lite/models/testdata/";
+
+bool Init(const string& in_file_name, testing::TfLiteDriver* driver,
+          std::ifstream* in_file) {
+  driver->SetModelBaseDir(kDataPath);
+  in_file->open(string(kDataPath) + in_file_name, std::ifstream::in);
+  return in_file->is_open();
+}
+
+// Converts a set of test files provided by the speech team into a single
+// test_spec. Input CSV files are supposed to contain a number of sequences per
+// line. Each sequence maps to a single invocation of the interpreter and the
+// output tensor after all sequences have run is compared to the corresponding
+// line in the output CSV file.
+bool ConvertCsvData(const string& model_name, const string& in_name,
+                    const string& out_name, const string& input_tensor,
+                    const string& output_tensor,
+                    const string& persistent_tensors, int sequence_size,
+                    std::ostream* out) {
+  auto data_path = [](const string& s) { return string(kDataPath) + s; };
+
+  *out << "load_model: \"" << data_path(model_name) << "\"" << std::endl;
+
+  *out << "init_state: \"" << persistent_tensors << "\"" << std::endl;
+
+  string in_file_name = data_path(in_name);
+  std::ifstream in_file(in_file_name);
+  if (!in_file.is_open()) {
+    std::cerr << "Failed to open " << in_file_name << std::endl;
+    return false;
+  }
+  string out_file_name = data_path(out_name);
+  std::ifstream out_file(out_file_name);
+  if (!out_file.is_open()) {
+    std::cerr << "Failed to open " << out_file_name << std::endl;
+    return false;
+  }
+
+  int invocation_count = 0;
+  string in_values;
+  while (std::getline(in_file, in_values, '\n')) {
+    std::vector<string> input = testing::Split<string>(in_values, ",");
+    int num_sequences = input.size() / sequence_size;
+
+    for (int j = 0; j < num_sequences; ++j) {
+      *out << "invoke {" << std::endl;
+      *out << "  id: " << invocation_count << std::endl;
+      *out << "  input: \"";
+      for (int k = 0; k < sequence_size; ++k) {
+        *out << input[k + j * sequence_size] << ",";
+      }
+      *out << "\"" << std::endl;
+
+      if (j == num_sequences - 1) {
+        string out_values;
+        if (!std::getline(out_file, out_values, '\n')) {
+          std::cerr << "Not enough lines in " << out_file_name << std::endl;
+          return false;
+        }
+        *out << "  output: \"" << out_values << "\"" << std::endl;
+      }
+
+      *out << "}" << std::endl;
+      ++invocation_count;
+    }
+  }
+  return true;
+}
+
+TEST(SpeechTest, HotwordOkGoogleRank1Test) {
+  std::stringstream os;
+  ASSERT_TRUE(ConvertCsvData(
+      "speech_hotword_model_rank1.tflite", "speech_hotword_model_in.csv",
+      "speech_hotword_model_out_rank1.csv", /*input_tensor=*/"0",
+      /*output_tensor=*/"18", /*persistent_tensors=*/"4",
+      /*sequence_size=*/40, &os));
+  testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
+  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver))
+      << test_driver.GetErrorMessage();
+}
+
+TEST(SpeechTest, HotwordOkGoogleRank2Test) {
+  std::stringstream os;
+  ASSERT_TRUE(ConvertCsvData(
+      "speech_hotword_model_rank2.tflite", "speech_hotword_model_in.csv",
+      "speech_hotword_model_out_rank2.csv", /*input_tensor=*/"17",
+      /*output_tensor=*/"18", /*persistent_tensors=*/"1",
+      /*sequence_size=*/40, &os));
+  testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
+  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver))
+      << test_driver.GetErrorMessage();
+}
+
+TEST(SpeechTest, SpeakerIdOkGoogleTest) {
+  std::stringstream os;
+  ASSERT_TRUE(ConvertCsvData(
+      "speech_speakerid_model.tflite", "speech_speakerid_model_in.csv",
+      "speech_speakerid_model_out.csv", /*input_tensor=*/"0",
+      /*output_tensor=*/"66",
+      /*persistent_tensors=*/"19,20,40,41,61,62",
+      /*sequence_size=*/80, &os));
+  testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
+  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver))
+      << test_driver.GetErrorMessage();
+}
+
+TEST(SpeechTest, AsrAmTest) {
+  std::stringstream os;
+  ASSERT_TRUE(
+      ConvertCsvData("speech_asr_am_model.tflite", "speech_asr_am_model_in.csv",
+                     "speech_asr_am_model_out.csv", /*input_tensor=*/"0",
+                     /*output_tensor=*/"109",
+                     /*persistent_tensors=*/"19,20,40,41,61,62,82,83,103,104",
+                     /*sequence_size=*/320, &os));
+  testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
+  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver))
+      << test_driver.GetErrorMessage();
+}
+
+// The original version of speech_asr_lm_model_test.cc ran a few sequences
+// through the interpreter and stored the sum of all the output, which was them
+// compared for correctness. In this test we are comparing all the intermediate
+// results.
+TEST(SpeechTest, AsrLmTest) {
+  std::ifstream in_file;
+  testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
+  ASSERT_TRUE(Init("speech_asr_lm_model.test_spec", &test_driver, &in_file));
+  ASSERT_TRUE(testing::ParseAndRunTests(&in_file, &test_driver))
+      << test_driver.GetErrorMessage();
+}
+
+TEST(SpeechTest, EndpointerTest) {
+  std::stringstream os;
+  ASSERT_TRUE(ConvertCsvData(
+      "speech_endpointer_model.tflite", "speech_endpointer_model_in.csv",
+      "speech_endpointer_model_out.csv", /*input_tensor=*/"0",
+      /*output_tensor=*/"58",
+      /*persistent_tensors=*/"28,29,49,50",
+      /*sequence_size=*/320, &os));
+  testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
+  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver))
+      << test_driver.GetErrorMessage();
+}
+
+TEST(SpeechTest, TtsTest) {
+  std::stringstream os;
+  ASSERT_TRUE(ConvertCsvData("speech_tts_model.tflite",
+                             "speech_tts_model_in.csv",
+                             "speech_tts_model_out.csv", /*input_tensor=*/"0",
+                             /*output_tensor=*/"74",
+                             /*persistent_tensors=*/"25,26,46,47,67,68,73",
+                             /*sequence_size=*/334, &os));
+  testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
+  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver))
+      << test_driver.GetErrorMessage();
+}
+
+}  // namespace
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/speech_tts_model_test.cc b/tensorflow/contrib/lite/models/speech_tts_model_test.cc
deleted file mode 100644
index 88291776892f3186ca5bfc726e814f8d23d73b11..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/models/speech_tts_model_test.cc
+++ /dev/null
@@ -1,116 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Unit test for speech TTS model using TFLite Ops.
-
-#include <string.h>
-
-#include <memory>
-#include <string>
-
-#include "base/logging.h"
-#include "testing/base/public/googletest.h"
-#include <gtest/gtest.h>
-#include "absl/strings/str_cat.h"
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/models/test_utils.h"
-
-namespace tflite {
-namespace models {
-
-constexpr int kModelInputTensor = 0;
-constexpr int kLstmLayer1OutputStateTensor = 25;
-constexpr int kLstmLayer1CellStateTensor = 26;
-constexpr int kLstmLayer2OutputStateTensor = 46;
-constexpr int kLstmLayer2CellStateTensor = 47;
-constexpr int kLstmLayer3OutputStateTensor = 67;
-constexpr int kLstmLayer3CellStateTensor = 68;
-constexpr int kRnnLayerHiddenStateTensor = 73;
-constexpr int kModelOutputTensor = 74;
-
-TEST(SpeechTTS, RandomIOTest) {
-  // Read the model.
-  string tflite_file_path =
-      StrCat(TestDataPath(), "/", "speech_tts_model.tflite");
-  auto model = FlatBufferModel::BuildFromFile(tflite_file_path.c_str());
-  CHECK(model) << "Failed to mmap model " << tflite_file_path;
-
-  // Initialize the interpreter.
-  ops::builtin::BuiltinOpResolver builtins;
-  std::unique_ptr<Interpreter> interpreter;
-  InterpreterBuilder(*model, builtins)(&interpreter);
-  CHECK(interpreter != nullptr);
-  interpreter->AllocateTensors();
-
-  // Load the input frames.
-  Frames input_frames;
-  const string input_file_path =
-      StrCat(TestDataPath(), "/", "speech_tts_model_in.csv");
-  ReadFrames(input_file_path, &input_frames);
-
-  // Load the golden output results.
-  Frames output_frames;
-  const string output_file_path =
-      StrCat(TestDataPath(), "/", "speech_tts_model_out.csv");
-  ReadFrames(output_file_path, &output_frames);
-
-  const int speech_batch_size =
-      interpreter->tensor(kModelInputTensor)->dims->data[0];
-  const int speech_input_size =
-      interpreter->tensor(kModelInputTensor)->dims->data[1];
-  const int speech_output_size =
-      interpreter->tensor(kModelOutputTensor)->dims->data[1];
-
-  float* input_ptr = interpreter->tensor(kModelInputTensor)->data.f;
-  float* output_ptr = interpreter->tensor(kModelOutputTensor)->data.f;
-
-  // Clear the LSTM state for layers.
-  memset(interpreter->tensor(kLstmLayer1OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer1OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer1CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer1CellStateTensor)->bytes);
-
-  memset(interpreter->tensor(kLstmLayer2OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer2OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer2CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer2CellStateTensor)->bytes);
-
-  memset(interpreter->tensor(kLstmLayer3OutputStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer3OutputStateTensor)->bytes);
-  memset(interpreter->tensor(kLstmLayer3CellStateTensor)->data.raw, 0,
-         interpreter->tensor(kLstmLayer3CellStateTensor)->bytes);
-
-  memset(interpreter->tensor(kRnnLayerHiddenStateTensor)->data.raw, 0,
-         interpreter->tensor(kRnnLayerHiddenStateTensor)->bytes);
-
-  for (int i = 0; i < input_frames.size(); i++) {
-    // Feed the input to model.
-    int frame_ptr = 0;
-    for (int k = 0; k < speech_input_size * speech_batch_size; k++) {
-      input_ptr[k] = input_frames[i][frame_ptr++];
-    }
-    // Run the model.
-    interpreter->Invoke();
-    // Validate the output.
-    for (int k = 0; k < speech_output_size; k++) {
-      ASSERT_NEAR(output_ptr[k], output_frames[i][k], 1e-5);
-    }
-  }
-}
-
-}  // namespace models
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/test_utils.h b/tensorflow/contrib/lite/models/test_utils.h
deleted file mode 100644
index 1e14c26a3544ed44f9395ff3b59a70551a1a6394..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/models/test_utils.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODELS_TEST_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODELS_TEST_UTILS_H_
-
-#include <stdlib.h>
-#include <string.h>
-
-#include <fstream>
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace tflite {
-namespace models {
-using Frames = std::vector<std::vector<float>>;
-}  // namespace models
-}  // namespace tflite
-
-#ifndef __ANDROID__
-#include "absl/strings/str_cat.h"
-#include "tensorflow/core/platform/test.h"
-
-inline string TestDataPath() {
-  return string(StrCat(tensorflow::testing::TensorFlowSrcRoot(), "/",
-                       "contrib/lite/models/testdata/"));
-}
-inline int TestInputSize(const tflite::models::Frames& input_frames) {
-  return input_frames.size();
-}
-#else
-inline string TestDataPath() {
-  return string("third_party/tensorflow/contrib/lite/models/testdata/");
-}
-
-inline int TestInputSize(const tflite::models::Frames& input_frames) {
-  // Android TAP is very slow, we only test the first 20 frames.
-  return 20;
-}
-#endif
-
-namespace tflite {
-namespace models {
-
-// Read float data from a comma-separated file:
-// Each line will be read into a float vector.
-// The return result will be a vector of float vectors.
-void ReadFrames(const string& csv_file_path, Frames* frames) {
-  std::ifstream csv_file(csv_file_path);
-  string line;
-  while (std::getline(csv_file, line, '\n')) {
-    std::vector<float> fields;
-    // Used by strtok_r internaly for successive calls on the same string.
-    char* save_ptr = nullptr;
-
-    // Tokenize the line.
-    char* next_token =
-        strtok_r(const_cast<char*>(line.c_str()), ",", &save_ptr);
-    while (next_token != nullptr) {
-      float f = strtod(next_token, nullptr);
-      fields.push_back(f);
-      next_token = strtok_r(nullptr, ",", &save_ptr);
-    }
-    frames->push_back(fields);
-  }
-  csv_file.close();
-}
-
-}  // namespace models
-}  // namespace tflite
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODELS_TEST_UTILS_H_
diff --git a/tensorflow/contrib/lite/models/testdata/g3doc/README.md b/tensorflow/contrib/lite/models/testdata/g3doc/README.md
index c9630c00db56a0d40979f9fe9704cf0c9583a015..1c47e00aae2a0e76ba04004a2fc3cc02ec4536f7 100644
--- a/tensorflow/contrib/lite/models/testdata/g3doc/README.md
+++ b/tensorflow/contrib/lite/models/testdata/g3doc/README.md
@@ -53,7 +53,7 @@ with the corresponding parameters as shown in the figure.
 ### Automatic Speech Recognizer (ASR) Acoustic Model (AM)
 
 The acoustic model for automatic speech recognition is the neural network model
-for matching phonemes to the input autio features. It generates posterior
+for matching phonemes to the input audio features. It generates posterior
 probabilities of phonemes from speech frontend features (log-mel filterbanks).
 It has an input size of 320 (float), an output size of 42 (float), five LSTM
 layers and one fully connected layers with a Softmax activation function, with
@@ -68,13 +68,27 @@ for predicting the probability of a word given previous words in a sentence.
 It generates posterior probabilities of the next word based from a sequence of
 words. The words are encoded as indices in a fixed size dictionary.
 The model has two inputs both of size one (integer): the current word index and
-next word index, an output size of one (float): the log probability. It consits
+next word index, an output size of one (float): the log probability. It consists
 of three embedding layer, three LSTM layers, followed by a multiplication, a
 fully connected layers and an addition.
 The corresponding parameters as shown in the figure.
 
 ![asr_lm_model](asr_lm.svg "ASR LM model")
 
+### Endpointer Model
+
+The endpointer model is the neural network model for predicting end of speech
+in an utterance. More precisely, it generates posterior probabilities of various
+events that allow detection of speech start and end events.
+It has an input size of 40 (float) which are speech frontend features
+(log-mel filterbanks), and an output size of four corresponding to:
+speech, intermediate non-speech, initial non-speech, and final non-speech.
+The model consists of a convolutional layer, followed by a fully-connected
+layer, two LSTM layers, and two additional fully-connected layers.
+The corresponding parameters as shown in the figure.
+![endpointer_model](endpointer.svg "Endpointer model")
+
+
 ## Speech models test input/output generation
 
 As mentioned above the input to models are generated from a pre-processing
@@ -86,25 +100,40 @@ same input.
 
 ### Models:
 
-[Speech hotword model (Svdf rank=1)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank1_2017_11_14.tflite)
+[Speech hotword model (Svdf
+rank=1)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank1_2017_11_14.tflite)
 
-[Speech hotword model (Svdf rank=2)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank2_2017_11_14.tflite)
+[Speech hotword model (Svdf
+rank=2)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank2_2017_11_14.tflite)
 
-[Speaker-id model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_speakerid_model_2017_11_14.tflite)
+[Speaker-id
+model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_speakerid_model_2017_11_14.tflite)
 
-[TTS model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_tts_model_2017_11_14.tflite)
+[TTS
+model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_tts_model_2017_11_14.tflite)
 
-[ASR AM model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_terse_am_model_2017_11_14.tflite)
+[ASR AM
+model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_terse_am_model_2017_11_14.tflite)
 
 ### Test benches
 
-[Speech hotword model test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_hotword_model_test.cc)
+[Speech hotword model
+test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_hotword_model_test.cc)
+
+[Speaker-id model
+test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_speakerid_model_test.cc)
+
+[TTS model
+test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_tts_model_test.cc)
 
-[Speaker-id model test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_speakerid_model_test.cc)
+[ASR AM model
+test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_asr_am_model_test.cc)
 
-[TTS model test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_tts_model_test.cc)
+[ASR LM model
+test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_asr_lm_model_test.cc)
 
-[ASR AM model test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_terse_am_model_test.cc)
+[Endpointer model
+test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_endpointer_model_test.cc)
 
 ## Android Support
 The models have been tested on Android phones, using the following tests:
diff --git a/tensorflow/contrib/lite/models/testdata/g3doc/endpointer.svg b/tensorflow/contrib/lite/models/testdata/g3doc/endpointer.svg
new file mode 100644
index 0000000000000000000000000000000000000000..6033bdc529e18355131965a26c49b6f17d671f27
--- /dev/null
+++ b/tensorflow/contrib/lite/models/testdata/g3doc/endpointer.svg
@@ -0,0 +1,4 @@
+<?xml version="1.0" standalone="yes"?>
+
+<svg version="1.1" viewBox="0.0 0.0 681.8005249343832 883.6010498687664" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><clipPath id="p.0"><path d="m0 0l681.80054 0l0 883.6011l-681.80054 0l0 -883.6011z" clip-rule="nonzero"></path></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l681.80054 0l0 883.6011l-681.80054 0z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m261.15503 14.700843l166.01575 0l0 42.110233l-166.01575 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m261.15503 14.700843l166.01575 0l0 42.110233l-166.01575 0z" fill-rule="evenodd"></path><path fill="#000000" d="m278.78244 41.620842l0 -13.593752l1.8125 0l0 13.593752l-1.8125 0zm4.6676636 0l0 -9.859377l1.5 0l0 1.4062519q1.09375 -1.6250019 3.140625 -1.6250019q0.890625 0 1.640625 0.328125q0.75 0.3125019 1.109375 0.8437519q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375732 3.78125l0 -13.640627l1.53125 0l0 1.2812519q0.53125 -0.75 1.203125 -1.1250019q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.6562519q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm15.313202 4.875l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109377l1.671875 0l0 5.468752q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.281252l1.671875 0l0 9.859377l-1.5 0zm7.5788574 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125019l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125019l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm9.897858 5.5q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.343752 2.578125 -4.671877l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671877q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.353302 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.2812519 1.0625 -0.4843769q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.3281269 1.28125 0.9062519q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.750002l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.687502l0 -9.859377l1.671875 0l0 9.859377l-1.671875 0zm3.254181 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.3593769l8.046875 0l0 1.1093769l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.3750019 3.328125 -1.3750019q1.984375 0 3.234375 1.3437519q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.074646 -2.125l-8.96875 0l0 -1.5625l8.96875 0l0 1.5625zm0 4.125l-8.96875 0l0 -1.546875l8.96875 0l0 1.546875zm12.187653 3.875l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.812502l1.359375 0l0 8.812502l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140627l-4.25 6.140627l4.25 0zm5.016327 -1.921875q0 -2.421875 0.5 -3.890627q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875019 0.3125 3.218752q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359377 -0.78125 -4.468752q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.593752zm10.219482 10.703125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.3906269 -0.890625 -2.671877q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671877q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m228.15503 78.02362l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m228.15503 78.02362l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m308.4097 104.94362l0 -13.59375l6.03125 0q1.8125 0 2.75 0.359375q0.953125 0.359375 1.515625 1.296875q0.5625 0.921875 0.5625 2.046875q0 1.453125 -0.9375 2.453125q-0.921875 0.984375 -2.890625 1.25q0.71875 0.34375 1.09375 0.671875q0.78125 0.734375 1.484375 1.8125l2.375 3.703125l-2.265625 0l-1.796875 -2.828125q-0.796875 -1.21875 -1.3125 -1.875q-0.5 -0.65625 -0.90625 -0.90625q-0.40625 -0.265625 -0.8125 -0.359375q-0.3125 -0.078125 -1.015625 -0.078125l-2.078125 0l0 6.046875l-1.796875 0zm1.796875 -7.59375l3.859375 0q1.234375 0 1.921875 -0.25q0.703125 -0.265625 1.0625 -0.828125q0.375 -0.5625 0.375 -1.21875q0 -0.96875 -0.703125 -1.578125q-0.703125 -0.625 -2.21875 -0.625l-4.296875 0l0 4.5zm18.176056 4.421875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm8.438232 2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.0 2.9375l0 -13.59375l1.671875 0l0 4.875q1.171875 -1.359375 2.953125 -1.359375q1.09375 0 1.890625 0.4375q0.8125 0.421875 1.15625 1.1875q0.359375 0.765625 0.359375 2.203125l0 6.25l-1.671875 0l0 -6.25q0 -1.25 -0.546875 -1.8125q-0.546875 -0.578125 -1.53125 -0.578125q-0.75 0 -1.40625 0.390625q-0.640625 0.375 -0.921875 1.046875q-0.28125 0.65625 -0.28125 1.8125l0 5.390625l-1.671875 0zm16.813202 -1.21875q-0.9375 0.796875 -1.796875 1.125q-0.859375 0.3125 -1.84375 0.3125q-1.609375 0 -2.484375 -0.78125q-0.875 -0.796875 -0.875 -2.03125q0 -0.734375 0.328125 -1.328125q0.328125 -0.59375 0.859375 -0.953125q0.53125 -0.359375 1.203125 -0.546875q0.5 -0.140625 1.484375 -0.25q2.03125 -0.25 2.984375 -0.578125q0 -0.34375 0 -0.4375q0 -1.015625 -0.46875 -1.4375q-0.640625 -0.5625 -1.90625 -0.5625q-1.171875 0 -1.734375 0.40625q-0.5625 0.40625 -0.828125 1.46875l-1.640625 -0.234375q0.234375 -1.046875 0.734375 -1.6875q0.515625 -0.640625 1.46875 -0.984375q0.96875 -0.359375 2.25 -0.359375q1.265625 0 2.046875 0.296875q0.78125 0.296875 1.15625 0.75q0.375 0.453125 0.515625 1.140625q0.09375 0.421875 0.09375 1.53125l0 2.234375q0 2.328125 0.09375 2.953125q0.109375 0.609375 0.4375 1.171875l-1.75 0q-0.265625 -0.515625 -0.328125 -1.21875zm-0.140625 -3.71875q-0.90625 0.359375 -2.734375 0.625q-1.03125 0.140625 -1.453125 0.328125q-0.421875 0.1875 -0.65625 0.546875q-0.234375 0.359375 -0.234375 0.796875q0 0.671875 0.5 1.125q0.515625 0.4375 1.484375 0.4375q0.96875 0 1.71875 -0.421875q0.75 -0.4375 1.109375 -1.15625q0.265625 -0.578125 0.265625 -1.671875l0 -0.609375zm4.0788574 8.71875l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm15.610077 1.703125l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875z" fill-rule="nonzero"></path><path fill="#000000" d="m268.58267 130.94362q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.353302 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.254181 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.074646 -2.125l-8.96875 0l0 -1.5625l8.96875 0l0 1.5625zm0 4.125l-8.96875 0l0 -1.546875l8.96875 0l0 1.546875zm7.3439026 7.65625l0 -17.375l3.671875 0l0 1.375l-2.015625 0l0 14.609375l2.015625 0l0 1.390625l-3.671875 0zm10.964539 -3.78125l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm4.9851074 0l0 -1.90625l1.90625 0l0 1.90625q0 1.046875 -0.375 1.6875q-0.375 0.65625 -1.171875 1.0l-0.46875 -0.71875q0.53125 -0.21875 0.78125 -0.671875q0.25 -0.453125 0.28125 -1.296875l-0.953125 0zm14.819733 0l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm5.016327 -1.921875q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm9.563232 6.703125l0 -1.90625l1.90625 0l0 1.90625q0 1.046875 -0.375 1.6875q-0.375 0.65625 -1.171875 1.0l-0.46875 -0.71875q0.53125 -0.21875 0.78125 -0.671875q0.25 -0.453125 0.28125 -1.296875l-0.953125 0zm15.757233 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm4.985077 0l0 -1.90625l1.90625 0l0 1.90625q0 1.046875 -0.375 1.6875q-0.375 0.65625 -1.171875 1.0l-0.46875 -0.71875q0.53125 -0.21875 0.78125 -0.671875q0.25 -0.453125 0.28125 -1.296875l-0.953125 0zm15.757233 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm7.3444824 3.78125l-3.6875 0l0 -1.390625l2.015625 0l0 -14.609375l-2.015625 0l0 -1.375l3.6875 0l0 17.375zm3.4801636 0.21875l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m254.95555 833.01575l180.00002 0l0 42.11023l-180.00002 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m254.95555 833.01575l180.00002 0l0 42.11023l-180.00002 0z" fill-rule="evenodd"></path><path fill="#000000" d="m276.66348 853.3107q0 -3.390625 1.8125 -5.296875q1.828125 -1.921875 4.703125 -1.921875q1.875 0 3.390625 0.90625q1.515625 0.890625 2.296875 2.5q0.796875 1.609375 0.796875 3.65625q0 2.0625 -0.84375 3.703125q-0.828125 1.625 -2.359375 2.46875q-1.53125 0.84375 -3.296875 0.84375q-1.921875 0 -3.4375 -0.921875q-1.5 -0.9375 -2.28125 -2.53125q-0.78125 -1.609375 -0.78125 -3.40625zm1.859375 0.03125q0 2.453125 1.3125 3.875q1.328125 1.40625 3.3125 1.40625q2.03125 0 3.34375 -1.421875q1.3125 -1.4375 1.3125 -4.0625q0 -1.65625 -0.5625 -2.890625q-0.546875 -1.234375 -1.640625 -1.921875q-1.078125 -0.6875 -2.421875 -0.6875q-1.90625 0 -3.28125 1.3125q-1.375 1.3125 -1.375 4.390625zm19.433289 6.59375l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.5788574 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5270386 5.28125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm15.313232 4.875l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.578827 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm9.897858 5.5q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.353302 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2542114 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.074646 -2.125l-8.96875 0l0 -1.5625l8.96875 0l0 1.5625zm0 4.125l-8.96875 0l0 -1.546875l8.96875 0l0 1.546875zm12.187622 3.875l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm6.5788574 8.78125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m344.2495 137.01575l0 24.724411" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m344.2495 137.01575l0 18.724411" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m342.59778 155.74016l1.6517334 4.538101l1.6517334 -4.538101z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m344.2495 220.72906l0 25.291336" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m344.2495 220.72906l0 19.291336" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m342.59778 240.0204l1.6517334 4.538101l1.6517334 -4.538101z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m344.1629 56.811077l0.09448242 21.19685" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m344.1629 56.81108l0.06774902 15.196915" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m342.57892 72.01535l1.671936 4.530693l1.6315002 -4.545418z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m228.15503 694.4199l232.18896 0l0 42.11029l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m228.15503 694.4199l232.18896 0l0 42.11029l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m251.96599 721.33997l0 -13.59375l9.17186 0l0 1.59375l-7.3749847 0l0 4.21875l6.3749847 0l0 1.609375l-6.3749847 0l0 6.171875l-1.796875 0zm17.536606 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.891327 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.1448364 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.097931 3.796875l-0.171875 -1.5625q0.546875 0.140625 0.953125 0.140625q0.546875 0 0.875 -0.1875q0.34375 -0.1875 0.5625 -0.515625q0.15625 -0.25 0.5 -1.25q0.046875 -0.140625 0.15625 -0.40625l-3.734375 -9.875l1.796875 0l2.046875 5.71875q0.40625 1.078125 0.71875 2.28125q0.28125 -1.15625 0.6875 -2.25l2.09375 -5.75l1.671875 0l-3.75 10.03125q-0.59375 1.625 -0.9375 2.234375q-0.4375 0.828125 -1.015625 1.203125q-0.578125 0.390625 -1.375 0.390625q-0.484375 0 -1.078125 -0.203125zm19.328125 -8.5625l1.796875 0.453125q-0.5625 2.21875 -2.03125 3.390625q-1.46875 1.15625 -3.59375 1.15625q-2.203125 0 -3.578125 -0.890625q-1.375 -0.90625 -2.09375 -2.59375q-0.71875 -1.703125 -0.71875 -3.65625q0 -2.125 0.796875 -3.703125q0.8125 -1.578125 2.3125 -2.390625q1.5 -0.828125 3.296875 -0.828125q2.046875 0 3.4375 1.046875q1.390625 1.03125 1.9375 2.90625l-1.765625 0.421875q-0.46875 -1.484375 -1.375 -2.15625q-0.90625 -0.6875 -2.265625 -0.6875q-1.5625 0 -2.625 0.75q-1.046875 0.75 -1.484375 2.03125q-0.421875 1.265625 -0.421875 2.609375q0 1.734375 0.5 3.03125q0.515625 1.28125 1.578125 1.921875q1.078125 0.640625 2.3125 0.640625q1.515625 0 2.5625 -0.859375q1.046875 -0.875 1.421875 -2.59375zm2.9260864 -0.15625q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281952 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375732 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm17.125702 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547607 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm8.277039 -1.671875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.500732 5.875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm17.637146 8.921875q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm11.228302 -14.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875702 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm4.3757324 4.78125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm15.328125 0l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm6.578827 8.78125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m344.95538 503.2441l0 37.88974" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m344.95538 503.24408l0 31.88977" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m343.30365 535.13385l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m344.2495 284.66928l0 25.35434" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m344.2495 284.66928l0 19.35434" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m342.59778 304.02362l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m344.2495 664.5302l0 29.88971" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m344.2495 664.5302l0 23.88971" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m342.59778 688.4199l1.6517334 4.538147l1.6517334 -4.538147z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m228.15503 161.73694l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m228.15503 161.73694l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m321.72083 183.89131l1.796875 0.453125q-0.5625 2.21875 -2.03125 3.390625q-1.46875 1.15625 -3.59375 1.15625q-2.203125 0 -3.578125 -0.890625q-1.375 -0.90625 -2.09375 -2.59375q-0.71875 -1.703125 -0.71875 -3.65625q0 -2.125 0.796875 -3.703125q0.8125 -1.578125 2.3125 -2.390625q1.5 -0.828125 3.296875 -0.828125q2.046875 0 3.4375 1.046875q1.390625 1.03125 1.9375 2.90625l-1.765625 0.421875q-0.46875 -1.484375 -1.375 -2.15625q-0.90625 -0.6875 -2.265625 -0.6875q-1.5625 0 -2.625 0.75q-1.046875 0.75 -1.484375 2.03125q-0.421875 1.265625 -0.421875 2.609375q0 1.734375 0.5 3.03125q0.515625 1.28125 1.578125 1.921875q1.078125 0.640625 2.3125 0.640625q1.515625 0 2.5625 -0.859375q1.046875 -0.875 1.421875 -2.59375zm2.926056 -0.15625q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm13.110077 0l-3.75 -9.859375l1.765625 0l2.125 5.90625q0.34375 0.953125 0.625 1.984375q0.21875 -0.78125 0.625 -1.875l2.1875 -6.015625l1.71875 0l-3.734375 9.859375l-1.5625 0zm14.90625 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm2.2819824 1.609375l0 -13.59375l4.6875 0q1.578125 0 2.421875 0.1875q1.15625 0.265625 1.984375 0.96875q1.078125 0.921875 1.609375 2.34375q0.53125 1.40625 0.53125 3.21875q0 1.546875 -0.359375 2.75q-0.359375 1.1875 -0.921875 1.984375q-0.5625 0.78125 -1.234375 1.234375q-0.671875 0.4375 -1.625 0.671875q-0.953125 0.234375 -2.1875 0.234375l-4.90625 0zm1.796875 -1.609375l2.90625 0q1.34375 0 2.109375 -0.25q0.765625 -0.25 1.21875 -0.703125q0.640625 -0.640625 1.0 -1.71875q0.359375 -1.078125 0.359375 -2.625q0 -2.125 -0.703125 -3.265625q-0.703125 -1.15625 -1.703125 -1.546875q-0.71875 -0.28125 -2.328125 -0.28125l-2.859375 0l0 10.390625z" fill-rule="nonzero"></path><path fill="#000000" d="m268.58267 214.65694q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.353302 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.254181 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.074646 -2.125l-8.96875 0l0 -1.5625l8.96875 0l0 1.5625zm0 4.125l-8.96875 0l0 -1.546875l8.96875 0l0 1.546875zm7.3439026 7.65625l0 -17.375l3.671875 0l0 1.375l-2.015625 0l0 14.609375l2.015625 0l0 1.390625l-3.671875 0zm13.339539 -14.046875l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875732 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm5.922577 4.78125l0 -1.90625l1.90625 0l0 1.90625q0 1.046875 -0.375 1.6875q-0.375 0.65625 -1.171875 1.0l-0.46875 -0.71875q0.53125 -0.21875 0.78125 -0.671875q0.25 -0.453125 0.28125 -1.296875l-0.953125 0zm12.038483 -7.375q-1.046875 -0.375 -1.546875 -1.078125q-0.5 -0.71875 -0.5 -1.703125q0 -1.484375 1.0625 -2.484375q1.078125 -1.015625 2.84375 -1.015625q1.78125 0 2.859375 1.03125q1.09375 1.03125 1.09375 2.515625q0 0.953125 -0.5 1.65625q-0.484375 0.703125 -1.5 1.078125q1.25 0.40625 1.90625 1.3125q0.65625 0.90625 0.65625 2.171875q0 1.75 -1.234375 2.9375q-1.234375 1.1875 -3.25 1.1875q-2.015625 0 -3.25 -1.1875q-1.234375 -1.203125 -1.234375 -2.984375q0 -1.328125 0.671875 -2.21875q0.671875 -0.890625 1.921875 -1.21875zm-0.328125 -2.828125q0 0.96875 0.609375 1.578125q0.625 0.609375 1.625 0.609375q0.953125 0 1.5625 -0.609375q0.625 -0.609375 0.625 -1.484375q0 -0.921875 -0.640625 -1.546875q-0.625 -0.625 -1.578125 -0.625q-0.953125 0 -1.578125 0.609375q-0.625 0.609375 -0.625 1.46875zm-0.546875 6.28125q0 0.71875 0.328125 1.390625q0.34375 0.65625 1.015625 1.03125q0.671875 0.359375 1.4375 0.359375q1.203125 0 1.984375 -0.765625q0.78125 -0.78125 0.78125 -1.96875q0 -1.203125 -0.8125 -1.984375q-0.796875 -0.796875 -2.0 -0.796875q-1.1875 0 -1.96875 0.78125q-0.765625 0.78125 -0.765625 1.953125zm9.578857 3.921875l0 -1.90625l1.90625 0l0 1.90625q0 1.046875 -0.375 1.6875q-0.375 0.65625 -1.171875 1.0l-0.46875 -0.71875q0.53125 -0.21875 0.78125 -0.671875q0.25 -0.453125 0.28125 -1.296875l-0.953125 0zm15.757233 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm4.985077 0l0 -1.90625l1.90625 0l0 1.90625q0 1.046875 -0.375 1.6875q-0.375 0.65625 -1.171875 1.0l-0.46875 -0.71875q0.53125 -0.21875 0.78125 -0.671875q0.25 -0.453125 0.28125 -1.296875l-0.953125 0zm15.757233 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm7.3444824 3.78125l-3.6875 0l0 -1.390625l2.015625 0l0 -14.609375l-2.015625 0l0 -1.375l3.6875 0l0 17.375zm3.4801636 0.21875l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m228.15503 246.02362l232.18896 0l0 38.64566l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m228.15503 246.02362l232.18896 0l0 38.64566l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m255.45354 272.94363l0 -13.59375l2.7187347 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.7343597 0zm21.822037 -1.21875q-0.9375 0.796875 -1.796875 1.125q-0.859375 0.3125 -1.84375 0.3125q-1.609375 0 -2.484375 -0.78125q-0.875 -0.796875 -0.875 -2.03125q0 -0.734375 0.328125 -1.328125q0.328125 -0.59375 0.859375 -0.953125q0.53125 -0.359375 1.203125 -0.546875q0.5 -0.140625 1.484375 -0.25q2.03125 -0.25 2.984375 -0.578125q0 -0.34375 0 -0.4375q0 -1.015625 -0.46875 -1.4375q-0.640625 -0.5625 -1.90625 -0.5625q-1.171875 0 -1.734375 0.40625q-0.5625 0.40625 -0.828125 1.46875l-1.640625 -0.234375q0.234375 -1.046875 0.734375 -1.6875q0.515625 -0.640625 1.46875 -0.984375q0.96875 -0.359375 2.25 -0.359375q1.265625 0 2.046875 0.296875q0.78125 0.296875 1.15625 0.75q0.375 0.453125 0.515625 1.140625q0.09375 0.421875 0.09375 1.53125l0 2.234375q0 2.328125 0.09375 2.953125q0.109375 0.609375 0.4375 1.171875l-1.75 0q-0.265625 -0.515625 -0.328125 -1.21875zm-0.140625 -3.71875q-0.90625 0.359375 -2.734375 0.625q-1.03125 0.140625 -1.453125 0.328125q-0.421875 0.1875 -0.65625 0.546875q-0.234375 0.359375 -0.234375 0.796875q0 0.671875 0.5 1.125q0.515625 0.4375 1.484375 0.4375q0.96875 0 1.71875 -0.421875q0.75 -0.4375 1.109375 -1.15625q0.265625 -0.578125 0.265625 -1.671875l0 -0.609375zm2.9694824 4.9375l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm10.65625 0l0 -13.59375l5.125 0q1.359375 0 2.078125 0.125q1.0 0.171875 1.671875 0.640625q0.671875 0.46875 1.078125 1.3125q0.421875 0.84375 0.421875 1.84375q0 1.734375 -1.109375 2.9375q-1.09375 1.203125 -3.984375 1.203125l-3.484375 0l0 5.53125l-1.796875 0zm1.796875 -7.140625l3.515625 0q1.75 0 2.46875 -0.640625q0.734375 -0.65625 0.734375 -1.828125q0 -0.859375 -0.4375 -1.46875q-0.421875 -0.609375 -1.125 -0.796875q-0.453125 -0.125 -1.671875 -0.125l-3.484375 0l0 4.859375zm9.802948 2.21875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm8.656952 0q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.250732 4.921875l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm12.488556 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm2.2819824 1.609375l0 -13.59375l4.6875 0q1.578125 0 2.421875 0.1875q1.15625 0.265625 1.984375 0.96875q1.078125 0.921875 1.609375 2.34375q0.53125 1.40625 0.53125 3.21875q0 1.546875 -0.359375 2.75q-0.359375 1.1875 -0.921875 1.984375q-0.5625 0.78125 -1.234375 1.234375q-0.671875 0.4375 -1.625 0.671875q-0.953125 0.234375 -2.1875 0.234375l-4.90625 0zm1.796875 -1.609375l2.90625 0q1.34375 0 2.109375 -0.25q0.765625 -0.25 1.21875 -0.703125q0.640625 -0.640625 1.0 -1.71875q0.359375 -1.078125 0.359375 -2.625q0 -2.125 -0.703125 -3.265625q-0.703125 -1.15625 -1.703125 -1.546875q-0.71875 -0.28125 -2.328125 -0.28125l-2.859375 0l0 10.390625zm19.828125 5.609375q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.353302 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm13.65625 1.4375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5114136 1.5l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm6.243927 -11.6875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm10.519836 0l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm16.016327 1.75l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm9.578857 -2.078125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm0 7.953125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm9.444733 -3.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm11.922577 7.59375l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m228.86089 461.13388l232.18898 0l0 42.11023l-232.18898 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m228.86089 461.13388l232.18898 0l0 42.11023l-232.18898 0z" fill-rule="evenodd"></path><path fill="#000000" d="m282.06027 488.05386l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844482 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.8803406 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm21.212677 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.918396 4.0q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm11.2283325 -14.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.4062805 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.8750305 -0.453125 1.8281555 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.0937805 0 -3.4062805 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.4219055 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125305 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.6562805 0.40625 1.3750305 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.1250305 0 -1.9219055 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875732 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm4.3757324 4.78125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm18.640625 -10.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875702 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm6.5788574 8.78125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m274.61697 770.54596l140.06299 0l0 42.11023l-140.06299 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m274.61697 770.54596l140.06299 0l0 42.11023l-140.06299 0z" fill-rule="evenodd"></path><path fill="#000000" d="m311.29257 793.09094l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm12.209198 -0.546875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.688232 4.921875l0 -8.546875l-1.484375 0l0 -1.3125l1.484375 0l0 -1.046875q0 -0.984375 0.171875 -1.46875q0.234375 -0.65625 0.84375 -1.046875q0.609375 -0.40625 1.703125 -0.40625q0.703125 0 1.5625 0.15625l-0.25 1.46875q-0.515625 -0.09375 -0.984375 -0.09375q-0.765625 0 -1.078125 0.328125q-0.3125 0.3125 -0.3125 1.203125l0 0.90625l1.921875 0l0 1.3125l-1.921875 0l0 8.546875l-1.65625 0zm8.433289 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5270691 1.5l0 -9.859375l1.5 0l0 1.390625q0.453125 -0.71875 1.21875 -1.15625q0.78125 -0.453125 1.765625 -0.453125q1.09375 0 1.796875 0.453125q0.703125 0.453125 0.984375 1.28125q1.171875 -1.734375 3.046875 -1.734375q1.46875 0 2.25 0.8125q0.796875 0.8125 0.796875 2.5l0 6.765625l-1.671875 0l0 -6.203125q0 -1.0 -0.15625 -1.4375q-0.15625 -0.453125 -0.59375 -0.71875q-0.421875 -0.265625 -1.0 -0.265625q-1.03125 0 -1.71875 0.6875q-0.6875 0.6875 -0.6875 2.21875l0 5.71875l-1.671875 0l0 -6.40625q0 -1.109375 -0.40625 -1.65625q-0.40625 -0.5625 -1.34375 -0.5625q-0.703125 0 -1.3125 0.375q-0.59375 0.359375 -0.859375 1.078125q-0.265625 0.71875 -0.265625 2.0625l0 5.109375l-1.671875 0zm21.978302 -1.21875q-0.9375 0.796875 -1.796875 1.125q-0.859375 0.3125 -1.84375 0.3125q-1.609375 0 -2.484375 -0.78125q-0.875 -0.796875 -0.875 -2.03125q0 -0.734375 0.328125 -1.328125q0.328125 -0.59375 0.859375 -0.953125q0.53125 -0.359375 1.203125 -0.546875q0.5 -0.140625 1.484375 -0.25q2.03125 -0.25 2.984375 -0.578125q0 -0.34375 0 -0.4375q0 -1.015625 -0.46875 -1.4375q-0.640625 -0.5625 -1.90625 -0.5625q-1.171875 0 -1.734375 0.40625q-0.5625 0.40625 -0.828125 1.46875l-1.640625 -0.234375q0.234375 -1.046875 0.734375 -1.6875q0.515625 -0.640625 1.46875 -0.984375q0.96875 -0.359375 2.25 -0.359375q1.265625 0 2.046875 0.296875q0.78125 0.296875 1.15625 0.75q0.375 0.453125 0.515625 1.140625q0.09375 0.421875 0.09375 1.53125l0 2.234375q0 2.328125 0.09375 2.953125q0.109375 0.609375 0.4375 1.171875l-1.75 0q-0.265625 -0.515625 -0.328125 -1.21875zm-0.140625 -3.71875q-0.90625 0.359375 -2.734375 0.625q-1.03125 0.140625 -1.453125 0.328125q-0.421875 0.1875 -0.65625 0.546875q-0.234375 0.359375 -0.234375 0.796875q0 0.671875 0.5 1.125q0.515625 0.4375 1.484375 0.4375q0.96875 0 1.71875 -0.421875q0.75 -0.4375 1.109375 -1.15625q0.265625 -0.578125 0.265625 -1.671875l0 -0.609375zm2.969452 4.9375l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m344.2495 736.5302l0.40945435 34.015747" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m344.2495 736.5302l0.33721924 28.016113" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m342.93512 764.5662l1.7062378 4.5178833l1.5969849 -4.557617z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m344.64847 812.6562l0.31497192 20.346436" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m344.64847 812.6562l0.22210693 14.347168" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m343.21902 827.02893l1.7217712 4.511963l1.5812988 -4.5631104z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m228.15503 622.4199l232.18896 0l0 42.11029l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m228.15503 622.4199l232.18896 0l0 42.11029l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m246.77812 649.33997l0 -13.59375l9.171875 0l0 1.59375l-7.375 0l0 4.21875l6.375 0l0 1.609375l-6.375 0l0 6.171875l-1.796875 0zm17.536606 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.8913574 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.144806 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.0979614 3.796875l-0.171875 -1.5625q0.546875 0.140625 0.953125 0.140625q0.546875 0 0.875 -0.1875q0.34375 -0.1875 0.5625 -0.515625q0.15625 -0.25 0.5 -1.25q0.046875 -0.140625 0.15625 -0.40625l-3.734375 -9.875l1.796875 0l2.046875 5.71875q0.40625 1.078125 0.71875 2.28125q0.28125 -1.15625 0.6875 -2.25l2.09375 -5.75l1.671875 0l-3.75 10.03125q-0.59375 1.625 -0.9375 2.234375q-0.4375 0.828125 -1.015625 1.203125q-0.578125 0.390625 -1.375 0.390625q-0.484375 0 -1.078125 -0.203125zm19.328125 -8.5625l1.796875 0.453125q-0.5625 2.21875 -2.03125 3.390625q-1.46875 1.15625 -3.59375 1.15625q-2.203125 0 -3.578125 -0.890625q-1.375 -0.90625 -2.09375 -2.59375q-0.71875 -1.703125 -0.71875 -3.65625q0 -2.125 0.796875 -3.703125q0.8125 -1.578125 2.3125 -2.390625q1.5 -0.828125 3.296875 -0.828125q2.046875 0 3.4375 1.046875q1.390625 1.03125 1.9375 2.90625l-1.765625 0.421875q-0.46875 -1.484375 -1.375 -2.15625q-0.90625 -0.6875 -2.265625 -0.6875q-1.5625 0 -2.625 0.75q-1.046875 0.75 -1.484375 2.03125q-0.421875 1.265625 -0.421875 2.609375q0 1.734375 0.5 3.03125q0.515625 1.28125 1.578125 1.921875q1.078125 0.640625 2.3125 0.640625q1.515625 0 2.5625 -0.859375q1.046875 -0.875 1.421875 -2.59375zm2.926056 -0.15625q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375702 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm17.125732 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547577 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm8.277069 -1.671875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.500702 5.875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm17.637146 8.921875q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm11.228302 -14.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875732 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm4.375702 4.78125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm18.640625 -10.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875732 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm6.578827 8.78125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m228.15503 390.41995l232.18896 0l0 42.11023l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m228.15503 390.41995l232.18896 0l0 42.11023l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m241.59027 417.33994l0 -13.59375l9.171875 0l0 1.59375l-7.375 0l0 4.21875l6.375 0l0 1.609375l-6.375 0l0 6.171875l-1.796875 0zm17.53659 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.85935974 0 -1.6249847 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.2656097 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.8913574 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.144806 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.0979614 3.796875l-0.171875 -1.5625q0.546875 0.140625 0.953125 0.140625q0.546875 0 0.875 -0.1875q0.34375 -0.1875 0.5625 -0.515625q0.15625 -0.25 0.5 -1.25q0.046875 -0.140625 0.15625 -0.40625l-3.734375 -9.875l1.796875 0l2.046875 5.71875q0.40625 1.078125 0.71875 2.28125q0.28125 -1.15625 0.6875 -2.25l2.09375 -5.75l1.671875 0l-3.75 10.03125q-0.59375 1.625 -0.9375 2.234375q-0.4375 0.828125 -1.015625 1.203125q-0.578125 0.390625 -1.375 0.390625q-0.484375 0 -1.078125 -0.203125zm19.328125 -8.5625l1.796875 0.453125q-0.5625 2.21875 -2.03125 3.390625q-1.46875 1.15625 -3.59375 1.15625q-2.203125 0 -3.578125 -0.890625q-1.375 -0.90625 -2.09375 -2.59375q-0.71875 -1.703125 -0.71875 -3.65625q0 -2.125 0.796875 -3.703125q0.8125 -1.578125 2.3125 -2.390625q1.5 -0.828125 3.296875 -0.828125q2.046875 0 3.4375 1.046875q1.390625 1.03125 1.9375 2.90625l-1.765625 0.421875q-0.46875 -1.484375 -1.375 -2.15625q-0.90625 -0.6875 -2.265625 -0.6875q-1.5625 0 -2.625 0.75q-1.046875 0.75 -1.484375 2.03125q-0.421875 1.265625 -0.421875 2.609375q0 1.734375 0.5 3.03125q0.515625 1.28125 1.578125 1.921875q1.078125 0.640625 2.3125 0.640625q1.515625 0 2.5625 -0.859375q1.046875 -0.875 1.421875 -2.59375zm2.926056 -0.15625q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375702 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm17.125732 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547577 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm8.277069 -1.671875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.500702 5.875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm17.637146 8.921875q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.681427 -15.8125l0 -1.609375l8.796875 0l0 1.296875q-1.296875 1.375 -2.578125 3.671875q-1.265625 2.296875 -1.96875 4.71875q-0.5 1.703125 -0.640625 3.734375l-1.71875 0q0.03125 -1.609375 0.625 -3.875q0.609375 -2.28125 1.734375 -4.390625q1.140625 -2.109375 2.40625 -3.546875l-6.65625 0zm10.250732 5.109375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm14.016327 6.703125l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm4.3757324 4.78125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm18.640625 -10.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875702 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm6.5788574 8.78125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m228.86089 541.1336l232.18898 0l0 42.11023l-232.18898 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m228.86089 541.1336l232.18898 0l0 42.11023l-232.18898 0z" fill-rule="evenodd"></path><path fill="#000000" d="m282.06027 568.0536l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844482 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.8803406 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm23.697052 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm10.434021 5.609375q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm11.2283325 -14.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.4062805 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.8750305 -0.453125 1.8281555 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.0937805 0 -3.4062805 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.4219055 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125305 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.6562805 0.40625 1.3750305 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.1250305 0 -1.9219055 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875732 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm4.3757324 4.78125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm18.640625 -10.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875702 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm6.5788574 8.78125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m344.95538 583.24384l-0.6929016 39.18109" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m344.95538 583.24384l-0.5868225 33.182068" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m342.71707 616.39667l1.5712585 4.5665894l1.7316895 -4.5081787z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m344.2495 432.53018l0.6929321 28.59842" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m344.2495 432.53018l0.5475769 22.60019" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m343.14584 455.17038l1.7611694 4.496765l1.5413208 -4.576782z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m228.15486 310.02362l232.18898 0l0 58.992126l-232.18898 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m228.15486 310.02362l232.18898 0l0 58.992126l-232.18898 0z" fill-rule="evenodd"></path><path fill="#000000" d="m308.40952 336.94363l0 -13.59375l6.03125 0q1.8125 0 2.75 0.359375q0.953125 0.359375 1.515625 1.296875q0.5625 0.921875 0.5625 2.046875q0 1.453125 -0.9375 2.453125q-0.921875 0.984375 -2.890625 1.25q0.71875 0.34375 1.09375 0.671875q0.78125 0.734375 1.484375 1.8125l2.375 3.703125l-2.265625 0l-1.796875 -2.828125q-0.796875 -1.21875 -1.3125 -1.875q-0.5 -0.65625 -0.90625 -0.90625q-0.40625 -0.265625 -0.8125 -0.359375q-0.3125 -0.078125 -1.015625 -0.078125l-2.078125 0l0 6.046875l-1.796875 0zm1.796875 -7.59375l3.859375 0q1.234375 0 1.921875 -0.25q0.703125 -0.265625 1.0625 -0.828125q0.375 -0.5625 0.375 -1.21875q0 -0.96875 -0.703125 -1.578125q-0.703125 -0.625 -2.21875 -0.625l-4.296875 0l0 4.5zm18.176086 4.421875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm8.438202 2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.0 2.9375l0 -13.59375l1.671875 0l0 4.875q1.171875 -1.359375 2.953125 -1.359375q1.09375 0 1.890625 0.4375q0.8125 0.421875 1.15625 1.1875q0.359375 0.765625 0.359375 2.203125l0 6.25l-1.671875 0l0 -6.25q0 -1.25 -0.546875 -1.8125q-0.546875 -0.578125 -1.53125 -0.578125q-0.75 0 -1.40625 0.390625q-0.640625 0.375 -0.921875 1.046875q-0.28125 0.65625 -0.28125 1.8125l0 5.390625l-1.671875 0zm16.813202 -1.21875q-0.9375 0.796875 -1.796875 1.125q-0.8593445 0.3125 -1.8437195 0.3125q-1.609375 0 -2.484375 -0.78125q-0.875 -0.796875 -0.875 -2.03125q0 -0.734375 0.328125 -1.328125q0.328125 -0.59375 0.859375 -0.953125q0.53125 -0.359375 1.203125 -0.546875q0.5 -0.140625 1.484375 -0.25q2.0312195 -0.25 2.9843445 -0.578125q0 -0.34375 0 -0.4375q0 -1.015625 -0.46875 -1.4375q-0.640625 -0.5625 -1.9062195 -0.5625q-1.171875 0 -1.734375 0.40625q-0.5625 0.40625 -0.828125 1.46875l-1.640625 -0.234375q0.234375 -1.046875 0.734375 -1.6875q0.515625 -0.640625 1.46875 -0.984375q0.96875 -0.359375 2.25 -0.359375q1.2655945 0 2.0468445 0.296875q0.78125 0.296875 1.15625 0.75q0.375 0.453125 0.515625 1.140625q0.09375 0.421875 0.09375 1.53125l0 2.234375q0 2.328125 0.09375 2.953125q0.109375 0.609375 0.4375 1.171875l-1.75 0q-0.265625 -0.515625 -0.328125 -1.21875zm-0.140625 -3.71875q-0.90625 0.359375 -2.7343445 0.625q-1.03125 0.140625 -1.453125 0.328125q-0.421875 0.1875 -0.65625 0.546875q-0.234375 0.359375 -0.234375 0.796875q0 0.671875 0.5 1.125q0.515625 0.4375 1.484375 0.4375q0.96875 0 1.7187195 -0.421875q0.75 -0.4375 1.109375 -1.15625q0.265625 -0.578125 0.265625 -1.671875l0 -0.609375zm4.0788574 8.71875l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm15.610077 1.703125l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875z" fill-rule="nonzero"></path><path fill="#000000" d="m284.13696 362.94363q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.353302 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.254181 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.074646 -2.125l-8.96875 0l0 -1.5625l8.96875 0l0 1.5625zm0 4.125l-8.96875 0l0 -1.546875l8.96875 0l0 1.546875zm7.3439026 7.65625l0 -17.375l3.671875 0l0 1.375l-2.015625 0l0 14.609375l2.015625 0l0 1.390625l-3.671875 0zm10.964539 -3.78125l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm4.985077 0l0 -1.90625l1.90625 0l0 1.90625q0 1.046875 -0.375 1.6875q-0.375 0.65625 -1.171875 1.0l-0.46875 -0.71875q0.53125 -0.21875 0.78125 -0.671875q0.25 -0.453125 0.28125 -1.296875l-0.953125 0zm9.585358 -11.8125l0 -1.609375l8.796875 0l0 1.296875q-1.296875 1.375 -2.578125 3.671875q-1.265625 2.296875 -1.96875 4.71875q-0.5 1.703125 -0.640625 3.734375l-1.71875 0q0.03125 -1.609375 0.625 -3.875q0.609375 -2.28125 1.734375 -4.390625q1.140625 -2.109375 2.40625 -3.546875l-6.65625 0zm10.250732 5.109375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm14.016327 6.703125l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm8.281982 8.5625l-3.6875 0l0 -1.390625l2.015625 0l0 -14.609375l-2.015625 0l0 -1.375l3.6875 0l0 17.375zm3.4801636 0.21875l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m344.24933 369.01575l0 21.417328" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m344.24933 369.01575l0 15.417328" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m342.59763 384.43307l1.6517029 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path></g></svg>
+
diff --git a/tensorflow/contrib/lite/models/testdata/speech_asr_lm_model.test_spec b/tensorflow/contrib/lite/models/testdata/speech_asr_lm_model.test_spec
new file mode 100644
index 0000000000000000000000000000000000000000..5812de4b30382f6b031c907bf8bd12a34ac9e0b3
--- /dev/null
+++ b/tensorflow/contrib/lite/models/testdata/speech_asr_lm_model.test_spec
@@ -0,0 +1,202 @@
+load_model: "speech_asr_lm_model.tflite"
+init_state: "21,22,42,43,63,64"
+invoke {
+  id: 3
+  input: "63982"
+  input: "8409"
+  output: "-2.75389"
+}
+invoke {
+  id: 4
+  input: "8409"
+  input: "1488"
+  output: "0.601841"
+}
+invoke {
+  id: 5
+  input: "1488"
+  input: "63981"
+  output: "-0.314846"
+}
+init_state: "21,22,42,43,63,64"
+invoke {
+  id: 6
+  input: "63982"
+  input: "8409"
+  output: "-2.75389"
+}
+invoke {
+  id: 7
+  input: "8409"
+  input: "3082"
+  output: "-3.63721"
+}
+init_state: "21,22,42,43,63,64"
+invoke {
+  id: 8
+  input: "63982"
+  input: "8409"
+  output: "-2.75389"
+}
+invoke {
+  id: 9
+  input: "8409"
+  input: "18965"
+  output: "-6.93985"
+}
+init_state: "21,22,42,43,63,64"
+invoke {
+  id: 13
+  input: "63982"
+  input: "12516"
+  output: "-6.20867"
+}
+invoke {
+  id: 14
+  input: "12516"
+  input: "914"
+  output: "-0.407277"
+}
+invoke {
+  id: 15
+  input: "914"
+  input: "63981"
+  output: "-3.82091"
+}
+init_state: "21,22,42,43,63,64"
+invoke {
+  id: 19
+  input: "63982"
+  input: "12516"
+  output: "-6.20867"
+}
+invoke {
+  id: 20
+  input: "12516"
+  input: "914"
+  output: "-0.407277"
+}
+invoke {
+  id: 21
+  input: "914"
+  input: "48619"
+  output: "-4.02131"
+}
+invoke {
+  id: 22
+  input: "48619"
+  input: "63981"
+  output: "-0.677399"
+}
+init_state: "21,22,42,43,63,64"
+invoke {
+  id: 26
+  input: "63982"
+  input: "12516"
+  output: "-6.20867"
+}
+invoke {
+  id: 27
+  input: "12516"
+  input: "914"
+  output: "-0.407277"
+}
+invoke {
+  id: 28
+  input: "914"
+  input: "4700"
+  output: "-4.056"
+}
+invoke {
+  id: 29
+  input: "4700"
+  input: "63981"
+  output: "0.415889"
+}
+init_state: "21,22,42,43,63,64"
+invoke {
+  id: 30
+  input: "63982"
+  input: "12516"
+  output: "-6.20867"
+}
+invoke {
+  id: 31
+  input: "12516"
+  input: "914"
+  output: "-0.407277"
+invoke {
+  id: 32
+  input: "914"
+  input: "51923"
+  output: "-14.1147"
+}
+init_state: "21,22,42,43,63,64"
+invoke {
+  id: 34
+  input: "63982"
+  input: "5520"
+  output: "-4.56971"
+}
+invoke {
+  id: 35
+  input: "5520"
+  input: "16318"
+  output: "-1.54815"
+}
+init_state: "21,22,42,43,63,64"
+invoke {
+  id: 36
+  input: "63982"
+  input: "5520"
+  output: "-4.56971"
+}
+invoke {
+  id: 37
+  input: "5520"
+  input: "28303"
+  output: "-14.0947"
+}
+init_state: "21,22,42,43,63,64"
+invoke {
+  id: 38
+  input: "63982"
+  input: "12451"
+  output: "-6.24243"
+}
+invoke {
+  id: 39
+  input: "12451"
+  input: "752"
+  output: "0.0700736"
+}
+invoke {
+  id: 40
+  input: "752"
+  input: "11"
+  output: "-1.72744"
+}
+invoke {
+  id: 41
+  input: "11"
+  input: "19454"
+  output: "-3.19211"
+}
+invoke {
+  id: 42
+  input: "19454"
+  input: "16989"
+  output: "-4.01684"
+}
+invoke {
+  id: 43
+  input: "16989"
+  input: "40168"
+  output: "-8.91317"
+}
+invoke {
+  id: 44
+  input: "40168"
+  input: "63981"
+  output: "-0.675377"
+}
diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
index b78e958e7f3a99993ab5e2cf487cfa73de8a74e8..76032771af2c8e099aed498b2071816646f3b606 100644
--- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
@@ -108,7 +108,7 @@ enum {
  * The type of operations that can be added to a model.
  */
 enum {
-  /** Adds two tensors, elment-wise.
+  /** Adds two tensors, element-wise.
    *
    * Takes two input tensors of identical type and compatible dimensions. The
    * output is the sum of both input tensors, optionally modified by an
@@ -370,7 +370,7 @@ enum {
    * Looks up items from a given tensor.
    *
    * Each item in the output is a raw copy of the corresponding item in
-   * the input “values”. If the the given “lookup” indices are out of bounds,
+   * the input “values”. If the given “lookup” indices are out of bounds,
    * the op will fail and an error will be reported.
    *
    * Inputs:
@@ -743,7 +743,7 @@ enum {
    */
   ANEURALNETWORKS_MAX_POOL_2D = 17,
 
-  /** Multiplies two tensors, elment-wise.
+  /** Multiplies two tensors, element-wise.
    *
    * Takes two input tensors of identical type and compatible dimensions. The
    * output is the product of both input tensors, optionally modified by an
@@ -1454,9 +1454,9 @@ inline int ANeuralNetworksModel_finish(ANeuralNetworksModel* model) {
  * {@link ANeuralNetworksExecution_setOutputFromMemory} and
  * {@link ANeuralNetworksExecution_setOperandValue}.
  *
- * To build a model that can accommodate inputs of various sizes, as you may want
- * to do for a CNN, set the size of the dimensions that will vary at run time to
- * 0. If you do so, provide the full dimensions when calling
+ * To build a model that can accommodate inputs of various sizes, as you may
+ * want to do for a CNN, set the size of the dimensions that will vary at run
+ * time to 0. If you do so, provide the full dimensions when calling
  * {@link ANeuralNetworksExecution_setInput} or {@link
  * ANeuralNetworksExecution_setInputFromMemory}.
  *
@@ -1571,7 +1571,7 @@ inline int ANeuralNetworksModel_addOperation(ANeuralNetworksModel* model,
 }
 
 /**
- * Specfifies which operands will be the model's inputs and outputs.
+ * Specifies which operands will be the model's inputs and outputs.
  *
  * An operand cannot be used for both input and output. Doing so will
  * return an error.
@@ -1774,7 +1774,7 @@ inline int ANeuralNetworksExecution_setInput(
  *             model. If the type is the same as specified when the model
  *             was built, NULL can be passed.
  * @param memory The memory containing the data.
- * @param offset This specifies the location of the data whithin the memory.
+ * @param offset This specifies the location of the data within the memory.
  *               The offset is in bytes from the start of memory.
  * @param length The size in bytes of the data value.
  *
@@ -1841,7 +1841,7 @@ inline int ANeuralNetworksExecution_setOutput(
  *             model. If the type is the same as specified when the model
  *             was built, NULL can be passed.
  * @param memory The memory where the data is to be stored.
- * @param offset This specifies the location of the data whithin the memory.
+ * @param offset This specifies the location of the data within the memory.
  *               The offset is in bytes from the start of memory.
  * @param length The length in bytes of the data value.
  *
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index 6a199cc8406c73f822b813603e55b0ba1994a235..77084b4dc83b20543ee5eb7a611c9bba42c6714c 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -161,6 +161,14 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       augmented_inputs.push_back(next_id++);
     };
 
+    auto duplicate_state_tensor_float32 =
+        [interpreter, &nn_model, &augmented_inputs, &next_id](int tensor_id) {
+          const TfLiteTensor* tensor = interpreter->tensor(tensor_id);
+          CHECK_NN(ANeuralNetworksModel_setOperandValue(
+              nn_model, tensor_id, tensor->data.raw, tensor->bytes));
+          augmented_inputs.push_back(tensor_id);
+        };
+
     auto add_add_params = [&add_scalar_int32]() { add_scalar_int32(0); };
 
     auto add_pooling_params = [&add_scalar_int32](void* data) {
@@ -208,6 +216,19 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       add_scalar_float32(builtin->beta);
     };
 
+    auto add_space_to_depth_params = [&add_scalar_int32](void* data) {
+      auto builtin = reinterpret_cast<TfLiteSpaceToDepthParams*>(data);
+      add_scalar_int32(builtin->block_size);
+    };
+
+    auto add_lstm_params = [&add_scalar_int32,
+                            &add_scalar_float32](void* data) {
+      auto builtin = reinterpret_cast<TfLiteLSTMParams*>(data);
+      add_scalar_int32(builtin->activation);
+      add_scalar_float32(builtin->cell_clip);
+      add_scalar_float32(builtin->proj_clip);
+    };
+
 #if 0
     auto add_reshape_params = [&](void* data) {
       auto builtin = reinterpret_cast<TfLiteReshapeParams*>(data);
@@ -280,22 +301,47 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
         nn_op_type = ANEURALNETWORKS_RESHAPE;
         // add_reshape_params(node.builtin_data);
         break;
+      case tflite::BuiltinOperator_SPACE_TO_DEPTH:
+        add_space_to_depth_params(node.builtin_data);
+        nn_op_type = ANEURALNETWORKS_SPACE_TO_DEPTH;
+        break;
+      case tflite::BuiltinOperator_LSTM: {
+        duplicate_state_tensor_float32(
+            node.outputs->data[/*kOutputStateTensor*/ 1]);
+        duplicate_state_tensor_float32(
+            node.outputs->data[/*kCellStateTensor*/ 2]);
+        add_lstm_params(node.builtin_data);
+        nn_op_type = ANEURALNETWORKS_LSTM;
+        break;
+      }
       case tflite::BuiltinOperator_CONCAT_EMBEDDINGS:
       case tflite::BuiltinOperator_LSH_PROJECTION:
       case tflite::BuiltinOperator_SVDF:
       case tflite::BuiltinOperator_HASHTABLE_LOOKUP:
       case tflite::BuiltinOperator_RNN:
+      case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN:
+      case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN:
       case tflite::BuiltinOperator_EMBEDDING_LOOKUP:
       case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE:
-      case tflite::BuiltinOperator_LSTM:
+      case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM:
       case tflite::BuiltinOperator_L2_NORMALIZATION:
       case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION:
       case tflite::BuiltinOperator_MUL:
+      case tflite::BuiltinOperator_PAD:
       case tflite::BuiltinOperator_RESIZE_BILINEAR:
       case tflite::BuiltinOperator_CALL:
       case tflite::BuiltinOperator_SKIP_GRAM:
-      case tflite::BuiltinOperator_RELU1:
-      case tflite::BuiltinOperator_SPACE_TO_DEPTH:
+      case tflite::BuiltinOperator_RELU_N1_TO_1:
+      case tflite::BuiltinOperator_GATHER:
+      case tflite::BuiltinOperator_SPACE_TO_BATCH_ND:
+      case tflite::BuiltinOperator_BATCH_TO_SPACE_ND:
+      case tflite::BuiltinOperator_TRANSPOSE:
+      case tflite::BuiltinOperator_MEAN:
+      case tflite::BuiltinOperator_DIV:
+      case tflite::BuiltinOperator_SUB:
+      case tflite::BuiltinOperator_SQUEEZE:
+      case tflite::BuiltinOperator_STRIDED_SLICE:
+      case tflite::BuiltinOperator_EXP:
         FATAL("Op code %d is currently not delegated to NNAPI", builtin);
         nn_op_type = -1;  // set to invalid
         break;
diff --git a/tensorflow/contrib/lite/nnapi_delegate.h b/tensorflow/contrib/lite/nnapi_delegate.h
index f29aa9e18e605ef0b5d246b2a672639c64391646..e98000929a1168c786f6c18f498f9d1d72311ada 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.h
+++ b/tensorflow/contrib/lite/nnapi_delegate.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_NNAPI_DELEGATE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_NNAPI_DELEGATE_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_NNAPI_DELEGATE_H_
+#define TENSORFLOW_CONTRIB_LITE_NNAPI_DELEGATE_H_
 
 #include "tensorflow/contrib/lite/allocation.h"
 #include "tensorflow/contrib/lite/context.h"
@@ -63,4 +63,4 @@ class NNAPIDelegate {
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_NNAPI_DELEGATE_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_NNAPI_DELEGATE_H_
diff --git a/tensorflow/contrib/lite/optional_debug_tools.h b/tensorflow/contrib/lite/optional_debug_tools.h
index 54d48760951c946d0493a86961348df25e53bd1f..1b6998cda382782b974bea3d18ffb6217e8f780c 100644
--- a/tensorflow/contrib/lite/optional_debug_tools.h
+++ b/tensorflow/contrib/lite/optional_debug_tools.h
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 // Optional debugging functionality. For small sized binaries, these are not
 // needed.
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_DEBUG_TOOLS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_DEBUG_TOOLS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_DEBUG_TOOLS_H_
+#define TENSORFLOW_CONTRIB_LITE_DEBUG_TOOLS_H_
 
 #include "tensorflow/contrib/lite/interpreter.h"
 
@@ -29,4 +29,4 @@ TfLiteStatus ValidateInterpreterState(const Interpreter* interpreter);
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_DEBUG_TOOLS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_DEBUG_TOOLS_H_
diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD
index 89e8693490dcec79e7a117073696e57a9060e68f..82feae0f0041997949212613c654a5695f468d56 100644
--- a/tensorflow/contrib/lite/python/BUILD
+++ b/tensorflow/contrib/lite/python/BUILD
@@ -13,6 +13,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        ":op_hint",
         "//tensorflow/contrib/lite/toco:model_flags_proto_py",
         "//tensorflow/contrib/lite/toco:toco_flags_proto_py",
         "//tensorflow/contrib/lite/toco/python:tensorflow_wrap_toco",
@@ -20,12 +21,26 @@ py_library(
     ],
 )
 
+py_library(
+    name = "op_hint",
+    srcs = ["op_hint.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:platform",
+    ],
+)
+
 py_test(
     name = "lite_test",
     srcs = ["lite_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_oss"],
     deps = [
         ":lite",
+        ":op_hint",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index 0fd70f842b9db0e6ef48480e79cc6bb59840761c..5d2f21653762a405a57288a7ba38323e5e42b3e1 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -18,23 +18,36 @@ EXPERIMENTAL: APIs here are unstable and likely to change without notice.
 
 @@toco_convert
 @@toco_convert_protos
+@@OpHint
+@@convert_op_hints_to_stubs
 
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 import os
 import subprocess
 import tempfile
 
+# pylint: disable=unused-import
+from tensorflow.contrib.lite.python.op_hint import convert_op_hints_to_stubs
+from tensorflow.contrib.lite.python.op_hint import OpHint
+# pylint: enable=unused-import
 from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2
 from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2
 from tensorflow.contrib.lite.toco import types_pb2 as _types_pb2
-from tensorflow.contrib.lite.toco.python.tensorflow_wrap_toco import TocoConvert as _toco_convert_protos
 from tensorflow.python.framework import dtypes as _dtypes
 from tensorflow.python.platform import resource_loader as _resource_loader
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.lazy_loader import LazyLoader
+
+# Lazy load since some of the performance benchmark skylark rules
+# break dependencies.
+_toco_python = LazyLoader(
+    "tensorflow_wrap_toco", globals(),
+    "tensorflow.contrib.lite.toco.python."
+    "tensorflow_wrap_toco")
+del LazyLoader
 
 # Enum types from the protobuf promoted to the API
 FLOAT = _types_pb2.FLOAT
@@ -50,7 +63,7 @@ GRAPHVIZ_DOT = _toco_flags_pb2.GRAPHVIZ_DOT
 # to protect against crashes. However, it breaks some dependent targets because
 # it forces us to depend on an external py_binary. The experimental API doesn't
 # have that drawback.
-EXPERIMENTAL_USE_TOCO_API_DIRECTLY = True
+EXPERIMENTAL_USE_TOCO_API_DIRECTLY = False
 
 # Find the toco_from_protos binary using the resource loader if using from
 # bazel, otherwise we are in a pip where console_scripts already has
@@ -86,7 +99,8 @@ def toco_convert_protos(model_flags_str, toco_flags_str, input_data_str):
   # TODO(aselle): When toco does not use fatal errors for failure, we can
   # switch this on.
   if not _toco_from_proto_bin:
-    return _toco_convert_protos(model_flags_str, toco_flags_str, input_data_str)
+    return _toco_python.TocoConvert(
+        model_flags_str, toco_flags_str, input_data_str)
 
   with tempfile.NamedTemporaryFile() as fp_toco, \
            tempfile.NamedTemporaryFile() as fp_model, \
@@ -184,10 +198,10 @@ def toco_convert(input_data,
     if inference_type == QUANTIZED_UINT8:
       if tflite_input_type == FLOAT:
         tflite_input_type = QUANTIZED_UINT8
-      input_array.mean, input_array.std = quantized_input_stats[idx]
+      input_array.mean_value, input_array.std_value = quantized_input_stats[idx]
 
     input_array.name = _tensor_name(input_tensor)
-    input_array.shape.extend(map(int, input_tensor.get_shape()))
+    input_array.shape.dims.extend(map(int, input_tensor.get_shape()))
     toco.inference_input_type = tflite_input_type
 
   for output_tensor in output_tensors:
diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/lite_test.py
index da360aeb344ab9c4eb183d84e9b5f60ba715c6e8..b8b4510188bee867b32ffde714b27f41a1df778a 100644
--- a/tensorflow/contrib/lite/python/lite_test.py
+++ b/tensorflow/contrib/lite/python/lite_test.py
@@ -18,10 +18,14 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.lite.python import lite
+from tensorflow.contrib.lite.python.op_hint import _tensor_name_base as _tensor_name_base
 from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
+from tensorflow.python.framework.graph_util_impl import _bfs_for_reachable_nodes
+from tensorflow.python.framework.graph_util_impl import _extract_graph_summary
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
@@ -35,11 +39,133 @@ class LiteTest(test_util.TensorFlowTestCase):
     # Try running on valid graph
     result = lite.toco_convert(sess.graph_def, [in_tensor], [out_tensor])
     self.assertTrue(result)
-    # TODO(aselle): remove tests that fail.
+    # TODO(aselle): remove tests that fail (we must get TOCO to not fatal
+    # all the time).
     # Try running on identity graph (known fail)
     # with self.assertRaisesRegexp(RuntimeError, "!model->operators.empty()"):
     #   result = lite.toco_convert(sess.graph_def, [in_tensor], [in_tensor])
 
+  def testQuantization(self):
+    in_tensor = array_ops.placeholder(shape=[1, 16, 16, 3],
+                                      dtype=dtypes.float32)
+    out_tensor = array_ops.fake_quant_with_min_max_args(in_tensor + in_tensor,
+                                                        min=0., max=1.)
+    sess = session.Session()
+    result = lite.toco_convert(sess.graph_def, [in_tensor], [out_tensor],
+                               inference_type=lite.QUANTIZED_UINT8,
+                               quantized_input_stats=[(0., 1.)])
+    self.assertTrue(result)
+
+
+class LiteTestOpHint(test_util.TensorFlowTestCase):
+  """Test the hint to stub functionality."""
+
+  def _getGraphOpTypes(self, graphdef, output_nodes):
+    """Returns used op types in `graphdef` reachable from `output_nodes`.
+
+    This is used to check that after the stub transformation the expected
+    nodes are there. Typically use this with self.assertCountEqual(...).
+
+    NOTE: this is not a exact test that the graph is the correct output, but
+      it balances compact expressibility of test with sanity checking.
+
+    Args:
+      graphdef: TensorFlow proto graphdef.
+      output_nodes: A list of output node names that we need to reach.
+
+    Returns:
+      A set of node types reachable from `output_nodes`.
+    """
+    name_to_input_name, name_to_node, _ = (
+        _extract_graph_summary(graphdef))
+    # Find all nodes that are needed by the outputs
+    used_node_names = _bfs_for_reachable_nodes(output_nodes, name_to_input_name)
+    return set([name_to_node[node_name].op for node_name in used_node_names])
+
+  def _countIdentities(self, nodes):
+    """Count the number of "Identity" op types in the list of proto nodes.
+
+    Args:
+      nodes: NodeDefs of the graph.
+
+    Returns:
+      The number of nodes with op type "Identity" found.
+    """
+    return len([x for x in nodes if x.op == "Identity"])
+
+  def testSwishLiteHint(self):
+    """Makes a custom op swish and makes sure it gets converted as a unit."""
+    image = array_ops.constant([1., 2., 3., 4.])
+    swish_scale = array_ops.constant(1.0)
+
+    def _swish(input_tensor, scale):
+      custom = lite.OpHint("cool_activation")
+      input_tensor, scale = custom.add_inputs(input_tensor, scale)
+      output = math_ops.sigmoid(input_tensor) * input_tensor * scale
+      output, = custom.add_outputs(output)
+      return output
+    output = array_ops.identity(_swish(image, swish_scale), name="ModelOutput")
+
+    with self.test_session() as sess:
+      # check if identities have been put into the graph (2 input, 1 output,
+      # and 1 final output).
+      self.assertEqual(self._countIdentities(sess.graph_def.node), 4)
+
+      stubbed_graphdef = lite.convert_op_hints_to_stubs(sess)
+
+      self.assertCountEqual(
+          self._getGraphOpTypes(
+              stubbed_graphdef, output_nodes=[_tensor_name_base(output)]),
+          ["cool_activation", "Const", "Identity"])
+
+  def testScaleAndBiasAndIdentity(self):
+    """This tests a scaled add which has 3 inputs and 2 outputs."""
+    a = array_ops.constant(1.)
+    x = array_ops.constant([2., 3.])
+    b = array_ops.constant([4., 5.])
+
+    def _scaled_and_bias_and_identity(a, x, b):
+      custom = lite.OpHint("scale_and_bias_and_identity")
+      a, x, b = custom.add_inputs(a, x, b)
+      return custom.add_outputs(a * x + b, x)
+    output = array_ops.identity(_scaled_and_bias_and_identity(a, x, b),
+                                name="ModelOutput")
+
+    with self.test_session() as sess:
+      # make sure one identity for each input (3) and output (2) => 3 + 2 = 5
+      # +1 for the final output
+      self.assertEqual(self._countIdentities(sess.graph_def.node), 6)
+
+      stubbed_graphdef = lite.convert_op_hints_to_stubs(sess)
+
+      self.assertCountEqual(
+          self._getGraphOpTypes(
+              stubbed_graphdef, output_nodes=[_tensor_name_base(output)]),
+          ["scale_and_bias_and_identity", "Const", "Identity", "Pack"])
+
+  def testTwoFunctions(self):
+    """Tests if two functions are converted correctly."""
+    a = array_ops.constant([1.])
+    b = array_ops.constant([1.])
+    def _double_values(x):
+      custom = lite.OpHint("add_test")
+      x = custom.add_inputs(x)
+      output = math_ops.multiply(x, x)
+      output, = custom.add_outputs(output)
+      return output
+    output = array_ops.identity(
+        math_ops.add(_double_values(a), _double_values(b)), name="ModelOutput")
+
+    with self.test_session() as sess:
+      # make sure one identity for each input (2) and output (2) => 2 + 2
+      # +1 for the final output
+      self.assertEqual(self._countIdentities(sess.graph_def.node), 5)
+      stubbed_graphdef = lite.convert_op_hints_to_stubs(sess)
+      self.assertCountEqual(
+          self._getGraphOpTypes(
+              stubbed_graphdef, output_nodes=[_tensor_name_base(output)]),
+          ["add_test", "Const", "Identity", "Add"])
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/lite/python/op_hint.py b/tensorflow/contrib/lite/python/op_hint.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a3971228a683211e84b4c55d3a3e8d574b5ed94
--- /dev/null
+++ b/tensorflow/contrib/lite/python/op_hint.py
@@ -0,0 +1,306 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Define tflite op hints (intrinsic operations).
+
+This essentially allows defining a TensorFlow API for tflite operations in
+Python with hints on how they are represented in TensorFlow Lite. This basically
+is a form of tflite intrinsic. It wraps a subpart of a TensorFlow execution
+graph and is useful for LSTMs and other complicated TensorFlow constructions
+that are difficult to pattern match in TOCO, but are represented by a single
+accelerated tflite op.
+
+Example:
+  def tflite_cool_activation(input):
+    # A cool activation function.
+    custom = tf.contrib.lite.OpHint("cool_activation")
+    input = custom.add_inputs(input)
+    output = tf.sigmoid(input) * input
+    custom.add_outputs(output)
+    return output
+
+  image = tf.placeholder(tf.float32, (1, 16, 16, 1))
+  output = tf.identity(tflite_cool_activation(image))
+
+  session = tf.Session()
+
+  graphdef_to_convert = tf.contrib.lite.convert_op_hints_to_stubs(session)
+  tflite_graph = tf.contrib.lite.toco_convert(graphdef_to_convert,
+                                              [image], [output])
+                                              [image], [output])
+  with open("/tmp/graph.fb", "wb") as fp:
+    fp.write(tflite_graph)
+
+How does it work?:
+
+OpHint is a helper that you use when defining a vanilla python function.
+It allows you to wrap arguments with tf.identities with some custom attributes.
+These attributes allow you to find the original block of ops that was created.
+For example, if you use cool_activation above you essentially get:
+
+a_input = tf.identity()
+result = tf.multiply(tf.sigmoid(a_input), a_input)
+output = tf.identity()
+
+a_input, output are identities that have parameters representing
+what argument they are, what the name of the function they should turn into
+in tf lite as well as a guid that uniquely identifies a particular invocation.
+
+Once you have built your whole tensorflow graph, you can run it and train it
+as usual, but after you have done that, you need to convert the graph into
+a form that replaces these subgraphs wrapped in identities to stub ops. These
+ops don't actually exist in the normal TensorFlow runtime, but will be
+understood by toco later.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections as _collections
+import itertools as _itertools
+import uuid as _uuid
+
+from tensorflow.contrib import framework as _framework
+from tensorflow.core.framework import attr_value_pb2 as _attr_value_pb2
+from tensorflow.python.framework import ops as _ops
+from tensorflow.python.ops import array_ops as _array_ops
+from tensorflow.python.util.all_util import remove_undocumented
+
+
+class OpHint(object):
+  """A class that helps build tflite function invocations.
+
+  It allows you to take a bunch of TensorFlow ops and annotate the construction
+  such that toco knows how to convert it to tflite. This embeds a pseudo
+  function in a TensorFlow graph. This allows embedding high-level API usage
+  information in a lower level TensorFlow implementation so that an alternative
+  implementation can be substituted later.
+
+  Essentially, any "input" into this pseudo op is fed into an identity, and
+  attributes are added to that input before being used by the constituent ops
+  that make up the pseudo op. A similar process is done to any output that
+  is to be exported from the current op.
+
+  TODO(aselle): When TensorFlow functions functionality works for arbitrary
+  constructs, this mechanism can be retired and changed to use python defun's.
+  """
+
+  # Attr constants that are used for representation in the GraphDef
+  FUNCTION_NAME_ATTR = "_tflite_function_name"
+  FUNCTION_UUID_ATTR = "_tflite_function_uuid"
+  FUNCTION_INPUT_INDEX_ATTR = "_tflite_function_input_index"
+  FUNCTION_OUTPUT_INDEX_ATTR = "_tflite_function_output_index"
+
+  def __init__(self, function_name, **kwargs):
+    """Create a OpHint.
+
+    Args:
+      function_name: Name of the function (the custom op name in tflite)
+      **kwargs: Keyword arguments of any constant attributes for the function.
+    """
+    self._function_name = function_name
+    self._unique_function_id = _uuid.uuid1().hex  # TODO(aselle): Unique enough?
+    self._curr_input_index = 0
+    self._curr_output_index = 0
+    self._attrs_to_store_later = kwargs
+    self._stored_attrs = False
+
+  def _setattr(self, dest_op, name, value):
+    tensor_value = _ops.convert_to_tensor(value)
+    dest_op.op.node_def.attr[name].tensor.CopyFrom(
+        tensor_value.op.node_def.attr["value"].tensor)
+
+  def add_inputs(self, *args):
+    """Add a sequence of inputs to the function invocation.
+
+    Args:
+      *args: List of inputs to be converted (should be Tf.Tensor).
+    Returns:
+      Wrapped inputs (identity standins that have additional metadata). These
+      are also are also tf.Tensor's.
+    """
+
+    def augmented_identity(arg):
+      identity_op = _array_ops.identity(arg)
+      # pylint: disable=protected-access
+      identity_op.op._set_attr(
+          OpHint.FUNCTION_NAME_ATTR,
+          _attr_value_pb2.AttrValue(s=self._function_name))
+      identity_op.op._set_attr(
+          OpHint.FUNCTION_UUID_ATTR,
+          _attr_value_pb2.AttrValue(s=self._unique_function_id))
+      identity_op.op._set_attr(
+          OpHint.FUNCTION_INPUT_INDEX_ATTR,
+          _attr_value_pb2.AttrValue(i=self._curr_input_index))
+      # pylint: enable=protected-access
+      self._curr_input_index += 1
+      return identity_op
+
+    return [augmented_identity(arg) for arg in args]
+
+  def add_outputs(self, *args):
+    """Add a sequence of outputs to the function invocation.
+
+    Args:
+      *args: List of outputs to be converted (should be tf.Tensor).
+    Returns:
+      Wrapped outputs (identity standins that have additional metadata). These
+      are also tf.Tensor's.
+    """
+
+    def augmented_identity(arg):
+      identity_op = _array_ops.identity(arg)
+      # pylint: disable=protected-access
+      identity_op.op._set_attr(
+          OpHint.FUNCTION_NAME_ATTR,
+          _attr_value_pb2.AttrValue(s=self._function_name))
+      identity_op.op._set_attr(
+          OpHint.FUNCTION_UUID_ATTR,
+          _attr_value_pb2.AttrValue(s=self._unique_function_id))
+      identity_op.op._set_attr(
+          OpHint.FUNCTION_OUTPUT_INDEX_ATTR,
+          _attr_value_pb2.AttrValue(i=self._curr_output_index))
+      # pylint: enable=protected-access
+      self._curr_output_index += 1
+      return identity_op
+
+    wrapped_outputs = [augmented_identity(arg) for arg in args]
+
+    if not self._stored_attrs:
+      for key, value in self._attrs_to_store_later.iteritems():
+        self._setattr(wrapped_outputs[0], "_tflite_attr_" + key, value)
+      self._stored_attrs = True
+
+    return wrapped_outputs
+
+
+class _LiteFuncCall(object):
+  """Represent a TensorFlow Lite custom function.
+
+  This is uses to accumulate found hints in the graphdef into a single
+  conceptual unit.
+
+  Properties:
+    self.inputs: inputs to the op (hash from index # to argument)
+    self.outputs: outputs to the op (hash from index # to argument)
+    self.function_name: the tflite custom op name to use
+    self.uuid: a unique call id for this particular call  (i.e.
+      multiple function calls would have the same function_name but different
+      uuids.
+    self.params: A param name to key value for op constant data. I.e. for
+      axis on a reduction, strides on a convolution, etc.
+  """
+
+  def __init__(self):
+    self.inputs = {}
+    self.outputs = {}
+    self.function_name = None
+    self.uuid = None
+    self.params = {}
+
+  def __str__(self):
+    return "tflite function %s call %s\n\tinputs: %r\n\toutputs: %r" % (
+        self.function_name, self.uuid, self.inputs, self.outputs)
+
+
+def _find_all_hints_in_graph_def(session):
+  """Look at the current default graph and return a list of LiteFuncCall objs.
+
+  Args:
+    session: A TensorFlow session that contains the graph to convert.
+  Returns:
+    a list of `LifeFuncCall` objects in the form
+
+  """
+  func_calls = _collections.defaultdict(_LiteFuncCall)
+  seen_ops = set()
+
+  for op in session.graph.get_operations():
+    for operand in _itertools.chain(op.inputs, op.outputs):
+      if operand in seen_ops:
+        continue
+      seen_ops.add(operand)
+      attr = operand.op.node_def.attr
+      uuid = attr[OpHint.FUNCTION_UUID_ATTR].s
+      if OpHint.FUNCTION_UUID_ATTR not in attr:
+        continue
+      call_def = func_calls[uuid]
+      call_def.uuid = uuid
+      if OpHint.FUNCTION_UUID_ATTR in attr:
+        call_def.function_name = attr[OpHint.FUNCTION_NAME_ATTR].s
+        if OpHint.FUNCTION_INPUT_INDEX_ATTR in attr:
+          call_def.inputs[attr[OpHint.FUNCTION_INPUT_INDEX_ATTR].i] = operand
+        if OpHint.FUNCTION_OUTPUT_INDEX_ATTR in attr:
+          call_def.outputs[attr[OpHint.FUNCTION_OUTPUT_INDEX_ATTR].i] = operand
+
+      for a in attr:
+        if a.startswith("_tflite_attr_"):
+          # TODO(aselle): Remember the attribute tensors so we can put them
+          # in collapse.
+          call_def.params[a.replace("_tflite_attr_,", "")] = attr[a].tensor
+
+  return func_calls
+
+
+def _tensor_name_base(full_tensor_name):
+  """Removes the device assignment code from a tensor.
+
+  e.g. _tensor_name_base("foo:3") => "foo"
+
+  Args:
+    full_tensor_name: A tensor name that is annotated with a device placement
+      (this is what tensor flow introspection gives).
+  Returns:
+    A name without any device assignment.
+  """
+  return full_tensor_name.name.split(":")[0]
+
+
+def convert_op_hints_to_stubs(session):
+  """Converts a graphdef with LiteOp hints into stub operations.
+
+  This is used to prepare for toco conversion of complex intrinsic usages.
+
+  Args:
+    session: A TensorFlow session that contains the graph to convert.
+  Returns:
+    A new graphdef with all ops contained in OpHints being replaced by
+    a single op call with the right parameters.
+  """
+  hints = _find_all_hints_in_graph_def(session)
+  current_graph_def = session.graph_def
+  for call in hints.values():
+    input_names = [None] * len(call.inputs)
+    output_names = [None] * len(call.outputs)
+    output_dtypes = [None] * len(call.outputs)
+    output_quantized = False
+    for input_index, tensor in call.inputs.items():
+      input_names[input_index] = _tensor_name_base(tensor)
+    for output_index, tensor in call.outputs.items():
+      output_names[output_index] = _tensor_name_base(tensor)
+      output_dtypes[output_index] = tensor.dtype.as_datatype_enum
+    # TODO(aselle): Support quantized flag properly
+    current_graph_def = _framework.fuse_op(
+        current_graph_def, input_names, output_names, output_dtypes,
+        output_quantized, call.uuid, call.function_name)
+    for node in current_graph_def.node:
+      if node.name == call.uuid:
+        for param, tensor in call.params.items():
+          node.attr[param].tensor.CopyFrom(tensor)
+  return current_graph_def
+
+
+_allowed_symbols = ["OpHint", "convert_op_hints_to_stubs"]
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index ddb2ab792c520eb245445532f534ebce8a9f1280..ef8f39cf55b7d76e5cd51ac8f2a6f9748f627f7a 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -47,18 +47,18 @@ table QuantizationParameters {
 
 table Tensor {
   // The tensor shape. The meaning of each entry is operator-specific but
-  // builtin ops use: [batch size, number of channels, height, width] (That's
-  // Tensorflow's NCHW).
+  // builtin ops use: [batch size, height, width, number of channels] (That's
+  // Tensorflow's NHWC).
   shape:[int];
   type:TensorType;
   // An index that refers to the buffers table at the root of the model. Or,
   // if there is no data buffer associated (i.e. intermediate results), then
-  // this is 0 (which refers to an always existant empty buffer).
+  // this is 0 (which refers to an always existent empty buffer).
   //
   // The data_buffer itself is an opaque container, with the assumption that the
   // target device is little-endian. In addition, all builtin operators assume
   // the memory is ordered such that if `shape` is [4, 3, 2], then index
-  // [i, j, k] maps to data_buffer[i*3*2 + j*3 + k].
+  // [i, j, k] maps to data_buffer[i*3*2 + j*2 + k].
   buffer:uint;
   name:string;  // For debugging and importing back into tensorflow.
   quantization:QuantizationParameters;  // Optional.
@@ -89,7 +89,10 @@ enum BuiltinOperator : byte {
   MAX_POOL_2D = 17,
   MUL = 18,
   RELU = 19,
-  RELU1 = 20,
+  // NOTE(aselle): RELU_N1_TO_1 used to be called RELU1, but it was renamed
+  // since different model developers use RELU1 in different ways. Never
+  // create another op called RELU1.
+  RELU_N1_TO_1 = 20,
   RELU6 = 21,
   RESHAPE = 22,
   RESIZE_BILINEAR = 23,
@@ -104,6 +107,20 @@ enum BuiltinOperator : byte {
   CALL = 31,
   CUSTOM = 32,
   EMBEDDING_LOOKUP_SPARSE = 33,
+  PAD = 34,
+  UNIDIRECTIONAL_SEQUENCE_RNN = 35,
+  GATHER = 36,
+  BATCH_TO_SPACE_ND = 37,
+  SPACE_TO_BATCH_ND = 38,
+  TRANSPOSE = 39,
+  MEAN = 40,
+  SUB = 41,
+  DIV = 42,
+  SQUEEZE = 43,
+  UNIDIRECTIONAL_SEQUENCE_LSTM = 44,
+  STRIDED_SLICE = 45,
+  BIDIRECTIONAL_SEQUENCE_RNN = 46,
+  EXP = 47,
 }
 
 // Options for the builtin operators.
@@ -129,6 +146,18 @@ union BuiltinOptions {
   SpaceToDepthOptions,
   EmbeddingLookupSparseOptions,
   MulOptions,
+  PadOptions,
+  GatherOptions,
+  BatchToSpaceNDOptions,
+  SpaceToBatchNDOptions,
+  TransposeOptions,
+  MeanOptions,
+  SubOptions,
+  DivOptions,
+  SqueezeOptions,
+  SequenceRNNOptions,
+  StridedSliceOptions,
+  ExpOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -136,7 +165,7 @@ enum Padding : byte { SAME, VALID }
 enum ActivationFunctionType : byte {
   NONE = 0,
   RELU = 1,
-  RELU1 = 2,
+  RELU_N1_TO_1 = 2,
   RELU6 = 3,
   TANH = 4,
   SIGN_BIT = 5,
@@ -192,6 +221,18 @@ table RNNOptions {
   fused_activation_function:ActivationFunctionType;
 }
 
+// An implementation of TensorFlow dynamic_rnn with RNNCell.
+table SequenceRNNOptions {
+  time_major:bool;
+  fused_activation_function:ActivationFunctionType;
+}
+
+// An implementation of TensorFlow bidrectional_dynamic_rnn with RNNCell.
+table BidirectionalSequenceRNNOptions {
+  time_major:bool;
+  fused_activation_function:ActivationFunctionType;
+}
+
 // An implementation of TensorFlow fully_connected (a.k.a Dense) layer.
 table FullyConnectedOptions {
   fused_activation_function:ActivationFunctionType;
@@ -234,8 +275,9 @@ table LSTMOptions {
 }
 
 table ResizeBilinearOptions {
-  new_height:int;
-  new_width:int;
+  new_height: int (deprecated);
+  new_width: int (deprecated);
+  align_corners: bool;
 }
 
 // A call operation options
@@ -244,10 +286,19 @@ table CallOptions {
   subgraph:uint;
 }
 
+table PadOptions {
+}
+
 table ReshapeOptions {
   new_shape:[int];
 }
 
+table SpaceToBatchNDOptions {
+}
+
+table BatchToSpaceNDOptions {
+}
+
 table SkipGramOptions {
   ngram_size: int;
   max_skip_size: int;
@@ -258,6 +309,14 @@ table SpaceToDepthOptions {
   block_size: int;
 }
 
+table SubOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+table DivOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
 enum CombinerType : byte {
   SUM = 0,
   MEAN = 1,
@@ -268,6 +327,32 @@ table EmbeddingLookupSparseOptions {
   combiner:CombinerType;
 }
 
+table GatherOptions {
+  axis: int;
+}
+
+table TransposeOptions {
+}
+
+table ExpOptions {
+}
+
+table MeanOptions {
+  keep_dims: bool;
+}
+
+table SqueezeOptions {
+  squeeze_dims:[int];
+}
+
+table StridedSliceOptions {
+  begin_mask: int;
+  end_mask: int;
+  ellipsis_mask: int;
+  new_axis_mask: int;
+  shrink_axis_mask: int;
+}
+
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
 // builtin, or a string if the operator is custom.
 table OperatorCode {
@@ -343,4 +428,3 @@ table Model {
 }
 
 root_type Model;
-
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index df460ab9a32f1d80c0788649e799778db8050b7f..40b50bab4581f8dc1a8fabfc4fc41f23b2a807cf 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -1,5 +1,18 @@
-// automatically generated by the FlatBuffers compiler, do not modify
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// automatically generated by the FlatBuffers compiler, do not modify
 
 #ifndef FLATBUFFERS_GENERATED_SCHEMA_TFLITE_H_
 #define FLATBUFFERS_GENERATED_SCHEMA_TFLITE_H_
@@ -35,6 +48,12 @@ struct SVDFOptionsT;
 struct RNNOptions;
 struct RNNOptionsT;
 
+struct SequenceRNNOptions;
+struct SequenceRNNOptionsT;
+
+struct BidirectionalSequenceRNNOptions;
+struct BidirectionalSequenceRNNOptionsT;
+
 struct FullyConnectedOptions;
 struct FullyConnectedOptionsT;
 
@@ -65,18 +84,51 @@ struct ResizeBilinearOptionsT;
 struct CallOptions;
 struct CallOptionsT;
 
+struct PadOptions;
+struct PadOptionsT;
+
 struct ReshapeOptions;
 struct ReshapeOptionsT;
 
+struct SpaceToBatchNDOptions;
+struct SpaceToBatchNDOptionsT;
+
+struct BatchToSpaceNDOptions;
+struct BatchToSpaceNDOptionsT;
+
 struct SkipGramOptions;
 struct SkipGramOptionsT;
 
 struct SpaceToDepthOptions;
 struct SpaceToDepthOptionsT;
 
+struct SubOptions;
+struct SubOptionsT;
+
+struct DivOptions;
+struct DivOptionsT;
+
 struct EmbeddingLookupSparseOptions;
 struct EmbeddingLookupSparseOptionsT;
 
+struct GatherOptions;
+struct GatherOptionsT;
+
+struct TransposeOptions;
+struct TransposeOptionsT;
+
+struct ExpOptions;
+struct ExpOptionsT;
+
+struct MeanOptions;
+struct MeanOptionsT;
+
+struct SqueezeOptions;
+struct SqueezeOptionsT;
+
+struct StridedSliceOptions;
+struct StridedSliceOptionsT;
+
 struct OperatorCode;
 struct OperatorCodeT;
 
@@ -104,27 +156,15 @@ enum TensorType {
 };
 
 inline TensorType (&EnumValuesTensorType())[6] {
-  static TensorType values[] = {
-    TensorType_FLOAT32,
-    TensorType_FLOAT16,
-    TensorType_INT32,
-    TensorType_UINT8,
-    TensorType_INT64,
-    TensorType_STRING
-  };
+  static TensorType values[] = {TensorType_FLOAT32, TensorType_FLOAT16,
+                                TensorType_INT32,   TensorType_UINT8,
+                                TensorType_INT64,   TensorType_STRING};
   return values;
 }
 
 inline const char **EnumNamesTensorType() {
-  static const char *names[] = {
-    "FLOAT32",
-    "FLOAT16",
-    "INT32",
-    "UINT8",
-    "INT64",
-    "STRING",
-    nullptr
-  };
+  static const char *names[] = {"FLOAT32", "FLOAT16", "INT32", "UINT8",
+                                "INT64",   "STRING",  nullptr};
   return names;
 }
 
@@ -151,7 +191,7 @@ enum BuiltinOperator {
   BuiltinOperator_MAX_POOL_2D = 17,
   BuiltinOperator_MUL = 18,
   BuiltinOperator_RELU = 19,
-  BuiltinOperator_RELU1 = 20,
+  BuiltinOperator_RELU_N1_TO_1 = 20,
   BuiltinOperator_RELU6 = 21,
   BuiltinOperator_RESHAPE = 22,
   BuiltinOperator_RESIZE_BILINEAR = 23,
@@ -165,85 +205,124 @@ enum BuiltinOperator {
   BuiltinOperator_CALL = 31,
   BuiltinOperator_CUSTOM = 32,
   BuiltinOperator_EMBEDDING_LOOKUP_SPARSE = 33,
+  BuiltinOperator_PAD = 34,
+  BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN = 35,
+  BuiltinOperator_GATHER = 36,
+  BuiltinOperator_BATCH_TO_SPACE_ND = 37,
+  BuiltinOperator_SPACE_TO_BATCH_ND = 38,
+  BuiltinOperator_TRANSPOSE = 39,
+  BuiltinOperator_MEAN = 40,
+  BuiltinOperator_SUB = 41,
+  BuiltinOperator_DIV = 42,
+  BuiltinOperator_SQUEEZE = 43,
+  BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM = 44,
+  BuiltinOperator_STRIDED_SLICE = 45,
+  BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN = 46,
+  BuiltinOperator_EXP = 47,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_EMBEDDING_LOOKUP_SPARSE
+  BuiltinOperator_MAX = BuiltinOperator_EXP
 };
 
-inline BuiltinOperator (&EnumValuesBuiltinOperator())[31] {
+inline BuiltinOperator (&EnumValuesBuiltinOperator())[45] {
   static BuiltinOperator values[] = {
-    BuiltinOperator_ADD,
-    BuiltinOperator_AVERAGE_POOL_2D,
-    BuiltinOperator_CONCATENATION,
-    BuiltinOperator_CONV_2D,
-    BuiltinOperator_DEPTHWISE_CONV_2D,
-    BuiltinOperator_EMBEDDING_LOOKUP,
-    BuiltinOperator_FULLY_CONNECTED,
-    BuiltinOperator_HASHTABLE_LOOKUP,
-    BuiltinOperator_L2_NORMALIZATION,
-    BuiltinOperator_L2_POOL_2D,
-    BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
-    BuiltinOperator_LOGISTIC,
-    BuiltinOperator_LSH_PROJECTION,
-    BuiltinOperator_LSTM,
-    BuiltinOperator_MAX_POOL_2D,
-    BuiltinOperator_MUL,
-    BuiltinOperator_RELU,
-    BuiltinOperator_RELU1,
-    BuiltinOperator_RELU6,
-    BuiltinOperator_RESHAPE,
-    BuiltinOperator_RESIZE_BILINEAR,
-    BuiltinOperator_RNN,
-    BuiltinOperator_SOFTMAX,
-    BuiltinOperator_SPACE_TO_DEPTH,
-    BuiltinOperator_SVDF,
-    BuiltinOperator_TANH,
-    BuiltinOperator_CONCAT_EMBEDDINGS,
-    BuiltinOperator_SKIP_GRAM,
-    BuiltinOperator_CALL,
-    BuiltinOperator_CUSTOM,
-    BuiltinOperator_EMBEDDING_LOOKUP_SPARSE
-  };
+      BuiltinOperator_ADD,
+      BuiltinOperator_AVERAGE_POOL_2D,
+      BuiltinOperator_CONCATENATION,
+      BuiltinOperator_CONV_2D,
+      BuiltinOperator_DEPTHWISE_CONV_2D,
+      BuiltinOperator_EMBEDDING_LOOKUP,
+      BuiltinOperator_FULLY_CONNECTED,
+      BuiltinOperator_HASHTABLE_LOOKUP,
+      BuiltinOperator_L2_NORMALIZATION,
+      BuiltinOperator_L2_POOL_2D,
+      BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
+      BuiltinOperator_LOGISTIC,
+      BuiltinOperator_LSH_PROJECTION,
+      BuiltinOperator_LSTM,
+      BuiltinOperator_MAX_POOL_2D,
+      BuiltinOperator_MUL,
+      BuiltinOperator_RELU,
+      BuiltinOperator_RELU_N1_TO_1,
+      BuiltinOperator_RELU6,
+      BuiltinOperator_RESHAPE,
+      BuiltinOperator_RESIZE_BILINEAR,
+      BuiltinOperator_RNN,
+      BuiltinOperator_SOFTMAX,
+      BuiltinOperator_SPACE_TO_DEPTH,
+      BuiltinOperator_SVDF,
+      BuiltinOperator_TANH,
+      BuiltinOperator_CONCAT_EMBEDDINGS,
+      BuiltinOperator_SKIP_GRAM,
+      BuiltinOperator_CALL,
+      BuiltinOperator_CUSTOM,
+      BuiltinOperator_EMBEDDING_LOOKUP_SPARSE,
+      BuiltinOperator_PAD,
+      BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN,
+      BuiltinOperator_GATHER,
+      BuiltinOperator_BATCH_TO_SPACE_ND,
+      BuiltinOperator_SPACE_TO_BATCH_ND,
+      BuiltinOperator_TRANSPOSE,
+      BuiltinOperator_MEAN,
+      BuiltinOperator_SUB,
+      BuiltinOperator_DIV,
+      BuiltinOperator_SQUEEZE,
+      BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
+      BuiltinOperator_STRIDED_SLICE,
+      BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN,
+      BuiltinOperator_EXP};
   return values;
 }
 
 inline const char **EnumNamesBuiltinOperator() {
-  static const char *names[] = {
-    "ADD",
-    "AVERAGE_POOL_2D",
-    "CONCATENATION",
-    "CONV_2D",
-    "DEPTHWISE_CONV_2D",
-    "",
-    "",
-    "EMBEDDING_LOOKUP",
-    "",
-    "FULLY_CONNECTED",
-    "HASHTABLE_LOOKUP",
-    "L2_NORMALIZATION",
-    "L2_POOL_2D",
-    "LOCAL_RESPONSE_NORMALIZATION",
-    "LOGISTIC",
-    "LSH_PROJECTION",
-    "LSTM",
-    "MAX_POOL_2D",
-    "MUL",
-    "RELU",
-    "RELU1",
-    "RELU6",
-    "RESHAPE",
-    "RESIZE_BILINEAR",
-    "RNN",
-    "SOFTMAX",
-    "SPACE_TO_DEPTH",
-    "SVDF",
-    "TANH",
-    "CONCAT_EMBEDDINGS",
-    "SKIP_GRAM",
-    "CALL",
-    "CUSTOM",
-    "EMBEDDING_LOOKUP_SPARSE",
-    nullptr
-  };
+  static const char *names[] = {"ADD",
+                                "AVERAGE_POOL_2D",
+                                "CONCATENATION",
+                                "CONV_2D",
+                                "DEPTHWISE_CONV_2D",
+                                "",
+                                "",
+                                "EMBEDDING_LOOKUP",
+                                "",
+                                "FULLY_CONNECTED",
+                                "HASHTABLE_LOOKUP",
+                                "L2_NORMALIZATION",
+                                "L2_POOL_2D",
+                                "LOCAL_RESPONSE_NORMALIZATION",
+                                "LOGISTIC",
+                                "LSH_PROJECTION",
+                                "LSTM",
+                                "MAX_POOL_2D",
+                                "MUL",
+                                "RELU",
+                                "RELU_N1_TO_1",
+                                "RELU6",
+                                "RESHAPE",
+                                "RESIZE_BILINEAR",
+                                "RNN",
+                                "SOFTMAX",
+                                "SPACE_TO_DEPTH",
+                                "SVDF",
+                                "TANH",
+                                "CONCAT_EMBEDDINGS",
+                                "SKIP_GRAM",
+                                "CALL",
+                                "CUSTOM",
+                                "EMBEDDING_LOOKUP_SPARSE",
+                                "PAD",
+                                "UNIDIRECTIONAL_SEQUENCE_RNN",
+                                "GATHER",
+                                "BATCH_TO_SPACE_ND",
+                                "SPACE_TO_BATCH_ND",
+                                "TRANSPOSE",
+                                "MEAN",
+                                "SUB",
+                                "DIV",
+                                "SQUEEZE",
+                                "UNIDIRECTIONAL_SEQUENCE_LSTM",
+                                "STRIDED_SLICE",
+                                "BIDIRECTIONAL_SEQUENCE_RNN",
+                                "EXP",
+                                nullptr};
   return names;
 }
 
@@ -275,64 +354,97 @@ enum BuiltinOptions {
   BuiltinOptions_SpaceToDepthOptions = 19,
   BuiltinOptions_EmbeddingLookupSparseOptions = 20,
   BuiltinOptions_MulOptions = 21,
+  BuiltinOptions_PadOptions = 22,
+  BuiltinOptions_GatherOptions = 23,
+  BuiltinOptions_BatchToSpaceNDOptions = 24,
+  BuiltinOptions_SpaceToBatchNDOptions = 25,
+  BuiltinOptions_TransposeOptions = 26,
+  BuiltinOptions_MeanOptions = 27,
+  BuiltinOptions_SubOptions = 28,
+  BuiltinOptions_DivOptions = 29,
+  BuiltinOptions_SqueezeOptions = 30,
+  BuiltinOptions_SequenceRNNOptions = 31,
+  BuiltinOptions_StridedSliceOptions = 32,
+  BuiltinOptions_ExpOptions = 33,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_MulOptions
+  BuiltinOptions_MAX = BuiltinOptions_ExpOptions
 };
 
-inline BuiltinOptions (&EnumValuesBuiltinOptions())[22] {
+inline BuiltinOptions (&EnumValuesBuiltinOptions())[34] {
   static BuiltinOptions values[] = {
-    BuiltinOptions_NONE,
-    BuiltinOptions_Conv2DOptions,
-    BuiltinOptions_DepthwiseConv2DOptions,
-    BuiltinOptions_ConcatEmbeddingsOptions,
-    BuiltinOptions_LSHProjectionOptions,
-    BuiltinOptions_Pool2DOptions,
-    BuiltinOptions_SVDFOptions,
-    BuiltinOptions_RNNOptions,
-    BuiltinOptions_FullyConnectedOptions,
-    BuiltinOptions_SoftmaxOptions,
-    BuiltinOptions_ConcatenationOptions,
-    BuiltinOptions_AddOptions,
-    BuiltinOptions_L2NormOptions,
-    BuiltinOptions_LocalResponseNormalizationOptions,
-    BuiltinOptions_LSTMOptions,
-    BuiltinOptions_ResizeBilinearOptions,
-    BuiltinOptions_CallOptions,
-    BuiltinOptions_ReshapeOptions,
-    BuiltinOptions_SkipGramOptions,
-    BuiltinOptions_SpaceToDepthOptions,
-    BuiltinOptions_EmbeddingLookupSparseOptions,
-    BuiltinOptions_MulOptions
-  };
+      BuiltinOptions_NONE,
+      BuiltinOptions_Conv2DOptions,
+      BuiltinOptions_DepthwiseConv2DOptions,
+      BuiltinOptions_ConcatEmbeddingsOptions,
+      BuiltinOptions_LSHProjectionOptions,
+      BuiltinOptions_Pool2DOptions,
+      BuiltinOptions_SVDFOptions,
+      BuiltinOptions_RNNOptions,
+      BuiltinOptions_FullyConnectedOptions,
+      BuiltinOptions_SoftmaxOptions,
+      BuiltinOptions_ConcatenationOptions,
+      BuiltinOptions_AddOptions,
+      BuiltinOptions_L2NormOptions,
+      BuiltinOptions_LocalResponseNormalizationOptions,
+      BuiltinOptions_LSTMOptions,
+      BuiltinOptions_ResizeBilinearOptions,
+      BuiltinOptions_CallOptions,
+      BuiltinOptions_ReshapeOptions,
+      BuiltinOptions_SkipGramOptions,
+      BuiltinOptions_SpaceToDepthOptions,
+      BuiltinOptions_EmbeddingLookupSparseOptions,
+      BuiltinOptions_MulOptions,
+      BuiltinOptions_PadOptions,
+      BuiltinOptions_GatherOptions,
+      BuiltinOptions_BatchToSpaceNDOptions,
+      BuiltinOptions_SpaceToBatchNDOptions,
+      BuiltinOptions_TransposeOptions,
+      BuiltinOptions_MeanOptions,
+      BuiltinOptions_SubOptions,
+      BuiltinOptions_DivOptions,
+      BuiltinOptions_SqueezeOptions,
+      BuiltinOptions_SequenceRNNOptions,
+      BuiltinOptions_StridedSliceOptions,
+      BuiltinOptions_ExpOptions};
   return values;
 }
 
 inline const char **EnumNamesBuiltinOptions() {
-  static const char *names[] = {
-    "NONE",
-    "Conv2DOptions",
-    "DepthwiseConv2DOptions",
-    "ConcatEmbeddingsOptions",
-    "LSHProjectionOptions",
-    "Pool2DOptions",
-    "SVDFOptions",
-    "RNNOptions",
-    "FullyConnectedOptions",
-    "SoftmaxOptions",
-    "ConcatenationOptions",
-    "AddOptions",
-    "L2NormOptions",
-    "LocalResponseNormalizationOptions",
-    "LSTMOptions",
-    "ResizeBilinearOptions",
-    "CallOptions",
-    "ReshapeOptions",
-    "SkipGramOptions",
-    "SpaceToDepthOptions",
-    "EmbeddingLookupSparseOptions",
-    "MulOptions",
-    nullptr
-  };
+  static const char *names[] = {"NONE",
+                                "Conv2DOptions",
+                                "DepthwiseConv2DOptions",
+                                "ConcatEmbeddingsOptions",
+                                "LSHProjectionOptions",
+                                "Pool2DOptions",
+                                "SVDFOptions",
+                                "RNNOptions",
+                                "FullyConnectedOptions",
+                                "SoftmaxOptions",
+                                "ConcatenationOptions",
+                                "AddOptions",
+                                "L2NormOptions",
+                                "LocalResponseNormalizationOptions",
+                                "LSTMOptions",
+                                "ResizeBilinearOptions",
+                                "CallOptions",
+                                "ReshapeOptions",
+                                "SkipGramOptions",
+                                "SpaceToDepthOptions",
+                                "EmbeddingLookupSparseOptions",
+                                "MulOptions",
+                                "PadOptions",
+                                "GatherOptions",
+                                "BatchToSpaceNDOptions",
+                                "SpaceToBatchNDOptions",
+                                "TransposeOptions",
+                                "MeanOptions",
+                                "SubOptions",
+                                "DivOptions",
+                                "SqueezeOptions",
+                                "SequenceRNNOptions",
+                                "StridedSliceOptions",
+                                "ExpOptions",
+                                nullptr};
   return names;
 }
 
@@ -341,114 +453,211 @@ inline const char *EnumNameBuiltinOptions(BuiltinOptions e) {
   return EnumNamesBuiltinOptions()[index];
 }
 
-template<typename T> struct BuiltinOptionsTraits {
+template <typename T>
+struct BuiltinOptionsTraits {
   static const BuiltinOptions enum_value = BuiltinOptions_NONE;
 };
 
-template<> struct BuiltinOptionsTraits<Conv2DOptions> {
+template <>
+struct BuiltinOptionsTraits<Conv2DOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_Conv2DOptions;
 };
 
-template<> struct BuiltinOptionsTraits<DepthwiseConv2DOptions> {
-  static const BuiltinOptions enum_value = BuiltinOptions_DepthwiseConv2DOptions;
+template <>
+struct BuiltinOptionsTraits<DepthwiseConv2DOptions> {
+  static const BuiltinOptions enum_value =
+      BuiltinOptions_DepthwiseConv2DOptions;
 };
 
-template<> struct BuiltinOptionsTraits<ConcatEmbeddingsOptions> {
-  static const BuiltinOptions enum_value = BuiltinOptions_ConcatEmbeddingsOptions;
+template <>
+struct BuiltinOptionsTraits<ConcatEmbeddingsOptions> {
+  static const BuiltinOptions enum_value =
+      BuiltinOptions_ConcatEmbeddingsOptions;
 };
 
-template<> struct BuiltinOptionsTraits<LSHProjectionOptions> {
+template <>
+struct BuiltinOptionsTraits<LSHProjectionOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_LSHProjectionOptions;
 };
 
-template<> struct BuiltinOptionsTraits<Pool2DOptions> {
+template <>
+struct BuiltinOptionsTraits<Pool2DOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_Pool2DOptions;
 };
 
-template<> struct BuiltinOptionsTraits<SVDFOptions> {
+template <>
+struct BuiltinOptionsTraits<SVDFOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_SVDFOptions;
 };
 
-template<> struct BuiltinOptionsTraits<RNNOptions> {
+template <>
+struct BuiltinOptionsTraits<RNNOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_RNNOptions;
 };
 
-template<> struct BuiltinOptionsTraits<FullyConnectedOptions> {
+template <>
+struct BuiltinOptionsTraits<FullyConnectedOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_FullyConnectedOptions;
 };
 
-template<> struct BuiltinOptionsTraits<SoftmaxOptions> {
+template <>
+struct BuiltinOptionsTraits<SoftmaxOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_SoftmaxOptions;
 };
 
-template<> struct BuiltinOptionsTraits<ConcatenationOptions> {
+template <>
+struct BuiltinOptionsTraits<ConcatenationOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_ConcatenationOptions;
 };
 
-template<> struct BuiltinOptionsTraits<AddOptions> {
+template <>
+struct BuiltinOptionsTraits<AddOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_AddOptions;
 };
 
-template<> struct BuiltinOptionsTraits<L2NormOptions> {
+template <>
+struct BuiltinOptionsTraits<L2NormOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_L2NormOptions;
 };
 
-template<> struct BuiltinOptionsTraits<LocalResponseNormalizationOptions> {
-  static const BuiltinOptions enum_value = BuiltinOptions_LocalResponseNormalizationOptions;
+template <>
+struct BuiltinOptionsTraits<LocalResponseNormalizationOptions> {
+  static const BuiltinOptions enum_value =
+      BuiltinOptions_LocalResponseNormalizationOptions;
 };
 
-template<> struct BuiltinOptionsTraits<LSTMOptions> {
+template <>
+struct BuiltinOptionsTraits<LSTMOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_LSTMOptions;
 };
 
-template<> struct BuiltinOptionsTraits<ResizeBilinearOptions> {
+template <>
+struct BuiltinOptionsTraits<ResizeBilinearOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_ResizeBilinearOptions;
 };
 
-template<> struct BuiltinOptionsTraits<CallOptions> {
+template <>
+struct BuiltinOptionsTraits<CallOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_CallOptions;
 };
 
-template<> struct BuiltinOptionsTraits<ReshapeOptions> {
+template <>
+struct BuiltinOptionsTraits<ReshapeOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_ReshapeOptions;
 };
 
-template<> struct BuiltinOptionsTraits<SkipGramOptions> {
+template <>
+struct BuiltinOptionsTraits<SkipGramOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_SkipGramOptions;
 };
 
-template<> struct BuiltinOptionsTraits<SpaceToDepthOptions> {
+template <>
+struct BuiltinOptionsTraits<SpaceToDepthOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_SpaceToDepthOptions;
 };
 
-template<> struct BuiltinOptionsTraits<EmbeddingLookupSparseOptions> {
-  static const BuiltinOptions enum_value = BuiltinOptions_EmbeddingLookupSparseOptions;
+template <>
+struct BuiltinOptionsTraits<EmbeddingLookupSparseOptions> {
+  static const BuiltinOptions enum_value =
+      BuiltinOptions_EmbeddingLookupSparseOptions;
 };
 
-template<> struct BuiltinOptionsTraits<MulOptions> {
+template <>
+struct BuiltinOptionsTraits<MulOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_MulOptions;
 };
 
+template <>
+struct BuiltinOptionsTraits<PadOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_PadOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<GatherOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_GatherOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<BatchToSpaceNDOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_BatchToSpaceNDOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<SpaceToBatchNDOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_SpaceToBatchNDOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<TransposeOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_TransposeOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<MeanOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_MeanOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<SubOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_SubOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<DivOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_DivOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<SqueezeOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_SqueezeOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<SequenceRNNOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_SequenceRNNOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<StridedSliceOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_StridedSliceOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<ExpOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_ExpOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
 
   BuiltinOptionsUnion() : type(BuiltinOptions_NONE), value(nullptr) {}
-  BuiltinOptionsUnion(BuiltinOptionsUnion&& u) FLATBUFFERS_NOEXCEPT :
-    type(BuiltinOptions_NONE), value(nullptr)
-    { std::swap(type, u.type); std::swap(value, u.value); }
+  BuiltinOptionsUnion(BuiltinOptionsUnion &&u) FLATBUFFERS_NOEXCEPT
+      : type(BuiltinOptions_NONE),
+        value(nullptr) {
+    std::swap(type, u.type);
+    std::swap(value, u.value);
+  }
   BuiltinOptionsUnion(const BuiltinOptionsUnion &) FLATBUFFERS_NOEXCEPT;
-  BuiltinOptionsUnion &operator=(const BuiltinOptionsUnion &u) FLATBUFFERS_NOEXCEPT
-    { BuiltinOptionsUnion t(u); std::swap(type, t.type); std::swap(value, t.value); return *this; }
-  BuiltinOptionsUnion &operator=(BuiltinOptionsUnion &&u) FLATBUFFERS_NOEXCEPT
-    { std::swap(type, u.type); std::swap(value, u.value); return *this; }
+  BuiltinOptionsUnion &operator=(const BuiltinOptionsUnion &u)
+      FLATBUFFERS_NOEXCEPT {
+    BuiltinOptionsUnion t(u);
+    std::swap(type, t.type);
+    std::swap(value, t.value);
+    return *this;
+  }
+  BuiltinOptionsUnion &operator=(BuiltinOptionsUnion &&u) FLATBUFFERS_NOEXCEPT {
+    std::swap(type, u.type);
+    std::swap(value, u.value);
+    return *this;
+  }
   ~BuiltinOptionsUnion() { Reset(); }
 
   void Reset();
 
 #ifndef FLATBUFFERS_CPP98_STL
   template <typename T>
-  void Set(T&& val) {
+  void Set(T &&val) {
     Reset();
     type = BuiltinOptionsTraits<typename T::TableType>::enum_value;
     if (type != BuiltinOptions_NONE) {
@@ -457,181 +666,352 @@ struct BuiltinOptionsUnion {
   }
 #endif  // FLATBUFFERS_CPP98_STL
 
-  static void *UnPack(const void *obj, BuiltinOptions type, const flatbuffers::resolver_function_t *resolver);
-  flatbuffers::Offset<void> Pack(flatbuffers::FlatBufferBuilder &_fbb, const flatbuffers::rehasher_function_t *_rehasher = nullptr) const;
+  static void *UnPack(const void *obj, BuiltinOptions type,
+                      const flatbuffers::resolver_function_t *resolver);
+  flatbuffers::Offset<void> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr) const;
 
   Conv2DOptionsT *AsConv2DOptions() {
-    return type == BuiltinOptions_Conv2DOptions ?
-      reinterpret_cast<Conv2DOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_Conv2DOptions
+               ? reinterpret_cast<Conv2DOptionsT *>(value)
+               : nullptr;
   }
   const Conv2DOptionsT *AsConv2DOptions() const {
-    return type == BuiltinOptions_Conv2DOptions ?
-      reinterpret_cast<const Conv2DOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_Conv2DOptions
+               ? reinterpret_cast<const Conv2DOptionsT *>(value)
+               : nullptr;
   }
   DepthwiseConv2DOptionsT *AsDepthwiseConv2DOptions() {
-    return type == BuiltinOptions_DepthwiseConv2DOptions ?
-      reinterpret_cast<DepthwiseConv2DOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_DepthwiseConv2DOptions
+               ? reinterpret_cast<DepthwiseConv2DOptionsT *>(value)
+               : nullptr;
   }
   const DepthwiseConv2DOptionsT *AsDepthwiseConv2DOptions() const {
-    return type == BuiltinOptions_DepthwiseConv2DOptions ?
-      reinterpret_cast<const DepthwiseConv2DOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_DepthwiseConv2DOptions
+               ? reinterpret_cast<const DepthwiseConv2DOptionsT *>(value)
+               : nullptr;
   }
   ConcatEmbeddingsOptionsT *AsConcatEmbeddingsOptions() {
-    return type == BuiltinOptions_ConcatEmbeddingsOptions ?
-      reinterpret_cast<ConcatEmbeddingsOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_ConcatEmbeddingsOptions
+               ? reinterpret_cast<ConcatEmbeddingsOptionsT *>(value)
+               : nullptr;
   }
   const ConcatEmbeddingsOptionsT *AsConcatEmbeddingsOptions() const {
-    return type == BuiltinOptions_ConcatEmbeddingsOptions ?
-      reinterpret_cast<const ConcatEmbeddingsOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_ConcatEmbeddingsOptions
+               ? reinterpret_cast<const ConcatEmbeddingsOptionsT *>(value)
+               : nullptr;
   }
   LSHProjectionOptionsT *AsLSHProjectionOptions() {
-    return type == BuiltinOptions_LSHProjectionOptions ?
-      reinterpret_cast<LSHProjectionOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_LSHProjectionOptions
+               ? reinterpret_cast<LSHProjectionOptionsT *>(value)
+               : nullptr;
   }
   const LSHProjectionOptionsT *AsLSHProjectionOptions() const {
-    return type == BuiltinOptions_LSHProjectionOptions ?
-      reinterpret_cast<const LSHProjectionOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_LSHProjectionOptions
+               ? reinterpret_cast<const LSHProjectionOptionsT *>(value)
+               : nullptr;
   }
   Pool2DOptionsT *AsPool2DOptions() {
-    return type == BuiltinOptions_Pool2DOptions ?
-      reinterpret_cast<Pool2DOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_Pool2DOptions
+               ? reinterpret_cast<Pool2DOptionsT *>(value)
+               : nullptr;
   }
   const Pool2DOptionsT *AsPool2DOptions() const {
-    return type == BuiltinOptions_Pool2DOptions ?
-      reinterpret_cast<const Pool2DOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_Pool2DOptions
+               ? reinterpret_cast<const Pool2DOptionsT *>(value)
+               : nullptr;
   }
   SVDFOptionsT *AsSVDFOptions() {
-    return type == BuiltinOptions_SVDFOptions ?
-      reinterpret_cast<SVDFOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_SVDFOptions
+               ? reinterpret_cast<SVDFOptionsT *>(value)
+               : nullptr;
   }
   const SVDFOptionsT *AsSVDFOptions() const {
-    return type == BuiltinOptions_SVDFOptions ?
-      reinterpret_cast<const SVDFOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_SVDFOptions
+               ? reinterpret_cast<const SVDFOptionsT *>(value)
+               : nullptr;
   }
   RNNOptionsT *AsRNNOptions() {
-    return type == BuiltinOptions_RNNOptions ?
-      reinterpret_cast<RNNOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_RNNOptions
+               ? reinterpret_cast<RNNOptionsT *>(value)
+               : nullptr;
   }
   const RNNOptionsT *AsRNNOptions() const {
-    return type == BuiltinOptions_RNNOptions ?
-      reinterpret_cast<const RNNOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_RNNOptions
+               ? reinterpret_cast<const RNNOptionsT *>(value)
+               : nullptr;
   }
   FullyConnectedOptionsT *AsFullyConnectedOptions() {
-    return type == BuiltinOptions_FullyConnectedOptions ?
-      reinterpret_cast<FullyConnectedOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_FullyConnectedOptions
+               ? reinterpret_cast<FullyConnectedOptionsT *>(value)
+               : nullptr;
   }
   const FullyConnectedOptionsT *AsFullyConnectedOptions() const {
-    return type == BuiltinOptions_FullyConnectedOptions ?
-      reinterpret_cast<const FullyConnectedOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_FullyConnectedOptions
+               ? reinterpret_cast<const FullyConnectedOptionsT *>(value)
+               : nullptr;
   }
   SoftmaxOptionsT *AsSoftmaxOptions() {
-    return type == BuiltinOptions_SoftmaxOptions ?
-      reinterpret_cast<SoftmaxOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_SoftmaxOptions
+               ? reinterpret_cast<SoftmaxOptionsT *>(value)
+               : nullptr;
   }
   const SoftmaxOptionsT *AsSoftmaxOptions() const {
-    return type == BuiltinOptions_SoftmaxOptions ?
-      reinterpret_cast<const SoftmaxOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_SoftmaxOptions
+               ? reinterpret_cast<const SoftmaxOptionsT *>(value)
+               : nullptr;
   }
   ConcatenationOptionsT *AsConcatenationOptions() {
-    return type == BuiltinOptions_ConcatenationOptions ?
-      reinterpret_cast<ConcatenationOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_ConcatenationOptions
+               ? reinterpret_cast<ConcatenationOptionsT *>(value)
+               : nullptr;
   }
   const ConcatenationOptionsT *AsConcatenationOptions() const {
-    return type == BuiltinOptions_ConcatenationOptions ?
-      reinterpret_cast<const ConcatenationOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_ConcatenationOptions
+               ? reinterpret_cast<const ConcatenationOptionsT *>(value)
+               : nullptr;
   }
   AddOptionsT *AsAddOptions() {
-    return type == BuiltinOptions_AddOptions ?
-      reinterpret_cast<AddOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_AddOptions
+               ? reinterpret_cast<AddOptionsT *>(value)
+               : nullptr;
   }
   const AddOptionsT *AsAddOptions() const {
-    return type == BuiltinOptions_AddOptions ?
-      reinterpret_cast<const AddOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_AddOptions
+               ? reinterpret_cast<const AddOptionsT *>(value)
+               : nullptr;
   }
   L2NormOptionsT *AsL2NormOptions() {
-    return type == BuiltinOptions_L2NormOptions ?
-      reinterpret_cast<L2NormOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_L2NormOptions
+               ? reinterpret_cast<L2NormOptionsT *>(value)
+               : nullptr;
   }
   const L2NormOptionsT *AsL2NormOptions() const {
-    return type == BuiltinOptions_L2NormOptions ?
-      reinterpret_cast<const L2NormOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_L2NormOptions
+               ? reinterpret_cast<const L2NormOptionsT *>(value)
+               : nullptr;
   }
   LocalResponseNormalizationOptionsT *AsLocalResponseNormalizationOptions() {
-    return type == BuiltinOptions_LocalResponseNormalizationOptions ?
-      reinterpret_cast<LocalResponseNormalizationOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_LocalResponseNormalizationOptions
+               ? reinterpret_cast<LocalResponseNormalizationOptionsT *>(value)
+               : nullptr;
   }
-  const LocalResponseNormalizationOptionsT *AsLocalResponseNormalizationOptions() const {
-    return type == BuiltinOptions_LocalResponseNormalizationOptions ?
-      reinterpret_cast<const LocalResponseNormalizationOptionsT *>(value) : nullptr;
+  const LocalResponseNormalizationOptionsT *
+  AsLocalResponseNormalizationOptions() const {
+    return type == BuiltinOptions_LocalResponseNormalizationOptions
+               ? reinterpret_cast<const LocalResponseNormalizationOptionsT *>(
+                     value)
+               : nullptr;
   }
   LSTMOptionsT *AsLSTMOptions() {
-    return type == BuiltinOptions_LSTMOptions ?
-      reinterpret_cast<LSTMOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_LSTMOptions
+               ? reinterpret_cast<LSTMOptionsT *>(value)
+               : nullptr;
   }
   const LSTMOptionsT *AsLSTMOptions() const {
-    return type == BuiltinOptions_LSTMOptions ?
-      reinterpret_cast<const LSTMOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_LSTMOptions
+               ? reinterpret_cast<const LSTMOptionsT *>(value)
+               : nullptr;
   }
   ResizeBilinearOptionsT *AsResizeBilinearOptions() {
-    return type == BuiltinOptions_ResizeBilinearOptions ?
-      reinterpret_cast<ResizeBilinearOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_ResizeBilinearOptions
+               ? reinterpret_cast<ResizeBilinearOptionsT *>(value)
+               : nullptr;
   }
   const ResizeBilinearOptionsT *AsResizeBilinearOptions() const {
-    return type == BuiltinOptions_ResizeBilinearOptions ?
-      reinterpret_cast<const ResizeBilinearOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_ResizeBilinearOptions
+               ? reinterpret_cast<const ResizeBilinearOptionsT *>(value)
+               : nullptr;
   }
   CallOptionsT *AsCallOptions() {
-    return type == BuiltinOptions_CallOptions ?
-      reinterpret_cast<CallOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_CallOptions
+               ? reinterpret_cast<CallOptionsT *>(value)
+               : nullptr;
   }
   const CallOptionsT *AsCallOptions() const {
-    return type == BuiltinOptions_CallOptions ?
-      reinterpret_cast<const CallOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_CallOptions
+               ? reinterpret_cast<const CallOptionsT *>(value)
+               : nullptr;
   }
   ReshapeOptionsT *AsReshapeOptions() {
-    return type == BuiltinOptions_ReshapeOptions ?
-      reinterpret_cast<ReshapeOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_ReshapeOptions
+               ? reinterpret_cast<ReshapeOptionsT *>(value)
+               : nullptr;
   }
   const ReshapeOptionsT *AsReshapeOptions() const {
-    return type == BuiltinOptions_ReshapeOptions ?
-      reinterpret_cast<const ReshapeOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_ReshapeOptions
+               ? reinterpret_cast<const ReshapeOptionsT *>(value)
+               : nullptr;
   }
   SkipGramOptionsT *AsSkipGramOptions() {
-    return type == BuiltinOptions_SkipGramOptions ?
-      reinterpret_cast<SkipGramOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_SkipGramOptions
+               ? reinterpret_cast<SkipGramOptionsT *>(value)
+               : nullptr;
   }
   const SkipGramOptionsT *AsSkipGramOptions() const {
-    return type == BuiltinOptions_SkipGramOptions ?
-      reinterpret_cast<const SkipGramOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_SkipGramOptions
+               ? reinterpret_cast<const SkipGramOptionsT *>(value)
+               : nullptr;
   }
   SpaceToDepthOptionsT *AsSpaceToDepthOptions() {
-    return type == BuiltinOptions_SpaceToDepthOptions ?
-      reinterpret_cast<SpaceToDepthOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_SpaceToDepthOptions
+               ? reinterpret_cast<SpaceToDepthOptionsT *>(value)
+               : nullptr;
   }
   const SpaceToDepthOptionsT *AsSpaceToDepthOptions() const {
-    return type == BuiltinOptions_SpaceToDepthOptions ?
-      reinterpret_cast<const SpaceToDepthOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_SpaceToDepthOptions
+               ? reinterpret_cast<const SpaceToDepthOptionsT *>(value)
+               : nullptr;
   }
   EmbeddingLookupSparseOptionsT *AsEmbeddingLookupSparseOptions() {
-    return type == BuiltinOptions_EmbeddingLookupSparseOptions ?
-      reinterpret_cast<EmbeddingLookupSparseOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_EmbeddingLookupSparseOptions
+               ? reinterpret_cast<EmbeddingLookupSparseOptionsT *>(value)
+               : nullptr;
   }
   const EmbeddingLookupSparseOptionsT *AsEmbeddingLookupSparseOptions() const {
-    return type == BuiltinOptions_EmbeddingLookupSparseOptions ?
-      reinterpret_cast<const EmbeddingLookupSparseOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_EmbeddingLookupSparseOptions
+               ? reinterpret_cast<const EmbeddingLookupSparseOptionsT *>(value)
+               : nullptr;
   }
   MulOptionsT *AsMulOptions() {
-    return type == BuiltinOptions_MulOptions ?
-      reinterpret_cast<MulOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_MulOptions
+               ? reinterpret_cast<MulOptionsT *>(value)
+               : nullptr;
   }
   const MulOptionsT *AsMulOptions() const {
-    return type == BuiltinOptions_MulOptions ?
-      reinterpret_cast<const MulOptionsT *>(value) : nullptr;
+    return type == BuiltinOptions_MulOptions
+               ? reinterpret_cast<const MulOptionsT *>(value)
+               : nullptr;
+  }
+  PadOptionsT *AsPadOptions() {
+    return type == BuiltinOptions_PadOptions
+               ? reinterpret_cast<PadOptionsT *>(value)
+               : nullptr;
+  }
+  const PadOptionsT *AsPadOptions() const {
+    return type == BuiltinOptions_PadOptions
+               ? reinterpret_cast<const PadOptionsT *>(value)
+               : nullptr;
+  }
+  GatherOptionsT *AsGatherOptions() {
+    return type == BuiltinOptions_GatherOptions
+               ? reinterpret_cast<GatherOptionsT *>(value)
+               : nullptr;
+  }
+  const GatherOptionsT *AsGatherOptions() const {
+    return type == BuiltinOptions_GatherOptions
+               ? reinterpret_cast<const GatherOptionsT *>(value)
+               : nullptr;
+  }
+  BatchToSpaceNDOptionsT *AsBatchToSpaceNDOptions() {
+    return type == BuiltinOptions_BatchToSpaceNDOptions
+               ? reinterpret_cast<BatchToSpaceNDOptionsT *>(value)
+               : nullptr;
+  }
+  const BatchToSpaceNDOptionsT *AsBatchToSpaceNDOptions() const {
+    return type == BuiltinOptions_BatchToSpaceNDOptions
+               ? reinterpret_cast<const BatchToSpaceNDOptionsT *>(value)
+               : nullptr;
+  }
+  SpaceToBatchNDOptionsT *AsSpaceToBatchNDOptions() {
+    return type == BuiltinOptions_SpaceToBatchNDOptions
+               ? reinterpret_cast<SpaceToBatchNDOptionsT *>(value)
+               : nullptr;
+  }
+  const SpaceToBatchNDOptionsT *AsSpaceToBatchNDOptions() const {
+    return type == BuiltinOptions_SpaceToBatchNDOptions
+               ? reinterpret_cast<const SpaceToBatchNDOptionsT *>(value)
+               : nullptr;
+  }
+  TransposeOptionsT *AsTransposeOptions() {
+    return type == BuiltinOptions_TransposeOptions
+               ? reinterpret_cast<TransposeOptionsT *>(value)
+               : nullptr;
+  }
+  const TransposeOptionsT *AsTransposeOptions() const {
+    return type == BuiltinOptions_TransposeOptions
+               ? reinterpret_cast<const TransposeOptionsT *>(value)
+               : nullptr;
+  }
+  MeanOptionsT *AsMeanOptions() {
+    return type == BuiltinOptions_MeanOptions
+               ? reinterpret_cast<MeanOptionsT *>(value)
+               : nullptr;
+  }
+  const MeanOptionsT *AsMeanOptions() const {
+    return type == BuiltinOptions_MeanOptions
+               ? reinterpret_cast<const MeanOptionsT *>(value)
+               : nullptr;
+  }
+  SubOptionsT *AsSubOptions() {
+    return type == BuiltinOptions_SubOptions
+               ? reinterpret_cast<SubOptionsT *>(value)
+               : nullptr;
+  }
+  const SubOptionsT *AsSubOptions() const {
+    return type == BuiltinOptions_SubOptions
+               ? reinterpret_cast<const SubOptionsT *>(value)
+               : nullptr;
+  }
+  DivOptionsT *AsDivOptions() {
+    return type == BuiltinOptions_DivOptions
+               ? reinterpret_cast<DivOptionsT *>(value)
+               : nullptr;
+  }
+  const DivOptionsT *AsDivOptions() const {
+    return type == BuiltinOptions_DivOptions
+               ? reinterpret_cast<const DivOptionsT *>(value)
+               : nullptr;
+  }
+  SqueezeOptionsT *AsSqueezeOptions() {
+    return type == BuiltinOptions_SqueezeOptions
+               ? reinterpret_cast<SqueezeOptionsT *>(value)
+               : nullptr;
+  }
+  const SqueezeOptionsT *AsSqueezeOptions() const {
+    return type == BuiltinOptions_SqueezeOptions
+               ? reinterpret_cast<const SqueezeOptionsT *>(value)
+               : nullptr;
+  }
+  SequenceRNNOptionsT *AsSequenceRNNOptions() {
+    return type == BuiltinOptions_SequenceRNNOptions
+               ? reinterpret_cast<SequenceRNNOptionsT *>(value)
+               : nullptr;
+  }
+  const SequenceRNNOptionsT *AsSequenceRNNOptions() const {
+    return type == BuiltinOptions_SequenceRNNOptions
+               ? reinterpret_cast<const SequenceRNNOptionsT *>(value)
+               : nullptr;
+  }
+  StridedSliceOptionsT *AsStridedSliceOptions() {
+    return type == BuiltinOptions_StridedSliceOptions
+               ? reinterpret_cast<StridedSliceOptionsT *>(value)
+               : nullptr;
+  }
+  const StridedSliceOptionsT *AsStridedSliceOptions() const {
+    return type == BuiltinOptions_StridedSliceOptions
+               ? reinterpret_cast<const StridedSliceOptionsT *>(value)
+               : nullptr;
+  }
+  ExpOptionsT *AsExpOptions() {
+    return type == BuiltinOptions_ExpOptions
+               ? reinterpret_cast<ExpOptionsT *>(value)
+               : nullptr;
+  }
+  const ExpOptionsT *AsExpOptions() const {
+    return type == BuiltinOptions_ExpOptions
+               ? reinterpret_cast<const ExpOptionsT *>(value)
+               : nullptr;
   }
 };
 
-bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
-bool VerifyBuiltinOptionsVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector<flatbuffers::Offset<void>> *values, const flatbuffers::Vector<uint8_t> *types);
+bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj,
+                          BuiltinOptions type);
+bool VerifyBuiltinOptionsVector(
+    flatbuffers::Verifier &verifier,
+    const flatbuffers::Vector<flatbuffers::Offset<void>> *values,
+    const flatbuffers::Vector<uint8_t> *types);
 
 enum Padding {
   Padding_SAME = 0,
@@ -641,19 +1021,12 @@ enum Padding {
 };
 
 inline Padding (&EnumValuesPadding())[2] {
-  static Padding values[] = {
-    Padding_SAME,
-    Padding_VALID
-  };
+  static Padding values[] = {Padding_SAME, Padding_VALID};
   return values;
 }
 
 inline const char **EnumNamesPadding() {
-  static const char *names[] = {
-    "SAME",
-    "VALID",
-    nullptr
-  };
+  static const char *names[] = {"SAME", "VALID", nullptr};
   return names;
 }
 
@@ -665,7 +1038,7 @@ inline const char *EnumNamePadding(Padding e) {
 enum ActivationFunctionType {
   ActivationFunctionType_NONE = 0,
   ActivationFunctionType_RELU = 1,
-  ActivationFunctionType_RELU1 = 2,
+  ActivationFunctionType_RELU_N1_TO_1 = 2,
   ActivationFunctionType_RELU6 = 3,
   ActivationFunctionType_TANH = 4,
   ActivationFunctionType_SIGN_BIT = 5,
@@ -675,26 +1048,15 @@ enum ActivationFunctionType {
 
 inline ActivationFunctionType (&EnumValuesActivationFunctionType())[6] {
   static ActivationFunctionType values[] = {
-    ActivationFunctionType_NONE,
-    ActivationFunctionType_RELU,
-    ActivationFunctionType_RELU1,
-    ActivationFunctionType_RELU6,
-    ActivationFunctionType_TANH,
-    ActivationFunctionType_SIGN_BIT
-  };
+      ActivationFunctionType_NONE,         ActivationFunctionType_RELU,
+      ActivationFunctionType_RELU_N1_TO_1, ActivationFunctionType_RELU6,
+      ActivationFunctionType_TANH,         ActivationFunctionType_SIGN_BIT};
   return values;
 }
 
 inline const char **EnumNamesActivationFunctionType() {
-  static const char *names[] = {
-    "NONE",
-    "RELU",
-    "RELU1",
-    "RELU6",
-    "TANH",
-    "SIGN_BIT",
-    nullptr
-  };
+  static const char *names[] = {"NONE", "RELU",     "RELU_N1_TO_1", "RELU6",
+                                "TANH", "SIGN_BIT", nullptr};
   return names;
 }
 
@@ -712,21 +1074,14 @@ enum LSHProjectionType {
 };
 
 inline LSHProjectionType (&EnumValuesLSHProjectionType())[3] {
-  static LSHProjectionType values[] = {
-    LSHProjectionType_UNKNOWN,
-    LSHProjectionType_SPARSE,
-    LSHProjectionType_DENSE
-  };
+  static LSHProjectionType values[] = {LSHProjectionType_UNKNOWN,
+                                       LSHProjectionType_SPARSE,
+                                       LSHProjectionType_DENSE};
   return values;
 }
 
 inline const char **EnumNamesLSHProjectionType() {
-  static const char *names[] = {
-    "UNKNOWN",
-    "SPARSE",
-    "DENSE",
-    nullptr
-  };
+  static const char *names[] = {"UNKNOWN", "SPARSE", "DENSE", nullptr};
   return names;
 }
 
@@ -744,21 +1099,13 @@ enum CombinerType {
 };
 
 inline CombinerType (&EnumValuesCombinerType())[3] {
-  static CombinerType values[] = {
-    CombinerType_SUM,
-    CombinerType_MEAN,
-    CombinerType_SQRTN
-  };
+  static CombinerType values[] = {CombinerType_SUM, CombinerType_MEAN,
+                                  CombinerType_SQRTN};
   return values;
 }
 
 inline const char **EnumNamesCombinerType() {
-  static const char *names[] = {
-    "SUM",
-    "MEAN",
-    "SQRTN",
-    nullptr
-  };
+  static const char *names[] = {"SUM", "MEAN", "SQRTN", nullptr};
   return names;
 }
 
@@ -774,17 +1121,12 @@ enum CustomOptionsFormat {
 };
 
 inline CustomOptionsFormat (&EnumValuesCustomOptionsFormat())[1] {
-  static CustomOptionsFormat values[] = {
-    CustomOptionsFormat_FLEXBUFFERS
-  };
+  static CustomOptionsFormat values[] = {CustomOptionsFormat_FLEXBUFFERS};
   return values;
 }
 
 inline const char **EnumNamesCustomOptionsFormat() {
-  static const char *names[] = {
-    "FLEXBUFFERS",
-    nullptr
-  };
+  static const char *names[] = {"FLEXBUFFERS", nullptr};
   return names;
 }
 
@@ -799,18 +1141,13 @@ struct QuantizationParametersT : public flatbuffers::NativeTable {
   std::vector<float> max;
   std::vector<float> scale;
   std::vector<int64_t> zero_point;
-  QuantizationParametersT() {
-  }
+  QuantizationParametersT() {}
 };
 
-struct QuantizationParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+struct QuantizationParameters FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
   typedef QuantizationParametersT NativeTableType;
-  enum {
-    VT_MIN = 4,
-    VT_MAX = 6,
-    VT_SCALE = 8,
-    VT_ZERO_POINT = 10
-  };
+  enum { VT_MIN = 4, VT_MAX = 6, VT_SCALE = 8, VT_ZERO_POINT = 10 };
   const flatbuffers::Vector<float> *min() const {
     return GetPointer<const flatbuffers::Vector<float> *>(VT_MIN);
   }
@@ -824,20 +1161,20 @@ struct QuantizationParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab
     return GetPointer<const flatbuffers::Vector<int64_t> *>(VT_ZERO_POINT);
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyOffset(verifier, VT_MIN) &&
-           verifier.Verify(min()) &&
-           VerifyOffset(verifier, VT_MAX) &&
-           verifier.Verify(max()) &&
-           VerifyOffset(verifier, VT_SCALE) &&
-           verifier.Verify(scale()) &&
-           VerifyOffset(verifier, VT_ZERO_POINT) &&
-           verifier.Verify(zero_point()) &&
-           verifier.EndTable();
-  }
-  QuantizationParametersT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(QuantizationParametersT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<QuantizationParameters> Pack(flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_MIN) &&
+           verifier.Verify(min()) && VerifyOffset(verifier, VT_MAX) &&
+           verifier.Verify(max()) && VerifyOffset(verifier, VT_SCALE) &&
+           verifier.Verify(scale()) && VerifyOffset(verifier, VT_ZERO_POINT) &&
+           verifier.Verify(zero_point()) && verifier.EndTable();
+  }
+  QuantizationParametersT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      QuantizationParametersT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<QuantizationParameters> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct QuantizationParametersBuilder {
@@ -852,14 +1189,16 @@ struct QuantizationParametersBuilder {
   void add_scale(flatbuffers::Offset<flatbuffers::Vector<float>> scale) {
     fbb_.AddOffset(QuantizationParameters::VT_SCALE, scale);
   }
-  void add_zero_point(flatbuffers::Offset<flatbuffers::Vector<int64_t>> zero_point) {
+  void add_zero_point(
+      flatbuffers::Offset<flatbuffers::Vector<int64_t>> zero_point) {
     fbb_.AddOffset(QuantizationParameters::VT_ZERO_POINT, zero_point);
   }
   explicit QuantizationParametersBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
-  QuantizationParametersBuilder &operator=(const QuantizationParametersBuilder &);
+  QuantizationParametersBuilder &operator=(
+      const QuantizationParametersBuilder &);
   flatbuffers::Offset<QuantizationParameters> Finish() {
     const auto end = fbb_.EndTable(start_);
     auto o = flatbuffers::Offset<QuantizationParameters>(end);
@@ -881,21 +1220,23 @@ inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(
   return builder_.Finish();
 }
 
-inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParametersDirect(
+inline flatbuffers::Offset<QuantizationParameters>
+CreateQuantizationParametersDirect(
     flatbuffers::FlatBufferBuilder &_fbb,
     const std::vector<float> *min = nullptr,
     const std::vector<float> *max = nullptr,
     const std::vector<float> *scale = nullptr,
     const std::vector<int64_t> *zero_point = nullptr) {
   return tflite::CreateQuantizationParameters(
-      _fbb,
-      min ? _fbb.CreateVector<float>(*min) : 0,
+      _fbb, min ? _fbb.CreateVector<float>(*min) : 0,
       max ? _fbb.CreateVector<float>(*max) : 0,
       scale ? _fbb.CreateVector<float>(*scale) : 0,
       zero_point ? _fbb.CreateVector<int64_t>(*zero_point) : 0);
 }
 
-flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(
+    flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct TensorT : public flatbuffers::NativeTable {
   typedef Tensor TableType;
@@ -904,10 +1245,7 @@ struct TensorT : public flatbuffers::NativeTable {
   uint32_t buffer;
   std::string name;
   std::unique_ptr<QuantizationParametersT> quantization;
-  TensorT()
-      : type(TensorType_FLOAT32),
-        buffer(0) {
-  }
+  TensorT() : type(TensorType_FLOAT32), buffer(0) {}
 };
 
 struct Tensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
@@ -925,9 +1263,7 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   TensorType type() const {
     return static_cast<TensorType>(GetField<int8_t>(VT_TYPE, 0));
   }
-  uint32_t buffer() const {
-    return GetField<uint32_t>(VT_BUFFER, 0);
-  }
+  uint32_t buffer() const { return GetField<uint32_t>(VT_BUFFER, 0); }
   const flatbuffers::String *name() const {
     return GetPointer<const flatbuffers::String *>(VT_NAME);
   }
@@ -935,20 +1271,20 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
     return GetPointer<const QuantizationParameters *>(VT_QUANTIZATION);
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyOffset(verifier, VT_SHAPE) &&
-           verifier.Verify(shape()) &&
-           VerifyField<int8_t>(verifier, VT_TYPE) &&
+    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_SHAPE) &&
+           verifier.Verify(shape()) && VerifyField<int8_t>(verifier, VT_TYPE) &&
            VerifyField<uint32_t>(verifier, VT_BUFFER) &&
-           VerifyOffset(verifier, VT_NAME) &&
-           verifier.Verify(name()) &&
+           VerifyOffset(verifier, VT_NAME) && verifier.Verify(name()) &&
            VerifyOffset(verifier, VT_QUANTIZATION) &&
-           verifier.VerifyTable(quantization()) &&
-           verifier.EndTable();
-  }
-  TensorT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(TensorT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<Tensor> Pack(flatbuffers::FlatBufferBuilder &_fbb, const TensorT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           verifier.VerifyTable(quantization()) && verifier.EndTable();
+  }
+  TensorT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(TensorT *_o, const flatbuffers::resolver_function_t *_resolver =
+                                 nullptr) const;
+  static flatbuffers::Offset<Tensor> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct TensorBuilder {
@@ -966,11 +1302,11 @@ struct TensorBuilder {
   void add_name(flatbuffers::Offset<flatbuffers::String> name) {
     fbb_.AddOffset(Tensor::VT_NAME, name);
   }
-  void add_quantization(flatbuffers::Offset<QuantizationParameters> quantization) {
+  void add_quantization(
+      flatbuffers::Offset<QuantizationParameters> quantization) {
     fbb_.AddOffset(Tensor::VT_QUANTIZATION, quantization);
   }
-  explicit TensorBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+  explicit TensorBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   TensorBuilder &operator=(const TensorBuilder &);
@@ -984,8 +1320,7 @@ struct TensorBuilder {
 inline flatbuffers::Offset<Tensor> CreateTensor(
     flatbuffers::FlatBufferBuilder &_fbb,
     flatbuffers::Offset<flatbuffers::Vector<int32_t>> shape = 0,
-    TensorType type = TensorType_FLOAT32,
-    uint32_t buffer = 0,
+    TensorType type = TensorType_FLOAT32, uint32_t buffer = 0,
     flatbuffers::Offset<flatbuffers::String> name = 0,
     flatbuffers::Offset<QuantizationParameters> quantization = 0) {
   TensorBuilder builder_(_fbb);
@@ -1000,20 +1335,17 @@ inline flatbuffers::Offset<Tensor> CreateTensor(
 inline flatbuffers::Offset<Tensor> CreateTensorDirect(
     flatbuffers::FlatBufferBuilder &_fbb,
     const std::vector<int32_t> *shape = nullptr,
-    TensorType type = TensorType_FLOAT32,
-    uint32_t buffer = 0,
+    TensorType type = TensorType_FLOAT32, uint32_t buffer = 0,
     const char *name = nullptr,
     flatbuffers::Offset<QuantizationParameters> quantization = 0) {
   return tflite::CreateTensor(
-      _fbb,
-      shape ? _fbb.CreateVector<int32_t>(*shape) : 0,
-      type,
-      buffer,
-      name ? _fbb.CreateString(name) : 0,
-      quantization);
+      _fbb, shape ? _fbb.CreateVector<int32_t>(*shape) : 0, type, buffer,
+      name ? _fbb.CreateString(name) : 0, quantization);
 }
 
-flatbuffers::Offset<Tensor> CreateTensor(flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<Tensor> CreateTensor(
+    flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct Conv2DOptionsT : public flatbuffers::NativeTable {
   typedef Conv2DOptions TableType;
@@ -1025,8 +1357,7 @@ struct Conv2DOptionsT : public flatbuffers::NativeTable {
       : padding(Padding_SAME),
         stride_w(0),
         stride_h(0),
-        fused_activation_function(ActivationFunctionType_NONE) {
-  }
+        fused_activation_function(ActivationFunctionType_NONE) {}
 };
 
 struct Conv2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
@@ -1040,14 +1371,11 @@ struct Conv2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   Padding padding() const {
     return static_cast<Padding>(GetField<int8_t>(VT_PADDING, 0));
   }
-  int32_t stride_w() const {
-    return GetField<int32_t>(VT_STRIDE_W, 0);
-  }
-  int32_t stride_h() const {
-    return GetField<int32_t>(VT_STRIDE_H, 0);
-  }
+  int32_t stride_w() const { return GetField<int32_t>(VT_STRIDE_W, 0); }
+  int32_t stride_h() const { return GetField<int32_t>(VT_STRIDE_H, 0); }
   ActivationFunctionType fused_activation_function() const {
-    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
@@ -1057,16 +1385,22 @@ struct Conv2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            verifier.EndTable();
   }
-  Conv2DOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(Conv2DOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<Conv2DOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  Conv2DOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      Conv2DOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<Conv2DOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct Conv2DOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
   void add_padding(Padding padding) {
-    fbb_.AddElement<int8_t>(Conv2DOptions::VT_PADDING, static_cast<int8_t>(padding), 0);
+    fbb_.AddElement<int8_t>(Conv2DOptions::VT_PADDING,
+                            static_cast<int8_t>(padding), 0);
   }
   void add_stride_w(int32_t stride_w) {
     fbb_.AddElement<int32_t>(Conv2DOptions::VT_STRIDE_W, stride_w, 0);
@@ -1074,11 +1408,13 @@ struct Conv2DOptionsBuilder {
   void add_stride_h(int32_t stride_h) {
     fbb_.AddElement<int32_t>(Conv2DOptions::VT_STRIDE_H, stride_h, 0);
   }
-  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
-    fbb_.AddElement<int8_t>(Conv2DOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(Conv2DOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
   }
   explicit Conv2DOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   Conv2DOptionsBuilder &operator=(const Conv2DOptionsBuilder &);
@@ -1090,11 +1426,10 @@ struct Conv2DOptionsBuilder {
 };
 
 inline flatbuffers::Offset<Conv2DOptions> CreateConv2DOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    Padding padding = Padding_SAME,
-    int32_t stride_w = 0,
-    int32_t stride_h = 0,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    flatbuffers::FlatBufferBuilder &_fbb, Padding padding = Padding_SAME,
+    int32_t stride_w = 0, int32_t stride_h = 0,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
   Conv2DOptionsBuilder builder_(_fbb);
   builder_.add_stride_h(stride_h);
   builder_.add_stride_w(stride_w);
@@ -1103,7 +1438,9 @@ inline flatbuffers::Offset<Conv2DOptions> CreateConv2DOptions(
   return builder_.Finish();
 }
 
-flatbuffers::Offset<Conv2DOptions> CreateConv2DOptions(flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<Conv2DOptions> CreateConv2DOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct Pool2DOptionsT : public flatbuffers::NativeTable {
   typedef Pool2DOptions TableType;
@@ -1119,8 +1456,7 @@ struct Pool2DOptionsT : public flatbuffers::NativeTable {
         stride_h(0),
         filter_width(0),
         filter_height(0),
-        fused_activation_function(ActivationFunctionType_NONE) {
-  }
+        fused_activation_function(ActivationFunctionType_NONE) {}
 };
 
 struct Pool2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
@@ -1136,20 +1472,15 @@ struct Pool2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   Padding padding() const {
     return static_cast<Padding>(GetField<int8_t>(VT_PADDING, 0));
   }
-  int32_t stride_w() const {
-    return GetField<int32_t>(VT_STRIDE_W, 0);
-  }
-  int32_t stride_h() const {
-    return GetField<int32_t>(VT_STRIDE_H, 0);
-  }
-  int32_t filter_width() const {
-    return GetField<int32_t>(VT_FILTER_WIDTH, 0);
-  }
+  int32_t stride_w() const { return GetField<int32_t>(VT_STRIDE_W, 0); }
+  int32_t stride_h() const { return GetField<int32_t>(VT_STRIDE_H, 0); }
+  int32_t filter_width() const { return GetField<int32_t>(VT_FILTER_WIDTH, 0); }
   int32_t filter_height() const {
     return GetField<int32_t>(VT_FILTER_HEIGHT, 0);
   }
   ActivationFunctionType fused_activation_function() const {
-    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
@@ -1161,16 +1492,22 @@ struct Pool2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            verifier.EndTable();
   }
-  Pool2DOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(Pool2DOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<Pool2DOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  Pool2DOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      Pool2DOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<Pool2DOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct Pool2DOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
   void add_padding(Padding padding) {
-    fbb_.AddElement<int8_t>(Pool2DOptions::VT_PADDING, static_cast<int8_t>(padding), 0);
+    fbb_.AddElement<int8_t>(Pool2DOptions::VT_PADDING,
+                            static_cast<int8_t>(padding), 0);
   }
   void add_stride_w(int32_t stride_w) {
     fbb_.AddElement<int32_t>(Pool2DOptions::VT_STRIDE_W, stride_w, 0);
@@ -1184,11 +1521,13 @@ struct Pool2DOptionsBuilder {
   void add_filter_height(int32_t filter_height) {
     fbb_.AddElement<int32_t>(Pool2DOptions::VT_FILTER_HEIGHT, filter_height, 0);
   }
-  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
-    fbb_.AddElement<int8_t>(Pool2DOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(Pool2DOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
   }
   explicit Pool2DOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   Pool2DOptionsBuilder &operator=(const Pool2DOptionsBuilder &);
@@ -1200,13 +1539,11 @@ struct Pool2DOptionsBuilder {
 };
 
 inline flatbuffers::Offset<Pool2DOptions> CreatePool2DOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    Padding padding = Padding_SAME,
-    int32_t stride_w = 0,
-    int32_t stride_h = 0,
-    int32_t filter_width = 0,
+    flatbuffers::FlatBufferBuilder &_fbb, Padding padding = Padding_SAME,
+    int32_t stride_w = 0, int32_t stride_h = 0, int32_t filter_width = 0,
     int32_t filter_height = 0,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
   Pool2DOptionsBuilder builder_(_fbb);
   builder_.add_filter_height(filter_height);
   builder_.add_filter_width(filter_width);
@@ -1217,7 +1554,9 @@ inline flatbuffers::Offset<Pool2DOptions> CreatePool2DOptions(
   return builder_.Finish();
 }
 
-flatbuffers::Offset<Pool2DOptions> CreatePool2DOptions(flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<Pool2DOptions> CreatePool2DOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct DepthwiseConv2DOptionsT : public flatbuffers::NativeTable {
   typedef DepthwiseConv2DOptions TableType;
@@ -1231,11 +1570,11 @@ struct DepthwiseConv2DOptionsT : public flatbuffers::NativeTable {
         stride_w(0),
         stride_h(0),
         depth_multiplier(0),
-        fused_activation_function(ActivationFunctionType_NONE) {
-  }
+        fused_activation_function(ActivationFunctionType_NONE) {}
 };
 
-struct DepthwiseConv2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+struct DepthwiseConv2DOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
   typedef DepthwiseConv2DOptionsT NativeTableType;
   enum {
     VT_PADDING = 4,
@@ -1247,17 +1586,14 @@ struct DepthwiseConv2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab
   Padding padding() const {
     return static_cast<Padding>(GetField<int8_t>(VT_PADDING, 0));
   }
-  int32_t stride_w() const {
-    return GetField<int32_t>(VT_STRIDE_W, 0);
-  }
-  int32_t stride_h() const {
-    return GetField<int32_t>(VT_STRIDE_H, 0);
-  }
+  int32_t stride_w() const { return GetField<int32_t>(VT_STRIDE_W, 0); }
+  int32_t stride_h() const { return GetField<int32_t>(VT_STRIDE_H, 0); }
   int32_t depth_multiplier() const {
     return GetField<int32_t>(VT_DEPTH_MULTIPLIER, 0);
   }
   ActivationFunctionType fused_activation_function() const {
-    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
@@ -1268,16 +1604,22 @@ struct DepthwiseConv2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            verifier.EndTable();
   }
-  DepthwiseConv2DOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(DepthwiseConv2DOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<DepthwiseConv2DOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  DepthwiseConv2DOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      DepthwiseConv2DOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<DepthwiseConv2DOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct DepthwiseConv2DOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
   void add_padding(Padding padding) {
-    fbb_.AddElement<int8_t>(DepthwiseConv2DOptions::VT_PADDING, static_cast<int8_t>(padding), 0);
+    fbb_.AddElement<int8_t>(DepthwiseConv2DOptions::VT_PADDING,
+                            static_cast<int8_t>(padding), 0);
   }
   void add_stride_w(int32_t stride_w) {
     fbb_.AddElement<int32_t>(DepthwiseConv2DOptions::VT_STRIDE_W, stride_w, 0);
@@ -1286,16 +1628,21 @@ struct DepthwiseConv2DOptionsBuilder {
     fbb_.AddElement<int32_t>(DepthwiseConv2DOptions::VT_STRIDE_H, stride_h, 0);
   }
   void add_depth_multiplier(int32_t depth_multiplier) {
-    fbb_.AddElement<int32_t>(DepthwiseConv2DOptions::VT_DEPTH_MULTIPLIER, depth_multiplier, 0);
+    fbb_.AddElement<int32_t>(DepthwiseConv2DOptions::VT_DEPTH_MULTIPLIER,
+                             depth_multiplier, 0);
   }
-  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
-    fbb_.AddElement<int8_t>(DepthwiseConv2DOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(
+        DepthwiseConv2DOptions::VT_FUSED_ACTIVATION_FUNCTION,
+        static_cast<int8_t>(fused_activation_function), 0);
   }
   explicit DepthwiseConv2DOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
-  DepthwiseConv2DOptionsBuilder &operator=(const DepthwiseConv2DOptionsBuilder &);
+  DepthwiseConv2DOptionsBuilder &operator=(
+      const DepthwiseConv2DOptionsBuilder &);
   flatbuffers::Offset<DepthwiseConv2DOptions> Finish() {
     const auto end = fbb_.EndTable(start_);
     auto o = flatbuffers::Offset<DepthwiseConv2DOptions>(end);
@@ -1304,12 +1651,10 @@ struct DepthwiseConv2DOptionsBuilder {
 };
 
 inline flatbuffers::Offset<DepthwiseConv2DOptions> CreateDepthwiseConv2DOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    Padding padding = Padding_SAME,
-    int32_t stride_w = 0,
-    int32_t stride_h = 0,
-    int32_t depth_multiplier = 0,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    flatbuffers::FlatBufferBuilder &_fbb, Padding padding = Padding_SAME,
+    int32_t stride_w = 0, int32_t stride_h = 0, int32_t depth_multiplier = 0,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
   DepthwiseConv2DOptionsBuilder builder_(_fbb);
   builder_.add_depth_multiplier(depth_multiplier);
   builder_.add_stride_h(stride_h);
@@ -1319,33 +1664,34 @@ inline flatbuffers::Offset<DepthwiseConv2DOptions> CreateDepthwiseConv2DOptions(
   return builder_.Finish();
 }
 
-flatbuffers::Offset<DepthwiseConv2DOptions> CreateDepthwiseConv2DOptions(flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<DepthwiseConv2DOptions> CreateDepthwiseConv2DOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct ConcatEmbeddingsOptionsT : public flatbuffers::NativeTable {
   typedef ConcatEmbeddingsOptions TableType;
   int32_t num_channels;
   std::vector<int32_t> num_columns_per_channel;
   std::vector<int32_t> embedding_dim_per_channel;
-  ConcatEmbeddingsOptionsT()
-      : num_channels(0) {
-  }
+  ConcatEmbeddingsOptionsT() : num_channels(0) {}
 };
 
-struct ConcatEmbeddingsOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+struct ConcatEmbeddingsOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
   typedef ConcatEmbeddingsOptionsT NativeTableType;
   enum {
     VT_NUM_CHANNELS = 4,
     VT_NUM_COLUMNS_PER_CHANNEL = 6,
     VT_EMBEDDING_DIM_PER_CHANNEL = 8
   };
-  int32_t num_channels() const {
-    return GetField<int32_t>(VT_NUM_CHANNELS, 0);
-  }
+  int32_t num_channels() const { return GetField<int32_t>(VT_NUM_CHANNELS, 0); }
   const flatbuffers::Vector<int32_t> *num_columns_per_channel() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_NUM_COLUMNS_PER_CHANNEL);
+    return GetPointer<const flatbuffers::Vector<int32_t> *>(
+        VT_NUM_COLUMNS_PER_CHANNEL);
   }
   const flatbuffers::Vector<int32_t> *embedding_dim_per_channel() const {
-    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_EMBEDDING_DIM_PER_CHANNEL);
+    return GetPointer<const flatbuffers::Vector<int32_t> *>(
+        VT_EMBEDDING_DIM_PER_CHANNEL);
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
@@ -1353,31 +1699,43 @@ struct ConcatEmbeddingsOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Ta
            VerifyOffset(verifier, VT_NUM_COLUMNS_PER_CHANNEL) &&
            verifier.Verify(num_columns_per_channel()) &&
            VerifyOffset(verifier, VT_EMBEDDING_DIM_PER_CHANNEL) &&
-           verifier.Verify(embedding_dim_per_channel()) &&
-           verifier.EndTable();
-  }
-  ConcatEmbeddingsOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(ConcatEmbeddingsOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<ConcatEmbeddingsOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           verifier.Verify(embedding_dim_per_channel()) && verifier.EndTable();
+  }
+  ConcatEmbeddingsOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      ConcatEmbeddingsOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ConcatEmbeddingsOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct ConcatEmbeddingsOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
   void add_num_channels(int32_t num_channels) {
-    fbb_.AddElement<int32_t>(ConcatEmbeddingsOptions::VT_NUM_CHANNELS, num_channels, 0);
+    fbb_.AddElement<int32_t>(ConcatEmbeddingsOptions::VT_NUM_CHANNELS,
+                             num_channels, 0);
   }
-  void add_num_columns_per_channel(flatbuffers::Offset<flatbuffers::Vector<int32_t>> num_columns_per_channel) {
-    fbb_.AddOffset(ConcatEmbeddingsOptions::VT_NUM_COLUMNS_PER_CHANNEL, num_columns_per_channel);
+  void add_num_columns_per_channel(
+      flatbuffers::Offset<flatbuffers::Vector<int32_t>>
+          num_columns_per_channel) {
+    fbb_.AddOffset(ConcatEmbeddingsOptions::VT_NUM_COLUMNS_PER_CHANNEL,
+                   num_columns_per_channel);
   }
-  void add_embedding_dim_per_channel(flatbuffers::Offset<flatbuffers::Vector<int32_t>> embedding_dim_per_channel) {
-    fbb_.AddOffset(ConcatEmbeddingsOptions::VT_EMBEDDING_DIM_PER_CHANNEL, embedding_dim_per_channel);
+  void add_embedding_dim_per_channel(
+      flatbuffers::Offset<flatbuffers::Vector<int32_t>>
+          embedding_dim_per_channel) {
+    fbb_.AddOffset(ConcatEmbeddingsOptions::VT_EMBEDDING_DIM_PER_CHANNEL,
+                   embedding_dim_per_channel);
   }
   explicit ConcatEmbeddingsOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
-  ConcatEmbeddingsOptionsBuilder &operator=(const ConcatEmbeddingsOptionsBuilder &);
+  ConcatEmbeddingsOptionsBuilder &operator=(
+      const ConcatEmbeddingsOptionsBuilder &);
   flatbuffers::Offset<ConcatEmbeddingsOptions> Finish() {
     const auto end = fbb_.EndTable(start_);
     auto o = flatbuffers::Offset<ConcatEmbeddingsOptions>(end);
@@ -1385,11 +1743,13 @@ struct ConcatEmbeddingsOptionsBuilder {
   }
 };
 
-inline flatbuffers::Offset<ConcatEmbeddingsOptions> CreateConcatEmbeddingsOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    int32_t num_channels = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> num_columns_per_channel = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int32_t>> embedding_dim_per_channel = 0) {
+inline flatbuffers::Offset<ConcatEmbeddingsOptions>
+CreateConcatEmbeddingsOptions(flatbuffers::FlatBufferBuilder &_fbb,
+                              int32_t num_channels = 0,
+                              flatbuffers::Offset<flatbuffers::Vector<int32_t>>
+                                  num_columns_per_channel = 0,
+                              flatbuffers::Offset<flatbuffers::Vector<int32_t>>
+                                  embedding_dim_per_channel = 0) {
   ConcatEmbeddingsOptionsBuilder builder_(_fbb);
   builder_.add_embedding_dim_per_channel(embedding_dim_per_channel);
   builder_.add_num_columns_per_channel(num_columns_per_channel);
@@ -1397,54 +1757,61 @@ inline flatbuffers::Offset<ConcatEmbeddingsOptions> CreateConcatEmbeddingsOption
   return builder_.Finish();
 }
 
-inline flatbuffers::Offset<ConcatEmbeddingsOptions> CreateConcatEmbeddingsOptionsDirect(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    int32_t num_channels = 0,
+inline flatbuffers::Offset<ConcatEmbeddingsOptions>
+CreateConcatEmbeddingsOptionsDirect(
+    flatbuffers::FlatBufferBuilder &_fbb, int32_t num_channels = 0,
     const std::vector<int32_t> *num_columns_per_channel = nullptr,
     const std::vector<int32_t> *embedding_dim_per_channel = nullptr) {
   return tflite::CreateConcatEmbeddingsOptions(
-      _fbb,
-      num_channels,
-      num_columns_per_channel ? _fbb.CreateVector<int32_t>(*num_columns_per_channel) : 0,
-      embedding_dim_per_channel ? _fbb.CreateVector<int32_t>(*embedding_dim_per_channel) : 0);
+      _fbb, num_channels,
+      num_columns_per_channel
+          ? _fbb.CreateVector<int32_t>(*num_columns_per_channel)
+          : 0,
+      embedding_dim_per_channel
+          ? _fbb.CreateVector<int32_t>(*embedding_dim_per_channel)
+          : 0);
 }
 
-flatbuffers::Offset<ConcatEmbeddingsOptions> CreateConcatEmbeddingsOptions(flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<ConcatEmbeddingsOptions> CreateConcatEmbeddingsOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct LSHProjectionOptionsT : public flatbuffers::NativeTable {
   typedef LSHProjectionOptions TableType;
   LSHProjectionType type;
-  LSHProjectionOptionsT()
-      : type(LSHProjectionType_UNKNOWN) {
-  }
+  LSHProjectionOptionsT() : type(LSHProjectionType_UNKNOWN) {}
 };
 
-struct LSHProjectionOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+struct LSHProjectionOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
   typedef LSHProjectionOptionsT NativeTableType;
-  enum {
-    VT_TYPE = 4
-  };
+  enum { VT_TYPE = 4 };
   LSHProjectionType type() const {
     return static_cast<LSHProjectionType>(GetField<int8_t>(VT_TYPE, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
-           VerifyField<int8_t>(verifier, VT_TYPE) &&
-           verifier.EndTable();
-  }
-  LSHProjectionOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(LSHProjectionOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<LSHProjectionOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           VerifyField<int8_t>(verifier, VT_TYPE) && verifier.EndTable();
+  }
+  LSHProjectionOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      LSHProjectionOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<LSHProjectionOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct LSHProjectionOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
   void add_type(LSHProjectionType type) {
-    fbb_.AddElement<int8_t>(LSHProjectionOptions::VT_TYPE, static_cast<int8_t>(type), 0);
+    fbb_.AddElement<int8_t>(LSHProjectionOptions::VT_TYPE,
+                            static_cast<int8_t>(type), 0);
   }
   explicit LSHProjectionOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   LSHProjectionOptionsBuilder &operator=(const LSHProjectionOptionsBuilder &);
@@ -1463,29 +1830,25 @@ inline flatbuffers::Offset<LSHProjectionOptions> CreateLSHProjectionOptions(
   return builder_.Finish();
 }
 
-flatbuffers::Offset<LSHProjectionOptions> CreateLSHProjectionOptions(flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<LSHProjectionOptions> CreateLSHProjectionOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct SVDFOptionsT : public flatbuffers::NativeTable {
   typedef SVDFOptions TableType;
   int32_t rank;
   ActivationFunctionType fused_activation_function;
   SVDFOptionsT()
-      : rank(0),
-        fused_activation_function(ActivationFunctionType_NONE) {
-  }
+      : rank(0), fused_activation_function(ActivationFunctionType_NONE) {}
 };
 
 struct SVDFOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef SVDFOptionsT NativeTableType;
-  enum {
-    VT_RANK = 4,
-    VT_FUSED_ACTIVATION_FUNCTION = 6
-  };
-  int32_t rank() const {
-    return GetField<int32_t>(VT_RANK, 0);
-  }
+  enum { VT_RANK = 4, VT_FUSED_ACTIVATION_FUNCTION = 6 };
+  int32_t rank() const { return GetField<int32_t>(VT_RANK, 0); }
   ActivationFunctionType fused_activation_function() const {
-    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
@@ -1493,9 +1856,14 @@ struct SVDFOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            verifier.EndTable();
   }
-  SVDFOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(SVDFOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<SVDFOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  SVDFOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      SVDFOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SVDFOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct SVDFOptionsBuilder {
@@ -1504,11 +1872,13 @@ struct SVDFOptionsBuilder {
   void add_rank(int32_t rank) {
     fbb_.AddElement<int32_t>(SVDFOptions::VT_RANK, rank, 0);
   }
-  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
-    fbb_.AddElement<int8_t>(SVDFOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(SVDFOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
   }
   explicit SVDFOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   SVDFOptionsBuilder &operator=(const SVDFOptionsBuilder &);
@@ -1520,51 +1890,57 @@ struct SVDFOptionsBuilder {
 };
 
 inline flatbuffers::Offset<SVDFOptions> CreateSVDFOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    int32_t rank = 0,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    flatbuffers::FlatBufferBuilder &_fbb, int32_t rank = 0,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
   SVDFOptionsBuilder builder_(_fbb);
   builder_.add_rank(rank);
   builder_.add_fused_activation_function(fused_activation_function);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<SVDFOptions> CreateSVDFOptions(flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<SVDFOptions> CreateSVDFOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct RNNOptionsT : public flatbuffers::NativeTable {
   typedef RNNOptions TableType;
   ActivationFunctionType fused_activation_function;
-  RNNOptionsT()
-      : fused_activation_function(ActivationFunctionType_NONE) {
-  }
+  RNNOptionsT() : fused_activation_function(ActivationFunctionType_NONE) {}
 };
 
 struct RNNOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef RNNOptionsT NativeTableType;
-  enum {
-    VT_FUSED_ACTIVATION_FUNCTION = 4
-  };
+  enum { VT_FUSED_ACTIVATION_FUNCTION = 4 };
   ActivationFunctionType fused_activation_function() const {
-    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            verifier.EndTable();
   }
-  RNNOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(RNNOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<RNNOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  RNNOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      RNNOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<RNNOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct RNNOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
-    fbb_.AddElement<int8_t>(RNNOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(RNNOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
   }
   explicit RNNOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   RNNOptionsBuilder &operator=(const RNNOptionsBuilder &);
@@ -1577,48 +1953,207 @@ struct RNNOptionsBuilder {
 
 inline flatbuffers::Offset<RNNOptions> CreateRNNOptions(
     flatbuffers::FlatBufferBuilder &_fbb,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
   RNNOptionsBuilder builder_(_fbb);
   builder_.add_fused_activation_function(fused_activation_function);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<RNNOptions> CreateRNNOptions(flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<RNNOptions> CreateRNNOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct SequenceRNNOptionsT : public flatbuffers::NativeTable {
+  typedef SequenceRNNOptions TableType;
+  bool time_major;
+  ActivationFunctionType fused_activation_function;
+  SequenceRNNOptionsT()
+      : time_major(false),
+        fused_activation_function(ActivationFunctionType_NONE) {}
+};
+
+struct SequenceRNNOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef SequenceRNNOptionsT NativeTableType;
+  enum { VT_TIME_MAJOR = 4, VT_FUSED_ACTIVATION_FUNCTION = 6 };
+  bool time_major() const { return GetField<uint8_t>(VT_TIME_MAJOR, 0) != 0; }
+  ActivationFunctionType fused_activation_function() const {
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<uint8_t>(verifier, VT_TIME_MAJOR) &&
+           VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           verifier.EndTable();
+  }
+  SequenceRNNOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      SequenceRNNOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SequenceRNNOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const SequenceRNNOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct SequenceRNNOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_time_major(bool time_major) {
+    fbb_.AddElement<uint8_t>(SequenceRNNOptions::VT_TIME_MAJOR,
+                             static_cast<uint8_t>(time_major), 0);
+  }
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(SequenceRNNOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
+  }
+  explicit SequenceRNNOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  SequenceRNNOptionsBuilder &operator=(const SequenceRNNOptionsBuilder &);
+  flatbuffers::Offset<SequenceRNNOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<SequenceRNNOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<SequenceRNNOptions> CreateSequenceRNNOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, bool time_major = false,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
+  SequenceRNNOptionsBuilder builder_(_fbb);
+  builder_.add_fused_activation_function(fused_activation_function);
+  builder_.add_time_major(time_major);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<SequenceRNNOptions> CreateSequenceRNNOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SequenceRNNOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct BidirectionalSequenceRNNOptionsT : public flatbuffers::NativeTable {
+  typedef BidirectionalSequenceRNNOptions TableType;
+  bool time_major;
+  ActivationFunctionType fused_activation_function;
+  BidirectionalSequenceRNNOptionsT()
+      : time_major(false),
+        fused_activation_function(ActivationFunctionType_NONE) {}
+};
+
+struct BidirectionalSequenceRNNOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
+  typedef BidirectionalSequenceRNNOptionsT NativeTableType;
+  enum { VT_TIME_MAJOR = 4, VT_FUSED_ACTIVATION_FUNCTION = 6 };
+  bool time_major() const { return GetField<uint8_t>(VT_TIME_MAJOR, 0) != 0; }
+  ActivationFunctionType fused_activation_function() const {
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<uint8_t>(verifier, VT_TIME_MAJOR) &&
+           VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           verifier.EndTable();
+  }
+  BidirectionalSequenceRNNOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      BidirectionalSequenceRNNOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<BidirectionalSequenceRNNOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb,
+      const BidirectionalSequenceRNNOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct BidirectionalSequenceRNNOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_time_major(bool time_major) {
+    fbb_.AddElement<uint8_t>(BidirectionalSequenceRNNOptions::VT_TIME_MAJOR,
+                             static_cast<uint8_t>(time_major), 0);
+  }
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(
+        BidirectionalSequenceRNNOptions::VT_FUSED_ACTIVATION_FUNCTION,
+        static_cast<int8_t>(fused_activation_function), 0);
+  }
+  explicit BidirectionalSequenceRNNOptionsBuilder(
+      flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  BidirectionalSequenceRNNOptionsBuilder &operator=(
+      const BidirectionalSequenceRNNOptionsBuilder &);
+  flatbuffers::Offset<BidirectionalSequenceRNNOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<BidirectionalSequenceRNNOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<BidirectionalSequenceRNNOptions>
+CreateBidirectionalSequenceRNNOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, bool time_major = false,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
+  BidirectionalSequenceRNNOptionsBuilder builder_(_fbb);
+  builder_.add_fused_activation_function(fused_activation_function);
+  builder_.add_time_major(time_major);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<BidirectionalSequenceRNNOptions>
+CreateBidirectionalSequenceRNNOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const BidirectionalSequenceRNNOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct FullyConnectedOptionsT : public flatbuffers::NativeTable {
   typedef FullyConnectedOptions TableType;
   ActivationFunctionType fused_activation_function;
   FullyConnectedOptionsT()
-      : fused_activation_function(ActivationFunctionType_NONE) {
-  }
+      : fused_activation_function(ActivationFunctionType_NONE) {}
 };
 
-struct FullyConnectedOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+struct FullyConnectedOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
   typedef FullyConnectedOptionsT NativeTableType;
-  enum {
-    VT_FUSED_ACTIVATION_FUNCTION = 4
-  };
+  enum { VT_FUSED_ACTIVATION_FUNCTION = 4 };
   ActivationFunctionType fused_activation_function() const {
-    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            verifier.EndTable();
   }
-  FullyConnectedOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(FullyConnectedOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<FullyConnectedOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  FullyConnectedOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      FullyConnectedOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<FullyConnectedOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct FullyConnectedOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
-    fbb_.AddElement<int8_t>(FullyConnectedOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(FullyConnectedOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
   }
   explicit FullyConnectedOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   FullyConnectedOptionsBuilder &operator=(const FullyConnectedOptionsBuilder &);
@@ -1631,38 +2166,39 @@ struct FullyConnectedOptionsBuilder {
 
 inline flatbuffers::Offset<FullyConnectedOptions> CreateFullyConnectedOptions(
     flatbuffers::FlatBufferBuilder &_fbb,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
   FullyConnectedOptionsBuilder builder_(_fbb);
   builder_.add_fused_activation_function(fused_activation_function);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<FullyConnectedOptions> CreateFullyConnectedOptions(flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<FullyConnectedOptions> CreateFullyConnectedOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct SoftmaxOptionsT : public flatbuffers::NativeTable {
   typedef SoftmaxOptions TableType;
   float beta;
-  SoftmaxOptionsT()
-      : beta(0.0f) {
-  }
+  SoftmaxOptionsT() : beta(0.0f) {}
 };
 
 struct SoftmaxOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef SoftmaxOptionsT NativeTableType;
-  enum {
-    VT_BETA = 4
-  };
-  float beta() const {
-    return GetField<float>(VT_BETA, 0.0f);
-  }
+  enum { VT_BETA = 4 };
+  float beta() const { return GetField<float>(VT_BETA, 0.0f); }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
-           VerifyField<float>(verifier, VT_BETA) &&
-           verifier.EndTable();
-  }
-  SoftmaxOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(SoftmaxOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<SoftmaxOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           VerifyField<float>(verifier, VT_BETA) && verifier.EndTable();
+  }
+  SoftmaxOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      SoftmaxOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SoftmaxOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct SoftmaxOptionsBuilder {
@@ -1672,7 +2208,7 @@ struct SoftmaxOptionsBuilder {
     fbb_.AddElement<float>(SoftmaxOptions::VT_BETA, beta, 0.0f);
   }
   explicit SoftmaxOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   SoftmaxOptionsBuilder &operator=(const SoftmaxOptionsBuilder &);
@@ -1684,36 +2220,32 @@ struct SoftmaxOptionsBuilder {
 };
 
 inline flatbuffers::Offset<SoftmaxOptions> CreateSoftmaxOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    float beta = 0.0f) {
+    flatbuffers::FlatBufferBuilder &_fbb, float beta = 0.0f) {
   SoftmaxOptionsBuilder builder_(_fbb);
   builder_.add_beta(beta);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<SoftmaxOptions> CreateSoftmaxOptions(flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<SoftmaxOptions> CreateSoftmaxOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct ConcatenationOptionsT : public flatbuffers::NativeTable {
   typedef ConcatenationOptions TableType;
   int32_t axis;
   ActivationFunctionType fused_activation_function;
   ConcatenationOptionsT()
-      : axis(0),
-        fused_activation_function(ActivationFunctionType_NONE) {
-  }
+      : axis(0), fused_activation_function(ActivationFunctionType_NONE) {}
 };
 
-struct ConcatenationOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+struct ConcatenationOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
   typedef ConcatenationOptionsT NativeTableType;
-  enum {
-    VT_AXIS = 4,
-    VT_FUSED_ACTIVATION_FUNCTION = 6
-  };
-  int32_t axis() const {
-    return GetField<int32_t>(VT_AXIS, 0);
-  }
+  enum { VT_AXIS = 4, VT_FUSED_ACTIVATION_FUNCTION = 6 };
+  int32_t axis() const { return GetField<int32_t>(VT_AXIS, 0); }
   ActivationFunctionType fused_activation_function() const {
-    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
@@ -1721,9 +2253,14 @@ struct ConcatenationOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            verifier.EndTable();
   }
-  ConcatenationOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(ConcatenationOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<ConcatenationOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  ConcatenationOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      ConcatenationOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ConcatenationOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct ConcatenationOptionsBuilder {
@@ -1732,11 +2269,13 @@ struct ConcatenationOptionsBuilder {
   void add_axis(int32_t axis) {
     fbb_.AddElement<int32_t>(ConcatenationOptions::VT_AXIS, axis, 0);
   }
-  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
-    fbb_.AddElement<int8_t>(ConcatenationOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(ConcatenationOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
   }
   explicit ConcatenationOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   ConcatenationOptionsBuilder &operator=(const ConcatenationOptionsBuilder &);
@@ -1748,51 +2287,57 @@ struct ConcatenationOptionsBuilder {
 };
 
 inline flatbuffers::Offset<ConcatenationOptions> CreateConcatenationOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    int32_t axis = 0,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    flatbuffers::FlatBufferBuilder &_fbb, int32_t axis = 0,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
   ConcatenationOptionsBuilder builder_(_fbb);
   builder_.add_axis(axis);
   builder_.add_fused_activation_function(fused_activation_function);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<ConcatenationOptions> CreateConcatenationOptions(flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<ConcatenationOptions> CreateConcatenationOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct AddOptionsT : public flatbuffers::NativeTable {
   typedef AddOptions TableType;
   ActivationFunctionType fused_activation_function;
-  AddOptionsT()
-      : fused_activation_function(ActivationFunctionType_NONE) {
-  }
+  AddOptionsT() : fused_activation_function(ActivationFunctionType_NONE) {}
 };
 
 struct AddOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef AddOptionsT NativeTableType;
-  enum {
-    VT_FUSED_ACTIVATION_FUNCTION = 4
-  };
+  enum { VT_FUSED_ACTIVATION_FUNCTION = 4 };
   ActivationFunctionType fused_activation_function() const {
-    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            verifier.EndTable();
   }
-  AddOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(AddOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<AddOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  AddOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      AddOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<AddOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct AddOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
-    fbb_.AddElement<int8_t>(AddOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(AddOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
   }
   explicit AddOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   AddOptionsBuilder &operator=(const AddOptionsBuilder &);
@@ -1805,48 +2350,55 @@ struct AddOptionsBuilder {
 
 inline flatbuffers::Offset<AddOptions> CreateAddOptions(
     flatbuffers::FlatBufferBuilder &_fbb,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
   AddOptionsBuilder builder_(_fbb);
   builder_.add_fused_activation_function(fused_activation_function);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<AddOptions> CreateAddOptions(flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<AddOptions> CreateAddOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct MulOptionsT : public flatbuffers::NativeTable {
   typedef MulOptions TableType;
   ActivationFunctionType fused_activation_function;
-  MulOptionsT()
-      : fused_activation_function(ActivationFunctionType_NONE) {
-  }
+  MulOptionsT() : fused_activation_function(ActivationFunctionType_NONE) {}
 };
 
 struct MulOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef MulOptionsT NativeTableType;
-  enum {
-    VT_FUSED_ACTIVATION_FUNCTION = 4
-  };
+  enum { VT_FUSED_ACTIVATION_FUNCTION = 4 };
   ActivationFunctionType fused_activation_function() const {
-    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            verifier.EndTable();
   }
-  MulOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(MulOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<MulOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  MulOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      MulOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<MulOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct MulOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
-    fbb_.AddElement<int8_t>(MulOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(MulOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
   }
   explicit MulOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   MulOptionsBuilder &operator=(const MulOptionsBuilder &);
@@ -1859,48 +2411,55 @@ struct MulOptionsBuilder {
 
 inline flatbuffers::Offset<MulOptions> CreateMulOptions(
     flatbuffers::FlatBufferBuilder &_fbb,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
   MulOptionsBuilder builder_(_fbb);
   builder_.add_fused_activation_function(fused_activation_function);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<MulOptions> CreateMulOptions(flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<MulOptions> CreateMulOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct L2NormOptionsT : public flatbuffers::NativeTable {
   typedef L2NormOptions TableType;
   ActivationFunctionType fused_activation_function;
-  L2NormOptionsT()
-      : fused_activation_function(ActivationFunctionType_NONE) {
-  }
+  L2NormOptionsT() : fused_activation_function(ActivationFunctionType_NONE) {}
 };
 
 struct L2NormOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef L2NormOptionsT NativeTableType;
-  enum {
-    VT_FUSED_ACTIVATION_FUNCTION = 4
-  };
+  enum { VT_FUSED_ACTIVATION_FUNCTION = 4 };
   ActivationFunctionType fused_activation_function() const {
-    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            verifier.EndTable();
   }
-  L2NormOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(L2NormOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<L2NormOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  L2NormOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      L2NormOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<L2NormOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct L2NormOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
-    fbb_.AddElement<int8_t>(L2NormOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(L2NormOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
   }
   explicit L2NormOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   L2NormOptionsBuilder &operator=(const L2NormOptionsBuilder &);
@@ -1913,13 +2472,16 @@ struct L2NormOptionsBuilder {
 
 inline flatbuffers::Offset<L2NormOptions> CreateL2NormOptions(
     flatbuffers::FlatBufferBuilder &_fbb,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE) {
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
   L2NormOptionsBuilder builder_(_fbb);
   builder_.add_fused_activation_function(fused_activation_function);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<L2NormOptions> CreateL2NormOptions(flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<L2NormOptions> CreateL2NormOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct LocalResponseNormalizationOptionsT : public flatbuffers::NativeTable {
   typedef LocalResponseNormalizationOptions TableType;
@@ -1928,66 +2490,61 @@ struct LocalResponseNormalizationOptionsT : public flatbuffers::NativeTable {
   float alpha;
   float beta;
   LocalResponseNormalizationOptionsT()
-      : radius(0),
-        bias(0.0f),
-        alpha(0.0f),
-        beta(0.0f) {
-  }
+      : radius(0), bias(0.0f), alpha(0.0f), beta(0.0f) {}
 };
 
-struct LocalResponseNormalizationOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+struct LocalResponseNormalizationOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
   typedef LocalResponseNormalizationOptionsT NativeTableType;
-  enum {
-    VT_RADIUS = 4,
-    VT_BIAS = 6,
-    VT_ALPHA = 8,
-    VT_BETA = 10
-  };
-  int32_t radius() const {
-    return GetField<int32_t>(VT_RADIUS, 0);
-  }
-  float bias() const {
-    return GetField<float>(VT_BIAS, 0.0f);
-  }
-  float alpha() const {
-    return GetField<float>(VT_ALPHA, 0.0f);
-  }
-  float beta() const {
-    return GetField<float>(VT_BETA, 0.0f);
-  }
+  enum { VT_RADIUS = 4, VT_BIAS = 6, VT_ALPHA = 8, VT_BETA = 10 };
+  int32_t radius() const { return GetField<int32_t>(VT_RADIUS, 0); }
+  float bias() const { return GetField<float>(VT_BIAS, 0.0f); }
+  float alpha() const { return GetField<float>(VT_ALPHA, 0.0f); }
+  float beta() const { return GetField<float>(VT_BETA, 0.0f); }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int32_t>(verifier, VT_RADIUS) &&
            VerifyField<float>(verifier, VT_BIAS) &&
            VerifyField<float>(verifier, VT_ALPHA) &&
-           VerifyField<float>(verifier, VT_BETA) &&
-           verifier.EndTable();
-  }
-  LocalResponseNormalizationOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(LocalResponseNormalizationOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<LocalResponseNormalizationOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const LocalResponseNormalizationOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           VerifyField<float>(verifier, VT_BETA) && verifier.EndTable();
+  }
+  LocalResponseNormalizationOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      LocalResponseNormalizationOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<LocalResponseNormalizationOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb,
+      const LocalResponseNormalizationOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct LocalResponseNormalizationOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
   void add_radius(int32_t radius) {
-    fbb_.AddElement<int32_t>(LocalResponseNormalizationOptions::VT_RADIUS, radius, 0);
+    fbb_.AddElement<int32_t>(LocalResponseNormalizationOptions::VT_RADIUS,
+                             radius, 0);
   }
   void add_bias(float bias) {
-    fbb_.AddElement<float>(LocalResponseNormalizationOptions::VT_BIAS, bias, 0.0f);
+    fbb_.AddElement<float>(LocalResponseNormalizationOptions::VT_BIAS, bias,
+                           0.0f);
   }
   void add_alpha(float alpha) {
-    fbb_.AddElement<float>(LocalResponseNormalizationOptions::VT_ALPHA, alpha, 0.0f);
+    fbb_.AddElement<float>(LocalResponseNormalizationOptions::VT_ALPHA, alpha,
+                           0.0f);
   }
   void add_beta(float beta) {
-    fbb_.AddElement<float>(LocalResponseNormalizationOptions::VT_BETA, beta, 0.0f);
+    fbb_.AddElement<float>(LocalResponseNormalizationOptions::VT_BETA, beta,
+                           0.0f);
   }
-  explicit LocalResponseNormalizationOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+  explicit LocalResponseNormalizationOptionsBuilder(
+      flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
-  LocalResponseNormalizationOptionsBuilder &operator=(const LocalResponseNormalizationOptionsBuilder &);
+  LocalResponseNormalizationOptionsBuilder &operator=(
+      const LocalResponseNormalizationOptionsBuilder &);
   flatbuffers::Offset<LocalResponseNormalizationOptions> Finish() {
     const auto end = fbb_.EndTable(start_);
     auto o = flatbuffers::Offset<LocalResponseNormalizationOptions>(end);
@@ -1995,12 +2552,10 @@ struct LocalResponseNormalizationOptionsBuilder {
   }
 };
 
-inline flatbuffers::Offset<LocalResponseNormalizationOptions> CreateLocalResponseNormalizationOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    int32_t radius = 0,
-    float bias = 0.0f,
-    float alpha = 0.0f,
-    float beta = 0.0f) {
+inline flatbuffers::Offset<LocalResponseNormalizationOptions>
+CreateLocalResponseNormalizationOptions(flatbuffers::FlatBufferBuilder &_fbb,
+                                        int32_t radius = 0, float bias = 0.0f,
+                                        float alpha = 0.0f, float beta = 0.0f) {
   LocalResponseNormalizationOptionsBuilder builder_(_fbb);
   builder_.add_beta(beta);
   builder_.add_alpha(alpha);
@@ -2009,7 +2564,11 @@ inline flatbuffers::Offset<LocalResponseNormalizationOptions> CreateLocalRespons
   return builder_.Finish();
 }
 
-flatbuffers::Offset<LocalResponseNormalizationOptions> CreateLocalResponseNormalizationOptions(flatbuffers::FlatBufferBuilder &_fbb, const LocalResponseNormalizationOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<LocalResponseNormalizationOptions>
+CreateLocalResponseNormalizationOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const LocalResponseNormalizationOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct LSTMOptionsT : public flatbuffers::NativeTable {
   typedef LSTMOptions TableType;
@@ -2019,43 +2578,41 @@ struct LSTMOptionsT : public flatbuffers::NativeTable {
   LSTMOptionsT()
       : fused_activation_function(ActivationFunctionType_NONE),
         cell_clip(0.0f),
-        proj_clip(0.0f) {
-  }
+        proj_clip(0.0f) {}
 };
 
 struct LSTMOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef LSTMOptionsT NativeTableType;
-  enum {
-    VT_FUSED_ACTIVATION_FUNCTION = 4,
-    VT_CELL_CLIP = 6,
-    VT_PROJ_CLIP = 8
-  };
+  enum { VT_FUSED_ACTIVATION_FUNCTION = 4, VT_CELL_CLIP = 6, VT_PROJ_CLIP = 8 };
   ActivationFunctionType fused_activation_function() const {
-    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
-  }
-  float cell_clip() const {
-    return GetField<float>(VT_CELL_CLIP, 0.0f);
-  }
-  float proj_clip() const {
-    return GetField<float>(VT_PROJ_CLIP, 0.0f);
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
+  float cell_clip() const { return GetField<float>(VT_CELL_CLIP, 0.0f); }
+  float proj_clip() const { return GetField<float>(VT_PROJ_CLIP, 0.0f); }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            VerifyField<float>(verifier, VT_CELL_CLIP) &&
-           VerifyField<float>(verifier, VT_PROJ_CLIP) &&
-           verifier.EndTable();
-  }
-  LSTMOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(LSTMOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<LSTMOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           VerifyField<float>(verifier, VT_PROJ_CLIP) && verifier.EndTable();
+  }
+  LSTMOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      LSTMOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<LSTMOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct LSTMOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
-    fbb_.AddElement<int8_t>(LSTMOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(LSTMOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
   }
   void add_cell_clip(float cell_clip) {
     fbb_.AddElement<float>(LSTMOptions::VT_CELL_CLIP, cell_clip, 0.0f);
@@ -2064,7 +2621,7 @@ struct LSTMOptionsBuilder {
     fbb_.AddElement<float>(LSTMOptions::VT_PROJ_CLIP, proj_clip, 0.0f);
   }
   explicit LSTMOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   LSTMOptionsBuilder &operator=(const LSTMOptionsBuilder &);
@@ -2077,9 +2634,9 @@ struct LSTMOptionsBuilder {
 
 inline flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(
     flatbuffers::FlatBufferBuilder &_fbb,
-    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE,
-    float cell_clip = 0.0f,
-    float proj_clip = 0.0f) {
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE,
+    float cell_clip = 0.0f, float proj_clip = 0.0f) {
   LSTMOptionsBuilder builder_(_fbb);
   builder_.add_proj_clip(proj_clip);
   builder_.add_cell_clip(cell_clip);
@@ -2087,52 +2644,47 @@ inline flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(
   return builder_.Finish();
 }
 
-flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct ResizeBilinearOptionsT : public flatbuffers::NativeTable {
   typedef ResizeBilinearOptions TableType;
-  int32_t new_height;
-  int32_t new_width;
-  ResizeBilinearOptionsT()
-      : new_height(0),
-        new_width(0) {
-  }
+  bool align_corners;
+  ResizeBilinearOptionsT() : align_corners(false) {}
 };
 
-struct ResizeBilinearOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+struct ResizeBilinearOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
   typedef ResizeBilinearOptionsT NativeTableType;
-  enum {
-    VT_NEW_HEIGHT = 4,
-    VT_NEW_WIDTH = 6
-  };
-  int32_t new_height() const {
-    return GetField<int32_t>(VT_NEW_HEIGHT, 0);
-  }
-  int32_t new_width() const {
-    return GetField<int32_t>(VT_NEW_WIDTH, 0);
+  enum { VT_ALIGN_CORNERS = 8 };
+  bool align_corners() const {
+    return GetField<uint8_t>(VT_ALIGN_CORNERS, 0) != 0;
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
-           VerifyField<int32_t>(verifier, VT_NEW_HEIGHT) &&
-           VerifyField<int32_t>(verifier, VT_NEW_WIDTH) &&
+           VerifyField<uint8_t>(verifier, VT_ALIGN_CORNERS) &&
            verifier.EndTable();
   }
-  ResizeBilinearOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(ResizeBilinearOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<ResizeBilinearOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  ResizeBilinearOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      ResizeBilinearOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ResizeBilinearOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct ResizeBilinearOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_new_height(int32_t new_height) {
-    fbb_.AddElement<int32_t>(ResizeBilinearOptions::VT_NEW_HEIGHT, new_height, 0);
-  }
-  void add_new_width(int32_t new_width) {
-    fbb_.AddElement<int32_t>(ResizeBilinearOptions::VT_NEW_WIDTH, new_width, 0);
+  void add_align_corners(bool align_corners) {
+    fbb_.AddElement<uint8_t>(ResizeBilinearOptions::VT_ALIGN_CORNERS,
+                             static_cast<uint8_t>(align_corners), 0);
   }
   explicit ResizeBilinearOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   ResizeBilinearOptionsBuilder &operator=(const ResizeBilinearOptionsBuilder &);
@@ -2144,41 +2696,38 @@ struct ResizeBilinearOptionsBuilder {
 };
 
 inline flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    int32_t new_height = 0,
-    int32_t new_width = 0) {
+    flatbuffers::FlatBufferBuilder &_fbb, bool align_corners = false) {
   ResizeBilinearOptionsBuilder builder_(_fbb);
-  builder_.add_new_width(new_width);
-  builder_.add_new_height(new_height);
+  builder_.add_align_corners(align_corners);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct CallOptionsT : public flatbuffers::NativeTable {
   typedef CallOptions TableType;
   uint32_t subgraph;
-  CallOptionsT()
-      : subgraph(0) {
-  }
+  CallOptionsT() : subgraph(0) {}
 };
 
 struct CallOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef CallOptionsT NativeTableType;
-  enum {
-    VT_SUBGRAPH = 4
-  };
-  uint32_t subgraph() const {
-    return GetField<uint32_t>(VT_SUBGRAPH, 0);
-  }
+  enum { VT_SUBGRAPH = 4 };
+  uint32_t subgraph() const { return GetField<uint32_t>(VT_SUBGRAPH, 0); }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
-           VerifyField<uint32_t>(verifier, VT_SUBGRAPH) &&
-           verifier.EndTable();
-  }
-  CallOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(CallOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<CallOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           VerifyField<uint32_t>(verifier, VT_SUBGRAPH) && verifier.EndTable();
+  }
+  CallOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      CallOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<CallOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct CallOptionsBuilder {
@@ -2188,7 +2737,7 @@ struct CallOptionsBuilder {
     fbb_.AddElement<uint32_t>(CallOptions::VT_SUBGRAPH, subgraph, 0);
   }
   explicit CallOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   CallOptionsBuilder &operator=(const CallOptionsBuilder &);
@@ -2200,49 +2749,96 @@ struct CallOptionsBuilder {
 };
 
 inline flatbuffers::Offset<CallOptions> CreateCallOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    uint32_t subgraph = 0) {
+    flatbuffers::FlatBufferBuilder &_fbb, uint32_t subgraph = 0) {
   CallOptionsBuilder builder_(_fbb);
   builder_.add_subgraph(subgraph);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<CallOptions> CreateCallOptions(flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<CallOptions> CreateCallOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct PadOptionsT : public flatbuffers::NativeTable {
+  typedef PadOptions TableType;
+  PadOptionsT() {}
+};
+
+struct PadOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef PadOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) && verifier.EndTable();
+  }
+  PadOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      PadOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<PadOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const PadOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct PadOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit PadOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  PadOptionsBuilder &operator=(const PadOptionsBuilder &);
+  flatbuffers::Offset<PadOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<PadOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<PadOptions> CreatePadOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  PadOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<PadOptions> CreatePadOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const PadOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct ReshapeOptionsT : public flatbuffers::NativeTable {
   typedef ReshapeOptions TableType;
   std::vector<int32_t> new_shape;
-  ReshapeOptionsT() {
-  }
+  ReshapeOptionsT() {}
 };
 
 struct ReshapeOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef ReshapeOptionsT NativeTableType;
-  enum {
-    VT_NEW_SHAPE = 4
-  };
+  enum { VT_NEW_SHAPE = 4 };
   const flatbuffers::Vector<int32_t> *new_shape() const {
     return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_NEW_SHAPE);
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyOffset(verifier, VT_NEW_SHAPE) &&
-           verifier.Verify(new_shape()) &&
-           verifier.EndTable();
-  }
-  ReshapeOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(ReshapeOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<ReshapeOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_NEW_SHAPE) &&
+           verifier.Verify(new_shape()) && verifier.EndTable();
+  }
+  ReshapeOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      ReshapeOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ReshapeOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct ReshapeOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_new_shape(flatbuffers::Offset<flatbuffers::Vector<int32_t>> new_shape) {
+  void add_new_shape(
+      flatbuffers::Offset<flatbuffers::Vector<int32_t>> new_shape) {
     fbb_.AddOffset(ReshapeOptions::VT_NEW_SHAPE, new_shape);
   }
   explicit ReshapeOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   ReshapeOptionsBuilder &operator=(const ReshapeOptionsBuilder &);
@@ -2265,38 +2861,122 @@ inline flatbuffers::Offset<ReshapeOptions> CreateReshapeOptionsDirect(
     flatbuffers::FlatBufferBuilder &_fbb,
     const std::vector<int32_t> *new_shape = nullptr) {
   return tflite::CreateReshapeOptions(
-      _fbb,
-      new_shape ? _fbb.CreateVector<int32_t>(*new_shape) : 0);
+      _fbb, new_shape ? _fbb.CreateVector<int32_t>(*new_shape) : 0);
 }
 
-flatbuffers::Offset<ReshapeOptions> CreateReshapeOptions(flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<ReshapeOptions> CreateReshapeOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
-struct SkipGramOptionsT : public flatbuffers::NativeTable {
-  typedef SkipGramOptions TableType;
-  int32_t ngram_size;
-  int32_t max_skip_size;
-  bool include_all_ngrams;
-  SkipGramOptionsT()
-      : ngram_size(0),
-        max_skip_size(0),
-        include_all_ngrams(false) {
-  }
+struct SpaceToBatchNDOptionsT : public flatbuffers::NativeTable {
+  typedef SpaceToBatchNDOptions TableType;
+  SpaceToBatchNDOptionsT() {}
 };
 
-struct SkipGramOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
-  typedef SkipGramOptionsT NativeTableType;
-  enum {
-    VT_NGRAM_SIZE = 4,
-    VT_MAX_SKIP_SIZE = 6,
-    VT_INCLUDE_ALL_NGRAMS = 8
-  };
-  int32_t ngram_size() const {
-    return GetField<int32_t>(VT_NGRAM_SIZE, 0);
-  }
-  int32_t max_skip_size() const {
-    return GetField<int32_t>(VT_MAX_SKIP_SIZE, 0);
-  }
-  bool include_all_ngrams() const {
+struct SpaceToBatchNDOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
+  typedef SpaceToBatchNDOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) && verifier.EndTable();
+  }
+  SpaceToBatchNDOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      SpaceToBatchNDOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SpaceToBatchNDOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const SpaceToBatchNDOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct SpaceToBatchNDOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit SpaceToBatchNDOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  SpaceToBatchNDOptionsBuilder &operator=(const SpaceToBatchNDOptionsBuilder &);
+  flatbuffers::Offset<SpaceToBatchNDOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<SpaceToBatchNDOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<SpaceToBatchNDOptions> CreateSpaceToBatchNDOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  SpaceToBatchNDOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<SpaceToBatchNDOptions> CreateSpaceToBatchNDOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SpaceToBatchNDOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct BatchToSpaceNDOptionsT : public flatbuffers::NativeTable {
+  typedef BatchToSpaceNDOptions TableType;
+  BatchToSpaceNDOptionsT() {}
+};
+
+struct BatchToSpaceNDOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
+  typedef BatchToSpaceNDOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) && verifier.EndTable();
+  }
+  BatchToSpaceNDOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      BatchToSpaceNDOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<BatchToSpaceNDOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const BatchToSpaceNDOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct BatchToSpaceNDOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit BatchToSpaceNDOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  BatchToSpaceNDOptionsBuilder &operator=(const BatchToSpaceNDOptionsBuilder &);
+  flatbuffers::Offset<BatchToSpaceNDOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<BatchToSpaceNDOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<BatchToSpaceNDOptions> CreateBatchToSpaceNDOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  BatchToSpaceNDOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<BatchToSpaceNDOptions> CreateBatchToSpaceNDOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const BatchToSpaceNDOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct SkipGramOptionsT : public flatbuffers::NativeTable {
+  typedef SkipGramOptions TableType;
+  int32_t ngram_size;
+  int32_t max_skip_size;
+  bool include_all_ngrams;
+  SkipGramOptionsT()
+      : ngram_size(0), max_skip_size(0), include_all_ngrams(false) {}
+};
+
+struct SkipGramOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef SkipGramOptionsT NativeTableType;
+  enum { VT_NGRAM_SIZE = 4, VT_MAX_SKIP_SIZE = 6, VT_INCLUDE_ALL_NGRAMS = 8 };
+  int32_t ngram_size() const { return GetField<int32_t>(VT_NGRAM_SIZE, 0); }
+  int32_t max_skip_size() const {
+    return GetField<int32_t>(VT_MAX_SKIP_SIZE, 0);
+  }
+  bool include_all_ngrams() const {
     return GetField<uint8_t>(VT_INCLUDE_ALL_NGRAMS, 0) != 0;
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
@@ -2306,9 +2986,14 @@ struct SkipGramOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
            VerifyField<uint8_t>(verifier, VT_INCLUDE_ALL_NGRAMS) &&
            verifier.EndTable();
   }
-  SkipGramOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(SkipGramOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<SkipGramOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  SkipGramOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      SkipGramOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SkipGramOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct SkipGramOptionsBuilder {
@@ -2318,13 +3003,15 @@ struct SkipGramOptionsBuilder {
     fbb_.AddElement<int32_t>(SkipGramOptions::VT_NGRAM_SIZE, ngram_size, 0);
   }
   void add_max_skip_size(int32_t max_skip_size) {
-    fbb_.AddElement<int32_t>(SkipGramOptions::VT_MAX_SKIP_SIZE, max_skip_size, 0);
+    fbb_.AddElement<int32_t>(SkipGramOptions::VT_MAX_SKIP_SIZE, max_skip_size,
+                             0);
   }
   void add_include_all_ngrams(bool include_all_ngrams) {
-    fbb_.AddElement<uint8_t>(SkipGramOptions::VT_INCLUDE_ALL_NGRAMS, static_cast<uint8_t>(include_all_ngrams), 0);
+    fbb_.AddElement<uint8_t>(SkipGramOptions::VT_INCLUDE_ALL_NGRAMS,
+                             static_cast<uint8_t>(include_all_ngrams), 0);
   }
   explicit SkipGramOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   SkipGramOptionsBuilder &operator=(const SkipGramOptionsBuilder &);
@@ -2336,10 +3023,8 @@ struct SkipGramOptionsBuilder {
 };
 
 inline flatbuffers::Offset<SkipGramOptions> CreateSkipGramOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    int32_t ngram_size = 0,
-    int32_t max_skip_size = 0,
-    bool include_all_ngrams = false) {
+    flatbuffers::FlatBufferBuilder &_fbb, int32_t ngram_size = 0,
+    int32_t max_skip_size = 0, bool include_all_ngrams = false) {
   SkipGramOptionsBuilder builder_(_fbb);
   builder_.add_max_skip_size(max_skip_size);
   builder_.add_ngram_size(ngram_size);
@@ -2347,32 +3032,33 @@ inline flatbuffers::Offset<SkipGramOptions> CreateSkipGramOptions(
   return builder_.Finish();
 }
 
-flatbuffers::Offset<SkipGramOptions> CreateSkipGramOptions(flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<SkipGramOptions> CreateSkipGramOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct SpaceToDepthOptionsT : public flatbuffers::NativeTable {
   typedef SpaceToDepthOptions TableType;
   int32_t block_size;
-  SpaceToDepthOptionsT()
-      : block_size(0) {
-  }
+  SpaceToDepthOptionsT() : block_size(0) {}
 };
 
-struct SpaceToDepthOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+struct SpaceToDepthOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
   typedef SpaceToDepthOptionsT NativeTableType;
-  enum {
-    VT_BLOCK_SIZE = 4
-  };
-  int32_t block_size() const {
-    return GetField<int32_t>(VT_BLOCK_SIZE, 0);
-  }
+  enum { VT_BLOCK_SIZE = 4 };
+  int32_t block_size() const { return GetField<int32_t>(VT_BLOCK_SIZE, 0); }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
-           VerifyField<int32_t>(verifier, VT_BLOCK_SIZE) &&
-           verifier.EndTable();
-  }
-  SpaceToDepthOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(SpaceToDepthOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<SpaceToDepthOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           VerifyField<int32_t>(verifier, VT_BLOCK_SIZE) && verifier.EndTable();
+  }
+  SpaceToDepthOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      SpaceToDepthOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SpaceToDepthOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct SpaceToDepthOptionsBuilder {
@@ -2382,7 +3068,7 @@ struct SpaceToDepthOptionsBuilder {
     fbb_.AddElement<int32_t>(SpaceToDepthOptions::VT_BLOCK_SIZE, block_size, 0);
   }
   explicit SpaceToDepthOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   SpaceToDepthOptionsBuilder &operator=(const SpaceToDepthOptionsBuilder &);
@@ -2394,52 +3080,180 @@ struct SpaceToDepthOptionsBuilder {
 };
 
 inline flatbuffers::Offset<SpaceToDepthOptions> CreateSpaceToDepthOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    int32_t block_size = 0) {
+    flatbuffers::FlatBufferBuilder &_fbb, int32_t block_size = 0) {
   SpaceToDepthOptionsBuilder builder_(_fbb);
   builder_.add_block_size(block_size);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<SpaceToDepthOptions> CreateSpaceToDepthOptions(flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<SpaceToDepthOptions> CreateSpaceToDepthOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct SubOptionsT : public flatbuffers::NativeTable {
+  typedef SubOptions TableType;
+  ActivationFunctionType fused_activation_function;
+  SubOptionsT() : fused_activation_function(ActivationFunctionType_NONE) {}
+};
+
+struct SubOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef SubOptionsT NativeTableType;
+  enum { VT_FUSED_ACTIVATION_FUNCTION = 4 };
+  ActivationFunctionType fused_activation_function() const {
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           verifier.EndTable();
+  }
+  SubOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      SubOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SubOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const SubOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct SubOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(SubOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
+  }
+  explicit SubOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  SubOptionsBuilder &operator=(const SubOptionsBuilder &);
+  flatbuffers::Offset<SubOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<SubOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<SubOptions> CreateSubOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
+  SubOptionsBuilder builder_(_fbb);
+  builder_.add_fused_activation_function(fused_activation_function);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<SubOptions> CreateSubOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SubOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct DivOptionsT : public flatbuffers::NativeTable {
+  typedef DivOptions TableType;
+  ActivationFunctionType fused_activation_function;
+  DivOptionsT() : fused_activation_function(ActivationFunctionType_NONE) {}
+};
+
+struct DivOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef DivOptionsT NativeTableType;
+  enum { VT_FUSED_ACTIVATION_FUNCTION = 4 };
+  ActivationFunctionType fused_activation_function() const {
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           verifier.EndTable();
+  }
+  DivOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      DivOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<DivOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const DivOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct DivOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(DivOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
+  }
+  explicit DivOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  DivOptionsBuilder &operator=(const DivOptionsBuilder &);
+  flatbuffers::Offset<DivOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<DivOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<DivOptions> CreateDivOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
+  DivOptionsBuilder builder_(_fbb);
+  builder_.add_fused_activation_function(fused_activation_function);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<DivOptions> CreateDivOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const DivOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct EmbeddingLookupSparseOptionsT : public flatbuffers::NativeTable {
   typedef EmbeddingLookupSparseOptions TableType;
   CombinerType combiner;
-  EmbeddingLookupSparseOptionsT()
-      : combiner(CombinerType_SUM) {
-  }
+  EmbeddingLookupSparseOptionsT() : combiner(CombinerType_SUM) {}
 };
 
-struct EmbeddingLookupSparseOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+struct EmbeddingLookupSparseOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
   typedef EmbeddingLookupSparseOptionsT NativeTableType;
-  enum {
-    VT_COMBINER = 4
-  };
+  enum { VT_COMBINER = 4 };
   CombinerType combiner() const {
     return static_cast<CombinerType>(GetField<int8_t>(VT_COMBINER, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
-           VerifyField<int8_t>(verifier, VT_COMBINER) &&
-           verifier.EndTable();
-  }
-  EmbeddingLookupSparseOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(EmbeddingLookupSparseOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<EmbeddingLookupSparseOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const EmbeddingLookupSparseOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           VerifyField<int8_t>(verifier, VT_COMBINER) && verifier.EndTable();
+  }
+  EmbeddingLookupSparseOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      EmbeddingLookupSparseOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<EmbeddingLookupSparseOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb,
+      const EmbeddingLookupSparseOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct EmbeddingLookupSparseOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
   void add_combiner(CombinerType combiner) {
-    fbb_.AddElement<int8_t>(EmbeddingLookupSparseOptions::VT_COMBINER, static_cast<int8_t>(combiner), 0);
+    fbb_.AddElement<int8_t>(EmbeddingLookupSparseOptions::VT_COMBINER,
+                            static_cast<int8_t>(combiner), 0);
   }
-  explicit EmbeddingLookupSparseOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+  explicit EmbeddingLookupSparseOptionsBuilder(
+      flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
-  EmbeddingLookupSparseOptionsBuilder &operator=(const EmbeddingLookupSparseOptionsBuilder &);
+  EmbeddingLookupSparseOptionsBuilder &operator=(
+      const EmbeddingLookupSparseOptionsBuilder &);
   flatbuffers::Offset<EmbeddingLookupSparseOptions> Finish() {
     const auto end = fbb_.EndTable(start_);
     auto o = flatbuffers::Offset<EmbeddingLookupSparseOptions>(end);
@@ -2447,31 +3261,397 @@ struct EmbeddingLookupSparseOptionsBuilder {
   }
 };
 
-inline flatbuffers::Offset<EmbeddingLookupSparseOptions> CreateEmbeddingLookupSparseOptions(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    CombinerType combiner = CombinerType_SUM) {
+inline flatbuffers::Offset<EmbeddingLookupSparseOptions>
+CreateEmbeddingLookupSparseOptions(flatbuffers::FlatBufferBuilder &_fbb,
+                                   CombinerType combiner = CombinerType_SUM) {
   EmbeddingLookupSparseOptionsBuilder builder_(_fbb);
   builder_.add_combiner(combiner);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<EmbeddingLookupSparseOptions> CreateEmbeddingLookupSparseOptions(flatbuffers::FlatBufferBuilder &_fbb, const EmbeddingLookupSparseOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<EmbeddingLookupSparseOptions>
+CreateEmbeddingLookupSparseOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const EmbeddingLookupSparseOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct GatherOptionsT : public flatbuffers::NativeTable {
+  typedef GatherOptions TableType;
+  int32_t axis;
+  GatherOptionsT() : axis(0) {}
+};
+
+struct GatherOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef GatherOptionsT NativeTableType;
+  enum { VT_AXIS = 4 };
+  int32_t axis() const { return GetField<int32_t>(VT_AXIS, 0); }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int32_t>(verifier, VT_AXIS) && verifier.EndTable();
+  }
+  GatherOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      GatherOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<GatherOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const GatherOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct GatherOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_axis(int32_t axis) {
+    fbb_.AddElement<int32_t>(GatherOptions::VT_AXIS, axis, 0);
+  }
+  explicit GatherOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  GatherOptionsBuilder &operator=(const GatherOptionsBuilder &);
+  flatbuffers::Offset<GatherOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<GatherOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<GatherOptions> CreateGatherOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, int32_t axis = 0) {
+  GatherOptionsBuilder builder_(_fbb);
+  builder_.add_axis(axis);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<GatherOptions> CreateGatherOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const GatherOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct TransposeOptionsT : public flatbuffers::NativeTable {
+  typedef TransposeOptions TableType;
+  TransposeOptionsT() {}
+};
+
+struct TransposeOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef TransposeOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) && verifier.EndTable();
+  }
+  TransposeOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      TransposeOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<TransposeOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const TransposeOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct TransposeOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit TransposeOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  TransposeOptionsBuilder &operator=(const TransposeOptionsBuilder &);
+  flatbuffers::Offset<TransposeOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<TransposeOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<TransposeOptions> CreateTransposeOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  TransposeOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<TransposeOptions> CreateTransposeOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const TransposeOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct ExpOptionsT : public flatbuffers::NativeTable {
+  typedef ExpOptions TableType;
+  ExpOptionsT() {}
+};
+
+struct ExpOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef ExpOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) && verifier.EndTable();
+  }
+  ExpOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      ExpOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ExpOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const ExpOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct ExpOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit ExpOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ExpOptionsBuilder &operator=(const ExpOptionsBuilder &);
+  flatbuffers::Offset<ExpOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<ExpOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<ExpOptions> CreateExpOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  ExpOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<ExpOptions> CreateExpOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ExpOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct MeanOptionsT : public flatbuffers::NativeTable {
+  typedef MeanOptions TableType;
+  bool keep_dims;
+  MeanOptionsT() : keep_dims(false) {}
+};
+
+struct MeanOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef MeanOptionsT NativeTableType;
+  enum { VT_KEEP_DIMS = 4 };
+  bool keep_dims() const { return GetField<uint8_t>(VT_KEEP_DIMS, 0) != 0; }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<uint8_t>(verifier, VT_KEEP_DIMS) && verifier.EndTable();
+  }
+  MeanOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      MeanOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<MeanOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const MeanOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct MeanOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_keep_dims(bool keep_dims) {
+    fbb_.AddElement<uint8_t>(MeanOptions::VT_KEEP_DIMS,
+                             static_cast<uint8_t>(keep_dims), 0);
+  }
+  explicit MeanOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  MeanOptionsBuilder &operator=(const MeanOptionsBuilder &);
+  flatbuffers::Offset<MeanOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<MeanOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<MeanOptions> CreateMeanOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, bool keep_dims = false) {
+  MeanOptionsBuilder builder_(_fbb);
+  builder_.add_keep_dims(keep_dims);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<MeanOptions> CreateMeanOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const MeanOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct SqueezeOptionsT : public flatbuffers::NativeTable {
+  typedef SqueezeOptions TableType;
+  std::vector<int32_t> squeeze_dims;
+  SqueezeOptionsT() {}
+};
+
+struct SqueezeOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef SqueezeOptionsT NativeTableType;
+  enum { VT_SQUEEZE_DIMS = 4 };
+  const flatbuffers::Vector<int32_t> *squeeze_dims() const {
+    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_SQUEEZE_DIMS);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyOffset(verifier, VT_SQUEEZE_DIMS) &&
+           verifier.Verify(squeeze_dims()) && verifier.EndTable();
+  }
+  SqueezeOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      SqueezeOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SqueezeOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const SqueezeOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct SqueezeOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_squeeze_dims(
+      flatbuffers::Offset<flatbuffers::Vector<int32_t>> squeeze_dims) {
+    fbb_.AddOffset(SqueezeOptions::VT_SQUEEZE_DIMS, squeeze_dims);
+  }
+  explicit SqueezeOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  SqueezeOptionsBuilder &operator=(const SqueezeOptionsBuilder &);
+  flatbuffers::Offset<SqueezeOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<SqueezeOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<SqueezeOptions> CreateSqueezeOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    flatbuffers::Offset<flatbuffers::Vector<int32_t>> squeeze_dims = 0) {
+  SqueezeOptionsBuilder builder_(_fbb);
+  builder_.add_squeeze_dims(squeeze_dims);
+  return builder_.Finish();
+}
+
+inline flatbuffers::Offset<SqueezeOptions> CreateSqueezeOptionsDirect(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const std::vector<int32_t> *squeeze_dims = nullptr) {
+  return tflite::CreateSqueezeOptions(
+      _fbb, squeeze_dims ? _fbb.CreateVector<int32_t>(*squeeze_dims) : 0);
+}
+
+flatbuffers::Offset<SqueezeOptions> CreateSqueezeOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SqueezeOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct StridedSliceOptionsT : public flatbuffers::NativeTable {
+  typedef StridedSliceOptions TableType;
+  int32_t begin_mask;
+  int32_t end_mask;
+  int32_t ellipsis_mask;
+  int32_t new_axis_mask;
+  int32_t shrink_axis_mask;
+  StridedSliceOptionsT()
+      : begin_mask(0),
+        end_mask(0),
+        ellipsis_mask(0),
+        new_axis_mask(0),
+        shrink_axis_mask(0) {}
+};
+
+struct StridedSliceOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
+  typedef StridedSliceOptionsT NativeTableType;
+  enum {
+    VT_BEGIN_MASK = 4,
+    VT_END_MASK = 6,
+    VT_ELLIPSIS_MASK = 8,
+    VT_NEW_AXIS_MASK = 10,
+    VT_SHRINK_AXIS_MASK = 12
+  };
+  int32_t begin_mask() const { return GetField<int32_t>(VT_BEGIN_MASK, 0); }
+  int32_t end_mask() const { return GetField<int32_t>(VT_END_MASK, 0); }
+  int32_t ellipsis_mask() const {
+    return GetField<int32_t>(VT_ELLIPSIS_MASK, 0);
+  }
+  int32_t new_axis_mask() const {
+    return GetField<int32_t>(VT_NEW_AXIS_MASK, 0);
+  }
+  int32_t shrink_axis_mask() const {
+    return GetField<int32_t>(VT_SHRINK_AXIS_MASK, 0);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int32_t>(verifier, VT_BEGIN_MASK) &&
+           VerifyField<int32_t>(verifier, VT_END_MASK) &&
+           VerifyField<int32_t>(verifier, VT_ELLIPSIS_MASK) &&
+           VerifyField<int32_t>(verifier, VT_NEW_AXIS_MASK) &&
+           VerifyField<int32_t>(verifier, VT_SHRINK_AXIS_MASK) &&
+           verifier.EndTable();
+  }
+  StridedSliceOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      StridedSliceOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<StridedSliceOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const StridedSliceOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct StridedSliceOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_begin_mask(int32_t begin_mask) {
+    fbb_.AddElement<int32_t>(StridedSliceOptions::VT_BEGIN_MASK, begin_mask, 0);
+  }
+  void add_end_mask(int32_t end_mask) {
+    fbb_.AddElement<int32_t>(StridedSliceOptions::VT_END_MASK, end_mask, 0);
+  }
+  void add_ellipsis_mask(int32_t ellipsis_mask) {
+    fbb_.AddElement<int32_t>(StridedSliceOptions::VT_ELLIPSIS_MASK,
+                             ellipsis_mask, 0);
+  }
+  void add_new_axis_mask(int32_t new_axis_mask) {
+    fbb_.AddElement<int32_t>(StridedSliceOptions::VT_NEW_AXIS_MASK,
+                             new_axis_mask, 0);
+  }
+  void add_shrink_axis_mask(int32_t shrink_axis_mask) {
+    fbb_.AddElement<int32_t>(StridedSliceOptions::VT_SHRINK_AXIS_MASK,
+                             shrink_axis_mask, 0);
+  }
+  explicit StridedSliceOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  StridedSliceOptionsBuilder &operator=(const StridedSliceOptionsBuilder &);
+  flatbuffers::Offset<StridedSliceOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<StridedSliceOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<StridedSliceOptions> CreateStridedSliceOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, int32_t begin_mask = 0,
+    int32_t end_mask = 0, int32_t ellipsis_mask = 0, int32_t new_axis_mask = 0,
+    int32_t shrink_axis_mask = 0) {
+  StridedSliceOptionsBuilder builder_(_fbb);
+  builder_.add_shrink_axis_mask(shrink_axis_mask);
+  builder_.add_new_axis_mask(new_axis_mask);
+  builder_.add_ellipsis_mask(ellipsis_mask);
+  builder_.add_end_mask(end_mask);
+  builder_.add_begin_mask(begin_mask);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<StridedSliceOptions> CreateStridedSliceOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const StridedSliceOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
   BuiltinOperator builtin_code;
   std::string custom_code;
-  OperatorCodeT()
-      : builtin_code(BuiltinOperator_ADD) {
-  }
+  OperatorCodeT() : builtin_code(BuiltinOperator_ADD) {}
 };
 
 struct OperatorCode FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef OperatorCodeT NativeTableType;
-  enum {
-    VT_BUILTIN_CODE = 4,
-    VT_CUSTOM_CODE = 6
-  };
+  enum { VT_BUILTIN_CODE = 4, VT_CUSTOM_CODE = 6 };
   BuiltinOperator builtin_code() const {
     return static_cast<BuiltinOperator>(GetField<int8_t>(VT_BUILTIN_CODE, 0));
   }
@@ -2482,25 +3662,30 @@ struct OperatorCode FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
     return VerifyTableStart(verifier) &&
            VerifyField<int8_t>(verifier, VT_BUILTIN_CODE) &&
            VerifyOffset(verifier, VT_CUSTOM_CODE) &&
-           verifier.Verify(custom_code()) &&
-           verifier.EndTable();
-  }
-  OperatorCodeT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(OperatorCodeT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<OperatorCode> Pack(flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           verifier.Verify(custom_code()) && verifier.EndTable();
+  }
+  OperatorCodeT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      OperatorCodeT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<OperatorCode> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct OperatorCodeBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
   void add_builtin_code(BuiltinOperator builtin_code) {
-    fbb_.AddElement<int8_t>(OperatorCode::VT_BUILTIN_CODE, static_cast<int8_t>(builtin_code), 0);
+    fbb_.AddElement<int8_t>(OperatorCode::VT_BUILTIN_CODE,
+                            static_cast<int8_t>(builtin_code), 0);
   }
   void add_custom_code(flatbuffers::Offset<flatbuffers::String> custom_code) {
     fbb_.AddOffset(OperatorCode::VT_CUSTOM_CODE, custom_code);
   }
   explicit OperatorCodeBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+      : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   OperatorCodeBuilder &operator=(const OperatorCodeBuilder &);
@@ -2526,12 +3711,12 @@ inline flatbuffers::Offset<OperatorCode> CreateOperatorCodeDirect(
     BuiltinOperator builtin_code = BuiltinOperator_ADD,
     const char *custom_code = nullptr) {
   return tflite::CreateOperatorCode(
-      _fbb,
-      builtin_code,
-      custom_code ? _fbb.CreateString(custom_code) : 0);
+      _fbb, builtin_code, custom_code ? _fbb.CreateString(custom_code) : 0);
 }
 
-flatbuffers::Offset<OperatorCode> CreateOperatorCode(flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<OperatorCode> CreateOperatorCode(
+    flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct OperatorT : public flatbuffers::NativeTable {
   typedef Operator TableType;
@@ -2543,8 +3728,7 @@ struct OperatorT : public flatbuffers::NativeTable {
   CustomOptionsFormat custom_options_format;
   OperatorT()
       : opcode_index(0),
-        custom_options_format(CustomOptionsFormat_FLEXBUFFERS) {
-  }
+        custom_options_format(CustomOptionsFormat_FLEXBUFFERS) {}
 };
 
 struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
@@ -2568,185 +3752,408 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
     return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_OUTPUTS);
   }
   BuiltinOptions builtin_options_type() const {
-    return static_cast<BuiltinOptions>(GetField<uint8_t>(VT_BUILTIN_OPTIONS_TYPE, 0));
+    return static_cast<BuiltinOptions>(
+        GetField<uint8_t>(VT_BUILTIN_OPTIONS_TYPE, 0));
   }
   const void *builtin_options() const {
     return GetPointer<const void *>(VT_BUILTIN_OPTIONS);
   }
-  template<typename T> const T *builtin_options_as() const;
+  template <typename T>
+  const T *builtin_options_as() const;
   const Conv2DOptions *builtin_options_as_Conv2DOptions() const {
-    return builtin_options_type() == BuiltinOptions_Conv2DOptions ? static_cast<const Conv2DOptions *>(builtin_options()) : nullptr;
-  }
-  const DepthwiseConv2DOptions *builtin_options_as_DepthwiseConv2DOptions() const {
-    return builtin_options_type() == BuiltinOptions_DepthwiseConv2DOptions ? static_cast<const DepthwiseConv2DOptions *>(builtin_options()) : nullptr;
-  }
-  const ConcatEmbeddingsOptions *builtin_options_as_ConcatEmbeddingsOptions() const {
-    return builtin_options_type() == BuiltinOptions_ConcatEmbeddingsOptions ? static_cast<const ConcatEmbeddingsOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_Conv2DOptions
+               ? static_cast<const Conv2DOptions *>(builtin_options())
+               : nullptr;
+  }
+  const DepthwiseConv2DOptions *builtin_options_as_DepthwiseConv2DOptions()
+      const {
+    return builtin_options_type() == BuiltinOptions_DepthwiseConv2DOptions
+               ? static_cast<const DepthwiseConv2DOptions *>(builtin_options())
+               : nullptr;
+  }
+  const ConcatEmbeddingsOptions *builtin_options_as_ConcatEmbeddingsOptions()
+      const {
+    return builtin_options_type() == BuiltinOptions_ConcatEmbeddingsOptions
+               ? static_cast<const ConcatEmbeddingsOptions *>(builtin_options())
+               : nullptr;
   }
   const LSHProjectionOptions *builtin_options_as_LSHProjectionOptions() const {
-    return builtin_options_type() == BuiltinOptions_LSHProjectionOptions ? static_cast<const LSHProjectionOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_LSHProjectionOptions
+               ? static_cast<const LSHProjectionOptions *>(builtin_options())
+               : nullptr;
   }
   const Pool2DOptions *builtin_options_as_Pool2DOptions() const {
-    return builtin_options_type() == BuiltinOptions_Pool2DOptions ? static_cast<const Pool2DOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_Pool2DOptions
+               ? static_cast<const Pool2DOptions *>(builtin_options())
+               : nullptr;
   }
   const SVDFOptions *builtin_options_as_SVDFOptions() const {
-    return builtin_options_type() == BuiltinOptions_SVDFOptions ? static_cast<const SVDFOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_SVDFOptions
+               ? static_cast<const SVDFOptions *>(builtin_options())
+               : nullptr;
   }
   const RNNOptions *builtin_options_as_RNNOptions() const {
-    return builtin_options_type() == BuiltinOptions_RNNOptions ? static_cast<const RNNOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_RNNOptions
+               ? static_cast<const RNNOptions *>(builtin_options())
+               : nullptr;
   }
-  const FullyConnectedOptions *builtin_options_as_FullyConnectedOptions() const {
-    return builtin_options_type() == BuiltinOptions_FullyConnectedOptions ? static_cast<const FullyConnectedOptions *>(builtin_options()) : nullptr;
+  const FullyConnectedOptions *builtin_options_as_FullyConnectedOptions()
+      const {
+    return builtin_options_type() == BuiltinOptions_FullyConnectedOptions
+               ? static_cast<const FullyConnectedOptions *>(builtin_options())
+               : nullptr;
   }
   const SoftmaxOptions *builtin_options_as_SoftmaxOptions() const {
-    return builtin_options_type() == BuiltinOptions_SoftmaxOptions ? static_cast<const SoftmaxOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_SoftmaxOptions
+               ? static_cast<const SoftmaxOptions *>(builtin_options())
+               : nullptr;
   }
   const ConcatenationOptions *builtin_options_as_ConcatenationOptions() const {
-    return builtin_options_type() == BuiltinOptions_ConcatenationOptions ? static_cast<const ConcatenationOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_ConcatenationOptions
+               ? static_cast<const ConcatenationOptions *>(builtin_options())
+               : nullptr;
   }
   const AddOptions *builtin_options_as_AddOptions() const {
-    return builtin_options_type() == BuiltinOptions_AddOptions ? static_cast<const AddOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_AddOptions
+               ? static_cast<const AddOptions *>(builtin_options())
+               : nullptr;
   }
   const L2NormOptions *builtin_options_as_L2NormOptions() const {
-    return builtin_options_type() == BuiltinOptions_L2NormOptions ? static_cast<const L2NormOptions *>(builtin_options()) : nullptr;
-  }
-  const LocalResponseNormalizationOptions *builtin_options_as_LocalResponseNormalizationOptions() const {
-    return builtin_options_type() == BuiltinOptions_LocalResponseNormalizationOptions ? static_cast<const LocalResponseNormalizationOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_L2NormOptions
+               ? static_cast<const L2NormOptions *>(builtin_options())
+               : nullptr;
+  }
+  const LocalResponseNormalizationOptions *
+  builtin_options_as_LocalResponseNormalizationOptions() const {
+    return builtin_options_type() ==
+                   BuiltinOptions_LocalResponseNormalizationOptions
+               ? static_cast<const LocalResponseNormalizationOptions *>(
+                     builtin_options())
+               : nullptr;
   }
   const LSTMOptions *builtin_options_as_LSTMOptions() const {
-    return builtin_options_type() == BuiltinOptions_LSTMOptions ? static_cast<const LSTMOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_LSTMOptions
+               ? static_cast<const LSTMOptions *>(builtin_options())
+               : nullptr;
   }
-  const ResizeBilinearOptions *builtin_options_as_ResizeBilinearOptions() const {
-    return builtin_options_type() == BuiltinOptions_ResizeBilinearOptions ? static_cast<const ResizeBilinearOptions *>(builtin_options()) : nullptr;
+  const ResizeBilinearOptions *builtin_options_as_ResizeBilinearOptions()
+      const {
+    return builtin_options_type() == BuiltinOptions_ResizeBilinearOptions
+               ? static_cast<const ResizeBilinearOptions *>(builtin_options())
+               : nullptr;
   }
   const CallOptions *builtin_options_as_CallOptions() const {
-    return builtin_options_type() == BuiltinOptions_CallOptions ? static_cast<const CallOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_CallOptions
+               ? static_cast<const CallOptions *>(builtin_options())
+               : nullptr;
   }
   const ReshapeOptions *builtin_options_as_ReshapeOptions() const {
-    return builtin_options_type() == BuiltinOptions_ReshapeOptions ? static_cast<const ReshapeOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_ReshapeOptions
+               ? static_cast<const ReshapeOptions *>(builtin_options())
+               : nullptr;
   }
   const SkipGramOptions *builtin_options_as_SkipGramOptions() const {
-    return builtin_options_type() == BuiltinOptions_SkipGramOptions ? static_cast<const SkipGramOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_SkipGramOptions
+               ? static_cast<const SkipGramOptions *>(builtin_options())
+               : nullptr;
   }
   const SpaceToDepthOptions *builtin_options_as_SpaceToDepthOptions() const {
-    return builtin_options_type() == BuiltinOptions_SpaceToDepthOptions ? static_cast<const SpaceToDepthOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_SpaceToDepthOptions
+               ? static_cast<const SpaceToDepthOptions *>(builtin_options())
+               : nullptr;
   }
-  const EmbeddingLookupSparseOptions *builtin_options_as_EmbeddingLookupSparseOptions() const {
-    return builtin_options_type() == BuiltinOptions_EmbeddingLookupSparseOptions ? static_cast<const EmbeddingLookupSparseOptions *>(builtin_options()) : nullptr;
+  const EmbeddingLookupSparseOptions *
+  builtin_options_as_EmbeddingLookupSparseOptions() const {
+    return builtin_options_type() == BuiltinOptions_EmbeddingLookupSparseOptions
+               ? static_cast<const EmbeddingLookupSparseOptions *>(
+                     builtin_options())
+               : nullptr;
   }
   const MulOptions *builtin_options_as_MulOptions() const {
-    return builtin_options_type() == BuiltinOptions_MulOptions ? static_cast<const MulOptions *>(builtin_options()) : nullptr;
+    return builtin_options_type() == BuiltinOptions_MulOptions
+               ? static_cast<const MulOptions *>(builtin_options())
+               : nullptr;
+  }
+  const PadOptions *builtin_options_as_PadOptions() const {
+    return builtin_options_type() == BuiltinOptions_PadOptions
+               ? static_cast<const PadOptions *>(builtin_options())
+               : nullptr;
+  }
+  const GatherOptions *builtin_options_as_GatherOptions() const {
+    return builtin_options_type() == BuiltinOptions_GatherOptions
+               ? static_cast<const GatherOptions *>(builtin_options())
+               : nullptr;
+  }
+  const BatchToSpaceNDOptions *builtin_options_as_BatchToSpaceNDOptions()
+      const {
+    return builtin_options_type() == BuiltinOptions_BatchToSpaceNDOptions
+               ? static_cast<const BatchToSpaceNDOptions *>(builtin_options())
+               : nullptr;
+  }
+  const SpaceToBatchNDOptions *builtin_options_as_SpaceToBatchNDOptions()
+      const {
+    return builtin_options_type() == BuiltinOptions_SpaceToBatchNDOptions
+               ? static_cast<const SpaceToBatchNDOptions *>(builtin_options())
+               : nullptr;
+  }
+  const TransposeOptions *builtin_options_as_TransposeOptions() const {
+    return builtin_options_type() == BuiltinOptions_TransposeOptions
+               ? static_cast<const TransposeOptions *>(builtin_options())
+               : nullptr;
+  }
+  const MeanOptions *builtin_options_as_MeanOptions() const {
+    return builtin_options_type() == BuiltinOptions_MeanOptions
+               ? static_cast<const MeanOptions *>(builtin_options())
+               : nullptr;
+  }
+  const SubOptions *builtin_options_as_SubOptions() const {
+    return builtin_options_type() == BuiltinOptions_SubOptions
+               ? static_cast<const SubOptions *>(builtin_options())
+               : nullptr;
+  }
+  const DivOptions *builtin_options_as_DivOptions() const {
+    return builtin_options_type() == BuiltinOptions_DivOptions
+               ? static_cast<const DivOptions *>(builtin_options())
+               : nullptr;
+  }
+  const SqueezeOptions *builtin_options_as_SqueezeOptions() const {
+    return builtin_options_type() == BuiltinOptions_SqueezeOptions
+               ? static_cast<const SqueezeOptions *>(builtin_options())
+               : nullptr;
+  }
+  const SequenceRNNOptions *builtin_options_as_SequenceRNNOptions() const {
+    return builtin_options_type() == BuiltinOptions_SequenceRNNOptions
+               ? static_cast<const SequenceRNNOptions *>(builtin_options())
+               : nullptr;
+  }
+  const StridedSliceOptions *builtin_options_as_StridedSliceOptions() const {
+    return builtin_options_type() == BuiltinOptions_StridedSliceOptions
+               ? static_cast<const StridedSliceOptions *>(builtin_options())
+               : nullptr;
+  }
+  const ExpOptions *builtin_options_as_ExpOptions() const {
+    return builtin_options_type() == BuiltinOptions_ExpOptions
+               ? static_cast<const ExpOptions *>(builtin_options())
+               : nullptr;
   }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
   CustomOptionsFormat custom_options_format() const {
-    return static_cast<CustomOptionsFormat>(GetField<int8_t>(VT_CUSTOM_OPTIONS_FORMAT, 0));
+    return static_cast<CustomOptionsFormat>(
+        GetField<int8_t>(VT_CUSTOM_OPTIONS_FORMAT, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<uint32_t>(verifier, VT_OPCODE_INDEX) &&
-           VerifyOffset(verifier, VT_INPUTS) &&
-           verifier.Verify(inputs()) &&
-           VerifyOffset(verifier, VT_OUTPUTS) &&
-           verifier.Verify(outputs()) &&
+           VerifyOffset(verifier, VT_INPUTS) && verifier.Verify(inputs()) &&
+           VerifyOffset(verifier, VT_OUTPUTS) && verifier.Verify(outputs()) &&
            VerifyField<uint8_t>(verifier, VT_BUILTIN_OPTIONS_TYPE) &&
            VerifyOffset(verifier, VT_BUILTIN_OPTIONS) &&
-           VerifyBuiltinOptions(verifier, builtin_options(), builtin_options_type()) &&
+           VerifyBuiltinOptions(verifier, builtin_options(),
+                                builtin_options_type()) &&
            VerifyOffset(verifier, VT_CUSTOM_OPTIONS) &&
            verifier.Verify(custom_options()) &&
            VerifyField<int8_t>(verifier, VT_CUSTOM_OPTIONS_FORMAT) &&
            verifier.EndTable();
   }
-  OperatorT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(OperatorT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<Operator> Pack(flatbuffers::FlatBufferBuilder &_fbb, const OperatorT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  OperatorT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      OperatorT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<Operator> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const OperatorT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
-template<> inline const Conv2DOptions *Operator::builtin_options_as<Conv2DOptions>() const {
+template <>
+inline const Conv2DOptions *Operator::builtin_options_as<Conv2DOptions>()
+    const {
   return builtin_options_as_Conv2DOptions();
 }
 
-template<> inline const DepthwiseConv2DOptions *Operator::builtin_options_as<DepthwiseConv2DOptions>() const {
+template <>
+inline const DepthwiseConv2DOptions *
+Operator::builtin_options_as<DepthwiseConv2DOptions>() const {
   return builtin_options_as_DepthwiseConv2DOptions();
 }
 
-template<> inline const ConcatEmbeddingsOptions *Operator::builtin_options_as<ConcatEmbeddingsOptions>() const {
+template <>
+inline const ConcatEmbeddingsOptions *
+Operator::builtin_options_as<ConcatEmbeddingsOptions>() const {
   return builtin_options_as_ConcatEmbeddingsOptions();
 }
 
-template<> inline const LSHProjectionOptions *Operator::builtin_options_as<LSHProjectionOptions>() const {
+template <>
+inline const LSHProjectionOptions *
+Operator::builtin_options_as<LSHProjectionOptions>() const {
   return builtin_options_as_LSHProjectionOptions();
 }
 
-template<> inline const Pool2DOptions *Operator::builtin_options_as<Pool2DOptions>() const {
+template <>
+inline const Pool2DOptions *Operator::builtin_options_as<Pool2DOptions>()
+    const {
   return builtin_options_as_Pool2DOptions();
 }
 
-template<> inline const SVDFOptions *Operator::builtin_options_as<SVDFOptions>() const {
+template <>
+inline const SVDFOptions *Operator::builtin_options_as<SVDFOptions>() const {
   return builtin_options_as_SVDFOptions();
 }
 
-template<> inline const RNNOptions *Operator::builtin_options_as<RNNOptions>() const {
+template <>
+inline const RNNOptions *Operator::builtin_options_as<RNNOptions>() const {
   return builtin_options_as_RNNOptions();
 }
 
-template<> inline const FullyConnectedOptions *Operator::builtin_options_as<FullyConnectedOptions>() const {
+template <>
+inline const FullyConnectedOptions *
+Operator::builtin_options_as<FullyConnectedOptions>() const {
   return builtin_options_as_FullyConnectedOptions();
 }
 
-template<> inline const SoftmaxOptions *Operator::builtin_options_as<SoftmaxOptions>() const {
+template <>
+inline const SoftmaxOptions *Operator::builtin_options_as<SoftmaxOptions>()
+    const {
   return builtin_options_as_SoftmaxOptions();
 }
 
-template<> inline const ConcatenationOptions *Operator::builtin_options_as<ConcatenationOptions>() const {
+template <>
+inline const ConcatenationOptions *
+Operator::builtin_options_as<ConcatenationOptions>() const {
   return builtin_options_as_ConcatenationOptions();
 }
 
-template<> inline const AddOptions *Operator::builtin_options_as<AddOptions>() const {
+template <>
+inline const AddOptions *Operator::builtin_options_as<AddOptions>() const {
   return builtin_options_as_AddOptions();
 }
 
-template<> inline const L2NormOptions *Operator::builtin_options_as<L2NormOptions>() const {
+template <>
+inline const L2NormOptions *Operator::builtin_options_as<L2NormOptions>()
+    const {
   return builtin_options_as_L2NormOptions();
 }
 
-template<> inline const LocalResponseNormalizationOptions *Operator::builtin_options_as<LocalResponseNormalizationOptions>() const {
+template <>
+inline const LocalResponseNormalizationOptions *
+Operator::builtin_options_as<LocalResponseNormalizationOptions>() const {
   return builtin_options_as_LocalResponseNormalizationOptions();
 }
 
-template<> inline const LSTMOptions *Operator::builtin_options_as<LSTMOptions>() const {
+template <>
+inline const LSTMOptions *Operator::builtin_options_as<LSTMOptions>() const {
   return builtin_options_as_LSTMOptions();
 }
 
-template<> inline const ResizeBilinearOptions *Operator::builtin_options_as<ResizeBilinearOptions>() const {
+template <>
+inline const ResizeBilinearOptions *
+Operator::builtin_options_as<ResizeBilinearOptions>() const {
   return builtin_options_as_ResizeBilinearOptions();
 }
 
-template<> inline const CallOptions *Operator::builtin_options_as<CallOptions>() const {
+template <>
+inline const CallOptions *Operator::builtin_options_as<CallOptions>() const {
   return builtin_options_as_CallOptions();
 }
 
-template<> inline const ReshapeOptions *Operator::builtin_options_as<ReshapeOptions>() const {
+template <>
+inline const ReshapeOptions *Operator::builtin_options_as<ReshapeOptions>()
+    const {
   return builtin_options_as_ReshapeOptions();
 }
 
-template<> inline const SkipGramOptions *Operator::builtin_options_as<SkipGramOptions>() const {
+template <>
+inline const SkipGramOptions *Operator::builtin_options_as<SkipGramOptions>()
+    const {
   return builtin_options_as_SkipGramOptions();
 }
 
-template<> inline const SpaceToDepthOptions *Operator::builtin_options_as<SpaceToDepthOptions>() const {
+template <>
+inline const SpaceToDepthOptions *
+Operator::builtin_options_as<SpaceToDepthOptions>() const {
   return builtin_options_as_SpaceToDepthOptions();
 }
 
-template<> inline const EmbeddingLookupSparseOptions *Operator::builtin_options_as<EmbeddingLookupSparseOptions>() const {
+template <>
+inline const EmbeddingLookupSparseOptions *
+Operator::builtin_options_as<EmbeddingLookupSparseOptions>() const {
   return builtin_options_as_EmbeddingLookupSparseOptions();
 }
 
-template<> inline const MulOptions *Operator::builtin_options_as<MulOptions>() const {
+template <>
+inline const MulOptions *Operator::builtin_options_as<MulOptions>() const {
   return builtin_options_as_MulOptions();
 }
 
+template <>
+inline const PadOptions *Operator::builtin_options_as<PadOptions>() const {
+  return builtin_options_as_PadOptions();
+}
+
+template <>
+inline const GatherOptions *Operator::builtin_options_as<GatherOptions>()
+    const {
+  return builtin_options_as_GatherOptions();
+}
+
+template <>
+inline const BatchToSpaceNDOptions *
+Operator::builtin_options_as<BatchToSpaceNDOptions>() const {
+  return builtin_options_as_BatchToSpaceNDOptions();
+}
+
+template <>
+inline const SpaceToBatchNDOptions *
+Operator::builtin_options_as<SpaceToBatchNDOptions>() const {
+  return builtin_options_as_SpaceToBatchNDOptions();
+}
+
+template <>
+inline const TransposeOptions *Operator::builtin_options_as<TransposeOptions>()
+    const {
+  return builtin_options_as_TransposeOptions();
+}
+
+template <>
+inline const MeanOptions *Operator::builtin_options_as<MeanOptions>() const {
+  return builtin_options_as_MeanOptions();
+}
+
+template <>
+inline const SubOptions *Operator::builtin_options_as<SubOptions>() const {
+  return builtin_options_as_SubOptions();
+}
+
+template <>
+inline const DivOptions *Operator::builtin_options_as<DivOptions>() const {
+  return builtin_options_as_DivOptions();
+}
+
+template <>
+inline const SqueezeOptions *Operator::builtin_options_as<SqueezeOptions>()
+    const {
+  return builtin_options_as_SqueezeOptions();
+}
+
+template <>
+inline const SequenceRNNOptions *
+Operator::builtin_options_as<SequenceRNNOptions>() const {
+  return builtin_options_as_SequenceRNNOptions();
+}
+
+template <>
+inline const StridedSliceOptions *
+Operator::builtin_options_as<StridedSliceOptions>() const {
+  return builtin_options_as_StridedSliceOptions();
+}
+
+template <>
+inline const ExpOptions *Operator::builtin_options_as<ExpOptions>() const {
+  return builtin_options_as_ExpOptions();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -2760,19 +4167,21 @@ struct OperatorBuilder {
     fbb_.AddOffset(Operator::VT_OUTPUTS, outputs);
   }
   void add_builtin_options_type(BuiltinOptions builtin_options_type) {
-    fbb_.AddElement<uint8_t>(Operator::VT_BUILTIN_OPTIONS_TYPE, static_cast<uint8_t>(builtin_options_type), 0);
+    fbb_.AddElement<uint8_t>(Operator::VT_BUILTIN_OPTIONS_TYPE,
+                             static_cast<uint8_t>(builtin_options_type), 0);
   }
   void add_builtin_options(flatbuffers::Offset<void> builtin_options) {
     fbb_.AddOffset(Operator::VT_BUILTIN_OPTIONS, builtin_options);
   }
-  void add_custom_options(flatbuffers::Offset<flatbuffers::Vector<uint8_t>> custom_options) {
+  void add_custom_options(
+      flatbuffers::Offset<flatbuffers::Vector<uint8_t>> custom_options) {
     fbb_.AddOffset(Operator::VT_CUSTOM_OPTIONS, custom_options);
   }
   void add_custom_options_format(CustomOptionsFormat custom_options_format) {
-    fbb_.AddElement<int8_t>(Operator::VT_CUSTOM_OPTIONS_FORMAT, static_cast<int8_t>(custom_options_format), 0);
+    fbb_.AddElement<int8_t>(Operator::VT_CUSTOM_OPTIONS_FORMAT,
+                            static_cast<int8_t>(custom_options_format), 0);
   }
-  explicit OperatorBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+  explicit OperatorBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   OperatorBuilder &operator=(const OperatorBuilder &);
@@ -2784,14 +4193,14 @@ struct OperatorBuilder {
 };
 
 inline flatbuffers::Offset<Operator> CreateOperator(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    uint32_t opcode_index = 0,
+    flatbuffers::FlatBufferBuilder &_fbb, uint32_t opcode_index = 0,
     flatbuffers::Offset<flatbuffers::Vector<int32_t>> inputs = 0,
     flatbuffers::Offset<flatbuffers::Vector<int32_t>> outputs = 0,
     BuiltinOptions builtin_options_type = BuiltinOptions_NONE,
     flatbuffers::Offset<void> builtin_options = 0,
     flatbuffers::Offset<flatbuffers::Vector<uint8_t>> custom_options = 0,
-    CustomOptionsFormat custom_options_format = CustomOptionsFormat_FLEXBUFFERS) {
+    CustomOptionsFormat custom_options_format =
+        CustomOptionsFormat_FLEXBUFFERS) {
   OperatorBuilder builder_(_fbb);
   builder_.add_custom_options(custom_options);
   builder_.add_builtin_options(builtin_options);
@@ -2804,26 +4213,25 @@ inline flatbuffers::Offset<Operator> CreateOperator(
 }
 
 inline flatbuffers::Offset<Operator> CreateOperatorDirect(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    uint32_t opcode_index = 0,
+    flatbuffers::FlatBufferBuilder &_fbb, uint32_t opcode_index = 0,
     const std::vector<int32_t> *inputs = nullptr,
     const std::vector<int32_t> *outputs = nullptr,
     BuiltinOptions builtin_options_type = BuiltinOptions_NONE,
     flatbuffers::Offset<void> builtin_options = 0,
     const std::vector<uint8_t> *custom_options = nullptr,
-    CustomOptionsFormat custom_options_format = CustomOptionsFormat_FLEXBUFFERS) {
+    CustomOptionsFormat custom_options_format =
+        CustomOptionsFormat_FLEXBUFFERS) {
   return tflite::CreateOperator(
-      _fbb,
-      opcode_index,
-      inputs ? _fbb.CreateVector<int32_t>(*inputs) : 0,
-      outputs ? _fbb.CreateVector<int32_t>(*outputs) : 0,
-      builtin_options_type,
+      _fbb, opcode_index, inputs ? _fbb.CreateVector<int32_t>(*inputs) : 0,
+      outputs ? _fbb.CreateVector<int32_t>(*outputs) : 0, builtin_options_type,
       builtin_options,
       custom_options ? _fbb.CreateVector<uint8_t>(*custom_options) : 0,
       custom_options_format);
 }
 
-flatbuffers::Offset<Operator> CreateOperator(flatbuffers::FlatBufferBuilder &_fbb, const OperatorT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<Operator> CreateOperator(
+    flatbuffers::FlatBufferBuilder &_fbb, const OperatorT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct SubGraphT : public flatbuffers::NativeTable {
   typedef SubGraph TableType;
@@ -2832,8 +4240,7 @@ struct SubGraphT : public flatbuffers::NativeTable {
   std::vector<int32_t> outputs;
   std::vector<std::unique_ptr<OperatorT>> operators;
   std::string name;
-  SubGraphT() {
-  }
+  SubGraphT() {}
 };
 
 struct SubGraph FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
@@ -2846,7 +4253,8 @@ struct SubGraph FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
     VT_NAME = 12
   };
   const flatbuffers::Vector<flatbuffers::Offset<Tensor>> *tensors() const {
-    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<Tensor>> *>(VT_TENSORS);
+    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<Tensor>> *>(
+        VT_TENSORS);
   }
   const flatbuffers::Vector<int32_t> *inputs() const {
     return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_INPUTS);
@@ -2855,36 +4263,41 @@ struct SubGraph FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
     return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_OUTPUTS);
   }
   const flatbuffers::Vector<flatbuffers::Offset<Operator>> *operators() const {
-    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<Operator>> *>(VT_OPERATORS);
+    return GetPointer<
+        const flatbuffers::Vector<flatbuffers::Offset<Operator>> *>(
+        VT_OPERATORS);
   }
   const flatbuffers::String *name() const {
     return GetPointer<const flatbuffers::String *>(VT_NAME);
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyOffset(verifier, VT_TENSORS) &&
+    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_TENSORS) &&
            verifier.Verify(tensors()) &&
            verifier.VerifyVectorOfTables(tensors()) &&
-           VerifyOffset(verifier, VT_INPUTS) &&
-           verifier.Verify(inputs()) &&
-           VerifyOffset(verifier, VT_OUTPUTS) &&
-           verifier.Verify(outputs()) &&
+           VerifyOffset(verifier, VT_INPUTS) && verifier.Verify(inputs()) &&
+           VerifyOffset(verifier, VT_OUTPUTS) && verifier.Verify(outputs()) &&
            VerifyOffset(verifier, VT_OPERATORS) &&
            verifier.Verify(operators()) &&
            verifier.VerifyVectorOfTables(operators()) &&
-           VerifyOffset(verifier, VT_NAME) &&
-           verifier.Verify(name()) &&
+           VerifyOffset(verifier, VT_NAME) && verifier.Verify(name()) &&
            verifier.EndTable();
   }
-  SubGraphT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(SubGraphT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<SubGraph> Pack(flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  SubGraphT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      SubGraphT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SubGraph> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct SubGraphBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_tensors(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Tensor>>> tensors) {
+  void add_tensors(
+      flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Tensor>>>
+          tensors) {
     fbb_.AddOffset(SubGraph::VT_TENSORS, tensors);
   }
   void add_inputs(flatbuffers::Offset<flatbuffers::Vector<int32_t>> inputs) {
@@ -2893,14 +4306,15 @@ struct SubGraphBuilder {
   void add_outputs(flatbuffers::Offset<flatbuffers::Vector<int32_t>> outputs) {
     fbb_.AddOffset(SubGraph::VT_OUTPUTS, outputs);
   }
-  void add_operators(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Operator>>> operators) {
+  void add_operators(
+      flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Operator>>>
+          operators) {
     fbb_.AddOffset(SubGraph::VT_OPERATORS, operators);
   }
   void add_name(flatbuffers::Offset<flatbuffers::String> name) {
     fbb_.AddOffset(SubGraph::VT_NAME, name);
   }
-  explicit SubGraphBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+  explicit SubGraphBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   SubGraphBuilder &operator=(const SubGraphBuilder &);
@@ -2913,10 +4327,12 @@ struct SubGraphBuilder {
 
 inline flatbuffers::Offset<SubGraph> CreateSubGraph(
     flatbuffers::FlatBufferBuilder &_fbb,
-    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Tensor>>> tensors = 0,
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Tensor>>>
+        tensors = 0,
     flatbuffers::Offset<flatbuffers::Vector<int32_t>> inputs = 0,
     flatbuffers::Offset<flatbuffers::Vector<int32_t>> outputs = 0,
-    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Operator>>> operators = 0,
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Operator>>>
+        operators = 0,
     flatbuffers::Offset<flatbuffers::String> name = 0) {
   SubGraphBuilder builder_(_fbb);
   builder_.add_name(name);
@@ -2939,36 +4355,38 @@ inline flatbuffers::Offset<SubGraph> CreateSubGraphDirect(
       tensors ? _fbb.CreateVector<flatbuffers::Offset<Tensor>>(*tensors) : 0,
       inputs ? _fbb.CreateVector<int32_t>(*inputs) : 0,
       outputs ? _fbb.CreateVector<int32_t>(*outputs) : 0,
-      operators ? _fbb.CreateVector<flatbuffers::Offset<Operator>>(*operators) : 0,
+      operators ? _fbb.CreateVector<flatbuffers::Offset<Operator>>(*operators)
+                : 0,
       name ? _fbb.CreateString(name) : 0);
 }
 
-flatbuffers::Offset<SubGraph> CreateSubGraph(flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<SubGraph> CreateSubGraph(
+    flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct BufferT : public flatbuffers::NativeTable {
   typedef Buffer TableType;
   std::vector<uint8_t> data;
-  BufferT() {
-  }
+  BufferT() {}
 };
 
 struct Buffer FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef BufferT NativeTableType;
-  enum {
-    VT_DATA = 4
-  };
+  enum { VT_DATA = 4 };
   const flatbuffers::Vector<uint8_t> *data() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_DATA);
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyOffset(verifier, VT_DATA) &&
-           verifier.Verify(data()) &&
-           verifier.EndTable();
-  }
-  BufferT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(BufferT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<Buffer> Pack(flatbuffers::FlatBufferBuilder &_fbb, const BufferT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_DATA) &&
+           verifier.Verify(data()) && verifier.EndTable();
+  }
+  BufferT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(BufferT *_o, const flatbuffers::resolver_function_t *_resolver =
+                                 nullptr) const;
+  static flatbuffers::Offset<Buffer> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const BufferT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct BufferBuilder {
@@ -2977,8 +4395,7 @@ struct BufferBuilder {
   void add_data(flatbuffers::Offset<flatbuffers::Vector<uint8_t>> data) {
     fbb_.AddOffset(Buffer::VT_DATA, data);
   }
-  explicit BufferBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+  explicit BufferBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   BufferBuilder &operator=(const BufferBuilder &);
@@ -3000,12 +4417,13 @@ inline flatbuffers::Offset<Buffer> CreateBuffer(
 inline flatbuffers::Offset<Buffer> CreateBufferDirect(
     flatbuffers::FlatBufferBuilder &_fbb,
     const std::vector<uint8_t> *data = nullptr) {
-  return tflite::CreateBuffer(
-      _fbb,
-      data ? _fbb.CreateVector<uint8_t>(*data) : 0);
+  return tflite::CreateBuffer(_fbb,
+                              data ? _fbb.CreateVector<uint8_t>(*data) : 0);
 }
 
-flatbuffers::Offset<Buffer> CreateBuffer(flatbuffers::FlatBufferBuilder &_fbb, const BufferT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<Buffer> CreateBuffer(
+    flatbuffers::FlatBufferBuilder &_fbb, const BufferT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct ModelT : public flatbuffers::NativeTable {
   typedef Model TableType;
@@ -3014,9 +4432,7 @@ struct ModelT : public flatbuffers::NativeTable {
   std::vector<std::unique_ptr<SubGraphT>> subgraphs;
   std::string description;
   std::vector<std::unique_ptr<BufferT>> buffers;
-  ModelT()
-      : version(0) {
-  }
+  ModelT() : version(0) {}
 };
 
 struct Model FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
@@ -3028,20 +4444,24 @@ struct Model FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
     VT_DESCRIPTION = 10,
     VT_BUFFERS = 12
   };
-  uint32_t version() const {
-    return GetField<uint32_t>(VT_VERSION, 0);
-  }
-  const flatbuffers::Vector<flatbuffers::Offset<OperatorCode>> *operator_codes() const {
-    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<OperatorCode>> *>(VT_OPERATOR_CODES);
+  uint32_t version() const { return GetField<uint32_t>(VT_VERSION, 0); }
+  const flatbuffers::Vector<flatbuffers::Offset<OperatorCode>> *operator_codes()
+      const {
+    return GetPointer<
+        const flatbuffers::Vector<flatbuffers::Offset<OperatorCode>> *>(
+        VT_OPERATOR_CODES);
   }
   const flatbuffers::Vector<flatbuffers::Offset<SubGraph>> *subgraphs() const {
-    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<SubGraph>> *>(VT_SUBGRAPHS);
+    return GetPointer<
+        const flatbuffers::Vector<flatbuffers::Offset<SubGraph>> *>(
+        VT_SUBGRAPHS);
   }
   const flatbuffers::String *description() const {
     return GetPointer<const flatbuffers::String *>(VT_DESCRIPTION);
   }
   const flatbuffers::Vector<flatbuffers::Offset<Buffer>> *buffers() const {
-    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<Buffer>> *>(VT_BUFFERS);
+    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<Buffer>> *>(
+        VT_BUFFERS);
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
@@ -3054,14 +4474,16 @@ struct Model FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
            verifier.VerifyVectorOfTables(subgraphs()) &&
            VerifyOffset(verifier, VT_DESCRIPTION) &&
            verifier.Verify(description()) &&
-           VerifyOffset(verifier, VT_BUFFERS) &&
-           verifier.Verify(buffers()) &&
-           verifier.VerifyVectorOfTables(buffers()) &&
-           verifier.EndTable();
-  }
-  ModelT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(ModelT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<Model> Pack(flatbuffers::FlatBufferBuilder &_fbb, const ModelT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+           VerifyOffset(verifier, VT_BUFFERS) && verifier.Verify(buffers()) &&
+           verifier.VerifyVectorOfTables(buffers()) && verifier.EndTable();
+  }
+  ModelT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(ModelT *_o, const flatbuffers::resolver_function_t *_resolver =
+                                nullptr) const;
+  static flatbuffers::Offset<Model> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const ModelT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
 struct ModelBuilder {
@@ -3070,20 +4492,26 @@ struct ModelBuilder {
   void add_version(uint32_t version) {
     fbb_.AddElement<uint32_t>(Model::VT_VERSION, version, 0);
   }
-  void add_operator_codes(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<OperatorCode>>> operator_codes) {
+  void add_operator_codes(
+      flatbuffers::Offset<
+          flatbuffers::Vector<flatbuffers::Offset<OperatorCode>>>
+          operator_codes) {
     fbb_.AddOffset(Model::VT_OPERATOR_CODES, operator_codes);
   }
-  void add_subgraphs(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<SubGraph>>> subgraphs) {
+  void add_subgraphs(
+      flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<SubGraph>>>
+          subgraphs) {
     fbb_.AddOffset(Model::VT_SUBGRAPHS, subgraphs);
   }
   void add_description(flatbuffers::Offset<flatbuffers::String> description) {
     fbb_.AddOffset(Model::VT_DESCRIPTION, description);
   }
-  void add_buffers(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Buffer>>> buffers) {
+  void add_buffers(
+      flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Buffer>>>
+          buffers) {
     fbb_.AddOffset(Model::VT_BUFFERS, buffers);
   }
-  explicit ModelBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+  explicit ModelBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
   ModelBuilder &operator=(const ModelBuilder &);
@@ -3095,12 +4523,14 @@ struct ModelBuilder {
 };
 
 inline flatbuffers::Offset<Model> CreateModel(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    uint32_t version = 0,
-    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<OperatorCode>>> operator_codes = 0,
-    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<SubGraph>>> subgraphs = 0,
+    flatbuffers::FlatBufferBuilder &_fbb, uint32_t version = 0,
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<OperatorCode>>>
+        operator_codes = 0,
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<SubGraph>>>
+        subgraphs = 0,
     flatbuffers::Offset<flatbuffers::String> description = 0,
-    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Buffer>>> buffers = 0) {
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Buffer>>>
+        buffers = 0) {
   ModelBuilder builder_(_fbb);
   builder_.add_buffers(buffers);
   builder_.add_description(description);
@@ -3110,891 +4540,2049 @@ inline flatbuffers::Offset<Model> CreateModel(
   return builder_.Finish();
 }
 
-inline flatbuffers::Offset<Model> CreateModelDirect(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    uint32_t version = 0,
-    const std::vector<flatbuffers::Offset<OperatorCode>> *operator_codes = nullptr,
-    const std::vector<flatbuffers::Offset<SubGraph>> *subgraphs = nullptr,
-    const char *description = nullptr,
-    const std::vector<flatbuffers::Offset<Buffer>> *buffers = nullptr) {
-  return tflite::CreateModel(
-      _fbb,
-      version,
-      operator_codes ? _fbb.CreateVector<flatbuffers::Offset<OperatorCode>>(*operator_codes) : 0,
-      subgraphs ? _fbb.CreateVector<flatbuffers::Offset<SubGraph>>(*subgraphs) : 0,
-      description ? _fbb.CreateString(description) : 0,
-      buffers ? _fbb.CreateVector<flatbuffers::Offset<Buffer>>(*buffers) : 0);
+inline flatbuffers::Offset<Model> CreateModelDirect(
+    flatbuffers::FlatBufferBuilder &_fbb, uint32_t version = 0,
+    const std::vector<flatbuffers::Offset<OperatorCode>> *operator_codes =
+        nullptr,
+    const std::vector<flatbuffers::Offset<SubGraph>> *subgraphs = nullptr,
+    const char *description = nullptr,
+    const std::vector<flatbuffers::Offset<Buffer>> *buffers = nullptr) {
+  return tflite::CreateModel(
+      _fbb, version,
+      operator_codes ? _fbb.CreateVector<flatbuffers::Offset<OperatorCode>>(
+                           *operator_codes)
+                     : 0,
+      subgraphs ? _fbb.CreateVector<flatbuffers::Offset<SubGraph>>(*subgraphs)
+                : 0,
+      description ? _fbb.CreateString(description) : 0,
+      buffers ? _fbb.CreateVector<flatbuffers::Offset<Buffer>>(*buffers) : 0);
+}
+
+flatbuffers::Offset<Model> CreateModel(
+    flatbuffers::FlatBufferBuilder &_fbb, const ModelT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+inline QuantizationParametersT *QuantizationParameters::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new QuantizationParametersT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void QuantizationParameters::UnPackTo(
+    QuantizationParametersT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = min();
+    if (_e) {
+      _o->min.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->min[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = max();
+    if (_e) {
+      _o->max.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->max[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = scale();
+    if (_e) {
+      _o->scale.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->scale[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = zero_point();
+    if (_e) {
+      _o->zero_point.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->zero_point[_i] = _e->Get(_i);
+      }
+    }
+  };
+}
+
+inline flatbuffers::Offset<QuantizationParameters> QuantizationParameters::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateQuantizationParameters(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(
+    flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const QuantizationParametersT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _min = _o->min.size() ? _fbb.CreateVector(_o->min) : 0;
+  auto _max = _o->max.size() ? _fbb.CreateVector(_o->max) : 0;
+  auto _scale = _o->scale.size() ? _fbb.CreateVector(_o->scale) : 0;
+  auto _zero_point =
+      _o->zero_point.size() ? _fbb.CreateVector(_o->zero_point) : 0;
+  return tflite::CreateQuantizationParameters(_fbb, _min, _max, _scale,
+                                              _zero_point);
+}
+
+inline TensorT *Tensor::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new TensorT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void Tensor::UnPackTo(
+    TensorT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = shape();
+    if (_e) {
+      _o->shape.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->shape[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = type();
+    _o->type = _e;
+  };
+  {
+    auto _e = buffer();
+    _o->buffer = _e;
+  };
+  {
+    auto _e = name();
+    if (_e) _o->name = _e->str();
+  };
+  {
+    auto _e = quantization();
+    if (_e)
+      _o->quantization =
+          std::unique_ptr<QuantizationParametersT>(_e->UnPack(_resolver));
+  };
+}
+
+inline flatbuffers::Offset<Tensor> Tensor::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateTensor(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<Tensor> CreateTensor(
+    flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const TensorT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _shape = _o->shape.size() ? _fbb.CreateVector(_o->shape) : 0;
+  auto _type = _o->type;
+  auto _buffer = _o->buffer;
+  auto _name = _o->name.empty() ? 0 : _fbb.CreateString(_o->name);
+  auto _quantization = _o->quantization
+                           ? CreateQuantizationParameters(
+                                 _fbb, _o->quantization.get(), _rehasher)
+                           : 0;
+  return tflite::CreateTensor(_fbb, _shape, _type, _buffer, _name,
+                              _quantization);
+}
+
+inline Conv2DOptionsT *Conv2DOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new Conv2DOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void Conv2DOptions::UnPackTo(
+    Conv2DOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = padding();
+    _o->padding = _e;
+  };
+  {
+    auto _e = stride_w();
+    _o->stride_w = _e;
+  };
+  {
+    auto _e = stride_h();
+    _o->stride_h = _e;
+  };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
+}
+
+inline flatbuffers::Offset<Conv2DOptions> Conv2DOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateConv2DOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<Conv2DOptions> CreateConv2DOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const Conv2DOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _padding = _o->padding;
+  auto _stride_w = _o->stride_w;
+  auto _stride_h = _o->stride_h;
+  auto _fused_activation_function = _o->fused_activation_function;
+  return tflite::CreateConv2DOptions(_fbb, _padding, _stride_w, _stride_h,
+                                     _fused_activation_function);
+}
+
+inline Pool2DOptionsT *Pool2DOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new Pool2DOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void Pool2DOptions::UnPackTo(
+    Pool2DOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = padding();
+    _o->padding = _e;
+  };
+  {
+    auto _e = stride_w();
+    _o->stride_w = _e;
+  };
+  {
+    auto _e = stride_h();
+    _o->stride_h = _e;
+  };
+  {
+    auto _e = filter_width();
+    _o->filter_width = _e;
+  };
+  {
+    auto _e = filter_height();
+    _o->filter_height = _e;
+  };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
+}
+
+inline flatbuffers::Offset<Pool2DOptions> Pool2DOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreatePool2DOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<Pool2DOptions> CreatePool2DOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const Pool2DOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _padding = _o->padding;
+  auto _stride_w = _o->stride_w;
+  auto _stride_h = _o->stride_h;
+  auto _filter_width = _o->filter_width;
+  auto _filter_height = _o->filter_height;
+  auto _fused_activation_function = _o->fused_activation_function;
+  return tflite::CreatePool2DOptions(_fbb, _padding, _stride_w, _stride_h,
+                                     _filter_width, _filter_height,
+                                     _fused_activation_function);
+}
+
+inline DepthwiseConv2DOptionsT *DepthwiseConv2DOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new DepthwiseConv2DOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void DepthwiseConv2DOptions::UnPackTo(
+    DepthwiseConv2DOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = padding();
+    _o->padding = _e;
+  };
+  {
+    auto _e = stride_w();
+    _o->stride_w = _e;
+  };
+  {
+    auto _e = stride_h();
+    _o->stride_h = _e;
+  };
+  {
+    auto _e = depth_multiplier();
+    _o->depth_multiplier = _e;
+  };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
+}
+
+inline flatbuffers::Offset<DepthwiseConv2DOptions> DepthwiseConv2DOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateDepthwiseConv2DOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<DepthwiseConv2DOptions> CreateDepthwiseConv2DOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const DepthwiseConv2DOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _padding = _o->padding;
+  auto _stride_w = _o->stride_w;
+  auto _stride_h = _o->stride_h;
+  auto _depth_multiplier = _o->depth_multiplier;
+  auto _fused_activation_function = _o->fused_activation_function;
+  return tflite::CreateDepthwiseConv2DOptions(_fbb, _padding, _stride_w,
+                                              _stride_h, _depth_multiplier,
+                                              _fused_activation_function);
+}
+
+inline ConcatEmbeddingsOptionsT *ConcatEmbeddingsOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new ConcatEmbeddingsOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void ConcatEmbeddingsOptions::UnPackTo(
+    ConcatEmbeddingsOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = num_channels();
+    _o->num_channels = _e;
+  };
+  {
+    auto _e = num_columns_per_channel();
+    if (_e) {
+      _o->num_columns_per_channel.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->num_columns_per_channel[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = embedding_dim_per_channel();
+    if (_e) {
+      _o->embedding_dim_per_channel.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->embedding_dim_per_channel[_i] = _e->Get(_i);
+      }
+    }
+  };
+}
+
+inline flatbuffers::Offset<ConcatEmbeddingsOptions>
+ConcatEmbeddingsOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateConcatEmbeddingsOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<ConcatEmbeddingsOptions>
+CreateConcatEmbeddingsOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const ConcatEmbeddingsOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _num_channels = _o->num_channels;
+  auto _num_columns_per_channel =
+      _o->num_columns_per_channel.size()
+          ? _fbb.CreateVector(_o->num_columns_per_channel)
+          : 0;
+  auto _embedding_dim_per_channel =
+      _o->embedding_dim_per_channel.size()
+          ? _fbb.CreateVector(_o->embedding_dim_per_channel)
+          : 0;
+  return tflite::CreateConcatEmbeddingsOptions(_fbb, _num_channels,
+                                               _num_columns_per_channel,
+                                               _embedding_dim_per_channel);
+}
+
+inline LSHProjectionOptionsT *LSHProjectionOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new LSHProjectionOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void LSHProjectionOptions::UnPackTo(
+    LSHProjectionOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = type();
+    _o->type = _e;
+  };
 }
 
-flatbuffers::Offset<Model> CreateModel(flatbuffers::FlatBufferBuilder &_fbb, const ModelT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+inline flatbuffers::Offset<LSHProjectionOptions> LSHProjectionOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateLSHProjectionOptions(_fbb, _o, _rehasher);
+}
 
-inline QuantizationParametersT *QuantizationParameters::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new QuantizationParametersT();
+inline flatbuffers::Offset<LSHProjectionOptions> CreateLSHProjectionOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const LSHProjectionOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _type = _o->type;
+  return tflite::CreateLSHProjectionOptions(_fbb, _type);
+}
+
+inline SVDFOptionsT *SVDFOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new SVDFOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void QuantizationParameters::UnPackTo(QuantizationParametersT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void SVDFOptions::UnPackTo(
+    SVDFOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = min(); if (_e) { _o->min.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->min[_i] = _e->Get(_i); } } };
-  { auto _e = max(); if (_e) { _o->max.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->max[_i] = _e->Get(_i); } } };
-  { auto _e = scale(); if (_e) { _o->scale.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->scale[_i] = _e->Get(_i); } } };
-  { auto _e = zero_point(); if (_e) { _o->zero_point.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->zero_point[_i] = _e->Get(_i); } } };
+  {
+    auto _e = rank();
+    _o->rank = _e;
+  };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
 }
 
-inline flatbuffers::Offset<QuantizationParameters> QuantizationParameters::Pack(flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateQuantizationParameters(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<SVDFOptions> SVDFOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateSVDFOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<SVDFOptions> CreateSVDFOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const QuantizationParametersT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _min = _o->min.size() ? _fbb.CreateVector(_o->min) : 0;
-  auto _max = _o->max.size() ? _fbb.CreateVector(_o->max) : 0;
-  auto _scale = _o->scale.size() ? _fbb.CreateVector(_o->scale) : 0;
-  auto _zero_point = _o->zero_point.size() ? _fbb.CreateVector(_o->zero_point) : 0;
-  return tflite::CreateQuantizationParameters(
-      _fbb,
-      _min,
-      _max,
-      _scale,
-      _zero_point);
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const SVDFOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _rank = _o->rank;
+  auto _fused_activation_function = _o->fused_activation_function;
+  return tflite::CreateSVDFOptions(_fbb, _rank, _fused_activation_function);
 }
 
-inline TensorT *Tensor::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new TensorT();
+inline RNNOptionsT *RNNOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new RNNOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void Tensor::UnPackTo(TensorT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void RNNOptions::UnPackTo(
+    RNNOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = shape(); if (_e) { _o->shape.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->shape[_i] = _e->Get(_i); } } };
-  { auto _e = type(); _o->type = _e; };
-  { auto _e = buffer(); _o->buffer = _e; };
-  { auto _e = name(); if (_e) _o->name = _e->str(); };
-  { auto _e = quantization(); if (_e) _o->quantization = std::unique_ptr<QuantizationParametersT>(_e->UnPack(_resolver)); };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
 }
 
-inline flatbuffers::Offset<Tensor> Tensor::Pack(flatbuffers::FlatBufferBuilder &_fbb, const TensorT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateTensor(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<RNNOptions> RNNOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateRNNOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<Tensor> CreateTensor(flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<RNNOptions> CreateRNNOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const TensorT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _shape = _o->shape.size() ? _fbb.CreateVector(_o->shape) : 0;
-  auto _type = _o->type;
-  auto _buffer = _o->buffer;
-  auto _name = _o->name.empty() ? 0 : _fbb.CreateString(_o->name);
-  auto _quantization = _o->quantization ? CreateQuantizationParameters(_fbb, _o->quantization.get(), _rehasher) : 0;
-  return tflite::CreateTensor(
-      _fbb,
-      _shape,
-      _type,
-      _buffer,
-      _name,
-      _quantization);
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const RNNOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _fused_activation_function = _o->fused_activation_function;
+  return tflite::CreateRNNOptions(_fbb, _fused_activation_function);
 }
 
-inline Conv2DOptionsT *Conv2DOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new Conv2DOptionsT();
+inline SequenceRNNOptionsT *SequenceRNNOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new SequenceRNNOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void Conv2DOptions::UnPackTo(Conv2DOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void SequenceRNNOptions::UnPackTo(
+    SequenceRNNOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = padding(); _o->padding = _e; };
-  { auto _e = stride_w(); _o->stride_w = _e; };
-  { auto _e = stride_h(); _o->stride_h = _e; };
-  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  {
+    auto _e = time_major();
+    _o->time_major = _e;
+  };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
 }
 
-inline flatbuffers::Offset<Conv2DOptions> Conv2DOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateConv2DOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<SequenceRNNOptions> SequenceRNNOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const SequenceRNNOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateSequenceRNNOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<Conv2DOptions> CreateConv2DOptions(flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<SequenceRNNOptions> CreateSequenceRNNOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SequenceRNNOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const Conv2DOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _padding = _o->padding;
-  auto _stride_w = _o->stride_w;
-  auto _stride_h = _o->stride_h;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const SequenceRNNOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _time_major = _o->time_major;
   auto _fused_activation_function = _o->fused_activation_function;
-  return tflite::CreateConv2DOptions(
-      _fbb,
-      _padding,
-      _stride_w,
-      _stride_h,
-      _fused_activation_function);
+  return tflite::CreateSequenceRNNOptions(_fbb, _time_major,
+                                          _fused_activation_function);
 }
 
-inline Pool2DOptionsT *Pool2DOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new Pool2DOptionsT();
+inline BidirectionalSequenceRNNOptionsT *
+BidirectionalSequenceRNNOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new BidirectionalSequenceRNNOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void Pool2DOptions::UnPackTo(Pool2DOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void BidirectionalSequenceRNNOptions::UnPackTo(
+    BidirectionalSequenceRNNOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = padding(); _o->padding = _e; };
-  { auto _e = stride_w(); _o->stride_w = _e; };
-  { auto _e = stride_h(); _o->stride_h = _e; };
-  { auto _e = filter_width(); _o->filter_width = _e; };
-  { auto _e = filter_height(); _o->filter_height = _e; };
-  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  {
+    auto _e = time_major();
+    _o->time_major = _e;
+  };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
 }
 
-inline flatbuffers::Offset<Pool2DOptions> Pool2DOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreatePool2DOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<BidirectionalSequenceRNNOptions>
+BidirectionalSequenceRNNOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const BidirectionalSequenceRNNOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateBidirectionalSequenceRNNOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<Pool2DOptions> CreatePool2DOptions(flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<BidirectionalSequenceRNNOptions>
+CreateBidirectionalSequenceRNNOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const BidirectionalSequenceRNNOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const Pool2DOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _padding = _o->padding;
-  auto _stride_w = _o->stride_w;
-  auto _stride_h = _o->stride_h;
-  auto _filter_width = _o->filter_width;
-  auto _filter_height = _o->filter_height;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const BidirectionalSequenceRNNOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _time_major = _o->time_major;
   auto _fused_activation_function = _o->fused_activation_function;
-  return tflite::CreatePool2DOptions(
-      _fbb,
-      _padding,
-      _stride_w,
-      _stride_h,
-      _filter_width,
-      _filter_height,
-      _fused_activation_function);
+  return tflite::CreateBidirectionalSequenceRNNOptions(
+      _fbb, _time_major, _fused_activation_function);
 }
 
-inline DepthwiseConv2DOptionsT *DepthwiseConv2DOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new DepthwiseConv2DOptionsT();
+inline FullyConnectedOptionsT *FullyConnectedOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new FullyConnectedOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void DepthwiseConv2DOptions::UnPackTo(DepthwiseConv2DOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void FullyConnectedOptions::UnPackTo(
+    FullyConnectedOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = padding(); _o->padding = _e; };
-  { auto _e = stride_w(); _o->stride_w = _e; };
-  { auto _e = stride_h(); _o->stride_h = _e; };
-  { auto _e = depth_multiplier(); _o->depth_multiplier = _e; };
-  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
 }
 
-inline flatbuffers::Offset<DepthwiseConv2DOptions> DepthwiseConv2DOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateDepthwiseConv2DOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<FullyConnectedOptions> FullyConnectedOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateFullyConnectedOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<DepthwiseConv2DOptions> CreateDepthwiseConv2DOptions(flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<FullyConnectedOptions> CreateFullyConnectedOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const DepthwiseConv2DOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _padding = _o->padding;
-  auto _stride_w = _o->stride_w;
-  auto _stride_h = _o->stride_h;
-  auto _depth_multiplier = _o->depth_multiplier;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const FullyConnectedOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _fused_activation_function = _o->fused_activation_function;
-  return tflite::CreateDepthwiseConv2DOptions(
-      _fbb,
-      _padding,
-      _stride_w,
-      _stride_h,
-      _depth_multiplier,
-      _fused_activation_function);
+  return tflite::CreateFullyConnectedOptions(_fbb, _fused_activation_function);
 }
 
-inline ConcatEmbeddingsOptionsT *ConcatEmbeddingsOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new ConcatEmbeddingsOptionsT();
+inline SoftmaxOptionsT *SoftmaxOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new SoftmaxOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void ConcatEmbeddingsOptions::UnPackTo(ConcatEmbeddingsOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void SoftmaxOptions::UnPackTo(
+    SoftmaxOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = num_channels(); _o->num_channels = _e; };
-  { auto _e = num_columns_per_channel(); if (_e) { _o->num_columns_per_channel.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->num_columns_per_channel[_i] = _e->Get(_i); } } };
-  { auto _e = embedding_dim_per_channel(); if (_e) { _o->embedding_dim_per_channel.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->embedding_dim_per_channel[_i] = _e->Get(_i); } } };
+  {
+    auto _e = beta();
+    _o->beta = _e;
+  };
 }
 
-inline flatbuffers::Offset<ConcatEmbeddingsOptions> ConcatEmbeddingsOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateConcatEmbeddingsOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<SoftmaxOptions> SoftmaxOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateSoftmaxOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<ConcatEmbeddingsOptions> CreateConcatEmbeddingsOptions(flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<SoftmaxOptions> CreateSoftmaxOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const ConcatEmbeddingsOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _num_channels = _o->num_channels;
-  auto _num_columns_per_channel = _o->num_columns_per_channel.size() ? _fbb.CreateVector(_o->num_columns_per_channel) : 0;
-  auto _embedding_dim_per_channel = _o->embedding_dim_per_channel.size() ? _fbb.CreateVector(_o->embedding_dim_per_channel) : 0;
-  return tflite::CreateConcatEmbeddingsOptions(
-      _fbb,
-      _num_channels,
-      _num_columns_per_channel,
-      _embedding_dim_per_channel);
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const SoftmaxOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _beta = _o->beta;
+  return tflite::CreateSoftmaxOptions(_fbb, _beta);
 }
 
-inline LSHProjectionOptionsT *LSHProjectionOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new LSHProjectionOptionsT();
+inline ConcatenationOptionsT *ConcatenationOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new ConcatenationOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void LSHProjectionOptions::UnPackTo(LSHProjectionOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void ConcatenationOptions::UnPackTo(
+    ConcatenationOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = type(); _o->type = _e; };
+  {
+    auto _e = axis();
+    _o->axis = _e;
+  };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
 }
 
-inline flatbuffers::Offset<LSHProjectionOptions> LSHProjectionOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateLSHProjectionOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<ConcatenationOptions> ConcatenationOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateConcatenationOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<LSHProjectionOptions> CreateLSHProjectionOptions(flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<ConcatenationOptions> CreateConcatenationOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const LSHProjectionOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _type = _o->type;
-  return tflite::CreateLSHProjectionOptions(
-      _fbb,
-      _type);
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const ConcatenationOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _axis = _o->axis;
+  auto _fused_activation_function = _o->fused_activation_function;
+  return tflite::CreateConcatenationOptions(_fbb, _axis,
+                                            _fused_activation_function);
 }
 
-inline SVDFOptionsT *SVDFOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new SVDFOptionsT();
+inline AddOptionsT *AddOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new AddOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void SVDFOptions::UnPackTo(SVDFOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void AddOptions::UnPackTo(
+    AddOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = rank(); _o->rank = _e; };
-  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
 }
 
-inline flatbuffers::Offset<SVDFOptions> SVDFOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateSVDFOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<AddOptions> AddOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateAddOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<SVDFOptions> CreateSVDFOptions(flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<AddOptions> CreateAddOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const SVDFOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _rank = _o->rank;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const AddOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _fused_activation_function = _o->fused_activation_function;
-  return tflite::CreateSVDFOptions(
-      _fbb,
-      _rank,
-      _fused_activation_function);
+  return tflite::CreateAddOptions(_fbb, _fused_activation_function);
 }
 
-inline RNNOptionsT *RNNOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new RNNOptionsT();
+inline MulOptionsT *MulOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new MulOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void RNNOptions::UnPackTo(RNNOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void MulOptions::UnPackTo(
+    MulOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
 }
 
-inline flatbuffers::Offset<RNNOptions> RNNOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateRNNOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<MulOptions> MulOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateMulOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<RNNOptions> CreateRNNOptions(flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<MulOptions> CreateMulOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const RNNOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const MulOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _fused_activation_function = _o->fused_activation_function;
-  return tflite::CreateRNNOptions(
-      _fbb,
-      _fused_activation_function);
+  return tflite::CreateMulOptions(_fbb, _fused_activation_function);
 }
 
-inline FullyConnectedOptionsT *FullyConnectedOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new FullyConnectedOptionsT();
+inline L2NormOptionsT *L2NormOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new L2NormOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void FullyConnectedOptions::UnPackTo(FullyConnectedOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void L2NormOptions::UnPackTo(
+    L2NormOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
 }
 
-inline flatbuffers::Offset<FullyConnectedOptions> FullyConnectedOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateFullyConnectedOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<L2NormOptions> L2NormOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateL2NormOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<FullyConnectedOptions> CreateFullyConnectedOptions(flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<L2NormOptions> CreateL2NormOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const FullyConnectedOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const L2NormOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _fused_activation_function = _o->fused_activation_function;
-  return tflite::CreateFullyConnectedOptions(
-      _fbb,
-      _fused_activation_function);
+  return tflite::CreateL2NormOptions(_fbb, _fused_activation_function);
 }
 
-inline SoftmaxOptionsT *SoftmaxOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new SoftmaxOptionsT();
+inline LocalResponseNormalizationOptionsT *
+LocalResponseNormalizationOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new LocalResponseNormalizationOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void SoftmaxOptions::UnPackTo(SoftmaxOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void LocalResponseNormalizationOptions::UnPackTo(
+    LocalResponseNormalizationOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = beta(); _o->beta = _e; };
+  {
+    auto _e = radius();
+    _o->radius = _e;
+  };
+  {
+    auto _e = bias();
+    _o->bias = _e;
+  };
+  {
+    auto _e = alpha();
+    _o->alpha = _e;
+  };
+  {
+    auto _e = beta();
+    _o->beta = _e;
+  };
 }
 
-inline flatbuffers::Offset<SoftmaxOptions> SoftmaxOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateSoftmaxOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<LocalResponseNormalizationOptions>
+LocalResponseNormalizationOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const LocalResponseNormalizationOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateLocalResponseNormalizationOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<SoftmaxOptions> CreateSoftmaxOptions(flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<LocalResponseNormalizationOptions>
+CreateLocalResponseNormalizationOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const LocalResponseNormalizationOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const SoftmaxOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const LocalResponseNormalizationOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _radius = _o->radius;
+  auto _bias = _o->bias;
+  auto _alpha = _o->alpha;
   auto _beta = _o->beta;
-  return tflite::CreateSoftmaxOptions(
-      _fbb,
-      _beta);
+  return tflite::CreateLocalResponseNormalizationOptions(_fbb, _radius, _bias,
+                                                         _alpha, _beta);
 }
 
-inline ConcatenationOptionsT *ConcatenationOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new ConcatenationOptionsT();
+inline LSTMOptionsT *LSTMOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new LSTMOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void ConcatenationOptions::UnPackTo(ConcatenationOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void LSTMOptions::UnPackTo(
+    LSTMOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = axis(); _o->axis = _e; };
-  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
+  {
+    auto _e = cell_clip();
+    _o->cell_clip = _e;
+  };
+  {
+    auto _e = proj_clip();
+    _o->proj_clip = _e;
+  };
 }
 
-inline flatbuffers::Offset<ConcatenationOptions> ConcatenationOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateConcatenationOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<LSTMOptions> LSTMOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateLSTMOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<ConcatenationOptions> CreateConcatenationOptions(flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const ConcatenationOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _axis = _o->axis;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const LSTMOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _fused_activation_function = _o->fused_activation_function;
-  return tflite::CreateConcatenationOptions(
-      _fbb,
-      _axis,
-      _fused_activation_function);
+  auto _cell_clip = _o->cell_clip;
+  auto _proj_clip = _o->proj_clip;
+  return tflite::CreateLSTMOptions(_fbb, _fused_activation_function, _cell_clip,
+                                   _proj_clip);
 }
 
-inline AddOptionsT *AddOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new AddOptionsT();
+inline ResizeBilinearOptionsT *ResizeBilinearOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new ResizeBilinearOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void AddOptions::UnPackTo(AddOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void ResizeBilinearOptions::UnPackTo(
+    ResizeBilinearOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  {
+    auto _e = align_corners();
+    _o->align_corners = _e;
+  };
 }
 
-inline flatbuffers::Offset<AddOptions> AddOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateAddOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<ResizeBilinearOptions> ResizeBilinearOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateResizeBilinearOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<AddOptions> CreateAddOptions(flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const AddOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _fused_activation_function = _o->fused_activation_function;
-  return tflite::CreateAddOptions(
-      _fbb,
-      _fused_activation_function);
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const ResizeBilinearOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _align_corners = _o->align_corners;
+  return tflite::CreateResizeBilinearOptions(_fbb, _align_corners);
+}
+
+inline CallOptionsT *CallOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new CallOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void CallOptions::UnPackTo(
+    CallOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = subgraph();
+    _o->subgraph = _e;
+  };
+}
+
+inline flatbuffers::Offset<CallOptions> CallOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateCallOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<CallOptions> CreateCallOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const CallOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _subgraph = _o->subgraph;
+  return tflite::CreateCallOptions(_fbb, _subgraph);
+}
+
+inline PadOptionsT *PadOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new PadOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void PadOptions::UnPackTo(
+    PadOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<PadOptions> PadOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const PadOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreatePadOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<PadOptions> CreatePadOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const PadOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const PadOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  return tflite::CreatePadOptions(_fbb);
+}
+
+inline ReshapeOptionsT *ReshapeOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new ReshapeOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void ReshapeOptions::UnPackTo(
+    ReshapeOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = new_shape();
+    if (_e) {
+      _o->new_shape.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->new_shape[_i] = _e->Get(_i);
+      }
+    }
+  };
+}
+
+inline flatbuffers::Offset<ReshapeOptions> ReshapeOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateReshapeOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<ReshapeOptions> CreateReshapeOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const ReshapeOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _new_shape = _o->new_shape.size() ? _fbb.CreateVector(_o->new_shape) : 0;
+  return tflite::CreateReshapeOptions(_fbb, _new_shape);
+}
+
+inline SpaceToBatchNDOptionsT *SpaceToBatchNDOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new SpaceToBatchNDOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void SpaceToBatchNDOptions::UnPackTo(
+    SpaceToBatchNDOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<SpaceToBatchNDOptions> SpaceToBatchNDOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const SpaceToBatchNDOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateSpaceToBatchNDOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<SpaceToBatchNDOptions> CreateSpaceToBatchNDOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SpaceToBatchNDOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const SpaceToBatchNDOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  return tflite::CreateSpaceToBatchNDOptions(_fbb);
+}
+
+inline BatchToSpaceNDOptionsT *BatchToSpaceNDOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new BatchToSpaceNDOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void BatchToSpaceNDOptions::UnPackTo(
+    BatchToSpaceNDOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<BatchToSpaceNDOptions> BatchToSpaceNDOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const BatchToSpaceNDOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateBatchToSpaceNDOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<BatchToSpaceNDOptions> CreateBatchToSpaceNDOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const BatchToSpaceNDOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const BatchToSpaceNDOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  return tflite::CreateBatchToSpaceNDOptions(_fbb);
+}
+
+inline SkipGramOptionsT *SkipGramOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new SkipGramOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void SkipGramOptions::UnPackTo(
+    SkipGramOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = ngram_size();
+    _o->ngram_size = _e;
+  };
+  {
+    auto _e = max_skip_size();
+    _o->max_skip_size = _e;
+  };
+  {
+    auto _e = include_all_ngrams();
+    _o->include_all_ngrams = _e;
+  };
+}
+
+inline flatbuffers::Offset<SkipGramOptions> SkipGramOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateSkipGramOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<SkipGramOptions> CreateSkipGramOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const SkipGramOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _ngram_size = _o->ngram_size;
+  auto _max_skip_size = _o->max_skip_size;
+  auto _include_all_ngrams = _o->include_all_ngrams;
+  return tflite::CreateSkipGramOptions(_fbb, _ngram_size, _max_skip_size,
+                                       _include_all_ngrams);
 }
 
-inline MulOptionsT *MulOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new MulOptionsT();
+inline SpaceToDepthOptionsT *SpaceToDepthOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new SpaceToDepthOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void MulOptions::UnPackTo(MulOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void SpaceToDepthOptions::UnPackTo(
+    SpaceToDepthOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  {
+    auto _e = block_size();
+    _o->block_size = _e;
+  };
 }
 
-inline flatbuffers::Offset<MulOptions> MulOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateMulOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<SpaceToDepthOptions> SpaceToDepthOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateSpaceToDepthOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<MulOptions> CreateMulOptions(flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<SpaceToDepthOptions> CreateSpaceToDepthOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const MulOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _fused_activation_function = _o->fused_activation_function;
-  return tflite::CreateMulOptions(
-      _fbb,
-      _fused_activation_function);
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const SpaceToDepthOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _block_size = _o->block_size;
+  return tflite::CreateSpaceToDepthOptions(_fbb, _block_size);
 }
 
-inline L2NormOptionsT *L2NormOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new L2NormOptionsT();
+inline SubOptionsT *SubOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new SubOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void L2NormOptions::UnPackTo(L2NormOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void SubOptions::UnPackTo(
+    SubOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
 }
 
-inline flatbuffers::Offset<L2NormOptions> L2NormOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateL2NormOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<SubOptions> SubOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const SubOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateSubOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<L2NormOptions> CreateL2NormOptions(flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<SubOptions> CreateSubOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SubOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const L2NormOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const SubOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _fused_activation_function = _o->fused_activation_function;
-  return tflite::CreateL2NormOptions(
-      _fbb,
-      _fused_activation_function);
+  return tflite::CreateSubOptions(_fbb, _fused_activation_function);
 }
 
-inline LocalResponseNormalizationOptionsT *LocalResponseNormalizationOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new LocalResponseNormalizationOptionsT();
+inline DivOptionsT *DivOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new DivOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void LocalResponseNormalizationOptions::UnPackTo(LocalResponseNormalizationOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void DivOptions::UnPackTo(
+    DivOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = radius(); _o->radius = _e; };
-  { auto _e = bias(); _o->bias = _e; };
-  { auto _e = alpha(); _o->alpha = _e; };
-  { auto _e = beta(); _o->beta = _e; };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
 }
 
-inline flatbuffers::Offset<LocalResponseNormalizationOptions> LocalResponseNormalizationOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const LocalResponseNormalizationOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateLocalResponseNormalizationOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<DivOptions> DivOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const DivOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateDivOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<LocalResponseNormalizationOptions> CreateLocalResponseNormalizationOptions(flatbuffers::FlatBufferBuilder &_fbb, const LocalResponseNormalizationOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<DivOptions> CreateDivOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const DivOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const LocalResponseNormalizationOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _radius = _o->radius;
-  auto _bias = _o->bias;
-  auto _alpha = _o->alpha;
-  auto _beta = _o->beta;
-  return tflite::CreateLocalResponseNormalizationOptions(
-      _fbb,
-      _radius,
-      _bias,
-      _alpha,
-      _beta);
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const DivOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _fused_activation_function = _o->fused_activation_function;
+  return tflite::CreateDivOptions(_fbb, _fused_activation_function);
 }
 
-inline LSTMOptionsT *LSTMOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new LSTMOptionsT();
+inline EmbeddingLookupSparseOptionsT *EmbeddingLookupSparseOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new EmbeddingLookupSparseOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void LSTMOptions::UnPackTo(LSTMOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void EmbeddingLookupSparseOptions::UnPackTo(
+    EmbeddingLookupSparseOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
-  { auto _e = cell_clip(); _o->cell_clip = _e; };
-  { auto _e = proj_clip(); _o->proj_clip = _e; };
+  {
+    auto _e = combiner();
+    _o->combiner = _e;
+  };
 }
 
-inline flatbuffers::Offset<LSTMOptions> LSTMOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateLSTMOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<EmbeddingLookupSparseOptions>
+EmbeddingLookupSparseOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const EmbeddingLookupSparseOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateEmbeddingLookupSparseOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<EmbeddingLookupSparseOptions>
+CreateEmbeddingLookupSparseOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const EmbeddingLookupSparseOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const LSTMOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _fused_activation_function = _o->fused_activation_function;
-  auto _cell_clip = _o->cell_clip;
-  auto _proj_clip = _o->proj_clip;
-  return tflite::CreateLSTMOptions(
-      _fbb,
-      _fused_activation_function,
-      _cell_clip,
-      _proj_clip);
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const EmbeddingLookupSparseOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _combiner = _o->combiner;
+  return tflite::CreateEmbeddingLookupSparseOptions(_fbb, _combiner);
 }
 
-inline ResizeBilinearOptionsT *ResizeBilinearOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new ResizeBilinearOptionsT();
+inline GatherOptionsT *GatherOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new GatherOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void ResizeBilinearOptions::UnPackTo(ResizeBilinearOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void GatherOptions::UnPackTo(
+    GatherOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = new_height(); _o->new_height = _e; };
-  { auto _e = new_width(); _o->new_width = _e; };
+  {
+    auto _e = axis();
+    _o->axis = _e;
+  };
 }
 
-inline flatbuffers::Offset<ResizeBilinearOptions> ResizeBilinearOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateResizeBilinearOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<GatherOptions> GatherOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const GatherOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateGatherOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<GatherOptions> CreateGatherOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const GatherOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const ResizeBilinearOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _new_height = _o->new_height;
-  auto _new_width = _o->new_width;
-  return tflite::CreateResizeBilinearOptions(
-      _fbb,
-      _new_height,
-      _new_width);
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const GatherOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _axis = _o->axis;
+  return tflite::CreateGatherOptions(_fbb, _axis);
 }
 
-inline CallOptionsT *CallOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new CallOptionsT();
+inline TransposeOptionsT *TransposeOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new TransposeOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void CallOptions::UnPackTo(CallOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void TransposeOptions::UnPackTo(
+    TransposeOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = subgraph(); _o->subgraph = _e; };
 }
 
-inline flatbuffers::Offset<CallOptions> CallOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateCallOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<TransposeOptions> TransposeOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const TransposeOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateTransposeOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<CallOptions> CreateCallOptions(flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<TransposeOptions> CreateTransposeOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const TransposeOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const CallOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _subgraph = _o->subgraph;
-  return tflite::CreateCallOptions(
-      _fbb,
-      _subgraph);
-}
-
-inline ReshapeOptionsT *ReshapeOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new ReshapeOptionsT();
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const TransposeOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  return tflite::CreateTransposeOptions(_fbb);
+}
+
+inline ExpOptionsT *ExpOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new ExpOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void ReshapeOptions::UnPackTo(ReshapeOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void ExpOptions::UnPackTo(
+    ExpOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = new_shape(); if (_e) { _o->new_shape.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->new_shape[_i] = _e->Get(_i); } } };
 }
 
-inline flatbuffers::Offset<ReshapeOptions> ReshapeOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateReshapeOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<ExpOptions> ExpOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const ExpOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateExpOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<ReshapeOptions> CreateReshapeOptions(flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<ExpOptions> CreateExpOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ExpOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const ReshapeOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _new_shape = _o->new_shape.size() ? _fbb.CreateVector(_o->new_shape) : 0;
-  return tflite::CreateReshapeOptions(
-      _fbb,
-      _new_shape);
-}
-
-inline SkipGramOptionsT *SkipGramOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new SkipGramOptionsT();
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const ExpOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  return tflite::CreateExpOptions(_fbb);
+}
+
+inline MeanOptionsT *MeanOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new MeanOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void SkipGramOptions::UnPackTo(SkipGramOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void MeanOptions::UnPackTo(
+    MeanOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = ngram_size(); _o->ngram_size = _e; };
-  { auto _e = max_skip_size(); _o->max_skip_size = _e; };
-  { auto _e = include_all_ngrams(); _o->include_all_ngrams = _e; };
+  {
+    auto _e = keep_dims();
+    _o->keep_dims = _e;
+  };
 }
 
-inline flatbuffers::Offset<SkipGramOptions> SkipGramOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateSkipGramOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<MeanOptions> MeanOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const MeanOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateMeanOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<SkipGramOptions> CreateSkipGramOptions(flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<MeanOptions> CreateMeanOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const MeanOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const SkipGramOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _ngram_size = _o->ngram_size;
-  auto _max_skip_size = _o->max_skip_size;
-  auto _include_all_ngrams = _o->include_all_ngrams;
-  return tflite::CreateSkipGramOptions(
-      _fbb,
-      _ngram_size,
-      _max_skip_size,
-      _include_all_ngrams);
-}
-
-inline SpaceToDepthOptionsT *SpaceToDepthOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new SpaceToDepthOptionsT();
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const MeanOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _keep_dims = _o->keep_dims;
+  return tflite::CreateMeanOptions(_fbb, _keep_dims);
+}
+
+inline SqueezeOptionsT *SqueezeOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new SqueezeOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void SpaceToDepthOptions::UnPackTo(SpaceToDepthOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void SqueezeOptions::UnPackTo(
+    SqueezeOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = block_size(); _o->block_size = _e; };
+  {
+    auto _e = squeeze_dims();
+    if (_e) {
+      _o->squeeze_dims.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->squeeze_dims[_i] = _e->Get(_i);
+      }
+    }
+  };
 }
 
-inline flatbuffers::Offset<SpaceToDepthOptions> SpaceToDepthOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateSpaceToDepthOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<SqueezeOptions> SqueezeOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const SqueezeOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateSqueezeOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<SpaceToDepthOptions> CreateSpaceToDepthOptions(flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<SqueezeOptions> CreateSqueezeOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SqueezeOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const SpaceToDepthOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _block_size = _o->block_size;
-  return tflite::CreateSpaceToDepthOptions(
-      _fbb,
-      _block_size);
-}
-
-inline EmbeddingLookupSparseOptionsT *EmbeddingLookupSparseOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new EmbeddingLookupSparseOptionsT();
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const SqueezeOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _squeeze_dims =
+      _o->squeeze_dims.size() ? _fbb.CreateVector(_o->squeeze_dims) : 0;
+  return tflite::CreateSqueezeOptions(_fbb, _squeeze_dims);
+}
+
+inline StridedSliceOptionsT *StridedSliceOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new StridedSliceOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void EmbeddingLookupSparseOptions::UnPackTo(EmbeddingLookupSparseOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void StridedSliceOptions::UnPackTo(
+    StridedSliceOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = combiner(); _o->combiner = _e; };
+  {
+    auto _e = begin_mask();
+    _o->begin_mask = _e;
+  };
+  {
+    auto _e = end_mask();
+    _o->end_mask = _e;
+  };
+  {
+    auto _e = ellipsis_mask();
+    _o->ellipsis_mask = _e;
+  };
+  {
+    auto _e = new_axis_mask();
+    _o->new_axis_mask = _e;
+  };
+  {
+    auto _e = shrink_axis_mask();
+    _o->shrink_axis_mask = _e;
+  };
 }
 
-inline flatbuffers::Offset<EmbeddingLookupSparseOptions> EmbeddingLookupSparseOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const EmbeddingLookupSparseOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateEmbeddingLookupSparseOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<StridedSliceOptions> StridedSliceOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const StridedSliceOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateStridedSliceOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<EmbeddingLookupSparseOptions> CreateEmbeddingLookupSparseOptions(flatbuffers::FlatBufferBuilder &_fbb, const EmbeddingLookupSparseOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<StridedSliceOptions> CreateStridedSliceOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const StridedSliceOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const EmbeddingLookupSparseOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _combiner = _o->combiner;
-  return tflite::CreateEmbeddingLookupSparseOptions(
-      _fbb,
-      _combiner);
-}
-
-inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const StridedSliceOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _begin_mask = _o->begin_mask;
+  auto _end_mask = _o->end_mask;
+  auto _ellipsis_mask = _o->ellipsis_mask;
+  auto _new_axis_mask = _o->new_axis_mask;
+  auto _shrink_axis_mask = _o->shrink_axis_mask;
+  return tflite::CreateStridedSliceOptions(_fbb, _begin_mask, _end_mask,
+                                           _ellipsis_mask, _new_axis_mask,
+                                           _shrink_axis_mask);
+}
+
+inline OperatorCodeT *OperatorCode::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorCodeT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void OperatorCode::UnPackTo(OperatorCodeT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void OperatorCode::UnPackTo(
+    OperatorCodeT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = builtin_code(); _o->builtin_code = _e; };
-  { auto _e = custom_code(); if (_e) _o->custom_code = _e->str(); };
+  {
+    auto _e = builtin_code();
+    _o->builtin_code = _e;
+  };
+  {
+    auto _e = custom_code();
+    if (_e) _o->custom_code = _e->str();
+  };
 }
 
-inline flatbuffers::Offset<OperatorCode> OperatorCode::Pack(flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<OperatorCode> OperatorCode::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateOperatorCode(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<OperatorCode> CreateOperatorCode(flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<OperatorCode> CreateOperatorCode(
+    flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const OperatorCodeT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const OperatorCodeT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _builtin_code = _o->builtin_code;
-  auto _custom_code = _o->custom_code.empty() ? 0 : _fbb.CreateString(_o->custom_code);
-  return tflite::CreateOperatorCode(
-      _fbb,
-      _builtin_code,
-      _custom_code);
+  auto _custom_code =
+      _o->custom_code.empty() ? 0 : _fbb.CreateString(_o->custom_code);
+  return tflite::CreateOperatorCode(_fbb, _builtin_code, _custom_code);
 }
 
-inline OperatorT *Operator::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline OperatorT *Operator::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void Operator::UnPackTo(OperatorT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void Operator::UnPackTo(
+    OperatorT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = opcode_index(); _o->opcode_index = _e; };
-  { auto _e = inputs(); if (_e) { _o->inputs.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->inputs[_i] = _e->Get(_i); } } };
-  { auto _e = outputs(); if (_e) { _o->outputs.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->outputs[_i] = _e->Get(_i); } } };
-  { auto _e = builtin_options_type(); _o->builtin_options.type = _e; };
-  { auto _e = builtin_options(); if (_e) _o->builtin_options.value = BuiltinOptionsUnion::UnPack(_e, builtin_options_type(), _resolver); };
-  { auto _e = custom_options(); if (_e) { _o->custom_options.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->custom_options[_i] = _e->Get(_i); } } };
-  { auto _e = custom_options_format(); _o->custom_options_format = _e; };
+  {
+    auto _e = opcode_index();
+    _o->opcode_index = _e;
+  };
+  {
+    auto _e = inputs();
+    if (_e) {
+      _o->inputs.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->inputs[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = outputs();
+    if (_e) {
+      _o->outputs.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->outputs[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = builtin_options_type();
+    _o->builtin_options.type = _e;
+  };
+  {
+    auto _e = builtin_options();
+    if (_e)
+      _o->builtin_options.value =
+          BuiltinOptionsUnion::UnPack(_e, builtin_options_type(), _resolver);
+  };
+  {
+    auto _e = custom_options();
+    if (_e) {
+      _o->custom_options.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->custom_options[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = custom_options_format();
+    _o->custom_options_format = _e;
+  };
 }
 
-inline flatbuffers::Offset<Operator> Operator::Pack(flatbuffers::FlatBufferBuilder &_fbb, const OperatorT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<Operator> Operator::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const OperatorT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateOperator(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<Operator> CreateOperator(flatbuffers::FlatBufferBuilder &_fbb, const OperatorT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<Operator> CreateOperator(
+    flatbuffers::FlatBufferBuilder &_fbb, const OperatorT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const OperatorT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const OperatorT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _opcode_index = _o->opcode_index;
   auto _inputs = _o->inputs.size() ? _fbb.CreateVector(_o->inputs) : 0;
   auto _outputs = _o->outputs.size() ? _fbb.CreateVector(_o->outputs) : 0;
   auto _builtin_options_type = _o->builtin_options.type;
   auto _builtin_options = _o->builtin_options.Pack(_fbb);
-  auto _custom_options = _o->custom_options.size() ? _fbb.CreateVector(_o->custom_options) : 0;
+  auto _custom_options =
+      _o->custom_options.size() ? _fbb.CreateVector(_o->custom_options) : 0;
   auto _custom_options_format = _o->custom_options_format;
-  return tflite::CreateOperator(
-      _fbb,
-      _opcode_index,
-      _inputs,
-      _outputs,
-      _builtin_options_type,
-      _builtin_options,
-      _custom_options,
-      _custom_options_format);
+  return tflite::CreateOperator(_fbb, _opcode_index, _inputs, _outputs,
+                                _builtin_options_type, _builtin_options,
+                                _custom_options, _custom_options_format);
 }
 
-inline SubGraphT *SubGraph::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline SubGraphT *SubGraph::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new SubGraphT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void SubGraph::UnPackTo(SubGraphT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void SubGraph::UnPackTo(
+    SubGraphT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = tensors(); if (_e) { _o->tensors.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->tensors[_i] = std::unique_ptr<TensorT>(_e->Get(_i)->UnPack(_resolver)); } } };
-  { auto _e = inputs(); if (_e) { _o->inputs.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->inputs[_i] = _e->Get(_i); } } };
-  { auto _e = outputs(); if (_e) { _o->outputs.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->outputs[_i] = _e->Get(_i); } } };
-  { auto _e = operators(); if (_e) { _o->operators.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->operators[_i] = std::unique_ptr<OperatorT>(_e->Get(_i)->UnPack(_resolver)); } } };
-  { auto _e = name(); if (_e) _o->name = _e->str(); };
+  {
+    auto _e = tensors();
+    if (_e) {
+      _o->tensors.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->tensors[_i] =
+            std::unique_ptr<TensorT>(_e->Get(_i)->UnPack(_resolver));
+      }
+    }
+  };
+  {
+    auto _e = inputs();
+    if (_e) {
+      _o->inputs.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->inputs[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = outputs();
+    if (_e) {
+      _o->outputs.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->outputs[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = operators();
+    if (_e) {
+      _o->operators.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->operators[_i] =
+            std::unique_ptr<OperatorT>(_e->Get(_i)->UnPack(_resolver));
+      }
+    }
+  };
+  {
+    auto _e = name();
+    if (_e) _o->name = _e->str();
+  };
 }
 
-inline flatbuffers::Offset<SubGraph> SubGraph::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<SubGraph> SubGraph::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateSubGraph(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<SubGraph> CreateSubGraph(flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<SubGraph> CreateSubGraph(
+    flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const SubGraphT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  auto _tensors = _o->tensors.size() ? _fbb.CreateVector<flatbuffers::Offset<Tensor>> (_o->tensors.size(), [](size_t i, _VectorArgs *__va) { return CreateTensor(*__va->__fbb, __va->__o->tensors[i].get(), __va->__rehasher); }, &_va ) : 0;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const SubGraphT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _tensors =
+      _o->tensors.size()
+          ? _fbb.CreateVector<flatbuffers::Offset<Tensor>>(
+                _o->tensors.size(),
+                [](size_t i, _VectorArgs *__va) {
+                  return CreateTensor(*__va->__fbb, __va->__o->tensors[i].get(),
+                                      __va->__rehasher);
+                },
+                &_va)
+          : 0;
   auto _inputs = _o->inputs.size() ? _fbb.CreateVector(_o->inputs) : 0;
   auto _outputs = _o->outputs.size() ? _fbb.CreateVector(_o->outputs) : 0;
-  auto _operators = _o->operators.size() ? _fbb.CreateVector<flatbuffers::Offset<Operator>> (_o->operators.size(), [](size_t i, _VectorArgs *__va) { return CreateOperator(*__va->__fbb, __va->__o->operators[i].get(), __va->__rehasher); }, &_va ) : 0;
+  auto _operators = _o->operators.size()
+                        ? _fbb.CreateVector<flatbuffers::Offset<Operator>>(
+                              _o->operators.size(),
+                              [](size_t i, _VectorArgs *__va) {
+                                return CreateOperator(
+                                    *__va->__fbb, __va->__o->operators[i].get(),
+                                    __va->__rehasher);
+                              },
+                              &_va)
+                        : 0;
   auto _name = _o->name.empty() ? 0 : _fbb.CreateString(_o->name);
-  return tflite::CreateSubGraph(
-      _fbb,
-      _tensors,
-      _inputs,
-      _outputs,
-      _operators,
-      _name);
+  return tflite::CreateSubGraph(_fbb, _tensors, _inputs, _outputs, _operators,
+                                _name);
 }
 
-inline BufferT *Buffer::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline BufferT *Buffer::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new BufferT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void Buffer::UnPackTo(BufferT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void Buffer::UnPackTo(
+    BufferT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = data(); if (_e) { _o->data.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->data[_i] = _e->Get(_i); } } };
+  {
+    auto _e = data();
+    if (_e) {
+      _o->data.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->data[_i] = _e->Get(_i);
+      }
+    }
+  };
 }
 
-inline flatbuffers::Offset<Buffer> Buffer::Pack(flatbuffers::FlatBufferBuilder &_fbb, const BufferT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<Buffer> Buffer::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const BufferT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateBuffer(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<Buffer> CreateBuffer(flatbuffers::FlatBufferBuilder &_fbb, const BufferT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<Buffer> CreateBuffer(
+    flatbuffers::FlatBufferBuilder &_fbb, const BufferT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const BufferT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const BufferT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _data = _o->data.size() ? _fbb.CreateVector(_o->data) : 0;
-  return tflite::CreateBuffer(
-      _fbb,
-      _data);
+  return tflite::CreateBuffer(_fbb, _data);
 }
 
-inline ModelT *Model::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+inline ModelT *Model::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new ModelT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void Model::UnPackTo(ModelT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void Model::UnPackTo(
+    ModelT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
-  { auto _e = version(); _o->version = _e; };
-  { auto _e = operator_codes(); if (_e) { _o->operator_codes.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->operator_codes[_i] = std::unique_ptr<OperatorCodeT>(_e->Get(_i)->UnPack(_resolver)); } } };
-  { auto _e = subgraphs(); if (_e) { _o->subgraphs.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->subgraphs[_i] = std::unique_ptr<SubGraphT>(_e->Get(_i)->UnPack(_resolver)); } } };
-  { auto _e = description(); if (_e) _o->description = _e->str(); };
-  { auto _e = buffers(); if (_e) { _o->buffers.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->buffers[_i] = std::unique_ptr<BufferT>(_e->Get(_i)->UnPack(_resolver)); } } };
+  {
+    auto _e = version();
+    _o->version = _e;
+  };
+  {
+    auto _e = operator_codes();
+    if (_e) {
+      _o->operator_codes.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->operator_codes[_i] =
+            std::unique_ptr<OperatorCodeT>(_e->Get(_i)->UnPack(_resolver));
+      }
+    }
+  };
+  {
+    auto _e = subgraphs();
+    if (_e) {
+      _o->subgraphs.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->subgraphs[_i] =
+            std::unique_ptr<SubGraphT>(_e->Get(_i)->UnPack(_resolver));
+      }
+    }
+  };
+  {
+    auto _e = description();
+    if (_e) _o->description = _e->str();
+  };
+  {
+    auto _e = buffers();
+    if (_e) {
+      _o->buffers.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->buffers[_i] =
+            std::unique_ptr<BufferT>(_e->Get(_i)->UnPack(_resolver));
+      }
+    }
+  };
 }
 
-inline flatbuffers::Offset<Model> Model::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ModelT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<Model> Model::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const ModelT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   return CreateModel(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<Model> CreateModel(flatbuffers::FlatBufferBuilder &_fbb, const ModelT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<Model> CreateModel(
+    flatbuffers::FlatBufferBuilder &_fbb, const ModelT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const ModelT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const ModelT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
   auto _version = _o->version;
-  auto _operator_codes = _o->operator_codes.size() ? _fbb.CreateVector<flatbuffers::Offset<OperatorCode>> (_o->operator_codes.size(), [](size_t i, _VectorArgs *__va) { return CreateOperatorCode(*__va->__fbb, __va->__o->operator_codes[i].get(), __va->__rehasher); }, &_va ) : 0;
-  auto _subgraphs = _o->subgraphs.size() ? _fbb.CreateVector<flatbuffers::Offset<SubGraph>> (_o->subgraphs.size(), [](size_t i, _VectorArgs *__va) { return CreateSubGraph(*__va->__fbb, __va->__o->subgraphs[i].get(), __va->__rehasher); }, &_va ) : 0;
-  auto _description = _o->description.empty() ? 0 : _fbb.CreateString(_o->description);
-  auto _buffers = _o->buffers.size() ? _fbb.CreateVector<flatbuffers::Offset<Buffer>> (_o->buffers.size(), [](size_t i, _VectorArgs *__va) { return CreateBuffer(*__va->__fbb, __va->__o->buffers[i].get(), __va->__rehasher); }, &_va ) : 0;
-  return tflite::CreateModel(
-      _fbb,
-      _version,
-      _operator_codes,
-      _subgraphs,
-      _description,
-      _buffers);
-}
-
-inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type) {
+  auto _operator_codes =
+      _o->operator_codes.size()
+          ? _fbb.CreateVector<flatbuffers::Offset<OperatorCode>>(
+                _o->operator_codes.size(),
+                [](size_t i, _VectorArgs *__va) {
+                  return CreateOperatorCode(*__va->__fbb,
+                                            __va->__o->operator_codes[i].get(),
+                                            __va->__rehasher);
+                },
+                &_va)
+          : 0;
+  auto _subgraphs = _o->subgraphs.size()
+                        ? _fbb.CreateVector<flatbuffers::Offset<SubGraph>>(
+                              _o->subgraphs.size(),
+                              [](size_t i, _VectorArgs *__va) {
+                                return CreateSubGraph(
+                                    *__va->__fbb, __va->__o->subgraphs[i].get(),
+                                    __va->__rehasher);
+                              },
+                              &_va)
+                        : 0;
+  auto _description =
+      _o->description.empty() ? 0 : _fbb.CreateString(_o->description);
+  auto _buffers =
+      _o->buffers.size()
+          ? _fbb.CreateVector<flatbuffers::Offset<Buffer>>(
+                _o->buffers.size(),
+                [](size_t i, _VectorArgs *__va) {
+                  return CreateBuffer(*__va->__fbb, __va->__o->buffers[i].get(),
+                                      __va->__rehasher);
+                },
+                &_va)
+          : 0;
+  return tflite::CreateModel(_fbb, _version, _operator_codes, _subgraphs,
+                             _description, _buffers);
+}
+
+inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier,
+                                 const void *obj, BuiltinOptions type) {
   switch (type) {
     case BuiltinOptions_NONE: {
       return true;
@@ -4048,7 +6636,8 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       return verifier.VerifyTable(ptr);
     }
     case BuiltinOptions_LocalResponseNormalizationOptions: {
-      auto ptr = reinterpret_cast<const LocalResponseNormalizationOptions *>(obj);
+      auto ptr =
+          reinterpret_cast<const LocalResponseNormalizationOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
     case BuiltinOptions_LSTMOptions: {
@@ -4083,22 +6672,77 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const MulOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
-    default: return false;
+    case BuiltinOptions_PadOptions: {
+      auto ptr = reinterpret_cast<const PadOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_GatherOptions: {
+      auto ptr = reinterpret_cast<const GatherOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_BatchToSpaceNDOptions: {
+      auto ptr = reinterpret_cast<const BatchToSpaceNDOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_SpaceToBatchNDOptions: {
+      auto ptr = reinterpret_cast<const SpaceToBatchNDOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_TransposeOptions: {
+      auto ptr = reinterpret_cast<const TransposeOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_MeanOptions: {
+      auto ptr = reinterpret_cast<const MeanOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_SubOptions: {
+      auto ptr = reinterpret_cast<const SubOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_DivOptions: {
+      auto ptr = reinterpret_cast<const DivOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_SqueezeOptions: {
+      auto ptr = reinterpret_cast<const SqueezeOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_SequenceRNNOptions: {
+      auto ptr = reinterpret_cast<const SequenceRNNOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_StridedSliceOptions: {
+      auto ptr = reinterpret_cast<const StridedSliceOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_ExpOptions: {
+      auto ptr = reinterpret_cast<const ExpOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    default:
+      return false;
   }
 }
 
-inline bool VerifyBuiltinOptionsVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector<flatbuffers::Offset<void>> *values, const flatbuffers::Vector<uint8_t> *types) {
+inline bool VerifyBuiltinOptionsVector(
+    flatbuffers::Verifier &verifier,
+    const flatbuffers::Vector<flatbuffers::Offset<void>> *values,
+    const flatbuffers::Vector<uint8_t> *types) {
+  if (!values || !types) return !values && !types;
   if (values->size() != types->size()) return false;
   for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) {
-    if (!VerifyBuiltinOptions(
-        verifier,  values->Get(i), types->GetEnum<BuiltinOptions>(i))) {
+    if (!VerifyBuiltinOptions(verifier, values->Get(i),
+                              types->GetEnum<BuiltinOptions>(i))) {
       return false;
     }
   }
   return true;
 }
 
-inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, const flatbuffers::resolver_function_t *resolver) {
+inline void *BuiltinOptionsUnion::UnPack(
+    const void *obj, BuiltinOptions type,
+    const flatbuffers::resolver_function_t *resolver) {
   switch (type) {
     case BuiltinOptions_Conv2DOptions: {
       auto ptr = reinterpret_cast<const Conv2DOptions *>(obj);
@@ -4149,7 +6793,8 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       return ptr->UnPack(resolver);
     }
     case BuiltinOptions_LocalResponseNormalizationOptions: {
-      auto ptr = reinterpret_cast<const LocalResponseNormalizationOptions *>(obj);
+      auto ptr =
+          reinterpret_cast<const LocalResponseNormalizationOptions *>(obj);
       return ptr->UnPack(resolver);
     }
     case BuiltinOptions_LSTMOptions: {
@@ -4184,11 +6829,62 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const MulOptions *>(obj);
       return ptr->UnPack(resolver);
     }
-    default: return nullptr;
+    case BuiltinOptions_PadOptions: {
+      auto ptr = reinterpret_cast<const PadOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_GatherOptions: {
+      auto ptr = reinterpret_cast<const GatherOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_BatchToSpaceNDOptions: {
+      auto ptr = reinterpret_cast<const BatchToSpaceNDOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_SpaceToBatchNDOptions: {
+      auto ptr = reinterpret_cast<const SpaceToBatchNDOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_TransposeOptions: {
+      auto ptr = reinterpret_cast<const TransposeOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_MeanOptions: {
+      auto ptr = reinterpret_cast<const MeanOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_SubOptions: {
+      auto ptr = reinterpret_cast<const SubOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_DivOptions: {
+      auto ptr = reinterpret_cast<const DivOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_SqueezeOptions: {
+      auto ptr = reinterpret_cast<const SqueezeOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_SequenceRNNOptions: {
+      auto ptr = reinterpret_cast<const SequenceRNNOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_StridedSliceOptions: {
+      auto ptr = reinterpret_cast<const StridedSliceOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_ExpOptions: {
+      auto ptr = reinterpret_cast<const ExpOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    default:
+      return nullptr;
   }
 }
 
-inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBufferBuilder &_fbb, const flatbuffers::rehasher_function_t *_rehasher) const {
+inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const flatbuffers::rehasher_function_t *_rehasher) const {
   switch (type) {
     case BuiltinOptions_Conv2DOptions: {
       auto ptr = reinterpret_cast<const Conv2DOptionsT *>(value);
@@ -4239,8 +6935,10 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       return CreateL2NormOptions(_fbb, ptr, _rehasher).Union();
     }
     case BuiltinOptions_LocalResponseNormalizationOptions: {
-      auto ptr = reinterpret_cast<const LocalResponseNormalizationOptionsT *>(value);
-      return CreateLocalResponseNormalizationOptions(_fbb, ptr, _rehasher).Union();
+      auto ptr =
+          reinterpret_cast<const LocalResponseNormalizationOptionsT *>(value);
+      return CreateLocalResponseNormalizationOptions(_fbb, ptr, _rehasher)
+          .Union();
     }
     case BuiltinOptions_LSTMOptions: {
       auto ptr = reinterpret_cast<const LSTMOptionsT *>(value);
@@ -4274,26 +6972,80 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const MulOptionsT *>(value);
       return CreateMulOptions(_fbb, ptr, _rehasher).Union();
     }
-    default: return 0;
+    case BuiltinOptions_PadOptions: {
+      auto ptr = reinterpret_cast<const PadOptionsT *>(value);
+      return CreatePadOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_GatherOptions: {
+      auto ptr = reinterpret_cast<const GatherOptionsT *>(value);
+      return CreateGatherOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_BatchToSpaceNDOptions: {
+      auto ptr = reinterpret_cast<const BatchToSpaceNDOptionsT *>(value);
+      return CreateBatchToSpaceNDOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_SpaceToBatchNDOptions: {
+      auto ptr = reinterpret_cast<const SpaceToBatchNDOptionsT *>(value);
+      return CreateSpaceToBatchNDOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_TransposeOptions: {
+      auto ptr = reinterpret_cast<const TransposeOptionsT *>(value);
+      return CreateTransposeOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_MeanOptions: {
+      auto ptr = reinterpret_cast<const MeanOptionsT *>(value);
+      return CreateMeanOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_SubOptions: {
+      auto ptr = reinterpret_cast<const SubOptionsT *>(value);
+      return CreateSubOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_DivOptions: {
+      auto ptr = reinterpret_cast<const DivOptionsT *>(value);
+      return CreateDivOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_SqueezeOptions: {
+      auto ptr = reinterpret_cast<const SqueezeOptionsT *>(value);
+      return CreateSqueezeOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_SequenceRNNOptions: {
+      auto ptr = reinterpret_cast<const SequenceRNNOptionsT *>(value);
+      return CreateSequenceRNNOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_StridedSliceOptions: {
+      auto ptr = reinterpret_cast<const StridedSliceOptionsT *>(value);
+      return CreateStridedSliceOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_ExpOptions: {
+      auto ptr = reinterpret_cast<const ExpOptionsT *>(value);
+      return CreateExpOptions(_fbb, ptr, _rehasher).Union();
+    }
+    default:
+      return 0;
   }
 }
 
-inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FLATBUFFERS_NOEXCEPT : type(u.type), value(nullptr) {
+inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u)
+    FLATBUFFERS_NOEXCEPT : type(u.type),
+                           value(nullptr) {
   switch (type) {
     case BuiltinOptions_Conv2DOptions: {
       value = new Conv2DOptionsT(*reinterpret_cast<Conv2DOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_DepthwiseConv2DOptions: {
-      value = new DepthwiseConv2DOptionsT(*reinterpret_cast<DepthwiseConv2DOptionsT *>(u.value));
+      value = new DepthwiseConv2DOptionsT(
+          *reinterpret_cast<DepthwiseConv2DOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_ConcatEmbeddingsOptions: {
-      value = new ConcatEmbeddingsOptionsT(*reinterpret_cast<ConcatEmbeddingsOptionsT *>(u.value));
+      value = new ConcatEmbeddingsOptionsT(
+          *reinterpret_cast<ConcatEmbeddingsOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_LSHProjectionOptions: {
-      value = new LSHProjectionOptionsT(*reinterpret_cast<LSHProjectionOptionsT *>(u.value));
+      value = new LSHProjectionOptionsT(
+          *reinterpret_cast<LSHProjectionOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_Pool2DOptions: {
@@ -4309,15 +7061,18 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       break;
     }
     case BuiltinOptions_FullyConnectedOptions: {
-      value = new FullyConnectedOptionsT(*reinterpret_cast<FullyConnectedOptionsT *>(u.value));
+      value = new FullyConnectedOptionsT(
+          *reinterpret_cast<FullyConnectedOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_SoftmaxOptions: {
-      value = new SoftmaxOptionsT(*reinterpret_cast<SoftmaxOptionsT *>(u.value));
+      value =
+          new SoftmaxOptionsT(*reinterpret_cast<SoftmaxOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_ConcatenationOptions: {
-      value = new ConcatenationOptionsT(*reinterpret_cast<ConcatenationOptionsT *>(u.value));
+      value = new ConcatenationOptionsT(
+          *reinterpret_cast<ConcatenationOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_AddOptions: {
@@ -4329,7 +7084,8 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       break;
     }
     case BuiltinOptions_LocalResponseNormalizationOptions: {
-      value = new LocalResponseNormalizationOptionsT(*reinterpret_cast<LocalResponseNormalizationOptionsT *>(u.value));
+      value = new LocalResponseNormalizationOptionsT(
+          *reinterpret_cast<LocalResponseNormalizationOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_LSTMOptions: {
@@ -4337,7 +7093,8 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       break;
     }
     case BuiltinOptions_ResizeBilinearOptions: {
-      value = new ResizeBilinearOptionsT(*reinterpret_cast<ResizeBilinearOptionsT *>(u.value));
+      value = new ResizeBilinearOptionsT(
+          *reinterpret_cast<ResizeBilinearOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_CallOptions: {
@@ -4345,25 +7102,83 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       break;
     }
     case BuiltinOptions_ReshapeOptions: {
-      value = new ReshapeOptionsT(*reinterpret_cast<ReshapeOptionsT *>(u.value));
+      value =
+          new ReshapeOptionsT(*reinterpret_cast<ReshapeOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_SkipGramOptions: {
-      value = new SkipGramOptionsT(*reinterpret_cast<SkipGramOptionsT *>(u.value));
+      value =
+          new SkipGramOptionsT(*reinterpret_cast<SkipGramOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_SpaceToDepthOptions: {
-      value = new SpaceToDepthOptionsT(*reinterpret_cast<SpaceToDepthOptionsT *>(u.value));
+      value = new SpaceToDepthOptionsT(
+          *reinterpret_cast<SpaceToDepthOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_EmbeddingLookupSparseOptions: {
-      value = new EmbeddingLookupSparseOptionsT(*reinterpret_cast<EmbeddingLookupSparseOptionsT *>(u.value));
+      value = new EmbeddingLookupSparseOptionsT(
+          *reinterpret_cast<EmbeddingLookupSparseOptionsT *>(u.value));
       break;
     }
     case BuiltinOptions_MulOptions: {
       value = new MulOptionsT(*reinterpret_cast<MulOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_PadOptions: {
+      value = new PadOptionsT(*reinterpret_cast<PadOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_GatherOptions: {
+      value = new GatherOptionsT(*reinterpret_cast<GatherOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_BatchToSpaceNDOptions: {
+      value = new BatchToSpaceNDOptionsT(
+          *reinterpret_cast<BatchToSpaceNDOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_SpaceToBatchNDOptions: {
+      value = new SpaceToBatchNDOptionsT(
+          *reinterpret_cast<SpaceToBatchNDOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_TransposeOptions: {
+      value = new TransposeOptionsT(
+          *reinterpret_cast<TransposeOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_MeanOptions: {
+      value = new MeanOptionsT(*reinterpret_cast<MeanOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_SubOptions: {
+      value = new SubOptionsT(*reinterpret_cast<SubOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_DivOptions: {
+      value = new DivOptionsT(*reinterpret_cast<DivOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_SqueezeOptions: {
+      value =
+          new SqueezeOptionsT(*reinterpret_cast<SqueezeOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_SequenceRNNOptions: {
+      value = new SequenceRNNOptionsT(
+          *reinterpret_cast<SequenceRNNOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_StridedSliceOptions: {
+      value = new StridedSliceOptionsT(
+          *reinterpret_cast<StridedSliceOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_ExpOptions: {
+      value = new ExpOptionsT(*reinterpret_cast<ExpOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -4476,7 +7291,68 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
-    default: break;
+    case BuiltinOptions_PadOptions: {
+      auto ptr = reinterpret_cast<PadOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_GatherOptions: {
+      auto ptr = reinterpret_cast<GatherOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_BatchToSpaceNDOptions: {
+      auto ptr = reinterpret_cast<BatchToSpaceNDOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_SpaceToBatchNDOptions: {
+      auto ptr = reinterpret_cast<SpaceToBatchNDOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_TransposeOptions: {
+      auto ptr = reinterpret_cast<TransposeOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_MeanOptions: {
+      auto ptr = reinterpret_cast<MeanOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_SubOptions: {
+      auto ptr = reinterpret_cast<SubOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_DivOptions: {
+      auto ptr = reinterpret_cast<DivOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_SqueezeOptions: {
+      auto ptr = reinterpret_cast<SqueezeOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_SequenceRNNOptions: {
+      auto ptr = reinterpret_cast<SequenceRNNOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_StridedSliceOptions: {
+      auto ptr = reinterpret_cast<StridedSliceOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_ExpOptions: {
+      auto ptr = reinterpret_cast<ExpOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    default:
+      break;
   }
   value = nullptr;
   type = BuiltinOptions_NONE;
@@ -4486,33 +7362,25 @@ inline const tflite::Model *GetModel(const void *buf) {
   return flatbuffers::GetRoot<tflite::Model>(buf);
 }
 
-inline const char *ModelIdentifier() {
-  return "TFL3";
-}
+inline const char *ModelIdentifier() { return "TFL3"; }
 
 inline bool ModelBufferHasIdentifier(const void *buf) {
-  return flatbuffers::BufferHasIdentifier(
-      buf, ModelIdentifier());
+  return flatbuffers::BufferHasIdentifier(buf, ModelIdentifier());
 }
 
-inline bool VerifyModelBuffer(
-    flatbuffers::Verifier &verifier) {
+inline bool VerifyModelBuffer(flatbuffers::Verifier &verifier) {
   return verifier.VerifyBuffer<tflite::Model>(ModelIdentifier());
 }
 
-inline const char *ModelExtension() {
-  return "tflite";
-}
+inline const char *ModelExtension() { return "tflite"; }
 
-inline void FinishModelBuffer(
-    flatbuffers::FlatBufferBuilder &fbb,
-    flatbuffers::Offset<tflite::Model> root) {
+inline void FinishModelBuffer(flatbuffers::FlatBufferBuilder &fbb,
+                              flatbuffers::Offset<tflite::Model> root) {
   fbb.Finish(root, ModelIdentifier());
 }
 
 inline std::unique_ptr<ModelT> UnPackModel(
-    const void *buf,
-    const flatbuffers::resolver_function_t *res = nullptr) {
+    const void *buf, const flatbuffers::resolver_function_t *res = nullptr) {
   return std::unique_ptr<ModelT>(GetModel(buf)->UnPack(res));
 }
 
diff --git a/tensorflow/contrib/lite/simple_memory_arena.h b/tensorflow/contrib/lite/simple_memory_arena.h
index 0d0b7f9ff79bf9fd8a60dbc057d63f44eeaa6396..0535522374c63459d029c252ebe94628cf3122d5 100644
--- a/tensorflow/contrib/lite/simple_memory_arena.h
+++ b/tensorflow/contrib/lite/simple_memory_arena.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_SIMPLE_MEMORY_ARENA_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_SIMPLE_MEMORY_ARENA_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_SIMPLE_MEMORY_ARENA_H_
+#define TENSORFLOW_CONTRIB_LITE_SIMPLE_MEMORY_ARENA_H_
 
 #include <list>
 #include <memory>
@@ -36,9 +36,9 @@ struct ArenaAlloc {
   }
 };
 
-// This small class is responsible for allocating, dealocating and reusing
+// This small class is responsible for allocating, deallocating and reusing
 // dynamic memory from a common underlying buffer. The arena can be used in
-// scenarios when the pattern of memory allocations and dealocations is
+// scenarios when the pattern of memory allocations and deallocations is
 // repetitive, e.g. running NN inference in multiple iterations.
 class SimpleMemoryArena {
  public:
@@ -68,6 +68,10 @@ class SimpleMemoryArena {
 
   TfLiteStatus Clear();
 
+  int64_t BasePointer() const {
+    return reinterpret_cast<int64_t>(underlying_buffer_aligned_ptr_);
+  }
+
  private:
   bool commited_;
   size_t arena_alignment_;
@@ -81,4 +85,4 @@ class SimpleMemoryArena {
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_SIMPLE_MEMORY_ARENA_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_SIMPLE_MEMORY_ARENA_H_
diff --git a/tensorflow/contrib/lite/simple_memory_arena_test.cc b/tensorflow/contrib/lite/simple_memory_arena_test.cc
index ac676092c6d5d8982b65cd35c2b9770d10ea37b2..4444f642eb75c563c57762d095e454ac63d836c6 100644
--- a/tensorflow/contrib/lite/simple_memory_arena_test.cc
+++ b/tensorflow/contrib/lite/simple_memory_arena_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/testing/util.h"
 
 namespace tflite {
 namespace {
@@ -85,7 +86,7 @@ TEST(SimpleMemoryArenaTest, TestAfterClear) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/string_util.h b/tensorflow/contrib/lite/string_util.h
index 12872d11232e2a32527d660be8acce3e09f00125..c35a2fff3c23b17515323b65d08df6f6da288834 100644
--- a/tensorflow/contrib/lite/string_util.h
+++ b/tensorflow/contrib/lite/string_util.h
@@ -37,8 +37,8 @@ limitations under the License.
 //   # described above.
 //   buf.WriteToTensor(tensor)
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_STRING_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_STRING_UTIL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_STRING_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_STRING_UTIL_H_
 
 #include <vector>
 
@@ -88,4 +88,4 @@ int GetStringCount(const TfLiteTensor* tensor);
 StringRef GetString(const TfLiteTensor* tensor, int string_index);
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_STRING_UTIL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_STRING_UTIL_H_
diff --git a/tensorflow/contrib/lite/string_util_test.cc b/tensorflow/contrib/lite/string_util_test.cc
index 5c351638dc2fad0e64fda6d3a9cb14dfc45375af..d53fec7512f902fb277524100640f4a6a2aaf130 100644
--- a/tensorflow/contrib/lite/string_util_test.cc
+++ b/tensorflow/contrib/lite/string_util_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/testing/util.h"
 
 namespace tflite {
 
@@ -111,7 +112,7 @@ TEST(StringUtil, TestEmptyList) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/testdata/multi_add.pb b/tensorflow/contrib/lite/testdata/multi_add.pb
new file mode 100644
index 0000000000000000000000000000000000000000..e95a20841fb2b320bd77994d9dda157d79311dd6
--- /dev/null
+++ b/tensorflow/contrib/lite/testdata/multi_add.pb
@@ -0,0 +1,26 @@
+
+I
+aPlaceholder"/device:CPU:0*
+shape:*
+dtype0
+I
+bPlaceholder"/device:CPU:0*
+dtype0*
+shape:
+I
+cPlaceholder"/device:CPU:0*
+dtype0*
+shape:
+I
+dPlaceholder"/device:CPU:0*
+dtype0*
+shape:
+&
+iAddbc"/device:CPU:0*
+T0
+&
+xAddai"/device:CPU:0*
+T0
+&
+yAdddi"/device:CPU:0*
+T0"
\ No newline at end of file
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index ecddb4b807bf1dddec10adfcbab6db6cca85247a..8739ffb34c8378ac1be3c4a25c1db9f70cde38dc 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -18,19 +18,25 @@ gen_zipped_test_files(
     files = [
         "add.zip",
         "avg_pool.zip",
+        "batch_to_space_nd.zip",
         "concat.zip",
         "constant.zip",
         "control_dep.zip",
         "conv.zip",
         "depthwiseconv.zip",
+        "div.zip",
+        "exp.zip",
         "fully_connected.zip",
         "fused_batch_norm.zip",
+        "gather.zip",
         "global_batch_norm.zip",
         "l2_pool.zip",
         "l2norm.zip",
         "local_response_norm.zip",
         "max_pool.zip",
+        "mean.zip",
         "mul.zip",
+        "pad.zip",
         "relu.zip",
         "relu1.zip",
         "relu6.zip",
@@ -38,7 +44,12 @@ gen_zipped_test_files(
         "resize_bilinear.zip",
         "sigmoid.zip",
         "softmax.zip",
+        "space_to_batch_nd.zip",
         "space_to_depth.zip",
+        "squeeze.zip",
+        "strided_slice.zip",
+        "sub.zip",
+        "transpose.zip",
     ],
 )
 
@@ -160,6 +171,12 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "util",
+    testonly = 1,
+    hdrs = ["util.h"],
+)
+
 cc_test(
     name = "test_runner_test",
     srcs = ["test_runner_test.cc"],
@@ -174,31 +191,70 @@ cc_binary(
     srcs = ["nnapi_example.cc"],
     deps = [
         ":parse_testdata_lib",
-        "//tensorflow/contrib/lite:builtin_op_data",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        ":tflite_driver",
         "//tensorflow/contrib/lite/nnapi:nnapi_lib",
     ],
 )
 
+cc_library(
+    name = "tf_driver",
+    srcs = ["tf_driver.cc"],
+    hdrs = ["tf_driver.h"],
+    deps = [
+        ":split",
+        ":test_runner",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+    ],
+)
+
+cc_test(
+    name = "tf_driver_test",
+    size = "small",
+    srcs = ["tf_driver_test.cc"],
+    data = ["//tensorflow/contrib/lite:testdata/multi_add.pb"],
+    deps = [
+        ":tf_driver",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 tf_cc_test(
     name = "generated_examples_zip_test",
-    size = "medium",
+    size = "large",
     srcs = ["generated_examples_zip_test.cc"],
+    args = [
+        "--zip_files_dir=tensorflow/contrib/lite/testing/optest",
+        # TODO(angerson) We may be able to add an external unzip binary instead
+        # of relying on an existing one for OSS builds.
+        "--unzip_binary_path=/usr/bin/unzip",
+    ],
     data = [":optest"],
-    shard_count = 10,
+    shard_count = 20,
     tags = ["no_oss"],
     deps = [
         ":parse_testdata_lib",
+        ":tflite_driver",
+        ":util",
+        "@com_google_googletest//:gtest",
+        "@com_googlesource_code_re2//:re2",
         "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-        "@com_google_googletest//:gtest",
-        "@com_googlesource_code_re2//:re2",
-    ],
+    ] + select({
+        "//conditions:default": [
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+            "//tensorflow/core:test",
+        ],
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib",
+            "//tensorflow/core:android_tensorflow_test_lib",
+        ],
+    }),
 )
 
 filegroup(
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 5bca82ded038ded702effd46c0f4247e45a36524..67621d9722e9067439afb9bc324f521126d13099 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -94,6 +94,13 @@ KNOWN_BUGS = {
     r"softmax.*input_shape=\[1,3,4,3\]": "67749831",
     # SpaceToDepth only supports float32.
     r"space_to_depth.*(float16|int32|uint8|int64)": "68018134",
+    # BatchToSpaceND doesn't support cropping. This catches test cases with
+    # const tensors as crops.
+    r"batch_to_space_nd.*crops=\[\[1,1\],\[1,1\]\]": "70594634",
+    # BatchToSpaceND only supports 4D tensors.
+    r"batch_to_space_nd.*input_shape=\[8,2,2,2,1,1\]": "70594733",
+    # Div will use floordiv
+    r"div.*int32": "72051395"
 }
 
 
@@ -120,7 +127,7 @@ def toco_options(data_types,
   # to change
   if data_types[0] == "QUANTIZED_UINT8":
     inference_type = "QUANTIZED_UINT8"
-  s = (" --input_types=%s" % ",".join(data_types) +
+  s = (" --input_data_types=%s" % ",".join(data_types) +
        " --inference_type=%s" % inference_type +
        " --input_format=TENSORFLOW_GRAPHDEF" + " --output_format=TFLITE" +
        " --input_arrays=%s" % ",".join(input_arrays) +
@@ -234,7 +241,7 @@ def create_tensor_data(dtype, shape, min_value=-100, max_value=100):
   if dtype in (tf.float32, tf.float16):
     value = (max_value-min_value)*np.random.random_sample(shape)+min_value
   elif dtype in (tf.int32, tf.uint8, tf.int64):
-    value = np.random.random_integers(min_value, max_value, shape)
+    value = np.random.randint(min_value, max_value+1, shape)
   return value.astype(dtype)
 
 
@@ -612,7 +619,7 @@ def make_constant_tests(zip_path):
 
   def build_graph(parameters):
     # Since Toco & Tflite can't have a single constant op in the entire graph,
-    # this test adds a zero tesnor with a constant op tensor.
+    # this test adds a zero tensor with a constant op tensor.
     input1 = tf.placeholder(dtype=parameters["dtype"], name="input1",
                             shape=parameters["input_shape"])
     out = tf.ones(parameters["input_shape"], dtype=parameters["dtype"]) + input1
@@ -626,7 +633,7 @@ def make_constant_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
-def make_add_tests(zip_path):
+def make_binary_op_tests(zip_path, binary_operator):
   """Make a set of tests to do add with and without broadcast."""
 
   # These parameters are split because we don't support broadcasting.
@@ -634,25 +641,36 @@ def make_add_tests(zip_path):
       "dtype": [tf.float32, tf.int32],
       "input_shape_1": [[1, 3, 4, 3]],
       "input_shape_2": [[1, 3, 4, 3]],
+      "activation": [True]
   }, {
       "dtype": [tf.float32],
       "input_shape_1": [[5]],
       "input_shape_2": [[5]],
+      "activation": [False, True]
   }, {
       "dtype": [tf.float32],
       "input_shape_1": [[1, 3, 4, 3]],
       "input_shape_2": [[3]],
+      "activation": [True]
   }]
 
   def build_graph(parameters):
-    input1 = tf.placeholder(dtype=parameters["dtype"], name="input1",
-                            shape=parameters["input_shape_1"])
-    input2 = tf.placeholder(dtype=parameters["dtype"], name="input2",
-                            shape=parameters["input_shape_2"])
-    out = tf.add(input1, input2)
+    """Builds the graph given the current parameters."""
+    input1 = tf.placeholder(
+        dtype=parameters["dtype"],
+        name="input1",
+        shape=parameters["input_shape_1"])
+    input2 = tf.placeholder(
+        dtype=parameters["dtype"],
+        name="input2",
+        shape=parameters["input_shape_2"])
+    out = binary_operator(input1, input2)
+    if parameters["activation"]:
+      out = tf.nn.relu(out)
     return [input1, input2], [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
+    """Builds operand inputs for op."""
     input1 = create_tensor_data(parameters["dtype"],
                                 parameters["input_shape_1"])
     input2 = create_tensor_data(parameters["dtype"],
@@ -666,40 +684,135 @@ def make_add_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
-def make_mul_tests(zip_path):
-  """Make a set of tests to do mul with and without broadcast."""
+def make_mean_tests(zip_path):
+  """Make a set of tests to do mean."""
 
-  # These parameters are split because we don't support broadcasting.
   test_parameters = [{
-      "dtype": [tf.float32, tf.int32],
-      "input_shape_1": [[1, 3, 4, 3]],
-      "input_shape_2": [[1, 3, 4, 3]],
+      "input_dtype": [tf.float32, tf.int32, tf.int64],
+      "input_shape": [[3, 2, 4]],
+      "axis": [
+          None, 0, 1, 2, [0, 1], [0, 2], [1, 2], [0, 1, 2], [1, 0], [2, 0],
+          [2, 1], [2, 1, 0], [2, 0, 1], -1, -2, -3, [1, -1], [0, -1], [-1, 0],
+          [-1, -2, -3], [0, 0, 0], [2, 2, 0], [1, 0, -3, -3]
+      ],
+      "const_axis": [True, False],
+      "keep_dims": [True, False],
   }, {
-      "dtype": [tf.float32],
-      "input_shape_1": [[5]],
-      "input_shape_2": [[5]],
-  }, {
-      "dtype": [tf.float32],
-      "input_shape_1": [[1, 3, 4, 3]],
-      "input_shape_2": [[3]],
+      "input_dtype": [tf.float32, tf.int32, tf.int64],
+      "input_shape": [[1, 224, 224, 3]],
+      "axis": [
+          None, 0, 1, 2, 3, [1, 2], [0, 3], [1, 2, 3], [0, 1, 2, 3],
+          [3, 2, 1, 0], [3, 1, 0, 2], [2, 0], [3, 0], [3, 1], [1, 0], -1, -2,
+          -3, -4, [0, -2], [2, 3, -1, 0], [3, 1, 2, -3], [3, -4], [2, 2, 2],
+          [2, 2, 3], [-3, -3, -4], [-3, 2, 1]
+      ],
+      "const_axis": [True, False],
+      "keep_dims": [True, False],
   }]
 
   def build_graph(parameters):
-    input1 = tf.placeholder(dtype=parameters["dtype"], name="input1",
-                            shape=parameters["input_shape_1"])
-    input2 = tf.placeholder(dtype=parameters["dtype"], name="input2",
-                            shape=parameters["input_shape_2"])
-    out = tf.multiply(input1, input2)
-    return [input1, input2], [out]
+    """Build the mean op testing graph."""
+    input_tensor = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+
+    # Get axis as either a placeholder or constants.
+    if parameters["const_axis"]:
+      axis = parameters["axis"]
+      input_tensors = [input_tensor]
+    else:
+      if isinstance(parameters["axis"], list):
+        shape = [len(parameters["axis"])]
+      else:
+        shape = [0]  # shape for None or integers.
+      axis = tf.placeholder(dtype=tf.int32, name="axis", shape=shape)
+      input_tensors = [input_tensor, axis]
+
+    out = tf.reduce_mean(
+        input_tensor, axis=axis, keep_dims=parameters["keep_dims"])
+    return input_tensors, [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
-    input1 = create_tensor_data(parameters["dtype"],
-                                parameters["input_shape_1"])
-    input2 = create_tensor_data(parameters["dtype"],
-                                parameters["input_shape_2"])
-    return [input1, input2], sess.run(
-        outputs, feed_dict={inputs[0]: input1,
-                            inputs[1]: input2})
+    values = [
+        create_tensor_data(parameters["input_dtype"], parameters["input_shape"])
+    ]
+    if not parameters["const_axis"]:
+      if parameters["axis"]:
+        values.append(np.array(parameters["axis"]))
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_exp_tests(zip_path):
+  """Make a set of tests to do exp."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32],
+      "input_shape": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+  }]
+
+  def build_graph(parameters):
+    """Build the exp op testing graph."""
+    input_tensor = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+
+    out = tf.exp(input_tensor)
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    values = [
+        create_tensor_data(parameters["input_dtype"], parameters["input_shape"])
+    ]
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_binary_op_tests_func(binary_operator):
+  """Return a function that does a test on a binary operator."""
+  return lambda zip_path: make_binary_op_tests(zip_path, binary_operator)
+
+
+def make_gather_tests(zip_path):
+  """Make a set of tests to do gather."""
+
+  test_parameters = [{
+      # TODO(mgubin): add string tests when they are supported by Toco.
+      # TODO(mgubin): add tests for Nd indices when they are supported by
+      # TfLite.
+      # TODO(mgubin): add tests for axis != 0 when it is supported by TfLite.
+      "params_dtype": [tf.float32, tf.int32],
+      "params_shape": [[10], [1, 2, 20]],
+      "indices_dtype": [tf.int32],
+      "indices_shape": [[3], [5]],
+      "axis": [0],  # axis!=0 is GatherV2
+  }]
+
+  def build_graph(parameters):
+    """Build the gather op testing graph."""
+    params = tf.placeholder(
+        dtype=parameters["params_dtype"],
+        name="params",
+        shape=parameters["params_shape"])
+    indices = tf.placeholder(
+        dtype=parameters["indices_dtype"],
+        name="indices",
+        shape=parameters["indices_shape"])
+    out = tf.gather(params, indices, axis=parameters["axis"])
+    return [params, indices], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    params = create_tensor_data(parameters["params_dtype"],
+                                parameters["params_shape"])
+    indices = create_tensor_data(parameters["indices_dtype"],
+                                 parameters["indices_shape"], 0,
+                                 parameters["params_shape"][0] - 1)
+    return [params, indices], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [params, indices])))
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
@@ -784,34 +897,55 @@ def make_fused_batch_norm_tests(zip_path):
 def make_conv_tests(zip_path):
   """Make a set of tests to do convolution."""
 
-  test_parameters = [{
-      "input_shape": [[1, 3, 4, 3]],
-      "filter_shape": [[1, 1, 3, 2]],
-      "strides": [[1, 1, 1, 1], [1, 2, 3, 1]],
-      "padding": ["SAME", "VALID"],
-      "data_format": ["NHWC"],  # TODO(aselle): NCHW  would be good
-  }, {
-      "input_shape": [[2, 14, 14, 2]],
-      "filter_shape": [[6, 6, 2, 2]],
-      "strides": [[1, 1, 1, 1], [1, 2, 3, 1]],
-      "padding": ["SAME", "VALID"],
-      "data_format": ["NHWC"],  # TODO(aselle): NCHW  would be good
-  }]
+  test_parameters = [
+      {
+          "input_shape": [[1, 3, 4, 3]],
+          "filter_shape": [[1, 1, 3, 2]],
+          "strides": [[1, 1, 1, 1], [1, 2, 3, 1]],
+          "padding": ["SAME", "VALID"],
+          "data_format": ["NHWC"],  # TODO(aselle): NCHW  would be good
+          "constant_filter": [True, False],
+      },
+      {
+          "input_shape": [[2, 14, 14, 2]],
+          "filter_shape": [[6, 6, 2, 2]],
+          "strides": [[1, 1, 1, 1], [1, 2, 3, 1]],
+          "padding": ["SAME", "VALID"],
+          "data_format": ["NHWC"],  # TODO(aselle): NCHW  would be good
+          "constant_filter": [True, False],
+      }
+  ]
 
   def build_graph(parameters):
+    """Build a conv graph given `parameters`."""
     input_tensor = tf.placeholder(
         dtype=tf.float32, name="input", shape=parameters["input_shape"])
-    filter_values = create_tensor_data(np.float32, parameters["filter_shape"])
-    out = tf.nn.conv2d(input_tensor, filter_values,
-                       strides=parameters["strides"],
-                       padding=parameters["padding"],
-                       data_format=parameters["data_format"])
-    return [input_tensor], [out]
+
+    # Get filter input either as a placeholder or constants. Also get a list of
+    # the input tensors that are represented as placeholders.
+    if parameters["constant_filter"]:
+      filter_input = create_tensor_data(np.float32, parameters["filter_shape"])
+      input_tensors = [input_tensor]
+    else:
+      filter_input = tf.placeholder(
+          dtype=tf.float32, name="filter", shape=parameters["filter_shape"])
+      input_tensors = [input_tensor, filter_input]
+
+    out = tf.nn.conv2d(
+        input_tensor,
+        filter_input,
+        strides=parameters["strides"],
+        padding=parameters["padding"],
+        data_format=parameters["data_format"])
+    return input_tensors, [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
-    input_values = create_tensor_data(np.float32, parameters["input_shape"])
-    return [input_values], sess.run(
-        outputs, feed_dict=dict(zip(inputs, [input_values])))
+    # Build list of input values either containing 1 tensor (input) or 2 tensors
+    # (input, filter) based on whether filter is constant or variable input.
+    values = [create_tensor_data(np.float32, parameters["input_shape"])]
+    if not parameters["constant_filter"]:
+      values.append(create_tensor_data(np.float32, parameters["filter_shape"]))
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
@@ -820,45 +954,70 @@ def make_depthwiseconv_tests(zip_path):
   """Make a set of tests to do convolution."""
 
   # Tensorflow only supports equal strides
-  test_parameters = [{
-      "input_shape": [[1, 3, 4, 3], [1, 10, 10, 3]],
-      "filter_size": [[1, 1], [1, 2], [3, 3]],
-      "strides": [[1, 1, 1, 1], [1, 3, 3, 1]],
-      "channel_multiplier": [1, 2],
-      "rate": [[1, 1]],
-      "padding": ["SAME", "VALID"],
-      "data_format": ["NHWC"],
-  }, {
-      "input_shape": [[1, 3, 4, 3]],
-      "filter_size": [[1, 1]],
-      "strides": [[1, 1, 2, 1]],  # TF needs [1, x, x, 1]
-      "channel_multiplier": [2],
-      "rate": [[2, 2]],   #  Only [1, 1] is supported
-      "padding": ["SAME"],
-      "data_format": ["NHWC"],
-  }]
+  test_parameters = [
+      {
+          "input_shape": [[1, 3, 4, 3], [1, 10, 10, 3]],
+          "filter_size": [[1, 1], [1, 2], [3, 3]],
+          "strides": [[1, 1, 1, 1], [1, 3, 3, 1]],
+          "channel_multiplier": [1, 2],
+          "rate": [[1, 1]],
+          "padding": ["SAME", "VALID"],
+          "data_format": ["NHWC"],
+          "constant_filter": [True, False],
+      },
+      {
+          "input_shape": [[1, 3, 4, 3]],
+          "filter_size": [[1, 1]],
+          "strides": [[1, 1, 2, 1]],  # TF needs [1, x, x, 1]
+          "channel_multiplier": [2],
+          "rate": [[2, 2]],  #  Only [1, 1] is supported
+          "padding": ["SAME"],
+          "data_format": ["NHWC"],
+          "constant_filter": [True, False],
+      }
+  ]
+
+  def get_tensor_shapes(parameters):
+    input_shape = parameters["input_shape"]
+    filter_size = parameters["filter_size"]
+    filter_shape = filter_size + [
+        input_shape[3], parameters["channel_multiplier"]
+    ]
+    return [input_shape, filter_shape]
 
   def build_graph(parameters):
     """Build a depthwise conv graph given `parameters`."""
-    input_shape = parameters["input_shape"]
-    filter_size = parameters["filter_size"]
+    input_shape, filter_shape = get_tensor_shapes(parameters)
     input_tensor = tf.placeholder(
         dtype=tf.float32, name="input", shape=input_shape)
-    filter_shape = filter_size + [
-        input_shape[3], parameters["channel_multiplier"]]
-    filter_values = create_tensor_data(np.float32, filter_shape)
+
+    # Get filter input either as a placeholder or constants. Also get a list of
+    # the input tensors that are represented as placeholders.
+    if parameters["constant_filter"]:
+      filter_input = create_tensor_data(np.float32, filter_shape)
+      input_tensors = [input_tensor]
+    else:
+      filter_input = tf.placeholder(
+          dtype=tf.float32, name="filter", shape=filter_shape)
+      input_tensors = [input_tensor, filter_input]
+
     out = tf.nn.depthwise_conv2d(
-        input_tensor, filter_values,
+        input_tensor,
+        filter_input,
         strides=parameters["strides"],
         rate=parameters["rate"],
         padding=parameters["padding"],
         data_format=parameters["data_format"])
-    return [input_tensor], [out]
+    return input_tensors, [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
-    input_values = create_tensor_data(np.float32, parameters["input_shape"])
-    return [input_values], sess.run(
-        outputs, feed_dict=dict(zip(inputs, [input_values])))
+    # Build list of input values either containing 1 tensor (input) or 2 tensors
+    # (input, filter) based on whether filter is constant or variable input.
+    input_shape, filter_shape = get_tensor_shapes(parameters)
+    values = [create_tensor_data(np.float32, input_shape)]
+    if not parameters["constant_filter"]:
+      values.append(create_tensor_data(np.float32, filter_shape))
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
@@ -869,13 +1028,15 @@ def make_concatenation_tests(zip_path):
   test_parameters = [{
       "base_shape": [[1, 3, 4, 3], [3, 4]],
       "num_tensors": [1, 2, 3, 4, 5, 6],
-      "axis": [0, 1, 2, 3],
+      "axis": [0, 1, 2, 3, -3, -2, -1],
   }]
 
   def get_shape(parameters, delta):
     """Return a tweaked version of 'base_shape'."""
     axis = parameters["axis"]
     shape = parameters["base_shape"][:]
+    if axis < 0:
+      axis += len(shape)
     if axis < len(shape):
       shape[axis] += delta
     return shape
@@ -909,32 +1070,49 @@ def make_fully_connected_tests(zip_path):
       "shape2": [[3, 3]],
       "transpose_a": [True, False],
       "transpose_b": [True, False],
+      "constant_filter": [True, False],
   }, {
       "shape1": [[4, 4], [1, 4], [4]],
       "shape2": [[4, 4], [4, 1], [4]],
       "transpose_a": [False],
       "transpose_b": [False],
+      "constant_filter": [True, False],
   }, {
       "shape1": [[40, 37]],
       "shape2": [[37, 40]],
       "transpose_a": [False],
       "transpose_b": [False],
-
+      "constant_filter": [True, False],
   }]
 
   def build_graph(parameters):
+    """Build a matmul graph given `parameters`."""
     input_tensor1 = tf.placeholder(dtype=tf.float32, name="input1",
                                    shape=parameters["shape1"])
-    input_tensor2 = create_tensor_data(np.float32, parameters["shape2"])
+
+    # Get input_tensor2 either as a placeholder or constants. Also get a list of
+    # the input tensors that are represented as placeholders.
+    if parameters["constant_filter"]:
+      input_tensor2 = create_tensor_data(np.float32, parameters["shape2"])
+      input_tensors = [input_tensor1]
+    else:
+      input_tensor2 = tf.placeholder(
+          dtype=tf.float32, name="input2", shape=parameters["shape2"])
+      input_tensors = [input_tensor1, input_tensor2]
+
     out = tf.matmul(input_tensor1, input_tensor2,
                     transpose_a=parameters["transpose_a"],
                     transpose_b=parameters["transpose_b"])
-    return [input_tensor1], [out]
+    return input_tensors, [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
-    input_values1 = create_tensor_data(np.float32, shape=parameters["shape1"])
-    return [input_values1], sess.run(
-        outputs, feed_dict=dict(zip(inputs, [input_values1])))
+    # Build list of input values either containing 1 tensor (input_values1) or 2
+    # tensors (input_values1, input_values2) based on whether the second input
+    # is a constant or variable input.
+    values = [create_tensor_data(np.float32, shape=parameters["shape1"])]
+    if not parameters["constant_filter"]:
+      values.append(create_tensor_data(np.float32, parameters["shape2"]))
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
@@ -999,10 +1177,61 @@ def make_local_response_norm_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_pad_tests(zip_path):
+  """Make a set of tests to do pad."""
+
+  # TODO(nupurgarg): Add test for tf.uint8.
+  test_parameters = [
+      {
+          "dtype": [tf.int32, tf.int64, tf.float32],
+          "input_shape": [[1, 1, 2, 1], [2, 1, 1, 1]],
+          "paddings": [[[0, 0], [0, 1], [2, 3], [0, 0]], [[0, 1], [0, 0],
+                                                          [0, 0], [2, 3]]],
+          "constant_paddings": [True, False],
+      },
+      # Non-4D use case.
+      {
+          "dtype": [tf.int32, tf.int64, tf.float32],
+          "input_shape": [[1, 2], [0, 1, 2]],
+          "paddings": [[[0, 1], [2, 3]]],
+          "constant_paddings": [True, False],
+      },
+  ]
+
+  def build_graph(parameters):
+    """Build a pad graph given `parameters`."""
+    input_tensor = tf.placeholder(
+        dtype=parameters["dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+
+    # Get paddings as either a placeholder or constants.
+    if parameters["constant_paddings"]:
+      paddings = parameters["paddings"]
+      input_tensors = [input_tensor]
+    else:
+      shape = [len(parameters["paddings"]), 2]
+      paddings = tf.placeholder(dtype=tf.int32, name="padding", shape=shape)
+      input_tensors = [input_tensor, paddings]
+
+    out = tf.pad(input_tensor, paddings=paddings)
+    return input_tensors, [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    values = [
+        create_tensor_data(parameters["dtype"], parameters["input_shape"])
+    ]
+    if not parameters["constant_paddings"]:
+      values.append(np.array(parameters["paddings"]))
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_reshape_tests(zip_path):
   """Make a set of tests to do reshape."""
 
-  # Alll shapes below are suitable for tensors with 420 elements.
+  # All shapes below are suitable for tensors with 420 elements.
   test_parameters = [{
       "dtype": [tf.float32, tf.int32],
       "input_shape": [[3, 4, 5, 7], [4, 105], [21, 5, 2, 2], [420]],
@@ -1125,6 +1354,335 @@ def make_space_to_depth_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_space_to_batch_nd_tests(zip_path):
+  """Make a set of tests to do space_to_batch_nd."""
+
+  # TODO(nupurgarg): Add test for uint8.
+  test_parameters = [
+      {
+          "dtype": [tf.int32, tf.int64, tf.float32],
+          "input_shape": [[1, 2, 2, 3], [2, 2, 4, 1]],
+          "block_shape": [[1, 3], [2, 2]],
+          "paddings": [[[0, 0], [0, 0]], [[0, 0], [2, 0]], [[1, 1], [1, 1]]],
+          "constant_block_shape": [True, False],
+          "constant_paddings": [True, False],
+      },
+      {
+          "dtype": [tf.float32],
+          "input_shape": [[2, 3, 7, 3]],
+          "block_shape": [[1, 3], [2, 2]],
+          "paddings": [[[0, 0], [2, 0]], [[1, 0], [1, 0]]],
+          "constant_block_shape": [True, False],
+          "constant_paddings": [True, False],
+      },
+      # Non-4D use case: 1 bath dimension, 3 spatial dimensions, 2 others.
+      {
+          "dtype": [tf.float32],
+          "input_shape": [[1, 4, 4, 4, 1, 1]],
+          "block_shape": [[2, 2, 2]],
+          "paddings": [[[0, 0], [0, 0], [0, 0]]],
+          "constant_block_shape": [True, False],
+          "constant_paddings": [True, False],
+      },
+  ]
+
+  def build_graph(parameters):
+    """Build a space_to_batch graph given `parameters`."""
+    input_tensor = tf.placeholder(
+        dtype=parameters["dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+    input_tensors = [input_tensor]
+
+    # Get block_shape either as a const or as a placeholder (tensor).
+    if parameters["constant_block_shape"]:
+      block_shape = parameters["block_shape"]
+    else:
+      shape = [len(parameters["block_shape"])]
+      block_shape = tf.placeholder(dtype=tf.int32, name="shape", shape=shape)
+      input_tensors.append(block_shape)
+
+    # Get paddings either as a const or as a placeholder (tensor).
+    if parameters["constant_paddings"]:
+      paddings = parameters["paddings"]
+    else:
+      shape = [len(parameters["paddings"]), 2]
+      paddings = tf.placeholder(dtype=tf.int32, name="paddings", shape=shape)
+      input_tensors.append(paddings)
+
+    out = tf.space_to_batch_nd(input_tensor, block_shape, paddings)
+    return input_tensors, [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    values = [
+        create_tensor_data(parameters["dtype"], parameters["input_shape"])
+    ]
+    if not parameters["constant_block_shape"]:
+      values.append(np.array(parameters["block_shape"]))
+    if not parameters["constant_paddings"]:
+      values.append(np.array(parameters["paddings"]))
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_batch_to_space_nd_tests(zip_path):
+  """Make a set of tests to do batch_to_space_nd."""
+
+  test_parameters = [
+      {
+          "dtype": [tf.float32, tf.int64, tf.int32],
+          "input_shape": [[12, 2, 2, 1]],
+          "block_shape": [[1, 4], [2, 2], [3, 4]],
+          "crops": [[[0, 0], [0, 0]], [[1, 1], [1, 1]]],
+          "constant_block_shape": [True, False],
+          "constant_crops": [True, False],
+      },
+      # Non-4D use case: 1 bath dimension, 3 spatial dimensions, 2 others.
+      {
+          "dtype": [tf.float32],
+          "input_shape": [[8, 2, 2, 2, 1, 1]],
+          "block_shape": [[2, 2, 2]],
+          "crops": [[[0, 0], [0, 0], [0, 0]]],
+          "constant_block_shape": [True, False],
+          "constant_crops": [True, False],
+      },
+  ]
+
+  def build_graph(parameters):
+    """Build a batch_to_space graph given `parameters`."""
+    input_tensor = tf.placeholder(
+        dtype=parameters["dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+    input_tensors = [input_tensor]
+
+    # Get block_shape either as a const or as a placeholder (tensor).
+    if parameters["constant_block_shape"]:
+      block_shape = parameters["block_shape"]
+    else:
+      shape = [len(parameters["block_shape"])]
+      block_shape = tf.placeholder(dtype=tf.int32, name="shape", shape=shape)
+      input_tensors.append(block_shape)
+
+    # Get crops either as a const or as a placeholder (tensor).
+    if parameters["constant_crops"]:
+      crops = parameters["crops"]
+    else:
+      shape = [len(parameters["crops"]), 2]
+      crops = tf.placeholder(dtype=tf.int32, name="crops", shape=shape)
+      input_tensors.append(crops)
+
+    out = tf.batch_to_space_nd(input_tensor, block_shape, crops)
+    return input_tensors, [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    values = [
+        create_tensor_data(parameters["dtype"], parameters["input_shape"])
+    ]
+    if not parameters["constant_block_shape"]:
+      values.append(np.array(parameters["block_shape"]))
+    if not parameters["constant_crops"]:
+      values.append(np.array(parameters["crops"]))
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_transpose_tests(zip_path):
+  """Make a set of tests to do transpose."""
+
+  # TODO(nupurgarg): Add test for uint8.
+  test_parameters = [{
+      "dtype": [tf.int32, tf.int64, tf.float32],
+      "input_shape": [[2, 2, 3]],
+      "perm": [[0, 1, 2], [0, 2, 1]],
+      "constant_perm": [True, False],
+  }, {
+      "dtype": [tf.float32],
+      "input_shape": [[1, 2, 3, 4]],
+      "perm": [[0, 1, 2, 3], [3, 0, 1, 2]],
+      "constant_perm": [True, False],
+  }, {
+      "dtype": [tf.float32],
+      "input_shape": [[1, 2, 3, 4, 5]],
+      "perm": [[0, 1, 2, 3, 4]],
+      "constant_perm": [True, False],
+  }]
+
+  def build_graph(parameters):
+    """Build a transpose graph given `parameters`."""
+    input_tensor = tf.placeholder(
+        dtype=parameters["dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+
+    if parameters["constant_perm"]:
+      perm = parameters["perm"]
+      input_tensors = [input_tensor]
+    else:
+      shape = [len(parameters["perm"]), 2]
+      perm = tf.placeholder(dtype=tf.int32, name="perm", shape=shape)
+      input_tensors = [input_tensor, perm]
+
+    out = tf.transpose(input_tensor, perm=perm)
+    return input_tensors, [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    values = [
+        create_tensor_data(parameters["dtype"], parameters["input_shape"])
+    ]
+    if not parameters["constant_perm"]:
+      values.append(np.array(parameters["perm"]))
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_squeeze_tests(zip_path):
+  """Make a set of tests to do squeeze."""
+
+  test_parameters = [{
+      "dtype": [tf.int32, tf.float32, tf.int64],
+      "input_shape": [[1, 2, 1, 3, 1, 4, 1, 1]],
+      "axis": [
+          None, [], [0, 2], [4, 7], [-1, 0, 2, 0, 7, -6], [1], [2, 3, 2],
+          [-1, -2, -4, -6, -8], [0, 2, 4, 6, 7], [7, 6, 4, 2, 0], [6, 6],
+          [0, 1, 2, 3, 4, 5, 6, 7], [-2, -3, 1, 0, 7, -5]
+      ],
+  }, {
+      "dtype": [tf.int32, tf.float32, tf.int64],
+      "input_shape": [[1]],
+      "axis": [None, [], [0], [-1]],
+  }, {
+      "dtype": [tf.int32, tf.float32, tf.int64],
+      "input_shape": [[1, 1, 1, 1, 1]],
+      "axis": [None, [], [0], [3, 0], [-2, 0, 3, 2]],
+  }]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(
+        dtype=parameters["dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+    out = tf.squeeze(input_tensor, axis=parameters["axis"])
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = create_tensor_data(parameters["dtype"],
+                                      parameters["input_shape"])
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_strided_slice_tests(zip_path):
+  """Make a set of tests to do strided_slice."""
+
+  # TODO(soroosh): add test/support for uint8.
+  test_parameters = [
+      # 4-D
+      {
+          "dtype": [tf.float32, tf.int32, tf.int64],
+          "index_type": [tf.int32],
+          "input_shape": [[12, 2, 2, 5]],
+          "begin": [[0, 0, 0, 0], [1, 0, 1, 0]],
+          "end": [[8, 2, 2, 3], [12, 2, 2, 5]],
+          "strides": [None, [2, 1, 3, 1]],
+          "begin_mask": [None, 1, 8],
+          "end_mask": [None, 1, 8],
+          "shrink_axis_mask": [None, 1, 8, 11, 15, -1],
+          "constant_indices": [False, True],
+      },
+      # 2-D
+      {
+          "dtype": [tf.float32, tf.int32, tf.int64],
+          "index_type": [tf.int32],
+          "input_shape": [[2, 3]],
+          "begin": [[0, 0], [1, 0]],
+          "end": [[2, 3], [2, 2]],
+          "strides": [None, [2, 2]],
+          "begin_mask": [None, 1, 2],
+          "end_mask": [None, 1, 2],
+          "shrink_axis_mask": [None, 1, 2, 3, -1],
+          "constant_indices": [False, True],
+      },
+      # Negative strides
+      {
+          "dtype": [tf.float32, tf.int32, tf.int64],
+          "index_type": [tf.int32],
+          "input_shape": [[2, 3]],
+          "begin": [[0, -1]],
+          "end": [[2, -3]],
+          "strides": [[1, -1]],
+          "begin_mask": [None, 1, 2],
+          "end_mask": [None, 1, 2],
+          "shrink_axis_mask": [None, 1, 2, 3, -1],
+          "constant_indices": [False],
+      },
+  ]
+
+  def build_graph(parameters):
+    """Build graph for stride_slice test."""
+    input_tensor = tf.placeholder(
+        dtype=parameters["dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+    if parameters["constant_indices"]:
+      begin = parameters["begin"]
+      end = parameters["end"]
+      strides = parameters["strides"]
+      tensors = [input_tensor]
+    else:
+      begin = tf.placeholder(
+          dtype=parameters["index_type"],
+          name="begin",
+          shape=[len(parameters["input_shape"])])
+      end = tf.placeholder(
+          dtype=parameters["index_type"],
+          name="end",
+          shape=[len(parameters["input_shape"])])
+      strides = (
+          tf.placeholder(
+              dtype=parameters["index_type"],
+              name="strides",
+              shape=[len(parameters["input_shape"])])
+          if parameters["strides"] is not None else None)
+      tensors = [input_tensor, begin, end]
+      if strides is not None:
+        tensors.append(strides)
+    out = tf.strided_slice(
+        input_tensor,
+        begin,
+        end,
+        strides,
+        begin_mask=parameters["begin_mask"],
+        end_mask=parameters["end_mask"])
+    return tensors, [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    """Build inputs for stride_slice test."""
+    input_values = create_tensor_data(parameters["dtype"],
+                                      parameters["input_shape"])
+    index_type = _TF_TYPE_INFO[parameters["index_type"]][0]
+    values = [input_values]
+    if not parameters["constant_indices"]:
+      begin_values = np.array(parameters["begin"]).astype(index_type)
+      end_values = np.array(parameters["end"]).astype(index_type)
+      stride_values = (
+          np.array(parameters["strides"]).astype(index_type)
+          if parameters["strides"] is not None else None)
+      values.append(begin_values)
+      values.append(end_values)
+      if stride_values is not None:
+        values.append(stride_values)
+
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_l2_pool(input_tensor, ksize, strides, padding, data_format):
   """Given an input perform a sequence of TensorFlow ops to produce l2pool."""
   return tf.sqrt(tf.nn.avg_pool(
@@ -1152,28 +1710,39 @@ def main(unused_args):
 
     dispatch = {
         "control_dep.zip": make_control_dep_tests,
-        "add.zip": make_add_tests,
+        "add.zip": make_binary_op_tests_func(tf.add),
+        "space_to_batch_nd.zip": make_space_to_batch_nd_tests,
+        "div.zip": make_binary_op_tests_func(tf.div),
+        "sub.zip": make_binary_op_tests_func(tf.subtract),
+        "batch_to_space_nd.zip": make_batch_to_space_nd_tests,
         "conv.zip": make_conv_tests,
         "constant.zip": make_constant_tests,
         "depthwiseconv.zip": make_depthwiseconv_tests,
         "concat.zip": make_concatenation_tests,
         "fully_connected.zip": make_fully_connected_tests,
         "global_batch_norm.zip": make_global_batch_norm_tests,
+        "gather.zip": make_gather_tests,
         "fused_batch_norm.zip": make_fused_batch_norm_tests,
         "l2norm.zip": make_l2norm_tests,
         "local_response_norm.zip": make_local_response_norm_tests,
-        "mul.zip": make_mul_tests,
+        "mul.zip": make_binary_op_tests_func(tf.multiply),
         "relu.zip": make_relu_tests,
         "relu1.zip": make_relu1_tests,
         "relu6.zip": make_relu6_tests,
         "l2_pool.zip": make_pool_tests(make_l2_pool),
         "avg_pool.zip": make_pool_tests(tf.nn.avg_pool),
         "max_pool.zip": make_pool_tests(tf.nn.max_pool),
+        "pad.zip": make_pad_tests,
         "reshape.zip": make_reshape_tests,
         "resize_bilinear.zip": make_resize_bilinear_tests,
         "sigmoid.zip": make_sigmoid_tests,
         "softmax.zip": make_softmax_tests,
         "space_to_depth.zip": make_space_to_depth_tests,
+        "transpose.zip": make_transpose_tests,
+        "mean.zip": make_mean_tests,
+        "squeeze.zip": make_squeeze_tests,
+        "strided_slice.zip": make_strided_slice_tests,
+        "exp.zip": make_exp_tests,
     }
     out = FLAGS.zip_to_output
     bin_path = FLAGS.toco
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index e7df97ee54cc631c29a3a6f63a85894236f08157..80e806ab03fe1a28da42bd704b3dcb7254c69347 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -20,24 +20,25 @@ limitations under the License.
 #include <sstream>
 #include <gtest/gtest.h>
 #include "re2/re2.h"
-#include "tensorflow/contrib/lite/builtin_op_data.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/testing/parse_testdata.h"
+#include "tensorflow/contrib/lite/testing/tflite_driver.h"
+#include "tensorflow/contrib/lite/testing/util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/subprocess.h"
-#include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/util/command_line_flags.h"
 
+namespace tflite {
+namespace testing {
+
 namespace {
 bool FLAGS_ignore_known_bugs = true;
+// TODO(b/71769302) zip_files_dir should have a more accurate default, if
+// possible
+string* FLAGS_zip_files_dir = new string("./");
+string* FLAGS_unzip_binary_path = new string("/usr/bin/unzip");
 }  // namespace
 
-namespace tflite {
-namespace testing {
-
 // TensorFlow system environment for file system called.
 tensorflow::Env* env = tensorflow::Env::Default();
 
@@ -46,37 +47,51 @@ tensorflow::Env* env = tensorflow::Env::Default();
 // Key is a substring of the test name and value is a bug number.
 // TODO(ahentz): make sure we clean this list up frequently.
 std::map<string, string> kBrokenTests = {
-    // Add doesn't support broadcasting.
-    {R"(addd.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
-    {R"(muld.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
+    // Sub and Div don't support broadcasting.
+    {R"(^\/diva.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
+    {R"(^\/suba.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
 
     // Add only supports float32. (and "constant" tests use Add)
-    {R"(addd.*int32)", "68808744"},
-    {R"(constant.*int32)", "68808744"},
-    {R"(mul.*int32)", "68808744"},
+    {R"(^\/adda.*int32)", "68808744"},
+    {R"(^\/constant.*int32)", "68808744"},
+    {R"(^\/mul.*int32)", "68808744"},
+    {R"(^\/div.*int32)", "68808744"},
+    {R"(^\/sub.*int32)", "68808744"},
+
+    // Pad only supports 4D tensors.
+    {R"(^\/pad.*,input_shape=\[.,.\],paddings=\[\[.,.\],\[.,.\]\])",
+     "70527055"},
+
+    // L2Norm only supports tensors with 4D or fewer.
+    {R"(^\/l2normdim=.*,epsilon=.*,input_shape=\[.,.,.,.,.*\])", "67963684"},
 
-    // Toco or TFLite has a bug to deal with some constant functions with
-    // more than 1 element.
-    {R"(constant.*input_shape=\[(2|2,2,2,2)\])", "68721522"},
+    // BatchToSpaceND doesn't support cropping. This catches test cases with
+    // non-const tensors as crops.
+    {R"(^\/batch_to_space_nd.*crops=\[\[1,1\],\[1,1\]\])", "70594634"},
 
-    // L2Norm only supports 4D tensors.
-    {R"(l2normdim=.*,epsilon=.*,input_shape=\[.,.\])", "67963684"},
-    {R"(l2normdim=.*,epsilon=.*,input_shape=\[.,.,.,.,.*\])", "67963684"},
+    // SpaceToBatchND only supports 4D tensors.
+    {R"(^\/space_to_batch_nd.*input_shape=\[1,4,4,4,1,1\])", "70848787"},
 
     // L2Norm only works for dim=-1.
-    {R"(l2normdim=-2,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
-    {R"(l2normdim=-2,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
-    {R"(l2normdim=2,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
-    {R"(l2normdim=2,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
-    {R"(l2normdim=0,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
-    {R"(l2normdim=0,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
-    {R"(l2normdim=1,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
-    {R"(l2normdim=1,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
-    {R"(l2normdim=\[2,3\],epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
-    {R"(l2normdim=\[2,3\],epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
+    {R"(^\/l2normdim=-2,epsilon=.*,input_shape=\[.,.\])", "67963812"},
+    {R"(^\/l2normdim=0,epsilon=.*,input_shape=\[.,.\])", "67963812"},
+    {R"(^\/l2normdim=-2,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
+    {R"(^\/l2normdim=-2,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
+    {R"(^\/l2normdim=2,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
+    {R"(^\/l2normdim=2,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
+    {R"(^\/l2normdim=0,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
+    {R"(^\/l2normdim=0,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
+    {R"(^\/l2normdim=1,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
+    {R"(^\/l2normdim=1,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
+    {R"(^\/l2normdim=\[2,3\],epsilon=.*,input_shape=\[3,15,14,3\])",
+     "67963812"},
+    {R"(^\/l2normdim=\[2,3\],epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
 
     // ResizeBilinear looks completely incompatible with Tensorflow
-    {R"(resize_bilinear)", "67964336"},
+    {R"(^\/resize_bilinear.*dtype=tf.int32)", "72401107"},
+
+    // Transpose only supports 1D-4D input tensors.
+    {R"(^\/transpose.*input_shape=\[.,.,.,.,.\])", "71545879"},
 };
 
 // Allows test data to be unzipped into a temporary directory and makes
@@ -96,13 +111,14 @@ class ZipEnvironment : public ::testing::Environment {
   }
 
   // Unzip `zip` file into a new temporary directory  `out_dir`.
-  tensorflow::Status UnZip(const std::string& zip, std::string* out_dir) {
+  tensorflow::Status UnZip(const string& zip, string* out_dir) {
     string dir;
     TF_CHECK_OK(MakeTemporaryDirectory(&dir));
     tensorflow::SubProcess proc;
-    std::string unzip_binary =
-        "/usr/bin/unzip";
-    proc.SetProgram(unzip_binary, {"unzip", "-d", dir, zip.c_str()});
+    string unzip_binary = *FLAGS_unzip_binary_path;
+    TF_CHECK_OK(env->FileExists(unzip_binary));
+    TF_CHECK_OK(env->FileExists(zip));
+    proc.SetProgram(unzip_binary, {"unzip", "-d", dir, zip});
     proc.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_PIPE);
     proc.SetChannelAction(tensorflow::CHAN_STDERR, tensorflow::ACTION_PIPE);
     if (!proc.Start())
@@ -144,85 +160,68 @@ ZipEnvironment* zip_environment() {
 // the temporary directory where the zip file has been unarchived and
 // `test_paths` is the list of test prefixes that were in the manifest.
 // Note, it is an error for a manifest to contain no tests.
-tensorflow::Status ReadManifest(const std::string& original_file,
-                                const std::string& dir,
-                                std::vector<std::string>* test_paths) {
+tensorflow::Status ReadManifest(const string& original_file, const string& dir,
+                                std::vector<string>* test_paths) {
   // Read the newline delimited list of entries in the manifest.
   std::ifstream manifest_fp(dir + "/manifest.txt");
-  std::string manifest((std::istreambuf_iterator<char>(manifest_fp)),
-                       std::istreambuf_iterator<char>());
+  string manifest((std::istreambuf_iterator<char>(manifest_fp)),
+                  std::istreambuf_iterator<char>());
   size_t pos = 0;
   int added = 0;
   while (true) {
     size_t end_pos = manifest.find("\n", pos);
-    if (end_pos == std::string::npos) break;
-    std::string filename = manifest.substr(pos, end_pos - pos);
+    if (end_pos == string::npos) break;
+    string filename = manifest.substr(pos, end_pos - pos);
     test_paths->push_back(dir + "/" + filename);
     pos = end_pos + 1;
     added += 1;
   }
   if (!added) {
-    std::string message = "Test had no examples: " + original_file;
+    string message = "Test had no examples: " + original_file;
     return tensorflow::Status(tensorflow::error::UNKNOWN, message.c_str());
   }
   return tensorflow::Status::OK();
 }
 
 // Get a list of tests from a zip file `zip_file_name`.
-std::vector<std::string> UnarchiveZipAndFindTestNames(
-    const std::string& zip_file_name) {
-  std::string zip_file = ::tensorflow::testing::TensorFlowSrcRoot() +
-                         "/contrib/lite/testing/optest/" + zip_file_name;
-  std::string decompress_tmp_dir;
+std::vector<string> UnarchiveZipAndFindTestNames(const string& zip_file_name) {
+  string zip_file = *FLAGS_zip_files_dir + "/" + zip_file_name;
+  string decompress_tmp_dir;
   TF_CHECK_OK(zip_environment()->UnZip(zip_file, &decompress_tmp_dir));
-  std::vector<std::string> stuff;
+  std::vector<string> stuff;
   TF_CHECK_OK(ReadManifest(zip_file, decompress_tmp_dir, &stuff));
   return stuff;
 }
 
-class OpsTest : public ::testing::TestWithParam<std::string> {};
+class OpsTest : public ::testing::TestWithParam<string> {};
 
 TEST_P(OpsTest, RunStuff) {
-  std::string test_path = GetParam();
-  std::string tflite_file = test_path + ".bin";
-  std::string tflite_examples = test_path + ".inputs";
-  auto model = tflite::FlatBufferModel::BuildFromFile(tflite_file.c_str());
-  std::unique_ptr<tflite::Interpreter> interpreter;
-
-  tflite::ops::builtin::BuiltinOpResolver builtins;
-  ASSERT_EQ(tflite::InterpreterBuilder(*model, builtins)(&interpreter),
-            kTfLiteOk);
+  string test_path = GetParam();
+  string tflite_test_case = test_path + "_tests.txt";
+  string tflite_dir = test_path.substr(0, test_path.find_last_of("/"));
+  string test_name = test_path.substr(test_path.find_last_of('/'));
 
-  std::vector<tflite::testing::Example> examples;
-  ASSERT_EQ(tflite::testing::ParseExamples(tflite_examples.c_str(), &examples),
-            kTfLiteOk);
+  std::ifstream tflite_stream(tflite_test_case);
+  ASSERT_TRUE(tflite_stream.is_open()) << tflite_test_case;
+  tflite::testing::TfLiteDriver test_driver(/*use_nnapi=*/true);
+  test_driver.SetModelBaseDir(tflite_dir);
 
   string bug_number;
   for (const auto& p : kBrokenTests) {
-    if (RE2::PartialMatch(test_path, p.first)) {
+    if (RE2::PartialMatch(test_name, p.first)) {
       bug_number = p.second;
     }
   }
 
-  for (const auto& example : examples) {
-    ASSERT_EQ(interpreter->inputs().size(), example.inputs.size());
-    auto result = [&]() {
-      TF_LITE_ENSURE_STATUS(FeedExample(interpreter.get(), example));
-      TF_LITE_ENSURE_STATUS(interpreter->Invoke());
-      TF_LITE_ENSURE_STATUS(CheckOutputs(interpreter.get(), example));
-      return kTfLiteOk;
-    }();
-
-    if (bug_number.empty()) {
-      ASSERT_EQ(result, kTfLiteOk);
+  bool result = tflite::testing::ParseAndRunTests(&tflite_stream, &test_driver);
+  if (bug_number.empty()) {
+    EXPECT_TRUE(result) << test_driver.GetErrorMessage();
+  } else {
+    if (FLAGS_ignore_known_bugs) {
+      EXPECT_FALSE(result);
     } else {
-      if (FLAGS_ignore_known_bugs) {
-        ASSERT_EQ(result, kTfLiteError)
-            << "Not failing as expected dut to http://b/" << bug_number;
-      } else {
-        ASSERT_EQ(result, kTfLiteOk)
-            << "Possibly due to http://b/" << bug_number;
-      }
+      EXPECT_TRUE(result) << test_driver.GetErrorMessage()
+                          << ": Possibly due to http://b/" << bug_number;
     }
   }
 }
@@ -236,19 +235,24 @@ TEST_P(OpsTest, RunStuff) {
 
 INSTANTIATE_TESTS(add)
 INSTANTIATE_TESTS(avg_pool)
+INSTANTIATE_TESTS(space_to_batch_nd)
+INSTANTIATE_TESTS(batch_to_space_nd)
 INSTANTIATE_TESTS(concat)
 INSTANTIATE_TESTS(constant)
 INSTANTIATE_TESTS(control_dep)
 INSTANTIATE_TESTS(conv)
 INSTANTIATE_TESTS(depthwiseconv)
+INSTANTIATE_TESTS(exp)
 INSTANTIATE_TESTS(fully_connected)
 INSTANTIATE_TESTS(fused_batch_norm)
+INSTANTIATE_TESTS(gather)
 INSTANTIATE_TESTS(global_batch_norm)
 INSTANTIATE_TESTS(l2norm)
 INSTANTIATE_TESTS(l2_pool)
 INSTANTIATE_TESTS(local_response_norm)
 INSTANTIATE_TESTS(max_pool)
 INSTANTIATE_TESTS(mul)
+INSTANTIATE_TESTS(pad)
 INSTANTIATE_TESTS(relu)
 INSTANTIATE_TESTS(relu1)
 INSTANTIATE_TESTS(relu6)
@@ -257,6 +261,12 @@ INSTANTIATE_TESTS(resize_bilinear)
 INSTANTIATE_TESTS(sigmoid)
 INSTANTIATE_TESTS(softmax)
 INSTANTIATE_TESTS(space_to_depth)
+INSTANTIATE_TESTS(sub)
+INSTANTIATE_TESTS(div)
+INSTANTIATE_TESTS(transpose)
+INSTANTIATE_TESTS(mean)
+INSTANTIATE_TESTS(squeeze)
+INSTANTIATE_TESTS(strided_slice)
 
 }  // namespace testing
 }  // namespace tflite
@@ -264,16 +274,23 @@ INSTANTIATE_TESTS(space_to_depth)
 int main(int argc, char** argv) {
   ::testing::AddGlobalTestEnvironment(tflite::testing::zip_environment());
 
-  std::vector<tensorflow::Flag> flags = {tensorflow::Flag(
-      "ignore_known_bugs", &FLAGS_ignore_known_bugs,
-      "If a particular model is affected by a known bug, the "
-      "corresponding test should expect the outputs to not match.")};
+  std::vector<tensorflow::Flag> flags = {
+      tensorflow::Flag(
+          "ignore_known_bugs", &tflite::testing::FLAGS_ignore_known_bugs,
+          "If a particular model is affected by a known bug, the "
+          "corresponding test should expect the outputs to not match."),
+      tensorflow::Flag("zip_files_dir", tflite::testing::FLAGS_zip_files_dir,
+                       "Required: Location of the test zips."),
+      tensorflow::Flag("unzip_binary_path",
+                       tflite::testing::FLAGS_unzip_binary_path,
+                       "Required: Location of a suitable unzip binary.")};
   bool success = tensorflow::Flags::Parse(&argc, argv, flags);
   if (!success || (argc == 2 && !strcmp(argv[1], "--helpfull"))) {
     fprintf(stderr, "%s", tensorflow::Flags::Usage(argv[0], flags).c_str());
     return 1;
   }
 
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/contrib/lite/testing/message.h b/tensorflow/contrib/lite/testing/message.h
index 78ef7e2cbe1c323753ac36f1be06a089e650aa37..e2bc4082141f0601c141a193fbea75f8f759146a 100644
--- a/tensorflow/contrib/lite/testing/message.h
+++ b/tensorflow/contrib/lite/testing/message.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_MESSAGE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_MESSAGE_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_MESSAGE_H_
+#define TENSORFLOW_CONTRIB_LITE_TESTING_MESSAGE_H_
 
 #include <memory>
 #include <string>
@@ -79,4 +79,4 @@ class Message {
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_MESSAGE_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_MESSAGE_H_
diff --git a/tensorflow/contrib/lite/testing/nnapi_example.cc b/tensorflow/contrib/lite/testing/nnapi_example.cc
index 74f6cfc3de5d209671c38595434a43128966bb0e..5870782b69217f292fe60821ea8ce4ea1174c495 100644
--- a/tensorflow/contrib/lite/testing/nnapi_example.cc
+++ b/tensorflow/contrib/lite/testing/nnapi_example.cc
@@ -19,80 +19,35 @@ limitations under the License.
 // Usage: bazel run -c opt \
 // tensorflow/contrib/lite/nnapi:nnapi_example -- <filename>
 //
+#include <dirent.h>
 #include <cstdarg>
 #include <cstdio>
-#include "tensorflow/contrib/lite/builtin_op_data.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
+#include <fstream>
+#include <iostream>
+#include <sstream>
 #include "tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h"
 #include "tensorflow/contrib/lite/testing/parse_testdata.h"
+#include "tensorflow/contrib/lite/testing/tflite_driver.h"
 
-// TODO(aselle): FATAL leaves resources hanging.
-void FATAL(const char* format, ...) {
-  va_list args;
-  va_start(args, format);
-  vfprintf(stderr, format, args);
-  va_end(args);
-  fflush(stderr);
-  exit(1);
-}
+string dirname(const string& s) { return s.substr(0, s.find_last_of("/")); }
 
-#define CHECK_TFLITE_SUCCESS(x)                       \
-  if (x != kTfLiteOk) {                               \
-    FATAL("Aborting since tflite returned failure."); \
+bool Interpret(const char* examples_filename, bool use_nnapi) {
+  std::ifstream tflite_stream(examples_filename);
+  if (!tflite_stream.is_open()) {
+    fprintf(stderr, "Can't open input file.");
+    return false;
   }
 
-void Interpret(const char* filename, const char* examples_filename,
-               bool use_nnapi) {
-  // TODO(aselle): Resize of input image should go here
-  // ...
-  // For now I am allocating all tensors. This means I am fixed size.
-  // So I am not using the variable size ability yet.
-  fprintf(stderr, "example file %s\n", examples_filename);
-  std::vector<tflite::testing::Example> examples;
-  CHECK_TFLITE_SUCCESS(
-      tflite::testing::ParseExamples(examples_filename, &examples));
-
-  for (const tflite::testing::Example& example : examples) {
-    auto model = tflite::FlatBufferModel::BuildFromFile(filename);
-    if (!model) FATAL("Cannot read file %s\n", filename);
-    std::unique_ptr<tflite::Interpreter> interpreter;
-    tflite::ops::builtin::BuiltinOpResolver builtins;
-
-    CHECK_TFLITE_SUCCESS(
-        tflite::InterpreterBuilder(*model, builtins)(&interpreter));
+  printf("Use nnapi is set to: %d\n", use_nnapi);
+  tflite::testing::TfLiteDriver test_driver(use_nnapi);
 
-    printf("Use nnapi is set to: %d\n", use_nnapi);
-    interpreter->UseNNAPI(use_nnapi);
-    CHECK_TFLITE_SUCCESS(
-        tflite::testing::FeedExample(interpreter.get(), example));
-
-    {
-      TfLiteTensor* tensor = interpreter->tensor(interpreter->outputs()[0]);
-      if (float* data =
-              interpreter->typed_tensor<float>(interpreter->outputs()[0])) {
-        size_t num = tensor->bytes / sizeof(float);
-        for (float* p = data; p < data + num; p++) {
-          *p = 0;
-        }
-      }
-    }
-    interpreter->Invoke();
-
-    CHECK_TFLITE_SUCCESS(
-        tflite::testing::CheckOutputs(interpreter.get(), example));
-
-    printf("Result:\n");
-    TfLiteTensor* tensor = interpreter->tensor(interpreter->outputs()[0]);
-    if (float* data =
-            interpreter->typed_tensor<float>(interpreter->outputs()[0])) {
-      size_t num = tensor->bytes / sizeof(float);
-      for (float* p = data; p < data + num; p++) {
-        printf(" %f", *p);
-      }
-    }
+  test_driver.SetModelBaseDir(dirname(examples_filename));
+  if (!tflite::testing::ParseAndRunTests(&tflite_stream, &test_driver)) {
+    fprintf(stderr, "Results from tflite don't match.");
+    return false;
   }
+
+  return true;
 }
 
 int main(int argc, char* argv[]) {
@@ -109,6 +64,25 @@ int main(int argc, char* argv[]) {
             argv[0]);
     return 1;
   }
-  Interpret(argv[1], argv[2], use_nnapi);
+
+  string base_dir = dirname(argv[1]);
+  DIR* dir = opendir(base_dir.c_str());
+  if (dir == nullptr) {
+    fprintf(stderr, "Can't open dir %s\n", base_dir.c_str());
+    return 1;
+  }
+  while (struct dirent* ent = readdir(dir)) {
+    string name = ent->d_name;
+    if (name.rfind(".txt") == name.length() - 4) {
+      printf("%s: ", name.c_str());
+      if (Interpret((base_dir + "/" + name).c_str(), use_nnapi)) {
+        printf(" %s\n", "OK");
+      } else {
+        printf(" %s\n", "FAIL");
+      }
+    }
+  }
+  closedir(dir);
+
   return 0;
 }
diff --git a/tensorflow/contrib/lite/testing/parse_testdata.cc b/tensorflow/contrib/lite/testing/parse_testdata.cc
index d745ed27158cdad55bdcd97162cb3dfa9e32c112..0caef0fe2201a668b2235a98304eb353072a3c2f 100644
--- a/tensorflow/contrib/lite/testing/parse_testdata.cc
+++ b/tensorflow/contrib/lite/testing/parse_testdata.cc
@@ -18,6 +18,7 @@ limitations under the License.
 // ASCII file.
 #include "tensorflow/contrib/lite/testing/parse_testdata.h"
 
+#include <cinttypes>
 #include <cmath>
 #include <cstdint>
 #include <cstdio>
@@ -169,6 +170,11 @@ TfLiteStatus FeedExample(tflite::Interpreter* interpreter,
       for (size_t idx = 0; idx < example.inputs[i].flat_data.size(); idx++) {
         data[idx] = example.inputs[i].flat_data[idx];
       }
+    } else if (int64_t* data =
+                   interpreter->typed_tensor<int64_t>(input_index)) {
+      for (size_t idx = 0; idx < example.inputs[i].flat_data.size(); idx++) {
+        data[idx] = example.inputs[i].flat_data[idx];
+      }
     } else {
       fprintf(stderr, "input[%zu] was not float or int data\n", i);
       return kTfLiteError;
@@ -213,8 +219,22 @@ TfLiteStatus CheckOutputs(tflite::Interpreter* interpreter,
         int32_t computed = data[idx];
         int32_t reference = example.outputs[0].flat_data[idx];
         if (std::abs(computed - reference) > 0) {
-          fprintf(stderr, "output[%zu][%zu] did not match %d vs reference %f\n",
-                  i, idx, data[idx], example.outputs[0].flat_data[idx]);
+          fprintf(stderr, "output[%zu][%zu] did not match %d vs reference %d\n",
+                  i, idx, computed, reference);
+          return kTfLiteError;
+        }
+      }
+      fprintf(stderr, "\n");
+    } else if (const int64_t* data =
+                   interpreter->typed_tensor<int64_t>(output_index)) {
+      for (size_t idx = 0; idx < example.outputs[i].flat_data.size(); idx++) {
+        int64_t computed = data[idx];
+        int64_t reference = example.outputs[0].flat_data[idx];
+        if (std::abs(computed - reference) > 0) {
+          fprintf(stderr,
+                  "output[%zu][%zu] did not match %" PRId64
+                  " vs reference %" PRId64 "\n",
+                  i, idx, computed, reference);
           return kTfLiteError;
         }
       }
diff --git a/tensorflow/contrib/lite/testing/parse_testdata.h b/tensorflow/contrib/lite/testing/parse_testdata.h
index 90839fe24550b6c4a0a3a3f4115c479a71580bb0..7ebf362eb99c5f4cf6ea3654cf71e13ff1de99b3 100644
--- a/tensorflow/contrib/lite/testing/parse_testdata.h
+++ b/tensorflow/contrib/lite/testing/parse_testdata.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_NNAPI_PARSE_TESTDATA_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_NNAPI_PARSE_TESTDATA_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_NNAPI_PARSE_TESTDATA_H_
+#define TENSORFLOW_CONTRIB_LITE_NNAPI_PARSE_TESTDATA_H_
 
 #include <vector>
 #include "tensorflow/contrib/lite/interpreter.h"
@@ -71,4 +71,4 @@ bool ParseAndRunTests(std::istream* input, TestRunner* test_runner);
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_NNAPI_PARSE_TESTDATA_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_NNAPI_PARSE_TESTDATA_H_
diff --git a/tensorflow/contrib/lite/testing/split.h b/tensorflow/contrib/lite/testing/split.h
index 24071442e8929f37443df1b98d22711b3024b87c..428cfda4f216f0ee6409a32c43a4cf91ecc11922 100644
--- a/tensorflow/contrib/lite/testing/split.h
+++ b/tensorflow/contrib/lite/testing/split.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_SPLIT_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_SPLIT_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_SPLIT_H_
+#define TENSORFLOW_CONTRIB_LITE_TESTING_SPLIT_H_
 
 #include <cstdlib>
 #include <string>
@@ -53,6 +53,15 @@ inline std::vector<int> Split(const string& s, const string& delimiter) {
   return fields;
 }
 
+template <>
+inline std::vector<int64_t> Split(const string& s, const string& delimiter) {
+  std::vector<int64_t> fields;
+  for (const auto& p : SplitToPos(s, delimiter)) {
+    fields.push_back(strtoll(s.data() + p.first, nullptr, 10));
+  }
+  return fields;
+}
+
 template <>
 inline std::vector<float> Split(const string& s, const string& delimiter) {
   std::vector<float> fields;
@@ -74,4 +83,4 @@ inline std::vector<uint8_t> Split(const string& s, const string& delimiter) {
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_SPLIT_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_SPLIT_H_
diff --git a/tensorflow/contrib/lite/testing/test_runner.h b/tensorflow/contrib/lite/testing/test_runner.h
index f4b26949b57e0702ac5554afd766a6072af268a4..60eaafa474a01887bee12b031b1f59cc5c91f173 100644
--- a/tensorflow/contrib/lite/testing/test_runner.h
+++ b/tensorflow/contrib/lite/testing/test_runner.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TEST_RUNNER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TEST_RUNNER_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_TEST_RUNNER_H_
+#define TENSORFLOW_CONTRIB_LITE_TESTING_TEST_RUNNER_H_
 
 #include <memory>
 #include <string>
@@ -121,4 +121,4 @@ class TestRunner {
 
 }  // namespace testing
 }  // namespace tflite
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TEST_RUNNER_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_TEST_RUNNER_H_
diff --git a/tensorflow/contrib/lite/testing/tf_driver.cc b/tensorflow/contrib/lite/testing/tf_driver.cc
new file mode 100644
index 0000000000000000000000000000000000000000..da6c6ce7b1fda5c3e57daf38207a1598f1e2cd72
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/tf_driver.cc
@@ -0,0 +1,189 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/testing/tf_driver.h"
+
+#include <fstream>
+#include <iostream>
+
+#include "tensorflow/contrib/lite/testing/split.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+
+namespace tflite {
+namespace testing {
+
+namespace {
+
+tensorflow::Tensor CreateTensor(const tensorflow::DataType type,
+                                const std::vector<int64_t>& dim) {
+  tensorflow::TensorShape shape{gtl::ArraySlice<int64>{
+      reinterpret_cast<const int64*>(dim.data()), dim.size()}};
+  return {type, shape};
+}
+
+template <typename T>
+void FillTensorWithData(tensorflow::Tensor* tensor, const string& csv_values) {
+  auto data = tensor->flat<T>();
+
+  const auto& values = testing::Split<T>(csv_values, ",");
+  for (int i = 0; i < values.size(); i++) {
+    data(i) = values[i];
+  }
+}
+
+template <typename T>
+void FillTensorWithZeros(tensorflow::Tensor* tensor) {
+  auto data = tensor->flat<T>();
+  for (int i = 0; i < tensor->NumElements(); i++) {
+    data(i) = 0;
+  }
+}
+
+template <typename T>
+string TensorDataToCsvString(const tensorflow::Tensor& tensor) {
+  std::stringstream stream;
+  const auto& data = tensor.flat<T>();
+  if (data.size() == 0) {
+    return "";
+  }
+  stream << data(0);
+  for (int i = 1; i < data.size(); i++) {
+    stream << "," << data(i);
+  }
+  return stream.str();
+}
+
+}  // namespace
+
+TfDriver::TfDriver(const std::vector<string>& input_layer,
+                   const std::vector<string>& input_layer_type,
+                   const std::vector<string>& input_layer_shape,
+                   const std::vector<string>& output_layer)
+    : input_names_(input_layer), output_names_(output_layer) {
+  CHECK_EQ(input_layer.size(), input_layer_type.size());
+  CHECK_EQ(input_layer.size(), input_layer_shape.size());
+
+  input_ids_.resize(input_layer.size());
+  input_tensors_.reserve(input_layer.size());
+  input_types_.resize(input_layer.size());
+  input_shapes_.resize(input_layer.size());
+  for (int i = 0; i < input_layer.size(); i++) {
+    input_ids_[i] = i;
+    input_tensors_[input_layer[i]] = {};
+    CHECK(DataTypeFromString(input_layer_type[i], &input_types_[i]));
+    input_shapes_[i] = Split<int64_t>(input_layer_shape[i], ",");
+  }
+
+  output_ids_.resize(output_layer.size());
+  output_tensors_.reserve(output_layer.size());
+  for (int i = 0; i < output_layer.size(); i++) {
+    output_ids_[i] = i;
+  }
+}
+
+void TfDriver::LoadModel(const string& bin_file_path) {
+  if (!IsValid()) return;
+  std::cout << std::endl << "Loading model: " << bin_file_path << std::endl;
+  std::ifstream model(bin_file_path);
+  if (model.fail()) {
+    Invalidate("Failed to find the model");
+    return;
+  }
+
+  tensorflow::GraphDef graphdef;
+  if (!graphdef.ParseFromIstream(&model)) {
+    Invalidate("Failed to parse tensorflow graphdef");
+    return;
+  }
+
+  tensorflow::SessionOptions options;
+  session_.reset(tensorflow::NewSession(options));
+  auto status = session_->Create(graphdef);
+  if (!status.ok()) {
+    Invalidate("Failed to create session");
+  }
+}
+
+void TfDriver::SetInput(int id, const string& csv_values) {
+  if (!IsValid()) return;
+
+  auto tensor = CreateTensor(input_types_[id], input_shapes_[id]);
+  switch (input_types_[id]) {
+    case tensorflow::DT_FLOAT: {
+      FillTensorWithData<float>(&tensor, csv_values);
+      break;
+    }
+    case tensorflow::DT_INT32: {
+      FillTensorWithData<int32_t>(&tensor, csv_values);
+      break;
+    }
+    default:
+      fprintf(stderr, "Unsupported type %d in SetInput\n", input_types_[id]);
+      Invalidate("Unsupported tensor data type");
+      return;
+  }
+  input_tensors_[input_names_[id]] = tensor;
+}
+
+void TfDriver::ResetTensor(int id) {
+  if (!IsValid()) return;
+  auto tensor = input_tensors_[input_names_[id]];
+  switch (input_types_[id]) {
+    case tensorflow::DT_FLOAT: {
+      FillTensorWithZeros<float>(&tensor);
+      break;
+    }
+    case tensorflow::DT_INT32: {
+      FillTensorWithZeros<int32_t>(&tensor);
+      break;
+    }
+    default:
+      fprintf(stderr, "Unsupported type %d in ResetTensor\n", input_types_[id]);
+      Invalidate("Unsupported tensor data type");
+      return;
+  }
+}
+
+void TfDriver::ReshapeTensor(int id, const string& csv_values) {
+  input_shapes_[id] = Split<int64_t>(csv_values, ",");
+  input_tensors_[input_names_[id]] =
+      CreateTensor(input_types_[id], input_shapes_[id]);
+  ResetTensor(id);
+}
+
+string TfDriver::ReadOutput(int id) {
+  if (!IsValid()) return "";
+  switch (output_tensors_[id].dtype()) {
+    case tensorflow::DT_FLOAT:
+      return TensorDataToCsvString<float>(output_tensors_[id]);
+    case tensorflow::DT_INT32:
+      return TensorDataToCsvString<int32_t>(output_tensors_[id]);
+    default:
+      fprintf(stderr, "Unsupported type %d in ResetTensor\n", input_types_[id]);
+      Invalidate("Unsupported tensor data type");
+      return "";
+  }
+}
+
+void TfDriver::Invoke() {
+  if (!IsValid()) return;
+  auto status = session_->Run({input_tensors_.begin(), input_tensors_.end()},
+                              output_names_, {}, &output_tensors_);
+  if (!status.ok()) {
+    Invalidate("Failed to invoke interpreter");
+  }
+}
+
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/tf_driver.h b/tensorflow/contrib/lite/testing/tf_driver.h
new file mode 100644
index 0000000000000000000000000000000000000000..2928e57282b97e6e0c3da9a8247274bdbf919b9c
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/tf_driver.h
@@ -0,0 +1,75 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_TF_DRIVER_H_
+#define TENSORFLOW_CONTRIB_LITE_TESTING_TF_DRIVER_H_
+
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/testing/split.h"
+#include "tensorflow/contrib/lite/testing/test_runner.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tflite {
+namespace testing {
+
+// A test runner that feeds inputs into Tensorflow and generates outputs.
+class TfDriver : public TestRunner {
+ public:
+  explicit TfDriver(const std::vector<string>& input_layer,
+                    const std::vector<string>& input_layer_type,
+                    const std::vector<string>& input_layer_shape,
+                    const std::vector<string>& output_layer);
+  ~TfDriver() override {}
+
+  void LoadModel(const string& bin_file_path) override;
+  void SetInput(int id, const string& csv_values) override;
+  void Invoke() override;
+  string ReadOutput(int id);
+
+  const std::vector<int>& GetInputs() override { return input_ids_; }
+  const std::vector<int>& GetOutputs() override { return output_ids_; }
+  void ReshapeTensor(int id, const string& csv_values) override;
+  // Note: ResetTensor only works for input tensor.
+  void ResetTensor(int id) override;
+
+  // no-op. SetInput will overwrite existing data .
+  void AllocateTensors() override {}
+  // no-op. Tf driver is not supposed to check the results.
+  void SetExpectation(int id, const string& csv_values) override {}
+  // tf driver is not supposed to check the results.
+  bool CheckResults() override { return false; }
+
+ private:
+  std::unique_ptr<tensorflow::Session> session_;
+  std::vector<int> input_ids_;
+  std::vector<string> input_names_;
+  std::vector<std::vector<int64_t>> input_shapes_;
+  std::vector<tensorflow::DataType> input_types_;
+  std::unordered_map<string, tensorflow::Tensor> input_tensors_;
+
+  std::vector<int> output_ids_;
+  std::vector<string> output_names_;
+  std::vector<::tensorflow::Tensor> output_tensors_;
+};
+
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_TF_DRIVER_H_
diff --git a/tensorflow/contrib/lite/testing/tf_driver_test.cc b/tensorflow/contrib/lite/testing/tf_driver_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c0faa4676adc3e846ad398bb203b77b99a2ba360
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/tf_driver_test.cc
@@ -0,0 +1,56 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/testing/tf_driver.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace testing {
+namespace {
+
+using ::testing::ElementsAre;
+
+TEST(TfDriverTest, SimpleTest) {
+  std::unique_ptr<TfDriver> runner(
+      new TfDriver({"a", "b", "c", "d"}, {"float", "float", "float", "float"},
+                   {"1,8,8,3", "1,8,8,3", "1,8,8,3", "1,8,8,3"}, {"x", "y"}));
+
+  runner->LoadModel(
+      "third_party/tensorflow/contrib/lite/testdata/multi_add.pb");
+  EXPECT_TRUE(runner->IsValid()) << runner->GetErrorMessage();
+
+  ASSERT_THAT(runner->GetInputs(), ElementsAre(0, 1, 2, 3));
+  ASSERT_THAT(runner->GetOutputs(), ElementsAre(0, 1));
+
+  for (int i : {0, 1, 2, 3}) {
+    runner->ReshapeTensor(i, "1,2,2,1");
+  }
+  ASSERT_TRUE(runner->IsValid());
+
+  runner->SetInput(0, "0.1,0.2,0.3,0.4");
+  runner->SetInput(1, "0.001,0.002,0.003,0.004");
+  runner->SetInput(2, "0.001,0.002,0.003,0.004");
+  runner->SetInput(3, "0.01,0.02,0.03,0.04");
+  runner->ResetTensor(2);
+  runner->Invoke();
+
+  ASSERT_EQ(runner->ReadOutput(0), "0.101,0.202,0.303,0.404");
+  ASSERT_EQ(runner->ReadOutput(1), "0.011,0.022,0.033,0.044");
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/tflite_driver.cc b/tensorflow/contrib/lite/testing/tflite_driver.cc
index cf9df2ec264bcff7f836a70db37afe8a5ce01c28..613223f3d4ff212cb8672494243b2d7a1d06b3db 100644
--- a/tensorflow/contrib/lite/testing/tflite_driver.cc
+++ b/tensorflow/contrib/lite/testing/tflite_driver.cc
@@ -31,6 +31,14 @@ float Value(const TfLitePtrUnion& data, int index) {
   return data.f[index];
 }
 template <>
+int32_t Value(const TfLitePtrUnion& data, int index) {
+  return data.i32[index];
+}
+template <>
+int64_t Value(const TfLitePtrUnion& data, int index) {
+  return data.i64[index];
+}
+template <>
 uint8_t Value(const TfLitePtrUnion& data, int index) {
   return data.uint8[index];
 }
@@ -61,9 +69,14 @@ class TfLiteDriver::Expectation {
     switch (tensor.type) {
       case kTfLiteFloat32:
         return TypedCheck<float>(verbose, tensor);
+      case kTfLiteInt32:
+        return TypedCheck<int32_t>(verbose, tensor);
+      case kTfLiteInt64:
+        return TypedCheck<int64_t>(verbose, tensor);
       case kTfLiteUInt8:
         return TypedCheck<uint8_t>(verbose, tensor);
       default:
+        fprintf(stderr, "Unsupported type %d in Check\n", tensor.type);
         return false;
     }
   }
@@ -71,15 +84,30 @@ class TfLiteDriver::Expectation {
  private:
   template <typename T>
   bool TypedCheck(bool verbose, const TfLiteTensor& tensor) {
+    // TODO(ahentz): must find a way to configure the tolerance.
+    constexpr double kRelativeThreshold = 1e-2f;
+    constexpr double kAbsoluteThreshold = 1e-4f;
+
     int tensor_size = tensor.bytes / sizeof(T);
 
     bool good_output = true;
     for (int i = 0; i < tensor_size; ++i) {
-      if (std::abs(Value<T>(data_, i) - Value<T>(tensor.data, i)) > 1e-5) {
+      float computed = Value<T>(tensor.data, i);
+      float reference = Value<T>(data_, i);
+      float diff = std::abs(computed - reference);
+      bool error_is_large = false;
+      // For very small numbers, try absolute error, otherwise go with
+      // relative.
+      if (std::abs(reference) < kRelativeThreshold) {
+        error_is_large = (diff > kAbsoluteThreshold);
+      } else {
+        error_is_large = (diff > kRelativeThreshold * std::abs(reference));
+      }
+      if (error_is_large) {
         good_output = false;
         if (verbose) {
-          std::cerr << "  index " << i << ": " << Value<T>(data_, i)
-                    << " != " << Value<T>(tensor.data, i) << std::endl;
+          std::cerr << "  index " << i << ": got " << computed
+                    << ", but expected " << reference << std::endl;
         }
       }
     }
@@ -95,8 +123,8 @@ TfLiteDriver::~TfLiteDriver() {}
 void TfLiteDriver::AllocateTensors() {
   if (must_allocate_tensors_) {
     if (interpreter_->AllocateTensors() != kTfLiteOk) {
-      std::cerr << "Failed to allocate tensors" << std::endl;
-      abort();
+      Invalidate("Failed to allocate tensors");
+      return;
     }
     must_allocate_tensors_ = false;
   }
@@ -147,6 +175,18 @@ void TfLiteDriver::SetInput(int id, const string& csv_values) {
       SetTensorData(values, &tensor->data);
       break;
     }
+    case kTfLiteInt32: {
+      const auto& values = testing::Split<int32_t>(csv_values, ",");
+      if (!CheckSizes<int32_t>(tensor->bytes, values.size())) return;
+      SetTensorData(values, &tensor->data);
+      break;
+    }
+    case kTfLiteInt64: {
+      const auto& values = testing::Split<int64_t>(csv_values, ",");
+      if (!CheckSizes<int64_t>(tensor->bytes, values.size())) return;
+      SetTensorData(values, &tensor->data);
+      break;
+    }
     case kTfLiteUInt8: {
       const auto& values = testing::Split<uint8_t>(csv_values, ",");
       if (!CheckSizes<uint8_t>(tensor->bytes, values.size())) return;
@@ -154,6 +194,7 @@ void TfLiteDriver::SetInput(int id, const string& csv_values) {
       break;
     }
     default:
+      fprintf(stderr, "Unsupported type %d in SetInput\n", tensor->type);
       Invalidate("Unsupported tensor data type");
       return;
   }
@@ -162,15 +203,26 @@ void TfLiteDriver::SetInput(int id, const string& csv_values) {
 void TfLiteDriver::SetExpectation(int id, const string& csv_values) {
   if (!IsValid()) return;
   auto* tensor = interpreter_->tensor(id);
+  if (expected_output_.count(id) != 0) {
+    fprintf(stderr, "Overriden expectation for tensor %d\n", id);
+    Invalidate("Overriden expectation");
+  }
   expected_output_[id].reset(new Expectation);
   switch (tensor->type) {
     case kTfLiteFloat32:
       expected_output_[id]->SetData<float>(csv_values);
       break;
+    case kTfLiteInt32:
+      expected_output_[id]->SetData<int32_t>(csv_values);
+      break;
+    case kTfLiteInt64:
+      expected_output_[id]->SetData<int64_t>(csv_values);
+      break;
     case kTfLiteUInt8:
       expected_output_[id]->SetData<uint8_t>(csv_values);
       break;
     default:
+      fprintf(stderr, "Unsupported type %d in SetExpectation\n", tensor->type);
       Invalidate("Unsupported tensor data type");
       return;
   }
diff --git a/tensorflow/contrib/lite/testing/tflite_driver.h b/tensorflow/contrib/lite/testing/tflite_driver.h
index 4440d4285e948c3d1622c8de5c47ff3729c5847f..25689a9fb42c06fa3f8f2f92064cf59e8c331637 100644
--- a/tensorflow/contrib/lite/testing/tflite_driver.h
+++ b/tensorflow/contrib/lite/testing/tflite_driver.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DRIVER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DRIVER_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DRIVER_H_
+#define TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DRIVER_H_
 
 #include <map>
 
@@ -59,4 +59,4 @@ class TfLiteDriver : public TestRunner {
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DRIVER_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DRIVER_H_
diff --git a/tensorflow/contrib/lite/testing/tokenize.h b/tensorflow/contrib/lite/testing/tokenize.h
index daccf0e84a450a0ffdf04a1eb8ff319878cfc808..7ed8eb96b7a10eecd915fe426ab3abf0e7a46ca4 100644
--- a/tensorflow/contrib/lite/testing/tokenize.h
+++ b/tensorflow/contrib/lite/testing/tokenize.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZER_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZER_H_
+#define TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZER_H_
 
 #include <istream>
 #include <string>
@@ -39,4 +39,4 @@ void Tokenize(std::istream* input, TokenProcessor* processor);
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZER_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZER_H_
diff --git a/tensorflow/contrib/lite/testing/util.h b/tensorflow/contrib/lite/testing/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..6d20aec141c7c3a3e48af290edb169c6fd7254cf
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/util.h
@@ -0,0 +1,28 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_TESTING_UTIL_H_
+
+namespace tflite {
+
+inline void LogToStderr() {
+#ifdef PLATFORM_GOOGLE
+  FLAGS_logtostderr = true;
+#endif
+}
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_UTIL_H_
diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index 0bf8d067a3f21a01fc1b384bba2a1703f9367733..45031de09c75e9dbf5ee34fe31e7c69ad08b10aa 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -159,22 +159,23 @@ cc_library(
         "toco_types.h",
     ],
     deps = [
+        # Placeholder for internal file dependency.
+        "@protobuf_archive//:protobuf_headers",
         "//tensorflow/core:framework_lite",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-    ] + select({
-        "//tensorflow:android": [],
-        "//tensorflow:darwin": [],
-        "//tensorflow:ios": [],
-        "//conditions:default": [],
-        "//tensorflow:dummy_disabled_internal": [],
-    }),
+    ],
 )
 
 cc_library(
     name = "graph_transformations",
     srcs = [
+        "graph_transformations/convert_expanddims_to_reshape.cc",
         "graph_transformations/convert_pure_conv_to_depthwise.cc",
+        "graph_transformations/convert_reorder_axes.cc",
+        "graph_transformations/convert_trivial_addn_to_add.cc",
+        "graph_transformations/convert_trivial_stack_to_reshape.cc",
+        "graph_transformations/convert_trivial_transpose_to_reshape.cc",
         "graph_transformations/create_im2col_arrays.cc",
         "graph_transformations/dequantize.cc",
         "graph_transformations/drop_fake_quant.cc",
@@ -188,7 +189,10 @@ cc_library(
         "graph_transformations/identify_l2_normalization.cc",
         "graph_transformations/identify_l2_pool.cc",
         "graph_transformations/identify_lstm.cc",
+        "graph_transformations/identify_lstm_merge_inputs.cc",
+        "graph_transformations/identify_lstm_split_inputs.cc",
         "graph_transformations/identify_relu1.cc",
+        "graph_transformations/lstm_utils.cc",
         "graph_transformations/make_initial_dequantize_operator.cc",
         "graph_transformations/propagate_array_data_types.cc",
         "graph_transformations/propagate_fixed_sizes.cc",
@@ -204,29 +208,41 @@ cc_library(
         "graph_transformations/remove_trivial_passthrough.h",
         "graph_transformations/remove_trivial_quantized_activation_func.cc",
         "graph_transformations/remove_trivial_reshape.cc",
+        "graph_transformations/remove_trivial_slice.cc",
         "graph_transformations/remove_unused_op.cc",
+        "graph_transformations/reorder_activation_functions.cc",
         "graph_transformations/resolve_batch_normalization.cc",
+        "graph_transformations/resolve_batch_to_space_nd_attributes.cc",
         "graph_transformations/resolve_constant_binary.cc",
         "graph_transformations/resolve_constant_concatenation.cc",
         "graph_transformations/resolve_constant_fake_quant.cc",
-        "graph_transformations/resolve_constant_tensorflow_shape.cc",
+        "graph_transformations/resolve_constant_fill.cc",
+        "graph_transformations/resolve_constant_range.cc",
+        "graph_transformations/resolve_constant_shape_or_rank.cc",
+        "graph_transformations/resolve_constant_stack.cc",
+        "graph_transformations/resolve_constant_strided_slice.cc",
+        "graph_transformations/resolve_constant_transpose.cc",
         "graph_transformations/resolve_constant_unary.cc",
         "graph_transformations/resolve_mean_attributes.cc",
         "graph_transformations/resolve_pad_attributes.cc",
         "graph_transformations/resolve_reorder_axes.cc",
         "graph_transformations/resolve_reshape_attributes.cc",
         "graph_transformations/resolve_slice_attributes.cc",
+        "graph_transformations/resolve_space_to_batch_nd_attributes.cc",
+        "graph_transformations/resolve_squeeze_attributes.cc",
         "graph_transformations/resolve_strided_slice_attributes.cc",
         "graph_transformations/resolve_tensorflow_concat.cc",
         "graph_transformations/resolve_tensorflow_matmul.cc",
         "graph_transformations/resolve_tensorflow_merge.cc",
-        "graph_transformations/resolve_tensorflow_squeeze.cc",
         "graph_transformations/resolve_tensorflow_switch.cc",
         "graph_transformations/resolve_tensorflow_tile.cc",
+        "graph_transformations/resolve_transpose_attributes.cc",
         "graph_transformations/unfuse_activation_functions.cc",
+        "graph_transformations/unroll_batch_matmul.cc",
     ],
     hdrs = [
         "graph_transformations/graph_transformations.h",
+        "graph_transformations/lstm_utils.h",
     ],
     visibility = ["//visibility:public"],
     deps = [
@@ -237,6 +253,7 @@ cc_library(
         ":tooling_util",
         ":types_proto_cc",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
     ],
 )
diff --git a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
index 2f4454d7c849c49c853e1379cbdd8241062ba348..49cc1fc2aa365925cde86ceb658ff2b354d06911 100644
--- a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
+++ b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
@@ -148,7 +148,7 @@ std::size_t TransientArraySize(const Model& model, const string& array_name,
   if (!IsAllocatableTransientArray(model, array_name)) {
     return 0;
   }
-  const auto& array = model.arrays.at(array_name);
+  const auto& array = &model.GetArray(array_name);
   CHECK(array->has_shape())
       << "Array '" << array_name << "' doesn't have a shape";
   if (array->data_type == ArrayDataType::kNone) {
@@ -158,9 +158,7 @@ std::size_t TransientArraySize(const Model& model, const string& array_name,
         LOG(FATAL)
             << "A RNN state array, " << array_name << ", still does not "
             << "have a known data type after all graph transformations have "
-            << "run. That's mostly a toco bug --- sorry. For now, you can "
-            << "work around this issue by adding manually_create:true in the "
-            << "--rnn_state description of this RNN state.";
+            << "run.";
       }
     }
     LOG(FATAL) << "An array, " << array_name << ", still does not "
@@ -185,7 +183,7 @@ void AllocateTransientArray(const Model& model, const string& array_name,
   }
   const std::size_t size =
       TransientArraySize(model, array_name, transient_data_alignment);
-  const auto& array = model.arrays.at(array_name);
+  const auto& array = &model.GetArray(array_name);
   CHECK(!array->alloc);
   allocator->Allocate(size, &array->GetOrCreateAlloc());
 }
@@ -197,7 +195,7 @@ void DeallocateTransientArray(const Model& model, const string& array_name,
   if (!IsAllocatableTransientArray(model, array_name)) {
     return;
   }
-  const auto& array = model.arrays.at(array_name);
+  const auto& array = &model.GetArray(array_name);
   CHECK(!!array->alloc);
   allocator->Deallocate(*array->alloc);
 }
@@ -218,7 +216,8 @@ void AllocateTransientArrays(Model* model,
   // just guard this assumption with a CHECK:
   bool batchless_input_shapes = true;
   for (const auto& input_array : model->flags.input_arrays()) {
-    if (input_array.shape().empty() || input_array.shape(0) != 1) {
+    if (!input_array.has_shape() || input_array.shape().dims().empty() ||
+        input_array.shape().dims(0) != 1) {
       batchless_input_shapes = false;
       break;
     }
@@ -230,7 +229,7 @@ void AllocateTransientArrays(Model* model,
   // Construct a sorted map of array names, so that other layout engines can
   // match exactly.
   std::map<string, const Array*> ordered_arrays_map;
-  for (const auto& pair : model->arrays) {
+  for (const auto& pair : model->GetArrayMap()) {
     ordered_arrays_map[pair.first] = pair.second.get();
   }
 
@@ -238,8 +237,8 @@ void AllocateTransientArrays(Model* model,
   // is a misnormer, should read 'workspace'.
   for (const auto& array_pair : ordered_arrays_map) {
     const string& array_name = array_pair.first;
-    const auto& array_lifespan = array_lifespans.find(array_name)->second;
-    if (array_lifespan.persistent) {
+    auto it = array_lifespans.find(array_name);
+    if (it != array_lifespans.end() && it->second.persistent) {
       AllocateTransientArray(*model, array_name, &allocator,
                              transient_data_alignment);
     }
@@ -281,8 +280,8 @@ void AllocateTransientArrays(Model* model,
   std::size_t persistent_alloc_size = 0;
   for (const auto& array_pair : ordered_arrays_map) {
     const string& array_name = array_pair.first;
-    const auto& array_lifespan = array_lifespans.find(array_name)->second;
-    if (array_lifespan.persistent) {
+    auto it = array_lifespans.find(array_name);
+    if (it != array_lifespans.end() && it->second.persistent) {
       persistent_alloc_size +=
           TransientArraySize(*model, array_name, transient_data_alignment);
     }
diff --git a/tensorflow/contrib/lite/toco/allocate_transient_arrays.h b/tensorflow/contrib/lite/toco/allocate_transient_arrays.h
index 12d0d0498f5224962f2775d4e3cb7d8e360cbe46..59d8ada1e9bb985f2eaa7ff6d29bc4f1b054a070 100644
--- a/tensorflow/contrib/lite/toco/allocate_transient_arrays.h
+++ b/tensorflow/contrib/lite/toco/allocate_transient_arrays.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_ALLOCATE_TRANSIENT_ARRAYS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_ALLOCATE_TRANSIENT_ARRAYS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_ALLOCATE_TRANSIENT_ARRAYS_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_ALLOCATE_TRANSIENT_ARRAYS_H_
 
 #include "tensorflow/contrib/lite/toco/model.h"
 
@@ -41,4 +41,4 @@ void AllocateTransientArrays(Model* model,
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_ALLOCATE_TRANSIENT_ARRAYS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_ALLOCATE_TRANSIENT_ARRAYS_H_
diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/contrib/lite/toco/args.h
index 5268902346f720be7ecd4980c696d4df8c3da173..b97a4720a7c4e69f8b69574475d19e0522cfe86d 100644
--- a/tensorflow/contrib/lite/toco/args.h
+++ b/tensorflow/contrib/lite/toco/args.h
@@ -15,12 +15,15 @@ limitations under the License.
 // This abstracts command line arguments in toco.
 // Arg<T> is a parseable type that can register a default value, be able to
 // parse itself, and keep track of whether it was specified.
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_ARGS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_ARGS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_ARGS_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_ARGS_H_
 
 #include <functional>
 #include <unordered_map>
 #include <vector>
+#if defined(PLATFORM_GOOGLE)
+#include "strings/split.h"
+#endif
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_split.h"
 #include "tensorflow/contrib/lite/toco/toco_port.h"
@@ -144,12 +147,12 @@ class Arg<toco::StringMapList> final {
       if (!TryStripPrefixString(outer_member, "{", &outer_member)) return false;
       if (!TryStripSuffixString(outer_member, "}", &outer_member)) return false;
       const std::vector<string> inner_fields_vector =
-          strings::Split(outer_member, ',');
+          absl::StrSplit(outer_member, ',');
 
       std::unordered_map<string, string> element;
       for (const string& member_field : inner_fields_vector) {
         std::vector<string> outer_member_key_value =
-            strings::Split(member_field, ':');
+            absl::StrSplit(member_field, ':');
         if (outer_member_key_value.size() != 2) return false;
         string& key = outer_member_key_value[0];
         string& value = outer_member_key_value[1];
@@ -203,6 +206,9 @@ struct ParsedModelFlags {
   Arg<string> graphviz_last_array;
   Arg<string> dump_graphviz;
   Arg<bool> dump_graphviz_video = Arg<bool>(false);
+  Arg<bool> allow_nonexistent_arrays = Arg<bool>(false);
+  Arg<bool> allow_nonascii_arrays = Arg<bool>(false);
+  Arg<string> arrays_extra_info_file;
 };
 
 // Flags that describe the operation you would like to do (what conversion
@@ -227,4 +233,4 @@ struct ParsedTocoFlags {
 };
 
 }  // namespace toco
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_ARGS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_ARGS_H_
diff --git a/tensorflow/contrib/lite/toco/dump_graphviz.cc b/tensorflow/contrib/lite/toco/dump_graphviz.cc
index f5e2868dc05306d9f08d585e54900a3f873e6079..c726eb6d8678e2703f5acba8b3d8d740186939f5 100644
--- a/tensorflow/contrib/lite/toco/dump_graphviz.cc
+++ b/tensorflow/contrib/lite/toco/dump_graphviz.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/str_replace.h"
+#include "absl/strings/strip.h"
 #include "tensorflow/contrib/lite/toco/model_flags.pb.h"
 #include "tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h"
 #include "tensorflow/contrib/lite/toco/toco_port.h"
@@ -105,6 +106,34 @@ Color GetColorForArray(const Model& model, const string& array_name) {
   return Color(0xF5, 0xF5, 0xF5);
 }
 
+void AppendArrayVal(string* string, Array const& array, int index) {
+  if (array.buffer->type == ArrayDataType::kFloat) {
+    const auto& data = array.GetBuffer<ArrayDataType::kFloat>().data;
+    if (index >= data.size()) {
+      return;
+    }
+    AppendF(string, "%.3f", data[index]);
+  } else if (array.buffer->type == ArrayDataType::kUint8) {
+    const auto& data = array.GetBuffer<ArrayDataType::kUint8>().data;
+    if (index >= data.size()) {
+      return;
+    }
+    AppendF(string, "%d", data[index]);
+  } else if (array.buffer->type == ArrayDataType::kInt32) {
+    const auto& data = array.GetBuffer<ArrayDataType::kInt32>().data;
+    if (index >= data.size()) {
+      return;
+    }
+    AppendF(string, "%d", data[index]);
+  } else if (array.buffer->type == ArrayDataType::kInt64) {
+    const auto& data = array.GetBuffer<ArrayDataType::kInt64>().data;
+    if (index >= data.size()) {
+      return;
+    }
+    AppendF(string, "%d", data[index]);
+  }
+}
+
 NodeProperties GetPropertiesForArray(const Model& model,
                                      const string& array_name) {
   NodeProperties node_properties;
@@ -129,10 +158,44 @@ NodeProperties GetPropertiesForArray(const Model& model,
       if (id == 0) {
         AppendF(&node_properties.label, "%d", array_shape.dims(id));
       } else {
-        AppendF(&node_properties.label, "x%d", array_shape.dims(id));
+        // 0x00D7 is the unicode multiplication symbol
+        AppendF(&node_properties.label, "\u00D7%d", array_shape.dims(id));
       }
     }
     node_properties.label += "]";
+
+    if (array.buffer) {
+      const auto& array = model.GetArray(array_name);
+      int buffer_size = RequiredBufferSizeForShape(array.shape());
+      if (buffer_size <= 4) {
+        AppendF(&node_properties.label, " = ");
+        if (array.shape().dimensions_count() > 0) {
+          AppendF(&node_properties.label, "{");
+        }
+        for (int i = 0; i < buffer_size; i++) {
+          AppendArrayVal(&node_properties.label, array, i);
+          if (i + 1 < buffer_size) {
+            AppendF(&node_properties.label, ", ");
+          }
+        }
+      } else {
+        AppendF(&node_properties.label, "\\n = ");
+        if (array.shape().dimensions_count() > 0) {
+          AppendF(&node_properties.label, "{");
+        }
+        AppendArrayVal(&node_properties.label, array, 0);
+        AppendF(&node_properties.label, ", ");
+        AppendArrayVal(&node_properties.label, array, 1);
+        // 0x2026 is the unicode ellipsis symbol
+        AppendF(&node_properties.label, " \u2026 ");
+        AppendArrayVal(&node_properties.label, array, buffer_size - 2);
+        AppendF(&node_properties.label, ", ");
+        AppendArrayVal(&node_properties.label, array, buffer_size - 1);
+      }
+      if (array.shape().dimensions_count() > 0) {
+        AppendF(&node_properties.label, "}");
+      }
+    }
   }
 
   if (array.minmax) {
@@ -160,7 +223,21 @@ NodeProperties GetPropertiesForOperator(const Operator& op) {
     node_properties.label =
         static_cast<const TensorFlowUnsupportedOperator&>(op).tensorflow_op;
   } else {
-    node_properties.label = OperatorTypeName(op.type);
+    node_properties.label =
+        string(absl::StripPrefix(OperatorTypeName(op.type), "TensorFlow"));
+  }
+  switch (op.fused_activation_function) {
+    case FusedActivationFunctionType::kRelu:
+      AppendF(&node_properties.label, "\\nReLU");
+      break;
+    case FusedActivationFunctionType::kRelu6:
+      AppendF(&node_properties.label, "\\nReLU6");
+      break;
+    case FusedActivationFunctionType::kRelu1:
+      AppendF(&node_properties.label, "\\nReLU1");
+      break;
+    default:
+      break;
   }
   // Additional information for some of the operators.
   switch (op.type) {
@@ -201,8 +278,8 @@ std::vector<const Operator*> OperatorsToDump(const Model& model) {
   if (last_specified) {
     // Return only the part of the graph between graphviz_first_array
     // and graphviz_last_array.
-    CHECK(model.arrays.count(dump_options.graphviz_first_array));
-    CHECK(model.arrays.count(dump_options.graphviz_last_array));
+    CHECK(model.HasArray(dump_options.graphviz_first_array));
+    CHECK(model.HasArray(dump_options.graphviz_last_array));
     std::unordered_set<string> arrays_already_produced;
     std::vector<string> arrays_to_produce;
     arrays_to_produce.push_back(dump_options.graphviz_last_array);
@@ -259,6 +336,10 @@ void DumpGraphviz(const Model& model, string* output_file_contents) {
             op_properties.color.TextColorString().c_str());
     // Add nodes and edges for all inputs of the operator.
     for (const auto& input : op.inputs) {
+      if (!model.HasArray(input)) {
+        // Arrays should _always_ exist. Except, perhaps, during development.
+        continue;
+      }
       auto array_properties = GetPropertiesForArray(model, input);
       if (!already_added_arrays.count(input)) {
         AppendF(output_file_contents, kNodeFormat, input,
@@ -271,6 +352,10 @@ void DumpGraphviz(const Model& model, string* output_file_contents) {
     }
     // Add nodes and edges for all outputs of the operator.
     for (const auto& output : op.outputs) {
+      if (!model.HasArray(output)) {
+        // Arrays should _always_ exist. Except, perhaps, during development.
+        continue;
+      }
       auto array_properties = GetPropertiesForArray(model, output);
       if (!already_added_arrays.count(output)) {
         AppendF(output_file_contents, kNodeFormat, output,
diff --git a/tensorflow/contrib/lite/toco/dump_graphviz.h b/tensorflow/contrib/lite/toco/dump_graphviz.h
index 0fb28e3de844b123a60e36bc23c7d2add8189962..ea5a4031c39580be00130a2fd3a89c61da2acf01 100644
--- a/tensorflow/contrib/lite/toco/dump_graphviz.h
+++ b/tensorflow/contrib/lite/toco/dump_graphviz.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_DUMP_GRAPHVIZ_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_DUMP_GRAPHVIZ_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_DUMP_GRAPHVIZ_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_DUMP_GRAPHVIZ_H_
 
 #include <string>
 
@@ -25,4 +25,4 @@ void DumpGraphviz(const Model& model, string* output_file_contents);
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_DUMP_GRAPHVIZ_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_DUMP_GRAPHVIZ_H_
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index e18cf46c69badf4b7584f723a4ba39f2e0d8dd1d..70d7a9d4a5b823d6b4f704b194aa279fbd3f6e13 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -46,6 +46,32 @@ using tensorflow::TensorProto;
 namespace toco {
 namespace {
 
+tensorflow::DataType GetTensorFlowDataType(ArrayDataType data_type) {
+  switch (data_type) {
+    case ArrayDataType::kBool:
+      return tensorflow::DT_BOOL;
+    case ArrayDataType::kFloat:
+      return tensorflow::DT_FLOAT;
+    case ArrayDataType::kUint8:
+      return tensorflow::DT_UINT8;
+    case ArrayDataType::kInt32:
+      return tensorflow::DT_INT32;
+    case ArrayDataType::kInt64:
+      return tensorflow::DT_INT64;
+    case ArrayDataType::kString:
+      return tensorflow::DT_STRING;
+    default:
+    case ArrayDataType::kNone:
+      LOG(FATAL) << "Unsupported data type: " << static_cast<int>(data_type);
+      return tensorflow::DT_INVALID;
+  }
+}
+
+tensorflow::DataType GetTensorFlowDataType(const Model& model,
+                                           const string& array_name) {
+  return GetTensorFlowDataType(model.GetArray(array_name).data_type);
+}
+
 // TensorFlow sometimes forbids what it calls "legacy scalars",
 // which are 1-D shapes where the unique shape size is 1.
 // See OpKernel::IsLegacyScalar and OpKernel::allow_legacy_scalars.
@@ -156,8 +182,8 @@ void ConvertFloatTensorConst(const Model& model, const string& name,
   const_op->set_name(name);
   (*const_op->mutable_attr())["dtype"].set_type(DT_FLOAT);
   auto* tensor = (*const_op->mutable_attr())["value"].mutable_tensor();
-  CHECK(model.arrays.count(name));
-  const auto& input_array = *model.arrays.at(name);
+  CHECK(model.HasArray(name));
+  const auto& input_array = model.GetArray(name);
   const auto& input_shape = input_array.shape();
   CHECK(input_array.buffer);
   CHECK(input_array.buffer->type == ArrayDataType::kFloat);
@@ -177,8 +203,8 @@ void ConvertFloatTensorConst(const Model& model, const string& name,
   const_op->set_name(name);
   (*const_op->mutable_attr())["dtype"].set_type(DT_FLOAT);
   auto* tensor = (*const_op->mutable_attr())["value"].mutable_tensor();
-  CHECK(model.arrays.count(name));
-  const auto& input_array = *model.arrays.at(name);
+  CHECK(model.HasArray(name));
+  const auto& input_array = model.GetArray(name);
   const auto& input_shape = input_array.shape();
   CHECK(input_array.buffer);
   CHECK(input_array.buffer->type == ArrayDataType::kFloat);
@@ -193,8 +219,8 @@ void ConvertIntTensorConst(const Model& model, const string& name,
   if (HasAlreadyExportedConst(name, *tensorflow_graph)) {
     return;
   }
-  CHECK(model.arrays.count(name));
-  const auto& array = *model.arrays.at(name);
+  CHECK(model.HasArray(name));
+  const auto& array = model.GetArray(name);
   auto* const_op = tensorflow_graph->add_node();
   const_op->set_op("Const");
   const_op->set_name(name);
@@ -212,6 +238,24 @@ void ConvertIntTensorConst(const Model& model, const string& name,
   }
 }
 
+void CreateIntTensorConst(const string& name, const std::vector<int32>& data,
+                          GraphDef* tensorflow_graph) {
+  if (HasAlreadyExportedConst(name, *tensorflow_graph)) {
+    return;
+  }
+  auto* const_op = tensorflow_graph->add_node();
+  const_op->set_op("Const");
+  const_op->set_name(name);
+  (*const_op->mutable_attr())["dtype"].set_type(DT_INT32);
+  auto* tensor = (*const_op->mutable_attr())["value"].mutable_tensor();
+  tensor->set_dtype(DT_INT32);
+  for (auto index : data) {
+    tensor->add_int_val(index);
+  }
+  auto* shape = tensor->mutable_tensor_shape();
+  shape->add_dim()->set_size(data.size());
+}
+
 void CreateMatrixShapeTensorConst(const string& name, int rows, int cols,
                                   GraphDef* tensorflow_graph) {
   if (HasAlreadyExportedConst(name, *tensorflow_graph)) {
@@ -324,7 +368,7 @@ void ConvertConvOperator(const Model& model, const ConvOperator& src_op,
     biasadd_op->add_input(conv_output);
     biasadd_op->add_input(src_op.inputs[2]);
     (*biasadd_op->mutable_attr())["T"].set_type(DT_FLOAT);
-    CHECK(model.arrays.count(src_op.inputs[2]));
+    CHECK(model.HasArray(src_op.inputs[2]));
     const string& bias_array_name =
         WalkUpToConstantArray(model, src_op.inputs[2]);
     const auto& bias_array = model.GetArray(bias_array_name);
@@ -361,7 +405,7 @@ void ConvertDepthwiseConvOperator(const Model& model,
   // We need to convert that to H x W x InputDepth x Multiplier.
   // That's only a matter of constructing a Dims object; the actual
   // array layout is the same.
-  CHECK(model.arrays.count(src_op.inputs[1]));
+  CHECK(model.HasArray(src_op.inputs[1]));
   const string& src_weights_name =
       WalkUpToConstantArray(model, src_op.inputs[1]);
   const auto& src_weights_array = model.GetArray(src_weights_name);
@@ -404,7 +448,7 @@ void ConvertDepthwiseConvOperator(const Model& model,
     biasadd_op->add_input(conv_output);
     biasadd_op->add_input(src_op.inputs[2]);
     (*biasadd_op->mutable_attr())["T"].set_type(DT_FLOAT);
-    CHECK(model.arrays.count(src_op.inputs[2]));
+    CHECK(model.HasArray(src_op.inputs[2]));
     const string& bias_name = WalkUpToConstantArray(model, src_op.inputs[2]);
     const auto& bias_array = model.GetArray(bias_name);
     // TODO(b/62904716) Bias arrays should be 1-D, and used directly.
@@ -445,14 +489,23 @@ void ConvertSpaceToDepthOperator(const Model& model,
 void ConvertFullyConnectedOperator(const Model& model,
                                    const FullyConnectedOperator& src_op,
                                    GraphDef* tensorflow_graph) {
-  const string reshape_output = src_op.outputs[0] + "/reshape";
-  const string reshape_shape = src_op.outputs[0] + "/reshape/shape";
+  // Reshape input activations to have the shape expected by the MatMul.
+  const string reshape_output =
+      AvailableArrayName(model, src_op.outputs[0] + "/reshape");
+  const string reshape_shape =
+      AvailableArrayName(model, reshape_output + "/shape");
+  const auto& fc_weights_array = model.GetArray(src_op.inputs[1]);
+  const auto& fc_weights_shape = fc_weights_array.shape();
+  CHECK_EQ(fc_weights_shape.dimensions_count(), 2);
+  CreateMatrixShapeTensorConst(reshape_shape, fc_weights_shape.dims(1), -1,
+                               tensorflow_graph);
   auto* reshape_op = tensorflow_graph->add_node();
   reshape_op->set_op("Reshape");
   reshape_op->set_name(reshape_output);
   reshape_op->add_input(src_op.inputs[0]);
   reshape_op->add_input(reshape_shape);
-  (*reshape_op->mutable_attr())["T"].set_type(DT_FLOAT);
+  (*reshape_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
 
   const bool has_bias = src_op.inputs.size() >= 3;
   string matmul_output = src_op.outputs[0];
@@ -460,40 +513,45 @@ void ConvertFullyConnectedOperator(const Model& model,
     matmul_output += "/matmul";
   }
 
+  // Transpose the RHS input from column-major to row-major to match TensorFlow
+  // expectations. This is the inverse of the transpose we do during
+  // ResolveTensorFlowMatMul.
+  const string transpose_output =
+      AvailableArrayName(model, matmul_output + "/transpose_weights");
+  const string transpose_perm =
+      AvailableArrayName(model, transpose_output + "/perm");
+  CreateIntTensorConst(transpose_perm, {1, 0}, tensorflow_graph);
+  auto transpose_op = tensorflow_graph->add_node();
+  transpose_op->set_op("Transpose");
+  transpose_op->set_name(transpose_output);
+  *transpose_op->add_input() = src_op.inputs[1];
+  *transpose_op->add_input() = transpose_perm;
+  (*transpose_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[1]));
+  (*transpose_op->mutable_attr())["Tperm"].set_type(DT_INT32);
+
   auto* matmul_op = tensorflow_graph->add_node();
   matmul_op->set_op("MatMul");
-
   matmul_op->set_name(matmul_output);
   *matmul_op->add_input() = reshape_output;
-  *matmul_op->add_input() = src_op.inputs[1];
-  (*matmul_op->mutable_attr())["T"].set_type(DT_FLOAT);
+  *matmul_op->add_input() = transpose_op->name();
+  (*matmul_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
   (*matmul_op->mutable_attr())["transpose_a"].set_b(false);
   (*matmul_op->mutable_attr())["transpose_b"].set_b(false);
-  CHECK(model.arrays.count(src_op.inputs[1]));
-  const string& fc_weights_name =
-      WalkUpToConstantArray(model, src_op.inputs[1]);
-  const auto& fc_weights_array = *model.arrays.at(fc_weights_name);
-  const auto& fc_weights_shape = fc_weights_array.shape();
-  CHECK_EQ(fc_weights_shape.dimensions_count(), 2);
-  CreateMatrixShapeTensorConst(reshape_shape, fc_weights_shape.dims(1), -1,
-                               tensorflow_graph);
-
-  CHECK(fc_weights_array.buffer);
-  CHECK(fc_weights_array.buffer->type == ArrayDataType::kFloat);
-  const float* fc_weights_data =
-      fc_weights_array.GetBuffer<ArrayDataType::kFloat>().data.data();
-  ConvertFloatTensorConst(fc_weights_name, fc_weights_shape, fc_weights_data,
-                          AxesOrder::kCR, AxesOrder::kRC, tensorflow_graph);
+  CHECK(model.HasArray(src_op.inputs[1]));
 
+  // Add the bias, if it exists.
   if (has_bias) {
     auto* biasadd_op = tensorflow_graph->add_node();
     biasadd_op->set_op("BiasAdd");
     biasadd_op->set_name(src_op.outputs[0]);
     biasadd_op->add_input(matmul_output);
     biasadd_op->add_input(src_op.inputs[2]);
-    (*biasadd_op->mutable_attr())["T"].set_type(DT_FLOAT);
-    CHECK(model.arrays.count(src_op.inputs[2]));
-    const auto& bias_array = *model.arrays.at(src_op.inputs[2]);
+    (*biasadd_op->mutable_attr())["T"].set_type(
+        GetTensorFlowDataType(model, src_op.inputs[0]));
+    CHECK(model.HasArray(src_op.inputs[2]));
+    const auto& bias_array = model.GetArray(src_op.inputs[2]);
     // TODO(b/62904716) Bias arrays should be 1-D, and used directly.
     Shape bias_shape_1d = bias_array.shape();
     UnextendShape(&bias_shape_1d, 1);
@@ -519,6 +577,18 @@ void ConvertAddOperator(const Model& model, const AddOperator& src_op,
   (*add_op->mutable_attr())["T"].set_type(DT_FLOAT);
 }
 
+void ConvertAddNOperator(const Model& model, const AddNOperator& src_op,
+                         GraphDef* tensorflow_graph) {
+  auto* add_op = tensorflow_graph->add_node();
+  add_op->set_op("AddN");
+  add_op->set_name(src_op.outputs[0]);
+  for (const auto& input : src_op.inputs) {
+    *add_op->add_input() = input;
+  }
+  (*add_op->mutable_attr())["N"].set_i(src_op.inputs.size());
+  (*add_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
 void ConvertMulOperator(const Model& model, const MulOperator& src_op,
                         GraphDef* tensorflow_graph) {
   auto* add_op = tensorflow_graph->add_node();
@@ -609,7 +679,8 @@ void ConvertSoftmaxOperator(const Model& model, const SoftmaxOperator& src_op,
                             GraphDef* tensorflow_graph) {
   string softmax_input;
   Operator* providing_op = GetOpWithOutput(model, src_op.inputs[0]);
-  if (providing_op->type == OperatorType::kTensorFlowReshape) {
+  if (providing_op != nullptr &&
+      providing_op->type == OperatorType::kTensorFlowReshape) {
     softmax_input = src_op.inputs[0];
   } else {
     // Insert a reshape operator that reduces the dimensions down to the 2 that
@@ -625,7 +696,7 @@ void ConvertSoftmaxOperator(const Model& model, const SoftmaxOperator& src_op,
     *reshape_op->add_input() = softmax_size;
     (*reshape_op->mutable_attr())["T"].set_type(DT_FLOAT);
 
-    const auto& input_shape = model.arrays.at(src_op.inputs[0])->shape();
+    const auto& input_shape = model.GetArray(src_op.inputs[0]).shape();
     int32 flattened_size = 1;
     for (int i = 0; i < input_shape.dimensions_count() - 1; ++i) {
       flattened_size *= input_shape.dims(i);
@@ -644,6 +715,45 @@ void ConvertSoftmaxOperator(const Model& model, const SoftmaxOperator& src_op,
   (*softmax_op->mutable_attr())["T"].set_type(DT_FLOAT);
 }
 
+void ConvertLogSoftmaxOperator(const Model& model,
+                               const LogSoftmaxOperator& src_op,
+                               GraphDef* tensorflow_graph) {
+  string softmax_input;
+  Operator* providing_op = GetOpWithOutput(model, src_op.inputs[0]);
+  if (providing_op->type == OperatorType::kTensorFlowReshape) {
+    softmax_input = src_op.inputs[0];
+  } else {
+    // Insert a reshape operator that reduces the dimensions down to the 2 that
+    // are required for TensorFlow Logits.
+    const string reshape_output =
+        src_op.outputs[0] + "/log_softmax_insert_reshape";
+    const string softmax_size = src_op.outputs[0] + "/log_softmax_insert_size";
+    softmax_input = reshape_output;
+
+    auto* reshape_op = tensorflow_graph->add_node();
+    reshape_op->set_op("Reshape");
+    reshape_op->set_name(reshape_output);
+    *reshape_op->add_input() = src_op.inputs[0];
+    *reshape_op->add_input() = softmax_size;
+    (*reshape_op->mutable_attr())["T"].set_type(DT_FLOAT);
+
+    const auto& input_shape = model.GetArray(src_op.inputs[0]).shape();
+    int32 flattened_size = 1;
+    for (int i = 0; i < input_shape.dimensions_count() - 1; ++i) {
+      flattened_size *= input_shape.dims(i);
+    }
+    const std::vector<int32> shape_data = {
+        flattened_size, input_shape.dims(input_shape.dimensions_count() - 1)};
+    CreateReshapeShapeTensorConst(softmax_size, shape_data, tensorflow_graph);
+  }
+
+  auto* log_softmax_op = tensorflow_graph->add_node();
+  log_softmax_op->set_op("LogSoftmax");
+  log_softmax_op->set_name(src_op.outputs[0]);
+  *log_softmax_op->add_input() = softmax_input;
+  (*log_softmax_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
 void ConvertL2NormalizationOperator(const L2NormalizationOperator& src_op,
                                     GraphDef* tensorflow_graph) {
   const string square_output = src_op.outputs[0] + "/square";
@@ -780,14 +890,14 @@ void ConvertConcatenationOperator(const Model& model,
   auto* dc_op = tensorflow_graph->add_node();
   dc_op->set_op("ConcatV2");
   dc_op->set_name(src_op.outputs[0]);
-  const string dummy_concat_dim = src_op.outputs[0] + "/concat_dim";
-  CreateDummyConcatDimTensorConst(dummy_concat_dim, src_op.concat_dim,
-                                  tensorflow_graph);
+  const string dummy_axis = src_op.outputs[0] + "/axis";
+  CreateDummyConcatDimTensorConst(dummy_axis, src_op.axis, tensorflow_graph);
   for (const auto& input : src_op.inputs) {
     *dc_op->add_input() = input;
   }
-  *dc_op->add_input() = dummy_concat_dim;
-  (*dc_op->mutable_attr())["T"].set_type(DT_FLOAT);
+  *dc_op->add_input() = dummy_axis;
+  (*dc_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
   (*dc_op->mutable_attr())["Tidx"].set_type(DT_INT32);
   (*dc_op->mutable_attr())["N"].set_i(src_op.inputs.size());
 }
@@ -801,10 +911,13 @@ void ConvertTensorFlowReshapeOperator(const Model& model,
   CHECK_EQ(src_op.inputs.size(), 2);
   *reshape_op->add_input() = src_op.inputs[0];
   *reshape_op->add_input() = src_op.inputs[1];
-  (*reshape_op->mutable_attr())["T"].set_type(DT_FLOAT);
+  (*reshape_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.outputs[0]));
   const auto& shape_array = model.GetArray(src_op.inputs[1]);
-  CHECK(shape_array.data_type == ArrayDataType::kInt32);
-  CHECK(shape_array.buffer != nullptr);
+  QCHECK(shape_array.data_type == ArrayDataType::kInt32)
+      << "Only int32 shape is supported.";
+  QCHECK(shape_array.buffer != nullptr)
+      << "Shape inferred at runtime is not supported.";
   const auto& shape_data = shape_array.GetBuffer<ArrayDataType::kInt32>().data;
   CreateReshapeShapeTensorConst(src_op.inputs[1], shape_data, tensorflow_graph);
 }
@@ -896,22 +1009,6 @@ void ConvertSplitOperator(const Model& model,
                                   tensorflow_graph);
 }
 
-tensorflow::DataType GetTensorFlowDataType(const Model& model,
-                                           const string& array_name) {
-  auto& dtype = model.GetArray(array_name).data_type;
-  CHECK(dtype == ArrayDataType::kFloat || dtype == ArrayDataType::kInt32 ||
-        dtype == ArrayDataType::kUint8);
-  if (dtype == ArrayDataType::kFloat) {
-    return tensorflow::DT_FLOAT;
-  } else if (dtype == ArrayDataType::kInt32) {
-    return tensorflow::DT_INT32;
-  } else if (dtype == ArrayDataType::kUint8) {
-    return tensorflow::DT_UINT8;
-  } else {
-    LOG(FATAL) << "Wrong data type";
-  }
-}
-
 void ConvertCastOperator(const Model& model, const CastOperator& src_op,
                          GraphDef* tensorflow_graph) {
   auto* cast_op = tensorflow_graph->add_node();
@@ -950,6 +1047,129 @@ void ConvertGatherOperator(const Model& model, const GatherOperator& src_op,
   (*gather_op->mutable_attr())["Tparams"].set_type(params_type);
 }
 
+void ConvertArgMaxOperator(const Model& model, const ArgMaxOperator& src_op,
+                           GraphDef* tensorflow_graph) {
+  auto* argmax_op = tensorflow_graph->add_node();
+  argmax_op->set_op("ArgMax");
+  argmax_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *argmax_op->add_input() = src_op.inputs[0];
+  *argmax_op->add_input() = src_op.inputs[1];
+  (*argmax_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
+  (*argmax_op->mutable_attr())["Tidx"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[1]));
+  (*argmax_op->mutable_attr())["output_type"].set_type(
+      GetTensorFlowDataType(model, src_op.outputs[0]));
+}
+
+void ConvertTransposeOperator(const Model& model,
+                              const TransposeOperator& src_op,
+                              GraphDef* tensorflow_graph) {
+  auto* transpose_op = tensorflow_graph->add_node();
+  transpose_op->set_op("Transpose");
+  transpose_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *transpose_op->add_input() = src_op.inputs[0];
+  *transpose_op->add_input() = src_op.inputs[1];
+  (*transpose_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
+  (*transpose_op->mutable_attr())["Tperm"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[1]));
+}
+
+void ConvertTensorFlowShapeOperator(const Model& model,
+                                    const TensorFlowShapeOperator& src_op,
+                                    GraphDef* tensorflow_graph) {
+  auto* shape_op = tensorflow_graph->add_node();
+  shape_op->set_op("Shape");
+  shape_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 1);
+  *shape_op->add_input() = src_op.inputs[0];
+  (*shape_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
+  (*shape_op->mutable_attr())["out_type"].set_type(
+      GetTensorFlowDataType(model, src_op.outputs[0]));
+}
+
+void ConvertRankOperator(const Model& model, const RankOperator& src_op,
+                         GraphDef* tensorflow_graph) {
+  auto* rank_op = tensorflow_graph->add_node();
+  rank_op->set_op("Rank");
+  rank_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 1);
+  *rank_op->add_input() = src_op.inputs[0];
+  (*rank_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
+}
+
+void ConvertRangeOperator(const Model& model, const RangeOperator& src_op,
+                          GraphDef* tensorflow_graph) {
+  auto* range_op = tensorflow_graph->add_node();
+  range_op->set_op("Range");
+  range_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 3);
+  *range_op->add_input() = src_op.inputs[0];
+  *range_op->add_input() = src_op.inputs[1];
+  *range_op->add_input() = src_op.inputs[2];
+  (*range_op->mutable_attr())["Tidx"].set_type(
+      GetTensorFlowDataType(src_op.dtype));
+}
+
+void ConvertStackOperator(const Model& model, const StackOperator& src_op,
+                          GraphDef* tensorflow_graph) {
+  auto* stack_op = tensorflow_graph->add_node();
+  stack_op->set_op("Stack");
+  stack_op->set_name(src_op.outputs[0]);
+  for (const auto& input : src_op.inputs) {
+    *stack_op->add_input() = input;
+  }
+  (*stack_op->mutable_attr())["elem_type"].set_type(
+      GetTensorFlowDataType(model, src_op.outputs[0]));
+  (*stack_op->mutable_attr())["axis"].set_i(src_op.axis);
+}
+
+void ConvertFillOperator(const Model& model, const FillOperator& src_op,
+                         GraphDef* tensorflow_graph) {
+  auto* fill_op = tensorflow_graph->add_node();
+  fill_op->set_op("Fill");
+  fill_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *fill_op->add_input() = src_op.inputs[0];
+  *fill_op->add_input() = src_op.inputs[1];
+  (*fill_op->mutable_attr())["index_type"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
+  (*fill_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[1]));
+}
+
+void ConvertFloorDivOperator(const Model& model, const FloorDivOperator& src_op,
+                             GraphDef* tensorflow_graph) {
+  auto* floor_div_op = tensorflow_graph->add_node();
+  floor_div_op->set_op("FloorDiv");
+  floor_div_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *floor_div_op->add_input() = src_op.inputs[0];
+  *floor_div_op->add_input() = src_op.inputs[1];
+  (*floor_div_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
+}
+
+void ConvertExpandDimsOperator(const Model& model,
+                               const ExpandDimsOperator& src_op,
+                               GraphDef* tensorflow_graph) {
+  auto* expand_dims_op = tensorflow_graph->add_node();
+  expand_dims_op->set_op("ExpandDims");
+  expand_dims_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *expand_dims_op->add_input() = src_op.inputs[0];
+  *expand_dims_op->add_input() = src_op.inputs[1];
+  (*expand_dims_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
+  (*expand_dims_op->mutable_attr())["Tdim"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[1]));
+}
+
 void ConvertResizeBilinearOperator(const Model& model,
                                    const ResizeBilinearOperator& src_op,
                                    GraphDef* tensorflow_graph) {
@@ -960,6 +1180,7 @@ void ConvertResizeBilinearOperator(const Model& model,
   *resize_op->add_input() = src_op.inputs[0];
   *resize_op->add_input() = src_op.inputs[1];
   (*resize_op->mutable_attr())["T"].set_type(DT_FLOAT);
+  (*resize_op->mutable_attr())["align_corners"].set_b(src_op.align_corners);
 }
 
 namespace {
@@ -993,31 +1214,30 @@ void ConvertLstmCellOperator(const Model& model, const LstmCellOperator& src_op,
   const string concat_output = base + "basic_lstm_cell/concat";
   // Op names have been chosen to match the tf.slim LSTM naming
   // as closely as possible.
-  const int concat_dim =
-      model.arrays.at(src_op.inputs[LstmCellOperator::PREV_ACTIV_INPUT])
-          ->shape()
+  const int axis =
+      model.GetArray(src_op.inputs[LstmCellOperator::PREV_ACTIV_INPUT])
+          .shape()
           .dimensions_count() -
       1;
   // Note that DATA_INPUT may have extra size 1 dimensions, but TF concat
   // works the same since the tensor has the same underlying data layout.
-  const string concat_dim_output = concat_output + "/concat_dim";
-  CreateDummyConcatDimTensorConst(concat_dim_output, concat_dim,
-                                  tensorflow_graph);
+  const string axis_output = concat_output + "/axis";
+  CreateDummyConcatDimTensorConst(axis_output, axis, tensorflow_graph);
   auto* concat_op = tensorflow_graph->add_node();
   concat_op->set_op("ConcatV2");
   concat_op->set_name(concat_output);
   *concat_op->add_input() = src_op.inputs[LstmCellOperator::DATA_INPUT];
   *concat_op->add_input() = src_op.inputs[LstmCellOperator::PREV_ACTIV_INPUT];
-  *concat_op->add_input() = concat_dim_output;
+  *concat_op->add_input() = axis_output;
   (*concat_op->mutable_attr())["T"].set_type(DT_FLOAT);
   (*concat_op->mutable_attr())["Tidx"].set_type(DT_INT32);
   (*concat_op->mutable_attr())["N"].set_i(2);  // Number of inputs
 
   // Write weights
   const string weights_output = base + "weights";
-  CHECK(model.arrays.count(src_op.inputs[LstmCellOperator::WEIGHTS_INPUT]));
+  CHECK(model.HasArray(src_op.inputs[LstmCellOperator::WEIGHTS_INPUT]));
   const auto& weights_array =
-      *model.arrays.at(src_op.inputs[LstmCellOperator::WEIGHTS_INPUT]);
+      model.GetArray(src_op.inputs[LstmCellOperator::WEIGHTS_INPUT]);
   // Convert 4D FullyConnected weights into 2D matrix
   const auto& weights_shape = weights_array.shape();
   CHECK_EQ(weights_shape.dimensions_count(), 2);
@@ -1041,9 +1261,9 @@ void ConvertLstmCellOperator(const Model& model, const LstmCellOperator& src_op,
 
   // Write biases
   const string biases_output = base + "biases";
-  CHECK(model.arrays.count(src_op.inputs[LstmCellOperator::BIASES_INPUT]));
+  CHECK(model.HasArray(src_op.inputs[LstmCellOperator::BIASES_INPUT]));
   const auto& bias_array =
-      *model.arrays.at(src_op.inputs[LstmCellOperator::BIASES_INPUT]);
+      model.GetArray(src_op.inputs[LstmCellOperator::BIASES_INPUT]);
   // TODO(b/62904716) Bias arrays should be 1-D, and used directly.
   Shape bias_shape_1d = bias_array.shape();
   UnextendShape(&bias_shape_1d, 1);
@@ -1069,8 +1289,7 @@ void ConvertLstmCellOperator(const Model& model, const LstmCellOperator& src_op,
   // Split
   string split_dim_output = base + "split/split_dim";
   // The dimension is the same as the concatenation dimension
-  CreateDummyConcatDimTensorConst(split_dim_output, concat_dim,
-                                  tensorflow_graph);
+  CreateDummyConcatDimTensorConst(split_dim_output, axis, tensorflow_graph);
   string split_output = base + "split";
   auto* split_op = tensorflow_graph->add_node();
   split_op->set_op("Split");
@@ -1298,11 +1517,11 @@ void ConvertMeanOperator(const Model& model, const MeanOperator& src_op,
   auto* tensor = (*params_op->mutable_attr())["value"].mutable_tensor();
   tensor->set_dtype(DT_INT32);
 
-  for (int i = 0; i < src_op.reduction_indices.size(); ++i) {
-    tensor->add_int_val(src_op.reduction_indices[i]);
+  for (int i = 0; i < src_op.axis.size(); ++i) {
+    tensor->add_int_val(src_op.axis[i]);
   }
   auto* shape = tensor->mutable_tensor_shape();
-  shape->add_dim()->set_size(src_op.reduction_indices.size());
+  shape->add_dim()->set_size(src_op.axis.size());
 }
 
 void ConvertSqueezeOperator(const Model& model, const SqueezeOperator& src_op,
@@ -1389,6 +1608,9 @@ void ConvertOperator(const Model& model, const Operator& src_op,
   } else if (src_op.type == OperatorType::kAdd) {
     ConvertAddOperator(model, static_cast<const AddOperator&>(src_op),
                        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kAddN) {
+    ConvertAddNOperator(model, static_cast<const AddNOperator&>(src_op),
+                        tensorflow_graph);
   } else if (src_op.type == OperatorType::kMul) {
     ConvertMulOperator(model, static_cast<const MulOperator&>(src_op),
                        tensorflow_graph);
@@ -1413,6 +1635,10 @@ void ConvertOperator(const Model& model, const Operator& src_op,
   } else if (src_op.type == OperatorType::kSoftmax) {
     ConvertSoftmaxOperator(model, static_cast<const SoftmaxOperator&>(src_op),
                            tensorflow_graph);
+  } else if (src_op.type == OperatorType::kLogSoftmax) {
+    ConvertLogSoftmaxOperator(model,
+                              static_cast<const LogSoftmaxOperator&>(src_op),
+                              tensorflow_graph);
   } else if (src_op.type == OperatorType::kLocalResponseNormalization) {
     ConvertLocalResponseNormalizationOperator(
         static_cast<const LocalResponseNormalizationOperator&>(src_op),
@@ -1498,6 +1724,35 @@ void ConvertOperator(const Model& model, const Operator& src_op,
   } else if (src_op.type == OperatorType::kSlice) {
     ConvertSliceOperator(model, static_cast<const SliceOperator&>(src_op),
                          tensorflow_graph);
+  } else if (src_op.type == OperatorType::kArgMax) {
+    ConvertArgMaxOperator(model, static_cast<const ArgMaxOperator&>(src_op),
+                          tensorflow_graph);
+  } else if (src_op.type == OperatorType::kTranspose) {
+    ConvertTransposeOperator(
+        model, static_cast<const TransposeOperator&>(src_op), tensorflow_graph);
+  } else if (src_op.type == OperatorType::kTensorFlowShape) {
+    ConvertTensorFlowShapeOperator(
+        model, static_cast<const TensorFlowShapeOperator&>(src_op),
+        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kRank) {
+    ConvertRankOperator(model, static_cast<const RankOperator&>(src_op),
+                        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kRange) {
+    ConvertRangeOperator(model, static_cast<const RangeOperator&>(src_op),
+                         tensorflow_graph);
+  } else if (src_op.type == OperatorType::kStack) {
+    ConvertStackOperator(model, static_cast<const StackOperator&>(src_op),
+                         tensorflow_graph);
+  } else if (src_op.type == OperatorType::kFill) {
+    ConvertFillOperator(model, static_cast<const FillOperator&>(src_op),
+                        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kFloorDiv) {
+    ConvertFloorDivOperator(model, static_cast<const FloorDivOperator&>(src_op),
+                            tensorflow_graph);
+  } else if (src_op.type == OperatorType::kExpandDims) {
+    ConvertExpandDimsOperator(model,
+                              static_cast<const ExpandDimsOperator&>(src_op),
+                              tensorflow_graph);
   } else {
     LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(src_op.type);
   }
@@ -1537,7 +1792,7 @@ void AddPlaceholderForRNNState(const Model& model, const string& name, int size,
   (*placeholder->mutable_attr())["dtype"].set_type(DT_FLOAT);
 
   auto* shape = (*placeholder->mutable_attr())["shape"].mutable_shape();
-  const auto& state_array = *model.arrays.at(name);
+  const auto& state_array = model.GetArray(name);
   if (state_array.has_shape()) {
     const auto& state_shape = state_array.shape();
     const int kDims = state_shape.dimensions_count();
@@ -1554,7 +1809,7 @@ void ExportTensorFlowGraphDefImplementation(const Model& model,
                                             GraphDef* tensorflow_graph) {
   for (const auto& input_array : model.flags.input_arrays()) {
     AddPlaceholder(input_array.name(),
-                   model.arrays.at(input_array.name())->data_type,
+                   model.GetArray(input_array.name()).data_type,
                    tensorflow_graph);
   }
   for (const auto& rnn_state : model.flags.rnn_states()) {
@@ -1568,7 +1823,7 @@ void ExportTensorFlowGraphDefImplementation(const Model& model,
   // by the above operators export. It's important that this comes
   // after, as some operators need to export arrays that they reference
   // in a specific way, rather than in the generic way done below.
-  for (const auto& array_pair : model.arrays) {
+  for (const auto& array_pair : model.GetArrayMap()) {
     const string& array_name = array_pair.first;
     const auto& array = *array_pair.second;
     if (array.buffer) {
@@ -1587,6 +1842,30 @@ void ExportTensorFlowGraphDefImplementation(const Model& model,
 }
 }  // namespace
 
+void EncodeConstantArraysMinMaxByWrappingThemInFakeQuantNodes(Model* model) {
+  for (const auto& array_kv : model->GetArrayMap()) {
+    const string& array_name = array_kv.first;
+    Array& array = *array_kv.second;
+    if (!array.buffer || !array.minmax) {
+      continue;
+    }
+    const string& wrapped_array_name =
+        AvailableArrayName(*model, array_name + "/data");
+    Array& wrapped_array = model->GetOrCreateArray(wrapped_array_name);
+    wrapped_array.data_type = array.data_type;
+    wrapped_array.copy_shape(array.shape());
+    wrapped_array.buffer = std::move(array.buffer);
+    FakeQuantOperator* fakequant_op = new FakeQuantOperator;
+    fakequant_op->inputs = {wrapped_array_name};
+    fakequant_op->outputs = {array_name};
+    fakequant_op->minmax.reset(new MinMax);
+    *fakequant_op->minmax = *array.minmax;
+    const auto& it = FindOpWithInput(*model, array_name);
+    model->operators.emplace(it, fakequant_op);
+  }
+  CheckInvariants(*model);
+}
+
 void ExportTensorFlowGraphDef(const Model& model,
                               string* output_file_contents) {
   CHECK(output_file_contents->empty());
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.h b/tensorflow/contrib/lite/toco/export_tensorflow.h
index eca97745767387a04bcd2c8deb579928edf2497c..d7310bb75f258cde25236da2a9269f18234784e4 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.h
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_EXPORT_TENSORFLOW_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_EXPORT_TENSORFLOW_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_EXPORT_TENSORFLOW_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_EXPORT_TENSORFLOW_H_
 
 #include <string>
 #include "tensorflow/contrib/lite/toco/model.h"
@@ -22,6 +22,8 @@ namespace toco {
 
 void ExportTensorFlowGraphDef(const Model& model, string* output_file_contents);
 
+void EncodeConstantArraysMinMaxByWrappingThemInFakeQuantNodes(Model* model);
+
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_EXPORT_TENSORFLOW_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_EXPORT_TENSORFLOW_H_
diff --git a/tensorflow/contrib/lite/toco/format_port.h b/tensorflow/contrib/lite/toco/format_port.h
index 0e999001e0e35fb916b11db199dbf28572685f3d..eb81e90faf20133ed722185928f86ef45ac4f8f6 100644
--- a/tensorflow/contrib/lite/toco/format_port.h
+++ b/tensorflow/contrib/lite/toco/format_port.h
@@ -16,8 +16,8 @@ limitations under the License.
 // and util::format::AppendF. Unfortunately, type safety is not as good as a
 // a full C++ example.
 // TODO(aselle): When absl adds support for StrFormat, use that instead.
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_FORMAT_PORT_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_FORMAT_PORT_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_FORMAT_PORT_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_FORMAT_PORT_H_
 
 #include "tensorflow/contrib/lite/toco/toco_types.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@@ -74,4 +74,4 @@ inline string StringF(const char* fmt, Args&&... args) {
 }  // namespace port
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_FORMAT_PORT_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_FORMAT_PORT_H_
diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
index 7e152f5ba887088c98055596f8245b82fbc86eaa..372c52558973f4aacc180ac44b9e95a5e9b199ef 100644
--- a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
+++ b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
@@ -23,7 +23,7 @@ curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_
 bazel run --config=opt \
   //tensorflow/contrib/lite/toco:toco -- \
   --input_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
-  --output_file=/tmp/foo.lite \
+  --output_file=/tmp/foo.tflite \
   --input_format=TENSORFLOW_GRAPHDEF \
   --output_format=TFLITE \
   --inference_type=FLOAT \
@@ -101,7 +101,7 @@ direction, let us just give an example of that:
 ```
 bazel run --config=opt \
   //tensorflow/contrib/lite/toco:toco -- \
-  --input_file=/tmp/foo.lite \
+  --input_file=/tmp/foo.tflite \
   --output_file=/tmp/foo.pb \
   --input_format=TFLITE \
   --output_format=TENSORFLOW_GRAPHDEF \
@@ -130,7 +130,7 @@ flatbuffer is done like this:
 bazel run --config=opt \
   //tensorflow/contrib/lite/toco:toco -- \
   --input_file=/tmp/some_quantized_graph.pb \
-  --output_file=/tmp/foo.lite \
+  --output_file=/tmp/foo.tflite \
   --input_format=TENSORFLOW_GRAPHDEF \
   --output_format=TFLITE \
   --inference_type=QUANTIZED_UINT8 \
@@ -207,7 +207,7 @@ curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_
 bazel run --config=opt \
   //tensorflow/contrib/lite/toco:toco -- \
   --input_file=/tmp/inception_v1_2016_08_28_frozen.pb \
-  --output_file=/tmp/foo.lite \
+  --output_file=/tmp/foo.tflite \
   --input_format=TENSORFLOW_GRAPHDEF \
   --output_format=TFLITE \
   --inference_type=FLOAT \
@@ -235,7 +235,7 @@ curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_
 bazel run --config=opt \
   //tensorflow/contrib/lite/toco:toco -- \
   --input_file=/tmp/inception_v1_2016_08_28_frozen.pb \
-  --output_file=/tmp/foo.lite \
+  --output_file=/tmp/foo.tflite \
   --input_format=TENSORFLOW_GRAPHDEF \
   --output_format=TFLITE \
   --inference_type=FLOAT \
@@ -308,7 +308,7 @@ curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_
 bazel run --config=opt \
   //tensorflow/contrib/lite/toco:toco -- \
   --input_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
-  --output_file=/tmp/foo.lite \
+  --output_file=/tmp/foo.tflite \
   --input_format=TENSORFLOW_GRAPHDEF \
   --output_format=TFLITE \
   --inference_type=FLOAT \
@@ -415,7 +415,7 @@ curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_
 bazel run --config=opt \
   //tensorflow/contrib/lite/toco:toco -- \
   --input_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
-  --output_file=/tmp/foo.lite \
+  --output_file=/tmp/foo.tflite \
   --input_format=TENSORFLOW_GRAPHDEF \
   --output_format=TFLITE \
   --inference_type=FLOAT \
diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md b/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
index 4776741ab9273cf3b2ef0c63a6dbfdea5475b057..5e077952235fa1aac1e12403d3d83633a617ccb7 100644
--- a/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
+++ b/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
@@ -229,7 +229,7 @@ additional information about the multiple input arrays:
     well-formed quantized representation of these graphs. Such graphs should be
     fixed, but as a temporary work-around, setting this
     reorder_across_fake_quant flag allows the converter to perform necessary
-    graph transformaitons on them, at the cost of no longer faithfully matching
+    graph transformations on them, at the cost of no longer faithfully matching
     inference and training arithmetic.
 
 ### Logging flags
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_expanddims_to_reshape.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_expanddims_to_reshape.cc
new file mode 100644
index 0000000000000000000000000000000000000000..56f48d47de4e86ece76ceef1d09a25f50957a8dc
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_expanddims_to_reshape.cc
@@ -0,0 +1,101 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ConvertExpandDimsToReshape::Run(Model* model, std::size_t op_index) {
+  auto expand_it = model->operators.begin() + op_index;
+  if (expand_it->get()->type != OperatorType::kExpandDims) {
+    return false;
+  }
+  ExpandDimsOperator* expand_op =
+      static_cast<ExpandDimsOperator*>(expand_it->get());
+  CHECK_EQ(expand_op->inputs.size(), 2);
+  CHECK_EQ(expand_op->outputs.size(), 1);
+
+  const auto& input_array = model->GetArray(expand_op->inputs[0]);
+  if (!input_array.has_shape()) {
+    // Yield until input dims have been resolved.
+    return false;
+  }
+  if (input_array.shape().dimensions_count() == 0) {
+    // Input array cannot be 0-D.
+    // (Unsure if this is TF behavior, but was required to get a test to pass.)
+    return false;
+  }
+
+  const auto& axis_array = model->GetArray(expand_op->inputs[1]);
+  if (!axis_array.has_shape()) {
+    // Yield until input axis array shape has been resolved.
+    return false;
+  }
+  CHECK_EQ(RequiredBufferSizeForShape(axis_array.shape()), 1);
+  if (!axis_array.buffer) {
+    // Yield until the input axis array is constant
+    return false;
+  }
+  int axis = axis_array.GetBuffer<ArrayDataType::kInt32>().data[0];
+  std::vector<int> reshape_dims(input_array.shape().dims());
+  if (axis < 0) {
+    axis = reshape_dims.size();
+  }
+  reshape_dims.insert(reshape_dims.begin() + axis, 1);
+
+  // The input tensor has shape, and the axis input is constant. We can now
+  // replace ExpandDims with a Reshape.
+  auto* reshape_op = new TensorFlowReshapeOperator;
+
+  // Copy inputs
+  reshape_op->inputs.push_back(expand_op->inputs[0]);
+  reshape_op->outputs = expand_op->outputs;
+
+  // Create a new input array
+  string axis_array_name = expand_op->inputs[1];
+  string shape_array_name = toco::AvailableArrayName(*model, axis_array_name);
+  Array& shape_array = model->GetOrCreateArray(shape_array_name);
+  *(shape_array.mutable_shape()->mutable_dims()) = {
+      1, static_cast<int>(reshape_dims.size())};
+  reshape_op->inputs.push_back(shape_array_name);
+  shape_array.data_type = ArrayDataType::kInt32;
+  auto& shape_buffer = shape_array.GetMutableBuffer<ArrayDataType::kInt32>();
+  shape_buffer.data = reshape_dims;
+
+  // Delete axis array if unused
+  if (IsDiscardableArray(*model, axis_array_name) &&
+      CountOpsWithInput(*model, axis_array_name) == 1 &&
+      !GetOpWithOutput(*model, axis_array_name)) {
+    model->EraseArray(axis_array_name);
+  }
+
+  // Replace the operator in the graph.
+  const auto reshape_it = model->operators.emplace(expand_it, reshape_op);
+  expand_it = reshape_it + 1;
+  CHECK_EQ(expand_it->get(), expand_op);
+  model->operators.erase(expand_it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
index bf454c40c7b50d242d8a7e9eb6b7e579fb0da217..d38db85280d7bd935a47cda70227d383a513fbac 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
@@ -58,7 +58,7 @@ bool ConvertPureConvToDepthwise::Run(Model* model, std::size_t op_index) {
   depthwiseconv_op->outputs = {conv_op->outputs[0]};
   if (conv_op->outputs.size() > 1) {
     // delete the im2col array.
-    model->arrays.erase(conv_op->outputs[1]);
+    model->EraseArray(conv_op->outputs[1]);
   }
   depthwiseconv_op->fused_activation_function =
       conv_op->fused_activation_function;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_reorder_axes.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_reorder_axes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0d274fc687c8d42d47ddb5beb4f9c6f39b417097
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_reorder_axes.cc
@@ -0,0 +1,149 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+// Creates a Reshape operator from ReorderAxes operator.
+TensorFlowReshapeOperator* CreateReshapeFromReorderAxes(
+    Model* model, ReorderAxesOperator* reorder_op, const Shape& input_shape) {
+  auto* reshape_op = new TensorFlowReshapeOperator;
+
+  // Copy inputs and outputs to Reshape.
+  reshape_op->inputs.push_back(reorder_op->inputs[0]);
+  reshape_op->outputs = reorder_op->outputs;
+
+  // Create reshape dimensions based on input shape. Conversion from
+  // ReorderAxes to Reshape requires a 4D input shape.
+  CHECK_EQ(input_shape.dimensions_count(), 4);
+  std::vector<int> reshape_dims = {1, input_shape.dims(0), input_shape.dims(1),
+                                   input_shape.dims(3) * input_shape.dims(2)};
+
+  // Create a new input array for Reshape.
+  string reshape_array_name =
+      AvailableArrayName(*model, reshape_op->outputs[0]);
+  reshape_op->inputs.push_back(reshape_array_name);
+
+  Array& reshape_array = model->GetOrCreateArray(reshape_array_name);
+  *(reshape_array.mutable_shape()->mutable_dims()) = {
+      1, static_cast<int>(reshape_dims.size())};
+  reshape_array.data_type = ArrayDataType::kInt32;
+  auto& reshape_buffer =
+      reshape_array.GetMutableBuffer<ArrayDataType::kInt32>();
+  reshape_buffer.data = reshape_dims;
+
+  return reshape_op;
+}
+
+// Creates a Transpose operator from ReorderAxes operator.
+TransposeOperator* CreateTransposeFromReorderAxes(
+    Model* model, ReorderAxesOperator* reorder_op, const Shape& input_shape,
+    const AxesOrder& input_axes_order, const AxesOrder& output_axes_order) {
+  auto* transpose_op = new TransposeOperator;
+
+  // Copy inputs and outputs to Transpose.
+  transpose_op->inputs.push_back(reorder_op->inputs[0]);
+  transpose_op->outputs = reorder_op->outputs;
+
+  // Create permutations data based on input and output axes order.
+  std::vector<int> permutations_data;
+  GetShuffleShape(input_axes_order, output_axes_order, &permutations_data);
+
+  // Create a new input permutations array for Transpose.
+  string perm_array_name = AvailableArrayName(*model, transpose_op->outputs[0]);
+  transpose_op->inputs.push_back(perm_array_name);
+
+  Array& perm_array = model->GetOrCreateArray(perm_array_name);
+  *(perm_array.mutable_shape()->mutable_dims()) = {
+      static_cast<int>(permutations_data.size())};
+  perm_array.data_type = ArrayDataType::kInt32;
+  auto& perm_buffer = perm_array.GetMutableBuffer<ArrayDataType::kInt32>();
+  perm_buffer.data = permutations_data;
+
+  return transpose_op;
+}
+
+// Converts ReorderAxes into Transpose and Reshape which are compatible with the
+// TFLite interpreter.
+bool ConvertReorderAxes::Run(Model* model, std::size_t op_index) {
+  auto reorder_it = model->operators.begin() + op_index;
+  if (reorder_it->get()->type != OperatorType::kReorderAxes) return false;
+
+  auto* reorder_op = static_cast<ReorderAxesOperator*>(reorder_it->get());
+  CHECK_EQ(reorder_op->inputs.size(), 1);
+  CHECK_EQ(reorder_op->outputs.size(), 1);
+
+  const auto& input_array_name = reorder_op->inputs[0];
+  const auto& output_array_name = reorder_op->outputs[0];
+  auto& input_array = model->GetArray(input_array_name);
+  auto& output_array = model->GetArray(output_array_name);
+
+  // Get input array. If kFakeQuant is the input into ReorderAxes, get the input
+  // array passed into kFakeQuant. kFakeQuant op is dropped when possible.
+  string constant_input_array_name = input_array_name;
+  if (!input_array.buffer) {
+    const auto* op_producing_input = GetOpWithOutput(*model, input_array_name);
+    if (op_producing_input &&
+        op_producing_input->type == OperatorType::kFakeQuant) {
+      constant_input_array_name = op_producing_input->inputs[0];
+    }
+  }
+
+  // Yield if input array contains constants or if output array size has not
+  // been adjusted to reflect the permutations in ReorderAxes. ReorderAxes will
+  // be merged into a constant array when possible.
+  if (IsConstantParameterArray(*model, constant_input_array_name)) return false;
+  if (!output_array.has_shape()) return false;
+
+  const auto input_axes_order = reorder_op->input_axes_order;
+  const auto output_axes_order = reorder_op->output_axes_order;
+  const Shape input_shape = input_array.shape();
+
+  // Creates a Reshape or Transpose operator depending on the conversion.
+  if (input_axes_order == AxesOrder::kHWIM &&
+      output_axes_order == AxesOrder::k1HWO) {
+    // Add Reshape operator into the graph. This special case is not just a
+    // permutation. The input dimensions get merged into 3 dimensions while the
+    // order of the elements does not change.
+    auto* reshape_op =
+        CreateReshapeFromReorderAxes(model, reorder_op, input_shape);
+    const auto reshape_it = model->operators.emplace(reorder_it, reshape_op);
+    reorder_it = reshape_it + 1;
+  } else {
+    // Add Transpose operator into the graph.
+    auto* transpose_op = CreateTransposeFromReorderAxes(
+        model, reorder_op, input_shape, input_axes_order, output_axes_order);
+    const auto transpose_it =
+        model->operators.emplace(reorder_it, transpose_op);
+    reorder_it = transpose_it + 1;
+  }
+
+  // Remove ReorderAxes operator from the graph.
+  CHECK_EQ(reorder_it->get(), reorder_op);
+  model->operators.erase(reorder_it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_addn_to_add.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_addn_to_add.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dcaaddbf3b5409f0fc3ddaf32e23b1e5eefb6565
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_addn_to_add.cc
@@ -0,0 +1,51 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+// This pass will convert an AddN operator with only 2 inputs into a regular Add
+// operator, to which more optimizations may apply.
+bool ConvertTrivialAddNToAdd::Run(Model* model, std::size_t op_index) {
+  auto addn_it = model->operators.begin() + op_index;
+  if (addn_it->get()->type != OperatorType::kAddN) {
+    return false;
+  }
+  AddNOperator* addn_op = static_cast<AddNOperator*>(addn_it->get());
+  CHECK_GE(addn_op->inputs.size(), 2);
+  CHECK_EQ(addn_op->outputs.size(), 1);
+
+  // We only reduce AddN with N=2 to a regular Add.
+  if (addn_op->inputs.size() != 2) {
+    return false;
+  }
+
+  // Copy inputs & outputs to regular Add.
+  auto* add_op = new AddOperator;
+  add_op->inputs.push_back(addn_op->inputs[0]);
+  add_op->inputs.push_back(addn_op->inputs[1]);
+  add_op->outputs = addn_op->outputs;
+
+  // Replace the AddN operator in the graph.
+  const auto add_it = model->operators.emplace(addn_it, add_op);
+  addn_it = add_it + 1;
+  CHECK_EQ(addn_it->get(), addn_op);
+  model->operators.erase(addn_it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_stack_to_reshape.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_stack_to_reshape.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0615b5e6c6db910ee847188427b416fd812aa141
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_stack_to_reshape.cc
@@ -0,0 +1,81 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ConvertTrivialStackToReshape::Run(Model* model, std::size_t op_index) {
+  auto stack_it = model->operators.begin() + op_index;
+  if (stack_it->get()->type != OperatorType::kStack) {
+    return false;
+  }
+  auto* stack_op = static_cast<StackOperator*>(stack_it->get());
+  if (stack_op->inputs.size() > 1) {
+    // Not trivial.
+    return false;
+  }
+  CHECK_EQ(stack_op->outputs.size(), 1);
+
+  const auto& input_array = model->GetArray(stack_op->inputs[0]);
+  if (!input_array.has_shape()) {
+    // Yield until input dims have been resolved.
+    return false;
+  }
+  if (input_array.shape().dimensions_count() == 0) {
+    // Input array cannot be 0-D.
+    // (Unsure if this is TF behavior, but was required to get a test to pass.)
+    return false;
+  }
+
+  AddMessageF("Converting trivial %s to a reshape", LogName(*stack_op));
+
+  // Note that we could convert to ExpandDims but toco prefers reshapes.
+  auto* reshape_op = new TensorFlowReshapeOperator;
+  reshape_op->inputs = {stack_op->inputs[0]};
+  reshape_op->outputs = stack_op->outputs;
+
+  // Create shape param.
+  string shape_array_name =
+      AvailableArrayName(*model, stack_op->outputs[0] + "_shape");
+  Array& shape_array = model->GetOrCreateArray(shape_array_name);
+  *(shape_array.mutable_shape()->mutable_dims()) = {
+      1 + input_array.shape().dimensions_count()};
+  reshape_op->inputs.push_back(shape_array_name);
+  shape_array.data_type = ArrayDataType::kInt32;
+  auto& shape_buffer = shape_array.GetMutableBuffer<ArrayDataType::kInt32>();
+  shape_buffer.data.push_back(1);
+  for (int dim : input_array.shape().dims()) {
+    shape_buffer.data.push_back(dim);
+  }
+
+  // Replace the operator in the graph.
+  const auto reshape_it = model->operators.emplace(stack_it, reshape_op);
+  stack_it = reshape_it + 1;
+  CHECK_EQ(stack_it->get(), stack_op);
+  model->operators.erase(stack_it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c2b166033c33b777bad88cb712adf8517be1762a
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc
@@ -0,0 +1,85 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ConvertTrivialTransposeToReshape::Run(Model* model, std::size_t op_index) {
+  auto transpose_it = model->operators.begin() + op_index;
+  if (transpose_it->get()->type != OperatorType::kTranspose) {
+    return false;
+  }
+  TransposeOperator* transpose_op =
+      static_cast<TransposeOperator*>(transpose_it->get());
+
+  const auto& output_array = model->GetArray(transpose_op->outputs[0]);
+  if (!output_array.has_shape()) {
+    // Yield until PropagateFixedSizes has been run on this op.
+    return false;
+  }
+  // Note: We can assume we have error checked inputs in PropagateFixedSizes.
+
+  // This transpose is trivial if we only have one non-unitary dimension.
+  std::vector<int> const& dims = output_array.shape().dims();
+  unsigned non_unitary_axis_count = 0;
+  for (int i = 0; i < dims.size(); i++) {
+    if (dims[i] != 1) {
+      non_unitary_axis_count++;
+    }
+  }
+  if (non_unitary_axis_count > 1) {
+    // Transpose is not trivial
+    return false;
+  }
+
+  // This transpose is trivial. Replace it with a Reshape op.
+  auto* reshape_op = new TensorFlowReshapeOperator;
+
+  // Copy input and output
+  reshape_op->inputs.push_back(transpose_op->inputs[0]);
+  reshape_op->outputs = transpose_op->outputs;
+
+  // Create a new input array for the shape input
+  string perm_array_name = transpose_op->inputs[1];
+  string shape_array_name = toco::AvailableArrayName(*model, perm_array_name);
+  Array& shape_array = model->GetOrCreateArray(shape_array_name);
+  *(shape_array.mutable_shape()->mutable_dims()) = {
+      1, static_cast<int>(dims.size())};
+  reshape_op->inputs.push_back(shape_array_name);
+  shape_array.data_type = ArrayDataType::kInt32;
+  auto& shape_buffer = shape_array.GetMutableBuffer<ArrayDataType::kInt32>();
+  shape_buffer.data = dims;
+
+  // Delete perm array if unused
+  if (IsDiscardableArray(*model, perm_array_name) &&
+      CountOpsWithInput(*model, perm_array_name) == 1) {
+    model->EraseArray(perm_array_name);
+  }
+
+  // Replace the operator in the graph.
+  const auto reshape_it = model->operators.emplace(transpose_it, reshape_op);
+  transpose_it = reshape_it + 1;
+  CHECK_EQ(transpose_it->get(), transpose_op);
+  model->operators.erase(transpose_it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/create_im2col_arrays.cc b/tensorflow/contrib/lite/toco/graph_transformations/create_im2col_arrays.cc
index 1735b51e5b6ca517bad62bf55f0cc9f0c21ac440..076415ece8c1039caa32e947fe54ab3e101bec9e 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/create_im2col_arrays.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/create_im2col_arrays.cc
@@ -35,7 +35,7 @@ bool CreateIm2colArrays::Run(Model* model, std::size_t op_index) {
     // We already have an im2col array
     return false;
   }
-  const auto& weights_array = *model->arrays[conv_op->inputs[1]];
+  const auto& weights_array = model->GetArray(conv_op->inputs[1]);
   if (!weights_array.has_shape()) {
     // We need to yield until weights dims have been resolved, because
     // from the weights dims we determine whether an im2col array is
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/dequantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/dequantize.cc
index b89e3f5310cd7364294ad875cfcdf9c14660366b..498c864bde6d656c8318e981204cb42cb3a4d03f 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/dequantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/dequantize.cc
@@ -53,7 +53,7 @@ std::vector<std::unique_ptr<Operator>>::iterator FindFirstOpWithInput(
 }
 
 void ClearArrayQuantizationParams(const string& array_name, Model* model) {
-  auto* array = model->arrays.at(array_name).get();
+  auto* array = &model->GetArray(array_name);
   CHECK(array->quantization_params);
   for (auto& input_array : *model->flags.mutable_input_arrays()) {
     if (input_array.name() == array_name) {
@@ -77,7 +77,7 @@ void ClearArrayQuantizationParams(const string& array_name, Model* model) {
 
 bool DequantizeArray(const string& array_name,
                      GraphTransformation* transformation, Model* model) {
-  auto* array = model->arrays.at(array_name).get();
+  auto* array = &model->GetArray(array_name);
   if (!array->quantization_params) {
     return false;
   }
@@ -214,7 +214,9 @@ bool Dequantize::Run(Model* model, std::size_t op_index) {
   }
   bool changed = false;
   for (const string& array : arrays) {
-    changed |= DequantizeArray(array, this, model);
+    if (!model->IsOptionalArray(array)) {
+      changed |= DequantizeArray(array, this, model);
+    }
   }
 
   return changed;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/drop_fake_quant.cc b/tensorflow/contrib/lite/toco/graph_transformations/drop_fake_quant.cc
index fea360740f4e645e1f00eaed42cbff48f430fe2a..95558ef5ece9a78825daf0203e2f6f6fee6f3cda 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/drop_fake_quant.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/drop_fake_quant.cc
@@ -45,7 +45,7 @@ bool DropFakeQuant::Run(Model* model, std::size_t op_index) {
   // Drop min/max inputs
   for (int i = 1; i < fakequant_op->inputs.size(); i++) {
     if (CountOpsWithInput(*model, fakequant_op->inputs[i]) == 1) {
-      model->arrays.erase(fakequant_op->inputs[i]);
+      model->EraseArray(fakequant_op->inputs[i]);
     }
   }
   fakequant_op->inputs.resize(1);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/drop_im2col_arrays.cc b/tensorflow/contrib/lite/toco/graph_transformations/drop_im2col_arrays.cc
index a3ed6663bcc80c5fc642a399b1e5c0cf3336973a..f7fd878b7e8b1c834125130ea2a778cecefd3de0 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/drop_im2col_arrays.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/drop_im2col_arrays.cc
@@ -32,7 +32,7 @@ bool DropIm2colArrays::Run(Model* model, std::size_t op_index) {
 
   // Drop the im2col array.
   CHECK_EQ(conv_op->outputs.size(), 2);
-  model->arrays.erase(conv_op->outputs[1]);
+  model->EraseArray(conv_op->outputs[1]);
   conv_op->outputs.resize(1);
   AddMessageF("Dropped an im2col array for %s", LogName(*conv_op));
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc b/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
index d129b5ecf2615434b8ff8387a04af9561fe617a4..ab943f72d1dd87ae9ff4bd53a807cd4923a88c38 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
@@ -68,11 +68,7 @@ bool FuseActivationFunctions::Run(Model* model, std::size_t op_index) {
     return false;
   }
 
-  // TODO(dkalenichenko): Great many ops don't support activation function
-  // fusing. Switch to the whilelist approach instead.
-  if (op->type == OperatorType::kConcatenation ||
-      op->type == OperatorType::kSlice ||
-      op->type == OperatorType::kTensorFlowSplit) {
+  if (!OperatorSupportsFusedActivation(op->type)) {
     AddMessageF(
         "Not fusing activation function because the %s op doesn't support it",
         LogName(*op));
@@ -90,7 +86,7 @@ bool FuseActivationFunctions::Run(Model* model, std::size_t op_index) {
   } else {
     LOG(FATAL) << "Unhandled activation function type";
   }
-  model->arrays.erase(ac_op->inputs[0]);
+  model->EraseArray(ac_op->inputs[0]);
   op->outputs[0] = ac_op->outputs[0];
   model->operators.erase(ac_it);
   return true;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc b/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
index 4619d8bbee2e52483a523277f421de5bfa155635..dcbbead517f26a227363989b5af2a4040c98ff57 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
@@ -285,13 +285,13 @@ bool FuseBinaryIntoFollowingAffine::Run(Model* model, std::size_t op_index) {
   AddMessageF("Fusing %s into the following %s", LogName(*binary_op),
               LogName(*following_op));
 
-  model->arrays.erase(binary_op->outputs[0]);
+  model->EraseArray(binary_op->outputs[0]);
   following_op->inputs[0] = binary_op->inputs[index_of_variable_input];
   const auto& old_constant_param_name =
       binary_op->inputs[index_of_constant_input];
   CHECK(IsConstantParameterArray(*model, old_constant_param_name));
   if (CountOpsWithInput(*model, old_constant_param_name) == 1) {
-    model->arrays.erase(old_constant_param_name);
+    model->EraseArray(old_constant_param_name);
   }
   model->operators.erase(binary_it);
   return true;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc b/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
index 8948653ec38f5a5a6e92cfe9e6bafdbf1aa9a962..5b57178b18d2d60e1f301a1a8b257d8057618550 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
@@ -309,7 +309,7 @@ bool FuseBinaryIntoPrecedingAffine::Run(Model* model, std::size_t op_index) {
     LOG(FATAL) << "should not get here";
   }
 
-  model->arrays.erase(preceding_op->outputs[0]);
+  model->EraseArray(preceding_op->outputs[0]);
   preceding_op->outputs[0] = binary_op->outputs[0];
   preceding_op->fused_activation_function =
       binary_op->fused_activation_function;
@@ -317,7 +317,7 @@ bool FuseBinaryIntoPrecedingAffine::Run(Model* model, std::size_t op_index) {
       binary_op->inputs[index_of_constant_input];
   CHECK(IsConstantParameterArray(*model, old_constant_param_name));
   if (CountOpsWithInput(*model, old_constant_param_name) == 1) {
-    model->arrays.erase(old_constant_param_name);
+    model->EraseArray(old_constant_param_name);
   }
   model->operators.erase(binary_it);
   return true;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc
index 323fec6cf864a798a02aecdbbbf7c2e7bb904d2b..6961e23690a5e53643f2b2c52bb62ce395d05c95 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc
@@ -31,21 +31,110 @@ namespace {
 
 void PrintModelStats(const string& label, const Model& model) {
   int quantized_arrays = 0;
-  for (const auto& array : model.arrays) {
+  for (const auto& array : model.GetArrayMap()) {
     if (array.second->quantization_params) {
       quantized_arrays++;
     }
   }
   LOG(INFO) << label << ": " << model.operators.size() << " operators, "
-            << model.arrays.size() << " arrays (" << quantized_arrays
+            << model.GetArrayMap().size() << " arrays (" << quantized_arrays
             << " quantized)";
 }
 
+// Some graphs have RNN back-edges that are discardable, having been
+// created typically by TensorFlow import rather than specified by the user.
+// Such graphs might have cycles (closed by RNN back-edges) that may be pruned.
+// Local graph transformations can't identify such global features,
+// so this function performs this global transformation.
+//
+// The other (and related) thing that is peculiar about RNN back-edges
+// is that they do not prevent the arrays that they touch, from being
+// pruned. Thus, they may refer to array names which no longer exist.
+// The intent is for that to result in the eventual pruning of such
+// 'dangling' RNN back-edges. We perform this pruning at the end of this
+// function, as the pruning of connected components done here may leave
+// more RNN back-edges dangling.
+void DiscardUselessConnectedComponentsAndRNNBackEdges(Model* model) {
+  // Identify the set of arrays that are in 'useful' connected components
+  // of the graph, which means connected to output arrays.
+  std::unordered_set<string> useful_arrays;
+  for (const string& output_array : model->flags.output_arrays()) {
+    useful_arrays.insert(output_array);
+  }
+  bool found_new_useful_arrays;
+  do {
+    found_new_useful_arrays = false;
+    for (const auto& op : model->operators) {
+      bool op_touches_useful_arrays = false;
+      for (const string& output : op->outputs) {
+        op_touches_useful_arrays |= useful_arrays.count(output);
+      }
+      if (op_touches_useful_arrays) {
+        for (const string& input : op->inputs) {
+          found_new_useful_arrays |= !useful_arrays.count(input);
+          useful_arrays.insert(input);
+        }
+        for (const string& output : op->outputs) {
+          found_new_useful_arrays |= !useful_arrays.count(output);
+          useful_arrays.insert(output);
+        }
+      }
+    }
+    for (const auto& rnn_state : model->flags.rnn_states()) {
+      bool rnn_back_edge_touches_useful_arrays =
+          useful_arrays.count(rnn_state.state_array());
+      if (rnn_back_edge_touches_useful_arrays) {
+        found_new_useful_arrays |=
+            !useful_arrays.count(rnn_state.back_edge_source_array());
+        useful_arrays.insert(rnn_state.back_edge_source_array());
+      }
+    }
+  } while (found_new_useful_arrays);
+  // Erase arrays that aren't useful, and that are discardable.
+  model->EraseArrays([&](const string& name) {
+    return (!useful_arrays.count(name) && IsDiscardableArray(*model, name));
+  });
+  // Erase operators that do not produce a useful output array.
+  for (auto it = model->operators.begin(); it != model->operators.end();) {
+    // Only need to test the first output, as we simultaneously added all of
+    // an operator's outputs to the list of output arrays.
+    if (useful_arrays.count((*it)->outputs[0])) {
+      ++it;
+    } else {
+      for (const string& output : (*it)->outputs) {
+        CHECK(!useful_arrays.count(output));
+      }
+      it = model->operators.erase(it);
+    }
+  }
+  // Erase RNN back-edges that are 'dangling' i.e. that touch an array
+  // that no longer exists. This should only happen for discardable RNN
+  // back-edges.
+  std::vector<RnnState> rnn_states_to_keep;
+  for (const auto& rnn_state : model->flags.rnn_states()) {
+    const bool dangling =
+        !model->HasArray(rnn_state.back_edge_source_array()) ||
+        !model->HasArray(rnn_state.state_array());
+    if (dangling) {
+      CHECK(rnn_state.discardable());
+    } else {
+      rnn_states_to_keep.push_back(rnn_state);
+    }
+  }
+  model->flags.clear_rnn_states();
+  for (const auto& rnn_state : rnn_states_to_keep) {
+    *model->flags.add_rnn_states() = rnn_state;
+  }
+}
+
 bool GraphTransformationsPass(int increment, Model* model,
                               const GraphTransformationsSet& transformations) {
   CHECK(increment == 1 || increment == -1);
   bool changed = false;
-  CHECK(!model->operators.empty());
+  if (model->operators.empty()) {
+    LOG(INFO) << "Model is empty!!!";
+    return false;
+  }
   int op_index = increment == 1 ? 0 : model->operators.size() - 1;
   while (true) {
     bool changed_now = false;
@@ -54,23 +143,28 @@ bool GraphTransformationsPass(int increment, Model* model,
       CHECK(!changed_now);
       CHECK(transformation->Messages().empty());
       changed_now = transformation->Run(model, op_index);
-      if (changed_now) {
-        DumpGraphvizVideoFrame(*model);
-        CHECK(!model->operators.empty());
-        op_index = std::min<int>(op_index, model->operators.size() - 1);
-        // Uncomment for debugging
-        // CheckInvariants(*model);
-      }
       const char* made_a_change_msg =
           changed_now ? "made a change" : "did NOT make a change";
       const int log_level =
           changed_now ? kLogLevelModelChanged : kLogLevelModelUnchanged;
+      if (transformation->Messages().empty()) {
+        VLOG(log_level) << transformation->Name() << " " << made_a_change_msg
+                        << " at op_index=" << op_index << "/"
+                        << model->operators.size() - 1;
+      }
       for (const string& message : transformation->Messages()) {
         VLOG(log_level) << transformation->Name() << " " << made_a_change_msg
                         << " at op_index=" << op_index << "/"
                         << model->operators.size() - 1 << ": " << message;
       }
       transformation->ClearMessages();
+      if (changed_now) {
+        DumpGraphvizVideoFrame(*model);
+        if (model->operators.empty()) return true;
+        op_index = std::min<int>(op_index, model->operators.size() - 1);
+        // Uncomment for debugging
+        // CheckInvariants(*model);
+      }
       if (changed_now) {
         break;
       }
@@ -86,6 +180,7 @@ bool GraphTransformationsPass(int increment, Model* model,
       op_index += increment;
     }
   }
+  DiscardUselessConnectedComponentsAndRNNBackEdges(model);
   return changed;
 }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 9ad1b9622fd4374d10bd83fdded2fcd7795ca47d..3ab01ae643b26cfe0c7ce30472f693794326b9b3 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_GRAPH_TRANSFORMATIONS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_GRAPH_TRANSFORMATIONS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_GRAPH_TRANSFORMATIONS_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_GRAPH_TRANSFORMATIONS_H_
 
 #include <cstddef>
 #include <initializer_list>
@@ -112,7 +112,12 @@ void RunGraphTransformations(Model* model, const string& message,
   };
 
 // List of all graph transformations
+DECLARE_GRAPH_TRANSFORMATION(ConvertExpandDimsToReshape)
 DECLARE_GRAPH_TRANSFORMATION(ConvertPureConvToDepthwise)
+DECLARE_GRAPH_TRANSFORMATION(ConvertTrivialAddNToAdd)
+DECLARE_GRAPH_TRANSFORMATION(ConvertTrivialStackToReshape)
+DECLARE_GRAPH_TRANSFORMATION(ConvertTrivialTransposeToReshape)
+DECLARE_GRAPH_TRANSFORMATION(ConvertReorderAxes)
 DECLARE_GRAPH_TRANSFORMATION(EnsureBiasVectors)
 DECLARE_GRAPH_TRANSFORMATION(FuseActivationFunctions)
 DECLARE_GRAPH_TRANSFORMATION(FuseBinaryIntoFollowingAffine)
@@ -120,6 +125,8 @@ DECLARE_GRAPH_TRANSFORMATION(FuseBinaryIntoPrecedingAffine)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyL2Normalization)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyL2Pool)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyLstmCell)
+DECLARE_GRAPH_TRANSFORMATION(SplitLstmCellInputs)
+DECLARE_GRAPH_TRANSFORMATION(MergeLstmCellInputs)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyRelu1)
 DECLARE_GRAPH_TRANSFORMATION(MakeInitialDequantizeOperator)
 DECLARE_GRAPH_TRANSFORMATION(PropagateArrayDataTypes)
@@ -132,6 +139,7 @@ DECLARE_GRAPH_TRANSFORMATION(RemoveTensorFlowIdentity)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialBinaryOperator)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialConcatenation)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialConcatenationInput)
+DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialSlice)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialQuantizedActivationFunc)
 DECLARE_GRAPH_TRANSFORMATION(RemoveUnusedOp)
 DECLARE_GRAPH_TRANSFORMATION(ResolveBatchNormalization)
@@ -140,22 +148,32 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveConstantUnaryOperator)
 DECLARE_GRAPH_TRANSFORMATION(CreateIm2colArrays)
 DECLARE_GRAPH_TRANSFORMATION(DropIm2colArrays)
 DECLARE_GRAPH_TRANSFORMATION(ReadFakeQuantMinMax)
+DECLARE_GRAPH_TRANSFORMATION(ReorderActivationFunctions)
 DECLARE_GRAPH_TRANSFORMATION(ResolveReorderAxes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowConcat)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowMatMul)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowMerge)
-DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowSqueeze)
+DECLARE_GRAPH_TRANSFORMATION(ResolveSqueezeAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowSwitch)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowTile)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantFakeQuant)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantConcatenation)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantTranspose)
 DECLARE_GRAPH_TRANSFORMATION(DropFakeQuant)
 DECLARE_GRAPH_TRANSFORMATION(UnfuseActivationFunctions)
+DECLARE_GRAPH_TRANSFORMATION(UnrollBatchMatMul)
+DECLARE_GRAPH_TRANSFORMATION(ResolveSpaceToBatchNDAttributes)
+DECLARE_GRAPH_TRANSFORMATION(ResolveBatchToSpaceNDAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolvePadAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveStridedSliceAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveSliceAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveMeanAttributes)
-DECLARE_GRAPH_TRANSFORMATION(ResolveConstantTensorFlowShape)
+DECLARE_GRAPH_TRANSFORMATION(ResolveTransposeAttributes)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantRange)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantShapeOrRank)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantStack)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantStridedSlice)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantFill)
 DECLARE_GRAPH_TRANSFORMATION(Dequantize)
 
 class ResolveReshapeAttributes : public GraphTransformation {
@@ -183,4 +201,4 @@ class RemoveTrivialReshape : public GraphTransformation {
 
 }  // end namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_GRAPH_TRANSFORMATIONS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_GRAPH_TRANSFORMATIONS_H_
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
index 9cb26c8752c0d27a3d1138b9ad32e60f34177520..1b0be858107b54f5a6ecd2a1cb87c9dbde1c06bb 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
@@ -143,7 +143,7 @@ bool HardcodeMinMaxForAverageOrMaxPool(Model* model, Operator* op) {
   return true;
 }
 
-bool HardcodeMinMaxForReshapeOrSqueeze(Model* model, Operator* op) {
+bool HardcodeMinMaxFromFirstInput(Model* model, Operator* op) {
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.minmax) {
     return false;
@@ -177,6 +177,106 @@ bool HardcodeMinMaxForOutput(Model* model, Operator* op, double min,
   output_minmax.max = max;
   return true;
 }
+
+// Propagates MinMax from any of the listed arrays, to all others.
+// If multiple of these arrays have MinMax, then these are required
+// to agree with each other.
+bool PropagateMinMaxAmongArrays(Model* model,
+                                const std::vector<string> array_names) {
+  string reference_array_name;
+  MinMax* reference_minmax = nullptr;
+  for (const string& array_name : array_names) {
+    if (model->GetArray(array_name).minmax) {
+      reference_array_name = array_name;
+      reference_minmax = model->GetArray(array_name).minmax.get();
+      break;
+    }
+  }
+  // No MinMax info is available to propagate.
+  if (!reference_minmax) {
+    return false;
+  }
+  bool changed = false;
+  for (const string& array_name : array_names) {
+    auto& array = model->GetArray(array_name);
+    if (array.minmax) {
+      CHECK(*array.minmax == *reference_minmax)
+          << "Both the following arrays have minmax, and they disagree: "
+          << reference_array_name << " and " << array_name
+          << ". Expected that either only one of them would have minmax, or at "
+             "least that they would agree.";
+    } else {
+      array.GetOrCreateMinMax() = *reference_minmax;
+      changed = true;
+    }
+  }
+  return changed;
+}
+
+bool HardcodeMinMaxForLstmCell(Model* model, Operator* op) {
+  CHECK_EQ(op->inputs.size(), LstmCellOperator::NUM_INPUTS);
+  CHECK_EQ(op->outputs.size(), LstmCellOperator::NUM_OUTPUTS);
+
+  bool changed = false;
+  changed |= PropagateMinMaxAmongArrays(
+      model, {op->inputs[LstmCellOperator::PREV_STATE_INPUT],
+              op->outputs[LstmCellOperator::STATE_OUTPUT]});
+
+  auto& input_activations =
+      model->GetArray(op->inputs[LstmCellOperator::DATA_INPUT]);
+  if (!input_activations.minmax) {
+    auto& minmax = input_activations.GetOrCreateMinMax();
+    minmax.min = -1;
+    minmax.max = 127. / 128.;
+    changed = true;
+  }
+
+  auto& prev_output_activations =
+      model->GetArray(op->inputs[LstmCellOperator::PREV_ACTIV_INPUT]);
+  if (!prev_output_activations.minmax) {
+    auto& minmax = prev_output_activations.GetOrCreateMinMax();
+    minmax.min = -1;
+    minmax.max = 127. / 128.;
+    changed = true;
+  }
+
+  auto& output_concat_temp =
+      model->GetArray(op->outputs[LstmCellOperator::CONCAT_TEMP]);
+  if (!output_concat_temp.minmax) {
+    auto& minmax = output_concat_temp.GetOrCreateMinMax();
+    minmax.min = -1;
+    minmax.max = 127. / 128.;
+    changed = true;
+  }
+
+  auto& output_activations =
+      model->GetArray(op->outputs[LstmCellOperator::ACTIV_OUTPUT]);
+  if (!output_activations.minmax) {
+    auto& minmax = output_activations.GetOrCreateMinMax();
+    minmax.min = -1;
+    minmax.max = 127. / 128.;
+    changed = true;
+  }
+
+  // (This comment should morph into proper documentation for
+  // quantization of LSTM models. It isn't just a local implementation detail,
+  // the training code for LSTM models needs to be adjusted to that.)
+  //
+  // Finally, output_activations_temp holds the output of the fully-connected
+  // node inside the LSTM cell. For it, we hardcode a minmax of [-8, 8].
+  // The rationale for that is given in a lengthy comment on the LstmCell
+  // quantized runtime implementation in reference_ops.h.
+  auto& output_activations_temp =
+      model->GetArray(op->outputs[LstmCellOperator::ACTIV_TEMP]);
+  if (!output_activations_temp.minmax) {
+    auto& minmax = output_activations_temp.GetOrCreateMinMax();
+    minmax.min = -8;
+    minmax.max = 8 * 32767. / 32768.;
+    changed = true;
+  }
+
+  return changed;
+}
 }  // namespace
 
 bool HardcodeMinMax::Run(Model* model, std::size_t op_index) {
@@ -203,7 +303,8 @@ bool HardcodeMinMax::Run(Model* model, std::size_t op_index) {
 
     case OperatorType::kSqueeze:
     case OperatorType::kTensorFlowReshape:
-      changed = HardcodeMinMaxForReshapeOrSqueeze(model, op);
+    case OperatorType::kPad:
+      changed = HardcodeMinMaxFromFirstInput(model, op);
       break;
 
     case OperatorType::kLogistic:
@@ -218,6 +319,16 @@ bool HardcodeMinMax::Run(Model* model, std::size_t op_index) {
       changed = HardcodeMinMaxForOutput(model, op, 0, 255. / 256.);
       break;
 
+    case OperatorType::kTanh:
+      // We hardcode quantization_params to: zero_point=127, scale=1/128.
+      // This choice of minmax is the one that is equivalent to that.
+      changed = HardcodeMinMaxForOutput(model, op, -127. / 128., 1.0);
+      break;
+
+    case OperatorType::kLstmCell:
+      changed = HardcodeMinMaxForLstmCell(model, op);
+      break;
+
     default:
       break;
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_normalization.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_normalization.cc
index 01b75e37c691d48fabf8832af04543be3f5eb3bc..419a0776a6b987a18df059d3c1d4bf4370cd24d8 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_normalization.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_normalization.cc
@@ -150,19 +150,19 @@ bool IdentifyL2Normalization::Run(Model* model, std::size_t op_index) {
 
   // Erase the subgraph that is now replaced by L2Normalization
   model->operators.erase(FindOperator(model, square_op));
-  model->arrays.erase(sum_op->inputs[0]);
+  model->EraseArray(sum_op->inputs[0]);
   if (sum_op->inputs.size() > 1) {
-    model->arrays.erase(sum_op->inputs[1]);
+    model->EraseArray(sum_op->inputs[1]);
   }
   model->operators.erase(FindOperator(model, sum_op));
   if (add_op) {
-    model->arrays.erase(add_op->inputs[0]);
-    model->arrays.erase(add_op->inputs[1]);
+    model->EraseArray(add_op->inputs[0]);
+    model->EraseArray(add_op->inputs[1]);
     model->operators.erase(FindOperator(model, add_op));
   }
-  model->arrays.erase(sqrt_or_rsqrt_op->inputs[0]);
+  model->EraseArray(sqrt_or_rsqrt_op->inputs[0]);
   model->operators.erase(FindOperator(model, sqrt_or_rsqrt_op));
-  model->arrays.erase(div_or_mul_op->inputs[1]);
+  model->EraseArray(div_or_mul_op->inputs[1]);
   model->operators.erase(FindOperator(model, div_or_mul_op));
   return true;
 }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_pool.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_pool.cc
index 1865416fc2226d663dfd51a5c0a0e2129caf485c..e4d52476c649de53b3ab663f53ce7a5538dbb5ab 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_pool.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_pool.cc
@@ -92,8 +92,8 @@ bool IdentifyL2Pool::Run(Model* model, std::size_t op_index) {
   AddMessageF("Creating %s replacing equivalent subgraph", LogName(*l2pool_op));
 
   // Erase intermediate arrays, keeping input to square op.
-  model->arrays.erase(avpool_op->inputs[0]);
-  model->arrays.erase(sqrt_op->inputs[0]);
+  model->EraseArray(avpool_op->inputs[0]);
+  model->EraseArray(sqrt_op->inputs[0]);
 
   // Erase three operators being replaced.
   model->operators.erase(FindOperator(model, square_op));
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
index 082820fddcf137238867239bbc4d4eed8158e307..c363b93394f0af7bcfc37c1e8be5f98aca6667ae 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "absl/strings/string_view.h"
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
@@ -202,23 +201,6 @@ bool MatchOperatorInputs(const Operator& op, const Model& model,
   return true;
 }
 
-absl::string_view FindLongestCommonPrefix(absl::string_view a,
-                                          absl::string_view b) {
-  if (a.empty() || b.empty()) return absl::string_view();
-
-  const char* pa = a.data();
-  const char* pb = b.data();
-  size_t count = 0;
-  const ssize_t limit = std::min(a.size(), b.size());
-  while (count < limit && *pa == *pb) {
-    ++pa;
-    ++pb;
-    ++count;
-  }
-
-  return absl::string_view(a.data(), count);
-}
-
 }  // namespace
 
 bool IdentifyLstmCell::Run(Model* model, std::size_t op_index) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_merge_inputs.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_merge_inputs.cc
new file mode 100644
index 0000000000000000000000000000000000000000..45335fd78c99a577d535770d78acf4fcd6c04531
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_merge_inputs.cc
@@ -0,0 +1,185 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "absl/memory/memory.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+
+namespace toco {
+
+bool MergeLstmCellInputs::Run(Model* model, std::size_t op_index) {
+  // Find lstm cell.
+  auto op_it = model->operators.begin() + op_index;
+  auto src_op = op_it->get();
+  if (src_op->type != OperatorType::kLstmCell) {
+    return false;
+  }
+
+  // Already a compact LstmCell with LstmCellOperator::NUM_INPUTS of inputs,
+  // do not need to merge cell inputs.
+  if (src_op->inputs.size() == LstmCellOperator::NUM_INPUTS) {
+    return false;
+  }
+
+  // Identify prev_activ_input, prev_state_input as required Op inputs,
+  // using the rnn_states in the model flag.
+  string prev_activ_input;
+  if (!GetMatchingRnnArray(model, src_op->outputs[kOutputTensor],
+                           &prev_activ_input)) {
+    return false;
+  }
+  string prev_state_input;
+  if (!GetMatchingRnnArray(model, src_op->outputs[kCellStateTensor],
+                           &prev_state_input)) {
+    return false;
+  }
+
+  // Get LstmCell's cell, input, output size.
+  int num_cell = model->GetArray(src_op->inputs[kInputToInputWeightsTensor])
+                     .shape()
+                     .dims(0);
+  int num_input = model->GetArray(src_op->inputs[kInputToInputWeightsTensor])
+                      .shape()
+                      .dims(1);
+  int num_output =
+      model->GetArray(src_op->inputs[kRecurrentToInputWeightsTensor])
+          .shape()
+          .dims(1);
+
+  // Make sure n_cell and n_output are equal as there is no projection.
+  CHECK_EQ(num_cell, num_output);
+
+  // Create tensorflow_graphdef style's one big weight tensor.
+  const string base_name(FindLongestCommonPrefix(
+      src_op->outputs[kOutputTensor], src_op->outputs[kCellStateTensor]));
+  string merged_weights = AvailableArrayName(*model, base_name + "weights");
+  auto& array = model->GetOrCreateArray(merged_weights);
+  array.data_type = ArrayDataType::kFloat;
+  int weights_dim1 = 4 * num_cell;
+  int weights_dim2 = num_input + num_output;
+  Shape shape = Shape({weights_dim1, weights_dim2});
+  array.copy_shape(shape);
+  auto& buffer = array.GetMutableBuffer<ArrayDataType::kFloat>();
+  buffer.data.resize(weights_dim1 * weights_dim2);
+
+  // Merge 8 small weight tensors to 1 weight tensor.
+  CopyArrayToSubArray(
+      buffer, weights_dim2,
+      model->GetArray(src_op->inputs[kInputToInputWeightsTensor]), 0, 0);
+  CopyArrayToSubArray(
+      buffer, weights_dim2,
+      model->GetArray(src_op->inputs[kInputToCellWeightsTensor]), num_cell, 0);
+  CopyArrayToSubArray(
+      buffer, weights_dim2,
+      model->GetArray(src_op->inputs[kInputToForgetWeightsTensor]),
+      num_cell * 2, 0);
+  CopyArrayToSubArray(
+      buffer, weights_dim2,
+      model->GetArray(src_op->inputs[kInputToOutputWeightsTensor]),
+      num_cell * 3, 0);
+  CopyArrayToSubArray(
+      buffer, weights_dim2,
+      model->GetArray(src_op->inputs[kRecurrentToInputWeightsTensor]), 0,
+      num_input);
+  CopyArrayToSubArray(
+      buffer, weights_dim2,
+      model->GetArray(src_op->inputs[kRecurrentToCellWeightsTensor]), num_cell,
+      num_input);
+  CopyArrayToSubArray(
+      buffer, weights_dim2,
+      model->GetArray(src_op->inputs[kRecurrentToForgetWeightsTensor]),
+      num_cell * 2, num_input);
+  CopyArrayToSubArray(
+      buffer, weights_dim2,
+      model->GetArray(src_op->inputs[kRecurrentToOutputWeightsTensor]),
+      num_cell * 3, num_input);
+
+  // Create tensorflow_graphdef style's one big bias tensor.
+  string merged_biases = AvailableArrayName(*model, base_name + "biases");
+  auto& bias_array = model->GetOrCreateArray(merged_biases);
+  bias_array.data_type = ArrayDataType::kFloat;
+  bias_array.copy_shape(Shape({weights_dim1}));
+  auto& bias_buffer = bias_array.GetMutableBuffer<ArrayDataType::kFloat>();
+  bias_buffer.data.resize(weights_dim1);
+
+  // Merge 4 small bias tensors into a big one.
+  CopyArrayToSubArray(bias_buffer, weights_dim2,
+                      model->GetArray(src_op->inputs[kInputGateBiasTensor]), 0,
+                      0);
+  CopyArrayToSubArray(bias_buffer, weights_dim2,
+                      model->GetArray(src_op->inputs[kCellGateBiasTensor]),
+                      num_cell, 0);
+  CopyArrayToSubArray(bias_buffer, weights_dim2,
+                      model->GetArray(src_op->inputs[kForgetGateBiasTensor]),
+                      num_cell * 2, 0);
+  CopyArrayToSubArray(bias_buffer, weights_dim2,
+                      model->GetArray(src_op->inputs[kOutputGateBiasTensor]),
+                      num_cell * 3, 0);
+
+  // Emplace a new LSTM cell operator (use basic 5 inputs kernel).
+  auto lstm_cell_op = absl::make_unique<LstmCellOperator>();
+
+  // Compact LstmCell's 5 inputs.
+  lstm_cell_op->inputs.resize(LstmCellOperator::NUM_INPUTS);
+  lstm_cell_op->inputs[LstmCellOperator::DATA_INPUT] =
+      src_op->inputs[kInputTensor];
+  lstm_cell_op->inputs[LstmCellOperator::WEIGHTS_INPUT] = merged_weights;
+  lstm_cell_op->inputs[LstmCellOperator::BIASES_INPUT] = merged_biases;
+  lstm_cell_op->inputs[LstmCellOperator::PREV_ACTIV_INPUT] = prev_activ_input;
+  lstm_cell_op->inputs[LstmCellOperator::PREV_STATE_INPUT] = prev_state_input;
+
+  // Reorder LstmCell's 4 outputs.
+  lstm_cell_op->outputs.resize(LstmCellOperator::NUM_OUTPUTS);
+  lstm_cell_op->outputs[LstmCellOperator::ACTIV_OUTPUT] =
+      src_op->outputs[kOutputTensor];
+  lstm_cell_op->outputs[LstmCellOperator::STATE_OUTPUT] =
+      src_op->outputs[kCellStateTensor];
+  lstm_cell_op->outputs[LstmCellOperator::CONCAT_TEMP] =
+      src_op->outputs[kScratchBufferTensor];
+  lstm_cell_op->outputs[LstmCellOperator::ACTIV_TEMP] =
+      src_op->outputs[kOutputStateTensor];
+
+  // Add the op into model.
+  model->operators.emplace(op_it, std::move(lstm_cell_op));
+  AddMessageF("Creating compact LstmCell replacing previous lstm cell");
+
+  // Delete arrays and operators replaced by the LSTM cell operator. Order is
+  // important - DeleteArrayIfUnused() only succeeds if dependent operators
+  // have been removed first. Start at the output and work towards the input.
+  // Erase curr lstm op being replaced.
+  DeleteArrayIfUnused(src_op->inputs[kInputToInputWeightsTensor], model);
+  DeleteArrayIfUnused(src_op->inputs[kInputToForgetWeightsTensor], model);
+  DeleteArrayIfUnused(src_op->inputs[kInputToCellWeightsTensor], model);
+  DeleteArrayIfUnused(src_op->inputs[kInputToOutputWeightsTensor], model);
+  DeleteArrayIfUnused(src_op->inputs[kRecurrentToInputWeightsTensor], model);
+  DeleteArrayIfUnused(src_op->inputs[kRecurrentToForgetWeightsTensor], model);
+  DeleteArrayIfUnused(src_op->inputs[kRecurrentToCellWeightsTensor], model);
+  DeleteArrayIfUnused(src_op->inputs[kRecurrentToOutputWeightsTensor], model);
+  DeleteArrayIfUnused(src_op->inputs[kInputGateBiasTensor], model);
+  DeleteArrayIfUnused(src_op->inputs[kForgetGateBiasTensor], model);
+  DeleteArrayIfUnused(src_op->inputs[kCellGateBiasTensor], model);
+  DeleteArrayIfUnused(src_op->inputs[kOutputGateBiasTensor], model);
+  model->operators.erase(FindOp(*model, src_op));
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_split_inputs.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_split_inputs.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eca717680af281018b919c27068ba5d9f5699d69
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_split_inputs.cc
@@ -0,0 +1,171 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "absl/memory/memory.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+
+namespace toco {
+
+bool SplitLstmCellInputs::Run(Model* model, std::size_t op_index) {
+  // Find lstm cell.
+  auto op_it = model->operators.begin() + op_index;
+  auto curr_op = op_it->get();
+  if (curr_op->type != OperatorType::kLstmCell) {
+    return false;
+  }
+
+  // Already an extended LstmCell with kExtendedLstmInputCount of inputs,
+  // do not need to split cell inputs.
+  if (curr_op->inputs.size() == kExtendedLstmInputCount) {
+    return false;
+  }
+
+  // Make sure the WEIGHTS_INPUT and BIASES_INPUT are constant arrays,
+  // that are able to be split into smaller weight and bias tensors.
+  if (!IsConstantParameterArray(
+          *model, curr_op->inputs[LstmCellOperator::WEIGHTS_INPUT]) ||
+      !IsConstantParameterArray(
+          *model, curr_op->inputs[LstmCellOperator::BIASES_INPUT])) {
+    return false;
+  }
+
+  // Make sure propagate_fixed_sizes has defined the size of the output.
+  if (!model->GetArray(curr_op->outputs[LstmCellOperator::ACTIV_OUTPUT])
+           .has_shape()) {
+    return false;
+  }
+
+  // Emplace a new LstmCell operator with extended inputs (kernel/lstm.cc).
+  auto lstm_cell_op = absl::make_unique<LstmCellOperator>();
+  lstm_cell_op->inputs.resize(kExtendedLstmInputCount);
+  int num_input = model->GetArray(curr_op->inputs[LstmCellOperator::DATA_INPUT])
+                      .shape()
+                      .dims(1);
+
+  // n_cell and n_output have the same size when there is no projection.
+  int num_cell =
+      model->GetArray(curr_op->outputs[LstmCellOperator::ACTIV_OUTPUT])
+          .shape()
+          .dims(1);
+  int num_output = num_cell;
+
+  // Data input.
+  lstm_cell_op->inputs[kInputTensor] =
+      curr_op->inputs[LstmCellOperator::ACTIV_OUTPUT];
+
+  // Get original weight tensor and decompose 1 tensor to 8 sub tensors.
+  Array& kernel =
+      model->GetArray(curr_op->inputs[LstmCellOperator::WEIGHTS_INPUT]);
+  const string base_name(FindLongestCommonPrefix(
+      curr_op->outputs[LstmCellOperator::ACTIV_OUTPUT],
+      curr_op->outputs[LstmCellOperator::STATE_OUTPUT]));
+
+  // Input weight tensors of size {n_cell, n_input}.
+  CopySubArrayToArray(
+      model, &(lstm_cell_op->inputs[kInputToInputWeightsTensor]),
+      base_name + "weight_i_i", num_cell, num_input, kernel, 0, 0);
+  CopySubArrayToArray(model, &(lstm_cell_op->inputs[kInputToCellWeightsTensor]),
+                      base_name + "weight_c_i", num_cell, num_input, kernel,
+                      num_cell, 0);
+  CopySubArrayToArray(
+      model, &(lstm_cell_op->inputs[kInputToForgetWeightsTensor]),
+      base_name + "weight_f_i", num_cell, num_input, kernel, num_cell * 2, 0);
+  CopySubArrayToArray(
+      model, &(lstm_cell_op->inputs[kInputToOutputWeightsTensor]),
+      base_name + "weight_o_i", num_cell, num_input, kernel, num_cell * 3, 0);
+
+  // Recurrent weight tensors of size {n_cell, n_output}.
+  CopySubArrayToArray(
+      model, &(lstm_cell_op->inputs[kRecurrentToInputWeightsTensor]),
+      base_name + "weight_i_r", num_cell, num_output, kernel, 0, num_input);
+  CopySubArrayToArray(model,
+                      &(lstm_cell_op->inputs[kRecurrentToCellWeightsTensor]),
+                      base_name + "weight_c_r", num_cell, num_output, kernel,
+                      num_cell, num_input);
+  CopySubArrayToArray(model,
+                      &(lstm_cell_op->inputs[kRecurrentToForgetWeightsTensor]),
+                      base_name + "weight_f_r", num_cell, num_output, kernel,
+                      num_cell * 2, num_input);
+  CopySubArrayToArray(model,
+                      &(lstm_cell_op->inputs[kRecurrentToOutputWeightsTensor]),
+                      base_name + "weight_o_r", num_cell, num_output, kernel,
+                      num_cell * 3, num_input);
+
+  // Peephole (optional).
+  CreateOptionalArray(model, &(lstm_cell_op->inputs[kCellToInputWeightsTensor]),
+                      base_name + "peephole_c_i");
+  CreateOptionalArray(model,
+                      &(lstm_cell_op->inputs[kCellToForgetWeightsTensor]),
+                      base_name + "peephole_c_f");
+  CreateOptionalArray(model,
+                      &(lstm_cell_op->inputs[kCellToOutputWeightsTensor]),
+                      base_name + "peephole_c_o");
+
+  // Get original bias tensor and decompose 1 tensor to 4 sub tensors
+  Array& bias =
+      model->GetArray(curr_op->inputs[LstmCellOperator::BIASES_INPUT]);
+  CopySubArrayToArray(model, &(lstm_cell_op->inputs[kInputGateBiasTensor]),
+                      base_name + "bias_i", num_cell, 1, bias, 0, 0);
+  CopySubArrayToArray(model, &(lstm_cell_op->inputs[kCellGateBiasTensor]),
+                      base_name + "bias_c", num_cell, 1, bias, num_cell, 0);
+  CopySubArrayToArray(model, &(lstm_cell_op->inputs[kForgetGateBiasTensor]),
+                      base_name + "bias_f", num_cell, 1, bias, num_cell * 2, 0);
+  CopySubArrayToArray(model, &(lstm_cell_op->inputs[kOutputGateBiasTensor]),
+                      base_name + "bias_o", num_cell, 1, bias, num_cell * 3, 0);
+
+  // Projection (optional).
+  CreateOptionalArray(model, &(lstm_cell_op->inputs[kProjectionWeightsTensor]),
+                      base_name + "proj_weight");
+  CreateOptionalArray(model, &(lstm_cell_op->inputs[kProjectionBiasTensor]),
+                      base_name + "proj_bias");
+
+  // Reorder LstmCell's outputs.
+  lstm_cell_op->outputs.resize(LstmCellOperator::NUM_OUTPUTS);
+  lstm_cell_op->outputs[kScratchBufferTensor] =
+      curr_op->outputs[LstmCellOperator::CONCAT_TEMP];
+  lstm_cell_op->outputs[kOutputStateTensor] =
+      curr_op->outputs[LstmCellOperator::ACTIV_TEMP];
+  lstm_cell_op->outputs[kCellStateTensor] =
+      curr_op->outputs[LstmCellOperator::STATE_OUTPUT];
+  lstm_cell_op->outputs[kOutputTensor] =
+      curr_op->outputs[LstmCellOperator::ACTIV_OUTPUT];
+
+  // Add the op into model.
+  model->operators.emplace(op_it, std::move(lstm_cell_op));
+  AddMessageF("Creating extended LstmCell replacing previous lstm cell");
+
+  // Delete arrays and operators replaced by the LSTM cell operator. Order is
+  // important - DeleteArrayIfUnused() only succeeds if dependent operators
+  // have been removed first. Start at the output and work towards the input.
+  // Erase curr lstm op being replaced.
+  DeleteArrayIfUnused(curr_op->inputs[LstmCellOperator::WEIGHTS_INPUT], model);
+  DeleteArrayIfUnused(curr_op->inputs[LstmCellOperator::BIASES_INPUT], model);
+  DeleteArrayIfUnused(curr_op->inputs[LstmCellOperator::PREV_ACTIV_INPUT],
+                      model);
+  DeleteArrayIfUnused(curr_op->inputs[LstmCellOperator::PREV_STATE_INPUT],
+                      model);
+  model->operators.erase(FindOp(*model, curr_op));
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_relu1.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_relu1.cc
index cfc77024e7e56038878570c9d3a462715a53ae3f..de6d8889fb4ccdb56e9639ab0dd7d093bfa4b908 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_relu1.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_relu1.cc
@@ -57,45 +57,60 @@ int GetSingleScalarInputIndexOfBinaryOp(Model* model, const Operator* op,
 }  // namespace
 
 bool IdentifyRelu1::Run(Model* model, std::size_t op_index) {
-  const auto maximum_it = model->operators.begin() + op_index;
-  const auto* maximum_op = maximum_it->get();
-  if (maximum_op->type != OperatorType::kTensorFlowMaximum) {
+  // Follow sequences of min+max and max+min. First get the leading op.
+  const auto op_it = model->operators.begin() + op_index;
+  const auto* op_0 = op_it->get();
+  if (op_0->type != OperatorType::kTensorFlowMinimum &&
+      op_0->type != OperatorType::kTensorFlowMaximum) {
     return false;
   }
-  CHECK_EQ(maximum_op->inputs.size(), 2);
-  if (maximum_op->outputs.size() != 1) {
-    return false;
-  }
-  int scalar_input_index =
-      GetSingleScalarInputIndexOfBinaryOp(model, maximum_op, -1.0f);
-  if (scalar_input_index == -1) {
+
+  // Get the paired op and ensure it's the counter to the first.
+  const auto* op_1 = GetOpWithInput(*model, op_0->outputs[0]);
+  if (!op_1 ||
+      (op_1->type != OperatorType::kTensorFlowMinimum &&
+       op_1->type != OperatorType::kTensorFlowMaximum) ||
+      op_0->type == op_1->type) {
     return false;
   }
-  const auto* minimum_op = GetOpWithInput(*model, maximum_op->outputs[0]);
-  if (!minimum_op || minimum_op->type != OperatorType::kTensorFlowMinimum) {
+
+  const auto* min_op =
+      op_0->type == OperatorType::kTensorFlowMinimum ? op_0 : op_1;
+  const auto* max_op =
+      op_0->type == OperatorType::kTensorFlowMaximum ? op_0 : op_1;
+
+  CHECK_EQ(min_op->inputs.size(), 2);
+  CHECK_EQ(max_op->inputs.size(), 2);
+  if (min_op->outputs.size() != 1 || max_op->outputs.size() != 1) {
     return false;
   }
-  if (GetSingleScalarInputIndexOfBinaryOp(model, minimum_op, 1.0f) == -1) {
+
+  // Get the original input to the min+max pair.
+  int min_scalar_input_index =
+      GetSingleScalarInputIndexOfBinaryOp(model, min_op, 1.0f);
+  int max_scalar_input_index =
+      GetSingleScalarInputIndexOfBinaryOp(model, max_op, -1.0f);
+  if (min_scalar_input_index == -1 || max_scalar_input_index == -1) {
     return false;
   }
-  CHECK_EQ(minimum_op->inputs.size(), 2);
+  int op_0_scalar_input_index =
+      op_0 == min_op ? min_scalar_input_index : max_scalar_input_index;
 
-  // Create and emplace Relu1 node
+  // Create and emplace Relu1 node.
   auto* relu1_op = new Relu1Operator;
-  relu1_op->inputs = {maximum_op->inputs[!scalar_input_index]};
-  relu1_op->outputs = minimum_op->outputs;
-  model->operators.emplace(maximum_it, relu1_op);
+  relu1_op->inputs = {op_0->inputs[!op_0_scalar_input_index]};
+  relu1_op->outputs = op_1->outputs;
+  model->operators.emplace(op_it, relu1_op);
 
   AddMessageF("Creating %s replacing equivalent subgraph", LogName(*relu1_op));
 
-  // Erase Maximum scalar input & operator
-  model->arrays.erase(maximum_op->inputs[scalar_input_index]);
-  model->operators.erase(FindOperator(model, maximum_op));
-
-  // Erase Minimum inputs & operator
-  model->arrays.erase(minimum_op->inputs[0]);
-  model->arrays.erase(minimum_op->inputs[1]);
-  model->operators.erase(FindOperator(model, minimum_op));
+  // Erase op scalar inputs & operators. Note that we preserve the non-scalar
+  // input to the first op as that's been redirected to the relu1_op.
+  DeleteArrayIfUsedOnce(op_0->inputs[op_0_scalar_input_index], model);
+  DeleteArrayIfUsedOnce(op_1->inputs[0], model);
+  DeleteArrayIfUsedOnce(op_1->inputs[1], model);
+  model->operators.erase(FindOperator(model, op_0));
+  model->operators.erase(FindOperator(model, op_1));
 
   return true;
 }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.cc b/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..910a96058979887972b41f27b2e570e8cb5b4f4c
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.cc
@@ -0,0 +1,97 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h"
+
+namespace toco {
+
+void CreateOptionalArray(Model* model, string* input_array_buffer,
+                         const string& array_name) {
+  *input_array_buffer = array_name;
+  model->CreateOptionalArray(array_name);
+}
+
+void CopyArrayData(const Buffer<ArrayDataType::kFloat>& src_buffer,
+                   int src_stride, int src_start_idx1, int src_start_idx2,
+                   Buffer<ArrayDataType::kFloat>* dst_buffer, int dst_stride,
+                   int dst_start_idx1, int dst_start_idx2, int dim1_copy_size,
+                   int dim2_copy_size) {
+  int src_offset = src_start_idx1 * src_stride + src_start_idx2;
+  int dst_offset = dst_start_idx1 * dst_stride + dst_start_idx2;
+  for (int i = 0; i < dim1_copy_size; i++) {
+    for (int j = 0; j < dim2_copy_size; j++) {
+      int idx_src = src_offset + i * src_stride + j;
+      int idx_dst = dst_offset + i * dst_stride + j;
+      dst_buffer->data[idx_dst] = src_buffer.data[idx_src];
+    }
+  }
+}
+
+Buffer<ArrayDataType::kFloat>* CreateFloatArrayBuffer(Model* model,
+                                                      string* array_name,
+                                                      const Shape& shape) {
+  *array_name = AvailableArrayName(*model, *array_name);
+  auto& array = model->GetOrCreateArray(*array_name);
+  array.data_type = ArrayDataType::kFloat;
+  array.copy_shape(shape);
+  Buffer<ArrayDataType::kFloat>* buffer =
+      &(array.GetMutableBuffer<ArrayDataType::kFloat>());
+  buffer->data.resize(RequiredBufferSizeForShape(shape));
+  return buffer;
+}
+
+void CopySubArrayToArray(Model* model, string* array_name,
+                         const string& tensor_name, int dim1_size,
+                         int dim2_size, const Array& original_array,
+                         int start_idx1, int start_idx2) {
+  // Determine whether it's bias or not, create shape, buffer.
+  bool is_bias = dim2_size == 1;
+  Shape shape = is_bias ? Shape({dim1_size}) : Shape({dim1_size, dim2_size});
+  Buffer<ArrayDataType::kFloat>* buffer =
+      CreateFloatArrayBuffer(model, array_name, shape);
+  auto& orig_buffer = original_array.GetBuffer<ArrayDataType::kFloat>();
+
+  // Copy data from big tensor.
+  CopyArrayData(orig_buffer, is_bias ? 1 : original_array.shape().dims(1),
+                start_idx1, start_idx2, buffer, dim2_size, 0, 0, dim1_size,
+                dim2_size);
+}
+
+void CopyArrayToSubArray(Buffer<ArrayDataType::kFloat>& tensor_buffer,
+                         int tensor_stride, const Array& sub_array,
+                         int start_idx1, int start_idx2) {
+  // Get tensor data.
+  bool is_bias = sub_array.shape().dims().size() == 1;
+  int dim1_copy_size = sub_array.shape().dims()[0];
+  int dim2_copy_size = is_bias ? 1 : sub_array.shape().dims(1);
+  auto& sub_buffer = sub_array.GetBuffer<ArrayDataType::kFloat>();
+
+  // Copy data from sub tensor.
+  CopyArrayData(sub_buffer, dim2_copy_size, 0, 0, &tensor_buffer,
+                is_bias ? 1 : tensor_stride, start_idx1, start_idx2,
+                dim1_copy_size, dim2_copy_size);
+}
+
+bool GetMatchingRnnArray(Model* model, const string& back_edge_source_array,
+                         string* rnn_array) {
+  for (const auto& rnn_state : model->flags.rnn_states()) {
+    if (rnn_state.back_edge_source_array() == back_edge_source_array) {
+      *rnn_array = rnn_state.state_array();
+      return true;
+    }
+  }
+  return false;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h b/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..881c2d4dc892625d4640cac867a2f49c24b638f5
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h
@@ -0,0 +1,102 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+
+namespace toco {
+
+// For consistency with the parameters defined in extended LstmCell's kernel
+// (tensorflow/contrib/lite/kernels/lstm.cc),
+// use lowercase for these constants.
+
+enum ExtendedLstmCellInputs {
+  kInputTensor = 0,
+  kInputToInputWeightsTensor = 1,  // Optional
+  kInputToForgetWeightsTensor = 2,
+  kInputToCellWeightsTensor = 3,
+  kInputToOutputWeightsTensor = 4,
+  kRecurrentToInputWeightsTensor = 5,  // Optional
+  kRecurrentToForgetWeightsTensor = 6,
+  kRecurrentToCellWeightsTensor = 7,
+  kRecurrentToOutputWeightsTensor = 8,
+  kCellToInputWeightsTensor = 9,    // Optional
+  kCellToForgetWeightsTensor = 10,  // Optional
+  kCellToOutputWeightsTensor = 11,  // Optional
+  kInputGateBiasTensor = 12,        // Optional
+  kForgetGateBiasTensor = 13,
+  kCellGateBiasTensor = 14,
+  kOutputGateBiasTensor = 15,
+  kProjectionWeightsTensor = 16,  // Optional
+  kProjectionBiasTensor = 17,     // Optional
+  kExtendedLstmInputCount = 18
+};
+
+enum ExtendedLstmCellOutputs {
+  kScratchBufferTensor = 0,
+  kOutputStateTensor = 1,
+  kCellStateTensor = 2,
+  kOutputTensor = 3
+};
+
+// Create optional array used for optional tensor in ExtendedLstmCell inputs.
+void CreateOptionalArray(Model* model, string* input_array_buffer,
+                         const string& array_name);
+
+// Create float array and get its buffer.
+Buffer<ArrayDataType::kFloat>* CreateFloatArrayBuffer(Model* model,
+                                                      string* array_name,
+                                                      const Shape& shape);
+
+// Copy data from one array to the other one (supports 1D and 2D array),
+// for 1D array, the 2nd dim's size is 1.
+// Arguments:
+//   src_buffer: the source buffer
+//   src_stride: the stride of source buffer, i.e., 2nd dim's size
+//   src_start_idx1: the 1st dim index of start point in src matrix
+//   src_start_idx2: the 2nd dim index of start point in src matrix
+//   dst_buffer: the destination buffer
+//   dst_stride: the stride of destination buffer, i.e., 2nd dim's size
+//   dst_start_idx1: the 1st dim index of start point in dst matrix
+//   dst_start_idx2: the 2nd dim index of start point in dst matrix
+//   dim1_copy_size: 1st dim size of copy data
+//   dim2_copy_size: 2nd dim size of copy data
+void CopyArrayData(const Buffer<ArrayDataType::kFloat>& src_buffer,
+                   int src_stride, int src_start_idx1, int src_start_idx2,
+                   Buffer<ArrayDataType::kFloat>* dst_buffer, int dst_stride,
+                   int dst_start_idx1, int dst_start_idx2, int dim1_copy_size,
+                   int dim2_copy_size);
+
+// Copy a subset of array data and create a smaller array,
+// mostly used for spliting weights and bias for Lstm cell.
+void CopySubArrayToArray(Model* model, string* array_name,
+                         const string& tensor_name, int dim1_size,
+                         int dim2_size, const Array& original_array,
+                         int start_idx1, int start_idx2);
+
+// Copy array data to a large array's submatrix,
+// mostly used for merging weights and bias for Lstm cell.
+void CopyArrayToSubArray(Buffer<ArrayDataType::kFloat>& tensor_buffer,
+                         int tensor_stride, const Array& sub_array,
+                         int start_idx1, int start_idx2);
+
+// Get mating rnn array inputs using rnn_states flag.
+bool GetMatchingRnnArray(Model* model, const string& back_edge_source_array,
+                         string* rnn_array);
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
index 1ff4e827aa043cbbb0515e10a6ae9bd33e6d819c..f0d107232b4517115aa3f64b39b825dbaffb83ce 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
@@ -24,23 +24,10 @@ limitations under the License.
 namespace toco {
 
 namespace {
-
-ArrayDataType CommonDataTypeOfAllInputs(const Model& model,
-                                        const Operator& op) {
-  CHECK_GT(op.inputs.size(), 0);
-  const ArrayDataType data_type = model.GetArray(op.inputs[0]).data_type;
-  for (const auto& input : op.inputs) {
-    const auto& array = model.GetArray(input);
-    CHECK(array.data_type == data_type)
-        << " Unexpected: this operator has inputs with different data types.";
-  }
-  return data_type;
-}
-
 void SetDataTypeForAllOutputs(Model* model, Operator* op,
                               ArrayDataType data_type) {
   for (const auto& output : op->outputs) {
-    model->arrays[output]->data_type = data_type;
+    model->GetArray(output).data_type = data_type;
   }
 }
 }  // namespace
@@ -51,7 +38,8 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
 
   // If the data type of some input is unknown, we need to yield.
   for (const auto& input : op->inputs) {
-    if (model->arrays[input]->data_type == ArrayDataType::kNone) {
+    if (!model->IsOptionalArray(input) &&
+        model->GetArray(input).data_type == ArrayDataType::kNone) {
       return false;
     }
   }
@@ -59,7 +47,7 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
   // end if we changed anything, and return the correct boolean value.
   std::unordered_map<string, ArrayDataType> old_output_data_types;
   for (const auto& output : op->outputs) {
-    old_output_data_types[output] = model->arrays[output]->data_type;
+    old_output_data_types[output] = model->GetArray(output).data_type;
   }
   // Do the actual output data types propagation.
   if (op->type == OperatorType::kDequantize ||
@@ -72,48 +60,41 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
              op->type == OperatorType::kTensorFlowGreaterEqual) {
     // These operators unconditionally produce bool outputs
     SetDataTypeForAllOutputs(model, op, ArrayDataType::kBool);
-  } else if (op->type == OperatorType::kTensorFlowShape) {
-    // These operators are assumed to produce int32 outputs.
+  } else if (op->type == OperatorType::kRank ||
+             op->type == OperatorType::kTensorFlowShape) {
+    // These operators only produce int32 outputs.
     SetDataTypeForAllOutputs(model, op, ArrayDataType::kInt32);
-  } else if (op->type == OperatorType::kAveragePool ||
-             op->type == OperatorType::kMaxPool ||
-             op->type == OperatorType::kL2Pool ||
-             op->type == OperatorType::kConv ||
-             op->type == OperatorType::kDepthwiseConv ||
-             op->type == OperatorType::kFullyConnected ||
-             op->type == OperatorType::kTensorFlowMax ||
-             op->type == OperatorType::kTensorFlowMin ||
-             op->type == OperatorType::kPad ||
-             op->type == OperatorType::kStridedSlice ||
-             op->type == OperatorType::kTensorFlowReshape ||
-             op->type == OperatorType::kSlice ||
-             op->type == OperatorType::kSqueeze ||
-             op->type == OperatorType::kTensorFlowSum ||
-             op->type == OperatorType::kTensorFlowSwitch ||
-             op->type == OperatorType::kTensorFlowTile ||
-             op->type == OperatorType::kTensorFlowAll ||
-             op->type == OperatorType::kReorderAxes ||
-             op->type == OperatorType::kTensorFlowConcatV2 ||
-             op->type == OperatorType::kFloor ||
-             op->type == OperatorType::kGather ||
-             op->type == OperatorType::kSpaceToBatchND ||
-             op->type == OperatorType::kBatchToSpaceND ||
-             op->type == OperatorType::kMean) {
-    // These operators produce outputs with the same type as their 1st input
-    CHECK_GT(op->inputs.size(), 0);
-    const ArrayDataType data_type = model->arrays[op->inputs[0]]->data_type;
-    SetDataTypeForAllOutputs(model, op, data_type);
   } else if (op->type == OperatorType::kTensorFlowSplit ||
-             op->type == OperatorType::kTensorFlowConcat) {
+             op->type == OperatorType::kTensorFlowConcat ||
+             op->type == OperatorType::kFill) {
     // These operators produce an output with the same type as their 2nd input
-    CHECK_GT(op->inputs.size(), 1);
-    const ArrayDataType data_type = model->arrays[op->inputs[1]]->data_type;
+    CHECK_GE(op->inputs.size(), 2);
+    const ArrayDataType data_type = model->GetArray(op->inputs[1]).data_type;
     SetDataTypeForAllOutputs(model, op, data_type);
   } else if (op->type == OperatorType::kCast) {
     // Data type of the Cast op is specified.
     CHECK_EQ(op->outputs.size(), 1);
     auto* cast_op = static_cast<CastOperator*>(op);
-    model->arrays[op->outputs[0]]->data_type = cast_op->dst_data_type;
+    model->GetArray(op->outputs[0]).data_type = cast_op->dst_data_type;
+  } else if (op->type == OperatorType::kArgMax) {
+    // Data type of the ArgMax op is specified.
+    CHECK_EQ(op->outputs.size(), 1);
+    auto* argmax_op = static_cast<ArgMaxOperator*>(op);
+    model->GetArray(op->outputs[0]).data_type = argmax_op->output_data_type;
+  } else if (op->type == OperatorType::kRange) {
+    auto* range_op = static_cast<RangeOperator*>(op);
+    // Output type of the Range op can be set via an attribute
+    ArrayDataType data_type;
+    if (range_op->dtype != ArrayDataType::kNone) {
+      // Use the type if specified
+      data_type = range_op->dtype;
+    } else {
+      // Otherwise use the first input
+      CHECK_GE(op->inputs.size(), 1);
+      data_type = model->GetArray(op->inputs[0]).data_type;
+    }
+    CHECK_EQ(op->outputs.size(), 1);
+    SetDataTypeForAllOutputs(model, op, data_type);
   } else if (op->type == OperatorType::kTensorFlowUnsupported) {
     auto* unsupported_op = static_cast<TensorFlowUnsupportedOperator*>(op);
     if (unsupported_op->output_data_types.size() != op->outputs.size()) {
@@ -122,17 +103,20 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
     for (int i = 0; i < unsupported_op->output_data_types.size(); ++i) {
       auto output = op->outputs[i];
       auto data_type = unsupported_op->output_data_types[i];
-      model->arrays[output]->data_type = data_type;
+      model->GetArray(output).data_type = data_type;
     }
+  } else if (op->type == OperatorType::kExpandDims) {
+    // Yield on ExpandDim until it is converted to Reshape
+    return false;
   } else {
-    // These operators produce an output with the same type as any of their
-    // inputs, which must always have the same type.
-    const ArrayDataType data_type = CommonDataTypeOfAllInputs(*model, *op);
+    // These operators produce outputs with the same type as their 1st input
+    CHECK_GT(op->inputs.size(), 0);
+    const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
     SetDataTypeForAllOutputs(model, op, data_type);
   }
   // Return true if any output data type changed, false if none changed.
   for (const auto& output : op->outputs) {
-    if (old_output_data_types[output] != model->arrays[output]->data_type) {
+    if (old_output_data_types[output] != model->GetArray(output).data_type) {
       return true;
     }
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index f6daad9020be52b7616b074ba966f4e9b079ebeb..ddcc03813ffce785b8d930073a2c97b4847b5f8d 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "absl/strings/str_join.h"
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
@@ -60,23 +61,42 @@ void ComputeConvSizes(const Shape& input_shape, int output_depth, int kwidth,
   output_shape->ReplaceDims({batch, output_height, output_width, output_depth});
 }
 
-void ComputeBinaryOperatorOutputSize(const Shape& input_shape1,
-                                     const Shape& input_shape2,
+void ComputeBinaryOperatorOutputSize(const Shape& input_shape_x,
+                                     const Shape& input_shape_y,
                                      Array* output_array) {
-  const int size1 = RequiredBufferSizeForShape(input_shape1);
-  const int size2 = RequiredBufferSizeForShape(input_shape2);
-  if (size1 > size2) {
-    output_array->copy_shape(input_shape1);
-  } else if (size2 > size1) {
-    output_array->copy_shape(input_shape2);
-  } else {
-    CHECK_EQ(size1, size2);
-    const int dims1 = input_shape1.dimensions_count();
-    const int dims2 = input_shape2.dimensions_count();
-    if (dims1 >= dims2) {
-      output_array->copy_shape(input_shape1);
+  // This matches the code in BroadcastBinaryOpShapeFn from tensorflow.
+  // It zips together the two input shapes and pads with 1 to make them the
+  // same length. For each dimension we broadcast if either dimension is 1 and
+  // otherwise expect them to match.
+  int rank_x = input_shape_x.dimensions_count();
+  int rank_y = input_shape_y.dimensions_count();
+  int rank_out = std::max(rank_x, rank_y);
+  std::vector<int>* dims_out = output_array->mutable_shape()->mutable_dims();
+  dims_out->clear();
+  dims_out->reserve(rank_out);
+  for (int i = 0; i < rank_out; ++i) {
+    int dim_x = i < (rank_out - rank_x)
+                    ? 1
+                    : input_shape_x.dims(i - (rank_out - rank_x));
+    bool dim_y_is_one = i < (rank_out - rank_y);
+    int dim_y = dim_y_is_one ? 1 : input_shape_y.dims(i - (rank_out - rank_y));
+    if (dim_x == -1 || dim_y == -1) {
+      // One or both dimensions is unknown.
+      QCHECK(false) << "Shapes must be specified";
+    } else if (dim_x == 1 || dim_y == 1) {
+      // Broadcast one dimension to the other that is 1.
+      if (dim_x == 1 && !dim_y_is_one) {
+        // Broadcast dim_y to dim_x (1).
+        dims_out->push_back(dim_y);
+      } else {
+        // Broadcast dim_x to dim_y (1).
+        DCHECK_EQ(dim_y, 1);
+        dims_out->push_back(dim_x);
+      }
     } else {
-      output_array->copy_shape(input_shape2);
+      // Expect the dimensions to match.
+      CHECK_EQ(dim_x, dim_y) << "Dimensions must match";
+      dims_out->push_back(dim_x);
     }
   }
   CHECK(output_array->has_shape());
@@ -84,7 +104,7 @@ void ComputeBinaryOperatorOutputSize(const Shape& input_shape1,
 
 int GetOutputDepthFromWeights(const Model& model, const Operator& op) {
   const string& weights_name = op.inputs[1];
-  const auto& weights_shape = model.arrays.at(weights_name)->shape();
+  const auto& weights_shape = model.GetArray(weights_name).shape();
   if (op.type == OperatorType::kConv ||
       op.type == OperatorType::kFullyConnected) {
     return weights_shape.dims(0);
@@ -97,7 +117,7 @@ int GetOutputDepthFromWeights(const Model& model, const Operator& op) {
 
 bool EnsureBiasVectorShape(Model* model, Operator* op) {
   const string& weights_name = op->inputs[1];
-  const auto& weights_array = *model->arrays[weights_name];
+  const auto& weights_array = model->GetArray(weights_name);
   // Yield until weights shape has been resolved.
   if (!weights_array.has_shape()) {
     return false;
@@ -106,7 +126,7 @@ bool EnsureBiasVectorShape(Model* model, Operator* op) {
   if (op->inputs.size() < 3) {
     return false;
   }
-  auto& bias_array = *model->arrays[op->inputs[2]];
+  auto& bias_array = model->GetArray(op->inputs[2]);
   if (bias_array.has_shape()) {
     return true;
   }
@@ -125,7 +145,7 @@ void ProcessConvOperator(Model* model, ConvOperator* op) {
     return;
   }
 
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -133,7 +153,7 @@ void ProcessConvOperator(Model* model, ConvOperator* op) {
   const auto& input_shape = input_array.shape();
   CHECK_EQ(input_shape.dimensions_count(), 4);
 
-  const auto& weights_array = *model->arrays[op->inputs[1]];
+  const auto& weights_array = model->GetArray(op->inputs[1]);
   // Yield until weights dims have been resolved.
   if (!weights_array.has_shape()) {
     return;
@@ -155,7 +175,7 @@ void ProcessConvOperator(Model* model, ConvOperator* op) {
   if (op->outputs.size() == 2) {
     const auto& output_shape = output_array.shape();
     const int input_depth = weights_shape.dims(3);
-    auto& im2col_array = *model->arrays[op->outputs[1]];
+    auto& im2col_array = model->GetArray(op->outputs[1]);
     im2col_array.copy_shape(Shape{output_shape.dims(0), output_shape.dims(1),
                                   output_shape.dims(2),
                                   input_depth * kheight * kwidth});
@@ -167,7 +187,7 @@ void ProcessDepthwiseConvOperator(Model* model, DepthwiseConvOperator* op) {
     return;
   }
 
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -175,7 +195,7 @@ void ProcessDepthwiseConvOperator(Model* model, DepthwiseConvOperator* op) {
   const auto& input_shape = input_array.shape();
   CHECK_EQ(input_shape.dimensions_count(), 4);
 
-  const auto& weights_array = *model->arrays[op->inputs[1]];
+  const auto& weights_array = model->GetArray(op->inputs[1]);
   // Yield until weights dims have been resolved.
   if (!weights_array.has_shape()) {
     return;
@@ -208,7 +228,7 @@ void ProcessDepthwiseConvOperator(Model* model, DepthwiseConvOperator* op) {
 }
 
 void ProcessDepthToSpaceOperator(Model* model, DepthToSpaceOperator* op) {
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -231,7 +251,7 @@ void ProcessDepthToSpaceOperator(Model* model, DepthToSpaceOperator* op) {
 }
 
 void ProcessSpaceToDepthOperator(Model* model, SpaceToDepthOperator* op) {
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -254,12 +274,39 @@ void ProcessSpaceToDepthOperator(Model* model, SpaceToDepthOperator* op) {
                          depth * block_size * block_size}));
 }
 
+void ProcessFillOperator(Model* model, FillOperator* op) {
+  CHECK_EQ(op->inputs.size(), 2);
+  CHECK_EQ(op->outputs.size(), 1);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.has_shape()) {
+    // We have already run
+    return;
+  }
+
+  auto& dims_array = model->GetArray(op->inputs[0]);
+  if (!dims_array.has_shape()) {
+    // Yield until dims shape been resolved.
+    return;
+  }
+  if (!dims_array.buffer) {
+    // Yield until the dims are constant
+    return;
+  }
+  CHECK(dims_array.data_type == ArrayDataType::kInt32) << "dims must be int32";
+  CHECK_LE(RequiredBufferSizeForShape(dims_array.shape()), 4)
+      << "dims vector can be no larger than 4 values";
+
+  std::vector<int32> const& dims =
+      dims_array.GetBuffer<ArrayDataType::kInt32>().data;
+  *(output_array.mutable_shape()->mutable_dims()) = dims;
+}
+
 void ProcessFullyConnectedOperator(Model* model, FullyConnectedOperator* op) {
   if (!EnsureBiasVectorShape(model, op)) {
     return;
   }
 
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -267,7 +314,7 @@ void ProcessFullyConnectedOperator(Model* model, FullyConnectedOperator* op) {
   const auto& input_shape = input_array.shape();
   CHECK_GE(input_shape.dimensions_count(), 1);
 
-  const auto& weights_array = *model->arrays[op->inputs[1]];
+  const auto& weights_array = model->GetArray(op->inputs[1]);
   // Yield until weights dims have been resolved.
   if (!weights_array.has_shape()) {
     return;
@@ -287,26 +334,31 @@ void ProcessFullyConnectedOperator(Model* model, FullyConnectedOperator* op) {
 
 void ProcessTensorFlowReshapeOperator(Model* model,
                                       TensorFlowReshapeOperator* op) {
-  auto& output_array = *model->arrays[op->outputs[0]];
-  // Bail if we already have output dims
+  auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.has_shape()) {
+    // We have already run
     return;
   }
 
-  const auto& input_array = *model->arrays[op->inputs[0]];
-  // Yield until input dims have been resolved.
+  const auto& input_array = model->GetArray(op->inputs[0]);
   if (!input_array.has_shape()) {
+    // Yield until input dims have been resolved.
     return;
   }
   const auto& input_shape = input_array.shape();
 
-  const string& shape_name = op->inputs[1];
-  auto& shape_array = model->GetArray(shape_name);
-  // Yield until the shape is resolved as a constant array
+  auto& shape_array = model->GetArray(op->inputs[1]);
+  if (!shape_array.has_shape()) {
+    // Yield until target_shape shape been resolved.
+    return;
+  }
   if (!shape_array.buffer) {
+    // Yield until the target_shape is constant
     return;
   }
-  CHECK(shape_array.data_type == ArrayDataType::kInt32);
+  CHECK(shape_array.data_type == ArrayDataType::kInt32)
+      << "Reshape dims must be int32";
+
   // shape_data is the raw array of ints describing the shape
   // in the TensorFlow node. We intentionally make a copy here, rather than
   // modify wildcards in-place below, because in some graphs, the same shape
@@ -329,23 +381,29 @@ void ProcessTensorFlowReshapeOperator(Model* model,
   }
   const int input_flat_size = RequiredBufferSizeForShape(input_shape);
   if (has_wildcard) {
+    CHECK_GE(input_flat_size, product_non_wildcard_dims)
+        << "Array not large enough to fill the requested dimensions for "
+           "Reshape op with output \""
+        << op->outputs[0] << "\". Are your input shapes correct?";
     shape_data[wildcard_index] = input_flat_size / product_non_wildcard_dims;
   }
   auto& output_shape = *output_array.mutable_shape();
   *output_shape.mutable_dims() = shape_data;
-  const int output_flat_size = RequiredBufferSizeForShape(output_shape);
-  CHECK_EQ(output_flat_size, input_flat_size);
+  CHECK_EQ(input_flat_size, RequiredBufferSizeForShape(output_shape))
+      << "Input cannot be reshaped to requested dimensions for Reshape op with "
+         "output \""
+      << op->outputs[0] << "\". Are your input shapes correct?";
 }
 
 void ProcessSimpleOperator(Model* model, Operator* op) {
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
   }
 
   const string& output_name = op->outputs[0];
-  auto& output_array = *model->arrays[output_name];
+  auto& output_array = model->GetArray(output_name);
   if (output_array.has_shape()) {
     return;
   }
@@ -355,18 +413,40 @@ void ProcessSimpleOperator(Model* model, Operator* op) {
 
 void ProcessSimpleBinaryOperator(Model* model, Operator* op) {
   CHECK_EQ(op->inputs.size(), 2);
-  const auto& input0_array = *model->arrays[op->inputs[0]];
-  const auto& input1_array = *model->arrays[op->inputs[1]];
+  const auto& input0_array = model->GetArray(op->inputs[0]);
+  const auto& input1_array = model->GetArray(op->inputs[1]);
   // Yield until input dims have been resolved.
   if (!input0_array.has_shape() || !input1_array.has_shape()) {
     return;
   }
   const string& output_name = op->outputs[0];
-  auto& output_array = *model->arrays[output_name];
+  auto& output_array = model->GetArray(output_name);
   ComputeBinaryOperatorOutputSize(input0_array.shape(), input1_array.shape(),
                                   &output_array);
 }
 
+void ProcessAddNOperator(Model* model, Operator* op) {
+  // Yield until all input dims have been resolved.
+  //
+  // TODO(myenik): Since AddN does not support broadcasting, maybe we could
+  // actually use this to improve shape propagation by propagating the shape of
+  // one input to all other inputs once it is resolved instead of just the
+  // output, since all inputs must be the same size and shape for a well-formed
+  // graph.
+  for (const auto& input : op->inputs) {
+    const auto& input_array = model->GetArray(input);
+    if (!input_array.has_shape()) {
+      return;
+    }
+  }
+
+  // AddN does not support broadcasting, all inputs must be the same shape, so
+  // we just take the first input shape and apply it to the output.
+  const auto& input0_array = model->GetArray(op->inputs[0]);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  output_array.copy_shape(input0_array.shape());
+}
+
 bool KeepDims(const Operator& op) {
   switch (op.type) {
     case OperatorType::kTensorFlowMin:
@@ -385,11 +465,11 @@ bool KeepDims(const Operator& op) {
 
 void ProcessTensorFlowReductionOperator(Model* model, Operator* op) {
   CHECK_LE(op->inputs.size(), 2);
-  auto& output_array = *model->arrays[op->outputs[0]];
+  auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.has_shape()) {
     return;
   }
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   if (!input_array.has_shape()) {
     return;
   }
@@ -397,7 +477,7 @@ void ProcessTensorFlowReductionOperator(Model* model, Operator* op) {
   const bool keep_dims = KeepDims(*op);
   if (op->inputs.size() == 2) {
     // There is a reduction_indices input.
-    const auto& reduction_array = *model->arrays[op->inputs[1]];
+    const auto& reduction_array = model->GetArray(op->inputs[1]);
     if (!reduction_array.buffer) {
       return;
     }
@@ -437,11 +517,11 @@ void ProcessSliceOperator(Model* model, SliceOperator* op) {
   if (op->begin.empty()) return;
 
   // Yield until input dims have been resolved.
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   if (!input_array.has_shape()) return;
   const Shape& input_shape = input_array.shape();
 
-  auto& output_array = *model->arrays[op->outputs[0]];
+  auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.has_shape()) return;
 
   CHECK_EQ(input_shape.dims().size(), op->size.size());
@@ -461,7 +541,7 @@ void ProcessSliceOperator(Model* model, SliceOperator* op) {
 
 void ProcessReorderAxesOperator(Model* model, ReorderAxesOperator* op) {
   const string& input_name = op->inputs[0];
-  const auto& input_array = *model->arrays[input_name];
+  const auto& input_array = model->GetArray(input_name);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -476,20 +556,23 @@ void ProcessReorderAxesOperator(Model* model, ReorderAxesOperator* op) {
 void ProcessConcatenationOperator(Model* model, ConcatenationOperator* op) {
   // Yield until input dims have been resolved.
   for (const auto& input_name : op->inputs) {
-    auto& input_array = *model->arrays[input_name];
+    auto& input_array = model->GetArray(input_name);
     if (!input_array.has_shape()) {
       return;
     }
   }
   auto& output_array = model->GetArray(op->outputs[0]);
   // Use 0 input as basis for output dimensions.
-  const auto& first_input_array = *model->arrays[op->inputs[0]];
+  const auto& first_input_array = model->GetArray(op->inputs[0]);
   output_array.copy_shape(first_input_array.shape());
+  // Negative axis means the count starts at the back of the dims().
+  int axis = op->axis;
+  if (axis < 0) axis += first_input_array.shape().dims().size();
   // Determine the concat size, and enfore that all inputs have
   // the same dimensions count.
   int concat_size = 0;
   for (const auto& input_name : op->inputs) {
-    auto& input_array = *model->arrays[input_name];
+    auto& input_array = model->GetArray(input_name);
     CHECK(input_array.has_shape());
     if (input_array.shape().dimensions_count() == 0) {
       continue;
@@ -497,45 +580,114 @@ void ProcessConcatenationOperator(Model* model, ConcatenationOperator* op) {
     CHECK_EQ(input_array.shape().dimensions_count(),
              output_array.shape().dimensions_count());
     const std::vector<int>& input_dims = input_array.shape().dims();
-    CHECK_LT(op->concat_dim, input_dims.size());
-    concat_size += input_dims[op->concat_dim];
+    CHECK_LT(axis, input_dims.size());
+    concat_size += input_dims[axis];
   }
   // Write out the concat_size on the output array shape.
   auto& output_shape = *output_array.mutable_shape();
   auto& output_dims = *output_shape.mutable_dims();
-  CHECK_LT(op->concat_dim, output_shape.dimensions_count());
-  output_dims[op->concat_dim] = concat_size;
+  CHECK_LT(axis, output_shape.dimensions_count());
+  output_dims[axis] = concat_size;
+}
+
+void ProcessRangeOperator(Model* model, RangeOperator* op) {
+  CHECK_EQ(op->inputs.size(), 3);
+  const auto& start_array = model->GetArray(op->inputs[0]);
+  if (!start_array.has_shape()) {
+    // Yield until input dims have been resolved.
+    return;
+  }
+  const auto& limit_array = model->GetArray(op->inputs[1]);
+  if (!limit_array.has_shape()) {
+    return;
+  }
+  const auto& delta_array = model->GetArray(op->inputs[2]);
+  if (!delta_array.has_shape()) {
+    return;
+  }
+
+  if (!IsConstantParameterArray(*model, op->inputs[0])) {
+    // Yield until inputs are constant.
+    return;
+  }
+  if (!IsConstantParameterArray(*model, op->inputs[1])) {
+    return;
+  }
+  if (!IsConstantParameterArray(*model, op->inputs[2])) {
+    return;
+  }
+
+  CHECK(start_array.data_type == ArrayDataType::kInt32)
+      << "Range op inputs must be int32.";
+  CHECK(limit_array.data_type == ArrayDataType::kInt32)
+      << "Range op inputs must be int32.";
+  CHECK(delta_array.data_type == ArrayDataType::kInt32)
+      << "Range op inputs must be int32.";
+  CHECK_EQ(RequiredBufferSizeForShape(start_array.shape()), 1)
+      << "Range op inputs must be scalar.";
+  CHECK_EQ(RequiredBufferSizeForShape(limit_array.shape()), 1)
+      << "Range op inputs must be scalar.";
+  CHECK_EQ(RequiredBufferSizeForShape(delta_array.shape()), 1)
+      << "Range op inputs must be scalar.";
+  int size = floor((limit_array.GetBuffer<ArrayDataType::kInt32>().data[0] -
+                    start_array.GetBuffer<ArrayDataType::kInt32>().data[0]) /
+                   delta_array.GetBuffer<ArrayDataType::kInt32>().data[0]);
+
+  // Only set the output shape. Contents are set by ResolveConstantRange.
+  CHECK_EQ(op->outputs.size(), 1);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  Shape* output_shape = output_array.mutable_shape();
+  output_shape->ReplaceDims({size});
 }
 
 void ProcessTensorFlowSplitOperator(Model* model, TensorFlowSplitOperator* op) {
   CHECK_EQ(op->inputs.size(), 2);
   const string& input_name = op->inputs[1];
-  const auto& input_array = *model->arrays[input_name];
+  const auto& input_array = model->GetArray(input_name);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
   }
   const Shape& input_shape = input_array.shape();
 
-  // This code is slightly suspect.  The TensorFlow docs say that the axis
-  // selection defaults to 0, but we are splitting across the final axis.
-  const int input_dims_count = input_shape.dimensions_count();
-  const int input_depth = input_shape.dims(input_dims_count - 1);
-  CHECK_EQ(input_depth % op->num_split, 0);
-  const int split_depth = input_depth / op->num_split;
+  // Yield until axis is constant.
+  if (!IsConstantParameterArray(*model, op->inputs[0])) {
+    return;
+  }
+
+  const auto& axis_array = model->GetArray(op->inputs[0]);
+
+  // Yield until axis dims have been resolved.
+  if (!axis_array.has_shape()) {
+    return;
+  }
+
+  CHECK(axis_array.data_type == ArrayDataType::kInt32)
+      << "Axis array must be int32.";
+  CHECK_EQ(RequiredBufferSizeForShape(axis_array.shape()), 1)
+      << "Axis array must be scalar.";
+
+  int axis = axis_array.GetBuffer<ArrayDataType::kInt32>().data[0];
+  if (axis < 0) {
+    axis += input_shape.dimensions_count();
+  }
+
+  const int split_dim = input_shape.dims(axis);
+  CHECK_EQ(split_dim % op->num_split, 0);
+  const int split_depth = split_dim / op->num_split;
 
   Shape output_shape = input_shape;
-  (*output_shape.mutable_dims())[input_dims_count - 1] = split_depth;
+  (*output_shape.mutable_dims())[axis] = split_depth;
 
   CHECK_EQ(op->outputs.size(), op->num_split);
   for (const auto& output : op->outputs) {
-    model->arrays[output]->copy_shape(output_shape);
+    model->GetArray(output).copy_shape(output_shape);
   }
 }
 
 void ProcessAveragePoolOperator(Model* model, AveragePoolOperator* op) {
   const string& input_name = op->inputs[0];
-  const auto& input_array = *model->arrays[input_name];
+  const auto& input_array = model->GetArray(input_name);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -552,7 +704,7 @@ void ProcessAveragePoolOperator(Model* model, AveragePoolOperator* op) {
 
 void ProcessMaxPoolOperator(Model* model, MaxPoolOperator* op) {
   const string& input_name = op->inputs[0];
-  const auto& input_array = *model->arrays[input_name];
+  const auto& input_array = model->GetArray(input_name);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -569,7 +721,7 @@ void ProcessMaxPoolOperator(Model* model, MaxPoolOperator* op) {
 
 void ProcessL2PoolOperator(Model* model, L2PoolOperator* op) {
   const string& input_name = op->inputs[0];
-  const auto& input_array = *model->arrays[input_name];
+  const auto& input_array = model->GetArray(input_name);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -590,33 +742,35 @@ void ProcessResizeBilinearOperator(Model* model, ResizeBilinearOperator* op) {
   CHECK_EQ(op->inputs.size(), 2);
   CHECK_EQ(op->outputs.size(), 1);
 
-  if (!model->arrays[op->inputs[0]]->has_shape() ||
-      !model->arrays[op->inputs[1]]->has_shape()) {
+  if (!model->GetArray(op->inputs[0]).has_shape() ||
+      !model->GetArray(op->inputs[1]).has_shape()) {
     return;
   }
-  const auto& input_data_shape = model->arrays[op->inputs[0]]->shape();
+  const auto& input_data_shape = model->GetArray(op->inputs[0]).shape();
 
   const string& output_size_name = op->inputs[1];
-  const auto& output_size_array = *model->arrays[output_size_name];
+  const auto& output_size_array = model->GetArray(output_size_name);
   CHECK(output_size_array.data_type == ArrayDataType::kInt32);
   CHECK(output_size_array.has_shape());
   const auto& output_size_shape = output_size_array.shape();
   CHECK_EQ(output_size_shape.dimensions_count(), 1);
   CHECK_EQ(output_size_shape.dims(0), 2);
+  if (!output_size_array.buffer) {
+    return;
+  }
   std::vector<int32> output_shape =
       output_size_array.GetBuffer<ArrayDataType::kInt32>().data;
-  model->arrays[op->outputs[0]]->copy_shape(
-      Shape({input_data_shape.dims(0), output_shape[0], output_shape[1],
-             input_data_shape.dims(3)}));
+  model->GetArray(op->outputs[0])
+      .copy_shape(Shape({input_data_shape.dims(0), output_shape[0],
+                         output_shape[1], input_data_shape.dims(3)}));
 }
 
 void ProcessLstmCellOperator(Model* model, LstmCellOperator* op) {
-  // I/O arrays should be allocated on creation of op.
-  QCHECK_EQ(op->inputs.size(), LstmCellOperator::NUM_INPUTS);
-  QCHECK_EQ(op->outputs.size(), LstmCellOperator::NUM_OUTPUTS);
+  // Only required for compact LstmCell with default NUM_INPUTS of inputs.
+  if (op->inputs.size() != LstmCellOperator::NUM_INPUTS) return;
 
   const auto& input_array =
-      *model->arrays[op->inputs[LstmCellOperator::DATA_INPUT]];
+      model->GetArray(op->inputs[LstmCellOperator::DATA_INPUT]);
   // Yield until all input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -625,7 +779,7 @@ void ProcessLstmCellOperator(Model* model, LstmCellOperator* op) {
   CHECK_GE(input_shape.dimensions_count(), 2);
 
   const auto& prev_activ_array =
-      *model->arrays[op->inputs[LstmCellOperator::PREV_ACTIV_INPUT]];
+      model->GetArray(op->inputs[LstmCellOperator::PREV_ACTIV_INPUT]);
   // Yield until all input dims have been resolved.
   if (!prev_activ_array.has_shape()) {
     return;
@@ -634,7 +788,7 @@ void ProcessLstmCellOperator(Model* model, LstmCellOperator* op) {
   CHECK_GE(prev_activ_shape.dimensions_count(), 2);
 
   const auto& weights_array =
-      *model->arrays[op->inputs[LstmCellOperator::WEIGHTS_INPUT]];
+      model->GetArray(op->inputs[LstmCellOperator::WEIGHTS_INPUT]);
   // Yield until weights dims have been resolved.
   if (!weights_array.has_shape()) {
     return;
@@ -643,7 +797,7 @@ void ProcessLstmCellOperator(Model* model, LstmCellOperator* op) {
   CHECK_EQ(weights_shape.dimensions_count(), 2);
 
   const auto& bias_array =
-      *model->arrays[op->inputs[LstmCellOperator::BIASES_INPUT]];
+      model->GetArray(op->inputs[LstmCellOperator::BIASES_INPUT]);
   // Yield until bias dims have been resolved.
   if (!bias_array.has_shape()) {
     return;
@@ -652,7 +806,7 @@ void ProcessLstmCellOperator(Model* model, LstmCellOperator* op) {
   CHECK_GE(bias_shape.dimensions_count(), 1);
 
   const auto& prev_state_array =
-      *model->arrays[op->inputs[LstmCellOperator::PREV_STATE_INPUT]];
+      model->GetArray(op->inputs[LstmCellOperator::PREV_STATE_INPUT]);
   // Yield until all input dims have been resolved.
   if (!prev_state_array.has_shape()) {
     return;
@@ -692,18 +846,21 @@ void ProcessLstmCellOperator(Model* model, LstmCellOperator* op) {
 }
 
 void ProcessSpaceToBatchNDOperator(Model* model, SpaceToBatchNDOperator* op) {
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
   }
   const auto& input_shape = input_array.shape();
-  CHECK_EQ(input_shape.dimensions_count(), 4);
+  // This method only handles input dimensions of 4.
+  if (input_shape.dimensions_count() != 4) {
+    return;
+  }
   const auto input_height = input_shape.dims(1);
   const auto input_width = input_shape.dims(2);
 
-  const auto& block_shape_array = *model->arrays[op->inputs[1]];
-  const auto& paddings_array = *model->arrays[op->inputs[2]];
+  const auto& block_shape_array = model->GetArray(op->inputs[1]);
+  const auto& paddings_array = model->GetArray(op->inputs[2]);
   const auto& block_shape_array_shape = block_shape_array.shape();
   const auto& paddings_array_shape = paddings_array.shape();
   QCHECK_EQ(block_shape_array_shape.dimensions_count(), 1);
@@ -735,13 +892,13 @@ void ProcessSpaceToBatchNDOperator(Model* model, SpaceToBatchNDOperator* op) {
   int output_height = height_with_paddings / block_height;
   int output_width = width_with_paddings / block_width;
 
-  model->arrays[op->outputs[0]]->copy_shape(
-      Shape({input_shape.dims(0) * block_height * block_width, output_height,
-             output_width, input_shape.dims(3)}));
+  model->GetArray(op->outputs[0])
+      .copy_shape(Shape({input_shape.dims(0) * block_height * block_width,
+                         output_height, output_width, input_shape.dims(3)}));
 }
 
 void ProcessBatchToSpaceNDOperator(Model* model, BatchToSpaceNDOperator* op) {
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) {
     return;
@@ -751,8 +908,8 @@ void ProcessBatchToSpaceNDOperator(Model* model, BatchToSpaceNDOperator* op) {
   const auto input_height = input_shape.dims(1);
   const auto input_width = input_shape.dims(2);
 
-  const auto& block_shape_array = *model->arrays[op->inputs[1]];
-  const auto& crops_array = *model->arrays[op->inputs[2]];
+  const auto& block_shape_array = model->GetArray(op->inputs[1]);
+  const auto& crops_array = model->GetArray(op->inputs[2]);
   const auto& block_shape_array_shape = block_shape_array.shape();
   const auto& crops_array_shape = crops_array.shape();
   QCHECK_EQ(block_shape_array_shape.dimensions_count(), 1);
@@ -787,15 +944,15 @@ void ProcessBatchToSpaceNDOperator(Model* model, BatchToSpaceNDOperator* op) {
   int output_height = input_height * block_height;
   int output_width = input_width * block_width;
 
-  model->arrays[op->outputs[0]]->copy_shape(
-      Shape({input_shape.dims(0) / (block_height * block_width), output_height,
-             output_width, input_shape.dims(3)}));
+  model->GetArray(op->outputs[0])
+      .copy_shape(Shape({input_shape.dims(0) / (block_height * block_width),
+                         output_height, output_width, input_shape.dims(3)}));
 }
 
 void ProcessGatherOperator(Model* model, GatherOperator* op) {
-  const auto& input_array = *model->arrays[op->inputs[0]];
-  const auto& indices_array = *model->arrays[op->inputs[1]];
-  auto& output_array = *model->arrays[op->outputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  const auto& indices_array = model->GetArray(op->inputs[1]);
+  auto& output_array = model->GetArray(op->outputs[0]);
 
   // Bail if we already know the output shape.
   if (output_array.has_shape()) {
@@ -817,6 +974,7 @@ void ProcessGatherOperator(Model* model, GatherOperator* op) {
 
   // Copy the input dimensions to the output except for dimension 0,
   // where the dimension of indices_shape is used.
+  // TODO(mgubin): if axis != 0 this is not true, change when it's supported.
   auto output_dims = output_array.mutable_shape()->mutable_dims();
   output_dims->push_back(indices_shape.dims(0));
   for (int dim = 1; dim < input_shape.dimensions_count(); dim++) {
@@ -828,7 +986,7 @@ void ProcessPadOperator(Model* model, PadOperator* op) {
   CHECK_EQ(op->inputs.size(), 2);
   CHECK_EQ(op->outputs.size(), 1);
 
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
 
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) return;
@@ -836,7 +994,7 @@ void ProcessPadOperator(Model* model, PadOperator* op) {
   if (op->left_padding.empty()) return;
   CHECK_EQ(op->left_padding.size(), op->right_padding.size());
 
-  auto& output_array = *model->arrays[op->outputs[0]];
+  auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.has_shape()) return;
 
   Shape output_shape = input_array.shape();
@@ -850,47 +1008,179 @@ void ProcessPadOperator(Model* model, PadOperator* op) {
   output_array.copy_shape(output_shape);
 }
 
-void ProcessStridedSliceOperator(Model* model, StridedSliceOperator* op) {
-  CHECK_EQ(op->inputs.size(), 4);
+void ProcessRankOperator(Model* model, RankOperator* op) {
+  CHECK_GE(op->inputs.size(), 1);
   CHECK_EQ(op->outputs.size(), 1);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.has_shape()) {
+    // Shape already propagated
+    return;
+  }
 
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  if (!input_array.has_shape()) {
+    // Yield until input dims have been resolved.
+    return;
+  }
 
-  // Yield until input dims have been resolved.
-  if (!input_array.has_shape()) return;
+  // Only set the output shape. Array contents are set by
+  // ResolveConstantShapeOrRank.
+  Shape* output_shape = output_array.mutable_shape();
+  output_shape->ReplaceDims({});
+}
 
-  if (op->start_indices.empty()) return;
-  CHECK_EQ(op->start_indices.size(), op->stop_indices.size());
-  CHECK_EQ(op->start_indices.size(), op->strides.size());
+void ProcessShapeOperator(Model* model, TensorFlowShapeOperator* op) {
+  CHECK_GE(op->inputs.size(), 1);
+  CHECK_EQ(op->outputs.size(), 1);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.has_shape()) {
+    // Shape already propagated
+    return;
+  }
 
-  auto& output_array = *model->arrays[op->outputs[0]];
-  if (output_array.has_shape()) return;
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  if (!input_array.has_shape()) {
+    // Yield until input dims have been resolved.
+    return;
+  }
 
-  Shape output_shape = input_array.shape();
-  std::vector<int>& dims = *output_shape.mutable_dims();
-  CHECK_EQ(op->start_indices.size(), dims.size());
+  // Only set the output shape. Array contents are set by
+  // ResolveConstantShapeOrRank.
+  Shape* output_shape = output_array.mutable_shape();
+  output_shape->ReplaceDims({input_array.shape().dimensions_count()});
+}
 
-  for (int i = 0; i < op->start_indices.size(); ++i) {
-    const int mask = 1 << i;
-    const int start = (op->begin_mask & mask) ? 0 : op->start_indices[i];
-    const int stop = (op->end_mask & mask) ? input_array.shape().dims()[i]
-                                           : op->stop_indices[i];
-    dims[i] = (stop - start) / op->strides[i];
+void ProcessStackOperator(Model* model, StackOperator* op) {
+  CHECK_GE(op->inputs.size(), 1);
+  CHECK_EQ(op->outputs.size(), 1);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.has_shape()) {
+    // Shape already propagated
+    return;
   }
 
-  output_array.copy_shape(output_shape);
+  std::unique_ptr<Shape> stacked_shape;
+  for (const auto& input : op->inputs) {
+    const auto& input_array = model->GetArray(input);
+    if (!input_array.has_shape()) {
+      // Yield until all input dims have been resolved.
+      return;
+    }
+
+    Shape shape = input_array.shape();
+    if (shape.dimensions_count() == 0) {
+      // Convert 0D scalars to 1D scalars of shape {1}.
+      shape.mutable_dims()->push_back(1);
+    }
+    if (!stacked_shape) {
+      stacked_shape.reset(new Shape(shape));
+    } else {
+      CHECK(*stacked_shape == shape) << "All input arrays to Stack operators "
+                                        "must have the same shape. Input \""
+                                     << input << "\" is different.";
+    }
+  }
+
+  int axis = op->axis;
+  if (axis < 0) {
+    // Handle negative axis
+    axis += stacked_shape->dims().size() + 1;
+  }
+  stacked_shape->mutable_dims()->insert(
+      stacked_shape->mutable_dims()->begin() + axis, op->inputs.size());
+  output_array.copy_shape(*stacked_shape);
+}
+
+void ProcessStridedSliceOperator(Model* model, StridedSliceOperator* op) {
+  CHECK_GE(op->inputs.size(), 1);
+  CHECK_EQ(op->outputs.size(), 1);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.has_shape()) {
+    // Shape already propagated
+    return;
+  }
+
+  if (op->start_indices.empty() || op->stop_indices.empty() ||
+      op->strides.empty()) {
+    // ResolveStridedSliceAttributes has not run yet.
+    return;
+  }
+
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  if (!input_array.has_shape()) {
+    // Yield until input dims have been resolved.
+    return;
+  }
+
+  if (op->ellipsis_mask != 0) {
+    // Something like LOG_FIRST_N(WARNING, 10) would be prefferable to reduce
+    // log noise. However, the TensorFlow logging library does not appear to
+    // support this.
+    LOG(WARNING) << "Skipping StridedSlice op with output \"" << op->outputs[0]
+                 << "\". ellipsis_mask is not supported (mask="
+                 << op->ellipsis_mask << ")";
+    return;
+  }
+  if (op->new_axis_mask != 0) {
+    LOG(WARNING) << "Skipping StridedSlice op with output \"" << op->outputs[0]
+                 << "\". new_axis_mask is not supported (mask="
+                 << op->new_axis_mask << ")";
+    return;
+  }
+
+  int dim_count = input_array.shape().dimensions_count();
+  CHECK(op->start_indices.size() == dim_count)
+      << ": Incorrect number of start indices supplied to StridedSlice op with "
+         "output \""
+      << op->outputs[0] << "\". Op requires " << dim_count << " start indices";
+  CHECK(op->stop_indices.size() == dim_count)
+      << ": Incorrect number of stop indices supplied to StridedSlice op with "
+         "output \""
+      << op->outputs[0] << "\". Op requires " << dim_count << " stop indices";
+  CHECK(op->strides.size() == dim_count)
+      << ": Incorrect number of strides supplied to StridedSlice op with "
+         " output \""
+      << op->outputs[0] << "\". Op requires " << dim_count << " strides";
+
+  // Create output shape
+  std::vector<int>* dims = output_array.mutable_shape()->mutable_dims();
+
+  // Compute output shape
+  for (int i = 0; i < dim_count; ++i) {
+    const int mask = 1 << i;
+    int start = (op->begin_mask & mask) ? 0 : op->start_indices[i];
+    if (start < 0) {
+      // handle negative indices
+      start += input_array.shape().dims(i);
+    }
+    int stop = (op->end_mask & mask) ? input_array.shape().dims(i)
+                                     : op->stop_indices[i];
+    if (stop < 0) {
+      // handle negative indices
+      stop += input_array.shape().dims(i);
+    }
+
+    int dim_size = ceil((stop - start) / static_cast<float>(op->strides[i]));
+    dim_size = dim_size < 0 ? 0 : dim_size;
+    if (op->shrink_axis_mask & mask) {
+      CHECK_EQ(dim_size, 1) << "Output size for an axis must compute to 1 when "
+                               "shrinking that axis";
+    } else {
+      dims->push_back(dim_size);
+    }
+  }
 }
 
 void ProcessSqueezeOperator(Model* model, SqueezeOperator* op) {
   CHECK_EQ(op->inputs.size(), 1);
   CHECK_EQ(op->outputs.size(), 1);
 
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
 
   // Yield until input dims have been resolved.
   if (!input_array.has_shape()) return;
 
-  auto& output_array = *model->arrays[op->outputs[0]];
+  auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.has_shape()) return;
 
   const std::vector<int>& input_dims = input_array.shape().dims();
@@ -909,18 +1199,18 @@ void ProcessSqueezeOperator(Model* model, SqueezeOperator* op) {
 
 void ProcessSvdfOperator(Model* model, SvdfOperator* op) {
   CHECK(op->inputs.size() == 3 || op->inputs.size() == 4);
-  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& input_array = model->GetArray(op->inputs[0]);
   if (!input_array.has_shape()) return;
 
-  auto& weights_feature_array = *model->arrays[op->inputs[1]];
+  auto& weights_feature_array = model->GetArray(op->inputs[1]);
   if (!weights_feature_array.has_shape()) return;
 
-  const auto& weights_time_array = *model->arrays[op->inputs[2]];
+  const auto& weights_time_array = model->GetArray(op->inputs[2]);
   if (!weights_time_array.has_shape()) return;
 
   const bool has_bias = (op->inputs.size() == 4);
   if (has_bias) {
-    const auto& bias_array = *model->arrays[op->inputs[3]];
+    const auto& bias_array = model->GetArray(op->inputs[3]);
     if (!bias_array.has_shape()) return;
   }
 
@@ -935,6 +1225,74 @@ void ProcessSvdfOperator(Model* model, SvdfOperator* op) {
   auto& output_array = model->GetArray(op->outputs[1]);
   output_array.mutable_shape()->ReplaceDims({batch_size, num_units});
 }
+
+void ProcessTransposeOperator(Model* model, TransposeOperator* op) {
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.has_shape()) {
+    // We have already run
+    return;
+  }
+
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  if (!input_array.has_shape()) {
+    // Yield until input dims have been resolved.
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+
+  auto& perm_array = model->GetArray(op->inputs[1]);
+  if (!perm_array.has_shape()) {
+    // Yield until permutation shape been resolved.
+    return;
+  }
+  if (!perm_array.buffer) {
+    // Yield until the permutation is constant
+    return;
+  }
+  CHECK(perm_array.data_type == ArrayDataType::kInt32)
+      << "Transpose permutation input must be int32";
+
+  std::vector<int32> const& perm =
+      perm_array.GetBuffer<ArrayDataType::kInt32>().data;
+  CHECK_EQ(perm.size(), input_shape.dimensions_count())
+      << "Transpose permutation input " << op->inputs[0]
+      << " must be same length as input dimensions";
+  std::vector<int>* output_dims = output_array.mutable_shape()->mutable_dims();
+  for (int i = 0; i < perm.size(); i++) {
+    int axis = perm[i];
+    CHECK_GE(axis, 0);
+    CHECK_LT(axis, input_shape.dimensions_count());
+    output_dims->push_back(input_shape.dims(axis));
+  }
+}
+
+void ProcessArgMaxOperator(Model* model, ArgMaxOperator* op) {
+  CHECK_EQ(op->inputs.size(), 2);
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+
+  // The current ArgMax implementation only supports 4-dimensional inputs with
+  // the last dimension as the axis to perform ArgMax for.
+  const std::vector<int>& input_dims = input_array.shape().dims();
+  CHECK_EQ(input_dims.size(), 4);
+  std::vector<int> output_dims;
+
+  output_dims.reserve(input_dims.size() - 1);
+  for (int i = 0; i < input_dims.size() - 1; ++i) {
+    output_dims.push_back(input_dims[i]);
+  }
+  output_dims.push_back(1);
+  const string& output_name = op->outputs[0];
+  auto& output_array = model->GetArray(output_name);
+  if (output_array.has_shape()) {
+    return;
+  }
+  *output_array.mutable_shape()->mutable_dims() = output_dims;
+}
+
 }  // namespace
 
 bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
@@ -942,8 +1300,8 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
   auto* op = it->get();
   std::unordered_map<string, std::vector<int>> old_output_dims;
   for (const auto& output : op->outputs) {
-    if (model->arrays[output]->has_shape()) {
-      old_output_dims[output] = model->arrays[output]->shape().dims();
+    if (model->GetArray(output).has_shape()) {
+      old_output_dims[output] = model->GetArray(output).shape().dims();
     }
   }
 
@@ -955,11 +1313,13 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kRelu1:
     case OperatorType::kRelu6:
     case OperatorType::kSoftmax:
+    case OperatorType::kLogSoftmax:
     case OperatorType::kLogistic:
     case OperatorType::kTanh:
     case OperatorType::kLocalResponseNormalization:
     case OperatorType::kTensorFlowIdentity:
     case OperatorType::kFakeQuant:
+    case OperatorType::kNeg:
     case OperatorType::kTensorFlowRsqrt:
     case OperatorType::kTensorFlowSqrt:
     case OperatorType::kTensorFlowSquare:
@@ -967,6 +1327,7 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kTensorFlowAssert:
     case OperatorType::kCast:
     case OperatorType::kFloor:
+    case OperatorType::kExp:
       ProcessSimpleOperator(model, op);
       break;
     case OperatorType::kGather:
@@ -977,6 +1338,8 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kSub:
     case OperatorType::kMul:
     case OperatorType::kDiv:
+    case OperatorType::kFloorDiv:
+    case OperatorType::kFloorMod:
     case OperatorType::kTensorFlowLess:
     case OperatorType::kTensorFlowLessEqual:
     case OperatorType::kTensorFlowGreater:
@@ -985,9 +1348,16 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kTensorFlowGreaterEqual:
       ProcessSimpleBinaryOperator(model, op);
       break;
+    case OperatorType::kAddN:
+      ProcessAddNOperator(model, op);
+      break;
     case OperatorType::kConv:
       ProcessConvOperator(model, static_cast<ConvOperator*>(op));
       break;
+    case OperatorType::kTransposeConv:
+      // Unimplemented, hopefully another graph transformation will drop it or
+      // rewrite it.
+      break;
     case OperatorType::kDepthwiseConv:
       ProcessDepthwiseConvOperator(model,
                                    static_cast<DepthwiseConvOperator*>(op));
@@ -1000,6 +1370,9 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
       ProcessSpaceToDepthOperator(model,
                                   static_cast<SpaceToDepthOperator*>(op));
       break;
+    case OperatorType::kFill:
+      ProcessFillOperator(model, static_cast<FillOperator*>(op));
+      break;
     case OperatorType::kFullyConnected:
       ProcessFullyConnectedOperator(model,
                                     static_cast<FullyConnectedOperator*>(op));
@@ -1062,9 +1435,20 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
       // a more general non-depth concatenation that will hopefully be dropped,
       // or else at the moment we will abort.
       break;
+    case OperatorType::kExpandDims:
+      // Yield until ExpandDims is converted to Reshape
+      break;
+    case OperatorType::kRange:
+      ProcessRangeOperator(model, static_cast<RangeOperator*>(op));
+      break;
+    case OperatorType::kRank:
+      ProcessRankOperator(model, static_cast<RankOperator*>(op));
+      break;
     case OperatorType::kTensorFlowShape:
-      // Unimplemented, hopefully another graph transformation will drop it or
-      // rewrite it.
+      ProcessShapeOperator(model, static_cast<TensorFlowShapeOperator*>(op));
+      break;
+    case OperatorType::kStack:
+      ProcessStackOperator(model, static_cast<StackOperator*>(op));
       break;
     case OperatorType::kReorderAxes:
       ProcessReorderAxesOperator(model, static_cast<ReorderAxesOperator*>(op));
@@ -1080,6 +1464,7 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kLstmCell:
       ProcessLstmCellOperator(model, static_cast<LstmCellOperator*>(op));
       break;
+    case OperatorType::kBatchMatMul:
     case OperatorType::kTensorFlowMatMul:
       // MatMul operators are converted to FullyConnected, after which their
       // shapes are propagated.
@@ -1099,11 +1484,17 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
       ProcessStridedSliceOperator(model,
                                   static_cast<StridedSliceOperator*>(op));
       break;
+    case OperatorType::kArgMax:
+      ProcessArgMaxOperator(model, static_cast<ArgMaxOperator*>(op));
+      break;
     case OperatorType::kTensorFlowUnsupported:
       break;
     case OperatorType::kSvdf:
       ProcessSvdfOperator(model, static_cast<SvdfOperator*>(op));
       break;
+    case OperatorType::kTranspose:
+      ProcessTransposeOperator(model, static_cast<TransposeOperator*>(op));
+      break;
     default:
       // Unimplemented, another graph transformation should drop it.
       LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(op->type);
@@ -1112,8 +1503,10 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
   // Return true if any output dim changed, false if none changed.
   // Assumption: no transformation clears an output shape, they only add shapes.
   for (const auto& output : op->outputs) {
-    if (model->arrays[output]->has_shape() &&
-        (old_output_dims[output] != model->arrays[output]->shape().dims())) {
+    if (model->GetArray(output).has_shape() &&
+        (old_output_dims[output] != model->GetArray(output).shape().dims())) {
+      AddMessageF("Set shape of %s to [%s]", output,
+                  absl::StrJoin(model->GetArray(output).shape().dims(), ","));
       return true;
     }
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index d33597d38144278dfca66edbdd9b3da68fbaa32c..d7f804ee432598cafe6b6c05d03219aa7d2783fa 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -41,11 +41,15 @@ bool SupportsQuantization(const Operator& op) {
          type == OperatorType::kConcatenation ||
          type == OperatorType::kL2Normalization || type == OperatorType::kAdd ||
          type == OperatorType::kAveragePool || type == OperatorType::kMaxPool ||
+         type == OperatorType::kTensorFlowMinimum ||
+         type == OperatorType::kTensorFlowMaximum ||
          type == OperatorType::kLogistic || type == OperatorType::kSoftmax ||
-         type == OperatorType::kSqueeze ||
+         type == OperatorType::kTensorFlowSplit || type == OperatorType::kSub ||
+         type == OperatorType::kSqueeze || type == OperatorType::kPad ||
          type == OperatorType::kTensorFlowReshape ||
-         type == OperatorType::kMul || type == OperatorType::kSpaceToDepth ||
-         type == OperatorType::kDepthToSpace;
+         type == OperatorType::kTanh || type == OperatorType::kMul ||
+         type == OperatorType::kSpaceToDepth ||
+         type == OperatorType::kDepthToSpace || type == OperatorType::kLstmCell;
 }
 
 template <ArrayDataType A>
@@ -100,6 +104,9 @@ void QuantizeArray(GraphTransformation* transformation, Model* model,
     case ArrayDataType::kUint8:
       return QuantizeArray<ArrayDataType::kUint8>(transformation, model, name,
                                                   quantization_params);
+    case ArrayDataType::kInt16:
+      return QuantizeArray<ArrayDataType::kInt16>(transformation, model, name,
+                                                  quantization_params);
     case ArrayDataType::kInt32:
       return QuantizeArray<ArrayDataType::kInt32>(transformation, model, name,
                                                   quantization_params);
@@ -168,36 +175,62 @@ bool ChooseQuantizationForOperatorInput(
   if (array.data_type != ArrayDataType::kFloat) {
     return false;
   }
+
+  // Quantization of bias vectors
+  bool is_bias_vector = false;
+  int activations_input_index;
+  int weights_input_index;
   if (op.type == OperatorType::kConv ||
       op.type == OperatorType::kDepthwiseConv ||
       op.type == OperatorType::kFullyConnected) {
     if (input_index == 2) {
-      // Quantization of bias vector.
-      // We need both of the mandatory inputs (input activations and weights) to
-      // have
-      // been already quantized.
-      const auto& input_activations = model->GetArray(op.inputs[0]);
-      const auto& input_weights = model->GetArray(op.inputs[1]);
-      if (!input_activations.quantization_params ||
-          !input_weights.quantization_params) {
-        return false;
-      }
-      const auto input_activations_scale =
-          input_activations.quantization_params->scale;
-      const auto input_weights_scale = input_weights.quantization_params->scale;
-      quantization_params->scale =
-          input_activations_scale * input_weights_scale;
-      quantization_params->zero_point = 0;
-      *quantized_data_type = ArrayDataType::kInt32;
-      transformation->AddMessageF(
-          "Input array %s is a bias vector. Choosing quantization params "
-          "accordingly.",
-          input);
-      return true;
+      is_bias_vector = true;
+      activations_input_index = 0;
+      weights_input_index = 1;
     }
   }
+  if (op.type == OperatorType::kLstmCell) {
+    if (input_index == LstmCellOperator::BIASES_INPUT) {
+      is_bias_vector = true;
+      activations_input_index = LstmCellOperator::DATA_INPUT;
+      weights_input_index = LstmCellOperator::WEIGHTS_INPUT;
+    }
+  }
+  if (is_bias_vector) {
+    // Quantization of bias vector.
+    // We need both of the mandatory inputs (input activations and weights) to
+    // have been already quantized.
+    const auto& input_activations =
+        model->GetArray(op.inputs[activations_input_index]);
+    const auto& input_weights = model->GetArray(op.inputs[weights_input_index]);
+    if (!input_activations.quantization_params ||
+        !input_weights.quantization_params) {
+      return false;
+    }
+    const auto input_activations_scale =
+        input_activations.quantization_params->scale;
+    const auto input_weights_scale = input_weights.quantization_params->scale;
+    quantization_params->scale = input_activations_scale * input_weights_scale;
+    quantization_params->zero_point = 0;
+    *quantized_data_type = ArrayDataType::kInt32;
+    transformation->AddMessageF(
+        "Input array %s is a bias vector. Choosing quantization params "
+        "accordingly.",
+        input);
+    return true;
+  }
 
   const MinMax& minmax = GetOrComputeMinMax(model, input);
+
+  if (op.type == OperatorType::kLstmCell) {
+    if (input_index == LstmCellOperator::PREV_STATE_INPUT) {
+      GetQuantizationParamsFromMinMax<ArrayDataType::kInt16>(
+          model->flags, minmax, quantization_params);
+      *quantized_data_type = ArrayDataType::kInt16;
+      return true;
+    }
+  }
+
   GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(model->flags, minmax,
                                                          quantization_params);
   transformation->AddMessageF(
@@ -258,6 +291,17 @@ bool ChooseHardcodedQuantizationForOperatorOutput(
                                  *quantization_params));
     return true;
   }
+  if (op.type == OperatorType::kTanh) {
+    // Tanh has the range: [-1, 1].
+    *quantized_data_type = ArrayDataType::kUint8;
+    quantization_params->zero_point = 128;
+    quantization_params->scale = 1. / 128.;
+    // 0 should be exactly representable, as values will typically be centered
+    // around 0, with many values near 0.
+    CHECK(
+        IsExactlyRepresentable(0., *quantized_data_type, *quantization_params));
+    return true;
+  }
   return false;
 }
 
@@ -295,6 +339,15 @@ bool ChooseQuantizationForOperatorOutput(
     return true;
   }
   const MinMax& minmax = GetOrComputeMinMax(model, output);
+  if (op.type == OperatorType::kLstmCell) {
+    if (output_index == LstmCellOperator::STATE_OUTPUT ||
+        output_index == LstmCellOperator::ACTIV_TEMP) {
+      GetQuantizationParamsFromMinMax<ArrayDataType::kInt16>(
+          model->flags, minmax, quantization_params);
+      *quantized_data_type = ArrayDataType::kInt16;
+      return true;
+    }
+  }
   GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(model->flags, minmax,
                                                          quantization_params);
   *quantized_data_type = ArrayDataType::kUint8;
@@ -390,30 +443,52 @@ bool Quantize::Run(Model* model, std::size_t op_index) {
     if (ChooseQuantizationForOperatorInput(this, model, op, input_index,
                                            &quantized_data_type,
                                            &quantization_params)) {
-      changed = true;
       const auto& input = op.inputs[input_index];
       if (IsConstantParameterArray(*model, input)) {
         QuantizeArray(this, model, input, quantized_data_type,
                       quantization_params);
+        changed = true;
       } else {
         auto dequantize_it = FindOpWithOutput(*model, input);
-        CHECK(dequantize_it != model->operators.end());
-        auto* dequantize_op = dequantize_it->get();
-        CHECK(dequantize_op->type == OperatorType::kDequantize);
-        op.inputs[input_index] = dequantize_op->inputs[0];
-        // Check if the output of that Dequantize op was not used by any
-        // other operator. We will then erase that Dequantize op.
-        if (!CountOpsWithInput(*model, dequantize_op->outputs[0])) {
-          // If any of the model's output_arrays was pointing to the
-          // Dequantize op's output, let it point to the Dequantize op's
-          // input instead.
-          for (int i = 0; i < model->flags.output_arrays_size(); i++) {
-            if (model->flags.output_arrays(i) == dequantize_op->outputs[0]) {
-              model->flags.set_output_arrays(i, dequantize_op->inputs[0]);
+        if (dequantize_it != model->operators.end()) {
+          auto* dequantize_op = dequantize_it->get();
+          CHECK(dequantize_op->type == OperatorType::kDequantize);
+          op.inputs[input_index] = dequantize_op->inputs[0];
+          // Check if the output of that Dequantize op was not used by any
+          // other operator. We will then erase that Dequantize op.
+          if (!CountOpsWithInput(*model, dequantize_op->outputs[0])) {
+            // If any of the model's output_arrays was pointing to the
+            // Dequantize op's output, let it point to the Dequantize op's
+            // input instead.
+            for (int i = 0; i < model->flags.output_arrays_size(); i++) {
+              if (model->flags.output_arrays(i) == dequantize_op->outputs[0]) {
+                model->flags.set_output_arrays(i, dequantize_op->inputs[0]);
+              }
+            }
+            model->EraseArray(dequantize_op->outputs[0]);
+            model->operators.erase(dequantize_it);
+          }
+          changed = true;
+        } else {
+          // This input array is not produced by a Dequantize op.
+          // We have encountered this situation in RNN graphs, whose cyclic
+          // nature defeats the basic assumption underlying the quantization
+          // algorithm implemented here. For now, when we have seen this
+          // happening, the array in question was a RNN state array itself,
+          // so let us just implement this case here, and guard that assumption
+          // with a CHECK. A more general fix would involve revisiting the
+          // design of this whole Quantization transformation.
+          bool is_rnn_state_array = false;
+          for (const auto& rnn_state : model->flags.rnn_states()) {
+            if (rnn_state.state_array() == input) {
+              is_rnn_state_array = true;
+              break;
             }
           }
-          model->arrays.erase(dequantize_op->outputs[0]);
-          model->operators.erase(dequantize_it);
+          CHECK(is_rnn_state_array);
+          QuantizeArray(this, model, input, quantized_data_type,
+                        quantization_params);
+          changed = true;
         }
       }
     }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc
index 371ced388a8111c18ada32cf31a784809479291d..11f8d4b6eea836c5fe4efcbd5136e6183a59dc62 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc
@@ -80,7 +80,7 @@ bool ReadFakeQuantMinMax::Run(Model* model, std::size_t op_index) {
     // else.
     for (int i = 1; i <= 2; i++) {
       if (CountOpsWithInput(*model, fakequant_op->inputs[i]) == 1) {
-        model->arrays.erase(fakequant_op->inputs[i]);
+        model->EraseArray(fakequant_op->inputs[i]);
       }
     }
     fakequant_op->inputs.resize(1);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_final_dequantize_op.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_final_dequantize_op.cc
index 3992e7d1ef71edd4040e626d5848d2fd9bb3dab6..c3b2709a33d54213661ba96394b01aa2cfd1a278 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_final_dequantize_op.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_final_dequantize_op.cc
@@ -51,7 +51,7 @@ bool RemoveFinalDequantizeOp::Run(Model* model, std::size_t op_index) {
 
   // Remove the node and its output array.
   AddMessageF("Removed final %s", LogName(*dequantize_op));
-  model->arrays.erase(output);
+  model->EraseArray(output);
   model->operators.erase(dequantize_it);
   return true;
 }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_binary.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_binary.cc
index 6add443f2d62fd06e8c0d17e03bc78c5d74732a1..95a50c61794092b02e518d1f08d8cf4a668353a8 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_binary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_binary.cc
@@ -81,7 +81,7 @@ bool RemoveTrivialBinaryOperator::Run(Model* model, std::size_t op_index) {
   // Now check if the constant operand makes this binary
   // operator trivial.
   const auto& constant_input_array =
-      *model->arrays[binary_op->inputs[index_of_constant_input]];
+      model->GetArray(binary_op->inputs[index_of_constant_input]);
   // For now, we only handle floats here.
   if (constant_input_array.data_type != ArrayDataType::kFloat) {
     return false;
@@ -89,14 +89,14 @@ bool RemoveTrivialBinaryOperator::Run(Model* model, std::size_t op_index) {
   const auto& constant_input_float_data =
       constant_input_array.GetBuffer<ArrayDataType::kFloat>().data;
   bool is_trivial = false;
-  if (binary_op->type != OperatorType::kAdd) {
+  if (binary_op->type == OperatorType::kAdd) {
     is_trivial = AreAllBufferElementsEqualTo(constant_input_float_data, 0.f);
-  } else if (binary_op->type != OperatorType::kSub) {
+  } else if (binary_op->type == OperatorType::kSub) {
     is_trivial = index_of_constant_input == 1 &&
                  AreAllBufferElementsEqualTo(constant_input_float_data, 0.f);
-  } else if (binary_op->type != OperatorType::kMul) {
+  } else if (binary_op->type == OperatorType::kMul) {
     is_trivial = AreAllBufferElementsEqualTo(constant_input_float_data, 1.f);
-  } else if (binary_op->type != OperatorType::kDiv) {
+  } else if (binary_op->type == OperatorType::kDiv) {
     is_trivial = index_of_constant_input == 1 &&
                  AreAllBufferElementsEqualTo(constant_input_float_data, 1.f);
   }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation_input.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation_input.cc
index b6037357047fc699ffb15cb40d539be148a0b637..936854a04fd600ea23ab5dda50370f85a311c28c 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation_input.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation_input.cc
@@ -57,8 +57,9 @@ bool RemoveTrivialConcatenationInput::Run(Model* model, std::size_t op_index) {
 
   // Drop trivial inputs.
   for (const string& input : trivial_inputs) {
-    if (CountOpsWithInput(*model, input) == 1) {
-      model->arrays.erase(input);
+    if (IsDiscardableArray(*model, input) &&
+        CountOpsWithInput(*model, input) == 1) {
+      model->EraseArray(input);
     }
   }
   concat_op->inputs = nontrivial_inputs;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
index 047389f69a1d8987b52b07478b0d3eaf46f433ba..587f171bbf823408a45083c36d52f1d38c300123 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
@@ -124,7 +124,7 @@ bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
       }
     }
     if (!is_referenced) {
-      model->arrays.erase(removal_candidate);
+      model->EraseArray(removal_candidate);
     }
   }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h
index a06181ca0b5f1cbb930fa4295fec3d6adf66440d..9d448c3ee9088c16b96aa7ddc84457d2cab3231a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_REMOVE_TRIVIAL_PASSTHROUGH_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_REMOVE_TRIVIAL_PASSTHROUGH_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_REMOVE_TRIVIAL_PASSTHROUGH_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_REMOVE_TRIVIAL_PASSTHROUGH_H_
 
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/contrib/lite/toco/model.h"
@@ -54,4 +54,4 @@ bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_REMOVE_TRIVIAL_PASSTHROUGH_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_REMOVE_TRIVIAL_PASSTHROUGH_H_
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_slice.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_slice.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0cbbcd7c814d38e32ee55e9d9271adf532d20924
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_slice.cc
@@ -0,0 +1,69 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iterator>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool IsSliceTrivial(const Model& model, const Operator& op,
+                    RemoveTrivialSlice* transformation) {
+  CHECK(op.type == OperatorType::kSlice);
+
+  // Slices are trivial if they are slicing the entire input contents.
+  const auto& input_array = model.GetArray(op.inputs[0]);
+  const auto& output_array = model.GetArray(op.outputs[0]);
+  if (input_array.has_shape() && output_array.has_shape()) {
+    if (input_array.shape() == output_array.shape()) {
+      transformation->AddMessageF(
+          "%s is trivial because its input and output shapes are equal",
+          LogName(op));
+      return true;
+    }
+  }
+
+  return false;
+}
+
+}  // namespace
+
+bool RemoveTrivialSlice::Run(Model* model, std::size_t op_index) {
+  const auto reshape_it = model->operators.begin() + op_index;
+  auto* slice_op = reshape_it->get();
+  if (slice_op->type != OperatorType::kSlice) {
+    return false;
+  }
+
+  if (!IsSliceTrivial(*model, *slice_op, this)) {
+    return false;
+  }
+
+  AddMessageF("Removing trivial %s", LogName(*slice_op));
+
+  CHECK_EQ(slice_op->inputs.size(), 3);
+  return RemoveTrivialPassthroughOp(this, model, op_index);
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc
index 0ab301552ff61405cd9c2ae42ddd11805eb707e3..aa2c293382a98b476bee783ed8e177b19d35b858 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc
@@ -33,7 +33,7 @@ bool RemoveUnusedOp::Run(Model* model, std::size_t op_index) {
   // the model. We allow specifying an arbitrary input_array,
   // treating the part of the graph leading up to it as unused.
   for (const auto& output : op->outputs) {
-    CHECK(model->arrays.count(output));
+    CHECK(model->HasArray(output));
     // If this output is provided as the model's input array,
     // then we don't need this operator to produce its contents.
     if (IsInputArray(*model, output)) {
@@ -47,10 +47,7 @@ bool RemoveUnusedOp::Run(Model* model, std::size_t op_index) {
     bool found_output_as_rnn_state_array = false;
     for (const auto& rnn_state : model->flags.rnn_states()) {
       if (output == rnn_state.state_array()) {
-        CHECK(op->type == OperatorType::kTensorFlowUnsupported);
-        CHECK_EQ(static_cast<const TensorFlowUnsupportedOperator*>(op)
-                     ->tensorflow_op,
-                 "Fill");
+        CHECK(op->type == OperatorType::kFill);
         found_output_as_rnn_state_array = true;
         break;
       }
@@ -65,7 +62,12 @@ bool RemoveUnusedOp::Run(Model* model, std::size_t op_index) {
     }
     for (const auto& rnn_state : model->flags.rnn_states()) {
       if (output == rnn_state.back_edge_source_array()) {
-        return false;
+        // The output is consumed by a RNN back-edge..
+        if (!IsDiscardableArray(*model, rnn_state.back_edge_source_array()) ||
+            !IsDiscardableArray(*model, rnn_state.state_array()) ||
+            CountOpsWithInput(*model, rnn_state.state_array())) {
+          return false;
+        }
       }
     }
     if (CountOpsWithInput(*model, output)) {
@@ -91,7 +93,7 @@ bool RemoveUnusedOp::Run(Model* model, std::size_t op_index) {
     if (IsDiscardableArray(*model, input) &&
         CountOpsWithInput(*model, input) == 1 &&
         !GetOpWithOutput(*model, input)) {
-      model->arrays.erase(input);
+      model->EraseArray(input);
     }
   }
 
@@ -114,7 +116,7 @@ bool RemoveUnusedOp::Run(Model* model, std::size_t op_index) {
       continue;
     }
     // Generic case: do delete this output array.
-    model->arrays.erase(output);
+    model->EraseArray(output);
   }
   model->operators.erase(it);
   return true;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc b/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc
new file mode 100644
index 0000000000000000000000000000000000000000..30a005c789bb12e880e8e4534088d99ebacba84a
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc
@@ -0,0 +1,103 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ReorderActivationFunctions::Run(Model* model, std::size_t op_index) {
+  const auto ac_it = model->operators.begin() + op_index;
+  std::unique_ptr<Operator>& ac_op = *ac_it;
+  DCHECK(ac_op);
+
+  if (ac_op->type != OperatorType::kRelu6 &&
+      ac_op->type != OperatorType::kRelu1 &&
+      ac_op->type != OperatorType::kRelu) {
+    return false;
+  }
+
+  auto exchange_it = FindOpWithOutput(*model, ac_op->inputs[0]);
+  if (exchange_it == model->operators.end()) return false;
+  // Find the op producing the array passed to this activation function
+  std::unique_ptr<Operator>& exchange_op = *exchange_it;
+  DCHECK(exchange_op);
+
+  if (exchange_op->type != OperatorType::kTensorFlowReshape) {
+    return false;
+  }
+
+  DCHECK_EQ(exchange_op->outputs[0], ac_op->inputs[0]);
+  const auto& exchange_op_input = exchange_op->inputs[0];
+  const auto& intermediate_array = exchange_op->outputs[0];
+  const auto& ac_op_output = ac_op->outputs[0];
+
+  int count_ops_consuming_output =
+      CountOpsWithInput(*model, intermediate_array);
+  DCHECK_GE(count_ops_consuming_output, 1);
+  if (count_ops_consuming_output > 1) {
+    AddMessageF(
+        "Not exchanging activation function with %s because it is consumed by "
+        "more than 1 other operator",
+        LogName(*exchange_op));
+    return false;
+  }
+
+  // If the ac_op was originally producing an output_array we can't reorder as
+  // otherwise the output array would change. It'd be nice to still be able to
+  // reorder but if code is relying on the fetch names instead of array indices
+  // this won't work.
+  for (int i = 0; i < model->flags.output_arrays_size(); ++i) {
+    if (model->flags.output_arrays(i) == ac_op->outputs[0]) {
+      AddMessageF(
+          "Not exchanging activation function with %s to preserve output array "
+          "name %s",
+          LogName(*exchange_op), ac_op->outputs[0]);
+      return false;
+    }
+  }
+
+  // Rewire by changing inputs, including all consumers.
+  Operator* consumer = GetFirstOpWithInput(*model, ac_op_output);
+  while (consumer) {
+    for (int i = 0; i < consumer->inputs.size(); ++i) {
+      if (consumer->inputs[i] == ac_op_output) {
+        consumer->inputs[i] = intermediate_array;
+      }
+    }
+    consumer = GetFirstOpWithInput(*model, ac_op_output);
+  }
+  ac_op->inputs[0] = exchange_op_input;
+  exchange_op->inputs[0] = ac_op_output;
+
+  // Clear shapes; this will allow shape propagation to fix the sizes for us.
+  model->GetOrCreateArray(ac_op->outputs[0]).clear_shape();
+  model->GetOrCreateArray(exchange_op->outputs[0]).clear_shape();
+
+  // Finally, reorder operators.  Note that this only works when there are no
+  // other direct descendents of the exchange_op.
+  ac_op.swap(exchange_op);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc
index 3eb7fa3896c57ea612f21f8b4f3fa568d19420d4..fb109eb91b16e3a73005230f821c18b9ef82d2fb 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc
@@ -121,9 +121,9 @@ bool ResolveBatchNormalization::Run(Model* model, std::size_t op_index) {
   }
 
   // Remove the old param arrays
-  model->arrays.erase(bn_op->inputs[1]);
-  model->arrays.erase(bn_op->inputs[2]);
-  model->arrays.erase(bn_op->inputs[3]);
+  model->EraseArray(bn_op->inputs[1]);
+  model->EraseArray(bn_op->inputs[2]);
+  model->EraseArray(bn_op->inputs[3]);
 
   // Remove the old operator
   DCHECK_EQ(bn_it->get(), bn_op);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_to_space_nd_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_to_space_nd_attributes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a06919e228dc2084f8943a714a0ca111d013c159
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_to_space_nd_attributes.cc
@@ -0,0 +1,74 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ResolveBatchToSpaceNDAttributes::Run(Model* model, std::size_t op_index) {
+  const auto op_it = model->operators.begin() + op_index;
+  if (op_it->get()->type != OperatorType::kBatchToSpaceND) return false;
+
+  auto* op = static_cast<BatchToSpaceNDOperator*>(op_it->get());
+
+  // The attributes are resolved only when the 3 attributes (block_shape,
+  // before_crops, after_crops) are all constant.
+  if (!op->block_shape.empty()) {
+    return false;
+  }
+
+  CHECK_EQ(op->inputs.size(), 3);
+  if (!IsConstantParameterArray(*model, op->inputs[1]) ||
+      !IsConstantParameterArray(*model, op->inputs[2]))
+    return false;
+
+  // Handle crops
+  const auto& crops_array = model->GetArray(op->inputs[2]);
+  if (!crops_array.has_shape()) return false;
+  const std::vector<int>& crops_dims = crops_array.shape().dims();
+  if (crops_dims.size() != 2) {
+    // Code only handles crops of 2 dimensions. Perhaps another transformation
+    // will delete this op.
+    return false;
+  }
+  std::vector<int> crops_buffer =
+      crops_array.GetBuffer<ArrayDataType::kInt32>().data;
+  for (int i = 0; i < crops_dims[0]; ++i) {
+    op->before_crops.push_back(crops_buffer[i * 2]);
+    op->after_crops.push_back(crops_buffer[i * 2 + 1]);
+  }
+
+  // Handle block_shape
+  const auto& block_shape_array = model->GetArray(op->inputs[1]);
+  if (!block_shape_array.has_shape()) return false;
+  const std::vector<int>& block_shape_dims = block_shape_array.shape().dims();
+  CHECK_EQ(block_shape_dims.size(), 1);
+  std::vector<int> block_shape_buffer =
+      block_shape_array.GetBuffer<ArrayDataType::kInt32>().data;
+  for (int i = 0; i < block_shape_dims[0]; ++i) {
+    op->block_shape.push_back(block_shape_buffer[i]);
+  }
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
index 53e1be7a05807cde305eca2a7a8901f652f986f6..5e779f6765262326bd59db886c2feed603e0102e 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
@@ -141,6 +141,10 @@ void EvaluateBinaryOperatorOnConstantInputs(Model* model,
       outval = val0 - val1;
     } else if (binary_op->type == OperatorType::kDiv) {
       outval = val0 / val1;
+    } else if (binary_op->type == OperatorType::kFloorDiv) {
+      outval = floor(val0 / val1);
+    } else if (binary_op->type == OperatorType::kFloorMod) {
+      outval = val0 - (floor(val0 / val1) * val1);
     } else if (binary_op->type == OperatorType::kTensorFlowMinimum) {
       outval = std::min(val0, val1);
     } else if (binary_op->type == OperatorType::kTensorFlowMaximum) {
@@ -162,8 +166,9 @@ void EvaluateBinaryOperatorOnConstantInputs(Model* model,
 
 void EvaluateBinaryOperatorOnConstantInputs(Model* model,
                                             const Operator* binary_op) {
-  const auto inputs_data_type = model->arrays[binary_op->inputs[0]]->data_type;
-  const auto output_data_type = model->arrays[binary_op->outputs[0]]->data_type;
+  const auto inputs_data_type = model->GetArray(binary_op->inputs[0]).data_type;
+  const auto output_data_type =
+      model->GetArray(binary_op->outputs[0]).data_type;
 #define TOCO_HANDLE_CASE(InputsDataType, OutputDataType)                    \
   if (inputs_data_type == InputsDataType &&                                 \
       output_data_type == OutputDataType) {                                 \
@@ -191,6 +196,8 @@ bool ResolveConstantBinaryOperator::Run(Model* model, std::size_t op_index) {
       binary_op->type != OperatorType::kMul &&
       binary_op->type != OperatorType::kSub &&
       binary_op->type != OperatorType::kDiv &&
+      binary_op->type != OperatorType::kFloorDiv &&
+      binary_op->type != OperatorType::kFloorMod &&
       binary_op->type != OperatorType::kTensorFlowMinimum &&
       binary_op->type != OperatorType::kTensorFlowMaximum &&
       binary_op->type != OperatorType::kTensorFlowLess &&
@@ -208,7 +215,7 @@ bool ResolveConstantBinaryOperator::Run(Model* model, std::size_t op_index) {
     return false;
   }
 
-  auto& output_array = *model->arrays[binary_op->outputs[0]];
+  auto& output_array = model->GetArray(binary_op->outputs[0]);
   // Yield until the output array dims have been resolved.
   if (!output_array.has_shape()) {
     return false;
@@ -233,10 +240,10 @@ bool ResolveConstantBinaryOperator::Run(Model* model, std::size_t op_index) {
 
   // Remove the binary operator and its inputs
   if (CountOpsWithInput(*model, binary_op->inputs[0]) == 1) {
-    model->arrays.erase(binary_op->inputs[0]);
+    model->EraseArray(binary_op->inputs[0]);
   }
   if (CountOpsWithInput(*model, binary_op->inputs[1]) == 1) {
-    model->arrays.erase(binary_op->inputs[1]);
+    model->EraseArray(binary_op->inputs[1]);
   }
   AddMessageF("Resolved constant %s to the equivalent constant array",
               LogName(*binary_op));
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
index 0983c438498fed28903f8facf8db239ec1a7c2c4..064810b53e7c3bee4601204c9dbd976c374a6a60 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
@@ -73,7 +73,7 @@ void CopyTensorSegments(const std::vector<Array*>& input_arrays,
 
 // Receives a series of input arrays of type Array and an integer showing the
 // axis on which those arrays will be concatenated. It returns the concatenated
-// arrray.
+// array.
 template <ArrayDataType A>
 void ConcatenateTensorBuffers(const std::vector<Array*>& input_arrays,
                               int concatenation_axis,
@@ -151,7 +151,7 @@ bool ResolveConstantConcatenation::Run(Model* model, std::size_t op_index) {
     if (!IsDiscardableArray(*model, input_name)) return false;
   }
 
-  const int concatenation_axis = concat_op->concat_dim;
+  const int concatenation_axis = concat_op->axis;
 
   CHECK_EQ(concat_op->outputs.size(), 1);
   string concatenated_array_name = concat_op->outputs[0];
@@ -179,13 +179,20 @@ bool ResolveConstantConcatenation::Run(Model* model, std::size_t op_index) {
       ConcatenateTensorBuffers<ArrayDataType::kInt64>(
           input_arrays, concatenation_axis, &concatenated_array);
       break;
+    case ArrayDataType::kString:
+      ConcatenateTensorBuffers<ArrayDataType::kString>(
+          input_arrays, concatenation_axis, &concatenated_array);
+      break;
     default:
       LOG(FATAL) << "ArrayDataType not supported";
   }
 
   // Remove all the resolved arrays.
   for (const string& input_name : concat_op->inputs) {
-    model->arrays.erase(input_name);
+    // Check to prevent removal of shared tensors
+    if (CountOpsWithInput(*model, input_name) == 1) {
+      model->EraseArray(input_name);
+    }
   }
 
   // Remove concatenate operator
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
index 244adcc4c46eda9de79dd753565113bbeca970c5..944901ece77430708013ea4ca340a30511ba0174 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
@@ -50,6 +50,7 @@ bool ResolveConstantFakeQuant::Run(Model* model, std::size_t op_index) {
   output_array.data_type = ArrayDataType::kFloat;
   CHECK(!output_array.buffer);
   const auto& input_buffer = input_array.GetBuffer<ArrayDataType::kFloat>();
+  output_array.GetOrCreateMinMax() = *fakequant_op->minmax;
   auto& output_buffer = output_array.GetMutableBuffer<ArrayDataType::kFloat>();
   const int size = input_buffer.data.size();
   output_buffer.data.resize(size);
@@ -66,7 +67,7 @@ bool ResolveConstantFakeQuant::Run(Model* model, std::size_t op_index) {
     output_buffer.data[i] = dst_val;
   }
   if (CountOpsWithInput(*model, fakequant_op->inputs[0]) == 1) {
-    model->arrays.erase(fakequant_op->inputs[0]);
+    model->EraseArray(fakequant_op->inputs[0]);
   }
   model->operators.erase(fakequant_it);
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fill.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fill.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f6f95481b57f58f497b119df73d331f13d9705c0
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fill.cc
@@ -0,0 +1,120 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+template <ArrayDataType Type>
+bool ComputeFillArray(Model* model, FillOperator* op) {
+  const auto& val_array = model->GetArray(op->inputs[1]);
+  auto& output_array = model->GetArray(op->outputs[0]);
+
+  CHECK(val_array.data_type == Type);
+  CHECK(output_array.data_type == Type);
+
+  // Compute the array data
+  std::vector<DataType<Type>>& data =
+      output_array.GetMutableBuffer<Type>().data;
+  data.resize(RequiredBufferSizeForShape(output_array.shape()));
+  DataType<Type> fill_val = val_array.GetBuffer<Type>().data[0];
+  for (size_t i = 0; i < data.size(); i++) {
+    data[i] = fill_val;
+  }
+
+  return true;
+}
+
+bool ResolveConstantFill::Run(Model* model, std::size_t op_index) {
+  const auto fill_it = model->operators.begin() + op_index;
+  auto* base_op = fill_it->get();
+  if (base_op->type != OperatorType::kFill) {
+    return false;
+  }
+  auto* op = static_cast<FillOperator*>(base_op);
+
+  CHECK_EQ(op->inputs.size(), 2);
+  CHECK_EQ(op->outputs.size(), 1);
+
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return false;
+  }
+
+  if (!output_array.has_shape()) {
+    // Yield until the output shape has been set by PropagateFixedShapes
+    return false;
+  }
+
+  const auto& val_array = model->GetArray(op->inputs[1]);
+  if (!val_array.has_shape()) {
+    // Yield until the value shape has been resolved.
+    return false;
+  }
+  if (!IsConstantParameterArray(*model, op->inputs[1])) {
+    // Yield until the value is constant.
+    return false;
+  }
+  CHECK_EQ(RequiredBufferSizeForShape(val_array.shape()), 1);
+
+  switch (output_array.data_type) {
+    case ArrayDataType::kFloat:
+      if (!ComputeFillArray<ArrayDataType::kFloat>(model, op)) {
+        return false;
+      }
+      break;
+    case ArrayDataType::kUint8:
+      if (!ComputeFillArray<ArrayDataType::kUint8>(model, op)) {
+        return false;
+      }
+      break;
+    case ArrayDataType::kInt32:
+      if (!ComputeFillArray<ArrayDataType::kInt32>(model, op)) {
+        return false;
+      }
+      break;
+    case ArrayDataType::kInt64:
+      if (!ComputeFillArray<ArrayDataType::kInt64>(model, op)) {
+        return false;
+      }
+      break;
+    default:
+      LOG(FATAL) << "Unsupported data type given to Fill op with output \""
+                 << op->outputs[0] << "\"";
+      break;
+  }
+
+  // Erase input arrays if no longer used
+  if (IsDiscardableArray(*model, op->inputs[0]) &&
+      CountOpsWithInput(*model, op->inputs[0]) == 1) {
+    model->EraseArray(op->inputs[0]);
+  }
+  if (IsDiscardableArray(*model, op->inputs[1]) &&
+      CountOpsWithInput(*model, op->inputs[1]) == 1) {
+    model->EraseArray(op->inputs[1]);
+  }
+
+  // Erase the operator
+  model->operators.erase(fill_it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_range.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_range.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1a0ba9e2bc7235720b59210cdd6affa089613077
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_range.cc
@@ -0,0 +1,107 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ResolveConstantRange::Run(Model* model, std::size_t op_index) {
+  const auto it = model->operators.begin() + op_index;
+  auto* base_op = it->get();
+  if (base_op->type != OperatorType::kRange) {
+    return false;
+  }
+  auto* op = static_cast<RangeOperator*>(base_op);
+
+  CHECK_EQ(op->inputs.size(), 3);
+  const auto& start_array = model->GetArray(op->inputs[0]);
+  if (!start_array.has_shape()) {
+    // Yield until all input dims have been resolved.
+    return false;
+  }
+  const auto& limit_array = model->GetArray(op->inputs[1]);
+  if (!limit_array.has_shape()) {
+    // Yield until all input dims have been resolved.
+    return false;
+  }
+  const auto& delta_array = model->GetArray(op->inputs[2]);
+  if (!delta_array.has_shape()) {
+    // Yield until all input dims have been resolved.
+    return false;
+  }
+
+  for (const auto& input : op->inputs) {
+    if (!IsConstantParameterArray(*model, input)) {
+      // yield if any input is mutable
+      return false;
+    }
+  }
+
+  CHECK_EQ(op->outputs.size(), 1);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return false;
+  }
+
+  CHECK_EQ(RequiredBufferSizeForShape(start_array.shape()), 1)
+      << "Range op inputs must be scalar.";
+  CHECK_EQ(RequiredBufferSizeForShape(limit_array.shape()), 1)
+      << "Range op inputs must be scalar.";
+  CHECK_EQ(RequiredBufferSizeForShape(delta_array.shape()), 1)
+      << "Range op inputs must be scalar.";
+
+  CHECK(start_array.data_type == ArrayDataType::kInt32)
+      << "Range op inputs must be int32.";
+  CHECK(limit_array.data_type == ArrayDataType::kInt32)
+      << "Range op inputs must be int32.";
+  CHECK(delta_array.data_type == ArrayDataType::kInt32)
+      << "Range op inputs must be int32.";
+
+  // Compute buffer contents
+  int start = start_array.GetBuffer<ArrayDataType::kInt32>().data[0];
+  int limit = limit_array.GetBuffer<ArrayDataType::kInt32>().data[0];
+  int delta = delta_array.GetBuffer<ArrayDataType::kInt32>().data[0];
+  auto& buffer = output_array.GetMutableBuffer<ArrayDataType::kInt32>();
+  buffer.data.clear();
+  for (int32 val = start; val < limit; val += delta) {
+    buffer.data.push_back(val);
+  }
+  CHECK_EQ(floor((limit - start) / delta), buffer.data.size());
+  CHECK_EQ(buffer.data.size(), output_array.shape().dims()[0]);
+
+  // Delete the input array if no longer used
+  if (IsDiscardableArray(*model, op->inputs[0]) &&
+      CountOpsWithInput(*model, op->inputs[0]) == 1) {
+    model->EraseArray(op->inputs[0]);
+  }
+  if (IsDiscardableArray(*model, op->inputs[1]) &&
+      CountOpsWithInput(*model, op->inputs[1]) == 1) {
+    model->EraseArray(op->inputs[1]);
+  }
+  if (IsDiscardableArray(*model, op->inputs[2]) &&
+      CountOpsWithInput(*model, op->inputs[2]) == 1) {
+    model->EraseArray(op->inputs[2]);
+  }
+
+  // Delete the operator
+  model->operators.erase(it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_shape_or_rank.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_shape_or_rank.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9ea01acd05364224ce219bed533c999793a2a2f1
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_shape_or_rank.cc
@@ -0,0 +1,72 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ResolveConstantShapeOrRank::Run(Model* model, std::size_t op_index) {
+  const auto it = model->operators.begin() + op_index;
+  const auto* op = it->get();
+  if (!(op->type == OperatorType::kTensorFlowShape ||
+        op->type == OperatorType::kRank)) {
+    return false;
+  }
+
+  CHECK_EQ(op->outputs.size(), 1);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been resolved
+    return false;
+  }
+
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  if (!input_array.has_shape()) {
+    // Yield until the input array's shape has been resolved.
+    return false;
+  }
+
+  if (!output_array.has_shape()) {
+    // Yield until the output shape has been resolved.
+    return false;
+  }
+
+  // Compute the output
+  CHECK(!output_array.buffer);
+  auto& output_buffer = output_array.GetMutableBuffer<ArrayDataType::kInt32>();
+  if (op->type == OperatorType::kTensorFlowShape) {
+    // Copy the input shape into the output buffer.
+    output_buffer.data = input_array.shape().dims();
+  } else if (op->type == OperatorType::kRank) {
+    // Copy the dimension count into the output buffer.
+    output_buffer.data.resize(1);
+    output_buffer.data[0] = input_array.shape().dimensions_count();
+  }
+  output_array.mutable_shape()->ReplaceDims(
+      {static_cast<int>(output_buffer.data.size())});
+
+  // Delete the input array if no longer used
+  if (IsDiscardableArray(*model, op->inputs[0]) &&
+      CountOpsWithInput(*model, op->inputs[0]) == 1) {
+    model->EraseArray(op->inputs[0]);
+  }
+
+  model->operators.erase(it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_stack.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_stack.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ea0d6dc8200897db9266efbe41556dbf4c296db3
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_stack.cc
@@ -0,0 +1,113 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+template <ArrayDataType Type>
+void Stack(Model* model, StackOperator const& op) {
+  auto& output_array = model->GetArray(op.outputs[0]);
+  CHECK(output_array.data_type == Type);
+
+  // Create a buffer for the output array
+  std::vector<DataType<Type>>& output_data =
+      output_array.GetMutableBuffer<Type>().data;
+  output_data.resize(RequiredBufferSizeForShape(output_array.shape()));
+
+  // Stack inputs into buffer
+  CHECK_EQ(op.axis, 0) << "Stacking only supported along first axis";
+  int dst_offset = 0;
+  for (int i = 0; i < op.inputs.size(); i++) {
+    // Append array data to output for each input array
+    const auto& input_array = model->GetArray(op.inputs[i]);
+    int input_size = RequiredBufferSizeForShape(input_array.shape());
+    memcpy(&output_data[dst_offset], &input_array.GetBuffer<Type>().data[0],
+           input_size * sizeof(Type));
+    dst_offset += input_size;
+  }
+  CHECK_EQ(dst_offset, output_data.size());
+}
+
+}  // namespace
+
+bool ResolveConstantStack::Run(Model* model, std::size_t op_index) {
+  auto it = model->operators.begin() + op_index;
+  const auto* base_op = it->get();
+  if (base_op->type != OperatorType::kStack) {
+    return false;
+  }
+  const auto* op = static_cast<const StackOperator*>(base_op);
+
+  CHECK_GE(op->inputs.size(), 1);
+  CHECK_EQ(op->outputs.size(), 1);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return false;
+  }
+
+  if (!output_array.has_shape()) {
+    // Yield until the output shape has been set by PropagateFixedShapes
+    return false;
+  }
+
+  for (const auto& input : op->inputs) {
+    if (!IsConstantParameterArray(*model, input)) {
+      // Yield if any input is mutable
+      return false;
+    }
+  }
+
+  CHECK(!output_array.buffer);
+  switch (output_array.data_type) {
+    case ArrayDataType::kFloat:
+      Stack<ArrayDataType::kFloat>(model, *op);
+      break;
+    case ArrayDataType::kUint8:
+      Stack<ArrayDataType::kUint8>(model, *op);
+      break;
+    case ArrayDataType::kInt32:
+      Stack<ArrayDataType::kInt32>(model, *op);
+      break;
+    case ArrayDataType::kInt64:
+      Stack<ArrayDataType::kInt64>(model, *op);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported data type given to Stack op with output \""
+                 << op->outputs[0] << "\"";
+      break;
+  }
+
+  // Erase input arrays if no longer used
+  for (const auto& input : op->inputs) {
+    if (IsDiscardableArray(*model, input) &&
+        CountOpsWithInput(*model, input) == 1) {
+      model->EraseArray(input);
+    }
+  }
+
+  // Erase the operator
+  model->operators.erase(it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a0cfc3d59763dc1211ed4d1ac114d371a4a7ee0b
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
@@ -0,0 +1,198 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+int StartForAxis(StridedSliceOperator const& op, Shape const& input_shape,
+                 int axis) {
+  int start;
+  if (op.begin_mask & 1 << axis) {
+    // If begin mask bit is set, use the first element
+    start = 0;
+  } else {
+    // Otherwise, use the specified element
+    start = op.start_indices[axis];
+    if (start < 0) {
+      // Handle negative indices
+      start += input_shape.dims(axis);
+    }
+  }
+  return start;
+}
+
+int StopForAxis(StridedSliceOperator const& op, Shape const& input_shape,
+                int axis) {
+  int stop;
+  if (op.end_mask & (1 << axis)) {
+    // If end mask bit set, use the last element
+    stop = input_shape.dims(axis);
+  } else {
+    // Otherwise, use the specified element
+    stop = op.stop_indices[axis];
+    if (stop < 0) {
+      // Handle negative indices
+      stop += input_shape.dims(axis);
+    }
+  }
+  return stop;
+}
+
+template <ArrayDataType Type>
+void StridedSlice(StridedSliceOperator const& op, Array const& input_array,
+                  Array* output_array) {
+  // The TensorFlow documentation for StridedSlice is a bit ambiguous in places
+  // (https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/strided-slice).
+  // Use the source code at /third_party/tensorflow/core/util/strided_op.cc as
+  // "master documentation".
+
+  CHECK(input_array.data_type == Type);
+  CHECK(output_array->data_type == Type);
+  CHECK_EQ(op.ellipsis_mask, 0);
+  CHECK_EQ(op.new_axis_mask, 0);
+
+  int num_input_axes = op.start_indices.size();
+  CHECK_EQ(num_input_axes, op.stop_indices.size());
+  CHECK_EQ(num_input_axes, op.strides.size());
+  for (int i = 0; i < op.strides.size(); i++) {
+    CHECK_GE(op.strides[i], 0) << "Negative strides usupported";
+  }
+
+  // Create a buffer for the output array
+  std::vector<DataType<Type>>& output_data =
+      output_array->GetMutableBuffer<Type>().data;
+  output_data.resize(RequiredBufferSizeForShape(output_array->shape()));
+
+  // Initialize source coordinate
+  Shape const& input_shape = input_array.shape();
+  Buffer<Type> const& input_buffer = input_array.GetBuffer<Type>();
+  std::vector<int> src_coord(op.start_indices.size());
+  for (int axis = 0; axis < num_input_axes; axis++) {
+    src_coord[axis] = StartForAxis(op, input_shape, axis);
+  }
+
+  // In order to handle any number (N) of dimensions, we copy elements one by
+  // one and treat the source coordinate as an N digit number (src_coord here).
+  // Each "digit" is incremented individually (by the stride). When it overflows
+  // (becomes greater than the stop), that digit is reset and a carry flag is
+  // used to increment the next digit.
+  int dst_offset = 0;
+  do {
+    // Copy element.
+    output_data[dst_offset] = input_buffer.data[Offset(input_shape, src_coord)];
+
+    // Compute next source input coordinates.
+    bool carry = true;
+    for (int axis = 0; axis < num_input_axes; axis++) {
+      // Increment this axis if we carried from the previous one
+      if (carry) {
+        src_coord[axis] += op.strides[axis];
+      }
+
+      // Check if we've overflowed.
+      if (src_coord[axis] >= StopForAxis(op, input_shape, axis)) {
+        // Reset axis and set carry
+        src_coord[axis] = StartForAxis(op, input_shape, axis);
+        carry = true;
+      } else {
+        carry = false;
+      }
+    }
+    // increment destination buffer offset
+    dst_offset++;
+  } while (dst_offset < output_data.size());
+}
+
+}  // anonymous namespace
+
+bool ResolveConstantStridedSlice::Run(Model* model, std::size_t op_index) {
+  const auto it = model->operators.begin() + op_index;
+  const auto* base_op = it->get();
+  if (base_op->type != OperatorType::kStridedSlice) {
+    return false;
+  }
+
+  const StridedSliceOperator* op =
+      static_cast<const StridedSliceOperator*>(base_op);
+
+  CHECK_EQ(op->outputs.size(), 1);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return false;
+  }
+
+  if (!output_array.has_shape()) {
+    // Yield until the output shape has been set by PropagateFixedShapes
+    return false;
+  }
+
+  if (op->start_indices.empty() || op->stop_indices.empty() ||
+      op->strides.empty()) {
+    // Attributes have not resolved yet.
+    return false;
+  }
+
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  if (!input_array.has_shape()) {
+    // Yield until the value shape has been resolved.
+    return false;
+  }
+  if (!IsConstantParameterArray(*model, op->inputs[0])) {
+    // Yield until the value is constant.
+    return false;
+  }
+
+  CHECK(!output_array.buffer);
+  switch (output_array.data_type) {
+    case ArrayDataType::kFloat:
+      StridedSlice<ArrayDataType::kFloat>(*op, input_array, &output_array);
+      break;
+    case ArrayDataType::kUint8:
+      StridedSlice<ArrayDataType::kUint8>(*op, input_array, &output_array);
+      break;
+    case ArrayDataType::kInt32:
+      StridedSlice<ArrayDataType::kInt32>(*op, input_array, &output_array);
+      break;
+    case ArrayDataType::kInt64:
+      StridedSlice<ArrayDataType::kInt64>(*op, input_array, &output_array);
+      break;
+    default:
+      LOG(FATAL)
+          << "Unsupported data type input to StridedSlice op with output \""
+          << op->outputs[0] << "\"";
+      break;
+  }
+
+  // Erase input array if no longer used
+  if (IsDiscardableArray(*model, op->inputs[0]) &&
+      CountOpsWithInput(*model, op->inputs[0]) == 1) {
+    model->EraseArray(op->inputs[0]);
+  }
+
+  // Erase the operator
+  model->operators.erase(it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tensorflow_shape.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tensorflow_shape.cc
deleted file mode 100644
index 8cc6db161987bbd834212fdfed7e1f82cac958ce..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tensorflow_shape.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <cstddef>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-bool ResolveConstantTensorFlowShape::Run(Model* model, std::size_t op_index) {
-  const auto tfshape_it = model->operators.begin() + op_index;
-  const auto* tfshape_base_op = tfshape_it->get();
-  if (tfshape_base_op->type != OperatorType::kTensorFlowShape) {
-    return false;
-  }
-
-  const auto* tfshape_op =
-      static_cast<const TensorFlowShapeOperator*>(tfshape_base_op);
-
-  const auto& input_array = model->GetArray(tfshape_op->inputs[0]);
-  auto& output_array = model->GetArray(tfshape_op->outputs[0]);
-
-  // Yield until the input array's shape has been resolved.
-  if (!input_array.has_shape()) {
-    return false;
-  }
-
-  // Create a buffer for the output array, making it a constant array, and
-  // copy the input shape into the output buffer.
-  CHECK(!output_array.buffer);
-  auto& output_buffer = output_array.GetMutableBuffer<ArrayDataType::kInt32>();
-  output_buffer.data = input_array.shape().dims();
-
-  // Erase the input array if no longer used
-  if (IsDiscardableArray(*model, tfshape_op->inputs[0]) &&
-      CountOpsWithInput(*model, tfshape_op->inputs[0]) == 1) {
-    model->arrays.erase(tfshape_op->inputs[0]);
-  }
-  model->operators.erase(tfshape_it);
-
-  return true;
-}
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4f984bfde55b3457694bb411bbfdf30723c7066e
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
@@ -0,0 +1,180 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+// Transposes an array up to rank 4.
+// This is ShuffleArrayTemplate with non-enum permutation.
+template <ArrayDataType Type>
+void Transpose(Model* model, const Array& input_array,
+               const std::vector<int>& perm, Array* output_array) {
+  const Shape& input_shape = input_array.shape();
+  const std::vector<DataType<Type>>& input_data =
+      input_array.GetBuffer<Type>().data;
+
+  const Shape& output_shape = output_array->shape();
+  std::vector<DataType<Type>>& output_data =
+      output_array->GetMutableBuffer<Type>().data;
+  output_data.resize(RequiredBufferSizeForShape(output_shape));
+
+  CHECK(input_shape.dimensions_count() == output_shape.dimensions_count());
+  const int dim = input_shape.dimensions_count();
+  CHECK_LE(dim, 4);
+  CHECK(perm.size() >= dim);
+  for (int i = 0; i < dim; i++) {
+    CHECK(perm[i] >= 0 && perm[i] < dim);
+    CHECK(input_shape.dims(perm[i]) == output_shape.dims(i));
+  }
+  Shape extended_input_shape = input_shape;
+  ExtendShape(&extended_input_shape, 4);
+  Shape extended_output_shape = output_shape;
+  ExtendShape(&extended_output_shape, 4);
+  std::vector<int> extended_perm;
+  ExtendShuffle(perm, 4, &extended_perm);
+
+  const std::vector<int>& extended_input_dims = extended_input_shape.dims();
+  const std::vector<int>& extended_output_dims = extended_output_shape.dims();
+
+  // TODO(starka): Rework to handle different numbers of dimensions.
+  int input_strides[4];
+  input_strides[3] = 1;
+  input_strides[2] = extended_input_dims[3];
+  input_strides[1] = input_strides[2] * extended_input_dims[2];
+  input_strides[0] = input_strides[1] * extended_input_dims[1];
+  const int input_stride_0 = input_strides[extended_perm[3]];
+  const int input_stride_1 = input_strides[extended_perm[2]];
+  const int input_stride_2 = input_strides[extended_perm[1]];
+  const int input_stride_3 = input_strides[extended_perm[0]];
+
+  const int output_size_0 = extended_output_dims[3];
+  const int output_size_1 = extended_output_dims[2];
+  const int output_size_2 = extended_output_dims[1];
+  const int output_size_3 = extended_output_dims[0];
+  const int output_stride_0 = 1;
+  const int output_stride_1 = output_size_0;
+  const int output_stride_2 = output_stride_1 * output_size_1;
+  const int output_stride_3 = output_stride_2 * output_size_2;
+
+  for (int i3 = 0; i3 < output_size_3; i3++) {
+    const DataType<Type>* const input_ptr_3 =
+        input_data.data() + i3 * input_stride_3;
+    DataType<Type>* const output_ptr_3 =
+        output_data.data() + i3 * output_stride_3;
+    for (int i2 = 0; i2 < output_size_2; i2++) {
+      const DataType<Type>* const input_ptr_2 =
+          input_ptr_3 + i2 * input_stride_2;
+      DataType<Type>* const output_ptr_2 = output_ptr_3 + i2 * output_stride_2;
+      for (int i1 = 0; i1 < output_size_1; i1++) {
+        const DataType<Type>* input_ptr = input_ptr_2 + i1 * input_stride_1;
+        DataType<Type>* output_ptr = output_ptr_2 + i1 * output_stride_1;
+        DataType<Type>* const output_ptr_end =
+            output_ptr + output_size_0 * output_stride_0;
+        while (output_ptr != output_ptr_end) {
+          *output_ptr = *input_ptr;
+          input_ptr += input_stride_0;
+          output_ptr += output_stride_0;
+        }
+      }
+    }
+  }
+}
+
+}  // namespace
+
+bool ResolveConstantTranspose::Run(Model* model, std::size_t op_index) {
+  auto it = model->operators.begin() + op_index;
+  const auto* base_op = it->get();
+  if (base_op->type != OperatorType::kTranspose) {
+    return false;
+  }
+  const auto* op = static_cast<const TransposeOperator*>(base_op);
+
+  CHECK_EQ(op->inputs.size(), 2);
+  CHECK_EQ(op->outputs.size(), 1);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes.
+    return false;
+  }
+  if (!output_array.has_shape()) {
+    // Yield until the output shape has been set by PropagateFixedShapes.
+    return false;
+  }
+
+  // We require constant inputs.
+  if (!IsConstantParameterArray(*model, op->inputs[0]) ||
+      !IsConstantParameterArray(*model, op->inputs[1])) {
+    return false;
+  }
+  const Array& input_array = model->GetArray(op->inputs[0]);
+
+  if (input_array.minmax) {
+    output_array.GetOrCreateMinMax() = input_array.GetMinMax();
+  }
+
+  if (op->perm.empty()) {
+    // Yield until perm has been populated by ResolveTransposeAttributes.
+    return false;
+  }
+
+  // We currently only support 1-4 dimensions.
+  CHECK_LE(op->perm.size(), 4);
+
+  CHECK(!output_array.buffer);
+  switch (output_array.data_type) {
+    case ArrayDataType::kFloat:
+      Transpose<ArrayDataType::kFloat>(model, input_array, op->perm,
+                                       &output_array);
+      break;
+    case ArrayDataType::kUint8:
+      Transpose<ArrayDataType::kUint8>(model, input_array, op->perm,
+                                       &output_array);
+      break;
+    case ArrayDataType::kInt32:
+      Transpose<ArrayDataType::kInt32>(model, input_array, op->perm,
+                                       &output_array);
+      break;
+    case ArrayDataType::kInt64:
+      Transpose<ArrayDataType::kInt64>(model, input_array, op->perm,
+                                       &output_array);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported data type given to Transpose op with output \""
+                 << op->outputs[0] << "\"";
+      break;
+  }
+
+  // Erase input arrays if no longer used.
+  for (const auto& input : op->inputs) {
+    if (IsDiscardableArray(*model, input) &&
+        CountOpsWithInput(*model, input) == 1) {
+      model->EraseArray(input);
+    }
+  }
+
+  // Erase the operator.
+  model->operators.erase(it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
index bb9bda3c82cc9e9d3526efdabbb2c478fb172d80..f227554bc505efe6a758fdd9894fee43f2500641 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
@@ -32,7 +32,9 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
   const auto unary_it = model->operators.begin() + op_index;
   const auto* unary_op = unary_it->get();
   // Test for unary ops of types that we know how to resolve
-  if (unary_op->type != OperatorType::kTensorFlowRsqrt &&
+  if (unary_op->type != OperatorType::kCast &&
+      unary_op->type != OperatorType::kNeg &&
+      unary_op->type != OperatorType::kTensorFlowRsqrt &&
       unary_op->type != OperatorType::kTensorFlowSqrt &&
       unary_op->type != OperatorType::kTensorFlowSquare &&
       unary_op->type != OperatorType::kTensorFlowSum &&
@@ -56,6 +58,12 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
     }
   }
 
+  auto& output_array = model->GetArray(unary_op->outputs[0]);
+  if (!output_array.has_shape()) {
+    // Yield until the output array dims have been resolved.
+    return false;
+  }
+
   // At the moment we don't want to care about fused activation functions.
   // The idea is that we should do the present constants-propagation before
   // activation functions get fused.
@@ -67,60 +75,86 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
         LogName(*unary_op));
     return false;
   }
+
   const auto& input_array = model->GetArray(unary_op->inputs[0]);
   // We have already tested above for existence of buffers (synonymous to being
   // a constant param).
   CHECK(input_array.buffer);
-  // At the moment we only support float buffers.
-  if (input_array.buffer->type != ArrayDataType::kFloat) {
-    return false;
-  }
-  const auto& input_float_data =
-      input_array.GetBuffer<ArrayDataType::kFloat>().data;
-  // Create the float buffer on the output array, effectively turning it into
-  // a constant parameter
-  const auto& output_name = unary_op->outputs[0];
-  auto& output_array = model->GetArray(output_name);
-  // Yield until the output array dims have been resolved.
-  if (!output_array.has_shape()) {
-    return false;
+  std::vector<DataType<ArrayDataType::kFloat>> const* input_float_data;
+  if (unary_op->type == OperatorType::kCast) {
+    CastOperator const* cast_op = static_cast<CastOperator const*>(unary_op);
+    if (cast_op->dst_data_type != ArrayDataType::kFloat) {
+      AddMessageF(
+          "Not resolving constant %s because we currently only support casting "
+          "to float",
+          LogName(*unary_op));
+      return false;
+    }
+    if (cast_op->src_data_type != input_array.buffer->type) {
+      AddMessageF(
+          "Not resolving constant %s because cast op source type does not "
+          "match input type",
+          LogName(*unary_op));
+    }
+  } else {
+    if (input_array.buffer->type != ArrayDataType::kFloat) {
+      return false;
+    }
+    input_float_data = &(input_array.GetBuffer<ArrayDataType::kFloat>().data);
   }
 
-  int input_buffer_size = RequiredBufferSizeForShape(input_array.shape());
-  int output_buffer_size = RequiredBufferSizeForShape(output_array.shape());
-  const Shape& input_shape = input_array.shape();
+  // Create a float buffer on the output array, which are always constant.
   const Shape& output_shape = output_array.shape();
-
+  const int output_dims_count = output_shape.dimensions_count();
+  const int output_buffer_size = RequiredBufferSizeForShape(output_shape);
   auto& output_float_data =
       output_array.GetMutableBuffer<ArrayDataType::kFloat>().data;
   output_float_data.resize(output_buffer_size);
 
-  const int output_dims_count = output_shape.dimensions_count();
-  if (unary_op->type == OperatorType::kTensorFlowReshape) {
+  const Shape& input_shape = input_array.shape();
+  const int input_buffer_size = RequiredBufferSizeForShape(input_shape);
+  if (unary_op->type == OperatorType::kCast) {
+    for (int i = 0; i < output_buffer_size; i++) {
+      float outval = 0.0f;
+      if (input_array.buffer->type == ArrayDataType::kFloat) {
+        outval = static_cast<float>(
+            input_array.GetBuffer<ArrayDataType::kFloat>().data[i]);
+      } else if (input_array.buffer->type == ArrayDataType::kUint8) {
+        outval = static_cast<float>(
+            input_array.GetBuffer<ArrayDataType::kUint8>().data[i]);
+      } else if (input_array.buffer->type == ArrayDataType::kInt32) {
+        outval = static_cast<float>(
+            input_array.GetBuffer<ArrayDataType::kInt32>().data[i]);
+      } else if (input_array.buffer->type == ArrayDataType::kInt64) {
+        outval = static_cast<float>(
+            input_array.GetBuffer<ArrayDataType::kInt64>().data[i]);
+      } else {
+        LOG(FATAL) << "Unsupported cast op input type";
+      }
+      output_float_data[i] = outval;
+    }
+  } else if (unary_op->type == OperatorType::kTensorFlowReshape) {
     CHECK(input_buffer_size == output_buffer_size);
-    memcpy(output_float_data.data(), input_float_data.data(),
-           input_buffer_size * sizeof(input_float_data[0]));
+    memcpy(output_float_data.data(), (*input_float_data).data(),
+           output_buffer_size * sizeof(output_float_data[0]));
   } else if (unary_op->type == OperatorType::kTensorFlowSum) {
     // At the moment only full reduction across all dimensions is supported.
-    for (int i = 0; i < output_dims_count; i++) {
-      CHECK_EQ(output_shape.dims(i), 1);
-    }
     float sum = 0.f;
-    const int input_size = RequiredBufferSizeForShape(input_shape);
-    for (int i = 0; i < input_size; i++) {
-      sum += input_float_data[i];
+    for (int i = 0; i < input_buffer_size; i++) {
+      sum += (*input_float_data)[i];
+    }
+    for (int i = 0; i < output_buffer_size; ++i) {
+      output_float_data[i] = sum;
     }
-    output_float_data[0] = sum;
   } else if (unary_op->type == OperatorType::kTensorFlowMin) {
     // At the moment only full reduction across all dimensions is supported.
     // TODO(starka): Output should not be padded.
     for (int i = 0; i < output_dims_count; i++) {
       CHECK_EQ(output_shape.dims(i), 1);
     }
-    float min = input_float_data[0];
-    const int input_size = RequiredBufferSizeForShape(input_shape);
-    for (int i = 0; i < input_size; i++) {
-      min = std::min(min, input_float_data[i]);
+    float min = (*input_float_data)[0];
+    for (int i = 0; i < input_buffer_size; i++) {
+      min = std::min(min, (*input_float_data)[i]);
     }
     output_float_data[0] = min;
   } else if (unary_op->type == OperatorType::kTensorFlowMax) {
@@ -129,25 +163,26 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
     for (int i = 0; i < output_dims_count; i++) {
       CHECK_EQ(output_shape.dims(i), 1);
     }
-    float max = input_float_data[0];
-    const int input_size = RequiredBufferSizeForShape(input_shape);
-    for (int i = 0; i < input_size; i++) {
-      max = std::max(max, input_float_data[i]);
+    float max = (*input_float_data)[0];
+    for (int i = 0; i < input_buffer_size; i++) {
+      max = std::max(max, (*input_float_data)[i]);
     }
     output_float_data[0] = max;
-  } else if (unary_op->type == OperatorType::kTensorFlowRsqrt ||
+  } else if (unary_op->type == OperatorType::kNeg ||
+             unary_op->type == OperatorType::kTensorFlowRsqrt ||
              unary_op->type == OperatorType::kTensorFlowSqrt ||
              unary_op->type == OperatorType::kTensorFlowSquare) {
     // Element-wise ops. Should have perfectly matching sizes here.
-    const int input_size = RequiredBufferSizeForShape(input_shape);
     for (int i = 0; i < output_dims_count; i++) {
       CHECK_EQ(output_shape.dims(i), input_shape.dims(i));
     }
 
-    for (int i = 0; i < input_size; i++) {
-      const float val = input_float_data[i];
+    for (int i = 0; i < output_buffer_size; i++) {
+      const float val = (*input_float_data)[i];
       float outval = 0.f;
-      if (unary_op->type == OperatorType::kTensorFlowRsqrt) {
+      if (unary_op->type == OperatorType::kNeg) {
+        outval = -val;
+      } else if (unary_op->type == OperatorType::kTensorFlowRsqrt) {
         outval = 1.0f / std::sqrt(val);
       } else if (unary_op->type == OperatorType::kTensorFlowSqrt) {
         outval = std::sqrt(val);
@@ -163,7 +198,7 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
   }
   for (const auto& input : unary_op->inputs) {
     if (CountOpsWithInput(*model, input) == 1) {
-      model->arrays.erase(input);
+      model->EraseArray(input);
     }
   }
   AddMessageF("Resolved constant %s to the equivalent constant array",
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_mean_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_mean_attributes.cc
index d25c773f195cea407251bf046f0b1f1924e01968..013b50ac9ba8a51c23b19953d987b2fbf63fcea1 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_mean_attributes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_mean_attributes.cc
@@ -29,22 +29,16 @@ bool ResolveMeanAttributes::Run(Model* model, std::size_t op_index) {
   if (mean_op->type != OperatorType::kMean) return false;
   auto* op = static_cast<MeanOperator*>(mean_op);
 
-  if (!op->reduction_indices.empty()) return false;
+  if (!op->axis.empty()) {
+    // Attributes already resolved
+    return false;
+  }
   if (op->inputs.size() != 2) return false;
   if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
 
-  const auto& indices_array = *model->arrays[op->inputs[1]];
+  const auto& indices_array = model->GetArray(op->inputs[1]);
   if (!indices_array.has_shape()) return false;
-
-  op->reduction_indices = indices_array.GetBuffer<ArrayDataType::kInt32>().data;
-
-  // At the moment, we only support simultaneous reduction over width and
-  // height. This is mainly limited by the fact that currently, the runtime
-  // arrays are always 4-dimensional.
-  CHECK_EQ(op->reduction_indices.size(), 2);
-  CHECK((op->reduction_indices[0] == 1 && op->reduction_indices[1] == 2) ||
-        (op->reduction_indices[0] == 2 && op->reduction_indices[1] == 1));
-
+  op->axis = indices_array.GetBuffer<ArrayDataType::kInt32>().data;
   return true;
 }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_pad_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_pad_attributes.cc
index d5f5869c625f419a825f6bd652a04eca1bce4a6f..8a8e723cf7b2d77ec199e3817464a068bf85afdd 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_pad_attributes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_pad_attributes.cc
@@ -35,7 +35,7 @@ bool ResolvePadAttributes::Run(Model* model, std::size_t op_index) {
   CHECK_EQ(op->inputs.size(), 2);
   if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
 
-  const auto& array = *model->arrays[op->inputs[1]];
+  const auto& array = model->GetArray(op->inputs[1]);
   if (!array.has_shape()) return false;
 
   const std::vector<int>& dims = array.shape().dims();
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_reorder_axes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_reorder_axes.cc
index 8fa7b83bedc0da99c3a5a60f38586f712eeb3c4e..bc70db0bd8c26319fa140616de96452260a01058 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_reorder_axes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_reorder_axes.cc
@@ -25,6 +25,31 @@ limitations under the License.
 
 namespace toco {
 
+// Reorder the elements of an input_array according to the input_axes_order and
+// output_axes_order. Then adjust the shapes of the input and output arrays
+// accordingly. Note that input_array must have a buffer (that is, it is a
+// constant array).
+template <typename T, ArrayDataType DataType>
+void ReorderAxes(AxesOrder input_axes_order, AxesOrder output_axes_order,
+                 Array* input_array, Array* output_array) {
+  CHECK(input_array->buffer->type == DataType);
+  CHECK(!output_array->buffer);
+  auto& input_data = input_array->GetMutableBuffer<DataType>().data;
+  std::vector<T> reordered_data;
+  reordered_data.resize(RequiredBufferSizeForShape(output_array->shape()));
+  // TODO(b/62904716) Shapes should be used directly.
+  Shape input_shape = input_array->shape();
+  Shape output_shape = output_array->shape();
+  if (AxesCount(input_axes_order) == 2) {
+    UnextendShape(&input_shape, 2);
+    UnextendShape(&output_shape, 2);
+  }
+  ShuffleArray(input_shape, input_axes_order, output_axes_order, output_shape,
+               input_data.data(), reordered_data.data());
+  input_data = reordered_data;
+  input_array->copy_shape(output_array->shape());
+}
+
 bool ResolveReorderAxes::Run(Model* model, std::size_t op_index) {
   auto reorder_it = model->operators.begin() + op_index;
   auto* reorder_op = static_cast<ReorderAxesOperator*>(reorder_it->get());
@@ -35,16 +60,7 @@ bool ResolveReorderAxes::Run(Model* model, std::size_t op_index) {
   const auto& output_array_name = reorder_op->outputs[0];
   auto& input_array = model->GetArray(input_array_name);
   auto& output_array = model->GetArray(output_array_name);
-  string constant_input_array_name = input_array_name;
   if (!input_array.buffer) {
-    const auto* op_producing_input = GetOpWithOutput(*model, input_array_name);
-    if (op_producing_input &&
-        op_producing_input->type == OperatorType::kFakeQuant) {
-      constant_input_array_name = op_producing_input->inputs[0];
-    }
-  }
-  auto& constant_input_array = model->GetArray(constant_input_array_name);
-  if (!constant_input_array.buffer) {
     return false;
   }
   // Yield until output dims have been resolved.
@@ -52,26 +68,19 @@ bool ResolveReorderAxes::Run(Model* model, std::size_t op_index) {
     return false;
   }
   // Reorder the input array dims and buffer data
-  CHECK(constant_input_array.buffer->type == ArrayDataType::kFloat);
-  CHECK(!output_array.buffer);
-  auto& input_data =
-      constant_input_array.GetMutableBuffer<ArrayDataType::kFloat>().data;
-  std::vector<float> reordered_data;
-  reordered_data.resize(RequiredBufferSizeForShape(output_array.shape()));
-  const auto input_axes_order = reorder_op->input_axes_order;
-  const auto output_axes_order = reorder_op->output_axes_order;
-  // TODO(b/62904716) Shapes should be used directly.
-  Shape input_shape = constant_input_array.shape();
-  Shape output_shape = output_array.shape();
-  if (AxesCount(input_axes_order) == 2) {
-    UnextendShape(&input_shape, 2);
-    UnextendShape(&output_shape, 2);
+  if (input_array.buffer->type == ArrayDataType::kFloat) {
+    ReorderAxes<float, ArrayDataType::kFloat>(reorder_op->input_axes_order,
+                                              reorder_op->output_axes_order,
+                                              &input_array, &output_array);
+  } else if (input_array.buffer->type == ArrayDataType::kInt32) {
+    ReorderAxes<uint8, ArrayDataType::kUint8>(reorder_op->input_axes_order,
+                                              reorder_op->output_axes_order,
+                                              &input_array, &output_array);
+  } else {
+    LOG(FATAL) << "Cannot ReorderAxes unless input buffer is float or uint8.";
   }
-  ShuffleArray(input_shape, input_axes_order, output_axes_order, output_shape,
-               input_data.data(), reordered_data.data());
-  input_data = reordered_data;
+
   input_array.copy_shape(output_array.shape());
-  constant_input_array.copy_shape(output_array.shape());
 
   // Update the edges of the graph to point to the input array
   for (const auto& other_op : model->operators) {
@@ -85,7 +94,7 @@ bool ResolveReorderAxes::Run(Model* model, std::size_t op_index) {
   AddMessageF("Reordered axes for array %s", input_array_name);
 
   // Remove the op and output array.
-  model->arrays.erase(output_array_name);
+  model->EraseArray(output_array_name);
   model->operators.erase(reorder_it);
   return true;
 }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_reshape_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_reshape_attributes.cc
index bed2a85bd262c49913f22e522d260c4dc6510246..2e063e35548aa5e51c3bcc94a2dfc7992180d014 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_reshape_attributes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_reshape_attributes.cc
@@ -37,7 +37,7 @@ bool ResolveReshapeAttributes::Run(Model* model, std::size_t op_index) {
   if (!op->shape.empty()) return false;
 
   if (IsConstantParameterArray(*model, reshape_op->inputs[1])) {
-    const auto& constant_input_array = *model->arrays[reshape_op->inputs[1]];
+    const auto& constant_input_array = model->GetArray(reshape_op->inputs[1]);
     op->shape = constant_input_array.GetBuffer<ArrayDataType::kInt32>().data;
   }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_slice_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_slice_attributes.cc
index 1d0a2ec8f6c1f532f23873062534a37e07fff72b..e760d08e5a6c2f56db6b11fee922b701d33dd1a0 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_slice_attributes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_slice_attributes.cc
@@ -36,10 +36,10 @@ bool ResolveSliceAttributes::Run(Model* model, std::size_t op_index) {
   if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
   if (!IsConstantParameterArray(*model, op->inputs[2])) return false;
 
-  const auto& begin_array = *model->arrays[op->inputs[1]];
+  const auto& begin_array = model->GetArray(op->inputs[1]);
   if (!begin_array.has_shape()) return false;
 
-  const auto& size_array = *model->arrays[op->inputs[2]];
+  const auto& size_array = model->GetArray(op->inputs[2]);
   if (!size_array.has_shape()) return false;
 
   op->begin = begin_array.GetBuffer<ArrayDataType::kInt32>().data;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_space_to_batch_nd_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_space_to_batch_nd_attributes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dad6aceccfd201b3db07c29c99a8c6ef75bb89a1
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_space_to_batch_nd_attributes.cc
@@ -0,0 +1,78 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ResolveSpaceToBatchNDAttributes::Run(Model* model, std::size_t op_index) {
+  const auto op_it = model->operators.begin() + op_index;
+  if (op_it->get()->type != OperatorType::kSpaceToBatchND) return false;
+
+  auto* op = static_cast<SpaceToBatchNDOperator*>(op_it->get());
+
+  // The attributes are resolved only when the 3 attributes (block_shape,
+  // before_paddings, after_paddings) are all constant.
+  if (!op->block_shape.empty()) {
+    return false;
+  }
+
+  const int block_shape_index = 1;
+  const int paddings_index = 2;
+
+  CHECK_EQ(op->inputs.size(), 3);
+  if (!IsConstantParameterArray(*model, op->inputs[block_shape_index]) ||
+      !IsConstantParameterArray(*model, op->inputs[paddings_index]))
+    return false;
+
+  // Handle paddings.
+  const auto& paddings_array = model->GetArray(op->inputs[paddings_index]);
+  if (!paddings_array.has_shape()) return false;
+  const std::vector<int>& paddings_dims = paddings_array.shape().dims();
+  if (paddings_dims.size() != 2) {
+    // Code only handles padding of 2 dimensions. Perhaps another transformation
+    // will delete this op.
+    return false;
+  }
+  std::vector<int> paddings_buffer =
+      paddings_array.GetBuffer<ArrayDataType::kInt32>().data;
+  for (int i = 0; i < paddings_dims[0]; ++i) {
+    op->before_paddings.push_back(paddings_buffer[i * 2]);
+    op->after_paddings.push_back(paddings_buffer[i * 2 + 1]);
+  }
+
+  // Handle block_shape.
+  const auto& block_shape_array =
+      model->GetArray(op->inputs[block_shape_index]);
+  if (!block_shape_array.has_shape()) return false;
+  const std::vector<int>& block_shape_dims = block_shape_array.shape().dims();
+  CHECK_EQ(block_shape_dims.size(), 1);
+  std::vector<int> block_shape_buffer =
+      block_shape_array.GetBuffer<ArrayDataType::kInt32>().data;
+  for (int i = 0; i < block_shape_dims[0]; ++i) {
+    op->block_shape.push_back(block_shape_buffer[i]);
+  }
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_squeeze.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_squeeze_attributes.cc
similarity index 86%
rename from tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_squeeze.cc
rename to tensorflow/contrib/lite/toco/graph_transformations/resolve_squeeze_attributes.cc
index 1d3f42b5ec4cab29189c12043d12ea687d684832..dd3e73635ae0215510f0a8d1aee487da5af35700 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_squeeze.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_squeeze_attributes.cc
@@ -25,15 +25,13 @@ limitations under the License.
 
 namespace toco {
 
-bool ResolveTensorFlowSqueeze::Run(Model* model, std::size_t op_index) {
-  const auto squeeze_it = model->operators.begin() + op_index;
-  const auto* squeeze_op = squeeze_it->get();
+bool ResolveSqueezeAttributes::Run(Model* model, std::size_t op_index) {
+  auto* squeeze_op = model->operators[op_index].get();
   if (squeeze_op->type != OperatorType::kSqueeze) {
     return false;
   }
-
-  CHECK_EQ(squeeze_op->inputs.size(), 1);
-  CHECK_EQ(squeeze_op->outputs.size(), 1);
+  DCHECK_EQ(squeeze_op->inputs.size(), 1);
+  DCHECK_EQ(squeeze_op->outputs.size(), 1);
 
   // If the output is consumed by a reshape op, it's a trivial squeeze.
   if (CountOpsWithInput(*model, squeeze_op->outputs[0]) == 1) {
@@ -47,7 +45,6 @@ bool ResolveTensorFlowSqueeze::Run(Model* model, std::size_t op_index) {
       return RemoveTrivialPassthroughOp(this, model, op_index);
     }
   }
-
   return false;
 }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
index 5fc3b25bc12b0644ce2fcd3f7ee5e793791d54d5..7e8b249b07ecca551cbb75afd8007efad0b52eaf 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
@@ -12,11 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
@@ -30,33 +25,43 @@ bool ResolveStridedSliceAttributes::Run(Model* model, std::size_t op_index) {
   if (slice_op->type != OperatorType::kStridedSlice) return false;
 
   auto* op = static_cast<StridedSliceOperator*>(slice_op);
-  if (!op->start_indices.empty()) return false;
+  if (!op->start_indices.empty()) {
+    // We have already resolved these attributes
+    return false;
+  }
 
   CHECK_EQ(op->inputs.size(), 4);
-  if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
-  if (!IsConstantParameterArray(*model, op->inputs[2])) return false;
-  if (!IsConstantParameterArray(*model, op->inputs[3])) return false;
-
-  const auto& start_array = *model->arrays[op->inputs[1]];
+  const auto& start_array = model->GetArray(op->inputs[1]);
   if (!start_array.has_shape()) return false;
+  if (toco::RequiredBufferSizeForShape(start_array.shape()) > 4) {
+    // Only 1-4D arrays are supported for now.
+    return false;
+  }
 
-  const auto& stop_array = *model->arrays[op->inputs[2]];
+  const auto& stop_array = model->GetArray(op->inputs[2]);
   if (!stop_array.has_shape()) return false;
 
-  const auto& stride_array = *model->arrays[op->inputs[3]];
+  const auto& stride_array = model->GetArray(op->inputs[3]);
   if (!stride_array.has_shape()) return false;
 
+  if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
+  if (!IsConstantParameterArray(*model, op->inputs[2])) return false;
+  if (!IsConstantParameterArray(*model, op->inputs[3])) return false;
+
   op->start_indices = start_array.GetBuffer<ArrayDataType::kInt32>().data;
   op->stop_indices = stop_array.GetBuffer<ArrayDataType::kInt32>().data;
   op->strides = stride_array.GetBuffer<ArrayDataType::kInt32>().data;
 
-  // Only 4D arrays are supported for now.
-  CHECK_EQ(op->start_indices.size(), 4);
-  CHECK_EQ(op->stop_indices.size(), 4);
-  CHECK_EQ(op->strides.size(), 4);
-
-  // TODO(dkalenichenko): Delete the extra inputs?
+  CHECK_GE(op->start_indices.size(), 1);
+  CHECK_LE(op->start_indices.size(), 4);
+  CHECK_EQ(op->stop_indices.size(), op->start_indices.size());
+  CHECK_EQ(op->strides.size(), op->stop_indices.size());
 
+  // Ideally, we would remove the input arrays after they have been resolved.
+  // However, we must then reconstitute these input arrays for all supported
+  // export formats. For now, leave the arrays so we don't have to modify our
+  // exporters. Ideally, we wouldn't have op attributes, and would work directly
+  // with the input arrays.
   return true;
 }
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_concat.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_concat.cc
index b482f5cf51f7bde67e76792439203487402b75ce..5c0c1e3478fa0d94104d1b76bab176b98b314c50 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_concat.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_concat.cc
@@ -35,37 +35,36 @@ bool ResolveTensorFlowConcat::Run(Model* model, std::size_t op_index) {
 
   CHECK_GE(tf_concat_op->inputs.size(), 2);
   // TensorFlow Concat and ConcatV2 nodes only differ by the ordering
-  // of inputs: in Concat, the concat_dim is the first input, while in
+  // of inputs: in Concat,the axis is the first input, while in
   // ConcatV2, it is the last input.
-  std::size_t concat_dim_pos = 0;
+  std::size_t axis_pos = 0;
   if (tf_concat_op->type == OperatorType::kTensorFlowConcatV2) {
-    concat_dim_pos = tf_concat_op->inputs.size() - 1;
+    axis_pos = tf_concat_op->inputs.size() - 1;
   }
-  const string concat_dim_name = tf_concat_op->inputs[concat_dim_pos];
+  const string axis_name = tf_concat_op->inputs[axis_pos];
   std::vector<string> concat_input_names;
   for (std::size_t i = 0; i < tf_concat_op->inputs.size(); i++) {
-    if (i != concat_dim_pos) {
+    if (i != axis_pos) {
       concat_input_names.push_back(tf_concat_op->inputs[i]);
     }
   }
-  // If the concat_dim array hasn't been resolved to a constant yet,
+  // If the axis array hasn't been resolved to a constant yet,
   // we need to yield.
-  const auto& concat_dim_array = model->GetArray(concat_dim_name);
-  if (!concat_dim_array.buffer) {
-    AddMessageF("Waiting for the concat_dim of %s to be resolved to a constant",
+  const auto& axis_array = model->GetArray(axis_name);
+  if (!axis_array.buffer) {
+    AddMessageF("Waiting for the axis of %s to be resolved to a constant",
                 LogName(*tf_concat_op));
     return false;
   }
 
-  CHECK(concat_dim_array.data_type == ArrayDataType::kInt32);
-  const auto& concat_dim_data =
-      concat_dim_array.GetBuffer<ArrayDataType::kInt32>().data;
-  CHECK_EQ(concat_dim_data.size(), 1);
-  const int concat_dim = concat_dim_data[0];
+  CHECK(axis_array.data_type == ArrayDataType::kInt32);
+  const auto& axis_data = axis_array.GetBuffer<ArrayDataType::kInt32>().data;
+  CHECK_EQ(axis_data.size(), 1);
+  const int axis = axis_data[0];
 
   // Create the Concatenation op replacing the TensorFlowConcat op.
   auto* concatenation_op = new ConcatenationOperator;
-  concatenation_op->concat_dim = concat_dim;
+  concatenation_op->axis = axis;
   concatenation_op->inputs = concat_input_names;
   concatenation_op->outputs = {tf_concat_op->outputs[0]};
   auto depth_concat_it = model->operators.emplace(concat_it, concatenation_op);
@@ -74,9 +73,9 @@ bool ResolveTensorFlowConcat::Run(Model* model, std::size_t op_index) {
   concat_it = depth_concat_it + 1;
   CHECK_EQ(concat_it->get(), tf_concat_op);
 
-  // Remove the concat_dim array if it is not used by anything else.
-  if (CountOpsWithInput(*model, concat_dim_name) == 1) {
-    model->arrays.erase(concat_dim_name);
+  // Remove the axis array if it is not used by anything else.
+  if (CountOpsWithInput(*model, axis_name) == 1) {
+    model->EraseArray(axis_name);
   }
   // Remove the TensorFlowConcat op
   model->operators.erase(concat_it);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
index bea7487051a58344a56a3186a05d0fdceebc8727..f38203c80fcb7ab8bc1639129fd98e4e342e5cb7 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
@@ -29,7 +29,36 @@ bool ResolveTensorFlowMatMul::Run(Model* model, std::size_t op_index) {
   if (matmul_it->get()->type != OperatorType::kTensorFlowMatMul) {
     return false;
   }
-  const auto* matmul_op = matmul_it->get();
+  const auto* matmul_op =
+      static_cast<const TensorFlowMatMulOperator*>(matmul_it->get());
+
+  // Reorder the axes on the second input. TensorFlow uses row-major ordering
+  // on both inputs, however this is inefficient for the FullyConnected
+  // operator. We'll transpose the second input to be in column-major order now
+  // and let constant propagation optimize things (if possible).
+  auto* transpose_op = new TransposeOperator;
+  transpose_op->inputs = {
+      matmul_op->inputs[1],
+      CreateInt32Array(
+          model,
+          AvailableArrayName(*model, matmul_op->inputs[1] + "/transpose/perm"),
+          {1, 0})};
+  transpose_op->outputs = {
+      AvailableArrayName(*model, matmul_op->inputs[1] + "/transpose")};
+  model->GetOrCreateArray(transpose_op->outputs[0]);
+  model->operators.emplace(matmul_it, transpose_op);
+
+  // Refresh iterator.
+  matmul_it = model->operators.begin();
+  for (; matmul_it != model->operators.end(); ++matmul_it) {
+    if (matmul_it->get() == matmul_op) {
+      break;
+    }
+  }
+  DCHECK_EQ(matmul_it->get(), matmul_op);
+
+  string input_lhs = matmul_op->inputs[0];
+  string input_rhs = transpose_op->outputs[0];
 
   // Find the op producing the array passed to this MatMul
   auto previous_op_it = model->operators.begin();
@@ -47,38 +76,42 @@ bool ResolveTensorFlowMatMul::Run(Model* model, std::size_t op_index) {
   }
   Operator* previous_op = (found) ? previous_op_it->get() : nullptr;
 
-  // construct the new FullyConnectedOperator
+  // Construct the new FullyConnectedOperator.
   auto* fc_op = new FullyConnectedOperator;
   fc_op->outputs = matmul_op->outputs;
 
-  // insert the newly constructed FullyConnectedOperator
-  auto fc_it = model->operators.emplace(matmul_it, fc_op);
+  // Insert the newly constructed FullyConnectedOperator.
+  model->operators.emplace(matmul_it, fc_op) + 1;
 
-  // refresh invalidated iterator
-  matmul_it = fc_it + 1;
+  // Refresh iterator.
+  matmul_it = model->operators.begin();
+  for (; matmul_it != model->operators.end(); ++matmul_it) {
+    if (matmul_it->get() == matmul_op) {
+      break;
+    }
+  }
   DCHECK_EQ(matmul_it->get(), matmul_op);
 
   // The way that TensorFlow encodes FullyConnected ops is as a pair
   // (Reshape, MatMul), so we want to remove the Reshape op and rewrite the
-  // MatMul
-  // op as a FullyConnected. However, TensorFlow skips the Reshape ops if the
-  // input doesn't need reshaping, so we can't just match (Reshape, MatMul)
+  // MatMul op as a FullyConnected. However, TensorFlow skips the Reshape ops if
+  // the input doesn't need reshaping, so we can't just match (Reshape, MatMul)
   // pairs.
   if (previous_op && previous_op->type == OperatorType::kTensorFlowReshape) {
     AddMessageF("Combining %s and %s into %s", LogName(*previous_op),
                 LogName(*matmul_op), LogName(*fc_op));
     const auto& previous_op_output = previous_op->outputs[0];
     if (CountOpsWithInput(*model, previous_op_output) == 1) {
-      model->arrays.erase(previous_op_output);
+      model->EraseArray(previous_op_output);
     }
     CHECK_EQ(previous_op->inputs.size(), 2);
-    fc_op->inputs = {previous_op->inputs[0], matmul_op->inputs[1]};
+    input_lhs = previous_op->inputs[0];
     // Only remove Reshape node if no other node uses its output.
     if (CountOpsWithInput(*model, previous_op_output) == 1) {
       const auto& previous_op_shape = previous_op->inputs[1];
       if (CountOpsWithInput(*model, previous_op_shape) == 1 &&
           !GetOpWithOutput(*model, previous_op_shape)) {
-        model->arrays.erase(previous_op_shape);
+        model->EraseArray(previous_op_shape);
       }
       model->operators.erase(previous_op_it);
     }
@@ -95,9 +128,10 @@ bool ResolveTensorFlowMatMul::Run(Model* model, std::size_t op_index) {
   } else {
     AddMessageF("Replacing %s by a FullyConnected operator",
                 LogName(*matmul_op));
-    fc_op->inputs = {matmul_op->inputs[0], matmul_op->inputs[1]};
   }
 
+  fc_op->inputs = {input_lhs, input_rhs};
+
   // erase the MatMul operator
   model->operators.erase(matmul_it);
   return true;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
index cfa5ce0716523adbfb0a76e89ce3b202f0595763..477e7f13da3d88a68547d494011cd4984936b909 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
@@ -55,7 +55,7 @@ bool ResolveTensorFlowMerge::Run(Model* model, std::size_t op_index) {
 
   // Remove the node and its output array.
   AddMessageF("Removing already-resolved %s", LogName(*merge_op));
-  model->arrays.erase(merge_op->outputs[0]);
+  model->EraseArray(merge_op->outputs[0]);
   model->operators.erase(merge_it);
   return true;
 }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc
index 55adfca03739deb35cbeb50c67222768f8a02164..a418073441f1241a5acb1164b36f332828ea2e99 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc
@@ -59,7 +59,7 @@ bool ResolveTensorFlowSwitch::Run(Model* model, std::size_t op_index) {
   // From the TensorFlow docs on .switch() in
   // third_party/tensorflow/python/ops/control_flow_ops.py
   //
-  //    If `pred` is false, the `data` input is forwared to the first output.
+  //    If `pred` is false, the `data` input is forwarded to the first output.
   //    Otherwise, the data goes to the second output.
   //
   // Note that this comment used to say the opposite and was recently fixed:
@@ -103,7 +103,7 @@ bool ResolveTensorFlowSwitch::Run(Model* model, std::size_t op_index) {
   // Remove the output arrays if they are now unused.
   for (int i = 0; i < 2; i++) {
     if (!GetOpWithInput(*model, switch_op->outputs[i])) {
-      model->arrays.erase(switch_op->outputs[i]);
+      model->EraseArray(switch_op->outputs[i]);
     }
   }
   // Remove input arrays if they are only used by the switch itself and aren't
@@ -111,7 +111,7 @@ bool ResolveTensorFlowSwitch::Run(Model* model, std::size_t op_index) {
   for (const auto& input : switch_op->inputs) {
     if (CountOpsWithInput(*model, input) == 1 &&
         !GetOpWithOutput(*model, input)) {
-      model->arrays.erase(input);
+      model->EraseArray(input);
     }
   }
   // Remove the switch node itself.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_tile.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_tile.cc
index 9f7e7c42a26b60c96573be6653babb78fdb5fd73..1ddf54c778cd1fae7a8fce0ecb97209274e71ac0 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_tile.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_tile.cc
@@ -45,10 +45,10 @@ void RemoveTileOperator(Model* model, Operator* tile_op, Operator* binary_op,
   model->operators.erase(tile_it);
   if (!CountOpsWithInput(*model, tile_multiplier_array) &&
       !GetOpWithOutput(*model, tile_multiplier_array)) {
-    model->arrays.erase(tile_multiplier_array);
+    model->EraseArray(tile_multiplier_array);
   }
   if (!CountOpsWithInput(*model, tile_output_array)) {
-    model->arrays.erase(tile_output_array);
+    model->EraseArray(tile_output_array);
   }
 }
 }  // namespace
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_transpose_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_transpose_attributes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a657ee00af66bd431f96c361e12d5213e203b3df
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_transpose_attributes.cc
@@ -0,0 +1,53 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ResolveTransposeAttributes::Run(Model* model, std::size_t op_index) {
+  const auto op_it = model->operators.begin() + op_index;
+  if (op_it->get()->type != OperatorType::kTranspose) return false;
+
+  auto* op = static_cast<TransposeOperator*>(op_it->get());
+  if (!op->perm.empty()) return false;
+
+  CHECK_EQ(op->inputs.size(), 2);
+  if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
+
+  // Handling perm.
+  const auto& perm_array = model->GetArray(op->inputs[1]);
+  if (!perm_array.has_shape()) return false;
+
+  const std::vector<int>& perm_dims = perm_array.shape().dims();
+  CHECK_EQ(perm_dims.size(), 1);
+
+  std::vector<int> perm_buffer =
+      perm_array.GetBuffer<ArrayDataType::kInt32>().data;
+  for (int i = 0; i < perm_dims[0]; ++i) {
+    op->perm.push_back(perm_buffer[i]);
+  }
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD b/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
index 893149878293c9ef2740effe331d3b6c51b49983..2f94f9cd8a9ab24809fb3d137b5d05ab12f43003 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
+++ b/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
@@ -18,6 +18,17 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "lstm_utils_test",
+    srcs = ["lstm_utils_test.cc"],
+    deps = [
+        "//tensorflow/contrib/lite/toco:graph_transformations",
+        "//tensorflow/contrib/lite/toco:model",
+        "//tensorflow/contrib/lite/toco:tooling_util",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/lstm_utils_test.cc b/tensorflow/contrib/lite/toco/graph_transformations/tests/lstm_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6aae0775d3445daf7d990bcce09d335c5f686601
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/tests/lstm_utils_test.cc
@@ -0,0 +1,442 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <tuple>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+
+namespace toco {
+
+namespace {
+
+// A gmock matcher that check that elements of a float vector match to a given
+// tolerance.
+std::vector<testing::Matcher<float>> ArrayFloatNear(
+    const std::vector<float>& values, float max_abs_error = 1e-5) {
+  std::vector<testing::Matcher<float>> matchers;
+  matchers.reserve(values.size());
+  for (const float& v : values) {
+    matchers.emplace_back(testing::FloatNear(v, max_abs_error));
+  }
+  return matchers;
+}
+}  // namespace
+
+class CopyArrayDataTest : public ::testing::Test {
+ public:
+  CopyArrayDataTest() {}
+
+  void PrepareBuffers(Model* model, std::initializer_list<float> src_data,
+                      int src_dim_1, int src_dim_2,
+                      std::initializer_list<float> dst_data, int dst_dim_1,
+                      int dst_dim_2) {
+    string src_array = "src_array";
+    src_buffer_ = CreateFloatArrayBuffer(
+        model, &src_array,
+        src_dim_2 == 1 ? Shape({src_dim_1}) : Shape({src_dim_1, src_dim_2}));
+    PopulateBuffer(src_buffer_, src_data);
+    string dst_array = "dst_array";
+    dst_buffer_ = CreateFloatArrayBuffer(
+        model, &dst_array,
+        dst_dim_2 == 1 ? Shape({dst_dim_1}) : Shape({dst_dim_1, dst_dim_2}));
+    PopulateBuffer(dst_buffer_, dst_data);
+  }
+
+  Buffer<ArrayDataType::kFloat>* GetSrcBuffer() { return src_buffer_; }
+  Buffer<ArrayDataType::kFloat>* GetDstBuffer() { return dst_buffer_; }
+
+  void PopulateBuffer(Buffer<ArrayDataType::kFloat>* buffer,
+                      const std::vector<float>& init_data) {
+    for (int i = 0; i < init_data.size(); i++) {
+      buffer->data[i] = init_data[i];
+    }
+  }
+  void UpdateBuffer(Buffer<ArrayDataType::kFloat>* buffer,
+                    std::initializer_list<float> data) {
+    buffer->data.resize(data.size());
+    PopulateBuffer(buffer, data);
+  }
+
+ private:
+  Buffer<ArrayDataType::kFloat>* src_buffer_;
+  Buffer<ArrayDataType::kFloat>* dst_buffer_;
+};
+
+// Copy from 1 big 2D array to 8 smaller ones.
+TEST_F(CopyArrayDataTest, CopyFromBigArrayToSmallerArrayes2D) {
+  // Init src_buffer, dst_buffer.
+  Model model;
+  std::initializer_list<float> large_tf_weight_data = {
+      -0.320407, -0.108683, 0.406358,  -0.410811, -0.285786, -0.15769,
+      -0.194201, 0.170866,  0.084135,  0.201878,  0.21519,   -0.284458,
+      0.495906,  -0.073818, 0.045578,  0.149816,  -0.447073, -0.453578,
+      0.116766,  0.21808,   0.047326,  -0.001985, 0.402193,  0.315517,
+      0.38258,   0.43599,   0.11986,   0.465195,  0.33548,   -0.118789,
+      -0.414159, 0.049269,  0.156108,  0.093459,  -0.129103, -0.086274,
+      0.186188,  -0.324923, 0.4117,    -0.344439, 0.240465,  -0.343331,
+      -0.463082, -0.231706, -0.487465, -0.186592, -0.020756, -0.239007,
+      0.364817,  0.459106,  -0.171447, -0.006542, 0.204032,  -0.375317,
+      -0.041911, 0.051664,  0.320483,  0.155899,  0.156555,  -0.249823,
+      -0.353107, 0.031563,  -0.340771, -0.052532, 0.134631,  -0.257957,
+      -0.50141,  0.486939,  -0.43853,  0.268426,  -0.08754,  -0.109447,
+      -0.502462, -0.028055, -0.121838, -0.046016, 0.105309,  -0.070774,
+      0.495683,  -0.475088, 0.048654,  -0.38582,  0.411018,  -0.315606,
+      0.349628,  0.21698,   0.258989,  -0.097902, 0.331218,  0.034602,
+      0.418069,  -0.089025, -0.417513, 0.07609,   0.393821,  0.404733,
+      -0.055418, -0.43903,  -0.447049, 0.013125,  0.278503,  0.459869,
+      0.143755,  -0.177335, -0.162247, -0.432371, 0.153714,  -0.047403,
+      -0.446775, -0.418363, 0.019743,  0.042025};
+  std::initializer_list<float> tflite_lstm_input_weight = {0, 0, 0, 0, 0, 0,
+                                                           0, 0, 0, 0, 0, 0};
+  PrepareBuffers(&model, large_tf_weight_data, /*src_dim_1=*/16,
+                 /*src_dim_2=*/7, tflite_lstm_input_weight,
+                 /*dst_dim_1=*/4, /*dst_dim_2=*/3);
+
+  // Copy src starts at (0,0), size (4,3).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/7, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/3,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/3);
+  std::vector<float> expected = {-0.320407, -0.108683, 0.406358, 0.170866,
+                                 0.084135,  0.201878,  0.045578, 0.149816,
+                                 -0.447073, -0.001985, 0.402193, 0.315517};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+
+  // Copy src starts at (4,0), size (4,3).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/7, /*src_start_idx1=*/4,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/3,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/3);
+  expected = {0.33548,   -0.118789, -0.414159, -0.086274, 0.186188,  -0.324923,
+              -0.463082, -0.231706, -0.487465, 0.459106,  -0.171447, -0.006542};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+
+  // Copy src starts at (8,0), size (4,3).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/7, /*src_start_idx1=*/8,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/3,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/3);
+  expected = {0.320483, 0.155899,  0.156555,  -0.052532, 0.134631, -0.257957,
+              -0.08754, -0.109447, -0.502462, -0.070774, 0.495683, -0.475088};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+
+  // Copy src starts at (12,0), size (4,3).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/7, /*src_start_idx1=*/12,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/3,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/3);
+  expected = {0.349628,  0.21698,  0.258989, -0.089025, -0.417513, 0.07609,
+              -0.447049, 0.013125, 0.278503, -0.432371, 0.153714,  -0.047403};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+
+  // New dst_buffer with size 16.
+  std::initializer_list<float> tflite_lstm_recurrent_weight = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  PrepareBuffers(&model, large_tf_weight_data, /*src_dim_1=*/16,
+                 /*src_dim_2=*/7, tflite_lstm_recurrent_weight,
+                 /*dst_dim_1=*/4, /*dst_dim_2=*/4);
+
+  // Copy src starts at (0,3), size (4,4).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/7, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/3, GetDstBuffer(), /*dst_stride=*/4,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/4);
+  expected = {-0.410811, -0.285786, -0.15769,  -0.194201, 0.21519, -0.284458,
+              0.495906,  -0.073818, -0.453578, 0.116766,  0.21808, 0.047326,
+              0.38258,   0.43599,   0.11986,   0.465195};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+
+  // Copy src starts at (4,3), size (4,4).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/7, /*src_start_idx1=*/4,
+                /*src_start_idx2=*/3, GetDstBuffer(), /*dst_stride=*/4,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/4);
+  expected = {0.049269, 0.156108,  0.093459,  -0.129103, 0.4117,    -0.344439,
+              0.240465, -0.343331, -0.186592, -0.020756, -0.239007, 0.364817,
+              0.204032, -0.375317, -0.041911, 0.051664};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+
+  // Copy src starts at (8,3), size (4,4).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/7, /*src_start_idx1=*/8,
+                /*src_start_idx2=*/3, GetDstBuffer(), /*dst_stride=*/4,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/4);
+  expected = {-0.249823, -0.353107, 0.031563,  -0.340771, -0.50141,  0.486939,
+              -0.43853,  0.268426,  -0.028055, -0.121838, -0.046016, 0.105309,
+              0.048654,  -0.38582,  0.411018,  -0.315606};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+
+  // Copy src starts at (12,3), size (4,4).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/7, /*src_start_idx1=*/12,
+                /*src_start_idx2=*/3, GetDstBuffer(), /*dst_stride=*/4,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/4);
+  expected = {-0.097902, 0.331218,  0.034602, 0.418069, 0.393821,  0.404733,
+              -0.055418, -0.43903,  0.459869, 0.143755, -0.177335, -0.162247,
+              -0.446775, -0.418363, 0.019743, 0.042025};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+}
+
+// Copy from 1 big 1D array to 4 small ones.
+TEST_F(CopyArrayDataTest, CopyFromBigArrayToSmallerArrayes1D) {
+  // Init src_buffer, dst_buffer.
+  Model model;
+  std::initializer_list<float> large_tf_bias_data = {
+      0.980304, 0.419808, 0.080278, 0.728548, 0.581674, 0.672433,
+      0.434190, 0.844357, 0.229587, 0.785629, 0.022065, 0.753082,
+      0.422080, 0.539481, 0.878386, 0.168965};
+  std::initializer_list<float> tflite_lstm_i_bias = {0, 0, 0, 0};
+  PrepareBuffers(&model, large_tf_bias_data, /*src_dim_1=*/16,
+                 /*src_dim_2=*/1, tflite_lstm_i_bias,
+                 /*dst_dim_1=*/4, /*dst_dim_2=*/1);
+
+  // Copy starts at (0,), size (4,).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/1, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/1,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/1);
+  std::vector<float> expected = {0.980304, 0.419808, 0.080278, 0.728548};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+
+  // Copy starts at (4,), size (4,).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/1, /*src_start_idx1=*/4,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/1,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/1);
+  expected = {0.581674, 0.672433, 0.434190, 0.844357};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+
+  // Copy starts at (8,), size (4,).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/1, /*src_start_idx1=*/8,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/1,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/1);
+  expected = {0.229587, 0.785629, 0.022065, 0.753082};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+
+  // Copy starts at (12,), size (4,).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/1, /*src_start_idx1=*/12,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/1,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/1);
+  expected = {0.422080, 0.539481, 0.878386, 0.168965};
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+}
+
+// Copy from 8 small 2D arrayes to 1 big one.
+TEST_F(CopyArrayDataTest, CopyFromSmallArrayesToBigArray2D) {
+  // Init src_buffer, dst_buffer.
+  Model model;
+  std::initializer_list<float> large_tf_weights_data = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  // Copy dst starts (0, 0), size (4, 3).
+  std::initializer_list<float> tflite_lstm_i2i_weight = {
+      -0.320407, -0.108683, 0.406358,  0.170866,  0.084135, 0.201878,
+      0.045578,  0.149816,  -0.447073, -0.001985, 0.402193, 0.315517};
+  PrepareBuffers(&model, tflite_lstm_i2i_weight, /*src_dim_1=*/4,
+                 /*src_dim_2=*/3, large_tf_weights_data,
+                 /*dst_dim_1=*/16, /*dst_dim_2=*/7);
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/3, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/7,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/3);
+
+  // Copy dst starts (4, 0), size (4, 3).
+  std::initializer_list<float> tflite_lstm_i2c_weight = {
+      0.33548,   -0.118789, -0.414159, -0.086274, 0.186188,  -0.324923,
+      -0.463082, -0.231706, -0.487465, 0.459106,  -0.171447, -0.006542};
+  PopulateBuffer(GetSrcBuffer(), tflite_lstm_i2c_weight);
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/3, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/7,
+                /*dst_start_idx1=*/4, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/3);
+
+  // Copy dst starts (8, 0), size (4, 3).
+  std::initializer_list<float> tflite_lstm_i2f_weight = {
+      0.320483, 0.155899,  0.156555,  -0.052532, 0.134631, -0.257957,
+      -0.08754, -0.109447, -0.502462, -0.070774, 0.495683, -0.475088};
+  PopulateBuffer(GetSrcBuffer(), tflite_lstm_i2f_weight);
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/3, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/7,
+                /*dst_start_idx1=*/8, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/3);
+
+  // Copy dst starts (12, 0), size (4, 3).
+  std::initializer_list<float> tflite_lstm_i2o_weight = {
+      0.349628,  0.21698,  0.258989, -0.089025, -0.417513, 0.07609,
+      -0.447049, 0.013125, 0.278503, -0.432371, 0.153714,  -0.047403};
+  PopulateBuffer(GetSrcBuffer(), tflite_lstm_i2o_weight);
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/3, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/7,
+                /*dst_start_idx1=*/12, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/3);
+
+  // Copy dst starts (0, 3), size (4, 4).
+  std::initializer_list<float> tflite_lstm_i2r_weight = {
+      -0.410811, -0.285786, -0.15769,  -0.194201, 0.21519, -0.284458,
+      0.495906,  -0.073818, -0.453578, 0.116766,  0.21808, 0.047326,
+      0.38258,   0.43599,   0.11986,   0.465195};
+  UpdateBuffer(GetSrcBuffer(), tflite_lstm_i2r_weight);
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/4, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/7,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/3,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/4);
+
+  // Copy dst starts (4, 3), size (4, 4).
+  std::initializer_list<float> tflite_lstm_c2r_weight = {
+      0.049269, 0.156108,  0.093459,  -0.129103, 0.4117,    -0.344439,
+      0.240465, -0.343331, -0.186592, -0.020756, -0.239007, 0.364817,
+      0.204032, -0.375317, -0.041911, 0.051664};
+  PopulateBuffer(GetSrcBuffer(), tflite_lstm_c2r_weight);
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/4, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/7,
+                /*dst_start_idx1=*/4, /*dst_start_idx2=*/3,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/4);
+
+  // Copy dst starts (8, 3), size (4, 4).
+  std::initializer_list<float> tflite_lstm_f2r_weight = {
+      -0.249823, -0.353107, 0.031563,  -0.340771, -0.50141,  0.486939,
+      -0.43853,  0.268426,  -0.028055, -0.121838, -0.046016, 0.105309,
+      0.048654,  -0.38582,  0.411018,  -0.315606};
+  PopulateBuffer(GetSrcBuffer(), tflite_lstm_f2r_weight);
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/4, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/7,
+                /*dst_start_idx1=*/8, /*dst_start_idx2=*/3,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/4);
+
+  // Copy dst starts (12, 3), size (4, 4).
+  std::initializer_list<float> tflite_lstm_o2r_weight = {
+      -0.097902, 0.331218,  0.034602, 0.418069, 0.393821,  0.404733,
+      -0.055418, -0.43903,  0.459869, 0.143755, -0.177335, -0.162247,
+      -0.446775, -0.418363, 0.019743, 0.042025};
+  PopulateBuffer(GetSrcBuffer(), tflite_lstm_o2r_weight);
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/4, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/7,
+                /*dst_start_idx1=*/12, /*dst_start_idx2=*/3,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/4);
+
+  std::vector<float> expected = {
+      -0.320407, -0.108683, 0.406358,  -0.410811, -0.285786, -0.15769,
+      -0.194201, 0.170866,  0.084135,  0.201878,  0.21519,   -0.284458,
+      0.495906,  -0.073818, 0.045578,  0.149816,  -0.447073, -0.453578,
+      0.116766,  0.21808,   0.047326,  -0.001985, 0.402193,  0.315517,
+      0.38258,   0.43599,   0.11986,   0.465195,  0.33548,   -0.118789,
+      -0.414159, 0.049269,  0.156108,  0.093459,  -0.129103, -0.086274,
+      0.186188,  -0.324923, 0.4117,    -0.344439, 0.240465,  -0.343331,
+      -0.463082, -0.231706, -0.487465, -0.186592, -0.020756, -0.239007,
+      0.364817,  0.459106,  -0.171447, -0.006542, 0.204032,  -0.375317,
+      -0.041911, 0.051664,  0.320483,  0.155899,  0.156555,  -0.249823,
+      -0.353107, 0.031563,  -0.340771, -0.052532, 0.134631,  -0.257957,
+      -0.50141,  0.486939,  -0.43853,  0.268426,  -0.08754,  -0.109447,
+      -0.502462, -0.028055, -0.121838, -0.046016, 0.105309,  -0.070774,
+      0.495683,  -0.475088, 0.048654,  -0.38582,  0.411018,  -0.315606,
+      0.349628,  0.21698,   0.258989,  -0.097902, 0.331218,  0.034602,
+      0.418069,  -0.089025, -0.417513, 0.07609,   0.393821,  0.404733,
+      -0.055418, -0.43903,  -0.447049, 0.013125,  0.278503,  0.459869,
+      0.143755,  -0.177335, -0.162247, -0.432371, 0.153714,  -0.047403,
+      -0.446775, -0.418363, 0.019743,  0.042025};
+
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+}
+
+// Copy from 4 small 1D arrayes to 1 big one.
+TEST_F(CopyArrayDataTest, CopyFromSmallArrayesToBigArray1D) {
+  // Init src_buffer, dst_buffer.
+  Model model;
+  std::initializer_list<float> large_tf_bias_data = {0, 0, 0, 0, 0, 0, 0, 0,
+                                                     0, 0, 0, 0, 0, 0, 0, 0};
+
+  std::initializer_list<float> tflite_lstm_i_bias = {0.980304, 0.419808,
+                                                     0.080278, 0.728548};
+
+  PrepareBuffers(&model, tflite_lstm_i_bias, /*src_dim_1=*/4,
+                 /*src_dim_2=*/1, large_tf_bias_data,
+                 /*dst_dim_1=*/16, /*dst_dim_2=*/1);
+
+  // Copy starts at (0,), size (4,).
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/1, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/1,
+                /*dst_start_idx1=*/0, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/1);
+
+  // Copy starts at (4,), size (4,).
+  std::initializer_list<float> tflite_lstm_cell_bias = {0.581674, 0.672433,
+                                                        0.434190, 0.844357};
+  PopulateBuffer(GetSrcBuffer(), tflite_lstm_cell_bias);
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/1, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/1,
+                /*dst_start_idx1=*/4, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/1);
+
+  // Copy starts at (8,0), size (4,).
+  std::initializer_list<float> tflite_lstm_forget_bias = {0.229587, 0.785629,
+                                                          0.022065, 0.753082};
+  PopulateBuffer(GetSrcBuffer(), tflite_lstm_forget_bias);
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/1, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/1,
+                /*dst_start_idx1=*/8, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/1);
+
+  // Copy starts at (12,), size (4,).
+  std::initializer_list<float> tflite_lstm_output_bias = {0.422080, 0.539481,
+                                                          0.878386, 0.168965};
+  PopulateBuffer(GetSrcBuffer(), tflite_lstm_output_bias);
+  CopyArrayData(*(GetSrcBuffer()),
+                /*src_stride=*/1, /*src_start_idx1=*/0,
+                /*src_start_idx2=*/0, GetDstBuffer(), /*dst_stride=*/1,
+                /*dst_start_idx1=*/12, /*dst_start_idx2=*/0,
+                /*dim1_copy_size=*/4, /*dim2_copy_size=*/1);
+
+  std::vector<float> expected = {0.980304, 0.419808, 0.080278, 0.728548,
+                                 0.581674, 0.672433, 0.434190, 0.844357,
+                                 0.229587, 0.785629, 0.022065, 0.753082,
+                                 0.422080, 0.539481, 0.878386, 0.168965};
+
+  EXPECT_THAT(GetDstBuffer()->data, ElementsAreArray(ArrayFloatNear(expected)));
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc b/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc
index c6705ad305ac85f7098f40469ebc54fc6fa1b3ab..3a1d175b9823f085c9b8730caba8bedd7eb87d52 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc
@@ -19,7 +19,6 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-//#include "tensorflow/contrib/lite/kernels/test_util.h"
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
@@ -109,7 +108,7 @@ class ResolveConstantConcatenationTest : public ::testing::Test {
   // Prepare a hypothetical TOCO model with one Concatenation operator in it
   // together with 4 arrays as its inputs.
   // It receives the dimension of concatenation as input.
-  void PrepareModel(Model* model, int concat_dim) {
+  void PrepareModel(Model* model, int axis) {
     std::vector<string> concat_input_names = {"array0", "array1", "array2",
                                               "array3"};
 
@@ -142,7 +141,7 @@ class ResolveConstantConcatenationTest : public ::testing::Test {
       cnt++;
     }
     auto* concatenation_op = new ConcatenationOperator;
-    concatenation_op->concat_dim = concat_dim;
+    concatenation_op->axis = axis;
     concatenation_op->inputs = concat_input_names;
     concatenation_op->outputs = {"concat_op_outputs"};
     Array& out_array = model->GetOrCreateArray(concatenation_op->outputs[0]);
@@ -151,7 +150,7 @@ class ResolveConstantConcatenationTest : public ::testing::Test {
     std::vector<int>* out_array_shape_dim = out_array_shape->mutable_dims();
     out_array_shape_dim->resize(kDim);
     for (int i = 0; i < kDim; i++) {
-      if (i == concat_dim) {
+      if (i == axis) {
         (*out_array_shape_dim)[i] = kNumArrays * kElementPerDim;
       } else {
         (*out_array_shape_dim)[i] = kElementPerDim;
@@ -163,16 +162,16 @@ class ResolveConstantConcatenationTest : public ::testing::Test {
 
 TEST_F(ResolveConstantConcatenationTest, ConcatAtAxis0) {
   Model model;
-  const int concat_dim = 0;
-  PrepareModel(&model, concat_dim);
+  const int axis = 0;
+  PrepareModel(&model, axis);
 
   GraphTransformationsSet graph_transformation_set;
   graph_transformation_set.Add(new toco::ResolveConstantConcatenation);
-  EXPECT_THAT(model.arrays.size(), 5);
+  EXPECT_THAT(model.GetArrayMap().size(), 5);
   (*graph_transformation_set.begin())->Run(&model, /*op_index=*/0);
-  EXPECT_THAT(model.arrays.size(), 1);
+  EXPECT_THAT(model.GetArrayMap().size(), 1);
 
-  auto& concatenated_array = (*model.arrays.begin()).second;
+  auto& concatenated_array = (*model.GetArrayMap().begin()).second;
   EXPECT_THAT(concatenated_array->GetBuffer<toco::ArrayDataType::kFloat>().data,
               ElementsAreArray(ArrayFloatNear(
                   {0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  10., 11., 12.,
@@ -182,16 +181,16 @@ TEST_F(ResolveConstantConcatenationTest, ConcatAtAxis0) {
 
 TEST_F(ResolveConstantConcatenationTest, ConcatAtAxis1) {
   Model model;
-  const int concat_dim = 1;
-  PrepareModel(&model, concat_dim);
+  const int axis = 1;
+  PrepareModel(&model, axis);
 
   GraphTransformationsSet graph_transformation_set;
   graph_transformation_set.Add(new toco::ResolveConstantConcatenation);
-  EXPECT_THAT(model.arrays.size(), 5);
+  EXPECT_THAT(model.GetArrayMap().size(), 5);
   (*graph_transformation_set.begin())->Run(&model, /*op_index=*/0);
-  EXPECT_THAT(model.arrays.size(), 1);
+  EXPECT_THAT(model.GetArrayMap().size(), 1);
 
-  auto& concatenated_array = (*model.arrays.begin()).second;
+  auto& concatenated_array = (*model.GetArrayMap().begin()).second;
   EXPECT_THAT(concatenated_array->GetBuffer<toco::ArrayDataType::kFloat>().data,
               ElementsAreArray(ArrayFloatNear(
                   {0.,  1.,  2.,  3.,  10., 11., 12., 13., 20., 21., 22.,
@@ -201,16 +200,16 @@ TEST_F(ResolveConstantConcatenationTest, ConcatAtAxis1) {
 
 TEST_F(ResolveConstantConcatenationTest, ConcatAtAxis2) {
   Model model;
-  const int concat_dim = 2;
-  PrepareModel(&model, concat_dim);
+  const int axis = 2;
+  PrepareModel(&model, axis);
 
   GraphTransformationsSet graph_transformation_set;
   graph_transformation_set.Add(new toco::ResolveConstantConcatenation);
-  EXPECT_THAT(model.arrays.size(), 5);
+  EXPECT_THAT(model.GetArrayMap().size(), 5);
   (*graph_transformation_set.begin())->Run(&model, /*op_index=*/0);
-  EXPECT_THAT(model.arrays.size(), 1);
+  EXPECT_THAT(model.GetArrayMap().size(), 1);
 
-  auto& concatenated_array = (*model.arrays.begin()).second;
+  auto& concatenated_array = (*model.GetArrayMap().begin()).second;
   EXPECT_THAT(concatenated_array->GetBuffer<toco::ArrayDataType::kFloat>().data,
               ElementsAreArray(ArrayFloatNear(
                   {0.,  1.,  10., 11., 20., 21., 30., 31., 2.,  3.,  12.,
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/unfuse_activation_functions.cc b/tensorflow/contrib/lite/toco/graph_transformations/unfuse_activation_functions.cc
index 4e273343df9f3e5ade8f23a2fbd868bcab72c62e..2c7046c8c77c94a89fc05a26d7d72b3661380475 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/unfuse_activation_functions.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/unfuse_activation_functions.cc
@@ -63,7 +63,7 @@ bool UnfuseActivationFunctions::Run(Model* model, std::size_t op_index) {
   ac_op->outputs = op->outputs;
   const string& tmp_array_name =
       AvailableArrayName(*model, op->outputs[0] + "_unfused");
-  CHECK(!model->arrays.count(tmp_array_name));
+  CHECK(!model->HasArray(tmp_array_name));
   model->GetOrCreateArray(tmp_array_name);
   ac_op->inputs = {tmp_array_name};
   op->outputs = {tmp_array_name};
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/unroll_batch_matmul.cc b/tensorflow/contrib/lite/toco/graph_transformations/unroll_batch_matmul.cc
new file mode 100644
index 0000000000000000000000000000000000000000..da81ea2ff3b4ab0bee0550874a9c4ea1044a3579
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/unroll_batch_matmul.cc
@@ -0,0 +1,172 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+// Unrolls a BatchMatMul on the batch dimension.
+// We need to slice each batch out of the inputs, matmul them individually, then
+// stack them all back together at the end.
+//
+// This transform effectively looks like:
+//  result_slices = []
+//  for bat in B:
+//    slice_a = tf.reshape(tf.slice(a, [bat, 0, 0], [1, M, N]), [M, N])
+//    slice_b = tf.reshape(tf.slice(b, [bat, 0, 0], [1, M, N]), [M, N])
+//    slice_c = tf.matmul(slice_a, slice_b)
+//    result_slices[bat] = slice_c
+//  result = tf.stack(result_slices)
+bool UnrollBatchMatMul::Run(Model* model, std::size_t op_index) {
+  auto batch_op_it = model->operators.begin() + op_index;
+  if (batch_op_it->get()->type != OperatorType::kBatchMatMul) {
+    return false;
+  }
+  const auto* batch_op =
+      static_cast<const BatchMatMulOperator*>(batch_op_it->get());
+
+  // We must have the shape of at least one input to know our batch size.
+  const auto& input_array_a = model->GetArray(batch_op->inputs[0]);
+  const auto& input_array_b = model->GetArray(batch_op->inputs[1]);
+  if (!input_array_a.has_shape() || !input_array_b.has_shape()) return false;
+
+  // We only support the rank 3 case. If you are batching on rank > 3 you'll
+  // have to figure that out.
+  CHECK_EQ(input_array_a.shape().dimensions_count(),
+           input_array_b.shape().dimensions_count())
+      << "Input dimensions must have the same rank";
+  if (input_array_a.shape().dimensions_count() == 2) {
+    // This is really just a MatMul. This likely means that someone hand-crafted
+    // a graphdef with a BatchMatMul when they really wanted a MatMul.
+    AddMessageF("Replacing non-batch BatchMatMul %s by a MatMul operator",
+                LogName(*batch_op));
+    auto* matmul_op = new TensorFlowMatMulOperator;
+    matmul_op->inputs = batch_op->inputs;
+    matmul_op->outputs = batch_op->outputs;
+    const auto matmul_op_it = model->operators.emplace(batch_op_it, matmul_op);
+    batch_op_it = matmul_op_it + 1;
+    CHECK_EQ(batch_op_it->get(), batch_op);
+    model->operators.erase(batch_op_it);
+    return true;
+  }
+  CHECK_EQ(input_array_a.shape().dimensions_count(), 3)
+      << "Input arrays must have rank 3";
+
+  // Perform the matmul for each slice of the batch.
+  int batch_count = input_array_a.shape().dims(0);
+  AddMessageF("Unrolling BatchMatMul %s %d times", LogName(*batch_op),
+              batch_count);
+  auto tail_it = batch_op_it;
+  std::vector<string> stack_inputs;
+  for (int batch = 0; batch < batch_count; ++batch) {
+    std::string batch_name =
+        std::string(batch_op->outputs[0]) + "_b" + std::to_string(batch);
+
+    // tf.slice(a, ...).
+    auto* slice_a_op = new SliceOperator;
+    slice_a_op->inputs = {
+        batch_op->inputs[0],
+        CreateInt32Array(model, batch_name + "/slice_a/slice/begin",
+                         {batch, 0, 0}),
+        CreateInt32Array(
+            model, batch_name + "/slice_a/slice/size",
+            {1, input_array_a.shape().dims(1), input_array_a.shape().dims(2)}),
+    };
+    slice_a_op->outputs = {AvailableArrayName(*model, batch_name + "/slice_a")};
+    auto& slice_a_op_output = model->GetOrCreateArray(slice_a_op->outputs[0]);
+    slice_a_op_output.data_type = input_array_a.data_type;
+    tail_it = model->operators.emplace(tail_it, slice_a_op) + 1;
+
+    // Reshape to remove the first dimension ([1,M,N] -> [M,N]).
+    auto* slice_a_reshape_op = new TensorFlowReshapeOperator;
+    slice_a_reshape_op->inputs = {
+        slice_a_op->outputs[0],
+        CreateInt32Array(model, batch_name + "/slice_a/reshape/shape",
+                         {-1, input_array_a.shape().dims(2)})};
+    slice_a_reshape_op->outputs = {
+        AvailableArrayName(*model, batch_name + "/slice_a/reshape")};
+    auto& slice_a_reshape_op_output =
+        model->GetOrCreateArray(slice_a_reshape_op->outputs[0]);
+    slice_a_reshape_op_output.data_type = input_array_a.data_type;
+    tail_it = model->operators.emplace(tail_it, slice_a_reshape_op) + 1;
+
+    // tf.slice(b, ...).
+    auto* slice_b_op = new SliceOperator;
+    slice_b_op->inputs = {
+        batch_op->inputs[1],
+        CreateInt32Array(model, batch_name + "/slice_b/slice/begin", {0, 0, 0}),
+        CreateInt32Array(
+            model, batch_name + "/slice_b/slice/size",
+            {1, input_array_b.shape().dims(1), input_array_b.shape().dims(2)}),
+    };
+    slice_b_op->outputs = {AvailableArrayName(*model, batch_name + "/slice_b")};
+    auto& slice_b_op_output = model->GetOrCreateArray(slice_b_op->outputs[0]);
+    slice_b_op_output.data_type = input_array_b.data_type;
+    tail_it = model->operators.emplace(tail_it, slice_b_op) + 1;
+
+    // Reshape to remove the first dimension ([1,M,N] -> [M,N]).
+    auto* slice_b_reshape_op = new TensorFlowReshapeOperator;
+    slice_b_reshape_op->inputs = {
+        slice_b_op->outputs[0],
+        CreateInt32Array(model, batch_name + "/slice_b/reshape/shape",
+                         {-1, input_array_b.shape().dims(2)})};
+    slice_b_reshape_op->outputs = {
+        AvailableArrayName(*model, batch_name + "/slice_b/reshape")};
+    auto& slice_b_reshape_op_output =
+        model->GetOrCreateArray(slice_b_reshape_op->outputs[0]);
+    slice_b_reshape_op_output.data_type = input_array_b.data_type;
+    tail_it = model->operators.emplace(tail_it, slice_b_reshape_op) + 1;
+
+    // tf.matmul(slice_a, slice_b).
+    auto* matmul_op = new TensorFlowMatMulOperator;
+    matmul_op->inputs = {slice_a_reshape_op->outputs[0],
+                         slice_b_reshape_op->outputs[0]};
+    matmul_op->outputs = {AvailableArrayName(*model, batch_name)};
+    auto& matmul_op_output = model->GetOrCreateArray(matmul_op->outputs[0]);
+    matmul_op_output.data_type = input_array_a.data_type;
+    tail_it = model->operators.emplace(tail_it, matmul_op) + 1;
+
+    // Add to stack.
+    stack_inputs.push_back(matmul_op->outputs[0]);
+  }
+
+  // The stack that will join all the individual matmul results together.
+  auto* stack_op = new StackOperator;
+  stack_op->inputs = stack_inputs;
+  stack_op->outputs = {batch_op->outputs[0]};
+  stack_op->axis = 0;
+  model->operators.emplace(tail_it, stack_op);
+
+  // Remove the old batch matmul now that we've unrolled.
+  batch_op_it = model->operators.begin();
+  for (; batch_op_it != model->operators.end(); ++batch_op_it) {
+    if (batch_op_it->get() == batch_op) {
+      break;
+    }
+  }
+  CHECK(batch_op_it != model->operators.end());
+  CHECK(batch_op_it->get() == batch_op);
+  model->operators.erase(batch_op_it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index cde5a936afd0f12dbd3f5adb333c0c7d73cde25f..02c3b2ed9fe6fe3f6b92de5e10b780361c8c0355 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_split.h"
-//#include "absl/strings/string_view_utils.h"
 #include "absl/strings/strip.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/model_flags.pb.h"
@@ -53,6 +52,8 @@ using tensorflow::DT_BOOL;
 using tensorflow::DT_FLOAT;
 using tensorflow::DT_INT32;
 using tensorflow::DT_INT64;
+using tensorflow::DT_QUINT8;
+using tensorflow::DT_STRING;
 using tensorflow::DT_UINT8;
 using tensorflow::GraphDef;
 using tensorflow::NodeDef;
@@ -136,6 +137,8 @@ ArrayDataType ConvertDataType(tensorflow::DataType dtype) {
     return ArrayDataType::kInt32;
   else if (dtype == DT_INT64)
     return ArrayDataType::kInt64;
+  else if (dtype == DT_STRING)
+    return ArrayDataType::kString;
   else
     LOG(INFO) << "Unsupported data type in placehoder op: " << dtype;
   return ArrayDataType::kNone;
@@ -170,8 +173,13 @@ void ImportFloatArray(const TensorProto& input_tensor, Array* output_array) {
   }
   auto& output_float_data =
       output_array->GetMutableBuffer<ArrayDataType::kFloat>().data;
-  output_float_data.resize(input_flat_size);
-  if (input_tensor.float_val_size()) {
+  output_float_data.resize(RequiredBufferSizeForShape(output_array->shape()),
+                           0.f);
+  if (input_tensor.float_val_size() == 1) {
+    for (int i = 0; i < input_flat_size; i++) {
+      output_float_data[i] = input_tensor.float_val(0);
+    }
+  } else if (input_tensor.float_val_size() == input_flat_size) {
     for (int i = 0; i < input_tensor.float_val_size(); i++) {
       output_float_data[i] = input_tensor.float_val(i);
     }
@@ -185,6 +193,32 @@ void ImportFloatArray(const TensorProto& input_tensor, Array* output_array) {
   }
 }
 
+void ImportQuint8Array(const TensorProto& input_tensor, Array* output_array) {
+  CHECK_EQ(input_tensor.dtype(), DT_QUINT8);
+  const auto& input_shape = input_tensor.tensor_shape();
+  CHECK_LE(input_shape.dim_size(), 4);
+  ImportShape(input_shape.dim(), output_array->mutable_shape());
+  int input_flat_size = 1;
+  for (int k = 0; k < input_shape.dim_size(); k++) {
+    input_flat_size *= input_shape.dim(k).size();
+  }
+  auto& output_int_data =
+      output_array->GetMutableBuffer<ArrayDataType::kUint8>().data;
+  output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0);
+  if (input_tensor.int_val_size()) {
+    for (int i = 0; i < input_tensor.int_val_size(); i++) {
+      output_int_data[i] = input_tensor.int_val(i);
+    }
+  } else if (input_tensor.tensor_content().size() ==
+             input_flat_size * sizeof(uint8_t)) {
+    toco::port::CopyToBuffer(input_tensor.tensor_content(),
+                             reinterpret_cast<char*>(output_int_data.data()));
+  } else {
+    LOG(FATAL) << "Neither input_content nor int_val have the right "
+                  "dimensions for this uint8 tensor.";
+  }
+}
+
 void ImportInt32Array(const TensorProto& input_tensor, Array* output_array) {
   CHECK_EQ(input_tensor.dtype(), DT_INT32);
   const auto& input_shape = input_tensor.tensor_shape();
@@ -196,7 +230,7 @@ void ImportInt32Array(const TensorProto& input_tensor, Array* output_array) {
   }
   auto& output_int_data =
       output_array->GetMutableBuffer<ArrayDataType::kInt32>().data;
-  output_int_data.resize(input_flat_size);
+  output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0);
   if (input_tensor.int_val_size()) {
     for (int i = 0; i < input_tensor.int_val_size(); i++) {
       output_int_data[i] = input_tensor.int_val(i);
@@ -222,7 +256,7 @@ void ImportInt64Array(const TensorProto& input_tensor, Array* output_array) {
   }
   auto& output_int_data =
       output_array->GetMutableBuffer<ArrayDataType::kInt64>().data;
-  output_int_data.resize(input_flat_size);
+  output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0);
   if (input_tensor.int64_val_size()) {
     for (int i = 0; i < input_tensor.int64_val_size(); i++) {
       output_int_data[i] = input_tensor.int64_val(i);
@@ -237,6 +271,27 @@ void ImportInt64Array(const TensorProto& input_tensor, Array* output_array) {
   }
 }
 
+void ImportStringArray(const TensorProto& input_tensor, Array* output_array) {
+  CHECK_EQ(input_tensor.dtype(), DT_STRING);
+  const auto& input_shape = input_tensor.tensor_shape();
+  CHECK_LE(input_shape.dim_size(), 4);
+  ImportShape(input_shape.dim(), output_array->mutable_shape());
+  int input_flat_size = 1;
+  for (int k = 0; k < input_shape.dim_size(); k++) {
+    input_flat_size *= input_shape.dim(k).size();
+  }
+  auto& output_string_data =
+      output_array->GetMutableBuffer<ArrayDataType::kString>().data;
+  output_string_data.resize(RequiredBufferSizeForShape(output_array->shape()));
+  if (input_flat_size != input_tensor.string_val_size()) {
+    LOG(FATAL) << "Input_content string_val doesn't have the right "
+                  "dimensions for this string tensor.";
+  }
+  for (int i = 0; i < input_flat_size; ++i) {
+    output_string_data[i] = input_tensor.string_val(i);
+  }
+}
+
 // Count the number of inputs of a given node. If
 // `tf_import_flags.drop_control_dependency` is true, count the number of
 // non-control-dependency inputs.
@@ -254,6 +309,14 @@ int GetInputsCount(const NodeDef& node,
   }
 }
 
+void CheckInputsCount(const NodeDef& node,
+                      const TensorFlowImportFlags& tf_import_flags,
+                      int expected_input_count) {
+  QCHECK_EQ(GetInputsCount(node, tf_import_flags), expected_input_count)
+      << node.op() << " node expects " << expected_input_count
+      << " input(s) other than control dependencies: " << node.DebugString();
+}
+
 void ConvertConstOperator(const NodeDef& node,
                           const TensorFlowImportFlags& tf_import_flags,
                           Model* model) {
@@ -262,23 +325,34 @@ void ConvertConstOperator(const NodeDef& node,
   const auto dtype = GetDataTypeAttr(node, "dtype");
 
   auto& array = model->GetOrCreateArray(node.name());
-  array.data_type = dtype == DT_FLOAT
-                        ? ArrayDataType::kFloat
-                        : dtype == DT_INT32
-                              ? ArrayDataType::kInt32
-                              : dtype == DT_INT64 ? ArrayDataType::kInt64
-                                                  : ArrayDataType::kNone;
-  if (dtype == DT_FLOAT) {
-    ImportFloatArray(tensor, &array);
-  } else if (dtype == DT_INT32) {
-    ImportInt32Array(tensor, &array);
-  } else if (dtype == DT_INT64) {
-    ImportInt64Array(tensor, &array);
-  } else {
-    // do nothing, silently ignore the Const data. For example, there are consts
-    // of string type. We just make a dummy buffer to indicate that this array
-    // does not rely on external input.
-    array.GetMutableBuffer<ArrayDataType::kNone>();
+  switch (dtype) {
+    case DT_FLOAT:
+      array.data_type = ArrayDataType::kFloat;
+      ImportFloatArray(tensor, &array);
+      break;
+    case DT_INT32:
+      array.data_type = ArrayDataType::kInt32;
+      ImportInt32Array(tensor, &array);
+      break;
+    case DT_QUINT8:
+      array.data_type = ArrayDataType::kUint8;
+      ImportQuint8Array(tensor, &array);
+      break;
+    case DT_INT64:
+      array.data_type = ArrayDataType::kInt64;
+      ImportInt64Array(tensor, &array);
+      break;
+    case DT_STRING:
+      array.data_type = ArrayDataType::kString;
+      ImportStringArray(tensor, &array);
+      break;
+    default:
+      array.data_type = ArrayDataType::kNone;
+      // do nothing, silently ignore the Const data.
+      // We just make a dummy buffer to indicate that
+      // this array does not rely on external input.
+      array.GetMutableBuffer<ArrayDataType::kNone>();
+      break;
   }
 }
 
@@ -286,7 +360,7 @@ void ConvertConvOperator(const NodeDef& node,
                          const TensorFlowImportFlags& tf_import_flags,
                          Model* model) {
   CHECK_EQ(node.op(), "Conv2D");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  CheckInputsCount(node, tf_import_flags, 2);
 
   // We only support NHWC, which is the default data_format.
   // So if data_format is not defined, we're all good.
@@ -339,7 +413,7 @@ void ConvertDepthwiseConvOperator(const NodeDef& node,
                                   const TensorFlowImportFlags& tf_import_flags,
                                   Model* model) {
   CHECK_EQ(node.op(), "DepthwiseConv2dNative");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  CheckInputsCount(node, tf_import_flags, 2);
 
   // We only support NHWC, which is the default data_format.
   // So if data_format is not defined, we're all good.
@@ -392,7 +466,8 @@ void ConvertDepthToSpaceOperator(const NodeDef& node,
                                  const TensorFlowImportFlags& tf_import_flags,
                                  Model* model) {
   CHECK_EQ(node.op(), "DepthToSpace");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  CheckInputsCount(node, tf_import_flags, 1);
+
   CHECK_EQ(GetDataTypeAttr(node, "T"), DT_FLOAT);
   auto* op = new DepthToSpaceOperator;
   op->inputs.push_back(node.input(0));
@@ -406,7 +481,8 @@ void ConvertSpaceToDepthOperator(const NodeDef& node,
                                  const TensorFlowImportFlags& tf_import_flags,
                                  Model* model) {
   CHECK_EQ(node.op(), "SpaceToDepth");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  CheckInputsCount(node, tf_import_flags, 1);
+
   CHECK_EQ(GetDataTypeAttr(node, "T"), DT_FLOAT);
   auto* op = new SpaceToDepthOperator;
   op->inputs.push_back(node.input(0));
@@ -420,7 +496,8 @@ void ConvertBiasAddOperator(const NodeDef& node,
                             const TensorFlowImportFlags& tf_import_flags,
                             Model* model) {
   CHECK_EQ(node.op(), "BiasAdd");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  CheckInputsCount(node, tf_import_flags, 2);
+
   const auto& input_name = node.input(0);
   const auto& bias_name = node.input(1);
   CHECK_EQ(GetDataTypeAttr(node, "T"), DT_FLOAT);
@@ -435,7 +512,7 @@ void ConvertReluOperator(const NodeDef& node,
                          const TensorFlowImportFlags& tf_import_flags,
                          Model* model) {
   CHECK_EQ(node.op(), "Relu");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  CheckInputsCount(node, tf_import_flags, 1);
   const auto& input_name = node.input(0);
   auto* relu = new ReluOperator;
   relu->inputs.push_back(input_name);
@@ -447,7 +524,8 @@ void ConvertRelu6Operator(const NodeDef& node,
                           const TensorFlowImportFlags& tf_import_flags,
                           Model* model) {
   CHECK_EQ(node.op(), "Relu6");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  CheckInputsCount(node, tf_import_flags, 1);
+
   const auto& input_name = node.input(0);
   auto* op = new Relu6Operator;
   op->inputs.push_back(input_name);
@@ -459,7 +537,8 @@ void ConvertLogisticOperator(const NodeDef& node,
                              const TensorFlowImportFlags& tf_import_flags,
                              Model* model) {
   CHECK_EQ(node.op(), "Sigmoid");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  CheckInputsCount(node, tf_import_flags, 1);
+
   const auto& input_name = node.input(0);
   auto* op = new LogisticOperator;
   op->inputs.push_back(input_name);
@@ -471,7 +550,8 @@ void ConvertTanhOperator(const NodeDef& node,
                          const TensorFlowImportFlags& tf_import_flags,
                          Model* model) {
   CHECK_EQ(node.op(), "Tanh");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  CheckInputsCount(node, tf_import_flags, 1);
+
   const auto& input_name = node.input(0);
   auto* op = new TanhOperator;
   op->inputs.push_back(input_name);
@@ -483,7 +563,7 @@ void ConvertDivOperator(const NodeDef& node,
                         const TensorFlowImportFlags& tf_import_flags,
                         Model* model) {
   CHECK(node.op() == "Div" || node.op() == "RealDiv");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  CheckInputsCount(node, tf_import_flags, 2);
   auto* op = new DivOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -495,14 +575,17 @@ void ConvertIdentityOperator(const NodeDef& node,
                              const TensorFlowImportFlags& tf_import_flags,
                              Model* model) {
   CHECK(node.op() == "Identity" || node.op() == "CheckNumerics" ||
-        node.op() == "PlaceholderWithDefault");
+        node.op() == "PlaceholderWithDefault" || node.op() == "StopGradient");
   auto* op = new TensorFlowIdentityOperator;
   // Amazingly, some TensorFlow graphs (at least rajeev_lstm.pb) have
   // identity nodes with multiple inputs, but the other inputs seem
   // to be gratuitous (in the case of rajeev_lstm.pb, these are
   // enumerating the LSTM state arrays). We will just ignore extra
   // inputs beyond the first input.
-  CHECK_GE(node.input_size(), 1);
+  QCHECK_GE(node.input_size(), 1)
+      << node.op()
+      << " node expects at least 1 input other than control dependencies: "
+      << node.DebugString();
   const auto& input_name = node.input(0);
   op->inputs.push_back(input_name);
   op->outputs.push_back(node.name());
@@ -513,7 +596,7 @@ void ConvertFakeQuantWithMinMaxArgs(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
   CHECK_EQ(node.op(), "FakeQuantWithMinMaxArgs");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  CheckInputsCount(node, tf_import_flags, 1);
   auto* op = new FakeQuantOperator;
   op->inputs.push_back(node.input(0));
   op->minmax.reset(new MinMax);
@@ -529,7 +612,10 @@ void ConvertFakeQuantWithMinMaxVars(
     Model* model) {
   CHECK_EQ(node.op(), "FakeQuantWithMinMaxVars");
   const int num_inputs = GetInputsCount(node, tf_import_flags);
-  CHECK(num_inputs == 3 || num_inputs == 4);
+  QCHECK(num_inputs == 3 || num_inputs == 4)
+      << "FakeQuantWithMinMaxVars node expects 3 or 4 inputs other than "
+         "control dependencies: "
+      << node.DebugString();
   auto* op = new FakeQuantOperator;
   for (int i = 0; i < 3; i++) {
     op->inputs.push_back(node.input(i));
@@ -538,11 +624,22 @@ void ConvertFakeQuantWithMinMaxVars(
   model->operators.emplace_back(op);
 }
 
+void ConvertNegOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
+  CHECK_EQ(node.op(), "Neg");
+  CheckInputsCount(node, tf_import_flags, 1);
+  auto* op = new NegOperator;
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
 void ConvertRsqrtOperator(const NodeDef& node,
                           const TensorFlowImportFlags& tf_import_flags,
                           Model* model) {
   CHECK_EQ(node.op(), "Rsqrt");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  CheckInputsCount(node, tf_import_flags, 1);
   auto* op = new TensorFlowRsqrtOperator;
   op->inputs.push_back(node.input(0));
   op->outputs.push_back(node.name());
@@ -553,7 +650,7 @@ void ConvertSqrtOperator(const NodeDef& node,
                          const TensorFlowImportFlags& tf_import_flags,
                          Model* model) {
   CHECK_EQ(node.op(), "Sqrt");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  CheckInputsCount(node, tf_import_flags, 1);
   auto* op = new TensorFlowSqrtOperator;
   op->inputs.push_back(node.input(0));
   op->outputs.push_back(node.name());
@@ -564,7 +661,7 @@ void ConvertSqueezeOperator(const NodeDef& node,
                             const TensorFlowImportFlags& tf_import_flags,
                             Model* model) {
   CHECK_EQ(node.op(), "Squeeze");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  CheckInputsCount(node, tf_import_flags, 1);
   auto* op = new SqueezeOperator;
   op->inputs.push_back(node.input(0));
   op->outputs.push_back(node.name());
@@ -581,7 +678,7 @@ void ConvertSquareOperator(const NodeDef& node,
                            const TensorFlowImportFlags& tf_import_flags,
                            Model* model) {
   CHECK_EQ(node.op(), "Square");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  CheckInputsCount(node, tf_import_flags, 1);
   auto* op = new TensorFlowSquareOperator;
   op->inputs.push_back(node.input(0));
   op->outputs.push_back(node.name());
@@ -592,7 +689,7 @@ void ConvertAddOperator(const NodeDef& node,
                         const TensorFlowImportFlags& tf_import_flags,
                         Model* model) {
   CHECK_EQ(node.op(), "Add");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  CheckInputsCount(node, tf_import_flags, 2);
   auto* op = new AddOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -600,11 +697,24 @@ void ConvertAddOperator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
+void ConvertAddNOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
+  CHECK_EQ(node.op(), "AddN");
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
+  auto* op = new AddNOperator;
+  for (int i = 0; i < num_inputs; ++i) {
+    op->inputs.push_back(node.input(i));
+  }
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
 void ConvertMulOperator(const NodeDef& node,
                         const TensorFlowImportFlags& tf_import_flags,
                         Model* model) {
   CHECK_EQ(node.op(), "Mul");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  CheckInputsCount(node, tf_import_flags, 2);
   auto* op = new MulOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -616,7 +726,7 @@ void ConvertSubOperator(const NodeDef& node,
                         const TensorFlowImportFlags& tf_import_flags,
                         Model* model) {
   CHECK_EQ(node.op(), "Sub");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  CheckInputsCount(node, tf_import_flags, 2);
   auto* op = new SubOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -628,7 +738,7 @@ void ConvertSumOperator(const NodeDef& node,
                         const TensorFlowImportFlags& tf_import_flags,
                         Model* model) {
   CHECK_EQ(node.op(), "Sum");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  CheckInputsCount(node, tf_import_flags, 2);
   auto* op = new TensorFlowSumOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -643,7 +753,7 @@ void ConvertTileOperator(const NodeDef& node,
                          const TensorFlowImportFlags& tf_import_flags,
                          Model* model) {
   CHECK_EQ(node.op(), "Tile");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  CheckInputsCount(node, tf_import_flags, 2);
   auto* op = new TensorFlowTileOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -655,7 +765,7 @@ void ConvertSliceOperator(const NodeDef& node,
                           const TensorFlowImportFlags& tf_import_flags,
                           Model* model) {
   CHECK_EQ(node.op(), "Slice");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 3);
+  CheckInputsCount(node, tf_import_flags, 3);
   auto* op = new SliceOperator;
   for (int i = 0; i < 3; ++i) {
     op->inputs.push_back(node.input(i));
@@ -668,7 +778,7 @@ void ConvertPadOperator(const NodeDef& node,
                         const TensorFlowImportFlags& tf_import_flags,
                         Model* model) {
   CHECK_EQ(node.op(), "Pad");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  CheckInputsCount(node, tf_import_flags, 2);
   auto* op = new PadOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -680,7 +790,7 @@ void ConvertShapeOperator(const NodeDef& node,
                           const TensorFlowImportFlags& tf_import_flags,
                           Model* model) {
   CHECK_EQ(node.op(), "Shape");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  CheckInputsCount(node, tf_import_flags, 1);
   auto* op = new TensorFlowShapeOperator;
   op->inputs.push_back(node.input(0));
   op->outputs.push_back(node.name());
@@ -691,7 +801,7 @@ void ConvertSplitOperator(const NodeDef& node,
                           const TensorFlowImportFlags& tf_import_flags,
                           Model* model) {
   CHECK_EQ(node.op(), "Split");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  CheckInputsCount(node, tf_import_flags, 2);
   auto* op = new TensorFlowSplitOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -708,7 +818,7 @@ void ConvertMergeOperator(const NodeDef& node,
                           const TensorFlowImportFlags& tf_import_flags,
                           Model* model) {
   CHECK_EQ(node.op(), "Merge");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  CheckInputsCount(node, tf_import_flags, 2);
   auto* op = new TensorFlowMergeOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -720,7 +830,7 @@ void ConvertSwitchOperator(const NodeDef& node,
                            const TensorFlowImportFlags& tf_import_flags,
                            Model* model) {
   CHECK_EQ(node.op(), "Switch");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  CheckInputsCount(node, tf_import_flags, 2);
   auto* op = new TensorFlowSwitchOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -729,11 +839,12 @@ void ConvertSwitchOperator(const NodeDef& node,
   op->outputs.push_back(node.name() + ":1");
   model->operators.emplace_back(op);
 }
+
 void ConvertSoftmaxOperator(const NodeDef& node,
                             const TensorFlowImportFlags& tf_import_flags,
                             Model* model) {
   CHECK_EQ(node.op(), "Softmax");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  CheckInputsCount(node, tf_import_flags, 1);
   const auto& input_name = node.input(0);
   auto* softmax = new SoftmaxOperator;
   softmax->inputs.push_back(input_name);
@@ -744,11 +855,23 @@ void ConvertSoftmaxOperator(const NodeDef& node,
   model->operators.emplace_back(softmax);
 }
 
+void ConvertLogSoftmaxOperator(const NodeDef& node,
+                               const TensorFlowImportFlags& tf_import_flags,
+                               Model* model) {
+  CHECK_EQ(node.op(), "LogSoftmax");
+  CheckInputsCount(node, tf_import_flags, 1);
+  const auto& input_name = node.input(0);
+  auto* log_softmax = new LogSoftmaxOperator;
+  log_softmax->inputs.push_back(input_name);
+  log_softmax->outputs.push_back(node.name());
+  model->operators.emplace_back(log_softmax);
+}
+
 void ConvertLRNOperator(const NodeDef& node,
                         const TensorFlowImportFlags& tf_import_flags,
                         Model* model) {
   CHECK_EQ(node.op(), "LRN");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  CheckInputsCount(node, tf_import_flags, 1);
   const auto& input_name = node.input(0);
   auto* lrn = new LocalResponseNormalizationOperator;
   lrn->inputs.push_back(input_name);
@@ -764,7 +887,7 @@ void ConvertMaxPoolOperator(const NodeDef& node,
                             const TensorFlowImportFlags& tf_import_flags,
                             Model* model) {
   CHECK_EQ(node.op(), "MaxPool");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  CheckInputsCount(node, tf_import_flags, 1);
   const auto& input_name = node.input(0);
   // We only support NHWC, which is the default data_format.
   // So if data_format is not defined, we're all good.
@@ -806,7 +929,7 @@ void ConvertAvgPoolOperator(const NodeDef& node,
                             const TensorFlowImportFlags& tf_import_flags,
                             Model* model) {
   CHECK_EQ(node.op(), "AvgPool");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  CheckInputsCount(node, tf_import_flags, 1);
   const auto& input_name = node.input(0);
   // We only support NHWC, which is the default data_format.
   // So if data_format is not defined, we're all good.
@@ -844,7 +967,7 @@ void ConvertReshapeOperator(const NodeDef& node,
                             const TensorFlowImportFlags& tf_import_flags,
                             Model* model) {
   CHECK_EQ(node.op(), "Reshape");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  CheckInputsCount(node, tf_import_flags, 2);
   auto* op = new TensorFlowReshapeOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -852,37 +975,37 @@ void ConvertReshapeOperator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
+void ConvertBatchMatMulOperator(const NodeDef& node,
+                                const TensorFlowImportFlags& tf_import_flags,
+                                Model* model) {
+  CheckInputsCount(node, tf_import_flags, 2);
+
+  // https://www.tensorflow.org/versions/r0.12/api_docs/python/math_ops/matrix_math_functions
+  CHECK(!HasAttr(node, "adj_a") || (GetBoolAttr(node, "adj_a") == false));
+  CHECK(!HasAttr(node, "adj_b") || (GetBoolAttr(node, "adj_b") == false));
+
+  auto* batch_matmul = new BatchMatMulOperator;
+  batch_matmul->inputs = {node.input(0), node.input(1)};
+  batch_matmul->outputs = {node.name()};
+  model->operators.emplace_back(batch_matmul);
+}
+
 void ConvertMatMulOperator(const NodeDef& node,
                            const TensorFlowImportFlags& tf_import_flags,
                            Model* model) {
-  CHECK_EQ(node.op(), "MatMul");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  CheckInputsCount(node, tf_import_flags, 2);
+
   // Transpose flags should be easy to support, but we don't have a
   // GraphDef with them to test on at the moment.
   CHECK_EQ(GetBoolAttr(node, "transpose_a"), false);
   CHECK_EQ(GetBoolAttr(node, "transpose_b"), false);
-  const auto& input_name = node.input(0);
-  const auto& weights_name = node.input(1);
-  const auto& reordered_weights_name = weights_name + "_reordered";
-  // Check if a ReorderAxesOperator was already created for these weights
-  // (that happens when multiple layers share the same weights).
-  const Operator* existing_reorder =
-      GetOpWithOutput(*model, reordered_weights_name);
-  if (existing_reorder) {
-    // Check that it is safe to rely on the _reordered naming of the output
-    // array!
-    CHECK(existing_reorder->type == OperatorType::kReorderAxes);
-  } else {
-    // Create a new ReorderAxesOperator
-    auto* reorder = new ReorderAxesOperator;
-    reorder->inputs = {weights_name};
-    reorder->outputs = {reordered_weights_name};
-    reorder->input_axes_order = AxesOrder::kRC;
-    reorder->output_axes_order = AxesOrder::kCR;
-    model->operators.emplace_back(reorder);
-  }
+  CHECK(!HasAttr(node, "adjoint_a") ||
+        (GetBoolAttr(node, "adjoint_a") == false));
+  CHECK(!HasAttr(node, "adjoint_b") ||
+        (GetBoolAttr(node, "adjoint_b") == false));
+
   auto* matmul = new TensorFlowMatMulOperator;
-  matmul->inputs = {input_name, reordered_weights_name};
+  matmul->inputs = {node.input(0), node.input(1)};
   matmul->outputs = {node.name()};
   model->operators.emplace_back(matmul);
 }
@@ -899,7 +1022,10 @@ void ConvertConcatOperator(const NodeDef& node,
     LOG(FATAL) << "Expected Concat or ConcatV2";
   }
   const int num_inputs = GetInputsCount(node, tf_import_flags);
-  CHECK_GE(num_inputs, 2);
+  QCHECK_GE(num_inputs, 2)
+      << node.op()
+      << " node expects at least 2 inputs other than control dependencies: "
+      << node.DebugString();
   CHECK_EQ(num_inputs, 1 + GetIntAttr(node, "N"));
   for (int i = 0; i < num_inputs; ++i) {
     op->inputs.push_back(node.input(i));
@@ -990,7 +1116,7 @@ void ConvertMaxOperator(const NodeDef& node,
                         const TensorFlowImportFlags& tf_import_flags,
                         Model* model) {
   CHECK_EQ(node.op(), "Max");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  CheckInputsCount(node, tf_import_flags, 2);
   auto* op = new TensorFlowMaxOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -1005,7 +1131,7 @@ void ConvertMinOperator(const NodeDef& node,
                         const TensorFlowImportFlags& tf_import_flags,
                         Model* model) {
   CHECK_EQ(node.op(), "Min");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  CheckInputsCount(node, tf_import_flags, 2);
   auto* op = new TensorFlowMinOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -1020,7 +1146,7 @@ void ConvertMaximumOperator(const NodeDef& node,
                             const TensorFlowImportFlags& tf_import_flags,
                             Model* model) {
   CHECK_EQ(node.op(), "Maximum");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  CheckInputsCount(node, tf_import_flags, 2);
   auto* op = new TensorFlowMaximumOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -1032,7 +1158,7 @@ void ConvertMinimumOperator(const NodeDef& node,
                             const TensorFlowImportFlags& tf_import_flags,
                             Model* model) {
   CHECK_EQ(node.op(), "Minimum");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  CheckInputsCount(node, tf_import_flags, 2);
   auto* op = new TensorFlowMinimumOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -1068,22 +1194,9 @@ void ConvertStridedSliceOperator(const NodeDef& node,
                                  const TensorFlowImportFlags& tf_import_flags,
                                  Model* model) {
   CHECK_EQ(node.op(), "StridedSlice");
-  CHECK_EQ(node.input_size(), 4);
-
-  // Only a subset of the full TF op functionality is supported now.
-  if (  // No 64-bit indices.
-      GetDataTypeAttr(node, "Index") != DT_INT32 ||
-      // No dimensionality changes.
-      GetIntAttr(node, "new_axis_mask") != 0 ||
-      GetIntAttr(node, "shrink_axis_mask") != 0 ||
-      // No sparse indices.
-      GetIntAttr(node, "ellipsis_mask") != 0 ||
-      // Only 4D tensors are supported.
-      GetIntAttr(node, "begin_mask") > 15 ||
-      GetIntAttr(node, "end_mask") > 15) {
-    ConvertUnsupportedOperator(node, tf_import_flags, model);
-    return;
-  }
+  // TODO(soroosh): The 4th input (strides) should be e optional, to be
+  // consistent with TF.
+  CheckInputsCount(node, tf_import_flags, 4);
 
   auto* op = new StridedSliceOperator;
   for (const auto& input : node.input()) {
@@ -1104,7 +1217,7 @@ void ConvertPlaceholderOperator(const NodeDef& node,
                                 Model* model) {
   CHECK(node.op() == "Placeholder" || node.op() == "LegacyFedInput");
   if (node.op() == "Placeholder") {
-    CHECK_EQ(GetInputsCount(node, tf_import_flags), 0);
+    CheckInputsCount(node, tf_import_flags, 0);
   }
   auto& array = model->GetOrCreateArray(node.name());
   if (node.attr().count("dtype")) {
@@ -1135,34 +1248,16 @@ void ConvertNoOpOperator(const NodeDef& node,
                          const TensorFlowImportFlags& tf_import_flags,
                          Model* model) {}
 
-ArrayDataType GetArrayDataType(tensorflow::DataType tf_data_type) {
-  if (tf_data_type == DT_UINT8) {
-    return ArrayDataType::kUint8;
-  } else if (tf_data_type == DT_INT32) {
-    return ArrayDataType::kInt32;
-  } else if (tf_data_type == DT_FLOAT) {
-    return ArrayDataType::kFloat;
-  } else {
-    return ArrayDataType::kNone;
-  }
-}
-
 void ConvertCastOperator(const NodeDef& node,
                          const TensorFlowImportFlags& tf_import_flags,
                          Model* model) {
   CHECK_EQ(node.op(), "Cast");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  CheckInputsCount(node, tf_import_flags, 1);
   const auto tf_src_dtype = GetDataTypeAttr(node, "SrcT");
   const auto tf_dst_dtype = GetDataTypeAttr(node, "DstT");
-  CHECK(tf_src_dtype == DT_UINT8 || tf_src_dtype == DT_INT32 ||
-        tf_src_dtype == DT_FLOAT);
-  CHECK(tf_dst_dtype == DT_UINT8 || tf_dst_dtype == DT_INT32 ||
-        tf_dst_dtype == DT_FLOAT);
-  CHECK_NE(tf_src_dtype, tf_dst_dtype)
-      << "Same input and output data type. No need to cast.";
   auto* op = new CastOperator;
-  op->src_data_type = GetArrayDataType(tf_src_dtype);
-  op->dst_data_type = GetArrayDataType(tf_dst_dtype);
+  op->src_data_type = ConvertDataType(tf_src_dtype);
+  op->dst_data_type = ConvertDataType(tf_dst_dtype);
   op->inputs.push_back(node.input(0));
   op->outputs.push_back(node.name());
   model->operators.emplace_back(op);
@@ -1172,7 +1267,7 @@ void ConvertFloorOperator(const NodeDef& node,
                           const TensorFlowImportFlags& tf_import_flags,
                           Model* model) {
   CHECK_EQ(node.op(), "Floor");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  CheckInputsCount(node, tf_import_flags, 1);
   const auto data_type = GetDataTypeAttr(node, "T");
   CHECK(data_type == DT_FLOAT);
   auto* op = new FloorOperator;
@@ -1185,9 +1280,9 @@ void ConvertGatherOperator(const NodeDef& node,
                            const TensorFlowImportFlags& tf_import_flags,
                            Model* model) {
   CHECK_EQ(node.op(), "Gather");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  CheckInputsCount(node, tf_import_flags, 2);
   const auto indices_data_type = GetDataTypeAttr(node, "Tindices");
-  CHECK(indices_data_type == DT_INT32);
+  CHECK(indices_data_type == DT_INT32 || indices_data_type == DT_INT64);
   auto* op = new GatherOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -1195,12 +1290,35 @@ void ConvertGatherOperator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
+void ConvertArgMaxOperator(const NodeDef& node,
+                           const TensorFlowImportFlags& tf_import_flags,
+                           Model* model) {
+  CHECK_EQ(node.op(), "ArgMax");
+  CheckInputsCount(node, tf_import_flags, 2);
+  const auto axis_data_type = GetDataTypeAttr(node, "Tidx");
+  const auto output_type = GetDataTypeAttr(node, "output_type");
+  CHECK(axis_data_type == DT_INT64 || axis_data_type == DT_INT32);
+  CHECK(output_type == DT_INT64 || output_type == DT_INT32);
+  auto* op = new ArgMaxOperator;
+  op->output_data_type = ConvertDataType(output_type);
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
 void ConvertResizeBilinearOperator(const NodeDef& node,
                                    const TensorFlowImportFlags& tf_import_flags,
                                    Model* model) {
   CHECK_EQ(node.op(), "ResizeBilinear");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  CheckInputsCount(node, tf_import_flags, 2);
   auto* op = new ResizeBilinearOperator;
+
+  op->align_corners = false;
+  if (HasAttr(node, "align_corners")) {
+    op->align_corners = GetBoolAttr(node, "align_corners");
+  }
+
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
   op->outputs.push_back(node.name());
@@ -1211,7 +1329,7 @@ void ConvertBatchNormWithGlobalNormalizationOperator(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
   CHECK_EQ(node.op(), "BatchNormWithGlobalNormalization");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 5);
+  CheckInputsCount(node, tf_import_flags, 5);
 
   // TODO(ahentz): to really match tensorflow we need to add variance_epsilon
   // to the input, before feeding it into TensorFlowRsqrtOperator.
@@ -1260,7 +1378,7 @@ void ConvertFusedBatchNormOperator(const NodeDef& node,
                                    const TensorFlowImportFlags& tf_import_flags,
                                    Model* model) {
   CHECK_EQ(node.op(), "FusedBatchNorm");
-  CHECK_EQ(node.input_size(), 5);
+  CheckInputsCount(node, tf_import_flags, 5);
 
   // Declare shortcuts for the inputs.
   const string& gamma_input = node.input(1);
@@ -1316,7 +1434,7 @@ void ConvertSpaceToBatchNDOperator(const NodeDef& node,
                                    const TensorFlowImportFlags& tf_import_flags,
                                    Model* model) {
   CHECK_EQ(node.op(), "SpaceToBatchND");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 3);
+  CheckInputsCount(node, tf_import_flags, 3);
   CHECK_EQ(GetDataTypeAttr(node, "Tblock_shape"), DT_INT32);
   CHECK_EQ(GetDataTypeAttr(node, "Tpaddings"), DT_INT32);
   auto* op = new SpaceToBatchNDOperator;
@@ -1331,7 +1449,7 @@ void ConvertBatchToSpaceNDOperator(const NodeDef& node,
                                    const TensorFlowImportFlags& tf_import_flags,
                                    Model* model) {
   CHECK_EQ(node.op(), "BatchToSpaceND");
-  CHECK_EQ(GetInputsCount(node, tf_import_flags), 3);
+  CheckInputsCount(node, tf_import_flags, 3);
   CHECK_EQ(GetDataTypeAttr(node, "Tblock_shape"), DT_INT32);
   CHECK_EQ(GetDataTypeAttr(node, "Tcrops"), DT_INT32);
   auto* op = new BatchToSpaceNDOperator;
@@ -1342,11 +1460,22 @@ void ConvertBatchToSpaceNDOperator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
+void ConvertExpOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
+  CHECK_EQ(node.op(), "Exp");
+  CheckInputsCount(node, tf_import_flags, 1);
+  auto* op = new ExpOperator;
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
 void ConvertMeanOperator(const NodeDef& node,
                          const TensorFlowImportFlags& tf_import_flags,
                          Model* model) {
   CHECK_EQ(node.op(), "Mean");
-  CHECK_EQ(node.input_size(), 2);
+  CheckInputsCount(node, tf_import_flags, 2);
   auto* op = new MeanOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -1361,7 +1490,11 @@ void ConvertSvdfOperator(const NodeDef& node,
                          const TensorFlowImportFlags& tf_import_flags,
                          Model* model) {
   CHECK_EQ(node.op(), "Svdf");
-  bool has_bias = (node.input_size() == 4);
+  const int input_size = GetInputsCount(node, tf_import_flags);
+  QCHECK(input_size == 3 || input_size == 4)
+      << "Svdf node expects 3 or 4 inputs other than control dependencies: "
+      << node.DebugString();
+  bool has_bias = (input_size == 4);
   auto* op = new SvdfOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
@@ -1380,6 +1513,170 @@ void ConvertSvdfOperator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
+// This is just bare bones support to get the shapes to propagate.
+void ConvertTransposeConvOperator(const NodeDef& node,
+                                  const TensorFlowImportFlags& tf_import_flags,
+                                  Model* model) {
+  CHECK_EQ(node.op(), "Conv2DBackpropInput");
+  CheckInputsCount(node, tf_import_flags, 3);
+  auto* op = new TransposeConvOperator;
+  op->inputs.push_back(node.input(2));
+  op->inputs.push_back(node.input(1));
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  const auto& strides = GetListAttr(node, "strides");
+  CHECK_EQ(strides.i_size(), 4);
+  CHECK_EQ(strides.i(0), 1);
+  op->stride_height = strides.i(1);
+  op->stride_width = strides.i(2);
+  CHECK_EQ(strides.i(3), 1);
+  auto const& padding = GetStringAttr(node, "padding");
+  if (padding == "SAME") {
+    op->padding.type = PaddingType::kSame;
+  } else if (padding == "VALID") {
+    op->padding.type = PaddingType::kValid;
+  } else {
+    LOG(FATAL) << "Only SAME and VALID padding supported on "
+                  "Conv2DBackpropInput nodes.";
+  }
+  model->operators.emplace_back(op);
+}
+
+void ConvertExpandDimsOperator(const NodeDef& node,
+                               const TensorFlowImportFlags& tf_import_flags,
+                               Model* model) {
+  CHECK_EQ(node.op(), "ExpandDims");
+  CheckInputsCount(node, tf_import_flags, 2);
+  auto* op = new ExpandDimsOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertFillOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
+  CHECK_EQ(node.op(), "Fill");
+  CheckInputsCount(node, tf_import_flags, 2);
+  auto* op = new FillOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertFloorDivOperator(const NodeDef& node,
+                             const TensorFlowImportFlags& tf_import_flags,
+                             Model* model) {
+  CHECK_EQ(node.op(), "FloorDiv");
+  CheckInputsCount(node, tf_import_flags, 2);
+  auto* op = new FloorDivOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertFloorModOperator(const NodeDef& node,
+                             const TensorFlowImportFlags& tf_import_flags,
+                             Model* model) {
+  CHECK(node.op() == "FloorMod");
+  CheckInputsCount(node, tf_import_flags, 2);
+  auto* op = new FloorModOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertRangeOperator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
+  CHECK_EQ(node.op(), "Range");
+  CheckInputsCount(node, tf_import_flags, 3);
+  auto* op = new RangeOperator;
+  if (HasAttr(node, "Tidx")) {
+    const auto dtype = toco::GetDataTypeAttr(node, "Tidx");
+    CHECK(dtype == DT_UINT8 || dtype == DT_INT32 || dtype == DT_INT64 ||
+          dtype == DT_FLOAT);
+    op->dtype = ConvertDataType(dtype);
+  }
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->inputs.push_back(node.input(2));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertRankOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
+  CHECK_EQ(node.op(), "Rank");
+  CheckInputsCount(node, tf_import_flags, 1);
+  auto* op = new RankOperator;
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertStackOperator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
+  CHECK((node.op() == "Stack") || (node.op() == "Pack"));
+  auto* op = new StackOperator;
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
+  QCHECK_GE(num_inputs, 1)
+      << node.op()
+      << " node expects at least 1 input other than control dependencies: "
+      << node.DebugString();
+  CHECK_EQ(num_inputs, GetIntAttr(node, "N"));
+  for (int i = 0; i < num_inputs; ++i) {
+    op->inputs.push_back(node.input(i));
+  }
+  // Both "Stack" and "Pack" have the "axis" attribute.
+  op->axis = GetIntAttr(node, "axis");
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertTransposeOperator(const NodeDef& node,
+                              const TensorFlowImportFlags& tf_import_flags,
+                              Model* model) {
+  CHECK_EQ(node.op(), "Transpose");
+  CheckInputsCount(node, tf_import_flags, 2);
+  auto* op = new TransposeOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+// Some TensorFlow ops only occur in graph cycles, representing
+// control flow. We do not currently support control flow, so we wouldn't
+// be able to fully support such graphs, including performing inference,
+// anyway. However, rather than erroring out early on graphs being cyclic,
+// it helps to at least support these just enough to allow getting a
+// graph visualization. This is not trivial, as we require graphs to be
+// acyclic aside from RNN back-edges. The solution is to special-case
+// such ops as RNN back-edges, which is technically incorrect (does not
+// allow representing the op's semantics) but good enough to get a
+// graph visualization.
+void ConvertOperatorSpecialCasedAsRNNBackEdge(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  // At the moment, the only type of operator special-cased in this way is
+  // NextIteration, occurring only in control-flow cycles.
+  CHECK_EQ(node.op(), "NextIteration");
+  CHECK_EQ(node.input_size(), 1);
+  auto* rnn_state = model->flags.add_rnn_states();
+  // This RNN state is not explicitly created by the user, so it's
+  // OK for some later graph transformation to discard it.
+  rnn_state->set_discardable(true);
+  rnn_state->set_state_array(node.name());
+  rnn_state->set_back_edge_source_array(node.input(0));
+}
+
 void StripCaretFromArrayNames(Model* model) {
   for (auto& op : model->operators) {
     for (auto& input : op->inputs) {
@@ -1389,7 +1686,7 @@ void StripCaretFromArrayNames(Model* model) {
       output = string(absl::StripPrefix(output, "^"));
     }
   }
-  for (auto& array : model->arrays) {
+  for (auto& array : model->GetArrayMap()) {
     if (absl::StartsWith(array.first, "^")) {
       LOG(FATAL) << "What?";
     }
@@ -1402,26 +1699,61 @@ void StripZeroOutputIndexFromInputs(NodeDef* node) {
   }
 }
 
-void AddExtraOutputsFedIntoOtherOps(Model* model) {
+// In TensorFlow GraphDef, when a node has multiple outputs, they are named
+// name:0, name:1, ...
+// where 'name' is the node's name(). Just 'name' is an equivalent shorthand
+// form for name:0.
+// A TensorFlow GraphDef does not explicitly list all the outputs of each node
+// (unlike inputs), it being implied by the node's name and operator type
+// (the latter implies the number of outputs).
+// This makes it non-trivial for us to reconstruct the list of all arrays
+// present in the graph and, for each operator, the list of its outputs.
+// We do that by taking advantage of the fact that
+// at least each node lists explicitly its inputs, so after we've loaded
+// all nodes, we can use that information.
+void AddExtraOutputs(Model* model) {
+  // Construct the list of all arrays consumed by anything in the graph.
+  std::vector<string> consumed_arrays;
+  // Add arrays consumed by an op.
   for (const auto& consumer_op : model->operators) {
     for (const string& input : consumer_op->inputs) {
-      const std::vector<string>& split = absl::StrSplit(input, ':');
-      if (split.size() != 2) {
-        continue;
-      }
-      int output_index = 0;
-      if (!absl::SimpleAtoi(split[1], &output_index)) {
-        continue;
-      }
-      auto* producer_op = GetOpWithOutput(*model, split[0]);
-      if (!producer_op) {
-        continue;
-      }
-      while (producer_op->outputs.size() <= output_index) {
-        using toco::port::StringF;
-        producer_op->outputs.push_back(
-            StringF("%s:%d", split[0], producer_op->outputs.size()));
-      }
+      consumed_arrays.push_back(input);
+    }
+  }
+  // Add global outputs of the model.
+  for (const string& output_array : model->flags.output_arrays()) {
+    consumed_arrays.push_back(output_array);
+  }
+  // Add arrays consumed by a RNN back-edge.
+  for (const auto& rnn_state : model->flags.rnn_states()) {
+    consumed_arrays.push_back(rnn_state.back_edge_source_array());
+  }
+  // Now add operator outputs so that all arrays that are consumed,
+  // are produced.
+  for (const string& consumed_array : consumed_arrays) {
+    // Split the consumed array name into the form name:output_index.
+    const std::vector<string>& split = absl::StrSplit(consumed_array, ':');
+    // If not of the form name:output_index, then this is not an additional
+    // output of a node with multiple outputs, so nothing to do here.
+    if (split.size() != 2) {
+      continue;
+    }
+    int output_index = 0;
+    if (!absl::SimpleAtoi(split[1], &output_index)) {
+      continue;
+    }
+    // Each op is initially recorded as producing at least the array that
+    // has its name. We use that to identify the producer node.
+    auto* producer_op = GetOpWithOutput(*model, split[0]);
+    if (!producer_op) {
+      continue;
+    }
+    // Add extra outputs to that producer node, all the way to the
+    // output_index.
+    while (producer_op->outputs.size() <= output_index) {
+      using toco::port::StringF;
+      producer_op->outputs.push_back(
+          StringF("%s:%d", split[0], producer_op->outputs.size()));
     }
   }
 }
@@ -1461,11 +1793,12 @@ bool InlineAllFunctions(GraphDef* graphdef) {
   flr = pflr.GetFLR("/job:localhost/replica:0/task:0/cpu:0");
 
   tensorflow::Graph graph(fld);
-  tensorflow::GraphConstructorOptions gc_opts;
-  const auto& tf_convert_status =
-      tensorflow::ConvertGraphDefToGraph(gc_opts, graphdef_copy, &graph);
+  tensorflow::ImportGraphDefOptions gc_opts;
+  gc_opts.validate_shape = false;
+  const auto& tf_convert_status = tensorflow::ImportGraphDef(
+      gc_opts, graphdef_copy, &graph, nullptr, nullptr);
   if (!tf_convert_status.ok()) {
-    LOG(ERROR) << "tensorflow::ConvertGraphDefToGraph failed with status: "
+    LOG(ERROR) << "tensorflow::ImportGraphDef failed with status: "
                << tf_convert_status.ToString();
     return false;
   }
@@ -1514,6 +1847,8 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
       ConvertConstOperator(node, tf_import_flags, model);
     } else if (node.op() == "Conv2D") {
       ConvertConvOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Conv2DBackpropInput") {
+      ConvertTransposeConvOperator(node, tf_import_flags, model);
     } else if (node.op() == "DepthwiseConv2dNative") {
       ConvertDepthwiseConvOperator(node, tf_import_flags, model);
     } else if (node.op() == "DepthToSpace") {
@@ -1536,16 +1871,21 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
       ConvertAvgPoolOperator(node, tf_import_flags, model);
     } else if (node.op() == "Reshape") {
       ConvertReshapeOperator(node, tf_import_flags, model);
+    } else if (node.op() == "BatchMatMul") {
+      ConvertBatchMatMulOperator(node, tf_import_flags, model);
     } else if (node.op() == "MatMul") {
       ConvertMatMulOperator(node, tf_import_flags, model);
     } else if (node.op() == "Div" || node.op() == "RealDiv") {
       ConvertDivOperator(node, tf_import_flags, model);
-    } else if (node.op() == "Identity" || node.op() == "CheckNumerics") {
+    } else if (node.op() == "Identity" || node.op() == "CheckNumerics" ||
+               node.op() == "StopGradient") {
       ConvertIdentityOperator(node, tf_import_flags, model);
     } else if (node.op() == "FakeQuantWithMinMaxVars") {
       ConvertFakeQuantWithMinMaxVars(node, tf_import_flags, model);
     } else if (node.op() == "FakeQuantWithMinMaxArgs") {
       ConvertFakeQuantWithMinMaxArgs(node, tf_import_flags, model);
+    } else if (node.op() == "Neg") {
+      ConvertNegOperator(node, tf_import_flags, model);
     } else if (node.op() == "Rsqrt") {
       ConvertRsqrtOperator(node, tf_import_flags, model);
     } else if (node.op() == "Squeeze") {
@@ -1556,6 +1896,8 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
       ConvertSquareOperator(node, tf_import_flags, model);
     } else if (node.op() == "Add") {
       ConvertAddOperator(node, tf_import_flags, model);
+    } else if (node.op() == "AddN") {
+      ConvertAddNOperator(node, tf_import_flags, model);
     } else if (node.op() == "Mul") {
       ConvertMulOperator(node, tf_import_flags, model);
     } else if (node.op() == "Sub") {
@@ -1570,6 +1912,8 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
       ConvertLRNOperator(node, tf_import_flags, model);
     } else if (node.op() == "Softmax") {
       ConvertSoftmaxOperator(node, tf_import_flags, model);
+    } else if (node.op() == "LogSoftmax") {
+      ConvertLogSoftmaxOperator(node, tf_import_flags, model);
     } else if (node.op() == "All") {
       ConvertAllOperator(node, tf_import_flags, model);
     } else if (node.op() == "Assert") {
@@ -1633,6 +1977,28 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
       ConvertMeanOperator(node, tf_import_flags, model);
     } else if (node.op() == "Svdf") {
       ConvertSvdfOperator(node, tf_import_flags, model);
+    } else if (node.op() == "NextIteration") {
+      ConvertOperatorSpecialCasedAsRNNBackEdge(node, tf_import_flags, model);
+    } else if (node.op() == "ExpandDims") {
+      ConvertExpandDimsOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Fill") {
+      ConvertFillOperator(node, tf_import_flags, model);
+    } else if (node.op() == "FloorDiv") {
+      ConvertFloorDivOperator(node, tf_import_flags, model);
+    } else if (node.op() == "FloorMod") {
+      ConvertFloorModOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Range") {
+      ConvertRangeOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Rank") {
+      ConvertRankOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Stack" || node.op() == "Pack") {
+      ConvertStackOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Transpose") {
+      ConvertTransposeOperator(node, tf_import_flags, model);
+    } else if (node.op() == "ArgMax") {
+      ConvertArgMaxOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Exp") {
+      ConvertExpOperator(node, tf_import_flags, model);
     } else {
       ConvertUnsupportedOperator(node, tf_import_flags, model);
     }
@@ -1641,7 +2007,7 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
   ResolveModelFlags(model_flags, model);
 
   StripCaretFromArrayNames(model);
-  AddExtraOutputsFedIntoOtherOps(model);
+  AddExtraOutputs(model);
   FixNoMissingArray(model);
   FixNoOrphanedArray(model);
   FixOperatorOrdering(model);
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.h b/tensorflow/contrib/lite/toco/import_tensorflow.h
index 312e3b8f17cfaa012bf25696937f97d396802bb2..2177872334bfec6147f865be1518e440c2c636ea 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.h
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_IMPORT_TENSORFLOW_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_IMPORT_TENSORFLOW_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_IMPORT_TENSORFLOW_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_IMPORT_TENSORFLOW_H_
 
 #include <memory>
 #include <string>
@@ -39,4 +39,4 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_IMPORT_TENSORFLOW_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_IMPORT_TENSORFLOW_H_
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 04b081352340b2ba14754fd2a4fea8894d7ad4fb..4c44f3fd66d8e733a59c8087faa012244cc434d0 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
 
+#include <functional>
 #include <initializer_list>
 #include <memory>
 #include <string>
@@ -32,7 +33,9 @@ enum class OperatorType {
   kNone,
   // General-purpose neural network operators.
   kAdd,
+  kAddN,
   kAveragePool,
+  kBatchMatMul,
   kBatchNormalization,
   kConv,
   kConcatenation,
@@ -41,6 +44,11 @@ enum class OperatorType {
   kSpaceToDepth,
   kDequantize,
   kDiv,
+  kExp,
+  kExpandDims,
+  kFill,
+  kFloorDiv,
+  kFloorMod,
   kFullyConnected,
   kL2Normalization,
   kL2Pool,
@@ -50,23 +58,29 @@ enum class OperatorType {
   kMaxPool,
   kFakeQuant,
   kMul,
+  kRange,
+  kRank,
   kRelu,
   kRelu1,
   kRelu6,
   kSoftmax,
+  kLogSoftmax,
   kSub,
   kTanh,
+  kTransposeConv,
   kCast,
   kFloor,
   kGather,
   kResizeBilinear,
   kSpaceToBatchND,
+  kStack,
   kBatchToSpaceND,
   kPad,
   kStridedSlice,
   kSlice,
   kSqueeze,
   kMean,
+  kArgMax,
   // The SVDF Op is a decomposition of a densely connected Op into
   // low rank filters. For details:
   // https://research.google.com/pubs/pub43813.html
@@ -89,6 +103,7 @@ enum class OperatorType {
   kTensorFlowMinimum,
   kTensorFlowMatMul,
   kTensorFlowMerge,
+  kNeg,
   kTensorFlowReshape,
   kTensorFlowRsqrt,
   kTensorFlowShape,
@@ -98,6 +113,7 @@ enum class OperatorType {
   kTensorFlowSum,
   kTensorFlowSwitch,
   kTensorFlowTile,
+  kTranspose,
   // An unsupported TF operation. It's only needed to be able to represent TF
   // graph internally and is expected to be dropped by graph transformations.
   kTensorFlowUnsupported,
@@ -142,7 +158,20 @@ enum class AxesOrder {
 // because we'll be dropping the array anyway (e.g. some exotic array types
 // may be involved only in debug-only subgraphs that we may not be interested
 // in actually supporting).
-enum class ArrayDataType { kNone, kBool, kFloat, kUint8, kInt32, kInt64 };
+enum class ArrayDataType {
+  kNone,
+  kBool,
+  kFloat,
+  kInt8,
+  kUint8,
+  kInt16,
+  kUint16,
+  kInt32,
+  kUint32,
+  kInt64,
+  kUint64,
+  kString
+};
 
 // Compile-time logic to map ArrayDataType to the corresponding C++ scalar type
 template <ArrayDataType A>
@@ -160,17 +189,41 @@ struct DataTypeImpl<ArrayDataType::kFloat> {
   typedef float Type;
 };
 template <>
+struct DataTypeImpl<ArrayDataType::kInt8> {
+  typedef int8 Type;
+};
+template <>
 struct DataTypeImpl<ArrayDataType::kUint8> {
   typedef uint8 Type;
 };
 template <>
+struct DataTypeImpl<ArrayDataType::kInt16> {
+  typedef int16 Type;
+};
+template <>
+struct DataTypeImpl<ArrayDataType::kUint16> {
+  typedef uint16 Type;
+};
+template <>
 struct DataTypeImpl<ArrayDataType::kInt32> {
   typedef int32 Type;
 };
 template <>
+struct DataTypeImpl<ArrayDataType::kUint32> {
+  typedef uint32 Type;
+};
+template <>
 struct DataTypeImpl<ArrayDataType::kInt64> {
   typedef int64 Type;
 };
+template <>
+struct DataTypeImpl<ArrayDataType::kUint64> {
+  typedef uint64 Type;
+};
+template <>
+struct DataTypeImpl<ArrayDataType::kString> {
+  typedef string Type;
+};
 
 template <ArrayDataType A>
 using DataType = typename DataTypeImpl<A>::Type;
@@ -302,6 +355,10 @@ struct ConvOperator : Operator {
   Padding padding;
   int stride_width = 0;
   int stride_height = 0;
+  // A dilation_rate of 0 is invalid and this field is an optional attribute.
+  // Thus initializing it to 1 to allow default conv behavior when the
+  // attribute is not present.
+  int dilation_rate = 1;
 };
 
 // Depthwise-separable convolution operator.
@@ -532,8 +589,18 @@ struct AddOperator : Operator {
   AddOperator() : Operator(OperatorType::kAdd) {}
 };
 
+// Element-wise addition operator for N inputs.
+//
+// Inputs:
+//   inputs[i]: The i-th array to add together to form the output.
+//
+// TensorFlow equivalent: AddN
+struct AddNOperator : Operator {
+  AddNOperator() : Operator(OperatorType::kAddN) {}
+};
+
 // Concatenation operator: concatenates its inputs
-// along the concat_dim dimension.
+// along the axis.
 //
 // Inputs: this operator accepts any number >= 1 of inputs.
 //   inputs[i]: the i-th array to concatenate.
@@ -541,7 +608,7 @@ struct AddOperator : Operator {
 // TensorFlow equivalent: Concat.
 struct ConcatenationOperator : Operator {
   ConcatenationOperator() : Operator(OperatorType::kConcatenation) {}
-  int concat_dim = 0;
+  int axis = 0;
 };
 
 // Reordering dimensions. Used only during tooling to transform graphs from
@@ -673,6 +740,19 @@ struct TensorFlowIdentityOperator : Operator {
   TensorFlowIdentityOperator() : Operator(OperatorType::kTensorFlowIdentity) {}
 };
 
+// Batch matrix multiplication operator. This comes from the (deprecated)
+// tf.batch_matmul or a tf.matmul that has rank 3. dims(0) is the batch count
+// and it can be trivially unrolled into a series of matmuls on each element.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side matrix
+//   inputs[1]: required: the right-hand side matrix
+//
+// TensorFlow equivalent: MatMul
+struct BatchMatMulOperator : Operator {
+  BatchMatMulOperator() : Operator(OperatorType::kBatchMatMul) {}
+};
+
 // General matrix multiplication operator. We don't want to support general
 // matrix multiplication at inference time, so we resolve it during tooling
 // to more specific operator types, namely, FullyConnected.
@@ -711,6 +791,9 @@ struct PadOperator : Operator {
 //
 // Inputs:
 //   inputs[0]: required: the input array
+//   inputs[1]: required: the begin array
+//   inputs[2]: required: the end array
+//   inputs[3]: optional: the strides array
 //
 // TensorFlow equivalent: StridedSlice
 struct StridedSliceOperator : Operator {
@@ -754,6 +837,123 @@ struct SqueezeOperator : Operator {
   std::vector<int> squeeze_dims;
 };
 
+// Inputs:
+//   inputs[0]: required: the input activations array
+//   inputs[1]: required: the Conv weights
+//   channel.
+//
+// Outputs:
+//   outputs[0]: required: the output activations array
+//
+// TensorFlow equivalent: Conv2DBackpropInput
+struct TransposeConvOperator : Operator {
+  TransposeConvOperator() : Operator(OperatorType::kTransposeConv) {}
+  Padding padding;
+  int stride_width = 0;
+  int stride_height = 0;
+};
+
+// Given a tensor input, this operation calculates element-wise exponential
+// (y = e^x).
+//
+// Inputs:
+//   inputs[0]: required: input tensor
+//
+// TensorFlow equivalent: Exp
+struct ExpOperator : Operator {
+  ExpOperator() : Operator(OperatorType::kExp) {}
+};
+
+// Given a tensor input, this operation inserts a dimension of 1 at the
+// dimension index axis of input's shape. The dimension index axis starts at
+// zero; if you specify a negative number for axis it is counted backward from
+// the end.
+//
+// Inputs:
+//   inputs[0]: required: input tensor
+//   inputs[1]: required: 0-D (scalar). Specifies the dimension index at which
+//   to expand the shape of input
+//
+// TensorFlow equivalent: ExpandDims
+struct ExpandDimsOperator : Operator {
+  ExpandDimsOperator() : Operator(OperatorType::kExpandDims) {}
+};
+
+// Ceates a tensor of shape dims and fills it with the given scalar value.
+// Output type will be the same as the given scalar value.
+//
+// Inputs:
+//   inputs[0]: required: 1-D (int32) - the shape of the output tensor
+//   inputs[1]: required: 0-D (scalar) - value to fill the tensor with
+//
+// TensorFlow equivalent: Fill
+struct FillOperator : Operator {
+  FillOperator() : Operator(OperatorType::kFill) {}
+};
+
+// Element-wise floor division operator.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: FloorDiv
+struct FloorDivOperator : Operator {
+  FloorDivOperator() : Operator(OperatorType::kFloorDiv) {}
+};
+
+// Element-wise floor mod operator.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: FloorMod
+struct FloorModOperator : Operator {
+  FloorModOperator() : Operator(OperatorType::kFloorMod) {}
+};
+
+// Creates a sequence of numbers that begins at start and extends by increments
+// of delta up to but not including limit.
+//
+// The dtype of the resulting tensor is inferred from the inputs unless it is
+// provided explicitly.
+//
+// Inputs:
+//   inputs[0]: required: the start
+//   inputs[1]: required: the limit
+//   inputs[2]: required: the delta
+//
+// TensorFlow equivalent: Range
+struct RangeOperator : Operator {
+  RangeOperator() : Operator(OperatorType::kRange) {}
+  ArrayDataType dtype = ArrayDataType::kNone;
+};
+
+// Rank operator. Extracts the rank of the tensor.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// This operation outputs a 0-D integer tensor representing the rank of
+// the input.
+//
+// TensorFlow equivalent: Rank.  We currently assume that the output is int32
+// and not int64.  The output type could be stored herein.
+struct RankOperator : Operator {
+  RankOperator() : Operator(OperatorType::kRank) {}
+};
+
+// Element-wise negation (-x) operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Neg
+struct NegOperator : Operator {
+  NegOperator() : Operator(OperatorType::kNeg) {}
+};
+
 // Element-wise reciprocal-square-root (x^-0.5) operator.
 //
 // Inputs:
@@ -764,6 +964,21 @@ struct TensorFlowRsqrtOperator : Operator {
   TensorFlowRsqrtOperator() : Operator(OperatorType::kTensorFlowRsqrt) {}
 };
 
+// Stacks a list of rank-R tensors into one rank-(R+1) tensor.
+//
+// Packs the list of tensors in values into a tensor with rank one higher than
+// each tensor in values, by packing them along the axis dimension. Given a list
+// of length N of tensors of shape (A, B, C);.
+//
+// Inputs: this operator accepts any number >= 1 of inputs.
+//   inputs[i]: the i-th array to merge.
+//
+// TensorFlow equivalent: Stack or Pack
+struct StackOperator : Operator {
+  StackOperator() : Operator(OperatorType::kStack) {}
+  int axis = 0;
+};
+
 // Shape operator. Extracts the shape of the tensor.
 //
 // Inputs:
@@ -798,6 +1013,20 @@ struct TensorFlowSquareOperator : Operator {
   TensorFlowSquareOperator() : Operator(OperatorType::kTensorFlowSquare) {}
 };
 
+// Transposes a tensor.
+//
+// By default, this operation performs a regular matrix transpose on 2-D input
+// tensors.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Transpose
+struct TransposeOperator : Operator {
+  TransposeOperator() : Operator(OperatorType::kTranspose) {}
+  std::vector<int> perm;
+};
+
 // Element-wise subtraction operator.
 //
 // Inputs:
@@ -1039,6 +1268,16 @@ struct SoftmaxOperator : Operator {
   float beta = 0.f;
 };
 
+// LogSoftmax activation function.
+//
+// Inputs:
+//   inputs[0]: required: the logits input array
+//
+// TensorFlow equivalent: LogSoftmax
+struct LogSoftmaxOperator : Operator {
+  LogSoftmaxOperator() : Operator(OperatorType::kLogSoftmax) {}
+};
+
 // Cast operator.
 //
 // Inputs:
@@ -1071,7 +1310,19 @@ struct FloorOperator : Operator {
 // TensorFlow equivalent: Gather
 struct GatherOperator : Operator {
   GatherOperator() : Operator(OperatorType::kGather) {}
-  int input_rank;
+  int axis = 0;
+  int input_rank = 0;
+};
+
+// ArgMax operator. It returns the index of the maximum value along axis.
+//
+// Inputs:
+//   inputs[0]: required: the input tensor
+//
+// TensorFlow equivalent: ArgMax
+struct ArgMaxOperator : Operator {
+  ArgMaxOperator() : Operator(OperatorType::kArgMax) {}
+  ArrayDataType output_data_type = ArrayDataType::kInt64;
 };
 
 // ResizeBilinear operator. It resizes input images with bilinear interpolation.
@@ -1084,6 +1335,8 @@ struct GatherOperator : Operator {
 // TensorFlow equivalent: ResizeBilinear
 struct ResizeBilinearOperator : Operator {
   ResizeBilinearOperator() : Operator(OperatorType::kResizeBilinear) {}
+
+  bool align_corners = false;
 };
 
 // SpaceToBatchND operator. It divides spatial dimensions into a grid of
@@ -1098,6 +1351,10 @@ struct ResizeBilinearOperator : Operator {
 // TensorFlow equivalent: SpaceToBatchND
 struct SpaceToBatchNDOperator : Operator {
   SpaceToBatchNDOperator() : Operator(OperatorType::kSpaceToBatchND) {}
+
+  std::vector<int> block_shape;
+  std::vector<int> before_paddings;
+  std::vector<int> after_paddings;
 };
 
 // BatchToSpaceND operator. Rearranges data from batch into blocks of
@@ -1112,6 +1369,10 @@ struct SpaceToBatchNDOperator : Operator {
 // TensorFlow equivalent: BatchToSpaceND
 struct BatchToSpaceNDOperator : Operator {
   BatchToSpaceNDOperator() : Operator(OperatorType::kBatchToSpaceND) {}
+
+  std::vector<int> block_shape;
+  std::vector<int> before_crops;
+  std::vector<int> after_crops;
 };
 
 // Mean operator.
@@ -1123,7 +1384,7 @@ struct BatchToSpaceNDOperator : Operator {
 struct MeanOperator : Operator {
   MeanOperator() : Operator(OperatorType::kMean) {}
 
-  std::vector<int> reduction_indices;
+  std::vector<int> axis;
   bool keep_dims = false;
 };
 
@@ -1339,29 +1600,54 @@ struct Array {
 
 // Our Model struct, represents an entire model (our "top-level" struct).
 // Owns everything.
-struct Model {
+class Model {
+ public:
+  using ArrayMap = std::unordered_map<string, std::unique_ptr<Array>>;
+
+  bool HasArray(const string& name) const { return arrays.count(name) > 0; }
   Array& GetArray(const string& name) const {
-    DCHECK(arrays.count(name));
+    DCHECK(HasArray(name)) << "Array not found: " << name;
     return *arrays.at(name);
   }
   Array& GetOrCreateArray(const string& name) {
-    if (!arrays.count(name)) {
+    // Make sure name is not used by an optional array
+    DCHECK(!optional_arrays.count(name));
+    if (!HasArray(name)) {
       Array* ptr = new Array;
       arrays[name] = std::unique_ptr<Array>(ptr);
     }
     Array& result = GetArray(name);
     return result;
   }
+  void CreateOptionalArray(const string& name) {
+    DCHECK(!arrays.count(name) && !optional_arrays.count(name));
+    optional_arrays.insert(name);
+  }
+  bool IsOptionalArray(const string& name) const {
+    return optional_arrays.count(name);
+  }
+
+  // Note that this invalidates all array iterators.
+  void EraseArray(const string& name) { arrays.erase(name); }
+  void EraseArrays(std::function<bool(const string&)> discardable) {
+    for (auto it = arrays.begin(); it != arrays.end();) {
+      if (discardable(it->first)) {
+        it = arrays.erase(it);
+      } else {
+        ++it;
+      }
+    }
+  }
+  const ArrayMap& GetArrayMap() const { return arrays; }
+
+  // Optional arrays are used for optional tensors,
+  // these tensors do not have data, but with reserved names as op inputs.
+  std::set<string> optional_arrays;
 
   // The list of operators. Notice how it's a list of unique_ptr's, implying
   // that the Model is what owns Operator's and keeps them alive.
   std::vector<std::unique_ptr<Operator>> operators;
-  // The associative array mapping names to Array's.
-  // Notice how it's a container of unique_ptr's, implying
-  // that the Model is what owns Array's and keeps them alive.
-  // The Operator's refer to these Array's by their name strings, not by their
-  // addresses. See Operator::inputs, Operator::outputs.
-  std::unordered_map<string, std::unique_ptr<Array>> arrays;
+
   // Generic flags, a place where we combine information passed to us via
   // command-line parameters (e.g. --input_width=N) with information that
   // we may or may not find in the input model file.
@@ -1370,7 +1656,15 @@ struct Model {
   std::size_t transient_data_size = 0;
   // For code-generation only: required alignment of the transient_data buffer
   std::size_t transient_data_alignment = 0;
+
+ private:
+  // The associative array mapping names to Array's.
+  // Notice how it's a container of unique_ptr's, implying
+  // that the Model is what owns Array's and keeps them alive.
+  // The Operator's refer to these Array's by their name strings, not by their
+  // addresses. See Operator::inputs, Operator::outputs.
+  std::unordered_map<string, std::unique_ptr<Array>> arrays;
 };
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
index dde602e1868dc865ae7b37e7fa11985f013450de..4e2dec15a534607ef9207149a2e6061069eabcb1 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
@@ -17,7 +17,6 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "absl/strings/ascii.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_join.h"
 #include "absl/strings/str_split.h"
@@ -28,6 +27,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/toco/toco_port.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/command_line_flags.h"
+
 // "batch" flag only exists internally
 #ifdef PLATFORM_GOOGLE
 #include "base/commandlineflags.h"
@@ -134,6 +134,26 @@ bool ParseModelFlagsFromCommandLineFlags(
            parsed_flags.dump_graphviz_video.default_value(),
            "If true, will dump graphviz at each "
            "graph transformation, which may be used to generate a video."),
+      Flag("allow_nonexistent_arrays",
+           parsed_flags.allow_nonexistent_arrays.bind(),
+           parsed_flags.allow_nonexistent_arrays.default_value(),
+           "If true, will allow passing inexistent arrays in --input_arrays "
+           "and --output_arrays. This makes little sense, is only useful to "
+           "more easily get graph visualizations."),
+      Flag("allow_nonascii_arrays", parsed_flags.allow_nonascii_arrays.bind(),
+           parsed_flags.allow_nonascii_arrays.default_value(),
+           "If true, will allow passing non-ascii-printable characters in "
+           "--input_arrays and --output_arrays. By default (if false), only "
+           "ascii printable characters are allowed, i.e. character codes "
+           "ranging from 32 to 127. This is disallowed by default so as to "
+           "catch common copy-and-paste issues where invisible unicode "
+           "characters are unwittingly added to these strings."),
+      Flag(
+          "arrays_extra_info_file", parsed_flags.arrays_extra_info_file.bind(),
+          parsed_flags.arrays_extra_info_file.default_value(),
+          "Path to an optional file containing a serialized ArraysExtraInfo "
+          "proto allowing to pass extra information about arrays not specified "
+          "in the input model file, such as extra MinMax information."),
   };
   bool asked_for_help =
       *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help"));
@@ -265,10 +285,10 @@ void ReadModelFlagsFromCommandLineFlags(
       model_flags->add_input_arrays();
     }
     auto* shape = model_flags->mutable_input_arrays(0)->mutable_shape();
-    shape->Clear();
+    shape->clear_dims();
     const IntList& list = parsed_model_flags.input_shape.value();
     for (auto& dim : list.elements) {
-      shape->Add(dim);
+      shape->add_dims(dim);
     }
   }
   if (parsed_model_flags.input_shapes.specified()) {
@@ -278,25 +298,12 @@ void ReadModelFlagsFromCommandLineFlags(
     QCHECK(input_shapes.size() == model_flags->input_arrays_size());
     for (int i = 0; i < input_shapes.size(); ++i) {
       auto* shape = model_flags->mutable_input_arrays(i)->mutable_shape();
-      shape->Clear();
-      if (input_shapes[i].empty()) {
-        // empty i.e. 0-dimensional input shape.
-        // Unfortunately, the current toco::InputArray
-        // proto does not allow to distinguish between a known 0-D shape,
-        // and an unknown shape. Indeed, shape is currently a plain array,
-        // and it being empty means unknown shape. So here, we import a
-        // 0-D shape as a 1-D shape of size.
-        // TODO(benoitjacob): fix toco::InputArray to allow 0-D shape,
-        // probably by making shape an optional message,
-        // encapsulating the array.
-        shape->Add(1);
-      } else {
-        for (const auto& dim_str : absl::StrSplit(input_shapes[i], ',')) {
-          int size;
-          CHECK(absl::SimpleAtoi(dim_str, &size))
-              << "Failed to parse input_shape: " << input_shapes[i];
-          shape->Add(size);
-        }
+      shape->clear_dims();
+      for (const auto& dim_str : absl::StrSplit(input_shapes[i], ',')) {
+        int size;
+        CHECK(absl::SimpleAtoi(dim_str, &size))
+            << "Failed to parse input_shape: " << input_shapes[i];
+        shape->add_dims(size);
       }
     }
   }
@@ -326,9 +333,6 @@ void ReadModelFlagsFromCommandLineFlags(
         CHECK(absl::SimpleAtoi(value, &size));
         CHECK_GT(size, 0);
         rnn_state_proto->set_size(size);
-      } else if (key == "manually_create") {
-        CHECK_EQ(absl::AsciiStrToLower(value), "true");
-        rnn_state_proto->set_manually_create(true);
       } else {
         LOG(FATAL) << "Unknown key '" << key << "' in --rnn_states";
       }
@@ -362,6 +366,20 @@ void ReadModelFlagsFromCommandLineFlags(
       }
     }
   }
+
+  model_flags->set_allow_nonascii_arrays(
+      parsed_model_flags.allow_nonascii_arrays.value());
+  model_flags->set_allow_nonexistent_arrays(
+      parsed_model_flags.allow_nonexistent_arrays.value());
+
+  if (parsed_model_flags.arrays_extra_info_file.specified()) {
+    string arrays_extra_info_file_contents;
+    port::file::GetContents(parsed_model_flags.arrays_extra_info_file.value(),
+                            &arrays_extra_info_file_contents,
+                            port::file::Defaults());
+    ParseFromStringEitherTextOrBinary(arrays_extra_info_file_contents,
+                                      model_flags->mutable_arrays_extra_info());
+  }
 }
 
 ParsedModelFlags* UncheckedGlobalParsedModelFlags(bool must_already_exist) {
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.h b/tensorflow/contrib/lite/toco/model_cmdline_flags.h
index 027d7ae1aa62b5b31b8fcebdc29d4f547507b7fe..c868d5c7d0b5a6ee81d99423414c87e4e6e7cf66 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.h
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_CMDLINE_FLAGS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_CMDLINE_FLAGS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_CMDLINE_FLAGS_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_CMDLINE_FLAGS_H_
 
 #include <string>
 #include <unordered_map>
@@ -40,5 +40,4 @@ ParsedModelFlags* GlobalParsedModelFlags();
 
 }  // namespace toco
 
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_CMDLINE_FLAGS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_CMDLINE_FLAGS_H_
diff --git a/tensorflow/contrib/lite/toco/model_flags.proto b/tensorflow/contrib/lite/toco/model_flags.proto
index 5b30904696b5cd71d3acfdeaee3af901c6bee884..e4b39b34e85e4d703c1b41cb68f8139abd1f6279 100644
--- a/tensorflow/contrib/lite/toco/model_flags.proto
+++ b/tensorflow/contrib/lite/toco/model_flags.proto
@@ -16,7 +16,11 @@ import "tensorflow/contrib/lite/toco/types.proto";
 
 package toco;
 
-// Next ID to USE: 6.
+message InputArrayShape {
+  repeated int32 dims = 2;
+}
+
+// Next ID to USE: 7.
 message InputArray {
   // Name of the input arrays, i.e. the arrays from which input activations
   // will be read.
@@ -28,7 +32,7 @@ message InputArray {
   //
   // The last dimension is typically called 'depth' or 'channels'. For example,
   // for an image model taking RGB images as input, this would have the value 3.
-  repeated int32 shape = 2;
+  optional InputArrayShape shape = 6;
 
   // mean_value and std_value parameters control the interpretation of raw input
   // activation values (elements of the input array) as real numbers. The
@@ -73,6 +77,32 @@ message InputArray {
   optional IODataType data_type = 5;
 }
 
+message RnnState {
+  optional string state_array = 1;
+  optional string back_edge_source_array = 2;
+  optional bool discardable = 5;
+  // size allows to specify a 1-D shape for the RNN state array.
+  // Will be expanded with 1's to fit the model.
+  // TODO(benoitjacob): should allow a generic, explicit shape.
+  optional int32 size = 3;
+}
+
+// An ArraysExtraInfo message stores a collection of additional Information
+// about arrays in a model, complementing the information in the model itself.
+// It is intentionally a separate message so that it may be serialized and
+// passed separately from the model. See --arrays_extra_info_file.
+//
+// A typical use case is to manually specify MinMax for specific arrays in a
+// model that does not already contain such MinMax information.
+message ArraysExtraInfo {
+  message Entry {
+    optional string name = 1;
+    optional float min = 2;
+    optional float max = 3;
+  }
+  repeated Entry entries = 1;
+}
+
 // ModelFlags encodes properties of a model that, depending on the file
 // format, may or may not be recorded in the model file. The purpose of
 // representing these properties in ModelFlags is to allow passing them
@@ -94,7 +124,7 @@ message InputArray {
 //   optional int32 input_dims = 11 [ default = 4];
 //   repeated int32 input_shape = 13;
 //
-// Next ID to USE: 16.
+// Next ID to USE: 19.
 message ModelFlags {
   // Information about the input arrays, i.e. the arrays from which input
   // activations will be read.
@@ -108,20 +138,6 @@ message ModelFlags {
   // the 'batch' field: at most one of these two fields can be set.
   optional bool variable_batch = 10;
 
-  message RnnState {
-    optional string state_array = 1;
-    optional string back_edge_source_array = 2;
-    optional int32 size = 3;
-    // TODO(benoitjacob): manually_create is a temporary hack:
-    // due to discrepancies between the current toco dims tracking and
-    // TensorFlow shapes, for some models we need to manually create RNN state
-    // arrays with a specified shape.
-    // Maybe we should actually implement back-edges as operators of their own,
-    // which would remove the need for much special-casing, including here,
-    // we could probably consistently let PropagateFixedSizes handle state
-    // arrays.
-    optional bool manually_create = 4;
-  }
   repeated RnnState rnn_states = 12;
 
   // Checks applied to the model, typically after toco's comprehensive
@@ -138,4 +154,21 @@ message ModelFlags {
     optional int32 count_max = 3 [default = -1];
   }
   repeated ModelCheck model_checks = 14;
+
+  // If true, will allow passing inexistent arrays in --input_arrays
+  // and --output_arrays. This makes little sense, is only useful to
+  // more easily get graph visualizations.
+  optional bool allow_nonexistent_arrays = 16;
+
+  // If true, will allow passing non-ascii-printable characters in
+  // --input_arrays and --output_arrays. By default (if false), only
+  // ascii printable characters are allowed, i.e. character codes
+  // ranging from 32 to 127. This is disallowed by default so as to
+  // catch common copy-and-paste issues where invisible unicode
+  // characters are unwittingly added to these strings.
+  optional bool allow_nonascii_arrays = 17;
+
+  // If set, this ArraysExtraInfo allows to pass extra information about arrays
+  // not specified in the input model file, such as extra MinMax information.
+  optional ArraysExtraInfo arrays_extra_info = 18;
 }
diff --git a/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py b/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py
index 28d52067a9a19ae240582f578e04776340a0cb2d..c35b6f99259b762aa83d92d21512169a7ab50b70 100644
--- a/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py
+++ b/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py
@@ -53,7 +53,7 @@ class TocoFromProtosTest(googletest.TestCase):
     model_flags = model_flags_pb2.ModelFlags()
     input_array = model_flags.input_arrays.add()
     input_array.name = TensorName(in_tensor)
-    input_array.shape.extend(map(int, in_tensor.get_shape()))
+    input_array.shape.dims.extend(map(int, in_tensor.get_shape()))
     model_flags.output_arrays.append(TensorName(out_tensor))
     # Shell out to run toco (in case it crashes)
     with tempfile.NamedTemporaryFile() as fp_toco, \
diff --git a/tensorflow/contrib/lite/toco/runtime/common.h b/tensorflow/contrib/lite/toco/runtime/common.h
index bd55544f57f9a266514e878edd8f1f7dec1cb7b7..3c6828840c4a963a4a68774ec5d559b7f80baf22 100644
--- a/tensorflow/contrib/lite/toco/runtime/common.h
+++ b/tensorflow/contrib/lite/toco/runtime/common.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_COMMON_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_COMMON_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_COMMON_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_COMMON_H_
 
 #ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
 #ifdef GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
@@ -23,4 +23,4 @@ limitations under the License.
 
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_COMMON_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_COMMON_H_
diff --git a/tensorflow/contrib/lite/toco/runtime/types.h b/tensorflow/contrib/lite/toco/runtime/types.h
index df63b2d59ea2a98f1ec9009614c18791e8822c14..f5de5a5781a5304634642680e6a3cef60e7b844b 100644
--- a/tensorflow/contrib/lite/toco/runtime/types.h
+++ b/tensorflow/contrib/lite/toco/runtime/types.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_TYPES_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_TYPES_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_TYPES_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_TYPES_H_
 
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
 #include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
@@ -29,4 +29,4 @@ using tflite::RequiredBufferSizeForDims;
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_TYPES_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_TYPES_H_
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf_test.cc b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf_test.cc
index 664e828c19dca1117b81113f723416541f48d621..646d048496c27955aa641fd01a35d8acfbd8dd90 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf_test.cc
+++ b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf_test.cc
@@ -103,11 +103,11 @@ class ResolveSvdfTest : public ::testing::Test {
     // Add the float vector as an attribute to the node.
     (*node->mutable_attr())["dtype"].set_type(tensorflow::DT_FLOAT);
     tensorflow::TensorProto* allocated_tensor = new tensorflow::TensorProto;
-    tensorflow::TensorShapeProto* allocated_tesnor_shape =
+    tensorflow::TensorShapeProto* allocated_tensor_shape =
         new tensorflow::TensorShapeProto;
-    auto tensor_shape_dim0 = allocated_tesnor_shape->add_dim();
+    auto tensor_shape_dim0 = allocated_tensor_shape->add_dim();
     tensor_shape_dim0->set_size(values.size());
-    allocated_tensor->set_allocated_tensor_shape(allocated_tesnor_shape);
+    allocated_tensor->set_allocated_tensor_shape(allocated_tensor_shape);
     allocated_tensor->set_tensor_content(
         string(reinterpret_cast<const char*>(values.data()),
                values.size() * sizeof(float)));
@@ -122,11 +122,11 @@ class ResolveSvdfTest : public ::testing::Test {
     // Add the float vector as an attribute to the node.
     (*node->mutable_attr())["dtype"].set_type(tensorflow::DT_INT32);
     tensorflow::TensorProto* allocated_tensor = new tensorflow::TensorProto;
-    tensorflow::TensorShapeProto* allocated_tesnor_shape =
+    tensorflow::TensorShapeProto* allocated_tensor_shape =
         new tensorflow::TensorShapeProto;
-    auto tensor_shape_dim0 = allocated_tesnor_shape->add_dim();
+    auto tensor_shape_dim0 = allocated_tensor_shape->add_dim();
     tensor_shape_dim0->set_size(values.size());
-    allocated_tensor->set_allocated_tensor_shape(allocated_tesnor_shape);
+    allocated_tensor->set_allocated_tensor_shape(allocated_tensor_shape);
     allocated_tensor->set_tensor_content(
         string(reinterpret_cast<const char*>(values.data()),
                values.size() * sizeof(int)));
diff --git a/tensorflow/contrib/lite/toco/tensorflow_util.cc b/tensorflow/contrib/lite/toco/tensorflow_util.cc
index 82e2800ca2f5bb017f91b5bf43d8d3cd05e97b83..0e7e9c41a066581b14fe1b78f83d8d57b916be6c 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_util.cc
+++ b/tensorflow/contrib/lite/toco/tensorflow_util.cc
@@ -51,7 +51,8 @@ void LogDumpGraphDef(int log_level, const string& message,
 BEGIN DUMP OF TENSORFLOW GRAPHDEF (%s)
 There are %d nodes.
 There are %zu different op types:
-)MSG", message, tf_graph.node_size(), ops.size());
+)MSG",
+                      message, tf_graph.node_size(), ops.size());
   for (const auto& op : ops) {
     toco::port::AppendF(&dump, "  %s\n", op);
   }
@@ -63,7 +64,8 @@ PROTO DUMP
 BEGIN NODE: name = %s
   op = %s
   inputs = [
-)MSG", node.name(), node.op());
+)MSG",
+                        node.name(), node.op());
     for (const auto& input : node.input()) {
       toco::port::AppendF(&dump, "    %s\n", input);
     }
diff --git a/tensorflow/contrib/lite/toco/tensorflow_util.h b/tensorflow/contrib/lite/toco/tensorflow_util.h
index 152b4f7a727a88f721f1a63299ea4fa709bb5d52..61f91042685288a48ba19f8c67d4c7c1960a7787 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_util.h
+++ b/tensorflow/contrib/lite/toco/tensorflow_util.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_UTIL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_UTIL_H_
 
 #include <string>
 #include <vector>
@@ -29,4 +29,4 @@ void LogDumpGraphDef(int log_level, const string& message,
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_UTIL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_UTIL_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/BUILD b/tensorflow/contrib/lite/toco/tflite/BUILD
index e910e3957f77fcf28ab379026bae4cc33ed00bc5..a2b8145a67278c3ac0065f9551da6ffd1de60772 100644
--- a/tensorflow/contrib/lite/toco/tflite/BUILD
+++ b/tensorflow/contrib/lite/toco/tflite/BUILD
@@ -1,3 +1,8 @@
+package(
+    # To suppress build cleaner error about inclusion of schema_generate.h.
+    features = ["-layering_check"],
+)
+
 licenses(["notice"])  # Apache 2.0
 
 load(
@@ -22,7 +27,7 @@ cc_library(
         "//tensorflow/contrib/lite/toco:model",
         "//tensorflow/core:protos_all_cc",
         "@com_google_absl//absl/memory",
-        "@flatbuffers//:flatbuffers",
+        "@flatbuffers",
     ],
 )
 
@@ -36,7 +41,7 @@ tf_cc_test(
         "//tensorflow/contrib/lite/toco:tooling_util",
         "//tensorflow/core:protos_all_cc",
         "@com_google_googletest//:gtest_main",
-        "@flatbuffers//:flatbuffers",
+        "@flatbuffers",
     ],
 )
 
@@ -82,7 +87,7 @@ cc_library(
         "//tensorflow/contrib/lite/toco:model",
         "//tensorflow/contrib/lite/toco:tooling_util",
         "@com_google_absl//absl/strings",
-        "@flatbuffers//:flatbuffers",
+        "@flatbuffers",
     ],
 )
 
@@ -93,6 +98,7 @@ tf_cc_test(
     ],
     deps = [
         ":export",
+        "//tensorflow/contrib/lite/schema:schema_fbs",
         "@com_google_googletest//:gtest_main",
     ],
 )
@@ -111,7 +117,8 @@ cc_library(
         ":types",
         "//tensorflow/contrib/lite/schema:schema_fbs",
         "//tensorflow/contrib/lite/toco:model",
-        "@flatbuffers//:flatbuffers",
+        "//tensorflow/contrib/lite/toco:tooling_util",
+        "@flatbuffers",
     ],
 )
 
@@ -125,7 +132,7 @@ tf_cc_test(
         "//tensorflow/contrib/lite:schema_fbs_version",
         "//tensorflow/contrib/lite/schema:schema_fbs",
         "@com_google_googletest//:gtest_main",
-        "@flatbuffers//:flatbuffers",
+        "@flatbuffers",
     ],
 )
 
diff --git a/tensorflow/contrib/lite/toco/tflite/builtin_operator.h b/tensorflow/contrib/lite/toco/tflite/builtin_operator.h
index 93cc79ddb64fbc46a97a47ecdc155a8aabf5c3ef..cfe7ecd9f982618dea3b3a5d02e69e3f15434bc2 100644
--- a/tensorflow/contrib/lite/toco/tflite/builtin_operator.h
+++ b/tensorflow/contrib/lite/toco/tflite/builtin_operator.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_BUILTIN_OPERATOR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_BUILTIN_OPERATOR_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_BUILTIN_OPERATOR_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_BUILTIN_OPERATOR_H_
 
 #include "absl/memory/memory.h"
 #include "tensorflow/contrib/lite/toco/tflite/operator.h"
@@ -71,4 +71,4 @@ class BuiltinOperator : public BaseOperator {
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_BUILTIN_OPERATOR_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_BUILTIN_OPERATOR_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/custom_operator.h b/tensorflow/contrib/lite/toco/tflite/custom_operator.h
index 1a4bfac7d4f684043d2a9ce8fc2c78dd738f4b69..bd5713618ff379be42fd1b76649cfb2cf55b843d 100644
--- a/tensorflow/contrib/lite/toco/tflite/custom_operator.h
+++ b/tensorflow/contrib/lite/toco/tflite/custom_operator.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_CUSTOM_OPERATOR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_CUSTOM_OPERATOR_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_CUSTOM_OPERATOR_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_CUSTOM_OPERATOR_H_
 
 #include "flatbuffers/flexbuffers.h"
 #include "absl/memory/memory.h"
@@ -71,4 +71,4 @@ class CustomOperator : public BaseOperator {
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_CUSTOM_OPERATOR_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_CUSTOM_OPERATOR_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/export.cc b/tensorflow/contrib/lite/toco/tflite/export.cc
index beda710614fd607a2e373582620d24dc3656fcf4..27719599708a7eb14f72a82f8e5d76b3b8af9dc4 100644
--- a/tensorflow/contrib/lite/toco/tflite/export.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export.cc
@@ -26,6 +26,9 @@ namespace toco {
 
 namespace tflite {
 
+using flatbuffers::FlatBufferBuilder;
+using flatbuffers::Offset;
+using flatbuffers::Vector;
 using ::tflite::Buffer;
 using ::tflite::BuiltinOperator;
 using ::tflite::BuiltinOperator_CUSTOM;
@@ -39,9 +42,6 @@ using ::tflite::Operator;
 using ::tflite::OperatorCode;
 using ::tflite::SubGraph;
 using ::tflite::Tensor;
-using flatbuffers::FlatBufferBuilder;
-using flatbuffers::Offset;
-using flatbuffers::Vector;
 
 namespace {
 
@@ -62,7 +62,7 @@ namespace details {
 void LoadTensorsMap(const Model& model, TensorsMap* tensors_map) {
   // First find a list of unique array names.
   std::set<string> names;
-  for (const auto& array_pair : model.arrays) {
+  for (const auto& array_pair : model.GetArrayMap()) {
     names.insert(array_pair.first);
   }
 
@@ -96,7 +96,7 @@ Offset<Vector<Offset<Tensor>>> ExportTensors(
   // tensors in the tensors_map.
   std::map<int, Offset<Tensor>> ordered_tensors;
 
-  for (const auto& array_pair : model.arrays) {
+  for (const auto& array_pair : model.GetArrayMap()) {
     const string& tensor_name = array_pair.first;
     const toco::Array& array = *array_pair.second;
 
@@ -188,19 +188,26 @@ Offset<Vector<Offset<OperatorCode>>> ExportOperatorCodes(
     const details::OperatorKey operator_key = GetOperatorKey(*op);
     int op_index = operators_map.at(operator_key);
 
-    if (ops_by_type.count(op->type) == 0) {
-      LOG(FATAL) << "Unsupported operator: " << HelpfulOperatorTypeName(*op);
+    string name = HelpfulOperatorTypeName(*op);
+    bool is_builtin = false;
+    if (ops_by_type.count(op->type) != 0) {
+      name = ops_by_type.at(op->type)->name();
+      is_builtin = (builtin_ops.count(name) > 0);
     }
 
-    string name = ops_by_type.at(op->type)->name();
-    if (builtin_ops.count(name) > 0) {
+    if (is_builtin) {
       ordered_opcodes[op_index] =
           CreateOperatorCode(*builder, builtin_ops[name], 0);
     } else {
-      // If use the custom operation code if it's available in the OperatorKey.
+      // This could be a kTensorFlowUnsupported, in which case we should be
+      // able to retrieve the original Tensorflow name from the OperatorKey, or
+      // this could be a proper TOCO operator that is completely unknown to TF
+      // Lite.
       if (!operator_key.custom_code.empty()) {
         name = operator_key.custom_code;
       }
+      // Either way, this is an operator that is not supported by TF Lite,
+      // so we output it as a custom op and add it to the error summary.
       if (error_summary) {
         error_summary->insert(name);
       }
@@ -226,23 +233,26 @@ Offset<Vector<Offset<Operator>>> ExportOperators(
   // The operators are in execution order, so we just follow tf.mini order.
   std::vector<Offset<Operator>> op_vector;
   for (const auto& op : model.operators) {
-    if (ops_by_type.count(op->type) == 0) {
-      LOG(FATAL) << "Op type '" << OperatorTypeName(op->type)
-                 << "' not supported";
-    }
-
     std::vector<int32_t> inputs;
     for (const string& input : op->inputs) {
-      inputs.push_back(tensors_map.at(input));
+      // -1 is the ID for optional tensor in TFLite output
+      int id = model.IsOptionalArray(input) ? -1 : tensors_map.at(input);
+      inputs.push_back(id);
     }
-
     std::vector<int32_t> outputs;
     for (const string& output : op->outputs) {
       outputs.push_back(tensors_map.at(output));
     }
 
-    auto options = ops_by_type.at(op->type)->Serialize(*op, builder);
     int op_index = operators_map.at(GetOperatorKey(*op));
+
+    // This is a custom op unless we can find it in ops_by_type, and even then
+    // it could be a custom op (such as kTensorFlowUnsupported).
+
+    auto options = Options::Custom(0);
+    if (ops_by_type.count(op->type) != 0) {
+      options = ops_by_type.at(op->type)->Serialize(*op, builder);
+    }
     // The only supported CustomOptionFormat is FLEXBUFFERS now.
     op_vector.push_back(CreateOperator(
         *builder, op_index, builder->CreateVector(inputs),
diff --git a/tensorflow/contrib/lite/toco/tflite/export.h b/tensorflow/contrib/lite/toco/tflite/export.h
index 44012b7126e17d730ea248551dea2414ad0072d9..8c79cb820015e16847ce48c171e8f6e41f60c319 100644
--- a/tensorflow/contrib/lite/toco/tflite/export.h
+++ b/tensorflow/contrib/lite/toco/tflite/export.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_EXPORT_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_EXPORT_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_EXPORT_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_EXPORT_H_
 
 #include "tensorflow/contrib/lite/toco/model.h"
 
@@ -73,4 +73,4 @@ void LoadOperatorsMap(const Model& model, OperatorsMap* operators_map);
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_EXPORT_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_EXPORT_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/export_test.cc b/tensorflow/contrib/lite/toco/tflite/export_test.cc
index e395645383144f663fa108f05ca9930a56cf26a6..6754372330797ae30230af26a3b478c24ad44005 100644
--- a/tensorflow/contrib/lite/toco/tflite/export_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export_test.cc
@@ -16,12 +16,14 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
 
 namespace toco {
-
 namespace tflite {
 namespace {
 
+using ::testing::ElementsAre;
+
 class ExportTest : public ::testing::Test {
  protected:
   // This is a very simplistic model. We are not interested in testing all the
@@ -31,11 +33,20 @@ class ExportTest : public ::testing::Test {
   void BuildTestModel() {
     input_model_.GetOrCreateArray("tensor_one");
     input_model_.GetOrCreateArray("tensor_two");
-    input_model_.operators.emplace_back(new ConvOperator);
+    {
+      auto* op = new ConvOperator;
+      op->padding.type = PaddingType::kSame;
+      input_model_.operators.emplace_back(op);
+    }
     input_model_.operators.emplace_back(new AddOperator);
-    auto unsupported_operator = new TensorFlowUnsupportedOperator;
-    unsupported_operator->tensorflow_op = "MyCrazyOp";
-    input_model_.operators.emplace_back(unsupported_operator);
+    {
+      auto* op = new TensorFlowUnsupportedOperator;
+      op->tensorflow_op = "MyCrazyOp";
+      input_model_.operators.emplace_back(op);
+    }
+    // Note that Sub is not know to TF Lite, so it gets exported as a custom
+    // op (and no options).
+    input_model_.operators.emplace_back(new SubOperator);
   }
 
   Model input_model_;
@@ -57,13 +68,44 @@ TEST_F(ExportTest, LoadOperatorsMap) {
   details::LoadOperatorsMap(input_model_, &operators);
   EXPECT_EQ(0, operators[details::OperatorKey(OperatorType::kAdd, "")]);
   EXPECT_EQ(1, operators[details::OperatorKey(OperatorType::kConv, "")]);
-  EXPECT_EQ(2, operators[details::OperatorKey(
+  EXPECT_EQ(2, operators[details::OperatorKey(OperatorType::kSub, "")]);
+  EXPECT_EQ(3, operators[details::OperatorKey(
                    OperatorType::kTensorFlowUnsupported, "MyCrazyOp")]);
 }
 
+TEST_F(ExportTest, Export) {
+  BuildTestModel();
+
+  string result;
+  Export(input_model_, true, &result);
+
+  auto* model = ::tflite::GetModel(result.data());
+
+  std::vector<string> names;
+  for (const ::tflite::OperatorCode* opcode : *model->operator_codes()) {
+    if (opcode->builtin_code() != ::tflite::BuiltinOperator_CUSTOM) {
+      names.push_back(string("builtin:") + ::tflite::EnumNameBuiltinOperator(
+                                               opcode->builtin_code()));
+    } else {
+      names.push_back(string("custom:") + opcode->custom_code()->c_str());
+    }
+  }
+
+  EXPECT_THAT(names, ElementsAre("builtin:ADD", "builtin:CONV_2D",
+                                 "builtin:SUB", "custom:MyCrazyOp"));
+
+  std::vector<uint32_t> indices;
+  auto operators = (*model->subgraphs())[0]->operators();
+  EXPECT_EQ(operators->Length(), 4);
+  for (const auto* op : *operators) {
+    indices.push_back(op->opcode_index());
+  }
+
+  EXPECT_THAT(indices, ElementsAre(1, 0, 3, 2));
+}
+
 // TODO(ahentz): tests for tensors, inputs, outpus, opcodes and operators.
 
 }  // namespace
 }  // namespace tflite
-
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tflite/import.cc b/tensorflow/contrib/lite/toco/tflite/import.cc
index bbf201fd288140d990b8f739adcd9244e1196072..5b1ab514b23248cd98e66847185d0e8b9fe2d6aa 100644
--- a/tensorflow/contrib/lite/toco/tflite/import.cc
+++ b/tensorflow/contrib/lite/toco/tflite/import.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
 #include "tensorflow/contrib/lite/toco/tflite/operator.h"
 #include "tensorflow/contrib/lite/toco/tflite/types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
 
 namespace toco {
 
@@ -119,8 +120,16 @@ void ImportOperators(
     auto inputs = input_op->inputs();
     for (int i = 0; i < inputs->Length(); i++) {
       auto input_index = inputs->Get(i);
-      const string& input_name = tensors_table.at(input_index);
-      op->inputs.push_back(input_name);
+      // input_index == -1 indicates optional tensor.
+      if (input_index != -1) {
+        const string& input_name = tensors_table.at(input_index);
+        op->inputs.push_back(input_name);
+      } else {
+        const string& tensor_name =
+            toco::AvailableArrayName(*model, "OptionalTensor");
+        model->CreateOptionalArray(tensor_name);
+        op->inputs.push_back(tensor_name);
+      }
     }
     auto outputs = input_op->outputs();
     for (int i = 0; i < outputs->Length(); i++) {
diff --git a/tensorflow/contrib/lite/toco/tflite/import.h b/tensorflow/contrib/lite/toco/tflite/import.h
index 3c27a2843c47814ad46c8f1bbd77b7afcb324375..280677bae189fa345c2e19f6399a7b9ac7629ab5 100644
--- a/tensorflow/contrib/lite/toco/tflite/import.h
+++ b/tensorflow/contrib/lite/toco/tflite/import.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_IMPORT_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_IMPORT_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_IMPORT_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_IMPORT_H_
 
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
 #include "tensorflow/contrib/lite/toco/model.h"
@@ -46,4 +46,4 @@ void LoadOperatorsTable(const ::tflite::Model &input_model,
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_IMPORT_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_IMPORT_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/import_test.cc b/tensorflow/contrib/lite/toco/tflite/import_test.cc
index 309fa6d7f688ba1dd99a7e6eeda14d513a9e49d4..aad6e780d5eb5c3dbc880906df5053ad231ffd54 100644
--- a/tensorflow/contrib/lite/toco/tflite/import_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/import_test.cc
@@ -114,7 +114,7 @@ TEST_F(ImportTest, Tensors) {
 
   auto model = Import(ModelFlags(), InputModelAsString());
 
-  ASSERT_GT(model->arrays.count("tensor_one"), 0);
+  ASSERT_GT(model->HasArray("tensor_one"), 0);
   Array& a1 = model->GetArray("tensor_one");
   EXPECT_EQ(ArrayDataType::kFloat, a1.data_type);
   EXPECT_THAT(a1.GetBuffer<ArrayDataType::kFloat>().data,
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index 8a33500ddcda67d97e68158ce40d8d7e086a27cc..2583ec0e3467aa1581bf76ac75d67aab468e7ced 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -130,6 +130,80 @@ class Add : public BuiltinOperator<AddOperator, ::tflite::AddOptions,
   }
 };
 
+class SpaceToBatchND
+    : public BuiltinOperator<SpaceToBatchNDOperator,
+                             ::tflite::SpaceToBatchNDOptions,
+                             ::tflite::BuiltinOptions_SpaceToBatchNDOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateSpaceToBatchNDOptions(*builder);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {}
+};
+
+class Sub : public BuiltinOperator<SubOperator, ::tflite::SubOptions,
+                                   ::tflite::BuiltinOptions_SubOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    auto activation_function =
+        ActivationFunction::Serialize(op.fused_activation_function);
+    return ::tflite::CreateSubOptions(*builder, activation_function);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->fused_activation_function =
+        ActivationFunction::Deserialize(options.fused_activation_function());
+  }
+};
+
+class Div : public BuiltinOperator<DivOperator, ::tflite::DivOptions,
+                                   ::tflite::BuiltinOptions_DivOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    auto activation_function =
+        ActivationFunction::Serialize(op.fused_activation_function);
+    return ::tflite::CreateDivOptions(*builder, activation_function);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->fused_activation_function =
+        ActivationFunction::Deserialize(options.fused_activation_function());
+  }
+};
+
+class BatchToSpaceND
+    : public BuiltinOperator<BatchToSpaceNDOperator,
+                             ::tflite::BatchToSpaceNDOptions,
+                             ::tflite::BuiltinOptions_BatchToSpaceNDOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateBatchToSpaceNDOptions(*builder);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {}
+};
+
 class Cast : public CustomOperator<CastOperator> {
  public:
   using CustomOperator::CustomOperator;
@@ -153,12 +227,12 @@ class Concatenation
   flatbuffers::Offset<TfLiteOptions> WriteOptions(
       const TocoOperator& op,
       flatbuffers::FlatBufferBuilder* builder) const override {
-    return ::tflite::CreateConcatenationOptions(*builder, op.concat_dim);
+    return ::tflite::CreateConcatenationOptions(*builder, op.axis);
   }
 
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {
-    op->concat_dim = options.axis();
+    op->axis = options.axis();
   }
 };
 
@@ -211,6 +285,22 @@ class FullyConnected
   }
 };
 
+class Gather : public BuiltinOperator<GatherOperator, ::tflite::GatherOptions,
+                                      ::tflite::BuiltinOptions_GatherOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateGatherOptions(*builder, op.axis);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->axis = options.axis();
+  }
+};
+
 class Svdf : public BuiltinOperator<SvdfOperator, ::tflite::SVDFOptions,
                                     ::tflite::BuiltinOptions_SVDFOptions> {
  public:
@@ -348,6 +438,21 @@ class Mul : public BuiltinOperator<MulOperator, ::tflite::MulOptions,
   }
 };
 
+class Pad : public BuiltinOperator<PadOperator, ::tflite::PadOptions,
+                                   ::tflite::BuiltinOptions_PadOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreatePadOptions(*builder);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {}
+};
+
 class Reshape
     : public BuiltinOperator<TensorFlowReshapeOperator,
                              ::tflite::ReshapeOptions,
@@ -404,6 +509,98 @@ class SpaceToDepth
   }
 };
 
+class Transpose
+    : public BuiltinOperator<TransposeOperator, ::tflite::TransposeOptions,
+                             ::tflite::BuiltinOptions_TransposeOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateTransposeOptions(*builder);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {}
+};
+
+class Lstm : public BuiltinOperator<LstmCellOperator, ::tflite::LSTMOptions,
+                                    ::tflite::BuiltinOptions_LSTMOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    // Current toco converter only supports tanh, no clip.
+    return ::tflite::CreateLSTMOptions(*builder, /*fused_activation_function=*/
+                                       ::tflite::ActivationFunctionType_TANH,
+                                       /*cell_clip=*/0.0,
+                                       /*proj_clip=*/0.0);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    // Only support tanh activation, so check that tflite type is tanh.
+    CHECK(options.fused_activation_function() ==
+          ::tflite::ActivationFunctionType_TANH);
+  }
+};
+
+class Mean : public BuiltinOperator<MeanOperator, ::tflite::MeanOptions,
+                                    ::tflite::BuiltinOptions_MeanOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateMeanOptions(*builder, op.keep_dims);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->keep_dims = options.keep_dims();
+  }
+};
+
+class ResizeBilinear
+    : public BuiltinOperator<ResizeBilinearOperator,
+                             ::tflite::ResizeBilinearOptions,
+                             ::tflite::BuiltinOptions_ResizeBilinearOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateResizeBilinearOptions(*builder, op.align_corners);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->align_corners = options.align_corners();
+  }
+};
+
+class Squeeze
+    : public BuiltinOperator<SqueezeOperator, ::tflite::SqueezeOptions,
+                             ::tflite::BuiltinOptions_SqueezeOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    auto squeeze_dims = builder->CreateVector(op.squeeze_dims);
+    return ::tflite::CreateSqueezeOptions(*builder, squeeze_dims);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->squeeze_dims.insert(op->squeeze_dims.end(),
+                            options.squeeze_dims()->begin(),
+                            options.squeeze_dims()->end());
+  }
+};
+
 class Split : public CustomOperator<TensorFlowSplitOperator> {
  public:
   using CustomOperator::CustomOperator;
@@ -416,6 +613,30 @@ class Split : public CustomOperator<TensorFlowSplitOperator> {
   }
 };
 
+class StridedSlice
+    : public BuiltinOperator<StridedSliceOperator,
+                             ::tflite::StridedSliceOptions,
+                             ::tflite::BuiltinOptions_StridedSliceOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateStridedSliceOptions(
+        *builder, op.begin_mask, op.end_mask, op.ellipsis_mask,
+        op.new_axis_mask, op.shrink_axis_mask);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->begin_mask = options.begin_mask();
+    op->end_mask = options.end_mask();
+    op->ellipsis_mask = options.ellipsis_mask();
+    op->new_axis_mask = options.new_axis_mask();
+    op->shrink_axis_mask = options.shrink_axis_mask();
+  }
+};
+
 class TensorFlowUnsupported : public BaseOperator {
  public:
   using BaseOperator::BaseOperator;
@@ -529,8 +750,16 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
 
   // Builtin Operators.
   ops.emplace_back(new Add(::tflite::BuiltinOperator_ADD, OperatorType::kAdd));
+  ops.emplace_back(new Div(::tflite::BuiltinOperator_DIV, OperatorType::kDiv));
+  ops.emplace_back(new Sub(::tflite::BuiltinOperator_SUB, OperatorType::kSub));
   ops.emplace_back(new AveragePool(::tflite::BuiltinOperator_AVERAGE_POOL_2D,
                                    OperatorType::kAveragePool));
+  ops.emplace_back(
+      new SpaceToBatchND(::tflite::BuiltinOperator_SPACE_TO_BATCH_ND,
+                         OperatorType::kSpaceToBatchND));
+  ops.emplace_back(
+      new BatchToSpaceND(::tflite::BuiltinOperator_BATCH_TO_SPACE_ND,
+                         OperatorType::kBatchToSpaceND));
   ops.emplace_back(new Concatenation(::tflite::BuiltinOperator_CONCATENATION,
                                      OperatorType::kConcatenation));
   ops.emplace_back(
@@ -540,6 +769,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
                                OperatorType::kDepthwiseConv));
   ops.emplace_back(new FullyConnected(::tflite::BuiltinOperator_FULLY_CONNECTED,
                                       OperatorType::kFullyConnected));
+  ops.emplace_back(
+      new Gather(::tflite::BuiltinOperator_GATHER, OperatorType::kGather));
   ops.emplace_back(
       new L2Normalization(::tflite::BuiltinOperator_L2_NORMALIZATION,
                           OperatorType::kL2Normalization));
@@ -551,6 +782,7 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
   ops.emplace_back(new MaxPool(::tflite::BuiltinOperator_MAX_POOL_2D,
                                OperatorType::kMaxPool));
   ops.emplace_back(new Mul(::tflite::BuiltinOperator_MUL, OperatorType::kMul));
+  ops.emplace_back(new Pad(::tflite::BuiltinOperator_PAD, OperatorType::kPad));
   ops.emplace_back(new Reshape(::tflite::BuiltinOperator_RESHAPE,
                                OperatorType::kTensorFlowReshape));
   ops.emplace_back(
@@ -559,6 +791,18 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
                                     OperatorType::kSpaceToDepth));
   ops.emplace_back(
       new Svdf(::tflite::BuiltinOperator_SVDF, OperatorType::kSvdf));
+  ops.emplace_back(new Transpose(::tflite::BuiltinOperator_TRANSPOSE,
+                                 OperatorType::kTranspose));
+  ops.emplace_back(
+      new Mean(::tflite::BuiltinOperator_MEAN, OperatorType::kMean));
+  ops.emplace_back(new ResizeBilinear(::tflite::BuiltinOperator_RESIZE_BILINEAR,
+                                      OperatorType::kResizeBilinear));
+  ops.emplace_back(
+      new Squeeze(::tflite::BuiltinOperator_SQUEEZE, OperatorType::kSqueeze));
+  ops.emplace_back(new StridedSlice(::tflite::BuiltinOperator_STRIDED_SLICE,
+                                    OperatorType::kStridedSlice));
+  ops.emplace_back(
+      new Lstm(::tflite::BuiltinOperator_LSTM, OperatorType::kLstmCell));
 
   // Custom Operators.
   ops.emplace_back(new Cast("CAST", OperatorType::kCast));
@@ -571,30 +815,27 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
 
   // There operators are supported by Toco, but not by TF Lite, and has no
   // attributes.
+  ops.emplace_back(
+      new SimpleOperator<AddNOperator>("ADDN", OperatorType::kAddN));
+  ops.emplace_back(new SimpleOperator<NegOperator>("NEG", OperatorType::kNeg));
   ops.emplace_back(new SimpleOperator<TensorFlowRsqrtOperator>(
       "RSQRT", OperatorType::kTensorFlowRsqrt));
-  ops.emplace_back(
-      new SimpleOperator<TensorFlowRsqrtOperator>("DIV", OperatorType::kDiv));
-
   // Simple Operators.
   ops.emplace_back(new SimpleOperator<DequantizeOperator>(
       "DEQUANTIZE", OperatorType::kDequantize));
   ops.emplace_back(
       new SimpleOperator<FloorOperator>("FLOOR", OperatorType::kFloor));
-  ops.emplace_back(
-      new SimpleOperator<GatherOperator>("GATHER", OperatorType::kGather));
   ops.emplace_back(
       new SimpleOperator<ReluOperator>("RELU", OperatorType::kRelu));
   ops.emplace_back(
-      new SimpleOperator<Relu1Operator>("RELU1", OperatorType::kRelu1));
+      new SimpleOperator<Relu1Operator>("RELU_N1_TO_1", OperatorType::kRelu1));
   ops.emplace_back(
       new SimpleOperator<Relu6Operator>("RELU6", OperatorType::kRelu6));
-  ops.emplace_back(new SimpleOperator<ResizeBilinearOperator>(
-      "RESIZE_BILINEAR", OperatorType::kResizeBilinear));
   ops.emplace_back(new SimpleOperator<LogisticOperator>(
       "LOGISTIC", OperatorType::kLogistic));
   ops.emplace_back(
       new SimpleOperator<TanhOperator>("TANH", OperatorType::kTanh));
+  ops.emplace_back(new SimpleOperator<ExpOperator>("EXP", OperatorType::kExp));
 
   return ops;
 }
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.h b/tensorflow/contrib/lite/toco/tflite/operator.h
index 37df302d4697c78e0349bcd30e0e1adc540066bc..88af3d6ab6c6af150af83ed5c52931f9f089aa3c 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.h
+++ b/tensorflow/contrib/lite/toco/tflite/operator.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_OPERATOR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_OPERATOR_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_OPERATOR_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_OPERATOR_H_
 
 #include "flatbuffers/flatbuffers.h"
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
@@ -86,4 +86,4 @@ class BaseOperator {
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_OPERATOR_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_OPERATOR_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index 8e77c56d8aaa88d5c801ae246e1ee63e40b6f955..05c325ef910e9423296d3693cb4bc92422a33202 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -101,14 +101,12 @@ TEST_F(OperatorTest, SimpleOperators) {
   CheckSimpleOperator<DequantizeOperator>("DEQUANTIZE",
                                           OperatorType::kDequantize);
   CheckSimpleOperator<FloorOperator>("FLOOR", OperatorType::kFloor);
-  CheckSimpleOperator<GatherOperator>("GATHER", OperatorType::kGather);
   CheckSimpleOperator<ReluOperator>("RELU", OperatorType::kRelu);
-  CheckSimpleOperator<Relu1Operator>("RELU1", OperatorType::kRelu1);
+  CheckSimpleOperator<Relu1Operator>("RELU_N1_TO_1", OperatorType::kRelu1);
   CheckSimpleOperator<Relu6Operator>("RELU6", OperatorType::kRelu6);
-  CheckSimpleOperator<ResizeBilinearOperator>("RESIZE_BILINEAR",
-                                              OperatorType::kResizeBilinear);
   CheckSimpleOperator<LogisticOperator>("LOGISTIC", OperatorType::kLogistic);
   CheckSimpleOperator<TanhOperator>("TANH", OperatorType::kTanh);
+  CheckSimpleOperator<ExpOperator>("EXP", OperatorType::kExp);
 }
 
 TEST_F(OperatorTest, BuiltinAdd) {
@@ -120,6 +118,15 @@ TEST_F(OperatorTest, BuiltinAdd) {
             output_toco_op->fused_activation_function);
 }
 
+TEST_F(OperatorTest, BuiltinMean) {
+  MeanOperator op;
+  op.keep_dims = false;
+
+  auto output_toco_op =
+      SerializeAndDeserialize(GetOperator("MEAN", OperatorType::kMean), op);
+  EXPECT_EQ(op.keep_dims, output_toco_op->keep_dims);
+}
+
 TEST_F(OperatorTest, CustomCast) {
   CastOperator op;
   op.src_data_type = ArrayDataType::kFloat;
@@ -132,10 +139,10 @@ TEST_F(OperatorTest, CustomCast) {
 
 TEST_F(OperatorTest, CustomConcatenation) {
   ConcatenationOperator op;
-  op.concat_dim = 123;
+  op.axis = 123;
   auto output_toco_op = SerializeAndDeserialize(
       GetOperator("CONCATENATION", OperatorType::kConcatenation), op);
-  EXPECT_EQ(op.concat_dim, output_toco_op->concat_dim);
+  EXPECT_EQ(op.axis, output_toco_op->axis);
 }
 
 TEST_F(OperatorTest, CustomDepthToSpace) {
@@ -167,6 +174,13 @@ TEST_F(OperatorTest, CustomFullyConnected) {
             output_toco_op->fused_activation_function);
 }
 
+TEST_F(OperatorTest, BuiltinGather) {
+  GatherOperator op;
+  auto output_toco_op =
+      SerializeAndDeserialize(GetOperator("GATHER", OperatorType::kGather), op);
+  ASSERT_NE(nullptr, output_toco_op.get());
+}
+
 TEST_F(OperatorTest, BuiltinL2Pool) {
   L2PoolOperator op;
   op.stride_width = 123;
@@ -316,6 +330,14 @@ TEST_F(OperatorTest, BuiltinMul) {
             output_toco_op->fused_activation_function);
 }
 
+TEST_F(OperatorTest, ResizeBilinear) {
+  ResizeBilinearOperator op;
+  op.align_corners = true;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("RESIZE_BILINEAR", OperatorType::kResizeBilinear), op);
+  EXPECT_EQ(op.align_corners, output_toco_op->align_corners);
+}
+
 TEST_F(OperatorTest, Svdf) {
   SvdfOperator op;
   op.fused_activation_function = FusedActivationFunctionType::kRelu;
@@ -327,6 +349,37 @@ TEST_F(OperatorTest, Svdf) {
   EXPECT_EQ(op.rank, output_toco_op->rank);
 }
 
+TEST_F(OperatorTest, Squeeze) {
+  SqueezeOperator op;
+  op.squeeze_dims = {-2, -3, 4, 1, 4};
+
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("SQUEEZE", OperatorType::kSqueeze), op);
+  EXPECT_EQ(op.squeeze_dims, output_toco_op->squeeze_dims);
+}
+
+TEST_F(OperatorTest, StridedSlice) {
+  StridedSliceOperator op;
+
+  op.begin_mask = 1;
+  op.end_mask = 2;
+  op.ellipsis_mask = 1;
+  op.new_axis_mask = 1;
+  op.shrink_axis_mask = 2;
+
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("STRIDED_SLICE", OperatorType::kStridedSlice), op);
+  EXPECT_EQ(op.start_indices, output_toco_op->start_indices);
+  EXPECT_EQ(op.stop_indices, output_toco_op->stop_indices);
+  EXPECT_EQ(op.strides, output_toco_op->strides);
+  EXPECT_EQ(op.begin_mask, output_toco_op->begin_mask);
+  EXPECT_EQ(op.end_mask, output_toco_op->end_mask);
+  EXPECT_EQ(op.end_mask, output_toco_op->end_mask);
+  EXPECT_EQ(op.ellipsis_mask, output_toco_op->ellipsis_mask);
+  EXPECT_EQ(op.new_axis_mask, output_toco_op->new_axis_mask);
+  EXPECT_EQ(op.shrink_axis_mask, output_toco_op->shrink_axis_mask);
+}
+
 TEST_F(OperatorTest, TensorFlowUnsupported) {
   TensorFlowUnsupportedOperator op;
   op.tensorflow_op = "MyCustomUnsupportedOp";
diff --git a/tensorflow/contrib/lite/toco/tflite/simple_operator.h b/tensorflow/contrib/lite/toco/tflite/simple_operator.h
index 992b98bacafecb080e792ae87a2940977482eed6..72678c82a22a7168f858747b0b1c6a2b515b6578 100644
--- a/tensorflow/contrib/lite/toco/tflite/simple_operator.h
+++ b/tensorflow/contrib/lite/toco/tflite/simple_operator.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_SIMPLE_OPERATOR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_SIMPLE_OPERATOR_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_SIMPLE_OPERATOR_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_SIMPLE_OPERATOR_H_
 
 #include "tensorflow/contrib/lite/toco/tflite/operator.h"
 
@@ -47,4 +47,4 @@ class SimpleOperator : public BaseOperator {
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_SIMPLE_OPERATOR_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_SIMPLE_OPERATOR_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/types.cc b/tensorflow/contrib/lite/toco/tflite/types.cc
index 5b4dbfae2477d629624a70bf7c6e93606c937605..b4c2851502a40a1ca36965d4ddd2c8a15b8fe60f 100644
--- a/tensorflow/contrib/lite/toco/tflite/types.cc
+++ b/tensorflow/contrib/lite/toco/tflite/types.cc
@@ -51,8 +51,12 @@ void CopyBuffer(const ::tflite::Buffer& buffer, Array* array) {
       return ::tflite::TensorType_FLOAT32;
     case ArrayDataType::kInt32:
       return ::tflite::TensorType_INT32;
+    case ArrayDataType::kInt64:
+      return ::tflite::TensorType_INT64;
     case ArrayDataType::kUint8:
       return ::tflite::TensorType_UINT8;
+    case ArrayDataType::kString:
+      return ::tflite::TensorType_STRING;
     default:
       // FLOAT32 is filled for unknown data types.
       // TODO(ycling): Implement type inference in TF Lite interpreter.
@@ -66,6 +70,10 @@ ArrayDataType DataType::Deserialize(int tensor_type) {
       return ArrayDataType::kFloat;
     case ::tflite::TensorType_INT32:
       return ArrayDataType::kInt32;
+    case ::tflite::TensorType_INT64:
+      return ArrayDataType::kInt64;
+    case ::tflite::TensorType_STRING:
+      return ArrayDataType::kString;
     case ::tflite::TensorType_UINT8:
       return ArrayDataType::kUint8;
     default:
@@ -82,6 +90,8 @@ flatbuffers::Offset<flatbuffers::Vector<uint8_t>> DataBuffer::Serialize(
       return CopyBuffer<ArrayDataType::kFloat>(array, builder);
     case ArrayDataType::kInt32:
       return CopyBuffer<ArrayDataType::kInt32>(array, builder);
+    case ArrayDataType::kString:
+      return CopyBuffer<ArrayDataType::kString>(array, builder);
     case ArrayDataType::kUint8:
       return CopyBuffer<ArrayDataType::kUint8>(array, builder);
     default:
@@ -99,6 +109,10 @@ void DataBuffer::Deserialize(const ::tflite::Tensor& tensor,
       return CopyBuffer<ArrayDataType::kFloat>(buffer, array);
     case ::tflite::TensorType_INT32:
       return CopyBuffer<ArrayDataType::kInt32>(buffer, array);
+    case ::tflite::TensorType_INT64:
+      return CopyBuffer<ArrayDataType::kInt64>(buffer, array);
+    case ::tflite::TensorType_STRING:
+      return CopyBuffer<ArrayDataType::kString>(buffer, array);
     case ::tflite::TensorType_UINT8:
       return CopyBuffer<ArrayDataType::kUint8>(buffer, array);
     default:
@@ -138,7 +152,7 @@ PaddingType Padding::Deserialize(int padding) {
     case FusedActivationFunctionType::kRelu6:
       return ::tflite::ActivationFunctionType_RELU6;
     case FusedActivationFunctionType::kRelu1:
-      return ::tflite::ActivationFunctionType_RELU1;
+      return ::tflite::ActivationFunctionType_RELU_N1_TO_1;
     default:
       LOG(FATAL) << "Unhandled fused activation function type.";
   }
@@ -153,7 +167,7 @@ FusedActivationFunctionType ActivationFunction::Deserialize(
       return FusedActivationFunctionType::kRelu;
     case ::tflite::ActivationFunctionType_RELU6:
       return FusedActivationFunctionType::kRelu6;
-    case ::tflite::ActivationFunctionType_RELU1:
+    case ::tflite::ActivationFunctionType_RELU_N1_TO_1:
       return FusedActivationFunctionType::kRelu1;
     default:
       LOG(FATAL) << "Unhandled fused activation function type.";
diff --git a/tensorflow/contrib/lite/toco/tflite/types.h b/tensorflow/contrib/lite/toco/tflite/types.h
index f7c51405107d954fa259809b72f56af193e344fb..3923756fc94e3175a6505740a96cce8d614c3990 100644
--- a/tensorflow/contrib/lite/toco/tflite/types.h
+++ b/tensorflow/contrib/lite/toco/tflite/types.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_TYPES_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_TYPES_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_TYPES_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_TYPES_H_
 
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
 #include "tensorflow/contrib/lite/toco/model.h"
@@ -55,4 +55,4 @@ struct ActivationFunction {
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_TYPES_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_TYPES_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/types_test.cc b/tensorflow/contrib/lite/toco/tflite/types_test.cc
index 174b78f3e632fde8dc6ea0ed83ed7a67fa12c16a..a040fe135841b92a6e668f32cc5e36cf812ab15b 100644
--- a/tensorflow/contrib/lite/toco/tflite/types_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/types_test.cc
@@ -28,8 +28,8 @@ using flatbuffers::Vector;
 
 // These are types that exist in TF Mini but don't have a correspondence
 // in TF Lite.
-static const ArrayDataType kUnsupportedTocoTypes[] = {
-    ArrayDataType::kNone, ArrayDataType::kBool, ArrayDataType::kInt64};
+static const ArrayDataType kUnsupportedTocoTypes[] = {ArrayDataType::kNone,
+                                                      ArrayDataType::kBool};
 
 // These are TF Lite types for which there is no correspondence in TF Mini.
 static const ::tflite::TensorType kUnsupportedTfLiteTypes[] = {
@@ -70,6 +70,7 @@ TEST(DataType, SupportedTypes) {
   std::vector<std::pair<ArrayDataType, ::tflite::TensorType>> testdata = {
       {ArrayDataType::kUint8, ::tflite::TensorType_UINT8},
       {ArrayDataType::kInt32, ::tflite::TensorType_INT32},
+      {ArrayDataType::kInt64, ::tflite::TensorType_INT64},
       {ArrayDataType::kFloat, ::tflite::TensorType_FLOAT32}};
   for (auto x : testdata) {
     EXPECT_EQ(x.second, DataType::Serialize(x.first));
@@ -172,7 +173,7 @@ TEST(ActivationFunction, All) {
                   {FusedActivationFunctionType::kRelu6,
                    ::tflite::ActivationFunctionType_RELU6},
                   {FusedActivationFunctionType::kRelu1,
-                   ::tflite::ActivationFunctionType_RELU1}};
+                   ::tflite::ActivationFunctionType_RELU_N1_TO_1}};
   for (auto x : testdata) {
     EXPECT_EQ(x.second, ActivationFunction::Serialize(x.first));
     EXPECT_EQ(x.first, ActivationFunction::Deserialize(x.second));
diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
index 83947d6b28010e6b75ff377648f51a0364a4d580..c5a62fdb620ee7d6b7195f6e8e2bc3cb208feb10 100644
--- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
@@ -44,9 +44,11 @@ bool ParseTocoFlagsFromCommandLineFlags(
            "For Protobuf formats, the binary format will be used."),
       Flag("input_format", parsed_flags.input_format.bind(),
            parsed_flags.input_format.default_value(),
-           "Input file format. One of: tensorflow_graphdef, "),
+           "Input file format. One of: TENSORFLOW_GRAPHDEF, TFLITE."),
       Flag("output_format", parsed_flags.output_format.bind(),
-           parsed_flags.output_format.default_value(), "Output file format."),
+           parsed_flags.output_format.default_value(),
+           "Output file format. "
+           "One of TENSORFLOW_GRAPHDEF, TFLITE, GRAPHVIZ_DOT."),
       Flag("default_ranges_min", parsed_flags.default_ranges_min.bind(),
            parsed_flags.default_ranges_min.default_value(),
            "If defined, will be used as the default value for the min bound "
@@ -58,51 +60,51 @@ bool ParseTocoFlagsFromCommandLineFlags(
       Flag("inference_type", parsed_flags.inference_type.bind(),
            parsed_flags.inference_type.default_value(),
            "Target data type of arrays in the output file (for input_arrays, "
-           "this may be overridden by inference_input_type)."),
+           "this may be overridden by inference_input_type). "
+           "One of FLOAT, QUANTIZED_UINT8."),
       Flag("inference_input_type", parsed_flags.inference_input_type.bind(),
            parsed_flags.inference_input_type.default_value(),
-           "Target data type of input arrays. If not specified, inference_type "
-           "is used."),
+           "Target data type of input arrays. "
+           "If not specified, inference_type is used. "
+           "One of FLOAT, QUANTIZED_UINT8."),
       Flag("input_type", parsed_flags.input_type.bind(),
            parsed_flags.input_type.default_value(),
-           "Deprecated old name of inference_input_type."),
+           "Deprecated ambiguous flag that set both --input_data_types and "
+           "--inference_input_type."),
       Flag("input_types", parsed_flags.input_types.bind(),
            parsed_flags.input_types.default_value(),
-           "Deprecated old name of inference_input_type. Was meant to be a "
+           "Deprecated ambiguous flag that set both --input_data_types and "
+           "--inference_input_type. Was meant to be a "
            "comma-separated list, but this was deprecated before "
            "multiple-input-types was ever properly supported."),
 
       Flag("drop_fake_quant", parsed_flags.drop_fake_quant.bind(),
            parsed_flags.drop_fake_quant.default_value(),
-           "Ignore and discard FakeQuant nodes. For instance, that can be used "
-           "to "
+           "Ignore and discard FakeQuant nodes. For instance, to "
            "generate plain float code without fake-quantization from a "
-           "quantized "
-           "graph."),
+           "quantized graph."),
       Flag(
           "reorder_across_fake_quant",
           parsed_flags.reorder_across_fake_quant.bind(),
           parsed_flags.reorder_across_fake_quant.default_value(),
           "Normally, FakeQuant nodes must be strict boundaries for graph "
           "transformations, in order to ensure that quantized inference has "
-          "the "
-          "exact same arithmetic behavior as quantized training --- which is "
-          "the "
-          "whole point of quantized training and of FakeQuant nodes in the "
-          "first "
-          "place. However, that entails subtle requirements on where exactly "
+          "the exact same arithmetic behavior as quantized training --- which "
+          "is the whole point of quantized training and of FakeQuant nodes in "
+          "the first place. "
+          "However, that entails subtle requirements on where exactly "
           "FakeQuant nodes must be placed in the graph. Some quantized graphs "
           "have FakeQuant nodes at unexpected locations, that prevent graph "
           "transformations that are necessary in order to generate inference "
           "code for these graphs. Such graphs should be fixed, but as a "
           "temporary work-around, setting this reorder_across_fake_quant flag "
-          "allows toco to perform necessary graph transformaitons on them, "
+          "allows TOCO to perform necessary graph transformaitons on them, "
           "at the cost of no longer faithfully matching inference and training "
           "arithmetic."),
       Flag("allow_custom_ops", parsed_flags.allow_custom_ops.bind(),
            parsed_flags.allow_custom_ops.default_value(),
-           "If true, allow TOCO to create TF Lite Custom operators for all the"
-           "unsupported Tensorflow ops."),
+           "If true, allow TOCO to create TF Lite Custom operators for all the "
+           "unsupported TensorFlow ops."),
       Flag(
           "drop_control_dependency",
           parsed_flags.drop_control_dependency.bind(),
@@ -140,7 +142,6 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags,
           << #name;                                                          \
     }                                                                        \
   } while (false)
-
 #define READ_TOCO_FLAG(name, requirement)                     \
   ENFORCE_FLAG_REQUIREMENT(name, requirement);                \
   do {                                                        \
@@ -174,14 +175,26 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags,
 
   // Deprecated flag handling.
   if (parsed_toco_flags.input_type.specified()) {
-    LOG(WARNING) << "--input_type is deprecated. Use --inference_input_type.";
+    LOG(WARNING)
+        << "--input_type is deprecated. It was an ambiguous flag that set both "
+           "--input_data_types and --inference_input_type. If you are trying "
+           "to complement the input file with information about the type of "
+           "input arrays, use --input_data_type. If you are trying to control "
+           "the quantization/dequantization of real-numbers input arrays in "
+           "the output file, use --inference_input_type.";
     toco::IODataType input_type;
     QCHECK(toco::IODataType_Parse(parsed_toco_flags.input_type.value(),
                                   &input_type));
     toco_flags->set_inference_input_type(input_type);
   }
   if (parsed_toco_flags.input_types.specified()) {
-    LOG(WARNING) << "--input_types is deprecated. Use --inference_input_type.";
+    LOG(WARNING)
+        << "--input_types is deprecated. It was an ambiguous flag that set "
+           "both --input_data_types and --inference_input_type. If you are "
+           "trying to complement the input file with information about the "
+           "type of input arrays, use --input_data_type. If you are trying to "
+           "control the quantization/dequantization of real-numbers input "
+           "arrays in the output file, use --inference_input_type.";
     std::vector<string> input_types =
         absl::StrSplit(parsed_toco_flags.input_types.value(), ',');
     QCHECK(!input_types.empty());
diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.h b/tensorflow/contrib/lite/toco/toco_cmdline_flags.h
index ba35ca8d5d23f07d843ae6fa2099cc7e15b1e9a3..46eb3f57283cc52bf2877f578500f3a4a633be86 100644
--- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.h
+++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_CMDLINE_FLAGS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_CMDLINE_FLAGS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_CMDLINE_FLAGS_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_CMDLINE_FLAGS_H_
 
 #include <string>
 #include <vector>
@@ -33,4 +33,4 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags,
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_CMDLINE_FLAGS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_CMDLINE_FLAGS_H_
diff --git a/tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h b/tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h
index ae0541f62b61581e3ba183725a85fe51c54116dc..d6c3ba6543378b3e15b5fb7816f52376fe05123d 100644
--- a/tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h
+++ b/tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_GRAPHVIZ_DUMP_OPTIONS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_GRAPHVIZ_DUMP_OPTIONS_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_GRAPHVIZ_DUMP_OPTIONS_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_GRAPHVIZ_DUMP_OPTIONS_H_
 
 #include <string>
 
@@ -31,4 +31,4 @@ struct GraphVizDumpOptions {
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_GRAPHVIZ_DUMP_OPTIONS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_GRAPHVIZ_DUMP_OPTIONS_H_
diff --git a/tensorflow/contrib/lite/toco/toco_port.h b/tensorflow/contrib/lite/toco/toco_port.h
index b5cb7a11e7c46d02d398ff937d46e52368e88098..4be3b5a0bf00ed204a1218545d9e66f7685a50d7 100644
--- a/tensorflow/contrib/lite/toco/toco_port.h
+++ b/tensorflow/contrib/lite/toco/toco_port.h
@@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_PORT_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_PORT_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_PORT_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_PORT_H_
 
 // Portability layer for toco tool. Mainly, abstract filesystem access so we
 // can build and use on google internal environments and on OSX.
 
 #include <string>
+#include "google/protobuf/text_format.h"
 #include "tensorflow/contrib/lite/toco/format_port.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/platform.h"
@@ -75,6 +76,26 @@ void CopyToBuffer(const ::Cord& src, char* dest);
 #endif  // PLATFORM_GOOGLE
 void CopyToBuffer(const string& src, char* dest);
 }  // namespace port
+
+inline bool ParseFromStringOverload(const std::string& in,
+                                    TFLITE_PROTO_NS::Message* proto) {
+  return TFLITE_PROTO_NS::TextFormat::ParseFromString(in, proto);
+}
+
+template <typename Proto>
+bool ParseFromStringEitherTextOrBinary(const std::string& input_file_contents,
+                                       Proto* proto) {
+  if (proto->ParseFromString(input_file_contents)) {
+    return true;
+  }
+
+  if (ParseFromStringOverload(input_file_contents, proto)) {
+    return true;
+  }
+
+  return false;
+}
+
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_PORT_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_PORT_H_
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index ca092b2d72d5c708a4db57cbb8810ec978446fab..5472c52c96ab93a6d3acf0522651d0f8876e08ce 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -51,24 +51,38 @@ void CheckUnsupportedOperations(const Model& model) {
 void MakeGeneralGraphTransformationsSet(
     GraphTransformationsSet* transformations) {
   CHECK(transformations->empty());
+  transformations->Add(new ConvertExpandDimsToReshape);
+  transformations->Add(new ConvertTrivialAddNToAdd);
+  transformations->Add(new ConvertTrivialStackToReshape);
+  transformations->Add(new ConvertTrivialTransposeToReshape);
+  transformations->Add(new ConvertReorderAxes);
   transformations->Add(new ResolveReshapeAttributes);
+  transformations->Add(new ResolveTransposeAttributes);
   transformations->Add(new PropagateArrayDataTypes);
   transformations->Add(new PropagateFixedSizes);
   transformations->Add(new RemoveTensorFlowAssert);
   transformations->Add(new RemoveTensorFlowIdentity);
   transformations->Add(new RemoveTrivialConcatenation);
   transformations->Add(new RemoveTrivialConcatenationInput);
+  transformations->Add(new RemoveTrivialSlice);
   transformations->Add(new RemoveUnusedOp);
   transformations->Add(new EnsureBiasVectors);
   transformations->Add(new ResolveReorderAxes);
+  transformations->Add(new UnrollBatchMatMul);
   transformations->Add(new ResolveTensorFlowMatMul);
   transformations->Add(new FuseBinaryIntoPrecedingAffine);
   transformations->Add(new FuseBinaryIntoFollowingAffine);
+  transformations->Add(new ReorderActivationFunctions);
   transformations->Add(new ResolveBatchNormalization);
   transformations->Add(new ResolveConstantBinaryOperator);
+  transformations->Add(new ResolveConstantFill);
+  transformations->Add(new ResolveConstantRange);
+  transformations->Add(new ResolveConstantStack);
+  transformations->Add(new ResolveConstantStridedSlice);
+  transformations->Add(new ResolveConstantTranspose);
   transformations->Add(new ResolveConstantUnaryOperator);
   transformations->Add(new ResolveTensorFlowMerge);
-  transformations->Add(new ResolveTensorFlowSqueeze);
+  transformations->Add(new ResolveSqueezeAttributes);
   transformations->Add(new ResolveTensorFlowSwitch);
   transformations->Add(new ResolveTensorFlowTile);
   transformations->Add(new ResolveTensorFlowConcat);
@@ -77,17 +91,19 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new IdentifyRelu1);
   transformations->Add(new RemoveTrivialBinaryOperator);
   transformations->Add(new ReadFakeQuantMinMax);
+  transformations->Add(new ResolveSpaceToBatchNDAttributes);
+  transformations->Add(new ResolveBatchToSpaceNDAttributes);
   transformations->Add(new ResolvePadAttributes);
   transformations->Add(new ResolveStridedSliceAttributes);
   transformations->Add(new ResolveSliceAttributes);
   transformations->Add(new ResolveMeanAttributes);
-  transformations->Add(new ResolveConstantTensorFlowShape);
+  transformations->Add(new ResolveConstantShapeOrRank);
   transformations->Add(new MakeInitialDequantizeOperator);
+  transformations->Add(new ResolveConstantFakeQuant);
 }
 
 bool SupportsQuantization(FileFormat format) {
   return (format == GRAPHVIZ_DOT || format == TFLITE);
-  ;
 }
 
 bool SupportsFusedActivationFunction(FileFormat format) {
@@ -95,11 +111,12 @@ bool SupportsFusedActivationFunction(FileFormat format) {
 }
 
 bool SupportsLstmCell(FileFormat format) {
-  return (format == TENSORFLOW_GRAPHDEF || format == GRAPHVIZ_DOT);
+  return (format == TENSORFLOW_GRAPHDEF || format == GRAPHVIZ_DOT ||
+          format == TFLITE);
 }
 
 bool SupportsPreallocatedWorkspace(FileFormat format) {
-  return (format == GRAPHVIZ_DOT || format == TFLITE);
+  return (format == TFLITE);
 }
 
 bool IsRealValued(toco::ArrayDataType type) {
@@ -124,7 +141,7 @@ void SetFinalDataTypeOnInputs(const TocoFlags& toco_flags, Model* model) {
 
   for (int i = 0; i < model->flags.input_arrays_size(); i++) {
     string const& array_name = model->flags.input_arrays(i).name();
-    auto* array = model->arrays[array_name].get();
+    auto* array = &model->GetArray(array_name);
     // Note that the notion of changing data types only applies to real-numbers
     // arrays (see the documentation for inference_input_type).
     // TODO(benoitjacob) this is assuming that uint8 arrays are quantized,
@@ -183,6 +200,14 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
   }
 
   SetFinalDataTypeOnInputs(toco_flags, model);
+  UseArraysExtraInfo(model);
+
+  // Remove unused ops before performing any other optimizations. This is to
+  // stop optimizations from crossing the input/output boundaries. For example
+  // this will stop BatchNorm fusing if the output node is in between a conv
+  // and BatchNorm layers.
+  RunGraphTransformations(model, "Removing unused ops",
+                          {new toco::RemoveUnusedOp});
 
   GraphTransformationsSet transformations;
   MakeGeneralGraphTransformationsSet(&transformations);
@@ -193,32 +218,30 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
   } else {
     transformations.Add(new UnfuseActivationFunctions);
   }
-  if (output_format != TENSORFLOW_GRAPHDEF) {
-    transformations.Add(new ResolveConstantFakeQuant);
-  }
   if (toco_flags.drop_fake_quant()) {
     transformations.Add(new DropFakeQuant);
   } else {
     // See the doc for --reorder_across_fake_quant: that flag is needed to
     // support some existing models, e.g. WordLens, that have FakeQuant
     // nodes in the wrong places.
-    // We currently unconditionally enable that behavior when the output
-    // format is DarwiNN because the DarwiNN test code does not make it
-    // easy to pass a new toco flag. Once that is resolved on the DarwiNN
-    // tests side, the special-casing of DarwiNN here can go away.
-    // TODO(benoitjacob): so drop it when we can.
+    // TODO(benoitjacob): drop special casing when we can.
     if ((quantize_output && toco_flags.reorder_across_fake_quant())) {
       transformations.Add(new DropFakeQuant);
     }
   }
   transformations.Add(new ConvertPureConvToDepthwise);
-  // TFLite export does not yet support fused LSTM cell.
   if (SupportsLstmCell(output_format)) {
     transformations.Add(new IdentifyLstmCell);
+    if (output_format == TFLITE) {
+      transformations.Add(new toco::SplitLstmCellInputs);
+    } else {
+      transformations.Add(new toco::MergeLstmCellInputs);
+    }
   }
   transformations.Add(new ResolveConstantConcatenation);
   RunGraphTransformations(model, "general graph transformations",
                           transformations);
+
   if (quantize_output) {
     RunGraphTransformations(model, "pre-quantization graph transformations",
                             {new HardcodeMinMax, new DropFakeQuant});
@@ -229,6 +252,10 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
         toco_flags.has_default_ranges_max()) {
       UseDefaultMinMaxRangeValues(model, toco_flags.default_ranges_min(),
                                   toco_flags.default_ranges_max());
+      // The new MinMax info may need to be propagated a bit.
+      RunGraphTransformations(
+          model, "default min-max range propagation graph transformations",
+          {new HardcodeMinMax});
     }
     CheckIsReadyForQuantization(*model);
     RunGraphTransformations(
@@ -247,6 +274,10 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
                             dequantization_transformations);
   }
 
+  if (output_format == TENSORFLOW_GRAPHDEF) {
+    EncodeConstantArraysMinMaxByWrappingThemInFakeQuantNodes(model);
+  }
+
   LogDump(kLogLevelModelChanged, "AFTER TRANSFORMATIONS", *model);
 
   if (output_format != GRAPHVIZ_DOT && output_format != TFLITE) {
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.h b/tensorflow/contrib/lite/toco/toco_tooling.h
index 9c5a93a21170ba773b1160eb2e1261f85cdd70e5..e731c149eef412d3048a1d5f84145ce6ff87208d 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.h
+++ b/tensorflow/contrib/lite/toco/toco_tooling.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TOOLING_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TOOLING_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TOOLING_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TOOLING_H_
 
 #include <memory>
 #include <string>
@@ -47,4 +47,4 @@ inline void Export(const TocoFlags& toco_flags, const Model& model,
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TOOLING_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TOOLING_H_
diff --git a/tensorflow/contrib/lite/toco/toco_types.h b/tensorflow/contrib/lite/toco/toco_types.h
index ad42497ada6cb0dbda673bf3aad406c9fedfb078..d72a3bd1f382679f81061a51f35586631b571400 100644
--- a/tensorflow/contrib/lite/toco/toco_types.h
+++ b/tensorflow/contrib/lite/toco/toco_types.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TYPES_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TYPES_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TYPES_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TYPES_H_
 
 #include <string>
 #include "tensorflow/core/platform/platform.h"
@@ -42,4 +42,4 @@ using tensorflow::uint8;
 
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TYPES_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TYPES_H_
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 3f289817e061afb87e621ff23bb312ff8fe73ae7..627541595b327de7a670708e886072b1ae600ff2 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <unordered_set>
 #include <utility>
 
+#include "absl/strings/ascii.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "absl/strings/str_replace.h"
@@ -30,9 +31,26 @@ limitations under the License.
 #include "tensorflow/contrib/lite/toco/toco_port.h"
 #include "tensorflow/core/platform/logging.h"
 
-
 namespace toco {
 
+// Find the longest common prefix of two strings.
+absl::string_view FindLongestCommonPrefix(absl::string_view a,
+                                          absl::string_view b) {
+  if (a.empty() || b.empty()) return absl::string_view();
+
+  const char* pa = a.data();
+  const char* pb = b.data();
+  size_t count = 0;
+  const size_t limit = std::min(a.size(), b.size());
+  while (count < limit && *pa == *pb) {
+    ++pa;
+    ++pb;
+    ++count;
+  }
+
+  return absl::string_view(a.data(), count);
+}
+
 string LogName(const Operator& op) {
   const string& opname = HelpfulOperatorTypeName(op);
   if (op.outputs.empty()) {
@@ -92,8 +110,18 @@ int CountOpsWithInput(const Model& model, const string& array_name) {
 }
 
 bool DeleteArrayIfUnused(const string& array_name, Model* model) {
-  if (CountOpsWithInput(*model, array_name) == 0) {
-    model->arrays.erase(array_name);
+  if (IsDiscardableArray(*model, array_name) &&
+      CountOpsWithInput(*model, array_name) == 0) {
+    model->EraseArray(array_name);
+    return true;
+  }
+  return false;
+}
+
+bool DeleteArrayIfUsedOnce(const string& array_name, Model* model) {
+  if (IsDiscardableArray(*model, array_name) &&
+      CountOpsWithInput(*model, array_name) == 1) {
+    model->EraseArray(array_name);
     return true;
   }
   return false;
@@ -141,6 +169,18 @@ std::vector<std::unique_ptr<Operator>>::const_iterator FindOpWithInput(
   return model.operators.end();
 }
 
+std::vector<std::unique_ptr<Operator>>::iterator FindOpWithInput(
+    Model& model, const string& array_name) {
+  for (auto it = model.operators.begin(); it != model.operators.end(); ++it) {
+    for (auto& input : it->get()->inputs) {
+      if (input == array_name) {
+        return it;
+      }
+    }
+  }
+  return model.operators.end();
+}
+
 std::vector<std::unique_ptr<Operator>>::const_iterator FindOp(
     const Model& model, const Operator* op) {
   for (auto it = model.operators.begin(); it != model.operators.end(); ++it) {
@@ -197,7 +237,9 @@ const char* OperatorTypeName(OperatorType type) {
   case OperatorType::k##c:              \
     return #c;
     HANDLE_OPERATORTYPENAME_CASE(Add)
+    HANDLE_OPERATORTYPENAME_CASE(AddN)
     HANDLE_OPERATORTYPENAME_CASE(AveragePool)
+    HANDLE_OPERATORTYPENAME_CASE(BatchMatMul)
     HANDLE_OPERATORTYPENAME_CASE(BatchNormalization)
     HANDLE_OPERATORTYPENAME_CASE(Conv)
     HANDLE_OPERATORTYPENAME_CASE(Concatenation)
@@ -219,10 +261,15 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(Relu6)
     HANDLE_OPERATORTYPENAME_CASE(ReorderAxes)
     HANDLE_OPERATORTYPENAME_CASE(Softmax)
+    HANDLE_OPERATORTYPENAME_CASE(LogSoftmax)
     HANDLE_OPERATORTYPENAME_CASE(Div)
     HANDLE_OPERATORTYPENAME_CASE(Tanh)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowAll)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowAssert)
+    HANDLE_OPERATORTYPENAME_CASE(ExpandDims)
+    HANDLE_OPERATORTYPENAME_CASE(Fill)
+    HANDLE_OPERATORTYPENAME_CASE(FloorMod)
+    HANDLE_OPERATORTYPENAME_CASE(FloorDiv)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowGreater)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowGreaterEqual)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowIdentity)
@@ -234,8 +281,12 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowMerge)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowMin)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowMinimum)
+    HANDLE_OPERATORTYPENAME_CASE(Neg)
     HANDLE_OPERATORTYPENAME_CASE(Pad)
     HANDLE_OPERATORTYPENAME_CASE(StridedSlice)
+    HANDLE_OPERATORTYPENAME_CASE(Stack)
+    HANDLE_OPERATORTYPENAME_CASE(Range)
+    HANDLE_OPERATORTYPENAME_CASE(Rank)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowReshape)
     HANDLE_OPERATORTYPENAME_CASE(Squeeze)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowRsqrt)
@@ -248,6 +299,8 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(Sub)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowSum)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowTile)
+    HANDLE_OPERATORTYPENAME_CASE(Transpose)
+    HANDLE_OPERATORTYPENAME_CASE(TransposeConv)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowConcat)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowConcatV2)
     HANDLE_OPERATORTYPENAME_CASE(Cast)
@@ -258,7 +311,9 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(BatchToSpaceND)
     HANDLE_OPERATORTYPENAME_CASE(Mean)
     HANDLE_OPERATORTYPENAME_CASE(Svdf)
+    HANDLE_OPERATORTYPENAME_CASE(ArgMax)
     HANDLE_OPERATORTYPENAME_CASE(TensorFlowUnsupported)
+    HANDLE_OPERATORTYPENAME_CASE(Exp)
     default:
       LOG(FATAL) << "Unhandled op type";
 #undef HANDLE_OPERATORTYPENAME_CASE
@@ -274,9 +329,23 @@ string HelpfulOperatorTypeName(const Operator& op) {
   return OperatorTypeName(op.type);
 }
 
+bool OperatorSupportsFusedActivation(OperatorType type) {
+  switch (type) {
+    case OperatorType::kConcatenation:
+    case OperatorType::kGather:
+    case OperatorType::kSlice:
+    case OperatorType::kSqueeze:
+    case OperatorType::kTensorFlowReshape:
+    case OperatorType::kTensorFlowSplit:
+      return false;
+    default:
+      return true;
+  }
+}
+
 void LogSummary(int log_level, const Model& model) {
   VLOG(log_level) << "Operators summary (" << model.operators.size()
-                  << " operators): ";
+                  << " operators):";
   std::unordered_multiset<OperatorType> ops_by_type;
   for (const auto& op : model.operators) {
     ops_by_type.insert(op->type);
@@ -305,6 +374,9 @@ void LogArray(int log_level, const Model& model, const string& name) {
     case ArrayDataType::kUint8:
       VLOG(log_level) << "  Data type: kUint8";
       break;
+    case ArrayDataType::kString:
+      VLOG(log_level) << "  Data type: kString";
+      break;
     default:
       VLOG(log_level) << "  Data type: other (numerical value: "
                       << static_cast<int>(array.data_type) << ")";
@@ -323,6 +395,9 @@ void LogArray(int log_level, const Model& model, const string& name) {
     case ArrayDataType::kUint8:
       VLOG(log_level) << "  Final type: kUint8";
       break;
+    case ArrayDataType::kString:
+      VLOG(log_level) << "  Final type: kString";
+      break;
     default:
       VLOG(log_level) << "  Final type: other (numerical value: "
                       << static_cast<int>(array.data_type) << ")";
@@ -357,7 +432,7 @@ void LogArray(int log_level, const Model& model, const string& name) {
   }
   if (array.quantization_params) {
     VLOG(log_level) << "  QuantizationParams: zero_point="
-                    << array.quantization_params->zero_point
+                    << static_cast<int>(array.quantization_params->zero_point)
                     << ", scale=" << array.quantization_params->scale;
   }
 }
@@ -387,6 +462,7 @@ void DumpGraphvizVideoFrame(const Model& model) {
   DumpGraphviz(model, &graphviz_dump);
   std::size_t hash = std::hash<string>{}(graphviz_dump);
   if (!dump_hashes.count(hash)) {
+    LOG(INFO) << "DUMPING GRAPHVIZ VIDEO FRAME: " << dump_id;
     dump_hashes.insert(hash);
     CHECK(port::file::SetContents(
               port::file::JoinPath(
@@ -430,7 +506,7 @@ void LogDump(int log_level, const string& message, const Model& model) {
         LogArray(log_level, model, input);
       }
     }
-    VLOG(log_level) << HelpfulOperatorTypeName(*op) << " : ";
+    VLOG(log_level) << HelpfulOperatorTypeName(*op) << " :";
     VLOG(log_level) << "  " << FormatArraysList(model, op->inputs) << " -> "
                     << FormatArraysList(model, op->outputs);
     if (op->fused_activation_function != FusedActivationFunctionType::kNone) {
@@ -548,60 +624,132 @@ int RequiredBufferSizeForShape(const Shape& shape) {
 }
 
 bool IsConstantParameterArray(const Model& model, const string& name) {
-  if (!model.arrays.count(name)) {
+  if (!model.HasArray(name)) {
     return false;
   }
 
-  return !!model.arrays.at(name)->buffer;
+  return !!model.GetArray(name).buffer;
 }
 
-void CheckNoMissingArray(const Model& model) {
-  for (const auto& op : model.operators) {
-    for (const auto& input : op->inputs) {
-      CHECK(model.arrays.count(input));
+namespace {
+void CheckInputArraysAreNotOutputArrays(const ModelFlags& model_flags) {
+  for (const auto& input_array : model_flags.input_arrays()) {
+    for (const string& output_array : model_flags.output_arrays()) {
+      QCHECK_NE(input_array.name(), output_array)
+          << "The array " << output_array
+          << " is listed in both --input_arrays and --output_arrays.";
     }
-    for (const auto& output : op->outputs) {
-      CHECK(model.arrays.count(output));
+  }
+}
+
+bool IsAsciiPrintable(const string& name) {
+  for (char c : name) {
+    if (!absl::ascii_isprint(c)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+string DumpAscii(const string& name) {
+  string result;
+  port::AppendF(&result, "ASCII | Hex\n");
+  port::AppendF(&result, "------+----\n");
+  for (char c : name) {
+    if (absl::ascii_isprint(c)) {
+      port::AppendF(&result, "%c     | %x\n", c, c);
+    } else {
+      port::AppendF(&result, "      | %x   Not ASCII printable!\n", c);
     }
   }
+  return result;
+}
+
+void CheckNonAsciiIOArrays(const ModelFlags& model_flags) {
+  if (model_flags.allow_nonascii_arrays()) {
+    return;
+  }
+  for (const auto& input_array : model_flags.input_arrays()) {
+    QCHECK(IsAsciiPrintable(input_array.name()))
+        << "Non-ASCII-printable character found in --input_arrays: "
+        << input_array.name()
+        << ". Pass --allow_nonascii_arrays to allow that. "
+        << "Here is a dump of the string:\n\n"
+        << DumpAscii(input_array.name());
+  }
+  for (const string& output_array : model_flags.output_arrays()) {
+    QCHECK(IsAsciiPrintable(output_array))
+        << "Non-ASCII-printable character found in --output_arrays: "
+        << output_array << ". Pass --allow_nonascii_arrays to allow that. "
+        << "Here is a dump of the string:\n\n"
+        << DumpAscii(output_array);
+  }
+}
+
+void CheckNonExistentIOArrays(const Model& model) {
+  if (model.flags.allow_nonexistent_arrays()) {
+    return;
+  }
   for (const auto& input_array : model.flags.input_arrays()) {
-    CHECK(model.arrays.count(input_array.name()))
+    CHECK(model.HasArray(input_array.name()))
         << "Input array not found: " << input_array.name();
   }
   for (const string& output_array : model.flags.output_arrays()) {
-    CHECK(model.arrays.count(output_array))
+    CHECK(model.HasArray(output_array))
         << "Output array not found: " << output_array;
   }
   for (const auto& rnn_state : model.flags.rnn_states()) {
-    CHECK(model.arrays.count(rnn_state.state_array()));
-    CHECK(model.arrays.count(rnn_state.back_edge_source_array()));
+    if (!rnn_state.discardable()) {
+      CHECK(model.HasArray(rnn_state.state_array()));
+      CHECK(model.HasArray(rnn_state.back_edge_source_array()));
+    }
+  }
+}
+}  // namespace
+
+void CheckNoMissingArray(const Model& model) {
+  for (const auto& op : model.operators) {
+    for (const auto& input : op->inputs) {
+      CHECK(model.HasArray(input) || model.optional_arrays.count(input))
+          << "Input: " << input << " missing for op: " << op->outputs[0] << ".";
+    }
+    for (const auto& output : op->outputs) {
+      CHECK(model.HasArray(output)) << "Output: " << output << " missing.";
+    }
   }
+  CheckNonExistentIOArrays(model);
 }
 
 void FixNoMissingArray(Model* model) {
   for (const auto& op : model->operators) {
     for (const auto& input : op->inputs) {
-      if (!model->arrays.count(input)) {
+      if (!model->HasArray(input)) {
         model->GetOrCreateArray(input);
       }
     }
     for (const auto& output : op->outputs) {
-      if (!model->arrays.count(output)) {
+      if (!model->HasArray(output)) {
         model->GetOrCreateArray(output);
       }
     }
   }
-  for (const string& output_array : model->flags.output_arrays()) {
-    if (!model->arrays.count(output_array)) {
+  if (model->flags.allow_nonexistent_arrays()) {
+    for (const string& output_array : model->flags.output_arrays()) {
       model->GetOrCreateArray(output_array);
     }
+    for (const auto& rnn_state : model->flags.rnn_states()) {
+      model->GetOrCreateArray(rnn_state.state_array());
+      model->GetOrCreateArray(rnn_state.back_edge_source_array());
+    }
   }
 }
 
 void CheckNoOrphanedArray(const Model& model) {
   std::unordered_set<string> arrays_without_known_use;
-  for (const auto& array : model.arrays) {
-    arrays_without_known_use.insert(array.first);
+  for (const auto& array : model.GetArrayMap()) {
+    if (IsDiscardableArray(model, array.first)) {
+      arrays_without_known_use.insert(array.first);
+    }
   }
   for (const auto& op : model.operators) {
     for (const auto& input : op->inputs) {
@@ -611,6 +759,10 @@ void CheckNoOrphanedArray(const Model& model) {
       arrays_without_known_use.erase(output);
     }
   }
+  for (const auto& rnn_state : model.flags.rnn_states()) {
+    arrays_without_known_use.erase(rnn_state.state_array());
+    arrays_without_known_use.erase(rnn_state.back_edge_source_array());
+  }
   if (!arrays_without_known_use.empty()) {
     for (const auto& array : arrays_without_known_use) {
       LOG(INFO) << "Error: Orphaned array: " << array;
@@ -621,7 +773,7 @@ void CheckNoOrphanedArray(const Model& model) {
 
 void FixNoOrphanedArray(Model* model) {
   std::unordered_set<string> arrays_without_known_use;
-  for (const auto& array : model->arrays) {
+  for (const auto& array : model->GetArrayMap()) {
     arrays_without_known_use.insert(array.first);
   }
   for (const auto& op : model->operators) {
@@ -632,13 +784,19 @@ void FixNoOrphanedArray(Model* model) {
       arrays_without_known_use.erase(output);
     }
   }
+  for (const auto& rnn_state : model->flags.rnn_states()) {
+    arrays_without_known_use.erase(rnn_state.state_array());
+    arrays_without_known_use.erase(rnn_state.back_edge_source_array());
+  }
   for (const auto& array : arrays_without_known_use) {
-    model->arrays.erase(array);
+    if (IsDiscardableArray(*model, array)) {
+      model->EraseArray(array);
+    }
   }
 }
 
 void CheckArrayFieldsConsistent(const Model& model) {
-  for (const auto& array_entry : model.arrays) {
+  for (const auto& array_entry : model.GetArrayMap()) {
     const auto& array = array_entry.second;
     if (array->has_shape()) {
       for (int d : array->shape().dims()) {
@@ -657,11 +815,13 @@ void CheckArrayFieldsConsistent(const Model& model) {
 
 void CheckOperatorOrdering(const Model& model) {
   std::unordered_set<string> arrays_behind_us;
-  for (const auto& array_entry : model.arrays) {
+  for (const auto& array_entry : model.GetArrayMap()) {
     if (!GetOpWithOutput(model, array_entry.first)) {
       arrays_behind_us.insert(array_entry.first);
     }
   }
+  arrays_behind_us.insert(model.optional_arrays.begin(),
+                          model.optional_arrays.end());
   for (const auto& op : model.operators) {
     for (const auto& input : op->inputs) {
       if (!IsConstantParameterArray(model, input)) {
@@ -680,11 +840,13 @@ void CheckOperatorOrdering(const Model& model) {
 
 void FixOperatorOrdering(Model* model) {
   std::unordered_set<string> arrays_behind_us;
-  for (const auto& array_entry : model->arrays) {
+  for (const auto& array_entry : model->GetArrayMap()) {
     if (!GetOpWithOutput(*model, array_entry.first)) {
       arrays_behind_us.insert(array_entry.first);
     }
   }
+  arrays_behind_us.insert(model->optional_arrays.begin(),
+                          model->optional_arrays.end());
   std::vector<std::unique_ptr<Operator>> old_operators;
   std::swap(old_operators, model->operators);
   std::set<std::size_t> remaining;
@@ -791,52 +953,13 @@ void FixOperatorOrdering(Model* model) {
       << "the above code should have generated a FATAL error already!";
 }
 
-// Checks that the --input_arrays of the Model are actually used by at least
-// one of the --output_arrays or --rnn_states i.e. that the graph contains a
-// path from each one of the inputs to at least one of the outputs or RNN
-// states. This catches cases where the user passed the wrong --input_arrays or
-// --output_arrays or --rnn_states, which otherwise may result in cryptic error
-// messages.
-void CheckInputsActuallyUsed(const Model& model) {
-  std::set<string> used_arrays;
-  for (const string& output : model.flags.output_arrays()) {
-    used_arrays.insert(output);
-  }
-  for (const auto& rnn_state : model.flags.rnn_states()) {
-    used_arrays.insert(rnn_state.back_edge_source_array());
-  }
-  for (int i = model.operators.size() - 1; i >= 0; i--) {
-    bool is_op_used = false;
-    for (const string& op_output : model.operators[i]->outputs) {
-      if (used_arrays.count(op_output)) {
-        is_op_used = true;
-        break;
-      }
-    }
-    if (!is_op_used) {
-      continue;
-    }
-    for (const string& op_input : model.operators[i]->inputs) {
-      used_arrays.insert(op_input);
-    }
-  }
-  for (const auto& input_array : model.flags.input_arrays()) {
-    QCHECK(used_arrays.count(input_array.name()))
-        << "The graph does not connect the input (" << input_array.name()
-        << ") specified by --input_arrays to any of the specified "
-        << "--output_arrays ("
-        << absl::StrJoin(model.flags.output_arrays(), ", ")
-        << "). Did you pass the wrong flags for this model, "
-        << "or is that model's graph actually incomplete?";
-  }
-}
-
 void CheckInvariants(const Model& model) {
+  CheckInputArraysAreNotOutputArrays(model.flags);
+  CheckNonAsciiIOArrays(model.flags);
   CheckNoMissingArray(model);
   CheckNoOrphanedArray(model);
   CheckArrayFieldsConsistent(model);
   CheckOperatorOrdering(model);
-  CheckInputsActuallyUsed(model);
 }
 
 void CheckCountInRange(const ::toco::ModelFlags::ModelCheck& model_check,
@@ -872,7 +995,8 @@ void CheckModelCounts(const Model& model) {
     if (count_type == "None") {
       continue;
     } else if (count_type == "Arrays") {
-      CheckCountInRange(model_check, model.arrays.size(), "count of arrays");
+      CheckCountInRange(model_check, model.GetArrayMap().size(),
+                        "count of arrays");
     } else if (count_type == "Total") {
       CheckCountInRange(model_check, model.operators.size(),
                         "count of all operator instances");
@@ -892,7 +1016,9 @@ void CheckModelCounts(const Model& model) {
 void MakeArrayDims(int num_dims, int batch, int height, int width, int depth,
                    std::vector<int>* out_dims) {
   CHECK(out_dims->empty());
-  if (num_dims == 1) {
+  if (num_dims == 0) {
+    return;
+  } else if (num_dims == 1) {
     CHECK_EQ(batch, 1);
     *out_dims = {depth};
   } else if (num_dims == 2) {
@@ -914,9 +1040,9 @@ void CreateOrCheckRnnStateArray(const string& name, int size, Model* model) {
     // Pick 'num_dims' and 'batch' from the first input_arrays, unless we find
     // a better match by name.
     if (input_array.name() == name || num_dims == -1) {
-      num_dims = input_array.shape_size();
-      if (num_dims != 0) {
-        batch = input_array.shape(0);
+      num_dims = input_array.shape().dims_size();
+      if (num_dims > 0) {
+        batch = input_array.shape().dims(0);
       }
     }
   }
@@ -924,13 +1050,13 @@ void CreateOrCheckRnnStateArray(const string& name, int size, Model* model) {
   if (array.has_shape()) {
     num_dims = array.shape().dimensions_count();
   }
-  std::vector<int> dims;
-  MakeArrayDims(num_dims, batch, 1, 1, size, &dims);
   CHECK(array.data_type == ArrayDataType::kFloat ||
         array.data_type == ArrayDataType::kNone);
   array.data_type = ArrayDataType::kFloat;
-  if (!array.has_shape()) {
+  if (!array.has_shape() && num_dims >= 0) {
     Shape* shape = array.mutable_shape();
+    std::vector<int> dims;
+    MakeArrayDims(num_dims, batch, 1, 1, size, &dims);
     *shape->mutable_dims() = dims;
   }
 }
@@ -985,33 +1111,32 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
     RESOLVE_MODEL_FLAG(mean_value);
 #undef RESOLVE_MODEL_FLAG
 
-    if (!specified_input_array.shape().empty()) {
-      if (!dst_input_array->shape().empty()) {
-        QCHECK_EQ(specified_input_array.shape().size(),
-                  dst_input_array->shape().size())
+    if (specified_input_array.has_shape()) {
+      if (dst_input_array->has_shape()) {
+        QCHECK_EQ(specified_input_array.shape().dims_size(),
+                  dst_input_array->shape().dims_size())
             << "For input array '" << specified_input_array.name() << "', "
             << "size of specified input shape flag with size: "
-            << specified_input_array.shape().size()
+            << specified_input_array.shape().dims_size()
             << " does not agree with already defined input shape"
                " of this model, with size: "
-            << dst_input_array->shape().size();
+            << dst_input_array->shape().dims_size();
         // We treat the first dimension as a special case, since it is often
         // a batch size and the input_shape flag is effectively overriding
         // the model.
-        for (int i = 1; i < specified_input_array.shape().size(); i++) {
-          QCHECK_EQ(specified_input_array.shape().Get(i),
-                    dst_input_array->shape().Get(i))
+        for (int i = 1; i < specified_input_array.shape().dims_size(); i++) {
+          QCHECK_EQ(specified_input_array.shape().dims(i),
+                    dst_input_array->shape().dims(i))
               << "At dimension number " << i << " of input array "
               << specified_input_array.name() << ", the specified shape's "
               << "dimension flag with dimension: "
-              << specified_input_array.shape().Get(i)
+              << specified_input_array.shape().dims(i)
               << " does not agree with already defined shape"
               << " of this model, with dimension: "
-              << dst_input_array->shape().Get(i);
+              << dst_input_array->shape().dims(i);
         }
       } else {
-        dst_input_array->mutable_shape()->CopyFrom(
-            specified_input_array.shape());
+        *dst_input_array->mutable_shape() = specified_input_array.shape();
       }
     }
 
@@ -1042,25 +1167,14 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
 
 #undef RESOLVE_MODEL_FLAG
 
-  if (model->flags.rnn_states_size() == 0) {
+  if (!model_flags.rnn_states().empty()) {
     model->flags.mutable_rnn_states()->CopyFrom(model_flags.rnn_states());
-  } else {
-    CHECK_EQ(model->flags.rnn_states_size(), model_flags.rnn_states_size());
-    for (int i = 0; i < model->flags.rnn_states_size(); i++) {
-      CHECK_EQ(model->flags.rnn_states(i).state_array(),
-               model_flags.rnn_states(i).state_array());
-      CHECK_EQ(model->flags.rnn_states(i).back_edge_source_array(),
-               model_flags.rnn_states(i).back_edge_source_array());
-    }
   }
 
   if (model->flags.model_checks_size() == 0) {
     model->flags.mutable_model_checks()->CopyFrom(model_flags.model_checks());
   }
 
-  QCHECK_GT(model->flags.input_arrays_size(), 0)
-      << "This model does not define input arrays, so a "
-         "--input_arrays flag must be given on the command-line.";
   QCHECK_GT(model->flags.output_arrays_size(), 0)
       << "This model does not define output arrays, so a "
          "--output_arrays flag must be given on the command-line.";
@@ -1088,24 +1202,27 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
       input_array.data_type = ArrayDataType::kFloat;
     }
 
-    if (!input_array.has_shape()) {
-      QCHECK(!input_array_proto.shape().empty())
-          << "This model does not have shape defined for input array "
-          << input_array_proto.name();
-    }
-
     // Compare/merge the model->flags describing the input_shape with
     // the actual input array's shape.
-    auto& input_array_dims = *input_array.mutable_shape()->mutable_dims();
-    if (input_array_dims.empty()) {
-      for (auto dim : input_array_proto.shape()) {
-        CHECK_GE(dim, 1);
-        input_array_dims.push_back(dim);
+    if (!input_array.has_shape()) {
+      if (input_array_proto.has_shape()) {
+        auto& input_array_dims = *input_array.mutable_shape()->mutable_dims();
+        for (auto dim : input_array_proto.shape().dims()) {
+          CHECK_GE(dim, 1);
+          input_array_dims.push_back(dim);
+        }
       }
     } else {
-      CHECK_EQ(input_array_dims.size(), input_array_proto.shape_size());
-      for (int i = 0; i < input_array_dims.size(); i++) {
-        CHECK_EQ(input_array_dims[i], input_array_proto.shape(i));
+      if (input_array_proto.has_shape()) {
+        // If an input shape was specified on the flags ensure that it matches
+        // the actual shape in the model.
+        const auto& input_array_dims =
+            *input_array.mutable_shape()->mutable_dims();
+        CHECK_EQ(input_array_dims.size(),
+                 input_array_proto.shape().dims_size());
+        for (int i = 0; i < input_array_dims.size(); i++) {
+          CHECK_EQ(input_array_dims[i], input_array_proto.shape().dims(i));
+        }
       }
     }
 
@@ -1128,12 +1245,22 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
   }
   // Creation of the RNN state arrays
   for (const auto& rnn_state : model->flags.rnn_states()) {
-    if (!rnn_state.manually_create()) {
-      continue;
-    }
     CreateOrCheckRnnStateArray(rnn_state.state_array(), rnn_state.size(),
                                model);
   }
+
+  for (const auto& input_array : model->flags.input_arrays()) {
+    if (input_array.has_shape()) {
+      CHECK(input_array.shape().dims_size());
+    }
+  }
+
+  model->flags.set_allow_nonascii_arrays(model_flags.allow_nonascii_arrays());
+  model->flags.set_allow_nonexistent_arrays(
+      model_flags.allow_nonexistent_arrays());
+
+  CHECK(!model->flags.has_arrays_extra_info());
+  *model->flags.mutable_arrays_extra_info() = model_flags.arrays_extra_info();
 }
 
 void CheckIsReadyForQuantization(const Model& model) {
@@ -1193,10 +1320,28 @@ int ElementSize(ArrayDataType data_type) {
   switch (data_type) {
     case ArrayDataType::kFloat:
       return 4;
-    case ArrayDataType::kInt32:
-      return 4;
+    case ArrayDataType::kInt8:
+      return 1;
     case ArrayDataType::kUint8:
       return 1;
+    case ArrayDataType::kInt16:
+      return 2;
+    case ArrayDataType::kUint16:
+      return 2;
+    case ArrayDataType::kInt32:
+      return 4;
+    case ArrayDataType::kUint32:
+      return 4;
+    case ArrayDataType::kInt64:
+      return 8;
+    case ArrayDataType::kUint64:
+      return 8;
+
+    // Usually not critical limitation because strings are only input and/or
+    // output.
+    case ArrayDataType::kString:
+      LOG(FATAL) << "Transient arrays with strings are not supported yet";
+      return 0;
     default:
       LOG(FATAL) << "Should not get here.";
       return 0;
@@ -1213,6 +1358,8 @@ void DropMinMax(Model* model, const string& array_name) {
 }
 
 bool IsAllocatableTransientArray(const Model& model, const string& array_name) {
+  // Optional array is not transient
+  if (model.IsOptionalArray(array_name)) return false;
   // The model's input and output arrays are externally allocated.
   // They are not transient arrays.
   if (IsInputArray(model, array_name)) {
@@ -1223,7 +1370,7 @@ bool IsAllocatableTransientArray(const Model& model, const string& array_name) {
       return false;
     }
   }
-  const auto& array = model.arrays.at(array_name);
+  const auto& array = &model.GetArray(array_name);
   // An array with a constant buffer isn't a transient array.
   if (!!array->buffer) {
     return false;
@@ -1236,13 +1383,14 @@ bool IsAllocatableTransientArray(const Model& model, const string& array_name) {
 }
 
 string AvailableArrayName(const Model& model, const string& name) {
-  if (!model.arrays.count(name)) {
+  if (!model.HasArray(name) && !model.IsOptionalArray(name)) {
     return name;
   }
   const int kNumSuffixesToTry = 1000;
   for (int i = 0; i < kNumSuffixesToTry; i++) {
     const string& name_with_suffix = toco::port::StringF("%s_%d", name, i);
-    if (!model.arrays.count(name_with_suffix)) {
+    if (!model.HasArray(name_with_suffix) &&
+        !model.IsOptionalArray(name_with_suffix)) {
       return name_with_suffix;
     }
   }
@@ -1260,12 +1408,12 @@ string ShapeToString(const Shape& shape) {
 }
 
 void PrintArrayShape(Model* model, const string& name) {
-  if (!model->arrays[name]->has_shape()) {
+  if (!model->GetArray(name).has_shape()) {
     LOG(INFO) << name << " has no shape";
     return;
   }
   LOG(INFO) << name
-            << " has shape: " << ShapeToString(model->arrays[name]->shape());
+            << " has shape: " << ShapeToString(model->GetArray(name).shape());
 }
 
 bool IsArrayFullyConnectedWeights(const Model& model, const string& name) {
@@ -1286,6 +1434,21 @@ bool IsArrayFullyConnectedWeights(const Model& model, const string& name) {
   return is_fc_weights;
 }
 
+string CreateInt32Array(Model* model, const string& param_name,
+                        const std::vector<int>& value) {
+  auto param_array_name = AvailableArrayName(*model, param_name);
+  auto& param_array = model->GetOrCreateArray(param_array_name);
+  param_array.mutable_shape()->ReplaceDims({static_cast<int>(value.size())});
+  param_array.data_type = ArrayDataType::kInt32;
+  auto& param_array_data =
+      param_array.GetMutableBuffer<ArrayDataType::kInt32>().data;
+  param_array_data.resize(RequiredBufferSizeForShape(param_array.shape()));
+  for (int i = 0; i < value.size(); ++i) {
+    param_array_data[i] = value[i];
+  }
+  return param_array_name;
+}
+
 bool EstimateArithmeticOpsCount(const Model& model, int64* result) {
   int64 total = 0;
   for (const auto& op : model.operators) {
@@ -1321,8 +1484,19 @@ bool EstimateArithmeticOpsCount(const Model& model, int64* result) {
         total += RequiredBufferSizeForShape(output_array.shape());
         break;
       }
+      case OperatorType::kAddN: {
+        const auto& output_array = model.GetArray(op->outputs[0]);
+        if (!output_array.has_shape()) {
+          return false;
+        }
+        // AddN cost is roughly the same cost as N-1 Adds.
+        const int num_adds = op->inputs.size() - 1;
+        total += num_adds * RequiredBufferSizeForShape(output_array.shape());
+        break;
+      }
       case OperatorType::kLogistic:
       case OperatorType::kSoftmax:
+      case OperatorType::kLogSoftmax:
       case OperatorType::kTanh: {
         const auto& output_array = model.GetArray(op->outputs[0]);
         if (!output_array.has_shape()) {
@@ -1388,8 +1562,6 @@ bool EstimateArithmeticOpsCount(const Model& model, int64* result) {
   return true;
 }
 
-namespace {
-
 void GetShuffleShape(AxesOrder input_axes_order, AxesOrder output_axes_order,
                      std::vector<int>* shuffle) {
   CHECK_EQ(AxesCount(input_axes_order), AxesCount(output_axes_order));
@@ -1424,8 +1596,6 @@ void GetShuffleShape(AxesOrder input_axes_order, AxesOrder output_axes_order,
   }
 }
 
-// Extend shuffle is designed to match ExtendShape, which pads the shape with
-// unit dimensions at the beginning.
 void ExtendShuffle(const std::vector<int>& input_shuffle, int newdim,
                    std::vector<int>* extended_shuffle) {
   *extended_shuffle = input_shuffle;
@@ -1440,8 +1610,6 @@ void ExtendShuffle(const std::vector<int>& input_shuffle, int newdim,
   }
 }
 
-}  // end anonymous namespace
-
 void ShuffleDims(const Shape& input_shape, AxesOrder input_axes_order,
                  AxesOrder output_axes_order, Shape* output_shape) {
   if (input_axes_order == AxesOrder::kHWIM &&
@@ -1461,9 +1629,11 @@ void ShuffleDims(const Shape& input_shape, AxesOrder input_axes_order,
   }
 }
 
-void ShuffleArray(const Shape& input_shape, AxesOrder input_axes_order,
-                  AxesOrder output_axes_order, const Shape& output_shape,
-                  const float* input_data, float* output_data) {
+template <typename T>
+void ShuffleArrayTemplate(const Shape& input_shape, AxesOrder input_axes_order,
+                          AxesOrder output_axes_order,
+                          const Shape& output_shape, const T* input_data,
+                          T* output_data) {
   if (input_axes_order == AxesOrder::kHWIM &&
       output_axes_order == AxesOrder::k1HWO) {
     // This special case isn't just a permutation, the IM pair of dims get
@@ -1515,16 +1685,15 @@ void ShuffleArray(const Shape& input_shape, AxesOrder input_axes_order,
   const int output_stride_3 = output_stride_2 * output_size_2;
 
   for (int i3 = 0; i3 < output_size_3; i3++) {
-    const float* const input_ptr_3 = input_data + i3 * input_stride_3;
-    float* const output_ptr_3 = output_data + i3 * output_stride_3;
+    const T* const input_ptr_3 = input_data + i3 * input_stride_3;
+    T* const output_ptr_3 = output_data + i3 * output_stride_3;
     for (int i2 = 0; i2 < output_size_2; i2++) {
-      const float* const input_ptr_2 = input_ptr_3 + i2 * input_stride_2;
-      float* const output_ptr_2 = output_ptr_3 + i2 * output_stride_2;
+      const T* const input_ptr_2 = input_ptr_3 + i2 * input_stride_2;
+      T* const output_ptr_2 = output_ptr_3 + i2 * output_stride_2;
       for (int i1 = 0; i1 < output_size_1; i1++) {
-        const float* input_ptr = input_ptr_2 + i1 * input_stride_1;
-        float* output_ptr = output_ptr_2 + i1 * output_stride_1;
-        float* const output_ptr_end =
-            output_ptr + output_size_0 * output_stride_0;
+        const T* input_ptr = input_ptr_2 + i1 * input_stride_1;
+        T* output_ptr = output_ptr_2 + i1 * output_stride_1;
+        T* const output_ptr_end = output_ptr + output_size_0 * output_stride_0;
         while (output_ptr != output_ptr_end) {
           *output_ptr = *input_ptr;
           input_ptr += input_stride_0;
@@ -1535,6 +1704,20 @@ void ShuffleArray(const Shape& input_shape, AxesOrder input_axes_order,
   }
 }
 
+void ShuffleArray(const Shape& input_shape, AxesOrder input_axes_order,
+                  AxesOrder output_axes_order, const Shape& output_shape,
+                  const uint8* input_data, uint8* output_data) {
+  ShuffleArrayTemplate<uint8>(input_shape, input_axes_order, output_axes_order,
+                              output_shape, input_data, output_data);
+}
+
+void ShuffleArray(const Shape& input_shape, AxesOrder input_axes_order,
+                  AxesOrder output_axes_order, const Shape& output_shape,
+                  const float* input_data, float* output_data) {
+  ShuffleArrayTemplate<float>(input_shape, input_axes_order, output_axes_order,
+                              output_shape, input_data, output_data);
+}
+
 int AxesCount(AxesOrder axes_order) {
   switch (axes_order) {
     case AxesOrder::kOneAxis:
@@ -1571,18 +1754,20 @@ bool IsDiscardableArray(const Model& model, const string& array_name) {
     }
   }
   for (const auto& rnn_state : model.flags.rnn_states()) {
-    if (array_name == rnn_state.state_array()) {
-      return false;
-    }
-    if (array_name == rnn_state.back_edge_source_array()) {
-      return false;
+    if (!rnn_state.discardable()) {
+      if (array_name == rnn_state.state_array()) {
+        return false;
+      }
+      if (array_name == rnn_state.back_edge_source_array()) {
+        return false;
+      }
     }
   }
   return true;
 }
 
 void CheckFinalDataTypesSatisfied(const Model& model) {
-  for (const auto& array_entry : model.arrays) {
+  for (const auto& array_entry : model.GetArrayMap()) {
     const auto& array = *array_entry.second;
     if (array.final_data_type != ArrayDataType::kNone) {
       CHECK(array.final_data_type == array.data_type)
@@ -1609,4 +1794,15 @@ ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type) {
   }
 }
 
+void UseArraysExtraInfo(Model* model) {
+  for (const auto& entry : model->flags.arrays_extra_info().entries()) {
+    QCHECK(model->HasArray(entry.name()))
+        << "ArraysExtraInfo refers to non-existent array name: "
+        << entry.name();
+    auto& minmax = model->GetArray(entry.name()).GetOrCreateMinMax();
+    minmax.min = entry.min();
+    minmax.max = entry.max();
+  }
+}
+
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index d820d619d0de425407e88076082a3e0f8d4783a9..a2dde09156284e1c1af45091b016520b0e353627 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
 
 #include <algorithm>
 #include <cmath>
@@ -23,7 +23,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "google/protobuf/text_format.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/core/platform/logging.h"
 #if TOCO_SUPPORT_PORTABLE_PROTOS
 #include "third_party/protobuf/src/google/protobuf/text_format.h"
@@ -50,6 +50,8 @@ namespace toco {
 constexpr int kLogLevelModelChanged = 1;
 constexpr int kLogLevelModelUnchanged = 2;
 
+absl::string_view FindLongestCommonPrefix(absl::string_view a,
+                                          absl::string_view b);
 string LogName(const Operator& op);
 
 bool IsInputArray(const Model& model, const string& name);
@@ -58,6 +60,7 @@ int CountTrueOutputs(const Model& model, const Operator& op);
 
 int CountOpsWithInput(const Model& model, const string& array_name);
 bool DeleteArrayIfUnused(const string& array_name, Model* model);
+bool DeleteArrayIfUsedOnce(const string& array_name, Model* model);
 
 std::vector<std::unique_ptr<Operator>>::const_iterator FindOpWithOutput(
     const Model& model, const string& array_name);
@@ -65,10 +68,15 @@ Operator* GetOpWithOutput(const Model& model, const string& array_name);
 
 std::vector<std::unique_ptr<Operator>>::iterator FindOpWithOutput(
     Model& model, const string& array_name);
+
 Operator* GetOpWithOutput(const Model& model, const string& array_name);
 
 std::vector<std::unique_ptr<Operator>>::const_iterator FindOpWithInput(
     const Model& model, const string& array_name);
+
+std::vector<std::unique_ptr<Operator>>::iterator FindOpWithInput(
+    Model& model, const string& array_name);
+
 Operator* GetOpWithInput(const Model& model, const string& array_name);
 Operator* GetFirstOpWithInput(const Model& model, const string& array_name);
 
@@ -80,29 +88,12 @@ std::vector<std::unique_ptr<Operator>>::iterator FindOp(Model& model,
 const char* OperatorTypeName(OperatorType type);
 string HelpfulOperatorTypeName(const Operator& op);
 
+bool OperatorSupportsFusedActivation(OperatorType type);
+
 void DumpGraphvizVideoFrame(const Model& model);
 void LogDump(int log_level, const string& message, const Model& model);
 void LogSummary(int log_level, const string& message, const Model& model);
 
-inline bool ParseFromStringOverload(const std::string& in,
-                                    TFLITE_PROTO_NS::Message* proto) {
-  return TFLITE_PROTO_NS::TextFormat::ParseFromString(in, proto);
-}
-
-template <typename Proto>
-bool ParseFromStringEitherTextOrBinary(const std::string& input_file_contents,
-                                       Proto* proto) {
-  if (proto->ParseFromString(input_file_contents)) {
-    return true;
-  }
-
-  if (ParseFromStringOverload(input_file_contents, proto)) {
-    return true;
-  }
-
-  return false;
-}
-
 // TODO(b/36075966): Clean up when dims superseded by array shape.
 void ExtendShape(Shape* shape, int new_shape_size);
 
@@ -270,15 +261,33 @@ void PrintArrayShape(Model* model, const string& name);
 void MakeArrayDims(int num_dims, int batch, int height, int width, int depth,
                    std::vector<int>* out_dims);
 
+// Defines a constant int32 array with the provided values formatted for use
+// as op parameters.
+string CreateInt32Array(Model* model, const string& param_name,
+                        const std::vector<int>& value);
+
 bool EstimateArithmeticOpsCount(const Model& model, int64* result);
 
 int AxesCount(AxesOrder axes_order);
 
+// Returns the permutation of the dimensions based on the input axes order and
+// output axes order.
+void GetShuffleShape(AxesOrder input_axes_order, AxesOrder output_axes_order,
+                     std::vector<int>* shuffle);
+
+// Extend shuffle is designed to match ExtendShape, which pads the shape with
+// unit dimensions at the beginning.
+void ExtendShuffle(const std::vector<int>& input_shuffle, int newdim,
+                   std::vector<int>* extended_shuffle);
+
 void ShuffleDims(const Shape& input_shape, AxesOrder input_axes_order,
                  AxesOrder output_axes_order, Shape* output_shape);
 void ShuffleArray(const Shape& input_shape, AxesOrder input_axes_order,
                   AxesOrder output_axes_order, const Shape& output_shape,
                   const float* input_data, float* output_data);
+void ShuffleArray(const Shape& input_shape, AxesOrder input_axes_order,
+                  AxesOrder output_axes_order, const Shape& output_shape,
+                  const uint8* input_data, uint8* output_data);
 
 // Returns true if it may be OK for any graph transformation to ever discard
 // that array. The idea is that we can't ever discard arrays that are either
@@ -290,6 +299,8 @@ void CheckFinalDataTypesSatisfied(const Model& model);
 
 ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type);
 
+void UseArraysExtraInfo(Model* model);
+
 }  // namespace toco
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD
index 21b32d8434204ca625ba0c5d3f371ee8061b77d7..999ccf2ebc009b6b7c50a9a2d1667d69a3f690e7 100644
--- a/tensorflow/contrib/lite/tools/BUILD
+++ b/tensorflow/contrib/lite/tools/BUILD
@@ -6,6 +6,16 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 
+py_binary(
+    name = "visualize",
+    srcs = ["visualize.py"],
+    data = [
+        "//tensorflow/contrib/lite/schema:schema.fbs",
+        "@flatbuffers//:flatc",
+    ],
+    srcs_version = "PY2AND3",
+)
+
 tf_cc_binary(
     name = "generate_op_registrations",
     srcs = ["gen_op_registration_main.cc"],
@@ -13,6 +23,28 @@ tf_cc_binary(
         "//tensorflow/contrib/lite/tools:gen_op_registration",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_cc_binary(
+    name = "benchmark_model",
+    srcs = ["benchmark_model.cc"],
+    linkopts = select({
+        "//tensorflow:android": [
+            "-pie",
+            "-landroid",
+            "-lm",
+            "-z defs",
+            "-Wl,--exclude-libs,ALL",  # Exclude syms in all libs from auto export
+        ],
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":mutable_op_resolver",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
     ],
 )
 
@@ -61,3 +93,34 @@ filegroup(
     ),
     visibility = ["//tensorflow:__subpackages__"],
 )
+
+cc_library(
+    name = "verifier",
+    srcs = ["verifier.cc"],
+    hdrs = ["verifier.h"],
+    deps = [
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:schema_fbs_version",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/schema:schema_fbs",
+        "@com_google_absl//absl/base:core_headers",
+    ],
+)
+
+cc_test(
+    name = "verifier_test",
+    size = "small",
+    srcs = ["verifier_test.cc"],
+    deps = [
+        ":mutable_op_resolver",
+        ":verifier",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:schema_fbs_version",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/schema:schema_fbs",
+        "//tensorflow/contrib/lite/testing:util",
+        "//tensorflow/core:framework_lite",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
diff --git a/tensorflow/contrib/lite/tools/benchmark_model.cc b/tensorflow/contrib/lite/tools/benchmark_model.cc
index f80949b23e417d074e070a28608688d8863765b5..6ae3ab57294a92162b15f326630ac202a9ba2a82 100644
--- a/tensorflow/contrib/lite/tools/benchmark_model.cc
+++ b/tensorflow/contrib/lite/tools/benchmark_model.cc
@@ -31,7 +31,12 @@ void RegisterSelectedOps(::tflite::MutableOpResolver* resolver);
 #endif
 
 #define LOG(x) std::cerr
-#define CHECK(x) if (!(x)) { LOG(ERROR) << #x << "failed"; exit(1); }
+
+#define CHECK(x)                  \
+  if (!(x)) {                     \
+    LOG(ERROR) << #x << "failed"; \
+    exit(1);                      \
+  }
 
 namespace tensorflow {
 namespace benchmark_tflite_model {
diff --git a/tensorflow/contrib/lite/tools/gen_op_registration.h b/tensorflow/contrib/lite/tools/gen_op_registration.h
index 318859e23d7b404c130f003b0e249893f2ed92fe..5f2ac6ca97fde9a2fe6f4bcf20184f6ef6606f0b 100644
--- a/tensorflow/contrib/lite/tools/gen_op_registration.h
+++ b/tensorflow/contrib/lite/tools/gen_op_registration.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOOLS_GEN_OP_REGISTRATION_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOOLS_GEN_OP_REGISTRATION_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_GEN_OP_REGISTRATION_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_GEN_OP_REGISTRATION_H_
 
 #include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/string.h"
@@ -36,4 +36,4 @@ void ReadOpsFromModel(const ::tflite::Model* model,
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOOLS_GEN_OP_REGISTRATION_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_GEN_OP_REGISTRATION_H_
diff --git a/tensorflow/contrib/lite/tools/gen_op_registration_main.cc b/tensorflow/contrib/lite/tools/gen_op_registration_main.cc
index 1b28b8bcd97125a67bdf8eecb2c61a999a72425d..17b514c9169817479e18eecf5799ea4371f3b051 100644
--- a/tensorflow/contrib/lite/tools/gen_op_registration_main.cc
+++ b/tensorflow/contrib/lite/tools/gen_op_registration_main.cc
@@ -13,30 +13,50 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <cassert>
 #include <fstream>
+#include <map>
 #include <sstream>
 #include <string>
 #include <vector>
 
+#include "absl/strings/strip.h"
 #include "tensorflow/contrib/lite/tools/gen_op_registration.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/util/command_line_flags.h"
 
+const char kInputModelFlag[] = "input_model";
+const char kOutputRegistrationFlag[] = "output_registration";
+const char kTfLitePathFlag[] = "tflite_path";
+
 using tensorflow::Flag;
 using tensorflow::Flags;
 using tensorflow::string;
 
+void ParseFlagAndInit(int argc, char** argv, string* input_model,
+                      string* output_registration, string* tflite_path) {
+  std::vector<tensorflow::Flag> flag_list = {
+      Flag(kInputModelFlag, input_model, "path to the tflite model"),
+      Flag(kOutputRegistrationFlag, output_registration,
+           "filename for generated registration code"),
+      Flag(kTfLitePathFlag, tflite_path, "Path to tensorflow lite dir"),
+  };
+
+  Flags::Parse(&argc, argv, flag_list);
+  tensorflow::port::InitMain(argv[0], &argc, &argv);
+}
+
 namespace {
 
-void GenerateFileContent(const string& filename,
+void GenerateFileContent(const std::string& tflite_path,
+                         const std::string& filename,
                          const std::vector<string>& builtin_ops,
                          const std::vector<string>& custom_ops) {
   std::ofstream fout(filename);
 
-  fout << "#include "
-          "\"third_party/tensorflow/contrib/lite/model.h\"\n";
-  fout << "#include "
-          "\"third_party/tensorflow/contrib/lite/tools/mutable_op_resolver.h\"\n";
+  fout << "#include \"" << tflite_path << "/model.h\"\n";
+  fout << "#include \"" << tflite_path << "/tools/mutable_op_resolver.h\"\n";
+
   fout << "namespace tflite {\n";
   fout << "namespace ops {\n";
   if (!builtin_ops.empty()) {
@@ -78,22 +98,20 @@ void GenerateFileContent(const string& filename,
 int main(int argc, char** argv) {
   string input_model;
   string output_registration;
-  std::vector<tensorflow::Flag> flag_list = {
-      Flag("input_model", &input_model, "path to the tflite model"),
-      Flag("output_registration", &output_registration,
-           "filename for generated registration code"),
-  };
-  Flags::Parse(&argc, argv, flag_list);
+  string tflite_path;
+  ParseFlagAndInit(argc, argv, &input_model, &output_registration,
+                   &tflite_path);
 
-  tensorflow::port::InitMain(argv[0], &argc, &argv);
   std::vector<string> builtin_ops;
   std::vector<string> custom_ops;
-
   std::ifstream fin(input_model);
   std::stringstream content;
   content << fin.rdbuf();
-  const ::tflite::Model* model = ::tflite::GetModel(content.str().data());
+  // Need to store content data first, otherwise, it won't work in bazel.
+  string content_str = content.str();
+  const ::tflite::Model* model = ::tflite::GetModel(content_str.data());
   ::tflite::ReadOpsFromModel(model, &builtin_ops, &custom_ops);
-  GenerateFileContent(output_registration, builtin_ops, custom_ops);
+  GenerateFileContent(tflite_path, output_registration, builtin_ops,
+                      custom_ops);
   return 0;
 }
diff --git a/tensorflow/contrib/lite/tools/mutable_op_resolver.h b/tensorflow/contrib/lite/tools/mutable_op_resolver.h
index 8206a5481d7c43a9c8fb8445d056dbc7f022cfcc..573a359c458acb6e4320c5a21cb378cdde720924 100644
--- a/tensorflow/contrib/lite/tools/mutable_op_resolver.h
+++ b/tensorflow/contrib/lite/tools/mutable_op_resolver.h
@@ -12,23 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOOLS_MUTABLE_OP_RESOLVER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOOLS_MUTABLE_OP_RESOLVER_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_MUTABLE_OP_RESOLVER_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_MUTABLE_OP_RESOLVER_H_
 
 #include <map>
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/model.h"
 
 // Needed to resolve unordered_set hash on older compilers.
-namespace std
-{
-template<>
-  struct hash<tflite::BuiltinOperator> {
-    size_t operator()(const tflite::BuiltinOperator &op) const {
-      return std::hash<int>()(op);
-    }
-  };
-}
+namespace std {
+template <>
+struct hash<tflite::BuiltinOperator> {
+  size_t operator()(const tflite::BuiltinOperator& op) const {
+    return std::hash<int>()(op);
+  }
+};
+}  // namespace std
 
 namespace tflite {
 
@@ -47,10 +46,10 @@ class MutableOpResolver : public OpResolver {
   void AddCustom(const char* name, TfLiteRegistration* registration);
 
  private:
-  std::map<tflite::BuiltinOperator, TfLiteRegistration*> builtins_;
+  std::map<int, TfLiteRegistration*> builtins_;
   std::map<std::string, TfLiteRegistration*> custom_ops_;
 };
 
 }  // namespace tflite
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOOLS_MUTABLE_OP_RESOLVER_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_MUTABLE_OP_RESOLVER_H_
diff --git a/tensorflow/contrib/lite/tools/verifier.cc b/tensorflow/contrib/lite/tools/verifier.cc
new file mode 100644
index 0000000000000000000000000000000000000000..59c74205f0a311ec12ff87f46622041605fb493b
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/verifier.cc
@@ -0,0 +1,234 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/tools/verifier.h"
+#include <climits>
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/contrib/lite/version.h"
+
+namespace tflite {
+
+namespace {
+
+// Reports error message when the reporter is set.
+void ReportError(ErrorReporter* error_reporter, const char* format, ...) {
+  if (error_reporter) {
+    va_list args;
+    va_start(args, format);
+    error_reporter->Report(format, args);
+    va_end(args);
+  }
+}
+
+// Returns the int32_t value pointed by ptr.
+const uint32_t* GetIntPtr(const char* ptr) {
+  return reinterpret_cast<const uint32_t*>(ptr);
+}
+
+// Verifies flatbuffer format of the model contents and returns the in-memory
+// model.
+const Model* VerifyFlatbufferAndGetModel(const void* buf, size_t len) {
+  ::flatbuffers::Verifier verifier(static_cast<const uint8_t*>(buf), len);
+  if (VerifyModelBuffer(verifier)) {
+    return ::tflite::GetModel(buf);
+  } else {
+    return nullptr;
+  }
+}
+
+const uint32_t kMaxNumString = UINT_MAX / sizeof(int32_t) - 2;
+
+// Verifies string tensor has legit buffer contents that follow the schema
+// defined in lite/string_util.h
+bool VerifyStringTensorBuffer(const Buffer& buffer,
+                              ErrorReporter* error_reporter) {
+  uint32_t buffer_size = buffer.data()->size();
+  const char* buffer_ptr = reinterpret_cast<const char*>(buffer.data()->data());
+
+  uint32_t num_strings = *GetIntPtr(buffer_ptr);
+  if (num_strings > kMaxNumString) {
+    ReportError(error_reporter,
+                "String tensor has invalid num of string set: %d", num_strings);
+    return false;
+  }
+  uint32_t header_offsets =
+      static_cast<uint32_t>(num_strings + 2) * sizeof(int32_t);
+
+  if (buffer_size < header_offsets) {
+    ReportError(error_reporter,
+                "String tensor buffer requires at least %d bytes, but is "
+                "allocated with %d bytes",
+                header_offsets, buffer_size);
+    return false;
+  }
+
+  uint32_t prev_ptr = header_offsets;
+  uint32_t offset = sizeof(int32_t);
+
+  if (*GetIntPtr(buffer_ptr + offset) != header_offsets) {
+    ReportError(error_reporter,
+                "String tensor buffer initial offset must be: %d",
+                header_offsets);
+    return false;
+  }
+  offset += sizeof(int32_t);
+  for (int i = 1; i <= num_strings; i++, offset += sizeof(int32_t)) {
+    int string_offset = *GetIntPtr(buffer_ptr + offset);
+    if (string_offset < prev_ptr || string_offset > buffer_size) {
+      ReportError(error_reporter, "String tensor buffer is invalid: index %d",
+                  i);
+      return false;
+    }
+  }
+  if (*GetIntPtr(buffer_ptr + offset - sizeof(int32_t)) != buffer_size) {
+    ReportError(error_reporter, "String tensor buffer last offset must be %d",
+                buffer_size);
+    return false;
+  }
+  return true;
+}
+
+// Verifies numeric tensor has legit buffer.
+bool VerifyNumericTensorBuffer(const Tensor& tensor, const Buffer& buffer,
+                               ErrorReporter* error_reporter) {
+  uint64_t bytes_required = 1;
+  for (int dim : *tensor.shape()) {
+    bytes_required *= dim;
+    if (bytes_required > UINT_MAX) {
+      ReportError(error_reporter, "Tensor dimension overflow");
+      return false;
+    }
+  }
+  switch (tensor.type()) {
+    case TensorType_FLOAT32:
+      bytes_required *= sizeof(float);
+      break;
+    case TensorType_INT32:
+      bytes_required *= sizeof(int32_t);
+      break;
+    case TensorType_UINT8:
+      bytes_required *= sizeof(uint8_t);
+      break;
+    case TensorType_INT64:
+      bytes_required *= sizeof(int64_t);
+      break;
+    case TensorType_FLOAT16:
+      // FALLTHROUGH_INTENDED;
+    default:
+      ReportError(error_reporter, "Invalid tensor type: %d", tensor.type());
+      return false;
+  }
+  if (bytes_required > UINT_MAX) {
+    ReportError(error_reporter, "Tensor dimension overflow");
+    return false;
+  }
+
+  if (bytes_required != buffer.data()->size()) {
+    ReportError(
+        error_reporter,
+        "Tensor requires %d bytes, but is allocated with %d bytes buffer",
+        bytes_required, buffer.data()->size());
+    return false;
+  }
+  return true;
+
+  // TODO(yichengfan): verify quantized tensors.
+}
+
+// Verifies tensors have valid properties and legit buffer if set.
+bool VerifyTensors(const Model& model, ErrorReporter* error_reporter) {
+  if (!model.subgraphs()) {
+    return true;
+  }
+  for (const auto& subgraph : *model.subgraphs()) {
+    if (!subgraph->tensors()) {
+      continue;
+    }
+    for (const auto& tensor : *subgraph->tensors()) {
+      if (!tensor->buffer()) {
+        continue;
+      }
+      if (tensor->buffer() >= model.buffers()->size()) {
+        ReportError(error_reporter, "Invalid tensor buffer index: %d",
+                    tensor->buffer());
+        return false;
+      }
+      auto* buffer = model.buffers()->Get(tensor->buffer());
+      if (!buffer || !buffer->data()) {
+        ReportError(error_reporter, "Tensor buffer %d not set",
+                    tensor->buffer());
+        return false;
+      }
+
+      if (tensor->type() == TensorType_STRING) {
+        if (!VerifyStringTensorBuffer(*buffer, error_reporter)) {
+          return false;
+        }
+      } else {
+        if (!VerifyNumericTensorBuffer(*tensor, *buffer, error_reporter)) {
+          return false;
+        }
+      }
+    }
+  }
+  return true;
+}
+
+bool VerifyOps(const Model& model, const OpResolver& resolver,
+               ErrorReporter* error_reporter) {
+  if (!model.operator_codes()) {
+    return true;
+  }
+  for (const auto& opcode : *model.operator_codes()) {
+    if (opcode->builtin_code() == BuiltinOperator_CUSTOM) {
+      if (!resolver.FindOp(opcode->custom_code()->c_str())) {
+        ReportError(error_reporter, "Unsupported custom op: %s",
+                    opcode->custom_code()->c_str());
+        return false;
+      }
+    } else {
+      if (!resolver.FindOp(opcode->builtin_code())) {
+        ReportError(error_reporter, "Unsupported builtin op: %s",
+                    EnumNameBuiltinOperator(opcode->builtin_code()));
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+}  // namespace
+
+bool Verify(const void* buf, size_t len, const OpResolver& resolver,
+            ErrorReporter* error_reporter) {
+  const Model* model = VerifyFlatbufferAndGetModel(buf, len);
+  if (model == nullptr) {
+    ReportError(error_reporter, "Invalid flatbuffer format");
+    return false;
+  }
+  if (model->version() != TFLITE_SCHEMA_VERSION) {
+    ReportError(error_reporter, "Invalid model version %d", model->version());
+    return false;
+  }
+  if (!VerifyTensors(*model, error_reporter)) {
+    return false;
+  }
+  if (!VerifyOps(*model, resolver, error_reporter)) {
+    return false;
+  }
+  return true;
+}
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/tools/verifier.h b/tensorflow/contrib/lite/tools/verifier.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2ee11215c861ed7b27696a8d786bb6e2a48e930
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/verifier.h
@@ -0,0 +1,36 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_VERIFIER_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_VERIFIER_H_
+
+#include <stdio.h>
+
+#include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+
+// Verifies the integrity of a Tensorflow Lite flatbuffer model file.
+// Currently, it verifies:
+// * The file is following a legit flatbuffer schema.
+// * The model is in supported version.
+// * All ops used in the model are supported by OpResolver.
+bool Verify(const void* buf, size_t len, const OpResolver& resolver,
+            ErrorReporter* error_reporter);
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_VERIFIER_H_
diff --git a/tensorflow/contrib/lite/tools/verifier_test.cc b/tensorflow/contrib/lite/tools/verifier_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b3e611f999b2837efbf8876bd989db44c408b8c7
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/verifier_test.cc
@@ -0,0 +1,288 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string>
+#include <vector>
+
+#include "flatbuffers/flatbuffers.h"
+#include "flatbuffers/util.h"
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/allocation.h"
+#include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/contrib/lite/tools/mutable_op_resolver.h"
+#include "tensorflow/contrib/lite/tools/verifier.h"
+#include "tensorflow/contrib/lite/version.h"
+#include "tensorflow/core/framework/numeric_types.h"
+
+namespace tflite {
+
+using flatbuffers::FlatBufferBuilder;
+using flatbuffers::Offset;
+using flatbuffers::Vector;
+
+// Build single subgraph model.
+class TfLiteFlatbufferModelBuilder {
+ public:
+  TfLiteFlatbufferModelBuilder() {
+    buffers_.push_back(
+        CreateBuffer(builder_, builder_.CreateVector(std::vector<uint8_t>{})));
+  }
+
+  TfLiteFlatbufferModelBuilder(const std::vector<BuiltinOperator>& builtin_ops,
+                               const std::vector<string>& custom_ops) {
+    buffers_.push_back(
+        CreateBuffer(builder_, builder_.CreateVector(std::vector<uint8_t>{})));
+
+    for (const auto& iter : builtin_ops) {
+      resolver_.AddBuiltin(iter, &fake_op_);
+    }
+    for (const auto& iter : custom_ops) {
+      resolver_.AddCustom(iter.data(), &fake_op_);
+    }
+  }
+
+  void AddTensor(const std::vector<int>& shape, tflite::TensorType type,
+                 const std::vector<uint8_t>& buffer, const char* name) {
+    int buffer_index = 0;
+    if (!buffer.empty()) {
+      buffer_index = buffers_.size();
+      buffers_.push_back(CreateBuffer(builder_, builder_.CreateVector(buffer)));
+    }
+    tensors_.push_back(CreateTensorDirect(builder_, &shape, type, buffer_index,
+                                          name, /*quantization=*/0));
+  }
+
+  void AddOperator(const std::vector<int32_t>& inputs,
+                   const std::vector<int32_t>& outputs,
+                   tflite::BuiltinOperator builtin_op, const char* custom_op) {
+    operator_codes_.push_back(
+        CreateOperatorCodeDirect(builder_, builtin_op, custom_op));
+    operators_.push_back(CreateOperator(
+        builder_, operator_codes_.size() - 1, builder_.CreateVector(inputs),
+        builder_.CreateVector(outputs), BuiltinOptions_NONE,
+        /*builtin_options=*/0,
+        /*custom_options=*/0, tflite::CustomOptionsFormat_FLEXBUFFERS));
+  }
+
+  void FinishModel(const std::vector<int32_t>& inputs,
+                   const std::vector<int32_t>& outputs) {
+    auto subgraph = std::vector<Offset<SubGraph>>({CreateSubGraph(
+        builder_, builder_.CreateVector(tensors_),
+        builder_.CreateVector(inputs), builder_.CreateVector(outputs),
+        builder_.CreateVector(operators_),
+        builder_.CreateString("test_subgraph"))});
+    auto result = CreateModel(
+        builder_, TFLITE_SCHEMA_VERSION, builder_.CreateVector(operator_codes_),
+        builder_.CreateVector(subgraph), builder_.CreateString("test_model"),
+        builder_.CreateVector(buffers_));
+    tflite::FinishModelBuffer(builder_, result);
+  }
+
+  bool Verify() {
+    return tflite::Verify(builder_.GetBufferPointer(), builder_.GetSize(),
+                          resolver_, DefaultErrorReporter());
+  }
+
+ private:
+  FlatBufferBuilder builder_;
+  MutableOpResolver resolver_;
+  TfLiteRegistration fake_op_;
+  std::vector<Offset<Operator>> operators_;
+  std::vector<Offset<OperatorCode>> operator_codes_;
+  std::vector<Offset<Tensor>> tensors_;
+  std::vector<Offset<Buffer>> buffers_;
+};
+
+TEST(VerifyModel, TestEmptyModel) {
+  FlatBufferBuilder builder;
+  auto model = CreateModel(builder, /*version=*/TFLITE_SCHEMA_VERSION,
+                           /*operator_codes=*/0, /*subgraphs=*/0,
+                           /*description=*/0, /*buffers=*/0);
+  ::tflite::FinishModelBuffer(builder, model);
+
+  ASSERT_TRUE(Verify(builder.GetBufferPointer(), builder.GetSize(),
+                     MutableOpResolver{}, DefaultErrorReporter()));
+}
+
+TEST(VerifyModel, TestSimpleModel) {
+  TfLiteFlatbufferModelBuilder builder({}, {"test"});
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "test");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
+      "data");
+  builder.AddTensor({2, 3}, TensorType_INT32, {}, "output");
+  builder.FinishModel({0, 1}, {2});
+  ASSERT_TRUE(builder.Verify());
+}
+
+TEST(VerifyModel, TestCorruptedData) {
+  std::string model = "123";
+  ASSERT_FALSE(Verify(model.data(), model.size(), MutableOpResolver{},
+                      /*error_reporter=*/nullptr));
+}
+
+TEST(VerifyModel, TestUnsupportedVersion) {
+  FlatBufferBuilder builder;
+  auto model = CreateModel(builder, /*version=*/1, /*operator_codes=*/0,
+                           /*subgraphs=*/0, /*description=*/0, /*buffers=*/0);
+  ::tflite::FinishModelBuffer(builder, model);
+  ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(),
+                      MutableOpResolver{}, DefaultErrorReporter()));
+}
+
+TEST(VerifyModel, TestRandomModificationIsNotAllowed) {
+  FlatBufferBuilder builder;
+  auto model = CreateModel(builder, /*version=*/TFLITE_SCHEMA_VERSION,
+                           /*operator_codes=*/0,
+                           /*subgraphs=*/0, /*description=*/0, /*buffers=*/0);
+  ::tflite::FinishModelBuffer(builder, model);
+
+  std::string model_content(reinterpret_cast<char*>(builder.GetBufferPointer()),
+                            builder.GetSize());
+  for (int i = 0; i < model_content.size(); i++) {
+    model_content[i] = (model_content[i] + 137) % 255;
+    EXPECT_FALSE(Verify(model_content.data(), model_content.size(),
+                        MutableOpResolver{}, DefaultErrorReporter()))
+        << "Fail at position: " << i;
+  }
+}
+
+TEST(VerifyModel, TestIntTensorShapeIsGreaterThanBuffer) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, TestIntTensorShapeIsSmallerThanBuffer) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor({2, 1}, TensorType_UINT8, {1, 2, 3, 4}, "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, TestIntTensorShapeOverflow) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor({1024, 2048, 4096}, TensorType_UINT8, {1, 2, 3, 4},
+                    "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, TensorBufferIsNotValid) {
+  FlatBufferBuilder builder;
+  std::vector<int> shape = {2, 3};
+  auto tensors = builder.CreateVector(std::vector<Offset<Tensor>>{
+      CreateTensorDirect(builder, &shape, TensorType_INT32, /*buffer=*/2,
+                         "input", /*quantization=*/0)});
+  auto subgraph = std::vector<Offset<SubGraph>>(
+      {CreateSubGraph(builder, tensors, /*inputs=*/0, /*outputs=*/0,
+                      /*operators=*/0, builder.CreateString("Main"))});
+
+  auto buffers = builder.CreateVector(std::vector<Offset<Buffer>>{
+      CreateBuffer(builder,
+                   builder.CreateVector(std::vector<uint8>{1, 2, 3, 4, 5, 6})),
+  });
+
+  auto model = CreateModel(builder, TFLITE_SCHEMA_VERSION, /*operator_codes=*/0,
+                           builder.CreateVector(subgraph),
+                           builder.CreateString("SmartReply"), buffers);
+
+  ::tflite::FinishModelBuffer(builder, model);
+  ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(),
+                      MutableOpResolver{}, DefaultErrorReporter()));
+}
+
+TEST(VerifyModel, StringTensorHasInvalidNumString) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {0x00, 0x00, 0x00, 0x20, 16, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B'},
+      "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, StringTensorOffsetTooSmall) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 12, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B'}, "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, StringTensorOffsetOutOfRange) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 22, 0, 0, 0, 'A', 'B'}, "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, StringTensorIsLargerThanRequired) {
+  TfLiteFlatbufferModelBuilder builder;
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B', 'C'},
+      "input");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, AllOpsAreSupported) {
+  TfLiteFlatbufferModelBuilder builder({BuiltinOperator_ADD}, {"CustomOp"});
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {}, "output");
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_ADD, nullptr);
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "CustomOp");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, UseUnsupportedBuiltinOps) {
+  TfLiteFlatbufferModelBuilder builder({BuiltinOperator_SUB}, {"CustomOp"});
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {}, "output");
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_ADD, nullptr);
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+TEST(VerifyModel, UseUnsupportedCustomOps) {
+  TfLiteFlatbufferModelBuilder builder({BuiltinOperator_ADD}, {"NewOp"});
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {}, "output");
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "Not supported");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+}
+
+// TODO(yichengfan): make up malicious files to test with.
+
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/tools/visualize.py b/tensorflow/contrib/lite/tools/visualize.py
new file mode 100644
index 0000000000000000000000000000000000000000..f571dd59da0a3f4aff264b48fba3e41f75b50404
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/visualize.py
@@ -0,0 +1,391 @@
+#!/usr/bin/env python
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""This tool creates an html visualization of a TensorFlow Lite graph.
+
+Example usage:
+
+python visualize.py foo.tflite foo.html
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import os
+import sys
+
+# Schema to use for flatbuffers
+_SCHEMA = "third_party/tensorflow/contrib/lite/schema/schema.fbs"
+
+# Where the binary will be once built in for the flatc converter
+_BINARY = "third_party/flatbuffers/flatc"
+
+# A CSS description for making the visualizer
+_CSS = """
+<html>
+<head>
+<style>
+body {font-family: sans-serif; background-color: #ffaa00;}
+table {background-color: #eeccaa;}
+th {background-color: black; color: white;}
+h1 {
+  background-color: ffaa00;
+  padding:5px;
+  color: black;
+}
+
+div {
+  border-radius: 5px;
+  background-color: #ffeecc;
+  padding:5px;
+  margin:5px;
+}
+
+.tooltip {color: blue;}
+.tooltip .tooltipcontent  {
+    visibility: hidden;
+    color: black;
+    background-color: yellow;
+    padding: 5px;
+    border-radius: 4px;
+    position: absolute;
+    z-index: 1;
+}
+.tooltip:hover .tooltipcontent {
+    visibility: visible;
+}
+
+.edges line {
+  stroke: #333333;
+}
+
+.nodes text {
+  color: black;
+  pointer-events: none;
+  font-family: sans-serif;
+  font-size: 11px;
+}
+</style>
+
+<script src="https://d3js.org/d3.v4.min.js"></script>
+
+</head>
+<body>
+"""
+
+_D3_HTML_TEMPLATE = """
+  <script>
+    // Build graph data
+    var graph = %s;
+
+    var svg = d3.select("#subgraph%d");
+    var width = svg.attr("width");
+    var height = svg.attr("height");
+    var color = d3.scaleOrdinal(d3.schemeCategory20);
+
+    var simulation = d3.forceSimulation()
+        .force("link", d3.forceLink().id(function(d) {return d.id;}))
+        .force("charge", d3.forceManyBody())
+        .force("center", d3.forceCenter(0.5 * width, 0.5 * height));
+
+
+    function buildGraph() {
+      var edge = svg.append("g").attr("class", "edges").selectAll("line")
+        .data(graph.edges).enter().append("line")
+      // Make the node group
+      var node = svg.selectAll(".nodes")
+        .data(graph.nodes)
+        .enter().append("g")
+        .attr("class", "nodes")
+          .call(d3.drag()
+              .on("start", function(d) {
+                if(!d3.event.active) simulation.alphaTarget(1.0).restart();
+                d.fx = d.x;d.fy = d.y;
+              })
+              .on("drag", function(d) {
+                d.fx = d3.event.x; d.fy = d3.event.y;
+              })
+              .on("end", function(d) {
+                if (!d3.event.active) simulation.alphaTarget(0);
+                d.fx = d.fy = null;
+              }));
+      // Within the group, draw a circle for the node position and text
+      // on the side.
+      node.append("circle")
+          .attr("r", "5px")
+          .attr("fill", function(d) { return color(d.group); })
+      node.append("text")
+          .attr("dx", 8).attr("dy", 5).text(function(d) { return d.name; });
+      // Setup force parameters and update position callback
+      simulation.nodes(graph.nodes).on("tick", forceSimulationUpdated);
+      simulation.force("link").links(graph.edges);
+
+      function forceSimulationUpdated() {
+        // Update edges.
+        edge.attr("x1", function(d) {return d.source.x;})
+            .attr("y1", function(d) {return d.source.y;})
+            .attr("x2", function(d) {return d.target.x;})
+            .attr("y2", function(d) {return d.target.y;});
+        // Update node positions
+        node.attr("transform", function(d) { return "translate(" + d.x + "," + d.y + ")"; });
+      }
+    }
+  buildGraph()
+</script>
+"""
+
+
+class OpCodeMapper(object):
+  """Maps an opcode index to an op name."""
+
+  def __init__(self, data):
+    self.code_to_name = {}
+    for idx, d in enumerate(data["operator_codes"]):
+      self.code_to_name[idx] = d["builtin_code"]
+
+  def __call__(self, x):
+    if x not in self.code_to_name:
+      s = "<UNKNOWN>"
+    else:
+      s = self.code_to_name[x]
+    return "%s (opcode=%d)" % (s, x)
+
+
+class DataSizeMapper(object):
+  """For buffers, report the number of bytes."""
+
+  def __call__(self, x):
+    if x is not None:
+      return "%d bytes" % len(x)
+    else:
+      return "--"
+
+
+class TensorMapper(object):
+  """Maps a list of tensor indices to a tooltip hoverable indicator of more."""
+
+  def __init__(self, subgraph_data):
+    self.data = subgraph_data
+
+  def __call__(self, x):
+    html = ""
+    html += "<span class='tooltip'><span class='tooltipcontent'>"
+    for i in x:
+      tensor = self.data["tensors"][i]
+      html += str(i) + " "
+      html += tensor["name"] + " "
+      html += str(tensor["type"]) + " "
+      html += repr(tensor["shape"]) + "<br>"
+    html += "</span>"
+    html += repr(x)
+    html += "</span>"
+    return html
+
+
+def GenerateGraph(subgraph_idx, g, opcode_mapper):
+  """Produces the HTML required to have a d3 visualization of the dag."""
+
+  def TensorName(idx):
+    return "t%d" % idx
+
+  def OpName(idx):
+    return "o%d" % idx
+
+  edges = []
+  nodes = []
+  first = {}
+  pixel_mult = 50  # TODO(aselle): multiplier for initial placement
+  for op_index, op in enumerate(g["operators"]):
+    for tensor_input_position, tensor_index in enumerate(op["inputs"]):
+      if tensor_index not in first:
+        first[tensor_index] = (
+            op_index * pixel_mult,
+            tensor_input_position * pixel_mult - pixel_mult / 2)
+      edges.append({
+          "source": TensorName(tensor_index),
+          "target": OpName(op_index)
+      })
+    for tensor_index in op["outputs"]:
+      edges.append({
+          "target": TensorName(tensor_index),
+          "source": OpName(op_index)
+      })
+    nodes.append({
+        "id": OpName(op_index),
+        "name": opcode_mapper(op["opcode_index"]),
+        "group": 2,
+        "x": pixel_mult,
+        "y": op_index * pixel_mult
+    })
+  for tensor_index, tensor in enumerate(g["tensors"]):
+    initial_y = (
+        first[tensor_index] if tensor_index in first else len(g["operators"]))
+
+    nodes.append({
+        "id": TensorName(tensor_index),
+        "name": "%s (%d)" % (tensor["name"], tensor_index),
+        "group": 1,
+        "x": 2,
+        "y": initial_y
+    })
+  graph_str = json.dumps({"nodes": nodes, "edges": edges})
+
+  html = _D3_HTML_TEMPLATE % (graph_str, subgraph_idx)
+  return html
+
+
+def GenerateTableHtml(items, keys_to_print, display_index=True):
+  """Given a list of object values and keys to print, make an HTML table.
+
+  Args:
+    items: Items to print an array of dicts.
+    keys_to_print: (key, display_fn). `key` is a key in the object. i.e.
+      items[0][key] should exist. display_fn is the mapping function on display.
+      i.e. the displayed html cell will have the string returned by
+      `mapping_fn(items[0][key])`.
+    display_index: add a column which is the index of each row in `items`.
+  Returns:
+    An html table.
+  """
+  html = ""
+  # Print the list of  items
+  html += "<table><tr>\n"
+  html += "<tr>\n"
+  if display_index:
+    html += "<th>index</th>"
+  for h, mapper in keys_to_print:
+    html += "<th>%s</th>" % h
+  html += "</tr>\n"
+  for idx, tensor in enumerate(items):
+    html += "<tr>\n"
+    if display_index:
+      html += "<td>%d</td>" % idx
+    # print tensor.keys()
+    for h, mapper in keys_to_print:
+      val = tensor[h] if h in tensor else None
+      val = val if mapper is None else mapper(val)
+      html += "<td>%s</td>\n" % val
+
+    html += "</tr>\n"
+  html += "</table>\n"
+  return html
+
+
+def CreateHtmlFile(tflite_input, html_output):
+  """Given a tflite model in `tflite_input` file, produce html description."""
+
+  # Convert the model into a JSON flatbuffer using flatc (build if doesn't
+  # exist.
+  if not os.path.exists(tflite_input):
+    raise RuntimeError("Invalid filename %r" % tflite_input)
+  if tflite_input.endswith(".tflite") or tflite_input.endswith(".bin"):
+
+    # Run convert
+    cmd = (
+        _BINARY + " -t "
+        "--strict-json --defaults-json -o /tmp {schema} -- {input}".format(
+            input=tflite_input, schema=_SCHEMA))
+    print(cmd)
+    os.system(cmd)
+    real_output = ("/tmp/" + os.path.splitext(
+        os.path.split(tflite_input)[-1])[0] + ".json")
+
+    data = json.load(open(real_output))
+  elif tflite_input.endswith(".json"):
+    data = json.load(open(tflite_input))
+  else:
+    raise RuntimeError("Input file was not .tflite or .json")
+  html = ""
+  html += _CSS
+  html += "<h1>TensorFlow Lite Model</h2>"
+
+  data["filename"] = tflite_input  # Avoid special case
+  toplevel_stuff = [("filename", None), ("version", None), ("description",
+                                                            None)]
+
+  html += "<table>\n"
+  for key, mapping in toplevel_stuff:
+    if not mapping:
+      mapping = lambda x: x
+    html += "<tr><th>%s</th><td>%s</td></tr>\n" % (key, mapping(data[key]))
+  html += "</table>\n"
+
+  # Spec on what keys to display
+  buffer_keys_to_display = [("data", DataSizeMapper())]
+  operator_keys_to_display = [("builtin_code", None)]
+
+  for subgraph_idx, g in enumerate(data["subgraphs"]):
+    # Subgraph local specs on what to display
+    html += "<div class='subgraph'>"
+    tensor_mapper = TensorMapper(g)
+    opcode_mapper = OpCodeMapper(data)
+    op_keys_to_display = [("inputs", tensor_mapper), ("outputs", tensor_mapper),
+                          ("builtin_options", None), ("opcode_index",
+                                                      opcode_mapper)]
+    tensor_keys_to_display = [("name", None), ("type", None), ("shape", None),
+                              ("buffer", None), ("quantization", None)]
+
+    html += "<h2>Subgraph %d</h2>\n" % subgraph_idx
+
+    # Inputs and outputs.
+    html += "<h3>Inputs/Outputs</h3>\n"
+    html += GenerateTableHtml(
+        [{
+            "inputs": g["inputs"],
+            "outputs": g["outputs"]
+        }], [("inputs", tensor_mapper), ("outputs", tensor_mapper)],
+        display_index=False)
+
+    # Print the tensors.
+    html += "<h3>Tensors</h3>\n"
+    html += GenerateTableHtml(g["tensors"], tensor_keys_to_display)
+
+    # Print the ops.
+    html += "<h3>Ops</h3>\n"
+    html += GenerateTableHtml(g["operators"], op_keys_to_display)
+
+    # Visual graph.
+    html += "<svg id='subgraph%d' width='960' height='1600'></svg>\n" % (
+        subgraph_idx,)
+    html += GenerateGraph(subgraph_idx, g, opcode_mapper)
+    html += "</div>"
+
+  # Buffers have no data, but maybe in the future they will
+  html += "<h2>Buffers</h2>\n"
+  html += GenerateTableHtml(data["buffers"], buffer_keys_to_display)
+
+  # Operator codes
+  html += "<h2>Operator Codes</h2>\n"
+  html += GenerateTableHtml(data["operator_codes"], operator_keys_to_display)
+
+  html += "</body></html>\n"
+
+  open(html_output, "w").write(html)
+
+
+def main(argv):
+  try:
+    tflite_input = argv[1]
+    html_output = argv[2]
+  except IndexError:
+    print("Usage: %s <input tflite> <output html>" % (argv[0]))
+  else:
+    CreateHtmlFile(tflite_input, html_output)
+
+
+if __name__ == "__main__":
+  main(sys.argv)
diff --git a/tensorflow/contrib/lite/version.h b/tensorflow/contrib/lite/version.h
index a751afabe7460f0c9e88385faf1497b2c0a25d6b..efd63f4006ae661c6fdbbaa81cb02fa8947271f3 100644
--- a/tensorflow/contrib/lite/version.h
+++ b/tensorflow/contrib/lite/version.h
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_VERSION_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_VERSION_H_
+#ifndef TENSORFLOW_CONTRIB_LITE_VERSION_H_
+#define TENSORFLOW_CONTRIB_LITE_VERSION_H_
 
 // The version number of the Schema. Ideally all changes will be backward
 // compatible. If that ever changes, we must ensure that version is the first
 // entry in the new tflite root so that we can see that version is not 1.
 #define TFLITE_SCHEMA_VERSION (3)
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_VERSION_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_VERSION_H_
diff --git a/tensorflow/contrib/lookup/BUILD b/tensorflow/contrib/lookup/BUILD
index b7b5418fe91e496f021b44fc32a33d2a549782e5..8ca03f4193f260ce32f942ccaf76a8260b282156 100644
--- a/tensorflow/contrib/lookup/BUILD
+++ b/tensorflow/contrib/lookup/BUILD
@@ -7,7 +7,7 @@ exports_files(["LICENSE"])
 
 package(default_visibility = ["//tensorflow:internal"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
 # TODO(yleon): Refactor after one we switching to the V2 kernels.
 py_library(
@@ -26,13 +26,14 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "lookup_ops_test",
     size = "small",
     srcs = ["lookup_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":lookup_py",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
@@ -43,9 +44,8 @@ py_test(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
     ],
+    grpc_enabled = True,
 )
 
 filegroup(
diff --git a/tensorflow/contrib/lookup/lookup_ops.py b/tensorflow/contrib/lookup/lookup_ops.py
index 66caa6a2e5d17f74706965b7ca3f7928d63ae130..a430dac4ec43ce31f0b5aaae5e7b0b51d25c9632 100644
--- a/tensorflow/contrib/lookup/lookup_ops.py
+++ b/tensorflow/contrib/lookup/lookup_ops.py
@@ -399,7 +399,7 @@ class MutableHashTable(LookupInterface):
     Raises:
       TypeError: when `keys` do not match the table data types.
     """
-    if keys.dtype != self._key_dtype:
+    if keys.dtype.base_dtype != self._key_dtype:
       raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
                       (self._key_dtype, keys.dtype))
 
@@ -600,7 +600,7 @@ class MutableDenseHashTable(LookupInterface):
     Raises:
       TypeError: when `keys` do not match the table data types.
     """
-    if keys.dtype != self._key_dtype:
+    if keys.dtype.base_dtype != self._key_dtype:
       raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
                       (self._key_dtype, keys.dtype))
 
diff --git a/tensorflow/contrib/lookup/lookup_ops_test.py b/tensorflow/contrib/lookup/lookup_ops_test.py
index f0499010d476f68da541ee67d085b12e48faeaf5..f681b7b132750ef80aa56f25143418fbc4eaa1bb 100644
--- a/tensorflow/contrib/lookup/lookup_ops_test.py
+++ b/tensorflow/contrib/lookup/lookup_ops_test.py
@@ -187,6 +187,11 @@ class HashTableOpTest(test.TestCase):
           lookup.KeyValueTensorInitializer(keys, values), default_val)
       table.init.run()
 
+      # Ref types do not produce a lookup signature mismatch.
+      input_string_ref = variables.Variable("brain")
+      variables.global_variables_initializer().run()
+      self.assertEqual(0, table.lookup(input_string_ref).eval())
+
       input_string = constant_op.constant([1, 2, 3], dtypes.int64)
       with self.assertRaises(TypeError):
         table.lookup(input_string)
@@ -629,6 +634,17 @@ class MutableHashTableOpTest(test.TestCase):
       table.insert(keys, values).run()
       self.assertAllEqual(3, table.size().eval())
 
+      input_string_ref = variables.Variable("brain")
+      input_int64_ref = variables.Variable(-1, dtype=dtypes.int64)
+      variables.global_variables_initializer().run()
+
+      # Ref types do not produce an insert signature mismatch.
+      table.insert(input_string_ref, input_int64_ref).run()
+      self.assertAllEqual(3, table.size().eval())
+
+      # Ref types do not produce a lookup signature mismatch.
+      self.assertEqual(-1, table.lookup(input_string_ref).eval())
+
       # lookup with keys of the wrong type
       input_string = constant_op.constant([1, 2, 3], dtypes.int64)
       with self.assertRaises(TypeError):
@@ -1640,23 +1656,22 @@ class InitializeTableFromFileOpTest(test.TestCase):
       f.write("\n".join(values) + "\n")
     return vocabulary_file
 
+  @test_util.run_in_graph_and_eager_modes()
   def testInitializeStringTable(self):
     vocabulary_file = self._createVocabFile("one_column_1.txt")
+    default_value = -1
+    table = lookup.HashTable(
+        lookup.TextFileInitializer(vocabulary_file, dtypes.string,
+                                   lookup.TextFileIndex.WHOLE_LINE,
+                                   dtypes.int64,
+                                   lookup.TextFileIndex.LINE_NUMBER),
+        default_value)
+    self.evaluate(table.init)
 
-    with self.test_session():
-      default_value = -1
-      table = lookup.HashTable(
-          lookup.TextFileInitializer(vocabulary_file, dtypes.string,
-                                     lookup.TextFileIndex.WHOLE_LINE,
-                                     dtypes.int64,
-                                     lookup.TextFileIndex.LINE_NUMBER),
-          default_value)
-      table.init.run()
-
-      output = table.lookup(constant_op.constant(["brain", "salad", "tank"]))
+    output = table.lookup(constant_op.constant(["brain", "salad", "tank"]))
 
-      result = output.eval()
-      self.assertAllEqual([0, 1, -1], result)
+    result = self.evaluate(output)
+    self.assertAllEqual([0, 1, -1], result)
 
   def testInitializeInt64Table(self):
     vocabulary_file = self._createVocabFile(
diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py
index 7c523ad49265aaf32c8d5a8ae04d3e93262a1b55..8c3a8afe7a0f6f5ad9ceae566288ba60be73d339 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops.py
@@ -30,20 +30,13 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.deprecation import deprecated_args
 
-__all__ = ["absolute_difference",
-           "add_loss",
-           "cosine_distance",
-           "compute_weighted_loss",
-           "get_losses",
-           "get_regularization_losses",
-           "get_total_loss",
-           "hinge_loss",
-           "log_loss",
-           "mean_pairwise_squared_error",
-           "mean_squared_error",
-           "sigmoid_cross_entropy",
-           "softmax_cross_entropy",
-           "sparse_softmax_cross_entropy"]
+__all__ = [
+    "absolute_difference", "add_loss", "cosine_distance",
+    "compute_weighted_loss", "get_losses", "get_regularization_losses",
+    "get_total_loss", "hinge_loss", "log_loss", "mean_pairwise_squared_error",
+    "mean_squared_error", "sigmoid_cross_entropy", "softmax_cross_entropy",
+    "sparse_softmax_cross_entropy"
+]
 
 
 def _scale_losses(losses, weights):
@@ -66,8 +59,8 @@ def _scale_losses(losses, weights):
   # First, compute the sum of the losses over all elements:
   start_index = max(0, weights.get_shape().ndims)
   reduction_indices = list(range(start_index, losses.get_shape().ndims))
-  reduced_losses = math_ops.reduce_sum(losses,
-                                       reduction_indices=reduction_indices)
+  reduced_losses = math_ops.reduce_sum(
+      losses, reduction_indices=reduction_indices)
   reduced_losses = math_ops.multiply(reduced_losses, weights)
   return math_ops.reduce_sum(reduced_losses)
 
@@ -90,9 +83,10 @@ def _safe_div(numerator, denominator, name="value"):
   """
   return array_ops.where(
       math_ops.greater(denominator, 0),
-      math_ops.div(numerator, array_ops.where(
-          math_ops.equal(denominator, 0),
-          array_ops.ones_like(denominator), denominator)),
+      math_ops.div(numerator,
+                   array_ops.where(
+                       math_ops.equal(denominator, 0),
+                       array_ops.ones_like(denominator), denominator)),
       array_ops.zeros_like(numerator),
       name=name)
 
@@ -176,14 +170,15 @@ def _num_present(losses, weights, per_batch=False):
   """
   # If weights is a scalar, its easy to compute:
   if weights.get_shape().ndims == 0:
-    batch_size = array_ops.reshape(array_ops.slice(array_ops.shape(losses),
-                                                   [0], [1]), [])
-    num_per_batch = math_ops.div(math_ops.to_float(array_ops.size(losses)),
-                                 math_ops.to_float(batch_size))
-    num_per_batch = array_ops.where(math_ops.equal(weights, 0),
-                                    0.0, num_per_batch)
-    num_per_batch = math_ops.multiply(array_ops.ones(
-        array_ops.reshape(batch_size, [1])), num_per_batch)
+    batch_size = array_ops.reshape(
+        array_ops.slice(array_ops.shape(losses), [0], [1]), [])
+    num_per_batch = math_ops.div(
+        math_ops.to_float(array_ops.size(losses)),
+        math_ops.to_float(batch_size))
+    num_per_batch = array_ops.where(
+        math_ops.equal(weights, 0), 0.0, num_per_batch)
+    num_per_batch = math_ops.multiply(
+        array_ops.ones(array_ops.reshape(batch_size, [1])), num_per_batch)
     return num_per_batch if per_batch else math_ops.reduce_sum(num_per_batch)
 
   # First, count the number of nonzero weights:
@@ -194,8 +189,8 @@ def _num_present(losses, weights, per_batch=False):
         reduction_indices=reduction_indices)
 
   # Next, determine the number of elements that weights would broadcast to:
-  broadcast_dims = array_ops.slice(array_ops.shape(losses),
-                                   [weights.get_shape().ndims], [-1])
+  broadcast_dims = array_ops.slice(
+      array_ops.shape(losses), [weights.get_shape().ndims], [-1])
   num_to_broadcast = math_ops.to_float(math_ops.reduce_prod(broadcast_dims))
 
   num_per_batch = math_ops.multiply(num_nonzero_per_batch, num_to_broadcast)
@@ -303,8 +298,11 @@ def absolute_difference(predictions, labels=None, weights=1.0, scope=None):
 @deprecated("2016-12-30",
             "Use tf.losses.sigmoid_cross_entropy instead. Note that the order "
             "of the predictions and labels arguments has been changed.")
-def sigmoid_cross_entropy(
-    logits, multi_class_labels, weights=1.0, label_smoothing=0, scope=None):
+def sigmoid_cross_entropy(logits,
+                          multi_class_labels,
+                          weights=1.0,
+                          label_smoothing=0,
+                          scope=None):
   """Creates a cross-entropy loss using tf.nn.sigmoid_cross_entropy_with_logits.
 
   `weights` acts as a coefficient for the loss. If a scalar is provided,
@@ -340,20 +338,22 @@ def sigmoid_cross_entropy(
     multi_class_labels = math_ops.cast(multi_class_labels, logits.dtype)
 
     if label_smoothing > 0:
-      multi_class_labels = (multi_class_labels * (1 - label_smoothing) +
-                            0.5 * label_smoothing)
+      multi_class_labels = (
+          multi_class_labels * (1 - label_smoothing) + 0.5 * label_smoothing)
 
-    losses = nn.sigmoid_cross_entropy_with_logits(labels=multi_class_labels,
-                                                  logits=logits,
-                                                  name="xentropy")
+    losses = nn.sigmoid_cross_entropy_with_logits(
+        labels=multi_class_labels, logits=logits, name="xentropy")
     return compute_weighted_loss(losses, weights, scope=scope)
 
 
 @deprecated("2016-12-30",
             "Use tf.losses.softmax_cross_entropy instead. Note that the order "
             "of the logits and labels arguments has been changed.")
-def softmax_cross_entropy(
-    logits, onehot_labels, weights=1.0, label_smoothing=0, scope=None):
+def softmax_cross_entropy(logits,
+                          onehot_labels,
+                          weights=1.0,
+                          label_smoothing=0,
+                          scope=None):
   """Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits.
 
   `weights` acts as a coefficient for the loss. If a scalar is provided,
@@ -393,9 +393,8 @@ def softmax_cross_entropy(
       smooth_negatives = label_smoothing / num_classes
       onehot_labels = onehot_labels * smooth_positives + smooth_negatives
 
-    losses = nn.softmax_cross_entropy_with_logits(labels=onehot_labels,
-                                                  logits=logits,
-                                                  name="xentropy")
+    losses = nn.softmax_cross_entropy_with_logits(
+        labels=onehot_labels, logits=logits, name="xentropy")
     return compute_weighted_loss(losses, weights, scope=scope)
 
 
@@ -429,9 +428,8 @@ def sparse_softmax_cross_entropy(logits, labels, weights=1.0, scope=None):
                       [logits, labels, weights]) as scope:
     labels = array_ops.reshape(labels, shape=[array_ops.shape(labels)[0]])
 
-    losses = nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
-                                                         logits=logits,
-                                                         name="xentropy")
+    losses = nn.sparse_softmax_cross_entropy_with_logits(
+        labels=labels, logits=logits, name="xentropy")
     return compute_weighted_loss(losses, weights, scope=scope)
 
 
@@ -470,8 +468,7 @@ def log_loss(predictions, labels=None, weights=1.0, epsilon=1e-7, scope=None):
     predictions = math_ops.to_float(predictions)
     labels = math_ops.to_float(labels)
     losses = -math_ops.multiply(
-        labels,
-        math_ops.log(predictions + epsilon)) - math_ops.multiply(
+        labels, math_ops.log(predictions + epsilon)) - math_ops.multiply(
             (1 - labels), math_ops.log(1 - predictions + epsilon))
     return compute_weighted_loss(losses, weights, scope=scope)
 
@@ -490,7 +487,8 @@ def hinge_loss(logits, labels=None, scope=None):
     scope: The scope for the operations performed in computing the loss.
 
   Returns:
-    An unweighted `Tensor` of same shape as `logits` and `labels` representing the
+    An unweighted `Tensor` of same shape as `logits` and `labels` representing
+    the
       loss values across the batch.
 
   Raises:
@@ -544,8 +542,10 @@ def mean_squared_error(predictions, labels=None, weights=1.0, scope=None):
 @deprecated("2016-12-30",
             "Use tf.losses.mean_pairwise_squared_error instead. Note that the "
             "order of the predictions and labels arguments has been changed.")
-def mean_pairwise_squared_error(
-    predictions, labels=None, weights=1.0, scope=None):
+def mean_pairwise_squared_error(predictions,
+                                labels=None,
+                                weights=1.0,
+                                scope=None):
   """Adds a pairwise-errors-squared loss to the training procedure.
 
   Unlike `mean_squared_error`, which is a measure of the differences between
@@ -602,31 +602,34 @@ def mean_pairwise_squared_error(
     reduction_indices = list(range(1, diffs.get_shape().ndims))
 
     sum_squares_diff_per_batch = math_ops.reduce_sum(
-        math_ops.square(diffs),
-        reduction_indices=reduction_indices)
+        math_ops.square(diffs), reduction_indices=reduction_indices)
     num_present_per_batch = _num_present(diffs, weights, per_batch=True)
 
-    term1 = 2.0 * _safe_div(sum_squares_diff_per_batch,
-                            num_present_per_batch)
+    term1 = 2.0 * _safe_div(sum_squares_diff_per_batch, num_present_per_batch)
 
     sum_diff = math_ops.reduce_sum(diffs, reduction_indices=reduction_indices)
-    term2 = 2.0 * _safe_div(math_ops.square(sum_diff),
-                            math_ops.square(num_present_per_batch))
+    term2 = 2.0 * _safe_div(
+        math_ops.square(sum_diff), math_ops.square(num_present_per_batch))
 
     loss = _scale_losses(term1 - term2, weights)
 
-    mean_loss = array_ops.where(math_ops.reduce_sum(num_present_per_batch) > 0,
-                                loss,
-                                array_ops.zeros_like(loss),
-                                name="value")
+    mean_loss = array_ops.where(
+        math_ops.reduce_sum(num_present_per_batch) > 0,
+        loss,
+        array_ops.zeros_like(loss),
+        name="value")
     add_loss(mean_loss)
     return mean_loss
 
 
 @deprecated("2016-12-30", "Use tf.losses.cosine_distance instead.")
 @deprecated_args(None, "dim is deprecated, use axis instead", "dim")
-def cosine_distance(
-    predictions, labels=None, axis=None, weights=1.0, scope=None, dim=None):
+def cosine_distance(predictions,
+                    labels=None,
+                    axis=None,
+                    weights=1.0,
+                    scope=None,
+                    dim=None):
   """Adds a cosine-distance loss to the training procedure.
 
   Note that the function assumes that `predictions` and `labels` are already
@@ -662,5 +665,8 @@ def cosine_distance(
     labels = math_ops.to_float(labels)
 
     radial_diffs = math_ops.multiply(predictions, labels)
-    losses = 1 - math_ops.reduce_sum(radial_diffs, reduction_indices=[axis,])
+    losses = 1 - math_ops.reduce_sum(
+        radial_diffs, reduction_indices=[
+            axis,
+        ])
     return compute_weighted_loss(losses, weights, scope=scope)
diff --git a/tensorflow/contrib/losses/python/losses/loss_ops_test.py b/tensorflow/contrib/losses/python/losses/loss_ops_test.py
index 9d0f95e6f3e7fa9666a99e31578b38d52e0b6b4a..1417772e0496cb571488e5b30bd4f3fb1b591730 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops_test.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -274,6 +275,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertAlmostEqual(np.average(weights) * 10.0, loss, 3)
 
 
+@test_util.with_c_api
 class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
 
   def testNoneWeightRaisesValueError(self):
@@ -471,7 +473,11 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       labels = constant_op.constant([[0, 1], [2, 3]])
       weights = constant_op.constant([1.2, 3.4, 5.6, 7.8])
 
-      with self.assertRaises(errors_impl.InvalidArgumentError):
+      if ops._USE_C_API:
+        error_type = ValueError
+      else:
+        error_type = errors_impl.InvalidArgumentError
+      with self.assertRaises(error_type):
         loss_ops.sparse_softmax_cross_entropy(
             logits, labels, weights=weights).eval()
 
diff --git a/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py b/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
index c3a57ba51bcf0a292490dfaa9e556f6e5811ed66..6842bc38eb108b46cc3eff715c9cbc74f991308b 100644
--- a/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
+++ b/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
@@ -53,12 +53,12 @@ def pairwise_distance(feature, squared=False):
       math_ops.reduce_sum(
           math_ops.square(feature),
           axis=[1],
-          keep_dims=True),
+          keepdims=True),
       math_ops.reduce_sum(
           math_ops.square(
               array_ops.transpose(feature)),
           axis=[0],
-          keep_dims=True)) - 2.0 * math_ops.matmul(
+          keepdims=True)) - 2.0 * math_ops.matmul(
               feature, array_ops.transpose(feature))
 
   # Deal with numerical inaccuracies. Set small negatives to zero.
@@ -132,10 +132,10 @@ def masked_maximum(data, mask, dim=1):
     masked_maximums: N-D `Tensor`.
       The maximized dimension is of size 1 after the operation.
   """
-  axis_minimums = math_ops.reduce_min(data, dim, keep_dims=True)
+  axis_minimums = math_ops.reduce_min(data, dim, keepdims=True)
   masked_maximums = math_ops.reduce_max(
       math_ops.multiply(
-          data - axis_minimums, mask), dim, keep_dims=True) + axis_minimums
+          data - axis_minimums, mask), dim, keepdims=True) + axis_minimums
   return masked_maximums
 
 
@@ -151,10 +151,10 @@ def masked_minimum(data, mask, dim=1):
     masked_minimums: N-D `Tensor`.
       The minimized dimension is of size 1 after the operation.
   """
-  axis_maximums = math_ops.reduce_max(data, dim, keep_dims=True)
+  axis_maximums = math_ops.reduce_max(data, dim, keepdims=True)
   masked_minimums = math_ops.reduce_min(
       math_ops.multiply(
-          data - axis_maximums, mask), dim, keep_dims=True) + axis_maximums
+          data - axis_maximums, mask), dim, keepdims=True) + axis_maximums
   return masked_minimums
 
 
@@ -203,7 +203,7 @@ def triplet_semihard_loss(labels, embeddings, margin=1.0):
       math_ops.greater(
           math_ops.reduce_sum(
               math_ops.cast(
-                  mask, dtype=dtypes.float32), 1, keep_dims=True),
+                  mask, dtype=dtypes.float32), 1, keepdims=True),
           0.0), [batch_size, batch_size])
   mask_final = array_ops.transpose(mask_final)
 
@@ -290,7 +290,7 @@ def npairs_loss(labels, embeddings_anchor, embeddings_positive,
 
   labels_remapped = math_ops.to_float(
       math_ops.equal(labels, array_ops.transpose(labels)))
-  labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keep_dims=True)
+  labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keepdims=True)
 
   # Add the softmax loss.
   xent_loss = nn.softmax_cross_entropy_with_logits(
@@ -395,7 +395,7 @@ def npairs_loss_multilabel(sparse_labels, embeddings_anchor,
 
     multilabel_adjacency_matrix = _build_multilabel_adjacency(sparse_labels)
     labels_remapped = math_ops.to_float(multilabel_adjacency_matrix)
-    labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keep_dims=True)
+    labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keepdims=True)
 
     # Add the softmax loss.
     xent_loss = nn.softmax_cross_entropy_with_logits(
@@ -448,10 +448,10 @@ def lifted_struct_loss(labels, embeddings, margin=1.0):
   # Safe maximum: Temporarily shift negative distances
   #   above zero before taking max.
   #     this is to take the max only among negatives.
-  row_minimums = math_ops.reduce_min(diff, 1, keep_dims=True)
+  row_minimums = math_ops.reduce_min(diff, 1, keepdims=True)
   row_negative_maximums = math_ops.reduce_max(
       math_ops.multiply(
-          diff - row_minimums, mask), 1, keep_dims=True) + row_minimums
+          diff - row_minimums, mask), 1, keepdims=True) + row_minimums
 
   # Compute the loss.
   # Keep track of matrix of maximums where M_ij = max(m_i, m_j)
@@ -470,7 +470,7 @@ def lifted_struct_loss(labels, embeddings, margin=1.0):
       math_ops.reduce_sum(math_ops.multiply(
           math_ops.exp(
               diff_tiled - max_elements_vect),
-          mask_tiled), 1, keep_dims=True), [batch_size, batch_size])
+          mask_tiled), 1, keepdims=True), [batch_size, batch_size])
 
   loss_mat = max_elements + math_ops.log(
       loss_exp_left + array_ops.transpose(loss_exp_left))
@@ -686,7 +686,7 @@ def _find_loss_augmented_facility_idx(pairwise_distances, labels, chosen_ids,
                   array_ops.reshape(pairwise_distances_candidate, [1, -1])
               ], 0),
               axis=0,
-              keep_dims=True), [num_candidates, -1]),
+              keepdims=True), [num_candidates, -1]),
       axis=1)
 
   nmi_scores = array_ops.zeros([num_candidates])
diff --git a/tensorflow/contrib/makefile/BUILD b/tensorflow/contrib/makefile/BUILD
index a8dd59f32a7f3b27993a7ee48ee7cc07ada59a4c..701eeb44fe3f814cb3fb1cedd8618753946cc3e5 100644
--- a/tensorflow/contrib/makefile/BUILD
+++ b/tensorflow/contrib/makefile/BUILD
@@ -12,20 +12,3 @@ filegroup(
     ),
     visibility = ["//tensorflow:__subpackages__"],
 )
-
-sh_test(
-    name = "build_all_linux",
-    size = "enormous",
-    srcs = ["build_all_linux.sh"],
-    data = [
-        "//tensorflow:all_opensource_files",
-        "//third_party/eigen3:all_files",
-        "//third_party/fft2d:all_files",
-    ],
-    tags = [
-        "manual",
-        "no_gpu",
-        "no_oss",
-        "notap",
-    ],
-)
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index 617ef25fa4b9da64bdb155c3f30874dc97784166..81327407d44b4317b7aecb964a689a35aa35c163 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -89,7 +89,6 @@ HOST_INCLUDES := \
 -I$(MAKEFILE_DIR)/downloads/gemmlowp \
 -I$(MAKEFILE_DIR)/downloads/nsync/public \
 -I$(MAKEFILE_DIR)/downloads/fft2d \
--I$(MAKEFILE_DIR)/downloads/double_conversion \
 -I$(HOST_GENDIR)
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
 	HOST_INCLUDES += -I$(MAKEFILE_DIR)/gen/protobuf-host/include
@@ -126,9 +125,7 @@ PROTO_TEXT := $(HOST_BINDIR)proto_text
 # The list of dependencies is derived from the Bazel build file by running
 # the gen_file_lists.sh script on a system with a working Bazel setup.
 PROTO_TEXT_CC_FILES := $(shell cat $(MAKEFILE_DIR)/proto_text_cc_files.txt)
-PROTO_TEXT_PB_CC_LIST := \
-	$(shell cat $(MAKEFILE_DIR)/proto_text_pb_cc_files.txt) \
-	$(wildcard tensorflow/contrib/makefile/downloads/double_conversion/double-conversion/*.cc)
+PROTO_TEXT_PB_CC_LIST := $(shell cat $(MAKEFILE_DIR)/proto_text_pb_cc_files.txt)
 PROTO_TEXT_PB_H_LIST := $(shell cat $(MAKEFILE_DIR)/proto_text_pb_h_files.txt)
 
 # Locations of the intermediate files proto_text generates.
@@ -174,7 +171,6 @@ INCLUDES := \
 -I$(MAKEFILE_DIR)/downloads/gemmlowp \
 -I$(MAKEFILE_DIR)/downloads/nsync/public \
 -I$(MAKEFILE_DIR)/downloads/fft2d \
--I$(MAKEFILE_DIR)/downloads/double_conversion \
 -I$(PROTOGENDIR) \
 -I$(PBTGENDIR)
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
@@ -304,7 +300,7 @@ ifeq ($(TARGET),ANDROID)
 	ifeq ($(ANDROID_ARCH),x86_64)
 		TOOLCHAIN := x86_64-4.9
 		SYSROOT_ARCH := x86_64
-		BIN_PREFIX := x86-64-linux-android
+		BIN_PREFIX := x86_64-linux-android
 		MARCH_OPTION :=
 	endif
     
@@ -330,8 +326,6 @@ $(MARCH_OPTION) \
 -I$(MAKEFILE_DIR)/downloads/gemmlowp \
 -I$(MAKEFILE_DIR)/downloads/nsync/public \
 -I$(MAKEFILE_DIR)/downloads/fft2d \
--I$(MAKEFILE_DIR)/downloads/double_conversion \
--I$(MAKEFILE_DIR)/gen/protobuf/include \
 -I$(MAKEFILE_DIR)/gen/protobuf_android/$(ANDROID_ARCH)/include \
 -I$(PROTOGENDIR) \
 -I$(PBTGENDIR)
@@ -380,12 +374,72 @@ $(MARCH_OPTION) \
 	ifdef ENABLE_EXPERIMENTAL_HEXNN_OPS
 		CXXFLAGS += -DENABLE_EXPERIMENTAL_HEXNN_OPS
 	endif
-	
-	OBJDIR := $(OBJDIR)android_$(ANDROID_ARCH)/
-	LIBDIR := $(LIBDIR)android_$(ANDROID_ARCH)/
-	BINDIR := $(BINDIR)android_$(ANDROID_ARCH)/
-	DEPDIR := $(DEPDIR)android_$(ANDROID_ARCH)/
 
+	ifeq ($(BUILD_FOR_TEGRA),1)
+		NVCC := $(JETPACK)/cuda/bin/nvcc
+		NVCCFLAGS := -x=cu -D__CUDACC__ -DNVCC -DANDROID_TEGRA -ccbin $(NDK_ROOT)/toolchains/$(TOOLCHAIN)/prebuilt/$(ANDROID_HOST_OS_ARCH)/bin/$(BIN_PREFIX)-g++ --std c++11 --expt-relaxed-constexpr -m64 -gencode arch=compute_53,\"code=sm_53\" -gencode arch=compute_62,\"code=sm_62\" -DEIGEN_AVOID_STL_ARRAY -DTENSORFLOW_USE_EIGEN_THREADPOOL -DLANG_CXX11 -DEIGEN_HAS_C99_MATH -DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=5.3
+		CXXFLAGS4NVCC =\
+-DIS_SLIM_BUILD \
+-DANDROID_TEGRA \
+-fno-exceptions \
+-DNDEBUG $(OPTFLAGS) \
+-march=armv8-a \
+-fPIE \
+-D__ANDROID_TYPES_FULL__ \
+--sysroot $(NDK_ROOT)/platforms/android-21/arch-arm64
+
+		CXXFLAGS +=\
+-DGOOGLE_CUDA=1 \
+-D__ANDROID_TYPES_FULL__ \
+-DANDROID_TEGRA \
+-DEIGEN_AVOID_STL_ARRAY \
+-DEIGEN_HAS_C99_MATH \
+-DLANG_CXX11 -DTENSORFLOW_USE_EIGEN_THREADPOOL -DTF_EXTRA_CUDA_CAPABILITIES=5.3
+
+		INCLUDES += \
+-Itensorflow/core/kernels \
+-I$(MAKEFILE_DIR)/downloads/cub \
+-I$(MAKEFILE_DIR)/downloads/cub/cub_archive/cub/device \
+-Ithird_party/toolchains/gpus/cuda \
+-I$(JETPACK)/cuda/include \
+-I$(JETPACK) \
+-I$(JETPACK)/cuDNN/aarch64 \
+-I$(JETPACK)/cuda/extras/CUPTI/include
+
+
+		CUDA_LIBS := \
+-ltfcuda \
+-lcudart_static \
+-lcudnn \
+-lcublas_static \
+-lcufftw_static \
+-lcusolver_static \
+-lcusparse_static \
+-lcufft \
+-lcuda \
+-lculibos \
+-lcurand_static
+
+		OBJDIR := $(OBJDIR)android_arm64-v8a/
+		LIBDIR := $(LIBDIR)android_arm64-v8a/
+		BINDIR := $(BINDIR)android_arm64-v8a/
+		DEPDIR := $(DEPDIR)android_arm64-v8a/
+
+		TEGRA_LIBS := \
+-L$(JETPACK)/cuda/targets/aarch64-linux-androideabi/lib \
+-L$(JETPACK)/cuda/targets/aarch64-linux-androideabi/lib/stubs \
+-L$(JETPACK)/cuda/targets/aarch64-linux-androideabi/lib64 \
+-L$(JETPACK)/cuda/targets/aarch64-linux-androideabi/lib64/stubs \
+-L$(JETPACK)/cuDNN/aarch64/cuda/lib64 \
+-L$(LIBDIR)
+
+		CUDA_LIB_DEPS := $(LIBDIR)libtfcuda.a
+	else
+		OBJDIR := $(OBJDIR)android_$(ANDROID_ARCH)/
+		LIBDIR := $(LIBDIR)android_$(ANDROID_ARCH)/
+		BINDIR := $(BINDIR)android_$(ANDROID_ARCH)/
+		DEPDIR := $(DEPDIR)android_$(ANDROID_ARCH)/
+	endif # ifeq ($(BUILD_FOR_TEGRA),1)
 endif  # ANDROID
 # LINT.ThenChange(//tensorflow/contrib/android/cmake/CMakeLists.txt)
 
@@ -549,11 +603,11 @@ $(wildcard tensorflow/core/platform/*/*.cc) \
 $(wildcard tensorflow/core/platform/*/*/*.cc) \
 $(wildcard tensorflow/core/util/*.cc) \
 $(wildcard tensorflow/core/util/*/*.cc) \
-$(wildcard tensorflow/contrib/makefile/downloads/double_conversion/double-conversion/*.cc) \
 tensorflow/core/util/version_info.cc
 # Remove duplicates (for version_info.cc)
 CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS))
-CORE_CC_EXCLUDE_SRCS := \
+
+CORE_CC_EXCLUDE_SRCS_NON_GPU := \
 $(wildcard tensorflow/core/*/*test.cc) \
 $(wildcard tensorflow/core/*/*testutil*) \
 $(wildcard tensorflow/core/*/*testlib*) \
@@ -573,25 +627,52 @@ $(wildcard tensorflow/core/lib/jpeg/*) \
 $(wildcard tensorflow/core/lib/png/*) \
 $(wildcard tensorflow/core/util/events_writer.*) \
 $(wildcard tensorflow/core/util/reporter.*) \
-$(wildcard tensorflow/core/platform/default/cuda_libdevice_path.*) \
-$(wildcard tensorflow/core/platform/default/stream_executor.*) \
 $(wildcard tensorflow/core/platform/default/test_benchmark.*) \
-$(wildcard tensorflow/core/platform/cuda.h) \
-$(wildcard tensorflow/core/platform/cuda_libdevice_path.*) \
 $(wildcard tensorflow/core/platform/cloud/*) \
 $(wildcard tensorflow/core/platform/google/*) \
 $(wildcard tensorflow/core/platform/google/*/*) \
 $(wildcard tensorflow/core/platform/jpeg.*) \
 $(wildcard tensorflow/core/platform/png.*) \
 $(wildcard tensorflow/core/platform/s3/*) \
-$(wildcard tensorflow/core/platform/stream_executor.*) \
 $(wildcard tensorflow/core/platform/windows/*) \
-$(wildcard tensorflow/core/user_ops/*.cu.cc) \
-$(wildcard tensorflow/core/common_runtime/gpu/*) \
-$(wildcard tensorflow/core/common_runtime/gpu_device_factory.*) \
 $(wildcard tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.*) \
 $(wildcard tensorflow/core/grappler/inputs/file_input_yielder.*) \
-$(wildcard tensorflow/core/grappler/clusters/single_machine.*)
+$(wildcard tensorflow/core/grappler/clusters/single_machine.*) \
+tensorflow/core/util/cuda_kernel_helper_test.cu.cc
+
+CORE_CC_EXCLUDE_SRCS := \
+$(CORE_CC_EXCLUDE_SRCS_NON_GPU) \
+$(wildcard tensorflow/core/platform/stream_executor.*) \
+$(wildcard tensorflow/core/platform/default/cuda_libdevice_path.*) \
+$(wildcard tensorflow/core/platform/cuda.h) \
+$(wildcard tensorflow/core/platform/cuda_libdevice_path.*) \
+$(wildcard tensorflow/core/user_ops/*.cu.cc) \
+$(wildcard tensorflow/core/common_runtime/gpu/*) \
+$(wildcard tensorflow/core/common_runtime/gpu_device_factory.*)
+
+ifeq ($(BUILD_FOR_TEGRA),1)
+CORE_CC_ALL_SRCS := $(CORE_CC_ALL_SRCS) \
+tensorflow/core/kernels/concat_lib_gpu.cc \
+tensorflow/core/kernels/cuda_solvers.cc \
+tensorflow/core/kernels/cudnn_pooling_gpu.cc \
+tensorflow/core/kernels/dense_update_functor.cc \
+tensorflow/core/kernels/fractional_avg_pool_op.cc \
+tensorflow/core/kernels/fractional_max_pool_op.cc \
+tensorflow/core/kernels/fractional_pool_common.cc \
+tensorflow/core/kernels/pooling_ops_3d.cc \
+tensorflow/core/kernels/sparse_fill_empty_rows_op.cc \
+tensorflow/core/kernels/list_kernels.cc \
+$(wildcard tensorflow/core/common_runtime/gpu/*.cc) \
+$(wildcard tensorflow/stream_executor/*.cc) \
+$(wildcard tensorflow/stream_executor/*/*.cc)
+
+CORE_CC_EXCLUDE_SRCS := \
+$(CORE_CC_EXCLUDE_SRCS_NON_GPU)
+
+CUDA_CC_SRCS := $(wildcard tensorflow/core/kernels/*.cu.cc)
+CUDA_CC_OBJS := $(addprefix $(OBJDIR), $(CUDA_CC_SRCS:.cc=.o))
+endif  # TEGRA
+
 # Filter out all the excluded files.
 TF_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS))
 # Add in any extra files that don't fit the patterns easily
@@ -644,11 +725,23 @@ $(LIB_PATH): $(LIB_OBJS)
 	@mkdir -p $(dir $@)
 	$(AR) $(ARFLAGS) $(LIB_PATH) $(LIB_OBJS)
 
-$(BENCHMARK_NAME): $(BENCHMARK_OBJS) $(LIB_PATH)
+$(BENCHMARK_NAME): $(BENCHMARK_OBJS) $(LIB_PATH) $(CUDA_LIB_DEPS)
 	@mkdir -p $(dir $@)
 	$(CXX) $(CXXFLAGS) $(INCLUDES) \
 	-o $(BENCHMARK_NAME) $(BENCHMARK_OBJS) \
-	$(LIBFLAGS) $(LIB_PATH) $(LDFLAGS) $(LIBS)
+	$(LIBFLAGS) $(TEGRA_LIBS) $(LIB_PATH) $(LDFLAGS) $(LIBS) $(CUDA_LIBS)
+
+# NVCC compilation rules for Tegra
+ifeq ($(BUILD_FOR_TEGRA),1)
+$(OBJDIR)%.cu.o: %.cu.cc
+	@mkdir -p $(dir $@)
+	@mkdir -p $(dir $(DEPDIR)$*)
+	$(NVCC) $(NVCCFLAGS) -Xcompiler "$(CXXFLAGS4NVCC) $(DEPFLAGS)" $(INCLUDES) -c $< -o $@
+
+$(LIBDIR)libtfcuda.a: $(CUDA_CC_OBJS)
+	@mkdir -p $(dir $@)
+	$(AR) $(ARFLAGS) $@ $(CUDA_CC_OBJS)
+endif
 
 # Matches on the normal hand-written TensorFlow C++ source files.
 $(OBJDIR)%.o: %.cc | $(PBT_GEN_FILES)
@@ -737,6 +830,7 @@ clean_except_protobuf_libs:
 cleantarget:
 	rm -rf $(OBJDIR)
 	rm -rf $(BINDIR)
+	rm -rf $(LIBDIR)
 
 $(DEPDIR)/%.d: ;
 .PRECIOUS: $(DEPDIR)/%.d
diff --git a/tensorflow/contrib/makefile/README.md b/tensorflow/contrib/makefile/README.md
index 9345303ff11462a447ed6299b0ac3cba558ea68b..b0228c543505c3d14e41bf1dd540b027b00489e6 100644
--- a/tensorflow/contrib/makefile/README.md
+++ b/tensorflow/contrib/makefile/README.md
@@ -130,6 +130,105 @@ adb shell '/data/local/tmp/benchmark \
 
 For more details, see the [benchmark documentation](../../tools/benchmark).
 
+## CUDA support for Tegra devices running Android (Nvidia Shield TV, etc)
+
+With the release of TF 1.6 and JetPack for Android 3.2 (currently pending), you can now build a version of TensorFlow for compatible devices according to the following instructions which will receive the full benefits of GPU acceleration.
+
+#### Environment setup:
+
+First, download and install JetPack for Android version 3.2 or greater from [Nvidia](https://developers.nvidia.com). Note that as of the TF 1.6 release the JetPack for Android 3.2 release is still pending, and regular JetPack for L4T will not work.
+
+```bash
+git clone https://github.com/tensorflow/tensorflow.git
+cd tensorflow
+JETPACK=$HOME/JetPack_Android_3.2
+TEGRA_LIBS="$JETPACK/cuDNN/aarch64/cuda/lib64/libcudnn.so  $JETPACK/cuda-9.0/extras/CUPTI/lib64/libcupti.so $JETPACK/cuda/targets/aarch64-linux-androideabi/lib64/libcufft.so"
+```
+
+#### Building all CUDA-enabled native binaries:
+This will build CUDA-enabled versions of libtensorflow_inference.so and the benchmark binary. (libtensorflow_demo.so will also be built incidentally, but it does not support CUDA)
+
+```bash
+NDK_ROOT=$JETPACK/android-ndk-r13b
+CC_PREFIX=ccache tensorflow/contrib/makefile/build_all_android.sh -s tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in -t "libtensorflow_inference.so libtensorflow_demo.so all" -a tegra
+```
+(add -T on subsequent builds to skip protobuf downloading/building)
+
+
+#### Testing the the CUDA-enabled benchmark via adb:
+Build binaries first as above, then run:
+
+```bash
+adb shell mkdir -p /data/local/tmp/lib64
+adb push $TEGRA_LIBS /data/local/tmp/lib64
+adb push tensorflow/contrib/makefile/gen/bin/android_arm64-v8a/benchmark /data/local/tmp
+wget  https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk
+unzip tensorflow_demo.apk -d /tmp/tensorflow_demo
+adb push /tmp/tensorflow_demo/assets/*.pb /data/local/tmp
+adb shell "LD_LIBRARY_PATH=/data/local/tmp/lib64 /data/local/tmp/benchmark --graph=/data/local/tmp/tensorflow_inception_graph.pb"
+```
+
+#### Building the CUDA-enabled TensorFlow AAR with Bazel:
+Build the native binaries first as above. Then, build the aar and package the native libs by executing the following:
+```bash
+mkdir -p /tmp/tf/jni/arm64-v8a
+cp tensorflow/contrib/makefile/gen/lib/android_tegra/libtensorflow_*.so /tmp/tf/jni/arm64-v8a/
+cp $TEGRA_LIBS /tmp/tf/jni/arm64-v8a
+bazel build //tensorflow/contrib/android:android_tensorflow_inference_java.aar
+cp bazel-bin/tensorflow/contrib/android/android_tensorflow_inference_java.aar /tmp/tf/tensorflow.aar
+cd /tmp/tf
+chmod +w tensorflow.aar
+zip -ur tensorflow.aar $(find jni -name *.so)
+```
+
+#### Building the CUDA-enabled TensorFlow Android demo with Bazel:
+Build binaries first as above, then edit tensorflow/examples/android/BUILD and replace: 
+```
+    srcs = [
+       ":libtensorflow_demo.so",
+       "//tensorflow/contrib/android:libtensorflow_inference.so",
+    ],
+```
+with:
+```
+srcs = glob(["libs/arm64-v8a/*.so"]),
+```
+
+Then run:
+```bash
+# Create dir for native libs
+mkdir -p tensorflow/examples/android/libs/arm64-v8a
+
+# Copy JetPack libs
+cp $TEGRA_LIBS  tensorflow/examples/android/libs/arm64-v8a
+
+# Copy native TensorFlow libraries
+cp tensorflow/contrib/makefile/gen/lib/android_arm64-v8a/libtensorflow_*.so tensorflow/examples/android/libs/arm64-v8a/
+
+# Build APK
+bazel build -c opt --fat_apk_cpu=arm64-v8a tensorflow/android:tensorflow_demo
+
+# Install
+adb install -r -f bazel-bin/tensorflow/examples/android/tensorflow_demo.apk 
+```
+
+#### Building the CUDA-enabled Android demo with gradle/Android Studio:
+
+Add tensorflow/examples/android as an Android project in Android Studio as normal.
+
+Edit build.gradle and:
+* set nativeBuildSystem = 'makefile'
+* set cpuType = 'arm64-v8a'
+* in "buildNativeMake", replace cpuType with 'tegra' (optional speedups like -T and ccache also work) 
+* set the environment "NDK_ROOT" var to $JETPACK/android-ndk-r13b
+
+Click "build apk" to build.
+
+Install:
+```bash
+adb install -r -f tensorflow/examples/android/gradleBuild/outputs/apk/debug/android-debug.apk
+```
+
 ## iOS
 
 _Note: To use this library in an iOS application, see related instructions in
@@ -262,6 +361,14 @@ to register ops and kernels.
 
 #### Optimization
 
+The `build_all_ios.sh` script can take optional command-line arguments to
+selectively register only for the operators used in your graph.
+
+```bash
+tensorflow/contrib/makefile/build_all_ios.sh -a arm64 -g $HOME/graphs/inception/tensorflow_inception_graph.pb
+```
+Please note this is an aggressive optimization of the operators and the resulting library may not work with other graphs but will reduce the size of the final library.
+
 The `compile_ios_tensorflow.sh` script can take optional command-line arguments.
 The first argument will be passed as a C++ optimization flag and defaults to
 debug mode. If you are concerned about performance or are working on a release
diff --git a/tensorflow/contrib/makefile/build_all_android.sh b/tensorflow/contrib/makefile/build_all_android.sh
index 81cb17a311fd94aa397eb7a766cd8c668268759a..fc88f59e0948e1d3ed7cce9b809bf30ba280af12 100755
--- a/tensorflow/contrib/makefile/build_all_android.sh
+++ b/tensorflow/contrib/makefile/build_all_android.sh
@@ -18,7 +18,7 @@
 set -e
 
 usage() {
-  echo "Usage: NDK_ROOT=<path to ndk root> $(basename "$0") [-Es:t:Tx:a:X]"
+  echo "Usage: NDK_ROOT=<path to ndk root> $(basename "$0") [-Es:t:Tx:a]"
   echo "-E enable experimental hexnn ops"
   echo "-s [sub_makefiles] sub makefiles separated by white space"
   echo "-t [build_target] build target for Android makefile [default=all]"
@@ -26,7 +26,7 @@ usage() {
   echo "-x [hexagon library path] copy and hexagon libraries in the specified path"
   echo "-a [architecture] Architecture of target android [default=armeabi-v7a] \
 (supported architecture list: \
-arm64-v8a armeabi armeabi-v7a mips mips64 x86 x86_64)"
+arm64-v8a armeabi armeabi-v7a mips mips64 x86 x86_64 tegra)"
   exit 1
 }
 
@@ -50,6 +50,26 @@ while getopts "Es:t:Tx:a:" opt_name; do
 done
 shift $((OPTIND - 1))
 
+if [ "$ARCH" == "tegra" ]; then
+    if [[ -z "${JETPACK}" ]]; then
+        export JETPACK="$HOME/JetPack_Android_3.2"
+    fi
+    if [ ! -d ${JETPACK} ]; then
+        echo "Can't find Jetpack at ${JETPACK}"
+        echo "Set JETPACK=<path to Jetpack Android> to specify a non-default Jetpack path"
+        exit -1
+    fi
+    if [ ! -d ${JETPACK}/cuda ]; then
+        ln -s $(ls -d ${JETPACK}/cuda-*/|sort -r|head -n1) ${JETPACK}/cuda
+    fi
+    if [ ! -d ${JETPACK}/cuda ]; then
+        ln -s $(ls -d ${JETPACK}/cuda-*/|sort -r|head -n1) ${JETPACK}/cuda
+    fi
+
+    export BUILD_FOR_TEGRA=1
+    ARCH="arm64-v8a"
+fi
+
 # Make sure we're in the correct directory, at the root of the source tree.
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null && pwd)"
 cd "${SCRIPT_DIR}"/../../../
diff --git a/tensorflow/contrib/makefile/build_all_ios.sh b/tensorflow/contrib/makefile/build_all_ios.sh
index 988e12b48287300004cc23c31cb4a20e63f72a27..2d9979183975e6a17527b40ef5ee1795ced44a7b 100755
--- a/tensorflow/contrib/makefile/build_all_ios.sh
+++ b/tensorflow/contrib/makefile/build_all_ios.sh
@@ -26,13 +26,16 @@ fi
 usage() {
   echo "Usage: $(basename "$0") [-a:T]"
   echo "-a [build_arch] build only for specified arch x86_64 [default=all]"
+  echo "-g [graph] optimize and selectively register ops only for this graph"
   echo "-T only build tensorflow (dont download other deps etc)"
   exit 1
 }
 
-while getopts "a:T" opt_name; do
+DEFAULT_ARCH="i386 x86_64 armv7 armv7s arm64"
+while getopts "a:g:T" opt_name; do
   case "$opt_name" in
     a) BUILD_ARCH="${OPTARG}";;
+    g) OPTIMIZE_FOR_GRAPH="${OPTARG}";;
     T) ONLY_MAKE_TENSORFLOW="true";;
     *) usage;;
   esac
@@ -42,7 +45,8 @@ shift $((OPTIND - 1))
 
 # Make sure we're in the correct directory, at the root of the source tree.
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-cd ${SCRIPT_DIR}/../../../
+TOP_SRCDIR="${SCRIPT_DIR}/../../../"
+cd ${TOP_SRCDIR}
 
 source "${SCRIPT_DIR}/build_helper.subr"
 JOB_COUNT="${JOB_COUNT:-$(get_job_count)}"
@@ -56,6 +60,32 @@ if [[ -n MACOSX_DEPLOYMENT_TARGET ]]; then
     export MACOSX_DEPLOYMENT_TARGET=$(sw_vers -productVersion)
 fi
 
+PRNT_SLCTV_BIN="${TOP_SRCDIR}bazel-bin/tensorflow/python/tools/print_selective_registration_header"
+
+if [[ ! -z "${OPTIMIZE_FOR_GRAPH}" ]]; then
+    echo "Request to optimize for graph: ${OPTIMIZE_FOR_GRAPH}"
+    #Request to trim the OPs by selectively registering
+    if [ ! -f ${PRNT_SLCTV_BIN} ]; then
+        #Build bazel build tensorflow/python/tools:print_selective_registration_header
+        echo "${PRNT_SLCTV_BIN} not found. Trying to build it"
+        cd ${TOP_SRCDIR}
+        bazel build --copt="-DUSE_GEMM_FOR_CONV" tensorflow/python/tools:print_selective_registration_header
+         if [ ! -f ${PRNT_SLCTV_BIN} ]; then
+            echo "Building print_selective_registration_header failed"
+            echo "You may want to build TensorFlow with: "
+            echo "./configure"
+            echo "bazel build --copt="-DUSE_GEMM_FOR_CONV" tensorflow/python/tools:print_selective_registration_header"
+            echo "and then run this script again"
+            exit 1
+        fi
+    else
+        echo "${PRNT_SLCTV_BIN} found. Using it"
+        ${PRNT_SLCTV_BIN} --graphs=${OPTIMIZE_FOR_GRAPH} > ${TOP_SRCDIR}/tensorflow/core/framework/ops_to_register.h
+
+    fi
+
+fi
+
 if [[ "${ONLY_MAKE_TENSORFLOW}" != "true" ]]; then
     # Remove any old files first.
     make -f tensorflow/contrib/makefile/Makefile clean
@@ -64,8 +94,13 @@ if [[ "${ONLY_MAKE_TENSORFLOW}" != "true" ]]; then
     # Pull down the required versions of the frameworks we need.
     tensorflow/contrib/makefile/download_dependencies.sh
 
-    # Compile protobuf for the target iOS device architectures.
-    tensorflow/contrib/makefile/compile_ios_protobuf.sh
+    if [[ -z "${BUILD_ARCH}" ]]; then
+        # Compile protobuf for the target iOS device architectures.
+        tensorflow/contrib/makefile/compile_ios_protobuf.sh
+    else
+        # Compile protobuf for the target iOS device architectures.
+        tensorflow/contrib/makefile/compile_ios_protobuf.sh -a ${BUILD_ARCH}
+    fi
 fi
 
 # Compile nsync for the target iOS device architectures.
@@ -80,13 +115,24 @@ else
 fi
 export HOST_NSYNC_LIB TARGET_NSYNC_LIB
 
-if [[ -z "${BUILD_ARCH}" ]]; then
-    # build the ios tensorflow libraries.
-    tensorflow/contrib/makefile/compile_ios_tensorflow.sh -f "-O3" -h $HOST_NSYNC_LIB -n $TARGET_NSYNC_LIB
-else
+TF_CC_FLAGS="-O3"
+TF_SCRIPT_FLAGS="-h ${HOST_NSYNC_LIB} -n ${TARGET_NSYNC_LIB}"
+
+if [[ ! -z "${OPTIMIZE_FOR_GRAPH}" ]]; then
+    # arch specified so build just that
+    TF_CC_FLAGS="${TF_CC_FLAGS} -DANDROID_TYPES=__ANDROID_TYPES_FULL__ -DSELECTIVE_REGISTRATION -DSUPPORT_SELECTIVE_REGISTRATION"
+    # The Makefile checks the env var to decide which ANDROID_TYPES to build
+    export ANDROID_TYPES="-D__ANDROID_TYPES_FULL__"
+fi
+
+if [[ ! -z "${BUILD_ARCH}" ]]; then
     # arch specified so build just that
-    tensorflow/contrib/makefile/compile_ios_tensorflow.sh -f "-O3" -a "${BUILD_ARCH}" -h $HOST_NSYNC_LIB -n $TARGET_NSYNC_LIB
+    TF_SCRIPT_FLAGS="${TF_SCRIPT_FLAGS} -a ${BUILD_ARCH}"
 fi
 
+# build the ios tensorflow libraries.
+echo "Building TensorFlow with flags: ${TF_SCRIPT_FLAGS} -f ${TF_CC_FLAGS}"
+tensorflow/contrib/makefile/compile_ios_tensorflow.sh ${TF_SCRIPT_FLAGS} -f "${TF_CC_FLAGS}"
+
 # Creates a static universal library in
 # tensorflow/contrib/makefile/gen/lib/libtensorflow-core.a
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index 19e25ad7672102b93e4a3d10874e31130049f26a..4ae18b2cef28335a90bbc967529c0cf76b0a5da2 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -26,15 +26,15 @@ if [ ! -f $BZL_FILE_PATH ]; then
   exit 1;
 fi
 
-EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
+EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
 GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
 NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
 PROTOBUF_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
 RE2_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
-FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
-DOUBLE_CONVERSION_URL="$(grep -o "https.*google/double-conversion.*\.zip" "${BZL_FILE_PATH}" | head -n1)"
+FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
 ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)"
+CUB_URL="$(grep -o 'https.*cub/archive.*zip' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
 
 # TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64,
 #                   so work around it by patching the source.
@@ -63,12 +63,17 @@ download_and_extract() {
   elif [[ "${url}" == *zip ]]; then
     tempdir=$(mktemp -d)
     tempdir2=$(mktemp -d)
-    wget -P ${tempdir} ${url}
-    unzip ${tempdir}/* -d ${tempdir2}
+    if [[ "$OSTYPE" == "darwin"* ]]; then
+      # macOS (AKA darwin) doesn't have wget.
+      (cd "${tempdir}"; curl --remote-name --silent --location "${url}")
+    else
+      wget -P "${tempdir}" "${url}"
+    fi
+    unzip "${tempdir}"/* -d "${tempdir2}"
     # unzip has no strip components, so unzip to a temp dir, and move the files
     # we want from the tempdir to destination.
-    cp -R ${tempdir2}/*/* ${dir}/
-    rm -rf ${tempdir2} ${tempdir}
+    cp -R "${tempdir2}"/*/* "${dir}"/
+    rm -rf "${tempdir2}" "${tempdir}"
   fi
 
   # Delete any potential BUILD files, which would interfere with Bazel builds.
@@ -82,8 +87,8 @@ download_and_extract "${NSYNC_URL}" "${DOWNLOADS_DIR}/nsync"
 download_and_extract "${PROTOBUF_URL}" "${DOWNLOADS_DIR}/protobuf"
 download_and_extract "${RE2_URL}" "${DOWNLOADS_DIR}/re2"
 download_and_extract "${FFT2D_URL}" "${DOWNLOADS_DIR}/fft2d"
-download_and_extract "${DOUBLE_CONVERSION_URL}" "${DOWNLOADS_DIR}/double_conversion"
 download_and_extract "${ABSL_URL}" "${DOWNLOADS_DIR}/absl"
+download_and_extract "${CUB_URL}" "${DOWNLOADS_DIR}/cub/external/cub_archive"
 
 replace_by_sed 's#static uint32x4_t p4ui_CONJ_XOR = vld1q_u32( conj_XOR_DATA );#static uint32x4_t p4ui_CONJ_XOR; // = vld1q_u32( conj_XOR_DATA ); - Removed by script#' \
   "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
diff --git a/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh b/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
index 861bb885c7031b996b48dbc50887cfce55c638f3..421ddd210fd5b1ac6487918d5797eab5953316df 100755
--- a/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
+++ b/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
@@ -36,7 +36,7 @@ while getopts "bc:Eps" opt_name; do
     b) BUILD_ONLY="true";;
     c) TEST_COUNT="${OPTARG}";;
     E) ENABLE_EXPERIMENTAL_HEXNN_OPS="true";;
-    p) USE_PREBUILT_HEXAOGON_BINARIES="true";;
+    p) USE_PREBUILT_HEXAGON_BINARIES="true";;
     s) SKIP_DOWNLOAD_IF_EXIST="true";;
     *) usage;;
   esac
@@ -49,7 +49,7 @@ if [[ -z "${NDK_ROOT}" ]]; then
     exit 1
 fi
 
-if [[ "${USE_PREBUILT_HEXAOGON_BINARIES}" != "true" &&
+if [[ "${USE_PREBUILT_HEXAGON_BINARIES}" != "true" &&
       -z "${QUALCOMM_SDK}" ]]; then
     echo "QUALCOMM_SDK is empty" 1>&2
     usage
@@ -76,13 +76,15 @@ GEN_LIBS_DIR="${GEN_DIR}/libs"
 GEN_DOWNLOAD_DIR="${GEN_DIR}/downloads"
 URL_BASE="https://storage.googleapis.com/download.tensorflow.org"
 
+ARCH="armeabi-v7a"
+
 source "${SCRIPT_DIR}/../build_helper.subr"
 
 rm -rf "${GEN_DIR}"
 mkdir -p "${GEN_LIBS_DIR}"
 mkdir -p "${GEN_DOWNLOAD_DIR}"
 
-if [[ "${USE_PREBUILT_HEXAOGON_BINARIES}" == "true" ]]; then
+if [[ "${USE_PREBUILT_HEXAGON_BINARIES}" == "true" ]]; then
     echo "Download prebuilt hexagon binaries"
     if [[ "${BUILD_ONLY}" != "true" ]]; then
         CONTROLLER_PUSH_DEST="/data/local/tmp"
@@ -219,7 +221,7 @@ if [[ "${BUILD_ONLY}" != "true" ]]; then
     adb push "${GEN_LIBS_DIR}/libhexagon_nn_skel.so" "/vendor/lib/rfsa/adsp"
 
     adb push -p \
-        "${TF_ROOT_DIR}/tensorflow/contrib/makefile/gen/bin/hexagon_graph_execution" \
+        "${TF_ROOT_DIR}/tensorflow/contrib/makefile/gen/bin/android_${ARCH}/hexagon_graph_execution" \
         "/data/local/tmp/"
     adb wait-for-device
     adb shell chmod "${ANDROID_EXEC_FILE_MODE}" \
diff --git a/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in b/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in
index 26c1ad4947363e98d9bb8e400f40290fb87b2e4e..3081084ee76e41de801f49a67c1fec07f4ff03b9 100644
--- a/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in
+++ b/tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in
@@ -48,13 +48,13 @@ INFERENCE_OBJS := $(addprefix $(OBJDIR), $(INFERENCE_SRCS:.cc=.o))
 INFERENCE_SO_NAME := libtensorflow_inference.so
 INFERENCE_SO_PATH := $(LIBDIR)$(INFERENCE_SO_NAME)
 
-$(INFERENCE_SO_PATH): $(LIB_OBJS) $(INFERENCE_OBJS)
+$(INFERENCE_SO_PATH): $(LIB_OBJS) $(INFERENCE_OBJS) $(CUDA_LIB_DEPS)
 	@mkdir -p $(dir $@)
 	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $@ $(INFERENCE_OBJS) $(LIB_OBJS) \
+	-o $@ $(INFERENCE_OBJS) $(LIB_OBJS) $(TEGRA_LIBS) \
 	$(LIBFLAGS) $(LDFLAGS) \
 	-shared -Wl,-soname,$(INFERENCE_SO_NAME) \
-	$(LIBS)
+	$(LIBS) $(CUDA_LIBS)
 
 $(INFERENCE_SO_NAME): $(INFERENCE_SO_PATH)
 
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index ff612f1fdf8c526322c4dfb997f32f78e2ae5609..5a812af4e95fe7a05b9c2634b0cc1d860fb7f619 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -68,6 +68,8 @@ tensorflow/core/kernels/scatter_nd_op_cpu_impl_2.cc
 tensorflow/core/kernels/scatter_nd_op_cpu_impl_3.cc
 tensorflow/core/kernels/scatter_nd_op_cpu_impl_4.cc
 tensorflow/core/kernels/scatter_nd_op_cpu_impl_5.cc
+tensorflow/core/kernels/scatter_nd_op_cpu_impl_6.cc
+tensorflow/core/kernels/scatter_nd_op_cpu_impl_7.cc
 tensorflow/core/kernels/scatter_nd_op.cc
 tensorflow/core/kernels/save_restore_tensor.cc
 tensorflow/core/kernels/save_restore_v2_ops.cc
@@ -89,6 +91,7 @@ tensorflow/core/kernels/reduction_ops_max.cc
 tensorflow/core/kernels/reduction_ops_common.cc
 tensorflow/core/kernels/reduction_ops_any.cc
 tensorflow/core/kernels/reduction_ops_all.cc
+tensorflow/core/kernels/roll_op.cc
 tensorflow/core/kernels/queue_ops.cc
 tensorflow/core/kernels/queue_base.cc
 tensorflow/core/kernels/pooling_ops_common.cc
@@ -132,6 +135,8 @@ tensorflow/core/kernels/gather_nd_op_cpu_impl_2.cc
 tensorflow/core/kernels/gather_nd_op_cpu_impl_3.cc
 tensorflow/core/kernels/gather_nd_op_cpu_impl_4.cc
 tensorflow/core/kernels/gather_nd_op_cpu_impl_5.cc
+tensorflow/core/kernels/gather_nd_op_cpu_impl_6.cc
+tensorflow/core/kernels/gather_nd_op_cpu_impl_7.cc
 tensorflow/core/kernels/fused_batch_norm_op.cc
 tensorflow/core/kernels/function_ops.cc
 tensorflow/core/kernels/fill_functor.cc
@@ -144,6 +149,7 @@ tensorflow/core/kernels/dynamic_stitch_op.cc
 tensorflow/core/kernels/dynamic_partition_op.cc
 tensorflow/core/kernels/decode_bmp_op.cc
 tensorflow/core/kernels/depthtospace_op.cc
+tensorflow/core/kernels/data_format_ops.cc
 tensorflow/core/kernels/spacetodepth_op.cc
 tensorflow/core/kernels/dense_update_ops.cc
 tensorflow/core/kernels/deep_conv2d.cc
@@ -265,6 +271,7 @@ tensorflow/core/ops/parsing_ops.cc
 tensorflow/core/ops/no_op.cc
 tensorflow/core/ops/nn_ops.cc
 tensorflow/core/ops/nn_grad.cc
+tensorflow/core/ops/manip_ops.cc
 tensorflow/core/ops/math_ops.cc
 tensorflow/core/ops/math_grad.cc
 tensorflow/core/ops/logging_ops.cc
@@ -286,3 +293,4 @@ tensorflow/core/kernels/batchtospace_op.cc
 tensorflow/core/kernels/warn_about_ints.cc
 tensorflow/core/kernels/segment_reduction_ops.cc
 tensorflow/core/kernels/batch_util.cc
+tensorflow/core/ops/audio_ops.cc
diff --git a/tensorflow/contrib/memory_stats/__init__.py b/tensorflow/contrib/memory_stats/__init__.py
index a32302c854b68ed1b211a221f3026e8d5b6091ac..2ce849ca660076aa5d25db4f16b8d24051e315ae 100644
--- a/tensorflow/contrib/memory_stats/__init__.py
+++ b/tensorflow/contrib/memory_stats/__init__.py
@@ -19,6 +19,10 @@
 @@MaxBytesInUse
 """
 
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
 from tensorflow.contrib.memory_stats.python.ops.memory_stats_ops import BytesInUse
 from tensorflow.contrib.memory_stats.python.ops.memory_stats_ops import BytesLimit
 from tensorflow.contrib.memory_stats.python.ops.memory_stats_ops import MaxBytesInUse
diff --git a/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc b/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
index 7e2e96e160167ae68d3bdabacbbbeb45df61778f..974fb537499c5ea4591a0a128f53d2dea67b9e57 100644
--- a/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
+++ b/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
@@ -59,7 +59,7 @@ REGISTER_KERNEL_BUILDER(Name("BytesInUse").Device(DEVICE_GPU).HostMemory("out"),
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(
-    Name("BytesInUse").Device(DEVICE_SYCL).HostMemory("out"), MaxBytesInUseOp);
+    Name("BytesInUse").Device(DEVICE_SYCL).HostMemory("out"), BytesInUseOp);
 #endif  // TENSORFLOW_USE_SYCL
 
 // Op that measures the total memory (in bytes) of a device.
@@ -80,9 +80,9 @@ REGISTER_KERNEL_BUILDER(Name("BytesLimit").Device(DEVICE_GPU).HostMemory("out"),
                         BytesLimitOp);
 
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("BytesLimit").Device(DEVICE_SYCL).HostMemory("out"),
-                        BytesLimitOp);
-#endif // TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(
+    Name("BytesLimit").Device(DEVICE_SYCL).HostMemory("out"), BytesLimitOp);
+#endif  // TENSORFLOW_USE_SYCL
 
 // Op that measures the peak memory in bytes.
 class MaxBytesInUseOp : public MemoryStatsOp {
@@ -107,6 +107,6 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("MaxBytesInUse").Device(DEVICE_SYCL).HostMemory("out"),
     MaxBytesInUseOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/memory_stats/python/kernel_tests/memory_stats_ops_test.py b/tensorflow/contrib/memory_stats/python/kernel_tests/memory_stats_ops_test.py
index d1b430b8039fcf7e10bcb842c3f34b960b9026b3..02c2ac06fb7dc0c930deaaa4c21a6971d96f19a1 100644
--- a/tensorflow/contrib/memory_stats/python/kernel_tests/memory_stats_ops_test.py
+++ b/tensorflow/contrib/memory_stats/python/kernel_tests/memory_stats_ops_test.py
@@ -77,8 +77,9 @@ class MemoryStatsOpsTest(test_util.TensorFlowTestCase):
         bytes_in_use_op = memory_stats_ops.BytesInUse()
       with ops.control_dependencies([bytes_in_use_op]):
         b = random_ops.random_uniform(matrix_shape, dtype=dtype)
+        c = math_ops.matmul(a, b)
 
-      _, bytes_in_use, max_bytes_in_use = sess.run([a, bytes_in_use_op,
+      _, bytes_in_use, max_bytes_in_use = sess.run([c, bytes_in_use_op,
                                                     max_bytes_in_use_op])
 
       # intermediate result allocates 1 matrix, max usage is at least 2
diff --git a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
index 2932ae1c8df32cd936cff932b061571c513fda79..ff88b4fa841673fc52b9f6fdc5ca43d30c44bbfd 100644
--- a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
+++ b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py
@@ -171,7 +171,14 @@ def _clean_save_and_restore(graph_def, op, removed_op_names):
   shape_op_value_tensor.tensor_shape.dim[0].size = len(shapes)
   op.attr['dtypes'].list.type[:] = dtypes
 
+  if not name_op.attr['_output_shapes'].list.shape:
+    name_op.attr['_output_shapes'].list.shape.add()
+    name_op.attr['_output_shapes'].list.shape[0].dim.add()
   name_op.attr['_output_shapes'].list.shape[0].dim[0].size = len(names)
+
+  if not shape_op.attr['_output_shapes'].list.shape:
+    shape_op.attr['_output_shapes'].list.shape.add()
+    shape_op.attr['_output_shapes'].list.shape[0].dim.add()
   shape_op.attr['_output_shapes'].list.shape[0].dim[0].size = len(shapes)
 
 
diff --git a/tensorflow/contrib/metrics/__init__.py b/tensorflow/contrib/metrics/__init__.py
index 27dad5379a2e56b91960a1f2274610e4f2568dbc..d3dce46bfb6e9c77cc7ae107b323a9bc7074c47e 100644
--- a/tensorflow/contrib/metrics/__init__.py
+++ b/tensorflow/contrib/metrics/__init__.py
@@ -66,6 +66,7 @@ See the @{$python/contrib.metrics} guide.
 @@set_intersection
 @@set_size
 @@set_union
+@@cohen_kappa
 @@count
 @@precision_recall_at_equal_thresholds
 @@recall_at_precision
@@ -82,6 +83,7 @@ from tensorflow.contrib.metrics.python.ops.confusion_matrix_ops import confusion
 from tensorflow.contrib.metrics.python.ops.histogram_ops import auc_using_histogram
 from tensorflow.contrib.metrics.python.ops.metric_ops import aggregate_metric_map
 from tensorflow.contrib.metrics.python.ops.metric_ops import aggregate_metrics
+from tensorflow.contrib.metrics.python.ops.metric_ops import cohen_kappa
 from tensorflow.contrib.metrics.python.ops.metric_ops import count
 from tensorflow.contrib.metrics.python.ops.metric_ops import precision_recall_at_equal_thresholds
 from tensorflow.contrib.metrics.python.ops.metric_ops import recall_at_precision
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 6b08b749f86bc098ac511d142770362952b491d8..d3ce51a6112d955d012b4532ac727bf146f2c5cd 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -24,10 +24,12 @@ from __future__ import print_function
 
 import collections as collections_lib
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import confusion_matrix
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics
@@ -337,9 +339,9 @@ def streaming_mean_tensor(values,
       name=name)
 
 
-@deprecated(
-    None, 'Please switch to tf.metrics.accuracy. Note that the order of the '
-    'labels and predictions arguments has been switched.')
+@deprecated(None,
+            'Please switch to tf.metrics.accuracy. Note that the order of the '
+            'labels and predictions arguments has been switched.')
 def streaming_accuracy(predictions,
                        labels,
                        weights=None,
@@ -737,7 +739,7 @@ def _streaming_confusion_matrix_at_thresholds(predictions,
   else:
     for include in includes:
       if include not in all_includes:
-        raise ValueError('Invaild key: %s.' % include)
+        raise ValueError('Invalid key: %s.' % include)
 
   predictions, labels, weights = metrics_impl._remove_squeezable_dimensions(  # pylint: disable=protected-access
       predictions, labels, weights)
@@ -934,8 +936,9 @@ def streaming_curve_points(labels=None,
     if curve != 'ROC' and curve != 'PR':
       raise ValueError('curve must be either ROC or PR, %s unknown' % (curve))
     kepsilon = _EPSILON  # to account for floating point imprecisions
-    thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
-                  for i in range(num_thresholds - 2)]
+    thresholds = [
+        (i + 1) * 1.0 / (num_thresholds - 1) for i in range(num_thresholds - 2)
+    ]
     thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
 
     values, update_ops = _streaming_confusion_matrix_at_thresholds(
@@ -971,9 +974,8 @@ def streaming_curve_points(labels=None,
     return points, update_op
 
 
-@deprecated(
-    None, 'Please switch to tf.metrics.auc. Note that the order of the '
-    'labels and predictions arguments has been switched.')
+@deprecated(None, 'Please switch to tf.metrics.auc. Note that the order of the '
+            'labels and predictions arguments has been switched.')
 def streaming_auc(predictions,
                   labels,
                   weights=None,
@@ -1103,8 +1105,7 @@ def _compute_dynamic_auc(labels, predictions, curve='ROC'):
       # For conformance, set precision to 1 when the number of positive
       # classifications is 0.
       y_axis_values = array_ops.where(
-          math_ops.greater(splits, 0),
-          math_ops.truediv(true_positives, splits),
+          math_ops.greater(splits, 0), math_ops.truediv(true_positives, splits),
           array_ops.ones_like(true_positives, dtype=dtypes.float64))
 
     # Calculate trapezoid areas.
@@ -1117,9 +1118,8 @@ def _compute_dynamic_auc(labels, predictions, curve='ROC'):
   # exception seems excessive) so we return 0, otherwise we finish computing.
   return control_flow_ops.cond(
       math_ops.logical_or(
-          math_ops.equal(total_positive, 0),
-          math_ops.equal(total_positive, size)
-      ),
+          math_ops.equal(total_positive, 0), math_ops.equal(
+              total_positive, size)),
       true_fn=lambda: array_ops.constant(0, dtypes.float64),
       false_fn=continue_computing_dynamic_auc)
 
@@ -1183,10 +1183,10 @@ def streaming_dynamic_auc(labels,
             array_ops.ones_like(labels, dtypes.int64),
             message='labels must be 0 or 1, at least one is >1')
     ]):
-      preds_accum, update_preds = streaming_concat(predictions,
-                                                   name='concat_preds')
-      labels_accum, update_labels = streaming_concat(labels,
-                                                     name='concat_labels')
+      preds_accum, update_preds = streaming_concat(
+          predictions, name='concat_preds')
+      labels_accum, update_labels = streaming_concat(
+          labels, name='concat_labels')
       update_op = control_flow_ops.group(update_labels, update_preds)
       auc = _compute_dynamic_auc(labels_accum, preds_accum, curve=curve)
       if updates_collections:
@@ -1226,7 +1226,7 @@ def precision_recall_at_equal_thresholds(labels,
     predictions: A floating point `Tensor` of arbitrary shape and whose values
       are in the range `[0, 1]`.
     weights: Optional; If provided, a `Tensor` that has the same dtype as,
-      and broadcastable to, `predictions`. This tensor is multplied by counts.
+      and broadcastable to, `predictions`. This tensor is multiplied by counts.
     num_thresholds: Optional; Number of thresholds, evenly distributed in
       `[0, 1]`. Should be `>= 2`. Defaults to 201. Note that the number of bins
       is 1 less than `num_thresholds`. Using an even `num_thresholds` value
@@ -1569,9 +1569,9 @@ def streaming_precision_at_thresholds(predictions,
       name=name)
 
 
-@deprecated(
-    None, 'Please switch to tf.metrics.recall_at_thresholds. Note that the '
-    'order of the labels and predictions arguments has been switched.')
+@deprecated(None,
+            'Please switch to tf.metrics.recall_at_thresholds. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_recall_at_thresholds(predictions,
                                    labels,
                                    thresholds,
@@ -2268,7 +2268,7 @@ def recall_at_precision(labels,
     thresholds = [0.0 - _EPSILON] + thresholds + [1.0 + _EPSILON]
 
     values, update_ops = _streaming_confusion_matrix_at_thresholds(
-        labels, predictions, thresholds, weights)
+        predictions, labels, thresholds, weights)
 
     recall = _compute_recall_at_precision(values['tp'], values['fp'],
                                           values['fn'], precision, 'value')
@@ -3297,9 +3297,142 @@ def count(values,
     return count_, update_op
 
 
+def cohen_kappa(labels,
+                predictions_idx,
+                num_classes,
+                weights=None,
+                metrics_collections=None,
+                updates_collections=None,
+                name=None):
+  """Calculates Cohen's kappa.
+
+  [Cohen's kappa](https://en.wikipedia.org/wiki/Cohen's_kappa) is a statistic
+  that measures inter-annotator agreement.
+
+  The `cohen_kappa` function calculates the confusion matrix, and creates three
+  local variables to compute the Cohen's kappa: `po`, `pe_row`, and `pe_col`,
+  which refer to the diagonal part, rows and columns totals of the confusion
+  matrix, respectively. This value is ultimately returned as `kappa`, an
+  idempotent operation that is calculated by
+
+      pe = (pe_row * pe_col) / N
+      k = (sum(po) - sum(pe)) / (N - sum(pe))
+
+  For estimation of the metric over a stream of data, the function creates an
+  `update_op` operation that updates these variables and returns the
+  `kappa`. `update_op` weights each prediction by the corresponding value in
+  `weights`.
+
+  Class labels are expected to start at 0. E.g., if `num_classes`
+  was three, then the possible labels would be [0, 1, 2].
+
+  If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+  NOTE: Equivalent to `sklearn.metrics.cohen_kappa_score`, but the method
+  doesn't support weighted matrix yet.
+
+  Args:
+    labels: 1-D `Tensor` of real labels for the classification task. Must be
+      one of the following types: int16, int32, int64.
+    predictions_idx: 1-D `Tensor` of predicted class indices for a given
+      classification. Must have the same type as `labels`.
+    num_classes: The possible number of labels.
+    weights: Optional `Tensor` whose shape matches `predictions`.
+    metrics_collections: An optional list of collections that `kappa` should
+      be added to.
+    updates_collections: An optional list of collections that `update_op` should
+      be added to.
+    name: An optional variable_scope name.
+
+  Returns:
+    kappa: Scalar float `Tensor` representing the current Cohen's kappa.
+    update_op: `Operation` that increments `po`, `pe_row` and `pe_col`
+      variables appropriately and whose value matches `kappa`.
+
+  Raises:
+    ValueError: If `num_classes` is less than 2, or `predictions` and `labels`
+      have mismatched shapes, or if `weights` is not `None` and its shape
+      doesn't match `predictions`, or if either `metrics_collections` or
+      `updates_collections` are not a list or tuple.
+    RuntimeError: If eager execution is enabled.
+  """
+  if context.in_eager_mode():
+    raise RuntimeError('tf.contrib.metrics.cohen_kappa is not supported'
+                       'when eager execution is enabled.')
+  if num_classes < 2:
+    raise ValueError('`num_classes` must be >= 2.'
+                     'Found: {}'.format(num_classes))
+  with variable_scope.variable_scope(name, 'cohen_kappa',
+                                     (labels, predictions_idx, weights)):
+    # Convert 2-dim (num, 1) to 1-dim (num,)
+    labels.get_shape().with_rank_at_most(2)
+    if labels.get_shape().ndims == 2:
+      labels = array_ops.squeeze(labels, axis=[-1])
+    predictions_idx, labels, weights = (
+        metrics_impl._remove_squeezable_dimensions(  # pylint: disable=protected-access
+            predictions=predictions_idx,
+            labels=labels,
+            weights=weights))
+    predictions_idx.get_shape().assert_is_compatible_with(labels.get_shape())
+
+    stat_dtype = (
+        dtypes.int64
+        if weights is None or weights.dtype.is_integer else dtypes.float32)
+    po = metrics_impl.metric_variable((num_classes,), stat_dtype, name='po')
+    pe_row = metrics_impl.metric_variable(
+        (num_classes,), stat_dtype, name='pe_row')
+    pe_col = metrics_impl.metric_variable(
+        (num_classes,), stat_dtype, name='pe_col')
+
+    # Table of the counts of agreement:
+    counts_in_table = confusion_matrix.confusion_matrix(
+        labels,
+        predictions_idx,
+        num_classes=num_classes,
+        weights=weights,
+        dtype=stat_dtype,
+        name='counts_in_table')
+
+    po_t = array_ops.diag_part(counts_in_table)
+    pe_row_t = math_ops.reduce_sum(counts_in_table, axis=0)
+    pe_col_t = math_ops.reduce_sum(counts_in_table, axis=1)
+    update_po = state_ops.assign_add(po, po_t)
+    update_pe_row = state_ops.assign_add(pe_row, pe_row_t)
+    update_pe_col = state_ops.assign_add(pe_col, pe_col_t)
+
+    def _calculate_k(po, pe_row, pe_col, name):
+      po_sum = math_ops.reduce_sum(po)
+      total = math_ops.reduce_sum(pe_row)
+      pe_sum = math_ops.reduce_sum(
+          metrics_impl._safe_div(  # pylint: disable=protected-access
+              pe_row * pe_col, total, None))
+      po_sum, pe_sum, total = (math_ops.to_double(po_sum),
+                               math_ops.to_double(pe_sum),
+                               math_ops.to_double(total))
+      # kappa = (po - pe) / (N - pe)
+      k = metrics_impl._safe_scalar_div(  # pylint: disable=protected-access
+          po_sum - pe_sum,
+          total - pe_sum,
+          name=name)
+      return k
+
+    kappa = _calculate_k(po, pe_row, pe_col, name='value')
+    update_op = _calculate_k(
+        update_po, update_pe_row, update_pe_col, name='update_op')
+
+    if metrics_collections:
+      ops.add_to_collections(metrics_collections, kappa)
+
+    if updates_collections:
+      ops.add_to_collections(updates_collections, update_op)
+
+    return kappa, update_op
+
+
 __all__ = [
     'aggregate_metric_map',
     'aggregate_metrics',
+    'cohen_kappa',
     'count',
     'precision_recall_at_equal_thresholds',
     'recall_at_precision',
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
index 7db06609de4e73fe5c18f81cef225829e9f54123..e067f08babd9a900e876545d427c91e5ff808f04 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
@@ -46,8 +46,7 @@ def _enqueue_vector(sess, queue, values, shape=None):
     shape = (1, len(values))
   dtype = queue.dtypes[0]
   sess.run(
-      queue.enqueue(constant_op.constant(
-          values, dtype=dtype, shape=shape)))
+      queue.enqueue(constant_op.constant(values, dtype=dtype, shape=shape)))
 
 
 def _binary_2d_label_to_sparse_value(labels):
@@ -79,8 +78,8 @@ def _binary_2d_label_to_sparse_value(labels):
     batch += 1
   shape = [len(labels), len(labels[0])]
   return sparse_tensor.SparseTensorValue(
-      np.array(indices, np.int64),
-      np.array(values, np.int64), np.array(shape, np.int64))
+      np.array(indices, np.int64), np.array(values, np.int64),
+      np.array(shape, np.int64))
 
 
 def _binary_2d_label_to_sparse(labels):
@@ -125,8 +124,8 @@ def _binary_3d_label_to_sparse_value(labels):
           assert label == 0
   shape = [len(labels), len(labels[0]), len(labels[0][0])]
   return sparse_tensor.SparseTensorValue(
-      np.array(indices, np.int64),
-      np.array(values, np.int64), np.array(shape, np.int64))
+      np.array(indices, np.int64), np.array(values, np.int64),
+      np.array(shape, np.int64))
 
 
 def _binary_3d_label_to_sparse(labels):
@@ -669,20 +668,18 @@ class StreamingTruePositivesTest(test.TestCase):
     for expand_predictions in [True, False]:
       for expand_labels in [True, False]:
         for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-          predictions = math_ops.cast(constant_op.constant(
-              ((1, 0, 1, 0),
-               (0, 1, 1, 1),
-               (0, 0, 0, 0))), dtype=dtype)
+          predictions = math_ops.cast(
+              constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_predictions:
             predictions = array_ops.expand_dims(predictions, 2)
-          labels = math_ops.cast(constant_op.constant(
-              ((0, 1, 1, 0),
-               (1, 0, 0, 0),
-               (0, 0, 0, 0))), dtype=dtype)
+          labels = math_ops.cast(
+              constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_labels:
             labels = array_ops.expand_dims(labels, 2)
-          tp, tp_update_op = metrics.streaming_true_positives(predictions,
-                                                              labels)
+          tp, tp_update_op = metrics.streaming_true_positives(
+              predictions, labels)
 
           with self.test_session() as sess:
             sess.run(variables.local_variables_initializer())
@@ -692,14 +689,12 @@ class StreamingTruePositivesTest(test.TestCase):
 
   def testWeighted(self):
     for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-      predictions = math_ops.cast(constant_op.constant(
-          ((1, 0, 1, 0),
-           (0, 1, 1, 1),
-           (0, 0, 0, 0))), dtype=dtype)
-      labels = math_ops.cast(constant_op.constant(
-          ((0, 1, 1, 0),
-           (1, 0, 0, 0),
-           (0, 0, 0, 0))), dtype=dtype)
+      predictions = math_ops.cast(
+          constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+          dtype=dtype)
+      labels = math_ops.cast(
+          constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+          dtype=dtype)
       tp, tp_update_op = metrics.streaming_true_positives(
           predictions, labels, weights=37.0)
 
@@ -717,28 +712,25 @@ class StreamingFalseNegativesTest(test.TestCase):
     ops.reset_default_graph()
 
   def testVars(self):
-    metrics.streaming_false_negatives((0, 1, 0),
-                                      (0, 1, 1))
+    metrics.streaming_false_negatives((0, 1, 0), (0, 1, 1))
     _assert_metric_variables(self, ('false_negatives/count:0',))
 
   def testUnweighted(self):
     for expand_predictions in [True, False]:
       for expand_labels in [True, False]:
         for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-          predictions = math_ops.cast(constant_op.constant(
-              ((1, 0, 1, 0),
-               (0, 1, 1, 1),
-               (0, 0, 0, 0))), dtype=dtype)
+          predictions = math_ops.cast(
+              constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_predictions:
             predictions = array_ops.expand_dims(predictions, 2)
-          labels = math_ops.cast(constant_op.constant(
-              ((0, 1, 1, 0),
-               (1, 0, 0, 0),
-               (0, 0, 0, 0))), dtype=dtype)
+          labels = math_ops.cast(
+              constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_labels:
             labels = array_ops.expand_dims(labels, 2)
-          fn, fn_update_op = metrics.streaming_false_negatives(predictions,
-                                                               labels)
+          fn, fn_update_op = metrics.streaming_false_negatives(
+              predictions, labels)
 
           with self.test_session() as sess:
             sess.run(variables.local_variables_initializer())
@@ -748,14 +740,12 @@ class StreamingFalseNegativesTest(test.TestCase):
 
   def testWeighted(self):
     for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-      predictions = math_ops.cast(constant_op.constant(
-          ((1, 0, 1, 0),
-           (0, 1, 1, 1),
-           (0, 0, 0, 0))), dtype=dtype)
-      labels = math_ops.cast(constant_op.constant(
-          ((0, 1, 1, 0),
-           (1, 0, 0, 0),
-           (0, 0, 0, 0))), dtype=dtype)
+      predictions = math_ops.cast(
+          constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+          dtype=dtype)
+      labels = math_ops.cast(
+          constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+          dtype=dtype)
       fn, fn_update_op = metrics.streaming_false_negatives(
           predictions, labels, weights=((3.0,), (5.0,), (7.0,)))
 
@@ -773,28 +763,25 @@ class StreamingFalsePositivesTest(test.TestCase):
     ops.reset_default_graph()
 
   def testVars(self):
-    metrics.streaming_false_positives((0, 1, 0),
-                                      (0, 1, 1))
+    metrics.streaming_false_positives((0, 1, 0), (0, 1, 1))
     _assert_metric_variables(self, ('false_positives/count:0',))
 
   def testUnweighted(self):
     for expand_predictions in [True, False]:
       for expand_labels in [True, False]:
         for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-          predictions = math_ops.cast(constant_op.constant(
-              ((1, 0, 1, 0),
-               (0, 1, 1, 1),
-               (0, 0, 0, 0))), dtype=dtype)
+          predictions = math_ops.cast(
+              constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_predictions:
             predictions = array_ops.expand_dims(predictions, 2)
-          labels = math_ops.cast(constant_op.constant(
-              ((0, 1, 1, 0),
-               (1, 0, 0, 0),
-               (0, 0, 0, 0))), dtype=dtype)
+          labels = math_ops.cast(
+              constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_labels:
             labels = array_ops.expand_dims(labels, 2)
-          fp, fp_update_op = metrics.streaming_false_positives(predictions,
-                                                               labels)
+          fp, fp_update_op = metrics.streaming_false_positives(
+              predictions, labels)
 
           with self.test_session() as sess:
             sess.run(variables.local_variables_initializer())
@@ -804,20 +791,17 @@ class StreamingFalsePositivesTest(test.TestCase):
 
   def testWeighted(self):
     for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-      predictions = math_ops.cast(constant_op.constant(
-          ((1, 0, 1, 0),
-           (0, 1, 1, 1),
-           (0, 0, 0, 0))), dtype=dtype)
-      labels = math_ops.cast(constant_op.constant(
-          ((0, 1, 1, 0),
-           (1, 0, 0, 0),
-           (0, 0, 0, 0))), dtype=dtype)
+      predictions = math_ops.cast(
+          constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+          dtype=dtype)
+      labels = math_ops.cast(
+          constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+          dtype=dtype)
       fp, fp_update_op = metrics.streaming_false_positives(
           predictions,
           labels,
-          weights=((1.0, 2.0, 3.0, 5.0),
-                   (7.0, 11.0, 13.0, 17.0),
-                   (19.0, 23.0, 29.0, 31.0)))
+          weights=((1.0, 2.0, 3.0, 5.0), (7.0, 11.0, 13.0, 17.0), (19.0, 23.0,
+                                                                   29.0, 31.0)))
 
       with self.test_session() as sess:
         sess.run(variables.local_variables_initializer())
@@ -833,28 +817,25 @@ class StreamingTrueNegativesTest(test.TestCase):
     ops.reset_default_graph()
 
   def testVars(self):
-    metrics.streaming_true_negatives((0, 1, 0),
-                                     (0, 1, 1))
+    metrics.streaming_true_negatives((0, 1, 0), (0, 1, 1))
     _assert_metric_variables(self, ('true_negatives/count:0',))
 
   def testUnweighted(self):
     for expand_predictions in [True, False]:
       for expand_labels in [True, False]:
         for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-          predictions = math_ops.cast(constant_op.constant(
-              ((1, 0, 1, 0),
-               (0, 1, 1, 1),
-               (0, 0, 0, 0))), dtype=dtype)
+          predictions = math_ops.cast(
+              constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_predictions:
             predictions = array_ops.expand_dims(predictions, 2)
-          labels = math_ops.cast(constant_op.constant(
-              ((0, 1, 1, 0),
-               (1, 0, 0, 0),
-               (0, 0, 0, 0))), dtype=dtype)
+          labels = math_ops.cast(
+              constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+              dtype=dtype)
           if expand_labels:
             labels = array_ops.expand_dims(labels, 2)
-          tn, tn_update_op = metrics.streaming_true_negatives(predictions,
-                                                              labels)
+          tn, tn_update_op = metrics.streaming_true_negatives(
+              predictions, labels)
 
           with self.test_session() as sess:
             sess.run(variables.local_variables_initializer())
@@ -864,14 +845,12 @@ class StreamingTrueNegativesTest(test.TestCase):
 
   def testWeighted(self):
     for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
-      predictions = math_ops.cast(constant_op.constant(
-          ((1, 0, 1, 0),
-           (0, 1, 1, 1),
-           (0, 0, 0, 0))), dtype=dtype)
-      labels = math_ops.cast(constant_op.constant(
-          ((0, 1, 1, 0),
-           (1, 0, 0, 0),
-           (0, 0, 0, 0))), dtype=dtype)
+      predictions = math_ops.cast(
+          constant_op.constant(((1, 0, 1, 0), (0, 1, 1, 1), (0, 0, 0, 0))),
+          dtype=dtype)
+      labels = math_ops.cast(
+          constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0))),
+          dtype=dtype)
       tn, tn_update_op = metrics.streaming_true_negatives(
           predictions, labels, weights=((0.0, 2.0, 3.0, 5.0),))
 
@@ -894,12 +873,9 @@ class StreamingTruePositivesAtThresholdsTest(test.TestCase):
     _assert_metric_variables(self, ('true_positives:0',))
 
   def testUnweighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     tp, tp_update_op = metrics.streaming_true_positives_at_thresholds(
         predictions, labels, thresholds=(0.15, 0.5, 0.85))
 
@@ -910,12 +886,9 @@ class StreamingTruePositivesAtThresholdsTest(test.TestCase):
       self.assertAllEqual((3, 1, 0), tp.eval())
 
   def testWeighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     tp, tp_update_op = metrics.streaming_true_positives_at_thresholds(
         predictions, labels, weights=37.0, thresholds=(0.15, 0.5, 0.85))
 
@@ -937,16 +910,14 @@ class StreamingFalseNegativesAtThresholdsTest(test.TestCase):
         (0.0, 1.0, 0.0), (0, 1, 1), thresholds=(
             0.15,
             0.5,
-            0.85,))
+            0.85,
+        ))
     _assert_metric_variables(self, ('false_negatives:0',))
 
   def testUnweighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     fn, fn_update_op = metrics.streaming_false_negatives_at_thresholds(
         predictions, labels, thresholds=(0.15, 0.5, 0.85))
 
@@ -957,12 +928,9 @@ class StreamingFalseNegativesAtThresholdsTest(test.TestCase):
       self.assertAllEqual((0, 2, 3), fn.eval())
 
   def testWeighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     fn, fn_update_op = metrics.streaming_false_negatives_at_thresholds(
         predictions,
         labels,
@@ -988,12 +956,9 @@ class StreamingFalsePositivesAtThresholdsTest(test.TestCase):
     _assert_metric_variables(self, ('false_positives:0',))
 
   def testUnweighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     fp, fp_update_op = metrics.streaming_false_positives_at_thresholds(
         predictions, labels, thresholds=(0.15, 0.5, 0.85))
 
@@ -1004,18 +969,14 @@ class StreamingFalsePositivesAtThresholdsTest(test.TestCase):
       self.assertAllEqual((7, 4, 2), fp.eval())
 
   def testWeighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     fp, fp_update_op = metrics.streaming_false_positives_at_thresholds(
         predictions,
         labels,
-        weights=((1.0, 2.0, 3.0, 5.0),
-                 (7.0, 11.0, 13.0, 17.0),
-                 (19.0, 23.0, 29.0, 31.0)),
+        weights=((1.0, 2.0, 3.0, 5.0), (7.0, 11.0, 13.0, 17.0), (19.0, 23.0,
+                                                                 29.0, 31.0)),
         thresholds=(0.15, 0.5, 0.85))
 
     with self.test_session() as sess:
@@ -1037,12 +998,9 @@ class StreamingTrueNegativesAtThresholdsTest(test.TestCase):
     _assert_metric_variables(self, ('true_negatives:0',))
 
   def testUnweighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     tn, tn_update_op = metrics.streaming_true_negatives_at_thresholds(
         predictions, labels, thresholds=(0.15, 0.5, 0.85))
 
@@ -1053,12 +1011,9 @@ class StreamingTrueNegativesAtThresholdsTest(test.TestCase):
       self.assertAllEqual((2, 5, 7), tn.eval())
 
   def testWeighted(self):
-    predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
-                                        (0.2, 0.9, 0.7, 0.6),
-                                        (0.1, 0.2, 0.4, 0.3)))
-    labels = constant_op.constant(((0, 1, 1, 0),
-                                   (1, 0, 0, 0),
-                                   (0, 0, 0, 0)))
+    predictions = constant_op.constant(
+        ((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6), (0.1, 0.2, 0.4, 0.3)))
+    labels = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0)))
     tn, tn_update_op = metrics.streaming_true_negatives_at_thresholds(
         predictions,
         labels,
@@ -1393,8 +1348,7 @@ class StreamingFPRTest(test.TestCase):
         (10, 3), maxval=1, dtype=dtypes_lib.int64, seed=1)
     labels = random_ops.random_uniform(
         (10, 3), maxval=2, dtype=dtypes_lib.int64, seed=2)
-    fpr, update_op = metrics.streaming_false_positive_rate(
-        predictions, labels)
+    fpr, update_op = metrics.streaming_false_positive_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1413,8 +1367,7 @@ class StreamingFPRTest(test.TestCase):
 
     predictions = constant_op.constant(np_inputs)
     labels = constant_op.constant(np_inputs)
-    fpr, update_op = metrics.streaming_false_positive_rate(
-        predictions, labels)
+    fpr, update_op = metrics.streaming_false_positive_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1424,8 +1377,7 @@ class StreamingFPRTest(test.TestCase):
   def testSomeCorrect(self):
     predictions = constant_op.constant([1, 0, 1, 0], shape=(1, 4))
     labels = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
-    fpr, update_op = metrics.streaming_false_positive_rate(
-        predictions, labels)
+    fpr, update_op = metrics.streaming_false_positive_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1467,8 +1419,7 @@ class StreamingFPRTest(test.TestCase):
 
     predictions = constant_op.constant(np_inputs)
     labels = constant_op.constant(1 - np_inputs)
-    fpr, update_op = metrics.streaming_false_positive_rate(
-        predictions, labels)
+    fpr, update_op = metrics.streaming_false_positive_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1478,8 +1429,7 @@ class StreamingFPRTest(test.TestCase):
   def testZeroFalsePositivesAndTrueNegativesGivesZeroFPR(self):
     predictions = array_ops.ones((1, 4))
     labels = array_ops.ones((1, 4))
-    fpr, update_op = metrics.streaming_false_positive_rate(
-        predictions, labels)
+    fpr, update_op = metrics.streaming_false_positive_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1521,8 +1471,7 @@ class StreamingFNRTest(test.TestCase):
         (10, 3), maxval=1, dtype=dtypes_lib.int64, seed=1)
     labels = random_ops.random_uniform(
         (10, 3), maxval=2, dtype=dtypes_lib.int64, seed=2)
-    fnr, update_op = metrics.streaming_false_negative_rate(
-        predictions, labels)
+    fnr, update_op = metrics.streaming_false_negative_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1541,8 +1490,7 @@ class StreamingFNRTest(test.TestCase):
 
     predictions = constant_op.constant(np_inputs)
     labels = constant_op.constant(np_inputs)
-    fnr, update_op = metrics.streaming_false_negative_rate(
-        predictions, labels)
+    fnr, update_op = metrics.streaming_false_negative_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1552,8 +1500,7 @@ class StreamingFNRTest(test.TestCase):
   def testSomeCorrect(self):
     predictions = constant_op.constant([1, 0, 1, 0], shape=(1, 4))
     labels = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
-    fnr, update_op = metrics.streaming_false_negative_rate(
-        predictions, labels)
+    fnr, update_op = metrics.streaming_false_negative_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1595,8 +1542,7 @@ class StreamingFNRTest(test.TestCase):
 
     predictions = constant_op.constant(np_inputs)
     labels = constant_op.constant(1 - np_inputs)
-    fnr, update_op = metrics.streaming_false_negative_rate(
-        predictions, labels)
+    fnr, update_op = metrics.streaming_false_negative_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1606,8 +1552,7 @@ class StreamingFNRTest(test.TestCase):
   def testZeroFalseNegativesAndTruePositivesGivesZeroFNR(self):
     predictions = array_ops.zeros((1, 4))
     labels = array_ops.zeros((1, 4))
-    fnr, update_op = metrics.streaming_false_negative_rate(
-        predictions, labels)
+    fnr, update_op = metrics.streaming_false_negative_rate(predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -1944,16 +1889,17 @@ class StreamingAUCTest(test.TestCase):
         enqueue_ops[i].append(x_queue.enqueue(x_batches[i, :]))
       return x_queue.dequeue()
 
-    for weights in (None, np.ones(num_samples), np.random.exponential(
-        scale=1.0, size=num_samples)):
+    for weights in (None, np.ones(num_samples),
+                    np.random.exponential(scale=1.0, size=num_samples)):
       expected_auc = _np_auc(predictions, labels, weights)
 
       with self.test_session() as sess:
         enqueue_ops = [[] for i in range(num_batches)]
         tf_predictions = _enqueue_as_batches(predictions, enqueue_ops)
         tf_labels = _enqueue_as_batches(labels, enqueue_ops)
-        tf_weights = (_enqueue_as_batches(weights, enqueue_ops) if
-                      weights is not None else None)
+        tf_weights = (
+            _enqueue_as_batches(weights, enqueue_ops)
+            if weights is not None else None)
 
         for i in range(num_batches):
           sess.run(enqueue_ops[i])
@@ -1985,17 +1931,18 @@ class StreamingDynamicAUCTest(test.TestCase):
   def testUnknownCurve(self):
     with self.assertRaisesRegexp(
         ValueError, 'curve must be either ROC or PR, TEST_CURVE unknown'):
-      metrics.streaming_dynamic_auc(labels=array_ops.ones((10, 1)),
-                                    predictions=array_ops.ones((10, 1)),
-                                    curve='TEST_CURVE')
+      metrics.streaming_dynamic_auc(
+          labels=array_ops.ones((10, 1)),
+          predictions=array_ops.ones((10, 1)),
+          curve='TEST_CURVE')
 
   def testVars(self):
     metrics.streaming_dynamic_auc(
         labels=array_ops.ones((10, 1)), predictions=array_ops.ones((10, 1)))
-    _assert_metric_variables(self, ['dynamic_auc/concat_labels/array:0',
-                                    'dynamic_auc/concat_labels/size:0',
-                                    'dynamic_auc/concat_preds/array:0',
-                                    'dynamic_auc/concat_preds/size:0'])
+    _assert_metric_variables(self, [
+        'dynamic_auc/concat_labels/array:0', 'dynamic_auc/concat_labels/size:0',
+        'dynamic_auc/concat_preds/array:0', 'dynamic_auc/concat_preds/size:0'
+    ])
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -2049,8 +1996,8 @@ class StreamingDynamicAUCTest(test.TestCase):
 
   def testNonZeroOnePredictions(self):
     with self.test_session() as sess:
-      predictions = constant_op.constant([2.5, -2.5, 2.5, -2.5],
-                                         dtype=dtypes_lib.float32)
+      predictions = constant_op.constant(
+          [2.5, -2.5, 2.5, -2.5], dtype=dtypes_lib.float32)
       labels = constant_op.constant([1, 0, 1, 0])
       auc, update_op = metrics.streaming_dynamic_auc(labels, predictions)
       sess.run(variables.local_variables_initializer())
@@ -2122,9 +2069,10 @@ class StreamingDynamicAUCTest(test.TestCase):
     num_batches = 100
     labels = np.array([])
     predictions = np.array([])
-    tf_labels = variables.Variable(array_ops.ones(batch_size, dtypes_lib.int32),
-                                   collections=[ops.GraphKeys.LOCAL_VARIABLES],
-                                   dtype=dtypes_lib.int32)
+    tf_labels = variables.Variable(
+        array_ops.ones(batch_size, dtypes_lib.int32),
+        collections=[ops.GraphKeys.LOCAL_VARIABLES],
+        dtype=dtypes_lib.int32)
     tf_predictions = variables.Variable(
         array_ops.ones(batch_size),
         collections=[ops.GraphKeys.LOCAL_VARIABLES],
@@ -2195,8 +2143,7 @@ class StreamingPrecisionRecallAtEqualThresholdsTest(test.TestCase):
       gotten_result: A PrecisionRecallData object.
     """
     gotten_dict = {k: t.eval() for k, t in gotten_result._asdict().items()}
-    self.assertItemsEqual(
-        list(expected_dict.keys()), list(gotten_dict.keys()))
+    self.assertItemsEqual(list(expected_dict.keys()), list(gotten_dict.keys()))
 
     for key, expected_values in expected_dict.items():
       self.assertAllClose(expected_values, gotten_dict[key])
@@ -2261,60 +2208,65 @@ class StreamingPrecisionRecallAtEqualThresholdsTest(test.TestCase):
         sess.run(update_op)
 
       # Then verify idempotency.
-      initial_result = {k: value.eval().tolist() for k, value in
-                        result._asdict().items()}
+      initial_result = {
+          k: value.eval().tolist()
+          for k, value in result._asdict().items()
+      }
       for _ in range(3):
         self._testResultsEqual(initial_result, result)
 
   def testAllTruePositives(self):
-    self._testCase([[1]], [[True]], {
-        'tp': [1, 1, 1],
-        'fp': [0, 0, 0],
-        'tn': [0, 0, 0],
-        'fn': [0, 0, 0],
-        'precision': [1.0, 1.0, 1.0],
-        'recall': [1.0, 1.0, 1.0],
-        'thresholds': [0.0, 0.5, 1.0],
-    })
+    self._testCase(
+        [[1]], [[True]], {
+            'tp': [1, 1, 1],
+            'fp': [0, 0, 0],
+            'tn': [0, 0, 0],
+            'fn': [0, 0, 0],
+            'precision': [1.0, 1.0, 1.0],
+            'recall': [1.0, 1.0, 1.0],
+            'thresholds': [0.0, 0.5, 1.0],
+        })
 
   def testAllTrueNegatives(self):
-    self._testCase([[0]], [[False]], {
-        'tp': [0, 0, 0],
-        'fp': [1, 0, 0],
-        'tn': [0, 1, 1],
-        'fn': [0, 0, 0],
-        'precision': [0.0, 0.0, 0.0],
-        'recall': [0.0, 0.0, 0.0],
-        'thresholds': [0.0, 0.5, 1.0],
-    })
+    self._testCase(
+        [[0]], [[False]], {
+            'tp': [0, 0, 0],
+            'fp': [1, 0, 0],
+            'tn': [0, 1, 1],
+            'fn': [0, 0, 0],
+            'precision': [0.0, 0.0, 0.0],
+            'recall': [0.0, 0.0, 0.0],
+            'thresholds': [0.0, 0.5, 1.0],
+        })
 
   def testAllFalsePositives(self):
-    self._testCase([[1]], [[False]], {
-        'tp': [0, 0, 0],
-        'fp': [1, 1, 1],
-        'tn': [0, 0, 0],
-        'fn': [0, 0, 0],
-        'precision': [0.0, 0.0, 0.0],
-        'recall': [0.0, 0.0, 0.0],
-        'thresholds': [0.0, 0.5, 1.0],
-    })
+    self._testCase(
+        [[1]], [[False]], {
+            'tp': [0, 0, 0],
+            'fp': [1, 1, 1],
+            'tn': [0, 0, 0],
+            'fn': [0, 0, 0],
+            'precision': [0.0, 0.0, 0.0],
+            'recall': [0.0, 0.0, 0.0],
+            'thresholds': [0.0, 0.5, 1.0],
+        })
 
   def testAllFalseNegatives(self):
-    self._testCase([[0]], [[True]], {
-        'tp': [1, 0, 0],
-        'fp': [0, 0, 0],
-        'tn': [0, 0, 0],
-        'fn': [0, 1, 1],
-        'precision': [1.0, 0.0, 0.0],
-        'recall': [1.0, 0.0, 0.0],
-        'thresholds': [0.0, 0.5, 1.0],
-    })
+    self._testCase(
+        [[0]], [[True]], {
+            'tp': [1, 0, 0],
+            'fp': [0, 0, 0],
+            'tn': [0, 0, 0],
+            'fn': [0, 1, 1],
+            'precision': [1.0, 0.0, 0.0],
+            'recall': [1.0, 0.0, 0.0],
+            'thresholds': [0.0, 0.5, 1.0],
+        })
 
   def testManyValues(self):
     self._testCase(
         [[0.2, 0.3, 0.4, 0.6, 0.7, 0.8]],
-        [[True, False, False, True, True, True]],
-        {
+        [[True, False, False, True, True, True]], {
             'tp': [4, 3, 0],
             'fp': [2, 0, 0],
             'tn': [0, 2, 2],
@@ -2327,8 +2279,7 @@ class StreamingPrecisionRecallAtEqualThresholdsTest(test.TestCase):
   def testManyValuesWithWeights(self):
     self._testCase(
         [[0.2, 0.3, 0.4, 0.6, 0.7, 0.8]],
-        [[True, False, False, True, True, True]],
-        {
+        [[True, False, False, True, True, True]], {
             'tp': [1.5, 1.5, 0.0],
             'fp': [2.5, 0.0, 0.0],
             'tn': [0.0, 2.5, 2.5],
@@ -2644,11 +2595,10 @@ class StreamingPrecisionRecallThresholdsTest(test.TestCase):
     labels = random_ops.random_uniform(
         (10, 3), maxval=2, dtype=dtypes_lib.int64, seed=2)
     thresholds = [0, 0.5, 1.0]
-    prec, prec_op = metrics.streaming_precision_at_thresholds(predictions,
-                                                              labels,
-                                                              thresholds)
-    rec, rec_op = metrics.streaming_recall_at_thresholds(predictions, labels,
-                                                         thresholds)
+    prec, prec_op = metrics.streaming_precision_at_thresholds(
+        predictions, labels, thresholds)
+    rec, rec_op = metrics.streaming_recall_at_thresholds(
+        predictions, labels, thresholds)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -2672,11 +2622,10 @@ class StreamingPrecisionRecallThresholdsTest(test.TestCase):
       predictions = constant_op.constant(inputs, dtype=dtypes_lib.float32)
       labels = constant_op.constant(inputs)
       thresholds = [0.5]
-      prec, prec_op = metrics.streaming_precision_at_thresholds(predictions,
-                                                                labels,
-                                                                thresholds)
-      rec, rec_op = metrics.streaming_recall_at_thresholds(predictions, labels,
-                                                           thresholds)
+      prec, prec_op = metrics.streaming_precision_at_thresholds(
+          predictions, labels, thresholds)
+      rec, rec_op = metrics.streaming_recall_at_thresholds(
+          predictions, labels, thresholds)
 
       sess.run(variables.local_variables_initializer())
       sess.run([prec_op, rec_op])
@@ -2690,11 +2639,10 @@ class StreamingPrecisionRecallThresholdsTest(test.TestCase):
           [1, 0, 1, 0], shape=(1, 4), dtype=dtypes_lib.float32)
       labels = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
       thresholds = [0.5]
-      prec, prec_op = metrics.streaming_precision_at_thresholds(predictions,
-                                                                labels,
-                                                                thresholds)
-      rec, rec_op = metrics.streaming_recall_at_thresholds(predictions, labels,
-                                                           thresholds)
+      prec, prec_op = metrics.streaming_precision_at_thresholds(
+          predictions, labels, thresholds)
+      rec, rec_op = metrics.streaming_recall_at_thresholds(
+          predictions, labels, thresholds)
 
       sess.run(variables.local_variables_initializer())
       sess.run([prec_op, rec_op])
@@ -2709,11 +2657,10 @@ class StreamingPrecisionRecallThresholdsTest(test.TestCase):
       predictions = constant_op.constant(inputs, dtype=dtypes_lib.float32)
       labels = constant_op.constant(1 - inputs, dtype=dtypes_lib.float32)
       thresholds = [0.5]
-      prec, prec_op = metrics.streaming_precision_at_thresholds(predictions,
-                                                                labels,
-                                                                thresholds)
-      rec, rec_op = metrics.streaming_recall_at_thresholds(predictions, labels,
-                                                           thresholds)
+      prec, prec_op = metrics.streaming_precision_at_thresholds(
+          predictions, labels, thresholds)
+      rec, rec_op = metrics.streaming_recall_at_thresholds(
+          predictions, labels, thresholds)
 
       sess.run(variables.local_variables_initializer())
       sess.run([prec_op, rec_op])
@@ -2779,11 +2726,10 @@ class StreamingPrecisionRecallThresholdsTest(test.TestCase):
           [1, 0, 1, 0], shape=(1, 4), dtype=dtypes_lib.float32)
       labels = constant_op.constant([0, 1, 1, 1], shape=(1, 4))
       thresholds = [-1.0, 2.0]  # lower/higher than any values
-      prec, prec_op = metrics.streaming_precision_at_thresholds(predictions,
-                                                                labels,
-                                                                thresholds)
-      rec, rec_op = metrics.streaming_recall_at_thresholds(predictions, labels,
-                                                           thresholds)
+      prec, prec_op = metrics.streaming_precision_at_thresholds(
+          predictions, labels, thresholds)
+      rec, rec_op = metrics.streaming_recall_at_thresholds(
+          predictions, labels, thresholds)
 
       prec_low = prec[0]
       prec_high = prec[1]
@@ -2803,11 +2749,10 @@ class StreamingPrecisionRecallThresholdsTest(test.TestCase):
       predictions = array_ops.zeros([4], dtype=dtypes_lib.float32)
       labels = array_ops.zeros([4])
       thresholds = [0.5]
-      prec, prec_op = metrics.streaming_precision_at_thresholds(predictions,
-                                                                labels,
-                                                                thresholds)
-      rec, rec_op = metrics.streaming_recall_at_thresholds(predictions, labels,
-                                                           thresholds)
+      prec, prec_op = metrics.streaming_precision_at_thresholds(
+          predictions, labels, thresholds)
+      rec, rec_op = metrics.streaming_recall_at_thresholds(
+          predictions, labels, thresholds)
 
       sess.run(variables.local_variables_initializer())
       sess.run([prec_op, rec_op])
@@ -2872,12 +2817,10 @@ class StreamingPrecisionRecallThresholdsTest(test.TestCase):
       tf_predictions = predictions_queue.dequeue()
       tf_labels = labels_queue.dequeue()
 
-      prec, prec_op = metrics.streaming_precision_at_thresholds(tf_predictions,
-                                                                tf_labels,
-                                                                thresholds)
-      rec, rec_op = metrics.streaming_recall_at_thresholds(tf_predictions,
-                                                           tf_labels,
-                                                           thresholds)
+      prec, prec_op = metrics.streaming_precision_at_thresholds(
+          tf_predictions, tf_labels, thresholds)
+      rec, rec_op = metrics.streaming_recall_at_thresholds(
+          tf_predictions, tf_labels, thresholds)
 
       sess.run(variables.local_variables_initializer())
       for _ in range(int(num_samples / batch_size)):
@@ -2921,8 +2864,7 @@ class StreamingFPRThresholdsTest(test.TestCase):
         labels=array_ops.ones((10, 1)),
         thresholds=[0, 0.5, 1.0],
         updates_collections=[my_collection_name])
-    self.assertListEqual(
-        ops.get_collection(my_collection_name), [update_op])
+    self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_uniform(
@@ -3162,7 +3104,7 @@ class RecallAtPrecisionTest(test.TestCase):
     labels = random_ops.random_uniform(
         (10, 3), maxval=2, dtype=dtypes_lib.int64, seed=2)
     recall, update_op = metrics.recall_at_precision(
-        predictions, labels, precision=0.7)
+        labels, predictions, precision=0.7)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -3182,7 +3124,7 @@ class RecallAtPrecisionTest(test.TestCase):
     predictions = constant_op.constant(inputs, dtype=dtypes_lib.float32)
     labels = constant_op.constant(inputs)
     recall, update_op = metrics.recall_at_precision(
-        predictions, labels, precision=1.0)
+        labels, predictions, precision=1.0)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -3197,7 +3139,7 @@ class RecallAtPrecisionTest(test.TestCase):
         predictions_values, dtype=dtypes_lib.float32)
     labels = constant_op.constant(labels_values)
     recall, update_op = metrics.recall_at_precision(
-        predictions, labels, precision=0.8)
+        labels, predictions, precision=0.8)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -3212,7 +3154,7 @@ class RecallAtPrecisionTest(test.TestCase):
         predictions_values, dtype=dtypes_lib.float32)
     labels = constant_op.constant(labels_values)
     recall, update_op = metrics.recall_at_precision(
-        predictions, labels, precision=0.4)
+        labels, predictions, precision=0.4)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -3230,7 +3172,7 @@ class RecallAtPrecisionTest(test.TestCase):
     labels = constant_op.constant(labels_values)
     weights = constant_op.constant(weights_values)
     recall, update_op = metrics.recall_at_precision(
-        predictions, labels, weights=weights, precision=0.4)
+        labels, predictions, weights=weights, precision=0.4)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -3271,8 +3213,7 @@ class StreamingFNRThresholdsTest(test.TestCase):
         labels=array_ops.ones((10, 1)),
         thresholds=[0, 0.5, 1.0],
         updates_collections=[my_collection_name])
-    self.assertListEqual(
-        ops.get_collection(my_collection_name), [update_op])
+    self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_uniform(
@@ -3492,8 +3433,7 @@ class StreamingRecallAtKTest(test.TestCase):
   def testVars(self):
     metrics.streaming_recall_at_k(
         predictions=array_ops.ones((self._batch_size, self._num_classes)),
-        labels=array_ops.ones(
-            (self._batch_size,), dtype=dtypes_lib.int32),
+        labels=array_ops.ones((self._batch_size,), dtype=dtypes_lib.int32),
         k=1)
     _assert_metric_variables(self,
                              ('recall_at_1/count:0', 'recall_at_1/total:0'))
@@ -3502,8 +3442,7 @@ class StreamingRecallAtKTest(test.TestCase):
     my_collection_name = '__metrics__'
     mean, _ = metrics.streaming_recall_at_k(
         predictions=array_ops.ones((self._batch_size, self._num_classes)),
-        labels=array_ops.ones(
-            (self._batch_size,), dtype=dtypes_lib.int32),
+        labels=array_ops.ones((self._batch_size,), dtype=dtypes_lib.int32),
         k=1,
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
@@ -3512,8 +3451,7 @@ class StreamingRecallAtKTest(test.TestCase):
     my_collection_name = '__updates__'
     _, update_op = metrics.streaming_recall_at_k(
         predictions=array_ops.ones((self._batch_size, self._num_classes)),
-        labels=array_ops.ones(
-            (self._batch_size,), dtype=dtypes_lib.int32),
+        labels=array_ops.ones((self._batch_size,), dtype=dtypes_lib.int32),
         k=1,
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
@@ -3715,9 +3653,17 @@ class StreamingSparsePrecisionTest(test.TestCase):
       # top_k_predictions has rank < 2.
       top_k_predictions = [9, 4, 6, 2, 0]
       sp_labels = sparse_tensor.SparseTensorValue(
-          indices=np.array([[0,], [1,], [2,]], np.int64),
+          indices=np.array([[
+              0,
+          ], [
+              1,
+          ], [
+              2,
+          ]], np.int64),
           values=np.array([2, 7, 8], np.int64),
-          dense_shape=np.array([10,], np.int64))
+          dense_shape=np.array([
+              10,
+          ], np.int64))
 
       with self.assertRaises(ValueError):
         precision, _ = metrics.streaming_sparse_precision_at_top_k(
@@ -3774,8 +3720,9 @@ class StreamingSparsePrecisionTest(test.TestCase):
     # average of the 2 examples.
     labels = np.array([labels_ex1, labels_ex2], dtype=np.int64)
     predictions = (predictions_ex1, predictions_ex2)
-    streaming_precision = [(ex1 + ex2) / 2
-                           for ex1, ex2 in zip(precision_ex1, precision_ex2)]
+    streaming_precision = [
+        (ex1 + ex2) / 2 for ex1, ex2 in zip(precision_ex1, precision_ex2)
+    ]
     streaming_average_precision = [
         (ex1 + ex2) / 2
         for ex1, ex2 in zip(avg_precision_ex1, avg_precision_ex2)
@@ -3835,29 +3782,29 @@ class StreamingSparsePrecisionTest(test.TestCase):
           (predictions_top_k_ex1[:k],), labels, expected=avg_precision_ex1[i])
 
   def test_average_precision_at_top_k_static_shape_check(self):
-    predictions_top_k = array_ops.placeholder(shape=(2, None),
-                                              dtype=dtypes_lib.int64)
+    predictions_top_k = array_ops.placeholder(
+        shape=(2, None), dtype=dtypes_lib.int64)
     labels = np.array(((1,), (2,)), dtype=np.int64)
     # Fails due to non-static predictions_idx shape.
     with self.assertRaises(ValueError):
-      metric_ops.streaming_sparse_average_precision_at_top_k(predictions_top_k,
-                                                             labels)
+      metric_ops.streaming_sparse_average_precision_at_top_k(
+          predictions_top_k, labels)
 
     predictions_top_k = (2, 1)
     # Fails since rank of predictions_idx is less than one.
     with self.assertRaises(ValueError):
-      metric_ops.streaming_sparse_average_precision_at_top_k(predictions_top_k,
-                                                             labels)
+      metric_ops.streaming_sparse_average_precision_at_top_k(
+          predictions_top_k, labels)
     predictions_top_k = ((2,), (1,))
     # Valid static shape.
-    metric_ops.streaming_sparse_average_precision_at_top_k(predictions_top_k,
-                                                           labels)
+    metric_ops.streaming_sparse_average_precision_at_top_k(
+        predictions_top_k, labels)
 
   def test_one_label_at_k1_nan(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     top_k_predictions = [[3], [3]]
-    sparse_labels = _binary_2d_label_to_sparse_value(
-        [[0, 0, 0, 1], [0, 0, 1, 0]])
+    sparse_labels = _binary_2d_label_to_sparse_value([[0, 0, 0, 1],
+                                                      [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
 
     for labels in (sparse_labels, dense_labels):
@@ -3871,8 +3818,8 @@ class StreamingSparsePrecisionTest(test.TestCase):
   def test_one_label_at_k1(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     top_k_predictions = [[3], [3]]
-    sparse_labels = _binary_2d_label_to_sparse_value(
-        [[0, 0, 0, 1], [0, 0, 1, 0]])
+    sparse_labels = _binary_2d_label_to_sparse_value([[0, 0, 0, 1],
+                                                      [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
 
     for labels in (sparse_labels, dense_labels):
@@ -3971,8 +3918,8 @@ class StreamingSparsePrecisionTest(test.TestCase):
         [5, 7, 2, 9, 6],
     ]
     sp_labels = sparse_tensor.SparseTensorValue(
-        indices=[[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 1], [1, 2],
-                 [1, 3]],
+        indices=[[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 1], [1, 2], [1,
+                                                                          3]],
         # values -1 and 10 are outside the [0, n_classes) range and are ignored.
         values=np.array([2, 7, -1, 8, 1, 2, 5, 10], np.int64),
         dense_shape=[2, 4])
@@ -4324,8 +4271,8 @@ class StreamingSparseRecallTest(test.TestCase):
   def test_one_label_at_k1_nan(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     top_k_predictions = [[3], [3]]
-    sparse_labels = _binary_2d_label_to_sparse_value(
-        [[0, 0, 0, 1], [0, 0, 1, 0]])
+    sparse_labels = _binary_2d_label_to_sparse_value([[0, 0, 0, 1],
+                                                      [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
 
     # Classes 0,1 have 0 labels, 0 predictions, classes -1 and 4 are out of
@@ -4340,8 +4287,8 @@ class StreamingSparseRecallTest(test.TestCase):
   def test_one_label_at_k1_no_predictions(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     top_k_predictions = [[3], [3]]
-    sparse_labels = _binary_2d_label_to_sparse_value(
-        [[0, 0, 0, 1], [0, 0, 1, 0]])
+    sparse_labels = _binary_2d_label_to_sparse_value([[0, 0, 0, 1],
+                                                      [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
 
     for labels in (sparse_labels, dense_labels):
@@ -4354,8 +4301,8 @@ class StreamingSparseRecallTest(test.TestCase):
   def test_one_label_at_k1(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     top_k_predictions = [[3], [3]]
-    sparse_labels = _binary_2d_label_to_sparse_value(
-        [[0, 0, 0, 1], [0, 0, 1, 0]])
+    sparse_labels = _binary_2d_label_to_sparse_value([[0, 0, 0, 1],
+                                                      [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
 
     for labels in (sparse_labels, dense_labels):
@@ -4374,8 +4321,8 @@ class StreamingSparseRecallTest(test.TestCase):
   def test_one_label_at_k1_weighted(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
     top_k_predictions = [[3], [3]]
-    sparse_labels = _binary_2d_label_to_sparse_value(
-        [[0, 0, 0, 1], [0, 0, 1, 0]])
+    sparse_labels = _binary_2d_label_to_sparse_value([[0, 0, 0, 1],
+                                                      [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
 
     for labels in (sparse_labels, dense_labels):
@@ -4647,8 +4594,8 @@ class StreamingSparseRecallTest(test.TestCase):
         [5, 7, 2, 9, 6],
     ]
     sp_labels = sparse_tensor.SparseTensorValue(
-        indices=[[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 1], [1, 2],
-                 [1, 3]],
+        indices=[[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 1], [1, 2], [1,
+                                                                          3]],
         # values -1 and 10 are outside the [0, n_classes) range.
         values=np.array([2, 7, -1, 8, 1, 2, 5, 10], np.int64),
         dense_shape=[2, 4])
@@ -4661,10 +4608,7 @@ class StreamingSparseRecallTest(test.TestCase):
         expected=2.0 / 2,
         class_id=2)
     self._test_sparse_recall_at_top_k(
-        sp_labels,
-        top_k_predictions,
-        expected=2.0 / 2,
-        class_id=2)
+        sp_labels, top_k_predictions, expected=2.0 / 2, class_id=2)
 
     # Class 5: 1 label, incorrect.
     self._test_streaming_sparse_recall_at_k(
@@ -4674,10 +4618,7 @@ class StreamingSparseRecallTest(test.TestCase):
         expected=1.0 / 1,
         class_id=5)
     self._test_sparse_recall_at_top_k(
-        sp_labels,
-        top_k_predictions,
-        expected=1.0 / 1,
-        class_id=5)
+        sp_labels, top_k_predictions, expected=1.0 / 1, class_id=5)
 
     # Class 7: 1 label, incorrect.
     self._test_streaming_sparse_recall_at_k(
@@ -4687,10 +4628,7 @@ class StreamingSparseRecallTest(test.TestCase):
         expected=0.0 / 1,
         class_id=7)
     self._test_sparse_recall_at_top_k(
-        sp_labels,
-        top_k_predictions,
-        expected=0.0 / 1,
-        class_id=7)
+        sp_labels, top_k_predictions, expected=0.0 / 1, class_id=7)
 
     # All classes: 8 labels, 3 correct.
     self._test_streaming_sparse_recall_at_k(
@@ -4740,10 +4678,8 @@ class StreamingSparseRecallTest(test.TestCase):
         [9, 4, 6, 2, 0],
     ]]
     sparse_labels = _binary_3d_label_to_sparse_value(
-        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
-          [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
-         [[0, 1, 1, 0, 0, 1, 0, 0, 0, 0],
-          [0, 0, 1, 0, 0, 0, 0, 1, 1, 0]]])
+        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
+         [[0, 1, 1, 0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 1, 1, 0]]])
     dense_labels = np.array(
         [[[2, 7, 8], [1, 2, 5]], [
             [1, 2, 5],
@@ -4771,10 +4707,8 @@ class StreamingSparseRecallTest(test.TestCase):
         [9, 4, 6, 2, 0],
     ]]
     labels = _binary_3d_label_to_sparse_value(
-        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
-          [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
-         [[0, 1, 1, 0, 0, 1, 0, 1, 0, 0],
-          [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]]])
+        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
+         [[0, 1, 1, 0, 0, 1, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]]])
 
     # Class 2: 4 labels, all correct.
     self._test_streaming_sparse_recall_at_k(
@@ -4813,10 +4747,8 @@ class StreamingSparseRecallTest(test.TestCase):
         [9, 4, 6, 2, 0],
     ]]
     labels = _binary_3d_label_to_sparse_value(
-        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
-          [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
-         [[0, 1, 1, 0, 0, 1, 0, 1, 0, 0],
-          [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]]])
+        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
+         [[0, 1, 1, 0, 0, 1, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]]])
 
     for class_id in xrange(10):
       self._test_streaming_sparse_recall_at_k(
@@ -4867,10 +4799,8 @@ class StreamingSparseRecallTest(test.TestCase):
         [9, 4, 6, 2, 0],
     ]]
     labels = _binary_3d_label_to_sparse_value(
-        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
-          [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
-         [[0, 1, 1, 0, 0, 1, 0, 1, 0, 0],
-          [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]]])
+        [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
+         [[0, 1, 1, 0, 0, 1, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]]])
 
     # Class 2: 2 labels, both correct.
     self._test_streaming_sparse_recall_at_k(
@@ -4963,10 +4893,8 @@ class StreamingSparseRecallTest(test.TestCase):
         weights=[[0, 1], [0, 1]])
 
   def test_sparse_tensor_value(self):
-    predictions = [[0.1, 0.3, 0.2, 0.4],
-                   [0.1, 0.2, 0.3, 0.4]]
-    labels = [[0, 0, 1, 0],
-              [0, 0, 0, 1]]
+    predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
+    labels = [[0, 0, 1, 0], [0, 0, 0, 1]]
     expected_recall = 0.5
     with self.test_session():
       _, recall = metrics.streaming_sparse_recall_at_k(
@@ -5009,8 +4937,8 @@ class StreamingMeanAbsoluteErrorTest(test.TestCase):
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_normal((10, 3), seed=1)
     labels = random_ops.random_normal((10, 3), seed=2)
-    error, update_op = metrics.streaming_mean_absolute_error(predictions,
-                                                             labels)
+    error, update_op = metrics.streaming_mean_absolute_error(
+        predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -5031,8 +4959,8 @@ class StreamingMeanAbsoluteErrorTest(test.TestCase):
         [1, 3, 2, 3], shape=(1, 4), dtype=dtypes_lib.float32)
     weights = constant_op.constant([0, 1, 0, 1], shape=(1, 4))
 
-    error, update_op = metrics.streaming_mean_absolute_error(predictions,
-                                                             labels, weights)
+    error, update_op = metrics.streaming_mean_absolute_error(
+        predictions, labels, weights)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -5075,8 +5003,8 @@ class StreamingMeanRelativeErrorTest(test.TestCase):
     predictions = random_ops.random_normal((10, 3), seed=1)
     labels = random_ops.random_normal((10, 3), seed=2)
     normalizer = random_ops.random_normal((10, 3), seed=3)
-    error, update_op = metrics.streaming_mean_relative_error(predictions,
-                                                             labels, normalizer)
+    error, update_op = metrics.streaming_mean_relative_error(
+        predictions, labels, normalizer)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -5200,8 +5128,8 @@ class StreamingMeanSquaredErrorTest(test.TestCase):
         [1, 3, 2, 3], shape=(1, 4), dtype=dtypes_lib.float32)
     weights = constant_op.constant([0, 1, 0, 1], shape=(1, 4))
 
-    error, update_op = metrics.streaming_mean_squared_error(predictions, labels,
-                                                            weights)
+    error, update_op = metrics.streaming_mean_squared_error(
+        predictions, labels, weights)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -5224,8 +5152,8 @@ class StreamingMeanSquaredErrorTest(test.TestCase):
       _enqueue_vector(sess, labels_queue, [2, 4, 6])
       labels = labels_queue.dequeue()
 
-      error, update_op = metrics.streaming_mean_squared_error(predictions,
-                                                              labels)
+      error, update_op = metrics.streaming_mean_squared_error(
+          predictions, labels)
 
       sess.run(variables.local_variables_initializer())
       sess.run(update_op)
@@ -5292,10 +5220,10 @@ class StreamingMeanSquaredErrorTest(test.TestCase):
       _enqueue_vector(sess, labels_queue, [2, 4, 6])
       labels = labels_queue.dequeue()
 
-      mae, ma_update_op = metrics.streaming_mean_absolute_error(predictions,
-                                                                labels)
-      mse, ms_update_op = metrics.streaming_mean_squared_error(predictions,
-                                                               labels)
+      mae, ma_update_op = metrics.streaming_mean_absolute_error(
+          predictions, labels)
+      mse, ms_update_op = metrics.streaming_mean_squared_error(
+          predictions, labels)
 
       sess.run(variables.local_variables_initializer())
       sess.run([ma_update_op, ms_update_op])
@@ -5336,8 +5264,8 @@ class StreamingRootMeanSquaredErrorTest(test.TestCase):
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_normal((10, 3), seed=1)
     labels = random_ops.random_normal((10, 3), seed=2)
-    error, update_op = metrics.streaming_root_mean_squared_error(predictions,
-                                                                 labels)
+    error, update_op = metrics.streaming_root_mean_squared_error(
+        predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -5357,8 +5285,8 @@ class StreamingRootMeanSquaredErrorTest(test.TestCase):
           0.0, shape=(1, 3), dtype=dtypes_lib.float32)
       labels = constant_op.constant(0.0, shape=(1, 3), dtype=dtypes_lib.float32)
 
-      rmse, update_op = metrics.streaming_root_mean_squared_error(predictions,
-                                                                  labels)
+      rmse, update_op = metrics.streaming_root_mean_squared_error(
+          predictions, labels)
 
       sess.run(variables.local_variables_initializer())
       self.assertEqual(0, sess.run(update_op))
@@ -5372,8 +5300,8 @@ class StreamingRootMeanSquaredErrorTest(test.TestCase):
       labels = constant_op.constant(
           [1, 3, 2], shape=(1, 3), dtype=dtypes_lib.float32)
 
-      rmse, update_op = metrics.streaming_root_mean_squared_error(predictions,
-                                                                  labels)
+      rmse, update_op = metrics.streaming_root_mean_squared_error(
+          predictions, labels)
 
       sess.run(variables.local_variables_initializer())
       self.assertAlmostEqual(math.sqrt(6), update_op.eval(), 5)
@@ -5387,9 +5315,8 @@ class StreamingRootMeanSquaredErrorTest(test.TestCase):
           [1, 3, 2, 3], shape=(1, 4), dtype=dtypes_lib.float32)
       weights = constant_op.constant([0, 1, 0, 1], shape=(1, 4))
 
-      rmse, update_op = metrics.streaming_root_mean_squared_error(predictions,
-                                                                  labels,
-                                                                  weights)
+      rmse, update_op = metrics.streaming_root_mean_squared_error(
+          predictions, labels, weights)
 
       sess.run(variables.local_variables_initializer())
       self.assertAlmostEqual(math.sqrt(13), sess.run(update_op))
@@ -5404,8 +5331,8 @@ class StreamingCovarianceTest(test.TestCase):
 
   def testVars(self):
     metrics.streaming_covariance(
-        predictions=math_ops.to_float(math_ops.range(10)) + array_ops.ones(
-            [10, 10]),
+        predictions=math_ops.to_float(math_ops.range(10)) +
+        array_ops.ones([10, 10]),
         labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]))
     _assert_metric_variables(self, (
         'covariance/comoment:0',
@@ -5417,8 +5344,8 @@ class StreamingCovarianceTest(test.TestCase):
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     cov, _ = metrics.streaming_covariance(
-        predictions=math_ops.to_float(math_ops.range(10)) + array_ops.ones(
-            [10, 10]),
+        predictions=math_ops.to_float(math_ops.range(10)) +
+        array_ops.ones([10, 10]),
         labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]),
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [cov])
@@ -5426,8 +5353,8 @@ class StreamingCovarianceTest(test.TestCase):
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.streaming_covariance(
-        predictions=math_ops.to_float(math_ops.range(10)) + array_ops.ones(
-            [10, 10]),
+        predictions=math_ops.to_float(math_ops.range(10)) +
+        array_ops.ones([10, 10]),
         labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]),
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
@@ -5487,9 +5414,8 @@ class StreamingCovarianceTest(test.TestCase):
       cov, update_op = metrics.streaming_covariance(
           predictions, labels, weights=weights)
 
-      expected_cov = np.cov([2, 4, 6, 8],
-                            [1, 3, 2, 7],
-                            fweights=[0, 1, 3, 1])[0, 1]
+      expected_cov = np.cov(
+          [2, 4, 6, 8], [1, 3, 2, 7], fweights=[0, 1, 3, 1])[0, 1]
       sess.run(variables.local_variables_initializer())
       self.assertAlmostEqual(expected_cov, sess.run(update_op))
       self.assertAlmostEqual(expected_cov, cov.eval())
@@ -5514,17 +5440,18 @@ class StreamingCovarianceTest(test.TestCase):
             predictions_t: predictions[stride * i:stride * (i + 1)],
             labels_t: labels[stride * i:stride * (i + 1)]
         }
-        self.assertEqual(np.isnan(prev_expected_cov),
-                         np.isnan(sess.run(cov, feed_dict=feed_dict)))
+        self.assertEqual(
+            np.isnan(prev_expected_cov),
+            np.isnan(sess.run(cov, feed_dict=feed_dict)))
         if not np.isnan(prev_expected_cov):
-          self.assertAlmostEqual(
-              prev_expected_cov, sess.run(cov, feed_dict=feed_dict), 5)
+          self.assertAlmostEqual(prev_expected_cov,
+                                 sess.run(cov, feed_dict=feed_dict), 5)
         expected_cov = np.cov(predictions[:stride * (i + 1)],
                               labels[:stride * (i + 1)])[0, 1]
-        self.assertAlmostEqual(
-            expected_cov, sess.run(update_op, feed_dict=feed_dict), 5)
-        self.assertAlmostEqual(
-            expected_cov, sess.run(cov, feed_dict=feed_dict), 5)
+        self.assertAlmostEqual(expected_cov,
+                               sess.run(update_op, feed_dict=feed_dict), 5)
+        self.assertAlmostEqual(expected_cov, sess.run(cov, feed_dict=feed_dict),
+                               5)
         prev_expected_cov = expected_cov
 
   def testMultiUpdateWithErrorAndWeights(self):
@@ -5552,18 +5479,20 @@ class StreamingCovarianceTest(test.TestCase):
             labels_t: labels[stride * i:stride * (i + 1)],
             weights_t: weights[stride * i:stride * (i + 1)]
         }
-        self.assertEqual(np.isnan(prev_expected_cov),
-                         np.isnan(sess.run(cov, feed_dict=feed_dict)))
+        self.assertEqual(
+            np.isnan(prev_expected_cov),
+            np.isnan(sess.run(cov, feed_dict=feed_dict)))
         if not np.isnan(prev_expected_cov):
-          self.assertAlmostEqual(
-              prev_expected_cov, sess.run(cov, feed_dict=feed_dict), 5)
-        expected_cov = np.cov(predictions[:stride * (i + 1)],
-                              labels[:stride * (i + 1)],
-                              fweights=weights[:stride * (i + 1)])[0, 1]
-        self.assertAlmostEqual(
-            expected_cov, sess.run(update_op, feed_dict=feed_dict), 5)
-        self.assertAlmostEqual(
-            expected_cov, sess.run(cov, feed_dict=feed_dict), 5)
+          self.assertAlmostEqual(prev_expected_cov,
+                                 sess.run(cov, feed_dict=feed_dict), 5)
+        expected_cov = np.cov(
+            predictions[:stride * (i + 1)],
+            labels[:stride * (i + 1)],
+            fweights=weights[:stride * (i + 1)])[0, 1]
+        self.assertAlmostEqual(expected_cov,
+                               sess.run(update_op, feed_dict=feed_dict), 5)
+        self.assertAlmostEqual(expected_cov, sess.run(cov, feed_dict=feed_dict),
+                               5)
         prev_expected_cov = expected_cov
 
 
@@ -5574,8 +5503,8 @@ class StreamingPearsonRTest(test.TestCase):
 
   def testVars(self):
     metrics.streaming_pearson_correlation(
-        predictions=math_ops.to_float(math_ops.range(10)) + array_ops.ones(
-            [10, 10]),
+        predictions=math_ops.to_float(math_ops.range(10)) +
+        array_ops.ones([10, 10]),
         labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]))
     _assert_metric_variables(self, (
         'pearson_r/covariance/comoment:0',
@@ -5595,8 +5524,8 @@ class StreamingPearsonRTest(test.TestCase):
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     pearson_r, _ = metrics.streaming_pearson_correlation(
-        predictions=math_ops.to_float(math_ops.range(10)) + array_ops.ones(
-            [10, 10]),
+        predictions=math_ops.to_float(math_ops.range(10)) +
+        array_ops.ones([10, 10]),
         labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]),
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [pearson_r])
@@ -5604,8 +5533,8 @@ class StreamingPearsonRTest(test.TestCase):
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.streaming_pearson_correlation(
-        predictions=math_ops.to_float(math_ops.range(10)) + array_ops.ones(
-            [10, 10]),
+        predictions=math_ops.to_float(math_ops.range(10)) +
+        array_ops.ones([10, 10]),
         labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]),
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
@@ -5613,8 +5542,8 @@ class StreamingPearsonRTest(test.TestCase):
   def testValueTensorIsIdempotent(self):
     labels = random_ops.random_normal((10, 3), seed=2)
     predictions = labels * 0.5 + random_ops.random_normal((10, 3), seed=1) * 0.5
-    pearson_r, update_op = metrics.streaming_pearson_correlation(predictions,
-                                                                 labels)
+    pearson_r, update_op = metrics.streaming_pearson_correlation(
+        predictions, labels)
 
     with self.test_session() as sess:
       sess.run(variables.local_variables_initializer())
@@ -5633,8 +5562,8 @@ class StreamingPearsonRTest(test.TestCase):
       predictions = math_ops.to_float(math_ops.range(10))
       labels = math_ops.to_float(math_ops.range(10))
 
-      pearson_r, update_op = metrics.streaming_pearson_correlation(predictions,
-                                                                   labels)
+      pearson_r, update_op = metrics.streaming_pearson_correlation(
+          predictions, labels)
 
       expected_r = np.corrcoef(np.arange(10), np.arange(10))[0, 1]
       sess.run(variables.local_variables_initializer())
@@ -5648,8 +5577,8 @@ class StreamingPearsonRTest(test.TestCase):
       labels = constant_op.constant(
           [1, 3, 2], shape=(1, 3), dtype=dtypes_lib.float32)
 
-      pearson_r, update_op = metrics.streaming_pearson_correlation(predictions,
-                                                                   labels)
+      pearson_r, update_op = metrics.streaming_pearson_correlation(
+          predictions, labels)
 
       expected_r = np.corrcoef([2, 4, 6], [1, 3, 2])[0, 1]
       sess.run(variables.local_variables_initializer())
@@ -5698,17 +5627,18 @@ class StreamingPearsonRTest(test.TestCase):
             predictions_t: predictions[stride * i:stride * (i + 1)],
             labels_t: labels[stride * i:stride * (i + 1)]
         }
-        self.assertEqual(np.isnan(prev_expected_r),
-                         np.isnan(sess.run(pearson_r, feed_dict=feed_dict)))
+        self.assertEqual(
+            np.isnan(prev_expected_r),
+            np.isnan(sess.run(pearson_r, feed_dict=feed_dict)))
         if not np.isnan(prev_expected_r):
-          self.assertAlmostEqual(
-              prev_expected_r, sess.run(pearson_r, feed_dict=feed_dict), 5)
+          self.assertAlmostEqual(prev_expected_r,
+                                 sess.run(pearson_r, feed_dict=feed_dict), 5)
         expected_r = np.corrcoef(predictions[:stride * (i + 1)],
                                  labels[:stride * (i + 1)])[0, 1]
-        self.assertAlmostEqual(
-            expected_r, sess.run(update_op, feed_dict=feed_dict), 5)
-        self.assertAlmostEqual(
-            expected_r, sess.run(pearson_r, feed_dict=feed_dict), 5)
+        self.assertAlmostEqual(expected_r,
+                               sess.run(update_op, feed_dict=feed_dict), 5)
+        self.assertAlmostEqual(expected_r,
+                               sess.run(pearson_r, feed_dict=feed_dict), 5)
         prev_expected_r = expected_r
 
   def testMultiUpdateWithErrorAndWeights(self):
@@ -5736,19 +5666,21 @@ class StreamingPearsonRTest(test.TestCase):
             labels_t: labels[stride * i:stride * (i + 1)],
             weights_t: weights[stride * i:stride * (i + 1)]
         }
-        self.assertEqual(np.isnan(prev_expected_r),
-                         np.isnan(sess.run(pearson_r, feed_dict=feed_dict)))
+        self.assertEqual(
+            np.isnan(prev_expected_r),
+            np.isnan(sess.run(pearson_r, feed_dict=feed_dict)))
         if not np.isnan(prev_expected_r):
-          self.assertAlmostEqual(
-              prev_expected_r, sess.run(pearson_r, feed_dict=feed_dict), 5)
-        cmat = np.cov(predictions[:stride * (i + 1)],
-                      labels[:stride * (i + 1)],
-                      fweights=weights[:stride * (i + 1)])
+          self.assertAlmostEqual(prev_expected_r,
+                                 sess.run(pearson_r, feed_dict=feed_dict), 5)
+        cmat = np.cov(
+            predictions[:stride * (i + 1)],
+            labels[:stride * (i + 1)],
+            fweights=weights[:stride * (i + 1)])
         expected_r = cmat[0, 1] / np.sqrt(cmat[0, 0] * cmat[1, 1])
-        self.assertAlmostEqual(
-            expected_r, sess.run(update_op, feed_dict=feed_dict), 5)
-        self.assertAlmostEqual(
-            expected_r, sess.run(pearson_r, feed_dict=feed_dict), 5)
+        self.assertAlmostEqual(expected_r,
+                               sess.run(update_op, feed_dict=feed_dict), 5)
+        self.assertAlmostEqual(expected_r,
+                               sess.run(pearson_r, feed_dict=feed_dict), 5)
         prev_expected_r = expected_r
 
   def testMultiUpdateWithErrorAndSingletonBatches(self):
@@ -5758,7 +5690,7 @@ class StreamingPearsonRTest(test.TestCase):
       predictions = np.random.randn(n)
       labels = 0.5 * predictions + np.random.randn(n)
       stride = 10
-      weights = (np.arange(n).reshape(n//stride, stride) % stride == 0)
+      weights = (np.arange(n).reshape(n // stride, stride) % stride == 0)
       for row in weights:
         np.random.shuffle(row)
       # Now, weights is one-hot by row - one item per batch has non-zero weight.
@@ -5778,19 +5710,20 @@ class StreamingPearsonRTest(test.TestCase):
             labels_t: labels[stride * i:stride * (i + 1)],
             weights_t: weights[stride * i:stride * (i + 1)]
         }
-        cmat = np.cov(predictions[:stride * (i + 1)],
-                      labels[:stride * (i + 1)],
-                      fweights=weights[:stride * (i + 1)])
+        cmat = np.cov(
+            predictions[:stride * (i + 1)],
+            labels[:stride * (i + 1)],
+            fweights=weights[:stride * (i + 1)])
         expected_r = cmat[0, 1] / np.sqrt(cmat[0, 0] * cmat[1, 1])
         actual_r = sess.run(update_op, feed_dict=feed_dict)
         self.assertEqual(np.isnan(expected_r), np.isnan(actual_r))
-        self.assertEqual(np.isnan(expected_r),
-                         np.isnan(sess.run(pearson_r, feed_dict=feed_dict)))
+        self.assertEqual(
+            np.isnan(expected_r),
+            np.isnan(sess.run(pearson_r, feed_dict=feed_dict)))
         if not np.isnan(expected_r):
-          self.assertAlmostEqual(
-              expected_r, actual_r, 5)
-          self.assertAlmostEqual(
-              expected_r, sess.run(pearson_r, feed_dict=feed_dict), 5)
+          self.assertAlmostEqual(expected_r, actual_r, 5)
+          self.assertAlmostEqual(expected_r,
+                                 sess.run(pearson_r, feed_dict=feed_dict), 5)
 
 
 class StreamingMeanCosineDistanceTest(test.TestCase):
@@ -6191,20 +6124,14 @@ class StreamingMeanIOUTest(test.TestCase):
       self.assertAlmostEqual(desired_output, miou.eval())
 
   def testUpdateOpEvalIsAccumulatedConfusionMatrix(self):
-    predictions = array_ops.concat(
-        [
-            constant_op.constant(
-                0, shape=[5]), constant_op.constant(
-                    1, shape=[5])
-        ],
-        0)
-    labels = array_ops.concat(
-        [
-            constant_op.constant(
-                0, shape=[3]), constant_op.constant(
-                    1, shape=[7])
-        ],
-        0)
+    predictions = array_ops.concat([
+        constant_op.constant(0, shape=[5]),
+        constant_op.constant(1, shape=[5])
+    ], 0)
+    labels = array_ops.concat([
+        constant_op.constant(0, shape=[3]),
+        constant_op.constant(1, shape=[7])
+    ], 0)
     num_classes = 2
     with self.test_session() as sess:
       miou, update_op = metrics.streaming_mean_iou(predictions, labels,
@@ -6238,29 +6165,20 @@ class StreamingMeanIOUTest(test.TestCase):
       self.assertEqual(0., miou.eval())
 
   def testResultsWithSomeMissing(self):
-    predictions = array_ops.concat(
-        [
-            constant_op.constant(
-                0, shape=[5]), constant_op.constant(
-                    1, shape=[5])
-        ],
-        0)
-    labels = array_ops.concat(
-        [
-            constant_op.constant(
-                0, shape=[3]), constant_op.constant(
-                    1, shape=[7])
-        ],
-        0)
+    predictions = array_ops.concat([
+        constant_op.constant(0, shape=[5]),
+        constant_op.constant(1, shape=[5])
+    ], 0)
+    labels = array_ops.concat([
+        constant_op.constant(0, shape=[3]),
+        constant_op.constant(1, shape=[7])
+    ], 0)
     num_classes = 2
-    weights = array_ops.concat(
-        [
-            constant_op.constant(
-                0, shape=[1]), constant_op.constant(
-                    1, shape=[8]), constant_op.constant(
-                        0, shape=[1])
-        ],
-        0)
+    weights = array_ops.concat([
+        constant_op.constant(0, shape=[1]),
+        constant_op.constant(1, shape=[8]),
+        constant_op.constant(0, shape=[1])
+    ], 0)
     with self.test_session() as sess:
       miou, update_op = metrics.streaming_mean_iou(
           predictions, labels, num_classes, weights=weights)
@@ -6270,56 +6188,45 @@ class StreamingMeanIOUTest(test.TestCase):
       self.assertAlmostEqual(desired_miou, miou.eval())
 
   def testMissingClassInLabels(self):
-    labels = constant_op.constant([
-      [[0, 0, 1, 1, 0, 0],
-       [1, 0, 0, 0, 0, 1]],
-      [[1, 1, 1, 1, 1, 1],
-       [0, 0, 0, 0, 0, 0]]])
-    predictions = constant_op.constant([
-      [[0, 0, 2, 1, 1, 0],
-       [0, 1, 2, 2, 0, 1]],
-      [[0, 0, 2, 1, 1, 1],
-       [1, 1, 2, 0, 0, 0]]])
+    labels = constant_op.constant([[[0, 0, 1, 1, 0, 0], [1, 0, 0, 0, 0, 1]],
+                                   [[1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0]]])
+    predictions = constant_op.constant(
+        [[[0, 0, 2, 1, 1, 0], [0, 1, 2, 2, 0, 1]], [[0, 0, 2, 1, 1, 1],
+                                                    [1, 1, 2, 0, 0, 0]]])
     num_classes = 3
     with self.test_session() as sess:
-      miou, update_op = metrics.streaming_mean_iou(
-          predictions, labels, num_classes)
+      miou, update_op = metrics.streaming_mean_iou(predictions, labels,
+                                                   num_classes)
       sess.run(variables.local_variables_initializer())
       self.assertAllEqual([[7, 4, 3], [3, 5, 2], [0, 0, 0]], update_op.eval())
-      self.assertAlmostEqual(
-        1 / 3 * (7 / (7 + 3 + 7) + 5 / (5 + 4 + 5) + 0 / (0 + 5 + 0)),
-        miou.eval())
+      self.assertAlmostEqual(1 / 3 * (7 / (7 + 3 + 7) + 5 / (5 + 4 + 5) + 0 /
+                                      (0 + 5 + 0)), miou.eval())
 
   def testMissingClassOverallSmall(self):
     labels = constant_op.constant([0])
     predictions = constant_op.constant([0])
     num_classes = 2
     with self.test_session() as sess:
-      miou, update_op = metrics.streaming_mean_iou(
-          predictions, labels, num_classes)
+      miou, update_op = metrics.streaming_mean_iou(predictions, labels,
+                                                   num_classes)
       sess.run(variables.local_variables_initializer())
       self.assertAllEqual([[1, 0], [0, 0]], update_op.eval())
       self.assertAlmostEqual(1, miou.eval())
 
   def testMissingClassOverallLarge(self):
-    labels = constant_op.constant([
-      [[0, 0, 1, 1, 0, 0],
-       [1, 0, 0, 0, 0, 1]],
-      [[1, 1, 1, 1, 1, 1],
-       [0, 0, 0, 0, 0, 0]]])
-    predictions = constant_op.constant([
-      [[0, 0, 1, 1, 0, 0],
-       [1, 1, 0, 0, 1, 1]],
-      [[0, 0, 0, 1, 1, 1],
-       [1, 1, 1, 0, 0, 0]]])
+    labels = constant_op.constant([[[0, 0, 1, 1, 0, 0], [1, 0, 0, 0, 0, 1]],
+                                   [[1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0]]])
+    predictions = constant_op.constant(
+        [[[0, 0, 1, 1, 0, 0], [1, 1, 0, 0, 1, 1]], [[0, 0, 0, 1, 1, 1],
+                                                    [1, 1, 1, 0, 0, 0]]])
     num_classes = 3
     with self.test_session() as sess:
-      miou, update_op = metrics.streaming_mean_iou(
-          predictions, labels, num_classes)
+      miou, update_op = metrics.streaming_mean_iou(predictions, labels,
+                                                   num_classes)
       sess.run(variables.local_variables_initializer())
       self.assertAllEqual([[9, 5, 0], [3, 7, 0], [0, 0, 0]], update_op.eval())
-      self.assertAlmostEqual(
-        1 / 2 * (9 / (9 + 3 + 5) + 7 / (7 + 5 + 3)), miou.eval())
+      self.assertAlmostEqual(1 / 2 * (9 / (9 + 3 + 5) + 7 / (7 + 5 + 3)),
+                             miou.eval())
 
 
 class StreamingConcatTest(test.TestCase):
@@ -6660,5 +6567,209 @@ class CountTest(test.TestCase):
       self.assertAlmostEqual(4.1, result.eval(), 5)
 
 
+class CohenKappaTest(test.TestCase):
+
+  def _confusion_matrix_to_samples(self, confusion_matrix):
+    x, y = confusion_matrix.shape
+    pairs = []
+    for label in range(x):
+      for feature in range(y):
+        pairs += [label, feature] * confusion_matrix[label, feature]
+    pairs = np.array(pairs).reshape((-1, 2))
+    return pairs[:, 0], pairs[:, 1]
+
+  def setUp(self):
+    np.random.seed(1)
+    ops.reset_default_graph()
+
+  def testVars(self):
+    metrics.cohen_kappa(
+        predictions_idx=array_ops.ones((10, 1)),
+        labels=array_ops.ones((10, 1)),
+        num_classes=2)
+    _assert_metric_variables(self, (
+        'cohen_kappa/po:0',
+        'cohen_kappa/pe_row:0',
+        'cohen_kappa/pe_col:0',
+    ))
+
+  def testMetricsCollection(self):
+    my_collection_name = '__metrics__'
+    kappa, _ = metrics.cohen_kappa(
+        predictions_idx=array_ops.ones((10, 1)),
+        labels=array_ops.ones((10, 1)),
+        num_classes=2,
+        metrics_collections=[my_collection_name])
+    self.assertListEqual(ops.get_collection(my_collection_name), [kappa])
+
+  def testUpdatesCollection(self):
+    my_collection_name = '__updates__'
+    _, update_op = metrics.cohen_kappa(
+        predictions_idx=array_ops.ones((10, 1)),
+        labels=array_ops.ones((10, 1)),
+        num_classes=2,
+        updates_collections=[my_collection_name])
+    self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
+
+  def testValueTensorIsIdempotent(self):
+    predictions = random_ops.random_uniform(
+        (10, 1), maxval=3, dtype=dtypes_lib.int64, seed=1)
+    labels = random_ops.random_uniform(
+        (10, 1), maxval=3, dtype=dtypes_lib.int64, seed=2)
+    kappa, update_op = metrics.cohen_kappa(labels, predictions, 3)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+
+      # Run several updates.
+      for _ in range(10):
+        sess.run(update_op)
+
+      # Then verify idempotency.
+      initial_kappa = kappa.eval()
+      for _ in range(10):
+        self.assertAlmostEqual(initial_kappa, kappa.eval(), 5)
+
+  def testBasic(self):
+    confusion_matrix = np.array([[9, 3, 1], [4, 8, 2], [2, 1, 6]])
+    # overall total = 36
+    # po = [9, 8, 6], sum(po) = 23
+    # pe_row = [15, 12, 9], pe_col = [13, 14, 9], so pe = [5.42, 4.67, 2.25]
+    # finally, kappa = (sum(po) - sum(pe)) / (N - sum(pe))
+    #                = (23 - 12.34) / (36 - 12.34)
+    #                = 0.45
+    # see: http://psych.unl.edu/psycrs/handcomp/hckappa.PDF
+    expect = 0.45
+    labels, predictions = self._confusion_matrix_to_samples(confusion_matrix)
+
+    dtypes = [dtypes_lib.int16, dtypes_lib.int32, dtypes_lib.int64]
+    shapes = [
+        (len(labels,)),  # 1-dim
+        (len(labels), 1)
+    ]  # 2-dim
+    weights = [None, np.ones_like(labels)]
+
+    for dtype in dtypes:
+      for shape in shapes:
+        for weight in weights:
+          with self.test_session() as sess:
+            predictions_tensor = constant_op.constant(
+                np.reshape(predictions, shape), dtype=dtype)
+            labels_tensor = constant_op.constant(
+                np.reshape(labels, shape), dtype=dtype)
+            kappa, update_op = metrics.cohen_kappa(
+                labels_tensor, predictions_tensor, 3, weights=weight)
+
+            sess.run(variables.local_variables_initializer())
+            self.assertAlmostEqual(expect, sess.run(update_op), 2)
+            self.assertAlmostEqual(expect, kappa.eval(), 2)
+
+  def testAllCorrect(self):
+    inputs = np.arange(0, 100) % 4
+    # confusion matrix
+    # [[25, 0, 0],
+    #  [0, 25, 0],
+    #  [0, 0, 25]]
+    # Calculated by v0.19: sklearn.metrics.cohen_kappa_score(inputs, inputs)
+    expect = 1.0
+
+    with self.test_session() as sess:
+      predictions = constant_op.constant(inputs, dtype=dtypes_lib.float32)
+      labels = constant_op.constant(inputs)
+      kappa, update_op = metrics.cohen_kappa(labels, predictions, 4)
+
+      sess.run(variables.local_variables_initializer())
+      self.assertAlmostEqual(expect, sess.run(update_op), 5)
+      self.assertAlmostEqual(expect, kappa.eval(), 5)
+
+  def testAllIncorrect(self):
+    labels = np.arange(0, 100) % 4
+    predictions = (labels + 1) % 4
+    # confusion matrix
+    # [[0, 25, 0],
+    #  [0, 0, 25],
+    #  [25, 0, 0]]
+    # Calculated by v0.19: sklearn.metrics.cohen_kappa_score(labels, predictions)
+    expect = -0.333333333333
+
+    with self.test_session() as sess:
+      predictions = constant_op.constant(predictions, dtype=dtypes_lib.float32)
+      labels = constant_op.constant(labels)
+      kappa, update_op = metrics.cohen_kappa(labels, predictions, 4)
+
+      sess.run(variables.local_variables_initializer())
+      self.assertAlmostEqual(expect, sess.run(update_op), 5)
+      self.assertAlmostEqual(expect, kappa.eval(), 5)
+
+  def testWeighted(self):
+    confusion_matrix = np.array([[9, 3, 1], [4, 8, 2], [2, 1, 6]])
+    labels, predictions = self._confusion_matrix_to_samples(confusion_matrix)
+    num_samples = np.sum(confusion_matrix, dtype=np.int32)
+    weights = (np.arange(0, num_samples) % 5) / 5.0
+    # Calculated by v0.19: sklearn.metrics.cohen_kappa_score(
+    #                          labels, predictions, sample_weight=weights)
+    expect = 0.453466583385
+
+    with self.test_session() as sess:
+      predictions = constant_op.constant(predictions, dtype=dtypes_lib.float32)
+      labels = constant_op.constant(labels)
+      kappa, update_op = metrics.cohen_kappa(
+          labels, predictions, 4, weights=weights)
+
+      sess.run(variables.local_variables_initializer())
+      self.assertAlmostEqual(expect, sess.run(update_op), 5)
+      self.assertAlmostEqual(expect, kappa.eval(), 5)
+
+  def testWithMultipleUpdates(self):
+    confusion_matrix = np.array([[90, 30, 10, 20], [40, 80, 20, 30],
+                                 [20, 10, 60, 35], [15, 25, 30, 25]])
+    labels, predictions = self._confusion_matrix_to_samples(confusion_matrix)
+    num_samples = np.sum(confusion_matrix, dtype=np.int32)
+    weights = (np.arange(0, num_samples) % 5) / 5.0
+    num_classes = confusion_matrix.shape[0]
+
+    batch_size = num_samples // 10
+    predictions_t = array_ops.placeholder(
+        dtypes_lib.float32, shape=(batch_size,))
+    labels_t = array_ops.placeholder(dtypes_lib.int32, shape=(batch_size,))
+    weights_t = array_ops.placeholder(dtypes_lib.float32, shape=(batch_size,))
+    kappa, update_op = metrics.cohen_kappa(
+        labels_t, predictions_t, num_classes, weights=weights_t)
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+
+      for idx in range(0, num_samples, batch_size):
+        batch_start, batch_end = idx, idx + batch_size
+        sess.run(
+            update_op,
+            feed_dict={
+                labels_t: labels[batch_start:batch_end],
+                predictions_t: predictions[batch_start:batch_end],
+                weights_t: weights[batch_start:batch_end]
+            })
+      # Calculated by v0.19: sklearn.metrics.cohen_kappa_score(
+      #                          labels_np, predictions_np, sample_weight=weights_np)
+      expect = 0.289965397924
+      self.assertAlmostEqual(expect, kappa.eval(), 5)
+
+  def testInvalidNumClasses(self):
+    predictions = array_ops.placeholder(dtypes_lib.float32, shape=(4, 1))
+    labels = array_ops.placeholder(dtypes_lib.int32, shape=(4, 1))
+    with self.assertRaisesRegexp(ValueError, 'num_classes'):
+      metrics.cohen_kappa(labels, predictions, 1)
+
+  def testInvalidDimension(self):
+    predictions = array_ops.placeholder(dtypes_lib.float32, shape=(4, 1))
+    invalid_labels = array_ops.placeholder(dtypes_lib.int32, shape=(4, 2))
+    with self.assertRaises(ValueError):
+      metrics.cohen_kappa(invalid_labels, predictions, 3)
+
+    invalid_predictions = array_ops.placeholder(
+        dtypes_lib.float32, shape=(4, 2))
+    labels = array_ops.placeholder(dtypes_lib.int32, shape=(4, 1))
+    with self.assertRaises(ValueError):
+      metrics.cohen_kappa(labels, invalid_predictions, 3)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/model_pruning/README.md b/tensorflow/contrib/model_pruning/README.md
index 764e126e0d64d5e6c6caf0a9f0d43a87995447eb..d286750c257e9a78a82c95c1fc872b3ca6972203 100644
--- a/tensorflow/contrib/model_pruning/README.md
+++ b/tensorflow/contrib/model_pruning/README.md
@@ -42,10 +42,13 @@ The pruning library allows for specification of the following hyper parameters:
 | name | string | model_pruning | Name of the pruning specification. Used for adding summaries and ops under a common tensorflow name_scope |
 | begin_pruning_step | integer | 0 | The global step at which to begin pruning |
 | end_pruning_step   | integer | -1 | The global step at which to terminate pruning. Defaults to -1 implying that pruning continues till  the training stops |
-| do_not_prune | list of strings | [""] | list of layers strings that are not pruned |
+| do_not_prune | list of strings | [""] | list of layers names that are not pruned |
 | threshold_decay | float | 0.9 | The decay factor to use for exponential decay of the thresholds |
 | pruning_frequency | integer | 10 | How often should the masks be updated? (in # of global_steps) |
 | nbins | integer | 255 | Number of bins to use for histogram computation |
+| block_height|integer | 1 | Number of rows in a block for block sparse matrices|
+| block_width |integer | 1 | Number of cols in a block for block sparse matrices|
+| block_pooling_function| string | AVG | The function to use to pool weight values in a block: average (AVG) or max (MAX)|
 | initial_sparsity | float | 0.0 | Initial sparsity value |
 | target_sparsity | float | 0.5 | Target sparsity value |
 | sparsity_function_begin_step | integer | 0 | The global step at this which the gradual sparsity function begins to take effect |
@@ -128,3 +131,12 @@ Eval:
 ```shell
 $ bazel-bin/$examples_dir/cifar10/cifar10_eval --run_once
 ```
+
+### Block Sparsity
+
+For some hardware architectures, it may be beneficial to induce spatially correlated sparsity. To train models in which the weight tensors have block sparse structure, set *block_height* and *block_width* hyperparameters to the desired block configuration (2x2, 4x4, 4x1, 1x8, etc). Currently, block sparsity is supported for weight tensors with rank 2 only. The matrix is partitioned into non-overlapping blocks of size *[block_height, block_dim]* and the either the average or max absolute value in this block is taken as a proxy for the entire block (set by *block_pooling_function* hyperparameter).
+The convolution layer tensors are always pruned used block dimensions of [1,1].
+
+## References
+
+Michael Zhu and Suyog Gupta, “To prune, or not to prune: exploring the efficacy of pruning for model compression”, *2017 NIPS Workshop on Machine Learning of Phones and other Consumer Devices* (https://arxiv.org/pdf/1710.01878.pdf)
diff --git a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py
index d07fece4bc668612d517e8dcaab1a35451a0238e..6a3b535eb447dd80f8e39d1d005f8f1d4f503549 100644
--- a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py
+++ b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py
@@ -58,6 +58,7 @@ def read_cifar10(filename_queue):
 
   class CIFAR10Record(object):
     pass
+
   result = CIFAR10Record()
 
   # Dimensions of the images in the CIFAR-10 dataset.
@@ -147,8 +148,9 @@ def distorted_inputs(data_dir, batch_size):
     images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
     labels: Labels. 1D tensor of [batch_size] size.
   """
-  filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i)
-               for i in xrange(1, 6)]
+  filenames = [
+      os.path.join(data_dir, 'data_batch_%d.bin' % i) for i in xrange(1, 6)
+  ]
   for f in filenames:
     if not tf.gfile.Exists(f):
       raise ValueError('Failed to find file: ' + f)
@@ -174,10 +176,9 @@ def distorted_inputs(data_dir, batch_size):
 
   # Because these operations are not commutative, consider randomizing
   # the order their operation.
-  distorted_image = tf.image.random_brightness(distorted_image,
-                                               max_delta=63)
-  distorted_image = tf.image.random_contrast(distorted_image,
-                                             lower=0.2, upper=1.8)
+  distorted_image = tf.image.random_brightness(distorted_image, max_delta=63)
+  distorted_image = tf.image.random_contrast(
+      distorted_image, lower=0.2, upper=1.8)
 
   # Subtract off the mean and divide by the variance of the pixels.
   float_image = tf.image.per_image_standardization(distorted_image)
@@ -188,15 +189,18 @@ def distorted_inputs(data_dir, batch_size):
 
   # Ensure that the random shuffling has good mixing properties.
   min_fraction_of_examples_in_queue = 0.4
-  min_queue_examples = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN *
-                           min_fraction_of_examples_in_queue)
-  print ('Filling queue with %d CIFAR images before starting to train. '
-         'This will take a few minutes.' % min_queue_examples)
+  min_queue_examples = int(
+      NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * min_fraction_of_examples_in_queue)
+  print('Filling queue with %d CIFAR images before starting to train. '
+        'This will take a few minutes.' % min_queue_examples)
 
   # Generate a batch of images and labels by building up a queue of examples.
-  return _generate_image_and_label_batch(float_image, read_input.label,
-                                         min_queue_examples, batch_size,
-                                         shuffle=True)
+  return _generate_image_and_label_batch(
+      float_image,
+      read_input.label,
+      min_queue_examples,
+      batch_size,
+      shuffle=True)
 
 
 def inputs(eval_data, data_dir, batch_size):
@@ -212,8 +216,9 @@ def inputs(eval_data, data_dir, batch_size):
     labels: Labels. 1D tensor of [batch_size] size.
   """
   if not eval_data:
-    filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i)
-                 for i in xrange(1, 6)]
+    filenames = [
+        os.path.join(data_dir, 'data_batch_%d.bin' % i) for i in xrange(1, 6)
+    ]
     num_examples_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN
   else:
     filenames = [os.path.join(data_dir, 'test_batch.bin')]
@@ -235,8 +240,8 @@ def inputs(eval_data, data_dir, batch_size):
 
   # Image processing for evaluation.
   # Crop the central [height, width] of the image.
-  resized_image = tf.image.resize_image_with_crop_or_pad(reshaped_image,
-                                                         width, height)
+  resized_image = tf.image.resize_image_with_crop_or_pad(
+      reshaped_image, width, height)
 
   # Subtract off the mean and divide by the variance of the pixels.
   float_image = tf.image.per_image_standardization(resized_image)
@@ -247,10 +252,13 @@ def inputs(eval_data, data_dir, batch_size):
 
   # Ensure that the random shuffling has good mixing properties.
   min_fraction_of_examples_in_queue = 0.4
-  min_queue_examples = int(num_examples_per_epoch *
-                           min_fraction_of_examples_in_queue)
+  min_queue_examples = int(
+      num_examples_per_epoch * min_fraction_of_examples_in_queue)
 
   # Generate a batch of images and labels by building up a queue of examples.
-  return _generate_image_and_label_batch(float_image, read_input.label,
-                                         min_queue_examples, batch_size,
-                                         shuffle=False)
+  return _generate_image_and_label_batch(
+      float_image,
+      read_input.label,
+      min_queue_examples,
+      batch_size,
+      shuffle=False)
diff --git a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py
index 0d1de869f6ef91791a235cfe545b3b3a9b734e72..660f0168b10aa1e5b320cb476b051918804d2bde 100644
--- a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py
+++ b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py
@@ -48,16 +48,16 @@ from tensorflow.contrib.model_pruning.python import pruning
 # Global constants describing the CIFAR-10 data set.
 IMAGE_SIZE = cifar10_input.IMAGE_SIZE
 NUM_CLASSES = cifar10_input.NUM_CLASSES
-NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN
+NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN  # pylint: disable=line-too-long
 NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL
 BATCH_SIZE = 128
 DATA_DIR = '/tmp/cifar10_data'
 
 # Constants describing the training process.
-MOVING_AVERAGE_DECAY = 0.9999     # The decay to use for the moving average.
-NUM_EPOCHS_PER_DECAY = 350.0      # Epochs after which learning rate decays.
+MOVING_AVERAGE_DECAY = 0.9999  # The decay to use for the moving average.
+NUM_EPOCHS_PER_DECAY = 350.0  # Epochs after which learning rate decays.
 LEARNING_RATE_DECAY_FACTOR = 0.1  # Learning rate decay factor.
-INITIAL_LEARNING_RATE = 0.1       # Initial learning rate.
+INITIAL_LEARNING_RATE = 0.1  # Initial learning rate.
 
 # If a model is trained with multiple GPUs, prefix all Op names with tower_name
 # to differentiate the operations. Note that this prefix is removed from the
@@ -82,8 +82,7 @@ def _activation_summary(x):
   # session. This helps the clarity of presentation on tensorboard.
   tensor_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', x.op.name)
   tf.summary.histogram(tensor_name + '/activations', x)
-  tf.summary.scalar(tensor_name + '/sparsity',
-                                       tf.nn.zero_fraction(x))
+  tf.summary.scalar(tensor_name + '/sparsity', tf.nn.zero_fraction(x))
 
 
 def _variable_on_cpu(name, shape, initializer):
@@ -120,10 +119,9 @@ def _variable_with_weight_decay(name, shape, stddev, wd):
     Variable Tensor
   """
   dtype = tf.float32
-  var = _variable_on_cpu(
-      name,
-      shape,
-      tf.truncated_normal_initializer(stddev=stddev, dtype=dtype))
+  var = _variable_on_cpu(name, shape,
+                         tf.truncated_normal_initializer(
+                             stddev=stddev, dtype=dtype))
   if wd is not None:
     weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
     tf.add_to_collection('losses', weight_decay)
@@ -188,10 +186,8 @@ def inference(images):
   # Note that the masks are applied only to the weight tensors
   # conv1
   with tf.variable_scope('conv1') as scope:
-    kernel = _variable_with_weight_decay('weights',
-                                         shape=[5, 5, 3, 64],
-                                         stddev=5e-2,
-                                         wd=0.0)
+    kernel = _variable_with_weight_decay(
+        'weights', shape=[5, 5, 3, 64], stddev=5e-2, wd=0.0)
 
     conv = tf.nn.conv2d(
         images, pruning.apply_mask(kernel, scope), [1, 1, 1, 1], padding='SAME')
@@ -201,18 +197,20 @@ def inference(images):
     _activation_summary(conv1)
 
   # pool1
-  pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
-                         padding='SAME', name='pool1')
+  pool1 = tf.nn.max_pool(
+      conv1,
+      ksize=[1, 3, 3, 1],
+      strides=[1, 2, 2, 1],
+      padding='SAME',
+      name='pool1')
   # norm1
-  norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
-                    name='norm1')
+  norm1 = tf.nn.lrn(
+      pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, name='norm1')
 
   # conv2
   with tf.variable_scope('conv2') as scope:
-    kernel = _variable_with_weight_decay('weights',
-                                         shape=[5, 5, 64, 64],
-                                         stddev=5e-2,
-                                         wd=0.0)
+    kernel = _variable_with_weight_decay(
+        'weights', shape=[5, 5, 64, 64], stddev=5e-2, wd=0.0)
     conv = tf.nn.conv2d(
         norm1, pruning.apply_mask(kernel, scope), [1, 1, 1, 1], padding='SAME')
     biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
@@ -221,19 +219,23 @@ def inference(images):
     _activation_summary(conv2)
 
   # norm2
-  norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
-                    name='norm2')
+  norm2 = tf.nn.lrn(
+      conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, name='norm2')
   # pool2
-  pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1],
-                         strides=[1, 2, 2, 1], padding='SAME', name='pool2')
+  pool2 = tf.nn.max_pool(
+      norm2,
+      ksize=[1, 3, 3, 1],
+      strides=[1, 2, 2, 1],
+      padding='SAME',
+      name='pool2')
 
   # local3
   with tf.variable_scope('local3') as scope:
     # Move everything into depth so we can perform a single matrix multiply.
     reshape = tf.reshape(pool2, [BATCH_SIZE, -1])
     dim = reshape.get_shape()[1].value
-    weights = _variable_with_weight_decay('weights', shape=[dim, 384],
-                                          stddev=0.04, wd=0.004)
+    weights = _variable_with_weight_decay(
+        'weights', shape=[dim, 384], stddev=0.04, wd=0.004)
     biases = _variable_on_cpu('biases', [384], tf.constant_initializer(0.1))
     local3 = tf.nn.relu(
         tf.matmul(reshape, pruning.apply_mask(weights, scope)) + biases,
@@ -242,8 +244,8 @@ def inference(images):
 
   # local4
   with tf.variable_scope('local4') as scope:
-    weights = _variable_with_weight_decay('weights', shape=[384, 192],
-                                          stddev=0.04, wd=0.004)
+    weights = _variable_with_weight_decay(
+        'weights', shape=[384, 192], stddev=0.04, wd=0.004)
     biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1))
     local4 = tf.nn.relu(
         tf.matmul(local3, pruning.apply_mask(weights, scope)) + biases,
@@ -255,8 +257,8 @@ def inference(images):
   # tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled logits
   # and performs the softmax internally for efficiency.
   with tf.variable_scope('softmax_linear') as scope:
-    weights = _variable_with_weight_decay('weights', [192, NUM_CLASSES],
-                                          stddev=1/192.0, wd=0.0)
+    weights = _variable_with_weight_decay(
+        'weights', [192, NUM_CLASSES], stddev=1 / 192.0, wd=0.0)
     biases = _variable_on_cpu('biases', [NUM_CLASSES],
                               tf.constant_initializer(0.0))
     softmax_linear = tf.add(
@@ -337,11 +339,12 @@ def train(total_loss, global_step):
   decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
 
   # Decay the learning rate exponentially based on the number of steps.
-  lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
-                                  global_step,
-                                  decay_steps,
-                                  LEARNING_RATE_DECAY_FACTOR,
-                                  staircase=True)
+  lr = tf.train.exponential_decay(
+      INITIAL_LEARNING_RATE,
+      global_step,
+      decay_steps,
+      LEARNING_RATE_DECAY_FACTOR,
+      staircase=True)
   tf.summary.scalar('learning_rate', lr)
 
   # Generate moving averages of all losses and associated summaries.
@@ -365,8 +368,8 @@ def train(total_loss, global_step):
       tf.summary.histogram(var.op.name + '/gradients', grad)
 
   # Track the moving averages of all trainable variables.
-  variable_averages = tf.train.ExponentialMovingAverage(
-      MOVING_AVERAGE_DECAY, global_step)
+  variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY,
+                                                        global_step)
   variables_averages_op = variable_averages.apply(tf.trainable_variables())
 
   with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
@@ -383,10 +386,13 @@ def maybe_download_and_extract():
   filename = DATA_URL.split('/')[-1]
   filepath = os.path.join(dest_directory, filename)
   if not os.path.exists(filepath):
+
     def _progress(count, block_size, total_size):
-      sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename,
-          float(count * block_size) / float(total_size) * 100.0))
+      sys.stdout.write('\r>> Downloading %s %.1f%%' %
+                       (filename,
+                        float(count * block_size) / float(total_size) * 100.0))
       sys.stdout.flush()
+
     filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath, _progress)
     print()
     statinfo = os.stat(filepath)
diff --git a/tensorflow/contrib/model_pruning/python/layers/core_layers.py b/tensorflow/contrib/model_pruning/python/layers/core_layers.py
index 95dfd8f4213a8729f5954eb0626f28ecc9265bbb..764ab620bc2227ff5e8e3f473d689e0e133e83d4 100644
--- a/tensorflow/contrib/model_pruning/python/layers/core_layers.py
+++ b/tensorflow/contrib/model_pruning/python/layers/core_layers.py
@@ -210,7 +210,7 @@ class _MaskedConv(base.Layer):
       return self.activation(outputs)
     return outputs
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_last':
       space = input_shape[1:-1]
@@ -467,7 +467,7 @@ class MaskedFullyConnected(base.Layer):
       return self.activation(outputs)  # pylint: disable=not-callable
     return outputs
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
     input_shape = input_shape.with_rank_at_least(2)
     if input_shape[-1].value is None:
diff --git a/tensorflow/contrib/model_pruning/python/layers/layers.py b/tensorflow/contrib/model_pruning/python/layers/layers.py
index dfebb9a6794056dd43b0699ccbcc5797f2f172f7..988748ad75bdf72f1da3f4e1c6e85aabb04a5954 100644
--- a/tensorflow/contrib/model_pruning/python/layers/layers.py
+++ b/tensorflow/contrib/model_pruning/python/layers/layers.py
@@ -21,7 +21,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
 import six
 
 from tensorflow.contrib.framework.python.ops import add_arg_scope
diff --git a/tensorflow/contrib/model_pruning/python/pruning.py b/tensorflow/contrib/model_pruning/python/pruning.py
index 42d91a71fde41d8681d7a0c439d6c49325730418..d16af9da19816211ee22f6ea48a347f0b9a4e612 100644
--- a/tensorflow/contrib/model_pruning/python/pruning.py
+++ b/tensorflow/contrib/model_pruning/python/pruning.py
@@ -72,8 +72,10 @@ from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_impl
+from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary
 from tensorflow.python.training import training_util
@@ -129,6 +131,23 @@ def _weight_threshold_variable(var, scope):
     return threshold
 
 
+def _kronecker_product(mat1, mat2):
+  """Computes the Kronecker product of two matrices mat1 and mat2.
+
+  Args:
+    mat1: A matrix of size m x n
+    mat2: A matrix of size p x q
+  Returns:
+    Kronecker product of matrices mat1 and mat2 of size mp x nq
+  """
+
+  m1, n1 = mat1.get_shape().as_list()
+  mat1_rsh = array_ops.reshape(mat1, [m1, 1, n1, 1])
+  m2, n2 = mat2.get_shape().as_list()
+  mat2_rsh = array_ops.reshape(mat2, [1, m2, 1, n2])
+  return array_ops.reshape(mat1_rsh * mat2_rsh, [m1 * m2, n1 * n2])
+
+
 def _histogram(values, value_range, nbins=100, dtype=np.int32, name=None):
   """Return histogram of values.
 
@@ -297,6 +316,13 @@ def get_pruning_hparams():
       How often should the masks be updated? (in # of global_steps)
     nbins: integer
       number of bins to use for histogram computation
+    block_height: integer
+      number of rows in a block (defaults to 1)
+    block_width: integer
+      number of cols in a block (defaults to 1)
+    block_pooling_function: string
+      Whether to perform average (AVG) or max (MAX) pooling in the block
+      (default: AVG)
     initial_sparsity: float
       initial sparsity value
     target_sparsity: float
@@ -332,6 +358,9 @@ def get_pruning_hparams():
       threshold_decay=0.9,
       pruning_frequency=10,
       nbins=255,
+      block_height=1,
+      block_width=1,
+      block_pooling_function='AVG',
       initial_sparsity=0,
       target_sparsity=0.5,
       sparsity_function_begin_step=0,
@@ -341,11 +370,7 @@ def get_pruning_hparams():
 
 class Pruning(object):
 
-  def __init__(self,
-               spec=None,
-               global_step=None,
-               sparsity=None,
-               partitioner=None):
+  def __init__(self, spec=None, global_step=None, sparsity=None):
     """Set up the specification for model pruning.
 
     If a spec is provided, the sparsity is set up based on the sparsity_function
@@ -358,8 +383,6 @@ class Pruning(object):
       global_step: A tensorflow variable that is used while setting up the
         sparsity function
       sparsity: A tensorflow scalar variable storing the sparsity
-      partitioner: The tensorflow partitioner function used to distribute
-        parameters across shards
     """
     # Pruning specification
     self._spec = spec if spec else get_pruning_hparams()
@@ -373,9 +396,6 @@ class Pruning(object):
     # Built using self._setup_sparsity() or provided externally
     self._sparsity = sparsity if sparsity else self._setup_sparsity()
 
-    # Stores the partitioner function uses to partition variables across tasks/
-    self._partitioner = partitioner
-
     # List of tensorflow assignments ops for new masks and thresholds
     self._assign_ops = []
 
@@ -383,6 +403,12 @@ class Pruning(object):
     # were updated
     self._last_update_step = self._setup_last_update_step()
 
+    # Block dimensions
+    self._block_dim = [self._spec.block_height, self._spec.block_width]
+
+    # Block pooling function
+    self._block_pooling_function = self._spec.block_pooling_function
+
   def _setup_global_step(self, global_step):
     graph_global_step = global_step
     if graph_global_step is None:
@@ -457,9 +483,10 @@ class Pruning(object):
 
     Returns:
       new_threshold: The new value of the threshold based on weights, and
-        desired_sparsity
-      new_mask: A n-D numpy array containing 0 or 1 to indicate which of the
-        values in weights falls below the threshold
+        sparsity at the current global_step
+      new_mask: A numpy array of the same size and shape as weights containing
+        0 or 1 to indicate which of the values in weights falls below
+        the threshold
 
     Raises:
       ValueError: if sparsity is not defined
@@ -492,6 +519,63 @@ class Pruning(object):
           math_ops.greater(abs_weights, smoothed_threshold), np.float32)
     return smoothed_threshold, new_mask
 
+  def _maybe_update_block_mask(self, weights, threshold):
+    """Performs block-granular masking of the weights.
+
+    Block pruning occurs only if the block_height or block_width is > 1 and
+    if the weight tensor has ndims = 2. Otherwise, elementwise pruning occurs.
+    Args:
+      weights: The weight tensor that needs to be masked.
+      threshold: The current threshold value. The function will compute a new
+        threshold and return the exponential moving average using the current
+        value of threshold
+
+    Returns:
+      new_threshold: The new value of the threshold based on weights, and
+        sparsity at the current global_step
+      new_mask: A numpy array of the same size and shape as weights containing
+        0 or 1 to indicate which of the values in weights falls below
+        the threshold
+
+    Raises:
+      ValueError: if block pooling function is not AVG or MAX
+    """
+    if weights.get_shape().ndims != 2 or self._block_dim == [1, 1]:
+      return self._update_mask(weights, threshold)
+
+    if self._block_pooling_function not in ['AVG', 'MAX']:
+      raise ValueError('Unknown pooling function for block sparsity: %s' %
+                       self._block_pooling_function)
+
+    with ops.name_scope(weights.op.name + '_pruning_ops'):
+      abs_weights = math_ops.abs(
+          array_ops.reshape(
+              weights, [1, weights.get_shape()[0],
+                        weights.get_shape()[1], 1]))
+      pool_window = [self._block_dim[0], self._block_dim[1]]
+      pooled_weights = nn_ops.pool(
+          abs_weights,
+          window_shape=pool_window,
+          pooling_type=self._block_pooling_function,
+          strides=pool_window,
+          padding='SAME',
+          name=weights.op.name + '_pooled')
+
+      smoothed_threshold, new_mask = self._update_mask(pooled_weights,
+                                                       threshold)
+
+      reshaped_mask = array_ops.reshape(
+          new_mask,
+          [pooled_weights.get_shape()[1],
+           pooled_weights.get_shape()[2]])
+      updated_mask = _kronecker_product(reshaped_mask,
+                                        array_ops.ones(self._block_dim))
+      sliced_mask = array_ops.slice(
+          updated_mask, [0, 0],
+          [weights.get_shape()[0],
+           weights.get_shape()[1]])
+    return smoothed_threshold, sliced_mask
+
   def _get_mask_assign_ops(self):
     # Make sure the assignment ops have not already been added to the list
     if self._assign_ops:
@@ -509,18 +593,21 @@ class Pruning(object):
 
     for index, mask in enumerate(masks):
       threshold = thresholds[index]
-      weight = weights[index] if self._partitioner is None else weights[
-          index].as_tensor()
+      weight = weights[index]
+      is_partitioned = isinstance(weight, variables.PartitionedVariable)
+      if is_partitioned:
+        weight = weight.as_tensor()
 
       if self._spec.do_not_prune:
         if self._exists_in_do_not_prune_list(mask.name):
           continue
 
-      new_threshold, new_mask = self._update_mask(weight, threshold)
+      new_threshold, new_mask = self._maybe_update_block_mask(weight, threshold)
       self._assign_ops.append(_variable_assign(threshold, new_threshold))
+
       self._assign_ops.append(
-          _variable_assign(mask, new_mask) if self._partitioner is None else
-          _partitioned_variable_assign(mask, new_mask))
+          _partitioned_variable_assign(mask, new_mask)
+          if is_partitioned else _variable_assign(mask, new_mask))
 
   def mask_update_op(self):
     with ops.name_scope(self._spec.name):
diff --git a/tensorflow/contrib/model_pruning/python/pruning_test.py b/tensorflow/contrib/model_pruning/python/pruning_test.py
index c23fd649ce1fc72a2e8d516bfa3750b7ced1b111..1767b4bb94a9bb56bc6a4933423ad27d8cf3ed35 100644
--- a/tensorflow/contrib/model_pruning/python/pruning_test.py
+++ b/tensorflow/contrib/model_pruning/python/pruning_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.model_pruning.python import pruning
+from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import random_ops
@@ -111,6 +112,39 @@ class PruningTest(test.TestCase):
       masked_weights_val = masked_weights.eval()
       self.assertAllEqual(np.count_nonzero(masked_weights_val), 51)
 
+  def _blockMasking(self, hparams, weights, expected_mask):
+
+    threshold = variables.Variable(0.0, name="threshold")
+    sparsity = variables.Variable(0.51, name="sparsity")
+    test_spec = ",".join(hparams)
+    pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)
+
+    # Set up pruning
+    p = pruning.Pruning(pruning_hparams, sparsity=sparsity)
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      _, new_mask = p._maybe_update_block_mask(weights, threshold)
+      # Check if the mask is the same size as the weights
+      self.assertAllEqual(new_mask.get_shape(), weights.get_shape())
+      mask_val = new_mask.eval()
+      self.assertAllEqual(mask_val, expected_mask)
+
+  def testBlockMasking(self):
+    param_list = ["block_height=2", "block_width=2", "threshold_decay=0"]
+
+    weights_avg = constant_op.constant(
+        [[0.1, 0.1, 0.2, 0.2], [0.1, 0.1, 0.2, 0.2], [0.3, 0.3, 0.4, 0.4],
+         [0.3, 0.3, 0.4, 0.4]])
+    weights_max = constant_op.constant(
+        [[0.1, 0.0, 0.2, 0.0], [0.0, -0.1, 0.0, -0.2], [0.3, 0.0, 0.4, 0.0],
+         [0.0, -0.3, 0.0, -0.4]])
+    expected_mask = [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]]
+
+    self._blockMasking(param_list + ["block_pooling_function=MAX"], weights_max,
+                       expected_mask)
+    self._blockMasking(param_list + ["block_pooling_function=AVG"],
+                       weights_avg, expected_mask)
+
   def testPartitionedVariableMasking(self):
     partitioner = partitioned_variables.variable_axis_size_partitioner(40)
     with self.test_session() as session:
@@ -120,7 +154,7 @@ class PruningTest(test.TestCase):
             "weights", initializer=math_ops.linspace(1.0, 100.0, 100))
         masked_weights = pruning.apply_mask(
             weights, scope=variable_scope.get_variable_scope())
-      p = pruning.Pruning(sparsity=sparsity, partitioner=partitioner)
+      p = pruning.Pruning(sparsity=sparsity)
       p._spec.threshold_decay = 0.0
       mask_update_op = p.mask_update_op()
       variables.global_variables_initializer().run()
diff --git a/tensorflow/contrib/mpi/BUILD b/tensorflow/contrib/mpi/BUILD
index d9d55faf50b7f5043bfd0ed3b3d9ca5c404c7627..23f90cf77ef0bde34f3938688aa6ca2f6e9bbc53 100644
--- a/tensorflow/contrib/mpi/BUILD
+++ b/tensorflow/contrib/mpi/BUILD
@@ -71,6 +71,8 @@ cc_library(
         "//tensorflow/core:protos_cc",
         "//tensorflow/core:worker_proto_cc",
         "//tensorflow/core/distributed_runtime:base_rendezvous_mgr",
+        "//tensorflow/core/distributed_runtime:recent_request_ids",
+        "//tensorflow/core/distributed_runtime:request_id",
         "//tensorflow/core/distributed_runtime:session_mgr",
         "//tensorflow/core/distributed_runtime:tensor_coding",
         "//tensorflow/core/distributed_runtime:worker_env",
diff --git a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
index 1a2563d20fdc33d3c5e4a85561b61d04d3eeabff..6a7f5efecdb4062874a09df227d139ad20d59f3f 100644
--- a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
+++ b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
@@ -24,17 +24,19 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/core/distributed_runtime/tensor_coding.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
+#include "tensorflow/core/distributed_runtime/tensor_coding.h"
 
 namespace tensorflow {
 
 MPIRendezvousMgr::MPIRendezvousMgr(const WorkerEnv* env)
-    : BaseRendezvousMgr(env), worker_env_2(env), use_optimal_transfer_(false) {
-
+    : BaseRendezvousMgr(env),
+      worker_env_2(env),
+      use_optimal_transfer_(false),
+      recv_tensor_recent_request_ids_(100000) {
   const char* mpienv = getenv("MPI_OPTIMAL_PATH");
   if (mpienv && mpienv[0] == '1') {
     LOG(INFO) << "MPI Optimal copy path enabled (Requires CUDA-Aware MPI when "
@@ -60,7 +62,6 @@ BaseRemoteRendezvous* MPIRendezvousMgr::Create(int64 step_id,
 void MPIRemoteRendezvous::RecvFromRemoteAsync(
     const Rendezvous::ParsedKey& parsed, const Rendezvous::Args& recv_args,
     DoneCallback done) {
-
   Status s = Status::OK();
   MPIRequestTensorCall* rendezvous_call = new MPIRequestTensorCall();
 
@@ -101,37 +102,37 @@ void MPIRemoteRendezvous::RecvFromRemoteAsync(
   // Create the function which is called when the Tensor is send by remote
   const int64 temp1 = step_id_;
   rendezvous_call->recv_call_ =
-      [this, parsed, recv_args, done, dst, temp1, rendezvous_call](
-          MPIRecvTensorResponse mpi_response) {
-    Status s;
-    Device* dst_device;
-    if (s.ok()) {
-      s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_device);
-      CHECK(s.ok()) << "Device lookup failed";
-    }
-
-    VLOG(3) << "MPI Received tensor " << parsed.FullKey()
-            << " @ step: " << temp1
-            << " single-send: " << mpi_response.singlesend();
-
-    Tensor val;
-    if (mpi_response.singlesend()) {
-      dst_device->MakeTensorFromProto(mpi_response.response().tensor(),
-                                      recv_args.alloc_attrs, &val);
-    } else {
-      TensorResponse tr;
-      tr.InitAlloc(dst_device, recv_args.alloc_attrs);
-      tr.InitPartial(mpi_response.response());
-      const size_t nBytes = tr.tensor().TotalBytes();
-      void* data = const_cast<void*>(DMAHelper::base(&tr.tensor()));
-      MPI_Status status;
-      MPI_CHECK(MPI_Recv(data, static_cast<int>(nBytes), MPI_BYTE, dst,
-                         TAG_SENDTENSOR2, MPI_COMM_WORLD, &status));
-      val = std::move(tr.tensor());
-    }
-
-    done(s, Args(), recv_args, val, mpi_response.response().is_dead());
-  };
+      [this, parsed, recv_args, done, dst, temp1,
+       rendezvous_call](MPIRecvTensorResponse mpi_response) {
+        Status s;
+        Device* dst_device;
+        if (s.ok()) {
+          s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_device);
+          CHECK(s.ok()) << "Device lookup failed";
+        }
+
+        VLOG(3) << "MPI Received tensor " << parsed.FullKey()
+                << " @ step: " << temp1
+                << " single-send: " << mpi_response.singlesend();
+
+        Tensor val;
+        if (mpi_response.singlesend()) {
+          dst_device->MakeTensorFromProto(mpi_response.response().tensor(),
+                                          recv_args.alloc_attrs, &val);
+        } else {
+          TensorResponse tr;
+          tr.InitAlloc(dst_device, recv_args.alloc_attrs);
+          tr.InitPartial(mpi_response.response());
+          const size_t nBytes = tr.tensor().TotalBytes();
+          void* data = const_cast<void*>(DMAHelper::base(&tr.tensor()));
+          MPI_Status status;
+          MPI_CHECK(MPI_Recv(data, static_cast<int>(nBytes), MPI_BYTE, dst,
+                             TAG_SENDTENSOR2, MPI_COMM_WORLD, &status));
+          val = std::move(tr.tensor());
+        }
+
+        done(s, Args(), recv_args, val, mpi_response.response().is_dead());
+      };
 
   MPIRendezvousMgr* mgr =
       reinterpret_cast<MPIRendezvousMgr*>(this->rendezvous_mgr_);
@@ -149,15 +150,19 @@ MPIRemoteRendezvous::~MPIRemoteRendezvous() {}
  */
 void MPIRendezvousMgr::AddRequest(RecvTensorRequest request,
                                   const int mpi_dst) {
+  TF_CHECK_OK(recv_tensor_recent_request_ids_.TrackUnique(
+      request.request_id(), "RecvTensor (MPIRendezvousMgr)", request));
   const int64 step_id = request.step_id();
   const std::string& key = request.rendezvous_key();
   Rendezvous::ParsedKey parsed;
   TF_CHECK_OK(Rendezvous::ParseKey(key, &parsed));
 
   MPIRecvTensorCallBack send_cb = [this, mpi_dst, parsed](
-      const Status& status, const Rendezvous::Args& send_args,
-      const Rendezvous::Args& recv_args, const Tensor& val, bool is_dead,
-      MPISendTensorCall* mpi_send_call) {
+                                      const Status& status,
+                                      const Rendezvous::Args& send_args,
+                                      const Rendezvous::Args& recv_args,
+                                      const Tensor& val, bool is_dead,
+                                      MPISendTensorCall* mpi_send_call) {
     // TODO(jbedorf) this should be a loop over max size
     CHECK(mpi_send_call->mRes_.ByteSize() < INT_MAX)
         << "Buffer too large for single transfer";
@@ -190,74 +195,78 @@ void MPIRendezvousMgr::AddRequest(RecvTensorRequest request,
   };
 
   // Wrapper around the read callback to place the callback on our queue
-  Rendezvous::DoneCallback done_cb = [this, parsed, step_id, send_cb](
-      const Status& status, const Rendezvous::Args& send_args,
-      const Rendezvous::Args& recv_args, const Tensor& val, bool is_dead) {
-    if (!status.ok()) {
-      CHECK(status.ok()) << "RecvLocalAsync was not ok, key: "
-                         << parsed.FullKey() << " step: " << step_id
-                         << " error message: " << status.error_message();
-      return;
-    }
-
-    VLOG(3) << "MPI Sending tensor " << parsed.FullKey()
-            << " @ step: " << step_id << std::endl;
-
-    auto mpi_send_call = new MPISendTensorCall();
-    mpi_send_call->Init(parsed, step_id, is_dead);
-
-    Device* src_dev = nullptr;
-    Status s = this->worker_env_2->device_mgr->LookupDevice(parsed.src_device,
-                                                            &src_dev);
-    CHECK(s.ok()) << "src device not found";
-
-    // Control if shape and data should be send together or if we can optimize
-    // it in two different transfers, thereby reducing memory copies
-    bool doOptimalTransfer = true;
-    if (!DataTypeCanUseMemcpy(val.dtype())) doOptimalTransfer = false;
-    if (val.TotalBytes() < 1024) doOptimalTransfer = false;
-
-    doOptimalTransfer = doOptimalTransfer && use_optimal_transfer_;
-
-    if (doOptimalTransfer) {
-      // First send the Tensor description and in a follow up transfer the data
-      mpi_send_call->mRes_.mutable_response()->mutable_tensor()->set_dtype(
-          val.dtype());
-      val.shape().AsProto(mpi_send_call->mRes_.mutable_response()
-                              ->mutable_tensor()
-                              ->mutable_tensor_shape());
-      mpi_send_call->mRes_.set_singlesend(false);
-    } else {
-      // Send the Tensor description and data in a single transfer
-      if (src_dev->tensorflow_gpu_device_info() &&
-          (!send_args.alloc_attrs.on_host())) {
-        Notification n;
-        GPUUtil::SetProtoFromGPU(
-            val, src_dev, send_args.device_context,
-            mpi_send_call->mRes_.mutable_response()->mutable_tensor(), is_dead,
-            [&n, &s](const Status& s_) {
-              s = s_;
-              n.Notify();
-            });
-        n.WaitForNotification();
-      } else {
-        val.AsProtoTensorContent(
-            mpi_send_call->mRes_.mutable_response()->mutable_tensor());
-      }
-    }
-
-    std::function<MPISendTensorCall*()> res = std::bind(
-        send_cb, status, send_args, recv_args, val, is_dead, mpi_send_call);
-
-    SendQueueEntry req(parsed.FullKey().ToString().c_str(), std::move(res));
-
-    this->QueueSendRequest(req);
-
-    // Wait for the notification that indicates the tensor has been
-    // successfully transmitted to the remote process. Only needed if we
-    // have not parsed the tensor to proto
-    if (doOptimalTransfer) mpi_send_call->n_.WaitForNotification();
-  };  // done_cb
+  Rendezvous::DoneCallback done_cb =
+      [this, parsed, step_id, send_cb](
+          const Status& status, const Rendezvous::Args& send_args,
+          const Rendezvous::Args& recv_args, const Tensor& val, bool is_dead) {
+        if (!status.ok()) {
+          CHECK(status.ok())
+              << "RecvLocalAsync was not ok, key: " << parsed.FullKey()
+              << " step: " << step_id
+              << " error message: " << status.error_message();
+          return;
+        }
+
+        VLOG(3) << "MPI Sending tensor " << parsed.FullKey()
+                << " @ step: " << step_id << std::endl;
+
+        auto mpi_send_call = new MPISendTensorCall();
+        mpi_send_call->Init(parsed, step_id, is_dead);
+
+        Device* src_dev = nullptr;
+        Status s = this->worker_env_2->device_mgr->LookupDevice(
+            parsed.src_device, &src_dev);
+        CHECK(s.ok()) << "src device not found";
+
+        // Control if shape and data should be send together or if we can
+        // optimize it in two different transfers, thereby reducing memory
+        // copies
+        bool doOptimalTransfer = true;
+        if (!DataTypeCanUseMemcpy(val.dtype())) doOptimalTransfer = false;
+        if (val.TotalBytes() < 1024) doOptimalTransfer = false;
+
+        doOptimalTransfer = doOptimalTransfer && use_optimal_transfer_;
+
+        if (doOptimalTransfer) {
+          // First send the Tensor description and in a follow up transfer the
+          // data
+          mpi_send_call->mRes_.mutable_response()->mutable_tensor()->set_dtype(
+              val.dtype());
+          val.shape().AsProto(mpi_send_call->mRes_.mutable_response()
+                                  ->mutable_tensor()
+                                  ->mutable_tensor_shape());
+          mpi_send_call->mRes_.set_singlesend(false);
+        } else {
+          // Send the Tensor description and data in a single transfer
+          if (src_dev->tensorflow_gpu_device_info() &&
+              (!send_args.alloc_attrs.on_host())) {
+            Notification n;
+            GPUUtil::SetProtoFromGPU(
+                val, src_dev, send_args.device_context,
+                mpi_send_call->mRes_.mutable_response()->mutable_tensor(),
+                is_dead, [&n, &s](const Status& s_) {
+                  s = s_;
+                  n.Notify();
+                });
+            n.WaitForNotification();
+          } else {
+            val.AsProtoTensorContent(
+                mpi_send_call->mRes_.mutable_response()->mutable_tensor());
+          }
+        }
+
+        std::function<MPISendTensorCall*()> res = std::bind(
+            send_cb, status, send_args, recv_args, val, is_dead, mpi_send_call);
+
+        SendQueueEntry req(parsed.FullKey().ToString().c_str(), std::move(res));
+
+        this->QueueSendRequest(req);
+
+        // Wait for the notification that indicates the tensor has been
+        // successfully transmitted to the remote process. Only needed if we
+        // have not parsed the tensor to proto
+        if (doOptimalTransfer) mpi_send_call->n_.WaitForNotification();
+      };  // done_cb
 
   worker_env_2->compute_pool->Schedule([this, step_id, parsed, done_cb]() {
     this->RecvLocalAsync(step_id, parsed, done_cb);
@@ -289,9 +298,8 @@ void MPIRendezvousMgr::MPIBackgroundThread() {
     }
 
     // Remove sends that have been completed
-    active_sends.remove_if([](std::unique_ptr<MPISendTensorCall>& i) {
-      return i->IsFinished();
-    });
+    active_sends.remove_if(
+        [](std::unique_ptr<MPISendTensorCall>& i) { return i->IsFinished(); });
 
     // send a Tensor request
     RequestQueueEntry req;
diff --git a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h
index b15748d63c9fdbc5134069b63fd998e46c499e16..5596601ddb9846c0e4f5be4bf33114fc19c0a59d 100644
--- a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h
+++ b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.h
@@ -18,22 +18,24 @@ limitations under the License.
 
 #ifdef TENSORFLOW_USE_MPI
 
-#include <queue>
-#include <thread>
 #include <list>
-#include <string>
-#include <memory>
 #include <map>
+#include <memory>
+#include <queue>
+#include <string>
+#include <thread>
 #include <unordered_map>
 #include <utility>
 #include <vector>
 
 #include <iostream>
 
+#include "tensorflow/contrib/mpi/mpi_msg.pb.h"
 #include "tensorflow/contrib/mpi/mpi_utils.h"
 #include "tensorflow/core/distributed_runtime/base_rendezvous_mgr.h"
+#include "tensorflow/core/distributed_runtime/recent_request_ids.h"
+#include "tensorflow/core/distributed_runtime/request_id.h"
 #include "tensorflow/core/distributed_runtime/worker_env.h"
-#include "tensorflow/contrib/mpi/mpi_msg.pb.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
 
 #define TAG_REQTENSOR 1010
@@ -104,6 +106,7 @@ class MPIRequestTensorCall {
   void Init(const Rendezvous::ParsedKey& parsed, const int64 step_id) {
     req_.set_step_id(step_id);
     req_.set_rendezvous_key(parsed.FullKey().data(), parsed.FullKey().size());
+    req_.set_request_id(GetUniqueRequestId());
     request_buffer_size_ = req_.ByteSize();
     //   request_buffer_ = new char[request_buffer_size_];
     //  req_.SerializeToArray(request_buffer_, request_buffer_size_);
@@ -158,7 +161,8 @@ class MPIRendezvousMgr : public BaseRendezvousMgr {
  private:
   typedef std::function<MPISendTensorCall*(
       const Status&, const Rendezvous::Args&, const Rendezvous::Args&,
-      const Tensor&, const bool, MPISendTensorCall*)> MPIRecvTensorCallBack;
+      const Tensor&, const bool, MPISendTensorCall*)>
+      MPIRecvTensorCallBack;
 
   typedef std::pair<std::string, std::function<void()>> RequestQueueEntry;
   typedef std::pair<std::string, std::function<MPISendTensorCall*()>>
@@ -177,6 +181,8 @@ class MPIRendezvousMgr : public BaseRendezvousMgr {
   std::map<std::string, std::shared_ptr<MPIRequestTensorCall>> recv_tensor_map_
       GUARDED_BY(mrq_);
 
+  RecentRequestIds recv_tensor_recent_request_ids_;
+
   void AddRequest(RecvTensorRequest, const int);
   void MPIBackgroundThread();
 
diff --git a/tensorflow/contrib/mpi/mpi_server_lib.cc b/tensorflow/contrib/mpi/mpi_server_lib.cc
index d585c0565eb234655e7a1bbc92df5741e18c8f33..a31fa9ce0b3110d875689d74a41ca9f9cc85f532 100644
--- a/tensorflow/contrib/mpi/mpi_server_lib.cc
+++ b/tensorflow/contrib/mpi/mpi_server_lib.cc
@@ -22,8 +22,8 @@ limitations under the License.
 
 #include "grpc/support/alloc.h"
 
-#include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h"
+#include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/env.h"
 
diff --git a/tensorflow/contrib/mpi/mpi_utils.h b/tensorflow/contrib/mpi/mpi_utils.h
index 45e21f2b25ab4897641ffec776eb1b3c32ab9a2e..fa297c28cb47d43ba927ab941854bd472d90b465 100644
--- a/tensorflow/contrib/mpi/mpi_utils.h
+++ b/tensorflow/contrib/mpi/mpi_utils.h
@@ -18,8 +18,8 @@ limitations under the License.
 
 #ifdef TENSORFLOW_USE_MPI
 
-#include <string>
 #include <map>
+#include <string>
 #include <vector>
 
 #include "tensorflow/core/lib/strings/str_util.h"
diff --git a/tensorflow/contrib/mpi_collectives/BUILD b/tensorflow/contrib/mpi_collectives/BUILD
index 11c5d6e776d6adbf7c439012027752e2235883ab..9f9802b8fe12356c0da82ebb2b48b565cf3f7319 100644
--- a/tensorflow/contrib/mpi_collectives/BUILD
+++ b/tensorflow/contrib/mpi_collectives/BUILD
@@ -6,20 +6,9 @@ package(default_visibility = [
 
 licenses(["notice"])  # Apache 2.0
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 load(
     "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_additional_mpi_lib_defines",
     "tf_proto_library_cc",
 )
 
@@ -33,26 +22,98 @@ tf_proto_library_cc(
     ],
 )
 
-load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")
-load("//tensorflow:tensorflow.bzl", "tf_py_test")
+cc_library(
+    name = "mpi_defines",
+    defines = tf_additional_mpi_lib_defines(),
+)
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_custom_op_py_library",
+    "tf_custom_op_library",
+    "tf_gen_op_wrapper_py",
+    "tf_gen_op_libs",
+    "tf_kernel_library",
+    "tf_py_test",
+)
 
 tf_custom_op_library(
-    name = "mpi_collectives.so",
+    name = "python/ops/_mpi_ops.so",
     srcs = [
-        "mpi_ops.cc",
-        "ring.cc",
-        "ring.h",
+        "kernels/mpi_ops.cc",
+        "kernels/ring.cc",
+        "kernels/ring.h",
+        "ops/mpi_ops.cc",
     ],
     gpu_srcs = [
-        "ring.cu.cc",
-        "ring.h",
+        "kernels/ring.cu.cc",
+        "kernels/ring.h",
     ],
     deps = [
+        ":mpi_defines",
         ":mpi_message_proto_cc",
         "//third_party/mpi",
     ],
 )
 
+tf_kernel_library(
+    name = "mpi_ops_kernels",
+    srcs = [
+        "kernels/mpi_ops.cc",
+        "kernels/ring.cc",
+    ],
+    hdrs = [
+        "kernels/ring.h",
+    ],
+    gpu_srcs = [
+        "kernels/ring.cu.cc",
+    ],
+    deps = [
+        ":mpi_defines",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_headers_lib",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:proto_text",
+        "//tensorflow/core:stream_executor",
+    ],
+    # TODO: Include?    alwayslink = 1,
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["mpi_ops"],
+)
+
+tf_gen_op_wrapper_py(
+    name = "mpi_ops",
+    deps = [":mpi_ops_op_lib"],
+)
+
+tf_custom_op_py_library(
+    name = "mpi_collectives_py",
+    srcs = [
+        "__init__.py",
+        "python/ops/mpi_ops.py",
+    ],
+    dso = [
+        ":python/ops/_mpi_ops.so",
+    ],
+    kernels = [
+        ":mpi_ops_kernels",
+        ":mpi_ops_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":mpi_ops",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:device",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:util",
+    ],
+)
+
 tf_py_test(
     name = "mpi_ops_test",
     srcs = ["mpi_ops_test.py"],
@@ -61,20 +122,19 @@ tf_py_test(
         "//tensorflow/python:platform",
     ],
     data = [
-        ":mpi_collectives.so",
+        ":python/ops/_mpi_ops.so",
     ],
     tags = ["manual"],
 )
 
-py_library(
-    name = "mpi_ops_py",
-    srcs = [
-        "__init__.py",
-        "mpi_ops.py",
-    ],
-    data = [
-        ":mpi_collectives.so",
-    ],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
 )
diff --git a/tensorflow/contrib/mpi_collectives/__init__.py b/tensorflow/contrib/mpi_collectives/__init__.py
index 9ed16a6f078a506b60fd14f4356ff65a0a692203..52029cbc36a3bb77ea38d3973a75bfd37e93bfa4 100644
--- a/tensorflow/contrib/mpi_collectives/__init__.py
+++ b/tensorflow/contrib/mpi_collectives/__init__.py
@@ -37,7 +37,7 @@ for detecting the running MPI configuration.
 Example:
 
 ```python
-from tensorflow.contrib import mpi
+import tensorflow.contrib.mpi_collectives as mpi
 
 # Use `mpi.Session` instead of `tf.Session`
 with mpi.Session() as session:
@@ -48,8 +48,10 @@ with mpi.Session() as session:
         print("MPI Size:", session.run(mpi.size()))
 ```
 
-@@rank
+@@init
 @@size
+@@rank
+@@local_rank
 
 ### Ring Allreduce and Allgather
 
@@ -123,12 +125,12 @@ from __future__ import print_function
 
 import tensorflow as tf
 
-from tensorflow.contrib.mpi_collectives.mpi_ops import size
-from tensorflow.contrib.mpi_collectives.mpi_ops import rank
-from tensorflow.contrib.mpi_collectives.mpi_ops import local_rank
-from tensorflow.contrib.mpi_collectives.mpi_ops import allgather
-from tensorflow.contrib.mpi_collectives.mpi_ops import _allreduce
-from tensorflow.contrib.mpi_collectives.mpi_ops import init
+from tensorflow.contrib.mpi_collectives.python.ops.mpi_ops import init
+from tensorflow.contrib.mpi_collectives.python.ops.mpi_ops import size
+from tensorflow.contrib.mpi_collectives.python.ops.mpi_ops import rank
+from tensorflow.contrib.mpi_collectives.python.ops.mpi_ops import local_rank
+from tensorflow.contrib.mpi_collectives.python.ops.mpi_ops import allgather
+from tensorflow.contrib.mpi_collectives.python.ops.mpi_ops import _allreduce
 
 
 def allreduce(tensor, average=True):
diff --git a/tensorflow/contrib/mpi_collectives/mpi_ops.cc b/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc
similarity index 93%
rename from tensorflow/contrib/mpi_collectives/mpi_ops.cc
rename to tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc
index a051ab0004626d034071112bb37671137ca5a3f0..8dca90a1e34d6a234c2b1479ca5594e88afcc194 100644
--- a/tensorflow/contrib/mpi_collectives/mpi_ops.cc
+++ b/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc
@@ -21,7 +21,6 @@ limitations under the License.
 
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/platform/mutex.h"
 
@@ -36,8 +35,8 @@ limitations under the License.
 
 #define OMPI_SKIP_MPICXX
 #include "third_party/mpi/mpi.h"
+#include "tensorflow/contrib/mpi_collectives/kernels/ring.h"
 #include "tensorflow/contrib/mpi_collectives/mpi_message.pb.h"
-#include "tensorflow/contrib/mpi_collectives/ring.h"
 
 /*
  * MPI Allreduce and Allgather Ops for TensorFlow.
@@ -81,7 +80,7 @@ using GPUDevice = Eigen::GpuDevice;
 
 namespace tensorflow {
 namespace contrib {
-namespace mpi {
+namespace mpi_collectives {
 
 // Make sure template specializations are generated in the ring.cu.cc and the
 // ring.cc file, not in this file.
@@ -877,14 +876,6 @@ REGISTER_KERNEL_BUILDER(Name("MPIInit").Device(DEVICE_GPU),
                         MPIInitOp<GPUDevice>);
 #endif
 
-REGISTER_OP("MPIInit").Doc(R"doc(
-Initialize MPI for the current process.
-
-If this is run on a GPU, then that GPU must be used for all future MPI
-operations. If it is run on CPU, then all future MPI operations must also
-run on CPU.
-)doc");
-
 // Op to get the current MPI Size.
 template <typename Device>
 class MPISizeOp : public OpKernel {
@@ -911,21 +902,6 @@ REGISTER_KERNEL_BUILDER(Name("MPISize").Device(DEVICE_GPU).HostMemory("size"),
                         MPISizeOp<GPUDevice>);
 #endif
 
-REGISTER_OP("MPISize")
-    .Output("size: int32")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      c->set_output(0, c->Scalar());
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Returns the number of running MPI processes.
-
-More precisely, returns the number of MPI processes in the group associated
-with the MPI_COMM_WORLD communicator.
-
-size:   Size of the MPI group.
-)doc");
-
 // Op to get the current MPI Rank.
 template <typename Device>
 class MPIRankOp : public OpKernel {
@@ -952,21 +928,6 @@ REGISTER_KERNEL_BUILDER(Name("MPIRank").Device(DEVICE_GPU).HostMemory("rank"),
                         MPIRankOp<GPUDevice>);
 #endif
 
-REGISTER_OP("MPIRank")
-    .Output("rank: int32")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      c->set_output(0, c->Scalar());
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Returns the index of the current process in the MPI group.
-
-More precisely, returns the rank of the calling process in the MPI_COMM_WORLD
-communicator.
-
-rank:   Rank of the calling process.
-)doc");
-
 // Op to get the current local MPI Rank.
 template <typename Device>
 class MPILocalRankOp : public OpKernel {
@@ -994,21 +955,6 @@ REGISTER_KERNEL_BUILDER(
     MPILocalRankOp<GPUDevice>);
 #endif
 
-REGISTER_OP("MPILocalRank")
-    .Output("rank: int32")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      c->set_output(0, c->Scalar());
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Returns the index of the current process in the node it is on.
-
-More precisely, returns the rank of the calling process in communicator that
-only spans the MPI processes running on that node.
-
-rank:   Rank of the calling process on the node it is on.
-)doc");
-
 template <typename Device>
 class MPIAllreduceOp : public AsyncOpKernel {
  public:
@@ -1083,28 +1029,6 @@ REGISTER_KERNEL_BUILDER(Name("MPIAllreduce").Device(DEVICE_GPU),
                         MPIAllreduceOp<GPUDevice>);
 #endif
 
-REGISTER_OP("MPIAllreduce")
-    .Attr("T: {int32, int64, float32}")
-    .Input("tensor: T")
-    .Output("sum: T")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      c->set_output(0, c->input(0));
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Perform an MPI Allreduce on a tensor. All other processes that do a reduction
-on a tensor with the same name must have the same dimension for that tensor.
-Tensors are reduced with other tensors that have the same node name for the
-allreduce.
-
-Arguments
-    tensor:     A tensor to reduce.
-
-Output
-    sum:        A tensor with the same shape as `tensor`, summed across all
-                MPI processes.
-)doc");
-
 template <typename Device>
 class MPIAllgatherOp : public AsyncOpKernel {
  public:
@@ -1192,34 +1116,6 @@ class MPIAllgatherOp : public AsyncOpKernel {
   }
 };
 
-REGISTER_OP("MPIAllgather")
-    .Attr("T: {int32, int64, float32}")
-    .Attr("S: {int64}")
-    .Input("tensor: T")
-    .Input("sizes: S")
-    .Output("gathered: T")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle output;
-      TF_RETURN_IF_ERROR(
-          c->ReplaceDim(c->input(0), 0, c->UnknownDim(), &output));
-      c->set_output(0, output);
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Perform an MPI Allgather on a tensor. All other processes that do a gather on a
-tensor with the same name must have the same rank for that tensor, and have the
-same dimension on all but the first dimension.
-
-Arguments
-    tensor:     A tensor to gather.
-    sizes:      A tensor containing the first-dimension sizes of tensors to be
-                gathered from other ranks
-
-Output
-    gathered:   A tensor with the same shape as `tensor` except for the first
-                dimension, which is the sum of dimensions in `sizes`.
-)doc");
-
 REGISTER_KERNEL_BUILDER(
     Name("MPIAllgather").Device(DEVICE_CPU).HostMemory("sizes"),
     MPIAllgatherOp<CPUDevice>);
@@ -1229,7 +1125,7 @@ REGISTER_KERNEL_BUILDER(
     MPIAllgatherOp<GPUDevice>);
 #endif
 
-}  // namespace mpi
+}  // namespace mpi_collectives
 }  // namespace contrib
 }  // namespace tensorflow
 
diff --git a/tensorflow/contrib/mpi_collectives/ring.cc b/tensorflow/contrib/mpi_collectives/kernels/ring.cc
similarity index 96%
rename from tensorflow/contrib/mpi_collectives/ring.cc
rename to tensorflow/contrib/mpi_collectives/kernels/ring.cc
index d93233eb210b80df10fd9c2c7975ce77112d18a2..8970ceb1a206ff2f9d6e18f7d19e313b8a036042 100644
--- a/tensorflow/contrib/mpi_collectives/ring.cc
+++ b/tensorflow/contrib/mpi_collectives/kernels/ring.cc
@@ -17,11 +17,11 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "tensorflow/contrib/mpi_collectives/ring.h"
+#include "tensorflow/contrib/mpi_collectives/kernels/ring.h"
 
 namespace tensorflow {
 namespace contrib {
-namespace mpi {
+namespace mpi_collectives {
 
 using CPUDevice = Eigen::ThreadPoolDevice;
 
@@ -73,7 +73,7 @@ GENERATE_ACCUMULATE(long long);
 GENERATE_ACCUMULATE(float);
 #undef GENERATE_ACCUMULATE
 
-}  // namespace mpi
+}  // namespace mpi_collectives
 }  // namespace contrib
 }  // namespace tensorflow
 
diff --git a/tensorflow/contrib/mpi_collectives/ring.cu.cc b/tensorflow/contrib/mpi_collectives/kernels/ring.cu.cc
similarity index 97%
rename from tensorflow/contrib/mpi_collectives/ring.cu.cc
rename to tensorflow/contrib/mpi_collectives/kernels/ring.cu.cc
index 2f3eef366a9a3c10e59cd5298fc1626e1094dff8..b04abde4694199d827a1738850bded9bf696d56c 100644
--- a/tensorflow/contrib/mpi_collectives/ring.cu.cc
+++ b/tensorflow/contrib/mpi_collectives/kernels/ring.cu.cc
@@ -19,11 +19,11 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/contrib/mpi_collectives/ring.h"
+#include "tensorflow/contrib/mpi_collectives/kernels/ring.h"
 
 namespace tensorflow {
 namespace contrib {
-namespace mpi {
+namespace mpi_collectives {
 
 using CPUDevice = Eigen::ThreadPoolDevice;
 
@@ -109,7 +109,7 @@ GENERATE_ACCUMULATE(long long);
 GENERATE_ACCUMULATE(float);
 #undef GENERATE_ACCUMULATE
 
-}  // namespace mpi
+}  // namespace mpi_collectives
 }  // namespace contrib
 }  // namespace tensorflow
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/mpi_collectives/ring.h b/tensorflow/contrib/mpi_collectives/kernels/ring.h
similarity index 99%
rename from tensorflow/contrib/mpi_collectives/ring.h
rename to tensorflow/contrib/mpi_collectives/kernels/ring.h
index cae57ce60eb09509af69f8ccab9eacedea361548..1d56d588bc49eda542303ae6ebb19602352ae01d 100644
--- a/tensorflow/contrib/mpi_collectives/ring.h
+++ b/tensorflow/contrib/mpi_collectives/kernels/ring.h
@@ -37,7 +37,7 @@ limitations under the License.
 
 namespace tensorflow {
 namespace contrib {
-namespace mpi {
+namespace mpi_collectives {
 
 using CPUDevice = Eigen::ThreadPoolDevice;
 using GPUDevice = Eigen::GpuDevice;
@@ -317,7 +317,7 @@ Status RingAllgather(OpKernelContext* context, const Tensor* input,
   return Status::OK();
 }
 
-}  // namespace mpi
+}  // namespace mpi_collectives
 }  // namespace contrib
 }  // namespace tensorflow
 
diff --git a/tensorflow/contrib/mpi_collectives/mpi_message.proto b/tensorflow/contrib/mpi_collectives/mpi_message.proto
index 7fa5e203010465766b8ab9562cac010de51a7bbc..afbce981ae1bdd5ae143ba5c45a4d9790a52fafc 100644
--- a/tensorflow/contrib/mpi_collectives/mpi_message.proto
+++ b/tensorflow/contrib/mpi_collectives/mpi_message.proto
@@ -15,7 +15,7 @@ limitations under the License.
 
 syntax = "proto3";
 
-package tensorflow.contrib.mpi;
+package tensorflow.contrib.mpi_collectives;
 
 import "tensorflow/core/framework/tensor_shape.proto";
 import "tensorflow/core/framework/types.proto";
diff --git a/tensorflow/contrib/mpi_collectives/ops/mpi_ops.cc b/tensorflow/contrib/mpi_collectives/ops/mpi_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..18e6bb61cffc6471412cb4c5141655839d7ddb3a
--- /dev/null
+++ b/tensorflow/contrib/mpi_collectives/ops/mpi_ops.cc
@@ -0,0 +1,132 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef TENSORFLOW_USE_MPI
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+namespace contrib {
+namespace mpi_collectives {
+
+REGISTER_OP("MPIInit").Doc(R"doc(
+Initialize MPI for the current process.
+
+If this is run on a GPU, then that GPU must be used for all future MPI
+operations. If it is run on CPU, then all future MPI operations must also
+run on CPU.
+)doc");
+
+REGISTER_OP("MPISize")
+    .Output("size: int32")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Returns the number of running MPI processes.
+
+More precisely, returns the number of MPI processes in the group associated
+with the MPI_COMM_WORLD communicator.
+
+size:   Size of the MPI group.
+)doc");
+
+REGISTER_OP("MPIRank")
+    .Output("rank: int32")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Returns the index of the current process in the MPI group.
+
+More precisely, returns the rank of the calling process in the MPI_COMM_WORLD
+communicator.
+
+rank:   Rank of the calling process.
+)doc");
+
+REGISTER_OP("MPILocalRank")
+    .Output("rank: int32")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Returns the index of the current process in the node it is on.
+
+More precisely, returns the rank of the calling process in communicator that
+only spans the MPI processes running on that node.
+
+rank:   Rank of the calling process on the node it is on.
+)doc");
+
+REGISTER_OP("MPIAllreduce")
+    .Attr("T: {int32, int64, float32}")
+    .Input("tensor: T")
+    .Output("sum: T")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->input(0));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Perform an MPI Allreduce on a tensor. All other processes that do a reduction
+on a tensor with the same name must have the same dimension for that tensor.
+Tensors are reduced with other tensors that have the same node name for the
+allreduce.
+
+Arguments
+    tensor:     A tensor to reduce.
+
+Output
+    sum:        A tensor with the same shape as `tensor`, summed across all
+                MPI processes.
+)doc");
+
+REGISTER_OP("MPIAllgather")
+    .Attr("T: {int32, int64, float32}")
+    .Attr("S: {int64}")
+    .Input("tensor: T")
+    .Input("sizes: S")
+    .Output("gathered: T")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle output;
+      TF_RETURN_IF_ERROR(
+          c->ReplaceDim(c->input(0), 0, c->UnknownDim(), &output));
+      c->set_output(0, output);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Perform an MPI Allgather on a tensor. All other processes that do a gather on a
+tensor with the same name must have the same rank for that tensor, and have the
+same dimension on all but the first dimension.
+
+Arguments
+    tensor:     A tensor to gather.
+    sizes:      A tensor containing the first-dimension sizes of tensors to be
+                gathered from other ranks
+
+Output
+    gathered:   A tensor with the same shape as `tensor` except for the first
+                dimension, which is the sum of dimensions in `sizes`.
+)doc");
+
+}  // namespace mpi_collectives
+}  // namespace contrib
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_USE_MPI
diff --git a/tensorflow/contrib/mpi_collectives/mpi_ops.py b/tensorflow/contrib/mpi_collectives/python/ops/mpi_ops.py
similarity index 67%
rename from tensorflow/contrib/mpi_collectives/mpi_ops.py
rename to tensorflow/contrib/mpi_collectives/python/ops/mpi_ops.py
index 81567cc688ac8666c3755d5f84162a6dff869107..2fbefef0d36f6a1507827427ebbafe5e81e35ea3 100644
--- a/tensorflow/contrib/mpi_collectives/mpi_ops.py
+++ b/tensorflow/contrib/mpi_collectives/python/ops/mpi_ops.py
@@ -20,43 +20,13 @@ from __future__ import print_function
 
 import tensorflow as tf
 
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import load_library
+from tensorflow.contrib.mpi_collectives.ops import gen_mpi_ops
+from tensorflow.contrib.util import loader
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import resource_loader
-from tensorflow.python.platform import tf_logging as logging
 
-
-def _load_library(name, op_list=None):
-  """Loads a .so file containing the specified operators.
-
-  Args:
-    name: The name of the .so file to load.
-    op_list: A list of names of operators that the library should have. If None
-        then the .so file's contents will not be verified.
-
-  Raises:
-    NameError if one of the required ops is missing.
-  """
-  try:
-    filename = resource_loader.get_path_to_datafile(name)
-    library = load_library.load_op_library(filename)
-    for expected_op in (op_list or []):
-      for lib_op in library.OP_LIST.op:
-        if lib_op.name == expected_op:
-          break
-      else:
-        raise NameError(
-          'Could not find operator %s in dynamic library %s' %
-          (expected_op, name))
-    return library
-  except errors.NotFoundError:
-    logging.warning('%s file could not be loaded.', name)
-
-
-MPI_LIB = _load_library('mpi_collectives.so', ['MPISize', 'MPIRank',
-                                               'MPILocalRank', 'MPIAllgather',
-                                               'MPIAllreduce'])
+_mpi_ops_so = loader.load_op_library(
+    resource_loader.get_path_to_datafile('_mpi_ops.so'))
 
 
 def size(name=None):
@@ -68,7 +38,7 @@ def size(name=None):
   Returns:
     An integer scalar containing the number of MPI processes.
   """
-  return MPI_LIB.mpi_size(name=name)
+  return gen_mpi_ops.mpi_size(name=name)
 
 
 ops.NotDifferentiable('MPISize')
@@ -83,7 +53,7 @@ def rank(name=None):
   Returns:
     An integer scalar with the MPI rank of the calling process.
   """
-  return MPI_LIB.mpi_rank(name=name)
+  return gen_mpi_ops.mpi_rank(name=name)
 
 
 ops.NotDifferentiable('MPIRank')
@@ -95,7 +65,7 @@ def init(name=None):
   All future MPI ops must be run on the same device that the `init` op was run
   on.
   """
-  return MPI_LIB.mpi_init(name=name)
+  return gen_mpi_ops.mpi_init(name=name)
 
 
 ops.NotDifferentiable('MPIInit')
@@ -112,7 +82,7 @@ def local_rank(name=None):
   Returns:
     An integer scalar with the local MPI rank of the calling process.
   """
-  return MPI_LIB.mpi_local_rank(name=name)
+  return gen_mpi_ops.mpi_local_rank(name=name)
 
 
 ops.NotDifferentiable('MPILocalRank')
@@ -129,7 +99,7 @@ def _allreduce(tensor, name=None):
     A tensor of the same shape and type as `tensor`, summed across all
     processes.
   """
-  return MPI_LIB.mpi_allreduce(tensor, name=name)
+  return gen_mpi_ops.mpi_allreduce(tensor, name=name)
 
 
 ops.NotDifferentiable('MPIAllreduce')
@@ -151,15 +121,14 @@ def allgather(tensor, name=None):
   """
   # Specify that first allgather is to collect the tensor gather sizes,
   # indicated by passing in a scalar (0-D tensor) of value 0
-  sizes_flag = tf.constant(0, dtype=tf.int64, name="size_flag_const")
-  my_size = tf.slice(tf.shape(tensor, out_type=tf.int64), [0], [1], name="size_slice")
+  sizes_flag = tf.constant(0, dtype=tf.int64, name='size_flag_const')
+  my_size = tf.slice(
+      tf.shape(tensor, out_type=tf.int64), [0], [1], name='size_slice')
   if name is None:
-    name = "allgather"
-  sizing_name = "{}_sizing".format(name)
-  sizes = MPI_LIB.mpi_allgather(my_size, sizes_flag, name=sizing_name)
-  return MPI_LIB.mpi_allgather(tensor, sizes, name=name)
+    name = 'allgather'
+  sizing_name = '{}_sizing'.format(name)
+  sizes = gen_mpi_ops.mpi_allgather(my_size, sizes_flag, name=sizing_name)
+  return gen_mpi_ops.mpi_allgather(tensor, sizes, name=name)
 
 
 ops.NotDifferentiable('MPIAllgather')
-
-
diff --git a/tensorflow/contrib/nccl/BUILD b/tensorflow/contrib/nccl/BUILD
index df9dbb457ace32ab804f7fc736a23f5b08bd077a..5ac96007df7ee08b1e32aacd28f83768859810a9 100644
--- a/tensorflow/contrib/nccl/BUILD
+++ b/tensorflow/contrib/nccl/BUILD
@@ -23,15 +23,17 @@ load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 tf_custom_op_library(
     name = "python/ops/_nccl_ops.so",
     srcs = [
+        "ops/nccl_ops.cc",
+    ],
+    gpu_srcs = [
         "kernels/nccl_manager.cc",
         "kernels/nccl_manager.h",
         "kernels/nccl_ops.cc",
-        "ops/nccl_ops.cc",
     ],
-    deps = [
-        "//tensorflow/core:gpu_headers_lib",
+    deps = if_cuda([
         "@nccl_archive//:nccl",
-    ],
+        "//tensorflow/core:gpu_headers_lib",
+    ]),
 )
 
 tf_cuda_cc_test(
@@ -52,17 +54,14 @@ tf_cuda_cc_test(
         "no_oss",
         "notap",
     ],
-    deps = if_cuda(
+    deps =
         [
-            "@nccl_archive//:nccl",
             "//tensorflow/core:cuda",
+            "//tensorflow/core:test",
+            "//tensorflow/core:test_main",
+            "//tensorflow/core:testlib",
+            "@nccl_archive//:nccl",
         ],
-        [],
-    ) + [
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ],
 )
 
 tf_kernel_library(
@@ -103,11 +102,8 @@ tf_custom_op_py_library(
         "__init__.py",
         "python/ops/nccl_ops.py",
     ],
-    dso = [
-        ":python/ops/_nccl_ops.so",
-    ],
-    kernels = [
-        ":nccl_kernels",
+    dso = [":python/ops/_nccl_ops.so"],
+    kernels = if_cuda([":nccl_kernels"]) + [
         ":nccl_ops_op_lib",
     ],
     srcs_version = "PY2AND3",
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.cc b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
index 31a35b0d53309bc2930b8a6f1b9d6a817b4a911e..913935b38246f1c5c0f7da4c1ea1f986bc00891b 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
@@ -258,9 +258,37 @@ NcclManager::Communicator* NcclManager::GetCommunicator(
     devices[i] = collective->participants[i]->gpu_device_id;
   }
 
+  int device_count = num_devices;
+#if NCCL_MAJOR >= 2
+  // NCCL2 prevents InitAll for more communicators than devices (but doesn't
+  // check that device ids are unique). Work around it by initializing each
+  // rank individually.
+  cudaGetDeviceCount(&device_count);
+#endif
   std::vector<ncclComm_t> nccl_comms(num_devices);
-  auto result = ncclCommInitAll(nccl_comms.data(), num_devices, devices.data());
-  CHECK_EQ(result, ncclSuccess) << ncclGetErrorString(result);
+  if (num_devices <= device_count) {
+    auto result =
+        ncclCommInitAll(nccl_comms.data(), num_devices, devices.data());
+    CHECK_EQ(result, ncclSuccess) << ncclGetErrorString(result);
+  } else {
+    int savedDevice = 0;
+    CHECK_EQ(cudaGetDevice(&savedDevice), cudaSuccess);
+    ncclUniqueId commId;
+    ncclGetUniqueId(&commId);
+#if NCCL_MAJOR >= 2
+    CHECK_EQ(ncclGroupStart(), ncclSuccess);
+#endif
+    for (int rank = 0; rank < num_devices; ++rank) {
+      cudaSetDevice(devices[rank]);
+      auto result =
+          ncclCommInitRank(nccl_comms.data() + rank, num_devices, commId, rank);
+      CHECK_EQ(result, ncclSuccess) << ncclGetErrorString(result);
+    }
+#if NCCL_MAJOR >= 2
+    CHECK_EQ(ncclGroupEnd(), ncclSuccess);
+#endif
+    cudaSetDevice(savedDevice);
+  }
   for (int rank = 0; rank < num_devices; ++rank) {
     members[rank].nccl_comm = nccl_comms[rank];
   }
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.h b/tensorflow/contrib/nccl/kernels/nccl_manager.h
index cb1719c3be6a5c042db6e258d68663e70bfbfa15..bb219e0edc8a2c4ba0ce0583cbe4018a4fa3a1d1 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.h
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NCCL_COMMUNICATOR_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NCCL_COMMUNICATOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_NCCL_COMMUNICATOR_H_
+#define TENSORFLOW_CORE_KERNELS_NCCL_COMMUNICATOR_H_
 
 #ifdef GOOGLE_CUDA
 
@@ -136,4 +136,4 @@ class NcclManager {
 
 #endif  // GOOGLE_CUDA
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NCCL_COMMUNICATOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_NCCL_COMMUNICATOR_H_
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc b/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
index 505c4b0d71028c64b5075cff7ea010597b4263b3..985b2bae2566c38dfb2c71a899e4b03bbb8fa55d 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #ifdef GOOGLE_CUDA
 
 #include <algorithm>
+#include <random>
 #include <vector>
 
 #include "tensorflow/contrib/nccl/kernels/nccl_manager.h"
@@ -30,6 +31,8 @@ namespace tensorflow {
 static std::vector<BaseGPUDevice*> GetGPUDevices() {
   std::vector<Device*> devices;
   SessionOptions session_options;
+  session_options.config.mutable_gpu_options()
+      ->set_per_process_gpu_memory_fraction(0.1);
   session_options.env = Env::Default();
   Status s = DeviceFactory::GetFactory(DEVICE_GPU)
                  ->AddDevices(session_options, "", &devices);
@@ -173,7 +176,7 @@ class NcclManagerTest : public ::testing::Test {
       auto out_gpu_mem = AsDeviceMemory(out_gpu.flat<float>().data());
       stream->ThenMemcpy(out_cpu.flat<float>().data(), out_gpu_mem,
                          out_cpu.TotalBytes());
-      stream->BlockHostUntilDone();
+      SE_ASSERT_OK(stream->BlockHostUntilDone());
       test::ExpectTensorEqual<float>(test_case->expected, out_cpu);
     }
   }
@@ -234,10 +237,11 @@ TEST_F(NcclManagerTest, MultipleCallers) {
     for (int i = 0; i < num_ranks; ++i) {
       auto* device = devices->at(i % devices->size());
       auto* stream = device->tensorflow_gpu_device_info()->stream;
-      stream->BlockHostUntilDone();
+      SE_ASSERT_OK(stream->BlockHostUntilDone());
     }
 
-    std::random_shuffle(case_and_device_num.begin(), case_and_device_num.end());
+    std::shuffle(case_and_device_num.begin(), case_and_device_num.end(),
+                 std::mt19937(std::random_device()()));
 
     mutex mu;  // guards case_and_device_num.
     std::unique_ptr<thread::ThreadPool> pool(
diff --git a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
index bad0abd44cc507c6ebbe4481f80b8cafd8480322..98fe394c5b38294700617591992d3207b0a4706b 100644
--- a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
+++ b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
@@ -77,10 +77,6 @@ class NcclTestCase(test.TestCase):
       # same communicator across multiple sessions.
       with self.test_session(use_gpu=True) as sess:
 
-        # Check GPU availability *after* creating test session, see b/68975239.
-        if not test.is_gpu_available():
-          return  # Test requires access to a GPU
-
         for devices in device_sets:
           shape = (3, 4)
           random = (np.random.random_sample(shape) - .5) * 1024
@@ -100,6 +96,11 @@ class NcclTestCase(test.TestCase):
 
           result_tensors = [array_ops.identity(t) for t in reduce_tensors]
 
+          # Check GPU availability *after* creating session, see b/68975239.
+          if not test.is_gpu_available():
+            # If no GPU is available, only test graph construction.
+            continue
+
           # Test execution and results.
           for t in sess.run(result_tensors):
             self.assertAllClose(t, np_ans)
@@ -114,6 +115,7 @@ class NcclTestCase(test.TestCase):
       numpy_fn: A function taking two tensors and returning the gradient of the
           reduction of the two.
     """
+
     def _Gradient(tensors, devices):
       inputs = [array_ops.placeholder(t.dtype, t.shape) for t in tensors]
       reduce_tensors = nccl_reduce(inputs, devices)
@@ -164,12 +166,17 @@ class BroadcastTest(NcclTestCase):
                (['/device:GPU:0', '/device:GPU:0'],))
 
   def testBroadcastToCpuError(self):
-    # Broadcasts to CPU is not supported.
-    with self.assertRaisesRegexp(
-        errors.NotFoundError,
-        "No registered '_NcclBroadcastRecv' OpKernel for CPU devices"):
+    try:
+      # Broadcasts to CPU is not supported.
       self._Test(_NcclBroadcast, lambda x, y: x,
                  (['/device:GPU:0', '/device:CPU:0'],))
+    except errors.NotFoundError as e:
+      self.assertRegexpMatches(
+          str(e), "No registered '_NcclBroadcastRecv' OpKernel for CPU devices")
+    else:
+      # Session isn't executed when no GPU is available.
+      if test.is_gpu_available():
+        self.fail("Didn't raise NotFoundError trying to broadcast to CPU")
 
 
 class CombinedTest(NcclTestCase):
diff --git a/tensorflow/contrib/ndlstm/BUILD b/tensorflow/contrib/ndlstm/BUILD
deleted file mode 100644
index 8403f841884d4640ce8156ff4db46868dbe1788c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/ndlstm/BUILD
+++ /dev/null
@@ -1,92 +0,0 @@
-# Description:
-#   Contains classes implementing 1D and 2D LSTMs for image and signal
-#   processing problems.
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-package(default_visibility = ["//tensorflow:__subpackages__"])
-
-load("//tensorflow:tensorflow.bzl", "tf_py_test")
-
-py_library(
-    name = "ndlstm",
-    srcs = [
-        "__init__.py",
-        "python/__init__.py",
-        "python/lstm1d.py",
-        "python/lstm2d.py",
-        "python/misc.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/framework:framework_py",
-        "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/contrib/rnn:rnn_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:nn_ops",
-        "//tensorflow/python:ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:rnn",
-        "//tensorflow/python:rnn_cell",
-        "//tensorflow/python:sparse_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-    ],
-)
-
-tf_py_test(
-    name = "lstm1d_test",
-    srcs = ["python/lstm1d_test.py"],
-    additional_deps = [
-        ":ndlstm",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:variables",
-    ],
-)
-
-tf_py_test(
-    name = "lstm2d_test",
-    srcs = ["python/lstm2d_test.py"],
-    additional_deps = [
-        ":ndlstm",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:variables",
-    ],
-)
-
-tf_py_test(
-    name = "misc_test",
-    srcs = ["python/misc_test.py"],
-    additional_deps = [
-        ":ndlstm",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:variables",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/ndlstm/README.md b/tensorflow/contrib/ndlstm/README.md
deleted file mode 100644
index 7ccb57f1b34a24af7d776f7dbb12a2a00bb5ca30..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/ndlstm/README.md
+++ /dev/null
@@ -1,31 +0,0 @@
-Library of multidimensional LSTM models and related code.
-
-# 2D LSTM code
-
-The 2D LSTM layers take tensors of the form (batch_size, height, width,
-depth), compatible with convolutional layers, as inputs. The library
-transposes and reshapes these tensors in a way that allows batches of
-images to be processed by LSTMs.
-
-The library currently provides:
-
- - a separable 2D LSTM layer
- - a simple 2D convolutional layer that can be swapped out against 2D LSTM
- - layers to reduce images to sequences and images to final state vectors
- - layers for sequence classification, pixel-wise classification
-
-# Other Dimensions
-
-There is 1D LSTM code in `lstm1d.py`. This code implements 1D LSTM versions
-suitable as a basis for higher dimensional LSTMs. It is intended for constant
-batch size and uses a different layout.  Although the code is perfectly fine for
-1D use, you may find other 1D LSTM implementations to be more convenient if you
-are interested in sequence problems.
-
-# Upcoming Changes
-
- - PyramidLSTM
- - support for 3D and 4D
- - optional use of native fused LSTM op
- - easy-to-use command line drivers and examples
- - operators for patch-wise processing
diff --git a/tensorflow/contrib/ndlstm/python/lstm1d.py b/tensorflow/contrib/ndlstm/python/lstm1d.py
deleted file mode 100644
index d3c3531f405a74d89ce736dae0134939e189f7ae..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/ndlstm/python/lstm1d.py
+++ /dev/null
@@ -1,192 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""LSTM layers for sequences."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from six.moves import xrange  # pylint: disable=redefined-builtin
-from tensorflow.contrib.framework.python.ops import variables
-from tensorflow.python.framework import constant_op
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import rnn
-from tensorflow.python.ops import rnn_cell
-from tensorflow.python.ops import variable_scope
-
-
-def _shape(tensor):
-  return tensor.get_shape().as_list()
-
-
-def ndlstm_base_unrolled(inputs, noutput, scope=None, reverse=False):
-  """Run an LSTM, either forward or backward.
-
-  This is a 1D LSTM implementation using unrolling and the TensorFlow
-  LSTM op.
-
-  Args:
-    inputs: input sequence (length, batch_size, ninput)
-    noutput: depth of output
-    scope: optional scope name
-    reverse: run LSTM in reverse
-
-  Returns:
-    Output sequence (length, batch_size, noutput)
-
-  """
-  with variable_scope.variable_scope(scope, "SeqLstmUnrolled", [inputs]):
-    length, batch_size, _ = _shape(inputs)
-    lstm_cell = rnn_cell.BasicLSTMCell(noutput, state_is_tuple=False)
-    state = array_ops.zeros([batch_size, lstm_cell.state_size])
-    output_u = []
-    inputs_u = array_ops.unstack(inputs)
-    if reverse:
-      inputs_u = list(reversed(inputs_u))
-    for i in xrange(length):
-      if i > 0:
-        variable_scope.get_variable_scope().reuse_variables()
-      output, state = lstm_cell(inputs_u[i], state)
-      output_u += [output]
-    if reverse:
-      output_u = list(reversed(output_u))
-    outputs = array_ops.stack(output_u)
-    return outputs
-
-
-def ndlstm_base_dynamic(inputs, noutput, scope=None, reverse=False):
-  """Run an LSTM, either forward or backward.
-
-  This is a 1D LSTM implementation using dynamic_rnn and
-  the TensorFlow LSTM op.
-
-  Args:
-    inputs: input sequence (length, batch_size, ninput)
-    noutput: depth of output
-    scope: optional scope name
-    reverse: run LSTM in reverse
-
-  Returns:
-    Output sequence (length, batch_size, noutput)
-  """
-  with variable_scope.variable_scope(scope, "SeqLstm", [inputs]):
-    # TODO(tmb) make batch size, sequence_length dynamic
-    # example: sequence_length = tf.shape(inputs)[0]
-    _, batch_size, _ = _shape(inputs)
-    lstm_cell = rnn_cell.BasicLSTMCell(noutput, state_is_tuple=False)
-    state = array_ops.zeros([batch_size, lstm_cell.state_size])
-    sequence_length = int(inputs.get_shape()[0])
-    sequence_lengths = math_ops.to_int64(
-        array_ops.fill([batch_size], sequence_length))
-    if reverse:
-      inputs = array_ops.reverse_v2(inputs, [0])
-    outputs, _ = rnn.dynamic_rnn(
-        lstm_cell, inputs, sequence_lengths, state, time_major=True)
-    if reverse:
-      outputs = array_ops.reverse_v2(outputs, [0])
-    return outputs
-
-
-def ndlstm_base(inputs, noutput, scope=None, reverse=False, dynamic=True):
-  """Implements a 1D LSTM, either forward or backward.
-
-  This is a base case for multidimensional LSTM implementations, which
-  tend to be used differently from sequence-to-sequence
-  implementations.  For general 1D sequence to sequence
-  transformations, you may want to consider another implementation
-  from TF slim.
-
-  Args:
-    inputs: input sequence (length, batch_size, ninput)
-    noutput: depth of output
-    scope: optional scope name
-    reverse: run LSTM in reverse
-    dynamic: use dynamic_rnn
-
-  Returns:
-    Output sequence (length, batch_size, noutput)
-
-  """
-  # TODO(tmb) maybe add option for other LSTM implementations, like
-  # slim.rnn.basic_lstm_cell
-  if dynamic:
-    return ndlstm_base_dynamic(inputs, noutput, scope=scope, reverse=reverse)
-  else:
-    return ndlstm_base_unrolled(inputs, noutput, scope=scope, reverse=reverse)
-
-
-def sequence_to_final(inputs, noutput, scope=None, name=None, reverse=False):
-  """Run an LSTM across all steps and returns only the final state.
-
-  Args:
-    inputs: (length, batch_size, depth) tensor
-    noutput: size of output vector
-    scope: optional scope name
-    name: optional name for output tensor
-    reverse: run in reverse
-
-  Returns:
-    Batch of size (batch_size, noutput).
-  """
-  with variable_scope.variable_scope(scope, "SequenceToFinal", [inputs]):
-    length, batch_size, _ = _shape(inputs)
-    lstm = rnn_cell.BasicLSTMCell(noutput, state_is_tuple=False)
-    state = array_ops.zeros([batch_size, lstm.state_size])
-    inputs_u = array_ops.unstack(inputs)
-    if reverse:
-      inputs_u = list(reversed(inputs_u))
-    for i in xrange(length):
-      if i > 0:
-        variable_scope.get_variable_scope().reuse_variables()
-      output, state = lstm(inputs_u[i], state)
-    outputs = array_ops.reshape(output, [batch_size, noutput], name=name)
-    return outputs
-
-
-def sequence_softmax(inputs, noutput, scope=None, name=None, linear_name=None):
-  """Run a softmax layer over all the time steps of an input sequence.
-
-  Args:
-    inputs: (length, batch_size, depth) tensor
-    noutput: output depth
-    scope: optional scope name
-    name: optional name for output tensor
-    linear_name: name for linear (pre-softmax) output
-
-  Returns:
-    A tensor of size (length, batch_size, noutput).
-
-  """
-  length, _, ninputs = _shape(inputs)
-  inputs_u = array_ops.unstack(inputs)
-  output_u = []
-  with variable_scope.variable_scope(scope, "SequenceSoftmax", [inputs]):
-    initial_w = random_ops.truncated_normal([0 + ninputs, noutput], stddev=0.1)
-    initial_b = constant_op.constant(0.1, shape=[noutput])
-    w = variables.model_variable("weights", initializer=initial_w)
-    b = variables.model_variable("biases", initializer=initial_b)
-    for i in xrange(length):
-      with variable_scope.variable_scope(scope, "SequenceSoftmaxStep",
-                                         [inputs_u[i]]):
-        # TODO(tmb) consider using slim.fully_connected(...,
-        # activation_fn=tf.nn.softmax)
-        linear = nn_ops.xw_plus_b(inputs_u[i], w, b, name=linear_name)
-        output = nn_ops.softmax(linear)
-        output_u += [output]
-    outputs = array_ops.stack(output_u, name=name)
-  return outputs
diff --git a/tensorflow/contrib/ndlstm/python/lstm1d_test.py b/tensorflow/contrib/ndlstm/python/lstm1d_test.py
deleted file mode 100644
index 49b15cc814cc54aaea7c67c4e509e5aa144e063e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/ndlstm/python/lstm1d_test.py
+++ /dev/null
@@ -1,106 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for 1D LSTM."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.ndlstm.python import lstm1d as lstm1d_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.ops import gradient_checker
-from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-
-lstm1d = lstm1d_lib
-
-
-def _rand(*size):
-  return np.random.uniform(size=size).astype("f")
-
-
-class Lstm1DTest(test.TestCase):
-
-  def testSequenceToSequenceDims(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(17, 1, 5))
-      outputs = lstm1d.ndlstm_base(inputs, 8)
-      variables.global_variables_initializer().run()
-      names = [v.name for v in variables.trainable_variables()]
-      self.assertEqual(len(names), 2)
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (17, 1, 8))
-
-  def testSequenceToSequenceGradient(self):
-    with self.test_session():
-      size = (17, 1, 15)
-      output_size = (17, 1, 8)
-      inputs = constant_op.constant(_rand(*size))
-      outputs = lstm1d.ndlstm_base(inputs, 8, dynamic=False)
-      variables.global_variables_initializer().run()
-      gradients = gradients_impl.gradients(outputs, inputs)
-      if 1:  # pylint: disable=using-constant-test
-        gradients = gradients_impl.gradients(outputs, inputs)[0].eval()
-        self.assertEqual(gradients.shape, size)
-      else:
-        # TODO(tmb) tf.test.compute_gradient error is currently broken
-        # with dynamic_rnn. Enable this test case eventually.
-        err = gradient_checker.compute_gradient_error(
-            inputs, size, outputs, output_size, delta=1e-4)
-        self.assert_(not np.isnan(err))
-        self.assert_(err < 0.1)
-
-  def testSequenceToSequenceGradientReverse(self):
-    with self.test_session():
-      size = (17, 1, 15)
-      output_size = (17, 1, 8)
-      inputs = constant_op.constant(_rand(*size))
-      outputs = lstm1d.ndlstm_base(inputs, 8, reverse=1, dynamic=False)
-      variables.global_variables_initializer().run()
-      if 1:  # pylint: disable=using-constant-test
-        gradients = gradients_impl.gradients(outputs, inputs)[0].eval()
-        self.assertEqual(gradients.shape, size)
-      else:
-        # TODO(tmb) tf.test.compute_gradient error is currently broken
-        # with dynamic_rnn. Enable this test case eventually.
-        err = gradient_checker.compute_gradient_error(
-            inputs, size, outputs, output_size, delta=1e-4)
-        self.assert_(not np.isnan(err))
-        self.assert_(err < 0.1)
-
-  def testSequenceToFinalDims(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(17, 6, 5))
-      outputs = lstm1d.sequence_to_final(inputs, 8)
-      variables.global_variables_initializer().run()
-      names = [v.name for v in variables.trainable_variables()]
-      self.assertEqual(len(names), 2)
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (6, 8))
-
-  def testSequenceSoftmaxDims(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(17, 1, 5))
-      outputs = lstm1d.sequence_softmax(inputs, 8)
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (17, 1, 8))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/ndlstm/python/lstm2d.py b/tensorflow/contrib/ndlstm/python/lstm2d.py
deleted file mode 100644
index ebbb4ccf11b219e86578d05e99a7a02ebe08271e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/ndlstm/python/lstm2d.py
+++ /dev/null
@@ -1,213 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""A small library of functions dealing with LSTMs applied to images.
-
-Tensors in this library generally have the shape (num_images, height, width,
-depth).
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.ndlstm.python import lstm1d
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import variable_scope
-
-
-def _shape(tensor):
-  """Get the shape of a tensor as an int list."""
-  return tensor.get_shape().as_list()
-
-
-def images_to_sequence(tensor):
-  """Convert a batch of images into a batch of sequences.
-
-  Args:
-    tensor: a (num_images, height, width, depth) tensor
-
-  Returns:
-    (width, num_images*height, depth) sequence tensor
-  """
-
-  num_image_batches, height, width, depth = _shape(tensor)
-  transposed = array_ops.transpose(tensor, [2, 0, 1, 3])
-  return array_ops.reshape(transposed,
-                           [width, num_image_batches * height, depth])
-
-
-def sequence_to_images(tensor, num_image_batches):
-  """Convert a batch of sequences into a batch of images.
-
-  Args:
-    tensor: (num_steps, num_batches, depth) sequence tensor
-    num_image_batches: the number of image batches
-
-  Returns:
-    (num_images, height, width, depth) tensor
-  """
-
-  width, num_batches, depth = _shape(tensor)
-  height = num_batches // num_image_batches
-  reshaped = array_ops.reshape(tensor,
-                               [width, num_image_batches, height, depth])
-  return array_ops.transpose(reshaped, [1, 2, 0, 3])
-
-
-def horizontal_lstm(images, num_filters_out, scope=None):
-  """Run an LSTM bidirectionally over all the rows of each image.
-
-  Args:
-    images: (num_images, height, width, depth) tensor
-    num_filters_out: output depth
-    scope: optional scope name
-
-  Returns:
-    (num_images, height, width, num_filters_out) tensor, where
-    num_steps is width and new num_batches is num_image_batches * height
-  """
-  with variable_scope.variable_scope(scope, "HorizontalLstm", [images]):
-    batch_size, _, _, _ = _shape(images)
-    sequence = images_to_sequence(images)
-    with variable_scope.variable_scope("lr"):
-      hidden_sequence_lr = lstm1d.ndlstm_base(sequence, num_filters_out // 2)
-    with variable_scope.variable_scope("rl"):
-      hidden_sequence_rl = (lstm1d.ndlstm_base(
-          sequence, num_filters_out - num_filters_out // 2, reverse=1))
-    output_sequence = array_ops.concat([hidden_sequence_lr, hidden_sequence_rl],
-                                       2)
-    output = sequence_to_images(output_sequence, batch_size)
-    return output
-
-
-def get_blocks(images, kernel_size):
-  """Split images in blocks
-
-  Args:
-    images: (num_images, height, width, depth) tensor
-    kernel_size: A list of length 2 holding the [kernel_height, kernel_width] of
-      of the pooling. Can be an int if both values are the same.
-
-  Returns:
-    (num_images, height/kernel_height, width/kernel_width,
-    depth*kernel_height*kernel_width) tensor
-  """
-  with variable_scope.variable_scope("image_blocks"):
-    batch_size, height, width, chanels = _shape(images)
-
-    if height % kernel_size[0] != 0:
-      offset = array_ops.zeros([batch_size,
-                                kernel_size[0] - (height % kernel_size[0]),
-                                width,
-                                chanels])
-      images = array_ops.concat([images, offset], 1)
-      batch_size, height, width, chanels = _shape(images)
-    if width % kernel_size[1] != 0:
-      offset = array_ops.zeros([batch_size,
-                                height,
-                                kernel_size[1] - (width % kernel_size[1]),
-                                chanels])
-      images = array_ops.concat([images, offset], 2)
-      batch_size, height, width, chanels = _shape(images)
-
-    h, w = int(height / kernel_size[0]), int(width / kernel_size[1])
-    features = kernel_size[1] * kernel_size[0] * chanels
-
-    lines = array_ops.split(images, h, axis=1)
-    line_blocks = []
-    for line in lines:
-      line = array_ops.transpose(line, [0, 2, 3, 1])
-      line = array_ops.reshape(line, [batch_size, w, features])
-      line_blocks.append(line)
-
-    return array_ops.stack(line_blocks, axis=1)
-
-
-def separable_lstm(images, num_filters_out,
-                   kernel_size=None, nhidden=None, scope=None):
-  """Run bidirectional LSTMs first horizontally then vertically.
-
-  Args:
-    images: (num_images, height, width, depth) tensor
-    num_filters_out: output layer depth
-    kernel_size: A list of length 2 holding the [kernel_height, kernel_width] of
-      of the pooling. Can be an int if both values are the same. Set to None for
-      not using blocks
-    nhidden: hidden layer depth
-    scope: optional scope name
-
-  Returns:
-    (num_images, height/kernel_height, width/kernel_width,
-    num_filters_out) tensor
-  """
-  with variable_scope.variable_scope(scope, "SeparableLstm", [images]):
-    if nhidden is None:
-      nhidden = num_filters_out
-    if kernel_size is not None:
-      images = get_blocks(images, kernel_size)
-    hidden = horizontal_lstm(images, nhidden)
-    with variable_scope.variable_scope("vertical"):
-      transposed = array_ops.transpose(hidden, [0, 2, 1, 3])
-      output_transposed = horizontal_lstm(transposed, num_filters_out)
-    output = array_ops.transpose(output_transposed, [0, 2, 1, 3])
-    return output
-
-
-def reduce_to_sequence(images, num_filters_out, scope=None):
-  """Reduce an image to a sequence by scanning an LSTM vertically.
-
-  Args:
-    images: (num_images, height, width, depth) tensor
-    num_filters_out: output layer depth
-    scope: optional scope name
-
-  Returns:
-    A (width, num_images, num_filters_out) sequence.
-  """
-  with variable_scope.variable_scope(scope, "ReduceToSequence", [images]):
-    batch_size, height, width, depth = _shape(images)
-    transposed = array_ops.transpose(images, [1, 0, 2, 3])
-    reshaped = array_ops.reshape(transposed,
-                                 [height, batch_size * width, depth])
-    reduced = lstm1d.sequence_to_final(reshaped, num_filters_out)
-    output = array_ops.reshape(reduced, [batch_size, width, num_filters_out])
-    return output
-
-
-def reduce_to_final(images, num_filters_out, nhidden=None, scope=None):
-  """Reduce an image to a final state by running two LSTMs.
-
-  Args:
-    images: (num_images, height, width, depth) tensor
-    num_filters_out: output layer depth
-    nhidden: hidden layer depth (defaults to num_filters_out)
-    scope: optional scope name
-
-  Returns:
-    A (num_images, num_filters_out) batch.
-  """
-  with variable_scope.variable_scope(scope, "ReduceToFinal", [images]):
-    nhidden = nhidden or num_filters_out
-    batch_size, height, width, depth = _shape(images)
-    transposed = array_ops.transpose(images, [1, 0, 2, 3])
-    reshaped = array_ops.reshape(transposed,
-                                 [height, batch_size * width, depth])
-    with variable_scope.variable_scope("reduce1"):
-      reduced = lstm1d.sequence_to_final(reshaped, nhidden)
-      transposed_hidden = array_ops.reshape(reduced,
-                                            [batch_size, width, nhidden])
-      hidden = array_ops.transpose(transposed_hidden, [1, 0, 2])
-    with variable_scope.variable_scope("reduce2"):
-      output = lstm1d.sequence_to_final(hidden, num_filters_out)
-    return output
diff --git a/tensorflow/contrib/ndlstm/python/lstm2d_test.py b/tensorflow/contrib/ndlstm/python/lstm2d_test.py
deleted file mode 100644
index f1b37d701b868438dcbac4e713ccc2136dacd983..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/ndlstm/python/lstm2d_test.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for 2D LSTMs."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.ndlstm.python import lstm2d as lstm2d_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-
-lstm2d = lstm2d_lib
-
-
-def _rand(*size):
-  return np.random.uniform(size=size).astype("f")
-
-
-class Lstm2DTest(test_util.TensorFlowTestCase):
-
-  def testImagesToSequenceDims(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(2, 7, 11, 5))
-      outputs = lstm2d.images_to_sequence(inputs)
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (11, 14, 5))
-
-  def testSequenceToImagesDims(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(11, 14, 5))
-      outputs = lstm2d.sequence_to_images(inputs, 2)
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (2, 7, 11, 5))
-
-  def testImagesAndSequenceDims(self):
-    with self.test_session():
-      size = (2, 7, 11, 5)
-      inputs = constant_op.constant(_rand(*size))
-      sequence = lstm2d.images_to_sequence(inputs)
-      outputs = lstm2d.sequence_to_images(sequence, size[0])
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), size)
-
-  def testSeparableLstmDims(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(2, 7, 11, 5))
-      outputs = lstm2d.separable_lstm(inputs, 8)
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (2, 7, 11, 8))
-
-  def testSeparableLstmDimsBlocks(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(2, 7, 11, 5))
-      outputs = lstm2d.separable_lstm(inputs, 8, kernel_size=[2, 2])
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (2, 4, 6, 8))
-
-  def testReduceToSequenceDims(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(2, 7, 11, 5))
-      outputs = lstm2d.reduce_to_sequence(inputs, 8)
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (2, 11, 8))
-
-  def testReduceToFinalDims(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(2, 7, 11, 5))
-      outputs = lstm2d.reduce_to_final(inputs, 8, 12)
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (2, 8))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/ndlstm/python/misc.py b/tensorflow/contrib/ndlstm/python/misc.py
deleted file mode 100644
index 38eeff84ca4e5afbe45d6c9e0c52af9ae86de24f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/ndlstm/python/misc.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Miscellaneous functions useful for nD-LSTM models.
-
-Some of these functions duplicate functionality in tfslim with
-slightly different interfaces.
-
-Tensors in this library generally have the shape (num_images, height, width,
-depth).
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.layers.python.layers import layers
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import sparse_ops
-
-
-def _shape(tensor):
-  """Get the shape of a tensor as an int list."""
-  return tensor.get_shape().as_list()
-
-
-def pixels_as_vector(images, scope=None):
-  """Reduce images to vectors by combining all pixels."""
-  with ops.name_scope(scope, "PixelsAsVector", [images]):
-    batch_size, height, width, depth = _shape(images)
-    return array_ops.reshape(images, [batch_size, height * width * depth])
-
-
-def pool_as_vector(images, scope=None):
-  """Reduce images to vectors by averaging all pixels."""
-  with ops.name_scope(scope, "PoolAsVector", [images]):
-    return math_ops.reduce_mean(images, [1, 2])
-
-
-def one_hot_planes(labels, num_classes, scope=None):
-  """Compute 1-hot encodings for planes.
-
-  Given a label, this computes a label image that contains
-  1 at all pixels in the plane corresponding to the target
-  class and 0 in all other planes.
-
-  Args:
-    labels: (batch_size,) tensor
-    num_classes: number of classes
-    scope: optional scope name
-
-  Returns:
-    Tensor of shape (batch_size, 1, 1, num_classes) with a 1-hot encoding.
-  """
-  with ops.name_scope(scope, "OneHotPlanes", [labels]):
-    batch_size, = _shape(labels)
-    batched = layers.one_hot_encoding(labels, num_classes)
-    return array_ops.reshape(batched, [batch_size, 1, 1, num_classes])
-
-
-def one_hot_mask(labels, num_classes, scope=None):
-  """Compute 1-hot encodings for masks.
-
-  Given a label image, this computes the one hot encoding at
-  each pixel.
-
-  Args:
-    labels: (batch_size, width, height, 1) tensor containing labels.
-    num_classes: number of classes
-    scope: optional scope name
-
-  Returns:
-    Tensor of shape (batch_size, width, height, num_classes) with
-    a 1-hot encoding.
-  """
-  with ops.name_scope(scope, "OneHotMask", [labels]):
-    height, width, depth = _shape(labels)
-    assert depth == 1
-    sparse_labels = math_ops.to_int32(array_ops.reshape(labels, [-1, 1]))
-    sparse_size, _ = _shape(sparse_labels)
-    indices = array_ops.reshape(math_ops.range(0, sparse_size, 1), [-1, 1])
-    concated = array_ops.concat([indices, sparse_labels], 1)
-    dense_result = sparse_ops.sparse_to_dense(concated,
-                                              [sparse_size, num_classes], 1.0,
-                                              0.0)
-    result = array_ops.reshape(dense_result, [height, width, num_classes])
-    return result
diff --git a/tensorflow/contrib/ndlstm/python/misc_test.py b/tensorflow/contrib/ndlstm/python/misc_test.py
deleted file mode 100644
index fac9023da3b23b89a5494358c6e7ad82c12f9bdf..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/ndlstm/python/misc_test.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Miscellaneous tests."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.ndlstm.python import misc as misc_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-
-misc = misc_lib
-
-
-def _rand(*size):
-  return np.random.uniform(size=size).astype("f")
-
-
-class LstmMiscTest(test_util.TensorFlowTestCase):
-
-  def testPixelsAsVectorDims(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(2, 7, 11, 5))
-      outputs = misc.pixels_as_vector(inputs)
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (2, 7 * 11 * 5))
-
-  def testPoolAsVectorDims(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(2, 7, 11, 5))
-      outputs = misc.pool_as_vector(inputs)
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (2, 5))
-
-  def testOneHotPlanes(self):
-    with self.test_session():
-      inputs = constant_op.constant([0, 1, 3])
-      outputs = misc.one_hot_planes(inputs, 4)
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (3, 1, 1, 4))
-      target = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1]])
-      self.assertAllClose(result.reshape(-1), target.reshape(-1))
-
-  def testOneHotMask(self):
-    with self.test_session():
-      data = np.array([[0, 1, 2], [2, 0, 1]]).reshape(2, 3, 1)
-      inputs = constant_op.constant(data)
-      outputs = misc.one_hot_mask(inputs, 3)
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (2, 3, 3))
-      target = np.array([[[1, 0, 0], [0, 1, 0]], [[0, 1, 0], [0, 0, 1]],
-                         [[0, 0, 1], [1, 0, 0]]]).transpose(1, 2, 0)
-      self.assertAllClose(result.reshape(-1), target.reshape(-1))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/nearest_neighbor/kernels/heap.h b/tensorflow/contrib/nearest_neighbor/kernels/heap.h
index 6e33a574e25d39a13a256383cbc9848fdb8b788f..a2dbb8052bfa1634d27c8b38a9bb6ca27fae42a2 100644
--- a/tensorflow/contrib/nearest_neighbor/kernels/heap.h
+++ b/tensorflow/contrib/nearest_neighbor/kernels/heap.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HEAP_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HEAP_H_
+#ifndef TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HEAP_H_
+#define TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HEAP_H_
 
 #include <cassert>
 #include <cstdint>
@@ -56,7 +56,7 @@ class HeapBase {
 
   // This method adds an element at the end of the internal array without
   // "heapifying" the array afterwards. This is useful for setting up a heap
-  // where a single call to heapify at the end of the inital insertion
+  // where a single call to heapify at the end of the initial insertion
   // operations suffices.
   void InsertUnsorted(const KeyType& key, const DataType& data) {
     if (v_.size() == static_cast<size_t>(num_elements_)) {
@@ -205,4 +205,4 @@ class AugmentedHeap : public HeapBase<KeyType, DataType> {
 }  // namespace nearest_neighbor
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HEAP_H_
+#endif  // TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HEAP_H_
diff --git a/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc b/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc
index 62ee6630ac613c80a56d4e854cf7af4ae19f6faa..13db6f62f525b6318687e3bf4b6499eee2c61ea8 100644
--- a/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc
+++ b/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc
@@ -45,16 +45,16 @@ class HyperplaneLSHProbesOp : public OpKernel {
     const Tensor& products_tensor = context->input(0);
     OP_REQUIRES(context, products_tensor.dims() == 2,
                 InvalidArgument("Need a two-dimensional products tensor, got ",
-                                products_tensor.dims(), " dimensions."))
+                                products_tensor.dims(), " dimensions."));
 
     const Tensor& num_tables_tensor = context->input(1);
     OP_REQUIRES(context, num_tables_tensor.dims() == 0,
                 InvalidArgument("Need a scalar num_tables tensor, got ",
-                                num_tables_tensor.dims(), " dimensions."))
+                                num_tables_tensor.dims(), " dimensions."));
     int num_tables = num_tables_tensor.scalar<int32>()();
     OP_REQUIRES(context, num_tables >= 1,
                 InvalidArgument("num_tables must be at least 1 but got ",
-                                num_tables, "."))
+                                num_tables, "."));
     OP_REQUIRES(context, num_tables <= 1000,
                 InvalidArgument("Need num_tables <= 1000, got ", num_tables,
                                 ". This is mostly to protect against incorrect "
@@ -66,33 +66,36 @@ class HyperplaneLSHProbesOp : public OpKernel {
                 InvalidArgument("Need a scalar num_hyperplanes_per_table "
                                 "tensor, got ",
                                 num_hyperplanes_per_table_tensor.dims(),
-                                " dimensions."))
+                                " dimensions."));
     int num_hyperplanes_per_table =
         num_hyperplanes_per_table_tensor.scalar<int32>()();
     OP_REQUIRES(context, num_hyperplanes_per_table >= 1,
                 InvalidArgument("num_hyperplanes_per_table must be at least 1 "
-                                "but got ", num_hyperplanes_per_table, "."))
+                                "but got ",
+                                num_hyperplanes_per_table, "."));
     OP_REQUIRES(context, num_hyperplanes_per_table <= 30,
                 InvalidArgument("Need num_hyperplanes_per_table <= 30, got ",
-                                num_hyperplanes_per_table, ". "
+                                num_hyperplanes_per_table,
+                                ". "
                                 "If you need more hyperplanes, change this Op"
                                 " to work for larger integer types (int64)."));
 
     const Tensor& num_probes_tensor = context->input(3);
     OP_REQUIRES(context, num_probes_tensor.dims() == 0,
                 InvalidArgument("Need a scalar num_probes tensor, got ",
-                                num_probes_tensor.dims(), " dimensions."))
+                                num_probes_tensor.dims(), " dimensions."));
     int num_probes = num_probes_tensor.scalar<int32>()();
     OP_REQUIRES(context, num_probes >= 1,
-                InvalidArgument("num_probes must be at least 1."))
+                InvalidArgument("num_probes must be at least 1."));
 
     int expected_num_hyperplanes = num_tables * num_hyperplanes_per_table;
-    OP_REQUIRES(
-        context, products_tensor.dim_size(1) == expected_num_hyperplanes,
-        InvalidArgument("Expected number of hyperplanes is ",
-                        expected_num_hyperplanes, " but received ",
-                        products_tensor.dim_size(1), " inner products per "
-                        "point."));
+    OP_REQUIRES(context,
+                products_tensor.dim_size(1) == expected_num_hyperplanes,
+                InvalidArgument("Expected number of hyperplanes is ",
+                                expected_num_hyperplanes, " but received ",
+                                products_tensor.dim_size(1),
+                                " inner products per "
+                                "point."));
 
     auto products_eigen_tensor = products_tensor.matrix<CoordinateType>();
     ConstMatrixMap products_matrix(products_eigen_tensor.data(),
@@ -115,13 +118,11 @@ class HyperplaneLSHProbesOp : public OpKernel {
     // lschmidt's workstation.
     int64 cost_per_unit = 21 * num_hyperplanes_per_table * num_tables;
     if (num_probes > num_tables) {
-      cost_per_unit += 110 * num_hyperplanes_per_table
-          * (num_probes - num_tables);
+      cost_per_unit +=
+          110 * num_hyperplanes_per_table * (num_probes - num_tables);
     }
     context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
-        batch_size,
-        cost_per_unit,
-        [&](int64 start, int64 end) {
+        batch_size, cost_per_unit, [&](int64 start, int64 end) {
           HyperplaneMultiprobe<CoordinateType, int32> multiprobe(
               num_hyperplanes_per_table, num_tables);
 
diff --git a/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.h b/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.h
index 1670e2f83b3afa10ca76b765bf97cc1c08038fba..c53205e1a4089c8bb5159621662496b798acf242 100644
--- a/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.h
+++ b/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HYPERPLANE_LSH_PROBES_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HYPERPLANE_LSH_PROBES_H_
+#ifndef TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HYPERPLANE_LSH_PROBES_H_
+#define TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HYPERPLANE_LSH_PROBES_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
@@ -232,4 +232,4 @@ class HyperplaneMultiprobe {
 }  // namespace nearest_neighbor
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HYPERPLANE_LSH_PROBES_H_
+#endif  // TENSORFLOW_CONTRIB_NEAREST_NEIGHBOR_KERNELS_HYPERPLANE_LSH_PROBES_H_
diff --git a/tensorflow/contrib/nn/BUILD b/tensorflow/contrib/nn/BUILD
index 56a24ac77f0b9a87b6e4db48cddacdf35f4855d0..5543eb6c6e3785978e9c878f309b9bd0863b0b0a 100644
--- a/tensorflow/contrib/nn/BUILD
+++ b/tensorflow/contrib/nn/BUILD
@@ -17,6 +17,7 @@ py_library(
         "python/ops/__init__.py",
         "python/ops/alpha_dropout.py",
         "python/ops/cross_entropy.py",
+        "python/ops/fwd_gradients.py",
         "python/ops/sampling_ops.py",
         "python/ops/scaled_softplus.py",
     ],
@@ -28,6 +29,7 @@ py_library(
         "//tensorflow/python:embedding_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:function",
+        "//tensorflow/python:gradients",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn",
         "//tensorflow/python:nn_ops",
@@ -55,6 +57,19 @@ py_test(
     ],
 )
 
+py_test(
+    name = "fwd_gradients_test",
+    size = "small",
+    srcs = ["python/ops/fwd_gradients_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":nn_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
 py_test(
     name = "sampling_ops_test",
     size = "small",
diff --git a/tensorflow/contrib/nn/__init__.py b/tensorflow/contrib/nn/__init__.py
index 0bc133a00e619930f1d5fe4c7a8996556b833ddf..96d60e149809aff6fcb7eff77edc23737db177e8 100644
--- a/tensorflow/contrib/nn/__init__.py
+++ b/tensorflow/contrib/nn/__init__.py
@@ -21,6 +21,7 @@
 @@deprecated_flipped_sigmoid_cross_entropy_with_logits
 @@nth_element
 @@rank_sampled_softmax_loss
+@@sampled_sparse_softmax_loss
 @@scaled_softplus
 """
 
diff --git a/tensorflow/contrib/nn/python/ops/alpha_dropout.py b/tensorflow/contrib/nn/python/ops/alpha_dropout.py
index d7b61a584478f701726248a41c4992382189223d..2f92d05ba81f30a91f68f3c3ec51b6695d3d0371 100644
--- a/tensorflow/contrib/nn/python/ops/alpha_dropout.py
+++ b/tensorflow/contrib/nn/python/ops/alpha_dropout.py
@@ -18,7 +18,6 @@ from __future__ import print_function
 
 import numbers
 
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
@@ -26,7 +25,6 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_impl
 
 
 def alpha_dropout(x, keep_prob, noise_shape=None, seed=None, name=None): # pylint: disable=invalid-name
diff --git a/tensorflow/contrib/nn/python/ops/alpha_dropout_test.py b/tensorflow/contrib/nn/python/ops/alpha_dropout_test.py
index 2ff978ab89727c0ba2a8654013466838732377e4..54a98e6f142b7ba58c9418a8ac88269d38944aab 100644
--- a/tensorflow/contrib/nn/python/ops/alpha_dropout_test.py
+++ b/tensorflow/contrib/nn/python/ops/alpha_dropout_test.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 from tensorflow.contrib.nn.python.ops.alpha_dropout import alpha_dropout
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import nn_impl
diff --git a/tensorflow/contrib/nn/python/ops/fwd_gradients.py b/tensorflow/contrib/nn/python/ops/fwd_gradients.py
new file mode 100644
index 0000000000000000000000000000000000000000..922497779b1d6ce426df9d7bb8fb343eea48502b
--- /dev/null
+++ b/tensorflow/contrib/nn/python/ops/fwd_gradients.py
@@ -0,0 +1,76 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Forward-mode derivatives."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops.gradients_impl import gradients
+
+
+def fwd_gradients(ys, xs, grad_xs=None, assert_unused=False):
+  """Computes forward-mode derivatives.
+
+  This is accomplished in pure-python using tensorflow's existing (reverse-mode)
+  gradients. There is additional overhead on graph construction, but runtime
+  performance should be equal to a manual implementation [citation needed].
+
+  See https://j-towns.github.io/2017/06/12/A-new-trick.html and
+  https://github.com/HIPS/autograd/pull/175 for the original discussion of this
+  method, and https://github.com/renmengye/tensorflow-forward-ad for a "direct"
+  implementation.
+
+  Args:
+    ys: A list of tensors.
+    xs: A list of tensors.
+    grad_xs: An optional list of tensors. If provided, must have the same length
+      and shapes compatible with xs.
+    assert_unused: Add assertions that intermediate values are not computed.
+  Returns:
+    A list of tensors of the same shapes as ys. The directional derivatives of
+    ys with respect to xs in the direction grad_xs. Leaving grad_xs unspecified
+    is equivalent to passing in 1s for each x in xs.
+  """
+  # This version of forward-mode autodiff is based on code by Tim Cooijmans
+  # and handles list arguments and certain special cases such as when the
+  # ys doesn't depend on one or more of the xs, and when tf.IndexedSlices are
+  # generated by the first tf.gradients call.
+
+  us = [array_ops.zeros_like(y) + float('nan') for y in ys]
+
+  dydxs = gradients(ys, xs, grad_ys=us)
+
+  # deal with strange types that tf.gradients returns but can't deal with
+  dydxs = [ops.convert_to_tensor(dydx) if isinstance(dydx, ops.IndexedSlices)
+           else dydx for dydx in dydxs]
+
+  if assert_unused:
+    with ops.control_dependencies(dydxs):
+      assert_unused = control_flow_ops.Assert(False, [1], name='fwd_gradients')
+    with ops.control_dependencies([assert_unused]):
+      dydxs = array_ops.identity_n(dydxs)
+
+  dydxs = [array_ops.zeros_like(x) if dydx is None else dydx
+           for x, dydx in zip(xs, dydxs)]
+  for x, dydx in zip(xs, dydxs):
+    dydx.set_shape(x.shape)
+
+  dysdx = gradients(dydxs, us, grad_ys=grad_xs)
+
+  return dysdx
diff --git a/tensorflow/contrib/nn/python/ops/fwd_gradients_test.py b/tensorflow/contrib/nn/python/ops/fwd_gradients_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..56062c3cab32d727dd22a78d1f60c823a2f86a79
--- /dev/null
+++ b/tensorflow/contrib/nn/python/ops/fwd_gradients_test.py
@@ -0,0 +1,52 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for forward_ad.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.nn.python.ops import fwd_gradients
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class ForwardAdTest(test.TestCase):
+
+  def testSquare(self):
+    x = constant_op.constant(1.)
+    y = math_ops.square(x)
+    grad_x = 3.
+
+    dydx_tf = fwd_gradients.fwd_gradients([y], [x], [grad_x])[0]
+    dydx_py = 2. * grad_x
+
+    with self.test_session() as sess:
+      self.assertAllClose(sess.run(dydx_tf), dydx_py, 1e-6)
+
+  def testGather(self):
+    x = constant_op.constant([1., 2., 3.])
+    y = array_ops.gather(x, [0, 1])
+    y.set_shape([2])
+    dydx = fwd_gradients.fwd_gradients([y], [x], assert_unused=True)
+
+    with self.test_session() as sess:
+      sess.run(dydx)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/nn/python/ops/sampling_ops.py b/tensorflow/contrib/nn/python/ops/sampling_ops.py
index 98749cff7ee896436cdc40471929d9a3a8618dba..63fc487dca69a4777821595a0366d0ae0b393ce2 100644
--- a/tensorflow/contrib/nn/python/ops/sampling_ops.py
+++ b/tensorflow/contrib/nn/python/ops/sampling_ops.py
@@ -24,6 +24,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_impl
+from tensorflow.python.ops import nn_ops
 
 
 def _rank_resample(weights, biases, inputs, sampled_values, num_resampled,
@@ -240,3 +242,101 @@ def rank_sampled_softmax_loss(weights,
         remove_accidental_hits=remove_accidental_hits,
         partition_strategy=partition_strategy,
         name=name)
+
+
+def sampled_sparse_softmax_loss(weights,
+                                biases,
+                                labels,
+                                inputs,
+                                num_sampled,
+                                num_classes,
+                                sampled_values=None,
+                                remove_accidental_hits=True,
+                                partition_strategy="mod",
+                                name="sampled_sparse_softmax_loss"):
+  """Computes and returns the sampled sparse softmax training loss.
+
+  This is a faster way to train a softmax classifier over a huge number of
+  classes.
+
+  This operation is for training only.  It is generally an underestimate of
+  the full softmax loss.
+
+  A common use case is to use this method for training, and calculate the full
+  softmax loss for evaluation or inference. In this case, you must set
+  `partition_strategy="div"` for the two losses to be consistent, as in the
+  following example:
+
+  ```python
+  if mode == "train":
+    loss = tf.nn.sampled_sparse_softmax_loss(
+        weights=weights,
+        biases=biases,
+        labels=labels,
+        inputs=inputs,
+        ...,
+        partition_strategy="div")
+  elif mode == "eval":
+    logits = tf.matmul(inputs, tf.transpose(weights))
+    logits = tf.nn.bias_add(logits, biases)
+    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
+        labels=tf.squeeze(labels),
+        logits=logits)
+  ```
+
+  See our [Candidate Sampling Algorithms Reference]
+  (https://www.tensorflow.org/extras/candidate_sampling.pdf)
+
+  Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007)
+  ([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math.
+
+  Args:
+    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
+        objects whose concatenation along dimension 0 has shape
+        [num_classes, dim].  The (possibly-sharded) class embeddings.
+    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
+    labels: A `Tensor` of type `int64` and shape `[batch_size, 1]`.
+        The index of the single target class for each row of logits.  Note that
+        this format differs from the `labels` argument of
+        `nn.sparse_softmax_cross_entropy_with_logits`.
+    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
+        activations of the input network.
+    num_sampled: An `int`.  The number of classes to randomly sample per batch.
+    num_classes: An `int`. The number of possible classes.
+    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
+        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
+        (if None, we default to `log_uniform_candidate_sampler`)
+    remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
+        where a sampled class equals one of the target classes.  Default is
+        True.
+    partition_strategy: A string specifying the partitioning strategy, relevant
+        if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
+        Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `batch_size` 1-D tensor of per-example sampled softmax losses.
+
+  """
+  logits, _ = nn_impl._compute_sampled_logits(
+      weights=weights,
+      biases=biases,
+      labels=labels,
+      inputs=inputs,
+      num_sampled=num_sampled,
+      num_classes=num_classes,
+      num_true=1,
+      sampled_values=sampled_values,
+      subtract_log_q=True,
+      remove_accidental_hits=remove_accidental_hits,
+      partition_strategy=partition_strategy,
+      name=name)
+
+  # There is only one true label. _compute_sampled_logits puts the true logit
+  # at index 0.
+  labels = array_ops.zeros([array_ops.shape(logits)[0], 1], dtype=dtypes.int64)
+
+  sampled_losses = nn_ops.sparse_softmax_cross_entropy_with_logits(
+      labels=array_ops.squeeze(labels), logits=logits)
+  # sampled_losses is a [batch_size] tensor.
+  return sampled_losses
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index a9a63cbce0de807059b4756c4f9057081721b15a..827279bd476f9666a972f43ad557fde6d0b6c59a 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -16,8 +16,10 @@ py_library(
         "__init__.py",
         "python/training/addsign.py",
         "python/training/drop_stale_gradient_optimizer.py",
+        "python/training/elastic_average_optimizer.py",
         "python/training/external_optimizer.py",
         "python/training/lazy_adam_optimizer.py",
+        "python/training/model_average_optimizer.py",
         "python/training/moving_average_optimizer.py",
         "python/training/multitask_optimizer_wrapper.py",
         "python/training/nadam_optimizer.py",
@@ -80,22 +82,22 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "variable_clipping_optimizer_test",
     srcs = ["python/training/variable_clipping_optimizer_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "manual",  # Flaky: b/29892493
-        "notap",  # data race due to b/62910646
-    ],
-    deps = [
+    additional_deps = [
         ":opt_py",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:session",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
-        "//third_party/py/numpy",
+    ],
+    grpc_enabled = True,
+    tags = [
+        "manual",  # Flaky: b/29892493
+        "notap",  # data race due to b/62910646
     ],
 )
 
@@ -168,11 +170,51 @@ tf_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
+    grpc_enabled = True,
     tags = [
         "no_oss",  # Flaky due to port collisions
     ],
 )
 
+tf_py_test(
+    name = "elastic_average_optimizer_test",
+    srcs = ["python/training/elastic_average_optimizer_test.py"],
+    additional_deps = [
+        ":opt_py",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:variables",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:training",
+        "//tensorflow/python:ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//third_party/py/numpy",
+    ],
+)
+
+tf_py_test(
+    name = "model_average_optimizer_test",
+    srcs = ["python/training/model_average_optimizer_test.py"],
+    additional_deps = [
+        ":opt_py",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:variables",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:training",
+        "//tensorflow/python:ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//third_party/py/numpy",
+    ],
+    tags = [
+        "notap",  # This test launches local server.
+    ],
+)
+
 py_test(
     name = "sign_decay_test",
     srcs = ["python/training/sign_decay_test.py"],
diff --git a/tensorflow/contrib/opt/__init__.py b/tensorflow/contrib/opt/__init__.py
index 3275ad8239aa91660f88b7bc149fb915b4fad9fe..6c1bb1adc096f5b8e6945ea1492727d16cf29e65 100644
--- a/tensorflow/contrib/opt/__init__.py
+++ b/tensorflow/contrib/opt/__init__.py
@@ -28,6 +28,8 @@ from tensorflow.contrib.opt.python.training.multitask_optimizer_wrapper import *
 from tensorflow.contrib.opt.python.training.nadam_optimizer import *
 from tensorflow.contrib.opt.python.training.powersign import *
 from tensorflow.contrib.opt.python.training.variable_clipping_optimizer import *
+from tensorflow.contrib.opt.python.training.elastic_average_optimizer import *
+from tensorflow.contrib.opt.python.training.model_average_optimizer import *
 # pylint: enable=wildcard-import
 
 from tensorflow.python.util.all_util import remove_undocumented
@@ -35,7 +37,7 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
     'PowerSignOptimizer',
-    'AddSignOptimizer'
+    'AddSignOptimizer',
     'DelayCompensatedGradientDescentOptimizer',
     'DropStaleGradientOptimizer',
     'ExternalOptimizerInterface',
@@ -46,6 +48,10 @@ _allowed_symbols = [
     'VariableClippingOptimizer',
     'MultitaskOptimizerWrapper',
     'clip_gradients_by_global_norm',
+    'ElasticAverageOptimizer',
+    'ElasticAverageCustomGetter',
+    'ModelAverageOptimizer',
+    'ModelAverageCustomGetter'
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/opt/python/training/addsign.py b/tensorflow/contrib/opt/python/training/addsign.py
index 729e59cb0aab97e6cd657571647fc45a44ae0ab1..22da4453e205c9111056d6afd1ddb08e093653aa 100644
--- a/tensorflow/contrib/opt/python/training/addsign.py
+++ b/tensorflow/contrib/opt/python/training/addsign.py
@@ -30,8 +30,8 @@ from tensorflow.python.training import training_ops
 class AddSignOptimizer(optimizer.Optimizer):
   """Optimizer that implements the AddSign update.
 
-  See  Neural Optimizer Search with Reinforcement Learning
-  [Bello et al., ICML2017].
+  See [Bello et al., ICML2017],
+  [Neural Optimizer Search with RL](https://arxiv.org/abs/1709.07417).
   """
 
   def __init__(self,
@@ -45,6 +45,7 @@ class AddSignOptimizer(optimizer.Optimizer):
 
     Initialization:
 
+    ```
     m_0 <- 0 (Initialize initial 1st moment vector)
     t <- 0 (Initialize timestep)
     ```
@@ -54,7 +55,7 @@ class AddSignOptimizer(optimizer.Optimizer):
     ```
     t <- t + 1
     m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-    sign_decay <- sign_decay(t)
+    sign_decay <- sign_decay_fn(t)
     update <- (alpha + sign_decay * sign(g) *sign(m)) * g
     variable <- variable - lr_t * update
     ```
@@ -70,11 +71,9 @@ class AddSignOptimizer(optimizer.Optimizer):
       learning_rate: learning_rate used when taking a step.
       alpha: alpha used in optimizer.
       beta: decay used for computing the moving average m.
-      sign_decay_fn: decay function applied to the sign(g*m) quantity.
-          Takes global_step as an argument and returns the quantity to multiply
-          the sign(g*m) by.
-        compute (1.0 + alpha * decay * sign(g) * sign(m)) * m.
-      use_locking: If True use locks for update operations.
+      sign_decay_fn: decay function applied to the sign(g) sign(m) quantity.
+          Takes global_step as an argument. See sign_decay.py for some examples.
+      use_locking: If True, use locks for update operations.
       name: Optional name for the operations created when applying gradients.
         Defaults to "AddSignOptimizer".
     """
diff --git a/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer.py b/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer.py
index f20c172ee376d0a808a21fe96bec80367bf2e9f4..4a905b1b2a0c3b7c4002451f37102eb2abdc5a2b 100644
--- a/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer.py
@@ -78,10 +78,11 @@ class DropStaleGradientOptimizer(optimizer.Optimizer):
   def apply_gradients(self, grads_and_vars, global_step=None, name=None):
     gradients = []
     # Number of stale gradients.
-    stale_counter = variable_scope.get_variable(
-        "stale_counter", [],
-        initializer=init_ops.zeros_initializer(),
-        trainable=False)
+    with ops.colocate_with(global_step):
+      stale_counter = variable_scope.get_variable(
+          "stale_counter", [],
+          initializer=init_ops.zeros_initializer(),
+          trainable=False)
 
     def _AcceptGradientOp():
       with ops.control_dependencies(
diff --git a/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..5763593b81497f5d6945ff1e5d000042d295c093
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
@@ -0,0 +1,353 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Wrapper optimizer for Elastic Average SGD """
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+
+from tensorflow.python.ops import gen_nn_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import constant_op
+
+LOCAL_VARIABLE_NAME = 'local_center_variable'
+GLOBAL_VARIABLE_NAME = 'global_center_variable'
+
+
+class ElasticAverageCustomGetter(object):
+  """Custom_getter class is used to do:
+  1. Change trainable variables to local collection and place them at worker
+    device
+  2. Generate global variables(global center variables)
+  3. Generate local variables(local center variables) which record the global
+    variables and place them at worker device
+    Notice that the class should be used with tf.replica_device_setter,
+    so that the global center variables and global step variable can be placed
+    at ps device. Besides, use 'tf.get_variable' instead of 'tf.Variable' to
+    use this custom getter.
+
+  For example,
+  ea_custom_getter = ElasticAverageCustomGetter(worker_device)
+  with tf.device(
+    tf.train.replica_device_setter(
+      worker_device=worker_device,
+      ps_device="/job:ps/cpu:0",
+      cluster=cluster)),
+    tf.variable_scope('',custom_getter=ea_custom_getter):
+    hid_w = tf.get_variable(
+      initializer=tf.truncated_normal(
+          [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
+          stddev=1.0 / IMAGE_PIXELS),
+      name="hid_w")
+    hid_b = tf.get_variable(initializer=tf.zeros([FLAGS.hidden_units]),
+                            name="hid_b")
+  """
+
+  def __init__(self, worker_device):
+    """Create a new `ElasticAverageCustomGetter`.
+
+    Args:
+      worker_device: String.  Name of the `worker` job.
+    """
+    self._worker_device = worker_device
+    self._local_map = {}
+    self._global_map = {}
+
+  def __call__(self, getter, name, trainable, collections, *args, **kwargs):
+    if trainable:
+      with ops.device(self._worker_device):
+        local_var = getter(
+            name,
+            trainable=True,
+            collections=[ops.GraphKeys.LOCAL_VARIABLES],
+            *args,
+            **kwargs)
+      global_center_variable = variable_scope.variable(
+          name='%s/%s' % (GLOBAL_VARIABLE_NAME, name),
+          initial_value=local_var.initialized_value(),
+          trainable=False,
+          collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+
+      with ops.device(self._worker_device):
+        local_center_variable = variable_scope.variable(
+            name='%s/%s' % (LOCAL_VARIABLE_NAME, name),
+            initial_value=local_var.initialized_value(),
+            trainable=False,
+            collections=[ops.GraphKeys.LOCAL_VARIABLES])
+
+      self._local_map[local_var] = local_center_variable
+      self._global_map[local_var] = global_center_variable
+      return local_var
+    else:
+      return getter(name, trainable, collections, *args, **kwargs)
+
+
+class ElasticAverageOptimizer(optimizer.Optimizer):
+  """Wrapper optimizer that implements the Elastic Average SGD algorithm.
+  This is an async optimizer. During the training, Each worker will update
+  the local variables and maintains its own local_step, which starts from 0
+  and is incremented by 1 after each update of local variables. Whenever
+  the communication period divides the local step, the worker requests
+  the current global center variables and then computed the elastic difference
+  between global center variables and local variables. The elastic difference
+  then be used to update both local variables and global variables.
+  """
+
+  # Default value as paper described
+  BETA = 0.9
+
+  def __init__(self,
+               opt,
+               num_worker,
+               ea_custom_getter,
+               communication_period=10,
+               moving_rate=None,
+               rho=None,
+               use_locking=True,
+               name='ElasticAverageOptimizer'):
+    """Construct a new gradient descent optimizer.
+
+    Args:
+      opt: The actual optimizer that will be used to update local variables.
+        Must be one of the Optimizer classes.
+      num_worker: The number of workers
+      ea_custom_getter: The ElasticAverageCustomGetter
+      communication_period: An int point value to controls the frequency
+        of the communication between every worker and the ps.
+      moving_rate: A floating point value to control the elastic difference.
+      rho: the amount of exploration we allow ine the model. The default
+        value is moving_rate/learning_rate
+      use_locking: If True use locks for update operations.
+      name: Optional name prefix for the operations created when applying
+        gradients. Defaults to "ElasticAverageOptimizer".
+    """
+    super(ElasticAverageOptimizer, self).__init__(use_locking, name)
+    self._opt = opt
+    self._num_worker = num_worker
+    self._period = communication_period
+    self._local_map = ea_custom_getter._local_map
+    self._global_map = ea_custom_getter._global_map
+
+    if moving_rate is None:
+      self._moving_rate = self.BETA / communication_period / num_worker
+    else:
+      self._moving_rate = moving_rate
+    if rho is None:
+      self._rho = self._moving_rate / self._opt._learning_rate
+    else:
+      self._rho = rho
+
+    self._local_step = variable_scope.get_variable(
+        initializer=0,
+        trainable=False,
+        collections=[ops.GraphKeys.LOCAL_VARIABLES],
+        name='local_step')
+    self._opt._prepare()
+
+  def compute_gradients(self,
+                        loss,
+                        var_list=None,
+                        gate_gradients=optimizer.Optimizer.GATE_OP,
+                        aggregation_method=None,
+                        colocate_gradients_with_ops=False,
+                        grad_loss=None):
+    """Compute gradients of `loss` for the variables in `var_list`.
+
+    Add rho*elastic_difference to loss to control the exploration
+    This is the first part of `minimize()`.  It returns a list
+    of (gradient, variable) pairs where "gradient" is the gradient
+    for "variable".  Note that "gradient" can be a `Tensor`, an
+    `IndexedSlices`, or `None` if there is no gradient for the
+    given variable.
+
+    Args:
+      loss: A Tensor containing the value to minimize.
+      var_list: Optional list or tuple of `tf.Variable` to update to minimize
+        `loss`.  Defaults to the list of variables collected in the graph
+        under the key `GraphKey.TRAINABLE_VARIABLES`.
+      gate_gradients: How to gate the computation of gradients.  Can be
+        `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`.
+      aggregation_method: Specifies the method used to combine gradient terms.
+        Valid values are defined in the class `AggregationMethod`.
+      colocate_gradients_with_ops: If True, try colocating gradients with
+        the corresponding op.
+      grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
+
+    Returns:
+      A list of (gradient, variable) pairs. Variable is always present, but
+      gradient can be `None`.
+
+    Raises:
+      TypeError: If `var_list` contains anything else than `Variable` objects.
+      ValueError: If some arguments are invalid.
+    """
+    if not var_list:
+      var_list = variables.trainable_variables()
+
+    elastic_difference = [
+        math_ops.subtract(v, lv)
+        for v, lv in zip(variables.trainable_variables(),
+                         [self._local_map[var] for var in var_list])
+    ]
+
+    distance_loss = self._rho * math_ops.add_n(
+        [gen_nn_ops.l2_loss(ed) for ed in elastic_difference])
+
+    total_loss = loss + distance_loss
+    return self._opt.compute_gradients(total_loss, var_list, gate_gradients,
+                                       aggregation_method,
+                                       colocate_gradients_with_ops, grad_loss)
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """Apply gradients to global variables.
+
+    This is the second part of `minimize()`. It returns an `Operation` that
+    applies gradients.
+
+    Args:
+      grads_and_vars: List of (gradient, variable) pairs as returned by
+        `compute_gradients()`.
+      global_step: Optional `Variable` to increment by one after the
+        variables have been updated.
+      name: Optional name for the returned operation.  Default to the
+        name passed to the `Optimizer` constructor.
+
+    Returns:
+      An `Operation` that applies the specified gradients. If `global_step`
+      was not None, that operation also increments `global_step`.
+
+    Raises:
+      TypeError: If `grads_and_vars` is malformed.
+      ValueError: If none of the variables have gradients.
+    """
+    apply_updates = self._opt.apply_gradients(grads_and_vars)
+    with ops.control_dependencies([apply_updates]):
+      local_update = state_ops.assign_add(
+          self._local_step, 1, name='local_step_update').op
+
+    # update global variables.
+    def _Update_global_variables():
+      local_vars = [v for g, v in grads_and_vars if g is not None]
+      global_center_vars = [self._global_map[var] for var in local_vars]
+      local_center_vars = [self._local_map[var] for var in local_vars]
+      local_center_vars_update = []
+      for lvar, var in zip(local_center_vars, global_center_vars):
+        local_center_vars_update.append(lvar.assign(var))
+      update_ops = []
+      differences = []
+      with ops.control_dependencies(local_center_vars_update):
+        for v, lv in zip(local_vars, local_center_vars):
+          with ops.device(v.device):
+            differences.append(math_ops.subtract(v, lv))
+        for lvar, diff in zip(local_vars, differences):
+          with ops.device(lvar.device):
+            update_ops.append(
+                state_ops.assign_sub(lvar,
+                                     math_ops.multiply(self._moving_rate,
+                                                       diff)))
+        for var, diff in zip(global_center_vars, differences):
+          with ops.device(var.device):
+            update_ops.append(
+                state_ops.assign_add(var,
+                                     math_ops.multiply(self._moving_rate,
+                                                       diff)))
+        if global_step:
+          with ops.colocate_with(global_step):
+            update_ops.append(state_ops.assign_add(global_step, 1))
+      variable_update = control_flow_ops.group(*(update_ops))
+      return variable_update
+
+    with ops.control_dependencies([local_update]):
+      condition = math_ops.equal(
+          math_ops.mod(self._local_step, self._period), 0)
+      conditional_update = control_flow_ops.cond(
+          condition, _Update_global_variables, control_flow_ops.no_op)
+    return conditional_update
+
+  def get_init_op(self, task_index):
+    """Returns the op to let all the local variables and local center
+    variables equal to the global center variables before the training begins"""
+
+    def _Add_sync_queues_and_barrier(enqueue_after_list):
+      """Adds ops to enqueu on all worker queues"""
+      sync_queues = [
+          data_flow_ops.FIFOQueue(
+              self._num_worker, [dtypes.bool],
+              shapes=[[]],
+              shared_name='%s%s' % ('variable_init_sync_queue', i))
+          for i in range(self._num_worker)
+      ]
+      queue_ops = []
+      # For each other worker, add an entry in a queue
+      token = constant_op.constant(False)
+      with ops.control_dependencies(enqueue_after_list):
+        for i, q in enumerate(sync_queues):
+          if i == task_index:
+            queue_ops.append(control_flow_ops.no_op())
+          else:
+            queue_ops.append(q.enqueue(token))
+      queue_ops.append(
+          sync_queues[task_index].dequeue_many(len(sync_queues) - 1))
+      return control_flow_ops.group(*queue_ops)
+
+    init_ops = []
+    local_vars = variables.trainable_variables()
+    global_center_vars = [self._global_map[var] for var in local_vars]
+    local_center_vars = [self._local_map[var] for var in local_vars]
+    if not (local_vars and global_center_vars and local_center_vars):
+      raise ValueError('The lists of local_variables, global_center_variables, '
+                       'local_center_variables should not be empty  ')
+    for lvar, gc_var, lc_var in zip(local_vars, global_center_vars,
+                                    local_center_vars):
+      init_ops.append(state_ops.assign(lvar, gc_var))
+      init_ops.append(state_ops.assign(lc_var, gc_var))
+
+    init_op = control_flow_ops.group(*(init_ops))
+    sync_queue_op = _Add_sync_queues_and_barrier([init_op])
+    return sync_queue_op
+
+  def make_session_run_hook(self, is_chief, task_index):
+    """Creates a hook to handle ElasticAverageOptimizerHook ops such as initialization."""
+    return _ElasticAverageOptimizerHook(self, is_chief, task_index)
+
+
+class _ElasticAverageOptimizerHook(session_run_hook.SessionRunHook):
+
+  def __init__(self, ea_optimizer, is_chief, task_index):
+    """Creates hook to handle ElasticAverageOptimizer initialization ops.
+
+    Args:
+      ea_optimizer: `ElasticAverageOptimizer` which this hook will initialize.
+      is_chief: `Bool`, whether is this a chief replica or not.
+    """
+    self._ea_optimizer = ea_optimizer
+    self._is_chief = is_chief
+    self._task_index = task_index
+
+  def begin(self):
+    self._local_init_op = variables.local_variables_initializer()
+    self._global_init_op = None
+    if self._is_chief:
+      self._global_init_op = variables.global_variables_initializer()
+    self._variable_init_op = self._ea_optimizer.get_init_op(self._task_index)
diff --git a/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..37539b959959b5cf1f7b2c8e8d2b6b05191565ad
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py
@@ -0,0 +1,220 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ElasticAverageOptimizer."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import portpicker
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import server_lib
+from tensorflow.python.training import training
+from tensorflow.python.training import training_util
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import device_setter
+
+from tensorflow.contrib.opt.python.training.elastic_average_optimizer import \
+  ElasticAverageOptimizer, ElasticAverageCustomGetter, GLOBAL_VARIABLE_NAME
+
+
+def create_local_cluster(num_workers, num_ps, protocol="grpc"):
+  """Create local GRPC servers and return them."""
+  worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
+  ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)]
+  cluster_dict = {
+      "worker": ["localhost:%s" % port for port in worker_ports],
+      "ps": ["localhost:%s" % port for port in ps_ports]
+  }
+  cs = server_lib.ClusterSpec(cluster_dict)
+
+  workers = [
+      server_lib.Server(
+          cs, job_name="worker", protocol=protocol, task_index=ix, start=True)
+      for ix in range(num_workers)
+  ]
+  ps_servers = [
+      server_lib.Server(
+          cs, job_name="ps", protocol=protocol, task_index=ix, start=True)
+      for ix in range(num_ps)
+  ]
+
+  return cluster_dict, workers, ps_servers
+
+
+# Creates the workers and return their sessions, graphs, train_ops.
+# Cheif worker will update at last
+def _get_workers(num_workers, period, workers, moving_rate):
+  sessions = []
+  graphs = []
+  train_ops = []
+  for worker_id in range(num_workers):
+    graph = ops.Graph()
+    is_chief = (worker_id == 0)
+    with graph.as_default():
+      worker_device = "/job:worker/task:%d/cpu:0" % (worker_id)
+      ea_coustom = ElasticAverageCustomGetter(worker_device=worker_device)
+      with variable_scope.variable_scope(
+          "", custom_getter=ea_coustom), ops.device(
+              device_setter.replica_device_setter(
+                  worker_device=worker_device,
+                  ps_device="/job:ps/task:0/cpu:0",
+                  ps_tasks=1)):
+        global_step = variables.Variable(0, name="global_step", trainable=False)
+        var_0 = variable_scope.get_variable(initializer=0.0, name="v0")
+        var_1 = variable_scope.get_variable(initializer=1.0, name="v1")
+
+      with ops.device("/job:worker/task:" + str(worker_id)):
+        grads_0 = constant_op.constant(-1.0)
+        grads_1 = constant_op.constant(-1.0)
+
+        sgd_opt = gradient_descent.GradientDescentOptimizer(1.0)
+        opt = ElasticAverageOptimizer(
+            opt=sgd_opt,
+            num_worker=num_workers,
+            moving_rate=moving_rate,
+            communication_period=period,
+            ea_custom_getter=ea_coustom)
+        train_op = [
+            opt.apply_gradients(([grads_0, var_0], [grads_1, var_1]),
+                                global_step)
+        ]
+        easgd_hook = opt.make_session_run_hook(is_chief, worker_id)
+      # Creates MonitoredSession
+      sess = training.MonitoredTrainingSession(
+          workers[worker_id].target, hooks=[easgd_hook])
+
+    sessions.append(sess)
+    graphs.append(graph)
+    train_ops.append(train_op)
+
+  return sessions, graphs, train_ops
+
+
+class ElasticAverageOptimizerTest(test.TestCase):
+
+  def _run(self, train_op, sess):
+    sess.run(train_op)
+
+  def test1Workers2Period(self):
+    num_workers = 1
+    communication_period = 2
+    num_ps = 1
+    cluster, workers, _ = create_local_cluster(
+        num_workers=num_workers, num_ps=num_ps)
+
+    sessions, graphs, train_ops = _get_workers(
+        num_workers, communication_period, workers, 1.0)
+
+    var_0 = graphs[0].get_tensor_by_name("v0:0")
+    var_1 = graphs[0].get_tensor_by_name("v1:0")
+    global_step = training_util.get_global_step(graphs[0])
+    var_0_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v0:0")
+    var_1_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v1:0")
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(var_0_g))
+    self.assertAllEqual(1.0, sessions[0].run(var_1_g))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    sessions[0].run(train_ops[0])
+
+    self.assertAllEqual(1.0, sessions[0].run(var_0))
+    self.assertAllEqual(2.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(var_0_g))
+    self.assertAllEqual(1.0, sessions[0].run(var_1_g))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    # iteration 2, global variable update
+    sessions[0].run(train_ops[0])
+
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(2.0, sessions[0].run(var_0_g))
+    self.assertAllEqual(3.0, sessions[0].run(var_1_g))
+    self.assertAllEqual(1, sessions[0].run(global_step))
+
+    # iteration 3
+    sessions[0].run(train_ops[0])
+
+    self.assertAllEqual(1.0, sessions[0].run(var_0))
+    self.assertAllEqual(2.0, sessions[0].run(var_1))
+    self.assertAllEqual(2.0, sessions[0].run(var_0_g))
+    self.assertAllEqual(3.0, sessions[0].run(var_1_g))
+    self.assertAllEqual(1, sessions[0].run(global_step))
+
+  def test2Worker1Period(self):
+    num_workers = 2
+    communication_period = 1
+    num_ps = 2
+    cluster, workers, _ = create_local_cluster(
+        num_workers=num_workers, num_ps=num_ps)
+
+    sessions, graphs, train_ops = _get_workers(
+        num_workers, communication_period, workers, 0.5)
+
+    var_0 = graphs[0].get_tensor_by_name("v0:0")
+    var_1 = graphs[0].get_tensor_by_name("v1:0")
+
+    var_0_1 = graphs[1].get_tensor_by_name("v0:0")
+    var_1_1 = graphs[1].get_tensor_by_name("v1:0")
+
+    var_0_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v0:0")
+    var_1_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v1:0")
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[1].run(var_0_1))
+    self.assertAllEqual(1.0, sessions[1].run(var_1_1))
+    self.assertAllEqual(0.0, sessions[0].run(var_0_g))
+    self.assertAllEqual(1.0, sessions[0].run(var_1_g))
+
+    sessions[0].run(train_ops[0])
+    sessions[1].run(train_ops[1])
+
+    self.assertAllEqual(0.5, sessions[0].run(var_0))
+    self.assertAllEqual(1.5, sessions[0].run(var_1))
+    self.assertAllEqual(0.75, sessions[0].run(var_0_g))
+    self.assertAllEqual(1.75, sessions[0].run(var_1_g))
+    self.assertAllEqual(0.75, sessions[1].run(var_0_1))
+    self.assertAllEqual(1.75, sessions[1].run(var_1_1))
+
+  def testPS2TasksWithClusterSpecClass(self):
+    cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+    ea_coustom = ElasticAverageCustomGetter(worker_device="/job:worker/task:0")
+    from tensorflow.python.training import device_setter
+    with ops.device(
+        device_setter.replica_device_setter(cluster=cluster_spec,
+                                            worker_device="/job:worker/task:0",
+                                            ps_device="/job:ps")), \
+         variable_scope.variable_scope("", custom_getter=ea_coustom):
+      v = variable_scope.get_variable(initializer=[1, 2], name="v")
+      w = variable_scope.get_variable(initializer=[2, 1], name="w")
+      v_g, w_g = ea_coustom._global_map[v], ea_coustom._global_map[w]
+      self.assertDeviceEqual("/job:worker/task:0", v.device)
+      self.assertDeviceEqual("job:ps/task:0", v_g.device)
+      self.assertDeviceEqual("/job:worker/task:0", w.device)
+      self.assertDeviceEqual("job:ps/task:1", w_g.device)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/opt/python/training/external_optimizer.py b/tensorflow/contrib/opt/python/training/external_optimizer.py
index f243317f1df2ec8d93d44ad534f3fa58527f3217..82ebca7f20306e5658c8321716e39f9c7f8b8970 100644
--- a/tensorflow/contrib/opt/python/training/external_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/external_optimizer.py
@@ -397,10 +397,6 @@ class ScipyOptimizerInterface(ExternalOptimizerInterface):
             'automatically and cannot be injected manually'.format(kwarg))
 
     minimize_kwargs.update(optimizer_kwargs)
-    if method == 'SLSQP':
-      # SLSQP doesn't support step callbacks. Obviate associated warning
-      # message.
-      del minimize_kwargs['callback']
 
     import scipy.optimize  # pylint: disable=g-import-not-at-top
     result = scipy.optimize.minimize(*minimize_args, **minimize_kwargs)
diff --git a/tensorflow/contrib/opt/python/training/external_optimizer_test.py b/tensorflow/contrib/opt/python/training/external_optimizer_test.py
index 0f597d0a246a53892d72939edd1499a86c01017d..953586ee70cd4137295dd254bfb2d37cab0bcfe4 100644
--- a/tensorflow/contrib/opt/python/training/external_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/external_optimizer_test.py
@@ -299,6 +299,45 @@ class ScipyOptimizerInterfaceTest(TestCase):
       method = optimizer.optimizer_kwargs.get('method')
       self.assertEqual('SLSQP', method)
 
+  def test_callbacks(self):
+    vector_val = np.array([7., -2.], dtype=np.float32)
+    vector = variables.Variable(vector_val, 'vector')
+
+    minimum_location_val = np.arange(2)
+    minimum_location = constant_op.constant(
+        minimum_location_val, dtype=dtypes.float32)
+
+    loss = math_ops.reduce_sum(math_ops.square(vector - minimum_location)) / 2.
+    loss_val_first = ((vector_val - minimum_location_val)**2).sum() / 2.
+
+    optimizer = external_optimizer.ScipyOptimizerInterface(loss, method='SLSQP')
+
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+
+      initial_vector_val = sess.run(vector)
+
+      extra_fetches = [loss]
+
+      step_callback = test.mock.Mock()
+      loss_callback = test.mock.Mock()
+
+      optimizer.minimize(
+          sess,
+          fetches=extra_fetches,
+          loss_callback=loss_callback,
+          step_callback=step_callback)
+
+      loss_val_last = sess.run(loss)
+
+      call_first = test.mock.call(loss_val_first)
+      call_last = test.mock.call(loss_val_last)
+      loss_calls = [call_first, call_last]
+      loss_callback.assert_has_calls(loss_calls, any_order=True)
+
+      args, _ = step_callback.call_args
+      self.assertAllClose(minimum_location_val, args[0])
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py b/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py
index 4c3fec067287e8edefcc4e36ca9fa91f5657013b..aeca900bc8ff4c4cc26da490ce43dfec70fd9f11 100644
--- a/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py
@@ -47,8 +47,9 @@ class LazyAdamOptimizer(adam.AdamOptimizer):
   """
 
   def _apply_sparse(self, grad, var):
-    beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
-    beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
+    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
     lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
     beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
     beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
diff --git a/tensorflow/contrib/opt/python/training/model_average_optimizer.py b/tensorflow/contrib/opt/python/training/model_average_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7c97a1da2baf29914337094c6153447c997af08
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/model_average_optimizer.py
@@ -0,0 +1,308 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Wrapper optimizer for Model Average."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import session_run_hook
+
+GLOBAL_VARIABLE_NAME = "global_center_variable"
+
+
+class ModelAverageCustomGetter(object):
+  """Custom_getter class is used to do.
+
+  1. Change trainable variables to local collection and place them at worker
+    device
+  2. Generate global variables
+    Notice that the class should be used with tf.replica_device_setter,
+    so that the global center variables and global step variable can be placed
+    at ps device. Besides, use 'tf.get_variable' instead of 'tf.Variable' to
+    use this custom getter.
+
+  For example,
+  ma_custom_getter = ModelAverageCustomGetter(worker_device)
+  with tf.device(
+    tf.train.replica_device_setter(
+      worker_device=worker_device,
+      ps_device="/job:ps/cpu:0",
+      cluster=cluster)),
+    tf.variable_scope('',custom_getter=ma_custom_getter):
+    hid_w = tf.get_variable(
+      initializer=tf.truncated_normal(
+          [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
+          stddev=1.0 / IMAGE_PIXELS),
+      name="hid_w")
+    hid_b = tf.get_variable(initializer=tf.zeros([FLAGS.hidden_units]),
+                            name="hid_b")
+  """
+
+  def __init__(self, worker_device):
+    """Create a new `ElasticAverageCustomGetter`.
+
+    Args:
+      worker_device: String.  Name of the `worker` job.
+    """
+    self._worker_device = worker_device
+    self._local_2_global = {}
+
+  def __call__(self, getter, name, trainable, collections, *args, **kwargs):
+    if trainable:
+      with ops.device(self._worker_device):
+        local_var = getter(
+            name,
+            trainable=True,
+            collections=[ops.GraphKeys.LOCAL_VARIABLES],
+            *args,
+            **kwargs)
+
+      global_variable = variable_scope.variable(
+          name="%s/%s" % (GLOBAL_VARIABLE_NAME, name),
+          initial_value=local_var.initialized_value(),
+          trainable=False,
+          collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+
+      self._local_2_global[local_var] = global_variable
+      return local_var
+    else:
+      return getter(name, trainable, collections, *args, **kwargs)
+
+
+class ModelAverageOptimizer(optimizer.Optimizer):
+  """Wrapper optimizer that implements the Model Average algorithm.
+
+  This is a sync optimizer. During the training, each worker will update
+  the local variables and maintains its own local_step, which starts from 0
+  and is incremented by 1 after each update of local variables. Whenever the
+  interval_steps divides the local step, the local variables from all the
+  workers will be averaged and assigned to global center variables. Then the
+  local variables will be assigned by global center variables.
+  """
+
+  def __init__(self,
+               opt,
+               num_worker,
+               is_chief,
+               ma_custom_getter,
+               interval_steps=100,
+               use_locking=True,
+               name="ModelAverageOptimizer"):
+    """Construct a new model average optimizer.
+
+    Args:
+      opt: The actual optimizer that will be used to update local variables
+      num_worker: The number of workers
+      is_chief: whether chief worker
+      ma_custom_getter: ModelAverageCustomGetter
+      interval_steps: An int point value to controls the frequency of the
+        average of local variables
+      use_locking: If True use locks for update operations
+      name: string. Optional name of the returned operation
+    """
+    super(ModelAverageOptimizer, self).__init__(use_locking, name)
+    self._opt = opt
+    self._num_worker = num_worker
+    self._is_chief = is_chief
+    self._local_2_global = ma_custom_getter._local_2_global  # pylint:disable=protected-access
+    self._interval_steps = interval_steps
+    self._accumulator_list = []
+    self._chief_init_op = None
+
+    self._local_step = variable_scope.get_variable(
+        initializer=0,
+        trainable=False,
+        collections=[ops.GraphKeys.LOCAL_VARIABLES],
+        name="local_step")
+
+    self._opt._prepare()  # pylint:disable=protected-access
+
+  def compute_gradients(self, *args, **kwargs):
+    """Compute gradients of "loss" for the variables in "var_list".
+
+    This simply wraps the compute_gradients() from the real optimizer.
+
+    Args:
+      *args: Arguments for compute_gradients().
+      **kwargs: Keyword arguments for compute_gradients().
+
+    Returns:
+      A list of (gradient, variable) pairs.
+    """
+    return self._opt.compute_gradients(*args, **kwargs)
+
+  def _local_vars_update(self, var_list):
+    """Get the update ops for the local variables in "var_list".
+
+    Args:
+      var_list: Optional list or tuple of 'tf.Variable' to update
+
+    Returns:
+      An update op
+
+    Raises:
+      ValueError: if var_list is empty.
+    """
+    if not var_list:
+      raise ValueError("The list of local_variables should not be empty")
+    update_ops = []
+    global_center_vars = [self._local_2_global[var] for var in var_list]
+    for lvar, gvar in zip(var_list, global_center_vars):
+      with ops.device(lvar.device):
+        update_ops.append(state_ops.assign(lvar, gvar.read_value()))
+    return control_flow_ops.group(*(update_ops))
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """Apply gradients to variables.
+
+    This contains most of the synchronization implementation and also wraps the
+    apply_gradients() from the real optimizer. The chief work updates global
+    variables.
+
+    Args:
+      grads_and_vars: List of (gradient, variable) pairs as returned by
+        compute_gradients().
+      global_step: Optional Variable to increment by one after the
+        variables have been updated.
+      name: Optional name for the returned operation.  Default to the
+        name passed to the Optimizer constructor.
+
+    Returns:
+      A conditional 'Operation' that update both local and global variables or
+      just local variables
+
+    Raises:
+      ValueError: If the grads_and_vars is empty.
+      ValueError: If global step is not provided, the staleness cannot be
+        checked.
+    """
+
+    # update local variables
+    if not grads_and_vars:
+      raise ValueError("Must supply at least one variable")
+    if global_step is None:
+      raise ValueError("Global step is required")
+
+    apply_updates = self._opt.apply_gradients(grads_and_vars)
+    with ops.control_dependencies([apply_updates]):
+      local_update = state_ops.assign_add(
+          self._local_step, 1, name="local_step_update").op
+
+    # update global variables.
+    def _update_global_variables():  # pylint: disable=missing-docstring
+      local_vars = [v for g, v in grads_and_vars if g is not None]
+      global_vars = [self._local_2_global[v] for v in local_vars]
+      # sync queue
+      with ops.colocate_with(global_step):
+        sync_queue = data_flow_ops.FIFOQueue(
+            -1, [dtypes.bool], shapes=[[]], shared_name="sync_queue")
+      train_ops = []
+      aggregated_vars = []
+      with ops.name_scope(None, self._name + "/global"):
+        for var, gvar in zip(local_vars, global_vars):
+          # pylint: disable=protected-access
+          with ops.device(gvar.device):
+            if isinstance(var._ref(), ops.Tensor):
+              var_accum = data_flow_ops.ConditionalAccumulator(
+                  var.dtype,
+                  shape=var.get_shape(),
+                  shared_name=gvar.name + "/var_accum")
+              train_ops.append(
+                  var_accum.apply_grad(var._ref(), local_step=global_step))
+              aggregated_vars.append(var_accum.take_grad(self._num_worker))
+            else:
+              raise ValueError("Unknown local variable type!")
+            self._accumulator_list.append((var_accum, gvar.device))
+      # chief worker updates global vars and enqueues tokens to the sync queue
+      if self._is_chief:
+        update_ops = []
+        with ops.control_dependencies(train_ops):
+          for avg_var, gvar in zip(aggregated_vars, global_vars):
+            with ops.device(gvar.device):
+              update_ops.append(state_ops.assign(gvar, avg_var))
+          with ops.device(global_step.device):
+            update_ops.append(state_ops.assign_add(global_step, 1))
+        with ops.control_dependencies(update_ops), ops.device(
+            global_step.device):
+          tokens = array_ops.fill([self._num_worker - 1],
+                                  constant_op.constant(False))
+          sync_op = sync_queue.enqueue_many(tokens)
+      else:
+        with ops.control_dependencies(train_ops), ops.device(
+            global_step.device):
+          sync_op = sync_queue.dequeue()
+
+      with ops.control_dependencies([sync_op]):
+        local_update_op = self._local_vars_update(local_vars)
+      return local_update_op
+
+    with ops.control_dependencies([local_update]):
+      condition = math_ops.equal(
+          math_ops.mod(self._local_step, self._interval_steps), 0)
+      conditional_update = control_flow_ops.cond(
+          condition, _update_global_variables, control_flow_ops.no_op)
+
+    chief_init_ops = []
+    for accum, dev in self._accumulator_list:
+      with ops.device(dev):
+        chief_init_ops.append(
+            accum.set_global_step(global_step, name="SetGlobalStep"))
+    self._chief_init_op = control_flow_ops.group(*(chief_init_ops))
+
+    return conditional_update
+
+  def get_init_op(self):
+    """Returns the op.
+
+    This method lets all the local variables equal to the global
+    variables before the training begins.
+    """
+    return self._local_vars_update(variables.trainable_variables())
+
+  def make_session_run_hook(self):
+    """Creates a hook to handle ModelAverage ops such as initialization."""
+    return _ModelAverageOptimizerHook(self, self._is_chief)
+
+
+class _ModelAverageOptimizerHook(session_run_hook.SessionRunHook):  # pylint: disable=missing-docstring
+
+  def __init__(self, ma_optimizer, is_chief):
+    """Creates hook to handle ModelAverageOptimizer initialization ops.
+
+    Args:
+      ma_optimizer: `ModelAverageOptimizer` which this hook will initialize.
+      is_chief: `Bool`, whether is this a chief replica or not.
+    """
+    self._ma_optimizer = ma_optimizer
+    self._is_chief = is_chief
+
+  def begin(self):
+    self._local_init_op = variables.local_variables_initializer()
+    self._global_init_op = None
+    if self._is_chief:
+      self._global_init_op = variables.global_variables_initializer()
+      self._chief_init_op = self._ma_optimizer._chief_init_op  # pylint: disable=protected-access
+    self._variable_init_op = self._ma_optimizer.get_init_op()
diff --git a/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cca0a8a009456f266245fd9a638bfab371c9b34
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/model_average_optimizer_test.py
@@ -0,0 +1,198 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ModelAverageOptimizer."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import portpicker
+
+from tensorflow.contrib.opt.python.training import model_average_optimizer
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import device_setter
+from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import server_lib
+from tensorflow.python.training import training
+from tensorflow.python.training import training_util
+
+
+def create_local_cluster(num_workers, num_ps, protocol="grpc"):
+  """Create local GRPC servers and return them."""
+  worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
+  ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)]
+  cluster_dict = {
+      "worker": ["localhost:%s" % port for port in worker_ports],
+      "ps": ["localhost:%s" % port for port in ps_ports]
+  }
+  cs = server_lib.ClusterSpec(cluster_dict)
+
+  workers = [
+      server_lib.Server(
+          cs, job_name="worker", protocol=protocol, task_index=ix, start=True)
+      for ix in range(num_workers)
+  ]
+  ps_servers = [
+      server_lib.Server(
+          cs, job_name="ps", protocol=protocol, task_index=ix, start=True)
+      for ix in range(num_ps)
+  ]
+
+  return cluster_dict, workers, ps_servers
+
+
+# Creates the workers and return their sessions, graphs, train_ops.
+# Cheif worker will update at last
+def _get_workers(num_workers, steps, workers):
+  sessions = []
+  graphs = []
+  train_ops = []
+  for worker_id in range(num_workers):
+    graph = ops.Graph()
+    is_chief = (worker_id == 0)
+    with graph.as_default():
+      worker_device = "/job:worker/task:%d/cpu:0" % (worker_id)
+      ma_coustom = model_average_optimizer.ModelAverageCustomGetter(
+          worker_device=worker_device)
+      with variable_scope.variable_scope(
+          "", custom_getter=ma_coustom), ops.device(
+              device_setter.replica_device_setter(
+                  worker_device=worker_device,
+                  ps_device="/job:ps/task:0/cpu:0",
+                  ps_tasks=1)):
+
+        global_step = variables.Variable(0, name="global_step", trainable=False)
+        var_0 = variable_scope.get_variable(initializer=0.0, name="v0")
+        var_1 = variable_scope.get_variable(initializer=1.0, name="v1")
+
+      with ops.device("/job:worker/task:" + str(worker_id)):
+        if worker_id == 0:
+          grads_0 = constant_op.constant(-1.0)
+          grads_1 = constant_op.constant(-1.0)
+        else:
+          grads_0 = constant_op.constant(-2.0)
+          grads_1 = constant_op.constant(-2.0)
+        sgd_opt = gradient_descent.GradientDescentOptimizer(1.0)
+        opt = model_average_optimizer.ModelAverageOptimizer(
+            opt=sgd_opt,
+            num_worker=num_workers,
+            ma_custom_getter=ma_coustom,
+            is_chief=is_chief,
+            interval_steps=steps)
+        train_op = [
+            opt.apply_gradients([[grads_0, var_0], [grads_1, var_1]],
+                                global_step)
+        ]
+      easgd_hook = opt.make_session_run_hook()
+      # Creates MonitoredSession
+      sess = training.MonitoredTrainingSession(
+          workers[worker_id].target, hooks=[easgd_hook])
+
+    sessions.append(sess)
+    graphs.append(graph)
+    train_ops.append(train_op)
+  return sessions, graphs, train_ops
+
+
+class ModelAverageOptimizerTest(test.TestCase):
+  def _run(self, train_op, sess):
+    sess.run(train_op)
+
+  def test1Workers2Period(self):
+    num_workers = 2
+    steps = 2
+    num_ps = 1
+    _, workers, _ = create_local_cluster(
+        num_workers=num_workers, num_ps=num_ps)
+
+    sessions, graphs, train_ops = _get_workers(num_workers, steps, workers)
+
+    var_0 = graphs[0].get_tensor_by_name("v0:0")
+    var_1 = graphs[0].get_tensor_by_name("v1:0")
+    global_step = training_util.get_global_step(graphs[0])
+    global_var_0 = graphs[0].get_tensor_by_name(
+        model_average_optimizer.GLOBAL_VARIABLE_NAME + "/v0:0")
+    global_var_1 = graphs[0].get_tensor_by_name(
+        model_average_optimizer.GLOBAL_VARIABLE_NAME + "/v1:0")
+
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(global_var_0))
+    self.assertAllEqual(1.0, sessions[0].run(global_var_1))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    sessions[0].run(train_ops[0])
+    sessions[1].run(train_ops[1])
+
+    self.assertAllEqual(1.0, sessions[0].run(var_0))
+    self.assertAllEqual(2.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(global_var_0))
+    self.assertAllEqual(1.0, sessions[0].run(global_var_1))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    # iteration 2, global varibale update
+    thread_0 = self.checkedThread(
+        target=self._run, args=(train_ops[0], sessions[0]))
+    thread_1 = self.checkedThread(
+        target=self._run, args=(train_ops[1], sessions[1]))
+    thread_0.start()
+    thread_1.start()
+    thread_0.join()
+    thread_1.join()
+
+    self.assertAllEqual(3.0, sessions[0].run(var_0))
+    self.assertAllEqual(4.0, sessions[0].run(var_1))
+    self.assertAllEqual(3.0, sessions[0].run(global_var_0))
+    self.assertAllEqual(4.0, sessions[0].run(global_var_1))
+    self.assertAllEqual(1, sessions[0].run(global_step))
+
+    # iteration 3
+    sessions[0].run(train_ops[0])
+
+    self.assertAllEqual(4.0, sessions[0].run(var_0))
+    self.assertAllEqual(5.0, sessions[0].run(var_1))
+    self.assertAllEqual(3.0, sessions[0].run(global_var_0))
+    self.assertAllEqual(4.0, sessions[0].run(global_var_1))
+    self.assertAllEqual(1, sessions[0].run(global_step))
+
+  def testPS2TasksWithClusterSpecClass(self):
+    cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+    worker_device = "/job:worker/task:0"
+    ma_coustom = model_average_optimizer.ModelAverageCustomGetter(
+        worker_device=worker_device)
+    from tensorflow.python.training import device_setter
+    with ops.device(
+        device_setter.replica_device_setter(cluster=cluster_spec,
+                                            worker_device=worker_device,
+                                            ps_device="/job:ps")), \
+         variable_scope.variable_scope("", custom_getter=ma_coustom):
+      v = variable_scope.get_variable(initializer=[1, 2], name="v")
+      w = variable_scope.get_variable(initializer=[2, 1], name="w")
+      v_g, w_g = ma_coustom._local_2_global[v], ma_coustom._local_2_global[w]
+      self.assertDeviceEqual("/job:worker/task:0", v.device)
+      self.assertDeviceEqual("job:ps/task:0", v_g.device)
+      self.assertDeviceEqual("/job:worker/task:0", w.device)
+      self.assertDeviceEqual("job:ps/task:1", w_g.device)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/opt/python/training/moving_average_optimizer.py b/tensorflow/contrib/opt/python/training/moving_average_optimizer.py
index c48494585eb66c40e69a87439265b9cd08d51712..9ce50bfe1054072b315adecb87f1ba729dfe0d83 100644
--- a/tensorflow/contrib/opt/python/training/moving_average_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/moving_average_optimizer.py
@@ -83,14 +83,17 @@ class MovingAverageOptimizer(optimizer.Optimizer):
     self._optimizer = opt
     self._ema = moving_averages.ExponentialMovingAverage(
         average_decay, num_updates=num_updates)
-    self._variable_map = None
+    self._swapped_variable_name_map = None
     self._sequential_update = sequential_update
 
+  def compute_gradients(self, *args, **kwargs):
+    return self._optimizer.compute_gradients(*args, **kwargs)
+
   def apply_gradients(self, grads_and_vars, global_step=None, name=None):
     train_op = self._optimizer.apply_gradients(
         grads_and_vars, global_step=global_step, name=name)
     var_list = [x[1] for x in grads_and_vars if x[0] is not None]
-    self._variable_map = {}
+    self._swapped_variable_name_map = {}
     if self._sequential_update:
       with ops.control_dependencies([train_op]):
         ma_op = self._ema.apply(var_list)
@@ -99,9 +102,9 @@ class MovingAverageOptimizer(optimizer.Optimizer):
 
     for v in var_list:
       v_avg = self._ema.average(v)
-      self._variable_map[v.op.name] = v_avg
-      self._variable_map[v_avg.op.name] = v
-    return control_flow_ops.group(train_op, ma_op, name="train_with_avg")
+      self._swapped_variable_name_map[v.op.name] = v_avg.op.name
+      self._swapped_variable_name_map[v_avg.op.name] = v.op.name
+    return control_flow_ops.group(train_op, ma_op, name='train_with_avg')
 
   def swapping_saver(self, var_list=None, name='swapping_saver', **kwargs):
     """Create a saver swapping moving averages and variables.
@@ -126,22 +129,45 @@ class MovingAverageOptimizer(optimizer.Optimizer):
 
     Raises:
       RuntimeError: If apply_gradients or minimize has not been called before.
+      ValueError: If var_list is provided and contains some variables but not
+        their moving average counterpart.
     """
 
-    if self._variable_map is None:
+    if self._swapped_variable_name_map is None:
       raise RuntimeError('Must call apply_gradients or minimize before '
                          'creating the swapping_saver')
     if var_list is None:
       var_list = variables.global_variables()
     if not isinstance(var_list, dict):
       var_list = saver.BaseSaverBuilder.OpListToDict(var_list)
+
+    # OpListToDict converts variables to tensors. We make sure we can get
+    # the unique variable name for normal and resource vaiables.
+    def get_v_name(tensor):
+      if tensor.op.type == 'ReadVariableOp':
+        return tensor.op.inputs[0].op.name
+      else:
+        return tensor.op.name
+
+    v_name_to_tensor = {}
+    for tensor in six.itervalues(var_list):
+      v_name = get_v_name(tensor)
+      v_name_to_tensor[v_name] = tensor
+
     # Now swap variables and moving averages
     swapped_var_list = {}
-    for k, v in six.iteritems(var_list):
-      v_swap = self._variable_map.get(v.op.name, None)
-      if v_swap:
-        swapped_var_list[k] = v_swap
-      else:
-        swapped_var_list[k] = v
+    for k, tensor in six.iteritems(var_list):
+      v_name = get_v_name(tensor)
+      swapped_v_name = self._swapped_variable_name_map.get(v_name, None)
+      tensor_to_save = tensor
+      if swapped_v_name is not None:
+        if swapped_v_name in v_name_to_tensor:
+          tensor_to_save = v_name_to_tensor[swapped_v_name]
+        else:
+          raise ValueError(
+              ('Variable to swap %s is not part of variables to save. '
+               'This breaks MovingAverageOptimizer.') % swapped_v_name)
+      swapped_var_list[k] = tensor_to_save
+
     # Build the swapping saver.
     return saver.Saver(swapped_var_list, name=name, **kwargs)
diff --git a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
index a4ffbfe1c6bf8a63b10593e6c783047c99cad523..85e3e8d3791f2331ed249c0b7f67a3dbde4fca08 100644
--- a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
@@ -24,6 +24,10 @@ import six
 from tensorflow.contrib.opt.python.training import moving_average_optimizer
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import gradient_descent
@@ -33,13 +37,26 @@ from tensorflow.python.training import saver
 class MovingAverageOptimizerTest(test.TestCase):
 
   def testRun(self):
+    self._helpTestRun(use_resource=False)
+
+  def testRunUseResource(self):
+    # Test that MovingAverageOptimizer works with resource variables.
+    self._helpTestRun(use_resource=True)
+
+  def _helpTestRun(self, use_resource=False):
     for sequential_update in [True, False]:
       for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-        with self.test_session() as sess:
+        with self.test_session(graph=ops.Graph()) as sess:
           orig_val0 = [1.0, 2.0]
           orig_val1 = [3.0, 4.0]
-          var0 = variables.Variable(orig_val0, name='var0', dtype=dtype)
-          var1 = variables.Variable(orig_val1, name='var1', dtype=dtype)
+          var0 = variable_scope.get_variable(
+              'var0',
+              initializer=constant_op.constant(orig_val0, dtype=dtype),
+              use_resource=use_resource)
+          var1 = variable_scope.get_variable(
+              'var1',
+              initializer=constant_op.constant(orig_val1, dtype=dtype),
+              use_resource=use_resource)
           grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
           grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
 
@@ -52,22 +69,63 @@ class MovingAverageOptimizerTest(test.TestCase):
           save_path = os.path.join(save_dir, 'model')
           update = opt.apply_gradients(
               list(six.moves.zip([grads0, grads1], [var0, var1])))
+          global_vars = variables.global_variables()
+          ema_var0 = [
+              v for v in global_vars
+              if v.op.name == 'var0/ExponentialMovingAverage'
+          ][0]
+          ema_var1 = [
+              v for v in global_vars
+              if v.op.name == 'var1/ExponentialMovingAverage'
+          ][0]
+          perturb = control_flow_ops.group([
+              state_ops.assign_add(var0, [1.0, 1.0]),
+              state_ops.assign_add(var1, [2.0, 2.0]),
+              state_ops.assign_add(ema_var0, [3.0, 3.0]),
+              state_ops.assign_add(ema_var1, [4.0, 4.0])
+          ])
+
+          # Test taht saver with missing ema variables will fail.
+          with self.assertRaisesRegexp(ValueError, r'Variable to swap'):
+            opt.swapping_saver(var_list=[var0])
+
           train_saver = opt.swapping_saver()
+          train_saver_subset = opt.swapping_saver(var_list=[var0, ema_var0])
           inference_saver = saver.Saver()
           variables.global_variables_initializer().run()
           # Step 1.
           update.run()
-          val0 = var0.eval()
-          val1 = var1.eval()
           self.assertAllCloseAccordingToType([0.8, 1.8], var0.eval())
           self.assertAllCloseAccordingToType([2.98, 3.98], var1.eval())
+          if sequential_update:
+            self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
+            self.assertAllCloseAccordingToType([2.99, 3.99], ema_var1.eval())
           # Test that the swapping saver save/restore operation is identity.
           train_saver.save(sess, save_path)
           train_saver.restore(sess, save_path)
-          val0 = var0.eval()
-          val1 = var1.eval()
           self.assertAllCloseAccordingToType([0.8, 1.8], var0.eval())
           self.assertAllCloseAccordingToType([2.98, 3.98], var1.eval())
+          if sequential_update:
+            self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
+            self.assertAllCloseAccordingToType([2.99, 3.99], ema_var1.eval())
+          # Test that the subset saver saves the EMA variable as well.
+          if sequential_update:
+            subset_save_path = save_path + '_subset'
+            train_saver_subset.save(sess, subset_save_path)
+            perturb.run()
+            self.assertAllCloseAccordingToType([1.8, 2.8], var0.eval())
+            self.assertAllCloseAccordingToType([3.9, 4.9], ema_var0.eval())
+            self.assertAllCloseAccordingToType([4.98, 5.98], var1.eval())
+            self.assertAllCloseAccordingToType([6.99, 7.99], ema_var1.eval())
+            # Restoring should only restore var0 and ema_var0.
+            train_saver_subset.restore(sess, subset_save_path)
+            self.assertAllCloseAccordingToType([0.8, 1.8], var0.eval())
+            self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
+            self.assertAllCloseAccordingToType([4.98, 5.98], var1.eval())
+            self.assertAllCloseAccordingToType([6.99, 7.99], ema_var1.eval())
+            # Restore back to previou state.
+            train_saver.restore(sess, save_path)
+
           # If updates are parallel, this is not always true after the 1st step.
           if sequential_update:
             # Test that the normal saver will have the averaged variables.
@@ -116,6 +174,37 @@ class MovingAverageOptimizerTest(test.TestCase):
       with self.assertRaises(RuntimeError):
         _ = opt.swapping_saver([var])
 
+  def testCorrectOverride(self):
+
+    class WrapperOptimizer(gradient_descent.GradientDescentOptimizer):
+
+      def compute_gradients(self, *args, **kwargs):
+        self.compute_gradients_called = True
+        return super(WrapperOptimizer, self).compute_gradients(
+            *args, **kwargs)
+
+      def apply_gradients(self, *args, **kwargs):
+        self.apply_gradients_called = True
+        return super(WrapperOptimizer, self).apply_gradients(*args, **kwargs)
+
+    with self.test_session() as sess:
+      var = variables.Variable([1.2], name='var', dtype=dtypes.float32)
+      loss = var ** 2
+      wrapper_opt = WrapperOptimizer(learning_rate=2.0)
+      opt = moving_average_optimizer.MovingAverageOptimizer(wrapper_opt)
+      train_op = opt.minimize(loss)
+
+      # Check that both methods are called on the underlying optimizer.
+      self.assertTrue(wrapper_opt.compute_gradients_called)
+      self.assertTrue(wrapper_opt.apply_gradients_called)
+
+      # Run train_op once, and verify that we've updated the variable.
+      variables.global_variables_initializer().run()
+      sess.run(train_op)
+      var_value = sess.run(var)
+      # Started at 1.2, gradient is 2*1.2=2.4, lr=2, so should now be -3.6.
+      self.assertNear(-3.6, var_value, 1e-6)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/opt/python/training/nadam_optimizer.py b/tensorflow/contrib/opt/python/training/nadam_optimizer.py
index a4421ecfe6b0af9759c6aaa51d644f1211965b6a..44a8890cb107440b79cf8fbbdfcfda503b1c910f 100644
--- a/tensorflow/contrib/opt/python/training/nadam_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/nadam_optimizer.py
@@ -34,12 +34,13 @@ class NadamOptimizer(adam.AdamOptimizer):
   def _apply_dense(self, grad, var):
     m = self.get_slot(var, "m")
     v = self.get_slot(var, "v")
+    beta1_power, beta2_power = self._get_beta_accumulators()
     return training_ops.apply_adam(
         var,
         m,
         v,
-        math_ops.cast(self._beta1_power, var.dtype.base_dtype),
-        math_ops.cast(self._beta2_power, var.dtype.base_dtype),
+        math_ops.cast(beta1_power, var.dtype.base_dtype),
+        math_ops.cast(beta2_power, var.dtype.base_dtype),
         math_ops.cast(self._lr_t, var.dtype.base_dtype),
         math_ops.cast(self._beta1_t, var.dtype.base_dtype),
         math_ops.cast(self._beta2_t, var.dtype.base_dtype),
@@ -51,12 +52,13 @@ class NadamOptimizer(adam.AdamOptimizer):
   def _resource_apply_dense(self, grad, var):
     m = self.get_slot(var, "m")
     v = self.get_slot(var, "v")
+    beta1_power, beta2_power = self._get_beta_accumulators()
     return training_ops.resource_apply_adam(
         var.handle,
         m.handle,
         v.handle,
-        math_ops.cast(self._beta1_power, grad.dtype.base_dtype),
-        math_ops.cast(self._beta2_power, grad.dtype.base_dtype),
+        math_ops.cast(beta1_power, grad.dtype.base_dtype),
+        math_ops.cast(beta2_power, grad.dtype.base_dtype),
         math_ops.cast(self._lr_t, grad.dtype.base_dtype),
         math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
         math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
@@ -66,8 +68,9 @@ class NadamOptimizer(adam.AdamOptimizer):
         use_nesterov=True)
 
   def _apply_sparse_shared(self, grad, var, indices, scatter_add):
-    beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
-    beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
+    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
     lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
     beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
     beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
diff --git a/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py b/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py
index b0a257d264f83ae0a54cdc0e9265d6e7098b7b56..825c08a09a05894df1656a9bb6981f1862195244 100644
--- a/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py
@@ -21,12 +21,9 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.opt.python.training import nadam_optimizer
-from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/opt/python/training/powersign.py b/tensorflow/contrib/opt/python/training/powersign.py
index 7f7521581fd685c7a65119e2bd2b4af64aafcd69..828f3c51c9868c70d881fabb33995fb4e90c64e3 100644
--- a/tensorflow/contrib/opt/python/training/powersign.py
+++ b/tensorflow/contrib/opt/python/training/powersign.py
@@ -32,8 +32,8 @@ from tensorflow.python.training import training_ops
 class PowerSignOptimizer(optimizer.Optimizer):
   """Optimizer that implements the PowerSign update.
 
-  See  Neural Optimizer Search with Reinforcement Learning
-  [Bello et al., ICML2017].
+  See [Bello et al., ICML2017],
+  [Neural Optimizer Search with RL](https://arxiv.org/abs/1709.07417).
   """
 
   def __init__(self,
@@ -57,7 +57,7 @@ class PowerSignOptimizer(optimizer.Optimizer):
     ```
     t <- t + 1
     m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-    sign_decay <- sign_decay(t)
+    sign_decay <- sign_decay_fn(t)
     update <- base ** (sign_decay * sign(g) * sign(m)) * g
     variable <- variable - lr_t * update
     ```
@@ -73,10 +73,9 @@ class PowerSignOptimizer(optimizer.Optimizer):
       learning_rate: learning_rate used when taking a step.
       base: base used in optimizer.
       beta: decay used for computing the moving average m.
-      sign_decay_fn: decay function applied to the sign(g*m) quantity.
-          Takes global_step as an argument and returns the quantity to multiply
-          the sign(g*m) by.
-      use_locking: If True use locks for update operations.
+      sign_decay_fn: decay function applied to the sign(g) sign(m) quantity.
+          Takes global_step as an argument. See sign_decay.py for some examples.
+      use_locking: If True, use locks for update operations.
       name: Optional name for the operations created iwhen applying gradients.
         Defaults to "PowerSignOptimizer".
     """
diff --git a/tensorflow/contrib/periodic_resample/BUILD b/tensorflow/contrib/periodic_resample/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..bd9078ae76ee27ec26c09d1aa2012f871cbdf5e9
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/BUILD
@@ -0,0 +1,131 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "py_test",
+    "tf_gen_op_libs",
+    "tf_custom_op_library",
+    "tf_custom_op_py_library",
+    "tf_gen_op_wrapper_py",
+)
+
+cc_library(
+    name = "all_ops",
+    srcs = [":custom_op_sources"],
+    hdrs = [":custom_op_headers"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+    alwayslink = 1,
+)
+
+tf_custom_op_library(
+    name = "python/ops/_periodic_resample_op.so",
+    srcs = [
+        ":custom_op_headers",
+        ":custom_op_sources",
+    ],
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["array_ops"],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_periodic_resample_op_py",
+    out = "python/ops/gen_periodic_resample_op.py",
+    deps = [":array_ops_op_lib"],
+)
+
+tf_custom_op_py_library(
+    name = "periodic_resample_op_py",
+    srcs = ["python/ops/periodic_resample_op.py"],
+    dso = ["python/ops/_periodic_resample_op.so"],
+    kernels = [
+        ":array_ops_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":gen_periodic_resample_op_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:framework_for_generated_wrappers",
+    ],
+)
+
+py_library(
+    name = "init_py",
+    srcs = [
+        "__init__.py",
+        "python/__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    tags = [
+        "notap",
+    ],
+    deps = [
+        ":periodic_resample_op_py",
+    ],
+)
+
+py_test(
+    name = "periodic_resample_op_test",
+    srcs = ["python/kernel_tests/periodic_resample_op_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "notap",
+    ],
+    deps = [
+        ":init_py",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+# py_library(
+#     name = "periodic_resample_op_py",
+#     srcs = ["python/ops/periodic_resample_op.py"],
+#     data = ["python/ops/_periodic_resample_op.so"],
+#     srcs_version = "PY2AND3",
+# )
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+filegroup(
+    name = "custom_op_sources",
+    srcs = glob(
+        [
+            "ops/*.cc",
+            "kernels/*.cc",
+        ],
+        exclude = [
+            "ops/*_test.cc",
+            "kernels/*_test.cc",
+        ],
+    ),
+)
+
+filegroup(
+    name = "custom_op_headers",
+    srcs = glob(
+        [
+            "kernels/*.h",
+            "ops/*.h",
+        ],
+    ),
+)
diff --git a/tensorflow/contrib/periodic_resample/__init__.py b/tensorflow/contrib/periodic_resample/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fde9091b88f96da8f880ea341c8fd809b619c807
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/__init__.py
@@ -0,0 +1,27 @@
+# =============================================================================
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Custom op used by periodic_resample."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.periodic_resample.python.ops.periodic_resample_op import periodic_resample
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = ["periodic_resample"]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e18923c8aae74c66ce78f98eb5e615e99463af74
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc
@@ -0,0 +1,25 @@
+// =============================================================================
+// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#include "tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h"
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+
+REGISTER_KERNEL_BUILDER(Name("PeriodicResample").Device(DEVICE_CPU),
+                        PeriodicResampleOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ab588c45881c8f93b4c1bcdf7ccde39086a1ed7
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
@@ -0,0 +1,241 @@
+// =============================================================================
+// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#ifndef TENSORFLOW_KERNELS_PERIODICRESAMPLE_OP_H_
+#define TENSORFLOW_KERNELS_PERIODICRESAMPLE_OP_H_
+
+#include <cmath>
+#include <type_traits>
+#include <vector>
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace {
+
+template <class IndexVecT, class IndexT>
+IndexT compute_input_index(
+    IndexVecT* target_dimensions, const IndexT& output_index,
+    const IndexVecT& original_dimensions, const int& adjustable_dimension,
+    const std::vector<tensorflow::int64>& dimension_ceiling,
+    const std::vector<tensorflow::int64>& cumulative_dimensions, IndexT* result,
+    std::vector<IndexT>* output_indices, const int& rank) {
+  *result = 0;
+  output_indices->clear();
+
+  // un-rasterize the output index
+  auto last_reduced_i = output_index;
+  for (auto r = rank - 1; r >= 0; --r) {
+    (*output_indices)[r] = last_reduced_i % (*target_dimensions)[r];
+    last_reduced_i =
+        (last_reduced_i - (*output_indices)[r]) / (*target_dimensions)[r];
+  }
+
+  // rasterize the input index
+  IndexT last_index_factor = 1;
+  for (auto r = rank - 1; r >= 0; --r) {
+    IndexT index = 0;
+    if (r != adjustable_dimension)
+      index = (*output_indices)[r] / dimension_ceiling[r];
+    else {
+      for (int qi = 0; qi < rank; ++qi) {
+        if (qi == adjustable_dimension) continue;
+        index += cumulative_dimensions[qi] *
+                 ((*output_indices)[qi] % dimension_ceiling[qi]);
+      }
+      index *= (*target_dimensions)[adjustable_dimension];
+      index += (*output_indices)[r];
+    }
+    *result += last_index_factor * index;
+    last_index_factor *= original_dimensions[r];
+  }
+
+  return *result;
+}
+
+template <class InputDataT,
+          class IndexVecT>  // both types are needed here b/c IndexVecT and
+                            // InputDataT are not related
+                            void
+                            fill_periodic_tensor(
+                                tensorflow::OpKernelContext* context,
+                                const IndexVecT& desired_shape,
+                                const tensorflow::Tensor& input_tensor) {
+  // input is a strided array (last index is fastest, C-ordered)
+  auto input = input_tensor.flat<InputDataT>();
+  const int rank = input_tensor.dims();
+  // original and target dimensions
+  std::vector<tensorflow::int64> original_dimensions(rank),
+      target_dimensions(rank);
+  tensorflow::int64 total_size(input_tensor.NumElements()), new_sliced_size(1);
+  // factors by which original_dimensions increases/decreases w.r.t.
+  // target_dimensions
+  std::vector<tensorflow::int64> dimension_ceiling(rank),
+      cumulative_dimensions(rank);
+  // index of adjustable dimension
+  int adjustable_dimension;
+  tensorflow::TensorShape output_shape;
+
+  // requires that the rank of the input tensor and length of the desired shape
+  // are equal
+  OP_REQUIRES(context, rank == desired_shape.size(),
+              tensorflow::errors::InvalidArgument(
+                  "periodic_resample expects the rank of the input tensor, ",
+                  rank, ", to be the same as the length of the desired shape, ",
+                  desired_shape.size(), "."));
+
+  bool found = false;
+  const auto& input_tensor_shape = input_tensor.shape();
+
+  for (int i = 0; i < rank; ++i) {
+    // if (desired_shape(i) < 1) {
+    if (desired_shape[i] < 1) {
+      // only one index can be adjustable
+      OP_REQUIRES(context, !found,
+                  tensorflow::errors::InvalidArgument(
+                      "periodic_resample expects only "
+                      "one index to be marked as adjustable."));
+      adjustable_dimension = i;
+      found = true;
+    } else {
+      OP_REQUIRES(
+          context, desired_shape[i] >= input_tensor_shape.dim_size(i),
+          tensorflow::errors::InvalidArgument(
+              "periodic_resample expects the size of non-adjustable "
+              "dimensions be at least as large as size of input tensor."
+              " Dimension ",
+              i, " input tensor has size ", input_tensor_shape.dim_size(i),
+              ", desired shape has size ", desired_shape[i], "."));
+
+      // target_dimensions[i] = desired_shape(i);
+      target_dimensions[i] = desired_shape[i];
+      new_sliced_size *= target_dimensions[i];
+    }
+  }
+  // at least one index needs to be adjustable
+  OP_REQUIRES(context, found,
+              tensorflow::errors::InvalidArgument(
+                  "periodic_resample expects at least "
+                  "one index to be marked as adjustable."));
+
+  int count = 0;
+  for (const auto dim_info : input_tensor.shape()) {
+    original_dimensions[count] = dim_info.size;
+    ++count;
+  }
+
+  target_dimensions[adjustable_dimension] = total_size / new_sliced_size;
+
+  count = 0;
+  for (int i = 0; i < input_tensor.shape().dims(); ++i) {
+    dimension_ceiling[count] = tensorflow::int64(std::ceil(
+        float(target_dimensions[count]) / float(original_dimensions[count])));
+    if (count == 0)
+      cumulative_dimensions[count] = 1;
+    else
+      cumulative_dimensions[count] =
+          cumulative_dimensions[count - 1] * dimension_ceiling[count - 1];
+    ++count;
+  }
+
+  // ensure that the new dimension is greater than zero
+  OP_REQUIRES(context, target_dimensions[adjustable_dimension] > 0,
+              tensorflow::errors::InvalidArgument(
+                  "periodic_resample found that the "
+                  "adjustable dimension, ",
+                  adjustable_dimension, ", isn't greater than zero, ",
+                  target_dimensions[adjustable_dimension], "."));
+  for (int i = 0; i < rank; ++i) {
+    output_shape.AddDim(target_dimensions[i]);
+  }
+  const auto new_size =
+      new_sliced_size * target_dimensions[adjustable_dimension];
+
+  // Create an output tensor and attach it to the current context
+  tensorflow::Tensor* output_tensor = nullptr;
+  OP_REQUIRES_OK(context,
+                 context->allocate_output(0, output_shape, &output_tensor));
+  auto output = output_tensor->flat<InputDataT>();
+
+  // memory is allocated for these variables outside the inner loop for
+  // efficiency (although, I could create a separate class scope for
+  // this purpose instead)
+  tensorflow::int64 result = 0;
+  std::vector<tensorflow::int64> output_indices(target_dimensions.size());
+
+  // Fill output tensor with periodically resampled input tensor values
+  for (tensorflow::int64 output_index = 0; output_index < new_size;
+       ++output_index) {
+    output(output_index) = input(compute_input_index(
+        &target_dimensions, output_index, original_dimensions,
+        adjustable_dimension, dimension_ceiling, cumulative_dimensions, &result,
+        &output_indices, rank));
+  }
+}
+
+void create_output_tensor(
+    tensorflow::OpKernelContext* context,
+    const tensorflow::Tensor& input_tensor,
+    const tensorflow::DataType& input_tensor_type,
+    const tensorflow::PartialTensorShape& desired_shape_tensor) {
+  auto desired_shape = desired_shape_tensor.dim_sizes();
+
+  // obligatory type switch
+  switch (input_tensor_type) {
+    case tensorflow::DataTypeToEnum<float>::value:
+      fill_periodic_tensor<float>(context, desired_shape, input_tensor);
+      break;
+    case tensorflow::DataTypeToEnum<double>::value:
+      fill_periodic_tensor<double>(context, desired_shape, input_tensor);
+      break;
+    case tensorflow::DataTypeToEnum<tensorflow::int32>::value:
+      fill_periodic_tensor<tensorflow::int32>(context, desired_shape,
+                                              input_tensor);
+      break;
+    case tensorflow::DataTypeToEnum<tensorflow::int64>::value:
+      fill_periodic_tensor<tensorflow::int64>(context, desired_shape,
+                                              input_tensor);
+      break;
+    default:;
+  }
+}
+
+}  // namespace
+
+class PeriodicResampleOp : public tensorflow::OpKernel {
+ public:
+  explicit PeriodicResampleOp(tensorflow::OpKernelConstruction* context)
+      : tensorflow::OpKernel(context) {
+    // Get the desired shape
+    OP_REQUIRES_OK(context, context->GetAttr("shape", &desired_shape));
+  }
+
+  void Compute(tensorflow::OpKernelContext* context) override {
+    // Grab the input tensor
+    const tensorflow::Tensor& input_tensor = context->input(0);
+    const tensorflow::DataType input_tensor_type = context->input_dtype(0);
+
+    create_output_tensor(context, input_tensor, input_tensor_type,
+                         desired_shape);
+  }
+
+ private:
+  tensorflow::PartialTensorShape desired_shape;
+};
+
+#endif  // TENSORFLOW_KERNELS_PERIODICRESAMPLE_OP_H_
diff --git a/tensorflow/contrib/periodic_resample/ops/array_ops.cc b/tensorflow/contrib/periodic_resample/ops/array_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..82bd79695646e3673c2c78ad99dd2bd200fc2fbf
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/ops/array_ops.cc
@@ -0,0 +1,104 @@
+// =============================================================================
+// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("PeriodicResample")
+    .Attr("T: numbertype")
+    .Input("values: T")
+    .Attr("shape: shape")
+    .Output("output: T")
+    .SetShapeFn(shape_inference::ExplicitShape)
+    .Doc(R"doc(
+Periodically resample elements of a tensor to conform to `shape`.
+
+This function implements a slightly more generic version of the subpixel
+convolutions found in this [paper](https://arxiv.org/abs/1609.05158).
+
+The formula for computing the elements in the `output` tensor is as follows:
+
+  `T` = `values` tensor of rank `R`
+
+  `S` = desired `shape` of output tensor (vector of length `R`)
+
+  `P` = `output` tensor of rank `R`
+
+  \\((T_1,\\ldots,T_R)\\) = shape(`T`)
+
+  \\([S_1,\\ldots,S_q,\\ldots,S_R]\\) = elements of vector `S`
+
+  A single element in `S` is left unspecified (denoted \\(S_q=-1\\)).
+
+  Let \\(f_i\\) denote the (possibly non-integer) factor that relates the original
+  dimension to the desired dimensions, \\(S_i=f_i T_i\\), for \\(i\\neq q\\) where
+  \\(f_i>0\\).
+
+  Define the following:
+
+  \\(g_i=\\lceil f_i\\rceil\\)
+
+  \\(t=\\prod_i T_i\\)
+
+  \\(s=\\prod_{i\\neq q} S_i\\)
+
+  \\(S_q\\) can then be defined by \\(S_q=\\lfloor t/s\\rfloor\\).
+  The elements of the resulting tensor are defined as
+
+  \\(P_{s_1,\\ldots,s_R}=T_{h_1,\\ldots,h_q,\\ldots,h_R}\\).
+
+  The \\(h_i\\) (\\(i\\neq q\\)) are defined by \\(h_i=\\lfloor s_i/g_i\\rfloor\\).
+
+  \\(h_q=S_q\\sum_{j\\neq q}^{q-1}G_j \\mathrm{mod}(s_j,g_j) + s_q\\), where
+  \\(G_j=\\prod_{i}^{j-1}g_i\\) (\\(G_0=1\\)).
+
+One drawback of this method is that whenever the output dimensions are slightly
+less than integer multiples of the input dimensions, many of the tensor elements
+are repeated in an inefficient way. This is resolved by specifying that all
+desired dimensions are integer multiples of the input tensor.
+
+For example:
+
+```prettyprint
+`input` is [[ 0  1  2  3]
+            [ 4  5  6  7]
+            [ 8  9 10 11]]
+
+tf.periodic_resample(input, [6, None]) ==> [[ 0  1]
+                                            [ 2  3]
+                                            [ 4  5]
+                                            [ 6  7]
+                                            [ 8  9]
+                                            [10 11]]
+```
+
+values: The tensor of rank `R` to periodic_resample
+shape: A 1-D tensor representing the desired shape of the output tensor.
+  Exactly one element of this tensor must have the value `None` which represents
+  that this dimension of `values` can be adjusted downward in order to
+  accommodate increases in other dimensions. The specified sizes of the
+  non-adjustable dimensions must by at least as large as in the `values` tensor.
+output: Periodically resampled tensor that has dimensions specified as in
+  `shape` except that the dimension specified as `None` will be minimally
+  decreased as necessary.
+
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/periodic_resample/python/__init__.py b/tensorflow/contrib/periodic_resample/python/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8b6ead0f594ad23e73901254857313635fbd1c5
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/python/__init__.py
@@ -0,0 +1,20 @@
+# =============================================================================
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Public API of periodic_resample."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a25de55e18b223db2b724aafb54b18d8f48a5baa
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py
@@ -0,0 +1,108 @@
+# =============================================================================
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy
+
+from tensorflow.contrib.periodic_resample import periodic_resample
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+
+
+class PeriodicResampleTest(test_util.TensorFlowTestCase):
+
+  def testPeriodicResampleBasic2D(self):
+
+    input_tensor = numpy.arange(12).reshape((3, 4))
+    desired_shape = numpy.array([6, None])
+    output_tensor = input_tensor.reshape((6, 2))
+
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      result = periodic_resample(input_tensor, desired_shape).eval()
+      self.assertAllEqual(result, output_tensor)
+
+  def testPeriodicResampleTruncatedBasic2D(self):
+
+    input_tensor = numpy.arange(12).reshape((3, 4))
+    desired_shape = numpy.array([5, None])
+    output_tensor = input_tensor.reshape((6, 2))[:-1]
+
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      result = periodic_resample(input_tensor, desired_shape).eval()
+      self.assertAllEqual(result, output_tensor)
+
+  def testPeriodicResampleBasic3D(self):
+
+    input_tensor = numpy.arange(2 * 2 * 4).reshape((2, 2, 4))
+    desired_shape = numpy.array([4, 4, None])
+    output_tensor = numpy.array([[[0], [2], [4], [6]], [[1], [3], [5], [7]],
+                                 [[8], [10], [12], [14]], [[9], [11], [13],
+                                                           [15]]])
+
+    # NOTE: output_tensor != input_tensor.reshape((4, 4, -1))
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      result = periodic_resample(input_tensor, desired_shape).eval()
+      # input_tensor[0, 0, 0] == result[0, 0, 0]
+      # input_tensor[0, 0, 1] == result[1, 0, 0]
+      # input_tensor[0, 0, 2] == result[0, 1, 0]
+      # input_tensor[0, 0, 3] == result[1, 1, 0]
+      self.assertAllEqual(result, output_tensor)
+
+  def testPeriodicResampleBasic4D(self):
+
+    input_tensor = numpy.arange(2 * 2 * 2 * 8).reshape((2, 2, 2, 8))
+    desired_shape = numpy.array([4, 4, 4, None])
+    output_tensor = numpy.array(
+        [[[[0], [4], [8], [12]], [[2], [6], [10], [14]],
+          [[16], [20], [24], [28]], [[18], [22], [26], [30]]],
+         [[[1], [5], [9], [13]], [[3], [7], [11], [15]], [[17], [21], [25],
+                                                          [29]],
+          [[19], [23], [27],
+           [31]]], [[[32], [36], [40], [44]], [[34], [38], [42], [46]],
+                    [[48], [52], [56], [60]], [[50], [54], [58], [62]]],
+         [[[33], [37], [41], [45]], [[35], [39], [43], [47]],
+          [[49], [53], [57], [61]], [[51], [55], [59], [63]]]])
+
+    # NOTE: output_tensor != input_tensor.reshape((4, 4, 4, -1))
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      result = periodic_resample(input_tensor, desired_shape).eval()
+      self.assertAllEqual(result, output_tensor)
+
+  def testPeriodicResampleErrors(self):
+    input_tensor = numpy.zeros(shape=[1, 2, 2, 4])
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          'Dimension 3 input tensor has size 4, desired shape has size 1'):
+        periodic_resample(input_tensor, [None, 4, 4, 1]).eval()
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          '4, to be the same as the length of the desired shape, 3'):
+        periodic_resample(input_tensor, [None, 4, 4]).eval()
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..348623d8f8d0c2ed60f559eca281343722038100
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py
@@ -0,0 +1,31 @@
+# =============================================================================
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import
+from tensorflow.contrib.periodic_resample.python.ops import gen_periodic_resample_op
+
+from tensorflow.contrib.periodic_resample.python.ops.gen_periodic_resample_op import periodic_resample
+
+from tensorflow.contrib.util import loader
+from tensorflow.python.platform import resource_loader
+# pylint: enable=unused-import
+
+_periodic_resample_op = loader.load_op_library(
+    resource_loader.get_path_to_datafile('_periodic_resample_op.so'))
diff --git a/tensorflow/contrib/pi_examples/camera/camera.cc b/tensorflow/contrib/pi_examples/camera/camera.cc
index cb20661662922a0a160272fcd02a9fe6f9daf6fb..8110185ea8d4684fbc6b4ce54a9b192a020a97f8 100644
--- a/tensorflow/contrib/pi_examples/camera/camera.cc
+++ b/tensorflow/contrib/pi_examples/camera/camera.cc
@@ -17,16 +17,16 @@ limitations under the License.
 
 #include <errno.h>
 #include <fcntl.h>
-#include <fstream>
 #include <libv4l2.h>
 #include <linux/videodev2.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/ioctl.h>
-#include <sys/types.h>
-#include <sys/time.h>
 #include <sys/mman.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <fstream>
 #include <vector>
 
 #include "tensorflow/core/framework/graph.pb.h"
@@ -46,10 +46,10 @@ limitations under the License.
 
 // These are all common classes it's handy to reference with no namespace.
 using tensorflow::Flag;
-using tensorflow::Tensor;
+using tensorflow::int32;
 using tensorflow::Status;
 using tensorflow::string;
-using tensorflow::int32;
+using tensorflow::Tensor;
 
 // Used to store the memory-mapped buffers we use for capture.
 struct CameraBuffer {
diff --git a/tensorflow/contrib/pi_examples/label_image/label_image.cc b/tensorflow/contrib/pi_examples/label_image/label_image.cc
index 0b18045789f3a87ceb228033407d6b696bdb33f6..c6935a093f728353caeeb79a9ed85c957d87f066 100644
--- a/tensorflow/contrib/pi_examples/label_image/label_image.cc
+++ b/tensorflow/contrib/pi_examples/label_image/label_image.cc
@@ -23,9 +23,9 @@ limitations under the License.
 //
 // Full build instructions are at tensorflow/contrib/pi_examples/README.md.
 
-#include <stdio.h>
 #include <jpeglib.h>
 #include <setjmp.h>
+#include <stdio.h>
 #include <fstream>
 #include <vector>
 
@@ -46,10 +46,10 @@ limitations under the License.
 
 // These are all common classes it's handy to reference with no namespace.
 using tensorflow::Flag;
-using tensorflow::Tensor;
+using tensorflow::int32;
 using tensorflow::Status;
 using tensorflow::string;
-using tensorflow::int32;
+using tensorflow::Tensor;
 
 // Takes a file name, and loads a list of labels from it, one per line, and
 // returns a vector of the strings. It pads with empty strings so the length
@@ -77,23 +77,22 @@ Status ReadLabelsFile(string file_name, std::vector<string>* result,
 // Error handling for JPEG decoding.
 void CatchError(j_common_ptr cinfo) {
   (*cinfo->err->output_message)(cinfo);
-  jmp_buf *jpeg_jmpbuf = reinterpret_cast<jmp_buf *>(cinfo->client_data);
+  jmp_buf* jpeg_jmpbuf = reinterpret_cast<jmp_buf*>(cinfo->client_data);
   jpeg_destroy(cinfo);
   longjmp(*jpeg_jmpbuf, 1);
 }
 
 // Decompresses a JPEG file from disk.
 Status LoadJpegFile(string file_name, std::vector<tensorflow::uint8>* data,
-		    int* width, int* height, int* channels) {
+                    int* width, int* height, int* channels) {
   struct jpeg_decompress_struct cinfo;
-  FILE * infile;
+  FILE* infile;
   JSAMPARRAY buffer;
   int row_stride;
 
   if ((infile = fopen(file_name.c_str(), "rb")) == NULL) {
     LOG(ERROR) << "Can't open " << file_name;
-    return tensorflow::errors::NotFound("JPEG file ", file_name,
-					" not found");
+    return tensorflow::errors::NotFound("JPEG file ", file_name, " not found");
   }
 
   struct jpeg_error_mgr jerr;
@@ -116,10 +115,11 @@ Status LoadJpegFile(string file_name, std::vector<tensorflow::uint8>* data,
   data->resize((*height) * (*width) * (*channels));
 
   row_stride = cinfo.output_width * cinfo.output_components;
-  buffer = (*cinfo.mem->alloc_sarray)
-    ((j_common_ptr) &cinfo, JPOOL_IMAGE, row_stride, 1);
+  buffer = (*cinfo.mem->alloc_sarray)((j_common_ptr)&cinfo, JPOOL_IMAGE,
+                                      row_stride, 1);
   while (cinfo.output_scanline < cinfo.output_height) {
-    tensorflow::uint8* row_address = &((*data)[cinfo.output_scanline * row_stride]);
+    tensorflow::uint8* row_address =
+        &((*data)[cinfo.output_scanline * row_stride]);
     jpeg_read_scanlines(&cinfo, buffer, 1);
     memcpy(row_address, buffer[0], row_stride);
   }
@@ -141,24 +141,25 @@ Status ReadTensorFromImageFile(string file_name, const int wanted_height,
   int image_height;
   int image_channels;
   TF_RETURN_IF_ERROR(LoadJpegFile(file_name, &image_data, &image_width,
-				  &image_height, &image_channels));
-  LOG(INFO) << "Loaded JPEG: " << image_width << "x" << image_height
-	    << "x" << image_channels;
+                                  &image_height, &image_channels));
+  LOG(INFO) << "Loaded JPEG: " << image_width << "x" << image_height << "x"
+            << image_channels;
   const int wanted_channels = 3;
   if (image_channels < wanted_channels) {
-    return tensorflow::errors::FailedPrecondition("Image needs to have at least ",
-						  wanted_channels, " but only has ",
-						  image_channels);
+    return tensorflow::errors::FailedPrecondition(
+        "Image needs to have at least ", wanted_channels, " but only has ",
+        image_channels);
   }
-  // In these loops, we convert the eight-bit data in the image into float, resize
-  // it using bilinear filtering, and scale it numerically to the float range that
-  // the model expects (given by input_mean and input_std).
+  // In these loops, we convert the eight-bit data in the image into float,
+  // resize it using bilinear filtering, and scale it numerically to the float
+  // range that the model expects (given by input_mean and input_std).
   tensorflow::Tensor image_tensor(
-      tensorflow::DT_FLOAT, tensorflow::TensorShape(
-      {1, wanted_height, wanted_width, wanted_channels}));
+      tensorflow::DT_FLOAT,
+      tensorflow::TensorShape(
+          {1, wanted_height, wanted_width, wanted_channels}));
   auto image_tensor_mapped = image_tensor.tensor<float, 4>();
   tensorflow::uint8* in = image_data.data();
-  float *out = image_tensor_mapped.data();
+  float* out = image_tensor_mapped.data();
   const size_t image_rowlen = image_width * image_channels;
   const float width_scale = static_cast<float>(image_width) / wanted_width;
   const float height_scale = static_cast<float>(image_height) / wanted_height;
@@ -166,35 +167,37 @@ Status ReadTensorFromImageFile(string file_name, const int wanted_height,
     const float in_y = y * height_scale;
     const int top_y_index = static_cast<int>(floorf(in_y));
     const int bottom_y_index =
-      std::min(static_cast<int>(ceilf(in_y)), (image_height - 1));
+        std::min(static_cast<int>(ceilf(in_y)), (image_height - 1));
     const float y_lerp = in_y - top_y_index;
     tensorflow::uint8* in_top_row = in + (top_y_index * image_rowlen);
     tensorflow::uint8* in_bottom_row = in + (bottom_y_index * image_rowlen);
-    float *out_row = out + (y * wanted_width * wanted_channels);
+    float* out_row = out + (y * wanted_width * wanted_channels);
     for (int x = 0; x < wanted_width; ++x) {
       const float in_x = x * width_scale;
       const int left_x_index = static_cast<int>(floorf(in_x));
       const int right_x_index =
-	std::min(static_cast<int>(ceilf(in_x)), (image_width - 1));
+          std::min(static_cast<int>(ceilf(in_x)), (image_width - 1));
       tensorflow::uint8* in_top_left_pixel =
-	in_top_row + (left_x_index * wanted_channels);
+          in_top_row + (left_x_index * wanted_channels);
       tensorflow::uint8* in_top_right_pixel =
-	in_top_row + (right_x_index * wanted_channels);
+          in_top_row + (right_x_index * wanted_channels);
       tensorflow::uint8* in_bottom_left_pixel =
-	in_bottom_row + (left_x_index * wanted_channels);
+          in_bottom_row + (left_x_index * wanted_channels);
       tensorflow::uint8* in_bottom_right_pixel =
-	in_bottom_row + (right_x_index * wanted_channels);
+          in_bottom_row + (right_x_index * wanted_channels);
       const float x_lerp = in_x - left_x_index;
-      float *out_pixel = out_row + (x * wanted_channels);
+      float* out_pixel = out_row + (x * wanted_channels);
       for (int c = 0; c < wanted_channels; ++c) {
-	const float top_left((in_top_left_pixel[c] - input_mean) / input_std);
-	const float top_right((in_top_right_pixel[c] - input_mean) / input_std);
-	const float bottom_left((in_bottom_left_pixel[c] - input_mean) / input_std);
-	const float bottom_right((in_bottom_right_pixel[c] - input_mean) / input_std);
-	const float top = top_left + (top_right - top_left) * x_lerp;
-	const float bottom =
-	  bottom_left + (bottom_right - bottom_left) * x_lerp;
-	out_pixel[c] = top + (bottom - top) * y_lerp;
+        const float top_left((in_top_left_pixel[c] - input_mean) / input_std);
+        const float top_right((in_top_right_pixel[c] - input_mean) / input_std);
+        const float bottom_left((in_bottom_left_pixel[c] - input_mean) /
+                                input_std);
+        const float bottom_right((in_bottom_right_pixel[c] - input_mean) /
+                                 input_std);
+        const float top = top_left + (top_right - top_left) * x_lerp;
+        const float bottom =
+            bottom_left + (bottom_right - bottom_left) * x_lerp;
+        out_pixel[c] = top + (bottom - top) * y_lerp;
       }
     }
   }
@@ -233,10 +236,10 @@ Status GetTopLabels(const std::vector<Tensor>& outputs, int how_many_labels,
     scores.push_back(std::pair<int, float>({i, unsorted_scores_flat(i)}));
   }
   std::sort(scores.begin(), scores.end(),
-	    [](const std::pair<int, float> &left,
-	       const std::pair<int, float> &right) {
-	      return left.second > right.second;
-	    });
+            [](const std::pair<int, float>& left,
+               const std::pair<int, float>& right) {
+              return left.second > right.second;
+            });
   scores.resize(how_many_labels);
   Tensor sorted_indices(tensorflow::DT_INT32, {scores.size()});
   Tensor sorted_scores(tensorflow::DT_FLOAT, {scores.size()});
diff --git a/tensorflow/contrib/predictor/BUILD b/tensorflow/contrib/predictor/BUILD
index 82cd7b4c8aeb64cf461d9244c5aaf32a91691a5a..a80f060b91df3b6d5e2ca9ff63c721382f0cbb0a 100644
--- a/tensorflow/contrib/predictor/BUILD
+++ b/tensorflow/contrib/predictor/BUILD
@@ -136,6 +136,18 @@ py_test(
     ],
 )
 
+py_test(
+    name = "predictor_factories_test",
+    srcs = ["predictor_factories_test.py"],
+    data = [":test_export_dir"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":predictor_factories",
+        ":testing_common",
+    ],
+)
+
 py_test(
     name = "core_estimator_predictor_test",
     srcs = ["core_estimator_predictor_test.py"],
diff --git a/tensorflow/contrib/predictor/core_estimator_predictor.py b/tensorflow/contrib/predictor/core_estimator_predictor.py
index bd5174aef81a84488c896d259de83d0714745fee..d78d94c2699b14c80e7decee2181d190a6d91f99 100644
--- a/tensorflow/contrib/predictor/core_estimator_predictor.py
+++ b/tensorflow/contrib/predictor/core_estimator_predictor.py
@@ -68,10 +68,10 @@ class CoreEstimatorPredictor(predictor.Predictor):
       serving_input_receiver = serving_input_receiver_fn()
       signature_def = _get_signature_def(
           serving_input_receiver, estimator, output_key)
-      checkpoint_path = estimator.model_dir
+      checkpoint_dir = estimator.model_dir
       self._session = monitored_session.MonitoredSession(
           session_creator=monitored_session.ChiefSessionCreator(
-              checkpoint_filename_with_path=checkpoint_path))
+              checkpoint_dir=checkpoint_dir))
 
     feed_tensor_info = signature_def.inputs
     self._feed_tensors = {k: self._graph.get_tensor_by_name(v.name)
diff --git a/tensorflow/contrib/predictor/predictor_factories.py b/tensorflow/contrib/predictor/predictor_factories.py
index e3f30d917d637d2e2d821a727e12b8d0b54942df..04b5d5bdf158dc6a478d7a24b538c75d1dca8d45 100644
--- a/tensorflow/contrib/predictor/predictor_factories.py
+++ b/tensorflow/contrib/predictor/predictor_factories.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Factory functions for `Predictor`s."""
 
 from __future__ import absolute_import
@@ -22,6 +21,8 @@ from __future__ import print_function
 from tensorflow.contrib.predictor import contrib_estimator_predictor
 from tensorflow.contrib.predictor import core_estimator_predictor
 from tensorflow.contrib.predictor import saved_model_predictor
+
+from tensorflow.contrib.learn.python.learn.estimators import estimator as contrib_estimator
 from tensorflow.python.estimator import estimator as core_estimator
 
 
@@ -59,9 +60,9 @@ def from_contrib_estimator(estimator,
   return contrib_estimator_predictor.ContribEstimatorPredictor(
       estimator,
       prediction_input_fn,
-      input_alternative_key,
-      output_alternative_key,
-      graph)
+      input_alternative_key=input_alternative_key,
+      output_alternative_key=output_alternative_key,
+      graph=graph)
 
 
 def from_estimator(estimator,
@@ -86,16 +87,13 @@ def from_estimator(estimator,
     TypeError: if `estimator` is a contrib `Estimator` instead of a core
       `Estimator`.
   """
-  if isinstance(estimator, estimator.Estimator):
+  if isinstance(estimator, contrib_estimator.Estimator):
     raise TypeError('Espected estimator to be of type '
                     'tf.python.estimator.Estimator, but got type '
                     'tf.contrib.learn.Estimator. You likely want to call '
                     'from_contrib_estimator.')
   return core_estimator_predictor.CoreEstimatorPredictor(
-      estimator,
-      serving_input_receiver_fn,
-      output_key,
-      graph)
+      estimator, serving_input_receiver_fn, output_key=output_key, graph=graph)
 
 
 def from_saved_model(export_dir,
@@ -125,8 +123,9 @@ def from_saved_model(export_dir,
     ValueError: More than one of `signature_def_key` and `signature_def` is
       specified.
   """
-  return saved_model_predictor.SavedModelPredictor(export_dir,
-                                                   signature_def_key,
-                                                   signature_def,
-                                                   tags,
-                                                   graph)
+  return saved_model_predictor.SavedModelPredictor(
+      export_dir,
+      signature_def_key=signature_def_key,
+      signature_def=signature_def,
+      tags=tags,
+      graph=graph)
diff --git a/tensorflow/contrib/predictor/predictor_factories_test.py b/tensorflow/contrib/predictor/predictor_factories_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..578d9424b25dd38f1d77a267d1fdf1ff9ff2da88
--- /dev/null
+++ b/tensorflow/contrib/predictor/predictor_factories_test.py
@@ -0,0 +1,75 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for predictor.predictor_factories."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.predictor import predictor_factories
+from tensorflow.contrib.predictor import testing_common
+from tensorflow.python.platform import test
+
+MODEL_DIR_NAME = 'contrib/predictor/test_export_dir'
+
+
+class PredictorFactoriesTest(test.TestCase):
+
+  @classmethod
+  def setUpClass(cls):
+    # Load a saved model exported from the arithmetic `Estimator`.
+    # See `testing_common.py`.
+    cls._export_dir = test.test_src_dir_path(MODEL_DIR_NAME)
+
+  def testFromSavedModel(self):
+    """Test loading from_saved_model."""
+    predictor_factories.from_saved_model(self._export_dir)
+
+  def testFromSavedModelWithTags(self):
+    """Test loading from_saved_model with tags."""
+    predictor_factories.from_saved_model(self._export_dir, tags='serve')
+
+  def testFromSavedModelWithBadTags(self):
+    """Test that loading fails for bad tags."""
+    bad_tags_regex = ('.*? could not be found in SavedModel')
+    with self.assertRaisesRegexp(RuntimeError, bad_tags_regex):
+      predictor_factories.from_saved_model(self._export_dir, tags='bad_tag')
+
+  def testFromContribEstimator(self):
+    estimator = testing_common.get_arithmetic_estimator(core=False)
+    input_fn = testing_common.get_arithmetic_input_fn(core=False)
+    predictor_factories.from_contrib_estimator(
+        estimator, input_fn, output_alternative_key='sum')
+
+  def testFromContribEstimatorWithCoreEstimatorRaises(self):
+    estimator = testing_common.get_arithmetic_estimator(core=True)
+    input_fn = testing_common.get_arithmetic_input_fn(core=True)
+    with self.assertRaises(TypeError):
+      predictor_factories.from_contrib_estimator(estimator, input_fn)
+
+  def testFromCoreEstimator(self):
+    estimator = testing_common.get_arithmetic_estimator(core=True)
+    input_fn = testing_common.get_arithmetic_input_fn(core=True)
+    predictor_factories.from_estimator(estimator, input_fn)
+
+  def testFromCoreEstimatorWithContribEstimatorRaises(self):
+    estimator = testing_common.get_arithmetic_estimator(core=False)
+    input_fn = testing_common.get_arithmetic_input_fn(core=False)
+    with self.assertRaises(TypeError):
+      predictor_factories.from_estimator(estimator, input_fn)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/BUILD b/tensorflow/contrib/py2tf/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..d91220f6ddb859ff52d4e5853948cb667981009b
--- /dev/null
+++ b/tensorflow/contrib/py2tf/BUILD
@@ -0,0 +1,31 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "py2tf",
+    srcs = [
+        "__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/contrib/py2tf/impl",
+        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/contrib/py2tf/utils",
+        "@gast_archive//:gast",
+        "@six_archive//:six",
+    ],
+)
diff --git a/tensorflow/contrib/py2tf/README.md b/tensorflow/contrib/py2tf/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cd50675ad57316b9c749c137e6acd30b91c10073
--- /dev/null
+++ b/tensorflow/contrib/py2tf/README.md
@@ -0,0 +1,4 @@
+# Py2TF
+
+A compiler for generating TensorFlow numeric and control flow ops from Python
+code.
diff --git a/tensorflow/contrib/py2tf/__init__.py b/tensorflow/contrib/py2tf/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..379fa7fd5c2a22b5b16a21cca8c2ea8afdcaeefa
--- /dev/null
+++ b/tensorflow/contrib/py2tf/__init__.py
@@ -0,0 +1,36 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Py2TF compiles Python code into equivalent TensorFlow code.
+
+Equivalent here means that they have the same effect when executed.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf import utils
+from tensorflow.contrib.py2tf.impl.api import convert
+from tensorflow.contrib.py2tf.impl.api import graph_ready
+from tensorflow.contrib.py2tf.impl.api import to_code
+from tensorflow.contrib.py2tf.impl.api import to_graph
+from tensorflow.contrib.py2tf.pyct.transformer import PyFlowParseError
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    'to_graph', 'to_code', 'convert', 'graph_ready', 'utils', 'PyFlowParseError'
+]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/py2tf/converters/BUILD b/tensorflow/contrib/py2tf/converters/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..93c751b28dae3aa480aed839029bd37a2f47056b
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/BUILD
@@ -0,0 +1,173 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "converters",
+    srcs = [
+        "asserts.py",
+        "break_statements.py",
+        "builtin_functions.py",
+        "call_trees.py",
+        "continue_statements.py",
+        "control_flow.py",
+        "decorators.py",
+        "for_loops.py",
+        "list_comprehension.py",
+        "logical_expressions.py",
+        "side_effect_guards.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [
+        "@gast_archive//:gast",
+    ],
+)
+
+py_library(
+    name = "test_lib",
+    srcs = [
+        "converter_test_base.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [
+        ":converters",
+        "//tensorflow/contrib/py2tf/pyct/static_analysis",
+        "//tensorflow/contrib/py2tf/utils",
+        "@gast_archive//:gast",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "asserts_test",
+    srcs = ["asserts_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":test_lib",
+        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "break_statements_test",
+    srcs = ["break_statements_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":test_lib",
+        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "builtin_functions_test",
+    srcs = ["builtin_functions_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":test_lib",
+        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "call_trees_test",
+    srcs = ["call_trees_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":test_lib",
+        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "continue_statements_test",
+    srcs = ["continue_statements_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":test_lib",
+        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "control_flow_test",
+    srcs = ["control_flow_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":test_lib",
+        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "decorators_test",
+    srcs = ["decorators_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":test_lib",
+        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "for_loops_test",
+    srcs = ["for_loops_test.py"],
+    deps = [
+        ":test_lib",
+        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "list_comprehension_test",
+    srcs = ["list_comprehension_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":test_lib",
+        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "logical_expressions_test",
+    srcs = ["logical_expressions_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":test_lib",
+        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "side_effect_guards_test",
+    srcs = ["side_effect_guards_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":test_lib",
+        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/contrib/py2tf/converters/__init__.py b/tensorflow/contrib/py2tf/converters/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca10896ee5c6c23d9b20ff23add9945de68e5bf9
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/__init__.py
@@ -0,0 +1,22 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Code converters used by Py2TF."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# TODO(mdan): Define a base transformer class that can recognize skip_processing
+# TODO(mdan): All converters are incomplete, especially those that change blocks
diff --git a/tensorflow/contrib/py2tf/converters/asserts.py b/tensorflow/contrib/py2tf/converters/asserts.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b9b8e772bed82df2429fd6cb94dbf7b565e22b3
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/asserts.py
@@ -0,0 +1,53 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Converts Assert statements to their corresponding TF calls."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import templates
+from tensorflow.contrib.py2tf.pyct import transformer
+
+
+class AssertsTransformer(transformer.Base):
+  """Transforms Print nodes to Call so they can be handled as functions."""
+
+  # pylint:disable=invalid-name
+
+  def visit_Assert(self, node):
+    self.generic_visit(node)
+
+    # Note: The lone tf.Assert call will be wrapped with control_dependencies
+    # by side_effect_guards.
+    template = """
+      tf.Assert(test, [msg])
+    """
+
+    if node.msg is None:
+      return templates.replace(
+          template, test=node.test, msg=gast.Str('Assertion error'))
+    elif isinstance(node.msg, gast.Str):
+      return templates.replace(template, test=node.test, msg=node.msg)
+    else:
+      raise NotImplementedError('Can only convert string messages for now.')
+
+  # pylint:enable=invalid-name
+
+
+def transform(node, context):
+  return AssertsTransformer(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/converters/asserts_test.py b/tensorflow/contrib/py2tf/converters/asserts_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6611f2777a93a7e819c8becfa06a09b27f4e6aaf
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/asserts_test.py
@@ -0,0 +1,42 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for asserts module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.py2tf.converters import asserts
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.python.platform import test
+
+
+class AssertsTest(converter_test_base.TestCase):
+
+  def test_transform(self):
+
+    def test_fn(a):
+      assert a > 0
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = asserts.transform(node, self.ctx)
+
+    self.assertTrue(isinstance(node.body[0].body[0].value, gast.Call))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/converters/break_statements.py b/tensorflow/contrib/py2tf/converters/break_statements.py
new file mode 100644
index 0000000000000000000000000000000000000000..bfb709c5e32c6f19dc0fd109df61ece925d701a3
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/break_statements.py
@@ -0,0 +1,120 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Canonicalizes break statements by de-sugaring into a control boolean."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import templates
+from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+
+
+class BreakCanonicalizationTransformer(transformer.Base):
+  """Canonicalizes continue statements into additional conditionals."""
+
+  def __init__(self, context):
+    super(BreakCanonicalizationTransformer, self).__init__(context)
+    # This is a stack structure, to correctly process nested loops.
+    self.break_uses = []
+
+  def _create_break_check(self):
+    template = """
+      (not var_name)
+    """
+    expr, = templates.replace(template, var_name=self.break_uses[-1][1])
+    return expr.value
+
+  def _create_break_trigger(self):
+    template = """
+      var_name = True
+    """
+    block = templates.replace(template, var_name=self.break_uses[-1][1])
+    block.append(gast.Continue())
+    return block
+
+  def _create_break_init(self):
+    template = """
+      var_name = False
+    """
+    assign, = templates.replace(template, var_name=self.break_uses[-1][1])
+    return assign
+
+  # TODO(mdan): Surely the transformer supports this better?
+  def _manual_visit_list(self, block):
+    new_block = []
+    for n in block:
+      new_n = self.visit(n)
+      if isinstance(new_n, list):
+        new_block.extend(new_n)
+      else:
+        new_block.append(new_n)
+    return new_block
+
+  def visit_While(self, node):
+    self.generic_visit(node.test)
+    scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
+
+    break_var = self.context.namer.new_symbol('break_requested',
+                                              scope.referenced)
+    self.break_uses.append([False, break_var])
+    node.body = self._manual_visit_list(node.body)
+    if self.break_uses[-1][0]:
+      node.test = gast.BoolOp(gast.And(), [
+          node.test,
+          gast.UnaryOp(gast.Not(), gast.Name(break_var, gast.Load(), None))
+      ])
+      final_nodes = [self._create_break_init(), node]
+    else:
+      final_nodes = node
+    self.break_uses.pop()
+
+    for n in node.orelse:
+      self.generic_visit(n)
+    return final_nodes
+
+  def visit_For(self, node):
+    self.generic_visit(node.target)
+    self.generic_visit(node.iter)
+    scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
+
+    break_var = self.context.namer.new_symbol('break_requested',
+                                              scope.referenced)
+    self.break_uses.append([False, break_var])
+    node.body = self._manual_visit_list(node.body)
+    if self.break_uses[-1][0]:
+      anno.setanno(node, 'extra_cond',
+                   gast.UnaryOp(gast.Not(),
+                                gast.Name(break_var, gast.Load(), None)))
+      final_nodes = [self._create_break_init(), node]
+    else:
+      final_nodes = node
+    self.break_uses.pop()
+
+    for n in node.orelse:
+      self.generic_visit(n)
+    return final_nodes
+
+  def visit_Break(self, node):
+    self.break_uses[-1][0] = True
+    return self._create_break_trigger()
+
+
+def transform(node, context):
+  return BreakCanonicalizationTransformer(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/converters/break_statements_test.py b/tensorflow/contrib/py2tf/converters/break_statements_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..095fcdff07d44ecc6b9bb7f8d3e2c7c43df72a02
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/break_statements_test.py
@@ -0,0 +1,112 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for break_statements module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.converters import break_statements
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.python.platform import test
+
+
+class BreakCanonicalizationTest(converter_test_base.TestCase):
+
+  def test_basic_break(self):
+
+    def test_fn(x):
+      v = []
+      while x > 0:
+        x -= 1
+        if x % 2 == 0:
+          break
+        v.append(x)
+      return v
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = break_statements.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      self.assertEqual(test_fn(0), result.test_fn(0))
+      self.assertEqual(test_fn(1), result.test_fn(1))
+      self.assertEqual(test_fn(2), result.test_fn(2))
+      self.assertEqual(test_fn(3), result.test_fn(3))
+      self.assertEqual(test_fn(4), result.test_fn(4))
+
+  def test_basic_break_for_loop(self):
+
+    def test_fn(a):
+      v = []
+      for x in a:
+        x -= 1
+        if x % 2 == 0:
+          break
+        v.append(x)
+      return v
+
+    # The break is incompletely canonicalized for for loops. Everything is
+    # in place except for the condition verification.
+    def test_equiv_fn(a):
+      v = []
+      for x in a:
+        x -= 1
+        if x % 2 == 0:
+          continue
+        v.append(x)
+      return v
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = break_statements.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      # The break is incompletely canonicalized. Everything is in place, but
+      # the loop does not break.
+      self.assertEqual(test_equiv_fn([]), result.test_fn([]))
+      self.assertEqual(test_equiv_fn([1]), result.test_fn([1]))
+      self.assertEqual(test_equiv_fn([2]), result.test_fn([2]))
+      self.assertEqual(
+          test_equiv_fn([1, 2, 3, 4]), result.test_fn([1, 2, 3, 4]))
+
+  def test_continue_deeply_nested(self):
+
+    def test_fn(x):
+      v = []
+      u = []
+      w = []
+      while x > 0:
+        x -= 1
+        if x % 2 == 0:
+          if x % 3 != 0:
+            u.append(x)
+          else:
+            w.append(x)
+            continue
+        v.append(x)
+      return v, u, w
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = break_statements.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      self.assertEqual(test_fn(0), result.test_fn(0))
+      self.assertEqual(test_fn(1), result.test_fn(1))
+      self.assertEqual(test_fn(2), result.test_fn(2))
+      self.assertEqual(test_fn(3), result.test_fn(3))
+      self.assertEqual(test_fn(4), result.test_fn(4))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/converters/builtin_functions.py b/tensorflow/contrib/py2tf/converters/builtin_functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..310681dd016ca94bf2b28d27a4968cc0c10a5842
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/builtin_functions.py
@@ -0,0 +1,75 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Handles builtins and other special functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import templates
+from tensorflow.contrib.py2tf.pyct import transformer
+
+
+class BuiltinFunctionTransformer(transformer.Base):
+  """Handles builtin functions and canonicalizes old-style print statement.
+
+  This transformer only covers functions that are translated into a
+  TF equivalent, like `len`.
+  Note that the `print` statement is converted to a function call here, but
+  wrapping the print function to a `py_func` is done by `call_trees` as a
+  generic uncompilable function wrap.
+  """
+
+  # TODO(mdan): Handle print entirely in here.
+  # Fully handling print here makes sense especially since we're considering
+  # using tf.Print instead.
+
+  def __init__(self, context):
+    super(BuiltinFunctionTransformer, self).__init__(context)
+
+  def _convert_len(self, node):
+    template = """
+      tf.shape(args)[0]
+    """
+    new_call = templates.replace(template, args=node.args)[0].value
+    return new_call
+
+  # pylint:disable=invalid-name
+
+  def visit_Call(self, node):
+    self.generic_visit(node)
+    # TODO(mdan): This won't work if the function was hidden.
+    if isinstance(node.func, gast.Name) and node.func.id == 'len':
+      return self._convert_len(node)
+    return node
+
+  def visit_Print(self, node):
+    self.generic_visit(node)
+    args = node.values
+    # Following is the case when calling print(a, b)
+    if len(args) == 1 and isinstance(args[0], gast.Tuple):
+      args = args[0].elts
+    template = """
+      fname(args)
+    """
+    return templates.replace(template, fname='print', args=args)
+
+  # pylint:enable=invalid-name
+
+
+def transform(node, context):
+  return BuiltinFunctionTransformer(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/converters/builtin_functions_test.py b/tensorflow/contrib/py2tf/converters/builtin_functions_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..983d1ffc03466ab3e2148e8cdf6e54050b9d3947
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/builtin_functions_test.py
@@ -0,0 +1,88 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for builtin_functions module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+
+import six
+
+from tensorflow.contrib.py2tf.converters import builtin_functions
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class BuiltinFunctionsTest(converter_test_base.TestCase):
+
+  def test_len(self):
+
+    def test_fn(a):
+      return len(a)
+
+    node = self.parse_and_analyze(test_fn, {'len': len})
+    node = builtin_functions.transform(node, self.ctx)
+
+    with self.compiled(node, array_ops.shape) as result:
+      with self.test_session() as sess:
+        self.assertEqual(3,
+                         sess.run(
+                             result.test_fn(constant_op.constant([0, 0, 0]))))
+
+  def test_print(self):
+
+    def test_fn(a):
+      print(a)
+
+    node = self.parse_and_analyze(test_fn, {'print': print})
+    node = builtin_functions.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      try:
+        out_capturer = six.StringIO()
+        sys.stdout = out_capturer
+        result.test_fn('a')
+        self.assertEqual(out_capturer.getvalue(), 'a\n')
+      finally:
+        sys.stdout = sys.__stdout__
+
+  def test_print_tuple(self):
+
+    def test_fn(a, b, c):
+      print(a, b, c)
+
+    node = self.parse_and_analyze(test_fn, {'print': print})
+    node = builtin_functions.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      try:
+        out_capturer = six.StringIO()
+        sys.stdout = out_capturer
+        result.test_fn('a', 1, [2, 3])
+        # It appears that the print output looks odd only under Python 2.
+        if six.PY2:
+          self.assertEqual(out_capturer.getvalue(), "('a', 1, [2, 3])\n")
+        else:
+          self.assertEqual(out_capturer.getvalue(), 'a 1 [2, 3]\n')
+      finally:
+        sys.stdout = sys.__stdout__
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/converters/call_trees.py b/tensorflow/contrib/py2tf/converters/call_trees.py
new file mode 100644
index 0000000000000000000000000000000000000000..1050ba654c63bb52c1c5e71c981a6a0baa3fc987
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/call_trees.py
@@ -0,0 +1,275 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Handles function calls, by generating compiled function names and calls.
+
+Note: this transformer does not rename the top level object being converted;
+that is the caller's responsibility.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import types
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.contrib.py2tf.pyct import templates
+from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.python.util import tf_inspect
+
+
+class FunctionNamer(object):
+  """Describes the interface for CallTreeTransformer's namer."""
+
+  def compiled_function_name(self,
+                             original_fqn,
+                             live_entity=None,
+                             owner_type=None):
+    """Generate the name corresponding to the compiled version of a function.
+
+    Args:
+      original_fqn: string or tuple(string)
+      live_entity: Callable, the actual target function, if known.
+      owner_type: Optional object. If present, it indicates that the function is
+          a member of the given type.
+    Returns:
+      string, bool
+    """
+    raise NotImplementedError()
+
+  def compiled_class_name(self, original_fqn, live_entity=None):
+    """Generate the name corresponding to the compiled version of a class.
+
+    Args:
+      original_fqn: string or tuple(string)
+      live_entity: The actual target class, if known.
+    Returns:
+      string
+    """
+    raise NotImplementedError()
+
+
+class CallTreeTransformer(transformer.Base):
+  """Transforms the call tree by renaming transformed symbols."""
+
+  def __init__(self, context, uncompiled_modules, nocompile_decorators):
+    super(CallTreeTransformer, self).__init__(context)
+    self.uncompiled_modules = uncompiled_modules
+    self.nocompile_decorators = nocompile_decorators
+
+  # pylint:disable=invalid-name
+
+  def _resolve_name(self, node):
+    if isinstance(node, gast.Call):
+      return self._resolve_name(node.func)
+    if isinstance(node, gast.Name):
+      return self.context.namespace.get(node.id)
+    if isinstance(node, gast.Attribute):
+      parent = self._resolve_name(node.value)
+      if parent is not None:
+        return getattr(parent, node.attr)
+      return None
+    raise ValueError(node)
+
+  def _try_resolve_target(self, node):
+    """Works for methods of objects of known type."""
+    if anno.hasanno(node, 'live_val'):
+      return anno.getanno(node, 'live_val')
+    if isinstance(node, gast.Attribute) and anno.hasanno(node, 'type'):
+      owner_type = anno.getanno(node, 'type')
+      if hasattr(owner_type, node.attr):
+        return getattr(owner_type, node.attr)
+      else:
+        raise ValueError('Type "%s" has not attribute "%s". Is it dynamic?' %
+                         (owner_type, node.attr))
+    return None
+
+  def _should_compile(self, node, fqn):
+    for i in range(1, len(fqn)):
+      if fqn[:i] in self.uncompiled_modules:
+        return False
+
+    # Check for local decorations
+    if anno.hasanno(node, 'graph_ready'):
+      return False
+
+    # The decorators themselves are not to be converted.
+    # If present, the decorators should appear as static functions.
+    target_entity = self._try_resolve_target(node.func)
+    if target_entity is not None:
+      # This attribute is set by the decorator itself.
+      # TODO(mdan): This may not play nicely with other wrapping decorators.
+      if hasattr(target_entity, '__pyct_is_compile_decorator'):
+        return False
+
+      if target_entity in self.nocompile_decorators:
+        return False
+
+      # Inspect the target function decorators. If any include a @convert
+      # or @graph_ready annotation, then they must be called as they are.
+      # TODO(mdan): This may be quite heavy.
+      # To parse and re-analize each function for every call site could be quite
+      # wasteful. Maybe we could cache the parsed AST?
+      try:
+        target_node, _ = parser.parse_entity(target_entity)
+        target_node = target_node.body[0]
+      except TypeError:
+        # Functions whose source we cannot access are compilable (e.g. wrapped
+        # to py_func).
+        return True
+
+      for dec in target_node.decorator_list:
+        decorator_fn = self._resolve_name(dec)
+        if (decorator_fn is not None and
+            decorator_fn in self.nocompile_decorators):
+          return False
+
+    return True
+
+  def _determine_function_owner(self, m):
+    # TODO(mdan): The parent type should be known at analysis. Use that instead.
+    if hasattr(m, 'im_class'):  # Python 2
+      return m.im_class
+    if hasattr(m, '__qualname__'):  # Python 3
+      # Object attributes: should be bound to "self".
+      if hasattr(m, '__self__'):
+        return type(m.__self__)
+
+      # Class attributes: should have the owner name in their namespace.
+      qn = m.__qualname__.split('.')
+      if len(qn) < 2:
+        return None
+      owner_name, func_name = qn[-2:]
+      if func_name != m.__name__:
+        raise ValueError('Inconsistent names detected '
+                         '(__qualname__[1] = "%s", __name__ = "%s") for %s.' %
+                         (func_name, m.__name__, m))
+      if owner_name == '<locals>':
+        return None
+      if owner_name not in self.context.namespace:
+        raise ValueError(
+            'Could not resolve name "%s" while analyzing %s. Namespace:\n%s' %
+            (owner_name, m, self.context.namespace))
+      return self.context.namespace[owner_name]
+    return None
+
+  def _rename_compilable_function(self, node):
+    assert anno.hasanno(node.func, 'live_val')
+    assert anno.hasanno(node.func, 'fqn')
+    target_entity = anno.getanno(node.func, 'live_val')
+    target_fqn = anno.getanno(node.func, 'fqn')
+
+    if not self._should_compile(node, target_fqn):
+      return node
+
+    if anno.hasanno(node, 'is_constructor'):
+      new_name = self.context.namer.compiled_class_name(
+          target_fqn, live_entity=target_entity)
+      do_rename = True
+    else:
+      owner_type = self._determine_function_owner(target_entity)
+      new_name, do_rename = self.context.namer.compiled_function_name(
+          target_fqn, live_entity=target_entity, owner_type=owner_type)
+
+    if do_rename:
+      if target_entity is not None:
+        if tf_inspect.ismethod(target_entity):
+          # The renaming process will transform it into a regular function.
+          # TODO(mdan): Is this complete? How does it work with nested members?
+          node.args = [node.func.value] + node.args
+      node.func = templates.replace('func_name', func_name=new_name)[0]
+    return node
+
+  def _wrap_to_py_func_no_return(self, node):
+    # TODO(mdan): Properly handle varargs, kwargs, etc.
+    template = """
+      py2tf_utils.wrap_py_func(func, None, (original_args,), True)
+    """
+    return templates.replace(template, func=node.func, original_args=node.args)
+
+  def _function_is_compilable(self, target_entity):
+    # TODO(mdan): This is just a placeholder. Implement.
+    return not isinstance(target_entity, types.BuiltinFunctionType)
+
+  def visit_Expr(self, node):
+    if isinstance(node.value, gast.Call):
+      if anno.hasanno(node.value.func, 'live_val'):
+        target_entity = anno.getanno(node.value.func, 'live_val')
+        if not self._function_is_compilable(target_entity):
+          if anno.hasanno(node.value.func, 'fqn'):
+            target_fqn = anno.getanno(node.value.func, 'fqn')
+            if not self._should_compile(node.value, target_fqn):
+              return node
+            node = self._wrap_to_py_func_no_return(node.value)
+            return node
+      # Only the case of py_func with no return value is special.
+      # Everything else is processed by visit_Call.
+      self.visit(node.value)
+    else:
+      self.generic_visit(node)
+    return node
+
+  def visit_Call(self, node):
+    # If the function is wrapped by one of the marker decorators,
+    # consider it graph ready.
+    if anno.hasanno(node.func, 'live_val'):
+      target_entity = anno.getanno(node.func, 'live_val')
+      if target_entity in self.nocompile_decorators:
+        if len(node.args) < 1:
+          raise ValueError(
+              'Found call to decorator function "%s", but it had no arguments. '
+              'A decorator needs at least an argument.')
+        anno.setanno(node.args[0], 'graph_ready', True)
+
+    self.generic_visit(node)
+    if anno.hasanno(node.func, 'live_val'):
+      target_entity = anno.getanno(node.func, 'live_val')
+      if self._function_is_compilable(target_entity):
+        node = self._rename_compilable_function(node)
+      else:
+        raise NotImplementedError('py_func with return values')
+    else:
+      if self.context.recursive:
+        raise NotImplementedError('Could not resolve target function.')
+      else:
+        # TODO(mdan): Double check. Is this reachable code?
+        pass
+    return node
+
+  # pylint:enable=invalid-name
+
+
+def transform(node, context, uncompiled_modules, nocompile_decorators):
+  """Transform function call to the compiled counterparts.
+
+  Args:
+    node: AST to transform.
+    context: An EntityContext object.
+    uncompiled_modules: set of string tuples, each tuple represents the fully
+        qualified name of a package containing functions that will not be
+        compiled.
+    nocompile_decorators: A tuple containing decorators to be stripped from
+        functions during conversion.
+  Returns:
+    A tuple (node, new_names):
+        node: The transformed AST
+        new_names: set(string), containing any newly-generated names
+  """
+  t = CallTreeTransformer(context, uncompiled_modules, nocompile_decorators)
+  node = t.visit(node)
+  return node
diff --git a/tensorflow/contrib/py2tf/converters/call_trees_test.py b/tensorflow/contrib/py2tf/converters/call_trees_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..777648dc0b31863227262fbf931aba680bb4ed98
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/call_trees_test.py
@@ -0,0 +1,118 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for call_trees module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.converters import call_trees
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class CallTreesTest(converter_test_base.TestCase):
+
+  def test_basic(self):
+
+    def test_fn_1(_):
+      raise ValueError('This should not be called in the compiled verison.')
+
+    def renamed_test_fn_1(a):
+      return a + 1
+
+    def test_fn_2(a):
+      return test_fn_1(a) + 1
+
+    node = self.parse_and_analyze(test_fn_2, {'test_fn_1': test_fn_1})
+    node = call_trees.transform(node, self.ctx, (), ())
+
+    with self.compiled(node) as result:
+      # Only test_fn_2 is transformed, so we'll insert renamed_test_fn_1
+      # manually.
+      result.renamed_test_fn_1 = renamed_test_fn_1
+      self.assertEquals(3, result.test_fn_2(1))
+
+  def test_simple_methods(self):
+
+    class TestClass(object):
+
+      def test_fn_1(self, a):
+        return a + 1
+
+      def test_fn_2(self, a):
+        return self.test_fn_1(a) + 1
+
+    node = self.parse_and_analyze(
+        TestClass.test_fn_2, {'TestClass': TestClass},
+        arg_types={'self': (TestClass.__name__, TestClass)})
+    node = call_trees.transform(node, self.ctx, (), ())
+
+    with self.compiled(node) as result:
+      tc = TestClass()
+      self.assertEquals(3, result.test_fn_2(tc, 1))
+
+  def test_py_func_wrap_no_retval(self):
+
+    def test_fn(a):
+      setattr(a, 'foo', 'bar')
+
+    node = self.parse_and_analyze(test_fn, {'setattr': setattr})
+    node = call_trees.transform(node, self.ctx, (), ())
+
+    with self.compiled(node) as result:
+      with self.test_session() as sess:
+        # The function has no return value, so we do some tricks to grab the
+        # generated py_func node and ensure its effect only happens at graph
+        # execution.
+
+        class Dummy(object):
+          pass
+
+        a = Dummy()
+        result.test_fn(a)
+        self.assertFalse(hasattr(a, 'foo'))
+        sess.run(sess.graph.get_operations()[0])
+        self.assertEquals('bar', a.foo)
+
+  def test_uncompiled_modules(self):
+
+    def test_fn(a):
+      a = math_ops.multiply(a, constant_op.constant(2))
+      a = math_ops.add(a, constant_op.constant(1))
+      return a
+
+    node = self.parse_and_analyze(test_fn, {
+        'math_ops': math_ops,
+        'constant_op': constant_op
+    })
+    node = call_trees.transform(node, self.ctx,
+                                set(((math_ops.__name__,),
+                                     (constant_op.__name__,))), ())
+
+    with self.compiled(node) as result:
+      result.math_ops = math_ops
+      result.constant_op = constant_op
+      with self.test_session() as sess:
+        # Not renamed, because the converter doesn't rename the definition
+        # itself (the caller is responsible for that).
+        result_tensor = result.test_fn(constant_op.constant(1))
+        self.assertEquals(3, sess.run(result_tensor))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/converters/continue_statements.py b/tensorflow/contrib/py2tf/converters/continue_statements.py
new file mode 100644
index 0000000000000000000000000000000000000000..4069a678b118b56b59d2e5491bb80cf52efd8143
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/continue_statements.py
@@ -0,0 +1,127 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Canonicalizes continue statements by de-sugaring into a control boolean."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import templates
+from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+
+
+class ContinueCanonicalizationTransformer(transformer.Base):
+  """Canonicalizes continue statements into additional conditionals."""
+
+  def __init__(self, context):
+    super(ContinueCanonicalizationTransformer, self).__init__(context)
+    # This is a stack structure, to correctly process nested loops.
+    self.continuation_uses = []
+
+  def _create_continuation_check(self):
+    template = """
+      if not var_name:
+        pass
+    """
+    cond, = templates.replace(template, var_name=self.continuation_uses[-1][1])
+    cond.body = []
+    return cond
+
+  def _create_continuation_trigger(self):
+    template = """
+      var_name = True
+    """
+    assign, = templates.replace(
+        template, var_name=self.continuation_uses[-1][1])
+    return assign
+
+  def _create_continuation_init(self):
+    template = """
+      var_name = False
+    """
+    assign, = templates.replace(
+        template, var_name=self.continuation_uses[-1][1])
+    return assign
+
+  def _visit_and_reindent_if_necessary(self, nodes):
+    reorganized_nodes = []
+    current_dest = reorganized_nodes
+    continue_used_in_block = False
+    for i, n in enumerate(nodes):
+      # TODO(mdan): This could be optimized if control structures are simple.
+      self.continuation_uses[-1][0] = False
+      n = self.visit(n)
+      current_dest.append(n)
+      if self.continuation_uses[-1][0]:
+        continue_used_in_block = True
+        if i < len(nodes) - 1:  # Last statement in block needs no protection.
+          cond = self._create_continuation_check()
+          current_dest.append(cond)
+          current_dest = cond.body
+    self.continuation_uses[-1][0] = continue_used_in_block
+    return reorganized_nodes
+
+  def _process_loop_block(self, block, scope):
+    cont_var = self.context.namer.new_symbol('cont_requested', scope.referenced)
+    self.continuation_uses.append([False, cont_var])
+    block = self._visit_and_reindent_if_necessary(block)
+    if self.continuation_uses[-1][0]:
+      block.insert(0, self._create_continuation_init())
+    self.continuation_uses.pop()
+    return block
+
+  def visit_While(self, node):
+    self.generic_visit(node.test)
+    node.body = self._process_loop_block(node.body,
+                                         anno.getanno(node,
+                                                      NodeAnno.BODY_SCOPE))
+    for n in node.orelse:
+      self.generic_visit(n)
+    return node
+
+  def visit_For(self, node):
+    self.generic_visit(node.target)
+    self.generic_visit(node.iter)
+    node.body = self._process_loop_block(node.body,
+                                         anno.getanno(node,
+                                                      NodeAnno.BODY_SCOPE))
+    for n in node.orelse:
+      self.generic_visit(n)
+    return node
+
+  def visit_If(self, node):
+    if self.continuation_uses:
+      self.generic_visit(node.test)
+      node.body = self._visit_and_reindent_if_necessary(node.body)
+      continue_used_in_body = self.continuation_uses[-1][0]
+      node.orelse = self._visit_and_reindent_if_necessary(node.orelse)
+      self.continuation_uses[-1][0] = (
+          continue_used_in_body or self.continuation_uses[-1][0])
+    else:
+      node = self.generic_visit(node)
+    return node
+
+  def visit_Continue(self, node):
+    self.continuation_uses[-1][0] = True
+    return self._create_continuation_trigger()
+
+  def visit_Break(self, node):
+    assert False, 'break statement should be desugared at this point'
+
+
+def transform(node, namer):
+  return ContinueCanonicalizationTransformer(namer).visit(node)
diff --git a/tensorflow/contrib/py2tf/converters/continue_statements_test.py b/tensorflow/contrib/py2tf/converters/continue_statements_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a598dcd1aed29478b7e3fe27e3c1b20010247dd9
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/continue_statements_test.py
@@ -0,0 +1,98 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for continue_statements module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.converters import continue_statements
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.python.platform import test
+
+
+class ContinueCanonicalizationTest(converter_test_base.TestCase):
+
+  def test_basic_continue(self):
+
+    def test_fn(x):
+      v = []
+      while x > 0:
+        x -= 1
+        if x % 2 == 0:
+          continue
+        v.append(x)
+      return v
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = continue_statements.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      self.assertEqual(test_fn(0), result.test_fn(0))
+      self.assertEqual(test_fn(1), result.test_fn(1))
+      self.assertEqual(test_fn(2), result.test_fn(2))
+      self.assertEqual(test_fn(3), result.test_fn(3))
+      self.assertEqual(test_fn(4), result.test_fn(4))
+
+  def test_basic_continue_for_loop(self):
+
+    def test_fn(a):
+      v = []
+      for x in a:
+        x -= 1
+        if x % 2 == 0:
+          continue
+        v.append(x)
+      return v
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = continue_statements.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      self.assertEqual(test_fn([]), result.test_fn([]))
+      self.assertEqual(test_fn([1]), result.test_fn([1]))
+      self.assertEqual(test_fn([2]), result.test_fn([2]))
+      self.assertEqual(test_fn([1, 2, 3]), result.test_fn([1, 2, 3]))
+
+  def test_continue_deeply_nested(self):
+
+    def test_fn(x):
+      v = []
+      u = []
+      w = []
+      while x > 0:
+        x -= 1
+        if x % 2 == 0:
+          if x % 3 != 0:
+            u.append(x)
+          else:
+            w.append(x)
+            continue
+        v.append(x)
+      return v, u, w
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = continue_statements.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      self.assertEqual(test_fn(0), result.test_fn(0))
+      self.assertEqual(test_fn(1), result.test_fn(1))
+      self.assertEqual(test_fn(2), result.test_fn(2))
+      self.assertEqual(test_fn(3), result.test_fn(3))
+      self.assertEqual(test_fn(4), result.test_fn(4))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/converters/control_flow.py b/tensorflow/contrib/py2tf/converters/control_flow.py
new file mode 100644
index 0000000000000000000000000000000000000000..d53e3e4fd6d87004cbe55bd430346ad263e898ea
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/control_flow.py
@@ -0,0 +1,221 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Handles control flow statements: while, if."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import ast_util
+from tensorflow.contrib.py2tf.pyct import templates
+from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+
+
+class SymbolNamer(object):
+  """Describes the interface for ControlFlowTransformer's namer."""
+
+  def new_symbol(self, name_root, reserved_locals):
+    """Generate a new unique symbol.
+
+    Args:
+      name_root: String, used as stem in the new name.
+      reserved_locals: Set(string), additional local symbols that are reserved
+          and which should not be used.
+    Returns:
+      String.
+    """
+    raise NotImplementedError()
+
+
+class ControlFlowTransformer(transformer.Base):
+  """Transforms control flow structures like loops an conditionals."""
+
+  def __init__(self, context):
+    super(ControlFlowTransformer, self).__init__(context)
+
+  # pylint:disable=invalid-name
+
+  def visit_For(self, node):
+    assert False, 'for statement should have been canonicalized at this point'
+
+  def _create_cond_branch(self, body_name, aliased_orig_names,
+                          aliased_new_names, body, returns):
+    if aliased_orig_names:
+      template = """
+        def body_name():
+          aliased_new_names, = aliased_orig_names,
+          body
+          return (returns,)
+      """
+      return templates.replace(
+          template,
+          body_name=body_name,
+          body=body,
+          aliased_orig_names=aliased_orig_names,
+          aliased_new_names=aliased_new_names,
+          returns=returns)
+    else:
+      template = """
+        def body_name():
+          body
+          return (returns,)
+      """
+      return templates.replace(
+          template, body_name=body_name, body=body, returns=returns)
+
+  def _create_cond_expr(self, results, test, body_name, orelse_name):
+    if results is not None:
+      template = """
+        results = py2tf_utils.run_cond(test, body_name, orelse_name)
+      """
+      return templates.replace(
+          template,
+          test=test,
+          results=results,
+          body_name=body_name,
+          orelse_name=orelse_name)
+    else:
+      template = """
+        py2tf_utils.run_cond(test, body_name, orelse_name)
+      """
+      return templates.replace(
+          template, test=test, body_name=body_name, orelse_name=orelse_name)
+
+  def visit_If(self, node):
+    self.generic_visit(node)
+
+    body_scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
+    orelse_scope = anno.getanno(node, NodeAnno.ORELSE_SCOPE)
+
+    if body_scope.created - orelse_scope.created:
+      raise ValueError(
+          'The if branch creates new symbols that the else branch does not.')
+    if orelse_scope.created - body_scope.created:
+      raise ValueError(
+          'The else branch creates new symbols that the if branch does not.')
+
+    modified = tuple(body_scope.modified | orelse_scope.modified)
+    all_referenced = body_scope.referenced | orelse_scope.referenced
+
+    # Alias the closure variables inside the conditional functions
+    # to avoid errors caused by the local variables created in the branch
+    # functions.
+    need_alias = (
+        (body_scope.modified | orelse_scope.modified) -
+        (body_scope.created | orelse_scope.created))
+    aliased_orig_names = tuple(need_alias)
+    aliased_new_names = tuple(
+        self.context.namer.new_symbol(s.ssf(), all_referenced)
+        for s in aliased_orig_names)
+    alias_map = dict(zip(aliased_orig_names, aliased_new_names))
+    node_body = ast_util.rename_symbols(node.body, alias_map)
+    node_orelse = ast_util.rename_symbols(node.orelse, alias_map)
+
+    if not modified:
+      # When the cond would return no value, we leave the cond called without
+      # results. That in turn should trigger the side effect guards. The
+      # branch functions will return a dummy value that ensures cond
+      # actually has some return value as well.
+      results = None
+    elif len(modified) == 1:
+      results = modified[0]
+    else:
+      results = gast.Tuple([s.ast() for s in modified], None)
+
+    body_name = self.context.namer.new_symbol('if_true', all_referenced)
+    orelse_name = self.context.namer.new_symbol('if_false', all_referenced)
+    if modified:
+      body_returns = tuple(
+          alias_map[s] if s in aliased_orig_names else s for s in modified)
+    else:
+      body_returns = templates.replace('tf.ones(())')[0].value
+
+    body_def = self._create_cond_branch(
+        body_name,
+        aliased_orig_names=tuple(aliased_orig_names),
+        aliased_new_names=tuple(aliased_new_names),
+        body=node_body,
+        returns=body_returns)
+    orelse_def = self._create_cond_branch(
+        orelse_name,
+        aliased_orig_names=tuple(aliased_orig_names),
+        aliased_new_names=tuple(aliased_new_names),
+        body=node_orelse,
+        returns=body_returns)
+    cond_expr = self._create_cond_expr(results, node.test, body_name,
+                                       orelse_name)
+
+    return body_def + orelse_def + cond_expr
+
+  def visit_While(self, node):
+    self.generic_visit(node)
+
+    body_scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
+    body_closure = body_scope.modified - body_scope.created
+    all_referenced = body_scope.referenced
+
+    state = list(body_closure)
+    state_ssf = [
+        self.context.namer.new_symbol(s.ssf(), all_referenced) for s in state
+    ]
+    ssf_map = {
+        name: ssf
+        for name, ssf in zip(state, state_ssf)
+        if str(name) != ssf
+    }
+
+    if len(state) == 1:
+      state = state[0]
+      state_ssf = state_ssf[0]
+      state_ast_tuple = state
+    else:
+      state_ast_tuple = gast.Tuple([n.ast() for n in state], None)
+
+    node_body = ast_util.rename_symbols(node.body, ssf_map)
+    test = ast_util.rename_symbols(node.test, ssf_map)
+
+    template = """
+      def test_name(state_ssf):
+        return test
+      def body_name(state_ssf):
+        body
+        return state_ssf,
+      state_ast_tuple = py2tf_utils.run_while(test_name, body_name, [state])
+    """
+    node = templates.replace(
+        template,
+        state=state,
+        state_ssf=state_ssf,
+        state_ast_tuple=state_ast_tuple,
+        test_name=self.context.namer.new_symbol('loop_test',
+                                                body_scope.referenced),
+        test=test,
+        body_name=self.context.namer.new_symbol('loop_body',
+                                                body_scope.referenced),
+        body=node_body)
+
+    return node
+
+  # pylint:enable=invalid-name
+
+
+def transform(node, context):
+  t = ControlFlowTransformer(context)
+  node = t.visit(node)
+  return node
diff --git a/tensorflow/contrib/py2tf/converters/control_flow_test.py b/tensorflow/contrib/py2tf/converters/control_flow_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b785b284a7fb7a0257551326c88b44a341b295ba
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/control_flow_test.py
@@ -0,0 +1,99 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for control_flow module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.converters import control_flow
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.platform import test
+
+
+class ControlFlowTest(converter_test_base.TestCase):
+
+  def test_simple_while(self):
+
+    def test_fn(n):
+      i = 0
+      s = 0
+      while i < n:
+        s += i
+        i += 1
+      return s, i, n
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = control_flow.transform(node, self.ctx)
+
+    with self.compiled(node, control_flow_ops.while_loop) as result:
+      with self.test_session() as sess:
+        self.assertEqual((10, 5, 5),
+                         sess.run(result.test_fn(constant_op.constant(5))))
+
+  def test_while_single_var(self):
+
+    def test_fn(n):
+      while n > 0:
+        n -= 1
+      return n
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = control_flow.transform(node, self.ctx)
+
+    with self.compiled(node, control_flow_ops.while_loop) as result:
+      with self.test_session() as sess:
+        self.assertEqual(0, sess.run(result.test_fn(constant_op.constant(5))))
+
+  def test_simple_if(self):
+
+    def test_fn(n):
+      a = 0
+      b = 0
+      if n > 0:
+        a = -n
+      else:
+        b = 2 * n
+      return a, b
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = control_flow.transform(node, self.ctx)
+
+    with self.compiled(node, control_flow_ops.cond) as result:
+      with self.test_session() as sess:
+        self.assertEqual((-1, 0),
+                         sess.run(result.test_fn(constant_op.constant(1))))
+        self.assertEqual((0, -2),
+                         sess.run(result.test_fn(constant_op.constant(-1))))
+
+  def test_if_single_var(self):
+
+    def test_fn(n):
+      if n > 0:
+        n = -n
+      return n
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = control_flow.transform(node, self.ctx)
+
+    with self.compiled(node, control_flow_ops.cond) as result:
+      with self.test_session() as sess:
+        self.assertEqual(-1, sess.run(result.test_fn(constant_op.constant(1))))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/converters/converter_test_base.py b/tensorflow/contrib/py2tf/converters/converter_test_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..67747183dd323a799a04943ce4c7fe8c4093d002
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/converter_test_base.py
@@ -0,0 +1,103 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Base class for tests in this module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+import imp
+
+from tensorflow.contrib.py2tf import utils
+from tensorflow.contrib.py2tf.pyct import compiler
+from tensorflow.contrib.py2tf.pyct import context
+from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.contrib.py2tf.pyct import qual_names
+from tensorflow.contrib.py2tf.pyct.static_analysis import activity
+from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
+from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
+from tensorflow.python.platform import test
+
+
+class FakeNamer(object):
+
+  def new_symbol(self, name_root, used):
+    i = 0
+    while True:
+      name = '%s%d' % (name_root, i)
+      if name not in used:
+        return name
+      i += 1
+
+  def compiled_function_name(self,
+                             original_fqn,
+                             live_entity=None,
+                             owner_type=None):
+    del live_entity
+    if owner_type is not None:
+      return None, False
+    return ('renamed_%s' % '_'.join(original_fqn)), True
+
+
+class TestCase(test.TestCase):
+  """Base class for unit tests in this module. Contains relevant utilities."""
+
+  @contextlib.contextmanager
+  def compiled(self, node, *symbols):
+    source = '<compile failed>'
+    try:
+      result, source = compiler.ast_to_object(node)
+      result.tf = self.make_fake_tf(*symbols)
+      result.py2tf_utils = utils
+      yield result
+    except Exception:  # pylint:disable=broad-except
+      print('Offending compiled code:\n%s' % source)
+      raise
+
+  def make_fake_tf(self, *symbols):
+    fake_tf = imp.new_module('fake_tf')
+    for s in symbols:
+      setattr(fake_tf, s.__name__, s)
+    return fake_tf
+
+  def attach_namespace(self, module, **ns):
+    for k, v in ns.items():
+      setattr(module, k, v)
+
+  def parse_and_analyze(self,
+                        test_fn,
+                        namespace,
+                        namer=None,
+                        arg_types=None,
+                        include_type_analysis=True,
+                        recursive=True):
+    node, source = parser.parse_entity(test_fn)
+    ctx = context.EntityContext(
+        namer=namer or FakeNamer(),
+        source_code=source,
+        source_file=None,
+        namespace=namespace,
+        arg_values=None,
+        arg_types=arg_types,
+        recursive=recursive)
+    node = qual_names.resolve(node)
+    node = activity.resolve(node, ctx)
+    node = live_values.resolve(node, ctx, {})
+    if include_type_analysis:
+      node = type_info.resolve(node, ctx)
+      node = live_values.resolve(node, ctx, {})
+    self.ctx = ctx
+    return node
diff --git a/tensorflow/contrib/py2tf/converters/decorators.py b/tensorflow/contrib/py2tf/converters/decorators.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f620c1cd2d9b75f82410754a7e812e13eabe3ae
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/decorators.py
@@ -0,0 +1,62 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Handles decorators.
+
+Note: this module only deals with functions whose decorators are still recorded
+in the AST. This does not always happen. See the unit test for an example.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import pretty_printer
+
+
+class DecoratorsTransformer(gast.NodeTransformer):
+  """Converts or removes decorators."""
+
+  def __init__(self, remove_decorators):
+    self.remove_decorators = remove_decorators
+
+  # pylint:disable=invalid-name
+
+  def visit_FunctionDef(self, node):
+    self.generic_visit(node)
+    kept_decorators = []
+    for dec in node.decorator_list:
+      if isinstance(dec, gast.Call):
+        dec_func = dec.func
+      else:
+        dec_func = dec
+      if not anno.hasanno(dec_func, 'live_val'):
+        raise ValueError(
+            'Could not resolve decorator: %s' % pretty_printer.fmt(dec_func))
+      dec_value = anno.getanno(dec_func, 'live_val')
+      if dec_value not in self.remove_decorators:
+        kept_decorators.append(dec)
+    node.decorator_list = kept_decorators
+    return node
+
+  # pylint:enable=invalid-name
+
+
+def transform(node, remove_decorators):
+  transformer = DecoratorsTransformer(remove_decorators)
+  node = transformer.visit(node)
+  return node
diff --git a/tensorflow/contrib/py2tf/converters/decorators_test.py b/tensorflow/contrib/py2tf/converters/decorators_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..402fa0dda28e696f70d0354ca4abf3a6c83506d9
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/decorators_test.py
@@ -0,0 +1,102 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for decorators module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import textwrap
+
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.py2tf.converters import decorators
+from tensorflow.contrib.py2tf.pyct import compiler
+from tensorflow.python.platform import test
+from tensorflow.python.util import tf_inspect
+
+
+class DecoratorsTest(converter_test_base.TestCase):
+
+  def test_function_decorator(self):
+
+    def function_decorator():
+
+      def decorator(f):
+        return lambda a: f(a) + 1
+
+      return decorator
+
+    # The Python parser does capture decorators into the AST.
+    # However, the interpreter desugars them on load, and refering to the
+    # decorated function at runtime usually loses any trace of the decorator.
+    # Below is an example when that doesn't happen.
+    def static_wrapper():
+
+      @function_decorator()
+      def test_fn(a):  # pylint:disable=unused-variable
+        return a
+
+    node = self.parse_and_analyze(static_wrapper,
+                                  {'function_decorator': function_decorator})
+    node = node.body[0].body[0]
+
+    node = decorators.transform(node, remove_decorators=())
+    # Since the decorator is not removed, we need to include its source
+    # code. We cannot do it after the fact because decorators are executed
+    # on load.
+    result, _ = compiler.ast_to_object(
+        node,
+        source_prefix=textwrap.dedent(tf_inspect.getsource(function_decorator)))
+    self.assertEqual(2, result.test_fn(1))
+
+    node = decorators.transform(node, remove_decorators=(function_decorator,))
+    with self.compiled(node) as result:
+      self.assertEqual(1, result.test_fn(1))
+
+  def test_simple_decorator(self):
+
+    def simple_decorator(f):
+      return lambda a: f(a) + 1
+
+    # The Python parser does capture decorators into the AST.
+    # However, the interpreter desugars them upon load, and refering to the
+    # decorated function at runtime usually loses any trace of the decorator.
+    # Below is an example when that doesn't happen.
+    def static_wrapper():
+
+      @simple_decorator
+      def test_fn(a):  # pylint:disable=unused-variable
+        return a
+
+    node = self.parse_and_analyze(static_wrapper,
+                                  {'simple_decorator': simple_decorator})
+    node = node.body[0].body[0]
+
+    node = decorators.transform(node, remove_decorators=())
+    # Since the decorator is not removed, we need to include its source
+    # code. We cannot do it after the fact because decorators are executed
+    # on load.
+    result, _ = compiler.ast_to_object(
+        node,
+        source_prefix=textwrap.dedent(tf_inspect.getsource(simple_decorator)))
+    self.assertEqual(2, result.test_fn(1))
+
+    node = decorators.transform(node, remove_decorators=(simple_decorator,))
+    with self.compiled(node) as result:
+      self.assertEqual(1, result.test_fn(1))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/converters/for_loops.py b/tensorflow/contrib/py2tf/converters/for_loops.py
new file mode 100644
index 0000000000000000000000000000000000000000..935dade0ed30975dd29c8ffe5be875993936d241
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/for_loops.py
@@ -0,0 +1,86 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Canonicalizes for loops into while loops.
+
+This canonicalizer uses the len function on its argument. That should be
+converted to a tf.shape separately.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import templates
+from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+
+
+class ForLoopCanonicalizationTransformer(transformer.Base):
+  """Canonicalizes for loops (e.g. into while loops)."""
+
+  def __init__(self, context):
+    super(ForLoopCanonicalizationTransformer, self).__init__(context)
+
+  def visit_For(self, node):
+    self.generic_visit(node)
+    body_scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
+
+    if anno.hasanno(node, 'extra_cond'):
+      template = """
+        i = 0
+        n = len(loop_iter)
+        while i < n and extra_cond:
+          # TODO(mdan): Use TensorListFromTensor(loop_iter) here.
+          target = loop_iter[i]
+          body
+          i += 1
+      """
+      return templates.replace(
+          template,
+          loop_iter=node.iter,
+          target=node.target,
+          body=node.body,
+          i=self.context.namer.new_symbol('i', body_scope.referenced),
+          n=self.context.namer.new_symbol('n', body_scope.referenced),
+          extra_cond=anno.getanno(node, 'extra_cond'))
+    else:
+      template = """
+        i = 0
+        n = len(loop_iter)
+        while i < n:
+          # TODO(mdan): Use TensorListFromTensor(loop_iter) here.
+          target = loop_iter[i]
+          body  # pylint:disable=pointless-statement
+          i += 1
+      """
+      repl = templates.replace(
+          template,
+          loop_iter=node.iter,
+          target=node.target,
+          body=node.body,
+          i=self.context.namer.new_symbol('i', body_scope.referenced),
+          n=self.context.namer.new_symbol('n', body_scope.referenced))
+      return repl
+
+  def visit_Continue(self, node):
+    assert False, 'continue statement should be desugared at this point'
+
+  def visit_Break(self, node):
+    assert False, 'break statement should be desugared at this point'
+
+
+def transform(node, context):
+  return ForLoopCanonicalizationTransformer(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/converters/for_loops_test.py b/tensorflow/contrib/py2tf/converters/for_loops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..70a367d3b517e528b67f260d607431d324d2ab7d
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/for_loops_test.py
@@ -0,0 +1,47 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for for_loops module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.py2tf.converters import for_loops
+from tensorflow.python.platform import test
+
+
+class ControlFlowTest(converter_test_base.TestCase):
+
+  def test_basic_for(self):
+
+    def test_fn(l):
+      s = 0
+      for e in l:
+        s += e
+      return s
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = for_loops.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      l = [1, 2, 3]
+      self.assertEqual(test_fn(l), result.test_fn(l))
+      l = []
+      self.assertEqual(test_fn(l), result.test_fn(l))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/converters/list_comprehension.py b/tensorflow/contrib/py2tf/converters/list_comprehension.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8744831100e4852919b5cd1253b74acea4d790d
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/list_comprehension.py
@@ -0,0 +1,80 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Canonicalizing list comprehensions into for and if statements.
+
+e.g.
+result = [x * x for x in xs]
+
+becomes
+
+result = []
+for x in xs:
+  elt = x * x
+  result.append(elt)
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.contrib.py2tf.pyct import templates
+from tensorflow.contrib.py2tf.pyct import transformer
+
+
+class ListCompCanonicalizationTransformer(transformer.Base):
+  """NodeTransformer to canonicalize list comprehensions."""
+
+  def __init__(self, context):
+    super(ListCompCanonicalizationTransformer, self).__init__(context)
+
+  def make_update_list_node(self, list_, elt):
+    return templates.replace('list_.append(elt)', list_=list_, elt=elt)[0]
+
+  def instantiate_list_node(self):
+    return parser.parse_str('[]').body[0].value
+
+  def visit_Assign(self, node):
+    if not isinstance(node.value, gast.ListComp):
+      return node
+    if len(node.targets) > 1:
+      raise ValueError('Only support single assignment.')
+    return self.canonicalize_listcomp(node.targets[0], node.value)
+
+  def canonicalize_listcomp(self, result_node, list_comp_node):
+
+    make_list = templates.replace(
+        'list_ = create_list',
+        list_=result_node,
+        create_list=self.instantiate_list_node())
+    loop_body = self.make_update_list_node(result_node, list_comp_node.elt)
+
+    for gen in reversed(list_comp_node.generators):
+      for gen_if in reversed(gen.ifs):
+        loop_body = templates.replace(
+            'if test: loop_body', test=gen_if, loop_body=loop_body)
+      loop_body = templates.replace(
+          'for target in iter_: loop_body',
+          iter_=gen.iter,
+          target=gen.target,
+          loop_body=loop_body)
+
+    return make_list + loop_body
+
+
+def transform(node, context):
+  return ListCompCanonicalizationTransformer(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/converters/list_comprehension_test.py b/tensorflow/contrib/py2tf/converters/list_comprehension_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..025fac11e41e6771fbb9b80ff3da70dc3ceec73e
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/list_comprehension_test.py
@@ -0,0 +1,75 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for list_comprehension module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.py2tf.converters import list_comprehension
+from tensorflow.python.platform import test
+
+
+class ListCompTest(converter_test_base.TestCase):
+
+  def test_basic(self):
+
+    def test_fn(l):
+      s = [e * e for e in l]
+      return s
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = list_comprehension.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      l = [1, 2, 3]
+      self.assertEqual(test_fn(l), result.test_fn(l))
+      l = []
+      self.assertEqual(test_fn(l), result.test_fn(l))
+
+  def test_multiple_generators(self):
+
+    def test_fn(l):
+      s = [e * e for sublist in l for e in sublist]
+      return s
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = list_comprehension.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      l = [[1], [2], [3]]
+      self.assertEqual(test_fn(l), result.test_fn(l))
+      l = []
+      self.assertEqual(test_fn(l), result.test_fn(l))
+
+  def test_conds(self):
+
+    def test_fn(l):
+      s = [e * e for e in l if e > 1]
+      return s
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = list_comprehension.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      l = [1, 2, 3]
+      self.assertEqual(test_fn(l), result.test_fn(l))
+      l = []
+      self.assertEqual(test_fn(l), result.test_fn(l))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/converters/logical_expressions.py b/tensorflow/contrib/py2tf/converters/logical_expressions.py
new file mode 100644
index 0000000000000000000000000000000000000000..df980d41c9c57e325bee9a1fa870d9c95f46ea41
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/logical_expressions.py
@@ -0,0 +1,74 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Converter for logical expressions.
+
+e.g. `a and b -> tf.logical_and(a, b)`. This is not done automatically in TF.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import parser
+
+
+class LogicalExpressionTransformer(gast.NodeTransformer):
+  """Converts logical expressions to corresponding TF calls."""
+
+  def __init__(self):
+    # TODO(mdan): Look into replacing with bitwise operators instead.
+    self.op_mapping = {
+        gast.And: 'tf.logical_and',
+        gast.Or: 'tf.logical_or',
+        gast.Not: 'tf.logical_not',
+        gast.Eq: 'tf.equal',
+    }
+
+  def visit_Compare(self, node):
+    node = self.generic_visit(node)
+    if len(node.ops) > 1:
+      raise NotImplementedError()
+    cmp_type = type(node.ops[0])
+    if cmp_type in self.op_mapping:
+      tf_function = parser.parse_str(self.op_mapping[cmp_type]).body[0].value
+      return gast.Call(
+          func=tf_function, args=[node.left, node.comparators[0]], keywords=[])
+    return node
+
+  def visit_UnaryOp(self, node):
+    node = self.generic_visit(node)
+    if isinstance(node.op, gast.Not):
+      tf_function = parser.parse_str(self.op_mapping[type(
+          node.op)]).body[0].value
+      node = gast.Call(func=tf_function, args=[node.operand], keywords=[])
+    return node
+
+  def visit_BoolOp(self, node):
+    # TODO(mdan): A normalizer may be useful here. Use ANF?
+    node = self.generic_visit(node)
+    tf_function = parser.parse_str(self.op_mapping[type(node.op)]).body[0].value
+    left = node.values[0]
+    for i in range(1, len(node.values)):
+      left = gast.Call(
+          func=tf_function, args=[left, node.values[i]], keywords=[])
+    return left
+
+
+def transform(node):
+  transformer = LogicalExpressionTransformer()
+  node = transformer.visit(node)
+  return node
diff --git a/tensorflow/contrib/py2tf/converters/logical_expressions_test.py b/tensorflow/contrib/py2tf/converters/logical_expressions_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a28326c517d468230f35e45f0fbfe5257d769895
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/logical_expressions_test.py
@@ -0,0 +1,57 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for logical_expressions module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.py2tf.converters import logical_expressions
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class GradientsFunctionTest(converter_test_base.TestCase):
+
+  def test_equals(self):
+
+    def test_fn(a, b):
+      return a == b
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = logical_expressions.transform(node)
+
+    with self.compiled(node, math_ops.equal) as result:
+      with self.test_session() as sess:
+        self.assertTrue(sess.run(result.test_fn(1, 1)))
+        self.assertFalse(sess.run(result.test_fn(1, 2)))
+
+  def test_bool_ops(self):
+
+    def test_fn(a, b, c):
+      return (a or b) and (a or b or c)
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = logical_expressions.transform(node)
+
+    with self.compiled(node, math_ops.logical_or,
+                       math_ops.logical_and) as result:
+      with self.test_session() as sess:
+        self.assertTrue(sess.run(result.test_fn(True, False, True)))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards.py b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
new file mode 100644
index 0000000000000000000000000000000000000000..30976b3ec6db5a6607023ac804d9d54cfb296190
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/side_effect_guards.py
@@ -0,0 +1,190 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Adds guards against function calls with side effects.
+
+Only standalone calls are guarded.
+
+WARNING: This mechanism is incomplete. Particularly, it only guards the
+arguments passed to functions, and does not account for indirectly modified
+state.
+
+Example:
+  y = tf.layers.dense(x)       # Creates TF variable 'foo'
+  loss = loss(y)
+  opt.minimize(loss)           # indirectly affects 'foo'
+  z = tf.get_variable('foo')   # Indirectly affects `loss` and 'foo'
+  # Here, `loss` can be guarded. But `z` cannot.
+
+# TODO(mdan): We should probably define a safe mode where we guard everything.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import ast_util
+from tensorflow.contrib.py2tf.pyct import qual_names
+from tensorflow.contrib.py2tf.pyct import templates
+from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+
+
+class SymbolNamer(object):
+  """Describes the interface for SideEffectGuardTransformer's namer."""
+
+  def new_symbol(self, name_root, reserved_locals):
+    """Generate a new unique function_name.
+
+    Args:
+      name_root: String, used as stem in the new name.
+      reserved_locals: Set(string), additional local symbols that are reserved.
+    Returns:
+      String.
+    """
+    raise NotImplementedError()
+
+
+class SideEffectGuardTransformer(transformer.Base):
+  """Adds control dependencies to functions with side effects."""
+
+  def __init__(self, context):
+    super(SideEffectGuardTransformer, self).__init__(context)
+
+  # pylint:disable=invalid-name
+
+  def _visit_and_reindent(self, nodes):
+    new_nodes = []
+    current_dest = new_nodes
+    alias_map = {}
+    reindent_requested = False
+    for n in nodes:
+      n = self.visit(n)
+      # NOTE: the order in which these statements execute is important; in
+      # particular, watch out for ending up with cycles in the AST.
+      if alias_map:
+        n = ast_util.rename_symbols(n, alias_map)
+      if isinstance(n, (list, tuple)):
+        current_dest.extend(n)
+      else:
+        current_dest.append(n)
+      if anno.hasanno(n, anno.Basic.INDENT_BLOCK_REMAINDER):
+        reindent_requested = True
+        new_dest, new_alias_map = anno.getanno(
+            n, anno.Basic.INDENT_BLOCK_REMAINDER)
+        anno.delanno(n, anno.Basic.INDENT_BLOCK_REMAINDER)
+        new_alias_map.update(alias_map)
+        alias_map = new_alias_map
+        current_dest = new_dest
+    if reindent_requested and not current_dest:
+      # TODO(mdan): There may still be something that could be done.
+      raise ValueError('Unable to insert statement into the computation flow: '
+                       'it is not followed by any computation which '
+                       'the statement could gate.')
+    return new_nodes
+
+  def visit_FunctionDef(self, node):
+    node.body = self._visit_and_reindent(node.body)
+    return node
+
+  def visit_With(self, node):
+    node.body = self._visit_and_reindent(node.body)
+    return node
+
+  def visit_If(self, node):
+    node.body = self._visit_and_reindent(node.body)
+    node.orelse = self._visit_and_reindent(node.orelse)
+    return node
+
+  def visit_While(self, node):
+    node.body = self._visit_and_reindent(node.body)
+    node.orelse = self._visit_and_reindent(node.orelse)
+    return node
+
+  def visit_Expr(self, node):
+    self.generic_visit(node)
+    if isinstance(node.value, gast.Call):
+      # Patterns of single function calls, like:
+      #   opt.minimize(loss)
+      # or:
+      #   tf.py_func(...)
+
+      # First, attempt to gate future evaluation of args. If that's not
+      # possible, gate all remaining statements (and that may fail too, see
+      # _visit_and_reindent.
+      args_scope = anno.getanno(node.value, NodeAnno.ARGS_SCOPE)
+      # NOTE: We can't guard object attributes because they may not be writable.
+      # In addition, avoid renaming well-known names.
+      # TODO(mdan): Move these names into config.
+      unguarded_names = (qual_names.QN('self'), qual_names.QN('tf'))
+      guarded_args = tuple(s for s in args_scope.used
+                           if not s.is_composite() and s not in unguarded_names)
+
+      # TODO(mdan): Include all arguments which depended on guarded_args too.
+      # For example, the following will still cause a race:
+      #   tf.assign(a, a + 1)
+      #   b = a + 1
+      #   tf.assign(a, a + 1)  # Control deps here should include `b`
+      #   c = b + 1
+      # Or maybe we should just raise an "unsafe assign" error?
+
+      if guarded_args:
+        # The aliases may need new names to avoid incorrectly making them local.
+        # TODO(mdan): This is brutal. It will even rename modules - any fix?
+        need_alias = tuple(
+            s for s in guarded_args if s not in args_scope.parent.modified)
+        aliased_new_names = tuple(
+            qual_names.QN(
+                self.context.namer.new_symbol(
+                    s.ssf(), args_scope.parent.referenced)) for s in need_alias)
+        alias_map = dict(zip(need_alias, aliased_new_names))
+        if len(guarded_args) == 1:
+          s, = guarded_args
+          aliased_guarded_args = alias_map.get(s, s)
+        else:
+          aliased_guarded_args = gast.Tuple(
+              [alias_map.get(s, s).ast() for s in guarded_args], None)
+
+        template = """
+          with py2tf_utils.control_dependency_on_returns(call):
+            aliased_guarded_args = py2tf_utils.alias_tensors(guarded_args)
+        """
+        control_deps_guard = templates.replace(
+            template,
+            call=node.value,
+            aliased_guarded_args=aliased_guarded_args,
+            guarded_args=guarded_args)[-1]
+      else:
+        alias_map = {}
+
+        template = """
+          with py2tf_utils.control_dependency_on_returns(call):
+            pass
+        """
+        control_deps_guard = templates.replace(template, call=node.value)[-1]
+        control_deps_guard.body = []
+
+      node = control_deps_guard
+      anno.setanno(node, anno.Basic.INDENT_BLOCK_REMAINDER,
+                   (node.body, alias_map))
+    return node
+
+  # pylint:enable=invalid-name
+
+
+def transform(node, context):
+  return SideEffectGuardTransformer(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py b/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..463db2e770213ba9636d2537b095a77dece5d8f6
--- /dev/null
+++ b/tensorflow/contrib/py2tf/converters/side_effect_guards_test.py
@@ -0,0 +1,165 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for side_effect_guards module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.converters import converter_test_base
+from tensorflow.contrib.py2tf.converters import side_effect_guards
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class SideEffectGuardsTest(converter_test_base.TestCase):
+
+  def test_side_effect_on_return_only_variable(self):
+
+    tf = None
+
+    def test_fn(a):
+      tf.assign(a, a + 1)
+      return a
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = side_effect_guards.transform(node, self.ctx)
+
+    with self.compiled(node, state_ops.assign) as result:
+      self.assertEqual(len(node.body[0].body), 1)
+      with self.test_session() as sess:
+        v = variables.Variable(2)
+        sess.run(v.initializer)
+        # NOTE: We don't expect the assignment to execute in this case, because
+        # variables cannot be reliably guarded.
+        self.assertEqual(2, sess.run(result.test_fn(v)))
+
+  def test_side_effect_on_used_variable(self):
+
+    tf = None
+
+    def test_fn(a):
+      tf.assign(a, a + 1)
+      return a + 1
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = side_effect_guards.transform(node, self.ctx)
+
+    with self.compiled(node, state_ops.assign) as result:
+      self.assertEqual(len(node.body[0].body), 1)
+      with self.test_session() as sess:
+        v = variables.Variable(2)
+        sess.run(v.initializer)
+        # NOTE: Unlike test_side_effect_on_return_only_variable, the variable
+        # was used in the local scope and so we could catch the assign's side
+        # effect.
+        self.assertEqual(4, sess.run(result.test_fn(v)))
+
+  def test_side_effect_on_tensor(self):
+
+    tf = None
+
+    def test_fn(a):
+      tf.Assert(a > 0, ['expected in throw'])
+      return a
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = side_effect_guards.transform(node, self.ctx)
+
+    with self.compiled(node, control_flow_ops.Assert) as result:
+      self.assertEqual(len(node.body[0].body), 1)
+      with self.test_session() as sess:
+        # NOTE: In this case we can also capture the side effect because the
+        # argument is a tensor ans we can wrap it inside an identity.
+        with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                     'expected in throw'):
+          sess.run(result.test_fn(constant_op.constant(-1)))
+
+  def test_multiline_block(self):
+
+    tf = None
+
+    def test_fn(a):
+      tf.assign(a, a + 1)
+      b = a + 1
+      tf.assign(a, b + 1)
+      c = b + 1
+      d = c + 1
+      return d
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = side_effect_guards.transform(node, self.ctx)
+
+    with self.compiled(node, state_ops.assign) as result:
+      self.assertEqual(len(node.body[0].body), 1)
+      with self.test_session() as sess:
+        v = variables.Variable(2)
+        sess.run(v.initializer)
+        self.assertEqual(6, sess.run(result.test_fn(v)))
+
+  def test_multiline_nested_block(self):
+
+    tf = None
+
+    def test_fn(a):
+      with tf.name_scope('foo'):
+        tf.assign(a, a + 1)
+        b = a + 1
+        c = b + 1
+        d = c + 1
+      return d
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = side_effect_guards.transform(node, self.ctx)
+
+    with self.compiled(node, state_ops.assign, ops.name_scope) as result:
+      self.assertEqual(len(node.body[0].body[0].body), 1)
+      with self.test_session() as sess:
+        v = variables.Variable(2)
+        sess.run(v.initializer)
+        self.assertEqual(6, sess.run(result.test_fn(v)))
+
+  def test_multiline_block_unsafe(self):
+
+    tf = None
+
+    def test_fn(a):
+      tf.assign(a, a + 1)
+      b = a + 1
+      tf.assign(a, a + 1)
+      c = b + 1
+      d = c + 1
+      return d
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = side_effect_guards.transform(node, self.ctx)
+
+    with self.compiled(node, state_ops.assign) as result:
+      self.assertEqual(len(node.body[0].body), 1)
+      with self.test_session() as sess:
+        v = variables.Variable(2)
+        sess.run(v.initializer)
+        # NOTE: This intentionally highlights the flakiness. The test should be
+        # tightened down once that is solved.
+        self.assertTrue(sess.run(result.test_fn(v)) in (6, 7))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/impl/BUILD b/tensorflow/contrib/py2tf/impl/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..90ffabbc9bf4524ec2ebf54b6dd847bd8768a486
--- /dev/null
+++ b/tensorflow/contrib/py2tf/impl/BUILD
@@ -0,0 +1,67 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "impl",
+    srcs = [
+        "api.py",
+        "config.py",
+        "conversion.py",
+        "naming.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [
+        "//tensorflow/contrib/py2tf/converters",
+        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/contrib/py2tf/pyct/static_analysis",
+        "//tensorflow/contrib/py2tf/utils",
+        "@gast_archive//:gast",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "api_test",
+    srcs = ["api_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":impl",
+        "//tensorflow/contrib/py2tf/utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "conversion_test",
+    srcs = ["conversion_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":impl",
+        "//tensorflow/python:client_testlib",
+        "@gast_archive//:gast",
+    ],
+)
+
+py_test(
+    name = "naming_test",
+    srcs = ["naming_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":impl",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/contrib/py2tf/impl/api.py b/tensorflow/contrib/py2tf/impl/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ae1c701698ae9a4efbde45222ff6c3db6e92521
--- /dev/null
+++ b/tensorflow/contrib/py2tf/impl/api.py
@@ -0,0 +1,233 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Public API."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from functools import wraps
+
+import gast
+import six
+
+from tensorflow.contrib.py2tf.impl import config
+from tensorflow.contrib.py2tf.impl import conversion
+from tensorflow.contrib.py2tf.pyct import compiler
+from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import tf_inspect
+
+# TODO(mdan): Properly document the type hints.
+# TODO(mdan): Reduce the type hint information to (module, type).
+# (currently we require (module + class name, type))
+
+
+def graph_ready(f):
+  """No-op decorator that explicitly marks a function as graph-ready.
+
+  Graph-ready functions are assumed to not need any conversion.
+
+  Args:
+    f: Any callable.
+  Returns:
+    f itself.
+  """
+  setattr(f, '__pyct_is_compile_decorator', True)
+  return f
+
+
+def convert_inline(f, *args, **kwargs):
+  """Shorthand to convert and call a function.
+
+  For example, the following two statements are equivalent:
+
+      @convert()
+      def foo():
+        ...
+      foo(bar)
+
+      def foo():
+        ...
+      convert_inline(foo, bar)
+
+  Args:
+    f: Function to convert. Only this call will be converted.
+    *args: Passed through to f.
+    **kwargs: Passed through to f, with the following exceptions:
+        * arg_value_hints: A dict mapping parameter names to objects that can
+            hint at the type of those parameters.
+
+  Returns:
+    The result of the converted f applied to args and kwargs.
+  """
+  if 'arg_value_hints' in kwargs:
+    arg_value_hints = kwargs['arg_value_hints']
+    del kwargs['arg_value_hints']
+  else:
+    arg_value_hints = None
+  if tf_inspect.ismethod(f):
+    # When converting methods, the result is still an unbound function.
+    args = (f.__self__,) + args
+  return convert(arg_value_hints)(f)(*args, **kwargs)
+
+
+def convert(recursive=False, verbose=False, arg_types=None):
+  """Decorator that compiles a function to graph mode.
+
+  The decorator is dynamic - invoking compilation whenever the decorated
+  function is called. This means the parameter values are known at compilation.
+
+  Args:
+    recursive: Whether to recusrively convert any functions that the decorator
+        function may call.
+    verbose: Whether to output the compiled code in the logs.
+    arg_types: See to_graph.
+
+  Returns:
+    A decorator that compiles the given function to graph mode.
+
+  Raises:
+    ValueError: If any of the arguments are illegal.
+  """
+  if arg_types is None:
+    arg_types = {}
+
+  def decorator(f):
+    """Decorator implementation."""
+
+    @wraps(f)
+    def wrapper(*args, **kwargs):
+      """Wrapper that calls the compiled version of the wrapped function."""
+      partial_types = ()
+      arg_values = {}
+      arg_names = tf_inspect.getargspec(f)[0]
+      for name, arg in zip(arg_names, args):
+        arg_values[name] = arg
+        arg_class = arg.__class__
+        # If arg_value_hints specifies any name, use that instead.
+        if name not in arg_types:
+          arg_types[name] = (arg_class.__name__, arg_class)
+        if name == 'self' and tf_inspect.isclass(arg_class):
+          # Annotated methods need to specify that their owner type is partial,
+          # otherwise other members they call will not be converted.
+          partial_types = (arg_class,)
+      wrapped = to_graph(
+          f,
+          recursive=recursive,
+          verbose=verbose,
+          arg_values=arg_values,
+          arg_types=arg_types,
+          partial_types=partial_types)
+      return wrapped(*args, **kwargs)
+
+    # Sometimes the decorator is just desugared, making it impossible to detect.
+    # This attribute makes detection easier.
+    setattr(wrapper, '__pyct_is_compile_decorator', True)
+    return wrapper
+
+  return decorator
+
+
+def to_graph(e,
+             recursive=True,
+             verbose=False,
+             arg_values=None,
+             arg_types=None,
+             partial_types=None):
+  """Compile a Python entity into equivalent TensorFlow code.
+
+  Currently supported entities:
+    * functions
+    * classes
+
+  Classes are handled by converting all their methods into a new class.
+
+  Args:
+    e: A Python entity.
+    recursive: Whether to recusrively convert any functions that the decorator
+        function may call.
+    verbose: Whether to output the compiled code in the logs.
+    arg_values: A dict containing value hints for symbols like function
+        parameters.
+    arg_types: A dict containing type hints for symbols like function
+        parameters.
+    partial_types: A set of types (e.g. classes) that will not be converted
+        entirely. Calls to member functions for these types will be renamed
+        independently.
+
+  Returns:
+    A function with a signature identical to `o`, but which when executed it
+  creates TF a graph that has the same functionality as the original entity.
+  """
+  conversion_map = conversion.ConversionMap(
+      recursive=recursive,
+      nocompile_decorators=(convert, graph_ready, convert_inline),
+      partial_types=partial_types)
+  _, name = conversion.entity_to_graph(e, conversion_map, arg_values, arg_types)
+
+  module = gast.Module([])
+  for import_line in config.COMPILED_IMPORT_STATEMENTS:
+    module.body.append(parser.parse_str(import_line))
+  for dep in conversion_map.dependency_cache.values():
+    module.body.append(dep)
+  compiled_node, compiled_src = compiler.ast_to_object(module)
+
+  # The compiled code should see everything the entry function saw.
+  # TODO(mdan): This might not work well if the call tree spans modules?
+  if tf_inspect.isfunction(e):
+    compiled_node.__dict__.update(six.get_function_globals(e))
+  compiled_fn = getattr(compiled_node, name)
+
+  if verbose:
+    logging.info('Compiled output of %s:\n\n%s\n', e, compiled_src)
+
+  return compiled_fn
+
+
+def to_code(e,
+            recursive=True,
+            arg_values=None,
+            arg_types=None,
+            partial_types=None,
+            indentation='  '):
+  """Return the equivalent of an entity in TensorFlow code.
+
+  See `to_graph` for more details.
+
+  Args:
+    e: A Python entity.
+    recursive: See to_graph.
+    arg_values: See to_graph.
+    arg_types: See to_graph.
+    partial_types: See to_graph.
+    indentation: String, when to use for each level of indentation.
+
+  Returns:
+    String.
+  """
+  conversion_map = conversion.ConversionMap(
+      recursive=recursive,
+      nocompile_decorators=(convert, graph_ready, convert_inline),
+      partial_types=partial_types)
+  conversion.entity_to_graph(e, conversion_map, arg_values, arg_types)
+
+  imports = '\n'.join(config.COMPILED_IMPORT_STATEMENTS)
+  code = '\n'.join(
+      compiler.ast_to_source(dep, indentation)
+      for dep in reversed(tuple(
+          six.itervalues(conversion_map.dependency_cache))))
+
+  return imports + '\n\n' + code
diff --git a/tensorflow/contrib/py2tf/impl/api_test.py b/tensorflow/contrib/py2tf/impl/api_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..02cd8ed2d0ffee8ef2d31ea65902d2b493df9d64
--- /dev/null
+++ b/tensorflow/contrib/py2tf/impl/api_test.py
@@ -0,0 +1,193 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for api module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.impl import api
+from tensorflow.contrib.py2tf.impl import config
+from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class ApiTest(test.TestCase):
+
+  def setUp(self):
+    config.DEFAULT_UNCOMPILED_MODULES.add((math_ops.__name__,))
+    config.COMPILED_IMPORT_STATEMENTS = (
+        'from tensorflow.python.ops '
+        'import control_flow_ops as tf',
+        'from tensorflow.contrib.py2tf import utils as '
+        'py2tf_utils')
+
+  def test_decorator_recurses(self):
+
+    class TestClass(object):
+
+      def called_member(self, a):
+        if a < 0:
+          a = -a
+        return a
+
+      @api.convert(recursive=True)
+      def test_method(self, x, s, a):
+        while math_ops.reduce_sum(x) > s:
+          x //= self.called_member(a)
+        return x
+
+    tc = TestClass()
+    with self.test_session() as sess:
+      x = tc.test_method(
+          constant_op.constant([2, 4]), constant_op.constant(1),
+          constant_op.constant(-2))
+      self.assertListEqual([0, 1], sess.run(x).tolist())
+
+  def test_decorator_does_not_recurse(self):
+
+    class TestClass(object):
+
+      def called_member(self, a):
+        return math_ops.negative(a)
+
+      @api.convert(recursive=False)
+      def test_method(self, x, s, a):
+        while math_ops.reduce_sum(x) > s:
+          x //= self.called_member(a)
+        return x
+
+    tc = TestClass()
+    with self.test_session() as sess:
+      x = tc.test_method(
+          constant_op.constant([2, 4]), constant_op.constant(1),
+          constant_op.constant(-2))
+      self.assertListEqual([0, 1], sess.run(x).tolist())
+
+  def test_decorator_calls_converted(self):
+
+    class TestClass(object):
+
+      @api.graph_ready
+      def called_member(self, a):
+        return math_ops.negative(a)
+
+      @api.convert(recursive=True)
+      def test_method(self, x, s, a):
+        while math_ops.reduce_sum(x) > s:
+          x //= self.called_member(a)
+        return x
+
+    tc = TestClass()
+    with self.test_session() as sess:
+      x = tc.test_method(
+          constant_op.constant([2, 4]), constant_op.constant(1),
+          constant_op.constant(-2))
+      self.assertListEqual([0, 1], sess.run(x).tolist())
+
+  def test_decorator_calls_decorated(self):
+
+    class TestClass(object):
+
+      @api.convert()
+      def called_member(self, a):
+        if a < 0:
+          a = -a
+        return a
+
+      @api.convert(recursive=True)
+      def test_method(self, x, s, a):
+        while math_ops.reduce_sum(x) > s:
+          x //= self.called_member(a)
+        return x
+
+    tc = TestClass()
+    with self.test_session() as sess:
+      x = tc.test_method(
+          constant_op.constant([2, 4]), constant_op.constant(1),
+          constant_op.constant(-2))
+      self.assertListEqual([0, 1], sess.run(x).tolist())
+
+  def test_convert_call_site_decorator(self):
+
+    class TestClass(object):
+
+      def called_member(self, a):
+        if a < 0:
+          a = -a
+        return a
+
+      @api.convert(recursive=True)
+      def test_method(self, x, s, a):
+        while math_ops.reduce_sum(x) > s:
+          x //= api.convert_inline(self.called_member, a)
+        return x
+
+    tc = TestClass()
+    with self.test_session() as sess:
+      x = tc.test_method(
+          constant_op.constant([2, 4]), constant_op.constant(1),
+          constant_op.constant(-2))
+      self.assertListEqual([0, 1], sess.run(x).tolist())
+
+  def test_graph_ready_call_site_decorator(self):
+
+    class TestClass(object):
+
+      def called_member(self, a):
+        return math_ops.negative(a)
+
+      @api.convert(recursive=True)
+      def test_method(self, x, s, a):
+        while math_ops.reduce_sum(x) > s:
+          x //= api.graph_ready(self.called_member(a))
+        return x
+
+    tc = TestClass()
+    with self.test_session() as sess:
+      x = tc.test_method(
+          constant_op.constant([2, 4]), constant_op.constant(1),
+          constant_op.constant(-2))
+      self.assertListEqual([0, 1], sess.run(x).tolist())
+
+  def test_to_graph_basic(self):
+    def test_fn(x, s):
+      while math_ops.reduce_sum(x) > s:
+        x //= 2
+      return x
+
+    compiled_fn = api.to_graph(test_fn)
+
+    with self.test_session() as sess:
+      x = compiled_fn(constant_op.constant([4, 8]), 4)
+      self.assertListEqual([1, 2], sess.run(x).tolist())
+
+  def test_to_code_basic(self):
+    def test_fn(x, s):
+      while math_ops.reduce_sum(x) > s:
+        x /= 2
+      return x
+
+    compiled_code = api.to_code(test_fn)
+
+    # Just check for some key words and that it is parseable Python code.
+    self.assertRegexpMatches(compiled_code, 'py2tf_utils\\.run_while')
+    self.assertIsNotNone(parser.parse_str(compiled_code))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/impl/config.py b/tensorflow/contrib/py2tf/impl/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c3ecefff0f8858d5505ff30e1270b2fd42c9ad8
--- /dev/null
+++ b/tensorflow/contrib/py2tf/impl/config.py
@@ -0,0 +1,45 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Global configuration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf import utils
+
+
+PYTHON_LITERALS = {
+    'None': None,
+    'False': False,
+    'True': True,
+    'float': float,
+}
+
+DEFAULT_UNCOMPILED_MODULES = set((
+    ('tensorflow',),
+    (utils.__name__,),
+))
+
+NO_SIDE_EFFECT_CONSTRUCTORS = set(('tensorflow',))
+
+# TODO(mdan): Also allow controlling the generated names (for testability).
+# TODO(mdan): Verify that these names are not hidden by generated code.
+# TODO(mdan): Make sure copybara renames the reference below.
+COMPILED_IMPORT_STATEMENTS = (
+    'from __future__ import print_function',
+    'import tensorflow as tf',
+    'from tensorflow.contrib.py2tf import utils as '
+    'py2tf_utils')
diff --git a/tensorflow/contrib/py2tf/impl/conversion.py b/tensorflow/contrib/py2tf/impl/conversion.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca13910ae5cff2c914ab7a17c843fe963e02f0df
--- /dev/null
+++ b/tensorflow/contrib/py2tf/impl/conversion.py
@@ -0,0 +1,285 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""High level conversion support."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+import six
+
+from tensorflow.contrib.py2tf import utils
+from tensorflow.contrib.py2tf.converters import asserts
+from tensorflow.contrib.py2tf.converters import break_statements
+from tensorflow.contrib.py2tf.converters import builtin_functions
+from tensorflow.contrib.py2tf.converters import call_trees
+from tensorflow.contrib.py2tf.converters import continue_statements
+from tensorflow.contrib.py2tf.converters import control_flow
+from tensorflow.contrib.py2tf.converters import decorators
+from tensorflow.contrib.py2tf.converters import for_loops
+from tensorflow.contrib.py2tf.converters import logical_expressions
+from tensorflow.contrib.py2tf.converters import side_effect_guards
+from tensorflow.contrib.py2tf.impl import config
+from tensorflow.contrib.py2tf.impl import naming
+from tensorflow.contrib.py2tf.pyct import context
+from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.contrib.py2tf.pyct import qual_names
+from tensorflow.contrib.py2tf.pyct.static_analysis import activity
+from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
+from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
+from tensorflow.python.util import tf_inspect
+
+
+# TODO(mdan): Might we not need any renaming at all?
+
+
+class ConversionMap(object):
+  """ConversionMaps keep track of converting function hierarchies.
+
+  Attributes:
+    recursive: Whether to recusrively convert any functions that the decorator
+        function may call.
+    nocompile_decorators: tuple of decorator functions that toggle compilation
+        off.
+    dependency_cache: dict[object]: ast; maps original entities to their
+        converted AST
+    name_map: dict[string]: string; maps original entities to the name of
+        their converted counterparts
+  """
+
+  # TODO(mdan): Rename to ConversionContext, and pull in additional flags.
+
+  def __init__(self, recursive, nocompile_decorators, partial_types):
+    self.recursive = recursive
+    self.nocompile_decorators = nocompile_decorators
+    self.partial_types = partial_types if partial_types else ()
+    self.dependency_cache = {}
+    self.name_map = {}
+
+  def new_namer(self, namespace):
+    return naming.Namer(namespace, self.recursive, self.name_map,
+                        self.partial_types)
+
+  def update_name_map(self, namer):
+    for o, name in namer.renamed_calls.items():
+      if o in self.name_map:
+        if self.name_map[o] != name:
+          raise ValueError(
+              'Calls to %s were converted using multiple names (%s). This is '
+              'possible when an entity with one of these names already '
+              'existed. To fix, avoid using any of these names.')
+      else:
+        self.name_map[o] = name
+
+  def add_to_cache(self, original_entity, converted_ast):
+    self.dependency_cache[original_entity] = converted_ast
+
+
+def entity_to_graph(o, conversion_map, arg_values, arg_types):
+  """Compile a Python entity into equivalent TensorFlow.
+
+  The function will also recursively compile all the entities that `o`
+  references, updating `dependency_cache`.
+
+  This function is reentrant, and relies on dependency_cache to avoid
+  generating duplicate code.
+
+  Args:
+    o: A Python entity.
+    conversion_map: A ConversionMap object.
+    arg_values: A dict containing value hints for symbols like function
+        parameters.
+    arg_types: A dict containing type hints for symbols like function
+        parameters.
+
+  Returns:
+    A tuple (ast, new_name):
+        * ast: An AST representing an entity with interface equivalent to `o`,
+            but which when executed it creates TF a graph.
+        * new_name: The symbol name under which the new entity can be found.
+
+  Raises:
+    ValueError: if the entity type is not supported.
+  """
+  if tf_inspect.isclass(o):
+    node, new_name = class_to_graph(o, conversion_map)
+  elif tf_inspect.isfunction(o):
+    node, new_name = function_to_graph(o, conversion_map, arg_values, arg_types)
+  elif tf_inspect.ismethod(o):
+    node, new_name = function_to_graph(o, conversion_map, arg_values, arg_types)
+  else:
+    raise ValueError(
+        'Entity "%s" has unsupported type "%s". Only functions and classes are '
+        'supported for now.' % (o, type(o)))
+
+  conversion_map.add_to_cache(o, node)
+  if conversion_map.recursive:
+    for obj in conversion_map.name_map.keys():
+      if obj not in conversion_map.dependency_cache:
+        if (hasattr(obj, 'im_class') and
+            getattr(obj, 'im_class') not in conversion_map.partial_types):
+          # Class members are converted with their objects, unless they're
+          # only converted partially.
+          continue
+        entity_to_graph(obj, conversion_map, {}, {})
+
+  return node, new_name
+
+
+def class_to_graph(c, conversion_map):
+  """Specialization of `entity_to_graph` for classes."""
+  converted_members = {}
+  members = tf_inspect.getmembers(c, predicate=tf_inspect.ismethod)
+  if not members:
+    raise ValueError('Cannot convert %s: it has no member methods.')
+
+  class_globals = None
+  for _, m in members:
+    node, _ = function_to_graph(
+        m,
+        conversion_map=conversion_map,
+        arg_values={},
+        arg_types={'self': (c.__name__, c)},
+        owner_type=c)
+    # TODO(mdan): Do not assume all members have the same view of globals.
+    if class_globals is None:
+      class_globals = six.get_function_globals(m)
+    converted_members[m] = node
+  namer = conversion_map.new_namer(class_globals)
+  class_name = namer.compiled_class_name(c.__name__, c)
+  node = gast.ClassDef(
+      class_name,
+      bases=[],
+      keywords=[],
+      body=converted_members.values(),
+      decorator_list=[])
+
+  return node, class_name
+
+
+def function_to_graph(f, conversion_map, arg_values, arg_types,
+                      owner_type=None):
+  """Specialization of `entity_to_graph` for callable functions."""
+  node, source = parser.parse_entity(f)
+  node = node.body[0]
+  namespace = six.get_function_globals(f)
+
+  # This is needed for non-global functions.
+  closure = six.get_function_closure(f)
+  if closure:
+    for e in closure:
+      if callable(e.cell_contents):
+        fn = e.cell_contents
+        namespace[fn.__name__] = fn
+
+  # Manually add the utils namespace which may be used from generated code.
+  if 'py2tf_util' not in namespace:
+    namespace['py2tf_utils'] = utils
+  elif namespace['py2tf_utils'] != utils:
+    raise ValueError(
+        'The module name py2tf_utils is reserved and may not be used.')
+
+  namer = conversion_map.new_namer(namespace)
+  ctx = context.EntityContext(
+      namer=namer,
+      source_code=source,
+      source_file='<fragment>',
+      namespace=namespace,
+      arg_values=arg_values,
+      arg_types=arg_types,
+      recursive=conversion_map.recursive)
+  node = node_to_graph(node, ctx, conversion_map.nocompile_decorators)
+
+  # TODO(mdan): This somewhat duplicates the call rename logic in call_treest.py
+  new_name, did_rename = namer.compiled_function_name(f.__name__, f, owner_type)
+  if not did_rename:
+    new_name = f.__name__
+    if node.name != f.__name__:
+      raise NotImplementedError('Strange corner case. Send us offending code!')
+
+  node.name = new_name
+  conversion_map.update_name_map(namer)
+  return node, new_name
+
+
+def _static_analysis_pass(node, ctx):
+  node = qual_names.resolve(node)
+  node = activity.resolve(node, ctx, None)
+  node = live_values.resolve(node, ctx, config.PYTHON_LITERALS)
+  node = type_info.resolve(node, ctx)
+  return node
+
+
+def node_to_graph(node, ctx, nocompile_decorators):
+  """Convert Python code to equivalent TF graph mode code.
+
+  Args:
+    node: A Python AST node representing the code to convert.
+    ctx: An EntityContext object.
+    nocompile_decorators: A tuple containing decorators to be stripped from
+        functions during conversion.
+
+  Returns:
+    A tuple (node, deps):
+        * node: A Python ast node, representing the converted code.
+        * deps: A set of strings, the fully qualified names of entity
+            dependencies that this node has.
+  """
+  # TODO(mdan): Verify arguments for correctness.
+
+  # TODO(mdan): Factor out common elements.
+  # These include:
+  #   * code move between blocks
+  #   * visiting blocks in transformers
+
+  # Certain steps, especially canonicalization, insert new symbols into the
+  # tree, which must be accounted. Although less efficient, it is most robust
+  # to re-run the analysis.
+
+  node = _static_analysis_pass(node, ctx)
+  # Past this point, line numbers are no longer accurate so we ignore the
+  # source.
+  # TODO(mdan): Is it feasible to reconstruct intermediate source code?
+  ctx.source_code = None
+  node = decorators.transform(node, nocompile_decorators)
+  node = break_statements.transform(node, ctx)
+  node = asserts.transform(node, ctx)
+
+  # Note: sequencing continue canonicalization before for loop one avoids
+  # dealing with the extra loop increment operation that the for
+  # canonicalization creates.
+  node = continue_statements.transform(node, ctx)
+  ctx.namespace['len'] = len
+
+  node = _static_analysis_pass(node, ctx)
+  node = for_loops.transform(node, ctx)
+  # for_loops may insert new global references.
+  node = builtin_functions.transform(node, ctx)
+  # TODO(mdan): Kept for CL consistency. Remove.
+  # builtin_functions may insert new global references.
+  ctx.namespace['print'] = print
+
+  node = _static_analysis_pass(node, ctx)
+  node = call_trees.transform(node, ctx, config.DEFAULT_UNCOMPILED_MODULES,
+                              nocompile_decorators)
+  node = control_flow.transform(node, ctx)
+
+  # control_flow may create new symbols and change scopes.
+  node = _static_analysis_pass(node, ctx)
+  node = logical_expressions.transform(node)
+  node = side_effect_guards.transform(node, ctx)
+
+  return node
diff --git a/tensorflow/contrib/py2tf/impl/conversion_test.py b/tensorflow/contrib/py2tf/impl/conversion_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3888958f19b9fa13b759924c5188722e500e30a1
--- /dev/null
+++ b/tensorflow/contrib/py2tf/impl/conversion_test.py
@@ -0,0 +1,64 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for conversion module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.py2tf.impl import conversion
+from tensorflow.python.platform import test
+
+
+class ConversionTest(test.TestCase):
+
+  def test_entity_to_graph_unsupported_types(self):
+    with self.assertRaises(ValueError):
+      conversion_map = conversion.ConversionMap(True, (), ())
+      conversion.entity_to_graph('dummy', conversion_map, None, None)
+
+  def test_entity_to_graph_callable(self):
+
+    def f(a):
+      return a
+
+    conversion_map = conversion.ConversionMap(True, (), ())
+    ast, new_name = conversion.entity_to_graph(f, conversion_map, None, None)
+    self.assertTrue(isinstance(ast, gast.FunctionDef), ast)
+    self.assertEqual('tf__f', new_name)
+
+  def test_entity_to_graph_call_tree(self):
+
+    def g(a):
+      return a
+
+    def f(a):
+      return g(a)
+
+    conversion_map = conversion.ConversionMap(True, (), ())
+    conversion.entity_to_graph(f, conversion_map, None, None)
+
+    self.assertTrue(f in conversion_map.dependency_cache)
+    self.assertTrue(g in conversion_map.dependency_cache)
+    self.assertEqual('tf__f', conversion_map.dependency_cache[f].name)
+    self.assertEqual(
+        'tf__g', conversion_map.dependency_cache[f].body[0].value.func.id)
+    self.assertEqual('tf__g', conversion_map.dependency_cache[g].name)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/impl/naming.py b/tensorflow/contrib/py2tf/impl/naming.py
new file mode 100644
index 0000000000000000000000000000000000000000..51326091de13715c32d0a79279f1d3274e48ad10
--- /dev/null
+++ b/tensorflow/contrib/py2tf/impl/naming.py
@@ -0,0 +1,132 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Symbol naming utilities."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.pyct import qual_names
+
+
+class Namer(object):
+  """Implementation of the namer interfaces required by various converters.
+
+  This implementation performs additional tasks like keeping track of the
+  function calls that have been encountered and replaced with calls to their
+  corresponding compiled counterparts.
+
+  Interfaces currently implemented:
+    * call_trees.FunctionNamer
+    * control_flow.SymbolNamer
+    * side_effect_guards.SymbolNamer
+  """
+
+  def __init__(self, global_namespace, recursive, name_map, partial_types):
+    self.global_namespace = global_namespace
+    self.recursive = recursive
+    self.partial_types = partial_types
+
+    self.renamed_calls = {}
+    if name_map is not None:
+      self.renamed_calls.update(name_map)
+
+    self.generated_names = set()
+
+  def compiled_class_name(self, original_fqn, live_entity=None):
+    """See call_trees.FunctionNamer.compiled_class_name."""
+    if live_entity is not None and live_entity in self.renamed_calls:
+      return self.renamed_calls[live_entity]
+
+    if isinstance(original_fqn, tuple):
+      original_name = '__'.join(original_fqn)
+    else:
+      original_name = original_fqn
+
+    new_name_root = 'Tf%s' % original_name
+    new_name = new_name_root
+    n = 0
+    while new_name in self.global_namespace:
+      n += 1
+      new_name = '%s_%d' % (new_name_root, n)
+
+    if live_entity is not None:
+      self.renamed_calls[live_entity] = new_name
+    self.generated_names.add(new_name)
+    if live_entity is not None:
+      self.renamed_calls[live_entity] = new_name
+    return new_name
+
+  def compiled_function_name(self,
+                             original_fqn,
+                             live_entity=None,
+                             owner_type=None):
+    """See call_trees.FunctionNamer.compiled_function_name."""
+
+    if not self.recursive:
+      return None, False
+
+    if owner_type is not None and owner_type not in self.partial_types:
+      # Members are not renamed when part of an entire converted class.
+      return None, False
+
+    if isinstance(original_fqn, tuple):
+      original_name = '__'.join(original_fqn)
+    else:
+      original_name = original_fqn
+
+    if live_entity is not None and live_entity in self.renamed_calls:
+      return self.renamed_calls[live_entity], True
+
+    new_name_root = 'tf__%s' % original_name
+    new_name = new_name_root
+    n = 0
+    while new_name in self.global_namespace:
+      n += 1
+      new_name = '%s_%d' % (new_name_root, n)
+
+    if live_entity is not None:
+      self.renamed_calls[live_entity] = new_name
+    self.generated_names.add(new_name)
+
+    return new_name, True
+
+  def new_symbol(self, name_root, reserved_locals):
+    """See control_flow.SymbolNamer.new_symbol."""
+    # reserved_locals may contain QNs.
+    all_reserved_locals = set()
+    for s in reserved_locals:
+      if isinstance(s, qual_names.QN):
+        all_reserved_locals.update(s.qn)
+      elif isinstance(s, str):
+        all_reserved_locals.add(s)
+      else:
+        raise ValueError('Unexpected symbol type "%s"' % type(s))
+
+    pieces = name_root.split('_')
+    if pieces[-1].isdigit():
+      name_root = '_'.join(pieces[:-1])
+      n = int(pieces[-1])
+    else:
+      n = 0
+    new_name = name_root
+
+    while (new_name in self.global_namespace or
+           new_name in all_reserved_locals or new_name in self.generated_names):
+      n += 1
+      new_name = '%s_%d' % (name_root, n)
+
+    self.generated_names.add(new_name)
+    return new_name
diff --git a/tensorflow/contrib/py2tf/impl/naming_test.py b/tensorflow/contrib/py2tf/impl/naming_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..beb4e54937bbb91b19157c9b9e3c528353206c62
--- /dev/null
+++ b/tensorflow/contrib/py2tf/impl/naming_test.py
@@ -0,0 +1,77 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for naming module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.impl import naming
+from tensorflow.python.platform import test
+
+
+class NamerTest(test.TestCase):
+
+  def test_compiled_function_name_tracks_names(self):
+    def bar():
+      pass
+
+    namer = naming.Namer({}, True, None, ())
+    self.assertEqual(('tf__foo', True), namer.compiled_function_name('foo'))
+    self.assertEqual(('tf__bar', True), namer.compiled_function_name(
+        'bar', bar))
+    self.assertEqual({bar: 'tf__bar'}, namer.renamed_calls)
+    self.assertItemsEqual(('tf__bar', 'tf__foo'), namer.generated_names)
+
+  def test_compiled_function_name_consistent(self):
+    def foo():
+      pass
+
+    namer = naming.Namer({}, True, None, ())
+    self.assertEqual(('tf__foo', True), namer.compiled_function_name(
+        'foo', foo))
+    self.assertEqual(('tf__foo', True), namer.compiled_function_name(
+        'foo', foo))
+
+  def test_compiled_function_name_avoids_global_conflicts(self):
+    def foo():
+      pass
+
+    namer = naming.Namer({'tf__foo': 1}, True, None, ())
+    self.assertEqual(('tf__foo_1', True),
+                     namer.compiled_function_name('foo', foo))
+
+  def test_new_symbol_tracks_names(self):
+    namer = naming.Namer({}, True, None, ())
+    self.assertEqual('temp', namer.new_symbol('temp', set()))
+    self.assertItemsEqual(('temp',), namer.generated_names)
+
+  def test_new_symbol_avoids_duplicates(self):
+    namer = naming.Namer({}, True, None, ())
+    self.assertEqual('temp', namer.new_symbol('temp', set()))
+    self.assertEqual('temp_1', namer.new_symbol('temp', set()))
+    self.assertItemsEqual(('temp', 'temp_1'), namer.generated_names)
+
+  def test_new_symbol_avoids_conflicts(self):
+    namer = naming.Namer({'temp': 1}, True, None, ())
+    # temp is reserved in the global namespace
+    self.assertEqual('temp_1', namer.new_symbol('temp', set()))
+    # temp_2 is reserved in the local namespace
+    self.assertEqual('temp_3', namer.new_symbol('temp', set(('temp_2',))))
+    self.assertItemsEqual(('temp_1', 'temp_3'), namer.generated_names)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/BUILD b/tensorflow/contrib/py2tf/pyct/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e3c0da4b10f9ffbee1b2a906b64d4762f41d97b4
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/BUILD
@@ -0,0 +1,114 @@
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "pyct",
+    srcs = [
+        "__init__.py",
+        "anno.py",
+        "ast_util.py",
+        "compiler.py",
+        "context.py",
+        "parser.py",
+        "pretty_printer.py",
+        "qual_names.py",
+        "templates.py",
+        "transformer.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "@astor_archive//:astor",
+        "@gast_archive//:gast",
+        "@six_archive//:six",
+        "@termcolor_archive//:termcolor",
+    ],
+)
+
+py_test(
+    name = "anno_test",
+    srcs = ["anno_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "ast_util_test",
+    srcs = ["ast_util_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pyct",
+        "//tensorflow/python:client_testlib",
+        "@gast_archive//:gast",
+    ],
+)
+
+py_test(
+    name = "compiler_test",
+    srcs = ["compiler_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pyct",
+        "//tensorflow/python:client_testlib",
+        "@gast_archive//:gast",
+    ],
+)
+
+py_test(
+    name = "parser_test",
+    srcs = ["parser_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "pretty_printer_test",
+    srcs = ["pretty_printer_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "qual_names_test",
+    srcs = ["qual_names_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "templates_test",
+    srcs = ["templates_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pyct",
+        "//tensorflow/python:client_testlib",
+        "@gast_archive//:gast",
+    ],
+)
diff --git a/tensorflow/contrib/py2tf/pyct/__init__.py b/tensorflow/contrib/py2tf/pyct/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d787e56bbecbd1d891fdf41207256c4c5096224f
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python source code transformation library."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/py2tf/pyct/anno.py b/tensorflow/contrib/py2tf/pyct/anno.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a0528b6d0b65b6604930b7a13d8493af9d61f02
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/anno.py
@@ -0,0 +1,72 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Handling annotations on AST nodes.
+
+Adapted from Tangent.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from enum import Enum
+
+
+class NoValue(Enum):
+
+  def __repr__(self):
+    return self.name
+
+
+class Basic(NoValue):
+  """Container for annotation keys.
+
+  The enum values are used strictly for documentation purposes.
+  """
+
+  QN = 'Qualified name, as it appeared in the code.'
+  SKIP_PROCESSING = (
+      'This node should be preserved as is and not processed any further.')
+  INDENT_BLOCK_REMAINDER = (
+      'When a node is annotated with this, the remainder of the block should '
+      'be indented below it. The annotation contains a tuple '
+      '(new_body, name_map), where `new_body` is the new indented block and '
+      '`name_map` allows renaming symbols.')
+
+
+def getanno(node, key, field_name='___pyct_anno'):
+  return getattr(node, field_name)[key]
+
+
+def hasanno(node, key, field_name='___pyct_anno'):
+  return hasattr(node, field_name) and key in getattr(node, field_name)
+
+
+def setanno(node, key, value, field_name='___pyct_anno'):
+  annotations = getattr(node, field_name, {})
+  setattr(node, field_name, annotations)
+  annotations[key] = value
+
+  # So that the annotations survive gast_to_ast() and ast_to_gast()
+  if field_name not in node._fields:
+    node._fields += (field_name,)
+
+
+def delanno(node, key, field_name='___pyct_anno'):
+  annotations = getattr(node, field_name)
+  del annotations[key]
+  if not annotations:
+    delattr(node, field_name)
+    node._fields = tuple(f for f in node._fields if f != field_name)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_impl.py b/tensorflow/contrib/py2tf/pyct/anno_test.py
similarity index 51%
rename from tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_impl.py
rename to tensorflow/contrib/py2tf/pyct/anno_test.py
index a640dfe7dfbcce96261589c7fc49107deaefdd54..ff40bfe1f50ae731648afdf509c26c3a70d3f6cb 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_impl.py
+++ b/tensorflow/contrib/py2tf/pyct/anno_test.py
@@ -12,37 +12,36 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Sigmoid bijector."""
+"""Tests for anno module."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops.distributions import bijector
+import ast
 
+from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.python.platform import test
 
-__all__ = [
-    "Sigmoid",
-]
 
+class AnnoTest(test.TestCase):
 
-class Sigmoid(bijector.Bijector):
-  """Bijector which computes `Y = g(X) = 1 / (1 + exp(-X))`."""
+  def test_basic(self):
+    node = ast.Name()
 
-  def __init__(self, validate_args=False, name="sigmoid"):
-    super(Sigmoid, self).__init__(
-        event_ndims=0, validate_args=validate_args, name=name)
+    self.assertFalse(anno.hasanno(node, 'foo'))
+    with self.assertRaises(AttributeError):
+      anno.getanno(node, 'foo')
 
-  def _forward(self, x):
-    return math_ops.sigmoid(x)
+    anno.setanno(node, 'foo', 3)
+    self.assertTrue(anno.hasanno(node, 'foo'))
+    self.assertEqual(3, anno.getanno(node, 'foo'))
 
-  def _inverse(self, y):
-    return math_ops.log(y) - math_ops.log1p(-y)
+    anno.delanno(node, 'foo')
+    self.assertFalse(anno.hasanno(node, 'foo'))
+    with self.assertRaises(AttributeError):
+      anno.getanno(node, 'foo')
 
-  def _inverse_log_det_jacobian(self, y):
-    return -math_ops.log(y) - math_ops.log1p(-y)
 
-  def _forward_log_det_jacobian(self, x):
-    return -nn_ops.softplus(-x) - nn_ops.softplus(x)
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/ast_util.py b/tensorflow/contrib/py2tf/pyct/ast_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..f916775b9cf3cec960ec2896c334f1d737862205
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/ast_util.py
@@ -0,0 +1,96 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Copy an AST tree, discarding annotations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ast
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import anno
+
+
+class CleanCopier(gast.NodeVisitor):
+  """Copy AST nodes.
+
+  The copied nodes will ignore almost all fields that prefixed by '__'.
+  Exceptions make some annotations.
+  """
+
+  # TODO(mdan): Parametrize which annotations get carried over.
+
+  def generic_visit(self, node):
+    new_fields = {}
+    for f in node._fields:
+      if f.startswith('__'):
+        continue
+      if not hasattr(node, f):
+        continue
+      v = getattr(node, f)
+      if isinstance(v, list):
+        v = [self.generic_visit(n) for n in v]
+      elif isinstance(v, tuple):
+        v = tuple(self.generic_visit(n) for n in v)
+      elif isinstance(v, (gast.AST, ast.AST)):
+        v = self.generic_visit(v)
+      else:
+        # Assume everything else is a value type.
+        pass
+      new_fields[f] = v
+    new_node = type(node)(**new_fields)
+    if anno.hasanno(node, anno.Basic.SKIP_PROCESSING):
+      anno.setanno(new_node, anno.Basic.SKIP_PROCESSING, True)
+    return new_node
+
+
+def copy_clean(node):
+  copier = CleanCopier()
+  if isinstance(node, list):
+    return [copier.visit(n) for n in node]
+  elif isinstance(node, tuple):
+    return tuple(copier.visit(n) for n in node)
+  else:
+    return copier.visit(node)
+
+
+class SymbolRenamer(gast.NodeTransformer):
+  """Transformer that can rename symbols to a simple names."""
+
+  def __init__(self, name_map):
+    self.name_map = name_map
+
+  def _process(self, node):
+    qn = anno.getanno(node, anno.Basic.QN)
+    if qn in self.name_map:
+      return gast.Name(str(self.name_map[qn]), node.ctx, None)
+    return self.generic_visit(node)
+
+  def visit_Name(self, node):
+    return self._process(node)
+
+  def visit_Attribute(self, node):
+    return self._process(node)
+
+
+def rename_symbols(node, name_map):
+  renamer = SymbolRenamer(name_map)
+  if isinstance(node, list):
+    return [renamer.visit(n) for n in node]
+  elif isinstance(node, tuple):
+    return tuple(renamer.visit(n) for n in node)
+  return renamer.visit(node)
diff --git a/tensorflow/contrib/py2tf/pyct/ast_util_test.py b/tensorflow/contrib/py2tf/pyct/ast_util_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0b00c178168f96e656c57cc75a76e6da8af1d8a
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/ast_util_test.py
@@ -0,0 +1,79 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ast_util module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ast
+
+from tensorflow.contrib.py2tf.pyct import ast_util
+from tensorflow.contrib.py2tf.pyct import qual_names
+from tensorflow.python.platform import test
+
+
+class AstUtilTest(test.TestCase):
+
+  def test_rename_symbols(self):
+    node = ast.Tuple([
+        ast.Name('a', ast.Load()),
+        ast.Name('b', ast.Load()),
+        ast.Attribute(ast.Name('b', None), 'c', ast.Store()),
+        ast.Attribute(
+            ast.Attribute(ast.Name('b', None), 'c', ast.Load()), 'd',
+            None)
+    ], None)
+    node = qual_names.resolve(node)
+    node = ast_util.rename_symbols(
+        node,
+        {
+            qual_names.QN('a'): qual_names.QN('renamed_a'),
+            qual_names.QN('b.c'): qual_names.QN('renamed_b_c'),
+        })
+
+    self.assertEqual(node.elts[0].id, 'renamed_a')
+    self.assertTrue(isinstance(node.elts[0].ctx, ast.Load))
+    self.assertEqual(node.elts[1].id, 'b')
+    self.assertEqual(node.elts[2].id, 'renamed_b_c')
+    self.assertTrue(isinstance(node.elts[2].ctx, ast.Store))
+    self.assertEqual(node.elts[3].value.id, 'renamed_b_c')
+    self.assertTrue(isinstance(node.elts[3].value.ctx, ast.Load))
+
+  def test_copy_clean(self):
+    ret = ast.Return(
+        ast.BinOp(
+            op=ast.Add(),
+            left=ast.Name(id='a', ctx=ast.Load()),
+            right=ast.Num(1)))
+    setattr(ret, '__foo', 'bar')
+    node = ast.FunctionDef(
+        name='f',
+        args=ast.arguments(
+            args=[ast.Name(id='a', ctx=ast.Param())],
+            vararg=None,
+            kwarg=None,
+            defaults=[]),
+        body=[ret],
+        decorator_list=[],
+        returns=None)
+    new_node = ast_util.copy_clean(node)
+    self.assertFalse(node is new_node)
+    self.assertFalse(ret is new_node.body[0])
+    self.assertFalse(hasattr(new_node.body[0], '__foo'))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/compiler.py b/tensorflow/contrib/py2tf/pyct/compiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..51cf6930e8bcb3728ee55bf5d4781f01a5ef73bd
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/compiler.py
@@ -0,0 +1,72 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Converting AST to code.
+
+Adapted from Tangent.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# TODO(mdan): Use six for compatibility here.
+import atexit
+import imp
+import os
+import tempfile
+
+import astor
+import gast
+
+
+def ast_to_source(node, indentation):
+  """Return the source code of given AST."""
+  if isinstance(node, gast.AST):
+    node = gast.gast_to_ast(node)
+  generator = astor.codegen.SourceGenerator(indentation, False,
+                                            astor.string_repr.pretty_string)
+  generator.visit(node)
+  generator.result.append('\n')
+  return astor.source_repr.pretty_source(generator.result).lstrip()
+
+
+def ast_to_object(
+    node, indentation='  ', source_prefix=None, delete_on_exit=True):
+  """Return the Python objects represented by given AST.
+
+  Compiling the AST code this way ensures that the source code is readable by
+  e.g. `pdb` or `inspect`.
+
+  Args:
+    node: The code to compile, as an AST object.
+    indentation: The string to use for indentation.
+    source_prefix: Optional string to print as-is into the source file.
+    delete_on_exit: Whether to delete the temporary file used for compilation
+        on exit.
+
+  Returns:
+    A module object containing the compiled source code.
+  """
+  source = ast_to_source(node, indentation)
+
+  with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
+    module_name = os.path.basename(f.name[:-3])
+    if source_prefix:
+      f.write(source_prefix)
+      f.write('\n')
+    f.write(source)
+  if delete_on_exit:
+    atexit.register(lambda: os.remove(f.name))
+  return imp.load_source(module_name, f.name), source
diff --git a/tensorflow/contrib/py2tf/pyct/compiler_test.py b/tensorflow/contrib/py2tf/pyct/compiler_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1f84238efa7dd6fc0748748a2cb4f074572b4c6
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/compiler_test.py
@@ -0,0 +1,91 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for compiler module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import textwrap
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import compiler
+from tensorflow.python.platform import test
+
+
+class CompilerTest(test.TestCase):
+
+  def test_ast_to_source(self):
+    node = gast.If(
+        test=gast.Num(1),
+        body=[
+            gast.Assign(
+                targets=[gast.Name('a', gast.Store(), None)],
+                value=gast.Name('b', gast.Load(), None))
+        ],
+        orelse=[
+            gast.Assign(
+                targets=[gast.Name('a', gast.Store(), None)],
+                value=gast.Str('c'))
+        ])
+
+    self.assertEqual(
+        textwrap.dedent("""
+            if 1:
+              a = b
+            else:
+              a = 'c'
+        """).strip(),
+        compiler.ast_to_source(node, indentation='  ').strip())
+
+  def test_ast_to_object(self):
+    node = gast.FunctionDef(
+        name='f',
+        args=gast.arguments(
+            args=[gast.Name('a', gast.Param(), None)],
+            vararg=None,
+            kwonlyargs=[],
+            kwarg=None,
+            defaults=[],
+            kw_defaults=[]),
+        body=[
+            gast.Return(
+                gast.BinOp(
+                    op=gast.Add(),
+                    left=gast.Name('a', gast.Load(), None),
+                    right=gast.Num(1)))
+        ],
+        decorator_list=[],
+        returns=None)
+
+    module, source = compiler.ast_to_object(node)
+
+    expected_source = """
+      def f(a):
+        return a + 1
+    """
+    self.assertEqual(
+        textwrap.dedent(expected_source).strip(),
+        source.strip())
+    self.assertEqual(2, module.f(1))
+    with open(module.__file__, 'r') as temp_output:
+      self.assertEqual(
+          textwrap.dedent(expected_source).strip(),
+          temp_output.read().strip())
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/context.py b/tensorflow/contrib/py2tf/pyct/context.py
new file mode 100644
index 0000000000000000000000000000000000000000..fef74ebefa290369c7310af6d7e4faeef44d9aee
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/context.py
@@ -0,0 +1,43 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Conversion context containers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+class EntityContext(object):
+  """Contains information about an entity, like source code.
+
+  Attributes:
+    namer: Namer that matches the contract of all converters.
+    source_code: The entity's source code.
+    source_file: The entity's source file.
+    namespace: Dict[str->*], containing symbols visible to the entity
+        (excluding parameters).
+    arg_values: Dict[str->*], containing parameter values, if known.
+    arg_types: Dict[str->*], containing parameter types, if known.
+  """
+
+  def __init__(self, namer, source_code, source_file, namespace, arg_values,
+               arg_types, recursive):
+    self.namer = namer
+    self.source_code = source_code
+    self.source_file = source_file
+    self.namespace = namespace
+    self.arg_values = {} if arg_values is None else arg_values
+    self.arg_types = {} if arg_types is None else arg_types
+    self.recursive = recursive
diff --git a/tensorflow/contrib/quantize/python/copy_graph.py b/tensorflow/contrib/py2tf/pyct/parser.py
similarity index 65%
rename from tensorflow/contrib/quantize/python/copy_graph.py
rename to tensorflow/contrib/py2tf/pyct/parser.py
index 0376fcba82b99feabdba3b683f9db9a32db51efb..dc7df883b349becd860bb0dbceab22cb39c750b5 100644
--- a/tensorflow/contrib/quantize/python/copy_graph.py
+++ b/tensorflow/contrib/py2tf/pyct/parser.py
@@ -12,21 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utility to copy a tf.Graph."""
+"""Converting code to AST.
+
+Adapted from Tangent.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import ops
-from tensorflow.python.training import saver as saver_lib
+import textwrap
+
+import gast
+
+from tensorflow.python.util import tf_inspect
+
+
+def parse_entity(entity):
+  """Return the AST of given entity."""
+  source = tf_inspect.getsource(entity)
+  source = textwrap.dedent(source)
+  return parse_str(source), source
 
 
-def CopyGraph(graph):
-  """Return a copy of graph."""
-  meta_graph = saver_lib.export_meta_graph(
-      graph=graph, collection_list=graph.get_all_collection_keys())
-  graph_copy = ops.Graph()
-  with graph_copy.as_default():
-    _ = saver_lib.import_meta_graph(meta_graph)
-  return graph_copy
+def parse_str(src):
+  """Return the AST of given piece of code."""
+  return gast.parse(src)
diff --git a/tensorflow/contrib/py2tf/pyct/parser_test.py b/tensorflow/contrib/py2tf/pyct/parser_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f35dfa04c70dc191078248c32f9a04d28133129a
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/parser_test.py
@@ -0,0 +1,47 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for parser module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import textwrap
+
+from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.python.platform import test
+
+
+def f(x):
+  return x + 1
+
+
+class ParserTest(test.TestCase):
+
+  def test_parse_entity(self):
+    mod, _ = parser.parse_entity(f)
+    self.assertEqual('f', mod.body[0].name)
+
+  def test_parse_str(self):
+    mod = parser.parse_str(
+        textwrap.dedent("""
+        def f(x):
+          return x + 1
+    """))
+    self.assertEqual('f', mod.body[0].name)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/pretty_printer.py b/tensorflow/contrib/py2tf/pyct/pretty_printer.py
new file mode 100644
index 0000000000000000000000000000000000000000..bacc1e4a7774ec5b84495255042392fe089150d5
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/pretty_printer.py
@@ -0,0 +1,113 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Print an AST tree in a form more readable than ast.dump."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+import termcolor
+
+
+class PrettyPrinter(gast.NodeVisitor):
+  """Print AST nodes."""
+
+  def __init__(self, color):
+    self.indent_lvl = 0
+    self.result = ''
+    self.color = color
+
+  def _color(self, string, color, attrs=None):
+    if self.color:
+      return termcolor.colored(string, color, attrs=attrs)
+    return string
+
+  def _type(self, node):
+    return self._color(node.__class__.__name__, None, ['bold'])
+
+  def _field(self, name):
+    return self._color(name, 'blue')
+
+  def _value(self, name):
+    return self._color(name, 'magenta')
+
+  def _warning(self, name):
+    return self._color(name, 'red')
+
+  def _indent(self):
+    return self._color('| ' * self.indent_lvl, None, ['dark'])
+
+  def _print(self, s):
+    self.result += s
+    self.result += '\n'
+
+  def generic_visit(self, node, name=None):
+    if node._fields:
+      cont = ':'
+    else:
+      cont = '()'
+
+    if name:
+      self._print('%s%s=%s%s' % (self._indent(), self._field(name),
+                                 self._type(node), cont))
+    else:
+      self._print('%s%s%s' % (self._indent(), self._type(node), cont))
+
+    self.indent_lvl += 1
+    for f in node._fields:
+      if not hasattr(node, f):
+        self._print('%s%s' % (self._indent(), self._warning('%s=<unset>' % f)))
+        continue
+      v = getattr(node, f)
+      if isinstance(v, list):
+        if v:
+          self._print('%s%s=[' % (self._indent(), self._field(f)))
+          self.indent_lvl += 1
+          for n in v:
+            self.generic_visit(n)
+          self.indent_lvl -= 1
+          self._print('%s]' % (self._indent()))
+        else:
+          self._print('%s%s=[]' % (self._indent(), self._field(f)))
+      elif isinstance(v, tuple):
+        if v:
+          self._print('%s%s=(' % (self._indent(), self._field(f)))
+          self.indent_lvl += 1
+          for n in v:
+            self.generic_visit(n)
+          self.indent_lvl -= 1
+          self._print('%s)' % (self._indent()))
+        else:
+          self._print('%s%s=()' % (self._indent(), self._field(f)))
+      elif isinstance(v, gast.AST):
+        self.generic_visit(v, f)
+      elif isinstance(v, str):
+        self._print('%s%s=%s' % (self._indent(), self._field(f),
+                                 self._value('"%s"' % v)))
+      else:
+        self._print('%s%s=%s' % (self._indent(), self._field(f),
+                                 self._value(v)))
+    self.indent_lvl -= 1
+
+
+def fmt(node, color=True):
+  printer = PrettyPrinter(color)
+  if isinstance(node, (list, tuple)):
+    for n in node:
+      printer.visit(n)
+  else:
+    printer.visit(node)
+  return printer.result
diff --git a/tensorflow/contrib/py2tf/pyct/pretty_printer_test.py b/tensorflow/contrib/py2tf/pyct/pretty_printer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..81e3f47b80b6cb3bb7ba9f4a1787d03df4151a99
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/pretty_printer_test.py
@@ -0,0 +1,52 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for pretty_printer module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ast
+
+from tensorflow.contrib.py2tf.pyct import pretty_printer
+from tensorflow.python.platform import test
+
+
+class PrettyPrinterTest(test.TestCase):
+
+  def test_format(self):
+    node = ast.FunctionDef(
+        name='f',
+        args=ast.arguments(
+            args=[ast.Name(id='a', ctx=ast.Param())],
+            vararg=None,
+            kwarg=None,
+            defaults=[]),
+        body=[
+            ast.Return(
+                ast.BinOp(
+                    op=ast.Add(),
+                    left=ast.Name(id='a', ctx=ast.Load()),
+                    right=ast.Num(1)))
+        ],
+        decorator_list=[],
+        returns=None)
+    # Just checking for functionality, the color control characters make it
+    # difficult to inspect the result.
+    self.assertIsNotNone(pretty_printer.fmt(node))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/qual_names.py b/tensorflow/contrib/py2tf/pyct/qual_names.py
new file mode 100644
index 0000000000000000000000000000000000000000..8717ee6cff198ff31f6cbdb7213e5a8dd3df1149
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/qual_names.py
@@ -0,0 +1,104 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for manipulating qualified names.
+
+A qualified name is a uniform way to refer to simple (e.g. 'foo') and composite
+(e.g. 'foo.bar') syntactic symbols.
+
+This is *not* related to the __qualname__ attribute used by inspect, which
+refers to scopes.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import anno
+
+
+class QN(object):
+  """Represents a qualified name."""
+
+  def __init__(self, base, attr=None):
+    if attr:
+      if not isinstance(base, QN):
+        raise ValueError('For attribute QNs, base must be a QN.')
+      self._parent = base
+      self.qn = base.qn + (attr,)
+    else:
+      if isinstance(base, QN):
+        if base.is_composite():
+          self._parent = base.parent
+        else:
+          self._parent = None
+        self.qn = base.qn
+      else:
+        self._parent = None
+        self.qn = tuple(base.split('.'))
+
+  def is_composite(self):
+    return len(self.qn) > 1
+
+  @property
+  def parent(self):
+    if self._parent is None:
+      raise ValueError('Cannot get parent of simple name "%s".' % self.qn[0])
+    return self._parent
+
+  def __hash__(self):
+    return hash(self.qn)
+
+  def __eq__(self, other):
+    return self.qn == other.qn
+
+  def __str__(self):
+    return '.'.join(self.qn)
+
+  def __repr__(self):
+    return str(self)
+
+  def ssf(self):
+    """Simple symbol form."""
+    return '_'.join(self.qn)
+
+  def ast(self):
+    # The caller must adjust the context appropriately.
+    if self.is_composite():
+      return gast.Attribute(self.parent.ast(), self.qn[-1], None)
+    return gast.Name(self.qn[0], None, None)
+
+
+class QnResolver(gast.NodeTransformer):
+  """Annotates nodes with QN information.
+
+  Note: Not using NodeAnnos to avoid circular dependencies.
+  """
+
+  def visit_Name(self, node):
+    self.generic_visit(node)
+    anno.setanno(node, anno.Basic.QN, QN(node.id))
+    return node
+
+  def visit_Attribute(self, node):
+    self.generic_visit(node)
+    anno.setanno(node, anno.Basic.QN,
+                 QN(anno.getanno(node.value, anno.Basic.QN), node.attr))
+    return node
+
+
+def resolve(node):
+  return QnResolver().visit(node)
diff --git a/tensorflow/contrib/py2tf/pyct/qual_names_test.py b/tensorflow/contrib/py2tf/pyct/qual_names_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b1eee2deca18bb0540c17d6ee85d421602aa2b7
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/qual_names_test.py
@@ -0,0 +1,108 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for qual_names module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import textwrap
+
+from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.contrib.py2tf.pyct import qual_names
+from tensorflow.python.platform import test
+
+
+class QNTest(test.TestCase):
+
+  def test_basic(self):
+    a = qual_names.QN('a')
+    self.assertEqual(a.qn, ('a',))
+    self.assertEqual(str(a), 'a')
+    self.assertEqual(a.ssf(), 'a')
+    self.assertEqual(a.ast().id, 'a')
+    self.assertFalse(a.is_composite())
+    with self.assertRaises(ValueError):
+      _ = a.parent
+
+    a_b = qual_names.QN(a, 'b')
+    self.assertEqual(a_b.qn, ('a', 'b'))
+    self.assertEqual(str(a_b), 'a.b')
+    self.assertEqual(a_b.ssf(), 'a_b')
+    self.assertEqual(a_b.ast().value.id, 'a')
+    self.assertEqual(a_b.ast().attr, 'b')
+    self.assertTrue(a_b.is_composite())
+    self.assertEqual(a_b.parent.qn, ('a',))
+
+    a2 = qual_names.QN(a)
+    self.assertEqual(a2.qn, ('a',))
+    with self.assertRaises(ValueError):
+      _ = a.parent
+
+    a_b2 = qual_names.QN(a_b)
+    self.assertEqual(a_b2.qn, ('a', 'b'))
+    self.assertEqual(a_b2.parent.qn, ('a',))
+
+    self.assertTrue(a2 == a)
+    self.assertFalse(a2 is a)
+
+    self.assertTrue(a_b.parent == a)
+    self.assertTrue(a_b2.parent == a)
+
+    self.assertTrue(a_b2 == a_b)
+    self.assertFalse(a_b2 is a_b)
+    self.assertFalse(a_b2 == a)
+
+    with self.assertRaises(ValueError):
+      qual_names.QN('a', 'b')
+
+  def test_hashable(self):
+    d = {qual_names.QN('a'): 'a', qual_names.QN('b'): 'b'}
+
+    self.assertEqual(d[qual_names.QN('a')], 'a')
+    self.assertEqual(d[qual_names.QN('b')], 'b')
+    self.assertTrue(qual_names.QN('c') not in d)
+
+
+class QNResolverTest(test.TestCase):
+
+  def assertQNStringIs(self, node, qn_str):
+    self.assertEqual(str(anno.getanno(node, anno.Basic.QN)), qn_str)
+
+  def test_resolve(self):
+    samples = """
+      a
+      a.b
+      (c, d.e)
+      [f, (g.h.i)]
+      j(k, l)
+    """
+    nodes = qual_names.resolve(parser.parse_str(textwrap.dedent(samples)))
+    nodes = tuple(n.value for n in nodes.body)
+
+    self.assertQNStringIs(nodes[0], 'a')
+    self.assertQNStringIs(nodes[1], 'a.b')
+    self.assertQNStringIs(nodes[2].elts[0], 'c')
+    self.assertQNStringIs(nodes[2].elts[1], 'd.e')
+    self.assertQNStringIs(nodes[3].elts[0], 'f')
+    self.assertQNStringIs(nodes[3].elts[1], 'g.h.i')
+    self.assertQNStringIs(nodes[4].func, 'j')
+    self.assertQNStringIs(nodes[4].args[0], 'k')
+    self.assertQNStringIs(nodes[4].args[1], 'l')
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD b/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..fbfce18c60cca4b105e7de3c3ea7b9c3438f6b2a
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/BUILD
@@ -0,0 +1,65 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "static_analysis",
+    srcs = [
+        "activity.py",
+        "annos.py",
+        "live_values.py",
+        "type_info.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/contrib/py2tf/pyct",
+        "@gast_archive//:gast",
+    ],
+)
+
+py_test(
+    name = "activity_test",
+    srcs = ["activity_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":static_analysis",
+        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/python:client_testlib",
+        "@gast_archive//:gast",
+    ],
+)
+
+py_test(
+    name = "live_values_test",
+    srcs = ["live_values_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":static_analysis",
+        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "type_info_test",
+    srcs = ["type_info_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":static_analysis",
+        "//tensorflow/contrib/py2tf/pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/__init__.py b/tensorflow/contrib/py2tf/pyct/static_analysis/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c325e19f28376da3be6db4b00b9f664eac047af2
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Static information resolution.
+
+This module contains utilities to help annotate AST nodes with as much runtime
+information as can be possibly extracted without actually executing the code,
+under that assumption that the context in which the code will run is known.
+
+Note: It's a fair bet that this analysis cannot be reused across contexts
+without re-running it. In most cases, the context usually means referenced
+modules, which should be static enough to allow reuse, but that is not being
+reliably verified.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/activity.py b/tensorflow/contrib/py2tf/pyct/static_analysis/activity.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c93e1603113d48176af7a97a0f37321e6f67586
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/activity.py
@@ -0,0 +1,270 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Activity analysis."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+
+# TODO(mdan): Add support for PY3 (e.g. Param vs arg).
+
+
+class Scope(object):
+  """Encloses local symbol definition and usage information.
+
+  This can track for instance whether a symbol is modified in the current scope.
+  Note that scopes do not necessarily align with Python's scopes. For example,
+  the body of an if statement may be considered a separate scope.
+
+  Attributes:
+    modified: identifiers modified in this scope
+    created: identifiers created in this scope
+    used: identifiers referenced in this scope
+  """
+
+  def __init__(self, parent, isolated=True):
+    """Create a new scope.
+
+    Args:
+      parent: A Scope or None.
+      isolated: Whether the scope is isolated, that is, whether variables
+          created in this scope should be visible to the parent scope.
+    """
+    self.isolated = isolated
+    self.parent = parent
+    self.modified = set()
+    self.created = set()
+    self.used = set()
+    self.params = set()
+    self.returned = set()
+
+  # TODO(mdan): Rename to `locals`
+  @property
+  def referenced(self):
+    if not self.isolated and self.parent is not None:
+      return self.used | self.parent.referenced
+    return self.used
+
+  def __repr__(self):
+    return 'Scope{r=%s, c=%s, w=%s}' % (tuple(self.used), tuple(self.created),
+                                        tuple(self.modified))
+
+  def copy_from(self, other):
+    self.modified = copy.copy(other.modified)
+    self.created = copy.copy(other.created)
+    self.used = copy.copy(other.used)
+    self.params = copy.copy(other.params)
+    self.returned = copy.copy(other.returned)
+
+  def merge_from(self, other):
+    self.modified |= other.modified
+    self.created |= other.created
+    self.used |= other.used
+    self.params |= other.params
+    self.returned |= other.returned
+
+  def has(self, name):
+    if name in self.modified or name in self.params:
+      return True
+    elif self.parent is not None:
+      return self.parent.has(name)
+    return False
+
+  def is_modified_since_entry(self, name):
+    if name in self.modified:
+      return True
+    elif self.parent is not None and not self.isolated:
+      return self.parent.is_modified_since_entry(name)
+    return False
+
+  def is_param(self, name):
+    if name in self.params:
+      return True
+    elif self.parent is not None and not self.isolated:
+      return self.parent.is_param(name)
+    return False
+
+  def mark_read(self, name):
+    self.used.add(name)
+    if self.parent is not None and name not in self.created:
+      self.parent.mark_read(name)
+
+  def mark_param(self, name):
+    self.params.add(name)
+
+  def mark_creation(self, name):
+    if name.is_composite():
+      parent = name.parent
+      if self.has(parent):
+        # This is considered mutation of the parent, not creation.
+        # TODO(mdan): Is that really so?
+        return
+      else:
+        raise ValueError('Unknown symbol "%s".' % parent)
+    self.created.add(name)
+
+  def mark_write(self, name):
+    self.modified.add(name)
+    if self.isolated:
+      self.mark_creation(name)
+    else:
+      if self.parent is None:
+        self.mark_creation(name)
+      else:
+        if not self.parent.has(name):
+          self.mark_creation(name)
+        self.parent.mark_write(name)
+
+  def mark_returned(self, name):
+    self.returned.add(name)
+    if not self.isolated and self.parent is not None:
+      self.parent.mark_returned(name)
+
+
+class ActivityAnalizer(transformer.Base):
+  """Annotates nodes with local scope information. See Scope."""
+
+  def __init__(self, context, parent_scope):
+    super(ActivityAnalizer, self).__init__(context)
+    self.scope = Scope(parent_scope)
+    self._in_return_statement = False
+
+  def _track_symbol(self, node):
+    qn = anno.getanno(node, anno.Basic.QN)
+
+    if isinstance(node.ctx, gast.Store):
+      self.scope.mark_write(qn)
+    elif isinstance(node.ctx, gast.Load):
+      self.scope.mark_read(qn)
+    elif isinstance(node.ctx, gast.Param):
+      # Param contexts appear in function defs, so they have the meaning of
+      # defining a variable.
+      # TODO(mdan): This bay be incorrect with nested functions.
+      # For nested functions, we'll have to add the notion of hiding args from
+      # the parent scope, not writing to them.
+      self.scope.mark_creation(qn)
+      self.scope.mark_param(qn)
+    else:
+      raise ValueError('Unknown context %s for node %s.' % (type(node.ctx), qn))
+
+    anno.setanno(node, NodeAnno.IS_LOCAL, self.scope.has(qn))
+    anno.setanno(node, NodeAnno.IS_MODIFIED_SINCE_ENTRY,
+                 self.scope.is_modified_since_entry(qn))
+    anno.setanno(node, NodeAnno.IS_PARAM, self.scope.is_param(qn))
+
+    if self._in_return_statement:
+      self.scope.mark_returned(qn)
+
+  def visit_Name(self, node):
+    self.generic_visit(node)
+    self._track_symbol(node)
+    return node
+
+  def visit_Attribute(self, node):
+    self.generic_visit(node)
+    self._track_symbol(node)
+    return node
+
+  def visit_Print(self, node):
+    current_scope = self.scope
+    args_scope = Scope(current_scope)
+    self.scope = args_scope
+    for n in node.values:
+      self.visit(n)
+    anno.setanno(node, NodeAnno.ARGS_SCOPE, args_scope)
+    self.scope = current_scope
+    return node
+
+  def visit_Call(self, node):
+    current_scope = self.scope
+    args_scope = Scope(current_scope, isolated=False)
+    self.scope = args_scope
+    for n in node.args:
+      self.visit(n)
+    # TODO(mdan): Account starargs, kwargs
+    for n in node.keywords:
+      self.visit(n)
+    anno.setanno(node, NodeAnno.ARGS_SCOPE, args_scope)
+    self.scope = current_scope
+    self.visit(node.func)
+    return node
+
+  def _process_block_node(self, node, block, scope_name):
+    current_scope = self.scope
+    block_scope = Scope(current_scope, isolated=False)
+    self.scope = block_scope
+    for n in block:
+      self.visit(n)
+    anno.setanno(node, scope_name, block_scope)
+    self.scope = current_scope
+    return node
+
+  def _process_parallel_blocks(self, parent, children):
+    # Because the scopes are not isolated, processing any child block
+    # modifies the parent state causing the other child blocks to be
+    # processed incorrectly. So we need to checkpoint the parent scope so that
+    # each child sees the same context.
+    before_parent = Scope(None)
+    before_parent.copy_from(self.scope)
+    after_children = []
+    for child, scope_name in children:
+      self.scope.copy_from(before_parent)
+      parent = self._process_block_node(parent, child, scope_name)
+      after_child = Scope(None)
+      after_child.copy_from(self.scope)
+      after_children.append(after_child)
+    for after_child in after_children:
+      self.scope.merge_from(after_child)
+    return parent
+
+  def visit_If(self, node):
+    self.visit(node.test)
+    node = self._process_parallel_blocks(node,
+                                         ((node.body, NodeAnno.BODY_SCOPE),
+                                          (node.orelse, NodeAnno.ORELSE_SCOPE)))
+    return node
+
+  def visit_For(self, node):
+    self.visit(node.target)
+    self.visit(node.iter)
+    node = self._process_parallel_blocks(node,
+                                         ((node.body, NodeAnno.BODY_SCOPE),
+                                          (node.orelse, NodeAnno.ORELSE_SCOPE)))
+    return node
+
+  def visit_While(self, node):
+    self.visit(node.test)
+    node = self._process_parallel_blocks(node,
+                                         ((node.body, NodeAnno.BODY_SCOPE),
+                                          (node.orelse, NodeAnno.ORELSE_SCOPE)))
+    return node
+
+  def visit_Return(self, node):
+    self._in_return_statement = True
+    node = self.generic_visit(node)
+    self._in_return_statement = False
+    return node
+
+
+def resolve(node, context, parent_scope=None):
+  return ActivityAnalizer(context, parent_scope).visit(node)
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/activity_test.py b/tensorflow/contrib/py2tf/pyct/static_analysis/activity_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1eb954a5efef4d6a00ac492e7c85394d54e28c9
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/activity_test.py
@@ -0,0 +1,271 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for activity module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import context
+from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.contrib.py2tf.pyct import qual_names
+from tensorflow.contrib.py2tf.pyct.qual_names import QN
+from tensorflow.contrib.py2tf.pyct.static_analysis import activity
+from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+from tensorflow.python.platform import test
+
+
+class ScopeTest(test.TestCase):
+
+  def test_basic(self):
+    scope = activity.Scope(None)
+    self.assertFalse(scope.has(QN('foo')))
+
+    scope.mark_read(QN('foo'))
+    self.assertFalse(scope.has(QN('foo')))
+
+    scope.mark_write(QN('foo'))
+    self.assertTrue(scope.has(QN('foo')))
+
+    scope.mark_read(QN('bar'))
+    self.assertFalse(scope.has(QN('bar')))
+
+  def test_copy(self):
+    scope = activity.Scope(None)
+    scope.mark_write(QN('foo'))
+
+    other = activity.Scope(None)
+    other.copy_from(scope)
+
+    self.assertTrue(QN('foo') in other.created)
+
+    scope.mark_write(QN('bar'))
+    scope.copy_from(other)
+
+    self.assertFalse(QN('bar') in scope.created)
+
+    scope.mark_write(QN('bar'))
+    scope.merge_from(other)
+
+    self.assertTrue(QN('bar') in scope.created)
+    self.assertFalse(QN('bar') in other.created)
+
+  def test_nesting(self):
+    scope = activity.Scope(None)
+    scope.mark_write(QN('foo'))
+    scope.mark_read(QN('bar'))
+
+    child = activity.Scope(scope)
+    self.assertTrue(child.has(QN('foo')))
+    self.assertTrue(scope.has(QN('foo')))
+
+    child.mark_write(QN('bar'))
+    self.assertTrue(child.has(QN('bar')))
+    self.assertFalse(scope.has(QN('bar')))
+
+  def test_referenced(self):
+    scope = activity.Scope(None)
+    scope.mark_read(QN('a'))
+
+    child = activity.Scope(scope)
+    child.mark_read(QN('b'))
+
+    child2 = activity.Scope(child, isolated=False)
+    child2.mark_read(QN('c'))
+
+    self.assertTrue(QN('c') in child2.referenced)
+    self.assertTrue(QN('b') in child2.referenced)
+    self.assertFalse(QN('a') in child2.referenced)
+
+    self.assertTrue(QN('c') in child.referenced)
+    self.assertTrue(QN('b') in child.referenced)
+    self.assertFalse(QN('a') in child.referenced)
+
+
+class ActivityAnalizerTest(test.TestCase):
+
+  def _parse_and_analyze(self, test_fn):
+    node, source = parser.parse_entity(test_fn)
+    ctx = context.EntityContext(
+        namer=None,
+        source_code=source,
+        source_file=None,
+        namespace={},
+        arg_values=None,
+        arg_types=None,
+        recursive=True)
+    node = qual_names.resolve(node)
+    node = activity.resolve(node, ctx)
+    return node
+
+  def test_local_markers(self):
+
+    def test_fn(a):  # pylint:disable=unused-argument
+      b = c  # pylint:disable=undefined-variable
+      while b > 0:
+        b -= 1
+      return b
+
+    node = self._parse_and_analyze(test_fn)
+    self.assertFalse(
+        anno.getanno(node.body[0].body[0].value,
+                     NodeAnno.IS_LOCAL))  # c in b = c
+    self.assertTrue(
+        anno.getanno(node.body[0].body[1].test.left,
+                     NodeAnno.IS_LOCAL))  # b in b > 0
+    self.assertTrue(
+        anno.getanno(node.body[0].body[2].value,
+                     NodeAnno.IS_LOCAL))  # b in return b
+
+  def assertScopeIs(self, scope, used, modified, created):
+    self.assertItemsEqual(used, tuple(str(s) for s in scope.used))
+    self.assertItemsEqual(modified, tuple(str(s) for s in scope.modified))
+    self.assertItemsEqual(created, tuple(str(s) for s in scope.created))
+
+  def test_print_statement(self):
+
+    def test_fn(a):
+      b = 0
+      c = 1
+      print(a, b)
+      return c
+
+    node = self._parse_and_analyze(test_fn)
+    print_node = node.body[0].body[2]
+    if isinstance(print_node, gast.Print):
+      # Python 2
+      print_args_scope = anno.getanno(print_node, NodeAnno.ARGS_SCOPE)
+    else:
+      # Python 3
+      assert isinstance(print_node, gast.Expr)
+      # The call node should be the one being annotated.
+      print_node = print_node.value
+      print_args_scope = anno.getanno(print_node, NodeAnno.ARGS_SCOPE)
+    # We basically need to detect which variables are captured by the call
+    # arguments.
+    self.assertScopeIs(print_args_scope, ('a', 'b'), (), ())
+
+  def test_call(self):
+
+    def test_fn(a):
+      b = 0
+      c = 1
+      foo(a, b)  # pylint:disable=undefined-variable
+      return c
+
+    node = self._parse_and_analyze(test_fn)
+    call_node = node.body[0].body[2].value
+    # We basically need to detect which variables are captured by the call
+    # arguments.
+    self.assertScopeIs(
+        anno.getanno(call_node, NodeAnno.ARGS_SCOPE), ('a', 'b'), (), ())
+
+  def test_while(self):
+
+    def test_fn(a):
+      b = a
+      while b > 0:
+        c = b
+        b -= 1
+      return b, c
+
+    node = self._parse_and_analyze(test_fn)
+    while_node = node.body[0].body[1]
+    self.assertScopeIs(
+        anno.getanno(while_node, NodeAnno.BODY_SCOPE), ('b',), ('b', 'c'),
+        ('c',))
+    self.assertScopeIs(
+        anno.getanno(while_node, NodeAnno.BODY_SCOPE).parent, ('a', 'b', 'c'),
+        ('b', 'c'), ('a', 'b', 'c'))
+
+  def test_for(self):
+
+    def test_fn(a):
+      b = a
+      for _ in a:
+        c = b
+        b -= 1
+      return b, c
+
+    node = self._parse_and_analyze(test_fn)
+    for_node = node.body[0].body[1]
+    self.assertScopeIs(
+        anno.getanno(for_node, NodeAnno.BODY_SCOPE), ('b',), ('b', 'c'), ('c',))
+    self.assertScopeIs(
+        anno.getanno(for_node, NodeAnno.BODY_SCOPE).parent, ('a', 'b', 'c'),
+        ('b', 'c', '_'), ('a', 'b', 'c', '_'))
+
+  def test_if(self):
+
+    def test_fn(x):
+      if x > 0:
+        x = -x
+        y = 2 * x
+        z = -y
+      else:
+        x = 2 * x
+        y = -x
+        u = -y
+      return z, u
+
+    node = self._parse_and_analyze(test_fn)
+    if_node = node.body[0].body[0]
+    self.assertScopeIs(
+        anno.getanno(if_node, NodeAnno.BODY_SCOPE), ('x', 'y'), ('x', 'y', 'z'),
+        ('y', 'z'))
+    # TODO(mdan): Double check: is it ok to not mark a local symbol as not read?
+    self.assertScopeIs(
+        anno.getanno(if_node, NodeAnno.BODY_SCOPE).parent, ('x', 'z', 'u'),
+        ('x', 'y', 'z', 'u'), ('x', 'y', 'z', 'u'))
+    self.assertScopeIs(
+        anno.getanno(if_node, NodeAnno.ORELSE_SCOPE), ('x', 'y'),
+        ('x', 'y', 'u'), ('y', 'u'))
+    self.assertScopeIs(
+        anno.getanno(if_node, NodeAnno.ORELSE_SCOPE).parent, ('x', 'z', 'u'),
+        ('x', 'y', 'z', 'u'), ('x', 'y', 'z', 'u'))
+
+  def test_call_with_composite_names(self):
+
+    def foo(*_):
+      pass
+
+    def test_fn(a):
+      foo(a.b, a.c)
+      if a > 0:
+        a.b = 2
+      else:
+        d = 2
+        d.e = a.c
+        f = d.e + 1
+        a.c = f
+
+    node = self._parse_and_analyze(test_fn)
+    call_node = node.body[0].body[0].value
+    self.assertScopeIs(
+        anno.getanno(call_node, NodeAnno.ARGS_SCOPE), ('a', 'a.b', 'a.c'), (),
+        ())
+    if_node = node.body[0].body[1]
+    self.assertScopeIs(
+        anno.getanno(if_node, NodeAnno.BODY_SCOPE), ('a',), ('a.b',), ())
+    self.assertScopeIs(
+        anno.getanno(if_node, NodeAnno.ORELSE_SCOPE),
+        ('a', 'a.c', 'd', 'd.e', 'f'), ('a.c', 'd', 'd.e', 'f'), ('d', 'f'))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/annos.py b/tensorflow/contrib/py2tf/pyct/static_analysis/annos.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d8e49442364fdd4a4752c8a83a5f3b76117fe57
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/annos.py
@@ -0,0 +1,50 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Annotations used by the static analizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from enum import Enum
+
+
+class NoValue(Enum):
+
+  def __repr__(self):
+    return self.name
+
+
+class NodeAnno(NoValue):
+  """Additionnal annotations used by the static analyzer.
+
+  These are in addition to the basic annotations declared in anno.py.
+  """
+
+  # Symbols
+
+  IS_LOCAL = 'Symbol is local to the function scope being analized.'
+  IS_PARAM = 'Symbol is a parameter to the function being analized.'
+  IS_MODIFIED_SINCE_ENTRY = (
+      'Symbol has been explicitly replaced in the current function scope.')
+
+  # Scopes
+  ARGS_SCOPE = 'The scope for the argument list of a function call.'
+  BODY_SCOPE = (
+      'The scope for the main body of a statement (True branch for if '
+      'statements, main body for loops).')
+  ORELSE_SCOPE = (
+      'The scope for the orelse body of a statement (False branch for if '
+      'statements, orelse body for loops).')
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c0a9a9e74eccb3d22840032e8f0c2b81e051e7e
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values.py
@@ -0,0 +1,112 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Live value resolution.
+
+Live values are extracted from the known execution context.
+
+Requires activity analysis annotations.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.contrib.py2tf.pyct.static_analysis.annos import NodeAnno
+
+
+class LiveValueResolver(transformer.Base):
+  """Annotates nodes with live values."""
+
+  def __init__(self, context, literals):
+    super(LiveValueResolver, self).__init__(context)
+    self.literals = literals
+
+  def visit_ClassDef(self, node):
+    self.generic_visit(node)
+    anno.setanno(node, 'live_val', self.context.namespace[node.name])
+    return node
+
+  def visit_Name(self, node):
+    self.generic_visit(node)
+    if isinstance(node.ctx, gast.Load):
+      assert anno.hasanno(node, NodeAnno.IS_LOCAL), node
+      symbol_is_local = anno.getanno(node, NodeAnno.IS_LOCAL)
+      assert anno.hasanno(node, NodeAnno.IS_MODIFIED_SINCE_ENTRY), node
+      symbol_is_modified = anno.getanno(node, NodeAnno.IS_MODIFIED_SINCE_ENTRY)
+      assert anno.hasanno(node, NodeAnno.IS_PARAM), node
+      symbol_is_param = anno.getanno(node, NodeAnno.IS_PARAM)
+
+      if not symbol_is_local and not symbol_is_param:
+        if node.id in self.literals:
+          anno.setanno(node, 'live_val', self.literals[node.id])
+          # TODO(mdan): Could live values have FQNs? i.e. 'a'.join()
+        elif node.id in self.context.namespace:
+          obj = self.context.namespace[node.id]
+          anno.setanno(node, 'live_val', obj)
+          anno.setanno(node, 'fqn', (obj.__name__,))
+        else:
+          pass
+          # TODO(mdan): Should we raise an error here?
+          # Can encounter this when:
+          #  * a symbol truly lacks reference
+          #  * a symbol is new, like the new name of a function we just renamed.
+      else:
+        pass
+        # TODO(mdan): Attempt to trace its value through the local chain.
+        # TODO(mdan): Use type annotations as fallback.
+
+      if not symbol_is_modified:
+        if node.id in self.context.arg_values:
+          obj = self.context.arg_values[node.id]
+          anno.setanno(node, 'live_val', obj)
+          anno.setanno(node, 'fqn', (obj.__class__.__name__,))
+    return node
+
+  def visit_Attribute(self, node):
+    self.generic_visit(node)
+    if anno.hasanno(node.value, 'live_val'):
+      assert anno.hasanno(node.value, 'fqn')
+      parent_object = anno.getanno(node.value, 'live_val')
+      if not hasattr(parent_object, node.attr):
+        raise AttributeError('%s has no attribute %s' % (parent_object,
+                                                         node.attr))
+      anno.setanno(node, 'live_val', getattr(parent_object, node.attr))
+      anno.setanno(node, 'fqn', anno.getanno(node.value, 'fqn') + (node.attr,))
+    # TODO(mdan): Investigate the role built-in annotations can play here.
+    elif anno.hasanno(node.value, 'type'):
+      parent_type = anno.getanno(node.value, 'type')
+      if hasattr(parent_type, node.attr):
+        # This should hold for static members like methods.
+        # This would not hold for dynamic members like function attributes.
+        # For the dynamic case, we simply leave the node without an annotation,
+        # and let downstream consumers figure out what to do.
+        anno.setanno(node, 'live_val', getattr(parent_type, node.attr))
+        anno.setanno(node, 'fqn',
+                     anno.getanno(node.value, 'type_fqn') + (node.attr,))
+    elif isinstance(node.value, gast.Name):
+      stem_name = node.value
+      # All nonlocal symbols should be fully resolved.
+      assert anno.hasanno(stem_name, NodeAnno.IS_LOCAL), stem_name
+      # TODO(mdan): Figure out what to do when calling attribute on local object
+      # Maybe just leave as-is?
+    return node
+
+
+def resolve(node, context, literals):
+  return LiveValueResolver(context, literals).visit(node)
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f64689401e3594a77fbdd7b6f02880bd6e90492
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/live_values_test.py
@@ -0,0 +1,109 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for live_values module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import context
+from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.contrib.py2tf.pyct import qual_names
+from tensorflow.contrib.py2tf.pyct.static_analysis import activity
+from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
+from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
+from tensorflow.python.framework import constant_op
+from tensorflow.python.platform import test
+
+
+class LiveValuesResolverTest(test.TestCase):
+
+  def _parse_and_analyze(self,
+                         test_fn,
+                         namespace,
+                         literals=None,
+                         arg_types=None):
+    literals = literals or {}
+    arg_types = arg_types or {}
+    node, source = parser.parse_entity(test_fn)
+    ctx = context.EntityContext(
+        namer=None,
+        source_code=source,
+        source_file=None,
+        namespace=namespace,
+        arg_values=None,
+        arg_types=arg_types,
+        recursive=True)
+    node = qual_names.resolve(node)
+    node = activity.resolve(node, ctx)
+    node = live_values.resolve(node, ctx, literals)
+    node = type_info.resolve(node, ctx)
+    node = live_values.resolve(node, ctx, literals)
+    return node
+
+  def test_literals(self):
+
+    def test_fn():
+      return Foo  # pylint: disable=undefined-variable
+
+    node = self._parse_and_analyze(test_fn, {}, {'Foo': 'bar'})
+    retval_node = node.body[0].body[0].value
+    self.assertEquals('bar', anno.getanno(retval_node, 'live_val'))
+
+  def test_namespace(self):
+
+    def foo():
+      return 'bar'
+
+    def test_fn():
+      return foo()
+
+    node = self._parse_and_analyze(test_fn, {'foo': foo})
+    func_node = node.body[0].body[0].value.func
+    self.assertEquals(foo, anno.getanno(func_node, 'live_val'))
+    self.assertEquals(('foo',), anno.getanno(func_node, 'fqn'))
+
+  def test_attribute_names(self):
+
+    def test_fn():
+      return constant_op.constant(0)
+
+    node = self._parse_and_analyze(test_fn, {'constant_op': constant_op})
+    func_node = node.body[0].body[0].value.func
+    self.assertEquals(constant_op.constant, anno.getanno(func_node, 'live_val'))
+    self.assertEquals((constant_op.__name__, 'constant'),
+                      anno.getanno(func_node, 'fqn'))
+
+  def test_attributes_with_type_hints(self):
+
+    class TestClass(object):
+
+      def member(self):
+        pass
+
+      def test_fn(self):
+        return self.member()
+
+    node = self._parse_and_analyze(
+        TestClass.test_fn, {'constant_op': constant_op},
+        arg_types={'self': (TestClass.__name__, TestClass)})
+    func_node = node.body[0].body[0].value.func
+    self.assertEquals(TestClass.member, anno.getanno(func_node, 'live_val'))
+    self.assertEquals(('TestClass', 'member'), anno.getanno(func_node, 'fqn'))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py
new file mode 100644
index 0000000000000000000000000000000000000000..8203bda0f9a792a5b24b9abb25d8f39b61625748
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info.py
@@ -0,0 +1,186 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Type resolution.
+
+Requires annotations generated by LiveValuesResolver.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import transformer
+from tensorflow.python.util import tf_inspect
+
+
+class Scope(object):
+  """Encloses symbol value references.
+
+  Attributes:
+    values: A dict mapping string to gast.Node, containing the value that was
+        most recently assigned to the symbol.
+  """
+
+  def __init__(self, parent):
+    """Create a new scope.
+
+    Args:
+      parent: A Scope or None.
+    """
+    self.parent = parent
+    self.values = {}
+
+  def __repr__(self):
+    return 'Scope[%s]' % self.values.keys()
+
+  def copy(self):
+    s = Scope(self.parent)
+    s.values = self.values.copy()
+    return s
+
+  def setval(self, name, value):
+    self.values[name] = value
+
+  def hasval(self, name):
+    return (name in self.values or
+            (self.parent is not None and self.parent.hasval(name)))
+
+  def getval(self, name):
+    if name in self.values:
+      return self.values[name]
+    if self.parent is not None:
+      return self.parent.getval(name)
+    raise KeyError(name)
+
+
+class TypeInfoResolver(transformer.Base):
+  """Annotates symbols with type information where possible.
+
+  Nodes currently annotated:
+    * Call (helps detect class constructors)
+    * Attribute (helps resolve object methods)
+  """
+
+  def __init__(self, context):
+    super(TypeInfoResolver, self).__init__(context)
+    self.scope = Scope(None)
+    self.function_level = 0
+
+  def visit_FunctionDef(self, node):
+    self.scope = Scope(self.scope)
+    self.function_level += 1
+    self.generic_visit(node)
+    self.function_level -= 1
+    self.scope = self.scope.parent
+    return node
+
+  def _visit_block(self, block):
+    self.scope = Scope(self.scope)
+    for i, n in enumerate(block):
+      block[i] = self.generic_visit(n)
+    self.scope = self.scope.parent
+    return block
+
+  def visit_For(self, node):
+    self.generic_visit(node.target)
+    self.generic_visit(node.iter)
+    node.body = self._visit_block(node.body)
+    node.orelse = self._visit_block(node.orelse)
+    return node
+
+  def visit_While(self, node):
+    self.generic_visit(node.test)
+    node.body = self._visit_block(node.body)
+    node.orelse = self._visit_block(node.orelse)
+    return node
+
+  def visit_If(self, node):
+    self.generic_visit(node.test)
+    node.body = self._visit_block(node.body)
+    node.orelse = self._visit_block(node.orelse)
+    return node
+
+  def _process_function_arg(self, arg_name):
+    str_name = str(arg_name)
+    if self.function_level == 1 and str_name in self.context.arg_types:
+      # Forge a node to hold the type information, so that method calls on
+      # it can resolve the type.
+      type_holder = arg_name.ast()
+      type_string, type_obj = self.context.arg_types[str_name]
+      anno.setanno(type_holder, 'type', type_obj)
+      anno.setanno(type_holder, 'type_fqn', tuple(type_string.split('.')))
+      self.scope.setval(arg_name, type_holder)
+
+  def visit_arg(self, node):
+    self._process_function_arg(anno.getanno(node.arg, anno.Basic.QN))
+    return node
+
+  def visit_Name(self, node):
+    self.generic_visit(node)
+    qn = anno.getanno(node, anno.Basic.QN)
+    if isinstance(node.ctx, gast.Param):
+      self._process_function_arg(qn)
+    elif isinstance(node.ctx, gast.Load) and self.scope.hasval(qn):
+      # E.g. if we had
+      # a = b
+      # then for future references to `a` we should have traced_source = `b`
+      traced_source = self.scope.getval(qn)
+      if anno.hasanno(traced_source, 'type'):
+        anno.setanno(node, 'type', anno.getanno(traced_source, 'type'))
+        anno.setanno(node, 'type_fqn', anno.getanno(traced_source, 'type_fqn'))
+    return node
+
+  def _process_variable_assignment(self, source, targets):
+    if isinstance(source, gast.Call):
+      func = source.func
+      if anno.hasanno(func, 'live_val'):
+        func_obj = anno.getanno(func, 'live_val')
+        if tf_inspect.isclass(func_obj):
+          anno.setanno(source, 'is_constructor', True)
+          anno.setanno(source, 'type', func_obj)
+          anno.setanno(source, 'type_fqn', anno.getanno(func, 'fqn'))
+          # TODO(mdan): Raise an error if constructor has side effects.
+          # We can have a whitelist of no-side-effects constructors.
+          # We can also step inside the constructor and further analyze.
+
+    for t in targets:
+      if isinstance(t, gast.Tuple):
+        for i, e in enumerate(t.elts):
+          self.scope.setval(
+              anno.getanno(e, anno.Basic.QN),
+              gast.Subscript(source, gast.Index(i), ctx=gast.Store()))
+      elif isinstance(t, (gast.Name, gast.Attribute)):
+        self.scope.setval(anno.getanno(t, anno.Basic.QN), source)
+      else:
+        raise ValueError('Dont know how to handle assignment to %s' % t)
+
+  def visit_With(self, node):
+    for wi in node.items:
+      if wi.optional_vars is not None:
+        self._process_variable_assignment(wi.context_expr, (wi.optional_vars,))
+    self.generic_visit(node)
+    return node
+
+  def visit_Assign(self, node):
+    self.generic_visit(node)
+    self._process_variable_assignment(node.value, node.targets)
+    return node
+
+
+def resolve(node, context):
+  return TypeInfoResolver(context).visit(node)
diff --git a/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3659f949db9910534870d8dd9e42fd4ee8297253
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/static_analysis/type_info_test.py
@@ -0,0 +1,179 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for type_info module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import context
+from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.contrib.py2tf.pyct import qual_names
+from tensorflow.contrib.py2tf.pyct.static_analysis import activity
+from tensorflow.contrib.py2tf.pyct.static_analysis import live_values
+from tensorflow.contrib.py2tf.pyct.static_analysis import type_info
+from tensorflow.python.client import session
+from tensorflow.python.platform import test
+from tensorflow.python.training import training
+
+
+class ScopeTest(test.TestCase):
+
+  def test_basic(self):
+    scope = type_info.Scope(None)
+    self.assertFalse(scope.hasval('foo'))
+
+    scope.setval('foo', 'bar')
+    self.assertTrue(scope.hasval('foo'))
+
+    self.assertFalse(scope.hasval('baz'))
+
+  def test_nesting(self):
+    scope = type_info.Scope(None)
+    scope.setval('foo', '')
+
+    child = type_info.Scope(scope)
+    self.assertTrue(child.hasval('foo'))
+    self.assertTrue(scope.hasval('foo'))
+
+    child.setval('bar', '')
+    self.assertTrue(child.hasval('bar'))
+    self.assertFalse(scope.hasval('bar'))
+
+
+class TypeInfoResolverTest(test.TestCase):
+
+  def _parse_and_analyze(self, test_fn, namespace, arg_types=None):
+    node, source = parser.parse_entity(test_fn)
+    ctx = context.EntityContext(
+        namer=None,
+        source_code=source,
+        source_file=None,
+        namespace=namespace,
+        arg_values=None,
+        arg_types=arg_types,
+        recursive=True)
+    node = qual_names.resolve(node)
+    node = activity.resolve(node, ctx)
+    node = live_values.resolve(node, ctx, {})
+    node = type_info.resolve(node, ctx)
+    node = live_values.resolve(node, ctx, {})
+    return node
+
+  def test_constructor_detection(self):
+
+    def test_fn():
+      opt = training.GradientDescentOptimizer(0.1)
+      return opt
+
+    node = self._parse_and_analyze(test_fn, {'training': training})
+    call_node = node.body[0].body[0].value
+    self.assertEquals(training.GradientDescentOptimizer,
+                      anno.getanno(call_node, 'type'))
+    self.assertEquals((training.__name__, 'GradientDescentOptimizer'),
+                      anno.getanno(call_node, 'type_fqn'))
+
+  def test_class_members_of_detected_constructor(self):
+
+    def test_fn():
+      opt = training.GradientDescentOptimizer(0.1)
+      opt.minimize(0)
+
+    node = self._parse_and_analyze(test_fn, {'training': training})
+    method_call = node.body[0].body[1].value.func
+    self.assertEquals(training.GradientDescentOptimizer.minimize,
+                      anno.getanno(method_call, 'live_val'))
+
+  def test_class_members_in_with_stmt(self):
+
+    def test_fn(x):
+      with session.Session() as sess:
+        sess.run(x)
+
+    node = self._parse_and_analyze(test_fn, {'session': session})
+    constructor_call = node.body[0].body[0].items[0].context_expr
+    self.assertEquals(session.Session, anno.getanno(constructor_call, 'type'))
+    self.assertEquals((session.__name__, 'Session'),
+                      anno.getanno(constructor_call, 'type_fqn'))
+
+    method_call = node.body[0].body[0].body[0].value.func
+    self.assertEquals(session.Session.run, anno.getanno(method_call,
+                                                        'live_val'))
+
+  def test_constructor_data_dependent(self):
+
+    def test_fn(x):
+      if x > 0:
+        opt = training.GradientDescentOptimizer(0.1)
+      else:
+        opt = training.GradientDescentOptimizer(0.01)
+      opt.minimize(0)
+
+    node = self._parse_and_analyze(test_fn, {'training': training})
+    method_call = node.body[0].body[1].value.func
+    self.assertFalse(anno.hasanno(method_call, 'live_val'))
+
+  def test_parameter_class_members(self):
+
+    def test_fn(opt):
+      opt.minimize(0)
+
+    node = self._parse_and_analyze(test_fn, {})
+    method_call = node.body[0].body[0].value.func
+    self.assertFalse(anno.hasanno(method_call, 'live_val'))
+
+  def test_parameter_class_members_with_value_hints(self):
+
+    def test_fn(opt):
+      opt.minimize(0)
+
+    node = self._parse_and_analyze(
+        test_fn, {'training': training},
+        arg_types={
+            'opt': (training.GradientDescentOptimizer.__name__,
+                    training.GradientDescentOptimizer)
+        })
+
+    method_call = node.body[0].body[0].value.func
+    self.assertEquals(training.GradientDescentOptimizer.minimize,
+                      anno.getanno(method_call, 'live_val'))
+
+  def test_function_variables(self):
+
+    def bar():
+      pass
+
+    def test_fn():
+      foo = bar
+      foo()
+
+    node = self._parse_and_analyze(test_fn, {'bar': bar})
+    method_call = node.body[0].body[1].value.func
+    self.assertFalse(anno.hasanno(method_call, 'live_val'))
+
+  def test_nested_members(self):
+
+    def test_fn():
+      foo = training.GradientDescentOptimizer(0.1)
+      foo.bar.baz()
+
+    node = self._parse_and_analyze(test_fn, {'training': training})
+    method_call = node.body[0].body[1].value.func
+    self.assertFalse(anno.hasanno(method_call, 'live_val'))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/templates.py b/tensorflow/contrib/py2tf/pyct/templates.py
new file mode 100644
index 0000000000000000000000000000000000000000..c40e4d0fb783191705a412ab2728daabb61eda0f
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/templates.py
@@ -0,0 +1,152 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""AST conversion templates.
+
+Adapted from Tangent.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ast
+import textwrap
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import ast_util
+from tensorflow.contrib.py2tf.pyct import parser
+from tensorflow.contrib.py2tf.pyct import qual_names
+
+
+class ReplaceTransformer(gast.NodeTransformer):
+  """Replace AST nodes."""
+
+  def __init__(self, replacements):
+    """Create a new ReplaceTransformer.
+
+    Args:
+      replacements: A mapping from placeholder names to (lists of) AST nodes
+          that these placeholders will be replaced by.
+    """
+    self.replacements = replacements
+    self.in_replacements = False
+
+  # TODO(mdan): Make a more detailed pass and clean up if needed.
+
+  def visit_Expr(self, node):
+    if (isinstance(node.value, gast.Name) and
+        node.value.id in self.replacements):
+      return self.visit(node.value)
+    self.generic_visit(node)
+    return node
+
+  def visit_FunctionDef(self, node):
+    node = self.generic_visit(node)
+    if node.name in self.replacements:
+      repl = self.replacements[node.name]
+      if not isinstance(repl, (gast.Name, ast.Name)):
+        raise ValueError(
+            'A function name can only be replaced by a Name node. Found: %s' %
+            repl)
+      node.name = repl.id
+    return node
+
+  def _set_inner_child_context(self, node, ctx):
+    if isinstance(node, gast.Attribute):
+      self._set_inner_child_context(node.value, ctx)
+      node.ctx = gast.Load()
+    elif isinstance(node, gast.Name):
+      node.ctx = ctx
+    elif isinstance(node, (gast.Str, gast.Num)):
+      pass
+    else:
+      raise ValueError('unexpected node type "%s"' % node)
+
+  def visit_Name(self, node):
+    if node.id not in self.replacements:
+      return node
+
+    new_nodes = ast_util.copy_clean(self.replacements[node.id])
+    if isinstance(new_nodes, gast.AST):
+      new_nodes = [new_nodes]
+
+    # Preserve the target context.
+    for n in new_nodes:
+      if isinstance(n, gast.Tuple):
+        for e in n.elts:
+          self._set_inner_child_context(e, node.ctx)
+      if isinstance(n, gast.Attribute):
+        # For attributes, the inner Name node receives the context, while the
+        # outer ones have it set to Load.
+        self._set_inner_child_context(n, node.ctx)
+      else:
+        n.ctx = node.ctx
+
+    if len(new_nodes) == 1:
+      new_nodes, = new_nodes
+
+    return new_nodes
+
+
+def _convert_to_ast(n):
+  """Convert from a known data type to AST."""
+  if isinstance(n, str):
+    # Note: the node will receive the ctx value from the template, see
+    # ReplaceTransformer.visit_Name.
+    return gast.Name(id=n, ctx=None, annotation=None)
+  if isinstance(n, qual_names.QN):
+    return n.ast()
+  if isinstance(n, list):
+    return [_convert_to_ast(e) for e in n]
+  if isinstance(n, tuple):
+    return tuple(_convert_to_ast(e) for e in n)
+  return n
+
+
+def replace(template, **replacements):
+  """Replace placeholders in a Python template.
+
+  AST Name and Tuple nodes always receive the context that inferred from
+  the template. However, when replacing more complex nodes (that can potentially
+  contain Name children), then the caller is responsible for setting the
+  appropriate context.
+
+  Args:
+    template: A string representing Python code. Any symbol name can be used
+        that appears in the template code can be used as placeholder.
+    **replacements: A mapping from placeholder names to (lists of) AST nodes
+        that these placeholders will be replaced by. String values are also
+        supported as a shorthand for AST Name nodes with the respective ID.
+
+  Returns:
+    An AST node or list of AST nodes with the replacements made. If the
+    template was a function, a list will be returned. If the template was a
+    node, the same node will be returned. If the template was a string, an
+    AST node will be returned (a `Module` node in the case of a multi-line
+    string, an `Expr` node otherwise).
+
+  Raises:
+    ValueError: if the arguments are incorrect.
+  """
+  if not isinstance(template, str):
+    raise ValueError('Expected string template, got %s' % type(template))
+  tree = parser.parse_str(textwrap.dedent(template))
+  for k in replacements:
+    replacements[k] = _convert_to_ast(replacements[k])
+  results = ReplaceTransformer(replacements).visit(tree).body
+  if isinstance(results, list):
+    return [qual_names.resolve(r) for r in results]
+  return qual_names.resolve(results)
diff --git a/tensorflow/contrib/py2tf/pyct/templates_test.py b/tensorflow/contrib/py2tf/pyct/templates_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ccfde8573724741b0bbe4eacb3c54beb381ee7e
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/templates_test.py
@@ -0,0 +1,84 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for templates module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.py2tf.pyct import compiler
+from tensorflow.contrib.py2tf.pyct import templates
+from tensorflow.python.platform import test
+
+
+class TemplatesTest(test.TestCase):
+
+  def test_replace_tuple(self):
+    template = """
+      def test_fn(a, c):
+        return b,
+    """
+
+    node = templates.replace(template, b=('a', 'c'))[0]
+    result, _ = compiler.ast_to_object(node)
+
+    self.assertEquals((2, 3), result.test_fn(2, 3))
+
+  def test_replace_variable(self):
+    template = """
+      def test_fn(a):
+        a += 1
+        a = 2 * a + 1
+        return b
+    """
+
+    node = templates.replace(template, a='b')[0]
+    result, _ = compiler.ast_to_object(node)
+    self.assertEquals(7, result.test_fn(2))
+
+  def test_replace_function_name(self):
+    template = """
+      def fname(a):
+        a += 1
+        a = 2 * a + 1
+        return a
+    """
+
+    node = templates.replace(template, fname='test_fn')[0]
+    result, _ = compiler.ast_to_object(node)
+    self.assertEquals(7, result.test_fn(2))
+
+  def test_code_block(self):
+    template = """
+      def test_fn(a):
+        block
+        return a
+    """
+
+    node = templates.replace(
+        template,
+        block=[
+            gast.Assign([
+                gast.Name('a', None, None)
+            ], gast.BinOp(gast.Name('a', None, None), gast.Add(), gast.Num(1))),
+        ] * 2)[0]
+    result, _ = compiler.ast_to_object(node)
+    self.assertEquals(3, result.test_fn(1))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/pyct/transformer.py b/tensorflow/contrib/py2tf/pyct/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..877d52af016af720424c8a56257fec9ab64611cb
--- /dev/null
+++ b/tensorflow/contrib/py2tf/pyct/transformer.py
@@ -0,0 +1,69 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A node transformer that includes utilities for SCT."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+
+import gast
+import six
+
+from tensorflow.contrib.py2tf.pyct import anno
+from tensorflow.contrib.py2tf.pyct import pretty_printer
+
+
+class PyFlowParseError(SyntaxError):
+  pass
+
+
+class Base(gast.NodeTransformer):
+  """Base class for specialized transformers."""
+
+  def __init__(self, context):
+    """Initialize the transformer. Subclasses should call this.
+
+    Args:
+      context: An EntityContext.
+    """
+    self._lineno = 0
+    self._col_offset = 0
+    self.context = context
+
+  def visit(self, node):
+    source_code = self.context.source_code
+    source_file = self.context.source_file
+    try:
+      if source_code and hasattr(node, 'lineno'):
+        self._lineno = node.lineno
+        self._col_offset = node.col_offset
+      if anno.hasanno(node, anno.Basic.SKIP_PROCESSING):
+        return node
+      return super(Base, self).visit(node)
+    except (ValueError, AttributeError, KeyError, NotImplementedError,
+            AssertionError) as e:
+      msg = '%s: %s\nOccurred at node:\n%s' % (
+          e.__class__.__name__, str(e), pretty_printer.fmt(node, color=False))
+      if source_code:
+        line = source_code.splitlines()[self._lineno - 1]
+      else:
+        line = '<no source available>'
+      six.reraise(PyFlowParseError,
+                  PyFlowParseError(
+                      msg,
+                      (source_file, self._lineno, self._col_offset + 1, line)),
+                  sys.exc_info()[2])
diff --git a/tensorflow/contrib/py2tf/utils/BUILD b/tensorflow/contrib/py2tf/utils/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a679cb90765f08f024b3b1bb52b19aa5a0bc06f6
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/BUILD
@@ -0,0 +1,97 @@
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "utils",
+    srcs = [
+        "__init__.py",
+        "context_managers.py",
+        "misc.py",
+        "multiple_dispatch.py",
+        "py_func.py",
+        "tensor_list.py",
+        "type_check.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [
+        "//tensorflow/python:script_ops",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "context_managers_test",
+    srcs = ["context_managers_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "misc_test",
+    srcs = ["misc_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "multiple_dispatch_test",
+    srcs = ["multiple_dispatch_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "py_func_test",
+    srcs = ["py_func_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "type_check_test",
+    srcs = ["type_check_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":utils",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "tensor_list_test",
+    srcs = ["tensor_list_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":utils",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:list_ops",
+    ],
+)
diff --git a/tensorflow/contrib/py2tf/utils/__init__.py b/tensorflow/contrib/py2tf/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..838c29aafd8ab4c6b0165995d916291fdfcff10b
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/__init__.py
@@ -0,0 +1,26 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility module that contains APIs usable in the generated code."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.utils.context_managers import control_dependency_on_returns
+from tensorflow.contrib.py2tf.utils.misc import alias_tensors
+from tensorflow.contrib.py2tf.utils.multiple_dispatch import run_cond
+from tensorflow.contrib.py2tf.utils.multiple_dispatch import run_while
+from tensorflow.contrib.py2tf.utils.py_func import wrap_py_func
+from tensorflow.contrib.py2tf.utils.type_check import is_tensor
diff --git a/tensorflow/contrib/py2tf/utils/context_managers.py b/tensorflow/contrib/py2tf/utils/context_managers.py
new file mode 100644
index 0000000000000000000000000000000000000000..38d9e11fe9069722b9023fee848bf53e1f72de6a
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/context_managers.py
@@ -0,0 +1,42 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Various context managers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+
+from tensorflow.python.framework import ops
+
+
+def control_dependency_on_returns(return_value):
+  """Create a TF control dependency on the return values of a function.
+
+  If the function had no return value, a no-op context is returned.
+
+  Args:
+    return_value: The return value to set as control dependency.
+
+  Returns:
+    A context manager.
+  """
+  if return_value is None:
+    return contextlib.contextmanager(lambda: (yield))()
+  # TODO(mdan): Filter to tensor objects.
+  if not isinstance(return_value, (list, tuple)):
+    return_value = (return_value,)
+  return ops.control_dependencies(return_value)
diff --git a/tensorflow/contrib/py2tf/utils/context_managers_test.py b/tensorflow/contrib/py2tf/utils/context_managers_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..633ba93540e696889a6b2b71b40b999da39d48ff
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/context_managers_test.py
@@ -0,0 +1,42 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for context_managers module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.utils import context_managers
+from tensorflow.python.framework import constant_op
+from tensorflow.python.platform import test
+
+
+class ContextManagersTest(test.TestCase):
+
+  def test_control_dependency_on_returns(self):
+    # Just dry run them.
+    with context_managers.control_dependency_on_returns(None):
+      pass
+    with context_managers.control_dependency_on_returns(
+        constant_op.constant(1)):
+      pass
+    with context_managers.control_dependency_on_returns(
+        [constant_op.constant(1),
+         constant_op.constant(2)]):
+      pass
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/utils/misc.py b/tensorflow/contrib/py2tf/utils/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b06caf0bdeb6f4a079e33f2e887d2dca017adc2
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/misc.py
@@ -0,0 +1,50 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Miscellaneous utilities that don't fit anywhere else."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+
+
+def alias_tensors(*args):
+  """Wrap any Tensor arguments with an identity op.
+
+  Any other argument, including Variables, is returned unchanged.
+
+  Args:
+    *args: Any arguments. Must contain at least one element.
+
+  Returns:
+    Same as *args, with Tensor instances replaced as described.
+
+  Raises:
+    ValueError: If args doesn't meet the requirements.
+  """
+
+  def alias_if_tensor(a):
+    return array_ops.identity(a) if isinstance(a, ops.Tensor) else a
+
+  # TODO(mdan): Recurse into containers?
+  # TODO(mdan): Anything we can do about variables? Fake a scope reuse?
+  if len(args) > 1:
+    return (alias_if_tensor(a) for a in args)
+  elif len(args) == 1:
+    return alias_if_tensor(args[0])
+
+  raise ValueError('at least one argument required')
diff --git a/tensorflow/contrib/py2tf/utils/misc_test.py b/tensorflow/contrib/py2tf/utils/misc_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bfcb304c838df69e9e3961907362c7939c065117
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/misc_test.py
@@ -0,0 +1,54 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for misc module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.utils import misc
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class ContextManagersTest(test.TestCase):
+
+  def test_alias_single_tensor(self):
+    a = constant_op.constant(1)
+
+    new_a = misc.alias_tensors(a)
+    self.assertFalse(new_a is a)
+    with self.test_session() as sess:
+      self.assertEqual(1, sess.run(new_a))
+
+  def test_alias_tensors(self):
+    a = constant_op.constant(1)
+    v = variables.Variable(2)
+    s = 'a'
+    l = [1, 2, 3]
+
+    new_a, new_v, new_s, new_l = misc.alias_tensors(a, v, s, l)
+
+    self.assertFalse(new_a is a)
+    self.assertTrue(new_v is v)
+    self.assertTrue(new_s is s)
+    self.assertTrue(new_l is l)
+    with self.test_session() as sess:
+      self.assertEqual(1, sess.run(new_a))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/utils/multiple_dispatch.py b/tensorflow/contrib/py2tf/utils/multiple_dispatch.py
new file mode 100644
index 0000000000000000000000000000000000000000..a855fdc075941915035d1e3380846ff912803494
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/multiple_dispatch.py
@@ -0,0 +1,90 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for type-dependent behavior used in py2tf-generated code."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.contrib.py2tf.utils.type_check import is_tensor
+from tensorflow.python.ops import control_flow_ops
+
+
+def run_cond(condition, true_fn, false_fn):
+  """Type-dependent functional conditional.
+
+  Args:
+    condition: A Tensor or Python bool.
+    true_fn: A Python callable implementing the true branch of the conditional.
+    false_fn: A Python callable implementing the false branch of the
+      conditional.
+
+  Returns:
+    result: The result of calling the appropriate branch. If condition is a
+    Tensor, tf.cond will be used. Otherwise, a standard Python if statement will
+    be ran.
+  """
+  if is_tensor(condition):
+    return control_flow_ops.cond(condition, true_fn, false_fn)
+  else:
+    return py_cond(condition, true_fn, false_fn)
+
+
+def py_cond(condition, true_fn, false_fn):
+  if condition:
+    return true_fn()
+  else:
+    return false_fn()
+
+
+def run_while(cond_fn, body_fn, init_args):
+  """Type-dependent functional while loop.
+
+  Args:
+    cond_fn: A Python callable implementing the stop conditions of the loop.
+    body_fn: A Python callable implementing the body of the loop.
+    init_args: The initial values of the arguments that will be passed to both
+      cond_fn and body_fn.
+
+  Returns:
+    result: A list of values with the same shape and type as init_args. If any
+    of the init_args, or any variables closed-over in cond_fn are Tensors,
+    tf.while_loop will be used, otherwise a Python while loop will be ran.
+
+  Raises:
+    ValueError: if init_args is not a tuple or list with one or more elements.
+  """
+  if not isinstance(init_args, (tuple, list)) or not init_args:
+    raise ValueError(
+        'init_args must be a non-empty list or tuple, found %s' % init_args)
+
+  # TODO(alexbw): statically determine all active variables in cond_fn,
+  # and pass them directly
+  closure_vars = tuple(
+      [c.cell_contents for c in six.get_function_closure(cond_fn) or []])
+  possibly_tensors = tuple(init_args) + closure_vars
+  if is_tensor(*possibly_tensors):
+    return control_flow_ops.while_loop(cond_fn, body_fn, init_args)
+  else:
+    return py_while_loop(cond_fn, body_fn, init_args)
+
+
+def py_while_loop(cond_fn, body_fn, init_args):
+  state = init_args
+  while cond_fn(*state):
+    state = body_fn(*state)
+  return state
diff --git a/tensorflow/contrib/py2tf/utils/multiple_dispatch_test.py b/tensorflow/contrib/py2tf/utils/multiple_dispatch_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bb4d4086b002211eebb86783bb7212c707a1418
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/multiple_dispatch_test.py
@@ -0,0 +1,69 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for multiple_dispatch."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from tensorflow.contrib.py2tf.utils import multiple_dispatch
+from tensorflow.python.client.session import Session
+from tensorflow.python.framework.constant_op import constant
+from tensorflow.python.platform import test
+
+
+class MultipleDispatchTest(test.TestCase):
+
+  def test_run_cond_python(self):
+    true_fn = lambda: 2.0
+    false_fn = lambda: 3.0
+    self.assertEqual(multiple_dispatch.run_cond(True, true_fn, false_fn), 2.0)
+    self.assertEqual(multiple_dispatch.run_cond(False, true_fn, false_fn), 3.0)
+
+  def test_run_cond_tf(self):
+
+    true_fn = lambda: constant([2.0])
+    false_fn = lambda: constant([3.0])
+    with Session() as sess:
+      out = multiple_dispatch.run_cond(constant(True), true_fn, false_fn)
+      self.assertEqual(sess.run(out), 2.0)
+      out = multiple_dispatch.run_cond(constant(False), true_fn, false_fn)
+      self.assertEqual(sess.run(out), 3.0)
+
+  def test_run_while_python(self):
+    cond_fn = lambda x, t, s: x > t
+    body_fn = lambda x, t, s: (x * s, t, s)
+
+    x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn, [3.0, 1.0, 0.5])
+    self.assertEqual(x, 0.75)
+
+    x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn, [3.0, 4.0, 0.5])
+    self.assertEqual(x, 3.0)
+
+  def test_run_while_tf(self):
+    cond_fn = lambda x, t, s: x > t
+    body_fn = lambda x, t, s: (x * s, t, s)
+
+    with Session() as sess:
+      x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn,
+                                            [constant(3.0), 1.0, 0.5])
+      self.assertEqual(sess.run(x), 0.75)
+
+      x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn,
+                                            [constant(3.0), 4.0, 0.5])
+      self.assertEqual(sess.run(x), 3.0)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/utils/py_func.py b/tensorflow/contrib/py2tf/utils/py_func.py
new file mode 100644
index 0000000000000000000000000000000000000000..838872d092a3ab07e965180eff4fec7ff6c4ccf9
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/py_func.py
@@ -0,0 +1,69 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Pyfunc creation utilities."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import script_ops
+
+
+def wrap_py_func(f, return_dtypes, arguments, use_dummy_return=False):
+  """Helper that wraps a callable to py_func.
+
+  The helper passes tensor arguments through the py_func interface. Non-tensor
+  arguments are allowed, and will be passed to f directly. Note that non-tensor
+  arguments are captured by f will not update every time the wrapper is
+  called (this is consistent with its argument list, which only includes
+  the tensor arguments). In general, it's safest not to reuse this wrapper.
+
+  Args:
+    f: Callable
+    return_dtypes: DType, tuple, list or None, the data type for each of f's
+        return value. None if f has no return values or use_dummy_return is
+        True.
+    arguments: Arguments for f
+    use_dummy_return: If True, the function will return a dummy value of 1
+        and discard its actual return value.
+  Returns:
+    The return values of f converted to tensor.
+  Raises:
+    ValueError: if the arguments are incorrect.
+  """
+
+  if return_dtypes and use_dummy_return:
+    raise ValueError('if use_dummy_return is True, return_dtypes must be empty')
+
+  n = len(arguments)
+  arg_is_tensor = tuple(map(tensor_util.is_tensor, arguments))
+  index_in_tensor_list = [0] * n
+  i = 0
+  for j in range(n):
+    index_in_tensor_list[j] = i
+    if arg_is_tensor[j]:
+      i += 1
+
+  def f_wrapper(*tensor_args):
+    f_args = tuple(tensor_args[index_in_tensor_list[i]]
+                   if arg_is_tensor[i] else arguments[i] for i in range(n))
+    retval = f(*f_args)
+    return 1 if use_dummy_return else retval
+
+  return script_ops.py_func(
+      f_wrapper, tuple(arguments[i] for i in range(n) if arg_is_tensor[i]),
+      dtypes.int64 if use_dummy_return else return_dtypes)
diff --git a/tensorflow/contrib/py2tf/utils/py_func_test.py b/tensorflow/contrib/py2tf/utils/py_func_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..776b5309c6f027bb2008aa83d48e4155e817ed97
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/py_func_test.py
@@ -0,0 +1,91 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for wrap_py_func module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.utils import py_func
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.platform import test
+
+
+class PyFuncTest(test.TestCase):
+
+  def test_wrap_py_func_simple(self):
+
+    def test_fn(a, b, c):
+      return a + b + c
+
+    with self.test_session() as sess:
+      tensor_1 = constant_op.constant(1)
+      self.assertEqual(3,
+                       sess.run(
+                           py_func.wrap_py_func(test_fn, dtypes.int64,
+                                                (1, tensor_1, 1))))
+      self.assertEqual(3,
+                       sess.run(
+                           py_func.wrap_py_func(test_fn, dtypes.int64,
+                                                (1, 1, 1))))
+      self.assertEqual(3,
+                       sess.run(
+                           py_func.wrap_py_func(test_fn, dtypes.int64,
+                                                (tensor_1, 1, tensor_1))))
+
+  def test_wrap_py_func_complex_args(self):
+
+    class TestClass(object):
+
+      def __init__(self):
+        self.foo = 5
+
+    def test_fn(a, b):
+      return a * b.foo
+
+    with self.test_session() as sess:
+      self.assertEqual(35,
+                       sess.run(
+                           py_func.wrap_py_func(test_fn, dtypes.int64,
+                                                (7, TestClass()))))
+      self.assertEqual(
+          35,
+          sess.run(
+              py_func.wrap_py_func(test_fn, dtypes.int64,
+                                   (constant_op.constant(7), TestClass()))))
+
+  def test_wrap_py_func_dummy_return(self):
+
+    side_counter = [0]
+
+    def test_fn(_):
+      side_counter[0] += 1
+
+    with self.test_session() as sess:
+      self.assertEqual(1,
+                       sess.run(
+                           py_func.wrap_py_func(test_fn, None, (5,), True)))
+      self.assertEqual([1], side_counter)
+      self.assertEqual(1,
+                       sess.run(
+                           py_func.wrap_py_func(test_fn, None,
+                                                (constant_op.constant(5),),
+                                                True)))
+      self.assertEqual([2], side_counter)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/py2tf/utils/tensor_list.py b/tensorflow/contrib/py2tf/utils/tensor_list.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6ff49e2a0eff384f10903e12212ab929e267804
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/tensor_list.py
@@ -0,0 +1,49 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A typed list in Python."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import list_ops
+
+
+class TensorList(object):
+  """Tensor list wrapper API-compatible with Python built-in list."""
+
+  def __init__(self, shape, dtype):
+    self.dtype = dtype
+    self.shape = shape
+    self.clear()
+
+  def append(self, value):
+    self.list_ = list_ops.tensor_list_push_back(self.list_, value)
+
+  def pop(self):
+    self.list_, value = list_ops.tensor_list_pop_back(self.list_, self.dtype)
+    return value
+
+  def clear(self):
+    self.list_ = list_ops.empty_tensor_list(self.shape, self.dtype)
+
+  def count(self):
+    return list_ops.tensor_list_length(self.list_)
+
+  def __getitem__(self, key):
+    return list_ops.tensor_list_get_item(self.list_, key, self.dtype)
+
+  def __setitem__(self, key, value):
+    self.list_ = list_ops.tensor_list_set_item(self.list_, key, value)
diff --git a/tensorflow/contrib/py2tf/utils/tensor_list_test.py b/tensorflow/contrib/py2tf/utils/tensor_list_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5e554a162674e08da21785dcbe193c54647f128
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/tensor_list_test.py
@@ -0,0 +1,89 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for PyFlow list."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.py2tf.utils import tensor_list as tl
+from tensorflow.python.client.session import Session
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.framework.constant_op import constant
+from tensorflow.python.platform import test
+
+
+class TensorListTest(test.TestCase):
+
+  def test_list_append_python(self):
+    with context.eager_mode():
+      a = constant(3.0)
+      l = tl.TensorList(a.shape, a.dtype)
+      l.append(a)
+      self.assertEqual(l.count().numpy(), 1)
+      l.append(a)
+      self.assertEqual(l.count().numpy(), 2)
+      _ = l.pop()
+      self.assertEqual(l.count().numpy(), 1)
+      a2 = l.pop()
+      self.assertEqual(l.count().numpy(), 0)
+      self.assertEqual(a.numpy(), a2.numpy())
+
+  def test_list_index_python(self):
+    with context.eager_mode():
+      a = constant(3.0)
+      b = constant(2.0)
+      l = tl.TensorList(a.shape, a.dtype)
+      l.append(a)
+      self.assertEqual(l[0].numpy(), a.numpy())
+      l[0] = ops.convert_to_tensor(b)
+      self.assertEqual(l[0].numpy(), b.numpy())
+
+  def test_list_append_tf(self):
+    a = constant(3.0)
+    l = tl.TensorList(a.shape, a.dtype)
+    l.append(a)
+    c1 = l.count()
+    l.append(a)
+    c2 = l.count()
+    _ = l.pop()
+    c3 = l.count()
+    a2 = l.pop()
+    c4 = l.count()
+    with Session() as sess:
+      c1, c2, c3, c4, a, a2 = sess.run([c1, c2, c3, c4, a, a2])
+      self.assertEqual(c1, 1)
+      self.assertEqual(c2, 2)
+      self.assertEqual(c3, 1)
+      self.assertEqual(c4, 0)
+      self.assertEqual(a, a2)
+
+  def test_list_index_tf(self):
+    a = constant(3.0)
+    b = constant(2.0)
+    l = tl.TensorList(a.shape, a.dtype)
+    l.append(a)
+    l0 = l[0]
+    l[0] = b
+    l1 = l[0]
+    with self.test_session() as sess:
+      l0, l1, a, b = sess.run([l0, l1, a, b])
+      self.assertEqual(l0, a)
+      self.assertEqual(l1, b)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered_impl.py b/tensorflow/contrib/py2tf/utils/type_check.py
similarity index 58%
rename from tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered_impl.py
rename to tensorflow/contrib/py2tf/utils/type_check.py
index 223bc9d042c69be05b0e578835a31ed6e83c0c97..9ca2dec872c8a9ca7bedaa8603f70e3214a3e24a 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered_impl.py
+++ b/tensorflow/contrib/py2tf/utils/type_check.py
@@ -12,28 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""SigmoidCentered bijector."""
+"""Utilities used in py2tf-generated code."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops.bijectors import softmax_centered
+from tensorflow.python.framework import tensor_util
 
 
-__all__ = [
-    "SigmoidCentered",
-]
+def is_tensor(*args):
+  """Check if all arguments are tensors.
 
+  Args:
+    *args: Python objects that may or may not be tensors.
 
-class SigmoidCentered(softmax_centered.SoftmaxCentered):
-  """Bijector which computes Y = g(X) = exp([X 0]) / (1 + exp(-X)).
-
-  Equivalent to: `bijector.SoftmaxCentered(event_ndims=0)`.
-
-  See `bijector.SoftmaxCentered` for more details.
+  Returns:
+    True if all *args are TensorFlow types, False if one or more are not.
   """
-
-  def __init__(self, validate_args=False, name="sigmoid_centered"):
-    super(SigmoidCentered, self).__init__(
-        event_ndims=0, validate_args=validate_args, name=name)
+  return any([tensor_util.is_tensor(a) for a in args])
diff --git a/tensorflow/contrib/py2tf/utils/type_check_test.py b/tensorflow/contrib/py2tf/utils/type_check_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d0428e9cccecdc67511e236bc00655a055aea29
--- /dev/null
+++ b/tensorflow/contrib/py2tf/utils/type_check_test.py
@@ -0,0 +1,43 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for type_check."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy
+
+from tensorflow.contrib.py2tf.utils import type_check
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+class TypeCheckTest(test.TestCase):
+
+  def test_checks(self):
+    self.assertTrue(type_check.is_tensor(constant_op.constant([1, 2, 3])))
+    self.assertTrue(
+        type_check.is_tensor(test_util.variables.Variable([1, 2, 3])))
+    self.assertTrue(
+        type_check.is_tensor(
+            test_util.array_ops.placeholder(test_util.dtypes.float32)))
+    self.assertFalse(type_check.is_tensor(3))
+    self.assertFalse(type_check.is_tensor(numpy.eye(3)))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD
index 389e26cca3eb04fe43abbee62a1efde7ae0d204d..aec9f47ccb20349c08bbe2fd813ee24a807f9fe3 100644
--- a/tensorflow/contrib/quantize/BUILD
+++ b/tensorflow/contrib/quantize/BUILD
@@ -13,6 +13,20 @@ py_library(
     deps = [],
 )
 
+py_test(
+    name = "common_test",
+    size = "small",
+    srcs = ["python/common_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":common",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:session",
+    ],
+)
+
 py_library(
     name = "graph_matcher",
     srcs = [
@@ -75,11 +89,18 @@ py_library(
         ":graph_matcher",
         ":input_to_ops",
         "//tensorflow/contrib/graph_editor:graph_editor_py",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:layers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn",
         "//tensorflow/python:nn_ops",
+        "//tensorflow/python:ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variables",
     ],
 )
 
@@ -94,34 +115,15 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
-    ],
-)
-
-py_library(
-    name = "copy_graph",
-    srcs = ["python/copy_graph.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:session",
         "//tensorflow/python:training",
-    ],
-)
-
-py_test(
-    name = "copy_graph_test",
-    size = "small",
-    srcs = ["python/copy_graph_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":copy_graph",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
         "//tensorflow/python:variables",
     ],
 )
@@ -152,7 +154,6 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:session",
         "//tensorflow/python:variables",
@@ -164,7 +165,7 @@ py_library(
     srcs = ["python/quantize.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":common",
+        ":graph_matcher",
         ":input_to_ops",
         ":quant_ops",
         "//tensorflow/contrib/graph_editor:graph_editor_py",
@@ -211,7 +212,6 @@ py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
-        "//tensorflow/python:training",
     ],
 )
 
@@ -223,12 +223,9 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":copy_graph",
         ":fold_batch_norms",
         ":quantize",
-        "//tensorflow/python:framework_ops",
         "//tensorflow/python:util",
-        "//tensorflow/python:variables",
     ],
 )
 
@@ -241,13 +238,11 @@ py_test(
         ":quantize_graph",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
-        "//tensorflow/python:variables",
     ],
 )
 
diff --git a/tensorflow/contrib/quantize/__init__.py b/tensorflow/contrib/quantize/__init__.py
index 5d4e4575c935e0a888c6e5e4d0db640d93e1bd49..933200e60749e62094040672793953c1c79de6cf 100644
--- a/tensorflow/contrib/quantize/__init__.py
+++ b/tensorflow/contrib/quantize/__init__.py
@@ -27,6 +27,8 @@ from tensorflow.python.util.all_util import remove_undocumented
 _allowed_symbols = [
     "create_eval_graph",
     "create_training_graph",
+    "experimental_create_eval_graph",
+    "experimental_create_training_graph",
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/quantize/python/common.py b/tensorflow/contrib/quantize/python/common.py
index d0b0674c31239ee903f5ab7ef9ae0262bb20d189..3a1fa61e43986af1a1315d5a9e6f010e802ea157 100644
--- a/tensorflow/contrib/quantize/python/common.py
+++ b/tensorflow/contrib/quantize/python/common.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Constants used across this package."""
+"""Common utilities used across this package."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,6 +21,12 @@ from __future__ import print_function
 import collections
 import re
 
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+
 # Skip all operations that are backprop related or export summaries.
 SKIPPED_PREFIXES = (
     'gradients/', 'RMSProp/', 'Adagrad/', 'Const_', 'HistogramSummary',
@@ -86,3 +92,31 @@ def _GetOperationByNameDontThrow(graph, name):
     return graph.get_operation_by_name(name)
   except KeyError:
     return None
+
+
+def CreateOrGetQuantizationStep():
+  """Returns a Tensor of the number of steps the quantized graph has run.
+
+  Returns:
+    Quantization step Tensor.
+  """
+  quantization_step_name = 'fake_quantization_step'
+  quantization_step_tensor_name = quantization_step_name + '/AssignAdd:0'
+  g = ops.get_default_graph()
+  try:
+    return g.get_tensor_by_name(quantization_step_tensor_name)
+  except KeyError:
+    # Create in proper graph and base name_scope.
+    with g.name_scope(None):
+      quantization_step_tensor = variable_scope.get_variable(
+          quantization_step_name,
+          shape=[],
+          dtype=dtypes.int64,
+          initializer=init_ops.zeros_initializer(),
+          trainable=False,
+          collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+      with g.name_scope(quantization_step_tensor.op.name + '/'):
+        # We return the incremented variable tensor. Since this is used in conds
+        # for quant_delay and freeze_bn_delay, it will run once per graph
+        # execution.
+        return state_ops.assign_add(quantization_step_tensor, 1)
diff --git a/tensorflow/contrib/quantize/python/common_test.py b/tensorflow/contrib/quantize/python/common_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6237fe5e38d905bf262d7be3746b9ee6046da47
--- /dev/null
+++ b/tensorflow/contrib/quantize/python/common_test.py
@@ -0,0 +1,59 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for common utilities in this package."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.quantize.python import common
+from tensorflow.python.client import session
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+
+
+class CommonTest(test_util.TensorFlowTestCase):
+
+  def testCreateOrGetQuantizationStep(self):
+    g = ops.Graph()
+    with session.Session(graph=g) as sess:
+      quantization_step_tensor = common.CreateOrGetQuantizationStep()
+
+      # Check that operations are added to the graph.
+      num_nodes = len(g.get_operations())
+      self.assertGreater(num_nodes, 0)
+
+      # Check that getting the quantization step doesn't change the graph.
+      get_quantization_step_tensor = common.CreateOrGetQuantizationStep()
+      self.assertEqual(quantization_step_tensor, get_quantization_step_tensor)
+      self.assertEqual(num_nodes, len(g.get_operations()))
+
+      # Ensure that running the graph increments the quantization step.
+      sess.run(variables.global_variables_initializer())
+      step_val = sess.run(quantization_step_tensor)
+      self.assertEqual(step_val, 1)
+
+      # Ensure that even running a graph that depends on the quantization step
+      # multiple times only executes it once.
+      a = quantization_step_tensor + 1
+      b = a + quantization_step_tensor
+      _, step_val = sess.run([b, quantization_step_tensor])
+      self.assertEqual(step_val, 2)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/contrib/quantize/python/copy_graph_test.py b/tensorflow/contrib/quantize/python/copy_graph_test.py
deleted file mode 100644
index 7ff9ad9f8412d7076bf12d6cf10772244444013f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/quantize/python/copy_graph_test.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for copy_graph."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.quantize.python import copy_graph
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import googletest
-
-
-class CopyGraphTest(test_util.TensorFlowTestCase):
-
-  def _CompareNodeInGraph(self, node, graph):
-    graph_node = graph.get_operation_by_name(node.name)
-    self.assertEqual(str(node.node_def), str(graph_node.node_def))
-
-  def testCopyGraph(self):
-    graph = ops.Graph()
-    with graph.as_default():
-      a = constant_op.constant(1.0)
-      b = variables.Variable(2.0)
-      c = a + b
-    graph_copy = copy_graph.CopyGraph(graph)
-    # Ensure that the three original nodes are in the new graph.
-    # import_meta_graph also adds a saver node to the graph which we don't care
-    # about in this specific use case.
-    for tensor in [a, b, c]:
-      self._CompareNodeInGraph(tensor.op, graph_copy)
-    # Test that the graph collections are the same.
-    for key in graph.get_all_collection_keys():
-      self.assertEqual(
-          len(graph.get_collection(key)),
-          len(graph_copy.get_collection(key)), 'Collection %s differs.')
-
-
-if __name__ == '__main__':
-  googletest.main()
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index 647d4044001f7be701037d07dc46db86c0aa3a0e..75d9eb0e58d96e4bb2946684febd250e2e1a6b4a 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -23,14 +23,18 @@ from tensorflow.contrib import graph_editor
 from tensorflow.contrib.quantize.python import common
 from tensorflow.contrib.quantize.python import graph_matcher
 from tensorflow.contrib.quantize.python import input_to_ops
+from tensorflow.core.framework import attr_value_pb2
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.layers import utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.util import compat
 
 
-def FoldBatchNorms(graph):
+def FoldBatchNorms(graph, is_training, freeze_batch_norm_delay=None):
   """Finds batch norm layers and folds them into preceding layers.
 
   Folding only affects the following layers: Conv2D, fully connected, depthwise
@@ -38,15 +42,22 @@ def FoldBatchNorms(graph):
 
   Args:
     graph: Graph to walk and modify.
-
+    is_training: Bool, true if training.
+    freeze_batch_norm_delay: How many steps to wait before freezing moving mean
+      and variance and using them for batch normalization. This value is used
+      only when is_training is True.
   Raises:
     ValueError: When batch norm folding fails.
   """
-  _FoldFusedBatchNorms(graph)
-  _FoldUnfusedBatchNorms(graph)
+  _FoldFusedBatchNorms(
+      graph, is_training, freeze_batch_norm_delay=freeze_batch_norm_delay)
+  _FoldUnfusedBatchNorms(
+      graph,
+      is_training=is_training,
+      freeze_batch_norm_delay=freeze_batch_norm_delay)
 
 
-def _FoldFusedBatchNorms(graph):
+def _FoldFusedBatchNorms(graph, is_training, freeze_batch_norm_delay):
   """Finds fused batch norm layers and folds them into preceding layers.
 
   Folding only affects the following layers: Conv2D, fully connected, depthwise
@@ -54,6 +65,9 @@ def _FoldFusedBatchNorms(graph):
 
   Args:
     graph: Graph to walk and modify.
+    is_training: Bool, true if training.
+    freeze_batch_norm_delay: How many steps to wait before freezing moving mean
+      and variance and using them for batch normalization.
 
   Raises:
     ValueError: When batch norm folding fails.
@@ -64,35 +78,56 @@ def _FoldFusedBatchNorms(graph):
     # `bn_op`. The '/' (i.e. `sep`) ensures that we reuse the existing scope
     # named `scope`. Otherwise, TF creates a unique scope whose name starts with
     # `scope`.
-    with graph.as_default(), graph.name_scope(scope + sep), ops.device(
-        match.bn_op.device):
-      # new weights = old weights * gamma / sqrt(variance + epsilon)
-      # new biases = -mean * gamma / sqrt(variance + epsilon) + beta
-      multiplier_tensor = match.gamma_tensor * math_ops.rsqrt(
-          match.variance_tensor + match.bn_op.get_attr('epsilon'))
-      bias_tensor = math_ops.subtract(
-          match.beta_tensor, match.mean_tensor * multiplier_tensor, name='bias')
-
-      # The shape of depthwise weights is different, so we need to reshape the
-      # multiplier_tensor to ensure that the scaled_weight_tensor has the
-      # expected shape.
-      if match.layer_op.type == 'DepthwiseConv2dNative':
-        new_shape = [
-            match.weight_tensor.get_shape().as_list()[2],
-            match.weight_tensor.get_shape().as_list()[3]
-        ]
-        multiplier_tensor = array_ops.reshape(
-            multiplier_tensor, new_shape, name='scale_reshape')
-
-      # TODO(suharshs): This naming of the following ops needs to carefully
-      # follow the naming expected by quantize.py. Generalize the quantize code
-      # to not require these delicate naming conventions.
-      scaled_weight_tensor = math_ops.multiply(
-          match.weight_tensor, multiplier_tensor, name='mul_fold')
+    with graph.as_default(), graph.name_scope(scope + sep):
+      with graph.name_scope(scope + sep + 'BatchNorm_Fold' + sep):
+        # new weights = old weights * gamma / sqrt(variance + epsilon)
+        # new biases = -mean * gamma / sqrt(variance + epsilon) + beta
+        multiplier_tensor = match.gamma_tensor * math_ops.rsqrt(
+            match.variance_tensor + match.bn_op.get_attr('epsilon'))
+        bias_tensor = math_ops.subtract(
+            match.beta_tensor,
+            match.mean_tensor * multiplier_tensor,
+            name='bias')
+
+        correction_scale, correction_recip, correction_offset = None, None, None
+        if is_training:
+          correction_scale, correction_recip, correction_offset = (
+              _ComputeBatchNormCorrections(
+                  context='',
+                  match=match,
+                  freeze_batch_norm_delay=freeze_batch_norm_delay,
+                  fused_batch_norm=True))
+        # The shape of depthwise weights is different, so we need to reshape the
+        # multiplier_tensor to ensure that the scaled_weight_tensor has the
+        # expected shape.
+        weights = match.weight_tensor
+        if match.layer_op.type == 'DepthwiseConv2dNative':
+          new_shape = [
+              match.weight_tensor.get_shape().as_list()[2],
+              match.weight_tensor.get_shape().as_list()[3]
+          ]
+          multiplier_tensor = array_ops.reshape(
+              multiplier_tensor, new_shape, name='scale_reshape')
+
+          if correction_scale is not None:
+            correction_scale = array_ops.reshape(
+                correction_scale, new_shape, name='correction_reshape')
+
+      if correction_scale is not None:
+        weights = math_ops.multiply(
+            correction_scale, weights, name='correction_mult')
 
+      scaled_weight_tensor = math_ops.multiply(
+          weights, multiplier_tensor, name='mul_fold')
       new_layer_tensor = _CloneWithNewOperands(
           match.layer_op, match.input_tensor, scaled_weight_tensor)
 
+      if correction_recip is not None:
+        new_layer_tensor = math_ops.multiply(
+            correction_recip, new_layer_tensor, name='post_conv_mul')
+        new_layer_tensor = math_ops.add(new_layer_tensor, (correction_offset),
+                                        'correction_add')
+
       bias_add_tensor = math_ops.add(
           new_layer_tensor, bias_tensor, name='add_fold')
 
@@ -103,36 +138,6 @@ def _FoldFusedBatchNorms(graph):
             'Unexpected inputs to op: %s' % match.output_tensor.name)
 
 
-def _CloneWithNewOperands(layer_op, input_tensor, weight_tensor):
-  """Clones layer_op with input_tensor and weight_tensor as new inputs."""
-  new_layer_name = layer_op.name.split('/')[-1] + '_Fold'
-  if layer_op.type == 'Conv2D':
-    return nn_ops.conv2d(
-        input_tensor,
-        weight_tensor,
-        strides=layer_op.get_attr('strides'),
-        padding=layer_op.get_attr('padding'),
-        use_cudnn_on_gpu=layer_op.get_attr('use_cudnn_on_gpu'),
-        data_format=layer_op.get_attr('data_format'),
-        name=new_layer_name)
-  elif layer_op.type == 'MatMul':
-    return math_ops.matmul(
-        input_tensor,
-        weight_tensor,
-        transpose_a=layer_op.get_attr('transpose_a'),
-        transpose_b=layer_op.get_attr('transpose_b'),
-        name=new_layer_name)
-  elif layer_op.type == 'DepthwiseConv2dNative':
-    return nn.depthwise_conv2d(
-        input_tensor,
-        weight_tensor,
-        strides=layer_op.get_attr('strides'),
-        padding=layer_op.get_attr('padding'),
-        name=new_layer_name)
-  else:
-    raise ValueError('Cannot handle operation of type: %s' % layer_op.type)
-
-
 def _FindFusedBatchNorms(graph):
   """Finds all ops and tensors related to found FusedBatchNorms.
 
@@ -149,37 +154,59 @@ def _FindFusedBatchNorms(graph):
   mean_pattern = graph_matcher.OpTypePattern('*')
   variance_pattern = graph_matcher.OpTypePattern('*')
 
-  conv_pattern = graph_matcher.OpTypePattern(
-      'Conv2D|DepthwiseConv2dNative', inputs=[input_pattern, weight_pattern])
+  moving_average_pattern = graph_matcher.OpTypePattern('*')
+  bn_decay_pattern = graph_matcher.OpTypePattern('*')
+  layer_pattern = graph_matcher.OpTypePattern(
+      'Conv2D|DepthwiseConv2dNative|MatMul',
+      inputs=[input_pattern, weight_pattern])
   # MatMul has a Reshape between it and FusedBatchNorm.
-  matmul_pattern = graph_matcher.OpTypePattern(
-      'MatMul', inputs=[input_pattern, weight_pattern])
   matmul_reshape_pattern = graph_matcher.OpTypePattern(
-      'Reshape', inputs=[matmul_pattern,
+      'Reshape', inputs=[layer_pattern,
                          graph_matcher.OpTypePattern('*')])
 
-  conv_batch_norm_pattern = graph_matcher.OpTypePattern(
+  batch_norm_pattern = graph_matcher.OpTypePattern(
       'FusedBatchNorm',
       inputs=[
-          conv_pattern, gamma_pattern, beta_pattern, mean_pattern,
-          variance_pattern
-      ])
-  matmul_batch_norm_pattern = graph_matcher.OpTypePattern(
-      'FusedBatchNorm',
-      inputs=[
-          matmul_reshape_pattern, gamma_pattern, beta_pattern, mean_pattern,
-          variance_pattern
+          graph_matcher.OneofPattern([matmul_reshape_pattern, layer_pattern]),
+          gamma_pattern, beta_pattern, mean_pattern, variance_pattern
       ])
   matmul_bn_output_reshape_pattern = graph_matcher.OpTypePattern(
-      'Reshape',
-      inputs=[matmul_batch_norm_pattern,
-              graph_matcher.OpTypePattern('*')])
+      'Reshape', inputs=[batch_norm_pattern,
+                         graph_matcher.OpTypePattern('*')])
+
+  bn_matcher = graph_matcher.GraphMatcher(
+      graph_matcher.OneofPattern(
+          [matmul_bn_output_reshape_pattern, batch_norm_pattern]))
 
-  conv_matcher = graph_matcher.GraphMatcher(conv_batch_norm_pattern)
-  matmul_matcher = graph_matcher.GraphMatcher(matmul_bn_output_reshape_pattern)
+  moving_average_sub_pattern = graph_matcher.OpTypePattern(
+      'Sub', inputs=[moving_average_pattern, batch_norm_pattern])
+  moving_average_mul_pattern = graph_matcher.OpTypePattern(
+      'Mul', inputs=[moving_average_sub_pattern, bn_decay_pattern])
+
+  moving_avg_mul_matcher = graph_matcher.GraphMatcher(
+      moving_average_mul_pattern)
+
+  for match_result in bn_matcher.match_graph(graph):
+    moving_mean_tensor = None
+    moving_variance_tensor = None
+    bn_decay_mean_tensor = None
+    bn_decay_var_tensor = None
+    layer_op = match_result.get_op(layer_pattern)
+    layer_tensor = match_result.get_tensor(layer_pattern)
+    bn_op = match_result.get_op(batch_norm_pattern)
+    batch_epsilon_tensor = bn_op.get_attr('epsilon')
+
+    # In the MatMul case, the output of batch norm is reshaped back into a
+    # 2D tensor, so the output_tensor is the output of the Reshape op.
+    output_tensor = bn_op.outputs[0]
+    if layer_op.type == 'MatMul':
+      output_reshape_op = match_result.get_op(matmul_bn_output_reshape_pattern)
+      # If the matcher didn't match matmul_bn_output_reshape, there will be
+      # another match for this 'MatMul' later, so we can skip this one.
+      if output_reshape_op is None:
+        continue
+      output_tensor = output_reshape_op.outputs[0]
 
-  def _GetCommonTensors(match_result):
-    """Gets tensors needed for FusedBatchNormMatch from match_result."""
     input_tensor = match_result.get_tensor(input_pattern)
     weight_tensor = match_result.get_tensor(weight_pattern)
     gamma_tensor = match_result.get_tensor(gamma_pattern)
@@ -191,44 +218,45 @@ def _FindFusedBatchNorms(graph):
     # respectively; when is_training is false, they point to bn_op's inputs.
     is_training = bn_op.get_attr('is_training')
     if is_training:
+      # FusedBatchNormGrad doesn't compute gradients of the batch_mean and
+      # batch_variance outputs, so we need to substitute our own custom
+      # gradient.
+      # TODO(suharshs, raghuramank): Find a way to avoid needing this hack.
+      # pylint: disable=protected-access
+      bn_op._set_attr(
+          '_gradient_op_type',
+          attr_value_pb2.AttrValue(s=compat.as_bytes('FoldFusedBatchNormGrad')))
+      # pylint: enable=protected-access
       mean_tensor = bn_op.outputs[1]
-      variance_tensor = bn_op.outputs[2]
+      # The batch variance used during forward and backward prop is biased,
+      # i.e it is calculated as: V=sum(x(k)-mu)^2/N. For the moving average
+      # calculation, the variance is corrected by the term N/N-1 (Bessel's
+      # correction). The variance tensor read from FuseBatchNorm has bessel's
+      # correction applied, so we undo it here.
+      scope, sep, _ = bn_op.name.rpartition('/')
+      g = ops.get_default_graph()
+      with g.as_default(), g.name_scope(scope + sep):
+        n = math_ops.cast(
+            array_ops.size(layer_tensor) / array_ops.size(mean_tensor),
+            dtypes.float32)
+        variance_tensor = math_ops.multiply(
+            bn_op.outputs[2], (n - 1) / n, name='Undo_Bessel_Correction')
+      # TODO(suharshs): Find a way to get rid of this inner match.
+      for mul_match_result in moving_avg_mul_matcher.match_graph(graph):
+        sub_op = mul_match_result.get_op(moving_average_sub_pattern)
+        if sub_op.inputs[1].name == bn_op.outputs[1].name:
+          # During training: Batch Mean is bn_op.outputs[1]
+          moving_mean_tensor = sub_op.inputs[0]
+          bn_decay_mean_tensor = mul_match_result.get_tensor(bn_decay_pattern)
+        if sub_op.inputs[1].name == bn_op.outputs[2].name:
+          # During training: Batch Var is bn_op.outputs[2]
+          moving_variance_tensor = sub_op.inputs[0]
+          bn_decay_var_tensor = mul_match_result.get_tensor(bn_decay_pattern)
     else:
       mean_tensor = match_result.get_tensor(mean_pattern)
       variance_tensor = match_result.get_tensor(variance_pattern)
-    return (input_tensor, weight_tensor, gamma_tensor, beta_tensor, mean_tensor,
-            variance_tensor)
-
-  for match_result in conv_matcher.match_graph(graph):
-    layer_op = match_result.get_op(conv_pattern)
-    bn_op = match_result.get_op(conv_batch_norm_pattern)
-    # In the case of convolution the output_tensor is the output of bn_op.
-    output_tensor = bn_op.outputs[0]
-
-    (input_tensor, weight_tensor, gamma_tensor, beta_tensor, mean_tensor,
-     variance_tensor) = _GetCommonTensors(match_result)
-    yield _FusedBatchNormMatch(
-        layer_op=layer_op,
-        bn_op=bn_op,
-        output_tensor=output_tensor,
-        input_tensor=input_tensor,
-        weight_tensor=weight_tensor,
-        gamma_tensor=gamma_tensor,
-        beta_tensor=beta_tensor,
-        mean_tensor=mean_tensor,
-        variance_tensor=variance_tensor)
 
-  for match_result in matmul_matcher.match_graph(graph):
-    layer_op = match_result.get_op(matmul_pattern)
-    bn_op = match_result.get_op(matmul_batch_norm_pattern)
-    # In the MatMul case, the output of batch norm is reshaped back into a
-    # 2D tensor, so the output_tensor is the output of the Reshape op.
-    output_reshape_op = match_result.get_op(matmul_bn_output_reshape_pattern)
-    output_tensor = output_reshape_op.outputs[0]
-
-    (input_tensor, weight_tensor, gamma_tensor, beta_tensor, mean_tensor,
-     variance_tensor) = _GetCommonTensors(match_result)
-    yield _FusedBatchNormMatch(
+    yield _BatchNormMatch(
         layer_op=layer_op,
         bn_op=bn_op,
         output_tensor=output_tensor,
@@ -237,63 +265,156 @@ def _FindFusedBatchNorms(graph):
         gamma_tensor=gamma_tensor,
         beta_tensor=beta_tensor,
         mean_tensor=mean_tensor,
-        variance_tensor=variance_tensor)
-
+        variance_tensor=variance_tensor,
+        moving_mean_tensor=moving_mean_tensor,
+        moving_variance_tensor=moving_variance_tensor,
+        bn_decay_mean_tensor=bn_decay_mean_tensor,
+        bn_decay_var_tensor=bn_decay_var_tensor,
+        batch_epsilon_tensor=batch_epsilon_tensor)
+
+
+def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay,
+                                 fused_batch_norm):
+  """Computes batch norm correction params.
+
+     Before batch normalization is frozen:
+     We use batch statistics for batch norm.
+       correction_scale = sigma_b/sigma_mv
+       correction_recip = 1/correction_scale
+       correction_offset = 0
+
+     After batch normalization is frozen:
+      correction_scale = sigma_b/sigma_mv
+      correction_recip = 1
+      correction_offset =  gamma*(mu_b/sigma_b-mu_mv/sigma_mv).
+
+     Batch norm is frozen if global_step > bn_freeze_delay.
+     The corrections ensure that:
+     a) The weights are quantized after scaling by gamma/sigma_mv. This enables
+     smoother training as the scaling on the weights changes slowly, rather than
+     jump across mini-batches
+     b) Changing the values of the corrections allows for one to switch between
+     using batch statistics to using moving mean and average, without requiring
+     changes to batch_norm
 
-class _FusedBatchNormMatch(object):
-  """Contains all information related to a found FusedBatchNorm."""
 
-  def __init__(self, layer_op, bn_op, output_tensor, input_tensor,
-               weight_tensor, gamma_tensor, beta_tensor, mean_tensor,
-               variance_tensor):
-    self._layer_op = layer_op
-    self._bn_op = bn_op
-    self._output_tensor = output_tensor
-    self._input_tensor = input_tensor
-    self._weight_tensor = weight_tensor
-    self._gamma_tensor = gamma_tensor
-    self._beta_tensor = beta_tensor
-    self._mean_tensor = mean_tensor
-    self._variance_tensor = variance_tensor
-
-  @property
-  def layer_op(self):
-    return self._layer_op
-
-  @property
-  def bn_op(self):
-    return self._bn_op
-
-  @property
-  def output_tensor(self):
-    return self._output_tensor
+  Args:
+    context: The scope under which we look for batch norm params
+    match: Object containg required batch norm tensors for correction
+      computation.
+    freeze_batch_norm_delay: Delay in steps at which computation switches
+      from regular batch norm to frozen mean and variance.
+    fused_batch_norm: Bool, true if fused batch norm is used.
 
-  @property
-  def input_tensor(self):
-    return self._input_tensor
+  Returns:
+    A tuple of correction_scale, correction_recip, correction_offset
+  """
 
-  @property
-  def weight_tensor(self):
-    return self._weight_tensor
+  g = ops.get_default_graph()
+  with g.name_scope(context + '/batch_norm_correction'):
+    recip_sigma_mv = math_ops.rsqrt(
+        match.moving_variance_tensor + match.batch_epsilon_tensor)
+    recip_sigma = math_ops.rsqrt(
+        match.variance_tensor + match.batch_epsilon_tensor)
+    correction_scale = math_ops.divide(
+        recip_sigma_mv, recip_sigma, name='scale_compute')
+    correction_scale = array_ops.identity(
+        correction_scale, name='correction_scale')
+    correction_recip = math_ops.reciprocal(
+        correction_scale, name='reciprocal_compute')
+    correction_offset = math_ops.multiply(
+        match.gamma_tensor,
+        match.mean_tensor * recip_sigma -
+        match.moving_mean_tensor * recip_sigma_mv,
+        name='offset_compute')
+
+    if freeze_batch_norm_delay is not None:
+      use_mv_avg = math_ops.greater_equal(
+          common.CreateOrGetQuantizationStep(),
+          freeze_batch_norm_delay,
+          name='use_moving_average')
+    else:
+      use_mv_avg = False
+
+    bn_decay_zero = 0.0
+    bn_decay_mean_consumers = list(match.bn_decay_mean_tensor.consumers())
+    bn_decay_var_consumers = list(match.bn_decay_mean_tensor.consumers())
+
+    bn_decay_mean_out = utils.smart_cond(
+        use_mv_avg,
+        lambda: bn_decay_zero,
+        lambda: match.bn_decay_mean_tensor,
+        name='freeze_moving_mean')
+    graph_editor.reroute_ts(
+        [bn_decay_mean_out], [match.bn_decay_mean_tensor],
+        can_modify=bn_decay_mean_consumers)
+
+    if fused_batch_norm is False:
+      bn_decay_var_consumers = list(match.bn_decay_var_tensor.consumers())
+      bn_decay_var_out = utils.smart_cond(
+          use_mv_avg,
+          lambda: bn_decay_zero,
+          lambda: match.bn_decay_var_tensor,
+          name='freeze_moving_var')
+      graph_editor.reroute_ts(
+          [bn_decay_var_out], [match.bn_decay_var_tensor],
+          can_modify=bn_decay_var_consumers)
+
+    correction_recip = utils.smart_cond(
+        use_mv_avg,
+        lambda: array_ops.ones(correction_scale.shape),
+        lambda: correction_recip,
+        name='correction_recip')
+
+    correction_offset = utils.smart_cond(
+        use_mv_avg,
+        lambda: correction_offset,
+        lambda: array_ops.zeros(correction_offset.shape),
+        name='correction_offset')
+  return correction_scale, correction_recip, correction_offset
 
-  @property
-  def gamma_tensor(self):
-    return self._gamma_tensor
 
-  @property
-  def beta_tensor(self):
-    return self._beta_tensor
+def _CloneWithNewOperands(layer_op, input_tensor, weight_tensor):
+  """Clones layer_op with input_tensor and weight_tensor as new inputs."""
+  new_layer_name = layer_op.name.split('/')[-1] + '_Fold'
+  if layer_op.type == 'Conv2D':
+    return nn_ops.conv2d(
+        input_tensor,
+        weight_tensor,
+        strides=layer_op.get_attr('strides'),
+        padding=layer_op.get_attr('padding'),
+        use_cudnn_on_gpu=layer_op.get_attr('use_cudnn_on_gpu'),
+        data_format=layer_op.get_attr('data_format'),
+        name=new_layer_name)
+  elif layer_op.type == 'MatMul':
+    return math_ops.matmul(
+        input_tensor,
+        weight_tensor,
+        transpose_a=layer_op.get_attr('transpose_a'),
+        transpose_b=layer_op.get_attr('transpose_b'),
+        name=new_layer_name)
+  elif layer_op.type == 'DepthwiseConv2dNative':
+    return nn.depthwise_conv2d(
+        input_tensor,
+        weight_tensor,
+        strides=layer_op.get_attr('strides'),
+        padding=layer_op.get_attr('padding'),
+        name=new_layer_name)
+  else:
+    raise ValueError('Cannot handle operation of type: %s' % layer_op.type)
 
-  @property
-  def mean_tensor(self):
-    return self._mean_tensor
 
-  @property
-  def variance_tensor(self):
-    return self._variance_tensor
+@ops.RegisterGradient('FoldFusedBatchNormGrad')
+def _FoldFusedBatchNormGrad(op, unused_grad_y, grad_mean, grad_var, unused_1,
+                            unused_2):
+  x = op.inputs[0]
+  n = x.get_shape().num_elements() / grad_mean.get_shape().num_elements()
+  dmean_dx = grad_mean / n
+  dvar_dx = 2 * grad_var * (x - op.outputs[1]) / (n - 1)
+  return (dmean_dx + dvar_dx), None, None, None, None
 
 
-def _FoldUnfusedBatchNorms(graph):
+def _FoldUnfusedBatchNorms(graph, is_training, freeze_batch_norm_delay):
   """Finds unfused batch norm layers and folds them into preceding layers.
 
   Folding only affects the following layers: Conv2D, fully connected, depthwise
@@ -301,6 +422,9 @@ def _FoldUnfusedBatchNorms(graph):
 
   Args:
     graph: Graph to walk and modify.
+    is_training: Bool, True if training.
+    freeze_batch_norm_delay: How many steps to wait before freezing moving mean
+      and variance and using them for batch normalization.
 
   Raises:
     ValueError: When batch norm folding fails.
@@ -311,7 +435,12 @@ def _FoldUnfusedBatchNorms(graph):
     has_scaling = _HasScaling(graph, input_to_ops_map, bn)
 
     # The mangling code intimately depends on BatchNorm node's internals.
-    original_op, folded_op = _CreateFoldedOp(graph, bn, has_scaling=has_scaling)
+    original_op, folded_op = _CreateFoldedOp(
+        graph,
+        bn,
+        has_scaling=has_scaling,
+        freeze_batch_norm_delay=freeze_batch_norm_delay,
+        is_training=is_training)
 
     activation = common.GetEndpointActivationOp(graph, bn)
     if activation:
@@ -333,46 +462,84 @@ def _FoldUnfusedBatchNorms(graph):
       raise ValueError('Unexpected inputs to op: %s' % add_bypass.name)
 
 
-def _HasScaling(graph, input_to_ops_map, bn):
-  r"""Checks if batch norm  has scaling enabled.
-
-  Difference between batch norm with scaling and without is that with scaling:
-
-  Rsqrt -> mul -> mul_1
-              \-> mul_2
-
-  where
-    mul multiplies gamma by inverse square root of EMA of batch variance,
-    mul_1 multiplies output of mul with output from the base operation
-      (convolution, FC or depthwise convolution),
-    mul_2 multiplies output of mul with EMA of batch mean,
-  and without scaling:
-
-  Rsqrt -> mul
-       \-> mul_1
-
-  where
-    mul multiplies the inverse square root of EMA of batch variance with output
-      from the base operation,
-    mul_1 multiplies inverse square root of EMA of batch variance with EMA
-      of batch mean.
+def _GetBatchNormParams(graph, context, has_scaling):
+  """Extracts relevant tensors for folding batch norms.
 
   Args:
     graph: Graph to inspect.
-    input_to_ops_map: InputToOps object containing mapping from tensor's name
-      to ops that take it as input.
-    bn: Batch norm layer prefix string.
+    context: The scope under which we look for batch norm params
+    has_scaling: Bool that specifies if scaling is done as part of batch norm.
 
   Returns:
-    A boolean indicating whether this batch norm layer has scaling enabled.
+    _BatchNormMatch containing all required batch norm parameters.
   """
-  rsqrt_op = graph.get_operation_by_name(bn + '/BatchNorm/batchnorm/Rsqrt')
-  rsqrt_consumers = input_to_ops_map.ConsumerOperations(rsqrt_op)
-
-  return sum(1 for op in rsqrt_consumers if op.type == 'Mul') == 1
-
-
-def _CreateFoldedOp(graph, context, has_scaling):
+  gamma_tensor = None
+  batch_mean_tensor = None
+  batch_variance_tensor = None
+  moving_mean_tensor = None
+  moving_variance_tensor = None
+  batch_epsilon_tensor = None
+  bn_decay_mean_tensor = None
+  bn_decay_var_tensor = None
+
+  split_context = context.split('/')
+  base_context = split_context[-1]
+
+  oplist = graph.get_operations()
+  op_suffix_gamma = base_context + '/BatchNorm/gamma'
+  op_suffix_mean = base_context + '/BatchNorm/moments/Squeeze'
+  op_suffix_variance = base_context + '/BatchNorm/moments/Squeeze_1'
+  op_suffix_moving_variance = base_context + '/BatchNorm/moving_variance/read'
+  op_suffix_moving_mean = base_context + '/BatchNorm/moving_mean/read'
+  op_suffix_epsilon = base_context + '/BatchNorm/batchnorm/add/y'
+  op_suffix_bn_decay_mean = base_context + '/BatchNorm/AssignMovingAvg/decay'
+  op_suffix_bn_decay_var = base_context + '/BatchNorm/AssignMovingAvg_1/decay'
+
+  # Parse through list of ops to find relevant ops
+  for op in oplist:
+    if op.name.endswith(op_suffix_mean):
+      # This is an efficient way to check for two things:
+      # Is batch norm present and is it training mode?
+      # Batch statistics are computed only during batch norm in training
+      batch_mean_tensor = graph.get_tensor_by_name(op.name + ':0')
+    if op.name.endswith(op_suffix_variance):
+      batch_variance_tensor = graph.get_tensor_by_name(op.name + ':0')
+    if op.name.endswith(op_suffix_moving_mean):
+      moving_mean_tensor = graph.get_tensor_by_name(op.name + ':0')
+    if op.name.endswith(op_suffix_moving_variance):
+      moving_variance_tensor = graph.get_tensor_by_name(op.name + ':0')
+    if op.name.endswith(op_suffix_epsilon):
+      batch_epsilon_tensor = graph.get_tensor_by_name(op.name + ':0')
+    if op.name.endswith(op_suffix_bn_decay_mean):
+      bn_decay_mean_tensor = graph.get_tensor_by_name(op.name + ':0')
+    if op.name.endswith(op_suffix_bn_decay_var):
+      bn_decay_var_tensor = graph.get_tensor_by_name(op.name + ':0')
+    if has_scaling:
+      if op.name.endswith(op_suffix_gamma):
+        gamma_tensor = graph.get_tensor_by_name(op.name + ':0')
+
+  if not has_scaling:
+    gamma_tensor = array_ops.ones(batch_mean_tensor.shape)
+
+  return _BatchNormMatch(
+      layer_op=None,
+      bn_op=None,
+      output_tensor=None,
+      input_tensor=None,
+      weight_tensor=None,
+      gamma_tensor=gamma_tensor,
+      beta_tensor=None,
+      mean_tensor=batch_mean_tensor,
+      variance_tensor=batch_variance_tensor,
+      moving_mean_tensor=moving_mean_tensor,
+      moving_variance_tensor=moving_variance_tensor,
+      bn_decay_mean_tensor=bn_decay_mean_tensor,
+      bn_decay_var_tensor=bn_decay_var_tensor,
+      batch_epsilon_tensor=batch_epsilon_tensor)
+
+
+def _CreateFoldedOp(graph, context, has_scaling, freeze_batch_norm_delay,
+                    is_training):
   """Folds in batch norm layer into preceding convolution or FC layer.
 
   Creates 3 new nodes, connects their inputs and adds them to the graph:
@@ -382,17 +549,20 @@ def _CreateFoldedOp(graph, context, has_scaling):
   Args:
     graph: Graph to modify.
     context: String, batch norm context, i.e. node into which BatchNorm is
-        nested.
+      nested.
     has_scaling: Whether the batch norm has scaling enabled.
+    freeze_batch_norm_delay: How many steps to wait before freezing moving mean
+      and variance and using them for batch normalization.
+    is_training: Bool, true if training.
 
   Raises:
     ValueError: When operation type is not supported, or input and output tensor
-        shapes mismatch for created operations: mul_fold, add_fold.
+      shapes mismatch for created operations: mul_fold, add_fold.
 
   Returns:
     A pair of Operations, the first is the original consumer node of the batch
-        norm (../BatchNorm/batchnorm/add_1), the second is the consumer node of
-        the folded graph (add_fold).
+      norm (../BatchNorm/batchnorm/add_1), the second is the consumer node of
+      the folded graph (add_fold).
   """
   mul_scale_name = 'mul_1' if has_scaling else 'mul'
   mul_scale = graph.get_operation_by_name(context +
@@ -400,19 +570,43 @@ def _CreateFoldedOp(graph, context, has_scaling):
                                           mul_scale_name)
   op_below = mul_scale.inputs[0].op
   weights = op_below.inputs[1]
-
+  match = _GetBatchNormParams(
+      graph=graph, context=context, has_scaling=has_scaling)
+  correction_scale, correction_recip, correction_offset = None, None, None
+  if is_training:
+    correction_scale, correction_recip, correction_offset = (
+        _ComputeBatchNormCorrections(
+            context=context,
+            match=match,
+            freeze_batch_norm_delay=freeze_batch_norm_delay,
+            fused_batch_norm=False))
   # Special handling for weights of depthwise convolution.
   if op_below.type == 'DepthwiseConv2dNative':
-    new_shape = [weights.get_shape().as_list()[2],
-                 weights.get_shape().as_list()[3]]
+    new_shape = [
+        weights.get_shape().as_list()[2],
+        weights.get_shape().as_list()[3]
+    ]
     scale_name = 'mul' if has_scaling else 'Rsqrt'
-    scale = graph.get_operation_by_name(context + '/BatchNorm/batchnorm/' +
-                                        scale_name)
+    scale = graph.get_operation_by_name(
+        context + '/BatchNorm/batchnorm/' + scale_name)
     scale = array_ops.reshape(scale.outputs[0], new_shape,
                               context + '/scale_reshape')
-    mul_fold = _CloneOp(mul_scale, context + '/mul_fold',
-                        [(0, weights), (1, scale)])
+
+    if correction_scale is not None:
+      correction_scale = array_ops.reshape(correction_scale, new_shape,
+                                           context + '/correction_reshape')
+      with ops.device(mul_scale.device):
+        weights = math_ops.multiply(correction_scale, weights,
+                                    context + '/correction_mult')
+
+    mul_fold = _CloneOp(mul_scale, context + '/mul_fold', [(0, weights),
+                                                           (1, scale)])
   elif op_below.type in ['Conv2D', 'MatMul']:
+
+    if correction_scale is not None:
+      with ops.device(mul_scale.device):
+        weights = math_ops.multiply(correction_scale, weights,
+                                    context + '/correction_mult')
     mul_fold = _CloneOp(mul_scale, context + '/mul_fold', [(0, weights)])
   else:
     raise ValueError('Cannot handle operation of type: %s' % op_below.op)
@@ -421,10 +615,17 @@ def _CreateFoldedOp(graph, context, has_scaling):
   conv_or_fc_folded = _CloneOp(op_below, op_below.name + '_Fold',
                                [(1, mul_fold.outputs[0])])
 
-  add_shift = graph.get_operation_by_name(context +
-                                          '/BatchNorm/batchnorm/add_1')
-  add_fold = _CloneOp(add_shift, context + '/add_fold',
-                      [(0, conv_or_fc_folded.outputs[0])])
+  add_shift = graph.get_operation_by_name(
+      context + '/BatchNorm/batchnorm/add_1')
+
+  corrected_output = conv_or_fc_folded.outputs[0]
+  if correction_offset is not None:
+    with ops.device(conv_or_fc_folded.device):
+      corrected_output = math_ops.multiply(correction_recip, corrected_output,
+                                           context + '/post_conv_mul')
+      corrected_output = math_ops.add(corrected_output, (correction_offset),
+                                      context + '/correction_add')
+  add_fold = _CloneOp(add_shift, context + '/add_fold', [(0, corrected_output)])
   _AssertShapesMatch('add_fold', add_fold.inputs[0], add_fold.outputs[0])
   return add_shift, add_fold
 
@@ -436,7 +637,7 @@ def _CloneOp(op, new_name, new_inputs):
     op: Operation to modify.
     new_name: String, a new name to set on cloned op.
     new_inputs: A list of tuples (idx, tensor), each input with corresponding
-        index will be replaced by the given Tensor in the cloned op.
+      index will be replaced by the given Tensor in the cloned op.
 
   Returns:
     Operation, the cloned op.
@@ -568,3 +769,121 @@ def _AssertShapesMatch(op_name, in_tensor, out_tensor):
   if not in_shape.is_compatible_with(out_shape):
     raise ValueError('%s should not change tensor shape: input %s, '
                      'output %s' % (op_name, in_shape, out_shape))
+
+
+def _HasScaling(graph, input_to_ops_map, bn):
+  r"""Checks if batch norm  has scaling enabled.
+
+  Difference between batch norm with scaling and without is that with scaling:
+
+  Rsqrt -> mul -> mul_1
+              \-> mul_2
+
+  where
+    mul multiplies gamma by inverse square root of EMA of batch variance,
+    mul_1 multiplies output of mul with output from the base operation
+      (convolution, FC or depthwise convolution),
+    mul_2 multiplies output of mul with EMA of batch mean,
+  and without scaling:
+
+  Rsqrt -> mul
+       \-> mul_1
+
+  where
+    mul multiplies the inverse square root of EMA of batch variance with output
+      from the base operation,
+    mul_1 multiplies inverse square root of EMA of batch variance with EMA
+      of batch mean.
+
+  Args:
+    graph: Graph to inspect.
+    input_to_ops_map: InputToOps object containing mapping from tensor's name
+      to ops that take it as input.
+    bn: Batch norm layer prefix string.
+
+  Returns:
+    A boolean indicating whether this batch norm layer has scaling enabled.
+  """
+  rsqrt_op = graph.get_operation_by_name(bn + '/BatchNorm/batchnorm/Rsqrt')
+  rsqrt_consumers = input_to_ops_map.ConsumerOperations(rsqrt_op)
+
+  return sum(1 for op in rsqrt_consumers if op.type == 'Mul') == 1
+
+
+class _BatchNormMatch(object):
+  """Contains all information related to a found Fused/UnfusedBatchNorm."""
+
+  def __init__(self, layer_op, bn_op, output_tensor, input_tensor,
+               weight_tensor, gamma_tensor, beta_tensor, mean_tensor,
+               variance_tensor, moving_mean_tensor, moving_variance_tensor,
+               bn_decay_mean_tensor, bn_decay_var_tensor, batch_epsilon_tensor):
+    self._layer_op = layer_op
+    self._bn_op = bn_op
+    self._output_tensor = output_tensor
+    self._input_tensor = input_tensor
+    self._weight_tensor = weight_tensor
+    self._gamma_tensor = gamma_tensor
+    self._beta_tensor = beta_tensor
+    self._mean_tensor = mean_tensor
+    self._variance_tensor = variance_tensor
+    self._moving_mean_tensor = moving_mean_tensor
+    self._moving_variance_tensor = moving_variance_tensor
+    self._bn_decay_mean_tensor = bn_decay_mean_tensor
+    self._bn_decay_var_tensor = bn_decay_var_tensor
+    self._batch_epsilon_tensor = batch_epsilon_tensor
+
+  @property
+  def layer_op(self):
+    return self._layer_op
+
+  @property
+  def bn_op(self):
+    return self._bn_op
+
+  @property
+  def output_tensor(self):
+    return self._output_tensor
+
+  @property
+  def input_tensor(self):
+    return self._input_tensor
+
+  @property
+  def weight_tensor(self):
+    return self._weight_tensor
+
+  @property
+  def gamma_tensor(self):
+    return self._gamma_tensor
+
+  @property
+  def beta_tensor(self):
+    return self._beta_tensor
+
+  @property
+  def mean_tensor(self):
+    return self._mean_tensor
+
+  @property
+  def variance_tensor(self):
+    return self._variance_tensor
+
+  @property
+  def moving_mean_tensor(self):
+    return self._moving_mean_tensor
+
+  @property
+  def moving_variance_tensor(self):
+    return self._moving_variance_tensor
+
+  @property
+  def batch_epsilon_tensor(self):
+    return self._batch_epsilon_tensor
+
+  @property
+  def bn_decay_mean_tensor(self):
+    return self._bn_decay_mean_tensor
+
+  @property
+  def bn_decay_var_tensor(self):
+    return self._bn_decay_var_tensor
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
index 2cecf6851467f82675bd67bf1fb108e9a39df1b0..c90a18ab0357f1bcbc5d8ccd48edf894d7baf5f9 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
@@ -20,14 +20,20 @@ from __future__ import print_function
 
 from tensorflow.contrib.layers.python.layers import layers
 from tensorflow.contrib.quantize.python import fold_batch_norms
+from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
+from tensorflow.python.training import saver as saver_lib
 
 batch_norm = layers.batch_norm
 conv2d = layers.conv2d
@@ -40,26 +46,27 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
 
   def _RunTestOverParameters(self, test_fn):
     parameters_list = [
-        # (relu, relu_op_name, with_bypass, has_scaling, fused_batch_norm)
-        (nn_ops.relu6, 'Relu6', False, False, False),
-        (nn_ops.relu, 'Relu', False, False, False),
-        (nn_ops.relu6, 'Relu6', True, False, False),
-        (nn_ops.relu, 'Relu', True, False, False),
-        (nn_ops.relu6, 'Relu6', False, True, False),
-        (nn_ops.relu, 'Relu', False, True, False),
-        (nn_ops.relu6, 'Relu6', True, True, False),
-        (nn_ops.relu, 'Relu', True, True, False),
+        # (relu, relu_op_name, with_bypass, has_scaling, fused_batch_norm,
+        # freeze_batch_norm_delay)
+        (nn_ops.relu6, 'Relu6', False, False, False, 100),
+        (nn_ops.relu, 'Relu', False, False, False, None),
+        (nn_ops.relu6, 'Relu6', True, False, False, 100),
+        (nn_ops.relu, 'Relu', True, False, False, None),
+        (nn_ops.relu6, 'Relu6', False, True, False, 100),
+        (nn_ops.relu, 'Relu', False, True, False, None),
+        (nn_ops.relu6, 'Relu6', True, True, False, 100),
+        (nn_ops.relu, 'Relu', True, True, False, None),
         # Fused batch norm always has scaling enabled.
-        (nn_ops.relu6, 'Relu6', False, True, True),
-        (nn_ops.relu, 'Relu', False, True, True),
-        (nn_ops.relu6, 'Relu6', True, True, True),
-        (nn_ops.relu, 'Relu', True, True, True),
+        (nn_ops.relu6, 'Relu6', False, True, True, None),
+        (nn_ops.relu, 'Relu', False, True, True, 100),
+        (nn_ops.relu6, 'Relu6', True, True, True, None),
+        (nn_ops.relu, 'Relu', True, True, True, 100),
     ]
     for params in parameters_list:
-      test_fn(params[0], params[1], params[2], params[3], params[4])
+      test_fn(params[0], params[1], params[2], params[3], params[4], params[5])
 
   def _TestFoldConv2d(self, relu, relu_op_name, with_bypass, has_scaling,
-                      fused_batch_norm):
+                      fused_batch_norm, freeze_batch_norm_delay):
     """Tests folding cases: inputs -> Conv2d with batch norm -> Relu*.
 
     Args:
@@ -69,6 +76,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         inputs to just before Relu*.
       has_scaling: Bool, when true the batch norm has scaling.
       fused_batch_norm: Bool, when true the batch norm is fused.
+      freeze_batch_norm_delay: None or the number of steps after which training
+      switches to using frozen mean and variance
     """
     g = ops.Graph()
     with g.as_default():
@@ -93,12 +102,13 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         node = math_ops.add(inputs, node, name='test/Add')
         relu(node, name='test/' + relu_op_name)
 
-      fold_batch_norms.FoldBatchNorms(g)
+      fold_batch_norms.FoldBatchNorms(
+          g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay)
 
     folded_mul = g.get_operation_by_name(scope + '/mul_fold')
     self.assertEqual(folded_mul.type, 'Mul')
     self._AssertInputOpsAre(folded_mul, [
-        scope + '/weights/read',
+        scope + '/correction_mult',
         self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm)
     ])
     self._AssertOutputGoesToOps(folded_mul, g, [scope + '/Conv2D_Fold'])
@@ -107,12 +117,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     self.assertEqual(folded_conv.type, 'Conv2D')
     self._AssertInputOpsAre(folded_conv,
                             [scope + '/mul_fold', inputs.op.name])
-    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/add_fold'])
+    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul'])
 
     folded_add = g.get_operation_by_name(scope + '/add_fold')
     self.assertEqual(folded_add.type, 'Add')
     self._AssertInputOpsAre(folded_add, [
-        scope + '/Conv2D_Fold',
+        scope + '/correction_add',
         self._BathNormBiasName(scope, fused_batch_norm)
     ])
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
@@ -122,7 +132,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     self._RunTestOverParameters(self._TestFoldConv2d)
 
   def _TestFoldConv2dUnknownShape(self, relu, relu_op_name, with_bypass,
-                                  has_scaling, fused_batch_norm):
+                                  has_scaling, fused_batch_norm,
+                                  freeze_batch_norm_delay):
     """Tests folding cases: inputs -> Conv2d with batch norm -> Relu*.
 
     Tests that folding works even with an input shape where some dimensions are
@@ -135,6 +146,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         inputs to just before Relu*.
       has_scaling: Bool, when true the batch norm has scaling.
       fused_batch_norm: Bool, when true the batch norm is fused.
+      freeze_batch_norm_delay: None or the number of steps after which training
+      switches to using frozen mean and variance
     """
     g = ops.Graph()
     with g.as_default():
@@ -158,12 +171,13 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         node = math_ops.add(inputs, node, name='test/Add')
         relu(node, name='test/' + relu_op_name)
 
-      fold_batch_norms.FoldBatchNorms(g)
+      fold_batch_norms.FoldBatchNorms(
+          g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay)
 
     folded_mul = g.get_operation_by_name(scope + '/mul_fold')
     self.assertEqual(folded_mul.type, 'Mul')
     self._AssertInputOpsAre(folded_mul, [
-        scope + '/weights/read',
+        scope + '/correction_mult',
         self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm)
     ])
     self._AssertOutputGoesToOps(folded_mul, g, [scope + '/Conv2D_Fold'])
@@ -171,12 +185,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     folded_conv = g.get_operation_by_name(scope + '/Conv2D_Fold')
     self.assertEqual(folded_conv.type, 'Conv2D')
     self._AssertInputOpsAre(folded_conv, [scope + '/mul_fold', inputs.op.name])
-    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/add_fold'])
+    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul'])
 
     folded_add = g.get_operation_by_name(scope + '/add_fold')
     self.assertEqual(folded_add.type, 'Add')
     self._AssertInputOpsAre(folded_add, [
-        scope + '/Conv2D_Fold',
+        scope + '/correction_add',
         self._BathNormBiasName(scope, fused_batch_norm)
     ])
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
@@ -186,7 +200,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     self._RunTestOverParameters(self._TestFoldConv2dUnknownShape)
 
   def _TestFoldFullyConnectedLayer(self, relu, relu_op_name, with_bypass,
-                                   has_scaling, fused_batch_norm):
+                                   has_scaling, fused_batch_norm,
+                                   freeze_batch_norm_delay):
     """Tests folding cases: inputs -> FC with batch norm -> Relu*.
 
     Args:
@@ -196,6 +211,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         inputs to just before Relu*.
       has_scaling: Bool, when true the batch norm has scaling.
       fused_batch_norm: Bool, when true the batch norm is fused.
+      freeze_batch_norm_delay: None or the number of steps after which training
+      switches to using frozen mean and variance
     """
     g = ops.Graph()
     with g.as_default():
@@ -217,12 +234,13 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         node = math_ops.add(inputs, node, name='test/Add')
         relu(node, name='test/' + relu_op_name)
 
-      fold_batch_norms.FoldBatchNorms(g)
+      fold_batch_norms.FoldBatchNorms(
+          g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay)
 
     folded_mul = g.get_operation_by_name(scope + '/mul_fold')
     self.assertEqual(folded_mul.type, 'Mul')
     self._AssertInputOpsAre(folded_mul, [
-        scope + '/weights/read',
+        scope + '/correction_mult',
         self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm)
     ])
     self._AssertOutputGoesToOps(folded_mul, g, [scope + '/MatMul_Fold'])
@@ -231,12 +249,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     self.assertEqual(folded_conv.type, 'MatMul')
     self._AssertInputOpsAre(folded_conv,
                             [scope + '/mul_fold', inputs.op.name])
-    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/add_fold'])
+    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul'])
 
     folded_add = g.get_operation_by_name(scope + '/add_fold')
     self.assertEqual(folded_add.type, 'Add')
     self._AssertInputOpsAre(folded_add, [
-        scope + '/MatMul_Fold',
+        scope + '/correction_add',
         self._BathNormBiasName(scope, fused_batch_norm)
     ])
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
@@ -246,7 +264,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     self._RunTestOverParameters(self._TestFoldFullyConnectedLayer)
 
   def _TestFoldDepthwiseConv2d(self, relu, relu_op_name, with_bypass,
-                               has_scaling, fused_batch_norm):
+                               has_scaling, fused_batch_norm,
+                               freeze_batch_norm_delay):
     """Tests folding: inputs -> DepthwiseConv2d with batch norm -> Relu*.
 
     Args:
@@ -256,6 +275,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         inputs to just before Relu*.
       has_scaling: Bool, when true the batch norm has scaling.
       fused_batch_norm: Bool, when true the batch norm is fused.
+      freeze_batch_norm_delay: None or the number of steps after which training
+      switches to using frozen mean and variance
     """
     g = ops.Graph()
     with g.as_default():
@@ -280,20 +301,24 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
         node = math_ops.add(inputs, node, name='test/Add')
         relu(node, name='test/' + relu_op_name)
 
-      fold_batch_norms.FoldBatchNorms(g)
+      fold_batch_norms.FoldBatchNorms(
+          g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay)
 
     folded_mul = g.get_operation_by_name(scope + '/mul_fold')
     self.assertEqual(folded_mul.type, 'Mul')
+    if fused_batch_norm:
+      scale_reshape_op_name = scope + '/BatchNorm_Fold/scale_reshape'
+    else:
+      scale_reshape_op_name = scope + '/scale_reshape'
     self._AssertInputOpsAre(folded_mul,
-                            [scope + '/depthwise_weights/read',
-                             scope + '/scale_reshape'])
+                            [scope + '/correction_mult', scale_reshape_op_name])
     self._AssertOutputGoesToOps(folded_mul, g, [scope + '/depthwise_Fold'])
 
-    scale_reshape = g.get_operation_by_name(scope + '/scale_reshape')
+    scale_reshape = g.get_operation_by_name(scale_reshape_op_name)
     self.assertEqual(scale_reshape.type, 'Reshape')
     self._AssertInputOpsAre(scale_reshape, [
         self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm),
-        scope + '/scale_reshape/shape'
+        scale_reshape_op_name + '/shape'
     ])
     self._AssertOutputGoesToOps(scale_reshape, g, [scope + '/mul_fold'])
 
@@ -301,12 +326,12 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     self.assertEqual(folded_conv.type, 'DepthwiseConv2dNative')
     self._AssertInputOpsAre(folded_conv,
                             [scope + '/mul_fold', inputs.op.name])
-    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/add_fold'])
+    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul'])
 
     folded_add = g.get_operation_by_name(scope + '/add_fold')
     self.assertEqual(folded_add.type, 'Add')
     self._AssertInputOpsAre(folded_add, [
-        scope + '/depthwise_Fold',
+        scope + '/correction_add',
         self._BathNormBiasName(scope, fused_batch_norm)
     ])
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
@@ -315,6 +340,72 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
   def testFoldDepthwiseConv2d(self):
     self._RunTestOverParameters(self._TestFoldDepthwiseConv2d)
 
+  def _TestCompareFoldAndUnfolded(self, relu, relu_op_name, with_bypass,
+                                  has_scaling, fused_batch_norm,
+                                  freeze_batch_norm_delay):
+    """Tests that running folded and unfolded BN returns the same results.
+
+    Args:
+      relu: Callable that returns an Operation, a factory method for the Relu*.
+      relu_op_name: String, name of the Relu* operation.
+      with_bypass: Bool, when true there is an extra connection added from
+        inputs to just before Relu*.
+      has_scaling: Bool, when true the batch norm has scaling.
+      fused_batch_norm: Bool, when true the batch norm is fused.
+      freeze_batch_norm_delay: None or the number of steps after which training
+      switches to using frozen mean and variance
+    """
+    random_seed.set_random_seed(1234)
+    unfolded_g = ops.Graph()
+    with unfolded_g.as_default():
+      batch_size, height, width = 5, 128, 128
+      inputs = random_ops.random_uniform(
+          (batch_size, height, width, 3), dtype=dtypes.float32, seed=1234)
+      out_depth = 3 if with_bypass else 32
+      stride = 1 if with_bypass else 2
+      activation_fn = None if with_bypass else relu
+      scope = 'test/test2' if with_bypass else 'test'
+      node = conv2d(
+          inputs,
+          out_depth, [5, 5],
+          stride=stride,
+          padding='SAME',
+          weights_initializer=self._WeightInit(0.09),
+          activation_fn=activation_fn,
+          normalizer_fn=batch_norm,
+          normalizer_params=self._BatchNormParams(
+              scale=has_scaling, fused=fused_batch_norm),
+          scope=scope)
+      if with_bypass:
+        node = math_ops.add(inputs, node, name='test/Add')
+      relu_node = relu(node, name='test/' + relu_op_name)
+    folded_g = self._CopyGraph(unfolded_g)
+    with folded_g.as_default():
+      fold_batch_norms.FoldBatchNorms(
+          folded_g,
+          is_training=True,
+          freeze_batch_norm_delay=freeze_batch_norm_delay)
+    with session.Session(graph=unfolded_g) as sess:
+      sess.run(variables.global_variables_initializer())
+      grad_node = gradients.gradients(relu_node, inputs)
+      results = sess.run([relu_node, grad_node])
+      unfolded_forward, unfolded_backward = results[0], results[1]
+
+    with session.Session(graph=folded_g) as sess:
+      sess.run(variables.global_variables_initializer())
+      relu_node = folded_g.get_tensor_by_name(relu_node.name)
+      inputs = folded_g.get_tensor_by_name(inputs.name)
+      grad_node = gradients.gradients(relu_node, inputs)
+      results = sess.run([relu_node, grad_node])
+      folded_forward, folded_backward = results[0], results[1]
+
+    # Check that the folded and unfolded results match.
+    self.assertAllClose(unfolded_forward, folded_forward, atol=1e-3)
+    self.assertAllClose(unfolded_backward, folded_backward, atol=1e-3)
+
+  def testCompareFoldAndUnfolded(self):
+    self._RunTestOverParameters(self._TestCompareFoldAndUnfolded)
+
   def _BatchNormParams(self, scale=True, fused=False):
     return {
         'center': True,
@@ -326,13 +417,13 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
   def _BatchNormMultiplierName(self, scope, has_scaling, fused):
     if has_scaling:
       if fused:
-        return scope + '/mul'
+        return scope + '/BatchNorm_Fold/mul'
       return scope + '/BatchNorm/batchnorm/mul'
     return scope + '/BatchNorm/batchnorm/Rsqrt'
 
   def _BathNormBiasName(self, scope, fused):
     if fused:
-      return scope + '/bias'
+      return scope + '/BatchNorm_Fold/bias'
     return scope + '/BatchNorm/batchnorm/sub'
 
   def _WeightInit(self, stddev):
@@ -346,7 +437,7 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     Returns:
       An initializer that initializes with a truncated normal variable.
     """
-    return init_ops.truncated_normal_initializer(stddev=stddev)
+    return init_ops.truncated_normal_initializer(stddev=stddev, seed=1234)
 
   def _AssertInputOpsAre(self, op, in_op_names):
     """Asserts that all inputs to op come from in_op_names (disregarding order).
@@ -371,5 +462,15 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
       out_op = graph.get_operation_by_name(out_op_name)
       self.assertIn(op.outputs[0].name, [str(t.name) for t in out_op.inputs])
 
+  def _CopyGraph(self, graph):
+    """Return a copy of graph."""
+    meta_graph = saver_lib.export_meta_graph(
+        graph=graph, collection_list=graph.get_all_collection_keys())
+    graph_copy = ops.Graph()
+    with graph_copy.as_default():
+      _ = saver_lib.import_meta_graph(meta_graph)
+    return graph_copy
+
+
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/contrib/quantize/python/graph_matcher.py b/tensorflow/contrib/quantize/python/graph_matcher.py
index e3581cc55905a0af7d0464bc0ec673d3ed7f0363..b458f039df0523b5b8b07cff7d14643154124b95 100644
--- a/tensorflow/contrib/quantize/python/graph_matcher.py
+++ b/tensorflow/contrib/quantize/python/graph_matcher.py
@@ -18,8 +18,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import abc
 
-class OpTypePattern(object):
+
+class Pattern(object):
+  """The parent class of all patterns (e.g. OpTypePattern and OneofPattern)."""
+
+  @abc.abstractmethod
+  def match(self, op, tensor):
+    """Returns the result of matching op/tensor against this pattern."""
+    raise NotImplementedError('Method "match" not implemented.')
+
+
+class OpTypePattern(Pattern):
   """A tree pattern that matches TF expressions with certain op types."""
 
   def __init__(self, op_type, name=None, inputs=None):
@@ -34,7 +45,7 @@ class OpTypePattern(object):
         similar TF op types.
       name: Optional string. The name of the pattern that can be looked up in
         MatchResult.
-      inputs: Optional list of `OpTypePattern`s or strings that specify the
+      inputs: Optional list of `Pattern`s or strings that specify the
         patterns for the inputs of a matching op. If None, this pattern accepts
         any inputs of a matching op.
     """
@@ -43,22 +54,51 @@ class OpTypePattern(object):
     if inputs is None:
       inputs = []
     self._inputs = [
-        input_pattern if isinstance(input_pattern, OpTypePattern) else
-        OpTypePattern(input_pattern) for input_pattern in inputs
+        input_pattern
+        if isinstance(input_pattern, Pattern) else OpTypePattern(input_pattern)
+        for input_pattern in inputs
     ]
 
-  @property
-  def op_type(self):
-    return self._op_type
-
-  @property
-  def inputs(self):
-    return self._inputs
-
   @property
   def name(self):
     return self._name
 
+  def match(self, op, tensor):
+    if self._op_type != '*':
+      if op.type not in self._op_type.split('|'):
+        return None
+
+    match_result = MatchResult()
+    match_result.add(self, op, tensor)
+
+    if not self._inputs:
+      # If pattern.inputs is empty, skips the rest and accepts all the inputs.
+      return match_result
+
+    if len(op.inputs) != len(self._inputs):
+      return None
+
+    for input_tensor, input_pattern in zip(op.inputs, self._inputs):
+      input_match_result = input_pattern.match(input_tensor.op, input_tensor)
+      if input_match_result is None:
+        return None
+      match_result.merge_from(input_match_result)
+    return match_result
+
+
+class OneofPattern(Pattern):
+  """Matches one of the given sub-patterns."""
+
+  def __init__(self, sub_patterns):
+    self._sub_patterns = sub_patterns
+
+  def match(self, op, tensor):
+    for sub_pattern in self._sub_patterns:
+      match_result = sub_pattern.match(op, tensor)
+      if match_result is not None:
+        return match_result
+    return None
+
 
 class MatchResult(object):
   r"""Encapsulates the result of a match done by GraphMatcher.
@@ -102,16 +142,36 @@ class MatchResult(object):
       return pattern_or_name
 
     if isinstance(pattern_or_name, str):
+      if pattern_or_name not in self._name_to_pattern:
+        return None
       return self._name_to_pattern[pattern_or_name]
 
     raise ValueError('pattern_or_name has type %s. Expect OpTypePattern or str.'
                      % type(pattern_or_name))
 
+  def _get_op_tensor(self, pattern_or_name):
+    pattern = self._to_pattern(pattern_or_name)
+    if pattern is None:
+      return None
+
+    if pattern not in self._pattern_to_op_tensor:
+      return None
+
+    return self._pattern_to_op_tensor[pattern]
+
   def get_op(self, pattern_or_name):
-    return self._pattern_to_op_tensor[self._to_pattern(pattern_or_name)][0]
+    op_tensor = self._get_op_tensor(pattern_or_name)
+    return op_tensor[0] if op_tensor else None
 
   def get_tensor(self, pattern_or_name):
-    return self._pattern_to_op_tensor[self._to_pattern(pattern_or_name)][1]
+    op_tensor = self._get_op_tensor(pattern_or_name)
+    return op_tensor[1] if op_tensor else None
+
+  def merge_from(self, other_match_result):
+    # pylint: disable=protected-access
+    self._pattern_to_op_tensor.update(other_match_result._pattern_to_op_tensor)
+    self._name_to_pattern.update(other_match_result._name_to_pattern)
+    # pylint: enable=protected-access
 
 
 class GraphMatcher(object):
@@ -121,7 +181,7 @@ class GraphMatcher(object):
     """Initializes a GraphMatcher.
 
     Args:
-      pattern: The `OpTypePattern` against which `GraphMatcher` matches
+      pattern: The `Pattern` against which `GraphMatcher` matches
         subgraphs.
     """
     self._pattern = pattern
@@ -133,7 +193,7 @@ class GraphMatcher(object):
     with key `pattern`.
 
     Args:
-      pattern: An `OpTypePattern`.
+      pattern: An `Pattern`.
       op: A `tf.Operation` to match against the pattern.
       tensor: the output `tf.Tensor` of `op` that is used by the matching op of
         `pattern`'s parent. Can be None if `pattern` is already the root of the
@@ -142,20 +202,11 @@ class GraphMatcher(object):
     Returns:
       True if an TF expression rooted at `op` matches `pattern`.
     """
-    if pattern.op_type != '*':
-      if op.type not in pattern.op_type.split('|'):
-        return False
-
-    self._match_result.add(pattern, op, tensor)
-
-    if not pattern.inputs:
-      # If pattern.inputs is empty, skips the rest and accepts all the inputs.
-      return True
-
-    return len(op.inputs) == len(pattern.inputs) and all([
-        self._match_pattern(input_pattern, input_tensor.op, input_tensor)
-        for input_tensor, input_pattern in zip(op.inputs, pattern.inputs)
-    ])
+    match_result = pattern.match(op, tensor)
+    if match_result is None:
+      return False
+    self._match_result.merge_from(match_result)
+    return True
 
   def match_op(self, op):
     """Matches `op` against `self._pattern`.
diff --git a/tensorflow/contrib/quantize/python/graph_matcher_test.py b/tensorflow/contrib/quantize/python/graph_matcher_test.py
index e1572865e423e569ee3b280036c0e02b71b70648..6d587572181c125faa02d36fb54933cff24f11c6 100644
--- a/tensorflow/contrib/quantize/python/graph_matcher_test.py
+++ b/tensorflow/contrib/quantize/python/graph_matcher_test.py
@@ -105,7 +105,7 @@ class GraphMatcherTest(test_util.TensorFlowTestCase):
     self.assertEqual(match_result.get_op(y1_pattern), y1.op)
     self.assertEqual(match_result.get_tensor(y1_pattern), y1)
 
-  def test_oneof_pattern(self):
+  def test_oneof_type_pattern(self):
     #   -   +
     #  / \ / \
     # x   y   z
@@ -125,6 +125,44 @@ class GraphMatcherTest(test_util.TensorFlowTestCase):
         for match_result in matcher.match_graph(g)
     ], [plus.op, minus.op])
 
+  def test_oneof_pattern(self):
+    reshape_pattern = graph_matcher.OpTypePattern('Reshape')
+    transpose_pattern = graph_matcher.OneofPattern([
+        graph_matcher.OpTypePattern(
+            'Transpose',
+            name='transpose',
+            inputs=[
+                graph_matcher.OpTypePattern(
+                    'Slice', name='slice', inputs=[reshape_pattern, '*', '*']),
+                '*'
+            ]),
+        graph_matcher.OpTypePattern(
+            'Transpose', name='transpose', inputs=[reshape_pattern, '*'])
+    ])
+
+    matcher = graph_matcher.GraphMatcher(transpose_pattern)
+
+    g = ops.Graph()
+    with g.as_default():
+      inputs = array_ops.placeholder(dtypes.float32, shape=[6])
+      reshape = array_ops.reshape(inputs, [2, 3])
+      transpose = array_ops.transpose(reshape)
+      [match_result] = list(matcher.match_graph(g))
+      self.assertEqual(match_result.get_tensor(reshape_pattern), reshape)
+      self.assertEqual(match_result.get_tensor('slice'), None)
+      self.assertEqual(match_result.get_op('transpose'), transpose.op)
+
+    g = ops.Graph()
+    with g.as_default():
+      inputs = array_ops.placeholder(dtypes.float32, shape=[6])
+      reshape = array_ops.reshape(inputs, [2, 3])
+      slicing = array_ops.slice(reshape, [0, 0], [-1, -1])
+      transpose = array_ops.transpose(slicing)
+      [match_result] = list(matcher.match_graph(g))
+      self.assertEqual(match_result.get_tensor(reshape_pattern), reshape)
+      self.assertEqual(match_result.get_tensor('slice'), slicing)
+      self.assertEqual(match_result.get_op('transpose'), transpose.op)
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/contrib/quantize/python/quant_ops.py b/tensorflow/contrib/quantize/python/quant_ops.py
index f80d427ff0a6573ecd6562c443182797b5d22527..0a8e35080cb08f71dc28e33c6138a12656e5a5ea 100644
--- a/tensorflow/contrib/quantize/python/quant_ops.py
+++ b/tensorflow/contrib/quantize/python/quant_ops.py
@@ -53,7 +53,7 @@ def LastValueQuantize(inputs,
                       init_max=6.0,
                       updates_collection=ops.GraphKeys.UPDATE_OPS,
                       vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
-                      scope=None,
+                      name_prefix='LastValueQuant',
                       reuse=None,
                       is_training=True,
                       num_bits=8,
@@ -73,7 +73,7 @@ def LastValueQuantize(inputs,
       computation.
     vars_collection: (Optional) collection where to store variables for
       quantization interval ends.
-    scope: Optional scope for variable_scope.
+    name_prefix: name_prefix for created nodes.
     reuse: whether or not the layer and its variables should be reused. To be
       able to reuse the layer scope must be given.
     is_training: Whether the op is applied to a training or eval graph.
@@ -84,13 +84,13 @@ def LastValueQuantize(inputs,
     a tensor containing quantized values.
   """
   with variable_scope.variable_scope(
-      scope, 'LastValueQuantize', values=[inputs], reuse=reuse):
+      None, default_name=name_prefix, values=[inputs], reuse=reuse):
     input_shape = inputs.get_shape()
     input_dim = len(input_shape)
     if per_channel:
       # Only support quantizing 1-, 2- and 4-dimensional tensors.
       assert input_dim in [1, 2, 4], ('Expected 1D, 2D or 4D input, was: %s in '
-                                      ' scope: %s' % (input_shape, scope))
+                                      ' scope: %s' % (input_shape, name_prefix))
       min_max_shape = [input_shape[-1]]
     else:
       min_max_shape = []
@@ -165,7 +165,7 @@ def MovingAvgQuantize(inputs,
                       ema_decay=0.999,
                       updates_collection=ops.GraphKeys.UPDATE_OPS,
                       vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
-                      scope=None,
+                      name_prefix='MovingAvgQuantize',
                       reuse=None,
                       is_training=True,
                       num_bits=8,
@@ -186,7 +186,7 @@ def MovingAvgQuantize(inputs,
       computation.
     vars_collection: (Optional) collection where to store variables for
       quantization interval ends.
-    scope: Optional scope for variable_scope.
+    name_prefix: name_prefix for created nodes.
     reuse: whether or not the layer and its variables should be reused. To be
       able to reuse the layer scope must be given.
     is_training: Whether the op is applied to a training or eval graph.
@@ -197,13 +197,13 @@ def MovingAvgQuantize(inputs,
     a tensor containing quantized values.
   """
   with variable_scope.variable_scope(
-      scope, 'MovingAvgQuantize', values=[inputs], reuse=reuse):
+      None, default_name=name_prefix, values=[inputs], reuse=reuse):
     input_shape = inputs.get_shape()
     input_dim = len(input_shape)
     if per_channel:
       # Only support quantizing 1-, 2- and 4-dimensional tensors.
       assert input_dim in [1, 2, 4], ('Expected 1D, 2D or 4D input, was: %s in '
-                                      ' scope: %s' % (input_shape, scope))
+                                      ' scope: %s' % (input_shape, name_prefix))
       min_max_shape = [input_shape[-1]]
     else:
       min_max_shape = []
diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index 50a2b4c91c9e7a2681f6041646a023a4225fb0c5..7a3f92f503a5d6f2b0fab2a499f8e8758809d0ed 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Logic to update a Tensorflow model graph with quantization operations."""
+"""Logic to update a TensorFlow model graph with quantization operations."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,37 +21,37 @@ from __future__ import print_function
 import re
 from tensorflow.contrib import graph_editor
 from tensorflow.contrib.quantize.python import common
+from tensorflow.contrib.quantize.python import graph_matcher
 from tensorflow.contrib.quantize.python import input_to_ops
 from tensorflow.contrib.quantize.python import quant_ops
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.training import training_util
 
-# Operation types used to select operations of interest.
+# Quantizable operation types that are supported by the quantization rewrite.
 _QUANTIZABLE_TYPES = {'Conv2D', 'MatMul', 'DepthwiseConv2dNative'}
 
-# Custom key for storing and retrieving update ops used by quantizing nodes.
-_UPDATE_QUANT_OPS = 'update_quant_ops'
+# Activations that are supported by the quantization rewrite.
+_ACTIVATION_TYPES = {'Relu', 'Relu6', 'Identity'}
+
+# Weight types that are supported by the quantization rewrite.
+# TODO(suharshs): Add support for ResourceVariable.
+_WEIGHT_TYPES = {'Variable', 'VariableV2'}
 
 
 def Quantize(graph,
+             is_training,
              weight_bits=8,
-             weight_narrow_range=False,
              activation_bits=8,
              ema_decay=0.999,
              quant_delay=None,
-             vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
-             is_training=True,
-             quantize_folded_weights_use_ema=False):
+             vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES):
   """Updates graph with quantization operations.
 
   Args:
     graph: Graph to modify.
+    is_training: Whether quantizing training graph or eval graph.
     weight_bits: Number of bits to use for quantizing weights.
-    weight_narrow_range: Whether to use a more efficient narrow range for
-      weights quantization.  With weight_narrow_range true, the range is
-      [1; 2^weight_bits - 1], with it false [0; 2^weight_bits - 1].
     activation_bits: Number of bits to use for quantizing activations.
     ema_decay: (Optional) Float, EMA decay parameter.  EMA is used to update
       quantization intervals for quantizing activations (see here about EMA:
@@ -61,346 +61,280 @@ def Quantize(graph,
       training.
     vars_collection: (Optional) Collection where to store the variables for
       quantization interval ends.
-    is_training: (Optional) Whether quantizing training graph or eval graph.
-    quantize_folded_weights_use_ema: (Optional, default False) Whether to
-      quantize weights after batchnorm-folding with exponential average
-      quantization.
   Raises:
     ValueError: When quantization fails.
   """
-  context = _QuantizeContext(graph, weight_bits, weight_narrow_range,
-                             activation_bits, ema_decay, quant_delay,
-                             vars_collection, is_training,
-                             quantize_folded_weights_use_ema)
-
-  graph_ops = graph.get_operations()
-
-  # Filter out backprop and summary related operations, leave only interesting
-  # op types.
-  def _IsInterestingOpWithWeights(op):
-    return (op.type in _QUANTIZABLE_TYPES and
-            not op.name.startswith(common.SKIPPED_PREFIXES))
-
-  for op in (op for op in graph_ops if _IsInterestingOpWithWeights(op)):
-    if op.name.endswith('/depthwise'):
-      # Separable convolution may consist of 2 convolution nodes. If so, skip
-      # .../depthwise and only quantize the top one.
-      separable_conv = context.GetOperationByNameDontThrow(
-          op.name[:-len('/depthwise')])
-      if separable_conv and separable_conv.type == 'Conv2D':
-        continue
-    # Quantize add ops that come after Conv2D or DepthwiseConv2dNative.
-    if op.type in ['Conv2D', 'DepthwiseConv2dNative']:
-      add_context_re = re.search(r'^(.*)/[^/]+/', op.name)
-      if add_context_re is not None:
-        context.add_contexts.add(add_context_re.group(1))
-    if not op.name.endswith('_Fold'):
-      folded_op = context.GetOperationByNameDontThrow(op.name + '_Fold')
-      # Do nothing if found, it will be quantized when it is iterated over.
-      if not folded_op:
-        context.QuantizeOpWithWeights(op, folded=False)
-    else:
-      context.QuantizeOpWithWeights(op, folded=True)
-
-  context.QuantizeAddContexts()
-
-  # Once all quantization ops have been inserted in the graph, collect update
-  # ops for their variables and modify the TF Slim update barrier (see
-  # https://www.tensorflow.org/code/tensorflow/contrib/slim/python/slim/learning.py)
-  # to depend on them.
-  try:
-    update_barrier = graph.get_operation_by_name('update_barrier')
-  except KeyError:
-    # In evaluation graph, this barrier may not exist.
-    return None
-  update_quant_ops = graph.get_collection_ref(_UPDATE_QUANT_OPS)
-  graph_editor.add_control_inputs(update_barrier, update_quant_ops)
-
-
-class _QuantizeContext(object):
-  """Context holds references needed for quantization."""
-
-  def __init__(self,
-               graph,
-               weight_bits,
-               weight_narrow_range,
-               activation_bits,
-               ema_decay=0.999,
-               quant_delay=None,
-               vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
-               is_training=True,
-               quantize_folded_weights_use_ema=False):
-    """Initializes context to hold references needed for quantization.
-
-    Args:
-      graph: Graph to modify.
-      weight_bits: Number of bits to use for quantizing weights.
-      weight_narrow_range: Whether to use a more efficient narrow range for
-        weights quantization.  With weight_narrow_range true, the range is
-        [1; 2^weight_bits - 1], with it false [0; 2^weight_bits - 1].
-      activation_bits: Number of bits to use for quantizing activations.
-      ema_decay: (Optional) Float, EMA decay parameter.
-      quant_delay: (Optional, default None) Int, count of global steps for which
-        to delay quantization.  This helps weights stabilize at the start of
-        training.
-      vars_collection: (Optional) Collection where to store the variables for
-        quantization interval ends.
-      is_training: (Optional) Whether quantizing training or eval graph.
-      quantize_folded_weights_use_ema: (Optional, default False) Whether to
-        quantize weights after batchnorm-folding with exponential average
-        quantization.
-    """
-    self.graph = graph
-    self.weight_bits = weight_bits
-    self.weight_narrow_range = weight_narrow_range
-    self.activation_bits = activation_bits
-    self.ema_decay = ema_decay
-    self.quant_delay = quant_delay
-    self.vars_collection = vars_collection
-    self.is_training = is_training
-    self.quantize_folded_weights_use_ema = quantize_folded_weights_use_ema
-    self.input_to_ops_map = input_to_ops.InputToOps(graph)
-    self.add_contexts = set()
-
-  def QuantizeAddContexts(self):
-    """Quantizes all add ops in self.add_contexts."""
-    # Loop through sorted self.add_contexts so that op creation is
-    # deterministic. This is needed when using multiple worker replicas so that
-    # the ops can be initialized consistently.
-    for add_context in sorted(self.add_contexts):
-      add_op = self.GetOperationByNamesDontThrow([
-          add_context + '/Add', add_context + '/add'])
-      if add_op is not None:
-        self._InsertQuantOp(
-            add_context,
-            add_op,
-            self.input_to_ops_map.ConsumerOperations(add_op),
-            name='add_quant',
-            moving_avg=True,
-            bits=self.activation_bits,
-            narrow_range=False)
-
-  def QuantizeOpWithWeights(self, op, folded):
-    """Quantizes around the specific operation with or without batch norm.
-
-    Args:
-      op: Operation to quantize.
-      folded: Operation has been folded and needs special handling if True.
-    Raises:
-      ValueError: When quantization fails.
-    """
-    # Op name component before the last slash will be used as context.
-    context = re.search(r'^(.*)/([^/]+)', op.name).group(1)
-
-    # Quantize weights.
-    if folded:
-      producer_op = self.graph.get_operation_by_name(context + '/mul_fold')
-    else:
-      try:
-        input_idx = next(i for i, v in enumerate(op.inputs)
-                         if '/weights/' in v.name or
-                         '/depthwise_weights' in v.name)
-      except StopIteration:
-        raise ValueError('No inputs to quantize for op: %s' % op)
-      producer_op = op.inputs[input_idx].op
-
-    # If batch norm is used, the folded weights depend on the batch std, hence
-    # it is sensible to use EMA during training to smooth out the noise. This is
-    # controlled by the flag quantize_folded_weights_use_ema. Its default is
-    # False for backward compatibility.
-    # If there is no batch norm, weights do not depend on the batch and using
-    # the latest value of min and max is more efficient.
-    weight_use_ema = folded and self.quantize_folded_weights_use_ema
-    self._InsertQuantOp(
-        context,
-        producer_op, [op],
-        name='weights_quant',
-        moving_avg=weight_use_ema,
-        delay_requested=weight_use_ema,
-        bits=self.weight_bits,
-        narrow_range=self.weight_narrow_range)
-
-    # Important: do not quantize biases here.  During inference they are
-    # quantized to 32 bits, which is much finer than 8 bit quantization and
-    # depends on weight and input activation ranges.
-
-    # Find activation and (optionally) Add operations to quantize.
-    activation_op, add_op, add_context = self._GetReluAndAddOperations(context,
-                                                                       op)
-    if add_op:
-      original_context = context
-      context = add_context
-
-    # Quantize activation outputs.
-    consumer_ops = self.input_to_ops_map.ConsumerOperations(activation_op)
-    self._InsertQuantOp(
+  input_to_ops_map = input_to_ops.InputToOps(graph)
+  for layer_match in _FindLayersToQuantize(graph):
+    # Quantize the weights.
+    context = _GetContextFromOp(layer_match.layer_op)
+    _InsertQuantOp(
         context,
-        activation_op,
+        'weights_quant',
+        layer_match.weight_tensor.op, [layer_match.layer_op],
+        is_training,
+        moving_avg=False,
+        ema_decay=ema_decay,
+        quant_delay=quant_delay,
+        narrow_range=True,
+        vars_collection=vars_collection,
+        bits=weight_bits)
+
+    # Quantize the activations.
+    consumer_ops = input_to_ops_map.ConsumerOperations(
+        layer_match.activation_op)
+    add_context = context
+    if layer_match.bypass_op:
+      add_context = re.search(r'^(.*)/([^/]+)', context).group(1)
+    _InsertQuantOp(
+        add_context,
+        'act_quant',
+        layer_match.activation_op,
         consumer_ops,
-        name='act_quant',
+        is_training,
         moving_avg=True,
-        init_min=0.0,
-        bits=self.activation_bits,
-        narrow_range=False)
-
-    # When a bypass connection was found, also quantize Add op input.
-    if add_op:
-      def _QuantizeAddInput(add_input):
-        if folded:
-          return add_input.op.name.endswith('/add_fold')
-        else:
-          return add_input.op.name.startswith(original_context + '/')
-
-      for add_input in add_op.inputs:
-        if _QuantizeAddInput(add_input):
-          self._InsertQuantOp(
-              original_context,
-              add_input.op, [add_op],
-              name='conv_quant',
-              moving_avg=True,
-              bits=self.activation_bits,
-              narrow_range=False)
-
-  def _GetReluAndAddOperations(self, context, op):
-    """Looks up a Relu* and Add operations in given context.
-
-    Args:
-      context: Context where to look for operations.
-      op: Operation to quantize.
-
-    Returns:
-      A triplet (Operation, Operation, string), the first element is an end
-      point operation, the second is Add operation (optional), the third element
-      is string context where the Add operation was found (optional).
-
-    Raises:
-      ValueError: When operations cannot be found.
-    """
-    activation_op = common.GetEndpointActivationOp(self.graph, context)
-    if activation_op:
-      return activation_op, None, None
-
-    if '/' in context:
-      # If no activation op is there, look for them one level up.
-      add_context = re.search(r'^(.*)/([^/]+)', context).group(1)
-      activation_op = common.GetEndpointActivationOp(self.graph, add_context)
-    if not activation_op:
-      # Still no Relu, can happen on the top layer, just find the next node up,
-      # make sure it is BiasAdd.
-      consumers = [c for outp in op.outputs for c in outp.consumers()]
-      if len(consumers) != 1 or consumers[0].type != 'BiasAdd':
-        raise ValueError('Failed to quantize op: %s, %s' % (op.name, op.type))
-      return consumers[0], None, None
-    if add_context:
-      add_op = self.GetOperationByNamesDontThrow([
-          add_context + '/Add', add_context + '/add'])
-      return activation_op, add_op, add_context
-    else:
-      raise ValueError('Failed to quantize op: %s, %s' % (op.name, op.type))
-
-  def GetOperationByNameDontThrow(self, name):
-    """Returns an Operation with the given name.
-
-    Args:
-      name: Name of Operation to return.
-
-    Returns:
-      The Operation with the given name. None if the name does not correspond to
-      any operation in the graph.
-    """
-    try:
-      return self.graph.get_operation_by_name(name)
-    except KeyError:
-      return None
-
-  def GetOperationByNamesDontThrow(self, names):
-    """Returns an Operation with one of the given names.
-
-    Args:
-      names: Names of Operation to return.
-
-    Returns:
-      The Operation with one of the given names. None if none of the names
-      corresponds to any operation in the graph.
-    """
-    for name in names:
-      op = self.GetOperationByNameDontThrow(name)
-      if op is not None:
-        return op
-    return None
-
-  def _InsertQuantOp(
-      self,
-      context,
-      producer,
-      consumers,
-      name,
-      moving_avg=True,
-      init_min=-6.0,
-      init_max=6.0,
-      delay_requested=True,
-      bits=8,
-      narrow_range=False,):
-    """Inserts a quant op between a producer op and (multiple) consumer ops.
-
-    Args:
-      context: Context where producer and consumer operations are nested.
-      producer: Producer operation of the pairs where quantization will be
-        inserted.
-      consumers: Consumer operations of the pairs.
-      name: Name for the new quantization op within the context.
-      moving_avg: Specifies whether to use exponential moving average or just
-        the last value seen.
-      init_min: Starting minimum value for the new quantization op.
-      init_max: Starting maximum value for the new quantization op.
-      delay_requested: If true, implement quantization delay where needed.
-        False value explicitly disables delay quantization everywhere.
-      bits: Number of bits to use for quantization, must be between 2 and 8.
-      narrow_range: Whether to use the narrow quantization range
-        [1; 2^bits - 1] or wide range [0; 2^bits - 1].
-    Raises:
-      ValueError: When producer operation is not directly connected to the
-        consumer operation.
-    """
-    scope = context + '/' + name
-    inputs = producer.outputs[0]
-    if moving_avg:
-      quant = (quant_ops.MovingAvgQuantize(
-          inputs,
-          init_min=init_min,
-          init_max=init_max,
-          ema_decay=self.ema_decay,
-          is_training=self.is_training,
-          num_bits=bits,
-          narrow_range=narrow_range,
-          updates_collection=_UPDATE_QUANT_OPS,
-          vars_collection=self.vars_collection,
-          scope=scope))
-    else:
-      quant = (quant_ops.LastValueQuantize(
-          inputs,
-          init_min=init_min,
-          init_max=init_max,
-          is_training=self.is_training,
-          num_bits=bits,
-          narrow_range=narrow_range,
-          updates_collection=_UPDATE_QUANT_OPS,
-          vars_collection=self.vars_collection,
-          scope=scope))
-
-    if delay_requested and self.quant_delay and self.quant_delay > 0:
-      activate_quant = math_ops.greater_equal(
-          training_util.get_or_create_global_step(),
-          self.quant_delay,
-          name=scope + '/activate_quant')
-      quant = control_flow_ops.cond(
-          activate_quant,
-          lambda: quant,
-          lambda: inputs,
-          name=scope + '/delayed_quant')
-
-    nodes_modified_count = graph_editor.reroute_ts(
-        [quant], [inputs], can_modify=consumers)
-    if nodes_modified_count != len(consumers):
-      raise ValueError('Some inputs not quantized for ops: [%s]' %
-                       ', '.join([consumer.name for consumer in consumers]))
+        ema_decay=ema_decay,
+        quant_delay=quant_delay,
+        vars_collection=vars_collection,
+        bits=activation_bits,
+        init_min=0.0)
+
+    # Quantize the inputs and output to the bypass (if it exists). The input to
+    # the bypass is the bias add, and the output is the activation.
+    if layer_match.bypass_op is not None:
+      _InsertQuantOp(
+          context,
+          'conv_quant',
+          layer_match.bias_add_op, [layer_match.bypass_op],
+          is_training,
+          moving_avg=True,
+          ema_decay=ema_decay,
+          quant_delay=quant_delay,
+          vars_collection=vars_collection,
+          bits=activation_bits)
+      _InsertQuantOp(
+          add_context,
+          'add_quant',
+          layer_match.bypass_op,
+          input_to_ops_map.ConsumerOperations(layer_match.bypass_op),
+          is_training,
+          moving_avg=True,
+          ema_decay=ema_decay,
+          quant_delay=quant_delay,
+          vars_collection=vars_collection,
+          bits=activation_bits)
+
+
+def _FindLayersToQuantize(graph):
+  """Matches layers in graph to quantize.
+
+  Args:
+    graph: Graph to perform match on.
+
+  Yields:
+    _LayerMatches.
+  """
+  input_pattern = graph_matcher.OpTypePattern('*')
+  weight_var_pattern = graph_matcher.OpTypePattern('|'.join(_WEIGHT_TYPES))
+  weight_pattern = graph_matcher.OpTypePattern(
+      'Identity', inputs=[weight_var_pattern])
+
+  folded_weight_pattern = graph_matcher.OpTypePattern('Mul')
+
+  # The weights inputs to the layer operation can either be from the Variable or
+  # the folded weight (Mul).
+  layer_pattern = graph_matcher.OpTypePattern(
+      '|'.join(_QUANTIZABLE_TYPES),
+      inputs=[
+          input_pattern,
+          graph_matcher.OneofPattern([weight_pattern, folded_weight_pattern])
+      ])
+
+  folded_bias_mul_pattern = graph_matcher.OpTypePattern(
+      'Mul', inputs=[graph_matcher.OpTypePattern('*'), layer_pattern])
+  post_layer_op_correction_pattern = graph_matcher.OpTypePattern(
+      'Add', inputs=[folded_bias_mul_pattern,
+                     graph_matcher.OpTypePattern('*')])
+  folded_bias_add_pattern = graph_matcher.OpTypePattern(
+      'Add',
+      inputs=[
+          post_layer_op_correction_pattern,
+          graph_matcher.OpTypePattern('*')
+      ])
+
+  bias_add_pattern = graph_matcher.OpTypePattern(
+      'Add|BiasAdd', inputs=[layer_pattern, '*'])
+
+  # The bias can come from the bias add or the folded bias add.
+  bypass_pattern_a = graph_matcher.OpTypePattern(
+      'Add',
+      inputs=[
+          graph_matcher.OneofPattern(
+              [bias_add_pattern, folded_bias_add_pattern]), '*'
+      ])
+  bypass_pattern_b = graph_matcher.OpTypePattern(
+      'Add',
+      inputs=[
+          '*',
+          graph_matcher.OneofPattern(
+              [bias_add_pattern, folded_bias_add_pattern])
+      ])
+
+  # The input to the activation can come from bias add, fold bias add or the
+  # bypasses.
+  activation_pattern = graph_matcher.OpTypePattern(
+      '|'.join(_ACTIVATION_TYPES),
+      inputs=[
+          graph_matcher.OneofPattern([
+              bias_add_pattern, folded_bias_add_pattern, bypass_pattern_a,
+              bypass_pattern_b
+          ])
+      ])
+
+  layer_matcher = graph_matcher.GraphMatcher(activation_pattern)
+  for match_result in layer_matcher.match_graph(graph):
+    layer_op = match_result.get_op(layer_pattern)
+    weight_tensor = match_result.get_tensor(weight_pattern)
+    if weight_tensor is None:
+      weight_tensor = match_result.get_tensor(folded_weight_pattern)
+    activation_op = match_result.get_op(activation_pattern)
+    bias_add_op = match_result.get_op(bias_add_pattern)
+    if bias_add_op is None:
+      bias_add_op = match_result.get_op(folded_bias_add_pattern)
+    bypass_op = match_result.get_op(bypass_pattern_a)
+    if bypass_op is None:
+      bypass_op = match_result.get_op(bypass_pattern_b)
+    yield _LayerMatch(layer_op, weight_tensor, activation_op, bypass_op,
+                      bias_add_op)
+
+
+class _LayerMatch(object):
+  """Contains all information related to a matched Layer."""
+
+  def __init__(self, layer_op, weight_tensor, activation_op, bypass_op,
+               bias_add_op):
+    self._layer_op = layer_op
+    self._weight_tensor = weight_tensor
+    self._activation_op = activation_op
+    self._bypass_op = bypass_op
+    self._bias_add_op = bias_add_op
+
+  @property
+  def layer_op(self):
+    return self._layer_op
+
+  @property
+  def weight_tensor(self):
+    return self._weight_tensor
+
+  @property
+  def activation_op(self):
+    return self._activation_op
+
+  @property
+  def bypass_op(self):
+    return self._bypass_op
+
+  @property
+  def bias_add_op(self):
+    return self._bias_add_op
+
+
+def _InsertQuantOp(context,
+                   name,
+                   producer,
+                   consumers,
+                   is_training,
+                   moving_avg=True,
+                   init_min=-6.0,
+                   init_max=6.0,
+                   bits=8,
+                   ema_decay=0.999,
+                   quant_delay=None,
+                   vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
+                   narrow_range=False):
+  """Inserts a quant op between a producer op and (multiple) consumer ops.
+
+  Args:
+    context: Context w,here producer and consumer operations are nested.
+    name: Name for the new quantization op within the context.
+    producer: Producer operation of the pairs where quantization will be
+      inserted.
+    consumers: Consumer operations of the pairs.
+    is_training: Whether quantizing training graph or eval graph.
+    moving_avg: Specifies whether to use exponential moving average or just
+      the last value seen.
+    init_min: Starting minimum value for the new quantization op.
+    init_max: Starting maximum value for the new quantization op.
+    bits: Number of bits to use for quantization, must be between 2 and 8.
+    ema_decay: (Optional) Float, EMA decay parameter.  EMA is used to update
+      quantization intervals for quantizing activations (see here about EMA:
+      https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average).
+    quant_delay: (Optional, default None) Int, count of global steps for which
+      to delay quantization.  This helps weights stabilize at the start of
+      training.
+    vars_collection: (Optional) Collection where to store the variables for
+      quantization interval ends.
+    narrow_range: Whether to use the narrow quantization range
+      [1; 2^bits - 1] or wide range [0; 2^bits - 1].
+  Raises:
+    ValueError: When producer operation is not directly connected to the
+      consumer operation.
+  """
+  name_prefix = _AddContextToName(context, name)
+  inputs = producer.outputs[0]
+  if moving_avg:
+    quant = (
+        quant_ops.MovingAvgQuantize(
+            inputs,
+            init_min=init_min,
+            init_max=init_max,
+            ema_decay=ema_decay,
+            is_training=is_training,
+            num_bits=bits,
+            narrow_range=narrow_range,
+            vars_collection=vars_collection,
+            name_prefix=name_prefix))
+  else:
+    quant = (
+        quant_ops.LastValueQuantize(
+            inputs,
+            init_min=init_min,
+            init_max=init_max,
+            is_training=is_training,
+            num_bits=bits,
+            narrow_range=narrow_range,
+            vars_collection=vars_collection,
+            name_prefix=name_prefix))
+
+  if quant_delay and quant_delay > 0:
+    activate_quant = math_ops.greater_equal(
+        common.CreateOrGetQuantizationStep(),
+        quant_delay,
+        name=name_prefix + '/activate_quant')
+    quant = control_flow_ops.cond(
+        activate_quant,
+        lambda: quant,
+        lambda: inputs,
+        name=name_prefix + '/delayed_quant')
+
+  nodes_modified_count = graph_editor.reroute_ts(
+      [quant], [inputs], can_modify=consumers)
+  if nodes_modified_count != len(consumers):
+    raise ValueError('Some inputs not quantized for ops: [%s]' % ', '.join(
+        [consumer.name for consumer in consumers]))
+
+
+def _GetContextFromOp(op):
+  """Gets the root context name from the op name."""
+  context_re = re.search(r'^(.*)/([^/]+)', op.name)
+  if context_re:
+    return context_re.group(1)
+  return ''
+
+
+def _AddContextToName(context, name):
+  """Adds the context to the name if it exists."""
+  if not context:
+    return name
+  return context + '/' + name
diff --git a/tensorflow/contrib/quantize/python/quantize_graph.py b/tensorflow/contrib/quantize/python/quantize_graph.py
index d647bb94e849c713c2aca93c53f372bae5857c43..0dfe78fd0238d233ed80198259581ebd90ebad20 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph.py
@@ -18,113 +18,182 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.quantize.python import copy_graph
 from tensorflow.contrib.quantize.python import fold_batch_norms
 from tensorflow.contrib.quantize.python import quantize
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import variables
 
 
-def _create_graph(input_graph,
-                  is_training,
-                  elements=None,
-                  device_name_or_function=None):
-  """Returns a transformed training input_graph for simulated quantization.
+def _create_graph(input_graph=None,
+                  is_training=True,
+                  weight_bits=8,
+                  activation_bits=8,
+                  quant_delay=None,
+                  freeze_bn_delay=None):
+  """Rewrites an input_graph in place for simulated quantization.
 
-  The forward pass has fake quantization ops inserted to simulate the error
-  introduced by quantization.
+  The graph has fake quantization ops inserted to simulate the error
+  introduced by quantization. Since the graph is transformed in place,
+  the expected behavior of previously held references to nodes and tensors may
+  change.
 
   Args:
-    input_graph: The tf.Graph to be transformed.
+    input_graph: The tf.Graph to be transformed, if None then defaults to the
+      default graph.
     is_training: Whether quantizing training or eval graph.
-    elements: (Optional) List of Tensors and Operations in input_graph whose
-        corresponding elements in the new graph will be returned.
-    device_name_or_function: (Optional) The device name or function to use.
-
-  Returns:
-    g is new tf.Graph that is rewritten for simulated quantization.
-    l is a list of Tensors/Operations in g corresponding to the provided input
-        elements, if elements is not None.
+    weight_bits: Number of bits to use for quantizing weights.
+    activation_bits: Number of bits to use for quantizing activations.
+    quant_delay: Number of steps after which weights and activations are
+      quantized during training.
+    freeze_bn_delay: Number of steps after which moving mean and variance are
+      frozen and used instead of batch statistics during training.
+      freeze_bn_delay should be greater than quant_delay and should correspond
+      to the number of steps when training has almost converged
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
-        tf.Operation.
+      tf.Operation.
   """
-  # TODO(suharshs): Describe the process in more detail in the doc string.
-  g = copy_graph.CopyGraph(input_graph)
-  with g.as_default():
-    with ops.device(device_name_or_function):
-      fold_batch_norms.FoldBatchNorms(g)
-      quantize.Quantize(g, is_training=is_training)
-  if elements is None:
-    return g
-
-  return_elements = []
-  for element in elements:
-    if isinstance(element, (ops.Tensor, variables.Variable)):
-      return_elements.append(g.get_tensor_by_name(element.name))
-    elif isinstance(element, ops.Operation):
-      return_elements.append(g.get_operation_by_name(element.name))
-    else:
-      raise ValueError(
-          'elements must consist of Tensor or Operation objects, got: ',
-          str(element))
-  return g, return_elements
-
-
-def create_training_graph(input_graph,
-                          elements=None,
-                          device_name_or_function=None):
-  """Returns a transformed training input_graph for simulated quantization.
-
-  The forward pass has fake quantization ops inserted to simulate the error
-  introduced by quantization.
+
+  if input_graph is None:
+    input_graph = ops.get_default_graph()
+  with input_graph.as_default():
+    fold_batch_norms.FoldBatchNorms(
+        input_graph,
+        freeze_batch_norm_delay=freeze_bn_delay,
+        is_training=is_training)
+    quantize.Quantize(
+        input_graph,
+        is_training,
+        quant_delay=quant_delay,
+        weight_bits=weight_bits,
+        activation_bits=activation_bits)
+
+
+def create_training_graph(input_graph=None, quant_delay=250000):
+  """Rewrites a training input_graph in place for simulated quantization.
+
+  The graph has fake quantization ops inserted to simulate the error
+  introduced by quantization. Since the graph is transformed in place,
+  the expected behavior of previously held references to nodes and tensors may
+  change.
 
   Args:
     input_graph: The tf.Graph to be transformed.
-    elements: (Optional) List of Tensors and Operations in input_graph whose
-        corresponding elements in the new graph will be returned.
-    device_name_or_function: (Optional) The device name or function to use.
+    quant_delay: Number of steps after which weights and activations are
+      quantized during training.
 
-  Returns:
-    g is new tf.Graph that is rewritten for simulated quantization.
-    l is a list of Tensors/Operations in g corresponding to the provided input
-        elements, if elements is not None.
+  Raises:
+    ValueError: If elements contains an element that isn't a tf.Tensor or
+      tf.Operation.
+  """
+  # TODO(raghuramank) Need to have freeze_bn_delay be a function of batch size
+  # Currently the values below are hardcoded for mobilenetV1 on imagenet
+  # Please use the experimental API if you need to tune these values.
+  if quant_delay == 0:
+    # Corresponds to case of restoring from a floating point checkpoint
+    # In this case, we can freeze the moving mean and variance early on and
+    # switch to using them during training. Therefore, freeze_bn_delay is set to
+    # 200000
+    freeze_bn_delay = 200000
+  else:
+    # If training from scratch, set freeze_bn_delay to 100 epochs after quant
+    # delay. With a batch size of 64, this corresponds to 20000*100=2M steps.
+    freeze_bn_delay = quant_delay + 2000000
+
+  _create_graph(
+      input_graph=input_graph,
+      is_training=True,
+      quant_delay=quant_delay,
+      freeze_bn_delay=freeze_bn_delay)
+
+
+def create_eval_graph(input_graph=None):
+  """Rewrites an eval input_graph in place for simulated quantization.
+
+  The graph has fake quantization ops inserted to simulate the error
+  introduced by quantization. Since the graph is transformed in place,
+  the expected behavior of previously held references to nodes and tensors may
+  change.
+
+  Args:
+    input_graph: The tf.Graph to be transformed, if None then defaults to the
+      default graph.
+
+  Raises:
+    ValueError: If elements contains an element that isn't a tf.Tensor or
+      tf.Operation.
+  """
+  _create_graph(input_graph=input_graph, is_training=False)
+
+
+def experimental_create_training_graph(input_graph=None,
+                                       weight_bits=8,
+                                       activation_bits=8,
+                                       quant_delay=250000,
+                                       freeze_bn_delay=500000):
+  """Rewrites a training input_graph in place for simulated quantization.
+
+  This function has additional experimental options not (yet) available to
+  create_training_graph. The resulting behavior may be undefined.
+
+  The graph has fake quantization ops inserted to simulate the error
+  introduced by quantization. Since the graph is transformed in place,
+  the expected behavior of previously held references to nodes and tensors may
+  change.
+
+  Args:
+    input_graph: The tf.Graph to be transformed,if None then defaults to the
+      default graph.
+    weight_bits: Number of bits to use for quantizing weights.
+    activation_bits: Number of bits to use for quantizing activations.
+    quant_delay: Number of steps after which weights and activations are
+      quantized during training.
+    freeze_bn_delay: Number of steps after which moving mean and variance are
+      frozen and used instead of batch statistics during training.
+      freeze_bn_delay should be greater than quant_delay and should correspond
+      to when training has almost converged
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
         tf.Operation.
   """
-  return _create_graph(
+
+  _create_graph(
       input_graph=input_graph,
       is_training=True,
-      elements=elements,
-      device_name_or_function=device_name_or_function)
+      weight_bits=weight_bits,
+      activation_bits=activation_bits,
+      quant_delay=quant_delay,
+      freeze_bn_delay=freeze_bn_delay)
+
 
+def experimental_create_eval_graph(input_graph=None,
+                                   weight_bits=8,
+                                   activation_bits=8):
+  """Rewrites an eval input_graph in place for simulated quantization.
 
-def create_eval_graph(input_graph, elements=None, device_name_or_function=None):
-  """Returns a transformed eval input_graph for simulated quantization.
+  This function has additional experimental options not (yet) available to
+  create_eval_graph. The resulting behavior may be undefined.
 
-  The forward pass has fake quantization ops inserted to simulate the error
-  introduced by quantization.
+  The graph has fake quantization ops inserted to simulate the error
+  introduced by quantization. Since the graph is transformed in place,
+  the expected behavior of previously held references to nodes and tensors may
+  change.
 
   Args:
-    input_graph: The tf.Graph to be transformed.
-    elements: (Optional) List of Tensors and Operations in input_graph whose
-        corresponding elements in the new graph will be returned.
-    device_name_or_function: (Optional) The device name or function to use.
+    input_graph: The tf.Graph to be transformed, if None then defaults to the
+      default graph.
+    weight_bits: Number of bits to use for quantizing weights.
+    activation_bits: Number of bits to use for quantizing activations.
+
 
-  Returns:
-    g is new tf.Graph that is rewritten for simulated quantization.
-    l is a list of Tensors/Operations in g corresponding to the provided input
-        elements, if elements is not None.
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
-        tf.Operation.
+      tf.Operation.
   """
-  return _create_graph(
+  _create_graph(
       input_graph=input_graph,
       is_training=False,
-      elements=elements,
-      device_name_or_function=device_name_or_function)
+      weight_bits=weight_bits,
+      activation_bits=activation_bits)
diff --git a/tensorflow/contrib/quantize/python/quantize_graph_test.py b/tensorflow/contrib/quantize/python/quantize_graph_test.py
index 3407ace3914fe2de2506a2952ea5d1bf19028bb9..6b9289ef5f4b847172e1f093a1e4b5b2d3bdab57 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph_test.py
@@ -20,121 +20,211 @@ from __future__ import print_function
 
 from tensorflow.contrib.layers.python.layers import layers
 from tensorflow.contrib.quantize.python import quantize_graph
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
 class QuantizeGraphTest(test_util.TensorFlowTestCase):
-
   # We have a lot of other tests that test the details of the rewrite, here we
   # just the specific features of the quantize_graph API.
-  def testReturnedElementsTraining(self):
-    self._TestReturnElements(True)
-
-  def testReturnedElementsEval(self):
-    self._TestReturnElements(False)
 
-  def _TestReturnElements(self, is_training):
-    graph = ops.Graph()
-    with graph.as_default():
-      a = constant_op.constant(1.0)
-      b = variables.Variable(2.0)
-      c = a + b
-    elements = [a, b, c.op]
-    if is_training:
-      q_graph, returned_elements = quantize_graph.create_training_graph(
-          graph, elements=elements)
-    else:
-      q_graph, returned_elements = quantize_graph.create_eval_graph(
-          graph, elements=elements)
-    # Make sure q_graph is different from graph.
-    self.assertTrue(graph != q_graph)
-    # Check that the returned elements are part of the new graph.
-    for returned_element in returned_elements:
-      self.assertEqual(q_graph, returned_element.graph)
-    # Check that the elements match with the one from the input graph.
-    for element, returned_element in zip(elements, returned_elements):
-      self.assertEqual(element.name, returned_element.name)
-
-  def testNoReturnElementsTraining(self):
-    self._TestNoReturnElements(True)
-
-  def testNoReturnElementsEval(self):
-    self._TestNoReturnElements(False)
-
-  def _TestNoReturnElements(self, is_training):
-    graph = ops.Graph()
-    with graph.as_default():
-      a = constant_op.constant(1.0)
-      b = variables.Variable(2.0)
-      _ = a + b
-    if is_training:
-      q_graph = quantize_graph.create_training_graph(graph)
-    else:
-      q_graph = quantize_graph.create_eval_graph(graph)
-    # Check that quantize_graph didn't return a tuple when elements isn't
-    # provided.
-    self.assertTrue(isinstance(q_graph, ops.Graph))
-    # Make sure q_graph is different from graph.
-    self.assertTrue(graph != q_graph)
-
-  def testDeviceNameTraining(self):
-    self._TestDeviceName(True)
-
-  def testDeviceNameEval(self):
-    self._TestDeviceName(False)
-
-  def _TestDeviceName(self, is_training):
+  def _RunTestOverAllRewrites(self, test_fn):
+    rewrite_fns = [
+        quantize_graph.create_training_graph,
+        quantize_graph.create_eval_graph,
+        quantize_graph.experimental_create_training_graph,
+        quantize_graph.experimental_create_eval_graph,
+    ]
+    for fn in rewrite_fns:
+      test_fn(fn)
+
+  def _RunTestOverTrainingRewrites(self, test_fn):
+    rewrite_fns = [
+        quantize_graph.create_training_graph,
+        quantize_graph.experimental_create_training_graph,
+    ]
+    for fn in rewrite_fns:
+      test_fn(fn)
+
+  def _RunTestOverEvalRewrites(self, test_fn):
+    rewrite_fns = [
+        quantize_graph.create_eval_graph,
+        quantize_graph.experimental_create_eval_graph,
+    ]
+    for fn in rewrite_fns:
+      test_fn(fn)
+
+  def _RunTestOverExperimentalRewrites(self, test_fn):
+    rewrite_fns = [
+        quantize_graph.experimental_create_training_graph,
+        quantize_graph.experimental_create_eval_graph,
+    ]
+    for fn in rewrite_fns:
+      test_fn(fn)
+
+  def testRewrite(self):
+    self._RunTestOverAllRewrites(self._TestRewrite)
+
+  def _TestRewrite(self, rewrite_fn):
     graph = ops.Graph()
     with graph.as_default():
-      batch_size, height, width, depth = 5, 128, 128, 3
-      inputs = array_ops.zeros((batch_size, height, width, depth))
-      conv = layers.conv2d(
-          inputs,
-          32, [5, 5],
-          stride=2,
-          padding='SAME',
-          weights_initializer=self._WeightInit(0.09),
-          activation_fn=None,
-          scope='test')
-      _ = nn_ops.relu6(conv)
-
-    device_name = '/job:oink/task:0/device:CPU:0'
-    if is_training:
-      q_graph = quantize_graph.create_training_graph(
-          graph, device_name_or_function=device_name)
-    else:
-      q_graph = quantize_graph.create_eval_graph(
-          graph, device_name_or_function=device_name)
+      self._ConvLayer()
 
     orig_variable_names = set(
         [v.name for v in graph.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
-    q_variables = q_graph.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    # Ensure that variables were added.
-    self.assertTrue(len(orig_variable_names) < len(q_variables))
-    # All added variables should have the specified device name.
-    for var in q_variables:
-      if var.name not in orig_variable_names:
-        self.assertEqual(var.device, device_name)
-
-  def _WeightInit(self, stddev):
-    """Returns truncated normal variable initializer.
 
-    Function is defined purely to shorten the name so that it stops wrapping.
+    rewrite_fn(graph)
 
-    Args:
-      stddev: Standard deviation of normal variable.
+    q_variables = graph.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    # Ensure that variables were added.
+    self.assertTrue(len(orig_variable_names) < len(q_variables))
 
-    Returns:
-      An initialized that initialzes with a truncated normal variable.
-    """
-    return init_ops.truncated_normal_initializer(stddev=stddev)
+  def testDefaultGraph(self):
+    self._RunTestOverAllRewrites(self._TestRewrite)
+
+  def _TestDefaultGraph(self, rewrite_fn):
+    # Tests that the default graph is correctly used when no args are provided
+    # to rewrite_fn.
+    with ops.Graph().as_default() as g:
+      self._ConvLayer()
+      orig_variable_names = set(
+          [v.name for v in g.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+      rewrite_fn()
+
+      q_variables = g.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+      # Ensure that variables were added.
+      self.assertTrue(len(orig_variable_names) < len(q_variables))
+
+  def testQuantDelay(self):
+    self._RunTestOverTrainingRewrites(self._TestQuantDelay)
+
+  def _TestQuantDelay(self, rewrite_fn):
+    with ops.Graph().as_default() as g:
+      self._ConvLayer()
+      quant_delay = 100
+      rewrite_fn(quant_delay=quant_delay)
+
+    quant_delay_found = False
+    for op in g.get_operations():
+      # Check to see if the quant_delay is correctly set.
+      if 'activate_quant' in op.name and op.type == 'Const':
+        quant_delay_found = True
+        const_value = str(op.get_attr('value'))
+        self.assertTrue(('int64_val: %i' % quant_delay) in const_value)
+    self.assertTrue(quant_delay_found)
+
+  def testWeightBits(self):
+    self._RunTestOverExperimentalRewrites(self._TestWeightBits)
+
+  def _TestWeightBits(self, rewrite_fn):
+    with ops.Graph().as_default() as g:
+      self._ConvLayer()
+      weight_bits = 4
+      rewrite_fn(weight_bits=weight_bits)
+
+    weights_quant_found = False
+    for op in g.get_operations():
+      # Check to see if FakeQuant operations for weights have the right bits
+      # set.
+      if 'weights_quant' in op.name and op.type == 'FakeQuantWithMinMaxVars':
+        weights_quant_found = True
+        self.assertEqual(op.get_attr('num_bits'), weight_bits)
+    self.assertTrue(weights_quant_found)
+
+  def testActivationBits(self):
+    self._RunTestOverExperimentalRewrites(self._TestActivationBits)
+
+  def _TestActivationBits(self, rewrite_fn):
+    with ops.Graph().as_default() as g:
+      self._ConvLayer()
+      activation_bits = 4
+      rewrite_fn(activation_bits=activation_bits)
+
+    act_quant_found = False
+    for op in g.get_operations():
+      # Check to see if FakeQuant operations for activations have the right bits
+      # set.
+      act_quant_names = ['act_quant', 'conv_quant', 'add_quant']
+      if any(s in op.name
+             for s in act_quant_names) and op.type == 'FakeQuantWithMinMaxVars':
+        act_quant_found = True
+        self.assertEqual(op.get_attr('num_bits'), activation_bits)
+    self.assertTrue(act_quant_found)
+
+  def testTrainingQuantization(self):
+    self._RunTestOverTrainingRewrites(self._TestTrainingQuantization)
+
+  def _TestTrainingQuantization(self, rewrite_fn):
+    with ops.Graph().as_default() as g:
+      self._ConvLayer()
+      rewrite_fn()
+
+    # Ensure that FakeQuant and variable update nodes were found.
+    quant_found = False
+    assign_min_last_found = False
+    assign_min_ema_found = False
+    assign_max_last_found = False
+    assign_max_ema_found = False
+    for op in g.get_operations():
+      # Check that FakeQuant operations were added.
+      if op.type == 'FakeQuantWithMinMaxVars':
+        quant_found = True
+      # Check that update operations for the added min max variables exist in
+      # the graph.
+      if 'AssignMinLast' in op.name:
+        assign_min_last_found = True
+      elif 'AssignMinEma' in op.name:
+        assign_min_ema_found = True
+      elif 'AssignMaxLast' in op.name:
+        assign_max_last_found = True
+      elif 'AssignMaxEma' in op.name:
+        assign_max_ema_found = True
+    self.assertTrue(assign_min_last_found)
+    self.assertTrue(assign_min_ema_found)
+    self.assertTrue(assign_max_last_found)
+    self.assertTrue(assign_max_ema_found)
+    self.assertTrue(quant_found)
+
+  def testEvalQuantization(self):
+    self._RunTestOverEvalRewrites(self._TestEvalQuantization)
+
+  def _TestEvalQuantization(self, rewrite_fn):
+    with ops.Graph().as_default() as g:
+      self._ConvLayer()
+      rewrite_fn()
+
+    # Ensure that FakeQuant and variable update nodes were found.
+    quant_found = False
+    for op in g.get_operations():
+      # Check that FakeQuant operations were added.
+      if op.type == 'FakeQuantWithMinMaxVars':
+        quant_found = True
+      # Check that update operations for the added min max variables don't
+      # exist in the graph.
+      update_names = [
+          'AssignMinLast', 'AssignMinEma', 'AssignMaxLast', 'AssignMaxEma'
+      ]
+      self.assertFalse(any(s in op.name for s in update_names))
+    self.assertTrue(quant_found)
+
+  def _ConvLayer(self):
+    """Add a basic convolution layer to the default graph."""
+    batch_size, height, width, depth = 5, 128, 128, 3
+    inputs = array_ops.zeros((batch_size, height, width, depth))
+    weight_init = init_ops.truncated_normal_initializer
+    conv = layers.conv2d(
+        inputs,
+        32, [5, 5],
+        stride=2,
+        padding='SAME',
+        weights_initializer=weight_init(0.09),
+        activation_fn=None,
+        scope='test')
+    _ = nn_ops.relu6(conv)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/quantize/python/quantize_parameterized_test.py b/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
index 57dab03f162629f84adf1d15521b05f4014c4a80..639a7454a92aebd7289c59498cebff82cc003f75 100644
--- a/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
@@ -29,7 +29,6 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import googletest
-from tensorflow.python.training import training
 
 batch_norm = layers.batch_norm
 conv2d = layers.conv2d
@@ -73,8 +72,6 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     """
     graph = ops.Graph()
     with graph.as_default():
-      training.create_global_step(graph)
-
       batch_size, height, width, depth = 5, 128, 128, 3
       inputs = array_ops.zeros((batch_size, height, width, depth))
       stride = 1 if with_bypass else 2
@@ -91,7 +88,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
       with ops.control_dependencies([update_barrier]):
         array_ops.identity(node, name='control_dependency')
 
-      quantize.Quantize(graph, quant_delay=delay)
+      quantize.Quantize(graph, True, quant_delay=delay)
     quantization_node_name = 'FakeQuantWithMinMaxVars'
     weights_quant = graph.get_operation_by_name(scope + '/weights_quant/' +
                                                 quantization_node_name)
@@ -101,7 +98,11 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         scope + '/weights_quant/AssignMaxLast', scope + '/weights/read'
     ]
     self._AssertInputOpsAre(weights_quant, expected_inputs)
-    output_op_name = scope + '/Conv2D'
+    if delay and delay > 0:
+      output_op_name = scope + '/weights_quant/delayed_quant/Switch_1'
+    else:
+      output_op_name = scope + '/Conv2D'
+
     self._AssertOutputGoesToOps(weights_quant, graph, [output_op_name])
 
     if with_bypass:
@@ -148,8 +149,6 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     """
     graph = ops.Graph()
     with graph.as_default():
-      training.create_global_step(graph)
-
       batch_size, depth = 5, 256
       inputs = array_ops.zeros((batch_size, depth))
       out_depth = 256 if with_bypass else 128
@@ -165,7 +164,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
       with ops.control_dependencies([update_barrier]):
         array_ops.identity(node, name='control_dependency')
 
-      quantize.Quantize(graph, quant_delay=delay)
+      quantize.Quantize(graph, True, quant_delay=delay)
 
     quantization_node_name = 'FakeQuantWithMinMaxVars'
     weights_quant = graph.get_operation_by_name(scope + '/weights_quant/' +
@@ -176,7 +175,10 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         scope + '/weights_quant/AssignMaxLast', scope + '/weights/read'
     ]
     self._AssertInputOpsAre(weights_quant, expected_inputs)
-    output_op_name = scope + '/MatMul'
+    if delay and delay > 0:
+      output_op_name = scope + '/weights_quant/delayed_quant/Switch_1'
+    else:
+      output_op_name = scope + '/MatMul'
     self._AssertOutputGoesToOps(weights_quant, graph, [output_op_name])
 
     if with_bypass:
@@ -222,8 +224,6 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     """
     graph = ops.Graph()
     with graph.as_default():
-      training.create_global_step(graph)
-
       batch_size, height, width, depth = 5, 128, 128, 3
       inputs = array_ops.zeros((batch_size, height, width, depth))
       stride = 1 if with_bypass else 2
@@ -240,7 +240,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
       with ops.control_dependencies([update_barrier]):
         array_ops.identity(node, name='control_dependency')
 
-      quantize.Quantize(graph, quant_delay=delay)
+      quantize.Quantize(graph, True, quant_delay=delay)
 
     quantization_node_name = 'FakeQuantWithMinMaxVars'
     weights_quant = graph.get_operation_by_name(scope + '/weights_quant/' +
@@ -252,7 +252,10 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         scope + '/depthwise_weights/read'
     ]
     self._AssertInputOpsAre(weights_quant, expected_inputs)
-    output_op_name = scope + '/depthwise'
+    if delay and delay > 0:
+      output_op_name = scope + '/weights_quant/delayed_quant/Switch_1'
+    else:
+      output_op_name = scope + '/depthwise'
     self._AssertOutputGoesToOps(weights_quant, graph, [output_op_name])
 
     if with_bypass:
@@ -316,40 +319,11 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     for params in parameters_list:
       test_fn(params[0], params[1], params[2], params[3], params[4])
 
-  def _TestQuantize_Conv2dWithBatchNorm(self, activation, activation_op_name,
-                                        with_bypass, delay, fused_batch_norm):
-    """Tests quantization: inputs -> Conv2d with batch norm -> Activation.
-
-    Args:
-      activation: Callable that returns an Operation, a factory method for the
-        Activation.
-      activation_op_name: String, name of the Activation operation.
-      with_bypass: Bool, when true there is an extra connection added from
-        inputs to just before Activation.
-      delay: Int (optional), delay in number of steps until quantization starts.
-      fused_batch_norm: Bool, when true use FusedBatchNorm.
-    """
-    self._testQuantize_Conv2dWithBatchNorm(
-        activation,
-        activation_op_name,
-        with_bypass,
-        delay,
-        fused_batch_norm,
-        use_ema=True)
-    self._testQuantize_Conv2dWithBatchNorm(
-        activation,
-        activation_op_name,
-        with_bypass,
-        delay,
-        fused_batch_norm,
-        use_ema=False)
-
   def testQuantize_Conv2dWithBatchNorm(self):
     self._RunBatchNormTestOverParameters(self._TestQuantize_Conv2dWithBatchNorm)
 
-  def _testQuantize_Conv2dWithBatchNorm(self, activation, activation_op_name,
-                                        with_bypass, delay, fused_batch_norm,
-                                        use_ema):
+  def _TestQuantize_Conv2dWithBatchNorm(self, activation, activation_op_name,
+                                        with_bypass, delay, fused_batch_norm):
     """Tests quantization: inputs -> Conv2d with batch norm -> Activation.
 
     Args:
@@ -360,12 +334,9 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         inputs to just before Activation.
       delay: Int (optional), delay in number of steps until quantization starts.
       fused_batch_norm: Bool, when true use FusedBatchNorm.
-      use_ema: Bool, when true uses EMA quantization for BN folded weights.
     """
     graph = ops.Graph()
     with graph.as_default():
-      training.create_global_step(graph)
-
       batch_size, height, width, depth = 5, 128, 128, 3
       inputs = array_ops.zeros((batch_size, height, width, depth))
       stride = 1 if with_bypass else 2
@@ -392,25 +363,21 @@ class QuantizeTest(test_util.TensorFlowTestCase):
       with ops.control_dependencies([update_barrier]):
         array_ops.identity(node, name='control_dependency')
 
-      fold_batch_norms.FoldBatchNorms(graph)
+      fold_batch_norms.FoldBatchNorms(graph, is_training=True)
 
-      quantize.Quantize(
-          graph, quant_delay=delay, quantize_folded_weights_use_ema=use_ema)
+      quantize.Quantize(graph, True, quant_delay=delay)
 
     quantization_node_name = 'FakeQuantWithMinMaxVars'
     weights_quant = graph.get_operation_by_name(scope + '/weights_quant/' +
                                                 quantization_node_name)
     self.assertEqual(weights_quant.type, quantization_node_name)
     expected_inputs = [
-        scope + '/weights_quant/' + ('AssignMinEma'
-                                     if use_ema else 'AssignMinLast'),
-        scope + '/weights_quant/' + ('AssignMaxEma'
-                                     if use_ema else 'AssignMaxLast'),
-        scope + '/mul_fold'
+        scope + '/weights_quant/' + 'AssignMinLast',
+        scope + '/weights_quant/' + 'AssignMaxLast', scope + '/mul_fold'
     ]
     self._AssertInputOpsAre(weights_quant, expected_inputs)
     output_op_name = scope + ('/weights_quant/delayed_quant/Switch_1'
-                              if (delay and use_ema) else '/Conv2D_Fold')
+                              if delay else '/Conv2D_Fold')
     self._AssertOutputGoesToOps(weights_quant, graph, [output_op_name])
 
     if with_bypass:
@@ -438,40 +405,11 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                       if delay else 'control_dependency')
     self._AssertOutputGoesToOps(act_quant, graph, [output_op_name])
 
-  def _TestQuantize_FCWithBatchNorm(self, activation, activation_op_name,
-                                    with_bypass, delay, fused_batch_norm):
-    """Tests quantization: inputs -> FC with batch norm -> Activation.
-
-    Args:
-      activation: Callable that returns an Operation, a factory method for the
-        Activation.
-      activation_op_name: String, name of the Activation operation.
-      with_bypass: Bool, when true there is an extra connection added from
-        inputs to just before Activation.
-      delay: Int (optional), delay in number of steps until quantization starts.
-      fused_batch_norm: Bool, when true use FusedBatchNorm.
-    """
-    self._testQuantize_FCWithBatchNorm(
-        activation,
-        activation_op_name,
-        with_bypass,
-        delay,
-        fused_batch_norm,
-        use_ema=True)
-    self._testQuantize_FCWithBatchNorm(
-        activation,
-        activation_op_name,
-        with_bypass,
-        delay,
-        fused_batch_norm,
-        use_ema=False)
-
   def testQuantize_FCWithBatchNorm(self):
     self._RunBatchNormTestOverParameters(self._TestQuantize_FCWithBatchNorm)
 
-  def _testQuantize_FCWithBatchNorm(self, activation, activation_op_name,
-                                    with_bypass, delay, fused_batch_norm,
-                                    use_ema):
+  def _TestQuantize_FCWithBatchNorm(self, activation, activation_op_name,
+                                    with_bypass, delay, fused_batch_norm):
     """Tests quantization: inputs -> FC with batch norm -> Activation.
 
     Args:
@@ -482,12 +420,9 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         inputs to just before Activation.
       delay: Int (optional), delay in number of steps until quantization starts.
       fused_batch_norm: Bool, when true use FusedBatchNorm.
-      use_ema: Bool, when true uses EMA quantization for BN folded weights.
     """
     graph = ops.Graph()
     with graph.as_default():
-      training.create_global_step(graph)
-
       batch_size, depth = 5, 256
       inputs = array_ops.zeros((batch_size, depth))
       out_depth = 256 if with_bypass else 128
@@ -511,25 +446,21 @@ class QuantizeTest(test_util.TensorFlowTestCase):
       with ops.control_dependencies([update_barrier]):
         array_ops.identity(node, name='control_dependency')
 
-      fold_batch_norms.FoldBatchNorms(graph)
+      fold_batch_norms.FoldBatchNorms(graph, is_training=True)
 
-      quantize.Quantize(
-          graph, quant_delay=delay, quantize_folded_weights_use_ema=use_ema)
+      quantize.Quantize(graph, True, quant_delay=delay)
 
     quantization_node_name = 'FakeQuantWithMinMaxVars'
     weights_quant = graph.get_operation_by_name(scope + '/weights_quant/' +
                                                 quantization_node_name)
     self.assertEqual(weights_quant.type, quantization_node_name)
     expected_inputs = [
-        scope + '/weights_quant/' + ('AssignMinEma'
-                                     if use_ema else 'AssignMinLast'),
-        scope + '/weights_quant/' + ('AssignMaxEma'
-                                     if use_ema else 'AssignMaxLast'),
-        scope + '/mul_fold'
+        scope + '/weights_quant/' + 'AssignMinLast',
+        scope + '/weights_quant/' + 'AssignMaxLast', scope + '/mul_fold'
     ]
     self._AssertInputOpsAre(weights_quant, expected_inputs)
     output_op_name = scope + ('/weights_quant/delayed_quant/Switch_1'
-                              if delay and use_ema else '/MatMul_Fold')
+                              if delay else '/MatMul_Fold')
     self._AssertOutputGoesToOps(weights_quant, graph, [output_op_name])
 
     if with_bypass:
@@ -557,42 +488,13 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                       if delay else 'control_dependency')
     self._AssertOutputGoesToOps(act_quant, graph, [output_op_name])
 
-  def _TestQuantize_DepthwiseConv2dWithBatchNorm(
-      self, activation, activation_op_name, with_bypass, delay,
-      fused_batch_norm):
-    """Tests quantization: inputs -> DWConv2d with batch norm -> Activation.
-
-    Args:
-      activation: Callable that returns an Operation, a factory method for the
-        Activation.
-      activation_op_name: String, name of the Activation operation.
-      with_bypass: Bool, when true there is an extra connection added from
-        inputs to just before Activation.
-      delay: Int (optional), delay in number of steps until quantization starts.
-      fused_batch_norm: Bool, when true use FusedBatchNorm.
-    """
-    self._testQuantize_DepthwiseConv2dWithBatchNorm(
-        activation,
-        activation_op_name,
-        with_bypass,
-        delay,
-        fused_batch_norm,
-        use_ema=True)
-    self._testQuantize_DepthwiseConv2dWithBatchNorm(
-        activation,
-        activation_op_name,
-        with_bypass,
-        delay,
-        fused_batch_norm,
-        use_ema=False)
-
   def testQuantize_DepthwiseConv2dWithBatchNorm(self):
     self._RunBatchNormTestOverParameters(
         self._TestQuantize_DepthwiseConv2dWithBatchNorm)
 
-  def _testQuantize_DepthwiseConv2dWithBatchNorm(
+  def _TestQuantize_DepthwiseConv2dWithBatchNorm(
       self, activation, activation_op_name, with_bypass, delay,
-      fused_batch_norm, use_ema):
+      fused_batch_norm):
     """Tests quantization: inputs -> DWConv2d with batch norm -> Activation.
 
     Args:
@@ -603,12 +505,9 @@ class QuantizeTest(test_util.TensorFlowTestCase):
         inputs to just before Activation.
       delay: Int (optional), delay in number of steps until quantization starts.
       fused_batch_norm: Bool, when true use FusedBatchNorm.
-      use_ema: Bool, when true uses EMA quantization for BN folded weights.
     """
     graph = ops.Graph()
     with graph.as_default():
-      training.create_global_step(graph)
-
       batch_size, height, width, depth = 5, 128, 128, 3
       inputs = array_ops.zeros((batch_size, height, width, depth))
       stride = 1 if with_bypass else 2
@@ -635,24 +534,20 @@ class QuantizeTest(test_util.TensorFlowTestCase):
       with ops.control_dependencies([update_barrier]):
         array_ops.identity(node, name='control_dependency')
 
-      fold_batch_norms.FoldBatchNorms(graph)
+      fold_batch_norms.FoldBatchNorms(graph, is_training=True)
 
-      quantize.Quantize(
-          graph, quant_delay=delay, quantize_folded_weights_use_ema=use_ema)
+      quantize.Quantize(graph, True, quant_delay=delay)
     quantization_node_name = 'FakeQuantWithMinMaxVars'
     weights_quant = graph.get_operation_by_name(scope + '/weights_quant/' +
                                                 quantization_node_name)
     self.assertEqual(weights_quant.type, quantization_node_name)
     expected_inputs = [
-        scope + '/weights_quant/' + ('AssignMinEma'
-                                     if use_ema else 'AssignMinLast'),
-        scope + '/weights_quant/' + ('AssignMaxEma'
-                                     if use_ema else 'AssignMaxLast'),
-        scope + '/mul_fold'
+        scope + '/weights_quant/' + 'AssignMinLast',
+        scope + '/weights_quant/' + 'AssignMaxLast', scope + '/mul_fold'
     ]
     self._AssertInputOpsAre(weights_quant, expected_inputs)
     output_op_name = scope + ('/weights_quant/delayed_quant/Switch_1'
-                              if delay and use_ema else '/depthwise_Fold')
+                              if delay else '/depthwise_Fold')
     self._AssertOutputGoesToOps(weights_quant, graph, [output_op_name])
 
     if with_bypass:
diff --git a/tensorflow/contrib/quantize/python/quantize_test.py b/tensorflow/contrib/quantize/python/quantize_test.py
index 1e4dd7cf67dbfbd16386fd740c7dcc83e05ad82a..bb7be0809421b64a019e73f00aac6c58524222e8 100644
--- a/tensorflow/contrib/quantize/python/quantize_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_test.py
@@ -35,7 +35,15 @@ separable_conv2d = layers.separable_conv2d
 
 class QuantizeTest(test_util.TensorFlowTestCase):
 
+  def _RunTestOverParameters(self, test_fn):
+    params = [True, False]
+    for is_training in params:
+      test_fn(is_training)
+
   def testInsertQuantOpFailsWhenOpsNotConnected(self):
+    pass
+
+  def _TestInsertQuantOpFailsWhenOpsNotConnected(self, is_training):
     graph = ops.Graph()
     with graph.as_default():
       batch_size, height, width, depth = 5, 128, 128, 3
@@ -45,17 +53,18 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                     activation_fn=None, scope='test')
       relu = nn_ops.relu6(inputs)
 
-    context = quantize._QuantizeContext(graph=graph, weight_bits=8,
-                                        weight_narrow_range=True,
-                                        activation_bits=8)
     # Inserting a quantization op between two unconnected ops should fail with
     # ValueError.
     with self.assertRaises(ValueError) as err:
-      context._InsertQuantOp('test', conv.op, [relu.op], 'FailingQuantOp')
+      quantize._InsertQuantOp('test', is_training, conv.op, [relu.op],
+                              'FailingQuantOp')
     self.assertEqual(
         str(err.exception), 'Some inputs not quantized for ops: [Relu6]')
 
   def testInsertQuantOpForAddAfterConv2d(self):
+    self._RunTestOverParameters(self._TestInsertQuantOpForAddAfterConv2d)
+
+  def _TestInsertQuantOpForAddAfterConv2d(self, is_training):
     graph = ops.Graph()
     with graph.as_default():
       batch_size, height, width, depth = 5, 128, 128, 3
@@ -70,8 +79,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
       with ops.control_dependencies([update_barrier]):
         array_ops.identity(node, name='control_dependency')
 
-    quantize.Quantize(graph=graph, weight_bits=8, weight_narrow_range=True,
-                      activation_bits=8)
+    quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
 
     quantization_node_name = 'FakeQuantWithMinMaxVars'
     add_quant = graph.get_operation_by_name('test/add_quant/' +
@@ -79,6 +87,10 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     self.assertEqual(add_quant.type, quantization_node_name)
 
   def testInsertQuantOpForAddAfterSeparableConv2d(self):
+    self._RunTestOverParameters(
+        self._TestInsertQuantOpForAddAfterSeparableConv2d)
+
+  def _TestInsertQuantOpForAddAfterSeparableConv2d(self, is_training):
     graph = ops.Graph()
     with graph.as_default():
       batch_size, height, width, depth = 5, 128, 128, 3
@@ -94,8 +106,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
       with ops.control_dependencies([update_barrier]):
         array_ops.identity(node, name='control_dependency')
 
-    quantize.Quantize(graph=graph, weight_bits=8, weight_narrow_range=True,
-                      activation_bits=8)
+    quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
 
     quantization_node_name = 'FakeQuantWithMinMaxVars'
     add_quant = graph.get_operation_by_name('test/add_quant/' +
diff --git a/tensorflow/contrib/receptive_field/BUILD b/tensorflow/contrib/receptive_field/BUILD
index d16b2908a0285e04ef5d3ede2050bf24c508228d..e975aeaea7ee78f8e912be8ab1be61b9acc7b418 100644
--- a/tensorflow/contrib/receptive_field/BUILD
+++ b/tensorflow/contrib/receptive_field/BUILD
@@ -15,7 +15,6 @@ load("//tensorflow:tensorflow.bzl", "py_test")
 py_library(
     name = "receptive_field_pip",
     deps = [
-        ":graph_compute_order_py",
         ":receptive_field_py",
     ],
 )
@@ -23,28 +22,75 @@ py_library(
 py_library(
     name = "graph_compute_order_py",
     srcs = [
-        "__init__.py",
         "python/util/graph_compute_order.py",
     ],
     srcs_version = "PY2AND3",
+    deps = [
+        ":parse_layer_parameters_py",
+        "//tensorflow/python:platform",
+    ],
+)
+
+py_library(
+    name = "parse_layer_parameters_py",
+    srcs = [
+        "python/util/parse_layer_parameters.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:platform",
+    ],
 )
 
 py_library(
     name = "receptive_field_py",
     srcs = [
-        "__init__.py",
+        "python/util/parse_layer_parameters.py",
         "python/util/receptive_field.py",
+        "receptive_field_api.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
         ":graph_compute_order_py",
-        "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:platform",
         "//third_party/py/numpy",
     ],
 )
 
+py_test(
+    name = "graph_compute_order_test",
+    srcs = ["python/util/graph_compute_order_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":graph_compute_order_py",
+        ":receptive_field_py",
+        "//tensorflow/contrib/slim",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:nn",
+    ],
+)
+
+py_test(
+    name = "parse_layer_parameters_test",
+    srcs = ["python/util/parse_layer_parameters_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":graph_compute_order_py",
+        ":parse_layer_parameters_py",
+        "//tensorflow/contrib/slim",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:nn",
+    ],
+)
+
 py_test(
     name = "receptive_field_test",
     srcs = ["python/util/receptive_field_test.py"],
diff --git a/tensorflow/contrib/receptive_field/README.md b/tensorflow/contrib/receptive_field/README.md
index dfe53cdf14299b055fa4cdc13008d88234e93aa8..3ff85faf611afad71b6e6203453bbe97c56f9242 100644
--- a/tensorflow/contrib/receptive_field/README.md
+++ b/tensorflow/contrib/receptive_field/README.md
@@ -17,7 +17,6 @@ For example, if your model is constructed using the function
 
 ```python
 import tensorflow as tf
-from tensorflow.contrib import receptive_field
 
 # Construct graph.
 g = tf.Graph()
@@ -27,7 +26,7 @@ with g.as_default():
 
 # Compute receptive field parameters.
 rf_x, rf_y, eff_stride_x, eff_stride_y, eff_pad_x, eff_pad_y = \
-  receptive_field.compute_receptive_field_from_graph_def( \
+  tf.contrib.receptive_field.compute_receptive_field_from_graph_def( \
     g.as_graph_def(), 'input_image', 'my_output_endpoint')
 ```
 
@@ -47,7 +46,6 @@ You can then compute the receptive field parameters for Inception-Resnet-v2 as:
 ```python
 from nets import inception
 import tensorflow as tf
-from tensorflow.contrib import receptive_field
 
 # Construct graph.
 g = tf.Graph()
@@ -57,7 +55,7 @@ with g.as_default():
 
 # Compute receptive field parameters.
 rf_x, rf_y, eff_stride_x, eff_stride_y, eff_pad_x, eff_pad_y = \
-  receptive_field.compute_receptive_field_from_graph_def( \
+  tf.contrib.receptive_field.compute_receptive_field_from_graph_def( \
     g.as_graph_def(), 'input_image', 'InceptionResnetV2/Conv2d_7b_1x1/Relu')
 ```
 
diff --git a/tensorflow/contrib/receptive_field/python/util/examples/compute_rf.py b/tensorflow/contrib/receptive_field/python/util/examples/compute_rf.py
index 1cf978b90a3661a075130790d82a499da4d8a0cc..d6fdd12bbe37fb0e0cb12f1d0adc3fce29b19e8a 100644
--- a/tensorflow/contrib/receptive_field/python/util/examples/compute_rf.py
+++ b/tensorflow/contrib/receptive_field/python/util/examples/compute_rf.py
@@ -26,7 +26,7 @@ import sys
 
 from google.protobuf import text_format
 
-from tensorflow.contrib import receptive_field
+from tensorflow.contrib.receptive_field import receptive_field_api as receptive_field
 from tensorflow.core.framework import graph_pb2
 from tensorflow.python.platform import app
 from tensorflow.python.platform import gfile
diff --git a/tensorflow/contrib/receptive_field/python/util/examples/rf_benchmark.py b/tensorflow/contrib/receptive_field/python/util/examples/rf_benchmark.py
index 94228dfa61b1de617f131611173fda7c3917d250..a298b4d49038468299b58140758c69675368e855 100644
--- a/tensorflow/contrib/receptive_field/python/util/examples/rf_benchmark.py
+++ b/tensorflow/contrib/receptive_field/python/util/examples/rf_benchmark.py
@@ -28,19 +28,19 @@ import argparse
 import csv
 import sys
 
-from nets import alexnet
-from nets import inception
-from nets import mobilenet_v1
-from nets import resnet_v1
-from nets import resnet_v2
-from nets import vgg
 from tensorflow.contrib import framework
-from tensorflow.contrib import receptive_field
 from tensorflow.contrib import slim
+from tensorflow.contrib.receptive_field import receptive_field_api as receptive_field
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import app
+from nets import alexnet
+from nets import inception
+from nets import mobilenet_v1
+from nets import resnet_v1
+from nets import resnet_v2
+from nets import vgg
 
 cmd_args = None
 
@@ -245,7 +245,8 @@ def _model_rf(graphdef,
               end_points,
               desired_end_point_keys,
               model_type='resnet_v1_50',
-              csv_writer=None):
+              csv_writer=None,
+              input_resolution=None):
   """Computes receptive field information for a given CNN model.
 
   The information will be printed to stdout. If the RF parameters are the same
@@ -261,45 +262,93 @@ def _model_rf(graphdef,
       information will be computed.
     model_type: Type of model to be used, used only for printing purposes.
     csv_writer: A CSV writer for RF parameters, which is used if it is not None.
+    input_resolution: Input resolution to use when computing RF
+      parameters. This is important for the case where padding can only be
+      defined if the input resolution is known, which may happen if using SAME
+      padding. This is assumed the resolution for both height and width. If
+      None, we consider the resolution is unknown.
   """
   for desired_end_point_key in desired_end_point_keys:
     print('- %s:' % desired_end_point_key)
     output_node_with_colon = end_points[desired_end_point_key].name
     pos = output_node_with_colon.rfind(':')
     output_node = output_node_with_colon[:pos]
-    (receptive_field_x, receptive_field_y, effective_stride_x,
-     effective_stride_y, effective_padding_x, effective_padding_y
-    ) = receptive_field.compute_receptive_field_from_graph_def(
-        graphdef, _INPUT_NODE, output_node)
-    # If values are the same in horizontal/vertical directions, just report one
-    # of them. Otherwise, report both.
-    if (receptive_field_x == receptive_field_y) and (
-        effective_stride_x == effective_stride_y) and (
-            effective_padding_x == effective_padding_y):
-      print('Receptive field size = %5s, effective stride = %5s, effective '
-            'padding = %5s' % (str(receptive_field_x), str(effective_stride_x),
-                               str(effective_padding_x)))
-    else:
-      print('Receptive field size: horizontal = %5s, vertical = %5s. '
-            'Effective stride: horizontal = %5s, vertical = %5s. Effective '
-            'padding: horizontal = %5s, vertical = %5s' %
-            (str(receptive_field_x), str(receptive_field_y),
-             str(effective_stride_x), str(effective_stride_y),
-             str(effective_padding_x), str(effective_padding_y)))
-    if csv_writer is not None:
-      csv_writer.writerow({
-          'CNN': model_type,
-          'end_point': desired_end_point_key,
-          'RF size hor': str(receptive_field_x),
-          'RF size ver': str(receptive_field_y),
-          'effective stride hor': str(effective_stride_x),
-          'effective stride ver': str(effective_stride_y),
-          'effective padding hor': str(effective_padding_x),
-          'effective padding ver': str(effective_padding_y)
-      })
-
-
-def _process_model_rf(model_type='resnet_v1_50', csv_writer=None, arg_sc=None):
+    try:
+      (receptive_field_x, receptive_field_y, effective_stride_x,
+       effective_stride_y, effective_padding_x, effective_padding_y
+      ) = receptive_field.compute_receptive_field_from_graph_def(
+          graphdef, _INPUT_NODE, output_node, input_resolution=input_resolution)
+      # If values are the same in horizontal/vertical directions, just report
+      # one of them. Otherwise, report both.
+      if (receptive_field_x == receptive_field_y) and (
+          effective_stride_x == effective_stride_y) and (
+              effective_padding_x == effective_padding_y):
+        print('Receptive field size = %5s, effective stride = %5s, effective '
+              'padding = %5s' % (str(receptive_field_x),
+                                 str(effective_stride_x),
+                                 str(effective_padding_x)))
+      else:
+        print('Receptive field size: horizontal = %5s, vertical = %5s. '
+              'Effective stride: horizontal = %5s, vertical = %5s. Effective '
+              'padding: horizontal = %5s, vertical = %5s' %
+              (str(receptive_field_x), str(receptive_field_y),
+               str(effective_stride_x), str(effective_stride_y),
+               str(effective_padding_x), str(effective_padding_y)))
+      if csv_writer is not None:
+        csv_writer.writerow({
+            'CNN':
+                model_type,
+            'input resolution':
+                str(input_resolution[0])
+                if input_resolution is not None else 'None',
+            'end_point':
+                desired_end_point_key,
+            'RF size hor':
+                str(receptive_field_x),
+            'RF size ver':
+                str(receptive_field_y),
+            'effective stride hor':
+                str(effective_stride_x),
+            'effective stride ver':
+                str(effective_stride_y),
+            'effective padding hor':
+                str(effective_padding_x),
+            'effective padding ver':
+                str(effective_padding_y)
+        })
+    except ValueError as e:
+      print('---->ERROR: Computing RF parameters for model %s with final end '
+            'point %s and input resolution %s did not work' %
+            (model_type, desired_end_point_key, input_resolution))
+      print('---->The returned error is: %s' % e)
+      if csv_writer is not None:
+        csv_writer.writerow({
+            'CNN':
+                model_type,
+            'input resolution':
+                str(input_resolution[0])
+                if input_resolution is not None else 'None',
+            'end_point':
+                desired_end_point_key,
+            'RF size hor':
+                'None',
+            'RF size ver':
+                'None',
+            'effective stride hor':
+                'None',
+            'effective stride ver':
+                'None',
+            'effective padding hor':
+                'None',
+            'effective padding ver':
+                'None'
+        })
+
+
+def _process_model_rf(model_type='resnet_v1_50',
+                      csv_writer=None,
+                      arg_sc=None,
+                      input_resolutions=None):
   """Contructs model graph and desired end-points, and compute RF.
 
   The computed RF parameters are printed to stdout by the _model_rf function.
@@ -308,13 +357,30 @@ def _process_model_rf(model_type='resnet_v1_50', csv_writer=None, arg_sc=None):
     model_type: Type of model to be used.
     csv_writer: A CSV writer for RF parameters, which is used if it is not None.
     arg_sc: Optional arg scope to use in constructing the graph.
+    input_resolutions: List of 1D input resolutions to use when computing RF
+      parameters. This is important for the case where padding can only be
+      defined if the input resolution is known, which may happen if using SAME
+      padding. The entries in the list are assumed the resolution for both
+      height and width. If one of the elements in the list is None, we consider
+      it to mean that the resolution is unknown. If the list itself is None,
+      we use the default list [None, 224, 321].
 
   """
-  print('********************%s' % model_type)
-  graphdef, end_points = _model_graph_def(model_type, arg_sc)
-  desired_end_point_keys = _get_desired_end_point_keys(model_type)
-  _model_rf(graphdef, end_points, desired_end_point_keys, model_type,
-            csv_writer)
+  # Process default value for this list.
+  if input_resolutions is None:
+    input_resolutions = [None, 224, 321]
+
+  for n in input_resolutions:
+    print('********************%s, input resolution = %s' % (model_type, n))
+    graphdef, end_points = _model_graph_def(model_type, arg_sc)
+    desired_end_point_keys = _get_desired_end_point_keys(model_type)
+    _model_rf(
+        graphdef,
+        end_points,
+        desired_end_point_keys,
+        model_type,
+        csv_writer,
+        input_resolution=[n, n] if n is not None else None)
 
 
 def _resnet_rf(csv_writer=None):
@@ -421,7 +487,7 @@ def main(unused_argv):
   if cmd_args.csv_path:
     csv_file = open(cmd_args.csv_path, 'w')
     field_names = [
-        'CNN', 'end_point', 'RF size hor', 'RF size ver',
+        'CNN', 'input resolution', 'end_point', 'RF size hor', 'RF size ver',
         'effective stride hor', 'effective stride ver', 'effective padding hor',
         'effective padding ver'
     ]
diff --git a/tensorflow/contrib/receptive_field/python/util/examples/write_inception_resnet_v2_graph.py b/tensorflow/contrib/receptive_field/python/util/examples/write_inception_resnet_v2_graph.py
index 793ae163d807fdda62c2025cb8176b96832cb61a..a494883396614bcee04886b5c05d0393df958580 100644
--- a/tensorflow/contrib/receptive_field/python/util/examples/write_inception_resnet_v2_graph.py
+++ b/tensorflow/contrib/receptive_field/python/util/examples/write_inception_resnet_v2_graph.py
@@ -22,12 +22,12 @@ from __future__ import print_function
 import argparse
 import sys
 
-from nets import inception
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import graph_io
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import app
+from nets import inception
 
 cmd_args = None
 
diff --git a/tensorflow/contrib/receptive_field/python/util/graph_compute_order.py b/tensorflow/contrib/receptive_field/python/util/graph_compute_order.py
index 8af4be16d6c17286287713a1fb6f5017355e3b32..0388079f20dee0a6b249d568e2c51d1407d7466f 100644
--- a/tensorflow/contrib/receptive_field/python/util/graph_compute_order.py
+++ b/tensorflow/contrib/receptive_field/python/util/graph_compute_order.py
@@ -20,69 +20,173 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import math
+from tensorflow.contrib.receptive_field.python.util import parse_layer_parameters
+from tensorflow.python.platform import tf_logging as logging
 
 
-class GraphDefHelper(object):
-  """Helper class to collect node names and definitions.
+def parse_graph_nodes(graph_def):
+  """Helper function to parse GraphDef's nodes.
 
-  Example:
-    b = GraphDefHelper(graph_def)
-    # Prints node that produces given output.
-    print b.output_of['conv/foo/bar']
+  It returns a dict mapping from node name to NodeDef.
+
+  Args:
+    graph_def: A GraphDef object.
+
+  Returns:
+    name_to_node: Dict keyed by node name, each entry containing the node's
+      NodeDef.
   """
+  name_to_node = {}
+  for node_def in graph_def.node:
+    name_to_node[node_def.name] = node_def
+  return name_to_node
 
-  def __init__(self, gd):
-    self.output_of = {}
-    for each in gd.node:
-      self.output_of[each.name] = each
 
+# Named tuple used to collect information from each node in a computation graph.
+_node_info = collections.namedtuple(
+    'NodeInfo', field_names=['order', 'node', 'input_size', 'output_size'])
 
-# pylint: disable=invalid-name
-_NodeEntry = collections.namedtuple('NodeEntry', field_names=['order', 'node'])
 
+def _compute_output_resolution(input_spatial_resolution, kernel_size, stride,
+                               total_padding):
+  """Computes output resolution, given input resolution and layer parameters.
 
-def _get_computed_nodes(g, output, seen):
-  """Traverses the graph in topological order.
+  Note that this computation is done only over one dimension (eg, x or y).
+  If any of the inputs is None, returns None.
+
+  Args:
+    input_spatial_resolution: Input spatial resolution (int).
+    kernel_size: Kernel size (int).
+    stride: Stride (int).
+    total_padding: Total padding to be applied (int).
+  Returns:
+    output_resolution: Output dimension (int) or None.
+  """
+  if (input_spatial_resolution is None) or (kernel_size is None) or (
+      stride is None) or (total_padding is None):
+    return None
+  return int(
+      math.ceil((
+          input_spatial_resolution + total_padding - kernel_size + 1) / stride))
+
+
+def _get_computed_nodes(name_to_node,
+                        current,
+                        node_info,
+                        input_node_name='',
+                        input_node_size=None):
+  """Traverses the graph recursively to compute its topological order.
+
+  Optionally, the function may also compute the input and output feature map
+  resolutions at each node. In this case, input_node_name and input_node_size
+  must be set. Note that if a node's op type is unknown, the input and output
+  resolutions are ignored and set to None.
 
   Args:
-    g: GraphDefHelper object.
-    output: current node.
-    seen: map of nodes we've already traversed.
+    name_to_node: Dict keyed by node name, each entry containing the node's
+      NodeDef.
+    current: Current node name.
+    node_info: Map of nodes we've already traversed, containing their _node_info
+      information.
+    input_node_name: Name of node with fixed input resolution (optional).
+    input_node_size: Fixed input resolution to use (optional).
   Returns:
-    order in topological sort for 'output'.
+    order: Order in topological sort for 'current'.
+    input_size: Tensor spatial resolution at input of current node.
+    output_size: Tensor spatial resolution at output of current node.
   """
-  if output in seen:
-    return seen[output].order
-  node_def = g.output_of.get(output, None)
-  if node_def is None:
-    seen[output] = _NodeEntry(0, None)
-    return 0
-
-  r = 0
+  if current in node_info:
+    return (node_info[current].order, node_info[current].input_size,
+            node_info[current].output_size)
+
+  node_def = name_to_node[current]
+
+  if current == input_node_name:
+    order = 0
+    input_size = None
+    output_size = input_node_size
+    node_info[current] = _node_info(order, node_def, input_size, output_size)
+    return (order, input_size, output_size)
+
+  input_size = None
+  output_size = None
+
+  order = 0
+  number_inputs = 0
   for each in node_def.input:
     # Parses name of input node.
     if each.startswith('^'):
-      each = each[1:]
+      # The character '^' denotes a control dependency, so this input node can
+      # be safely ignored.
+      continue
     each = each.split(':')[0]
     # Recursively computes ordering.
-    new_v = _get_computed_nodes(g, each, seen)
-    r = max(r, new_v + 1)
-
-  seen[output] = _NodeEntry(r, node_def)
-
-  return seen[output].order
-
-
-def get_compute_order(graph_def):
-  """Computes order of computation for a given graph.
+    (parent_order, _, parent_output_size) = _get_computed_nodes(
+        name_to_node, each, node_info, input_node_name, input_node_size)
+    order = max(order, parent_order + 1)
+    if number_inputs == 0:
+      # For all the types of nodes we consider, the first input corresponds to
+      # the feature map.
+      input_size = parent_output_size
+    number_inputs += 1
+
+  # Figure out output size for this layer.
+  logging.vlog(3, 'input_size = %s', input_size)
+  if input_size is None:
+    output_size = None
+  else:
+    (kernel_size_x, kernel_size_y, stride_x, stride_y, _, _, total_padding_x,
+     total_padding_y) = (
+         parse_layer_parameters.get_layer_params(
+             node_def, name_to_node, input_size, force=True))
+    logging.vlog(3, 'kernel_size_x = %s, kernel_size_y = %s, '
+                 'stride_x = %s, stride_y = %s, '
+                 'total_padding_x = %s, total_padding_y = %s' %
+                 (kernel_size_x, kernel_size_y, stride_x, stride_y,
+                  total_padding_x, total_padding_y))
+    output_size = [None] * 2
+    output_size[0] = _compute_output_resolution(input_size[0], kernel_size_x,
+                                                stride_x, total_padding_x)
+    output_size[1] = _compute_output_resolution(input_size[1], kernel_size_y,
+                                                stride_y, total_padding_y)
+
+  logging.vlog(3, 'output_size = %s', output_size)
+  node_info[current] = _node_info(order, node_def, input_size, output_size)
+
+  return order, input_size, output_size
+
+
+def get_compute_order(graph_def, input_node_name='', input_node_size=None):
+  """Computes order of computation for a given CNN graph.
+
+  Optionally, the function may also compute the input and output feature map
+  resolutions at each node. In this case, input_node_name and input_node_size
+  must be set. Note that if a node's op type is unknown, the input and output
+  resolutions are ignored and set to None.
 
   Args:
     graph_def: GraphDef object.
+    input_node_name: Name of node with fixed input resolution (optional). This
+      is usually the node name for the input image in a CNN.
+    input_node_size: 2D list of integers, fixed input resolution to use
+      (optional). This is usually the input resolution used for the input image
+      in a CNN (common examples are: [224, 224], [299, 299], [321, 321]).
   Returns:
-    map: name -> {order, node}
+    node_info: Default dict keyed by node name, mapping to a named tuple with
+      the following fields:
+      - order: Integer denoting topological order;
+      - node: NodeDef for the given node;
+      - input_size: 2D list of integers, denoting the input spatial resolution
+        to the node;
+      - output_size: 2D list of integers, denoting the output spatial resolution
+        of the node.
+    name_to_node: Dict keyed by node name, each entry containing the node's
+      NodeDef.
   """
-  helper = GraphDefHelper(graph_def)
-  seen = collections.defaultdict(_NodeEntry)
+  name_to_node = parse_graph_nodes(graph_def)
+  node_info = collections.defaultdict(_node_info)
   for each in graph_def.node:
-    _get_computed_nodes(helper, each.name, seen)
-  return seen
+    _get_computed_nodes(name_to_node, each.name, node_info, input_node_name,
+                        input_node_size)
+  return node_info, name_to_node
diff --git a/tensorflow/contrib/receptive_field/python/util/graph_compute_order_test.py b/tensorflow/contrib/receptive_field/python/util/graph_compute_order_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..94c992ad2155d9c80cceee260a90c478a54371fb
--- /dev/null
+++ b/tensorflow/contrib/receptive_field/python/util/graph_compute_order_test.py
@@ -0,0 +1,152 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for graph_compute_order module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib import slim
+from tensorflow.contrib.receptive_field import receptive_field_api as receptive_field
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.platform import test
+
+
+def create_test_network():
+  """Convolutional neural network for test.
+
+  Returns:
+    g: Tensorflow graph object (Graph proto).
+  """
+  g = ops.Graph()
+  with g.as_default():
+    # An input test image with unknown spatial resolution.
+    x = array_ops.placeholder(
+        dtypes.float32, (None, None, None, 1), name='input_image')
+    # Left branch before first addition.
+    l1 = slim.conv2d(x, 1, [1, 1], stride=4, scope='L1', padding='VALID')
+    # Right branch before first addition.
+    l2_pad = array_ops.pad(x, [[0, 0], [1, 0], [1, 0], [0, 0]], name='L2_pad')
+    l2 = slim.conv2d(l2_pad, 1, [3, 3], stride=2, scope='L2', padding='VALID')
+    l3 = slim.max_pool2d(l2, [3, 3], stride=2, scope='L3', padding='SAME')
+    # First addition.
+    l4 = nn.relu(l1 + l3, name='L4_relu')
+    # Left branch after first addition.
+    l5 = slim.conv2d(l4, 1, [1, 1], stride=2, scope='L5', padding='SAME')
+    # Right branch after first addition.
+    l6 = slim.conv2d(l4, 1, [3, 3], stride=2, scope='L6', padding='SAME')
+    # Final addition.
+    gen_math_ops.add(l5, l6, name='L7_add')
+
+  return g
+
+
+class GraphComputeOrderTest(test.TestCase):
+
+  def check_topological_sort_and_sizes(self,
+                                       node_info,
+                                       expected_input_sizes=None,
+                                       expected_output_sizes=None):
+    """Helper function to check topological sorting and sizes are correct.
+
+    The arguments expected_input_sizes and expected_output_sizes are used to
+    check that the sizes are correct, if they are given.
+
+    Args:
+      node_info: Default dict keyed by node name, mapping to a named tuple with
+        the following keys: {order, node, input_size, output_size}.
+      expected_input_sizes: Dict mapping node names to expected input sizes
+        (optional).
+      expected_output_sizes: Dict mapping node names to expected output sizes
+        (optional).
+    """
+    # Loop over nodes in sorted order, collecting those that were already seen.
+    # These will be used to make sure that the graph is topologically sorted.
+    # At the same time, we construct dicts from node name to input/output size,
+    # which will be used to check those.
+    already_seen_nodes = []
+    input_sizes = {}
+    output_sizes = {}
+    for _, (_, node, input_size, output_size) in sorted(
+        node_info.items(), key=lambda x: x[1].order):
+      for inp_name in node.input:
+        # Since the graph is topologically sorted, the inputs to the current
+        # node must have been seen beforehand.
+        self.assertIn(inp_name, already_seen_nodes)
+      input_sizes[node.name] = input_size
+      output_sizes[node.name] = output_size
+      already_seen_nodes.append(node.name)
+
+    # Check input sizes, if desired.
+    if expected_input_sizes is not None:
+      for k, v in expected_input_sizes.items():
+        self.assertIn(k, input_sizes)
+        self.assertEqual(input_sizes[k], v)
+
+    # Check output sizes, if desired.
+    if expected_output_sizes is not None:
+      for k, v in expected_output_sizes.items():
+        self.assertIn(k, output_sizes)
+        self.assertEqual(output_sizes[k], v)
+
+  def testGraphOrderIsCorrect(self):
+    """Tests that the order and sizes of create_test_network() are correct."""
+
+    graph_def = create_test_network().as_graph_def()
+
+    # Case 1: Input node name/size are not given.
+    node_info, _ = receptive_field.get_compute_order(graph_def)
+    self.check_topological_sort_and_sizes(node_info)
+
+    # Case 2: Input node name is given, but not size.
+    node_info, _ = receptive_field.get_compute_order(
+        graph_def, input_node_name='input_image')
+    self.check_topological_sort_and_sizes(node_info)
+
+    # Case 3: Input node name and size (224) are given.
+    node_info, _ = receptive_field.get_compute_order(
+        graph_def, input_node_name='input_image', input_node_size=[224, 224])
+    expected_input_sizes = {
+        'input_image': None,
+        'L1/Conv2D': [224, 224],
+        'L2_pad': [224, 224],
+        'L2/Conv2D': [225, 225],
+        'L3/MaxPool': [112, 112],
+        'L4_relu': [56, 56],
+        'L5/Conv2D': [56, 56],
+        'L6/Conv2D': [56, 56],
+        'L7_add': [28, 28],
+    }
+    expected_output_sizes = {
+        'input_image': [224, 224],
+        'L1/Conv2D': [56, 56],
+        'L2_pad': [225, 225],
+        'L2/Conv2D': [112, 112],
+        'L3/MaxPool': [56, 56],
+        'L4_relu': [56, 56],
+        'L5/Conv2D': [28, 28],
+        'L6/Conv2D': [28, 28],
+        'L7_add': [28, 28],
+    }
+    self.check_topological_sort_and_sizes(node_info, expected_input_sizes,
+                                          expected_output_sizes)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py b/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc383a803496380aaba4d0248d2b7f93253b2b50
--- /dev/null
+++ b/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py
@@ -0,0 +1,326 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functions to parse RF-related parameters from TF layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+from tensorflow.contrib.util import make_ndarray
+from tensorflow.python.platform import tf_logging as logging
+
+# White-listed layer operations, which do not affect the receptive field
+# computation.
+_UNCHANGED_RF_LAYER_OPS = [
+    "Add", "BiasAdd", "Cast", "Ceil", "ConcatV2", "Const", "Floor",
+    "FusedBatchNorm", "Identity", "Log", "Mul", "Pow", "RealDiv", "Relu",
+    "Relu6", "Round", "Rsqrt", "Softplus", "Sub", "VariableV2"
+]
+
+# Different ways in which padding modes may be spelled.
+_VALID_PADDING = ["VALID", b"VALID"]
+_SAME_PADDING = ["SAME", b"SAME"]
+
+
+def _stride_size(node, name_to_node):
+  """Computes stride size given a TF node.
+
+  Args:
+    node: Tensorflow node (NodeDef proto).
+    name_to_node: For MaxPoolV2, mapping from variable name Tensorflow node.
+
+  Returns:
+    stride_x: Stride size for horizontal direction (integer).
+    stride_y: Stride size for vertical direction (integer).
+
+  Raises:
+    ValueError: If stride input cannot be found in `name_to_node`.
+  """
+  if node.op == "MaxPoolV2":
+    strides_input_name = node.input[2]
+    if not strides_input_name.endswith("/strides"):
+      raise ValueError("Strides name does not end with '/strides'")
+    strides_node = name_to_node[strides_input_name]
+    value = strides_node.attr["value"]
+    t = make_ndarray(value.tensor)
+    stride_y = t[1]
+    stride_x = t[2]
+  else:
+    strides_attr = node.attr["strides"]
+    logging.vlog(4, "strides_attr = %s", strides_attr)
+    stride_y = strides_attr.list.i[1]
+    stride_x = strides_attr.list.i[2]
+  return stride_x, stride_y
+
+
+def _conv_kernel_size(node, name_to_node):
+  """Computes kernel size given a TF convolution or pooling node.
+
+  Args:
+    node: Tensorflow node (NodeDef proto).
+    name_to_node: Dict keyed by node name, each entry containing the node's
+      NodeDef.
+
+  Returns:
+    kernel_size_x: Kernel size for horizontal direction (integer).
+    kernel_size_y: Kernel size for vertical direction (integer).
+
+  Raises:
+    ValueError: If the weight layer node is invalid.
+  """
+  weights_layer_read_name = node.input[1]
+  if not weights_layer_read_name.endswith("/read"):
+    raise ValueError(
+        "Weight layer's name input to conv layer does not end with '/read'")
+  weights_layer_param_name = weights_layer_read_name[:-5]
+  weights_node = name_to_node[weights_layer_param_name]
+  if weights_node.op != "VariableV2":
+    raise ValueError("Weight layer is not of type VariableV2")
+  shape = weights_node.attr["shape"]
+  logging.vlog(4, "weight shape = %s", shape)
+  kernel_size_y = shape.shape.dim[0].size
+  kernel_size_x = shape.shape.dim[1].size
+  return kernel_size_x, kernel_size_y
+
+
+def _padding_size_conv_pool(node, kernel_size, stride, input_resolution=None):
+  """Computes padding size given a TF convolution or pooling node.
+
+  Args:
+    node: Tensorflow node (NodeDef proto).
+    kernel_size: Kernel size of node (integer).
+    stride: Stride size of node (integer).
+    input_resolution: Input resolution to assume, if not None (integer).
+
+  Returns:
+    total_padding: Total padding size (integer).
+    padding: Padding size, applied to the left or top (integer).
+
+  Raises:
+    ValueError: If padding is invalid.
+  """
+  # In this case, we need to carefully consider the different TF padding modes.
+  # The padding depends on kernel size, and may depend on input size. If it
+  # depends on input size and input_resolution is None, we raise an exception.
+  padding_attr = node.attr["padding"]
+  logging.vlog(4, "padding_attr = %s", padding_attr)
+  if padding_attr.s in _VALID_PADDING:
+    total_padding = 0
+    padding = 0
+  elif padding_attr.s in _SAME_PADDING:
+    if input_resolution is None:
+      # In this case, we do not know the input resolution, so we can only know
+      # the padding in some special cases.
+      if kernel_size == 1:
+        total_padding = 0
+        padding = 0
+      elif stride == 1:
+        total_padding = kernel_size - 1
+        padding = int(math.floor(float(total_padding) / 2))
+      elif stride == 2 and kernel_size % 2 == 0:
+        # In this case, we can be sure of the left/top padding, but not of the
+        # total padding.
+        total_padding = None
+        padding = int(math.floor((float(kernel_size) - 1) / 2))
+      else:
+        total_padding = None
+        padding = None
+        logging.warning(
+            "Padding depends on input size, which means that the effective "
+            "padding may be different depending on the input image "
+            "dimensionality. In this case, alignment check will be skipped. If"
+            " you know the input resolution, please set it.")
+    else:
+      # First, compute total_padding based on documentation.
+      if input_resolution % stride == 0:
+        total_padding = int(max(float(kernel_size - stride), 0.0))
+      else:
+        total_padding = int(
+            max(float(kernel_size - (input_resolution % stride)), 0.0))
+      # Then, compute left/top padding.
+      padding = int(math.floor(float(total_padding) / 2))
+
+  else:
+    raise ValueError("Invalid padding operation %s" % padding_attr.s)
+  return total_padding, padding
+
+
+def _pool_kernel_size(node, name_to_node):
+  """Computes kernel size given a TF pooling node.
+
+  Args:
+    node: Tensorflow node (NodeDef proto).
+    name_to_node: For MaxPoolV2, mapping from node name to NodeDef.
+
+  Returns:
+    kernel_size_x: Kernel size for horizontal direction (integer).
+    kernel_size_y: Kernel size for vertical direction (integer).
+
+  Raises:
+    ValueError: If pooling is invalid.
+  """
+  if node.op == "MaxPoolV2":
+    ksize_input_name = node.input[1]
+    if not ksize_input_name.endswith("/ksize"):
+      raise ValueError("Kernel size name does not end with '/ksize'")
+    ksize_node = name_to_node[ksize_input_name]
+    value = ksize_node.attr["value"]
+    t = make_ndarray(value.tensor)
+    kernel_size_y = t[1]
+    kernel_size_x = t[2]
+    if t[0] != 1:
+      raise ValueError("pool ksize for first dim is not 1")
+    if t[3] != 1:
+      raise ValueError("pool ksize for last dim is not 1")
+  else:
+    ksize = node.attr["ksize"]
+    kernel_size_y = ksize.list.i[1]
+    kernel_size_x = ksize.list.i[2]
+    if ksize.list.i[0] != 1:
+      raise ValueError("pool ksize for first dim is not 1")
+    if ksize.list.i[3] != 1:
+      raise ValueError("pool ksize for last dim is not 1")
+  return kernel_size_x, kernel_size_y
+
+
+def _padding_size_pad_layer(node, name_to_node):
+  """Computes padding size given a TF padding node.
+
+  Args:
+    node: Tensorflow node (NodeDef proto).
+    name_to_node: Dict keyed by node name, each entry containing the node's
+      NodeDef.
+
+  Returns:
+    total_padding_x: Total padding size for horizontal direction (integer).
+    padding_x: Padding size for horizontal direction, left side (integer).
+    total_padding_y: Total padding size for vertical direction (integer).
+    padding_y: Padding size for vertical direction, top side (integer).
+
+  Raises:
+    ValueError: If padding layer is invalid.
+  """
+  paddings_layer_name = node.input[1]
+  if not paddings_layer_name.endswith("/paddings"):
+    raise ValueError("Padding layer name does not end with '/paddings'")
+  paddings_node = name_to_node[paddings_layer_name]
+  if paddings_node.op != "Const":
+    raise ValueError("Padding op is not Const")
+  value = paddings_node.attr["value"]
+  t = make_ndarray(value.tensor)
+  padding_y = t[1][0]
+  padding_x = t[2][0]
+  total_padding_y = padding_y + t[1][1]
+  total_padding_x = padding_x + t[2][1]
+  if (t[0][0] != 0) or (t[0][1] != 0):
+    raise ValueError("padding is not zero for first tensor dim")
+  if (t[3][0] != 0) or (t[3][1] != 0):
+    raise ValueError("padding is not zero for last tensor dim")
+  return total_padding_x, padding_x, total_padding_y, padding_y
+
+
+def get_layer_params(node, name_to_node, input_resolution=None, force=False):
+  """Gets layer parameters relevant for RF computation.
+
+  Currently, only these nodes are supported:
+  - Conv2D
+  - DepthwiseConv2dNative
+  - Pad
+  - MaxPool
+  - AvgPool
+  - all nodes listed in _UNCHANGED_RF_LAYER_OPS
+
+  Args:
+    node: Tensorflow node (NodeDef proto).
+    name_to_node: Dict keyed by node name, each entry containing the node's
+      NodeDef.
+    input_resolution: List with 2 dimensions, denoting the height/width of the
+      input feature map to this layer. If set to None, then the padding may be
+      undefined (in tensorflow, SAME padding depends on input spatial
+      resolution).
+    force: If True, the function does not raise a ValueError if the layer op is
+      unknown. Instead, in this case it sets each of the returned parameters to
+      None.
+
+  Returns:
+    kernel_size_x: Kernel size for horizontal direction (integer).
+    kernel_size_y: Kernel size for vertical direction (integer).
+    stride_x: Stride size for horizontal direction (integer).
+    stride_y: Stride size for vertical direction (integer).
+    padding_x: Padding size for horizontal direction, left side (integer).
+    padding_y: Padding size for vertical direction, top side (integer).
+    total_padding_x: Total padding size for horizontal direction (integer).
+    total_padding_y: Total padding size for vertical direction (integer).
+
+  Raises:
+    ValueError: If layer op is unknown and force is False.
+  """
+  logging.vlog(3, "node.name = %s", node.name)
+  logging.vlog(3, "node.op = %s", node.op)
+  logging.vlog(4, "node = %s", node)
+  if node.op == "Conv2D" or node.op == "DepthwiseConv2dNative":
+    stride_x, stride_y = _stride_size(node, name_to_node)
+    kernel_size_x, kernel_size_y = _conv_kernel_size(node, name_to_node)
+    # Compute the padding for this node separately for each direction.
+    total_padding_x, padding_x = _padding_size_conv_pool(
+        node, kernel_size_x, stride_x, input_resolution[1]
+        if input_resolution is not None else None)
+    total_padding_y, padding_y = _padding_size_conv_pool(
+        node, kernel_size_y, stride_y, input_resolution[0]
+        if input_resolution is not None else None)
+  elif node.op == "Pad":
+    # Kernel and stride are simply 1 in this case.
+    kernel_size_x = 1
+    kernel_size_y = 1
+    stride_x = 1
+    stride_y = 1
+    total_padding_x, padding_x, total_padding_y, padding_y = (
+        _padding_size_pad_layer(node, name_to_node))
+  elif node.op == "MaxPool" or node.op == "MaxPoolV2" or node.op == "AvgPool":
+    stride_x, stride_y = _stride_size(node, name_to_node)
+    kernel_size_x, kernel_size_y = _pool_kernel_size(node, name_to_node)
+    # Compute the padding for this node separately for each direction.
+    total_padding_x, padding_x = _padding_size_conv_pool(
+        node, kernel_size_x, stride_x, input_resolution[1]
+        if input_resolution is not None else None)
+    total_padding_y, padding_y = _padding_size_conv_pool(
+        node, kernel_size_y, stride_y, input_resolution[0]
+        if input_resolution is not None else None)
+  elif node.op in _UNCHANGED_RF_LAYER_OPS:
+    # These nodes do not modify the RF parameters.
+    kernel_size_x = 1
+    kernel_size_y = 1
+    stride_x = 1
+    stride_y = 1
+    total_padding_x = 0
+    padding_x = 0
+    total_padding_y = 0
+    padding_y = 0
+  else:
+    if force:
+      kernel_size_x = None
+      kernel_size_y = None
+      stride_x = None
+      stride_y = None
+      total_padding_x = None
+      padding_x = None
+      total_padding_y = None
+      padding_y = None
+    else:
+      raise ValueError("Unknown layer for operation '%s': %s" % (node.name,
+                                                                 node.op))
+  return (kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x,
+          padding_y, total_padding_x, total_padding_y)
diff --git a/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters_test.py b/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..369758a28402f7c9e55cca9c6f9ffa9182c91140
--- /dev/null
+++ b/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters_test.py
@@ -0,0 +1,149 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for parse_layer_parameters module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib import slim
+from tensorflow.contrib.receptive_field.python.util import graph_compute_order
+from tensorflow.contrib.receptive_field.python.util import parse_layer_parameters
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.platform import test
+
+
+def create_test_network():
+  """Convolutional neural network for test.
+
+  Returns:
+    name_to_node: Dict keyed by node name, each entry containing the node's
+      NodeDef.
+  """
+  g = ops.Graph()
+  with g.as_default():
+    # An input test image with unknown spatial resolution.
+    x = array_ops.placeholder(
+        dtypes.float32, (None, None, None, 1), name='input_image')
+    # Left branch before first addition.
+    l1 = slim.conv2d(x, 1, [1, 1], stride=4, scope='L1', padding='VALID')
+    # Right branch before first addition.
+    l2_pad = array_ops.pad(x, [[0, 0], [1, 0], [1, 0], [0, 0]], name='L2_pad')
+    l2 = slim.conv2d(l2_pad, 1, [3, 3], stride=2, scope='L2', padding='VALID')
+    l3 = slim.max_pool2d(l2, [3, 3], stride=2, scope='L3', padding='SAME')
+    # First addition.
+    l4 = nn.relu(l1 + l3, name='L4_relu')
+    # Left branch after first addition.
+    l5 = slim.conv2d(l4, 1, [1, 1], stride=2, scope='L5', padding='SAME')
+    # Right branch after first addition.
+    l6 = slim.conv2d(l4, 1, [3, 3], stride=2, scope='L6', padding='SAME')
+    # Final addition.
+    gen_math_ops.add(l5, l6, name='L7_add')
+
+  name_to_node = graph_compute_order.parse_graph_nodes(g.as_graph_def())
+  return name_to_node
+
+
+class ParseLayerParametersTest(test.TestCase):
+
+  def testParametersAreParsedCorrectly(self):
+    """Checks parameters from create_test_network() are parsed correctly."""
+    name_to_node = create_test_network()
+
+    # L1.
+    l1_node_name = 'L1/Conv2D'
+    l1_params = parse_layer_parameters.get_layer_params(
+        name_to_node[l1_node_name], name_to_node)
+    expected_l1_params = (1, 1, 4, 4, 0, 0, 0, 0)
+    self.assertEqual(l1_params, expected_l1_params)
+
+    # L2 padding.
+    l2_pad_name = 'L2_pad'
+    l2_pad_params = parse_layer_parameters.get_layer_params(
+        name_to_node[l2_pad_name], name_to_node)
+    expected_l2_pad_params = (1, 1, 1, 1, 1, 1, 1, 1)
+    self.assertEqual(l2_pad_params, expected_l2_pad_params)
+
+    # L2.
+    l2_node_name = 'L2/Conv2D'
+    l2_params = parse_layer_parameters.get_layer_params(
+        name_to_node[l2_node_name], name_to_node)
+    expected_l2_params = (3, 3, 2, 2, 0, 0, 0, 0)
+    self.assertEqual(l2_params, expected_l2_params)
+
+    # L3.
+    l3_node_name = 'L3/MaxPool'
+    # - Without knowing input size.
+    l3_params = parse_layer_parameters.get_layer_params(
+        name_to_node[l3_node_name], name_to_node)
+    expected_l3_params = (3, 3, 2, 2, None, None, None, None)
+    self.assertEqual(l3_params, expected_l3_params)
+    # - Input size is even.
+    l3_even_params = parse_layer_parameters.get_layer_params(
+        name_to_node[l3_node_name], name_to_node, input_resolution=[4, 4])
+    expected_l3_even_params = (3, 3, 2, 2, 0, 0, 1, 1)
+    self.assertEqual(l3_even_params, expected_l3_even_params)
+    # - Input size is odd.
+    l3_odd_params = parse_layer_parameters.get_layer_params(
+        name_to_node[l3_node_name], name_to_node, input_resolution=[5, 5])
+    expected_l3_odd_params = (3, 3, 2, 2, 1, 1, 2, 2)
+    self.assertEqual(l3_odd_params, expected_l3_odd_params)
+
+    # L4.
+    l4_node_name = 'L4_relu'
+    l4_params = parse_layer_parameters.get_layer_params(
+        name_to_node[l4_node_name], name_to_node)
+    expected_l4_params = (1, 1, 1, 1, 0, 0, 0, 0)
+    self.assertEqual(l4_params, expected_l4_params)
+
+    # L5.
+    l5_node_name = 'L5/Conv2D'
+    l5_params = parse_layer_parameters.get_layer_params(
+        name_to_node[l5_node_name], name_to_node)
+    expected_l5_params = (1, 1, 2, 2, 0, 0, 0, 0)
+    self.assertEqual(l5_params, expected_l5_params)
+
+    # L6.
+    l6_node_name = 'L6/Conv2D'
+    # - Without knowing input size.
+    l6_params = parse_layer_parameters.get_layer_params(
+        name_to_node[l6_node_name], name_to_node)
+    expected_l6_params = (3, 3, 2, 2, None, None, None, None)
+    self.assertEqual(l6_params, expected_l6_params)
+    # - Input size is even.
+    l6_even_params = parse_layer_parameters.get_layer_params(
+        name_to_node[l6_node_name], name_to_node, input_resolution=[4, 4])
+    expected_l6_even_params = (3, 3, 2, 2, 0, 0, 1, 1)
+    self.assertEqual(l6_even_params, expected_l6_even_params)
+    # - Input size is odd.
+    l6_odd_params = parse_layer_parameters.get_layer_params(
+        name_to_node[l6_node_name], name_to_node, input_resolution=[5, 5])
+    expected_l6_odd_params = (3, 3, 2, 2, 1, 1, 2, 2)
+    self.assertEqual(l6_odd_params, expected_l6_odd_params)
+
+    # L7.
+    l7_node_name = 'L7_add'
+    l7_params = parse_layer_parameters.get_layer_params(
+        name_to_node[l7_node_name], name_to_node)
+    expected_l7_params = (1, 1, 1, 1, 0, 0, 0, 0)
+    self.assertEqual(l7_params, expected_l7_params)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/receptive_field/python/util/receptive_field.py b/tensorflow/contrib/receptive_field/python/util/receptive_field.py
index 8b34465d21d14508c24056b588f2533d8fea6a1d..b9bd2f09761ab10a62d37e8e2580b93b9b8a4453 100644
--- a/tensorflow/contrib/receptive_field/python/util/receptive_field.py
+++ b/tensorflow/contrib/receptive_field/python/util/receptive_field.py
@@ -23,242 +23,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
+import numpy as np
 from tensorflow.contrib.receptive_field.python.util import graph_compute_order
-from tensorflow.contrib.util import make_ndarray
-from tensorflow.python.platform import tf_logging as logging
+from tensorflow.contrib.receptive_field.python.util import parse_layer_parameters
 from tensorflow.python.framework import ops as framework_ops
-import numpy as np
-
-# White-listed layer operations, which do not affect the receptive field
-# computation.
-_UNCHANGED_RF_LAYER_OPS = [
-  'Add', 'BiasAdd', 'Ceil', 'ConcatV2', 'Const', 'Floor', 'Identity', 'Log',
-  'Mul', 'Pow', 'RealDiv', 'Relu', 'Round', 'Rsqrt', 'Softplus', 'Sub',
-  'VariableV2']
-
-# Different ways in which padding modes may be spelled.
-_VALID_PADDING = ["VALID", b"VALID"]
-_SAME_PADDING = ["SAME", b"SAME"]
-
-
-def _stride_size(node):
-  """Computes stride size given a TF node.
-
-  Args:
-    node: Tensorflow node (NodeDef proto).
-
-  Returns:
-    stride_x: Stride size for horizontal direction (integer).
-    stride_y: Stride size for vertical direction (integer).
-  """
-  strides_attr = node.attr["strides"]
-  logging.vlog(4, "strides_attr = %s", strides_attr)
-  stride_y = strides_attr.list.i[1]
-  stride_x = strides_attr.list.i[2]
-  return stride_x, stride_y
-
-
-def _conv_kernel_size(node, name_to_order_node):
-  """Computes kernel size given a TF convolution or pooling node.
-
-  Args:
-    node: Tensorflow node (NodeDef proto).
-    name_to_order_node: Map from name to {order, node}. Output of
-      graph_compute_order.get_compute_order().
-
-  Returns:
-    kernel_size_x: Kernel size for horizontal direction (integer).
-    kernel_size_y: Kernel size for vertical direction (integer).
-
-  Raises:
-    ValueError: If the weight layer node is invalid.
-  """
-  weights_layer_read_name = node.input[1]
-  if not weights_layer_read_name.endswith("/read"):
-    raise ValueError(
-        "Weight layer's name input to conv layer does not end with '/read'")
-  weights_layer_param_name = weights_layer_read_name[:-5]
-  weights_node = name_to_order_node[weights_layer_param_name].node
-  if weights_node.op != "VariableV2":
-    raise ValueError("Weight layer is not of type VariableV2")
-  shape = weights_node.attr["shape"]
-  logging.vlog(4, "weight shape = %s", shape)
-  kernel_size_y = shape.shape.dim[0].size
-  kernel_size_x = shape.shape.dim[1].size
-  return kernel_size_x, kernel_size_y
-
-
-def _padding_size_conv_pool(node, kernel_size, stride):
-  """Computes padding size given a TF convolution or pooling node.
-
-  Args:
-    node: Tensorflow node (NodeDef proto).
-    kernel_size: Kernel size of node (integer).
-    stride: Stride size of node (integer).
-
-  Returns:
-    padding: Padding size (integer).
-
-  Raises:
-    ValueError: If padding is invalid.
-  """
-  # In this case, we need to carefully consider the different TF padding modes.
-  # The padding depends on kernel size, and may depend on input size. If it
-  # depends on input size, we raise an exception.
-  padding_attr = node.attr["padding"]
-  logging.vlog(4, "padding_attr = %s", padding_attr)
-  if padding_attr.s in _VALID_PADDING:
-    padding = 0
-  elif padding_attr.s in _SAME_PADDING:
-    if kernel_size == 1:
-      padding = 0
-    elif stride == 1:
-      padding = int(math.floor((float(kernel_size) - 1) / 2))
-    elif stride == 2 and kernel_size % 2 == 0:
-      padding = int(math.floor((float(kernel_size) - 1) / 2))
-    else:
-      padding = None
-      logging.warning(
-          "Padding depends on input size, which means that the effective "
-          "padding may be different depending on the input image "
-          "dimensionality. In this case, alignment check will be skipped.")
-  else:
-    raise ValueError("Invalid padding operation %s" % padding_attr.s)
-  return padding
-
-
-def _pool_kernel_size(node):
-  """Computes kernel size given a TF pooling node.
-
-  Args:
-    node: Tensorflow node (NodeDef proto).
-
-  Returns:
-    kernel_size_x: Kernel size for horizontal direction (integer).
-    kernel_size_y: Kernel size for vertical direction (integer).
-
-  Raises:
-    ValueError: If pooling is invalid.
-  """
-  ksize = node.attr["ksize"]
-  kernel_size_y = ksize.list.i[1]
-  kernel_size_x = ksize.list.i[2]
-  if ksize.list.i[0] != 1:
-    raise ValueError("pool ksize for first dim is not 1")
-  if ksize.list.i[3] != 1:
-    raise ValueError("pool ksize for last dim is not 1")
-  return kernel_size_x, kernel_size_y
-
-
-def _padding_size_pad_layer(node, name_to_order_node):
-  """Computes padding size given a TF padding node.
-
-  Args:
-    node: Tensorflow node (NodeDef proto).
-    name_to_order_node: Map from name to {order, node}. Output of
-      graph_compute_order.get_compute_order().
-
-  Returns:
-    padding_x: Padding size for horizontal direction (integer).
-    padding_y: Padding size for vertical direction (integer).
-
-  Raises:
-    ValueError: If padding layer is invalid.
-  """
-  paddings_layer_name = node.input[1]
-  if not paddings_layer_name.endswith("/paddings"):
-    raise ValueError("Padding layer name does not end with '/paddings'")
-  paddings_node = name_to_order_node[paddings_layer_name].node
-  if paddings_node.op != "Const":
-    raise ValueError("Padding op is not Const")
-  value = paddings_node.attr["value"]
-  t = make_ndarray(value.tensor)
-  padding_y = t[1][0]
-  padding_x = t[2][0]
-  if t[0][0] != 0:
-    raise ValueError("padding is not zero for first tensor dim")
-  if t[3][0] != 0:
-    raise ValueError("padding is not zero for last tensor dim")
-  return padding_x, padding_y
-
-
-def _get_layer_params(node, name_to_order_node):
-  """Gets layer parameters relevant for RF computation.
-
-  Currently, only these nodes are supported:
-  - Conv2D
-  - DepthwiseConv2dNative
-  - Pad
-  - MaxPool
-  - AvgPool
-  - all nodes listed in _UNCHANGED_RF_LAYER_OPS
-
-  Args:
-    node: Tensorflow node (NodeDef proto).
-    name_to_order_node: Map from name to {order, node}. Output of
-      graph_compute_order.get_compute_order().
-
-  Returns:
-    kernel_size_x: Kernel size for horizontal direction (integer).
-    kernel_size_y: Kernel size for vertical direction (integer).
-    stride_x: Stride size for horizontal direction (integer).
-    stride_y: Stride size for vertical direction (integer).
-    padding_x: Padding size for horizontal direction (integer).
-    padding_y: Padding size for vertical direction (integer).
-
-  Raises:
-    ValueError: If layer op is unknown.
-  """
-  logging.vlog(3, "node.op = %s", node.op)
-  logging.vlog(4, "node = %s", node)
-  if node.op == "Conv2D" or node.op == "DepthwiseConv2dNative":
-    stride_x, stride_y = _stride_size(node)
-    kernel_size_x, kernel_size_y = _conv_kernel_size(node, name_to_order_node)
-    # Compute the padding for this node separately for each direction.
-    padding_x = _padding_size_conv_pool(node, kernel_size_x, stride_x)
-    padding_y = _padding_size_conv_pool(node, kernel_size_y, stride_y)
-  elif node.op == "Pad":
-    # Kernel and stride are simply 1 in this case.
-    kernel_size_x = 1
-    kernel_size_y = 1
-    stride_x = 1
-    stride_y = 1
-    padding_x, padding_y = _padding_size_pad_layer(node, name_to_order_node)
-  elif node.op == "MaxPool" or node.op == "AvgPool":
-    stride_x, stride_y = _stride_size(node)
-    kernel_size_x, kernel_size_y = _pool_kernel_size(node)
-    # Compute the padding for this node separately for each direction.
-    padding_x = _padding_size_conv_pool(node, kernel_size_x, stride_x)
-    padding_y = _padding_size_conv_pool(node, kernel_size_y, stride_y)
-  elif node.op in _UNCHANGED_RF_LAYER_OPS:
-    # These nodes do not modify the RF parameters.
-    kernel_size_x = 1
-    kernel_size_y = 1
-    stride_x = 1
-    stride_y = 1
-    padding_x = 0
-    padding_y = 0
-  else:
-    raise ValueError("Unknown layer for operation '%s': %s" %
-                     (node.name, node.op))
-  return kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x, padding_y
-
-
-def _reverse_sort_by_order(name_to_order_node):
-  """Sorts map of name_to_order_node nodes in reverse order.
-
-  The output is such that the nodes in name_to_order_node are sorted in
-  descending order of the "order" field.
-
-  Args:
-    name_to_order_node: Map from name to {order, node}. Output of
-      graph_compute_order.get_compute_order().
-
-  Returns:
-    sorted_name_to_order_node: Sorted version of the input, in descending order.
-  """
-  return sorted(name_to_order_node.items(), key=lambda x: -x[1].order)
+from tensorflow.python.platform import tf_logging as logging
 
 
 def _get_rf_size_node_input(stride, kernel_size, rf_size_output):
@@ -307,23 +76,22 @@ def _get_effective_padding_node_input(stride, padding,
   return stride * effective_padding_output + padding
 
 
-class ReceptiveField:
-  """
-  Receptive field of a convolutional neural network.
+class ReceptiveField(object):
+  """Receptive field of a convolutional neural network.
 
   Args:
     size: Receptive field size.
     stride: Effective stride.
     padding: Effective padding.
   """
+
   def __init__(self, size, stride, padding):
     self.size = np.asarray(size)
     self.stride = np.asarray(stride)
     self.padding = np.asarray(padding)
 
   def compute_input_center_coordinates(self, y, axis=None):
-    """
-    Computes the center of the receptive field that generated a feature.
+    """Computes the center of the receptive field that generated a feature.
 
     Args:
       y: An array of feature coordinates with shape `(..., d)`, where `d` is the
@@ -350,12 +118,11 @@ class ReceptiveField:
       raise ValueError("Dimensionality of the feature coordinates `y` (%d) "
                        "does not match dimensionality of `axis` (%d)" %
                        (y.shape[-1], len(axis)))
-    return - self.padding[axis] + y * self.stride[axis] + \
-      (self.size[axis] - 1) / 2
+    return -self.padding[axis] + y * self.stride[axis] + (
+        self.size[axis] - 1) / 2
 
   def compute_feature_coordinates(self, x, axis=None):
-    """
-    Computes the position of a feature given the center of a receptive field.
+    """Computes the position of a feature given the center of a receptive field.
 
     Args:
       x: An array of input center coordinates with shape `(..., d)`, where `d`
@@ -381,15 +148,18 @@ class ReceptiveField:
       raise ValueError("Dimensionality of the input center coordinates `x` "
                        "(%d) does not match dimensionality of `axis` (%d)" %
                        (x.shape[-1], len(axis)))
-    return (x + self.padding[axis] + (1 - self.size[axis]) / 2) / \
-      self.stride[axis]
+    return (x + self.padding[axis] +
+            (1 - self.size[axis]) / 2) / self.stride[axis]
 
   def __iter__(self):
     return iter(np.concatenate([self.size, self.stride, self.padding]))
 
 
-def compute_receptive_field_from_graph_def(graph_def, input_node, output_node,
-                                           stop_propagation=None):
+def compute_receptive_field_from_graph_def(graph_def,
+                                           input_node,
+                                           output_node,
+                                           stop_propagation=None,
+                                           input_resolution=None):
   """Computes receptive field (RF) parameters from a Graph or GraphDef object.
 
   The algorithm stops the calculation of the receptive field whenever it
@@ -402,8 +172,14 @@ def compute_receptive_field_from_graph_def(graph_def, input_node, output_node,
     graph_def: Graph or GraphDef object.
     input_node: Name of the input node or Tensor object from graph.
     output_node: Name of the output node or Tensor object from graph.
-    stop_propagation: List of operation or scope names for which to stop the
+    stop_propagation: List of operations or scope names for which to stop the
       propagation of the receptive field.
+    input_resolution: 2D list. If the input resolution to the model is fixed and
+      known, this may be set. This is helpful for cases where the RF parameters
+      vary depending on the input resolution (this happens since SAME padding in
+      tensorflow depends on input resolution in general). If this is None, it is
+      assumed that the input resolution is unknown, so some RF parameters may be
+      unknown (depending on the model architecture).
 
   Returns:
     rf_size_x: Receptive field size of network in the horizontal direction, with
@@ -437,11 +213,13 @@ def compute_receptive_field_from_graph_def(graph_def, input_node, output_node,
   stop_propagation = stop_propagation or []
 
   # Computes order of computation for a given graph.
-  name_to_order_node = graph_compute_order.get_compute_order(
-      graph_def=graph_def)
+  node_info, name_to_node = graph_compute_order.get_compute_order(
+      graph_def=graph_def,
+      input_node_name=input_node,
+      input_node_size=input_resolution)
 
   # Sort in reverse topological order.
-  order = _reverse_sort_by_order(name_to_order_node)
+  ordered_node_info = sorted(node_info.items(), key=lambda x: -x[1].order)
 
   # Dictionaries to keep track of receptive field, effective stride and
   # effective padding of different nodes.
@@ -470,7 +248,7 @@ def compute_receptive_field_from_graph_def(graph_def, input_node, output_node,
   # alignment checks are skipped, and the effective padding is None.
   undefined_padding = False
 
-  for _, (o, node) in order:
+  for _, (o, node, _, _) in ordered_node_info:
     if node:
       logging.vlog(3, "%10d %-100s %-20s" % (o, node.name[:90], node.op))
     else:
@@ -496,13 +274,14 @@ def compute_receptive_field_from_graph_def(graph_def, input_node, output_node,
         continue
 
       # Get params for this layer.
-      kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x, padding_y = (
-          _get_layer_params(node, name_to_order_node))
+      (kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x,
+       padding_y, _, _) = parse_layer_parameters.get_layer_params(
+           node, name_to_node, node_info[node.name].input_size)
       logging.vlog(3, "kernel_size_x = %s, kernel_size_y = %s, "
                    "stride_x = %s, stride_y = %s, "
-                   "padding_x = %s, padding_y = %s" %
+                   "padding_x = %s, padding_y = %s, input size = %s" %
                    (kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x,
-                    padding_y))
+                    padding_y, node_info[node.name].input_size))
       if padding_x is None or padding_y is None:
         undefined_padding = True
 
@@ -524,72 +303,93 @@ def compute_receptive_field_from_graph_def(graph_def, input_node, output_node,
       else:
         effective_padding_input_x = None
         effective_padding_input_y = None
+      logging.vlog(
+          4, "rf_size_input_x = %s, rf_size_input_y = %s, "
+          "effective_stride_input_x = %s, effective_stride_input_y = %s, "
+          "effective_padding_input_x = %s, effective_padding_input_y = %s" %
+          (rf_size_input_x, rf_size_input_y, effective_stride_input_x,
+           effective_stride_input_y, effective_padding_input_x,
+           effective_padding_input_y))
 
       # Loop over this node's inputs and potentially propagate information down.
       for inp_name in node.input:
         # Stop the propagation of the receptive field.
         if any(inp_name.startswith(stop) for stop in stop_propagation):
-          logging.vlog(3, "Skipping explicitly ignored node %s.", node.name)
+          logging.vlog(3, "Skipping explicitly ignored node %s.", inp_name)
           continue
+
         logging.vlog(4, "inp_name = %s", inp_name)
-        inp_node = name_to_order_node[inp_name].node
+        if inp_name.startswith("^"):
+          # The character "^" denotes a control dependency, so this input node
+          # can be safely ignored.
+          continue
+
+        inp_node = name_to_node[inp_name]
         logging.vlog(4, "inp_node = \n%s", inp_node)
-        if inp_node.name in rf_sizes_x:
-          assert inp_node.name in rf_sizes_y, (
-              "Node %s is in rf_sizes_x, but "
-              "not in rf_sizes_y" % inp_node.name)
+        if inp_name in rf_sizes_x:
+          assert inp_name in rf_sizes_y, ("Node %s is in rf_sizes_x, but "
+                                          "not in rf_sizes_y" % inp_name)
+          logging.vlog(
+              4, "rf_sizes_x[inp_name] = %s,"
+              " rf_sizes_y[inp_name] = %s, "
+              "effective_strides_x[inp_name] = %s,"
+              " effective_strides_y[inp_name] = %s, "
+              "effective_paddings_x[inp_name] = %s,"
+              " effective_paddings_y[inp_name] = %s" %
+              (rf_sizes_x[inp_name], rf_sizes_y[inp_name],
+               effective_strides_x[inp_name], effective_strides_y[inp_name],
+               effective_paddings_x[inp_name], effective_paddings_y[inp_name]))
           # This node was already discovered through a previous path, so we need
           # to make sure that graph is aligned. This alignment check is skipped
           # if the padding is not defined, since in this case alignment cannot
           # be checked.
           if not undefined_padding:
-            if effective_strides_x[inp_node.name] != effective_stride_input_x:
+            if effective_strides_x[inp_name] != effective_stride_input_x:
               raise ValueError(
                   "Graph is not aligned since effective stride from different "
                   "paths is different in horizontal direction")
-            if effective_strides_y[inp_node.name] != effective_stride_input_y:
+            if effective_strides_y[inp_name] != effective_stride_input_y:
               raise ValueError(
                   "Graph is not aligned since effective stride from different "
                   "paths is different in vertical direction")
-            if (rf_sizes_x[inp_node.name] - 1
-               ) / 2 - effective_paddings_x[inp_node.name] != (
+            if (rf_sizes_x[inp_name] - 1
+               ) / 2 - effective_paddings_x[inp_name] != (
                    rf_size_input_x - 1) / 2 - effective_padding_input_x:
               raise ValueError(
                   "Graph is not aligned since center shift from different "
                   "paths is different in horizontal direction")
-            if (rf_sizes_y[inp_node.name] - 1
-               ) / 2 - effective_paddings_y[inp_node.name] != (
+            if (rf_sizes_y[inp_name] - 1
+               ) / 2 - effective_paddings_y[inp_name] != (
                    rf_size_input_y - 1) / 2 - effective_padding_input_y:
               raise ValueError(
                   "Graph is not aligned since center shift from different "
                   "paths is different in vertical direction")
           # Keep track of path with largest RF, for both directions.
-          if rf_sizes_x[inp_node.name] < rf_size_input_x:
-            rf_sizes_x[inp_node.name] = rf_size_input_x
-            effective_strides_x[inp_node.name] = effective_stride_input_x
-            effective_paddings_x[inp_node.name] = effective_padding_input_x
-          if rf_sizes_y[inp_node.name] < rf_size_input_y:
-            rf_sizes_y[inp_node.name] = rf_size_input_y
-            effective_strides_y[inp_node.name] = effective_stride_input_y
-            effective_paddings_y[inp_node.name] = effective_padding_input_y
+          if rf_sizes_x[inp_name] < rf_size_input_x:
+            rf_sizes_x[inp_name] = rf_size_input_x
+            effective_strides_x[inp_name] = effective_stride_input_x
+            effective_paddings_x[inp_name] = effective_padding_input_x
+          if rf_sizes_y[inp_name] < rf_size_input_y:
+            rf_sizes_y[inp_name] = rf_size_input_y
+            effective_strides_y[inp_name] = effective_stride_input_y
+            effective_paddings_y[inp_name] = effective_padding_input_y
         else:
-          assert inp_node.name not in rf_sizes_y, (
-              "Node %s is in rf_sizes_y, but "
-              "not in rf_sizes_x" % inp_node.name)
+          assert inp_name not in rf_sizes_y, ("Node %s is in rf_sizes_y, but "
+                                              "not in rf_sizes_x" % inp_name)
           # In this case, it is the first time we encounter this node. So we
           # propagate the RF parameters.
-          rf_sizes_x[inp_node.name] = rf_size_input_x
-          rf_sizes_y[inp_node.name] = rf_size_input_y
-          effective_strides_x[inp_node.name] = effective_stride_input_x
-          effective_strides_y[inp_node.name] = effective_stride_input_y
-          effective_paddings_x[inp_node.name] = effective_padding_input_x
-          effective_paddings_y[inp_node.name] = effective_padding_input_y
+          rf_sizes_x[inp_name] = rf_size_input_x
+          rf_sizes_y[inp_name] = rf_size_input_y
+          effective_strides_x[inp_name] = effective_stride_input_x
+          effective_strides_y[inp_name] = effective_stride_input_y
+          effective_paddings_x[inp_name] = effective_padding_input_x
+          effective_paddings_y[inp_name] = effective_padding_input_y
 
   if not found_output_node:
     raise ValueError("Output node was not found")
   if input_node not in rf_sizes_x:
     raise ValueError("Input node was not found")
   return ReceptiveField(
-    (rf_sizes_x[input_node], rf_sizes_y[input_node]),
-    (effective_strides_x[input_node], effective_strides_y[input_node]),
-    (effective_paddings_x[input_node], effective_paddings_y[input_node]))
+      (rf_sizes_x[input_node], rf_sizes_y[input_node]),
+      (effective_strides_x[input_node], effective_strides_y[input_node]),
+      (effective_paddings_x[input_node], effective_paddings_y[input_node]))
diff --git a/tensorflow/contrib/receptive_field/python/util/receptive_field_test.py b/tensorflow/contrib/receptive_field/python/util/receptive_field_test.py
index 8d7d5440f630a3a78749e04a5eb058d637c258fc..cf55da27236d17c709cbde689831ad68da9a8a7b 100644
--- a/tensorflow/contrib/receptive_field/python/util/receptive_field_test.py
+++ b/tensorflow/contrib/receptive_field/python/util/receptive_field_test.py
@@ -18,16 +18,21 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.contrib import slim
-from tensorflow.contrib.receptive_field.python.util import receptive_field
+from tensorflow.contrib.receptive_field import receptive_field_api as receptive_field
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.platform import test
-import numpy as np
 
 
+# TODO(andrearaujo): Rename the create_test_network_* functions in order to have
+# more descriptive names.
 def create_test_network_1():
   """Aligned network for test.
 
@@ -39,8 +44,9 @@ def create_test_network_1():
   """
   g = ops.Graph()
   with g.as_default():
-    # An 8x8 test image.
-    x = array_ops.placeholder(dtypes.float32, (1, 8, 8, 1), name='input_image')
+    # An input test image with unknown spatial resolution.
+    x = array_ops.placeholder(
+        dtypes.float32, (None, None, None, 1), name='input_image')
     # Left branch.
     l1 = slim.conv2d(x, 1, [1, 1], stride=4, scope='L1', padding='VALID')
     # Right branch.
@@ -66,8 +72,9 @@ def create_test_network_2():
   """
   g = ops.Graph()
   with g.as_default():
-    # An 8x8 test image.
-    x = array_ops.placeholder(dtypes.float32, (1, 8, 8, 1), name='input_image')
+    # An input test image with unknown spatial resolution.
+    x = array_ops.placeholder(
+        dtypes.float32, (None, None, None, 1), name='input_image')
     # Left branch.
     l1 = slim.conv2d(x, 1, [1, 1], stride=4, scope='L1', padding='VALID')
     # Right branch.
@@ -90,8 +97,9 @@ def create_test_network_3():
   """
   g = ops.Graph()
   with g.as_default():
-    # An 8x8 test image.
-    x = array_ops.placeholder(dtypes.float32, (1, 8, 8, 1), name='input_image')
+    # An input test image with unknown spatial resolution.
+    x = array_ops.placeholder(
+        dtypes.float32, (None, None, None, 1), name='input_image')
     # Left branch.
     l1_pad = array_ops.pad(x, [[0, 0], [2, 1], [2, 1], [0, 0]])
     l1 = slim.conv2d(l1_pad, 1, [5, 5], stride=2, scope='L1', padding='VALID')
@@ -117,8 +125,9 @@ def create_test_network_4():
   """
   g = ops.Graph()
   with g.as_default():
-    # An 8x8 test image.
-    x = array_ops.placeholder(dtypes.float32, (1, 8, 8, 1), name='input_image')
+    # An input test image with unknown spatial resolution.
+    x = array_ops.placeholder(
+        dtypes.float32, (None, None, None, 1), name='input_image')
     # Left branch.
     l1 = slim.conv2d(x, 1, [1, 1], stride=4, scope='L1', padding='VALID')
     # Right branch.
@@ -141,8 +150,9 @@ def create_test_network_5():
   """
   g = ops.Graph()
   with g.as_default():
-    # An 8x8 test image.
-    x = array_ops.placeholder(dtypes.float32, (1, 8, 8, 1), name='input_image')
+    # An input test image with unknown spatial resolution.
+    x = array_ops.placeholder(
+        dtypes.float32, (None, None, None, 1), name='input_image')
     # Two convolutional layers, where the first one has non-square kernel.
     l1 = slim.conv2d(x, 1, [3, 5], stride=2, scope='L1', padding='VALID')
     l2 = slim.conv2d(l1, 1, [3, 1], stride=2, scope='L2', padding='VALID')
@@ -162,8 +172,9 @@ def create_test_network_6():
   """
   g = ops.Graph()
   with g.as_default():
-    # An 8x8 test image.
-    x = array_ops.placeholder(dtypes.float32, (1, 8, 8, 1), name='input_image')
+    # An input test image with unknown spatial resolution.
+    x = array_ops.placeholder(
+        dtypes.float32, (None, None, None, 1), name='input_image')
     # Left branch.
     l1 = slim.conv2d(x, 1, [1, 1], stride=4, scope='L1', padding='VALID')
     # Right branch.
@@ -176,7 +187,102 @@ def create_test_network_6():
   return g
 
 
-class RfUtilsTest(test.TestCase):
+def create_test_network_7():
+  """Aligned network for test, with a control dependency.
+
+  The graph is similar to create_test_network_1(), except that it includes an
+  assert operation on the left branch.
+
+  Returns:
+    g: Tensorflow graph object (Graph proto).
+  """
+  g = ops.Graph()
+  with g.as_default():
+    # An 8x8 test image.
+    x = array_ops.placeholder(dtypes.float32, (1, 8, 8, 1), name='input_image')
+    # Left branch.
+    l1 = slim.conv2d(x, 1, [1, 1], stride=4, scope='L1', padding='VALID')
+    l1_shape = array_ops.shape(l1)
+    assert_op = control_flow_ops.Assert(
+        gen_math_ops.equal(l1_shape[1], 2), [l1_shape], summarize=4)
+    # Right branch.
+    l2_pad = array_ops.pad(x, [[0, 0], [1, 0], [1, 0], [0, 0]])
+    l2 = slim.conv2d(l2_pad, 1, [3, 3], stride=2, scope='L2', padding='VALID')
+    l3 = slim.conv2d(l2, 1, [1, 1], stride=2, scope='L3', padding='VALID')
+    # Addition.
+    with ops.control_dependencies([assert_op]):
+      nn.relu(l1 + l3, name='output')
+  return g
+
+
+def create_test_network_8():
+  """Aligned network for test, including an intermediate addition.
+
+  The graph is similar to create_test_network_1(), except that it includes a few
+  more layers on top. The added layers compose two different branches whose
+  receptive fields are different. This makes this test case more challenging; in
+  particular, this test fails if a naive DFS-like algorithm is used for RF
+  computation.
+
+  Returns:
+    g: Tensorflow graph object (Graph proto).
+  """
+  g = ops.Graph()
+  with g.as_default():
+    # An input test image with unknown spatial resolution.
+    x = array_ops.placeholder(
+        dtypes.float32, (None, None, None, 1), name='input_image')
+    # Left branch before first addition.
+    l1 = slim.conv2d(x, 1, [1, 1], stride=4, scope='L1', padding='VALID')
+    # Right branch before first addition.
+    l2_pad = array_ops.pad(x, [[0, 0], [1, 0], [1, 0], [0, 0]])
+    l2 = slim.conv2d(l2_pad, 1, [3, 3], stride=2, scope='L2', padding='VALID')
+    l3 = slim.conv2d(l2, 1, [1, 1], stride=2, scope='L3', padding='VALID')
+    # First addition.
+    l4 = nn.relu(l1 + l3)
+    # Left branch after first addition.
+    l5 = slim.conv2d(l4, 1, [1, 1], stride=2, scope='L5', padding='VALID')
+    # Right branch after first addition.
+    l6_pad = array_ops.pad(l4, [[0, 0], [1, 0], [1, 0], [0, 0]])
+    l6 = slim.conv2d(l6_pad, 1, [3, 3], stride=2, scope='L6', padding='VALID')
+    # Final addition.
+    nn.relu(l5 + l6, name='output')
+
+  return g
+
+
+def create_test_network_9():
+  """Aligned network for test, including an intermediate addition.
+
+  The graph is the same as create_test_network_8(), except that VALID padding is
+  changed to SAME.
+
+  Returns:
+    g: Tensorflow graph object (Graph proto).
+  """
+  g = ops.Graph()
+  with g.as_default():
+    # An input test image with unknown spatial resolution.
+    x = array_ops.placeholder(
+        dtypes.float32, (None, None, None, 1), name='input_image')
+    # Left branch before first addition.
+    l1 = slim.conv2d(x, 1, [1, 1], stride=4, scope='L1', padding='SAME')
+    # Right branch before first addition.
+    l2 = slim.conv2d(x, 1, [3, 3], stride=2, scope='L2', padding='SAME')
+    l3 = slim.conv2d(l2, 1, [1, 1], stride=2, scope='L3', padding='SAME')
+    # First addition.
+    l4 = nn.relu(l1 + l3)
+    # Left branch after first addition.
+    l5 = slim.conv2d(l4, 1, [1, 1], stride=2, scope='L5', padding='SAME')
+    # Right branch after first addition.
+    l6 = slim.conv2d(l4, 1, [3, 3], stride=2, scope='L6', padding='SAME')
+    # Final addition.
+    nn.relu(l5 + l6, name='output')
+
+  return g
+
+
+class ReceptiveFieldTest(test.TestCase):
 
   def testComputeRFFromGraphDefAligned(self):
     graph_def = create_test_network_1().as_graph_def()
@@ -216,7 +322,7 @@ class RfUtilsTest(test.TestCase):
       receptive_field.compute_receptive_field_from_graph_def(
           graph_def, input_node, output_node)
 
-  def testComputeRFFromGraphDefUnaligned2(self):
+  def testComputeRFFromGraphDefUndefinedPadding(self):
     graph_def = create_test_network_4().as_graph_def()
     input_node = 'input_image'
     output_node = 'output'
@@ -231,6 +337,29 @@ class RfUtilsTest(test.TestCase):
     self.assertEqual(effective_padding_x, None)
     self.assertEqual(effective_padding_y, None)
 
+  def testComputeRFFromGraphDefFixedInputDim(self):
+    graph_def = create_test_network_4().as_graph_def()
+    input_node = 'input_image'
+    output_node = 'output'
+    (receptive_field_x, receptive_field_y, effective_stride_x,
+     effective_stride_y, effective_padding_x, effective_padding_y) = (
+         receptive_field.compute_receptive_field_from_graph_def(
+             graph_def, input_node, output_node, input_resolution=[9, 9]))
+    self.assertEqual(receptive_field_x, 3)
+    self.assertEqual(receptive_field_y, 3)
+    self.assertEqual(effective_stride_x, 4)
+    self.assertEqual(effective_stride_y, 4)
+    self.assertEqual(effective_padding_x, 1)
+    self.assertEqual(effective_padding_y, 1)
+
+  def testComputeRFFromGraphDefUnalignedFixedInputDim(self):
+    graph_def = create_test_network_4().as_graph_def()
+    input_node = 'input_image'
+    output_node = 'output'
+    with self.assertRaises(ValueError):
+      receptive_field.compute_receptive_field_from_graph_def(
+          graph_def, input_node, output_node, input_resolution=[8, 8])
+
   def testComputeRFFromGraphDefNonSquareRF(self):
     graph_def = create_test_network_5().as_graph_def()
     input_node = 'input_image'
@@ -269,7 +398,7 @@ class RfUtilsTest(test.TestCase):
     input_node = 'input_image'
     output_node = 'output'
     rf = receptive_field.compute_receptive_field_from_graph_def(
-      graph_def, input_node, output_node)
+        graph_def, input_node, output_node)
 
     x = np.random.randint(0, 100, (50, 2))
     y = rf.compute_feature_coordinates(x)
@@ -277,5 +406,52 @@ class RfUtilsTest(test.TestCase):
 
     self.assertAllEqual(x, x2)
 
+  def testComputeRFFromGraphDefAlignedWithControlDependencies(self):
+    graph_def = create_test_network_7().as_graph_def()
+    input_node = 'input_image'
+    output_node = 'output'
+    (receptive_field_x, receptive_field_y, effective_stride_x,
+     effective_stride_y, effective_padding_x, effective_padding_y) = (
+         receptive_field.compute_receptive_field_from_graph_def(
+             graph_def, input_node, output_node))
+    self.assertEqual(receptive_field_x, 3)
+    self.assertEqual(receptive_field_y, 3)
+    self.assertEqual(effective_stride_x, 4)
+    self.assertEqual(effective_stride_y, 4)
+    self.assertEqual(effective_padding_x, 1)
+    self.assertEqual(effective_padding_y, 1)
+
+  def testComputeRFFromGraphDefWithIntermediateAddNode(self):
+    graph_def = create_test_network_8().as_graph_def()
+    input_node = 'input_image'
+    output_node = 'output'
+    (receptive_field_x, receptive_field_y, effective_stride_x,
+     effective_stride_y, effective_padding_x, effective_padding_y) = (
+         receptive_field.compute_receptive_field_from_graph_def(
+             graph_def, input_node, output_node))
+    self.assertEqual(receptive_field_x, 11)
+    self.assertEqual(receptive_field_y, 11)
+    self.assertEqual(effective_stride_x, 8)
+    self.assertEqual(effective_stride_y, 8)
+    self.assertEqual(effective_padding_x, 5)
+    self.assertEqual(effective_padding_y, 5)
+
+  def testComputeRFFromGraphDefWithIntermediateAddNodeSamePaddingFixedInputDim(
+      self):
+    graph_def = create_test_network_9().as_graph_def()
+    input_node = 'input_image'
+    output_node = 'output'
+    (receptive_field_x, receptive_field_y, effective_stride_x,
+     effective_stride_y, effective_padding_x, effective_padding_y) = (
+         receptive_field.compute_receptive_field_from_graph_def(
+             graph_def, input_node, output_node, input_resolution=[17, 17]))
+    self.assertEqual(receptive_field_x, 11)
+    self.assertEqual(receptive_field_y, 11)
+    self.assertEqual(effective_stride_x, 8)
+    self.assertEqual(effective_stride_y, 8)
+    self.assertEqual(effective_padding_x, 5)
+    self.assertEqual(effective_padding_y, 5)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/receptive_field/__init__.py b/tensorflow/contrib/receptive_field/receptive_field_api.py
similarity index 89%
rename from tensorflow/contrib/receptive_field/__init__.py
rename to tensorflow/contrib/receptive_field/receptive_field_api.py
index 10745a6a53d5b3ef9521b2313ddc28799ee8b886..4d81b4292df5f696b761b2977fec078abc28569f 100644
--- a/tensorflow/contrib/receptive_field/__init__.py
+++ b/tensorflow/contrib/receptive_field/receptive_field_api.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Module to compute receptive field parameters for CNN tensorflow models."""
+"""Module that declares the functions in tf.contrib.receptive_field's API."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -21,3 +21,7 @@ from __future__ import print_function
 from tensorflow.contrib.receptive_field.python.util.graph_compute_order import get_compute_order
 from tensorflow.contrib.receptive_field.python.util.receptive_field import compute_receptive_field_from_graph_def
 # pylint: enable=unused-import
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc
index c33804906fc21cf2573b79091a76ab1ea86f5966..2def4f3f176b8d4d26c2c94168e9698f14649d94 100644
--- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc
+++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include <algorithm>
 #include "tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h"
+#include <algorithm>
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h
index fc3a2da9b398b16df223d60e2e913f952fa24434..d8c0a0631d38e55ef9653e0e88e90604ec0f0329 100644
--- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h
+++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 #define Sum(a, b) ((a) + (b))
 #define Prod(a, b) ((a) * (b))
@@ -58,11 +58,11 @@ inline T negative_infinity() {
 
 }  // namespace reduce_functions
 
-#define CALL_ALL_REDUCEOPS(func, ...)                                          \
-  func(Sum, functor::reduce_functions::zero, ##__VA_ARGS__)                    \
-  func(Prod, functor::reduce_functions::one, ##__VA_ARGS__)                    \
-  func(Max, functor::reduce_functions::negative_infinity, ##__VA_ARGS__)       \
-  func(Min, functor::reduce_functions::infinity, ##__VA_ARGS__)
+#define CALL_ALL_REDUCEOPS(func, ...)                                       \
+  func(Sum, functor::reduce_functions::zero, ##__VA_ARGS__)                 \
+      func(Prod, functor::reduce_functions::one, ##__VA_ARGS__) func(       \
+          Max, functor::reduce_functions::negative_infinity, ##__VA_ARGS__) \
+          func(Min, functor::reduce_functions::infinity, ##__VA_ARGS__)
 
 #define ReduceSliceFunctorReduceop(reduceop, dummy)                         \
   template <typename Device, typename T, typename Index>                    \
@@ -81,4 +81,4 @@ CALL_ALL_REDUCEOPS(ReduceSliceFunctorReduceop)
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_PARTIAL_REDUCTION_OPS_H_
diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc
index 8e6870fadd428ae8a1937a5c0cb43b6763f6be28..9f2be03d718364058da6b63add8752c046798c5b 100644
--- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc
+++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc
@@ -17,10 +17,10 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
+#include "tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
@@ -34,9 +34,9 @@ namespace functor {
   __global__ void ReduceSliceDeviceKernel##reduceop(                           \
       Cuda3DLaunchConfig config, Index indices_width, Index bound,             \
       const T begin, const Index *indices, const T *input, T *out) {           \
-    CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {                 \
-      CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {               \
-        CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count, z) {             \
+    CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {               \
+      CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {             \
+        CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count.z, Z) {           \
           Index outidx = x * config.virtual_thread_count.y *                   \
                              config.virtual_thread_count.z +                   \
                          y * config.virtual_thread_count.z + z;                \
@@ -68,8 +68,9 @@ namespace functor {
       if (sizex * sizey * sizez == 0) {                                        \
         return;                                                                \
       }                                                                        \
-      Cuda3DLaunchConfig config = GetCuda3DLaunchConfig(sizex, sizey, sizez, d,\
-          ReduceSliceDeviceKernel##reduceop<T, Index>, 0, 0);                  \
+      Cuda3DLaunchConfig config = GetCuda3DLaunchConfig(                       \
+          sizex, sizey, sizez, d, ReduceSliceDeviceKernel##reduceop<T, Index>, \
+          0, 0);                                                               \
                                                                                \
       ReduceSliceDeviceKernel##reduceop<T, Index>                              \
           <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(    \
diff --git a/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc b/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc
index b8b56c0e229563a4e9bc930512c9fe49bd636e31..92879ab5356623dfa82fce8dff8db4d3036ae46c 100644
--- a/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc
+++ b/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc
@@ -87,9 +87,9 @@ and 'indices' is [[0,1]
                   [1,1]
                   [0,2]],
 
-the the output will be [[ 1, 2, 3]
-                        [ 0, 0, 0]
-                        [41,52,63]].
+the output will be [[ 1, 2, 3]
+                    [ 0, 0, 0]
+                    [41,52,63]].
 ```
 
 The data must be at least rank 1. The indices must be of shape (?,2) where the
@@ -132,9 +132,9 @@ and 'indices' is [[0,1]
                   [1,1]
                   [0,2]],
 
-the the output will be [[ 1,  2,  3]
-                        [ 1,  1,  1]
-                        [40,100,180]].
+the output will be [[ 1,  2,  3]
+                    [ 1,  1,  1]
+                    [40,100,180]].
 ```
 
 The data must be at least rank 1. The indices can be of shape (?,2) where the
@@ -189,9 +189,9 @@ and 'indices' is [[0,1]
                   [1,1]
                   [0,2]],
 
-the the output will be [[          1,         20,          3]
-                        [ -BIG_VALUE, -BIG_VALUE, -BIG_VALUE]
-                        [        400,         20,         60]].
+the output will be [[          1,         20,          3]
+                    [ -BIG_VALUE, -BIG_VALUE, -BIG_VALUE]
+                    [        400,         20,         60]].
 ```
 
 The data must be at least rank 1. The indices can be of shape (?,2) where the
@@ -246,9 +246,9 @@ and 'indices' is [[0,1]
                   [1,1]
                   [0,2]],
 
-the the output will be [[          1,         20,          3]
-                        [ +BIG_VALUE, +BIG_VALUE, +BIG_VALUE]
-                        [          1,          5,          3]].
+the output will be [[          1,         20,          3]
+                    [ +BIG_VALUE, +BIG_VALUE, +BIG_VALUE]
+                    [          1,          5,          3]].
 ```
 
 The data must be at least rank 1. The indices can be of shape (?,2) where the
diff --git a/tensorflow/contrib/reduce_slice_ops/python/kernel_tests/reduce_slice_ops_test.py b/tensorflow/contrib/reduce_slice_ops/python/kernel_tests/reduce_slice_ops_test.py
index 60a193db4c7f084d3262a69e2b8c5df66273e138..468886da20021646089bd1d222da1ebd4b5c7822 100644
--- a/tensorflow/contrib/reduce_slice_ops/python/kernel_tests/reduce_slice_ops_test.py
+++ b/tensorflow/contrib/reduce_slice_ops/python/kernel_tests/reduce_slice_ops_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-import unittest
 
 from tensorflow.contrib.reduce_slice_ops.python.ops import reduce_slice_ops
 from tensorflow.python.framework.test_util import TensorFlowTestCase
diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops.cc b/tensorflow/contrib/resampler/kernels/resampler_ops.cc
index 7d9ef14cefc578e9401d95db9a625428cc0e2605..63c72836d793a3df4e96a0134f3a1534c288c8c8 100644
--- a/tensorflow/contrib/resampler/kernels/resampler_ops.cc
+++ b/tensorflow/contrib/resampler/kernels/resampler_ops.cc
@@ -36,17 +36,12 @@ using GPUDevice = Eigen::GpuDevice;
 namespace functor {
 
 template <typename T>
-struct Resampler2DFunctor<CPUDevice, T>{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const CPUDevice& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   T* __restrict__ output,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points){
+struct Resampler2DFunctor<CPUDevice, T> {
+  void operator()(::tensorflow::OpKernelContext* ctx, const CPUDevice& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  T* __restrict__ output, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points) {
     const int warp_batch_stride = num_sampling_points * 2;
     const int data_batch_stride = data_height * data_width * data_channels;
     const int output_batch_stride = num_sampling_points * data_channels;
@@ -59,24 +54,19 @@ struct Resampler2DFunctor<CPUDevice, T>{
         // The functions take care of performing the relevant pointer
         // arithmetics abstracting away the low level details in the
         // main loop over samples. Note that data is stored in NHWC format.
-        auto set_output = [&](const int sample_id,
-                              const int channel,
+        auto set_output = [&](const int sample_id, const int channel,
                               const T value) {
-          output[batch_id * output_batch_stride +
-                 sample_id * data_channels +
+          output[batch_id * output_batch_stride + sample_id * data_channels +
                  channel] = value;
         };
 
-        auto get_data_point = [&](const int x,
-                                  const int y,
-                                  const int chan) {
+        auto get_data_point = [&](const int x, const int y, const int chan) {
           const bool point_is_in_range =
               (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
           return point_is_in_range
-                 ? data[batch_id * data_batch_stride +
-                        data_channels * (y * data_width + x) +
-                        chan]
-                 : zero;
+                     ? data[batch_id * data_batch_stride +
+                            data_channels * (y * data_width + x) + chan]
+                     : zero;
         };
 
         for (int sample_id = 0; sample_id < num_sampling_points; ++sample_id) {
@@ -89,8 +79,7 @@ struct Resampler2DFunctor<CPUDevice, T>{
           // The effect is that the sampled signal smoothly goes to 0 outside
           // the original input domain, rather than presenting a jump
           // discontinuity at the image boundaries.
-          if (x > static_cast<T>(-1.0) &&
-              y > static_cast<T>(-1.0) &&
+          if (x > static_cast<T>(-1.0) && y > static_cast<T>(-1.0) &&
               x < static_cast<T>(data_width) &&
               y < static_cast<T>(data_height)) {
             // Precompute floor (f) and ceil (c) values for x and y.
@@ -103,12 +92,10 @@ struct Resampler2DFunctor<CPUDevice, T>{
 
             for (int chan = 0; chan < data_channels; ++chan) {
               const T img_fxfy = dx * dy * get_data_point(fx, fy, chan);
-              const T img_cxcy = (one - dx) * (one - dy) *
-                                   get_data_point(cx, cy, chan);
-              const T img_fxcy = dx * (one - dy) *
-                                   get_data_point(fx, cy, chan);
-              const T img_cxfy = (one - dx) * dy *
-                                   get_data_point(cx, fy, chan);
+              const T img_cxcy =
+                  (one - dx) * (one - dy) * get_data_point(cx, cy, chan);
+              const T img_fxcy = dx * (one - dy) * get_data_point(fx, cy, chan);
+              const T img_cxfy = (one - dx) * dy * get_data_point(cx, fy, chan);
               set_output(sample_id, chan,
                          img_fxfy + img_cxcy + img_fxcy + img_cxfy);
             }
@@ -125,8 +112,8 @@ struct Resampler2DFunctor<CPUDevice, T>{
     // estimate of the cost of each work unit is needed to correctly shard the
     // workload. Shard assumes each cost unit is 1ns, minimum cost per shard
     // being 10us.
-    const int64 cost =  static_cast<int64>(num_sampling_points) *
-        data_channels * 1000;
+    const int64 cost =
+        static_cast<int64>(num_sampling_points) * data_channels * 1000;
     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
     ::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers,
                         batch_size, cost, resample_batches);
@@ -138,8 +125,8 @@ struct Resampler2DFunctor<CPUDevice, T>{
 template <typename Device, typename T>
 class ResamplerOp : public ::tensorflow::OpKernel {
  public:
-  explicit ResamplerOp(::tensorflow::OpKernelConstruction* context) :
-      ::tensorflow::OpKernel(context) {}
+  explicit ResamplerOp(::tensorflow::OpKernelConstruction* context)
+      : ::tensorflow::OpKernel(context) {}
 
   void Compute(::tensorflow::OpKernelContext* ctx) override {
     const ::tensorflow::Tensor& data = ctx->input(0);
@@ -158,16 +145,17 @@ class ResamplerOp : public ::tensorflow::OpKernel {
                 ::tensorflow::errors::InvalidArgument(
                     "warp should be at least a matrix, got shape ",
                     warp_shape.DebugString()));
-    OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims()-1) == 2,
+    OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims() - 1) == 2,
                 ::tensorflow::errors::Unimplemented(
                     "Only bilinear interpolation is supported, warping "
                     "coordinates must be 2D; warp shape last entry should be "
-                    "2, but shape vector is: ", warp_shape.DebugString()));
+                    "2, but shape vector is: ",
+                    warp_shape.DebugString()));
     OP_REQUIRES(ctx, data_shape.dim_size(0) == warp_shape.dim_size(0),
                 ::tensorflow::errors::InvalidArgument(
                     "Batch size of data and warp tensor must be the same, but "
-                    "input shapes are: ", data_shape.DebugString(), ", ",
-                    warp_shape.DebugString()));
+                    "input shapes are: ",
+                    data_shape.DebugString(), ", ", warp_shape.DebugString()));
     const int batch_size = data_shape.dim_size(0);
     const int data_height = data_shape.dim_size(1);
     const int data_width = data_shape.dim_size(2);
@@ -180,16 +168,10 @@ class ResamplerOp : public ::tensorflow::OpKernel {
 
     // Execute kernel only for nonempty output; otherwise Eigen crashes on GPU.
     if (num_sampling_points > 0) {
-      functor::Resampler2DFunctor<Device, T>()(ctx,
-                                               ctx->eigen_device<Device>(),
-                                               data.flat<T>().data(),
-                                               warp.flat<T>().data(),
-                                               output->flat<T>().data(),
-                                               batch_size,
-                                               data_height,
-                                               data_width,
-                                               data_channels,
-                                               num_sampling_points);
+      functor::Resampler2DFunctor<Device, T>()(
+          ctx, ctx->eigen_device<Device>(), data.flat<T>().data(),
+          warp.flat<T>().data(), output->flat<T>().data(), batch_size,
+          data_height, data_width, data_channels, num_sampling_points);
     }
   }
 
@@ -197,12 +179,9 @@ class ResamplerOp : public ::tensorflow::OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(ResamplerOp);
 };
 
-
-#define REGISTER(TYPE)                       \
-  REGISTER_KERNEL_BUILDER(                   \
-      Name("Resampler")                      \
-          .Device(DEVICE_CPU)  \
-          .TypeConstraint<TYPE>("T"),        \
+#define REGISTER(TYPE)                                                \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("Resampler").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
       ResamplerOp<CPUDevice, TYPE>);
 
 TF_CALL_half(REGISTER);
@@ -211,40 +190,32 @@ TF_CALL_double(REGISTER);
 #undef REGISTER
 
 #if GOOGLE_CUDA
-#define REGISTER(TYPE)                                           \
-  REGISTER_KERNEL_BUILDER(Name("Resampler")                      \
-                              .Device(DEVICE_GPU)  \
-                              .TypeConstraint<TYPE>("T"),        \
-                          ResamplerOp<GPUDevice, TYPE>)
+#define REGISTER(TYPE)                                                \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("Resampler").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
+      ResamplerOp<GPUDevice, TYPE>)
 TF_CALL_float(REGISTER);
 TF_CALL_double(REGISTER);
 #undef REGISTER
 #endif  // GOOGLE_CUDA
 
-
 namespace functor {
 
 template <typename T>
-struct ResamplerGrad2DFunctor<CPUDevice, T>{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const CPUDevice& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   const T* __restrict__ grad_output,
-                   T* __restrict__ grad_data,
-                   T* __restrict__ grad_warp,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points){
+struct ResamplerGrad2DFunctor<CPUDevice, T> {
+  void operator()(::tensorflow::OpKernelContext* ctx, const CPUDevice& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  const T* __restrict__ grad_output, T* __restrict__ grad_data,
+                  T* __restrict__ grad_warp, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points) {
     // Set gradients to 0, because the kernel incrementally updates the
     // tensor entries by adding partial contributions.
-    const int resampler_output_size = batch_size * num_sampling_points *
-        data_channels;
+    const int resampler_output_size =
+        batch_size * num_sampling_points * data_channels;
     const int grad_warp_size = resampler_output_size / data_channels * 2;
-    const int grad_data_size = data_height * data_width * data_channels *
-        batch_size;
+    const int grad_data_size =
+        data_height * data_width * data_channels * batch_size;
     memset(grad_data, 0, sizeof(T) * grad_data_size);
     memset(grad_warp, 0, sizeof(T) * grad_warp_size);
 
@@ -260,35 +231,29 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
         // The functions take care of performing the relevant pointer
         // arithmetics abstracting away the low level details in the
         // main loop over samples. Note that data is stored in NHWC format.
-        auto get_data_point = [&](const int x,
-                                  const int y,
-                                  const int chan) {
+        auto get_data_point = [&](const int x, const int y, const int chan) {
           const bool point_is_in_range =
-            (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
+              (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
           return point_is_in_range
-                 ? data[batch_id * data_batch_stride +
-                        data_channels * (y * data_width + x) +
-                        chan]
-                 : zero;
+                     ? data[batch_id * data_batch_stride +
+                            data_channels * (y * data_width + x) + chan]
+                     : zero;
         };
 
         auto update_grad_data = [&](const int x, const int y, const int chan,
                                     const T value) {
           const bool point_is_in_range =
               (x >= 0 && y >= 0 && x <= data_width - 1 && y <= data_height - 1);
-          if (point_is_in_range){
+          if (point_is_in_range) {
             grad_data[batch_id * data_batch_stride +
-                      data_channels * (y * data_width + x) +
-                      chan] += value;
+                      data_channels * (y * data_width + x) + chan] += value;
           }
         };
 
-        auto update_grad_warp = [&](const int sample_id,
-                                    const int channel,
+        auto update_grad_warp = [&](const int sample_id, const int channel,
                                     const T value) {
-          grad_warp[batch_id * warp_batch_stride +
-                    sample_id * 2 +
-                    channel] += value;
+          grad_warp[batch_id * warp_batch_stride + sample_id * 2 + channel] +=
+              value;
         };
 
         for (int sample_id = 0; sample_id < num_sampling_points; ++sample_id) {
@@ -301,8 +266,7 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
           // The effect is that the sampled signal smoothly goes to 0 outside
           // the original input domain, rather than presenting a jump
           // discontinuity at the image boundaries.
-          if (x > static_cast<T>(-1.0) &&
-              y > static_cast<T>(-1.0) &&
+          if (x > static_cast<T>(-1.0) && y > static_cast<T>(-1.0) &&
               x < static_cast<T>(data_width) &&
               y < static_cast<T>(data_height)) {
             // Precompute floor (f) and ceil (c) values for x and y.
@@ -316,27 +280,25 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
             for (int chan = 0; chan < data_channels; ++chan) {
               const T grad_output_value =
                   grad_output[batch_id * output_batch_stride +
-                              sample_id * data_channels +
-                              chan];
+                              sample_id * data_channels + chan];
               const T img_fxfy = get_data_point(fx, fy, chan);
               const T img_cxcy = get_data_point(cx, cy, chan);
               const T img_fxcy = get_data_point(fx, cy, chan);
               const T img_cxfy = get_data_point(cx, fy, chan);
 
               // Update partial gradients wrt relevant warp field entries
-              update_grad_warp(sample_id, 0,
-                               grad_output_value *
-                                   ((one - dy) * (img_cxcy - img_fxcy) +
-                                    dy * (img_cxfy - img_fxfy)));
+              update_grad_warp(
+                  sample_id, 0,
+                  grad_output_value * ((one - dy) * (img_cxcy - img_fxcy) +
+                                       dy * (img_cxfy - img_fxfy)));
 
-              update_grad_warp(sample_id, 1,
-                               grad_output_value *
-                                   ((one - dx) * (img_cxcy - img_cxfy) +
-                                    dx * (img_fxcy - img_fxfy)));
+              update_grad_warp(
+                  sample_id, 1,
+                  grad_output_value * ((one - dx) * (img_cxcy - img_cxfy) +
+                                       dx * (img_fxcy - img_fxfy)));
 
               // Update partial gradients wrt sampled data
-              update_grad_data(fx, fy, chan,
-                               grad_output_value * dx * dy);
+              update_grad_data(fx, fy, chan, grad_output_value * dx * dy);
               update_grad_data(cx, cy, chan,
                                grad_output_value * (one - dx) * (one - dy));
               update_grad_data(fx, cy, chan,
@@ -355,8 +317,8 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
     // being 10us.
     // TODO(fviola): Check out if there is a better way of doing this.
     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
-    const int64 cost =  static_cast<int64>(num_sampling_points) *
-        data_channels * 1000;
+    const int64 cost =
+        static_cast<int64>(num_sampling_points) * data_channels * 1000;
     ::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers,
                         batch_size, cost, update_grads_for_batches);
   }
@@ -364,12 +326,11 @@ struct ResamplerGrad2DFunctor<CPUDevice, T>{
 
 }  // namespace functor
 
-
 template <typename Device, typename T>
 class ResamplerGradOp : public ::tensorflow::OpKernel {
  public:
-  explicit ResamplerGradOp(::tensorflow::OpKernelConstruction* context) :
-      ::tensorflow::OpKernel(context) {}
+  explicit ResamplerGradOp(::tensorflow::OpKernelConstruction* context)
+      : ::tensorflow::OpKernel(context) {}
 
   void Compute(::tensorflow::OpKernelContext* ctx) override {
     const ::tensorflow::Tensor& data = ctx->input(0);
@@ -383,7 +344,7 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
                     "tensor must be a batch of 2d data; data shape should have "
                     "4 entries corresponding to [batch_size, data_height, "
                     "data_width, data_channels], but is: ",
-                data_shape.DebugString()));
+                    data_shape.DebugString()));
     const int batch_size = data_shape.dim_size(0);
     const int data_height = data_shape.dim_size(1);
     const int data_width = data_shape.dim_size(2);
@@ -394,7 +355,7 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
                 ::tensorflow::errors::InvalidArgument(
                     "warp should be at least a matrix, got shape ",
                     warp_shape.DebugString()));
-    OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims()-1) == 2,
+    OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims() - 1) == 2,
                 ::tensorflow::errors::Unimplemented(
                     "Only bilinear interpolation is supported, warping "
                     "coordinates must be 2D; warp shape last entry should be "
@@ -406,10 +367,10 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
                                    data_channels);
     OP_REQUIRES(ctx, grad_output_shape == resampler_output_shape,
                 ::tensorflow::errors::InvalidArgument(
-                   "grad_output shape is not consistent with data and warp "
-                   "shapes; it should be ",
-                   resampler_output_shape.DebugString(), " but is ",
-                   grad_output_shape.DebugString()))
+                    "grad_output shape is not consistent with data and warp "
+                    "shapes; it should be ",
+                    resampler_output_shape.DebugString(), " but is ",
+                    grad_output_shape.DebugString()));
     const int num_sampling_points = warp.NumElements() / batch_size / 2;
     ::tensorflow::Tensor* grad_data = nullptr;
     ::tensorflow::Tensor* grad_warp = nullptr;
@@ -417,18 +378,11 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
     OP_REQUIRES_OK(ctx, ctx->allocate_output(1, warp.shape(), &grad_warp));
     // Execute kernel only for nonempty output; otherwise Eigen crashes on GPU.
     if (num_sampling_points > 0) {
-      functor::ResamplerGrad2DFunctor<Device, T>()(ctx,
-                                                   ctx->eigen_device<Device>(),
-                                                   data.flat<T>().data(),
-                                                   warp.flat<T>().data(),
-                                                   grad_output.flat<T>().data(),
-                                                   grad_data->flat<T>().data(),
-                                                   grad_warp->flat<T>().data(),
-                                                   batch_size,
-                                                   data_height,
-                                                   data_width,
-                                                   data_channels,
-                                                   num_sampling_points);
+      functor::ResamplerGrad2DFunctor<Device, T>()(
+          ctx, ctx->eigen_device<Device>(), data.flat<T>().data(),
+          warp.flat<T>().data(), grad_output.flat<T>().data(),
+          grad_data->flat<T>().data(), grad_warp->flat<T>().data(), batch_size,
+          data_height, data_width, data_channels, num_sampling_points);
     }
   }
 
@@ -436,11 +390,9 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(ResamplerGradOp);
 };
 
-#define REGISTER(TYPE)                       \
-  REGISTER_KERNEL_BUILDER(                   \
-      Name("ResamplerGrad")                  \
-          .Device(DEVICE_CPU)  \
-          .TypeConstraint<TYPE>("T"),        \
+#define REGISTER(TYPE)                                                    \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("ResamplerGrad").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
       ResamplerGradOp<CPUDevice, TYPE>);
 
 TF_CALL_half(REGISTER);
@@ -449,11 +401,10 @@ TF_CALL_double(REGISTER);
 #undef REGISTER
 
 #if GOOGLE_CUDA
-#define REGISTER(TYPE)                                           \
-  REGISTER_KERNEL_BUILDER(Name("ResamplerGrad")                  \
-                              .Device(DEVICE_GPU)  \
-                              .TypeConstraint<TYPE>("T"),        \
-                          ResamplerGradOp<GPUDevice, TYPE>)
+#define REGISTER(TYPE)                                                    \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("ResamplerGrad").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
+      ResamplerGradOp<GPUDevice, TYPE>)
 // Disable half and double precision since atomicAdds are not supported
 // TF_CALL_half(REGISTER);
 // TF_CALL_double(REGISTER);
diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops.h b/tensorflow/contrib/resampler/kernels/resampler_ops.h
index 8258ecaf5d3ba67094194c5cb12ca6d4d6efc85f..7fe3b9c0df71f51e07d38ea15a672d79fdc70453 100644
--- a/tensorflow/contrib/resampler/kernels/resampler_ops.h
+++ b/tensorflow/contrib/resampler/kernels/resampler_ops.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 // =============================================================================
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_RESAMPLER_KERNELS_RESAMPLER_OPS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_RESAMPLER_KERNELS_RESAMPLER_OPS_H_
+#ifndef TENSORFLOW_CONTRIB_RESAMPLER_KERNELS_RESAMPLER_OPS_H_
+#define TENSORFLOW_CONTRIB_RESAMPLER_KERNELS_RESAMPLER_OPS_H_
 
 #if PLATFORM_WINDOWS
 #define __restrict__ __restrict
@@ -29,40 +29,26 @@ namespace functor {
 
 // Helper functor for the Resampler Op in 2D
 template <typename Device, typename T>
-struct Resampler2DFunctor{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const Device& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   T* __restrict__ output,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points);
+struct Resampler2DFunctor {
+  void operator()(::tensorflow::OpKernelContext* ctx, const Device& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  T* __restrict__ output, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points);
 };
 
-
 // Helper functor for the Resampler Gradient Op in 2D
 template <typename Device, typename T>
-struct ResamplerGrad2DFunctor{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const Device& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   const T* __restrict__ grad_output,
-                   T* __restrict__ grad_data,
-                   T* __restrict__ grad_warp,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points);
+struct ResamplerGrad2DFunctor {
+  void operator()(::tensorflow::OpKernelContext* ctx, const Device& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  const T* __restrict__ grad_output, T* __restrict__ grad_data,
+                  T* __restrict__ grad_warp, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points);
 };
 
-
 }  // namespace functor
 }  // namespace tensorflow
 
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_RESAMPLER_KERNELS_RESAMPLER_OPS_H_
+#endif  // TENSORFLOW_CONTRIB_RESAMPLER_KERNELS_RESAMPLER_OPS_H_
diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc b/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc
index 636847a212f27c738032128e3f3f653ec32f851b..3c07051f685c74b6e45fb782c80871f38dffbbf4 100644
--- a/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc
+++ b/tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc
@@ -31,18 +31,15 @@ using GPUDevice = Eigen::GpuDevice;
 
 namespace {
 
-#define GET_DATA_POINT(x, y)                   \
-  data[batch_id * data_batch_stride +          \
-       data_channels * (y * data_width + x) +  \
+#define GET_DATA_POINT(x, y)                                                 \
+  data[batch_id * data_batch_stride + data_channels * (y * data_width + x) + \
        chan]
 
 template <typename T>
 __global__ void Resampler2DKernel(const T* __restrict__ data,
                                   const T* __restrict__ warp,
-                                  T* __restrict__ output,
-                                  const int batch_size,
-                                  const int data_height,
-                                  const int data_width,
+                                  T* __restrict__ output, const int batch_size,
+                                  const int data_height, const int data_width,
                                   const int data_channels,
                                   const int num_sampling_points) {
   const int output_data_size = batch_size * num_sampling_points * data_channels;
@@ -75,10 +72,8 @@ __global__ void Resampler2DKernel(const T* __restrict__ data,
     // The effect is that the sampled signal smoothly goes to 0 outside
     // the original input domain, rather than presenting a jump
     // discontinuity at the image boundaries.
-    if (x > static_cast<T>(-1.0) &&
-        y > static_cast<T>(-1.0) &&
-        x < static_cast<T>(data_width) &&
-        y < static_cast<T>(data_height)) {
+    if (x > static_cast<T>(-1.0) && y > static_cast<T>(-1.0) &&
+        x < static_cast<T>(data_width) && y < static_cast<T>(data_height)) {
       // Precompute floor (f) and ceil (c) values for x and y.
       const int fx = std::floor(static_cast<float>(x));
       const int fy = std::floor(static_cast<float>(y));
@@ -87,21 +82,20 @@ __global__ void Resampler2DKernel(const T* __restrict__ data,
       const T dx = static_cast<T>(cx) - x;
       const T dy = static_cast<T>(cy) - y;
 
-      const T img_fxfy = (fx >= 0 && fy >= 0)
-                         ? dx * dy * GET_DATA_POINT(fx, fy)
-                         : zero;
+      const T img_fxfy =
+          (fx >= 0 && fy >= 0) ? dx * dy * GET_DATA_POINT(fx, fy) : zero;
 
       const T img_cxcy = (cx <= data_width - 1 && cy <= data_height - 1)
-                         ? (one - dx) * (one - dy) * GET_DATA_POINT(cx, cy)
-                         : zero;
+                             ? (one - dx) * (one - dy) * GET_DATA_POINT(cx, cy)
+                             : zero;
 
       const T img_fxcy = (fx >= 0 && cy <= data_height - 1)
-                         ? dx * (one - dy) * GET_DATA_POINT(fx, cy)
-                         : zero;
+                             ? dx * (one - dy) * GET_DATA_POINT(fx, cy)
+                             : zero;
 
       const T img_cxfy = (cx <= data_width - 1 && fy >= 0)
-                         ? (one - dx) * dy * GET_DATA_POINT(cx, fy)
-                         : zero;
+                             ? (one - dx) * dy * GET_DATA_POINT(cx, fy)
+                             : zero;
 
       output[out_index] = img_fxfy + img_cxcy + img_fxcy + img_cxfy;
     } else {
@@ -115,24 +109,20 @@ __global__ void Resampler2DKernel(const T* __restrict__ data,
 namespace functor {
 
 template <typename T>
-struct Resampler2DFunctor<GPUDevice, T>{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const GPUDevice& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   T* __restrict__ output,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points) {
-  const int output_data_size = batch_size * num_sampling_points * data_channels;
-  ::tensorflow::CudaLaunchConfig config =
-      ::tensorflow::GetCudaLaunchConfig(output_data_size, d);
-  Resampler2DKernel<T>
-      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          data, warp, output, batch_size, data_height, data_width,
-          data_channels, num_sampling_points);
+struct Resampler2DFunctor<GPUDevice, T> {
+  void operator()(::tensorflow::OpKernelContext* ctx, const GPUDevice& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  T* __restrict__ output, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points) {
+    const int output_data_size =
+        batch_size * num_sampling_points * data_channels;
+    ::tensorflow::CudaLaunchConfig config =
+        ::tensorflow::GetCudaLaunchConfig(output_data_size, d);
+    Resampler2DKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            data, warp, output, batch_size, data_height, data_width,
+            data_channels, num_sampling_points);
   }
 };
 
@@ -145,26 +135,20 @@ template struct Resampler2DFunctor<GPUDevice, double>;
 
 namespace {
 
-#define UPDATE_GRAD_DATA_POINT(x, y, v)                  \
-  atomicAdd(grad_data + (batch_id * data_batch_stride +  \
-            data_channels * (y * data_width + x) +       \
-            chan),                                       \
+#define UPDATE_GRAD_DATA_POINT(x, y, v)                                \
+  atomicAdd(grad_data + (batch_id * data_batch_stride +                \
+                         data_channels * (y * data_width + x) + chan), \
             v)
 
-
 template <typename T>
-__global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
-                                      const T* __restrict__ warp,
-                                      const T* __restrict__ grad_output,
-                                      T* __restrict__ grad_data,
-                                      T* __restrict__ grad_warp,
-                                      const int batch_size,
-                                      const int data_height,
-                                      const int data_width,
-                                      const int data_channels,
-                                      const int num_sampling_points) {
-  const int resampler_output_size = batch_size * num_sampling_points *
-      data_channels;
+__global__ void ResamplerGrad2DKernel(
+    const T* __restrict__ data, const T* __restrict__ warp,
+    const T* __restrict__ grad_output, T* __restrict__ grad_data,
+    T* __restrict__ grad_warp, const int batch_size, const int data_height,
+    const int data_width, const int data_channels,
+    const int num_sampling_points) {
+  const int resampler_output_size =
+      batch_size * num_sampling_points * data_channels;
   CUDA_1D_KERNEL_LOOP(index, resampler_output_size) {
     const int out_index = index;
 
@@ -199,10 +183,8 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
     // The effect is that the sampled signal smoothly goes to 0 outside
     // the original input domain, rather than presenting a jump
     // discontinuity at the image boundaries.
-    if (x > static_cast<T>(-1.0) &&
-        y > static_cast<T>(-1.0) &&
-        x < static_cast<T>(data_width) &&
-        y < static_cast<T>(data_height)) {
+    if (x > static_cast<T>(-1.0) && y > static_cast<T>(-1.0) &&
+        x < static_cast<T>(data_width) && y < static_cast<T>(data_height)) {
       // Precompute floor (f) and ceil (c) values for x and y.
       const int fx = std::floor(static_cast<float>(x));
       const int fy = std::floor(static_cast<float>(y));
@@ -211,21 +193,17 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
       const T dx = static_cast<T>(cx) - x;
       const T dy = static_cast<T>(cy) - y;
 
-      const T img_fxfy = (fx >= 0 && fy >= 0)
-                         ? GET_DATA_POINT(fx, fy)
-                         : zero;
+      const T img_fxfy = (fx >= 0 && fy >= 0) ? GET_DATA_POINT(fx, fy) : zero;
 
       const T img_cxcy = (cx <= data_width - 1 && cy <= data_height - 1)
-                         ? GET_DATA_POINT(cx, cy)
-                         : zero;
+                             ? GET_DATA_POINT(cx, cy)
+                             : zero;
 
-      const T img_fxcy = (fx >= 0 && cy <= data_height - 1)
-                         ? GET_DATA_POINT(fx, cy)
-                         : zero;
+      const T img_fxcy =
+          (fx >= 0 && cy <= data_height - 1) ? GET_DATA_POINT(fx, cy) : zero;
 
-      const T img_cxfy = (cx <= data_width - 1 && fy >= 0)
-                         ? GET_DATA_POINT(cx, fy)
-                         : zero;
+      const T img_cxfy =
+          (cx <= data_width - 1 && fy >= 0) ? GET_DATA_POINT(cx, fy) : zero;
 
       // Update partial gradients wrt relevant warp field entries
       atomicAdd(grad_warp + warp_id_x,
@@ -241,7 +219,7 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
       }
       if (cx <= data_width - 1 && cy <= data_height - 1) {
         UPDATE_GRAD_DATA_POINT(cx, cy,
-                               grad_output_value  * (one - dx) * (one - dy));
+                               grad_output_value * (one - dx) * (one - dy));
       }
       if (fx >= 0 && cy <= data_height - 1) {
         UPDATE_GRAD_DATA_POINT(fx, cy, grad_output_value * dx * (one - dy));
@@ -261,43 +239,37 @@ __global__ void ResamplerGrad2DKernel(const T* __restrict__ data,
 namespace functor {
 
 template <typename T>
-struct ResamplerGrad2DFunctor<GPUDevice, T>{
-  void operator ()(::tensorflow::OpKernelContext* ctx,
-                   const GPUDevice& d,
-                   const T* __restrict__ data,
-                   const T* __restrict__ warp,
-                   const T* __restrict__ grad_output,
-                   T* __restrict__ grad_data,
-                   T* __restrict__ grad_warp,
-                   const int batch_size,
-                   const int data_height,
-                   const int data_width,
-                   const int data_channels,
-                   const int num_sampling_points) {
-  // Set gradients to 0, because the kernel incrementally updates the
-  // tensor entries by adding partial contributions.
-  const int grad_warp_size = batch_size * num_sampling_points * 2;
-  const int grad_data_size = batch_size * data_height * data_width *
-      data_channels;
-
-  ::tensorflow::CudaLaunchConfig config =
-     ::tensorflow::GetCudaLaunchConfig(grad_warp_size, d);
-  ::tensorflow::SetZero
-      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          grad_warp_size, grad_warp);
-
-  config = ::tensorflow::GetCudaLaunchConfig(grad_data_size, d);
-  ::tensorflow::SetZero
-      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          grad_data_size, grad_data);
-
-  const int resampler_output_size = batch_size * num_sampling_points *
-      data_channels;
-  config = ::tensorflow::GetCudaLaunchConfig(resampler_output_size, d);
-  ResamplerGrad2DKernel<T>
-      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          data, warp, grad_output, grad_data, grad_warp, batch_size,
-          data_height, data_width, data_channels, num_sampling_points);
+struct ResamplerGrad2DFunctor<GPUDevice, T> {
+  void operator()(::tensorflow::OpKernelContext* ctx, const GPUDevice& d,
+                  const T* __restrict__ data, const T* __restrict__ warp,
+                  const T* __restrict__ grad_output, T* __restrict__ grad_data,
+                  T* __restrict__ grad_warp, const int batch_size,
+                  const int data_height, const int data_width,
+                  const int data_channels, const int num_sampling_points) {
+    // Set gradients to 0, because the kernel incrementally updates the
+    // tensor entries by adding partial contributions.
+    const int grad_warp_size = batch_size * num_sampling_points * 2;
+    const int grad_data_size =
+        batch_size * data_height * data_width * data_channels;
+
+    ::tensorflow::CudaLaunchConfig config =
+        ::tensorflow::GetCudaLaunchConfig(grad_warp_size, d);
+    ::tensorflow::
+        SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            grad_warp_size, grad_warp);
+
+    config = ::tensorflow::GetCudaLaunchConfig(grad_data_size, d);
+    ::tensorflow::
+        SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            grad_data_size, grad_data);
+
+    const int resampler_output_size =
+        batch_size * num_sampling_points * data_channels;
+    config = ::tensorflow::GetCudaLaunchConfig(resampler_output_size, d);
+    ResamplerGrad2DKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            data, warp, grad_output, grad_data, grad_warp, batch_size,
+            data_height, data_width, data_channels, num_sampling_points);
   }
 };
 
diff --git a/tensorflow/contrib/rnn/__init__.py b/tensorflow/contrib/rnn/__init__.py
index c568c6760fd67b1902b0c1e6dc1aa439cb63de9b..67f31785b57fddef67733c18c3b744322532c28c 100644
--- a/tensorflow/contrib/rnn/__init__.py
+++ b/tensorflow/contrib/rnn/__init__.py
@@ -18,6 +18,7 @@ See @{$python/contrib.rnn} guide.
 
 <!--From core-->
 @@RNNCell
+@@LayerRNNCell
 @@BasicRNNCell
 @@BasicLSTMCell
 @@GRUCell
@@ -68,6 +69,10 @@ See @{$python/contrib.rnn} guide.
 @@static_bidirectional_rnn
 @@stack_bidirectional_dynamic_rnn
 @@stack_bidirectional_rnn
+
+<!--RNN utilities-->
+@@transpose_batch_time
+@@best_effort_input_batch_size
 """
 
 from __future__ import absolute_import
@@ -85,6 +90,8 @@ from tensorflow.contrib.rnn.python.ops.lstm_ops import *
 from tensorflow.contrib.rnn.python.ops.rnn import *
 from tensorflow.contrib.rnn.python.ops.rnn_cell import *
 
+from tensorflow.python.ops.rnn import _best_effort_input_batch_size as best_effort_input_batch_size
+from tensorflow.python.ops.rnn import _transpose_batch_time as transpose_batch_time
 from tensorflow.python.ops.rnn import static_bidirectional_rnn
 from tensorflow.python.ops.rnn import static_rnn
 from tensorflow.python.ops.rnn import static_state_saving_rnn
diff --git a/tensorflow/contrib/rnn/kernels/blas_gemm.cc b/tensorflow/contrib/rnn/kernels/blas_gemm.cc
index e62501e9b100484a7be3cc6ae0fc25905c0d0724..03006dab323a7c6dc83d9a17c035ef705f7b0366 100644
--- a/tensorflow/contrib/rnn/kernels/blas_gemm.cc
+++ b/tensorflow/contrib/rnn/kernels/blas_gemm.cc
@@ -36,11 +36,10 @@ perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
 
 namespace functor {
 template <typename T>
-void TensorCuBlasGemm<T>::operator()(OpKernelContext* ctx,
-                                     bool transa, bool transb, uint64 m,
-                                     uint64 n, uint64 k, T alpha, const T* a,
-                                     int lda, const T* b, int ldb, T beta, T* c,
-                                     int ldc) {
+void TensorCuBlasGemm<T>::operator()(OpKernelContext* ctx, bool transa,
+                                     bool transb, uint64 m, uint64 n, uint64 k,
+                                     T alpha, const T* a, int lda, const T* b,
+                                     int ldb, T beta, T* c, int ldc) {
 #if GOOGLE_CUDA
   perftools::gputools::blas::Transpose trans[] = {
       perftools::gputools::blas::Transpose::kNoTranspose,
diff --git a/tensorflow/contrib/rnn/kernels/blas_gemm.h b/tensorflow/contrib/rnn/kernels/blas_gemm.h
index e33eceadff17fc3811f98fc29b3cb916b6a79766..a52c934233af3dc63e1a60d70fac6a9eba6a655b 100644
--- a/tensorflow/contrib/rnn/kernels/blas_gemm.h
+++ b/tensorflow/contrib/rnn/kernels/blas_gemm.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_RNN_KERNELS_BLAS_GEMM_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_RNN_KERNELS_BLAS_GEMM_H_
+#ifndef TENSORFLOW_CONTRIB_RNN_KERNELS_BLAS_GEMM_H_
+#define TENSORFLOW_CONTRIB_RNN_KERNELS_BLAS_GEMM_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -74,4 +74,4 @@ struct TensorBlasGemm<Device, T, false /* USE_CUBLAS */> {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_RNN_KERNELS_BLAS_GEMM_H_
+#endif  // TENSORFLOW_CONTRIB_RNN_KERNELS_BLAS_GEMM_H_
diff --git a/tensorflow/contrib/rnn/kernels/gru_ops.cc b/tensorflow/contrib/rnn/kernels/gru_ops.cc
index 0796f82b214620dd71d154fb8f8ec953dbcbb9ec..bd3d898fb09da0f490050c85b1e585502d8ecb2c 100644
--- a/tensorflow/contrib/rnn/kernels/gru_ops.cc
+++ b/tensorflow/contrib/rnn/kernels/gru_ops.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/contrib/rnn/kernels/gru_ops.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 
 namespace tensorflow {
@@ -61,9 +61,9 @@ class GRUCellBlockOp : public OpKernel {
                                         h_prev_tensor->dim_size(0), " vs. ",
                                         batch_size));
     OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size,
-                errors::InvalidArgument("h_prev.dims(1) != cell_size: ",
-                                        h_prev_tensor->dim_size(1), " vs. ",
-                                        cell_size));
+                errors::InvalidArgument(
+                    "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     // Shape of 'w_ru' must be [input_size+cell_size, 2*cell_size]
     OP_REQUIRES(ctx, w_ru_tensor->dim_size(0) == input_size + cell_size,
@@ -82,10 +82,10 @@ class GRUCellBlockOp : public OpKernel {
                     "w_c.dim_size(0) != input_size + cell_size: ",
                     w_c_tensor->dim_size(0), " vs. ", input_size + cell_size));
 
-    OP_REQUIRES(
-        ctx, w_c_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("w_c.dim_size(1) != cell_size: ",
-                                w_c_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, w_c_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "w_c.dim_size(1) != cell_size: ", w_c_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     // Shape of 'b_ru' must be [2*cell_size]
     OP_REQUIRES(ctx, b_ru_tensor->dim_size(0) == cell_size * 2,
@@ -97,10 +97,10 @@ class GRUCellBlockOp : public OpKernel {
                 errors::InvalidArgument("Rank of b_ru must be 1",
                                         b_ru_tensor->dims(), " vs. 1", 1));
     // Shape of 'b_c' must be [cell_size]
-    OP_REQUIRES(
-        ctx, b_c_tensor->dim_size(0) == cell_size,
-        errors::InvalidArgument("b_c.dim_size(0) != cell_size: ",
-                                b_c_tensor->dim_size(0), " vs. ", cell_size));
+    OP_REQUIRES(ctx, b_c_tensor->dim_size(0) == cell_size,
+                errors::InvalidArgument(
+                    "b_c.dim_size(0) != cell_size: ", b_c_tensor->dim_size(0),
+                    " vs. ", cell_size));
     OP_REQUIRES(ctx, b_c_tensor->dims() == 1,
                 errors::InvalidArgument("Rank of b_c must be 1",
                                         b_c_tensor->dims(), " vs. 1"));
@@ -216,9 +216,9 @@ class GRUBlockCellGradOp : public OpKernel {
                                         h_prev_tensor->dim_size(0), " vs. ",
                                         batch_size));
     OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size,
-                errors::InvalidArgument("h_prev.dims(1) != cell_size: ",
-                                        h_prev_tensor->dim_size(1), " vs. ",
-                                        cell_size));
+                errors::InvalidArgument(
+                    "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     // Shape of 'w_ru' must be [input_size+cell_size, 2*cell_size]
     OP_REQUIRES(ctx, w_ru_tensor->dim_size(0) == input_size + cell_size,
@@ -237,10 +237,10 @@ class GRUBlockCellGradOp : public OpKernel {
                     "w_c.dim_size(0) != input_size + cell_size: ",
                     w_c_tensor->dim_size(0), " vs. ", input_size + cell_size));
 
-    OP_REQUIRES(
-        ctx, w_c_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("w_c.dim_size(1) != cell_size: ",
-                                w_c_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, w_c_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "w_c.dim_size(1) != cell_size: ", w_c_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     // Shape of 'b_ru' must be [2*cell_size]
     OP_REQUIRES(ctx, b_ru_tensor->dim_size(0) == cell_size * 2,
@@ -253,54 +253,54 @@ class GRUBlockCellGradOp : public OpKernel {
                                         b_ru_tensor->dims(), " vs. 1"));
 
     // Shape of 'b_c' must be [cell_size]
-    OP_REQUIRES(
-        ctx, b_c_tensor->dim_size(0) == cell_size,
-        errors::InvalidArgument("b_c.dim_size(0) != cell_size: ",
-                                b_c_tensor->dim_size(0), " vs. ", cell_size));
+    OP_REQUIRES(ctx, b_c_tensor->dim_size(0) == cell_size,
+                errors::InvalidArgument(
+                    "b_c.dim_size(0) != cell_size: ", b_c_tensor->dim_size(0),
+                    " vs. ", cell_size));
 
     OP_REQUIRES(ctx, b_c_tensor->dims() == 1,
                 errors::InvalidArgument("Rank of b_c must be 1 ",
                                         b_c_tensor->dims(), " vs. 1"));
 
     // Shape of 'r' must be [batch_size, cell_size]
-    OP_REQUIRES(
-        ctx, r_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("r.dims(0) != batch_size: ",
-                                r_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, r_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("r.dims(1) != cell_size: ",
-                                r_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, r_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "r.dims(0) != batch_size: ", r_tensor->dim_size(0), " vs. ",
+                    batch_size));
+    OP_REQUIRES(ctx, r_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "r.dims(1) != cell_size: ", r_tensor->dim_size(1), " vs. ",
+                    cell_size));
 
     // Shape of 'u' must be [batch_size, cell_size]
-    OP_REQUIRES(
-        ctx, u_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("u.dims(0) != batch_size: ",
-                                u_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, u_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("u.dims(1) != cell_size: ",
-                                u_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, u_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "u.dims(0) != batch_size: ", u_tensor->dim_size(0), " vs. ",
+                    batch_size));
+    OP_REQUIRES(ctx, u_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "u.dims(1) != cell_size: ", u_tensor->dim_size(1), " vs. ",
+                    cell_size));
 
     // Shape of 'c' must be [batch_size, cell_size]
-    OP_REQUIRES(
-        ctx, c_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("c.dims(0) != batch_size: ",
-                                c_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, c_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("c.dims(1) != cell_size: ",
-                                c_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, c_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "c.dims(0) != batch_size: ", c_tensor->dim_size(0), " vs. ",
+                    batch_size));
+    OP_REQUIRES(ctx, c_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "c.dims(1) != cell_size: ", c_tensor->dim_size(1), " vs. ",
+                    cell_size));
 
     // Shape of 'd_h' must be [batch_size, cell_size]
-    OP_REQUIRES(
-        ctx, d_h_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("d_h.dims(0) != batch_size: ",
-                                d_h_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, d_h_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("d_h.dims(1) != cell_size: ",
-                                d_h_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, d_h_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "d_h.dims(0) != batch_size: ", d_h_tensor->dim_size(0),
+                    " vs. ", batch_size));
+    OP_REQUIRES(ctx, d_h_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "d_h.dims(1) != cell_size: ", d_h_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     // Create output tensors.
     Tensor* d_x_tensor = nullptr;
diff --git a/tensorflow/contrib/rnn/kernels/gru_ops.h b/tensorflow/contrib/rnn/kernels/gru_ops.h
index 06a566506296dd658a01bb3038407f77a32cde84..3e2cb39e64bb3f0b22ea66c5601af36c5fb9b0fd 100644
--- a/tensorflow/contrib/rnn/kernels/gru_ops.h
+++ b/tensorflow/contrib/rnn/kernels/gru_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_RNN_KERNELS_GRU_OPS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_RNN_KERNELS_GRU_OPS_H_
+#ifndef TENSORFLOW_CONTRIB_RNN_KERNELS_GRU_OPS_H_
+#define TENSORFLOW_CONTRIB_RNN_KERNELS_GRU_OPS_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/contrib/rnn/kernels/blas_gemm.h"
@@ -181,4 +181,4 @@ struct GRUBlockCellBprop : public GRUCell {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_RNN_KERNELS_GRU_OPS_H_
+#endif  // TENSORFLOW_CONTRIB_RNN_KERNELS_GRU_OPS_H_
diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops.cc b/tensorflow/contrib/rnn/kernels/lstm_ops.cc
index 941a457fd3ada312b981fb23c769ff9ecea9ff13..5e7cf0ce84d332bd24088cd78995f7843813328b 100644
--- a/tensorflow/contrib/rnn/kernels/lstm_ops.cc
+++ b/tensorflow/contrib/rnn/kernels/lstm_ops.cc
@@ -281,23 +281,23 @@ class LSTMBlockCellOp : public OpKernel {
                                         h_prev_tensor->dim_size(0), " vs. ",
                                         batch_size));
     OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size,
-                errors::InvalidArgument("h_prev.dims(1) != cell_size: ",
-                                        h_prev_tensor->dim_size(1), " vs. ",
-                                        cell_size));
+                errors::InvalidArgument(
+                    "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     OP_REQUIRES(ctx, w_tensor->dim_size(0) == input_size + cell_size,
                 errors::InvalidArgument(
                     "w.dim_size(0) != input_size + cell_size: ",
                     w_tensor->dim_size(0), " vs. ", input_size + cell_size));
-    OP_REQUIRES(
-        ctx, w_tensor->dim_size(1) == cell_size * 4,
-        errors::InvalidArgument("w.dim_size(1) != cell_size * 4: ",
-                                w_tensor->dim_size(1), " vs. ", cell_size * 4));
+    OP_REQUIRES(ctx, w_tensor->dim_size(1) == cell_size * 4,
+                errors::InvalidArgument(
+                    "w.dim_size(1) != cell_size * 4: ", w_tensor->dim_size(1),
+                    " vs. ", cell_size * 4));
 
-    OP_REQUIRES(
-        ctx, b_tensor->dim_size(0) == cell_size * 4,
-        errors::InvalidArgument("b.dim_size(0) != cell_size * 4: ",
-                                b_tensor->dim_size(0), " vs. ", cell_size * 4));
+    OP_REQUIRES(ctx, b_tensor->dim_size(0) == cell_size * 4,
+                errors::InvalidArgument(
+                    "b.dim_size(0) != cell_size * 4: ", b_tensor->dim_size(0),
+                    " vs. ", cell_size * 4));
 
     // Allocate our output tensors.
     Tensor* i_tensor = nullptr;
@@ -484,77 +484,77 @@ class LSTMBlockCellGradOp : public OpKernel {
                                         h_prev_tensor->dim_size(0), " vs. ",
                                         batch_size));
     OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size,
-                errors::InvalidArgument("h_prev.dims(1) != cell_size: ",
-                                        h_prev_tensor->dim_size(1), " vs. ",
-                                        cell_size));
+                errors::InvalidArgument(
+                    "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     OP_REQUIRES(ctx, w_tensor->dim_size(0) == input_size + cell_size,
                 errors::InvalidArgument(
                     "w.dim_size(0) != input_size + cell_size: ",
                     w_tensor->dim_size(0), " vs. ", input_size + cell_size));
-    OP_REQUIRES(
-        ctx, w_tensor->dim_size(1) == cell_size * 4,
-        errors::InvalidArgument("w.dim_size(1) != cell_size * 4: ",
-                                w_tensor->dim_size(1), " vs. ", cell_size * 4));
+    OP_REQUIRES(ctx, w_tensor->dim_size(1) == cell_size * 4,
+                errors::InvalidArgument(
+                    "w.dim_size(1) != cell_size * 4: ", w_tensor->dim_size(1),
+                    " vs. ", cell_size * 4));
 
-    OP_REQUIRES(
-        ctx, b_tensor->dim_size(0) == cell_size * 4,
-        errors::InvalidArgument("b.dim_size(0) != cell_size * 4: ",
-                                b_tensor->dim_size(0), " vs. ", cell_size * 4));
+    OP_REQUIRES(ctx, b_tensor->dim_size(0) == cell_size * 4,
+                errors::InvalidArgument(
+                    "b.dim_size(0) != cell_size * 4: ", b_tensor->dim_size(0),
+                    " vs. ", cell_size * 4));
 
-    OP_REQUIRES(
-        ctx, i_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("i.dim_size(0) != batch_size: ",
-                                i_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, i_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("i.dim_size(1) != cell_size: ",
-                                i_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, i_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "i.dim_size(0) != batch_size: ", i_tensor->dim_size(0),
+                    " vs. ", batch_size));
+    OP_REQUIRES(ctx, i_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "i.dim_size(1) != cell_size: ", i_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
-    OP_REQUIRES(
-        ctx, cs_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("cs.dim_size(0) != batch_size: ",
-                                cs_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, cs_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("cs.dim_size(1) != cell_size: ",
-                                cs_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, cs_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "cs.dim_size(0) != batch_size: ", cs_tensor->dim_size(0),
+                    " vs. ", batch_size));
+    OP_REQUIRES(ctx, cs_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "cs.dim_size(1) != cell_size: ", cs_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
-    OP_REQUIRES(
-        ctx, f_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("f.dim_size(0) != batch_size: ",
-                                f_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, f_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("i.dim_size(1) != cell_size: ",
-                                f_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, f_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "f.dim_size(0) != batch_size: ", f_tensor->dim_size(0),
+                    " vs. ", batch_size));
+    OP_REQUIRES(ctx, f_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "i.dim_size(1) != cell_size: ", f_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
-    OP_REQUIRES(
-        ctx, o_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("o.dim_size(0) != batch_size: ",
-                                o_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, o_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("o.dim_size(1) != cell_size: ",
-                                o_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, o_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "o.dim_size(0) != batch_size: ", o_tensor->dim_size(0),
+                    " vs. ", batch_size));
+    OP_REQUIRES(ctx, o_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "o.dim_size(1) != cell_size: ", o_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
-    OP_REQUIRES(
-        ctx, ci_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("ci.dim_size(0) != batch_size: ",
-                                ci_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, ci_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("ci.dim_size(1) != cell_size: ",
-                                ci_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, ci_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "ci.dim_size(0) != batch_size: ", ci_tensor->dim_size(0),
+                    " vs. ", batch_size));
+    OP_REQUIRES(ctx, ci_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "ci.dim_size(1) != cell_size: ", ci_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
-    OP_REQUIRES(
-        ctx, co_tensor->dim_size(0) == batch_size,
-        errors::InvalidArgument("co.dim_size(0) != batch_size: ",
-                                co_tensor->dim_size(0), " vs. ", batch_size));
-    OP_REQUIRES(
-        ctx, co_tensor->dim_size(1) == cell_size,
-        errors::InvalidArgument("co.dim_size(1) != cell_size: ",
-                                co_tensor->dim_size(1), " vs. ", cell_size));
+    OP_REQUIRES(ctx, co_tensor->dim_size(0) == batch_size,
+                errors::InvalidArgument(
+                    "co.dim_size(0) != batch_size: ", co_tensor->dim_size(0),
+                    " vs. ", batch_size));
+    OP_REQUIRES(ctx, co_tensor->dim_size(1) == cell_size,
+                errors::InvalidArgument(
+                    "co.dim_size(1) != cell_size: ", co_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     OP_REQUIRES(ctx, cs_grad_tensor->dim_size(0) == batch_size,
                 errors::InvalidArgument(
@@ -860,9 +860,9 @@ class BlockLSTMOp : public OpKernel {
                                         h_prev_tensor->dim_size(0), " vs. ",
                                         batch_size));
     OP_REQUIRES(ctx, h_prev_tensor->dim_size(1) == cell_size,
-                errors::InvalidArgument("h_prev.dims(1) != cell_size: ",
-                                        h_prev_tensor->dim_size(1), " vs. ",
-                                        cell_size));
+                errors::InvalidArgument(
+                    "h_prev.dims(1) != cell_size: ", h_prev_tensor->dim_size(1),
+                    " vs. ", cell_size));
 
     const Tensor* w_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->input("w", &w_tensor));
@@ -872,46 +872,46 @@ class BlockLSTMOp : public OpKernel {
                 errors::InvalidArgument(
                     "w.dim_size(0) != input_size + cell_size: ",
                     w_tensor->dim_size(0), " vs. ", input_size + cell_size));
-    OP_REQUIRES(
-        ctx, w_tensor->dim_size(1) == cell_size * 4,
-        errors::InvalidArgument("w.dim_size(1) != cell_size * 4: ",
-                                w_tensor->dim_size(1), " vs. ", cell_size * 4));
+    OP_REQUIRES(ctx, w_tensor->dim_size(1) == cell_size * 4,
+                errors::InvalidArgument(
+                    "w.dim_size(1) != cell_size * 4: ", w_tensor->dim_size(1),
+                    " vs. ", cell_size * 4));
 
     const Tensor* wci_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->input("wci", &wci_tensor));
     OP_REQUIRES(ctx, wci_tensor->dims() == 1,
                 errors::InvalidArgument("wci must be 1D"));
-    OP_REQUIRES(
-        ctx, wci_tensor->dim_size(0) == cell_size,
-        errors::InvalidArgument("wci.dim_size(0) != cell_size: ",
-                                wci_tensor->dim_size(0), " vs. ", cell_size));
+    OP_REQUIRES(ctx, wci_tensor->dim_size(0) == cell_size,
+                errors::InvalidArgument(
+                    "wci.dim_size(0) != cell_size: ", wci_tensor->dim_size(0),
+                    " vs. ", cell_size));
 
     const Tensor* wcf_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->input("wcf", &wcf_tensor));
     OP_REQUIRES(ctx, wcf_tensor->dims() == 1,
                 errors::InvalidArgument("wcf must be 1D"));
-    OP_REQUIRES(
-        ctx, wcf_tensor->dim_size(0) == cell_size,
-        errors::InvalidArgument("wcf.dim_size(0) != cell_size: ",
-                                wcf_tensor->dim_size(0), " vs. ", cell_size));
+    OP_REQUIRES(ctx, wcf_tensor->dim_size(0) == cell_size,
+                errors::InvalidArgument(
+                    "wcf.dim_size(0) != cell_size: ", wcf_tensor->dim_size(0),
+                    " vs. ", cell_size));
 
     const Tensor* wco_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->input("wco", &wco_tensor));
     OP_REQUIRES(ctx, wco_tensor->dims() == 1,
                 errors::InvalidArgument("wco must be 1D"));
-    OP_REQUIRES(
-        ctx, wco_tensor->dim_size(0) == cell_size,
-        errors::InvalidArgument("wco.dim_size(0) != cell_size: ",
-                                wco_tensor->dim_size(0), " vs. ", cell_size));
+    OP_REQUIRES(ctx, wco_tensor->dim_size(0) == cell_size,
+                errors::InvalidArgument(
+                    "wco.dim_size(0) != cell_size: ", wco_tensor->dim_size(0),
+                    " vs. ", cell_size));
 
     const Tensor* b_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->input("b", &b_tensor));
     OP_REQUIRES(ctx, b_tensor->dims() == 1,
                 errors::InvalidArgument("b must be 1D"));
-    OP_REQUIRES(
-        ctx, b_tensor->dim_size(0) == cell_size * 4,
-        errors::InvalidArgument("b.dim_size(0) != cell_size * 4: ",
-                                b_tensor->dim_size(0), " vs. ", cell_size * 4));
+    OP_REQUIRES(ctx, b_tensor->dim_size(0) == cell_size * 4,
+                errors::InvalidArgument(
+                    "b.dim_size(0) != cell_size * 4: ", b_tensor->dim_size(0),
+                    " vs. ", cell_size * 4));
 
     TensorShape batch_cell_shape({timelen, batch_size, cell_size});
     Tensor* i_out;
@@ -1065,9 +1065,9 @@ class BlockLSTMGradOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->input("w", &w_tensor));
     const int64 cell_size = w_tensor->dim_size(1) / 4;
     OP_REQUIRES(ctx, input_size + cell_size == w_tensor->dim_size(0),
-                errors::InvalidArgument("w matrix rows don't match: ",
-                                        input_size + cell_size, " vs. ",
-                                        w_tensor->dim_size(0)));
+                errors::InvalidArgument(
+                    "w matrix rows don't match: ", input_size + cell_size,
+                    " vs. ", w_tensor->dim_size(0)));
 
     const Tensor* wci_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->input("wci", &wci_tensor));
@@ -1193,7 +1193,6 @@ class BlockLSTMGradOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::v(),
                                            batch_cell_shape, &h_grad_tensor));
 
-
     const Device& device = ctx->eigen_device<Device>();
 
     functor::TensorZero<Device, T>()(device, cs_grad_tensor.flat<float>());
diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops.h b/tensorflow/contrib/rnn/kernels/lstm_ops.h
index 1906581b16b2e76243320bc67c8ac831323fb8e7..d23cedc234b8c0e1a784346f28164ae79b8cbf89 100644
--- a/tensorflow/contrib/rnn/kernels/lstm_ops.h
+++ b/tensorflow/contrib/rnn/kernels/lstm_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_RNN_KERNELS_LSTM_OPS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_RNN_KERNELS_LSTM_OPS_H_
+#ifndef TENSORFLOW_CONTRIB_RNN_KERNELS_LSTM_OPS_H_
+#define TENSORFLOW_CONTRIB_RNN_KERNELS_LSTM_OPS_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/contrib/rnn/kernels/blas_gemm.h"
@@ -92,7 +92,6 @@ struct TensorZeroPadding {
   }
 };
 
-
 struct LSTMBlockCell {
   LSTMBlockCell(const int batch_size, const int input_size, const int cell_size)
       : batch_size_(batch_size),
@@ -291,4 +290,4 @@ struct BlockLSTMBprop : public LSTMBlockCell {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_RNN_KERNELS_LSTM_OPS_H_
+#endif  // TENSORFLOW_CONTRIB_RNN_KERNELS_LSTM_OPS_H_
diff --git a/tensorflow/contrib/rnn/ops/lstm_ops_test.cc b/tensorflow/contrib/rnn/ops/lstm_ops_test.cc
index 544cd163c50062093acf7f5e942f67606936c0e3..68184b643e5e7a04ffecb804703051638514b7b2 100644
--- a/tensorflow/contrib/rnn/ops/lstm_ops_test.cc
+++ b/tensorflow/contrib/rnn/ops/lstm_ops_test.cc
@@ -149,8 +149,9 @@ TEST_F(LSTMOpsTest, BlockLSTMGrad_ShapeFn) {
   INFER_ERROR("must be rank 1", op, "?;?;?;?;?;?;?;?;[1,?]" + suffix);
 
   // Output with all input knowns makes known rank outputs.
-  INFER_OK(op, JoinedCopies("?", 18), "[?,?,?];" + JoinedCopies("[?,?]", 3) +
-                                          ";" + JoinedCopies("[?]", 4));
+  INFER_OK(
+      op, JoinedCopies("?", 18),
+      "[?,?,?];" + JoinedCopies("[?,?]", 3) + ";" + JoinedCopies("[?]", 4));
 
   // Output with copies input shapes to output.
   string input = strings::StrCat("?;[?,?,?];", JoinedCopies("[?,?]", 3), ";",
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
index cfecab00440ed72f385de8b9cd41fa689ae7b5eb..0e62b315b61cb3ceeb5cfd33bf5102a71abef83b 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@@ -39,10 +39,6 @@ from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
-from tensorflow.python.framework import test_util
-from tensorflow.contrib.rnn.python.ops import rnn_cell as contrib_rnn_cell
-
-
 
 # pylint: enable=protected-access
 Linear = core_rnn_cell._Linear  # pylint: disable=invalid-name
@@ -85,19 +81,22 @@ class RNNCellTest(test.TestCase):
         ], [v.name for v in cell.trainable_variables])
         self.assertFalse(cell.non_trainable_variables)
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g], {x.name: np.array([[1., 1.]]),
-                  m.name: np.array([[0.1, 0.1]])})
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
         self.assertEqual(res[0].shape, (1, 2))
 
   def testBasicRNNCellNotTrainable(self):
     with self.test_session() as sess:
+
       def not_trainable_getter(getter, *args, **kwargs):
         kwargs["trainable"] = False
         return getter(*args, **kwargs)
 
       with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5),
+          "root",
+          initializer=init_ops.constant_initializer(0.5),
           custom_getter=not_trainable_getter):
         x = array_ops.zeros([1, 2])
         m = array_ops.zeros([1, 2])
@@ -109,9 +108,10 @@ class RNNCellTest(test.TestCase):
             "root/basic_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME
         ], [v.name for v in cell.non_trainable_variables])
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g], {x.name: np.array([[1., 1.]]),
-                  m.name: np.array([[0.1, 0.1]])})
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
         self.assertEqual(res[0].shape, (1, 2))
 
   def testGRUCell(self):
@@ -122,9 +122,10 @@ class RNNCellTest(test.TestCase):
         m = array_ops.zeros([1, 2])
         g, _ = rnn_cell_impl.GRUCell(2)(x, m)
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g], {x.name: np.array([[1., 1.]]),
-                  m.name: np.array([[0.1, 0.1]])})
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
         # Smoke test
         self.assertAllClose(res[0], [[0.175991, 0.175991]])
       with variable_scope.variable_scope(
@@ -134,13 +135,43 @@ class RNNCellTest(test.TestCase):
         m = array_ops.zeros([1, 2])
         g, _ = rnn_cell_impl.GRUCell(2)(x, m)
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g],
-            {x.name: np.array([[1., 1., 1.]]),
-             m.name: np.array([[0.1, 0.1]])})
+        res = sess.run([g], {
+            x.name: np.array([[1., 1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
         # Smoke test
         self.assertAllClose(res[0], [[0.156736, 0.156736]])
 
+  def testSRUCell(self):
+    with self.test_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        g, _ = contrib_rnn_cell.SRUCell(2)(x, m)
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.509682, 0.509682]])
+
+  def testSRUCellWithDiffSize(self):
+    with self.test_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 3])
+        m = array_ops.zeros([1, 2])
+        g, _ = contrib_rnn_cell.SRUCell(2)(x, m)
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.55255556, 0.55255556]])
+
   def testBasicLSTMCell(self):
     for dtype in [dtypes.float16, dtypes.float32]:
       np_dtype = dtype.as_numpy_dtype
@@ -151,8 +182,7 @@ class RNNCellTest(test.TestCase):
           m = array_ops.zeros([1, 8], dtype=dtype)
           cell = rnn_cell_impl.MultiRNNCell(
               [
-                  rnn_cell_impl.BasicLSTMCell(
-                      2, state_is_tuple=False)
+                  rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)
                   for _ in range(2)
               ],
               state_is_tuple=False)
@@ -170,22 +200,21 @@ class RNNCellTest(test.TestCase):
               "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" %
               rnn_cell_impl._BIAS_VARIABLE_NAME
           ]
-          self.assertEqual(
-              expected_variable_names,
-              [v.name for v in cell.trainable_variables])
+          self.assertEqual(expected_variable_names,
+                           [v.name for v in cell.trainable_variables])
           self.assertFalse(cell.non_trainable_variables)
           sess.run([variables_lib.global_variables_initializer()])
-          res = sess.run(
-              [g, out_m],
-              {x.name: np.array([[1., 1.]]),
-               m.name: 0.1 * np.ones([1, 8])})
+          res = sess.run([g, out_m], {
+              x.name: np.array([[1., 1.]]),
+              m.name: 0.1 * np.ones([1, 8])
+          })
           self.assertEqual(len(res), 2)
           variables = variables_lib.global_variables()
           self.assertEqual(expected_variable_names, [v.name for v in variables])
           # The numbers in results were not calculated, this is just a
           # smoke test.
-          self.assertAllClose(
-              res[0], np.array([[0.240, 0.240]], dtype=np_dtype), 1e-2)
+          self.assertAllClose(res[0], np.array(
+              [[0.240, 0.240]], dtype=np_dtype), 1e-2)
           expected_mem = np.array(
               [[0.689, 0.689, 0.448, 0.448, 0.398, 0.398, 0.240, 0.240]],
               dtype=np_dtype)
@@ -195,13 +224,13 @@ class RNNCellTest(test.TestCase):
           # Test BasicLSTMCell with input_size != num_units.
           x = array_ops.zeros([1, 3], dtype=dtype)
           m = array_ops.zeros([1, 4], dtype=dtype)
-          g, out_m = rnn_cell_impl.BasicLSTMCell(
-              2, state_is_tuple=False)(x, m)
+          g, out_m = rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)(x, m)
           sess.run([variables_lib.global_variables_initializer()])
           res = sess.run(
-              [g, out_m],
-              {x.name: np.array([[1., 1., 1.]], dtype=np_dtype),
-               m.name: 0.1 * np.ones([1, 4], dtype=np_dtype)})
+              [g, out_m], {
+                  x.name: np.array([[1., 1., 1.]], dtype=np_dtype),
+                  m.name: 0.1 * np.ones([1, 4], dtype=np_dtype)
+              })
           self.assertEqual(len(res), 2)
 
   def testBasicLSTMCellDimension0Error(self):
@@ -219,9 +248,11 @@ class RNNCellTest(test.TestCase):
           g, out_m = rnn_cell_impl.BasicLSTMCell(
               num_units, state_is_tuple=False)(x, m)
           sess.run([variables_lib.global_variables_initializer()])
-          sess.run([g, out_m],
-                   {x.name: 1 * np.ones([batch_size, input_size]),
-                    m.name: 0.1 * np.ones([batch_size - 1, state_size])})
+          sess.run(
+              [g, out_m], {
+                  x.name: 1 * np.ones([batch_size, input_size]),
+                  m.name: 0.1 * np.ones([batch_size - 1, state_size])
+              })
 
   def testBasicLSTMCellStateSizeError(self):
     """Tests that state_size must be num_units * 2."""
@@ -238,9 +269,11 @@ class RNNCellTest(test.TestCase):
           g, out_m = rnn_cell_impl.BasicLSTMCell(
               num_units, state_is_tuple=False)(x, m)
           sess.run([variables_lib.global_variables_initializer()])
-          sess.run([g, out_m],
-                   {x.name: 1 * np.ones([batch_size, input_size]),
-                    m.name: 0.1 * np.ones([batch_size, state_size])})
+          sess.run(
+              [g, out_m], {
+                  x.name: 1 * np.ones([batch_size, input_size]),
+                  m.name: 0.1 * np.ones([batch_size, state_size])
+              })
 
   def testBasicLSTMCellStateTupleType(self):
     with self.test_session():
@@ -288,11 +321,12 @@ class RNNCellTest(test.TestCase):
             state_is_tuple=True)
         g, (out_m0, out_m1) = cell(x, (m0, m1))
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g, out_m0, out_m1], {
-            x.name: np.array([[1., 1.]]),
-            m0.name: 0.1 * np.ones([1, 4]),
-            m1.name: 0.1 * np.ones([1, 4])
-        })
+        res = sess.run(
+            [g, out_m0, out_m1], {
+                x.name: np.array([[1., 1.]]),
+                m0.name: 0.1 * np.ones([1, 4]),
+                m1.name: 0.1 * np.ones([1, 4])
+            })
         self.assertEqual(len(res), 3)
         # The numbers in results were not calculated, this is just a smoke test.
         # Note, however, these values should match the original
@@ -323,10 +357,11 @@ class RNNCellTest(test.TestCase):
             state_is_tuple=False)
         output, state = cell(x, m)
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([output, state], {
-            x.name: np.array([[1., 1.], [2., 2.], [3., 3.]]),
-            m.name: 0.1 * np.ones((batch_size, state_size))
-        })
+        res = sess.run(
+            [output, state], {
+                x.name: np.array([[1., 1.], [2., 2.], [3., 3.]]),
+                m.name: 0.1 * np.ones((batch_size, state_size))
+            })
         self.assertEqual(len(res), 2)
         # The numbers in results were not calculated, this is mostly just a
         # smoke test.
@@ -429,10 +464,10 @@ class RNNCellTest(test.TestCase):
             rnn_cell_impl.GRUCell(3), num_proj=3)
         g, new_m = cell(x, m)
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g, new_m],
-            {x.name: np.array([[1., 1.]]),
-             m.name: np.array([[0.1, 0.1, 0.1]])})
+        res = sess.run([g, new_m], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1, 0.1]])
+        })
         self.assertEqual(res[1].shape, (1, 3))
         # The numbers in results were not calculated, this is just a smoke test.
         self.assertAllClose(res[0], [[0.154605, 0.154605, 0.154605]])
@@ -466,9 +501,11 @@ class RNNCellTest(test.TestCase):
         base_cell = rnn_cell_impl.GRUCell(3)
         g, m_new = base_cell(x, m)
         variable_scope.get_variable_scope().reuse_variables()
+
         def residual_with_slice_fn(inp, out):
           inp_sliced = array_ops.slice(inp, [0, 0], [-1, 3])
           return inp_sliced + out
+
         g_res, m_new_res = rnn_cell_impl.ResidualWrapper(
             base_cell, residual_with_slice_fn)(x, m)
         sess.run([variables_lib.global_variables_initializer()])
@@ -538,10 +575,10 @@ class RNNCellTest(test.TestCase):
         self.assertEqual(embedding_cell.output_size, 2)
         g, new_m = embedding_cell(x, m)
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g, new_m],
-            {x.name: np.array([[1]]),
-             m.name: np.array([[0.1, 0.1]])})
+        res = sess.run([g, new_m], {
+            x.name: np.array([[1]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
         self.assertEqual(res[1].shape, (1, 2))
         # The numbers in results were not calculated, this is just a smoke test.
         self.assertAllClose(res[0], [[0.17139, 0.17139]])
@@ -571,8 +608,8 @@ class RNNCellTest(test.TestCase):
         x = array_ops.zeros([1, 2])
         m = array_ops.zeros([1, 4])
         _, ml = rnn_cell_impl.MultiRNNCell(
-            [rnn_cell_impl.GRUCell(2)
-             for _ in range(2)], state_is_tuple=False)(x, m)
+            [rnn_cell_impl.GRUCell(2) for _ in range(2)],
+            state_is_tuple=False)(x, m)
         sess.run([variables_lib.global_variables_initializer()])
         res = sess.run(ml, {
             x.name: np.array([[1., 1.]]),
@@ -592,19 +629,20 @@ class RNNCellTest(test.TestCase):
         # Test incorrectness of state
         with self.assertRaisesRegexp(ValueError, "Expected state .* a tuple"):
           rnn_cell_impl.MultiRNNCell(
-              [rnn_cell_impl.GRUCell(2)
-               for _ in range(2)], state_is_tuple=True)(x, m_bad)
+              [rnn_cell_impl.GRUCell(2) for _ in range(2)],
+              state_is_tuple=True)(x, m_bad)
 
         _, ml = rnn_cell_impl.MultiRNNCell(
-            [rnn_cell_impl.GRUCell(2)
-             for _ in range(2)], state_is_tuple=True)(x, m_good)
+            [rnn_cell_impl.GRUCell(2) for _ in range(2)],
+            state_is_tuple=True)(x, m_good)
 
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(ml, {
-            x.name: np.array([[1., 1.]]),
-            m_good[0].name: np.array([[0.1, 0.1]]),
-            m_good[1].name: np.array([[0.1, 0.1]])
-        })
+        res = sess.run(
+            ml, {
+                x.name: np.array([[1., 1.]]),
+                m_good[0].name: np.array([[0.1, 0.1]]),
+                m_good[1].name: np.array([[0.1, 0.1]])
+            })
 
         # The numbers in results were not calculated, this is just a
         # smoke test.  However, these numbers should match those of
@@ -615,8 +653,11 @@ class RNNCellTest(test.TestCase):
 
 class DropoutWrapperTest(test.TestCase):
 
-  def _testDropoutWrapper(self, batch_size=None, time_steps=None,
-                          parallel_iterations=None, **kwargs):
+  def _testDropoutWrapper(self,
+                          batch_size=None,
+                          time_steps=None,
+                          parallel_iterations=None,
+                          **kwargs):
     with self.test_session() as sess:
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
@@ -627,14 +668,14 @@ class DropoutWrapperTest(test.TestCase):
           x = constant_op.constant(
               [[[2., 2., 2.]], [[1., 1., 1.]]], dtype=dtypes.float32)
           m = rnn_cell_impl.LSTMStateTuple(
-              *[constant_op.constant([[0.1, 0.1, 0.1]], dtype=dtypes.float32)
-               ] * 2)
+              *[constant_op.constant([[0.1, 0.1, 0.1]], dtype=dtypes.float32
+                                    )] * 2)
         else:
           x = constant_op.constant(
               np.random.randn(time_steps, batch_size, 3).astype(np.float32))
           m = rnn_cell_impl.LSTMStateTuple(*[
-              constant_op.constant(
-                  [[0.1, 0.1, 0.1]] * batch_size, dtype=dtypes.float32)
+              constant_op.
+              constant([[0.1, 0.1, 0.1]] * batch_size, dtype=dtypes.float32)
           ] * 2)
         outputs, final_state = rnn.dynamic_rnn(
             cell=rnn_cell_impl.DropoutWrapper(
@@ -650,13 +691,19 @@ class DropoutWrapperTest(test.TestCase):
         self.assertEqual(res[1].h.shape, (batch_size, 3))
         return res
 
+  def testWrappedCellProperty(self):
+    cell = rnn_cell_impl.BasicRNNCell(10)
+    wrapper = rnn_cell_impl.DropoutWrapper(cell)
+    # Github issue 15810
+    self.assertEqual(wrapper.wrapped_cell, cell)
+
   def testDropoutWrapperKeepAllConstantInput(self):
     keep = array_ops.ones([])
     res = self._testDropoutWrapper(
         input_keep_prob=keep, output_keep_prob=keep, state_keep_prob=keep)
     true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]],
-         [[0.895509, 0.895509, 0.895509]]], dtype=np.float32)
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
     true_full_final_c = np.array(
         [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
     self.assertAllClose(true_full_output, res[0])
@@ -668,8 +715,8 @@ class DropoutWrapperTest(test.TestCase):
     res = self._testDropoutWrapper(
         input_keep_prob=keep, output_keep_prob=keep, state_keep_prob=keep)
     true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]],
-         [[0.895509, 0.895509, 0.895509]]], dtype=np.float32)
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
     true_full_final_c = np.array(
         [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
     self.assertAllClose(true_full_output, res[0])
@@ -684,16 +731,20 @@ class DropoutWrapperTest(test.TestCase):
     ## consistent across both calls.  Otherwise the seed may not end
     ## up being munged consistently across both graphs.
     res_standard_1 = self._testDropoutWrapper(
-        input_keep_prob=keep_some, output_keep_prob=keep_some,
-        state_keep_prob=keep_some, seed=10,
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        seed=10,
         parallel_iterations=1)
     # Clear away the graph and the test session (which keeps variables around)
     ops.reset_default_graph()
     self._ClearCachedSession()
     random_seed.set_random_seed(2)
     res_standard_2 = self._testDropoutWrapper(
-        input_keep_prob=keep_some, output_keep_prob=keep_some,
-        state_keep_prob=keep_some, seed=10,
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        seed=10,
         parallel_iterations=1)
     self.assertAllClose(res_standard_1[0], res_standard_2[0])
     self.assertAllClose(res_standard_1[1].c, res_standard_2[1].c)
@@ -703,11 +754,12 @@ class DropoutWrapperTest(test.TestCase):
     keep_all = variable_scope.get_variable("all", initializer=1.0)
     keep_none = variable_scope.get_variable("none", initializer=1e-10)
     res = self._testDropoutWrapper(
-        input_keep_prob=keep_all, output_keep_prob=keep_none,
+        input_keep_prob=keep_all,
+        output_keep_prob=keep_none,
         state_keep_prob=keep_all)
     true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]],
-         [[0.895509, 0.895509, 0.895509]]], dtype=np.float32)
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
     true_full_final_c = np.array(
         [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
     self.assertAllClose(np.zeros(res[0].shape), res[0])
@@ -720,13 +772,13 @@ class DropoutWrapperTest(test.TestCase):
     # Even though we dropout state, by default DropoutWrapper never
     # drops out the memory ("c") term of an LSTMStateTuple.
     res = self._testDropoutWrapper(
-        input_keep_prob=keep_all, output_keep_prob=keep_all,
+        input_keep_prob=keep_all,
+        output_keep_prob=keep_all,
         state_keep_prob=keep_none)
-    true_c_state = np.array(
-        [[1.713925, 1.713925, 1.713925]], dtype=np.float32)
+    true_c_state = np.array([[1.713925, 1.713925, 1.713925]], dtype=np.float32)
     true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]],
-         [[0.895509, 0.895509, 0.895509]]], dtype=np.float32)
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
     self.assertAllClose(true_full_output[0], res[0][0])
     # Second output is modified by zero input state
     self.assertGreater(np.linalg.norm(true_full_output[1] - res[0][1]), 1e-4)
@@ -739,13 +791,14 @@ class DropoutWrapperTest(test.TestCase):
     keep_all = variable_scope.get_variable("all", initializer=1.0)
     keep_none = variable_scope.get_variable("none", initializer=1e-10)
     true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]],
-         [[0.895509, 0.895509, 0.895509]]], dtype=np.float32)
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
     true_full_final_c = np.array(
         [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
     # All outputs are different because inputs are zeroed out
     res = self._testDropoutWrapper(
-        input_keep_prob=keep_none, output_keep_prob=keep_all,
+        input_keep_prob=keep_none,
+        output_keep_prob=keep_all,
         state_keep_prob=keep_all)
     self.assertGreater(np.linalg.norm(res[0] - true_full_output), 1e-4)
     self.assertGreater(np.linalg.norm(res[1].h - true_full_output[1]), 1e-4)
@@ -755,9 +808,13 @@ class DropoutWrapperTest(test.TestCase):
     keep_some = 0.8
     keep_all = variable_scope.get_variable("all", initializer=1.0)
     res = self._testDropoutWrapper(
-        input_keep_prob=keep_all, output_keep_prob=keep_some,
-        state_keep_prob=keep_all, variational_recurrent=True,
-        input_size=3, batch_size=5, time_steps=7)
+        input_keep_prob=keep_all,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_all,
+        variational_recurrent=True,
+        input_size=3,
+        batch_size=5,
+        time_steps=7)
     # Ensure the same dropout pattern for all time steps
     output_mask = np.abs(res[0]) > 1e-6
     for m in output_mask[1:]:
@@ -766,9 +823,13 @@ class DropoutWrapperTest(test.TestCase):
   def testDropoutWrapperRecurrentStateInputAndOutput(self):
     keep_some = 0.9
     res = self._testDropoutWrapper(
-        input_keep_prob=keep_some, output_keep_prob=keep_some,
-        state_keep_prob=keep_some, variational_recurrent=True,
-        input_size=3, batch_size=5, time_steps=7)
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        variational_recurrent=True,
+        input_size=3,
+        batch_size=5,
+        time_steps=7)
 
     # Smoke test for the state/input masks.
     output_mask = np.abs(res[0]) > 1e-6
@@ -792,17 +853,27 @@ class DropoutWrapperTest(test.TestCase):
     random_seed.set_random_seed(2347)
     np.random.seed(23487)
     res0 = self._testDropoutWrapper(
-        input_keep_prob=keep_some, output_keep_prob=keep_some,
-        state_keep_prob=keep_some, variational_recurrent=True,
-        input_size=3, batch_size=5, time_steps=7, seed=-234987)
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        variational_recurrent=True,
+        input_size=3,
+        batch_size=5,
+        time_steps=7,
+        seed=-234987)
     ops.reset_default_graph()
     self._ClearCachedSession()
     random_seed.set_random_seed(2347)
     np.random.seed(23487)
     res1 = self._testDropoutWrapper(
-        input_keep_prob=keep_some, output_keep_prob=keep_some,
-        state_keep_prob=keep_some, variational_recurrent=True,
-        input_size=3, batch_size=5, time_steps=7, seed=-234987)
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        variational_recurrent=True,
+        input_size=3,
+        batch_size=5,
+        time_steps=7,
+        seed=-234987)
 
     output_mask = np.abs(res0[0]) > 1e-6
     for time_step in output_mask:
@@ -839,9 +910,10 @@ class SlimRNNCellTest(test.TestCase):
         g, _ = rnn_cell_impl._SlimRNNCell(my_cell)(x, m)
         # pylint: enable=protected-access
         sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g], {x.name: np.array([[1., 1.]]),
-                  m.name: np.array([[0.1, 0.1]])})
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
         self.assertEqual(res[0].shape, (1, 2))
 
   def testBasicRNNCellMatch(self):
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
index 9cea2ec79a982e4fb362ec564eb72b3894917842..57521c6a9ba0b2d66639017b09c541e270276323 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
@@ -45,6 +45,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.util import nest
 
+
 class Plus1RNNCell(rnn_lib.RNNCell):
   """RNN Cell generating (output, new_state) = (input + 1, state + 1)."""
 
@@ -160,8 +161,7 @@ class RNNTest(test.TestCase):
     input_size = 5
     max_length = 8  # unrolled up to this length
     inputs = max_length * [
-        array_ops.placeholder(
-            dtypes.float32, shape=(batch_size, input_size))
+        array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
     ]
     outputs, state = rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
     self.assertEqual(len(outputs), len(inputs))
@@ -178,10 +178,9 @@ class RNNTest(test.TestCase):
         self.assertAllClose(v, input_value + 1.0)
 
       # Final state
-      self.assertAllClose(
-          values[-1],
-          max_length * np.ones(
-              (batch_size, input_size), dtype=np.float32))
+      self.assertAllClose(values[-1],
+                          max_length * np.ones(
+                              (batch_size, input_size), dtype=np.float32))
 
   def testDropout(self):
     cell = Plus1RNNCell()
@@ -191,8 +190,7 @@ class RNNTest(test.TestCase):
     input_size = 5
     max_length = 8
     inputs = max_length * [
-        array_ops.placeholder(
-            dtypes.float32, shape=(batch_size, input_size))
+        array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
     ]
     with variable_scope.variable_scope("share_scope"):
       outputs, state = rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
@@ -207,8 +205,10 @@ class RNNTest(test.TestCase):
     with self.test_session(use_gpu=True) as sess:
       input_value = np.random.randn(batch_size, input_size)
       values = sess.run(outputs + [state], feed_dict={inputs[0]: input_value})
-      full_dropout_values = sess.run(dropped_outputs,
-                                     feed_dict={inputs[0]: input_value})
+      full_dropout_values = sess.run(
+          dropped_outputs, feed_dict={
+              inputs[0]: input_value
+          })
 
       for v in values[:-1]:
         self.assertAllClose(v, input_value + 1.0)
@@ -222,8 +222,7 @@ class RNNTest(test.TestCase):
     input_size = 5
     max_length = 8
     inputs = max_length * [
-        array_ops.placeholder(
-            dtypes.float32, shape=(batch_size, input_size))
+        array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
     ]
     with variable_scope.variable_scope("drop_scope"):
       dynamic_outputs, dynamic_state = rnn.static_rnn(
@@ -234,12 +233,16 @@ class RNNTest(test.TestCase):
       input_value = np.random.randn(batch_size, input_size)
       dynamic_values = sess.run(
           dynamic_outputs,
-          feed_dict={inputs[0]: input_value,
-                     sequence_length: [2, 3]})
+          feed_dict={
+              inputs[0]: input_value,
+              sequence_length: [2, 3]
+          })
       dynamic_state_value = sess.run(
           [dynamic_state],
-          feed_dict={inputs[0]: input_value,
-                     sequence_length: [2, 3]})
+          feed_dict={
+              inputs[0]: input_value,
+              sequence_length: [2, 3]
+          })
 
       # outputs are fully calculated for t = 0, 1
       for v in dynamic_values[:2]:
@@ -289,8 +292,7 @@ class RNNTest(test.TestCase):
       input_size = 5
       max_length = 8  # unrolled up to this length
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
       return rnn.static_rnn(cell, inputs, dtype=dtypes.float32, scope=scope)
 
@@ -316,8 +318,7 @@ class LSTMTest(test.TestCase):
       cell = rnn_cell.LSTMCell(
           num_units, initializer=initializer, state_is_tuple=False)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
       outputs, _ = rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
       self.assertEqual(len(outputs), len(inputs))
@@ -343,8 +344,7 @@ class LSTMTest(test.TestCase):
           initializer=initializer,
           state_is_tuple=False)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
       outputs, _ = rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
       self.assertEqual(len(outputs), len(inputs))
@@ -374,8 +374,7 @@ class LSTMTest(test.TestCase):
           initializer=initializer,
           state_is_tuple=False)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
       with variable_scope.variable_scope("share_scope"):
         outputs, state = rnn.static_state_saving_rnn(
@@ -388,7 +387,9 @@ class LSTMTest(test.TestCase):
       input_value = np.random.randn(batch_size, input_size)
       (last_state_value, saved_state_value) = sess.run(
           [state, state_saver.saved_state["save_lstm"]],
-          feed_dict={inputs[0]: input_value})
+          feed_dict={
+              inputs[0]: input_value
+          })
       self.assertAllEqual(last_state_value, saved_state_value)
 
   def testNoProjNoShardingTupleStateSaver(self):
@@ -406,8 +407,7 @@ class LSTMTest(test.TestCase):
           initializer=initializer,
           state_is_tuple=True)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
       with variable_scope.variable_scope("share_scope"):
         outputs, state = rnn.static_state_saving_rnn(
@@ -420,7 +420,9 @@ class LSTMTest(test.TestCase):
       input_value = np.random.randn(batch_size, input_size)
       last_and_saved_states = sess.run(
           state + (state_saver.saved_state["c"], state_saver.saved_state["m"]),
-          feed_dict={inputs[0]: input_value})
+          feed_dict={
+              inputs[0]: input_value
+          })
       self.assertEqual(4, len(last_and_saved_states))
       self.assertAllEqual(last_and_saved_states[:2], last_and_saved_states[2:])
 
@@ -432,16 +434,17 @@ class LSTMTest(test.TestCase):
     with self.test_session(graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
-      state_saver = TestStateSaver(batch_size, {
-          "c0": num_units,
-          "m0": num_units,
-          "c1": num_units + 1,
-          "m1": num_units + 1,
-          "c2": num_units + 2,
-          "m2": num_units + 2,
-          "c3": num_units + 3,
-          "m3": num_units + 3
-      })
+      state_saver = TestStateSaver(
+          batch_size, {
+              "c0": num_units,
+              "m0": num_units,
+              "c1": num_units + 1,
+              "m1": num_units + 1,
+              "c2": num_units + 2,
+              "m2": num_units + 2,
+              "c3": num_units + 3,
+              "m3": num_units + 3
+          })
 
       def _cell(i):
         return rnn_cell.LSTMCell(
@@ -459,8 +462,7 @@ class LSTMTest(test.TestCase):
         self.assertEqual(len(cell.state_size[i]), 2)
 
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
 
       state_names = (("c0", "m0"), ("c1", "m1"), ("c2", "m2"), ("c3", "m3"))
@@ -475,10 +477,15 @@ class LSTMTest(test.TestCase):
 
       variables_lib.global_variables_initializer().run()
       input_value = np.random.randn(batch_size, input_size)
-      last_states = sess.run(list(nest.flatten(state)),
-                             feed_dict={inputs[0]: input_value})
-      saved_states = sess.run(list(state_saver.saved_state.values()),
-                              feed_dict={inputs[0]: input_value})
+      last_states = sess.run(
+          list(nest.flatten(state)), feed_dict={
+              inputs[0]: input_value
+          })
+      saved_states = sess.run(
+          list(state_saver.saved_state.values()),
+          feed_dict={
+              inputs[0]: input_value
+          })
       self.assertEqual(8, len(last_states))
       self.assertEqual(8, len(saved_states))
       flat_state_names = nest.flatten(state_names)
@@ -499,8 +506,7 @@ class LSTMTest(test.TestCase):
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
       cell = rnn_cell.LSTMCell(
           num_units,
@@ -526,8 +532,7 @@ class LSTMTest(test.TestCase):
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
       cell_notuple = rnn_cell.LSTMCell(
           num_units,
@@ -569,14 +574,20 @@ class LSTMTest(test.TestCase):
 
       variables_lib.global_variables_initializer().run()
       input_value = np.random.randn(batch_size, input_size)
-      outputs_notuple_v = sess.run(outputs_notuple,
-                                   feed_dict={inputs[0]: input_value})
-      outputs_tuple_v = sess.run(outputs_tuple,
-                                 feed_dict={inputs[0]: input_value})
+      outputs_notuple_v = sess.run(
+          outputs_notuple, feed_dict={
+              inputs[0]: input_value
+          })
+      outputs_tuple_v = sess.run(
+          outputs_tuple, feed_dict={
+              inputs[0]: input_value
+          })
       self.assertAllEqual(outputs_notuple_v, outputs_tuple_v)
 
-      (state_notuple_v,) = sess.run((state_notuple,),
-                                    feed_dict={inputs[0]: input_value})
+      (state_notuple_v,) = sess.run(
+          (state_notuple,), feed_dict={
+              inputs[0]: input_value
+          })
       state_tuple_v = sess.run(state_tuple, feed_dict={inputs[0]: input_value})
       self.assertAllEqual(state_notuple_v, np.hstack(state_tuple_v))
 
@@ -593,8 +604,7 @@ class LSTMTest(test.TestCase):
           -0.01, 0.01, seed=self._seed)
 
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
 
       cell = rnn_cell.LSTMCell(
@@ -625,8 +635,7 @@ class LSTMTest(test.TestCase):
     with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(-1, 1, seed=self._seed)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float64, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float64, shape=(None, input_size))
       ]
 
       cell = rnn_cell.LSTMCell(
@@ -661,8 +670,7 @@ class LSTMTest(test.TestCase):
     max_length = 8
     with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
       initializer = init_ops.constant_initializer(0.001)
 
@@ -721,8 +729,7 @@ class LSTMTest(test.TestCase):
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float64, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float64, shape=(None, input_size))
       ]
 
       cell = rnn_cell.LSTMCell(
@@ -743,16 +750,21 @@ class LSTMTest(test.TestCase):
 
       self.assertEqual(len(outputs), len(inputs))
 
-      variables_lib.global_variables_initializer().run(
-          feed_dict={sequence_length: [2, 3]})
+      variables_lib.global_variables_initializer().run(feed_dict={
+          sequence_length: [2, 3]
+      })
       input_value = np.asarray(
           np.random.randn(batch_size, input_size), dtype=np.float64)
       values = sess.run(
-          outputs, feed_dict={inputs[0]: input_value,
-                              sequence_length: [2, 3]})
+          outputs, feed_dict={
+              inputs[0]: input_value,
+              sequence_length: [2, 3]
+          })
       state_value = sess.run(
-          [state], feed_dict={inputs[0]: input_value,
-                              sequence_length: [2, 3]})
+          [state], feed_dict={
+              inputs[0]: input_value,
+              sequence_length: [2, 3]
+          })
       self.assertEqual(values[0].dtype, input_value.dtype)
       self.assertEqual(state_value[0].dtype, input_value.dtype)
 
@@ -767,8 +779,7 @@ class LSTMTest(test.TestCase):
       initializer_d = init_ops.random_uniform_initializer(
           -1, 1, seed=self._seed + 1)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
       cell = rnn_cell.LSTMCell(
           num_units,
@@ -792,8 +803,10 @@ class LSTMTest(test.TestCase):
 
       variables_lib.global_variables_initializer().run()
       input_value = np.random.randn(batch_size, input_size)
-      output_values = sess.run(outputs0 + outputs1 + outputs2,
-                               feed_dict={inputs[0]: input_value})
+      output_values = sess.run(
+          outputs0 + outputs1 + outputs2, feed_dict={
+              inputs[0]: input_value
+          })
       outputs0_values = output_values[:max_length]
       outputs1_values = output_values[max_length:2 * max_length]
       outputs2_values = output_values[2 * max_length:]
@@ -814,8 +827,7 @@ class LSTMTest(test.TestCase):
     with self.test_session(graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(-1, 1, seed=self._seed)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(None, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
       cell = rnn_cell.LSTMCell(
           num_units,
@@ -833,8 +845,10 @@ class LSTMTest(test.TestCase):
 
       variables_lib.global_variables_initializer().run()
       input_value = np.random.randn(batch_size, input_size)
-      output_values = sess.run(outputs0 + outputs1,
-                               feed_dict={inputs[0]: input_value})
+      output_values = sess.run(
+          outputs0 + outputs1, feed_dict={
+              inputs[0]: input_value
+          })
       outputs0_values = output_values[:max_length]
       outputs1_values = output_values[max_length:]
       self.assertEqual(len(outputs0_values), len(outputs1_values))
@@ -861,8 +875,7 @@ class LSTMTest(test.TestCase):
           -0.01, 0.01, seed=self._seed)
       if in_graph_mode:
         inputs = max_length * [
-            array_ops.placeholder(
-                dtypes.float32, shape=(None, input_size))
+            array_ops.placeholder(dtypes.float32, shape=(None, input_size))
         ]
       else:
         inputs = max_length * [
@@ -939,8 +952,7 @@ class LSTMTest(test.TestCase):
           -0.01, 0.01, seed=self._seed)
       if in_graph_mode:
         inputs = max_length * [
-            array_ops.placeholder(
-                dtypes.float32, shape=(None, input_size))
+            array_ops.placeholder(dtypes.float32, shape=(None, input_size))
         ]
       else:
         inputs = max_length * [
@@ -1100,8 +1112,8 @@ class LSTMTest(test.TestCase):
         # Test gradients to inputs and variables w.r.t. outputs & final state
         static_grad_values = sess.run(static_gradients, feed_dict=feeds)
 
-        static_individual_grad_values = sess.run(static_individual_gradients,
-                                                 feed_dict=feeds)
+        static_individual_grad_values = sess.run(
+            static_individual_gradients, feed_dict=feeds)
 
         static_individual_var_grad_values = sess.run(
             static_individual_variable_gradients, feed_dict=feeds)
@@ -1148,8 +1160,10 @@ class LSTMTest(test.TestCase):
         # Generate gradients of several individual outputs w.r.t. inputs
         dynamic_individual_gradients = nest.flatten([
             gradients_impl.gradients(y, [concat_inputs])
-            for y in
-            [split_outputs_dynamic[0], split_outputs_dynamic[-1], state_dynamic]
+            for y in [
+                split_outputs_dynamic[0], split_outputs_dynamic[-1],
+                state_dynamic
+            ]
         ])
 
         # Generate gradients of individual variables w.r.t. inputs
@@ -1159,8 +1173,10 @@ class LSTMTest(test.TestCase):
             "Count of trainable variables: %d" % len(trainable_variables))
         dynamic_individual_variable_gradients = nest.flatten([
             gradients_impl.gradients(y, trainable_variables)
-            for y in
-            [split_outputs_dynamic[0], split_outputs_dynamic[-1], state_dynamic]
+            for y in [
+                split_outputs_dynamic[0], split_outputs_dynamic[-1],
+                state_dynamic
+            ]
         ])
 
         # Test forward pass
@@ -1170,8 +1186,8 @@ class LSTMTest(test.TestCase):
         # Test gradients to inputs and variables w.r.t. outputs & final state
         dynamic_grad_values = sess.run(dynamic_gradients, feed_dict=feeds)
 
-        dynamic_individual_grad_values = sess.run(dynamic_individual_gradients,
-                                                  feed_dict=feeds)
+        dynamic_individual_grad_values = sess.run(
+            dynamic_individual_gradients, feed_dict=feeds)
 
         dynamic_individual_var_grad_values = sess.run(
             dynamic_individual_variable_gradients, feed_dict=feeds)
@@ -1207,8 +1223,8 @@ class LSTMTest(test.TestCase):
       for i, (a, b) in enumerate(
           zip(static_individual_var_grad_values,
               dynamic_individual_var_grad_values)):
-        tf_logging.info("Comparing individual variable gradients iteration %d" %
-                        i)
+        tf_logging.info(
+            "Comparing individual variable gradients iteration %d" % i)
         self.assertAllEqual(a, b)
 
   @test_util.run_in_graph_and_eager_modes()
@@ -1223,10 +1239,7 @@ class BidirectionalRNNTest(test.TestCase):
     self._seed = 23489
     np.random.seed(self._seed)
 
-  def _createBidirectionalRNN(self,
-                              use_shape,
-                              use_sequence_length,
-                              scope=None):
+  def _createBidirectionalRNN(self, use_shape, use_sequence_length, scope=None):
     num_units = 3
     input_size = 5
     batch_size = 2
@@ -1270,8 +1283,10 @@ class BidirectionalRNNTest(test.TestCase):
       # Run with pre-specified sequence length of 2, 3
       out, s_fw, s_bw = sess.run(
           [outputs, state_fw, state_bw],
-          feed_dict={inputs[0]: input_value,
-                     sequence_length: [2, 3]})
+          feed_dict={
+              inputs[0]: input_value,
+              sequence_length: [2, 3]
+          })
 
       # Since the forward and backward LSTM cells were initialized with the
       # same parameters, the forward and backward output has to be the same,
@@ -1312,8 +1327,10 @@ class BidirectionalRNNTest(test.TestCase):
       input_value, inputs, outputs, state_fw, state_bw, _ = (
           self._createBidirectionalRNN(use_shape, False))
       variables_lib.global_variables_initializer().run()
-      out, s_fw, s_bw = sess.run([outputs, state_fw, state_bw],
-                                 feed_dict={inputs[0]: input_value})
+      out, s_fw, s_bw = sess.run(
+          [outputs, state_fw, state_bw], feed_dict={
+              inputs[0]: input_value
+          })
 
       # Since the forward and backward LSTM cells were initialized with the
       # same parameters, the forward and backward output has to be the same,
@@ -1396,13 +1413,11 @@ class BidirectionalRNNTest(test.TestCase):
                                    use_time_major, use_sequence_length):
     with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       input_value, inputs, outputs, state_fw, state_bw, sequence_length = (
-          self._createBidirectionalDynamicRNN(use_shape,
-                                              use_state_tuple, use_time_major,
-                                              use_sequence_length))
+          self._createBidirectionalDynamicRNN(
+              use_shape, use_state_tuple, use_time_major, use_sequence_length))
       variables_lib.global_variables_initializer().run()
       # Run with pre-specified sequence length of 2, 3
-      feed_dict = (
-          {sequence_length: [2, 3]} if use_sequence_length else {})
+      feed_dict = ({sequence_length: [2, 3]} if use_sequence_length else {})
       feed_dict.update({inputs[0]: input_value})
       if use_state_tuple:
         out, c_fw, m_fw, c_bw, m_bw = sess.run(
@@ -1538,8 +1553,7 @@ class MultiDimensionalLSTMTest(test.TestCase):
     sequence_length = [4, 6]
     with self.test_session(graph=ops_lib.Graph()) as sess:
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(None,) + input_size)
+          array_ops.placeholder(dtypes.float32, shape=(None,) + input_size)
       ]
       inputs_using_dim = max_length * [
           array_ops.placeholder(
@@ -1585,14 +1599,22 @@ class MultiDimensionalLSTMTest(test.TestCase):
 
       input_total_size = (batch_size,) + input_size
       input_value = np.random.randn(*input_total_size)
-      outputs_static_v = sess.run(outputs_static,
-                                  feed_dict={inputs[0]: input_value})
-      outputs_dynamic_v = sess.run(outputs_dynamic,
-                                   feed_dict={inputs[0]: input_value})
-      outputs_bid_v = sess.run(outputs_bid,
-                               feed_dict={inputs_using_dim[0]: input_value})
-      outputs_sav_v = sess.run(outputs_sav,
-                               feed_dict={inputs_using_dim[0]: input_value})
+      outputs_static_v = sess.run(
+          outputs_static, feed_dict={
+              inputs[0]: input_value
+          })
+      outputs_dynamic_v = sess.run(
+          outputs_dynamic, feed_dict={
+              inputs[0]: input_value
+          })
+      outputs_bid_v = sess.run(
+          outputs_bid, feed_dict={
+              inputs_using_dim[0]: input_value
+          })
+      outputs_sav_v = sess.run(
+          outputs_sav, feed_dict={
+              inputs_using_dim[0]: input_value
+          })
 
       self.assertAllEqual(outputs_static_v, outputs_dynamic_v)
       self.assertAllEqual(outputs_static_v, outputs_sav_v)
@@ -1602,16 +1624,26 @@ class MultiDimensionalLSTMTest(test.TestCase):
       outputs_bid_array = np.array(outputs_bid_v)
       self.assertAllEqual(outputs_static_array_double, outputs_bid_array)
 
-      state_static_v = sess.run(state_static,
-                                feed_dict={inputs[0]: input_value})
-      state_dynamic_v = sess.run(state_dynamic,
-                                 feed_dict={inputs[0]: input_value})
-      state_bid_fw_v = sess.run(state_fw,
-                                feed_dict={inputs_using_dim[0]: input_value})
-      state_bid_bw_v = sess.run(state_bw,
-                                feed_dict={inputs_using_dim[0]: input_value})
-      state_sav_v = sess.run(state_sav,
-                             feed_dict={inputs_using_dim[0]: input_value})
+      state_static_v = sess.run(
+          state_static, feed_dict={
+              inputs[0]: input_value
+          })
+      state_dynamic_v = sess.run(
+          state_dynamic, feed_dict={
+              inputs[0]: input_value
+          })
+      state_bid_fw_v = sess.run(
+          state_fw, feed_dict={
+              inputs_using_dim[0]: input_value
+          })
+      state_bid_bw_v = sess.run(
+          state_bw, feed_dict={
+              inputs_using_dim[0]: input_value
+          })
+      state_sav_v = sess.run(
+          state_sav, feed_dict={
+              inputs_using_dim[0]: input_value
+          })
       self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_dynamic_v))
       self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_sav_v))
       self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_bid_fw_v))
@@ -1633,16 +1665,17 @@ class NestedLSTMTest(test.TestCase):
     with self.test_session(graph=ops_lib.Graph()) as sess:
       state_saver = TestStateSaver(batch_size, state_size)
       single_input = (array_ops.placeholder(
-          dtypes.float32, shape=(None, input_size)), array_ops.placeholder(
-              dtypes.float32, shape=(None, input_size)))
+          dtypes.float32, shape=(None, input_size)),
+                      array_ops.placeholder(
+                          dtypes.float32, shape=(None, input_size)))
       inputs = max_length * [single_input]
       inputs_c = (array_ops.stack([input_[0] for input_ in inputs]),
                   array_ops.stack([input_[1] for input_ in inputs]))
-      single_input_using_dim = (
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size)),
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size)))
+      single_input_using_dim = (array_ops.placeholder(
+          dtypes.float32, shape=(batch_size, input_size)),
+                                array_ops.placeholder(
+                                    dtypes.float32,
+                                    shape=(batch_size, input_size)))
       inputs_using_dim = max_length * [single_input_using_dim]
 
       # Create a cell for the whole test. This is fine because the cell has no
@@ -1688,14 +1721,22 @@ class NestedLSTMTest(test.TestCase):
       input_total_size = (batch_size, input_size)
       input_value = (np.random.randn(*input_total_size),
                      np.random.randn(*input_total_size))
-      outputs_dynamic_v = sess.run(outputs_dynamic,
-                                   feed_dict={single_input: input_value})
-      outputs_static_v = sess.run(outputs_static,
-                                  feed_dict={single_input: input_value})
-      outputs_sav_v = sess.run(outputs_sav,
-                               feed_dict={single_input_using_dim: input_value})
-      outputs_bid_v = sess.run(outputs_bid,
-                               feed_dict={single_input_using_dim: input_value})
+      outputs_dynamic_v = sess.run(
+          outputs_dynamic, feed_dict={
+              single_input: input_value
+          })
+      outputs_static_v = sess.run(
+          outputs_static, feed_dict={
+              single_input: input_value
+          })
+      outputs_sav_v = sess.run(
+          outputs_sav, feed_dict={
+              single_input_using_dim: input_value
+          })
+      outputs_bid_v = sess.run(
+          outputs_bid, feed_dict={
+              single_input_using_dim: input_value
+          })
 
       self.assertAllEqual(outputs_static_v,
                           np.transpose(outputs_dynamic_v, (1, 0, 2, 3)))
@@ -1706,16 +1747,26 @@ class NestedLSTMTest(test.TestCase):
       outputs_bid_array = np.array(outputs_bid_v)
       self.assertAllEqual(outputs_static_array_double, outputs_bid_array)
 
-      state_dynamic_v = sess.run(state_dynamic,
-                                 feed_dict={single_input: input_value})
-      state_static_v = sess.run(state_static,
-                                feed_dict={single_input: input_value})
-      state_bid_fw_v = sess.run(state_fw,
-                                feed_dict={single_input_using_dim: input_value})
-      state_bid_bw_v = sess.run(state_bw,
-                                feed_dict={single_input_using_dim: input_value})
-      state_sav_v = sess.run(state_sav,
-                             feed_dict={single_input_using_dim: input_value})
+      state_dynamic_v = sess.run(
+          state_dynamic, feed_dict={
+              single_input: input_value
+          })
+      state_static_v = sess.run(
+          state_static, feed_dict={
+              single_input: input_value
+          })
+      state_bid_fw_v = sess.run(
+          state_fw, feed_dict={
+              single_input_using_dim: input_value
+          })
+      state_bid_bw_v = sess.run(
+          state_bw, feed_dict={
+              single_input_using_dim: input_value
+          })
+      state_sav_v = sess.run(
+          state_sav, feed_dict={
+              single_input_using_dim: input_value
+          })
       self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_dynamic_v))
       self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_sav_v))
       self.assertAllEqual(np.hstack(state_static_v), np.hstack(state_bid_fw_v))
@@ -1764,8 +1815,7 @@ class StateSaverRNNTest(test.TestCase):
           initializer=initializer,
           state_is_tuple=False)
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
       return rnn.static_state_saving_rnn(
           cell,
@@ -1931,8 +1981,10 @@ class RawRNNTest(test.TestCase):
       (outputs_val, outputs_dynamic_rnn_val, final_state_val,
        final_state_dynamic_rnn_val) = sess.run(
            [outputs, outputs_dynamic_rnn, final_state, final_state_dynamic_rnn],
-           feed_dict={inputs: rand_input,
-                      sequence_length: rand_seq_len})
+           feed_dict={
+               inputs: rand_input,
+               sequence_length: rand_seq_len
+           })
 
       self.assertAllClose(outputs_dynamic_rnn_val, outputs_val)
       self.assertAllClose(final_state_dynamic_rnn_val, final_state_val)
@@ -1945,12 +1997,16 @@ class RawRNNTest(test.TestCase):
         self.assertEqual(len(gradients), len(gradients_dynamic_rnn))
         gradients_val = sess.run(
             gradients,
-            feed_dict={inputs: rand_input,
-                       sequence_length: rand_seq_len})
+            feed_dict={
+                inputs: rand_input,
+                sequence_length: rand_seq_len
+            })
         gradients_dynamic_rnn_val = sess.run(
             gradients_dynamic_rnn,
-            feed_dict={inputs: rand_input,
-                       sequence_length: rand_seq_len})
+            feed_dict={
+                inputs: rand_input,
+                sequence_length: rand_seq_len
+            })
         self.assertEqual(len(gradients_val), len(gradients_dynamic_rnn_val))
         input_gradients_val = gradients_val[0]
         input_gradients_dynamic_rnn_val = gradients_dynamic_rnn_val[0]
@@ -2067,14 +2123,13 @@ class RawRNNTest(test.TestCase):
 
       def loop_fn(time_, cell_output, cell_state, _):
         if cell_output is None:
-          emit_output = (array_ops.zeros(
-              [2, 3], dtype=dtypes.int32), array_ops.zeros(
-                  [unknown_dim], dtype=dtypes.int64))
+          emit_output = (array_ops.zeros([2, 3], dtype=dtypes.int32),
+                         array_ops.zeros([unknown_dim], dtype=dtypes.int64))
           next_state = cell.zero_state(batch_size, dtypes.float32)
         else:
-          emit_output = (array_ops.ones(
-              [batch_size, 2, 3], dtype=dtypes.int32), array_ops.ones(
-                  [batch_size, unknown_dim], dtype=dtypes.int64))
+          emit_output = (array_ops.ones([batch_size, 2, 3], dtype=dtypes.int32),
+                         array_ops.ones(
+                             [batch_size, unknown_dim], dtype=dtypes.int64))
           next_state = cell_state
         elements_finished = array_ops.tile([time_ >= max_time], [batch_size])
         finished = math_ops.reduce_all(elements_finished)
@@ -2175,9 +2230,9 @@ class DeviceWrapperCell(rnn_cell.RNNCell):
   def __call__(self, input_, state, scope=None):
     if self._device is not None:
       with ops_lib.device(self._device):
-        return self._cell(input_, state, scope)
+        return self._cell(input_, state, scope=scope)
     else:
-      return self._cell(input_, state, scope)
+      return self._cell(input_, state, scope=scope)
 
 
 class TensorArrayOnCorrectDeviceTest(test.TestCase):
@@ -2193,8 +2248,8 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase):
 
     cell = rnn_cell.LSTMCell(num_units, use_peepholes=True)
     gpu_cell = DeviceWrapperCell(cell, cell_device)
-    inputs = np.random.randn(batch_size, time_steps,
-                             input_size).astype(np.float32)
+    inputs = np.random.randn(batch_size, time_steps, input_size).astype(
+        np.float32)
     sequence_length = np.random.randint(0, time_steps, size=batch_size)
 
     if input_device is not None:
@@ -2262,8 +2317,7 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase):
 
     gpu_dev = test.gpu_device_name()
     run_metadata = self._execute_rnn_on(
-        rnn_device="/cpu:0", cell_device="/cpu:0",
-        input_device=gpu_dev)
+        rnn_device="/cpu:0", cell_device="/cpu:0", input_device=gpu_dev)
     cpu_stats, gpu_stats = self._retrieve_cpu_gpu_stats(run_metadata)
 
     def _assert_in(op_str, in_stats, out_stats):
@@ -2278,8 +2332,7 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase):
       return  # Test requires access to a GPU
 
     gpu_dev = test.gpu_device_name()
-    run_metadata = self._execute_rnn_on(
-        input_device=gpu_dev)
+    run_metadata = self._execute_rnn_on(input_device=gpu_dev)
     cpu_stats, gpu_stats = self._retrieve_cpu_gpu_stats(run_metadata)
 
     def _assert_in(op_str, in_stats, out_stats):
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py b/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py
index a288072ae5da0751f1999128029f38bea933490e..7957edf68cc8a1461fccfc2de93ad5250dc9fdb5 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py
@@ -49,6 +49,7 @@ def blocks_match(sess, use_peephole):
     inp = ops.convert_to_tensor(
         np.random.randn(batch_size, input_size), dtype=dtypes.float32)
     inputs.append(inp)
+  stacked_inputs = array_ops.stack(inputs)
 
   initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=19890212)
 
@@ -72,23 +73,6 @@ def blocks_match(sess, use_peephole):
         dtype=dtypes.float32,
         initializer=init_ops.zeros_initializer())
 
-    if use_peephole:
-      wci_block = variable_scope.get_variable(
-          "rnn/lstm_cell/lstm_block_wrapper/w_i_diag",
-          initializer=wci.initialized_value())
-      wcf_block = variable_scope.get_variable(
-          "rnn/lstm_cell/lstm_block_wrapper/w_f_diag",
-          initializer=wcf.initialized_value())
-      wco_block = variable_scope.get_variable(
-          "rnn/lstm_cell/lstm_block_wrapper/w_o_diag",
-          initializer=wco.initialized_value())
-    w_block = variable_scope.get_variable(
-        "rnn/lstm_cell/lstm_block_wrapper/kernel",
-        initializer=w.initialized_value())
-    b_block = variable_scope.get_variable(
-        "rnn/lstm_cell/lstm_block_wrapper/bias",
-        initializer=b.initialized_value())
-
     basic_cell = rnn_cell.LSTMCell(
         cell_size, use_peepholes=use_peephole, state_is_tuple=True, reuse=True)
     basic_outputs_op, basic_state_op = rnn.static_rnn(
@@ -113,11 +97,11 @@ def blocks_match(sess, use_peephole):
           b,
           cell_clip=0)
 
-    with variable_scope.variable_scope("rnn/lstm_cell", reuse=True):
-      fused_cell = lstm_ops.LSTMBlockFusedCell(
-          cell_size, cell_clip=0, use_peephole=use_peephole)
-      fused_outputs_op, fused_state_op = fused_cell(
-          inputs, dtype=dtypes.float32)
+    fused_cell = lstm_ops.LSTMBlockFusedCell(
+        cell_size, cell_clip=0, use_peephole=use_peephole, reuse=True,
+        name="rnn/lstm_cell")
+    fused_outputs_op, fused_state_op = fused_cell(
+        stacked_inputs, dtype=dtypes.float32)
 
     sess.run([variables.global_variables_initializer()])
     basic_outputs, basic_state = sess.run([basic_outputs_op, basic_state_op[0]])
@@ -131,9 +115,9 @@ def blocks_match(sess, use_peephole):
     block_grads = sess.run(gradients_impl.gradients(block_outputs_op, inputs))
     block_wgrads = sess.run(gradients_impl.gradients(block_outputs_op, xs))
 
-    xs = [w_block, b_block]
+    xs = [w, b]
     if use_peephole:
-      xs += [wci_block, wcf_block, wco_block]
+      xs += [wci, wcf, wco]
     fused_outputs, fused_state = sess.run([fused_outputs_op, fused_state_op[0]])
     fused_grads = sess.run(gradients_impl.gradients(fused_outputs_op, inputs))
     fused_wgrads = sess.run(gradients_impl.gradients(fused_outputs_op, xs))
@@ -216,7 +200,7 @@ class LSTMBlockCellTest(test.TestCase):
     with self.test_session(use_gpu=True, graph=ops.Graph()):
       cell = lstm_ops.LSTMBlockFusedCell(10)
       pcell = lstm_ops.LSTMBlockFusedCell(10, use_peephole=True)
-      inputs = [array_ops.zeros([4, 5])] * 6
+      inputs = array_ops.stack([array_ops.zeros([4, 5])] * 6)
       cell(inputs, dtype=dtypes.float32, scope="basic/lstm_cell")
       pcell(inputs, dtype=dtypes.float32, scope="peephole/lstm_cell")
       fused_names = {
@@ -380,13 +364,13 @@ class LSTMBlockCellTest(test.TestCase):
             np.random.randn(batch_size, input_size), dtype=dtypes.float32)
         inputs.append(inp)
       seq_lengths = constant_op.constant([3, 4, 5])
+      cell_inputs = array_ops.stack(inputs)
 
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=19890213)
 
-      with variable_scope.variable_scope(
-          "lstm_block_wrapper", initializer=initializer):
-        # magic naming so that the cells pick up these variables and resuse them
+      with variable_scope.variable_scope("lstm_cell", initializer=initializer):
+        # magic naming so that the cells pick up these variables and reuse them
         variable_scope.get_variable(
             "kernel",
             shape=[input_size + cell_size, cell_size * 4],
@@ -398,13 +382,12 @@ class LSTMBlockCellTest(test.TestCase):
             dtype=dtypes.float32,
             initializer=init_ops.zeros_initializer())
 
-      with variable_scope.variable_scope(
-          variable_scope.get_variable_scope(), reuse=True):
-        cell = lstm_ops.LSTMBlockFusedCell(
-            cell_size, cell_clip=0, use_peephole=False)
+      cell = lstm_ops.LSTMBlockFusedCell(
+          cell_size, cell_clip=0, use_peephole=False, reuse=True,
+          name="lstm_cell")
 
-        fused_outputs_op, fused_state_op = cell(
-            inputs, dtype=dtypes.float32, sequence_length=seq_lengths)
+      fused_outputs_op, fused_state_op = cell(
+          cell_inputs, dtype=dtypes.float32, sequence_length=seq_lengths)
 
       cell_vars = [
           v for v in variables.trainable_variables()
@@ -420,7 +403,7 @@ class LSTMBlockCellTest(test.TestCase):
         for i, inp in enumerate(inputs):
           lengths = [int(i < l) for l in seq_lengths.eval()]
           output, state = cell(
-              [inp],
+              array_ops.expand_dims(inp, 0),
               initial_state=state,
               dtype=dtypes.float32,
               sequence_length=lengths)
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index 46823fa3643c5b4a3d857fa38d1a70792d97ca40..7b883ebc5d7756f1bdf445f900500a4b89e6cffd 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -53,14 +53,12 @@ class RNNCellTest(test.TestCase):
       batch_size = 3
       input_size = 4
       expected_output = np.array(
-          [[0.121753, 0.121753],
-           [0.103349, 0.103349],
-           [0.100178, 0.100178]],
+          [[0.121753, 0.121753], [0.103349, 0.103349], [0.100178, 0.100178]],
           dtype=np.float32)
       expected_state = np.array(
-          [[0.137523, 0.137523, 0.121753, 0.121753],
-           [0.105450, 0.105450, 0.103349, 0.103349],
-           [0.100742, 0.100742, 0.100178, 0.100178]],
+          [[0.137523, 0.137523, 0.121753, 0.121753], [
+              0.105450, 0.105450, 0.103349, 0.103349
+          ], [0.100742, 0.100742, 0.100178, 0.100178]],
           dtype=np.float32)
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
@@ -69,14 +67,14 @@ class RNNCellTest(test.TestCase):
         output, state = contrib_rnn_cell.CoupledInputForgetGateLSTMCell(
             num_units=num_units, forget_bias=1.0, state_is_tuple=False)(x, m)
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([output, state], {
-            x.name:
-                np.array([[1., 1., 1., 1.],
-                          [2., 2., 2., 2.],
-                          [3., 3., 3., 3.]]),
-            m.name:
-                0.1 * np.ones((batch_size, state_size))
-        })
+        res = sess.run(
+            [output, state], {
+                x.name:
+                    np.array([[1., 1., 1., 1.], [2., 2., 2., 2.],
+                              [3., 3., 3., 3.]]),
+                m.name:
+                    0.1 * np.ones((batch_size, state_size))
+            })
         # This is a smoke test: Only making sure expected values didn't change.
         self.assertEqual(len(res), 2)
         self.assertAllClose(res[0], expected_output)
@@ -101,14 +99,14 @@ class RNNCellTest(test.TestCase):
             frequency_skip=frequency_skip,
             forget_bias=1.0)(x, m)
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([output, state], {
-            x.name:
-                np.array([[1., 1., 1., 1.],
-                          [2., 2., 2., 2.],
-                          [3., 3., 3., 3.]]),
-            m.name:
-                0.1 * np.ones((batch_size, int(state_size * (num_shifts))))
-        })
+        res = sess.run(
+            [output, state], {
+                x.name:
+                    np.array([[1., 1., 1., 1.], [2., 2., 2., 2.],
+                              [3., 3., 3., 3.]]),
+                m.name:
+                    0.1 * np.ones((batch_size, int(state_size * (num_shifts))))
+            })
         self.assertEqual(len(res), 2)
         # The numbers in results were not calculated, this is mostly just a
         # smoke test.
@@ -141,17 +139,14 @@ class RNNCellTest(test.TestCase):
             state_is_tuple=True)
         inputs = constant_op.constant(
             np.array(
-                [[1., 1., 1., 1.],
-                 [2., 2., 2., 2.],
-                 [3., 3., 3., 3.]],
+                [[1., 1., 1., 1.], [2., 2., 2., 2.], [3., 3., 3., 3.]],
                 dtype=np.float32),
             dtype=dtypes.float32)
         state_value = constant_op.constant(
-            0.1 * np.ones(
-                (batch_size, num_units), dtype=np.float32),
+            0.1 * np.ones((batch_size, num_units), dtype=np.float32),
             dtype=dtypes.float32)
-        init_state = cell.state_tuple_type(
-            *([state_value, state_value] * num_shifts))
+        init_state = cell.state_tuple_type(*(
+            [state_value, state_value] * num_shifts))
         output, state = cell(inputs, init_state)
         sess.run([variables.global_variables_initializer()])
         res = sess.run([output, state])
@@ -198,11 +193,10 @@ class RNNCellTest(test.TestCase):
                 dtype=np.float32),
             dtype=dtypes.float32)
         state_value = constant_op.constant(
-            0.1 * np.ones(
-                (batch_size, num_units), dtype=np.float32),
+            0.1 * np.ones((batch_size, num_units), dtype=np.float32),
             dtype=dtypes.float32)
-        init_state = cell.state_tuple_type(
-            *([state_value, state_value] * total_blocks))
+        init_state = cell.state_tuple_type(*(
+            [state_value, state_value] * total_blocks))
         output, state = cell(inputs, init_state)
         sess.run([variables.global_variables_initializer()])
         res = sess.run([output, state])
@@ -230,20 +224,28 @@ class RNNCellTest(test.TestCase):
     frequency_skip = 1
     num_shifts = int((input_size - feature_size) / frequency_skip + 1)
     expected_output = np.array(
-        [[0.416383, 0.416383, 0.403238, 0.403238, 0.524020, 0.524020,
-          0.565425, 0.565425, 0.557865, 0.557865, 0.609699, 0.609699],
-         [0.627331, 0.627331, 0.622393, 0.622393, 0.688342, 0.688342,
-          0.708078, 0.708078, 0.694245, 0.694245, 0.715171, 0.715171],
-         [0.711050, 0.711050, 0.709197, 0.709197, 0.736533, 0.736533,
-          0.744264, 0.744264, 0.737390, 0.737390, 0.745250, 0.745250]],
+        [[
+            0.416383, 0.416383, 0.403238, 0.403238, 0.524020, 0.524020,
+            0.565425, 0.565425, 0.557865, 0.557865, 0.609699, 0.609699
+        ], [
+            0.627331, 0.627331, 0.622393, 0.622393, 0.688342, 0.688342,
+            0.708078, 0.708078, 0.694245, 0.694245, 0.715171, 0.715171
+        ], [
+            0.711050, 0.711050, 0.709197, 0.709197, 0.736533, 0.736533,
+            0.744264, 0.744264, 0.737390, 0.737390, 0.745250, 0.745250
+        ]],
         dtype=np.float32)
     expected_state = np.array(
-        [[0.625556, 0.625556, 0.416383, 0.416383, 0.759134, 0.759134,
-          0.524020, 0.524020, 0.798795, 0.798795, 0.557865, 0.557865],
-         [0.875488, 0.875488, 0.627331, 0.627331, 0.936432, 0.936432,
-          0.688342, 0.688342, 0.941961, 0.941961, 0.694245, 0.694245],
-         [0.957327, 0.957327, 0.711050, 0.711050, 0.979522, 0.979522,
-          0.736533, 0.736533, 0.980245, 0.980245, 0.737390, 0.737390]],
+        [[
+            0.625556, 0.625556, 0.416383, 0.416383, 0.759134, 0.759134,
+            0.524020, 0.524020, 0.798795, 0.798795, 0.557865, 0.557865
+        ], [
+            0.875488, 0.875488, 0.627331, 0.627331, 0.936432, 0.936432,
+            0.688342, 0.688342, 0.941961, 0.941961, 0.694245, 0.694245
+        ], [
+            0.957327, 0.957327, 0.711050, 0.711050, 0.979522, 0.979522,
+            0.736533, 0.736533, 0.980245, 0.980245, 0.737390, 0.737390
+        ]],
         dtype=np.float32)
     for state_is_tuple in [False, True]:
       with self.test_session() as sess:
@@ -259,18 +261,16 @@ class RNNCellTest(test.TestCase):
               couple_input_forget_gates=True,
               state_is_tuple=state_is_tuple)
           inputs = constant_op.constant(
-              np.array([[1., 1., 1., 1.],
-                        [2., 2., 2., 2.],
-                        [3., 3., 3., 3.]],
-                       dtype=np.float32),
+              np.array(
+                  [[1., 1., 1., 1.], [2., 2., 2., 2.], [3., 3., 3., 3.]],
+                  dtype=np.float32),
               dtype=dtypes.float32)
           if state_is_tuple:
             state_value = constant_op.constant(
-                0.1 * np.ones(
-                    (batch_size, num_units), dtype=np.float32),
+                0.1 * np.ones((batch_size, num_units), dtype=np.float32),
                 dtype=dtypes.float32)
-            init_state = cell.state_tuple_type(
-                *([state_value, state_value] * num_shifts))
+            init_state = cell.state_tuple_type(*(
+                [state_value, state_value] * num_shifts))
           else:
             init_state = constant_op.constant(
                 0.1 * np.ones(
@@ -302,32 +302,40 @@ class RNNCellTest(test.TestCase):
       frequency_skip = 1
       num_shifts = int((input_size - feature_size) / frequency_skip + 1)
       expected_output = np.array(
-          [[0.464130, 0.464130, 0.419165, 0.419165, 0.593283, 0.593283,
-            0.738350, 0.738350, 0.661638, 0.661638, 0.866774, 0.866774,
-            0.520789, 0.520789, 0.476968, 0.476968, 0.604341, 0.604341,
-            0.760207, 0.760207, 0.635773, 0.635773, 0.850218, 0.850218],
-           [0.669636, 0.669636, 0.628966, 0.628966, 0.736057, 0.736057,
-            0.895927, 0.895927, 0.755559, 0.755559, 0.954359, 0.954359,
-            0.692621, 0.692621, 0.652363, 0.652363, 0.737517, 0.737517,
-            0.899558, 0.899558, 0.745984, 0.745984, 0.946840, 0.946840],
-           [0.751109, 0.751109, 0.711716, 0.711716, 0.778357, 0.778357,
-            0.940779, 0.940779, 0.784530, 0.784530, 0.980604, 0.980604,
-            0.759940, 0.759940, 0.720652, 0.720652, 0.778552, 0.778552,
-            0.941606, 0.941606, 0.781035, 0.781035, 0.977731, 0.977731]],
+          [[
+              0.464130, 0.464130, 0.419165, 0.419165, 0.593283, 0.593283,
+              0.738350, 0.738350, 0.661638, 0.661638, 0.866774, 0.866774,
+              0.520789, 0.520789, 0.476968, 0.476968, 0.604341, 0.604341,
+              0.760207, 0.760207, 0.635773, 0.635773, 0.850218, 0.850218
+          ], [
+              0.669636, 0.669636, 0.628966, 0.628966, 0.736057, 0.736057,
+              0.895927, 0.895927, 0.755559, 0.755559, 0.954359, 0.954359,
+              0.692621, 0.692621, 0.652363, 0.652363, 0.737517, 0.737517,
+              0.899558, 0.899558, 0.745984, 0.745984, 0.946840, 0.946840
+          ], [
+              0.751109, 0.751109, 0.711716, 0.711716, 0.778357, 0.778357,
+              0.940779, 0.940779, 0.784530, 0.784530, 0.980604, 0.980604,
+              0.759940, 0.759940, 0.720652, 0.720652, 0.778552, 0.778552,
+              0.941606, 0.941606, 0.781035, 0.781035, 0.977731, 0.977731
+          ]],
           dtype=np.float32)
       expected_state = np.array(
-          [[0.710660, 0.710660, 0.464130, 0.464130, 0.877293, 0.877293,
-            0.593283, 0.593283, 0.958505, 0.958505, 0.661638, 0.661638,
-            0.785405, 0.785405, 0.520789, 0.520789, 0.890836, 0.890836,
-            0.604341, 0.604341, 0.928512, 0.928512, 0.635773, 0.635773],
-           [0.967579, 0.967579, 0.669636, 0.669636, 1.038811, 1.038811,
-            0.736057, 0.736057, 1.058201, 1.058201, 0.755559, 0.755559,
-            0.993088, 0.993088, 0.692621, 0.692621, 1.040288, 1.040288,
-            0.737517, 0.737517, 1.048773, 1.048773, 0.745984, 0.745984],
-           [1.053842, 1.053842, 0.751109, 0.751109, 1.079919, 1.079919,
-            0.778357, 0.778357, 1.085620, 1.085620, 0.784530, 0.784530,
-            1.062455, 1.062455, 0.759940, 0.759940, 1.080101, 1.080101,
-            0.778552, 0.778552, 1.082402, 1.082402, 0.781035, 0.781035]],
+          [[
+              0.710660, 0.710660, 0.464130, 0.464130, 0.877293, 0.877293,
+              0.593283, 0.593283, 0.958505, 0.958505, 0.661638, 0.661638,
+              0.785405, 0.785405, 0.520789, 0.520789, 0.890836, 0.890836,
+              0.604341, 0.604341, 0.928512, 0.928512, 0.635773, 0.635773
+          ], [
+              0.967579, 0.967579, 0.669636, 0.669636, 1.038811, 1.038811,
+              0.736057, 0.736057, 1.058201, 1.058201, 0.755559, 0.755559,
+              0.993088, 0.993088, 0.692621, 0.692621, 1.040288, 1.040288,
+              0.737517, 0.737517, 1.048773, 1.048773, 0.745984, 0.745984
+          ], [
+              1.053842, 1.053842, 0.751109, 0.751109, 1.079919, 1.079919,
+              0.778357, 0.778357, 1.085620, 1.085620, 0.784530, 0.784530,
+              1.062455, 1.062455, 0.759940, 0.759940, 1.080101, 1.080101,
+              0.778552, 0.778552, 1.082402, 1.082402, 0.781035, 0.781035
+          ]],
           dtype=np.float32)
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
@@ -339,17 +347,16 @@ class RNNCellTest(test.TestCase):
             forget_bias=1.0,
             num_frequency_blocks=[num_shifts])
         inputs = constant_op.constant(
-            np.array([[1.0, 1.1, 1.2, 1.3],
-                      [2.0, 2.1, 2.2, 2.3],
-                      [3.0, 3.1, 3.2, 3.3]],
-                     dtype=np.float32),
+            np.array(
+                [[1.0, 1.1, 1.2, 1.3], [2.0, 2.1, 2.2, 2.3],
+                 [3.0, 3.1, 3.2, 3.3]],
+                dtype=np.float32),
             dtype=dtypes.float32)
         state_value = constant_op.constant(
-            0.1 * np.ones(
-                (batch_size, num_units), dtype=np.float32),
+            0.1 * np.ones((batch_size, num_units), dtype=np.float32),
             dtype=dtypes.float32)
-        init_state = cell.state_tuple_type(
-            *([state_value, state_value] * num_shifts * 2))
+        init_state = cell.state_tuple_type(*(
+            [state_value, state_value] * num_shifts * 2))
         output, state = cell(inputs, init_state)
         sess.run([variables.global_variables_initializer()])
         res = sess.run([output, state])
@@ -375,32 +382,40 @@ class RNNCellTest(test.TestCase):
       frequency_skip = 1
       num_shifts = int((input_size - feature_size) / frequency_skip + 1)
       expected_output = np.array(
-          [[0.464130, 0.464130, 0.419165, 0.419165, 0.593283, 0.593283,
-            0.738350, 0.738350, 0.661638, 0.661638, 0.866774, 0.866774,
-            0.322645, 0.322645, 0.276068, 0.276068, 0.584654, 0.584654,
-            0.690292, 0.690292, 0.640446, 0.640446, 0.840071, 0.840071],
-           [0.669636, 0.669636, 0.628966, 0.628966, 0.736057, 0.736057,
-            0.895927, 0.895927, 0.755559, 0.755559, 0.954359, 0.954359,
-            0.493625, 0.493625, 0.449236, 0.449236, 0.730828, 0.730828,
-            0.865996, 0.865996, 0.749429, 0.749429, 0.944958, 0.944958],
-           [0.751109, 0.751109, 0.711716, 0.711716, 0.778357, 0.778357,
-            0.940779, 0.940779, 0.784530, 0.784530, 0.980604, 0.980604,
-            0.608587, 0.608587, 0.566683, 0.566683, 0.777345, 0.777345,
-            0.925820, 0.925820, 0.782597, 0.782597, 0.976858, 0.976858]],
+          [[
+              0.464130, 0.464130, 0.419165, 0.419165, 0.593283, 0.593283,
+              0.738350, 0.738350, 0.661638, 0.661638, 0.866774, 0.866774,
+              0.322645, 0.322645, 0.276068, 0.276068, 0.584654, 0.584654,
+              0.690292, 0.690292, 0.640446, 0.640446, 0.840071, 0.840071
+          ], [
+              0.669636, 0.669636, 0.628966, 0.628966, 0.736057, 0.736057,
+              0.895927, 0.895927, 0.755559, 0.755559, 0.954359, 0.954359,
+              0.493625, 0.493625, 0.449236, 0.449236, 0.730828, 0.730828,
+              0.865996, 0.865996, 0.749429, 0.749429, 0.944958, 0.944958
+          ], [
+              0.751109, 0.751109, 0.711716, 0.711716, 0.778357, 0.778357,
+              0.940779, 0.940779, 0.784530, 0.784530, 0.980604, 0.980604,
+              0.608587, 0.608587, 0.566683, 0.566683, 0.777345, 0.777345,
+              0.925820, 0.925820, 0.782597, 0.782597, 0.976858, 0.976858
+          ]],
           dtype=np.float32)
       expected_state = np.array(
-          [[0.710660, 0.710660, 0.464130, 0.464130, 0.877293, 0.877293,
-            0.593283, 0.593283, 0.958505, 0.958505, 0.661638, 0.661638,
-            0.516575, 0.516575, 0.322645, 0.322645, 0.866628, 0.866628,
-            0.584654, 0.584654, 0.934002, 0.934002, 0.640446, 0.640446],
-           [0.967579, 0.967579, 0.669636, 0.669636, 1.038811, 1.038811,
-            0.736057, 0.736057, 1.058201, 1.058201, 0.755559, 0.755559,
-            0.749836, 0.749836, 0.493625, 0.493625, 1.033488, 1.033488,
-            0.730828, 0.730828, 1.052186, 1.052186, 0.749429, 0.749429],
-           [1.053842, 1.053842, 0.751109, 0.751109, 1.079919, 1.079919,
-            0.778357, 0.778357, 1.085620, 1.085620, 0.784530, 0.784530,
-            0.895999, 0.895999, 0.608587, 0.608587, 1.078978, 1.078978,
-            0.777345, 0.777345, 1.083843, 1.083843, 0.782597, 0.782597]],
+          [[
+              0.710660, 0.710660, 0.464130, 0.464130, 0.877293, 0.877293,
+              0.593283, 0.593283, 0.958505, 0.958505, 0.661638, 0.661638,
+              0.516575, 0.516575, 0.322645, 0.322645, 0.866628, 0.866628,
+              0.584654, 0.584654, 0.934002, 0.934002, 0.640446, 0.640446
+          ], [
+              0.967579, 0.967579, 0.669636, 0.669636, 1.038811, 1.038811,
+              0.736057, 0.736057, 1.058201, 1.058201, 0.755559, 0.755559,
+              0.749836, 0.749836, 0.493625, 0.493625, 1.033488, 1.033488,
+              0.730828, 0.730828, 1.052186, 1.052186, 0.749429, 0.749429
+          ], [
+              1.053842, 1.053842, 0.751109, 0.751109, 1.079919, 1.079919,
+              0.778357, 0.778357, 1.085620, 1.085620, 0.784530, 0.784530,
+              0.895999, 0.895999, 0.608587, 0.608587, 1.078978, 1.078978,
+              0.777345, 0.777345, 1.083843, 1.083843, 0.782597, 0.782597
+          ]],
           dtype=np.float32)
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
@@ -413,17 +428,16 @@ class RNNCellTest(test.TestCase):
             num_frequency_blocks=[num_shifts],
             backward_slice_offset=1)
         inputs = constant_op.constant(
-            np.array([[1.0, 1.1, 1.2, 1.3],
-                      [2.0, 2.1, 2.2, 2.3],
-                      [3.0, 3.1, 3.2, 3.3]],
-                     dtype=np.float32),
+            np.array(
+                [[1.0, 1.1, 1.2, 1.3], [2.0, 2.1, 2.2, 2.3],
+                 [3.0, 3.1, 3.2, 3.3]],
+                dtype=np.float32),
             dtype=dtypes.float32)
         state_value = constant_op.constant(
-            0.1 * np.ones(
-                (batch_size, num_units), dtype=np.float32),
+            0.1 * np.ones((batch_size, num_units), dtype=np.float32),
             dtype=dtypes.float32)
-        init_state = cell.state_tuple_type(
-            *([state_value, state_value] * num_shifts * 2))
+        init_state = cell.state_tuple_type(*(
+            [state_value, state_value] * num_shifts * 2))
         output, state = cell(inputs, init_state)
         sess.run([variables.global_variables_initializer()])
         res = sess.run([output, state])
@@ -474,8 +488,8 @@ class RNNCellTest(test.TestCase):
     for state_is_tuple in [False, True]:
       with ops.Graph().as_default():
         with self.test_session() as sess:
-          with variable_scope.variable_scope("state_is_tuple_" + str(
-              state_is_tuple)):
+          with variable_scope.variable_scope(
+              "state_is_tuple_" + str(state_is_tuple)):
             lstm_cell = rnn_cell.BasicLSTMCell(
                 num_units, state_is_tuple=state_is_tuple)
             cell = contrib_rnn_cell.AttentionCellWrapper(
@@ -525,16 +539,15 @@ class RNNCellTest(test.TestCase):
     for state_is_tuple in [False, True]:
       with ops.Graph().as_default():
         with self.test_session() as sess:
-          with variable_scope.variable_scope("state_is_tuple_" + str(
-              state_is_tuple)):
+          with variable_scope.variable_scope(
+              "state_is_tuple_" + str(state_is_tuple)):
             lstm_cell = rnn_cell.BasicLSTMCell(
                 num_units, state_is_tuple=state_is_tuple)
             cell = contrib_rnn_cell.AttentionCellWrapper(
                 lstm_cell, attn_length, state_is_tuple=state_is_tuple)
             if state_is_tuple:
               zeros = constant_op.constant(
-                  0.1 * np.ones(
-                      [batch_size, num_units], dtype=np.float32),
+                  0.1 * np.ones([batch_size, num_units], dtype=np.float32),
                   dtype=dtypes.float32)
               attn_state_zeros = constant_op.constant(
                   0.1 * np.ones(
@@ -579,22 +592,25 @@ class RNNCellTest(test.TestCase):
          [1.018088, 0.378983, -0.572179, 0.268591]],
         dtype=np.float32)
     expected_state = np.array(
-        [[0.74946702, 0.34681597, 0.26474735, 1.06485605, 0.38465962,
-          0.11420801, 0.10272158, 0.30925757, 0.63899988, 0.7181077,
-          0.47534478, 0.33715725, 0.58086717, 0.49446869, 0.7641536,
-          0.12814975, 0.92231739, 0.89857256, 0.21889746, 0.38442063,
-          0.53481543, 0.8876909, 0.45823169, 0.5905602, 0.78038228,
-          0.56501579, 0.03971386, 0.09870267, 0.8074435, 0.66821432,
-          0.99211812, 0.12295902, 1.14606023, 0.34370938, -0.79251152,
-          0.51843399],
-         [0.5179342, 0.48682183, -0.25426468, 0.96810579, 0.28809637,
-          0.13607743, -0.11446252, 0.26792109, 0.78047138, 0.63460857,
-          0.49122369, 0.52007174, 0.73000264, 0.66986895, 0.73576689,
-          0.86301267, 0.87887371, 0.35185754, 0.93417215, 0.64732957,
-          0.63173044, 0.66627824, 0.53644657, 0.20477486, 0.98458421,
-          0.38277245, 0.03746676, 0.92510188, 0.57714164, 0.84932971,
-          0.36127412, 0.12125921, 1.1362772, 0.34361625, -0.78150457,
-          0.70582712]],
+        [[
+            0.74946702, 0.34681597, 0.26474735, 1.06485605, 0.38465962,
+            0.11420801, 0.10272158, 0.30925757, 0.63899988, 0.7181077,
+            0.47534478, 0.33715725, 0.58086717, 0.49446869, 0.7641536,
+            0.12814975, 0.92231739, 0.89857256, 0.21889746, 0.38442063,
+            0.53481543, 0.8876909, 0.45823169, 0.5905602, 0.78038228,
+            0.56501579, 0.03971386, 0.09870267, 0.8074435, 0.66821432,
+            0.99211812, 0.12295902, 1.14606023, 0.34370938, -0.79251152,
+            0.51843399
+        ], [
+            0.5179342, 0.48682183, -0.25426468, 0.96810579, 0.28809637,
+            0.13607743, -0.11446252, 0.26792109, 0.78047138, 0.63460857,
+            0.49122369, 0.52007174, 0.73000264, 0.66986895, 0.73576689,
+            0.86301267, 0.87887371, 0.35185754, 0.93417215, 0.64732957,
+            0.63173044, 0.66627824, 0.53644657, 0.20477486, 0.98458421,
+            0.38277245, 0.03746676, 0.92510188, 0.57714164, 0.84932971,
+            0.36127412, 0.12125921, 1.1362772, 0.34361625, -0.78150457,
+            0.70582712
+        ]],
         dtype=np.float32)
     seed = 12345
     random_seed.set_random_seed(seed)
@@ -602,7 +618,8 @@ class RNNCellTest(test.TestCase):
     for state_is_tuple in [False, True]:
       with session.Session() as sess:
         with variable_scope.variable_scope(
-            "state_is_tuple", reuse=state_is_tuple,
+            "state_is_tuple",
+            reuse=state_is_tuple,
             initializer=init_ops.glorot_uniform_initializer()):
           lstm_cell = rnn_cell.BasicLSTMCell(
               num_units, state_is_tuple=state_is_tuple)
@@ -646,36 +663,31 @@ class RNNCellTest(test.TestCase):
   def testNASCell(self):
     num_units = 6
     batch_size = 3
-    expected_output = np.array([[0.576751, 0.576751, 0.576751, 0.576751,
-                                 0.576751, 0.576751],
-                                [0.618936, 0.618936, 0.618936, 0.618936,
-                                 0.618936, 0.618936],
-                                [0.627393, 0.627393, 0.627393, 0.627393,
-                                 0.627393, 0.627393]])
-    expected_state = np.array([[0.71579772, 0.71579772, 0.71579772, 0.71579772,
-                                0.71579772, 0.71579772, 0.57675087, 0.57675087,
-                                0.57675087, 0.57675087, 0.57675087, 0.57675087],
-                               [0.78041625, 0.78041625, 0.78041625, 0.78041625,
-                                0.78041625, 0.78041625, 0.6189357, 0.6189357,
-                                0.61893570, 0.6189357, 0.6189357, 0.6189357],
-                               [0.79457647, 0.79457647, 0.79457647, 0.79457647,
-                                0.79457653, 0.79457653, 0.62739348, 0.62739348,
-                                0.62739348, 0.62739348, 0.62739348, 0.62739348]
-                              ])
+    expected_output = np.array(
+        [[0.576751, 0.576751, 0.576751, 0.576751, 0.576751, 0.576751],
+         [0.618936, 0.618936, 0.618936, 0.618936, 0.618936, 0.618936],
+         [0.627393, 0.627393, 0.627393, 0.627393, 0.627393, 0.627393]])
+    expected_state = np.array([[
+        0.71579772, 0.71579772, 0.71579772, 0.71579772, 0.71579772, 0.71579772,
+        0.57675087, 0.57675087, 0.57675087, 0.57675087, 0.57675087, 0.57675087
+    ], [
+        0.78041625, 0.78041625, 0.78041625, 0.78041625, 0.78041625, 0.78041625,
+        0.6189357, 0.6189357, 0.61893570, 0.6189357, 0.6189357, 0.6189357
+    ], [
+        0.79457647, 0.79457647, 0.79457647, 0.79457647, 0.79457653, 0.79457653,
+        0.62739348, 0.62739348, 0.62739348, 0.62739348, 0.62739348, 0.62739348
+    ]])
     with self.test_session() as sess:
       with variable_scope.variable_scope(
-          "nas_test",
-          initializer=init_ops.constant_initializer(0.5)):
+          "nas_test", initializer=init_ops.constant_initializer(0.5)):
         cell = contrib_rnn_cell.NASCell(num_units=num_units)
         inputs = constant_op.constant(
-            np.array([[1., 1., 1., 1.],
-                      [2., 2., 2., 2.],
-                      [3., 3., 3., 3.]],
-                     dtype=np.float32),
+            np.array(
+                [[1., 1., 1., 1.], [2., 2., 2., 2.], [3., 3., 3., 3.]],
+                dtype=np.float32),
             dtype=dtypes.float32)
         state_value = constant_op.constant(
-            0.1 * np.ones(
-                (batch_size, num_units), dtype=np.float32),
+            0.1 * np.ones((batch_size, num_units), dtype=np.float32),
             dtype=dtypes.float32)
         init_state = rnn_cell.LSTMStateTuple(state_value, state_value)
         output, state = cell(inputs, init_state)
@@ -699,39 +711,34 @@ class RNNCellTest(test.TestCase):
     num_units = 6
     batch_size = 3
     num_proj = 5
-    expected_output = np.array([[1.697418, 1.697418, 1.697418, 1.697418,
-                                 1.697418],
-                                [1.840037, 1.840037, 1.840037, 1.840037,
-                                 1.840037],
-                                [1.873985, 1.873985, 1.873985, 1.873985,
-                                 1.873985]])
-    expected_state = np.array([[0.69855207, 0.69855207, 0.69855207, 0.69855207,
-                                0.69855207, 0.69855207, 1.69741797, 1.69741797,
-                                1.69741797, 1.69741797, 1.69741797],
-                               [0.77073824, 0.77073824, 0.77073824, 0.77073824,
-                                0.77073824, 0.77073824, 1.84003687, 1.84003687,
-                                1.84003687, 1.84003687, 1.84003687],
-                               [0.78973997, 0.78973997, 0.78973997, 0.78973997,
-                                0.78973997, 0.78973997, 1.87398517, 1.87398517,
-                                1.87398517, 1.87398517, 1.87398517]])
+    expected_output = np.array(
+        [[1.697418, 1.697418, 1.697418, 1.697418,
+          1.697418], [1.840037, 1.840037, 1.840037, 1.840037, 1.840037],
+         [1.873985, 1.873985, 1.873985, 1.873985, 1.873985]])
+    expected_state = np.array([[
+        0.69855207, 0.69855207, 0.69855207, 0.69855207, 0.69855207, 0.69855207,
+        1.69741797, 1.69741797, 1.69741797, 1.69741797, 1.69741797
+    ], [
+        0.77073824, 0.77073824, 0.77073824, 0.77073824, 0.77073824, 0.77073824,
+        1.84003687, 1.84003687, 1.84003687, 1.84003687, 1.84003687
+    ], [
+        0.78973997, 0.78973997, 0.78973997, 0.78973997, 0.78973997, 0.78973997,
+        1.87398517, 1.87398517, 1.87398517, 1.87398517, 1.87398517
+    ]])
     with self.test_session() as sess:
       with variable_scope.variable_scope(
-          "nas_proj_test",
-          initializer=init_ops.constant_initializer(0.5)):
+          "nas_proj_test", initializer=init_ops.constant_initializer(0.5)):
         cell = contrib_rnn_cell.NASCell(num_units=num_units, num_proj=num_proj)
         inputs = constant_op.constant(
-            np.array([[1., 1., 1., 1.],
-                      [2., 2., 2., 2.],
-                      [3., 3., 3., 3.]],
-                     dtype=np.float32),
+            np.array(
+                [[1., 1., 1., 1.], [2., 2., 2., 2.], [3., 3., 3., 3.]],
+                dtype=np.float32),
             dtype=dtypes.float32)
         state_value_c = constant_op.constant(
-            0.1 * np.ones(
-                (batch_size, num_units), dtype=np.float32),
+            0.1 * np.ones((batch_size, num_units), dtype=np.float32),
             dtype=dtypes.float32)
         state_value_h = constant_op.constant(
-            0.1 * np.ones(
-                (batch_size, num_proj), dtype=np.float32),
+            0.1 * np.ones((batch_size, num_proj), dtype=np.float32),
             dtype=dtypes.float32)
         init_state = rnn_cell.LSTMStateTuple(state_value_c, state_value_h)
         output, state = cell(inputs, init_state)
@@ -755,24 +762,20 @@ class RNNCellTest(test.TestCase):
     num_units = 2
     batch_size = 3
     expected_state_and_output = np.array(
-        [[0.13752282, 0.13752282],
-         [0.10545051, 0.10545051],
+        [[0.13752282, 0.13752282], [0.10545051, 0.10545051],
          [0.10074195, 0.10074195]],
         dtype=np.float32)
     with self.test_session() as sess:
       with variable_scope.variable_scope(
-          "ugrnn_cell_test",
-          initializer=init_ops.constant_initializer(0.5)):
+          "ugrnn_cell_test", initializer=init_ops.constant_initializer(0.5)):
         cell = contrib_rnn_cell.UGRNNCell(num_units=num_units)
         inputs = constant_op.constant(
-            np.array([[1., 1., 1., 1.],
-                      [2., 2., 2., 2.],
-                      [3., 3., 3., 3.]],
-                     dtype=np.float32),
+            np.array(
+                [[1., 1., 1., 1.], [2., 2., 2., 2.], [3., 3., 3., 3.]],
+                dtype=np.float32),
             dtype=dtypes.float32)
         init_state = constant_op.constant(
-            0.1 * np.ones(
-                (batch_size, num_units), dtype=np.float32),
+            0.1 * np.ones((batch_size, num_units), dtype=np.float32),
             dtype=dtypes.float32)
         output, state = cell(inputs, init_state)
         sess.run([variables.global_variables_initializer()])
@@ -786,13 +789,11 @@ class RNNCellTest(test.TestCase):
     num_units = 2
     batch_size = 3
     expected_state = np.array(
-        [[0.13752282, 0.13752282],
-         [0.10545051, 0.10545051],
+        [[0.13752282, 0.13752282], [0.10545051, 0.10545051],
          [0.10074195, 0.10074195]],
         dtype=np.float32)
     expected_output = np.array(
-        [[2.00431061, 2.00431061],
-         [4.00060606, 4.00060606],
+        [[2.00431061, 2.00431061], [4.00060606, 4.00060606],
          [6.00008249, 6.00008249]],
         dtype=np.float32)
     with self.test_session() as sess:
@@ -802,14 +803,12 @@ class RNNCellTest(test.TestCase):
         cell = contrib_rnn_cell.IntersectionRNNCell(
             num_units=num_units, num_in_proj=num_units)
         inputs = constant_op.constant(
-            np.array([[1., 1., 1., 1.],
-                      [2., 2., 2., 2.],
-                      [3., 3., 3., 3.]],
-                     dtype=np.float32),
+            np.array(
+                [[1., 1., 1., 1.], [2., 2., 2., 2.], [3., 3., 3., 3.]],
+                dtype=np.float32),
             dtype=dtypes.float32)
         init_state = constant_op.constant(
-            0.1 * np.ones(
-                (batch_size, num_units), dtype=np.float32),
+            0.1 * np.ones((batch_size, num_units), dtype=np.float32),
             dtype=dtypes.float32)
         output, state = cell(inputs, init_state)
         sess.run([variables.global_variables_initializer()])
@@ -824,19 +823,17 @@ class RNNCellTest(test.TestCase):
     batch_size = 3
     cell = contrib_rnn_cell.IntersectionRNNCell(num_units=num_units)
     inputs = constant_op.constant(
-        np.array([[1., 1., 1., 1.],
-                  [2., 2., 2., 2.],
-                  [3., 3., 3., 3.]],
-                 dtype=np.float32),
+        np.array(
+            [[1., 1., 1., 1.], [2., 2., 2., 2.], [3., 3., 3., 3.]],
+            dtype=np.float32),
         dtype=dtypes.float32)
     init_state = constant_op.constant(
-        0.1 * np.ones(
-            (batch_size, num_units), dtype=np.float32),
+        0.1 * np.ones((batch_size, num_units), dtype=np.float32),
         dtype=dtypes.float32)
-    with self.assertRaisesRegexp(
-        ValueError, "Must have input size == output size for "
-                    "Intersection RNN. To fix, num_in_proj should "
-                    "be set to num_units at cell init."):
+    with self.assertRaisesRegexp(ValueError,
+                                 "Must have input size == output size for "
+                                 "Intersection RNN. To fix, num_in_proj should "
+                                 "be set to num_units at cell init."):
       cell(inputs, init_state)
 
   def testPhasedLSTMCell(self):
@@ -845,12 +842,12 @@ class RNNCellTest(test.TestCase):
       batch_size = 3
       input_size = 4
       expected_state_c = np.array(
-          [[0.00072015, 0.00036633], [0.00083481, 0.00047266],
-           [0.00085111, 0.00053054]],
+          [[6.450831e-04, 4.697885e-04], [9.862894e-05, 7.212213e-04],
+           [4.401947e-04, 9.143004e-04]],
           dtype=np.float32)
       expected_state_h = np.array(
-          [[0.0005159, 0.00026243], [0.00062958, 0.00035646],
-           [0.00064732, 0.00040351]],
+          [[4.621217e-04, 3.365449e-04], [7.438179e-05, 5.439147e-04],
+           [3.347936e-04, 6.953785e-04]],
           dtype=np.float32)
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
@@ -862,14 +859,14 @@ class RNNCellTest(test.TestCase):
         output, state = contrib_rnn_cell.PhasedLSTMCell(num_units=num_units)(
             (t, x), state0)
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([output, state], {
-            t.name:
-                np.array([[1.], [2.], [3.]]),
-            x.name:
-                np.array([[1., 1., 1., 1.],
-                          [2., 2., 2., 2.],
-                          [3., 3., 3., 3.]]),
-        })
+        res = sess.run(
+            [output, state], {
+                t.name:
+                    np.array([[1.], [2.], [3.]]),
+                x.name:
+                    np.array([[1., 1., 1., 1.], [2., 2., 2., 2.],
+                              [3., 3., 3., 3.]]),
+            })
         # This is a smoke test, making sure expected values are unchanged.
         self.assertEqual(len(res), 2)
         self.assertAllClose(res[0], res[1].h)
@@ -878,36 +875,32 @@ class RNNCellTest(test.TestCase):
 
   def testConv1DLSTMCell(self):
     with self.test_session() as sess:
-      shape = [2,1]
+      shape = [2, 1]
       filter_size = [3]
       num_features = 1
       batch_size = 2
       expected_state_c = np.array(
-          [[[1.4375670191], [1.4375670191]],
-           [[2.7542609292], [2.7542609292]]],
+          [[[1.4375670191], [1.4375670191]], [[2.7542609292], [2.7542609292]]],
           dtype=np.float32)
       expected_state_h = np.array(
-          [[[0.6529865603], [0.6529865603]],
-           [[0.8736877431], [0.8736877431]]],
+          [[[0.6529865603], [0.6529865603]], [[0.8736877431], [0.8736877431]]],
           dtype=np.float32)
       with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(1.0/2.0)):
+          "root", initializer=init_ops.constant_initializer(1.0 / 2.0)):
         x = array_ops.placeholder(dtypes.float32, [None, None, 1])
-        cell = contrib_rnn_cell.Conv1DLSTMCell(input_shape=shape,
-                                               kernel_shape=filter_size,
-                                               output_channels=num_features)
+        cell = contrib_rnn_cell.Conv1DLSTMCell(
+            input_shape=shape,
+            kernel_shape=filter_size,
+            output_channels=num_features)
         hidden = cell.zero_state(array_ops.shape(x)[0], dtypes.float32)
         output, state = cell(x, hidden)
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([output, state], {
-            hidden[0].name:
-                np.array([[[1.],[1.]],
-                          [[2.],[2.]]]),
-            x.name:
-                np.array([[[1.],[1.]],
-                          [[2.],[2.]]]),
-        })
+        res = sess.run(
+            [output, state], {
+                hidden[0].name: np.array([[[1.], [1.]], [[2.], [2.]]]),
+                x.name: np.array([[[1.], [1.]], [[2.], [2.]]]),
+            })
         # This is a smoke test, making sure expected values are unchanged.
         self.assertEqual(len(res), 2)
         self.assertAllClose(res[0], res[1].h)
@@ -916,44 +909,40 @@ class RNNCellTest(test.TestCase):
 
   def testConv2DLSTMCell(self):
     with self.test_session() as sess:
-      shape = [2,2,1]
-      filter_size = [3,3]
+      shape = [2, 2, 1]
+      filter_size = [3, 3]
       num_features = 1
       batch_size = 2
       expected_state_c = np.array(
-          [[[[1.4375670191], [1.4375670191]],
-            [[1.4375670191], [1.4375670191]]],
-           [[[2.7542609292], [2.7542609292]],
-            [[2.7542609292], [2.7542609292]]]],
+          [[[[1.4375670191], [1.4375670191]], [[1.4375670191], [1.4375670191]]],
+           [[[2.7542609292], [2.7542609292]], [[2.7542609292], [2.7542609292]]
+           ]],
           dtype=np.float32)
       expected_state_h = np.array(
-          [[[[0.6529865603], [0.6529865603]],
-            [[0.6529865603], [0.6529865603]]],
-           [[[0.8736877431], [0.8736877431]],
-            [[0.8736877431], [0.8736877431]]]],
+          [[[[0.6529865603], [0.6529865603]], [[0.6529865603], [0.6529865603]]],
+           [[[0.8736877431], [0.8736877431]], [[0.8736877431], [0.8736877431]]
+           ]],
           dtype=np.float32)
       with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(1.0/4.0)):
+          "root", initializer=init_ops.constant_initializer(1.0 / 4.0)):
         x = array_ops.placeholder(dtypes.float32, [None, None, None, 1])
-        cell = contrib_rnn_cell.Conv2DLSTMCell(input_shape=shape,
-                                               kernel_shape=filter_size,
-                                               output_channels=num_features)
+        cell = contrib_rnn_cell.Conv2DLSTMCell(
+            input_shape=shape,
+            kernel_shape=filter_size,
+            output_channels=num_features)
         hidden = cell.zero_state(array_ops.shape(x)[0], dtypes.float32)
         output, state = cell(x, hidden)
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([output, state], {
-            hidden[0].name:
-                np.array([[[[1.],[1.]],
-                           [[1.],[1.]]],
-                          [[[2.],[2.]],
-                           [[2.],[2.]]]]),
-            x.name:
-                np.array([[[[1.],[1.]],
-                           [[1.],[1.]]],
-                          [[[2.],[2.]],
-                           [[2.],[2.]]]]),
-        })
+        res = sess.run(
+            [output, state], {
+                hidden[0].name:
+                    np.array([[[[1.], [1.]], [[1.], [1.]]], [[[2.], [2.]],
+                                                             [[2.], [2.]]]]),
+                x.name:
+                    np.array([[[[1.], [1.]], [[1.], [1.]]], [[[2.], [2.]],
+                                                             [[2.], [2.]]]]),
+            })
         # This is a smoke test, making sure expected values are unchanged.
         self.assertEqual(len(res), 2)
         self.assertAllClose(res[0], res[1].h)
@@ -962,36 +951,33 @@ class RNNCellTest(test.TestCase):
 
   def testConv3DLSTMCell(self):
     with self.test_session() as sess:
-      shape = [2,2,2,1]
-      filter_size = [3,3,3]
+      shape = [2, 2, 2, 1]
+      filter_size = [3, 3, 3]
       num_features = 1
       batch_size = 2
       expected_state_c = np.array(
-         [[[[[1.4375670191], [1.4375670191]],
-            [[1.4375670191], [1.4375670191]]],
-           [[[1.4375670191], [1.4375670191]],
-            [[1.4375670191], [1.4375670191]]]],
-          [[[[2.7542609292], [2.7542609292]],
-            [[2.7542609292], [2.7542609292]]],
-           [[[2.7542609292], [2.7542609292]],
-            [[2.7542609292], [2.7542609292]]]]],
+          [[[[[1.4375670191], [1.4375670191]], [[1.4375670191], [1.4375670191]]
+            ], [[[1.4375670191], [1.4375670191]], [[1.4375670191],
+                                                   [1.4375670191]]]],
+           [[[[2.7542609292], [2.7542609292]], [[2.7542609292], [2.7542609292]]
+            ], [[[2.7542609292], [2.7542609292]], [[2.7542609292],
+                                                   [2.7542609292]]]]],
           dtype=np.float32)
       expected_state_h = np.array(
-         [[[[[0.6529865603], [0.6529865603]],
-            [[0.6529865603], [0.6529865603]]],
-           [[[0.6529865603], [0.6529865603]],
-            [[0.6529865603], [0.6529865603]]]],
-          [[[[0.8736877431], [0.8736877431]],
-            [[0.8736877431], [0.8736877431]]],
-           [[[0.8736877431], [0.8736877431]],
-            [[0.8736877431], [0.8736877431]]]]],
+          [[[[[0.6529865603], [0.6529865603]], [[0.6529865603], [0.6529865603]]
+            ], [[[0.6529865603], [0.6529865603]], [[0.6529865603],
+                                                   [0.6529865603]]]],
+           [[[[0.8736877431], [0.8736877431]], [[0.8736877431], [0.8736877431]]
+            ], [[[0.8736877431], [0.8736877431]], [[0.8736877431],
+                                                   [0.8736877431]]]]],
           dtype=np.float32)
       with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(1.0/8.0)):
+          "root", initializer=init_ops.constant_initializer(1.0 / 8.0)):
         x = array_ops.placeholder(dtypes.float32, [None, None, None, None, 1])
-        cell = contrib_rnn_cell.Conv3DLSTMCell(input_shape=shape,
-                                               kernel_shape=filter_size,
-                                               output_channels=num_features)
+        cell = contrib_rnn_cell.Conv3DLSTMCell(
+            input_shape=shape,
+            kernel_shape=filter_size,
+            output_channels=num_features)
         hidden = cell.zero_state(array_ops.shape(x)[0], dtypes.float32)
         output, state = cell(x, hidden)
 
@@ -1054,8 +1040,8 @@ class RNNCellTest(test.TestCase):
             num_units=num_units, number_of_groups=number_of_groups)
         cell = rnn_cell.LSTMCell(num_units=num_units)
         self.assertTrue(isinstance(gcell.state_size, tuple))
-        zero_state = gcell.zero_state(batch_size=batch_size,
-                                      dtype=dtypes.float32)
+        zero_state = gcell.zero_state(
+            batch_size=batch_size, dtype=dtypes.float32)
         gh, gs = gcell(x, zero_state)
         h, g = cell(x, zero_state)
 
@@ -1078,16 +1064,16 @@ class RNNCellTest(test.TestCase):
         glstm_input = array_ops.ones([batch_size, num_units])
         gcell = contrib_rnn_cell.GLSTMCell(
             num_units=num_units, number_of_groups=number_of_groups)
-        gcell_zero_state = gcell.zero_state(batch_size=batch_size,
-                                            dtype=dtypes.float32)
+        gcell_zero_state = gcell.zero_state(
+            batch_size=batch_size, dtype=dtypes.float32)
         gh, gs = gcell(glstm_input, gcell_zero_state)
 
         # input for LSTM cell simulating single G-LSTM group
         lstm_input = array_ops.ones([batch_size, num_units / number_of_groups])
         # note division by number_of_groups. This cell one simulates G-LSTM group
         cell = rnn_cell.LSTMCell(num_units=int(num_units / number_of_groups))
-        cell_zero_state = cell.zero_state(batch_size=batch_size,
-                                          dtype=dtypes.float32)
+        cell_zero_state = cell.zero_state(
+            batch_size=batch_size, dtype=dtypes.float32)
         h, g = cell(lstm_input, cell_zero_state)
 
         sess.run([variables.global_variables_initializer()])
@@ -1097,6 +1083,7 @@ class RNNCellTest(test.TestCase):
         self.assertAllClose(gh_res[:, int(num_units / number_of_groups):],
                             h_res, 1e-5)
 
+
 class LayerNormBasicLSTMCellTest(test.TestCase):
 
   # NOTE: all the values in the current test case have been calculated.
@@ -1117,13 +1104,14 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
         cell = rnn_cell.MultiRNNCell([single_cell() for _ in range(2)])
         g, out_m = cell(x, state)
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([g, out_m], {
-            x.name: np.array([[1., 1.]]),
-            c0.name: 0.1 * np.asarray([[0, 1]]),
-            h0.name: 0.1 * np.asarray([[2, 3]]),
-            c1.name: 0.1 * np.asarray([[4, 5]]),
-            h1.name: 0.1 * np.asarray([[6, 7]]),
-        })
+        res = sess.run(
+            [g, out_m], {
+                x.name: np.array([[1., 1.]]),
+                c0.name: 0.1 * np.asarray([[0, 1]]),
+                h0.name: 0.1 * np.asarray([[2, 3]]),
+                c1.name: 0.1 * np.asarray([[4, 5]]),
+                h1.name: 0.1 * np.asarray([[6, 7]]),
+            })
 
         expected_h = np.array([[-0.38079708, 0.38079708]])
         expected_state0_c = np.array([[-1.0, 1.0]])
@@ -1153,11 +1141,12 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
         cell = contrib_rnn_cell.LayerNormBasicLSTMCell(2)
         g, out_m = cell(x, state)
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([g, out_m], {
-            x.name: np.array([[1., 1., 1.]]),
-            c.name: 0.1 * np.asarray([[0, 1]]),
-            h.name: 0.1 * np.asarray([[2, 3]]),
-        })
+        res = sess.run(
+            [g, out_m], {
+                x.name: np.array([[1., 1., 1.]]),
+                c.name: 0.1 * np.asarray([[0, 1]]),
+                h.name: 0.1 * np.asarray([[2, 3]]),
+            })
 
         expected_h = np.array([[-0.38079708, 0.38079708]])
         expected_c = np.array([[-1.0, 1.0]])
@@ -1166,7 +1155,6 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
         self.assertAllClose(res[1].c, expected_c, 1e-5)
         self.assertAllClose(res[1].h, expected_h, 1e-5)
 
-
   def testBasicLSTMCellWithoutNorm(self):
     """Tests that BasicLSTMCell with layer_norm=False."""
     with self.test_session() as sess:
@@ -1184,19 +1172,20 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
         cell = rnn_cell.MultiRNNCell([single_cell() for _ in range(2)])
         g, out_m = cell(x, state)
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([g, out_m], {
-          x.name: np.array([[1., 1.]]),
-          c0.name: 0.1 * np.asarray([[0, 1]]),
-          h0.name: 0.1 * np.asarray([[2, 3]]),
-          c1.name: 0.1 * np.asarray([[4, 5]]),
-          h1.name: 0.1 * np.asarray([[6, 7]]),
-        })
+        res = sess.run(
+            [g, out_m], {
+                x.name: np.array([[1., 1.]]),
+                c0.name: 0.1 * np.asarray([[0, 1]]),
+                h0.name: 0.1 * np.asarray([[2, 3]]),
+                c1.name: 0.1 * np.asarray([[4, 5]]),
+                h1.name: 0.1 * np.asarray([[6, 7]]),
+            })
 
-        expected_h = np.array([[ 0.70230919, 0.72581059]])
-        expected_state0_c = np.array([[ 0.8020075,  0.89599884]])
-        expected_state0_h = np.array([[ 0.56668288,  0.60858738]])
-        expected_state1_c = np.array([[ 1.17500675,  1.26892781]])
-        expected_state1_h = np.array([[ 0.70230919,  0.72581059]])
+        expected_h = np.array([[0.70230919, 0.72581059]])
+        expected_state0_c = np.array([[0.8020075, 0.89599884]])
+        expected_state0_h = np.array([[0.56668288, 0.60858738]])
+        expected_state1_c = np.array([[1.17500675, 1.26892781]])
+        expected_state1_h = np.array([[0.70230919, 0.72581059]])
 
         actual_h = res[0]
         actual_state0_c = res[1][0].c
@@ -1213,21 +1202,22 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
       with variable_scope.variable_scope(
           "other", initializer=init_ops.constant_initializer(0.5)) as vs:
         x = array_ops.zeros(
-          [1, 3])  # Test BasicLSTMCell with input_size != num_units.
+            [1, 3])  # Test BasicLSTMCell with input_size != num_units.
         c = array_ops.zeros([1, 2])
         h = array_ops.zeros([1, 2])
         state = rnn_cell.LSTMStateTuple(c, h)
         cell = contrib_rnn_cell.LayerNormBasicLSTMCell(2, layer_norm=False)
         g, out_m = cell(x, state)
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([g, out_m], {
-          x.name: np.array([[1., 1., 1.]]),
-          c.name: 0.1 * np.asarray([[0, 1]]),
-          h.name: 0.1 * np.asarray([[2, 3]]),
-        })
-
-        expected_h = np.array([[ 0.64121795, 0.68166804]])
-        expected_c = np.array([[ 0.88477188, 0.98103917]])
+        res = sess.run(
+            [g, out_m], {
+                x.name: np.array([[1., 1., 1.]]),
+                c.name: 0.1 * np.asarray([[0, 1]]),
+                h.name: 0.1 * np.asarray([[2, 3]]),
+            })
+
+        expected_h = np.array([[0.64121795, 0.68166804]])
+        expected_c = np.array([[0.88477188, 0.98103917]])
         self.assertEqual(len(res), 2)
         self.assertAllClose(res[0], expected_h, 1e-5)
         self.assertAllClose(res[1].c, expected_c, 1e-5)
@@ -1248,13 +1238,14 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
             [contrib_rnn_cell.LayerNormBasicLSTMCell(2) for _ in range(2)])
         h, (s0, s1) = cell(x, (state0, state1))
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([h, s0, s1], {
-            x.name: np.array([[1., 1.]]),
-            c0.name: 0.1 * np.asarray([[0, 1]]),
-            h0.name: 0.1 * np.asarray([[2, 3]]),
-            c1.name: 0.1 * np.asarray([[4, 5]]),
-            h1.name: 0.1 * np.asarray([[6, 7]]),
-        })
+        res = sess.run(
+            [h, s0, s1], {
+                x.name: np.array([[1., 1.]]),
+                c0.name: 0.1 * np.asarray([[0, 1]]),
+                h0.name: 0.1 * np.asarray([[2, 3]]),
+                c1.name: 0.1 * np.asarray([[4, 5]]),
+                h1.name: 0.1 * np.asarray([[6, 7]]),
+            })
 
         expected_h = np.array([[-0.38079708, 0.38079708]])
         expected_h0 = np.array([[-0.38079708, 0.38079708]])
@@ -1328,7 +1319,7 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
     h_low = 0.761552567265
     h_high = 0.995008519604
     num_units = 5
-    allowed_low = [2, 3]
+    allowed_low = [1, 2, 3]
 
     with self.test_session() as sess:
       with variable_scope.variable_scope(
@@ -1342,11 +1333,12 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
 
         g, s = cell(x, state)
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([g, s], {
-            x.name: np.ones([1, 5]),
-            c.name: np.ones([1, 5]),
-            h.name: np.ones([1, 5]),
-        })
+        res = sess.run(
+            [g, s], {
+                x.name: np.ones([1, 5]),
+                c.name: np.ones([1, 5]),
+                h.name: np.ones([1, 5]),
+            })
 
         # Since the returned tensors are of size [1,n]
         # get the first component right now.
@@ -1372,35 +1364,35 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
         self.assertIn(dropped_count, allowed_low)
 
 
-def _create_multi_lstm_cell_ops(batch_size, num_units, input_depth,
-                                num_layers, max_time, compiled):
+def _create_multi_lstm_cell_ops(batch_size, num_units, input_depth, num_layers,
+                                max_time, compiled):
   with variable_scope.variable_scope(
       "root",
       initializer=init_ops.random_uniform_initializer(-0.1, 0.1, seed=2)):
     inputs = variable_scope.get_variable(
-        "inputs", initializer=random_ops.random_uniform(
+        "inputs",
+        initializer=random_ops.random_uniform(
             (max_time, batch_size, input_depth), seed=1))
     maybe_xla = lambda c: contrib_rnn_cell.CompiledWrapper(c) if compiled else c
     cell = rnn_cell.MultiRNNCell(
         [maybe_xla(rnn_cell.LSTMCell(num_units)) for _ in range(num_layers)])
-    initial_state = cell.zero_state(
-        batch_size=batch_size, dtype=dtypes.float32)
+    initial_state = cell.zero_state(batch_size=batch_size, dtype=dtypes.float32)
     outputs, final_state = rnn.dynamic_rnn(
-        cell=cell, inputs=inputs, initial_state=initial_state,
-        time_major=True)
+        cell=cell, inputs=inputs, initial_state=initial_state, time_major=True)
     flat_final_state = nest.flatten(final_state)
     trainable_variables = variables.trainable_variables()
     outputs_grad = gradients_impl.gradients(
-        [outputs],
-        trainable_variables + [inputs] + nest.flatten(initial_state))
+        [outputs], trainable_variables + [inputs] + nest.flatten(initial_state))
     final_state_grad = gradients_impl.gradients(
         flat_final_state,
         trainable_variables + [inputs] + nest.flatten(initial_state))
 
-    return {"outputs": outputs,
-            "final_state": flat_final_state,
-            "outputs_grad": outputs_grad,
-            "final_state_grad": final_state_grad}
+    return {
+        "outputs": outputs,
+        "final_state": flat_final_state,
+        "outputs_grad": outputs_grad,
+        "final_state_grad": final_state_grad
+    }
 
 
 class CompiledWrapperTest(test.TestCase):
@@ -1418,8 +1410,10 @@ class CompiledWrapperTest(test.TestCase):
     random_seed.set_random_seed(1234)
     with self.test_session(graph=ops.Graph()) as sess:
       xla_ops = _create_multi_lstm_cell_ops(
-          batch_size=batch_size, num_units=num_units,
-          input_depth=input_depth, num_layers=num_layers,
+          batch_size=batch_size,
+          num_units=num_units,
+          input_depth=input_depth,
+          num_layers=num_layers,
           max_time=max_time,
           compiled=True)
       sess.run([variables.global_variables_initializer()])
@@ -1428,8 +1422,10 @@ class CompiledWrapperTest(test.TestCase):
     random_seed.set_random_seed(1234)
     with self.test_session(graph=ops.Graph()) as sess:
       non_xla_ops = _create_multi_lstm_cell_ops(
-          batch_size=batch_size, num_units=num_units,
-          input_depth=input_depth, num_layers=num_layers,
+          batch_size=batch_size,
+          num_units=num_units,
+          input_depth=input_depth,
+          num_layers=num_layers,
           max_time=max_time,
           compiled=False)
       sess.run([variables.global_variables_initializer()])
@@ -1438,16 +1434,16 @@ class CompiledWrapperTest(test.TestCase):
     self.assertAllClose(
         non_xla_results["outputs"], xla_results["outputs"], atol=atol)
 
-    for xla_value, non_xla_value in zip(
-        xla_results["final_state"], non_xla_results["final_state"]):
+    for xla_value, non_xla_value in zip(xla_results["final_state"],
+                                        non_xla_results["final_state"]):
       self.assertAllClose(xla_value, non_xla_value, atol=atol)
 
-    for xla_g, non_xla_g in zip(
-        xla_results["outputs_grad"], non_xla_results["outputs_grad"]):
+    for xla_g, non_xla_g in zip(xla_results["outputs_grad"],
+                                non_xla_results["outputs_grad"]):
       self.assertAllClose(xla_g, non_xla_g, atol=atol)
 
-    for xla_g, non_xla_g in zip(
-        xla_results["final_state_grad"], non_xla_results["final_state_grad"]):
+    for xla_g, non_xla_g in zip(xla_results["final_state_grad"],
+                                non_xla_results["final_state_grad"]):
       self.assertAllClose(xla_g, non_xla_g, atol=atol)
 
   def testMultiRNNCellWithStateTuple(self):
@@ -1461,19 +1457,20 @@ class CompiledWrapperTest(test.TestCase):
         # Test incorrectness of state
         with self.assertRaisesRegexp(ValueError, "Expected state .* a tuple"):
           rnn_cell.MultiRNNCell(
-              [rnn_cell.GRUCell(2)
-               for _ in range(2)], state_is_tuple=True)(x, m_bad)
+              [rnn_cell.GRUCell(2) for _ in range(2)],
+              state_is_tuple=True)(x, m_bad)
 
         _, ml = rnn_cell.MultiRNNCell(
-            [rnn_cell.GRUCell(2)
-             for _ in range(2)], state_is_tuple=True)(x, m_good)
+            [rnn_cell.GRUCell(2) for _ in range(2)],
+            state_is_tuple=True)(x, m_good)
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run(ml, {
-            x.name: np.array([[1., 1.]]),
-            m_good[0].name: np.array([[0.1, 0.1]]),
-            m_good[1].name: np.array([[0.1, 0.1]])
-        })
+        res = sess.run(
+            ml, {
+                x.name: np.array([[1., 1.]]),
+                m_good[0].name: np.array([[0.1, 0.1]]),
+                m_good[1].name: np.array([[0.1, 0.1]])
+            })
 
         # The numbers in results were not calculated, this is just a
         # smoke test.  However, these numbers should match those of
@@ -1488,24 +1485,20 @@ class BenchmarkLSTMCellXLA(test.Benchmark):
     num_layers = 3
     max_time = 50
     print("benchmarkDynamicRNNWithMultiLSTMCell")
-    print("\t" +
-          "\t".join(["inter_th", "intra_th",
-                     "batch_size", "num_units", "input_depth", "device",
-                     "compiled", "wall_time"]))
+    print("\t" + "\t".join([
+        "inter_th", "intra_th", "batch_size", "num_units", "input_depth",
+        "device", "compiled", "wall_time"
+    ]))
 
     warmup_run = True
-    for (threads,
-         device,
-         num_units,
-         batch_size,
-         input_depth,
-         compiled) in itertools.product(
-             [{"inter": 0, "intra": 0}, {"inter": 1, "intra": 4}],
-             ["cpu", "gpu"],
-             [32, 512],
-             [1, 32, 256],
-             [32, 512],
-             [False, True]):
+    for (threads, device, num_units, batch_size, input_depth,
+         compiled) in itertools.product([{
+             "inter": 0,
+             "intra": 0
+         }, {
+             "inter": 1,
+             "intra": 4
+         }], ["cpu", "gpu"], [32, 512], [1, 32, 256], [32, 512], [False, True]):
       if threads["inter"] != 0:
         # We only care about testing inter/intra op limitations on
         # CPU with small batch size, to mimic embedded devices.
@@ -1521,31 +1514,131 @@ class BenchmarkLSTMCellXLA(test.Benchmark):
       with session.Session(config=config, graph=ops.Graph()) as sess:
         with ops.device("/%s:0" % device):
           ops_dict = _create_multi_lstm_cell_ops(
-              batch_size=batch_size, num_units=num_units,
-              input_depth=input_depth, num_layers=num_layers,
+              batch_size=batch_size,
+              num_units=num_units,
+              input_depth=input_depth,
+              num_layers=num_layers,
               max_time=max_time,
               compiled=compiled)
         sess.run([variables.global_variables_initializer()])
         all_ops = nest.flatten(ops_dict.values())
         all_ops_group = control_flow_ops.group(*all_ops)
-        name_suffix = (
-            "inter_th_%d_intra_th_%d_bs_%d_units_%d_inputdepth_%d"
-            "_device_%s_xla_%s" % (
-                threads["inter"], threads["intra"],
-                batch_size, num_units, input_depth, device, compiled))
+        name_suffix = ("inter_th_%d_intra_th_%d_bs_%d_units_%d_inputdepth_%d"
+                       "_device_%s_xla_%s" %
+                       (threads["inter"], threads["intra"], batch_size,
+                        num_units, input_depth, device, compiled))
         if warmup_run:
           self.run_op_benchmark(
               sess, all_ops_group, min_iters=30, name="ignore_warmup")
           warmup_run = False
         benchmark_results = self.run_op_benchmark(
-            sess, all_ops_group, min_iters=50,
+            sess,
+            all_ops_group,
+            min_iters=50,
             name="benchmarkDynamicRNNWithMultiLSTMCell_%s" % name_suffix)
-        print("\t" +
-              "\t".join(["%s" % x for x in [
-                  threads["inter"], threads["intra"],
-                  batch_size, num_units, input_depth, device, compiled,
-                  benchmark_results["wall_time"]]]))
+        print("\t" + "\t".join([
+            "%s" % x
+            for x in [
+                threads["inter"], threads["intra"], batch_size, num_units,
+                input_depth, device, compiled, benchmark_results["wall_time"]
+            ]
+        ]))
+
+
+class WeightNormLSTMCellTest(test.TestCase):
+  """Compared cell output with pre-calculated values."""
+
+  def _cell_output(self, cell):
+    """Calculate cell output"""
+
+    with self.test_session() as sess:
+      init = init_ops.constant_initializer(0.5)
+      with variable_scope.variable_scope("root",
+                                         initializer=init):
+        x = array_ops.zeros([1, 2])
+        c0 = array_ops.zeros([1, 2])
+        h0 = array_ops.zeros([1, 2])
+
+        state0 = rnn_cell.LSTMStateTuple(c0, h0)
+
+        xout, sout = cell()(x, state0)
+
+      sess.run([variables.global_variables_initializer()])
+      res = sess.run([xout, sout], {
+          x.name: np.array([[1., 1.]]),
+          c0.name: 0.1 * np.asarray([[0, 1]]),
+          h0.name: 0.1 * np.asarray([[2, 3]]),
+      })
+
+    actual_state_c = res[1].c
+    actual_state_h = res[1].h
+
+    return actual_state_c, actual_state_h
+
+  def testBasicCell(self):
+    """Tests cell w/o peepholes and w/o normalisation"""
+
+    def cell():
+      return contrib_rnn_cell.WeightNormLSTMCell(2,
+                                                 norm=False,
+                                                 use_peepholes=False)
+
+    actual_c, actual_h = self._cell_output(cell)
+
+    expected_c = np.array([[0.65937078, 0.74983585]])
+    expected_h = np.array([[0.44923624, 0.49362513]])
+
+    self.assertAllClose(expected_c, actual_c, 1e-5)
+    self.assertAllClose(expected_h, actual_h, 1e-5)
+
+  def testNonbasicCell(self):
+    """Tests cell with peepholes and w/o normalisation"""
+
+    def cell():
+      return contrib_rnn_cell.WeightNormLSTMCell(2,
+                                                 norm=False,
+                                                 use_peepholes=True)
+
+    actual_c, actual_h = self._cell_output(cell)
+
+    expected_c = np.array([[0.65937084, 0.7574988]])
+    expected_h = np.array([[0.4792085, 0.53470564]])
+
+    self.assertAllClose(expected_c, actual_c, 1e-5)
+    self.assertAllClose(expected_h, actual_h, 1e-5)
+
+
+  def testBasicCellWithNorm(self):
+    """Tests cell w/o peepholes and with normalisation"""
+
+    def cell():
+      return contrib_rnn_cell.WeightNormLSTMCell(2,
+                                                 norm=True,
+                                                 use_peepholes=False)
+
+    actual_c, actual_h = self._cell_output(cell)
+
+    expected_c = np.array([[0.50125383, 0.58805949]])
+    expected_h = np.array([[0.32770363, 0.37397948]])
+
+    self.assertAllClose(expected_c, actual_c, 1e-5)
+    self.assertAllClose(expected_h, actual_h, 1e-5)
+
+  def testNonBasicCellWithNorm(self):
+    """Tests cell with peepholes and with normalisation"""
+
+    def cell():
+      return contrib_rnn_cell.WeightNormLSTMCell(2,
+                                                 norm=True,
+                                                 use_peepholes=True)
+
+    actual_c, actual_h = self._cell_output(cell)
+
+    expected_c = np.array([[0.50125383, 0.59587258]])
+    expected_h = np.array([[0.35041603, 0.40873795]])
 
+    self.assertAllClose(expected_c, actual_c, 1e-5)
+    self.assertAllClose(expected_h, actual_h, 1e-5)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/rnn/python/ops/gru_ops.py b/tensorflow/contrib/rnn/python/ops/gru_ops.py
index 75536e3f5f8cbe44231f19d4d455537e654f7a08..81ca12317be484ba420b7bbfac822e91d6d38bff 100644
--- a/tensorflow/contrib/rnn/python/ops/gru_ops.py
+++ b/tensorflow/contrib/rnn/python/ops/gru_ops.py
@@ -20,18 +20,20 @@ from __future__ import print_function
 from tensorflow.contrib.rnn.ops import gen_gru_ops
 from tensorflow.contrib.util import loader
 from tensorflow.python.framework import ops
+from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import rnn_cell_impl
-from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.util.deprecation import deprecated_args
 
 _gru_ops_so = loader.load_op_library(
     resource_loader.get_path_to_datafile("_gru_ops.so"))
 
+LayerRNNCell = rnn_cell_impl.LayerRNNCell  # pylint: disable=invalid-name
+
 
 @ops.RegisterGradient("GRUBlockCell")
 def _GRUBlockCellGrad(op, *grad):
@@ -95,7 +97,7 @@ def _GRUBlockCellGrad(op, *grad):
   return d_x, d_h_prev, d_w_ru, d_w_c, d_b_ru, d_b_c
 
 
-class GRUBlockCell(rnn_cell_impl.RNNCell):
+class GRUBlockCell(LayerRNNCell):
   r"""Block GRU cell implementation.
 
   Deprecated: use GRUBlockCellV2 instead.
@@ -132,22 +134,37 @@ class GRUBlockCell(rnn_cell_impl.RNNCell):
 
   @deprecated_args(None, "cell_size is deprecated, use num_units instead",
                    "cell_size")
-  def __init__(self, num_units=None, cell_size=None):
+  def __init__(self,
+               num_units=None,
+               cell_size=None,
+               reuse=None,
+               name="gru_cell"):
     """Initialize the Block GRU cell.
 
     Args:
       num_units: int, The number of units in the GRU cell.
       cell_size: int, The old (deprecated) name for `num_units`.
+      reuse: (optional) boolean describing whether to reuse variables in an
+        existing scope.  If not `True`, and the existing scope already has the
+        given variables, an error is raised.
+      name: String, the name of the layer. Layers with the same name will
+        share weights, but to avoid mistakes we require reuse=True in such
+        cases.  By default this is "lstm_cell", for variable-name compatibility
+        with `tf.nn.rnn_cell.GRUCell`.
 
     Raises:
       ValueError: if both cell_size and num_units are not None;
         or both are None.
     """
+    super(GRUBlockCell, self).__init__(_reuse=reuse, name=name)
     if (cell_size is None) == (num_units is None):
-      raise ValueError("Exactly one of num_units or cell_size must be provided.")
+      raise ValueError(
+          "Exactly one of num_units or cell_size must be provided.")
     if num_units is None:
       num_units = cell_size
     self._cell_size = num_units
+    # Inputs must be 2-dimensional.
+    self.input_spec = base_layer.InputSpec(ndim=2)
 
   @property
   def state_size(self):
@@ -157,40 +174,43 @@ class GRUBlockCell(rnn_cell_impl.RNNCell):
   def output_size(self):
     return self._cell_size
 
-  def __call__(self, x, h_prev, scope=None):
+  def build(self, input_shape):
+    # Check if the input size exist.
+    input_size = input_shape[1].value
+    if input_size is None:
+      raise ValueError("Expecting input_size to be set.")
+
+    self._gate_kernel = self.add_variable(
+        "w_ru", [input_size + self._cell_size, self._cell_size * 2])
+    self._gate_bias = self.add_variable(
+        "b_ru", [self._cell_size * 2],
+        initializer=init_ops.constant_initializer(1.0))
+    self._candidate_kernel = self.add_variable(
+        "w_c", [input_size + self._cell_size, self._cell_size])
+    self._candidate_bias = self.add_variable(
+        "b_c", [self._cell_size],
+        initializer=init_ops.constant_initializer(0.0))
+
+    self.built = True
+
+  def call(self, inputs, h_prev):
     """GRU cell."""
-    with vs.variable_scope(scope or type(self).__name__):
-      input_size = x.get_shape().with_rank(2)[1]
-
-      # Check if the input size exist.
-      if input_size is None:
-        raise ValueError("Expecting input_size to be set.")
-
-      # Check cell_size == state_size from h_prev.
-      cell_size = h_prev.get_shape().with_rank(2)[1]
-      if cell_size != self._cell_size:
-        raise ValueError("Shape of h_prev[1] incorrect: cell_size %i vs %s" %
-                         (self._cell_size, cell_size))
-
-      if cell_size is None:
-        raise ValueError("cell_size from `h_prev` should not be None.")
-
-      w_ru = vs.get_variable("w_ru", [input_size + self._cell_size,
-                                      self._cell_size * 2])
-      b_ru = vs.get_variable(
-          "b_ru", [self._cell_size * 2],
-          initializer=init_ops.constant_initializer(1.0))
-      w_c = vs.get_variable("w_c",
-                            [input_size + self._cell_size, self._cell_size])
-      b_c = vs.get_variable(
-          "b_c", [self._cell_size],
-          initializer=init_ops.constant_initializer(0.0))
+    # Check cell_size == state_size from h_prev.
+    cell_size = h_prev.get_shape().with_rank(2)[1]
+    if cell_size != self._cell_size:
+      raise ValueError("Shape of h_prev[1] incorrect: cell_size %i vs %s" %
+                       (self._cell_size, cell_size))
 
-      _gru_block_cell = gen_gru_ops.gru_block_cell  # pylint: disable=invalid-name
-      _, _, _, new_h = _gru_block_cell(
-          x=x, h_prev=h_prev, w_ru=w_ru, w_c=w_c, b_ru=b_ru, b_c=b_c)
+    _gru_block_cell = gen_gru_ops.gru_block_cell  # pylint: disable=invalid-name
+    _, _, _, new_h = _gru_block_cell(
+        x=inputs,
+        h_prev=h_prev,
+        w_ru=self._gate_kernel,
+        w_c=self._candidate_kernel,
+        b_ru=self._gate_bias,
+        b_c=self._candidate_bias)
 
-      return new_h, new_h
+    return new_h, new_h
 
 
 class GRUBlockCellV2(GRUBlockCell):
@@ -199,39 +219,21 @@ class GRUBlockCellV2(GRUBlockCell):
   Only differs from GRUBlockCell by variable names.
   """
 
-  def __call__(self, x, h_prev, scope=None):
+  def build(self, input_shape):
     """GRU cell."""
-    with vs.variable_scope(scope or type(self).__name__):
-      input_size = x.get_shape().with_rank(2)[1]
-
-      # Check if the input size exist.
-      if input_size is None:
-        raise ValueError("Expecting input_size to be set.")
-
-      # Check cell_size == state_size from h_prev.
-      cell_size = h_prev.get_shape().with_rank(2)[1]
-      if cell_size != self._cell_size:
-        raise ValueError("Shape of h_prev[1] incorrect: cell_size %i vs %s" %
-                         (self._cell_size, cell_size))
-
-      if cell_size is None:
-        raise ValueError("cell_size from `h_prev` should not be None.")
-
-      with vs.variable_scope("gates"):
-        w_ru = vs.get_variable("kernel", [input_size + self._cell_size,
-                                          self._cell_size * 2])
-        b_ru = vs.get_variable(
-            "bias", [self._cell_size * 2],
-            initializer=init_ops.constant_initializer(1.0))
-      with vs.variable_scope("candidate"):
-        w_c = vs.get_variable("kernel",
-                              [input_size + self._cell_size, self._cell_size])
-        b_c = vs.get_variable(
-            "bias", [self._cell_size],
-            initializer=init_ops.constant_initializer(0.0))
-
-      _gru_block_cell = gen_gru_ops.gru_block_cell  # pylint: disable=invalid-name
-      _, _, _, new_h = _gru_block_cell(
-          x=x, h_prev=h_prev, w_ru=w_ru, w_c=w_c, b_ru=b_ru, b_c=b_c)
-
-      return new_h, new_h
+    input_size = input_shape[1].value
+    if input_size is None:
+      raise ValueError("Expecting input_size to be set.")
+
+    self._gate_kernel = self.add_variable(
+        "gates/kernel", [input_size + self._cell_size, self._cell_size * 2])
+    self._gate_bias = self.add_variable(
+        "gates/bias", [self._cell_size * 2],
+        initializer=init_ops.constant_initializer(1.0))
+    self._candidate_kernel = self.add_variable(
+        "candidate/kernel", [input_size + self._cell_size, self._cell_size])
+    self._candidate_bias = self.add_variable(
+        "candidate/bias", [self._cell_size],
+        initializer=init_ops.constant_initializer(0.0))
+
+    self.built = True
diff --git a/tensorflow/contrib/rnn/python/ops/lstm_ops.py b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
index df910a3423083972bdee42bec10733e37b8e5f96..f7007173943c99d08791c125b906d4befe6387ea 100644
--- a/tensorflow/contrib/rnn/python/ops/lstm_ops.py
+++ b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
@@ -20,21 +20,22 @@ from __future__ import print_function
 import abc
 
 from tensorflow.contrib.rnn.ops import gen_lstm_ops
-from tensorflow.contrib.rnn.python.ops import fused_rnn_cell
 from tensorflow.contrib.util import loader
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import rnn_cell_impl
-from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import resource_loader
 
 _lstm_ops_so = loader.load_op_library(
     resource_loader.get_path_to_datafile("_lstm_ops.so"))
 
+LayerRNNCell = rnn_cell_impl.LayerRNNCell  # pylint: disable=invalid-name
+
 
 # pylint: disable=invalid-name
 def _lstm_block_cell(x,
@@ -327,7 +328,7 @@ def _BlockLSTMGrad(op, *grad):
   ]
 
 
-class LSTMBlockCell(rnn_cell_impl.RNNCell):
+class LSTMBlockCell(LayerRNNCell):
   """Basic LSTM recurrent network cell.
 
   The implementation is based on: http://arxiv.org/abs/1409.2329.
@@ -345,7 +346,8 @@ class LSTMBlockCell(rnn_cell_impl.RNNCell):
                forget_bias=1.0,
                cell_clip=None,
                use_peephole=False,
-               reuse=None):
+               reuse=None,
+               name="lstm_cell"):
     """Initialize the basic LSTM cell.
 
     Args:
@@ -356,11 +358,15 @@ class LSTMBlockCell(rnn_cell_impl.RNNCell):
       reuse: (optional) boolean describing whether to reuse variables in an
         existing scope.  If not `True`, and the existing scope already has the
         given variables, an error is raised.
+      name: String, the name of the layer. Layers with the same name will
+        share weights, but to avoid mistakes we require reuse=True in such
+        cases.  By default this is "lstm_cell", for variable-name compatibility
+        with `tf.nn.rnn_cell.LSTMCell`.
 
       When restoring from CudnnLSTM-trained checkpoints, must use
       CudnnCompatibleLSTMBlockCell instead.
     """
-    super(LSTMBlockCell, self).__init__(_reuse=reuse)
+    super(LSTMBlockCell, self).__init__(_reuse=reuse, name=name)
     self._num_units = num_units
     self._forget_bias = forget_bias
     self._use_peephole = use_peephole
@@ -373,6 +379,8 @@ class LSTMBlockCell(rnn_cell_impl.RNNCell):
         "wco": "w_o_diag",
         "scope": "lstm_cell"
     }
+    # Inputs must be 2-dimensional.
+    self.input_spec = base_layer.InputSpec(ndim=2)
 
   @property
   def state_size(self):
@@ -382,45 +390,54 @@ class LSTMBlockCell(rnn_cell_impl.RNNCell):
   def output_size(self):
     return self._num_units
 
-  def __call__(self, x, states_prev, scope=None):
+  def build(self, inputs_shape):
+    if not inputs_shape[1].value:
+      raise ValueError(
+          "Expecting inputs_shape[1] to be set: %s" % str(inputs_shape))
+    input_size = inputs_shape[1].value
+    self._kernel = self.add_variable(
+        self._names["W"], [input_size + self._num_units, self._num_units * 4])
+    self._bias = self.add_variable(
+        self._names["b"], [self._num_units * 4],
+        initializer=init_ops.constant_initializer(0.0))
+    if self._use_peephole:
+      self._w_i_diag = self.add_variable(self._names["wci"], [self._num_units])
+      self._w_f_diag = self.add_variable(self._names["wcf"], [self._num_units])
+      self._w_o_diag = self.add_variable(self._names["wco"], [self._num_units])
+
+    self.built = True
+
+  def call(self, inputs, state):
     """Long short-term memory cell (LSTM)."""
-    with vs.variable_scope(scope or self._names["scope"]):
-      x_shape = x.get_shape().with_rank(2)
-      if not x_shape[1].value:
-        raise ValueError("Expecting x_shape[1] to be set: %s" % str(x_shape))
-      if len(states_prev) != 2:
-        raise ValueError("Expecting states_prev to be a tuple with length 2.")
-      input_size = x_shape[1].value
-      w = vs.get_variable(self._names["W"], [input_size + self._num_units,
-                                             self._num_units * 4])
-      b = vs.get_variable(
-          self._names["b"], [w.get_shape().with_rank(2)[1].value],
-          initializer=init_ops.constant_initializer(0.0))
-      if self._use_peephole:
-        wci = vs.get_variable(self._names["wci"], [self._num_units])
-        wcf = vs.get_variable(self._names["wcf"], [self._num_units])
-        wco = vs.get_variable(self._names["wco"], [self._num_units])
-      else:
-        wci = wcf = wco = array_ops.zeros([self._num_units])
-      (cs_prev, h_prev) = states_prev
-      (_, cs, _, _, _, _, h) = _lstm_block_cell(
-          x,
-          cs_prev,
-          h_prev,
-          w,
-          b,
-          wci=wci,
-          wcf=wcf,
-          wco=wco,
-          forget_bias=self._forget_bias,
-          cell_clip=self._cell_clip,
-          use_peephole=self._use_peephole)
-
-      new_state = rnn_cell_impl.LSTMStateTuple(cs, h)
-      return h, new_state
-
-
-class LSTMBlockWrapper(fused_rnn_cell.FusedRNNCell):
+    if len(state) != 2:
+      raise ValueError("Expecting state to be a tuple with length 2.")
+
+    if self._use_peephole:
+      wci = self._w_i_diag
+      wcf = self._w_f_diag
+      wco = self._w_o_diag
+    else:
+      wci = wcf = wco = array_ops.zeros([self._num_units])
+
+    (cs_prev, h_prev) = state
+    (_, cs, _, _, _, _, h) = _lstm_block_cell(
+        inputs,
+        cs_prev,
+        h_prev,
+        self._kernel,
+        self._bias,
+        wci=wci,
+        wcf=wcf,
+        wco=wco,
+        forget_bias=self._forget_bias,
+        cell_clip=self._cell_clip,
+        use_peephole=self._use_peephole)
+
+    new_state = rnn_cell_impl.LSTMStateTuple(cs, h)
+    return h, new_state
+
+
+class LSTMBlockWrapper(base_layer.Layer):
   """This is a helper class that provides housekeeping for LSTM cells.
 
   This may be useful for alternative LSTM and similar type of cells.
@@ -459,12 +476,7 @@ class LSTMBlockWrapper(fused_rnn_cell.FusedRNNCell):
     """
     pass
 
-  def __call__(self,
-               inputs,
-               initial_state=None,
-               dtype=None,
-               sequence_length=None,
-               scope=None):
+  def call(self, inputs, initial_state=None, dtype=None, sequence_length=None):
     """Run this LSTM on inputs, starting from the given state.
 
     Args:
@@ -480,7 +492,6 @@ class LSTMBlockWrapper(fused_rnn_cell.FusedRNNCell):
         `int32` or `int64` vector (tensor) size `[batch_size]`, values in `[0,
         time_len).`
         Defaults to `time_len` for each element.
-      scope: `VariableScope` for the created subgraph; defaults to class name.
 
     Returns:
       A pair containing:
@@ -493,75 +504,71 @@ class LSTMBlockWrapper(fused_rnn_cell.FusedRNNCell):
     Raises:
       ValueError: in case of shape mismatches
     """
-    with vs.variable_scope(scope or "lstm_block_wrapper"):
-      is_list = isinstance(inputs, list)
-      if is_list:
-        inputs = array_ops.stack(inputs)
-      inputs_shape = inputs.get_shape().with_rank(3)
-      if not inputs_shape[2]:
-        raise ValueError("Expecting inputs_shape[2] to be set: %s" %
-                         inputs_shape)
-      batch_size = inputs_shape[1].value
-      if batch_size is None:
-        batch_size = array_ops.shape(inputs)[1]
-      time_len = inputs_shape[0].value
-      if time_len is None:
-        time_len = array_ops.shape(inputs)[0]
-
-      # Provide default values for initial_state and dtype
-      if initial_state is None:
-        if dtype is None:
-          raise ValueError(
-              "Either initial_state or dtype needs to be specified")
-        z = array_ops.zeros(
-            array_ops.stack([batch_size, self.num_units]), dtype=dtype)
-        initial_state = z, z
-      else:
-        if len(initial_state) != 2:
-          raise ValueError(
-              "Expecting initial_state to be a tuple with length 2 or None")
-        if dtype is None:
-          dtype = initial_state[0].dtype
-
-      # create the actual cell
-      if sequence_length is not None:
-        sequence_length = ops.convert_to_tensor(sequence_length)
-      initial_cell_state, initial_output = initial_state  # pylint: disable=unpacking-non-sequence
-      cell_states, outputs = self._call_cell(inputs, initial_cell_state,
-                                             initial_output, dtype,
-                                             sequence_length)
-
-      if sequence_length is not None:
-        # Mask out the part beyond sequence_length
-        mask = array_ops.transpose(
-            array_ops.sequence_mask(
-                sequence_length, time_len, dtype=dtype), [1, 0])
-        mask = array_ops.tile(
-            array_ops.expand_dims(mask, [-1]), [1, 1, self.num_units])
-        outputs *= mask
-        # Prepend initial states to cell_states and outputs for indexing to work
-        # correctly,since we want to access the last valid state at
-        # sequence_length - 1, which can even be -1, corresponding to the
-        # initial state.
-        mod_cell_states = array_ops.concat(
-            [array_ops.expand_dims(initial_cell_state, [0]), cell_states], 0)
-        mod_outputs = array_ops.concat(
-            [array_ops.expand_dims(initial_output, [0]), outputs], 0)
-        final_cell_state = self._gather_states(mod_cell_states, sequence_length,
-                                               batch_size)
-        final_output = self._gather_states(mod_outputs, sequence_length,
-                                           batch_size)
-      else:
-        # No sequence_lengths used: final state is the last state
-        final_cell_state = cell_states[-1]
-        final_output = outputs[-1]
-
-      if is_list:
-        # Input was a list, so return a list
-        outputs = array_ops.unstack(outputs)
-
-      final_state = rnn_cell_impl.LSTMStateTuple(final_cell_state, final_output)
-      return outputs, final_state
+    is_list = isinstance(inputs, list)
+    if is_list:
+      inputs = array_ops.stack(inputs)
+    inputs_shape = inputs.get_shape().with_rank(3)
+    if not inputs_shape[2]:
+      raise ValueError("Expecting inputs_shape[2] to be set: %s" % inputs_shape)
+    batch_size = inputs_shape[1].value
+    if batch_size is None:
+      batch_size = array_ops.shape(inputs)[1]
+    time_len = inputs_shape[0].value
+    if time_len is None:
+      time_len = array_ops.shape(inputs)[0]
+
+    # Provide default values for initial_state and dtype
+    if initial_state is None:
+      if dtype is None:
+        raise ValueError("Either initial_state or dtype needs to be specified")
+      z = array_ops.zeros(
+          array_ops.stack([batch_size, self.num_units]), dtype=dtype)
+      initial_state = z, z
+    else:
+      if len(initial_state) != 2:
+        raise ValueError(
+            "Expecting initial_state to be a tuple with length 2 or None")
+      if dtype is None:
+        dtype = initial_state[0].dtype
+
+    # create the actual cell
+    if sequence_length is not None:
+      sequence_length = ops.convert_to_tensor(sequence_length)
+    initial_cell_state, initial_output = initial_state  # pylint: disable=unpacking-non-sequence
+    cell_states, outputs = self._call_cell(
+        inputs, initial_cell_state, initial_output, dtype, sequence_length)
+
+    if sequence_length is not None:
+      # Mask out the part beyond sequence_length
+      mask = array_ops.transpose(
+          array_ops.sequence_mask(sequence_length, time_len, dtype=dtype),
+          [1, 0])
+      mask = array_ops.tile(
+          array_ops.expand_dims(mask, [-1]), [1, 1, self.num_units])
+      outputs *= mask
+      # Prepend initial states to cell_states and outputs for indexing to work
+      # correctly,since we want to access the last valid state at
+      # sequence_length - 1, which can even be -1, corresponding to the
+      # initial state.
+      mod_cell_states = array_ops.concat(
+          [array_ops.expand_dims(initial_cell_state, [0]), cell_states], 0)
+      mod_outputs = array_ops.concat(
+          [array_ops.expand_dims(initial_output, [0]), outputs], 0)
+      final_cell_state = self._gather_states(mod_cell_states, sequence_length,
+                                             batch_size)
+      final_output = self._gather_states(mod_outputs, sequence_length,
+                                         batch_size)
+    else:
+      # No sequence_lengths used: final state is the last state
+      final_cell_state = cell_states[-1]
+      final_output = outputs[-1]
+
+    if is_list:
+      # Input was a list, so return a list
+      outputs = array_ops.unstack(outputs)
+
+    final_state = rnn_cell_impl.LSTMStateTuple(final_cell_state, final_output)
+    return outputs, final_state
 
   def _gather_states(self, data, indices, batch_size):
     """Produce `out`, s.t. out(i, j) = data(indices(i), i, j)."""
@@ -589,7 +596,9 @@ class LSTMBlockFusedCell(LSTMBlockWrapper):
                num_units,
                forget_bias=1.0,
                cell_clip=None,
-               use_peephole=False):
+               use_peephole=False,
+               reuse=None,
+               name="lstm_fused_cell"):
     """Initialize the LSTM cell.
 
     Args:
@@ -597,19 +606,48 @@ class LSTMBlockFusedCell(LSTMBlockWrapper):
       forget_bias: float, The bias added to forget gates (see above).
       cell_clip: clip the cell to this value. Default is no cell clipping.
       use_peephole: Whether to use peephole connections or not.
+      reuse: (optional) boolean describing whether to reuse variables in an
+        existing scope.  If not `True`, and the existing scope already has the
+        given variables, an error is raised.
+      name: String, the name of the layer. Layers with the same name will
+        share weights, but to avoid mistakes we require reuse=True in such
+        cases.  By default this is "lstm_cell", for variable-name compatibility
+        with `tf.nn.rnn_cell.LSTMCell`.
     """
+    super(LSTMBlockFusedCell, self).__init__(_reuse=reuse, name=name)
     self._num_units = num_units
     self._forget_bias = forget_bias
     self._cell_clip = cell_clip if cell_clip is not None else -1
     self._use_peephole = use_peephole
 
+    # Inputs must be 3-dimensional.
+    self.input_spec = base_layer.InputSpec(ndim=3)
+
   @property
   def num_units(self):
     """Number of units in this cell (output dimension)."""
     return self._num_units
 
-  def _call_cell(self, inputs, initial_cell_state, initial_output, dtype,
-                 sequence_length):
+  def build(self, input_shape):
+    input_size = input_shape[2].value
+    self._kernel = self.add_variable(
+        "kernel", [input_size + self._num_units, self._num_units * 4])
+    self._bias = self.add_variable(
+        "bias", [self._num_units * 4],
+        initializer=init_ops.constant_initializer(0.0))
+    if self._use_peephole:
+      self._w_i_diag = self.add_variable("w_i_diag", [self._num_units])
+      self._w_f_diag = self.add_variable("w_f_diag", [self._num_units])
+      self._w_o_diag = self.add_variable("w_o_diag", [self._num_units])
+
+    self.built = True
+
+  def _call_cell(self,
+                 inputs,
+                 initial_cell_state=None,
+                 initial_output=None,
+                 dtype=None,
+                 sequence_length=None):
     """Run this LSTM on inputs, starting from the given state.
 
     Args:
@@ -636,18 +674,11 @@ class LSTMBlockFusedCell(LSTMBlockWrapper):
     time_len = inputs_shape[0].value
     if time_len is None:
       time_len = array_ops.shape(inputs)[0]
-    input_size = inputs_shape[2].value
-    w = vs.get_variable(
-        "kernel",
-        [input_size + self._num_units, self._num_units * 4], dtype=dtype)
-    b = vs.get_variable(
-        "bias", [w.get_shape().with_rank(2)[1]],
-        initializer=init_ops.constant_initializer(0.0),
-        dtype=dtype)
+
     if self._use_peephole:
-      wci = vs.get_variable("w_i_diag", [self._num_units], dtype=dtype)
-      wcf = vs.get_variable("w_f_diag", [self._num_units], dtype=dtype)
-      wco = vs.get_variable("w_o_diag", [self._num_units], dtype=dtype)
+      wci = self._w_i_diag
+      wco = self._w_o_diag
+      wcf = self._w_f_diag
     else:
       wci = wcf = wco = array_ops.zeros([self._num_units], dtype=dtype)
 
@@ -661,11 +692,11 @@ class LSTMBlockFusedCell(LSTMBlockWrapper):
         x=inputs,
         cs_prev=initial_cell_state,
         h_prev=initial_output,
-        w=w,
+        w=self._kernel,
         wci=wci,
         wcf=wcf,
         wco=wco,
-        b=b,
+        b=self._bias,
         forget_bias=self._forget_bias,
         cell_clip=self._cell_clip,
         use_peephole=self._use_peephole)
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 91cb04daedf07ed60ff0a2c722c108ffb783a41b..a6c2d9cdbb2b6f61d59960f708000e945c6115e9 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Module for constructing RNN Cells."""
 from __future__ import absolute_import
 from __future__ import division
@@ -28,15 +27,17 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_impl  # pylint: disable=unused-import
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import partitioned_variables  # pylint: disable=unused-import
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
@@ -54,16 +55,15 @@ def _get_concat_variable(name, shape, dtype, num_shards):
       return value
 
   concat_variable = array_ops.concat(sharded_variable, 0, name=concat_name)
-  ops.add_to_collection(ops.GraphKeys.CONCATENATED_VARIABLES,
-                        concat_variable)
+  ops.add_to_collection(ops.GraphKeys.CONCATENATED_VARIABLES, concat_variable)
   return concat_variable
 
 
 def _get_sharded_variable(name, shape, dtype, num_shards):
   """Get a list of sharded variables with the given dtype."""
   if num_shards > shape[0]:
-    raise ValueError("Too many shards: shape=%s, num_shards=%d" %
-                     (shape, num_shards))
+    raise ValueError("Too many shards: shape=%s, num_shards=%d" % (shape,
+                                                                   num_shards))
   unit_shard_size = int(math.floor(shape[0] / num_shards))
   remaining_rows = shape[0] - unit_shard_size * num_shards
 
@@ -72,8 +72,9 @@ def _get_sharded_variable(name, shape, dtype, num_shards):
     current_size = unit_shard_size
     if i < remaining_rows:
       current_size += 1
-    shards.append(vs.get_variable(name + "_%d" % i, [current_size] + shape[1:],
-                                  dtype=dtype))
+    shards.append(
+        vs.get_variable(
+            name + "_%d" % i, [current_size] + shape[1:], dtype=dtype))
   return shards
 
 
@@ -175,9 +176,8 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
     """
     super(CoupledInputForgetGateLSTMCell, self).__init__(_reuse=reuse)
     if not state_is_tuple:
-      logging.warn(
-          "%s: Using a concatenated state is slower and will soon be "
-          "deprecated.  Use state_is_tuple=True.", self)
+      logging.warn("%s: Using a concatenated state is slower and will soon be "
+                   "deprecated.  Use state_is_tuple=True.", self)
     self._num_units = num_units
     self._use_peepholes = use_peepholes
     self._initializer = initializer
@@ -194,12 +194,14 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
     self._norm_shift = norm_shift
 
     if num_proj:
-      self._state_size = (rnn_cell_impl.LSTMStateTuple(num_units, num_proj)
-                          if state_is_tuple else num_units + num_proj)
+      self._state_size = (
+          rnn_cell_impl.LSTMStateTuple(num_units, num_proj)
+          if state_is_tuple else num_units + num_proj)
       self._output_size = num_proj
     else:
-      self._state_size = (rnn_cell_impl.LSTMStateTuple(num_units, num_units)
-                          if state_is_tuple else 2 * num_units)
+      self._state_size = (
+          rnn_cell_impl.LSTMStateTuple(num_units, num_units)
+          if state_is_tuple else 2 * num_units)
       self._output_size = num_units
 
   @property
@@ -249,8 +251,8 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
     if input_size.value is None:
       raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
     concat_w = _get_concat_variable(
-        "W", [input_size.value + num_proj, 3 * self._num_units],
-        dtype, self._num_unit_shards)
+        "W", [input_size.value + num_proj, 3 * self._num_units], dtype,
+        self._num_unit_shards)
 
     b = vs.get_variable(
         "B",
@@ -297,9 +299,9 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
       m = sigmoid(o) * self._activation(c)
 
     if self._num_proj is not None:
-      concat_w_proj = _get_concat_variable(
-          "W_P", [self._num_units, self._num_proj],
-          dtype, self._num_proj_shards)
+      concat_w_proj = _get_concat_variable("W_P",
+                                           [self._num_units, self._num_proj],
+                                           dtype, self._num_proj_shards)
 
       m = math_ops.matmul(m, concat_w_proj)
       if self._proj_clip is not None:
@@ -307,8 +309,9 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
         m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
         # pylint: enable=invalid-unary-operand-type
 
-    new_state = (rnn_cell_impl.LSTMStateTuple(c, m)
-                 if self._state_is_tuple else array_ops.concat([c, m], 1))
+    new_state = (
+        rnn_cell_impl.LSTMStateTuple(c, m)
+        if self._state_is_tuple else array_ops.concat([c, m], 1))
     return m, new_state
 
 
@@ -324,10 +327,15 @@ class TimeFreqLSTMCell(rnn_cell_impl.RNNCell):
   It uses peep-hole connections and optional cell clipping.
   """
 
-  def __init__(self, num_units, use_peepholes=False,
-               cell_clip=None, initializer=None,
-               num_unit_shards=1, forget_bias=1.0,
-               feature_size=None, frequency_skip=None,
+  def __init__(self,
+               num_units,
+               use_peepholes=False,
+               cell_clip=None,
+               initializer=None,
+               num_unit_shards=1,
+               forget_bias=1.0,
+               feature_size=None,
+               frequency_skip=1,
                reuse=None):
     """Initialize the parameters for an LSTM cell.
 
@@ -397,7 +405,7 @@ class TimeFreqLSTMCell(rnn_cell_impl.RNNCell):
     actual_input_size = freq_inputs[0].get_shape().as_list()[1]
 
     concat_w = _get_concat_variable(
-        "W", [actual_input_size + 2*self._num_units, 4 * self._num_units],
+        "W", [actual_input_size + 2 * self._num_units, 4 * self._num_units],
         dtype, self._num_unit_shards)
 
     b = vs.get_variable(
@@ -416,23 +424,24 @@ class TimeFreqLSTMCell(rnn_cell_impl.RNNCell):
           "W_O_diag", shape=[self._num_units], dtype=dtype)
 
     # initialize the first freq state to be zero
-    m_prev_freq = array_ops.zeros([int(inputs.get_shape()[0]),
-                                   self._num_units], dtype)
+    m_prev_freq = array_ops.zeros(
+        [inputs.shape[0].value or inputs.get_shape()[0], self._num_units],
+        dtype)
     for fq in range(len(freq_inputs)):
-      c_prev = array_ops.slice(state, [0, 2*fq*self._num_units],
+      c_prev = array_ops.slice(state, [0, 2 * fq * self._num_units],
                                [-1, self._num_units])
-      m_prev = array_ops.slice(state, [0, (2*fq+1)*self._num_units],
+      m_prev = array_ops.slice(state, [0, (2 * fq + 1) * self._num_units],
                                [-1, self._num_units])
       # i = input_gate, j = new_input, f = forget_gate, o = output_gate
-      cell_inputs = array_ops.concat([freq_inputs[fq], m_prev, m_prev_freq],
-                                     1)
+      cell_inputs = array_ops.concat([freq_inputs[fq], m_prev, m_prev_freq], 1)
       lstm_matrix = nn_ops.bias_add(math_ops.matmul(cell_inputs, concat_w), b)
       i, j, f, o = array_ops.split(
           value=lstm_matrix, num_or_size_splits=4, axis=1)
 
       if self._use_peepholes:
-        c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
-             sigmoid(i + w_i_diag * c_prev) * tanh(j))
+        c = (
+            sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
+            sigmoid(i + w_i_diag * c_prev) * tanh(j))
       else:
         c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) * tanh(j))
 
@@ -470,11 +479,11 @@ class TimeFreqLSTMCell(rnn_cell_impl.RNNCell):
     input_size = input_feat.get_shape().with_rank(2)[-1].value
     if input_size is None:
       raise ValueError("Cannot infer input_size from static shape inference.")
-    num_feats = int((input_size - self._feature_size) / (
-        self._frequency_skip)) + 1
+    num_feats = int(
+        (input_size - self._feature_size) / (self._frequency_skip)) + 1
     freq_inputs = []
     for f in range(num_feats):
-      cur_input = array_ops.slice(input_feat, [0, f*self._frequency_skip],
+      cur_input = array_ops.slice(input_feat, [0, f * self._frequency_skip],
                                   [-1, self._feature_size])
       freq_inputs.append(cur_input)
     return freq_inputs
@@ -496,11 +505,16 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
   The code uses optional peephole connections, shared_weights and cell clipping.
   """
 
-  def __init__(self, num_units, use_peepholes=False,
+  def __init__(self,
+               num_units,
+               use_peepholes=False,
                share_time_frequency_weights=False,
-               cell_clip=None, initializer=None,
-               num_unit_shards=1, forget_bias=1.0,
-               feature_size=None, frequency_skip=None,
+               cell_clip=None,
+               initializer=None,
+               num_unit_shards=1,
+               forget_bias=1.0,
+               feature_size=None,
+               frequency_skip=None,
                num_frequency_blocks=None,
                start_freqindex_list=None,
                end_freqindex_list=None,
@@ -578,10 +592,10 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
         for freq_index in range(self._num_frequency_blocks[block_index]):
           name_prefix = "state_f%02d_b%02d" % (freq_index, block_index)
           state_names += ("%s_c, %s_m," % (name_prefix, name_prefix))
-      self._state_tuple_type = collections.namedtuple(
-          "GridLSTMStateTuple", state_names.strip(","))
-      self._state_size = self._state_tuple_type(
-          *([num_units, num_units] * self._total_blocks))
+      self._state_tuple_type = collections.namedtuple("GridLSTMStateTuple",
+                                                      state_names.strip(","))
+      self._state_size = self._state_tuple_type(*(
+          [num_units, num_units] * self._total_blocks))
     else:
       self._state_tuple_type = None
       self._state_size = num_units * self._total_blocks * 2
@@ -624,7 +638,10 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
     state_out_lst = []
     for block in range(len(freq_inputs)):
       m_out_lst_current, state_out_lst_current = self._compute(
-          freq_inputs[block], block, state, batch_size,
+          freq_inputs[block],
+          block,
+          state,
+          batch_size,
           state_is_tuple=self._state_is_tuple)
       m_out_lst.extend(m_out_lst_current)
       state_out_lst.extend(state_out_lst_current)
@@ -635,7 +652,11 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
     m_out = array_ops.concat(m_out_lst, 1)
     return m_out, state_out
 
-  def _compute(self, freq_inputs, block, state, batch_size,
+  def _compute(self,
+               freq_inputs,
+               block,
+               state,
+               batch_size,
                state_prefix="state",
                state_is_tuple=True):
     """Run the actual computation of one step LSTM.
@@ -664,8 +685,8 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
     actual_input_size = freq_inputs[0].get_shape().as_list()[1]
 
     concat_w_f = _get_concat_variable(
-        "W_f_%d" % block, [actual_input_size + 2 * self._num_units,
-                           num_gates * self._num_units],
+        "W_f_%d" % block,
+        [actual_input_size + 2 * self._num_units, num_gates * self._num_units],
         dtype, self._num_unit_shards)
     b_f = vs.get_variable(
         "B_f_%d" % block,
@@ -673,10 +694,9 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
         initializer=init_ops.zeros_initializer(),
         dtype=dtype)
     if not self._share_time_frequency_weights:
-      concat_w_t = _get_concat_variable(
-          "W_t_%d" % block, [actual_input_size + 2 * self._num_units,
-                             num_gates * self._num_units],
-          dtype, self._num_unit_shards)
+      concat_w_t = _get_concat_variable("W_t_%d" % block, [
+          actual_input_size + 2 * self._num_units, num_gates * self._num_units
+      ], dtype, self._num_unit_shards)
       b_t = vs.get_variable(
           "B_t_%d" % block,
           shape=[num_gates * self._num_units],
@@ -689,7 +709,7 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
         w_f_diag_freqf = vs.get_variable(
             "W_F_diag_freqf_%d" % block, shape=[self._num_units], dtype=dtype)
         w_f_diag_freqt = vs.get_variable(
-            "W_F_diag_freqt_%d"% block, shape=[self._num_units], dtype=dtype)
+            "W_F_diag_freqt_%d" % block, shape=[self._num_units], dtype=dtype)
       w_i_diag_freqf = vs.get_variable(
           "W_I_diag_freqf_%d" % block, shape=[self._num_units], dtype=dtype)
       w_i_diag_freqt = vs.get_variable(
@@ -723,8 +743,7 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
         m_prev_time = getattr(state, name_prefix + "_m")
       else:
         c_prev_time = array_ops.slice(
-            state, [0, 2 * freq_index * self._num_units],
-            [-1, self._num_units])
+            state, [0, 2 * freq_index * self._num_units], [-1, self._num_units])
         m_prev_time = array_ops.slice(
             state, [0, (2 * freq_index + 1) * self._num_units],
             [-1, self._num_units])
@@ -734,8 +753,8 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
           [freq_inputs[freq_index], m_prev_time, m_prev_freq], 1)
 
       # F-LSTM
-      lstm_matrix_freq = nn_ops.bias_add(math_ops.matmul(cell_inputs,
-                                                         concat_w_f), b_f)
+      lstm_matrix_freq = nn_ops.bias_add(
+          math_ops.matmul(cell_inputs, concat_w_f), b_f)
       if self._couple_input_forget_gates:
         i_freq, j_freq, o_freq = array_ops.split(
             value=lstm_matrix_freq, num_or_size_splits=num_gates, axis=1)
@@ -750,8 +769,8 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
         f_time = f_freq
         o_time = o_freq
       else:
-        lstm_matrix_time = nn_ops.bias_add(math_ops.matmul(cell_inputs,
-                                                           concat_w_t), b_t)
+        lstm_matrix_time = nn_ops.bias_add(
+            math_ops.matmul(cell_inputs, concat_w_t), b_t)
         if self._couple_input_forget_gates:
           i_time, j_time, o_time = array_ops.split(
               value=lstm_matrix_time, num_or_size_splits=num_gates, axis=1)
@@ -763,8 +782,7 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
       # F-LSTM c_freq
       # input gate activations
       if self._use_peepholes:
-        i_freq_g = sigmoid(i_freq +
-                           w_i_diag_freqf * c_prev_freq +
+        i_freq_g = sigmoid(i_freq + w_i_diag_freqf * c_prev_freq +
                            w_i_diag_freqt * c_prev_time)
       else:
         i_freq_g = sigmoid(i_freq)
@@ -773,9 +791,8 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
         f_freq_g = 1.0 - i_freq_g
       else:
         if self._use_peepholes:
-          f_freq_g = sigmoid(f_freq + self._forget_bias +
-                             w_f_diag_freqf * c_prev_freq +
-                             w_f_diag_freqt * c_prev_time)
+          f_freq_g = sigmoid(f_freq + self._forget_bias + w_f_diag_freqf *
+                             c_prev_freq + w_f_diag_freqt * c_prev_time)
         else:
           f_freq_g = sigmoid(f_freq + self._forget_bias)
       # cell state
@@ -790,12 +807,10 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
       # input gate activations
       if self._use_peepholes:
         if self._share_time_frequency_weights:
-          i_time_g = sigmoid(i_time +
-                             w_i_diag_freqf * c_prev_freq +
+          i_time_g = sigmoid(i_time + w_i_diag_freqf * c_prev_freq +
                              w_i_diag_freqt * c_prev_time)
         else:
-          i_time_g = sigmoid(i_time +
-                             w_i_diag_timef * c_prev_freq +
+          i_time_g = sigmoid(i_time + w_i_diag_timef * c_prev_freq +
                              w_i_diag_timet * c_prev_time)
       else:
         i_time_g = sigmoid(i_time)
@@ -805,13 +820,11 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
       else:
         if self._use_peepholes:
           if self._share_time_frequency_weights:
-            f_time_g = sigmoid(f_time + self._forget_bias +
-                               w_f_diag_freqf * c_prev_freq +
-                               w_f_diag_freqt * c_prev_time)
+            f_time_g = sigmoid(f_time + self._forget_bias + w_f_diag_freqf *
+                               c_prev_freq + w_f_diag_freqt * c_prev_time)
           else:
-            f_time_g = sigmoid(f_time + self._forget_bias +
-                               w_f_diag_timef * c_prev_freq +
-                               w_f_diag_timet * c_prev_time)
+            f_time_g = sigmoid(f_time + self._forget_bias + w_f_diag_timef *
+                               c_prev_freq + w_f_diag_timet * c_prev_time)
         else:
           f_time_g = sigmoid(f_time + self._forget_bias)
       # cell state
@@ -824,8 +837,7 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
 
       # F-LSTM m_freq
       if self._use_peepholes:
-        m_freq = sigmoid(o_freq +
-                         w_o_diag_freqf * c_freq +
+        m_freq = sigmoid(o_freq + w_o_diag_freqf * c_freq +
                          w_o_diag_freqt * c_time) * tanh(c_freq)
       else:
         m_freq = sigmoid(o_freq) * tanh(c_freq)
@@ -833,12 +845,10 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
       # T-LSTM m_time
       if self._use_peepholes:
         if self._share_time_frequency_weights:
-          m_time = sigmoid(o_time +
-                           w_o_diag_freqf * c_freq +
+          m_time = sigmoid(o_time + w_o_diag_freqf * c_freq +
                            w_o_diag_freqt * c_time) * tanh(c_time)
         else:
-          m_time = sigmoid(o_time +
-                           w_o_diag_timef * c_freq +
+          m_time = sigmoid(o_time + w_o_diag_timef * c_freq +
                            w_o_diag_timet * c_time) * tanh(c_time)
       else:
         m_time = sigmoid(o_time) * tanh(c_time)
@@ -877,16 +887,18 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
       raise ValueError("Cannot infer input_size from static shape inference.")
     if slice_offset > 0:
       # Padding to the end
-      inputs = array_ops.pad(
-          input_feat, array_ops.constant([0, 0, 0, slice_offset], shape=[2, 2],
-                                         dtype=dtypes.int32),
-          "CONSTANT")
+      inputs = array_ops.pad(input_feat,
+                             array_ops.constant(
+                                 [0, 0, 0, slice_offset],
+                                 shape=[2, 2],
+                                 dtype=dtypes.int32), "CONSTANT")
     elif slice_offset < 0:
       # Padding to the front
-      inputs = array_ops.pad(
-          input_feat, array_ops.constant([0, 0, -slice_offset, 0], shape=[2, 2],
-                                         dtype=dtypes.int32),
-          "CONSTANT")
+      inputs = array_ops.pad(input_feat,
+                             array_ops.constant(
+                                 [0, 0, -slice_offset, 0],
+                                 shape=[2, 2],
+                                 dtype=dtypes.int32), "CONSTANT")
       slice_offset = 0
     else:
       inputs = input_feat
@@ -896,13 +908,13 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
         raise ValueError("Length of num_frequency_blocks"
                          " is not 1, but instead is %d",
                          len(self._num_frequency_blocks))
-      num_feats = int((input_size - self._feature_size) / (
-          self._frequency_skip)) + 1
+      num_feats = int(
+          (input_size - self._feature_size) / (self._frequency_skip)) + 1
       if num_feats != self._num_frequency_blocks[0]:
         raise ValueError(
             "Invalid num_frequency_blocks, requires %d but gets %d, please"
-            " check the input size and filter config are correct." % (
-                self._num_frequency_blocks[0], num_feats))
+            " check the input size and filter config are correct." %
+            (self._num_frequency_blocks[0], num_feats))
       block_inputs = []
       for f in range(num_feats):
         cur_input = array_ops.slice(
@@ -925,18 +937,18 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
         start_index = self._start_freqindex_list[b]
         end_index = self._end_freqindex_list[b]
         cur_size = end_index - start_index
-        block_feats = int((cur_size - self._feature_size) / (
-            self._frequency_skip)) + 1
+        block_feats = int(
+            (cur_size - self._feature_size) / (self._frequency_skip)) + 1
         if block_feats != self._num_frequency_blocks[b]:
           raise ValueError(
               "Invalid num_frequency_blocks, requires %d but gets %d, please"
-              " check the input size and filter config are correct." % (
-                  self._num_frequency_blocks[b], block_feats))
+              " check the input size and filter config are correct." %
+              (self._num_frequency_blocks[b], block_feats))
         block_inputs = []
         for f in range(block_feats):
           cur_input = array_ops.slice(
-              inputs, [0, start_index + slice_offset + f *
-                       self._frequency_skip],
+              inputs,
+              [0, start_index + slice_offset + f * self._frequency_skip],
               [-1, self._feature_size])
           block_inputs.append(cur_input)
         freq_inputs.append(block_inputs)
@@ -952,11 +964,16 @@ class BidirectionalGridLSTMCell(GridLSTMCell):
   The current implementation uses different weights for the two directions.
   """
 
-  def __init__(self, num_units, use_peepholes=False,
+  def __init__(self,
+               num_units,
+               use_peepholes=False,
                share_time_frequency_weights=False,
-               cell_clip=None, initializer=None,
-               num_unit_shards=1, forget_bias=1.0,
-               feature_size=None, frequency_skip=None,
+               cell_clip=None,
+               initializer=None,
+               num_unit_shards=1,
+               forget_bias=1.0,
+               feature_size=None,
+               frequency_skip=None,
                num_frequency_blocks=None,
                start_freqindex_list=None,
                end_freqindex_list=None,
@@ -1015,8 +1032,8 @@ class BidirectionalGridLSTMCell(GridLSTMCell):
           state_names += ("%s_c, %s_m," % (name_prefix, name_prefix))
     self._state_tuple_type = collections.namedtuple(
         "BidirectionalGridLSTMStateTuple", state_names.strip(","))
-    self._state_size = self._state_tuple_type(
-        *([num_units, num_units] * self._total_blocks * 2))
+    self._state_size = self._state_tuple_type(*(
+        [num_units, num_units] * self._total_blocks * 2))
     self._output_size = 2 * num_units * self._total_blocks * 2
 
   def call(self, inputs, state):
@@ -1050,8 +1067,12 @@ class BidirectionalGridLSTMCell(GridLSTMCell):
       fwd_state_out_lst = []
       for block in range(len(fwd_inputs)):
         fwd_m_out_lst_current, fwd_state_out_lst_current = self._compute(
-            fwd_inputs[block], block, state, batch_size,
-            state_prefix="fwd_state", state_is_tuple=True)
+            fwd_inputs[block],
+            block,
+            state,
+            batch_size,
+            state_prefix="fwd_state",
+            state_is_tuple=True)
         fwd_m_out_lst.extend(fwd_m_out_lst_current)
         fwd_state_out_lst.extend(fwd_state_out_lst_current)
     # Backward processing
@@ -1062,8 +1083,12 @@ class BidirectionalGridLSTMCell(GridLSTMCell):
         # Reverse the blocks
         bwd_inputs_reverse = bwd_inputs[block][::-1]
         bwd_m_out_lst_current, bwd_state_out_lst_current = self._compute(
-            bwd_inputs_reverse, block, state, batch_size,
-            state_prefix="bwd_state", state_is_tuple=True)
+            bwd_inputs_reverse,
+            block,
+            state,
+            batch_size,
+            state_prefix="bwd_state",
+            state_is_tuple=True)
         bwd_m_out_lst.extend(bwd_m_out_lst_current)
         bwd_state_out_lst.extend(bwd_state_out_lst_current)
     state_out = self._state_tuple_type(*(fwd_state_out_lst + bwd_state_out_lst))
@@ -1074,6 +1099,7 @@ class BidirectionalGridLSTMCell(GridLSTMCell):
 
 # pylint: disable=protected-access
 _Linear = core_rnn_cell._Linear  # pylint: disable=invalid-name
+
 # pylint: enable=protected-access
 
 
@@ -1083,8 +1109,14 @@ class AttentionCellWrapper(rnn_cell_impl.RNNCell):
   Implementation based on https://arxiv.org/abs/1409.0473.
   """
 
-  def __init__(self, cell, attn_length, attn_size=None, attn_vec_size=None,
-               input_size=None, state_is_tuple=True, reuse=None):
+  def __init__(self,
+               cell,
+               attn_length,
+               attn_size=None,
+               attn_vec_size=None,
+               input_size=None,
+               state_is_tuple=True,
+               reuse=None):
     """Create a cell with attention.
 
     Args:
@@ -1114,16 +1146,15 @@ class AttentionCellWrapper(rnn_cell_impl.RNNCell):
     if not rnn_cell_impl._like_rnncell(cell):  # pylint: disable=protected-access
       raise TypeError("The parameter cell is not RNNCell.")
     if nest.is_sequence(cell.state_size) and not state_is_tuple:
-      raise ValueError("Cell returns tuple of states, but the flag "
-                       "state_is_tuple is not set. State size is: %s"
-                       % str(cell.state_size))
+      raise ValueError(
+          "Cell returns tuple of states, but the flag "
+          "state_is_tuple is not set. State size is: %s" % str(cell.state_size))
     if attn_length <= 0:
-      raise ValueError("attn_length should be greater than zero, got %s"
-                       % str(attn_length))
+      raise ValueError(
+          "attn_length should be greater than zero, got %s" % str(attn_length))
     if not state_is_tuple:
-      logging.warn(
-          "%s: Using a concatenated state is slower and will soon be "
-          "deprecated.  Use state_is_tuple=True.", self)
+      logging.warn("%s: Using a concatenated state is slower and will soon be "
+                   "deprecated.  Use state_is_tuple=True.", self)
     if attn_size is None:
       attn_size = cell.output_size
     if attn_vec_size is None:
@@ -1159,8 +1190,8 @@ class AttentionCellWrapper(rnn_cell_impl.RNNCell):
     else:
       states = state
       state = array_ops.slice(states, [0, 0], [-1, self._cell.state_size])
-      attns = array_ops.slice(
-          states, [0, self._cell.state_size], [-1, self._attn_size])
+      attns = array_ops.slice(states, [0, self._cell.state_size],
+                              [-1, self._attn_size])
       attn_states = array_ops.slice(
           states, [0, self._cell.state_size + self._attn_size],
           [-1, self._attn_size * self._attn_length])
@@ -1198,8 +1229,8 @@ class AttentionCellWrapper(rnn_cell_impl.RNNCell):
     tanh = math_ops.tanh
 
     with vs.variable_scope("attention"):
-      k = vs.get_variable(
-          "attn_w", [1, 1, self._attn_size, self._attn_vec_size])
+      k = vs.get_variable("attn_w",
+                          [1, 1, self._attn_size, self._attn_vec_size])
       v = vs.get_variable("attn_v", [self._attn_vec_size])
       hidden = array_ops.reshape(attn_states,
                                  [-1, self._attn_length, 1, self._attn_size])
@@ -1226,7 +1257,8 @@ class HighwayWrapper(rnn_cell_impl.RNNCell):
     https://arxiv.org/abs/1505.00387
   """
 
-  def __init__(self, cell,
+  def __init__(self,
+               cell,
                couple_carry_transform_gates=True,
                carry_bias_init=1.0):
     """Constructs a `HighwayWrapper` for `cell`.
@@ -1258,8 +1290,7 @@ class HighwayWrapper(rnn_cell_impl.RNNCell):
     carry_weight = vs.get_variable("carry_w", [input_size, input_size])
     carry_bias = vs.get_variable(
         "carry_b", [input_size],
-        initializer=init_ops.constant_initializer(
-            self._carry_bias_init))
+        initializer=init_ops.constant_initializer(self._carry_bias_init))
     carry = math_ops.sigmoid(nn_ops.xw_plus_b(inp, carry_weight, carry_bias))
     if self._couple_carry_transform_gates:
       transform = 1 - carry
@@ -1268,11 +1299,9 @@ class HighwayWrapper(rnn_cell_impl.RNNCell):
                                          [input_size, input_size])
       transform_bias = vs.get_variable(
           "transform_b", [input_size],
-          initializer=init_ops.constant_initializer(
-              -self._carry_bias_init))
-      transform = math_ops.sigmoid(nn_ops.xw_plus_b(inp,
-                                                    transform_weight,
-                                                    transform_bias))
+          initializer=init_ops.constant_initializer(-self._carry_bias_init))
+      transform = math_ops.sigmoid(
+          nn_ops.xw_plus_b(inp, transform_weight, transform_bias))
     return inp * carry + out * transform
 
   def __call__(self, inputs, state, scope=None):
@@ -1292,9 +1321,11 @@ class HighwayWrapper(rnn_cell_impl.RNNCell):
     """
     outputs, new_state = self._cell(inputs, state, scope=scope)
     nest.assert_same_structure(inputs, outputs)
+
     # Ensure shapes match
     def assert_shape_match(inp, out):
       inp.get_shape().assert_is_compatible_with(out.get_shape())
+
     nest.map_structure(assert_shape_match, inputs, outputs)
     res_outputs = nest.map_structure(self._highway, inputs, outputs)
     return (res_outputs, new_state)
@@ -1320,10 +1351,16 @@ class LayerNormBasicLSTMCell(rnn_cell_impl.RNNCell):
   Stanislau Semeniuta, Aliaksei Severyn, Erhardt Barth.
   """
 
-  def __init__(self, num_units, forget_bias=1.0,
-               input_size=None, activation=math_ops.tanh,
-               layer_norm=True, norm_gain=1.0, norm_shift=0.0,
-               dropout_keep_prob=1.0, dropout_prob_seed=None,
+  def __init__(self,
+               num_units,
+               forget_bias=1.0,
+               input_size=None,
+               activation=math_ops.tanh,
+               layer_norm=True,
+               norm_gain=1.0,
+               norm_shift=0.0,
+               dropout_keep_prob=1.0,
+               dropout_prob_seed=None,
                reuse=None):
     """Initializes the basic LSTM cell.
 
@@ -1408,8 +1445,8 @@ class LayerNormBasicLSTMCell(rnn_cell_impl.RNNCell):
     if (not isinstance(self._keep_prob, float)) or self._keep_prob < 1:
       g = nn_ops.dropout(g, self._keep_prob, seed=self._seed)
 
-    new_c = (c * math_ops.sigmoid(f + self._forget_bias)
-             + math_ops.sigmoid(i) * g)
+    new_c = (
+        c * math_ops.sigmoid(f + self._forget_bias) + math_ops.sigmoid(i) * g)
     if self._layer_norm:
       new_c = self._norm(new_c, "state", dtype=dtype)
     new_h = self._activation(new_c) * math_ops.sigmoid(o)
@@ -1431,8 +1468,7 @@ class NASCell(rnn_cell_impl.RNNCell):
   The class uses an optional projection layer.
   """
 
-  def __init__(self, num_units, num_proj=None,
-               use_biases=False, reuse=None):
+  def __init__(self, num_units, num_proj=None, use_biases=False, reuse=None):
     """Initialize the parameters for a NAS cell.
 
     Args:
@@ -1502,12 +1538,10 @@ class NASCell(rnn_cell_impl.RNNCell):
       raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
     # Variables for the NAS cell. W_m is all matrices multiplying the
     # hiddenstate and W_inputs is all matrices multiplying the inputs.
-    concat_w_m = vs.get_variable(
-        "recurrent_kernel", [num_proj, 8 * self._num_units],
-        dtype)
+    concat_w_m = vs.get_variable("recurrent_kernel",
+                                 [num_proj, 8 * self._num_units], dtype)
     concat_w_inputs = vs.get_variable(
-        "kernel", [input_size.value, 8 * self._num_units],
-        dtype)
+        "kernel", [input_size.value, 8 * self._num_units], dtype)
 
     m_matrix = math_ops.matmul(m_prev, concat_w_m)
     inputs_matrix = math_ops.matmul(inputs, concat_w_inputs)
@@ -1522,10 +1556,10 @@ class NASCell(rnn_cell_impl.RNNCell):
 
     # The NAS cell branches into 8 different splits for both the hiddenstate
     # and the input
-    m_matrix_splits = array_ops.split(axis=1, num_or_size_splits=8,
-                                      value=m_matrix)
-    inputs_matrix_splits = array_ops.split(axis=1, num_or_size_splits=8,
-                                           value=inputs_matrix)
+    m_matrix_splits = array_ops.split(
+        axis=1, num_or_size_splits=8, value=m_matrix)
+    inputs_matrix_splits = array_ops.split(
+        axis=1, num_or_size_splits=8, value=inputs_matrix)
 
     # First layer
     layer1_0 = sigmoid(inputs_matrix_splits[0] + m_matrix_splits[0])
@@ -1557,9 +1591,8 @@ class NASCell(rnn_cell_impl.RNNCell):
 
     # Projection layer if specified
     if self._num_proj is not None:
-      concat_w_proj = vs.get_variable(
-          "projection_weights", [self._num_units, self._num_proj],
-          dtype)
+      concat_w_proj = vs.get_variable("projection_weights",
+                                      [self._num_units, self._num_proj], dtype)
       new_m = math_ops.matmul(new_m, concat_w_proj)
 
     new_state = rnn_cell_impl.LSTMStateTuple(new_c, new_m)
@@ -1582,8 +1615,12 @@ class UGRNNCell(rnn_cell_impl.RNNCell):
   "Capacity and Trainability in Recurrent Neural Networks" Proc. ICLR 2017.
   """
 
-  def __init__(self, num_units, initializer=None, forget_bias=1.0,
-               activation=math_ops.tanh, reuse=None):
+  def __init__(self,
+               num_units,
+               initializer=None,
+               forget_bias=1.0,
+               activation=math_ops.tanh,
+               reuse=None):
     """Initialize the parameters for an UGRNN cell.
 
     Args:
@@ -1638,8 +1675,8 @@ class UGRNNCell(rnn_cell_impl.RNNCell):
     if input_size.value is None:
       raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
 
-    with vs.variable_scope(vs.get_variable_scope(),
-                           initializer=self._initializer):
+    with vs.variable_scope(
+        vs.get_variable_scope(), initializer=self._initializer):
       cell_inputs = array_ops.concat([inputs, state], 1)
       if self._linear is None:
         self._linear = _Linear(cell_inputs, 2 * self._num_units, True)
@@ -1679,9 +1716,13 @@ class IntersectionRNNCell(rnn_cell_impl.RNNCell):
   RNNs so it may not achieve best performance with depth 1.
   """
 
-  def __init__(self, num_units, num_in_proj=None,
-               initializer=None, forget_bias=1.0,
-               y_activation=nn_ops.relu, reuse=None):
+  def __init__(self,
+               num_units,
+               num_in_proj=None,
+               initializer=None,
+               forget_bias=1.0,
+               y_activation=nn_ops.relu,
+               reuse=None):
     """Initialize the parameters for an +RNN cell.
 
     Args:
@@ -1745,8 +1786,8 @@ class IntersectionRNNCell(rnn_cell_impl.RNNCell):
     if input_size.value is None:
       raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
 
-    with vs.variable_scope(vs.get_variable_scope(),
-                           initializer=self._initializer):
+    with vs.variable_scope(
+        vs.get_variable_scope(), initializer=self._initializer):
       # read-in projections (should be used for first layer in deep +RNN
       # to transform size of inputs from I --> N)
       if input_size.value != self._num_units:
@@ -1763,13 +1804,13 @@ class IntersectionRNNCell(rnn_cell_impl.RNNCell):
       n_dim = i_dim = self._num_units
       cell_inputs = array_ops.concat([inputs, state], 1)
       if self._linear2 is None:
-        self._linear2 = _Linear(cell_inputs, 2*n_dim + 2*i_dim, True)
+        self._linear2 = _Linear(cell_inputs, 2 * n_dim + 2 * i_dim, True)
       rnn_matrix = self._linear2(cell_inputs)
 
-      gh_act = rnn_matrix[:, :n_dim]                           # b x n
-      h_act = rnn_matrix[:, n_dim:2*n_dim]                     # b x n
-      gy_act = rnn_matrix[:, 2*n_dim:2*n_dim+i_dim]            # b x i
-      y_act = rnn_matrix[:, 2*n_dim+i_dim:2*n_dim+2*i_dim]     # b x i
+      gh_act = rnn_matrix[:, :n_dim]  # b x n
+      h_act = rnn_matrix[:, n_dim:2 * n_dim]  # b x n
+      gy_act = rnn_matrix[:, 2 * n_dim:2 * n_dim + i_dim]  # b x i
+      y_act = rnn_matrix[:, 2 * n_dim + i_dim:2 * n_dim + 2 * i_dim]  # b x i
 
       h = tanh(h_act)
       y = self._y_activation(y_act)
@@ -1815,6 +1856,7 @@ class CompiledWrapper(rnn_cell_impl.RNNCell):
     if self._compile_stateful:
       compile_ops = True
     else:
+
       def compile_ops(node_def):
         global _REGISTERED_OPS
         if _REGISTERED_OPS is None:
@@ -1822,13 +1864,10 @@ class CompiledWrapper(rnn_cell_impl.RNNCell):
         return not _REGISTERED_OPS[node_def.op].is_stateful
 
     with jit.experimental_jit_scope(compile_ops=compile_ops):
-      return self._cell(inputs, state, scope)
+      return self._cell(inputs, state, scope=scope)
 
 
-def _random_exp_initializer(minval,
-                            maxval,
-                            seed=None,
-                            dtype=dtypes.float32):
+def _random_exp_initializer(minval, maxval, seed=None, dtype=dtypes.float32):
   """Returns an exponential distribution initializer.
 
   Args:
@@ -1847,10 +1886,7 @@ def _random_exp_initializer(minval,
     del partition_info  # Unused.
     return math_ops.exp(
         random_ops.random_uniform(
-            shape,
-            math_ops.log(minval),
-            math_ops.log(maxval),
-            dtype,
+            shape, math_ops.log(minval), math_ops.log(maxval), dtype,
             seed=seed))
 
   return _initializer
@@ -1954,8 +1990,7 @@ class PhasedLSTMCell(rnn_cell_impl.RNNCell):
       if self._linear1 is None:
         self._linear1 = _Linear(in_mask_gates, 2 * self._num_units, True)
 
-      mask_gates = math_ops.sigmoid(
-          self._linear1(in_mask_gates))
+      mask_gates = math_ops.sigmoid(self._linear1(in_mask_gates))
       [input_gate, forget_gate] = array_ops.split(
           axis=1, num_or_size_splits=2, value=mask_gates)
 
@@ -1979,12 +2014,12 @@ class PhasedLSTMCell(rnn_cell_impl.RNNCell):
 
     period = vs.get_variable(
         "period", [self._num_units],
-        initializer=_random_exp_initializer(
-            self._period_init_min, self._period_init_max))
+        initializer=_random_exp_initializer(self._period_init_min,
+                                            self._period_init_max))
     phase = vs.get_variable(
         "phase", [self._num_units],
-        initializer=init_ops.random_uniform_initializer(
-            0., period.initial_value))
+        initializer=init_ops.random_uniform_initializer(0.,
+                                                        period.initial_value))
     ratio_on = vs.get_variable(
         "ratio_on", [self._num_units],
         initializer=init_ops.constant_initializer(self._ratio_on),
@@ -2006,6 +2041,7 @@ class PhasedLSTMCell(rnn_cell_impl.RNNCell):
 
     return new_h, new_state
 
+
 class ConvLSTMCell(rnn_cell_impl.RNNCell):
   """Convolutional LSTM recurrent network cell.
 
@@ -2039,7 +2075,7 @@ class ConvLSTMCell(rnn_cell_impl.RNNCell):
     """
     super(ConvLSTMCell, self).__init__(name=name)
 
-    if conv_ndims != len(input_shape)-1:
+    if conv_ndims != len(input_shape) - 1:
       raise ValueError("Invalid input_shape {} for conv_ndims={}.".format(
           input_shape, conv_ndims))
 
@@ -2058,8 +2094,8 @@ class ConvLSTMCell(rnn_cell_impl.RNNCell):
     state_size = tensor_shape.TensorShape(
         self._input_shape[:-1] + [self._output_channels])
     self._state_size = rnn_cell_impl.LSTMStateTuple(state_size, state_size)
-    self._output_size = tensor_shape.TensorShape(self._input_shape[:-1]
-                                                 + [self._total_output_channels])
+    self._output_size = tensor_shape.TensorShape(
+        self._input_shape[:-1] + [self._total_output_channels])
 
   @property
   def output_size(self):
@@ -2071,13 +2107,10 @@ class ConvLSTMCell(rnn_cell_impl.RNNCell):
 
   def call(self, inputs, state, scope=None):
     cell, hidden = state
-    new_hidden = _conv([inputs, hidden],
-                       self._kernel_shape,
-                       4*self._output_channels,
-                       self._use_bias)
-    gates = array_ops.split(value=new_hidden,
-                            num_or_size_splits=4,
-                            axis=self._conv_ndims+1)
+    new_hidden = _conv([inputs, hidden], self._kernel_shape,
+                       4 * self._output_channels, self._use_bias)
+    gates = array_ops.split(
+        value=new_hidden, num_or_size_splits=4, axis=self._conv_ndims + 1)
 
     input_gate, new_input, forget_gate, output_gate = gates
     new_cell = math_ops.sigmoid(forget_gate + self._forget_bias) * cell
@@ -2089,29 +2122,35 @@ class ConvLSTMCell(rnn_cell_impl.RNNCell):
     new_state = rnn_cell_impl.LSTMStateTuple(new_cell, output)
     return output, new_state
 
+
 class Conv1DLSTMCell(ConvLSTMCell):
   """1D Convolutional LSTM recurrent network cell.
 
   https://arxiv.org/pdf/1506.04214v1.pdf
   """
+
   def __init__(self, name="conv_1d_lstm_cell", **kwargs):
     """Construct Conv1DLSTM. See `ConvLSTMCell` for more details."""
     super(Conv1DLSTMCell, self).__init__(conv_ndims=1, **kwargs)
 
+
 class Conv2DLSTMCell(ConvLSTMCell):
   """2D Convolutional LSTM recurrent network cell.
 
   https://arxiv.org/pdf/1506.04214v1.pdf
   """
+
   def __init__(self, name="conv_2d_lstm_cell", **kwargs):
     """Construct Conv2DLSTM. See `ConvLSTMCell` for more details."""
     super(Conv2DLSTMCell, self).__init__(conv_ndims=2, **kwargs)
 
+
 class Conv3DLSTMCell(ConvLSTMCell):
   """3D Convolutional LSTM recurrent network cell.
 
   https://arxiv.org/pdf/1506.04214v1.pdf
   """
+
   def __init__(self, name="conv_3d_lstm_cell", **kwargs):
     """Construct Conv3DLSTM. See `ConvLSTMCell` for more details."""
     super(Conv3DLSTMCell, self).__init__(conv_ndims=3, **kwargs)
@@ -2136,7 +2175,7 @@ def _conv(args, filter_size, num_features, bias, bias_start=0.0):
   shapes = [a.get_shape().as_list() for a in args]
   shape_length = len(shapes[0])
   for shape in shapes:
-    if len(shape) not in [3,4,5]:
+    if len(shape) not in [3, 4, 5]:
       raise ValueError("Conv Linear expects 3D, 4D "
                        "or 5D arguments: %s" % str(shapes))
     if len(shape) != len(shapes[0]):
@@ -2147,40 +2186,36 @@ def _conv(args, filter_size, num_features, bias, bias_start=0.0):
   dtype = [a.dtype for a in args][0]
 
   # determine correct conv operation
-  if   shape_length == 3:
+  if shape_length == 3:
     conv_op = nn_ops.conv1d
     strides = 1
   elif shape_length == 4:
     conv_op = nn_ops.conv2d
-    strides = shape_length*[1]
+    strides = shape_length * [1]
   elif shape_length == 5:
     conv_op = nn_ops.conv3d
-    strides = shape_length*[1]
+    strides = shape_length * [1]
 
   # Now the computation.
   kernel = vs.get_variable(
-      "kernel",
-      filter_size + [total_arg_size_depth, num_features],
-      dtype=dtype)
+      "kernel", filter_size + [total_arg_size_depth, num_features], dtype=dtype)
   if len(args) == 1:
-    res = conv_op(args[0],
-                  kernel,
-                  strides,
-                  padding='SAME')
+    res = conv_op(args[0], kernel, strides, padding="SAME")
   else:
-    res = conv_op(array_ops.concat(axis=shape_length-1, values=args),
-                  kernel,
-                  strides,
-                  padding='SAME')
+    res = conv_op(
+        array_ops.concat(axis=shape_length - 1, values=args),
+        kernel,
+        strides,
+        padding="SAME")
   if not bias:
     return res
   bias_term = vs.get_variable(
       "biases", [num_features],
       dtype=dtype,
-      initializer=init_ops.constant_initializer(
-          bias_start, dtype=dtype))
+      initializer=init_ops.constant_initializer(bias_start, dtype=dtype))
   return res + bias_term
 
+
 class GLSTMCell(rnn_cell_impl.RNNCell):
   """Group LSTM cell (G-LSTM).
 
@@ -2192,8 +2227,13 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
   "Factorization Tricks for LSTM Networks", ICLR 2017 workshop.
   """
 
-  def __init__(self, num_units, initializer=None, num_proj=None,
-               number_of_groups=1, forget_bias=1.0, activation=math_ops.tanh,
+  def __init__(self,
+               num_units,
+               initializer=None,
+               num_proj=None,
+               number_of_groups=1,
+               forget_bias=1.0,
+               activation=math_ops.tanh,
                reuse=None):
     """Initialize the parameters of G-LSTM cell.
 
@@ -2230,11 +2270,15 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
     if self._num_proj:
       if self._num_proj % self._number_of_groups != 0:
         raise ValueError("num_proj must be divisible by number_of_groups")
-      self._group_shape = [int(self._num_proj / self._number_of_groups),
-                           int(self._num_units / self._number_of_groups)]
+      self._group_shape = [
+          int(self._num_proj / self._number_of_groups),
+          int(self._num_units / self._number_of_groups)
+      ]
     else:
-      self._group_shape = [int(self._num_units / self._number_of_groups),
-                           int(self._num_units / self._number_of_groups)]
+      self._group_shape = [
+          int(self._num_units / self._number_of_groups),
+          int(self._num_units / self._number_of_groups)
+      ]
 
     if num_proj:
       self._state_size = rnn_cell_impl.LSTMStateTuple(num_units, num_proj)
@@ -2242,7 +2286,7 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
     else:
       self._state_size = rnn_cell_impl.LSTMStateTuple(num_units, num_units)
       self._output_size = num_units
-    self._linear1 = None
+    self._linear1 = [None] * number_of_groups
     self._linear2 = None
 
   @property
@@ -2266,10 +2310,11 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
       subset of inputs corresponding to group "group_id",
       a Tensor, 2D, [batch x num_units/number_of_groups]
     """
-    return array_ops.slice(input_=inputs,
-                           begin=[0, group_id * group_size],
-                           size=[self._batch_size, group_size],
-                           name=("GLSTM_group%d_input_generation" % group_id))
+    return array_ops.slice(
+        input_=inputs,
+        begin=[0, group_id * group_size],
+        size=[self._batch_size, group_size],
+        name=("GLSTM_group%d_input_generation" % group_id))
 
   def call(self, inputs, state):
     """Run one step of G-LSTM.
@@ -2308,13 +2353,18 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
       for group_id in range(self._number_of_groups):
         with vs.variable_scope("group%d" % group_id):
           x_g_id = array_ops.concat(
-            [self._get_input_for_group(inputs, group_id,
-                                       self._group_shape[0]),
-             self._get_input_for_group(m_prev, group_id,
-                                       self._group_shape[0])], axis=1)
-          if self._linear1 is None:
-            self._linear1 = _Linear(x_g_id, 4 * self._group_shape[1], False)
-          R_k = self._linear1(x_g_id)  # pylint: disable=invalid-name
+              [
+                  self._get_input_for_group(inputs, group_id,
+                                            self._group_shape[0]),
+                  self._get_input_for_group(m_prev, group_id,
+                                            self._group_shape[0])
+              ],
+              axis=1)
+          linear = self._linear1[group_id]
+          if linear is None:
+            linear = _Linear(x_g_id, 4 * self._group_shape[1], False)
+            self._linear1[group_id] = linear
+          R_k = linear(x_g_id)  # pylint: disable=invalid-name
           i_k, j_k, f_k, o_k = array_ops.split(R_k, 4, 1)
 
         i_parts.append(i_k)
@@ -2322,34 +2372,35 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
         f_parts.append(f_k)
         o_parts.append(o_k)
 
-      bi = vs.get_variable(name="bias_i",
-                           shape=[self._num_units],
-                           dtype=dtype,
-                           initializer=
-                           init_ops.constant_initializer(0.0, dtype=dtype))
-      bj = vs.get_variable(name="bias_j",
-                           shape=[self._num_units],
-                           dtype=dtype,
-                           initializer=
-                           init_ops.constant_initializer(0.0, dtype=dtype))
-      bf = vs.get_variable(name="bias_f",
-                           shape=[self._num_units],
-                           dtype=dtype,
-                           initializer=
-                           init_ops.constant_initializer(0.0, dtype=dtype))
-      bo = vs.get_variable(name="bias_o",
-                           shape=[self._num_units],
-                           dtype=dtype,
-                           initializer=
-                           init_ops.constant_initializer(0.0, dtype=dtype))
+      bi = vs.get_variable(
+          name="bias_i",
+          shape=[self._num_units],
+          dtype=dtype,
+          initializer=init_ops.constant_initializer(0.0, dtype=dtype))
+      bj = vs.get_variable(
+          name="bias_j",
+          shape=[self._num_units],
+          dtype=dtype,
+          initializer=init_ops.constant_initializer(0.0, dtype=dtype))
+      bf = vs.get_variable(
+          name="bias_f",
+          shape=[self._num_units],
+          dtype=dtype,
+          initializer=init_ops.constant_initializer(0.0, dtype=dtype))
+      bo = vs.get_variable(
+          name="bias_o",
+          shape=[self._num_units],
+          dtype=dtype,
+          initializer=init_ops.constant_initializer(0.0, dtype=dtype))
 
       i = nn_ops.bias_add(array_ops.concat(i_parts, axis=1), bi)
       j = nn_ops.bias_add(array_ops.concat(j_parts, axis=1), bj)
       f = nn_ops.bias_add(array_ops.concat(f_parts, axis=1), bf)
       o = nn_ops.bias_add(array_ops.concat(o_parts, axis=1), bo)
 
-    c = (math_ops.sigmoid(f + self._forget_bias) * c_prev +
-         math_ops.sigmoid(i) * math_ops.tanh(j))
+    c = (
+        math_ops.sigmoid(f + self._forget_bias) * c_prev +
+        math_ops.sigmoid(i) * math_ops.tanh(j))
     m = math_ops.sigmoid(o) * self._activation(c)
 
     if self._num_proj is not None:
@@ -2630,3 +2681,342 @@ class LayerNormLSTMCell(rnn_cell_impl.RNNCell):
 
     new_state = (rnn_cell_impl.LSTMStateTuple(c, m))
     return m, new_state
+
+
+class SRUCell(rnn_cell_impl.LayerRNNCell):
+  """SRU, Simple Recurrent Unit
+
+     Implementation based on
+     Training RNNs as Fast as CNNs (cf. https://arxiv.org/abs/1709.02755).
+
+     This variation of RNN cell is characterized by the simplified data
+     dependence
+     between hidden states of two consecutive time steps. Traditionally, hidden
+     states from a cell at time step t-1 needs to be multiplied with a matrix
+     W_hh before being fed into the ensuing cell at time step t.
+     This flavor of RNN replaces the matrix multiplication between h_{t-1}
+     and W_hh with a pointwise multiplication, resulting in performance
+     gain.
+
+  Args:
+    num_units: int, The number of units in the SRU cell.
+    activation: Nonlinearity to use.  Default: `tanh`.
+    reuse: (optional) Python boolean describing whether to reuse variables
+      in an existing scope.  If not `True`, and the existing scope already has
+      the given variables, an error is raised.
+    name: (optional) String, the name of the layer. Layers with the same name
+      will share weights, but to avoid mistakes we require reuse=True in such
+      cases.
+  """
+
+  def __init__(self, num_units, activation=None, reuse=None, name=None):
+    super(SRUCell, self).__init__(_reuse=reuse, name=name)
+    self._num_units = num_units
+    self._activation = activation or math_ops.tanh
+
+    # Restrict inputs to be 2-dimensional matrices
+    self.input_spec = base_layer.InputSpec(ndim=2)
+
+  @property
+  def state_size(self):
+    return self._num_units
+
+  @property
+  def output_size(self):
+    return self._num_units
+
+  def build(self, inputs_shape):
+    if inputs_shape[1].value is None:
+      raise ValueError(
+          "Expected inputs.shape[-1] to be known, saw shape: %s" % inputs_shape)
+
+    input_depth = inputs_shape[1].value
+
+    self._kernel = self.add_variable(
+        rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+        shape=[input_depth, 4 * self._num_units])
+
+    self._bias = self.add_variable(
+        rnn_cell_impl._BIAS_VARIABLE_NAME,
+        shape=[2 * self._num_units],
+        initializer=init_ops.constant_initializer(0.0, dtype=self.dtype))
+
+    self._built = True
+
+  def call(self, inputs, state):
+    """Simple recurrent unit (SRU) with num_units cells."""
+
+    U = math_ops.matmul(inputs, self._kernel)
+    x_bar, f_intermediate, r_intermediate, x_tx = array_ops.split(
+        value=U, num_or_size_splits=4, axis=1)
+
+    f_r = math_ops.sigmoid(
+        nn_ops.bias_add(
+            array_ops.concat([f_intermediate, r_intermediate], 1), self._bias))
+    f, r = array_ops.split(value=f_r, num_or_size_splits=2, axis=1)
+
+    c = f * state + (1.0 - f) * x_bar
+    h = r * self._activation(c) + (1.0 - r) * x_tx
+
+    return h, c
+
+
+class WeightNormLSTMCell(rnn_cell_impl.RNNCell):
+  """Weight normalized LSTM Cell. Adapted from `rnn_cell_impl.LSTMCell`.
+
+    The weight-norm implementation is based on:
+    https://arxiv.org/abs/1602.07868
+    Tim Salimans, Diederik P. Kingma.
+    Weight Normalization: A Simple Reparameterization to Accelerate
+    Training of Deep Neural Networks
+
+    The default LSTM implementation based on:
+    http://www.bioinf.jku.at/publications/older/2604.pdf
+    S. Hochreiter and J. Schmidhuber.
+    "Long Short-Term Memory". Neural Computation, 9(8):1735-1780, 1997.
+
+    The class uses optional peephole connections, optional cell clipping
+    and an optional projection layer.
+
+    The optional peephole implementation is based on:
+    https://research.google.com/pubs/archive/43905.pdf
+    Hasim Sak, Andrew Senior, and Francoise Beaufays.
+    "Long short-term memory recurrent neural network architectures for
+    large scale acoustic modeling." INTERSPEECH, 2014.
+  """
+
+  def __init__(self,
+               num_units,
+               norm=True,
+               use_peepholes=False,
+               cell_clip=None,
+               initializer=None,
+               num_proj=None,
+               proj_clip=None,
+               forget_bias=1,
+               activation=None,
+               reuse=None):
+    """Initialize the parameters of a weight-normalized LSTM cell.
+
+    Args:
+      num_units: int, The number of units in the LSTM cell
+      norm: If `True`, apply normalization to the weight matrices. If False,
+        the result is identical to that obtained from `rnn_cell_impl.LSTMCell`
+      use_peepholes: bool, set `True` to enable diagonal/peephole connections.
+      cell_clip: (optional) A float value, if provided the cell state is clipped
+        by this value prior to the cell output activation.
+      initializer: (optional) The initializer to use for the weight matrices.
+      num_proj: (optional) int, The output dimensionality for the projection
+        matrices.  If None, no projection is performed.
+      proj_clip: (optional) A float value.  If `num_proj > 0` and `proj_clip` is
+        provided, then the projected values are clipped elementwise to within
+        `[-proj_clip, proj_clip]`.
+      forget_bias: Biases of the forget gate are initialized by default to 1
+        in order to reduce the scale of forgetting at the beginning of
+        the training.
+      activation: Activation function of the inner states.  Default: `tanh`.
+      reuse: (optional) Python boolean describing whether to reuse variables
+        in an existing scope.  If not `True`, and the existing scope already has
+        the given variables, an error is raised.
+    """
+    super(WeightNormLSTMCell, self).__init__(_reuse=reuse)
+
+    self._scope = "wn_lstm_cell"
+    self._num_units = num_units
+    self._norm = norm
+    self._initializer = initializer
+    self._use_peepholes = use_peepholes
+    self._cell_clip = cell_clip
+    self._num_proj = num_proj
+    self._proj_clip = proj_clip
+    self._activation = activation or math_ops.tanh
+    self._forget_bias = forget_bias
+
+    self._weights_variable_name = "kernel"
+    self._bias_variable_name = "bias"
+
+    if num_proj:
+      self._state_size = rnn_cell_impl.LSTMStateTuple(num_units, num_proj)
+      self._output_size = num_proj
+    else:
+      self._state_size = rnn_cell_impl.LSTMStateTuple(num_units, num_units)
+      self._output_size = num_units
+
+  @property
+  def state_size(self):
+    return self._state_size
+
+  @property
+  def output_size(self):
+    return self._output_size
+
+  def _normalize(self, weight, name):
+    """Apply weight normalization.
+
+    Args:
+      weight: a 2D tensor with known number of columns.
+      name: string, variable name for the normalizer.
+    Returns:
+      A tensor with the same shape as `weight`.
+    """
+
+    output_size = weight.get_shape().as_list()[1]
+    g = vs.get_variable(name, [output_size], dtype=weight.dtype)
+    return nn_impl.l2_normalize(weight, dim=0) * g
+
+  def _linear(self,
+              args,
+              output_size,
+              norm,
+              bias,
+              bias_initializer=None,
+              kernel_initializer=None):
+    """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.
+
+    Args:
+      args: a 2D Tensor or a list of 2D, batch x n, Tensors.
+      output_size: int, second dimension of W[i].
+      bias: boolean, whether to add a bias term or not.
+      bias_initializer: starting value to initialize the bias
+        (default is all zeros).
+      kernel_initializer: starting value to initialize the weight.
+
+    Returns:
+      A 2D Tensor with shape [batch x output_size] equal to
+      sum_i(args[i] * W[i]), where W[i]s are newly created matrices.
+
+    Raises:
+      ValueError: if some of the arguments has unspecified or wrong shape.
+    """
+    if args is None or (nest.is_sequence(args) and not args):
+      raise ValueError("`args` must be specified")
+    if not nest.is_sequence(args):
+      args = [args]
+
+    # Calculate the total size of arguments on dimension 1.
+    total_arg_size = 0
+    shapes = [a.get_shape() for a in args]
+    for shape in shapes:
+      if shape.ndims != 2:
+        raise ValueError("linear is expecting 2D arguments: %s" % shapes)
+      if shape[1].value is None:
+        raise ValueError("linear expects shape[1] to be provided for shape %s, "
+                         "but saw %s" % (shape, shape[1]))
+      else:
+        total_arg_size += shape[1].value
+
+    dtype = [a.dtype for a in args][0]
+
+    # Now the computation.
+    scope = vs.get_variable_scope()
+    with vs.variable_scope(scope) as outer_scope:
+      weights = vs.get_variable(
+          self._weights_variable_name, [total_arg_size, output_size],
+          dtype=dtype,
+          initializer=kernel_initializer)
+      if norm:
+        wn = []
+        st = 0
+        with ops.control_dependencies(None):
+          for i in range(len(args)):
+            en = st + shapes[i][1].value
+            wn.append(
+                self._normalize(weights[st:en, :], name="norm_{}".format(i)))
+            st = en
+
+          weights = array_ops.concat(wn, axis=0)
+
+      if len(args) == 1:
+        res = math_ops.matmul(args[0], weights)
+      else:
+        res = math_ops.matmul(array_ops.concat(args, 1), weights)
+      if not bias:
+        return res
+
+      with vs.variable_scope(outer_scope) as inner_scope:
+        inner_scope.set_partitioner(None)
+        if bias_initializer is None:
+          bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype)
+
+        biases = vs.get_variable(
+            self._bias_variable_name, [output_size],
+            dtype=dtype,
+            initializer=bias_initializer)
+
+      return nn_ops.bias_add(res, biases)
+
+  def call(self, inputs, state):
+    """Run one step of LSTM.
+
+    Args:
+      inputs: input Tensor, 2D, batch x num_units.
+      state: A tuple of state Tensors, both `2-D`, with column sizes
+       `c_state` and `m_state`.
+
+    Returns:
+      A tuple containing:
+
+      - A `2-D, [batch x output_dim]`, Tensor representing the output of the
+        LSTM after reading `inputs` when previous state was `state`.
+        Here output_dim is:
+           num_proj if num_proj was set,
+           num_units otherwise.
+      - Tensor(s) representing the new state of LSTM after reading `inputs` when
+        the previous state was `state`.  Same type and shape(s) as `state`.
+
+    Raises:
+      ValueError: If input size cannot be inferred from inputs via
+        static shape inference.
+    """
+    dtype = inputs.dtype
+    num_units = self._num_units
+    sigmoid = math_ops.sigmoid
+    c, h = state
+
+    input_size = inputs.get_shape().with_rank(2)[1]
+    if input_size.value is None:
+      raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
+
+    with vs.variable_scope(self._scope, initializer=self._initializer):
+
+      concat = self._linear(
+          [inputs, h], 4 * num_units, norm=self._norm, bias=True)
+
+      # i = input_gate, j = new_input, f = forget_gate, o = output_gate
+      i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1)
+
+      if self._use_peepholes:
+        w_f_diag = vs.get_variable("w_f_diag", shape=[num_units], dtype=dtype)
+        w_i_diag = vs.get_variable("w_i_diag", shape=[num_units], dtype=dtype)
+        w_o_diag = vs.get_variable("w_o_diag", shape=[num_units], dtype=dtype)
+
+        new_c = (
+            c * sigmoid(f + self._forget_bias + w_f_diag * c) +
+            sigmoid(i + w_i_diag * c) * self._activation(j))
+      else:
+        new_c = (
+            c * sigmoid(f + self._forget_bias) +
+            sigmoid(i) * self._activation(j))
+
+      if self._cell_clip is not None:
+        # pylint: disable=invalid-unary-operand-type
+        new_c = clip_ops.clip_by_value(new_c, -self._cell_clip, self._cell_clip)
+        # pylint: enable=invalid-unary-operand-type
+      if self._use_peepholes:
+        new_h = sigmoid(o + w_o_diag * new_c) * self._activation(new_c)
+      else:
+        new_h = sigmoid(o) * self._activation(new_c)
+
+      if self._num_proj is not None:
+        with vs.variable_scope("projection"):
+          new_h = self._linear(
+              new_h, self._num_proj, norm=self._norm, bias=False)
+
+        if self._proj_clip is not None:
+          # pylint: disable=invalid-unary-operand-type
+          new_h = clip_ops.clip_by_value(new_h, -self._proj_clip,
+                                         self._proj_clip)
+          # pylint: enable=invalid-unary-operand-type
+
+      new_state = rnn_cell_impl.LSTMStateTuple(new_c, new_h)
+      return new_h, new_state
diff --git a/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py b/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py
index 5536a01328676e5fe01251fefdaaecb0f9569918..460e172a6d949804319b8833e34b6590f5fcf93b 100644
--- a/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py
+++ b/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py
@@ -128,10 +128,8 @@ RNN_NAME_REPLACEMENTS = collections.OrderedDict([
      'attention_cell_wrapper/attention/bias'),
     ############################################################################
     # contrib/legacy_seq2seq/python/ops/seq2seq.py
-    ('attention_decoder/weights',
-     'attention_decoder/kernel'),
-    ('attention_decoder/biases',
-     'attention_decoder/bias'),
+    ('attention_decoder/weights', 'attention_decoder/kernel'),
+    ('attention_decoder/biases', 'attention_decoder/bias'),
     ('attention_decoder/Attention_0/weights',
      'attention_decoder/Attention_0/kernel'),
     ('attention_decoder/Attention_0/biases',
@@ -140,6 +138,19 @@ RNN_NAME_REPLACEMENTS = collections.OrderedDict([
      'attention_decoder/AttnOutputProjection/kernel'),
     ('attention_decoder/AttnOutputProjection/biases',
      'attention_decoder/AttnOutputProjection/bias'),
+    # contrib/legacy_seq2seq/python/ops/seq2seq.py before cl/140060366
+    ('attention_decoder/Attention_0/Linear/Bias',
+     'attention_decoder/Attention_0/bias'),
+    ('attention_decoder/Attention_0/Linear/Matrix',
+     'attention_decoder/Attention_0/kernel'),
+    ('attention_decoder/AttnOutputProjection/Linear/Bias',
+     'attention_decoder/AttnOutputProjection/bias'),
+    ('attention_decoder/AttnOutputProjection/Linear/Matrix',
+     'attention_decoder/AttnOutputProjection/kernel'),
+    ('attention_decoder/LSTMCell/B', 'attention_decoder/lstm_cell/bias'),
+    ('attention_decoder/LSTMCell/W_0', 'attention_decoder/lstm_cell/kernel'),
+    ('attention_decoder/Linear/Bias', 'attention_decoder/bias'),
+    ('attention_decoder/Linear/Matrix', 'attention_decoder/kernel')
 ])
 
 _RNN_SHARDED_NAME_REPLACEMENTS = collections.OrderedDict([
diff --git a/tensorflow/contrib/rnn/python/tools/checkpoint_convert_test.py b/tensorflow/contrib/rnn/python/tools/checkpoint_convert_test.py
index a9e79494639418c22b7380b5b78092052fbf305d..b4785ee395a2452d9595d81c3bdb88711a8fe66a 100644
--- a/tensorflow/contrib/rnn/python/tools/checkpoint_convert_test.py
+++ b/tensorflow/contrib/rnn/python/tools/checkpoint_convert_test.py
@@ -67,7 +67,7 @@ class CheckpointConvertTest(test.TestCase):
         self._old_ckpt_path, self._new_ckpt_path)
     self.assertTrue(glob.glob(self._new_ckpt_path + "*"))
     self.assertItemsEqual(
-        ["a"] + list(checkpoint_convert.RNN_NAME_REPLACEMENTS.values()),
+        set(checkpoint_convert.RNN_NAME_REPLACEMENTS.values()).union(["a"]),
         new_var_map.keys())
     self.assertEqual(checkpoint_convert.RNN_NAME_REPLACEMENTS, conversion_map)
 
diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD
index 20be819e07d0e47a0b24b5cc2548727322093e50..245fe07f2bcdaddb2bc47c0e1234dc1f19bd85e3 100644
--- a/tensorflow/contrib/saved_model/BUILD
+++ b/tensorflow/contrib/saved_model/BUILD
@@ -82,22 +82,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "utils_test",
-    size = "small",
-    srcs = ["python/saved_model/utils_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":saved_model_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/saved_model:loader",
-        "//tensorflow/python/saved_model:signature_constants",
-        "//tensorflow/python/saved_model:tag_constants",
-    ],
-)
-
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/saved_model/cc/saved_model/signature_def_utils.h b/tensorflow/contrib/saved_model/cc/saved_model/signature_def_utils.h
index c0df224bc8cffcb485db38dea270600c71070dff..b732cdd41e5c39793c17fa920c115e2bbe96f5de 100644
--- a/tensorflow/contrib/saved_model/cc/saved_model/signature_def_utils.h
+++ b/tensorflow/contrib/saved_model/cc/saved_model/signature_def_utils.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Helpers for working with the SignatureDefs of TensorFlow SavedModels.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_SAVED_MODEL_CC_SAVED_MODEL_SIGNATURE_DEF_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_SAVED_MODEL_CC_SAVED_MODEL_SIGNATURE_DEF_UTILS_H_
+#ifndef TENSORFLOW_CONTRIB_SAVED_MODEL_CC_SAVED_MODEL_SIGNATURE_DEF_UTILS_H_
+#define TENSORFLOW_CONTRIB_SAVED_MODEL_CC_SAVED_MODEL_SIGNATURE_DEF_UTILS_H_
 
 #include <string>
 #include <utility>
@@ -66,4 +66,4 @@ Status FindOutputTensorNameByKey(const SignatureDef& signature_def,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_SAVED_MODEL_CC_SAVED_MODEL_SIGNATURE_DEF_UTILS_H_
+#endif  // TENSORFLOW_CONTRIB_SAVED_MODEL_CC_SAVED_MODEL_SIGNATURE_DEF_UTILS_H_
diff --git a/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h b/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h
index 693b02dc437afdf14c38e4224c5469bb3e569540..34da8c82cdab9b6f82af328c49a365ae1cb951ed 100644
--- a/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h
+++ b/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_SEQ2SEQ_KERNELS_BEAM_SEARCH_OPS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_SEQ2SEQ_KERNELS_BEAM_SEARCH_OPS_H_
+#ifndef TENSORFLOW_CONTRIB_SEQ2SEQ_KERNELS_BEAM_SEARCH_OPS_H_
+#define TENSORFLOW_CONTRIB_SEQ2SEQ_KERNELS_BEAM_SEARCH_OPS_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -38,4 +38,4 @@ struct GatherTree {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_SEQ2SEQ_KERNELS_BEAM_SEARCH_OPS_H_
+#endif  // TENSORFLOW_CONTRIB_SEQ2SEQ_KERNELS_BEAM_SEARCH_OPS_H_
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/__init__.py b/tensorflow/contrib/seq2seq/python/kernel_tests/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..52e83069cb0c68b510da46149248369dce376647 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/__init__.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index 01a5540121ae9ebf22de0493daadff6c7710d29a..b427dff88b2d586ccf8c512bb498cdaf879ac781 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -69,7 +69,7 @@ class AttentionWrapperTest(test.TestCase):
   def assertAllCloseOrEqual(self, x, y, **kwargs):
     if isinstance(x, np.ndarray) or isinstance(x, float):
       return super(AttentionWrapperTest, self).assertAllClose(
-          x, y, atol=1e-4, **kwargs)
+          x, y, atol=1e-3, **kwargs)
     else:
       self.assertAllEqual(x, y, **kwargs)
 
@@ -80,6 +80,28 @@ class AttentionWrapperTest(test.TestCase):
     self.assertEqual(state.time, None)
     self.assertEqual(new_state.time, 1)
 
+  def testAttentionWrapperStateShapePropgation(self):
+    batch_size = 5
+    max_time = 5
+    num_units = 5
+
+    memory = random_ops.random_uniform(
+        [batch_size, max_time, num_units], seed=1)
+    mechanism = wrapper.LuongAttention(num_units, memory)
+    cell = wrapper.AttentionWrapper(rnn_cell.LSTMCell(num_units), mechanism)
+
+    # Create zero state with static batch size.
+    static_state = cell.zero_state(batch_size, dtypes.float32)
+    # Create zero state without static batch size.
+    state = cell.zero_state(array_ops.shape(memory)[0], dtypes.float32)
+
+    state = static_state.clone(
+        cell_state=state.cell_state, attention=state.attention)
+
+    self.assertEqual(state.cell_state.c.shape, static_state.cell_state.c.shape)
+    self.assertEqual(state.cell_state.h.shape, static_state.cell_state.h.shape)
+    self.assertEqual(state.attention.shape, static_state.attention.shape)
+
   def _testWithAttention(self,
                          create_attention_mechanism,
                          expected_final_output,
@@ -254,6 +276,8 @@ class AttentionWrapperTest(test.TestCase):
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.125),
         alignment_history=())
     expected_final_alignment_history = ResultSummary(
         shape=(3, 5, 8), dtype=dtype('float32'), mean=0.12500001)
@@ -274,7 +298,7 @@ class AttentionWrapperTest(test.TestCase):
         rnn_output=ResultSummary(
             shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00597103),
         sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=1.4))
+            shape=(5, 3), dtype=dtype('int32'), mean=1.6))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
@@ -286,6 +310,8 @@ class AttentionWrapperTest(test.TestCase):
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.125),
         alignment_history=())
 
     self._testWithAttention(
@@ -301,7 +327,7 @@ class AttentionWrapperTest(test.TestCase):
         rnn_output=ResultSummary(
             shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0052615386),
         sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=1.4666666666666666))
+            shape=(5, 3), dtype=dtype('int32'), mean=1.3333333333))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
@@ -313,6 +339,8 @@ class AttentionWrapperTest(test.TestCase):
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.125),
         alignment_history=())
 
     self._testWithAttention(
@@ -330,7 +358,7 @@ class AttentionWrapperTest(test.TestCase):
         rnn_output=ResultSummary(
             shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0052615386),
         sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=1.4666666666666666))
+            shape=(5, 3), dtype=dtype('int32'), mean=1.3333333333333333))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
@@ -342,6 +370,8 @@ class AttentionWrapperTest(test.TestCase):
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.125),
         alignment_history=())
 
     self._testWithAttention(
@@ -370,6 +400,8 @@ class AttentionWrapperTest(test.TestCase):
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.125),
         alignment_history=())
 
     self._testWithAttention(
@@ -545,6 +577,8 @@ class AttentionWrapperTest(test.TestCase):
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.032228071),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.032228071),
         alignment_history=())
     expected_final_alignment_history = ResultSummary(
         shape=(3, 5, 8), dtype=dtype('float32'), mean=0.050430927)
@@ -566,7 +600,7 @@ class AttentionWrapperTest(test.TestCase):
         rnn_output=ResultSummary(
             shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0025896581),
         sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=1.8666666666666667))
+            shape=(5, 3), dtype=dtype('int32'), mean=1.6))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
@@ -578,9 +612,11 @@ class AttentionWrapperTest(test.TestCase):
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.028698336),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.028698336),
         alignment_history=())
     expected_final_alignment_history = ResultSummary(
-        shape=(3, 5, 8), dtype=dtype('float32'), mean=0.046009291)
+        shape=(3, 5, 8), dtype=dtype('float32'), mean=0.04865776002407074)
 
     self._testWithAttention(
         create_attention_mechanism,
@@ -599,7 +635,8 @@ class AttentionWrapperTest(test.TestCase):
           random_ops.random_normal((b, t, u)),
           mode='hard')
       # Just feed previous attention as [1, 0, 0, ...]
-      attn = a(random_ops.random_normal((b, d)), array_ops.one_hot([0]*b, t))
+      attn, unused_state = a(
+          random_ops.random_normal((b, d)), array_ops.one_hot([0]*b, t))
       sess.run(variables.global_variables_initializer())
       attn_out = attn.eval()
       # All values should be 0 or 1
@@ -629,6 +666,8 @@ class AttentionWrapperTest(test.TestCase):
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.032198936),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.032198936),
         alignment_history=())
     expected_final_alignment_history = ResultSummary(
         shape=(3, 5, 8), dtype=dtype('float32'), mean=0.050387777)
@@ -663,6 +702,8 @@ class AttentionWrapperTest(test.TestCase):
         time=3,
         alignments=ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.032198936),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.032198936),
         alignment_history=())
     expected_final_alignment_history = ResultSummary(
         shape=(3, 5, 8), dtype=dtype('float32'), mean=0.050387777)
@@ -697,6 +738,9 @@ class AttentionWrapperTest(test.TestCase):
         alignments=(
             ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
             ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
+        attention_state=(
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
         alignment_history=())
 
     expected_final_alignment_history = (
@@ -723,7 +767,8 @@ class AttentionWrapperTest(test.TestCase):
           random_ops.random_normal((b, t, u)),
           mode='hard')
       # Just feed previous attention as [1, 0, 0, ...]
-      attn = a(random_ops.random_normal((b, d)), array_ops.one_hot([0]*b, t))
+      attn, unused_state = a(
+          random_ops.random_normal((b, d)), array_ops.one_hot([0]*b, t))
       sess.run(variables.global_variables_initializer())
       attn_out = attn.eval()
       # All values should be 0 or 1
@@ -738,9 +783,9 @@ class AttentionWrapperTest(test.TestCase):
 
     expected_final_output = BasicDecoderOutput(
         rnn_output=ResultSummary(
-            shape=(5, 3, 20), dtype=dtype('float32'), mean=0.11691988),
+            shape=(5, 3, 20), dtype=dtype('float32'), mean=0.11798714846372604),
         sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=7.2666666666666666))
+            shape=(5, 3), dtype=dtype('int32'), mean=7.933333333333334))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
@@ -748,11 +793,14 @@ class AttentionWrapperTest(test.TestCase):
             h=ResultSummary(
                 shape=(5, 9), dtype=dtype('float32'), mean=-0.0018835809)),
         attention=ResultSummary(
-            shape=(5, 20), dtype=dtype('float32'), mean=0.11680689),
+            shape=(5, 20), dtype=dtype('float32'), mean=0.11798714846372604),
         time=3,
         alignments=(
             ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
             ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
+        attention_state=(
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125)),
         alignment_history=())
     expected_final_alignment_history = (
         ResultSummary(shape=(3, 5, 8), dtype=dtype('float32'), mean=0.125),
@@ -787,6 +835,8 @@ class AttentionWrapperTest(test.TestCase):
         time=3,
         alignments=(
             ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),),
+        attention_state=(
+            ResultSummary(shape=(5, 8), dtype=dtype('float32'), mean=0.125),),
         alignment_history=())
 
     expected_final_alignment_history = (
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
index d2beac5f31460ec1c0d978a9f6fcd0e0f09cb9b4..926554031775202d7f7d9018cf6ae4efb34fe96b 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
@@ -46,20 +46,18 @@ class TestGatherTree(test.TestCase):
 
     # create (batch_size, max_time, beam_width) matrix and transpose it
     predicted_ids = np.array(
-        [[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
-         [[2, 3, 4], [5, 6, 7], [8, 9, 10]]],
+        [[[1, 2, 3], [4, 5, 6], [7, 8, 9]], [[2, 3, 4], [5, 6, 7], [8, 9, 10]]],
         dtype=np.int32).transpose([1, 0, 2])
     parent_ids = np.array(
-        [[[0, 0, 0], [0, 1, 1], [2, 1, 2]],
-         [[0, 0, 0], [1, 2, 0], [2, 1, 1]]],
+        [[[0, 0, 0], [0, 1, 1], [2, 1, 2]], [[0, 0, 0], [1, 2, 0], [2, 1, 1]]],
         dtype=np.int32).transpose([1, 0, 2])
 
     # sequence_lengths is shaped (batch_size = 3)
     max_sequence_lengths = [3, 3]
 
-    expected_result = np.array(
-        [[[2, 2, 2], [6, 5, 6], [7, 8, 9]],
-         [[2, 4, 4], [7, 6, 6], [8, 9, 10]]]).transpose([1, 0, 2])
+    expected_result = np.array([[[2, 2, 2], [6, 5, 6], [7, 8, 9]],
+                                [[2, 4, 4], [7, 6, 6],
+                                 [8, 9, 10]]]).transpose([1, 0, 2])
 
     res = beam_search_ops.gather_tree(
         predicted_ids,
@@ -157,8 +155,8 @@ class TestBeamStep(test.TestCase):
     self.assertAllEqual(outputs_.predicted_ids, [[3, 3, 2], [2, 2, 1]])
     self.assertAllEqual(outputs_.parent_ids, [[1, 0, 0], [2, 1, 0]])
     self.assertAllEqual(next_state_.lengths, [[3, 3, 3], [3, 3, 3]])
-    self.assertAllEqual(next_state_.finished, [[False, False, False],
-                                               [False, False, False]])
+    self.assertAllEqual(next_state_.finished,
+                        [[False, False, False], [False, False, False]])
 
     expected_log_probs = []
     expected_log_probs.append(state_.log_probs[0][[1, 0, 0]])
@@ -212,8 +210,8 @@ class TestBeamStep(test.TestCase):
     self.assertAllEqual(outputs_.parent_ids, [[1, 0, 0], [1, 2, 0]])
     self.assertAllEqual(outputs_.predicted_ids, [[0, 3, 2], [2, 0, 1]])
     self.assertAllEqual(next_state_.lengths, [[1, 3, 3], [3, 1, 3]])
-    self.assertAllEqual(next_state_.finished, [[True, False, False],
-                                               [False, True, False]])
+    self.assertAllEqual(next_state_.finished,
+                        [[True, False, False], [False, True, False]])
 
     expected_log_probs = []
     expected_log_probs.append(state_.log_probs[0][[1, 0, 0]])
@@ -225,6 +223,100 @@ class TestBeamStep(test.TestCase):
     self.assertAllEqual(next_state_.log_probs, expected_log_probs)
 
 
+class TestLargeBeamStep(test.TestCase):
+  """Tests large beam step.
+
+  Tests a single step of beam search in such case that beam size is larger than
+  vocabulary size.
+  """
+
+  def setUp(self):
+    super(TestLargeBeamStep, self).setUp()
+    self.batch_size = 2
+    self.beam_width = 8
+    self.vocab_size = 5
+    self.end_token = 0
+    self.length_penalty_weight = 0.6
+
+  def test_step(self):
+
+    def get_probs():
+      """this simulates the initialize method in BeamSearchDecoder."""
+      log_prob_mask = array_ops.one_hot(
+          array_ops.zeros([self.batch_size], dtype=dtypes.int32),
+          depth=self.beam_width,
+          on_value=True,
+          off_value=False,
+          dtype=dtypes.bool)
+
+      log_prob_zeros = array_ops.zeros(
+          [self.batch_size, self.beam_width], dtype=dtypes.float32)
+      log_prob_neg_inf = array_ops.ones(
+          [self.batch_size, self.beam_width], dtype=dtypes.float32) * -np.Inf
+
+      log_probs = array_ops.where(log_prob_mask, log_prob_zeros,
+                                  log_prob_neg_inf)
+      return log_probs
+
+    log_probs = get_probs()
+    dummy_cell_state = array_ops.zeros([self.batch_size, self.beam_width])
+
+    # pylint: disable=invalid-name
+    _finished = array_ops.one_hot(
+        array_ops.zeros([self.batch_size], dtype=dtypes.int32),
+        depth=self.beam_width,
+        on_value=False,
+        off_value=True,
+        dtype=dtypes.bool)
+    _lengths = np.zeros([self.batch_size, self.beam_width], dtype=np.int64)
+    _lengths[:, 0] = 2
+    _lengths = constant_op.constant(_lengths, dtype=dtypes.int64)
+
+    beam_state = beam_search_decoder.BeamSearchDecoderState(
+        cell_state=dummy_cell_state,
+        log_probs=log_probs,
+        lengths=_lengths,
+        finished=_finished)
+
+    logits_ = np.full([self.batch_size, self.beam_width, self.vocab_size],
+                      0.0001)
+    logits_[0, 0, 2] = 1.9
+    logits_[0, 0, 3] = 2.1
+    logits_[0, 1, 3] = 3.1
+    logits_[0, 1, 4] = 0.9
+    logits_[1, 0, 1] = 0.5
+    logits_[1, 1, 2] = 2.7
+    logits_[1, 2, 2] = 10.0
+    logits_[1, 2, 3] = 0.2
+    logits = constant_op.constant(logits_, dtype=dtypes.float32)
+    log_probs = nn_ops.log_softmax(logits)
+
+    outputs, next_beam_state = beam_search_decoder._beam_search_step(
+        time=2,
+        logits=logits,
+        next_cell_state=dummy_cell_state,
+        beam_state=beam_state,
+        batch_size=ops.convert_to_tensor(self.batch_size),
+        beam_width=self.beam_width,
+        end_token=self.end_token,
+        length_penalty_weight=self.length_penalty_weight)
+
+    with self.test_session() as sess:
+      outputs_, next_state_, _, _ = sess.run(
+          [outputs, next_beam_state, beam_state, log_probs])
+
+    self.assertEqual(outputs_.predicted_ids[0, 0], 3)
+    self.assertEqual(outputs_.predicted_ids[0, 1], 2)
+    self.assertEqual(outputs_.predicted_ids[1, 0], 1)
+    neg_inf = -np.Inf
+    self.assertAllEqual(
+        next_state_.log_probs[:, -3:],
+        [[neg_inf, neg_inf, neg_inf], [neg_inf, neg_inf, neg_inf]])
+    self.assertEqual((next_state_.log_probs[:, :-3] > neg_inf).all(), True)
+    self.assertEqual((next_state_.lengths[:, :-3] > 0).all(), True)
+    self.assertAllEqual(next_state_.lengths[:, -3:], [[0, 0, 0], [0, 0, 0]])
+
+
 class BeamSearchDecoderTest(test.TestCase):
 
   def _testDynamicDecodeRNN(self, time_major, has_attention):
@@ -250,8 +342,8 @@ class BeamSearchDecoderTest(test.TestCase):
       initial_state = cell.zero_state(batch_size, dtypes.float32)
       if has_attention:
         inputs = array_ops.placeholder_with_default(
-            np.random.randn(batch_size, decoder_max_time,
-                            input_depth).astype(np.float32),
+            np.random.randn(batch_size, decoder_max_time, input_depth).astype(
+                np.float32),
             shape=(None, None, input_depth))
         tiled_inputs = beam_search_decoder.tile_batch(
             inputs, multiplier=beam_width)
@@ -271,8 +363,7 @@ class BeamSearchDecoderTest(test.TestCase):
       cell_state = cell.zero_state(
           dtype=dtypes.float32, batch_size=batch_size_tensor * beam_width)
       if has_attention:
-        cell_state = cell_state.clone(
-            cell_state=initial_state)
+        cell_state = cell_state.clone(cell_state=initial_state)
       bsd = beam_search_decoder.BeamSearchDecoder(
           cell=cell,
           embedding=embedding,
diff --git a/tensorflow/contrib/seq2seq/python/ops/__init__.py b/tensorflow/contrib/seq2seq/python/ops/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..52e83069cb0c68b510da46149248369dce376647 100644
--- a/tensorflow/contrib/seq2seq/python/ops/__init__.py
+++ b/tensorflow/contrib/seq2seq/python/ops/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index e87ef413880e37e553c604ec8cfbaef307569682..0a53fd66dbe4d28ea102773b9c5bae50b9d18e9c 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -24,6 +24,7 @@ import math
 
 import numpy as np
 
+from tensorflow.contrib.framework.python.framework import tensor_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -61,7 +62,14 @@ _zero_state_tensors = rnn_cell_impl._zero_state_tensors  # pylint: disable=prote
 
 
 class AttentionMechanism(object):
-  pass
+
+  @property
+  def alignments_size(self):
+    raise NotImplementedError
+
+  @property
+  def state_size(self):
+    raise NotImplementedError
 
 
 def _prepare_memory(memory, memory_sequence_length, check_inner_dims_defined):
@@ -161,7 +169,7 @@ class _BaseAttentionMechanism(AttentionMechanism):
         tensor should be shaped `[batch_size, max_time, ...]`.
       probability_fn: A `callable`.  Converts the score and previous alignments
         to probabilities. Its signature should be:
-        `probabilities = probability_fn(score, previous_alignments)`.
+        `probabilities = probability_fn(score, state)`.
       memory_sequence_length (optional): Sequence lengths for the batch entries
         in memory.  If provided, the memory tensor rows are masked with zeros
         for values past the respective sequence lengths.
@@ -235,6 +243,10 @@ class _BaseAttentionMechanism(AttentionMechanism):
   def alignments_size(self):
     return self._alignments_size
 
+  @property
+  def state_size(self):
+    return self._alignments_size
+
   def initial_alignments(self, batch_size, dtype):
     """Creates the initial alignment values for the `AttentionWrapper` class.
 
@@ -254,6 +266,23 @@ class _BaseAttentionMechanism(AttentionMechanism):
     max_time = self._alignments_size
     return _zero_state_tensors(max_time, batch_size, dtype)
 
+  def initial_state(self, batch_size, dtype):
+    """Creates the initial state values for the `AttentionWrapper` class.
+
+    This is important for AttentionMechanisms that use the previous alignment
+    to calculate the alignment at the next time step (e.g. monotonic attention).
+
+    The default behavior is to return the same output as initial_alignments.
+
+    Args:
+      batch_size: `int32` scalar, the batch_size.
+      dtype: The `dtype`.
+
+    Returns:
+      A structure of all-zero tensors with shapes as described by `state_size`.
+    """
+    return self.initial_alignments(batch_size, dtype)
+
 
 def _luong_score(query, keys, scale):
   """Implements Luong-style (multiplicative) scoring function.
@@ -302,7 +331,7 @@ def _luong_score(query, keys, scale):
   # batched matmul on:
   #   [batch_size, 1, depth] . [batch_size, depth, max_time]
   # resulting in an output shape of:
-  #   [batch_time, 1, max_time].
+  #   [batch_size, 1, max_time].
   # we then squeeze out the center singleton dimension.
   score = math_ops.matmul(query, keys, transpose_b=True)
   score = array_ops.squeeze(score, [1])
@@ -381,13 +410,13 @@ class LuongAttention(_BaseAttentionMechanism):
     self._scale = scale
     self._name = name
 
-  def __call__(self, query, previous_alignments):
+  def __call__(self, query, state):
     """Score the query based on the keys and values.
 
     Args:
       query: Tensor of dtype matching `self.values` and shape
         `[batch_size, query_depth]`.
-      previous_alignments: Tensor of dtype matching `self.values` and shape
+      state: Tensor of dtype matching `self.values` and shape
         `[batch_size, alignments_size]`
         (`alignments_size` is memory's `max_time`).
 
@@ -398,8 +427,9 @@ class LuongAttention(_BaseAttentionMechanism):
     """
     with variable_scope.variable_scope(None, "luong_attention", [query]):
       score = _luong_score(query, self._keys, self._scale)
-    alignments = self._probability_fn(score, previous_alignments)
-    return alignments
+    alignments = self._probability_fn(score, state)
+    next_state = alignments
+    return alignments, next_state
 
 
 def _bahdanau_score(processed_query, keys, normalize):
@@ -526,13 +556,13 @@ class BahdanauAttention(_BaseAttentionMechanism):
     self._normalize = normalize
     self._name = name
 
-  def __call__(self, query, previous_alignments):
+  def __call__(self, query, state):
     """Score the query based on the keys and values.
 
     Args:
       query: Tensor of dtype matching `self.values` and shape
         `[batch_size, query_depth]`.
-      previous_alignments: Tensor of dtype matching `self.values` and shape
+      state: Tensor of dtype matching `self.values` and shape
         `[batch_size, alignments_size]`
         (`alignments_size` is memory's `max_time`).
 
@@ -544,8 +574,9 @@ class BahdanauAttention(_BaseAttentionMechanism):
     with variable_scope.variable_scope(None, "bahdanau_attention", [query]):
       processed_query = self.query_layer(query) if self.query_layer else query
       score = _bahdanau_score(processed_query, self._keys, self._normalize)
-    alignments = self._probability_fn(score, previous_alignments)
-    return alignments
+    alignments = self._probability_fn(score, state)
+    next_state = alignments
+    return alignments, next_state
 
 
 def safe_cumprod(x, *args, **kwargs):
@@ -805,13 +836,13 @@ class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
     self._name = name
     self._score_bias_init = score_bias_init
 
-  def __call__(self, query, previous_alignments):
+  def __call__(self, query, state):
     """Score the query based on the keys and values.
 
     Args:
       query: Tensor of dtype matching `self.values` and shape
         `[batch_size, query_depth]`.
-      previous_alignments: Tensor of dtype matching `self.values` and shape
+      state: Tensor of dtype matching `self.values` and shape
         `[batch_size, alignments_size]`
         (`alignments_size` is memory's `max_time`).
 
@@ -828,8 +859,9 @@ class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
           "attention_score_bias", dtype=processed_query.dtype,
           initializer=self._score_bias_init)
       score += score_bias
-    alignments = self._probability_fn(score, previous_alignments)
-    return alignments
+    alignments = self._probability_fn(score, state)
+    next_state = alignments
+    return alignments, next_state
 
 
 class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
@@ -892,8 +924,7 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
         _monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode,
         seed=sigmoid_noise_seed)
     super(LuongMonotonicAttention, self).__init__(
-        query_layer=layers_core.Dense(
-            num_units, name="query_layer", use_bias=False, dtype=dtype),
+        query_layer=None,
         memory_layer=layers_core.Dense(
             num_units, name="memory_layer", use_bias=False, dtype=dtype),
         memory=memory,
@@ -906,13 +937,13 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
     self._score_bias_init = score_bias_init
     self._name = name
 
-  def __call__(self, query, previous_alignments):
+  def __call__(self, query, state):
     """Score the query based on the keys and values.
 
     Args:
       query: Tensor of dtype matching `self.values` and shape
         `[batch_size, query_depth]`.
-      previous_alignments: Tensor of dtype matching `self.values` and shape
+      state: Tensor of dtype matching `self.values` and shape
         `[batch_size, alignments_size]`
         (`alignments_size` is memory's `max_time`).
 
@@ -928,14 +959,15 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
           "attention_score_bias", dtype=query.dtype,
           initializer=self._score_bias_init)
       score += score_bias
-    alignments = self._probability_fn(score, previous_alignments)
-    return alignments
+    alignments = self._probability_fn(score, state)
+    next_state = alignments
+    return alignments, next_state
 
 
 class AttentionWrapperState(
     collections.namedtuple("AttentionWrapperState",
                            ("cell_state", "attention", "time", "alignments",
-                            "alignment_history"))):
+                            "alignment_history", "attention_state"))):
   """`namedtuple` storing the state of a `AttentionWrapper`.
 
   Contains:
@@ -949,11 +981,18 @@ class AttentionWrapperState(
     - `alignment_history`: (if enabled) a single or tuple of `TensorArray`(s)
        containing alignment matrices from all time steps for each attention
        mechanism. Call `stack()` on each to convert to a `Tensor`.
+    - `attention_state`: A single or tuple of nested objects
+       containing attention mechanism state for each attention mechanism.
+       The objects may contain Tensors or TensorArrays.
   """
 
   def clone(self, **kwargs):
     """Clone this object, overriding components provided by kwargs.
 
+    The new state fields' shape must match original state fields' shape. This
+    will be validated, and original fields' shape will be propagated to new
+    fields.
+
     Example:
 
     ```python
@@ -969,7 +1008,16 @@ class AttentionWrapperState(
       A new `AttentionWrapperState` whose properties are the same as
       this one, except any overridden properties as provided in `kwargs`.
     """
-    return super(AttentionWrapperState, self)._replace(**kwargs)
+    def with_same_shape(old, new):
+      """Check and set new tensor's shape."""
+      if isinstance(old, ops.Tensor) and isinstance(new, ops.Tensor):
+        return tensor_util.with_same_shape(old, new)
+      return new
+
+    return nest.map_structure(
+        with_same_shape,
+        self,
+        super(AttentionWrapperState, self)._replace(**kwargs))
 
 
 def hardmax(logits, name=None):
@@ -993,11 +1041,11 @@ def hardmax(logits, name=None):
         math_ops.argmax(logits, -1), depth, dtype=logits.dtype)
 
 
-def _compute_attention(attention_mechanism, cell_output, previous_alignments,
+def _compute_attention(attention_mechanism, cell_output, attention_state,
                        attention_layer):
   """Computes the attention and alignments for a given attention_mechanism."""
-  alignments = attention_mechanism(
-      cell_output, previous_alignments=previous_alignments)
+  alignments, next_attention_state = attention_mechanism(
+      cell_output, state=attention_state)
 
   # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time]
   expanded_alignments = array_ops.expand_dims(alignments, 1)
@@ -1018,7 +1066,7 @@ def _compute_attention(attention_mechanism, cell_output, previous_alignments,
   else:
     attention = context
 
-  return attention, alignments
+  return attention, alignments, next_attention_state
 
 
 class AttentionWrapper(rnn_cell_impl.RNNCell):
@@ -1229,6 +1277,8 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         attention=self._attention_layer_size,
         alignments=self._item_or_tuple(
             a.alignments_size for a in self._attention_mechanisms),
+        attention_state=self._item_or_tuple(
+            a.state_size for a in self._attention_mechanisms),
         alignment_history=self._item_or_tuple(
             () for _ in self._attention_mechanisms))  # sometimes a TensorArray
 
@@ -1278,6 +1328,9 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
           alignments=self._item_or_tuple(
               attention_mechanism.initial_alignments(batch_size, dtype)
               for attention_mechanism in self._attention_mechanisms),
+          attention_state=self._item_or_tuple(
+              attention_mechanism.initial_state(batch_size, dtype)
+              for attention_mechanism in self._attention_mechanisms),
           alignment_history=self._item_or_tuple(
               tensor_array_ops.TensorArray(dtype=dtype, size=0,
                                            dynamic_size=True)
@@ -1339,33 +1392,36 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
           cell_output, name="checked_cell_output")
 
     if self._is_multi:
-      previous_alignments = state.alignments
+      previous_attention_state = state.attention_state
       previous_alignment_history = state.alignment_history
     else:
-      previous_alignments = [state.alignments]
+      previous_attention_state = [state.attention_state]
       previous_alignment_history = [state.alignment_history]
 
     all_alignments = []
     all_attentions = []
-    all_histories = []
+    all_attention_states = []
+    maybe_all_histories = []
     for i, attention_mechanism in enumerate(self._attention_mechanisms):
-      attention, alignments = _compute_attention(
-          attention_mechanism, cell_output, previous_alignments[i],
+      attention, alignments, next_attention_state = _compute_attention(
+          attention_mechanism, cell_output, previous_attention_state[i],
           self._attention_layers[i] if self._attention_layers else None)
       alignment_history = previous_alignment_history[i].write(
           state.time, alignments) if self._alignment_history else ()
 
+      all_attention_states.append(next_attention_state)
       all_alignments.append(alignments)
-      all_histories.append(alignment_history)
       all_attentions.append(attention)
+      maybe_all_histories.append(alignment_history)
 
     attention = array_ops.concat(all_attentions, 1)
     next_state = AttentionWrapperState(
         time=state.time + 1,
         cell_state=next_cell_state,
         attention=attention,
+        attention_state=self._item_or_tuple(all_attention_states),
         alignments=self._item_or_tuple(all_alignments),
-        alignment_history=self._item_or_tuple(all_histories))
+        alignment_history=self._item_or_tuple(maybe_all_histories))
 
     if self._output_attention:
       return attention, next_state
diff --git a/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py b/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py
index c7c4182f0d9a17dacebc1cda693cda6eaaf8451f..ed226239b860e2250072a28a5538b816642ec54b 100644
--- a/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py
@@ -90,7 +90,7 @@ class BasicDecoder(decoder.Decoder):
       output_shape_with_unknown_batch = nest.map_structure(
           lambda s: tensor_shape.TensorShape([None]).concatenate(s),
           size)
-      layer_output_shape = self._output_layer._compute_output_shape(  # pylint: disable=protected-access
+      layer_output_shape = self._output_layer.compute_output_shape(
           output_shape_with_unknown_batch)
       return nest.map_structure(lambda s: s[1:], layer_output_shape)
 
diff --git a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
index 5be0c92243da10af438be97fab982515266be1de..d6184d61095f727f9dcab56fe59e2601868c1624 100644
--- a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
-
 import numpy as np
 
 from tensorflow.contrib.seq2seq.python.ops import beam_search_ops
@@ -38,7 +37,6 @@ from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.util import nest
 
-
 __all__ = [
     "BeamSearchDecoderOutput",
     "BeamSearchDecoderState",
@@ -49,8 +47,8 @@ __all__ = [
 
 
 class BeamSearchDecoderState(
-    collections.namedtuple("BeamSearchDecoderState", ("cell_state", "log_probs",
-                                                      "finished", "lengths"))):
+    collections.namedtuple("BeamSearchDecoderState",
+                           ("cell_state", "log_probs", "finished", "lengths"))):
   pass
 
 
@@ -67,7 +65,8 @@ class FinalBeamSearchDecoderOutput(
 
   Args:
     predicted_ids: The final prediction. A tensor of shape
-      `[T, batch_size, beam_width]`.
+      `[batch_size, T, beam_width]` (or `[T, batch_size, beam_width]` if
+      `output_time_major` is True). Beams are ordered from best to worst.
     beam_search_decoder_output: An instance of `BeamSearchDecoderOutput` that
       describes the state of the beam search.
   """
@@ -85,11 +84,12 @@ def _tile_batch(t, multiplier):
   tiled_static_batch_size = (
       t.shape[0].value * multiplier if t.shape[0].value is not None else None)
   tiled = array_ops.tile(array_ops.expand_dims(t, 1), tiling)
-  tiled = array_ops.reshape(
-      tiled, array_ops.concat(([shape_t[0] * multiplier], shape_t[1:]), 0))
+  tiled = array_ops.reshape(tiled,
+                            array_ops.concat(
+                                ([shape_t[0] * multiplier], shape_t[1:]), 0))
   tiled.set_shape(
-      tensor_shape.TensorShape(
-          [tiled_static_batch_size]).concatenate(t.shape[1:]))
+      tensor_shape.TensorShape([tiled_static_batch_size]).concatenate(
+          t.shape[1:]))
   return tiled
 
 
@@ -197,8 +197,8 @@ class BeamSearchDecoder(decoder.Decoder):
     """
     if not rnn_cell_impl._like_rnncell(cell):  # pylint: disable=protected-access
       raise TypeError("cell must be an RNNCell, received: %s" % type(cell))
-    if (output_layer is not None
-        and not isinstance(output_layer, layers_base.Layer)):
+    if (output_layer is not None and
+        not isinstance(output_layer, layers_base.Layer)):
       raise TypeError(
           "output_layer must be a Layer, received: %s" % type(output_layer))
     self._cell = cell
@@ -223,13 +223,17 @@ class BeamSearchDecoder(decoder.Decoder):
     self._beam_width = beam_width
     self._length_penalty_weight = length_penalty_weight
     self._initial_cell_state = nest.map_structure(
-        self._maybe_split_batch_beams,
-        initial_state, self._cell.state_size)
+        self._maybe_split_batch_beams, initial_state, self._cell.state_size)
     self._start_tokens = array_ops.tile(
         array_ops.expand_dims(self._start_tokens, 1), [1, self._beam_width])
     self._start_inputs = self._embedding_fn(self._start_tokens)
-    self._finished = array_ops.zeros(
-        [self._batch_size, self._beam_width], dtype=dtypes.bool)
+
+    self._finished = array_ops.one_hot(
+        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
+        depth=self._beam_width,
+        on_value=False,
+        off_value=True,
+        dtype=dtypes.bool)
 
   @property
   def batch_size(self):
@@ -247,9 +251,8 @@ class BeamSearchDecoder(decoder.Decoder):
       # dimensions to get the output size of the rnn with the layer
       # applied to the top.
       output_shape_with_unknown_batch = nest.map_structure(
-          lambda s: tensor_shape.TensorShape([None]).concatenate(s),
-          size)
-      layer_output_shape = self._output_layer._compute_output_shape(  # pylint: disable=protected-access
+          lambda s: tensor_shape.TensorShape([None]).concatenate(s), size)
+      layer_output_shape = self._output_layer.compute_output_shape(
           output_shape_with_unknown_batch)
       return nest.map_structure(lambda s: s[1:], layer_output_shape)
 
@@ -297,11 +300,16 @@ class BeamSearchDecoder(decoder.Decoder):
     """
     finished, start_inputs = self._finished, self._start_inputs
 
+    log_probs = array_ops.one_hot(  # shape(batch_sz, beam_sz)
+        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
+        depth=self._beam_width,
+        on_value=0.0,
+        off_value=-np.Inf,
+        dtype=nest.flatten(self._initial_cell_state)[0].dtype)
+
     initial_state = BeamSearchDecoderState(
         cell_state=self._initial_cell_state,
-        log_probs=array_ops.zeros(
-            [self._batch_size, self._beam_width],
-            dtype=nest.flatten(self._initial_cell_state)[0].dtype),
+        log_probs=log_probs,
         finished=finished,
         lengths=array_ops.zeros(
             [self._batch_size, self._beam_width], dtype=dtypes.int64))
@@ -358,11 +366,12 @@ class BeamSearchDecoder(decoder.Decoder):
     t_shape = array_ops.shape(t)
     static_batch_size = tensor_util.constant_value(self._batch_size)
     batch_size_beam_width = (
-        None if static_batch_size is None
-        else static_batch_size * self._beam_width)
+        None
+        if static_batch_size is None else static_batch_size * self._beam_width)
     reshaped_t = array_ops.reshape(
-        t, array_ops.concat(
-            ([self._batch_size * self._beam_width], t_shape[2:]), 0))
+        t,
+        array_ops.concat(([self._batch_size * self._beam_width], t_shape[2:]),
+                         0))
     reshaped_t.set_shape(
         (tensor_shape.TensorShape([batch_size_beam_width]).concatenate(s)))
     return reshaped_t
@@ -391,8 +400,9 @@ class BeamSearchDecoder(decoder.Decoder):
       s = tensor_shape.TensorShape(s)
     t_shape = array_ops.shape(t)
     reshaped_t = array_ops.reshape(
-        t, array_ops.concat(
-            ([self._batch_size, self._beam_width], t_shape[1:]), 0))
+        t,
+        array_ops.concat(([self._batch_size, self._beam_width], t_shape[1:]),
+                         0))
     static_batch_size = tensor_util.constant_value(self._batch_size)
     expected_reshaped_shape = tensor_shape.TensorShape(
         [static_batch_size, self._beam_width]).concatenate(s)
@@ -402,8 +412,8 @@ class BeamSearchDecoder(decoder.Decoder):
                        "We expected it to have shape "
                        "(batch_size, beam_width, depth) == %s.  Perhaps you "
                        "forgot to create a zero_state with "
-                       "batch_size=encoder_batch_size * beam_width?"
-                       % (reshaped_t.shape, expected_reshaped_shape))
+                       "batch_size=encoder_batch_size * beam_width?" %
+                       (reshaped_t.shape, expected_reshaped_shape))
     reshaped_t.set_shape(expected_reshaped_shape)
     return reshaped_t
 
@@ -475,15 +485,13 @@ class BeamSearchDecoder(decoder.Decoder):
       cell_state = state.cell_state
       inputs = nest.map_structure(
           lambda inp: self._merge_batch_beams(inp, s=inp.shape[2:]), inputs)
-      cell_state = nest.map_structure(
-          self._maybe_merge_batch_beams,
-          cell_state, self._cell.state_size)
+      cell_state = nest.map_structure(self._maybe_merge_batch_beams, cell_state,
+                                      self._cell.state_size)
       cell_outputs, next_cell_state = self._cell(inputs, cell_state)
       cell_outputs = nest.map_structure(
           lambda out: self._split_batch_beams(out, out.shape[1:]), cell_outputs)
       next_cell_state = nest.map_structure(
-          self._maybe_split_batch_beams,
-          next_cell_state, self._cell.state_size)
+          self._maybe_split_batch_beams, next_cell_state, self._cell.state_size)
 
       if self._output_layer is not None:
         cell_outputs = self._output_layer(cell_outputs)
@@ -546,7 +554,8 @@ def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
   lengths_to_add = array_ops.one_hot(
       indices=array_ops.fill([batch_size, beam_width], end_token),
       depth=vocab_size,
-      on_value=np.int64(0), off_value=np.int64(1),
+      on_value=np.int64(0),
+      off_value=np.int64(1),
       dtype=dtypes.int64)
   add_mask = math_ops.to_int64(math_ops.logical_not(previously_finished))
   lengths_to_add *= array_ops.expand_dims(add_mask, 2)
@@ -562,18 +571,11 @@ def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
   time = ops.convert_to_tensor(time, name="time")
   # During the first time step we only consider the initial beam
   scores_shape = array_ops.shape(scores)
-  scores_flat = control_flow_ops.cond(
-      time > 0,
-      lambda: array_ops.reshape(scores, [batch_size, -1]),
-      lambda: scores[:, 0])
-  num_available_beam = control_flow_ops.cond(
-      time > 0, lambda: math_ops.reduce_prod(scores_shape[1:]),
-      lambda: math_ops.reduce_prod(scores_shape[2:]))
+  scores_flat = array_ops.reshape(scores, [batch_size, -1])
 
   # Pick the next beams according to the specified successors function
-  next_beam_size = math_ops.minimum(
-      ops.convert_to_tensor(beam_width, dtype=dtypes.int32, name="beam_width"),
-      num_available_beam)
+  next_beam_size = ops.convert_to_tensor(
+      beam_width, dtype=dtypes.int32, name="beam_width")
   next_beam_scores, word_indices = nn_ops.top_k(scores_flat, k=next_beam_size)
 
   next_beam_scores.set_shape([static_batch_size, beam_width])
@@ -592,11 +594,11 @@ def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
   #       name="next_beam_word_ids")
   # would be a lot cleaner but for reasons unclear, that hides the results of
   # the op which prevents capturing it with tfdbg debug ops.
-  raw_next_word_ids = math_ops.mod(word_indices, vocab_size,
-                                   name="next_beam_word_ids")
+  raw_next_word_ids = math_ops.mod(
+      word_indices, vocab_size, name="next_beam_word_ids")
   next_word_ids = math_ops.to_int32(raw_next_word_ids)
-  next_beam_ids = math_ops.to_int32(word_indices / vocab_size,
-                                    name="next_beam_parent_ids")
+  next_beam_ids = math_ops.to_int32(
+      word_indices / vocab_size, name="next_beam_parent_ids")
 
   # Append new ids to current predictions
   previously_finished = _tensor_gather_helper(
@@ -605,9 +607,10 @@ def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
       batch_size=batch_size,
       range_size=beam_width,
       gather_shape=[-1])
-  next_finished = math_ops.logical_or(previously_finished,
-                                      math_ops.equal(next_word_ids, end_token),
-                                      name="next_beam_finished")
+  next_finished = math_ops.logical_or(
+      previously_finished,
+      math_ops.equal(next_word_ids, end_token),
+      name="next_beam_finished")
 
   # Calculate the length of the next predictions.
   # 1. Finished beams remain unchanged.
@@ -768,8 +771,12 @@ def _maybe_tensor_gather_helper(gather_indices, gather_from, batch_size,
     return gather_from
 
 
-def _tensor_gather_helper(gather_indices, gather_from, batch_size,
-                          range_size, gather_shape, name=None):
+def _tensor_gather_helper(gather_indices,
+                          gather_from,
+                          batch_size,
+                          range_size,
+                          gather_shape,
+                          name=None):
   """Helper for gathering the right indices from the tensor.
 
   This works by reshaping gather_from to gather_shape (e.g. [-1]) and then
@@ -800,9 +807,9 @@ def _tensor_gather_helper(gather_indices, gather_from, batch_size,
         array_ops.reshape(gather_from, gather_shape), gather_indices)
     final_shape = array_ops.shape(gather_from)[:1 + len(gather_shape)]
     static_batch_size = tensor_util.constant_value(batch_size)
-    final_static_shape = (tensor_shape.TensorShape([static_batch_size])
-                          .concatenate(
-                              gather_from.shape[1:1 + len(gather_shape)]))
+    final_static_shape = (
+        tensor_shape.TensorShape([static_batch_size]).concatenate(
+            gather_from.shape[1:1 + len(gather_shape)]))
     output = array_ops.reshape(output, final_shape, name="output")
     output.set_shape(final_static_shape)
     return output
diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py
index b55d90cbabcc0bb63aaff86ba74c9fa2c6c917cf..3245cc5e72154289ea3ba000b9a30586a7ad03a9 100644
--- a/tensorflow/contrib/seq2seq/python/ops/helper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/helper.py
@@ -184,6 +184,7 @@ class TrainingHelper(Helper):
     """
     with ops.name_scope(name, "TrainingHelper", [inputs, sequence_length]):
       inputs = ops.convert_to_tensor(inputs, name="inputs")
+      self._inputs = inputs
       if not time_major:
         inputs = nest.map_structure(_transpose_batch_time, inputs)
 
@@ -200,6 +201,14 @@ class TrainingHelper(Helper):
 
       self._batch_size = array_ops.size(sequence_length)
 
+  @property
+  def inputs(self):
+    return self._inputs
+
+  @property
+  def sequence_length(self):
+    return self._sequence_length
+
   @property
   def batch_size(self):
     return self._batch_size
@@ -540,8 +549,7 @@ class GreedyEmbeddingHelper(Helper):
     if not isinstance(outputs, ops.Tensor):
       raise TypeError("Expected outputs to be a single Tensor, got: %s" %
                       type(outputs))
-    sample_ids = math_ops.cast(
-        math_ops.argmax(outputs, axis=-1), dtypes.int32)
+    sample_ids = math_ops.argmax(outputs, axis=-1, output_type=dtypes.int32)
     return sample_ids
 
   def next_inputs(self, time, outputs, state, sample_ids, name=None):
diff --git a/tensorflow/contrib/session_bundle/bundle_shim.cc b/tensorflow/contrib/session_bundle/bundle_shim.cc
index a367ea059c9a2017e94c1541e42d6296665cc466..4fc36d85edf4ac2d48769d209f0b78d6d29d9a62 100644
--- a/tensorflow/contrib/session_bundle/bundle_shim.cc
+++ b/tensorflow/contrib/session_bundle/bundle_shim.cc
@@ -371,9 +371,15 @@ Status LoadSessionBundleOrSavedModelBundle(
     return LoadSavedModelFromLegacySessionBundlePath(
         session_options, run_options, export_dir, saved_model_bundle);
   }
-  return Status(error::Code::NOT_FOUND,
-                "Session bundle or SavedModel bundle not found at specified "
-                "export location");
+  return Status(
+      error::Code::NOT_FOUND,
+      strings::StrCat(
+          "Specified file path does not appear to contain a:\n"
+          "- Session bundle (should have a file called `export.meta`)\n"
+          "- or, SavedModel bundle (should have a file called "
+          "`saved_model.pb`)\n"
+          "Specified file path: ",
+          export_dir));
 }
 
 }  // namespace serving
diff --git a/tensorflow/contrib/session_bundle/bundle_shim.h b/tensorflow/contrib/session_bundle/bundle_shim.h
index e24efa0de14824044591b954b8465ebeebc10dd5..4628b6ab1b1164addef6aaf930a0dbe7091cd16d 100644
--- a/tensorflow/contrib/session_bundle/bundle_shim.h
+++ b/tensorflow/contrib/session_bundle/bundle_shim.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Shim for systems that need to load both SessionBundle and
 // SavedModelBundle interchangeably during migration to SavedModel.
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_BUNDLE_SHIM_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_BUNDLE_SHIM_H_
+#ifndef TENSORFLOW_CONTRIB_SESSION_BUNDLE_BUNDLE_SHIM_H_
+#define TENSORFLOW_CONTRIB_SESSION_BUNDLE_BUNDLE_SHIM_H_
 
 #include <memory>
 
@@ -67,4 +67,4 @@ Status LoadSessionBundleOrSavedModelBundle(
 
 }  // namespace serving
 }  // namespace tensorflow
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_BUNDLE_SHIM_H_
+#endif  // TENSORFLOW_CONTRIB_SESSION_BUNDLE_BUNDLE_SHIM_H_
diff --git a/tensorflow/contrib/session_bundle/bundle_shim.py b/tensorflow/contrib/session_bundle/bundle_shim.py
index 062c9cc68046c59ffd04190dad0fa69f5f9dfa0a..1db97020a2a81f4d034543e722a6cb7ba823f44a 100644
--- a/tensorflow/contrib/session_bundle/bundle_shim.py
+++ b/tensorflow/contrib/session_bundle/bundle_shim.py
@@ -82,7 +82,8 @@ def _convert_default_signature_to_signature_def(signatures):
   """
   default_signature = signatures.default_signature
   signature_def = meta_graph_pb2.SignatureDef()
-  if default_signature.WhichOneof("type") == "regression_signature":
+  if (default_signature.WhichOneof("type") ==
+      legacy_constants.REGRESSION_SIGNATURE):
     regression_signature = default_signature.regression_signature
     signature_def.method_name = signature_constants.REGRESS_METHOD_NAME
     _add_input_to_signature_def(regression_signature.input.tensor_name,
@@ -91,7 +92,8 @@ def _convert_default_signature_to_signature_def(signatures):
     _add_output_to_signature_def(regression_signature.output.tensor_name,
                                  signature_constants.REGRESS_OUTPUTS,
                                  signature_def)
-  elif default_signature.WhichOneof("type") == "classification_signature":
+  elif (default_signature.WhichOneof("type") ==
+        legacy_constants.CLASSIFICATION_SIGNATURE):
     classification_signature = default_signature.classification_signature
     signature_def.method_name = signature_constants.CLASSIFY_METHOD_NAME
     _add_input_to_signature_def(classification_signature.input.tensor_name,
@@ -132,8 +134,9 @@ def _convert_named_signatures_to_signature_def(signatures):
       signature_constants.PREDICT_OUTPUTS]
   # TODO(pdudnik): what if there are other signatures? Mimic cr/140900781 once
   # it is submitted.
-  if (input_signature.WhichOneof("type") != "generic_signature" or
-      output_signature.WhichOneof("type") != "generic_signature"):
+  if (input_signature.WhichOneof("type") != legacy_constants.GENERIC_SIGNATURE
+      or output_signature.WhichOneof("type") !=
+      legacy_constants.GENERIC_SIGNATURE):
     raise RuntimeError("Named input and output signatures can only be "
                        "up-converted if they are generic signature. "
                        "Input signature type is %s, output signature type is "
diff --git a/tensorflow/contrib/session_bundle/bundle_shim_test.cc b/tensorflow/contrib/session_bundle/bundle_shim_test.cc
index 72f32a0f5554e4dd3e7cbf498a57ee6bfba57211..9a1dd9303f43591888dc49984d81c4a0c6af9846 100644
--- a/tensorflow/contrib/session_bundle/bundle_shim_test.cc
+++ b/tensorflow/contrib/session_bundle/bundle_shim_test.cc
@@ -493,17 +493,15 @@ TEST(BundleShimTest, DefaultAndNamedSignatureWithPredict) {
   ASSERT_FALSE(
       actual_signature_def_predict->second.inputs().find("foo-input") ==
       actual_signature_def_predict->second.inputs().end());
-  EXPECT_EQ("foo-input",
-            actual_signature_def_predict->second.inputs()
-                .find("foo-input")
-                ->second.name());
+  EXPECT_EQ("foo-input", actual_signature_def_predict->second.inputs()
+                             .find("foo-input")
+                             ->second.name());
   ASSERT_FALSE(
       actual_signature_def_predict->second.outputs().find("foo-output") ==
       actual_signature_def_predict->second.outputs().end());
-  EXPECT_EQ("foo-output",
-            actual_signature_def_predict->second.outputs()
-                .find("foo-output")
-                ->second.name());
+  EXPECT_EQ("foo-output", actual_signature_def_predict->second.outputs()
+                              .find("foo-output")
+                              ->second.name());
   EXPECT_EQ(kPredictMethodName,
             actual_signature_def_predict->second.method_name());
 }
diff --git a/tensorflow/contrib/session_bundle/constants.py b/tensorflow/contrib/session_bundle/constants.py
index 6ced73241afdda047b8feacb26fedd72363b6240..e833baee791f97df5829ee289bcaf17c31a17deb 100644
--- a/tensorflow/contrib/session_bundle/constants.py
+++ b/tensorflow/contrib/session_bundle/constants.py
@@ -32,3 +32,6 @@ INIT_OP_KEY = "serving_init_op"
 SIGNATURES_KEY = "serving_signatures"
 ASSETS_KEY = "serving_assets"
 GRAPH_KEY = "serving_graph"
+REGRESSION_SIGNATURE = "regression_signature"
+CLASSIFICATION_SIGNATURE = "classification_signature"
+GENERIC_SIGNATURE = "generic_signature"
diff --git a/tensorflow/contrib/session_bundle/exporter.py b/tensorflow/contrib/session_bundle/exporter.py
index f6f663aae766b783b85139f57a93e10f553e6bd1..08983337fccc138d40eb959cecc5bf9e47cf6cac 100644
--- a/tensorflow/contrib/session_bundle/exporter.py
+++ b/tensorflow/contrib/session_bundle/exporter.py
@@ -281,11 +281,12 @@ class Exporter(object):
     tmp_export_dir = compat.as_text(export_dir) + "-tmp"
     gfile.MakeDirs(tmp_export_dir)
 
-    self._saver.save(sess,
-                     os.path.join(
-                         compat.as_text(tmp_export_dir),
-                         compat.as_text(constants.EXPORT_BASE_NAME)),
-                     meta_graph_suffix=constants.EXPORT_SUFFIX_NAME)
+    self._saver.save(
+        sess,
+        os.path.join(
+            compat.as_text(tmp_export_dir),
+            compat.as_text(constants.EXPORT_BASE_NAME)),
+        meta_graph_suffix=constants.EXPORT_SUFFIX_NAME)
 
     # Run the asset callback.
     if self._assets_callback and self._assets_to_copy:
@@ -301,12 +302,12 @@ class Exporter(object):
     if exports_to_keep:
       # create a simple parser that pulls the export_version from the directory.
       def parser(path):
-        if os.name == 'nt':
-          match = re.match("^" + export_dir_base.replace('\\','/') + "/(\\d{8})$",
-                           path.path.replace('\\','/'))
+        if os.name == "nt":
+          match = re.match(
+              "^" + export_dir_base.replace("\\", "/") + "/(\\d{8})$",
+              path.path.replace("\\", "/"))
         else:
-          match = re.match("^" + export_dir_base + "/(\\d{8})$",
-                           path.path)
+          match = re.match("^" + export_dir_base + "/(\\d{8})$", path.path)
         if not match:
           return None
         return path._replace(export_version=int(match.group(1)))
diff --git a/tensorflow/contrib/session_bundle/gc.py b/tensorflow/contrib/session_bundle/gc.py
index 249c23c88f3043403e322b73b6c9df97e932a92a..514cc0f652c8d174bdb9bff2b2cf1ea38fdd7b1f 100644
--- a/tensorflow/contrib/session_bundle/gc.py
+++ b/tensorflow/contrib/session_bundle/gc.py
@@ -70,7 +70,6 @@ import heapq
 import math
 import os
 
-from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.python.platform import gfile
 from tensorflow.python.util.deprecation import deprecated
 
diff --git a/tensorflow/contrib/session_bundle/session_bundle.h b/tensorflow/contrib/session_bundle/session_bundle.h
index 2ff258411d1928cea7da4f637ffe94f144b2b60a..b2be46efa6d1e7ceb1fb66a7148735b86cc68dd3 100644
--- a/tensorflow/contrib/session_bundle/session_bundle.h
+++ b/tensorflow/contrib/session_bundle/session_bundle.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Low-level functionality for setting up a inference Session.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_SESSION_BUNDLE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_SESSION_BUNDLE_H_
+#ifndef TENSORFLOW_CONTRIB_SESSION_BUNDLE_SESSION_BUNDLE_H_
+#define TENSORFLOW_CONTRIB_SESSION_BUNDLE_SESSION_BUNDLE_H_
 
 #include <memory>
 
@@ -82,4 +82,4 @@ bool IsPossibleExportDirectory(const StringPiece export_dir);
 }  // namespace serving
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_SESSION_BUNDLE_H_
+#endif  // TENSORFLOW_CONTRIB_SESSION_BUNDLE_SESSION_BUNDLE_H_
diff --git a/tensorflow/contrib/session_bundle/signature.cc b/tensorflow/contrib/session_bundle/signature.cc
index 7133875ad53e77625bbe799f4f886c074a08f1bd..ed70a5b91b231067e8e69951ef7010406e6b22cf 100644
--- a/tensorflow/contrib/session_bundle/signature.cc
+++ b/tensorflow/contrib/session_bundle/signature.cc
@@ -38,9 +38,9 @@ namespace {
 Status BatchSizesMatch(const Tensor& input, const Tensor& output) {
   // Ensure the number of outputs match the number of inputs.
   if (input.dim_size(0) != output.dim_size(0)) {
-    return errors::Internal(
-        strings::StrCat("Input batch size did not match output batch size: ",
-                        input.dim_size(0), " vs. ", output.dim_size(0)));
+    return errors::Internal(strings::StrCat(
+        "Input batch size did not match output batch size: ", input.dim_size(0),
+        " vs. ", output.dim_size(0)));
   }
   return Status::OK();
 }
@@ -100,8 +100,8 @@ Status GetNamedClassificationSignature(
   const auto& it = signatures.named_signatures().find(name);
   if (it == signatures.named_signatures().end()) {
     return errors::NotFound(
-        strings::StrCat("Missing signature named \"", name, "\" in: ",
-                        DebugStringIfAvailable(signatures)));
+        strings::StrCat("Missing signature named \"", name,
+                        "\" in: ", DebugStringIfAvailable(signatures)));
   }
   if (!it->second.has_classification_signature()) {
     return errors::FailedPrecondition(
@@ -232,8 +232,8 @@ Status GetNamedSignature(const string& name,
   const auto& it = signatures.named_signatures().find(name);
   if (it == signatures.named_signatures().end()) {
     return errors::NotFound(
-        strings::StrCat("Missing signature named \"", name, "\" in: ",
-                        DebugStringIfAvailable(signatures)));
+        strings::StrCat("Missing signature named \"", name,
+                        "\" in: ", DebugStringIfAvailable(signatures)));
   }
   *signature = it->second;
   return Status::OK();
diff --git a/tensorflow/contrib/session_bundle/signature.h b/tensorflow/contrib/session_bundle/signature.h
index 0049bea00822db85c606b9e6d00ae4db83804bab..4ef1277cec413a6fcfb54721520279d024f18bc1 100644
--- a/tensorflow/contrib/session_bundle/signature.h
+++ b/tensorflow/contrib/session_bundle/signature.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Helpers for working with TensorFlow exports and their signatures.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_SIGNATURE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_SIGNATURE_H_
+#ifndef TENSORFLOW_CONTRIB_SESSION_BUNDLE_SIGNATURE_H_
+#define TENSORFLOW_CONTRIB_SESSION_BUNDLE_SIGNATURE_H_
 
 #include <string>
 #include <utility>
@@ -121,4 +121,4 @@ Status BindGenericNames(const GenericSignature& signature,
 }  // namespace serving
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_SIGNATURE_H_
+#endif  // TENSORFLOW_CONTRIB_SESSION_BUNDLE_SIGNATURE_H_
diff --git a/tensorflow/contrib/session_bundle/test_util.h b/tensorflow/contrib/session_bundle/test_util.h
index dd0fc8d1c0c47c444ac6fe807435fa671f3939f0..f0d41ce5a4b901db80a7a01475dc0917e966dc89 100644
--- a/tensorflow/contrib/session_bundle/test_util.h
+++ b/tensorflow/contrib/session_bundle/test_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_TEST_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_TEST_UTIL_H_
+#ifndef TENSORFLOW_CONTRIB_SESSION_BUNDLE_TEST_UTIL_H_
+#define TENSORFLOW_CONTRIB_SESSION_BUNDLE_TEST_UTIL_H_
 
 #include <string>
 
@@ -35,4 +35,4 @@ string TestSrcDirPath(const string& relative_path);
 }  // namespace serving
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_SESSION_BUNDLE_TEST_UTIL_H_
+#endif  // TENSORFLOW_CONTRIB_SESSION_BUNDLE_TEST_UTIL_H_
diff --git a/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
index b861476b67fc360f383465145ccd1cc620de5a99..35c4b5bec172858b39dd4628a37e164efe87bdbf 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
+++ b/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
@@ -158,6 +158,9 @@ class LinearToMelTest(test.TestCase):
     with self.assertRaises(ValueError):
       mel_ops.linear_to_mel_weight_matrix(lower_edge_hertz=100,
                                           upper_edge_hertz=10)
+    with self.assertRaises(ValueError):
+      mel_ops.linear_to_mel_weight_matrix(upper_edge_hertz=1000,
+                                          sample_rate=800)
     with self.assertRaises(ValueError):
       mel_ops.linear_to_mel_weight_matrix(dtype=dtypes.int32)
 
diff --git a/tensorflow/contrib/signal/python/kernel_tests/mfcc_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/mfcc_ops_test.py
index c04f1cf5bad358a14a1827df05a129339502c86f..e7743bdcba180929007d17bdf3b143c64643aacc 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/mfcc_ops_test.py
+++ b/tensorflow/contrib/signal/python/kernel_tests/mfcc_ops_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.contrib.signal.python.ops import mfcc_ops
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import spectral_ops_test_util
@@ -49,6 +50,14 @@ class MFCCTest(test.TestCase):
         signal = random_ops.random_normal((2, 3, 5))
         mfcc_ops.mfccs_from_log_mel_spectrograms(signal).eval()
 
+  def test_unknown_shape(self):
+    """A test that the op runs when shape and rank are unknown."""
+    with spectral_ops_test_util.fft_kernel_label_map():
+      with self.test_session(use_gpu=True):
+        signal = array_ops.placeholder_with_default(
+            random_ops.random_normal((2, 3, 5)), tensor_shape.TensorShape(None))
+        self.assertIsNone(signal.shape.ndims)
+        mfcc_ops.mfccs_from_log_mel_spectrograms(signal).eval()
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/signal/python/ops/mel_ops.py b/tensorflow/contrib/signal/python/ops/mel_ops.py
index 2ad07027aa73f1ae7f44fd684a18fc67400a4f90..d1a36548d95cf44d2bf7e6108141aeb00853db04 100644
--- a/tensorflow/contrib/signal/python/ops/mel_ops.py
+++ b/tensorflow/contrib/signal/python/ops/mel_ops.py
@@ -80,6 +80,10 @@ def _validate_arguments(num_mel_bins, num_spectrogram_bins, sample_rate,
   if lower_edge_hertz >= upper_edge_hertz:
     raise ValueError('lower_edge_hertz %.1f >= upper_edge_hertz %.1f' %
                      (lower_edge_hertz, upper_edge_hertz))
+  if upper_edge_hertz > sample_rate / 2:
+    raise ValueError('upper_edge_hertz must not be larger than the Nyquist '
+                     'frequency (sample_rate / 2). Got: %s for sample_rate: %s'
+                     % (upper_edge_hertz, sample_rate))
   if not dtype.is_floating:
     raise ValueError('dtype must be a floating point type. Got: %s' % dtype)
 
@@ -138,8 +142,8 @@ def linear_to_mel_weight_matrix(num_mel_bins=20,
 
   Raises:
     ValueError: If num_mel_bins/num_spectrogram_bins/sample_rate are not
-      positive, lower_edge_hertz is negative, or frequency edges are incorrectly
-      ordered.
+      positive, lower_edge_hertz is negative, frequency edges are incorrectly
+      ordered, or upper_edge_hertz is larger than the Nyquist frequency.
 
   [mel]: https://en.wikipedia.org/wiki/Mel_scale
   """
diff --git a/tensorflow/contrib/signal/python/ops/mfcc_ops.py b/tensorflow/contrib/signal/python/ops/mfcc_ops.py
index 7bc7b57cd4f1033a8bda0845ccd8e777e0213d6b..4e842f7f10ae07448cc07e5f636ae80a820e656f 100644
--- a/tensorflow/contrib/signal/python/ops/mfcc_ops.py
+++ b/tensorflow/contrib/signal/python/ops/mfcc_ops.py
@@ -50,7 +50,7 @@ def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None):
   # A 1024-point STFT with frames of 64 ms and 75% overlap.
   stfts = tf.contrib.signal.stft(pcm, frame_length=1024, frame_step=256,
                                  fft_length=1024)
-  spectrograms = tf.abs(stft)
+  spectrograms = tf.abs(stfts)
 
   # Warp the linear scale spectrograms into the mel-scale.
   num_spectrogram_bins = stfts.shape[-1].value
@@ -105,4 +105,4 @@ def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None):
       num_mel_bins = array_ops.shape(log_mel_spectrograms)[-1]
 
     dct2 = spectral_ops.dct(log_mel_spectrograms)
-    return dct2 * math_ops.rsqrt(num_mel_bins * 2.0)
+    return dct2 * math_ops.rsqrt(math_ops.to_float(num_mel_bins) * 2.0)
diff --git a/tensorflow/contrib/signal/python/ops/spectral_ops.py b/tensorflow/contrib/signal/python/ops/spectral_ops.py
index bca2e01d7bbefb18fd69a0eba27e3afb8f636724..a8b5deff6ca3a4a756d31b904e577f08f6155fd7 100644
--- a/tensorflow/contrib/signal/python/ops/spectral_ops.py
+++ b/tensorflow/contrib/signal/python/ops/spectral_ops.py
@@ -144,7 +144,7 @@ def inverse_stft_window_fn(frame_step,
       overlaps = -(-frame_length // frame_step)  # Ceiling division.
       denom = array_ops.pad(denom, [(0, overlaps * frame_step - frame_length)])
       denom = array_ops.reshape(denom, [overlaps, frame_step])
-      denom = math_ops.reduce_sum(denom, 0, keep_dims=True)
+      denom = math_ops.reduce_sum(denom, 0, keepdims=True)
       denom = array_ops.tile(denom, [overlaps, 1])
       denom = array_ops.reshape(denom, [overlaps * frame_step])
 
diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md
index dc92ae0c859394f44ba83d814adbef7d324a9ada..2d9df8f27ee98431f51fd39c168325b8f625dce9 100644
--- a/tensorflow/contrib/slim/README.md
+++ b/tensorflow/contrib/slim/README.md
@@ -145,7 +145,7 @@ regular_variables_and_model_variables = slim.get_variables()
 
 How does this work? When you create a model variable via TF-Slim's layers or
 directly via the `slim.model_variable` function, TF-Slim adds the variable to
-a the `tf.GraphKeys.MODEL_VARIABLES` collection. What if you have your own
+the `tf.GraphKeys.MODEL_VARIABLES` collection. What if you have your own
 custom layers or variable creation routine but still want TF-Slim to manage or
 be aware of your model variables? TF-Slim provides a convenience function for
 adding the model variable to its collection:
@@ -676,7 +676,7 @@ file were implicitly obtained from each provided variable's `var.op.name`.
 
 This works well when the variable names in the checkpoint file match those in
 the graph. However, sometimes, we want to restore a model from a checkpoint
-whose variables have different names those in the current graph. In this case,
+whose variables have different names to those in the current graph. In this case,
 we must provide the `Saver` a dictionary that maps from each checkpoint variable
 name to each graph variable. Consider the following example where the checkpoint
 variables names are obtained via a simple function:
diff --git a/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py b/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py
index 82c6b5a619662ba5cbaba1b3a238045a8d9a2cd2..c42c7b3391db40fd0aad89c45f449487f484f371 100644
--- a/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py
+++ b/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py
@@ -62,7 +62,9 @@ class DatasetDataProvider(data_provider.DataProvider):
                seed=None,
                scope=None):
     """Creates a DatasetDataProvider.
-
+    Note: if `num_epochs` is not `None`,  local counter `epochs` will be created
+    by relevant function. Use `local_variables_initializer()` to initialize
+    local variables.
     Args:
       dataset: An instance of the Dataset class.
       num_readers: The number of parallel readers to use.
@@ -96,12 +98,12 @@ class DatasetDataProvider(data_provider.DataProvider):
     items = dataset.decoder.list_items()
     tensors = dataset.decoder.decode(data, items)
 
-    if record_key in items:
+    items_to_tensors = dict(zip(items, tensors))
+    if record_key in items_to_tensors:
       raise ValueError('The item name used for `record_key` cannot also be '
                        'used for a dataset item: %s', record_key)
-    items.append(record_key)
-    tensors.append(key)
+    items_to_tensors[record_key] = key
 
     super(DatasetDataProvider, self).__init__(
-        items_to_tensors=dict(zip(items, tensors)),
+        items_to_tensors=items_to_tensors,
         num_samples=dataset.num_samples)
diff --git a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
index 0544404e9e252cca6d3650b805b91be25d705eea..b3b61e1dfe5671a7fbbee20b0c577ee5fad0fb9b 100644
--- a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
+++ b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
@@ -349,7 +349,8 @@ class Image(ItemHandler):
                shape=None,
                channels=3,
                dtype=dtypes.uint8,
-               repeated=False):
+               repeated=False,
+               dct_method=''):
     """Initializes the image.
 
     Args:
@@ -368,6 +369,11 @@ class Image(ItemHandler):
               tf.decode_raw,
       repeated: if False, decodes a single image. If True, decodes a
         variable number of image strings from a 1D tensor of strings.
+      dct_method: An optional string. Defaults to empty string. It only takes
+        effect when image format is jpeg, used to specify a hint about the
+        algorithm used for jpeg decompression. Currently valid values
+        are ['INTEGER_FAST', 'INTEGER_ACCURATE']. The hint may be ignored, for
+        example, the jpeg library does not have that specific option.
     """
     if not image_key:
       image_key = 'image/encoded'
@@ -381,6 +387,7 @@ class Image(ItemHandler):
     self._channels = channels
     self._dtype = dtype
     self._repeated = repeated
+    self._dct_method = dct_method
 
   def tensors_to_item(self, keys_to_tensors):
     """See base class."""
@@ -406,9 +413,25 @@ class Image(ItemHandler):
       A tensor that represents decoded image of self._shape, or
       (?, ?, self._channels) if self._shape is not specified.
     """
+
     def decode_image():
-      """Decodes a png or jpg based on the headers."""
-      return image_ops.decode_image(image_buffer, self._channels)
+      """Decodes a image based on the headers."""
+      return image_ops.decode_image(image_buffer, channels=self._channels)
+
+    def decode_jpeg():
+      """Decodes a jpeg image with specified '_dct_method'."""
+      return image_ops.decode_jpeg(
+          image_buffer, channels=self._channels, dct_method=self._dct_method)
+
+    def check_jpeg():
+      """Checks if an image is jpeg."""
+      # For jpeg, we directly use image_ops.decode_jpeg rather than decode_image
+      # in order to feed the jpeg specify parameter 'dct_method'.
+      return control_flow_ops.cond(
+          image_ops.is_jpeg(image_buffer),
+          decode_jpeg,
+          decode_image,
+          name='cond_jpeg')
 
     def decode_raw():
       """Decodes a raw image."""
@@ -420,7 +443,7 @@ class Image(ItemHandler):
             math_ops.equal(image_format, 'RAW')): decode_raw,
     }
     image = control_flow_ops.case(
-        pred_fn_pairs, default=decode_image, exclusive=True)
+        pred_fn_pairs, default=check_jpeg, exclusive=True)
 
     image.set_shape([None, None, self._channels])
     if self._shape is not None:
diff --git a/tensorflow/contrib/slim/python/slim/evaluation_test.py b/tensorflow/contrib/slim/python/slim/evaluation_test.py
index 870f504d10362ed5226951adefc3ba9a934900c1..7ab6805fac631d6f6b475c4c91f7e3873e7ffea5 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation_test.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation_test.py
@@ -29,7 +29,6 @@ from tensorflow.contrib.framework.python.ops import variables as variables_lib
 from tensorflow.contrib.metrics.python.ops import metric_ops
 from tensorflow.contrib.slim.python.slim import evaluation
 from tensorflow.contrib.training.python.training import evaluation as evaluation_lib
-from tensorflow.core.protobuf import saver_pb2
 from tensorflow.python.debug.lib import debug_data
 from tensorflow.python.debug.wrappers import hooks
 from tensorflow.python.framework import constant_op
@@ -42,7 +41,7 @@ from tensorflow.python.platform import flags
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary import summary_iterator
-from tensorflow.python.training import input
+from tensorflow.python.training import input  # pylint: disable=redefined-builtin
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import session_run_hook
 
@@ -236,7 +235,7 @@ class SingleEvaluationTest(test.TestCase):
   def _prepareCheckpoint(self, checkpoint_path):
     init_op = control_flow_ops.group(variables.global_variables_initializer(),
                                      variables.local_variables_initializer())
-    saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V1)
+    saver = saver_lib.Saver()
     with self.test_session() as sess:
       sess.run(init_op)
       saver.save(sess, checkpoint_path)
diff --git a/tensorflow/contrib/slim/python/slim/learning.py b/tensorflow/contrib/slim/python/slim/learning.py
index def00b76184ba4e1fc630cd83d8e055448100562..6a200de1ea172b4ccb38c0f5d889566ccaeef893 100644
--- a/tensorflow/contrib/slim/python/slim/learning.py
+++ b/tensorflow/contrib/slim/python/slim/learning.py
@@ -738,6 +738,7 @@ def train(train_op,
   if summary_writer is not None:
     train_step_kwargs['summary_writer'] = sv.summary_writer
 
+  total_loss = None
   should_retry = True
   while should_retry:
     try:
@@ -753,9 +754,10 @@ def train(train_op,
           if logdir:
             sv.start_standard_services(sess)
         elif startup_delay_steps > 0:
+           # (use sys.maxsize because sys.maxint doesn't exist in Python 3)
           _wait_for_step(sess, global_step,
                          min(startup_delay_steps, number_of_steps or
-                             sys.maxint))
+                             sys.maxsize))
         threads = sv.start_queue_runners(sess)
         logging.info('Starting Queues.')
         if is_chief and sync_optimizer is not None:
@@ -769,10 +771,10 @@ def train(train_op,
               logging.info('Stopping Training.')
               sv.request_stop()
               break
-        except errors.OutOfRangeError:
+        except errors.OutOfRangeError as e:
           # OutOfRangeError is thrown when epoch limit per
           # tf.train.limit_epochs is reached.
-          logging.info('Caught OutOfRangeError. Stopping Training.')
+          logging.info('Caught OutOfRangeError. Stopping Training. %s', e)
         if logdir and sv.is_chief:
           logging.info('Finished training! Saving model to disk.')
           sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
diff --git a/tensorflow/contrib/slim/python/slim/learning_test.py b/tensorflow/contrib/slim/python/slim/learning_test.py
index 4e816f9b11be2986d042f336bdc320ff47d8cc49..831c6e427ae78932bec09cea935f05a87723f1a3 100644
--- a/tensorflow/contrib/slim/python/slim/learning_test.py
+++ b/tensorflow/contrib/slim/python/slim/learning_test.py
@@ -197,9 +197,7 @@ class MultiplyGradientsTest(test.TestCase):
     gradient = constant_op.constant(self._grad_vec, dtype=dtypes.float32)
     variable = variables_lib.Variable(array_ops.zeros_like(gradient))
     multiplier_flag = variables_lib.Variable(True)
-    tensor_multiplier = array_ops.where(multiplier_flag,
-                                        self._multiplier,
-                                        1.0)
+    tensor_multiplier = array_ops.where(multiplier_flag, self._multiplier, 1.0)
     grad_to_var = (gradient, variable)
     gradient_multipliers = {variable: tensor_multiplier}
 
@@ -212,11 +210,8 @@ class MultiplyGradientsTest(test.TestCase):
       sess.run(multiplier_flag.assign(False))
       gradient_false_flag = sess.run(grad_to_var[0])
     np_testing.assert_almost_equal(gradient_true_flag,
-                                   self._multiplied_grad_vec,
-                                   5)
-    np_testing.assert_almost_equal(gradient_false_flag,
-                                   self._grad_vec,
-                                   5)
+                                   self._multiplied_grad_vec, 5)
+    np_testing.assert_almost_equal(gradient_false_flag, self._grad_vec, 5)
 
 
 def LogisticClassifier(inputs):
@@ -502,6 +497,7 @@ class TrainTest(test.TestCase):
     purpose.
     """
     dump_root = tempfile.mkdtemp()
+
     def dumping_wrapper(sess):  # pylint: disable=invalid-name
       return dumping_wrapper_lib.DumpingDebugWrapperSession(sess, dump_root)
 
@@ -519,16 +515,13 @@ class TrainTest(test.TestCase):
       train_op = learning.create_train_op(total_loss, optimizer)
 
       loss = learning.train(
-          train_op,
-          None,
-          number_of_steps=1,
-          session_wrapper=dumping_wrapper)
+          train_op, None, number_of_steps=1, session_wrapper=dumping_wrapper)
     self.assertIsNotNone(loss)
 
     run_root = glob.glob(os.path.join(dump_root, 'run_*'))[-1]
     dump = debug_data.DebugDumpDir(run_root)
-    self.assertAllEqual(
-        0, dump.get_tensors('global_step', 0, 'DebugIdentity')[0])
+    self.assertAllEqual(0,
+                        dump.get_tensors('global_step', 0, 'DebugIdentity')[0])
 
   def testTrainWithTrace(self):
     logdir = os.path.join(
@@ -961,8 +954,8 @@ class TrainTest(test.TestCase):
     self.assertGreater(losses[0], losses[1])
 
   def testTrainWithEpochLimit(self):
-    logdir = os.path.join(tempfile.mkdtemp(prefix=self.get_temp_dir()),
-                          'tmp_logs')
+    logdir = os.path.join(
+        tempfile.mkdtemp(prefix=self.get_temp_dir()), 'tmp_logs')
     with ops.Graph().as_default():
       random_seed.set_random_seed(0)
       tf_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
@@ -982,7 +975,8 @@ class TrainTest(test.TestCase):
     self.assertIsNotNone(loss)
     self.assertLess(loss, .015)
     self.assertTrue(os.path.isfile('{}/model.ckpt-300.index'.format(logdir)))
-    self.assertTrue(os.path.isfile('{}/model.ckpt-300.data-00000-of-00001'.format(logdir)))
+    self.assertTrue(
+        os.path.isfile('{}/model.ckpt-300.data-00000-of-00001'.format(logdir)))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/slim/python/slim/nets/inception_v3.py b/tensorflow/contrib/slim/python/slim/nets/inception_v3.py
index e3c0c036d90c95a5f371bef2ca9f960926d82166..afe261e43a9f144992318086f958e21d50286d11 100644
--- a/tensorflow/contrib/slim/python/slim/nets/inception_v3.py
+++ b/tensorflow/contrib/slim/python/slim/nets/inception_v3.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.contrib import layers
 from tensorflow.contrib.framework.python.ops import arg_scope
+from tensorflow.contrib.layers.python.layers import initializers
 from tensorflow.contrib.layers.python.layers import layers as layers_lib
 from tensorflow.contrib.layers.python.layers import regularizers
 from tensorflow.python.framework import ops
@@ -547,7 +548,10 @@ def inception_v3(inputs,
       parameters or computation cost of the model.
     prediction_fn: a function to get predictions out of logits.
     spatial_squeeze: if True, logits is of shape is [B, C], if false logits is
-        of shape [B, 1, 1, C], where B is batch_size and C is number of classes.
+      of shape [B, 1, 1, C], where B is batch_size and C is number of classes.
+      To use this parameter, the input images must be smaller
+      than 300x300 pixels, in which case the output logit layer
+      does not contain spatial information and can be removed.
     reuse: whether or not the network and its variables should be reused. To be
       able to reuse 'scope' must be given.
     scope: Optional variable_scope.
@@ -675,16 +679,20 @@ def _reduced_kernel_size_for_small_input(input_tensor, kernel_size):
 
 
 def inception_v3_arg_scope(weight_decay=0.00004,
-                           stddev=0.1,
                            batch_norm_var_collection='moving_vars',
+                           batch_norm_decay=0.9997,
+                           batch_norm_epsilon=0.001,
+                           updates_collections=ops.GraphKeys.UPDATE_OPS,
                            use_fused_batchnorm=True):
   """Defines the default InceptionV3 arg scope.
 
   Args:
     weight_decay: The weight decay to use for regularizing the model.
-    stddev: The standard deviation of the trunctated normal weight initializer.
     batch_norm_var_collection: The name of the collection for the batch norm
       variables.
+    batch_norm_decay: Decay for batch norm moving average
+    batch_norm_epsilon: Small float added to variance to avoid division by zero
+    updates_collections: Collections for the update ops of the layer
     use_fused_batchnorm: Enable fused batchnorm.
 
   Returns:
@@ -692,11 +700,11 @@ def inception_v3_arg_scope(weight_decay=0.00004,
   """
   batch_norm_params = {
       # Decay for the moving averages.
-      'decay': 0.9997,
+      'decay': batch_norm_decay,
       # epsilon to prevent 0s in variance.
-      'epsilon': 0.001,
+      'epsilon': batch_norm_epsilon,
       # collection containing update_ops.
-      'updates_collections': ops.GraphKeys.UPDATE_OPS,
+      'updates_collections': updates_collections,
       # Use fused batch norm if possible.
       'fused': use_fused_batchnorm,
       # collection containing the moving mean and moving variance.
@@ -714,8 +722,7 @@ def inception_v3_arg_scope(weight_decay=0.00004,
       weights_regularizer=regularizers.l2_regularizer(weight_decay)):
     with arg_scope(
         [layers.conv2d],
-        weights_initializer=init_ops.truncated_normal_initializer(
-            stddev=stddev),
+        weights_initializer=initializers.variance_scaling_initializer(),
         activation_fn=nn_ops.relu,
         normalizer_fn=layers_lib.batch_norm,
         normalizer_params=batch_norm_params) as sc:
diff --git a/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py b/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py
index 930df2414bc907703c2670ffd92134727a28e856..a1282847bef981717d7fdb1474adbbaaae4621c0 100644
--- a/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py
+++ b/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py
@@ -45,32 +45,67 @@ def _get_linear_equations_tests(dtype_, use_static_shape_, shape_):
         low=-1.0, high=1.0, size=np.prod(shape_)).reshape(shape_).astype(dtype_)
     # Make a selfadjoint, positive definite.
     a_np = np.dot(a_np.T, a_np)
+    # jacobi preconditioner
+    jacobi_np = np.zeros_like(a_np)
+    jacobi_np[range(a_np.shape[0]), range(a_np.shape[1])] = (
+        1.0 / a_np.diagonal())
     rhs_np = np.random.uniform(
         low=-1.0, high=1.0, size=shape_[0]).astype(dtype_)
+    x_np = np.zeros_like(rhs_np)
     tol = 1e-6 if dtype_ == np.float64 else 1e-3
     max_iter = 20
     with self.test_session() as sess:
       if use_static_shape_:
         a = constant_op.constant(a_np)
         rhs = constant_op.constant(rhs_np)
+        x = constant_op.constant(x_np)
+        jacobi = constant_op.constant(jacobi_np)
       else:
         a = array_ops.placeholder(dtype_)
         rhs = array_ops.placeholder(dtype_)
+        x = array_ops.placeholder(dtype_)
+        jacobi = array_ops.placeholder(dtype_)
       operator = util.create_operator(a)
-      cg_graph = linear_equations.conjugate_gradient(
-          operator, rhs, tol=tol, max_iter=max_iter)
-      if use_static_shape_:
-        cg_val = sess.run(cg_graph)
-      else:
-        cg_val = sess.run(cg_graph, feed_dict={a: a_np, rhs: rhs_np})
-      norm_r0 = np.linalg.norm(rhs_np)
-      norm_r = np.sqrt(cg_val.gamma)
-      self.assertLessEqual(norm_r, tol * norm_r0)
-      # Validate that we get an equally small residual norm with numpy
-      # using the computed solution.
-      r_np = rhs_np - np.dot(a_np, cg_val.x)
-      norm_r_np = np.linalg.norm(r_np)
-      self.assertLessEqual(norm_r_np, tol * norm_r0)
+      preconditioners = [
+          None, util.identity_operator(a),
+          util.create_operator(jacobi)
+      ]
+      cg_results = []
+      for preconditioner in preconditioners:
+        cg_graph = linear_equations.conjugate_gradient(
+            operator,
+            rhs,
+            preconditioner=preconditioner,
+            x=x,
+            tol=tol,
+            max_iter=max_iter)
+        if use_static_shape_:
+          cg_val = sess.run(cg_graph)
+        else:
+          cg_val = sess.run(
+              cg_graph,
+              feed_dict={
+                  a: a_np,
+                  rhs: rhs_np,
+                  x: x_np,
+                  jacobi: jacobi_np
+              })
+        norm_r0 = np.linalg.norm(rhs_np)
+        norm_r = np.linalg.norm(cg_val.r)
+        self.assertLessEqual(norm_r, tol * norm_r0)
+        # Validate that we get an equally small residual norm with numpy
+        # using the computed solution.
+        r_np = rhs_np - np.dot(a_np, cg_val.x)
+        norm_r_np = np.linalg.norm(r_np)
+        self.assertLessEqual(norm_r_np, tol * norm_r0)
+        cg_results.append(cg_val)
+      # Validate that we get same results using identity_preconditioner
+      # and None
+      self.assertEqual(cg_results[0].i, cg_results[1].i)
+      self.assertAlmostEqual(cg_results[0].gamma, cg_results[1].gamma)
+      self.assertAllClose(cg_results[0].r, cg_results[1].r, rtol=tol)
+      self.assertAllClose(cg_results[0].x, cg_results[1].x, rtol=tol)
+      self.assertAllClose(cg_results[0].p, cg_results[1].p, rtol=tol)
 
   return [test_conjugate_gradient]
 
diff --git a/tensorflow/contrib/solvers/python/kernel_tests/util_test.py b/tensorflow/contrib/solvers/python/kernel_tests/util_test.py
index 1566984b27fdab4c2a8c91bd16f587747e69e9e5..5d7534657bff27f7169e6a97bf4b03d4f6a35bc9 100644
--- a/tensorflow/contrib/solvers/python/kernel_tests/util_test.py
+++ b/tensorflow/contrib/solvers/python/kernel_tests/util_test.py
@@ -63,6 +63,43 @@ class UtilTest(test.TestCase):
   def testCreateOperatorUnknownShape(self):
     self._testCreateOperator(False)
 
+  def _testIdentityOperator(self, use_static_shape_):
+    for dtype in np.float32, np.float64:
+      a_np = np.array([[1., 2.], [3., 4.], [5., 6.]], dtype=dtype)
+      x_np = np.array([[2.], [-3.]], dtype=dtype)
+      y_np = np.array([[2], [-3.], [5.]], dtype=dtype)
+      with self.test_session() as sess:
+        if use_static_shape_:
+          a = constant_op.constant(a_np, dtype=dtype)
+          x = constant_op.constant(x_np, dtype=dtype)
+          y = constant_op.constant(y_np, dtype=dtype)
+        else:
+          a = array_ops.placeholder(dtype)
+          x = array_ops.placeholder(dtype)
+          y = array_ops.placeholder(dtype)
+        id_op = util.identity_operator(a)
+        ax = id_op.apply(x)
+        aty = id_op.apply_adjoint(y)
+        op_shape = ops.convert_to_tensor(id_op.shape)
+        if use_static_shape_:
+          op_shape_val, ax_val, aty_val = sess.run([op_shape, ax, aty])
+        else:
+          op_shape_val, ax_val, aty_val = sess.run(
+              [op_shape, ax, aty], feed_dict={
+                  a: a_np,
+                  x: x_np,
+                  y: y_np
+              })
+      self.assertAllEqual(op_shape_val, [3, 2])
+      self.assertAllClose(ax_val, x_np)
+      self.assertAllClose(aty_val, y_np)
+
+  def testIdentityOperator(self):
+    self._testIdentityOperator(True)
+
+  def testIdentityOperatorUnknownShape(self):
+    self._testIdentityOperator(False)
+
   def testL2Norm(self):
     with self.test_session():
       x_np = np.array([[2], [-3.], [5.]])
diff --git a/tensorflow/contrib/solvers/python/ops/linear_equations.py b/tensorflow/contrib/solvers/python/ops/linear_equations.py
index 8cba56eba6b3046b8efbbbbf130705255e1c13bb..d791d467639b572e7831c1d1a582aa15585649b6 100644
--- a/tensorflow/contrib/solvers/python/ops/linear_equations.py
+++ b/tensorflow/contrib/solvers/python/ops/linear_equations.py
@@ -26,11 +26,15 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import linalg_ops
 
 
 def conjugate_gradient(operator,
                        rhs,
+                       preconditioner=None,
+                       x=None,
                        tol=1e-4,
                        max_iter=20,
                        name="conjugate_gradient"):
@@ -55,6 +59,15 @@ def conjugate_gradient(operator,
         vector with the result of applying the operator to `x`, i.e. if
        `operator` represents matrix `A`, `apply` should return `A * x`.
     rhs: A rank-1 `Tensor` of shape `[N]` containing the right-hand size vector.
+    preconditioner: An object representing a linear operator, see `operator`
+      for detail. The preconditioner should approximate the inverse of `A`.
+      An efficient preconditioner could dramatically improve the rate of
+      convergence. If `preconditioner` represents matrix `M`(`M` approximates
+      `A^{-1}`), the algorithm uses `preconditioner.apply(x)` to estimate
+      `A^{-1}x`. For this to be useful, the cost of applying `M` should be
+      much lower than computing `A^{-1}` directly.
+    x: A rank-1 `Tensor` of shape `[N]` containing the initial guess for the
+      solution.
     tol: A float scalar convergence tolerance.
     max_iter: An integer giving the maximum number of iterations.
     name: A name scope for the operation.
@@ -65,35 +78,49 @@ def conjugate_gradient(operator,
       - x: A rank-1 `Tensor` of shape `[N]` containing the computed solution.
       - r: A rank-1 `Tensor` of shape `[M]` containing the residual vector.
       - p: A rank-1 `Tensor` of shape `[N]`. `A`-conjugate basis vector.
-      - gamma: \\(||r||_2^2\\)
+      - gamma: \\(r \dot M \dot r\\), equivalent to  \\(||r||_2^2\\) when
+        `preconditioner=None`.
   """
   # ephemeral class holding CG state.
   cg_state = collections.namedtuple("CGState", ["i", "x", "r", "p", "gamma"])
 
   def stopping_criterion(i, state):
-    return math_ops.logical_and(i < max_iter, state.gamma > tol)
+    return math_ops.logical_and(i < max_iter, linalg_ops.norm(state.r) > tol)
 
-  # TODO(rmlarsen): add preconditioning
-  def cg_step(i, state):
+  def cg_step(i, state):  # pylint: disable=missing-docstring
     z = operator.apply(state.p)
     alpha = state.gamma / util.dot(state.p, z)
     x = state.x + alpha * state.p
     r = state.r - alpha * z
-    gamma = util.l2norm_squared(r)
-    beta = gamma / state.gamma
-    p = r + beta * state.p
+    if preconditioner is None:
+      gamma = util.dot(r, r)
+      beta = gamma / state.gamma
+      p = r + beta * state.p
+    else:
+      q = preconditioner.apply(r)
+      gamma = util.dot(r, q)
+      beta = gamma / state.gamma
+      p = q + beta * state.p
     return i + 1, cg_state(i + 1, x, r, p, gamma)
 
   with ops.name_scope(name):
     n = operator.shape[1:]
     rhs = array_ops.expand_dims(rhs, -1)
-    gamma0 = util.l2norm_squared(rhs)
-    tol = tol * tol * gamma0
-    x = array_ops.expand_dims(
-        array_ops.zeros(
-            n, dtype=rhs.dtype.base_dtype), -1)
+    if x is None:
+      x = array_ops.expand_dims(
+          array_ops.zeros(n, dtype=rhs.dtype.base_dtype), -1)
+      r0 = rhs
+    else:
+      x = array_ops.expand_dims(x, -1)
+      r0 = rhs - operator.apply(x)
+    if preconditioner is None:
+      p0 = r0
+    else:
+      p0 = preconditioner.apply(r0)
+    gamma0 = util.dot(r0, p0)
+    tol *= linalg_ops.norm(r0)
     i = constant_op.constant(0, dtype=dtypes.int32)
-    state = cg_state(i=i, x=x, r=rhs, p=rhs, gamma=gamma0)
+    state = cg_state(i=i, x=x, r=r0, p=p0, gamma=gamma0)
     _, state = control_flow_ops.while_loop(stopping_criterion, cg_step,
                                            [i, state])
     return cg_state(
diff --git a/tensorflow/contrib/solvers/python/ops/util.py b/tensorflow/contrib/solvers/python/ops/util.py
index 777e0c185d6c9fffab6a7fe6e6ae4c133c62ad1a..96947e8eea1006bcd03cf09cd13cd1266695cc2e 100644
--- a/tensorflow/contrib/solvers/python/ops/util.py
+++ b/tensorflow/contrib/solvers/python/ops/util.py
@@ -45,6 +45,23 @@ def create_operator(matrix):
       apply_adjoint=lambda v: math_ops.matmul(matrix, v, adjoint_a=True))
 
 
+def identity_operator(matrix):
+  """Creates a linear operator from a rank-2 identity tensor."""
+
+  linear_operator = collections.namedtuple(
+      "LinearOperator", ["shape", "dtype", "apply", "apply_adjoint"])
+  shape = matrix.get_shape()
+  if shape.is_fully_defined():
+    shape = shape.as_list()
+  else:
+    shape = array_ops.shape(matrix)
+  return linear_operator(
+      shape=shape,
+      dtype=matrix.dtype,
+      apply=lambda v: v,
+      apply_adjoint=lambda v: v)
+
+
 # TODO(rmlarsen): Measure if we should just call matmul.
 def dot(x, y):
   return math_ops.reduce_sum(math_ops.conj(x) * y)
diff --git a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
index c8b4e472c99e0bf081a7222a7976b1fbbb680825..360e7dbe75f595ff61fb83379089294371203813 100644
--- a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
+++ b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
@@ -105,8 +105,8 @@ class SparsemaxLossTest(test.TestCase):
     tf_loss_op, tf_loss_out = self._tf_sparsemax_loss(z, q, dtype, use_gpu)
     np_loss = self._np_sparsemax_loss(z, q).astype(dtype)
 
-    self.assertAllCloseAccordingToType(np_loss, tf_loss_out,
-                                       half_atol=1e-2, half_rtol=5e-3)
+    self.assertAllCloseAccordingToType(
+        np_loss, tf_loss_out, half_atol=1e-2, half_rtol=5e-3)
     self.assertShapeEqual(np_loss, tf_loss_op)
 
   def _test_constant_add(self, dtype, random, use_gpu):
@@ -116,17 +116,17 @@ class SparsemaxLossTest(test.TestCase):
     q = np.zeros((test_obs, 10))
     q[np.arange(0, test_obs), np.random.randint(0, 10, size=test_obs)] = 1
 
-    _, tf_loss_zpc = self._tf_sparsemax_loss(
-        z + c, q, dtype, use_gpu
-    )
+    _, tf_loss_zpc = self._tf_sparsemax_loss(z + c, q, dtype, use_gpu)
 
-    _, tf_loss_z = self._tf_sparsemax_loss(
-        z, q, dtype, use_gpu
-    )
+    _, tf_loss_z = self._tf_sparsemax_loss(z, q, dtype, use_gpu)
 
-    self.assertAllCloseAccordingToType(tf_loss_zpc, tf_loss_z,
-                                       float_atol=5e-6, float_rtol=5e-6,
-                                       half_atol=1e-2, half_rtol=1e-2)
+    self.assertAllCloseAccordingToType(
+        tf_loss_zpc,
+        tf_loss_z,
+        float_atol=5e-6,
+        float_rtol=5e-6,
+        half_atol=1e-2,
+        half_rtol=1e-2)
 
   def _test_sparsemax_loss_positive(self, dtype, random, use_gpu):
     """check sparsemax-loss proposition 4"""
@@ -170,10 +170,7 @@ class SparsemaxLossTest(test.TestCase):
 
     with self.test_session(use_gpu=use_gpu):
       err = gradient_checker.compute_gradient_error(
-        logits, z.shape,
-        loss_op, (test_obs, ),
-        x_init_value=z, delta=1e-9
-      )
+          logits, z.shape, loss_op, (test_obs,), x_init_value=z, delta=1e-9)
 
     self.assertLess(err, 1e-4)
 
@@ -192,8 +189,8 @@ class SparsemaxLossTest(test.TestCase):
       tf_grad = loss_grad_op.eval()
       np_grad = self._np_sparsemax_loss_grad(z, q).astype(dtype)
 
-      self.assertAllCloseAccordingToType(np_grad, tf_grad,
-                                         half_atol=1e-2, half_rtol=5e-3)
+      self.assertAllCloseAccordingToType(
+          np_grad, tf_grad, half_atol=1e-2, half_rtol=5e-3)
       self.assertShapeEqual(np_grad, loss_grad_op)
 
   def _test_dtype(self, dtype):
@@ -220,5 +217,6 @@ class SparsemaxLossTest(test.TestCase):
   def testDouble(self):
     self._test_dtype('float64')
 
-if __name__ == "__main__":
+
+if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
index 82d36ee9cb21fb822e6df0c3632c49a4fd616825..259e62bd864fba3cc7d9aa387e02c8319438d658 100644
--- a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
+++ b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
@@ -83,8 +83,8 @@ class SparsemaxTest(test.TestCase):
     tf_sparsemax_op, tf_sparsemax_out = self._tf_sparsemax(z, dtype, use_gpu)
     p_sparemax = self._np_sparsemax(z).astype(dtype)
 
-    self.assertAllCloseAccordingToType(p_sparemax, tf_sparsemax_out,
-                                       half_atol=5e-3)
+    self.assertAllCloseAccordingToType(
+        p_sparemax, tf_sparsemax_out, half_atol=5e-3)
     self.assertShapeEqual(p_sparemax, tf_sparsemax_op)
 
   def _test_sparsemax_of_zero(self, dtype, random, use_gpu):
@@ -111,9 +111,8 @@ class SparsemaxTest(test.TestCase):
     p_expected = np.zeros((test_obs, 10), dtype=dtype)
     p_expected[np.arange(0, test_obs), z_sort_arg[:, 0]] = 1
 
-    tf_sparsemax_op, tf_sparsemax_out = self._tf_sparsemax(
-        (1 / epsilon) * z, dtype, use_gpu
-    )
+    tf_sparsemax_op, tf_sparsemax_out = self._tf_sparsemax((1 / epsilon) * z,
+                                                           dtype, use_gpu)
 
     self.assertAllCloseAccordingToType(p_expected, tf_sparsemax_out)
     self.assertShapeEqual(p_expected, tf_sparsemax_op)
@@ -123,16 +122,12 @@ class SparsemaxTest(test.TestCase):
     z = random.uniform(low=-3, high=3, size=(test_obs, 10)).astype(dtype)
     c = random.uniform(low=-3, high=3, size=(test_obs, 1)).astype(dtype)
 
-    _, tf_sparsemax_zpc = self._tf_sparsemax(
-        z + c, dtype, use_gpu
-    )
+    _, tf_sparsemax_zpc = self._tf_sparsemax(z + c, dtype, use_gpu)
 
-    _, tf_sparsemax_z = self._tf_sparsemax(
-        z, dtype, use_gpu
-    )
+    _, tf_sparsemax_z = self._tf_sparsemax(z, dtype, use_gpu)
 
-    self.assertAllCloseAccordingToType(tf_sparsemax_zpc, tf_sparsemax_z,
-                                       half_atol=5e-3)
+    self.assertAllCloseAccordingToType(
+        tf_sparsemax_zpc, tf_sparsemax_z, half_atol=5e-3)
 
   def _test_permutation(self, dtype, random, use_gpu):
     """check sparsemax proposition 3"""
@@ -143,12 +138,11 @@ class SparsemaxTest(test.TestCase):
       per = random.permutation(10)
 
       tf_sparsemax_op, tf_sparsemax_out = self._tf_sparsemax(
-        z[i, per].reshape(1, -1), dtype, use_gpu
-      )
+          z[i, per].reshape(1, -1), dtype, use_gpu)
       p_expected = p[i, per].reshape(1, -1)
 
-      self.assertAllCloseAccordingToType(p_expected, tf_sparsemax_out,
-                                         half_atol=5e-3)
+      self.assertAllCloseAccordingToType(
+          p_expected, tf_sparsemax_out, half_atol=5e-3)
       self.assertShapeEqual(p_expected, tf_sparsemax_op)
 
   def _test_diffrence(self, dtype, random, use_gpu):
@@ -166,18 +160,14 @@ class SparsemaxTest(test.TestCase):
             continue
 
           self.assertTrue(
-            0 <= p[val, j] - p[val, i] <= z[val, j] - z[val, i] + etol,
-            "0 <= %.10f <= %.10f" % (
-              p[val, j] - p[val, i], z[val, j] - z[val, i] + etol
-            )
-          )
+              0 <= p[val, j] - p[val, i] <= z[val, j] - z[val, i] + etol,
+              '0 <= %.10f <= %.10f' % (p[val, j] - p[val, i],
+                                       z[val, j] - z[val, i] + etol))
 
   def _test_two_dimentional(self, dtype, random, use_gpu):
     """check two dimentation sparsemax case"""
     t = np.linspace(-2, 2, test_obs, dtype=dtype)
-    z = np.vstack([
-      t, np.zeros(test_obs, dtype=dtype)
-    ]).T
+    z = np.vstack([t, np.zeros(test_obs, dtype=dtype)]).T
 
     tf_sparsemax_op, tf_sparsemax_out = self._tf_sparsemax(z, dtype, use_gpu)
 
@@ -196,10 +186,7 @@ class SparsemaxTest(test.TestCase):
 
     with self.test_session(use_gpu=use_gpu):
       err = gradient_checker.compute_gradient_error(
-        logits, z.shape,
-        sparsemax_op, z.shape,
-        x_init_value=z, delta=1e-9
-      )
+          logits, z.shape, sparsemax_op, z.shape, x_init_value=z, delta=1e-9)
 
     self.assertLess(err, 1e-4)
 
@@ -248,5 +235,6 @@ class SparsemaxTest(test.TestCase):
   def testDouble(self):
     self._test_dtype('float64')
 
-if __name__ == "__main__":
+
+if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
index 73a5cf1e9287ea4e4350d88165744cf12db954bb..890ca20f4cabd65146e803e54e554a5c97e72427 100644
--- a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
+++ b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
@@ -23,7 +23,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
-from tensorflow.python.platform import resource_loader
 
 __all__ = ["sparsemax"]
 
diff --git a/tensorflow/contrib/sparsemax/python/ops/sparsemax_loss.py b/tensorflow/contrib/sparsemax/python/ops/sparsemax_loss.py
index ba18f89e16c76a6ef3cb05df0c13f62eace6bbb1..582d1e6136df4d3ad3c8108ae9607d5fef519145 100644
--- a/tensorflow/contrib/sparsemax/python/ops/sparsemax_loss.py
+++ b/tensorflow/contrib/sparsemax/python/ops/sparsemax_loss.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.util import loader
-from tensorflow.python.platform import resource_loader
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
diff --git a/tensorflow/contrib/specs/BUILD b/tensorflow/contrib/specs/BUILD
index 4b688690aef513dd683817b0b5c2ba4cb50f73d9..084953a0a226cde46ebd9d2031d20cb839180ca8 100644
--- a/tensorflow/contrib/specs/BUILD
+++ b/tensorflow/contrib/specs/BUILD
@@ -23,7 +23,6 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/contrib/ndlstm",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:logging_ops",
diff --git a/tensorflow/contrib/specs/README.md b/tensorflow/contrib/specs/README.md
index b764e6e714ea907cd4474a07843bda300a8e4c8b..bcf34e601f1ffe3ab7a8c0d2ad573da4c8c977e9 100644
--- a/tensorflow/contrib/specs/README.md
+++ b/tensorflow/contrib/specs/README.md
@@ -59,17 +59,6 @@ Reshaping:
  - `Squeeze` = tf.squeeze
  - `Expand` = tf.expand_dims
 
-Multidimensional LSTM:
-
-These are intended as alternatives to 2D convolutions.  For sequence models,
-there will be other modeling primitives.
-
- - `Lstm2` = Fun(lstm2d.separable_lstm)  # 2D-to-2D
- - `Lstm2to1` = Fun(lstm2d.reduce_to_sequence)  # 2D-to-1D
- - `Lstm2to0` = Fun(lstm2d.reduce_to_final)  # 2D-to-vector
- - `Clstm2(n, m)` is a `Cl(n, [3,3])` followed by `Lstm2(m)`
- - `Dws(n)` is a depthwise convolution `Cs(n, [1, 1])`
-
 Other:
 
  - `Id` = identity
diff --git a/tensorflow/contrib/specs/__init__.py b/tensorflow/contrib/specs/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..52e83069cb0c68b510da46149248369dce376647 100644
--- a/tensorflow/contrib/specs/__init__.py
+++ b/tensorflow/contrib/specs/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/specs/python/__init__.py b/tensorflow/contrib/specs/python/__init__.py
index 52db61e421a52f4106ab1e2a4d7ee5c100b6b4bc..b6cc754023859f8d3668545dd5c2fd1d1581ecf5 100644
--- a/tensorflow/contrib/specs/python/__init__.py
+++ b/tensorflow/contrib/specs/python/__init__.py
@@ -18,10 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=wildcard-import,g-importing-member
+# pylint: disable=wildcard-import,g-importing-member,redefined-builtin
 from tensorflow.contrib.specs.python.params_ops import *
 from tensorflow.contrib.specs.python.specs import *
 from tensorflow.contrib.specs.python.specs_lib import *
 from tensorflow.contrib.specs.python.specs_ops import *
 from tensorflow.contrib.specs.python.summaries import *
-# pylint: enable=wildcard-import
+# pylint: enable=wildcard-import,redefined-builtin
diff --git a/tensorflow/contrib/specs/python/specs_ops.py b/tensorflow/contrib/specs/python/specs_ops.py
index a6bd4d16c284a8b1a370005a7c55d3b74b4fbf95..49b989b8d0fc83a3793263a2b59a98a8fe292c6a 100644
--- a/tensorflow/contrib/specs/python/specs_ops.py
+++ b/tensorflow/contrib/specs/python/specs_ops.py
@@ -23,8 +23,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.layers.python.layers import layers
-from tensorflow.contrib.ndlstm.python import lstm1d
-from tensorflow.contrib.ndlstm.python import lstm2d
 from tensorflow.contrib.specs.python import specs_lib
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import logging_ops
@@ -122,17 +120,6 @@ Sig = Fun(math_ops.sigmoid)
 Tanh = Fun(math_ops.tanh)
 Smax = Fun(nn_ops.softmax)
 
-# 2D LSTM
-
-Lstm2 = Fun(lstm2d.separable_lstm)
-Lstm2to1 = Fun(lstm2d.reduce_to_sequence)  # 2D to 1D
-Lstm2to0 = Fun(lstm2d.reduce_to_final)  # 2D to depth-only
-
-
-def Clstm2(n, *args, **kw):
-  """2D LSTM with 3x3 pre-convolution."""
-  return Cl(n, [3, 3]) | Lstm2(*args, **kw)
-
 
 def Dws(n):
   """Depth-wise convolution + sigmoid (used after LSTM)."""
@@ -143,13 +130,6 @@ def Dwm(n):
   """Depth-wise convolution + softmax (used after LSTM)."""
   return Cm(n, [1, 1])
 
-
-# 1D LSTM
-
-Lstm1 = Fun(lstm1d.ndlstm_base)
-Lstm1to0 = Fun(lstm1d.sequence_to_final)  # 1D to depth-only
-Ssm = Fun(lstm1d.sequence_softmax)
-
 # Sharing of Variables
 
 
diff --git a/tensorflow/contrib/specs/python/specs_test.py b/tensorflow/contrib/specs/python/specs_test.py
index 41782a9fc9ada3d8a1ff30847971aea18f0ca1c7..9a4ad36793542a83105ad0dc1ef7c0624a6c1f99 100644
--- a/tensorflow/contrib/specs/python/specs_test.py
+++ b/tensorflow/contrib/specs/python/specs_test.py
@@ -149,36 +149,6 @@ class SpecsTest(test.TestCase):
       self.assertEqual(tuple(result.shape), (10, 20))
       self.assertEqual(summaries.tf_spec_structure(spec, inputs), "_ sig sig")
 
-  def testLstm2(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(1, 64, 64, 5))
-      spec = "net = Lstm2(15)"
-      outputs = specs.create_net(spec, inputs)
-      self.assertEqual(outputs.get_shape().as_list(), [1, 64, 64, 15])
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (1, 64, 64, 15))
-
-  def testLstm2to1(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(1, 64, 64, 5))
-      spec = "net = Lstm2to1(15)"
-      outputs = specs.create_net(spec, inputs)
-      self.assertEqual(outputs.get_shape().as_list(), [1, 64, 15])
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (1, 64, 15))
-
-  def testLstm2to0(self):
-    with self.test_session():
-      inputs = constant_op.constant(_rand(1, 64, 64, 5))
-      spec = "net = Lstm2to0(15)"
-      outputs = specs.create_net(spec, inputs)
-      self.assertEqual(outputs.get_shape().as_list(), [1, 15])
-      variables.global_variables_initializer().run()
-      result = outputs.eval()
-      self.assertEqual(tuple(result.shape), (1, 15))
-
   def testKeywordRestriction(self):
     with self.test_session():
       inputs = constant_op.constant(_rand(10, 20))
diff --git a/tensorflow/contrib/summary/BUILD b/tensorflow/contrib/summary/BUILD
index f34291c203d468603abfaebe4a00f1847e71934d..b58c83fdaf574fb349fac57c922f1178b7d13b66 100644
--- a/tensorflow/contrib/summary/BUILD
+++ b/tensorflow/contrib/summary/BUILD
@@ -13,10 +13,7 @@ load(
 tf_gen_op_wrapper_py(
     name = "gen_summary_ops",
     out = "gen_summary_ops.py",
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        "//tensorflow/core:summary_ops_op_lib",
-    ],
+    deps = ["//tensorflow/core:summary_ops_op_lib"],
 )
 
 py_test(
@@ -115,5 +112,6 @@ py_library(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:lib",
         "//tensorflow/python:platform",
+        "@org_sqlite//:python",
     ],
 )
diff --git a/tensorflow/contrib/summary/summary.py b/tensorflow/contrib/summary/summary.py
index f783179f61495f33c80b897d00aecb46743fddd9..2d6d7ea6a3eff2562ba8def4117e3aa6f818b6fd 100644
--- a/tensorflow/contrib/summary/summary.py
+++ b/tensorflow/contrib/summary/summary.py
@@ -18,6 +18,42 @@ The operations in this package are safe to use with eager execution turned on or
 off. It has a more flexible API that allows summaries to be written directly
 from ops to places other than event log files, rather than propagating protos
 from @{tf.summary.merge_all} to @{tf.summary.FileWriter}.
+
+To use with eager execution enabled, write your code as follows:
+
+global_step = tf.train.get_or_create_global_step()
+summary_writer = tf.contrib.summary.create_file_writer(
+    train_dir, flush_millis=10000)
+with summary_writer.as_default(), tf.contrib.summary.always_record_summaries():
+  # model code goes here
+  # and in it call
+  tf.contrib.summary.scalar("loss", my_loss)
+  # In this case every call to tf.contrib.summary.scalar will generate a record
+  # ...
+
+To use it with graph execution, write your code as follows:
+
+global_step = tf.train.get_or_create_global_step()
+summary_writer = tf.contrib.summary.create_file_writer(
+    train_dir, flush_millis=10000)
+with summary_writer.as_default(), tf.contrib.summary.always_record_summaries():
+  # model definition code goes here
+  # and in it call
+  tf.contrib.summary.scalar("loss", my_loss)
+  # In this case every call to tf.contrib.summary.scalar will generate an op,
+  # note the need to run tf.contrib.summary.all_summary_ops() to make sure these
+  # ops get executed.
+  # ...
+  train_op = ....
+
+with tf.Session(...) as sess:
+  tf.global_variables_initializer().run()
+  tf.contrib.summary.initialize(graph=tf.get_default_graph())
+  # ...
+  while not_done_training:
+    sess.run([train_op, tf.contrib.summary.all_summary_ops()])
+    # ...
+
 """
 
 from __future__ import absolute_import
@@ -28,9 +64,11 @@ from __future__ import print_function
 from tensorflow.contrib.summary.summary_ops import all_summary_ops
 from tensorflow.contrib.summary.summary_ops import always_record_summaries
 from tensorflow.contrib.summary.summary_ops import audio
-from tensorflow.contrib.summary.summary_ops import create_summary_db_writer
+from tensorflow.contrib.summary.summary_ops import create_db_writer
+from tensorflow.contrib.summary.summary_ops import create_file_writer
 from tensorflow.contrib.summary.summary_ops import create_summary_file_writer
 from tensorflow.contrib.summary.summary_ops import eval_dir
+from tensorflow.contrib.summary.summary_ops import flush
 from tensorflow.contrib.summary.summary_ops import generic
 from tensorflow.contrib.summary.summary_ops import graph
 from tensorflow.contrib.summary.summary_ops import histogram
diff --git a/tensorflow/contrib/summary/summary_ops.py b/tensorflow/contrib/summary/summary_ops.py
index 8e37987cb71c570d4c3bcea0be2a06e182290815..068ae35c712622117127bb5b3dfa341a48254c54 100644
--- a/tensorflow/contrib/summary/summary_ops.py
+++ b/tensorflow/contrib/summary/summary_ops.py
@@ -38,9 +38,11 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import summary_op_util
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import training_util
 from tensorflow.python.util import tf_contextlib
 
+
 # Name for a collection which is expected to have at most a single boolean
 # Tensor. If this tensor is True the summary ops will record summaries.
 _SHOULD_RECORD_SUMMARIES_NAME = "ShouldRecordSummaries"
@@ -69,7 +71,7 @@ def should_record_summaries():
 def record_summaries_every_n_global_steps(n, global_step=None):
   """Sets the should_record_summaries Tensor to true if global_step % n == 0."""
   if global_step is None:
-    global_step = training_util.get_global_step()
+    global_step = training_util.get_or_create_global_step()
   collection_ref = ops.get_collection_ref(_SHOULD_RECORD_SUMMARIES_NAME)
   old = collection_ref[:]
   with ops.device("cpu:0"):
@@ -102,8 +104,8 @@ class SummaryWriter(object):
   """Encapsulates a stateful summary writer resource.
 
   See also:
-  - @{tf.contrib.summary.create_summary_file_writer}
-  - @{tf.contrib.summary.create_summary_db_writer}
+  - @{tf.contrib.summary.create_file_writer}
+  - @{tf.contrib.summary.create_db_writer}
   """
 
   def  __init__(self, resource):
@@ -152,10 +154,12 @@ def initialize(
       to @{tf.get_default_session}.
 
   Raises:
-    RuntimeError: If in eager mode, or if the current thread has no
-      default @{tf.contrib.summary.SummaryWriter}.
+    RuntimeError: If  the current thread has no default
+      @{tf.contrib.summary.SummaryWriter}.
     ValueError: If session wasn't passed and no default session.
   """
+  if context.in_eager_mode():
+    return
   if context.context().summary_writer_resource is None:
     raise RuntimeError("No default tf.contrib.summary.SummaryWriter found")
   if session is None:
@@ -169,11 +173,11 @@ def initialize(
     session.run(_graph(x, 0), feed_dict={x: data})
 
 
-def create_summary_file_writer(logdir,
-                               max_queue=None,
-                               flush_millis=None,
-                               filename_suffix=None,
-                               name=None):
+def create_file_writer(logdir,
+                       max_queue=None,
+                       flush_millis=None,
+                       filename_suffix=None,
+                       name=None):
   """Creates a summary file writer in the current context.
 
   Args:
@@ -200,7 +204,7 @@ def create_summary_file_writer(logdir,
     if flush_millis is None:
       flush_millis = constant_op.constant(2 * 60 * 1000)
     if filename_suffix is None:
-      filename_suffix = constant_op.constant("")
+      filename_suffix = constant_op.constant(".v2")
     return _make_summary_writer(
         name,
         gen_summary_ops.create_summary_file_writer,
@@ -210,11 +214,11 @@ def create_summary_file_writer(logdir,
         filename_suffix=filename_suffix)
 
 
-def create_summary_db_writer(db_uri,
-                             experiment_name=None,
-                             run_name=None,
-                             user_name=None,
-                             name=None):
+def create_db_writer(db_uri,
+                     experiment_name=None,
+                     run_name=None,
+                     user_name=None,
+                     name=None):
   """Creates a summary database writer in the current context.
 
   This can be used to write tensors from the execution graph directly
@@ -290,13 +294,9 @@ def all_summary_ops():
 
   Returns:
     The summary ops.
-
-  Raises:
-    RuntimeError: If in Eager mode.
   """
   if context.in_eager_mode():
-    raise RuntimeError(
-        "tf.contrib.summary.all_summary_ops is only supported in graph mode.")
+    return None
   return ops.get_collection(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
 
 
@@ -498,7 +498,7 @@ _graph = graph  # for functions with a graph parameter
 def import_event(tensor, name=None):
   """Writes a @{tf.Event} binary proto.
 
-  When using create_summary_db_writer(), this can be used alongside
+  When using create_db_writer(), this can be used alongside
   @{tf.TFRecordReader} to load event logs into the database. Please
   note that this is lower level than the other summary functions and
   will ignore any conditions set by methods like
@@ -516,11 +516,39 @@ def import_event(tensor, name=None):
       context.context().summary_writer_resource, tensor, name=name)
 
 
+def flush(writer=None, name=None):
+  """Forces summary writer to send any buffered data to storage.
+
+  This operation blocks until that finishes.
+
+  Args:
+    writer: The @{tf.contrib.summary.SummaryWriter} resource to flush.
+      The thread default will be used if this parameter is None.
+      Otherwise a @{tf.no_op} is returned.
+    name: A name for the operation (optional).
+
+  Returns:
+    The created @{tf.Operation}.
+  """
+  if writer is None:
+    writer = context.context().summary_writer_resource
+    if writer is None:
+      return control_flow_ops.no_op()
+  return gen_summary_ops.flush_summary_writer(writer, name=name)
+
+
 def eval_dir(model_dir, name=None):
   """Construct a logdir for an eval summary writer."""
   return os.path.join(model_dir, "eval" if not name else "eval_" + name)
 
 
+def create_summary_file_writer(*args, **kwargs):
+  """Please use @{tf.contrib.summary.create_file_writer}."""
+  logging.warning("Deprecation Warning: create_summary_file_writer was renamed "
+                  "to create_file_writer")
+  return create_file_writer(*args, **kwargs)
+
+
 def _serialize_graph(arbitrary_graph):
   if isinstance(arbitrary_graph, ops.Graph):
     return arbitrary_graph.as_graph_def(add_shapes=True).SerializeToString()
@@ -530,7 +558,7 @@ def _serialize_graph(arbitrary_graph):
 
 def _choose_step(step):
   if step is None:
-    return training_util.get_global_step()
+    return training_util.get_or_create_global_step()
   if not isinstance(step, ops.Tensor):
     return ops.convert_to_tensor(step, dtypes.int64)
   return step
diff --git a/tensorflow/contrib/summary/summary_ops_graph_test.py b/tensorflow/contrib/summary/summary_ops_graph_test.py
index 703adb7b46c47ee505e24ce2434e293b9c19729f..2b7806f80d020e0064b0f5cf32fd765a9ee993d1 100644
--- a/tensorflow/contrib/summary/summary_ops_graph_test.py
+++ b/tensorflow/contrib/summary/summary_ops_graph_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import training_util
 
@@ -48,16 +49,47 @@ class DbTest(summary_test_util.SummaryDbTest):
     name = 'hi'
     graph = graph_pb2.GraphDef(node=(node_def_pb2.NodeDef(name=name),))
     with self.test_session():
-      with self.create_summary_db_writer().as_default():
+      with self.create_db_writer().as_default():
         summary_ops.initialize(graph=graph)
     six.assertCountEqual(self, [name],
                          get_all(self.db, 'SELECT node_name FROM Nodes'))
 
+  def testScalarSummary(self):
+    """Test record_summaries_every_n_global_steps and all_summaries()."""
+    with ops.Graph().as_default(), self.test_session() as sess:
+      global_step = training_util.get_or_create_global_step()
+      global_step.initializer.run()
+      with ops.device('/cpu:0'):
+        step_increment = state_ops.assign_add(global_step, 1)
+      sess.run(step_increment)  # Increment global step from 0 to 1
+
+      logdir = tempfile.mkdtemp()
+      with summary_ops.create_file_writer(logdir, max_queue=0,
+                                          name='t2').as_default():
+        with summary_ops.record_summaries_every_n_global_steps(2):
+          summary_ops.initialize()
+          summary_op = summary_ops.scalar('my_scalar', 2.0)
+
+          # Neither of these should produce a summary because
+          # global_step is 1 and "1 % 2 != 0"
+          sess.run(summary_ops.all_summary_ops())
+          sess.run(summary_op)
+          events = summary_test_util.events_from_logdir(logdir)
+          self.assertEqual(len(events), 1)
+
+          # Increment global step from 1 to 2 and check that the summary
+          # is now written
+          sess.run(step_increment)
+          sess.run(summary_ops.all_summary_ops())
+          events = summary_test_util.events_from_logdir(logdir)
+          self.assertEqual(len(events), 2)
+          self.assertEqual(events[1].summary.value[0].tag, 'my_scalar')
+
   def testSummaryGraphModeCond(self):
     with ops.Graph().as_default(), self.test_session():
       training_util.get_or_create_global_step()
       logdir = tempfile.mkdtemp()
-      with summary_ops.create_summary_file_writer(
+      with summary_ops.create_file_writer(
           logdir, max_queue=0,
           name='t2').as_default(), summary_ops.always_record_summaries():
         summary_ops.initialize()
@@ -78,7 +110,7 @@ class DbTest(summary_test_util.SummaryDbTest):
     with ops.Graph().as_default(), self.test_session():
       training_util.get_or_create_global_step()
       logdir = tempfile.mkdtemp()
-      with summary_ops.create_summary_file_writer(
+      with summary_ops.create_file_writer(
           logdir, max_queue=0,
           name='t2').as_default(), summary_ops.always_record_summaries():
         summary_ops.initialize()
diff --git a/tensorflow/contrib/summary/summary_ops_test.py b/tensorflow/contrib/summary/summary_ops_test.py
index d20300c858f2b925bbc813e07b798b2d45bdab8f..bb7215f879411e91a1c47b87f5caede63fffea74 100644
--- a/tensorflow/contrib/summary/summary_ops_test.py
+++ b/tensorflow/contrib/summary/summary_ops_test.py
@@ -18,16 +18,17 @@ from __future__ import print_function
 
 import tempfile
 
+import numpy as np
 import six
 
 from tensorflow.contrib.summary import summary_ops
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
+from tensorflow.core.framework import types_pb2
 from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import state_ops
@@ -37,14 +38,25 @@ from tensorflow.python.training import training_util
 get_all = summary_test_util.get_all
 get_one = summary_test_util.get_one
 
+_NUMPY_NUMERIC_TYPES = {
+    types_pb2.DT_HALF: np.float16,
+    types_pb2.DT_FLOAT: np.float32,
+    types_pb2.DT_DOUBLE: np.float64,
+    types_pb2.DT_INT8: np.int8,
+    types_pb2.DT_INT16: np.int16,
+    types_pb2.DT_INT32: np.int32,
+    types_pb2.DT_INT64: np.int64,
+    types_pb2.DT_UINT8: np.uint8,
+    types_pb2.DT_UINT16: np.uint16,
+    types_pb2.DT_UINT32: np.uint32,
+    types_pb2.DT_UINT64: np.uint64,
+    types_pb2.DT_COMPLEX64: np.complex64,
+    types_pb2.DT_COMPLEX128: np.complex128,
+    types_pb2.DT_BOOL: np.bool_,
+}
 
-class TargetTest(test_util.TensorFlowTestCase):
 
-  def testInvalidDirectory(self):
-    logdir = '/tmp/apath/that/doesnt/exist'
-    self.assertFalse(gfile.Exists(logdir))
-    with self.assertRaises(errors.NotFoundError):
-      summary_ops.create_summary_file_writer(logdir, max_queue=0, name='t0')
+class TargetTest(test_util.TensorFlowTestCase):
 
   def testShouldRecordSummary(self):
     self.assertFalse(summary_ops.should_record_summaries())
@@ -54,7 +66,7 @@ class TargetTest(test_util.TensorFlowTestCase):
   def testSummaryOps(self):
     training_util.get_or_create_global_step()
     logdir = tempfile.mkdtemp()
-    with summary_ops.create_summary_file_writer(
+    with summary_ops.create_file_writer(
         logdir, max_queue=0,
         name='t0').as_default(), summary_ops.always_record_summaries():
       summary_ops.generic('tensor', 1, '')
@@ -69,7 +81,7 @@ class TargetTest(test_util.TensorFlowTestCase):
   def testDefunSummarys(self):
     training_util.get_or_create_global_step()
     logdir = tempfile.mkdtemp()
-    with summary_ops.create_summary_file_writer(
+    with summary_ops.create_file_writer(
         logdir, max_queue=0,
         name='t1').as_default(), summary_ops.always_record_summaries():
 
@@ -85,7 +97,7 @@ class TargetTest(test_util.TensorFlowTestCase):
   def testSummaryName(self):
     training_util.get_or_create_global_step()
     logdir = tempfile.mkdtemp()
-    with summary_ops.create_summary_file_writer(
+    with summary_ops.create_file_writer(
         logdir, max_queue=0,
         name='t2').as_default(), summary_ops.always_record_summaries():
 
@@ -98,7 +110,7 @@ class TargetTest(test_util.TensorFlowTestCase):
   def testSummaryGlobalStep(self):
     step = training_util.get_or_create_global_step()
     logdir = tempfile.mkdtemp()
-    with summary_ops.create_summary_file_writer(
+    with summary_ops.create_file_writer(
         logdir, max_queue=0,
         name='t2').as_default(), summary_ops.always_record_summaries():
 
@@ -108,11 +120,39 @@ class TargetTest(test_util.TensorFlowTestCase):
       self.assertEqual(len(events), 2)
       self.assertEqual(events[1].summary.value[0].tag, 'scalar')
 
+  def testMaxQueue(self):
+    logs = tempfile.mkdtemp()
+    with summary_ops.create_file_writer(
+        logs, max_queue=2, flush_millis=999999,
+        name='lol').as_default(), summary_ops.always_record_summaries():
+      get_total = lambda: len(summary_test_util.events_from_logdir(logs))
+      # Note: First tf.Event is always file_version.
+      self.assertEqual(1, get_total())
+      summary_ops.scalar('scalar', 2.0, step=1)
+      self.assertEqual(1, get_total())
+      summary_ops.scalar('scalar', 2.0, step=2)
+      self.assertEqual(3, get_total())
+
+  def testFlush(self):
+    logs = tempfile.mkdtemp()
+    with summary_ops.create_file_writer(
+        logs, max_queue=999999, flush_millis=999999,
+        name='lol').as_default(), summary_ops.always_record_summaries():
+      get_total = lambda: len(summary_test_util.events_from_logdir(logs))
+      # Note: First tf.Event is always file_version.
+      self.assertEqual(1, get_total())
+      summary_ops.scalar('scalar', 2.0, step=1)
+      summary_ops.scalar('scalar', 2.0, step=2)
+      self.assertEqual(1, get_total())
+      summary_ops.flush()
+      self.assertEqual(3, get_total())
+
 
 class DbTest(summary_test_util.SummaryDbTest):
 
   def testIntegerSummaries(self):
     step = training_util.create_global_step()
+    writer = self.create_db_writer()
 
     def adder(x, y):
       state_ops.assign_add(step, 1)
@@ -123,11 +163,12 @@ class DbTest(summary_test_util.SummaryDbTest):
       return sum_
 
     with summary_ops.always_record_summaries():
-      with self.create_summary_db_writer().as_default():
+      with writer.as_default():
         self.assertEqual(5, adder(int64(2), int64(3)).numpy())
 
-    six.assertCountEqual(self, [1, 1, 1],
-                         get_all(self.db, 'SELECT step FROM Tensors'))
+    six.assertCountEqual(
+        self, [1, 1, 1],
+        get_all(self.db, 'SELECT step FROM Tensors WHERE dtype IS NOT NULL'))
     six.assertCountEqual(self, ['x', 'y', 'sum'],
                          get_all(self.db, 'SELECT tag_name FROM Tags'))
     x_id = get_one(self.db, 'SELECT tag_id FROM Tags WHERE tag_name = "x"')
@@ -135,11 +176,12 @@ class DbTest(summary_test_util.SummaryDbTest):
     sum_id = get_one(self.db, 'SELECT tag_id FROM Tags WHERE tag_name = "sum"')
 
     with summary_ops.always_record_summaries():
-      with self.create_summary_db_writer().as_default():
+      with writer.as_default():
         self.assertEqual(9, adder(int64(4), int64(5)).numpy())
 
-    six.assertCountEqual(self, [1, 1, 1, 2, 2, 2],
-                         get_all(self.db, 'SELECT step FROM Tensors'))
+    six.assertCountEqual(
+        self, [1, 1, 1, 2, 2, 2],
+        get_all(self.db, 'SELECT step FROM Tensors WHERE dtype IS NOT NULL'))
     six.assertCountEqual(self, [x_id, y_id, sum_id],
                          get_all(self.db, 'SELECT tag_id FROM Tags'))
     self.assertEqual(2, get_tensor(self.db, x_id, 1))
@@ -158,35 +200,41 @@ class DbTest(summary_test_util.SummaryDbTest):
 
   def testBadExperimentName(self):
     with self.assertRaises(ValueError):
-      self.create_summary_db_writer(experiment_name='\0')
+      self.create_db_writer(experiment_name='\0')
 
   def testBadRunName(self):
     with self.assertRaises(ValueError):
-      self.create_summary_db_writer(run_name='\0')
+      self.create_db_writer(run_name='\0')
 
   def testBadUserName(self):
     with self.assertRaises(ValueError):
-      self.create_summary_db_writer(user_name='-hi')
+      self.create_db_writer(user_name='-hi')
     with self.assertRaises(ValueError):
-      self.create_summary_db_writer(user_name='hi-')
+      self.create_db_writer(user_name='hi-')
     with self.assertRaises(ValueError):
-      self.create_summary_db_writer(user_name='@')
+      self.create_db_writer(user_name='@')
 
   def testGraphSummary(self):
     training_util.get_or_create_global_step()
     name = 'hi'
     graph = graph_pb2.GraphDef(node=(node_def_pb2.NodeDef(name=name),))
     with summary_ops.always_record_summaries():
-      with self.create_summary_db_writer().as_default():
+      with self.create_db_writer().as_default():
         summary_ops.graph(graph)
     six.assertCountEqual(self, [name],
                          get_all(self.db, 'SELECT node_name FROM Nodes'))
 
 
 def get_tensor(db, tag_id, step):
-  return get_one(
-      db, 'SELECT tensor FROM Tensors WHERE tag_id = ? AND step = ?', tag_id,
-      step)
+  cursor = db.execute(
+      'SELECT dtype, shape, data FROM Tensors WHERE series = ? AND step = ?',
+      (tag_id, step))
+  dtype, shape, data = cursor.fetchone()
+  assert dtype in _NUMPY_NUMERIC_TYPES
+  buf = np.frombuffer(data, dtype=_NUMPY_NUMERIC_TYPES[dtype])
+  if not shape:
+    return buf[0]
+  return buf.reshape([int(i) for i in shape.split(',')])
 
 
 def int64(x):
diff --git a/tensorflow/contrib/summary/summary_test_internal.py b/tensorflow/contrib/summary/summary_test_internal.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0d3384735fb1eb1a048c7aa6da0037ee9fc6936
--- /dev/null
+++ b/tensorflow/contrib/summary/summary_test_internal.py
@@ -0,0 +1,60 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Internal helpers for tests in this directory."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import os
+
+import sqlite3
+
+from tensorflow.contrib.summary import summary_ops
+from tensorflow.python.framework import test_util
+
+
+class SummaryDbTest(test_util.TensorFlowTestCase):
+  """Helper for summary database testing."""
+
+  def setUp(self):
+    super(SummaryDbTest, self).setUp()
+    self.db_path = os.path.join(self.get_temp_dir(), 'DbTest.sqlite')
+    if os.path.exists(self.db_path):
+      os.unlink(self.db_path)
+    self.db = sqlite3.connect(self.db_path)
+    self.create_db_writer = functools.partial(
+        summary_ops.create_db_writer,
+        db_uri=self.db_path,
+        experiment_name='experiment',
+        run_name='run',
+        user_name='user')
+
+  def tearDown(self):
+    self.db.close()
+    super(SummaryDbTest, self).tearDown()
+
+
+def get_one(db, q, *p):
+  return db.execute(q, p).fetchone()[0]
+
+
+def get_all(db, q, *p):
+  return unroll(db.execute(q, p).fetchall())
+
+
+def unroll(list_of_tuples):
+  return sum(list_of_tuples, ())
diff --git a/tensorflow/contrib/summary/summary_test_util.py b/tensorflow/contrib/summary/summary_test_util.py
index 94767c8df25023cfe6dd050df6d34153834df70a..8506c4be9c4ca8305b62da17c7246e6e18313bd3 100644
--- a/tensorflow/contrib/summary/summary_test_util.py
+++ b/tensorflow/contrib/summary/summary_test_util.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 import functools
 import os
+
 import sqlite3
 
 from tensorflow.contrib.summary import summary_ops
@@ -39,8 +40,8 @@ class SummaryDbTest(test_util.TensorFlowTestCase):
     if os.path.exists(self.db_path):
       os.unlink(self.db_path)
     self.db = sqlite3.connect(self.db_path)
-    self.create_summary_db_writer = functools.partial(
-        summary_ops.create_summary_db_writer,
+    self.create_db_writer = functools.partial(
+        summary_ops.create_db_writer,
         db_uri=self.db_path,
         experiment_name='experiment',
         run_name='run',
@@ -83,7 +84,7 @@ def events_from_logdir(logdir):
   """
   assert gfile.Exists(logdir)
   files = gfile.ListDirectory(logdir)
-  assert len(files) == 1, "Found not exactly one file in logdir: %s" % files
+  assert len(files) == 1, 'Found not exactly one file in logdir: %s' % files
   return events_from_file(os.path.join(logdir, files[0]))
 
 
diff --git a/tensorflow/contrib/tensor_forest/BUILD b/tensorflow/contrib/tensor_forest/BUILD
index f54daa71255f2a49edf30f73e16dfc211dc92e39..58a7fa095d8356229fdb5879bea99d316113c828 100644
--- a/tensorflow/contrib/tensor_forest/BUILD
+++ b/tensorflow/contrib/tensor_forest/BUILD
@@ -530,7 +530,6 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":client_lib",
-        "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/learn",
         "//tensorflow/python:array_ops",
@@ -555,6 +554,7 @@ py_test(
     tags = [
         "no_windows",
         "nomac",  # b/63258195
+        "notsan",
     ],
     deps = [
         ":random_forest",
diff --git a/tensorflow/contrib/tensor_forest/client/random_forest.py b/tensorflow/contrib/tensor_forest/client/random_forest.py
index 807c8398439b9d225c974d2a1cdc9b845df4d26e..4abcc20ed334e706c8ae59e2127dfd6f4e152361 100644
--- a/tensorflow/contrib/tensor_forest/client/random_forest.py
+++ b/tensorflow/contrib/tensor_forest/client/random_forest.py
@@ -17,9 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib import framework as contrib_framework
 from tensorflow.contrib import layers
-
+from tensorflow.contrib.learn.python.learn.estimators import constants
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
@@ -44,8 +43,8 @@ from tensorflow.python.training import training_util
 KEYS_NAME = 'keys'
 LOSS_NAME = 'rf_training_loss'
 TREE_PATHS_PREDICTION_KEY = 'tree_paths'
-VARIANCE_PREDICTION_KEY = 'regression_variance'
-
+VARIANCE_PREDICTION_KEY = 'prediction_variance'
+ALL_SERVING_KEY = 'tensorforest_all'
 EPSILON = 0.000001
 
 
@@ -135,7 +134,8 @@ def get_model_fn(params,
                  trainer_id=0,
                  report_feature_importances=False,
                  local_eval=False,
-                 head_scope=None):
+                 head_scope=None,
+                 include_all_in_serving=False):
   """Return a model function given a way to construct a graph builder."""
   if model_head is None:
     model_head = get_default_head(params, weights_name)
@@ -190,7 +190,7 @@ def get_model_fn(params,
                 features, labels, input_weights=weights,
                 num_trainers=num_trainers,
                 trainer_id=trainer_id),
-            state_ops.assign_add(contrib_framework.get_global_step(), 1))
+            state_ops.assign_add(training_util.get_global_step(), 1))
 
     # Put weights back in
     if weights is not None:
@@ -238,9 +238,14 @@ def get_model_fn(params,
     if params.inference_tree_paths:
       model_ops.predictions[TREE_PATHS_PREDICTION_KEY] = tree_paths
 
-    if params.regression:
-      model_ops.predictions[VARIANCE_PREDICTION_KEY] = regression_variance
-
+    model_ops.predictions[VARIANCE_PREDICTION_KEY] = regression_variance
+    if include_all_in_serving:
+      # In order to serve the variance we need to add the prediction dict
+      # to output_alternatives dict.
+      if not model_ops.output_alternatives:
+        model_ops.output_alternatives = {}
+      model_ops.output_alternatives[ALL_SERVING_KEY] = (
+          constants.ProblemType.UNSPECIFIED, model_ops.predictions)
     return model_ops
 
   return _model_fn
@@ -295,7 +300,8 @@ class TensorForestEstimator(estimator.Estimator):
                report_feature_importances=False,
                local_eval=False,
                version=None,
-               head=None):
+               head=None,
+               include_all_in_serving=False):
     """Initializes a TensorForestEstimator instance.
 
     Args:
@@ -341,6 +347,23 @@ class TensorForestEstimator(estimator.Estimator):
       version: Unused.
       head: A heads_lib.Head object that calculates losses and such. If None,
         one will be automatically created based on params.
+      include_all_in_serving: if True, allow preparation of the complete
+        prediction dict including the variance to be exported for serving with
+        the Servo lib; and it also requires calling export_savedmodel with
+        default_output_alternative_key=ALL_SERVING_KEY, i.e.
+        estimator.export_savedmodel(export_dir_base=your_export_dir,
+          serving_input_fn=your_export_input_fn,
+          default_output_alternative_key=ALL_SERVING_KEY)
+        if False, resort to default behavior, i.e. export scores and
+          probabilities but no variances. In this case
+          default_output_alternative_key should be None while calling
+          export_savedmodel().
+        Note, that due to backward compatibility we cannot always set
+        include_all_in_serving to True because in this case calling
+        export_saved_model() without
+        default_output_alternative_key=ALL_SERVING_KEY (legacy behavior) the
+        saved_model_export_utils.get_output_alternatives() would raise
+        ValueError.
 
     Returns:
       A `TensorForestEstimator` instance.
@@ -359,7 +382,9 @@ class TensorForestEstimator(estimator.Estimator):
             num_trainers=num_trainers,
             trainer_id=trainer_id,
             report_feature_importances=report_feature_importances,
-            local_eval=local_eval),
+            local_eval=local_eval,
+            include_all_in_serving=include_all_in_serving,
+        ),
         model_dir=model_dir,
         config=config,
         feature_engineering_fn=feature_engineering_fn)
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc
index 76cfb4c9ca02269f9fee61c767acc6cb4a0b4ca7..cf0db788a419f64ed891df8aa097fa8826f6de91 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/hard_routing_function_op.cc
@@ -99,18 +99,17 @@ class HardRoutingFunction : public OpKernel {
     const Tensor& tree_biases_tensor = context->input(2);
 
     if (input_data.shape().dim_size(0) > 0) {
-      OP_REQUIRES(context, input_data.shape().dims() == 2,
-                  errors::InvalidArgument(
-                      "input_data should be two-dimensional"));
+      OP_REQUIRES(
+          context, input_data.shape().dims() == 2,
+          errors::InvalidArgument("input_data should be two-dimensional"));
     }
 
     // Check tensor bounds.
     if (!CheckTensorBounds(context, input_data)) return;
 
-    const int32 num_data = static_cast<int32>(
-        input_data.shape().dim_size(0));
-    const int32 num_features = static_cast<int32>(
-        input_data.shape().dim_size(1));
+    const int32 num_data = static_cast<int32>(input_data.shape().dim_size(0));
+    const int32 num_features =
+        static_cast<int32>(input_data.shape().dim_size(1));
 
     Tensor* output_probability = nullptr;
     TensorShape output_probability_shape;
@@ -125,9 +124,8 @@ class HardRoutingFunction : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, output_probability_shape,
                                             &output_probability));
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(1, output_path_shape,
-                                            &output_path));
+    OP_REQUIRES_OK(
+        context, context->allocate_output(1, output_path_shape, &output_path));
 
     auto out_probability = output_probability->tensor<float, 2>();
     auto out_path = output_path->tensor<int32, 2>();
@@ -144,12 +142,11 @@ class HardRoutingFunction : public OpKernel {
       out_probability(i, 0) = 1.0;
       out_path(i, 0) = 0;
       for (int j = 0; j < tree_depth_ - 1; j++) {
-        float left_prob = LeftProbability(point,
-                                          tree_parameters_tensor.Slice(j, j+1),
-                                          tree_biases(j),
-                                          num_features);
+        float left_prob =
+            LeftProbability(point, tree_parameters_tensor.Slice(j, j + 1),
+                            tree_biases(j), num_features);
 
-        int32 left_child = 2*node + 1;
+        int32 left_child = 2 * node + 1;
         int32 right_child = left_child + 1;
 
         float dot_product = 0.0;
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_gradient_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_gradient_op.cc
index 28f50f1a32eb1827a242d527cd42c58487877959..f64155fa55af22d57c6619d8a39da0455dc0de65 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_gradient_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_gradient_op.cc
@@ -85,12 +85,9 @@ REGISTER_OP("KFeatureGradient")
 
 class KFeatureGradient : public OpKernel {
  public:
-  explicit KFeatureGradient(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("layer_num",
-                                             &layer_num_));
-    OP_REQUIRES_OK(context, context->GetAttr("random_seed",
-                                             &random_seed_));
+  explicit KFeatureGradient(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("layer_num", &layer_num_));
+    OP_REQUIRES_OK(context, context->GetAttr("random_seed", &random_seed_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -101,14 +98,14 @@ class KFeatureGradient : public OpKernel {
     const Tensor& routing_tensor = context->input(3);
 
     // Extract dimensions from input tensors.
-    const int32 num_data = static_cast<int32>(
-        input_data_tensor.shape().dim_size(0));
-    const int32 num_features = static_cast<int32>(
-        input_data_tensor.shape().dim_size(1));
-    const int32 num_nodes = static_cast<int32>(
-        tree_parameters_tensor.shape().dim_size(0));
-    const int32 num_features_per_node = static_cast<int32>(
-        tree_parameters_tensor.shape().dim_size(1));
+    const int32 num_data =
+        static_cast<int32>(input_data_tensor.shape().dim_size(0));
+    const int32 num_features =
+        static_cast<int32>(input_data_tensor.shape().dim_size(1));
+    const int32 num_nodes =
+        static_cast<int32>(tree_parameters_tensor.shape().dim_size(0));
+    const int32 num_features_per_node =
+        static_cast<int32>(tree_parameters_tensor.shape().dim_size(1));
 
     // Construct output tensors.
     Tensor* out_routes = nullptr;
@@ -127,12 +124,12 @@ class KFeatureGradient : public OpKernel {
     out_weights_shape.AddDim(num_nodes);
     out_weights_shape.AddDim(num_features_per_node);
 
-    OP_REQUIRES_OK(context, context->allocate_output(
-        0, out_routes_shape, &out_routes));
-    OP_REQUIRES_OK(context, context->allocate_output(
-        1, out_data_shape, &out_data));
-    OP_REQUIRES_OK(context, context->allocate_output(
-        2, out_weights_shape, &out_weights));
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, out_routes_shape, &out_routes));
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(1, out_data_shape, &out_data));
+    OP_REQUIRES_OK(
+        context, context->allocate_output(2, out_weights_shape, &out_weights));
 
     tensorforest::Initialize(*out_data, 0.0f);
 
@@ -148,18 +145,13 @@ class KFeatureGradient : public OpKernel {
 
     std::vector<int32> feature_set;
     for (int i = 0; i < num_data; i++) {
-      const Tensor point = input_data_tensor.Slice(i, i+1);
+      const Tensor point = input_data_tensor.Slice(i, i + 1);
       feature_set.clear();
 
       // Traverse the tree from the bottom up.
       for (int j = num_nodes - 1; j >= 0; j--) {
-        tensorforest::GetFeatureSet(
-            layer_num_,
-            j,
-            random_seed_,
-            num_features,
-            num_features_per_node,
-            &feature_set);
+        tensorforest::GetFeatureSet(layer_num_, j, random_seed_, num_features,
+                                    num_features_per_node, &feature_set);
 
         // Compute routing gradient.
         // j is a leaf node.
@@ -170,12 +162,8 @@ class KFeatureGradient : public OpKernel {
           int32 right_child = left_child + 1;
 
           float left_prob = LeftProbabilityK(
-              point,
-              feature_set,
-              tree_parameters_tensor.Slice(j, j+1),
-              tree_biases(j),
-              num_features,
-              num_features_per_node);
+              point, feature_set, tree_parameters_tensor.Slice(j, j + 1),
+              tree_biases(j), num_features, num_features_per_node);
 
           float right_prob = 1.0f - left_prob;
 
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_routing_function_op.cc
index 9bc42eb61fae013de3e4ea73aaf371cdaa4ccf9a..e7cafb144da84865ad2b4ea0c33866ddb89119a5 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_routing_function_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/k_feature_routing_function_op.cc
@@ -43,7 +43,6 @@ using shape_inference::ShapeHandle;
 using tensorforest::CheckTensorBounds;
 using tensorforest::LeftProbabilityK;
 
-
 // The term 'routing function' is synonymous with 'the probability
 // that an instance is routed to each leaf node.'  It is defined in
 // 'Deep Neural Decision Forests' by Kontschieder et al.
@@ -96,10 +95,8 @@ class KFeatureRoutingFunction : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("max_nodes", &max_nodes_));
     OP_REQUIRES_OK(context, context->GetAttr("num_features_per_node",
                                              &num_features_per_node_));
-    OP_REQUIRES_OK(context, context->GetAttr("layer_num",
-                                             &layer_num_));
-    OP_REQUIRES_OK(context, context->GetAttr("random_seed",
-                                             &random_seed_));
+    OP_REQUIRES_OK(context, context->GetAttr("layer_num", &layer_num_));
+    OP_REQUIRES_OK(context, context->GetAttr("random_seed", &random_seed_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -108,27 +105,25 @@ class KFeatureRoutingFunction : public OpKernel {
     const Tensor& tree_biases_tensor = context->input(2);
 
     if (input_data.shape().dim_size(0) > 0) {
-      OP_REQUIRES(context, input_data.shape().dims() == 2,
-                  errors::InvalidArgument(
-                      "input_data should be two-dimensional"));
+      OP_REQUIRES(
+          context, input_data.shape().dims() == 2,
+          errors::InvalidArgument("input_data should be two-dimensional"));
     }
 
     // Check tensor bounds.
     if (!CheckTensorBounds(context, input_data)) return;
 
-    const int32 num_data = static_cast<int32>(
-        input_data.shape().dim_size(0));
-    const int32 num_features = static_cast<int32>(
-        input_data.shape().dim_size(1));
+    const int32 num_data = static_cast<int32>(input_data.shape().dim_size(0));
+    const int32 num_features =
+        static_cast<int32>(input_data.shape().dim_size(1));
 
     Tensor* output_probabilities = nullptr;
     TensorShape output_shape;
     output_shape.AddDim(num_data);
     output_shape.AddDim(max_nodes_);
 
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, output_shape,
-                                            &output_probabilities));
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape,
+                                                     &output_probabilities));
 
     auto out_probs = output_probabilities->tensor<float, 2>();
     const auto tree_biases = tree_biases_tensor.tensor<float, 1>();
@@ -136,30 +131,22 @@ class KFeatureRoutingFunction : public OpKernel {
     // Iteratively compute the probability of reaching each leaf.
     std::vector<int32> feature_set;
     for (int i = 0; i < num_data; i++) {
-      const Tensor point = input_data.Slice(i, i+1);
+      const Tensor point = input_data.Slice(i, i + 1);
 
       out_probs(i, 0) = 1.0f;
 
       for (int j = 0; j < max_nodes_ / 2; j++) {
         feature_set.clear();
-        tensorforest::GetFeatureSet(
-            layer_num_,
-            i,
-            random_seed_,
-            num_features,
-            num_features_per_node_,
-            &feature_set);
-
-        int32 left_child = 2*j + 1;
+        tensorforest::GetFeatureSet(layer_num_, i, random_seed_, num_features,
+                                    num_features_per_node_, &feature_set);
+
+        int32 left_child = 2 * j + 1;
         int32 right_child = left_child + 1;
 
         float prob = out_probs(i, j);
-        float left_prob = LeftProbabilityK(point,
-                                           feature_set,
-                                           tree_parameters_tensor.Slice(j, j+1),
-                                           tree_biases(j),
-                                           num_features,
-                                           num_features_per_node_);
+        float left_prob = LeftProbabilityK(
+            point, feature_set, tree_parameters_tensor.Slice(j, j + 1),
+            tree_biases(j), num_features, num_features_per_node_);
 
         out_probs(i, left_child) = prob * left_prob;
         out_probs(i, right_child) = prob * (1.0f - left_prob);
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/routing_function_op.cc
index 4027e732b3f52585c2149c3cdc71535664f04ed4..0c2eaabe8f3e1e1377a8d5c5308aaec00030a20f 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/routing_function_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/routing_function_op.cc
@@ -90,46 +90,43 @@ class RoutingFunction : public OpKernel {
     const Tensor& tree_biases_tensor = context->input(2);
 
     if (input_data.shape().dim_size(0) > 0) {
-      OP_REQUIRES(context, input_data.shape().dims() == 2,
-                  errors::InvalidArgument(
-                      "input_data should be two-dimensional"));
+      OP_REQUIRES(
+          context, input_data.shape().dims() == 2,
+          errors::InvalidArgument("input_data should be two-dimensional"));
     }
 
     // Check tensor bounds.
     if (!CheckTensorBounds(context, input_data)) return;
 
-    const int32 num_data = static_cast<int32>(
-        input_data.shape().dim_size(0));
-    const int32 num_features = static_cast<int32>(
-        input_data.shape().dim_size(1));
+    const int32 num_data = static_cast<int32>(input_data.shape().dim_size(0));
+    const int32 num_features =
+        static_cast<int32>(input_data.shape().dim_size(1));
 
     Tensor* output_probabilities = nullptr;
     TensorShape output_shape;
     output_shape.AddDim(num_data);
     output_shape.AddDim(max_nodes_);
 
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, output_shape,
-                                            &output_probabilities));
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape,
+                                                     &output_probabilities));
 
     auto out_probs = output_probabilities->tensor<float, 2>();
     const auto tree_biases = tree_biases_tensor.tensor<float, 1>();
 
     // Iteratively compute the probability of reaching each leaf.
     for (int i = 0; i < num_data; i++) {
-      const Tensor point = input_data.Slice(i, i+1);
+      const Tensor point = input_data.Slice(i, i + 1);
 
       out_probs(i, 0) = 1.0;
 
       for (int j = 0; j < max_nodes_ / 2; j++) {
-        int32 left_child = 2*j + 1;
+        int32 left_child = 2 * j + 1;
         int32 right_child = left_child + 1;
 
         float prob = out_probs(i, j);
-        float left_prob = LeftProbability(point,
-                                          tree_parameters_tensor.Slice(j, j+1),
-                                          tree_biases(j),
-                                          num_features);
+        float left_prob =
+            LeftProbability(point, tree_parameters_tensor.Slice(j, j + 1),
+                            tree_biases(j), num_features);
 
         out_probs(i, left_child) = prob * left_prob;
         out_probs(i, right_child) = prob * (1.0 - left_prob);
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
index 66aa293dc1cb93b82f06d838ad7b0f9c09761585..c9df09bfda44e665ed013da383e1e9a2c665c454 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
@@ -96,10 +96,9 @@ class StochasticHardRoutingFunction : public OpKernel {
   explicit StochasticHardRoutingFunction(OpKernelConstruction* context)
       : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("tree_depth", &tree_depth_));
-    OP_REQUIRES_OK(context, context->GetAttr("random_seed",
-                                             &random_seed_));
+    OP_REQUIRES_OK(context, context->GetAttr("random_seed", &random_seed_));
     single_rand_ = std::unique_ptr<random::PhiloxRandom>(
-          new random::PhiloxRandom(random_seed_));
+        new random::PhiloxRandom(random_seed_));
     rng_ = std::unique_ptr<random::SimplePhilox>(
         new random::SimplePhilox(single_rand_.get()));
   }
@@ -111,20 +110,19 @@ class StochasticHardRoutingFunction : public OpKernel {
     const Tensor& tree_biases_tensor = context->input(2);
 
     if (input_data.shape().dim_size(0) > 0) {
-      OP_REQUIRES(context, input_data.shape().dims() == 2,
-                  errors::InvalidArgument(
-                      "input_data should be two-dimensional"));
+      OP_REQUIRES(
+          context, input_data.shape().dims() == 2,
+          errors::InvalidArgument("input_data should be two-dimensional"));
     }
 
     // Check tensor bounds.
     if (!CheckTensorBounds(context, input_data)) return;
 
-    const int32 num_data = static_cast<int32>(
-        input_data.shape().dim_size(0));
-    const int32 num_features = static_cast<int32>(
-        input_data.shape().dim_size(1));
-    const int32 num_nodes = static_cast<int32>(
-        tree_parameters_tensor.shape().dim_size(0));
+    const int32 num_data = static_cast<int32>(input_data.shape().dim_size(0));
+    const int32 num_features =
+        static_cast<int32>(input_data.shape().dim_size(1));
+    const int32 num_nodes =
+        static_cast<int32>(tree_parameters_tensor.shape().dim_size(0));
 
     Tensor* output_probability = nullptr;
     TensorShape output_probability_shape;
@@ -139,9 +137,8 @@ class StochasticHardRoutingFunction : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, output_probability_shape,
                                             &output_probability));
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(1, output_path_shape,
-                                            &output_path));
+    OP_REQUIRES_OK(
+        context, context->allocate_output(1, output_path_shape, &output_path));
 
     auto out_probability = output_probability->tensor<float, 2>();
     auto out_path = output_path->tensor<int32, 2>();
@@ -150,19 +147,18 @@ class StochasticHardRoutingFunction : public OpKernel {
     // Stochastically traverse the tree to a leaf.
 
     for (int i = 0; i < num_data; i++) {
-      const Tensor point = input_data.Slice(i, i+1);
+      const Tensor point = input_data.Slice(i, i + 1);
 
       int32 node = 0;
       out_probability(i, 0) = 1.0;
       out_path(i, 0) = 0;
       for (int j = 0; j < tree_depth_ - 1; j++) {
-        int32 left_child = 2*node + 1;
+        int32 left_child = 2 * node + 1;
         int32 right_child = left_child + 1;
 
-        float left_prob = LeftProbability(point,
-                                          tree_parameters_tensor.Slice(j, j+1),
-                                          tree_biases(j),
-                                          num_features);
+        float left_prob =
+            LeftProbability(point, tree_parameters_tensor.Slice(j, j + 1),
+                            tree_biases(j), num_features);
 
         if (left_prob < rng_->RandFloat()) {
           CHECK_LT(i, num_data);
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc
index 0b5afe464f4b9608af0feca584aaa799f5980f46..b0d8b832b5437db7a4b3026e80ae99d0391d7f7a 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_gradient_op.cc
@@ -149,14 +149,14 @@ class StochasticHardRoutingGradient : public OpKernel {
     TensorShape output_bias_shape;
     output_bias_shape.AddDim(num_data);
 
-    OP_REQUIRES_OK(context, context->allocate_output(
-        0, output_routing_shape, &output_routing));
-    OP_REQUIRES_OK(context, context->allocate_output(
-        1, output_data_shape, &output_data));
-    OP_REQUIRES_OK(context, context->allocate_output(
-        2, output_parameters_shape, &output_parameters));
-    OP_REQUIRES_OK(context, context->allocate_output(
-        3, output_bias_shape, &output_bias));
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_routing_shape,
+                                                     &output_routing));
+    OP_REQUIRES_OK(
+        context, context->allocate_output(1, output_data_shape, &output_data));
+    OP_REQUIRES_OK(context, context->allocate_output(2, output_parameters_shape,
+                                                     &output_parameters));
+    OP_REQUIRES_OK(
+        context, context->allocate_output(3, output_bias_shape, &output_bias));
 
     tensorforest::Initialize(*output_routing, 0.0);
     tensorforest::Initialize(*output_data, 0.0);
@@ -178,7 +178,7 @@ class StochasticHardRoutingGradient : public OpKernel {
       const Tensor point = input_data.Slice(i, i + 1);
 
       // Traverses the tree from the bottom up.
-      for (int j = tree_depth_-1; j > -1; j--) {
+      for (int j = tree_depth_ - 1; j > -1; j--) {
         int32 node = path(i, j);
 
         CHECK_LT(node, num_nodes);
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc
index 9d5e1400a58cce75c03dfe3e0b5c973c11b89199..25825a78a1498490009fe4ff6bbfc67493727037 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/unpack_path_op.cc
@@ -13,16 +13,6 @@
 // limitations under the License.
 // =============================================================================
 
-#include <stdlib.h>
-#include <time.h>
-#include <algorithm>
-#include <cmath>
-#include <memory>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
 #include "tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h"
 #include "tensorflow/contrib/tensor_forest/kernels/tree_utils.h"
 #include "tensorflow/core/framework/op.h"
@@ -30,6 +20,7 @@
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/gtl/top_n.h"
+#include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/work_sharder.h"
 
@@ -73,8 +64,7 @@ REGISTER_OP("UnpackPath")
 
 class UnpackPath : public OpKernel {
  public:
-  explicit UnpackPath(OpKernelConstruction* context)
-      : OpKernel(context) {}
+  explicit UnpackPath(OpKernelConstruction* context) : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
     VLOG(1) << "unpack start";
@@ -82,10 +72,10 @@ class UnpackPath : public OpKernel {
     const Tensor& path_values_tensor = context->input(1);
 
     const int32 num_data = static_cast<int32>(path_tensor.shape().dim_size(0));
-    const int32 tree_depth = static_cast<int32>(
-        path_tensor.shape().dim_size(1));
+    const int32 tree_depth =
+        static_cast<int32>(path_tensor.shape().dim_size(1));
 
-    const int32 num_nodes = pow(2, tree_depth) - 1;
+    const int32 num_nodes = MathUtil::IPow(2, tree_depth) - 1;
 
     VLOG(1) << "num_data: " << num_data;
     VLOG(1) << "tree_depth: " << tree_depth;
@@ -116,6 +106,6 @@ class UnpackPath : public OpKernel {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("UnpackPath").Device(DEVICE_CPU),
-                        UnpackPath);
+REGISTER_KERNEL_BUILDER(Name("UnpackPath").Device(DEVICE_CPU), UnpackPath);
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.cc
index c091a73c4e48a47bdccea3ec99371faab9c586c2..34388fe1aab72895a805141ec66a71ecf0f42ba4 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.cc
@@ -25,9 +25,7 @@ namespace tensorforest {
 
 using tensorflow::Tensor;
 
-float LeftProbability(const Tensor& point,
-                      const Tensor& weight,
-                      float bias,
+float LeftProbability(const Tensor& point, const Tensor& weight, float bias,
                       int num_features) {
   const auto p = point.unaligned_flat<float>();
   const auto w = weight.unaligned_flat<float>();
@@ -41,11 +39,8 @@ float LeftProbability(const Tensor& point,
   return 1.0 / (1.0 + exp(-dot_product + bias));
 }
 
-float LeftProbabilityK(const Tensor& point,
-                       std::vector<int32> feature_set,
-                       const Tensor& weight,
-                       float bias,
-                       int num_features,
+float LeftProbabilityK(const Tensor& point, std::vector<int32> feature_set,
+                       const Tensor& weight, float bias, int num_features,
                        int k) {
   const auto p = point.unaligned_flat<float>();
   const auto w = weight.unaligned_flat<float>();
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h b/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h
index c5902184f95ea8f97be4a10d1101a38333359d44..69a0143a4e319157a4526ca80fbb3f6472902b31 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/utils.h
@@ -24,16 +24,11 @@ namespace tensorflow {
 namespace tensorforest {
 
 // Returns the probability that the point falls to the left.
-float LeftProbability(const Tensor& point,
-                      const Tensor& weight,
-                      float bias,
+float LeftProbability(const Tensor& point, const Tensor& weight, float bias,
                       int num_features);
 
-float LeftProbabilityK(const Tensor& point,
-                       std::vector<int32> feature_set,
-                       const Tensor& weight,
-                       float bias,
-                       int num_features,
+float LeftProbabilityK(const Tensor& point, std::vector<int32> feature_set,
+                       const Tensor& weight, float bias, int num_features,
                        int k);
 
 // Returns a random set of num_features_to_pick features in the
@@ -49,5 +44,3 @@ void GetFeatureSet(int32 tree_num, int32 node_num, int32 random_seed,
 }  // namespace tensorflow
 
 #endif  // LEARNING_LIB_TENSOR_FOREST_HYBRID_CORE_OPS_UTILS_H_
-
-
diff --git a/tensorflow/contrib/tensor_forest/kernels/data_spec.h b/tensorflow/contrib/tensor_forest/kernels/data_spec.h
index 05590d6992e2fd7eeee8d242561229ab53bb16de..0a3abe56dfc4f611ac8ed0815e4c74a639d2477e 100644
--- a/tensorflow/contrib/tensor_forest/kernels/data_spec.h
+++ b/tensorflow/contrib/tensor_forest/kernels/data_spec.h
@@ -15,8 +15,8 @@
 // This is a surrogate for using a proto, since it doesn't seem to be possible
 // to use protos in a dynamically-loaded/shared-linkage library, which is
 // what is used for custom ops in tensorflow/contrib.
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_DATA_SPEC_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_DATA_SPEC_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_DATA_SPEC_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_DATA_SPEC_H_
 #include <unordered_map>
 
 #include "tensorflow/core/lib/strings/numbers.h"
@@ -138,4 +138,4 @@ class TensorForestDataSpec {
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_DATA_SPEC_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_DATA_SPEC_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc b/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc
index 47b49a379c4b7a17d35b52c1403f67c2f07aeeaf..b21a9179777c21f65435e136aa6082e27fb3b78c 100644
--- a/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/reinterpret_string_to_float_op.cc
@@ -30,15 +30,13 @@ namespace tensorflow {
 
 using tensorforest::CheckTensorBounds;
 
-
 float Convert(const string& in) {
   const std::size_t intval = std::hash<string>()(in);
   return static_cast<float>(intval);
 }
 
-
-void Evaluate(const Tensor& input_data, Tensor output_data,
-              int32 start, int32 end) {
+void Evaluate(const Tensor& input_data, Tensor output_data, int32 start,
+              int32 end) {
   auto out_data = output_data.unaligned_flat<float>();
   const auto in_data = input_data.unaligned_flat<string>();
 
@@ -59,9 +57,8 @@ class ReinterpretStringToFloat : public OpKernel {
     if (!CheckTensorBounds(context, input_data)) return;
 
     Tensor* output_data = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, input_data.shape(),
-                                            &output_data));
+    OP_REQUIRES_OK(
+        context, context->allocate_output(0, input_data.shape(), &output_data));
 
     // Evaluate input data in parallel.
     const int32 num_data = static_cast<int32>(input_data.NumElements());
@@ -73,8 +70,8 @@ class ReinterpretStringToFloat : public OpKernel {
       auto work = [&input_data, output_data, num_data](int64 start, int64 end) {
         CHECK(start <= end);
         CHECK(end <= num_data);
-        Evaluate(input_data, *output_data,
-                 static_cast<int32>(start), static_cast<int32>(end));
+        Evaluate(input_data, *output_data, static_cast<int32>(start),
+                 static_cast<int32>(end));
       };
       Shard(num_threads, worker_threads->workers, num_data, 100, work);
     }
diff --git a/tensorflow/contrib/tensor_forest/kernels/scatter_add_ndim_op.cc b/tensorflow/contrib/tensor_forest/kernels/scatter_add_ndim_op.cc
index dd2a98b08cdb486c98c161390a3a1f81d31e1f4b..60740c2be3703141805c7eae0ac384edf934ab3d 100644
--- a/tensorflow/contrib/tensor_forest/kernels/scatter_add_ndim_op.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/scatter_add_ndim_op.cc
@@ -22,7 +22,6 @@
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/platform/logging.h"
 
-
 namespace tensorflow {
 
 using tensorforest::CheckTensorBounds;
@@ -38,20 +37,19 @@ class ScatterAddNdim : public OpKernel {
 
     if (indices_tensor.shape().dim_size(0) > 0) {
       OP_REQUIRES(context, indices_tensor.shape().dims() == 2,
-                  errors::InvalidArgument(
-                      "indices should be two-dimensional"));
+                  errors::InvalidArgument("indices should be two-dimensional"));
       const int32 delta_dims = deltas_tensor.shape().dims();
       OP_REQUIRES(
           context,
           indices_tensor.shape().dim_size(1) + delta_dims ==
-          input_tensor.shape().dims() + 1,
+              input_tensor.shape().dims() + 1,
           errors::InvalidArgument(
               "Number of indices dimensions should be the same as input "
               "rank."));
       OP_REQUIRES(
           context,
           indices_tensor.shape().dim_size(0) ==
-          deltas_tensor.shape().dim_size(0),
+              deltas_tensor.shape().dim_size(0),
           errors::InvalidArgument(
               "Number of updates should be same as number of indices."));
     } else {
@@ -68,8 +66,8 @@ class ScatterAddNdim : public OpKernel {
     const auto indices = indices_tensor.tensor<int32, 2>();
     const auto deltas = deltas_tensor.unaligned_flat<float>();
 
-    const int32 num_dims = static_cast<int32>(
-        indices_tensor.shape().dim_size(1));
+    const int32 num_dims =
+        static_cast<int32>(indices_tensor.shape().dim_size(1));
 
     // Figure out if indices don't specify a complete position in the
     // input tensor.
@@ -80,10 +78,9 @@ class ScatterAddNdim : public OpKernel {
 
     // Calculate index multipliers.
     std::vector<int32> multipliers;
-    OP_REQUIRES(
-        context, input.size() < std::numeric_limits<int32>::max(),
-        errors::InvalidArgument(
-            "Input must contain less than 2^31 total elements"));
+    OP_REQUIRES(context, input.size() < std::numeric_limits<int32>::max(),
+                errors::InvalidArgument(
+                    "Input must contain less than 2^31 total elements"));
     int32 last_size = static_cast<int32>(input.size());
 
     for (int32 j = 0; j < num_dims; j++) {
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
index 94e12cea5a072f0746e642196d55f3a3b13a16c3..44997ec5d6d5fdb9aab52ab7a50f46a731bfda66 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
@@ -65,8 +65,8 @@ void GetTwoBest(int max, const std::function<float(int)>& score_fn,
 
 float ClassificationSplitScore(
     const Eigen::Tensor<float, 1, Eigen::RowMajor>& splits,
-    const Eigen::Tensor<float, 1, Eigen::RowMajor>& rights,
-    int32 num_classes, int i) {
+    const Eigen::Tensor<float, 1, Eigen::RowMajor>& rights, int32 num_classes,
+    int i) {
   Eigen::array<int, 1> offsets;
   // Class counts are stored with the total in [0], so the length of each
   // count vector is num_classes + 1.
@@ -74,7 +74,7 @@ float ClassificationSplitScore(
   Eigen::array<int, 1> extents;
   extents[0] = num_classes;
   return WeightedGiniImpurity(splits.slice(offsets, extents)) +
-      WeightedGiniImpurity(rights.slice(offsets, extents));
+         WeightedGiniImpurity(rights.slice(offsets, extents));
 }
 
 void GetTwoBestClassification(const Tensor& total_counts,
@@ -90,29 +90,28 @@ void GetTwoBestClassification(const Tensor& total_counts,
   // in seg faults, so we have to go with flat views of these tensors.  However,
   // it is still pretty efficient because we put off evaluation until the
   // score is actually returned.
-  const auto tc = total_counts.Slice(
-      accumulator, accumulator + 1).unaligned_flat<float>();
+  const auto tc =
+      total_counts.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
 
   // TODO(gilberth): See if we can delay evaluation here by templating the
   // arguments to ClassificationSplitScore.
-  const Eigen::Tensor<float, 1, Eigen::RowMajor> splits = split_counts.Slice(
-      accumulator, accumulator + 1).unaligned_flat<float>();
+  const Eigen::Tensor<float, 1, Eigen::RowMajor> splits =
+      split_counts.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
   Eigen::array<int, 1> bcast;
   bcast[0] = num_splits;
   const Eigen::Tensor<float, 1, Eigen::RowMajor> rights =
       tc.broadcast(bcast) - splits;
 
-  std::function<float(int)> score_fn = std::bind(
-      ClassificationSplitScore, splits, rights, num_classes,
-      std::placeholders::_1);
+  std::function<float(int)> score_fn =
+      std::bind(ClassificationSplitScore, splits, rights, num_classes,
+                std::placeholders::_1);
 
   GetTwoBest(num_splits, score_fn, best_score, best_index, second_best_score,
              second_best_index);
 }
 
-int32 BestFeatureClassification(
-    const Tensor& total_counts, const Tensor& split_counts,
-    int32 accumulator) {
+int32 BestFeatureClassification(const Tensor& total_counts,
+                                const Tensor& split_counts, int32 accumulator) {
   float best_score;
   float second_best_score;
   int best_feature_index;
@@ -130,8 +129,7 @@ float RegressionSplitScore(
     const Eigen::Tensor<float, 1, Eigen::RowMajor>& splits_square,
     const Eigen::Tensor<float, 1, Eigen::RowMajor>& right_sums,
     const Eigen::Tensor<float, 1, Eigen::RowMajor>& right_squares,
-    int32 accumulator,
-    int32 num_regression_dims, int i) {
+    int32 accumulator, int32 num_regression_dims, int i) {
   Eigen::array<int, 1> offsets = {i * num_regression_dims + 1};
   Eigen::array<int, 1> extents = {num_regression_dims - 1};
   float left_count = splits_count_accessor(accumulator, i, 0);
@@ -141,15 +139,15 @@ float RegressionSplitScore(
 
   // Guard against divide-by-zero.
   if (left_count > 0) {
-    score += WeightedVariance(
-        splits_sum.slice(offsets, extents),
-        splits_square.slice(offsets, extents), left_count);
+    score +=
+        WeightedVariance(splits_sum.slice(offsets, extents),
+                         splits_square.slice(offsets, extents), left_count);
   }
 
   if (right_count > 0) {
-    score += WeightedVariance(right_sums.slice(offsets, extents),
-                              right_squares.slice(offsets, extents),
-                              right_count);
+    score +=
+        WeightedVariance(right_sums.slice(offsets, extents),
+                         right_squares.slice(offsets, extents), right_count);
   }
   return score;
 }
@@ -159,20 +157,20 @@ void GetTwoBestRegression(const Tensor& total_sums, const Tensor& total_squares,
                           int32 accumulator, float* best_score, int* best_index,
                           float* second_best_score, int* second_best_index) {
   const int32 num_splits = static_cast<int32>(split_sums.shape().dim_size(1));
-  const int32 num_regression_dims = static_cast<int32>(
-      split_sums.shape().dim_size(2));
+  const int32 num_regression_dims =
+      static_cast<int32>(split_sums.shape().dim_size(2));
   // Ideally, Eigen::Tensor::chip would be best to use here but it results
   // in seg faults, so we have to go with flat views of these tensors.  However,
   // it is still pretty efficient because we put off evaluation until the
   // score is actually returned.
-  const auto tc_sum = total_sums.Slice(
-      accumulator, accumulator + 1).unaligned_flat<float>();
-  const auto tc_square = total_squares.Slice(
-      accumulator, accumulator + 1).unaligned_flat<float>();
-  const auto splits_sum = split_sums.Slice(
-      accumulator, accumulator + 1).unaligned_flat<float>();
-  const auto splits_square = split_squares.Slice(
-      accumulator, accumulator + 1).unaligned_flat<float>();
+  const auto tc_sum =
+      total_sums.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
+  const auto tc_square =
+      total_squares.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
+  const auto splits_sum =
+      split_sums.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
+  const auto splits_square =
+      split_squares.Slice(accumulator, accumulator + 1).unaligned_flat<float>();
   // Eigen is infuriating to work with, usually resulting in all kinds of
   // unhelpful compiler errors when trying something that seems sane.  This
   // helps us do a simple thing like access the first element (the counts)
@@ -193,10 +191,10 @@ void GetTwoBestRegression(const Tensor& total_sums, const Tensor& total_squares,
              best_score, best_index, second_best_score, second_best_index);
 }
 
-int32 BestFeatureRegression(
-    const Tensor& total_sums, const Tensor& total_squares,
-    const Tensor& split_sums, const Tensor& split_squares,
-    int32 accumulator) {
+int32 BestFeatureRegression(const Tensor& total_sums,
+                            const Tensor& total_squares,
+                            const Tensor& split_sums,
+                            const Tensor& split_squares, int32 accumulator) {
   float best_score;
   float second_best_score;
   int best_feature_index;
@@ -207,10 +205,11 @@ int32 BestFeatureRegression(
   return best_feature_index;
 }
 
-bool BestSplitDominatesRegression(
-    const Tensor& total_sums, const Tensor& total_squares,
-    const Tensor& split_sums, const Tensor& split_squares,
-    int32 accumulator) {
+bool BestSplitDominatesRegression(const Tensor& total_sums,
+                                  const Tensor& total_squares,
+                                  const Tensor& split_sums,
+                                  const Tensor& split_squares,
+                                  int32 accumulator) {
   // TODO(thomaswc): Implement this, probably as part of v3.
   return false;
 }
@@ -599,7 +598,6 @@ bool Decide(float value, float bias, DataColumnTypes type) {
   }
 }
 
-
 void GetParentWeightedMean(float leaf_sum, const float* leaf_data,
                            float parent_sum, const float* parent_data,
                            float valid_leaf_threshold, int num_outputs,
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
index 35f9fb7eaf4d73e98293f1ea6a4b45b71212a92c..edbac6700677633cbd4d41f7040b4859ca599c4a 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_TREE_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_TREE_UTILS_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_TREE_UTILS_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_TREE_UTILS_H_
 
 #include <limits>
 
@@ -45,13 +45,10 @@ const int32 LEAF_NODE = -1;
 const int32 FREE_NODE = -2;
 
 // Used to indicate column types, e.g. categorical vs. float
-enum DataColumnTypes {
-  kDataFloat = 0,
-  kDataCategorical = 1
-};
+enum DataColumnTypes { kDataFloat = 0, kDataCategorical = 1 };
 
 // Calculates the sum of a tensor.
-template<typename T>
+template <typename T>
 T Sum(Tensor counts) {
   Eigen::Tensor<T, 0, Eigen::RowMajor> count_sum =
       counts.unaligned_flat<T>().sum();
@@ -97,7 +94,7 @@ float WeightedGiniImpurity(const T& counts) {
   return RawWeightedGiniImpurity(smoothed);
 }
 
-template<typename T1, typename T2>
+template <typename T1, typename T2>
 float WeightedVariance(const T1& sums, const T2& squares, float count) {
   const auto e_x = sums / count;
   const auto e_x2 = squares / count;
@@ -120,10 +117,11 @@ int32 BestFeatureRegression(const Tensor& total_sums,
 
 // Returns true if the best split's variance is sufficiently smaller than
 // that of the next best split.
-bool BestSplitDominatesRegression(
-    const Tensor& total_sums, const Tensor& total_squares,
-    const Tensor& split_sums, const Tensor& split_squares,
-    int32 accumulator);
+bool BestSplitDominatesRegression(const Tensor& total_sums,
+                                  const Tensor& total_squares,
+                                  const Tensor& split_sums,
+                                  const Tensor& split_squares,
+                                  int32 accumulator);
 
 // Performs booststrap_samples bootstrap samples of the best split's class
 // counts and the second best splits's class counts, and returns true if at
@@ -178,10 +176,8 @@ bool DecideNode(const GetFeatureFnType& get_dense,
 // isn't present in sparse_input_indices.  sparse_input_indices is assumed
 // to be sorted.
 template <typename T1, typename T2>
-float FindSparseValue(
-    const T1& sparse_input_indices,
-    const T2& sparse_input_values,
-    int32 i, int32 j) {
+float FindSparseValue(const T1& sparse_input_indices,
+                      const T2& sparse_input_values, int32 i, int32 j) {
   int32 low = 0;
   int32 high = sparse_input_values.dimension(0);
   while (low < high) {
@@ -273,7 +269,6 @@ int32 GetNumSparseFeatures(const T1& indices, int32 input_index,
 // categorical data, it is value != bias.
 bool Decide(float value, float bias, DataColumnTypes type = kDataFloat);
 
-
 // Returns true if all the splits are initialized. Since they get initialized
 // in order, we can simply infer this from the last split.
 // This should only be called for a single allocator's candidate features
@@ -307,4 +302,4 @@ void GetParentWeightedMean(float leaf_sum, const float* leaf_data,
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_TREE_UTILS_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_CORE_OPS_TREE_UTILS_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils_test.cc b/tensorflow/contrib/tensor_forest/kernels/tree_utils_test.cc
index 7485a695dfba93fd3f57c19096b205b10e2fa8b5..08553545502c21eb8f2d68bfd342f8ba7c081adb 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils_test.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils_test.cc
@@ -44,11 +44,13 @@ TEST(TestWeightedVariance, Basic) {
   Tensor squares = test::AsTensor<float>({29, 12}, {2});
 
   EXPECT_FLOAT_EQ(WeightedVariance(sums.unaligned_flat<float>(),
-                                   squares.unaligned_flat<float>(), 3), 2.0);
+                                   squares.unaligned_flat<float>(), 3),
+                  2.0);
 
   Tensor zero = test::AsTensor<float>({0}, {1});
   EXPECT_FLOAT_EQ(WeightedVariance(zero.unaligned_flat<float>(),
-                                   zero.unaligned_flat<float>(), 1), 0);
+                                   zero.unaligned_flat<float>(), 1),
+                  0);
 }
 
 TEST(TestInitialize, Basic) {
@@ -94,17 +96,16 @@ TEST(BestFeatureClassification, Basic) {
   const int32 num_accumulators = 4;
   const int32 num_splits = 3;
   const int32 num_classes = 4;
-  Tensor totals = test::AsTensor<float>({1, 5, 6, 7,
-                                         0, 0, 0, 0,
-                                         30, 10, 10, 10,      // this one
-                                         -1, -1, -1, -1},
-                                        {num_accumulators, num_classes});
-  Tensor splits = test::AsTensor<float>(
-      {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       30, 10, 10, 10, 10, 0, 0, 10, 19, 5, 6, 8,  // this one
-       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
-      {num_accumulators, num_splits, num_classes});
+  Tensor totals = test::AsTensor<float>(
+      {1, 5, 6, 7, 0, 0, 0, 0, 30, 10, 10, 10,  // this one
+       -1, -1, -1, -1},
+      {num_accumulators, num_classes});
+  Tensor splits =
+      test::AsTensor<float>({1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  0,
+                             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  30, 10,
+                             10, 10, 10, 0,  0,  10, 19, 5,  6,  8,  // this one
+                             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+                            {num_accumulators, num_splits, num_classes});
 
   EXPECT_EQ(BestFeatureClassification(totals, splits, 2), 1);
 }
@@ -114,17 +115,16 @@ TEST(BestFeatureClassification, NoWinner) {
   const int32 num_splits = 3;
   const int32 num_classes = 4;
   // When counts are all the same, the most reasonable thing to do is pick 0.
-  Tensor totals = test::AsTensor<float>({1, 5, 6, 7,
-                                         0, 0, 0, 0,
-                                         18, 6, 6, 6,      // this one
-                                         -1, -1, -1, -1},
-                                        {num_accumulators, num_classes});
-  Tensor splits = test::AsTensor<float>(
-      {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       9, 3, 3, 3, 9, 3, 3, 3, 9, 3, 3, 3,     // this one
-       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
-      {num_accumulators, num_splits, num_classes});
+  Tensor totals =
+      test::AsTensor<float>({1, 5, 6, 7, 0, 0, 0, 0, 18, 6, 6, 6,  // this one
+                             -1, -1, -1, -1},
+                            {num_accumulators, num_classes});
+  Tensor splits =
+      test::AsTensor<float>({1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4, 0,
+                             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  9, 3,
+                             3,  3,  9,  3,  3,  3,  9,  3,  3,  3,  // this one
+                             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+                            {num_accumulators, num_splits, num_classes});
 
   EXPECT_EQ(BestFeatureClassification(totals, splits, 2), 0);
 }
@@ -133,36 +133,34 @@ TEST(BestFeatureRegression, Basic) {
   const int32 num_accumulators = 4;
   const int32 num_splits = 3;
   const int32 num_classes = 4;
-  Tensor total_sums = test::AsTensor<float>(
-      {1, 5, 6, 7,
-       0, 0, 0, 0,
-       10, 8, 6, 9,      // this one
-       -1, -1, -1, -1},
-      {num_accumulators, num_classes});
+  Tensor total_sums =
+      test::AsTensor<float>({1, 5, 6, 7, 0, 0, 0, 0, 10, 8, 6, 9,  // this one
+                             -1, -1, -1, -1},
+                            {num_accumulators, num_classes});
   Tensor total_squares = test::AsTensor<float>(
-      {1, 5, 6, 7,
-       0, 0, 0, 0,
-       100, 50, 40, 45,      // this one
+      {1, 5, 6, 7, 0, 0, 0, 0, 100, 50, 40, 45,  // this one
        -1, -1, -1, -1},
       {num_accumulators, num_classes});
 
-  Tensor split_sums = test::AsTensor<float>(
-      {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       10, 8, 6, 9, 9, 8, 5, 9, 0, 0, 0, 0,      // this one
-       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
-      {num_accumulators, num_splits, num_classes});
+  Tensor split_sums =
+      test::AsTensor<float>({1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  0,
+                             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  10, 8,
+                             6,  9,  9,  8,  5,  9,  0,  0,  0,  0,  // this one
+                             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+                            {num_accumulators, num_splits, num_classes});
 
   // lower the variance by lowering one of the squares just a little.
-  Tensor split_squares = test::AsTensor<float>(
-      {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       100, 50, 40, 45, 100, 50, 40, 43, 0, 0, 0, 0,    // this one
-       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
-      {num_accumulators, num_splits, num_classes});
+  Tensor split_squares =
+      test::AsTensor<float>(
+          {1,   2,  3,  4,  1,   2,  3,  4,  1,  2,  3,  4,
+           0,   0,  0,  0,  0,   0,  0,  0,  0,  0,  0,  0,
+           100, 50, 40, 45, 100, 50, 40, 43, 0,  0,  0,  0,  // this one
+           -1,  -1, -1, -1, -1,  -1, -1, -1, -1, -1, -1, -1},
+          {num_accumulators, num_splits, num_classes});
 
   EXPECT_EQ(BestFeatureRegression(total_sums, total_squares, split_sums,
-                                  split_squares, 2), 1);
+                                  split_squares, 2),
+            1);
 }
 
 TEST(BestFeatureRegression, NoWinner) {
@@ -170,37 +168,33 @@ TEST(BestFeatureRegression, NoWinner) {
   const int32 num_splits = 3;
   const int32 num_classes = 4;
   // when counts are all the same, the most reasonable thing to do is pick 0.
-  Tensor total_sums = test::AsTensor<float>(
-      {1, 5, 6, 7,
-       0, 0, 0, 0,
-       10, 8, 6, 9,      // this one
-       -1, -1, -1, -1},
-      {num_accumulators, num_classes});
+  Tensor total_sums =
+      test::AsTensor<float>({1, 5, 6, 7, 0, 0, 0, 0, 10, 8, 6, 9,  // this one
+                             -1, -1, -1, -1},
+                            {num_accumulators, num_classes});
   Tensor total_squares = test::AsTensor<float>(
-      {1, 5, 6, 7,
-       0, 0, 0, 0,
-       100, 50, 40, 45,      // this one
+      {1, 5, 6, 7, 0, 0, 0, 0, 100, 50, 40, 45,  // this one
        -1, -1, -1, -1},
       {num_accumulators, num_classes});
 
-  Tensor split_sums = test::AsTensor<float>(
-      {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       10, 8, 6, 9, 10, 8, 6, 9, 10, 8, 6, 9,      // this one
-       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
-      {num_accumulators, num_splits, num_classes});
+  Tensor split_sums =
+      test::AsTensor<float>({1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  0,
+                             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  10, 8,
+                             6,  9,  10, 8,  6,  9,  10, 8,  6,  9,  // this one
+                             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+                            {num_accumulators, num_splits, num_classes});
 
   Tensor split_squares = test::AsTensor<float>(
-      {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
-       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-       100, 50, 40, 45, 100, 50, 40, 45, 100, 50, 40, 45,    // this one
-       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+      {1,   2,  3,  4,  1,   2,  3,  4,  1,   2,  3,  4,
+       0,   0,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,
+       100, 50, 40, 45, 100, 50, 40, 45, 100, 50, 40, 45,  // this one
+       -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1},
       {num_accumulators, num_splits, num_classes});
 
   EXPECT_EQ(BestFeatureRegression(total_sums, total_squares, split_sums,
-                                  split_squares, 2), 0);
+                                  split_squares, 2),
+            0);
 }
 
 }  // namespace tensorforest
 }  // namespace tensorflow
-
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/BUILD b/tensorflow/contrib/tensor_forest/kernels/v4/BUILD
index b7876e1df6c77d1fa3fa31abb37fc16d06540f15..794b76d8583c3608d540d34a5aaf1d1a799f35e3 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/BUILD
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/BUILD
@@ -302,6 +302,7 @@ cc_library(
             "//tensorflow/contrib/tensor_forest/proto:fertile_stats_proto_cc",
         ],
         [
+            "//third_party/eigen3",
             "//tensorflow/contrib/decision_trees/proto:generic_tree_model_cc_headers_only",
             "//tensorflow/contrib/tensor_forest/proto:fertile_stats_proto_cc_headers_only",
         ],
@@ -322,6 +323,7 @@ cc_library(
     srcs = ["params.cc"],
     hdrs = ["params.h"],
     deps = [
+        "//third_party/eigen3",
         "//tensorflow/core:framework_headers_lib",
     ] + if_static(
         [
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc b/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc
index 81e2a1b2a1b720574210e376fa786923367794a6..f4a7058ddb8bfdd6393a9369006aabc29d058d3b 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.cc
@@ -14,8 +14,8 @@
 // =============================================================================
 #include "tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.h"
 
-#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/env.h"
 
 namespace tensorflow {
@@ -58,8 +58,7 @@ CandidateGraphRunner::CandidateGraphRunner(
   // Features don't change, store them in a tensor.
   const auto& oblique = split.inequality_left_child_test().oblique();
   const int32 feat_size = oblique.features_size();
-  features_.reset(
-      new Tensor(tensorflow::DT_INT32, TensorShape({feat_size})));
+  features_.reset(new Tensor(tensorflow::DT_INT32, TensorShape({feat_size})));
   auto feat = features_->flat<int32>();
   int i = 0;
   for (const auto& id : oblique.features()) {
@@ -67,10 +66,10 @@ CandidateGraphRunner::CandidateGraphRunner(
   }
 }
 
-void CandidateGraphRunner::RunOp(
-    const string& name, const TensorNameValueList& inputs,
-    const std::vector<string>& output_tensor_names,
-    std::vector<Tensor>* outputs) {
+void CandidateGraphRunner::RunOp(const string& name,
+                                 const TensorNameValueList& inputs,
+                                 const std::vector<string>& output_tensor_names,
+                                 std::vector<Tensor>* outputs) {
   std::vector<string> op_name;
   if (name != kNoOp) {
     op_name.push_back(name);
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.h b/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.h
index 4bd1f06c72945f73e50301c337692e0b510d3693..2e7368dc12c74b9dc44b72394668bf2de71f2f90 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/candidate_graph_runner.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_CANDIDATE_GRAPH_RUNNER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_CANDIDATE_GRAPH_RUNNER_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_CANDIDATE_GRAPH_RUNNER_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_CANDIDATE_GRAPH_RUNNER_H_
 #include <string>
 #include <vector>
 
@@ -70,4 +70,4 @@ class CandidateGraphRunner {
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_CANDIDATE_GRAPH_RUNNER_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_CANDIDATE_GRAPH_RUNNER_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
index bf88216d663cc9b69746a93379124bf1d9a30df9..328af28725af016e90b30ae2d303ffba15c81c1f 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_TREE_RESOURCE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_TREE_RESOURCE_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_TREE_RESOURCE_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_TREE_RESOURCE_H_
 
 #include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
 #include "tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h"
@@ -26,7 +26,6 @@
 namespace tensorflow {
 namespace tensorforest {
 
-
 // Keep a tree ensemble in memory for efficient evaluation and mutation.
 class DecisionTreeResource : public ResourceBase {
  public:
@@ -35,15 +34,12 @@ class DecisionTreeResource : public ResourceBase {
 
   string DebugString() override {
     return strings::StrCat("DecisionTree[size=",
-                           decision_tree_->decision_tree().nodes_size(),
-                           "]");
+                           decision_tree_->decision_tree().nodes_size(), "]");
   }
 
   void MaybeInitialize();
 
-  const decision_trees::Model& decision_tree() const {
-    return *decision_tree_;
-  }
+  const decision_trees::Model& decision_tree() const { return *decision_tree_; }
 
   decision_trees::Model* mutable_decision_tree() {
     return decision_tree_.get();
@@ -59,9 +55,7 @@ class DecisionTreeResource : public ResourceBase {
 
   // Resets the resource and frees the proto.
   // Caller needs to hold the mutex lock while calling this.
-  void Reset() {
-    decision_tree_.reset(new decision_trees::Model());
-  }
+  void Reset() { decision_tree_.reset(new decision_trees::Model()); }
 
   mutex* get_mutex() { return &mu_; }
 
@@ -84,8 +78,7 @@ class DecisionTreeResource : public ResourceBase {
   std::vector<std::unique_ptr<DecisionNodeEvaluator>> node_evaluators_;
 };
 
-
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_TREE_RESOURCE_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_TREE_RESOURCE_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
index 3f03c2d05bb1090fa75f4b6e7ad4f00caaea61a4..bf2b2aaa3c8f433ab4fc145217857112f7a0a579 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_NODE_EVALUATOR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_NODE_EVALUATOR_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_NODE_EVALUATOR_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_NODE_EVALUATOR_H_
 
 #include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
 #include "tensorflow/contrib/decision_trees/proto/generic_tree_model_extensions.pb.h"
@@ -22,7 +22,6 @@
 namespace tensorflow {
 namespace tensorforest {
 
-
 // Base class for evaluators of decision nodes that effectively copy proto
 // contents into C++ structures for faster execution.
 class DecisionNodeEvaluator {
@@ -104,4 +103,4 @@ struct CandidateEvalatorCollection {
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_NODE_EVALUATOR_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_DECISION_NODE_EVALUATOR_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc
index 5c49b87443e7b1f4ef532256ae2efdc9fa985d8a..af5cf72a3c0bea0eef45c3446acf52ff389c6751 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator_test.cc
@@ -20,11 +20,11 @@
 namespace tensorflow {
 namespace {
 
+using tensorflow::decision_trees::InequalityTest;
+using tensorflow::decision_trees::MatchingValuesTest;
 using tensorflow::tensorforest::InequalityDecisionNodeEvaluator;
 using tensorflow::tensorforest::MatchingValuesDecisionNodeEvaluator;
 using tensorflow::tensorforest::ObliqueInequalityDecisionNodeEvaluator;
-using tensorflow::decision_trees::InequalityTest;
-using tensorflow::decision_trees::MatchingValuesTest;
 
 TEST(InequalityDecisionNodeEvaluatorTest, TestLessOrEqual) {
   InequalityTest test;
@@ -124,4 +124,3 @@ TEST(ObliqueDecisionNodeEvaluatorTest, Basic) {
 
 }  // namespace
 }  // namespace tensorflow
-
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
index dacf033d99018d47787b644b12d3181780df7113..eea0be27caf0a022ba7acaacd359c75a2df4eedb 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_FERTILE_STATS_RESOURCE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_FERTILE_STATS_RESOURCE_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_FERTILE_STATS_RESOURCE_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_FERTILE_STATS_RESOURCE_H_
 
 #include <vector>
 
@@ -40,9 +40,7 @@ class FertileStatsResource : public ResourceBase {
     model_op_ = LeafModelOperatorFactory::CreateLeafModelOperator(params_);
   }
 
-  string DebugString() override {
-    return "FertileStats";
-  }
+  string DebugString() override { return "FertileStats"; }
 
   void ExtractFromProto(const FertileStats& stats);
 
@@ -50,8 +48,7 @@ class FertileStatsResource : public ResourceBase {
 
   // Resets the resource and frees the proto.
   // Caller needs to hold the mutex lock while calling this.
-  void Reset() {
-  }
+  void Reset() {}
 
   // Reset the stats for a node, but leave the leaf_stats intact.
   void ResetSplitStats(int32 node_id, int32 depth) {
@@ -84,7 +81,6 @@ class FertileStatsResource : public ResourceBase {
   // was found.
   bool BestSplit(int32 node_id, SplitCandidate* best, int32* depth);
 
-
  private:
   mutex mu_;
   std::shared_ptr<LeafModelOperator> model_op_;
@@ -94,8 +90,7 @@ class FertileStatsResource : public ResourceBase {
   void AllocateNode(int32 node_id, int32 depth);
 };
 
-
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_FERTILE_STATS_RESOURCE_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_FERTILE_STATS_RESOURCE_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/graph_collection_operator.h b/tensorflow/contrib/tensor_forest/kernels/v4/graph_collection_operator.h
index 2ae3a79b3dd69b3fd3d31a055589b2edc63afa3c..4ae48179afc8452e6a3ec61dede16b9941482bcc 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/graph_collection_operator.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/graph_collection_operator.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GRAPH_COLLECTION_OPERATOR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GRAPH_COLLECTION_OPERATOR_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GRAPH_COLLECTION_OPERATOR_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GRAPH_COLLECTION_OPERATOR_H_
 
 #include <vector>
 #include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
@@ -78,4 +78,4 @@ class GraphRunnerSplitCollectionOperator : public SplitCollectionOperator {
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GRAPH_COLLECTION_OPERATOR_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GRAPH_COLLECTION_OPERATOR_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc
index 3ce630e3a9691b87ad291a9f29616f741953dd84..da600d34eacdf27514709240723e5bb730cfe7f0 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.cc
@@ -20,7 +20,6 @@
 #include "tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.h"
 #include "tensorflow/core/lib/random/distribution_sampler.h"
 
-
 namespace tensorflow {
 namespace tensorforest {
 
@@ -454,14 +453,14 @@ void DenseClassificationGrowStats::PackToProto(FertileSlot* slot) const {
     class_stats->add_value()->set_float_value(total_counts_[i]);
   }
 
-  for (int split_num = 0;  split_num < num_splits(); ++split_num) {
+  for (int split_num = 0; split_num < num_splits(); ++split_num) {
     auto* cand = slot->add_candidates();
     *cand->mutable_split() = splits_[split_num];
     auto* left_stats = cand->mutable_left_stats()
                            ->mutable_classification()
                            ->mutable_dense_counts();
     for (int i = 0; i < num_outputs_; ++i) {
-       left_stats->add_value()->set_float_value(left_count(split_num, i));
+      left_stats->add_value()->set_float_value(left_count(split_num, i));
     }
   }
 }
@@ -546,7 +545,7 @@ void SparseClassificationGrowStats::PackToProto(FertileSlot* slot) const {
     (*class_stats)[entry.first] = val;
   }
 
-  for (int split_num = 0;  split_num < num_splits(); ++split_num) {
+  for (int split_num = 0; split_num < num_splits(); ++split_num) {
     auto* cand = slot->add_candidates();
     *cand->mutable_split() = splits_[split_num];
     auto* left_stats = cand->mutable_left_stats()
@@ -561,8 +560,8 @@ void SparseClassificationGrowStats::PackToProto(FertileSlot* slot) const {
   }
 }
 
-float SparseClassificationGrowStats::GiniScore(
-    int split, float* left_sum, float* right_sum) const {
+float SparseClassificationGrowStats::GiniScore(int split, float* left_sum,
+                                               float* right_sum) const {
   float left_square = 0, right_square = 0;
   *left_sum = 0;
   *right_sum = 0;
@@ -844,12 +843,11 @@ void LeastSquaresRegressionGrowStats::PackToProto(FertileSlot* slot) const {
     total_squares->add_value()->set_float_value(total_sum_squares_[i]);
   }
 
-  for (int split_num = 0;  split_num < num_splits(); ++split_num) {
+  for (int split_num = 0; split_num < num_splits(); ++split_num) {
     auto* cand = slot->add_candidates();
     *cand->mutable_split() = splits_[split_num];
-    auto* sums = cand->mutable_left_stats()
-                           ->mutable_regression()
-                           ->mutable_mean_output();
+    auto* sums =
+        cand->mutable_left_stats()->mutable_regression()->mutable_mean_output();
     auto* squares = cand->mutable_left_stats()
                         ->mutable_regression()
                         ->mutable_mean_output_squares();
@@ -891,20 +889,17 @@ float LeastSquaresRegressionGrowStats::SplitVariance(int split) const {
   float total_variance = 0;
   for (int i = 0; i < params_.num_outputs(); ++i) {
     // Left side
-    const float le_x =
-        left_sum(split, i) / left_counts_[split];
+    const float le_x = left_sum(split, i) / left_counts_[split];
 
-    const float le_x2 =
-        left_square(split, i) / left_counts_[split];
+    const float le_x2 = left_square(split, i) / left_counts_[split];
     total_variance += le_x2 - le_x * le_x;
 
     // Right side
     const float re_x = (total_sum_[i] - left_sum(split, i)) /
                        (weight_sum_ - left_counts_[split]);
 
-    const float re_x2 =
-        (total_sum_squares_[i] - left_square(split, i)) /
-        (weight_sum_ - left_counts_[split]);
+    const float re_x2 = (total_sum_squares_[i] - left_square(split, i)) /
+                        (weight_sum_ - left_counts_[split]);
     total_variance += re_x2 - re_x * re_x;
   }
   return total_variance;
@@ -937,8 +932,7 @@ bool LeastSquaresRegressionGrowStats::BestSplit(SplitCandidate* best) const {
   left->set_weight_sum(left_counts_[best_index]);
   auto* left_output_sum = left_reg_stats->mutable_mean_output();
   for (int i = 0; i < num_outputs; ++i) {
-    left_output_sum->add_value()->set_float_value(
-        left_sum(best_index, i));
+    left_output_sum->add_value()->set_float_value(left_sum(best_index, i));
   }
 
   // Right
@@ -947,8 +941,8 @@ bool LeastSquaresRegressionGrowStats::BestSplit(SplitCandidate* best) const {
   right->set_weight_sum(weight_sum_ - left_counts_[best_index]);
   auto* right_output_sum = right_reg_stats->mutable_mean_output();
   for (int i = 0; i < num_outputs; ++i) {
-    right_output_sum->add_value()->set_float_value(
-        total_sum_[i] - left_sum(best_index, i));
+    right_output_sum->add_value()->set_float_value(total_sum_[i] -
+                                                   left_sum(best_index, i));
   }
   return true;
 }
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
index 3e41ab50b9d78943db8ee58aab85a8c7541e2320..04e6b0a735320dd024e326a94ef910593a326245 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GROW_STATS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GROW_STATS_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GROW_STATS_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GROW_STATS_H_
 #include <unordered_map>
 #include <vector>
 
@@ -73,21 +73,15 @@ class GrowStats {
       const InputTarget* target, int example) {}
   void RemoveSplit(int split_num);
 
-  int num_splits() const {
-    return splits_.size();
-  }
+  int num_splits() const { return splits_.size(); }
 
-  float weight_sum() const {
-    return weight_sum_;
-  }
+  float weight_sum() const { return weight_sum_; }
 
   virtual bool IsInitialized() const {
     return weight_sum_ > 0 || splits_.size() == num_splits_to_consider_;
   }
 
-  int32 depth() const {
-    return depth_;
-  }
+  int32 depth() const { return depth_; }
 
  protected:
   GrowStats(const TensorForestParams& params, int32 depth);
@@ -206,8 +200,8 @@ class ClassificationStats : public GrowStats {
   virtual float left_count(int split, int class_num) const = 0;
   virtual float right_count(int split, int class_num) const = 0;
 
-  virtual void ClassificationAddLeftExample(
-      int split, int64 int_label, float weight) = 0;
+  virtual void ClassificationAddLeftExample(int split, int64 int_label,
+                                            float weight) = 0;
   virtual void ClassificationAddRightExample(int split, int64 int_label,
                                              float weight) {
     // Does nothing by default, but sub-classes can override.
@@ -316,7 +310,7 @@ class DenseClassificationGrowStats : public ClassificationStats {
   void PackToProto(FertileSlot* slot) const override;
 
   void InitLeafClassStats(int best_split_index, LeafStat* left_stats,
-                          LeafStat* right_stats) const;
+                          LeafStat* right_stats) const override;
 
  protected:
   void ClassificationAddSplitStats() override {
@@ -375,15 +369,13 @@ class SparseClassificationGrowStats : public ClassificationStats {
   SparseClassificationGrowStats(const TensorForestParams& params, int32 depth)
       : ClassificationStats(params, depth) {}
 
-  void Initialize() override {
-    Clear();
-  }
+  void Initialize() override { Clear(); }
 
   void ExtractFromProto(const FertileSlot& slot) override;
   void PackToProto(FertileSlot* slot) const override;
 
   void InitLeafClassStats(int best_split_index, LeafStat* left_stats,
-                          LeafStat* right_stats) const;
+                          LeafStat* right_stats) const override;
 
  protected:
   void ClassificationAddSplitStats() override {
@@ -562,9 +554,9 @@ class LeastSquaresRegressionGrowStats : public GrowStats {
   }
   void RemoveSplitStats(int split_num) override {
     left_sums_.erase(left_sums_.begin() + num_outputs_ * split_num,
-                       left_sums_.begin() + num_outputs_ * (split_num + 1));
+                     left_sums_.begin() + num_outputs_ * (split_num + 1));
     left_squares_.erase(left_squares_.begin() + num_outputs_ * split_num,
-                       left_squares_.begin() + num_outputs_ * (split_num + 1));
+                        left_squares_.begin() + num_outputs_ * (split_num + 1));
     left_counts_.erase(left_counts_.begin() + split_num,
                        left_counts_.begin() + (split_num + 1));
   }
@@ -605,8 +597,7 @@ class LeastSquaresRegressionGrowStats : public GrowStats {
   std::vector<int64> left_counts_;
 };
 
-
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GROW_STATS_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_GROW_STATS_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc
index ceb58d2ead5c2f148c96d9cb9532a73688593d33..26e989928e00de1b2ae1646abf216adfbec2be4f 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/grow_stats_test.cc
@@ -24,21 +24,21 @@
 namespace tensorflow {
 namespace {
 
-using tensorflow::tensorforest::GrowStats;
-using tensorflow::tensorforest::TestableInputTarget;
-using tensorflow::tensorforest::FertileSlot;
+using tensorflow::decision_trees::BinaryNode;
+using tensorflow::decision_trees::FeatureId;
+using tensorflow::decision_trees::InequalityTest;
 using tensorflow::tensorforest::DenseClassificationGrowStats;
-using tensorflow::tensorforest::SparseClassificationGrowStats;
+using tensorflow::tensorforest::FertileSlot;
 using tensorflow::tensorforest::FixedSizeClassStats;
 using tensorflow::tensorforest::FixedSizeSparseClassificationGrowStats;
+using tensorflow::tensorforest::GrowStats;
 using tensorflow::tensorforest::LeastSquaresRegressionGrowStats;
-using tensorflow::tensorforest::TensorForestParams;
+using tensorflow::tensorforest::SparseClassificationGrowStats;
 using tensorflow::tensorforest::SPLIT_FINISH_BASIC;
 using tensorflow::tensorforest::SPLIT_FINISH_DOMINATE_HOEFFDING;
 using tensorflow::tensorforest::SPLIT_PRUNE_HOEFFDING;
-using tensorflow::decision_trees::BinaryNode;
-using tensorflow::decision_trees::InequalityTest;
-using tensorflow::decision_trees::FeatureId;
+using tensorflow::tensorforest::TensorForestParams;
+using tensorflow::tensorforest::TestableInputTarget;
 
 BinaryNode MakeSplit(const string& feat, float val) {
   BinaryNode split;
@@ -52,8 +52,7 @@ BinaryNode MakeSplit(const string& feat, float val) {
   return split;
 }
 
-void RunBatch(GrowStats* stats,
-              const TestableInputTarget* target) {
+void RunBatch(GrowStats* stats, const TestableInputTarget* target) {
   std::unique_ptr<tensorflow::tensorforest::TensorDataSet> dataset(
       new tensorflow::tensorforest::TestableDataSet(
           {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, 2));
@@ -102,18 +101,10 @@ class TestableRunningStats : public DenseClassificationGrowStats {
   TestableRunningStats(const TensorForestParams& params, int32 depth)
       : DenseClassificationGrowStats(params, depth) {}
 
-  float test_left_sum(int split) {
-    return get_left_gini()->sum(split);
-  }
-  float test_left_square(int split) {
-    return get_left_gini()->square(split);
-  }
-  float test_right_sum(int split) {
-    return get_right_gini()->sum(split);
-  }
-  float test_right_square(int split) {
-    return get_right_gini()->square(split);
-  }
+  float test_left_sum(int split) { return get_left_gini()->sum(split); }
+  float test_left_square(int split) { return get_left_gini()->square(split); }
+  float test_right_sum(int split) { return get_right_gini()->sum(split); }
+  float test_right_square(int split) { return get_right_gini()->square(split); }
 };
 
 TEST(GrowStatsDenseClassificationTest, BasicRunningStats) {
@@ -166,9 +157,7 @@ class TestableFinishEarly : public DenseClassificationGrowStats {
   int num_times_called_;
 
  protected:
-  void CheckFinishEarlyHoeffding() override {
-    ++num_times_called_;
-  }
+  void CheckFinishEarlyHoeffding() override { ++num_times_called_; }
 };
 
 TEST(GrowStatsDenseClassificationTest, TestFinishEarly) {
@@ -212,7 +201,6 @@ TEST(GrowStatsDenseClassificationTest, TestFinishEarly) {
   ASSERT_EQ(stat->num_times_called_, 9);
 }
 
-
 TEST(GrowStatsDenseClassificationTest, TestCheckPruneHoeffding) {
   TensorForestParams params;
   params.set_num_outputs(2);
@@ -224,7 +212,8 @@ TEST(GrowStatsDenseClassificationTest, TestCheckPruneHoeffding) {
   finish->set_type(SPLIT_FINISH_BASIC);
   finish->mutable_check_every_steps()->set_constant_value(100);
   params.mutable_pruning_type()->set_type(SPLIT_PRUNE_HOEFFDING);
-  params.mutable_pruning_type()->mutable_prune_every_samples()
+  params.mutable_pruning_type()
+      ->mutable_prune_every_samples()
       ->set_constant_value(1);
 
   // On each iteration, we add two examples, one of class 0 and one
@@ -234,8 +223,8 @@ TEST(GrowStatsDenseClassificationTest, TestCheckPruneHoeffding) {
   std::vector<float> weights = {1, 1};
   TestableInputTarget target(labels, weights, 1);
   std::unique_ptr<tensorflow::tensorforest::TensorDataSet> dataset(
-      new tensorflow::tensorforest::TestableDataSet(
-          {-1.0, -1.0, 1.0, -1.0}, 2));
+      new tensorflow::tensorforest::TestableDataSet({-1.0, -1.0, 1.0, -1.0},
+                                                    2));
 
   DenseClassificationGrowStats stats(params, 1);
   stats.Initialize();
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc
index 14cb19d36f33e478728aba3e28b7bea11b691d34..d43884481afbbbc988d6eb80e01e49663df6914b 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.cc
@@ -21,8 +21,6 @@ namespace tensorflow {
 namespace tensorforest {
 namespace {
 
-const int32 SPARSE_DEFAULT = 0;
-
 bool DecideInequalityTest(const decision_trees::InequalityTest& test,
                           float value) {
   float bias = test.threshold().float_value();
@@ -111,10 +109,10 @@ void TensorDataSet::set_input_tensors(const Tensor& dense,
     dense_data_.reset(new DenseStorageType(dense.tensor<float, 2>()));
   }
   if (sparse_indices.shape().dims() == 2) {
-    sparse_indices_.reset(new SparseIndicesStorageType(
-        sparse_indices.tensor<int64, 2>()));
-    sparse_values_.reset(new SparseValuesStorageType(
-        sparse_values.tensor<float, 1>()));
+    sparse_indices_.reset(
+        new SparseIndicesStorageType(sparse_indices.tensor<int64, 2>()));
+    sparse_values_.reset(
+        new SparseValuesStorageType(sparse_values.tensor<float, 1>()));
     sparse_batch_size_ = sparse_shape.tensor<int64, 1>()(0);
   }
   original_dense_tensor_ = dense;
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
index e3d4edbf8a512a027e4b67916d1f2ad3f347a18b..c544a8c75e9bfe8fe6bbea8913e7be17d868bfef 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_data.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_DATA_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_DATA_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_DATA_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_DATA_H_
 #include <ctime>
 #include <unordered_map>
 #include "google/protobuf/any.pb.h"
@@ -93,9 +93,7 @@ class TensorDataSet {
   // an int32 you can avoid the atoi32.
   virtual float GetExampleValue(int example, int32 feature_id) const;
 
-  int num_features() {
-    return available_features_.size();
-  }
+  int num_features() { return available_features_.size(); }
 
   const Tensor& original_tensor() const { return original_dense_tensor_; }
 
@@ -123,4 +121,4 @@ class TensorDataSet {
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_DATA_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_DATA_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h b/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h
index 0309ec1de9aec1044eb87e01cafc40c26ba3de14..d4402b6055a36d38042a0e6cfa07b532ec11c093 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/input_target.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_TARGET_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_TARGET_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_TARGET_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_TARGET_H_
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 
@@ -79,9 +79,7 @@ class TensorInputTarget : public StoredInputTarget<SingleDimStorageType> {
     return (*target_)(example_index * num_targets_ + target_index);
   }
 
-  const Tensor& original_tensor() const {
-    return original_tensor_;
-  }
+  const Tensor& original_tensor() const { return original_tensor_; }
 
  protected:
   Tensor original_tensor_;
@@ -89,4 +87,4 @@ class TensorInputTarget : public StoredInputTarget<SingleDimStorageType> {
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_TARGET_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_INPUT_TARGET_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc
index d43c068e462ff78b114fb29bd8cf0ee0c6080fcd..83614a25314117ef9ba29b4dcf6ebee8f7f3e226 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.cc
@@ -160,6 +160,5 @@ void RegressionLeafModelOperator::ExportModel(
   }
 }
 
-
 }  // namespace tensorforest
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.h b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.h
index 946a648f22ff4175782c42cc70c59440e6ac0e17..cc4ec8dc9e330784bbcfeb54fa92e0a2db9449a8 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_LEAF_MODEL_OPERATORS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_LEAF_MODEL_OPERATORS_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_LEAF_MODEL_OPERATORS_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_LEAF_MODEL_OPERATORS_H_
 
 #include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
 #include "tensorflow/contrib/tensor_forest/kernels/v4/input_target.h"
@@ -146,4 +146,4 @@ class LeafModelOperatorFactory {
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_LEAF_MODEL_OPERATORS_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_LEAF_MODEL_OPERATORS_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc
index ffd92c01f9a59719e6bb2458c2f28253c364a2e8..ab4191809b6a7400114acf85991c74acfac55505 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/leaf_model_operators_test.cc
@@ -26,19 +26,19 @@ namespace {
 using tensorflow::decision_trees::Leaf;
 using tensorflow::tensorforest::DenseClassificationLeafModelOperator;
 using tensorflow::tensorforest::LeafModelOperator;
-using tensorflow::tensorforest::SparseClassificationLeafModelOperator;
-using tensorflow::tensorforest::SparseOrDenseClassificationLeafModelOperator;
 using tensorflow::tensorforest::LeafStat;
 using tensorflow::tensorforest::RegressionLeafModelOperator;
-using tensorflow::tensorforest::TestableInputTarget;
+using tensorflow::tensorforest::SparseClassificationLeafModelOperator;
+using tensorflow::tensorforest::SparseOrDenseClassificationLeafModelOperator;
 using tensorflow::tensorforest::TensorForestParams;
+using tensorflow::tensorforest::TestableInputTarget;
 
 const int32 kNumClasses = 3;
 
 constexpr char kRegressionStatProto[] =
-  "weight_sum: 3 "
-  "regression { "
-  "mean_output { "
+    "weight_sum: 3 "
+    "regression { "
+    "mean_output { "
     "value { "
     "  float_value: 27 "
     "} "
@@ -48,8 +48,8 @@ constexpr char kRegressionStatProto[] =
     "value { "
     "  float_value: 10 "
     "} "
-  "} "
-  "mean_output_squares { "
+    "} "
+    "mean_output_squares { "
     "value {"
     "  float_value: 245"
     "}"
@@ -59,8 +59,8 @@ constexpr char kRegressionStatProto[] =
     "value {"
     "  float_value: 46"
     "}"
-  "}"
-"}";
+    "}"
+    "}";
 
 void TestClassificationNormalUse(const std::unique_ptr<LeafModelOperator>& op) {
   Leaf l;
@@ -83,7 +83,6 @@ void TestClassificationNormalUse(const std::unique_ptr<LeafModelOperator>& op) {
   EXPECT_FLOAT_EQ(op->GetOutputValue(l, 1), 3.4);
 }
 
-
 TEST(DenseLeafModelOperatorsTest, NormalUse) {
   TensorForestParams params;
   params.set_num_outputs(kNumClasses);
@@ -182,7 +181,7 @@ TEST(SparseLeafModelOperatorsTest, InitWithExisting) {
 
   std::unique_ptr<Leaf> leaf(new Leaf);
 
-  op->ExportModel( *stat, leaf.get());
+  op->ExportModel(*stat, leaf.get());
 
   // Make sure it was initialized correctly.
   EXPECT_FLOAT_EQ(op->GetOutputValue(*leaf, 0), 1.1);
@@ -194,7 +193,6 @@ TEST(SparseLeafModelOperatorsTest, InitWithExisting) {
   EXPECT_EQ(leaf->sparse_vector().sparse_value().size(), kNumClasses);
 }
 
-
 TEST(RegressionLeafModelOperatorsTest, NormalUse) {
   TensorForestParams params;
   params.set_num_outputs(kNumClasses);
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/params.h b/tensorflow/contrib/tensor_forest/kernels/v4/params.h
index 97a9d8d096311faaae774e9e4b2e45f28ed7fa29..7583e3d0402a3a1d07f3696727b285747dc887de 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/params.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/params.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_PARAMS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_PARAMS_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_PARAMS_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_PARAMS_H_
 
 #include "tensorflow/contrib/tensor_forest/proto/tensor_forest_params.pb.h"
 #include "tensorflow/core/platform/types.h"
@@ -24,9 +24,7 @@ namespace tensorforest {
 // Return the value of the given depth-dependent parameter given a leaf's depth.
 float ResolveParam(const DepthDependentParam& param, int32 depth);
 
-
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_PARAMS_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_PARAMS_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc b/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc
index 801881af1368dc33f00b356d12bea07ae3161ef6..4010a71006d58df0bec6d3686a9c47433b46fdd4 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/params_test.cc
@@ -71,5 +71,3 @@ TEST(ParamsTest, TestThreshold) {
 }
 
 }  // namespace
-
-
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc
index cdb1d80a4bbd47d1481ecde2348bef500bd125f1..b7b60d0ab8c2670cec8b029d1f42c5edd3690afe 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.cc
@@ -52,8 +52,8 @@ std::unique_ptr<GrowStats> SplitCollectionOperator::CreateGrowStats(
           new SparseClassificationGrowStats(params_, depth));
 
     case STATS_LEAST_SQUARES_REGRESSION:
-      return std::unique_ptr<GrowStats>(new LeastSquaresRegressionGrowStats(
-          params_, depth));
+      return std::unique_ptr<GrowStats>(
+          new LeastSquaresRegressionGrowStats(params_, depth));
 
     case STATS_FIXED_SIZE_SPARSE_GINI:
       return std::unique_ptr<GrowStats>(
@@ -136,8 +136,7 @@ void SplitCollectionOperator::CreateAndInitializeCandidateWithExample(
   stats_.at(node_id)->AddSplit(split, input_data, target, example);
 }
 
-bool SplitCollectionOperator::BestSplit(int32 node_id,
-                                        SplitCandidate* best,
+bool SplitCollectionOperator::BestSplit(int32 node_id, SplitCandidate* best,
                                         int32* depth) const {
   auto* slot = stats_.at(node_id).get();
   *depth = slot->depth();
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h
index 6c21c0bd3443347bdb0102727b15b26754a0ed53..c606ff98c67f411a5817f0282238fdaf3be03642 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/split_collection_operators.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_SPLIT_COLLECTION_OPERATORS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_SPLIT_COLLECTION_OPERATORS_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_SPLIT_COLLECTION_OPERATORS_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_SPLIT_COLLECTION_OPERATORS_H_
 
 #include <vector>
 #include "tensorflow/contrib/decision_trees/proto/generic_tree_model.pb.h"
@@ -71,9 +71,7 @@ class SplitCollectionOperator {
   }
 
   // Perform any necessary cleanup for any tracked state for the slot.
-  virtual void ClearSlot(int32 node_id) {
-    stats_.erase(node_id);
-  }
+  virtual void ClearSlot(int32 node_id) { stats_.erase(node_id); }
 
   // Return true if slot is fully initialized.
   virtual bool IsInitialized(int32 node_id) const;
@@ -128,6 +126,4 @@ class AnyCollectionCreator : public CollectionCreator {
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_SPLIT_COLLECTION_OPERATORS_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_SPLIT_COLLECTION_OPERATORS_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc b/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc
index 0bec198e97e8215d2cfdb9ada5355dd5b0d2d97b..c749fbe69e17769c2f2b69bcf541eb0eb8b9e7e8 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.cc
@@ -32,9 +32,9 @@ namespace tensorforest {
 //   smoothed_sum = stats.sum() + #_classes
 float GiniImpurity(const LeafStat& stats, int32 num_classes) {
   const float smoothed_sum = num_classes + stats.weight_sum();
-  return 1.0 - (
-      (stats.classification().gini().square()
-       + 2 * stats.weight_sum() + num_classes) / (smoothed_sum * smoothed_sum));
+  return 1.0 - ((stats.classification().gini().square() +
+                 2 * stats.weight_sum() + num_classes) /
+                (smoothed_sum * smoothed_sum));
 }
 
 float WeightedGiniImpurity(const LeafStat& stats, int32 num_classes) {
@@ -46,21 +46,20 @@ void UpdateGini(LeafStat* stats, float old_val, float weight) {
   // Equivalent to stats->square() - old_val * old_val + new_val * new_val,
   // (for new_val = old_val + weight), but more numerically stable.
   stats->mutable_classification()->mutable_gini()->set_square(
-      stats->classification().gini().square()
-      + weight * weight + 2 * old_val * weight);
+      stats->classification().gini().square() + weight * weight +
+      2 * old_val * weight);
 }
 
-
 float Variance(const LeafStat& stats, int output) {
   if (stats.weight_sum() == 0) {
     return 0;
   }
   const float e_x =
-      stats.regression().mean_output().value(output).float_value()
-      / stats.weight_sum();
+      stats.regression().mean_output().value(output).float_value() /
+      stats.weight_sum();
   const auto e_x2 =
-      stats.regression().mean_output_squares().value(output).float_value()
-      / stats.weight_sum();
+      stats.regression().mean_output_squares().value(output).float_value() /
+      stats.weight_sum();
   return e_x2 - e_x * e_x;
 }
 
@@ -75,8 +74,7 @@ float TotalVariance(const LeafStat& stats) {
 float SmoothedGini(float sum, float square, int num_classes) {
   // See comments for GiniImpurity above.
   const float smoothed_sum = num_classes + sum;
-  return 1.0 -
-         (square + 2 * sum + num_classes) / (smoothed_sum * smoothed_sum);
+  return 1.0 - (square + 2 * sum + num_classes) / (smoothed_sum * smoothed_sum);
 }
 
 float WeightedSmoothedGini(float sum, float square, int num_classes) {
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.h b/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.h
index 8e002d0414f48a1f409952f56c57b4e37815bca0..e6140065bbf12f2eb92c28e4affb3327f86af5d3 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/stat_utils.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_STAT_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_STAT_UTILS_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_STAT_UTILS_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_STAT_UTILS_H_
 #include "tensorflow/contrib/tensor_forest/proto/fertile_stats.pb.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -47,4 +47,4 @@ float WeightedSmoothedGini(float sum, float square, int num_classes);
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_STAT_UTILS_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_STAT_UTILS_H_
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h b/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h
index b6e543b96fd5a00f78555eaf8558f0a95d0a6713..38deb3e3cd816aae5fe66f26cd4b934316d03ce4 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/test_utils.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_TEST_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_TEST_UTILS_H_
+#ifndef TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_TEST_UTILS_H_
+#define TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_TEST_UTILS_H_
 #include "tensorflow/contrib/tensor_forest/kernels/v4/input_data.h"
 #include "tensorflow/contrib/tensor_forest/kernels/v4/input_target.h"
 
@@ -27,9 +27,7 @@ class TestableInputTarget : public StoredInputTarget<std::vector<float>> {
       : StoredInputTarget(new std::vector<float>(t), new std::vector<float>(w),
                           num_t) {}
 
-  int NumItems() const {
-    return target_->size();
-  }
+  int NumItems() const { return target_->size(); }
 
   int32 GetTargetAsClassIndex(int example_index,
                               int target_index) const override {
@@ -51,7 +49,6 @@ class TestableInputTarget : public StoredInputTarget<std::vector<float>> {
   }
 };
 
-
 class TestableDataSet : public TensorDataSet {
  public:
   TestableDataSet(const std::vector<float>& data, int num_features)
@@ -71,4 +68,4 @@ class TestableDataSet : public TensorDataSet {
 }  // namespace tensorforest
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_TEST_UTILS_H_
+#endif  // TENSORFLOW_CONTRIB_TENSOR_FOREST_KERNELS_V4_TEST_UTILS_H_
diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py
index eb938763f12efd9281bec4321384acd4617cdfcf..3650b5d52fe8a1b87a239d41ecfa3de677fffc72 100644
--- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py
+++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py
@@ -478,8 +478,7 @@ class RandomForestGraphs(object):
       **inference_args: Keyword arguments to pass through to each tree.
 
     Returns:
-      A tuple of (probabilities, tree_paths, variance), where variance
-      is the variance over all the trees for regression problems only.
+      A tuple of (probabilities, tree_paths, variance).
 
     Raises:
       NotImplementedError: If trying to use feature bagging with sparse
@@ -513,13 +512,12 @@ class RandomForestGraphs(object):
           self.params.num_trees,
           name='probabilities')
       tree_paths = array_ops.stack(paths, axis=1)
-      regression_variance = None
-      if self.params.regression:
-        expected_squares = math_ops.div(
-            math_ops.reduce_sum(all_predict * all_predict, 1),
-            self.params.num_trees)
-        regression_variance = math_ops.maximum(
-            0., expected_squares - average_values * average_values)
+
+      expected_squares = math_ops.div(
+          math_ops.reduce_sum(all_predict * all_predict, 1),
+          self.params.num_trees)
+      regression_variance = math_ops.maximum(
+          0., expected_squares - average_values * average_values)
       return average_values, tree_paths, regression_variance
 
   def average_size(self):
diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py b/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py
index 113dfb85d3bf671e0a9448e0cb0fbfd7f3ea04e7..bbe627b15773fafe83a0700da696f429876c0968 100644
--- a/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py
+++ b/tensorflow/contrib/tensor_forest/python/tensor_forest_test.py
@@ -108,7 +108,7 @@ class TensorForestTest(test_util.TensorFlowTestCase):
     probs, paths, var = graph_builder.inference_graph(input_data)
     self.assertTrue(isinstance(probs, ops.Tensor))
     self.assertTrue(isinstance(paths, ops.Tensor))
-    self.assertIsNone(var)
+    self.assertTrue(isinstance(var, ops.Tensor))
 
   def testTrainingConstructionClassificationSparse(self):
     input_data = sparse_tensor.SparseTensor(
diff --git a/tensorflow/contrib/tensorboard/db/BUILD b/tensorflow/contrib/tensorboard/db/BUILD
index 9d3d60c24d72e28cf449cd196e34e53d5450d85f..4175d8adb58a85728519042a9870e8c4590232ba 100644
--- a/tensorflow/contrib/tensorboard/db/BUILD
+++ b/tensorflow/contrib/tensorboard/db/BUILD
@@ -5,12 +5,18 @@ package(default_visibility = ["//tensorflow:internal"])
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_binary",
+    "tf_cc_test",
+    "tf_copts",
+)
 
 cc_library(
     name = "schema",
     srcs = ["schema.cc"],
     hdrs = ["schema.h"],
+    copts = tf_copts(),
     deps = [
         "//tensorflow/core:lib",
         "//tensorflow/core/lib/db:sqlite",
@@ -19,6 +25,7 @@ cc_library(
 
 tf_cc_test(
     name = "schema_test",
+    size = "small",
     srcs = ["schema_test.cc"],
     deps = [
         ":schema",
@@ -31,8 +38,10 @@ cc_library(
     name = "summary_db_writer",
     srcs = ["summary_db_writer.cc"],
     hdrs = ["summary_db_writer.h"],
+    copts = tf_copts(),
     deps = [
-        ":schema",
+        ":summary_converter",
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
@@ -46,6 +55,7 @@ tf_cc_test(
     size = "small",
     srcs = ["summary_db_writer_test.cc"],
     deps = [
+        ":schema",
         ":summary_db_writer",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -55,6 +65,77 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "summary_file_writer",
+    srcs = ["summary_file_writer.cc"],
+    hdrs = ["summary_file_writer.h"],
+    copts = tf_copts(),
+    deps = [
+        ":summary_converter",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:ptr_util",
+        "//tensorflow/core/kernels:summary_interface",
+    ],
+)
+
+tf_cc_test(
+    name = "summary_file_writer_test",
+    size = "medium",  # file i/o
+    timeout = "short",
+    srcs = ["summary_file_writer_test.cc"],
+    deps = [
+        ":summary_file_writer",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "summary_converter",
+    srcs = ["summary_converter.cc"],
+    hdrs = ["summary_converter.h"],
+    copts = tf_copts(),
+    visibility = ["//visibility:private"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_cc_binary(
+    name = "loader",
+    srcs = ["loader.cc"],
+    linkstatic = 1,
+    deps = [
+        ":schema",
+        ":summary_db_writer",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/lib/db:sqlite",
+    ],
+)
+
+tf_cc_binary(
+    name = "vacuum",
+    srcs = ["vacuum.cc"],
+    linkstatic = 1,
+    deps = [
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/lib/db:sqlite",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(["*"]),
diff --git a/tensorflow/contrib/tensorboard/db/loader.cc b/tensorflow/contrib/tensorboard/db/loader.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4d7337a53d025f29ae5f85151b7f60d2cca6f771
--- /dev/null
+++ b/tensorflow/contrib/tensorboard/db/loader.cc
@@ -0,0 +1,124 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iostream>
+#include <vector>
+
+#include "tensorflow/contrib/tensorboard/db/schema.h"
+#include "tensorflow/contrib/tensorboard/db/summary_db_writer.h"
+#include "tensorflow/core/lib/db/sqlite.h"
+#include "tensorflow/core/lib/io/record_reader.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/core/util/event.pb.h"
+
+namespace tensorflow {
+namespace {
+
+template <typename T>
+string AddCommas(T n) {
+  static_assert(std::is_integral<T>::value, "is_integral");
+  string s = strings::StrCat(n);
+  if (s.size() > 3) {
+    int extra = s.size() / 3 - (s.size() % 3 == 0 ? 1 : 0);
+    s.append(extra, 'X');
+    int c = 0;
+    for (int i = s.size() - 1; i > 0; --i) {
+      s[i] = s[i - extra];
+      if (++c % 3 == 0) {
+        s[--i] = ',';
+        --extra;
+      }
+    }
+  }
+  return s;
+}
+
+int main(int argc, char* argv[]) {
+  string path;
+  string events;
+  string experiment_name;
+  string run_name;
+  string user_name;
+  std::vector<Flag> flag_list = {
+      Flag("db", &path, "Path of SQLite DB file"),
+      Flag("events", &events, "TensorFlow record proto event log file"),
+      Flag("experiment_name", &experiment_name, "The DB experiment_name value"),
+      Flag("run_name", &run_name, "The DB run_name value"),
+      Flag("user_name", &user_name, "The DB user_name value"),
+  };
+  string usage = Flags::Usage(argv[0], flag_list);
+  bool parse_result = Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result || path.empty()) {
+    std::cerr << "The loader tool imports tf.Event record files, created by\n"
+              << "SummaryFileWriter, into the sorts of SQLite database files\n"
+              << "created by SummaryDbWriter.\n\n"
+              << "In addition to the flags below, the environment variables\n"
+              << "defined by core/lib/db/sqlite.cc can also be set.\n\n"
+              << usage;
+    return -1;
+  }
+  port::InitMain(argv[0], &argc, &argv);
+  Env* env = Env::Default();
+
+  LOG(INFO) << "Opening SQLite file: " << path;
+  Sqlite* db;
+  TF_CHECK_OK(Sqlite::Open(
+      path, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE | SQLITE_OPEN_NOMUTEX,
+      &db));
+  core::ScopedUnref unref_db(db);
+
+  LOG(INFO) << "Initializing TensorBoard schema";
+  TF_CHECK_OK(SetupTensorboardSqliteDb(db));
+
+  LOG(INFO) << "Creating SummaryDbWriter";
+  SummaryWriterInterface* db_writer;
+  TF_CHECK_OK(CreateSummaryDbWriter(db, experiment_name, run_name, user_name,
+                                    env, &db_writer));
+  core::ScopedUnref unref(db_writer);
+
+  LOG(INFO) << "Loading TF event log: " << events;
+  std::unique_ptr<RandomAccessFile> file;
+  TF_CHECK_OK(env->NewRandomAccessFile(events, &file));
+  io::RecordReader reader(file.get());
+
+  uint64 start = env->NowMicros();
+  uint64 records = 0;
+  uint64 offset = 0;
+  string record;
+  while (true) {
+    std::unique_ptr<Event> event = std::unique_ptr<Event>(new Event);
+    Status s = reader.ReadRecord(&offset, &record);
+    if (s.code() == error::OUT_OF_RANGE) break;
+    TF_CHECK_OK(s);
+    if (!ParseProtoUnlimited(event.get(), record)) {
+      LOG(FATAL) << "Corrupt tf.Event record"
+                 << " offset=" << (offset - record.size())
+                 << " size=" << static_cast<int>(record.size());
+    }
+    TF_CHECK_OK(db_writer->WriteEvent(std::move(event)));
+    ++records;
+  }
+  uint64 elapsed = env->NowMicros() - start;
+  LOG(INFO) << "Loaded " << AddCommas(offset) << " bytes with "
+            << AddCommas(records) << " records at "
+            << AddCommas(offset / (elapsed / 1000000)) << " bps";
+
+  return 0;
+}
+
+}  // namespace
+}  // namespace tensorflow
+
+int main(int argc, char* argv[]) { return tensorflow::main(argc, argv); }
diff --git a/tensorflow/contrib/tensorboard/db/schema.cc b/tensorflow/contrib/tensorboard/db/schema.cc
index d63b2c6cc23248c2dc5bdd4433047d3fa58c1d14..3c7bc87e4a2dbeadef2b9589d58c845204049123 100644
--- a/tensorflow/contrib/tensorboard/db/schema.cc
+++ b/tensorflow/contrib/tensorboard/db/schema.cc
@@ -14,437 +14,430 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/tensorboard/db/schema.h"
 
+#include "tensorflow/core/lib/core/errors.h"
+
 namespace tensorflow {
 namespace {
 
-class SqliteSchema {
- public:
-  explicit SqliteSchema(std::shared_ptr<Sqlite> db) : db_(std::move(db)) {}
-
-  /// \brief Creates Tensors table.
-  ///
-  /// Fields:
-  ///   rowid: Ephemeral b-tree ID dictating locality.
-  ///   tag_id: ID of associated Tag.
-  ///   computed_time: Float UNIX timestamp with microsecond precision.
-  ///     In the old summaries system that uses FileWriter, this is the
-  ///     wall time around when tf.Session.run finished. In the new
-  ///     summaries system, it is the wall time of when the tensor was
-  ///     computed. On systems with monotonic clocks, it is calculated
-  ///     by adding the monotonic run duration to Run.started_time.
-  ///     This field is not indexed because, in practice, it should be
-  ///     ordered the same or nearly the same as TensorIndex, so local
-  ///     insertion sort might be more suitable.
-  ///   step: User-supplied number, ordering this tensor in Tag.
-  ///     If NULL then the Tag must have only one Tensor.
-  ///   tensor: Can be an INTEGER (DT_INT64), FLOAT (DT_DOUBLE), or
-  ///     BLOB. The structure of a BLOB is currently undefined, but in
-  ///     essence it is a Snappy tf.TensorProto that spills over into
-  ///     TensorChunks.
-  Status CreateTensorsTable() {
-    return Run(R"sql(
-      CREATE TABLE IF NOT EXISTS Tensors (
-        rowid INTEGER PRIMARY KEY,
-        tag_id INTEGER NOT NULL,
-        computed_time REAL,
-        step INTEGER,
-        tensor BLOB
-      )
-    )sql");
-  }
-
-  /// \brief Creates TensorChunks table.
-  ///
-  /// This table can be used to split up a tensor across many rows,
-  /// which has the advantage of not slowing down table scans on the
-  /// main table, allowing asynchronous fetching, minimizing copying,
-  /// and preventing large buffers from being allocated.
-  ///
-  /// Fields:
-  ///   rowid: Ephemeral b-tree ID dictating locality.
-  ///   tag_id: ID of associated Tag.
-  ///   step: Same as corresponding Tensors.step.
-  ///   sequence: 1-indexed sequence number for ordering chunks. Please
-  ///     note that the 0th index is Tensors.tensor.
-  ///   chunk: Bytes of next chunk in tensor.
-  Status CreateTensorChunksTable() {
-    return Run(R"sql(
-      CREATE TABLE IF NOT EXISTS TensorChunks (
-        rowid INTEGER PRIMARY KEY,
-        tag_id INTEGER NOT NULL,
-        step INTEGER,
-        sequence INTEGER,
-        chunk BLOB
-      )
-    )sql");
-  }
-
-  /// \brief Creates Tags table.
-  ///
-  /// Fields:
-  ///   rowid: Ephemeral b-tree ID dictating locality.
-  ///   tag_id: Permanent >0 unique ID.
-  ///   run_id: Optional ID of associated Run.
-  ///   tag_name: The tag field in summary.proto, unique across Run.
-  ///   inserted_time: Float UNIX timestamp with µs precision. This is
-  ///     always the wall time of when the row was inserted into the
-  ///     DB. It may be used as a hint for an archival job.
-  ///   metadata: Optional BLOB of SummaryMetadata proto.
-  ///   display_name: Optional for GUI and defaults to tag_name.
-  ///   summary_description: Optional markdown information.
-  Status CreateTagsTable() {
-    return Run(R"sql(
-      CREATE TABLE IF NOT EXISTS Tags (
-        rowid INTEGER PRIMARY KEY,
-        run_id INTEGER,
-        tag_id INTEGER NOT NULL,
-        tag_name TEXT,
-        inserted_time DOUBLE,
-        metadata BLOB,
-        display_name TEXT,
-        description TEXT
-      )
-    )sql");
-  }
-
-  /// \brief Creates Runs table.
-  ///
-  /// This table stores information about runs. Each row usually
-  /// represents a single attempt at training or testing a TensorFlow
-  /// model, with a given set of hyper-parameters, whose summaries are
-  /// written out to a single event logs directory with a monotonic step
-  /// counter.
-  ///
-  /// When a run is deleted from this table, TensorBoard should treat all
-  /// information associated with it as deleted, even if those rows in
-  /// different tables still exist.
-  ///
-  /// Fields:
-  ///   rowid: Ephemeral b-tree ID dictating locality.
-  ///   run_id: Permanent >0 unique ID.
-  ///   experiment_id: Optional ID of associated Experiment.
-  ///   run_name: User-supplied string, unique across Experiment.
-  ///   inserted_time: Float UNIX timestamp with µs precision. This is
-  ///     always the time the row was inserted into the database. It
-  ///     does not change.
-  ///   started_time: Float UNIX timestamp with µs precision. In the
-  ///     old summaries system that uses FileWriter, this is
-  ///     approximated as the first tf.Event.wall_time. In the new
-  ///     summaries system, it is the wall time of when summary writing
-  ///     started, from the perspective of whichever machine talks to
-  ///     the database. This field will be mutated if the run is
-  ///     restarted.
-  ///   description: Optional markdown information.
-  ///   graph_id: ID of associated Graphs row.
-  Status CreateRunsTable() {
-    return Run(R"sql(
-      CREATE TABLE IF NOT EXISTS Runs (
-        rowid INTEGER PRIMARY KEY,
-        experiment_id INTEGER,
-        run_id INTEGER NOT NULL,
-        run_name TEXT,
-        inserted_time REAL,
-        started_time REAL,
-        description TEXT,
-        graph_id INTEGER
-      )
-    )sql");
-  }
-
-  /// \brief Creates Experiments table.
-  ///
-  /// This table stores information about experiments, which are sets of
-  /// runs.
-  ///
-  /// Fields:
-  ///   rowid: Ephemeral b-tree ID dictating locality.
-  ///   user_id: Optional ID of associated User.
-  ///   experiment_id: Permanent >0 unique ID.
-  ///   experiment_name: User-supplied string, unique across User.
-  ///   inserted_time: Float UNIX timestamp with µs precision. This is
-  ///     always the time the row was inserted into the database. It
-  ///     does not change.
-  ///   started_time: Float UNIX timestamp with µs precision. This is
-  ///     the MIN(experiment.started_time, run.started_time) of each
-  ///     Run added to the database.
-  ///   description: Optional markdown information.
-  Status CreateExperimentsTable() {
-    return Run(R"sql(
-      CREATE TABLE IF NOT EXISTS Experiments (
-        rowid INTEGER PRIMARY KEY,
-        user_id INTEGER,
-        experiment_id INTEGER NOT NULL,
-        experiment_name TEXT,
-        inserted_time REAL,
-        started_time REAL,
-        description TEXT
-      )
-    )sql");
-  }
-
-  /// \brief Creates Users table.
-  ///
-  /// Fields:
-  ///   rowid: Ephemeral b-tree ID dictating locality.
-  ///   user_id: Permanent >0 unique ID.
-  ///   user_name: Unique user name.
-  ///   email: Optional unique email address.
-  ///   inserted_time: Float UNIX timestamp with µs precision. This is
-  ///     always the time the row was inserted into the database. It
-  ///     does not change.
-  Status CreateUsersTable() {
-    return Run(R"sql(
-      CREATE TABLE IF NOT EXISTS Users (
-        rowid INTEGER PRIMARY KEY,
-        user_id INTEGER NOT NULL,
-        user_name TEXT,
-        email TEXT,
-        inserted_time REAL
-      )
-    )sql");
-  }
-
-  /// \brief Creates Graphs table.
-  ///
-  /// Fields:
-  ///   rowid: Ephemeral b-tree ID dictating locality.
-  ///   graph_id: Permanent >0 unique ID.
-  ///   inserted_time: Float UNIX timestamp with µs precision. This is
-  ///     always the wall time of when the row was inserted into the
-  ///     DB. It may be used as a hint for an archival job.
-  ///   node_def: Contains Snappy tf.GraphDef proto. All fields will be
-  ///     cleared except those not expressed in SQL.
-  Status CreateGraphsTable() {
-    return Run(R"sql(
-      CREATE TABLE IF NOT EXISTS Graphs (
-        rowid INTEGER PRIMARY KEY,
-        graph_id INTEGER NOT NULL,
-        inserted_time REAL,
-        graph_def BLOB
-      )
-    )sql");
-  }
-
-  /// \brief Creates Nodes table.
-  ///
-  /// Fields:
-  ///   rowid: Ephemeral b-tree ID dictating locality.
-  ///   graph_id: Permanent >0 unique ID.
-  ///   node_id: ID for this node. This is more like a 0-index within
-  ///     the Graph. Please note indexes are allowed to be removed.
-  ///   node_name: Unique name for this Node within Graph. This is
-  ///     copied from the proto so it can be indexed. This is allowed
-  ///     to be NULL to save space on the index, in which case the
-  ///     node_def.name proto field must not be cleared.
-  ///   op: Copied from tf.NodeDef proto.
-  ///   device: Copied from tf.NodeDef proto.
-  ///   node_def: Contains Snappy tf.NodeDef proto. All fields will be
-  ///     cleared except those not expressed in SQL.
-  Status CreateNodesTable() {
-    return Run(R"sql(
-      CREATE TABLE IF NOT EXISTS Nodes (
-        rowid INTEGER PRIMARY KEY,
-        graph_id INTEGER NOT NULL,
-        node_id INTEGER NOT NULL,
-        node_name TEXT,
-        op TEXT,
-        device TEXT,
-        node_def BLOB
-      )
-    )sql");
-  }
-
-  /// \brief Creates NodeInputs table.
-  ///
-  /// Fields:
-  ///   rowid: Ephemeral b-tree ID dictating locality.
-  ///   graph_id: Permanent >0 unique ID.
-  ///   node_id: Index of Node in question. This can be considered the
-  ///     'to' vertex.
-  ///   idx: Used for ordering inputs on a given Node.
-  ///   input_node_id: Nodes.node_id of the corresponding input node.
-  ///     This can be considered the 'from' vertex.
-  ///   is_control: If non-zero, indicates this input is a controlled
-  ///     dependency, which means this isn't an edge through which
-  ///     tensors flow. NULL means 0.
-  Status CreateNodeInputsTable() {
-    return Run(R"sql(
-      CREATE TABLE IF NOT EXISTS NodeInputs (
-        rowid INTEGER PRIMARY KEY,
-        graph_id INTEGER NOT NULL,
-        node_id INTEGER NOT NULL,
-        idx INTEGER NOT NULL,
-        input_node_id INTEGER NOT NULL,
-        is_control INTEGER
-      )
-    )sql");
-  }
-
-  /// \brief Uniquely indexes (tag_id, step) on Tensors table.
-  Status CreateTensorIndex() {
-    return Run(R"sql(
-      CREATE UNIQUE INDEX IF NOT EXISTS TensorIndex
-      ON Tensors (tag_id, step)
-    )sql");
-  }
-
-  /// \brief Uniquely indexes (tag_id, step, sequence) on TensorChunks table.
-  Status CreateTensorChunkIndex() {
-    return Run(R"sql(
-      CREATE UNIQUE INDEX IF NOT EXISTS TensorChunkIndex
-      ON TensorChunks (tag_id, step, sequence)
-    )sql");
-  }
-
-  /// \brief Uniquely indexes tag_id on Tags table.
-  Status CreateTagIdIndex() {
-    return Run(R"sql(
-      CREATE UNIQUE INDEX IF NOT EXISTS TagIdIndex
-      ON Tags (tag_id)
-    )sql");
-  }
-
-  /// \brief Uniquely indexes run_id on Runs table.
-  Status CreateRunIdIndex() {
-    return Run(R"sql(
-      CREATE UNIQUE INDEX IF NOT EXISTS RunIdIndex
-      ON Runs (run_id)
-    )sql");
-  }
-
-  /// \brief Uniquely indexes experiment_id on Experiments table.
-  Status CreateExperimentIdIndex() {
-    return Run(R"sql(
-      CREATE UNIQUE INDEX IF NOT EXISTS ExperimentIdIndex
-      ON Experiments (experiment_id)
-    )sql");
-  }
-
-  /// \brief Uniquely indexes user_id on Users table.
-  Status CreateUserIdIndex() {
-    return Run(R"sql(
-      CREATE UNIQUE INDEX IF NOT EXISTS UserIdIndex
-      ON Users (user_id)
-    )sql");
-  }
-
-  /// \brief Uniquely indexes graph_id on Graphs table.
-  Status CreateGraphIdIndex() {
-    return Run(R"sql(
-      CREATE UNIQUE INDEX IF NOT EXISTS GraphIdIndex
-      ON Graphs (graph_id)
-    )sql");
-  }
-
-  /// \brief Uniquely indexes (graph_id, node_id) on Nodes table.
-  Status CreateNodeIdIndex() {
-    return Run(R"sql(
-      CREATE UNIQUE INDEX IF NOT EXISTS NodeIdIndex
-      ON Nodes (graph_id, node_id)
-    )sql");
-  }
-
-  /// \brief Uniquely indexes (graph_id, node_id, idx) on NodeInputs table.
-  Status CreateNodeInputsIndex() {
-    return Run(R"sql(
-      CREATE UNIQUE INDEX IF NOT EXISTS NodeInputsIndex
-      ON NodeInputs (graph_id, node_id, idx)
-    )sql");
-  }
-
-  /// \brief Uniquely indexes (run_id, tag_name) on Tags table.
-  Status CreateTagNameIndex() {
-    return Run(R"sql(
-      CREATE UNIQUE INDEX IF NOT EXISTS TagNameIndex
-      ON Tags (run_id, tag_name)
-      WHERE tag_name IS NOT NULL
-    )sql");
-  }
-
-  /// \brief Uniquely indexes (experiment_id, run_name) on Runs table.
-  Status CreateRunNameIndex() {
-    return Run(R"sql(
-      CREATE UNIQUE INDEX IF NOT EXISTS RunNameIndex
-      ON Runs (experiment_id, run_name)
-      WHERE run_name IS NOT NULL
-    )sql");
-  }
-
-  /// \brief Uniquely indexes (user_id, experiment_name) on Experiments table.
-  Status CreateExperimentNameIndex() {
-    return Run(R"sql(
-      CREATE UNIQUE INDEX IF NOT EXISTS ExperimentNameIndex
-      ON Experiments (user_id, experiment_name)
-      WHERE experiment_name IS NOT NULL
-    )sql");
-  }
-
-  /// \brief Uniquely indexes user_name on Users table.
-  Status CreateUserNameIndex() {
-    return Run(R"sql(
-      CREATE UNIQUE INDEX IF NOT EXISTS UserNameIndex
-      ON Users (user_name)
-      WHERE user_name IS NOT NULL
-    )sql");
-  }
-
-  /// \brief Uniquely indexes email on Users table.
-  Status CreateUserEmailIndex() {
-    return Run(R"sql(
-      CREATE UNIQUE INDEX IF NOT EXISTS UserEmailIndex
-      ON Users (email)
-      WHERE email IS NOT NULL
-    )sql");
-  }
-
-  /// \brief Uniquely indexes (graph_id, node_name) on Nodes table.
-  Status CreateNodeNameIndex() {
-    return Run(R"sql(
-      CREATE UNIQUE INDEX IF NOT EXISTS NodeNameIndex
-      ON Nodes (graph_id, node_name)
-      WHERE node_name IS NOT NULL
-    )sql");
-  }
-
-  Status Run(const char* sql) {
-    auto stmt = db_->Prepare(sql);
-    TF_RETURN_WITH_CONTEXT_IF_ERROR(stmt.StepAndReset(), sql);
-    return Status::OK();
-  }
-
- private:
-  std::shared_ptr<Sqlite> db_;
-};
+Status Run(Sqlite* db, const char* sql) {
+  SqliteStatement stmt;
+  TF_RETURN_IF_ERROR(db->Prepare(sql, &stmt));
+  return stmt.StepAndReset();
+}
 
 }  // namespace
 
-Status SetupTensorboardSqliteDb(std::shared_ptr<Sqlite> db) {
-  SqliteSchema s(std::move(db));
-  TF_RETURN_IF_ERROR(s.CreateTensorsTable());
-  TF_RETURN_IF_ERROR(s.CreateTensorChunksTable());
-  TF_RETURN_IF_ERROR(s.CreateTagsTable());
-  TF_RETURN_IF_ERROR(s.CreateRunsTable());
-  TF_RETURN_IF_ERROR(s.CreateExperimentsTable());
-  TF_RETURN_IF_ERROR(s.CreateUsersTable());
-  TF_RETURN_IF_ERROR(s.CreateGraphsTable());
-  TF_RETURN_IF_ERROR(s.CreateNodeInputsTable());
-  TF_RETURN_IF_ERROR(s.CreateNodesTable());
-  TF_RETURN_IF_ERROR(s.CreateTensorIndex());
-  TF_RETURN_IF_ERROR(s.CreateTensorChunkIndex());
-  TF_RETURN_IF_ERROR(s.CreateTagIdIndex());
-  TF_RETURN_IF_ERROR(s.CreateRunIdIndex());
-  TF_RETURN_IF_ERROR(s.CreateExperimentIdIndex());
-  TF_RETURN_IF_ERROR(s.CreateUserIdIndex());
-  TF_RETURN_IF_ERROR(s.CreateGraphIdIndex());
-  TF_RETURN_IF_ERROR(s.CreateNodeIdIndex());
-  TF_RETURN_IF_ERROR(s.CreateNodeInputsIndex());
-  TF_RETURN_IF_ERROR(s.CreateTagNameIndex());
-  TF_RETURN_IF_ERROR(s.CreateRunNameIndex());
-  TF_RETURN_IF_ERROR(s.CreateExperimentNameIndex());
-  TF_RETURN_IF_ERROR(s.CreateUserNameIndex());
-  TF_RETURN_IF_ERROR(s.CreateUserEmailIndex());
-  TF_RETURN_IF_ERROR(s.CreateNodeNameIndex());
-  return Status::OK();
+Status SetupTensorboardSqliteDb(Sqlite* db) {
+  // Note: GCC raw strings macros are broken.
+  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55971
+  TF_RETURN_IF_ERROR(
+      db->PrepareOrDie(strings::StrCat("PRAGMA application_id=",
+                                       kTensorboardSqliteApplicationId))
+          .StepAndReset());
+  db->PrepareOrDie("PRAGMA user_version=0").StepAndResetOrDie();
+  Status s;
+
+  // Ids identify resources.
+  //
+  // This table can be used to efficiently generate Permanent IDs in
+  // conjunction with a random number generator. Unlike rowids these
+  // IDs safe to use in URLs and unique across tables.
+  //
+  // Within any given system, there can't be any foo_id == bar_id for
+  // all rows of any two (Foos, Bars) tables. A row should only be
+  // deleted from this table if there's a very high level of confidence
+  // it exists nowhere else in the system.
+  //
+  // Fields:
+  //   id: The system-wide ID. This must be in the range [1,2**47). 0
+  //     is assigned the same meaning as NULL and shouldn't be stored
+  //     and all other int64 values are reserved for future use. Please
+  //     note that id is also the rowid.
+  s.Update(Run(db, R"sql(
+    CREATE TABLE IF NOT EXISTS Ids (
+      id INTEGER PRIMARY KEY
+    )
+  )sql"));
+
+  // Descriptions are Markdown text that can be associated with any
+  // resource that has a Permanent ID.
+  //
+  // Fields:
+  //   id: The foo_id of the associated row in Foos.
+  //   description: Arbitrary NUL-terminated Markdown text.
+  s.Update(Run(db, R"sql(
+    CREATE TABLE IF NOT EXISTS Descriptions (
+      id INTEGER PRIMARY KEY,
+      description TEXT
+    )
+  )sql"));
+
+  // Tensors are 0..n-dimensional numbers or strings.
+  //
+  // Fields:
+  //   rowid: Ephemeral b-tree ID.
+  //   series: The Permanent ID of a different resource, e.g. tag_id. A
+  //     tensor will be vacuumed if no series == foo_id exists for all
+  //     rows of all Foos. When series is NULL this tensor may serve
+  //     undefined purposes. This field should be set on placeholders.
+  //   step: Arbitrary number to uniquely order tensors within series.
+  //     The meaning of step is undefined when series is NULL. This may
+  //     be set on placeholders to prepopulate index pages.
+  //   computed_time: Float UNIX timestamp with microsecond precision.
+  //     In the old summaries system that uses FileWriter, this is the
+  //     wall time around when tf.Session.run finished. In the new
+  //     summaries system, it is the wall time of when the tensor was
+  //     computed. On systems with monotonic clocks, it is calculated
+  //     by adding the monotonic run duration to Run.started_time.
+  //   dtype: The tensorflow::DataType ID. For example, DT_INT64 is 9.
+  //     When NULL or 0 this must be treated as a placeholder row that
+  //     does not officially exist.
+  //   shape: A comma-delimited list of int64 >=0 values representing
+  //     length of each dimension in the tensor. This must be a valid
+  //     shape. That means no -1 values and, in the case of numeric
+  //     tensors, length(data) == product(shape) * sizeof(dtype). Empty
+  //     means this is a scalar a.k.a. 0-dimensional tensor.
+  //   data: Little-endian raw tensor memory. If dtype is DT_STRING and
+  //     shape is empty, the nullness of this field indicates whether or
+  //     not it contains the tensor contents; otherwise TensorStrings
+  //     must be queried. If dtype is NULL then ZEROBLOB can be used on
+  //     this field to reserve row space to be updated later.
+  s.Update(Run(db, R"sql(
+    CREATE TABLE IF NOT EXISTS Tensors (
+      rowid INTEGER PRIMARY KEY,
+      series INTEGER,
+      step INTEGER,
+      dtype INTEGER,
+      computed_time REAL,
+      shape TEXT,
+      data BLOB
+    )
+  )sql"));
+
+  s.Update(Run(db, R"sql(
+    CREATE UNIQUE INDEX IF NOT EXISTS
+      TensorSeriesStepIndex
+    ON
+      Tensors (series, step)
+    WHERE
+      series IS NOT NULL
+      AND step IS NOT NULL
+  )sql"));
+
+  // TensorStrings are the flat contents of 1..n dimensional DT_STRING
+  // Tensors.
+  //
+  // The number of rows associated with a Tensor must be equal to the
+  // product of its Tensors.shape.
+  //
+  // Fields:
+  //   rowid: Ephemeral b-tree ID.
+  //   tensor_rowid: References Tensors.rowid.
+  //   idx: Index in flattened tensor, starting at 0.
+  //   data: The string value at a particular index. NUL characters are
+  //     permitted.
+  s.Update(Run(db, R"sql(
+    CREATE TABLE IF NOT EXISTS TensorStrings (
+      rowid INTEGER PRIMARY KEY,
+      tensor_rowid INTEGER NOT NULL,
+      idx INTEGER NOT NULL,
+      data BLOB
+    )
+  )sql"));
+
+  s.Update(Run(db, R"sql(
+    CREATE UNIQUE INDEX IF NOT EXISTS TensorStringIndex
+    ON TensorStrings (tensor_rowid, idx)
+  )sql"));
+
+  // Tags are series of Tensors.
+  //
+  // Fields:
+  //   rowid: Ephemeral b-tree ID.
+  //   tag_id: The Permanent ID of the Tag.
+  //   run_id: Optional ID of associated Run.
+  //   inserted_time: Float UNIX timestamp with µs precision. This is
+  //     always the wall time of when the row was inserted into the
+  //     DB. It may be used as a hint for an archival job.
+  //   tag_name: The tag field in summary.proto, unique across Run.
+  //   display_name: Optional for GUI and defaults to tag_name.
+  //   plugin_name: Arbitrary TensorBoard plugin name for dispatch.
+  //   plugin_data: Arbitrary data that plugin wants.
+  //
+  // TODO(jart): Maybe there should be a Plugins table?
+  s.Update(Run(db, R"sql(
+    CREATE TABLE IF NOT EXISTS Tags (
+      rowid INTEGER PRIMARY KEY,
+      run_id INTEGER,
+      tag_id INTEGER NOT NULL,
+      inserted_time DOUBLE,
+      tag_name TEXT,
+      display_name TEXT,
+      plugin_name TEXT,
+      plugin_data BLOB
+    )
+  )sql"));
+
+  s.Update(Run(db, R"sql(
+    CREATE UNIQUE INDEX IF NOT EXISTS TagIdIndex
+    ON Tags (tag_id)
+  )sql"));
+
+  s.Update(Run(db, R"sql(
+    CREATE UNIQUE INDEX IF NOT EXISTS
+      TagRunNameIndex
+    ON
+      Tags (run_id, tag_name)
+    WHERE
+      run_id IS NOT NULL
+      AND tag_name IS NOT NULL
+  )sql"));
+
+  // Runs are groups of Tags.
+  //
+  // Each Run usually represents a single attempt at training or testing
+  // a TensorFlow model, with a given set of hyper-parameters, whose
+  // summaries are written out to a single event logs directory with a
+  // monotonic step counter.
+  //
+  // Fields:
+  //   rowid: Ephemeral b-tree ID.
+  //   run_id: The Permanent ID of the Run. This has a 1:1 mapping
+  //     with a SummaryWriter instance. If two writers spawn for a
+  //     given (user_name, run_name, run_name) then each should
+  //     allocate its own run_id and whichever writer puts it in the
+  //     database last wins. The Tags / Tensors associated with the
+  //     previous invocations will then enter limbo, where they may be
+  //     accessible for certain operations, but should be garbage
+  //     collected eventually.
+  //   run_name: User-supplied string, unique across Experiment.
+  //   experiment_id: Optional ID of associated Experiment.
+  //   inserted_time: Float UNIX timestamp with µs precision. This is
+  //     always the time the row was inserted into the database. It
+  //     does not change.
+  //   started_time: Float UNIX timestamp with µs precision. In the
+  //     old summaries system that uses FileWriter, this is
+  //     approximated as the first tf.Event.wall_time. In the new
+  //     summaries system, it is the wall time of when summary writing
+  //     started, from the perspective of whichever machine talks to
+  //     the database. This field will be mutated if the run is
+  //     restarted.
+  //   finished_time: Float UNIX timestamp with µs precision of when
+  //     SummaryWriter resource that created this run was destroyed.
+  //     Once this value becomes non-NULL a Run and its Tags and
+  //     Tensors should be regarded as immutable.
+  s.Update(Run(db, R"sql(
+    CREATE TABLE IF NOT EXISTS Runs (
+      rowid INTEGER PRIMARY KEY,
+      experiment_id INTEGER,
+      run_id INTEGER NOT NULL,
+      inserted_time REAL,
+      started_time REAL,
+      finished_time REAL,
+      run_name TEXT
+    )
+  )sql"));
+
+  s.Update(Run(db, R"sql(
+    CREATE UNIQUE INDEX IF NOT EXISTS RunIdIndex
+    ON Runs (run_id)
+  )sql"));
+
+  s.Update(Run(db, R"sql(
+    CREATE UNIQUE INDEX IF NOT EXISTS RunNameIndex
+    ON Runs (experiment_id, run_name)
+    WHERE run_name IS NOT NULL
+  )sql"));
+
+  // Experiments are groups of Runs.
+  //
+  // Fields:
+  //   rowid: Ephemeral b-tree ID.
+  //   user_id: Optional ID of associated User.
+  //   experiment_id: The Permanent ID of the Experiment.
+  //   experiment_name: User-supplied string, unique across User.
+  //   inserted_time: Float UNIX timestamp with µs precision. This is
+  //     always the time the row was inserted into the database. It
+  //     does not change.
+  //   started_time: Float UNIX timestamp with µs precision. This is
+  //     the MIN(experiment.started_time, run.started_time) of each
+  //     Run added to the database, including Runs which have since
+  //     been overwritten.
+  //   is_watching: A boolean indicating if someone is actively
+  //     looking at this Experiment in the TensorBoard GUI. Tensor
+  //     writers that do reservoir sampling can query this value to
+  //     decide if they want the "keep last" behavior. This improves
+  //     the performance of long running training while allowing low
+  //     latency feedback in TensorBoard.
+  s.Update(Run(db, R"sql(
+    CREATE TABLE IF NOT EXISTS Experiments (
+      rowid INTEGER PRIMARY KEY,
+      user_id INTEGER,
+      experiment_id INTEGER NOT NULL,
+      inserted_time REAL,
+      started_time REAL,
+      is_watching INTEGER,
+      experiment_name TEXT
+    )
+  )sql"));
+
+  s.Update(Run(db, R"sql(
+    CREATE UNIQUE INDEX IF NOT EXISTS ExperimentIdIndex
+    ON Experiments (experiment_id)
+  )sql"));
+
+  s.Update(Run(db, R"sql(
+    CREATE UNIQUE INDEX IF NOT EXISTS ExperimentNameIndex
+    ON Experiments (user_id, experiment_name)
+    WHERE experiment_name IS NOT NULL
+  )sql"));
+
+  // Users are people who love TensorBoard.
+  //
+  // Fields:
+  //   rowid: Ephemeral b-tree ID.
+  //   user_id: The Permanent ID of the User.
+  //   user_name: Unique user name.
+  //   email: Optional unique email address.
+  //   inserted_time: Float UNIX timestamp with µs precision. This is
+  //     always the time the row was inserted into the database. It
+  //     does not change.
+  s.Update(Run(db, R"sql(
+    CREATE TABLE IF NOT EXISTS Users (
+      rowid INTEGER PRIMARY KEY,
+      user_id INTEGER NOT NULL,
+      inserted_time REAL,
+      user_name TEXT,
+      email TEXT
+    )
+  )sql"));
+
+  s.Update(Run(db, R"sql(
+    CREATE UNIQUE INDEX IF NOT EXISTS UserIdIndex
+    ON Users (user_id)
+  )sql"));
+
+  s.Update(Run(db, R"sql(
+    CREATE UNIQUE INDEX IF NOT EXISTS UserNameIndex
+    ON Users (user_name)
+    WHERE user_name IS NOT NULL
+  )sql"));
+
+  s.Update(Run(db, R"sql(
+    CREATE UNIQUE INDEX IF NOT EXISTS UserEmailIndex
+    ON Users (email)
+    WHERE email IS NOT NULL
+  )sql"));
+
+  // Graphs define how Tensors flowed in Runs.
+  //
+  // Fields:
+  //   rowid: Ephemeral b-tree ID.
+  //   run_id: The Permanent ID of the associated Run. Only one Graph
+  //     can be associated with a Run.
+  //   graph_id: The Permanent ID of the Graph.
+  //   inserted_time: Float UNIX timestamp with µs precision. This is
+  //     always the wall time of when the row was inserted into the
+  //     DB. It may be used as a hint for an archival job.
+  //   graph_def: Contains the tf.GraphDef proto parts leftover which
+  //     haven't been defined in SQL yet.
+  s.Update(Run(db, R"sql(
+    CREATE TABLE IF NOT EXISTS Graphs (
+      rowid INTEGER PRIMARY KEY,
+      run_id INTEGER,
+      graph_id INTEGER NOT NULL,
+      inserted_time REAL,
+      graph_def BLOB
+    )
+  )sql"));
+
+  s.Update(Run(db, R"sql(
+    CREATE UNIQUE INDEX IF NOT EXISTS GraphIdIndex
+    ON Graphs (graph_id)
+  )sql"));
+
+  s.Update(Run(db, R"sql(
+    CREATE UNIQUE INDEX IF NOT EXISTS GraphRunIndex
+    ON Graphs (run_id)
+    WHERE run_id IS NOT NULL
+  )sql"));
+
+  // Nodes are the vertices in Graphs.
+  //
+  // Fields:
+  //   rowid: Ephemeral b-tree ID.
+  //   graph_id: The Permanent ID of the associated Graph.
+  //   node_id: ID for this node. This is more like a 0-index within
+  //     the Graph. Please note indexes are allowed to be removed.
+  //   node_name: Unique name for this Node within Graph. This is
+  //     copied from the proto so it can be indexed. This is allowed
+  //     to be NULL to save space on the index, in which case the
+  //     node_def.name proto field must not be cleared.
+  //   op: Copied from tf.NodeDef proto.
+  //   device: Copied from tf.NodeDef proto.
+  //   node_def: Contains the tf.NodeDef proto parts leftover which
+  //     haven't been defined in SQL yet.
+  //
+  // TODO(jart): Make separate tables for op and device strings.
+  s.Update(Run(db, R"sql(
+    CREATE TABLE IF NOT EXISTS Nodes (
+      rowid INTEGER PRIMARY KEY,
+      graph_id INTEGER NOT NULL,
+      node_id INTEGER NOT NULL,
+      node_name TEXT,
+      op TEXT,
+      device TEXT,
+      node_def BLOB
+    )
+  )sql"));
+
+  s.Update(Run(db, R"sql(
+    CREATE UNIQUE INDEX IF NOT EXISTS NodeIdIndex
+    ON Nodes (graph_id, node_id)
+  )sql"));
+
+  s.Update(Run(db, R"sql(
+    CREATE UNIQUE INDEX IF NOT EXISTS NodeNameIndex
+    ON Nodes (graph_id, node_name)
+    WHERE node_name IS NOT NULL
+  )sql"));
+
+  // NodeInputs are directed edges between Nodes in Graphs.
+  //
+  // Fields:
+  //   rowid: Ephemeral b-tree ID.
+  //   graph_id: The Permanent ID of the associated Graph.
+  //   node_id: Index of Node in question. This can be considered the
+  //     'to' vertex.
+  //   idx: Used for ordering inputs on a given Node.
+  //   input_node_id: Nodes.node_id of the corresponding input node.
+  //     This can be considered the 'from' vertex.
+  //   input_node_idx: Since a Node can output multiple Tensors, this
+  //     is the integer index of which of those outputs is our input.
+  //     NULL is treated as 0.
+  //   is_control: If non-zero, indicates this input is a controlled
+  //     dependency, which means this isn't an edge through which
+  //     tensors flow. NULL means 0.
+  //
+  // TODO(jart): Rename to NodeEdges.
+  s.Update(Run(db, R"sql(
+    CREATE TABLE IF NOT EXISTS NodeInputs (
+      rowid INTEGER PRIMARY KEY,
+      graph_id INTEGER NOT NULL,
+      node_id INTEGER NOT NULL,
+      idx INTEGER NOT NULL,
+      input_node_id INTEGER NOT NULL,
+      input_node_idx INTEGER,
+      is_control INTEGER
+    )
+  )sql"));
+
+  s.Update(Run(db, R"sql(
+    CREATE UNIQUE INDEX IF NOT EXISTS NodeInputsIndex
+    ON NodeInputs (graph_id, node_id, idx)
+  )sql"));
+
+  return s;
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorboard/db/schema.h b/tensorflow/contrib/tensorboard/db/schema.h
index 900c10298ce0a69b92f7528db9742517243c3c51..3da450422523dbe4304446869a38d43981d76eb5 100644
--- a/tensorflow/contrib/tensorboard/db/schema.h
+++ b/tensorflow/contrib/tensorboard/db/schema.h
@@ -15,18 +15,18 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_TENSORBOARD_DB_SCHEMA_H_
 #define TENSORFLOW_CONTRIB_TENSORBOARD_DB_SCHEMA_H_
 
-#include <memory>
-
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/db/sqlite.h"
 
 namespace tensorflow {
 
+constexpr uint32 kTensorboardSqliteApplicationId = 0xfeedabee;
+
 /// \brief Creates TensorBoard SQLite tables and indexes.
 ///
 /// If they are already created, this has no effect. If schema
 /// migrations are necessary, they will be performed with logging.
-Status SetupTensorboardSqliteDb(std::shared_ptr<Sqlite> db);
+Status SetupTensorboardSqliteDb(Sqlite* db);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/contrib/tensorboard/db/schema_test.cc b/tensorflow/contrib/tensorboard/db/schema_test.cc
index 463c4e59e7e76e6460b7ddfbd92262ac249aa9ed..4d3f2880bd02682ad00a90760f2a4478f1e6b2a2 100644
--- a/tensorflow/contrib/tensorboard/db/schema_test.cc
+++ b/tensorflow/contrib/tensorboard/db/schema_test.cc
@@ -23,7 +23,9 @@ namespace tensorflow {
 namespace {
 
 TEST(SchemaTest, SmokeTestTensorboardSchema) {
-  auto db = Sqlite::Open(":memory:").ValueOrDie();
+  Sqlite* db;
+  TF_ASSERT_OK(Sqlite::Open(":memory:", SQLITE_OPEN_READWRITE, &db));
+  core::ScopedUnref unref_db(db);
   TF_ASSERT_OK(SetupTensorboardSqliteDb(db));
 }
 
diff --git a/tensorflow/contrib/tensorboard/db/summary_converter.cc b/tensorflow/contrib/tensorboard/db/summary_converter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..93c1183072b4d791843e740f970234ba52857463
--- /dev/null
+++ b/tensorflow/contrib/tensorboard/db/summary_converter.cc
@@ -0,0 +1,322 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/tensorboard/db/summary_converter.h"
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/histogram/histogram.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/png/png_io.h"
+#include "tensorflow/core/lib/wav/wav_io.h"
+
+namespace tensorflow {
+namespace {
+
+template <typename T>
+Status TensorValueAt(Tensor t, int64 i, T* out) {
+#define CASE(I)                            \
+  case DataTypeToEnum<I>::value:           \
+    *out = static_cast<T>(t.flat<I>()(i)); \
+    break;
+#define COMPLEX_CASE(I)                           \
+  case DataTypeToEnum<I>::value:                  \
+    *out = static_cast<T>(t.flat<I>()(i).real()); \
+    break;
+  // clang-format off
+  switch (t.dtype()) {
+    TF_CALL_half(CASE)
+    TF_CALL_float(CASE)
+    TF_CALL_double(CASE)
+    TF_CALL_int8(CASE)
+    TF_CALL_int16(CASE)
+    TF_CALL_int32(CASE)
+    TF_CALL_int64(CASE)
+    TF_CALL_uint8(CASE)
+    TF_CALL_uint16(CASE)
+    TF_CALL_uint32(CASE)
+    TF_CALL_uint64(CASE)
+    TF_CALL_complex64(COMPLEX_CASE)
+    TF_CALL_complex128(COMPLEX_CASE)
+    default:
+        return errors::Unimplemented("SummaryFileWriter ",
+                                     DataTypeString(t.dtype()),
+                                     " not supported.");
+  }
+  // clang-format on
+  return Status::OK();
+#undef CASE
+#undef COMPLEX_CASE
+}
+
+typedef Eigen::Tensor<uint8, 2, Eigen::RowMajor> Uint8Image;
+
+// Add the sequence of images specified by ith_image to the summary.
+//
+// Factoring this loop out into a helper function lets ith_image behave
+// differently in the float and uint8 cases: the float case needs a temporary
+// buffer which can be shared across calls to ith_image, but the uint8 case
+// does not.
+Status AddImages(const string& tag, int max_images, int batch_size, int w,
+                 int h, int depth,
+                 const std::function<Uint8Image(int)>& ith_image, Summary* s) {
+  const int N = std::min<int>(max_images, batch_size);
+  for (int i = 0; i < N; ++i) {
+    Summary::Value* v = s->add_value();
+    // The tag depends on the number of requested images (not the number
+    // produced.)
+    //
+    // Note that later on avisu uses "/" to figure out a consistent naming
+    // convention for display, so we append "/image" to guarantee that the
+    // image(s) won't be displayed in the global scope with no name.
+    if (max_images > 1) {
+      v->set_tag(strings::StrCat(tag, "/image/", i));
+    } else {
+      v->set_tag(strings::StrCat(tag, "/image"));
+    }
+
+    const auto image = ith_image(i);
+    Summary::Image* si = v->mutable_image();
+    si->set_height(h);
+    si->set_width(w);
+    si->set_colorspace(depth);
+    const int channel_bits = 8;
+    const int compression = -1;  // Use zlib default
+    if (!png::WriteImageToBuffer(image.data(), w, h, w * depth, depth,
+                                 channel_bits, compression,
+                                 si->mutable_encoded_image_string(), nullptr)) {
+      return errors::Internal("PNG encoding failed");
+    }
+  }
+  return Status::OK();
+}
+
+template <class T>
+void NormalizeFloatImage(int hw, int depth,
+                         typename TTypes<T>::ConstMatrix values,
+                         typename TTypes<uint8>::ConstVec bad_color,
+                         Uint8Image* image) {
+  if (!image->size()) return;  // Nothing to do for empty images
+
+  // Rescale the image to uint8 range.
+  //
+  // We are trying to generate an RGB image from a float/half tensor.  We do
+  // not have any info about the expected range of values in the tensor
+  // but the generated image needs to have all RGB values within [0, 255].
+  //
+  // We use two different algorithms to generate these values.  If the
+  // tensor has only positive values we scale them all by 255/max(values).
+  // If the tensor has both negative and positive values we scale them by
+  // the max of their absolute values and center them around 127.
+  //
+  // This works for most cases, but does not respect the relative dynamic
+  // range across different instances of the tensor.
+
+  // Compute min and max ignoring nonfinite pixels
+  float image_min = std::numeric_limits<float>::infinity();
+  float image_max = -image_min;
+  for (int i = 0; i < hw; i++) {
+    bool finite = true;
+    for (int j = 0; j < depth; j++) {
+      if (!Eigen::numext::isfinite(values(i, j))) {
+        finite = false;
+        break;
+      }
+    }
+    if (finite) {
+      for (int j = 0; j < depth; j++) {
+        float value(values(i, j));
+        image_min = std::min(image_min, value);
+        image_max = std::max(image_max, value);
+      }
+    }
+  }
+
+  // Pick an affine transform into uint8
+  const float kZeroThreshold = 1e-6;
+  T scale, offset;
+  if (image_min < 0) {
+    const float max_val = std::max(std::abs(image_min), std::abs(image_max));
+    scale = T(max_val < kZeroThreshold ? 0.0f : 127.0f / max_val);
+    offset = T(128.0f);
+  } else {
+    scale = T(image_max < kZeroThreshold ? 0.0f : 255.0f / image_max);
+    offset = T(0.0f);
+  }
+
+  // Transform image, turning nonfinite values to bad_color
+  for (int i = 0; i < hw; i++) {
+    bool finite = true;
+    for (int j = 0; j < depth; j++) {
+      if (!Eigen::numext::isfinite(values(i, j))) {
+        finite = false;
+        break;
+      }
+    }
+    if (finite) {
+      image->chip<0>(i) =
+          (values.template chip<0>(i) * scale + offset).template cast<uint8>();
+    } else {
+      image->chip<0>(i) = bad_color;
+    }
+  }
+}
+
+template <class T>
+Status NormalizeAndAddImages(const Tensor& tensor, int max_images, int h, int w,
+                             int hw, int depth, int batch_size,
+                             const string& base_tag, Tensor bad_color_tensor,
+                             Summary* s) {
+  // For float and half images, nans and infs are replaced with bad_color.
+  if (bad_color_tensor.dim_size(0) < depth) {
+    return errors::InvalidArgument(
+        "expected depth <= bad_color.size, got depth = ", depth,
+        ", bad_color.size = ", bad_color_tensor.dim_size(0));
+  }
+  auto bad_color_full = bad_color_tensor.vec<uint8>();
+  typename TTypes<uint8>::ConstVec bad_color(bad_color_full.data(), depth);
+
+  // Float images must be scaled and translated.
+  Uint8Image image(hw, depth);
+  auto ith_image = [&tensor, &image, bad_color, batch_size, hw, depth](int i) {
+    auto tensor_eigen = tensor.template shaped<T, 3>({batch_size, hw, depth});
+    typename TTypes<T>::ConstMatrix values(
+        &tensor_eigen(i, 0, 0), Eigen::DSizes<Eigen::DenseIndex, 2>(hw, depth));
+    NormalizeFloatImage<T>(hw, depth, values, bad_color, &image);
+    return image;
+  };
+  return AddImages(base_tag, max_images, batch_size, w, h, depth, ith_image, s);
+}
+
+}  // namespace
+
+Status AddTensorAsScalarToSummary(const Tensor& t, const string& tag,
+                                  Summary* s) {
+  Summary::Value* v = s->add_value();
+  v->set_tag(tag);
+  float value;
+  TF_RETURN_IF_ERROR(TensorValueAt<float>(t, 0, &value));
+  v->set_simple_value(value);
+  return Status::OK();
+}
+
+Status AddTensorAsHistogramToSummary(const Tensor& t, const string& tag,
+                                     Summary* s) {
+  Summary::Value* v = s->add_value();
+  v->set_tag(tag);
+  histogram::Histogram histo;
+  for (int64 i = 0; i < t.NumElements(); i++) {
+    double double_val;
+    TF_RETURN_IF_ERROR(TensorValueAt<double>(t, i, &double_val));
+    if (Eigen::numext::isnan(double_val)) {
+      return errors::InvalidArgument("Nan in summary histogram for: ", tag);
+    } else if (Eigen::numext::isinf(double_val)) {
+      return errors::InvalidArgument("Infinity in summary histogram for: ",
+                                     tag);
+    }
+    histo.Add(double_val);
+  }
+  histo.EncodeToProto(v->mutable_histo(), false /* Drop zero buckets */);
+  return Status::OK();
+}
+
+Status AddTensorAsImageToSummary(const Tensor& tensor, const string& tag,
+                                 int max_images, const Tensor& bad_color,
+                                 Summary* s) {
+  if (!(tensor.dims() == 4 &&
+        (tensor.dim_size(3) == 1 || tensor.dim_size(3) == 3 ||
+         tensor.dim_size(3) == 4))) {
+    return errors::InvalidArgument(
+        "Tensor must be 4-D with last dim 1, 3, or 4, not ",
+        tensor.shape().DebugString());
+  }
+  if (!(tensor.dim_size(0) < (1LL << 31) && tensor.dim_size(1) < (1LL << 31) &&
+        tensor.dim_size(2) < (1LL << 31) &&
+        (tensor.dim_size(1) * tensor.dim_size(2)) < (1LL << 29))) {
+    return errors::InvalidArgument("Tensor too large for summary ",
+                                   tensor.shape().DebugString());
+  }
+  // The casts and h * w cannot overflow because of the limits above.
+  const int batch_size = static_cast<int>(tensor.dim_size(0));
+  const int h = static_cast<int>(tensor.dim_size(1));
+  const int w = static_cast<int>(tensor.dim_size(2));
+  const int hw = h * w;  // Compact these two dims for simplicity
+  const int depth = static_cast<int>(tensor.dim_size(3));
+  if (tensor.dtype() == DT_UINT8) {
+    // For uint8 input, no normalization is necessary
+    auto ith_image = [&tensor, batch_size, hw, depth](int i) {
+      auto values = tensor.shaped<uint8, 3>({batch_size, hw, depth});
+      return typename TTypes<uint8>::ConstMatrix(
+          &values(i, 0, 0), Eigen::DSizes<Eigen::DenseIndex, 2>(hw, depth));
+    };
+    TF_RETURN_IF_ERROR(
+        AddImages(tag, max_images, batch_size, w, h, depth, ith_image, s));
+  } else if (tensor.dtype() == DT_HALF) {
+    TF_RETURN_IF_ERROR(NormalizeAndAddImages<Eigen::half>(
+        tensor, max_images, h, w, hw, depth, batch_size, tag, bad_color, s));
+  } else if (tensor.dtype() == DT_FLOAT) {
+    TF_RETURN_IF_ERROR(NormalizeAndAddImages<float>(
+        tensor, max_images, h, w, hw, depth, batch_size, tag, bad_color, s));
+  } else {
+    return errors::InvalidArgument(
+        "Only DT_INT8, DT_HALF, and DT_FLOAT images are supported. Got ",
+        DataTypeString(tensor.dtype()));
+  }
+  return Status::OK();
+}
+
+Status AddTensorAsAudioToSummary(const Tensor& tensor, const string& tag,
+                                 int max_outputs, float sample_rate,
+                                 Summary* s) {
+  if (sample_rate <= 0.0f) {
+    return errors::InvalidArgument("sample_rate must be > 0");
+  }
+  const int batch_size = tensor.dim_size(0);
+  const int64 length_frames = tensor.dim_size(1);
+  const int64 num_channels =
+      tensor.dims() == 2 ? 1 : tensor.dim_size(tensor.dims() - 1);
+  const int N = std::min<int>(max_outputs, batch_size);
+  for (int i = 0; i < N; ++i) {
+    Summary::Value* v = s->add_value();
+    if (max_outputs > 1) {
+      v->set_tag(strings::StrCat(tag, "/audio/", i));
+    } else {
+      v->set_tag(strings::StrCat(tag, "/audio"));
+    }
+
+    Summary::Audio* sa = v->mutable_audio();
+    sa->set_sample_rate(sample_rate);
+    sa->set_num_channels(num_channels);
+    sa->set_length_frames(length_frames);
+    sa->set_content_type("audio/wav");
+
+    auto values =
+        tensor.shaped<float, 3>({batch_size, length_frames, num_channels});
+    auto channels_by_frames = typename TTypes<float>::ConstMatrix(
+        &values(i, 0, 0),
+        Eigen::DSizes<Eigen::DenseIndex, 2>(length_frames, num_channels));
+    size_t sample_rate_truncated = lrintf(sample_rate);
+    if (sample_rate_truncated == 0) {
+      sample_rate_truncated = 1;
+    }
+    TF_RETURN_IF_ERROR(wav::EncodeAudioAsS16LEWav(
+        channels_by_frames.data(), sample_rate_truncated, num_channels,
+        length_frames, sa->mutable_encoded_audio_string()));
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorboard/db/summary_converter.h b/tensorflow/contrib/tensorboard/db/summary_converter.h
new file mode 100644
index 0000000000000000000000000000000000000000..329c7f9f2f9fe25cdff8d5ac2e52c25362f624c2
--- /dev/null
+++ b/tensorflow/contrib/tensorboard/db/summary_converter.h
@@ -0,0 +1,38 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_CONVERTER_H_
+#define TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_CONVERTER_H_
+
+#include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// TODO(jart): Delete these methods in favor of new Python implementation.
+Status AddTensorAsScalarToSummary(const Tensor& t, const string& tag,
+                                  Summary* s);
+Status AddTensorAsHistogramToSummary(const Tensor& t, const string& tag,
+                                     Summary* s);
+Status AddTensorAsImageToSummary(const Tensor& tensor, const string& tag,
+                                 int max_images, const Tensor& bad_color,
+                                 Summary* s);
+Status AddTensorAsAudioToSummary(const Tensor& tensor, const string& tag,
+                                 int max_outputs, float sample_rate,
+                                 Summary* s);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_CONVERTER_H_
diff --git a/tensorflow/contrib/tensorboard/db/summary_db_writer.cc b/tensorflow/contrib/tensorboard/db/summary_db_writer.cc
index 37a32acb1eac7c43df00e33486c4f7676728c796..6590d6f7df4f35cad78db1fa9c4407bfb1270a2f 100644
--- a/tensorflow/contrib/tensorboard/db/summary_db_writer.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_db_writer.cc
@@ -14,180 +14,242 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/tensorboard/db/summary_db_writer.h"
 
-#include "tensorflow/contrib/tensorboard/db/schema.h"
+#include "tensorflow/contrib/tensorboard/db/summary_converter.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/summary.pb.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/db/sqlite.h"
 #include "tensorflow/core/lib/random/random.h"
-#include "tensorflow/core/lib/strings/stringprintf.h"
-#include "tensorflow/core/platform/fingerprint.h"
-#include "tensorflow/core/platform/snappy.h"
 #include "tensorflow/core/util/event.pb.h"
 
+// TODO(jart): Break this up into multiple files with excellent unit tests.
+// TODO(jart): Make decision to write in separate op.
+// TODO(jart): Add really good busy handling.
+
+// clang-format off
+#define CALL_SUPPORTED_TYPES(m) \
+  TF_CALL_string(m)             \
+  TF_CALL_half(m)               \
+  TF_CALL_float(m)              \
+  TF_CALL_double(m)             \
+  TF_CALL_complex64(m)          \
+  TF_CALL_complex128(m)         \
+  TF_CALL_int8(m)               \
+  TF_CALL_int16(m)              \
+  TF_CALL_int32(m)              \
+  TF_CALL_int64(m)              \
+  TF_CALL_uint8(m)              \
+  TF_CALL_uint16(m)             \
+  TF_CALL_uint32(m)             \
+  TF_CALL_uint64(m)
+// clang-format on
+
 namespace tensorflow {
 namespace {
 
-double GetWallTime(Env* env) {
+// https://www.sqlite.org/fileformat.html#record_format
+const uint64 kIdTiers[] = {
+    0x7fffffULL,        // 23-bit (3 bytes on disk)
+    0x7fffffffULL,      // 31-bit (4 bytes on disk)
+    0x7fffffffffffULL,  // 47-bit (5 bytes on disk)
+                        // remaining bits for future use
+};
+const int kMaxIdTier = sizeof(kIdTiers) / sizeof(uint64);
+const int kIdCollisionDelayMicros = 10;
+const int kMaxIdCollisions = 21;  // sum(2**i*10µs for i in range(21))~=21s
+const int64 kAbsent = 0LL;
+
+const char* kScalarPluginName = "scalars";
+const char* kImagePluginName = "images";
+const char* kAudioPluginName = "audio";
+const char* kHistogramPluginName = "histograms";
+
+const int kScalarSlots = 10000;
+const int kImageSlots = 10;
+const int kAudioSlots = 10;
+const int kHistogramSlots = 1;
+const int kTensorSlots = 10;
+
+const int64 kReserveMinBytes = 32;
+const double kReserveMultiplier = 1.5;
+
+// Flush is a misnomer because what we're actually doing is having lots
+// of commits inside any SqliteTransaction that writes potentially
+// hundreds of megs but doesn't need the transaction to maintain its
+// invariants. This ensures the WAL read penalty is small and might
+// allow writers in other processes a chance to schedule.
+const uint64 kFlushBytes = 1024 * 1024;
+
+double DoubleTime(uint64 micros) {
   // TODO(@jart): Follow precise definitions for time laid out in schema.
   // TODO(@jart): Use monotonic clock from gRPC codebase.
-  return static_cast<double>(env->NowMicros()) / 1.0e6;
+  return static_cast<double>(micros) / 1.0e6;
 }
 
-int64 MakeRandomId() {
-  // TODO(@jart): Try generating ID in 2^24 space, falling back to 2^63
-  //              https://sqlite.org/src4/doc/trunk/www/varint.wiki
-  int64 id = static_cast<int64>(random::New64() & ((1ULL << 63) - 1));
-  if (id == 0) {
-    ++id;
+string StringifyShape(const TensorShape& shape) {
+  string result;
+  bool first = true;
+  for (const auto& dim : shape) {
+    if (first) {
+      first = false;
+    } else {
+      strings::StrAppend(&result, ",");
+    }
+    strings::StrAppend(&result, dim.size);
   }
-  return id;
+  return result;
 }
 
-Status Serialize(const protobuf::MessageLite& proto, string* output) {
-  output->clear();
-  if (!proto.SerializeToString(output)) {
-    return errors::DataLoss("SerializeToString failed");
+Status CheckSupportedType(const Tensor& t) {
+#define CASE(T)                  \
+  case DataTypeToEnum<T>::value: \
+    break;
+  switch (t.dtype()) {
+    CALL_SUPPORTED_TYPES(CASE)
+    default:
+      return errors::Unimplemented(DataTypeString(t.dtype()),
+                                   " tensors unsupported on platform");
   }
   return Status::OK();
+#undef CASE
 }
 
-Status Compress(const string& data, string* output) {
-  output->clear();
-  if (!port::Snappy_Compress(data.data(), data.size(), output)) {
-    return errors::FailedPrecondition("TensorBase needs Snappy");
+Tensor AsScalar(const Tensor& t) {
+  Tensor t2{t.dtype(), {}};
+#define CASE(T)                        \
+  case DataTypeToEnum<T>::value:       \
+    t2.scalar<T>()() = t.flat<T>()(0); \
+    break;
+  switch (t.dtype()) {
+    CALL_SUPPORTED_TYPES(CASE)
+    default:
+      t2 = {DT_FLOAT, {}};
+      t2.scalar<float>()() = NAN;
+      break;
   }
-  return Status::OK();
+  return t2;
+#undef CASE
 }
 
-Status BindProto(SqliteStatement* stmt, int parameter,
-                 const protobuf::MessageLite& proto) {
-  string serialized;
-  TF_RETURN_IF_ERROR(Serialize(proto, &serialized));
-  string compressed;
-  TF_RETURN_IF_ERROR(Compress(serialized, &compressed));
-  stmt->BindBlob(parameter, compressed);
-  return Status::OK();
+void PatchPluginName(SummaryMetadata* metadata, const char* name) {
+  if (metadata->plugin_data().plugin_name().empty()) {
+    metadata->mutable_plugin_data()->set_plugin_name(name);
+  }
 }
 
-Status BindTensor(SqliteStatement* stmt, int parameter, const Tensor& t) {
-  // TODO(@jart): Make portable between little and big endian systems.
-  // TODO(@jart): Use TensorChunks with minimal copying for big tensors.
-  // TODO(@jart): Add field to indicate encoding.
-  // TODO(@jart): Allow crunch tool to re-compress with zlib instead.
-  TensorProto p;
-  t.AsProtoTensorContent(&p);
-  return BindProto(stmt, parameter, p);
+int GetSlots(const Tensor& t, const SummaryMetadata& metadata) {
+  if (metadata.plugin_data().plugin_name() == kScalarPluginName) {
+    return kScalarSlots;
+  } else if (metadata.plugin_data().plugin_name() == kImagePluginName) {
+    return kImageSlots;
+  } else if (metadata.plugin_data().plugin_name() == kAudioPluginName) {
+    return kAudioSlots;
+  } else if (metadata.plugin_data().plugin_name() == kHistogramPluginName) {
+    return kHistogramSlots;
+  } else if (t.dims() == 0 && t.dtype() != DT_STRING) {
+    return kScalarSlots;
+  } else {
+    return kTensorSlots;
+  }
 }
 
-// Tries to fudge shape and dtype to something with smaller storage.
-Status CoerceScalar(const Tensor& t, Tensor* out) {
-  switch (t.dtype()) {
-    case DT_DOUBLE:
-      *out = t;
-      break;
-    case DT_INT64:
-      *out = t;
-      break;
-    case DT_FLOAT:
-      *out = {DT_DOUBLE, {}};
-      out->scalar<double>()() = t.scalar<float>()();
-      break;
-    case DT_HALF:
-      *out = {DT_DOUBLE, {}};
-      out->scalar<double>()() = static_cast<double>(t.scalar<Eigen::half>()());
-      break;
-    case DT_INT32:
-      *out = {DT_INT64, {}};
-      out->scalar<int64>()() = t.scalar<int32>()();
-      break;
-    case DT_INT16:
-      *out = {DT_INT64, {}};
-      out->scalar<int64>()() = t.scalar<int16>()();
-      break;
-    case DT_INT8:
-      *out = {DT_INT64, {}};
-      out->scalar<int64>()() = t.scalar<int8>()();
-      break;
-    case DT_UINT32:
-      *out = {DT_INT64, {}};
-      out->scalar<int64>()() = t.scalar<uint32>()();
-      break;
-    case DT_UINT16:
-      *out = {DT_INT64, {}};
-      out->scalar<int64>()() = t.scalar<uint16>()();
-      break;
-    case DT_UINT8:
-      *out = {DT_INT64, {}};
-      out->scalar<int64>()() = t.scalar<uint8>()();
-      break;
-    default:
-      return errors::Unimplemented("Scalar summary for dtype ",
-                                   DataTypeString(t.dtype()),
-                                   " is not supported.");
-  }
-  return Status::OK();
+Status SetDescription(Sqlite* db, int64 id, const StringPiece& markdown) {
+  const char* sql = R"sql(
+    INSERT OR REPLACE INTO Descriptions (id, description) VALUES (?, ?)
+  )sql";
+  SqliteStatement insert_desc;
+  TF_RETURN_IF_ERROR(db->Prepare(sql, &insert_desc));
+  insert_desc.BindInt(1, id);
+  insert_desc.BindText(2, markdown);
+  return insert_desc.StepAndReset();
 }
 
-class Transactor {
+/// \brief Generates unique IDs randomly in the [1,2**63-1] range.
+///
+/// This class starts off generating IDs in the [1,2**23-1] range,
+/// because it's human friendly and occupies 4 bytes max on disk with
+/// SQLite's zigzag varint encoding. Then, each time a collision
+/// happens, the random space is increased by 8 bits.
+///
+/// This class uses exponential back-off so writes gradually slow down
+/// as IDs become exhausted but reads are still possible.
+///
+/// This class is thread safe.
+class IdAllocator {
  public:
-  explicit Transactor(std::shared_ptr<Sqlite> db)
-      : db_(std::move(db)),
-        begin_(db_->Prepare("BEGIN TRANSACTION")),
-        commit_(db_->Prepare("COMMIT TRANSACTION")),
-        rollback_(db_->Prepare("ROLLBACK TRANSACTION")) {}
-
-  template <typename T, typename... Args>
-  Status Transact(T callback, Args&&... args) {
-    TF_RETURN_IF_ERROR(begin_.StepAndReset());
-    Status s = callback(std::forward<Args>(args)...);
-    if (s.ok()) {
-      TF_RETURN_IF_ERROR(commit_.StepAndReset());
-    } else {
-      TF_RETURN_WITH_CONTEXT_IF_ERROR(rollback_.StepAndReset(), s.ToString());
+  IdAllocator(Env* env, Sqlite* db) : env_{env}, db_{db} {
+    DCHECK(env_ != nullptr);
+    DCHECK(db_ != nullptr);
+  }
+
+  Status CreateNewId(int64* id) LOCKS_EXCLUDED(mu_) {
+    mutex_lock lock(mu_);
+    Status s;
+    SqliteStatement stmt;
+    TF_RETURN_IF_ERROR(db_->Prepare("INSERT INTO Ids (id) VALUES (?)", &stmt));
+    for (int i = 0; i < kMaxIdCollisions; ++i) {
+      int64 tid = MakeRandomId();
+      stmt.BindInt(1, tid);
+      s = stmt.StepAndReset();
+      if (s.ok()) {
+        *id = tid;
+        break;
+      }
+      // SQLITE_CONSTRAINT maps to INVALID_ARGUMENT in sqlite.cc
+      if (s.code() != error::INVALID_ARGUMENT) break;
+      if (tier_ < kMaxIdTier) {
+        LOG(INFO) << "IdAllocator collision at tier " << tier_ << " (of "
+                  << kMaxIdTier << ") so auto-adjusting to a higher tier";
+        ++tier_;
+      } else {
+        LOG(WARNING) << "IdAllocator (attempt #" << i << ") "
+                     << "resulted in a collision at the highest tier; this "
+                        "is problematic if it happens often; you can try "
+                        "pruning the Ids table; you can also file a bug "
+                        "asking for the ID space to be increased; otherwise "
+                        "writes will gradually slow down over time until they "
+                        "become impossible";
+      }
+      env_->SleepForMicroseconds((1 << i) * kIdCollisionDelayMicros);
     }
     return s;
   }
 
  private:
-  std::shared_ptr<Sqlite> db_;
-  SqliteStatement begin_;
-  SqliteStatement commit_;
-  SqliteStatement rollback_;
+  int64 MakeRandomId() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    int64 id = static_cast<int64>(random::New64() & kIdTiers[tier_]);
+    if (id == kAbsent) ++id;
+    return id;
+  }
+
+  mutex mu_;
+  Env* const env_;
+  Sqlite* const db_;
+  int tier_ GUARDED_BY(mu_) = 0;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(IdAllocator);
 };
 
-class GraphSaver {
+class GraphWriter {
  public:
-  static Status SaveToRun(Env* env, Sqlite* db, GraphDef* graph, int64 run_id) {
-    auto get = db->Prepare("SELECT graph_id FROM Runs WHERE run_id = ?");
-    get.BindInt(1, run_id);
-    bool is_done;
-    TF_RETURN_IF_ERROR(get.Step(&is_done));
-    int64 graph_id = is_done ? 0 : get.ColumnInt(0);
-    if (graph_id == 0) {
-      graph_id = MakeRandomId();
-      // TODO(@jart): Check for ID collision.
-      auto set = db->Prepare("UPDATE Runs SET graph_id = ? WHERE run_id = ?");
-      set.BindInt(1, graph_id);
-      set.BindInt(2, run_id);
-      TF_RETURN_IF_ERROR(set.StepAndReset());
-    }
-    return Save(env, db, graph, graph_id);
-  }
-
-  static Status Save(Env* env, Sqlite* db, GraphDef* graph, int64 graph_id) {
-    GraphSaver saver{env, db, graph, graph_id};
+  static Status Save(Sqlite* db, SqliteTransaction* txn, IdAllocator* ids,
+                     GraphDef* graph, uint64 now, int64 run_id, int64* graph_id)
+      SQLITE_EXCLUSIVE_TRANSACTIONS_REQUIRED(*db) {
+    TF_RETURN_IF_ERROR(ids->CreateNewId(graph_id));
+    GraphWriter saver{db, txn, graph, now, *graph_id};
     saver.MapNameToNodeId();
-    TF_RETURN_IF_ERROR(saver.SaveNodeInputs());
-    TF_RETURN_IF_ERROR(saver.SaveNodes());
-    TF_RETURN_IF_ERROR(saver.SaveGraph());
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(saver.SaveNodeInputs(), "SaveNodeInputs");
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(saver.SaveNodes(), "SaveNodes");
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(saver.SaveGraph(run_id), "SaveGraph");
     return Status::OK();
   }
 
  private:
-  GraphSaver(Env* env, Sqlite* db, GraphDef* graph, int64 graph_id)
-      : env_(env), db_(db), graph_(graph), graph_id_(graph_id) {}
+  GraphWriter(Sqlite* db, SqliteTransaction* txn, GraphDef* graph, uint64 now,
+              int64 graph_id)
+      : db_(db), txn_(txn), graph_(graph), now_(now), graph_id_(graph_id) {}
 
   void MapNameToNodeId() {
     size_t toto = static_cast<size_t>(graph_->node_size());
@@ -202,344 +264,983 @@ class GraphSaver {
   }
 
   Status SaveNodeInputs() {
-    auto purge = db_->Prepare("DELETE FROM NodeInputs WHERE graph_id = ?");
-    purge.BindInt(1, graph_id_);
-    TF_RETURN_IF_ERROR(purge.StepAndReset());
-    auto insert = db_->Prepare(R"sql(
-      INSERT INTO NodeInputs (graph_id, node_id, idx, input_node_id, is_control)
-      VALUES (?, ?, ?, ?, ?)
-    )sql");
+    const char* sql = R"sql(
+      INSERT INTO NodeInputs (
+        graph_id,
+        node_id,
+        idx,
+        input_node_id,
+        input_node_idx,
+        is_control
+      ) VALUES (?, ?, ?, ?, ?, ?)
+    )sql";
+    SqliteStatement insert;
+    TF_RETURN_IF_ERROR(db_->Prepare(sql, &insert));
     for (int node_id = 0; node_id < graph_->node_size(); ++node_id) {
       const NodeDef& node = graph_->node(node_id);
       for (int idx = 0; idx < node.input_size(); ++idx) {
         StringPiece name = node.input(idx);
-        insert.BindInt(1, graph_id_);
-        insert.BindInt(2, node_id);
-        insert.BindInt(3, idx);
+        int64 input_node_id;
+        int64 input_node_idx = 0;
+        int64 is_control = 0;
+        size_t i = name.rfind(':');
+        if (i != StringPiece::npos) {
+          if (!strings::safe_strto64(name.substr(i + 1, name.size() - i - 1),
+                                     &input_node_idx)) {
+            return errors::DataLoss("Bad NodeDef.input: ", name);
+          }
+          name.remove_suffix(name.size() - i);
+        }
         if (!name.empty() && name[0] == '^') {
           name.remove_prefix(1);
-          insert.BindInt(5, 1);
+          is_control = 1;
         }
         auto e = name_to_node_id_.find(name);
         if (e == name_to_node_id_.end()) {
           return errors::DataLoss("Could not find node: ", name);
         }
-        insert.BindInt(4, e->second);
+        input_node_id = e->second;
+        insert.BindInt(1, graph_id_);
+        insert.BindInt(2, node_id);
+        insert.BindInt(3, idx);
+        insert.BindInt(4, input_node_id);
+        insert.BindInt(5, input_node_idx);
+        insert.BindInt(6, is_control);
+        unflushed_bytes_ += insert.size();
         TF_RETURN_WITH_CONTEXT_IF_ERROR(insert.StepAndReset(), node.name(),
                                         " -> ", name);
+        TF_RETURN_IF_ERROR(MaybeFlush());
       }
     }
     return Status::OK();
   }
 
   Status SaveNodes() {
-    auto purge = db_->Prepare("DELETE FROM Nodes WHERE graph_id = ?");
-    purge.BindInt(1, graph_id_);
-    TF_RETURN_IF_ERROR(purge.StepAndReset());
-    auto insert = db_->Prepare(R"sql(
-      INSERT INTO Nodes (graph_id, node_id, node_name, op, device, node_def)
+    const char* sql = R"sql(
+      INSERT INTO Nodes (
+        graph_id,
+        node_id,
+        node_name,
+        op,
+        device,
+        node_def)
       VALUES (?, ?, ?, ?, ?, ?)
-    )sql");
+    )sql";
+    SqliteStatement insert;
+    TF_RETURN_IF_ERROR(db_->Prepare(sql, &insert));
     for (int node_id = 0; node_id < graph_->node_size(); ++node_id) {
       NodeDef* node = graph_->mutable_node(node_id);
       insert.BindInt(1, graph_id_);
       insert.BindInt(2, node_id);
       insert.BindText(3, node->name());
+      insert.BindText(4, node->op());
+      insert.BindText(5, node->device());
       node->clear_name();
-      if (!node->op().empty()) {
-        insert.BindText(4, node->op());
-        node->clear_op();
-      }
-      if (!node->device().empty()) {
-        insert.BindText(5, node->device());
-        node->clear_device();
-      }
+      node->clear_op();
+      node->clear_device();
       node->clear_input();
-      TF_RETURN_IF_ERROR(BindProto(&insert, 6, *node));
+      string node_def;
+      if (node->SerializeToString(&node_def)) {
+        insert.BindBlobUnsafe(6, node_def);
+      }
+      unflushed_bytes_ += insert.size();
       TF_RETURN_WITH_CONTEXT_IF_ERROR(insert.StepAndReset(), node->name());
+      TF_RETURN_IF_ERROR(MaybeFlush());
     }
     return Status::OK();
   }
 
-  Status SaveGraph() {
-    auto insert = db_->Prepare(R"sql(
-      INSERT OR REPLACE INTO Graphs (graph_id, inserted_time, graph_def)
-      VALUES (?, ?, ?)
-    )sql");
-    insert.BindInt(1, graph_id_);
-    insert.BindDouble(2, GetWallTime(env_));
+  Status SaveGraph(int64 run_id) {
+    const char* sql = R"sql(
+      INSERT OR REPLACE INTO Graphs (
+        run_id,
+        graph_id,
+        inserted_time,
+        graph_def
+      ) VALUES (?, ?, ?, ?)
+    )sql";
+    SqliteStatement insert;
+    TF_RETURN_IF_ERROR(db_->Prepare(sql, &insert));
+    if (run_id != kAbsent) insert.BindInt(1, run_id);
+    insert.BindInt(2, graph_id_);
+    insert.BindDouble(3, DoubleTime(now_));
     graph_->clear_node();
-    TF_RETURN_IF_ERROR(BindProto(&insert, 3, *graph_));
+    string graph_def;
+    if (graph_->SerializeToString(&graph_def)) {
+      insert.BindBlobUnsafe(4, graph_def);
+    }
     return insert.StepAndReset();
   }
 
-  Env* env_;
-  Sqlite* db_;
-  GraphDef* graph_;
-  int64 graph_id_;
+  Status MaybeFlush() {
+    if (unflushed_bytes_ >= kFlushBytes) {
+      TF_RETURN_WITH_CONTEXT_IF_ERROR(txn_->Commit(), "flushing ",
+                                      unflushed_bytes_, " bytes");
+      unflushed_bytes_ = 0;
+    }
+    return Status::OK();
+  }
+
+  Sqlite* const db_;
+  SqliteTransaction* const txn_;
+  uint64 unflushed_bytes_ = 0;
+  GraphDef* const graph_;
+  const uint64 now_;
+  const int64 graph_id_;
   std::vector<string> name_copies_;
   std::unordered_map<StringPiece, int64, StringPieceHasher> name_to_node_id_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GraphWriter);
+};
+
+/// \brief Run metadata manager.
+///
+/// This class gives us Tag IDs we can pass to SeriesWriter. In order
+/// to do that, rows are created in the Ids, Tags, Runs, Experiments,
+/// and Users tables.
+///
+/// This class is thread safe.
+class RunMetadata {
+ public:
+  RunMetadata(IdAllocator* ids, const string& experiment_name,
+              const string& run_name, const string& user_name)
+      : ids_{ids},
+        experiment_name_{experiment_name},
+        run_name_{run_name},
+        user_name_{user_name} {
+    DCHECK(ids_ != nullptr);
+  }
+
+  const string& experiment_name() { return experiment_name_; }
+  const string& run_name() { return run_name_; }
+  const string& user_name() { return user_name_; }
+
+  int64 run_id() LOCKS_EXCLUDED(mu_) {
+    mutex_lock lock(mu_);
+    return run_id_;
+  }
+
+  Status SetGraph(Sqlite* db, uint64 now, double computed_time,
+                  std::unique_ptr<GraphDef> g) SQLITE_TRANSACTIONS_EXCLUDED(*db)
+      LOCKS_EXCLUDED(mu_) {
+    int64 run_id;
+    {
+      mutex_lock lock(mu_);
+      TF_RETURN_IF_ERROR(InitializeRun(db, now, computed_time));
+      run_id = run_id_;
+    }
+    int64 graph_id;
+    SqliteTransaction txn(*db);  // only to increase performance
+    TF_RETURN_IF_ERROR(
+        GraphWriter::Save(db, &txn, ids_, g.get(), now, run_id, &graph_id));
+    return txn.Commit();
+  }
+
+  Status GetTagId(Sqlite* db, uint64 now, double computed_time,
+                  const string& tag_name, int64* tag_id,
+                  const SummaryMetadata& metadata) LOCKS_EXCLUDED(mu_) {
+    mutex_lock lock(mu_);
+    TF_RETURN_IF_ERROR(InitializeRun(db, now, computed_time));
+    auto e = tag_ids_.find(tag_name);
+    if (e != tag_ids_.end()) {
+      *tag_id = e->second;
+      return Status::OK();
+    }
+    TF_RETURN_IF_ERROR(ids_->CreateNewId(tag_id));
+    tag_ids_[tag_name] = *tag_id;
+    TF_RETURN_IF_ERROR(
+        SetDescription(db, *tag_id, metadata.summary_description()));
+    const char* sql = R"sql(
+      INSERT INTO Tags (
+        run_id,
+        tag_id,
+        tag_name,
+        inserted_time,
+        display_name,
+        plugin_name,
+        plugin_data
+      ) VALUES (
+        :run_id,
+        :tag_id,
+        :tag_name,
+        :inserted_time,
+        :display_name,
+        :plugin_name,
+        :plugin_data
+      )
+    )sql";
+    SqliteStatement insert;
+    TF_RETURN_IF_ERROR(db->Prepare(sql, &insert));
+    if (run_id_ != kAbsent) insert.BindInt(":run_id", run_id_);
+    insert.BindInt(":tag_id", *tag_id);
+    insert.BindTextUnsafe(":tag_name", tag_name);
+    insert.BindDouble(":inserted_time", DoubleTime(now));
+    insert.BindTextUnsafe(":display_name", metadata.display_name());
+    insert.BindTextUnsafe(":plugin_name", metadata.plugin_data().plugin_name());
+    insert.BindBlobUnsafe(":plugin_data", metadata.plugin_data().content());
+    return insert.StepAndReset();
+  }
+
+  Status GetIsWatching(Sqlite* db, bool* is_watching)
+      SQLITE_TRANSACTIONS_EXCLUDED(*db) LOCKS_EXCLUDED(mu_) {
+    mutex_lock lock(mu_);
+    if (experiment_id_ == kAbsent) {
+      *is_watching = true;
+      return Status::OK();
+    }
+    const char* sql = R"sql(
+      SELECT is_watching FROM Experiments WHERE experiment_id = ?
+    )sql";
+    SqliteStatement stmt;
+    TF_RETURN_IF_ERROR(db->Prepare(sql, &stmt));
+    stmt.BindInt(1, experiment_id_);
+    TF_RETURN_IF_ERROR(stmt.StepOnce());
+    *is_watching = stmt.ColumnInt(0) != 0;
+    return Status::OK();
+  }
+
+ private:
+  Status InitializeUser(Sqlite* db, uint64 now) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    if (user_id_ != kAbsent || user_name_.empty()) return Status::OK();
+    const char* get_sql = R"sql(
+      SELECT user_id FROM Users WHERE user_name = ?
+    )sql";
+    SqliteStatement get;
+    TF_RETURN_IF_ERROR(db->Prepare(get_sql, &get));
+    get.BindText(1, user_name_);
+    bool is_done;
+    TF_RETURN_IF_ERROR(get.Step(&is_done));
+    if (!is_done) {
+      user_id_ = get.ColumnInt(0);
+      return Status::OK();
+    }
+    TF_RETURN_IF_ERROR(ids_->CreateNewId(&user_id_));
+    const char* insert_sql = R"sql(
+      INSERT INTO Users (
+        user_id,
+        user_name,
+        inserted_time
+      ) VALUES (?, ?, ?)
+    )sql";
+    SqliteStatement insert;
+    TF_RETURN_IF_ERROR(db->Prepare(insert_sql, &insert));
+    insert.BindInt(1, user_id_);
+    insert.BindText(2, user_name_);
+    insert.BindDouble(3, DoubleTime(now));
+    TF_RETURN_IF_ERROR(insert.StepAndReset());
+    return Status::OK();
+  }
+
+  Status InitializeExperiment(Sqlite* db, uint64 now, double computed_time)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    if (experiment_name_.empty()) return Status::OK();
+    if (experiment_id_ == kAbsent) {
+      TF_RETURN_IF_ERROR(InitializeUser(db, now));
+      const char* get_sql = R"sql(
+        SELECT
+          experiment_id,
+          started_time
+        FROM
+          Experiments
+        WHERE
+          user_id IS ?
+          AND experiment_name = ?
+      )sql";
+      SqliteStatement get;
+      TF_RETURN_IF_ERROR(db->Prepare(get_sql, &get));
+      if (user_id_ != kAbsent) get.BindInt(1, user_id_);
+      get.BindText(2, experiment_name_);
+      bool is_done;
+      TF_RETURN_IF_ERROR(get.Step(&is_done));
+      if (!is_done) {
+        experiment_id_ = get.ColumnInt(0);
+        experiment_started_time_ = get.ColumnInt(1);
+      } else {
+        TF_RETURN_IF_ERROR(ids_->CreateNewId(&experiment_id_));
+        experiment_started_time_ = computed_time;
+        const char* insert_sql = R"sql(
+          INSERT INTO Experiments (
+            user_id,
+            experiment_id,
+            experiment_name,
+            inserted_time,
+            started_time,
+            is_watching
+          ) VALUES (?, ?, ?, ?, ?, ?)
+        )sql";
+        SqliteStatement insert;
+        TF_RETURN_IF_ERROR(db->Prepare(insert_sql, &insert));
+        if (user_id_ != kAbsent) insert.BindInt(1, user_id_);
+        insert.BindInt(2, experiment_id_);
+        insert.BindText(3, experiment_name_);
+        insert.BindDouble(4, DoubleTime(now));
+        insert.BindDouble(5, computed_time);
+        insert.BindInt(6, 0);
+        TF_RETURN_IF_ERROR(insert.StepAndReset());
+      }
+    }
+    if (computed_time < experiment_started_time_) {
+      experiment_started_time_ = computed_time;
+      const char* update_sql = R"sql(
+        UPDATE
+          Experiments
+        SET
+          started_time = ?
+        WHERE
+          experiment_id = ?
+      )sql";
+      SqliteStatement update;
+      TF_RETURN_IF_ERROR(db->Prepare(update_sql, &update));
+      update.BindDouble(1, computed_time);
+      update.BindInt(2, experiment_id_);
+      TF_RETURN_IF_ERROR(update.StepAndReset());
+    }
+    return Status::OK();
+  }
+
+  Status InitializeRun(Sqlite* db, uint64 now, double computed_time)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    if (run_name_.empty()) return Status::OK();
+    TF_RETURN_IF_ERROR(InitializeExperiment(db, now, computed_time));
+    if (run_id_ == kAbsent) {
+      TF_RETURN_IF_ERROR(ids_->CreateNewId(&run_id_));
+      run_started_time_ = computed_time;
+      const char* insert_sql = R"sql(
+        INSERT OR REPLACE INTO Runs (
+          experiment_id,
+          run_id,
+          run_name,
+          inserted_time,
+          started_time
+        ) VALUES (?, ?, ?, ?, ?)
+      )sql";
+      SqliteStatement insert;
+      TF_RETURN_IF_ERROR(db->Prepare(insert_sql, &insert));
+      if (experiment_id_ != kAbsent) insert.BindInt(1, experiment_id_);
+      insert.BindInt(2, run_id_);
+      insert.BindText(3, run_name_);
+      insert.BindDouble(4, DoubleTime(now));
+      insert.BindDouble(5, computed_time);
+      TF_RETURN_IF_ERROR(insert.StepAndReset());
+    }
+    if (computed_time < run_started_time_) {
+      run_started_time_ = computed_time;
+      const char* update_sql = R"sql(
+        UPDATE
+          Runs
+        SET
+          started_time = ?
+        WHERE
+          run_id = ?
+      )sql";
+      SqliteStatement update;
+      TF_RETURN_IF_ERROR(db->Prepare(update_sql, &update));
+      update.BindDouble(1, computed_time);
+      update.BindInt(2, run_id_);
+      TF_RETURN_IF_ERROR(update.StepAndReset());
+    }
+    return Status::OK();
+  }
+
+  mutex mu_;
+  IdAllocator* const ids_;
+  const string experiment_name_;
+  const string run_name_;
+  const string user_name_;
+  int64 experiment_id_ GUARDED_BY(mu_) = kAbsent;
+  int64 run_id_ GUARDED_BY(mu_) = kAbsent;
+  int64 user_id_ GUARDED_BY(mu_) = kAbsent;
+  double experiment_started_time_ GUARDED_BY(mu_) = 0.0;
+  double run_started_time_ GUARDED_BY(mu_) = 0.0;
+  std::unordered_map<string, int64> tag_ids_ GUARDED_BY(mu_);
+
+  TF_DISALLOW_COPY_AND_ASSIGN(RunMetadata);
+};
+
+/// \brief Tensor writer for a single series, e.g. Tag.
+///
+/// This class can be used to write an infinite stream of Tensors to the
+/// database in a fixed block of contiguous disk space. This is
+/// accomplished using Algorithm R reservoir sampling.
+///
+/// The reservoir consists of a fixed number of rows, which are inserted
+/// using ZEROBLOB upon receiving the first sample, which is used to
+/// predict how big the other ones are likely to be. This is done
+/// transactionally in a way that tries to be mindful of other processes
+/// that might be trying to access the same DB.
+///
+/// Once the reservoir fills up, rows are replaced at random, and writes
+/// gradually become no-ops. This allows long training to go fast
+/// without configuration. The exception is when someone is actually
+/// looking at TensorBoard. When that happens, the "keep last" behavior
+/// is turned on and Append() will always result in a write.
+///
+/// If no one is watching training, this class still holds on to the
+/// most recent "dangling" Tensor, so if Finish() is called, the most
+/// recent training state can be written to disk.
+///
+/// The randomly selected sampling points should be consistent across
+/// multiple instances.
+///
+/// This class is thread safe.
+class SeriesWriter {
+ public:
+  SeriesWriter(int64 series, int slots, RunMetadata* meta)
+      : series_{series},
+        slots_{slots},
+        meta_{meta},
+        rng_{std::mt19937_64::default_seed} {
+    DCHECK(series_ > 0);
+    DCHECK(slots_ > 0);
+  }
+
+  Status Append(Sqlite* db, int64 step, uint64 now, double computed_time,
+                Tensor t) SQLITE_TRANSACTIONS_EXCLUDED(*db)
+      LOCKS_EXCLUDED(mu_) {
+    mutex_lock lock(mu_);
+    if (rowids_.empty()) {
+      Status s = Reserve(db, t);
+      if (!s.ok()) {
+        rowids_.clear();
+        return s;
+      }
+    }
+    DCHECK(rowids_.size() == slots_);
+    int64 rowid;
+    size_t i = count_;
+    if (i < slots_) {
+      rowid = last_rowid_ = rowids_[i];
+    } else {
+      i = rng_() % (i + 1);
+      if (i < slots_) {
+        rowid = last_rowid_ = rowids_[i];
+      } else {
+        bool keep_last;
+        TF_RETURN_IF_ERROR(meta_->GetIsWatching(db, &keep_last));
+        if (!keep_last) {
+          ++count_;
+          dangling_tensor_.reset(new Tensor(std::move(t)));
+          dangling_step_ = step;
+          dangling_computed_time_ = computed_time;
+          return Status::OK();
+        }
+        rowid = last_rowid_;
+      }
+    }
+    Status s = Write(db, rowid, step, computed_time, t);
+    if (s.ok()) {
+      ++count_;
+      dangling_tensor_.reset();
+    }
+    return s;
+  }
+
+  Status Finish(Sqlite* db) SQLITE_TRANSACTIONS_EXCLUDED(*db)
+      LOCKS_EXCLUDED(mu_) {
+    mutex_lock lock(mu_);
+    // Short runs: Delete unused pre-allocated Tensors.
+    if (count_ < rowids_.size()) {
+      SqliteTransaction txn(*db);
+      const char* sql = R"sql(
+        DELETE FROM Tensors WHERE rowid = ?
+      )sql";
+      SqliteStatement deleter;
+      TF_RETURN_IF_ERROR(db->Prepare(sql, &deleter));
+      for (size_t i = count_; i < rowids_.size(); ++i) {
+        deleter.BindInt(1, rowids_[i]);
+        TF_RETURN_IF_ERROR(deleter.StepAndReset());
+      }
+      TF_RETURN_IF_ERROR(txn.Commit());
+      rowids_.clear();
+    }
+    // Long runs: Make last sample be the very most recent one.
+    if (dangling_tensor_) {
+      DCHECK(last_rowid_ != kAbsent);
+      TF_RETURN_IF_ERROR(Write(db, last_rowid_, dangling_step_,
+                               dangling_computed_time_, *dangling_tensor_));
+      dangling_tensor_.reset();
+    }
+    return Status::OK();
+  }
+
+ private:
+  Status Write(Sqlite* db, int64 rowid, int64 step, double computed_time,
+               const Tensor& t) SQLITE_TRANSACTIONS_EXCLUDED(*db) {
+    if (t.dtype() == DT_STRING) {
+      if (t.dims() == 0) {
+        return Update(db, step, computed_time, t, t.scalar<string>()(), rowid);
+      } else {
+        SqliteTransaction txn(*db);
+        TF_RETURN_IF_ERROR(
+            Update(db, step, computed_time, t, StringPiece(), rowid));
+        TF_RETURN_IF_ERROR(UpdateNdString(db, t, rowid));
+        return txn.Commit();
+      }
+    } else {
+      return Update(db, step, computed_time, t, t.tensor_data(), rowid);
+    }
+  }
+
+  Status Update(Sqlite* db, int64 step, double computed_time, const Tensor& t,
+                const StringPiece& data, int64 rowid) {
+    // TODO(jart): How can we ensure reservoir fills on replace?
+    const char* sql = R"sql(
+      UPDATE OR REPLACE
+        Tensors
+      SET
+        step = ?,
+        computed_time = ?,
+        dtype = ?,
+        shape = ?,
+        data = ?
+      WHERE
+        rowid = ?
+    )sql";
+    SqliteStatement stmt;
+    TF_RETURN_IF_ERROR(db->Prepare(sql, &stmt));
+    stmt.BindInt(1, step);
+    stmt.BindDouble(2, computed_time);
+    stmt.BindInt(3, t.dtype());
+    stmt.BindText(4, StringifyShape(t.shape()));
+    stmt.BindBlobUnsafe(5, data);
+    stmt.BindInt(6, rowid);
+    TF_RETURN_IF_ERROR(stmt.StepAndReset());
+    return Status::OK();
+  }
+
+  Status UpdateNdString(Sqlite* db, const Tensor& t, int64 tensor_rowid)
+      SQLITE_EXCLUSIVE_TRANSACTIONS_REQUIRED(*db) {
+    DCHECK_EQ(t.dtype(), DT_STRING);
+    DCHECK_GT(t.dims(), 0);
+    const char* deleter_sql = R"sql(
+      DELETE FROM TensorStrings WHERE tensor_rowid = ?
+    )sql";
+    SqliteStatement deleter;
+    TF_RETURN_IF_ERROR(db->Prepare(deleter_sql, &deleter));
+    deleter.BindInt(1, tensor_rowid);
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(deleter.StepAndReset(), tensor_rowid);
+    const char* inserter_sql = R"sql(
+      INSERT INTO TensorStrings (
+        tensor_rowid,
+        idx,
+        data
+      ) VALUES (?, ?, ?)
+    )sql";
+    SqliteStatement inserter;
+    TF_RETURN_IF_ERROR(db->Prepare(inserter_sql, &inserter));
+    auto flat = t.flat<string>();
+    for (int64 i = 0; i < flat.size(); ++i) {
+      inserter.BindInt(1, tensor_rowid);
+      inserter.BindInt(2, i);
+      inserter.BindBlobUnsafe(3, flat(i));
+      TF_RETURN_WITH_CONTEXT_IF_ERROR(inserter.StepAndReset(), "i=", i);
+    }
+    return Status::OK();
+  }
+
+  Status Reserve(Sqlite* db, const Tensor& t) SQLITE_TRANSACTIONS_EXCLUDED(*db)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    SqliteTransaction txn(*db);  // only for performance
+    unflushed_bytes_ = 0;
+    if (t.dtype() == DT_STRING) {
+      if (t.dims() == 0) {
+        TF_RETURN_IF_ERROR(ReserveData(db, &txn, t.scalar<string>()().size()));
+      } else {
+        TF_RETURN_IF_ERROR(ReserveTensors(db, &txn, kReserveMinBytes));
+      }
+    } else {
+      TF_RETURN_IF_ERROR(ReserveData(db, &txn, t.tensor_data().size()));
+    }
+    return txn.Commit();
+  }
+
+  Status ReserveData(Sqlite* db, SqliteTransaction* txn, size_t size)
+      SQLITE_EXCLUSIVE_TRANSACTIONS_REQUIRED(*db)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    int64 space =
+        static_cast<int64>(static_cast<double>(size) * kReserveMultiplier);
+    if (space < kReserveMinBytes) space = kReserveMinBytes;
+    return ReserveTensors(db, txn, space);
+  }
+
+  Status ReserveTensors(Sqlite* db, SqliteTransaction* txn,
+                        int64 reserved_bytes)
+      SQLITE_EXCLUSIVE_TRANSACTIONS_REQUIRED(*db)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    const char* sql = R"sql(
+      INSERT INTO Tensors (
+        series,
+        data
+      ) VALUES (?, ZEROBLOB(?))
+    )sql";
+    SqliteStatement insert;
+    TF_RETURN_IF_ERROR(db->Prepare(sql, &insert));
+    // TODO(jart): Maybe preallocate index pages by setting step. This
+    //             is tricky because UPDATE OR REPLACE can have a side
+    //             effect of deleting preallocated rows.
+    for (int64 i = 0; i < slots_; ++i) {
+      insert.BindInt(1, series_);
+      insert.BindInt(2, reserved_bytes);
+      TF_RETURN_WITH_CONTEXT_IF_ERROR(insert.StepAndReset(), "i=", i);
+      rowids_.push_back(db->last_insert_rowid());
+      unflushed_bytes_ += reserved_bytes;
+      TF_RETURN_IF_ERROR(MaybeFlush(db, txn));
+    }
+    return Status::OK();
+  }
+
+  Status MaybeFlush(Sqlite* db, SqliteTransaction* txn)
+      SQLITE_EXCLUSIVE_TRANSACTIONS_REQUIRED(*db)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    if (unflushed_bytes_ >= kFlushBytes) {
+      TF_RETURN_WITH_CONTEXT_IF_ERROR(txn->Commit(), "flushing ",
+                                      unflushed_bytes_, " bytes");
+      unflushed_bytes_ = 0;
+    }
+    return Status::OK();
+  }
+
+  mutex mu_;
+  const int64 series_;
+  const int slots_;
+  RunMetadata* const meta_;
+  std::mt19937_64 rng_ GUARDED_BY(mu_);
+  uint64 count_ GUARDED_BY(mu_) = 0;
+  int64 last_rowid_ GUARDED_BY(mu_) = kAbsent;
+  std::vector<int64> rowids_ GUARDED_BY(mu_);
+  uint64 unflushed_bytes_ GUARDED_BY(mu_) = 0;
+  std::unique_ptr<Tensor> dangling_tensor_ GUARDED_BY(mu_);
+  int64 dangling_step_ GUARDED_BY(mu_) = 0;
+  double dangling_computed_time_ GUARDED_BY(mu_) = 0.0;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(SeriesWriter);
+};
+
+/// \brief Tensor writer for a single Run.
+///
+/// This class farms out tensors to SeriesWriter instances. It also
+/// keeps track of whether or not someone is watching the TensorBoard
+/// GUI, so it can avoid writes when possible.
+///
+/// This class is thread safe.
+class RunWriter {
+ public:
+  explicit RunWriter(RunMetadata* meta) : meta_{meta} {}
+
+  Status Append(Sqlite* db, int64 tag_id, int64 step, uint64 now,
+                double computed_time, Tensor t, int slots)
+      SQLITE_TRANSACTIONS_EXCLUDED(*db) LOCKS_EXCLUDED(mu_) {
+    SeriesWriter* writer = GetSeriesWriter(tag_id, slots);
+    return writer->Append(db, step, now, computed_time, std::move(t));
+  }
+
+  Status Finish(Sqlite* db) SQLITE_TRANSACTIONS_EXCLUDED(*db)
+      LOCKS_EXCLUDED(mu_) {
+    mutex_lock lock(mu_);
+    if (series_writers_.empty()) return Status::OK();
+    for (auto i = series_writers_.begin(); i != series_writers_.end(); ++i) {
+      if (!i->second) continue;
+      TF_RETURN_WITH_CONTEXT_IF_ERROR(i->second->Finish(db),
+                                      "finish tag_id=", i->first);
+      i->second.reset();
+    }
+    return Status::OK();
+  }
+
+ private:
+  SeriesWriter* GetSeriesWriter(int64 tag_id, int slots) LOCKS_EXCLUDED(mu_) {
+    mutex_lock sl(mu_);
+    auto spot = series_writers_.find(tag_id);
+    if (spot == series_writers_.end()) {
+      SeriesWriter* writer = new SeriesWriter(tag_id, slots, meta_);
+      series_writers_[tag_id].reset(writer);
+      return writer;
+    } else {
+      return spot->second.get();
+    }
+  }
+
+  mutex mu_;
+  RunMetadata* const meta_;
+  std::unordered_map<int64, std::unique_ptr<SeriesWriter>> series_writers_
+      GUARDED_BY(mu_);
+
+  TF_DISALLOW_COPY_AND_ASSIGN(RunWriter);
 };
 
+/// \brief SQLite implementation of SummaryWriterInterface.
+///
+/// This class is thread safe.
 class SummaryDbWriter : public SummaryWriterInterface {
  public:
-  SummaryDbWriter(Env* env, std::shared_ptr<Sqlite> db)
+  SummaryDbWriter(Env* env, Sqlite* db, const string& experiment_name,
+                  const string& run_name, const string& user_name)
       : SummaryWriterInterface(),
-        env_(env),
-        db_(std::move(db)),
-        txn_(db_),
-        run_id_{0LL} {}
-  ~SummaryDbWriter() override {}
-
-  Status Initialize(const string& experiment_name, const string& run_name,
-                    const string& user_name) {
-    mutex_lock ml(mu_);
-    insert_tensor_ = db_->Prepare(R"sql(
-      INSERT OR REPLACE INTO Tensors (tag_id, step, computed_time, tensor)
-      VALUES (?, ?, ?, ?)
-    )sql");
-    update_metadata_ = db_->Prepare(R"sql(
-      UPDATE Tags SET metadata = ? WHERE tag_id = ?
-    )sql");
-    experiment_name_ = experiment_name;
-    run_name_ = run_name;
-    user_name_ = user_name;
-    return Status::OK();
+        env_{env},
+        db_{db},
+        ids_{env_, db_},
+        meta_{&ids_, experiment_name, run_name, user_name},
+        run_{&meta_} {
+    DCHECK(env_ != nullptr);
+    db_->Ref();
+  }
+
+  ~SummaryDbWriter() override {
+    core::ScopedUnref unref(db_);
+    Status s = run_.Finish(db_);
+    if (!s.ok()) {
+      // TODO(jart): Retry on transient errors here.
+      LOG(ERROR) << s.ToString();
+    }
+    int64 run_id = meta_.run_id();
+    if (run_id == kAbsent) return;
+    const char* sql = R"sql(
+      UPDATE Runs SET finished_time = ? WHERE run_id = ?
+    )sql";
+    SqliteStatement update;
+    s = db_->Prepare(sql, &update);
+    if (s.ok()) {
+      update.BindDouble(1, DoubleTime(env_->NowMicros()));
+      update.BindInt(2, run_id);
+      s = update.StepAndReset();
+    }
+    if (!s.ok()) {
+      LOG(ERROR) << "Failed to set Runs[" << run_id
+                 << "].finish_time: " << s.ToString();
+    }
   }
 
-  // TODO(@jart): Use transactions that COMMIT on Flush()
-  // TODO(@jart): Retry Commit() on SQLITE_BUSY with exponential back-off.
   Status Flush() override { return Status::OK(); }
 
   Status WriteTensor(int64 global_step, Tensor t, const string& tag,
                      const string& serialized_metadata) override {
-    mutex_lock ml(mu_);
-    TF_RETURN_IF_ERROR(InitializeParents());
-    // TODO(@jart): Memoize tag_id.
-    int64 tag_id;
-    TF_RETURN_IF_ERROR(GetTagId(run_id_, tag, &tag_id));
-    if (!serialized_metadata.empty()) {
-      // TODO(@jart): Only update metadata for first tensor.
-      update_metadata_.BindBlobUnsafe(1, serialized_metadata);
-      update_metadata_.BindInt(2, tag_id);
-      TF_RETURN_IF_ERROR(update_metadata_.StepAndReset());
-    }
-    // TODO(@jart): Lease blocks of rowids and *_ids to minimize fragmentation.
-    // TODO(@jart): Check for random ID collisions without needing txn retry.
-    insert_tensor_.BindInt(1, tag_id);
-    insert_tensor_.BindInt(2, global_step);
-    insert_tensor_.BindDouble(3, GetWallTime(env_));
-    if (t.shape().dims() == 0 && t.dtype() == DT_INT64) {
-      insert_tensor_.BindInt(4, t.scalar<int64>()());
-    } else if (t.shape().dims() == 0 && t.dtype() == DT_DOUBLE) {
-      insert_tensor_.BindDouble(4, t.scalar<double>()());
-    } else {
-      TF_RETURN_IF_ERROR(BindTensor(&insert_tensor_, 4, t));
+    TF_RETURN_IF_ERROR(CheckSupportedType(t));
+    SummaryMetadata metadata;
+    if (!metadata.ParseFromString(serialized_metadata)) {
+      return errors::InvalidArgument("Bad serialized_metadata");
     }
-    return insert_tensor_.StepAndReset();
+    return Write(global_step, t, tag, metadata);
   }
 
   Status WriteScalar(int64 global_step, Tensor t, const string& tag) override {
-    Tensor t2;
-    TF_RETURN_IF_ERROR(CoerceScalar(t, &t2));
-    // TODO(jart): Generate scalars plugin metadata on this value.
-    return WriteTensor(global_step, std::move(t2), tag, "");
+    TF_RETURN_IF_ERROR(CheckSupportedType(t));
+    SummaryMetadata metadata;
+    PatchPluginName(&metadata, kScalarPluginName);
+    return Write(global_step, AsScalar(t), tag, metadata);
   }
 
   Status WriteGraph(int64 global_step, std::unique_ptr<GraphDef> g) override {
-    mutex_lock ml(mu_);
-    TF_RETURN_IF_ERROR(InitializeParents());
-    return txn_.Transact(GraphSaver::SaveToRun, env_, db_.get(), g.get(),
-                         run_id_);
+    uint64 now = env_->NowMicros();
+    return meta_.SetGraph(db_, now, DoubleTime(now), std::move(g));
   }
 
   Status WriteEvent(std::unique_ptr<Event> e) override {
-    switch (e->what_case()) {
-      case Event::WhatCase::kSummary: {
-        mutex_lock ml(mu_);
-        TF_RETURN_IF_ERROR(InitializeParents());
-        const Summary& summary = e->summary();
-        for (int i = 0; i < summary.value_size(); ++i) {
-          TF_RETURN_IF_ERROR(WriteSummary(e.get(), summary.value(i)));
-        }
-        return Status::OK();
-      }
-      case Event::WhatCase::kGraphDef: {
-        std::unique_ptr<GraphDef> graph{new GraphDef};
-        if (!ParseProtoUnlimited(graph.get(), e->graph_def())) {
-          return errors::DataLoss("parse event.graph_def failed");
-        }
-        return WriteGraph(e->step(), std::move(graph));
-      }
-      default:
-        // TODO(@jart): Handle other stuff.
-        return Status::OK();
-    }
+    return MigrateEvent(std::move(e));
   }
 
   Status WriteHistogram(int64 global_step, Tensor t,
                         const string& tag) override {
-    return errors::Unimplemented(
-        "SummaryDbWriter::WriteHistogram not supported. Please use ",
-        "tensorboard.summary.histogram() instead.");
+    uint64 now = env_->NowMicros();
+    std::unique_ptr<Event> e{new Event};
+    e->set_step(global_step);
+    e->set_wall_time(DoubleTime(now));
+    TF_RETURN_IF_ERROR(
+        AddTensorAsHistogramToSummary(t, tag, e->mutable_summary()));
+    return MigrateEvent(std::move(e));
   }
 
-  Status WriteImage(int64 global_step, Tensor tensor, const string& tag,
+  Status WriteImage(int64 global_step, Tensor t, const string& tag,
                     int max_images, Tensor bad_color) override {
-    return errors::Unimplemented(
-        "SummaryDbWriter::WriteImage not supported. Please use ",
-        "tensorboard.summary.image() instead.");
+    uint64 now = env_->NowMicros();
+    std::unique_ptr<Event> e{new Event};
+    e->set_step(global_step);
+    e->set_wall_time(DoubleTime(now));
+    TF_RETURN_IF_ERROR(AddTensorAsImageToSummary(t, tag, max_images, bad_color,
+                                                 e->mutable_summary()));
+    return MigrateEvent(std::move(e));
   }
 
-  Status WriteAudio(int64 global_step, Tensor tensor, const string& tag,
+  Status WriteAudio(int64 global_step, Tensor t, const string& tag,
                     int max_outputs, float sample_rate) override {
-    return errors::Unimplemented(
-        "SummaryDbWriter::WriteAudio not supported. Please use ",
-        "tensorboard.summary.audio() instead.");
+    uint64 now = env_->NowMicros();
+    std::unique_ptr<Event> e{new Event};
+    e->set_step(global_step);
+    e->set_wall_time(DoubleTime(now));
+    TF_RETURN_IF_ERROR(AddTensorAsAudioToSummary(
+        t, tag, max_outputs, sample_rate, e->mutable_summary()));
+    return MigrateEvent(std::move(e));
   }
 
   string DebugString() override { return "SummaryDbWriter"; }
 
  private:
-  Status InitializeParents() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    if (run_id_ > 0) {
-      return Status::OK();
-    }
-    int64 user_id;
-    TF_RETURN_IF_ERROR(GetUserId(user_name_, &user_id));
-    int64 experiment_id;
+  Status Write(int64 step, const Tensor& t, const string& tag,
+               const SummaryMetadata& metadata) {
+    uint64 now = env_->NowMicros();
+    double computed_time = DoubleTime(now);
+    int64 tag_id;
     TF_RETURN_IF_ERROR(
-        GetExperimentId(user_id, experiment_name_, &experiment_id));
-    TF_RETURN_IF_ERROR(GetRunId(experiment_id, run_name_, &run_id_));
+        meta_.GetTagId(db_, now, computed_time, tag, &tag_id, metadata));
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(
+        run_.Append(db_, tag_id, step, now, computed_time, t,
+                    GetSlots(t, metadata)),
+        meta_.user_name(), "/", meta_.experiment_name(), "/", meta_.run_name(),
+        "/", tag, "@", step);
     return Status::OK();
   }
 
-  Status GetUserId(const string& user_name, int64* user_id)
-      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    if (user_name.empty()) {
-      *user_id = 0LL;
-      return Status::OK();
-    }
-    SqliteStatement get_user_id = db_->Prepare(R"sql(
-      SELECT user_id FROM Users WHERE user_name = ?
-    )sql");
-    get_user_id.BindText(1, user_name);
-    bool is_done;
-    TF_RETURN_IF_ERROR(get_user_id.Step(&is_done));
-    if (!is_done) {
-      *user_id = get_user_id.ColumnInt(0);
-    } else {
-      *user_id = MakeRandomId();
-      SqliteStatement insert_user = db_->Prepare(R"sql(
-        INSERT INTO Users (user_id, user_name, inserted_time) VALUES (?, ?, ?)
-      )sql");
-      insert_user.BindInt(1, *user_id);
-      insert_user.BindText(2, user_name);
-      insert_user.BindDouble(3, GetWallTime(env_));
-      TF_RETURN_IF_ERROR(insert_user.StepAndReset());
+  Status MigrateEvent(std::unique_ptr<Event> e) {
+    switch (e->what_case()) {
+      case Event::WhatCase::kSummary: {
+        uint64 now = env_->NowMicros();
+        auto summaries = e->mutable_summary();
+        for (int i = 0; i < summaries->value_size(); ++i) {
+          Summary::Value* value = summaries->mutable_value(i);
+          TF_RETURN_WITH_CONTEXT_IF_ERROR(
+              MigrateSummary(e.get(), value, now), meta_.user_name(), "/",
+              meta_.experiment_name(), "/", meta_.run_name(), "/", value->tag(),
+              "@", e->step());
+        }
+        break;
+      }
+      case Event::WhatCase::kGraphDef:
+        TF_RETURN_WITH_CONTEXT_IF_ERROR(
+            MigrateGraph(e.get(), e->graph_def()), meta_.user_name(), "/",
+            meta_.experiment_name(), "/", meta_.run_name(), "/__graph__@",
+            e->step());
+        break;
+      default:
+        // TODO(@jart): Handle other stuff.
+        break;
     }
     return Status::OK();
   }
 
-  Status GetExperimentId(int64 user_id, const string& experiment_name,
-                         int64* experiment_id) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    // TODO(@jart): Compute started_time.
-    return GetId("Experiments", "user_id", user_id, "experiment_name",
-                 experiment_name, "experiment_id", experiment_id);
+  Status MigrateGraph(const Event* e, const string& graph_def) {
+    uint64 now = env_->NowMicros();
+    std::unique_ptr<GraphDef> graph{new GraphDef};
+    if (!ParseProtoUnlimited(graph.get(), graph_def)) {
+      return errors::InvalidArgument("bad proto");
+    }
+    return meta_.SetGraph(db_, now, e->wall_time(), std::move(graph));
   }
 
-  Status GetRunId(int64 experiment_id, const string& run_name, int64* run_id)
-      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    // TODO(@jart): Compute started_time.
-    return GetId("Runs", "experiment_id", experiment_id, "run_name", run_name,
-                 "run_id", run_id);
+  Status MigrateSummary(const Event* e, Summary::Value* s, uint64 now) {
+    switch (s->value_case()) {
+      case Summary::Value::ValueCase::kTensor:
+        TF_RETURN_WITH_CONTEXT_IF_ERROR(MigrateTensor(e, s, now), "tensor");
+        break;
+      case Summary::Value::ValueCase::kSimpleValue:
+        TF_RETURN_WITH_CONTEXT_IF_ERROR(MigrateScalar(e, s, now), "scalar");
+        break;
+      case Summary::Value::ValueCase::kHisto:
+        TF_RETURN_WITH_CONTEXT_IF_ERROR(MigrateHistogram(e, s, now), "histo");
+        break;
+      case Summary::Value::ValueCase::kImage:
+        TF_RETURN_WITH_CONTEXT_IF_ERROR(MigrateImage(e, s, now), "image");
+        break;
+      case Summary::Value::ValueCase::kAudio:
+        TF_RETURN_WITH_CONTEXT_IF_ERROR(MigrateAudio(e, s, now), "audio");
+        break;
+      default:
+        break;
+    }
+    return Status::OK();
   }
 
-  Status GetTagId(int64 run_id, const string& tag_name, int64* tag_id)
-      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    return GetId("Tags", "run_id", run_id, "tag_name", tag_name, "tag_id",
-                 tag_id);
+  Status MigrateTensor(const Event* e, Summary::Value* s, uint64 now) {
+    Tensor t;
+    if (!t.FromProto(s->tensor())) return errors::InvalidArgument("bad proto");
+    TF_RETURN_IF_ERROR(CheckSupportedType(t));
+    int64 tag_id;
+    TF_RETURN_IF_ERROR(meta_.GetTagId(db_, now, e->wall_time(), s->tag(),
+                                      &tag_id, s->metadata()));
+    return run_.Append(db_, tag_id, e->step(), now, e->wall_time(), t,
+                       GetSlots(t, s->metadata()));
   }
 
-  Status GetId(const char* table, const char* parent_id_field, int64 parent_id,
-               const char* name_field, const string& name, const char* id_field,
-               int64* id) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    if (name.empty()) {
-      *id = 0LL;
-      return Status::OK();
-    }
-    SqliteStatement select = db_->Prepare(
-        strings::Printf("SELECT %s FROM %s WHERE %s = ? AND %s = ?", id_field,
-                        table, parent_id_field, name_field));
-    if (parent_id > 0) {
-      select.BindInt(1, parent_id);
+  // TODO(jart): Refactor Summary -> Tensor logic into separate file.
+
+  Status MigrateScalar(const Event* e, Summary::Value* s, uint64 now) {
+    // See tensorboard/plugins/scalar/summary.py and data_compat.py
+    Tensor t{DT_FLOAT, {}};
+    t.scalar<float>()() = s->simple_value();
+    int64 tag_id;
+    PatchPluginName(s->mutable_metadata(), kScalarPluginName);
+    TF_RETURN_IF_ERROR(meta_.GetTagId(db_, now, e->wall_time(), s->tag(),
+                                      &tag_id, s->metadata()));
+    return run_.Append(db_, tag_id, e->step(), now, e->wall_time(),
+                       std::move(t), kScalarSlots);
+  }
+
+  Status MigrateHistogram(const Event* e, Summary::Value* s, uint64 now) {
+    const HistogramProto& histo = s->histo();
+    int k = histo.bucket_size();
+    if (k != histo.bucket_limit_size()) {
+      return errors::InvalidArgument("size mismatch");
     }
-    select.BindText(2, name);
-    bool is_done;
-    TF_RETURN_IF_ERROR(select.Step(&is_done));
-    if (!is_done) {
-      *id = select.ColumnInt(0);
-    } else {
-      *id = MakeRandomId();
-      SqliteStatement insert = db_->Prepare(strings::Printf(
-          "INSERT INTO %s (%s, %s, %s, inserted_time) VALUES (?, ?, ?, ?)",
-          table, parent_id_field, id_field, name_field));
-      if (parent_id > 0) {
-        insert.BindInt(1, parent_id);
-      }
-      insert.BindInt(2, *id);
-      insert.BindText(3, name);
-      insert.BindDouble(4, GetWallTime(env_));
-      TF_RETURN_IF_ERROR(insert.StepAndReset());
+    // See tensorboard/plugins/histogram/summary.py and data_compat.py
+    Tensor t{DT_DOUBLE, {k, 3}};
+    auto data = t.flat<double>();
+    for (int i = 0; i < k; ++i) {
+      double left_edge = ((i - 1 >= 0) ? histo.bucket_limit(i - 1)
+                                       : std::numeric_limits<double>::min());
+      double right_edge = ((i + 1 < k) ? histo.bucket_limit(i + 1)
+                                       : std::numeric_limits<double>::max());
+      data(i + 0) = left_edge;
+      data(i + 1) = right_edge;
+      data(i + 2) = histo.bucket(i);
     }
-    return Status::OK();
+    int64 tag_id;
+    PatchPluginName(s->mutable_metadata(), kHistogramPluginName);
+    TF_RETURN_IF_ERROR(meta_.GetTagId(db_, now, e->wall_time(), s->tag(),
+                                      &tag_id, s->metadata()));
+    return run_.Append(db_, tag_id, e->step(), now, e->wall_time(),
+                       std::move(t), kHistogramSlots);
   }
 
-  Status WriteSummary(const Event* e, const Summary::Value& summary)
-      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+  Status MigrateImage(const Event* e, Summary::Value* s, uint64 now) {
+    // See tensorboard/plugins/image/summary.py and data_compat.py
+    Tensor t{DT_STRING, {3}};
+    auto img = s->mutable_image();
+    t.flat<string>()(0) = strings::StrCat(img->width());
+    t.flat<string>()(1) = strings::StrCat(img->height());
+    t.flat<string>()(2) = std::move(*img->mutable_encoded_image_string());
     int64 tag_id;
-    TF_RETURN_IF_ERROR(GetTagId(run_id_, summary.tag(), &tag_id));
-    insert_tensor_.BindInt(1, tag_id);
-    insert_tensor_.BindInt(2, e->step());
-    insert_tensor_.BindDouble(3, e->wall_time());
-    switch (summary.value_case()) {
-      case Summary::Value::ValueCase::kSimpleValue:
-        insert_tensor_.BindDouble(4, summary.simple_value());
-        break;
-      default:
-        // TODO(@jart): Handle the rest.
-        return Status::OK();
-    }
-    return insert_tensor_.StepAndReset();
+    PatchPluginName(s->mutable_metadata(), kImagePluginName);
+    TF_RETURN_IF_ERROR(meta_.GetTagId(db_, now, e->wall_time(), s->tag(),
+                                      &tag_id, s->metadata()));
+    return run_.Append(db_, tag_id, e->step(), now, e->wall_time(),
+                       std::move(t), kImageSlots);
   }
 
-  mutex mu_;
-  Env* env_;
-  std::shared_ptr<Sqlite> db_ GUARDED_BY(mu_);
-  Transactor txn_ GUARDED_BY(mu_);
-  SqliteStatement insert_tensor_ GUARDED_BY(mu_);
-  SqliteStatement update_metadata_ GUARDED_BY(mu_);
-  string user_name_ GUARDED_BY(mu_);
-  string experiment_name_ GUARDED_BY(mu_);
-  string run_name_ GUARDED_BY(mu_);
-  int64 run_id_ GUARDED_BY(mu_);
+  Status MigrateAudio(const Event* e, Summary::Value* s, uint64 now) {
+    // See tensorboard/plugins/audio/summary.py and data_compat.py
+    Tensor t{DT_STRING, {1, 2}};
+    auto wav = s->mutable_audio();
+    t.flat<string>()(0) = std::move(*wav->mutable_encoded_audio_string());
+    t.flat<string>()(1) = "";
+    int64 tag_id;
+    PatchPluginName(s->mutable_metadata(), kAudioPluginName);
+    TF_RETURN_IF_ERROR(meta_.GetTagId(db_, now, e->wall_time(), s->tag(),
+                                      &tag_id, s->metadata()));
+    return run_.Append(db_, tag_id, e->step(), now, e->wall_time(),
+                       std::move(t), kAudioSlots);
+  }
+
+  Env* const env_;
+  Sqlite* const db_;
+  IdAllocator ids_;
+  RunMetadata meta_;
+  RunWriter run_;
 };
 
 }  // namespace
 
-Status CreateSummaryDbWriter(std::shared_ptr<Sqlite> db,
-                             const string& experiment_name,
+Status CreateSummaryDbWriter(Sqlite* db, const string& experiment_name,
                              const string& run_name, const string& user_name,
                              Env* env, SummaryWriterInterface** result) {
-  TF_RETURN_IF_ERROR(SetupTensorboardSqliteDb(db));
-  SummaryDbWriter* w = new SummaryDbWriter(env, std::move(db));
-  const Status s = w->Initialize(experiment_name, run_name, user_name);
-  if (!s.ok()) {
-    w->Unref();
-    *result = nullptr;
-    return s;
-  }
-  *result = w;
+  *result = new SummaryDbWriter(env, db, experiment_name, run_name, user_name);
   return Status::OK();
 }
 
diff --git a/tensorflow/contrib/tensorboard/db/summary_db_writer.h b/tensorflow/contrib/tensorboard/db/summary_db_writer.h
index 74f61e50b7cdf4b4151162a2e1e5e0af0d468be2..746da1533b157bf7b2be5c85ada8b61ba224cc3e 100644
--- a/tensorflow/contrib/tensorboard/db/summary_db_writer.h
+++ b/tensorflow/contrib/tensorboard/db/summary_db_writer.h
@@ -19,21 +19,21 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/db/sqlite.h"
 #include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
 /// \brief Creates SQLite SummaryWriterInterface.
 ///
 /// This can be used to write tensors from the execution graph directly
-/// to a database. The schema will be created automatically, but only
-/// if necessary. Entries in the Users, Experiments, and Runs tables
-/// will be created automatically if they don't already exist.
+/// to a database. The schema must be created beforehand. Entries in
+/// Users, Experiments, and Runs tables will be created automatically
+/// if they don't already exist.
 ///
 /// Please note that the type signature of this function may change in
 /// the future if support for other DBs is added to core.
-Status CreateSummaryDbWriter(std::shared_ptr<Sqlite> db,
-                             const string& experiment_name,
+///
+/// The result holds a new reference to db.
+Status CreateSummaryDbWriter(Sqlite* db, const string& experiment_name,
                              const string& run_name, const string& user_name,
                              Env* env, SummaryWriterInterface** result);
 
diff --git a/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc b/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc
index 625861fa6b137c6880c2072d7522f11c22720774..29b8063218de72aac1a73bbfb440e75fcdd5013f 100644
--- a/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/tensorboard/db/summary_db_writer.h"
 
+#include "tensorflow/contrib/tensorboard/db/schema.h"
+#include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/summary.pb.h"
@@ -27,8 +29,6 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-const float kTolerance = 1e-5;
-
 Tensor MakeScalarInt64(int64 x) {
   Tensor t(DT_INT64, TensorShape({}));
   t.scalar<int64>()() = x;
@@ -48,17 +48,22 @@ class FakeClockEnv : public EnvWrapper {
 
 class SummaryDbWriterTest : public ::testing::Test {
  protected:
-  void SetUp() override { db_ = Sqlite::Open(":memory:").ValueOrDie(); }
+  void SetUp() override {
+    TF_ASSERT_OK(Sqlite::Open(":memory:", SQLITE_OPEN_READWRITE, &db_));
+    TF_ASSERT_OK(SetupTensorboardSqliteDb(db_));
+  }
 
   void TearDown() override {
     if (writer_ != nullptr) {
       writer_->Unref();
       writer_ = nullptr;
     }
+    db_->Unref();
+    db_ = nullptr;
   }
 
   int64 QueryInt(const string& sql) {
-    SqliteStatement stmt = db_->Prepare(sql);
+    SqliteStatement stmt = db_->PrepareOrDie(sql);
     bool is_done;
     Status s = stmt.Step(&is_done);
     if (!s.ok() || is_done) {
@@ -69,7 +74,7 @@ class SummaryDbWriterTest : public ::testing::Test {
   }
 
   double QueryDouble(const string& sql) {
-    SqliteStatement stmt = db_->Prepare(sql);
+    SqliteStatement stmt = db_->PrepareOrDie(sql);
     bool is_done;
     Status s = stmt.Step(&is_done);
     if (!s.ok() || is_done) {
@@ -80,7 +85,7 @@ class SummaryDbWriterTest : public ::testing::Test {
   }
 
   string QueryString(const string& sql) {
-    SqliteStatement stmt = db_->Prepare(sql);
+    SqliteStatement stmt = db_->PrepareOrDie(sql);
     bool is_done;
     Status s = stmt.Step(&is_done);
     if (!s.ok() || is_done) {
@@ -91,7 +96,7 @@ class SummaryDbWriterTest : public ::testing::Test {
   }
 
   FakeClockEnv env_;
-  std::shared_ptr<Sqlite> db_;
+  Sqlite* db_ = nullptr;
   SummaryWriterInterface* writer_ = nullptr;
 };
 
@@ -101,6 +106,7 @@ TEST_F(SummaryDbWriterTest, NothingWritten_NoRowsCreated) {
   TF_ASSERT_OK(writer_->Flush());
   writer_->Unref();
   writer_ = nullptr;
+  EXPECT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Ids"));
   EXPECT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Users"));
   EXPECT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Experiments"));
   EXPECT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Runs"));
@@ -109,20 +115,31 @@ TEST_F(SummaryDbWriterTest, NothingWritten_NoRowsCreated) {
 }
 
 TEST_F(SummaryDbWriterTest, TensorsWritten_RowsGetInitialized) {
+  SummaryMetadata metadata;
+  metadata.set_display_name("display_name");
+  metadata.set_summary_description("description");
+  metadata.mutable_plugin_data()->set_plugin_name("plugin_name");
+  metadata.mutable_plugin_data()->set_content("plugin_data");
+  SummaryMetadata metadata_nope;
+  metadata_nope.set_display_name("nope");
+  metadata_nope.set_summary_description("nope");
+  metadata_nope.mutable_plugin_data()->set_plugin_name("nope");
+  metadata_nope.mutable_plugin_data()->set_content("nope");
   TF_ASSERT_OK(CreateSummaryDbWriter(db_, "mad-science", "train", "jart", &env_,
                                      &writer_));
   env_.AdvanceByMillis(23);
   TF_ASSERT_OK(writer_->WriteTensor(1, MakeScalarInt64(123LL), "taggy",
-                                    "this-is-metaaa"));
+                                    metadata.SerializeAsString()));
   env_.AdvanceByMillis(23);
-  TF_ASSERT_OK(writer_->WriteTensor(2, MakeScalarInt64(314LL), "taggy", ""));
+  TF_ASSERT_OK(writer_->WriteTensor(2, MakeScalarInt64(314LL), "taggy",
+                                    metadata_nope.SerializeAsString()));
   TF_ASSERT_OK(writer_->Flush());
 
   ASSERT_EQ(1LL, QueryInt("SELECT COUNT(*) FROM Users"));
   ASSERT_EQ(1LL, QueryInt("SELECT COUNT(*) FROM Experiments"));
   ASSERT_EQ(1LL, QueryInt("SELECT COUNT(*) FROM Runs"));
   ASSERT_EQ(1LL, QueryInt("SELECT COUNT(*) FROM Tags"));
-  ASSERT_EQ(2LL, QueryInt("SELECT COUNT(*) FROM Tensors"));
+  ASSERT_EQ(10000LL, QueryInt("SELECT COUNT(*) FROM Tensors"));
 
   int64 user_id = QueryInt("SELECT user_id FROM Users");
   int64 experiment_id = QueryInt("SELECT experiment_id FROM Experiments");
@@ -148,33 +165,30 @@ TEST_F(SummaryDbWriterTest, TensorsWritten_RowsGetInitialized) {
   EXPECT_EQ(run_id, QueryInt("SELECT run_id FROM Tags"));
   EXPECT_EQ("taggy", QueryString("SELECT tag_name FROM Tags"));
   EXPECT_EQ(0.023, QueryDouble("SELECT inserted_time FROM Tags"));
-  EXPECT_EQ("this-is-metaaa", QueryString("SELECT metadata FROM Tags"));
 
-  EXPECT_EQ(tag_id, QueryInt("SELECT tag_id FROM Tensors WHERE step = 1"));
+  EXPECT_EQ("display_name", QueryString("SELECT display_name FROM Tags"));
+  EXPECT_EQ("plugin_name", QueryString("SELECT plugin_name FROM Tags"));
+  EXPECT_EQ("plugin_data", QueryString("SELECT plugin_data FROM Tags"));
+  EXPECT_EQ("description", QueryString("SELECT description FROM Descriptions"));
+
+  EXPECT_EQ(tag_id, QueryInt("SELECT series FROM Tensors WHERE step = 1"));
   EXPECT_EQ(0.023,
             QueryDouble("SELECT computed_time FROM Tensors WHERE step = 1"));
-  EXPECT_EQ("this-is-metaaa", QueryString("SELECT metadata FROM Tags"));
-  EXPECT_FALSE(
-      QueryString("SELECT tensor FROM Tensors WHERE step = 1").empty());
 
-  EXPECT_EQ(tag_id, QueryInt("SELECT tag_id FROM Tensors WHERE step = 2"));
+  EXPECT_EQ(tag_id, QueryInt("SELECT series FROM Tensors WHERE step = 2"));
   EXPECT_EQ(0.046,
             QueryDouble("SELECT computed_time FROM Tensors WHERE step = 2"));
-  EXPECT_EQ("this-is-metaaa", QueryString("SELECT metadata FROM Tags"));
-  EXPECT_FALSE(
-      QueryString("SELECT tensor FROM Tensors WHERE step = 2").empty());
 }
 
 TEST_F(SummaryDbWriterTest, EmptyParentNames_NoParentsCreated) {
   TF_ASSERT_OK(CreateSummaryDbWriter(db_, "", "", "", &env_, &writer_));
-  TF_ASSERT_OK(writer_->WriteTensor(1, MakeScalarInt64(123LL), "taggy",
-                                    "this-is-metaaa"));
+  TF_ASSERT_OK(writer_->WriteTensor(1, MakeScalarInt64(123LL), "taggy", ""));
   TF_ASSERT_OK(writer_->Flush());
   ASSERT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Users"));
   ASSERT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Experiments"));
   ASSERT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Runs"));
   ASSERT_EQ(1LL, QueryInt("SELECT COUNT(*) FROM Tags"));
-  ASSERT_EQ(1LL, QueryInt("SELECT COUNT(*) FROM Tensors"));
+  ASSERT_EQ(10000LL, QueryInt("SELECT COUNT(*) FROM Tensors"));
 }
 
 TEST_F(SummaryDbWriterTest, WriteEvent_Scalar) {
@@ -191,33 +205,24 @@ TEST_F(SummaryDbWriterTest, WriteEvent_Scalar) {
   TF_ASSERT_OK(writer_->WriteEvent(std::move(e)));
   TF_ASSERT_OK(writer_->Flush());
   ASSERT_EQ(2LL, QueryInt("SELECT COUNT(*) FROM Tags"));
-  ASSERT_EQ(2LL, QueryInt("SELECT COUNT(*) FROM Tensors"));
+  ASSERT_EQ(20000LL, QueryInt("SELECT COUNT(*) FROM Tensors"));
   int64 tag1_id = QueryInt("SELECT tag_id FROM Tags WHERE tag_name = 'π'");
   int64 tag2_id = QueryInt("SELECT tag_id FROM Tags WHERE tag_name = 'φ'");
   EXPECT_GT(tag1_id, 0LL);
   EXPECT_GT(tag2_id, 0LL);
   EXPECT_EQ(123.456, QueryDouble(strings::StrCat(
-                         "SELECT computed_time FROM Tensors WHERE tag_id = ",
+                         "SELECT computed_time FROM Tensors WHERE series = ",
                          tag1_id, " AND step = 7")));
   EXPECT_EQ(123.456, QueryDouble(strings::StrCat(
-                         "SELECT computed_time FROM Tensors WHERE tag_id = ",
+                         "SELECT computed_time FROM Tensors WHERE series = ",
                          tag2_id, " AND step = 7")));
-  EXPECT_NEAR(3.14,
-              QueryDouble(strings::StrCat(
-                  "SELECT tensor FROM Tensors WHERE tag_id = ", tag1_id,
-                  " AND step = 7")),
-              kTolerance);  // Summary::simple_value is float
-  EXPECT_NEAR(1.61,
-              QueryDouble(strings::StrCat(
-                  "SELECT tensor FROM Tensors WHERE tag_id = ", tag2_id,
-                  " AND step = 7")),
-              kTolerance);
 }
 
 TEST_F(SummaryDbWriterTest, WriteGraph) {
   TF_ASSERT_OK(CreateSummaryDbWriter(db_, "", "R", "", &env_, &writer_));
   env_.AdvanceByMillis(23);
   GraphDef graph;
+  graph.mutable_library()->add_gradient()->set_function_name("funk");
   NodeDef* node = graph.add_node();
   node->set_name("x");
   node->set_op("Placeholder");
@@ -243,11 +248,17 @@ TEST_F(SummaryDbWriterTest, WriteGraph) {
   ASSERT_EQ(4LL, QueryInt("SELECT COUNT(*) FROM Nodes"));
   ASSERT_EQ(3LL, QueryInt("SELECT COUNT(*) FROM NodeInputs"));
 
+  ASSERT_EQ(QueryInt("SELECT run_id FROM Runs"),
+            QueryInt("SELECT run_id FROM Graphs"));
+
   int64 graph_id = QueryInt("SELECT graph_id FROM Graphs");
   EXPECT_GT(graph_id, 0LL);
-  EXPECT_EQ(graph_id, QueryInt("SELECT graph_id FROM Runs"));
   EXPECT_EQ(0.023, QueryDouble("SELECT inserted_time FROM Graphs"));
-  EXPECT_FALSE(QueryString("SELECT graph_def FROM Graphs").empty());
+
+  GraphDef graph2;
+  graph2.ParseFromString(QueryString("SELECT graph_def FROM Graphs"));
+  EXPECT_EQ(0, graph2.node_size());
+  EXPECT_EQ("funk", graph2.library().gradient(0).function_name());
 
   EXPECT_EQ("x", QueryString("SELECT node_name FROM Nodes WHERE node_id = 0"));
   EXPECT_EQ("y", QueryString("SELECT node_name FROM Nodes WHERE node_id = 1"));
@@ -290,31 +301,38 @@ TEST_F(SummaryDbWriterTest, WriteGraph) {
   EXPECT_EQ(1LL, QueryInt("SELECT is_control FROM NodeInputs WHERE idx = 2"));
 }
 
-TEST_F(SummaryDbWriterTest, WriteScalarInt32_CoercesToInt64) {
-  TF_ASSERT_OK(CreateSummaryDbWriter(db_, "", "", "", &env_, &writer_));
-  Tensor t(DT_INT32, {});
-  t.scalar<int32>()() = -17;
-  TF_ASSERT_OK(writer_->WriteScalar(1, t, "t"));
-  TF_ASSERT_OK(writer_->Flush());
-  ASSERT_EQ(-17LL, QueryInt("SELECT tensor FROM Tensors"));
-}
-
-TEST_F(SummaryDbWriterTest, WriteScalarInt8_CoercesToInt64) {
-  TF_ASSERT_OK(CreateSummaryDbWriter(db_, "", "", "", &env_, &writer_));
-  Tensor t(DT_INT8, {});
-  t.scalar<int8>()() = static_cast<int8>(-17);
-  TF_ASSERT_OK(writer_->WriteScalar(1, t, "t"));
+TEST_F(SummaryDbWriterTest, UsesIdsTable) {
+  SummaryMetadata metadata;
+  TF_ASSERT_OK(CreateSummaryDbWriter(db_, "mad-science", "train", "jart", &env_,
+                                     &writer_));
+  env_.AdvanceByMillis(23);
+  TF_ASSERT_OK(writer_->WriteTensor(1, MakeScalarInt64(123LL), "taggy",
+                                    metadata.SerializeAsString()));
   TF_ASSERT_OK(writer_->Flush());
-  ASSERT_EQ(-17LL, QueryInt("SELECT tensor FROM Tensors"));
+  ASSERT_EQ(4LL, QueryInt("SELECT COUNT(*) FROM Ids"));
+  EXPECT_EQ(4LL, QueryInt(strings::StrCat(
+                     "SELECT COUNT(*) FROM Ids WHERE id IN (",
+                     QueryInt("SELECT user_id FROM Users"), ", ",
+                     QueryInt("SELECT experiment_id FROM Experiments"), ", ",
+                     QueryInt("SELECT run_id FROM Runs"), ", ",
+                     QueryInt("SELECT tag_id FROM Tags"), ")")));
 }
 
-TEST_F(SummaryDbWriterTest, WriteScalarUint8_CoercesToInt64) {
-  TF_ASSERT_OK(CreateSummaryDbWriter(db_, "", "", "", &env_, &writer_));
-  Tensor t(DT_UINT8, {});
-  t.scalar<uint8>()() = static_cast<uint8>(254);
-  TF_ASSERT_OK(writer_->WriteScalar(1, t, "t"));
+TEST_F(SummaryDbWriterTest, SetsRunFinishedTime) {
+  SummaryMetadata metadata;
+  TF_ASSERT_OK(CreateSummaryDbWriter(db_, "mad-science", "train", "jart", &env_,
+                                     &writer_));
+  env_.AdvanceByMillis(23);
+  TF_ASSERT_OK(writer_->WriteTensor(1, MakeScalarInt64(123LL), "taggy",
+                                    metadata.SerializeAsString()));
   TF_ASSERT_OK(writer_->Flush());
-  ASSERT_EQ(254LL, QueryInt("SELECT tensor FROM Tensors"));
+  ASSERT_EQ(0.023, QueryDouble("SELECT started_time FROM Runs"));
+  ASSERT_EQ(0.0, QueryDouble("SELECT finished_time FROM Runs"));
+  env_.AdvanceByMillis(23);
+  writer_->Unref();
+  writer_ = nullptr;
+  ASSERT_EQ(0.023, QueryDouble("SELECT started_time FROM Runs"));
+  ASSERT_EQ(0.046, QueryDouble("SELECT finished_time FROM Runs"));
 }
 
 }  // namespace
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc b/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..85b3e7231bcb433e9510522597c03c5f764f06cf
--- /dev/null
+++ b/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
@@ -0,0 +1,190 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/tensorboard/db/summary_file_writer.h"
+
+#include "tensorflow/contrib/tensorboard/db/summary_converter.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/util/events_writer.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace {
+
+class SummaryFileWriter : public SummaryWriterInterface {
+ public:
+  SummaryFileWriter(int max_queue, int flush_millis, Env* env)
+      : SummaryWriterInterface(),
+        is_initialized_(false),
+        max_queue_(max_queue),
+        flush_millis_(flush_millis),
+        env_(env) {}
+
+  Status Initialize(const string& logdir, const string& filename_suffix) {
+    const Status is_dir = env_->IsDirectory(logdir);
+    if (!is_dir.ok()) {
+      if (is_dir.code() != tensorflow::error::NOT_FOUND) {
+        return is_dir;
+      }
+      TF_RETURN_IF_ERROR(env_->RecursivelyCreateDir(logdir));
+    }
+    mutex_lock ml(mu_);
+    events_writer_ =
+        tensorflow::MakeUnique<EventsWriter>(io::JoinPath(logdir, "events"));
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(
+        events_writer_->InitWithSuffix(filename_suffix),
+        "Could not initialize events writer.");
+    last_flush_ = env_->NowMicros();
+    is_initialized_ = true;
+    return Status::OK();
+  }
+
+  Status Flush() override {
+    mutex_lock ml(mu_);
+    if (!is_initialized_) {
+      return errors::FailedPrecondition("Class was not properly initialized.");
+    }
+    return InternalFlush();
+  }
+
+  ~SummaryFileWriter() override {
+    (void)Flush();  // Ignore errors.
+  }
+
+  Status WriteTensor(int64 global_step, Tensor t, const string& tag,
+                     const string& serialized_metadata) override {
+    std::unique_ptr<Event> e{new Event};
+    e->set_step(global_step);
+    e->set_wall_time(GetWallTime());
+    Summary::Value* v = e->mutable_summary()->add_value();
+    t.AsProtoTensorContent(v->mutable_tensor());
+    v->set_tag(tag);
+    if (!serialized_metadata.empty()) {
+      v->mutable_metadata()->ParseFromString(serialized_metadata);
+    }
+    return WriteEvent(std::move(e));
+  }
+
+  Status WriteScalar(int64 global_step, Tensor t, const string& tag) override {
+    std::unique_ptr<Event> e{new Event};
+    e->set_step(global_step);
+    e->set_wall_time(GetWallTime());
+    TF_RETURN_IF_ERROR(
+        AddTensorAsScalarToSummary(t, tag, e->mutable_summary()));
+    return WriteEvent(std::move(e));
+  }
+
+  Status WriteHistogram(int64 global_step, Tensor t,
+                        const string& tag) override {
+    std::unique_ptr<Event> e{new Event};
+    e->set_step(global_step);
+    e->set_wall_time(GetWallTime());
+    TF_RETURN_IF_ERROR(
+        AddTensorAsHistogramToSummary(t, tag, e->mutable_summary()));
+    return WriteEvent(std::move(e));
+  }
+
+  Status WriteImage(int64 global_step, Tensor t, const string& tag,
+                    int max_images, Tensor bad_color) override {
+    std::unique_ptr<Event> e{new Event};
+    e->set_step(global_step);
+    e->set_wall_time(GetWallTime());
+    TF_RETURN_IF_ERROR(AddTensorAsImageToSummary(t, tag, max_images, bad_color,
+                                                 e->mutable_summary()));
+    return WriteEvent(std::move(e));
+  }
+
+  Status WriteAudio(int64 global_step, Tensor t, const string& tag,
+                    int max_outputs, float sample_rate) override {
+    std::unique_ptr<Event> e{new Event};
+    e->set_step(global_step);
+    e->set_wall_time(GetWallTime());
+    TF_RETURN_IF_ERROR(AddTensorAsAudioToSummary(
+        t, tag, max_outputs, sample_rate, e->mutable_summary()));
+    return WriteEvent(std::move(e));
+  }
+
+  Status WriteGraph(int64 global_step,
+                    std::unique_ptr<GraphDef> graph) override {
+    std::unique_ptr<Event> e{new Event};
+    e->set_step(global_step);
+    e->set_wall_time(GetWallTime());
+    graph->SerializeToString(e->mutable_graph_def());
+    return WriteEvent(std::move(e));
+  }
+
+  Status WriteEvent(std::unique_ptr<Event> event) override {
+    mutex_lock ml(mu_);
+    queue_.emplace_back(std::move(event));
+    if (queue_.size() >= max_queue_ ||
+        env_->NowMicros() - last_flush_ > 1000 * flush_millis_) {
+      return InternalFlush();
+    }
+    return Status::OK();
+  }
+
+  string DebugString() override { return "SummaryFileWriter"; }
+
+ private:
+  double GetWallTime() {
+    return static_cast<double>(env_->NowMicros()) / 1.0e6;
+  }
+
+  Status InternalFlush() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    for (const std::unique_ptr<Event>& e : queue_) {
+      events_writer_->WriteEvent(*e);
+    }
+    queue_.clear();
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(events_writer_->Flush(),
+                                    "Could not flush events file.");
+    last_flush_ = env_->NowMicros();
+    return Status::OK();
+  }
+
+  bool is_initialized_;
+  const int max_queue_;
+  const int flush_millis_;
+  uint64 last_flush_;
+  Env* env_;
+  mutex mu_;
+  std::vector<std::unique_ptr<Event>> queue_ GUARDED_BY(mu_);
+  // A pointer to allow deferred construction.
+  std::unique_ptr<EventsWriter> events_writer_ GUARDED_BY(mu_);
+  std::vector<std::pair<string, SummaryMetadata>> registered_summaries_
+      GUARDED_BY(mu_);
+};
+
+}  // namespace
+
+Status CreateSummaryFileWriter(int max_queue, int flush_millis,
+                               const string& logdir,
+                               const string& filename_suffix, Env* env,
+                               SummaryWriterInterface** result) {
+  SummaryFileWriter* w = new SummaryFileWriter(max_queue, flush_millis, env);
+  const Status s = w->Initialize(logdir, filename_suffix);
+  if (!s.ok()) {
+    w->Unref();
+    *result = nullptr;
+    return s;
+  }
+  *result = w;
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer.h b/tensorflow/contrib/tensorboard/db/summary_file_writer.h
new file mode 100644
index 0000000000000000000000000000000000000000..73b0a5542beabdc460c32156dd44aacc5f08610a
--- /dev/null
+++ b/tensorflow/contrib/tensorboard/db/summary_file_writer.h
@@ -0,0 +1,43 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_FILE_WRITER_H_
+#define TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_FILE_WRITER_H_
+
+#include "tensorflow/core/kernels/summary_interface.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+/// \brief Creates SummaryWriterInterface which writes to a file.
+///
+/// The file is an append-only records file of tf.Event protos. That
+/// makes this summary writer suitable for file systems like GCS.
+///
+/// It will enqueue up to max_queue summaries, and flush at least every
+/// flush_millis milliseconds. The summaries will be written to the
+/// directory specified by logdir and with the filename suffixed by
+/// filename_suffix. The caller owns a reference to result if the
+/// returned status is ok. The Env object must not be destroyed until
+/// after the returned writer.
+Status CreateSummaryFileWriter(int max_queue, int flush_millis,
+                               const string& logdir,
+                               const string& filename_suffix, Env* env,
+                               SummaryWriterInterface** result);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_FILE_WRITER_H_
diff --git a/tensorflow/core/kernels/summary_interface_test.cc b/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
similarity index 93%
rename from tensorflow/core/kernels/summary_interface_test.cc
rename to tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
index 58e021a0b3e889ce1efe1bb5c73bcc74e16db139..c61b4655961664a6c9c22a5f6d6f26a55c34bfcd 100644
--- a/tensorflow/core/kernels/summary_interface_test.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/summary_interface.h"
+#include "tensorflow/contrib/tensorboard/db/summary_file_writer.h"
 
 #include "tensorflow/core/framework/summary.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -37,7 +37,7 @@ class FakeClockEnv : public EnvWrapper {
   uint64 current_millis_;
 };
 
-class SummaryInterfaceTest : public ::testing::Test {
+class SummaryFileWriterTest : public ::testing::Test {
  protected:
   Status SummaryTestHelper(
       const string& test_name,
@@ -47,8 +47,8 @@ class SummaryInterfaceTest : public ::testing::Test {
     CHECK(tests->insert(test_name).second) << ": " << test_name;
 
     SummaryWriterInterface* writer;
-    TF_CHECK_OK(CreateSummaryWriter(1, 1, testing::TmpDir(), test_name, &env_,
-                                    &writer));
+    TF_CHECK_OK(CreateSummaryFileWriter(1, 1, testing::TmpDir(), test_name,
+                                        &env_, &writer));
     core::ScopedUnref deleter(writer);
 
     TF_CHECK_OK(writer_fn(writer));
@@ -87,7 +87,7 @@ class SummaryInterfaceTest : public ::testing::Test {
   FakeClockEnv env_;
 };
 
-TEST_F(SummaryInterfaceTest, WriteTensor) {
+TEST_F(SummaryFileWriterTest, WriteTensor) {
   TF_CHECK_OK(SummaryTestHelper("tensor_test",
                                 [](SummaryWriterInterface* writer) {
                                   Tensor one(DT_FLOAT, TensorShape({}));
@@ -105,7 +105,7 @@ TEST_F(SummaryInterfaceTest, WriteTensor) {
                                 }));
 }
 
-TEST_F(SummaryInterfaceTest, WriteScalar) {
+TEST_F(SummaryFileWriterTest, WriteScalar) {
   TF_CHECK_OK(SummaryTestHelper(
       "scalar_test",
       [](SummaryWriterInterface* writer) {
@@ -123,7 +123,7 @@ TEST_F(SummaryInterfaceTest, WriteScalar) {
       }));
 }
 
-TEST_F(SummaryInterfaceTest, WriteHistogram) {
+TEST_F(SummaryFileWriterTest, WriteHistogram) {
   TF_CHECK_OK(SummaryTestHelper("hist_test",
                                 [](SummaryWriterInterface* writer) {
                                   Tensor one(DT_FLOAT, TensorShape({}));
@@ -141,7 +141,7 @@ TEST_F(SummaryInterfaceTest, WriteHistogram) {
                                 }));
 }
 
-TEST_F(SummaryInterfaceTest, WriteImage) {
+TEST_F(SummaryFileWriterTest, WriteImage) {
   TF_CHECK_OK(SummaryTestHelper(
       "image_test",
       [](SummaryWriterInterface* writer) {
@@ -162,7 +162,7 @@ TEST_F(SummaryInterfaceTest, WriteImage) {
       }));
 }
 
-TEST_F(SummaryInterfaceTest, WriteAudio) {
+TEST_F(SummaryFileWriterTest, WriteAudio) {
   TF_CHECK_OK(SummaryTestHelper(
       "audio_test",
       [](SummaryWriterInterface* writer) {
@@ -180,7 +180,7 @@ TEST_F(SummaryInterfaceTest, WriteAudio) {
       }));
 }
 
-TEST_F(SummaryInterfaceTest, WriteEvent) {
+TEST_F(SummaryFileWriterTest, WriteEvent) {
   TF_CHECK_OK(
       SummaryTestHelper("event_test",
                         [](SummaryWriterInterface* writer) {
@@ -198,7 +198,7 @@ TEST_F(SummaryInterfaceTest, WriteEvent) {
                         }));
 }
 
-TEST_F(SummaryInterfaceTest, WallTime) {
+TEST_F(SummaryFileWriterTest, WallTime) {
   env_.AdvanceByMillis(7023);
   TF_CHECK_OK(SummaryTestHelper(
       "wall_time_test",
diff --git a/tensorflow/contrib/tensorboard/db/vacuum.cc b/tensorflow/contrib/tensorboard/db/vacuum.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5febe63f0612046f96b89053811952e67d4c449b
--- /dev/null
+++ b/tensorflow/contrib/tensorboard/db/vacuum.cc
@@ -0,0 +1,137 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iostream>
+
+#include "tensorflow/core/lib/db/sqlite.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace tensorflow {
+namespace {
+
+void Vacuum(const char* path) {
+  LOG(INFO) << "Opening SQLite DB: " << path;
+  Sqlite* db;
+  TF_CHECK_OK(Sqlite::Open(path, SQLITE_OPEN_READWRITE, &db));
+  core::ScopedUnref db_unref(db);
+
+  // TODO(jart): Maybe defragment rowids on Tensors.
+  // TODO(jart): Maybe LIMIT deletes and incremental VACUUM.
+
+  // clang-format off
+
+  LOG(INFO) << "Deleting orphaned Experiments";
+  db->PrepareOrDie(R"sql(
+    DELETE FROM
+      Experiments
+    WHERE
+      user_id IS NOT NULL
+      AND user_id NOT IN (SELECT user_id FROM Users)
+  )sql").StepAndResetOrDie();
+
+  LOG(INFO) << "Deleting orphaned Runs";
+  db->PrepareOrDie(R"sql(
+    DELETE FROM
+      Runs
+    WHERE
+      experiment_id IS NOT NULL
+      AND experiment_id NOT IN (SELECT experiment_id FROM Experiments)
+  )sql").StepAndResetOrDie();
+
+  LOG(INFO) << "Deleting orphaned Tags";
+  db->PrepareOrDie(R"sql(
+    DELETE FROM
+      Tags
+    WHERE
+      run_id IS NOT NULL
+      AND run_id NOT IN (SELECT run_id FROM Runs)
+  )sql").StepAndResetOrDie();
+
+  // TODO(jart): What should we do if plugins define non-tag tensor series?
+  LOG(INFO) << "Deleting orphaned Tensors";
+  db->PrepareOrDie(R"sql(
+    DELETE FROM
+      Tensors
+    WHERE
+      series IS NOT NULL
+      AND series NOT IN (SELECT tag_id FROM Tags)
+  )sql").StepAndResetOrDie();
+
+  LOG(INFO) << "Deleting orphaned TensorStrings";
+  db->PrepareOrDie(R"sql(
+    DELETE FROM
+      TensorStrings
+    WHERE
+      tensor_rowid NOT IN (SELECT rowid FROM Tensors)
+  )sql").StepAndResetOrDie();
+
+  LOG(INFO) << "Deleting orphaned Graphs";
+  db->PrepareOrDie(R"sql(
+    DELETE FROM
+      Graphs
+    WHERE
+      run_id IS NOT NULL
+      AND run_id NOT IN (SELECT run_id FROM Runs)
+  )sql").StepAndResetOrDie();
+
+  LOG(INFO) << "Deleting orphaned Nodes";
+  db->PrepareOrDie(R"sql(
+    DELETE FROM
+      Nodes
+    WHERE
+      graph_id NOT IN (SELECT graph_id FROM Graphs)
+  )sql").StepAndResetOrDie();
+
+  LOG(INFO) << "Deleting orphaned NodeInputs";
+  db->PrepareOrDie(R"sql(
+    DELETE FROM
+      NodeInputs
+    WHERE
+      graph_id NOT IN (SELECT graph_id FROM Graphs)
+  )sql").StepAndResetOrDie();
+
+  LOG(INFO) << "Running VACUUM";
+  db->PrepareOrDie("VACUUM").StepAndResetOrDie();
+
+  // clang-format on
+}
+
+int main(int argc, char* argv[]) {
+  string usage = Flags::Usage(argv[0], {});
+  bool parse_result = Flags::Parse(&argc, argv, {});
+  if (!parse_result) {
+    std::cerr << "The vacuum tool rebuilds SQLite database files created by\n"
+              << "SummaryDbWriter, which makes them smaller.\n\n"
+              << "This means deleting orphaned rows and rebuilding b-tree\n"
+              << "pages so empty space from deleted rows is cleared. Any\n"
+              << "superfluous padding of Tensor BLOBs is also removed.\n\n"
+              << usage;
+    return -1;
+  }
+  port::InitMain(argv[0], &argc, &argv);
+  if (argc < 2 || argv[1][0] == '-') {
+    std::cerr << "Need at least one SQLite DB path.\n";
+    return -1;
+  }
+  for (int i = 1; i < argc; ++i) {
+    Vacuum(argv[i]);
+  }
+  return 0;
+}
+
+}  // namespace
+}  // namespace tensorflow
+
+int main(int argc, char* argv[]) { return tensorflow::main(argc, argv); }
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..cf67c27b70f1a8c761b71074d3eb5cd962a68488
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -0,0 +1,246 @@
+# Description:
+#   Wrap NVIDIA TensorRT (http://developer.nvidia.com/tensorrt) with tensorflow
+#   and provide TensorRT operators and converter package.
+#   APIs are meant to change over time.
+
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+    "tf_copts",
+    "tf_cuda_library",
+    "tf_custom_op_library",
+    "tf_custom_op_library_additional_deps",
+    "tf_gen_op_libs",
+    "tf_gen_op_wrapper_py",
+)
+load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
+load(
+    "@local_config_tensorrt//:build_defs.bzl",
+    "if_tensorrt",
+)
+
+tf_cuda_cc_test(
+    name = "tensorrt_test_cc",
+    size = "small",
+    srcs = ["tensorrt_test.cc"],
+    tags = [
+        "manual",
+        "notap",
+    ],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ] + if_tensorrt([
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_tensorrt//:nv_infer",
+    ]),
+)
+
+tf_custom_op_library(
+    name = "python/ops/_trt_engine_op.so",
+    srcs = ["ops/trt_engine_op.cc"],
+    deps = [
+        ":trt_engine_op_kernel",
+        ":trt_shape_function",
+        "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:nv_infer",
+    ]),
+)
+
+tf_cuda_library(
+    name = "trt_shape_function",
+    srcs = ["shape_fn/trt_shfn.cc"],
+    hdrs = ["shape_fn/trt_shfn.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":trt_logging",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:nv_infer",
+    ]) + tf_custom_op_library_additional_deps(),
+)
+
+cc_library(
+    name = "trt_engine_op_kernel",
+    srcs = ["kernels/trt_engine_op.cc"],
+    hdrs = ["kernels/trt_engine_op.h"],
+    copts = tf_copts(),
+    deps = [
+        ":trt_logging",
+        "//tensorflow/core:gpu_headers_lib",
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:stream_executor_headers_lib",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:nv_infer",
+    ]) + tf_custom_op_library_additional_deps(),
+    alwayslink = 1,
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["trt_engine_op"],
+    deps = if_tensorrt([
+        "@local_config_tensorrt//:nv_infer",
+    ]),
+)
+
+tf_cuda_library(
+    name = "trt_logging",
+    srcs = ["log/trt_logger.cc"],
+    hdrs = ["log/trt_logger.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:nv_infer",
+    ]),
+)
+
+tf_gen_op_wrapper_py(
+    name = "trt_engine_op",
+    deps = [
+        ":trt_engine_op_op_lib",
+        ":trt_logging",
+        ":trt_shape_function",
+    ],
+)
+
+tf_custom_op_py_library(
+    name = "trt_engine_op_loader",
+    srcs = ["python/ops/trt_engine_op.py"],
+    dso = [
+        ":python/ops/_trt_engine_op.so",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:nv_infer",
+    ]),
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:resources",
+    ],
+)
+
+py_library(
+    name = "init_py",
+    srcs = [
+        "__init__.py",
+        "python/__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":trt_convert_py",
+        ":trt_ops_py",
+    ],
+)
+
+py_library(
+    name = "trt_ops_py",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":trt_engine_op",
+        ":trt_engine_op_loader",
+    ],
+)
+
+py_library(
+    name = "trt_convert_py",
+    srcs = ["python/trt_convert.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":wrap_conversion",
+    ],
+)
+
+tf_py_wrap_cc(
+    name = "wrap_conversion",
+    srcs = ["trt_conversion.i"],
+    copts = tf_copts(),
+    deps = [
+        ":trt_conversion",
+        "//tensorflow/core:framework_lite",
+        "//util/python:python_headers",
+    ],
+)
+
+# Library for the node-level conversion portion of TensorRT operation creation
+tf_cuda_library(
+    name = "trt_conversion",
+    srcs = [
+        "convert/convert_graph.cc",
+        "convert/convert_nodes.cc",
+    ],
+    hdrs = [
+        "convert/convert_graph.h",
+        "convert/convert_nodes.h",
+    ],
+    deps = [
+        ":segment",
+        ":trt_logging",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:devices",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
+        "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/optimizers:constant_folding",
+        "//tensorflow/core/grappler/optimizers:layout_optimizer",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:nv_infer",
+    ]) + tf_custom_op_library_additional_deps(),
+)
+
+# Library for the segmenting portion of TensorRT operation creation
+cc_library(
+    name = "segment",
+    srcs = ["segment/segment.cc"],
+    hdrs = [
+        "segment/segment.h",
+        "segment/union_find.h",
+    ],
+    linkstatic = 1,
+    deps = [
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:protos_all_cc",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+)
+
+tf_cc_test(
+    name = "segment_test",
+    size = "small",
+    srcs = ["segment/segment_test.cc"],
+    deps = [
+        ":segment",
+        "//tensorflow/c:c_api",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/tensorrt/README.md b/tensorflow/contrib/tensorrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..dfcce0fd00eedf3341850bbc23927dc3b2e2d2aa
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/README.md
@@ -0,0 +1,40 @@
+Using TensorRT in TensorFlow
+============================
+
+This module provides necessary bindings and introduces TRT_engine_op
+operator that wraps a subgraph in TensorRT.
+
+Compilation
+-----------
+
+In order to compile the module, you need to have a local TensorRT
+installation (libnvinfer.so and respective include files). During the
+configuration step, TensorRT should be enabled and installation path
+should be set. If installed through package managers (deb,rpm),
+configure script should find the necessary components from the system
+automatically. If installed from tar packages, user has to set path to
+location where the library is installed during configuration.
+
+
+```
+bazel build --config=cuda --config=opt //tensorflow/tools/pip_package:build_pip_package
+bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/
+```
+
+After the installation of tensorflow package, TensorRT transformation
+will be available. An example use is shown below.
+
+```python
+import tensorflow as tf
+import tensorflow.contrib.tensorrt as trt
+#... create and train or load model
+gdef = sess.graph.as_graph_def()
+trt_gdef = trt.create_inference_graph(
+    gdef, #original graph_def
+    ["output"], #name of output node(s)
+    max_batch_size, #maximum batch size to run the inference
+    max_workspace_size_bytes) # max memory for TensorRT to use
+tf.reset_default_graph()
+tf.import_graph_def(graph_def=trt_gdef)
+#...... run inference
+```
diff --git a/tensorflow/contrib/tensorrt/__init__.py b/tensorflow/contrib/tensorrt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd551d70b4385b14b84b7b98a6d16b0c03733d38
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/__init__.py
@@ -0,0 +1,23 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Exposes the python wrapper for TensorRT graph transforms."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import,wildcard-import
+from tensorflow.contrib.tensorrt.python import *
+# pylint: enable=unused-import,wildcard-import
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
new file mode 100644
index 0000000000000000000000000000000000000000..899448004f917b36b35fb871a66a9d857736a338
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -0,0 +1,273 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
+
+#include <map>
+#include <set>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+#include "tensorflow/contrib/tensorrt/segment/segment.h"
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/devices.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+namespace convert {
+namespace {
+
+static bool IsTensorRTCandidate(const tensorflow::NodeDef& node_def) {
+  // LINT.IfChange
+  // TODO(jie): Segmentation shouldn't associated with op name.
+  //            Split it into a registration for each kernel.
+  static const std::set<string> candidate_ops = {
+      "Identity", "Const", "Conv2D", "MaxPool", "BiasAdd", "Relu",
+      "Add",      "Mul",   "Sub",    "Rsqrt",   "Pad"  // "Placeholder" ,"Mean"
+  };
+  // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.h)
+  return candidate_ops.count(node_def.op());
+}
+
+void GetSubGraphIncomingEdges(const tensorflow::Graph& graph,
+                              const std::set<int>& subgraph_node_ids,
+                              tensorflow::EdgeSet* incoming_edges) {
+  for (int node_id : subgraph_node_ids) {
+    const tensorflow::Node* node = graph.FindNodeId(node_id);
+    for (const tensorflow::Edge* edge : node->in_edges()) {
+      if (!subgraph_node_ids.count(edge->src()->id()) &&
+          !edge->src()->IsSource()) {
+        incoming_edges->insert(edge);
+      }
+    }
+  }
+}
+
+void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph,
+                              const std::set<int>& subgraph_node_ids,
+                              tensorflow::EdgeSet* outgoing_edges) {
+  for (int node_id : subgraph_node_ids) {
+    const tensorflow::Node* node = graph.FindNodeId(node_id);
+    for (const tensorflow::Edge* edge : node->out_edges()) {
+      if (!subgraph_node_ids.count(edge->dst()->id()) &&
+          !edge->dst()->IsSink()) {
+        outgoing_edges->insert(edge);
+      }
+    }
+  }
+}
+
+std::pair<string, int> ParseTensorName(string name, int default_idx = 0) {
+  int idx = default_idx;
+  size_t sep = name.find_last_of(':');
+  if (sep != string::npos) {
+    name = name.substr(0, sep);
+    idx = std::stoi(name.substr(sep + 1));
+  }
+  return std::make_pair(name, idx);
+}
+
+std::unordered_map<string, std::vector<int>> BuildTensorNameMap(
+    const std::vector<string>& tensor_names) {
+  std::unordered_map<string, std::vector<int>> result;
+  for (string const& tensor_name : tensor_names) {
+    string node_name;
+    int index;
+    std::tie(node_name, index) = ParseTensorName(tensor_name);
+    result[node_name].push_back(index);
+  }
+  return result;
+}
+
+tensorflow::Status ConvertSubGraphToTensorRT(
+    const std::vector<string>& output_names,
+    const std::set<int>& subgraph_node_ids,
+    size_t max_batch_size,  // Max batch size that engine will be created for
+    // Max amount of memory that engine will be allowed to consume, in bytes
+    size_t max_workspace_size_bytes,
+    const tensorflow::grappler::GraphProperties& graph_properties,
+    tensorflow::Graph* graph) {
+  tensorflow::EdgeSet subgraph_incoming_edges;
+  GetSubGraphIncomingEdges(*graph, subgraph_node_ids, &subgraph_incoming_edges);
+
+  std::vector<std::pair<int, int>> subgraph_inputs;
+
+  // Collect inputs by looking for incoming edges
+  for (const tensorflow::Edge* edge : subgraph_incoming_edges) {
+    subgraph_inputs.push_back({edge->src()->id(), edge->src_output()});
+  }
+  std::set<std::pair<int, int>> subgraph_outputs_set;
+  // Collect outputs referenced from output_names
+  auto output_name_to_index_map = BuildTensorNameMap(output_names);
+  for (int node_id : subgraph_node_ids) {
+    tensorflow::Node* node = graph->FindNodeId(node_id);
+    if (output_name_to_index_map.count(node->name())) {
+      for (int index : output_name_to_index_map.at(node->name())) {
+        subgraph_outputs_set.insert({node_id, index});
+      }
+    }
+  }
+  // Collect outputs referenced from outgoing edges
+  tensorflow::EdgeSet subgraph_outgoing_edges;
+  GetSubGraphOutgoingEdges(*graph, subgraph_node_ids, &subgraph_outgoing_edges);
+  for (const tensorflow::Edge* edge : subgraph_outgoing_edges) {
+    subgraph_outputs_set.insert({edge->src()->id(), edge->src_output()});
+  }
+  // Impose an ordering on the outputs
+  std::vector<std::pair<int, int>> subgraph_outputs(
+      subgraph_outputs_set.begin(), subgraph_outputs_set.end());
+  // Build TensorRT node and add it to the graph
+  tensorflow::NodeDef trt_node_def;
+  TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRTNodeDef(
+      *graph, subgraph_node_ids, subgraph_inputs, subgraph_outputs,
+      max_batch_size, max_workspace_size_bytes, graph_properties,
+      &trt_node_def));
+  tensorflow::Status status;
+  tensorflow::Node* trt_node = graph->AddNode(trt_node_def, &status);
+  TF_RETURN_IF_ERROR(status);
+
+  // Re-map outgoing edges to use the new TRT node instead of the orig subgraph
+  std::map<std::pair<int, int>, int> subgraph_edge_to_output_map;
+  for (size_t i = 0; i < subgraph_outputs.size(); ++i) {
+    subgraph_edge_to_output_map.insert({subgraph_outputs.at(i), i});
+  }
+  TF_RETURN_IF_ERROR(status);
+  for (const tensorflow::Edge* edge : subgraph_outgoing_edges) {
+    std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()};
+    int new_src_output = subgraph_edge_to_output_map.at(old_src);
+    TF_RETURN_IF_ERROR(graph->UpdateEdge(trt_node, new_src_output, edge->dst(),
+                                         edge->dst_input()));
+  }
+  // Remove the original subgraph
+  for (int node_id : subgraph_node_ids) {
+    tensorflow::Node* node = graph->FindNodeId(node_id);
+    // Don't remove the input placeholders
+    if (node->type_string() == "Placeholder") {
+      continue;
+    }
+    graph->RemoveNode(node);
+  }
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status BuildNodeMap(
+    const tensorflow::Graph& graph,
+    std::unordered_map<string, tensorflow::Node*>* node_map) {
+  for (auto* node : graph.op_nodes()) {
+    if (!node_map->insert({node->name(), node}).second) {
+      return tensorflow::errors::AlreadyExists(
+          "Node name is not unique in graph: " + node->name());
+    }
+  }
+  return tensorflow::Status::OK();
+}
+
+}  // namespace
+
+tensorflow::Status ConvertGraphDefToTensorRT(
+    const tensorflow::GraphDef& graph_def,
+    const std::vector<string>& output_names, size_t max_batch_size,
+    size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def) {
+  // Optimization pass
+  tensorflow::grappler::GrapplerItem item;
+  item.fetch = output_names;
+  tensorflow::GraphDef gdef;
+
+  // Layout optimization
+  item.graph = graph_def;
+  tensorflow::grappler::LayoutOptimizer optimizer;
+  tensorflow::grappler::Cluster* cluster;
+
+  // Virtual cluster
+  tensorflow::DeviceProperties device_properties;
+  device_properties.set_type("GPU");
+  device_properties.mutable_environment()->insert({"architecture", "6"});
+  cluster =
+      new tensorflow::grappler::VirtualCluster({{"/GPU:0", device_properties}});
+
+  TF_RETURN_IF_ERROR(optimizer.Optimize(cluster, item, &gdef));
+
+  // Constant folding
+  item.graph = gdef;
+  tensorflow::grappler::ConstantFolding fold(nullptr);
+  TF_RETURN_IF_ERROR(fold.Optimize(nullptr, item, &gdef));
+
+  // AJ refactoring shape inference through grappler/GraphProperties.
+  tensorflow::grappler::GraphProperties static_graph_properties(item);
+  TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(false));
+
+  // Build full graph
+  tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
+                                             gdef.library());
+  tensorflow::Graph graph(flib);
+  TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
+      tensorflow::GraphConstructorOptions(), gdef, &graph));
+
+  // Segment the graph into subgraphs that can be converted to TensorRT
+  tensorflow::tensorrt::segment::SegmentOptions segment_options;
+
+  // TODO(ben,jie,sami): exclude output nodes (DISCUSS IT)
+  for (auto node : output_names) {
+    segment_options.exclude_node_list.insert(node);
+  }
+
+  // TODO(sami): this should be passed as a knob!!!!
+  segment_options.minimum_segment_size = 2;
+  tensorflow::tensorrt::segment::SegmentNodesVector segments;
+  TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph(
+      gdef, IsTensorRTCandidate, segment_options, &segments));
+  if (segments.size() > 1) {
+    VLOG(0) << "MULTIPLE tensorrt candidate conversion: " << segments.size();
+  }
+  std::unordered_map<string, tensorflow::Node*> node_map;
+  TF_RETURN_IF_ERROR(BuildNodeMap(graph, &node_map));
+  for (const std::set<string>& subgraph_node_names : segments) {
+    std::set<int> subgraph_node_ids;
+    for (const string& node_name : subgraph_node_names) {
+      subgraph_node_ids.insert(node_map.at(node_name)->id());
+    }
+    TF_RETURN_IF_ERROR(ConvertSubGraphToTensorRT(
+        output_names, subgraph_node_ids, max_batch_size,
+        max_workspace_size_bytes, static_graph_properties, &graph));
+  }
+  graph.ToGraphDef(new_graph_def);
+  return tensorflow::Status::OK();
+}
+
+}  // namespace convert
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/contrib/tensorrt/convert/convert_graph.h
new file mode 100644
index 0000000000000000000000000000000000000000..154ad3f2e8fb0ae702448097fbdece510df30223
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.h
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_
+
+#include <vector>
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+namespace convert {
+
+// max_batch_size: maximum batch size which can be used for inference for
+//                 optimization targets inference run with max batch size.
+// max_workspace_size_bytes: The upper bound of memory allowence for
+//                 engine building.
+tensorflow::Status ConvertGraphDefToTensorRT(
+    const tensorflow::GraphDef& graph_def,
+    const std::vector<string>& output_names, size_t max_batch_size,
+    size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def);
+
+}  // namespace convert
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9ee717dd7fb1eff4a11fb104cf5806ec8ab853d2
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -0,0 +1,1601 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+
+#include <algorithm>
+#include <list>
+#include <map>
+#include <memory>
+#include <set>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"  // NOLINT
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/tensor_coding.h"
+#include "tensorflow/core/platform/types.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorrt/include/NvInfer.h"
+
+//  Check if the types are equal. Cast to int first so that failure log message
+//  would work!
+#define CHECK_EQ_TYPE(val1, val2) CHECK_EQ((int)val1, (int)val2)
+
+namespace tensorflow {
+namespace tensorrt {
+namespace convert {
+
+namespace {
+
+inline tensorflow::Status ConvertDType(tensorflow::DataType tf_dtype,
+                                       nvinfer1::DataType* trt_dtype) {
+  switch (tf_dtype) {
+    case tensorflow::DataType::DT_FLOAT:
+      *trt_dtype = nvinfer1::DataType::kFLOAT;
+      break;
+    case tensorflow::DataType::DT_INT8:
+      *trt_dtype = nvinfer1::DataType::kINT8;
+      break;
+    case tensorflow::DataType::DT_HALF:
+      *trt_dtype = nvinfer1::DataType::kHALF;
+      break;
+    default:
+      return tensorflow::errors::InvalidArgument("Unsupported data type");
+  }
+  return tensorflow::Status::OK();
+}
+
+inline nvinfer1::Dims GetTensorShape(const tensorflow::Tensor& tensor) {
+  nvinfer1::Dims dims;
+  dims.nbDims = tensor.dims();
+  for (int i = 0; i < dims.nbDims; i++) {
+    dims.d[i] = tensor.dim_size(i);
+  }
+  return dims;
+}
+
+inline int64_t GetShapeSize(nvinfer1::Dims shape) {
+  // Returns total number of elements in shape
+  int64_t count = 1;
+  for (int d = 0; d < shape.nbDims; ++d) {
+    count *= shape.d[d];
+  }
+  return count;
+}
+
+static std::vector<std::pair<int, int>> CreateSamePadding(
+    const nvinfer1::DimsHW& stride, const nvinfer1::DimsHW& kernel,
+    const std::vector<int64_t>& input_dims) {
+  std::vector<std::pair<int, int>> padding(input_dims.size());
+  CHECK_EQ((size_t)stride.nbDims, input_dims.size());  // TODO(jie): N+C? NC+?
+
+  for (size_t i = 0; i < input_dims.size(); ++i) {
+    // Formula to calculate the padding
+    int p = ((input_dims[i] - 1) / stride.d[i]) * stride.d[i] + kernel.d[i] -
+            input_dims[i];
+    p = (p > 0) ? p : 0;
+
+    // Right precedence padding, like in TensorFlow
+    int left = p / 2;
+    int right = p - left;
+
+    VLOG(2) << "PADDING_" << i << " pre: " << left << ", post: " << right
+            << "paras: " << input_dims[i] << ", " << stride.d[i] << ", "
+            << "kernel: " << kernel.d[i];
+    padding[i] = {left, right};
+  }
+  return padding;
+}
+
+class TRT_ShapedWeights {
+ public:
+  TRT_ShapedWeights(tensorflow::DataType type, const void* values,
+                    nvinfer1::Dims shape)
+      : shape_(shape), type_(type), values_(values), empty_weight_flag_(false) {
+    // Note: this->shape.type[] is not used
+  }
+
+  explicit TRT_ShapedWeights(tensorflow::DataType type)
+      : shape_(), type_(type), values_(nullptr), empty_weight_flag_(true) {}
+
+  TRT_ShapedWeights(const TRT_ShapedWeights& rhs)
+      : shape_(rhs.shape_),
+        type_(rhs.type_),
+        values_(rhs.values_),
+        empty_weight_flag_(rhs.empty_weight_flag_) {}
+
+  int64_t count() const {
+    int64_t c = 1;
+    for (int i = 0; i < shape_.nbDims; i++) c *= shape_.d[i];
+    return c;
+  }
+
+  nvinfer1::Weights GetWeightsForTRT() const {
+    nvinfer1::DataType trt_type(nvinfer1::DataType::kFLOAT);
+    TF_CHECK_OK(ConvertDType(type_, &trt_type));
+    if (empty_weight_flag_) return nvinfer1::Weights{trt_type, nullptr, 0};
+
+    // Note: this->shape.type[] is not used
+    return nvinfer1::Weights{trt_type, GetValues(), GetShapeSize(shape_)};
+  }
+
+  const void* GetValues() const { return values_; }
+
+  void SetValues(const void* values) { values_ = values; }
+
+  size_t size_bytes() const {
+    int type_size = tensorflow::DataTypeSize(this->type_);
+    return this->count() * type_size;
+  }
+
+  // Default converter
+  operator nvinfer1::Weights() const { return GetWeightsForTRT(); }
+
+  nvinfer1::Dims shape_;
+  tensorflow::DataType type_;
+
+ private:
+  const void* values_;
+  bool empty_weight_flag_;
+};
+
+class TRT_TensorOrWeights {
+ public:
+  explicit TRT_TensorOrWeights(nvinfer1::ITensor* tensor)
+      : tensor_(tensor), weights_(DT_FLOAT), variant_(TRT_NODE_TENSOR) {}
+  explicit TRT_TensorOrWeights(const TRT_ShapedWeights& weights)
+      : tensor_(nullptr), weights_(weights), variant_(TRT_NODE_WEIGHTS) {}
+  TRT_TensorOrWeights(const TRT_TensorOrWeights& rhs)
+      : tensor_(rhs.tensor_), weights_(rhs.weights_), variant_(rhs.variant_) {}
+  ~TRT_TensorOrWeights() {}
+
+  bool is_tensor() const { return variant_ == TRT_NODE_TENSOR; }
+  bool is_weights() const { return variant_ == TRT_NODE_WEIGHTS; }
+
+  nvinfer1::ITensor* tensor() {
+    CHECK_EQ(is_tensor(), true);
+    return tensor_;
+  }
+  const nvinfer1::ITensor* tensor() const {
+    CHECK_EQ(is_tensor(), true);
+    return tensor_;
+  }
+  TRT_ShapedWeights& weights() {
+    CHECK_EQ(is_weights(), true);
+    return weights_;
+  }
+  const TRT_ShapedWeights& weights() const {
+    CHECK_EQ(is_weights(), true);
+    return weights_;
+  }
+  nvinfer1::Dims shape() const {
+    if (is_tensor()) {
+      return tensor()->getDimensions();
+    } else {
+      return weights().shape_;
+    }
+  }
+
+ private:
+  nvinfer1::ITensor* tensor_;
+  TRT_ShapedWeights weights_;
+  enum { TRT_NODE_TENSOR, TRT_NODE_WEIGHTS } variant_;
+};
+
+class TFAttrs {
+ public:
+  explicit TFAttrs(const tensorflow::NodeDef& tf_node) {
+    for (const auto& attr : tf_node.attr()) {
+      attrs_.insert({attr.first, &attr.second});
+    }
+  }
+  bool count(string key) const { return attrs_.count(key); }
+  tensorflow::AttrValue const* at(string key) const {
+    if (!attrs_.count(key)) {
+      LOG(FATAL) << "Attribute not found: " << key;
+    }
+    return attrs_.at(key);
+  }
+  template <typename T>
+  T get(string key) const;
+  template <typename T>
+  T get(string key, const T& default_value) const {
+    return attrs_.count(key) ? this->get<T>(key) : default_value;
+  }
+
+ private:
+  typedef std::map<string, tensorflow::AttrValue const*> AttrMap;
+  AttrMap attrs_;
+};
+
+template <>
+string TFAttrs::get<string>(string key) const {
+  return this->at(key)->s();
+}
+
+template <>
+std::vector<int> TFAttrs::get<std::vector<int>>(string key) const {
+  auto attr = this->at(key)->list().i();
+  return std::vector<int>(attr.begin(), attr.end());
+}
+
+template <>
+nvinfer1::Dims TFAttrs::get<nvinfer1::Dims>(string key) const {
+  auto values = this->get<std::vector<int>>(key);
+  nvinfer1::Dims dims;
+  dims.nbDims = values.size();
+  std::copy(values.begin(), values.end(), dims.d);
+  // Note: No dimension type information is included
+  return dims;
+}
+
+template <>
+nvinfer1::DataType TFAttrs::get<nvinfer1::DataType>(string key) const {
+  nvinfer1::DataType trt_dtype(nvinfer1::DataType::kFLOAT);
+  TF_CHECK_OK(ConvertDType(this->at(key)->type(), &trt_dtype));
+  return trt_dtype;
+}
+
+template <>
+tensorflow::DataType TFAttrs::get<tensorflow::DataType>(string key) const {
+  return this->at(key)->type();
+}
+
+template <typename T>
+void Reorder4(nvinfer1::DimsNCHW shape, const T* idata,
+              nvinfer1::DimsNCHW istrides, T* odata,
+              nvinfer1::DimsNCHW ostrides) {
+  for (int n = 0; n < shape.n(); ++n) {
+    for (int c = 0; c < shape.c(); ++c) {
+      for (int h = 0; h < shape.h(); ++h) {
+        for (int w = 0; w < shape.w(); ++w) {
+          odata[n * ostrides.n() + c * ostrides.c() + h * ostrides.h() +
+                w * ostrides.w()] = idata[n * istrides.n() + c * istrides.c() +
+                                          h * istrides.h() + w * istrides.w()];
+        }
+      }
+    }
+  }
+}
+
+void ReorderRSCKToKCRS(const TRT_ShapedWeights& iweights,
+                       TRT_ShapedWeights* oweights) {
+  CHECK_EQ(iweights.type_, oweights->type_);
+  CHECK_EQ(iweights.size_bytes(), oweights->size_bytes());
+  int r = iweights.shape_.d[0];
+  int s = iweights.shape_.d[1];
+  int c = iweights.shape_.d[2];
+  int k = iweights.shape_.d[3];
+  oweights->shape_.d[0] = k;
+  oweights->shape_.d[1] = c;
+  oweights->shape_.d[2] = r;
+  oweights->shape_.d[3] = s;
+  nvinfer1::DimsNCHW istrides = {1, k, s * k * c, c * k};
+  nvinfer1::DimsNCHW ostrides = {c * r * s, r * s, s, 1};
+  switch (iweights.type_) {
+    case tensorflow::DataType::DT_FLOAT:
+      Reorder4({k, c, r, s}, static_cast<float const*>(iweights.GetValues()),
+               istrides,
+               static_cast<float*>(const_cast<void*>(oweights->GetValues())),
+               ostrides);
+      break;
+    default:
+      LOG(FATAL) << "!!!!!!!!!!!!!!!!!!!!!!!!broke!!!!!!!!!!!!";
+  }
+}
+
+struct InferDeleter {
+  template <typename T>
+  void operator()(T* obj) const {
+    if (obj) {
+      obj->destroy();
+    }
+  }
+};
+
+template <typename T>
+inline std::shared_ptr<T> infer_object(T* obj) {
+  return std::shared_ptr<T>(obj, InferDeleter());
+}
+
+// Logger for GIE info/warning/errors
+class Converter;
+
+using OpConverter =
+    std::function<tensorflow::Status(Converter&, const tensorflow::NodeDef&,
+                                     std::vector<TRT_TensorOrWeights> const&,
+                                     std::vector<TRT_TensorOrWeights>*)>;
+
+class Converter {
+  std::unordered_map<string, TRT_TensorOrWeights> trt_tensors_;
+  std::unordered_map<string, OpConverter> op_registry_;
+  nvinfer1::INetworkDefinition* trt_network_;
+  std::list<std::vector<uint8_t>> temp_bufs_;
+
+  void register_op_converters();
+
+  std::vector<TRT_TensorOrWeights> get_inputs(
+      const tensorflow::NodeDef& node_def) {
+    std::vector<TRT_TensorOrWeights> inputs;
+    for (const auto& input_name : node_def.input()) {
+      VLOG(2) << "Retrieve input: " << input_name;
+      inputs.push_back(trt_tensors_.at(input_name));
+    }
+    return inputs;
+  }
+
+ public:
+  explicit Converter(nvinfer1::INetworkDefinition* trt_network)
+      : trt_network_(trt_network) {
+    this->register_op_converters();
+  }
+
+  TRT_ShapedWeights get_temp_weights(tensorflow::DataType type,
+                                     nvinfer1::Dims shape) {
+    TRT_ShapedWeights weights(type, nullptr, shape);
+    // TODO(jie): check weights size_bytes. 0 means type error
+    temp_bufs_.push_back(std::vector<uint8_t>(weights.size_bytes()));
+    weights.SetValues(temp_bufs_.back().data());
+    return weights;
+  }
+
+  TRT_ShapedWeights get_temp_weights_like(const TRT_ShapedWeights& weights) {
+    return this->get_temp_weights(weights.type_, weights.shape_);
+  }
+
+  tensorflow::Status convert_node(const tensorflow::NodeDef& node_def) {
+    std::vector<TRT_TensorOrWeights> inputs = this->get_inputs(node_def);
+    string op = node_def.op();
+    if (!op_registry_.count(op)) {
+      return tensorflow::errors::Unimplemented(
+          "No converter registered for op: " + op);
+    }
+    OpConverter op_converter = op_registry_.at(op);
+    std::vector<TRT_TensorOrWeights> outputs;
+    TF_RETURN_IF_ERROR(op_converter(*this, node_def, inputs, &outputs));
+    for (size_t i = 0; i < outputs.size(); ++i) {
+      TRT_TensorOrWeights output = outputs.at(i);
+      // TODO(jie): tf protobuf seems to be omitting the :0 suffix
+      string output_name = node_def.name();
+      if (i != 0) output_name = output_name + ":" + std::to_string(i);
+      if (output.is_tensor()) {
+        output.tensor()->setName(output_name.c_str());
+      }
+      VLOG(2) << "Write out tensor: " << output_name;
+      if (!trt_tensors_.insert({output_name, output}).second) {
+        return tensorflow::errors::AlreadyExists(
+            "Output tensor already exists for op: " + op);
+      }
+    }
+    return tensorflow::Status::OK();
+  }
+
+  nvinfer1::INetworkDefinition* network() { return trt_network_; }
+
+  TRT_TensorOrWeights get_tensor(string name) {
+    if (!trt_tensors_.count(name)) {
+      return TRT_TensorOrWeights(nullptr);
+    }
+    return trt_tensors_.at(name);
+  }
+
+  bool insert_input_tensor(string name, nvinfer1::ITensor* tensor) {
+    return trt_tensors_.insert({name, TRT_TensorOrWeights(tensor)}).second;
+  }
+
+  nvinfer1::ITensor* TransposeTensor(nvinfer1::ITensor* input_tensor,
+                                     std::vector<int> order) {
+    auto dims = input_tensor->getDimensions();
+
+    // TODO(jie): change the return to status and properly exit
+    if (order.size() - 1 != size_t(dims.nbDims))
+      LOG(ERROR) << "Dimension does not match, fail gracefully";
+
+    nvinfer1::IShuffleLayer* layer = this->network()->addShuffle(*input_tensor);
+    nvinfer1::Permutation permutation;
+    for (int32_t i = 0; i < dims.nbDims; ++i) {
+      permutation.order[i] = order[i + 1] - 1;
+    }
+    layer->setFirstTranspose(permutation);
+
+    nvinfer1::Dims reshape_dims;
+    reshape_dims.nbDims = dims.nbDims;
+    for (int32_t i = 0; i < reshape_dims.nbDims; ++i) {
+      reshape_dims.d[i] = 0;
+      reshape_dims.type[i] = dims.type[i];
+    }
+    layer->setReshapeDimensions(reshape_dims);
+    return layer->getOutput(0);
+  }
+};
+
+// ****************************************************************************
+// Constant folding functions
+// TODO(jie): once optimizer kicks in, we should have done constant folding
+// there.
+//*****************************************************************************/
+struct LambdaFactory {
+  enum class OP_CATEGORY : int { RSQRT = 0, NEG, ADD, MUL, SUB };
+  OP_CATEGORY op;
+
+  template <typename T>
+  std::function<T(T)> unary() {
+    switch (op) {
+      case OP_CATEGORY::RSQRT: {
+        VLOG(2) << "RSQRT GETS DONE";
+        return [](T t) -> T { return 1.0 / std::sqrt(t); };
+      }
+      case OP_CATEGORY::NEG:
+        return [](T t) -> T { return -t; };
+      default:
+        VLOG(2) << "Not supported op for unary: " << static_cast<int>(op);
+        return nullptr;
+    }
+  }
+
+  template <typename T>
+  std::function<T(T, T)> binary() {
+    switch (op) {
+      case OP_CATEGORY::ADD:
+        return [](T l, T r) -> T { return l + r; };
+      case OP_CATEGORY::SUB:
+        return [](T l, T r) -> T { return l - r; };
+      case OP_CATEGORY::MUL:
+        return [](T l, T r) -> T { return l * r; };
+      default:
+        LOG(WARNING) << "Not supported op for binary: " << static_cast<int>(op);
+    }
+    return [](T l, T r) -> T {
+      LOG(FATAL) << "Unsupported op type ";
+      return l;
+    };
+  }
+
+  template <typename T>
+  std::function<T(T)> broadcast_r(T val) {
+    VLOG(2) << "LAMBDA VAL : " << val;
+    switch (op) {
+      case OP_CATEGORY::ADD:
+        return [val](T l) -> T {
+          VLOG(2) << "LAMBDA VAL : " << val;
+          return l + val;
+        };
+      // Return [val](T l)-> T {return l+val;};
+      case OP_CATEGORY::SUB:
+        return [val](T l) -> T {
+          VLOG(2) << "LAMBDA VAL : " << val;
+          return l - val;
+        };
+      case OP_CATEGORY::MUL:
+        return [val](T l) -> T {
+          VLOG(2) << "LAMBDA VAL : " << val;
+          return l * val;
+        };
+      default:
+        LOG(WARNING) << "Not supported op for binary: " << static_cast<int>(op);
+    }
+    return [val](T l) -> T {
+      LOG(FATAL) << "Unsupported op type ";
+      return l;
+    };
+  }
+
+  template <typename T>
+  std::function<T(T)> broadcast_l(T val) {
+    VLOG(2) << "LAMBDA VAL : " << val;
+    switch (op) {
+      case OP_CATEGORY::ADD:
+        return [val](T l) -> T {
+          VLOG(2) << "LAMBDA VAL : " << val;
+          return val + l;
+        };
+      case OP_CATEGORY::SUB:
+        return [val](T l) -> T {
+          VLOG(2) << "LAMBDA VAL : " << val;
+          return val - l;
+        };
+      case OP_CATEGORY::MUL:
+        return [val](T l) -> T {
+          VLOG(2) << "LAMBDA VAL : " << val;
+          return val * l;
+        };
+      default:
+        LOG(ERROR) << "Not supported op for binary: " << static_cast<int>(op);
+    }
+    return [val](T l) -> T {
+      LOG(FATAL) << "Unsupported op type ";
+      return l;
+    };
+  }
+};
+
+tensorflow::Status UnaryCompute(const TRT_ShapedWeights& iweights,
+                                TRT_ShapedWeights* oweights,
+                                LambdaFactory unary_op) {
+  CHECK_EQ(iweights.type_, oweights->type_);
+  switch (iweights.type_) {
+    case tensorflow::DataType::DT_FLOAT: {
+      auto inp = static_cast<float const*>(iweights.GetValues());
+      auto oup = static_cast<float*>(const_cast<void*>(oweights->GetValues()));
+      std::transform(inp, inp + iweights.count(), oup, unary_op.unary<float>());
+      break;
+    }
+    default:
+      return tensorflow::errors::Unimplemented(
+          "Data type not supported: " +
+          tensorflow::DataTypeString(iweights.type_));
+  }
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status BinaryCompute(const TRT_ShapedWeights& iweights_l,
+                                 const TRT_ShapedWeights& iweights_r,
+                                 TRT_ShapedWeights* oweights,
+                                 LambdaFactory binary_op) {
+  // Assume iweights_l.type == iweight_r.type
+  CHECK_EQ(iweights_l.type_, oweights->type_);
+  CHECK_EQ(iweights_r.type_, oweights->type_);
+  VLOG(2) << "SANITY CHECK!";
+
+  switch (iweights_l.type_) {
+    case tensorflow::DataType::DT_FLOAT: {
+      auto inp_l = static_cast<const float*>(iweights_l.GetValues());
+      auto inp_r = static_cast<const float*>(iweights_r.GetValues());
+      auto oup = static_cast<float*>(const_cast<void*>(oweights->GetValues()));
+
+      if (iweights_l.count() != iweights_r.count()) {
+        // We only supports broadcast of RankZero
+        if (iweights_l.count() == 1) {
+          VLOG(2) << "I bet it is not working!" << (*inp_l);
+          std::transform(inp_r, inp_r + iweights_r.count(), oup,
+                         binary_op.broadcast_l<float>(*inp_l));
+        } else if (iweights_r.count() == 1) {
+          VLOG(2) << "I bet it is not working!" << (*inp_r);
+          std::transform(inp_l, inp_l + iweights_l.count(), oup,
+                         binary_op.broadcast_r<float>(*inp_r));
+        } else {
+          return tensorflow::errors::Unimplemented(
+              "Binary op with non-rankZero broadcast not supported");
+        }
+      } else {
+        std::transform(inp_l, inp_l + iweights_l.count(), inp_r, oup,
+                       binary_op.binary<float>());
+      }
+      break;
+    }
+    default:
+      return tensorflow::errors::Unimplemented(
+          "Data type not supported: " +
+          tensorflow::DataTypeString(iweights_l.type_));
+  }
+
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConstantFoldUnary(
+    Converter& ctx, const tensorflow::NodeDef& node_def,
+    std::vector<TRT_TensorOrWeights> const& inputs,
+    std::vector<TRT_TensorOrWeights>* outputs) {
+  TRT_ShapedWeights weights_input = inputs.at(0).weights();
+
+  // Allocate output weights
+  TRT_ShapedWeights weights_output = ctx.get_temp_weights_like(weights_input);
+
+  // FIXME assume type matches input weights
+  // Get trt type & shape
+  // Maybe this part has to be moved into the block of rsqrt later
+  // Check type consistency
+  CHECK_EQ(weights_input.type_,
+           TFAttrs(node_def).get<tensorflow::DataType>("T"));
+
+  // Maybe I should do a switch
+  LambdaFactory unary_op;
+  if (node_def.op() == "Rsqrt") {
+    // Compute rsqrt
+    unary_op.op = LambdaFactory::OP_CATEGORY::RSQRT;
+    auto ret = UnaryCompute(weights_input, &weights_output, unary_op);
+    // PAss the output
+    if (ret == tensorflow::Status::OK()) {
+      outputs->push_back(TRT_TensorOrWeights(weights_output));
+    }
+    return ret;
+  } else {
+    return tensorflow::errors::Unimplemented("Binary op not supported: " +
+                                             node_def.op());
+  }
+}
+
+// TODO(jie,ben) broadcast is needed yet not implemented
+// Let's get the simple stuff working first. Maybe we should fall bakc to TF
+//   approach for constant folding
+tensorflow::Status ConstantFoldBinary(
+    Converter& ctx, const tensorflow::NodeDef& node_def,
+    std::vector<TRT_TensorOrWeights> const& inputs,
+    std::vector<TRT_TensorOrWeights>* outputs) {
+  TRT_ShapedWeights weights_input_l = inputs.at(0).weights();
+  TRT_ShapedWeights weights_input_r = inputs.at(1).weights();
+
+  // Check type consistency
+  CHECK_EQ(weights_input_l.type_, weights_input_r.type_);
+
+  if (weights_input_l.shape_.nbDims != weights_input_r.shape_.nbDims)
+    return tensorflow::errors::Unimplemented(
+        "Binary op implicit broadcast not supported: " + node_def.op());
+
+  // TODO(jie): constant fold should really fall back to TF.
+  int nb_dims = weights_input_l.shape_.nbDims;
+  nvinfer1::Dims output_shape;
+  output_shape.nbDims = nb_dims;
+  VLOG(2) << "nb_dims: " << nb_dims
+          << ", the other: " << weights_input_r.shape_.nbDims;
+  for (int i = 0; i < nb_dims; i++) {
+    if (weights_input_l.shape_.d[i] == weights_input_r.shape_.d[i]) {
+      output_shape.d[i] = weights_input_l.shape_.d[i];
+    } else if (weights_input_l.shape_.d[i] == 1 ||
+               weights_input_r.shape_.d[i] == 1) {
+      output_shape.d[i] =
+          std::max(weights_input_l.shape_.d[i], weights_input_r.shape_.d[i]);
+    } else {
+      return tensorflow::errors::Unimplemented(
+          "Binary op with incompatible shape at, " + node_def.op());
+    }
+    VLOG(2) << "left: " << weights_input_l.shape_.d[i]
+            << "right: " << weights_input_r.shape_.d[i]
+            << "output: " << output_shape.d[i];
+  }
+
+  // FIXME assume type matches input weights
+  // Get trt type & shape
+  TFAttrs attrs(node_def);
+  // Maybe this part has to be moved into the block of rsqrt later
+  tensorflow::DataType dtype = attrs.get<tensorflow::DataType>("T");
+
+  // Allocate output weights
+  TRT_ShapedWeights weights_output = ctx.get_temp_weights(dtype, output_shape);
+
+  // Maybe I should do a switch
+  LambdaFactory binary_op;
+  if (node_def.op() == "Sub") {
+    binary_op.op = LambdaFactory::OP_CATEGORY::SUB;
+  } else if (node_def.op() == "Mul") {
+    binary_op.op = LambdaFactory::OP_CATEGORY::MUL;
+  } else if (node_def.op() == "Add") {
+    binary_op.op = LambdaFactory::OP_CATEGORY::ADD;
+  } else {
+    return tensorflow::errors::Unimplemented("Binary op not supported: " +
+                                             node_def.op());
+  }
+  auto ret = BinaryCompute(weights_input_l, weights_input_r, &weights_output,
+                           binary_op);
+
+  // Pass the output
+  if (ret == tensorflow::Status::OK()) {
+    outputs->push_back(TRT_TensorOrWeights(weights_output));
+  }
+
+  return ret;
+}
+
+// TODO(jie): broadcast is needed yet not implemented.
+// Only implemented channel wise for the time being
+tensorflow::Status BinaryTensorOpWeight(
+    Converter& ctx, const tensorflow::NodeDef& node_def,
+    const nvinfer1::ITensor* tensor, TRT_ShapedWeights weights,
+    std::vector<TRT_TensorOrWeights>* outputs) {
+  // FIXME assume type matches input weights
+  // Get trt type & shape
+  // Maybe this part has to be moved into the block of rsqrt later
+
+  // Check type consistency
+  auto dtype = TFAttrs(node_def).get<nvinfer1::DataType>("T");
+  CHECK_EQ_TYPE(tensor->getType(), dtype);  // Cast to int for error messages
+  nvinfer1::DataType ttype;
+  TF_CHECK_OK(ConvertDType(weights.type_, &ttype));
+  CHECK_EQ_TYPE(ttype, dtype);  // Cast to int for error message
+
+  // Check scale mode
+  auto dims_w = weights.shape_;
+  auto dims_t = tensor->getDimensions();
+
+  // Default to channel-wise
+  auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
+
+  if (weights.count() == 1) {
+    VLOG(2) << "UNIFORM";
+    scale_mode = nvinfer1::ScaleMode::kUNIFORM;
+  } else {
+    // No broadcasting on Batch dimension;
+    assert(dims_w.d[0] == 1);
+
+    // Broadcasting on Channel dimension only allowed in kUNIFORM
+    assert(dims_w.d[1] == dims_t.d[0]);
+    assert(dims_w.nbDims == dims_t.nbDims);
+
+    // Default is element;
+    for (int i = 2; i < dims_w.nbDims; i++) {
+      if (dims_w.d[i] != dims_t.d[i - 1]) {
+        scale_mode = nvinfer1::ScaleMode::kCHANNEL;
+        break;
+      }
+    }
+    if (scale_mode == nvinfer1::ScaleMode::kELEMENTWISE) {
+      scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
+      for (int i = 2; i < dims_w.nbDims; i++) {
+        if (dims_w.d[i] != 1)
+          return tensorflow::errors::InvalidArgument(
+              "Weight shape not compatible at, " + node_def.name());
+      }
+    }
+  }
+
+  // Prepare weights
+  TRT_ShapedWeights shift_weights(weights.type_);
+  TRT_ShapedWeights scale_weights(weights.type_);
+  TRT_ShapedWeights power_weights(weights.type_);
+
+  // Maybe I should do a switch
+  if (node_def.op() == "Sub") {
+    TRT_ShapedWeights neg_weights = ctx.get_temp_weights_like(weights);
+    LambdaFactory unary_op;
+    unary_op.op = LambdaFactory::OP_CATEGORY::NEG;
+    TF_RETURN_IF_ERROR(UnaryCompute(weights, &neg_weights, unary_op));
+    shift_weights = neg_weights;
+  } else if (node_def.op() == "Mul") {
+    scale_weights = weights;
+  } else if (node_def.op() == "Add") {
+    shift_weights = weights;
+  } else {
+    return tensorflow::errors::Unimplemented("Binary op not supported: " +
+                                             node_def.op());
+  }
+
+  nvinfer1::IScaleLayer* layer = ctx.network()->addScale(
+      *const_cast<nvinfer1::ITensor*>(tensor), scale_mode, shift_weights,
+      scale_weights, power_weights);
+
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+
+  // Pass the output
+  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status BinaryTensorOpTensor(
+    Converter& ctx, const tensorflow::NodeDef& node_def,
+    const nvinfer1::ITensor* tensor_l, const nvinfer1::ITensor* tensor_r,
+    std::vector<TRT_TensorOrWeights>* outputs) {
+  static const std::unordered_map<string, nvinfer1::ElementWiseOperation> ops{
+      {"Add", nvinfer1::ElementWiseOperation::kSUM},
+      {"Mul", nvinfer1::ElementWiseOperation::kPROD},
+      // {"max", nvinfer1::ElementWiseOperation::kMAX},
+      // {"min", nvinfer1::ElementWiseOperation::kMIN},
+      {"Sub", nvinfer1::ElementWiseOperation::kSUB},
+      {"Div", nvinfer1::ElementWiseOperation::kDIV},
+  };
+
+  // FIXME assume type matches input weights
+  // Get trt type & shape
+  TFAttrs attrs(node_def);
+  // Maybe this part has to be moved into the block of rsqrt later
+  nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("T");
+
+  // Check type consistency
+  CHECK_EQ_TYPE(tensor_l->getType(), dtype);
+  CHECK_EQ_TYPE(tensor_r->getType(), dtype);
+  auto op_pair = ops.find(node_def.op());
+  if (op_pair == ops.end())
+    return tensorflow::errors::Unimplemented("binary op: " + node_def.op() +
+                                             " not supported at: " +
+                                             node_def.name());
+
+  nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
+      *const_cast<nvinfer1::ITensor*>(tensor_l),
+      *const_cast<nvinfer1::ITensor*>(tensor_r), op_pair->second);
+
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+
+  // Pass the output
+  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertPlaceholder(
+    Converter& ctx, const tensorflow::NodeDef& node_def,
+    std::vector<TRT_TensorOrWeights> const& inputs,
+    std::vector<TRT_TensorOrWeights>* outputs) {
+  VLOG(2) << "Placeholder should have been replace already";
+  return tensorflow::errors::Unimplemented(", cannot convert Placeholder op");
+  // OK this make sense since we are supposed to replace it with input
+  TFAttrs attrs(node_def);
+  nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("dtype");
+  nvinfer1::Dims dims = attrs.get<nvinfer1::Dims>("shape");
+
+  dims.nbDims--;
+  for (int i = 0; i < dims.nbDims; i++) dims.d[i] = dims.d[i + 1];
+
+  nvinfer1::ITensor* output =
+      ctx.network()->addInput(node_def.name().c_str(), dtype, dims);
+  if (!output) {
+    return tensorflow::errors::InvalidArgument("Failed to create Input layer");
+  }
+  outputs->push_back(TRT_TensorOrWeights(output));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertConv2D(Converter& ctx,
+                                 const tensorflow::NodeDef& node_def,
+                                 const std::vector<TRT_TensorOrWeights>& inputs,
+                                 std::vector<TRT_TensorOrWeights>* outputs) {
+  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
+  // TODO(jie): handle NHWC/NCHW transpose;
+  TRT_ShapedWeights weights_rsck = inputs.at(1).weights();
+  TRT_ShapedWeights weights = ctx.get_temp_weights_like(weights_rsck);
+  ReorderRSCKToKCRS(weights_rsck, &weights);
+  TRT_ShapedWeights biases(weights.type_);
+  int noutput = weights.shape_.d[0];
+  nvinfer1::DimsHW kernel_size;
+  kernel_size.h() = weights.shape_.d[2];
+  kernel_size.w() = weights.shape_.d[3];
+  TFAttrs attrs(node_def);
+
+  int h_index = 2;
+  int w_index = 3;
+  auto data_format = attrs.get<string>("data_format");
+  if (data_format == "NHWC") {
+    tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
+                                 {0, 3, 1, 2});
+    h_index = 1;
+    w_index = 2;
+    // TODO(jie): transpose it
+  }
+
+  // TODO(jie): stride. (NHWC/NCHW)
+  auto tf_stride = attrs.get<std::vector<int>>("strides");
+  nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
+
+  auto tensor_dim = tensor->getDimensions();
+  std::vector<std::pair<int, int>> padding;
+  // TODO(jie): padding.
+  if (attrs.get<string>("padding") == "SAME") {
+    // This is NCHW tensor with no batch dimension.
+    //  1 -> h
+    //  2 -> w
+    padding = CreateSamePadding(
+        stride, kernel_size,
+        {static_cast<int>(tensor_dim.d[1]), static_cast<int>(tensor_dim.d[2])});
+  } else {
+    padding = {{0, 0}, {0, 0}};
+  }
+
+  if (padding[0].first != padding[0].second ||
+      padding[1].first != padding[1].second) {
+    // TODO(jie): handle asymmetric padding
+    VLOG(2) << "Padding!!!: " << padding[0].first << padding[0].second
+            << padding[1].first << padding[1].second;
+
+    auto dim_before = tensor->getDimensions();
+    VLOG(2) << "TENSOR before: " << dim_before.d[0] << ", " << dim_before.d[1]
+            << dim_before.d[2] << ", " << dim_before.d[3];
+    auto pad_layer = ctx.network()->addPadding(
+        *const_cast<nvinfer1::ITensor*>(tensor),
+        nvinfer1::DimsHW(padding[0].first, padding[1].first),
+        nvinfer1::DimsHW(padding[0].second, padding[1].second));
+    padding = {{0, 0}, {0, 0}};
+    tensor = pad_layer->getOutput(0);
+    auto dim_after = tensor->getDimensions();
+    VLOG(2) << "TENSOR after: " << dim_after.d[0] << ", " << dim_after.d[1]
+            << dim_after.d[2] << ", " << dim_after.d[3];
+  }
+
+  nvinfer1::IConvolutionLayer* layer =
+      ctx.network()->addConvolution(*const_cast<nvinfer1::ITensor*>(tensor),
+                                    noutput, kernel_size, weights, biases);
+
+  layer->setStride(stride);
+  layer->setPadding({padding[0].first, padding[1].first});
+  layer->setName(node_def.name().c_str());
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+
+  auto dim_after = output_tensor->getDimensions();
+  VLOG(2) << "TENSOR out: " << dim_after.d[0] << ", " << dim_after.d[1]
+          << dim_after.d[2] << ", " << dim_after.d[3];
+
+  if (data_format == "NHWC") {
+    // TODO(jie): transpose it back!
+    output_tensor = ctx.TransposeTensor(output_tensor, {0, 2, 3, 1});
+  } else {
+    VLOG(2) << "NCHW !!!!";
+  }
+  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertPool(Converter& ctx,
+                               const tensorflow::NodeDef& node_def,
+                               std::vector<TRT_TensorOrWeights> const& inputs,
+                               std::vector<TRT_TensorOrWeights>* outputs) {
+  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
+  TFAttrs attrs(node_def);
+
+  int h_index = 2;
+  int w_index = 3;
+  auto data_format = attrs.get<string>("data_format");
+  if (data_format == "NHWC") {
+    h_index = 1;
+    w_index = 2;
+    tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
+                                 {0, 3, 1, 2});
+  } else {
+    VLOG(2) << "NCHW !!!!";
+  }
+  nvinfer1::PoolingType type;
+  // TODO(jie): support other pooling type
+  if (node_def.op() == "MaxPool")
+    type = nvinfer1::PoolingType::kMAX;
+  else
+    return tensorflow::errors::Unimplemented("Only supports Max pool");
+
+  // TODO(jie): NCHW
+  auto tf_stride = attrs.get<std::vector<int>>("strides");
+  nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
+
+  auto tf_kernel = attrs.get<std::vector<int>>("ksize");
+  nvinfer1::DimsHW ksize(tf_kernel[h_index], tf_kernel[w_index]);
+
+  auto tensor_dim = tensor->getDimensions();
+  std::vector<std::pair<int, int>> padding;
+  // TODO(jie): padding.
+  if (attrs.get<string>("padding") == "SAME") {
+    // This is NCHW tensor with no batch dimension.
+    //  1 -> h
+    //  2 -> w
+    padding = CreateSamePadding(
+        stride, ksize,
+        {static_cast<int>(tensor_dim.d[1]), static_cast<int>(tensor_dim.d[2])});
+  } else if (attrs.get<string>("padding") == "VALID") {
+    // No padding for valid padding here
+    VLOG(2) << "No padding added for VALID padding in pool" << node_def.name();
+    padding = {{0, 0}, {0, 0}};
+  } else {
+    return tensorflow::errors::Unimplemented(
+        "Current MaxPool cannot support padding other than SAME");
+  }
+
+  if (padding[0].first != padding[0].second ||
+      padding[1].first != padding[1].second) {
+    // TODO(jie): handle asymmetric padding
+    VLOG(2) << "Padding!!!: " << padding[0].first << padding[0].second
+            << padding[1].first << padding[1].second;
+    auto pad_layer = ctx.network()->addPadding(
+        *const_cast<nvinfer1::ITensor*>(tensor),
+        nvinfer1::DimsHW(padding[0].first, padding[1].first),
+        nvinfer1::DimsHW(padding[0].second, padding[1].second));
+    padding = {{0, 0}, {0, 0}};
+    tensor = pad_layer->getOutput(0);
+  }
+
+  nvinfer1::IPoolingLayer* layer = ctx.network()->addPooling(
+      *const_cast<nvinfer1::ITensor*>(tensor), type, ksize);
+
+  layer->setStride(stride);
+  layer->setPadding({padding[0].first, padding[1].first});
+  layer->setName(node_def.name().c_str());
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+
+  if (data_format == "NHWC") {
+    // TODO(jie): transpose it back!
+    output_tensor = ctx.TransposeTensor(output_tensor, {0, 2, 3, 1});
+  } else {
+    VLOG(2) << "NCHW !!!!";
+  }
+  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertActivation(
+    Converter& ctx, const tensorflow::NodeDef& node_def,
+    std::vector<TRT_TensorOrWeights> const& inputs,
+    std::vector<TRT_TensorOrWeights>* outputs) {
+  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
+  nvinfer1::IActivationLayer* layer = ctx.network()->addActivation(
+      *const_cast<nvinfer1::ITensor*>(tensor), nvinfer1::ActivationType::kRELU);
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertScale(Converter& ctx,
+                                const tensorflow::NodeDef& node_def,
+                                std::vector<TRT_TensorOrWeights> const& inputs,
+                                std::vector<TRT_TensorOrWeights>* outputs) {
+  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
+      !inputs.at(1).is_weights())
+    return tensorflow::errors::Unimplemented(
+        "Only supports tensor op weight for now, at " + node_def.name());
+  // Implement tensor binaryOp weight [channel wise] for now;
+  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
+
+  // TODO(jie): handle NHWC/NCHW transpose;
+  TRT_ShapedWeights weights = inputs.at(1).weights();
+  TRT_ShapedWeights empty_weights(weights.type_);
+
+  TFAttrs attrs(node_def);
+
+  // Transpose NHWC
+  auto data_format = attrs.get<string>("data_format");
+  if (data_format == "NHWC") {
+    tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
+                                 {0, 3, 1, 2});
+    // TODO(jie): transpose it
+  } else {
+    VLOG(2) << "NCHW !!!!";
+  }
+  nvinfer1::IScaleLayer* layer = ctx.network()->addScale(
+      *const_cast<nvinfer1::ITensor*>(tensor), nvinfer1::ScaleMode::kCHANNEL,
+      weights, empty_weights, empty_weights);
+
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+  if (data_format == "NHWC") {
+    // TODO(jie): transpose it back!
+    output_tensor = ctx.TransposeTensor(output_tensor, {0, 2, 3, 1});
+  } else {
+    VLOG(2) << "NCHW !!!!";
+  }
+  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertConst(Converter& ctx,
+                                const tensorflow::NodeDef& node_def,
+                                std::vector<TRT_TensorOrWeights> const& inputs,
+                                std::vector<TRT_TensorOrWeights>* outputs) {
+  const auto& weights_tensor = node_def.attr().at("value").tensor();
+
+  // Get trt type & shape
+  TFAttrs attrs(node_def);
+  const tensorflow::DataType dtype = attrs.get<tensorflow::DataType>("dtype");
+
+  // Create shaped weights as output
+  tensorflow::Tensor tensor;
+  if (!tensor.FromProto(weights_tensor))
+    return tensorflow::errors::Internal("Cannot parse weight tensor proto: " +
+                                        node_def.name());
+
+  TRT_ShapedWeights weights(dtype);
+  if (!weights_tensor.float_val().empty()) {
+    VLOG(2) << "SCALAR!!!" << node_def.name();
+    nvinfer1::Dims scalar_shape;
+    if (tensor.dims() > 0) {
+      VLOG(2) << "Dimensions: " << tensor.dims();
+      weights = TRT_ShapedWeights(dtype, weights_tensor.float_val().data(),
+                                  GetTensorShape(tensor));
+    } else {
+      VLOG(2) << "Dimensions: " << tensor.dims();
+      scalar_shape.nbDims = 1;
+      scalar_shape.d[0] = 1;
+      scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
+      for (int i = 1; i < nvinfer1::Dims::MAX_DIMS; i++) {
+        scalar_shape.d[i] = 0;
+        scalar_shape.type[i] = nvinfer1::DimensionType::kSPATIAL;
+      }
+      weights = TRT_ShapedWeights(dtype, weights_tensor.float_val().data(),
+                                  scalar_shape);
+    }
+  } else if (!weights_tensor.tensor_content().empty()) {
+    VLOG(2) << "TENSOR!!!" << node_def.name();
+    const auto& content = weights_tensor.tensor_content();
+
+    weights = ctx.get_temp_weights(dtype, GetTensorShape(tensor));
+    if (content.size() > 0) {
+      const int dtype_size = tensorflow::DataTypeSize(dtype);
+      CHECK_EQ(0, content.size() % dtype_size)
+          << "Tensor content size (" << content.size()
+          << ") is not a multiple of " << dtype_size;
+      port::CopyToArray(
+          content, static_cast<char*>(const_cast<void*>(weights.GetValues())));
+    }
+  } else {
+    return tensorflow::errors::Unimplemented(
+        "Not supported constant type, at " + node_def.name());
+  }
+  // Pass the output
+  outputs->push_back(TRT_TensorOrWeights(weights));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertIdentity(
+    Converter& ctx, const tensorflow::NodeDef& node_def,
+    std::vector<TRT_TensorOrWeights> const& inputs,
+    std::vector<TRT_TensorOrWeights>* outputs) {
+  outputs->push_back(inputs.at(0));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertBinary(Converter& ctx,
+                                 const tensorflow::NodeDef& node_def,
+                                 std::vector<TRT_TensorOrWeights> const& inputs,
+                                 std::vector<TRT_TensorOrWeights>* outputs) {
+  if (inputs.size() != 2)
+    return tensorflow::errors::FailedPrecondition(
+        "Binary ops require two tensor input, at " + node_def.name());
+
+  if (inputs.at(0).is_weights() && inputs.at(1).is_weights())
+    return ConstantFoldBinary(ctx, node_def, inputs, outputs);
+
+  if (inputs.at(0).is_tensor() && inputs.at(1).is_weights())
+    return BinaryTensorOpWeight(ctx, node_def, inputs.at(0).tensor(),
+                                inputs.at(1).weights(), outputs);
+
+  if (inputs.at(0).is_weights() && inputs.at(1).is_tensor())
+    return BinaryTensorOpWeight(ctx, node_def, inputs.at(1).tensor(),
+                                inputs.at(0).weights(), outputs);
+
+  if (inputs.at(0).is_tensor() && inputs.at(1).is_tensor())
+    return BinaryTensorOpTensor(ctx, node_def, inputs.at(0).tensor(),
+                                inputs.at(1).tensor(), outputs);
+
+  return tensorflow::errors::Unknown("Binary op input error, at " +
+                                     node_def.name());
+}
+
+tensorflow::Status ConvertUnary(Converter& ctx,
+                                const tensorflow::NodeDef& node_def,
+                                std::vector<TRT_TensorOrWeights> const& inputs,
+                                std::vector<TRT_TensorOrWeights>* outputs) {
+  if (inputs.size() != 1)
+    return tensorflow::errors::FailedPrecondition(
+        "Unary ops require single tensor input, at " + node_def.name());
+
+  if (inputs.at(0).is_weights())
+    return ConstantFoldUnary(ctx, node_def, inputs, outputs);
+  else if (inputs.at(0).is_tensor())
+    return tensorflow::errors::Unimplemented(
+        "Unary op for tensor not supported, at " + node_def.name());
+
+  return tensorflow::errors::Unknown("Binary op input error, at " +
+                                     node_def.name());
+}
+
+tensorflow::Status ConvertReduce(Converter& ctx,
+                                 const tensorflow::NodeDef& node_def,
+                                 std::vector<TRT_TensorOrWeights> const& inputs,
+                                 std::vector<TRT_TensorOrWeights>* outputs) {
+  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
+      !inputs.at(1).is_weights())
+    return tensorflow::errors::InvalidArgument(
+        "Input expects tensor and weights, at" + node_def.name());
+
+  // Implement tensor binaryOp weight [channel wise] for now;
+  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
+  auto dims = tensor->getDimensions();
+  // Restore implicit batch dimension
+  int nb_dims = dims.nbDims + 1;
+
+  TRT_ShapedWeights index_list = inputs.at(1).weights();
+
+  TFAttrs attrs(node_def);
+  // TODO(jie): handle data type.
+  // Index type here is done through TF type, so I can leverage their
+  // EnumToDataType for my cast
+  auto index_type = attrs.get<tensorflow::DataType>("Tidx");
+
+  // Only expect to handle INT32 as attributes for now
+  if (index_type != tensorflow::DataType::DT_INT32)
+    return tensorflow::errors::Unimplemented("Tidx supports only DT_INT32");
+  auto index_list_data =
+      static_cast<int*>(const_cast<void*>(index_list.GetValues()));
+
+  // Hack warning: have to fall back to pool layer since reduce is not in public
+  // TRT yet.
+  if (nb_dims != 4)
+    return tensorflow::errors::InvalidArgument(
+        "TRT only support reduce on 4 dimensional tensors, at" +
+        node_def.name());
+  if (index_list.count() > 2)
+    return tensorflow::errors::InvalidArgument(
+        "TRT cannot support reduce on more than 2 dimensions, at" +
+        node_def.name());
+
+  std::set<int> idx_set;
+  // We cannot operate on Channel. permutation flag used to transpose tensor
+  int permuted_index = -1;
+  for (int i = 0; i < index_list.count(); i++) {
+    if (index_list_data[i] == 0)
+      return tensorflow::errors::InvalidArgument("TRT cannot reduce at 0, at" +
+                                                 node_def.name());
+    if (index_list_data[i] == 1) permuted_index = 1;
+    idx_set.emplace(index_list_data[i]);
+  }
+
+  std::vector<int> permutation_order(nb_dims);
+  nvinfer1::DimsHW pool_kernel;
+  if (permuted_index == 1) {
+    for (int i = 2; i < nb_dims; i++) {
+      if (idx_set.count(i)) {
+        permuted_index = i;
+        break;
+      }
+    }
+    for (int i = 0; i < nb_dims; i++) permutation_order[i] = i;
+
+    permutation_order[permuted_index] = 1;
+    permutation_order[1] = permuted_index;
+
+    // Apply permutation before extracting dimension for pool_kernel
+    tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
+                                 permutation_order);
+  }
+
+  // Apply permutation before extracting dimension for pool_kernel
+  pool_kernel.d[0] = (idx_set.count(2) || permuted_index == 2) ? dims.d[1] : 1;
+  pool_kernel.d[1] = (idx_set.count(3) || permuted_index == 3) ? dims.d[2] : 1;
+
+  nvinfer1::ITensor* output_tensor;
+
+  if (node_def.op() == "Mean") {
+    nvinfer1::IPoolingLayer* layer =
+        ctx.network()->addPooling(*const_cast<nvinfer1::ITensor*>(tensor),
+                                  nvinfer1::PoolingType::kAVERAGE, pool_kernel);
+    output_tensor = layer->getOutput(0);
+  } else {
+    return tensorflow::errors::Unimplemented(
+        "Op not supported " + node_def.op() + " , at " + node_def.name());
+  }
+  if (permuted_index != -1) {
+    // Apply permutation before extracting dimension for pool_kernel
+    output_tensor = ctx.TransposeTensor(
+        const_cast<nvinfer1::ITensor*>(output_tensor), permutation_order);
+  }
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertPad(Converter& ctx,
+                              const tensorflow::NodeDef& node_def,
+                              std::vector<TRT_TensorOrWeights> const& inputs,
+                              std::vector<TRT_TensorOrWeights>* outputs) {
+  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
+      !inputs.at(1).is_weights())
+    return tensorflow::errors::InvalidArgument(
+        "Input expects tensor and weights, at" + node_def.name());
+
+  // Implement tensor binaryOp weight [channel wise] for now;
+  nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
+  auto dims = tensor->getDimensions();
+  // Restore implicit batch dimension
+  int nb_dims = dims.nbDims + 1;
+
+  TRT_ShapedWeights pads = inputs.at(1).weights();
+
+  TFAttrs attrs(node_def);
+  // Padding type here is done through TF type
+  //   so I can leverage their EnumToDataType for my cast
+  auto padding_type = attrs.get<tensorflow::DataType>("Tpaddings");
+  // TODO(jie): handle data type conversion for TRT?
+
+  if (pads.shape_.d[0] != nb_dims || pads.shape_.d[1] != 2)
+    return tensorflow::errors::InvalidArgument(
+        "Pad only supports explicit padding on 4 dimensional tensor, at " +
+        node_def.name());
+
+  // Only expect to handle INT32 as attributes for now
+  if (padding_type != tensorflow::DataType::DT_INT32)
+    return tensorflow::errors::Unimplemented(
+        "Tpaddings supports only DT_INT32");
+  auto pad_data = static_cast<int*>(const_cast<void*>(pads.GetValues()));
+
+  std::vector<int32_t> pad_index;
+  for (int i = 0; i < nb_dims; i++) {
+    if (pad_data[2 * i] != 0 || pad_data[2 * i + 1] != 0)
+      pad_index.push_back(i);
+  }
+
+  // No padding at all, we should exit
+  if (pad_index.size() == 0) {
+    outputs->push_back(inputs.at(0));
+    return tensorflow::Status::OK();
+  }
+
+  // Only supports padding on less than 2 axis GIE-2579
+  if (pad_index.size() > 2)
+    return tensorflow::errors::InvalidArgument(
+        "Padding layer does not support padding on > 2");
+
+  // Padding on batch dimension is not supported
+  if (pad_index[0] == 0)
+    return tensorflow::errors::InvalidArgument(
+        "Padding layer does not support padding on batch dimension");
+
+  // Not doing the legit thing here. ignoring padding on dim 1 and 3;
+  // TODO(jie): implement pad as uff parser
+  if (pad_index.size() == 2 && pad_index[0] == 0 && pad_index[1] == 3)
+    return tensorflow::errors::Unimplemented(
+        "Padding layer does not support padding on dimension 1 and 3 yet");
+
+  bool legit_pad = true;
+  nvinfer1::DimsHW pre_padding(0, 0);
+  nvinfer1::DimsHW post_padding(0, 0);
+
+  std::vector<int32_t> permuted_pad_index(pad_index);
+  if (pad_index[0] == 1) {
+    legit_pad = false;
+    tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
+                                 {0, 3, 2, 1});
+    permuted_pad_index[0] = 3;
+  }
+
+  for (size_t i = 0; i < pad_index.size(); i++) {
+    int index = pad_index[i];
+    if (permuted_pad_index[i] == 2) {
+      pre_padding.h() = pad_data[index * 2];
+      post_padding.h() = pad_data[index * 2 + 1];
+    } else if (permuted_pad_index[i] == 3) {
+      pre_padding.w() = pad_data[index * 2];
+      post_padding.w() = pad_data[index * 2 + 1];
+    }
+  }
+
+  nvinfer1::IPaddingLayer* layer = ctx.network()->addPadding(
+      *const_cast<nvinfer1::ITensor*>(tensor), pre_padding, post_padding);
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+
+  if (!legit_pad)
+    output_tensor = ctx.TransposeTensor(
+        const_cast<nvinfer1::ITensor*>(output_tensor), {0, 3, 2, 1});
+
+  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return tensorflow::Status::OK();
+}
+
+void Converter::register_op_converters() {
+  // vgg_16 slim implementation
+  op_registry_["Placeholder"] = ConvertPlaceholder;
+  op_registry_["Conv2D"] = ConvertConv2D;
+  op_registry_["Relu"] = ConvertActivation;
+  op_registry_["MaxPool"] = ConvertPool;
+  // This could be really handled as ConvertBinary
+  op_registry_["BiasAdd"] = ConvertScale;
+  op_registry_["Const"] = ConvertConst;
+  // op_registry_["MatMul"] = ConvertFullyConnected;  // Not used in vgg
+  // TODO(ben,jie): this is a temp hack.
+  op_registry_["Identity"] = ConvertIdentity;  // Identity should be removed
+  // op_registry_["AvgPool"] = ConvertPool;
+
+  // resnet_50_v1 slim implementation
+  op_registry_["Add"] = ConvertBinary;
+  op_registry_["Mul"] = ConvertBinary;
+  op_registry_["Sub"] = ConvertBinary;
+  op_registry_["Rsqrt"] = ConvertUnary;
+  op_registry_["Mean"] = ConvertReduce;
+  op_registry_["Pad"] = ConvertPad;
+  // TODO(ben,jie): Add more ops
+}
+
+}  // namespace
+
+tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
+    const tensorflow::Graph& graph, const std::set<int>& subgraph_node_ids,
+    const std::vector<std::pair<int, int>>& input_inds,
+    const std::vector<std::pair<int, int>>& output_inds, size_t max_batch_size,
+    size_t max_workspace_size_bytes,
+    const tensorflow::grappler::GraphProperties& graph_properties,
+    tensorflow::NodeDef* trt_node) {
+  // Visit nodes in reverse topological order and construct the TRT network.
+
+  // Toposort
+  std::vector<tensorflow::Node*> order_vec;
+  tensorflow::GetPostOrder(graph, &order_vec);
+  // Select just the subgraph
+  std::list<tensorflow::Node*> order;
+  for (tensorflow::Node* node : order_vec) {
+    if (subgraph_node_ids.count(node->id())) {
+      // We want topological order to contstruct the
+      // network layer by layer
+      order.push_front(node);
+    }
+  }
+  // Topological order is needed to build TRT network
+
+  tensorflow::tensorrt::Logger trt_logger;
+
+  auto trt_builder = infer_object(nvinfer1::createInferBuilder(trt_logger));
+  if (!trt_builder) {
+    return tensorflow::errors::Internal(
+        "Failed to create TensorRT builder object");
+  }
+
+  auto trt_network = infer_object(trt_builder->createNetwork());
+  if (!trt_network) {
+    return tensorflow::errors::Internal(
+        "Failed to create TensorRT network object");
+  }
+
+  // Build the network
+  Converter converter(trt_network.get());
+
+  std::vector<string> input_names;
+  std::vector<tensorflow::DataType> input_dtypes;
+  for (std::pair<int, int> const& input : input_inds) {
+    int node_id = input.first;
+    int output_idx = input.second;
+    tensorflow::Node* node = graph.FindNodeId(node_id);
+    auto node_name = node->name();
+    input_names.push_back(node_name);  // Insert original node name without port
+    // TODO(jie): alternative :)
+    if (!graph_properties.HasOutputProperties(node_name))
+      return tensorflow::errors::Internal("Failed to find input node: " +
+                                          node_name);
+
+    auto op_info_vec = graph_properties.GetOutputProperties(node_name);
+    if (static_cast<int>(op_info_vec.size()) < output_idx)
+      return tensorflow::errors::Internal(
+          "Accessing output index of: " + std::to_string(output_idx) +
+          ", at node: " + node_name + " with output entry from shape_map: " +
+          std::to_string(op_info_vec.size()));
+
+    auto op_info = op_info_vec.at(output_idx);
+
+    tensorflow::DataType tf_dtype = op_info.dtype();
+    input_dtypes.push_back(tf_dtype);
+
+    nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
+    TF_CHECK_OK(ConvertDType(tf_dtype, &dtype));
+
+    VLOG(2) << "Accessing output index of: " << std::to_string(output_idx)
+            << ", at node: " << node_name
+            << " with output entry from shape_map: "
+            << std::to_string(op_info_vec.size());
+
+    // TODO(ben,jie): update TRT input format/dimension
+    nvinfer1::DimsCHW input_dim_psuedo_chw;
+    for (int i = 0; i < 3; i++) input_dim_psuedo_chw.d[i] = 1;
+
+    for (int i = 1; i < op_info.shape().dim_size(); i++) {
+      VLOG(2) << "dimension: " << i
+              << " , size: " << op_info.shape().dim(i).size();
+      input_dim_psuedo_chw.d[i - 1] = op_info.shape().dim(i).size();
+    }
+
+    // TODO(ben,jie): proper way to restore input tensor name?
+    auto input_tensor_name = node_name;
+    if (output_idx != 0)
+      input_tensor_name = node_name + ":" + std::to_string(output_idx);
+
+    nvinfer1::ITensor* input_tensor = converter.network()->addInput(
+        input_tensor_name.c_str(), dtype, input_dim_psuedo_chw);
+
+    if (!input_tensor)
+      return tensorflow::errors::InvalidArgument(
+          "Failed to create Input layer");
+    VLOG(2) << "Input tensor name :" << input_tensor_name;
+
+    if (!converter.insert_input_tensor(input_tensor_name, input_tensor))
+      return tensorflow::errors::AlreadyExists(
+          "Output tensor already exists for op: " + input_tensor_name);
+  }
+
+  VLOG(2) << "Finished sorting";
+
+  for (const tensorflow::Node* node : order) {
+    const tensorflow::NodeDef& node_def = node->def();
+    VLOG(2) << "Converting node: " << node_def.name() << " , " << node_def.op();
+    TF_RETURN_IF_ERROR(converter.convert_node(node_def));
+  }
+
+  VLOG(2) << "Finished conversion";
+
+  // Gather output metadata
+  std::vector<string> output_names;
+  std::vector<tensorflow::DataType> output_dtypes;
+  for (std::pair<int, int> const& output : output_inds) {
+    int node_id = output.first;
+    int output_idx = output.second;
+    tensorflow::Node* node = graph.FindNodeId(node_id);
+    string op_name = node->name();
+    string tensor_name = op_name;
+    if (output_idx != 0)
+      tensor_name = tensor_name + ":" + std::to_string(output_idx);
+    VLOG(2) << "Output tensor name: " << tensor_name;
+    output_names.push_back(tensor_name);
+    auto tensor_or_weights = converter.get_tensor(tensor_name);
+    if (!tensor_or_weights.is_tensor()) {
+      return tensorflow::errors::InvalidArgument(
+          "Output node is weights not tensor");
+    }
+    nvinfer1::ITensor* tensor = tensor_or_weights.tensor();
+    if (!tensor) {
+      return tensorflow::errors::NotFound("Output tensor not found: " +
+                                          tensor_name);
+    }
+    converter.network()->markOutput(*tensor);
+    tensorflow::DataType tf_dtype = node->output_type(output_idx);
+    output_dtypes.push_back(tf_dtype);
+    nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT;
+    TF_RETURN_IF_ERROR(ConvertDType(tf_dtype, &trt_dtype));
+    tensor->setType(trt_dtype);
+  }
+
+  VLOG(2) << "Finished output";
+  // TODO(jie): static_id is not thread safe.
+  static int static_id = 0;
+
+  // Build the engine
+  trt_builder->setMaxBatchSize(max_batch_size);
+  trt_builder->setMaxWorkspaceSize(max_workspace_size_bytes);
+  VLOG(0) << "Starting build engine " << static_id;
+  // TODO(ben,jie): half2 and int8 mode support
+  string engine_plan_string;
+  {
+    auto trt_engine =
+        infer_object(trt_builder->buildCudaEngine(*converter.network()));
+    VLOG(0) << "Built network";
+    auto engine_plan = infer_object(trt_engine->serialize());
+    VLOG(0) << "Serialized engine";
+    const char* engine_plan_data =
+        static_cast<const char*>(engine_plan->data());
+    engine_plan_string =
+        string(engine_plan_data, engine_plan_data + engine_plan->size());
+  }
+
+  VLOG(0) << "Finished engine";
+
+  // Build the TRT op
+  // TODO(sami,ben,jie): proper naming!
+  tensorflow::NodeDefBuilder op_builder(
+      tensorflow::strings::StrCat("my_trt_op", static_id++), "TRTEngineOp");
+  std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
+  for (size_t i = 0; i < input_names.size(); ++i) {
+    int output_idx = input_inds.at(i).second;
+    // We wired up the input here already, it is redundant to do it again in
+    // ConvertSubGraphToTensorRT(convert_graph.cc)
+    auto incoming_edge = tensorflow::NodeDefBuilder::NodeOut(
+        input_names.at(i), output_idx, input_dtypes.at(i));
+    income_edges.push_back(incoming_edge);
+  }
+  tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
+      income_edges);
+  op_builder.Input(input_list);
+
+  VLOG(0) << "Finished op preparation";
+
+  auto status = op_builder.Attr("serialized_engine", engine_plan_string)
+                    .Attr("input_nodes", input_names)
+                    .Attr("output_nodes", output_names)
+                    .Attr("OutT", output_dtypes)
+                    .Finalize(trt_node);
+
+  VLOG(0) << status.ToString() << " finished op building";
+
+  return tensorflow::Status::OK();
+}
+
+}  // namespace convert
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
new file mode 100644
index 0000000000000000000000000000000000000000..2e7fd19566e1ed3719b932c7443a9c3f652b2d3e
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -0,0 +1,52 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_
+
+#include <set>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/lib/core/status.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+namespace convert {
+
+tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
+    const tensorflow::Graph& graph, const std::set<int>& subgraph_node_ids,
+    const std::vector<std::pair<int, int>>&
+        input_inds,  // {node_id, output_idx}
+    const std::vector<std::pair<int, int>>&
+        output_inds,  // {node_id, output_idx}
+    size_t max_batch_size, size_t max_workspace_size_bytes,
+    const tensorflow::grappler::GraphProperties& graph_prop,
+    tensorflow::NodeDef* trt_node);
+
+}  // namespace convert
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8efdf63ebebc4d7a199c60635ca64348d2b30505
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -0,0 +1,140 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h"
+
+#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/platform/types.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "cuda/include/cuda_runtime_api.h"
+
+namespace tensorflow {
+namespace tensorrt {
+static ::tensorflow::tensorrt::Logger logger;
+
+TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : OpKernel(context) {
+  // read serialized_engine
+  string serialized_engine;
+  OP_REQUIRES_OK(context,
+                 context->GetAttr("serialized_engine", &serialized_engine));
+
+  // register input output node name in trt_sub_graph
+  OP_REQUIRES_OK(context, context->GetAttr("input_nodes", &input_nodes_));
+  OP_REQUIRES_OK(context, context->GetAttr("output_nodes", &output_nodes_));
+
+  // TODO(samikama) runtime should be taken from a resourcemanager as well.
+  // Only engine should be in the op and context and runtime should be taken
+  // from resourcemanager
+  nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(logger);
+  trt_engine_ptr_.reset(infer->deserializeCudaEngine(
+      serialized_engine.c_str(), serialized_engine.size(), nullptr));
+
+  trt_execution_context_ptr_.reset(trt_engine_ptr_->createExecutionContext());
+  // Runtime is safe to delete after engine creation
+  infer->destroy();
+}
+
+void TRTEngineOp::Compute(OpKernelContext* context) {
+  int num_binding = context->num_inputs() + context->num_outputs();
+  std::vector<void*> buffers(num_binding);
+
+  size_t binding_index;
+  int num_batch = 0;
+  bool valid = true;
+  for (int i = 0; i < context->num_inputs(); i++) {
+    // Grab the input tensor
+    binding_index = trt_engine_ptr_->getBindingIndex(input_nodes_[i].c_str());
+
+    const Tensor& input_tensor = context->input(i);
+    const TensorShape& input_shape = input_tensor.shape();
+    if (i == 0) {
+      num_batch = input_shape.dim_size(0);
+    } else if (num_batch != input_shape.dim_size(0)) {
+      valid = false;
+      break;
+    }
+    switch (trt_engine_ptr_->getBindingDataType(binding_index)) {
+      case nvinfer1::DataType::kFLOAT:
+        buffers[binding_index] = (void*)(input_tensor.flat<float>().data());
+        break;
+      case nvinfer1::DataType::kHALF:
+        LOG(FATAL) << "half size is not supported yet!";
+        break;
+      case nvinfer1::DataType::kINT8:
+        LOG(FATAL) << "int8 is not supported yet!";
+        break;
+    }
+  }
+
+  // Might want a different way to inform the user of batch size inconsistency
+  if (!valid) LOG(WARNING) << "input data inconsistent batch size";
+
+  for (int i = 0; i < static_cast<int>(output_nodes_.size()); i++) {
+    // This is bad that we have to reallocate output buffer every run.
+    // Create an output tensor
+    binding_index = trt_engine_ptr_->getBindingIndex(output_nodes_[i].c_str());
+    Tensor* output_tensor = nullptr;
+
+    TensorShape output_shape;
+    if (binding_index != -1) {
+      auto dims = trt_engine_ptr_->getBindingDimensions(binding_index);
+      std::vector<int> trt_shape(dims.nbDims + 1);
+      trt_shape[0] = num_batch;
+      for (int j = 0; j < dims.nbDims; j++) trt_shape[j + 1] = dims.d[j];
+      OP_REQUIRES_OK(context,
+                     TensorShapeUtils::MakeShape(
+                         trt_shape.data(), trt_shape.size(), &output_shape));
+    } else {
+      LOG(FATAL) << "output node not found, at " << output_nodes_[i];
+      break;
+    }
+
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(i, output_shape, &output_tensor));
+    switch (trt_engine_ptr_->getBindingDataType(binding_index)) {
+      case nvinfer1::DataType::kFLOAT:
+        buffers[binding_index] =
+            reinterpret_cast<void*>(output_tensor->flat<float>().data());
+        break;
+      case nvinfer1::DataType::kHALF:
+        LOG(FATAL) << "half size is not supported yet!";
+        break;
+      case nvinfer1::DataType::kINT8:
+        LOG(FATAL) << "int8 is not supported yet!";
+        break;
+    }
+  }
+  // copied from cuda_kernel_helper since it seems only valid in *.cu.cc files
+  const cudaStream_t* stream = CHECK_NOTNULL(
+      reinterpret_cast<const cudaStream_t*>(context->op_device_context()
+                                                ->stream()
+                                                ->implementation()
+                                                ->CudaStreamMemberHack()));
+
+  // execution handled by TF since we are getting stream from TF.
+  // it is safe for CPU pointer array (buffers) to go out of scope after enqueue
+  trt_execution_context_ptr_->enqueue(num_batch, &buffers[0], *stream, nullptr);
+}
+
+REGISTER_KERNEL_BUILDER(Name("TRTEngineOp").Device(DEVICE_GPU), TRTEngineOp);
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..0964b4b18a781143fdd7884a2904321b9d14e354
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
@@ -0,0 +1,62 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "cuda/include/cuda_runtime_api.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+class Logger;
+
+class TRTEngineOp : public OpKernel {
+ public:
+  explicit TRTEngineOp(OpKernelConstruction* context);
+
+  void Compute(OpKernelContext* context) override;
+
+ private:
+  template <typename T>
+  struct Destroyer {
+    void operator()(T* d) { d->destroy(); }
+  };
+
+  template <typename T>
+  using destroyed_ptr = std::unique_ptr<T, Destroyer<T>>;
+  destroyed_ptr<nvinfer1::ICudaEngine> trt_engine_ptr_;
+  // TODO(samikama): context should go to a resource manager!
+  destroyed_ptr<nvinfer1::IExecutionContext> trt_execution_context_ptr_;
+
+  std::vector<string> input_nodes_;
+  std::vector<string> output_nodes_;
+};
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.cc b/tensorflow/contrib/tensorrt/log/trt_logger.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7add8cb8b3d2a04206ee4174e79a1a4b86e05f30
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.cc
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+// Use TF logging for TensorRT informations
+void Logger::log(Severity severity, const char* msg) {
+  // Suppress info-level messages
+  switch (severity) {
+    case Severity::kINFO: {  // Mark TRT info messages as debug!
+      VLOG(2) << msg;
+      break;
+    }
+    case Severity::kWARNING: {
+      LOG(WARNING) << msg;
+      break;
+    }
+    case Severity::kERROR: {
+      LOG(ERROR) << msg;
+      break;
+    }
+    case Severity::kINTERNAL_ERROR: {
+      LOG(FATAL) << msg;
+      break;
+    }
+    // This is useless for now. But would catch it in future if enum changes. It
+    // is always good to have default case!
+    default: {
+      LOG(FATAL) << name_ << "Got unknown severity level from TRT " << msg;
+      break;
+    }
+  }
+}
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.h b/tensorflow/contrib/tensorrt/log/trt_logger.h
new file mode 100644
index 0000000000000000000000000000000000000000..d71f66b933a8068a6276a7e070755e0075543bb5
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/log/trt_logger.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
+
+#include "tensorflow/core/platform/types.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+// Logger for GIE info/warning/errors
+class Logger : public nvinfer1::ILogger {
+ private:
+  void log(nvinfer1::ILogger::Severity severity, const char* msg) override;
+
+  string name_;
+};
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
diff --git a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..079d73f7bec3f9a9740e455b31a259cec287f849
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+
+namespace shape_inference {
+extern Status TRTEngineOpShapeInference(InferenceContext* c);
+}
+
+REGISTER_OP("TRTEngineOp")
+    .Attr("serialized_engine: string")
+    .Attr("input_nodes: list(string)")
+    .Attr("output_nodes: list(string)")
+    .Attr("InT: list({float32})")
+    .Attr("OutT: list({float32})")
+    .Input("in_tensor: InT")
+    .Output("out_tensor: OutT")
+    .SetShapeFn(shape_inference::TRTEngineOpShapeInference);
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/python/__init__.py b/tensorflow/contrib/tensorrt/python/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e050a768ce97af1fc1d2c85cb52640b4c6a6a97
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/python/__init__.py
@@ -0,0 +1,24 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Exposes the python wrapper for TensorRT graph transforms."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import,line-too-long
+from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
+from tensorflow.contrib.tensorrt.python.trt_convert import create_inference_graph
+# pylint: enable=unused-import,line-too-long
diff --git a/tensorflow/contrib/tensorrt/python/ops/trt_engine_op.py b/tensorflow/contrib/tensorrt/python/ops/trt_engine_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..31a313182be9a2fca7457a539670dbc911ccabb1
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/python/ops/trt_engine_op.py
@@ -0,0 +1,34 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Exposes the Python wrapper of TRTEngineOp."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import platform
+
+if platform.system() != "Windows":
+  # pylint: disable=wildcard-import,unused-import,g-import-not-at-top
+  from tensorflow.contrib.tensorrt.ops.gen_trt_engine_op import *
+
+  from tensorflow.contrib.util import loader
+  from tensorflow.python.platform import resource_loader
+  # pylint: enable=wildcard-import,unused-import,g-import-not-at-top
+
+  _trt_engine_op = loader.load_op_library(
+      resource_loader.get_path_to_datafile("_trt_engine_op.so"))
+else:
+  raise RuntimeError("Windows platforms are not supported")
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
new file mode 100644
index 0000000000000000000000000000000000000000..9454862f857ab743712ce409ff007de55e72a68e
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -0,0 +1,103 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Exposes the Python wrapper conversion to trt_graph."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import,line-too-long
+import six as _six
+from tensorflow.contrib.tensorrt.wrap_conversion import trt_convert
+from tensorflow.core.framework import graph_pb2
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import errors_impl as _impl
+from tensorflow.python.framework import ops
+
+
+# TODO(skama): get outputs from session when implemented as c++
+# optimization pass
+def create_inference_graph(input_graph_def,
+                           outputs,
+                           max_batch_size=1,
+                           max_workspace_size_bytes=2 << 20):
+  """Python wrapper for the TRT transormation.
+
+
+  Args:
+    input_graph_def: GraphDef object containing a model to be transformed.
+    outputs: List of tensors or node names for the model outputs.
+    max_batch_size: max size for the input batch
+    max_workspace_size_bytes: parameter to control memory allocation (in Bytes)
+
+  Returns:
+    New GraphDef with TRTEngineOps placed in graph replacing subgraphs.
+
+  Raises:
+    RuntimeError: if the returned status message is malformed.
+  """
+
+  def py2bytes(inp):
+    return inp
+
+  def py3bytes(inp):
+    return inp.encode("utf-8", errors="surrogateescape")
+
+  def py2string(inp):
+    return inp
+
+  def py3string(inp):
+    return inp.decode("utf-8")
+
+  if _six.PY2:
+    to_bytes = py2bytes
+    to_string = py2string
+  else:
+    to_bytes = py3bytes
+    to_string = py3string
+
+  out_names = []
+  for i in outputs:
+    if isinstance(i, ops.Tensor):
+      out_names.append(to_bytes(i.name))
+    else:
+      out_names.append(to_bytes(i))
+
+  input_graph_def_str = input_graph_def.SerializeToString()
+
+  # TODO(sami): Fix this when we can return status from C++ library
+  # There is a problem with the TF internal library setup that doesn't
+  # allow us to return a status object from C++.  Thus we return a
+  # pair or strings where first one is encoded status and the second
+  # one is the transformed graphs protobuf string.
+  out = trt_convert(input_graph_def_str, out_names, max_batch_size,
+                    max_workspace_size_bytes)
+  status = to_string(out[0])
+  output_graph_def_string = out[1]
+  del input_graph_def_str  # Save some memory
+  if len(status) < 2:
+    raise _impl.UnknownError(None, None, status)
+  if status[:2] != "OK":
+    msg = status.split(";")
+    if len(msg) == 1:
+      raise RuntimeError("Status message is malformed {}".format(status))
+    # pylint: disable=protected-access
+    raise _impl._make_specific_exception(None, None, ";".join(msg[1:]),
+                                         int(msg[0]))
+    # pylint: enable=protected-access
+  output_graph_def = graph_pb2.GraphDef()
+  output_graph_def.ParseFromString(output_graph_def_string)
+  del output_graph_def_string  # Save some memory
+  return output_graph_def
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6193f0b0a13f6985d5fc8dd4c6fc09b15f72f139
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/segment/segment.cc
@@ -0,0 +1,253 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/segment/segment.h"
+
+#include <set>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/tensorrt/segment/union_find.h"
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace tensorrt {
+namespace segment {
+
+namespace {
+
+bool CanContractEdge(const tensorflow::Edge* edge,
+                     const tensorflow::Graph& graph) {
+  const tensorflow::Node* src = edge->src();
+  const tensorflow::Node* dst = edge->dst();
+
+  // Can't contract edge if doing so would cause a cycle in the
+  // graph. So, if there is a directed path from 'src' to 'dst', other
+  // than 'edge' (or any other direct edge from 'src' to 'dst'), then
+  // combining 'src' and 'dst' will cause a cycle along that path.
+  //
+  // In practice, to avoid modifying the graph and to take advantage
+  // of existing graph functions, we perform an equivalent.
+  //   1. Get all nodes incoming to 'dst', excluding 'src'
+  //   2. Reverse DFS from those nodes
+  //   3. If reverse DFS reaches 'src' then we have a cycle
+  std::vector<tensorflow::Node*> dfs_start_nodes;
+  for (tensorflow::Node* node : dst->in_nodes()) {
+    if (node != src) {
+      dfs_start_nodes.push_back(node);
+    }
+  }
+
+  bool is_cycle = false;
+  if (!dfs_start_nodes.empty()) {
+    tensorflow::ReverseDFSFrom(graph, dfs_start_nodes, {},
+                               [&is_cycle, src](tensorflow::Node* node) {
+                                 if (node == src) {
+                                   is_cycle = true;
+                                 }
+                               });
+  }
+
+  return !is_cycle;
+}
+
+void ContractEdge(tensorflow::Edge* edge, tensorflow::Graph* graph,
+                  std::vector<const tensorflow::Edge*>* remove_edges) {
+  // Transfer all inputs and outputs of 'dst' to 'src' except edges
+  // connecting the two.
+  tensorflow::Node* src = edge->src();
+  tensorflow::Node* dst = edge->dst();
+
+  // We can use '0' for input/output index because we don't need them
+  // to be accurate for the way we are using the graph.
+  std::vector<const tensorflow::Edge*> in_edges(dst->in_edges().begin(),
+                                                dst->in_edges().end());
+  for (const tensorflow::Edge* in_edge : in_edges) {
+    if (in_edge->src() != src) {
+      tensorflow::Edge* e = const_cast<tensorflow::Edge*>(in_edge);
+      if (e->src() == graph->source_node()) {
+        graph->AddEdge(e->src(), e->src_output(), src,
+                       tensorflow::Graph::kControlSlot);
+      } else {
+        graph->AddEdge(e->src(), e->src_output(), src, 0 /* input index */);
+      }
+    }
+  }
+
+  std::vector<const tensorflow::Edge*> out_edges(dst->out_edges().begin(),
+                                                 dst->out_edges().end());
+  for (const tensorflow::Edge* out_edge : out_edges) {
+    tensorflow::Edge* e = const_cast<tensorflow::Edge*>(out_edge);
+    if (e->dst() == graph->sink_node()) {
+      graph->AddEdge(src, tensorflow::Graph::kControlSlot, e->dst(),
+                     e->dst_input());
+    } else {
+      graph->AddEdge(src, 0 /* output index */, e->dst(), e->dst_input());
+    }
+  }
+
+  // Return the edges that must be removed to disconnect 'dst' from
+  // the graph. We don't actually remove 'dst' since the caller holds
+  // references to all the nodes.
+  for (const auto& in_edge : dst->in_edges()) {
+    remove_edges->push_back(in_edge);
+  }
+  for (const auto& out_edge : dst->out_edges()) {
+    remove_edges->push_back(out_edge);
+  }
+}
+
+}  // namespace
+
+tensorflow::Status SegmentGraph(
+    const tensorflow::GraphDef& gdef,
+    const std::function<bool(const tensorflow::NodeDef&)>& candidate_fn,
+    const SegmentOptions& options, SegmentNodesVector* segments) {
+  // Create a Graph representation of the GraphDef.
+  tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
+                                             gdef.library());
+  tensorflow::Graph graph(flib);
+  TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
+      tensorflow::GraphConstructorOptions(), gdef, &graph));
+
+  // tensorflow::DumpGraph("Pre-Segment", &graph);
+
+  // Use a union-find to collect the nodes that belong to the same
+  // segment. A node value of nullptr indicates that the node is not a
+  // candidate for TRT.
+  std::vector<UnionFind<tensorflow::Node*>> node_segments;
+  for (int i = 0; i < graph.num_node_ids(); ++i) {
+    tensorflow::Node* node = graph.FindNodeId(i);
+    if (options.exclude_node_list.count(node->name()) != 0 ||
+        !candidate_fn(node->def())) {
+      node = nullptr;
+    }
+    node_segments.emplace_back(node);
+  }
+
+  // The segmentation algorithm below visits nodes in reverse
+  // topological order and attempts to merge nodes along output
+  // edges. That means that subgraphs grow from the output-side of the
+  // network towards the inputs. In general this is not guaranteed to
+  // produce a globally optimal segmentation. In the future if we have
+  // a measure of how beneficial it is to include a given node in a
+  // TRT subgraph then we can revisit this algorithm to take advantage
+  // of that information.
+  std::vector<tensorflow::Node*> order;
+  tensorflow::GetPostOrder(graph, &order);
+
+  for (const tensorflow::Node* node : order) {
+    // All output nodes of 'node' have been visited...
+    VLOG(2) << "Trying node " << node->name();
+
+    // 'node' must be a TRT candidate...
+    if (node_segments[node->id()].Value() == nullptr) {
+      VLOG(2) << "... not a TRT candidate";
+      continue;
+    }
+
+    // Contract output edges to combine 'node' with output
+    // nodes. Iterate since combining two nodes may unblock other
+    // combining.
+    while (true) {
+      std::set<const tensorflow::Edge*> contract_edges;
+      for (const tensorflow::Edge* out_edge : node->out_edges()) {
+        VLOG(2) << "... out node " << out_edge->dst()->name();
+
+        // Out node must be TRT candidate...
+        if (node_segments[out_edge->dst()->id()].Value() == nullptr) {
+          VLOG(2) << "... ... not a TRT candidate";
+          continue;
+        }
+
+        if (CanContractEdge(out_edge, graph)) {
+          VLOG(2) << "... ... can contract";
+          contract_edges.insert(out_edge);
+        } else {
+          VLOG(2) << "... ... cannot contract, would form cycle";
+        }
+      }
+
+      if (contract_edges.empty()) {
+        break;
+      }
+
+      // Contract edges and collect the adjacent nodes into the same
+      // segment/subgraph.
+      while (!contract_edges.empty()) {
+        const tensorflow::Edge* contract_edge = *contract_edges.begin();
+        const tensorflow::Node* src = contract_edge->src();
+        const tensorflow::Node* dst = contract_edge->dst();
+
+        VLOG(2) << "Merge " << src->name() << " <- " << dst->name();
+        node_segments[src->id()].Merge(&node_segments[dst->id()]);
+
+        // Contracting the edge leaves disconnected graph edges.
+        // Remove these from the graph and from 'contract_edges' so we
+        // don't visit them again.
+        tensorflow::Edge* e = const_cast<tensorflow::Edge*>(contract_edge);
+        std::vector<const tensorflow::Edge*> remove_edges;
+        ContractEdge(e, &graph, &remove_edges);
+
+        for (const tensorflow::Edge* r : remove_edges) {
+          contract_edges.erase(r);
+          graph.RemoveEdge(r);
+        }
+      }
+    }
+  }
+
+  // Collect the segments/subgraphs. Each subgraph is represented by a
+  // set of the names of the nodes in that subgraph.
+  std::unordered_map<string, std::set<string>> sg_map;
+  for (auto& u : node_segments) {
+    if ((u.Value() != nullptr) && (u.ParentValue() != nullptr)) {
+      sg_map[u.ParentValue()->name()].insert(u.Value()->name());
+    }
+  }
+
+  // Convert the segments into the expected return format
+  for (const auto& itr : sg_map) {
+    const auto& segment_node_names = itr.second;
+    if (VLOG_IS_ON(1)) {
+      string s;
+      for (const auto& name : segment_node_names) {
+        s += " " + name;
+      }
+      VLOG(1) << "Segment " << segments->size() << ":" << s;
+    }
+
+    // Don't use small segments.
+    if (static_cast<int>(segment_node_names.size()) <
+        options.minimum_segment_size) {
+      VLOG(1) << "Segment " << segments->size() << " has only "
+              << segment_node_names.size() << " nodes, dropping";
+      continue;
+    }
+
+    segments->emplace_back(segment_node_names);
+  }
+
+  return tensorflow::Status::OK();
+}
+
+}  // namespace segment
+}  // namespace tensorrt
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/segment/segment.h b/tensorflow/contrib/tensorrt/segment/segment.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee6e2b3ed26cd1fabc0e952d882d549046cd9a30
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/segment/segment.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_SEGMENT_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_SEGMENT_H_
+
+#include <set>
+#include <vector>
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace tensorrt {
+namespace segment {
+
+using SegmentNodesVector = std::vector<std::set<string>>;
+
+struct SegmentOptions {
+  // Segment must contain at least this many nodes.
+  int minimum_segment_size = 2;
+  std::set<string> exclude_node_list;
+};
+
+// Get the subgraphs of a graph that can be handled by TensorRT.
+//
+// @param gdef The GraphDef describing the network
+// @param candidate_fn A function that returns true for a NodeDef if
+// that node can be handled by TensorRT.
+// @param segments Returns the TensorRT segments/subgraphs. Each entry
+// in the vector describes a subgraph by giving a set of the names of
+// all the NodeDefs in that subgraph.
+// @return the status.
+tensorflow::Status SegmentGraph(
+    const tensorflow::GraphDef& gdef,
+    const std::function<bool(const tensorflow::NodeDef&)>& candidate_fn,
+    const SegmentOptions& options, SegmentNodesVector* segments);
+
+}  // namespace segment
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_SEGMENT_H_
diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/contrib/tensorrt/segment/segment_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..74cbc5f2b376b76324eed06d251767da6f928e3e
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/segment/segment_test.cc
@@ -0,0 +1,367 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/segment/segment.h"
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace tensorrt {
+namespace segment {
+namespace test {
+
+class SegmentTest : public ::testing::Test {
+ public:
+  bool GetGraphDef(TF_Graph* graph, tensorflow::GraphDef* graph_def);
+
+  TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s, const char* name);
+  TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
+                    TF_Status* s, const char* name);
+
+  std::function<bool(const NodeDef&)> MakeCandidateFn(
+      const std::set<string>& node_names);
+
+ protected:
+  void PlaceholderHelper(TF_Graph* graph, TF_Status* s, const char* name,
+                         TF_Operation** op);
+  void AddHelper(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
+                 TF_Status* s, const char* name, TF_Operation** op, bool check);
+
+  SegmentOptions default_options_;
+};
+
+bool SegmentTest::GetGraphDef(TF_Graph* graph,
+                              tensorflow::GraphDef* graph_def) {
+  TF_Status* s = TF_NewStatus();
+  TF_Buffer* buffer = TF_NewBuffer();
+  TF_GraphToGraphDef(graph, buffer, s);
+  bool ret = TF_GetCode(s) == TF_OK;
+  EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  if (ret) ret = graph_def->ParseFromArray(buffer->data, buffer->length);
+  TF_DeleteBuffer(buffer);
+  TF_DeleteStatus(s);
+  return ret;
+}
+
+std::function<bool(const NodeDef&)> SegmentTest::MakeCandidateFn(
+    const std::set<string>& node_names) {
+  return [node_names](const NodeDef& node) -> bool {
+    return node_names.find(node.name()) != node_names.end();
+  };
+}
+
+void SegmentTest::PlaceholderHelper(TF_Graph* graph, TF_Status* s,
+                                    const char* name, TF_Operation** op) {
+  TF_OperationDescription* desc = TF_NewOperation(graph, "Placeholder", name);
+  TF_SetAttrType(desc, "dtype", TF_INT32);
+  *op = TF_FinishOperation(desc, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  ASSERT_NE(*op, nullptr);
+}
+
+TF_Operation* SegmentTest::Placeholder(TF_Graph* graph, TF_Status* s,
+                                       const char* name) {
+  TF_Operation* op;
+  PlaceholderHelper(graph, s, name, &op);
+  return op;
+}
+
+void SegmentTest::AddHelper(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
+                            TF_Status* s, const char* name, TF_Operation** op,
+                            bool check) {
+  TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name);
+  TF_Output add_inputs[2] = {{l, 0}, {r, 0}};
+  TF_AddInputList(desc, add_inputs, 2);
+  *op = TF_FinishOperation(desc, s);
+  if (check) {
+    ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+    ASSERT_NE(*op, nullptr);
+  }
+}
+
+TF_Operation* SegmentTest::Add(TF_Operation* l, TF_Operation* r,
+                               TF_Graph* graph, TF_Status* s,
+                               const char* name) {
+  TF_Operation* op;
+  AddHelper(l, r, graph, s, name, &op, true);
+  return op;
+}
+
+TEST_F(SegmentTest, Empty) {
+  TF_Graph* graph = TF_NewGraph();
+
+  GraphDef graph_def;
+  ASSERT_TRUE(GetGraphDef(graph, &graph_def));
+
+  SegmentNodesVector segments;
+  ASSERT_EQ(
+      SegmentGraph(graph_def, MakeCandidateFn({}), default_options_, &segments),
+      tensorflow::Status::OK());
+
+  // Expect no segments/subgraphs.
+  EXPECT_TRUE(segments.empty());
+  TF_DeleteGraph(graph);
+}
+
+TEST_F(SegmentTest, Simple) {
+  TF_Status* s = TF_NewStatus();
+  TF_Graph* graph = TF_NewGraph();
+
+  //           feed
+  //         //    ||
+  //       add0    add1
+  //        | |    /
+  //        |  add2
+  //        |  /  ||
+  //       add3    add4
+  //           |  /
+  //          <sink>
+  //
+  TF_Operation* feed = Placeholder(graph, s, "feed");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  EXPECT_EQ(string("feed"), string(TF_OperationName(feed)));
+
+  TF_Operation* add0 = Add(feed, feed, graph, s, "add0");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_Operation* add1 = Add(feed, feed, graph, s, "add1");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_Operation* add2 = Add(add0, add1, graph, s, "add2");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_Operation* add3 = Add(add0, add2, graph, s, "add3");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  EXPECT_EQ(string("add3"), string(TF_OperationName(add3)));
+  TF_Operation* add4 = Add(add2, add2, graph, s, "add4");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  EXPECT_EQ(string("add4"), string(TF_OperationName(add4)));
+
+  GraphDef graph_def;
+  ASSERT_TRUE(GetGraphDef(graph, &graph_def));
+
+  SegmentNodesVector segments;
+  ASSERT_EQ(
+      SegmentGraph(graph_def,
+                   MakeCandidateFn({"add0", "add1", "add2", "add3", "add4"}),
+                   default_options_, &segments),
+      tensorflow::Status::OK());
+
+  // Expect all Add operations to be collapsed into a single segment
+  ASSERT_EQ(segments.size(), 1);
+  std::vector<string> expected{"add0", "add1", "add2", "add3", "add4"};
+  for (const auto& ex : expected) {
+    EXPECT_TRUE(segments[0].find(ex) != segments[0].end())
+        << "Missing expected node " << ex;
+  }
+  TF_DeleteGraph(graph);
+  TF_DeleteStatus(s);
+}
+
+TEST_F(SegmentTest, AvoidCycle) {
+  TF_Status* s = TF_NewStatus();
+  TF_Graph* graph = TF_NewGraph();
+
+  // add2 is not a TRT candidate so add0/add3 cannot be formed as a
+  // subgraph
+  //
+  //           feed
+  //         //    ||
+  //       add0    add1
+  //        | |    /
+  //        |  add2
+  //        |  /  ||
+  //       add3    add4
+  //           |  /
+  //          <sink>
+  //
+  TF_Operation* feed = Placeholder(graph, s, "feed");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  EXPECT_EQ(string("feed"), string(TF_OperationName(feed)));
+
+  TF_Operation* add0 = Add(feed, feed, graph, s, "add0");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_Operation* add1 = Add(feed, feed, graph, s, "add1");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_Operation* add2 = Add(add0, add1, graph, s, "add2");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_Operation* add3 = Add(add0, add2, graph, s, "add3");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  EXPECT_EQ(string("add3"), string(TF_OperationName(add3)));
+  TF_Operation* add4 = Add(add2, add2, graph, s, "add4");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  EXPECT_EQ(string("add4"), string(TF_OperationName(add4)));
+
+  GraphDef graph_def;
+  ASSERT_TRUE(GetGraphDef(graph, &graph_def));
+
+  SegmentNodesVector segments;
+  ASSERT_EQ(
+      SegmentGraph(graph_def, MakeCandidateFn({"add0", "add1", "add3", "add4"}),
+                   default_options_, &segments),
+      tensorflow::Status::OK());
+
+  // Expect no subgraphs
+  EXPECT_EQ(segments.size(), 0);
+  TF_DeleteGraph(graph);
+  TF_DeleteStatus(s);
+}
+
+TEST_F(SegmentTest, Multiple) {
+  TF_Status* s = TF_NewStatus();
+  TF_Graph* graph = TF_NewGraph();
+
+  // add5 is not a TRT candidate so two subgraphs should be formed
+  //
+  //                feed
+  //         //      ||     ||
+  //       add0    add1      add7
+  //        | |    /        /   ||
+  //        |  add2-----add5    add8
+  //        |  /  |    |  |    |
+  //       add3   add4     add6
+  //           |     |     /
+  //               <sink>
+  //
+  TF_Operation* feed = Placeholder(graph, s, "feed");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  EXPECT_EQ(string("feed"), string(TF_OperationName(feed)));
+
+  TF_Operation* add0 = Add(feed, feed, graph, s, "add0");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_Operation* add1 = Add(feed, feed, graph, s, "add1");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_Operation* add7 = Add(feed, feed, graph, s, "add7");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_Operation* add2 = Add(add0, add1, graph, s, "add2");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_Operation* add5 = Add(add2, add7, graph, s, "add5");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_Operation* add8 = Add(add7, add7, graph, s, "add8");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_Operation* add3 = Add(add0, add2, graph, s, "add3");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  EXPECT_EQ(string("add3"), string(TF_OperationName(add3)));
+  TF_Operation* add4 = Add(add2, add5, graph, s, "add4");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  EXPECT_EQ(string("add4"), string(TF_OperationName(add4)));
+  TF_Operation* add6 = Add(add5, add8, graph, s, "add6");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  EXPECT_EQ(string("add6"), string(TF_OperationName(add6)));
+
+  GraphDef graph_def;
+  ASSERT_TRUE(GetGraphDef(graph, &graph_def));
+
+  SegmentNodesVector segments;
+  ASSERT_EQ(SegmentGraph(graph_def,
+                         MakeCandidateFn({"add0", "add1", "add2", "add3",
+                                          "add4", "add6", "add7", "add8"}),
+                         default_options_, &segments),
+            tensorflow::Status::OK());
+
+  // Expect two subgraphs
+  EXPECT_EQ(segments.size(), 2);
+
+  std::vector<string> expected0{"add0", "add1", "add2", "add3"};
+  for (const auto& ex : expected0) {
+    EXPECT_TRUE(segments[0].find(ex) != segments[0].end())
+        << "Missing expected node " << ex;
+  }
+
+  std::vector<string> expected1{"add6", "add8"};
+  for (const auto& ex : expected1) {
+    EXPECT_TRUE(segments[1].find(ex) != segments[1].end())
+        << "Missing expected node " << ex;
+  }
+  TF_DeleteGraph(graph);
+  TF_DeleteStatus(s);
+}
+
+TEST_F(SegmentTest, BigIfElse) {
+  TF_Status* s = TF_NewStatus();
+  TF_Graph* graph = TF_NewGraph();
+
+  // add2 is not a TRT candidate
+  //
+  //           feed
+  //            ||
+  //           add0
+  //         //    ||
+  //       add1    add4
+  //        ||      ||
+  //       add2    add5
+  //        ||      ||
+  //       add3    add6
+  //         ||    //
+  //           add7
+  //            ||
+  //          <sink>
+  //
+  TF_Operation* feed = Placeholder(graph, s, "feed");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  EXPECT_EQ(string("feed"), string(TF_OperationName(feed)));
+
+  TF_Operation* add0 = Add(feed, feed, graph, s, "add0");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_Operation* add1 = Add(add0, add0, graph, s, "add1");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_Operation* add2 = Add(add1, add1, graph, s, "add2");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_Operation* add3 = Add(add2, add2, graph, s, "add3");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_Operation* add4 = Add(add0, add0, graph, s, "add4");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_Operation* add5 = Add(add4, add4, graph, s, "add5");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_Operation* add6 = Add(add5, add5, graph, s, "add6");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_Operation* add7 = Add(add3, add6, graph, s, "add7");
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  EXPECT_EQ(string("add7"), string(TF_OperationName(add7)));
+
+  GraphDef graph_def;
+  ASSERT_TRUE(GetGraphDef(graph, &graph_def));
+
+  SegmentNodesVector segments;
+  ASSERT_EQ(SegmentGraph(graph_def,
+                         MakeCandidateFn({"add0", "add1", "add3", "add4",
+                                          "add5", "add6", "add7"}),
+                         default_options_, &segments),
+            tensorflow::Status::OK());
+
+  // Expect 2 subgraphs
+  EXPECT_EQ(segments.size(), 2);
+
+  std::vector<string> expected0{"add3", "add4", "add5", "add6", "add7"};
+  for (const auto& ex : expected0) {
+    EXPECT_TRUE(segments[0].find(ex) != segments[0].end())
+        << "Missing expected node " << ex;
+  }
+
+  std::vector<string> expected1{"add0", "add1"};
+  for (const auto& ex : expected1) {
+    EXPECT_TRUE(segments[1].find(ex) != segments[1].end())
+        << "Missing expected node " << ex;
+  }
+  TF_DeleteGraph(graph);
+  TF_DeleteStatus(s);
+}
+
+}  // namespace test
+}  // namespace segment
+}  // namespace tensorrt
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/segment/union_find.h b/tensorflow/contrib/tensorrt/segment/union_find.h
new file mode 100644
index 0000000000000000000000000000000000000000..1c64ebbb0ae532a4776ab8963515d19fd3b23b4c
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/segment/union_find.h
@@ -0,0 +1,79 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_UNION_FIND_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_UNION_FIND_H_
+
+namespace tensorflow {
+namespace tensorrt {
+namespace segment {
+
+// Union-Find data structure.
+// Each cluster has an associated value; when merging clusters we can control
+// which value becomes the representative of the merged clusters. Values must be
+// copyable.
+template <typename T>
+class UnionFind {
+ public:
+  UnionFind() : size_(1), parent_(nullptr) {}
+  explicit UnionFind(const T& v) : size_(1), parent_(nullptr), value_(v) {}
+
+  // Returns the number of elements in a cluster.
+  int Size() { return FindRoot()->size_; }
+
+  // Merges this cluster with 'other'. This cluster's value becomes
+  // the value of the merged cluster; the value of 'other' is ignored.
+  void Merge(UnionFind* other);
+
+  // Each cluster has an associated value. Retrieves the value associated
+  // with this cluster.
+  T& ParentValue() { return FindRoot()->value_; }
+
+  // Get the original value of this node.
+  T& Value() { return value_; }
+
+ private:
+  // Finds the root element of the cluster. Performs path compression.
+  UnionFind* FindRoot();
+
+  int size_;
+  UnionFind* parent_;
+  T value_;
+};
+
+template <typename T>
+void UnionFind<T>::Merge(UnionFind* other) {
+  UnionFind<T>* a = FindRoot();
+  UnionFind<T>* b = other->FindRoot();
+  if (a == b) return;
+
+  b->parent_ = a;
+  a->size_ += b->size_;
+}
+
+template <typename T>
+UnionFind<T>* UnionFind<T>::FindRoot() {
+  if (!parent_) return this;
+  // Path compression: update intermediate nodes to point to the root of the
+  // equivalence class.
+  parent_ = parent_->FindRoot();
+  return parent_;
+}
+
+}  // namespace segment
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_UNION_FIND_H_
diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8b475177bc670ddae2b26b6a494f758eba20b2c3
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
@@ -0,0 +1,89 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h"
+
+#include <string>
+#include <vector>
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace shape_inference {
+
+tensorflow::Status TRTEngineOpShapeInference(InferenceContext* context) {
+  tensorflow::tensorrt::Logger logger;
+  string serialized_engine;
+  TF_RETURN_IF_ERROR(context->GetAttr("serialized_engine", &serialized_engine));
+  nvinfer1::IRuntime* infer = nvinfer1::createInferRuntime(logger);
+  nvinfer1::ICudaEngine* trt_engine = infer->deserializeCudaEngine(
+      serialized_engine.c_str(), serialized_engine.size(), nullptr);
+
+  int num_batch = -1;
+  std::vector<::tensorflow::DataType> input_type;
+  TF_RETURN_IF_ERROR(context->GetAttr("InT", &input_type));
+  for (size_t i = 0; i < context->num_inputs(); i++) {
+    // Check if input shape is legit
+    auto input_shape = context->input(i);
+    for (int j = 0; j < context->Rank(input_shape); j++) {
+      auto dim_handler = context->Dim(input_shape, j);
+      if (j == 0) {
+        if (i == 0) {
+          num_batch = context->Value(dim_handler);
+        } else if (num_batch != context->Value(dim_handler)) {
+          // TODO(jie): TensorRT engine requires consistent batch between inputs
+          //            tensors. Segmenter should be aware of this.
+          LOG(FATAL) << "TensorRT engine requires consistent batch size";
+        }
+      }
+    }
+  }
+
+  // Arrange input here
+  std::vector<string> input_nodes;
+  TF_RETURN_IF_ERROR(context->GetAttr("input_nodes", &input_nodes));
+
+  // Arrange output here
+  std::vector<string> output_nodes;
+  TF_RETURN_IF_ERROR(context->GetAttr("output_nodes", &output_nodes));
+  for (size_t i = 0; i < output_nodes.size(); i++) {
+    int binding_index = trt_engine->getBindingIndex(output_nodes[i].c_str());
+    ShapeHandle output_shape;
+    std::vector<DimensionHandle> dim_vec;
+    dim_vec.emplace_back(context->MakeDim(num_batch));
+    if (binding_index != -1) {
+      auto dims = trt_engine->getBindingDimensions(binding_index);
+      for (int j = 0; j < dims.nbDims; j++) {
+        dim_vec.emplace_back(context->MakeDim(dims.d[j]));
+      }
+    } else {
+      LOG(FATAL) << "TensorRT engine cannot find binding: " << output_nodes[i];
+    }
+    output_shape = context->MakeShape(dim_vec);
+    context->set_output(i, output_shape);
+  }
+
+  return Status::OK();
+}
+
+}  // namespace shape_inference
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b50f66699f0965639e22169ee7d71e860314bf0
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h
@@ -0,0 +1,33 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TENSORRT_SHAPE_FN_TRT_SHFN_H_
+#define TENSORFLOW_CONTRIB_TENSORRT_SHAPE_FN_TRT_SHFN_H_
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace shape_inference {
+Status TRTEngineOpShapeInference(InferenceContext* c);
+}  // namespace shape_inference
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CONTRIB_TENSORRT_SHAPE_FN_TRT_SHFN_H_
diff --git a/tensorflow/contrib/tensorrt/tensorrt_test.cc b/tensorflow/contrib/tensorrt/tensorrt_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e11522ea5bda7f5a303d6ea332148dbd7b17f162
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/tensorrt_test.cc
@@ -0,0 +1,159 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "cuda/include/cuda.h"
+#include "cuda/include/cuda_runtime_api.h"
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace {
+
+class Logger : public nvinfer1::ILogger {
+ public:
+  void log(nvinfer1::ILogger::Severity severity, const char* msg) override {
+    switch (severity) {
+      case Severity::kINFO:
+        LOG(INFO) << msg;
+        break;
+      case Severity::kWARNING:
+        LOG(WARNING) << msg;
+        break;
+      case Severity::kINTERNAL_ERROR:
+      case Severity::kERROR:
+        LOG(ERROR) << msg;
+        break;
+      default:
+        break;
+    }
+  }
+};
+
+class ScopedWeights {
+ public:
+  ScopedWeights(float value) : value_(value) {
+    w.type = nvinfer1::DataType::kFLOAT;
+    w.values = &value_;
+    w.count = 1;
+  }
+  const nvinfer1::Weights& get() { return w; }
+
+ private:
+  float value_;
+  nvinfer1::Weights w;
+};
+
+const char* kInputTensor = "input";
+const char* kOutputTensor = "output";
+
+// Creates a network to compute y=2x+3.
+nvinfer1::IHostMemory* CreateNetwork() {
+  Logger logger;
+  nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);
+  ScopedWeights weights(2.0);
+  ScopedWeights bias(3.0);
+
+  nvinfer1::INetworkDefinition* network = builder->createNetwork();
+  // Add the input.
+  auto input = network->addInput(kInputTensor, nvinfer1::DataType::kFLOAT,
+                                 nvinfer1::DimsCHW{1, 1, 1});
+  EXPECT_NE(input, nullptr);
+  // Add the hidden layer.
+  auto layer = network->addFullyConnected(*input, 1, weights.get(), bias.get());
+  EXPECT_NE(layer, nullptr);
+  // Mark the output.
+  auto output = layer->getOutput(0);
+  output->setName(kOutputTensor);
+  network->markOutput(*output);
+  // Build the engine
+  builder->setMaxBatchSize(1);
+  builder->setMaxWorkspaceSize(1 << 10);
+  auto engine = builder->buildCudaEngine(*network);
+  EXPECT_NE(engine, nullptr);
+  // Serialize the engine to create a model, then close everything.
+  nvinfer1::IHostMemory* model = engine->serialize();
+  network->destroy();
+  engine->destroy();
+  builder->destroy();
+  return model;
+}
+
+// Executes the network.
+void Execute(nvinfer1::IExecutionContext& context, const float* input,
+             float* output) {
+  const nvinfer1::ICudaEngine& engine = context.getEngine();
+
+  // We have two bindings: input and output.
+  ASSERT_EQ(engine.getNbBindings(), 2);
+  const int input_index = engine.getBindingIndex(kInputTensor);
+  const int output_index = engine.getBindingIndex(kOutputTensor);
+
+  // Create GPU buffers and a stream
+  void* buffers[2];
+  ASSERT_EQ(0, cudaMalloc(&buffers[input_index], sizeof(float)));
+  ASSERT_EQ(0, cudaMalloc(&buffers[output_index], sizeof(float)));
+  cudaStream_t stream;
+  ASSERT_EQ(0, cudaStreamCreate(&stream));
+
+  // Copy the input to the GPU, execute the network, and copy the output back.
+  //
+  // Note that since the host buffer was not created as pinned memory, these
+  // async copies are turned into sync copies. So the following synchronization
+  // could be removed.
+  ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input, sizeof(float),
+                               cudaMemcpyHostToDevice, stream));
+  context.enqueue(1, buffers, stream, nullptr);
+  ASSERT_EQ(0, cudaMemcpyAsync(output, buffers[output_index], sizeof(float),
+                               cudaMemcpyDeviceToHost, stream));
+  cudaStreamSynchronize(stream);
+
+  // Release the stream and the buffers
+  cudaStreamDestroy(stream);
+  ASSERT_EQ(0, cudaFree(buffers[input_index]));
+  ASSERT_EQ(0, cudaFree(buffers[output_index]));
+}
+
+TEST(TensorrtTest, BasicFunctions) {
+  // Create the network model.
+  nvinfer1::IHostMemory* model = CreateNetwork();
+  // Use the model to create an engine and then an execution context.
+  Logger logger;
+  nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
+  nvinfer1::ICudaEngine* engine =
+      runtime->deserializeCudaEngine(model->data(), model->size(), nullptr);
+  model->destroy();
+  nvinfer1::IExecutionContext* context = engine->createExecutionContext();
+
+  // Execute the network.
+  float input = 1234;
+  float output;
+  Execute(*context, &input, &output);
+  EXPECT_EQ(output, input * 2 + 3);
+
+  // Destroy the engine.
+  context->destroy();
+  engine->destroy();
+  runtime->destroy();
+}
+
+}  // namespace
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py
new file mode 100644
index 0000000000000000000000000000000000000000..18dba94acb3724cb2b5a1c53227bcf08bf9f8fcc
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/test_tftrt.py
@@ -0,0 +1,88 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+# normally we should do import tensorflow as tf and then
+# tf.placeholder, tf.constant, tf.nn.conv2d etc but
+# it looks like internal builds don't like it so
+# importing every module individually
+
+from tensorflow.contrib import tensorrt as trt
+from tensorflow.core.protobuf import config_pb2 as cpb2
+from tensorflow.python.client import session as csess
+from tensorflow.python.framework import constant_op as cop
+from tensorflow.python.framework import dtypes as dtypes
+from tensorflow.python.framework import importer as importer
+from tensorflow.python.framework import ops as ops
+from tensorflow.python.ops import array_ops as aops
+from tensorflow.python.ops import nn as nn
+from tensorflow.python.ops import nn_ops as nn_ops
+
+
+def get_simple_graph_def():
+  """Create a simple graph and return its graph_def"""
+  g = ops.Graph()
+  with g.as_default():
+    a = aops.placeholder(
+        dtype=dtypes.float32, shape=(None, 24, 24, 2), name="input")
+    e = cop.constant(
+        [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
+        name="weights",
+        dtype=dtypes.float32)
+    conv = nn.conv2d(
+        input=a, filter=e, strides=[1, 2, 2, 1], padding="SAME", name="conv")
+    b = cop.constant(
+        [4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtypes.float32)
+    t = nn.bias_add(conv, b, name="biasAdd")
+    relu = nn.relu(t, "relu")
+    idty = aops.identity(relu, "ID")
+    v = nn_ops.max_pool(
+        idty, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
+    aops.squeeze(v, name="output")
+  return g.as_graph_def()
+
+
+def run_graph(gdef, dumm_inp):
+  gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
+  ops.reset_default_graph()
+  g = ops.Graph()
+  with g.as_default():
+    inp, out = importer.import_graph_def(
+        graph_def=gdef, return_elements=["input", "output"])
+    inp = inp.outputs[0]
+    out = out.outputs[0]
+  with csess.Session(
+      config=cpb2.ConfigProto(gpu_options=gpu_options), graph=g) as sess:
+    val = sess.run(out, {inp: dumm_inp})
+  return val
+
+
+if "__main__" in __name__:
+  inp_dims = (100, 24, 24, 2)
+  dummy_input = np.random.random_sample(inp_dims)
+  gdef = get_simple_graph_def()
+  # Get optimized graph
+  trt_graph = trt.create_inference_graph(gdef, ["output"], inp_dims[0])
+  o1 = run_graph(gdef, dummy_input)
+  o2 = run_graph(trt_graph, dummy_input)
+  o3 = run_graph(trt_graph, dummy_input)
+  assert np.array_equal(o1, o2)
+  assert np.array_equal(o3, o2)  # sanity check
+  print("Pass")
diff --git a/tensorflow/contrib/tensorrt/trt_conversion.i b/tensorflow/contrib/tensorrt/trt_conversion.i
new file mode 100644
index 0000000000000000000000000000000000000000..d679945d569c1784448b6cb09c2f431b9cda56d7
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/trt_conversion.i
@@ -0,0 +1,131 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/* Wrap trt_conversion */
+%{
+#define SWIG_FILE_WITH_INIT
+%}
+%include "std_pair.i"
+%include "tensorflow/python/platform/base.i"
+
+%{
+PyObject* pair_helper(std::pair<string, string>* in) {
+  PyObject *first(nullptr), *second(nullptr), *tuple(nullptr);
+  first = PyBytes_FromStringAndSize(in->first.data(), in->first.length());
+  if (!first) {
+    if (!PyErr_Occurred()) {
+      PyErr_SetString(PyExc_TypeError, "Pair conversion first argument failed");
+    }
+    return NULL;
+  }
+  second = PyBytes_FromStringAndSize(in->second.data(), in->second.length());
+  if (!second) {
+    if (!PyErr_Occurred()) {
+      PyErr_SetString(PyExc_TypeError,
+                      "Pair conversion second argument failed");
+    }
+    return NULL;
+  }
+  tuple = Py_BuildValue("(OO)", first, second);
+  if (!tuple) {
+    if (!PyErr_Occurred()) {
+      PyErr_SetString(PyExc_TypeError,
+                      "Tuple creation from pair<string,string> failed!");
+    }
+    return NULL;
+  }
+  return tuple;
+}
+%}
+%typemap(out) std::pair<string, string> {
+  PyObject *tuple = pair_helper(&$1);
+  if (!tuple) SWIG_fail;
+  $result = tuple;
+}
+%{
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/util/stat_summarizer.h"
+#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
+%}
+
+%ignoreall
+%unignore tensorflow;
+%unignore trt_convert;
+
+%{
+std::pair<string, string> trt_convert(
+    string graph_def_string,  // The serialized GraphDef string.
+    std::vector<string> output_names,
+    size_t max_batch_size,
+    size_t max_workspace_size_bytes
+    // Unfortunately we can't use TF_Status here since it
+    // is in c/c_api and brings in a lot of other libraries
+    // which in turn declare ops. These ops are included
+    // statically in our library and cause an abort when
+    // module is loaded due to double registration
+    // until Tensorflow properly exposes these headers
+    // we have to work around this by returning a string
+    // and converting it to exception on python side.
+    //,TF_Status* out_status) {
+) {
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
+  string out_status;
+
+  tensorflow::GraphDef graph_def;
+  if (!graph_def.ParseFromString(graph_def_string)) {
+    out_status = "InvalidArgument;Couldn't interpret input as a GraphDef";
+    return std::pair<string, string>{out_status, ""};
+  }
+
+  if (!output_names.size()) {
+    out_status = "InvalidArgument;Size of the output_names vector is 0";
+    return std::pair<string, string>{out_status, ""};
+    // return "";
+  }
+  tensorflow::GraphDef outGraph;
+  tensorflow::Status conversion_status =
+      tensorflow::tensorrt::convert::ConvertGraphDefToTensorRT(
+          graph_def, output_names, max_batch_size, max_workspace_size_bytes,
+          &outGraph);
+  if (!conversion_status.ok()) {
+    auto retCode = (int)conversion_status.code();
+    char buff[2000];
+    snprintf(buff, 2000, "%d;%s", retCode,
+             conversion_status.error_message().c_str());
+    out_status = buff;
+    return std::pair<string, string>{out_status, ""};
+  }
+  string result;
+  if (!outGraph.SerializeToString(&result)) {
+    out_status = "InvalidArgument;Couldn't serialize output as a GraphDef";
+    return std::pair<string, string>{out_status, ""};
+  }
+  out_status = "OK;All good!";
+  return std::pair<string, string>{out_status, result};
+#else
+  // Returns FAILED_PRECONDITION.
+  return std::pair<string, string>{"9;TensorRT is not enabled!", ""};
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+}
+%}
+
+std::pair<string, string> trt_convert(string graph_def_string,
+                                      std::vector<string> output_names,
+                                      size_t max_batch_size,
+                                      size_t max_workspace_size_bytes);
+
+
+%unignoreall
diff --git a/tensorflow/contrib/timeseries/examples/BUILD b/tensorflow/contrib/timeseries/examples/BUILD
index 755b0657e9fb29c167911407cee340ac7e3e9b7a..bb86ecb2209f9bed3ad6c37f4b23bc7b361e1bd6 100644
--- a/tensorflow/contrib/timeseries/examples/BUILD
+++ b/tensorflow/contrib/timeseries/examples/BUILD
@@ -103,6 +103,7 @@ py_test(
     deps = [
         ":lstm",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python/estimator:estimator_py",
     ],
 )
 
diff --git a/tensorflow/contrib/timeseries/examples/__init__.py b/tensorflow/contrib/timeseries/examples/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..52e83069cb0c68b510da46149248369dce376647 100644
--- a/tensorflow/contrib/timeseries/examples/__init__.py
+++ b/tensorflow/contrib/timeseries/examples/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/timeseries/examples/data/multivariate_periods.csv b/tensorflow/contrib/timeseries/examples/data/multivariate_periods.csv
index 02a60d1cf61765c7c916803fe918d8b7b186405e..b49a0662c29b1d810f4be31ca1f318f0571f533e 100644
--- a/tensorflow/contrib/timeseries/examples/data/multivariate_periods.csv
+++ b/tensorflow/contrib/timeseries/examples/data/multivariate_periods.csv
@@ -1,100 +1,100 @@
-0,0.926906299771,1.99107237682,2.56546245685,3.07914768197,4.04839057867
-1,0.108010001864,1.41645361423,2.1686839775,2.94963962176,4.1263503303
-2,-0.800567600028,1.0172132907,1.96434754116,2.99885333086,4.04300485864
-3,0.0607042871898,0.719540073421,1.9765012584,2.89265588817,4.0951014426
-4,0.933712200629,0.28052120776,1.41018552514,2.69232603996,4.06481164223
-5,-0.171730652974,0.260054421028,1.48770816369,2.62199129293,4.44572807842
-6,-1.00180162933,0.333045158863,1.50006392277,2.88888309683,4.24755865606
-7,0.0580061875336,0.688929398826,1.56543458772,2.99840358953,4.52726873347
-8,0.764139447412,1.24704875327,1.77649279698,3.13578593851,4.63238922951
-9,-0.230331874785,1.47903998963,2.03547545751,3.20624030377,4.77980005228
-10,-1.03846045211,2.01133000781,2.31977503972,3.67951536251,5.09716775897
-11,0.188643592253,2.23285349038,2.68338482249,3.49817168611,5.24928239634
-12,0.91207302309,2.24244446841,2.71362604985,3.96332587625,5.37802271594
-13,-0.296588665881,2.02594634141,3.07733910479,3.99698324956,5.56365901394
-14,-0.959961476551,1.45078629833,3.18996420137,4.3763059609,5.65356015609
-15,0.46313530679,1.01141441548,3.4980215948,4.20224896882,5.88842247449
-16,0.929354125798,0.626635305936,3.70508262244,4.51791573544,5.73945973251
-17,-0.519110731957,0.269249223148,3.39866823332,4.46802003061,5.82768174382
-18,-0.924330981367,0.349602834684,3.21762413294,4.72803587499,5.94918925767
-19,0.253239387885,0.345158023497,3.11071425333,4.79311566935,5.9489259713
-20,0.637408390225,0.698996675371,3.25232492145,4.73814732384,5.9612010251
-21,-0.407396859412,1.17456342803,2.49526823723,4.59323415742,5.82501686811
-22,-0.967485452118,1.66655933642,2.47284606244,4.58316034754,5.88721406681
-23,0.474480867904,1.95018556323,2.0228950072,4.48651142819,5.8255943735
-24,1.04309652155,2.23519892356,1.91924131572,4.19094661783,5.87457348436
-25,-0.517861513772,2.12501967336,1.70266619979,4.05280882887,5.72160912899
-26,-0.945301585146,1.65464653549,1.81567174251,3.92309850635,5.58270493814
-27,0.501153868974,1.40600764889,1.53991387719,3.72853247942,5.60169001727
-28,0.972859524418,1.00344321868,1.5175642828,3.64092376655,5.10567722582
-29,-0.70553406135,0.465306263885,1.7038540803,3.33236870312,5.09182481555
-30,-0.946093634916,0.294539309453,1.88052827037,2.93011492669,4.97354922696
-31,0.47922123231,0.308465865031,2.03445883031,2.90772899045,4.86241793548
-32,0.754030014252,0.549752241167,2.46115815089,2.95063349534,4.71834614627
-33,-0.64875949826,0.894615488148,2.5922463381,2.81269864022,4.43480095104
-34,-0.757829951086,1.39123914261,2.69258079904,2.61834837315,4.36580046156
-35,0.565653301088,1.72360022693,2.97794913834,2.80403840334,4.27327248459
-36,0.867440092372,2.21100730052,3.38648090792,2.84057515729,4.12210169576
-37,-0.894567758095,2.17549105818,3.45532493329,2.90446025717,4.00251740584
-38,-0.715442356893,2.15105389965,3.52041791902,3.03650393392,4.12809249577
-39,0.80671703672,1.81504564517,3.60463324866,3.00747789871,3.98440762467
-40,0.527014790142,1.31803513865,3.43842186337,3.3332594663,4.03232406566
-41,-0.795936862129,0.847809114454,3.09875133548,3.52863155938,3.94883924909
-42,-0.610245806946,0.425530441018,2.92581949152,3.77238736123,4.27287245021
-43,0.611662279431,0.178432049837,2.48128214822,3.73212087883,4.17319013831
-44,0.650866553108,0.220341648392,2.41694642022,4.2609098519,4.27271645905
-45,-0.774156982023,0.632667602331,2.05474356052,4.32889204886,4.18029723271
-46,-0.714058448409,0.924562377599,1.75706135146,4.52492718422,4.3972678094
-47,0.889627293379,1.46207968841,1.78299357672,4.64466731095,4.56317887554
-48,0.520140662861,1.8996333843,1.41377633823,4.48899091177,4.78805049769
-49,-1.03816935616,2.08997002059,1.51218375351,4.84167764204,4.93026048606
-50,-0.40772951362,2.30878972136,1.44144415128,4.76854460997,5.01538444629
-51,0.792730684781,1.91367048509,1.58887384677,4.71739397335,5.25690012199
-52,0.371311881576,1.67565079528,1.81688563053,4.60353107555,5.44265822961
-53,-0.814398070371,1.13374634126,1.80328814859,4.72264252878,5.52674761122
-54,-0.469017949323,0.601244136627,2.29690896736,4.49859178859,5.54126153454
-55,0.871044371426,0.407597593794,2.7499112487,4.19060637761,5.57693767301
-56,0.523764933017,0.247705192709,3.09002071379,4.02095509006,5.80510362182
-57,-0.881326403531,0.31513103164,3.11358205718,3.96079100808,5.81000652365
-58,-0.357928025339,0.486163915865,3.17884556771,3.72634990659,5.85693642011
-59,0.853038779822,1.04218094475,3.45835384454,3.36703969978,5.9585988449
-60,0.435311516013,1.59715085283,3.63313338588,3.11276729421,5.93643818229
-61,-1.02703719138,1.92205832542,3.47606111735,3.06247155999,6.02106646259
-62,-0.246661325557,2.14653802542,3.29446326567,2.89936259181,5.67531541272
-63,1.02554736569,2.25943737733,3.07031591528,2.78176218013,5.78206328989
-64,0.337814475969,2.07589147224,2.80356226089,2.55888206331,5.7094075496
-65,-1.12023369929,1.25333011618,2.56497288445,2.77361359194,5.50799418376
-66,-0.178980246554,1.11937139901,2.51598681313,2.91438309151,5.47469577206
-67,0.97550951531,0.60553823137,2.11657741073,2.88081098981,5.37034999502
-68,0.136653357206,0.365828836075,1.97386033165,3.13217903204,5.07254490219
-69,-1.05607596951,0.153152115069,1.52110743825,3.01308794192,5.08902539125
-70,-0.13095280331,0.337113974483,1.52703079853,3.16687131599,4.86649398514
-71,1.07081057754,0.714247566736,1.53761382634,3.45151989484,4.75892309166
-72,0.0153410376082,1.24631231847,1.61690939161,3.85481994498,4.35683752832
-73,-0.912801257303,1.60791309476,1.8729264524,4.03037260012,4.36072588913
-74,-0.0894895640338,2.02535207407,1.93484909619,4.09557485132,4.35327025188
-75,0.978646999652,2.20085086625,2.09003440427,4.27542353033,4.1805058388
-76,-0.113312642876,2.2444100761,2.50789248839,4.4151861502,4.03267168136
-77,-1.00215099149,1.84305628445,2.61691237246,4.45425147595,3.81203553766
-78,-0.0183234614205,1.49573923116,2.99308471214,4.71134960112,4.0273804959
-79,1.0823738177,1.12211589848,3.27079386925,4.94288270502,4.01851068083
-80,0.124370187893,0.616474412808,3.4284236674,4.76942168327,3.9749536483
-81,-0.929423379352,0.290977090976,3.34131726136,4.78590392707,4.10190661656
-82,0.23766302648,0.155302052254,3.49779513794,4.64605656795,4.15571321107
-83,1.03531486192,0.359702776204,3.4880725919,4.48167586667,4.21134561991
-84,-0.261234571382,0.713877760378,3.42756426614,4.426443869,4.25208300527
-85,-1.03572442277,1.25001113691,2.96908341113,4.25500915322,4.25723010649
-86,0.380034261243,1.70543355622,2.73605932518,4.16703432307,4.63700400788
-87,1.03734873488,1.97544410562,2.55586572141,3.84976673263,4.55282864289
-88,-0.177344253372,2.22614526325,2.09565864891,3.77378097953,4.82577400298
-89,-0.976821526892,2.18385079177,1.78522284118,3.67768223554,5.06302440873
-90,0.264820472091,1.86981946157,1.50048403865,3.43619796921,5.05651761669
-91,1.05642344868,1.47568646076,1.51347671977,3.20898518885,5.50149047462
-92,-0.311607433358,1.04226467636,1.52089650905,3.02291865417,5.4889046232
-93,-0.724285777937,0.553052311957,1.48573560173,2.7365973598,5.72549174225
-94,0.519859192905,0.226520626591,1.61543723167,2.84102086852,5.69330622288
-95,1.0323195039,0.260873217055,1.81913034804,2.83951143848,5.90325028086
-96,-0.53285682538,0.387695521405,1.70935609313,2.57977050631,5.79579213161
-97,-0.975127997215,0.920948771589,2.51292643636,2.71004616612,5.87016469227
-98,0.540246804099,1.36445470181,2.61949412896,2.98482553485,6.02447664937
-99,0.987764008058,1.85581989607,2.84685706149,2.94760204892,6.0212151724
+0,0.926906299771,1.99107237682,2.56546245685,3.07914768197,4.04839057867,1.,0.
+1,0.108010001864,1.41645361423,2.1686839775,2.94963962176,4.1263503303,1.,0.
+2,-0.800567600028,1.0172132907,1.96434754116,2.99885333086,4.04300485864,1.,0.
+3,0.0607042871898,0.719540073421,1.9765012584,2.89265588817,4.0951014426,1.,0.
+4,0.933712200629,0.28052120776,1.41018552514,2.69232603996,4.06481164223,1.,0.
+5,-0.171730652974,0.260054421028,1.48770816369,2.62199129293,4.44572807842,1.,0.
+6,-1.00180162933,0.333045158863,1.50006392277,2.88888309683,4.24755865606,1.,0.
+7,0.0580061875336,0.688929398826,1.56543458772,2.99840358953,4.52726873347,1.,0.
+8,0.764139447412,1.24704875327,1.77649279698,3.13578593851,4.63238922951,1.,0.
+9,-0.230331874785,1.47903998963,2.03547545751,3.20624030377,4.77980005228,1.,0.
+10,-1.03846045211,2.01133000781,2.31977503972,3.67951536251,5.09716775897,1.,0.
+11,0.188643592253,2.23285349038,2.68338482249,3.49817168611,5.24928239634,1.,0.
+12,0.91207302309,2.24244446841,2.71362604985,3.96332587625,5.37802271594,1.,0.
+13,-0.296588665881,2.02594634141,3.07733910479,3.99698324956,5.56365901394,1.,0.
+14,-0.959961476551,1.45078629833,3.18996420137,4.3763059609,5.65356015609,1.,0.
+15,0.46313530679,1.01141441548,3.4980215948,4.20224896882,5.88842247449,1.,0.
+16,0.929354125798,0.626635305936,3.70508262244,4.51791573544,5.73945973251,1.,0.
+17,-0.519110731957,0.269249223148,3.39866823332,4.46802003061,5.82768174382,1.,0.
+18,-0.924330981367,0.349602834684,3.21762413294,4.72803587499,5.94918925767,1.,0.
+19,0.253239387885,0.345158023497,3.11071425333,4.79311566935,5.9489259713,1.,0.
+20,0.637408390225,0.698996675371,3.25232492145,4.73814732384,5.9612010251,1.,0.
+21,-0.407396859412,1.17456342803,2.49526823723,4.59323415742,5.82501686811,1.,0.
+22,-0.967485452118,1.66655933642,2.47284606244,4.58316034754,5.88721406681,1.,0.
+23,0.474480867904,1.95018556323,2.0228950072,4.48651142819,5.8255943735,1.,0.
+24,1.04309652155,2.23519892356,1.91924131572,4.19094661783,5.87457348436,1.,0.
+25,-0.517861513772,2.12501967336,1.70266619979,4.05280882887,5.72160912899,1.,0.
+26,-0.945301585146,1.65464653549,1.81567174251,3.92309850635,5.58270493814,1.,0.
+27,0.501153868974,1.40600764889,1.53991387719,3.72853247942,5.60169001727,1.,0.
+28,0.972859524418,1.00344321868,1.5175642828,3.64092376655,5.10567722582,1.,0.
+29,-0.70553406135,0.465306263885,1.7038540803,3.33236870312,5.09182481555,1.,0.
+30,-0.946093634916,0.294539309453,1.88052827037,2.93011492669,4.97354922696,1.,0.
+31,0.47922123231,0.308465865031,2.03445883031,2.90772899045,4.86241793548,1.,0.
+32,0.754030014252,0.549752241167,2.46115815089,2.95063349534,4.71834614627,1.,0.
+33,-0.64875949826,0.894615488148,2.5922463381,2.81269864022,4.43480095104,1.,0.
+34,-0.757829951086,1.39123914261,2.69258079904,2.61834837315,4.36580046156,1.,0.
+35,0.565653301088,1.72360022693,2.97794913834,2.80403840334,4.27327248459,1.,0.
+36,0.867440092372,2.21100730052,3.38648090792,2.84057515729,4.12210169576,1.,0.
+37,-0.894567758095,2.17549105818,3.45532493329,2.90446025717,4.00251740584,1.,0.
+38,-0.715442356893,2.15105389965,3.52041791902,3.03650393392,4.12809249577,1.,0.
+39,0.80671703672,1.81504564517,3.60463324866,3.00747789871,3.98440762467,1.,0.
+40,0.527014790142,1.31803513865,3.43842186337,3.3332594663,4.03232406566,1.,0.
+41,-0.795936862129,0.847809114454,3.09875133548,3.52863155938,3.94883924909,1.,0.
+42,-0.610245806946,0.425530441018,2.92581949152,3.77238736123,4.27287245021,1.,0.
+43,0.611662279431,0.178432049837,2.48128214822,3.73212087883,4.17319013831,1.,0.
+44,0.650866553108,0.220341648392,2.41694642022,4.2609098519,4.27271645905,1.,0.
+45,-0.774156982023,0.632667602331,2.05474356052,4.32889204886,4.18029723271,1.,0.
+46,-0.714058448409,0.924562377599,1.75706135146,4.52492718422,4.3972678094,1.,0.
+47,0.889627293379,1.46207968841,1.78299357672,4.64466731095,4.56317887554,1.,0.
+48,0.520140662861,1.8996333843,1.41377633823,4.48899091177,4.78805049769,1.,0.
+49,-1.03816935616,2.08997002059,1.51218375351,4.84167764204,4.93026048606,1.,0.
+50,-0.40772951362,2.30878972136,1.44144415128,4.76854460997,5.01538444629,1.,0.
+51,0.792730684781,1.91367048509,1.58887384677,4.71739397335,5.25690012199,1.,0.
+52,0.371311881576,1.67565079528,1.81688563053,4.60353107555,5.44265822961,1.,0.
+53,-0.814398070371,1.13374634126,1.80328814859,4.72264252878,5.52674761122,1.,0.
+54,-0.469017949323,0.601244136627,2.29690896736,4.49859178859,5.54126153454,1.,0.
+55,0.871044371426,0.407597593794,2.7499112487,4.19060637761,5.57693767301,1.,0.
+56,0.523764933017,0.247705192709,3.09002071379,4.02095509006,5.80510362182,1.,0.
+57,-0.881326403531,0.31513103164,3.11358205718,3.96079100808,5.81000652365,1.,0.
+58,-0.357928025339,0.486163915865,3.17884556771,3.72634990659,5.85693642011,1.,0.
+59,0.853038779822,1.04218094475,3.45835384454,3.36703969978,5.9585988449,1.,0.
+60,0.435311516013,1.59715085283,3.63313338588,3.11276729421,5.93643818229,1.,0.
+61,-1.02703719138,1.92205832542,3.47606111735,3.06247155999,6.02106646259,1.,0.
+62,-0.246661325557,2.14653802542,3.29446326567,2.89936259181,5.67531541272,1.,0.
+63,1.02554736569,2.25943737733,3.07031591528,2.78176218013,5.78206328989,1.,0.
+64,0.337814475969,2.07589147224,2.80356226089,2.55888206331,5.7094075496,1.,0.
+65,-1.12023369929,1.25333011618,2.56497288445,2.77361359194,5.50799418376,1.,0.
+66,-0.178980246554,1.11937139901,2.51598681313,2.91438309151,5.47469577206,1.,0.
+67,0.97550951531,0.60553823137,2.11657741073,2.88081098981,5.37034999502,1.,0.
+68,0.136653357206,0.365828836075,1.97386033165,3.13217903204,5.07254490219,1.,0.
+69,-1.05607596951,0.153152115069,1.52110743825,3.01308794192,5.08902539125,1.,0.
+70,-0.13095280331,0.337113974483,1.52703079853,3.16687131599,4.86649398514,1.,0.
+71,1.07081057754,0.714247566736,1.53761382634,3.45151989484,4.75892309166,1.,0.
+72,0.0153410376082,1.24631231847,1.61690939161,3.85481994498,4.35683752832,1.,0.
+73,-0.912801257303,1.60791309476,1.8729264524,4.03037260012,4.36072588913,1.,0.
+74,-0.0894895640338,2.02535207407,1.93484909619,4.09557485132,4.35327025188,1.,0.
+75,0.978646999652,2.20085086625,2.09003440427,4.27542353033,4.1805058388,1.,0.
+76,-0.113312642876,2.2444100761,2.50789248839,4.4151861502,4.03267168136,1.,0.
+77,-1.00215099149,1.84305628445,2.61691237246,4.45425147595,3.81203553766,1.,0.
+78,-0.0183234614205,1.49573923116,2.99308471214,4.71134960112,4.0273804959,1.,0.
+79,1.0823738177,1.12211589848,3.27079386925,4.94288270502,4.01851068083,1.,0.
+80,0.124370187893,0.616474412808,3.4284236674,4.76942168327,3.9749536483,1.,0.
+81,-0.929423379352,0.290977090976,3.34131726136,4.78590392707,4.10190661656,1.,0.
+82,0.23766302648,0.155302052254,3.49779513794,4.64605656795,4.15571321107,1.,0.
+83,1.03531486192,0.359702776204,3.4880725919,4.48167586667,4.21134561991,1.,0.
+84,-0.261234571382,0.713877760378,3.42756426614,4.426443869,4.25208300527,1.,0.
+85,-1.03572442277,1.25001113691,2.96908341113,4.25500915322,4.25723010649,1.,0.
+86,0.380034261243,1.70543355622,2.73605932518,4.16703432307,4.63700400788,1.,0.
+87,1.03734873488,1.97544410562,2.55586572141,3.84976673263,4.55282864289,1.,0.
+88,-0.177344253372,2.22614526325,2.09565864891,3.77378097953,4.82577400298,1.,0.
+89,-0.976821526892,2.18385079177,1.78522284118,3.67768223554,5.06302440873,1.,0.
+90,0.264820472091,1.86981946157,1.50048403865,3.43619796921,5.05651761669,1.,0.
+91,1.05642344868,1.47568646076,1.51347671977,3.20898518885,5.50149047462,1.,0.
+92,-0.311607433358,1.04226467636,1.52089650905,3.02291865417,5.4889046232,1.,0.
+93,-0.724285777937,0.553052311957,1.48573560173,2.7365973598,5.72549174225,1.,0.
+94,0.519859192905,0.226520626591,1.61543723167,2.84102086852,5.69330622288,1.,0.
+95,1.0323195039,0.260873217055,1.81913034804,2.83951143848,5.90325028086,1.,0.
+96,-0.53285682538,0.387695521405,1.70935609313,2.57977050631,5.79579213161,1.,0.
+97,-0.975127997215,0.920948771589,2.51292643636,2.71004616612,5.87016469227,1.,0.
+98,0.540246804099,1.36445470181,2.61949412896,2.98482553485,6.02447664937,1.,0.
+99,0.987764008058,1.85581989607,2.84685706149,2.94760204892,6.0212151724,1.,0.
diff --git a/tensorflow/contrib/timeseries/examples/lstm.py b/tensorflow/contrib/timeseries/examples/lstm.py
index 3ba823f638da8f750981bc910d960706ff652fb7..f37cafcc502dc9415db0829b9b067b862f87dca7 100644
--- a/tensorflow/contrib/timeseries/examples/lstm.py
+++ b/tensorflow/contrib/timeseries/examples/lstm.py
@@ -18,13 +18,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 from os import path
+import tempfile
 
 import numpy
 import tensorflow as tf
 
 from tensorflow.contrib.timeseries.python.timeseries import estimators as ts_estimators
 from tensorflow.contrib.timeseries.python.timeseries import model as ts_model
+from tensorflow.contrib.timeseries.python.timeseries import state_management
 
 try:
   import matplotlib  # pylint: disable=g-import-not-at-top
@@ -45,7 +48,8 @@ _DATA_FILE = path.join(_MODULE_PATH, "data/multivariate_periods.csv")
 class _LSTMModel(ts_model.SequentialTimeSeriesModel):
   """A time series model-building example using an RNNCell."""
 
-  def __init__(self, num_units, num_features, dtype=tf.float32):
+  def __init__(self, num_units, num_features, exogenous_feature_columns=None,
+               dtype=tf.float32):
     """Initialize/configure the model object.
 
     Note that we do not start graph building here. Rather, this object is a
@@ -55,6 +59,10 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel):
       num_units: The number of units in the model's LSTMCell.
       num_features: The dimensionality of the time series (features per
         timestep).
+      exogenous_feature_columns: A list of tf.contrib.layers.FeatureColumn
+          objects representing features which are inputs to the model but are
+          not predicted by it. These must then be present for training,
+          evaluation, and prediction.
       dtype: The floating point data type to use.
     """
     super(_LSTMModel, self).__init__(
@@ -62,6 +70,7 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel):
         train_output_names=["mean"],
         predict_output_names=["mean"],
         num_features=num_features,
+        exogenous_feature_columns=exogenous_feature_columns,
         dtype=dtype)
     self._num_units = num_units
     # Filled in by initialize_graph()
@@ -69,7 +78,7 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel):
     self._lstm_cell_run = None
     self._predict_from_lstm_output = None
 
-  def initialize_graph(self, input_statistics):
+  def initialize_graph(self, input_statistics=None):
     """Save templates for components, which can then be used repeatedly.
 
     This method is called every time a new graph is created. It's safe to start
@@ -80,18 +89,19 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel):
       input_statistics: A math_utils.InputStatistics object.
     """
     super(_LSTMModel, self).initialize_graph(input_statistics=input_statistics)
-    self._lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=self._num_units)
-    # Create templates so we don't have to worry about variable reuse.
-    self._lstm_cell_run = tf.make_template(
-        name_="lstm_cell",
-        func_=self._lstm_cell,
-        create_scope_now_=True)
-    # Transforms LSTM output into mean predictions.
-    self._predict_from_lstm_output = tf.make_template(
-        name_="predict_from_lstm_output",
-        func_=
-        lambda inputs: tf.layers.dense(inputs=inputs, units=self.num_features),
-        create_scope_now_=True)
+    with tf.variable_scope("", use_resource=True):
+      # Use ResourceVariables to avoid race conditions.
+      self._lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=self._num_units)
+      # Create templates so we don't have to worry about variable reuse.
+      self._lstm_cell_run = tf.make_template(
+          name_="lstm_cell",
+          func_=self._lstm_cell,
+          create_scope_now_=True)
+      # Transforms LSTM output into mean predictions.
+      self._predict_from_lstm_output = tf.make_template(
+          name_="predict_from_lstm_output",
+          func_=functools.partial(tf.layers.dense, units=self.num_features),
+          create_scope_now_=True)
 
   def get_start_state(self):
     """Return initial state for the time series model."""
@@ -100,6 +110,8 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel):
         tf.zeros([], dtype=tf.int64),
         # The previous observation or prediction.
         tf.zeros([self.num_features], dtype=self.dtype),
+        # The most recently seen exogenous features.
+        tf.zeros(self._get_exogenous_embedding_shape(), dtype=self.dtype),
         # The state of the RNNCell (batch dimension removed since this parent
         # class will broadcast).
         [tf.squeeze(state_element, axis=0)
@@ -127,7 +139,7 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel):
       loss (note that we could also return other measures of goodness of fit,
       although only "loss" will be optimized).
     """
-    state_from_time, prediction, lstm_state = state
+    state_from_time, prediction, exogenous, lstm_state = state
     with tf.control_dependencies(
         [tf.assert_equal(current_times, state_from_time)]):
       # Subtract the mean and divide by the variance of the series.  Slightly
@@ -139,16 +151,22 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel):
           (prediction - transformed_values) ** 2, axis=-1)
       # Keep track of the new observation in model state. It won't be run
       # through the LSTM until the next _imputation_step.
-      new_state_tuple = (current_times, transformed_values, lstm_state)
+      new_state_tuple = (current_times, transformed_values,
+                         exogenous, lstm_state)
     return (new_state_tuple, predictions)
 
   def _prediction_step(self, current_times, state):
     """Advance the RNN state using a previous observation or prediction."""
-    _, previous_observation_or_prediction, lstm_state = state
+    _, previous_observation_or_prediction, exogenous, lstm_state = state
+    # Update LSTM state based on the most recent exogenous and endogenous
+    # features.
+    inputs = tf.concat([previous_observation_or_prediction, exogenous],
+                       axis=-1)
     lstm_output, new_lstm_state = self._lstm_cell_run(
-        inputs=previous_observation_or_prediction, state=lstm_state)
+        inputs=inputs, state=lstm_state)
     next_prediction = self._predict_from_lstm_output(lstm_output)
-    new_state_tuple = (current_times, next_prediction, new_lstm_state)
+    new_state_tuple = (current_times, next_prediction,
+                       exogenous, new_lstm_state)
     return new_state_tuple, {"mean": self._scale_back_data(next_prediction)}
 
   def _imputation_step(self, current_times, state):
@@ -160,35 +178,75 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel):
 
   def _exogenous_input_step(
       self, current_times, current_exogenous_regressors, state):
-    """Update model state based on exogenous regressors."""
-    raise NotImplementedError(
-        "Exogenous inputs are not implemented for this example.")
+    """Save exogenous regressors in model state for use in _prediction_step."""
+    state_from_time, prediction, _, lstm_state = state
+    return (state_from_time, prediction,
+            current_exogenous_regressors, lstm_state)
 
 
-def train_and_predict(csv_file_name=_DATA_FILE, training_steps=200):
+def train_and_predict(
+    csv_file_name=_DATA_FILE, training_steps=200, estimator_config=None,
+    export_directory=None):
   """Train and predict using a custom time series model."""
   # Construct an Estimator from our LSTM model.
+  exogenous_feature_columns = [
+      # Exogenous features are not part of the loss, but can inform
+      # predictions. In this example the features have no extra information, but
+      # are included as an API example.
+      tf.contrib.layers.real_valued_column(
+          "2d_exogenous_feature", dimension=2)]
   estimator = ts_estimators.TimeSeriesRegressor(
-      model=_LSTMModel(num_features=5, num_units=128),
-      optimizer=tf.train.AdamOptimizer(0.001))
+      model=_LSTMModel(num_features=5, num_units=128,
+                       exogenous_feature_columns=exogenous_feature_columns),
+      optimizer=tf.train.AdamOptimizer(0.001), config=estimator_config,
+      # Set state to be saved across windows.
+      state_manager=state_management.ChainingStateManager())
   reader = tf.contrib.timeseries.CSVReader(
       csv_file_name,
       column_names=((tf.contrib.timeseries.TrainEvalFeatures.TIMES,)
-                    + (tf.contrib.timeseries.TrainEvalFeatures.VALUES,) * 5))
+                    + (tf.contrib.timeseries.TrainEvalFeatures.VALUES,) * 5
+                    + ("2d_exogenous_feature",) * 2))
   train_input_fn = tf.contrib.timeseries.RandomWindowInputFn(
       reader, batch_size=4, window_size=32)
   estimator.train(input_fn=train_input_fn, steps=training_steps)
   evaluation_input_fn = tf.contrib.timeseries.WholeDatasetInputFn(reader)
   evaluation = estimator.evaluate(input_fn=evaluation_input_fn, steps=1)
   # Predict starting after the evaluation
+  predict_exogenous_features = {
+      "2d_exogenous_feature": numpy.concatenate(
+          [numpy.ones([1, 100, 1]), numpy.zeros([1, 100, 1])],
+          axis=-1)}
   (predictions,) = tuple(estimator.predict(
       input_fn=tf.contrib.timeseries.predict_continuation_input_fn(
-          evaluation, steps=100)))
+          evaluation, steps=100,
+          exogenous_features=predict_exogenous_features)))
   times = evaluation["times"][0]
   observed = evaluation["observed"][0, :, :]
   predicted_mean = numpy.squeeze(numpy.concatenate(
       [evaluation["mean"][0], predictions["mean"]], axis=0))
   all_times = numpy.concatenate([times, predictions["times"]], axis=0)
+
+  # Export the model in SavedModel format.
+  if export_directory is None:
+    export_directory = tempfile.mkdtemp()
+  input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
+  export_location = estimator.export_savedmodel(
+      export_directory, input_receiver_fn)
+  # Predict using the SavedModel
+  with tf.Graph().as_default():
+    with tf.Session() as session:
+      signatures = tf.saved_model.loader.load(
+          session, [tf.saved_model.tag_constants.SERVING], export_location)
+      saved_model_output = (
+          tf.contrib.timeseries.saved_model_utils.predict_continuation(
+              continue_from=evaluation, signatures=signatures,
+              session=session, steps=100,
+              exogenous_features=predict_exogenous_features))
+      # The exported model gives the same results as the Estimator.predict()
+      # call above.
+      numpy.testing.assert_allclose(
+          predictions["mean"],
+          numpy.squeeze(saved_model_output["mean"], axis=0))
   return times, observed, all_times, predicted_mean
 
 
diff --git a/tensorflow/contrib/timeseries/examples/lstm_test.py b/tensorflow/contrib/timeseries/examples/lstm_test.py
index 56daa1e10d9d1e7e96d71f33afc72671512dbaf8..ca56e38ca079f71b38cf29605a295a50929945e8 100644
--- a/tensorflow/contrib/timeseries/examples/lstm_test.py
+++ b/tensorflow/contrib/timeseries/examples/lstm_test.py
@@ -20,14 +20,24 @@ from __future__ import print_function
 
 from tensorflow.contrib.timeseries.examples import lstm
 
+from tensorflow.python.estimator import estimator_lib
 from tensorflow.python.platform import test
 
 
+class _SeedRunConfig(estimator_lib.RunConfig):
+
+  @property
+  def tf_random_seed(self):
+    return 3
+
+
 class LSTMExampleTest(test.TestCase):
 
   def test_periodicity_learned(self):
     (observed_times, observed_values,
-     all_times, predicted_values) = lstm.train_and_predict(training_steps=100)
+     all_times, predicted_values) = lstm.train_and_predict(
+         training_steps=100, estimator_config=_SeedRunConfig(),
+         export_directory=self.get_temp_dir())
     self.assertAllEqual([100], observed_times.shape)
     self.assertAllEqual([100, 5], observed_values.shape)
     self.assertAllEqual([200], all_times.shape)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index 5f04eb2f5a4af031ad19662b05a8a2396299925d..fff972c1f3277ad5d83673a202a50d1e6f7df210 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -296,6 +296,8 @@ py_test(
     ],
     srcs_version = "PY2AND3",
     tags = [
+        "no_oss",  # b/63709811
+        "no_pip",  # b/63709811
         "no_pip_gpu",  # b/63391119
     ],
     deps = [
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators.py b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
index 3738dfa154d4f39b9562446972443ed88f3fbe8b..f8355f366fe8e191ab570fd271bbe4a8bf71c73d 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/estimators.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.layers.python.layers import feature_column
+
 from tensorflow.contrib.timeseries.python.timeseries import ar_model
 from tensorflow.contrib.timeseries.python.timeseries import feature_keys
 from tensorflow.contrib.timeseries.python.timeseries import head as ts_head_lib
@@ -72,15 +74,14 @@ class TimeSeriesRegressor(estimator_lib.Estimator):
   # tf.Example containing all features (times, values, any exogenous features)
   # and serialized model state (possibly also as a tf.Example).
   def build_raw_serving_input_receiver_fn(self,
-                                          exogenous_features=None,
                                           default_batch_size=None,
                                           default_series_length=None):
     """Build an input_receiver_fn for export_savedmodel which accepts arrays.
 
+    Automatically creates placeholders for exogenous `FeatureColumn`s passed to
+    the model.
+
     Args:
-      exogenous_features: A dictionary mapping feature keys to exogenous
-        features (either Numpy arrays or Tensors). Used to determine the shapes
-        of placeholders for these features.
       default_batch_size: If specified, must be a scalar integer. Sets the batch
         size in the static shape information of all feature Tensors, which means
         only this batch size will be accepted by the exported model. If None
@@ -94,9 +95,6 @@ class TimeSeriesRegressor(estimator_lib.Estimator):
       An input_receiver_fn which may be passed to the Estimator's
       export_savedmodel.
     """
-    if exogenous_features is None:
-      exogenous_features = {}
-
     def _serving_input_receiver_fn():
       """A receiver function to be passed to export_savedmodel."""
       placeholders = {}
@@ -119,14 +117,22 @@ class TimeSeriesRegressor(estimator_lib.Estimator):
                   dtype=self._model.dtype),
               shape=(default_batch_size, default_series_length,
                      self._model.num_features)))
-      for feature_key, feature_value in exogenous_features.items():
-        value_tensor = ops.convert_to_tensor(feature_value)
-        value_tensor.get_shape().with_rank_at_least(2)
-        feature_shape = value_tensor.get_shape().as_list()
-        feature_shape[0] = default_batch_size
-        feature_shape[1] = default_series_length
+      with ops.Graph().as_default():
+        # Default placeholders have only an unknown batch dimension. Make them
+        # in a separate graph, then splice in the series length to the shapes
+        # and re-create them in the outer graph.
+        exogenous_feature_shapes = {
+            key: (value.get_shape(), value.dtype) for key, value
+            in feature_column.make_place_holder_tensors_for_base_features(
+                self._model.exogenous_feature_columns).items()}
+      for feature_key, (batch_only_feature_shape, value_dtype) in (
+          exogenous_feature_shapes.items()):
+        batch_only_feature_shape = batch_only_feature_shape.with_rank_at_least(
+            1).as_list()
+        feature_shape = ([default_batch_size, default_series_length]
+                         + batch_only_feature_shape[1:])
         placeholders[feature_key] = array_ops.placeholder(
-            dtype=value_tensor.dtype, name=feature_key, shape=feature_shape)
+            dtype=value_dtype, name=feature_key, shape=feature_shape)
       # Models may not know the shape of their state without creating some
       # variables/ops. Avoid polluting the default graph by making a new one. We
       # use only static metadata from the returned Tensors.
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head.py b/tensorflow/contrib/timeseries/python/timeseries/head.py
index 5896fc2a206bc747688b5b012e0f87465592dd8a..f0330bfbbd6e8067e5d085376acdf2e6bcaccb6a 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 import re
 
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import optimizers
 
 from tensorflow.contrib.timeseries.python.timeseries import feature_keys
@@ -79,7 +79,7 @@ class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-acc
 
     train_op = optimizers.optimize_loss(
         model_outputs.loss,
-        global_step=variables.get_global_step(),
+        global_step=training_util.get_global_step(),
         optimizer=self.optimizer,
         # Learning rate is set in the Optimizer object
         learning_rate=None)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/model.py b/tensorflow/contrib/timeseries/python/timeseries/model.py
index b32b5c5494ae14187954b900119678a5b53a3602..bac7d1ebf59b28d4688a3d1a69ecdc1fc12248e0 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/model.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/model.py
@@ -22,6 +22,7 @@ import abc
 import collections
 
 from tensorflow.contrib import layers
+from tensorflow.contrib.layers import feature_column
 
 from tensorflow.contrib.timeseries.python.timeseries import math_utils
 from tensorflow.contrib.timeseries.python.timeseries.feature_keys import PredictionFeatures
@@ -83,6 +84,11 @@ class TimeSeriesModel(object):
     self._stats_means = None
     self._stats_sigmas = None
 
+  @property
+  def exogenous_feature_columns(self):
+    """`FeatureColumn` objects for features which are not predicted."""
+    return self._exogenous_feature_columns
+
   # TODO(allenl): Move more of the generic machinery for generating and
   # predicting into TimeSeriesModel, and possibly share it between generate()
   # and predict()
@@ -250,6 +256,23 @@ class TimeSeriesModel(object):
     """
     pass
 
+  def _get_exogenous_embedding_shape(self):
+    """Computes the shape of the vector returned by _process_exogenous_features.
+
+    Returns:
+      The shape as a list. Does not include a batch dimension.
+    """
+    if not self._exogenous_feature_columns:
+      return (0,)
+    with ops.Graph().as_default():
+      placeholder_features = (
+          feature_column.make_place_holder_tensors_for_base_features(
+              self._exogenous_feature_columns))
+      embedded = layers.input_from_feature_columns(
+          columns_to_tensors=placeholder_features,
+          feature_columns=self._exogenous_feature_columns)
+      return embedded.get_shape().as_list()[1:]
+
   def _process_exogenous_features(self, times, features):
     """Create a single vector from exogenous features.
 
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/__init__.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..52e83069cb0c68b510da46149248369dce376647 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/__init__.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model_test.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model_test.py
index ca57715e2b2e6bbadd276d641703c0a3b842652e..1fb4a3c121c8d7c1daf8fc4a3f59a8b8de38bf8f 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model_test.py
@@ -187,9 +187,7 @@ class StateSpaceEquivalenceTests(test.TestCase):
     estimator.train(combined_input_fn, steps=1)
     export_location = estimator.export_savedmodel(
         self.get_temp_dir(),
-        estimator.build_raw_serving_input_receiver_fn(
-            exogenous_features={
-                "exogenous": numpy.zeros((0, 0), dtype=numpy.float32)}))
+        estimator.build_raw_serving_input_receiver_fn())
     with ops.Graph().as_default() as graph:
       random_model.initialize_graph()
       with self.test_session(graph=graph) as session:
@@ -209,7 +207,7 @@ class StateSpaceEquivalenceTests(test.TestCase):
             features={
                 feature_keys.FilteringFeatures.TIMES: [1, 2],
                 feature_keys.FilteringFeatures.VALUES: [1., 2.],
-                "exogenous": [-1., -2.]})
+                "exogenous": [[-1.], [-2.]]})
         second_split_filtering = saved_model_utils.filter_continuation(
             continue_from=first_split_filtering,
             signatures=signatures,
@@ -217,7 +215,7 @@ class StateSpaceEquivalenceTests(test.TestCase):
             features={
                 feature_keys.FilteringFeatures.TIMES: [3, 4],
                 feature_keys.FilteringFeatures.VALUES: [3., 4.],
-                "exogenous": [-3., -4.]
+                "exogenous": [[-3.], [-4.]]
             })
         combined_filtering = saved_model_utils.filter_continuation(
             continue_from={
@@ -227,7 +225,7 @@ class StateSpaceEquivalenceTests(test.TestCase):
             features={
                 feature_keys.FilteringFeatures.TIMES: [1, 2, 3, 4],
                 feature_keys.FilteringFeatures.VALUES: [1., 2., 3., 4.],
-                "exogenous": [-1., -2., -3., -4.]
+                "exogenous": [[-1.], [-2.], [-3.], [-4.]]
             })
         split_predict = saved_model_utils.predict_continuation(
             continue_from=second_split_filtering,
@@ -235,14 +233,14 @@ class StateSpaceEquivalenceTests(test.TestCase):
             session=session,
             steps=1,
             exogenous_features={
-                "exogenous": [[-5.]]})
+                "exogenous": [[[-5.]]]})
         combined_predict = saved_model_utils.predict_continuation(
             continue_from=combined_filtering,
             signatures=signatures,
             session=session,
             steps=1,
             exogenous_features={
-                "exogenous": [[-5.]]})
+                "exogenous": [[[-5.]]]})
     for state_key, combined_state_value in combined_filtering.items():
       if state_key == feature_keys.FilteringResults.TIMES:
         continue
@@ -627,9 +625,8 @@ class UnknownShapeModel(TimeDependentStateSpaceModel):
 
   def get_observation_model(self, times):
     parent_model = super(UnknownShapeModel, self).get_observation_model(times)
-    parent_model._shape = tensor_shape.unknown_shape()
-    assert parent_model.get_shape().ndims is None
-    return parent_model
+    return array_ops.placeholder_with_default(
+        input=parent_model, shape=tensor_shape.unknown_shape())
 
 
 class TimeDependentTests(test.TestCase):
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index f542d9413944052bd5ad3c351793185c63e8ae19..c48e84ddfaac8ac9c07e061847315eab3fd72152 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -28,21 +28,7 @@ cc_library(
         ":outfeed_ops_op_lib",
         ":replication_ops_op_lib",
         ":tpu_configuration_ops_op_lib",
-    ],
-)
-
-py_library(
-    name = "tpu_test_util",
-    srcs = ["python/tpu/test_util.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":tpu_lib",
-        ":tpu_py",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:session",
-        "//tensorflow/python:variables",
+        ":tpu_embedding_ops_op_lib",
     ],
 )
 
@@ -50,13 +36,16 @@ py_library(
     name = "tpu_estimator",
     srcs = [
         "python/tpu/tpu_config.py",
+        "python/tpu/tpu_context.py",
         "python/tpu/tpu_estimator.py",
+        "python/tpu/tpu_system_metadata.py",
         "python/tpu/util.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
         ":tpu_lib",
         ":tpu_py",
+        "//tensorflow/contrib/summary:summary_ops",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
@@ -84,9 +73,11 @@ tf_gen_op_libs(
         "outfeed_ops",
         "replication_ops",
         "tpu_configuration_ops",
+        "tpu_embedding_ops",
     ],
     deps = [
-        "//tensorflow/core:lib",
+        "//tensorflow/contrib/tpu/proto:tpu_embedding_config_proto_cc",
+        "//tensorflow/core:lib_proto_parsing",
     ],
 )
 
@@ -98,6 +89,11 @@ tf_custom_op_library(
         "ops/outfeed_ops.cc",
         "ops/replication_ops.cc",
         "ops/tpu_configuration_ops.cc",
+        "ops/tpu_embedding_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/contrib/tpu/proto:tpu_embedding_config_proto_cc",
+        "//tensorflow/core:lib_proto_parsing",
     ],
 )
 
@@ -109,6 +105,7 @@ tf_gen_op_wrapper_py(
         ":outfeed_ops_op_lib",
         ":replication_ops_op_lib",
         ":tpu_configuration_ops_op_lib",
+        ":tpu_embedding_ops_op_lib",
     ],
 )
 
@@ -172,6 +169,7 @@ py_library(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:control_flow_util",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_ops",
@@ -183,6 +181,19 @@ py_library(
     ],
 )
 
+tf_py_test(
+    name = "tpu_test",
+    size = "small",
+    srcs = ["python/tpu/tpu_test.py"],
+    additional_deps = [
+        ":tpu",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:layers",
+    ],
+)
+
 tf_py_test(
     name = "tpu_sharding_test",
     size = "small",
@@ -216,6 +227,17 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "tpu_config_test",
+    size = "small",
+    srcs = ["python/tpu/tpu_config_test.py"],
+    additional_deps = [
+        ":tpu_estimator",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/tpu/ops/cross_replica_ops.cc b/tensorflow/contrib/tpu/ops/cross_replica_ops.cc
index cbbd19800eb2e336fc343671fb82bb3ed631c129..d389050e67f9a9e48b91583e5088058ec4e2832f 100644
--- a/tensorflow/contrib/tpu/ops/cross_replica_ops.cc
+++ b/tensorflow/contrib/tpu/ops/cross_replica_ops.cc
@@ -22,7 +22,7 @@ namespace tensorflow {
 REGISTER_OP("CrossReplicaSum")
     .Input("input: T")
     .Output("output: T")
-    .Attr("T: {float}")
+    .Attr("T: {bfloat16, float}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 An Op to sum inputs across replicated TPU instances. Each
diff --git a/tensorflow/contrib/tpu/ops/replication_ops.cc b/tensorflow/contrib/tpu/ops/replication_ops.cc
index 36e865bf3c461878e12cca5f46f24411ceb61a78..cba71c6b98e1079de6c6c4c32fa2ffc44a9ce71e 100644
--- a/tensorflow/contrib/tpu/ops/replication_ops.cc
+++ b/tensorflow/contrib/tpu/ops/replication_ops.cc
@@ -72,10 +72,12 @@ REGISTER_OP("TPUReplicate")
     .Attr("Tinputs: list(type) >= 0")
     .Attr("Tbroadcast_inputs: list(type) >= 0")
     .Attr("NumVariables: int >= 0")
+    .Attr("Tguaranteed_constants: list(type) >= 0")
     .Attr("output_types: list(type) >= 0")
     .Input("inputs: Tinputs")
     .Input("broadcast_inputs: Tbroadcast_inputs")
     .Input("variables: NumVariables * resource")
+    .Input("guaranteed_constants: Tguaranteed_constants")
     .Output("outputs: output_types")
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
@@ -95,9 +97,13 @@ Tinputs: the types of the arguments to 'computation'.
 inputs: the inputs to 'computation', flattened, in replica-major order.
 Tbroadcast_inputs: the types of the additional arguments to broadcast to all
   replicas.
+Tguaranteed_constants: the types of the arguments to 'guaranteed_constants'.
 broadcast_inputs: additional arguments to broadcast to all replicas. The
   broadcast inputs are appended to the per-replica inputs when calling
   computation.
+guaranteed_constants: arguments which have been guaranteed to not
+change their values during the session lifetime. These contain tensors marked as
+constant using the GuaranteeConstOp.
 output_types: the types of the outputs of 'computation'.
 outputs: the outputs of 'computation'.
 )doc");
diff --git a/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc b/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
index 8c4fe5538d832f390845fe2d31aa6a08342b280b..f8de8baa65339383c7f92284ee274a434f12f8c2 100644
--- a/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
+++ b/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
@@ -26,29 +26,25 @@ using shape_inference::ShapeHandle;
 // Configuring a distributed TPU system is achieved by running
 // the following Ops:
 //
-// 1 Run _DisconnectHostFromDistributedTPUSystem on the CPU of each
-// host. This is needed in case the system had previously been
-// configured. It returns, for each host, the number of TPU chips on
-// the host.
+// 1 Run _DisconnectHostFromDistributedTPUSystem on the TPU_SYSTEM of each
+// host. This is needed in case the system had previously been configured. It
+// returns, for each host, the number of TPU chips on the host.
 //
-// 2 Run _ConfigureDistributedTPU on TPU_SYSTEM. Takes as input the
-// number of chips on each host. Validates that all hosts have the
-// same number of chips, and that the chips are consistent with the
-// topology set by flags. Has a single output which is a proto
-// describing the requested system configuration, which is sent to all
-// hosts.
+// 2 Run _ConfigureDistributedTPU on TPU_SYSTEM of worker 0. Takes as input the
+// number of chips on each host. Validates that all hosts have the same number
+// of chips, and that the chips are consistent with the topology set by
+// flags. Has a single output which is a proto describing the requested system
+// configuration, which is sent to all hosts.
 //
-// 3 Run _InitializeHostForDistributedTPU on the CPU of each host,
-// taking as input the output from ConfigureDistributedTPU. Has a
-// single Tensor output which is a vector of int32 indicating, for
-// each TPU on the host, what its global TPU system id is.
+// 3 Run _InitializeHostForDistributedTPU on the TPU_SYSTEM of each host, taking
+// as input the output from ConfigureDistributedTPU. Has a single Tensor output
+// which is a vector of int32 indicating, for each TPU on the host, what its
+// global TPU system id is.
 //
 // 4 Run _WaitForDistributedTPU on TPU_SYSTEM, taking as input the
 // outputs from all the _InitializeHostForDistributedTPU
-// Ops. _WaitForDistributedTPU has an attr host_specs which is a
-// vector<string> giving the partial device spec for each host. These
-// partial specs are combined in the Op with the outputs from the host
-// initialization Ops to construct a mapping from full TPU device
+// Ops. _These partial specs are combined in the Op with the outputs from
+// the host initialization Ops to construct a mapping from full TPU device
 // specs to global TPU ids. Has a single Tensor output which is a
 // matrix of int32 indicating, for each host (outer dimension) and for
 // each TPU on the host (inner dimension) what that TPU's global id
@@ -56,29 +52,28 @@ using shape_inference::ShapeHandle;
 // system to initialize fully, which may take several minutes for a
 // large system.
 //
-// 5 Run _SetGlobalTPUArray on the CPU of each host, taking as input
-// the output from _WaitForDistributedTPU. This Op tells each host the
-// global Id of every TPU on every host.
+// 5 Run _SetGlobalTPUArray on the TPU_SYSTEM of each host, taking as input the
+// output from _WaitForDistributedTPU. This Op tells each host the global Id of
+// every TPU on every host.
 //
-// Most user code works by placing the ConfigureDistributedTPU Op on
-// the desired TPU_SYSTEM device, and a graph rewrite replaces it by
-// the subgraph described above.
+// Most user code works by placing the ConfigureDistributedTPU Op on the desired
+// TPU_SYSTEM device, and a graph rewrite replaces it by the subgraph described
+// above.
 //
 //
-// A distributed TPU system can be cleanly shut down by running
-// the following Ops:
+// A distributed TPU system can be cleanly shut down by running the following
+// Ops:
 //
-// 1 Run _DisconnectHostFromDistributedTPUSystem on the CPU of each
-// host.
+// 1 Run _DisconnectHostFromDistributedTPUSystem on the TPU_SYSTEM of each host.
 //
 // 2 Run _ShutdownDistributedTPU on the TPU_SYSTEM where
-// _ConfigureDistributedTPU was run. The Op will return an error if no
-// system is configured.
+// _ConfigureDistributedTPU was run. The Op will return an error if no system is
+// configured.
 //
 //
-// Most user code works by placing the ShutdownDistributedTPU Op on
-// the desired TPU_SYSTEM device, and a graph rewrite replaces it by
-// the subgraph described above.
+// Most user code works by placing the ShutdownDistributedTPU Op on the desired
+// TPU_SYSTEM device, and a graph rewrite replaces it by the subgraph described
+// above.
 
 REGISTER_OP("_ConfigureDistributedTPU")
     .Input("inputs: N * int32")
@@ -108,7 +103,6 @@ in a host.
 REGISTER_OP("_WaitForDistributedTPU")
     .Input("inputs: N * int32")
     .Output("topology: string")
-    .Attr("host_specs: list(string)")
     .Attr("startup_timeout_sec: int = 20")
     .Attr("N: int")
     .SetIsStateful()
@@ -196,6 +190,7 @@ chips on the host.
 REGISTER_OP("ConfigureDistributedTPU")
     .Output("topology: string")
     .Attr("embedding_config: string = ''")
+    .Attr("tpu_embedding_config: string = ''")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
@@ -204,6 +199,9 @@ system.
 
 topology: A serialized tensorflow.tpu.TopologyProto that describes the TPU
 topology.
+tpu_embedding_config: Serialized tensorflow.tpu.TPUEmbeddingConfiguration that
+describes the embedding lookups of the program.
+embedding_config: Reserved. Do not use.
 )doc");
 
 REGISTER_OP("ShutdownDistributedTPU")
@@ -214,4 +212,20 @@ An op that shuts down a running distributed TPU system. The Op returns
 an error if no system is running.
 )doc");
 
-}  // namespace tensorflow
+REGISTER_OP("SessionStatus")
+    .Input("fetch_start_timestamp: double")
+    .Output("status: string")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Not for public usage.
+
+Returns messages from the current session as a serialized SessionStatusProto.
+
+This includes the current state of the compiler, along with any critical
+logging or warning messages.
+
+fetch_start_timestamp: any messages earlier than this will be excluded from the
+returned proto.
+)doc");
+
+}  // end namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc b/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cc32a265286951a1e4d59228da6b3ac83a75c5e9
--- /dev/null
+++ b/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
@@ -0,0 +1,328 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tpu/proto/tpu_embedding_config.pb.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// TPUs use a specialized mechanism for performing embedding lookups,
+// necessitating differences in TF Graphs that use embeddings on TPUs relative
+// to CPUs. Embedding lookups on TPU systems are achieved by including the
+// following in the TF Graph.
+//
+// 0. Construct a TPUEmbeddingConfiguration, specifying the embedding tables
+//    in the model, the size of the TPU system to be used, and the optimizer to
+//    be used for each table. Some of this information is redundant with other
+//    pieces of the TF Graph.
+// 1. Pass this TPUEmbeddingConfiguration to tpu.initialize_system() as the
+//    tpu_embedding_config parameter.
+// 2. Use the TPUEmbeddingLoad Op to initialize the embedding tables in TPU
+//    memories, sharded across the memories attached to each Host.
+// 3. Use TPUEmbeddingEnqueueSparseBatch to provide the TPU with embedding
+//    indices and aggregation weights.
+// 4. TPUEmbeddingReceiveActivations returns a list of Tensors, containing the
+//    activations from each table specified in the configuration.
+// 5. TPUEmbeddingActivations, when used with appropriate Python libraries,
+//    enables the automatic differentiation of models that use embeddings.
+// 6. TPUEmbeddingSendGradients takes a list of Tensors (of the same shapes
+//    as those returned by TPUEmbeddingReceivActivations) containing gradients
+//    to use in updating the embedding tables.
+// 7. Before saving a checkpoint, use the TPUEmbeddingRetrieve Op to update
+//    the Graph's embedding table Variables from the updated tables in the
+//    TPU memories.
+//
+// TPU Embeddings use dedicated ops to enforce Host/TPU consistency in the
+// state of embedding table variables. Before beginning training or inference,
+// the model must Load the optimizer parameters into the TPU memories. Before
+// saving a checkpoint, the model must Retreieve the parameters back into the
+// host CPU memory.
+
+REGISTER_OP("TPUEmbeddingLoadGradientDescentParameters")
+    .Input("parameters: float32")
+    .Attr("tpu_embedding_config: string")
+    .Attr("table_id: int >= 0")
+    .Attr("num_hosts: int >= 1")
+    .Attr("host_id: int >= 0")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+Load an embedding table shard into TPU memory for use with GradientDescent.
+
+TPU embeddings use dedicated per-optimizer Ops for loading and retrieving 
+trainable variables and optimizer state from TPU memory. This op enables
+functionality equivalent to GradientDescentOptimizer.
+
+parameters: The shard of the embedding table resident on the host executing this
+    op. For single-TPU models, this is the entire embedding table.
+tpu_embedding_config: Serialized TPUEmbeddingConfiguration proto.
+table_id: The id of the table specified in the tpu_embedding_config.
+num_hosts: The number of CPU hosts in the distributed training job.
+host_id: Which CPU host in the distributed training job will execute this op.
+)doc");
+
+namespace tpu_embedding_config_util {
+
+Status GradientDescentShapes(shape_inference::InferenceContext *c) {
+  string config_string;
+  TF_RETURN_IF_ERROR(c->GetAttr("tpu_embedding_config", &config_string));
+  tpu::TPUEmbeddingConfiguration config;
+  if (!config.ParseFromString(config_string)) {
+    return errors::InvalidArgument("Malformed tpu_embedding_config.");
+  }
+
+  int table_id;
+  TF_RETURN_IF_ERROR(c->GetAttr("table_id", &table_id));
+  int64 num_tables = config.table_config_size();
+  if (table_id >= num_tables) {
+    return errors::InvalidArgument("Table id >= num_tables");
+  }
+  int64 width = config.table_config(table_id).width();
+  int64 num_rows = config.table_config(table_id).num_rows();
+
+  TF_RETURN_IF_ERROR(c->set_output("parameters", {c->Matrix(num_rows, width)}));
+  return Status::OK();
+}
+
+}  // namespace tpu_embedding_config_util
+
+REGISTER_OP("TPUEmbeddingRetrieveGradientDescentParameters")
+    .Output("parameters: float32")
+    .Attr("tpu_embedding_config: string")
+    .Attr("table_id: int")
+    .Attr("num_hosts: int")
+    .Attr("host_id: int")
+    .SetIsStateful()
+    .SetShapeFn(tpu_embedding_config_util::GradientDescentShapes)
+    .Doc(R"doc(
+Retrieve an embedding table shard from TPU memory.
+
+TPU embeddings use dedicated per-optimizer Ops for loading and retrieving 
+trainable variables and optimizer state from TPU memory. This op enables
+functionality equivalent to GradientDescentOptimizer.
+
+tpu_embedding_config: Serialized TPUEmbeddingConfiguration proto.
+table_id: The id of the table specified in tpu_embedding_config.
+num_hosts: The number of CPU hosts in the distributed training job.
+host_id: Which CPU host in the distributed training job will execute this op.
+)doc");
+
+REGISTER_OP("TPUEmbeddingLoadAdagradParameters")
+    .Input("parameters: float32")
+    .Input("accumulators: float32")
+    .Attr("tpu_embedding_config: string")
+    .Attr("table_id: int >= 0")
+    .Attr("num_hosts: int >= 1")
+    .Attr("host_id: int >= 0")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+Load an embedding table shard into TensorNode memories for use with Adagrad.
+
+TPU embeddings use dedicated per-optimizer Ops for loading and retrieving
+trainable variables and optimizer state from TPU memory. This op enables
+functionality equivalent to AdagradOptimizer.
+
+parameters: The shard of the embedding table resident on the host executing this
+    op. For single-TPU models, this is the entire embedding table.
+accumulators: Shard of the Adagrad accumulators resident on the host executing
+    this op.
+tpu_embedding_config: Serialized TPUEmbeddingConfiguration proto.
+table_id: The id of the table specified in the embedding_config.
+num_hosts: The number of CPU hosts in the distributed training job.
+host_id: Which CPU host in the distributed training job will execute this op.
+)doc");
+
+namespace tpu_embedding_config_util {
+
+Status AdagradShapes(shape_inference::InferenceContext *c) {
+  string config_string;
+  TF_RETURN_IF_ERROR(c->GetAttr("tpu_embedding_config", &config_string));
+  tpu::TPUEmbeddingConfiguration config;
+  if (!config.ParseFromString(config_string)) {
+    return errors::InvalidArgument("Malformed tpu_embedding_config.");
+  }
+
+  int table_id;
+  TF_RETURN_IF_ERROR(c->GetAttr("table_id", &table_id));
+  int64 num_tables = config.table_config_size();
+  if (table_id >= num_tables) {
+    return errors::InvalidArgument("Table id >= num_tables");
+  }
+  int64 width = config.table_config(table_id).width();
+  int64 num_rows = config.table_config(table_id).num_rows();
+
+  TF_RETURN_IF_ERROR(c->set_output("parameters", {c->Matrix(num_rows, width)}));
+  TF_RETURN_IF_ERROR(
+      c->set_output("accumulators", {c->Matrix(num_rows, width)}));
+  return Status::OK();
+}
+
+}  // namespace tpu_embedding_config_util
+
+REGISTER_OP("TPUEmbeddingRetrieveAdagradParameters")
+    .Output("parameters: float32")
+    .Output("accumulators: float32")
+    .Attr("tpu_embedding_config: string")
+    .Attr("table_id: int >= 0")
+    .Attr("num_hosts: int >= 1")
+    .Attr("host_id: int >= 0")
+    .SetIsStateful()
+    .SetShapeFn(tpu_embedding_config_util::AdagradShapes)
+    .Doc(R"doc(
+Retrieve an embedding table shard from TPU memory.
+
+TPU embeddings use dedicated per-optimizer Ops for loading and retrieving 
+trainable variables and optimizer state from TPU memory. This op enables
+functionality equivalent to AdagradOptimizer.
+
+tpu_embedding_config: Serialized TPUEmbeddingConfiguration proto.
+table_id: The id of the table specified in the embedding_config_json.
+num_hosts: The number of CPU hosts in the distributed training job.
+host_id: Which CPU host in the distributed training job will execute this op.
+)doc");
+
+REGISTER_OP("TPUEmbeddingEnqueueSparseBatch")
+    .Input("sample_indices: num_tables * int32")
+    .Input("embedding_indices: num_tables * int32")
+    .Input("aggregation_weights: num_tables * float32")
+    .Attr("num_tables: int")
+    .Attr("device_ordinal: int = -1")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+An op that feeds a batch of embedding indices and weights to the TPU.
+
+Embedding lookups are equivalent to sparse-dense matrix multiplications: the
+sparse matrix contains nonzeros in column j in order to retrieve row j from the
+embedding table.
+
+The three Tensor list arguments (sample_indices, embedding_indices, and
+aggregation_weights) represent these sparse matrices in COO format. The Tensor
+lists each have one entry for each embedding table specified in the model.
+For the kth embedding table, the three Tensors at position k in the list
+specify a COO-format sparse matrix. For the kth table, the row indices,
+column indices, and nonzero values of the COO sparse matrix are specified by
+sample_indices[k], embedding_indices[k], and aggregation_weights[k],
+respectively. Entries must be sorted by row index, then by column index.
+
+There should be at most one TPUEmbeddingEnqueueSparseBatch op in a signle
+training step per TPU shard.
+
+sample_indices: A list of rank 1 Tensors specifying row indices of the COO
+    sparse matrix representing the embedding lookups for each table.
+embedding_indices: A list of rank 1 Tensors  specifying column indices of the
+    COO sparse matrix representing the embedding lookups for each table.
+aggregation_weights: A list of rank 1 Tensors specifying the nonzero values
+    of the COO sparse matrix representing the embedding lookups for each table.
+device_ordinal: The TPU device to use. This should be -1 when the Op
+    is running on a TPU device, and >= 0 when the Op is running on the CPU
+    device.
+)doc");
+
+namespace tpu_embedding_config_util {
+
+Status ActivationShapes(shape_inference::InferenceContext *c) {
+  string config_string;
+  TF_RETURN_IF_ERROR(c->GetAttr("tpu_embedding_config", &config_string));
+  tpu::TPUEmbeddingConfiguration config;
+  if (!config.ParseFromString(config_string)) {
+    return errors::InvalidArgument("Malformed tpu_embedding_config.");
+  }
+  int64 batch_size = config.batch_size();
+  int64 num_tables = config.table_config_size();
+  for (int table_id = 0; table_id < num_tables; ++table_id) {
+    int64 width = config.table_config(table_id).width();
+    int64 num_features = config.table_config(table_id).num_features();
+    c->set_output(table_id, c->Matrix(batch_size * num_features, width));
+  }
+  return Status::OK();
+}
+
+}  // namespace tpu_embedding_config_util
+
+REGISTER_OP("TPUEmbeddingReceiveActivations")
+    .Output("outputs: num_tables * float")
+    .Attr("num_tables: int >= 1")
+    .Attr("tpu_embedding_config: string")
+    .SetIsStateful()
+    .SetShapeFn(tpu_embedding_config_util::ActivationShapes)
+    .Doc(R"doc(
+An op that receives embeddng activations on the TPU.
+
+The TPU system performs the embedding lookups and aggregations specified by
+the arguments to TPUEmbeddingEnqueueSparseBatch. The results of these
+aggregations are visible to the Tensorflow Graph as the outputs of a
+TPUEmbeddingDequeueActivations Op. This op returns a list containing one
+Tensor of activations per table specified in the model. There can be at most
+one ReceieveActivations op in the TPU graph.
+
+outputs: A TensorList of embedding activations containing one Tensor per
+    embedding table in the model.
+num_tables: The number of output activation tensors, equal to the number of
+    embedding tables in the model.
+tpu_embedding_config: Serialized TPUEmbeddingConfiguration proto.
+)doc");
+
+REGISTER_OP("TPUEmbeddingActivations")
+    .Input("embedding_variable: float32")
+    .Input("sliced_activations: float32")
+    .Output("output: float32")
+    .Attr("table_id: int >= 0")
+    .Attr("lookup_id: int >= 0")
+    .SetShapeFn([](shape_inference::InferenceContext *c) {
+      c->set_output(0, c->input(1));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+An op enabling differentiation of TPU Embeddings.
+
+This op simply returns its first input, which is assumed to have been sliced
+from the Tensors returnd by TPUEmbeddingDequeueActivations. The presence of this
+op, and its first argument being a trainable Variable, enables automatic
+differentiation of graphs containing embeddings via the TPU Embedding Python
+libraries.
+
+embedding_variable: A trainable variable, enabling optimizers to find this op.
+sliced_activations: The embedding activations Tensor to return.
+table_id: The id of the table in the embedding layer configuration from which
+    these activations were computed.
+lookup_id: Identifier of the set of embedding indices which produced these
+    activations.
+)doc");
+
+REGISTER_OP("TPUEmbeddingSendGradients")
+    .Input("gradients: num_tables * float32")
+    .Attr("num_tables: int >= 1")
+    .Attr("tpu_embedding_config: string")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+An op that performs gradient updates of embedding tables.
+
+The TensorList argument has the same length and shapes as the return value of
+TPUEmbeddingReceiveActivations, but contains gradients of the model's loss
+with respect to the embedding activations. The embedding tables are updated
+from these gradients via the optimizer specified in the configuration given
+to tpu.initialize_system.
+
+gradients: A TensorList of gradients with which to update embedding tables.
+tpu_embedding_config: Serialized TPUEmbeddingConfiguration proto.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD
index 0e1fca3d3c8b6f3a19b3e989dbee1863475796c5..198da0203a7d17249c4f50110713121b74d5ca4f 100644
--- a/tensorflow/contrib/tpu/profiler/BUILD
+++ b/tensorflow/contrib/tpu/profiler/BUILD
@@ -44,13 +44,22 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "version",
+    hdrs = ["version.h"],
+    visibility = ["//visibility:public"],
+)
+
 tf_cc_binary(
     name = "capture_tpu_profile",
-    srcs = ["capture_tpu_profile.cc"],
-    visibility = ["//tensorflow/contrib/tpu/profiler:__subpackages__"],
+    srcs = [
+        "capture_tpu_profile.cc",
+    ],
+    visibility = ["//visibility:public"],
     deps = [
         ":dump_tpu_profile",
         ":tpu_profiler_proto_cc",
+        ":version",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core/distributed_runtime/rpc:grpc_util",
diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index bff23a447f841339d9bf5bd3bf125d705bf1fee7..b1ef9fde37fe0647965f0818895be37d2d56d207 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -26,8 +26,10 @@ limitations under the License.
 
 #include "tensorflow/contrib/tpu/profiler/dump_tpu_profile.h"
 #include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h"
+#include "tensorflow/contrib/tpu/profiler/version.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/util/command_line_flags.h"
 
@@ -46,21 +48,40 @@ string GetCurrentTimeStampAsString() {
   return s;
 }
 
-ProfileResponse Profile(const string& service_addr, int duration_ms) {
+Status ValidateHostPortPair(const string& host_port) {
+  uint32 port;
+  std::vector<string> parts = str_util::Split(host_port, ':');
+  // Must be host:port, port must be a number, host must not contain a '/',
+  // host also must not be empty.
+  if (parts.size() != 2 || !strings::safe_strtou32(parts[1], &port) ||
+      parts[0].find("/") != string::npos || parts[0].empty()) {
+    return errors::InvalidArgument("Could not interpret \"", host_port,
+                                   "\" as a host-port pair.");
+  }
+  return Status::OK();
+}
+
+ProfileResponse Profile(const string& service_addr, int duration_ms,
+                        const ProfileOptions& opts) {
   ProfileRequest request;
   request.set_duration_ms(duration_ms);
   request.set_max_events(kMaxEvents);
   request.add_tools("input_pipeline");
+  request.add_tools("overview_page");
+  *request.mutable_opts() = opts;
   std::cout << "Limiting the number of trace events to " << kMaxEvents
             << std::endl;
   ::grpc::ClientContext context;
   ::grpc::ChannelArguments channel_args;
   // TODO(ioeric): use `SetMaxReceiveMessageSize` instead once it's available.
+  // TODO(qiuminxu): use `NewHostPortGrpcChannel` instead once their
+  // `ValidateHostPortPair` checks for empty host string case.
   channel_args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH,
                       std::numeric_limits<int32>::max());
   std::unique_ptr<TPUProfiler::Stub> stub =
       TPUProfiler::NewStub(::grpc::CreateCustomChannel(
-          service_addr, ::grpc::InsecureChannelCredentials(), channel_args));
+          "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
+          channel_args));
   ProfileResponse response;
   TF_QCHECK_OK(FromGrpcStatus(stub->Profile(&context, request, &response)));
   return response;
@@ -74,26 +95,69 @@ int main(int argc, char** argv) {
   tensorflow::string FLAGS_service_addr;
   tensorflow::string FLAGS_logdir;
   int FLAGS_duration_ms = 2000;
+  int FLAGS_num_tracing_attempts = 3;
+  bool FLAGS_include_dataset_ops = true;
   std::vector<tensorflow::Flag> flag_list = {
       tensorflow::Flag("service_addr", &FLAGS_service_addr,
                        "Address of TPU profiler service e.g. localhost:8466"),
       tensorflow::Flag("logdir", &FLAGS_logdir,
-                       "Path of TensorBoard log directory e.g. /tmp/tb_log"),
+                       "Path of TensorBoard log directory e.g. /tmp/tb_log, "
+                       "gs://tb_bucket"),
       tensorflow::Flag("duration_ms", &FLAGS_duration_ms,
                        "Duration of tracing in ms. Default is 2000ms."),
+      tensorflow::Flag("num_tracing_attempts", &FLAGS_num_tracing_attempts,
+                       "Automatically retry N times when no trace event "
+                       "is collected. Default is 3."),
+      tensorflow::Flag("include_dataset_ops", &FLAGS_include_dataset_ops,
+                       "Set to false to profile longer TPU device traces."),
   };
 
+  std::cout << "Welcome to the Cloud TPU Profiler v" << TPU_PROFILER_VERSION
+            << std::endl;
+
   tensorflow::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_ok || FLAGS_service_addr.empty() || FLAGS_logdir.empty()) {
-    std::printf("%s", usage.c_str());
+    std::cout << usage.c_str() << std::endl;
+    return 2;
+  }
+  tensorflow::Status status =
+      tensorflow::tpu::ValidateHostPortPair(FLAGS_service_addr);
+  if (!status.ok()) {
+    std::cout << status.error_message() << std::endl;
+    std::cout << usage.c_str() << std::endl;
     return 2;
   }
   tensorflow::port::InitMain(argv[0], &argc, &argv);
 
-  int duration_ms = FLAGS_duration_ms;
-  tensorflow::ProfileResponse response =
-      tensorflow::tpu::Profile(FLAGS_service_addr, duration_ms);
+  // Sets the minimum duration_ms and tracing attempts to one.
+  int duration_ms = std::max(FLAGS_duration_ms, 1);
+  int remaining_attempts = std::max(FLAGS_num_tracing_attempts, 1);
+  tensorflow::ProfileOptions opts;
+  opts.set_include_dataset_ops(FLAGS_include_dataset_ops);
+  tensorflow::ProfileResponse response;
+
+  while (true) {
+    std::cout << "Starting to profile TPU traces for " << duration_ms << " ms. "
+              << "Remaining attempt(s): " << remaining_attempts-- << std::endl;
+    response = tensorflow::tpu::Profile(FLAGS_service_addr, duration_ms, opts);
+    if (remaining_attempts <= 0 || !response.encoded_trace().empty()) break;
+    std::cout << "No trace event is collected. Automatically retrying."
+              << std::endl
+              << std::endl;
+  }
+
+  if (response.encoded_trace().empty()) {
+    std::cout << "No trace event is collected after "
+              << FLAGS_num_tracing_attempts << " attempt(s). "
+              << "Perhaps, you want to try again (with more attempts?)."
+              << std::endl
+              << "Tip: increase number of attempts with --num_tracing_attempts."
+              << std::endl;
+    // Don't dump profile data if no trace is collected.
+    return 0;
+  }
+
   // Use the current timestamp as the run name.
   tensorflow::string run = tensorflow::tpu::GetCurrentTimeStampAsString();
   TF_CHECK_OK(tensorflow::tpu::WriteTensorboardTPUProfile(
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
index 120a38b6c2353deaf0b86d330cda999ba6be7dbf..ebd6185faad28ae7a22eb33f6b358eb2344c9c22 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
@@ -149,10 +149,9 @@ Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
   // Dumps profile data to <logdir>/plugins/profile/<run>/.
   string profile_run_dir = JoinPath(logdir, kProfilePluginDirectory, run);
   TF_RETURN_IF_ERROR(Env::Default()->RecursivelyCreateDir(profile_run_dir));
+
   // Ignore computation_graph for now.
-  if (response.encoded_trace().empty()) {
-    *os << "No trace event is collected." << std::endl;
-  } else {
+  if (!response.encoded_trace().empty()) {
     LOG(INFO) << "Converting trace events to TraceViewer JSON.";
     TF_RETURN_IF_ERROR(
         DumpTraceToLogDirectory(profile_run_dir, response.encoded_trace(), os));
@@ -163,13 +162,10 @@ Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
     TF_RETURN_IF_ERROR(DumpOpProfileToLogDirectory(profile_run_dir,
                                                    response.op_profile(), os));
   }
-  if (!response.tool_data().empty()) {
-    for (const auto& tool_data : response.tool_data()) {
-      TF_RETURN_IF_ERROR(
-          DumpToolDataToLogDirectory(profile_run_dir, tool_data, os));
-    }
+  for (const auto& tool_data : response.tool_data()) {
+    TF_RETURN_IF_ERROR(
+        DumpToolDataToLogDirectory(profile_run_dir, tool_data, os));
   }
-  TF_RETURN_IF_ERROR(DumpGraphEvents(logdir, run, response, os));
 
   return Status::OK();
 }
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
index 65b92aa41867ed9e2e8b06c9e34dd99068bb459c..29ef977bacfd61e163be49558c5b94277ed479c1 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_
+#ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_
+#define TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_
 
 #include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -27,7 +27,10 @@ namespace tpu {
 // The following tools are supported:
 //   - Trace viewer
 //   - Op profile
-//   - HLO computation graph
+//   - Input pipeline analyzer
+//   - Overview page
+// Note: this function creates a directory even when all fields in
+// ProfileResponse are unset/empty.
 Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
                                   const ProfileResponse& response,
                                   std::ostream* os);
@@ -35,4 +38,4 @@ Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
 }  // namespace tpu
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_
+#endif  // TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
index 7970c20a2693cbbe91a136080240f676d29f2053..a730d6142d890cc41f72176cf617ac0b0434192c 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
@@ -17,6 +17,7 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+from absl import flags
 
 import os
 import subprocess
@@ -24,13 +25,36 @@ import sys
 
 import tensorflow as tf
 
-tf.flags.DEFINE_string('service_addr', '',
-                       'Address of TPU profiler service e.g. localhost:8466')
-tf.flags.DEFINE_string('logdir', '',
-                       'Path of TensorBoard log directory e.g. /tmp/tb_log')
-tf.flags.DEFINE_integer('duration_ms', 2000, 'Duration of tracing in ms.')
+# Cloud TPU Cluster Resolvers
+flags.DEFINE_string(
+    'gcp_project', None,
+    'Project name for the Cloud TPU-enabled project. If not specified, we '
+    'will attempt to automatically detect the GCE project from metadata.')
+flags.DEFINE_string(
+    'tpu_zone',
+    None,
+    help='GCE zone where the Cloud TPU is located in. If not specified, we '
+    'will attempt to automatically detect the GCE project from metadata.')
+flags.DEFINE_string('tpu_name', None,
+                    'Name of the Cloud TPU for Cluster Resolvers. You must '
+                    'specify either this flag or --master.')
 
-FLAGS = tf.flags.FLAGS
+# Tool specific parameters
+flags.DEFINE_string(
+    'service_addr', None, 'Address of TPU profiler service e.g. '
+    'localhost:8466, you must specify either this flag or --tpu_name.')
+flags.DEFINE_string('logdir', None,
+                    'Path of TensorBoard log directory e.g. /tmp/tb_log, '
+                    'gs://tb_bucket')
+flags.DEFINE_integer('duration_ms', 2000, 'Duration of tracing in ms.')
+flags.DEFINE_integer('num_tracing_attempts', 3,
+                     'Automatically retry N times when no trace '
+                     'event is collected.')
+flags.DEFINE_boolean('include_dataset_ops', True,
+                     'Set to false to profile longer TPU '
+                     'device traces.')
+
+FLAGS = flags.FLAGS
 EXECUTABLE = 'data/capture_tpu_profile'
 
 
@@ -39,13 +63,35 @@ def run_main():
 
 
 def main(unused_argv=None):
-  if not FLAGS.service_addr or not FLAGS.logdir:
-    sys.exit('service_addr and logdir must be provided.')
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  if FLAGS.service_addr is None and FLAGS.tpu_name is None:
+    sys.exit('You must specify either --service_addr or --tpu_name.')
+
+  if FLAGS.service_addr is not None:
+    if FLAGS.tpu_name is not None:
+      tf.logging.warn('Both --service_addr and --tpu_name are set. Ignoring '
+                      '--tpu_name and using --service_addr.')
+    service_addr = FLAGS.service_addr
+  else:
+    tpu_cluster_resolver = (
+        tf.contrib.cluster_resolver.TPUClusterResolver(
+            tpu_names=[FLAGS.tpu_name],
+            zone=FLAGS.tpu_zone,
+            project=FLAGS.gcp_project))
+    service_addr = tpu_cluster_resolver.get_master()
+  service_addr = service_addr.replace('grpc://', '').replace(':8470', ':8466')
+
+  if not FLAGS.logdir:
+    sys.exit('logdir must be provided.')
   executable_path = os.path.join(os.path.dirname(__file__), EXECUTABLE)
+  logdir = os.path.expandvars(os.path.expanduser(FLAGS.logdir))
   cmd = [executable_path]
-  cmd.append('--logdir='+FLAGS.logdir)
-  cmd.append('--service_addr='+FLAGS.service_addr)
-  cmd.append('--duration_ms='+str(FLAGS.duration_ms))
+  cmd.append('--logdir=' + logdir)
+  cmd.append('--service_addr=' + service_addr)
+  cmd.append('--duration_ms=' + str(FLAGS.duration_ms))
+  cmd.append('--num_tracing_attempts=' + str(FLAGS.num_tracing_attempts))
+  cmd.append('--include_dataset_ops=' + str(FLAGS.include_dataset_ops).lower())
   subprocess.call(cmd)
 
 
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/setup.py b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
index ee6950699e740139b75f3f061ca0ca455fe2a1af..76f1dd2a567b570be6d1e127d1382773bf94493d 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/setup.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
@@ -20,16 +20,12 @@ from __future__ import print_function
 
 from setuptools import setup
 
-_VERSION = '1.3.0-a1'
+_VERSION = '1.6.0-rc0'
 
 CONSOLE_SCRIPTS = [
     'capture_tpu_profile=cloud_tpu_profiler.main:run_main',
 ]
 
-REQUIRED_PACKAGES = [
-    'tensorflow >= 1.2.0',
-]
-
 setup(
     name='cloud_tpu_profiler',
     version=_VERSION.replace('-', ''),
@@ -45,33 +41,29 @@ setup(
     entry_points={
         'console_scripts': CONSOLE_SCRIPTS,
     },
-    install_requires=REQUIRED_PACKAGES,
     classifiers=[
         # How mature is this project? Common values are
         #   3 - Alpha
         #   4 - Beta
         #   5 - Production/Stable
-        'Development Status :: 3 - Alpha',
-        
+        'Development Status :: 4 - Beta',
         'Intended Audience :: Developers',
         'Intended Audience :: Education',
         'Intended Audience :: Science/Research',
-        
         'License :: OSI Approved :: Apache Software License',
-        
         'Programming Language :: Python :: 2',
         'Programming Language :: Python :: 2.7',
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 3.4',
         'Programming Language :: Python :: 3.5',
         'Programming Language :: Python :: 3.6',
-        
         'Topic :: Scientific/Engineering',
         'Topic :: Scientific/Engineering :: Mathematics',
         'Topic :: Scientific/Engineering :: Artificial Intelligence',
         'Topic :: Software Development',
-        'Topic :: Software Development :: Libraries',  
+        'Topic :: Software Development :: Libraries',
         'Topic :: Software Development :: Libraries :: Python Modules',
     ],
     license='Apache 2.0',
-    keywords='tensorflow performance tpu',)
+    keywords='tensorflow performance tpu',
+)
diff --git a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
index 6943ff5f471d7cb0c5302261ec9aa7273ef5ae35..2094294baad63ae73712c8648b588accd4551ef8 100644
--- a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
+++ b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
@@ -61,6 +61,11 @@ message OpMetricsResult {
 message OpMetricsDbResult {
   // A bunch of OpMetricsResults.
   repeated OpMetricsResult metrics_db = 1;
+  // The total host infeed-enqueue duration in picoseconds.
+  optional uint64 total_host_infeed_enq_duration_ps = 2;
+  // The total of the difference between the start times of two
+  // consecutive infeed-enqueues (per host) in picoseconds.
+  optional uint64 total_host_infeed_enq_start_timestamp_ps_diff = 3;
 }
 
 // Result proto for StepInfo.
@@ -114,6 +119,26 @@ message HloExtraInfoMapResult {
   map<string, HloExtraInfoResult> hlo_extrainfo_map = 1;
 }
 
+// Result proto for host-independent job information.
+message HostIndependentJobInfoResult {
+  // The change-list number of this build.
+  optional int64 change_list = 1;
+  // The time of this build.
+  optional int64 build_time = 2;
+  // The target of this build.
+  optional string build_target = 3;
+}
+
+// Result proto for host-dependent job information.
+message HostDependentJobInfoResult {
+  // This ID of the host where the job was run on.
+  optional string host_id = 1;
+  // The command line used to run the job.
+  optional string command_line = 2;
+  // The start time of the job on this host.
+  optional int64 start_time = 3;
+}
+
 // Result proto for RunEnvironment (the run environment of a profiling session).
 message RunEnvironmentResult {
   // Number of hosts used.
@@ -124,8 +149,10 @@ message RunEnvironmentResult {
   optional int32 tpu_core_count = 3;
   // The per-TPU-core batch size.
   optional int32 per_core_batch_size = 4;
-  // Job information including build target and command line.
-  optional string job_info = 5;
+  // Host-independent job information.
+  optional HostIndependentJobInfoResult host_independent_job_info = 5;
+  // Host-dependent job information.
+  repeated HostDependentJobInfoResult host_dependent_job_info = 6;
 }
 
 // Result proto for TfStatsHelper.
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
index 9c3fd45fd1ec9736b638b45907e585165d4d9057..f3f3302ceb3d27dbb21bdce753aeb2d7fcd77448 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
@@ -13,6 +13,14 @@ service TPUProfiler {
   }
 }
 
+message ProfileOptions {
+  // We don't collect the dataset ops by default for better trace-viewer
+  // scalability. The caller can mannually set this field to include the ops.
+  bool include_dataset_ops = 1;
+
+  // next-field: 2
+}
+
 message ProfileRequest {
   // In future, the caller will be able to customize when profiling starts and
   // stops. For now, it collects `duration_ms` milliseconds worth of data.
@@ -25,10 +33,13 @@ message ProfileRequest {
   // required profiling tools name such as "input_pipeline_analyzer" etc
   repeated string tools = 3;
 
+  // Optional profiling options that control how a TF session will be profiled.
+  ProfileOptions opts = 4;
+
   // In future, the caller will indicate which TF session is being profiled, and
   // only data relating to that program will be returned. For now, we assume
   // all activity during the profiling period is relevant.
-  // next-field: 4
+  // next-field: 5
 }
 
 message ProfileToolData {
@@ -40,7 +51,7 @@ message ProfileToolData {
 }
 
 message ProfileResponse {
-  uint64 xprof_response_size = 1;  // Placeholder: return something meaningful.
+  reserved 1;  // was uint64 placeholder for returning something meaningful.
   // Graphs of programs executed on TPUs during the profiling period.
   repeated GraphDef computation_graph = 2;
 
diff --git a/tensorflow/contrib/tpu/profiler/trace_events_to_json.h b/tensorflow/contrib/tpu/profiler/trace_events_to_json.h
index 992eae43d903db495850ced7a59e38120d3fed34..3bd76dd01c7d0f35bad9386c11811743e1709fca 100644
--- a/tensorflow/contrib/tpu/profiler/trace_events_to_json.h
+++ b/tensorflow/contrib/tpu/profiler/trace_events_to_json.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_TRACE_EVENTS_TO_JSON_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_TRACE_EVENTS_TO_JSON_H_
+#ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_TRACE_EVENTS_TO_JSON_H_
+#define TENSORFLOW_CONTRIB_TPU_PROFILER_TRACE_EVENTS_TO_JSON_H_
 
 #include "tensorflow/contrib/tpu/profiler/trace_events.pb.h"
 #include "tensorflow/core/platform/types.h"
@@ -29,4 +29,4 @@ string TraceEventsToJson(const Trace &trace);
 }  // namespace tpu
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_TPU_PROFILER_TRACE_EVENTS_TO_JSON_H_
+#endif  // TENSORFLOW_CONTRIB_TPU_PROFILER_TRACE_EVENTS_TO_JSON_H_
diff --git a/tensorflow/contrib/tpu/profiler/version.h b/tensorflow/contrib/tpu/profiler/version.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc6a934891138018d32d511750120453bdf290cf
--- /dev/null
+++ b/tensorflow/contrib/tpu/profiler/version.h
@@ -0,0 +1,21 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+=============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
+#define TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
+
+#define TPU_PROFILER_VERSION "1.5.0"
+
+#endif  // TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_
diff --git a/tensorflow/contrib/tpu/proto/BUILD b/tensorflow/contrib/tpu/proto/BUILD
index 79a79efb6b62d3e98127558e951ceefd276b580c..e1660985676e8c2efe3b01e32b48b211391885b7 100644
--- a/tensorflow/contrib/tpu/proto/BUILD
+++ b/tensorflow/contrib/tpu/proto/BUILD
@@ -15,6 +15,15 @@ filegroup(
     visibility = ["//tensorflow:__subpackages__"],
 )
 
+tf_proto_library(
+    name = "tpu_embedding_config_proto",
+    srcs = [
+        "tpu_embedding_config.proto",
+    ],
+    cc_api_version = 2,
+    visibility = ["//visibility:public"],
+)
+
 tf_proto_library(
     name = "topology_proto",
     srcs = [
diff --git a/tensorflow/contrib/tpu/proto/tpu_embedding_config.proto b/tensorflow/contrib/tpu/proto/tpu_embedding_config.proto
new file mode 100644
index 0000000000000000000000000000000000000000..b0ec968d3a401f1b80ed1bf6fd7a83a69c068fe2
--- /dev/null
+++ b/tensorflow/contrib/tpu/proto/tpu_embedding_config.proto
@@ -0,0 +1,76 @@
+syntax = "proto3";
+
+package tensorflow.tpu;
+
+// The TPUEmbeddingConfiguration contains specification of TPU Embedding lookups
+// and gradient updates separate from the TF Graph.
+message TPUEmbeddingConfiguration {
+  // model_mode specifies whether the model is to be run in training or
+  // inference. In inference mode, gradient updates to embedding tables are not
+  // performed.
+  enum ModelMode {
+    INVALID = 0;
+    TRAINING = 1;
+    INFERENCE = 2;
+  }
+
+  ModelMode model_mode = 1;
+
+  // num_hosts is the number of host CPU systems in the training/inference job.
+  // Each embedding table must be sharded into num_hosts separate Variables,
+  // placed separately on the num_hosts CPU devices in the cluster. Sharding
+  // will be performed equivalently to the 'div' sharding_strategy option of
+  // embedding_lookup() and embedding_lookup_sparse().
+  int32 num_hosts = 2;
+
+  // The total number of TensorNodes. This is equal to num_hosts times the
+  // number of TensorNodes attached to each host.
+  int32 num_tensornodes = 3;
+
+  // The number of training examples per TensorNode.
+  int32 batch_size = 4;
+
+  message GradientDescentOptimizer {
+    float learning_rate = 1;
+  }
+
+  message AdagradOptimizer {
+    float learning_rate = 1;
+    float initial_accumulator = 2;
+  }
+
+  // Each Embedding
+  message TPUEmbeddingTable {
+    // Name of the embedding table. This will be used to name Variables in the
+    // Tensorflow Graph.
+    string name = 1;
+
+    // Number of rows of the embedding table. The Variable created to hold the
+    // learned embedding table values will have shape (num_rows, width).
+    int32 num_rows = 3;
+
+    // Width of the embedding table. The Variable created to hold the
+    // learned embedding table values will have shape (num_rows, width).
+    int32 width = 4;
+
+    // Number of distinct embedding activation vectors per training example
+    // produced by lookups into this table during model evaluation. For each
+    // table, the Graph will receive an activations Tensor of shape
+    //   (batch_size * table.num_features, table.width).
+    // For example, num_features = 1 produces equivalent behavior to a single
+    // tf.nn.embedding_lookup() call. In the case of 'multivalent' embeddings,
+    // (i.e. tf.nn.embedding_lookup_sparse()) which compute weighted averages of
+    // embedding table rows, num_features is the number of vectors produced
+    // after averaging. In sequence models num_features is typically equal
+    // to the sequence length, since each sequence element must be represented
+    // separately to the convolutional or recurrent network.
+    int32 num_features = 5;
+
+    oneof optimizer {
+      GradientDescentOptimizer gradient_descent = 6;
+      AdagradOptimizer adagrad = 7;
+    }
+  }
+
+  repeated TPUEmbeddingTable table_config = 5;
+}
diff --git a/tensorflow/contrib/tpu/python/ops/tpu_ops.py b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
index 33e47f674d798f622fb08121dabb67d7f45af15b..97876216793e0e6b20b7c072cac4f575b8fd48be 100644
--- a/tensorflow/contrib/tpu/python/ops/tpu_ops.py
+++ b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 import platform
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 
 if platform.system() != "Windows":
@@ -40,6 +41,64 @@ if platform.system() != "Windows":
     del op  # Unused
     # The gradient of a cross replica sum is also a cross-replica sum.
     return gen_tpu_ops.cross_replica_sum(grad)
+
+  # This extra type checking exists to give a more helpful error message in
+  # the common case that uint8 and int64 values are infed. Remove when both
+  # types are supported.
+
+  _SUPPORTED_INFEED_DTYPES = set([
+      dtypes.bool, dtypes.int32, dtypes.bfloat16, dtypes.float32,
+      dtypes.complex64
+  ])
+
+  def infeed_dequeue(dtype, shape, name=None):
+    """A placeholder op for a value that will be fed into the computation.
+
+    Args:
+      dtype: A `tf.DType`. The type of elements in the tensor.
+      shape: A `tf.TensorShape` or list of `ints`. The shape of the tensor.
+      name: A name for the operation (optional).
+
+    Returns:
+      A `Tensor` of type `dtype`.
+      A tensor that will be provided using the infeed mechanism.
+
+    Raises:
+      TypeError: If 'dtype` is not a supported infeed type.
+    """
+    if dtype not in _SUPPORTED_INFEED_DTYPES:
+      raise TypeError(
+          "{} is not a supported TPU infeed type. Supported types are: "
+          "{}".format(dtype, list(_SUPPORTED_INFEED_DTYPES)))
+
+    return gen_tpu_ops.infeed_dequeue(dtype, shape, name=name)
+
+  # pylint: disable=redefined-outer-name
+  def infeed_dequeue_tuple(dtypes, shapes, name=None):
+    """A placeholder op for values fed into the TPU simultaneously as a tuple.
+
+    Args:
+      dtypes: A list of `tf.DType`s that has length `>= 1`.
+        The element types of each element in `outputs`.
+      shapes: A list of shapes (each a `tf.TensorShape` or list of `ints`).
+        The shapes of each tensor in `outputs`.
+      name: A name for the operation (optional).
+
+    Returns:
+      A list of `Tensor` objects of type `dtypes`.
+      A list of tensors that will be provided using the infeed mechanism.
+
+    Raises:
+      TypeError: If a type in 'dtypes` is not a supported infeed type.
+    """
+    for dtype in dtypes:
+      if dtype not in _SUPPORTED_INFEED_DTYPES:
+        raise TypeError(
+            "{} is not a supported TPU infeed type. Supported types are: "
+            "{}".format(dtype, list(_SUPPORTED_INFEED_DTYPES)))
+    return gen_tpu_ops.infeed_dequeue_tuple(dtypes, shapes, name=name)
+  # pylint: enable=redefined-outer-name
+
 else:
   # We have already built the appropriate libraries into the binary via CMake
   # if we have built contrib, so we don't need this
diff --git a/tensorflow/contrib/tpu/python/tpu/device_assignment.py b/tensorflow/contrib/tpu/python/tpu/device_assignment.py
index ee202610a8a8a1406363b3010771e7806d5d84bf..bdd9b88af55fa4fb483ddbdbe5c51d7076cce675 100644
--- a/tensorflow/contrib/tpu/python/tpu/device_assignment.py
+++ b/tensorflow/contrib/tpu/python/tpu/device_assignment.py
@@ -87,6 +87,8 @@ class DeviceAssignment(object):
                                            core_assignment.shape))
 
     self._core_assignment = core_assignment
+    self._task_and_cores_to_replicas = self._compute_task_and_cores_to_replicas(
+        self._core_assignment, self._topology_tasks)
 
   def _invert_topology(self, topology):
     """Inverts a [task,device,axis] topology to [x,y,z] -> task/device maps."""
@@ -100,6 +102,34 @@ class DeviceAssignment(object):
         devices[x, y, z] = device
     return tasks, devices
 
+  def _compute_task_and_cores_to_replicas(self, core_assignment,
+                                          topology_tasks):
+    """Computes a nested dict which maps task and logical core to replicas."""
+    task_and_cores_to_replicas = {}
+    for replica in xrange(core_assignment.shape[0]):
+      for dx in xrange(core_assignment.shape[1]):
+        for dy in xrange(core_assignment.shape[2]):
+          for dz in xrange(core_assignment.shape[3]):
+            x, y, z = core_assignment[replica, dx, dy, dz, :]
+            task_id = topology_tasks[x, y, z]
+            if task_id not in task_and_cores_to_replicas:
+              task_and_cores_to_replicas[task_id] = {}
+            logical_core = (dx, dy, dz)
+            if logical_core not in task_and_cores_to_replicas[task_id]:
+              task_and_cores_to_replicas[task_id][logical_core] = set()
+
+            task_and_cores_to_replicas[task_id][logical_core].add(replica)
+
+    task_to_sorted_replica_id = {}
+
+    for task, core_to_replicas in task_and_cores_to_replicas.items():
+      core_to_sorted_replicas = {}
+      for core, replicas in core_to_replicas.items():
+        core_to_sorted_replicas[core] = sorted(replicas)
+
+      task_to_sorted_replica_id[task] = core_to_sorted_replicas
+    return task_to_sorted_replica_id
+
   @property
   def topology(self):
     """A `Topology` that describes the TPU topology."""
@@ -119,6 +149,11 @@ class DeviceAssignment(object):
     """
     return self._computation_shape
 
+  @property
+  def num_cores_per_replica(self):
+    """The number of cores per replica."""
+    return np.prod(self.computation_shape)
+
   @property
   def num_replicas(self):
     """The number of replicas of the computation."""
@@ -148,6 +183,26 @@ class DeviceAssignment(object):
     logical_offset = tuple([replica] + logical_core.tolist() + [slice(3)])
     return tuple(self.core_assignment[logical_offset])
 
+  def lookup_replicas(self, task_id, logical_core):
+    """Lookup replica ids by task number and logical core.
+
+    Args:
+      task_id: TensorFlow task number.
+      logical_core: A tuple of three integers which represents a logical core.
+    Returns:
+      A sorted list of the replicas that are attached to that task and
+      loical_core.
+    Raises:
+      ValueError: If no replica exisis in the task which contains the logical
+      core.
+    """
+    try:
+      return self._task_and_cores_to_replicas[task_id][logical_core]
+    except KeyError:
+      raise ValueError(
+          "Can not find any replica in task: {} contains logical_core: {} ".
+          format(task_id, logical_core))
+
   def tpu_ordinal(self, replica=0, logical_core=None):
     """Returns the ordinal of the TPU device assigned to a logical core."""
     coordinates = self._coordinates(replica, logical_core)
diff --git a/tensorflow/contrib/tpu/python/tpu/test_util.py b/tensorflow/contrib/tpu/python/tpu/test_util.py
deleted file mode 100644
index a5d4ff972277cda0bd6f5b3ecdb4bef59a2f8d0e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tpu/python/tpu/test_util.py
+++ /dev/null
@@ -1,296 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ===================================================================
-"""Utilities to ease testing on TPU devices."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os.path
-import pickle
-import tempfile
-
-import numpy as np
-
-from tensorflow.contrib.tpu.python.tpu import tpu
-from tensorflow.contrib.tpu.python.tpu import tpu_config
-from tensorflow.contrib.tpu.python.tpu import tpu_estimator
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session as tf_session
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import random_seed
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import gen_array_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import saver as tf_saver
-
-
-def has_tpu():
-  """Check if a TPU device is available.
-
-  Device enumeration via `device_lib` currently fails for TPU systems.
-  (http://b/68333779).  To work around this, we determine the existence of a
-  TPU by a successful call to `initialize_system`.
-
-  Returns:
-    boolean, True if a TPU device is available, otherwise False.
-  """
-
-  def _check():
-    with tf_session.Session() as sess:
-      sess.run(tpu.initialize_system())
-      sess.run(tpu.shutdown_system())
-
-  try:
-    _check()
-    return True
-  except errors.OpError as _:
-    return False
-
-
-def _available_devices():
-  devices = ["cpu"]
-  if not test_util.gpu_device_name():
-    devices.append("gpu")
-
-  if has_tpu():
-    devices.append("tpu")
-
-  return tuple(devices)
-
-
-def copy_dir(src, tgt):
-  """Copy src to tgt."""
-  gfile.MakeDirs(tgt)
-  seen_dirs = set()
-  for dirname, _, files in gfile.Walk(src):
-    for f in files:
-      src_f = os.path.join(dirname, f)
-      tgt_f = src_f.replace(src, tgt)
-      tgt_d = os.path.dirname(tgt_f)
-      if tgt_d not in seen_dirs:
-        gfile.MkDir(tgt_d)
-        seen_dirs.add(tgt_d)
-      gfile.Copy(src_f, tgt_f, overwrite=True)
-
-
-def compare_model(model_fn,
-                  input_fn,
-                  params,
-                  master="local",
-                  temp_dir=None,
-                  num_shards=2,
-                  tolerance=1e-4):
-  """Compare the results of running `model_fn` on the TPU and CPU."""
-  if not temp_dir:
-    temp_dir = tempfile.mkdtemp()
-
-  cpu_model_dir = "%s/cpu-model" % temp_dir
-  tpu_model_dir = "%s/tpu-model" % temp_dir
-  initial_model_dir = "%s/initial-model" % temp_dir
-
-  logging.info("Checkpoints and weights will be written to %s", temp_dir)
-
-  num_steps = 1
-
-  def _model_adapter(features, labels, mode, params):
-    """Run users model function with random seeds fixed to known values."""
-    random_seed.set_random_seed(0)
-    np.random.seed(0)
-    return model_fn(features, labels, mode, params)
-
-  def _input_adapter(params):
-    random_seed.set_random_seed(0)
-    np.random.seed(0)
-    return input_fn(params)
-
-  def _make_run_config(model_dir):
-    return tpu_config.RunConfig(
-        master=master,
-        model_dir=model_dir,
-        save_checkpoints_secs=10000,
-        session_config=config_pb2.ConfigProto(
-            allow_soft_placement=True, log_device_placement=False),
-        tpu_config=tpu_config.TPUConfig(
-            iterations_per_loop=num_steps,
-            num_shards=num_shards,
-        ),
-    )
-
-  def _make_estimator(use_tpu, model_dir):
-    return tpu_estimator.TPUEstimator(
-        model_fn=_model_adapter,
-        use_tpu=use_tpu,
-        config=_make_run_config(model_dir),
-        train_batch_size=num_shards,
-        params=dict(params, use_tpu=use_tpu),
-    )
-
-  def _extract_weights(checkpoint):
-    """Extract model weights from the given checkpoint file."""
-    weights = {}
-    graph = ops.Graph()
-    with graph.as_default():
-      features, labels = _input_adapter(dict(params, batch_size=num_shards))
-      model_fn(
-          features, labels,
-          params=dict(params, use_tpu=False),
-          mode=model_fn_lib.ModeKeys.TRAIN)
-      saver = tf_saver.Saver()
-      with tf_session.Session(graph=graph) as sess:
-        saver.restore(sess, checkpoint)
-        all_vars = []
-        all_vars.extend(graph.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
-        all_vars.extend(graph.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
-        all_vars.extend(graph.get_collection(ops.GraphKeys.MODEL_VARIABLES))
-
-        for var in all_vars:
-          weights[var.name] = sess.run(var)
-    return weights
-
-  def _run_step(use_tpu, model_dir):
-    """Create an estimator and run a single step on the given device."""
-    tf_session.Session.reset(target=master)
-
-    logging.info("Running step.  TPU=%d.  model_dir=%s", use_tpu, model_dir)
-    est = _make_estimator(use_tpu=use_tpu, model_dir=model_dir)
-    est.train(input_fn=_input_adapter, steps=num_steps)
-    weights = _extract_weights(est.latest_checkpoint())
-    with gfile.Open(os.path.join(temp_dir, "tpu-%d.weights" % use_tpu),
-                    "wb") as f:
-      f.write(pickle.dumps(weights))
-    return weights
-
-  # initialize models to the same weights by running a single step on the CPU
-  _run_step(use_tpu=False, model_dir=initial_model_dir)
-
-  copy_dir(initial_model_dir, cpu_model_dir)
-  copy_dir(initial_model_dir, tpu_model_dir)
-
-  cpu_weights = _run_step(use_tpu=False, model_dir=cpu_model_dir)
-  tpu_weights = _run_step(use_tpu=True, model_dir=tpu_model_dir)
-
-  bad_weights = False
-  for k in cpu_weights:
-    if k not in tpu_weights:
-      raise KeyError("Missing weight %s from TPU checkpoint.", k)
-
-    if not np.allclose(
-        cpu_weights[k], tpu_weights[k], rtol=tolerance, atol=tolerance):
-      bad_weights = True
-      logging.error("Weights for layer %s have diverged.", k)
-
-  if bad_weights:
-    raise ValueError("Some weights have diverged.  Output pickle files have "
-                     "been written to %s for inspection." % temp_dir)
-
-
-class TPUTestCase(test_util.TensorFlowTestCase):
-  """Adds helpers for testing on TPU devices to `TensorFlowTestCase`.
-
-  Example usage:
-
-  ```
-  def model_fn(features):
-    return tf.reduce_sum(features * 2)
-
-  class ModelTests(test_util.TPUTestCase):
-    def test_sum(self):
-      v = np.random.randn(10, 10).astype("float32")
-      self.assert_device_output(model_fn, [v], (v*2).sum(),
-                                devices=("cpu", "tpu"))
-  ```
-  """
-
-  def __init__(self, methodName="runTest"):  # pylint: disable=invalid-name
-    super(TPUTestCase, self).__init__(methodName)
-    self._available_devices = _available_devices()
-
-  def run_on_device(self, model_fn, model_inputs, device):
-    """Runs `model_fn` on the given device.
-
-    Raises an exception if no such device is available.  `model_fn` should
-    return one or more tensors as a list or tuple.
-
-    Args:
-      model_fn: Function returning one or more tensors.
-      model_inputs: An iterable of Numpy arrays or scalars.
-                    These will be passed as arguments to `model_fn`.
-      device: Device to run on.  One of ("tpu", "gpu", "cpu").
-
-    Returns:
-      Output from the model function.
-    """
-
-    def _make_placeholders():
-      return dict([(gen_array_ops.placeholder_with_default(v, v.shape), v)
-                   for v in model_inputs])
-
-    if device == "tpu":
-      with self.test_session(graph=ops.Graph()) as sess:
-        placeholders = _make_placeholders()
-        tpu_computation = tpu.rewrite(model_fn, placeholders.keys())
-        sess.run(tpu.initialize_system())
-        sess.run(variables.global_variables_initializer())
-        result = sess.run(tpu_computation, placeholders)
-        sess.run(tpu.shutdown_system())
-        # TODO(b/36891278): supports non-flat returns lists in tpu.rewrite().
-        if len(result) == 1:
-          return result[0]
-        return result
-    elif device == "gpu":
-      with self.test_session(graph=ops.Graph(), use_gpu=True) as sess:
-        placeholders = _make_placeholders()
-        sess.run(variables.global_variables_initializer())
-        return sess.run(model_fn(placeholders.keys()), placeholders)
-    elif device == "cpu":
-      # TODO(power) -- will this interact poorly with cached GPU sessions?
-      with self.test_session(graph=ops.Graph(), use_gpu=False) as sess:
-        placeholders = _make_placeholders()
-        sess.run(variables.global_variables_initializer())
-        return sess.run(model_fn(placeholders.keys()), placeholders)
-
-  def _compare_values(self, actual_outputs, expected_outputs):
-    if isinstance(expected_outputs, (list, tuple)):
-      for a, b in zip(actual_outputs, expected_outputs):
-        self.assertAllCloseAccordingToType(a, b)
-    else:
-      self.assertAllCloseAccordingToType(actual_outputs, expected_outputs)
-
-  def assert_device_output(self,
-                           model_fn,
-                           model_inputs,
-                           expected_outputs,
-                           devices=("cpu", "gpu", "tpu")):
-    """Run `model_fn` on the given devices.
-
-    Results are compared via `assertAllCloseAccordingToType`.
-
-    Args:
-      model_fn: Function returning one or more tensors
-      model_inputs: Numpy arrays or scalars passed as arguments to model_fn
-      expected_outputs: Numpy arrays or scalars to compare against.
-      devices: Set of devices to run on.  If a device is not available, tests
-               will be skipped for that device.
-    """
-    devices = set(devices).intersection(self._available_devices)
-
-    for device in devices:
-      device_out = self.run_on_device(model_fn, model_inputs, device=device)
-      self._compare_values(device_out, expected_outputs)
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index 7fb8a33698fdd2b37f42464e934331de65904bfe..d5f54ff4fd278f0c84f79e0079bfb7a409dfba8d 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -30,6 +30,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import compat
 
 
 # Operations that indicate some error in the users graph, e.g. a placeholder
@@ -52,6 +53,10 @@ _NOT_IMPLEMENTED_OPS = set([
     "TensorSummaryV2",
     ])
 
+_MAX_WARNING_LINES = 5
+
+_TPU_REPLICATE_ATTR = "_tpu_replicate"
+
 
 def _tpu_system_device_name(job):
   """Returns the device name for the TPU_SYSTEM device of `job`."""
@@ -101,7 +106,7 @@ def core(num):
   return "device:TPU_REPLICATED_CORE:{}".format(num)
 
 
-class TPUReplicateContext(control_flow_ops.ControlFlowContext):
+class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
   """A `ControlFlowContext` for nodes inside a TPU computation.
 
   The primary role of `TPUReplicateContext` is to mark operators inside a
@@ -117,8 +122,19 @@ class TPUReplicateContext(control_flow_ops.ControlFlowContext):
   """
 
   def __init__(self, name):
-    control_flow_ops.ControlFlowContext.__init__(self)
+    super(TPUReplicateContext, self).__init__()
     self._name = name
+    self._unsupported_ops = []
+
+  def report_unsupported_operations(self):
+    if self._unsupported_ops:
+      op_str = "\n".join(["  %s (%s)" % (op.type, op.name)
+                          for op in self._unsupported_ops[:_MAX_WARNING_LINES]])
+      logging.warning("%d unsupported operations found: \n%s",
+                      len(self._unsupported_ops), op_str)
+      if len(self._unsupported_ops) > _MAX_WARNING_LINES:
+        logging.warning("... and %d more" %
+                        (len(self._unsupported_ops) - _MAX_WARNING_LINES))
 
   def AddOp(self, op):
     self._AddOpInternal(op)
@@ -126,21 +142,22 @@ class TPUReplicateContext(control_flow_ops.ControlFlowContext):
   def _AddOpInternal(self, op):
     # pylint: disable=protected-access
     if op.type in _BLACKLISTED_OPS:
-      raise ValueError("Operation of type %s (%s) is not supported on the TPU" %
-                       (op.type, op.name))
+      logging.error("Operation of type %s (%s) is not supported on the TPU. "
+                    "Execution will fail if this op is used in the graph. " %
+                    (op.type, op.name))
 
     if op.type in _NOT_IMPLEMENTED_OPS:
-      logging.warning(
-          "Operation %s (%s) is not currently supported", op.type, op.name)
+      self._unsupported_ops.append(op)
 
     if any(x.dtype._is_ref_dtype for x in op.inputs):
       raise NotImplementedError(
           "Non-resource Variables are not supported inside TPU computations "
           "(operator name: %s)" % op.name)
-    # pylint: enable=protected-access
-    if "_tpu_replicate" in op.node_def.attr:
+    if _TPU_REPLICATE_ATTR in op.node_def.attr:
       raise ValueError("TPU computations cannot be nested")
-    op.node_def.attr["_tpu_replicate"].s = self._name
+    op._set_attr(_TPU_REPLICATE_ATTR,
+                 attr_value_pb2.AttrValue(s=compat.as_bytes(self._name)))
+    # pylint: enable=protected-access
     op.graph.prevent_feeding(op)
     op.graph.prevent_fetching(op)
 
@@ -344,6 +361,7 @@ def replicate(computation,
           new_output_tensors.append(array_ops.identity(t))
       output_tensors = new_output_tensors
     finally:
+      context.report_unsupported_operations()
       context.Exit()
 
     # Fan-out: Builds a TPUReplicatedOutput node for each output.
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
index 916b9b3082fc197694933bdd6042706891be115c..644070218214643923b9ca3ee138615ec568e8b5 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
@@ -20,29 +20,57 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import json
+import os
+
+import numpy as np
 
 from tensorflow.contrib.tpu.python.tpu import util as util_lib
 from tensorflow.python.estimator import run_config as run_config_lib
+from tensorflow.python.platform import tf_logging as logging
+
+# pylint: disable=protected-access
+_TF_CONFIG_ENV = run_config_lib._TF_CONFIG_ENV
+_SERVICE_KEY = run_config_lib._SERVICE_KEY
+_TPU_WORKER_JOB_NAME = 'tpu_worker_job_name'
+_NUM_CORES_PER_HOST = 8
+
+# pylint: enable=protected-access
 
 
+# TODO(b/72511246) Provide a simplified api to configure model parallelism.
 class TPUConfig(
     collections.namedtuple('TPUConfig', [
         'iterations_per_loop',
         'num_shards',
+        'computation_shape',
         'per_host_input_for_training',
         'tpu_job_name',
+        'initial_infeed_sleep_secs',
     ])):
-  """TPU related configuration required by `TPUEstimator`.
+  r"""TPU related configuration required by `TPUEstimator`.
 
   Args:
-    iterations_per_loop: This is the number of train steps runnining in TPU
+    iterations_per_loop: This is the number of train steps running in TPU
       system before returning to CPU host for each `Session.run`. This means
       global step is increased `iterations_per_loop` times in one `Session.run`.
       It is recommended to be set as number of global steps for next checkpoint.
-    num_shards: The number of TPU shards in the system.
+    num_shards: (Deprecated, ignored by TPUEstimator).
+      The number of model replicas in the system. For non-model-parallelism
+      case, this number equals the total number of TPU cores. For
+      model-parallelism, the total number of TPU cores equals
+      product(computation_shape) * num_shards.
+    computation_shape: Defaults to `None`, which disables model parallelism. A
+      list of size 3 which describes the shape of a model replica's block of
+      cores. This is required by model-parallelism which enables partitioning
+      the model to multiple cores. For example, [2, 2, 1] means the model is
+      partitioned across 4 cores which span two cores in both x and y
+      coordinates.  Please refer to ${tf.contrib.tpu.TopologyProto} for the
+      geometry of a TPU mesh.
     per_host_input_for_training: If `True`, `input_fn` is invoked Per-Host
       rather than Per-Core. With Per-Host input pipeline deployment, `input_fn`
-      is invoked once on each host. To be precise, with a global batch size
+      is invoked once on each host. With Per-Core input pipeline deployment, it
+      is invoked once for each core. To be precise, with a global batch size
       `train_batch_size` in `TPUEstimator` constructor, the batch size for each
       shard is `train_batch_size` // #hosts. With Per-Core input pipeline
       deployment, the shard batch size is `train_batch_size` // #cores.
@@ -50,32 +78,68 @@ class TPUConfig(
       within TPUEstimator, however when using ClusterSpec propagation in more
       esoteric cluster configurations, you may need to specify the job name as a
       string.
+    initial_infeed_sleep_secs: The number of seconds the infeed thread should
+      wait before enqueueing the first batch. This helps avoid timeouts for
+      models that require a long compilation time.
+
+    Raises:
+      ValueError: If `computation_shape` or `computation_shape` are invalid.
   """
 
   def __new__(cls,
               iterations_per_loop=2,
-              num_shards=2,
+              num_shards=None,
+              computation_shape=None,
               per_host_input_for_training=True,
-              tpu_job_name=None):
+              tpu_job_name=None,
+              initial_infeed_sleep_secs=None):
 
     # Check iterations_per_loop.
     util_lib.check_positive_integer(iterations_per_loop,
                                     'TPUConfig iterations_per_loop')
 
     # Check num_shards.
-    util_lib.check_positive_integer(num_shards, 'TPUConfig num_shards')
+    if num_shards is not None:
+      util_lib.check_positive_integer(num_shards, 'TPUConfig num_shards')
+
+    # Check computation_shape
+    if computation_shape is not None and len(computation_shape) != 3:
+      raise ValueError(
+          'computation_shape must be a list with length 3 or None; got {}'.
+          format(str(computation_shape)))
+
+    if computation_shape is not None:
+      computation_shape_array = np.asarray(computation_shape, dtype=np.int32)
+      # This prevents any computation being replicated across multiple hosts, so
+      # that each host feeds the same number of computations.
+      if any(computation_shape_array < 1) or any(computation_shape_array > 2):
+        raise ValueError('computation_shape elements can only be 1 or 2; got '
+                         'computation_shape={}'.format(computation_shape))
+
+    # Check initial_infeed_sleep_secs.
+    if initial_infeed_sleep_secs:
+      util_lib.check_positive_integer(initial_infeed_sleep_secs,
+                                      'TPUConfig initial_infeed_sleep_secs')
+
+    tpu_job_name = tpu_job_name or _get_tpu_job_name_from_tf_config()
+
     return super(TPUConfig, cls).__new__(
         cls,
         iterations_per_loop=iterations_per_loop,
         num_shards=num_shards,
+        computation_shape=computation_shape,
         per_host_input_for_training=per_host_input_for_training,
-        tpu_job_name=tpu_job_name)
+        tpu_job_name=tpu_job_name,
+        initial_infeed_sleep_secs=initial_infeed_sleep_secs)
 
 
 class RunConfig(run_config_lib.RunConfig):
   """RunConfig with TPU support."""
 
-  def __init__(self, tpu_config=None, evaluation_master=None, master='',
+  def __init__(self,
+               tpu_config=None,
+               evaluation_master=None,
+               master=None,
                **kwargs):
     """Constructs a RunConfig.
 
@@ -84,16 +148,27 @@ class RunConfig(run_config_lib.RunConfig):
       evaluation_master: a string. The address of the master to use for eval.
         Defaults to master if not set.
       master: a string. The address of the master to use for training.
-      tf_random_seed: an int. Sets the TensorFlow random seed. Defaults to None,
-        which initializes it randomly based on the environment.
+      **kwargs: keyword config parameters.
     """
     super(RunConfig, self).__init__(**kwargs)
     self._tpu_config = tpu_config or TPUConfig()
-    if evaluation_master is None:
-      self._evaluation_master = master
-    else:
+
+    # If user sets master and/or evaluation_master explicilty, including empty
+    # string '', take it. Otherwise, take the values set by parent class.
+    if master is not None:
+      self._master = master
+
+    if evaluation_master is not None:
       self._evaluation_master = evaluation_master
-    self._master = master
+    elif (not self._evaluation_master and
+          self.task_type != run_config_lib.TaskType.EVALUATOR):
+      # If the task type is EVALUATOR, it means some cluster manager sets the
+      # TF_CONFIG. In that case, we respect the configuration in TF_CONFIG.
+      #
+      # Otherwise, it means user executes the code without external cluster
+      # manager. For that, we optimize the user experience by setting
+      # evaluation_master to master, unless user overwrites it.
+      self._evaluation_master = self._master
 
   @property
   def evaluation_master(self):
@@ -115,3 +190,14 @@ class RunConfig(run_config_lib.RunConfig):
     new_instance = super(RunConfig, self).replace(**kwargs)
     new_instance._tpu_config = tpu_config  # pylint: disable=protected-access
     return new_instance
+
+
+def _get_tpu_job_name_from_tf_config():
+  """Extracts the TPU job name from TF_CONFIG env variable."""
+  # TODO(xiejw): Extends this to support both TF_CONFIG env variable and cluster
+  # spec propagation.
+  tf_config = json.loads(os.environ.get(_TF_CONFIG_ENV, '{}'))
+  tpu_job_name = tf_config.get(_SERVICE_KEY, {}).get(_TPU_WORKER_JOB_NAME)
+  if tpu_job_name:
+    logging.info('Load TPU job name from TF_CONFIG: %s', tpu_job_name)
+  return tpu_job_name
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_config_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..37ef3dbe1e66efe18b13ab9153ee346c08b9774a
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config_test.py
@@ -0,0 +1,144 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TPU RunConfig tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+
+from tensorflow.contrib.tpu.python.tpu import tpu_config as tpu_config_lib
+from tensorflow.python.estimator import run_config as run_config_lib
+from tensorflow.python.platform import test
+
+
+def _set_tf_config_env_variable(tf_config):
+  return test.mock.patch.dict('os.environ', {
+      'TF_CONFIG': json.dumps(tf_config)
+  })
+
+
+class TPURunConfigTest(test.TestCase):
+
+  def test_fail_with_invalid_num_shards(self):
+    with self.assertRaisesRegexp(ValueError, 'must be positive'):
+      tpu_config_lib.RunConfig(
+          tpu_config=tpu_config_lib.TPUConfig(num_shards=0))
+
+  def test_fail_with_iterations_per_loop(self):
+    with self.assertRaisesRegexp(ValueError, 'must be positive'):
+      tpu_config_lib.RunConfig(
+          tpu_config=tpu_config_lib.TPUConfig(iterations_per_loop=0))
+
+  def test_fail_with_invalid_computation_shape(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'computation_shape must be a list with length'
+                                 ' 3 or None'):
+      tpu_config_lib.TPUConfig(computation_shape=[2, 1])
+
+    with self.assertRaisesRegexp(ValueError,
+                                 'computation_shape elements can only be'):
+      tpu_config_lib.TPUConfig(computation_shape=[1, 3, 1])
+
+
+class TPURunConfigMasterTest(test.TestCase):
+
+  def test_default_values(self):
+    run_config = tpu_config_lib.RunConfig()
+    self.assertEqual('', run_config.master)
+    self.assertEqual('', run_config.evaluation_master)
+
+  def test_user_provided_master_and_evaluation_master(self):
+    run_config = tpu_config_lib.RunConfig(
+        master='_master_123', evaluation_master='_eval_master_123')
+    self.assertEqual('_master_123', run_config.master)
+    self.assertEqual('_eval_master_123', run_config.evaluation_master)
+
+  def test_evaluation_master_defaults_to_master(self):
+    run_config = tpu_config_lib.RunConfig(master='_master_123')
+    self.assertEqual('_master_123', run_config.master)
+    self.assertEqual('_master_123', run_config.evaluation_master)
+
+  def test_tf_config(self):
+    tf_config = {
+        'session_master': '_master_123',
+        'eval_session_master': '_eval_master_123'
+    }
+    with _set_tf_config_env_variable(tf_config):
+      run_config = tpu_config_lib.RunConfig()
+      self.assertEqual('_master_123', run_config.master)
+      self.assertEqual('_eval_master_123', run_config.evaluation_master)
+
+  def test_evaluation_master_defaults_to_master_in_tf_config(self):
+    tf_config = {
+        'session_master': '_master_123',
+    }
+    with _set_tf_config_env_variable(tf_config):
+      run_config = tpu_config_lib.RunConfig()
+      self.assertEqual('_master_123', run_config.master)
+      self.assertEqual('_master_123', run_config.evaluation_master)
+
+  def test_respect_evaluation_master_in_tf_config(self):
+    tf_config = {
+        'cluster': {
+            run_config_lib.TaskType.CHIEF: ['host0:0'],
+        },
+        'task': {
+            'type': run_config_lib.TaskType.EVALUATOR,
+            'index': 0
+        },
+    }
+    with _set_tf_config_env_variable(tf_config):
+      run_config = tpu_config_lib.RunConfig(master='_something')
+      self.assertEqual('', run_config.evaluation_master)
+
+  def test_user_overwrites_tf_config(self):
+    tf_config = {
+        'session_master': '_master_123',
+        'eval_session_master': '_eval_master_123'
+    }
+    with _set_tf_config_env_variable(tf_config):
+      run_config = tpu_config_lib.RunConfig(
+          master='_new_master_123', evaluation_master='_new_eval_master_123')
+      self.assertEqual('_new_master_123', run_config.master)
+      self.assertEqual('_new_eval_master_123', run_config.evaluation_master)
+
+  def test_user_overwrites_master_in_tf_config(self):
+    tf_config = {
+        'session_master': '_master_123',
+        'eval_session_master': '_eval_master_123'
+    }
+    with _set_tf_config_env_variable(tf_config):
+      run_config = tpu_config_lib.RunConfig(master='_new_master_123')
+      self.assertEqual('_new_master_123', run_config.master)
+      self.assertEqual('_eval_master_123', run_config.evaluation_master)
+
+
+class TPUJobNameTest(test.TestCase):
+
+  def test_default_name(self):
+    config = tpu_config_lib.RunConfig()
+    self.assertIsNone(config.tpu_config.tpu_job_name)
+
+  def test_with_tf_config(self):
+    tf_config = {'service': {'tpu_worker_job_name': '_my_new_name',}}
+    with _set_tf_config_env_variable(tf_config):
+      config = tpu_config_lib.RunConfig()
+      self.assertEqual('_my_new_name', config.tpu_config.tpu_job_name)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..344ff9a37fc79ab1360fae8d3d2f9ec73e24f2b3
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
@@ -0,0 +1,517 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""TPU system metdata and associated tooling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from contextlib import contextmanager
+import copy
+
+import numpy as np
+
+from tensorflow.contrib.tpu.python.tpu import device_assignment  as tpu_device_assignment
+from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.platform import tf_logging as logging
+
+
+_DEFAULT_JOB_NAME = 'tpu_worker'
+_DEFAULT_COORDINATOR_JOB_NAME = 'coordinator'
+_LOCAL_MASTERS = ('', 'local')
+
+
+class _TPUContext(object):
+  """A context holds immutable states of TPU computation.
+
+  This immutable object holds TPUEstimator config, train/eval batch size, and
+  `TPUEstimator.use_tpu`, which is expected to be passed around. It also
+  provides utility functions, basded on the current state, to determine other
+  information commonly required by TPU computation, such as TPU device names,
+  TPU hosts, shard batch size, etc.
+
+  N.B. As `mode` is not immutable state in Estimator, but essential to
+  distinguish between TPU training and evaluation, a common usage for
+  _TPUContext with `mode` is as follows:
+  ```
+  with _ctx.with_mode(mode) as ctx:
+    if ctx.is_running_on_cpu():
+       ...
+  ```
+  """
+
+  def __init__(self, config, train_batch_size, eval_batch_size,
+               predict_batch_size, use_tpu):
+    self._config = config
+    self._train_batch_size = train_batch_size
+    self._eval_batch_size = eval_batch_size
+    self._predict_batch_size = predict_batch_size
+    self._use_tpu = use_tpu
+    self._model_parallelism_enabled = (
+        use_tpu and config.tpu_config.computation_shape)
+    self._mode = None
+
+    self._lazy_tpu_system_metadata_dict = {}  # key by master address
+    self._lazy_device_assignment_dict = {}  # key by master address
+    self._lazy_validation_dict = {}  # key by ModeKeys
+
+  def _assert_mode(self):
+    if self._mode is None:
+      raise RuntimeError(
+          '`mode` needs to be set via contextmanager `with_mode`.')
+    return self._mode
+
+  @contextmanager
+  def with_mode(self, mode):
+    # NOTE(xiejw): Shallow copy is enough. It will share he lazy dictionaries,
+    # such as _lazy_tpu_system_metadata_dict between new copy and the original
+    # one. Note that all lazy states stored in properties _lazy_foo are sort of
+    # immutable as they should be same for the process lifetime.
+    new_ctx = copy.copy(self)
+    new_ctx._mode = mode  # pylint: disable=protected-access
+    yield new_ctx
+
+  @property
+  def mode(self):
+    return self._assert_mode()
+
+  def _get_master_address(self):
+    mode = self._assert_mode()
+    config = self._config
+    master = (
+        config.master
+        if mode != model_fn_lib.ModeKeys.EVAL else config.evaluation_master)
+    return master
+
+  def _get_tpu_system_metadata(self):
+    """Gets the (maybe cached) TPU system metadata."""
+    master = self._get_master_address()
+    tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
+    if tpu_system_metadata is not None:
+      return tpu_system_metadata
+
+    # pylint: disable=protected-access
+    tpu_system_metadata = (
+        tpu_system_metadata_lib._query_tpu_system_metadata(
+            master,
+            run_config=self._config,
+            query_topology=self.model_parallelism_enabled))
+
+    self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
+    return tpu_system_metadata
+
+  def _get_device_assignment(self):
+    """Gets the (maybe cached) TPU device assignment."""
+    master = self._get_master_address()
+    device_assignment = self._lazy_device_assignment_dict.get(master)
+    if device_assignment is not None:
+      return device_assignment
+
+    tpu_system_metadata = self._get_tpu_system_metadata()
+
+    device_assignment = tpu_device_assignment.device_assignment(
+        tpu_system_metadata.topology,
+        computation_shape=self._config.tpu_config.computation_shape,
+        num_replicas=self.num_replicas)
+
+    logging.info('computation_shape: %s',
+                 str(self._config.tpu_config.computation_shape))
+    logging.info('num_replicas: %d', self.num_replicas)
+    logging.info('device_assignment.topology.device_coordinates: %s',
+                 str(device_assignment.topology.device_coordinates))
+    logging.info('device_assignment.core_assignment: %s',
+                 str(device_assignment.core_assignment))
+
+    self._lazy_device_assignment_dict[master] = device_assignment
+    return device_assignment
+
+  @property
+  def model_parallelism_enabled(self):
+    return self._model_parallelism_enabled
+
+  @property
+  def device_assignment(self):
+    return (self._get_device_assignment()
+            if self._model_parallelism_enabled else None)
+
+  @property
+  def num_of_cores_per_host(self):
+    metadata = self._get_tpu_system_metadata()
+    return metadata.num_of_cores_per_host
+
+  @property
+  def num_cores(self):
+    metadata = self._get_tpu_system_metadata()
+    return metadata.num_cores
+
+  @property
+  def num_of_replicas_per_host(self):
+    if self.model_parallelism_enabled:
+      return self.num_replicas // self.num_hosts
+    else:
+      return self.num_of_cores_per_host
+
+  @property
+  def num_replicas(self):
+    num_cores_in_system = self.num_cores
+
+    if self.model_parallelism_enabled:
+      computation_shape_array = np.asarray(
+          self._config.tpu_config.computation_shape, dtype=np.int32)
+      num_cores_per_replica = np.prod(computation_shape_array)
+      if num_cores_per_replica > num_cores_in_system:
+        raise ValueError(
+            'The num of cores required by the model parallelism, specified by '
+            'TPUConfig.computation_shape, is larger than the total num of '
+            'TPU cores in the system. computation_shape: {}, num cores '
+            'in the system: {}'.format(
+                self._config.tpu_config.computation_shape,
+                num_cores_in_system))
+
+      if num_cores_in_system % num_cores_per_replica != 0:
+        raise RuntimeError(
+            'The num of cores in the system ({}) is not divisible by the num '
+            'of cores ({}) required by the model parallelism, specified by '
+            'TPUConfig.computation_shape. This should never happen!'.format(
+                num_cores_in_system, num_cores_per_replica))
+
+      return num_cores_in_system // num_cores_per_replica
+    else:
+      return num_cores_in_system
+
+  @property
+  def num_hosts(self):
+    metadata = self._get_tpu_system_metadata()
+    return metadata.num_hosts
+
+  @property
+  def config(self):
+    return self._config
+
+  def is_input_sharded_per_core(self):
+    """Return true if input_fn is invoked per-core (other than per-host)."""
+    mode = self._assert_mode()
+    return (mode == model_fn_lib.ModeKeys.TRAIN and
+            not self._config.tpu_config.per_host_input_for_training)
+
+  def is_running_on_cpu(self, is_export_mode=False):
+    """Determines whether the input_fn and model_fn should be invoked on CPU.
+
+    This API also validates user provided configuration, such as batch size,
+    according the lazy initialized TPU system metadata.
+
+    Args:
+      is_export_mode: Indicates whether the current mode is for exporting the
+        model, when mode == PREDICT. Only with this bool, we could
+        tell whether user is calling the Estimator.predict or
+        Estimator.export_savedmodel, which are running on TPU and CPU
+        respectively. Parent class Estimator does not distingush these two.
+
+    Returns:
+      bool, whether current input_fn or model_fn should be running on CPU.
+
+    Raises:
+      ValueError: any configuration is invalid.
+    """
+
+    is_running_on_cpu = self._is_running_on_cpu(is_export_mode)
+    if not is_running_on_cpu:
+      self._validate_tpu_configuration()
+    return is_running_on_cpu
+
+  def _is_running_on_cpu(self, is_export_mode):
+    """Determines whether the input_fn and model_fn should be invoked on CPU."""
+    mode = self._assert_mode()
+
+    if not self._use_tpu:
+      return True
+
+    if mode != model_fn_lib.ModeKeys.PREDICT:
+      return False
+
+    # There are actually 2 use cases when running with mode.PREDICT: prediction
+    # and saving the model.  We run actual predictions on the TPU, but
+    # model export is run on the CPU.
+    if is_export_mode:
+      return True
+
+    return False
+
+  @property
+  def global_batch_size(self):
+    mode = self._assert_mode()
+    if mode == model_fn_lib.ModeKeys.TRAIN:
+      return self._train_batch_size
+    elif mode == model_fn_lib.ModeKeys.EVAL:
+      return self._eval_batch_size
+    elif mode == model_fn_lib.ModeKeys.PREDICT:
+      return self._predict_batch_size
+    else:
+      return None
+
+  @property
+  def batch_size_for_input_fn(self):
+    """Returns the shard batch size for `input_fn`."""
+    global_batch_size = self.global_batch_size
+
+    if self.is_running_on_cpu():
+      return global_batch_size
+
+    # On TPU
+    if self.is_input_sharded_per_core():
+      # We prohibit per core input sharding for the model parallelism case,
+      # therefore it is safe to use num_cores here.
+      return global_batch_size // self.num_cores
+    else:
+      return global_batch_size // self.num_hosts
+
+  @property
+  def batch_size_for_model_fn(self):
+    """Returns the shard batch size for `model_fn`."""
+    global_batch_size = self.global_batch_size
+
+    if self.is_running_on_cpu():
+      return global_batch_size
+
+    # On TPU. always sharded per shard.
+    return global_batch_size // self.num_replicas
+
+  @property
+  def master_job(self):
+    """Returns the job name to use to place TPU computations on.
+
+    Returns:
+      A string containing the job name, or None if no job should be specified.
+
+    Raises:
+      ValueError: If the user needs to specify a tpu_job_name, because we are
+        unable to infer the job name automatically, or if the user-specified job
+        names are inappropriate.
+    """
+    run_config = self._config
+    # If the user specifies the tpu_job_name, use that.
+    if run_config.tpu_config.tpu_job_name:
+      return run_config.tpu_config.tpu_job_name
+
+    # The tpu job is determined by the run_config. Right now, this method is
+    # required as tpu_config is not part of the RunConfig.
+    mode = self._assert_mode()
+    master = (
+        run_config.evaluation_master
+        if mode == model_fn_lib.ModeKeys.EVAL else run_config.master)
+    if master in _LOCAL_MASTERS:
+      return None
+
+    if (not run_config.session_config or
+        not run_config.session_config.cluster_def.job):
+      return _DEFAULT_JOB_NAME
+    cluster_def = run_config.session_config.cluster_def
+    job_names = set([job.name for job in cluster_def.job])
+    if _DEFAULT_JOB_NAME in job_names:
+      # b/37868888 tracks allowing ClusterSpec propagation to reuse job names.
+      raise ValueError('Currently, tpu_worker is not an allowed job name.')
+    if len(job_names) == 1:
+      return cluster_def.job[0].name
+    if len(job_names) == 2:
+      if _DEFAULT_COORDINATOR_JOB_NAME in job_names:
+        job_names.remove(_DEFAULT_COORDINATOR_JOB_NAME)
+        return job_names.pop()
+      # TODO(b/67716447): Include more sophisticated heuristics.
+    raise ValueError(
+        'Could not infer TPU job name. Please specify a tpu_job_name as part '
+        'of your TPUConfig.')
+
+  @property
+  def tpu_host_placement_function(self):
+    """Returns the TPU host place function."""
+    master = self.master_job
+
+    def _placement_function(_sentinal=None, core_id=None, host_id=None):  # pylint: disable=invalid-name
+      assert _sentinal is None
+      if core_id is not None and host_id is not None:
+        raise RuntimeError(
+            'core_id and host_id can have only one non-None value.')
+
+      if master is None:
+        return '/replica:0/task:0/device:CPU:0'
+      else:
+        if core_id is not None:
+          host_id = core_id / self.num_of_cores_per_host
+        return '/job:%s/task:%d/device:CPU:0' % (master, host_id)
+
+    return _placement_function
+
+  @property
+  def tpu_device_placement_function(self):
+    """Returns a TPU device placement Fn."""
+    master = self.master_job
+    job_device = '' if master is None else ('/job:%s' % master)
+
+    def _placement_function(i):
+      if self.model_parallelism_enabled:
+        return self.device_assignment.tpu_device(replica=i, job=master)
+      else:
+        num_of_cores_per_host = self.num_of_cores_per_host
+        host_id = i / num_of_cores_per_host
+        ordinal_id = i % num_of_cores_per_host
+        return '%s/task:%d/device:TPU:%d' % (job_device, host_id, ordinal_id)
+
+    return _placement_function
+
+  @property
+  def tpu_ordinal_function(self):
+    """Returns the TPU ordinal fn."""
+
+    def _tpu_ordinal_function(index):
+      """Return the TPU ordinal associated with a shard.
+
+      Required because the enqueue ops are placed on CPU.
+
+      Args:
+        index: the shard index
+
+      Returns:
+        The ordinal of the TPU device the shard's infeed should be placed on.
+      """
+      if self.model_parallelism_enabled:
+        return self.device_assignment.tpu_ordinal(replica=index)
+      else:
+        return index % self.num_of_cores_per_host
+
+    return _tpu_ordinal_function
+
+  def _validate_tpu_configuration(self):
+    """Validates the configuration based on the TPU system metadata."""
+    mode = self._assert_mode()
+    if self._lazy_validation_dict.get(mode):
+      return
+
+    # All following information is obtained from TPU system metadata.
+    num_cores = self.num_cores
+    num_replicas = self.num_replicas
+    num_hosts = self.num_hosts
+
+    if not num_cores:
+      tpu_system_metadata = self._get_tpu_system_metadata()
+      raise RuntimeError(
+          'Cannot find any TPU cores in the system. Please double check '
+          'Tensorflow master address and TPU worker(s). Available devices '
+          'are {}.'.format(tpu_system_metadata.devices))
+
+    if self._config.tpu_config.num_shards:
+      user_provided_num_replicas = self._config.tpu_config.num_shards
+      if user_provided_num_replicas != num_replicas:
+        message = (
+            'TPUConfig.num_shards is not set correctly. According to TPU '
+            'system metadata for Tensorflow master ({}): num_replicas should '
+            'be ({}), got ({}). For non-model-parallelism, num_replicas should '
+            'be the total num of TPU cores in the system. For '
+            'model-parallelism, the total number of TPU cores should be '
+            'product(computation_shape) * num_replicas. Please set it '
+            'accordingly or leave it as `None`'.format(
+                self._get_master_address(), num_replicas,
+                user_provided_num_replicas))
+
+        if self.model_parallelism_enabled:
+          raise ValueError(message)
+        else:
+          logging.warning(message)
+          logging.warning(
+              'For non-model-parallelism, TPUEstimator currently '
+              'automatically queries the TPU system information so ignores '
+              'this field.')
+
+    if mode == model_fn_lib.ModeKeys.TRAIN:
+      if self._train_batch_size % num_replicas != 0:
+        raise ValueError(
+            'train batch size {} must be divisible by number of replicas {}'
+            .format(self._train_batch_size, num_replicas))
+
+    elif mode == model_fn_lib.ModeKeys.EVAL:
+      if self._eval_batch_size is None:
+        raise ValueError(
+            'eval_batch_size in TPUEstimator constructor cannot be `None`'
+            'if .evaluate is running on TPU.')
+      if self._eval_batch_size % num_replicas != 0:
+        raise ValueError(
+            'eval batch size {} must be divisible by number of replicas {}'
+            .format(self._eval_batch_size, num_replicas))
+      if num_hosts > 1:
+        raise ValueError(
+            'TPUEstimator.evaluate should be running on single TPU worker. '
+            'got {}.'.format(num_hosts))
+    else:
+      assert mode == model_fn_lib.ModeKeys.PREDICT
+      if self._predict_batch_size is None:
+        raise ValueError(
+            'predict_batch_size in TPUEstimator constructor should not be '
+            '`None` if .predict is running on TPU.')
+      if self._predict_batch_size % num_replicas != 0:
+        raise ValueError(
+            'predict batch size {} must be divisible by number of replicas {}'
+            .format(self._predict_batch_size, num_replicas))
+      if num_hosts > 1:
+        raise ValueError(
+            'TPUEstimator.predict should be running on single TPU worker. '
+            'got {}.'.format(num_hosts))
+
+    # Record the state "validated" into lazy dictionary.
+    self._lazy_validation_dict[mode] = True
+
+
+class _OneCoreTPUContext(_TPUContext):
+  """Special _TPUContext for one core usage."""
+
+  def __init__(self, config, train_batch_size, eval_batch_size,
+               predict_batch_size, use_tpu):
+
+    super(_OneCoreTPUContext, self).__init__(
+        config, train_batch_size, eval_batch_size,
+        predict_batch_size, use_tpu)
+
+  def _get_tpu_system_metadata(self):
+    """Gets the (maybe cached) TPU system metadata."""
+    master = self._get_master_address()
+    tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
+    if tpu_system_metadata is not None:
+      return tpu_system_metadata
+
+    tpu_system_metadata = (
+        tpu_system_metadata_lib._TPUSystemMetadata(  # pylint: disable=protected-access
+            num_cores=1,
+            num_hosts=1,
+            num_of_cores_per_host=1,
+            topology=None,
+            devices=[]))
+
+    self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
+    return tpu_system_metadata
+
+
+def _get_tpu_context(config, train_batch_size, eval_batch_size,
+                     predict_batch_size, use_tpu):
+  """Returns an instance of `_TPUContext`."""
+
+  if (config.tpu_config.num_shards == 1 and
+      config.tpu_config.computation_shape is None):
+    logging.warning(
+        'Setting TPUConfig.num_shards==1 is an unsupported behavior. '
+        'Please fix as soon as possible (leaving num_shards as None.')
+    return _OneCoreTPUContext(config, train_batch_size, eval_batch_size,
+                              predict_batch_size, use_tpu)
+
+  return _TPUContext(config, train_batch_size, eval_batch_size,
+                     predict_batch_size, use_tpu)
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index fe17664d7f4afd033a795f22ebc1bc5819b7d108..ff53fe4f5d0e219f56d77d3476640bb023c7535a 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ===================================================================
-
 """TPUEstimator class."""
 
 from __future__ import absolute_import
@@ -20,29 +19,33 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
-from contextlib import contextmanager
 import copy
+import signal
 import threading
 import time
+import traceback
 
 import six
 from six.moves import queue as Queue  # pylint: disable=redefined-builtin
+from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.contrib.summary import summary_ops as contrib_summary
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_config
+from tensorflow.contrib.tpu.python.tpu import tpu_context
 from tensorflow.contrib.tpu.python.tpu import tpu_feed
-from tensorflow.contrib.tpu.python.tpu import tpu_function
 from tensorflow.contrib.tpu.python.tpu import training_loop
 from tensorflow.contrib.tpu.python.tpu import util as util_lib
-
+from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.core.protobuf import config_pb2
-
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -53,11 +56,12 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary
+from tensorflow.python.training import basic_session_run_hooks
 from tensorflow.python.training import evaluation
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training
 from tensorflow.python.training import training_util
-
+from tensorflow.python.util import tf_inspect
 
 _INITIAL_LOSS = 1e7
 _ZERO_LOSS = 0.
@@ -65,9 +69,15 @@ _TPU_ESTIMATOR = 'tpu_estimator'
 _ITERATIONS_PER_LOOP_VAR = 'iterations_per_loop'
 _BATCH_SIZE_KEY = 'batch_size'
 _CROSS_REPLICA_SUM_OP = 'CrossReplicaSum'
+
 _RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY]
 
-# TODO(b/65703635): Flip the value and remove all dead code.
+
+# TODO(b/65703635): Flip the value and remove all dead code. Currently, this is
+# only used for per-core based deployments. For per-host based pipelines, if a
+# user returns a Dataset instance it will be automatically wrapped in a
+# tf.while_loop (This can be disabled by returning features and labels
+# explicitly).
 _WRAP_INPUT_FN_INTO_WHILE_LOOP = False
 
 
@@ -84,28 +94,28 @@ def _create_global_step(graph):
         initializer=init_ops.zeros_initializer(),
         trainable=False,
         use_resource=True,
-        collections=[ops.GraphKeys.GLOBAL_VARIABLES,
-                     ops.GraphKeys.GLOBAL_STEP])
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.GLOBAL_STEP])
 
 
 def _create_or_get_iterations_per_loop():
   graph = ops.get_default_graph()
-  iter_vars = graph.get_collection(_TPU_ESTIMATOR)
+  collection_name = '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR)
+  iter_vars = graph.get_collection(collection_name)
   if len(iter_vars) == 1:
     return iter_vars[0]
   elif len(iter_vars) > 1:
     raise RuntimeError('Multiple iterations_per_loop_var in collection.')
 
   with ops.colocate_with(training_util.get_global_step()):
-    with variable_scope.variable_scope(_TPU_ESTIMATOR,
-                                       reuse=variable_scope.AUTO_REUSE):
+    with variable_scope.variable_scope(
+        _TPU_ESTIMATOR, reuse=variable_scope.AUTO_REUSE):
       return variable_scope.get_variable(
           _ITERATIONS_PER_LOOP_VAR,
           initializer=init_ops.zeros_initializer(),
           shape=[],
           dtype=dtypes.int32,
           trainable=False,
-          collections=[_TPU_ESTIMATOR],
+          collections=[collection_name, ops.GraphKeys.LOCAL_VARIABLES],
           use_resource=True)
 
 
@@ -137,218 +147,6 @@ def _increase_eval_step_op(iterations_per_loop):
       use_locking=True)
 
 
-_DEFAULT_JOB_NAME = 'tpu_worker'
-_DEFAULT_COORDINATOR_JOB_NAME = 'coordinator'
-_LOCAL_MASTERS = ('', 'local')
-
-
-class _TPUContext(object):
-  """A context holds immutable states of TPU computation.
-
-  This immutable object holds TPUEstimator config, train/eval batch size, and
-  `TPUEstimator.use_tpu`, which is expected to be passed around. It also
-  provides utility functions, basded on the current state, to determine other
-  information commonly required by TPU computation, such as TPU device names,
-  TPU hosts, shard batch size, etc.
-
-  N.B. As `mode` is not immutable state in Estimator, but essential to
-  distinguish between TPU training and evaluation, a common usage for
-  _TPUContext with `mode` is as follows:
-  ```
-  with _ctx.with_mode(mode) as ctx:
-    if ctx.is_running_on_cpu():
-       ...
-  ```
-  """
-
-  def __init__(self, config, train_batch_size, eval_batch_size, use_tpu):
-    self._config = config
-    self._train_batch_size = train_batch_size
-    self._eval_batch_size = eval_batch_size
-    self._use_tpu = use_tpu
-    self._num_shards_or_none = self._config.tpu_config.num_shards
-    self._mode = None
-
-  def _assert_mode(self):
-    if self._mode is None:
-      raise RuntimeError(
-          '`mode` needs to be set via contextmanager `with_mode`.')
-    return self._mode
-
-  @property
-  def num_of_cores_per_host(self):
-    num_cores = self.num_cores
-    return min(num_cores, 8)
-
-  @contextmanager
-  def with_mode(self, mode):
-    new_ctx = copy.copy(self)  # Shallow copy is enough.
-    new_ctx._mode = mode  # pylint: disable=protected-access
-    yield new_ctx
-
-  @property
-  def mode(self):
-    return self._assert_mode()
-
-  @property
-  def num_cores(self):
-    # TODO(xiejw): Adds lazy num_shards initialization.
-    return self._num_shards_or_none
-
-  @property
-  def num_hosts(self):
-    return self.num_cores // self.num_of_cores_per_host
-
-  @property
-  def config(self):
-    return self._config
-
-  def is_input_sharded_per_core(self):
-    """Return true if input_fn is invoked per-core (other than per-host)."""
-    self._assert_mode()
-    return (self._mode == model_fn_lib.ModeKeys.TRAIN and
-            not self._config.tpu_config.per_host_input_for_training)
-
-  def is_running_on_cpu(self):
-    """Determines whether the input_fn and model_fn should be invoked on CPU."""
-    mode = self._assert_mode()
-    return ((not self._use_tpu) or mode == model_fn_lib.ModeKeys.PREDICT or
-            (mode == model_fn_lib.ModeKeys.EVAL and
-             self._eval_batch_size is None))
-
-  @property
-  def batch_size_for_input_fn(self):
-    """Returns the shard batch size for `input_fn`."""
-    mode = self._assert_mode()
-    # Special case for eval.
-    if mode == model_fn_lib.ModeKeys.EVAL and self._eval_batch_size is None:
-      return None
-    if self.is_running_on_cpu():
-      if mode == model_fn_lib.ModeKeys.TRAIN:
-        return self._train_batch_size
-      if mode == model_fn_lib.ModeKeys.EVAL:
-        return self._eval_batch_size
-      return None
-
-    global_batch_size = (self._train_batch_size if
-                         mode == model_fn_lib.ModeKeys.TRAIN
-                         else self._eval_batch_size)
-    # On TPU
-    if self.is_input_sharded_per_core():
-      return global_batch_size // self.num_cores
-    else:
-      return global_batch_size // self.num_hosts
-
-  @property
-  def batch_size_for_model_fn(self):
-    """Returns the shard batch size for `model_fn`."""
-    mode = self._assert_mode()
-    # Special case for eval.
-    if mode == model_fn_lib.ModeKeys.EVAL and self._eval_batch_size is None:
-      return None
-    if self.is_running_on_cpu():
-      if mode == model_fn_lib.ModeKeys.TRAIN:
-        return self._train_batch_size
-      if mode == model_fn_lib.ModeKeys.EVAL:
-        return self._eval_batch_size
-      return None
-
-    # On TPU. always sharded per core.
-    if mode == model_fn_lib.ModeKeys.TRAIN:
-      return self._train_batch_size // self.num_cores
-    else:
-      return self._eval_batch_size // self.num_cores
-
-  @property
-  def master_job(self):
-    """Returns the job name to use to place TPU computations on.
-
-    Returns:
-      A string containing the job name, or None if no job should be specified.
-
-    Raises:
-      ValueError: If the user needs to specify a tpu_job_name, because we are
-        unable to infer the job name automatically, or if the user-specified job
-        names are inappropriate.
-    """
-    run_config = self._config
-    # If the user specifies the tpu_job_name, use that.
-    if run_config.tpu_config.tpu_job_name:
-      return run_config.tpu_config.tpu_job_name
-
-    # The tpu job is determined by the run_config. Right now, this method is
-    # required as tpu_config is not part of the RunConfig.
-    mode = self._assert_mode()
-    master = (run_config.evaluation_master if mode == model_fn_lib.ModeKeys.EVAL
-              else run_config.master)
-    if master in _LOCAL_MASTERS:
-      return None
-
-    if (not run_config.session_config or
-        not run_config.session_config.cluster_def.job):
-      return _DEFAULT_JOB_NAME
-    cluster_def = run_config.session_config.cluster_def
-    job_names = set([job.name for job in cluster_def.job])
-    if _DEFAULT_JOB_NAME in job_names:
-      # b/37868888 tracks allowing ClusterSpec propagation to reuse job names.
-      raise ValueError('Currently, tpu_worker is not an allowed job name.')
-    if len(job_names) == 1:
-      return cluster_def.job[0].name
-    if len(job_names) == 2:
-      if _DEFAULT_COORDINATOR_JOB_NAME in job_names:
-        job_names.remove(_DEFAULT_COORDINATOR_JOB_NAME)
-        return job_names.pop()
-      # TODO(b/67716447): Include more sophisticated heuristics.
-    raise ValueError(
-        'Could not infer TPU job name. Please specify a tpu_job_name as part '
-        'of your TPUConfig.')
-
-  @property
-  def tpu_host_placement_function(self):
-    """Returns the TPU host place function."""
-    master = self.master_job
-    def _placement_function(_sentinal=None, core_id=None, host_id=None):  # pylint: disable=invalid-name
-      assert _sentinal is None
-      if core_id is not None and host_id is not None:
-        raise RuntimeError(
-            'core_id and host_id can have only one non-None value.')
-
-      if master is None:
-        return '/replica:0/task:0/device:CPU:0'
-      else:
-        # This assumes that if using more than 8 shards,
-        # the job configuration varies 'task'.
-        if core_id is not None:
-          host_id = core_id / 8
-        return '/job:%s/task:%d/device:CPU:0' % (master, host_id)
-    return _placement_function
-
-  @property
-  def tpu_device_placement_function(self):
-    master = self.master_job
-    job_device = '' if master is None else ('/job:%s' % master)
-    def _placement_function(i):
-      return '%s/task:%d/device:TPU:%d' % (job_device, i / 8, i % 8)
-    return _placement_function
-
-  @property
-  def tpu_ordinal_function(self):
-    """Returns the TPU ordinal fn."""
-    def _tpu_ordinal_function(index):
-      """Return the TPU ordinal associated with a shard.
-
-      Required because the enqueue ops are placed on CPU.
-
-      Args:
-        index: the shard index
-
-      Returns:
-        The ordinal of the TPU device the shard's infeed should be placed on.
-      """
-      return index % 8
-    return _tpu_ordinal_function
-
-
 class _SIGNAL(object):
   """Signal used to control the thread of infeed/outfeed.
 
@@ -359,20 +157,27 @@ class _SIGNAL(object):
   STOP = -2
 
 
-class TPUEstimatorSpec(collections.namedtuple('TPUEstimatorSpec', [
-    'mode',
-    'predictions',
-    'loss',
-    'train_op',
-    'eval_metrics',
-    'export_outputs'])):
+class TPUEstimatorSpec(
+    collections.namedtuple('TPUEstimatorSpec', [
+        'mode',
+        'predictions',
+        'loss',
+        'train_op',
+        'eval_metrics',
+        'export_outputs',
+        'scaffold_fn',
+        'host_call'
+    ])):
   """Ops and objects returned from a `model_fn` and passed to `TPUEstimator`.
 
   See `EstimatorSpec` for `mode`, 'predictions, 'loss', 'train_op', and
   'export_outputs`.
 
-  TPU evaluation expects a slightly different signature from the
-  ${tf.estimator.Estimator}. While `EstimatorSpec.eval_metric_ops` expects a
+  For evaluation, `eval_metrics `is a tuple of `metric_fn` and `tensors`, where
+  `metric_fn` runs on CPU to generate metrics and `tensors` represents the
+  `Tensor`s transferred from TPU system to CPU host and passed to `metric_fn`.
+  To be precise, TPU evaluation expects a slightly different signature from the
+  @{tf.estimator.Estimator}. While `EstimatorSpec.eval_metric_ops` expects a
   dict, `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`.
   The `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. The
   `tensors` usually specify the model logits, which are transferred back from
@@ -382,9 +187,20 @@ class TPUEstimatorSpec(collections.namedtuple('TPUEstimatorSpec', [
   to the `metric_fn` if `tensors` is list or keyword arguments if `tensors` is
   dict. `metric_fn` takes the `tensors` and returns a dict from metric string
   name to the result of calling a metric function, namely a `(metric_tensor,
-  update_op)` tuple.
-
-  See `TPUEstimator` for MNIST example how to specify the `eval_metrics`.
+  update_op)` tuple. See `TPUEstimator` for MNIST example how to specify the
+  `eval_metrics`.
+
+  `scaffold_fn` is a function running on CPU to generate the `Scaffold`. This
+  function should not capture any Tensors in `model_fn`.
+
+  `host_call` is a tuple of a `function` and a list or dictionary of `tensors`
+  to pass to that function and returns a list of Tensors. `host_call` currently
+  works for train() and evaluate(). The Tensors returned by the function is
+  executed on the CPU on every step, so there is communication overhead when
+  sending tensors from TPU to CPU. To reduce the overhead, try reducing the
+  size of the tensors. The `tensors` are concatenated along their major (batch)
+  dimension, and so must be >= rank 1. The `host_call` is useful for writing
+  summaries with @{tf.contrib.summary.create_file_writer}.
   """
 
   def __new__(cls,
@@ -393,111 +209,177 @@ class TPUEstimatorSpec(collections.namedtuple('TPUEstimatorSpec', [
               loss=None,
               train_op=None,
               eval_metrics=None,
-              export_outputs=None):
+              export_outputs=None,
+              scaffold_fn=None,
+              host_call=None):
     """Creates a validated `TPUEstimatorSpec` instance."""
+    host_calls = {}
     if eval_metrics is not None:
-      _EvalMetrics.validate(eval_metrics)
-    return super(TPUEstimatorSpec, cls).__new__(cls,
-                                                mode=mode,
-                                                predictions=predictions,
-                                                loss=loss,
-                                                train_op=train_op,
-                                                eval_metrics=eval_metrics,
-                                                export_outputs=export_outputs)
+      host_calls['eval_metrics'] = eval_metrics
+    if host_call is not None:
+      host_calls['host_call'] = host_call
+    _OutfeedHostCall.validate(host_calls)
+    return super(TPUEstimatorSpec, cls).__new__(
+        cls,
+        mode=mode,
+        predictions=predictions,
+        loss=loss,
+        train_op=train_op,
+        eval_metrics=eval_metrics,
+        export_outputs=export_outputs,
+        scaffold_fn=scaffold_fn,
+        host_call=host_call)
 
   def as_estimator_spec(self):
     """Creates an equivalent `EstimatorSpec` used by CPU train/eval."""
-    eval_metric_ops = _EvalMetrics.to_metric_metric_ops_for_cpu(
-        self.eval_metrics)
-    return model_fn_lib.EstimatorSpec(mode=self.mode,
-                                      predictions=self.predictions,
-                                      loss=self.loss,
-                                      train_op=self.train_op,
-                                      eval_metric_ops=eval_metric_ops,
-                                      export_outputs=self.export_outputs)
+    host_calls = {}
+    if self.eval_metrics is not None:
+      host_calls['eval_metrics'] = self.eval_metrics
+    if self.host_call is not None:
+      host_calls['host_call'] = self.host_call
+    host_call_ret = _OutfeedHostCall.create_cpu_hostcall(host_calls)
+    eval_metric_ops = None
+    if self.eval_metrics is not None:
+      eval_metric_ops = host_call_ret['eval_metrics']
+    hooks = None
+    if self.host_call is not None:
+      hooks = [_OutfeedHostCallHook(host_call_ret['host_call'])]
+    scaffold = self.scaffold_fn() if self.scaffold_fn else None
+    return model_fn_lib.EstimatorSpec(
+        mode=self.mode,
+        predictions=self.predictions,
+        loss=self.loss,
+        train_op=self.train_op,
+        eval_metric_ops=eval_metric_ops,
+        export_outputs=self.export_outputs,
+        scaffold=scaffold,
+        training_hooks=hooks,
+        evaluation_hooks=hooks,
+        prediction_hooks=hooks)
+
+
+class _OpQueueContext(object):
+  """Manages work queue and thread for a infeed/outfeed thread."""
+
+  def __init__(self, name, target, args):
+    self._name = name
+    self._queue = Queue.Queue()
+    args = (self,) + args
+    self._thread = threading.Thread(name=name, target=target, args=args)
+    self._thread.daemon = True
+    self._thread.start()
+
+  def stop(self):
+    self._queue.put(_SIGNAL.STOP)
+
+  def send_next_batch_signal(self, iterations):
+    self._queue.put(iterations)
+
+  def read_iteration_counts(self):
+    while True:
+      iterations = self._queue.get(block=True)
+      logging.debug('%s read iterations %s', self._name, iterations)
+      if iterations == _SIGNAL.STOP:
+        logging.info('%s received shutdown signal, stopping.', self._name)
+        return
+      yield iterations
 
+  def join(self):
+    logging.info('Shutting down %s thread.' % self._name)
+    self.stop()
+    self._thread.join()
 
-class _InfeedOutfeedThreadBaseController(object):
-  """This wraps the infeed/outfeed thread and stops when Estimator finishes."""
 
-  def __init__(self, thd):
-    self._signal_queue = Queue.Queue()
-    thd.daemon = True
-    thd.start()
-    self._thd = thd
+class _OpSignalOnceQueueContext(_OpQueueContext):
+  """Manages work queue and thread for a infeed/outfeed thread.
 
-  def block_and_get_signal(self):
-    return self._signal_queue.get()
+  This subclass only signals once.
+  """
 
-  def send_next_batch_signal(self, signal=_SIGNAL.NEXT_BATCH):
-    self._signal_queue.put(signal)
+  def __init__(self, name, target, args):
+    super(_OpSignalOnceQueueContext, self).__init__(name, target, args)
+    self._has_signaled = False
 
-  def join(self):
-    self._signal_queue.put(_SIGNAL.STOP)
-    self._thd.join()
+  def send_next_batch_signal(self, iterations):
+    if not self._has_signaled:
+      self._queue.put(iterations)
+      self._has_signaled = True
+
+
+class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
+  """A Session hook setting up the TPU initialization, infeed, and outfeed.
 
+  This hook does two major things:
+  1. initialize and shutdown TPU system.
+  2. launch and join the threads for infeed enqueue and (optional) outfeed
+     dequeue.
+  """
 
-class _OutfeedThreadController(_InfeedOutfeedThreadBaseController):
-  """This wraps the outfeed thread and stops when Estimator finishes."""
+  def __init__(self,
+               ctx,
+               enqueue_ops,
+               dequeue_ops,
+               run_infeed_loop_on_coordinator=True):
+    self._master_job = ctx.master_job
+    self._enqueue_ops = enqueue_ops
+    self._dequeue_ops = dequeue_ops
 
-  def __init__(self, session, dequeue_ops):
-    super(_OutfeedThreadController, self).__init__(
-        threading.Thread(target=self._execute_dequeue_ops,
-                         args=(session, dequeue_ops)))
+    self._run_infeed_loop_on_coordinator = run_infeed_loop_on_coordinator
+    self._initial_infeed_sleep_secs = (
+        ctx.config.tpu_config.initial_infeed_sleep_secs)
 
-  def _execute_dequeue_ops(self, session, dequeue_ops):
-    count = 0
-    while True:
-      signal = self.block_and_get_signal()
-      if signal == _SIGNAL.STOP:
-        logging.info('Stop outfeed thread.')
-        return
+    self._session_cancel_timer = None
 
-      iterations = signal
-      for i in range(iterations):
-        logging.debug('Outfeed dequeue for iteration (%d, %d)', count, i)
-        session.run(dequeue_ops)
-      count += 1
+    self._feed_error = None
+    self._finished = False
 
-  def join(self):
-    logging.info('Waiting for Outfeed Thread to exit.')
-    super(_OutfeedThreadController, self).join()
+  def begin(self):
+    logging.info('TPU job name %s', self._master_job)
+    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
+    self._init_ops = [tpu.initialize_system(job=self._master_job)]
+    self._finalize_ops = [tpu.shutdown_system(job=self._master_job)]
 
+    summary_writer_init_ops = contrib_summary.summary_writer_initializer_op()
+    self._init_ops.extend(summary_writer_init_ops)
+    # Get all the writer resources from the initializer, so we know what to
+    # flush.
+    for op in summary_writer_init_ops:
+      self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0]))
 
-class _InfeedThreadController(_InfeedOutfeedThreadBaseController):
-  """This wraps the infeed thread and stops when Estimator finishes."""
+  def _log_error(self, session, error):
+    """Log an infeed or outfeed error.
 
-  def __init__(self, session, enqueue_ops):
-    super(_InfeedThreadController, self).__init__(
-        threading.Thread(target=self._input_thread_fn_for_loading,
-                         args=(session, enqueue_ops)))
+    This logs a short error message immediately, and schedules a timer to
+    emit the full stack trace and error message after a short period of time.
+    If the main session has terminated by the time the timer triggers, we
+    assume the real source of the error was from the main session and avoid
+    emitting a stack trace for the infeed.
 
-  def _input_thread_fn_for_loading(self, session, enqueue_ops):
-    count = 0
-    try:
-      while True:
-        signal = self._signal_queue.get()
-        if signal == _SIGNAL.STOP:
-          logging.info('Stop Infeed input thread.')
-          return
-
-        if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
-          # Enqueue batches for next loop.
-          session.run(enqueue_ops)
-        else:
-          iterations = signal
-          for i in range(iterations):
-            logging.debug('Infeed enqueue for iteration (%d, %d)', count, i)
-            session.run(enqueue_ops)
-          count += 1
+    Args:
+      session: `tf.Session`, session to be terminated error: exception that
+        triggered logging.
+      error: the Exception to log.
+    """
+    logging.warning(
+        '\n\n'
+        'Error occurred during infeed/outfeed.  This may be due to a compile '
+        'error in the main session.  Waiting for a short time for the main '
+        'session to come back.\n\n%s', error)
 
-    except Exception:  # pylint: disable=broad-except
+    self._feed_error = traceback.format_exc()
+
+    # If we've already encountered a feed error, don't schedule another
+    # cancellation op.
+    if self._session_cancel_timer:
+      return
+
+    def _cancel_session():
       # Close the session to avoid the main thread from hanging. If input
       # pipeline triggers any error, the infeed thread dies but the main thread
       # for TPU computation waits for the infeed enqueue forever. Close the
       # Session to cancel the main thread Session.run execution.
       #
-      # However, sleep for 2 minutes before explicit closing to give some time
+      # We sleep for a few seconds before closing to give some time
       # for the TPU compilation error, if any, propagating, from TPU to CPU
       # host. Compilation errors should be reported by the main thread so that
       # the program can be interrupted and users can take action.  Due to a race
@@ -506,77 +388,108 @@ class _InfeedThreadController(_InfeedOutfeedThreadBaseController):
       # exception in the main thread, instead of the expected compile error.
       # User code that depends on having the proper exception type will
       # therefore be confused.
-      logging.error(
-          'Failed running infeed, closing session.\n'
-          'You may see an exception from your main session after this. '
-          'Sleep for 2 minutes before close Session from infeed thread to '
-          'allow the main thread returning an error first, if any.',
-          exc_info=1
-      )
-      time.sleep(120)
-      session.close()
-
-  def join(self):
-    logging.info('Waiting for Infeed Thread to exit.')
-    super(_InfeedThreadController, self).join()
-
-
-class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
-  """A Session hook setting up the TPU initialization, infeed, and outfeed.
-
-  This hook does two major things:
-  1. initialize and shutdown TPU system.
-  2. launch and join the threads for infeed enqueue and (optional) outfeed
-     dequeue.
-  """
+      time.sleep(5)
+
+      # If the main session is still running, the infeed/outfeed errors are
+      # legitimate, and should be logged.
+      if not self._finished and self._feed_error:
+        logging.error('Feed error: %s', self._feed_error)
+        logging.error('Closing session.  A RuntimeError should follow.')
+        session.close()
+
+    self._session_cancel_timer = threading.Thread(target=_cancel_session)
+    self._session_cancel_timer.daemon = True
+    self._session_cancel_timer.start()
+
+  def _run_infeed(self, queue_ctx, session):
+    logging.info('Starting infeed thread controller.')
+    if self._initial_infeed_sleep_secs:
+      logging.info('%s thread sleeping for %d seconds.', self._name,
+                   self._initial_infeed_sleep_secs)
+      time.sleep(self._initial_infeed_sleep_secs)
+      logging.info('%s thread starting after sleep', self._name)
 
-  def __init__(self, ctx, enqueue_ops, dequeue_ops=None):
-    self._master_job = ctx.master_job
-    self._enqueue_ops = enqueue_ops
-    self._dequeue_ops = dequeue_ops
+    try:
+      if self._run_infeed_loop_on_coordinator:
+        for count, steps in enumerate(queue_ctx.read_iteration_counts()):
+          for i in xrange(steps):
+            logging.debug('Infeed enqueue for iteration (%d, %d)', count, i)
+            session.run(self._enqueue_ops)
+      else:
+        for _ in queue_ctx.read_iteration_counts():
+          session.run(self._enqueue_ops)
+      logging.info('Infeed thread finished, shutting down.')
+    except Exception as e:  # pylint: disable=broad-except
+      self._log_error(session, e)
+
+  def _run_outfeed(self, queue_ctx, session):
+    logging.info('Starting outfeed thread controller.')
+    try:
+      for count, steps in enumerate(queue_ctx.read_iteration_counts()):
+        for i in xrange(steps):
+          logging.debug('Outfeed dequeue for iteration (%d, %d)', count, i)
+          session.run(self._dequeue_ops)
+      logging.info('Outfeed thread finished, shutting down.')
+    except Exception as e:  # pylint: disable=broad-except
+      self._log_error(session, e)
 
-  def begin(self):
-    logging.info('TPU job name %s', self._master_job)
-    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
-    self._init_op = [tpu.initialize_system(job=self._master_job)]
-    self._finalize_op = [tpu.shutdown_system(job=self._master_job)]
+  def _create_infeed_controller(self, name, target, args):
+    return _OpQueueContext(name=name, target=target, args=args)
 
   def after_create_session(self, session, coord):
     logging.info('Init TPU system')
-    session.run(self._init_op,
-                options=config_pb2.RunOptions(timeout_in_ms=5*60*1000))
+    session.run(self._init_ops,
+                options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000))
 
     logging.info('Start infeed thread controller')
-    self._infeed_thd_controller = _InfeedThreadController(
-        session, self._enqueue_ops)
+    self._infeed_controller = self._create_infeed_controller(
+        name='InfeedController', target=self._run_infeed, args=(session,))
 
-    if self._dequeue_ops is not None:
-      logging.info('Start outfeed thread controller')
-      self._outfeed_thd_controller = _OutfeedThreadController(
-          session, self._dequeue_ops)
+    logging.info('Start outfeed thread controller')
+    self._outfeed_controller = _OpQueueContext(
+        name='OutfeedController', target=self._run_outfeed, args=(session,))
 
   def before_run(self, run_context):
+    self._feed_error = None
+
+    # Wait for the cancellation timer to complete before continuing.
+    if self._session_cancel_timer:
+      self._session_cancel_timer.join()
+      self._session_cancel_timer = None
+
     iterations = run_context.session.run(self._iterations_per_loop_var)
 
     logging.info('Enqueue next (%d) batch(es) of data to infeed.', iterations)
+    self._infeed_controller.send_next_batch_signal(iterations)
 
-    self._infeed_thd_controller.send_next_batch_signal(iterations)
-    if self._dequeue_ops is not None:
-      # TODO(xiejw): Refactor the outfeed dequeue into tf.while_loop.
-      logging.info(
-          'Dequeue next (%d) batch(es) of data from outfeed.', iterations)
-      self._outfeed_thd_controller.send_next_batch_signal(iterations)
+    logging.info('Dequeue next (%d) batch(es) of data from outfeed.',
+                 iterations)
+    self._outfeed_controller.send_next_batch_signal(iterations)
 
   def end(self, session):
+    if self._session_cancel_timer:
+      logging.warning('Feed error occurred; waiting for message.')
+      self._session_cancel_timer.join()
+
+    self._finished = True
     logging.info('Stop infeed thread controller')
-    self._infeed_thd_controller.join()
+    self._infeed_controller.join()
 
-    if self._dequeue_ops is not None:
-      logging.info('Stop output thread controller')
-      self._outfeed_thd_controller.join()
+    logging.info('Stop output thread controller')
+    self._outfeed_controller.join()
 
     logging.info('Shutdown TPU system.')
-    session.run(self._finalize_op)
+    session.run(self._finalize_ops)
+
+
+class TPUInfeedOutfeedSessionHookForPrediction(TPUInfeedOutfeedSessionHook):
+
+  def __init__(self, ctx, enqueue_ops, dequeue_ops):
+    super(TPUInfeedOutfeedSessionHookForPrediction, self).__init__(
+        ctx, enqueue_ops, dequeue_ops, run_infeed_loop_on_coordinator=False)
+
+  def _create_infeed_controller(self, name, target, args):
+    return _OpSignalOnceQueueContext(name=name, target=target, args=args)
 
 
 class _TPUStopAtStepHook(session_run_hook.SessionRunHook):
@@ -644,8 +557,8 @@ class _TPUStopAtStepHook(session_run_hook.SessionRunHook):
       run_context.request_stop()
     else:
       iterations = self._next_iterations(global_step, self._last_step)
-      self._iterations_per_loop_var.load(iterations,
-                                         session=run_context.session)
+      self._iterations_per_loop_var.load(
+          iterations, session=run_context.session)
 
 
 class _SetEvalIterationsHook(session_run_hook.SessionRunHook):
@@ -666,10 +579,51 @@ class _SetEvalIterationsHook(session_run_hook.SessionRunHook):
     self._iterations_per_loop_var.load(self._num_steps, session=session)
 
 
-def generate_per_core_enqueue_ops_fn_for_host(
-    ctx, input_fn, inputs_structure_recorder):
+class _StoppingPredictHook(session_run_hook.SessionRunHook):
+  """Hook that requests stop according to the stopping signal in prediction."""
+
+  def __init__(self, scalar_stopping_signal):
+    self._scalar_stopping_signal = scalar_stopping_signal
+
+  def begin(self):
+    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
+
+  def after_create_session(self, session, coord):
+    # This is not necessary as we do not run infeed enqueue and outfeed dequeue
+    # in side threads for prediction model. But it makes the
+    # TPUInfeedOutfeedSessionHook prints nice message.
+    self._iterations_per_loop_var.load(1, session=session)
+
+  def before_run(self, run_context):
+    return session_run_hook.SessionRunArgs(self._scalar_stopping_signal)
+
+  def after_run(self, run_context, run_values):
+    _ = run_context
+    scalar_stopping_signal = run_values.results
+    if _StopSignals.should_stop(scalar_stopping_signal):
+      # NOTE(xiejw): In prediction, stopping signals are inserted for each
+      # batch. And we append one more batch to signal the system it should stop.
+      # The data flow might look like
+      #
+      #  batch   0: images, labels, stop = 0  (user provideded)
+      #  batch   1: images, labels, stop = 0  (user provideded)
+      #  ...
+      #  batch  99: images, labels, stop = 0  (user provideded)
+      #  batch 100: images, labels, stop = 1  (TPUEstimator appended)
+      #
+      # where the final batch (id = 100) is appended by TPUEstimator, so we
+      # should drop it before returning the predictions to user.
+      # To achieve that, we throw the OutOfRangeError in after_run. Once
+      # Monitored Session sees this error in SessionRunHook.after_run, the
+      # "current" prediciton, i.e., batch with id=100, will be discarded
+      # immediately
+      raise errors.OutOfRangeError(None, None, 'Stopped by stopping signal.')
+
+
+def generate_per_core_enqueue_ops_fn_for_host(ctx, input_fn,
+                                              inputs_structure_recorder):
   """Generates infeed enqueue ops for per-core input_fn on a single host."""
-  infeed_queue_holder = {'instance': None}
+  captured_infeed_queue = _CapturedObject()
 
   def enqueue_ops_fn():
     """A fn returns enqueue_ops."""
@@ -677,11 +631,14 @@ def generate_per_core_enqueue_ops_fn_for_host(
     per_host_sharded_inputs = []
     for core_ordinal in range(num_cores_per_host):
       with ops.name_scope('ordinal_%d' % (core_ordinal)):
-        inputs = input_fn()
-        if isinstance(inputs, tuple):
-          features, labels = inputs
-        else:
-          features, labels = inputs, None
+        inputs = _Inputs.from_input_fn(input_fn())
+        if inputs.is_dataset:
+          raise TypeError(
+              '`input_fn` returning `Dataset`  is not yet supported in '
+              'per-Core input pipeline deployment yet. Please set '
+              'TPUConfig.per_host_input_for_training to True or return '
+              '`features` and `labels` from `input_fn`')
+        features, labels = inputs.features_and_labels()
 
         inputs_structure_recorder.validate_and_record_structure(
             features, labels)
@@ -692,49 +649,88 @@ def generate_per_core_enqueue_ops_fn_for_host(
 
     infeed_queue = tpu_feed.InfeedQueue(
         number_of_tuple_elements=len(per_host_sharded_inputs[0]))
-    infeed_queue_holder['instance'] = infeed_queue
+    captured_infeed_queue.capture(infeed_queue)
     infeed_queue.set_configuration_from_sharded_input_tensors(
         per_host_sharded_inputs)
 
     per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
-        per_host_sharded_inputs,
-        tpu_ordinal_function=ctx.tpu_ordinal_function)
+        per_host_sharded_inputs, tpu_ordinal_function=ctx.tpu_ordinal_function)
     return per_host_enqueue_ops
-  return enqueue_ops_fn, (lambda: infeed_queue_holder['instance'])
+
+  return enqueue_ops_fn, captured_infeed_queue
 
 
 def generate_per_host_enqueue_ops_fn_for_host(
-    ctx, input_fn, inputs_structure_recorder, batch_axis, device):
+    ctx, input_fn, inputs_structure_recorder, batch_axis, device, host_id):
   """Generates infeed enqueue ops for per-host input_fn on a single host."""
-  infeed_queue_holder = {'instance': None}
+  captured_infeed_queue = _CapturedObject()
+
+  hooks = []
+
+  with ops.device(device):
+    inputs = _Inputs.from_input_fn(input_fn())
+
+    is_dataset = inputs.is_dataset
+    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
+      if not is_dataset:
+        raise TypeError(
+            'For mode PREDICT, `input_fn` must return `Dataset` instead of '
+            '`features` and `labels`.')
+      inputs = _InputsWithStoppingSignals(
+          dataset=inputs.dataset, batch_size=ctx.batch_size_for_input_fn)
+
+    if is_dataset:
+      hooks.append(inputs.dataset_initializer_hook())
+
+  # TODO(ylc): Refactoring the code to merge the tpu ordinal logic here and the
+  # _TPUContext.tpu_ordinal_function. We should either introduce another
+  # abstraction or a different helper method.
+  def _tpu_ordinal_function_impl(shard_index_in_host):
+    # We put both enqueue/dequeue op at tpu.core(0) in each replica.
+    replica = ctx.device_assignment.lookup_replicas(
+        host_id, (0, 0, 0))[shard_index_in_host]
+    return ctx.device_assignment.tpu_ordinal(replica=replica)
+
+  if ctx.model_parallelism_enabled:
+    tpu_ordinal_function = _tpu_ordinal_function_impl
+  else:
+    tpu_ordinal_function = None
 
   def enqueue_ops_fn():
     with ops.device(device):
-      num_cores_per_host = ctx.num_of_cores_per_host
-      inputs = input_fn()
-      if isinstance(inputs, tuple):
-        features, labels = inputs
-      else:
-        features, labels = inputs, None
+      num_of_replicas_per_host = ctx.num_of_replicas_per_host
+      # Convert user input to features and labels.  If the user returns a
+      # dataset, it is initialized and the features and labels extracted via
+      # `dataset.iterator.get_next()`
+      features, labels = inputs.features_and_labels()
+      signals = inputs.signals()
+
       inputs_structure_recorder.validate_and_record_structure(
-          features, labels)
+          features, labels, signals)
       unsharded_tensor_list = (
           inputs_structure_recorder.flatten_features_and_labels(
-              features, labels))
+              features, labels, signals))
 
       infeed_queue = tpu_feed.InfeedQueue(
           tuple_types=[t.dtype for t in unsharded_tensor_list],
           tuple_shapes=[t.shape for t in unsharded_tensor_list],
           shard_dimensions=batch_axis)
-      infeed_queue_holder['instance'] = infeed_queue
-      infeed_queue.set_number_of_shards(num_cores_per_host)
-
+      captured_infeed_queue.capture(infeed_queue)
+      infeed_queue.set_number_of_shards(num_of_replicas_per_host)
       per_host_enqueue_ops = (
           infeed_queue.split_inputs_and_generate_enqueue_ops(
               unsharded_tensor_list,
-              placement_function=lambda x: device))
-      return per_host_enqueue_ops
-  return enqueue_ops_fn, (lambda: infeed_queue_holder['instance'])
+              placement_function=lambda x: device,
+              tpu_ordinal_function=tpu_ordinal_function))
+      if signals is None:
+        return per_host_enqueue_ops
+      else:
+        return {
+            'ops': per_host_enqueue_ops,
+            'signals': signals,
+        }
+
+  return enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset
 
 
 class _InputPipeline(object):
@@ -774,6 +770,7 @@ class _InputPipeline(object):
       self._feature_names = []
       self._label_names = []
       self._has_labels = False
+      self._signals_helper = None
 
       # Internal state.
       self._initialized = False
@@ -781,8 +778,9 @@ class _InputPipeline(object):
     def has_labels(self):
       return self._has_labels
 
-    def validate_and_record_structure(self, features, labels):
+    def validate_and_record_structure(self, features, labels, signals=None):
       """Validates and records the structure of features` and `labels`."""
+
       def _extract_key_names(tensor_or_dict):
         if tensor_or_dict is None:
           return []
@@ -793,6 +791,10 @@ class _InputPipeline(object):
       feature_names = _extract_key_names(features)
       label_names = _extract_key_names(labels)
 
+      if signals is not None and self._signals_helper is None:
+        # Record signals helper.
+        self._signals_helper = _SignalsHelper(signals)
+
       if self._initialized:
         # Verify the structure is same. The following should never happen.
         assert feature_names == self._feature_names, 'feature keys mismatched'
@@ -805,13 +807,13 @@ class _InputPipeline(object):
         self._label_names = label_names
         self._has_labels = has_labels
 
-    def flatten_features_and_labels(self, features, labels):
+    def flatten_features_and_labels(self, features, labels, signals=None):
       """Flattens the `features` and `labels` to a single tensor list."""
       flattened_inputs = []
       if self._feature_names:
         # We need a fixed ordering for enqueueing and dequeueing.
-        flattened_inputs.extend([features[name]
-                                 for name in self._feature_names])
+        flattened_inputs.extend(
+            [features[name] for name in self._feature_names])
       else:
         flattened_inputs.append(features)
 
@@ -821,6 +823,9 @@ class _InputPipeline(object):
           flattened_inputs.extend([labels[name] for name in self._label_names])
         else:
           flattened_inputs.append(labels)
+
+      if signals is not None:
+        flattened_inputs.extend(_SignalsHelper.as_tensor_list(signals))
       return flattened_inputs
 
     def unflatten_features_and_labels(self, flattened_inputs):
@@ -838,15 +843,19 @@ class _InputPipeline(object):
         ValueError: If the number of expected tensors from `flattened_inputs`
           mismatches the recorded structure.
       """
-      expected_num_features = (len(self._feature_names) if self._feature_names
-                               else 1)
+      expected_num_features = (
+          len(self._feature_names) if self._feature_names else 1)
       if self._has_labels:
-        expected_num_labels = (len(self._label_names) if self._label_names
-                               else 1)
+        expected_num_labels = (
+            len(self._label_names) if self._label_names else 1)
       else:
         expected_num_labels = 0
 
-      expected_num_tensors = expected_num_features + expected_num_labels
+      expected_num_signals = (
+          self._signals_helper.num_signals if self._signals_helper else 0)
+
+      expected_num_tensors = (
+          expected_num_features + expected_num_labels + expected_num_signals)
 
       if expected_num_tensors != len(flattened_inputs):
         raise ValueError(
@@ -863,13 +872,20 @@ class _InputPipeline(object):
       if expected_num_labels == 0:
         unflattened_label = None
       elif self._label_names:
-        unflattened_label = dict(zip(self._label_names,
-                                     flattened_inputs[expected_num_features:]))
+        label_list = flattened_inputs[
+            expected_num_features:expected_num_features + expected_num_labels]
+        unflattened_label = dict(zip(self._label_names, label_list))
       else:
         # Single tensor case.
         unflattened_label = flattened_inputs[expected_num_features]
 
-      return unflattened_features, unflattened_label
+      signals = None
+      if expected_num_signals != 0:
+        tensor_list_for_signals = flattened_inputs[
+            expected_num_features + expected_num_labels:]
+        signals = self._signals_helper.unflatten(tensor_list_for_signals)
+
+      return _Inputs(unflattened_features, unflattened_label, signals=signals)
 
   def __init__(self, input_fn, batch_axis, ctx):
     """Constructor.
@@ -897,25 +913,34 @@ class _InputPipeline(object):
     # While tf.while_loop is called, the body function, which invokes
     # `enqueue_fn` passed in, is called to construct the graph. So, input_fn
     # structure is recorded.
-    enqueue_ops = self._invoke_input_fn_and_record_structure()
+    enqueue_ops, all_hooks, run_infeed_loop_on_coordinator = (
+        self._invoke_input_fn_and_record_structure())
 
     self._validate_input_pipeline()
 
     def dequeue_fn():
       """dequeue_fn is used by TPU to retrieve the tensors."""
-      values = self._infeed_queue.generate_dequeue_op()
+      # In the model-parallel case, both the host-side and device-side
+      # computations must agree on the core on which infeed takes place. We
+      # choose to perform infeed on logical core 0 of each replica.
+      with ops.device(tpu.core(0)):
+        values = self._infeed_queue.generate_dequeue_op()
       # The unflatten process uses the structure information recorded above.
       return self._inputs_structure_recorder.unflatten_features_and_labels(
           values)
 
-    return (enqueue_ops, dequeue_fn)
+    return (enqueue_ops, dequeue_fn, all_hooks, run_infeed_loop_on_coordinator)
 
   def _invoke_input_fn_and_record_structure(self):
     """Deploys the input pipeline and record input structure."""
     enqueue_ops = []
     infeed_queues = []
+    all_hooks = []
     num_hosts = self._ctx.num_hosts
     tpu_host_placement_fn = self._ctx.tpu_host_placement_function
+
+    run_infeed_loop_on_coordinator = True
+
     if self._sharded_per_core:
       # Per-Core input pipeline deployment.
       # Invoke input pipeline for each core and placed on the corresponding
@@ -924,39 +949,59 @@ class _InputPipeline(object):
         host_device = tpu_host_placement_fn(host_id=host_id)
         with ops.device(host_device):
           with ops.name_scope('input_pipeline_task%d' % (host_id)):
-            enqueue_ops_fn, infeed_queue_getter = (
+            enqueue_ops_fn, captured_infeed_queue = (
                 generate_per_core_enqueue_ops_fn_for_host(
                     self._ctx, self._input_fn, self._inputs_structure_recorder))
 
             if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
-              enqueue_ops.append(_wrap_computation_in_while_loop(
-                  device=host_device, op_fn=enqueue_ops_fn))
+              run_infeed_loop_on_coordinator = False
+              enqueue_ops.append(
+                  _wrap_computation_in_while_loop(
+                      device=host_device, op_fn=enqueue_ops_fn))
             else:
               enqueue_ops.append(enqueue_ops_fn())
             # Infeed_queue_getter must be called after enqueue_ops_fn is called.
-            infeed_queues.append(infeed_queue_getter())
+            infeed_queues.append(captured_infeed_queue.get())
 
     else:
       for host_id in range(num_hosts):
         host_device = tpu_host_placement_fn(host_id=host_id)
         with ops.device(host_device):
           with ops.name_scope('input_pipeline_task%d' % (host_id)):
-            enqueue_ops_fn, infeed_queue_getter = (
+            enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset = (
                 generate_per_host_enqueue_ops_fn_for_host(
                     self._ctx, self._input_fn, self._inputs_structure_recorder,
-                    self._batch_axis, host_device))
-
-            if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
-              enqueue_ops.append(_wrap_computation_in_while_loop(
-                  device=host_device, op_fn=enqueue_ops_fn))
+                    self._batch_axis, host_device, host_id))
+            all_hooks.extend(hooks)
+
+            # NOTE(xiejw): We dispatch here based on the return type of the
+            # users `input_fn`.
+            #
+            # 1. If input_fn returns a Dataset instance, we initialize the
+            # iterator outside of tf.while_loop, and call the iterator.get_next
+            # inside tf.while_loop.  This should be always safe.
+            #
+            # 2. If input_fn returns (features, labels), it is too late to wrap
+            # them inside tf.while_loop, as resource initialization cannot be
+            # handled in TF control flow properly. In this case, we will use
+            # python loop to enqueue the data into TPU system.  This may be
+            # slow compared to the previous case.
+            if is_dataset:
+              run_infeed_loop_on_coordinator = False
+              wrap_fn = (
+                  _wrap_computation_in_while_loop
+                  if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else
+                  _wrap_computation_in_while_loop_with_stopping_signals)
+              enqueue_ops.append(
+                  wrap_fn(device=host_device, op_fn=enqueue_ops_fn))
             else:
               enqueue_ops.append(enqueue_ops_fn())
-            infeed_queues.append(infeed_queue_getter())
+            infeed_queues.append(captured_infeed_queue.get())
     # infeed_queue is used to generate dequeue ops. The only thing it uses for
     # dequeue is dtypes and types. So, any one can be used. Here, grab the
     # first one.
     self._infeed_queue = infeed_queues[0]
-    return enqueue_ops
+    return enqueue_ops, all_hooks, run_infeed_loop_on_coordinator
 
   def _validate_input_pipeline(self):
     # Perform some sanity checks to log user friendly information. We should
@@ -992,10 +1037,7 @@ class _ModelFnWrapper(object):
     self._ctx = ctx
 
   def call_without_tpu(self, features, labels):
-    # Let CrossShardOptimizer be called without TPU in model_fn, since it's
-    # common to set the train_op even when running evaluate() or predict().
-    with tpu_function.tpu_shard_context(1):
-      return self._call_model_fn(features, labels)
+    return self._call_model_fn(features, labels)
 
   def convert_to_single_tpu_train_step(self, dequeue_fn):
     """Converts user provided model_fn` as a single train step on TPU.
@@ -1016,20 +1058,40 @@ class _ModelFnWrapper(object):
         infeed dequeue channel.
 
     Returns:
-      A Fn representing the train step for TPU.
+      A tuple of train_fn, host_calls, and captured scaffold_fn. The train_fn
+      representing the train step for TPU.
     """
 
+    host_call = _OutfeedHostCall(self._ctx)
+    captured_scaffold_fn = _CapturedObject()
+
     def train_step(loss):
       """Training step function for use inside a while loop."""
       del loss  # unused; required in function signature.
-      features, labels = dequeue_fn()
+      inputs = dequeue_fn()
+      features, labels = inputs.features_and_labels()
 
       estimator_spec = self._verify_estimator_spec(
           self._call_model_fn(features, labels))
       loss, train_op = estimator_spec.loss, estimator_spec.train_op
+
+      if isinstance(estimator_spec, TPUEstimatorSpec):
+        captured_scaffold_fn.capture(estimator_spec.scaffold_fn)
+      else:
+        captured_scaffold_fn.capture(None)
+
+      # We must run train_op to update the variables prior to running the
+      # outfeed.
       with ops.control_dependencies([train_op]):
-        return array_ops.identity(loss)
-    return train_step
+        host_call_outfeed_ops = []
+        if (isinstance(estimator_spec, TPUEstimatorSpec) and
+            estimator_spec.host_call is not None):
+          host_call.record({'host_call': estimator_spec.host_call})
+          host_call_outfeed_ops = host_call.create_enqueue_op()
+        with ops.control_dependencies(host_call_outfeed_ops):
+          return array_ops.identity(loss)
+
+    return train_step, host_call, captured_scaffold_fn
 
   def convert_to_single_tpu_eval_step(self, dequeue_fn):
     """Converts user provided model_fn` as a single eval step on TPU.
@@ -1054,14 +1116,16 @@ class _ModelFnWrapper(object):
         infeed dequeue channel.
 
     Returns:
-      A tuple of eval_fn and eval_metrics. The eval_fn representing the eval
-      step for TPU. and eval_metrics is an `_EvalMetrics` instance.
+      A tuple of eval_fn, host_calls, and captured scaffold_fn. The eval_fn
+      representing the eval step for TPU.
     """
-    eval_metrics = _EvalMetrics(self._ctx)
+    host_calls = _OutfeedHostCall(self._ctx)
+    captured_scaffold_fn = _CapturedObject()
 
     def eval_step(total_loss):
       """Evaluation step function for use inside a while loop."""
-      features, labels = dequeue_fn()
+      inputs = dequeue_fn()
+      features, labels = inputs.features_and_labels()
 
       tpu_estimator_spec = self._call_model_fn(features, labels)
       if not isinstance(tpu_estimator_spec, TPUEstimatorSpec):
@@ -1070,14 +1134,69 @@ class _ModelFnWrapper(object):
             '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec)))
 
       loss = tpu_estimator_spec.loss
-      eval_metrics.record(tpu_estimator_spec)
-      outfeed_ops = tpu_ops.outfeed_enqueue_tuple(eval_metrics.outfeed_tensors)
-
-      with ops.control_dependencies([outfeed_ops]):
+      captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
+      to_record = {}
+      to_record['eval_metrics'] = tpu_estimator_spec.eval_metrics
+      if tpu_estimator_spec.host_call is not None:
+        # We assume that evaluate won't update global step, so we don't wrap
+        # this host_call.
+        to_record['host_call'] = tpu_estimator_spec.host_call
+      host_calls.record(to_record)
+
+      with ops.control_dependencies(host_calls.create_enqueue_op()):
         return math_ops.add(total_loss, loss)
-    return eval_step, eval_metrics
 
-  def _call_model_fn(self, features, labels):
+    return eval_step, host_calls, captured_scaffold_fn
+
+  def convert_to_single_tpu_predict_step(self, dequeue_fn):
+    """Converts user provided model_fn` as a single predict step on TPU.
+
+    Args:
+      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
+        infeed dequeue channel.
+
+    Returns:
+      A tuple of predict_fn, host_calls, and captured scaffold_fn. The
+      predict_fn representing the predict step for TPU.
+    """
+    host_calls = _OutfeedHostCall(self._ctx)
+    captured_scaffold_fn = _CapturedObject()
+
+    def predict_step(unused_scalar_stopping_signal):
+      """Evaluation step function for use inside a while loop."""
+      inputs = dequeue_fn()
+      features, labels = inputs.features_and_labels()
+      stopping_signals = inputs.signals()
+
+      assert stopping_signals is not None, (
+          'Internal Error: `signals` is missing.')
+
+      tpu_estimator_spec = self._call_model_fn(
+          features, labels, is_export_mode=False)
+      if not isinstance(tpu_estimator_spec, TPUEstimatorSpec):
+        raise RuntimeError(
+            'estimator_spec used by TPU prediction must have type'
+            '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec)))
+
+      captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
+      to_record = {}
+      identity_fn = lambda **kwargs: kwargs
+      # TODO(xiejw): Adds validation for prediction dictionrary.
+      # TODO(xiejw): Adds support for single tensor as predictions.
+      if not isinstance(tpu_estimator_spec.predictions, dict):
+        raise TypeError('TPUEstimatorSpec.predictions must be dict of Tensors.')
+      to_record['predictions'] = [identity_fn, tpu_estimator_spec.predictions]
+      to_record['signals'] = [identity_fn, stopping_signals]
+      if tpu_estimator_spec.host_call is not None:
+        to_record['host_call'] = tpu_estimator_spec.host_call
+      host_calls.record(to_record)
+
+      with ops.control_dependencies(host_calls.create_enqueue_op()):
+        return _StopSignals.as_scalar_stopping_signal(stopping_signals)
+
+    return predict_step, host_calls, captured_scaffold_fn
+
+  def _call_model_fn(self, features, labels, is_export_mode=True):
     """Calls the model_fn with required parameters."""
     model_fn_args = util.fn_args(self._model_fn)
     kwargs = {}
@@ -1099,17 +1218,16 @@ class _ModelFnWrapper(object):
       kwargs['params'] = params
 
     if 'params' not in model_fn_args:
-      raise ValueError(
-          'model_fn ({}) does not include params argument, '
-          'required by TPUEstimator to pass batch size as '
-          'params[\'batch_size\']'.format(self._model_fn))
+      raise ValueError('model_fn ({}) does not include params argument, '
+                       'required by TPUEstimator to pass batch size as '
+                       'params[\'batch_size\']'.format(self._model_fn))
 
     batch_size_for_model_fn = self._ctx.batch_size_for_model_fn
     if batch_size_for_model_fn is not None:
       params[_BATCH_SIZE_KEY] = batch_size_for_model_fn
 
     estimator_spec = self._model_fn(features=features, **kwargs)
-    if (self._ctx.is_running_on_cpu() and
+    if (self._ctx.is_running_on_cpu(is_export_mode) and
         isinstance(estimator_spec, TPUEstimatorSpec)):
       # The estimator_spec will be passed to `Estimator` directly, which expects
       # type `EstimatorSpec`.
@@ -1129,161 +1247,262 @@ class _ModelFnWrapper(object):
       raise ValueError(err_msg.format('training_hooks'))
     if estimator_spec.evaluation_hooks:
       raise ValueError(err_msg.format('evaluation_hooks'))
+
+    if estimator_spec.scaffold:
+      logging.warning('EstimatorSpec.Scaffold is ignored by TPU train/eval. '
+                      'Please use TPUEstimatorSpec.')
     return estimator_spec
 
 
-class _EvalMetrics(object):
-  """Class wraps TPUEstimator.eval_metrics."""
+class _OutfeedHostCall(object):
+  """Support for `eval_metrics` and `host_call` in TPUEstimatorSpec."""
 
   def __init__(self, ctx):
     self._ctx = ctx
-    self._metric_fn = None
-    self._is_dict = False
-    self._tensor_keys = []
-    self._tensors = []
-    self._tensor_dtypes = []
-    self._tensor_shapes = []
-    self._recorded = False
+    self._names = []
+    # All of these are dictionaries of lists keyed on the name.
+    self._host_fns = {}
+    self._tensor_keys = collections.defaultdict(list)
+    self._tensors = collections.defaultdict(list)
+    self._tensor_dtypes = collections.defaultdict(list)
+    self._tensor_shapes = collections.defaultdict(list)
 
   @staticmethod
-  def validate(eval_metrics):
-    """Validates the `eval_metrics` in `TPUEstimatorSpec`."""
-
-    if not isinstance(eval_metrics, (tuple, list)):
-      raise ValueError('eval_metrics should be tuple or list')
-    if len(eval_metrics) != 2:
-      raise ValueError('eval_metrics should have two elements.')
-    if not callable(eval_metrics[0]):
-      raise TypeError('eval_metrics[0] should be callable.')
-    if not isinstance(eval_metrics[1], (tuple, list, dict)):
-      raise ValueError('eval_metrics[1] should be tuple or list, or dict.')
-
-    if isinstance(eval_metrics[1], (tuple, list)):
-      fn_args = util.fn_args(eval_metrics[0])
-      if len(eval_metrics[1]) != len(fn_args):
-        raise RuntimeError(
-            'In TPUEstimatorSpec.eval_metrics, length of tensors does not '
-            'match method args of metric_fn.')
+  def validate(host_calls):
+    """Validates the `eval_metrics` and `host_call` in `TPUEstimatorSpec`."""
+
+    for name, host_call in host_calls.items():
+      if not isinstance(host_call, (tuple, list)):
+        raise ValueError('{} should be tuple or list'.format(name))
+      if len(host_call) != 2:
+        raise ValueError('{} should have two elements.'.format(name))
+      if not callable(host_call[0]):
+        raise TypeError('{}[0] should be callable.'.format(name))
+      if not isinstance(host_call[1], (tuple, list, dict)):
+        raise ValueError('{}[1] should be tuple or list, or dict.'.format(name))
+
+      if isinstance(host_call[1], (tuple, list)):
+        fullargspec = tf_inspect.getfullargspec(host_call[0])
+        fn_args = util.fn_args(host_call[0])
+        # wrapped_hostcall_with_global_step uses varargs, so we allow that.
+        if fullargspec.varargs is None and len(host_call[1]) != len(fn_args):
+          raise RuntimeError(
+              'In TPUEstimatorSpec.{}, length of tensors {} does not match '
+              'method args of the function, which takes {}.'.format(
+                  name, len(host_call[1]), len(fn_args)))
 
   @staticmethod
-  def to_metric_metric_ops_for_cpu(eval_metrics):
-    """Converts `TPUEstimatorSpec.eval_metrics` to `eval_metric_ops` for CPU."""
-    if not eval_metrics:
-      return None
-
-    _EvalMetrics.validate(eval_metrics)
+  def create_cpu_hostcall(host_calls):
+    """Runs on the host_call on CPU instead of TPU when use_tpu=False."""
+
+    _OutfeedHostCall.validate(host_calls)
+    ret = {}
+    for name, host_call in host_calls.items():
+      host_fn, tensors = host_call
+      if isinstance(tensors, (tuple, list)):
+        ret[name] = host_fn(*tensors)
+      else:
+        # Must be dict.
+        try:
+          ret[name] = host_fn(**tensors)
+        except TypeError as e:
+          logging.warning(
+              'Exception while calling %s: %s. It is likely the tensors '
+              '(%s[1]) do not match the '
+              'function\'s arguments', name, e, name)
+          raise e
+    return ret
+
+  def record(self, host_calls):
+    """Records the host_call structure."""
+
+    for name, host_call in host_calls.items():
+      host_fn, tensor_list_or_dict = host_call
+      self._names.append(name)
+      self._host_fns[name] = host_fn
+
+      if isinstance(tensor_list_or_dict, dict):
+        for (key, tensor) in six.iteritems(tensor_list_or_dict):
+          self._tensor_keys[name].append(key)
+          self._tensors[name].append(tensor)
+          self._tensor_dtypes[name].append(tensor.dtype)
+          self._tensor_shapes[name].append(tensor.shape)
+      else:
+        # List or tuple.
+        self._tensor_keys[name] = None
+        for tensor in tensor_list_or_dict:
+          self._tensors[name].append(tensor)
+          self._tensor_dtypes[name].append(tensor.dtype)
+          self._tensor_shapes[name].append(tensor.shape)
 
-    metric_fn, tensors = eval_metrics
+  def create_enqueue_op(self):
+    """Create the op to enqueue the recorded host_calls.
 
-    if isinstance(tensors, (tuple, list)):
-      return metric_fn(*tensors)
-    else:
-      # Must be dict.
-      try:
-        return metric_fn(**tensors)
-      except TypeError as e:
-        logging.warning(
-            'Exception while calling metric_fn for evalution: %s. '
-            'It is likely the tensors (eval_metrics[1]) do not match the '
-            'metric_fn arguments', e)
-        raise e
-
-  def record(self, spec):
-    """Records the eval_metrics structure in `spec`."""
-    if self._recorded:
-      raise RuntimeError('Eval metrics have been recorded already.')
-
-    self._metric_fn, tensor_list_or_dict = spec.eval_metrics
-
-    if isinstance(tensor_list_or_dict, dict):
-      self._is_dict = True
-      for (key, tensor) in six.iteritems(tensor_list_or_dict):
-        self._tensor_keys.append(key)
-        self._tensors.append(tensor)
-        self._tensor_dtypes.append(tensor.dtype)
-        self._tensor_shapes.append(tensor.shape)
-    else:
-      # List or tuple.
-      self._is_dict = False
-      self._tensors = tensor_list_or_dict
-      for tensor in tensor_list_or_dict:
-        self._tensor_dtypes.append(tensor.dtype)
-        self._tensor_shapes.append(tensor.shape)
-    self._recorded = True
+    Returns:
+      A list of enqueue ops, which is empty if there are no host calls.
+    """
+    if not self._names:
+      return []
 
-  @property
-  def outfeed_tensors(self):
-    if not self._recorded:
-      raise RuntimeError('Eval metrics have not been recorded yet')
-    return self._tensors
+    tensors = []
+    # TODO(jhseu): Consider deduping tensors.
+    for name in self._names:
+      tensors.extend(self._tensors[name])
 
-  def to_metric_metric_ops_for_tpu(self, dummy_update_op):
-    """Creates the eval_metric_ops now based on the TPU outfeed.
+    with ops.device(tpu.core(0)):
+      return [tpu_ops.outfeed_enqueue_tuple(tensors)]
 
-    `eval_metric_ops` is defined in `EstimatorSpec`. From all shards, tensors
-    are dequeued from outfeed and then concatenated (along batch size dimension)
-    to form  global-like tensors. All global-like tensors are passed to the
-    metric fn.
+  def create_tpu_hostcall(self):
+    """Sends the tensors through outfeed and runs the host_fn on CPU.
 
-    Args:
-      dummy_update_op: A dummy update op.
+    The tensors are concatenated along dimension 0 to form a global tensor
+    across all shards. The concatenated function is passed to the host_fn and
+    executed on the first host.
 
     Returns:
-      A tuple of (`eval_metric_ops` and `update_ops`), where `update_ops` should
-      be invoked in Outfeed dequeue thread, which drive the outfeed dequeue and
-      update the state of metrics.
+      A dictionary mapping name to the return type of the host_call by that
+      name.
 
     Raises:
       RuntimeError: If outfeed tensor is scalar.
     """
+    if not self._names:
+      return []
 
-    num_cores = self._ctx.num_cores
-
+    ret = {}
     # For each i, dequeue_ops[i] is a list containing the tensors from all
     # shards. This list is concatenated later.
     dequeue_ops = []
-    for i in xrange(len(self._tensors)):
-      dequeue_ops.append([])
-
-    # Outfeed ops execute on each JF node.
+    tensor_dtypes = []
+    tensor_shapes = []
+    for name in self._names:
+      for _ in self._tensors[name]:
+        dequeue_ops.append([])
+      for dtype in self._tensor_dtypes[name]:
+        tensor_dtypes.append(dtype)
+      for shape in self._tensor_shapes[name]:
+        tensor_shapes.append(shape)
+
+    # Outfeed ops execute on each replica's first logical core. Note: we must
+    # constraint it such that we have at most one outfeed dequeue and enqueue
+    # per replica.
     tpu_device_placement_fn = self._ctx.tpu_device_placement_function
-    for i in xrange(num_cores):
+    for i in xrange(self._ctx.num_replicas):
       with ops.device(tpu_device_placement_fn(i)):
         outfeed_tensors = tpu_ops.outfeed_dequeue_tuple(
-            dtypes=self._tensor_dtypes, shapes=self._tensor_shapes)
+            dtypes=tensor_dtypes, shapes=tensor_shapes)
         for j, item in enumerate(outfeed_tensors):
           dequeue_ops[j].append(item)
 
-    # It is assumed evaluation always happends on single host TPU system. So,
+    # Deconstruct dequeue ops.
+    dequeue_ops_by_name = {}
+    pos = 0
+    for name in self._names:
+      dequeue_ops_by_name[name] = dequeue_ops[pos:pos+len(self._tensors[name])]
+      pos += len(self._tensors[name])
+
+    # It is assumed evaluation always happens on single host TPU system. So,
     # place all ops on tpu host if possible.
+    #
+    # TODO(jhseu): Evaluate whether this is right for summaries.
     with ops.device(self._ctx.tpu_host_placement_function(core_id=0)):
-      for i, item in enumerate(dequeue_ops):
-        if dequeue_ops[i][0].shape.ndims == 0:
-          raise RuntimeError(
-              'All tensors outfed from TPU should preseve batch size '
-              'dimension, but got scalar {}'.format(dequeue_ops[i][0]))
-        # TODO(xiejw): Allow users to specify the axis for batch size dimension.
-        dequeue_ops[i] = array_ops.concat(dequeue_ops[i], axis=0)
+      for name in self._names:
+        dequeue_ops = dequeue_ops_by_name[name]
+        for i, item in enumerate(dequeue_ops):
+          if dequeue_ops[i][0].shape.ndims == 0:
+            raise RuntimeError(
+                'All tensors outfed from TPU should preserve batch size '
+                'dimension, but got scalar {}'.format(dequeue_ops[i][0]))
+          # TODO(xiejw): Allow users to specify the axis for batch size
+          # dimension.
+          dequeue_ops[i] = array_ops.concat(dequeue_ops[i], axis=0)
+
+        if self._tensor_keys[name] is not None:
+          # The user-provided eval_metrics[1] is a dict.
+          dequeue_ops = dict(zip(self._tensor_keys[name], dequeue_ops))
+          try:
+            ret[name] = self._host_fns[name](**dequeue_ops)
+          except TypeError as e:
+            logging.warning(
+                'Exception while calling %s: %s. It is likely the tensors '
+                '(%s[1]) do not match the '
+                'function\'s arguments', name, e, name)
+            raise e
+        else:
+          ret[name] = self._host_fns[name](*dequeue_ops)
 
-      if self._is_dict:
-        dequeue_ops = dict(zip(self._tensor_keys, dequeue_ops))
-        try:
-          eval_metric_ops = self._metric_fn(**dequeue_ops)
-        except TypeError as e:
-          logging.warning(
-              'Exception while calling metric_fn for evalution: %s. '
-              'It is likely the tensors (eval_metrics[1]) do not match the '
-              'metric_fn arguments', e)
-          raise e
-      else:
-        eval_metric_ops = self._metric_fn(*dequeue_ops)
+    return ret
+
+
+class _OutfeedHostCallHook(session_run_hook.SessionRunHook):
+  """Hook to run host calls when use_tpu=False."""
+
+  def __init__(self, tensors):
+    self._tensors = tensors
+
+  def begin(self):
+    # We duplicate this code from the TPUInfeedOutfeedSessionHook rather than
+    # create a separate hook to guarantee execution order, because summaries
+    # need to be initialized before the outfeed thread starts.
+    # TODO(jhseu): Make a wrapper hook instead?
+    self._init_ops = contrib_summary.summary_writer_initializer_op()
+    # Get all the writer resources from the initializer, so we know what to
+    # flush.
+    self._finalize_ops = []
+    for op in self._init_ops:
+      self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0]))
+
+  def after_create_session(self, session, coord):
+    session.run(self._init_ops)
 
-    eval_update_ops = []
-    for k, v in eval_metric_ops.items():
-      eval_metric_ops[k] = (v[0], dummy_update_op)
-      eval_update_ops.append(v[1])
+  def before_run(self, run_context):
+    return basic_session_run_hooks.SessionRunArgs(self._tensors)
+
+  def end(self, session):
+    session.run(self._finalize_ops)
+
+
+class ExamplesPerSecondHook(basic_session_run_hooks.StepCounterHook):
+  """Count examples during runtime."""
+
+  def __init__(self,
+               batch_size,
+               every_n_steps=100,
+               every_n_secs=None,
+               output_dir=None,
+               summary_writer=None):
+    self._batch_size = batch_size
+    super(ExamplesPerSecondHook, self).__init__(
+        every_n_steps=every_n_steps,
+        every_n_secs=every_n_secs,
+        output_dir=output_dir,
+        summary_writer=summary_writer)
+
+  def _log_and_record(self, elapsed_steps, elapsed_time, global_step):
+    examples_per_sec = self._batch_size * elapsed_steps / elapsed_time
+    if self._summary_writer is not None:
+      example_summary = Summary(value=[
+          Summary.Value(tag='examples_sec', simple_value=examples_per_sec)
+      ])
+      self._summary_writer.add_summary(example_summary, global_step)
+    logging.info('examples/sec: %g', examples_per_sec)
+
+
+class InstallSignalHandlerHook(session_run_hook.SessionRunHook):
+  """Change SIGINT (CTRL^C) handler to force quit the process.
+
+  The default behavior often results in hanging processes.
+  The original handler is restored after training/evaluation.
+  """
+
+  def __init__(self):
+    self._signal_fn = signal.getsignal(signal.SIGINT)
+
+  def before_run(self, run_context):
+    signal.signal(signal.SIGINT, signal.SIG_DFL)
 
-    return eval_metric_ops, eval_update_ops
+  def end(self, session):
+    signal.signal(signal.SIGINT, self._signal_fn)
 
 
 class TPUEstimator(estimator_lib.Estimator):
@@ -1293,30 +1512,28 @@ class TPUEstimator(estimator_lib.Estimator):
   replicating inputs and models for each core, and returning to host
   periodically to run hooks.
 
-  If `use_tpu` is false, all training, evaluation, and predict are executed on
-  CPU.
-
-  For training, TPUEstimator transforms a global batch size in params to a
-  per-shard batch size when calling the `input_fn` and `model_fn`. Users should
-  specify `train_batch_size` in constructor, and then get the batch size for
-  each shard in `input_fn` and `model_fn` by `params['batch_size']`. If
-  `TPUConfig.per_host_input_for_training` is `True`, `input_fn` is invoked per
-  host rather than per core. In this case, a global batch size is transformed a
-  per-host batch size in params for `input_fn`, but `model_fn` still gets
-  per-core batch size.
-
-  For evaluation, if `eval_batch_size` is None, it is executed on CPU, even if
-  `use_tpu` is `True`. If `eval_batch_size` is not `None`, it is executed on
-  TPU, which is an experimental feature. In this case, `model_fn` should return
-  `TPUEstimatorSpec` instead of `EstimatorSpec`, which expects the
-  `eval_metrics` for TPU evaluation.
-
+  TPUEstimator transforms a global batch size in params to a per-shard batch
+  size when calling the `input_fn` and `model_fn`. Users should specify
+  global batch size in constructor, and then get the batch size for each shard
+  in `input_fn` and `model_fn` by `params['batch_size']`.
+  For training, `model_fn` gets per-core batch size; `input_fn` may get
+  per-core or per-host batch size depending on
+  `per_host_input_for_training` in `TPUConfig`.
+  For evaluation, `model_fn` gets per-core batch size and `input_fn` get
+  per-host batch size.
+
+  `model_fn` should return `TPUEstimatorSpec`, which expects the `eval_metrics`
+  for TPU evaluation.
   `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`, where
   `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. (See
   `TPUEstimatorSpec` for details).  `metric_fn` takes the `tensors` and returns
   a dict from metric string name to the result of calling a metric function,
   namely a `(metric_tensor, update_op)` tuple.
 
+  One can set `use_tpu` to `False` for testing. All training, evaluation, and
+  predict will be executed on CPU. `input_fn` and `model_fn` will receive
+  `train_batch_size` or `eval_batch_size` unmodified as `params['batch_size']`.
+
   Current limitations:
 
   1. TPU evaluation only works on single host.
@@ -1371,6 +1588,7 @@ class TPUEstimator(estimator_lib.Estimator):
                use_tpu=True,
                train_batch_size=None,
                eval_batch_size=None,
+               predict_batch_size=None,
                batch_axis=None):
     """Constructs an `TPUEstimator` instance.
 
@@ -1389,18 +1607,17 @@ class TPUEstimator(estimator_lib.Estimator):
         basic python types. There are reserved keys for `TPUEstimator`,
         including 'batch_size'.
       use_tpu: A bool indicating whether TPU support is enabled. Currently,
-        - TPU training respects this bit.
-        - If true, see `eval_batch_size` for evaluate support.
+        - TPU training and evaluation respect this bit.
         - Predict still happens on CPU.
       train_batch_size: An int representing the global training batch size.
         TPUEstimator transforms this global batch size to a per-shard batch
         size, as params['batch_size'], when calling `input_fn` and `model_fn`.
-        Cannot be `None` if `use_tpu` is `True`. Must be divisible by
-        `config.tpu_config.num_shards`.
-      eval_batch_size: An int representing the global training batch size.
-        Currently, if `None`, evaluation is still executed on CPU (even when
-        `use_tpu` is True). In near future, `use_tpu` will be the only option to
-        switch between TPU/CPU evaluation.
+        Cannot be `None` if `use_tpu` is `True`.
+        Must be divisible by total number of replicas.
+      eval_batch_size: An int representing evaluation batch size.
+        Must be divisible by total number of replicas.
+      predict_batch_size: An int representing the prediction batch size.
+        Must be divisible by total number of replicas.
       batch_axis: A python tuple of int values describing how each tensor
         produced by the Estimator `input_fn` should be split across the TPU
         compute shards. For example, if your input_fn produced (images, labels)
@@ -1420,35 +1637,28 @@ class TPUEstimator(estimator_lib.Estimator):
           '`config` must be provided with type `tpu_config.RunConfig`')
 
     if params is not None and any(k in params for k in _RESERVED_PARAMS_KEYS):
-      raise ValueError(
-          '{} are reserved keys but existed in params {}.'.format(
-              _RESERVED_PARAMS_KEYS, params))
+      raise ValueError('{} are reserved keys but existed in params {}.'.format(
+          _RESERVED_PARAMS_KEYS, params))
 
     if use_tpu:
+      # Perform some very basic validations. More validations will be found in
+      # _TPUContext.
       if train_batch_size is None:
         raise ValueError('`train_batch_size` cannot be `None`')
-      if not isinstance(train_batch_size, int):
-        raise ValueError('`train_batch_size` must be an int')
-      if train_batch_size < 1:
-        raise ValueError('`train_batch_size` must be positive')
-
-      # The specified batch size is the batch size for the entire computation.
-      # The input_fn and model_fn are called per-shard, so we want to calculate
-      # the per-shard batch size and pass that.
-      if train_batch_size % config.tpu_config.num_shards != 0:
+      util_lib.check_positive_integer(train_batch_size, 'train_batch_size')
+
+      if (not config.tpu_config.per_host_input_for_training and
+          config.tpu_config.computation_shape):
         raise ValueError(
-            'train batch size {} must be divisible by number of shards {}'
-            .format(train_batch_size, config.tpu_config.num_shards))
+            'Model parallelism only supports per host input for training. '
+            'Please adjust TPURunconfig.per_host_input_for_training.')
 
       if eval_batch_size is not None:
-        if config.tpu_config.num_shards > 8:
-          raise NotImplementedError(
-              'TPU evaluation is only supported with one host.')
+        util_lib.check_positive_integer(eval_batch_size, 'eval_batch_size')
 
-        if eval_batch_size % config.tpu_config.num_shards != 0:
-          raise ValueError(
-              'eval batch size {} must be divisible by number of shards {}'
-              .format(eval_batch_size, config.tpu_config.num_shards))
+      if predict_batch_size is not None:
+        util_lib.check_positive_integer(predict_batch_size,
+                                        'predict_batch_size')
 
     # Verifies the model_fn signature according to Estimator framework.
     estimator_lib._verify_model_fn_args(model_fn, params)  # pylint: disable=protected-access
@@ -1468,8 +1678,11 @@ class TPUEstimator(estimator_lib.Estimator):
         self._config.tpu_config.iterations_per_loop)
 
     # All properties passed to _TPUContext are immutable.
-    self._ctx = _TPUContext(self._config, train_batch_size, eval_batch_size,
-                            use_tpu)
+    # pylint: disable=protected-access
+    self._ctx = tpu_context._get_tpu_context(
+        self._config, train_batch_size,
+        eval_batch_size, predict_batch_size,
+        use_tpu)
 
   def _create_global_step(self, graph):
     """Creates a global step suitable for TPUs.
@@ -1503,8 +1716,9 @@ class TPUEstimator(estimator_lib.Estimator):
     if max_steps is not None:
       util_lib.check_positive_integer(max_steps, 'Train max_steps')
 
-    return [_TPUStopAtStepHook(self._iterations_per_training_loop,
-                               steps, max_steps)]
+    return [
+        _TPUStopAtStepHook(self._iterations_per_training_loop, steps, max_steps)
+    ]
 
   def _convert_eval_steps_to_hooks(self, steps):
     with self._ctx.with_mode(model_fn_lib.ModeKeys.EVAL) as ctx:
@@ -1516,11 +1730,11 @@ class TPUEstimator(estimator_lib.Estimator):
 
     util_lib.check_positive_integer(steps, 'Eval steps')
 
-    hooks = []
-    hooks.append(evaluation._StopAfterNEvalsHook(  # pylint: disable=protected-access
-        num_evals=steps))
-    hooks.append(_SetEvalIterationsHook(steps))
-    return hooks
+    return [
+        evaluation._StopAfterNEvalsHook(  # pylint: disable=protected-access
+            num_evals=steps),
+        _SetEvalIterationsHook(steps)
+    ]
 
   def _call_input_fn(self, input_fn, mode):
     """Calls the input function.
@@ -1556,7 +1770,9 @@ class TPUEstimator(estimator_lib.Estimator):
       if batch_size_for_input_fn is not None:
         kwargs['params'][_BATCH_SIZE_KEY] = batch_size_for_input_fn
 
-      if ctx.is_running_on_cpu():
+      # For export_savedmodel, input_fn is never passed to Estimator. So,
+      # `is_export_mode` must be False.
+      if ctx.is_running_on_cpu(is_export_mode=False):
         with ops.device('/device:CPU:0'):
           return input_fn(**kwargs)
 
@@ -1572,6 +1788,7 @@ class TPUEstimator(estimator_lib.Estimator):
       # `features` in `model_fn` signature.
       def _input_fn():
         return input_fn(**kwargs)
+
       return _input_fn
 
   def _augment_model_fn(self, model_fn, batch_axis):
@@ -1582,8 +1799,13 @@ class TPUEstimator(estimator_lib.Estimator):
       with self._ctx.with_mode(mode) as ctx:
         model_fn_wrapper = _ModelFnWrapper(model_fn, config, params, ctx)
 
-        # TODO(jhseu): Move to PREDICT to TPU.
-        if ctx.is_running_on_cpu():
+        # For export_savedmodel, input_fn is never passed to Estimator. So,
+        # if features is callable, it means it is the input_fn passed by
+        # TPUEstimator._call_input_fn. Then we can know if the mode == PREDICT,
+        # it implies, it is the .predict API, not export_savedmodel API.
+        is_export_mode = not callable(features)
+
+        if ctx.is_running_on_cpu(is_export_mode=is_export_mode):
           logging.info('Running %s on CPU', mode)
           return model_fn_wrapper.call_without_tpu(features, labels)
 
@@ -1593,18 +1815,31 @@ class TPUEstimator(estimator_lib.Estimator):
         input_fn = features
 
         input_holders = _InputPipeline(input_fn, batch_axis, ctx)
-        enqueue_ops, dequeue_fn = (
+        enqueue_ops, dequeue_fn, input_hooks, run_infeed_loop_on_coordinator = (
             input_holders.generate_infeed_enqueue_ops_and_dequeue_fn())
 
         if mode == model_fn_lib.ModeKeys.TRAIN:
-          loss = _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn)
+          loss, host_call, scaffold = (
+              _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
+          host_ops = host_call.create_tpu_hostcall()
+          if host_ops is None:
+            host_ops = []
           hooks = [
-              TPUInfeedOutfeedSessionHook(ctx, enqueue_ops),
+              TPUInfeedOutfeedSessionHook(
+                  ctx,
+                  enqueue_ops,
+                  host_ops,
+                  run_infeed_loop_on_coordinator=(
+                      run_infeed_loop_on_coordinator)),
+              ExamplesPerSecondHook(ctx.global_batch_size),
+              InstallSignalHandlerHook(),
               training.LoggingTensorHook(
-                  {'loss': array_ops.identity(loss),
-                   'step': training.get_global_step()},
+                  {
+                      'loss': array_ops.identity(loss),
+                      'step': training.get_global_step()
+                  },
                   every_n_secs=30)
-          ]
+          ] + input_hooks
           summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss)
           with ops.control_dependencies([loss]):
             update_ops = _sync_variables_ops()
@@ -1616,89 +1851,201 @@ class TPUEstimator(estimator_lib.Estimator):
               mode,
               loss=loss,
               training_hooks=hooks,
-              train_op=control_flow_ops.group(*update_ops))
+              train_op=control_flow_ops.group(*update_ops),
+              scaffold=scaffold)
+
+        if mode == model_fn_lib.ModeKeys.EVAL:
+          total_loss, host_calls, scaffold = _eval_on_tpu_system(
+              ctx, model_fn_wrapper, dequeue_fn)
+          iterations_per_loop_var = _create_or_get_iterations_per_loop()
+          mean_loss = math_ops.div(total_loss,
+                                   math_ops.cast(
+                                       iterations_per_loop_var,
+                                       dtype=total_loss.dtype))
+
+          # Creates a dummy metric update_op for all metrics. Estimator expects
+          # all metrics in eval_metric_ops have update_op and calls them one by
+          # one. The real metric update_ops are invoked in a separated thread.
+          # So, here give Estimator the dummy op for all metrics.
+          with ops.control_dependencies([mean_loss]):
+            # After TPU evaluation computation is done (the mean_loss tensor),
+            # reads all variables back from TPU and updates the eval step
+            # counter properly
+            internal_ops_to_run = _sync_variables_ops()
+            internal_ops_to_run.append(
+                _increase_eval_step_op(iterations_per_loop_var))
+            with ops.control_dependencies(internal_ops_to_run):
+              dummy_update_op = control_flow_ops.no_op()
+
+          host_call_ret = host_calls.create_tpu_hostcall()
+          eval_metric_ops = {}
+          eval_update_ops = []
+          for k, v in host_call_ret['eval_metrics'].items():
+            eval_metric_ops[k] = (v[0], dummy_update_op)
+            eval_update_ops.append(v[1])
+
+          if 'host_call' not in host_call_ret:
+            host_ops = []
+          else:
+            host_ops = host_call_ret['host_call']
+          hooks = [
+              TPUInfeedOutfeedSessionHook(
+                  ctx,
+                  enqueue_ops,
+                  eval_update_ops + host_ops,
+                  run_infeed_loop_on_coordinator=(
+                      run_infeed_loop_on_coordinator)),
+          ] + input_hooks
 
-        # Now eval.
-        total_loss, eval_metric_ops = _eval_on_tpu_system(
+          return model_fn_lib.EstimatorSpec(
+              mode,
+              loss=mean_loss,
+              evaluation_hooks=hooks,
+              eval_metric_ops=eval_metric_ops,
+              scaffold=scaffold)
+
+        # Predict
+        assert mode == model_fn_lib.ModeKeys.PREDICT
+
+        dummy_predict_op, host_calls, scaffold = _predict_on_tpu_system(
             ctx, model_fn_wrapper, dequeue_fn)
-        iterations_per_loop_var = _create_or_get_iterations_per_loop()
-        mean_loss = math_ops.div(
-            total_loss,
-            math_ops.cast(iterations_per_loop_var, dtype=total_loss.dtype))
-
-        # Creates a dummy metric update_op for all metrics. Estimator expects
-        # all metrics in eval_metric_ops have update_op and calls them one by
-        # one. The real metric update_ops are invoked in a separated thread. So,
-        # here give Estimator the dummy op for all metrics.
-        with ops.control_dependencies([mean_loss]):
-          # After TPU evaluation computation is done (the mean_loss tensor),
-          # reads all variables back from TPU and updates the eval step counter
-          # properly
+        with ops.control_dependencies([dummy_predict_op]):
           internal_ops_to_run = _sync_variables_ops()
-          internal_ops_to_run.append(
-              _increase_eval_step_op(iterations_per_loop_var))
           with ops.control_dependencies(internal_ops_to_run):
-            dummy_update_op = control_flow_ops.no_op()
+            dummy_predict_op = control_flow_ops.no_op()
+
+        # In train and evaluation, the main TPU program is passed to monitored
+        # training session to run. Infeed enqueue and outfeed dequeue are
+        # executed in side threads. This is not the configuration for
+        # prediction mode.
+        #
+        # For prediction, the Estimator executes the EstimatorSpec.predictions
+        # directly and yield the element (via generator) to call site. So, the
+        # outfeed based prediction must be passed to MonitoredSession directly.
+        # Other parts of the TPU execution are organized as follows.
+        #
+        # 1. All outfeed based Tensors must be grouped with predictions Tensors
+        #    to form a single invocation. This avoid the issue we might trigger
+        #    multiple outfeeds incorrectly. To achieve this, `host_call` is
+        #    placed in control_dependencies of `stopping_signals`, and
+        #    `stopping_signals` is passed into _StoppingPredictHook, which sets
+        #    the `stopping_signals` as SessionRunArgs. MonitoredSession merges
+        #    all SessionRunArgs with the fetch in session.run together.
+        #
+        # 2. The TPU program (dummy_predict_op) and enqueue_ops (infeed Enqueue)
+        #    are grouped together. They will be launched once and only once in
+        #    side threads and they quit naturally according to the SAME stopping
+        #    condition.
+        enqueue_ops.append(dummy_predict_op)
+
+        host_call_ret = host_calls.create_tpu_hostcall()
+        if 'host_call' not in host_call_ret:
+          host_ops = []
+        else:
+          host_ops = host_call_ret['host_call']
+
+        predictions = host_call_ret['predictions']
+        stopping_signals = host_call_ret['signals']
+
+        with ops.control_dependencies(host_ops):
+          host_ops = []  # Empty, we do do not need it anymore.
+          scalar_stopping_signal = _StopSignals.as_scalar_stopping_signal(
+              stopping_signals)
 
-        eval_metric_ops, eval_update_ops = (
-            eval_metric_ops.to_metric_metric_ops_for_tpu(dummy_update_op))
         hooks = [
-            TPUInfeedOutfeedSessionHook(ctx, enqueue_ops, eval_update_ops),
-        ]
+            _StoppingPredictHook(scalar_stopping_signal),
+            TPUInfeedOutfeedSessionHookForPrediction(ctx, enqueue_ops,
+                                                     host_ops),
+        ] + input_hooks
 
         return model_fn_lib.EstimatorSpec(
             mode,
-            loss=mean_loss,
-            evaluation_hooks=hooks,
-            eval_metric_ops=eval_metric_ops)
+            prediction_hooks=hooks,
+            predictions=predictions,
+            scaffold=scaffold)
+
     return _model_fn
 
 
 def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   """Executes `model_fn_wrapper` multiple times on all TPU shards."""
-  num_cores = ctx.num_cores
   iterations_per_loop_var = _create_or_get_iterations_per_loop()
 
-  single_tpu_eval_step, eval_metric_ops = (
+  single_tpu_eval_step, host_calls, captured_scaffold_fn = (
       model_fn_wrapper.convert_to_single_tpu_eval_step(dequeue_fn))
 
   def multi_tpu_eval_steps_on_single_shard():
-    return training_loop.repeat(iterations_per_loop_var,
-                                single_tpu_eval_step,
-                                [_ZERO_LOSS],
-                                name='loop')
+    return training_loop.repeat(
+        iterations_per_loop_var,
+        single_tpu_eval_step, [_ZERO_LOSS],
+        name='loop')
 
-  (loss,) = tpu.shard(multi_tpu_eval_steps_on_single_shard,
-                      inputs=[],
-                      num_shards=num_cores,
-                      outputs_from_all_shards=False)
-  return loss, eval_metric_ops
+  (loss,) = tpu.shard(
+      multi_tpu_eval_steps_on_single_shard,
+      inputs=[],
+      num_shards=ctx.num_replicas,
+      outputs_from_all_shards=False,
+      device_assignment=ctx.device_assignment)
+
+  scaffold = _get_scaffold(captured_scaffold_fn)
+  return loss, host_calls, scaffold
 
 
 def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   """Executes `model_fn_wrapper` multiple times on all TPU shards."""
-  num_cores = ctx.num_cores
   iterations_per_loop_var = _create_or_get_iterations_per_loop()
 
-  single_tpu_train_step = model_fn_wrapper.convert_to_single_tpu_train_step(
-      dequeue_fn)
+  single_tpu_train_step, host_call, captured_scaffold_fn = (
+      model_fn_wrapper.convert_to_single_tpu_train_step(dequeue_fn))
 
   def multi_tpu_train_steps_on_single_shard():
     return training_loop.repeat(
         iterations_per_loop_var,
-        single_tpu_train_step,
-        [_INITIAL_LOSS],
+        single_tpu_train_step, [_INITIAL_LOSS],
         name=b'loop')
 
-  (loss,) = tpu.shard(multi_tpu_train_steps_on_single_shard,
-                      inputs=[],
-                      num_shards=num_cores,
-                      outputs_from_all_shards=False)
-  return loss
+  (loss,) = tpu.shard(
+      multi_tpu_train_steps_on_single_shard,
+      inputs=[],
+      num_shards=ctx.num_replicas,
+      outputs_from_all_shards=False,
+      device_assignment=ctx.device_assignment)
+
+  scaffold = _get_scaffold(captured_scaffold_fn)
+  return loss, host_call, scaffold
+
+
+def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
+  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
+  num_cores = ctx.num_cores
+
+  single_tpu_predict_step, host_calls, captured_scaffold_fn = (
+      model_fn_wrapper.convert_to_single_tpu_predict_step(dequeue_fn))
+
+  def multi_tpu_predict_steps_on_single_shard():
+
+    def cond(scalar_stopping_signal):
+      return math_ops.logical_not(
+          _StopSignals.should_stop(scalar_stopping_signal))
+
+    inputs = [_StopSignals.NON_STOPPING_SIGNAL]
+    outputs = training_loop.while_loop(
+        cond, single_tpu_predict_step, inputs=inputs, name=b'loop')
+    return outputs
+
+  (dummy_predict_op,) = tpu.shard(
+      multi_tpu_predict_steps_on_single_shard,
+      inputs=[],
+      num_shards=num_cores,
+      outputs_from_all_shards=False)
+
+  scaffold = _get_scaffold(captured_scaffold_fn)
+  return dummy_predict_op, host_calls, scaffold
 
 
 def _wrap_computation_in_while_loop(device, op_fn):
   """Wraps the ops generated by `op_fn` in tf.while_loop."""
+
   def computation(i):
     with ops.control_dependencies(op_fn()):
       return i + 1
@@ -1710,7 +2057,31 @@ def _wrap_computation_in_while_loop(device, op_fn):
     iterations = array_ops.identity(iterations_per_loop_var)
     return control_flow_ops.while_loop(
         lambda i: i < iterations,
-        computation, [constant_op.constant(0)], parallel_iterations=1)
+        computation, [constant_op.constant(0)],
+        parallel_iterations=1)
+
+
+def _wrap_computation_in_while_loop_with_stopping_signals(device, op_fn):
+  """Wraps the ops generated by `op_fn` in tf.while_loop."""
+
+  def cond(scalar_stopping_signal):
+    return math_ops.logical_not(
+        _StopSignals.should_stop(scalar_stopping_signal))
+
+  def computation(unused_scalar_stopping_signal):
+    return_value = op_fn()
+    execute_ops = return_value['ops']
+    signals = return_value['signals']
+    with ops.control_dependencies(execute_ops):
+      return _StopSignals.as_scalar_stopping_signal(signals)
+
+  # By setting parallel_iterations=1, the parallel execution in while_loop is
+  # basically turned off.
+  with ops.device(device):
+    return control_flow_ops.while_loop(
+        cond,
+        computation, [_StopSignals.NON_STOPPING_SIGNAL],
+        parallel_iterations=1)
 
 
 def _validate_tpu_training_graph():
@@ -1723,10 +2094,274 @@ def _validate_tpu_training_graph():
 
   # Check if there is atleast one CrossReplicaSum operation in the graph
   # This should be introduced by using the CrossShardOptimizer wrapper
-  cross_replica_sum_ops = [o for o in operations
-                           if o.type == _CROSS_REPLICA_SUM_OP]
+  cross_replica_sum_ops = [
+      o for o in operations if o.type == _CROSS_REPLICA_SUM_OP
+  ]
   if not cross_replica_sum_ops:
     raise ValueError(
         'CrossShardOptimizer must be used for model training on TPUs.')
 
 
+class _CapturedObject(object):
+  """A placeholder to capture an object.
+
+  This is useful when we need to capture a Python object in the Tensorflow
+  control flow body function and use it outside the control flow.
+  """
+
+  def __init__(self):
+    self._object = None
+    self._captured = False
+
+  def capture(self, o):
+    if self._captured:
+      raise RuntimeError(
+          'InternalError: Object can be captured only. Please file bug .')
+
+    self._captured = True
+    self._object = o
+
+  def get(self):
+    if not self._captured:
+      raise RuntimeError(
+          'InternalError: Object is not captured properly before `get`. '
+          'Please file bug .')
+    return self._object
+
+
+def _get_scaffold(captured_scaffold_fn):
+  """Retrieves the Scaffold from `captured_scaffold_fn`."""
+  with _CapturingContext(message='Inside scaffold_fn'):
+    scaffold_fn = captured_scaffold_fn.get()
+    if scaffold_fn:
+      scaffold = scaffold_fn()
+      if scaffold is None:
+        raise ValueError(
+            'TPUEstimatorSpec.scaffold_fn returns None, which is not allowed')
+    else:
+      scaffold = None
+
+  if scaffold:
+    wrapped_finalize = scaffold.finalize
+
+    def _finalize():
+      with _CapturingContext('Inside Scaffold.finalize'):
+        wrapped_finalize()
+
+    scaffold.finalize = _finalize
+  return scaffold
+
+
+class _CapturingContext(control_flow_ops.ControlFlowContext):
+  """Tracks references to Tensors defined in TPU replication."""
+
+  def __init__(self, message):
+    control_flow_ops.ControlFlowContext.__init__(self)
+    self._message = message
+
+  def AddOp(self, op):  # pylint: disable=invalid-name
+    for c in op.inputs:
+      if tpu._TPU_REPLICATE_ATTR in c.op.node_def.attr:  # pylint: disable=protected-access
+        raise ValueError('{}: Op {} depends on TPU computation {}, '
+                         'which is not allowed.'.format(self._message, op, c))
+
+  def __enter__(self):
+    # pylint: disable=protected-access
+    self._g = ops.get_default_graph()
+    self._old = self._g._get_control_flow_context()
+    self._g._set_control_flow_context(self)
+    # pylint: enable=protected-access
+
+  def __exit__(self, _, __, ___):  # pylint: disable=invalid-name
+    self._g._set_control_flow_context(self._old)  # pylint: disable=protected-access
+
+
+class _Inputs(object):
+  """A data structure representing the input_fn returned values.
+
+  This also supports the returned value from input_fn as `Dataset`.
+  """
+
+  def __init__(self, features=None, labels=None, dataset=None, signals=None):
+    if dataset is not None and (features is not None or labels is not None or
+                                signals is not None):
+      raise RuntimeError('Internal Error: Either (features and labels) or '
+                         'dataset should be provided, not both. Please file '
+                         'bug')
+
+    self._features = features
+    self._labels = labels
+    self._signals = signals
+
+    self._dataset = dataset
+    self._iterator = None
+
+  @staticmethod
+  def from_input_fn(return_values):
+    """Returns an `_Inputs` instance according to `input_fn` return value."""
+    if isinstance(return_values, dataset_ops.Dataset):
+      dataset = return_values
+      return _Inputs(dataset=dataset)
+
+    features, labels = _Inputs._parse_inputs(return_values)
+    return _Inputs(features, labels)
+
+  @staticmethod
+  def _parse_inputs(return_values):
+    if isinstance(return_values, tuple):
+      features, labels = return_values
+    else:
+      features, labels = return_values, None
+    return features, labels
+
+  @property
+  def is_dataset(self):
+    """Returns True if the return value from input_fn is Dataset."""
+    return self._dataset is not None
+
+  def dataset_initializer_hook(self):
+    """Returns a `SessionRunHook` to initialize this dataset.
+
+    This must be called before `features_and_labels`.
+    """
+    iterator = self._dataset.make_initializable_iterator()
+    # pylint: disable=protected-access
+    hook = estimator_lib._DatasetInitializerHook(iterator)
+    self._iterator = iterator
+    return hook
+
+  def features_and_labels(self):
+    """Gets `features` and `labels`."""
+    if self.is_dataset:
+      return _Inputs._parse_inputs(self._iterator.get_next())
+
+    return (self._features, self._labels)
+
+  def signals(self):
+    return self._signals
+
+  @property
+  def dataset(self):
+    return self._dataset
+
+
+# TODO(xiejw): Extend this to support final partial batch.
+class _InputsWithStoppingSignals(_Inputs):
+  """Inputs with `_StopSignals` inserted into the dataset."""
+
+  def __init__(self, dataset, batch_size):
+
+    assert dataset is not None
+
+    user_provided_dataset = dataset.map(
+        _InputsWithStoppingSignals.insert_stopping_signal(
+            stop=False, batch_size=batch_size))
+    final_batch_dataset = dataset.take(1).map(
+        _InputsWithStoppingSignals.insert_stopping_signal(
+            stop=True, batch_size=batch_size))
+    dataset = user_provided_dataset.concatenate(final_batch_dataset).prefetch(2)
+
+    super(_InputsWithStoppingSignals, self).__init__(dataset=dataset)
+    self._current_inputs = None
+
+  def features_and_labels(self):
+    if self._current_inputs is not None:
+      raise RuntimeError(
+          'Internal Error: The previous inputs have not been properly '
+          'consumed. First call features_and_labels, then call signals.')
+
+    inputs_with_signals = self._iterator.get_next()
+    features = inputs_with_signals['features']
+    labels = inputs_with_signals.get('labels')
+
+    self._current_inputs = inputs_with_signals
+    return features, labels
+
+  def signals(self):
+    """Returns the `Signals` from `_Inputs`."""
+    if self._current_inputs is None:
+      raise RuntimeError(
+          'Internal Error: The current inputs have not been properly '
+          'generated. First call features_and_labels, then call signals.')
+    signals = self._current_inputs['signals']
+    self._current_inputs = None
+    return signals
+
+  @staticmethod
+  def insert_stopping_signal(stop, batch_size):
+    """Inserts stopping_signal into dataset via _map_fn.
+
+    Here we change the data structure in the dataset, such that the return value
+    is a dictionary now and `features`, `labels`, and `signals` are three
+    distinguished keys in that dict. This provides a better structure, which
+    eases the process to decompose the inputs (see `features_and_labels`).
+
+    Args:
+      stop: bool, state of current stopping signals.
+      batch_size: int, batch size.
+
+    Returns:
+      A map_fn passed to dataset.map API.
+    """
+
+    def _map_fn(*args):
+      features, labels = _Inputs._parse_inputs(args)
+      new_input_dict = {}
+      new_input_dict['features'] = features
+      if labels is not None:
+        new_input_dict['labels'] = labels
+      new_input_dict['signals'] = _StopSignals(
+          stop=stop, batch_size=batch_size).as_dict()
+      return new_input_dict
+
+    return _map_fn
+
+
+class _StopSignals(object):
+  """Signals class holding all logic to handle TPU stopping condition."""
+
+  NON_STOPPING_SIGNAL = 0.0
+  STOPPING_SIGNAL = 1.0
+
+  def __init__(self, stop, batch_size):
+    self._stop = stop
+    self._batch_size = batch_size
+
+  def as_dict(self):
+    shape = [self._batch_size, 1]
+    dtype = dtypes.float32
+
+    if self._stop:
+      stopping = array_ops.ones(shape=shape, dtype=dtype)
+    else:
+      stopping = array_ops.zeros(shape=shape, dtype=dtype)
+
+    return {'stopping': stopping}
+
+  @staticmethod
+  def as_scalar_stopping_signal(signals):
+    return array_ops.identity(signals['stopping'][0][0])
+
+  @staticmethod
+  def should_stop(scalar_stopping_signal):
+    return scalar_stopping_signal >= _StopSignals.STOPPING_SIGNAL
+
+
+class _SignalsHelper(object):
+  """A general helper class to handle common signals manipulation."""
+
+  def __init__(self, signals):
+    self._signal_keys = []
+    for key in sorted(signals.iterkeys()):
+      self._signal_keys.append(key)
+
+  @property
+  def num_signals(self):
+    return len(self._signal_keys)
+
+  def unflatten(self, tensor_list):
+    return dict(zip(self._signal_keys, tensor_list))
+
+  @staticmethod
+  def as_tensor_list(signals):
+    return [signals[key] for key in sorted(signals.iterkeys())]
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py b/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py
index a00fd1d0869ab4403d879d2fc08f2bba0a13a7a8..e76cf83e4ddcd86ab3971bcecefe2e2dc979bf63 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py
@@ -22,6 +22,7 @@ from __future__ import print_function
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu_function
 from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import optimizer
 
 
@@ -74,8 +75,10 @@ class CrossShardOptimizer(optimizer.Optimizer):
     """
     num_shards = tpu_function.get_tpu_context().number_of_shards
     if num_shards is None:
-      raise ValueError("CrossShardOptimizer must be used within a "
-                       "tpu_shard_context.")
+      logging.warning(
+          "CrossShardOptimizer should be used within a tpu_shard_context, but "
+          "got unset number_of_shards. Assuming 1.")
+      num_shards = 1
     if num_shards > 1 and self._reduction == losses.Reduction.MEAN:
       scale = 1.0 / num_shards
       loss *= scale
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py b/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py
index f8ba7d45e20b2f48e1409427665878df40a6db02..f5af03f33ca8f13af517007672e9ce0e12be6205 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py
@@ -244,7 +244,8 @@ class ShardingPolicy(object):
               str(shapes), self.number_of_shards))
     unsharded_shapes = [self._unshard_shape(s) for s in shapes]
     for i in xrange(self.number_of_shards - 1):
-      if unsharded_shapes[i] != unsharded_shapes[self.number_of_shards - 1]:
+      if not unsharded_shapes[i].is_compatible_with(
+          unsharded_shapes[self.number_of_shards - 1]):
         raise ValueError(
             "sharded shapes %s are not consistent shards of a full shape "
             "sharded %d ways along dimension %d" % (
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..493d1848c072caa5254fc87c67badc2e99ec16ee
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
@@ -0,0 +1,155 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""TPU system metadata and associated tooling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import re
+
+from tensorflow.contrib.tpu.python.tpu import tpu
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import tf_logging as logging
+
+_PINGING_MASTER_TIMEOUT_IN_MS = 60 * 1000  # 1 min
+_RETRY_TIMES = 120
+_INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS = 300 * 1000  # 5 mins
+
+_TPU_DEVICE_REG = re.compile(r'.*task:(\d+)/.*device:TPU:(\d+)$')
+
+# _TPUSystemMetadata is used by TPUEstimator to hold TPU configuration,
+# including num_cores and num_hosts.
+_TPUSystemMetadata = collections.namedtuple('_TPUSystemMetadata', [
+    'num_cores',
+    'num_hosts',
+    'num_of_cores_per_host',
+    'topology',
+    'devices',
+])
+
+
+def _query_tpu_system_metadata(master_address, run_config,
+                               query_topology=False):
+  """Automatically detects the TPU system metadata in the system."""
+  tpu_core_count = 0
+  devices = []
+  device_dict = collections.defaultdict(list)
+
+  retry_count = 1
+  while True:
+    logging.info('Querying Tensorflow master (%s) for TPU system metadata.',
+                 master_address)
+    try:
+      with ops.Graph().as_default():
+        with session_lib.Session(
+            master_address,
+            config=_get_session_config_with_timeout(
+                _PINGING_MASTER_TIMEOUT_IN_MS, run_config)) as sess:
+          devices = sess.list_devices()
+          for device in devices:
+            match = _TPU_DEVICE_REG.match(device.name)
+            if match:
+              host_id = match.group(1)
+              core_id = match.group(2)
+              device_dict[host_id].append(core_id)
+              tpu_core_count += 1
+          break
+    except errors.DeadlineExceededError:
+      msg = ('Fail to connect Tensorflow master. It could be the TPU worker is '
+             'not ready (still under scheduling) or Tensorflow '
+             'master address is correct: got (%s).' %
+             (master_address))
+
+      # TODO(xiejw): For local or grpc master we might not need retry logic
+      # here.
+      if retry_count <= _RETRY_TIMES:
+        logging.warning('%s', msg)
+        logging.warning('Retrying (%d/%d).', retry_count, _RETRY_TIMES)
+        retry_count += 1
+      else:
+        raise ValueError(msg)
+
+  num_of_cores_per_host = 0
+  if tpu_core_count:
+    num_cores_per_host_set = set(
+        [len(core_ids) for core_ids in device_dict.values()])
+    if len(num_cores_per_host_set) != 1:
+      raise RuntimeError(
+          'TPU cores on each host is not same. This should not happen!. '
+          'devices: {}'.format(devices))
+    num_of_cores_per_host = num_cores_per_host_set.pop()
+
+  topology = None
+  if query_topology:
+    if not tpu_core_count:
+      raise RuntimeError(
+          'Cannot find any TPU cores in the system (master address {}). '
+          'This usually means the master address is incorrect or the '
+          'TPU worker has some problems. Available devices: {}'.format(
+              master_address, devices))
+
+    topology = _obtain_topology(master_address, run_config)
+
+  metadata = _TPUSystemMetadata(
+      num_cores=tpu_core_count,
+      num_hosts=len(device_dict),
+      num_of_cores_per_host=num_of_cores_per_host,
+      topology=topology,
+      devices=devices)
+
+  if tpu_core_count:
+    logging.info('Found TPU system:')
+    logging.info('*** Num TPU Cores: %d', metadata.num_cores)
+    logging.info('*** Num TPU Workers: %d', metadata.num_hosts)
+    logging.info('*** Num TPU Cores Per Worker: %d',
+                 metadata.num_of_cores_per_host)
+    logging.info('*** Available Devices: %s', metadata.devices)
+  else:
+    logging.info('Failed to find TPU: %s', metadata)
+  return metadata
+
+
+def _obtain_topology(master_address, run_config):
+  try:
+    logging.info('Initializing TPU system (master: %s) to fetch topology '
+                 'for model parallelism. This might take a while.',
+                 master_address)
+    with ops.Graph().as_default():
+      session_config = _get_session_config_with_timeout(
+          _INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS, run_config)
+      with session_lib.Session(
+          master_address, config=session_config) as sess:
+        topology = sess.run(tpu.initialize_system())
+        return topology
+  except errors.DeadlineExceededError:
+    raise ValueError(
+        'Fail to initialize TPU system with master (%s). '
+        'Please double check the TPU system is functional.' % (
+            master_address))
+
+
+def _get_session_config_with_timeout(timeout_in_secs, run_config):
+  cluster_def = None
+  if run_config.session_config and run_config.session_config.cluster_def.job:
+    cluster_def = run_config.session_config.cluster_def
+
+  config = config_pb2.ConfigProto(
+      operation_timeout_in_ms=timeout_in_secs, cluster_def=cluster_def)
+  return config
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..336d8260c3c8a5c30efa603e3faeabcc0944b8d0
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_test.py
@@ -0,0 +1,80 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Tests for tpu_function helpers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.tpu.python.tpu import tpu
+from tensorflow.contrib.tpu.python.tpu import tpu_feed
+from tensorflow.contrib.tpu.python.tpu import training_loop
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.layers import convolutional
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import math_ops
+
+from tensorflow.python.platform import test
+
+
+class TPUContextTest(test.TestCase):
+
+  def testIsInContext(self):
+    """Test that control_flow_util can check that we're in a TPU context."""
+    z1 = array_ops.identity(1)
+    context = tpu.TPUReplicateContext(b"context")
+    context.Enter()
+    z2 = array_ops.identity(1)
+    context.Exit()
+    self.assertFalse(control_flow_util.IsInXLAContext(z1.op))
+    self.assertTrue(control_flow_util.IsInXLAContext(z2.op))
+
+
+class TPULayerRewriteTest(test.TestCase):
+
+  def testUsingInfeedQueueWithRegularizer(self):
+    """Test that Layer regularizers can reference data created in loops."""
+
+    def make_regularizer(scale):
+      return lambda inputs: scale * math_ops.reduce_sum(math_ops.square(inputs))
+
+    def training_step(inputs, scale):
+      outputs = convolutional.conv2d(
+          inputs,
+          filters=16,
+          kernel_size=(3, 3),
+          data_format="channels_first",
+          kernel_regularizer=make_regularizer(scale))
+      loss = math_ops.reduce_mean(math_ops.square(outputs))
+      return loss.op
+
+    inputs = array_ops.zeros(shape=(128, 32, 32, 16))
+    scale = array_ops.ones(shape=())
+    infeed = tpu_feed.InfeedQueue(
+        tuple_types=[dtypes.float32, dtypes.float32],
+        tuple_shapes=[inputs.shape, scale.shape])
+
+    def loop():
+      return training_loop.repeat(5, training_step, infeed_queue=infeed)
+
+    # This should not throw an error.
+    tpu.rewrite(loop)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tpu/tpu_estimator.md b/tensorflow/contrib/tpu/tpu_estimator.md
new file mode 100644
index 0000000000000000000000000000000000000000..4ef8f9eebdb165e5fe221be8670276bf943159b3
--- /dev/null
+++ b/tensorflow/contrib/tpu/tpu_estimator.md
@@ -0,0 +1,241 @@
+# Using the Estimator API with TPUs
+
+
+This document describes how to train a TensorFlow model on TPUs using the
+Estimator API. If you are interested in the hardware itself, check out the
+[Cloud TPU documentation](https://cloud.google.com/tpu/docs).
+
+The TPU Estimator simplifies running models on a Cloud TPU by automatically
+handling numerous low-level hardware-specific details
+
+[TOC]
+
+## Introduction to Estimator
+
+[TensorFlow
+tutorials](https://www.tensorflow.org/extend/estimators) cover the Estimator
+API. At a high-level, the Estimator API provides:
+
+*   `Estimator.train()` - train a model on a given input for a fixed number of
+    steps.
+*   `Estimator.evaluate()` - evaluate the model on a test set.
+*   `Estimator.predict()` - run inference using the trained model.
+*   `Estimator.export_savedmodel()` - export your model for serving.
+
+In addition, `Estimator` includes default behavior common to training jobs,
+such as saving and restoring checkpoints, creating summaries for TensorBoard,
+etc.
+
+`Estimator` requires you to write a `model_fn` and an `input_fn`, which
+correspond to the model and input portions of your TensorFlow graph.
+
+The following code demonstrates using `TPUEstimator` with MNIST example to
+handle training:
+
+    def model_fn(features, labels, mode, params):
+      """A simple CNN."""
+      del params  # unused
+
+      input_layer = tf.reshape(features, [-1, 28, 28, 1])
+      conv1 = tf.layers.conv2d(
+          inputs=input_layer, filters=32, kernel_size=[5, 5], padding="same",
+          activation=tf.nn.relu)
+      pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
+      conv2 = tf.layers.conv2d(
+          inputs=pool1, filters=64, kernel_size=[5, 5],
+          padding="same", activation=tf.nn.relu)
+      pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
+      pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
+      dense = tf.layers.dense(inputs=pool2_flat, units=128, activation=tf.nn.relu)
+      dropout = tf.layers.dropout(
+          inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)
+      logits = tf.layers.dense(inputs=dropout, units=10)
+      onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10)
+
+      loss = tf.losses.softmax_cross_entropy(
+          onehot_labels=onehot_labels, logits=logits)
+
+      learning_rate = tf.train.exponential_decay(
+          FLAGS.learning_rate, tf.train.get_global_step(), 100000, 0.96)
+
+      optimizer = tpu_optimizer.CrossShardOptimizer(
+          tf.train.GradientDescentOptimizer(learning_rate=learning_rate))
+
+      train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
+      return tpu_estimator.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op)
+
+
+    def get_input_fn(filename):
+      """Returns an `input_fn` for train and eval."""
+
+      def input_fn(params):
+        """An input_fn to parse 28x28 images from filename using tf.data."""
+        batch_size = params["batch_size"]
+
+        def parser(serialized_example):
+          """Parses a single tf.Example into image and label tensors."""
+          features = tf.parse_single_example(
+              serialized_example,
+              features={
+                  "image_raw": tf.FixedLenFeature([], tf.string),
+                  "label": tf.FixedLenFeature([], tf.int64),
+              })
+          image = tf.decode_raw(features["image_raw"], tf.uint8)
+          image.set_shape([28 * 28])
+          # Normalize the values of the image from the range [0, 255] to [-0.5, 0.5]
+          image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
+          label = tf.cast(features["label"], tf.int32)
+          return image, label
+
+        dataset = tf.contrib.data.TFRecordDataset(
+            filename, buffer_size=FLAGS.dataset_reader_buffer_size)
+        dataset = dataset.map(parser).cache().repeat().batch(batch_size)
+        images, labels = dataset.make_one_shot_iterator().get_next()
+        # set_shape to give inputs statically known shapes.
+        images.set_shape([batch_size, 28 * 28])
+        labels.set_shape([batch_size])
+        return images, labels
+      return input_fn
+
+
+    def main(unused_argv):
+
+      tf.logging.set_verbosity(tf.logging.INFO)
+
+      run_config = tpu_config.RunConfig(
+          master=FLAGS.master,
+          model_dir=FLAGS.model_dir,
+          session_config=tf.ConfigProto(
+              allow_soft_placement=True, log_device_placement=True),
+          tpu_config=tpu_config.TPUConfig(FLAGS.iterations, FLAGS.num_shards),)
+
+      estimator = tpu_estimator.TPUEstimator(
+          model_fn=model_fn,
+          use_tpu=FLAGS.use_tpu,
+          train_batch_size=FLAGS.batch_size,
+          eval_batch_size=FLAGS.batch_size,
+          config=run_config)
+
+      estimator.train(input_fn=get_input_fn(FLAGS.train_file),
+                      max_steps=FLAGS.train_steps)
+
+
+Although this code is quite simple by appearance, there are some new
+concepts to learn for using `TPU`s. The next section will cover the most
+important details.
+
+## New Concepts Related to TPU/TPUEstimator
+
+TF programs run with `TPU Estimator` use an [in-graph
+replication](https://www.tensorflow.org/deploy/distributed) approach.
+
+In-graph replication (also known as single-session replication) differs from
+the between-graph replication (also known as multi-session replication)
+training typically used in distributed TensorFlow. The major
+differences include:
+
+1. The TensorFlow Session master is not local anymore. The user python program
+   creates one single graph that is replicated across all the cores in the Cloud
+   TPU. The typical configuration today sets the TensorFlow session master to be
+   the first worker.
+
+1. The input pipeline is placed on remote hosts (instead of local) to ensure the
+   training examples can be fed as fast as possible to TPU system. All queue-based 
+   input pipelines do not work effectively. Dataset (tf.data) is
+   required.
+
+1. Workers in the TPU system operate in synchronous fashion, and each perform
+   the same step at the same time.
+
+Regarding programming model, _"The programmer picks a (large) batch size B and
+writes the program (and sets hyperparameters) based on that batch size. The
+system distributes the computation across the available devices."
+
+To align these, `TPUEstimator` wraps the computation (the `model_fn`) and
+distributes it to all available TPU chips. 
+
+To summarize:
+
+- The `input_fn` models the input pipeline running on remote host CPU. Use
+  `tf.data` to program the input Ops. `input_fn` is expected to be invoked
+  multiple times when using TPU pods. Each handles one device's input of the
+  global batch. The shard batch size should be retrieved from
+  `params['batch_size']`. We plan to provide better abstraction about the
+  sharding mechanism for `tf.data` to remove the `params['batch_size']`.
+
+- The `model_fn` models the computation which will be replicated and distributed
+  to all TPU chips. It should only contains ops that are supported by TPUs.
+
+## Convert from Vanilla Estimator to TPUEstimator
+
+It is always recommended to port a small, simple model first to make sure that
+you are familiar with the basic concepts of `TPUEstimator` and test end-to-end
+behavior. Once your simple model runs, gradually add more functionality.
+In addition, there are several sample models, available at
+[github.com/tensorflow/tpu-demos](https://github.com/tensorflow/tpu-demos).
+
+To convert your code from the vanilla `Estimator` class to use TPUs, change the
+following (note some of the details may change over time):
+
+- Switch from `tf.estimator.RunConfig` to `tf.contrib.tpu.RunConfig`.
+- Set the `TPUConfig` (part of the `tf.contrib.tpu.RunConfig`) to specify the
+  `iterations_per_loop`, number of iterations to run on the TPU device for one
+  `session.run` call (per training loop), and `num_shards`, the number of shards
+  (typically the number of TPU cores you’re running on). TPUs run a number of
+  iterations of the training loop before returning to host. Until all iterations
+  on the TPU device are run, no checkpoints or summaries will be saved. In the
+  future, we’ll choose a reasonable default.
+- In `model_fn`, use `tf.contrib.tpu.CrossShardOptimizer` to wrap your
+  optimizer. Example:
+
+         optimizer = tpu_optimizer.CrossShardOptimizer(
+              tf.train.GradientDescentOptimizer(learning_rate=learning_rate))
+
+- Switch from `tf.estimator.Estimator` to `tf.contrib.tpu.TPUEstimator`.
+
+The default `RunConfig` will save summaries for TensorBoard every 100 steps and
+write checkpoints every 10 minutes.
+
+
+## FAQ
+
+### Why `tf.data` is Required for the Input Pipeline
+
+There are two reasons:
+
+1. The user code runs on the client, while the TPU computation is executed on
+   the `worker`. Input pipeline ops must be placed on the remote worker for
+   good performance. Only `tf.data` (Dataset) supports this.
+
+1. In order to amortize the TPU launch cost, the model train step is wrapped in
+   a `tf.while_loop`, such that one `Session.run` actually runs many iterations
+   for one train loop.  To remove network back and forth, the input pipeline
+   in the future will be wrapped in a `tf.while_loop` and be placed on the
+   corresponding `worker`. Withou this, unnecessary network latency becomes
+   the performance bottleneck for models with short training-step times, or in
+   environments where network latency is higher. Only `tf.data` can be wrapped
+   by a `tf.while_loop`.
+
+
+### How to add other CPU Ops into Graph
+As `model_fn` only allows TPU Ops for computation, the easier workaround to add
+CPU Ops into Graph is:
+
+1. Create a [SessionRunHook](https://www.tensorflow.org/api_docs/python/tf/train/SessionRunHook).
+1. Modify the graph in the `def begin(self)`,
+1. Pass the hook to `TPUEstimator.train`.
+
+### Running On GCP Cloud TPUs
+To run your models on GCP Cloud TPUs refer to the [Cloud Documentation](https://cloud.google.com/tpu/docs/tutorials/mnist).
+Refer to this link for all [Cloud TPU documentation](https://cloud.google.com/tpu/docs).
+
+
+### Profiling
+You can profile the `worker` by using instructions as specified in the [Cloud TPU Tools](https://cloud.google.com/tpu/docs/cloud-tpu-tools).
+
+
+### Is `int64` supported?
+`int64` is not supported by TPU. Cast to int32 if applicable. The only exception
+is global step, which relies on `assign_add`. `int64` support for global step
+is added to ensure checkpoint compatibility between `TPUEstimator` and non-TPU
+`Estimator`.
diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD
index 6139c1d5838c24414549b4e2bc4722175f2d1925..6db373d2d5e20ea7da449530b2730403c3bb64cc 100644
--- a/tensorflow/contrib/training/BUILD
+++ b/tensorflow/contrib/training/BUILD
@@ -26,7 +26,7 @@ py_library(
         "python/training/resample.py",
         "python/training/sampling_ops.py",
         "python/training/sequence_queueing_state_saver.py",
-        "python/training/sgdr_learning_rate_decay.py",
+        "python/training/tensor_queue_dataset.py",
         "python/training/training.py",
         "python/training/tuner.py",
     ],
@@ -286,6 +286,28 @@ py_test(
     ],
 )
 
+py_test(
+    name = "tensor_queue_dataset_test",
+    size = "large",
+    srcs = ["python/training/tensor_queue_dataset_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["notsan"],
+    deps = [
+        ":training_py",
+        "//tensorflow/contrib/data/python/kernel_tests:dataset_serialization_test",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/data",
+        "//third_party/py/numpy",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/training/python/__init__.py b/tensorflow/contrib/training/python/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..52e83069cb0c68b510da46149248369dce376647 100644
--- a/tensorflow/contrib/training/python/__init__.py
+++ b/tensorflow/contrib/training/python/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/training/python/training/__init__.py b/tensorflow/contrib/training/python/training/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..52e83069cb0c68b510da46149248369dce376647 100644
--- a/tensorflow/contrib/training/python/training/__init__.py
+++ b/tensorflow/contrib/training/python/training/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
index 2a0ef0e6b3750b4f0464f1f4390819e1fc2c7872..dbdbb08a8252c799924812c83fff7f0631424761 100644
--- a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
+++ b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
@@ -53,7 +53,7 @@ class BatchSequencesWithStatesTest(test.TestCase):
     sp_tensor1 = sparse_tensor.SparseTensor(
         array_ops.constant(ind1, dtypes.int64),
         array_ops.constant(val1, dtypes.int64),
-        array_ops.constant(shape1, dtypes.int64))
+        array_ops.placeholder_with_default(shape1, shape=[2]))
     ind2 = np.array([
         [0, 0, 1],
         [0, 1, 0],
@@ -68,7 +68,7 @@ class BatchSequencesWithStatesTest(test.TestCase):
     sp_tensor2 = sparse_tensor.SparseTensor(
         array_ops.constant(ind2, dtypes.int64),
         array_ops.constant(val2, dtypes.int64),
-        array_ops.constant(shape2, dtypes.int64))
+        array_ops.placeholder_with_default(shape2, shape=[3]))
     sp_tensor3 = sparse_tensor.SparseTensor(
         array_ops.constant([[1, 9], [2, 2], [2, 10]], dtypes.int64),
         array_ops.constant([7, 15, 2], dtypes.int64),
@@ -320,6 +320,18 @@ class BatchSequencesWithStatesTest(test.TestCase):
   def testNotAMultiple(self):
     num_unroll = 3  # Not a divisor of value_length -
     # so padding would have been necessary.
+
+    # Use placeholder_with_default in sequences to make sure we get runtime
+    # error instead of shape inference error
+    sequences = {
+        "seq1": array_ops.placeholder_with_default(self.sequences["seq1"],
+                                                   shape=(None, 5)),
+        "seq2": array_ops.placeholder_with_default(self.sequences["seq2"],
+                                                   shape=(None, 4, 2)),
+        "seq3": self.sequences["seq3"],
+        "seq4": self.sequences["seq4"],
+    }
+
     with self.test_session() as sess:
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    ".*should be a multiple of: 3, but saw "
@@ -330,7 +342,7 @@ class BatchSequencesWithStatesTest(test.TestCase):
           with coord.stop_on_exception():
             next_batch = sqss.batch_sequences_with_states(
                 input_key=self.key,
-                input_sequences=self.sequences,
+                input_sequences=sequences,
                 input_context=self.context,
                 input_length=3,
                 initial_states=self.initial_states,
@@ -493,6 +505,18 @@ class BatchSequencesWithStatesTest(test.TestCase):
         expected_seq4_batch2=expected_seq4_batch2)
 
 
+class BatchSequencesWithStatesTestWithCApi(BatchSequencesWithStatesTest):
+
+  def setUp(self):
+    self._prev_value = ops._USE_C_API
+    ops._USE_C_API = True
+    super(BatchSequencesWithStatesTestWithCApi, self).setUp()
+
+  def tearDown(self):
+    super(BatchSequencesWithStatesTestWithCApi, self).tearDown()
+    ops._USE_C_API = self._prev_value
+
+
 class PaddingTest(test.TestCase):
 
   def testPaddingInvalidLengths(self):
diff --git a/tensorflow/contrib/training/python/training/bucket_ops.py b/tensorflow/contrib/training/python/training/bucket_ops.py
index 95fbc50cba73b25b748c31ecd443eb19c0b6fc8a..e7f23edc901eacfa3a753792c2dbf738bb5a9421 100644
--- a/tensorflow/contrib/training/python/training/bucket_ops.py
+++ b/tensorflow/contrib/training/python/training/bucket_ops.py
@@ -265,16 +265,22 @@ def bucket(tensors,
         for i, (q, bs) in enumerate(zip(bucket_queues, batch_size))
     ]
 
-    for i, q in enumerate(bucket_queues):
-      queue_runner.add_queue_runner(
-          queue_runner.QueueRunner(
-              q, [enqueues_to_top[i]],
-              queue_closed_exception_types=(errors.OutOfRangeError,
-                                            errors.CancelledError)))
+    queue_runner.add_queue_runner(
+        queue_runner.QueueRunner(
+            bucket_queues[0], enqueues_to_top,
+            close_op=top_queue.close(),
+            cancel_op=top_queue.close(cancel_pending_enqueues=True),
+            queue_closed_exception_types=(errors.OutOfRangeError,
+                                          errors.CancelledError)))
     queue_runner.add_queue_runner(
         queue_runner.QueueRunner(
             top_queue,
             bucket_enqueue_ops,
+            close_op=control_flow_ops.group(
+                *[q.close() for q in bucket_queues]),
+            cancel_op=control_flow_ops.group(
+                *[q.close(cancel_pending_enqueues=True)
+                  for q in bucket_queues]),
             queue_closed_exception_types=(errors.OutOfRangeError,
                                           errors.CancelledError)))
 
diff --git a/tensorflow/contrib/training/python/training/bucket_ops_test.py b/tensorflow/contrib/training/python/training/bucket_ops_test.py
index 330bee8a3fb13cd703fb260952d33e58623ca09c..504f1fcd417f99a8aaa72504f1852e523da1a4c9 100644
--- a/tensorflow/contrib/training/python/training/bucket_ops_test.py
+++ b/tensorflow/contrib/training/python/training/bucket_ops_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.contrib.training.python.training import bucket_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_lib
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
@@ -321,7 +322,8 @@ class BucketBySequenceLengthTest(test.TestCase):
 
   def _testBucketBySequenceLength(self,
                                   allow_small_batch,
-                                  bucket_capacities=None):
+                                  bucket_capacities=None,
+                                  drain_entire_queue=True):
     ops.reset_default_graph()
 
     # All inputs must be identical lengths across tuple index.
@@ -339,6 +341,7 @@ class BucketBySequenceLengthTest(test.TestCase):
 
     batch_size = 8
     bucket_boundaries = [3, 4, 5, 10]
+    num_pairs_to_enqueue = 50 * batch_size + 100
 
     # Make capacity very large so we can feed all the inputs in the
     # main thread without blocking
@@ -366,34 +369,47 @@ class BucketBySequenceLengthTest(test.TestCase):
                      [expected_batch_size, labels_len])
 
     def _read_test(sess):
-      for _ in range(50):
-        (out_lengths, (data, labels)) = sess.run(
-            (out_lengths_t, data_and_labels_t))
+      num_pairs_dequeued = 0
+      try:
+        while drain_entire_queue or num_pairs_dequeued < 40 * batch_size:
+          (out_lengths, (data, labels)) = sess.run(
+              (out_lengths_t, data_and_labels_t))
+          num_pairs_dequeued += out_lengths.shape[0]
+          if allow_small_batch:
+            self.assertEqual(data_len, data.shape[1])
+            self.assertEqual(labels_len, labels.shape[1])
+            self.assertGreaterEqual(batch_size, out_lengths.shape[0])
+            self.assertGreaterEqual(batch_size, data.shape[0])
+            self.assertGreaterEqual(batch_size, labels.shape[0])
+          else:
+            self.assertEqual((batch_size, data_len), data.shape)
+            self.assertEqual((batch_size, labels_len), labels.shape)
+            self.assertEqual((batch_size,), out_lengths.shape)
+          for (lr, dr, tr) in zip(out_lengths, data, labels):
+            # Make sure length matches data (here it's the same value).
+            self.assertEqual(dr[0], lr)
+            # Make sure data & labels match.
+            self.assertEqual(dr[0], int(tr[0].decode("ascii")))
+            # Make sure for each row, data came from the same bucket.
+            self.assertEqual(
+                _which_bucket(bucket_boundaries, dr[0]),
+                _which_bucket(bucket_boundaries, dr[1]))
+      except errors.OutOfRangeError:
         if allow_small_batch:
-          self.assertEqual(data_len, data.shape[1])
-          self.assertEqual(labels_len, labels.shape[1])
-          self.assertGreaterEqual(batch_size, out_lengths.shape[0])
-          self.assertGreaterEqual(batch_size, data.shape[0])
-          self.assertGreaterEqual(batch_size, labels.shape[0])
+          self.assertEqual(num_pairs_to_enqueue, num_pairs_dequeued)
         else:
-          self.assertEqual((batch_size, data_len), data.shape)
-          self.assertEqual((batch_size, labels_len), labels.shape)
-          self.assertEqual((batch_size,), out_lengths.shape)
-        for (lr, dr, tr) in zip(out_lengths, data, labels):
-          # Make sure length matches data (here it's the same value).
-          self.assertEqual(dr[0], lr)
-          # Make sure data & labels match.
-          self.assertEqual(dr[0], int(tr[0].decode("ascii")))
-          # Make sure for each row, data came from the same bucket.
-          self.assertEqual(
-              _which_bucket(bucket_boundaries, dr[0]),
-              _which_bucket(bucket_boundaries, dr[1]))
+          # Maximum left over in the queues should be at most one less than the
+          # batch_size, for every bucket.
+          num_buckets = len(bucket_boundaries) + 2
+          self.assertLessEqual(
+              num_pairs_to_enqueue - (batch_size - 1) * num_buckets,
+              num_pairs_dequeued)
 
     with self.test_session() as sess:
       coord = coordinator.Coordinator()
 
       # Feed the inputs, then close the input thread.
-      for _ in range(50 * batch_size + 100):
+      for _ in range(num_pairs_to_enqueue):
         which = random.randint(0, len(input_pairs) - 1)
         length, pair = input_pairs[which]
         sess.run(input_enqueue_op,
@@ -425,6 +441,10 @@ class BucketBySequenceLengthTest(test.TestCase):
     self._testBucketBySequenceLength(allow_small_batch=True,
                                      bucket_capacities=capacities)
 
+  def testBucketBySequenceLengthShutdown(self):
+    self._testBucketBySequenceLength(allow_small_batch=True,
+                                     drain_entire_queue=False)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/training/python/training/hparam.py b/tensorflow/contrib/training/python/training/hparam.py
index 80de0f6eb7e36a1c86f7d44e4053a9757b09f0ae..fdfd27d6a414933b0bec824bae512c45dac24d3c 100644
--- a/tensorflow/contrib/training/python/training/hparam.py
+++ b/tensorflow/contrib/training/python/training/hparam.py
@@ -40,7 +40,7 @@ PARAM_RE = re.compile(r"""
   ((?P<val>[^,\[]*)            # single value: "a" or None
    |
    \[(?P<vals>[^\]]*)\])       # list of values: None or "1,2,3"
-  ($|,)""", re.VERBOSE)
+  ($|,\s*)""", re.VERBOSE)
 
 
 def _parse_fail(name, var_type, value, values):
diff --git a/tensorflow/contrib/training/python/training/hparam_test.py b/tensorflow/contrib/training/python/training/hparam_test.py
index 28e4b4d01eda9bef07ff7929f74894e09a3e987c..16397622edd382bc6dcb12870de5fa22130a2c2b 100644
--- a/tensorflow/contrib/training/python/training/hparam_test.py
+++ b/tensorflow/contrib/training/python/training/hparam_test.py
@@ -55,7 +55,7 @@ class HParamsTest(test.TestCase):
     self.assertEqual(12, hparams.aaa)
     self.assertEqual(2.0, hparams.b)
     self.assertEqual('relu6', hparams.c_c)
-    hparams.parse('c_c=relu4,b=-2.0e10')
+    hparams.parse('c_c=relu4, b=-2.0e10')
     self.assertDictEqual({
         'aaa': 12,
         'b': -2.0e10,
diff --git a/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay.py b/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay.py
deleted file mode 100644
index ed0f398e30a7f3c0b1b9378f8fc5d5bfbea1536a..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay.py
+++ /dev/null
@@ -1,187 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""SGDR learning rate decay function."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import math_ops, control_flow_ops
-
-
-def sgdr_decay(learning_rate, global_step, initial_period_steps,
-               t_mul=2.0, m_mul=1.0, name=None):
-  """Implements Stochastic Gradient Descent with Warm Restarts (SGDR).
-
-  As described in "SGDR: Stochastic Gradient Descent
-  with Warm Restarts" by Ilya Loshchilov & Frank Hutter, Proceedings of
-  ICLR'2017, available at https://arxiv.org/pdf/1608.03983.pdf
-
-  The learning rate decreases according to cosine annealing:
-
-  ```python
-  learning_rate * 0.5 * (1 + cos(x_val * pi)) # for x_val defined in [0, 1]
-  ```
-
-  Thus, at the beginning (when the restart index i = 0),
-  the learning rate decreases for `initial_period_steps` steps from the initial
-  learning rate `learning_rate` (when `x_val=0`, we get `cos(0)=1`) to
-  0 (when `x_val=1`, we get `cos(pi)=-1`).
-
-  The decrease within the i-th period takes `t_i` steps,
-  where `t_0` = `initial_period_steps` is the user-defined number of batch
-  iterations (not epochs as in the paper) to be performed before the first
-  restart is launched.
-
-  Then, we perform the first restart (i=1) by setting the learning rate to
-  `learning_rate*(m_mul^i)`, where `m_mul in [0,1]` (set to 1 by default).
-  The i-th restart runs for `t_i=t_0*(t_mul^i)` steps, i.e., every new
-  restart runs `t_mul` times longer than the previous one.
-
-  Importantly, when one has no access to a validation set, SGDR suggests
-  to report the best expected / recommended solution in the following way:
-  When we are within our initial run (i=0), every new solution represents
-  SGDR's recommended solution. Instead, when i>0, the recommended solution is
-  the one obtained at the end of each restart.
-
-  Note that the minimum learning rate is set to 0 for simplicity,
-  you can adjust the code to deal with any positive minimum learning rate
-  as defined in the paper.
-
-  `initial_period_steps` is the duration of the first period measured in terms
-  of number of minibatch updates. If one wants to use epochs, one should compute
-  the number of updates required for an epoch.
-
-  For example, assume the following parameters and intention:
-      Minibatch size: 100
-      Training dataset size: 10000
-      If the user wants the first decay period to span across 5 epochs, then
-      `initial_period_steps` = 5 * 10000/100 = 500
-
-      Train for 10000 batch iterations with the initial learning rate set to
-      0.1, then restart to run 2 times longer, i.e, for 20000 batch iterations
-      and with the initial learning rate 0.05, then restart again and again,
-      doubling the runtime of each new period and with two times smaller
-      initial learning rate.
-
-  To accomplish the above, one would write:
-
-  ```python
-  ...
-  global_step = tf.Variable(0, trainable=False)
-  starter_learning_rate = 0.1
-  learning_rate = sgdr_decay(starter_learning_rate, global_step,
-                             initial_period_steps=10000, t_mul=2, m_mul=0.5)
-  # Passing global_step to minimize() will increment it at each step.
-  learning_step = (
-      tf.train.GradientDescentOptimizer(learning_rate)
-      .minimize(...my loss..., global_step=global_step)
-  )
-
-  # Step  | 0   | 1000  | 5000 | 9000  | 9999 | 10000 | 11000  |
-  # LR    | 0.1 | 0.097 | 0.05 | 0.002 | 0.00 | 0.05  | 0.0496 |
-
-  # Step  | 20000 | 29000  | 29999 | 30000 |
-  # LR    | 0.025 | 0.0003 | 0.00  | 0.025 |
-  ```
-
-  Args:
-    learning_rate: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The initial learning rate.
-    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Global step to use for the decay computation.  Must not be negative.
-    initial_period_steps: Duration of the first period measured as the number
-      of minibatch updates, if one wants to use epochs, one should compute
-      the number of updates required for an epoch.
-    t_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
-      Must be positive.
-      Used to derive the number of iterations in the i-th period:
-      `initial_period_steps * (t_mul^i)`. Defaults to 2.0.
-    m_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
-      Must be positive.
-      Used to derive the initial learning rate of the i-th period:
-      `learning_rate * (m_mul^i)`. Defaults to 1.0
-
-  Returns:
-    A scalar `Tensor` of the same type as `learning_rate`.
-    The learning rate for a provided global_step.
-  Raises:
-    ValueError: if `global_step` is not supplied.
-  """
-
-  if global_step is None:
-    raise ValueError("global_step is required for sgdr_decay.")
-  with ops.name_scope(name, "SGDRDecay",
-                      [learning_rate, global_step,
-                       initial_period_steps, t_mul, m_mul]) as name:
-    learning_rate = ops.convert_to_tensor(learning_rate,
-                                          name="initial_learning_rate")
-    dtype = learning_rate.dtype
-    global_step = math_ops.cast(global_step, dtype)
-    t_0 = math_ops.cast(initial_period_steps, dtype)
-    t_mul = math_ops.cast(t_mul, dtype)
-    m_mul = math_ops.cast(m_mul, dtype)
-
-    c_one = math_ops.cast(constant_op.constant(1.0), dtype)
-    c_half = math_ops.cast(constant_op.constant(0.5), dtype)
-    c_pi = math_ops.cast(constant_op.constant(math.pi), dtype)
-
-    # Find normalized value of the current step
-    x_val = math_ops.div(global_step, t_0)
-
-    def compute_step(x_val, geometric=False):
-      if geometric:
-        # Consider geometric series where t_mul != 1
-        # 1 + t_mul + t_mul^2 ... = (1 - t_mul^i_restart) / (1 - t_mul)
-
-        # First find how many restarts were performed for a given x_val
-        # Find maximal integer i_restart value for which this equation holds
-        # x_val >= (1 - t_mul^i_restart) / (1 - t_mul)
-        # x_val * (1 - t_mul) <= (1 - t_mul^i_restart)
-        # t_mul^i_restart <= (1 - x_val * (1 - t_mul))
-
-        # tensorflow allows only log with base e
-        # i_restart <= log(1 - x_val * (1 - t_mul) / log(t_mul)
-        # Find how many restarts were performed
-
-        i_restart = math_ops.floor(
-            math_ops.log(c_one - x_val * (c_one - t_mul)) / math_ops.log(t_mul))
-        # Compute the sum of all restarts before the current one
-        sum_r = (c_one - t_mul ** i_restart) / (c_one - t_mul)
-        # Compute our position within the current restart
-        x_val = (x_val - sum_r) / t_mul ** i_restart
-
-      else:
-        # Find how many restarts were performed
-        i_restart = math_ops.floor(x_val)
-        # Compute our position within the current restart
-        x_val = x_val - i_restart
-      return i_restart, x_val
-
-    i_restart, x_val = control_flow_ops.cond(
-        math_ops.equal(t_mul, c_one),
-        lambda: compute_step(x_val, geometric=False),
-        lambda: compute_step(x_val, geometric=True))
-
-    # If m_mul < 1, then the initial learning rate of every new restart will be
-    # smaller, i.e., by a factor of m_mul ** i_restart at i_restart-th restart
-    m_fac = learning_rate * (m_mul ** i_restart)
-
-  return math_ops.multiply(c_half * m_fac,
-                           (math_ops.cos(x_val * c_pi) + c_one), name=name)
diff --git a/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay_test.py b/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay_test.py
deleted file mode 100644
index 4a46e9a49ef203384e36698f81d6cbe3a3881ef8..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay_test.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Functional test for sgdr learning rate decay."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-
-from sgdr_learning_rate_decay import sgdr_decay
-from tensorflow.python.platform import googletest
-from tensorflow.python.framework import test_util
-from tensorflow.python.framework import dtypes
-from tensorflow import placeholder
-
-
-class SGDRDecayTest(test_util.TensorFlowTestCase):
-  """Unit tests for SGDR learning rate decay."""
-
-  def get_original_values(self, lr, t_e, mult_factor, iter_per_epoch, epochs):
-    """Get an array with learning rate values from the consecutive steps using
-    the original implementation
-    (https://github.com/loshchil/SGDR/blob/master/SGDR_WRNs.py)."""
-    t0 = math.pi / 2.0
-    tt = 0
-    te_next = t_e
-
-    lr_values = []
-    sh_lr = lr
-    for epoch in range(epochs):
-      for _ in range(iter_per_epoch):
-        # In the original approach training function is executed here
-        lr_values.append(sh_lr)
-        dt = 2.0 * math.pi / float(2.0 * t_e)
-        tt = tt + float(dt) / iter_per_epoch
-        if tt >= math.pi:
-          tt = tt - math.pi
-        cur_t = t0 + tt
-        new_lr = lr * (1.0 + math.sin(cur_t)) / 2.0  # lr_min = 0, lr_max = lr
-        sh_lr = new_lr
-      if (epoch + 1) == te_next:  # time to restart
-        sh_lr = lr
-        tt = 0                # by setting to 0 we set lr to lr_max, see above
-        t_e = t_e * mult_factor  # change the period of restarts
-        te_next = te_next + t_e  # note the next restart's epoch
-
-    return lr_values
-
-  def get_sgdr_values(self, lr, initial_period_steps, t_mul, iters):
-    """Get an array with learning rate values from the consecutive steps
-    using current tensorflow implementation."""
-    with self.test_session():
-      step = placeholder(dtypes.int32)
-
-      decay = sgdr_decay(lr, step, initial_period_steps, t_mul)
-      lr_values = []
-      for i in range(iters):
-        lr_values.append(decay.eval(feed_dict={step: i}))
-
-      return lr_values
-
-  def testCompareToOriginal(self):
-    """Compare values generated by tensorflow implementation to the values
-    generated by the original implementation
-    (https://github.com/loshchil/SGDR/blob/master/SGDR_WRNs.py)."""
-    with self.test_session():
-      lr = 10.0
-      init_steps = 2
-      t_mul = 3
-      iters = 10
-      epochs = 50
-
-      org_lr = self.get_original_values(lr, init_steps, t_mul, iters, epochs)
-      sgdr_lr = self.get_sgdr_values(lr, init_steps*iters, t_mul, iters*epochs)
-
-      for org, sgdr in zip(org_lr, sgdr_lr):
-        self.assertAllClose(org, sgdr)
-
-  def testMDecay(self):
-    """Test m_mul argument. Check values for learning rate at the beginning
-    of the first, second, third and fourth period. """
-    with self.test_session():
-      step = placeholder(dtypes.int32)
-
-      lr = 0.1
-      t_e = 10
-      t_mul = 3
-      m_mul = 0.9
-
-      decay = sgdr_decay(lr, step, t_e, t_mul, m_mul)
-
-      test_step = 0
-      self.assertAllClose(decay.eval(feed_dict={step: test_step}),
-                          lr)
-
-      test_step = t_e
-      self.assertAllClose(decay.eval(feed_dict={step: test_step}),
-                          lr * m_mul)
-
-      test_step = t_e + t_e*t_mul
-      self.assertAllClose(decay.eval(feed_dict={step: test_step}),
-                          lr * m_mul**2)
-
-      test_step = t_e + t_e*t_mul + t_e * (t_mul**2)
-      self.assertAllClose(decay.eval(feed_dict={step: test_step}),
-                          lr * (m_mul**3))
-
-  def testCos(self):
-    """Check learning rate values at the beginning, in the middle
-    and at the end of the period."""
-    with self.test_session():
-      step = placeholder(dtypes.int32)
-      lr = 0.2
-      t_e = 1000
-      t_mul = 1
-
-      decay = sgdr_decay(lr, step, t_e, t_mul)
-
-      test_step = 0
-      self.assertAllClose(decay.eval(feed_dict={step: test_step}), lr)
-
-      test_step = t_e//2
-      self.assertAllClose(decay.eval(feed_dict={step: test_step}), lr/2)
-
-      test_step = t_e
-      self.assertAllClose(decay.eval(feed_dict={step: test_step}), lr)
-
-      test_step = t_e*3//2
-      self.assertAllClose(decay.eval(feed_dict={step: test_step}), lr/2)
-
-if __name__ == "__main__":
-  googletest.main()
diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..409aba817c1ec37003eb98f000f6cf8918234c5d
--- /dev/null
+++ b/tensorflow/contrib/training/python/training/tensor_queue_dataset.py
@@ -0,0 +1,200 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python wrappers for Datasets and Iterators."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.util import nest as tf_nest
+
+
+class _PrependFromQueueAndPaddedBatchDataset(dataset_ops.Dataset):
+  """A `Dataset` that prepends a queue to another `Dataset`.
+
+  A vector of handles to the queue is returned as the first component of
+  the associated iterator.  This vector can be passed to
+  `enqueue_in_queue_dataset` to add new elements to the queue.
+  """
+
+  def __init__(self, input_dataset, batch_size, padded_shapes, padding_values):
+    """Initialize `PrependFromQueueAndPaddedBatchDataset`."""
+    super(_PrependFromQueueAndPaddedBatchDataset, self).__init__()
+    if sparse.any_sparse(input_dataset.output_classes):
+      raise TypeError(
+          "Batching of padded sparse tensors is not currently supported")
+    self._input_dataset = input_dataset
+    self._batch_size = ops.convert_to_tensor(
+        batch_size, dtype=dtypes.int64, name="batch_size")
+    # pylint: disable=protected-access
+    if padded_shapes is None:
+      self._padded_shapes = nest.map_structure(
+          dataset_ops._partial_shape_to_tensor, input_dataset.output_shapes)
+    else:
+      self._padded_shapes = nest.map_structure_up_to(
+          input_dataset.output_shapes, dataset_ops._partial_shape_to_tensor,
+          padded_shapes)
+    padding_values = (
+        padding_values if padding_values is not None else
+        dataset_ops._default_padding(input_dataset))
+    self._padding_values = nest.map_structure_up_to(
+        input_dataset.output_shapes, dataset_ops._padding_value_to_tensor,
+        padding_values, input_dataset.output_types)
+    # pylint: enable=protected-access
+
+  def _as_variant_tensor(self):
+    # pylint: disable=protected-access
+    return gen_dataset_ops.prepend_from_queue_and_padded_batch_dataset(
+        self._input_dataset._as_variant_tensor(),
+        batch_size=self._batch_size,
+        padded_shapes=[
+            ops.convert_to_tensor(s, dtype=dtypes.int64)
+            for s in nest.flatten(self._padded_shapes)
+        ],
+        padding_values=nest.flatten(self._padding_values),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+    # pylint: enable=protected-access
+
+  @property
+  def output_classes(self):
+    return (ops.Tensor, self._input_dataset.output_classes)
+
+  def _as_batch_shape(self, shape_like):
+    return tensor_shape.vector(None).concatenate(
+        tensor_util.constant_value_as_shape(shape_like))
+
+  @property
+  def output_shapes(self):
+    # First output is a variant representing the Queue
+    return (tensor_shape.vector(None),
+            nest.map_structure(self._as_batch_shape, self._padded_shapes))
+
+  @property
+  def output_types(self):
+    # First output is a variant representing the Queue
+    return (dtypes.variant, self._input_dataset.output_types)
+
+
+def prepend_from_queue_and_padded_batch_dataset(batch_size,
+                                                padding_values=None,
+                                                padded_shapes=None):
+  """A transformation that prepends a queue to a `Dataset` and batches results.
+
+  A vector of handles to the queue is returned as the first component of the
+  associated iterator.  This vector can be passed to `enqueue_in_queue_dataset`
+  to add new elements to the queue.
+
+  Below is an example of how this dataset might be used to split incoming
+  variable-length sequences into "head" and "rest" parts, where "rest" parts
+  are re-enqueued back into the dataset.  A more realistic example would
+  perform some calculation on the "head" and modify some components of "rest"
+  with the result (before re-enqueueing).
+
+  ```python
+  dataset = tf.data.Dataset.from_tensor_slices([2*x for x in range(10)])
+  # Make a dataset of variable-length vectors and their lengths.
+  dataset = dataset.map(lambda count: (count, tf.ones((count,))))
+  # Emit a queue we can prepend to, and counts/values as padded batch.
+  dataset = dataset.apply(
+      tf.contrib.training.prepend_from_queue_and_padded_batch_dataset(
+        batch_size=10))
+  dataset = dataset.prefetch(1)
+
+  iterator = dataset.make_one_shot_iterator()
+  queue, (count, padded_value) = iterator.get_next()
+
+  # Split the padded_value into two pieces: head and rest
+  rest_indices = tf.squeeze(tf.where(count > 3), axis=1)
+  bound = tf.minimum(3, tf.reduce_max(count))
+  value_head = padded_value[:, :bound]
+  count_rest = tf.gather(count - 3, rest_indices)
+  value_rest = tf.gather(padded_value[:, bound:], rest_indices)
+  queue_rest = tf.gather(queue, rest_indices)
+  enqueue_rest_op = tf.contrib.training.enqueue_in_queue_dataset(
+    queue_rest, (count_rest, value_rest))
+  with tf.control_dependencies([enqueue_rest_op]):
+    calculation = fn(value_head)
+
+  while True:  # Will raise OutOfRange when finished with all pieces.
+    session.run(calculation)
+  ```
+
+  Args:
+    batch_size: `int64` scalar tensor.  The batch size to use when performing
+      padded batching.
+    padding_values: (optional) Nested tuple of scalar tensors.  If provided,
+      the structure and dtypes of padding_values should match that of
+      incoming dataset's `output_types`.
+    padded_shapes: (optional) Nested tuple of `int64` vector tensors.
+      If provided, the structure must match that of the incoming dataset's
+      `output_types`.  If not provided, the incoming dataset's `output_shapes`
+      is used.  Any unknown (`None` or `-1`) dimensions in the shapes are
+      treated as being unique per-batch: for each batch time, an unknown
+      dimension is replaced with the maximum given value of this dimension
+      across all tensors for the given component in the batch.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}.
+  """
+
+  def _apply_fn(dataset):
+    return _PrependFromQueueAndPaddedBatchDataset(
+        dataset,
+        batch_size=batch_size,
+        padding_values=padding_values,
+        padded_shapes=padded_shapes)
+
+  return _apply_fn
+
+
+def enqueue_in_queue_dataset(queue, components):
+  """Enqueue components into queue from `PrependFromQueueAndPaddedBatchDataset`.
+
+  The components' dtypes and shapes must be compatible with the `output_shapes`
+  attribute of the `dataset` created by
+  `prepend_from_queue_and_padded_batch_dataset`.  This operation supports both
+  non-batched and batched modes.
+
+  For more details, see the example in the docstring for
+  `prepend_from_queue_and_padded_batch_dataset`.
+
+  Args:
+    queue: `variant` scalar or vector tensor.
+      The tensor emitted by the first component of the iterator associated with
+      `prepend_from_queue_and_padded_batch_dataset`.  If this is a scalar,
+      then the `components` input tensors should not have a prepended batch
+      dimension.
+    components: Nested tuple of tensors, each with a leading batch dimension
+      if `queue` is a vector.  The structure, dtypes, and shapes
+      (excluding batch dimension) must match the nested tuples
+      `dataset.output_types[1]` and `dataset.output_shapes[1]` (the non-queue
+      output types and shapes) of the `dataset` emitted by
+      the original `prepend_from_queue_and_padded_batch_dataset` call.
+
+  Returns:
+    An `Operation` that enqueues `components` into the dataset(s) associated
+    with entries of `queue`.
+  """
+  return gen_dataset_ops.enqueue_in_queue_dataset(
+      queue=queue, components=tf_nest.flatten(components))
diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0338f409a203c232e63e99534a8f6d6a43fa661e
--- /dev/null
+++ b/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py
@@ -0,0 +1,355 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for TensorQueueDataset."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
+from tensorflow.contrib.training.python.training import tensor_queue_dataset as tqd
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+class PrependFromQueueAndPaddedBatchDatasetTest(test.TestCase):
+
+  def testNoEnqueue(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
+    self.assertEqual((dtypes.variant, dtypes.int32), dataset.output_types)
+    self.assertAllEqual(([None],) * 2,
+                        [x.as_list() for x in dataset.output_shapes])
+    iterator = dataset.make_one_shot_iterator()
+    _, value = iterator.get_next()
+    self.assertEqual([0], self.evaluate(value))
+    self.assertEqual([1], self.evaluate(value))
+    self.assertEqual([2], self.evaluate(value))
+    with self.assertRaisesOpError("End of sequence"):
+      self.evaluate(value)
+
+  def testBatchedNoEnqueue(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=2))
+    iterator = dataset.make_one_shot_iterator()
+    _, value = iterator.get_next()
+    self.assertAllEqual([0, 1], self.evaluate(value))
+    self.assertAllEqual([2], self.evaluate(value))
+    with self.assertRaisesOpError("End of sequence"):
+      self.evaluate(value)
+
+  def testBatchedWithBiggerPaddingNoEnqueue(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([[0], [1], [2]])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(
+            batch_size=2, padded_shapes=[3]))
+    iterator = dataset.make_one_shot_iterator()
+    _, value = iterator.get_next()
+    self.assertAllEqual([[0, 0, 0], [1, 0, 0]], self.evaluate(value))
+    self.assertAllEqual([[2, 0, 0]], self.evaluate(value))
+    with self.assertRaisesOpError("End of sequence"):
+      self.evaluate(value)
+
+  def testBatchedWithBiggerPaddingOneEnqueue(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([[0], [1], [2]])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(
+            batch_size=1, padded_shapes=[3]))
+    iterator = dataset.make_one_shot_iterator()
+    queue_handle, value = iterator.get_next()
+    enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value)
+    with self.test_session() as sess:
+      self.assertAllEqual([[0, 0, 0]], sess.run(value))
+      value_1, _ = sess.run([value, enqueue_negative])
+      self.assertAllEqual([[1, 0, 0]], value_1)
+      value_2, _ = sess.run([value, enqueue_negative])
+      self.assertAllEqual([[-1, 0, 0]], value_2)
+      value_3 = sess.run(value)
+      self.assertAllEqual([[1, 0, 0]], value_3)
+      value_4, _ = sess.run([value, enqueue_negative])
+      self.assertAllEqual([[2, 0, 0]], value_4)
+      value_5 = sess.run(value)
+      self.assertAllEqual([[-2, 0, 0]], value_5)
+      with self.assertRaisesOpError("End of sequence"):
+        sess.run(value)
+
+  def testOneEnqueue(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
+    iterator = dataset.make_one_shot_iterator()
+    queue_handle, value = iterator.get_next()
+    enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value)
+    with self.test_session() as sess:
+      self.assertEqual([0], sess.run(value))
+      value_1, _ = sess.run([value, enqueue_negative])
+      self.assertEqual([1], value_1)
+      value_2, _ = sess.run([value, enqueue_negative])
+      self.assertEqual([-1], value_2)
+      value_3 = sess.run(value)
+      self.assertEqual([1], value_3)
+      value_4, _ = sess.run([value, enqueue_negative])
+      self.assertEqual([2], value_4)
+      value_5 = sess.run(value)
+      self.assertEqual([-2], value_5)
+      with self.assertRaisesOpError("End of sequence"):
+        sess.run(value)
+
+  def testBatchedOneEnqueue(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=2))
+    iterator = dataset.make_one_shot_iterator()
+    queue_handle, value = iterator.get_next()
+    enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value)
+    enqueue_zeroth = tqd.enqueue_in_queue_dataset([queue_handle[0]],
+                                                  array_ops.expand_dims(
+                                                      value[0], axis=0))
+    with self.test_session() as sess:
+      value_0, _ = sess.run([value, enqueue_negative])
+      self.assertAllEqual([0, 1], value_0)
+      value_1, _ = sess.run([value, enqueue_zeroth])
+      self.assertAllEqual([0, -1], value_1)
+      value_2, _ = sess.run([value, enqueue_negative])
+      self.assertAllEqual([0, 2], value_2)
+      self.assertAllEqual([0, -2], sess.run(value))
+      with self.assertRaisesOpError("End of sequence"):
+        sess.run(value)
+
+  def testManyEnqueue(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
+    iterator = dataset.make_one_shot_iterator()
+    queue_handle, value = iterator.get_next()
+    enqueue_many_more = [
+        tqd.enqueue_in_queue_dataset(queue_handle, value + 100 + i)
+        for i in range(1000)
+    ]
+    with self.test_session() as sess:
+      value_0, _ = sess.run((value, enqueue_many_more))
+      self.assertEqual([0], value_0)
+      rest = []
+      for _ in range(1000):
+        rest.append(sess.run(value))
+      self.assertEquals([[100 + i] for i in range(1000)], sorted(rest))
+      # Going back to the original input.
+      value_1, _ = sess.run((value, enqueue_many_more))
+      self.assertEqual(1, value_1)
+      rest = []
+      for _ in range(1000):
+        rest.append(sess.run(value))
+      self.assertEquals([[100 + i + 1] for i in range(1000)], sorted(rest))
+      with self.assertRaisesOpError("End of sequence"):
+        sess.run(value)
+
+  def testEnqueueWithPrefetch(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
+    # Prefetching will request additional values before they are
+    # available to the queue.
+    dataset = dataset.prefetch(buffer_size=3)
+    iterator = dataset.make_one_shot_iterator()
+    queue_handle, value = iterator.get_next()
+    enqueue = tqd.enqueue_in_queue_dataset(queue_handle, value + 1)
+    with self.test_session() as sess:
+      i = 0
+      while i < 4:
+        received, _ = sess.run((value, enqueue))
+        if received.size > 0:
+          self.assertAllEqual([i], received)
+          i += 1
+      received_last = False
+      while True:
+        try:
+          received = sess.run(value)
+          if received.size > 0:
+            self.assertAllEqual([4], received)
+            received_last = True
+        except errors.OutOfRangeError:
+          break
+      self.assertTrue(received_last)
+
+  def testDatasetWithPaddedShapeSmallerThanInputFails(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([[0, 0, 0]]).repeat(None)
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(
+            batch_size=1, padded_shapes=[2]))
+    iterator = dataset.make_one_shot_iterator()
+    _, value = iterator.get_next()
+    with self.test_session() as sess:
+      with self.assertRaisesOpError(
+          r"Incompatible input shapes at component 0 between "
+          r"input dataset this dataset: \[3\] vs. \[2\]"):
+        sess.run(value)
+
+  def testEnqueueWithIncompatibleInputsFailsWithInformativeError(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0]).repeat(None)
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
+    iterator = dataset.make_one_shot_iterator()
+    queue_handle, value = iterator.get_next()
+
+    enqueue_bad_structure = tqd.enqueue_in_queue_dataset(
+        queue_handle, (value, value))
+    enqueue_bad_dtype = tqd.enqueue_in_queue_dataset(queue_handle,
+                                                     np.array(
+                                                         [1.0],
+                                                         dtype=np.float32))
+    enqueue_bad_shape_no_batch_dim = tqd.enqueue_in_queue_dataset(
+        queue_handle, ([1],))
+    enqueue_bad_shape = tqd.enqueue_in_queue_dataset(queue_handle,
+                                                     np.array(
+                                                         [[1]], dtype=np.int32))
+
+    with self.test_session() as sess:
+      with self.assertRaisesOpError(
+          "mismatched number of tensors.  Queue expects 1 tensors but "
+          "tried to insert 2"):
+        sess.run(enqueue_bad_structure)
+      with self.assertRaisesOpError(r"Expected component 0 to have batched "
+                                    r"shape \[1,...\], but saw shape: \[\]"):
+        sess.run(enqueue_bad_shape_no_batch_dim)
+      with self.assertRaisesOpError(
+          r"mismatched shapes at component 0.  Attempted to insert tensor "
+          r"with shape \[1\] but queue expected shape: \[\]"):
+        sess.run(enqueue_bad_shape)
+      with self.assertRaisesOpError(
+          r"mismatched dtypes at component 0.  Attempted to insert tensor "
+          r"of type float but queue expected type: int32"):
+        sess.run(enqueue_bad_dtype)
+
+  def testEnqueueWithPaddedBatchFailsWithInformativeError(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
+    with self.assertRaisesRegexp(
+        TypeError, r"Unable to create padding for field of type 'variant'"):
+      dataset.padded_batch(batch_size=10, padded_shapes=[1])
+
+  def testOneEnqueueWithPadding(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 2, 4, 6])
+    # Make a dataset of variable-length vectors and their lengths.
+    dataset = dataset.map(
+        lambda c: (c, c * array_ops.ones((c,), dtype=c.dtype)))
+    # Emit a queue we can prepend to, and counts/values as padded
+    # batch.
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=3))
+
+    iterator = dataset.make_one_shot_iterator()
+    queue, (count, padded_value) = iterator.get_next()
+
+    # Split the padded_value into two pieces: head and rest
+    rest_indices = array_ops.squeeze(array_ops.where(count > 2), axis=1)
+    bound = math_ops.minimum(2, math_ops.reduce_max(count))
+    value_head = padded_value[:, :bound]
+    count_rest = array_ops.gather(count - 2, rest_indices)
+    value_rest = array_ops.gather(padded_value, rest_indices)[:, bound:]
+    queue_rest = array_ops.gather(queue, rest_indices)
+    enqueue_rest_op = tqd.enqueue_in_queue_dataset(queue_rest,
+                                                   (count_rest, value_rest))
+    with ops.control_dependencies([enqueue_rest_op]):
+      calc = array_ops.identity(value_head)
+
+    with self.test_session() as sess:
+      self.assertAllEqual([[0, 0], [2, 2], [4, 4]], sess.run(calc))
+      self.assertAllEqual([[4, 4], [6, 6]], sess.run(calc))
+      self.assertAllEqual([[6, 6]], sess.run(calc))
+      self.assertAllEqual([[6, 6]], sess.run(calc))
+      # Get some final batches due to prefetching.
+      for _ in range(3):
+        try:
+          self.assertAllEqual(
+              np.empty(shape=(0, 0), dtype=np.int32), sess.run(calc))
+        except errors.OutOfRangeError as e:
+          self.assertTrue(str(e).startswith("End of sequence"))
+
+  def testNonstandardPadding(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 2, 4, 6])
+    # Make a dataset of variable-length vectors and their lengths.
+    dataset = dataset.map(
+        lambda c: (c, c * array_ops.ones((c,), dtype=c.dtype)))
+    # Emit a queue we can prepend to, and counts/values as padded
+    # batch.
+    dataset = dataset.apply(
+        tqd.prepend_from_queue_and_padded_batch_dataset(
+            batch_size=3, padding_values=(
+                0,
+                -1,
+            )))
+
+    iterator = dataset.make_one_shot_iterator()
+    _, (unused_count, padded_value) = iterator.get_next()
+
+    with self.test_session() as sess:
+      self.assertAllEqual([[-1, -1, -1, -1], [2, 2, -1, -1], [4, 4, 4, 4]],
+                          sess.run(padded_value))
+      self.assertAllEqual([[6] * 6], sess.run(padded_value))
+      with self.assertRaisesOpError("End of sequence"):
+        sess.run(padded_value)
+
+
+# TODO(ebrevdo): Figure out how to use run_core_tests to test state
+# saving of an iterator that's had some tensors enqueued into its queue.
+class PrependFromQueueAndPaddedBatchDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def testPrependFromQueueAndPaddedBatch(self):
+
+    def build_dataset(seq_lens):
+      return dataset_ops.Dataset.from_tensor_slices(seq_lens).map(
+          lambda x: array_ops.fill([x], x)).apply(
+              tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=4))
+
+    seq_lens1 = np.random.randint(1, 20, size=(32,)).astype(np.int32)
+    seq_lens2 = np.random.randint(21, 40, size=(32,)).astype(np.int32)
+    self.run_core_tests(lambda: build_dataset(seq_lens1),
+                        lambda: build_dataset(seq_lens2), 8)
+
+  def testPrependFromQueueAndPaddedBatchNonDefaultPadding(self):
+
+    def build_dataset(seq_lens):
+
+      def fill_tuple(x):
+        filled = array_ops.fill([x], x)
+        return (filled, string_ops.as_string(filled))
+
+      padded_shape = [-1]
+      return dataset_ops.Dataset.from_tensor_slices(seq_lens).map(
+          fill_tuple).apply(
+              tqd.prepend_from_queue_and_padded_batch_dataset(
+                  batch_size=4,
+                  padded_shapes=(padded_shape, padded_shape),
+                  padding_values=(-1, "<end>")))
+
+    seq_lens1 = np.random.randint(1, 20, size=(32,)).astype(np.int32)
+    seq_lens2 = np.random.randint(21, 40, size=(32,)).astype(np.int32)
+    self.run_core_tests(lambda: build_dataset(seq_lens1),
+                        lambda: build_dataset(seq_lens2), 8)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc b/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc
index 2992a61ea8186caada394208e9c27ddffe896dd1..9675428e56e93c9669753371dbca47d56325b0c4 100644
--- a/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc
+++ b/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.cc
@@ -142,9 +142,9 @@ Status ConvertConstantsToImmutable(const string& in_graph_filename,
   const auto load_graph_status =
       ReadBinaryProto(default_env, in_graph_filename, &graph_def);
   if (!load_graph_status.ok()) {
-    return tensorflow::errors::NotFound("Failed to load graph at '",
-                                        in_graph_filename, "' : ",
-                                        load_graph_status.error_message());
+    return tensorflow::errors::NotFound(
+        "Failed to load graph at '", in_graph_filename,
+        "' : ", load_graph_status.error_message());
   }
 
   NodeConverter node_converter;
diff --git a/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.h b/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.h
index 6518e7a10f587a687f2ee3258f5399d74d87364e..61fc6f36f7e5211e43c279506faf09624086d167 100644
--- a/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.h
+++ b/tensorflow/contrib/util/convert_graphdef_memmapped_format_lib.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_UTIL_CONVERT_GRAPHDEF_MEMMAPPED_FORMAT_LIB_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_UTIL_CONVERT_GRAPHDEF_MEMMAPPED_FORMAT_LIB_H_
+#ifndef TENSORFLOW_CONTRIB_UTIL_CONVERT_GRAPHDEF_MEMMAPPED_FORMAT_LIB_H_
+#define TENSORFLOW_CONTRIB_UTIL_CONVERT_GRAPHDEF_MEMMAPPED_FORMAT_LIB_H_
 
 #include <string>
 
@@ -31,4 +31,4 @@ Status ConvertConstantsToImmutable(const string& in_graph_filename,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_UTIL_CONVERT_GRAPHDEF_MEMMAPPED_FORMAT_LIB_H_
+#endif  // TENSORFLOW_CONTRIB_UTIL_CONVERT_GRAPHDEF_MEMMAPPED_FORMAT_LIB_H_
diff --git a/tensorflow/contrib/util/inspect_checkpoint.cc b/tensorflow/contrib/util/inspect_checkpoint.cc
index 39088aeaad68e26344b2e89ce10ae6da8026e481..9b578ceb07548b8d198f64bc859d31c92774a286 100644
--- a/tensorflow/contrib/util/inspect_checkpoint.cc
+++ b/tensorflow/contrib/util/inspect_checkpoint.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/util/tensor_slice_reader.h"
 
 namespace tensorflow {
diff --git a/tensorflow/contrib/verbs/BUILD b/tensorflow/contrib/verbs/BUILD
index 38a84ffb10e594568a18dbd06debf32545cb2229..80a5d07ea43531ed2532443b6ff9327b9ece6df7 100644
--- a/tensorflow/contrib/verbs/BUILD
+++ b/tensorflow/contrib/verbs/BUILD
@@ -99,7 +99,7 @@ cc_library(
     alwayslink = 1,
 )
 
-tf_cuda_library(
+cc_library(
     name = "rdma_rendezvous_mgr",
     srcs = ["rdma_rendezvous_mgr.cc"],
     hdrs = ["rdma_rendezvous_mgr.h"],
@@ -114,7 +114,7 @@ tf_cuda_library(
     ],
 )
 
-cc_library(
+tf_cuda_library(
     name = "rdma_mgr",
     srcs = ["rdma_mgr.cc"],
     hdrs = ["rdma_mgr.h"],
@@ -141,6 +141,8 @@ tf_cuda_library(
         "//conditions:default": [],
     }),
     deps = [
+        ":grpc_verbs_client",
+        ":verbs_service_proto_cc",
         ":verbs_util",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
diff --git a/tensorflow/contrib/verbs/README.md b/tensorflow/contrib/verbs/README.md
index 7c1c8ea45912be8c471efbe42f43e083639e91fc..58fed4e5cb4c24b0f21dfe9b99cf4c665d2591c7 100644
--- a/tensorflow/contrib/verbs/README.md
+++ b/tensorflow/contrib/verbs/README.md
@@ -24,66 +24,144 @@ The design is based on TensorFlow r1.0. An RDMA path is added between servers fo
 
 During the server setup, an RDMA manager is created to manage low-level RDMA components such as RDMA channel and RDMA adapter, an RDMA rendezvous manager is created to oversee send/recv operations between servers. Following the distributed TensorFlow design philosophy, the send operation is passive, i.e. merely placing a tensor in the local out-going table. It is the receive operation that actually initiates the tensor transfer.
 
-TensorFlow dynamically allocates memory for tensors that are to be sent or received. This causes difficulty for RDMA operations where pinned memory is required. Two remedies are possible, either the memory is pinned, transfer, then unpinned for each and every tensor to be transferred, or a buffer is pre-allocated and pinned for each tensor. The former incurs significant operation overhead since pinning and unpinning memory for each dynamically generated tensor is slow. The latter incurs large memory overhead and extra copying from the tensor to its pinned buffer, but may still be faster than the former. The second approach is adopted in this design. Each RDMA channel, representing a RDMA connection to a peer, contains a table of pinned buffers for all the seen tensors that requires transfer. It is assumed that the tensor size rarely changes across different steps. So only one buffer is created for the same tensor across all the steps. In the rare case when the tensor size does increases, the old buffer is discarded and new buffer of larger size is created and pinned.
+TensorFlow dynamically allocates memory for tensors that are to be sent or received. This causes difficulty for RDMA operations where pinned memory is required. Few remedies are possible:
+1. The memory is pinned, transferred, then unpinned for each and every tensor to be transferred. This incurs significant operation overhead since pinning and unpinning memory for each dynamically generated tensor is slow. 
+2. Buffer is pre-allocated and pinned for each tensor. This incurs large memory overhead and extra copying from the tensor to its pinned buffer, but may still be faster than the former.
+3. Following HKUST research on the use of GPU direct, and their [GDR implementation](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/gdr/README.md), there is a smart way to benefit from the TensorFlow allocation theme which is mostly pool based, i.e allocators pre-allocate a large memory block, and allocate the tensors from there. By attaching a custom Visitor to relevant allocators, we can do a single registration of the entire memory block, which zeros the registration overhead. Once the block is registered, each new tensor allocated will be at a registered address, which will allow us to do direct RDMA writes to it.
 
-When a tensor is prepared for transfer, it is first converted to TensorProto, then the proto is serialized to byte array and copied to the pinned buffer. The content of the buffer is transferred to the remote node via RDMA write. On the remote side, the process is reversed. This is illustrated in the diagram below. The conversion of TensorProto is introduced to simplify transfer of string-tensors. Also since the TensorProto lives in host memory, even if the origin tensor lives in the device, the pinned buffers are all allocated in the host memory.
-![TensorFlow RDMA path](./design_diagram.png)
+For best performance, we will adopt HKUST 0 copies approach in our solution. This means:
+
+1. Tensor writes will be done directly from the source tensor to the **result** tensor, with no memory copies in between. This should be done for all DMAable tensors which are located either on CPU or on a RDMA compatible GPU device (GPU direct). 
+2. Non DMAable tensors (CanMemCopy == false) will be serialized to a TensorProto on the sender side, RDMA written to a registered buffer on the receiver side, and then deserialized by the receiver.
+3. Tensors which are located on a non-RDMA-compatible GPU, will be RDMA written to a registered CPU **proxy** buffer on the receiver side, and then copied to GPU by the receiver.
 
-The following improvements can be made in the future. First, conversion to TensorProto and serialization can be avoided for numeric (float/int) tensors since their internal buffer can be access directly as byte array. Second, the pinned buffer may be allocated on device if the tensor is located in the device. This avoids extra device-to-host copy at the expense of extra device memory consumption.
 ## Design details
 
-### RDMA components
+### Terminology
 
-* **RDMA adapter:** The base for RDMA communications. It may contain multiple channels and buffers.  It is responsible for handling various incoming RDMA messages.
-* **RDMA channel:** Responsible for RDMA connection to a particular node. It manages multiple buffers. A channel has a callback table which stores all the callbacks for the requested tensors.
-* **RDMA buffer:** Responsible for sending or receiving data. It has a fixed size memory to store the data. It has a queue to store the pending jobs. There are three types of buffers, message buffer, ACK buffer and tensor buffer. A channel has two message buffers, two ack buffers and many tensor buffers.
-* **RDMA manager:** Manages the adapter and channels, including channel creation, channel setup via GRPC service, channel lookup, etc.
-* **RDMA rendezvous manager:** manages multiple rdma rendezvous.
-* **RDMA rendezvous:** a derived class of BaseRemoteRendezvous. This class is the back end for "send" and "recv" ops. When the sendrecv_op wants to send or receive a tensor, it calls the rendezvous' "send" and "recv" functions respectively. Rendezvous are identified by "step_id", a random number, so that tensors for different iterations don't get mixed up.
+* **Sender** - The node which sends the tensor.
+* **Receiver** - The node which receives the tensor.
+* **Result tensor** - The destination tensor, allocated on its appropriate device.
+* **Proxy tensor** - A CPU allocated tensor, which will be used in the case where the result tensor cannot be RDMA written to directly (GPU direct is disabled or not available). The RDMA write will therefore be done to the proxy tensor, and afterwards we will do a manual local copy from it to the result tensor.
 
-### The SEND operation
+### Messages
 
-In TensorFlow, when rendezvous sends a tensor, it merely puts a tensor in a local table in the corresponding rendezvous. If the tensor has been requested, a callback exists in the table. "send" will activate the callback, which tries to send the tensor across the node.
+* RDMA_MESSAGE_TENSOR_REQUEST
+* RDMA_MESSAGE_META_DATA_RESPONSE
+* RDMA_MESSAGE_TENSOR_RE_REQUEST
 
+### Transport protocol
 
-### The RECV operation
+The tensor transfer process is initiated when the receiver requests a tensor. In code it is done by calling **Rendezvous::Recv()** or **Rendezvous::RecvAsync()**. The TensorFlow base implementation handles the case where the requested tensor is located on the same node. The more interesting case where the requested tensor is located on a remote node (receiver != sender) is to be handled in a derivation of the pure virtual **BaseRemoteRendezvous::RecvFromRemoteAsync()**. TensorFlow provides a default GRPC based implementation which comes in the vanilla version but suffers in scalability when running large models. Our RDMA based implementation presumes to be more scalable. HKUST's contrib GDR implementation is more scalable than GRPC, and less scalable than ours, only because we did our evolution based on it.
 
-When a tensor is requested, rendezvous' recv function is called. The function first places a callback in the channel's callback table, which will be activated once the tensor is sent from the source. In the next step, a message is sent to notify the source of the requested tensor. Once the source receives the message, it will check locally for the tensor, if not found, a callback is placed in the table, otherwise, the tensor id will be placed at corresponding RDMA buffer's job queue for future transmission. When a tensor is scheduled to be transmitted, the RDMA buffer needs to have the memory allocated and initialized (registered with the remote buffer info). If the memory is not ready, the transmission is deferred, a message is sent to the destination to establish the memory first. The other case a transmission can be deferred is when the buffer is still being used by an on-going transmission.
+Our entry point is the implementation of **RdmaRemoteRendezvous::RecvFromRemoteAsync()**, located in rdma_rendezvous_mgr.cc. The implementation creates a new **RdmaTensorRequest** object, keyed by request index (uint32_t), stores it in a list of pending requests, and calls its **Start()** method. The **Start()** method basically does 2 things:
 
-### Three types of RDMA buffers
+1. Allocate the result tensor (and the proxy tensor if required).
+2. Send a **RDMA_MESSAGE_TENSOR_REQUEST** to the sender, containing the address of the destination tensor (result/proxy) for RDMA write.
 
-* **Message buffer:** responsible for sending message only.
-* **Ack buffer:** once a message is sent, the recipient needs to send an ack via the ack buffer to free up the message buffer. An ack buffer is exclusively for its coupled message buffer.
-* **Tensor buffer:** responsible for sending tensors. The recipient needs to send back a message to free up the sending buffer.
+In order to allocate the result and proxy tensors, we need to know the tensor's meta-data, i.e. shape and data-type for DMAable tensors, and proto-size for serialized tensors. Unfortunately, this information is only available on the sender side which complicates manners. In order to avoid sending extra messages for querying the meta-data at each step, we store a local meta-data cache per tensor, which will only be update upon changes. Based on the assumption that the meta-data of a tensor rarely changes between steps, we expect that on most times the cache will only be updated once. The sender is responsible to detect changes in the meta-data, and update the receiver. In order for the sender to know that the meta-data had changed, each **RDMA_MESSAGE_TENSOR_REQUEST** will contain the meta-data that the receiver had grabbed from the local cache. The sender will then compare the meta-data from the message to the tensor's new meta-data.
 
-### RDMA packet format
+When the sender receives an **RDMA_MESSAGE_TENSOR_REQUEST**, it will create a new **RdmaTensorResponse** object for the given request message, store it in a list of pending responses, and will invoke its **Start()** method. The **Start()** method does the following:
 
-|type|name_size|name|step_id|buffer_size|remote_addr|rkey|is_dead|data_type|tensor_shape|tensor_bytes|tensor_buffer|
+1. Grab the source tensor from the local table (In code, **RecvLocalAsync()**).
+2. If the source tensor is not DMAable, serialize it to a TensorProto.
+3. If the source tensor is located on a device which cannot be DMA written from, copy it to CPU.
+4. If it is the first time this tensor is requested, or if the tensor's meta-data changed:
+	1. Clone the tensor's data to be sent later.
+	2. Send a **RDMA_MESSAGE_META_DATA_RESPONSE** containing the new meta-data.
+5. Otherwise:
+	1. RDMA write the tensor (or TensorProto) to the destination address and rkey specified in the request message. The immediate value for the write will be the request index.
 
-### Six types of RDMA messages
-* RDMA_MESSAGE_ACK
-* RDMA_MESSAGE_BUFFER_IDLE
-* RDMA_MESSAGE_BUFFER_REQUEST
-* RDMA_MESSAGE_BUFFER_RESPONSE
-* RDMA_MESSAGE_TENSOR_REQUEST
-* RDMA_MESSAGE_TENSOR_WRITE
-
-### Actions upon receiving RDMA messages
-* RDMA_MESSAGE_ACK
-  * sender: mark local ack buffer idle.
-  * receiver: mark remote message buffer idle, send next item.
-* RDMA_MESSAGE_BUFFER_IDLE
-  * sender: mark local message buffer idle, send next item.
-  * receiver: send ack, set remote tensor buffer idle, send next item.
-* RDMA_MESSAGE_BUFFER_REQUEST
-  * sender: mark local message buffer idle, send next item.
-  * receiver: send ack, find or create tensor buffer, send BUFFER_RESPONSE.
-* RDMA_MESSAGE_BUFFER_RESPONSE
-  * sender: mark local message buffer idle, send next item.
-  * receiver: send ack, set remote buffer info, set local and remote buffer idle, send next item.
-* RDMA_MESSAGE_TENSOR_REQUEST
-  * sender: mark local message buffer idle, send next item.
-  * receiver: send ack, find or create tensor buffer, enqueue tensor id, send next item.
-* RDMA_MESSAGE_TENSOR_WRITE
-  * sender: mark local message buffer idle, send next item.
-  * receiver: run callback.
+
+When the receiver receives the **RDMA_MESSAGE_META_DATA_RESPONSE**, it will locate the relevant **RdmaTensorRequest** using the request index specified in the message, and invoke its **RecvTensorMetaData()** which does the following:
+
+1. Update the local meta-data cache.
+2. Reallocate the result/proxy tensors. 
+3. Re-send the tensor request. For traceability, the new message has a different name: **RDMA_MESSAGE_TENSOR_RE_REQUEST**.
+
+When the sender receives a **RDMA_MESSAGE_TENSOR_RE_REQUEST**, it will locate the relevant **RdmaTensorResponse** using the request index specified in the message, and invoke its **Resume()** method, which will RDMA write the contents of the tensor that was cloned earlier, to the new remote address specified in the re-request.
+
+When the receiver receives the RDMA write, it will locate the relevant **RdmaTensorRequest** using the request index which is the immediate value. It will then invoke its **RecvTensorContent()** which does the following:
+
+1. Proxy copy/deserialize if required.
+2. Invoke the done callback.
+3. Deallocate the result/proxy tensors and remove the request from the pending list.
+
+![alt text](verbs_with_0_copies.png "Transport protocol")
+
+### Additional design notes
+
+1. When the sender receives a tensor request, the source tensor may or may not be ready yet. The situation is handled through a process of tag matching:
+	* If the request arrives before the tensor is ready, then a callback is put in a local table, and will be invoked once the tensor arrives.
+	* If the tensor is ready before the request arives, than the tensor is put in a local table. When the request arrives, it will invoke the callback immediately.
+   In code it is done by calling **RecvLocalAsync()**, which receives the tensor's key, step-id, and the callback.
+2. When the callback is invoked, the relevant tensor is removed from the tag matching table. In the case where we need to send the tensor's meta-data, the **RdmaTensorResponse** will store a copy of the tensor until the re-request arrives.
+3. The sending of protocol messages (**RDMA_MESSAGE_TENSOR_REQUEST**, **RDMA_MESSAGE_META_DATA_RESPONSE** and **RDMA_MESSAGE_TENSOR_RE_REQUEST**) is done by the class **RdmaMessageBuffer**. All messages are sent using RDMA writes from/to fixed messages buffers. This implies that we cannot send on a specific channel more than one message at a time. In order to synchronize the messages, the **RdmaMessageBuffer** holds the a local and remote buffer statuses which can be either busy or idle. When a write is issued, both statuses will be changed to busy. When the write-complete event is received, the local status is changed to idle. When the write is received on the remote side, the remote side will parse the message, and return an ACK back to the sending side on which the sending side will update the remote status to idle. When both the local and remote statuses are idle, the next message can be sent.
+5. ACK writes are empty writes (hence they require no buffer) with immediate value 0xFFFFFFFE. Message writes have the immediate value 0xFFFFFFFF. All other writes are tensor-content writes whose immediate value is the request-index.
+
+### RDMA components
+
+* **enum RdmaImmDataType**       - Immediate types to distinguish between different RDMA writes on the remote side. Ack writes and control-message writes have a fixed immediate value. The rest of the writes are tensor writes and the immediate value is the relevant request index.
+* **enum  RdmaWriteIDType**      - Types to distinguish between different RDMA write-complete events: Ack, control message and tensor writes.
+* **class RdmaWriteID**          - Context for RDMA write complete events. Holds the RdmaWriteIDType and additional data.
+* **class RdmaTensorMetaData**   - Meta-data for a tensor (type, shape, is_dead, proto_size).
+* **class RdmaMemoryMgr**        - Manages the meta-data cache, and the registered memory regions.
+* **class RdmaTensorRequest**    - Holds and manages information for a single tensor request throughout the entire receive cycle. API:
+	* **Start()**                - Start the request sequence.
+		* Allocate the result tensor (and proxy tensor if required).
+		* Send RDMA_MESSAGE_TENSOR_REQUEST to the remote side.
+	* **RecvTensorMetaData()**   - Receive meta-data from the remote side.
+		* Update the local meta-data cache.
+		* Reallocate the result tensor (and proxy tensor if required).
+		* Re-send the request to the remote side.
+	* **RecvTensorContent()**    - Receive tensor content from the remote side (RDMA write was completed).
+		* Decode proto if required and/or move to GPU if the content was not written to it directly (GPU direct is not available).
+		* Invoke the done callback.
+* **class RdmaTensorResponse**   - Holds and manages information for a single tensor response throughout the entire send cycle. API:
+	* **Start()**                - Start the response sequence. 
+		* Find the tensor in the local tag-match table.
+		* Compare the tensor's meta-data to the meta-data in the message (taken from the requester's local cache). 
+			* If meta-data changed:
+				* Clone the tensor to be sent later.
+				* Send a meta-data update message and wait for re-request.
+			* Else:
+				* Send the tensor's content (using direct RDMA write).
+	* **Resume()**               - Resume the response sequence after a re-request. Send the tensor's content that was cloned earlier.
+	* **Destroy()**              - Destroy the response's resources and remove it form the pending list.
+* **class RdmaAdapter**          - The base for RDMA communications. It may contain multiple channels and buffers.  It is responsible for handling various incoming RDMA messages.
+* **class RdmaChannel**          - Responsible for RDMA connection to a particular node. It manages messagee buffers. A channel has a request table which stores all the pending tensor requests.
+* **class RdmaMessageBuffer**    - Responsible for sending or receiving messages. It has a fixed size memory to store the data. It has a queue to store the pending jobs. A channel has two message buffers one for tx and one for rx.
+* **class RdmaMgr**              - Manages the adapter and channels, including channel creation, channel setup via GRPC service, channel lookup, etc.
+* **class RdmaRendezvousMgr**    - Manages multiple rdma rendezvous.
+* **class RdmaRemoteRendezvous** - A derived class of BaseRemoteRendezvous. This class is the back end for "send" and "recv" ops. When the sendrecv_op wants to send or receive a tensor, it calls the rendezvous' "send" and "recv" functions respectively. Rendezvous are identified by "step_id", a random number, so that tensors for different iterations don't get mixed up.
+
+### Message structure:
+
+| type | name_size | name | step_id | request_index | remote_addr/checksum | rkey | is_dead | data_type | tensor_shape | tensor_bytes | error_status          |
+|------|---------- |------|---------|---------------|----------------------|------|---------|-----------|--------------|--------------|-----------------------|
+|  1B  |    2B     | 512  |  8B     |      8B       |           8B         |   4B |      1B |     XB    |    XB        |    8B        | Size - 4B, proto - XB |
+
+* **RDMA_MESSAGE_TENSOR_REQUEST**  - (receiver ==> sender) The original tensor request. 
+	* type - The message type.
+	* name (name_size) - Name of the requested tensor.
+	* step_id - Step ID.
+	* request_index - Request index.
+	* remote_addr/rkey - Address/rkey of the result/proxy tensor. Irrelevant for first-time request.
+	* is_dead/data_type/tensor_shape/tensor_bytes - The current meta-data as stored in the receiver local cache. The sender will use that information to know if the receiver's cache requires updating.
+* **RDMA_MESSAGE_META_DATA_RESPONSE**  - (sender ==> receiver) The meta-data update message in case meta-data had changed (or if it is the first time the tensor is requested).
+	* type - The message type.
+	* request_index - Request index.
+	* is_dead/data_type/tensor_shape/tensor_bytes - The up-to-date meta-data.
+	* checksum - In data validation mode, this will hold the checksum of the source tensor.
+* **RDMA_MESSAGE_TENSOR_RE_REQUEST** - (receiver ==> sender) Tensor re-request after meta-data update and reallocation of result/proxy tensors.
+	* type - The message type.
+	* name (name_size) - Name of the requested tensor.
+	* step_id - Step ID.
+	* request_index - Request index.
+	* remote_addr/rkey - Address/rkey of the reallocated result/proxy tensor.
+* **RDMA_MESSAGE_ERROR_STATUS** - (sender ==> receiver) Notify the receiver that an error had occured on the sender side, so it can propagate it to the upper levels.
+	* type - The message type.
+	* name (name_size) - Name of the requested tensor.
+	* step_id - Step ID.
+	* request_index - Request index.
+	* error_status - The error status (code, message, details).
diff --git a/tensorflow/contrib/verbs/grpc_verbs_client.h b/tensorflow/contrib/verbs/grpc_verbs_client.h
index 358977f92543e1a38b594cf45cdbff34f89277be..2cfaa4986cb0923d9687cb77b8e1116a937594a1 100644
--- a/tensorflow/contrib/verbs/grpc_verbs_client.h
+++ b/tensorflow/contrib/verbs/grpc_verbs_client.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_GRPC_VERBS_CLIENT_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_GRPC_VERBS_CLIENT_H_
+#ifndef TENSORFLOW_CONTRIB_GRPC_VERBS_CLIENT_H_
+#define TENSORFLOW_CONTRIB_GRPC_VERBS_CLIENT_H_
 
 #include "tensorflow/contrib/verbs/grpc_verbs_service_impl.h"
 #include "tensorflow/contrib/verbs/verbs_service.pb.h"
@@ -47,4 +47,4 @@ class GrpcVerbsClient {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_GRPC_VERBS_CLIENT_H_
+#endif  // TENSORFLOW_CONTRIB_GRPC_VERBS_CLIENT_H_
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service.cc b/tensorflow/contrib/verbs/grpc_verbs_service.cc
index f2af6b79fba6a480afbfe88fcbefcbf8a6670ce6..742f946c9536973eb8a6a11afda1b32ae4a7726b 100644
--- a/tensorflow/contrib/verbs/grpc_verbs_service.cc
+++ b/tensorflow/contrib/verbs/grpc_verbs_service.cc
@@ -122,17 +122,15 @@ Status GrpcVerbsService::GetRemoteAddressSync(
   rc->SetRemoteAddress(ra, false);
   rc->Connect();
   int i = 0;
-  int idx[] = {1, 0, 3, 2};
-  std::vector<RdmaBuffer*> mb(rc->message_buffers());
-  CHECK_EQ(request->mr_size(), 4);
+  int idx[] = {1, 0};
+  std::vector<RdmaMessageBuffer*> mb(rc->message_buffers());
+  CHECK_EQ(request->mr_size(), RdmaChannel::kNumMessageBuffers);
   for (const auto& mr : request->mr()) {
     // the connections are crossed, i.e.
     // local tx_message_buffer <---> remote rx_message_buffer_
     // local rx_message_buffer <---> remote tx_message_buffer_
-    // local tx_ack_buffer <---> remote rx_ack_buffer_
-    // local rx_ack_buffer <---> remote tx_ack_buffer_
-    // hence idx[] = {1, 0, 3, 2}.
-    RdmaBuffer* rb = mb[idx[i]];
+    // hence idx[] = {1, 0}.
+    RdmaMessageBuffer* rb = mb[idx[i]];
     RemoteMR rmr;
     rmr.remote_addr = mr.remote_addr();
     rmr.rkey = mr.rkey();
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service.h b/tensorflow/contrib/verbs/grpc_verbs_service.h
index aa509602b51e7749547f1ff8eb5193acd1a3ec65..444c863b942ef8bce8d54d59765563b12eb6087e 100644
--- a/tensorflow/contrib/verbs/grpc_verbs_service.h
+++ b/tensorflow/contrib/verbs/grpc_verbs_service.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_H_
+#ifndef TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_H_
+#define TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_H_
 
 #ifdef TENSORFLOW_USE_VERBS
 
@@ -69,4 +69,4 @@ void SetNewVerbsService(GrpcVerbsService** handle, const WorkerEnv* worker_env,
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_USE_VERBS
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_H_
+#endif  // TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_H_
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service_impl.h b/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
index 86431ca030c38c56155801202714ee4a49b764df..1f0f10517e98a32ae882c027330091928f1a6ee2 100644
--- a/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
+++ b/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_GRPC_VERBS_SERVICE_IMPL_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_GRPC_VERBS_SERVICE_IMPL_H_
+#ifndef TENSORFLOW_CONTRIB_GRPC_VERBS_SERVICE_IMPL_H_
+#define TENSORFLOW_CONTRIB_GRPC_VERBS_SERVICE_IMPL_H_
 
 #include "grpc++/impl/codegen/async_stream.h"
 #include "grpc++/impl/codegen/async_unary_call.h"
@@ -86,4 +86,4 @@ class VerbsService GRPC_FINAL {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_GRPC_VERBS_SERVICE_IMPL_H_
+#endif  // TENSORFLOW_CONTRIB_GRPC_VERBS_SERVICE_IMPL_H_
diff --git a/tensorflow/contrib/verbs/patch_notes_verbs_with_0_copies.md b/tensorflow/contrib/verbs/patch_notes_verbs_with_0_copies.md
new file mode 100644
index 0000000000000000000000000000000000000000..956b8f2147cf8154b6f1ade006d7bff194864c9b
--- /dev/null
+++ b/tensorflow/contrib/verbs/patch_notes_verbs_with_0_copies.md
@@ -0,0 +1,87 @@
+## Verbs implementation to use direct tensor writes (0 copies)
+
+### Motivation:
+
+Following HKUST research on the use of GPU direct, and their [GDR implementation](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/gdr/README.md), we wish to adopt the 0 copies approach and apply it to the current verbs implementation, while keeping the current implementation advantages, such as configurability and the use of RDMA for control messages.
+
+### Performance:
+
+Compared with the current GRPC, verbs and GDR implementation, the result implementation gave the best performance for every model, with any number of nodes. For VGG16 on 8 nodes with 4 P100 GPUs each, the prototype beat the second place by over 15%.
+
+### Implementation requirements:
+
+1. Tensor writes need to be done directly from the source Tensor to the destination Tensor, with no memory copies in between. This should be done for all DMAble tensors which are located either on CPU or on a RDMA compatible GPU device (GPU direct). 
+2. Non DMAble tensors (CanMemCopy == false) will be serialized to proto on the sender side, RDMA written to a registered buffer on the receiver side, and then deserialized by the receiver.
+3. Tensors which are located on a non-RDMA-compatible GPU, will be RDMA written to a registered CPU proxy buffer on the receiver side, and then copied to GPU by the receiver.
+
+### Implementation constrains:
+
+For best stability and proof of correctness, we will divide the implementation to two stages:
+1. At first stage we will keep changes to the current implementation to the minimum possible. The expense will be that we may have unused or unnecessary code leftovers, which may also affect performance. 
+2. At second stage, we will re-iterate over the code and remove irrelevant code parts.
+The design of the solution aims that we will achieve both stages with relative ease. 
+
+### Design guidelines:
+
+1. Since we do not want to do any unnecessary memory copying, we will no longer allocate a fixed CPU buffer as the destination for the RDMA write. Instead we will do the writing directly to the result tensor, or if the result tensor is on a device which does not support RDMA, we will do the writing to a proxy CPU tensor and then copy its content to the result tensor.
+2. The address of the destination Tensor needs to be sent to the sender side for writing, meaning that the result/proxy tensor should be pre-allocated on the receiver side, prior to sending the tensor request. In order to do that, we need to know its meta-data, i.e. shape and data-type for DMAble tensors, and proto-size for serialized tensors. Unfortunately, this information is only available on the sender side which complicates manners. In order to avoid sending extra messages for querying the meta-data on each step, we store a local meta-data cache per tensor. Based on the assumption that the meta-data of a tensor rarely changes between steps, we expect that on most times the cache will only be updated once. When the sender receives a request for a tensor, if it is the first time this tensor is requested, or in the rare case that the meta-data did change, the sender will first send a meta-data response, on which the receiver will update the local cache, and reallocate the result/proxy tensors if required. When the receiver sends the tensor request, it will contain also the meta-data currently stored in its local cache, so the sender can compare it to see if there was a change.
+3. When the sender writes the tensor content to the result tensor, no additional data is being written with it. That means we need to reside on ibverbs immediate (uint32_t) to indicate which request we are responding to (in order to trigger the receive callback). The easiest and most elegant way is to key the recv callback with a unique request_index (uint32_t), instead of the current key_with_step_id (string). 
+4. Since the sender no longer writes the tensor from/to fixed buffers, we no longer need to schedule the writes using the local/remote status. In addition we no longer rely on the RmdaTensorBuffer members as the source/destination addresses and rkey/lkey. Instead, each RdmaTensorBuffer will hold multiple "Response" objects (one per step-id), from which we derive destination address and rkey. The source address and lkey are always the ones of the source Tensor.
+5. With the addition of tensor pre-allocation, we noticed there is a large code similarity between sending the first tensor request and re-sending the request in case of meta-data changes. After implementing a common method for tensor pre-allocation, it turned out that implementation becomes much simpler by encapsulating the process of request sending/re-sending, meta-data response callback and content response callback, all in a single "Request" class. The request class holds all the relevant request information, which reduces excessive parameter passing and lambda capturing. This decision is purely for elegance and code simplicity, and we decided to implement it in first stage because it makes the implementation much easier.
+
+### New types/classes:
+
+* **enum RdmaImmDataType** - Immediate types to distinguish between different RDMA writes on the remote side. Ack writes and control-message writes have a fixed immediate value. The rest of the writes are tensor writes and the immediate value is the relevant request index.
+* **enum  RdmaWriteIDType**    - Types to distinguish between different RDMA write-complete events: Ack, control message, tensor DMA write and tensor proto write.
+* **class RdmaWriteID**        - Context for RDMA write complete events. Holds the RdmaWriteIDType and additional data.
+* **class RemoteAddressContext** - Remote address information (address + mr). Will be passed as write context for tensor proto writes.
+* **class RdmaTensorMetaData** - Meta-data for a tensor (type, shape, is_dead, proto_size).
+* **class RdmaMemoryMgr**      - Manages the meta-data cache, and the registered memory regions.
+* **class RdmaTensorRequest**  - Holds and manages information for a single tensor request throughout the entire receive cycle. API:
+	* Start() - Start the request.
+	* RecvTensorMetaData() - Receive meta-data from the remote side.
+	* RecvTensorContent() - Receive tensor content from the remote side and invoke the done() callback. 
+* **class RdmaTensorResponse** - Holds information for a single tensor response, such as destination address and rkey.
+
+### Protocol changes:
+
+The protocol messages themselves will remain mostly unchanged at the first stage, but will be used differently, as described below. The current messages structures already have most of the required fields for the new implementation. The only change is the "buffer_size" field which is no longer used since we are no longer sending additional information with the tensor, and thus it is now always equal to the "tensor_bytes" field. Instead, we use that field to pass the "request_index".
+
+### Message structure:
+
+| type | name_size | name | step_id | request_index | remote_addr | rkey | is_dead | data_type | tensor_shape | tensor_bytes |
+|------|---------- |------|---------|---------------|-------------|------|---------|-----------|--------------|--------------|
+|  1B  |    2B     | 512  |  8B     |      8B       |         8B  |   4B |      1B |     XB    |    XB        |    8B        |
+
+* **RDMA_MESSAGE_TENSOR_REQUEST**  - (receiver ==> sender) The original tensor request. 
+	* type - The message type.
+	* name (name_size) - Name of the requested tensor.
+	* step_id - Step ID.
+	* request_index - Request index.
+	* remote_addr/rkey - Address/rkey of the result/proxy tensor. Irrelevant for first-time request.
+	* is_dead/data_type/tensor_shape/tensor_bytes - The current meta-data as stored in the receiver local cache. The sender will use that information to know if the receiver's cache requires updating.
+* **RDMA_MESSAGE_BUFFER_REQUEST**  - (sender ==> receiver) The meta-data update message in case meta-data had changed (or if it is the first time the tensor is requested).
+	* type - The message type.
+	* request_index - Request index.
+	* is_dead/data_type/tensor_shape/tensor_bytes - The up-to-date meta-data.
+* **RDMA_MESSAGE_BUFFER_RESPONSE** - (receiver ==> sender) Tensor re-requset after meta-data update and reallocation of result/proxy tensors.
+	* type - The message type.
+	* name (name_size) - Name of the requested tensor.
+	* step_id - Step ID.
+	* request_index - Request index.
+	* remote_addr/rkey - Address/rkey of the reallocated result/proxy tensor.
+	* is_dead/data_type/tensor_shape/tensor_bytes - The new meta-data. Will be removed in the next phase.
+* **RDMA_MESSAGE_TENSOR_WRITE**    - (sender ==> receiver) No longer sent. There is only a direct write of the tensor content to the result/proxy tensor. Request index passed as the immediate value of the write.
+* **RDMA_MESSAGE_TENSOR_IDLE**     - (receiver ==> sender) No longer sent.
+
+![alt text](verbs_with_0_copies_phase1_protocol.jpg "Phase 1 message protocol")
+
+### Second stage optimizations:
+1. Remove unused code leftovers.
+2. Remove the ACK buffer completely, since we can rely completely on its immediate value.
+
+### Future optimizations:
+1. Map the tensor names to indexes, to significantly reduce the request message size.
+2. Understand the purpose of empty tensors and if we can skip remote fetching for them.
+3. Consider concatenating multiple requests and/or using multiple message buffers.
+4. Consider a no-request architecture.
diff --git a/tensorflow/contrib/verbs/rdma.cc b/tensorflow/contrib/verbs/rdma.cc
index ae9a384565a6ad0e63a6cf3acf07c591c65f0637..86350a08e57e5050f18d019fe80d70f6381c1f7d 100644
--- a/tensorflow/contrib/verbs/rdma.cc
+++ b/tensorflow/contrib/verbs/rdma.cc
@@ -15,58 +15,48 @@ limitations under the License.
 
 #ifdef TENSORFLOW_USE_VERBS
 
-#include "tensorflow/contrib/verbs/rdma.h"
 #include <fcntl.h>
 #include <cstdlib>
-#include <fcntl.h>
-#include "tensorflow/contrib/verbs/verbs_util.h"
+
+#include "tensorflow/contrib/verbs/rdma.h"
+#include "tensorflow/contrib/verbs/verbs_service.pb.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
 #if GOOGLE_CUDA
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
 #include "tensorflow/core/common_runtime/gpu/process_state.h"
 #endif
 #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/random/random.h"
-#include "tensorflow/core/lib/core/threadpool.h"
 
 namespace tensorflow {
 
 #define RoCE_V2 "RoCE v2"
 
 namespace {
-// hash name to 32-bit integer
-uint32_t NameHash(const string& name) {
-  return Hash32(name.data(), name.size(), 0x1234ABCD);
-}
 
 // convenience function for printing message
 string MessageTypeToString(RdmaMessageType rmt) {
   switch (rmt) {
-    case RDMA_MESSAGE_ACK:
-      return "RDMA_MESSAGE_ACK";
-      break;
-    case RDMA_MESSAGE_BUFFER_IDLE:
-      return "RDMA_MESSAGE_BUFFER_IDLE";
+    case RDMA_MESSAGE_META_DATA_UPDATE:
+      return "RDMA_MESSAGE_META_DATA_UPDATE";
       break;
-    case RDMA_MESSAGE_BUFFER_REQUEST:
-      return "RDMA_MESSAGE_BUFFER_REQUEST";
-      break;
-    case RDMA_MESSAGE_BUFFER_RESPONSE:
-      return "RDMA_MESSAGE_BUFFER_RESPONSE";
+    case RDMA_MESSAGE_TENSOR_RE_REQUEST:
+      return "RDMA_MESSAGE_TENSOR_RE_REQUEST";
       break;
     case RDMA_MESSAGE_TENSOR_REQUEST:
       return "RDMA_MESSAGE_TENSOR_REQUEST";
       break;
-    case RDMA_MESSAGE_TENSOR_WRITE:
-      return "RDMA_MESSAGE_TENSOR_WRITE";
-      break;
     default:
       return "UNKNOWN MESSAGE";
   }
@@ -347,7 +337,7 @@ uint32_t set_param(uint32_t default_val, const char* env_param) {
 
 enum ibv_mtu set_mtu(uint8_t port_num, ibv_context* context) {
   ibv_port_attr port_attr;
-  enum ibv_mtu mtu;
+  enum ibv_mtu mtu = IBV_MTU_512;
   string mtu_s;
   int rc, mtu_i;
 
@@ -459,106 +449,79 @@ void RdmaAdapter::Process_CQ() {
     CHECK_GE(ne, 0);
     for (int i = 0; i < ne; ++i) {
       CHECK(wc_[i].status == IBV_WC_SUCCESS)
-          << "Failed status \n" << ibv_wc_status_str(wc_[i].status) << " "
-          << wc_[i].status << " " << static_cast<int>(wc_[i].wr_id) << " "
-          << wc_[i].vendor_err;
+          << "Failed status \n"
+          << ibv_wc_status_str(wc_[i].status) << " " << wc_[i].status << " "
+          << static_cast<int>(wc_[i].wr_id) << " " << wc_[i].vendor_err;
       if (wc_[i].opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
         RdmaChannel* rc = reinterpret_cast<RdmaChannel*>(wc_[i].wr_id);
         // put back a recv wr.
         rc->Recv();
         // imm_data is the index of RX buffer in the buffer table.
         uint32_t imm_data = wc_[i].imm_data;
-        RdmaBuffer* rb = rc->FindBuffer(imm_data);
+        RdmaMessageBuffer* rb;
         RdmaMessage rm;
-        RdmaMessage::ParseMessage(rm, rb->buffer_);
-        VLOG(2) << "recv RDMA message: " << MessageTypeToString(rm.type_);
 
-        if (rm.type_ == RDMA_MESSAGE_ACK) {
+        if (imm_data == RDMA_IMM_DATA_ACK) {
           // receive an ack to a message
           rb = rc->tx_message_buffer_;
           rb->SetBufferStatus(remote, idle);
           rb->SendNextItem();
-        } else if (rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) {
-          // received a request-for-tensor message
-          // send ack to release remote tx message buffer
-          RdmaBuffer* ab = rc->tx_ack_buffer_;
-          ab->SendNextItem();
-          // find or create buffer
-          RdmaBuffer* tb = rc->FindOrCreateBuffer(rm.name_);
-          string key_with_step_id =
-              VerbsUtil::AppendStepidToKey(rm.name_, rm.step_id_);
-          tb->EnqueueItem(key_with_step_id);
-          // send the next tensor
-          worker_env_->compute_pool->Schedule([tb]() { tb->SendNextItem(); });
-        } else if (rm.type_ == RDMA_MESSAGE_BUFFER_IDLE) {
-          // receive tensor-buffer-ready message
-          // send ack to release remote tx message buffer
-          RdmaBuffer* ab = rc->tx_ack_buffer_;
-          ab->SendNextItem();
-          // find buffer
-          RdmaTensorBuffer* tb =
-              reinterpret_cast<RdmaTensorBuffer*>(rc->FindBuffer(rm.name_));
-          tb->SetBufferStatus(remote, idle);
-          worker_env_->compute_pool->Schedule([tb]() { tb->ReSendNextItem(); });
-        } else if (rm.type_ == RDMA_MESSAGE_BUFFER_REQUEST) {
-          // remote host requests to create a tensor buffer;
-          // send ack to release remote tx message buffer
-          RdmaBuffer* ab = rc->tx_ack_buffer_;
-          ab->SendNextItem();
-          // find or create the buffer
-          RdmaBuffer* tb = rc->FindOrCreateBuffer(rm.name_, TENSOR);
-          RemoteMR rmr;
-          rmr.remote_addr = rm.remote_addr_;
-          rmr.rkey = rm.rkey_;
-          tb->SetRemoteMR(rmr, true);
-          tb->CreateCPUBuffer(rm.buffer_size_);
-          // create RDMA_MESSAGE_BUFFER_RESPONSE message
-          RdmaMessage br;
-          br.type_ = RDMA_MESSAGE_BUFFER_RESPONSE;
-          br.name_size_ = rm.name_.size();
-          br.name_ = rm.name_;
-          br.buffer_size_ = rm.buffer_size_;
-          br.remote_addr_ = reinterpret_cast<uint64_t>(tb->buffer_);
-          br.rkey_ = tb->self_->rkey;
-          string message = RdmaMessage::CreateMessage(br);
-          RdmaBuffer* mb = rc->tx_message_buffer_;
-          mb->EnqueueItem(message);
-          mb->SendNextItem();
-        } else if (rm.type_ == RDMA_MESSAGE_BUFFER_RESPONSE) {
-          // remote creates a buffer and responds
-          // send ack to release remote tx message buffer
-          RdmaBuffer* ab = rc->tx_ack_buffer_;
-          ab->SendNextItem();
-          // find buffer
-          RdmaTensorBuffer* tb =
-              reinterpret_cast<RdmaTensorBuffer*>(rc->FindBuffer(rm.name_));
-          CHECK(rm.buffer_size_ == tb->size_)
-              << "rm.buffer_size = " << rm.buffer_size_
-              << "tb->size_ = " << tb->size_ << "rm.name_ = " << rm.name_;
-          RemoteMR rmr;
-          rmr.remote_addr = rm.remote_addr_;
-          rmr.rkey = rm.rkey_;
-          tb->SetRemoteMR(rmr, true);
-          tb->SetBufferStatus(local, idle);
-          tb->SetBufferStatus(remote, idle);
-          worker_env_->compute_pool->Schedule([tb]() { tb->ReSendNextItem(); });
-        } else if (rm.type_ == RDMA_MESSAGE_TENSOR_WRITE) {
-          // tensor RDMA write completed
-          worker_env_->compute_pool->Schedule([rm, rc]() {
-            string key_with_step_id =
-                VerbsUtil::AppendStepidToKey(rm.name_, rm.step_id_);
-            rc->RunRecvCallback(key_with_step_id);
-          });
+          continue;
         }
-      } else if (wc_[i].opcode == IBV_WC_RDMA_WRITE) {
-        RdmaBuffer* rb = reinterpret_cast<RdmaBuffer*>(wc_[i].wr_id);
-        rb->SetBufferStatus(local, idle);
-        RdmaMessage rm;
+
+        if (imm_data <= RDMA_IMM_MAX_REQUEST_ID) {
+          // receive a tensor RDMA write
+          uint32_t request_index = imm_data;
+          RdmaTensorRequest* request = rc->GetTensorRequest(request_index);
+          request->RecvTensorContent();
+          continue;
+        }
+
+        // receive a control message
+        rb = rc->rx_message_buffer_;
         RdmaMessage::ParseMessage(rm, rb->buffer_);
-        VLOG(2) << "sent RDMA message: " << MessageTypeToString(rm.type_);
-        if (rm.type_ != RDMA_MESSAGE_ACK) {
-          worker_env_->compute_pool->Schedule([rb]() { rb->SendNextItem(); });
+        RdmaMessageBuffer::SendAck(rc);
+        RDMA_LOG(1) << "Step 0x" << std::hex << rm.step_id_ << std::dec
+                    << ": Received " << MessageTypeToString(rm.type_) << " "
+                    << "#" << rm.request_index_ << ": " << rm.name_;
+
+        if (rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) {
+          RdmaTensorResponse* response = rc->AddTensorResponse(rm);
+          response->Start();
+        } else if (rm.type_ == RDMA_MESSAGE_META_DATA_UPDATE) {
+          RdmaTensorRequest* request = rc->GetTensorRequest(rm.request_index_);
+          request->RecvTensorMetaData(rm.data_type_, rm.tensor_shape_,
+                                      rm.is_dead_, rm.tensor_bytes_);
+#ifdef RDMA_DATA_VALIDATION
+          request->RecvTensorChecksum(rm.checksum_);
+#endif
+        } else if (rm.type_ == RDMA_MESSAGE_TENSOR_RE_REQUEST) {
+          RdmaTensorResponse* response = rc->UpdateTensorResponse(rm);
+          response->Resume();
+        } else if (rm.type_ == RDMA_MESSAGE_ERROR_STATUS) {
+          RdmaTensorRequest* request = rc->GetTensorRequest(rm.request_index_);
+          request->RecvErrorStatus(rm.status_);
         }
+      } else if (wc_[i].opcode == IBV_WC_RDMA_WRITE) {
+        RdmaWriteID* wr_id = reinterpret_cast<RdmaWriteID*>(wc_[i].wr_id);
+        RDMA_LOG(2) << "Write complete of type " << wr_id->write_type;
+        switch (wr_id->write_type) {
+          case RDMA_WRITE_ID_ACK:
+            break;
+          case RDMA_WRITE_ID_MESSAGE: {
+            RdmaMessageBuffer* rb =
+                reinterpret_cast<RdmaMessageBuffer*>(wr_id->write_context);
+            rb->SetBufferStatus(local, idle);
+            rb->SendNextItem();
+            break;
+          }
+          case RDMA_WRITE_ID_TENSOR_WRITE: {
+            RdmaTensorResponse* response =
+                reinterpret_cast<RdmaTensorResponse*>(wr_id->write_context);
+            response->Destroy();
+          }
+        }
+        delete wr_id;
       }
     }
   }
@@ -577,7 +540,7 @@ int RdmaChannel::PingPostRecv() {
 int RdmaChannel::PingPostSend() {
   struct ibv_send_wr wr, *bad_wr;
   memset(&wr, 0, sizeof(wr));
-  wr.wr_id = (uint64_t) this;
+  wr.wr_id = (uint64_t)this;
   wr.sg_list = &ping_sge_list_;
   wr.num_sge = 1;
   wr.opcode = IBV_WR_SEND;
@@ -588,8 +551,10 @@ int RdmaChannel::PingPostSend() {
 
 RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
                          const string remote_name)
-    : adapter_(adapter), local_name_(local_name), remote_name_(remote_name) {
-
+    : adapter_(adapter),
+      local_name_(local_name),
+      remote_name_(remote_name),
+      request_serial_(0) {
   struct ibv_sge list;
 
   mr_ = ibv_reg_mr(adapter_->pd_, ping_buff_, kPingBuffSize,
@@ -651,29 +616,15 @@ RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
 
   // create message and ack buffers, then initialize the tables.
   {
-    const string buffer_names[] = {"tx_message_buffer", "rx_message_buffer",
-                                   "tx_ack_buffer",     "rx_ack_buffer"};
+    const string buffer_names[] = {"tx_message_buffer", "rx_message_buffer"};
     tx_message_buffer_ = new RdmaMessageBuffer(this, buffer_names[0]);
     rx_message_buffer_ = new RdmaMessageBuffer(this, buffer_names[1]);
-    tx_ack_buffer_ = new RdmaAckBuffer(this, buffer_names[2]);
-    rx_ack_buffer_ = new RdmaAckBuffer(this, buffer_names[3]);
     message_buffers_.reserve(kNumMessageBuffers);
     message_buffers_.push_back(tx_message_buffer_);
     message_buffers_.push_back(rx_message_buffer_);
-    message_buffers_.push_back(tx_ack_buffer_);
-    message_buffers_.push_back(rx_ack_buffer_);
     // create buffer on host
     tx_message_buffer_->CreateCPUBuffer(RdmaMessage::kRdmaMessageBufferSize);
     rx_message_buffer_->CreateCPUBuffer(RdmaMessage::kRdmaMessageBufferSize);
-    tx_ack_buffer_->CreateCPUBuffer(RdmaMessage::kRdmaAckBufferSize);
-    rx_ack_buffer_->CreateCPUBuffer(RdmaMessage::kRdmaAckBufferSize);
-    // bt_mu_.lock() is not used in constructor.
-    for (int i = 0; i < kNumMessageBuffers; i++) {
-      uint32_t index = NameHash(buffer_names[i]);
-      buffer_table_.insert({index, message_buffers_[i]});
-      buffer_index_name_table_.insert({index, buffer_names[i]});
-      buffer_name_index_table_.insert({buffer_names[i], index});
-    }
   }
   CHECK(PingPostRecv() == 0) << "Couldn't post receive from " << remote_name_
                              << " with error " << std::strerror(errno);
@@ -684,8 +635,6 @@ RdmaChannel::~RdmaChannel() {
   CHECK(!ibv_destroy_qp(qp_)) << "Failed to destroy QP";
   delete tx_message_buffer_;
   delete rx_message_buffer_;
-  delete tx_ack_buffer_;
-  delete rx_ack_buffer_;
 }
 
 void RdmaChannel::SetRemoteAddress(const RdmaAddress& ra, bool override) {
@@ -711,119 +660,36 @@ void RdmaChannel::SetRemoteAddress(const RdmaAddress& ra, bool override) {
 void RdmaChannel::Recv() {
   struct ibv_recv_wr wr;
   memset(&wr, 0, sizeof(wr));
-  wr.wr_id = (uint64_t) this;
+  wr.wr_id = (uint64_t)this;
   struct ibv_recv_wr* bad_wr;
   CHECK(!ibv_post_recv(qp_, &wr, &bad_wr)) << "Failed to post recv";
 }
 
-// Lookup 32-bit buffer index from buffer name
-// Args:
-//   buffer_name: name of the buffer
-// Returns:
-//   32-bit index
-uint32_t RdmaChannel::LookupBufferIndex(const string& buffer_name) {
-  mutex_lock lock{bt_mu_};
-  BufferNameIndexTable::iterator iter =
-      buffer_name_index_table_.find(buffer_name);
-  CHECK(iter != buffer_name_index_table_.end());
-  return iter->second;
-}
-
-// Find a buffer by its 32-bit index
-// Args:
-//   index: 32-bit hash code of the tensor buffer name
-// Returns:
-//   name of the tensor buffer
-RdmaBuffer* RdmaChannel::FindBuffer(const uint32_t index) {
-  mutex_lock lock{bt_mu_};
-  BufferTable::iterator iter = buffer_table_.find(index);
-  CHECK(iter != buffer_table_.end());
-  return iter->second;
-}
-
-// Find a buffer by its name
-// Args:
-//   name: name of the buffer
-// Returns:
-//   the named rdma buffer
-RdmaBuffer* RdmaChannel::FindBuffer(const string& name) {
-  uint32_t index = LookupBufferIndex(name);
-  return FindBuffer(index);
-}
-
-// Find a buffer if it exists, otherwise create one.
-// The memory inside the created buffer is not allocated.
-// Args:
-//   name: the name of the buffer
-//   buffer_type: TENSOR, MESSAGE or ACK.
-// Returns:
-//   the named buffer
-RdmaBuffer* RdmaChannel::FindOrCreateBuffer(const string& name,
-                                            BufferType buffer_type) {
-  mutex_lock lock{bt_mu_};
-  RdmaBuffer* rb;
-  // find index
-  BufferNameIndexTable::iterator iter = buffer_name_index_table_.find(name);
-  if (iter != buffer_name_index_table_.end()) {
-    uint32_t index = iter->second;
-    // find buffer
-    BufferTable::iterator iter = buffer_table_.find(index);
-    CHECK(iter != buffer_table_.end());
-    rb = iter->second;
-  } else {
-    uint32_t index = NameHash(name);
-    if (buffer_type == TENSOR) {
-      rb = new RdmaTensorBuffer(this, name);
-    } else if (buffer_type == MESSAGE) {
-      rb = new RdmaMessageBuffer(this, name);
-    } else if (buffer_type == ACK) {
-      rb = new RdmaAckBuffer(this, name);
-    }
-    buffer_name_index_table_.insert({name, index});
-    buffer_index_name_table_.insert({index, name});
-    buffer_table_.insert({index, rb});
+RdmaTensorRequest* RdmaChannel::InsertTensorRequest(
+    const string& key, int64 step_id, Device* dst_dev,
+    const Rendezvous::Args recv_args,
+    const RdmaTensorRequest::RecvDoneCallback& done) {
+  mutex_lock lock{ct_mu_};
+  uint32_t request_index = request_serial_++;
+  if (request_serial_ > RDMA_IMM_MAX_REQUEST_ID) {
+    request_serial_ = 0;
   }
-  CHECK(rb);
-  return rb;
+  RdmaTensorRequest request(request_index, key, step_id, this, dst_dev,
+                            recv_args, done);
+  auto it = request_table_.emplace(request_index, request);
+  return &it.first->second;
 }
 
-// Insert callback to the callback_table.
-// The callback is activated when the corresponding tensor is received.
-// Arg:
-//   key: the name of the tensor
-//   recv_done: the callback associated with the tensor.
-// Returns:
-//   None
-void RdmaChannel::InsertRecvCallback(const string& key,
-                                     std::function<void()> recv_done) {
+void RdmaChannel::RemoveTensorRequest(uint32_t request_index) {
   mutex_lock lock{ct_mu_};
-  callback_table_.insert({key, recv_done});
+  request_table_.erase(request_index);
 }
 
-// Remove callback from the callback_table.
-// Arg:
-//   key: the name of the tensor
-// Returns:
-//   None
-void RdmaChannel::RemoveRecvCallback(const string& key) {
+RdmaTensorRequest* RdmaChannel::GetTensorRequest(uint32_t request_index) {
   mutex_lock lock{ct_mu_};
-  callback_table_.erase(key);
-}
-
-// Run named callback in the callback_table.
-// Arg:
-//   key: the name of the tensor
-// Returns:
-//   None
-void RdmaChannel::RunRecvCallback(const string& key) {
-  std::function<void()> recv_done;
-  {
-    mutex_lock lock{ct_mu_};
-    CallbackTable::iterator iter = callback_table_.find(key);
-    CHECK(iter != callback_table_.end());
-    recv_done = iter->second;
-  }
-  recv_done();
+  RequestTable::iterator iter = request_table_.find(request_index);
+  CHECK(iter != request_table_.end());
+  return &iter->second;
 }
 
 void RdmaChannel::Connect() {
@@ -865,11 +731,11 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) {
     attr.ah_attr.grh.traffic_class = adapter_->params_.traffic_class;
 
     int r;
-    CHECK(!(r = ibv_modify_qp(qp_, &attr, IBV_QP_STATE | IBV_QP_AV |
-                                              IBV_QP_PATH_MTU |
-                                              IBV_QP_DEST_QPN | IBV_QP_RQ_PSN |
-                                              IBV_QP_MAX_DEST_RD_ATOMIC |
-                                              IBV_QP_MIN_RNR_TIMER)))
+    CHECK(!(r = ibv_modify_qp(qp_, &attr,
+                              IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU |
+                                  IBV_QP_DEST_QPN | IBV_QP_RQ_PSN |
+                                  IBV_QP_MAX_DEST_RD_ATOMIC |
+                                  IBV_QP_MIN_RNR_TIMER)))
         << "QP to Ready to Receive " << r;
 
     memset(&attr, 0, sizeof(ibv_qp_attr));
@@ -880,33 +746,30 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) {
     attr.rnr_retry = 7; /* infinite */
     attr.max_rd_atomic = 1;
 
-    CHECK(!(r = ibv_modify_qp(qp_, &attr, IBV_QP_STATE | IBV_QP_TIMEOUT |
-                                              IBV_QP_RETRY_CNT |
-                                              IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN |
-                                              IBV_QP_MAX_QP_RD_ATOMIC)))
+    CHECK(!(r = ibv_modify_qp(qp_, &attr,
+                              IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT |
+                                  IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN |
+                                  IBV_QP_MAX_QP_RD_ATOMIC)))
         << "QP to Ready to Send " << r;
 
     connected_ = true;
   } else {
-    LOG(INFO) << "channel already connected";
+    RDMA_LOG(2) << "channel already connected";
   }
 }
 
-RdmaBuffer::RdmaBuffer(RdmaChannel* channel, string name)
+RdmaMessageBuffer::RdmaMessageBuffer(RdmaChannel* channel, string name)
     : channel_(channel), name_(name) {}
 
-RdmaBuffer::~RdmaBuffer() {
+RdmaMessageBuffer::~RdmaMessageBuffer() {
   CHECK(!ibv_dereg_mr(self_)) << "ibv_dereg_mr failed";
   FreeBuffer();
 }
 
-void RdmaBuffer::FreeBuffer() {
+void RdmaMessageBuffer::FreeBuffer() {
   if ((buffer_ != nullptr) && buffer_on_host_) {
     free(buffer_);
   }
-  // TODO
-  // release buffer if it is on device.
-  // We don't support RDMABuffer on device at this moment.
 }
 
 // Allocate CPU memory for the Rdma buffer
@@ -915,7 +778,7 @@ void RdmaBuffer::FreeBuffer() {
 //   lock: whether or not mutex_lock the process to protect concurrency.
 // Returns:
 //   None
-void RdmaBuffer::CreateCPUBuffer(size_t size, bool lock) {
+void RdmaMessageBuffer::CreateCPUBuffer(size_t size, bool lock) {
   CHECK(size > 0);
   if (lock) {
     mu_.lock();
@@ -943,7 +806,7 @@ void RdmaBuffer::CreateCPUBuffer(size_t size, bool lock) {
 //   override: whether override existing information
 // Returns:
 //   None
-void RdmaBuffer::SetRemoteMR(RemoteMR rmr, bool override) {
+void RdmaMessageBuffer::SetRemoteMR(RemoteMR rmr, bool override) {
   mutex_lock lock{mu_};
   if ((override) || (remote_status_ == none)) {
     remote_.remote_addr = rmr.remote_addr;
@@ -956,63 +819,51 @@ void RdmaBuffer::SetRemoteMR(RemoteMR rmr, bool override) {
 }
 
 // Put a task in the buffer's job queue
-void RdmaBuffer::EnqueueItem(string item) {
+void RdmaMessageBuffer::EnqueueItem(string item) {
   mutex_lock lock{mu_};
   queue_.push(item);
 }
 
 // Rdma-Write the content of the buffer
-void RdmaBuffer::Write(uint32_t imm_data, size_t buffer_size) {
+void RdmaMessageBuffer::Write(uint32_t imm_data, size_t buffer_size) {
+  Write(channel_, imm_data, buffer_size, (uint64_t)buffer_, self_->lkey,
+        remote_.remote_addr, remote_.rkey, RDMA_WRITE_ID_MESSAGE, this);
+}
+
+// Generalized Write method
+void RdmaMessageBuffer::Write(const RdmaChannel* channel, uint32_t imm_data,
+                              size_t buffer_size, uint64_t src_addr,
+                              uint32_t lkey, uint64_t remote_addr,
+                              uint32_t rkey, RdmaWriteIDType write_type,
+                              void* write_context) {
   struct ibv_sge list;
-  list.addr = (uint64_t)buffer_;
+  list.addr = src_addr;
   list.length = buffer_size;
-  list.lkey = self_->lkey;
+  list.lkey = lkey;
 
   struct ibv_send_wr wr;
   memset(&wr, 0, sizeof(wr));
-  wr.wr_id = (uint64_t) this;
+  wr.wr_id = (uint64_t) new RdmaWriteID(write_type, write_context);
   wr.sg_list = &list;
   wr.num_sge = 1;
   wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
   wr.send_flags = IBV_SEND_SIGNALED;
   wr.imm_data = imm_data;
-  wr.wr.rdma.remote_addr = (uint64_t)remote_.remote_addr;
-  wr.wr.rdma.rkey = remote_.rkey;
+  wr.wr.rdma.remote_addr = remote_addr;
+  wr.wr.rdma.rkey = rkey;
 
   struct ibv_send_wr* bad_wr;
-  CHECK(!ibv_post_send(channel_->qp_, &wr, &bad_wr)) << "Failed to post send";
-}
-
-RdmaAckBuffer::RdmaAckBuffer(RdmaChannel* channel, string name)
-    : RdmaBuffer(channel, name) {}
-
-RdmaMessageBuffer::RdmaMessageBuffer(RdmaChannel* channel, string name)
-    : RdmaBuffer(channel, name) {}
-
-RdmaTensorBuffer::RdmaTensorBuffer(RdmaChannel* channel, string name)
-    : RdmaBuffer(channel, name) {}
-
-RdmaTensorBuffer::~RdmaTensorBuffer() {
-  for (Itable it = retable.begin(); it != retable.end(); ++it) {
-    delete (it->second);
-  }
+  CHECK(!ibv_post_send(channel->qp_, &wr, &bad_wr)) << "Failed to post send";
 }
 
 // Send the next ack from the buffer's job queue.
-void RdmaAckBuffer::SendNextItem() {
-  uint32_t imm_data = LookupBufferIndex("rx_ack_buffer");
-  RdmaMessage rm;
-  rm.name_ = "rx_ack_buffer";
-  rm.type_ = RDMA_MESSAGE_ACK;
-  rm.name_size_ = rm.name_.size();
-  string message = RdmaMessage::CreateMessage(rm);
-  memcpy(buffer_, message.data(), message.size());
-  Write(imm_data, message.size());
+void RdmaMessageBuffer::SendAck(const RdmaChannel* channel) {
+  Write(channel, RDMA_IMM_DATA_ACK, 0, 0, 0, 0, 0, RDMA_WRITE_ID_ACK, nullptr);
 }
 
 // Send the next message from the buffer's job queue.
 void RdmaMessageBuffer::SendNextItem() {
-  uint32_t imm_data = LookupBufferIndex("rx_message_buffer");
+  uint32_t imm_data = RDMA_IMM_DATA_MESSAGE;
   mu_.lock();
   if (!queue_.empty() && (local_status_ == idle) && (remote_status_ == idle)) {
     local_status_ = busy;
@@ -1029,244 +880,390 @@ void RdmaMessageBuffer::SendNextItem() {
   }
 }
 
-Rendezvous::DoneCallback RdmaTensorBuffer::getRecvTensorCallback(
-    const string& key_with_step_id, const string& key, int64 step_id,
-    const Rendezvous::ParsedKey& parsed) {
-  Rendezvous::DoneCallback cb = [this, key_with_step_id, key, step_id, parsed](
-      const Status& status, const Rendezvous::Args& send_args,
-      const Rendezvous::Args& recv_args, const Tensor& in, bool is_dead) {
-    CHECK(status.ok()) << "RecvLocalAsync was not ok, key" << key_with_step_id
-                       << " error message: " << status.error_message();
-    size_t buffer_size = RdmaMessage::kMessageTotalBytes;
-    size_t tensor_bytes = 0;
-    // Figures out which device the tensor is hosted on.
-    Device* src_dev = nullptr;
-    Status s = channel_->adapter_->worker_env_->device_mgr->LookupDevice(
-        parsed.src_device, &src_dev);
-    CHECK(s.ok()) << "src device not found";
-    // Does the device have the right incarnation number we expect?
-    CHECK(src_dev->attributes().incarnation() == parsed.src_incarnation)
-        << "RecvTensor expects a different device incarnation: "
-        << parsed.src_incarnation << " vs. "
-        << src_dev->attributes().incarnation()
-        << ". Your worker job was probably restarted. Check your "
-        << "worker job for the reason why it was restarted.";
-    Device* dst_dev = nullptr;
-    // destination is on CPU.
-    s = channel_->adapter_->worker_env_->device_mgr->LookupDevice("CPU:0",
-                                                                  &dst_dev);
-    CHECK(s.ok()) << "dst device not found";
-    AllocatorAttributes dst_alloc_attr;
-    dst_alloc_attr.set_on_host(true);
-
-    bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
-    // string tensor needs to be serialized
-    Tensor copy;
-    TensorProto proto;
-    if (src_dev->tensorflow_gpu_device_info() &&
-        (!send_args.alloc_attrs.on_host())) {
 #if GOOGLE_CUDA
-      CHECK(send_args.device_context) << "send dev name: " << src_dev->name()
-                                      << " gpu_info: "
-                                      << src_dev->tensorflow_gpu_device_info();
-
-      if (can_memcpy) {
-        AllocatorAttributes host_alloc_attrs;
-        host_alloc_attrs.set_gpu_compatible(true);
-        host_alloc_attrs.set_on_host(true);
-        Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
-        copy = Tensor(alloc, in.dtype(), in.shape());
-        tensor_bytes = in.TotalBytes();
-        buffer_size += tensor_bytes;
-        GPUUtil::CopyGPUTensorToCPU(
-            src_dev, send_args.device_context, &in, &copy,
-            [this, copy, tensor_bytes, buffer_size, key, in, step_id,
-             key_with_step_id, is_dead, send_args, recv_args](const Status& s) {
-              CHECK(s.ok()) << "copy tensor from gpu sync";
-              StringPiece copy_buf;
-              copy_buf = copy.tensor_data();
-              PostCopyOperations(true, buffer_size, tensor_bytes, key, in,
-                                 step_id, is_dead, key_with_step_id, &copy,
-                                 NULL, &copy_buf, send_args, recv_args);
-            });
-      } else {
-        // "val" is on a GPU. No longer uses GPUUtil to fill the proto, use
-        // aync instead
-        GPUUtil::SetProtoFromGPU(
-            in, src_dev, send_args.device_context, &proto, is_dead,
-	    [this, proto, buffer_size, key, in, step_id, key_with_step_id,
-            is_dead, send_args, recv_args](const Status& s) mutable {
-              CHECK(s.ok()) << "copy proto from gpu sync";
-              auto tensor_bytes = proto.ByteSize();
-              buffer_size += tensor_bytes;
-              PostCopyOperations(false, buffer_size, tensor_bytes, key, in,
-                                 step_id, is_dead, key_with_step_id, NULL,
-                                 &proto, NULL, send_args, recv_args);
-            });
-      }
+static void CountCopies(const std::string& key, void* src_addr, void* dst_addr,
+                        size_t tensor_bytes, bool is_gpu_to_cpu) {
+#ifdef RDMA_COUNT_COPIES
+  static uint64_t numGPUToCPUCopies = 0;
+  static uint64_t numGPUToCPUCopiedBytes = 0;
+  static uint64_t numCPUToGPUCopies = 0;
+  static uint64_t numCPUToGPUCopiedBytes = 0;
+  static uint64_t numTotalCopies = 0;
+
+  if (is_gpu_to_cpu) {
+    ++numGPUToCPUCopies;
+    numGPUToCPUCopiedBytes += tensor_bytes;
+  } else {
+    ++numCPUToGPUCopies;
+    numCPUToGPUCopiedBytes += tensor_bytes;
+  }
+  if ((++numTotalCopies % 0x400) == 0) {
+    RDMA_LOG(0) << "Tensor copies:"
+                << " GPU to CPU: " << numGPUToCPUCopies << " ("
+                << numGPUToCPUCopiedBytes << " Bytes)"
+                << " CPU to GPU: " << numCPUToGPUCopies << " ("
+                << numCPUToGPUCopiedBytes << " Bytes)";
+  }
+  RDMA_LOG(2) << "Copying tensor " << key << " From: " << src_addr
+              << " To: " << dst_addr;
+#endif  // RDMA_COUNT_COPIES
+}
 #endif  // GOOGLE_CUDA
-    } else {
-      // tensor is in CPU memory.
-      StringPiece copy_buf;
-      if (can_memcpy) {
-        copy_buf = in.tensor_data();
-        tensor_bytes = in.TotalBytes();
-      } else {
-        in.AsProtoTensorContent(&proto);
-        tensor_bytes = proto.ByteSize();
-      }
-      buffer_size += tensor_bytes;
-      PostCopyOperations(can_memcpy, buffer_size, tensor_bytes, key, in,
-                         step_id, is_dead, key_with_step_id, &copy, &proto,
-                         &copy_buf, send_args, recv_args);
+
+#ifdef RDMA_DATA_VALIDATION
+static uint64_t Checksum(Device* device, const DeviceContext* device_context,
+                         const Tensor& in) {
+  uint64 checksum = 0;
+  if (DataTypeCanUseMemcpy(in.dtype())) {
+#if GOOGLE_CUDA
+    if (in.TotalBytes() == 0) {
+      return 0;
     }
-  };
-  return cb;
+    checksum = (device_context != nullptr)
+                   ? GPUUtil::Checksum(device, device_context, in)
+                   : GPUUtil::Checksum(in);
+#endif  // GOOGLE_CUDA
+  } else {
+    string s = in.SummarizeValue(999999);
+    checksum = Hash64(s.c_str(), s.size(), 0);
+  }
+  return checksum;
 }
 
-// Send the next tensor from the buffer's job queue.
-void RdmaTensorBuffer::SendNextItem() {
-  // get the key
-  string key_with_step_id = "";
-  {
-    mutex_lock lock{mu_};
-    if (!queue_.empty()) {
-      key_with_step_id = queue_.front();
-      queue_.pop();
+static void ValidateChecksum(uint64_t expected, uint64_t actual,
+                             const Tensor& in, uint32_t request_index,
+                             const std::string& key, const std::string& msg) {
+  RDMA_LOG(2) << "Request #" << request_index << ": " << key
+              << ": Checksum: " << std::hex << " Expected = 0x" << expected
+              << ". Actual = 0x" << actual << ".";
+
+  if (expected != actual) {
+    // Checksum failed. There is one case where this is allowed - if the
+    // tensor is an AssignAdd of the global step. Since the data-validation
+    // always postpones the Tensor response in order to send a checksum message,
+    // it is possible that the global-step was updated while the response was
+    // still in queue.
+    if ((in.TotalBytes() == 8) && (in.dtype() == DT_INT64)) {
+      int64_t prev_val = *(int64_t*)DMAHelper::base(&in) - 1;
+      actual = Hash64((const char*)&prev_val, 8, 0);
+    }
+    if (expected != actual) {
+      LOG(FATAL) << "[" << msg << "]: Checksum validation failed for request #"
+                 << request_index << ": " << key << std::hex << " "
+                 << DataTypeString(in.dtype()) << " "
+                 << in.shape().DebugString() << " (0x" << in.TotalBytes()
+                 << " bytes): "
+                 << " Expected 0x" << expected << ". Got 0x" << actual << ".";
     }
   }
+}
+#endif  // RDMA_DATA_VALIDATION
+
+#if GOOGLE_CUDA
+// Sync the 'done' operation on the GPU stream, but without all the data
+// copying.
+static void StreamGPUOp(Device* gpu_device, const DeviceContext* device_context,
+                        StatusCallback done) {
+  Tensor dummy1, dummy2;
+  GPUUtil::CopyGPUTensorToCPU(gpu_device, device_context, &dummy1, &dummy2,
+                              done);
+}
+#endif  // GOOGLE_CUDA
+
+RdmaTensorResponse* RdmaChannel::AddTensorResponse(const RdmaMessage& rm) {
+  mutex_lock lock{mu_};
+  auto it =
+      responses_table_.emplace(rm.request_index_, RdmaTensorResponse(this, rm));
+  CHECK(it.second) << "Response with the ID " << rm.request_index_
+                   << " already exists.";
+  return &it.first->second;
+}
+
+RdmaTensorResponse* RdmaChannel::UpdateTensorResponse(const RdmaMessage& rm) {
+  mutex_lock lock{mu_};
+  auto it = responses_table_.find(rm.request_index_);
+  CHECK(it != responses_table_.end()) << "No response found.";
+  RdmaTensorResponse* response = &it->second;
+  response->Update(rm);
+  return response;
+}
+
+void RdmaChannel::RemoveTensorResponse(uint32_t request_index) {
+  mutex_lock lock{mu_};
+  responses_table_.erase(request_index);
+}
+
+void RdmaTensorResponse::Start() {
+  Rendezvous::ParsedKey parsed;
+  Status s = Rendezvous::ParseKey(rm_.name_, &parsed);
+  if (!s.ok()) {
+    SendErrorStatus(s);
+    return;
+  }
 
-  // send the tensor if a key is acquired.
-  if (key_with_step_id != "") {
-    VLOG(2) << "try to send tensor: " << key_with_step_id;
-    string key;
-    int64 step_id;
-    VerbsUtil::GetKeyAndStepId(key_with_step_id, key, step_id);
-    CHECK(key.compare(name_) == 0);
-    Rendezvous::ParsedKey parsed;
-    Rendezvous::ParseKey(key, &parsed);
-    Rendezvous::DoneCallback cb =
-        getRecvTensorCallback(key_with_step_id, key, step_id, parsed);
-    channel_->adapter_->worker_env_->rendezvous_mgr->RecvLocalAsync(step_id,
-                                                                    parsed, cb);
+  channel_->adapter_->worker_env_->rendezvous_mgr->RecvLocalAsync(
+      rm_.step_id_, parsed,
+      [this, parsed](const Status& status, const Rendezvous::Args& send_args,
+                     const Rendezvous::Args& recv_args, const Tensor& in,
+                     bool is_dead) {
+        CHECK(status.ok()) << "RecvLocalAsync was not ok."
+                           << " error message: " << status.error_message();
+        RecvHandler(parsed, send_args, recv_args, in, is_dead);
+      });
+}
+
+void RdmaTensorResponse::Resume() { SendContent(*tensor_, *proto_, is_dead_); }
+
+// Helper for RecvTensor. Validates "key" and returns the source
+// device in "*src_dev".
+Status RdmaTensorResponse::PrepareRecvTensor(
+    const Rendezvous::ParsedKey& parsed, Device** src_dev) {
+  // Figures out which device the tensor is hosted on.
+  string local_name = DeviceNameUtils::LocalName(parsed.src_device);
+  TF_RETURN_IF_ERROR(channel_->adapter_->worker_env_->device_mgr->LookupDevice(
+      local_name, src_dev));
+
+  // Does the device have the right incarnation number we expect?
+  if ((*src_dev)->attributes().incarnation() != parsed.src_incarnation) {
+    return errors::Aborted(
+        "RecvTensor expects a different device incarnation: ",
+        parsed.src_incarnation, " vs. ", (*src_dev)->attributes().incarnation(),
+        ". Your worker job was probably restarted. Check your "
+        "worker job for the reason why it was restarted.");
   }
+
+  return Status::OK();
 }
 
-void RdmaTensorBuffer::ReSendNextItem() {
-  // get the key
-  string key_with_step_id = "";
-  {
-    mutex_lock lock{mu_};
-    if (!requeue.empty()) {
-      key_with_step_id = requeue.front();
-      requeue.pop();
-    }
+void RdmaTensorResponse::RecvHandler(Rendezvous::ParsedKey parsed,
+                                     const Rendezvous::Args& send_args,
+                                     const Rendezvous::Args& recv_args,
+                                     const Tensor& in, bool is_dead) {
+  Status s = PrepareRecvTensor(parsed, &src_dev_);
+  if (!s.ok()) {
+    SendErrorStatus(s);
+    return;
   }
 
-  // send the tensor if a key is acquired.
-  if (key_with_step_id != "") {
-    VLOG(2) << "try to send tensor: " << key_with_step_id;
-    string key;
-    int64 step_id;
-    VerbsUtil::GetKeyAndStepId(key_with_step_id, key, step_id);
-    CHECK(key.compare(name_) == 0);
-    Rendezvous::ParsedKey parsed;
-    Rendezvous::ParseKey(key, &parsed);
-    Rendezvous::DoneCallback cb =
-        getRecvTensorCallback(key_with_step_id, key, step_id, parsed);
-    ReItem* item;
-    {
-      mutex_lock lock{mu_};
-      Itable it = retable.find(key_with_step_id);
-      CHECK(it != retable.end()) << "Could not find dup-recv context";
-      item = it->second;
-      retable.erase(it);
+  meta_data_changed_ = TensorMetaDataChanged(in, is_dead);
+#ifdef RDMA_DATA_VALIDATION
+  // Always send a meta data message with the source checksum
+  meta_data_changed_ = rm_.type_ == RDMA_MESSAGE_TENSOR_REQUEST;
+  checksum_ = Checksum(src_dev_, send_args.device_context, in);
+#endif
+  bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
+  // string tensor needs to be serialized
+  Tensor copy;
+  TensorProto proto;
+  const bool on_host = send_args.alloc_attrs.on_host();
+  if (src_dev_->tensorflow_gpu_device_info() && !on_host) {
+#if GOOGLE_CUDA
+    DeviceContext* send_dev_context = send_args.device_context;
+    CHECK(send_dev_context)
+        << "send dev name: " << src_dev_->name()
+        << " gpu_info: " << src_dev_->tensorflow_gpu_device_info();
+
+    if (can_memcpy) {
+      // If the tensor is located on a GDR compatible GPU, there is no need to
+      // copy it. We can send directly from the source, just need to make sure
+      // we are in sync with the GPU stream.
+      // If the tensor's meta-data changed however, we will need to clone it,
+      // so anyway we'll have to copy it from GPU to CPU first. If at some
+      // point in time Clone() is changed to only save a shallow copy, we can
+      // skip the copy here as well.
+      if ((in.TotalBytes() > 0) && !meta_data_changed_ &&
+          (RdmaMemoryMgr::Singleton().FindMemoryRegion(
+               (void*)DMAHelper::base(&in), in.TotalBytes()) != nullptr)) {
+        StreamGPUOp(src_dev_, send_dev_context,
+                    [this, in, proto, is_dead](const Status& s) {
+                      Send(in, proto, is_dead, s);
+                    });
+        return;
+      }
+
+      // The tensor must be copied from GPU to CPU, because either:
+      // 1. The tensor is located on a non GDR compatible GPU.
+      // 2. The tensor's meta-data has changed.
+      Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
+      copy = Tensor(alloc, in.dtype(), in.shape());
+      CountCopies(rm_.name_, (void*)DMAHelper::base(&in),
+                  (void*)DMAHelper::base(&copy), in.TotalBytes(), true);
+      GPUUtil::CopyGPUTensorToCPU(
+          src_dev_, send_dev_context, &in, &copy,
+          [this, copy, proto, is_dead](const Status& s) {
+            Send(copy, proto, is_dead, s);
+          });
+    } else {
+      GPUUtil::SetProtoFromGPU(
+          in, src_dev_, send_args.device_context, &proto, is_dead,
+          [this, in, proto, is_dead](const Status& s) mutable {
+            Send(in, proto, is_dead, s);
+          });
+    }
+#else
+    SendErrorStatus(errors::Internal("No GPU device in process"));
+#endif  // GOOGLE_CUDA
+  } else {
+    // tensor is in CPU memory.
+    if (!can_memcpy) {
+      in.AsProtoTensorContent(&proto);
     }
-    cb(Status::OK(), item->send_args, item->recv_args, item->in, item->is_dead);
-    delete (item);
+    Send(in, proto, is_dead, Status::OK());
+  }
+}
+
+void RdmaTensorResponse::Send(const Tensor& in, const TensorProto& proto,
+                              bool is_dead, const Status& status) {
+  if (!status.ok()) {
+    SendErrorStatus(status);
+    return;
+  }
+  bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
+  bool proto_size_changed =
+      (!can_memcpy) && (proto.ByteSize() != rm_.tensor_bytes_);
+  if (meta_data_changed_ || proto_size_changed) {
+    Clone(in, proto, is_dead);
+    SendMetaData(in, proto, is_dead);
+  } else {
+    SendContent(in, proto, is_dead);
   }
 }
 
-void RdmaTensorBuffer::PostCopyOperations(
-    bool can_memcpy, size_t buffer_size, size_t tensor_bytes, const string& key,
-    const Tensor& in, int64 step_id, bool is_dead,
-    const string& key_with_step_id, const Tensor* copy,
-    const TensorProto* proto, const StringPiece* copy_buf,
-    const Rendezvous::Args& send_args, const Rendezvous::Args& recv_args) {
-  // prepare message
+bool RdmaTensorResponse::TensorMetaDataChanged(const Tensor& in, bool is_dead) {
+  return (rm_.data_type_ != in.dtype()) || (rm_.tensor_shape_ != in.shape()) ||
+         (rm_.is_dead_ != is_dead);
+}
+
+void RdmaTensorResponse::Clone(const Tensor& in, const TensorProto& proto,
+                               bool is_dead) {
+  // Clone the data to be sent later. For simplicity, we clone the tensor's
+  // data even if it is already a copy. Performance is less of a concern here
+  // since the meta-data hardly ever changes. The reason we create a copy, is
+  // that some tensors share their buffer between different step-ids, so the
+  // tensor content may change before re-request was completed.
+  bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
+  if (can_memcpy && (in.TotalBytes() > 0)) {
+    AllocatorAttributes host_alloc_attrs;
+    host_alloc_attrs.set_nic_compatible(true);
+    host_alloc_attrs.set_on_host(true);
+    Allocator* allocator = src_dev_->GetAllocator(host_alloc_attrs);
+    tensor_ = new Tensor(allocator, in.dtype(), in.shape());
+    memcpy(DMAHelper::base(tensor_), DMAHelper::base(&in), in.TotalBytes());
+  } else {
+    tensor_ = new Tensor(in.dtype(), in.shape());
+  }
+  if (!can_memcpy) {
+    proto_ = new TensorProto(proto);
+  }
+  is_dead_ = is_dead;
+}
+
+void RdmaTensorResponse::SendMetaData(const Tensor& in,
+                                      const TensorProto& proto, bool is_dead) {
+  RDMA_LOG(2) << "Request #" << rm_.request_index_
+              << ": Meta data changed: " << rm_.name_;
+  bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
+  size_t tensor_bytes = (can_memcpy) ? in.TotalBytes() : proto.ByteSize();
+
+  // Send meta-data update:
   RdmaMessage rm;
-  rm.name_size_ = key.size();
-  rm.name_ = key;
+  rm.type_ = RDMA_MESSAGE_META_DATA_UPDATE;
+  rm.name_size_ = rm_.name_.size();
+  rm.name_ = rm_.name_;
   rm.tensor_shape_ = in.shape();
   rm.data_type_ = in.dtype();
-  rm.step_id_ = step_id;
+  rm.step_id_ = rm_.step_id_;
   rm.is_dead_ = is_dead;
   rm.tensor_bytes_ = tensor_bytes;
-  rm.buffer_size_ = buffer_size;
-  mu_.lock();
-  if (local_status_ == none || (buffer_size > size_ && local_status_ == idle &&
-                                remote_status_ == idle)) {
-    if ((local_status_ != none) && (buffer_size > size_)) {
-      VLOG(2) << "Extend RDMA buffer from " << size_ << " to " << buffer_size;
-    }
-    CreateCPUBuffer(buffer_size, false);
-    // Need to be received again, put into the re-recv queue and the table
-    requeue.push(key_with_step_id);
-    ReItem* item = new ReItem(send_args, recv_args, in, is_dead);
-    retable.insert(std::pair<string, ReItem*>(key_with_step_id, item));
-    mu_.unlock();
-    // no longer used: put back the key since it is not sent;
-    // ask the remote to create the same buffer
-    rm.type_ = RDMA_MESSAGE_BUFFER_REQUEST;
-    rm.remote_addr_ = reinterpret_cast<uint64_t>(buffer_);
-    rm.rkey_ = self_->rkey;
-    string message = RdmaMessage::CreateMessage(rm);
-    channel_->tx_message_buffer_->EnqueueItem(message);
-    channel_->tx_message_buffer_->SendNextItem();
-  } else if ((local_status_ == idle) && (remote_status_ == idle)) {
-    // both buffers are ready, send the tensor
-    local_status_ = busy;
-    remote_status_ = busy;
-    // local/remote_status_ won't be set back to idle
-    // unitl Write() is successful
-    mu_.unlock();
-    if (!((buffer_size == size_ && rm.data_type_ != DT_STRING) ||
-          (buffer_size <= size_ && rm.data_type_ == DT_STRING))) {
-      VLOG(2) << "Tensor and buffer size do not agree,"
-              << " buffer_size = " << size_
-              << " requested tensor size = " << buffer_size << in.DebugString();
-    }
-    uint32_t imm_data = LookupBufferIndex(key);
-    rm.type_ = RDMA_MESSAGE_TENSOR_WRITE;
-    string message = RdmaMessage::CreateMessage(rm);
-    memcpy(buffer_, message.data(), message.size());
-    if (!is_dead) {
-      // copy the tensor buffer content
-      void* output = static_cast<void*>(static_cast<char*>(buffer_) +
-                                        RdmaMessage::kTensorBufferStartIndex);
-      CHECK(tensor_bytes + RdmaMessage::kTensorBufferStartIndex <= size_);
-      if (can_memcpy) {
-        CHECK(copy != NULL) << "callback missing pointer to copy tensor";
-        CHECK(copy_buf != NULL) << "callback missing pointer to copy buffer";
-        CHECK(copy_buf->size() == tensor_bytes)
-            << "unexpected tensor size: " << copy_buf->size()
-            << " != " << tensor_bytes;
-        memcpy(output, copy_buf->data(), tensor_bytes);
-      } else {
-        CHECK(proto != NULL) << "callback missing pointer to proto tensor";
-        proto->SerializeToArray(output, tensor_bytes);
+  rm.request_index_ = rm_.request_index_;
+#ifdef RDMA_DATA_VALIDATION
+  rm.checksum_ = checksum_;
+#endif
+  RDMA_LOG(1) << "Step 0x" << std::hex << rm.step_id_ << std::dec
+              << ": Sending RDMA_MESSAGE_META_DATA_UPDATE #"
+              << rm.request_index_ << ": " << rm.name_
+              << " (shape = " << rm.tensor_shape_.DebugString() << "."
+              << " data-type = " << DataTypeString(rm.data_type_) << "."
+              << " is-dead = " << rm.is_dead_ << ")";
+
+  string message = RdmaMessage::CreateMessage(rm);
+  channel_->tx_message_buffer_->EnqueueItem(message);
+  channel_->tx_message_buffer_->SendNextItem();
+}
+
+void RdmaTensorResponse::SendContent(const Tensor& in, const TensorProto& proto,
+                                     bool is_dead) {
+  bool can_memcpy = DataTypeCanUseMemcpy(in.dtype());
+  size_t tensor_bytes = (can_memcpy) ? in.TotalBytes() : proto.ByteSize();
+  uint32_t imm_data = rm_.request_index_;
+  if (!is_dead) {
+    if (can_memcpy) {
+      src_buffer_ = const_cast<TensorBuffer*>(DMAHelper::buffer(&in));
+      if (src_buffer_ != nullptr) {
+        src_buffer_->Ref();  // Keep buffer alive until write is complete
+        src_addr_ = src_buffer_->data();
+        mr_ = RdmaMemoryMgr::Singleton().FindMemoryRegion(src_addr_,
+                                                          tensor_bytes);
       }
     } else {
-      buffer_size = RdmaMessage::kMessageTotalBytes;
+      RDMA_LOG(2) << "Encoding proto: " << rm_.name_
+                  << " (Size: " << tensor_bytes << ") " << in.DebugString();
+      src_addr_ = malloc(tensor_bytes);
+      mr_ = ibv_reg_mr(channel_->adapter_->pd_, src_addr_, tensor_bytes,
+                       IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+      proto.SerializeToArray(src_addr_, tensor_bytes);
     }
-    Write(imm_data, buffer_size);
   } else {
-    // Need to be received again, put into the re-recv queue and the table
-    requeue.push(key_with_step_id);
-    ReItem* item = new ReItem(send_args, recv_args, in, is_dead);
-    retable.insert(std::pair<string, ReItem*>(key_with_step_id, item));
-    mu_.unlock();
+    tensor_bytes = 0;
+  }
+
+  uint32_t lkey = (mr_ == nullptr) ? 0 : mr_->lkey;
+  RDMA_LOG(1) << "Step 0x" << std::hex << rm_.step_id_ << std::dec
+              << ": Sending tensor content #" << rm_.request_index_ << " from "
+              << std::hex << src_addr_ << " (0x" << lkey << ")"
+              << " to " << rm_.remote_addr_ << " (0x" << rm_.rkey_
+              << "): " << rm_.name_ << " (size: 0x" << std::hex << tensor_bytes
+              << ")";
+
+  RdmaMessageBuffer::Write(channel_, imm_data, tensor_bytes,
+                           (uint64_t)src_addr_, lkey, rm_.remote_addr_,
+                           rm_.rkey_, RDMA_WRITE_ID_TENSOR_WRITE, this);
+}
+
+void RdmaTensorResponse::SendErrorStatus(const Status& status) {
+  RdmaMessage rm;
+  rm.type_ = RDMA_MESSAGE_ERROR_STATUS;
+  rm.name_size_ = rm_.name_.size();
+  rm.name_ = rm_.name_;
+  rm.step_id_ = rm_.step_id_;
+  rm.request_index_ = rm_.request_index_;
+  rm.status_ = status;
+  LOG(ERROR) << "Step 0x" << std::hex << rm.step_id_ << std::dec
+             << ": Sending RDMA_MESSAGE_ERROR_STATUS #" << rm.request_index_
+             << ": " << rm.name_ << ". Status: " << status.ToString();
+
+  string message = RdmaMessage::CreateMessage(rm);
+  channel_->tx_message_buffer_->EnqueueItem(message);
+  channel_->tx_message_buffer_->SendNextItem();
+
+  // Destroy the response.
+  Destroy();
+}
+
+void RdmaTensorResponse::Destroy() {
+  if (src_buffer_ != nullptr) {
+    src_buffer_->Unref();
+  }
+  if (tensor_ != nullptr) {
+    delete tensor_;
+  }
+  if (proto_ != nullptr) {
+    ibv_dereg_mr(mr_);
+    free(src_addr_);
+    delete proto_;
   }
+  // Remove response from the pending list:
+  channel_->RemoveTensorResponse(rm_.request_index_);
 }
 
 // Create a RdmaMessage according to the pre-defined format
@@ -1276,43 +1273,46 @@ void RdmaTensorBuffer::PostCopyOperations(
 //   message in string format
 string RdmaMessage::CreateMessage(const RdmaMessage& rm) {
   // Rdma Message format
-  // type|name_size|name|step_id|buffer_size|remote_addr|rkey|is_dead|...
-  //   1B|    2B   | 512|  8B   |    8B     |       8B  | 4B |    1B |...
-  // ...|data_type|tensor_shape|tensor_bytes|tensor_buffer
-  // ...|   XB    |    XB      |    8B      |...
+  // type|name_size|name|step_id|request_index|remote_addr|rkey|is_dead|...
+  //   1B|    2B   | 512|  8B   |     8B      |       8B  | 4B |    1B |...
+  // ...|data_type|tensor_shape|tensor_bytes|error_status          |
+  // ...|   XB    |    XB      |    8B      |size - 4B, proto - XB |
   //
-  // ACK:             type|13|"rx_ack_buffer"
-  // TENSOR_REQUEST:  type|name_size|tensor_name|step_id
-  // TENSOR_WRITE:    type|name_size|tensor_name|step_id|...|is_dead
-  //                 |data_type|tensor_shape|tensor_bytes
-  // BUFFER_IDLE:     type|name_size|buffer_name
-  // BUFFER_REQUEST:
-  // type|name_size|buffer_name|...|buffer_size|remote_addr|rkey|
-  // BUFFER_RESPONSE:
-  // type|name_size|buffer_name|...|buffer_size|remote_addr|rkey|
-  char message[kMessageTotalBytes];
+  // ACK:             Imm-type: ACK
+  // TENSOR_REQUEST:  Imm-type: MESSAGE
+  //                  Fields: type, request_index, name, step_id, remote_addr,
+  //                      rkey, is_dead, data_type, tensor_shape, tensor_bytes
+  // META_DATA_UPDATE: Imm-type: MESSAGE
+  //                  Fields: type, request_index, is_dead, data_type,
+  //                      tensor_shape, tensor_bytes
+  // TENSOR_RE_REQUST: Imm-type: MESSAGE
+  //                  Fields: type, request_index, name, step_id, remote_addr,
+  //                      rkey, is_dead, data_type, tensor_shape, tensor_bytes
+  // ERROR_STATUS:    Imm-type: MESSAGE
+  //                  Fields: type, request_index, name, step_id, error_status
+  // Tensor content:  Imm-type: request_index
+  size_t message_size = kMessageTotalBytes;
+  char message[kMessageTotalBytes + kErrorStatusMaxSize];
   // type
   message[kTypeStartIndex] = static_cast<char>(rm.type_) & 0xff;
-  // size of name
-  memcpy(&message[kNameSizeStartIndex], &rm.name_size_, sizeof(rm.name_size_));
-  // name
-  memcpy(&message[kNameStartIndex], rm.name_.data(), rm.name_.size());
-  // buffer_size, remote_addr, rkey
-  if ((rm.type_ == RDMA_MESSAGE_BUFFER_REQUEST) ||
-      (rm.type_ == RDMA_MESSAGE_BUFFER_RESPONSE)) {
-    memcpy(&message[kBufferSizeStartIndex], &rm.buffer_size_,
-           sizeof(rm.buffer_size_));
+  // request index
+  memcpy(&message[kRequestIndexStartIndex], &rm.request_index_,
+         sizeof(rm.request_index_));
+  // name, step_id, remote_addr, rkey
+  if ((rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) ||
+      (rm.type_ == RDMA_MESSAGE_TENSOR_RE_REQUEST)) {
+    memcpy(&message[kNameSizeStartIndex], &rm.name_size_,
+           sizeof(rm.name_size_));
+    memcpy(&message[kNameStartIndex], rm.name_.data(), rm.name_.size());
     memcpy(&message[kRemoteAddrStartIndex], &rm.remote_addr_,
            sizeof(rm.remote_addr_));
     memcpy(&message[kRkeyStartIndex], &rm.rkey_, sizeof(rm.rkey_));
-  }
-  // step_id
-  if ((rm.type_ == RDMA_MESSAGE_TENSOR_WRITE) ||
-      (rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST)) {
     memcpy(&message[kStepIdStartIndex], &rm.step_id_, sizeof(rm.step_id_));
   }
   // is_dead, data_type, tensor_shape, tensor_bytes
-  if (rm.type_ == RDMA_MESSAGE_TENSOR_WRITE) {
+  if ((rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) ||
+      (rm.type_ == RDMA_MESSAGE_META_DATA_UPDATE) ||
+      (rm.type_ == RDMA_MESSAGE_TENSOR_RE_REQUEST)) {
     memcpy(&message[kIsDeadStartIndex], &rm.is_dead_, sizeof(rm.is_dead_));
 
     memcpy(&message[kDataTypeStartIndex], &rm.data_type_,
@@ -1322,7 +1322,30 @@ string RdmaMessage::CreateMessage(const RdmaMessage& rm) {
     memcpy(&message[kTensorBytesStartIndex], &rm.tensor_bytes_,
            sizeof(rm.tensor_bytes_));
   }
-  return string(message, kMessageTotalBytes);
+  // checksum
+#ifdef RDMA_DATA_VALIDATION
+  memcpy(&message[kChecksumStartIndex], &rm.checksum_, sizeof(rm.checksum_));
+#endif
+  // error status
+  if (rm.type_ == RDMA_MESSAGE_ERROR_STATUS) {
+    ::grpc::Status gs = ToGrpcStatus(rm.status_);
+    ErrorStatusProto gsProto;
+    gsProto.set_error_code(gs.error_code());
+    gsProto.set_error_message(gs.error_message());
+    gsProto.set_error_details(gs.error_details());
+    uint32_t gsProtoSize = gsProto.ByteSize();
+    if (gsProtoSize + 4 > kErrorStatusMaxSize) {
+      LOG(ERROR) << "Error status (" << gsProtoSize + 4 << " bytes) "
+                 << "is too big to fit in RDMA message (" << kErrorStatusMaxSize
+                 << " bytes). Truncated.";
+      gsProtoSize = kErrorStatusMaxSize - 4;
+    }
+    uint32_t* proto_size = (uint32_t*)&message[kErrorStatusStartIndex];
+    *proto_size = gsProtoSize;
+    gsProto.SerializeToArray(&message[kErrorStatusStartIndex + 4], gsProtoSize);
+    message_size += gsProtoSize + 4;
+  }
+  return string(message, message_size);
 }
 
 // Parse a RdmaMessage according to the pre-defined format
@@ -1335,26 +1358,24 @@ void RdmaMessage::ParseMessage(RdmaMessage& rm, void* buffer) {
   char* message = static_cast<char*>(buffer);
   // type
   rm.type_ = static_cast<RdmaMessageType>(message[kTypeStartIndex]);
-  // name_size_
-  memcpy(&rm.name_size_, &message[kNameSizeStartIndex], sizeof(rm.name_size_));
-  // name
-  rm.name_ = string(&message[kNameStartIndex], rm.name_size_);
-  // buffer_size, remote_addr, rkey
-  if ((rm.type_ == RDMA_MESSAGE_BUFFER_REQUEST) ||
-      (rm.type_ == RDMA_MESSAGE_BUFFER_RESPONSE)) {
-    memcpy(&rm.buffer_size_, &message[kBufferSizeStartIndex],
-           sizeof(rm.buffer_size_));
+  // request index
+  memcpy(&rm.request_index_, &message[kRequestIndexStartIndex],
+         sizeof(rm.request_index_));
+  // name, step_id, remote_addr, rkey
+  if ((rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) ||
+      (rm.type_ == RDMA_MESSAGE_TENSOR_RE_REQUEST)) {
+    memcpy(&rm.name_size_, &message[kNameSizeStartIndex],
+           sizeof(rm.name_size_));
+    rm.name_ = string(&message[kNameStartIndex], rm.name_size_);
     memcpy(&rm.remote_addr_, &message[kRemoteAddrStartIndex],
            sizeof(rm.remote_addr_));
     memcpy(&rm.rkey_, &message[kRkeyStartIndex], sizeof(rm.rkey_));
-  }
-  // step_id
-  if ((rm.type_ == RDMA_MESSAGE_TENSOR_WRITE) ||
-      (rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST)) {
     memcpy(&rm.step_id_, &message[kStepIdStartIndex], sizeof(rm.step_id_));
   }
   // data_type, tensor_bytes, tensor_shape, is_dead
-  if (rm.type_ == RDMA_MESSAGE_TENSOR_WRITE) {
+  if ((rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) ||
+      (rm.type_ == RDMA_MESSAGE_META_DATA_UPDATE) ||
+      (rm.type_ == RDMA_MESSAGE_TENSOR_RE_REQUEST)) {
     memcpy(&rm.is_dead_, &message[kIsDeadStartIndex], sizeof(rm.is_dead_));
     memcpy(&rm.data_type_, &message[kDataTypeStartIndex],
            sizeof(rm.data_type_));
@@ -1363,6 +1384,291 @@ void RdmaMessage::ParseMessage(RdmaMessage& rm, void* buffer) {
     memcpy(&rm.tensor_bytes_, &message[kTensorBytesStartIndex],
            sizeof(rm.tensor_bytes_));
   }
+  // checksum
+#ifdef RDMA_DATA_VALIDATION
+  memcpy(&rm.checksum_, &message[kChecksumStartIndex], sizeof(rm.checksum_));
+#endif
+  // error status
+  if (rm.type_ == RDMA_MESSAGE_ERROR_STATUS) {
+    ErrorStatusProto gsProto;
+    uint32_t gsProtoSize = *(uint32_t*)&message[kErrorStatusStartIndex];
+    CHECK(ParseProtoUnlimited(&gsProto, &message[kErrorStatusStartIndex + 4],
+                              gsProtoSize))
+        << "Failed to parse error status proto from message. Aborting.";
+    ::grpc::Status gs((::grpc::StatusCode)gsProto.error_code(),
+                      gsProto.error_message(), gsProto.error_details());
+    rm.status_ = FromGrpcStatus(gs);
+  }
+}
+
+//*****************************************************************************
+// RdmaMemoryMgr
+//*****************************************************************************
+
+ibv_mr* RdmaMemoryMgr::FindMemoryRegion(void* addr, size_t length) {
+  mutex_lock l(mrs_mu_);
+  auto iter = std::upper_bound(mrs_.begin(), mrs_.end(), addr, &Comparator);
+  if (iter == std::end(mrs_) || iter->get()->addr > addr) {
+    return nullptr;
+  } else {
+    return iter->get();
+  }
+}
+
+void RdmaMemoryMgr::InsertMemoryRegion(void* addr, size_t length,
+                                       const std::string& allocator_name) {
+  if (length == 0) return;
+  ibv_mr* mr = ibv_reg_mr(pd_, addr, length,
+                          IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+  RDMA_LOG(1) << "Insert memory region 0x" << std::hex << mr->rkey << ". ["
+              << addr << "-" << (void*)((uint64_t)addr + length - 1) << "]"
+              << " SIZE: 0x" << length << " (" << allocator_name << ").";
+  if (mr != nullptr) {
+    mutex_lock l(mrs_mu_);
+    auto iter = std::upper_bound(mrs_.begin(), mrs_.end(), addr, &Comparator);
+    mrs_.insert(iter, {mr, &MRDeleter});
+  } else {
+    LOG(WARNING) << "Cannot register memory region";
+  }
+}
+
+void RdmaMemoryMgr::EvictMemoryRegion(void* addr, size_t length) {
+  if (length == 0) return;
+  mutex_lock l(mrs_mu_);
+  auto iter = std::upper_bound(mrs_.begin(), mrs_.end(), addr, &Comparator);
+  if (iter != std::end(mrs_) && iter->get()->addr == addr) {
+    mrs_.erase(iter);
+    RDMA_LOG(1) << "Evict memory region 0x" << std::hex << iter->get()->rkey;
+
+  } else {
+    LOG(WARNING) << "Failed to de-register memory region";
+  }
+}
+
+const TensorMetaData* RdmaMemoryMgr::GetTensorMetaData(
+    const std::string& tensor_name) {
+  mutex_lock l(tensor_meta_data_mu_);
+  auto it = tensors_meta_data_.find(tensor_name);
+  if (it == tensors_meta_data_.end()) {
+    return nullptr;
+  }
+  return &it->second;
+}
+
+const TensorMetaData* RdmaMemoryMgr::SetTensorMetaData(
+    const std::string& tensor_name, DataType dtype, const TensorShape& shape,
+    bool is_dead, size_t proto_size) {
+  mutex_lock l(tensor_meta_data_mu_);
+  TensorMetaData& meta_data = tensors_meta_data_[tensor_name];
+  meta_data.data_type_ = dtype;
+  meta_data.tensor_shape_ = shape;
+  meta_data.proto_size_ = proto_size;
+  meta_data.is_dead_ = is_dead;
+  return &meta_data;
+}
+
+//*****************************************************************************
+// RdmaTensorRequest
+//*****************************************************************************
+
+RdmaTensorRequest::RdmaTensorRequest(
+    uint32_t index, const string& key, int64 step_id, RdmaChannel* channel,
+    Device* dst_dev, const Rendezvous::Args recv_args,
+    const RdmaTensorRequest::RecvDoneCallback& done)
+    : index_(index),
+      key_(key),
+      step_id_(step_id),
+      channel_(channel),
+      dst_dev_(dst_dev),
+      recv_args_(recv_args),
+      meta_data_(RdmaMemoryMgr::Singleton().GetTensorMetaData(key)),
+      result_tensor_(nullptr),
+      proxy_tensor_(nullptr),
+      rdma_addr_(nullptr),
+      mr_(nullptr),
+      done_(done) {}
+
+RdmaTensorRequest::~RdmaTensorRequest() { DeallocateTensors(); }
+
+void RdmaTensorRequest::Done(const Status& s) {
+  Tensor val = std::move(*result_tensor_);
+
+#ifdef RDMA_DATA_VALIDATION
+  // Validate checksum
+  // Unfortunately we can't always do a Checksum directly on the result tensor.
+  // If the result tensor is on GPU, then we need to copy it back to CPU. If
+  // we happen to be in the midst of a proxy callback, then the copying will
+  // get stuck.
+  uint64_t checksum = (proxy_tensor_ != nullptr)
+                          ? Checksum(nullptr, nullptr, *proxy_tensor_)
+                          : Checksum(dst_dev_, recv_args_.device_context, val);
+  ValidateChecksum(checksum_, checksum, val, index_, key_, "RDMA");
+#endif
+
+  Rendezvous::Args recv_args = std::move(recv_args_);
+  bool is_dead = (meta_data_ == nullptr) ? false : meta_data_->is_dead_;
+  RecvDoneCallback done = done_;
+  DeallocateTensors();
+  channel_->RemoveTensorRequest(index_);
+  done(s, Rendezvous::Args(), recv_args, val, is_dead);
+}
+
+void RdmaTensorRequest::DeallocateTensors() {
+  if (result_tensor_ != nullptr) {
+    delete result_tensor_;
+    result_tensor_ = nullptr;
+  }
+  if (proxy_tensor_ != nullptr) {
+    delete proxy_tensor_;
+    proxy_tensor_ = nullptr;
+  }
+}
+
+bool RdmaTensorRequest::AllocateTensors() {
+  result_tensor_ =
+      new Tensor(dst_dev_->GetAllocator(recv_args_.alloc_attrs),
+                 meta_data_->data_type_, meta_data_->tensor_shape_);
+
+  size_t tensor_size = result_tensor_->TotalBytes();
+  bool can_memcpy = DataTypeCanUseMemcpy(result_tensor_->dtype());
+  if (can_memcpy) {
+    if (tensor_size == 0) {
+      return true;
+    }
+    rdma_addr_ = DMAHelper::base(result_tensor_);
+    mr_ = RdmaMemoryMgr::Singleton().FindMemoryRegion(rdma_addr_, tensor_size);
+#if GOOGLE_CUDA
+    if (mr_ == nullptr) {
+      // Can't RDMA directly to result. Use a proxy.
+      proxy_tensor_ =
+          new Tensor(ProcessState::singleton()->GetCUDAHostAllocator(0),
+                     result_tensor_->dtype(), result_tensor_->shape());
+      rdma_addr_ = DMAHelper::base(proxy_tensor_);
+      mr_ =
+          RdmaMemoryMgr::Singleton().FindMemoryRegion(rdma_addr_, tensor_size);
+    }
+#endif
+  } else {
+    uint32_t proto_size = meta_data_->proto_size_;
+    rdma_addr_ = malloc(proto_size);
+    mr_ = ibv_reg_mr(RdmaMemoryMgr::Singleton().pd_, rdma_addr_, proto_size,
+                     IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+  }
+  CHECK(mr_ != nullptr) << " No memory region found for address " << rdma_addr_
+                        << ": " << key_;
+  return true;
+}
+
+void RdmaTensorRequest::AllocateTensorsAsync(StatusCallback done) {
+  AllocateTensors();
+  bool on_host = recv_args_.alloc_attrs.on_host();
+  if (dst_dev_->tensorflow_gpu_device_info() && !on_host &&
+      (proxy_tensor_ == nullptr)) {
+#if GOOGLE_CUDA
+    // We need to sync the memory allocation on the GPU:
+    StreamGPUOp(dst_dev_, recv_args_.device_context, done);
+#endif
+  } else {
+    done(Status::OK());
+  }
+}
+
+void RdmaTensorRequest::Send(RdmaMessageType message_type) {
+  RdmaMessageBuffer* rb = channel_->tx_message_buffer_;
+  RdmaMessage rm;
+  rm.type_ = message_type;
+  rm.request_index_ = index_;
+  rm.name_size_ = key_.size();
+  rm.name_ = key_;
+  rm.step_id_ = step_id_;
+  rm.remote_addr_ = (uint64_t)rdma_addr_;
+  if (meta_data_ != nullptr) {
+    rm.data_type_ = meta_data_->data_type_;
+    rm.tensor_shape_ = meta_data_->tensor_shape_;
+    rm.is_dead_ = meta_data_->is_dead_;
+    rm.tensor_bytes_ = meta_data_->proto_size_;
+  } else {
+    rm.data_type_ = DT_INVALID;
+  }
+  rm.rkey_ = (mr_ == nullptr) ? 0 : mr_->rkey;
+
+  RDMA_LOG(1) << "Step 0x" << std::hex << rm.step_id_ << std::dec
+              << ": Sending  " << MessageTypeToString(message_type) << " #"
+              << index_ << ": " << rm.name_ << " on " << rdma_addr_
+              << " (rkey: 0x" << std::hex << rm.rkey_ << ")";
+
+  string message = RdmaMessage::CreateMessage(rm);
+  rb->EnqueueItem(message);
+  rb->SendNextItem();
+}
+
+void RdmaTensorRequest::RecvTensorMetaData(DataType dtype, TensorShape shape,
+                                           bool is_dead, size_t proto_size) {
+  meta_data_ = RdmaMemoryMgr::Singleton().SetTensorMetaData(
+      key_, dtype, shape, is_dead, proto_size);
+
+  DeallocateTensors();
+  AllocateTensorsAsync(
+      [this](const Status& s) { Send(RDMA_MESSAGE_TENSOR_RE_REQUEST); });
+}
+
+void RdmaTensorRequest::RecvTensorContent() {
+  bool can_memcpy = DataTypeCanUseMemcpy(meta_data_->data_type_);
+  size_t message_size =
+      can_memcpy ? result_tensor_->TotalBytes() : meta_data_->proto_size_;
+  RDMA_LOG(1) << "Step 0x" << std::hex << step_id_ << std::dec
+              << ": Received tensor content #" << index_ << ": " << key_
+              << " (Size: 0x" << std::hex << message_size << ")";
+
+  Tensor val;
+
+#if GOOGLE_CUDA
+  if (proxy_tensor_ != nullptr) {
+    CountCopies(key_, (void*)DMAHelper::base(proxy_tensor_),
+                (void*)DMAHelper::base(result_tensor_),
+                result_tensor_->TotalBytes(), false);
+    GPUUtil::CopyCPUTensorToGPU(proxy_tensor_, recv_args_.device_context,
+                                dst_dev_, result_tensor_,
+                                [this](const Status& s) {
+                                  CHECK(s.ok()) << "copy tensor to gpu sync";
+                                  Done(s);
+                                });
+    return;
+  }
+#endif
+
+  if (can_memcpy) {
+    Done(Status::OK());
+  } else {
+    RDMA_LOG(2) << "Decoding proto: " << key_
+                << " (Size: " << meta_data_->proto_size_ << ")";
+    TensorProto proto;
+    CHECK(ParseProtoUnlimited(&proto, rdma_addr_, meta_data_->proto_size_))
+        << "fail to parse proto from array";
+    ibv_dereg_mr(mr_);
+    free(rdma_addr_);
+    Status s = dst_dev_->MakeTensorFromProto(proto, recv_args_.alloc_attrs,
+                                             result_tensor_);
+    Done(s);
+  }
+}
+
+void RdmaTensorRequest::RecvErrorStatus(const Status& status) {
+  if (result_tensor_ == nullptr) {
+    result_tensor_ = new Tensor();
+  }
+  LOG(ERROR) << "Received RDMA_MESSAGE_ERROR_STATUS: " << status.ToString();
+  Done(status);
+}
+
+void RdmaTensorRequest::Start() {
+  meta_data_ = RdmaMemoryMgr::Singleton().GetTensorMetaData(key_);
+  if (meta_data_ != nullptr) {
+    AllocateTensorsAsync(
+        [this](const Status& s) { Send(RDMA_MESSAGE_TENSOR_REQUEST); });
+  } else {
+    Send(RDMA_MESSAGE_TENSOR_REQUEST);
+  }
 }
 
 }  // end namespace tensorflow
diff --git a/tensorflow/contrib/verbs/rdma.h b/tensorflow/contrib/verbs/rdma.h
index fea2327d77ffff67c4b3c45835a81f790bbd1574..94203ee2b3654bffe82d203cde8780a64f63ba2a 100644
--- a/tensorflow/contrib/verbs/rdma.h
+++ b/tensorflow/contrib/verbs/rdma.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_H_
+#ifndef TENSORFLOW_CONTRIB_VERBS_RDMA_H_
+#define TENSORFLOW_CONTRIB_VERBS_RDMA_H_
 
 #ifdef TENSORFLOW_USE_VERBS
 
@@ -27,6 +27,7 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "tensorflow/contrib/verbs/verbs_util.h"
 #include "tensorflow/core/distributed_runtime/worker_env.h"
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -43,6 +44,11 @@ namespace tensorflow {
 #define SL_DEFAULT 0
 #define TRAFFIC_CLASS 0
 
+#define RDMA_LOG_0 LOG(INFO)
+#define RDMA_LOG_1 VLOG(1)
+#define RDMA_LOG_2 VLOG(2)
+#define RDMA_LOG(LEVEL) RDMA_LOG_##LEVEL
+
 struct RdmaParams {
   uint8_t port_num;
   uint8_t sgid_index;
@@ -67,38 +73,305 @@ struct RemoteMR {
   uint64_t remote_addr;
   uint32_t rkey;
 };
-enum BufferStatus {
-  none,
-  idle,
-  busy
+enum BufferStatus { none, idle, busy };
+enum Location { local, remote };
+
+enum RdmaMessageType {
+  RDMA_MESSAGE_META_DATA_UPDATE,
+  RDMA_MESSAGE_TENSOR_RE_REQUEST,
+  RDMA_MESSAGE_TENSOR_REQUEST,
+  RDMA_MESSAGE_ERROR_STATUS,
+};
+
+struct RdmaMessage {
+  RdmaMessageType type_;
+  uint16_t name_size_;
+  string name_;
+  int64 step_id_;
+  uint64_t request_index_;
+  union {
+    uint64_t remote_addr_;
+#ifdef RDMA_DATA_VALIDATION
+    uint64_t checksum_;
+#endif
+  };
+  uint32_t rkey_;
+  bool is_dead_;
+  DataType data_type_;
+  TensorShape tensor_shape_;
+  size_t tensor_bytes_;
+
+  // For error status:
+  Status status_;
+
+  // type|name_size|name|step_id|request_index|remote_addr/checksum|rkey|...
+  //   1B|    2B   | 512|  8B   |     8B      |       8B           | 4B |...
+  // ...|is_dead|data_type|tensor_shape|tensor_bytes|error_status          |
+  // ...|    1B |   XB    |    XB      |    8B      |size - 4B, proto - XB |
+  static const size_t kNameCapacity = 512;
+  static const size_t kTypeStartIndex = 0;
+  static const size_t kNameSizeStartIndex = kTypeStartIndex + sizeof(type_);
+  static const size_t kNameStartIndex =
+      kNameSizeStartIndex + sizeof(name_size_);
+  static const size_t kStepIdStartIndex = kNameStartIndex + kNameCapacity;
+  static const size_t kRequestIndexStartIndex =
+      kStepIdStartIndex + sizeof(step_id_);
+  static const size_t kRemoteAddrStartIndex =
+      kRequestIndexStartIndex + sizeof(request_index_);
+  static const size_t kChecksumStartIndex = kRemoteAddrStartIndex;
+  static const size_t kRkeyStartIndex =
+      kRemoteAddrStartIndex + sizeof(remote_addr_);
+  static const size_t kIsDeadStartIndex = kRkeyStartIndex + sizeof(rkey_);
+  static const size_t kDataTypeStartIndex =
+      kIsDeadStartIndex + sizeof(is_dead_);
+  static const size_t kTensorShapeStartIndex =
+      kDataTypeStartIndex + sizeof(data_type_);
+  static const size_t kTensorBytesStartIndex =
+      kTensorShapeStartIndex + sizeof(TensorShape);
+  static const size_t kErrorStatusStartIndex =
+      kTensorBytesStartIndex + sizeof(tensor_bytes_);
+  static const size_t kErrorStatusMaxSize = 4096;
+
+  static const size_t kMessageTotalBytes = kErrorStatusStartIndex;
+  static const size_t kRdmaMessageBufferSize =
+      kMessageTotalBytes + kErrorStatusMaxSize;
+  static string CreateMessage(const RdmaMessage& rm);
+  static void ParseMessage(RdmaMessage& rm, void* buffer);
+};
+
+// Immediate types for RDMA write
+enum RdmaImmDataType {
+  RDMA_IMM_MAX_REQUEST_ID = 0xFFFFFFFD,
+  RDMA_IMM_DATA_ACK = 0xFFFFFFFE,
+  RDMA_IMM_DATA_MESSAGE = 0xFFFFFFFF
 };
-enum Location {
-  local,
-  remote
+
+// Write types for RDMA write-complete events
+enum RdmaWriteIDType {
+  RDMA_WRITE_ID_ACK,
+  RDMA_WRITE_ID_MESSAGE,
+  RDMA_WRITE_ID_TENSOR_WRITE
 };
-enum BufferType {
-  ACK,
-  MESSAGE,
-  TENSOR
+
+// Context for RDMA write-complete events
+class RdmaWriteID {
+ public:
+  RdmaWriteID(RdmaWriteIDType write_type, void* write_context)
+      : write_type(write_type), write_context(write_context) {}
+
+  RdmaWriteIDType write_type;
+  void* write_context;
 };
-enum RdmaMessageType {
-  RDMA_MESSAGE_ACK,
-  RDMA_MESSAGE_BUFFER_IDLE,
-  RDMA_MESSAGE_BUFFER_REQUEST,
-  RDMA_MESSAGE_BUFFER_RESPONSE,
-  RDMA_MESSAGE_TENSOR_REQUEST,
-  RDMA_MESSAGE_TENSOR_WRITE
+
+// Tensor meta-data
+class TensorMetaData {
+ public:
+  TensorShape tensor_shape_;
+  DataType data_type_;
+  size_t proto_size_;
+  bool is_dead_;
+
+  std::ostream& print(std::ostream& out) const {
+    out << "Dtype = " << DataTypeString(data_type_)
+        << ", Shape = " << tensor_shape_.DebugString() << ", Proto size = 0x"
+        << std::hex << proto_size_ << ", Is dead = " << is_dead_;
+    return out;
+  }
+};
+
+inline std::ostream& operator<<(std::ostream& out,
+                                const TensorMetaData& meta_data) {
+  return meta_data.print(out);
+}
+
+class RdmaChannel;
+
+void MRDeleter(ibv_mr* mr);
+using MemoryRegionPtr = std::unique_ptr<ibv_mr, decltype(&MRDeleter)>;
+
+// RdmaMemoryMgr
+// Manages the local meta-data cache, and the registered RDMA memory regions.
+class RdmaMemoryMgr {
+ public:
+  static RdmaMemoryMgr& Singleton() {
+    static RdmaMemoryMgr instance;
+    return instance;
+  }
+
+  // Memory regions
+  ibv_mr* FindMemoryRegion(void* addr, size_t length);
+  void InsertMemoryRegion(void* addr, size_t length,
+                          const std::string& allocator_name);
+  void EvictMemoryRegion(void* addr, size_t length);
+
+  // Tensor meta-data cache
+  const TensorMetaData* GetTensorMetaData(const std::string& tensor_name);
+  const TensorMetaData* SetTensorMetaData(const std::string& tensor_name,
+                                          DataType dtype,
+                                          const TensorShape& shape,
+                                          bool is_dead, size_t proto_size);
+
+  struct ibv_pd* pd_;
+
+ protected:
+  RdmaMemoryMgr() : pd_(nullptr) {}
+
+  static bool Comparator(const void* ptr, const MemoryRegionPtr& other) {
+    return ptr < reinterpret_cast<char*>(other->addr) + other->length;
+  }
+
+ private:
+  mutex tensor_meta_data_mu_;
+  std::unordered_map<std::string, TensorMetaData> tensors_meta_data_;
+
+  // Managed memory regions
+  mutex mrs_mu_;
+  std::vector<MemoryRegionPtr> mrs_ GUARDED_BY(mrs_mu_);
 };
-class RdmaBuffer;
+
+// RdmaTensorRequest
+// Represents a single tensor request.
+class RdmaTensorRequest {
+ public:
+  typedef Rendezvous::DoneCallback RecvDoneCallback;
+
+  // Creates a tensor request identified by index.
+  RdmaTensorRequest(uint32_t index, const string& key, int64 step_id,
+                    RdmaChannel* channel, Device* dst_dev,
+                    const Rendezvous::Args recv_args,
+                    const RecvDoneCallback& done);
+  ~RdmaTensorRequest();
+
+  // Request unique index.
+  uint32_t index() { return index_; }
+
+  // Start the tensor request sequence.
+  //
+  // 1. Allocate the result tensor (and proxy tensor if required).
+  // 2. Send RDMA_MESSAGE_TENSOR_REQUEST to the remote side.
+  void Start();
+
+  // Receive tensor meta-data.
+  //
+  // 1. Update the local meta-data cache.
+  // 2. Reallocate the result tensor (and proxy tensor if required).
+  // 3. Re-send the request to the remote side.
+  void RecvTensorMetaData(DataType dtype, TensorShape shape, bool is_dead,
+                          size_t proto_size);
+
+  // Receive tensor content (RDMA write was completed).
+  //
+  // Decode proto if required and/or move to GPU if the content was not
+  // written to it directly (GPU direct is not avaliable). Afterwards,
+  // invoke Done().
+  void RecvTensorContent();
+
+  // Receive error status (in case of a remote error).
+  // Invoke Done() with the status code.
+  void RecvErrorStatus(const Status& status);
+
+#ifdef RDMA_DATA_VALIDATION
+  // Receive tensor checksum
+  //
+  // For validation: Get and store the Tensor's expected checksum for the
+  // current request. Compare the result Tensor's checksum with the stored
+  // checksum right before invoking Done().
+  void RecvTensorChecksum(uint64_t checksum) { checksum_ = checksum; }
+#endif
+
+ private:
+  void Done(const Status& s);
+  void Send(RdmaMessageType message_type);
+  bool AllocateTensors();
+  void AllocateTensorsAsync(StatusCallback done);
+  void DeallocateTensors();
+
+  uint32_t index_;
+  string key_;
+  int64 step_id_;
+  RdmaChannel* channel_;
+  Device* dst_dev_;
+  Rendezvous::Args recv_args_;
+  const TensorMetaData* meta_data_;
+  Tensor* result_tensor_;
+  Tensor* proxy_tensor_;
+  void* rdma_addr_;
+  ibv_mr* mr_;
+  RecvDoneCallback done_;
+#ifdef RDMA_DATA_VALIDATION
+  uint64_t checksum_;
+#endif
+};
+
+// RdmaTensorResponse
+// Represents a single tensor response.
+class RdmaTensorResponse {
+ public:
+  // Creates a response for request message.
+  RdmaTensorResponse(RdmaChannel* channel, const RdmaMessage& rm)
+      : channel_(channel), rm_(rm) {}
+
+  void Update(const RdmaMessage& rm) { rm_ = rm; }
+
+  // Start the tensor response sequence.
+  //
+  // 1. Find the tensor in the local tag-match table and invoke RecvHandler.
+  //    (Using RecvLocalAsync()).
+  // 2. Compare the tensor's meta-data to the meta-data in the message (taken
+  //    from the requester's local cache).
+  //    If meta-data changed:
+  //    a. Clone the tensor to be sent later.
+  //    b. Send a meta-data update message and wait for re-request.
+  //    Else:
+  //    a. Send the tensor's content (using direct RDMA write).
+  void Start();
+
+  // Resume the response sequence, after a re-request.
+  //
+  // 1. Send the tensor's content that was cloned earlier.
+  void Resume();
+
+  // Destroy the response's resources and remove it from the pending list.
+  void Destroy();
+
+ private:
+  void RecvHandler(Rendezvous::ParsedKey parsed,
+                   const Rendezvous::Args& send_args,
+                   const Rendezvous::Args& recv_args, const Tensor& in,
+                   bool is_dead);
+  void Clone(const Tensor& in, const TensorProto& proto, bool is_dead);
+  void Send(const Tensor& in, const TensorProto& proto, bool is_dead,
+            const Status& status);
+  bool TensorMetaDataChanged(const Tensor& in, bool is_dead);
+  Status PrepareRecvTensor(const Rendezvous::ParsedKey& parsed,
+                           Device** src_dev);
+  void SendMetaData(const Tensor& in, const TensorProto& proto, bool is_dead);
+  void SendContent(const Tensor& in, const TensorProto& proto, bool is_dead);
+  void SendErrorStatus(const Status& status);
+
+  RdmaChannel* channel_;
+  RdmaMessage rm_;  // The request message
+  Device* src_dev_ = nullptr;
+  TensorBuffer* src_buffer_ = nullptr;
+  void* src_addr_ = nullptr;
+  ibv_mr* mr_ = nullptr;
+  uint64_t checksum_ = 0;
+  bool meta_data_changed_ = false;
+
+  // Re-item:
+  TensorProto* proto_ = nullptr;
+  Tensor* tensor_ = nullptr;
+  bool is_dead_ = false;
+};
+
+class RdmaMessageBuffer;
 // Class that represents the Rdma Adapter.
 // Responsible for creation of the completion queue, and handling
 // of work completions.
 class RdmaAdapter {
   friend class RdmaChannel;
-  friend class RdmaBuffer;
-  friend class RdmaAckBuffer;
   friend class RdmaMessageBuffer;
-  friend class RdmaTensorBuffer;
+  friend class RdmaTensorResponse;
   friend class RdmaMgr;
   friend class RdmaRemoteRendezvous;
 
@@ -133,10 +406,10 @@ class RdmaAdapter {
 // Responsible for connecting queue pairs.
 class RdmaChannel {
   friend class RdmaAdapter;
-  friend class RdmaBuffer;
-  friend class RdmaAckBuffer;
   friend class RdmaMessageBuffer;
   friend class RdmaTensorBuffer;
+  friend class RdmaTensorRequest;
+  friend class RdmaTensorResponse;
   friend class RdmaMgr;
   friend class RdmaRemoteRendezvous;
 
@@ -146,22 +419,28 @@ class RdmaChannel {
   ~RdmaChannel();
   inline const RdmaAddress& self() { return self_; }
   RdmaAddress address() const;
-  inline const std::vector<RdmaBuffer*>& message_buffers() const {
+  inline const std::vector<RdmaMessageBuffer*>& message_buffers() const {
     return message_buffers_;
   }
   void Connect(const RdmaAddress& remoteAddr);
   void Connect();
   void Recv();
-  RdmaBuffer* FindBuffer(const uint32_t index);
-  RdmaBuffer* FindBuffer(const string& name);
-  RdmaBuffer* FindOrCreateBuffer(const string& name,
-                                 BufferType buffer_type = TENSOR);
-  uint32_t LookupBufferIndex(const string& buffer_name);
   void SetRemoteAddress(const RdmaAddress& ra, bool override);
-  void InsertRecvCallback(const string& key, std::function<void()> recv_done);
-  void RemoveRecvCallback(const string& key);
-  void RunRecvCallback(const string& key);
-  static const int kNumMessageBuffers = 4;
+
+  // Requests:
+  RdmaTensorRequest* InsertTensorRequest(
+      const string& key, int64 step_id, Device* dst_dev,
+      const Rendezvous::Args recv_args,
+      const RdmaTensorRequest::RecvDoneCallback& done);
+  void RemoveTensorRequest(uint32_t request_index);
+  RdmaTensorRequest* GetTensorRequest(uint32_t request_index);
+
+  // Responses:
+  RdmaTensorResponse* AddTensorResponse(const RdmaMessage& rm);
+  RdmaTensorResponse* UpdateTensorResponse(const RdmaMessage& rm);
+  void RemoveTensorResponse(uint32_t request_index);
+
+  static const int kNumMessageBuffers = 2;
   static const int kPingRecvWrid = 0;
 
  private:
@@ -179,36 +458,31 @@ class RdmaChannel {
   string remote_name_;
   ibv_qp* qp_;
   mutex mu_;
-  bool connected_ GUARDED_BY(bt_mu_) = false;
-  RdmaAddress remote_ GUARDED_BY(bt_mu_);
-  bool remote_set_ GUARDED_BY(bt_mu_) = false;
+  bool connected_ GUARDED_BY(mu_) = false;
+  RdmaAddress remote_ GUARDED_BY(mu_);
+  bool remote_set_ GUARDED_BY(mu_) = false;
   mutex ct_mu_;
-  typedef std::unordered_map<string, std::function<void()> > CallbackTable;
-  CallbackTable callback_table_ GUARDED_BY(ct_mu_);
-  mutex bt_mu_;
-  typedef std::unordered_map<unsigned int, RdmaBuffer*> BufferTable;
-  BufferTable buffer_table_ GUARDED_BY(bt_mu_);
-  typedef std::unordered_map<uint32_t, string> BufferIndexNameTable;
-  BufferIndexNameTable buffer_index_name_table_ GUARDED_BY(bt_mu_);
-  typedef std::unordered_map<string, uint32_t> BufferNameIndexTable;
-  BufferNameIndexTable buffer_name_index_table_ GUARDED_BY(bt_mu_);
-  RdmaBuffer* tx_message_buffer_;
-  RdmaBuffer* rx_message_buffer_;
-  RdmaBuffer* tx_ack_buffer_;
-  RdmaBuffer* rx_ack_buffer_;
-  std::vector<RdmaBuffer*> message_buffers_;
+  typedef std::unordered_map<uint32_t, RdmaTensorRequest> RequestTable;
+  RequestTable request_table_ GUARDED_BY(ct_mu_);
+  uint32_t request_serial_ GUARDED_BY(ct_mu_);
+  mutex responses_mu_;
+  typedef std::unordered_map<uint32_t, RdmaTensorResponse> ResponsesTable;
+  ResponsesTable responses_table_ GUARDED_BY(responses_mu_);
+  RdmaMessageBuffer* tx_message_buffer_;
+  RdmaMessageBuffer* rx_message_buffer_;
+  std::vector<RdmaMessageBuffer*> message_buffers_;
 };
 
-// Class that represents a buffer for Rdma writes and reads.
-class RdmaBuffer {
+// Class that represents a buffer for Rdma message sending.
+class RdmaMessageBuffer {
   friend class RdmaChannel;
   friend class RdmaAdapter;
   friend class RdmaMgr;
   friend class RdmaRemoteRendezvous;
 
  public:
-  explicit RdmaBuffer(RdmaChannel* channel, string name);
-  virtual ~RdmaBuffer();
+  explicit RdmaMessageBuffer(RdmaChannel* channel, string name);
+  ~RdmaMessageBuffer();
 
   inline void* buffer() const { return buffer_; }
   inline ibv_mr* self() const { return self_; }
@@ -223,13 +497,15 @@ class RdmaBuffer {
   }
   void FreeBuffer();
   void EnqueueItem(string Item);
-  virtual void SendNextItem() {};
+  void SendNextItem();
   void CreateCPUBuffer(size_t size, bool lock = true);
   void SetRemoteMR(RemoteMR rmi, bool override);
-  uint32_t LookupBufferIndex(const string& buffer_name) {
-    return const_cast<RdmaChannel*>(channel_)->LookupBufferIndex(buffer_name);
-  }
   void Write(uint32_t imm_data, size_t buffer_size);
+  static void Write(const RdmaChannel* channel, uint32_t imm_data,
+                    size_t buffer_size, uint64_t src_addr, uint32_t lkey,
+                    uint64_t remote_addr, uint32_t rkey,
+                    RdmaWriteIDType write_type, void* write_context);
+  static void SendAck(const RdmaChannel* channel);
 
  protected:
   const RdmaChannel* channel_;
@@ -245,126 +521,7 @@ class RdmaBuffer {
   BufferStatus remote_status_ GUARDED_BY(mu_) = none;
 };
 
-class RdmaAckBuffer : public RdmaBuffer {
- public:
-  explicit RdmaAckBuffer(RdmaChannel* channel, string name);
-  virtual ~RdmaAckBuffer() override {}
-  void SendNextItem() override;
-};
-
-class RdmaMessageBuffer : public RdmaBuffer {
-  friend class RdmaChannel;
-  friend class RdmaAapater;
-
- public:
-  explicit RdmaMessageBuffer(RdmaChannel* channel, string name);
-  virtual ~RdmaMessageBuffer() override {}
-  void SendNextItem() override;
-};
-
-class RdmaTensorBuffer : public RdmaBuffer {
- public:
-  explicit RdmaTensorBuffer(RdmaChannel* channel, string name);
-  virtual ~RdmaTensorBuffer() override;
-  void SendNextItem() override;
-  void PostCopyOperations(bool can_memcpy, size_t buffer_size,
-                          size_t tensor_bytes, const string& key,
-                          const Tensor& in, int64 step_id, bool is_dead,
-                          const string& key_with_step_id, const Tensor* copy,
-                          const TensorProto* proto, const StringPiece* copy_buf,
-                          const Rendezvous::Args& send_args,
-                          const Rendezvous::Args& recv_args);
-
-  void ReSendNextItem();
-
- private:
-  Rendezvous::DoneCallback getRecvTensorCallback(
-      const string& key_with_step_id, const string& key, int64 step_id,
-      const Rendezvous::ParsedKey& parsed);
-
-  struct ReItem {
-    Rendezvous::Args send_args;
-    Rendezvous::Args recv_args;
-    Tensor in;
-    bool is_dead;
-
-    ReItem(const Rendezvous::Args& send_args_,
-           const Rendezvous::Args& recv_args_, const Tensor& in_, bool is_dead_)
-        : send_args(send_args_),
-          recv_args(recv_args_),
-          in(in_),
-          is_dead(is_dead_) {
-      if (send_args.device_context) {
-        send_args.device_context->Ref();
-      }
-      if (recv_args.device_context) {
-        recv_args.device_context->Ref();
-      }
-    }
-
-    ~ReItem() {
-      if (send_args.device_context) {
-        send_args.device_context->Unref();
-      }
-      if (recv_args.device_context) {
-        recv_args.device_context->Unref();
-      }
-    }
-  };
-  typedef std::map<string, ReItem*> Table;
-  typedef Table::iterator Itable;
-
-  std::queue<string> requeue GUARDED_BY(mu_);
-  Table retable GUARDED_BY(mu_);
-};
-
-struct RdmaMessage {
-  RdmaMessageType type_;
-  uint16_t name_size_;
-  string name_;
-  int64 step_id_;
-  uint64_t buffer_size_;
-  uint64_t remote_addr_;
-  uint32_t rkey_;
-  bool is_dead_;
-  DataType data_type_;
-  TensorShape tensor_shape_;
-  size_t tensor_bytes_;
-
-  // type|name_size|name|step_id|buffer_size|remote_addr|rkey|is_dead|...
-  //   1B|    2B   | 512|  8B   |    8B     |       8B  | 4B |    1B |...
-  // ...|data_type|tensor_shape|tensor_bytes|tensor_buffer
-  // ...|   XB    |    XB      |    8B      |...
-  //
-  static const size_t kNameCapacity = 512;
-  static const size_t kTypeStartIndex = 0;
-  static const size_t kNameSizeStartIndex = kTypeStartIndex + sizeof(type_);
-  static const size_t kNameStartIndex =
-      kNameSizeStartIndex + sizeof(name_size_);
-  static const size_t kStepIdStartIndex = kNameStartIndex + kNameCapacity;
-  static const size_t kBufferSizeStartIndex =
-      kStepIdStartIndex + sizeof(step_id_);
-  static const size_t kRemoteAddrStartIndex =
-      kBufferSizeStartIndex + sizeof(buffer_size_);
-  static const size_t kRkeyStartIndex =
-      kRemoteAddrStartIndex + sizeof(remote_addr_);
-  static const size_t kIsDeadStartIndex = kRkeyStartIndex + sizeof(rkey_);
-  static const size_t kDataTypeStartIndex =
-      kIsDeadStartIndex + sizeof(is_dead_);
-  static const size_t kTensorShapeStartIndex =
-      kDataTypeStartIndex + sizeof(data_type_);
-  static const size_t kTensorBytesStartIndex =
-      kTensorShapeStartIndex + sizeof(TensorShape);
-  static const size_t kTensorBufferStartIndex =
-      kTensorBytesStartIndex + sizeof(tensor_bytes_);
-  static const size_t kMessageTotalBytes = kTensorBufferStartIndex;
-  static const size_t kRdmaMessageBufferSize = kMessageTotalBytes;
-  static const size_t kRdmaAckBufferSize = kMessageTotalBytes;
-  static string CreateMessage(const RdmaMessage& rm);
-  static void ParseMessage(RdmaMessage& rm, void* buffer);
-};
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_USE_VERBS
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_H_
+#endif  // TENSORFLOW_CONTRIB_VERBS_RDMA_H_
diff --git a/tensorflow/contrib/verbs/rdma_mgr.cc b/tensorflow/contrib/verbs/rdma_mgr.cc
index 9cb307bcfa06cfdf5ecb9b4faa1d3710e5701080..369bd986df5313955bc22d6e5c6d38815908ada3 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.cc
+++ b/tensorflow/contrib/verbs/rdma_mgr.cc
@@ -16,11 +16,16 @@ limitations under the License.
 #ifdef TENSORFLOW_USE_VERBS
 
 #include "tensorflow/contrib/verbs/rdma_mgr.h"
+#include <fstream>
 #include <vector>
 #include "tensorflow/contrib/verbs/grpc_verbs_client.h"
 #include "tensorflow/contrib/verbs/verbs_service.pb.h"
+#include "tensorflow/core/common_runtime/bfc_allocator.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
+#include "tensorflow/core/common_runtime/gpu/process_state.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
+#include "tensorflow/core/framework/allocator_registry.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
@@ -53,7 +58,7 @@ RdmaMgr::RdmaMgr(const WorkerEnv* const worker_env,
 void RdmaMgr::SetupChannels() {
   for (const auto& p : channel_table_) {
     string worker_name = p.first;
-    LOG(INFO) << "connecting to remote node " << worker_name;
+    RDMA_LOG(2) << "Connecting to remote node " << worker_name;
     RdmaChannel* rc = p.second;
     GetRemoteAddressRequest req;
     GetRemoteAddressResponse resp;
@@ -78,39 +83,49 @@ void RdmaMgr::SetupChannels() {
       mr->set_rkey(rc->message_buffers_[i]->self_->rkey);
     }
     // synchronous call
-    Status s = client->GetRemoteAddress(&req, &resp);
-    // save obtained remote addresses
-    // connect to the remote channel
-    if (s.ok()) {
-      CHECK(worker_name.compare(resp.host_name()) == 0);
-      RdmaAddress ra;
-      ra.lid = resp.channel().lid();
-      ra.qpn = resp.channel().qpn();
-      ra.psn = resp.channel().psn();
-      ra.snp = resp.channel().snp();
-      ra.iid = resp.channel().iid();
-      rc->SetRemoteAddress(ra, false);
-      rc->Connect();
-      int i = 0;
-      int idx[] = {1, 0, 3, 2};
-      for (const auto& mr : resp.mr()) {
-        // the connections are crossed, i.e.
-        // local tx_message_buffer <---> remote rx_message_buffer_
-        // local rx_message_buffer <---> remote tx_message_buffer_
-        // local tx_ack_buffer <---> remote rx_ack_buffer_
-        // local rx_ack_buffer <---> remote tx_ack_buffer_
-        // hence idx[] = {1, 0, 3, 2}.
-        RdmaBuffer* rb = rc->message_buffers_[idx[i]];
-        RemoteMR rmr;
-        rmr.remote_addr = mr.remote_addr();
-        rmr.rkey = mr.rkey();
-        rb->SetRemoteMR(rmr, false);
-        i++;
+    Status s;
+    int attempts = 0;
+    static const int max_num_attempts = 5;
+    do {
+      s = client->GetRemoteAddress(&req, &resp);
+      // save obtained remote addresses
+      // connect to the remote channel
+      if (s.ok()) {
+        CHECK(worker_name.compare(resp.host_name()) == 0);
+        RdmaAddress ra;
+        ra.lid = resp.channel().lid();
+        ra.qpn = resp.channel().qpn();
+        ra.psn = resp.channel().psn();
+        ra.snp = resp.channel().snp();
+        ra.iid = resp.channel().iid();
+        rc->SetRemoteAddress(ra, false);
+        rc->Connect();
+        int i = 0;
+        int idx[] = {1, 0};
+        for (const auto& mr : resp.mr()) {
+          // the connections are crossed, i.e.
+          // local tx_message_buffer <---> remote rx_message_buffer_
+          // local rx_message_buffer <---> remote tx_message_buffer_
+          // hence idx[] = {1, 0}.
+          RdmaMessageBuffer* rb = rc->message_buffers_[idx[i]];
+          RemoteMR rmr;
+          rmr.remote_addr = mr.remote_addr();
+          rmr.rkey = mr.rkey();
+          rb->SetRemoteMR(rmr, false);
+          i++;
+        }
+        CHECK(i == RdmaChannel::kNumMessageBuffers);
+      } else {
+        LOG(ERROR) << "Connecting to " << worker_name << ": Got "
+                   << s.error_message() << ". Retrying (" << (attempts + 1)
+                   << "/" << max_num_attempts << ")...";
+        if (++attempts == max_num_attempts) {
+          break;
+        }
+        worker_env_->env->SleepForMicroseconds(2000000);
       }
-      CHECK(i == RdmaChannel::kNumMessageBuffers);
-    } else {
-      LOG(ERROR) << s.error_message();
-    }
+    } while (!s.ok());
+    RDMA_LOG(0) << "Connected to remote node " << worker_name;
     delete client;
   }
 }
@@ -144,19 +159,17 @@ bool RdmaMgr::ConnectivityCheck() {
       ibv_wc_status s = rdma_adapter_->wc_[i].status;
       // recv complete
       if ((int)rdma_adapter_->wc_[i].wr_id == RdmaChannel::kPingRecvWrid) {
-        CHECK(s == IBV_WC_SUCCESS) << ": " << ibv_wc_status_str(
-                                                  rdma_adapter_->wc_[i].status)
-                                   << "(" << rdma_adapter_->wc_[i].status
-                                   << ") for PING_RECV_WRID";
+        CHECK(s == IBV_WC_SUCCESS)
+            << ": " << ibv_wc_status_str(rdma_adapter_->wc_[i].status) << "("
+            << rdma_adapter_->wc_[i].status << ") for PING_RECV_WRID";
         ++rcnt;
         // send complete
       } else {
         RdmaChannel* rc =
             reinterpret_cast<RdmaChannel*>(rdma_adapter_->wc_[i].wr_id);
-        CHECK(s == IBV_WC_SUCCESS) << ": " << ibv_wc_status_str(
-                                                  rdma_adapter_->wc_[i].status)
-                                   << "(" << rdma_adapter_->wc_[i].status
-                                   << ") to " << rc->remote_name_;
+        CHECK(s == IBV_WC_SUCCESS)
+            << ": " << ibv_wc_status_str(rdma_adapter_->wc_[i].status) << "("
+            << rdma_adapter_->wc_[i].status << ") to " << rc->remote_name_;
         ++scnt;
       }
     }  // for
@@ -183,6 +196,139 @@ RdmaChannel* RdmaMgr::FindChannel(const string& name) {
   return iter->second;
 }
 
+bool IsGDRAvailable() {
+#if defined(__APPLE__)
+  return false;
+#elif defined(PLATFORM_WINDOWS)
+  return false;
+#else
+  std::ifstream ifs("/proc/modules");
+  string line;
+  while (std::getline(ifs, line)) {
+    auto sep = line.find(' ');
+    CHECK_NE(sep, std::string::npos);
+    if (line.substr(0, sep) == "nv_peer_mem") {
+      return true;
+    }
+  }
+  return false;
+#endif
+}
+
+int TryToReadNumaNode(ibv_device* device) {
+#if defined(__APPLE__)
+  LOG(INFO) << "OS X does not support NUMA - returning NUMA node 0";
+  return 0;
+#elif defined(PLATFORM_WINDOWS)
+  // Windows support for NUMA is not currently implemented. Return node 0.
+  return 0;
+#else
+  VLOG(2) << "Trying to read NUMA node for device: " << device->name;
+  static const int kUnknownNumaNode = -1;
+
+  auto filename = string(device->ibdev_path) + "/device/numa_node";
+
+  std::ifstream ifs(filename.c_str());
+  string content;
+  CHECK(std::getline(ifs, content));
+
+  int32 value;
+  if (strings::safe_strto32(content, &value)) {
+    if (value < 0) {
+      LOG(INFO) << "Successful NUMA node read from SysFS had negative value ("
+                << value
+                << "), but there must be at least one NUMA node"
+                   ", so returning NUMA node zero";
+      return 0;
+    }
+    LOG(INFO) << "NUMA node for device: " << device->name << " is " << value;
+    return value;
+  }
+  return kUnknownNumaNode;
+#endif
+}
+
+void MRDeleter(ibv_mr* mr) {
+  if (mr) {
+    ibv_dereg_mr(mr);
+  }
+}
+
+// TODO(byronyi): remove this class duplicated from the one in
+// common/runtime/gpu/pool_allocator.h when it is available in common_runtime
+class BasicCPUAllocator : public SubAllocator {
+ public:
+  ~BasicCPUAllocator() override {}
+
+  void* Alloc(size_t alignment, size_t num_bytes) override {
+    return port::AlignedMalloc(num_bytes, alignment);
+  }
+  void Free(void* ptr, size_t) override { port::AlignedFree(ptr); }
+};
+
+// TODO(byronyi): remove this class and its registration when the default
+// cpu_allocator() returns visitable allocator
+class BFCRdmaAllocator : public BFCAllocator {
+ public:
+  BFCRdmaAllocator()
+      : BFCAllocator(new BasicCPUAllocator(), 1LL << 36, true, "cpu_rdma_bfc") {
+  }
+};
+
+REGISTER_MEM_ALLOCATOR("BFCRdmaAllocator", 101, BFCRdmaAllocator);
+
+void RdmaMgr::InitAllocators() {
+  RdmaMemoryMgr::Singleton().pd_ = rdma_adapter_->pd_;
+
+  Allocator* allocators[] = {
+#if GOOGLE_CUDA
+    ProcessState::singleton()->GetCUDAHostAllocator(0),
+    ProcessState::singleton()->GetCPUAllocator(0),
+#endif  // GOOGLE_CUDA
+    cpu_allocator(),
+  };
+
+  using namespace std::placeholders;
+
+  std::set<Allocator*> instrumented_;
+
+  // Host memory allocators
+  for (Allocator* allocator : allocators) {
+    VisitableAllocator::Visitor alloc_visitor =
+        std::bind(&RdmaMemoryMgr::InsertMemoryRegion,
+                  &RdmaMemoryMgr::Singleton(), _1, _2, allocator->Name());
+    VisitableAllocator::Visitor free_visitor = std::bind(
+        &RdmaMemoryMgr::EvictMemoryRegion, &RdmaMemoryMgr::Singleton(), _1, _2);
+
+    auto* visitable_allocator = dynamic_cast<VisitableAllocator*>(allocator);
+    CHECK(visitable_allocator)
+        << "is not visitable for instrumentation" << allocator->Name();
+    // Make sure we don't instrument the same allocator twice
+    if (instrumented_.find(allocator) == std::end(instrumented_)) {
+      visitable_allocator->AddAllocVisitor(alloc_visitor);
+      visitable_allocator->AddFreeVisitor(free_visitor);
+      instrumented_.insert(allocator);
+      LOG(INFO) << "Instrumenting CPU allocator " << allocator->Name();
+    }
+  }
+
+#if GOOGLE_CUDA
+  if (IsGDRAvailable()) {
+    // Note we don't free allocated GPU memory so there is no free visitor
+    int32_t bus_id = TryToReadNumaNode(rdma_adapter_->context_->device) + 1;
+
+    char buf[8];
+    sprintf(buf, "gpu");
+    VisitableAllocator::Visitor cuda_alloc_visitor =
+        std::bind(&RdmaMemoryMgr::InsertMemoryRegion,
+                  &RdmaMemoryMgr::Singleton(), _1, _2, std::string(buf));
+
+    ProcessState::singleton()->AddGPUAllocVisitor(bus_id, cuda_alloc_visitor);
+    LOG(INFO) << "Instrumenting GPU allocator with bus_id " << bus_id;
+  }
+#endif  // GOOGLE_CUDA
+}
+
 }  // end namespace tensorflow
 
 #endif
diff --git a/tensorflow/contrib/verbs/rdma_mgr.h b/tensorflow/contrib/verbs/rdma_mgr.h
index e711e604788b12ff0c1a0977a90db21f9f8fa50e..9fffc335bbe2bf47a626736f6d3073f52b32a9c2 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.h
+++ b/tensorflow/contrib/verbs/rdma_mgr.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_MGR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_MGR_H_
+#ifndef TENSORFLOW_CONTRIB_VERBS_RDMA_MGR_H_
+#define TENSORFLOW_CONTRIB_VERBS_RDMA_MGR_H_
 
 #ifdef TENSORFLOW_USE_VERBS
 
@@ -38,6 +38,7 @@ class RdmaMgr {
   RdmaChannel* FindChannel(const string& key);
   void SetupChannels();
   bool ConnectivityCheck();
+  void InitAllocators();
   const string& local_worker() { return local_worker_; }
 
  private:
@@ -54,4 +55,4 @@ class RdmaMgr {
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_USE_VERBS
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_MGR_H_
+#endif  // TENSORFLOW_CONTRIB_VERBS_RDMA_MGR_H_
diff --git a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
index 74f6681af3c29f370d6cdb37d64e10a30cbb7b84..ad3dce17844c5a43237372fb7fe074416e8b7117 100644
--- a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
+++ b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
@@ -21,10 +21,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
-#if GOOGLE_CUDA
-#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
-#include "tensorflow/core/common_runtime/gpu/process_state.h"
-#endif  // GOOGLE_CUDA
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -36,11 +32,6 @@ class RdmaRemoteRendezvous : public BaseRemoteRendezvous {
   RdmaRemoteRendezvous(const WorkerEnv* env, int64 step_id, RdmaMgr* rdma_mgr)
       : BaseRemoteRendezvous(env, step_id), rdma_mgr_(rdma_mgr) {}
 
-  void RecvPostCopyOps(const string& key, const string& key_with_step_id,
-                       const Rendezvous::Args& recv_args,
-                       const DoneCallback& done, const RdmaMessage& rm,
-                       RdmaChannel* rc, Tensor& val, const Status& s);
-
  protected:
   void RecvFromRemoteAsync(const Rendezvous::ParsedKey& parsed,
                            const Rendezvous::Args& args,
@@ -74,101 +65,18 @@ void RdmaRemoteRendezvous::RecvFromRemoteAsync(
   RdmaChannel* rc = rdma_mgr_->FindChannel(src_name);
   string key(std::move(parsed.FullKey().ToString()));
   string key_with_step_id = VerbsUtil::AppendStepidToKey(key, step_id_);
-  // insert callback
-  rc->InsertRecvCallback(key_with_step_id, [this, key, key_with_step_id, rc,
-                                            recv_args, parsed, done]() {
-    Status src_s, dst_s, s;
-    Device* src_dev, *dst_dev;
-    src_s = env_->device_mgr->LookupDevice("CPU:0", &src_dev);
-    dst_s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_dev);
-    if (!src_s.ok() || !dst_s.ok()) {
-      s = src_s.ok() ? dst_s : src_s;
-      LOG(ERROR) << "s is not ok, error code " << s.error_message();
-      done(s, Args(), recv_args, Tensor(), true);
-      return;
-    }
-    RdmaBuffer* rb = rc->FindBuffer(key);
-    RdmaMessage rm;
-    CHECK(rb->size_ >= RdmaMessage::kMessageTotalBytes);
-    RdmaMessage::ParseMessage(rm, rb->buffer_);
-    CHECK(rm.type_ == RDMA_MESSAGE_TENSOR_WRITE);
-    Tensor val;
-    if (!rm.is_dead_) {
-      void* input = static_cast<char*>(rb->buffer_) +
-                    RdmaMessage::kTensorBufferStartIndex;
-      bool can_memcpy = DataTypeCanUseMemcpy(rm.data_type_);
-      if (can_memcpy) {
-        if (dst_dev->tensorflow_gpu_device_info() &&
-            (!recv_args.alloc_attrs.on_host())) {
-#if GOOGLE_CUDA
-          CHECK(recv_args.device_context)
-              << "send dev name: " << src_dev->name()
-              << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
-          Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
-          Tensor copy(alloc, rm.data_type_, rm.tensor_shape_);
-          memcpy(DMAHelper::base(&copy), input, rm.tensor_bytes_);
-
-          Allocator* dst_alloc = dst_dev->GetAllocator(recv_args.alloc_attrs);
-          Tensor gpu_copy(dst_alloc, rm.data_type_, rm.tensor_shape_);
-
-          GPUUtil::CopyCPUTensorToGPU(
-              &copy, recv_args.device_context, dst_dev, &gpu_copy,
-              [this, gpu_copy, key, key_with_step_id, recv_args, done, rm, rc](
-                  const Status& s) {
-                CHECK(s.ok()) << "copy tensor to gpu sync";
-                Tensor val;
-                val = std::move(gpu_copy);
-                RecvPostCopyOps(key, key_with_step_id, recv_args, done, rm, rc,
-                                val, s);
-              });
-#endif  // GOOGLE_CUDA
-          return;
-        } else {
-          AllocatorAttributes host_alloc_attrs;
-          host_alloc_attrs.set_gpu_compatible(true);
-          host_alloc_attrs.set_on_host(true);
-          Allocator* alloc = dst_dev->GetAllocator(host_alloc_attrs);
-          Tensor copy(alloc, rm.data_type_, rm.tensor_shape_);
-          memcpy(DMAHelper::base(&copy), input, rm.tensor_bytes_);
-          val = std::move(copy);
-        }
-      } else {
-        TensorProto proto;
-        CHECK(rm.tensor_bytes_ + RdmaMessage::kTensorBufferStartIndex <=
-              rb->size_);
-        CHECK(ParseProtoUnlimited(&proto, input, rm.tensor_bytes_))
-            << "fail to parse proto from array";
-        s = dst_dev->MakeTensorFromProto(proto, recv_args.alloc_attrs, &val);
-      }
-    }
-    RecvPostCopyOps(key, key_with_step_id, recv_args, done, rm, rc, val, s);
-  });
-  // append key to message queue
-  RdmaBuffer* rb = rc->tx_message_buffer_;
-  RdmaMessage rm;
-  rm.type_ = RDMA_MESSAGE_TENSOR_REQUEST;
-  rm.name_size_ = key.size();
-  rm.name_ = key;
-  rm.step_id_ = step_id_;
-  string message = RdmaMessage::CreateMessage(rm);
-  rb->EnqueueItem(message);
-  rb->SendNextItem();
-}
 
-void RdmaRemoteRendezvous::RecvPostCopyOps(
-    const string& key, const string& key_with_step_id,
-    const Rendezvous::Args& recv_args, const DoneCallback& done,
-    const RdmaMessage& rm, RdmaChannel* rc, Tensor& val, const Status& s) {
-  rc->RemoveRecvCallback(key_with_step_id);
-  RdmaMessage br;
-  br.type_ = RDMA_MESSAGE_BUFFER_IDLE;
-  br.name_size_ = key.size();
-  br.name_ = key;
-  string message = RdmaMessage::CreateMessage(br);
-  RdmaBuffer* tb = rc->tx_message_buffer_;
-  tb->EnqueueItem(message);
-  tb->SendNextItem();
-  done(s, Args(), recv_args, val, rm.is_dead_);
+  Device* dst_dev;
+  s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_dev);
+  CHECK(s.ok()) << "s is not ok, error code " << s.error_message();
+  if (!s.ok()) {
+    done(s, Args(), recv_args, Tensor(), true);
+    return;
+  }
+
+  RdmaTensorRequest* request =
+      rc->InsertTensorRequest(key, step_id_, dst_dev, recv_args, done);
+  request->Start();
 }
 
 RdmaRendezvousMgr::RdmaRendezvousMgr(const WorkerEnv* env)
diff --git a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.h b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.h
index 2dedd6c48f96a6ecf2b69c757f525ac1bfd6f2d0..c0d6f59c4842e28e37b2a3b45e955f8d92712dd7 100644
--- a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.h
+++ b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_RENDEZVOUS_MGR_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_RENDEZVOUS_MGR_H_
+#ifndef TENSORFLOW_CONTRIB_VERBS_RDMA_RENDEZVOUS_MGR_H_
+#define TENSORFLOW_CONTRIB_VERBS_RDMA_RENDEZVOUS_MGR_H_
 
 #ifdef TENSORFLOW_USE_VERBS
 
@@ -60,4 +60,4 @@ class RdmaRendezvousMgr : public BaseRendezvousMgr {
 }  // end namespace tensorflow
 
 #endif  // TENSORFLOW_USE_VERBS
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_RENDEZVOUS_MGR_H_
+#endif  // TENSORFLOW_CONTRIB_VERBS_RDMA_RENDEZVOUS_MGR_H_
diff --git a/tensorflow/contrib/verbs/verbs_server_lib.cc b/tensorflow/contrib/verbs/verbs_server_lib.cc
index a606ef75a42069b3c32eb13a69e981a5c4c8f83c..1a0b5028febb7b11f979abd179a3227a2615252d 100644
--- a/tensorflow/contrib/verbs/verbs_server_lib.cc
+++ b/tensorflow/contrib/verbs/verbs_server_lib.cc
@@ -49,8 +49,8 @@ VerbsServer::~VerbsServer() {
 Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
                                         GrpcChannelCache** channel_cache) {
   string name_prefix =
-      strings::StrCat("/job:", server_def.job_name(), "/replica:0", "/task:",
-                      server_def.task_index());
+      strings::StrCat("/job:", server_def.job_name(), "/replica:0",
+                      "/task:", server_def.task_index());
 
   GrpcChannelSpec channel_spec;
   TF_RETURN_IF_ERROR(ParseChannelSpec(server_def, &channel_spec));
@@ -104,6 +104,7 @@ Status VerbsServer::Start() {
           [this] { verbs_service_->HandleRPCsLoop(); }));
       rdma_mgr_->SetupChannels();
       CHECK(rdma_mgr_->ConnectivityCheck()) << "Connectivity check failed!";
+      rdma_mgr_->InitAllocators();
       verbs_state_ = CONNECTED;
     }
   }
diff --git a/tensorflow/contrib/verbs/verbs_server_lib.h b/tensorflow/contrib/verbs/verbs_server_lib.h
index 855380129f21bd8162cdf28a4d88c098db7ddc55..54ce8c1d47737f4da742925f99e3d1cd73160ffb 100644
--- a/tensorflow/contrib/verbs/verbs_server_lib.h
+++ b/tensorflow/contrib/verbs/verbs_server_lib.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_VERBS_SERVER_LIB_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_VERBS_SERVER_LIB_H_
+#ifndef TENSORFLOW_CONTRIB_VERBS_VERBS_SERVER_LIB_H_
+#define TENSORFLOW_CONTRIB_VERBS_VERBS_SERVER_LIB_H_
 
 #ifdef TENSORFLOW_USE_VERBS
 
@@ -63,4 +63,4 @@ class VerbsServer : public GrpcServer {
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_USE_VERBS
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_VERBS_SERVER_LIB_H_
+#endif  // TENSORFLOW_CONTRIB_VERBS_VERBS_SERVER_LIB_H_
diff --git a/tensorflow/contrib/verbs/verbs_service.proto b/tensorflow/contrib/verbs/verbs_service.proto
index 0df1fed4b9de81d7d99be3de9fba4be8b88ad404..abdae1d84f74b076bb5f457d0cf6f74bf07d75b4 100644
--- a/tensorflow/contrib/verbs/verbs_service.proto
+++ b/tensorflow/contrib/verbs/verbs_service.proto
@@ -50,6 +50,12 @@ message GetRemoteAddressResponse {
   repeated MemoryRegion mr = 3;
 }
 
+message ErrorStatusProto {
+  int32 error_code = 1;
+  string error_message = 2;
+  string error_details = 3;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 //
 // VerbsService
diff --git a/tensorflow/contrib/verbs/verbs_with_0_copies.png b/tensorflow/contrib/verbs/verbs_with_0_copies.png
new file mode 100644
index 0000000000000000000000000000000000000000..0641e2fd50da3738e3b8113f4324156063bf52a2
Binary files /dev/null and b/tensorflow/contrib/verbs/verbs_with_0_copies.png differ
diff --git a/tensorflow/contrib/verbs/verbs_with_0_copies.xml b/tensorflow/contrib/verbs/verbs_with_0_copies.xml
new file mode 100644
index 0000000000000000000000000000000000000000..16130a961ba5185c415463ccb9636d010fe05e68
--- /dev/null
+++ b/tensorflow/contrib/verbs/verbs_with_0_copies.xml
@@ -0,0 +1 @@
+<mxfile userAgent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36" version="7.8.7" editor="www.draw.io" type="device"><diagram name="Page-1" id="74e2e168-ea6b-b213-b513-2b3c1d86103e">7Vxtc9o4EP41zKQfmsGW3/hIgPQ60/RyIZ1rPzHClsFXY1FZEOivP8mW8ZsAB2yXtHQ6jb2SJXl3n0e7K6cdMFhsPhC4nD9gB/kdtetsOmDYUVVFUw32g0u2scRUu7FgRjxHdEoFY+8nEsKk28pzUJjrSDH2qbfMC20cBMimORkkBL/ku7nYz8+6hDNUEoxt6Jel/3oOnQup0u2mDX8hbzYXU1u6aJhC+/uM4FUg5uuowI3+xM0LmIwl+odz6OCXjAiMOmBAMKbx1WIzQD7XbaK2+Ln7Pa27dRMU0CoP6CB+Yg39FUqWHC2MbhNlRK+D+APdDrh7mXsUjZfQ5q0vzPxMNqcLn90p7NL1fH+AfUzYfYAD1ulOzIAIRZu9y1R2L8+cCuEFomTLumx2mo8fEf5kiduX1DhWIptn7GIkQigcYrYbOlUKuxB6kevIkqjI8NkMd463zqnK+LHihrtjL0rfQ9+bBR3QZz185NK0lV3NxM9olHAJg0Q2ppDQm3dJE1tatjUjjqbOS+tfTSKbEskK2lqYcsua+r6PbUgRJwIUhJiEt03OqfI5xyhwbp5Hn8d/P02eRv98GY2f37VhAam2c7PVBVDGTg5Elmtzu1OCv6NMi2FbaOru5iuBVQLp/fgFefwqRhnAalcC4B3jngPghGxrR/ATstfPkTs+IAqHkMKbC/GQ2oD3ZenEsGNah+/ZNeTbLrTnqHkAPiHYLuxlHAjKVCBjg8q0+XsBGahtAlkxjkcryGGRnLjFhM7xDAfQH6XSu7yWMxr9D1G6FcEoXFHMROkInzBein579RjiFbGTEFIsjW3oM5R002IZX+NBbRPkQ+qt89HoWZpTG6LAiCQGBMUgJShc4iBshRvs9SfGDX4/3AZ2s7QbUcAAL7dRGsKvH79wyCPisWd/8hf/6EZv/2PlEeTsffs3B3dT+aX7tv4r0M20RbZfszff+GC3Or/dePRr7u6bmKgaJ2hlTkhS5fo4QTz6iD22lJ0pe728KU2jYKF4UeKp1Eh9QuA2023JO4QH5jELO4RZyECP9E9SttRH4hWkHrPTSTXm00rMd++RkD57Cw7cjjngf1UDLjjggmm4LPJGjN4kwhvMYTBDDmMccF8V53O8mK7C4xh3GHvY1MOMjYbMbzjCLgP3LW/zvePbfDiHS37p+mjT5xWfCKyOuBzaPgxDzz5Ioa5lI9vOIr5bRnxJzVNL1/TKiLckQYBaEfAZXesSVSeyM3kBFCI6tcgL8euUeKE0kNbt3jK/MUxLUSxD10wjP65SjW9OgHjiHm35y5k+Id0FuhflFIbSu1WtHlAVy1KApqs5U2pFU1Z1kcPDgoo70ikeUi5zfkNhyUl4OJh3gbypRUFTUuMUMeTQZoZHTH7Hudbj4aloWHiOE8Unsi0gH7M0wd+gzN+axH3UOqK28ob7Gf++qraK8U6bqpblw00VQqJM75GFyfI0r61yMM/+5MFadgVPwwc+xAvxorz0Jq7SdaITI8qM/e61S3/zqZsh8cvmQjigH9+S28zlRMK2i+2q7tWqKdmrW8rYrKIFtWr74/6M7Zwd1CwZdFcaztDBm4eJ1mqFA1Q4fn0jmU6yn+WQYlZESjtRrVpIdTTzxDi2OJBe9IWaaimleZR6ayOgQj29EfeTlNbOdT+j7H7gstzvcPZjgkaSqtIXEPUlVaC8JdR9rDqIo7Xf6VRVUlqMI2uCN9soQOXnDPcOyh4veJWOF0oDyyLj6PTkY7BmYGMXDs+p+IGu7/NPl45Fxa9S0ZEz1aHkdJf7aeBE77rAayReGoX0tEzjzQfxxePWdoP4JN6UAHyuVIKAyNFlCEeMSEnGUHzEPXY6vVbAXMX2ghkT6Ondc5QevFf3GZ05HnH9aHebe46DgibKBoqp5yzbKxt299Fb1rDFHOAku6pN2ZV/J/EnW9U0jlq115RRZamEYO0iL7o4CiDsHWmldgSu2+1y8ihBdvjQnzyMxuP+h9Ek/1Vcxt7xyCUWnjbgBRdWBwRWnqoVyTeq0uiyjkKgVq65Nmf8h9FzfzLss3+eRuPHvz+PR1cHkDgA0Np0AFm9rXH0XwnggP21Vglg/0lALfYv1s+vFpdY3NDbtHhT2XfNSXUi+LhYxP2ruCF3Qv47M8VRBQO9yvsOJYiyVLLm9773kO+E+VfPLpBulyzPHaSp7sRjXrqJRQFciMaQouWE2V70XGCKJtBxiBB8R9v4in+mPYk/0z6SGZ9w6nUMuTwvNqaGbnTKNUDXVaMa4KVh2MhjWJV86QRkGbZeB4ab/tWihjAsg1m31PvPBbpUPweBfoXtebAFkmCrOdjKvk+8wvYPhO3r9ucrsk9Att7mhpyMcUV2ceqC818+viueVF1xtwd3hqR2XRfu2G36XxzEJ8/p/yMBRv8D</diagram></mxfile>
\ No newline at end of file
diff --git a/tensorflow/contrib/verbs/verbs_with_0_copies_phase1_protocol.jpg b/tensorflow/contrib/verbs/verbs_with_0_copies_phase1_protocol.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8bc69b889d68f0a6a8faa64f08eab22637646842
Binary files /dev/null and b/tensorflow/contrib/verbs/verbs_with_0_copies_phase1_protocol.jpg differ
diff --git a/tensorflow/contrib/verbs/verbs_with_0_copies_phase1_protocol.xml b/tensorflow/contrib/verbs/verbs_with_0_copies_phase1_protocol.xml
new file mode 100644
index 0000000000000000000000000000000000000000..484e7c78ae863b543dee360969e2ef1c8ae6114d
--- /dev/null
+++ b/tensorflow/contrib/verbs/verbs_with_0_copies_phase1_protocol.xml
@@ -0,0 +1 @@
+<mxfile userAgent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36" version="7.8.4" editor="www.draw.io" type="device"><diagram name="Page-1" id="74e2e168-ea6b-b213-b513-2b3c1d86103e">7Vxbc5s4FP41nuk+pMMd8ujYTrYzTZqNk9nmyaOAbNhgRIWc2P31K4G4yzZ1AHtaZzJjOBLS8TnnOzeRDNTRcn2DQejeIgf6A0Vy1gN1PFAUWVMM+sEom4RiGpcJYYE9h0/KCVPvJ+REiVNXngOj0kSCkE+8sEy0URBAm5RoAGP0Xp42R3551xAsYI0wtYFfp/7rOcTlVFmS8oG/obdw+daWzgdegP26wGgV8P0GijqPf5LhJUjX4vMjFzjovUBSJwN1hBEiydVyPYI+k20qtuS56y2jGd8YBqTJA1bywBvwVzDl2PDpo1eO98b4IxsuE+PHijF1ReCaXADfWwQDdUhn+HBO8lF6teCf8SpRCIKUNiUAk09/pUOUqeJogRxvXaa2z01Ke8ECDnpkDCxDehG8ROzjgs4c+j6yAYGPMIgQjkoC64WBKQycT4+Tu+m3h9nD5J+nyfSxax62a6K0m1LaRYlxBpklS3T43fUInIbAZqPv1C9RmkuWPr2Ts6ffIKZMbQWLnEGQujaIlpDgDZ3CH7A4aLlTkw1+/567CCX1EG7BO2RuA3C3tMiWzqFJLzg6xUhNPUbrUH2A9ltia7eQgDEgoHOTa6juNnZjBj2GoE9M1UHc/X4xZq+erq8nDLPT+29308nWTU8LRqrSJ4xkQwCjikCgQ5MBfoswcdECBcCf5NSrssgK4vkPErLh+QxYEURJ+QpfEQpLYmQb7RYi5QutsJ3O4rzSQLqA6TRNLGwMfUC8t/L6H5Kc0pEDivHiON/wU+hQyDzAKERBBLsHDfN8XylM/WG0Cezu9/syj/XyY+VhZjvDPgP7gKGsJRL7LiMUbm7unx7R6F6UXT2dUp7XqSCmEHuUsZ/wqN/45HMnQztq8qQfw8lT0eDN9+LNM1vss85u1x75Xrp75hsdGBq0emhIqvAPhAb+6D3y6M6ZKi+VsipNo6KhhAf+VK6kIcZgU5gWsglR831Us1LL7plvWLvnW9rO+fQi4Ti3sEyGzQKmVguY1x6OyKO3hMyZmCP2W/MyBbeQYDdNy0cuCBbQoX5GvW6KchctX1bRfoQ7NCTZxEPU2YypWTFEdoH6nnO9y/25XuSCkF3Ofbgess5RDFWHX45tH0SRZ5eFNfd8f4R8hOMl0gZPAe9SHe+HodoS5HuKWOIFieoCgaa0D2K/mrwrVewn3NewX1tI1aXPpiwZpiXLlqFrplFeV27mUw6AZWoEfVlFi/5cGhxR9bpx+VmxLlVFtixZ1XSlpDCtqrCmhrB7WbVhbDnEDtSaHTzDqGYKLA8rKzoiGL3CVNUBCmBF+5zEk7exTfUMKf2KuVKPlRt8YOk5TpxpiJxzOfvowherdV+sCcxHaSP/qofCO/T7itqSjihqUYOjq+KKFUD3KBSW7H2VQBfbUqgiAw/jW7bCO6bap5+fkr7cID5BIlSrv8r4iVVThsDAusurVH1/BO2zvOI1VJZwHRx0FVMQdLsposyqBrVmgW57EfWRUGjWFLqlIXdidq/12kVQ6xlDTSKnXU+kAaZk4KZY5P1klXKloNDMA/PI6kJ6VeMtdSVq+8jtdg3UBgcUnRiZoEl1oJEZdSNTj2pku2sMU+2kdEnbSR2ULmrdX7d9FjxK8qLf6ShY0Fo78FSmvyOWETtiubl/aqSHftgaw0h45LGbq/hJWqzte+TEEm3rmHl2muwIYO7KjYDA62ERziF1p7ggdbbiFqEfXpfTUsr2ggUl6PndY5zBXyjbNIgoY3M/jmQuLdth0E2JXlLsZUO9VrP0g9QqakC2olb2FsifrNRqedCrVkXFAQ9vVSc3R3EaYWfpWK5ImphJEuOxBtnx7XB2O5lOhzeTWfntvILCk5VrLvWlAzM4sZ5b1mNLD5gtgfJFOWYbTTet3t/sTvnZa15n5W9Tvqr1qXxRO6xz5Sfv+J21L9C+1iv0t/fbW9F+loLHK1T71tloVe1na8hydr1Pa+iqMm+54E4JX5bLZH4TE2UGyv6Spboq902/ZH27akDRAUzL3/vag74Tlb96kUGyCeFAGfHOAIzIzKNWuk5IAVjywYjAcEZ1z2cuEYEz4DiYE17hJrmidgRmDiBgb/F7wOHTPuRSzTnGi6EbA1EXULHtE8SwXMawInhvSBVl8nobGO76j6I6wrAIZlJt9p8LdKF8dgL9DNuPwVYVJGLdwVb0tt8Ztn8gbD8Un7e+kXvG/i9hX+8zZKdrnLFf3boCj9P3AA2PAs+424I7Q9D0bgt39Db/1wTJuXX+/x/Uyf8=</diagram></mxfile>
\ No newline at end of file
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index bd7617fa9641ce47f93bd2104029f20798fd2815..d1fb9f444514fee4ce339d4308da0d583ae36aa0 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -79,6 +79,7 @@ load(
     "if_linux_x86_64",
     "if_mobile",
     "if_not_mobile",
+    "if_windows",
     "if_not_windows",
     "tf_copts",
     "tf_cc_test",
@@ -135,6 +136,8 @@ load(
     "tf_nano_proto_library",
     "tf_protos_all",
     "tf_protos_all_impl",
+    "tf_protos_grappler",
+    "tf_protos_grappler_impl",
 )
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
@@ -190,6 +193,7 @@ CORE_PROTO_SRCS = [
     "protobuf/rewriter_config.proto",
     "protobuf/tensor_bundle.proto",
     "protobuf/saver.proto",
+    "util/event.proto",
     "util/memmapped_file_system.proto",
     "util/saved_tensor_slice.proto",
 ]
@@ -202,11 +206,12 @@ CORE_PROTO_SRCS = [
 ADDITIONAL_CORE_PROTO_SRCS = [
     "example/example_parser_configuration.proto",
     "protobuf/control_flow.proto",
+    # TODO(ebrevdo): Re-enable once CriticalSection is in core.
+    # "protobuf/critical_section.proto",
     "protobuf/meta_graph.proto",
     "protobuf/named_tensor.proto",
     "protobuf/saved_model.proto",
     "protobuf/tensorflow_server.proto",
-    "util/event.proto",
     "util/test_log.proto",
 ]
 
@@ -274,11 +279,11 @@ cc_library(
         "platform/platform.h",
         "platform/protobuf.h",
         "platform/types.h",
-    ] + glob(tf_additional_proto_hdrs()) + glob(tf_env_time_hdrs()),
+        "platform/windows/cpu_info.h",
+        "lib/bfloat16/bfloat16.h",
+    ] + tf_additional_proto_hdrs() + glob(tf_env_time_hdrs()),
     copts = tf_copts(),
-    deps = tf_lib_proto_parsing_deps() + [
-        "@double_conversion//:double-conversion",
-    ],
+    deps = tf_lib_proto_parsing_deps(),
 )
 
 # This build rule (along with :lib_internal, :framework, and
@@ -288,6 +293,7 @@ cc_library(
 cc_library(
     name = "lib",
     hdrs = [
+        "lib/bfloat16/bfloat16.h",
         "lib/core/arena.h",
         "lib/core/bitmap.h",
         "lib/core/bits.h",
@@ -364,6 +370,33 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "abi",
+    srcs = ["platform/abi.cc"],
+    hdrs = ["platform/abi.h"],
+)
+
+cc_library(
+    name = "session_message",
+    srcs = ["util/session_message.cc"],
+    hdrs = ["util/session_message.h"],
+    deps = [
+        ":framework",
+        ":lib",
+        ":protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "stacktrace_handler",
+    srcs = ["platform/stacktrace_handler.cc"],
+    hdrs = ["platform/stacktrace_handler.h"],
+    deps = [
+        ":lib",
+        ":lib_platform",
+    ],
+)
+
 # Test support library needed for all tests
 # This is currently public, but may be made internal in the
 # future.  Try to avoid depending on it.
@@ -410,6 +443,8 @@ tf_cuda_library(
         "framework/cancellation.h",
         "framework/common_shape_fns.h",
         "framework/control_flow.h",  # TODO(josh11b): Make internal?
+        "framework/dataset.h",
+        "framework/dataset_stateful_op_whitelist.h",
         "framework/device_base.h",
         "framework/function.h",
         "framework/graph_def_util.h",
@@ -430,6 +465,7 @@ tf_cuda_library(
         "framework/reader_interface.h",
         "framework/reader_op_kernel.h",
         "framework/register_types.h",
+        "framework/register_types_traits.h",
         "framework/resource_mgr.h",
         "framework/resource_op_kernel.h",
         "framework/selective_registration.h",
@@ -513,14 +549,6 @@ cc_library(
     ],
 )
 
-tf_proto_library_cc(
-    name = "op_gen_overrides_proto",
-    srcs = ["framework/op_gen_overrides.proto"],
-    cc_api_version = 2,
-    protodeps = tf_additional_all_protos(),
-    visibility = ["//visibility:public"],
-)
-
 cc_library(
     name = "op_gen_lib",
     srcs = ["framework/op_gen_lib.cc"],
@@ -529,7 +557,6 @@ cc_library(
     deps = [
         ":lib",
         ":lib_internal",
-        ":op_gen_overrides_proto_cc",
         ":protos_all_cc",
     ],
 )
@@ -551,6 +578,7 @@ cc_library(
         "framework/numeric_types.h",
         "framework/tensor_types.h",
         "framework/type_traits.h",
+        "lib/bfloat16/bfloat16.h",
         "platform/default/dynamic_annotations.h",
         "platform/default/integral_types.h",
         "platform/default/logging.h",
@@ -564,7 +592,7 @@ cc_library(
         "platform/prefetch.h",
         "platform/thread_annotations.h",
         "platform/types.h",
-    ],
+    ] + if_windows(["platform/windows/integral_types.h"]),
     visibility = ["//visibility:public"],
     deps =
         [
@@ -577,7 +605,9 @@ cc_library(
 
 # Generates library per group of ops.
 tf_gen_op_libs(
+    is_external = False,
     op_lib_names = [
+        "batch_ops",
         "bitwise_ops",
         "candidate_sampling_ops",
         "checkpoint_ops",
@@ -590,8 +620,10 @@ tf_gen_op_libs(
         "image_ops",
         "io_ops",
         "linalg_ops",
+        "list_ops",
         "lookup_ops",
         "logging_ops",
+        "manip_ops",
         "math_ops",
         "nn_ops",
         "no_op",
@@ -658,6 +690,7 @@ cc_library(
     deps = [
         ":array_ops_op_lib",
         ":audio_ops_op_lib",
+        ":batch_ops_op_lib",
         ":bitwise_ops_op_lib",
         ":candidate_sampling_ops_op_lib",
         ":checkpoint_ops_op_lib",
@@ -670,8 +703,10 @@ cc_library(
         ":image_ops_op_lib",
         ":io_ops_op_lib",
         ":linalg_ops_op_lib",
+        ":list_ops_op_lib",
         ":logging_ops_op_lib",
         ":lookup_ops_op_lib",
+        ":manip_ops_op_lib",
         ":math_ops_op_lib",
         ":nn_ops_op_lib",
         ":no_op_op_lib",
@@ -763,6 +798,7 @@ tf_cuda_library(
         "graph/graph.h",
         "graph/graph_constructor.h",
         "graph/graph_def_builder.h",
+        "graph/graph_def_builder_util.h",
         "graph/node_builder.h",
         "graph/validate.h",
         "graph/while_context.h",
@@ -792,6 +828,7 @@ cc_library(
     deps = [
         "//tensorflow/core/kernels:array",
         "//tensorflow/core/kernels:audio",
+        "//tensorflow/core/kernels:batch_kernels",
         "//tensorflow/core/kernels:bincount_op",
         "//tensorflow/core/kernels:candidate_sampler_ops",
         "//tensorflow/core/kernels:checkpoint_ops",
@@ -801,12 +838,15 @@ cc_library(
         "//tensorflow/core/kernels:dataset_ops",
         "//tensorflow/core/kernels:fake_quant_ops",
         "//tensorflow/core/kernels:function_ops",
+        "//tensorflow/core/kernels:functional_ops",
         "//tensorflow/core/kernels:histogram_op",
         "//tensorflow/core/kernels:image",
         "//tensorflow/core/kernels:io",
         "//tensorflow/core/kernels:linalg",
+        "//tensorflow/core/kernels:list_kernels",
         "//tensorflow/core/kernels:lookup",
         "//tensorflow/core/kernels:logging",
+        "//tensorflow/core/kernels:manip",
         "//tensorflow/core/kernels:math",
         "//tensorflow/core/kernels:multinomial_op",
         "//tensorflow/core/kernels:nn",
@@ -843,6 +883,7 @@ cc_library(
         "//tensorflow/core/kernels:mkl_pooling_ops",
         "//tensorflow/core/kernels:mkl_relu_op",
         "//tensorflow/core/kernels:mkl_reshape_op",
+        "//tensorflow/core/kernels:mkl_softmax_op",
         "//tensorflow/core/kernels:mkl_tfconv_op",
         "//tensorflow/core/kernels:mkl_aggregate_ops",
     ]),
@@ -1016,7 +1057,7 @@ filegroup(
 cc_library(
     name = "android_tensorflow_lib_lite",
     srcs = if_android(["//tensorflow/core:android_srcs"]),
-    copts = tf_copts() + if_not_android_mips_and_mips64(["-Os"]),
+    copts = tf_copts(android_optimization_level_override = None),
     linkopts = ["-lz"],
     tags = [
         "manual",
@@ -1026,7 +1067,6 @@ cc_library(
     deps = [
         ":protos_all_cc_impl",
         "//third_party/eigen3",
-        "@double_conversion//:double-conversion",
         "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
     ],
@@ -1041,19 +1081,31 @@ cc_library(
     name = "ios_tensorflow_lib",
     srcs = if_ios([
         ":android_op_registrations_and_gradients",
-        "//tensorflow/core:android_srcs",
         "//tensorflow/core/kernels:android_core_ops",
         "//tensorflow/core/kernels:android_extended_ops",
     ]),
     copts = tf_copts() + ["-Os"] + ["-std=c++11"],
     visibility = ["//visibility:public"],
     deps = [
+        ":ios_tensorflow_lib_lite",
         ":protos_all_cc_impl",
         "//third_party/eigen3",
         "//third_party/fft2d:fft2d_headers",
-        "@double_conversion//:double-conversion",
-        "@fft2d//:fft2d",
-        "@gemmlowp//:gemmlowp",
+        "@fft2d",
+        "@gemmlowp",
+        "@protobuf_archive//:protobuf",
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "ios_tensorflow_lib_lite",
+    srcs = if_ios(["//tensorflow/core:android_srcs"]),
+    copts = tf_copts() + ["-Os"] + ["-std=c++11"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":protos_all_cc_impl",
+        "//third_party/eigen3",
         "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
     ],
@@ -1106,8 +1158,7 @@ cc_library(
 cc_library(
     name = "android_tensorflow_lib_selective_registration",
     srcs = if_android(["//tensorflow/core:android_srcs"]),
-    copts = tf_copts() + [
-        "-Os",
+    copts = tf_copts(android_optimization_level_override = None) + [
         "-DSUPPORT_SELECTIVE_REGISTRATION",
     ],
     tags = [
@@ -1118,7 +1169,7 @@ cc_library(
     deps = [
         ":protos_all_cc_impl",
         "//third_party/eigen3",
-        "@double_conversion//:double-conversion",
+        "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
     ],
     alwayslink = 1,
@@ -1129,8 +1180,7 @@ cc_library(
 cc_library(
     name = "android_tensorflow_lib_selective_registration_nortti",
     srcs = if_android(["//tensorflow/core:android_srcs"]),
-    copts = tf_copts() + tf_opts_nortti_if_android() + [
-        "-Os",
+    copts = tf_copts(android_optimization_level_override = None) + tf_opts_nortti_if_android() + [
         "-DSUPPORT_SELECTIVE_REGISTRATION",
     ],
     tags = [
@@ -1141,7 +1191,6 @@ cc_library(
     deps = [
         ":protos_all_cc_impl",
         "//third_party/eigen3",
-        "@double_conversion//:double-conversion",
         "@nsync//:nsync_cpp",
         "@protobuf_archive//:protobuf",
     ],
@@ -1210,7 +1259,7 @@ cc_library(
         "framework/tensor_testutil.h",
         "util/reporter.h",
     ],
-    copts = tf_copts() + ["-Os"],
+    copts = tf_copts(android_optimization_level_override = None),
     tags = [
         "manual",
         "notap",
@@ -1293,6 +1342,13 @@ tf_pyclif_proto_library(
     visibility = ["//visibility:public"],
 )
 
+tf_pyclif_proto_library(
+    name = "framework/function_pyclif",
+    proto_lib = ":protos_all_cc",
+    proto_srcfile = "framework/function.proto",
+    visibility = ["//visibility:public"],
+)
+
 tf_pyclif_proto_library(
     name = "framework/graph_pyclif",
     proto_lib = ":protos_all_cc",
@@ -1314,6 +1370,13 @@ tf_pyclif_proto_library(
     visibility = ["//visibility:public"],
 )
 
+tf_pyclif_proto_library(
+    name = "protobuf/device_properties_pyclif",
+    proto_lib = ":protos_all_cc",
+    proto_srcfile = "protobuf/device_properties.proto",
+    visibility = ["//visibility:public"],
+)
+
 # -----------------------------------------------------------------------------
 # Internal targets
 
@@ -1503,9 +1566,8 @@ cc_library(
         "//tensorflow/core/platform/default/build_config:platformlib",
         "@snappy",
         "@zlib_archive//:zlib",
-        "@double_conversion//:double-conversion",
         "@protobuf_archive//:protobuf",
-    ] + tf_protos_all_impl(),
+    ] + tf_protos_all_impl() + tf_protos_grappler_impl(),
 )
 
 # File compiled with extra flags to get cpu-specific acceleration.
@@ -1570,6 +1632,7 @@ cc_library(
         "platform/jpeg.h",
     ]),
     hdrs = [
+        "lib/bfloat16/bfloat16.h",
         "lib/core/stringpiece.h",
         "lib/jpeg/jpeg_handle.h",
         "lib/jpeg/jpeg_mem.h",
@@ -1597,6 +1660,7 @@ cc_library(
         "platform/gif.h",
     ]),
     hdrs = [
+        "lib/bfloat16/bfloat16.h",
         "lib/core/stringpiece.h",
         "lib/gif/gif_io.h",
         "lib/gtl/cleanup.h",
@@ -1624,6 +1688,7 @@ cc_library(
         "platform/png.h",
     ]),
     hdrs = [
+        "lib/bfloat16/bfloat16.h",
         "lib/core/casts.h",
         "lib/core/stringpiece.h",
         "lib/png/png_io.h",
@@ -1671,6 +1736,9 @@ FRAMEWORK_INTERNAL_PRIVATE_HEADERS = [
     "platform/variant_coding.h",
     "graph/edgeset.h",
     "graph/graph.h",
+    "graph/graph_def_builder.h",
+    "graph/node_builder.h",
+    "graph/tensor_id.h",
 ] + glob(
     [
         "example/**/*.h",
@@ -1688,6 +1756,7 @@ FRAMEWORK_INTERNAL_PRIVATE_HEADERS = [
         "framework/reader_base.*",
         "util/memmapped_file_system.*",
         "util/memmapped_file_system_writer.*",
+        "util/session_message.*",
         "util/version_info.cc",
     ],
 ) + select({
@@ -1757,6 +1826,9 @@ tf_cuda_library(
         ] + [
             "graph/edgeset.cc",
             "graph/graph.cc",
+            "graph/graph_def_builder.cc",
+            "graph/node_builder.cc",
+            "graph/tensor_id.cc",
             "graph/while_context.h",
             "graph/while_context.cc",
         ],
@@ -1771,6 +1843,7 @@ tf_cuda_library(
             "framework/resource_handle.cc",
             "util/memmapped_file_system.*",
             "util/memmapped_file_system_writer.*",
+            "util/session_message.cc",
             "util/version_info.cc",
         ],
     ) + select({
@@ -1856,17 +1929,26 @@ cc_library(
     ],
 )
 
+tf_cuda_library(
+    name = "cuda_device_functions",
+    hdrs = ["util/cuda_device_functions.h"],
+    visibility = ["//visibility:public"],
+    deps = [":framework_lite"],
+)
+
 # TODO(josh11b): Is this needed, or can we just use ":protos_all_cc"?
 cc_library(
     name = "protos_cc",
     deps = ["//tensorflow/core/platform/default/build_config:protos_cc"],
 )
 
-CORE_CPU_BASE_HDRS = [
-    "common_runtime/device.h",
-    "common_runtime/graph_runner.h",
-    "common_runtime/shape_refiner.h",
-    "framework/versions.h",
+# Library containing all of the graph construction code that is
+# independent of the runtime.
+#
+# TODO(mrry): Refactor graph_constructor.cc so that it does not depend on code
+# in "common_runtime/", and then the entire "graph/" directory can be included
+# in this library.
+GRAPH_HDRS = [
     "graph/algorithm.h",
     "graph/colors.h",
     "graph/control_flow.h",
@@ -1874,8 +1956,9 @@ CORE_CPU_BASE_HDRS = [
     "graph/default_device.h",
     "graph/edgeset.h",
     "graph/graph.h",
-    "graph/graph_constructor.h",
+    "graph/graph_constructor.h",  # NOTE(mrry): Don't include the .cc since it depends on common_runtime.
     "graph/graph_def_builder.h",
+    "graph/graph_def_builder_util.h",
     "graph/graph_partition.h",
     "graph/mkl_layout_pass.h",
     "graph/mkl_tfconversion_pass.h",
@@ -1890,23 +1973,44 @@ CORE_CPU_BASE_HDRS = [
 ]
 
 tf_cuda_library(
-    name = "core_cpu_base",
+    name = "graph",
     srcs = [
-        "common_runtime/shape_refiner.cc",
-        "common_runtime/shape_refiner.h",
-        "framework/versions.h",
         "graph/algorithm.cc",
         "graph/colors.cc",
         "graph/control_flow.cc",
         "graph/costmodel.cc",
-        "graph/graph_constructor.cc",
-        "graph/graph_def_builder.cc",
         "graph/graph_partition.cc",
-        "graph/node_builder.cc",
         "graph/optimizer_cse.cc",
         "graph/subgraph.cc",
-        "graph/tensor_id.cc",
         "graph/validate.cc",
+    ],
+    hdrs = GRAPH_HDRS,
+    deps = [
+        ":framework",
+        ":framework_internal",
+        ":lib",
+        ":lib_internal",
+        ":proto_text",
+        ":protos_all_cc",
+        "//third_party/eigen3",
+    ],
+)
+
+CORE_CPU_BASE_HDRS = GRAPH_HDRS + [
+    "common_runtime/device.h",
+    "common_runtime/graph_runner.h",
+    "common_runtime/shape_refiner.h",
+    "framework/versions.h",
+]
+
+tf_cuda_library(
+    name = "core_cpu_base",
+    srcs = [
+        "common_runtime/shape_refiner.cc",
+        "common_runtime/shape_refiner.h",
+        "framework/versions.h",
+        "graph/graph_constructor.cc",  # Depends on common_runtime.
+        "graph/graph_def_builder_util.cc",  # Depends on common_runtime.
         "public/session.h",
         "public/session_options.h",
         "public/version.h",
@@ -1914,6 +2018,7 @@ tf_cuda_library(
     hdrs = CORE_CPU_BASE_HDRS,
     copts = tf_copts(),
     deps = [
+        ":graph",
         ":framework",
         ":framework_internal",
         ":lib",
@@ -2017,6 +2122,7 @@ tf_cuda_library(
     hdrs = CORE_CPU_LIB_HEADERS,
     copts = tf_copts(),
     deps = [
+        ":graph",
         ":framework",
         ":framework_internal",
         ":lib",
@@ -2040,7 +2146,7 @@ tf_cuda_library(
         ":core_cpu_base",
         ":proto_text",
         "//tensorflow/core/grappler:grappler_item",
-    ] + if_static([":core_cpu_impl"]) + tf_protos_all(),
+    ] + if_static([":core_cpu_impl"]) + tf_protos_all() + tf_protos_grappler(),
 )
 
 tf_cuda_library(
@@ -2058,6 +2164,7 @@ tf_cuda_library(
         ":function_ops_op_lib",
         ":functional_grad",
         ":functional_ops_op_lib",
+        ":graph",
         ":lib",
         ":lib_internal",
         ":proto_text",
@@ -2103,6 +2210,7 @@ tf_cuda_library(
         ":core_cpu_internal",
         ":device_tracer",
         ":framework",
+        ":graph",
         ":lib",
         ":lib_internal",
         ":proto_text",
@@ -2147,17 +2255,31 @@ tf_cuda_library(
     ] + tf_additional_device_tracer_deps(),
 )
 
+cc_library(
+    name = "gpu_id",
+    srcs = ["common_runtime/gpu/gpu_id_manager.cc"],
+    hdrs = [
+        "common_runtime/gpu/gpu_id.h",
+        "common_runtime/gpu/gpu_id_manager.h",
+    ],
+    deps = [
+        ":lib",
+    ],
+)
+
 GPU_RUNTIME_HEADERS = [
     "common_runtime/gpu/gpu_bfc_allocator.h",
     "common_runtime/gpu/gpu_cudamalloc_allocator.h",
     "common_runtime/gpu/gpu_debug_allocator.h",
     "common_runtime/gpu/gpu_device.h",
+    "common_runtime/gpu/gpu_id_utils.h",
     "common_runtime/gpu/gpu_init.h",
     "common_runtime/gpu/gpu_managed_allocator.h",
     "common_runtime/gpu/gpu_stream_util.h",
     "common_runtime/gpu/gpu_util.h",
     "common_runtime/gpu/pool_allocator.h",
     "common_runtime/gpu/process_state.h",
+    "common_runtime/gpu_device_context.h",
 ]
 
 tf_cuda_library(
@@ -2174,7 +2296,6 @@ tf_cuda_library(
         "common_runtime/gpu/gpu_util_platform_specific.cc",
         "common_runtime/gpu/pool_allocator.cc",
         "common_runtime/gpu/process_state.cc",
-        "common_runtime/gpu_device_context.h",
     ],
     hdrs = GPU_RUNTIME_HEADERS,
     copts = tf_copts(),
@@ -2183,8 +2304,10 @@ tf_cuda_library(
         ":core_cpu_lib",
         ":framework",
         ":framework_internal",
+        ":gpu_id",
         ":gpu_init_impl",
         ":gpu_lib",
+        ":graph",
         ":lib",
         ":lib_internal",
         ":protos_all_cc",
@@ -2271,7 +2394,7 @@ cc_library(
         ":lib_internal",
         ":proto_text",
         "//third_party/eigen3",
-        "@local_config_sycl//sycl:sycl",
+        "@local_config_sycl//sycl",
     ],
     alwayslink = 0,
 )
@@ -2403,6 +2526,7 @@ tf_cc_tests(
         "platform/net_test.cc",
         "platform/port_test.cc",
         "platform/profile_utils/cpu_utils_test.cc",
+        "platform/stacktrace_handler_test.cc",
         "platform/subprocess_test.cc",
     ],
     deps = [
@@ -2758,15 +2882,31 @@ tf_cc_test_mkl(
         "//tensorflow/core/kernels:mkl_pooling_ops",
         "//tensorflow/core/kernels:mkl_relu_op",
         "//tensorflow/core/kernels:mkl_reshape_op",
+        "//tensorflow/core/kernels:mkl_softmax_op",
         "//tensorflow/core/kernels:mkl_tfconv_op",
     ]),
 )
 
+tf_cc_tests_gpu(
+    name = "gpu_device_on_non_gpu_machine_test",
+    size = "small",
+    srcs = ["common_runtime/gpu/gpu_device_on_non_gpu_machine_test.cc"],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    deps = [
+        ":gpu_headers_lib",
+        ":gpu_id",
+        ":gpu_runtime",
+        ":test",
+    ],
+)
+
 tf_cc_tests_gpu(
     name = "gpu_related_tests",
     size = "small",
     srcs = glob(["user_ops/**/*_test.cc"]) + [
         "common_runtime/gpu/gpu_bfc_allocator_test.cc",
+        "common_runtime/gpu/gpu_device_test.cc",
+        "common_runtime/gpu/gpu_id_manager_test.cc",
         "common_runtime/gpu/gpu_event_mgr_test.cc",
         "common_runtime/gpu/pool_allocator_test.cc",
     ],
@@ -2778,6 +2918,7 @@ tf_cc_tests_gpu(
         ":direct_session",
         ":framework",
         ":framework_internal",
+        ":gpu_id",
         ":gpu_runtime",
         ":lib",
         ":lib_internal",
@@ -3131,6 +3272,7 @@ tf_cc_test(
         "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:function_ops",
         "//tensorflow/core/kernels:matmul_op",
+        "//tensorflow/core/kernels:random_ops",
         "//tensorflow/core/kernels:shape_ops",
         "//third_party/eigen3",
     ],
@@ -3172,6 +3314,7 @@ tf_cc_test_gpu(
         ":direct_session",
         ":framework",
         ":framework_internal",
+        ":gpu_id",
         ":gpu_runtime",
         ":lib",
         ":lib_internal",
@@ -3381,37 +3524,6 @@ tf_cc_test(
     ],
 )
 
-filegroup(
-    name = "base_api_def",
-    srcs = glob(["api_def/base_api/*"]),
-)
-
-filegroup(
-    name = "python_api_def",
-    data = glob(["api_def/python_api/*"]),
-)
-
-tf_cc_test(
-    name = "api_test",
-    srcs = ["api_def/api_test.cc"],
-    data = [
-        ":base_api_def",
-        "//tensorflow/cc:ops/op_gen_overrides.pbtxt",
-    ],
-    deps = [
-        ":framework",
-        ":framework_internal",
-        ":lib",
-        ":lib_internal",
-        ":lib_test_internal",
-        ":op_gen_lib",
-        ":op_gen_overrides_proto_cc",
-        ":ops",
-        ":protos_all_cc",
-        ":test",
-    ],
-)
-
 tf_cc_test_gpu(
     name = "device_tracer_test",
     size = "small",
diff --git a/tensorflow/core/api_def/BUILD b/tensorflow/core/api_def/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..58dbac4e8edac7079d315fbfcdafbd136793df0b
--- /dev/null
+++ b/tensorflow/core/api_def/BUILD
@@ -0,0 +1,114 @@
+# Description:
+#   Provides ApiDef access and ApiDef validation for TensorFlow.
+#
+# The following targets can be used to access ApiDefs:
+#   :base_api_def
+#   :python_api_def
+
+package(
+    default_visibility = ["//visibility:private"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_binary",
+    "tf_cc_test",
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+filegroup(
+    name = "base_api_def",
+    srcs = glob(["base_api/*"]),
+    visibility = ["//tensorflow:internal"],
+)
+
+filegroup(
+    name = "python_api_def",
+    srcs = glob(["python_api/*"]),
+    visibility = ["//tensorflow:internal"],
+)
+
+cc_library(
+    name = "excluded_ops_lib",
+    srcs = ["excluded_ops.cc"],
+    hdrs = ["excluded_ops.h"],
+)
+
+cc_library(
+    name = "update_api_def_lib",
+    srcs = ["update_api_def.cc"],
+    hdrs = ["update_api_def.h"],
+    deps = [
+        ":excluded_ops_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:op_gen_lib",
+        "//tensorflow/core:ops",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_cc_test(
+    name = "update_api_def_test",
+    srcs = ["update_api_def_test.cc"],
+    deps = [
+        ":update_api_def_lib",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+tf_cc_binary(
+    name = "update_api_def",
+    srcs = [
+        "update_api_def_main.cc",
+    ],
+    data = [
+        ":base_api_def",
+    ],
+    deps = [
+        ":update_api_def_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "api_test",
+    srcs = ["api_test.cc"],
+    data = [
+        ":base_api_def",
+        ":python_api_def",
+    ],
+    deps = [
+        ":excluded_ops_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:lib_test_internal",
+        "//tensorflow/core:op_gen_lib",
+        "//tensorflow/core:ops",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
diff --git a/tensorflow/core/api_def/api_test.cc b/tensorflow/core/api_def/api_test.cc
index 2cdc14843f61a2585b61e214527e0a0b5bdea446..477a0b670e49f8aa4ee8c250d4957886eb865ed5 100644
--- a/tensorflow/core/api_def/api_test.cc
+++ b/tensorflow/core/api_def/api_test.cc
@@ -13,9 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// Test that verifies tensorflow/core/api_def/base_api/api_def*.pbtxt files
-// are correct. If api_def*.pbtxt do not match expected contents, run
-// tensorflow/core/api_def/base_api/update_api_def.sh script to update them.
+// Test that validates tensorflow/core/api_def/base_api/api_def*.pbtxt files.
 
 #include <ctype.h>
 #include <algorithm>
@@ -23,12 +21,11 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "tensorflow/core/api_def/excluded_ops.h"
 #include "tensorflow/core/framework/api_def.pb.h"
-#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/op_gen_lib.h"
-#include "tensorflow/core/framework/op_gen_overrides.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
@@ -44,309 +41,235 @@ namespace tensorflow {
 namespace {
 constexpr char kDefaultApiDefDir[] =
     "tensorflow/core/api_def/base_api";
-constexpr char kOverridesFilePath[] =
-    "tensorflow/cc/ops/op_gen_overrides.pbtxt";
-constexpr char kApiDefFileFormat[] = "api_def_%s.pbtxt";
+constexpr char kPythonApiDefDir[] =
+    "tensorflow/core/api_def/python_api";
 constexpr char kApiDefFilePattern[] = "api_def_*.pbtxt";
 
-void FillBaseApiDef(ApiDef* api_def, const OpDef& op) {
-  api_def->set_graph_op_name(op.name());
-  // Add arg docs
-  for (auto& input_arg : op.input_arg()) {
-    if (!input_arg.description().empty()) {
-      auto* api_def_in_arg = api_def->add_in_arg();
-      api_def_in_arg->set_name(input_arg.name());
-      api_def_in_arg->set_description(input_arg.description());
-    }
-  }
-  for (auto& output_arg : op.output_arg()) {
-    if (!output_arg.description().empty()) {
-      auto* api_def_out_arg = api_def->add_out_arg();
-      api_def_out_arg->set_name(output_arg.name());
-      api_def_out_arg->set_description(output_arg.description());
-    }
-  }
-  // Add attr docs
-  for (auto& attr : op.attr()) {
-    if (!attr.description().empty()) {
-      auto* api_def_attr = api_def->add_attr();
-      api_def_attr->set_name(attr.name());
-      api_def_attr->set_description(attr.description());
-    }
+// Reads golden ApiDef files and returns a map from file name to ApiDef file
+// contents.
+void GetGoldenApiDefs(Env* env, const string& api_files_dir,
+                      std::unordered_map<string, ApiDef>* name_to_api_def) {
+  std::vector<string> matching_paths;
+  TF_CHECK_OK(env->GetMatchingPaths(
+      io::JoinPath(api_files_dir, kApiDefFilePattern), &matching_paths));
+
+  for (auto& file_path : matching_paths) {
+    string file_contents;
+    TF_CHECK_OK(ReadFileToString(env, file_path, &file_contents));
+    file_contents = PBTxtFromMultiline(file_contents);
+
+    ApiDefs api_defs;
+    CHECK(tensorflow::protobuf::TextFormat::ParseFromString(file_contents,
+                                                            &api_defs))
+        << "Failed to load " << file_path;
+    CHECK_EQ(api_defs.op_size(), 1);
+    (*name_to_api_def)[api_defs.op(0).graph_op_name()] = api_defs.op(0);
   }
-  // Add docs
-  api_def->set_summary(op.summary());
-  api_def->set_description(op.description());
 }
 
-// Checks if arg1 should be before arg2 according to ordering in args.
-bool CheckArgBefore(const ApiDef::Arg* arg1, const ApiDef::Arg* arg2,
-                    const protobuf::RepeatedPtrField<OpDef::ArgDef>& args) {
-  for (auto& arg : args) {
-    if (arg.name() == arg2->name()) {
-      return false;
-    } else if (arg.name() == arg1->name()) {
-      return true;
-    }
+void TestAllApiDefsHaveCorrespondingOp(
+    const OpList& ops, const std::unordered_map<string, ApiDef>& api_defs_map) {
+  std::unordered_set<string> op_names;
+  for (const auto& op : ops.op()) {
+    op_names.insert(op.name());
+  }
+  for (const auto& name_and_api_def : api_defs_map) {
+    ASSERT_TRUE(op_names.find(name_and_api_def.first) != op_names.end())
+        << name_and_api_def.first << " op has ApiDef but missing from ops. "
+        << "Does api_def_" << name_and_api_def.first << " need to be deleted?";
   }
-  return false;
 }
 
-// Checks if attr1 should be before attr2 according to ordering in op_def.
-bool CheckAttrBefore(const ApiDef::Attr* attr1, const ApiDef::Attr* attr2,
-                     const OpDef& op_def) {
-  for (auto& attr : op_def.attr()) {
-    if (attr.name() == attr2->name()) {
-      return false;
-    } else if (attr.name() == attr1->name()) {
-      return true;
+void TestAllApiDefInputArgsAreValid(
+    const OpList& ops, const std::unordered_map<string, ApiDef>& api_defs_map) {
+  for (const auto& op : ops.op()) {
+    const auto api_def_iter = api_defs_map.find(op.name());
+    if (api_def_iter == api_defs_map.end()) {
+      continue;
+    }
+    const auto& api_def = api_def_iter->second;
+    for (const auto& api_def_arg : api_def.in_arg()) {
+      bool found_arg = false;
+      for (const auto& op_arg : op.input_arg()) {
+        if (api_def_arg.name() == op_arg.name()) {
+          found_arg = true;
+          break;
+        }
+      }
+      ASSERT_TRUE(found_arg)
+          << "Input argument " << api_def_arg.name()
+          << " (overwritten in api_def_" << op.name()
+          << ".pbtxt) is not defined in OpDef for " << op.name();
     }
   }
-  return false;
 }
 
-// Applies renames to args.
-void ApplyArgOverrides(
-    protobuf::RepeatedPtrField<ApiDef::Arg>* args,
-    const protobuf::RepeatedPtrField<OpGenOverride::Rename>& renames,
-    const protobuf::RepeatedPtrField<OpDef::ArgDef>& op_args,
-    const string& op_name) {
-  for (auto& rename : renames) {
-    // First check if rename is valid.
-    bool valid = false;
-    for (const auto& op_arg : op_args) {
-      if (op_arg.name() == rename.from()) {
-        valid = true;
-      }
+void TestAllApiDefOutputArgsAreValid(
+    const OpList& ops, const std::unordered_map<string, ApiDef>& api_defs_map) {
+  for (const auto& op : ops.op()) {
+    const auto api_def_iter = api_defs_map.find(op.name());
+    if (api_def_iter == api_defs_map.end()) {
+      continue;
     }
-    QCHECK(valid) << rename.from() << " is not a valid argument for "
-                  << op_name;
-    bool found_arg = false;
-    // If Arg is already in ApiDef, just update it.
-    for (int i = 0; i < args->size(); ++i) {
-      auto* arg = args->Mutable(i);
-      if (arg->name() == rename.from()) {
-        arg->set_rename_to(rename.to());
-        found_arg = true;
-        break;
+    const auto& api_def = api_def_iter->second;
+    for (const auto& api_def_arg : api_def.out_arg()) {
+      bool found_arg = false;
+      for (const auto& op_arg : op.output_arg()) {
+        if (api_def_arg.name() == op_arg.name()) {
+          found_arg = true;
+          break;
+        }
       }
-    }
-    if (!found_arg) {  // not in ApiDef, add a new arg.
-      auto* new_arg = args->Add();
-      new_arg->set_name(rename.from());
-      new_arg->set_rename_to(rename.to());
+      ASSERT_TRUE(found_arg)
+          << "Output argument " << api_def_arg.name()
+          << " (overwritten in api_def_" << op.name()
+          << ".pbtxt) is not defined in OpDef for " << op.name();
     }
   }
-  // We don't really need a specific order here right now.
-  // However, it is clearer if order follows OpDef.
-  std::sort(args->pointer_begin(), args->pointer_end(),
-            [&](ApiDef::Arg* arg1, ApiDef::Arg* arg2) {
-              return CheckArgBefore(arg1, arg2, op_args);
-            });
 }
 
-// Returns existing attribute with the given name if such
-// attribute exists. Otherwise, adds a new attribute and returns it.
-ApiDef::Attr* FindOrAddAttr(ApiDef* api_def, const string attr_name) {
-  // If Attr is already in ApiDef, just update it.
-  for (int i = 0; i < api_def->attr_size(); ++i) {
-    auto* attr = api_def->mutable_attr(i);
-    if (attr->name() == attr_name) {
-      return attr;
+void TestAllApiDefAttributeNamesAreValid(
+    const OpList& ops, const std::unordered_map<string, ApiDef>& api_defs_map) {
+  for (const auto& op : ops.op()) {
+    const auto api_def_iter = api_defs_map.find(op.name());
+    if (api_def_iter == api_defs_map.end()) {
+      continue;
+    }
+    const auto& api_def = api_def_iter->second;
+    for (const auto& api_def_attr : api_def.attr()) {
+      bool found_attr = false;
+      for (const auto& op_attr : op.attr()) {
+        if (api_def_attr.name() == op_attr.name()) {
+          found_attr = true;
+        }
+      }
+      ASSERT_TRUE(found_attr)
+          << "Attribute " << api_def_attr.name() << " (overwritten in api_def_"
+          << op.name() << ".pbtxt) is not defined in OpDef for " << op.name();
     }
   }
-  // Add a new Attr.
-  auto* new_attr = api_def->add_attr();
-  new_attr->set_name(attr_name);
-  return new_attr;
 }
+}  // namespace
 
-// Applies renames and default values to attributes.
-void ApplyAttrOverrides(ApiDef* api_def, const OpGenOverride& op_override,
-                        const OpDef& op_def) {
-  for (auto& attr_rename : op_override.attr_rename()) {
-    auto* attr = FindOrAddAttr(api_def, attr_rename.from());
-    attr->set_rename_to(attr_rename.to());
-  }
-
-  for (auto& attr_default : op_override.attr_default()) {
-    auto* attr = FindOrAddAttr(api_def, attr_default.name());
-    *(attr->mutable_default_value()) = attr_default.value();
-  }
-  // We don't really need a specific order here right now.
-  // However, it is clearer if order follows OpDef.
-  std::sort(api_def->mutable_attr()->pointer_begin(),
-            api_def->mutable_attr()->pointer_end(),
-            [&](ApiDef::Attr* attr1, ApiDef::Attr* attr2) {
-              return CheckAttrBefore(attr1, attr2, op_def);
-            });
-}
+class BaseApiTest : public ::testing::Test {
+ protected:
+  BaseApiTest() {
+    OpRegistry::Global()->Export(false, &ops_);
+    const std::vector<string> multi_line_fields = {"description"};
 
-void ApplyOverridesToApiDef(ApiDef* api_def, const OpDef& op,
-                            const OpGenOverride& op_override) {
-  // Fill ApiDef with data based on op and op_override.
-  // Set visibility
-  if (op_override.skip()) {
-    api_def->set_visibility(ApiDef_Visibility_SKIP);
-  } else if (op_override.hide()) {
-    api_def->set_visibility(ApiDef_Visibility_HIDDEN);
+    Env* env = Env::Default();
+    GetGoldenApiDefs(env, kDefaultApiDefDir, &api_defs_map_);
   }
-  // Add endpoints
-  if (!op_override.rename_to().empty()) {
-    api_def->add_endpoint()->set_name(op_override.rename_to());
-  } else if (!op_override.alias().empty()) {
-    api_def->add_endpoint()->set_name(op.name());
-  }
-
-  for (auto& alias : op_override.alias()) {
-    auto* endpoint = api_def->add_endpoint();
-    endpoint->set_name(alias);
+  OpList ops_;
+  std::unordered_map<string, ApiDef> api_defs_map_;
+};
+
+// Check that all ops have an ApiDef.
+TEST_F(BaseApiTest, AllOpsAreInApiDef) {
+  auto* excluded_ops = GetExcludedOps();
+  for (const auto& op : ops_.op()) {
+    if (excluded_ops->find(op.name()) != excluded_ops->end()) {
+      continue;
+    }
+    ASSERT_TRUE(api_defs_map_.find(op.name()) != api_defs_map_.end())
+        << op.name() << " op does not have api_def_*.pbtxt file. "
+        << "Please add api_def_" << op.name() << ".pbtxt file "
+        << "under tensorflow/core/api_def/base_api/ directory.";
   }
-
-  ApplyArgOverrides(api_def->mutable_in_arg(), op_override.input_rename(),
-                    op.input_arg(), api_def->graph_op_name());
-  ApplyArgOverrides(api_def->mutable_out_arg(), op_override.output_rename(),
-                    op.output_arg(), api_def->graph_op_name());
-  ApplyAttrOverrides(api_def, op_override, op);
 }
 
-// Get map from ApiDef file path to corresponding ApiDefs proto.
-std::unordered_map<string, ApiDefs> GenerateApiDef(
-    const string& api_def_dir, const OpList& ops,
-    const OpGenOverrides& overrides) {
-  std::unordered_map<string, OpGenOverride> name_to_override;
-  for (const auto& op_override : overrides.op()) {
-    name_to_override[op_override.name()] = op_override;
-  }
+// Check that ApiDefs have a corresponding op.
+TEST_F(BaseApiTest, AllApiDefsHaveCorrespondingOp) {
+  TestAllApiDefsHaveCorrespondingOp(ops_, api_defs_map_);
+}
 
-  std::unordered_map<string, ApiDefs> api_defs_map;
+string GetOpDefHasDocStringError(const string& op_name) {
+  return strings::Printf(
+      "OpDef for %s has a doc string. "
+      "Doc strings must be defined in ApiDef instead of OpDef. "
+      "Please, add summary and descriptions in api_def_%s"
+      ".pbtxt file instead",
+      op_name.c_str(), op_name.c_str());
+}
 
-  // These ops are included in OpList only if TF_NEED_GCP
-  // is set to true. So, we skip them for now so that this test passes
-  // whether TF_NEED_GCP is set or not.
-  const std::unordered_set<string> ops_to_exclude = {
-      "BigQueryReader", "GenerateBigQueryReaderPartitions"};
-  for (const auto& op : ops.op()) {
-    CHECK(!op.name().empty())
-        << "Encountered empty op name: %s" << op.DebugString();
-    if (ops_to_exclude.find(op.name()) != ops_to_exclude.end()) {
-      LOG(INFO) << "Skipping " << op.name();
+// Check that OpDef's do not have descriptions and summaries.
+// Descriptions and summaries must be in corresponding ApiDefs.
+TEST_F(BaseApiTest, OpDefsShouldNotHaveDocs) {
+  auto* excluded_ops = GetExcludedOps();
+  for (const auto& op : ops_.op()) {
+    if (excluded_ops->find(op.name()) != excluded_ops->end()) {
       continue;
     }
-    string file_path = io::JoinPath(api_def_dir, kApiDefFileFormat);
-    file_path = strings::Printf(file_path.c_str(), op.name().c_str());
-    ApiDef* api_def = api_defs_map[file_path].add_op();
-    FillBaseApiDef(api_def, op);
-
-    if (name_to_override.find(op.name()) != name_to_override.end()) {
-      ApplyOverridesToApiDef(api_def, op, name_to_override[op.name()]);
+    ASSERT_TRUE(op.summary().empty()) << GetOpDefHasDocStringError(op.name());
+    ASSERT_TRUE(op.description().empty())
+        << GetOpDefHasDocStringError(op.name());
+    for (const auto& arg : op.input_arg()) {
+      ASSERT_TRUE(arg.description().empty())
+          << GetOpDefHasDocStringError(op.name());
+    }
+    for (const auto& arg : op.output_arg()) {
+      ASSERT_TRUE(arg.description().empty())
+          << GetOpDefHasDocStringError(op.name());
+    }
+    for (const auto& attr : op.attr()) {
+      ASSERT_TRUE(attr.description().empty())
+          << GetOpDefHasDocStringError(op.name());
     }
   }
-  return api_defs_map;
 }
 
-// Reads golden ApiDef files and returns a map from file name to ApiDef file
-// contents.
-std::unordered_map<string, string> GetGoldenApiDefs(
-    Env* env, const string& api_files_dir) {
-  std::vector<string> matching_paths;
-  TF_CHECK_OK(env->GetMatchingPaths(
-      io::JoinPath(api_files_dir, kApiDefFilePattern), &matching_paths));
-
-  std::unordered_map<string, string> file_path_to_api_def;
-  for (auto& file_path : matching_paths) {
-    string file_contents;
-    TF_CHECK_OK(ReadFileToString(env, file_path, &file_contents));
-    file_path_to_api_def[file_path] = file_contents;
-  }
-  return file_path_to_api_def;
+// Checks that input arg names in an ApiDef match input
+// arg names in corresponding OpDef.
+TEST_F(BaseApiTest, AllApiDefInputArgsAreValid) {
+  TestAllApiDefInputArgsAreValid(ops_, api_defs_map_);
 }
 
-void RunApiTest(bool update_api_def, const string& api_files_dir) {
-  // Read C++ overrides file
-  OpGenOverrides overrides;
-  Env* env = Env::Default();
-  TF_EXPECT_OK(ReadTextProto(env, kOverridesFilePath, &overrides));
-
-  // Read all ops
-  OpList ops;
-  OpRegistry::Global()->Export(false, &ops);
-  const std::vector<string> multi_line_fields = {"description"};
+// Checks that output arg names in an ApiDef match output
+// arg names in corresponding OpDef.
+TEST_F(BaseApiTest, AllApiDefOutputArgsAreValid) {
+  TestAllApiDefOutputArgsAreValid(ops_, api_defs_map_);
+}
 
-  // Get expected ApiDefs
-  const auto new_api_defs_map = GenerateApiDef(api_files_dir, ops, overrides);
+// Checks that attribute names in an ApiDef match attribute
+// names in corresponding OpDef.
+TEST_F(BaseApiTest, AllApiDefAttributeNamesAreValid) {
+  TestAllApiDefAttributeNamesAreValid(ops_, api_defs_map_);
+}
 
-  bool updated_at_least_one_file = false;
-  const auto golden_api_defs_map = GetGoldenApiDefs(env, api_files_dir);
+class PythonApiTest : public ::testing::Test {
+ protected:
+  PythonApiTest() {
+    OpRegistry::Global()->Export(false, &ops_);
+    const std::vector<string> multi_line_fields = {"description"};
 
-  for (auto new_api_entry : new_api_defs_map) {
-    const auto& file_path = new_api_entry.first;
-    std::string golden_api_defs_str = "";
-    if (golden_api_defs_map.find(file_path) != golden_api_defs_map.end()) {
-      golden_api_defs_str = golden_api_defs_map.at(file_path);
-    }
-    string new_api_defs_str = new_api_entry.second.DebugString();
-    new_api_defs_str = PBTxtToMultiline(new_api_defs_str, multi_line_fields);
-    if (golden_api_defs_str == new_api_defs_str) {
-      continue;
-    }
-    if (update_api_def) {
-      std::cout << "Updating " << file_path << "..." << std::endl;
-      TF_EXPECT_OK(WriteStringToFile(env, file_path, new_api_defs_str));
-      updated_at_least_one_file = true;
-    } else {
-      EXPECT_EQ(golden_api_defs_str, new_api_defs_str)
-          << "To update golden API files, run "
-          << "tensorflow/core/api_def/update_api_def.sh.";
-    }
+    Env* env = Env::Default();
+    GetGoldenApiDefs(env, kPythonApiDefDir, &api_defs_map_);
   }
+  OpList ops_;
+  std::unordered_map<string, ApiDef> api_defs_map_;
+};
 
-  for (const auto& golden_api_entry : golden_api_defs_map) {
-    const auto& file_path = golden_api_entry.first;
-    if (new_api_defs_map.find(file_path) == new_api_defs_map.end()) {
-      if (update_api_def) {
-        std::cout << "Deleting " << file_path << "..." << std::endl;
-        TF_EXPECT_OK(env->DeleteFile(file_path));
-        updated_at_least_one_file = true;
-      } else {
-        EXPECT_EQ("", golden_api_entry.second)
-            << "To update golden API files, run "
-            << "tensorflow/core/api_def/update_api_def.sh.";
-      }
-    }
-  }
+// Check that ApiDefs have a corresponding op.
+TEST_F(PythonApiTest, AllApiDefsHaveCorrespondingOp) {
+  TestAllApiDefsHaveCorrespondingOp(ops_, api_defs_map_);
+}
 
-  if (update_api_def && !updated_at_least_one_file) {
-    std::cout << "Api def files are already up to date." << std::endl;
-  }
+// Checks that input arg names in an ApiDef match input
+// arg names in corresponding OpDef.
+TEST_F(PythonApiTest, AllApiDefInputArgsAreValid) {
+  TestAllApiDefInputArgsAreValid(ops_, api_defs_map_);
 }
 
-TEST(ApiTest, GenerateBaseAPIDef) { RunApiTest(false, kDefaultApiDefDir); }
-}  // namespace
-}  // namespace tensorflow
+// Checks that output arg names in an ApiDef match output
+// arg names in corresponding OpDef.
+TEST_F(PythonApiTest, AllApiDefOutputArgsAreValid) {
+  TestAllApiDefOutputArgsAreValid(ops_, api_defs_map_);
+}
 
-int main(int argc, char** argv) {
-  bool update_api_def = false;
-  tensorflow::string api_files_dir = tensorflow::kDefaultApiDefDir;
-  std::vector<tensorflow::Flag> flag_list = {
-      tensorflow::Flag(
-          "update_api_def", &update_api_def,
-          "Whether to update tensorflow/core/api_def/base_api/api_def*.pbtxt "
-          "files if they differ from expected API."),
-      tensorflow::Flag("api_def_dir", &api_files_dir,
-                       "Base directory of api_def*.pbtxt files.")};
-  std::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
-  bool parsed_values_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
-  if (!parsed_values_ok) {
-    std::cerr << usage << std::endl;
-    return 2;
-  }
-  if (update_api_def) {
-    tensorflow::port::InitMain(argv[0], &argc, &argv);
-    tensorflow::RunApiTest(update_api_def, api_files_dir);
-    return 0;
-  }
-  testing::InitGoogleTest(&argc, argv);
-  // Run tests
-  return RUN_ALL_TESTS();
+// Checks that attribute names in an ApiDef match attribute
+// names in corresponding OpDef.
+TEST_F(PythonApiTest, AllApiDefAttributeNamesAreValid) {
+  TestAllApiDefAttributeNamesAreValid(ops_, api_defs_map_);
 }
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt
index 5d21d7bab699ff481c65ed44eb9bf66ec14ea387..ac05b54eea95f70e4a6db843aab13adf7b94602c 100644
--- a/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt
@@ -20,10 +20,7 @@ END
   }
   summary: "Adds a value to the current value of a variable."
   description: <<END
-Any ReadVariableOp which depends directly or indirectly on this assign is
-guaranteed to see the incremented value or a subsequent newer one.
-
-Outputs the incremented value, which can be used to totally order the
-increments to this variable.
+Any ReadVariableOp with a control dependency on this op is guaranteed to
+see the incremented value or a subsequent newer one.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_AssignSubVariableOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_AssignSubVariableOp.pbtxt
index 102201c4cb07b080c46a28a91af8a4176034f6e6..9dd28f8711222754da531a961e1029f5bcf35ce8 100644
--- a/tensorflow/core/api_def/base_api/api_def_AssignSubVariableOp.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_AssignSubVariableOp.pbtxt
@@ -20,10 +20,7 @@ END
   }
   summary: "Subtracts a value from the current value of a variable."
   description: <<END
-Any ReadVariableOp which depends directly or indirectly on this assign is
-guaranteed to see the incremented value or a subsequent newer one.
-
-Outputs the incremented value, which can be used to totally order the
-increments to this variable.
+Any ReadVariableOp with a control dependency on this op is guaranteed to
+see the decremented value or a subsequent newer one.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_Batch.pbtxt b/tensorflow/core/api_def/base_api/api_def_Batch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aea11b64fdc08576e619616856d9f7cf12392eab
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Batch.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "Batch"
+  summary: "Batches all input tensors nondeterministically."
+  description: <<END
+When many instances of this Op are being run concurrently with the same
+container/shared_name in the same device, some will output zero-shaped Tensors
+and others will output Tensors of size up to max_batch_size.
+
+All Tensors in in_tensors are batched together (so, for example, labels and
+features should be batched with a single instance of this operation.
+
+Each invocation of batch emits an `id` scalar which will be used to identify
+this particular invocation when doing unbatch or its gradient.
+
+Each op which emits a non-empty batch will also emit a non-empty batch_index
+Tensor, which, is a [K, 3] matrix where each row contains the invocation's id,
+start, and length of elements of each set of Tensors present in batched_tensors.
+
+Batched tensors are concatenated along the first dimension, and all tensors in
+in_tensors must have the first dimension of the same size.
+
+in_tensors: The tensors to be batched.
+num_batch_threads: Number of scheduling threads for processing batches of work.
+ Determines the number of batches processed in parallel.
+max_batch_size: Batch sizes will never be bigger than this.
+batch_timeout_micros: Maximum number of microseconds to wait before outputting
+ an incomplete batch.
+allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, does
+ nothing. Otherwise, supplies a list of batch sizes, causing the op to pad
+ batches up to one of those sizes. The entries must increase monotonically, and
+ the final entry must equal max_batch_size.
+grad_timeout_micros: The timeout to use for the gradient. See Unbatch.
+batched_tensors: Either empty tensors or a batch of concatenated Tensors.
+batch_index: If out_tensors is non-empty, has information to invert it.
+container: Controls the scope of sharing of this batch.
+id: always contains a scalar with a unique ID for this invocation of Batch.
+shared_name: Concurrently running instances of batch in the same device with the
+ same container and shared_name will batch their elements together. If left
+ empty, the op name will be used as the shared name.
+T: the types of tensors to be batched.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt
index 6522ce976f2b507c4c66d4d3709427b5fa8222e9..070d6adb978e4a62e7209f299dba08515aa21e83 100644
--- a/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt
@@ -26,7 +26,7 @@ END
     description: <<END
 1-D tensor of length 4.  The stride of the sliding window for each
 dimension of `input`. The dimension order is determined by the value of
-  `data_format`, see below for details.
+`data_format`, see below for details.
 END
   }
   attr {
@@ -43,6 +43,16 @@ default format "NHWC", the data is stored in the order of:
     [batch, height, width, channels].
 Alternatively, the format could be "NCHW", the data storage order of:
     [batch, channels, height, width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 4.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each
+filter element on that dimension. The dimension order is determined by the
+value of `data_format`, see above for details. Dilations in the batch and
+depth dimensions must be 1.
 END
   }
   summary: "Computes a 2-D convolution given 4-D `input` and `filter` tensors."
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt
index 4ea3374dbbc8c690143a3a7a5fb9e67aca5bf1b0..ff2d9d71db646a27a88763f79bb6beb6b5ede44b 100644
--- a/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt
@@ -51,6 +51,16 @@ default format "NHWC", the data is stored in the order of:
     [batch, in_height, in_width, in_channels].
 Alternatively, the format could be "NCHW", the data storage order of:
     [batch, in_channels, in_height, in_width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 4.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each filter
+element on that dimension. The dimension order is determined by the value of
+`data_format`, see above for details. Dilations in the batch and depth
+dimensions must be 1.
 END
   }
   summary: "Computes the gradients of convolution with respect to the filter."
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt
index 4420073e384c1c24d3109b8c6c4cadb59e9ed9d0..2de38b4263a380b5d0aec45270b9b67347c7021d 100644
--- a/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt
@@ -50,6 +50,16 @@ default format "NHWC", the data is stored in the order of:
     [batch, in_height, in_width, in_channels].
 Alternatively, the format could be "NCHW", the data storage order of:
     [batch, in_channels, in_height, in_width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 4.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each filter
+element on that dimension. The dimension order is determined by the value of
+`data_format`, see above for details. Dilations in the batch and depth
+dimensions must be 1.
 END
   }
   summary: "Computes the gradients of convolution with respect to the input."
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv3D.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv3D.pbtxt
index 8f3cd4493c7af152c7a4eab78d1f96e02e325bbc..d26564097e976013fbb7f026c6a403cf6bd808e0 100644
--- a/tensorflow/core/api_def/base_api/api_def_Conv3D.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Conv3D.pbtxt
@@ -34,6 +34,16 @@ default format "NDHWC", the data is stored in the order of:
     [batch, in_depth, in_height, in_width, in_channels].
 Alternatively, the format could be "NCDHW", the data storage order is:
     [batch, in_channels, in_depth, in_height, in_width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 5.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each
+filter element on that dimension. The dimension order is determined by the
+value of `data_format`, see above for details. Dilations in the batch and
+depth dimensions must be 1.
 END
   }
   summary: "Computes a 3-D convolution given 5-D `input` and `filter` tensors."
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropFilterV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropFilterV2.pbtxt
index 6f9b917237b5748ac91c0a3bfbe35a21954dfd9d..937c9c8eadaaeceaadc180ad44f35a12ba9a2dfb 100644
--- a/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropFilterV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropFilterV2.pbtxt
@@ -43,6 +43,16 @@ default format "NDHWC", the data is stored in the order of:
     [batch, in_depth, in_height, in_width, in_channels].
 Alternatively, the format could be "NCDHW", the data storage order is:
     [batch, in_channels, in_depth, in_height, in_width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 5.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each
+filter element on that dimension. The dimension order is determined by the
+value of `data_format`, see above for details. Dilations in the batch and
+depth dimensions must be 1.
 END
   }
   summary: "Computes the gradients of 3-D convolution with respect to the filter."
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropInputV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropInputV2.pbtxt
index 19aba156d5907eb79d1438c16f866dfbd99ed548..414e418dc5a91e55f22dc5eec93d16fabad3d8fb 100644
--- a/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropInputV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropInputV2.pbtxt
@@ -43,6 +43,16 @@ default format "NDHWC", the data is stored in the order of:
     [batch, in_depth, in_height, in_width, in_channels].
 Alternatively, the format could be "NCDHW", the data storage order is:
     [batch, in_channels, in_depth, in_height, in_width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 5.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each
+filter element on that dimension. The dimension order is determined by the
+value of `data_format`, see above for details. Dilations in the batch and
+depth dimensions must be 1.
 END
   }
   summary: "Computes the gradients of 3-D convolution with respect to the input."
diff --git a/tensorflow/core/api_def/base_api/api_def_CriticalSectionOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_CriticalSectionOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5027fa861e7d8914b1e8ae06cd1ffa2ed06b6ad2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CriticalSectionOp.pbtxt
@@ -0,0 +1,16 @@
+op {
+  graph_op_name: "CriticalSectionOp"
+  attr {
+    name: "container"
+    description: <<END
+the container this critical section is placed in.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+the name by which this critical section is referred to.
+END
+  }
+  summary: "Creates a handle to a CriticalSection resource."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DataFormatDimMap.pbtxt b/tensorflow/core/api_def/base_api/api_def_DataFormatDimMap.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..994d3b8ddb6e44804b8d64a76bef7c9136f943f4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DataFormatDimMap.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "DataFormatDimMap"
+  in_arg {
+    name: "x"
+    description: <<END
+A Tensor with each element as a dimension index in source data format.
+Must be in the range [-4, 4).
+END
+  }
+  out_arg {
+    name: "y"
+    description: <<END
+A Tensor with each element as a dimension index in destination data format.
+END
+  }
+  attr {
+    name: "src_format"
+    description: <<END
+source data format.
+END
+  }
+  attr {
+    name: "dst_format"
+    description: <<END
+destination data format.
+END
+  }
+  summary: "Returns the dimension index in the destination data format given the one in"
+  description: <<END
+the source data format.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DataFormatVecPermute.pbtxt b/tensorflow/core/api_def/base_api/api_def_DataFormatVecPermute.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d87c088899e26bdd8a86f41c07681fa5aa49a07a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DataFormatVecPermute.pbtxt
@@ -0,0 +1,31 @@
+op {
+  graph_op_name: "DataFormatVecPermute"
+  in_arg {
+    name: "x"
+    description: <<END
+Vector of size 4 or Tensor of shape (4, 2) in source data format.
+END
+  }
+  out_arg {
+    name: "y"
+    description: <<END
+Vector of size 4 or Tensor of shape (4, 2) in destination data format.
+END
+  }
+  attr {
+    name: "src_format"
+    description: <<END
+source data format.
+END
+  }
+  attr {
+    name: "dst_format"
+    description: <<END
+destination data format.
+END
+  }
+  summary: "Returns the permuted vector/tensor in the destination data format given the"
+  description: <<END
+one in the source data format.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DebugGradientIdentity.pbtxt b/tensorflow/core/api_def/base_api/api_def_DebugGradientIdentity.pbtxt
index 38fd6877e9d26e7ab86a4e7f95352a4a39efb7c2..6f932eb80cd969d345bd22514491643f28a92536 100644
--- a/tensorflow/core/api_def/base_api/api_def_DebugGradientIdentity.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_DebugGradientIdentity.pbtxt
@@ -4,5 +4,6 @@ op {
   description: <<END
 This op is hidden from public in Python. It is used by TensorFlow Debugger to
 register gradient tensors for gradient debugging.
+This op operates on non-reference-type tensors.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_DebugGradientRefIdentity.pbtxt b/tensorflow/core/api_def/base_api/api_def_DebugGradientRefIdentity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e4d23fe6506f6df7881f41c858b0b6b40f049201
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DebugGradientRefIdentity.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "DebugGradientRefIdentity"
+  summary: "Identity op for gradient debugging."
+  description: <<END
+This op is hidden from public in Python. It is used by TensorFlow Debugger to
+register gradient tensors for gradient debugging.
+This op operates on reference-type tensors.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DecodeCompressed.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodeCompressed.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9babd822938dce8609a91816bcfb3988dd6a06d4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DecodeCompressed.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "DecodeCompressed"
+  in_arg {
+    name: "bytes"
+    description: <<END
+A Tensor of string which is compressed.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A Tensor with the same shape as input `bytes`, uncompressed
+from bytes.
+END
+  }
+  attr {
+    name: "compression_type"
+    description: <<END
+A scalar containing either (i) the empty string (no
+compression), (ii) "ZLIB", or (iii) "GZIP".
+END
+  }
+  summary: "Decompress strings."
+  description: <<END
+This op decompresses each element of the `bytes` input `Tensor`, which
+is assumed to be compressed using the given `compression_type`.
+
+The `output` is a string `Tensor` of the same shape as `bytes`,
+each element containing the decompressed data from the corresponding
+element in `bytes`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DenseToSparseBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_DenseToSparseBatchDataset.pbtxt
index f2f5594c7c16b20ef934539b96bc78d324c1542d..e275cfdd3de5de36979967b1d85d1ae9cd0582a8 100644
--- a/tensorflow/core/api_def/base_api/api_def_DenseToSparseBatchDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_DenseToSparseBatchDataset.pbtxt
@@ -21,5 +21,5 @@ SparseTensor. The shape may be partially specified, using `-1` to indicate
 that a particular dimension should use the maximum size of all batch elements.
 END
   }
-  summary: "Creates a dataset that yields a SparseTensor for each element of the input."
+  summary: "Creates a dataset that batches input elements into a SparseTensor."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_DepthToSpace.pbtxt b/tensorflow/core/api_def/base_api/api_def_DepthToSpace.pbtxt
index e7a18cd6b474d34bcc839f51fd13218c76c61294..d20b47a3ed50f9a8bb65f0cd6c332d03172e6bd0 100644
--- a/tensorflow/core/api_def/base_api/api_def_DepthToSpace.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_DepthToSpace.pbtxt
@@ -28,7 +28,7 @@ with the following options:
   "NHWC": `[ batch, height, width, channels ]`
   "NCHW": `[ batch, channels, height, width ]`
   "NCHW_VECT_C":
-      `qint8 [ batch, channels / 4, height, width, channels % 4 ]`
+      `qint8 [ batch, channels / 4, height, width, 4 ]`
 
 It is useful to consider the operation as transforming a 6-D Tensor.
 e.g. for data_format = NHWC,
diff --git a/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNative.pbtxt b/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNative.pbtxt
index cc10ebe923870426bc9076ca6c96f0497bce1d51..3c313f7be6b38317ab7721a0d494fec42bdb52f4 100644
--- a/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNative.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNative.pbtxt
@@ -21,6 +21,16 @@ default format "NHWC", the data is stored in the order of:
     [batch, height, width, channels].
 Alternatively, the format could be "NCHW", the data storage order of:
     [batch, channels, height, width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 4.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each filter
+element on that dimension. The dimension order is determined by the value of
+`data_format`, see above for details. Dilations in the batch and depth
+dimensions must be 1.
 END
   }
   summary: "Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors."
diff --git a/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt b/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
index 9126be2afa9bafb3372cfe38fe43f73239e86c72..e66aa3b70707c2216ff5195b9d2dda407c50ec74 100644
--- a/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
@@ -54,6 +54,16 @@ default format "NHWC", the data is stored in the order of:
     [batch, height, width, channels].
 Alternatively, the format could be "NCHW", the data storage order of:
     [batch, channels, height, width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 4.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each filter
+element on that dimension. The dimension order is determined by the value of
+`data_format`, see above for details. Dilations in the batch and depth
+dimensions must be 1.
 END
   }
   summary: "Computes the gradients of depthwise convolution with respect to the filter."
diff --git a/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt b/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
index f1d16858dbf17e2974f6f1487857b63a40c99b91..f501ad21b35b6ad8d3ee16650919b1ff897cdccb 100644
--- a/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
@@ -54,6 +54,16 @@ default format "NHWC", the data is stored in the order of:
     [batch, height, width, channels].
 Alternatively, the format could be "NCHW", the data storage order of:
     [batch, channels, height, width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 4.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each filter
+element on that dimension. The dimension order is determined by the value of
+`data_format`, see above for details. Dilations in the batch and depth
+dimensions must be 1.
 END
   }
   summary: "Computes the gradients of depthwise convolution with respect to the input."
diff --git a/tensorflow/core/api_def/base_api/api_def_DeserializeSparse.pbtxt b/tensorflow/core/api_def/base_api/api_def_DeserializeSparse.pbtxt
index 00e96c8a15b3529b13cb6eecfecd5e1551f390f2..dfaa531cbcc8adf46e5c6c57164fa7f674cda18d 100644
--- a/tensorflow/core/api_def/base_api/api_def_DeserializeSparse.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_DeserializeSparse.pbtxt
@@ -14,4 +14,47 @@ The `dtype` of the serialized `SparseTensor` objects.
 END
   }
   summary: "Deserialize `SparseTensor` objects."
+  description: <<END
+The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
+the last dimension stores serialized `SparseTensor` objects and the other N
+dimensions (N >= 0) correspond to a batch. The ranks of the original
+`SparseTensor` objects must all match. When the final `SparseTensor` is
+created, its rank is the rank of the incoming `SparseTensor` objects plus N;
+the sparse tensors have been concatenated along new dimensions, one for each
+batch.
+
+The output `SparseTensor` object's shape values for the original dimensions
+are the max across the input `SparseTensor` objects' shape values for the
+corresponding dimensions. The new dimensions match the size of the batch.
+
+The input `SparseTensor` objects' indices are assumed ordered in
+standard lexicographic order.  If this is not the case, after this
+step run `SparseReorder` to restore index ordering.
+
+For example, if the serialized input is a `[2 x 3]` matrix representing two
+original `SparseTensor` objects:
+
+    index = [ 0]
+            [10]
+            [20]
+    values = [1, 2, 3]
+    shape = [50]
+
+and
+
+    index = [ 2]
+            [10]
+    values = [4, 5]
+    shape = [30]
+
+then the final deserialized `SparseTensor` will be:
+
+    index = [0  0]
+            [0 10]
+            [0 20]
+            [1  2]
+            [1 10]
+    values = [1, 2, 3, 4, 5]
+    shape = [2 50]
+END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_EagerPyFunc.pbtxt b/tensorflow/core/api_def/base_api/api_def_EagerPyFunc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9231368e1654d6bb710a128e076e93005f31116d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EagerPyFunc.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "EagerPyFunc"
+  summary: "Eagerly executes a python function to compute func(input)->output. The"
+  description: <<END
+semantics of the input, output, and attributes are the same as those for
+PyFunc.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EmptyTensorList.pbtxt b/tensorflow/core/api_def/base_api/api_def_EmptyTensorList.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2c2ad003d01770c3fe823a40555ef3548ce318e8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EmptyTensorList.pbtxt
@@ -0,0 +1,12 @@
+op {
+  graph_op_name: "EmptyTensorList"
+  summary: "Creates and returns an empty tensor list."
+  description: <<END
+All list elements must be tensors of dtype element_dtype and shape compatible
+with element_shape.
+
+handle: an empty tensor list.
+element_dtype: the type of elements in the list.
+element_shape: a shape compatible with that of elements in the list.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EnqueueInQueueDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_EnqueueInQueueDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9722f5ede30cb0b893171bfc36a0eb8c1ab3c7e2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EnqueueInQueueDataset.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "EnqueueInQueueDataset"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExecuteInCriticalSection.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExecuteInCriticalSection.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cd5fc84a74faa209262da0402c546bcc3b4256fe
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExecuteInCriticalSection.pbtxt
@@ -0,0 +1,49 @@
+op {
+  graph_op_name: "ExecuteInCriticalSection"
+  in_arg {
+    name: "critical_section"
+    description: <<END
+The handle of the `critical_section`.
+END
+  }
+  in_arg {
+    name: "arguments"
+    description: <<END
+Arguments for `f`, including any captured inputs appended at the end.
+END
+  }
+  out_arg {
+    name: "outputs"
+    description: <<END
+The outputs of `f`.
+END
+  }
+  attr {
+    name: "f"
+    description: <<END
+The `Function` to execute.
+END
+  }
+  summary: "Executes function `f` within critical section `critical_section`."
+  description: <<END
+While `f` is running in `critical_section`, no other functions which wish to
+use this critical section may run.
+
+Often the use case is that two executions of the same graph, in parallel,
+wish to run `f`; and we wish to ensure that only one of them executes
+at a time.  This is especially important if `f` modifies one or more
+variables at a time.
+
+It is also useful if two separate functions must share a resource, but we
+wish to ensure the usage is exclusive.
+
+The signature of `f` is expected to be:
+
+```
+  outputs <- F(arguments)
+```
+Typically, but this is not required, `arguments` contain resources.  The
+primary purpose of this op is to limit access to these resources to one
+execution of `F` at a time.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FusedResizeAndPadConv2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_FusedResizeAndPadConv2D.pbtxt
index a72f2bfe5fc90ed7055a0d5354af81f8eee6a7d8..118d0e2178ada7c7b217cd43297830d2bae84f1c 100644
--- a/tensorflow/core/api_def/base_api/api_def_FusedResizeAndPadConv2D.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FusedResizeAndPadConv2D.pbtxt
@@ -30,9 +30,8 @@ END
   attr {
     name: "resize_align_corners"
     description: <<END
-If true, rescale input by (new_height - 1) / (height - 1),
-which exactly aligns the 4 corners of images and resized images. If false, rescale
-by new_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and output tensors are
+aligned, preserving the values at the corner pixels. Defaults to false.
 END
   }
   attr {
diff --git a/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt b/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt
index c7f8b6c21ba9fd85ee20c259425b04a8d4aade75..6cd76ff340efeb970e95aefe6544a1e52a9931a0 100644
--- a/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt
@@ -43,6 +43,10 @@ of `params`.  The output tensor has shape
 
     indices.shape[:-1] + params.shape[indices.shape[-1]:]
 
+Note that on CPU, if an out of bound index is found, an error is returned.
+On GPU, if an out of bound index is found, a 0 is stored in the
+corresponding output value.
+
 Some examples below.
 
 Simple indexing into a matrix:
diff --git a/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt
index c020176a3b41b257b54601aecab0d47d36849c81..162ef2b033ef9e789251d4e1a04844bae6aeac46 100644
--- a/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt
@@ -50,5 +50,9 @@ params.shape[axis + 1:]` where:
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
 </div>
+
+Note that on CPU, if an out of bound index is found, an error is returned.
+On GPU, if an out of bound index is found, a 0 is stored in the
+corresponding output value.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_GuaranteeConst.pbtxt b/tensorflow/core/api_def/base_api/api_def_GuaranteeConst.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b2a2e1aaef84f8c978f8c9312cc52b9bdcd35ca8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_GuaranteeConst.pbtxt
@@ -0,0 +1,12 @@
+op {
+  graph_op_name: "GuaranteeConst"
+  summary: "Gives a guarantee to the TF runtime that the input tensor is a constant."
+  description: <<END
+The runtime is then free to make optimizations based on this.
+
+Only accepts value typed tensors as inputs and rejects resource variable handles
+as input.
+
+Returns the input tensor without modification.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IgnoreErrorsDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_IgnoreErrorsDataset.pbtxt
deleted file mode 100644
index e492d90287f0f1da04ca5a1eba72ed2a6c18e47a..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_IgnoreErrorsDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "IgnoreErrorsDataset"
-  summary: "Creates a dataset that contains the elements of `input_dataset` ignoring errors."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_IteratorGetNextSync.pbtxt b/tensorflow/core/api_def/base_api/api_def_IteratorGetNextSync.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..641679e8ea4e831abf76989984cc50693a7af971
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IteratorGetNextSync.pbtxt
@@ -0,0 +1,10 @@
+op {
+  graph_op_name: "IteratorGetNextSync"
+  summary: "Gets the next output from the given iterator."
+  description: <<END
+This operation is a synchronous version IteratorGetNext. It should only be used
+in situations where the iterator does not block the calling thread, or where
+the calling thread is not a member of the thread pool used to execute parallel
+operations (e.g. in eager mode).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt
index 8da76684e5d360dd642167100b04543e93beed0a..97fd39f6478edd87cdbeac0c08928a4c4d451ed5 100644
--- a/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt
@@ -16,5 +16,6 @@ END
   description: <<END
 Note that this routine only supports wildcard characters in the
 basename portion of the pattern, not in the directory portion.
+Note also that the order of filenames returned can be non-deterministic.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_MatrixLogarithm.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatrixLogarithm.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a6c4d0d4008f368cd07bfcaafd0b3266a1f6207b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MatrixLogarithm.pbtxt
@@ -0,0 +1,38 @@
+op {
+  graph_op_name: "MatrixLogarithm"
+  visibility: HIDDEN
+  in_arg {
+    name: "input"
+    description: <<END
+Shape is `[..., M, M]`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Shape is `[..., M, M]`.
+
+@compatibility(scipy)
+Equivalent to scipy.linalg.logm
+@end_compatibility
+END
+  }
+  summary: "Computes the matrix logarithm of one or more square matrices:"
+  description: <<END
+
+log(exp(A)) = A
+
+This op is only defined for complex matrices. If A is positive-definite and
+real, then casting to a complex matrix, taking the logarithm and casting back
+to a real matrix will give the correct result.
+
+This function computes the matrix logarithm using the Schur-Parlett algorithm.
+Details of the algorithm can be found in Section 11.6.2 of:
+Nicholas J. Higham, Functions of Matrices: Theory and Computation, SIAM 2008.
+ISBN 978-0-898716-46-7.
+
+The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+form square matrices. The output is a tensor of the same shape as the input
+containing the exponential for all input submatrices `[..., :, :]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ParseSingleExample.pbtxt b/tensorflow/core/api_def/base_api/api_def_ParseSingleExample.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..476c01d0add5df174ead50c9ebfc7b86cfc6aed9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ParseSingleExample.pbtxt
@@ -0,0 +1,78 @@
+op {
+  graph_op_name: "ParseSingleExample"
+  in_arg {
+    name: "serialized"
+    description: <<END
+A vector containing a batch of binary serialized Example protos.
+END
+  }
+  in_arg {
+    name: "dense_defaults"
+    description: <<END
+A list of Tensors (some may be empty), whose length matches
+the length of `dense_keys`. dense_defaults[j] provides default values
+when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+The input type is inferred from dense_defaults[j], even when it's empty.
+If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+then the shape of dense_defaults[j] must match that of dense_shapes[j].
+If dense_shapes[j] has an undefined major dimension (variable strides dense
+feature), dense_defaults[j] must contain a single element:
+the padding element.
+END
+  }
+  attr {
+    name: "num_sparse"
+    description: <<END
+The number of sparse features to be parsed from the example. This
+must match the lengths of `sparse_keys` and `sparse_types`.
+END
+  }
+  attr {
+    name: "sparse_keys"
+    description: <<END
+A list of `num_sparse` strings.
+The keys expected in the Examples' features associated with sparse values.
+END
+  }
+  attr {
+    name: "dense_keys"
+    description: <<END
+The keys expected in the Examples' features associated with dense
+values.
+END
+  }
+  attr {
+    name: "sparse_types"
+    description: <<END
+A list of `num_sparse` types; the data types of data in each
+Feature given in sparse_keys.
+Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
+DT_INT64 (Int64List), and DT_STRING (BytesList).
+END
+  }
+  attr {
+    name: "Tdense"
+    description: <<END
+The data types of data in each Feature given in dense_keys.
+The length of this list must match the length of `dense_keys`.
+Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
+DT_INT64 (Int64List), and DT_STRING (BytesList).
+END
+  }
+  attr {
+    name: "dense_shapes"
+    description: <<END
+The shapes of data in each Feature given in dense_keys.
+The length of this list must match the length of `dense_keys`.  The
+number of elements in the Feature corresponding to dense_key[j] must
+always equal dense_shapes[j].NumEntries().  If dense_shapes[j] ==
+(D0, D1, ..., DN) then the shape of output Tensor dense_values[j]
+will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1,
+..., DN), the shape of the output Tensor dense_values[j] will be (M,
+D1, .., DN), where M is the number of blocks of elements of length
+D1 * .... * DN, in the input.
+END
+  }
+  summary: "Transforms a tf.Example proto (as a string) into typed tensors."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d4549340fac6d59cc994050e65f5a0016f2d52ab
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "PrependFromQueueAndPaddedBatchDataset"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2D.pbtxt
index b19bbeab12db322064dcbf31779ce01adffadeb9..d18bafdce9b3aaccfae6eff0c489e133b492f26d 100644
--- a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2D.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2D.pbtxt
@@ -53,6 +53,16 @@ END
     name: "padding"
     description: <<END
 The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 4.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each
+filter element on that dimension. The dimension order is determined by the
+value of `data_format`, see above for details. Dilations in the batch and
+depth dimensions must be 1.
 END
   }
   summary: "Computes a 2D convolution given quantized 4D input and filter tensors."
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedResizeBilinear.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedResizeBilinear.pbtxt
index 6b3ba72e530043746a33a9467594ed8fb49dd2e5..a08ed710b72709e3bed8e682d53076eb66ac783e 100644
--- a/tensorflow/core/api_def/base_api/api_def_QuantizedResizeBilinear.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedResizeBilinear.pbtxt
@@ -23,9 +23,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale input by (new_height - 1) / (height - 1), which
-exactly aligns the 4 corners of images and resized images. If false, rescale
-by new_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and output tensors are
+aligned, preserving the values at the corner pixels. Defaults to false.
 END
   }
   summary: "Resize quantized `images` to `size` using quantized bilinear interpolation."
diff --git a/tensorflow/core/api_def/base_api/api_def_RandomDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_RandomDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0466b40f85eb118c94404e2f0d7670392bc7afdf
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RandomDataset.pbtxt
@@ -0,0 +1,18 @@
+op {
+  graph_op_name: "RandomDataset"
+  in_arg {
+    name: "seed"
+    description: <<END
+A scalar seed for the random number generator. If either seed or
+seed2 is set to be non-zero, the random number generator is seeded
+by the given seed.  Otherwise, a random seed is used.
+END
+  }
+  in_arg {
+    name: "seed2"
+    description: <<END
+A second scalar seed to avoid seed collision.
+END
+  }
+  summary: "Creates a Dataset that returns pseudorandom numbers."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RecordInput.pbtxt b/tensorflow/core/api_def/base_api/api_def_RecordInput.pbtxt
index 7efc8cd8334e80be3b1cc8ba5b50c2259931b1b6..333144d76e3f78204a8e35cbbf195871bbed3aef 100644
--- a/tensorflow/core/api_def/base_api/api_def_RecordInput.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_RecordInput.pbtxt
@@ -41,6 +41,13 @@ END
     name: "batch_size"
     description: <<END
 The batch size.
+END
+  }
+  attr {
+    name: "compression_type"
+    description: <<END
+The type of compression for the file. Currently ZLIB and
+GZIP are supported. Defaults to none.
 END
   }
   summary: "Emits randomized records."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeArea.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeArea.pbtxt
index 6dc321a54496ea3f91b7efe0d28c8596cd18fc1c..3730ef5ef9075610886f55c1f5545f36d9f9bb7e 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResizeArea.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeArea.pbtxt
@@ -23,15 +23,19 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale input by (new_height - 1) / (height - 1), which
-exactly aligns the 4 corners of images and resized images. If false, rescale
-by new_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and output tensors are
+aligned, preserving the values at the corner pixels. Defaults to false.
 END
   }
   summary: "Resize `images` to `size` using area interpolation."
   description: <<END
 Input images can be of different types but output images are always float.
 
+The range of pixel values for the output image might be slightly different
+from the range for the input image because of limited numerical precision.
+To guarantee an output range, for example `[0.0, 1.0]`, apply
+`tf.clip_by_value` to the output.
+
 Each output pixel is computed by first transforming the pixel's footprint into
 the input tensor and then averaging the pixels that intersect the footprint. An
 input pixel's contribution to the average is weighted by the fraction of its
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeBicubic.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeBicubic.pbtxt
index 06e645e3ee937f81e3b546d24250f1c1d6ad2680..d4f8233d25be656c0440b4547104b2bacd3d776e 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResizeBicubic.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeBicubic.pbtxt
@@ -23,9 +23,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale input by (new_height - 1) / (height - 1), which
-exactly aligns the 4 corners of images and resized images. If false, rescale
-by new_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and output tensors are
+aligned, preserving the values at the corner pixels. Defaults to false.
 END
   }
   summary: "Resize `images` to `size` using bicubic interpolation."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeBicubicGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeBicubicGrad.pbtxt
index bf5201d82e4b7b6fb463e17138e92f3033c6992e..eeb0680ab8f88b6f8543d06c30ab5dadc42f5545 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResizeBicubicGrad.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeBicubicGrad.pbtxt
@@ -25,9 +25,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale grads by (orig_height - 1) / (height - 1), which
-exactly aligns the 4 corners of grads and original_image. If false, rescale by
-orig_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and grad tensors are
+aligned. Defaults to false.
 END
   }
   summary: "Computes the gradient of bicubic interpolation."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeBilinear.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeBilinear.pbtxt
index 0768e437fa00a9adeec00498e968986125602822..0673baa703946b40d9294e1f381ad162c06091d9 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResizeBilinear.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeBilinear.pbtxt
@@ -23,9 +23,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale input by (new_height - 1) / (height - 1), which
-exactly aligns the 4 corners of images and resized images. If false, rescale
-by new_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and output tensors are
+aligned, preserving the values at the corner pixels. Defaults to false.
 END
   }
   summary: "Resize `images` to `size` using bilinear interpolation."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeBilinearGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeBilinearGrad.pbtxt
index fba64203c236399e79a051206e936ec3ebb27b14..9a1a5fb69a5bfd0d537ddeede2f5af3856856f9b 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResizeBilinearGrad.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeBilinearGrad.pbtxt
@@ -25,9 +25,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale grads by (orig_height - 1) / (height - 1), which
-exactly aligns the 4 corners of grads and original_image. If false, rescale by
-orig_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and grad tensors are
+aligned. Defaults to false.
 END
   }
   summary: "Computes the gradient of bilinear interpolation."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighbor.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighbor.pbtxt
index a74db4c9dc340b90817567751da110ef8989850f..e6f8dc1941be24d457bb455f8c20131fe84d173c 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighbor.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighbor.pbtxt
@@ -23,9 +23,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale input by (new_height - 1) / (height - 1), which
-exactly aligns the 4 corners of images and resized images. If false, rescale
-by new_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and output tensors are
+aligned, preserving the values at the corner pixels. Defaults to false.
 END
   }
   summary: "Resize `images` to `size` using nearest neighbor interpolation."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighborGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighborGrad.pbtxt
index 4ef1547eb4fab02392bc2b98a21ef01340b621f3..8d52ca833479d63dc70884b8a6cc06762d16edb7 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighborGrad.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighborGrad.pbtxt
@@ -24,9 +24,8 @@ END
   attr {
     name: "align_corners"
     description: <<END
-If true, rescale grads by (orig_height - 1) / (height - 1), which
-exactly aligns the 4 corners of grads and original_image. If false, rescale by
-orig_height / height. Treat similarly the width dimension.
+If true, the centers of the 4 corner pixels of the input and grad tensors are
+aligned. Defaults to false.
 END
   }
   summary: "Computes the gradient of nearest neighbor interpolation."
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdUpdate.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b07ee9fda94851b7bc64a02dbf748b74eb63cdee
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdUpdate.pbtxt
@@ -0,0 +1,69 @@
+op {
+  graph_op_name: "ResourceScatterNdUpdate"
+  in_arg {
+    name: "ref"
+    description: <<END
+A resource handle. Must be from a VarHandleOp.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A Tensor. Must be one of the following types: int32, int64.
+A tensor of indices into ref.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A Tensor. Must have the same type as ref. A tensor of updated
+values to add to ref.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+An optional bool. Defaults to True. If True, the assignment will
+be protected by a lock; otherwise the behavior is undefined,
+but may exhibit less contention.
+END
+  }
+  summary: "Applies sparse `updates` to individual values or slices within a given"
+  description: <<END
+variable according to `indices`.
+
+`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+`indices` must be integer tensor, containing indices into `ref`.
+It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+The innermost dimension of `indices` (with length `K`) corresponds to
+indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+dimension of `ref`.
+
+`updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+```
+[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+```
+
+For example, say we want to update 4 scattered elements to a rank-1 tensor to
+8 elements. In Python, that update would look like this:
+
+```python
+    ref = tfe.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+    indices = tf.constant([[4], [3], [1] ,[7]])
+    updates = tf.constant([9, 10, 11, 12])
+    update = tf.scatter_nd_update(ref, indices, updates)
+    with tf.Session() as sess:
+      print sess.run(update)
+```
+
+The resulting update to ref would look like this:
+
+    [1, 11, 3, 10, 9, 6, 7, 12]
+
+See @{tf.scatter_nd} for more details about how to make updates to
+slices.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Roll.pbtxt b/tensorflow/core/api_def/base_api/api_def_Roll.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b308ad1f9d2f9d500cec4314b32b87541fe2348f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Roll.pbtxt
@@ -0,0 +1,52 @@
+op {
+  graph_op_name: "Roll"
+  in_arg {
+    name: "shift"
+    description: <<END
+Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which
+elements are shifted positively (towards larger indices) along the dimension
+specified by `axis[i]`. Negative shifts will roll the elements in the opposite
+direction.
+END
+  }
+  in_arg {
+    name: "axis"
+    description: <<END
+Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension that the shift
+`shift[i]` should occur. If the same axis is referenced more than once, the
+total shift for that axis will be the sum of all the shifts that belong to that
+axis.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has the same shape and size as the input. The elements are shifted
+positively (towards larger indices) by the offsets of `shift` along the
+dimensions of `axis`.
+END
+  }
+  summary: "Rolls the elements of a tensor along an axis."
+  description: <<END
+The elements are shifted positively (towards larger indices) by the offset of
+`shift` along the dimension of `axis`. Negative `shift` values will shift
+elements in the opposite direction. Elements that roll passed the last position
+will wrap around to the first and vice versa. Multiple shifts along multiple
+axes may be specified.
+
+For example:
+
+```
+# 't' is [0, 1, 2, 3, 4]
+roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2]
+
+# shifting along multiple dimensions
+# 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]]
+
+# shifting along the same axis multiple times
+# 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBox.pbtxt b/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBox.pbtxt
index 0716b2611403b54d894007fad801380f30e70acc..6f1121dd37d4b01a0b6dab8a650f1c7a3f01fb60 100644
--- a/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBox.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBox.pbtxt
@@ -117,7 +117,7 @@ For example,
     # Draw the bounding box in an image summary.
     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
                                                   bbox_for_draw)
-    tf.image_summary('images_with_box', image_with_box)
+    tf.summary.image('images_with_box', image_with_box)
 
     # Employ the bounding box to distort the image.
     distorted_image = tf.slice(image, begin, size)
diff --git a/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBoxV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBoxV2.pbtxt
index e9912609729fbadf7a3dd706903ecc4d915d72eb..473aec50aa214e6d285f20407d4274ce3ccd9a1f 100644
--- a/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBoxV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBoxV2.pbtxt
@@ -117,7 +117,7 @@ For example,
     # Draw the bounding box in an image summary.
     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
                                                   bbox_for_draw)
-    tf.image_summary('images_with_box', image_with_box)
+    tf.summary.image('images_with_box', image_with_box)
 
     # Employ the bounding box to distort the image.
     distorted_image = tf.slice(image, begin, size)
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
index 23732546edaf120eb1a1a9b45219014ba55c6d81..4cb8c064fce615ace8e971505518e85f303d4c12 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
@@ -98,5 +98,8 @@ The resulting tensor would look like this:
      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
      [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
+
+Note that on CPU, if an out of bound index is found, an error is returned.
+On GPU, if an out of bound index is found, the index is ignored.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
index db890cb2f51256fd9dabaa8aa590ccde37eec343..5e2912fcdd7324f219b430860784903f85f31dca 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
@@ -16,7 +16,7 @@ END
   }
   summary: "Computes the maximum along segments of a tensor."
   description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 
 Computes a tensor such that
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
index 4713c523102a66204bcd4b0480e194ec5d14a420..a7d85b3f4ecb2f0cd66e592478d921d9724fbfcc 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
@@ -16,7 +16,7 @@ END
   }
   summary: "Computes the mean along segments of a tensor."
   description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 
 Computes a tensor such that
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
index 6316bfd1a5779ca28b4437c0324844b98e819e1a..74fc5982182716c33a0a2087acb2f89e6e3e4640 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
@@ -16,7 +16,7 @@ END
   }
   summary: "Computes the minimum along segments of a tensor."
   description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 
 Computes a tensor such that
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
index a16d03d467e9ac70e0752f29e042d50e878114b5..4c4363e524a9fe63c5af4a309cd27c45d3d128aa 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
@@ -16,7 +16,7 @@ END
   }
   summary: "Computes the product along segments of a tensor."
   description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 
 Computes a tensor such that
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
index 0686e17f9bdeb09076157fd664ddf58766c22560..583ab3904f1498407a4ecdedd1ad85a043cb9310 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
@@ -16,7 +16,7 @@ END
   }
   summary: "Computes the sum along segments of a tensor."
   description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 
 Computes a tensor such that
diff --git a/tensorflow/core/api_def/base_api/api_def_SerializeManySparse.pbtxt b/tensorflow/core/api_def/base_api/api_def_SerializeManySparse.pbtxt
index 0010bca0b017bcaac5552f7aa9462b0c56d4c01a..d46b4b20eeb58ef1cc261372d69acfe5a70668fe 100644
--- a/tensorflow/core/api_def/base_api/api_def_SerializeManySparse.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SerializeManySparse.pbtxt
@@ -18,7 +18,14 @@ END
 1-D.  The `shape` of the minibatch `SparseTensor`.
 END
   }
-  summary: "Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` string `Tensor`."
+  attr {
+    name: "out_type"
+    description: <<END
+The `dtype` to use for serialization; the supported types are `string`
+(default) and `variant`.
+END
+  }
+  summary: "Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object."
   description: <<END
 The `SparseTensor` must have rank `R` greater than 1, and the first dimension
 is treated as the minibatch dimension.  Elements of the `SparseTensor`
diff --git a/tensorflow/core/api_def/base_api/api_def_SerializeSparse.pbtxt b/tensorflow/core/api_def/base_api/api_def_SerializeSparse.pbtxt
index bb4a352d489c597b6e953bc79e307b0d74042e14..491f69fda088edb8a051b81e65d581094823ca5a 100644
--- a/tensorflow/core/api_def/base_api/api_def_SerializeSparse.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SerializeSparse.pbtxt
@@ -18,5 +18,12 @@ END
 1-D.  The `shape` of the `SparseTensor`.
 END
   }
-  summary: "Serialize a `SparseTensor` into a string 3-vector (1-D `Tensor`) object."
+  attr {
+    name: "out_type"
+    description: <<END
+The `dtype` to use for serialization; the supported types are `string`
+(default) and `variant`.
+END
+  }
+  summary: "Serialize a `SparseTensor` into a `[3]` `Tensor` object."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ShuffleAndRepeatDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ShuffleAndRepeatDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fb425b24a4134366df1129df63dc0361537dd746
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ShuffleAndRepeatDataset.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "ShuffleAndRepeatDataset"
+  in_arg {
+    name: "buffer_size"
+    description: <<END
+The number of output elements to buffer in an iterator over
+this dataset. Compare with the `min_after_dequeue` attr when creating a
+`RandomShuffleQueue`.
+END
+  }
+  in_arg {
+    name: "seed"
+    description: <<END
+A scalar seed for the random number generator. If either `seed` or
+`seed2` is set to be non-zero, the random number generator is seeded
+by the given seed.  Otherwise, a random seed is used.
+END
+  }
+  in_arg {
+    name: "seed2"
+    description: <<END
+A second scalar seed to avoid seed collision.
+END
+  }
+  in_arg {
+    name: "count"
+    description: <<END
+A scalar representing the number of times the underlying dataset
+should be repeated. The default is `-1`, which results in infinite repetition.
+END
+  }
+  summary: "Creates a dataset that shuffles and repeats elements from `input_dataset`"
+  description: <<END
+pseudorandomly.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt
index b12d3af9d74411fb46fb50d7dba57b7e60bbe933..ea5c52c0ee3826076b855ca243f03cb940b8e0b2 100644
--- a/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt
@@ -11,8 +11,8 @@ END
   in_arg {
     name: "seed"
     description: <<END
-A scalar seed for the random number generator. If either seed or
-seed2 is set to be non-zero, the random number generator is seeded
+A scalar seed for the random number generator. If either `seed` or
+`seed2` is set to be non-zero, the random number generator is seeded
 by the given seed.  Otherwise, a random seed is used.
 END
   }
diff --git a/tensorflow/core/api_def/base_api/api_def_Snapshot.pbtxt b/tensorflow/core/api_def/base_api/api_def_Snapshot.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..49b7f5798cd58d7c96c9b0a582a6d79df4dab5a6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Snapshot.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Snapshot"
+  summary: "Returns a copy of the input tensor."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SpaceToDepth.pbtxt b/tensorflow/core/api_def/base_api/api_def_SpaceToDepth.pbtxt
index 8fd3966f7038a507ea3402e300f9362bd4f3d54b..b808ff5f9cf9072bdb95e779589668160d909b8f 100644
--- a/tensorflow/core/api_def/base_api/api_def_SpaceToDepth.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SpaceToDepth.pbtxt
@@ -25,7 +25,7 @@ with the following options:
   "NHWC": `[ batch, height, width, channels ]`
   "NCHW": `[ batch, channels, height, width ]`
   "NCHW_VECT_C":
-      `qint8 [ batch, channels / 4, height, width, channels % 4 ]`
+      `qint8 [ batch, channels / 4, height, width, 4 ]`
 
 It is useful to consider the operation as transforming a 6-D Tensor.
 e.g. for data_format = NHWC,
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
index 18e66605951afcba96f5e1cca10e959850ca2bf1..866e04e97b96752b5a32816feefbeeaff7ed0ea2 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
@@ -21,7 +21,7 @@ END
   }
   summary: "Computes the mean along sparse segments of a tensor."
   description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 
 Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..af4bc75fa099254877595174cb479651a53c5b25
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "SparseSegmentMeanWithNumSegments"
+  in_arg {
+    name: "indices"
+    description: <<END
+A 1-D tensor. Has same rank as `segment_ids`.
+END
+  }
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A 1-D tensor. Values should be sorted and can be repeated.
+END
+  }
+  in_arg {
+    name: "num_segments"
+    description: <<END
+Should equal the number of distinct segment IDs.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for dimension 0 which has size
+`num_segments`.
+END
+  }
+  summary: "Computes the mean along sparse segments of a tensor."
+  description: <<END
+Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
+misisng, the `output` tensor at that position will be zeroed.
+
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+segments.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
index 3fdeb66aed79f73b10096fafa8846e79a4180394..194bcea726b51491b4f9c7414fa56747bdc0047a 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
@@ -23,7 +23,7 @@ END
   description: <<END
 N is the size of the segment being reduced.
 
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8b502928a5c03fefc67ae54a752b4e41a6ccaedd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
@@ -0,0 +1,38 @@
+op {
+  graph_op_name: "SparseSegmentSqrtNWithNumSegments"
+  in_arg {
+    name: "indices"
+    description: <<END
+A 1-D tensor. Has same rank as `segment_ids`.
+END
+  }
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A 1-D tensor. Values should be sorted and can be repeated.
+END
+  }
+  in_arg {
+    name: "num_segments"
+    description: <<END
+Should equal the number of distinct segment IDs.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for dimension 0 which
+has size `k`, the number of segments.
+END
+  }
+  summary: "Computes the sum along sparse segments of a tensor divided by the sqrt of N."
+  description: <<END
+N is the size of the segment being reduced.
+
+Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
+misisng, the `output` tensor at that position will be zeroed.
+
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+segments.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
index cdf44a89a386f7ab5fb702de96a83f307e531597..dfd50bf273b5e2107966d0400d0156fff8276403 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
@@ -21,7 +21,7 @@ END
   }
   summary: "Computes the sum along sparse segments of a tensor."
   description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 
 Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3bc16577ff2f9d45aac1d8cd7c08cba2614bec9a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
@@ -0,0 +1,57 @@
+op {
+  graph_op_name: "SparseSegmentSumWithNumSegments"
+  in_arg {
+    name: "indices"
+    description: <<END
+A 1-D tensor. Has same rank as `segment_ids`.
+END
+  }
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A 1-D tensor. Values should be sorted and can be repeated.
+END
+  }
+  in_arg {
+    name: "num_segments"
+    description: <<END
+Should equal the number of distinct segment IDs.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for dimension 0 which
+has size `num_segments`.
+END
+  }
+  summary: "Computes the sum along sparse segments of a tensor."
+  description: <<END
+Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
+misisng, the `output` tensor at that position will be zeroed.
+
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+segments.
+
+For example:
+
+```python
+c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+
+tf.sparse_segment_sum_with_num_segments(
+    c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
+# => [[0 0 0 0]
+#     [0 0 0 0]
+#     [0 0 0 0]]
+
+tf.sparse_segment_sum_with_num_segments(c,
+                                        tf.constant([0, 1]),
+                                        tf.constant([0, 2],
+                                        num_segments=4))
+# => [[ 1  2  3  4]
+#     [ 0  0  0  0]
+#     [-1 -2 -3 -4]
+#     [ 0  0  0  0]]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorGetItem.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorGetItem.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2869967d831534839312cf525de39a438704a898
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorGetItem.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "TensorListGetItem"
+  summary: "Returns the item in the list with the given index."
+  description: <<END
+input_handle: the list
+index: the position in the list from which an element will be retrieved
+item: the element at that position
+
+
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListElementShape.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListElementShape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ee20f4575c8a68d4bd1039b39aa8d37780b33ec1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListElementShape.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "TensorListElementShape"
+  summary: "The shape of the elements of the given list, as a tensor."
+  description: <<END
+  input_handle: the list
+  element_shape: the shape of elements of the list
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListFromTensor.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListFromTensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..633b7c18019ba8429e154d9f98835e82815ca528
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListFromTensor.pbtxt
@@ -0,0 +1,10 @@
+op {
+  graph_op_name: "TensorListFromTensor"
+  summary: "Creates a TensorList which, when stacked, has the value of `tensor`."
+  description: <<END
+Each tensor in the result list corresponds to one row of the input tensor.
+
+tensor: The input tensor.
+output_handle: The list.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListLength.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListLength.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f450c20f86b3442567f0eeee0bf4c1d45c2e2987
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListLength.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "TensorListLength"
+  summary: "Returns the number of tensors in the input tensor list."
+  description: <<END
+input_handle: the input list
+length: the number of tensors in the list
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListPopBack.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListPopBack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0f752f9cf45d96e8d90fb2826f7cb9eb349a8cad
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListPopBack.pbtxt
@@ -0,0 +1,12 @@
+op {
+  graph_op_name: "TensorListPopBack"
+  summary: "Returns the last element of the input list as well as a list with all but that element."
+  description: <<END
+Fails if the list is empty.
+
+input_handle: the input list
+tensor: the withdrawn last element of the list
+element_dtype: the type of elements in the list
+element_shape: the shape of the output tensor
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListPushBack.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListPushBack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..73297c03003d91e16f288802c5223730af7c766c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListPushBack.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "TensorListPushBack"
+  summary: "Returns a list list which has the passed-in `Tensor` as last element and the other elements of the given list in `input_handle`."
+  description: <<END
+tensor: The tensor to put on the list.
+input_handle: The old list.
+output_handle: A list with the elements of the old list followed by tensor.
+element_dtype: the type of elements in the list.
+element_shape: a shape compatible with that of elements in the list.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListReserve.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListReserve.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b5640f0ffa9737ade3ee28e63964608bc6204d9d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListReserve.pbtxt
@@ -0,0 +1,10 @@
+op {
+  graph_op_name: "TensorListReserve"
+  summary: "List of the given size with empty elements."
+  description: <<END
+element_shape: the shape of the future elements of the list
+num_elements: the number of elements to reserve
+handle: the output list
+element_dtype: the desired type of elements in the list.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListStack.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListStack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2402875951848cf922a3252d1a5d8e53312fb4e1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListStack.pbtxt
@@ -0,0 +1,12 @@
+op {
+  graph_op_name: "TensorListStack"
+  summary: "Stacks all tensors in the list."
+  description: <<END
+Requires that all tensors have the same shape.
+
+input_handle: the input list
+tensor: the gathered result
+num_elements: optional. If not -1, the number of elements in the list.
+
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorSetItem.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorSetItem.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..682cf69ee2dae02fd1265ae062e64b66076e51a5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorSetItem.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "TensorListSetItem"
+  summary: "Sets the index-th position of the list to contain the given tensor."
+  description: <<END
+input_handle: the list
+index: the position in the list to which the tensor will be assigned
+item: the element to be assigned to that position
+output_handle: the new list, with the element in the proper position
+
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Unbatch.pbtxt b/tensorflow/core/api_def/base_api/api_def_Unbatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6d10ea606d0fb8cf63c8dace06ca7c9aa78ab7e1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Unbatch.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "Unbatch"
+  summary: "Reverses the operation of Batch for a single output Tensor."
+  description: <<END
+An instance of Unbatch either receives an empty batched_tensor, in which case it
+asynchronously waits until the values become available from a concurrently
+running instance of Unbatch with the same container and shared_name, or receives
+a non-empty batched_tensor in which case it finalizes all other concurrently
+running instances and outputs its own element from the batch.
+
+batched_tensor: The possibly transformed output of Batch. The size of the first
+ dimension should remain unchanged by the transformations for the operation to
+ work.
+batch_index: The matching batch_index obtained from Batch.
+id: The id scalar emitted by Batch.
+unbatched_tensor: The Tensor corresponding to this execution.
+timeout_micros: Maximum amount of time (in microseconds) to wait to receive the
+ batched input tensor associated with a given invocation of the op.
+container: Container to control resource sharing.
+shared_name: Instances of Unbatch with the same container and shared_name are
+ assumed to possibly belong to the same batch. If left empty, the op name will
+ be used as the shared name.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnbatchGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnbatchGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..487b4218d593be631927b1ee69eb0b633113e1df
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnbatchGrad.pbtxt
@@ -0,0 +1,20 @@
+op {
+  graph_op_name: "UnbatchGrad"
+  summary: "Gradient of Unbatch."
+  description: <<END
+Acts like Batch but using the given batch_index index of batching things as they
+become available. This ensures that the gradients are propagated back in the
+same session which did the forward pass.
+
+original_input: The input to the Unbatch operation this is the gradient of.
+batch_index: The batch_index given to the Unbatch operation this is the gradient
+of.
+grad: The downstream gradient.
+id: The id scalar emitted by Batch.
+batched_grad: The return value, either an empty tensor or the batched gradient.
+container: Container to control resource sharing.
+shared_name: Instances of UnbatchGrad with the same container and shared_name
+ are assumed to possibly belong to the same batch. If left empty, the op name
+ will be used as the shared name.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UniqueDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_UniqueDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..009256916908c412fdebd0775387a7f7f4d30a25
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UniqueDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "UniqueDataset"
+  summary: "Creates a dataset that contains the unique elements of `input_dataset`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UniqueV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_UniqueV2.pbtxt
index cd7ec6e5518c5a7788bb4fff88a38b74295e9df4..2fb5bd5b88652f5124761bc27f68ffb9859b76c5 100644
--- a/tensorflow/core/api_def/base_api/api_def_UniqueV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UniqueV2.pbtxt
@@ -9,7 +9,7 @@ END
   in_arg {
     name: "axis"
     description: <<END
-A `Tensor` of type `int64` (default: 0). The axis of the Tensor to
+A `Tensor` of type `int32` (default: None). The axis of the Tensor to
 find the unique elements.
 END
   }
@@ -26,12 +26,15 @@ A 1-D Tensor. Has the same type as x that contains the index of each
 value of x in the output y.
 END
   }
-  summary: "Finds unique elements in a 1-D tensor."
+  summary: "Finds unique elements along an axis of a tensor."
   description: <<END
-This operation returns a tensor `y` containing all of the unique elements of `x`
-sorted in the same order that they occur in `x`. This operation also returns a
-tensor `idx` the same size as `x` that contains the index of each value of `x`
-in the unique output `y`. In other words:
+This operation either returns a tensor `y` containing unique elements
+along the `axis` of a tensor. The returned unique elements is sorted
+in the same order as they occur along `axis` in `x`.
+This operation also returns a tensor `idx` that is the same size as
+the number of the elements in `x` along the `axis` dimension. It
+contains the index in the unique output `y`.
+In other words, for an `1-D` tensor `x` with `axis = None:
 
 `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
 
@@ -43,5 +46,30 @@ y, idx = unique(x)
 y ==> [1, 2, 4, 7, 8]
 idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
 ```
+
+For an `2-D` tensor `x` with `axis = 0`:
+
+```
+# tensor 'x' is [[1, 0, 0],
+#                [1, 0, 0],
+#                [2, 0, 0]]
+y, idx = unique(x, axis=0)
+y ==> [[1, 0, 0],
+       [2, 0, 0]]
+idx ==> [0, 0, 1]
+```
+
+For an `2-D` tensor `x` with `axis = 1`:
+
+```
+# tensor 'x' is [[1, 0, 0],
+#                [1, 0, 0],
+#                [2, 0, 0]]
+y, idx = unique(x, axis=1)
+y ==> [[1, 0],
+       [1, 0],
+       [2, 0]]
+idx ==> [0, 1, 1]
+```
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..97c380700a2a951a46a95f5b8abe432e759a32bf
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnravelIndex.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "UnravelIndex"
+  in_arg {
+    name: "indices"
+    description: <<END
+An 0-D or 1-D `int` Tensor whose elements are indices into the
+flattened version of an array of dimensions dims.
+END
+  }
+  in_arg {
+    name: "dims"
+    description: <<END
+An 1-D `int` Tensor. The shape of the array to use for unraveling
+indices.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+An 2-D (or 1-D if indices is 0-D) tensor where each row has the
+same shape as the indices array.
+END
+  }
+  summary: "Converts a flat index or array of flat indices into a tuple of"
+  description: <<END
+coordinate arrays.
+
+@compatibility(numpy)
+Equivalent to np.unravel_index
+@end_compatibility
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
index 8298d62f253160847ee34bcdea5a81c7370e5124..4ca6780c95629de06db319db228f440219989793 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
@@ -14,20 +14,21 @@ Has same shape as data, except for dimension 0 which
 has size `num_segments`.
 END
   }
-  summary: "Computes the Max along segments of a tensor."
+  summary: "Computes the maximum along segments of a tensor."
   description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 
-This operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-Instead of computing the sum over segments, it computes the maximum
-such that:
+This operator is similar to the unsorted segment sum operator found
+[(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+Instead of computing the sum over segments, it computes the maximum such that:
 
 \\(output_i = \max_j data_j\\) where max is over `j` such
 that `segment_ids[j] == i`.
 
-If the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,
- `output[i] = numeric_limits<T>::min()`.
+If the maximum is empty for a given segment ID `i`, it outputs the smallest
+possible value for the specific numeric type,
+`output[i] = numeric_limits<T>::lowest()`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..55ea69b5dd5f7fda5c877ca5771ec2cbb86e3a9a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "UnsortedSegmentMin"
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A 1-D tensor whose rank is equal to the rank of `data`'s
+first dimension.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for dimension 0 which
+has size `num_segments`.
+END
+  }
+  summary: "Computes the minimum along segments of a tensor."
+  description: <<END
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+This operator is similar to the unsorted segment sum operator found
+[(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+Instead of computing the sum over segments, it computes the minimum such that:
+
+\\(output_i = \min_j data_j\\) where min is over `j` such
+that `segment_ids[j] == i`.
+
+If the minimum is empty for a given segment ID `i`, it outputs the largest
+possible value for the specific numeric type,
+`output[i] = numeric_limits<T>::max()`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..577ff53d60c5a174b4ba43a667885a6983b2dfb9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "UnsortedSegmentProd"
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A 1-D tensor whose rank is equal to the rank of `data`'s
+first dimension.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for dimension 0 which
+has size `num_segments`.
+END
+  }
+  summary: "Computes the product along segments of a tensor."
+  description: <<END
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+This operator is similar to the unsorted segment sum operator found
+[(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+Instead of computing the sum over segments, it computes the product of all
+entries belonging to a segment such that:
+
+\\(output_i = \prod_j data_j\\) where the product is over `j` such
+that `segment_ids[j] == i`.
+
+If there is no entry for a given segment ID `i`, it outputs 1.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
index 77a96d1e03d577ca0f6dfd69c51d2551d1ad4b2a..eb5d0d124726c2671a8f0d615200f3c737ae0bbe 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
@@ -16,7 +16,7 @@ END
   }
   summary: "Computes the sum along segments of a tensor."
   description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
 segments.
 
 Computes a tensor such that
diff --git a/tensorflow/core/api_def/excluded_ops.cc b/tensorflow/core/api_def/excluded_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..07ac974ff9aa7e66d9bb3c4e536f91d1249abb90
--- /dev/null
+++ b/tensorflow/core/api_def/excluded_ops.cc
@@ -0,0 +1,26 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/api_def/excluded_ops.h"
+
+namespace tensorflow {
+
+const std::unordered_set<std::string>* GetExcludedOps() {
+  static std::unordered_set<std::string>* excluded_ops =
+      new std::unordered_set<std::string>(
+          {"BigQueryReader", "GenerateBigQueryReaderPartitions"});
+  return excluded_ops;
+}
+}  // namespace tensorflow
diff --git a/tensorflow/core/api_def/excluded_ops.h b/tensorflow/core/api_def/excluded_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..409e5d32a7c0a9b3d724ee3e36e98e4f5dfddd85
--- /dev/null
+++ b/tensorflow/core/api_def/excluded_ops.h
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_API_DEF_EXCLUDED_OPS_H_
+#define TENSORFLOW_CORE_API_DEF_EXCLUDED_OPS_H_
+
+#include <string>
+#include <unordered_set>
+
+namespace tensorflow {
+
+// Returns a list of ops excluded from ApiDef.
+// TODO(annarev): figure out if we should keep ApiDefs for these ops as well
+const std::unordered_set<std::string>* GetExcludedOps();
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_API_DEF_EXCLUDED_OPS_H_
diff --git a/tensorflow/core/api_def/python_api/api_def_DebugGradientRefIdentity.pbtxt b/tensorflow/core/api_def/python_api/api_def_DebugGradientRefIdentity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5e14e5fffd6e3683eec6eca65f587b5f0ab0016b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DebugGradientRefIdentity.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DebugGradientRefIdentity"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_EagerPyFunc.pbtxt b/tensorflow/core/api_def/python_api/api_def_EagerPyFunc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ee0f95dacbc09702039da97fccd98a2d8bb83b1b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_EagerPyFunc.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "EagerPyFunc"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Snapshot.pbtxt b/tensorflow/core/api_def/python_api/api_def_Snapshot.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ea9ccee39765b659cc27e04a48cffc1caf97d5af
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Snapshot.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Snapshot"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Unique.pbtxt b/tensorflow/core/api_def/python_api/api_def_Unique.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e763d66e9a42c809eda574ecd8419ee452cbc829
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Unique.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Unique"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_UniqueV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_UniqueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c0d5046858b0fb38b88d0965bd1f0f28890a0b26
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_UniqueV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "UniqueV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/update_api_def.cc b/tensorflow/core/api_def/update_api_def.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ea9a148260504320a7865b66f412307b4c62bce7
--- /dev/null
+++ b/tensorflow/core/api_def/update_api_def.cc
@@ -0,0 +1,272 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/api_def/update_api_def.h"
+
+#include <ctype.h>
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/api_def/excluded_ops.h"
+#include "tensorflow/core/framework/api_def.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_def_builder.h"
+#include "tensorflow/core/framework/op_gen_lib.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+
+namespace {
+constexpr char kApiDefFileFormat[] = "api_def_%s.pbtxt";
+// TODO(annarev): look into supporting other prefixes, not just 'doc'.
+constexpr char kDocStart[] = ".Doc(R\"doc(";
+constexpr char kDocEnd[] = ")doc\")";
+
+// Updates api_def based on the given op.
+void FillBaseApiDef(ApiDef* api_def, const OpDef& op) {
+  api_def->set_graph_op_name(op.name());
+  // Add arg docs
+  for (auto& input_arg : op.input_arg()) {
+    if (!input_arg.description().empty()) {
+      auto* api_def_in_arg = api_def->add_in_arg();
+      api_def_in_arg->set_name(input_arg.name());
+      api_def_in_arg->set_description(input_arg.description());
+    }
+  }
+  for (auto& output_arg : op.output_arg()) {
+    if (!output_arg.description().empty()) {
+      auto* api_def_out_arg = api_def->add_out_arg();
+      api_def_out_arg->set_name(output_arg.name());
+      api_def_out_arg->set_description(output_arg.description());
+    }
+  }
+  // Add attr docs
+  for (auto& attr : op.attr()) {
+    if (!attr.description().empty()) {
+      auto* api_def_attr = api_def->add_attr();
+      api_def_attr->set_name(attr.name());
+      api_def_attr->set_description(attr.description());
+    }
+  }
+  // Add docs
+  api_def->set_summary(op.summary());
+  api_def->set_description(op.description());
+}
+
+// Returns true if op has any description or summary.
+bool OpHasDocs(const OpDef& op) {
+  if (!op.summary().empty() || !op.description().empty()) {
+    return true;
+  }
+  for (const auto& arg : op.input_arg()) {
+    if (!arg.description().empty()) {
+      return true;
+    }
+  }
+  for (const auto& arg : op.output_arg()) {
+    if (!arg.description().empty()) {
+      return true;
+    }
+  }
+  for (const auto& attr : op.attr()) {
+    if (!attr.description().empty()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Returns true if summary and all descriptions are the same in op1
+// and op2.
+bool CheckDocsMatch(const OpDef& op1, const OpDef& op2) {
+  if (op1.summary() != op2.summary() ||
+      op1.description() != op2.description() ||
+      op1.input_arg_size() != op2.input_arg_size() ||
+      op1.output_arg_size() != op2.output_arg_size() ||
+      op1.attr_size() != op2.attr_size()) {
+    return false;
+  }
+  // Iterate over args and attrs to compare their docs.
+  for (int i = 0; i < op1.input_arg_size(); ++i) {
+    if (op1.input_arg(i).description() != op2.input_arg(i).description()) {
+      return false;
+    }
+  }
+  for (int i = 0; i < op1.output_arg_size(); ++i) {
+    if (op1.output_arg(i).description() != op2.output_arg(i).description()) {
+      return false;
+    }
+  }
+  for (int i = 0; i < op1.attr_size(); ++i) {
+    if (op1.attr(i).description() != op2.attr(i).description()) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Returns true if descriptions and summaries in op match a
+// given single doc-string.
+bool ValidateOpDocs(const OpDef& op, const string& doc) {
+  OpDefBuilder b(op.name());
+  // We don't really care about type we use for arguments and
+  // attributes. We just want to make sure attribute and argument names
+  // are added so that descriptions can be assigned to them when parsing
+  // documentation.
+  for (const auto& arg : op.input_arg()) {
+    b.Input(arg.name() + ":string");
+  }
+  for (const auto& arg : op.output_arg()) {
+    b.Output(arg.name() + ":string");
+  }
+  for (const auto& attr : op.attr()) {
+    b.Attr(attr.name() + ":string");
+  }
+  b.Doc(doc);
+  OpRegistrationData op_reg_data;
+  TF_CHECK_OK(b.Finalize(&op_reg_data));
+  return CheckDocsMatch(op, op_reg_data.op_def);
+}
+}  // namespace
+
+string RemoveDoc(const OpDef& op, const string& file_contents,
+                 size_t start_location) {
+  // Look for a line starting with .Doc( after the REGISTER_OP.
+  const auto doc_start_location = file_contents.find(kDocStart, start_location);
+  const string format_error = strings::Printf(
+      "Could not find %s doc for removal. Make sure the doc is defined with "
+      "'%s' prefix and '%s' suffix or remove the doc manually.",
+      op.name().c_str(), kDocStart, kDocEnd);
+  if (doc_start_location == string::npos) {
+    std::cerr << format_error << std::endl;
+    LOG(ERROR) << "Didn't find doc start";
+    return file_contents;
+  }
+  const auto doc_end_location = file_contents.find(kDocEnd, doc_start_location);
+  if (doc_end_location == string::npos) {
+    LOG(ERROR) << "Didn't find doc start";
+    std::cerr << format_error << std::endl;
+    return file_contents;
+  }
+
+  const auto doc_start_size = sizeof(kDocStart) - 1;
+  string doc_text = file_contents.substr(
+      doc_start_location + doc_start_size,
+      doc_end_location - doc_start_location - doc_start_size);
+
+  // Make sure the doc text we found actually matches OpDef docs to
+  // avoid removing incorrect text.
+  if (!ValidateOpDocs(op, doc_text)) {
+    LOG(ERROR) << "Invalid doc: " << doc_text;
+    std::cerr << format_error << std::endl;
+    return file_contents;
+  }
+  // Remove .Doc call.
+  auto before_doc = file_contents.substr(0, doc_start_location);
+  str_util::StripTrailingWhitespace(&before_doc);
+  return before_doc +
+         file_contents.substr(doc_end_location + sizeof(kDocEnd) - 1);
+}
+
+namespace {
+// Remove .Doc calls that follow REGISTER_OP calls for the given ops.
+// We search for REGISTER_OP calls in the given op_files list.
+void RemoveDocs(const std::vector<const OpDef*>& ops,
+                const std::vector<string>& op_files) {
+  // Set of ops that we already found REGISTER_OP calls for.
+  std::set<string> processed_ops;
+
+  for (const auto& file : op_files) {
+    string file_contents;
+    bool file_contents_updated = false;
+    TF_CHECK_OK(ReadFileToString(Env::Default(), file, &file_contents));
+
+    for (auto op : ops) {
+      if (processed_ops.find(op->name()) != processed_ops.end()) {
+        // We already found REGISTER_OP call for this op in another file.
+        continue;
+      }
+      string register_call =
+          strings::Printf("REGISTER_OP(\"%s\")", op->name().c_str());
+      const auto register_call_location = file_contents.find(register_call);
+      // Find REGISTER_OP(OpName) call.
+      if (register_call_location == string::npos) {
+        continue;
+      }
+      std::cout << "Removing .Doc call for " << op->name() << " from " << file
+                << "." << std::endl;
+      file_contents = RemoveDoc(*op, file_contents, register_call_location);
+      file_contents_updated = true;
+
+      processed_ops.insert(op->name());
+    }
+    if (file_contents_updated) {
+      TF_CHECK_OK(WriteStringToFile(Env::Default(), file, file_contents))
+          << "Could not remove .Doc calls in " << file
+          << ". Make sure the file is writable.";
+    }
+  }
+}
+}  // namespace
+
+// Returns ApiDefs text representation in multi-line format
+// constructed based on the given op.
+string CreateApiDef(const OpDef& op) {
+  ApiDefs api_defs;
+  FillBaseApiDef(api_defs.add_op(), op);
+
+  const std::vector<string> multi_line_fields = {"description"};
+  string new_api_defs_str = api_defs.DebugString();
+  return PBTxtToMultiline(new_api_defs_str, multi_line_fields);
+}
+
+// Creates ApiDef files for any new ops.
+// If op_file_pattern is not empty, then also removes .Doc calls from
+// new op registrations in these files.
+void CreateApiDefs(const OpList& ops, const string& api_def_dir,
+                   const string& op_file_pattern) {
+  auto* excluded_ops = GetExcludedOps();
+  std::vector<const OpDef*> new_ops_with_docs;
+
+  for (const auto& op : ops.op()) {
+    if (excluded_ops->find(op.name()) != excluded_ops->end()) {
+      continue;
+    }
+    // Form the expected ApiDef path.
+    string file_path =
+        io::JoinPath(tensorflow::string(api_def_dir), kApiDefFileFormat);
+    file_path = strings::Printf(file_path.c_str(), op.name().c_str());
+
+    // Create ApiDef if it doesn't exist.
+    if (!Env::Default()->FileExists(file_path).ok()) {
+      std::cout << "Creating ApiDef file " << file_path << std::endl;
+      const auto& api_def_text = CreateApiDef(op);
+      TF_CHECK_OK(WriteStringToFile(Env::Default(), file_path, api_def_text));
+
+      if (OpHasDocs(op)) {
+        new_ops_with_docs.push_back(&op);
+      }
+    }
+  }
+  if (!op_file_pattern.empty()) {
+    std::vector<string> op_files;
+    TF_CHECK_OK(Env::Default()->GetMatchingPaths(op_file_pattern, &op_files));
+    RemoveDocs(new_ops_with_docs, op_files);
+  }
+}
+}  // namespace tensorflow
diff --git a/tensorflow/core/api_def/update_api_def.h b/tensorflow/core/api_def/update_api_def.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e285c06883efa9e8952339f952e341a5bee7406
--- /dev/null
+++ b/tensorflow/core/api_def/update_api_def.h
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_API_DEF_UPDATE_API_DEF_H_
+#define TENSORFLOW_CORE_API_DEF_UPDATE_API_DEF_H_
+// Functions for updating ApiDef when new ops are added.
+
+#include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+// Returns ApiDefs text representation in multi-line format
+// constructed based on the given op.
+string CreateApiDef(const OpDef& op);
+
+// Removes .Doc call for the given op.
+// If unsuccessful, returns original file_contents and prints an error.
+// start_location - We search for .Doc call starting at this location
+//   in file_contents.
+string RemoveDoc(const OpDef& op, const string& file_contents,
+                 size_t start_location);
+
+// Creates api_def_*.pbtxt files for any new ops (i.e. ops that don't have an
+// api_def_*.pbtxt file yet).
+// If op_file_pattern is non-empty, then this method will also
+// look for a REGISTER_OP call for the new ops and removes corresponding
+// .Doc() calls since the newly generated api_def_*.pbtxt files will
+// store the doc strings.
+void CreateApiDefs(const OpList& ops, const string& api_def_dir,
+                   const string& op_file_pattern);
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_API_DEF_UPDATE_API_DEF_H_
diff --git a/tensorflow/core/api_def/update_api_def.sh b/tensorflow/core/api_def/update_api_def.sh
index 07c76e65620956a2cb7a44093314d89303d2d921..21d0aa3c34c3679e93afcc4b3d4b99b16ea33277 100755
--- a/tensorflow/core/api_def/update_api_def.sh
+++ b/tensorflow/core/api_def/update_api_def.sh
@@ -14,15 +14,15 @@
 # limitations under the License.
 # ==============================================================================
 
-# Script to update tensorflow/core/api_def/base_api/api_def*.pbtxt files.
+# Script to create tensorflow/core/api_def/base_api/api_def*.pbtxt
+# files for new ops.
 
 set -e
 
 current_file="$(readlink -f "$0")"
 current_dir="$(dirname "$current_file")"
 
-bazel build //tensorflow/core:api_test
-bazel-bin/tensorflow/core/api_test \
-  --update_api_def \
-  --api_def_dir="${current_dir}/base_api"
-
+bazel build //tensorflow/core/api_def:update_api_def
+bazel-bin/tensorflow/core/api_def/update_api_def \
+  --api_def_dir="${current_dir}/base_api" \
+  --op_file_pattern="${current_dir}/../ops/*_ops.cc"
diff --git a/tensorflow/core/api_def/update_api_def_main.cc b/tensorflow/core/api_def/update_api_def_main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3fd975ce178b5ff779b90305cb0ce9f8e8116494
--- /dev/null
+++ b/tensorflow/core/api_def/update_api_def_main.cc
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// This program can be used to automatically create an api_def_*.pbtxt
+// file based on op definition.
+//
+// To run, use the following script:
+// tensorflow/core/api_def/update_api_def.sh
+//
+// There are 2 ways to use this script:
+//   1. Define a REGISTER_OP call without a .Doc() call. Then, run
+//      this script and add summaries and descriptions in the generated
+//      api_def_*.pbtxt file manually.
+//   2. Add .Doc() call to a REGISTER_OP call. Then run this script
+//      to remove that .Doc() call and instead add corresponding summaries
+//      and descriptions in api_def_*.pbtxt file automatically.
+//      Note that .Doc() call must have the following format for this to work:
+//      .Doc(R"doc(<doc goes here>)doc").
+#include "tensorflow/core/api_def/update_api_def.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+int main(int argc, char** argv) {
+  tensorflow::string api_files_dir;
+  tensorflow::string op_file_pattern;
+  std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("api_def_dir", &api_files_dir,
+                       "Base directory of api_def*.pbtxt files."),
+      tensorflow::Flag("op_file_pattern", &op_file_pattern,
+                       "Pattern that matches C++ files containing REGISTER_OP "
+                       "calls. If specified, we will try to remove .Doc() "
+                       "calls for new ops defined in these files.")};
+  std::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
+  bool parsed_values_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parsed_values_ok) {
+    std::cerr << usage << std::endl;
+    return 2;
+  }
+  tensorflow::port::InitMain(argv[0], &argc, &argv);
+
+  tensorflow::OpList ops;
+  tensorflow::OpRegistry::Global()->Export(false, &ops);
+  tensorflow::CreateApiDefs(ops, api_files_dir, op_file_pattern);
+}
diff --git a/tensorflow/core/api_def/update_api_def_test.cc b/tensorflow/core/api_def/update_api_def_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4200c9da23c09335d8edca217f68b2ae5d8c2bdf
--- /dev/null
+++ b/tensorflow/core/api_def/update_api_def_test.cc
@@ -0,0 +1,207 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/api_def/update_api_def.h"
+
+#include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+TEST(UpdateApiDefTest, TestRemoveDocSingleOp) {
+  const string op_def_text = R"opdef(
+REGISTER_OP("Op1")
+    .Input("a: T")
+    .Output("output: T")
+    .Attr("b: type")
+    .SetShapeFn(shape_inference::UnchangedShape);
+)opdef";
+
+  const string op_def_text_with_doc = R"opdef(
+REGISTER_OP("Op1")
+    .Input("a: T")
+    .Output("output: T")
+    .Attr("b: type")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+Summary for Op1.
+
+Description
+for Op1.
+
+b :   Description for b.
+a: Description for a.
+output: Description for output.
+)doc");
+)opdef";
+
+  const string op_text = R"(
+name: "Op1"
+input_arg {
+  name: "a"
+  description: "Description for a."
+}
+output_arg {
+  name: "output"
+  description: "Description for output."
+}
+attr {
+  name: "b"
+  description: "Description for b."
+}
+summary: "Summary for Op1."
+description: "Description\nfor Op1."
+)";
+  OpDef op;
+  protobuf::TextFormat::ParseFromString(op_text, &op);  // NOLINT
+
+  EXPECT_EQ(op_def_text,
+            RemoveDoc(op, op_def_text_with_doc, 0 /* start_location */));
+}
+
+TEST(UpdateApiDefTest, TestRemoveDocMultipleOps) {
+  const string op_def_text = R"opdef(
+REGISTER_OP("Op1")
+    .Input("a: T")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
+REGISTER_OP("Op2")
+    .Input("a: T")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
+REGISTER_OP("Op3")
+    .Input("c: T")
+    .SetShapeFn(shape_inference::UnchangedShape);
+)opdef";
+
+  const string op_def_text_with_doc = R"opdef(
+REGISTER_OP("Op1")
+    .Input("a: T")
+    .Doc(R"doc(
+Summary for Op1.
+)doc")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
+REGISTER_OP("Op2")
+    .Input("a: T")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+Summary for Op2.
+)doc");
+
+REGISTER_OP("Op3")
+    .Input("c: T")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+Summary for Op3.
+)doc");
+)opdef";
+
+  const string op1_text = R"(
+name: "Op1"
+input_arg {
+  name: "a"
+}
+summary: "Summary for Op1."
+)";
+  const string op2_text = R"(
+name: "Op2"
+input_arg {
+  name: "a"
+}
+summary: "Summary for Op2."
+)";
+  const string op3_text = R"(
+name: "Op3"
+input_arg {
+  name: "c"
+}
+summary: "Summary for Op3."
+)";
+  OpDef op1, op2, op3;
+  protobuf::TextFormat::ParseFromString(op1_text, &op1);  // NOLINT
+  protobuf::TextFormat::ParseFromString(op2_text, &op2);  // NOLINT
+  protobuf::TextFormat::ParseFromString(op3_text, &op3);  // NOLINT
+
+  string updated_text =
+      RemoveDoc(op2, op_def_text_with_doc,
+                op_def_text_with_doc.find("Op2") /* start_location */);
+  EXPECT_EQ(string::npos, updated_text.find("Summary for Op2"));
+  EXPECT_NE(string::npos, updated_text.find("Summary for Op1"));
+  EXPECT_NE(string::npos, updated_text.find("Summary for Op3"));
+
+  updated_text = RemoveDoc(op3, updated_text,
+                           updated_text.find("Op3") /* start_location */);
+  updated_text = RemoveDoc(op1, updated_text,
+                           updated_text.find("Op1") /* start_location */);
+  EXPECT_EQ(op_def_text, updated_text);
+}
+
+TEST(UpdateApiDefTest, TestCreateApiDef) {
+  const string op_text = R"(
+name: "Op1"
+input_arg {
+  name: "a"
+  description: "Description for a."
+}
+output_arg {
+  name: "output"
+  description: "Description for output."
+}
+attr {
+  name: "b"
+  description: "Description for b."
+}
+summary: "Summary for Op1."
+description: "Description\nfor Op1."
+)";
+  OpDef op;
+  protobuf::TextFormat::ParseFromString(op_text, &op);  // NOLINT
+
+  const string expected_api_def = R"(op {
+  graph_op_name: "Op1"
+  in_arg {
+    name: "a"
+    description: <<END
+Description for a.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Description for output.
+END
+  }
+  attr {
+    name: "b"
+    description: <<END
+Description for b.
+END
+  }
+  summary: "Summary for Op1."
+  description: <<END
+Description
+for Op1.
+END
+}
+)";
+  EXPECT_EQ(expected_api_def, CreateApiDef(op));
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/accumulate_n_optimizer.cc b/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
index a1e3b21e4f2d6af1b7e3c68d82a77f96bd34e613..832a55f2556f46efe6a94fb62d0420330917faac 100644
--- a/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
+++ b/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
@@ -13,11 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-
 #include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/graph/node_builder.h"
 
-
 namespace tensorflow {
 namespace {
 
@@ -44,7 +42,6 @@ Tensor make_zeros(const DataType& dtype, const TensorShapeProto& shape) {
 // third-party libraries aren't currently supported.
 class AccumulateNV2RemovePass : public GraphOptimizationPass {
  public:
-
   Status Run(const GraphOptimizationPassOptions& options) override {
     // TODO(freiss.oss@gmail.com): Substantial shared code with
     // ParallelConcatRemovePass::Run(). Consider refactoring if someone makes
diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc
index 6399b8cf55b98f330a93ae28b516c59bee5c9d79..e9f839289af482fdeec546c45e5173efd4efeaf5 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/bfc_allocator.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <atomic>
+
 #include "tensorflow/core/common_runtime/bfc_allocator.h"
 
 #include "tensorflow/core/common_runtime/allocator_retry.h"
@@ -206,20 +208,20 @@ void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes,
   if (allocation_attr.no_retry_on_failure) {
     // Return immediately upon the first failure if this is for allocating an
     // optional scratch space.
-    void* result = AllocateRawInternal(unused_alignment, num_bytes, false);
+    bool dump_log_on_failure = VLOG_IS_ON(2);
+    void* result =
+        AllocateRawInternal(unused_alignment, num_bytes, dump_log_on_failure);
     if (result == nullptr) {
-      // The counter incrementing is not thread-safe. But we don't really care.
-      // TODO(zhengxq): we should implement a LOG_FIRST_N and LOG_EVERY_N for
-      // more general usage.
-      static int log_counter = 0;
-      if (log_counter < 10) {
-        log_counter++;
+      static std::atomic<int32> log_counter{0};
+      int32 counter_value = log_counter.load(std::memory_order_relaxed);
+      if (counter_value < 10) {
+        log_counter.store(counter_value + 1, std::memory_order_relaxed);
         LOG(WARNING)
             << "Allocator (" << Name() << ") ran out of memory trying "
             << "to allocate " << strings::HumanReadableNumBytes(num_bytes)
             << ". The caller indicates that this is not a failure, but"
             << " may mean that there could be performance gains if more"
-            << " memory is available.";
+            << " memory were available.";
       }
     }
     return result;
@@ -519,7 +521,7 @@ void BFCAllocator::AddAllocVisitor(Visitor visitor) {
 
 bool BFCAllocator::TracksAllocationSizes() { return true; }
 
-size_t BFCAllocator::RequestedSize(void* ptr) {
+size_t BFCAllocator::RequestedSize(const void* ptr) {
   mutex_lock l(lock_);
   BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
   CHECK(h != kInvalidChunkHandle)
@@ -528,7 +530,7 @@ size_t BFCAllocator::RequestedSize(void* ptr) {
   return c->requested_size;
 }
 
-size_t BFCAllocator::AllocatedSize(void* ptr) {
+size_t BFCAllocator::AllocatedSize(const void* ptr) {
   mutex_lock l(lock_);
   BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
   CHECK(h != kInvalidChunkHandle)
@@ -537,7 +539,7 @@ size_t BFCAllocator::AllocatedSize(void* ptr) {
   return c->size;
 }
 
-int64 BFCAllocator::AllocationId(void* ptr) {
+int64 BFCAllocator::AllocationId(const void* ptr) {
   mutex_lock l(lock_);
   BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
   CHECK(h != kInvalidChunkHandle)
@@ -659,17 +661,9 @@ void BFCAllocator::DumpMemoryLog(size_t num_bytes) {
       const Chunk* c = ChunkFromHandle(h);
       if (c->in_use()) {
         in_use_by_size[c->size]++;
-        LOG(INFO) << "Chunk at " << c->ptr << " of size " << c->size;
-      }
-      h = c->next;
-    }
-
-    h = region_manager_.get_handle(region.ptr());
-    while (h != kInvalidChunkHandle) {
-      const Chunk* c = ChunkFromHandle(h);
-      if (!c->in_use()) {
-        LOG(INFO) << "Free at " << c->ptr << " of size " << c->size;
       }
+      LOG(INFO) << (c->in_use() ? "Chunk" : "Free ") << " at " << c->ptr
+                << " of size " << c->size;
       h = c->next;
     }
   }
@@ -691,6 +685,13 @@ void BFCAllocator::GetStats(AllocatorStats* stats) {
   *stats = stats_;
 }
 
+void BFCAllocator::ClearStats() {
+  mutex_lock l(lock_);
+  stats_.num_allocs = 0;
+  stats_.max_bytes_in_use = stats_.bytes_in_use;
+  stats_.max_alloc_size = 0;
+}
+
 std::array<BFCAllocator::BinDebugInfo, BFCAllocator::kNumBins>
 BFCAllocator::get_bin_debug_info() {
   std::array<BinDebugInfo, kNumBins> bin_infos;
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index 20fa05f0d228c754ca0093ca7f360592cdaa23f2..b8e773503c7a2f8024e8a6f58247ad343a762f71 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -62,14 +62,16 @@ class BFCAllocator : public VisitableAllocator {
 
   bool TracksAllocationSizes() override;
 
-  size_t RequestedSize(void* ptr) override;
+  size_t RequestedSize(const void* ptr) override;
 
-  size_t AllocatedSize(void* ptr) override;
+  size_t AllocatedSize(const void* ptr) override;
 
-  int64 AllocationId(void* ptr) override;
+  int64 AllocationId(const void* ptr) override;
 
   void GetStats(AllocatorStats* stats) override;
 
+  void ClearStats() override;
+
  private:
   struct Bin;
 
@@ -125,10 +127,10 @@ class BFCAllocator : public VisitableAllocator {
     string DebugString(BFCAllocator* a,
                        bool recurse) NO_THREAD_SAFETY_ANALYSIS {
       string dbg;
-      strings::StrAppend(&dbg, "  Size: ", strings::HumanReadableNumBytes(size),
-                         " | Requested Size: ",
-                         strings::HumanReadableNumBytes(requested_size),
-                         " | in_use: ", in_use());
+      strings::StrAppend(
+          &dbg, "  Size: ", strings::HumanReadableNumBytes(size),
+          " | Requested Size: ", strings::HumanReadableNumBytes(requested_size),
+          " | in_use: ", in_use());
       if (recurse && prev != BFCAllocator::kInvalidChunkHandle) {
         Chunk* p = a->ChunkFromHandle(prev);
         strings::StrAppend(&dbg, ", prev: ", p->DebugString(a, false));
@@ -418,11 +420,13 @@ class BFCAllocator : public VisitableAllocator {
   mutable mutex lock_;
   RegionManager region_manager_ GUARDED_BY(lock_);
 
-  std::vector<Chunk> chunks_;
-  ChunkHandle free_chunks_list_;  // Ptr to head of linked list of free Chunks
+  std::vector<Chunk> chunks_ GUARDED_BY(lock_);
+
+  // Pointer to head of linked list of free Chunks
+  ChunkHandle free_chunks_list_ GUARDED_BY(lock_);
 
   // Called once on each region, ASAP.
-  std::vector<Visitor> region_visitors_;
+  std::vector<Visitor> region_visitors_ GUARDED_BY(lock_);
 
   // Counter containing the next unique identifier to assign to a
   // newly-created chunk.
diff --git a/tensorflow/core/common_runtime/constant_folding.cc b/tensorflow/core/common_runtime/constant_folding.cc
index 0398c2a60d1fe4dfeed91e242272f13dd45389b2..b5a51d2526d95313d4564337ae0420472bc0b3da 100644
--- a/tensorflow/core/common_runtime/constant_folding.cc
+++ b/tensorflow/core/common_runtime/constant_folding.cc
@@ -328,7 +328,8 @@ void FindConstantFoldableNodes(
                ConsiderConstantFoldableNode(
                    n, opts, nodes, constant_control_deps, shape_replacement_map,
                    &internal_node_inserted);
-             });
+             },
+             NodeComparatorName());
   // If we have inserted just leaf level nodes, then there is nothing to fold.
   if (!internal_node_inserted) {
     nodes->clear();
@@ -339,8 +340,8 @@ void FindConstantFoldableNodes(
 typedef std::pair<Node*, int> NodeAndOutput;
 
 int64 UniqueConstantId() {
-  static std::atomic_int_fast64_t id;
-  return id.fetch_add(1);
+  static std::atomic_int_fast64_t unique_constant_id;
+  return unique_constant_id.fetch_add(1);
 }
 
 // Adds n to constant_graph which is being built up for subsequent evaluation of
@@ -386,14 +387,12 @@ void AddShapeNodeToConstantGraph(
     const std::unordered_map<const Node*, std::vector<Tensor>>&
         shape_replacement_map,
     std::unordered_map<Node*, std::vector<Node*>>* node_map,
-    Graph* constant_graph) {
+    const ConstantFoldNameGenerator& generate_new_name, Graph* constant_graph) {
   std::vector<Node*>& added = (*node_map)[n];
   const string& node_name = n->name();
   for (const Tensor& t : shape_replacement_map.at(n)) {
     auto builder =
-        NodeDefBuilder(strings::StrCat(constant_graph->NewName(node_name),
-                                       "__cf__", UniqueConstantId()),
-                       "Const")
+        NodeDefBuilder(generate_new_name(constant_graph, node_name), "Const")
             .Attr("dtype", t.dtype())
             .Attr("value", t);
     NodeDef def;
@@ -414,7 +413,8 @@ Graph* GetConstantGraph(
     const Graph* orig_graph, const std::vector<Node*>& nodes,
     const std::unordered_map<const Node*, std::vector<Tensor>>&
         shape_replacement_map,
-    std::map<NodeAndOutput, Node*>* tensors_to_fetch) {
+    std::map<NodeAndOutput, Node*>* tensors_to_fetch,
+    const ConstantFoldNameGenerator& generate_new_name) {
   Graph* constant_graph = new Graph(orig_graph->op_registry());
   std::unordered_map<Node*, std::vector<Node*>> node_map;
   node_map[orig_graph->source_node()] = {constant_graph->source_node()};
@@ -424,7 +424,7 @@ Graph* GetConstantGraph(
       AddNodeToConstantGraph(n, &node_map, constant_graph);
     } else {
       AddShapeNodeToConstantGraph(n, shape_replacement_map, &node_map,
-                                  constant_graph);
+                                  generate_new_name, constant_graph);
     }
   }
 
@@ -458,10 +458,11 @@ Graph* GetConstantGraph(
 // replacement was successful, false otherwise.
 // 'control_deps' is the set of nodes that should be control predecessors of the
 // new constant node.
-bool ReplaceTensorWithConstant(Graph* graph, Device* partition_device,
-                               NodeAndOutput tensor, const Tensor& constant,
-                               const gtl::FlatSet<Node*>& control_deps,
-                               int64 max_constant_size_in_bytes) {
+bool ReplaceTensorWithConstant(
+    Graph* graph, Device* partition_device, NodeAndOutput tensor,
+    const Tensor& constant, const gtl::FlatSet<Node*>& control_deps,
+    int64 max_constant_size_in_bytes,
+    const ConstantFoldNameGenerator& generate_new_name) {
   // Be conservative when replacing a tensor with a constant, when not
   // running on CPU.
   // 1) If the destination tensor is not an int32 tensor, and has HOST_MEMORY
@@ -509,9 +510,7 @@ bool ReplaceTensorWithConstant(Graph* graph, Device* partition_device,
   }
   const string& node_name = n->name();
   Node* constant_node;
-  auto builder = NodeDefBuilder(strings::StrCat(graph->NewName(node_name),
-                                                "__cf__", UniqueConstantId()),
-                                "Const")
+  auto builder = NodeDefBuilder(generate_new_name(graph, node_name), "Const")
                      .Attr("dtype", constant.dtype())
                      .Attr("value", constant);
   if (partition_device) {
@@ -555,6 +554,13 @@ Status ConstantFold(const ConstantFoldingOptions& opts,
                     FunctionLibraryRuntime* function_library, Env* env,
                     Device* partition_device, Graph* graph, bool* was_mutated) {
   DumpGraph("Before", graph);
+  ConstantFoldNameGenerator generate_new_name = opts.generate_new_name;
+  if (generate_new_name == nullptr) {
+    generate_new_name = [](Graph* graph, string old_name) {
+      return strings::StrCat(graph->NewName(old_name), "__cf__",
+                             UniqueConstantId());
+    };
+  }
 
   std::vector<Node*> constant_foldable_nodes;
   std::unordered_map<const Node*, gtl::FlatSet<Node*>> constant_control_deps;
@@ -571,7 +577,7 @@ Status ConstantFold(const ConstantFoldingOptions& opts,
   std::map<NodeAndOutput, Node*> tensors_to_fetch;
   std::unique_ptr<Graph> constant_graph(
       GetConstantGraph(graph, constant_foldable_nodes, shape_replacement_map,
-                       &tensors_to_fetch));
+                       &tensors_to_fetch, generate_new_name));
   DumpGraph("Constant graph", constant_graph.get());
 
   if (tensors_to_fetch.empty()) {
@@ -585,7 +591,16 @@ Status ConstantFold(const ConstantFoldingOptions& opts,
 
   std::vector<string> tensors_to_fetch_names;
   std::vector<NodeAndOutput> tensors_to_replace;
-  for (auto n : tensors_to_fetch) {
+  // Sorting the nodes based on the name gives us a stable ordering between runs
+  // for the same graph.
+  std::vector<std::pair<NodeAndOutput, Node*>> tensors_to_fetch_sorted(
+      tensors_to_fetch.begin(), tensors_to_fetch.end());
+  std::sort(tensors_to_fetch_sorted.begin(), tensors_to_fetch_sorted.end(),
+            [](const std::pair<NodeAndOutput, Node*>& n1,
+               const std::pair<NodeAndOutput, Node*>& n2) {
+              return n1.first.first->name() < n2.first.first->name();
+            });
+  for (auto n : tensors_to_fetch_sorted) {
     tensors_to_fetch_names.push_back(
         strings::StrCat(n.first.first->name(), ":", n.first.second));
     tensors_to_replace.push_back({n.second, n.first.second});
@@ -617,7 +632,7 @@ Status ConstantFold(const ConstantFoldingOptions& opts,
         constant_control_deps[tensors_to_replace[c].first];
     if (ReplaceTensorWithConstant(
             graph, partition_device, tensors_to_replace[c], outputs[c],
-            control_deps, opts.max_constant_size_in_bytes)) {
+            control_deps, opts.max_constant_size_in_bytes, generate_new_name)) {
       ++num_nodes_replaced;
     }
   }
diff --git a/tensorflow/core/common_runtime/constant_folding.h b/tensorflow/core/common_runtime/constant_folding.h
index e4d724c58a25347db3e40a0d024acf1ac97ea575..b1e1fb831963bccb81731752ec76b9d5be123d9f 100644
--- a/tensorflow/core/common_runtime/constant_folding.h
+++ b/tensorflow/core/common_runtime/constant_folding.h
@@ -24,6 +24,11 @@ limitations under the License.
 
 namespace tensorflow {
 
+// This generator type is used to generate a name for the newly folded node
+// based on the node's old name.
+using ConstantFoldNameGenerator =
+    std::function<string(Graph* graph, string old_name)>;
+
 // Options specific to constant folding optimizations.
 struct ConstantFoldingOptions {
   // If "consider" is not a nullptr, then only constant fold a node "n" if
@@ -37,6 +42,11 @@ struct ConstantFoldingOptions {
   // The maximum size of each constant created during constant folding
   // optimization.
   int64 max_constant_size_in_bytes = 10 * 1024 * 1024;
+
+  // A generator for the name suffix of constant folded nodes. A
+  // default id generator that monotonically increases is used if nullptr is
+  // passed.
+  ConstantFoldNameGenerator generate_new_name = nullptr;
 };
 
 // Perform constant folding optimization on "graph".
diff --git a/tensorflow/core/common_runtime/constant_folding_test.cc b/tensorflow/core/common_runtime/constant_folding_test.cc
index 923a4d924936386ce0e06c6355c2a4d0af5cc4a4..6ac9319ad1e2c4953c2d82257dac6a3aeeffcd5c 100644
--- a/tensorflow/core/common_runtime/constant_folding_test.cc
+++ b/tensorflow/core/common_runtime/constant_folding_test.cc
@@ -121,6 +121,58 @@ TEST_F(ConstantFoldingTest, Basic) {
                          {2, 2});
 }
 
+// Tests that different node creation ordering creates same graph after constant
+// folding.
+TEST_F(ConstantFoldingTest, DeterministicFolding) {
+  auto build_graph_and_constant_folding = [](Graph& g, bool swap) -> Status {
+    Scope s = Scope::NewRootScope();
+    auto a = ops::Const<float>(s, {1.0}, {});
+    auto b = ops::Const<float>(s, {2.0}, {});
+
+    if (swap) {
+      auto add1 = ops::Add(s.WithOpName("add1"), a, b);
+      auto add2 = ops::Add(s.WithOpName("add2"), a, b);
+      auto s1 =
+          ops::_Send(s.WithOpName("s1"), add1, "add1", "sender", 0, "receiver");
+      auto s2 =
+          ops::_Send(s.WithOpName("s2"), add2, "add2", "sender", 0, "receiver");
+    } else {
+      // Swap the order of node creation.
+      auto add2 = ops::Add(s.WithOpName("add2"), a, b);
+      auto add1 = ops::Add(s.WithOpName("add1"), a, b);
+      auto s1 =
+          ops::_Send(s.WithOpName("s1"), add1, "add1", "sender", 0, "receiver");
+      auto s2 =
+          ops::_Send(s.WithOpName("s2"), add2, "add2", "sender", 0, "receiver");
+    }
+
+    TF_CHECK_OK(s.ToGraph(&g));
+    bool was_mutated;
+    int64 unique_id = 0;
+    auto generate_new_name = [&unique_id](Graph* graph, string old_name) {
+      return strings::StrCat(graph->NewName(old_name), "__cf__", unique_id++);
+    };
+    ConstantFoldingOptions opt{};
+    opt.generate_new_name = generate_new_name;
+    TF_CHECK_OK(
+        ConstantFold(opt, nullptr, Env::Default(), nullptr, &g, &was_mutated));
+    return Status::OK();
+  };
+
+  Graph g1(OpRegistry::Global());
+  TF_ASSERT_OK(build_graph_and_constant_folding(g1, false));
+  Graph g2(OpRegistry::Global());
+  TF_ASSERT_OK(build_graph_and_constant_folding(g2, true));
+  EXPECT_EQ(g1.num_nodes(), g2.num_nodes());
+  auto index = NodeNameIndex(g2);
+
+  // All the nodes in g1 are expected to be present in g2.
+  for (int64 i = 0; i < g1.num_nodes(); ++i) {
+    Node* n1 = g1.FindNodeId(i);
+    EXPECT_GT(index.count(n1->name()), 0);
+  }
+}
+
 TEST_F(ConstantFoldingTest, ConsiderFunction) {
   Scope s = Scope::NewRootScope();
   BuildSimpleGraph(&s);
diff --git a/tensorflow/core/common_runtime/device.h b/tensorflow/core/common_runtime/device.h
index d5a452a796d67400d56ca08c675e0386348dea13..5918cd9bbf35a7e277ec8d7e17f9008400e1eea3 100644
--- a/tensorflow/core/common_runtime/device.h
+++ b/tensorflow/core/common_runtime/device.h
@@ -148,6 +148,9 @@ class Device : public DeviceBase {
     return BuildDeviceAttributes(name, device, memory_limit, locality, "");
   }
 
+  // Clears the resource manager associated with this device.
+  void ClearResourceMgr() { rmgr_->Clear(); }
+
  protected:
   void DeleteResourceMgr() {
     delete rmgr_;
diff --git a/tensorflow/core/common_runtime/device_set_test.cc b/tensorflow/core/common_runtime/device_set_test.cc
index 0507076c8c3734083ac0ef7ffea0edebf180ad1a..fd9c4222a7afd4914415c9c62e1ced118ea75d1f 100644
--- a/tensorflow/core/common_runtime/device_set_test.cc
+++ b/tensorflow/core/common_runtime/device_set_test.cc
@@ -88,7 +88,9 @@ TEST_F(DeviceSetTest, PrioritizedDeviceTypeList) {
   // D3 is prioritized below D1.
   AddDevice("d3", "/job:a/replica:0/task:0/device:d3:0");
   EXPECT_EQ((std::vector<DeviceType>{
-                DeviceType("d2"), DeviceType("d1"), DeviceType("d3"),
+                DeviceType("d2"),
+                DeviceType("d1"),
+                DeviceType("d3"),
             }),
             types());
 }
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 2d4f2a2d902a6a1457aa7a90b172dd9c9d5f8f5c..ecbffcbf6c4030bde82f2abe0e7779bf9c5a9870 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -61,7 +61,6 @@ limitations under the License.
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/core/util/env_var.h"
 
-
 namespace tensorflow {
 
 namespace {
@@ -259,9 +258,10 @@ DirectSession::DirectSession(const SessionOptions& options,
       factory_(factory),
       cancellation_manager_(new CancellationManager()),
       operation_timeout_in_ms_(options_.config.operation_timeout_in_ms()) {
-  if (options_.config.session_inter_op_thread_pool_size() > 0) {
-    for (int i = 0; i < options_.config.session_inter_op_thread_pool_size();
-         ++i) {
+  const int thread_pool_size =
+      options_.config.session_inter_op_thread_pool_size();
+  if (thread_pool_size > 0) {
+    for (int i = 0; i < thread_pool_size; ++i) {
       thread::ThreadPool* pool = nullptr;
       bool owned = false;
       init_error_.Update(NewThreadPoolFromThreadPoolOptions(
@@ -321,6 +321,10 @@ DirectSession::~DirectSession() {
   for (auto d : device_mgr_->ListDevices()) {
     d->op_segment()->RemoveHold(session_handle_);
   }
+  for (auto d : device_mgr_->ListDevices()) {
+    d->ClearResourceMgr();
+  }
+  functions_.clear();
   delete cancellation_manager_;
   for (const auto& p_and_owned : thread_pools_) {
     if (p_and_owned.second) delete p_and_owned.first;
@@ -467,9 +471,9 @@ Status DirectSession::Run(const RunOptions& run_options,
   Executor::Args args;
   args.step_id = step_id_counter_.fetch_add(1);
 
-  TF_RETURN_IF_ERROR(
-      GetOrCreateExecutors(input_tensor_names, output_names, target_nodes,
-                           &executors_and_keys, &run_state_args));
+  TF_RETURN_IF_ERROR(GetOrCreateExecutors(input_tensor_names, output_names,
+                                          target_nodes, &executors_and_keys,
+                                          &run_state_args));
   const int64 executor_step_count = executors_and_keys->step_count.fetch_add(1);
 
   std::unique_ptr<DebuggerStateInterface> debugger_state;
@@ -521,9 +525,7 @@ Status DirectSession::Run(const RunOptions& run_options,
 
   args.rendezvous = run_state.rendez;
   args.cancellation_manager = &step_cancellation_manager;
-  args.runner = [this, pool](Executor::Args::Closure c) {
-    SchedClosure(pool, std::move(c));
-  };
+
   args.session_state = &session_state_;
   args.tensor_store = &run_state.tensor_store;
   args.step_container = &run_state.step_container;
@@ -584,7 +586,23 @@ Status DirectSession::Run(const RunOptions& run_options,
     return errors::Cancelled("Run call was cancelled");
   }
 
+  Executor::Args::Runner default_runner = [this,
+                                           pool](Executor::Args::Closure c) {
+    SchedClosure(pool, std::move(c));
+  };
   for (const auto& item : executors_and_keys->items) {
+    // TODO(zhengxq): support partial run.
+    // TODO(zhengxq): if the device picks its own threadpool, we need to assign
+    //     less threads to the main compute pool by default.
+    thread::ThreadPool* device_thread_pool =
+        item.device->tensorflow_device_thread_pool();
+    if (!device_thread_pool) {
+      args.runner = default_runner;
+    } else {
+      args.runner = [this, device_thread_pool](Executor::Args::Closure c) {
+        SchedClosure(device_thread_pool, std::move(c));
+      };
+    }
     item.executor->RunAsync(args, barrier->Get());
   }
 
@@ -1124,12 +1142,13 @@ Status DirectSession::GetOrCreateExecutors(
     options.debug_options = run_state_args->debug_options;
   }
 
+  std::unique_ptr<FunctionInfo> func_info(new FunctionInfo);
   std::shared_ptr<ExecutorsAndKeys> ek(new ExecutorsAndKeys);
 
   // The executor_lock_ is intentionally released while executor is
   // being created.
   std::unordered_map<string, std::unique_ptr<Graph>> graphs;
-  TF_RETURN_IF_ERROR(CreateGraphs(options, &graphs, &ek->flib_def,
+  TF_RETURN_IF_ERROR(CreateGraphs(options, &graphs, &func_info->flib_def,
                                   run_state_args, &ek->input_types,
                                   &ek->output_types));
 
@@ -1160,9 +1179,9 @@ Status DirectSession::GetOrCreateExecutors(
     graph_def_version =
         execution_state_->original_graph_def().versions().producer();
   }
-  ek->proc_flr.reset(new ProcessFunctionLibraryRuntime(
-      device_mgr_.get(), options_.env, graph_def_version, ek->flib_def.get(),
-      optimizer_opts));
+  func_info->proc_flr.reset(new ProcessFunctionLibraryRuntime(
+      device_mgr_.get(), options_.env, graph_def_version,
+      func_info->flib_def.get(), optimizer_opts));
 
   GraphOptimizer optimizer(optimizer_opts);
   for (auto iter = graphs.begin(); iter != graphs.end(); ++iter) {
@@ -1174,7 +1193,7 @@ Status DirectSession::GetOrCreateExecutors(
 
     ek->items.resize(ek->items.size() + 1);
     auto* item = &(ek->items.back());
-    auto lib = ek->proc_flr->GetFLR(partition_name);
+    auto lib = func_info->proc_flr->GetFLR(partition_name);
     if (lib == nullptr) {
       return errors::Internal("Could not find device: ", partition_name);
     }
@@ -1186,8 +1205,14 @@ Status DirectSession::GetOrCreateExecutors(
     auto opseg = device->op_segment();
     params.create_kernel = [this, lib, opseg](const NodeDef& ndef,
                                               OpKernel** kernel) {
-      // Caches the kernel only if the node is stateful.
-      if (!lib->IsStateful(ndef.op())) {
+      // We do not share the kernel via the OpSegment if the node is
+      // stateless, or a function.
+      // NOTE(mrry): We must not share function kernels (implemented
+      // using `CallOp`) between subgraphs, because `CallOp::handle_`
+      // is tied to a particular subgraph. Even if the function itself
+      // is stateful, the `CallOp` that invokes it is not.
+      if (!lib->IsStateful(ndef.op()) ||
+          lib->GetFunctionLibraryDefinition()->Find(ndef.op()) != nullptr) {
         return lib->CreateKernel(ndef, kernel);
       }
       auto create_fn = [lib, &ndef](OpKernel** kernel) {
@@ -1222,9 +1247,10 @@ Status DirectSession::GetOrCreateExecutors(
     // NewLocalExecutor takes ownership of partition_graph.
     item->graph = partition_graph.get();
     item->executor = nullptr;
+    item->device = device;
     Executor* executor;
     TF_RETURN_IF_ERROR(
-        NewLocalExecutor(params, partition_graph.release(), &executor));
+        NewLocalExecutor(params, std::move(partition_graph), &executor));
     item->executor.reset(executor);
   }
 
@@ -1263,6 +1289,7 @@ Status DirectSession::GetOrCreateExecutors(
 
   // Reacquire the lock, try to insert into the map.
   mutex_lock l(executor_lock_);
+  functions_.push_back(std::move(func_info));
 
   // Another thread may have created the entry before us, in which case we will
   // reuse the already created one.
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index 780d0b46a8c2c7440a5a739c27b368af8aad5bc2..45d765f8498e5e12eef3a47cd4a7ff0ad22aa495 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -112,6 +112,7 @@ class DirectSession : public Session {
   // every partition.
   struct PerPartitionExecutorsAndLib {
     Graph* graph = nullptr;                  // not owned.
+    Device* device = nullptr;                // not owned.
     FunctionLibraryRuntime* flib = nullptr;  // not owned.
     std::unique_ptr<Executor> executor;
   };
@@ -124,20 +125,12 @@ class DirectSession : public Session {
   // a partition of the graph bundled with its dependent library runtime.
   // 'input_keys' are the rendezvous keys for the feeds and 'output_keys'
   // are rendezvous keys for the fetches.
-  // 'flib_def' is the function library used by graphs in 'items'.
-  // 'proc_flr' is the collection of FunctionLibraryRuntime objects, one per
-  // device.
-  // TODO(phawkins): currently partitions always share the same function
-  // library. Consider giving each partition its own function library to enable
-  // per-partition rewrites.
   struct ExecutorsAndKeys {
     ExecutorsAndKeys() : step_count(0) {}
 
     std::atomic_int_fast64_t step_count;
     std::unique_ptr<Graph> graph;
     NameNodeMap name_to_node;
-    std::unique_ptr<FunctionLibraryDefinition> flib_def;
-    std::unique_ptr<ProcessFunctionLibraryRuntime> proc_flr;
     std::vector<PerPartitionExecutorsAndLib> items;
     std::unordered_map<string, size_t> input_name_to_index;
     std::unordered_map<string, string> input_name_to_rendezvous_key;
@@ -148,6 +141,22 @@ class DirectSession : public Session {
     DataTypeVector output_types;
   };
 
+  // A FunctionInfo object is created for every unique set of feeds/fetches.
+  // This info could be folded into the ExecutorsAndKeys object but we would
+  // like to maintain a deletion order in which the OpKernels (owned by the
+  // executor) should be destroyed first, followed by the resources in the
+  // device and then followed by the function stuff.
+  // TODO(rohanj): Consolidate function library definitions so that we can
+  // instantiate only one ProcFLR and lib_def and make this just a member
+  // variable and not a vector.
+  // 'flib_def' is the function library used.
+  // 'proc_flr' is the collection of FunctionLibraryRuntime objects, one per
+  // device.
+  struct FunctionInfo {
+    std::unique_ptr<FunctionLibraryDefinition> flib_def;
+    std::unique_ptr<ProcessFunctionLibraryRuntime> proc_flr;
+  };
+
   // For each live partial execution, the session maintains a RunState.
   // 'status' is the current status of this partial execution. 'executor_done'
   // is "notified" when all executors are done. 'pending_inputs' are the set
@@ -282,6 +291,9 @@ class DirectSession : public Session {
   // Schedules 'c' for execution on pool.
   void SchedClosure(thread::ThreadPool* pool, std::function<void()> c);
 
+  std::vector<std::unique_ptr<FunctionInfo>> functions_
+      GUARDED_BY(executor_lock_);
+
   mutex executor_lock_;  // protects executors_
   // Holds mappings from signature to the executors that process
   // it. The reason for a level of indirection around mapped_type is
diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index 15edce6a68200dc45532d2b69779f48601a47c25..b75a4f76d94f704cf38a6c4657b6089a863c085f 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -436,10 +436,7 @@ TEST(DirectSessionTest, FetchMultipleTimes) {
   }
 }
 
-REGISTER_OP("Darth")
-    .Input("x: float")
-    .Output("y: float")
-    .Doc(R"doc(
+REGISTER_OP("Darth").Input("x: float").Output("y: float").Doc(R"doc(
 Darth promises one return value.
 
 x: float
@@ -972,39 +969,38 @@ static void TestSessionInterOpThreadsImpl(bool use_function_lib,
 
   std::atomic<int32> num_done(0);
   // Runs session to compute <node>:0 using inter_op thread pool <pool>.
-  auto add_session_run_call = [use_global_pools, &def, &options, &sessions,
-                               &sessions_mu,
-                               &num_done](thread::ThreadPool* tp, Node* node,
-                                          int inter_op_pool) {
-    auto fn = [use_global_pools, &def, &options, &sessions, &sessions_mu,
-               inter_op_pool, node, &num_done]() {
-      RunOptions run_options;
-      run_options.set_inter_op_thread_pool(inter_op_pool);
-      std::vector<Tensor> outputs;
-
-      Session* session;
-      if (use_global_pools) {
-        std::unique_ptr<Session> s(NewSession(options));
-        TF_ASSERT_OK(s->Create(def));
-        session = s.get();
-
-        mutex_lock l(sessions_mu);
-        sessions.emplace_back(std::move(s));
-      } else {
-        session = sessions[0].get();
-      }
+  auto add_session_run_call =
+      [use_global_pools, &def, &options, &sessions, &sessions_mu, &num_done](
+          thread::ThreadPool* tp, Node* node, int inter_op_pool) {
+        auto fn = [use_global_pools, &def, &options, &sessions, &sessions_mu,
+                   inter_op_pool, node, &num_done]() {
+          RunOptions run_options;
+          run_options.set_inter_op_thread_pool(inter_op_pool);
+          std::vector<Tensor> outputs;
+
+          Session* session;
+          if (use_global_pools) {
+            std::unique_ptr<Session> s(NewSession(options));
+            TF_ASSERT_OK(s->Create(def));
+            session = s.get();
+
+            mutex_lock l(sessions_mu);
+            sessions.emplace_back(std::move(s));
+          } else {
+            session = sessions[0].get();
+          }
 
-      Status s = session->Run(run_options, {} /* inputs */,
-                              {node->name() + ":0"} /* output_names */, {},
-                              &outputs, nullptr /* run_metadata */);
-      TF_CHECK_OK(s);
-      ASSERT_EQ(1, outputs.size());
-      auto flat = outputs[0].flat<float>();
-      EXPECT_FLOAT_EQ(1.2, flat(0));
-      num_done.fetch_add(1);
-    };
-    tp->Schedule(fn);
-  };
+          Status s = session->Run(run_options, {} /* inputs */,
+                                  {node->name() + ":0"} /* output_names */, {},
+                                  &outputs, nullptr /* run_metadata */);
+          TF_CHECK_OK(s);
+          ASSERT_EQ(1, outputs.size());
+          auto flat = outputs[0].flat<float>();
+          EXPECT_FLOAT_EQ(1.2, flat(0));
+          num_done.fetch_add(1);
+        };
+        tp->Schedule(fn);
+      };
 
   // For blocking states:
   // - Starts at 0, BlockingOp::Compute will move to 1.
@@ -1265,7 +1261,7 @@ TEST(DirectSessionTest, LocalDeviceManager) {
 
 // A simple benchmark for the overhead of `DirectSession::Run()` calls
 // with varying numbers of feeds/fetches.
-void FeedFetchBenchmarkHelper(int num_feeds, int iters) {
+void FeedFetchBenchmarkHelper(int iters, int num_feeds) {
   testing::StopTiming();
 
   Tensor value(DT_FLOAT, TensorShape());
diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
index 14f5fdc5d304e1d64dca313b2aca673f691288e1..31fb128f937ae46eefb309fc9bab8167e54846a7 100644
--- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
@@ -142,7 +142,7 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelWarmup) {
   DirectSession* ds = static_cast<DirectSession*>(session.get());
   CostModelManager::CostModelMap cost_models;
   ds->ExportCostModels(&cost_models);
-  CHECK_EQ(cost_models.size(), 1);
+  CHECK_GE(cost_models.size(), 1);
   const CostModel* cm = (*cost_models.begin()).second;
   EXPECT_EQ(measure_steps, cm->GetUpdateTimes());
 }
@@ -161,14 +161,14 @@ static void TestHWAccelerator(bool enableHWTrace) {
   x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
 #ifdef TENSORFLOW_USE_SYCL
   x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0");
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
   // y = A * x
   Node* y = test::graph::Matmul(&graph, a, x, false, false);
   y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
 #ifdef TENSORFLOW_USE_SYCL
-y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0");
-#endif // TENSORFLOW_USE_SYCL
+  y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0");
+#endif  // TENSORFLOW_USE_SYCL
 
   Node* y_neg = test::graph::Unary(&graph, "Neg", y);
   y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0");
@@ -181,7 +181,7 @@ y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0");
   (*options.config.mutable_device_count())["GPU"] = 1;
 #ifdef TENSORFLOW_USE_SYCL
   (*options.config.mutable_device_count())["SYCL"] = 1;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
   options.config.set_allow_soft_placement(true);
   options.config.mutable_graph_options()->set_build_cost_model(1);
   std::unique_ptr<Session> session(NewSession(options));
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 1896baaf668864fc1b29ac3ea6c9b1ab6eaaaeaa..b06b75d6585f01640374eb7ab9842bf441cf9411 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -172,17 +172,11 @@ void SetMemory(NodeExecStatsWrapper* stats, OpKernelContext* ctx) {
     stats->AddAllocation(allocator_pair.first, allocator_pair.second);
   }
   auto* ms = stats->stats()->mutable_memory_stats();
-  ms->set_host_temp_memory_size(ctx->host_temp_memory_size());
-  ms->set_device_temp_memory_size(ctx->device_temp_memory_size());
-  for (const auto& alloc_id : ctx->host_persistent_alloc_ids()) {
-    ms->mutable_host_persistent_tensor_alloc_ids()->Add(alloc_id);
+  ms->set_temp_memory_size(ctx->temp_memory_allocated());
+  for (const auto& alloc_id : ctx->persistent_alloc_ids()) {
+    ms->mutable_persistent_tensor_alloc_ids()->Add(alloc_id);
   }
-  for (const auto& alloc_id : ctx->device_persistent_alloc_ids()) {
-    ms->mutable_device_persistent_tensor_alloc_ids()->Add(alloc_id);
-  }
-  ms->set_host_persistent_memory_size(ctx->host_persistent_memory_allocated());
-  ms->set_device_persistent_memory_size(
-      ctx->device_persistent_memory_allocated());
+  ms->set_persistent_memory_size(ctx->persistent_memory_allocated());
 }
 
 void SetReferencedTensors(NodeExecStatsWrapper* stats,
@@ -338,8 +332,8 @@ class GraphView {
 
 class ExecutorImpl : public Executor {
  public:
-  ExecutorImpl(const LocalExecutorParams& p, const Graph* g)
-      : params_(p), graph_(g), gview_() {
+  ExecutorImpl(const LocalExecutorParams& p, std::unique_ptr<const Graph> g)
+      : params_(p), graph_(std::move(g)), gview_() {
     CHECK(p.create_kernel != nullptr);
     CHECK(p.delete_kernel != nullptr);
   }
@@ -354,7 +348,6 @@ class ExecutorImpl : public Executor {
     for (auto fiter : frame_info_) {
       delete fiter.second;
     }
-    delete graph_;
   }
 
   Status Initialize();
@@ -418,7 +411,7 @@ class ExecutorImpl : public Executor {
 
   // Owned.
   LocalExecutorParams params_;
-  const Graph* graph_;
+  std::unique_ptr<const Graph> graph_;
   GraphView gview_;
 
   // A cached value of params_
@@ -611,11 +604,11 @@ void GetMaxPendingCounts(const Node* n, size_t* max_pending,
 }
 
 Status ExecutorImpl::Initialize() {
-  gview_.Initialize(graph_);
+  gview_.Initialize(graph_.get());
 
   // Build the information about frames in this subgraph.
   ControlFlowInfo cf_info;
-  TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph_, &cf_info));
+  TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph_.get(), &cf_info));
 
   // Cache this value so we make this virtual function call once, rather
   // that O(# steps * # nodes per step) times.
@@ -682,9 +675,9 @@ Status ExecutorImpl::Initialize() {
 
   // Initialize PendingCounts only after item->pending_id is initialized for
   // all nodes.
-  InitializePending(graph_, cf_info);
+  InitializePending(graph_.get(), cf_info);
 
-  return gview_.SetAllocAttrs(graph_, params_.device);
+  return gview_.SetAllocAttrs(graph_.get(), params_.device);
 }
 
 Status GraphView::SetAllocAttrs(const Graph* g, const Device* device) {
@@ -1188,7 +1181,7 @@ class ExecutorState {
   // QUESTION: Make it a checkpoint::TensorSliceReaderCacheWrapper
   // instead of a pointer?  (avoids having to delete).
   checkpoint::TensorSliceReaderCacheWrapper* slice_reader_cache_;
-  FunctionCallFrame* call_frame_;
+  CallFrameInterface* call_frame_;
   const ExecutorImpl* impl_;
   CancellationManager* cancellation_manager_;
   Executor::Args::Runner runner_;
@@ -1421,7 +1414,7 @@ void ExecutorImpl::InitializePending(const Graph* graph,
 }
 
 void ExecutorState::RunAsync(Executor::DoneCallback done) {
-  const Graph* graph = impl_->graph_;
+  const Graph* graph = impl_->graph_.get();
   TaggedNodeSeq ready;
 
   // Ask the device to fill in the device context map.
@@ -1615,7 +1608,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) {
         auto done = [this, state]() {
           Device* device = impl_->params_.device;
           NodeExecStatsWrapper* stats = state->stats;  // Shorthand
-          Entry* first_input = state->first_input;  // Shorthand
+          Entry* first_input = state->first_input;     // Shorthand
 
           nodestats::SetOpEnd(stats);
           EntryVector outputs;
@@ -1782,6 +1775,19 @@ Status ExecutorState::PrepareInputs(const NodeItem& item, Entry* first_input,
         entry->ref_mu = nullptr;
 
         inp->tensor = entry->val.get();
+        // The dtype of entry->ref could have been changed by another operation
+        // that ran after the operation that "produced" it executed, so
+        // re-validate that the type of the dereferenced tensor matches the
+        // expected input type.
+        if (item.input_type(i) != inp->tensor->dtype()) {
+          return AttachDef(
+              errors::InvalidArgument(
+                  i, "-th input expects type ",
+                  DataTypeString(item.input_type(i)),
+                  " but automatically dereferenced input tensor has type ",
+                  DataTypeString(inp->tensor->dtype())),
+              item.kernel->def());
+        }
       }
     }
   }
@@ -2599,9 +2605,10 @@ void ExecutorImpl::RunAsync(const Args& args, DoneCallback done) {
 
 }  // end namespace
 
-Status NewLocalExecutor(const LocalExecutorParams& params, const Graph* graph,
+Status NewLocalExecutor(const LocalExecutorParams& params,
+                        std::unique_ptr<const Graph> graph,
                         Executor** executor) {
-  ExecutorImpl* impl = new ExecutorImpl(params, graph);
+  ExecutorImpl* impl = new ExecutorImpl(params, std::move(graph));
   const Status s = impl->Initialize();
   if (s.ok()) {
     *executor = impl;
diff --git a/tensorflow/core/common_runtime/executor.h b/tensorflow/core/common_runtime/executor.h
index e09dc4e34630fc0ab22615b7204bd0ec2d117d35..adf80a2417e2a86e874dd1d1068a1bbb611ff882 100644
--- a/tensorflow/core/common_runtime/executor.h
+++ b/tensorflow/core/common_runtime/executor.h
@@ -84,7 +84,7 @@ class Executor {
     int64 step_id = 0;
     Rendezvous* rendezvous = nullptr;
     StepStatsCollector* stats_collector = nullptr;
-    FunctionCallFrame* call_frame = nullptr;
+    CallFrameInterface* call_frame = nullptr;
     CancellationManager* cancellation_manager = nullptr;
     SessionState* session_state = nullptr;
     TensorStore* tensor_store = nullptr;
@@ -122,9 +122,8 @@ class Executor {
 
 // Creates an Executor that computes the given "graph".
 //
-// If successful, returns the constructed executor in "*executor". The
-// caller keeps the ownership of "device". The returned executor takes
-// the ownership of "graph". Otherwise, returns an error status.
+// If successful, returns the constructed executor in "*executor". Otherwise,
+// returns an error status.
 //
 // "params" provides a set of context for the executor. We expect that
 // different context would provide different implementations.
@@ -143,7 +142,8 @@ struct LocalExecutorParams {
   Executor::Args::NodeOutputsCallback node_outputs_cb;
 };
 ::tensorflow::Status NewLocalExecutor(const LocalExecutorParams& params,
-                                      const Graph* graph, Executor** executor);
+                                      std::unique_ptr<const Graph> graph,
+                                      Executor** executor);
 
 // A class to help run multiple executors in parallel and wait until
 // all of them are complete.
@@ -202,11 +202,12 @@ class ExecutorBarrier {
       // below.
       if (--pending_ == 0) {
         CHECK(done_cb_ != nullptr);
-        done = done_cb_;
-        done_cb_ = nullptr;
+        std::swap(done, done_cb_);
       }
 
-      status = status_;
+      if (!status_.ok()) {
+        status = status_;
+      }
     }
 
     if (error) {
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index 23d0f331c5d096cfb944da48e9b5ce58e04daf65..b941819838a7b155d8c8f54985bd6ae8bc15ce9d 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/graph/gradients.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/optimizer_cse.h"
@@ -96,12 +97,11 @@ static Node* AddNoOp(Graph* g) {
 
 static Node* AddIdentity(Graph* g, Endpoint input) {
   DCHECK_LT(0, input.dtype());
-  DCHECK_LT(input.dtype(), DT_FLOAT_REF);
   NodeDef ndef;
   ndef.set_name(g->NewName(kNodeLabel));
   ndef.set_op("Identity");
   ndef.add_input(input.name());
-  AddNodeAttr("T", input.dtype(), &ndef);
+  AddNodeAttr("T", BaseType(input.dtype()), &ndef);
   Status s;
   Node* ret = g->AddNode(ndef, &s);
   TF_CHECK_OK(s);
@@ -151,20 +151,29 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
   ~FunctionLibraryRuntimeImpl() override;
 
   Status Instantiate(const string& function_name, AttrSlice attrs,
+                     const InstantiateOptions& options,
                      Handle* handle) override;
 
+  Status ReleaseHandle(Handle handle) override;
+
   const FunctionBody* GetFunctionBody(Handle handle) override;
 
   Status CreateKernel(const NodeDef& ndef, OpKernel** kernel) override;
 
   void Run(const Options& opts, Handle handle, gtl::ArraySlice<Tensor> args,
            std::vector<Tensor>* rets, DoneCallback done) override;
+  // NOTE(mrry): This overload is currently only implemented for local function
+  // execution.
+  // TODO(b/70346412): Implement support for remote function execution when
+  // passing a call frame.
+  void Run(const Options& opts, Handle handle, CallFrameInterface* frame,
+           DoneCallback done) override;
 
   bool IsStateful(const string& function) override;
 
   const FunctionLibraryDefinition* GetFunctionLibraryDefinition()
       const override {
-    return lib_def_;
+    return base_lib_def_;
   }
 
   Device* device() override { return device_; }
@@ -173,6 +182,10 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
 
   string DebugString(Handle h) override;
 
+  Status Clone(std::unique_ptr<FunctionLibraryDefinition>* out_lib_def,
+               std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr,
+               FunctionLibraryRuntime** out_flr) override;
+
  private:
   typedef FunctionLibraryRuntimeImpl ME;
 
@@ -180,7 +193,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
   Device* const device_;
   Env* const env_;
   const int graph_def_version_;
-  const FunctionLibraryDefinition* const lib_def_;
+  const FunctionLibraryDefinition* const base_lib_def_;
   GraphOptimizer optimizer_;
   const CustomKernelCreator custom_kernel_creator_;
   const string device_name_;
@@ -190,28 +203,37 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
 
   mutable mutex mu_;
 
-  // func_graphs_ never shrinks or reorders its members.
-  std::vector<FunctionBody*> func_graphs_ GUARDED_BY(mu_);
+  int next_handle_ GUARDED_BY(mu_);
 
   // The instantiated and transformed function is encoded as a Graph
   // object, and an executor is created for the graph.
   struct Item : public core::RefCounted {
-    const Graph* graph = nullptr;  // Owned by exec.
+    const Graph* graph = nullptr;                            // Owned by exec.
+    const FunctionLibraryDefinition* overlay_lib = nullptr;  // Not owned.
+    FunctionBody* func_graph = nullptr;
     Executor* exec = nullptr;
 
-    ~Item() override { delete this->exec; }
+    ~Item() override {
+      delete this->func_graph;
+      delete this->exec;
+    }
   };
-  std::vector<Item*> items_;
+  std::unordered_map<Handle, Item*> items_ GUARDED_BY(mu_);
 
   ProcessFunctionLibraryRuntime* parent_ = nullptr;  // not owned.
 
+  Status CreateKernel(const NodeDef& ndef,
+                      const FunctionLibraryDefinition* lib_def,
+                      OpKernel** kernel);
   Status FunctionDefToBody(const FunctionDef& fdef, AttrSlice attrs,
+                           const FunctionLibraryDefinition* lib_def,
                            FunctionBody** fbody);
   Status CreateItem(Handle handle, Item** item);
   Status GetOrCreateItem(Handle handle, Item** item);
   Status InstantiateSymbolicGradient(const NameAttrList& func,
+                                     const FunctionLibraryDefinition* lib_def,
                                      FunctionBody** g_body);
-  bool IsLocalTarget(const AttrSlice& attrs);
+  bool IsLocalTarget(const InstantiateOptions& options);
   AttrValueMap FixAttrs(const AttrSlice& attrs);
   void RunRemote(const Options& opts, Handle handle,
                  gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets,
@@ -230,15 +252,16 @@ FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl(
       device_(device),
       env_(env),
       graph_def_version_(graph_def_version),
-      lib_def_(lib_def),
+      base_lib_def_(lib_def),
       optimizer_(optimizer_options),
       custom_kernel_creator_(std::move(custom_kernel_creator)),
       device_name_(device_ == nullptr
                        ? ProcessFunctionLibraryRuntime::kDefaultFLRDevice
                        : device_->name()),
+      next_handle_(0),
       parent_(parent) {
   get_func_sig_ = [this](const string& op, const OpDef** sig) {
-    return lib_def_->LookUpOpDef(op, sig);
+    return base_lib_def_->LookUpOpDef(op, sig);
   };
   create_kernel_ = [this](const NodeDef& ndef, OpKernel** kernel) {
     return CreateKernel(ndef, kernel);
@@ -246,9 +269,15 @@ FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl(
 }
 
 FunctionLibraryRuntimeImpl::~FunctionLibraryRuntimeImpl() {
-  for (FunctionBody* p : func_graphs_) delete p;
-  for (Item* item : items_)
-    if (item) item->Unref();
+  // The most common patterns of FLR usage don't require the caller to
+  // explicitly release handles. As a result, we try to unref each item until
+  // it's erased.
+  for (auto item : items_) {
+    if (item.second) {
+      while (!item.second->Unref()) {
+      }
+    }
+  }
 }
 
 // An asynchronous op kernel which executes an instantiated function
@@ -309,13 +338,18 @@ const FunctionBody* FunctionLibraryRuntimeImpl::GetFunctionBody(Handle h) {
   }
 
   mutex_lock l(mu_);
-  CHECK_LE(0, local_handle);
-  CHECK_LT(local_handle, func_graphs_.size());
-  return func_graphs_[local_handle];
+  CHECK_EQ(1, items_.count(local_handle));
+  return items_[local_handle]->func_graph;
 }
 
 Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef,
                                                 OpKernel** kernel) {
+  return CreateKernel(ndef, base_lib_def_, kernel);
+}
+
+Status FunctionLibraryRuntimeImpl::CreateKernel(
+    const NodeDef& ndef, const FunctionLibraryDefinition* lib_def,
+    OpKernel** kernel) {
   // If a custom kernel creator is given, try that.
   Status s;
   if (custom_kernel_creator_) {
@@ -331,16 +365,21 @@ Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef,
     }
   }
 
-  if (lib_def_->Find(ndef.op()) == nullptr) {
+  if (lib_def->Find(ndef.op()) == nullptr) {
     // A primitive operation. Creates the registered kernel.
     return CreateNonCachedKernel(device_, this, ndef, graph_def_version_,
                                  kernel);
   }
 
-  // Try to instantiate this function for the func/attr. Maybe its
+  // Try to instantiate this function for the func/attr. Maybe it's
   // cached already.
+  InstantiateOptions options;
+  if (lib_def != base_lib_def_) {
+    options.overlay_lib = lib_def;
+  }
   Handle handle;
-  TF_RETURN_IF_ERROR(Instantiate(ndef.op(), AttrSlice(&ndef.attr()), &handle));
+  TF_RETURN_IF_ERROR(
+      Instantiate(ndef.op(), AttrSlice(&ndef.attr()), options, &handle));
 
   const FunctionBody* fbody = GetFunctionBody(handle);
   CHECK_NOTNULL(fbody);
@@ -372,15 +411,23 @@ Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef,
   return s;
 }
 
-Status FunctionLibraryRuntimeImpl::FunctionDefToBody(const FunctionDef& fdef,
-                                                     AttrSlice attrs,
-                                                     FunctionBody** fbody) {
-  return FunctionDefToBodyHelper(fdef, attrs, lib_def_, get_func_sig_, fbody);
+Status FunctionLibraryRuntimeImpl::FunctionDefToBody(
+    const FunctionDef& fdef, AttrSlice attrs,
+    const FunctionLibraryDefinition* lib_def, FunctionBody** fbody) {
+  if (lib_def == base_lib_def_) {
+    return FunctionDefToBodyHelper(fdef, attrs, lib_def, get_func_sig_, fbody);
+  } else {
+    auto get_func_sig = [lib_def](const string& op, const OpDef** sig) {
+      return lib_def->LookUpOpDef(op, sig);
+    };
+    return FunctionDefToBodyHelper(fdef, attrs, lib_def, get_func_sig, fbody);
+  }
 }
 
 Status FunctionLibraryRuntimeImpl::InstantiateSymbolicGradient(
-    const NameAttrList& func, FunctionBody** g_body) {
-  const FunctionDef* fdef = lib_def_->Find(func.name());
+    const NameAttrList& func, const FunctionLibraryDefinition* lib_def,
+    FunctionBody** g_body) {
+  const FunctionDef* fdef = lib_def->Find(func.name());
   if (fdef == nullptr) {
     // f is a primitive op.
     gradient::Creator creator;
@@ -394,12 +441,16 @@ Status FunctionLibraryRuntimeImpl::InstantiateSymbolicGradient(
     // by the gradient function.
     TF_RETURN_IF_ERROR(creator(AttrSlice(&func.attr()), &grad_fdef));
     TF_RETURN_IF_ERROR(
-        FunctionDefToBody(grad_fdef, AttrSlice(&func.attr()), g_body));
+        FunctionDefToBody(grad_fdef, AttrSlice(&func.attr()), lib_def, g_body));
   } else {
     // f is a user-defined function.
+    InstantiateOptions options;
+    if (lib_def != base_lib_def_) {
+      options.overlay_lib = lib_def;
+    }
     Handle f_handle;
     TF_RETURN_IF_ERROR(
-        Instantiate(func.name(), AttrSlice(&func.attr()), &f_handle));
+        Instantiate(func.name(), AttrSlice(&func.attr()), options, &f_handle));
     const FunctionBody* f_body = GetFunctionBody(f_handle);
     CHECK_NOTNULL(f_body);
     *g_body = SymbolicGradient(*f_body);
@@ -407,51 +458,42 @@ Status FunctionLibraryRuntimeImpl::InstantiateSymbolicGradient(
   return Status::OK();
 }
 
-bool FunctionLibraryRuntimeImpl::IsLocalTarget(const AttrSlice& attrs) {
+bool FunctionLibraryRuntimeImpl::IsLocalTarget(
+    const InstantiateOptions& options) {
   if (device_ == nullptr) return true;
-  string target = ProcessFunctionLibraryRuntime::ObtainFunctionTarget(attrs);
-  if (target.empty()) return true;
+  if (options.target.empty()) return true;
   Device* target_device;
-  if (!device_mgr_->LookupDevice(target, &target_device).ok()) {
+  if (!device_mgr_->LookupDevice(options.target, &target_device).ok()) {
     return false;
   }
   return target_device == device_;
 }
 
-AttrValueMap FunctionLibraryRuntimeImpl::FixAttrs(const AttrSlice& attrs) {
-  AttrValueMap value_map;
-  for (auto it : attrs) {
-    value_map[it.first] = it.second;
-  }
-  if (attrs.Find("_target") != nullptr) {
-    return value_map;
-  }
-  AttrValue v;
-  v.set_s(device_name_);
-  AddAttr("_target", v, &value_map);
-  return value_map;
-}
-
-Status FunctionLibraryRuntimeImpl::Instantiate(const string& function_name,
-                                               AttrSlice attrs,
-                                               Handle* handle) {
-  AttrValueMap value_map = FixAttrs(attrs);
-  AttrSlice new_attrs(&value_map);
-
-  if (!IsLocalTarget(new_attrs)) {
-    return parent_->Instantiate(function_name, new_attrs, handle);
+Status FunctionLibraryRuntimeImpl::Instantiate(
+    const string& function_name, AttrSlice attrs,
+    const InstantiateOptions& options, Handle* handle) {
+  if (!IsLocalTarget(options)) {
+    return parent_->Instantiate(function_name, attrs, options, handle);
   }
 
-  const string key = Canonicalize(function_name, new_attrs);
+  // Since this is a local target, ensure that the local `device_name_` appears
+  // in the canonical key.
+  InstantiateOptions options_copy(options);
+  options_copy.target = device_name_;
+  const string key = Canonicalize(function_name, attrs, options_copy);
   *handle = parent_->GetHandle(key);
   if (*handle != kInvalidHandle) {
+    mutex_lock l(mu_);
+    items_[parent_->GetHandleOnDevice(device_name_, *handle)]->Ref();
     return Status::OK();
   }
 
   Status s;
+  const FunctionLibraryDefinition* lib_def =
+      options.overlay_lib ? options.overlay_lib : base_lib_def_;
   FunctionBody* fbody = nullptr;
   if (function_name == kGradientOp) {
-    const AttrValue* f = new_attrs.Find(kFuncAttr);
+    const AttrValue* f = attrs.Find(kFuncAttr);
     if (f == nullptr) {
       return errors::InvalidArgument("SymbolicGradient is missing attr: f");
     }
@@ -459,17 +501,17 @@ Status FunctionLibraryRuntimeImpl::Instantiate(const string& function_name,
     if (func.name() == kGradientOp) {
       return errors::InvalidArgument("Can't take gradient of SymbolicGradient");
     }
-    const string grad = lib_def_->FindGradient(func.name());
+    const string grad = lib_def->FindGradient(func.name());
     if (!grad.empty()) {
-      return Instantiate(grad, AttrSlice(&func.attr()), handle);
+      return Instantiate(grad, AttrSlice(&func.attr()), options, handle);
     }
-    TF_RETURN_IF_ERROR(InstantiateSymbolicGradient(func, &fbody));
+    TF_RETURN_IF_ERROR(InstantiateSymbolicGradient(func, lib_def, &fbody));
   } else {
-    const FunctionDef* fdef = lib_def_->Find(function_name);
+    const FunctionDef* fdef = lib_def->Find(function_name);
     if (fdef == nullptr) {
       return errors::NotFound("Function ", function_name, " is not defined.");
     }
-    TF_RETURN_IF_ERROR(FunctionDefToBody(*fdef, new_attrs, &fbody));
+    TF_RETURN_IF_ERROR(FunctionDefToBody(*fdef, attrs, lib_def, &fbody));
   }
 
   {
@@ -477,15 +519,35 @@ Status FunctionLibraryRuntimeImpl::Instantiate(const string& function_name,
     *handle = parent_->GetHandle(key);
     if (*handle != kInvalidHandle) {
       delete fbody;
+      items_[parent_->GetHandleOnDevice(device_name_, *handle)]->Ref();
     } else {
-      *handle = parent_->AddHandle(key, device_name_, func_graphs_.size());
-      func_graphs_.push_back(fbody);
-      items_.resize(func_graphs_.size());
+      *handle = parent_->AddHandle(key, device_name_, next_handle_);
+      Item* item = new Item;
+      item->func_graph = fbody;
+      item->overlay_lib = options.overlay_lib;
+      items_.insert({next_handle_, item});
+      next_handle_++;
     }
   }
   return Status::OK();
 }
 
+Status FunctionLibraryRuntimeImpl::ReleaseHandle(Handle handle) {
+  if (!parent_->IsInstantiatedOnDevice(device_name_, handle)) {
+    return parent_->ReleaseHandle(handle);
+  }
+
+  LocalHandle h = parent_->GetHandleOnDevice(device_name_, handle);
+  mutex_lock l(mu_);
+  CHECK_EQ(1, items_.count(h));
+  Item* item = items_[h];
+  if (item->Unref()) {
+    items_.erase(h);
+    TF_RETURN_IF_ERROR(parent_->RemoveHandle(handle));
+  }
+  return Status::OK();
+}
+
 void DumpGraph(StringPiece label, const Graph* g) {
   // TODO(zhifengc): Change Graph to record #nodes.
   VLOG(1) << "Graph " << label << " #nodes " << g->num_nodes() << " #edges "
@@ -506,12 +568,47 @@ void OptimizeGraph(FunctionLibraryRuntime* lib, std::unique_ptr<Graph>* g) {
   optimizer.Optimize(lib, lib->env(), lib->device(), g, /*shape_map=*/nullptr);
 }
 
+namespace {
+// Removes all stateless nodes that do not contribute to a return
+// value from the function body.  Unlike `RemoveDeadNodes()`, which is
+// triggered by `OptimizerOptions.do_function_inlining`, this pass
+// ignores the SINK node, from which (by definition) all nodes are
+// reverse reachable.
+void PruneFunctionBody(Graph* g) {
+  VLOG(2) << "Pruning function body";
+  std::unordered_set<const Node*> nodes;
+  for (auto n : g->nodes()) {
+    // NOTE(mrry): "_Retval" nodes are stateful, and so will be added
+    // to the seed set of `nodes`.
+    // TODO(mrry): Investigate whether the `n->IsControlFlow()` test is
+    // still needed. It would be preferable to prune entire loops and/or
+    // conditionals if they are not used in the graph.
+    if (n->IsControlFlow() || n->op_def().is_stateful()) {
+      nodes.insert(n);
+    }
+  }
+  bool changed = PruneForReverseReachability(g, std::move(nodes));
+  if (changed) {
+    FixupSourceAndSinkEdges(g);
+  }
+}
+}  // namespace
+
 Status FunctionLibraryRuntimeImpl::CreateItem(Handle handle, Item** item) {
-  const FunctionBody* fbody = GetFunctionBody(handle);
-  CHECK_NOTNULL(fbody);
-  std::unique_ptr<Graph> g(new Graph(lib_def_));
+  const FunctionBody* fbody;
+  const FunctionLibraryDefinition* lib_def;
+  {
+    mutex_lock l(mu_);
+    fbody = (*item)->func_graph;
+    lib_def = (*item)->overlay_lib;
+  }
+  if (!lib_def) {
+    lib_def = base_lib_def_;
+  }
+  std::unique_ptr<Graph> g(new Graph(lib_def));
   CopyGraph(*fbody->graph, g.get());
 
+  PruneFunctionBody(g.get());
   optimizer_.Optimize(this, env(), device(), &g, /*shape_map=*/nullptr);
   TF_RETURN_IF_ERROR(EnsureMemoryTypes(DeviceType(device()->device_type()),
                                        device()->name(), g.get()));
@@ -521,17 +618,31 @@ Status FunctionLibraryRuntimeImpl::CreateItem(Handle handle, Item** item) {
   LocalExecutorParams params;
   params.device = device_;
   params.function_library = this;
-  params.create_kernel = create_kernel_;
+  if (lib_def == base_lib_def_) {
+    params.create_kernel = create_kernel_;
+  } else {
+    params.create_kernel = [this, lib_def](const NodeDef& ndef,
+                                           OpKernel** kernel) {
+      return CreateKernel(ndef, lib_def, kernel);
+    };
+  }
   params.delete_kernel = [](OpKernel* kernel) {
     DeleteNonCachedKernel(kernel);
   };
   Graph* graph = g.get();
   Executor* exec;
-  TF_RETURN_IF_ERROR(NewLocalExecutor(params, g.release(), &exec));
+  TF_RETURN_IF_ERROR(NewLocalExecutor(params, std::move(g), &exec));
 
-  *item = new Item;
-  (*item)->graph = graph;
-  (*item)->exec = exec;
+  {
+    // Guard item since it is already inserted in items_.
+    mutex_lock l(mu_);
+    if ((*item)->exec) {
+      delete exec;
+    } else {
+      (*item)->graph = graph;
+      (*item)->exec = exec;
+    }
+  }
   return Status::OK();
 }
 
@@ -539,29 +650,18 @@ Status FunctionLibraryRuntimeImpl::GetOrCreateItem(Handle handle, Item** item) {
   LocalHandle local_handle = parent_->GetHandleOnDevice(device_name_, handle);
   {
     mutex_lock l(mu_);
-    if (local_handle >= items_.size()) {
+    if (items_.count(local_handle) == 0) {
       return errors::NotFound("Function handle ", handle,
                               " is not valid. Likely an internal error.");
     }
     *item = items_[local_handle];
-    if (*item != nullptr) {
-      (*item)->Ref();
+    if ((*item)->exec != nullptr) {
       return Status::OK();
     }
   }
   // NOTE: We need to call CreateItem out of mu_ because creating an
   // executor needs to call CreateKernel.
-  TF_RETURN_IF_ERROR(CreateItem(handle, item));
-
-  {
-    mutex_lock l(mu_);
-    if (items_[local_handle] == nullptr) {
-      // Install *item in items_.
-      items_[local_handle] = *item;
-      (*item)->Ref();
-    }
-  }
-  return Status::OK();
+  return CreateItem(handle, item);
 }
 
 void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
@@ -569,14 +669,13 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
                                            std::vector<Tensor>* rets,
                                            Executor::Args* exec_args,
                                            Item* item, DoneCallback done) {
-  FunctionCallFrame* frame = exec_args->call_frame;
+  DCHECK(exec_args->call_frame == nullptr);
   string target_device = parent_->GetDeviceName(handle);
   string source_device = opts.source_device;
   Rendezvous* rendezvous = opts.rendezvous;
   DeviceContext* device_context;
   Status s = parent_->GetDeviceContext(target_device, &device_context);
   if (!s.ok()) {
-    delete frame;
     delete exec_args;
     done(s);
     return;
@@ -584,6 +683,16 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
   int64 src_incarnation, target_incarnation;
   s = parent_->GetDeviceIncarnation(source_device, &src_incarnation);
   s.Update(parent_->GetDeviceIncarnation(target_device, &target_incarnation));
+  if (!s.ok()) {
+    delete exec_args;
+    done(s);
+    return;
+  }
+
+  const FunctionBody* fbody = GetFunctionBody(handle);
+  FunctionCallFrame* frame =
+      new FunctionCallFrame(fbody->arg_types, fbody->ret_types);
+  exec_args->call_frame = frame;
   if (!s.ok()) {
     delete frame;
     delete exec_args;
@@ -617,7 +726,6 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
             *exec_args, [item, frame, rets, done, source_device, target_device,
                          target_incarnation, rendezvous, device_context,
                          remote_args, exec_args](const Status& status) {
-              item->Unref();
               Status s = status;
               if (s.ok()) {
                 s = frame->ConsumeRetvals(rets);
@@ -661,17 +769,7 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
     parent_->Run(run_opts, handle, args, rets, done);
     return;
   }
-  const FunctionBody* fbody = GetFunctionBody(handle);
-  FunctionCallFrame* frame =
-      new FunctionCallFrame(fbody->arg_types, fbody->ret_types);
 
-  Item* item = nullptr;
-  Status s = GetOrCreateItem(handle, &item);
-  if (!s.ok()) {
-    delete frame;
-    done(s);
-    return;
-  }
   DCHECK(run_opts.runner != nullptr);
 
   Executor::Args* exec_args = new Executor::Args;
@@ -679,16 +777,28 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
   exec_args->step_id = run_opts.step_id;
   exec_args->rendezvous = run_opts.rendezvous;
   exec_args->stats_collector = run_opts.stats_collector;
-  exec_args->call_frame = frame;
   exec_args->cancellation_manager = run_opts.cancellation_manager;
   exec_args->step_container = run_opts.step_container;
   exec_args->runner = *run_opts.runner;
 
+  Item* item = nullptr;
+  Status s = GetOrCreateItem(handle, &item);
+  if (!s.ok()) {
+    delete exec_args;
+    done(s);
+    return;
+  }
+
   if (run_opts.remote_execution) {
+    // NOTE(mrry): `RunRemote()` will set `exec_args->call_frame` for us.
     RunRemote(run_opts, handle, args, rets, exec_args, item, done);
     return;
   }
 
+  const FunctionBody* fbody = GetFunctionBody(handle);
+  FunctionCallFrame* frame =
+      new FunctionCallFrame(fbody->arg_types, fbody->ret_types);
+  exec_args->call_frame = frame;
   s = frame->SetArgs(args);
   if (!s.ok()) {
     delete frame;
@@ -696,12 +806,12 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
     done(s);
     return;
   }
+
   item->exec->RunAsync(
       // Executor args
       *exec_args,
       // Done callback.
       [item, frame, rets, done, exec_args](const Status& status) {
-        item->Unref();
         Status s = status;
         if (s.ok()) {
           s = frame->ConsumeRetvals(rets);
@@ -712,9 +822,69 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
       });
 }
 
+void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
+                                     CallFrameInterface* frame,
+                                     DoneCallback done) {
+  if (opts.cancellation_manager && opts.cancellation_manager->IsCancelled()) {
+    done(errors::Cancelled(""));
+    return;
+  }
+  if (!parent_->IsInstantiatedOnDevice(device_name_, handle) ||
+      opts.remote_execution) {
+    done(errors::Unimplemented("Remote calling with CallFrameInterface"));
+    return;
+  }
+
+  Options run_opts = opts;
+  if (opts.create_rendezvous) {
+    Rendezvous* rendezvous = new IntraProcessRendezvous(device_mgr_);
+    run_opts.rendezvous = rendezvous;
+    run_opts.create_rendezvous = false;
+    done = std::bind(
+        [rendezvous](DoneCallback done,
+                     // Begin unbound arguments.
+                     const Status& status) {
+          rendezvous->Unref();
+          done(status);
+        },
+        std::move(done), std::placeholders::_1);
+  }
+
+  Item* item = nullptr;
+  Status s = GetOrCreateItem(handle, &item);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
+  DCHECK(run_opts.runner != nullptr);
+
+  Executor::Args* exec_args = new Executor::Args;
+  // Inherit the step_id from the caller.
+  exec_args->step_id = run_opts.step_id;
+  exec_args->rendezvous = run_opts.rendezvous;
+  exec_args->stats_collector = run_opts.stats_collector;
+  exec_args->cancellation_manager = run_opts.cancellation_manager;
+  exec_args->step_container = run_opts.step_container;
+  exec_args->runner = *run_opts.runner;
+  exec_args->call_frame = frame;
+
+  item->exec->RunAsync(
+      // Executor args
+      *exec_args,
+      // Done callback.
+      std::bind(
+          [item, frame, exec_args](DoneCallback done,
+                                   // Start unbound arguments.
+                                   const Status& status) {
+            delete exec_args;
+            done(status);
+          },
+          std::move(done), std::placeholders::_1));
+}
+
 bool FunctionLibraryRuntimeImpl::IsStateful(const string& func) {
   const OpDef* op_def;
-  const Status s = lib_def_->LookUpOpDef(func, &op_def);
+  const Status s = base_lib_def_->LookUpOpDef(func, &op_def);
   return s.ok() && op_def->is_stateful();
 }
 
@@ -728,6 +898,21 @@ string FunctionLibraryRuntimeImpl::DebugString(Handle handle) {
   }
 }
 
+Status FunctionLibraryRuntimeImpl::Clone(
+    std::unique_ptr<FunctionLibraryDefinition>* out_lib_def,
+    std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr,
+    FunctionLibraryRuntime** out_flr) {
+  TF_RETURN_IF_ERROR(
+      parent_->Clone(env_, graph_def_version_, optimizer_.options(),
+                     custom_kernel_creator_, out_lib_def, out_pflr));
+  *out_flr = (*out_pflr)->GetFLR(device_->name());
+  if (out_flr != nullptr) {
+    return Status::OK();
+  } else {
+    return errors::Internal("Cloning FunctionLibraryRuntime failed.");
+  }
+}
+
 namespace {
 
 struct CustomCreatorSingleton {
@@ -1387,17 +1572,23 @@ Status FunctionDefToBodyHelper(
   InstantiationResult result;
   TF_RETURN_IF_ERROR(InstantiateFunction(fdef, attrs, get_func_sig, &result));
 
-  Graph* graph = new Graph(lib_def);
+  std::unique_ptr<Graph> graph(new Graph(lib_def));
   GraphConstructorOptions opts;
   opts.allow_internal_ops = true;
   opts.expect_device_spec = false;
-  Status s = ConvertNodeDefsToGraph(opts, result.nodes, graph);
-  if (!s.ok()) {
-    delete graph;
-  } else {
-    *fbody = new FunctionBody(fdef, result.arg_types, result.ret_types, graph);
-  }
-  return s;
+  TF_RETURN_IF_ERROR(ConvertNodeDefsToGraph(opts, result.nodes, graph.get()));
+
+  // Call BuildControlFlowInfo to validate that this function body has
+  // well-formed control flow.
+  // NOTE(skyewm): this is usually done in Partition(), but we don't partition
+  // function bodies. This should be removed if function bodies ever go through
+  // the Partition() path.
+  std::vector<ControlFlowInfo> dummy;
+  TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph.get(), &dummy));
+
+  *fbody = new FunctionBody(fdef, result.arg_types, result.ret_types,
+                            graph.release());
+  return Status::OK();
 }
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index d183bf7c978f1a39882b6f2f0a94386e25e5f0cd..63ad0d231c28a5af144b61e967a73e8ecfe6049a 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/executor.h"
 #include "tensorflow/core/common_runtime/function_testlib.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/op.h"
@@ -70,11 +71,11 @@ class FunctionTest : public ::testing::Test {
     arg_types_ = result.arg_types;
     ret_types_ = result.ret_types;
 
-    Graph* g = new Graph(OpRegistry::Global());
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
     GraphConstructorOptions opts;
     opts.allow_internal_ops = true;
     opts.expect_device_spec = false;
-    TF_CHECK_OK(ConvertNodeDefsToGraph(opts, result.nodes, g));
+    TF_CHECK_OK(ConvertNodeDefsToGraph(opts, result.nodes, g.get()));
 
     const int version = g->versions().producer();
     LocalExecutorParams params;
@@ -88,7 +89,7 @@ class FunctionTest : public ::testing::Test {
       DeleteNonCachedKernel(kernel);
     };
     Executor* exec;
-    TF_CHECK_OK(NewLocalExecutor(params, g, &exec));
+    TF_CHECK_OK(NewLocalExecutor(params, std::move(g), &exec));
     exec_.reset(exec);
   }
 
@@ -190,24 +191,113 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
   Status Instantiate(FunctionLibraryRuntime* flr, const string& name,
                      test::function::Attrs attrs,
                      FunctionLibraryRuntime::Handle* handle) {
-    Status status = flr->Instantiate(name, attrs, handle);
-    if (!status.ok()) {
-      return status;
-    }
-    return Status::OK();
+    return flr->Instantiate(name, attrs, handle);
+  }
+
+  Status Instantiate(FunctionLibraryRuntime* flr, const string& name,
+                     test::function::Attrs attrs,
+                     const FunctionLibraryRuntime::InstantiateOptions& options,
+                     FunctionLibraryRuntime::Handle* handle) {
+    return flr->Instantiate(name, attrs, options, handle);
   }
 
   Status InstantiateAndRun(FunctionLibraryRuntime* flr, const string& name,
                            test::function::Attrs attrs,
                            const std::vector<Tensor>& args,
                            std::vector<Tensor*> rets) {
+    return InstantiateAndRun(flr, name, attrs,
+                             FunctionLibraryRuntime::InstantiateOptions(), args,
+                             std::move(rets));
+  }
+
+  Status InstantiateAndRun(
+      FunctionLibraryRuntime* flr, const string& name,
+      test::function::Attrs attrs,
+      const FunctionLibraryRuntime::InstantiateOptions& options,
+      const std::vector<Tensor>& args, std::vector<Tensor*> rets) {
+    FunctionLibraryRuntime::Handle handle;
+    Status status = flr->Instantiate(name, attrs, options, &handle);
+    if (!status.ok()) {
+      return status;
+    }
+    FunctionLibraryRuntime::Options opts;
+    status = Run(flr, handle, opts, args, rets);
+    if (!status.ok()) return status;
+
+    // Release the handle and try running again. It should not succeed.
+    status = flr->ReleaseHandle(handle);
+    if (!status.ok()) return status;
+
+    Status status2 = Run(flr, handle, opts, args, std::move(rets));
+    EXPECT_TRUE(errors::IsInvalidArgument(status2));
+    EXPECT_TRUE(
+        StringPiece(status2.error_message()).contains("remote execution."));
+
+    return status;
+  }
+
+  Status Run(FunctionLibraryRuntime* flr, FunctionLibraryRuntime::Handle handle,
+             FunctionLibraryRuntime::Options opts, CallFrameInterface* frame) {
+    std::atomic<int32> call_count(0);
+    std::function<void(std::function<void()>)> runner =
+        [&call_count](std::function<void()> fn) {
+          ++call_count;
+          test::function::FunctionTestSchedClosure(fn);
+        };
+
+    Notification done;
+    opts.runner = &runner;
+    std::vector<Tensor> out;
+    Status status;
+    flr->Run(opts, handle, frame, [&status, &done](const Status& s) {
+      status = s;
+      done.Notify();
+    });
+    done.WaitForNotification();
+    if (!status.ok()) {
+      return status;
+    }
+
+    EXPECT_GE(call_count, 1);  // Test runner is used.
+
+    return Status::OK();
+  }
+
+  Status InstantiateAndRunViaCallFrameInterface(FunctionLibraryRuntime* flr,
+                                                const string& name,
+                                                test::function::Attrs attrs,
+                                                const std::vector<Tensor>& args,
+                                                std::vector<Tensor*> rets) {
     FunctionLibraryRuntime::Handle handle;
     Status status = flr->Instantiate(name, attrs, &handle);
     if (!status.ok()) {
       return status;
     }
+    const FunctionBody* fbody = flr->GetFunctionBody(handle);
+    FunctionCallFrame frame(fbody->arg_types, fbody->ret_types);
+    TF_RETURN_IF_ERROR(frame.SetArgs(args));
+
     FunctionLibraryRuntime::Options opts;
-    return Run(flr, handle, opts, args, std::move(rets));
+    status = Run(flr, handle, opts, &frame);
+    if (!status.ok()) return status;
+
+    std::vector<Tensor> retvals;
+    TF_RETURN_IF_ERROR(frame.GetRetvals(&retvals));
+    CHECK_EQ(rets.size(), retvals.size());
+    for (size_t i = 0; i < rets.size(); ++i) {
+      *rets[i] = retvals[i];
+    }
+
+    // Release the handle and try running again. It should not succeed.
+    status = flr->ReleaseHandle(handle);
+    if (!status.ok()) return status;
+
+    Status status2 = Run(flr, handle, opts, args, std::move(rets));
+    EXPECT_TRUE(errors::IsInvalidArgument(status2));
+    EXPECT_TRUE(
+        StringPiece(status2.error_message()).contains("remote execution."));
+
+    return status;
   }
 
   std::unique_ptr<Graph> GetFuncBody(FunctionLibraryRuntime* flr,
@@ -268,6 +358,9 @@ TEST_F(FunctionLibraryRuntimeTest, XTimesTwo) {
   TF_CHECK_OK(
       InstantiateAndRun(flr0_, "XTimesTwo", {{"T", DT_FLOAT}}, {x}, {&y}));
   test::ExpectTensorEqual<float>(y, test::AsTensor<float>({2, 4, 6, 8}));
+  TF_CHECK_OK(InstantiateAndRunViaCallFrameInterface(
+      flr0_, "XTimesTwo", {{"T", DT_FLOAT}}, {x}, {&y}));
+  test::ExpectTensorEqual<float>(y, test::AsTensor<float>({2, 4, 6, 8}));
 }
 
 TEST_F(FunctionLibraryRuntimeTest, XTimesN) {
@@ -286,6 +379,142 @@ TEST_F(FunctionLibraryRuntimeTest, XTimesN) {
   test::ExpectTensorEqual<float>(y, test::AsTensor<float>({16, 32, 48, 64}));
 }
 
+TEST_F(FunctionLibraryRuntimeTest, XTimesNInOverlayLib) {
+  Init({});
+  FunctionDefLibrary proto;
+  *proto.add_function() = test::function::XTimesTwo();
+  *proto.add_function() = test::function::XTimesFour();
+  *proto.add_function() = test::function::XTimes16();
+  std::unique_ptr<FunctionLibraryDefinition> overlay_lib(
+      new FunctionLibraryDefinition(OpRegistry::Global(), proto));
+
+  FunctionLibraryRuntime::InstantiateOptions options;
+  options.overlay_lib = overlay_lib.get();
+
+  auto x = test::AsTensor<float>({1, 2, 3, 4});
+  Tensor y;
+
+  // Ensure that the function is not installed in the base library.
+  HasError(InstantiateAndRun(flr0_, "XTimesTwo", {{"T", DT_FLOAT}},
+                             {} /* options */, {x}, {&y}),
+           "Not found: Function XTimesTwo is not defined.");
+
+  TF_CHECK_OK(InstantiateAndRun(flr0_, "XTimesTwo", {{"T", DT_FLOAT}}, options,
+                                {x}, {&y}));
+  test::ExpectTensorEqual<float>(y, test::AsTensor<float>({2, 4, 6, 8}));
+  TF_CHECK_OK(InstantiateAndRun(flr0_, "XTimesFour", {{"T", DT_FLOAT}}, options,
+                                {x}, {&y}));
+  test::ExpectTensorEqual<float>(y, test::AsTensor<float>({4, 8, 12, 16}));
+  TF_CHECK_OK(InstantiateAndRun(flr0_, "XTimes16", {{"T", DT_FLOAT}}, options,
+                                {x}, {&y}));
+  test::ExpectTensorEqual<float>(y, test::AsTensor<float>({16, 32, 48, 64}));
+
+  // Ensure that the use of the overlay has not leaked into the base library.
+  HasError(InstantiateAndRun(flr0_, "XTimesTwo", {{"T", DT_FLOAT}},
+                             {} /* options */, {x}, {&y}),
+           "Not found: Function XTimesTwo is not defined.");
+}
+
+TEST_F(FunctionLibraryRuntimeTest, StateHandle) {
+  auto T = DT_INT32;
+
+  // The expected sequence of outputs from this function is [6, 4, 0, 1, ...].
+  FunctionDef stateful_func = FDH::Define(
+      // Name
+      "RandomUniformWrapper",
+      // Args
+      {},
+      // Return values
+      {"y: int32"},
+      // Attrs
+      {},
+      // Nodes
+      {FDH::Const<int32>("shape", gtl::ArraySlice<int32>({1})),
+       FDH::Const<int32>("minval", 0),
+       FDH::Const<int32>("maxval", 10),
+       // A stateful node.
+       {{"y"},
+        "RandomUniformInt",
+        {"shape", "minval", "maxval"},
+        {{"seed", 37}, {"seed2", 48}, {"Tout", T}, {"T", T}}}});
+  Init({stateful_func});
+
+  FunctionLibraryRuntime::Handle handle;
+  TF_CHECK_OK(Instantiate(flr0_, "RandomUniformWrapper", {}, &handle));
+
+  FunctionLibraryRuntime::Options opts;
+  Tensor y;
+  {
+    // Simple case: instantiating with no state_handle.
+    for (int32 expected : {6, 4}) {
+      TF_CHECK_OK(Run(flr0_, handle, opts, {}, {&y}));
+      test::ExpectTensorEqual<int>(y, test::AsTensor<int32>({expected}));
+    }
+  }
+
+  {
+    // Instantiating again with no state_handle should yield the same handle and
+    // the continuation of the same sequence.
+    FunctionLibraryRuntime::Handle handle_non_isolated;
+    TF_CHECK_OK(
+        Instantiate(flr0_, "RandomUniformWrapper", {}, &handle_non_isolated));
+    EXPECT_EQ(handle, handle_non_isolated);
+    for (int32 expected : {0, 1}) {
+      TF_CHECK_OK(Run(flr0_, handle_non_isolated, opts, {}, {&y}));
+      test::ExpectTensorEqual<int>(y, test::AsTensor<int32>({expected}));
+    }
+  }
+
+  {
+    // Instantiating with a given state handle will create new state and yield
+    // the original sequence.
+    FunctionLibraryRuntime::InstantiateOptions options;
+    FunctionLibraryRuntime::Handle handle_isolated;
+    options.state_handle = "handle_1";
+    TF_CHECK_OK(Instantiate(flr0_, "RandomUniformWrapper", {}, options,
+                            &handle_isolated));
+    EXPECT_NE(handle, handle_isolated);
+    for (int32 expected : {6, 4, 0, 1}) {
+      TF_CHECK_OK(Run(flr0_, handle_isolated, opts, {}, {&y}));
+      test::ExpectTensorEqual<int>(y, test::AsTensor<int32>({expected}));
+    }
+  }
+
+  {
+    // Instantiating with a different given state handle will create new state
+    // and yield the original sequence.
+    FunctionLibraryRuntime::InstantiateOptions options;
+    FunctionLibraryRuntime::Handle handle_isolated;
+    options.state_handle = "handle_2";
+    TF_CHECK_OK(Instantiate(flr0_, "RandomUniformWrapper", {}, options,
+                            &handle_isolated));
+    EXPECT_NE(handle, handle_isolated);
+    for (int32 expected : {6, 4, 0, 1}) {
+      TF_CHECK_OK(Run(flr0_, handle_isolated, opts, {}, {&y}));
+      test::ExpectTensorEqual<int>(y, test::AsTensor<int32>({expected}));
+    }
+  }
+
+  {
+    // Reinstantiating after releasing a handle will yield the original sequence
+    // multiple times.
+    FunctionLibraryRuntime::InstantiateOptions options;
+    FunctionLibraryRuntime::Handle handle_isolated;
+    options.state_handle = "handle_3";
+
+    for (int i = 0; i < 2; ++i) {
+      TF_CHECK_OK(Instantiate(flr0_, "RandomUniformWrapper", {}, options,
+                              &handle_isolated));
+      EXPECT_NE(handle, handle_isolated);
+      for (int32 expected : {6, 4, 0, 1}) {
+        TF_CHECK_OK(Run(flr0_, handle_isolated, opts, {}, {&y}));
+        test::ExpectTensorEqual<int>(y, test::AsTensor<int32>({expected}));
+      }
+      TF_CHECK_OK(flr0_->ReleaseHandle(handle_isolated));
+    }
+  }
+}
+
 TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctions) {
   Init({test::function::XTimesTwo(), test::function::XTimesFour(),
         test::function::XTimes16()});
@@ -487,6 +716,66 @@ TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctionsWithControlDeps) {
   }
 }
 
+TEST_F(FunctionLibraryRuntimeTest, PruneBody) {
+  auto T = DT_INT32;
+  FunctionDef stateful_func = FDH::Define(
+      // Name
+      "SquareAndAddOneWithStatefulNodes",
+      // Args
+      {"x: int32"},
+      // Return values
+      {"y: int32"},
+      // Attrs
+      {},
+      // Nodes
+      {// a = Square<T>(x)
+       {{"a"}, "Square", {"x"}, {{"T", T}}},
+       // 1
+       FDH::Const("o", 1),
+       // A bunch of extra arithmetic that y doesn't depend on
+       {{"x1"}, "Add", {"o", "o"}, {{"T", T}}},
+       {{"x2"}, "Mul", {"a", "x1"}, {{"T", T}}},
+       {{"x3"}, "Mul", {"x1", "x2"}, {{"T", T}}},
+       FDH::Const<int32>("shape", {1, 2}),
+       // A stateful node.
+       {{"keep_me"},
+        "RandomUniform",
+        {"shape"},
+        {{"T", T}, {"dtype", DT_FLOAT}}},
+       // y = Add<T>(a, o)
+       {{"y"}, "Add", {"a", "o"}, {{"T", T}}}});
+  Init({stateful_func});
+
+  auto x = test::AsTensor<int32>({1, 2, 3, 4});
+  Tensor y;
+
+  FunctionLibraryRuntime::Handle handle;
+  TF_CHECK_OK(
+      Instantiate(flr0_, "SquareAndAddOneWithStatefulNodes", {}, &handle));
+
+  StepStats stats;
+  StepStatsCollector stats_collector(&stats);
+  FunctionLibraryRuntime::Options opts;
+  opts.stats_collector = &stats_collector;
+  TF_CHECK_OK(Run(flr0_, handle, opts, {x}, {&y}));
+  TF_CHECK_OK(flr0_->ReleaseHandle(handle));
+
+  TF_CHECK_OK(InstantiateAndRun(flr0_, "SquareAndAddOneWithStatefulNodes", {},
+                                {x}, {&y}));
+  test::ExpectTensorEqual<int>(y, test::AsTensor<int32>({2, 5, 10, 17}));
+
+  stats_collector.FinalizeAndSwap(&stats);
+
+  // Note that we do not expect the nodes named "x1", "x2", or "x3" to execute.
+  std::set<string> expected_node_names(
+      {"_SOURCE", "shape", "x", "o", "a", "keep_me", "y", "y_RetVal"});
+  std::set<string> executed_node_names;
+  for (const auto& node_stats : stats.dev_stats()[0].node_stats()) {
+    executed_node_names.insert(node_stats.node_name());
+  }
+  EXPECT_EQ(expected_node_names, executed_node_names);
+}
+
 TEST_F(FunctionLibraryRuntimeTest, OptimizeGraph) {
   Init({test::function::XTimesTwo(), test::function::XTimesFour(),
         test::function::XTimes16()});
@@ -498,7 +787,7 @@ TEST_F(FunctionLibraryRuntimeTest, OptimizeGraph) {
     Scope s = Scope::NewRootScope();
     auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
     auto x4_x2_scale = ops::Const<float>(
-        s.WithOpName("x4/x2/scale/_12__cf__2")
+        s.WithOpName("x4/x2/scale/_12__cf__6")
             .WithDevice("/job:localhost/replica:0/task:0/device:CPU:0"),
         2.0f);
     auto x4_x2_y = ops::Mul(s.WithOpName("x4/x2/y"), x, x4_x2_scale);
@@ -644,6 +933,16 @@ TEST_F(FunctionLibraryRuntimeTest, Error_InstantiaionError) {
            "type attr not found");
 }
 
+TEST_F(FunctionLibraryRuntimeTest, Error_BadControlFlow) {
+  Init({test::function::InvalidControlFlow()});
+  auto x = test::AsTensor<int32>({0});
+  DCHECK_EQ(x.dtype(), DT_INT32);
+  Tensor y;
+  HasError(InstantiateAndRun(flr0_, "InvalidControlFlow", {}, {x}, {&y}),
+           "The node 'add' has inputs from different frames. The input 'enter' "
+           "is in frame 'while'. The input 'i' is in frame ''.");
+}
+
 TEST_F(FunctionLibraryRuntimeTest, Gradient_XTimesTwo) {
   Init({test::function::XTimesTwo(), test::function::XTimesFour(),
         test::function::XTimes16()});
@@ -694,13 +993,13 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_XTimesTwo) {
     auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
     auto func0 = ops::_Arg(s.WithOpName("Func/_0"), DT_FLOAT, 1);
     auto scale = ops::Const(
-        s.WithOpName("scale/_5__cf__6")
+        s.WithOpName("scale/_6__cf__11")
             .WithDevice("/job:localhost/replica:0/task:0/device:CPU:0"),
         2.0f);
     auto func1_gx = ops::Mul(s.WithOpName("Func/_1/gx"), func0, scale);
     auto func1_sx = ops::Shape(s.WithOpName("Func/_1/sx"), x);
     auto const0 = ops::Const(
-        s.WithOpName("Func/_1/sy/_6__cf__7")
+        s.WithOpName("Func/_1/sy/_5__cf__10")
             .WithDevice("/job:localhost/replica:0/task:0/device:CPU:0"),
         0, {0});
     auto func1_rx = ops::internal::BroadcastGradientArgs(
@@ -938,9 +1237,10 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_AddSum) {
 
 TEST_F(FunctionLibraryRuntimeTest, CrossDevice) {
   Init({test::function::FindDevice()});
+  FunctionLibraryRuntime::InstantiateOptions instantiate_opts;
+  instantiate_opts.target = "/device:CPU:1";
   FunctionLibraryRuntime::Handle handle;
-  TF_CHECK_OK(Instantiate(flr0_, "FindDevice", {{"_target", "/device:CPU:1"}},
-                          &handle));
+  TF_CHECK_OK(Instantiate(flr0_, "FindDevice", {}, instantiate_opts, &handle));
 
   Tensor y;
   FunctionLibraryRuntime::Options opts;
diff --git a/tensorflow/core/common_runtime/function_testlib.h b/tensorflow/core/common_runtime/function_testlib.h
index 0bf6699f5aa13b7f125f7f3bb2c1781c90ee9ed9..3ddb26de929dc19792142dffde345672aafaadce 100644
--- a/tensorflow/core/common_runtime/function_testlib.h
+++ b/tensorflow/core/common_runtime/function_testlib.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_TESTLIB_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_TESTLIB_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_TESTLIB_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_TESTLIB_H_
 
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/core/framework/function.h"
@@ -34,4 +34,4 @@ Output Call(Scope* scope, const string& op_name, const string& fn_name,
 }  // namespace test
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_TESTLIB_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_FUNCTION_TESTLIB_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
index 646cd88a3a340a7ce3d85f19cb55fea27d9dc1b2..2f7fbbbec2a285976701b94c426bc3f870c65cf5 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc
@@ -15,20 +15,23 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
 
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
 
-GPUBFCAllocator::GPUBFCAllocator(int device_id, size_t total_memory)
-    : GPUBFCAllocator(device_id, total_memory, GPUOptions()) {}
+GPUBFCAllocator::GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
+                                 const string& name)
+    : GPUBFCAllocator(cuda_gpu_id, total_memory, GPUOptions(), name) {}
 
-GPUBFCAllocator::GPUBFCAllocator(int device_id, size_t total_memory,
-                                 const GPUOptions& gpu_options)
+GPUBFCAllocator::GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
+                                 const GPUOptions& gpu_options,
+                                 const string& name)
     : BFCAllocator(
           new GPUMemAllocator(
-              GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie()),
-          total_memory, gpu_options.allow_growth(),
-          strings::StrCat("GPU_", device_id, "_bfc")) {}
+              GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie()),
+          total_memory, gpu_options.allow_growth(), name) {}
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
index 2c23340b6d3fb26a38304fd2d0544dcdfcdeeb5e..c2c0b020c7409e7be168d42e83579a2ff3c29a60 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/allocator_retry.h"
 #include "tensorflow/core/common_runtime/bfc_allocator.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
@@ -36,11 +37,12 @@ namespace tensorflow {
 // algorithm.
 class GPUBFCAllocator : public BFCAllocator {
  public:
-  // 'device_id' refers to the StreamExecutor ID of the device within
+  // 'cuda_gpu_id' refers to the ID of the GPU device within
   // the process and must reference a valid ID in the process.
-  GPUBFCAllocator(int device_id, size_t total_memory);
-  GPUBFCAllocator(int device_id, size_t total_memory,
-                  const GPUOptions& gpu_options);
+  GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
+                  const string& name);
+  GPUBFCAllocator(CudaGpuId cuda_gpu_id, size_t total_memory,
+                  const GPUOptions& gpu_options, const string& name);
   virtual ~GPUBFCAllocator() {}
 
   TF_DISALLOW_COPY_AND_ASSIGN(GPUBFCAllocator);
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
index 00ef130d34bbbe06ad9dabae124ff3fa0d38450a..67caeb3495c6b0600f12c9b20ef73ee90f8b3e0d 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <algorithm>
 #include <vector>
 
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -45,7 +46,7 @@ static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
 }
 
 TEST(GPUBFCAllocatorTest, NoDups) {
-  GPUBFCAllocator a(0, 1 << 30);
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
   CheckStats(&a, 0, 0, 0, 0);
 
   // Allocate a lot of raw pointers
@@ -74,7 +75,7 @@ TEST(GPUBFCAllocatorTest, NoDups) {
 }
 
 TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
-  GPUBFCAllocator a(0, 1 << 30);
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
   // Allocate 256 raw pointers of sizes between 100 bytes and about
   // a meg
   random::PhiloxRandom philox(123, 17);
@@ -132,7 +133,7 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
 }
 
 TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
-  GPUBFCAllocator a(0, 1 << 30);
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
   CheckStats(&a, 0, 0, 0, 0);
 
   float* first_ptr = a.Allocate<float>(1024);
@@ -153,8 +154,9 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
     a.DeallocateRaw(t3);
     a.DeallocateRaw(t4);
   }
-  CheckStats(&a, 4097, 0, 1024 * sizeof(float) + 1048576 * sizeof(int64) +
-                              2048 * sizeof(double) + 10485760 * sizeof(float),
+  CheckStats(&a, 4097, 0,
+             1024 * sizeof(float) + 1048576 * sizeof(int64) +
+                 2048 * sizeof(double) + 10485760 * sizeof(float),
              10485760 * sizeof(float));
 
   // At the end, we should have coalesced all memory into one region
@@ -166,18 +168,18 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
 }
 
 TEST(GPUBFCAllocatorTest, AllocateZeroBufSize) {
-  GPUBFCAllocator a(0, 1 << 30);
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
   float* ptr = a.Allocate<float>(0);
   EXPECT_EQ(nullptr, ptr);
 }
 
 TEST(GPUBFCAllocatorTest, TracksSizes) {
-  GPUBFCAllocator a(0, 1 << 30);
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
   EXPECT_EQ(true, a.TracksAllocationSizes());
 }
 
 TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
-  GPUBFCAllocator a(0, 1 << 30);
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
   float* t1 = a.Allocate<float>(1);
   EXPECT_EQ(4, a.RequestedSize(t1));
   EXPECT_EQ(256, a.AllocatedSize(t1));
@@ -186,7 +188,7 @@ TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
 
 TEST(GPUBFCAllocatorTest, TestCustomMemoryLimit) {
   // Configure a 1MiB byte limit
-  GPUBFCAllocator a(0, 1 << 20);
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 20, "GPU_0_bfc");
 
   float* first_ptr = a.Allocate<float>(1 << 6);
   float* second_ptr = a.Allocate<float>(1 << 20);
@@ -201,7 +203,7 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
   options.set_allow_growth(true);
 
   // Max of 2GiB, but starts out small.
-  GPUBFCAllocator a(0, 1LL << 31, options);
+  GPUBFCAllocator a(CudaGpuId(0), 1LL << 31, options, "GPU_0_bfc");
 
   // Allocate 10 raw pointers of sizes between 100 bytes and about
   // 64 megs.
@@ -262,8 +264,8 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
 }
 
 TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
-  GPUBFCAllocator a(0, 1UL << 60);
-  GPUBFCAllocator b(0, 1UL << 60);
+  GPUBFCAllocator a(CudaGpuId(0), 1UL << 60, "GPU_0_bfc");
+  GPUBFCAllocator b(CudaGpuId(0), 1UL << 60, "GPU_0_bfc");
   void* amem = a.AllocateRaw(1, 1);
   void* bmem = b.AllocateRaw(1, 1 << 30);
   a.DeallocateRaw(amem);
@@ -271,7 +273,7 @@ TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
 }
 
 static void BM_Allocation(int iters) {
-  GPUBFCAllocator a(0, 1uLL << 33);
+  GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc");
   // Exercise a few different allocation sizes
   std::vector<size_t> sizes = {256,        4096,      16384,    524288,
                                512,        1048576,   10485760, 104857600,
@@ -287,7 +289,7 @@ static void BM_Allocation(int iters) {
 BENCHMARK(BM_Allocation);
 
 static void BM_AllocationThreaded(int iters, int num_threads) {
-  GPUBFCAllocator a(0, 1uLL << 33);
+  GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc");
   thread::ThreadPool pool(Env::Default(), "test", num_threads);
   std::atomic_int_fast32_t count(iters);
   mutex done_lock;
@@ -323,7 +325,7 @@ BENCHMARK(BM_AllocationThreaded)->Arg(1)->Arg(4)->Arg(16);
 // A more complex benchmark that defers deallocation of an object for
 // "delay" allocations.
 static void BM_AllocationDelayed(int iters, int delay) {
-  GPUBFCAllocator a(0, 1 << 30);
+  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
   // Exercise a few different allocation sizes
   std::vector<int> sizes = {256, 4096, 16384, 4096, 512, 1024, 1024};
   int size_index = 0;
@@ -361,7 +363,7 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
   // only methods inside this class can access private members of BFCAllocator.
 
   void TestBinDebugInfo() {
-    GPUBFCAllocator a(0, 1 << 30);
+    GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
 
     std::vector<void*> initial_ptrs;
     std::vector<size_t> initial_ptrs_allocated_sizes;
@@ -439,7 +441,7 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
   }
 
   void TestLog2FloorNonZeroSlow() {
-    GPUBFCAllocator a(0 /* device_id */, 1 /* total_memory */);
+    GPUBFCAllocator a(CudaGpuId(0), 1 /* total_memory */, "GPU_0_bfc");
     EXPECT_EQ(-1, a.Log2FloorNonZeroSlow(0));
     EXPECT_EQ(0, a.Log2FloorNonZeroSlow(1));
     EXPECT_EQ(1, a.Log2FloorNonZeroSlow(2));
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
index 70c2d96763e72909bd1d58ae637d8393f1368197..08961fc1055b67902e85887f69158ece5dd68e76 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
@@ -20,17 +20,17 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h"
 
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/platform/stream_executor.h"
 
-namespace gpu = ::perftools::gputools;
-
 namespace tensorflow {
 
 GPUcudaMallocAllocator::GPUcudaMallocAllocator(VisitableAllocator* allocator,
-                                               int device_id)
+                                               CudaGpuId cuda_gpu_id)
     : base_allocator_(allocator) {
-  stream_exec_ = GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+  stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
 }
 
 GPUcudaMallocAllocator::~GPUcudaMallocAllocator() { delete base_allocator_; }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
index 23552b809a8a735aaeb8ac9643eccd0b0542f03b..208697361d2dfc4f3b8290ea511d15c9bd86857b 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <memory>
 
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor.h"
@@ -30,7 +31,8 @@ namespace tensorflow {
 // allocated memory.
 class GPUcudaMallocAllocator : public VisitableAllocator {
  public:
-  explicit GPUcudaMallocAllocator(VisitableAllocator* allocator, int device_id);
+  explicit GPUcudaMallocAllocator(VisitableAllocator* allocator,
+                                  CudaGpuId cuda_gpu_id);
   ~GPUcudaMallocAllocator() override;
   string Name() override { return "gpu_debug"; }
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
index 6480f0b256b2fe05db5ac5bc8037b4fa216682ac..63ed0b8be16ecb187113311db5283c8d4f3b1a5e 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
@@ -15,20 +15,21 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
 
+#include <cstddef>
 #include <vector>
+
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/platform/stream_executor.h"
 
-namespace gpu = ::perftools::gputools;
-
-namespace tensorflow {
-
 #define MASK_WORDS 2
 #define MASK_BYTES (MASK_WORDS * sizeof(int64))
 
+namespace tensorflow {
 namespace {
 
-static int64* NewMask(int64 word) {
+int64* NewMask(int64 word) {
   int64* m = new int64[MASK_WORDS];
   for (int i = 0; i < MASK_WORDS; ++i) {
     m[i] = word;
@@ -36,8 +37,8 @@ static int64* NewMask(int64 word) {
   return m;
 }
 
-static int64* before_mask = NewMask(0xabababababababab);
-static int64* after_mask = NewMask(0xcdcdcdcdcdcdcdcd);
+int64* before_mask = NewMask(0xabababababababab);
+int64* after_mask = NewMask(0xcdcdcdcdcdcdcdcd);
 
 bool CheckMask(perftools::gputools::StreamExecutor* exec, void* ptr,
                int64* mask) {
@@ -75,9 +76,9 @@ void InitMask(perftools::gputools::StreamExecutor* exec, void* ptr,
 // GPUDebugAllocator
 // -----------------------------------------------------------------------------
 GPUDebugAllocator::GPUDebugAllocator(VisitableAllocator* allocator,
-                                     int device_id)
+                                     CudaGpuId cuda_gpu_id)
     : base_allocator_(allocator) {
-  stream_exec_ = GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+  stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
 }
 
 GPUDebugAllocator::~GPUDebugAllocator() { delete base_allocator_; }
@@ -120,24 +121,28 @@ void GPUDebugAllocator::AddFreeVisitor(Visitor visitor) {
 
 bool GPUDebugAllocator::TracksAllocationSizes() { return true; }
 
-size_t GPUDebugAllocator::RequestedSize(void* ptr) {
-  auto req_size =
-      base_allocator_->RequestedSize(static_cast<char*>(ptr) - MASK_BYTES);
+size_t GPUDebugAllocator::RequestedSize(const void* ptr) {
+  auto req_size = base_allocator_->RequestedSize(static_cast<const char*>(ptr) -
+                                                 MASK_BYTES);
   return req_size - 2 * MASK_BYTES;
 }
 
-size_t GPUDebugAllocator::AllocatedSize(void* ptr) {
-  return base_allocator_->AllocatedSize(static_cast<char*>(ptr) - MASK_BYTES);
+size_t GPUDebugAllocator::AllocatedSize(const void* ptr) {
+  return base_allocator_->AllocatedSize(static_cast<const char*>(ptr) -
+                                        MASK_BYTES);
 }
 
-int64 GPUDebugAllocator::AllocationId(void* ptr) {
-  return base_allocator_->AllocationId(static_cast<char*>(ptr) - MASK_BYTES);
+int64 GPUDebugAllocator::AllocationId(const void* ptr) {
+  return base_allocator_->AllocationId(static_cast<const char*>(ptr) -
+                                       MASK_BYTES);
 }
 
 void GPUDebugAllocator::GetStats(AllocatorStats* stats) {
   base_allocator_->GetStats(stats);
 }
 
+void GPUDebugAllocator::ClearStats() { base_allocator_->ClearStats(); }
+
 bool GPUDebugAllocator::CheckHeader(void* ptr) {
   return CheckMask(stream_exec_, static_cast<char*>(ptr) - MASK_BYTES,
                    before_mask);
@@ -154,9 +159,9 @@ bool GPUDebugAllocator::CheckFooter(void* ptr) {
 // GPUNanResetAllocator
 // -----------------------------------------------------------------------------
 GPUNanResetAllocator::GPUNanResetAllocator(VisitableAllocator* allocator,
-                                           int device_id)
+                                           CudaGpuId cuda_gpu_id)
     : base_allocator_(allocator) {
-  stream_exec_ = GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+  stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
 }
 
 GPUNanResetAllocator::~GPUNanResetAllocator() { delete base_allocator_; }
@@ -198,11 +203,11 @@ void GPUNanResetAllocator::AddFreeVisitor(Visitor visitor) {
   return base_allocator_->AddFreeVisitor(visitor);
 }
 
-size_t GPUNanResetAllocator::RequestedSize(void* ptr) {
+size_t GPUNanResetAllocator::RequestedSize(const void* ptr) {
   return base_allocator_->RequestedSize(ptr);
 }
 
-size_t GPUNanResetAllocator::AllocatedSize(void* ptr) {
+size_t GPUNanResetAllocator::AllocatedSize(const void* ptr) {
   return base_allocator_->AllocatedSize(ptr);
 }
 
@@ -210,4 +215,6 @@ void GPUNanResetAllocator::GetStats(AllocatorStats* stats) {
   base_allocator_->GetStats(stats);
 }
 
+void GPUNanResetAllocator::ClearStats() { base_allocator_->ClearStats(); }
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
index 9fbaf64f8a296fe012511cc91d845566f52f13d5..adce3a84368ced958002443721016778cb6df028 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <string>
 #include <unordered_map>
 
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor.h"
@@ -32,7 +33,8 @@ namespace tensorflow {
 // allocated memory.
 class GPUDebugAllocator : public VisitableAllocator {
  public:
-  explicit GPUDebugAllocator(VisitableAllocator* allocator, int device_id);
+  explicit GPUDebugAllocator(VisitableAllocator* allocator,
+                             CudaGpuId cuda_gpu_id);
   ~GPUDebugAllocator() override;
   string Name() override { return "gpu_debug"; }
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
@@ -40,10 +42,11 @@ class GPUDebugAllocator : public VisitableAllocator {
   void AddAllocVisitor(Visitor visitor) override;
   void AddFreeVisitor(Visitor visitor) override;
   bool TracksAllocationSizes() override;
-  size_t RequestedSize(void* ptr) override;
-  size_t AllocatedSize(void* ptr) override;
-  int64 AllocationId(void* ptr) override;
+  size_t RequestedSize(const void* ptr) override;
+  size_t AllocatedSize(const void* ptr) override;
+  int64 AllocationId(const void* ptr) override;
   void GetStats(AllocatorStats* stats) override;
+  void ClearStats() override;
 
   // For testing.
   bool CheckHeader(void* ptr);
@@ -62,16 +65,18 @@ class GPUDebugAllocator : public VisitableAllocator {
 // user forgets to initialize the memory.
 class GPUNanResetAllocator : public VisitableAllocator {
  public:
-  explicit GPUNanResetAllocator(VisitableAllocator* allocator, int device_id);
+  explicit GPUNanResetAllocator(VisitableAllocator* allocator,
+                                CudaGpuId cuda_gpu_id);
   ~GPUNanResetAllocator() override;
   string Name() override { return "gpu_nan_reset"; }
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
   void DeallocateRaw(void* ptr) override;
   void AddAllocVisitor(Visitor visitor) override;
   void AddFreeVisitor(Visitor visitor) override;
-  size_t RequestedSize(void* ptr) override;
-  size_t AllocatedSize(void* ptr) override;
+  size_t RequestedSize(const void* ptr) override;
+  size_t AllocatedSize(const void* ptr) override;
   void GetStats(AllocatorStats* stats) override;
+  void ClearStats() override;
 
  private:
   VisitableAllocator* base_allocator_ = nullptr;  // owned
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
index 14d8591731f0d544976a661c591920fb937f0cbd..d34f0cb3c28af5d2720c61cc7c5016622b1c0876 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
@@ -21,6 +21,8 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/platform/logging.h"
@@ -28,15 +30,14 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace gpu = ::perftools::gputools;
-
 namespace tensorflow {
+namespace {
 
 TEST(GPUDebugAllocatorTest, OverwriteDetection_None) {
-  const int device_id = 0;
-  GPUDebugAllocator a(new GPUBFCAllocator(device_id, 1 << 30), device_id);
-  auto stream_exec =
-      GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+  const CudaGpuId cuda_gpu_id(0);
+  GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+                      cuda_gpu_id);
+  auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
 
   for (int s : {8}) {
     std::vector<int64> cpu_array(s);
@@ -57,11 +58,11 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Header) {
   for (int s : {8, 211}) {
     EXPECT_DEATH(
         {
-          const int device_id = 0;
-          GPUDebugAllocator a(new GPUBFCAllocator(device_id, 1 << 30),
-                              device_id);
+          const CudaGpuId cuda_gpu_id(0);
+          GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+                              cuda_gpu_id);
           auto stream_exec =
-              GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+              GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
 
           std::vector<int64> cpu_array(s);
           memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
@@ -90,11 +91,11 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
   for (int s : {8, 22}) {
     EXPECT_DEATH(
         {
-          const int device_id = 0;
-          GPUDebugAllocator a(new GPUBFCAllocator(device_id, 1 << 30),
-                              device_id);
+          const CudaGpuId cuda_gpu_id(0);
+          GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+                              cuda_gpu_id);
           auto stream_exec =
-              GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+              GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
 
           std::vector<int64> cpu_array(s);
           memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64));
@@ -120,10 +121,10 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
 }
 
 TEST(GPUDebugAllocatorTest, ResetToNan) {
-  const int device_id = 0;
-  GPUNanResetAllocator a(new GPUBFCAllocator(device_id, 1 << 30), device_id);
-  auto stream_exec =
-      GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+  const CudaGpuId cuda_gpu_id(0);
+  GPUNanResetAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+                         cuda_gpu_id);
+  auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
 
   std::vector<float> cpu_array(1024);
   std::vector<float> cpu_array_result(1024);
@@ -160,13 +161,13 @@ TEST(GPUDebugAllocatorTest, ResetToNan) {
 }
 
 TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
-  const int device_id = 0;
+  const CudaGpuId cuda_gpu_id(0);
   // NaN reset must be the outer-most allocator.
   GPUNanResetAllocator a(
-      new GPUDebugAllocator(new GPUBFCAllocator(device_id, 1 << 30), device_id),
-      device_id);
-  auto stream_exec =
-      GPUMachineManager()->ExecutorForDevice(device_id).ValueOrDie();
+      new GPUDebugAllocator(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+                            cuda_gpu_id),
+      cuda_gpu_id);
+  auto stream_exec = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
 
   std::vector<float> cpu_array(1024);
   std::vector<float> cpu_array_result(1024);
@@ -203,19 +204,25 @@ TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
 }
 
 TEST(GPUDebugAllocatorTest, TracksSizes) {
-  GPUDebugAllocator a(new GPUBFCAllocator(0, 1 << 30), 0);
+  const CudaGpuId cuda_gpu_id(0);
+  GPUDebugAllocator a(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+                      cuda_gpu_id);
   EXPECT_EQ(true, a.TracksAllocationSizes());
 }
 
 TEST(GPUDebugAllocatorTest, AllocatedVsRequested) {
+  const CudaGpuId cuda_gpu_id(0);
   GPUNanResetAllocator a(
-      new GPUDebugAllocator(new GPUBFCAllocator(0, 1 << 30), 0), 0);
+      new GPUDebugAllocator(new GPUBFCAllocator(cuda_gpu_id, 1 << 30, ""),
+                            cuda_gpu_id),
+      cuda_gpu_id);
   float* t1 = a.Allocate<float>(1);
   EXPECT_EQ(4, a.RequestedSize(t1));
   EXPECT_EQ(256, a.AllocatedSize(t1));
   a.DeallocateRaw(t1);
 }
 
+}  // namespace
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index eff169640f6eef4f82a3b0ef205990bef4237a74..15ff15fd5ab28605c4ab0904e62305edc3815adb 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -32,6 +32,9 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_stream_util.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
@@ -60,8 +63,13 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/env_var.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 
+#if !defined(PLATFORM_GOOGLE)
+#include "cuda/cuda_config.h"
+#endif
+
 namespace tensorflow {
 
 // Eigen Ops directly allocate memory only for temporary buffers used
@@ -84,7 +92,8 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
   }
   ~EigenCudaStreamDevice() override {}
   void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
-                    int gpu_id, ::tensorflow::Allocator* alloc, char* scratch) {
+                    TfGpuId tf_gpu_id, ::tensorflow::Allocator* alloc,
+                    char* scratch) {
     if (LogMemory::IsEnabled()) {
       operation_ = context->op_kernel().name() + "/EigenAllocator";
       step_id_ = context->step_id();
@@ -95,7 +104,8 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
         reinterpret_cast<unsigned int*>(scratch + Eigen::kCudaScratchSize);
     stream_ = cuda_stream;
     allocator_ = alloc;
-    device_prop_ = &Eigen::m_deviceProperties[gpu_id];
+    const int cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id).value();
+    device_prop_ = &Eigen::m_deviceProperties[cuda_gpu_id];
   }
 
   const cudaStream_t& stream() const override { return *stream_; }
@@ -185,13 +195,15 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
 class BaseGPUDevice::StreamGroupFactory {
  public:
   // Returns the unique stream group for use with the stream defined by
-  // {gpu_id, stream_group_within_gpu}, creating it if it does not yet exist.
+  // {tf_gpu_id, stream_group_within_gpu}, creating it if it does not yet
+  // exist.
   // This function is thread safe.
-  BaseGPUDevice::StreamGroup* GetOrCreate(int gpu_id,
+  BaseGPUDevice::StreamGroup* GetOrCreate(TfGpuId tf_gpu_id,
                                           int stream_group_within_gpu,
                                           gpu::StreamExecutor* executor) {
     mutex_lock guard(lock_);
-    StreamGroup* group = &streams_[key_type(gpu_id, stream_group_within_gpu)];
+    StreamGroup* group =
+        &streams_[key_type(tf_gpu_id.value(), stream_group_within_gpu)];
     if (!group->compute) {
       group->compute = new gpu::Stream(executor);
       group->compute->Init();
@@ -236,7 +248,8 @@ class BaseGPUDevice::StreamGroupFactory {
 
 BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
                              Bytes memory_limit, const DeviceLocality& locality,
-                             int gpu_id, const string& physical_device_desc,
+                             TfGpuId tf_gpu_id,
+                             const string& physical_device_desc,
                              Allocator* gpu_allocator, Allocator* cpu_allocator,
                              bool sync_every_op, int32 max_streams)
     : LocalDevice(options, Device::BuildDeviceAttributes(name, DEVICE_GPU,
@@ -244,7 +257,7 @@ BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
                                                          physical_device_desc)),
       gpu_allocator_(gpu_allocator),
       cpu_allocator_(cpu_allocator),
-      gpu_id_(gpu_id),
+      tf_gpu_id_(tf_gpu_id),
       sync_every_op_(sync_every_op),
       max_streams_(max_streams) {
   ProcessState::singleton()->EnableGPUDevice();
@@ -256,10 +269,10 @@ BaseGPUDevice::~BaseGPUDevice() {
 }
 
 Status BaseGPUDevice::Init(const SessionOptions& options) {
-  auto executor_status = GPUMachineManager()->ExecutorForDevice(gpu_id_);
+  auto executor_status = GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id_);
   if (!executor_status.status().ok()) {
     return errors::Internal("Failed to get StreamExecutor for device ",
-                            gpu_id_);
+                            tf_gpu_id_.value());
   }
 
   executor_ = executor_status.ValueOrDie();
@@ -272,14 +285,14 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
   // Create the specified number of GPU streams
   for (int i = 0; i < max_streams_; i++) {
     streams_.push_back(
-        StreamGroupFactory::Global().GetOrCreate(gpu_id_, i, executor_));
+        StreamGroupFactory::Global().GetOrCreate(tf_gpu_id_, i, executor_));
 
     size_t scratch_buffer_size = Eigen::kCudaScratchSize + sizeof(unsigned int);
     void* scratch_buffer = gpu_allocator_->AllocateRaw(
         Allocator::kAllocatorAlignment, scratch_buffer_size);
     if (scratch_buffer == nullptr) {
       return errors::FailedPrecondition(
-          "Failed to allocate scratch buffer for device ", gpu_id_);
+          "Failed to allocate scratch buffer for device ", tf_gpu_id_.value());
     }
     scratch_.push_back(static_cast<char*>(scratch_buffer));
 
@@ -291,7 +304,8 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
         &mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
     if (!ok) {
       return errors::FailedPrecondition(
-          "Failed to memcopy into scratch buffer for device ", gpu_id_);
+          "Failed to memcopy into scratch buffer for device ",
+          tf_gpu_id_.value());
     }
 
     device_contexts_.push_back(new GPUDeviceContext(
@@ -302,9 +316,49 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
   gpu_device_info_->stream = streams_[0]->compute;
   gpu_device_info_->default_context = device_contexts_[0];
   gpu_device_info_->event_mgr = em_.get();
-  gpu_device_info_->gpu_id = gpu_id_;
+  gpu_device_info_->gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id_).value();
   set_tensorflow_gpu_device_info(gpu_device_info_);
 
+  // Whether and how the GPU device uses its own threadpool.
+  // This option is experimental. Once we confirm the best setting, we
+  // may change the default behavior and completely remove this flag.
+  // Default values might change in future releases.
+  // Possible values:
+  //   * global: GPU uses threads shared with CPU in the main compute
+  //          thread-pool. This is currently the default.
+  //   * gpu_private: GPU uses threads dedicated to this device.
+  //   * gpu_shared: All GPUs share a dedicated thread pool.
+  string gpu_thread_mode;
+  TF_RETURN_IF_ERROR(
+      ReadStringFromEnvVar("TF_GPU_THREAD_MODE", "global", &gpu_thread_mode));
+  gpu_thread_mode = str_util::Lowercase(gpu_thread_mode);
+  if (gpu_thread_mode != "global") {
+    int64 gpu_thread_count = -1;
+    // Default to two threads. One for device compute and another for memory
+    // copies.
+    TF_RETURN_IF_ERROR(
+        ReadInt64FromEnvVar("TF_GPU_THREAD_COUNT", 2, &gpu_thread_count));
+    if (gpu_thread_mode == "gpu_private") {
+      // TODO(zhengxq): since these threads only serve a single GPU device,
+      //   we should set the device context once for each thread, and avoid
+      //   setting them for each kernel.
+      // TODO(zhengxq): pin the thread to the same socket of the target GPU.
+      thread_pool_.reset(new thread::ThreadPool(
+          options.env, strings::StrCat("gpu_private_", tf_gpu_id_.value()),
+          static_cast<int32>(gpu_thread_count)));
+      set_tensorflow_device_thread_pool(thread_pool_.get());
+    } else if (gpu_thread_mode == "gpu_shared") {
+      static thread::ThreadPool* thread_pool = new thread::ThreadPool(
+          options.env, "gpu_shared", static_cast<int32>(gpu_thread_count));
+      set_tensorflow_device_thread_pool(thread_pool);
+    } else {
+      string error_message =
+          strings::StrCat("Invalid gpu_thread_mode: ", gpu_thread_mode);
+      LOG(WARNING) << error_message;
+      return errors::InvalidArgument(error_message);
+    }
+  }
+
   return Status::OK();
 }
 
@@ -394,7 +448,7 @@ void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
 
   if (vlog_1) {
     VLOG(1) << "GpuDevice::Compute " << op_kernel->name() << " op "
-            << op_kernel->type_string() << " on GPU" << gpu_id_ << " stream["
+            << op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream["
             << stream_id << "]";
   }
 
@@ -469,7 +523,7 @@ void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel,
   const auto stream_id = gpu_device_context->stream_id();
 
   VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op "
-          << op_kernel->type_string() << " on GPU" << gpu_id_ << " stream["
+          << op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream["
           << stream_id << "]";
 
   // When TraceMe profiling is off (which is the default), the
@@ -594,8 +648,9 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
   ConcretePerOpGpuDevice() : device_(&stream_device_) {}
 
   void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
-                    int gpu_id, Allocator* base_allocator, char* scratch) {
-    stream_device_.Reinitialize(context, cuda_stream, gpu_id, base_allocator,
+                    TfGpuId tf_gpu_id, Allocator* base_allocator,
+                    char* scratch) {
+    stream_device_.Reinitialize(context, cuda_stream, tf_gpu_id, base_allocator,
                                 scratch);
   }
 
@@ -605,6 +660,152 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
   EigenCudaStreamDevice stream_device_;
   Eigen::GpuDevice device_;
 };
+
+// Parse 'visible_device_list' into a list of CUDA GPU ids.
+Status ParseVisibleDeviceList(const string& visible_device_list,
+                              std::vector<CudaGpuId>* visible_gpu_order) {
+  visible_gpu_order->clear();
+  gpu::Platform* gpu_manager = GPUMachineManager();
+
+  // If the user wants to remap the visible to virtual GPU mapping,
+  // check for that here.
+  if (visible_device_list.empty()) {
+    visible_gpu_order->resize(gpu_manager->VisibleDeviceCount());
+    // By default, visible to virtual mapping is unchanged.
+    int deviceNo = 0;
+    std::generate(visible_gpu_order->begin(), visible_gpu_order->end(),
+                  [&deviceNo] { return deviceNo++; });
+  } else {
+    const std::vector<string> order_str =
+        str_util::Split(visible_device_list, ',');
+    for (const string& cuda_gpu_id_str : order_str) {
+      int32 cuda_gpu_id;
+      if (!strings::safe_strto32(cuda_gpu_id_str, &cuda_gpu_id)) {
+        return errors::InvalidArgument(
+            "Could not parse entry in 'visible_device_list': '",
+            cuda_gpu_id_str, "'. visible_device_list = ", visible_device_list);
+      }
+      if (cuda_gpu_id < 0 || cuda_gpu_id >= gpu_manager->VisibleDeviceCount()) {
+        return errors::InvalidArgument(
+            "'visible_device_list' listed an invalid GPU id '", cuda_gpu_id,
+            "' but visible device count is ",
+            gpu_manager->VisibleDeviceCount());
+      }
+      visible_gpu_order->push_back(CudaGpuId(cuda_gpu_id));
+    }
+  }
+
+  // Validate no repeats.
+  std::set<CudaGpuId> visible_device_set(visible_gpu_order->begin(),
+                                         visible_gpu_order->end());
+  if (visible_device_set.size() != visible_gpu_order->size()) {
+    return errors::InvalidArgument(
+        "visible_device_list contained a duplicate entry: ",
+        visible_device_list);
+  }
+  return Status::OK();
+}
+
+Status VerifyVirtualDeviceSettings(
+    const size_t num_gpus_to_use, const GPUOptions& gpu_options,
+    const std::vector<CudaGpuId>& visible_gpu_order,
+    const std::vector<CudaGpuId>& valid_cuda_gpu_ids) {
+  const auto& virtual_devices = gpu_options.experimental().virtual_devices();
+  CHECK(!virtual_devices.empty());
+  if (gpu_options.per_process_gpu_memory_fraction() > 0) {
+    return errors::InvalidArgument(
+        "It's invalid to set per_process_gpu_memory_fraction when "
+        "virtual_devices is set.");
+  }
+  if (num_gpus_to_use < virtual_devices.size()) {
+    return errors::Unknown(
+        "Not enough GPUs to create virtual devices."
+        " num_gpus_to_use: ",
+        num_gpus_to_use, " #virtual_devices: ", virtual_devices.size());
+  }
+  if (!gpu_options.visible_device_list().empty() &&
+      visible_gpu_order.size() != virtual_devices.size()) {
+    return errors::InvalidArgument(
+        "The number of GPUs in visible_device_list doesn't match the number "
+        "of elements in the virtual_devices list.",
+        " #GPUs in visible_device_list: ", visible_gpu_order.size(),
+        " virtual_devices.size(): ", virtual_devices.size());
+  }
+  if (valid_cuda_gpu_ids.size() != virtual_devices.size()) {
+    return errors::Unknown(
+        "The number of valid GPUs doesn't match the number of elements in "
+        "the virtual_devices list.",
+        " #valid GPUs: ", valid_cuda_gpu_ids.size(),
+        " virtual_devices.size(): ", virtual_devices.size());
+  }
+  return Status::OK();
+}
+
+int64 MinSystemMemory(int64 available_memory) {
+  // We use the following heuristic for now:
+  //
+  // If the available_memory is < 2GiB, we allocate 225MiB to system memory.
+  // Otherwise, allocate max(300MiB, 0.05 * available_memory) to system memory.
+  //
+  // In the future we could be more sophisticated by using a table of devices.
+  int64 min_system_memory;
+  if (available_memory < (1LL << 31)) {
+    // 225MiB
+    min_system_memory = 225 * 1024 * 1024;
+  } else {
+    // max(300 MiB, 0.05 * available_memory)
+    min_system_memory =
+        std::max(314572800LL, static_cast<int64>(available_memory * 0.05));
+  }
+#if defined(__GNUC__) && defined(__OPTIMIZE__)
+// Do nothing
+#elif !defined(__GNUC__) && defined(NDEBUG)
+// Do nothing
+#else
+  // Double the amount of available GPU memory in non-opt builds (debug
+  // builds in windows); because in non-opt builds more system memory
+  // is necessary.
+  min_system_memory *= 2;
+#endif
+
+#if defined(ANDROID_TEGRA)
+  // 1GB system mem for NVIDIA Tegra devices since they use the same mem for RAM
+  // and Video RAM
+  min_system_memory = 1 << 30;
+#endif
+  return min_system_memory;
+}
+
+// Get the memory limit for the virtual device being created on GPU with
+// 'cuda_gpu_id', when that virtual device is the only virtual device being
+// created on that GPU.
+Status SingleVirtualDeviceMemoryLimit(const GPUOptions& gpu_options,
+                                      CudaGpuId cuda_gpu_id,
+                                      int64* memory_limit) {
+  int64 total_memory = 0;
+  int64 available_memory = 0;
+  gpu::StreamExecutor* se =
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
+  if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) {
+    return errors::Unknown("Failed to query available memory for GPU ",
+                           cuda_gpu_id.value());
+  }
+
+  int64 allocated_memory = 0;
+  const double per_process_gpu_memory_fraction =
+      gpu_options.per_process_gpu_memory_fraction();
+  if (per_process_gpu_memory_fraction == 0) {
+    allocated_memory = available_memory;
+    const int64 min_system_memory = MinSystemMemory(available_memory);
+    if (min_system_memory < allocated_memory) {
+      allocated_memory -= min_system_memory;
+    }
+  } else {
+    allocated_memory = total_memory * per_process_gpu_memory_fraction;
+  }
+  *memory_limit = allocated_memory;
+  return Status::OK();
+}
 }  // namespace
 
 void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context,
@@ -615,7 +816,7 @@ void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context,
   DCHECK(concrete_device);
   const cudaStream_t* cuda_stream = reinterpret_cast<const cudaStream_t*>(
       streams_[stream_id]->compute->implementation()->CudaStreamMemberHack());
-  concrete_device->Reinitialize(context, cuda_stream, gpu_id_, allocator,
+  concrete_device->Reinitialize(context, cuda_stream, tf_gpu_id_, allocator,
                                 scratch_[stream_id]);
 }
 
@@ -639,21 +840,38 @@ void BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
   }
 }
 
+const int BaseGPUDeviceFactory::InterconnectMap::kSameDeviceStrength = 1000;
+const int BaseGPUDeviceFactory::InterconnectMap::kStreamExecutorStrength = 1;
+
 Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
                                            const string& name_prefix,
                                            std::vector<Device*>* devices) {
-  size_t n = INT_MAX;
+  TF_RETURN_IF_ERROR(ValidateGPUMachineManager());
+  gpu::Platform* gpu_manager = GPUMachineManager();
+  if (gpu_manager == nullptr) {
+    return Status::OK();
+  }
+  // If there are no GPUs visible, do nothing.
+  if (gpu_manager->VisibleDeviceCount() <= 0) {
+    return Status::OK();
+  }
+
+  size_t num_gpus_to_use = INT_MAX;
   auto iter = options.config.device_count().find("GPU");
   if (iter != options.config.device_count().end()) {
-    n = iter->second;
+    num_gpus_to_use = iter->second;
   }
-  std::vector<int> valid_gpu_ids;
-  TF_RETURN_IF_ERROR(GetValidDeviceIds(
-      options.config.gpu_options().visible_device_list(), &valid_gpu_ids));
-  if (static_cast<size_t>(n) > valid_gpu_ids.size()) {
-    n = valid_gpu_ids.size();
+  const auto& gpu_options = options.config.gpu_options();
+  std::vector<CudaGpuId> visible_gpu_order;
+  TF_RETURN_IF_ERROR(ParseVisibleDeviceList(gpu_options.visible_device_list(),
+                                            &visible_gpu_order));
+
+  std::vector<CudaGpuId> valid_cuda_gpu_ids;
+  TF_RETURN_IF_ERROR(GetValidDeviceIds(visible_gpu_order, &valid_cuda_gpu_ids));
+  if (num_gpus_to_use > valid_cuda_gpu_ids.size()) {
+    num_gpus_to_use = valid_cuda_gpu_ids.size();
   }
-  if (!valid_gpu_ids.empty()) {
+  if (!valid_cuda_gpu_ids.empty()) {
     // Save the original device.
     int original_device = 0;
     cudaError_t err = cudaGetDevice(&original_device);
@@ -663,16 +881,16 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
     }
     // Force to implicitly initialize CUDA runtime on each valid GPU before
     // CreateGPUDevice().
-    for (int gpu_id : valid_gpu_ids) {
-      err = cudaSetDevice(gpu_id);
+    for (CudaGpuId cuda_gpu_id : valid_cuda_gpu_ids) {
+      err = cudaSetDevice(cuda_gpu_id.value());
       if (err != cudaSuccess) {
-        return errors::Internal("cudaSetDevice() on GPU:", gpu_id,
+        return errors::Internal("cudaSetDevice() on GPU:", cuda_gpu_id.value(),
                                 " failed. Status: ", cudaGetErrorString(err));
       }
       err = cudaFree(nullptr);
       if (err != cudaSuccess) {
         return errors::Internal(
-            "CUDA runtime implicit initialization on GPU:", gpu_id,
+            "CUDA runtime implicit initialization on GPU:", cuda_gpu_id.value(),
             " failed. Status: ", cudaGetErrorString(err));
       }
     }
@@ -683,51 +901,91 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
                               " failed. Status: ", cudaGetErrorString(err));
     }
   }
-  for (int i = 0; i < n; i++) {
-    BaseGPUDevice* gpu_device;
-    TF_RETURN_IF_ERROR(CreateGPUDevice(
-        options, strings::StrCat(name_prefix, "/device:GPU:", i),
-        valid_gpu_ids[i], &gpu_device));
-    TF_RETURN_IF_ERROR(gpu_device->Init(options));
-    devices->push_back(gpu_device);
+
+  std::vector<InterconnectMap> interconnect_maps;
+  TF_RETURN_IF_ERROR(
+      GetInterconnectMaps(visible_gpu_order, gpu_manager, &interconnect_maps));
+
+  // Print each interconnect map to the log.
+  for (const InterconnectMap& im : interconnect_maps) {
+    LOG(INFO) << "Device interconnect " << im.name << " with strength "
+              << im.strength << " edge matrix:";
+    string line_buf = "     ";
+    for (int i = 0; i < visible_gpu_order.size(); ++i) {
+      strings::StrAppend(&line_buf, visible_gpu_order[i].value(), " ");
+    }
+    LOG(INFO) << line_buf;
+    for (int i = 0; i < visible_gpu_order.size(); ++i) {
+      line_buf = strings::StrCat(visible_gpu_order[i].value(), ":   ");
+      CudaGpuId cuda_id_i = visible_gpu_order[i];
+      for (int j = 0; j < visible_gpu_order.size(); ++j) {
+        CudaGpuId cuda_id_j = visible_gpu_order[j];
+        if (im.directed_links.find({cuda_id_i, cuda_id_j}) !=
+            im.directed_links.end()) {
+          line_buf.append("Y ");
+        } else {
+          line_buf.append("N ");
+        }
+      }
+      LOG(INFO) << line_buf;
+    }
   }
 
-  return Status::OK();
-}
+  const auto& virtual_devices = gpu_options.experimental().virtual_devices();
+  if (!virtual_devices.empty()) {
+    TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings(
+        num_gpus_to_use, gpu_options, visible_gpu_order, valid_cuda_gpu_ids));
+    // We've verified that num_gpus_to_use >= virtual_devices.size().
+    num_gpus_to_use = virtual_devices.size();
+    CHECK(gpu_options.visible_device_list().empty() ||
+          valid_cuda_gpu_ids == visible_gpu_order);
+  }
+  int next_tf_gpu_id = 0;
+  std::vector<int64> memory_limit_bytes;
+  for (int i = 0; i < num_gpus_to_use; ++i) {
+    const CudaGpuId cuda_gpu_id = valid_cuda_gpu_ids[i];
+    if (virtual_devices.empty() ||
+        virtual_devices.Get(i).memory_limit_mb_size() == 0) {
+      int64 single_virtual_device_memory_limit = 0;
+      TF_RETURN_IF_ERROR(SingleVirtualDeviceMemoryLimit(
+          gpu_options, cuda_gpu_id, &single_virtual_device_memory_limit));
+      memory_limit_bytes.push_back(single_virtual_device_memory_limit);
+    } else {
+      const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb();
+      std::transform(memory_limit_mb.begin(), memory_limit_mb.end(),
+                     std::back_inserter(memory_limit_bytes), [](float mb) {
+                       return static_cast<int64>(mb) * (1ll << 20);
+                     });
+    }
+    while (next_tf_gpu_id < memory_limit_bytes.size()) {
+      TfGpuId tf_gpu_id(next_tf_gpu_id);
+      ++next_tf_gpu_id;
+      GpuIdManager::InsertTfCudaGpuIdPair(tf_gpu_id, cuda_gpu_id);
+    }
+  }
+  const int num_tf_gpus = next_tf_gpu_id;
 
-namespace {
-int64 MinSystemMemory(int64 available_memory) {
-  // We use the following heuristic for now:
-  //
-  // If the available_memory is < 2GiB, we allocate 225MiB to system memory.
-  // Otherwise, allocate max(300MiB, 0.05 * available_memory) to system memory.
-  //
-  // In the future we could be more sophisticated by using a table of devices.
-  int64 min_system_memory;
-  if (available_memory < (1LL << 31)) {
-    // 225MiB
-    min_system_memory = 225 * 1024 * 1024;
-  } else {
-    // max(300 MiB, 0.05 * available_memory)
-    min_system_memory =
-        std::max(314572800LL, static_cast<int64>(available_memory * 0.05));
+  LocalityMap device_localities;
+  TF_RETURN_IF_ERROR(
+      GetDeviceLocalities(num_tf_gpus, interconnect_maps, &device_localities));
+
+  // Build the GPUDevices
+  CHECK_EQ(next_tf_gpu_id, memory_limit_bytes.size());
+  for (int di = 0; di < num_tf_gpus; ++di) {
+    TfGpuId tf_gpu_id(di);
+    int64 bytes = memory_limit_bytes[di];
+    auto it = device_localities.find(tf_gpu_id);
+    if (it == device_localities.end()) {
+      return errors::Internal("Failed to find DeviceLocality for GPU device ",
+                              tf_gpu_id.value());
+    }
+    TF_RETURN_IF_ERROR(CreateGPUDevice(options, name_prefix, tf_gpu_id, bytes,
+                                       it->second, devices));
   }
-#if defined(__GNUC__) && defined(__OPTIMIZE__)
-// Do nothing
-#elif !defined(__GNUC__) && defined(NDEBUG)
-// Do nothing
-#else
-  // Double the amount of available GPU memory in non-opt builds (debug
-  // builds in windows); because in non-opt builds more system memory
-  // is necessary.
-  min_system_memory *= 2;
-#endif
-  return min_system_memory;
+  return Status::OK();
 }
 
-}  // namespace
-
-static string GetShortDeviceDescription(int device_id,
+static string GetShortDeviceDescription(CudaGpuId cuda_gpu_id,
                                         const gpu::DeviceDescription& desc) {
   int cc_major;
   int cc_minor;
@@ -736,85 +994,166 @@ static string GetShortDeviceDescription(int device_id,
     cc_minor = 0;
   }
   // LINT.IfChange
-  return strings::StrCat("device: ", device_id, ", name: ", desc.name(),
+  return strings::StrCat("device: ", cuda_gpu_id.value(),
+                         ", name: ", desc.name(),
                          ", pci bus id: ", desc.pci_bus_id(),
                          ", compute capability: ", cc_major, ".", cc_minor);
   // LINT.ThenChange(//tensorflow/python/platform/test.py)
 }
 
 Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
-                                             const string& name, int gpu_id,
-                                             BaseGPUDevice** out_device) {
-  CHECK_GE(gpu_id, 0);
+                                             const string& name_prefix,
+                                             TfGpuId tf_gpu_id,
+                                             int64 memory_limit,
+                                             const DeviceLocality& dev_locality,
+                                             std::vector<Device*>* devices) {
+  CHECK_GE(tf_gpu_id.value(), 0);
+  const string device_name =
+      strings::StrCat(name_prefix, "/device:GPU:", tf_gpu_id.value());
+  GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
+  CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
+  int numa_node = dev_locality.numa_node();
+  Bytes allocated_bytes = static_cast<Bytes>(memory_limit);
 
-  // Look up the device, to see its attributes.
-  gpu::Platform* gpu_platform = GPUMachineManager();
-  CHECK_LT(gpu_id, gpu_platform->VisibleDeviceCount());
   gpu::StreamExecutor* se =
-      gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
+      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
   const gpu::DeviceDescription& desc = se->GetDeviceDescription();
-  int numa_node = desc.numa_node();
-  if (numa_node < 0) {
-    // For some reason the StreamExecutor couldn't get the NUMA
-    // affinity of the GPU.  If this is not a multi-socket mobo with
-    // GPUs local to different buses, it doesn't matter.  If it is, we
-    // may run into trouble later with data transfer operations.  The
-    // trouble may manifest as slower than expected performance, or
-    // outright failures.
-    LOG(INFO) << "Could not identify NUMA node of " << name
-              << ", defaulting to 0.  Your kernel may not have been built "
-              << "with NUMA support.";
-    numa_node = 0;
-  }
-
-  int64 total_memory, available_memory;
-  if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) {
-    return errors::Unknown(
-        strings::StrCat("Failed to query available memory for GPU ", gpu_id));
-  }
+  LOG(INFO) << "Creating TensorFlow device (" << device_name << " with "
+            << (memory_limit >> 20) << " MB memory) -> physical GPU ("
+            << GetShortDeviceDescription(cuda_gpu_id, desc) << ")";
+  ProcessState* process_state = ProcessState::singleton();
+  BaseGPUDevice* gpu_device = CreateGPUDevice(
+      options, device_name, allocated_bytes, dev_locality, tf_gpu_id,
+      GetShortDeviceDescription(cuda_gpu_id, desc),
+      process_state->GetGPUAllocator(options.config.gpu_options(), tf_gpu_id,
+                                     memory_limit),
+      process_state->GetCPUAllocator(numa_node));
+  TF_RETURN_IF_ERROR(gpu_device->Init(options));
+  devices->push_back(gpu_device);
 
-  int64 allocated_memory;
-  double config_memory_fraction =
-      options.config.gpu_options().per_process_gpu_memory_fraction();
-  if (config_memory_fraction == 0) {
-    allocated_memory = available_memory;
-    const int64 min_system_memory = MinSystemMemory(available_memory);
-    if (min_system_memory < allocated_memory) {
-      allocated_memory -= min_system_memory;
+  return Status::OK();
+}
+
+namespace {
+std::unique_ptr<std::map<std::pair<CudaGpuId, CudaGpuId>, bool>>
+GetPeerAccessMap(gpu::Platform* platform,
+                 const std::vector<CudaGpuId>& visible_gpu_order) {
+  std::unique_ptr<std::map<std::pair<CudaGpuId, CudaGpuId>, bool>> map(
+      new std::map<std::pair<CudaGpuId, CudaGpuId>, bool>);
+  for (CudaGpuId cuda_gpu_i : visible_gpu_order) {
+    for (CudaGpuId cuda_gpu_j : visible_gpu_order) {
+      gpu::StreamExecutor* from =
+          GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_i).ValueOrDie();
+      gpu::StreamExecutor* to =
+          GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_j).ValueOrDie();
+      (*map)[{cuda_gpu_i, cuda_gpu_j}] = from->CanEnablePeerAccessTo(to);
     }
-  } else {
-    allocated_memory = total_memory * config_memory_fraction;
   }
 
-  Bytes allocated_bytes = static_cast<Bytes>(allocated_memory);
+  return map;
+}
 
-  // Get GPU bus_id from its reported NUMA affinity.  Because GPUs are
-  // virtualized in some environments, we can't just use the GPU id.
-  // NUMA locales are indexed from 0, buses are indexed from 1.
-  DeviceLocality dev_locality;
-  dev_locality.set_bus_id(numa_node + 1);
-  VLOG(1) << "GPUDevice id " << gpu_id << " on bus " << dev_locality.bus_id()
-          << " numa: " << numa_node << " pci: " << desc.pci_bus_id();
+}  // namespace
 
-  ProcessState* process_state = ProcessState::singleton();
-  *out_device = CreateGPUDevice(
-      options, name, allocated_bytes, dev_locality, gpu_id,
-      GetShortDeviceDescription(gpu_id, desc),
-      process_state->GetGPUAllocator(options.config.gpu_options(), gpu_id,
-                                     allocated_memory),
-      process_state->GetCPUAllocator(numa_node));
+Status BaseGPUDeviceFactory::GetInterconnectMaps(
+    const std::vector<CudaGpuId>& visible_gpu_order, gpu::Platform* gpu_manager,
+    std::vector<InterconnectMap>* maps) {
+  // The default interconnect map is obtained from the StreamExecutor.
+  auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order);
+  maps->resize(1);
+  InterconnectMap& imap = maps->at(0);
+  imap.name = "StreamExecutor";
+  imap.strength = InterconnectMap::kStreamExecutorStrength;
+  for (CudaGpuId cuda_id_i : visible_gpu_order) {
+    for (CudaGpuId cuda_id_j : visible_gpu_order) {
+      if (cuda_id_i == cuda_id_j) continue;
+      if ((*access_map)[{cuda_id_i, cuda_id_j}]) {
+        imap.directed_links.insert({cuda_id_i, cuda_id_j});
+      }
+    }
+  }
+  return Status::OK();
+}
 
+Status BaseGPUDeviceFactory::GetDeviceLocalities(
+    int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
+    LocalityMap* localities) {
+  std::vector<TfGpuId> all_tf_gpu_ids;
+  for (int i = 0; i < num_tf_gpus; ++i) {
+    all_tf_gpu_ids.push_back(TfGpuId(i));
+  }
+  for (TfGpuId tf_gpu_id : all_tf_gpu_ids) {
+    CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
+    // Get GPU bus_id from its reported NUMA affinity.  Because GPUs are
+    // virtualized in some environments, we can't just use the GPU id.
+    // NUMA locales are indexed from 0, buses are indexed from 1.
+    gpu::StreamExecutor* se =
+        GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
+    const gpu::DeviceDescription& desc = se->GetDeviceDescription();
+    int numa_node = desc.numa_node();
+    if (numa_node < 0) {
+      // For some reason the StreamExecutor couldn't get the NUMA
+      // affinity of the GPU.  If this is not a multi-socket mobo with
+      // GPUs local to different buses, it doesn't matter.  If it is, we
+      // may run into trouble later with data transfer operations.  The
+      // trouble may manifest as slower than expected performance, or
+      // outright failures.
+      LOG(INFO) << "Could not identify NUMA node of CUDA gpu id " << cuda_gpu_id
+                << ", defaulting to 0.  Your kernel may not have been built "
+                << "with NUMA support.";
+      numa_node = 0;
+    }
+    DeviceLocality dev_locality;
+    dev_locality.set_numa_node(numa_node);
+    dev_locality.set_bus_id(numa_node + 1);
+
+    // Set LocalLinks from InterconnectMaps.
+    LocalLinks* links = dev_locality.mutable_links();
+    for (const InterconnectMap& imap : interconnects) {
+      for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
+        CudaGpuId cuda_gpu_dst = GpuIdManager::TfToCudaGpuId(tf_gpu_dst);
+        if (imap.directed_links.find({cuda_gpu_id, cuda_gpu_dst}) !=
+            imap.directed_links.end()) {
+          InterconnectLink* ilink = links->add_link();
+          ilink->set_device_id(tf_gpu_dst.value());
+          ilink->set_type(imap.name);
+          ilink->set_strength(imap.strength);
+        }
+      }
+    }
+
+    // If this is one of multiple virtual GPUs on the same physical GPU
+    // add high strength links to the others.
+    for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
+      if (tf_gpu_id == tf_gpu_dst) continue;
+      CudaGpuId cuda_gpu_dst = GpuIdManager::TfToCudaGpuId(tf_gpu_dst);
+      if (cuda_gpu_id == cuda_gpu_dst) {
+        InterconnectLink* ilink = links->add_link();
+        ilink->set_device_id(tf_gpu_dst.value());
+        ilink->set_type("SAME_DEVICE");
+        ilink->set_strength(InterconnectMap::kSameDeviceStrength);
+      }
+    }
+
+    (*localities)[tf_gpu_id] = dev_locality;
+    VLOG(1) << "GPUDevice CudaGpuId " << cuda_gpu_id << " TfGpuId " << tf_gpu_id
+            << " on bus " << dev_locality.bus_id() << " numa: " << numa_node
+            << " pci: " << desc.pci_bus_id()
+            << " DeviceLocality: " << dev_locality.DebugString();
+  }
   return Status::OK();
 }
 
 static int GetDefaultMinGPUMultiprocessorCount(
-    gpu::Platform* gpu_manager, const std::vector<int>& visible_gpu_order) {
+    gpu::Platform* gpu_manager,
+    const std::vector<CudaGpuId>& visible_gpu_order) {
   static const int kDefaultMinGPUMultiprocessorCount = 8;
 
   // Find the highest multi-processor count across all visible GPUs.
   int max_count = -1;
   for (int i = 0; i < visible_gpu_order.size(); ++i) {
-    auto exec_status = gpu_manager->ExecutorForDevice(visible_gpu_order[i]);
+    auto exec_status =
+        GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, visible_gpu_order[i]);
     if (!exec_status.ok()) {
       continue;
     }
@@ -832,7 +1171,8 @@ static int GetDefaultMinGPUMultiprocessorCount(
 }
 
 static int GetMinGPUMultiprocessorCount(
-    gpu::Platform* gpu_manager, const std::vector<int>& visible_gpu_order) {
+    gpu::Platform* gpu_manager,
+    const std::vector<CudaGpuId>& visible_gpu_order) {
   const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT");
 
   if (tf_min_gpu_core_count == nullptr ||
@@ -909,39 +1249,19 @@ std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() {
   return cuda_caps;
 }
 
-std::unique_ptr<std::map<std::pair<int, int>, bool>> GetPeerAccessMap(
-    gpu::Platform* platform, const std::vector<int>& visible_gpu_order) {
-  std::unique_ptr<std::map<std::pair<int, int>, bool>> map(
-      new std::map<std::pair<int, int>, bool>);
-  for (int i = 0; i < visible_gpu_order.size(); ++i) {
-    const int i_gpu_id = visible_gpu_order[i];
-    for (int j = 0; j < visible_gpu_order.size(); ++j) {
-      const int j_gpu_id = visible_gpu_order[j];
-      gpu::StreamExecutor* from =
-          platform->ExecutorForDevice(i_gpu_id).ValueOrDie();
-      gpu::StreamExecutor* to =
-          platform->ExecutorForDevice(j_gpu_id).ValueOrDie();
-      (*map)[{i, j}] = from->CanEnablePeerAccessTo(to);
-    }
-  }
-
-  return map;
-}
-
 Status EnablePeerAccess(gpu::Platform* platform,
-                        const std::vector<int>& visible_gpu_order) {
+                        const std::vector<CudaGpuId>& visible_gpu_order) {
   int possible_peer_count = 0;
   int enabled_peer_count = 0;
   for (int i = 0; i < visible_gpu_order.size(); ++i) {
-    const int i_gpu_id = visible_gpu_order[i];
+    const CudaGpuId cuda_gpu_i = visible_gpu_order[i];
     for (int j = 0; j < visible_gpu_order.size(); ++j) {
-      const int j_gpu_id = visible_gpu_order[j];
-      // We have already validated that ExecutorForDevice() calls
-      // return OK.
+      const CudaGpuId cuda_gpu_j = visible_gpu_order[j];
+      // We have already validated that ExecutorForDevice() calls return OK.
       gpu::StreamExecutor* from =
-          platform->ExecutorForDevice(i_gpu_id).ValueOrDie();
+          GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_i).ValueOrDie();
       gpu::StreamExecutor* to =
-          platform->ExecutorForDevice(j_gpu_id).ValueOrDie();
+          GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_j).ValueOrDie();
 
       if (from->CanEnablePeerAccessTo(to)) {
         ++possible_peer_count;
@@ -949,7 +1269,7 @@ Status EnablePeerAccess(gpu::Platform* platform,
         if (!status.ok()) {
           LOG(WARNING)
               << "Unable to enable peer access between device ordinals "
-              << i_gpu_id << " and " << j_gpu_id;
+              << cuda_gpu_i << " and " << cuda_gpu_j << ", status: " << status;
         } else {
           ++enabled_peer_count;
         }
@@ -972,73 +1292,22 @@ Status EnablePeerAccess(gpu::Platform* platform,
 }  // namespace
 
 Status BaseGPUDeviceFactory::GetValidDeviceIds(
-    const string& visible_device_list, std::vector<int>* ids) {
-  TF_RETURN_IF_ERROR(ValidateGPUMachineManager());
-
+    const std::vector<CudaGpuId>& visible_gpu_order,
+    std::vector<CudaGpuId>* ids) {
   gpu::Platform* gpu_manager = GPUMachineManager();
-  if (gpu_manager == nullptr) {
-    return Status::OK();
-  }
-
-  // If there are no GPUs visible, do nothing.
-  if (gpu_manager->VisibleDeviceCount() <= 0) {
-    return Status::OK();
-  }
-
-  // If the user wants to remap the visible to virtual GPU mapping,
-  // check for that here.
-  std::vector<int> visible_gpu_order;
-  if (visible_device_list.empty()) {
-    visible_gpu_order.resize(gpu_manager->VisibleDeviceCount());
-    // By default, visible to virtual mapping is unchanged.
-    int deviceNo = 0;
-    std::generate(visible_gpu_order.begin(), visible_gpu_order.end(),
-                  [&deviceNo] { return deviceNo++; });
-  } else {
-    std::vector<string> order_str = str_util::Split(visible_device_list, ',');
-    for (int i = 0; i < order_str.size(); ++i) {
-      const string& gpu_id_str = order_str[i];
-      int32 gpu_id;
-      if (!strings::safe_strto32(gpu_id_str, &gpu_id)) {
-        return errors::InvalidArgument(
-            "Could not parse entry in 'visible_device_list': '", gpu_id_str,
-            "'.  visible_device_list = ", visible_device_list);
-      }
-
-      if (gpu_id < 0 || gpu_id >= gpu_manager->VisibleDeviceCount()) {
-        return errors::InvalidArgument(
-            "'visible_device_list' listed an invalid GPU id '", gpu_id,
-            "' but visible device count is ",
-            gpu_manager->VisibleDeviceCount());
-      }
-
-      visible_gpu_order.push_back(gpu_id);
-    }
-  }
-
-  // Validate no repeats.
-  std::set<int> visible_device_set(visible_gpu_order.begin(),
-                                   visible_gpu_order.end());
-  if (visible_device_set.size() != visible_gpu_order.size()) {
-    return errors::InvalidArgument(
-        "visible_device_list contained "
-        "a duplicate entry: ",
-        visible_device_list);
-  }
-
   bool new_gpu_found = false;
   for (int i = 0; i < visible_gpu_order.size(); ++i) {
-    int gpu_id = visible_gpu_order[i];
+    const CudaGpuId cuda_gpu_id = visible_gpu_order[i];
 
-    // Only perform this once per visible gpu id.
-    if (visible_gpu_initialized_[gpu_id]) {
+    // Only perform this once per visible cuda gpu id.
+    if (visible_gpu_initialized_[cuda_gpu_id.value()]) {
       continue;
     }
 
-    visible_gpu_initialized_[gpu_id] = true;
+    visible_gpu_initialized_[cuda_gpu_id.value()] = true;
     new_gpu_found = true;
 
-    auto executor = gpu_manager->ExecutorForDevice(gpu_id);
+    auto executor = GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, cuda_gpu_id);
     if (!executor.ok()) {
       return StreamExecutorUtil::ConvertStatus(executor.status());
     }
@@ -1071,27 +1340,6 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
   if (new_gpu_found && visible_gpu_order.size() > 1) {
     // Enable peer access
     TF_RETURN_IF_ERROR(EnablePeerAccess(gpu_manager, visible_gpu_order));
-
-    // Print out a matrix showing which devices can DMA to one
-    // another.
-    LOG(INFO) << "Device peer to peer matrix";
-    auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order);
-    string line_buf = "DMA: ";
-    for (int i = 0; i < visible_gpu_order.size(); ++i) {
-      strings::StrAppend(&line_buf, visible_gpu_order[i], " ");
-    }
-    LOG(INFO) << line_buf;
-    for (int i = 0; i < visible_gpu_order.size(); ++i) {
-      line_buf = strings::StrCat(visible_gpu_order[i], ":   ");
-      for (int j = 0; j < visible_gpu_order.size(); ++j) {
-        if ((*access_map)[{i, j}]) {
-          line_buf.append("Y ");
-        } else {
-          line_buf.append("N ");
-        }
-      }
-      LOG(INFO) << line_buf;
-    }
   }
 
   auto cuda_supported_capabilities = GetSupportedCudaComputeCapabilities();
@@ -1107,9 +1355,13 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
 
   // Filter out devices that don't have the right capability or power.
   for (int i = 0; i < visible_gpu_order.size(); ++i) {
-    const int32 visible_gpu_id = visible_gpu_order[i];
-    auto exec_status = gpu_manager->ExecutorForDevice(visible_gpu_id);
+    const CudaGpuId visible_gpu_id = visible_gpu_order[i];
+    auto exec_status =
+        GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, visible_gpu_id);
     if (!exec_status.ok()) {
+      LOG(INFO) << "Ignoring visible gpu device " << visible_gpu_id
+                << " whose executor is in invalid state: "
+                << exec_status.status().ToString();
       continue;
     }
     gpu::StreamExecutor* se = exec_status.ValueOrDie();
@@ -1117,6 +1369,10 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
     CudaVersion device_capability;
     if (!desc.cuda_compute_capability(&device_capability.major_part,
                                       &device_capability.minor_part)) {
+      LOG(INFO) << "Ignoring visible gpu device "
+                << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
+                << ") "
+                << "whose CUDA compute capability is not available.";
       continue;
     }
     // Only GPUs with no less than the minimum supported compute capability is
@@ -1136,7 +1392,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
     // multiprocessors. If the TF_MIN_GPU_MULTIPROCESSOR_COUNT environment
     // variable is set, its value will be used to filter out GPUs.
     if (desc.core_count() < min_gpu_core_count) {
-      LOG(INFO) << "Ignoring gpu device "
+      LOG(INFO) << "Ignoring visible gpu device "
                 << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
                 << ") "
                 << "with Cuda multiprocessor count: " << desc.core_count()
@@ -1145,12 +1401,14 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
                    "TF_MIN_GPU_MULTIPROCESSOR_COUNT.";
       continue;
     }
-
-    size_t new_id = ids->size();
     ids->push_back(visible_gpu_id);
-
-    LOG(INFO) << "Creating TensorFlow device (/device:GPU:" << new_id << ") -> "
-              << "(" << GetShortDeviceDescription(visible_gpu_id, desc) << ")";
+  }
+  if (!ids->empty()) {
+    std::vector<int> raw_ids(ids->size());
+    std::transform(ids->begin(), ids->end(), raw_ids.begin(),
+                   [](CudaGpuId id) -> int { return id.value(); });
+    LOG(INFO) << "Adding visible gpu devices: "
+              << str_util::Join(raw_ids, ", ");
   }
 
   return Status::OK();
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index 442496437af5f4796f6d216f7c688d31f2f457d7..c88daa8ff87589a3fc48f4c7693d073d6adf9a5a 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -28,6 +28,9 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu_device_context.h"
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/framework/allocator.h"
@@ -45,10 +48,10 @@ namespace tensorflow {
 class BaseGPUDevice : public LocalDevice {
  public:
   BaseGPUDevice(const SessionOptions& options, const string& name,
-                Bytes memory_limit, const DeviceLocality& locality, int gpu_id,
-                const string& physical_device_desc, Allocator* gpu_allocator,
-                Allocator* cpu_allocator, bool sync_every_op,
-                int32 max_streams);
+                Bytes memory_limit, const DeviceLocality& locality,
+                TfGpuId tf_gpu_id, const string& physical_device_desc,
+                Allocator* gpu_allocator, Allocator* cpu_allocator,
+                bool sync_every_op, int32 max_streams);
 
   ~BaseGPUDevice() override;
 
@@ -84,9 +87,9 @@ class BaseGPUDevice : public LocalDevice {
   void ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
                              DeviceContext* dc, Allocator* allocator) override;
 
-  // Returns the id of this device within the native driver system; e.g., for
-  // CUDA this is the ordinal of the GPU within the system.
-  int gpu_id() const { return gpu_id_; }
+  // Returns the CUDA GPU id of this device within the native driver system;
+  // e.g., for CUDA this is the ordinal of the GPU within the system.
+  int gpu_id() const { return GpuIdManager::TfToCudaGpuId(tf_gpu_id_).value(); }
 
   // The executor that provides control for the device; e.g., for CUDA this
   // corresponds to the cuda context.
@@ -112,10 +115,11 @@ class BaseGPUDevice : public LocalDevice {
   std::vector<GPUDeviceContext*> device_contexts_;
   GpuDeviceInfo* gpu_device_info_ = nullptr;
   mutex trace_mu_;
-  int gpu_id_ = -1;
+  TfGpuId tf_gpu_id_;
   const bool sync_every_op_ = false;
   const int32 max_streams_;
   std::unique_ptr<EventMgr> em_;
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
 
   void ReinitializeDevice(OpKernelContext* context, PerOpGpuDevice* device,
                           int stream_id, Allocator* allocator);
@@ -137,26 +141,64 @@ class BaseGPUDeviceFactory : public DeviceFactory {
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
                        std::vector<Device*>* devices) override;
 
+  struct InterconnectMap {
+    // Name of interconnect technology, if known.
+    string name;
+    // If possible, strength should approximate Gb/sec bandwidth rate.
+    // Where architecture-specific subclassing is not done that won't
+    // always be possible.  The minimum expectation is that
+    // faster links should have a higher value than slower links.
+    int32 strength;
+    static const int kSameDeviceStrength;
+    static const int kStreamExecutorStrength;
+    std::set<std::pair<CudaGpuId, CudaGpuId>> directed_links;
+  };
+
+ protected:
+  // Populates *maps with interconnect maps for all local direct access
+  // pathways between GPUs.
+  virtual Status GetInterconnectMaps(
+      const std::vector<CudaGpuId>& visible_gpu_order,
+      gpu::Platform* gpu_manager, std::vector<InterconnectMap>* maps);
+
+  struct TfGpuIdHash {
+    std::size_t operator()(const TfGpuId& id) const noexcept {
+      return std::hash<int>{}(id.value());
+    }
+  };
+  typedef std::unordered_map<TfGpuId, DeviceLocality, TfGpuIdHash> LocalityMap;
+  // Populates *localities with the DeviceLocality descriptor for
+  // every TfGpuId.
+  virtual Status GetDeviceLocalities(
+      int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
+      LocalityMap* localities);
+
  private:
-  Status CreateGPUDevice(const SessionOptions& options, const string& name,
-                         int gpu_id, BaseGPUDevice** out_device);
+  // Creates a BaseGPUDevice associated with 'tf_gpu_id', allocates (strictly)
+  // 'memory_limit' bytes of GPU memory to it, and adds it to the 'devices'
+  // vector.
+  Status CreateGPUDevice(const SessionOptions& options,
+                         const string& name_prefix, TfGpuId tf_gpu_id,
+                         int64 memory_limit, const DeviceLocality& dev_locality,
+                         std::vector<Device*>* devices);
 
   virtual BaseGPUDevice* CreateGPUDevice(const SessionOptions& options,
                                          const string& name, Bytes memory_limit,
-                                         const DeviceLocality& locality,
-                                         int gpu_id,
+                                         const DeviceLocality& dev_locality,
+                                         TfGpuId tf_gpu_id,
                                          const string& physical_device_desc,
                                          Allocator* gpu_allocator,
                                          Allocator* cpu_allocator) = 0;
 
-  // Returns into 'ids' the list of valid GPU ids, in the order that
-  // they should map to logical gpu ids "/device:GPU:0", "/device:GPU:1", etc, based
-  // upon 'visible_device_list', a comma-separated list of 'visible
-  // gpu ids'.
-  Status GetValidDeviceIds(const string& visible_device_list,
-                           std::vector<int>* ids);
+  // Returns into 'ids' the list of valid CUDA GPU ids, in the order that
+  // they should map to TF GPU ids "/device:GPU:0", "/device:GPU:1", etc,
+  // based upon 'visible_gpu_order' which was generated by parsing
+  // GPUOptions::visible_device_list which is a comma-separated list of CUDA GPU
+  // ids.
+  Status GetValidDeviceIds(const std::vector<CudaGpuId>& visible_gpu_order,
+                           std::vector<CudaGpuId>* ids);
 
-  // visible_gpu_initialized_[gpu_id] is true if visible GPU gpu_id
+  // visible_gpu_initialized_[cuda_gpu_id] is true if visible GPU cuda_gpu_id
   // has been initialized by the process.
   std::unordered_map<int, bool> visible_gpu_initialized_;
 };
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
index 63ac3daba142b0076407110509034a512b00ff37..9a000749c6e677743ea700eb941f4147646ddc55 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #define EIGEN_USE_GPU
 
 #include "tensorflow/core/common_runtime/gpu/gpu_device.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/process_state.h"
 #include "tensorflow/core/common_runtime/threadpool_device.h"
 
@@ -26,10 +27,10 @@ namespace tensorflow {
 class GPUDevice : public BaseGPUDevice {
  public:
   GPUDevice(const SessionOptions& options, const string& name,
-            Bytes memory_limit, const DeviceLocality& locality, int gpu_id,
-            const string& physical_device_desc, Allocator* gpu_allocator,
-            Allocator* cpu_allocator)
-      : BaseGPUDevice(options, name, memory_limit, locality, gpu_id,
+            Bytes memory_limit, const DeviceLocality& locality,
+            TfGpuId tf_gpu_id, const string& physical_device_desc,
+            Allocator* gpu_allocator, Allocator* cpu_allocator)
+      : BaseGPUDevice(options, name, memory_limit, locality, tf_gpu_id,
                       physical_device_desc, gpu_allocator, cpu_allocator,
                       false /* sync every op */, 1 /* max_streams */) {
     if (options.config.has_gpu_options()) {
@@ -59,11 +60,12 @@ class GPUDeviceFactory : public BaseGPUDeviceFactory {
  private:
   BaseGPUDevice* CreateGPUDevice(const SessionOptions& options,
                                  const string& name, Bytes memory_limit,
-                                 const DeviceLocality& locality, int gpu_id,
+                                 const DeviceLocality& locality,
+                                 TfGpuId tf_gpu_id,
                                  const string& physical_device_desc,
                                  Allocator* gpu_allocator,
                                  Allocator* cpu_allocator) override {
-    return new GPUDevice(options, name, memory_limit, locality, gpu_id,
+    return new GPUDevice(options, name, memory_limit, locality, tf_gpu_id,
                          physical_device_desc, gpu_allocator, cpu_allocator);
   }
 };
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_on_non_gpu_machine_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_on_non_gpu_machine_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..75be6d60b86af101fb9de7497490e72c523d632b
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_on_non_gpu_machine_test.cc
@@ -0,0 +1,54 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/test.h"
+
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/common_runtime/gpu/gpu_device.h"
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/platform.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+TEST(GPUDeviceOnNonGPUMachineTest, CreateGPUDevicesOnNonGPUMachine) {
+  SessionOptions opts;
+  std::vector<tensorflow::Device*> devices;
+  TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, "/job:localhost/replica:0/task:0", &devices));
+  EXPECT_TRUE(devices.empty());
+}
+
+}  // namespace
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
+
+int main(int argc, char** argv) {
+#if GOOGLE_CUDA
+  // Sets CUDA_VISIBLE_DEVICES to empty string to simulate non-gpu environment.
+  setenv("CUDA_VISIBLE_DEVICES", "", 1);
+#endif  // GOOGLE_CUDA
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b56823204afe8ee52e0ea376b1a79d91d6932fa0
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
@@ -0,0 +1,201 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/common_runtime/gpu/gpu_device.h"
+
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+const char* kDeviceNamePrefix = "/job:localhost/replica:0/task:0";
+
+static SessionOptions MakeSessionOptions(
+    const string& visible_device_list = "",
+    double per_process_gpu_memory_fraction = 0, int gpu_device_count = 1,
+    const std::vector<std::vector<float>>& memory_limit_mb = {}) {
+  SessionOptions options;
+  ConfigProto* config = &options.config;
+  (*config->mutable_device_count())["GPU"] = gpu_device_count;
+  GPUOptions* gpu_options = config->mutable_gpu_options();
+  gpu_options->set_visible_device_list(visible_device_list);
+  gpu_options->set_per_process_gpu_memory_fraction(
+      per_process_gpu_memory_fraction);
+  for (const auto& v : memory_limit_mb) {
+    auto virtual_devices =
+        gpu_options->mutable_experimental()->add_virtual_devices();
+    for (float mb : v) {
+      virtual_devices->add_memory_limit_mb(mb);
+    }
+  }
+  return options;
+}
+
+static bool StartsWith(const string& lhs, const string& rhs) {
+  if (rhs.length() > lhs.length()) return false;
+  return lhs.substr(0, rhs.length()) == rhs;
+}
+
+TEST(GPUDeviceTest, FailedToParseVisibleDeviceList) {
+  SessionOptions opts = MakeSessionOptions("0,abc");
+  std::vector<tensorflow::Device*> devices;
+  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices);
+  EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+  EXPECT_TRUE(StartsWith(status.error_message(), "Could not parse entry"))
+      << status;
+}
+
+TEST(GPUDeviceTest, InvalidGpuId) {
+  SessionOptions opts = MakeSessionOptions("100");
+  std::vector<tensorflow::Device*> devices;
+  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices);
+  EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+  EXPECT_TRUE(StartsWith(status.error_message(),
+                         "'visible_device_list' listed an invalid GPU id"))
+      << status;
+}
+
+TEST(GPUDeviceTest, DuplicateEntryInVisibleDeviceList) {
+  SessionOptions opts = MakeSessionOptions("0,0");
+  std::vector<tensorflow::Device*> devices;
+  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices);
+  EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+  EXPECT_TRUE(StartsWith(status.error_message(),
+                         "visible_device_list contained a duplicate entry"))
+      << status;
+}
+
+TEST(GPUDeviceTest, VirtualDeviceConfigConflictsWithMemoryFractionSettings) {
+  SessionOptions opts = MakeSessionOptions("0", 0.1, 1, {{}});
+  std::vector<tensorflow::Device*> devices;
+  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices);
+  EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+  EXPECT_TRUE(StartsWith(status.error_message(),
+                         "It's invalid to set per_process_gpu_memory_fraction"))
+      << status;
+}
+
+TEST(GPUDeviceTest, GpuDeviceCountTooSmall) {
+  // device_count is 0, but with one entry in visible_device_list and one
+  // (empty) VirtualDevices messages.
+  SessionOptions opts = MakeSessionOptions("0", 0, 0, {{}});
+  std::vector<tensorflow::Device*> devices;
+  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices);
+  EXPECT_EQ(status.code(), error::UNKNOWN);
+  EXPECT_TRUE(StartsWith(status.error_message(),
+                         "Not enough GPUs to create virtual devices."))
+      << status;
+}
+
+TEST(GPUDeviceTest, NotEnoughGpuInVisibleDeviceList) {
+  // Single entry in visible_device_list with two (empty) VirtualDevices
+  // messages.
+  SessionOptions opts = MakeSessionOptions("0", 0, 8, {{}, {}});
+  std::vector<tensorflow::Device*> devices;
+  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices);
+  EXPECT_EQ(status.code(), error::UNKNOWN);
+  EXPECT_TRUE(StartsWith(status.error_message(),
+                         "Not enough GPUs to create virtual devices."))
+      << status;
+}
+
+TEST(GPUDeviceTest, VirtualDeviceConfigConflictsWithVisibleDeviceList) {
+  // This test requires at least two visible GPU hardware.
+  if (GPUMachineManager()->VisibleDeviceCount() < 2) return;
+  // Three entries in visible_device_list with two (empty) VirtualDevices
+  // messages.
+  SessionOptions opts = MakeSessionOptions("0,1", 0, 8, {{}});
+  std::vector<tensorflow::Device*> devices;
+  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices);
+  EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+  EXPECT_TRUE(StartsWith(status.error_message(),
+                         "The number of GPUs in visible_device_list doesn't "
+                         "match the number of elements in the virtual_devices "
+                         "list."))
+      << status;
+}
+
+TEST(GPUDeviceTest, EmptyVirtualDeviceConfig) {
+  // It'll create single virtual device when the virtual device config is empty.
+  SessionOptions opts = MakeSessionOptions("0");
+  std::vector<tensorflow::Device*> devices;
+  TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices));
+  EXPECT_EQ(1, devices.size());
+  EXPECT_GE(devices[0]->attributes().memory_limit(), 0);
+  for (auto d : devices) delete d;
+}
+
+TEST(GPUDeviceTest, SingleVirtualDeviceWithNoMemoryLimit) {
+  // It'll create single virtual device for the gpu in question when
+  // memory_limit_mb is unset.
+  SessionOptions opts = MakeSessionOptions("0", 0, 1, {{}});
+  std::vector<tensorflow::Device*> devices;
+  TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices));
+  EXPECT_EQ(1, devices.size());
+  EXPECT_GE(devices[0]->attributes().memory_limit(), 0);
+  for (auto d : devices) delete d;
+}
+
+TEST(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimit) {
+  SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123}});
+  std::vector<tensorflow::Device*> devices;
+  TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices));
+  EXPECT_EQ(1, devices.size());
+  EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit());
+  for (auto d : devices) delete d;
+}
+
+TEST(GPUDeviceTest, MultipleVirtualDevices) {
+  SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}});
+  std::vector<tensorflow::Device*> devices;
+  TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, kDeviceNamePrefix, &devices));
+  EXPECT_EQ(2, devices.size());
+  EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit());
+  EXPECT_EQ(456 << 20, devices[1]->attributes().memory_limit());
+  ASSERT_EQ(1, devices[0]->attributes().locality().links().link_size());
+  ASSERT_EQ(1, devices[1]->attributes().locality().links().link_size());
+  EXPECT_EQ(1, devices[0]->attributes().locality().links().link(0).device_id());
+  EXPECT_EQ("SAME_DEVICE",
+            devices[0]->attributes().locality().links().link(0).type());
+  EXPECT_EQ(BaseGPUDeviceFactory::InterconnectMap::kSameDeviceStrength,
+            devices[0]->attributes().locality().links().link(0).strength());
+  EXPECT_EQ(0, devices[1]->attributes().locality().links().link(0).device_id());
+  EXPECT_EQ("SAME_DEVICE",
+            devices[1]->attributes().locality().links().link(0).type());
+  EXPECT_EQ(BaseGPUDeviceFactory::InterconnectMap::kSameDeviceStrength,
+            devices[1]->attributes().locality().links().link(0).strength());
+  for (auto d : devices) delete d;
+}
+
+}  // namespace
+}  // namespace tensorflow
+
+#endif
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id.h b/tensorflow/core/common_runtime/gpu/gpu_id.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a6caea2967dcd0a1d3d6550aa428a882408ea17
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_id.h
@@ -0,0 +1,88 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_H_
+
+#include "tensorflow/core/lib/gtl/int_type.h"
+
+namespace tensorflow {
+
+// There are three types of GPU ids:
+// - *physical* GPU id: this is the integer index of a GPU hardware in the
+//   physical machine, it can be filtered by CUDA environment variable
+//   CUDA_VISIBLE_DEVICES. Note that this id is not visible to Tensorflow, but
+//   result after filtering by CUDA_VISIBLE_DEVICES is visible to TF and is
+//   called CUDA GPU id as below. See
+//   http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars
+//   for more details.
+// - CUDA GPU id (also called *visible* GPU id in
+//   third_party/tensorflow/core/protobuf/config.proto): this is the id that is
+//   visible to Tensorflow after filtering by CUDA_VISIBLE_DEVICES, and is
+//   generated by the CUDA GPU driver. It starts from 0 and is used for CUDA API
+//   calls like cuDeviceGet().
+// - TF GPU id (also called *virtual* GPU id in
+//   third_party/tensorflow/core/protobuf/config.proto): this is the id that
+//   Tensorflow generates and exposes to its users. It is the id in the <id>
+//   field of the device name "/device:GPU:<id>", and is also the identifier of
+//   a BaseGPUDevice. Note that the configuration allows us to create multiple
+//   BaseGPUDevice per GPU hardware in order to use multi CUDA streams on the
+//   hardware, so the mapping between TF GPU id and CUDA GPU id is not a 1:1
+//   mapping, see the example below.
+//
+// For example, assuming that in the machine we have GPU device with index 0, 1,
+// 2 and 3 (physical GPU id). Setting "CUDA_VISIBLE_DEVICES=1,2,3" will create
+// the following mapping between CUDA GPU id and physical GPU id:
+//
+//        CUDA GPU id ->  physical GPU id
+//                 0  ->  1
+//                 1  ->  2
+//                 2  ->  3
+//
+// Note that physical GPU id 0 is invisible to TF so there is no mapping entry
+// for it.
+//
+// Assuming we configure the Session to create one BaseGPUDevice per GPU
+// hardware, then setting GPUOptions::visible_device_list to "2,0" will create
+// the following mappting between TF GPU id and CUDA GPU id:
+//
+//                  TF GPU id  ->  CUDA GPU ID
+//      0 (i.e. /device:GPU:0) ->  2
+//      1 (i.e. /device:GPU:1) ->  0
+//
+// Note that CUDA GPU id 1 is filtered out by GPUOptions::visible_device_list,
+// so it won't be used by the TF process.
+//
+// On the other hand, if we configure it to create 2 BaseGPUDevice per GPU
+// hardware, then setting GPUOptions::visible_device_list to "2,0" will create
+// the following mappting between TF GPU id and CUDA GPU id:
+//
+//                  TF GPU id  ->  CUDA GPU ID
+//      0 (i.e. /device:GPU:0) ->  2
+//      1 (i.e. /device:GPU:1) ->  2
+//      2 (i.e. /device:GPU:2) ->  0
+//      3 (i.e. /device:GPU:3) ->  0
+//
+// We create strong-typed integer classes for both TF GPU id and CUDA GPU id to
+// minimize programming errors and improve code readability. Except for the
+// StreamExecutor interface (as we don't change its API), whenever we need a
+// TF GPU id (or CUDA GPU id) we should use TfGpuId (or CudaGpuId) instead of a
+// raw integer.
+TF_LIB_GTL_DEFINE_INT_TYPE(TfGpuId, int32);
+TF_LIB_GTL_DEFINE_INT_TYPE(CudaGpuId, int32);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_manager.cc b/tensorflow/core/common_runtime/gpu/gpu_id_manager.cc
new file mode 100644
index 0000000000000000000000000000000000000000..207afdca75642b14c1617c8abae4fd5e9916f020
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_id_manager.cc
@@ -0,0 +1,74 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
+
+#include <unordered_map>
+
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+namespace {
+// Manages the map between TfGpuId and CUDA GPU id.
+class TfToCudaGpuIdMap {
+ public:
+  static TfToCudaGpuIdMap* singleton() {
+    static auto* manager = new TfToCudaGpuIdMap;
+    return manager;
+  }
+
+  void InsertOrDie(TfGpuId tf_gpu_id, CudaGpuId cuda_gpu_id)
+      LOCKS_EXCLUDED(mu_) {
+    std::pair<IdMapType::iterator, bool> result;
+    {
+      mutex_lock lock(mu_);
+      result = id_map_.insert({tf_gpu_id.value(), cuda_gpu_id.value()});
+    }
+    if (!result.second) {
+      CHECK_EQ(cuda_gpu_id.value(), result.first->second)
+          << "Mapping the same TfGpuId to a different CUDA GPU id."
+          << " TfGpuId: " << tf_gpu_id
+          << " Existing mapped CUDA GPU id: " << result.first->second
+          << " CUDA GPU id being tried to map to: " << cuda_gpu_id;
+    }
+  }
+
+  int32 FindOrDie(TfGpuId tf_gpu_id) const LOCKS_EXCLUDED(mu_) {
+    mutex_lock lock(mu_);
+    auto result = id_map_.find(tf_gpu_id.value());
+    CHECK(result != id_map_.end())
+        << "Could not find the mapping for TfGpuId: " << tf_gpu_id;
+    return result->second;
+  }
+
+ private:
+  using IdMapType = std::unordered_map<int32, int32>;
+  mutable mutex mu_;
+  IdMapType id_map_ GUARDED_BY(mu_);
+};
+}  // namespace
+
+void GpuIdManager::InsertTfCudaGpuIdPair(TfGpuId tf_gpu_id,
+                                         CudaGpuId cuda_gpu_id) {
+  TfToCudaGpuIdMap::singleton()->InsertOrDie(tf_gpu_id, cuda_gpu_id);
+}
+
+CudaGpuId GpuIdManager::TfToCudaGpuId(TfGpuId tf_gpu_id) {
+  return CudaGpuId(TfToCudaGpuIdMap::singleton()->FindOrDie(tf_gpu_id));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_manager.h b/tensorflow/core/common_runtime/gpu/gpu_id_manager.h
new file mode 100644
index 0000000000000000000000000000000000000000..33925d8c36f44a9d2c7abc8f2801f3f203bcb982
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_id_manager.h
@@ -0,0 +1,33 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_MANAGER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_MANAGER_H_
+
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+
+namespace tensorflow {
+
+// Class that manages the translation between Tensorflow GPU ids and CUDA GPU
+// ids.
+class GpuIdManager {
+ public:
+  static void InsertTfCudaGpuIdPair(TfGpuId tf_gpu_id, CudaGpuId cuda_gpu_id);
+  static CudaGpuId TfToCudaGpuId(TfGpuId tf_gpu_id);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_MANAGER_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_manager_test.cc b/tensorflow/core/common_runtime/gpu/gpu_id_manager_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bdbd8d065b398159305504202ed342c08cc3ee7d
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_id_manager_test.cc
@@ -0,0 +1,55 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
+
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace test {
+
+TEST(GpuIdManagerTest, Basics) {
+  TfGpuId key_0(0);
+  CudaGpuId value_0(0);
+  GpuIdManager::InsertTfCudaGpuIdPair(key_0, value_0);
+  EXPECT_EQ(value_0, GpuIdManager::TfToCudaGpuId(key_0));
+
+  // Multiple calls to map the same value is ok.
+  GpuIdManager::InsertTfCudaGpuIdPair(key_0, value_0);
+  EXPECT_EQ(value_0, GpuIdManager::TfToCudaGpuId(key_0));
+
+  // Map a different TfGpuId to a different value.
+  TfGpuId key_1(3);
+  CudaGpuId value_1(2);
+  GpuIdManager::InsertTfCudaGpuIdPair(key_1, value_1);
+  EXPECT_EQ(value_1, GpuIdManager::TfToCudaGpuId(key_1));
+
+  // Mapping a different TfGpuId to the same value is ok.
+  TfGpuId key_2(10);
+  GpuIdManager::InsertTfCudaGpuIdPair(key_2, value_1);
+  EXPECT_EQ(value_1, GpuIdManager::TfToCudaGpuId(key_2));
+
+  // Mapping the same TfGpuId to a different value will crash the program.
+  ASSERT_DEATH(GpuIdManager::InsertTfCudaGpuIdPair(key_2, value_0),
+               "Mapping the same TfGpuId to a different CUDA GPU id");
+
+  // Getting an nonexistent mapping will crash the program.
+  ASSERT_DEATH(GpuIdManager::TfToCudaGpuId(TfGpuId(100)),
+               "Could not find the mapping for TfGpuId");
+}
+
+}  // namespace test
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_id_utils.h b/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..2e90687fe8854460dc2ec683d8587ab2ceadf42e
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_id_utils.h
@@ -0,0 +1,59 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_UTILS_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_UTILS_H_
+
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/lib/gtl/int_type.h"
+#include "tensorflow/core/platform/stream_executor.h"
+
+namespace tensorflow {
+namespace gpu = ::perftools::gputools;
+
+// Utility methods for translation between Tensorflow GPU ids and CUDA GPU ids.
+class GpuIdUtil {
+ public:
+  // Convenient methods for getting the associated executor given a TfGpuId or
+  // CudaGpuId.
+  static gpu::port::StatusOr<gpu::StreamExecutor*> ExecutorForCudaGpuId(
+      gpu::Platform* gpu_manager, CudaGpuId cuda_gpu_id) {
+    return gpu_manager->ExecutorForDevice(cuda_gpu_id.value());
+  }
+  static gpu::port::StatusOr<gpu::StreamExecutor*> ExecutorForCudaGpuId(
+      CudaGpuId cuda_gpu_id) {
+    return ExecutorForCudaGpuId(GPUMachineManager(), cuda_gpu_id);
+  }
+  static gpu::port::StatusOr<gpu::StreamExecutor*> ExecutorForTfGpuId(
+      TfGpuId tf_gpu_id) {
+    return ExecutorForCudaGpuId(GpuIdManager::TfToCudaGpuId(tf_gpu_id));
+  }
+
+  // Verify that the cuda_gpu_id associated with a TfGpuId is legitimate.
+  static void CheckValidTfGpuId(TfGpuId tf_gpu_id) {
+    const CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
+    const int visible_device_count = GPUMachineManager()->VisibleDeviceCount();
+    CHECK_LT(cuda_gpu_id.value(), visible_device_count)
+        << "cuda_gpu_id is outside discovered device range."
+        << " TF GPU id: " << tf_gpu_id << " CUDA GPU id: " << cuda_gpu_id
+        << " visible device count: " << visible_device_count;
+  }
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_ID_UTILS_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.h
index 006b2ca44817a37dd7d88018d6f1edef18f07787..2d49a64c0fd93bde2f9ddf4503ea9adc97571b5d 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_MANAGED_ALLOCATOR_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_MANAGED_ALLOCATOR_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_MANAGED_ALLOCATOR_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_MANAGED_ALLOCATOR_H_
 
 #include "tensorflow/core/framework/allocator.h"
 
@@ -33,4 +33,4 @@ class GpuManagedAllocator : public Allocator {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_MANAGED_ALLOCATOR_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_MANAGED_ALLOCATOR_H_
diff --git a/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc b/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc
index 7763a4f2e6f50292e78b4d16d8d4a3ee84d4163b..2500425359c424fa479af6dd34d6a0312c404577 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc
@@ -108,7 +108,8 @@ TEST_F(GpuStreamUtilTest, StreamOverrides) {
   ops::_Recv(root.WithOpName("input"), DT_FLOAT, "input", "/cpu:0", 0,
              "/device:GPU:0");
   Output n = ops::MatMul(root, {}, {});
-  ops::_Send(root.WithOpName("output"), n, "output", "/device:GPU:0", 0, "/cpu:0");
+  ops::_Send(root.WithOpName("output"), n, "output", "/device:GPU:0", 0,
+             "/cpu:0");
   Graph g(OpRegistry::Global());
   TF_ASSERT_OK(root.ToGraph(&g));
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc
index 657bdf0601bcc721c36209060654a19c3b6afb8a..a0f5877d62f0c889c2a598b8e03771e4bb49e0a9 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc
@@ -352,11 +352,7 @@ Status GPUUtil::Sync(Device* gpu_device) {
   if (!dev_info) {
     return errors::Internal("Failed to find dest device GPUDeviceInfo");
   }
-  dev_info->stream->BlockHostUntilDone();
-  if (!dev_info->stream->ok()) {
-    return errors::Internal("GPU sync failed");
-  }
-  return Status::OK();
+  return dev_info->stream->BlockHostUntilDone();
 }
 
 Status GPUUtil::SyncAll(Device* gpu_device) {
diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc
index 0675dbf3fcdc772f4d45025d296eaddbf4397271..61013bd1acd254b6e927a8d41accaeda424d6ebc 100644
--- a/tensorflow/core/common_runtime/gpu/process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/process_state.cc
@@ -15,11 +15,15 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/gpu/process_state.h"
 
+#include <cstring>
 #include <vector>
 
 #include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/common_runtime/gpu/pool_allocator.h"
 #include "tensorflow/core/framework/allocator.h"
@@ -45,27 +49,19 @@ const bool FLAGS_brain_mem_reg_cuda_dma = true;
 // performance issues.
 const bool FLAGS_brain_gpu_record_mem_types = false;
 
-namespace gpu = ::perftools::gputools;
-
 namespace tensorflow {
-
 namespace {
+
 bool useCudaMallocAllocator() {
   const char* debug_allocator_str = std::getenv("TF_GPU_ALLOCATOR");
-  if (debug_allocator_str != nullptr &&
-      strcmp(debug_allocator_str, "cuda_malloc") == 0)
-    return true;
-  else
-    return false;
+  return debug_allocator_str != nullptr &&
+         std::strcmp(debug_allocator_str, "cuda_malloc") == 0;
 }
 
 bool useCudaMemoryGuardAllocator() {
   const char* debug_allocator_str = std::getenv("TF_GPU_ALLOCATOR");
-  if (debug_allocator_str != nullptr &&
-      strcmp(debug_allocator_str, "memory_guard") == 0)
-    return true;
-  else
-    return false;
+  return debug_allocator_str != nullptr &&
+         std::strcmp(debug_allocator_str, "memory_guard") == 0;
 }
 
 }  // namespace
@@ -93,8 +89,8 @@ ProcessState::~ProcessState() {
 }
 
 string ProcessState::MemDesc::DebugString() {
-  return strings::StrCat((loc == CPU ? "CPU " : "GPU "), dev_index, ", dma: ",
-                         gpu_registered, ", nic: ", nic_registered);
+  return strings::StrCat((loc == CPU ? "CPU " : "GPU "), dev_index,
+                         ", dma: ", gpu_registered, ", nic: ", nic_registered);
 }
 
 ProcessState::MemDesc ProcessState::PtrType(const void* ptr) {
@@ -107,23 +103,20 @@ ProcessState::MemDesc ProcessState::PtrType(const void* ptr) {
   return MemDesc();
 }
 
-Allocator* ProcessState::GetGPUAllocator(const GPUOptions& options, int gpu_id,
+Allocator* ProcessState::GetGPUAllocator(const GPUOptions& options,
+                                         TfGpuId tf_gpu_id,
                                          size_t total_bytes) {
 #if GOOGLE_CUDA
   const string& allocator_type = options.allocator_type();
   mutex_lock lock(mu_);
-  gpu::Platform* gpu_platform = GPUMachineManager();
+  GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
 
-  // Verify that gpu_id is legitimate.
-  CHECK_LT(gpu_id, gpu_platform->VisibleDeviceCount())
-      << "gpu_id is outside discovered device range";
-
-  if (gpu_id >= static_cast<int64>(gpu_allocators_.size())) {
-    gpu_allocators_.resize(gpu_id + 1);
-    if (FLAGS_brain_gpu_record_mem_types) gpu_al_.resize(gpu_id + 1);
+  if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
+    gpu_allocators_.resize(tf_gpu_id.value() + 1);
+    if (FLAGS_brain_gpu_record_mem_types) gpu_al_.resize(tf_gpu_id.value() + 1);
   }
 
-  if (gpu_allocators_[gpu_id] == nullptr) {
+  if (gpu_allocators_[tf_gpu_id.value()] == nullptr) {
     VisitableAllocator* gpu_allocator;
 
     // Validate allocator types.
@@ -132,45 +125,49 @@ Allocator* ProcessState::GetGPUAllocator(const GPUOptions& options, int gpu_id,
       return nullptr;
     }
 
-    gpu_allocator = new GPUBFCAllocator(gpu_id, total_bytes, options);
+    const CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
+    gpu_allocator =
+        new GPUBFCAllocator(cuda_gpu_id, total_bytes, options,
+                            strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));
 
     // If true, checks for memory overwrites by writing
     // distinctive patterns on both ends of allocated memory.
     if (useCudaMemoryGuardAllocator()) {
-      gpu_allocator = new GPUDebugAllocator(gpu_allocator, gpu_id);
-      gpu_allocator = new GPUNanResetAllocator(gpu_allocator, gpu_id);
+      gpu_allocator = new GPUDebugAllocator(gpu_allocator, cuda_gpu_id);
+      gpu_allocator = new GPUNanResetAllocator(gpu_allocator, cuda_gpu_id);
     } else if (useCudaMallocAllocator()) {
       // If true, passes all allocation requests through to cudaMalloc
       // useful for doing memory debugging with tools like cuda-memcheck
       // **WARNING** probably will not work in a multi-gpu scenario
-      gpu_allocator = new GPUcudaMallocAllocator(gpu_allocator, gpu_id);
+      gpu_allocator = new GPUcudaMallocAllocator(gpu_allocator, cuda_gpu_id);
     }
-    gpu_allocators_[gpu_id] = gpu_allocator;
+    gpu_allocators_[tf_gpu_id.value()] = gpu_allocator;
 
     // If there are any pending AllocVisitors for this bus, add
     // them now.
     gpu::StreamExecutor* se =
-        gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
+        GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
     int bus_id = se->GetDeviceDescription().numa_node();
     if (bus_id >= 0 && bus_id < static_cast<int64>(gpu_visitors_.size())) {
       for (const auto& v : gpu_visitors_[bus_id]) {
-        gpu_allocators_[gpu_id]->AddAllocVisitor(v);
+        gpu_allocator->AddAllocVisitor(v);
       }
     }
     if (FLAGS_brain_gpu_record_mem_types) {
       MemDesc md;
       md.loc = MemDesc::GPU;
-      md.dev_index = gpu_id;
+      md.dev_index = cuda_gpu_id.value();
       md.gpu_registered = false;
       md.nic_registered = true;
-      if (static_cast<int64>(gpu_al_.size()) <= gpu_id)
-        gpu_al_.resize(gpu_id + 1);
-      gpu_al_[gpu_id] = new internal::RecordingAllocator(
-          &mem_desc_map_, gpu_allocators_[gpu_id], md, &mu_);
+      if (static_cast<int64>(gpu_al_.size()) <= tf_gpu_id.value()) {
+        gpu_al_.resize(tf_gpu_id.value() + 1);
+      }
+      gpu_al_[tf_gpu_id.value()] = new internal::RecordingAllocator(
+          &mem_desc_map_, gpu_allocator, md, &mu_);
     }
   }
-  if (FLAGS_brain_gpu_record_mem_types) return gpu_al_[gpu_id];
-  return gpu_allocators_[gpu_id];
+  if (FLAGS_brain_gpu_record_mem_types) return gpu_al_[tf_gpu_id.value()];
+  return gpu_allocators_[tf_gpu_id.value()];
 #else
   LOG(FATAL) << "GPUAllocator unavailable. Not compiled with --config=cuda.";
   return nullptr;
@@ -234,8 +231,24 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
   // TODO(tucker): actually maintain separate CPUAllocators for
   // different numa_nodes.  For now, just one.
   numa_node = 0;
-  mutex_lock lock(mu_);
 
+  {
+    // Here we optimize the most common use case where cuda_host_allocators_
+    // and cuda_al_ have already been populated and since we're only reading
+    // these vectors, we can get by with a shared lock. In the slower case,
+    // we take a unique lock and populate these vectors.
+    tf_shared_lock lock(mu_);
+
+    if (FLAGS_brain_gpu_record_mem_types &&
+        static_cast<int>(cuda_al_.size()) > 0) {
+      return cuda_al_[0];
+    }
+    if (static_cast<int>(cuda_host_allocators_.size()) > numa_node) {
+      return cuda_host_allocators_[0];
+    }
+  }
+
+  mutex_lock lock(mu_);
   // Find the first valid StreamExecutor to request CUDA host memory
   // through, since any will work.
   //
@@ -246,7 +259,7 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
   gpu::StreamExecutor* se = nullptr;
   for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) {
     if (gpu_allocators_[i] != nullptr) {
-      se = GPUMachineManager()->ExecutorForDevice(i).ValueOrDie();
+      se = GpuIdUtil::ExecutorForTfGpuId(TfGpuId(i)).ValueOrDie();
       break;
     }
   }
@@ -290,14 +303,12 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
 void ProcessState::AddGPUAllocVisitor(int bus_id, AllocVisitor visitor) {
 #if GOOGLE_CUDA
   mutex_lock lock(mu_);
-  gpu::Platform* gpu_platform = GPUMachineManager();
-  for (int gpu_id = 0; gpu_id < static_cast<int64>(gpu_allocators_.size());
-       ++gpu_id) {
+  for (int i = 0; i < static_cast<int64>(gpu_allocators_.size()); ++i) {
     gpu::StreamExecutor* se =
-        gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
-    if (gpu_allocators_[gpu_id] &&
+        GpuIdUtil::ExecutorForTfGpuId(TfGpuId(i)).ValueOrDie();
+    if (gpu_allocators_[i] &&
         (se->GetDeviceDescription().numa_node() + 1) == bus_id) {
-      gpu_allocators_[gpu_id]->AddAllocVisitor(visitor);
+      gpu_allocators_[i]->AddAllocVisitor(visitor);
     }
   }
   while (bus_id >= static_cast<int64>(gpu_visitors_.size())) {
diff --git a/tensorflow/core/common_runtime/gpu/process_state.h b/tensorflow/core/common_runtime/gpu/process_state.h
index 319c508b92f539cdac04ff5acfa4740b0697bcd5..f6e234967306476542cec3038ea2e271cca2dc8c 100644
--- a/tensorflow/core/common_runtime/gpu/process_state.h
+++ b/tensorflow/core/common_runtime/gpu/process_state.h
@@ -17,9 +17,11 @@ limitations under the License.
 #define TENSORFLOW_COMMON_RUNTIME_GPU_PROCESS_STATE_H_
 
 #include <functional>
+#include <map>
 #include <unordered_map>
 #include <vector>
 
+#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
@@ -80,17 +82,17 @@ class ProcessState {
   //
   // 'total_bytes' is the total number of bytes that should be made
   // available to the allocator.  The first call to this function for
-  // a given gpu_id creates the allocator, so only the total_bytes
+  // a given tf_gpu_id creates the allocator, so only the total_bytes
   // used on that first call is used.
   //
   // "Allocator type" describes the type of algorithm to use for the
   // underlying allocator.  REQUIRES: Must be a valid type (see
   // config.proto for the list of supported strings.).
   //
-  // REQUIRES: gpu_id must be a valid ordinal for a GPU available in the
+  // REQUIRES: tf_gpu_id must be a valid id for a BaseGPUDevice available in the
   // current system environment.  Otherwise returns nullptr.
-  virtual Allocator* GetGPUAllocator(const GPUOptions& options, int gpu_id,
-                                     size_t total_bytes);
+  virtual Allocator* GetGPUAllocator(const GPUOptions& options,
+                                     TfGpuId tf_gpu_id, size_t total_bytes);
 
   virtual Allocator* GetCUDAHostAllocator(int numa_node);
 
@@ -153,9 +155,10 @@ class RecordingAllocator : public Allocator {
     a_->DeallocateRaw(p);
   }
   bool TracksAllocationSizes() override { return a_->TracksAllocationSizes(); }
-  size_t RequestedSize(void* p) override { return a_->RequestedSize(p); }
-  size_t AllocatedSize(void* p) override { return a_->AllocatedSize(p); }
-  void GetStats(AllocatorStats* stats) override { return a_->GetStats(stats); }
+  size_t RequestedSize(const void* p) override { return a_->RequestedSize(p); }
+  size_t AllocatedSize(const void* p) override { return a_->AllocatedSize(p); }
+  void GetStats(AllocatorStats* stats) override { a_->GetStats(stats); }
+  void ClearStats() override { a_->ClearStats(); }
   ProcessState::MDMap* mm_;  // not owned
   Allocator* a_;             // not owned
   ProcessState::MemDesc md_;
diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index 3b309e915cdd2c6d5eead9ed0312f3873bcf7335..33a5d60eb7ec4de829d3c0784f909ef42cf994d1 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -340,8 +340,11 @@ Status GraphExecutionState::OptimizeGraph(
     std::unordered_map<string, DeviceProperties> device_map;
     Device* cpu_device = nullptr;
     for (const auto& device : device_set_->devices()) {
-      device_map[device->name()] =
-          grappler::GetDeviceInfo(device->parsed_name());
+      DeviceProperties props = grappler::GetDeviceInfo(device->parsed_name());
+      if (props.type() == "UNKNOWN") {
+        continue;
+      }
+      device_map[device->name()] = props;
       if (device->parsed_name().id == 0 &&
           StringPiece(device->parsed_name().type) == "CPU" &&
           device->GetAllocator(AllocatorAttributes()) != nullptr) {
diff --git a/tensorflow/core/common_runtime/graph_execution_state.h b/tensorflow/core/common_runtime/graph_execution_state.h
index db2686ce2c45aa4c9997a624bb12720d63710b65..2312e1a89fd1fd5734fab4316c25ca2e39f16ae5 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.h
+++ b/tensorflow/core/common_runtime/graph_execution_state.h
@@ -139,9 +139,7 @@ class GraphExecutionState {
 
   // The graph returned by BuildGraph may contain only the pruned
   // graph, whereas some clients may want access to the full graph.
-  const Graph* full_graph() {
-    return graph_;
-  }
+  const Graph* full_graph() { return graph_; }
 
   // Returns the node with the given name, or null if it does not exist.
   const Node* get_node_by_name(const string& name) const {
diff --git a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc
index def185e52280bf004bf67cb1daef675c2f6ccff5..96ecfb41d4cae37112f73e6e60ece013a2a14bc0 100644
--- a/tensorflow/core/common_runtime/graph_optimizer.cc
+++ b/tensorflow/core/common_runtime/graph_optimizer.cc
@@ -37,7 +37,8 @@ void GraphOptimizer::Optimize(
     FunctionLibraryRuntime* runtime, Env* env, Device* device,
     std::unique_ptr<Graph>* graph,
     const std::unordered_map<string, std::vector<PartialTensorShape>>*
-        shape_map) {
+        shape_map,
+    const std::function<bool(const Node*)>& cse_consider_fn) {
   Graph* g = graph->get();
   DumpGraph("Initial", g);
 
@@ -80,7 +81,7 @@ void GraphOptimizer::Optimize(
       changed = true;
     }
     if (opts_.do_common_subexpression_elimination() &&
-        OptimizeCSE(g, nullptr)) {
+        OptimizeCSE(g, cse_consider_fn)) {
       DumpGraph("OptimizeCSE", g);
       changed = true;
     }
diff --git a/tensorflow/core/common_runtime/graph_optimizer.h b/tensorflow/core/common_runtime/graph_optimizer.h
index f5be1f870db711e0e0d64371c6595009cfef1092..80246281cde373863e4da1bb8d86bee39bfb9dfd 100644
--- a/tensorflow/core/common_runtime/graph_optimizer.h
+++ b/tensorflow/core/common_runtime/graph_optimizer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_OPTIMIZER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_OPTIMIZER_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_OPTIMIZER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_OPTIMIZER_H_
 
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/graph/graph.h"
@@ -42,11 +42,17 @@ class GraphOptimizer {
   // pass may replace a node with a different node of the same name that has a
   // different number of outputs, or outputs with different known shapes.
   // TODO(b/65453533) introduce a unique way to name nodes in a graph.
+  //
+  // If cse_consider_fn is not null then only nodes for which cse_consider_fn
+  // returns true will be considered for CSE.
   void Optimize(
       FunctionLibraryRuntime* runtime, Env* env, Device* device,
       std::unique_ptr<Graph>* graph,
       const std::unordered_map<string, std::vector<PartialTensorShape>>*
-          shape_map);
+          shape_map,
+      const std::function<bool(const Node*)>& cse_consider_fn = nullptr);
+
+  const OptimizerOptions& options() { return opts_; }
 
  private:
   OptimizerOptions opts_;
@@ -56,4 +62,4 @@ class GraphOptimizer {
 
 }  // end namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_OPTIMIZER_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_GRAPH_OPTIMIZER_H_
diff --git a/tensorflow/core/common_runtime/graph_runner.cc b/tensorflow/core/common_runtime/graph_runner.cc
index a21304f7ef843706d564bd3f3a511324fd3189d6..f1082a60030fb3c289de35b4cab397c527f8afca 100644
--- a/tensorflow/core/common_runtime/graph_runner.cc
+++ b/tensorflow/core/common_runtime/graph_runner.cc
@@ -156,21 +156,21 @@ Status GraphRunner::Run(Graph* graph, FunctionLibraryRuntime* function_library,
   // should not be running expensive operators.
   auto runner = [](Executor::Args::Closure c) { c(); };
 
-  // Take ownership and pass to NewLocalExecutor
-  Graph* g = graph_to_run.release();
-
   LocalExecutorParams params;
   // The ownership of the output tensors are bound to this device's lifetime.
   params.device = cpu_device_.get();
   params.function_library = function_library;
-  params.create_kernel = [this, g](const NodeDef& ndef, OpKernel** kernel) {
-    return CreateNonCachedKernel(cpu_device_.get(), nullptr, ndef,
-                                 g->versions().producer(), kernel);
+  const int producer = graph_to_run->versions().producer();
+  params.create_kernel = [this, producer](const NodeDef& ndef,
+                                          OpKernel** kernel) {
+    return CreateNonCachedKernel(cpu_device_.get(), nullptr, ndef, producer,
+                                 kernel);
   };
   params.delete_kernel = [](OpKernel* kernel) { delete kernel; };
 
   Executor* executor;
-  TF_RETURN_IF_ERROR(NewLocalExecutor(params, g, &executor));
+  TF_RETURN_IF_ERROR(
+      NewLocalExecutor(params, std::move(graph_to_run), &executor));
   std::unique_ptr<Executor> executor_unref(executor);
 
   Executor::Args args;
diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
index 420dfe338efb473e36eb02a757fa957d15ba64df..64d884947568381eb2e5f60ab181b3c8c709d53b 100644
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
@@ -39,6 +39,7 @@ limitations under the License.
 namespace tensorflow {
 namespace test {
 
+// TODO(hongm): Convert `g` and `init` to using std::unique_ptr.
 Benchmark::Benchmark(const string& device, Graph* g,
                      const SessionOptions* options, Graph* init,
                      Rendezvous* rendez) {
@@ -85,7 +86,8 @@ Benchmark::Benchmark(const string& device, Graph* g,
 
   if (init) {
     Executor* init_exec;
-    TF_CHECK_OK(NewLocalExecutor(params, init, &init_exec));
+    TF_CHECK_OK(
+        NewLocalExecutor(params, std::unique_ptr<Graph>(init), &init_exec));
     Executor::Args args;
     args.rendezvous = rendez_;
     args.runner = runner;
@@ -93,7 +95,7 @@ Benchmark::Benchmark(const string& device, Graph* g,
     delete init_exec;
   }
 
-  TF_CHECK_OK(NewLocalExecutor(params, g, &exec_));
+  TF_CHECK_OK(NewLocalExecutor(params, std::unique_ptr<Graph>(g), &exec_));
 }
 
 Benchmark::~Benchmark() {
diff --git a/tensorflow/core/common_runtime/memory_types.cc b/tensorflow/core/common_runtime/memory_types.cc
index 76b926ba40053288360f0e4e6fe2a37bd44ff0b4..090a16ebeb10007261666aeb6491a1785dd2e5c4 100644
--- a/tensorflow/core/common_runtime/memory_types.cc
+++ b/tensorflow/core/common_runtime/memory_types.cc
@@ -47,7 +47,7 @@ struct EndpointEq {
 static Status ProcessMemoryTypes(
     const DeviceType& device_type, const Graph* g,
     const std::function<Status(const Edge*, MemoryType, MemoryType)>& fn) {
-  if (device_type != DEVICE_GPU && device_type != DEVICE_SYCL ) {
+  if (device_type != DEVICE_GPU && device_type != DEVICE_SYCL) {
     // On non-GPU and non-SYCL devices, HOST_MEMORY and DEVICE_MEMORY are always
     // compatible.
     return Status::OK();
diff --git a/tensorflow/core/common_runtime/memory_types.h b/tensorflow/core/common_runtime/memory_types.h
index fa0a7595f32ac8bb43010dcd3a407825ef79f618..f854acfdc55d66c1ffa93acc0954edae393b2359 100644
--- a/tensorflow/core/common_runtime/memory_types.h
+++ b/tensorflow/core/common_runtime/memory_types.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_MEMORY_TYPES_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_MEMORY_TYPES_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_MEMORY_TYPES_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_MEMORY_TYPES_H_
 
 #include "tensorflow/core/framework/memory_types.h"
 #include "tensorflow/core/graph/graph.h"
@@ -45,4 +45,4 @@ Status MemoryTypeForOutput(const DeviceType& device_type, const Graph* g,
 
 }  // end namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_MEMORY_TYPES_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_MEMORY_TYPES_H_
diff --git a/tensorflow/core/common_runtime/memory_types_test.cc b/tensorflow/core/common_runtime/memory_types_test.cc
index 2a834ddca4236c626c6252f63c97118e8e1f0bd0..a093585571994e8b161b46a7fc397cdc3cd4254c 100644
--- a/tensorflow/core/common_runtime/memory_types_test.cc
+++ b/tensorflow/core/common_runtime/memory_types_test.cc
@@ -36,7 +36,7 @@ TEST(MemoryTypeChecker, Int32OK) {
 #endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
   TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_SYCL, g));
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
   delete g;
 }
 
@@ -64,7 +64,7 @@ TEST(MemoryTypeChecker, Int32NotOk) {
   // But we can insert _HostSend/_HostRecv to ensure the invariant.
   TF_EXPECT_OK(EnsureMemoryTypes(DEVICE_SYCL, "/device:SYCL:0", g));
   TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_SYCL, g));
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
   delete g;
 }
 
@@ -91,7 +91,7 @@ TEST(MemoryTypeChecker, MemoryTypeForOutput) {
   TF_EXPECT_OK(MemoryTypeForOutput(DEVICE_SYCL, g, si, 0, &memory_type));
   // int Switch's output on GPU has HOST_MEMORY constraint.
   EXPECT_EQ(memory_type, HOST_MEMORY);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
   delete g;
 }
 
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index 63b74e8dbf1ac6482579e96fba32c952e0fe561e..0eb47f4e56dafff93736bd8c7112098fd11c0fed 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -46,7 +46,7 @@ class MklSubAllocator : public SubAllocator {
 
 /// CPU allocator for MKL that wraps BFC allocator and intercepts
 /// and redirects memory allocation calls from MKL.
-class MklCPUAllocator : public Allocator {
+class MklCPUAllocator : public VisitableAllocator {
  public:
   // Constructor and other standard functions
 
@@ -115,7 +115,17 @@ class MklCPUAllocator : public Allocator {
     allocator_->DeallocateRaw(ptr);
   }
 
-  void GetStats(AllocatorStats* stats) { return allocator_->GetStats(stats); }
+  void GetStats(AllocatorStats* stats) override { allocator_->GetStats(stats); }
+
+  void ClearStats() override { allocator_->ClearStats(); }
+
+  void AddAllocVisitor(Visitor visitor) override {
+    allocator_->AddAllocVisitor(visitor);
+  }
+
+  void AddFreeVisitor(Visitor visitor) override {
+    allocator_->AddFreeVisitor(visitor);
+  }
 
  private:
   // Hooks provided by this allocator for memory allocation routines from MKL
diff --git a/tensorflow/core/common_runtime/pending_counts.h b/tensorflow/core/common_runtime/pending_counts.h
index 9e39b6b7b93a8e35ad3b47c1c637f7d906649823..5e1925c40167fca0abe534e95bed487c77cd2215 100644
--- a/tensorflow/core/common_runtime/pending_counts.h
+++ b/tensorflow/core/common_runtime/pending_counts.h
@@ -1,5 +1,5 @@
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_PENDING_COUNTS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_PENDING_COUNTS_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PENDING_COUNTS_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_PENDING_COUNTS_H_
 
 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 
@@ -44,7 +44,7 @@ namespace tensorflow {
 
 //    PendingCounts counts(layout);
 //    ...
-//    counts.decrement_panding(h[id], 1);
+//    counts.decrement_pending(h[id], 1);
 class PendingCounts {
  public:
   // The state machine for a node's execution.
@@ -328,4 +328,4 @@ inline PendingCounts::Handle PendingCounts::Layout::CreateHandle(
 
 }  // end namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_PENDING_COUNTS_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_PENDING_COUNTS_H_
diff --git a/tensorflow/core/common_runtime/placer.cc b/tensorflow/core/common_runtime/placer.cc
index 54f082e823d463301fc5f437781d01ce96741568..a913f2075181a3896015579d79093395d67101ff 100644
--- a/tensorflow/core/common_runtime/placer.cc
+++ b/tensorflow/core/common_runtime/placer.cc
@@ -369,7 +369,8 @@ class ColocationGraph {
                 "Could not satisfy explicit device specification '",
                 node->requested_device(), "' because no supported kernel for ",
                 specified_device_name.type, " devices is available.",
-                debug_info);
+                debug_info, "\nRegistered kernels:\n",
+                KernelsRegisteredForOp(node->type_string()));
           } else {
             return errors::InvalidArgument(
                 "Could not satisfy explicit device specification '",
diff --git a/tensorflow/core/common_runtime/placer.h b/tensorflow/core/common_runtime/placer.h
index c5b76592e1b4b86863009ef42b7bb7106377d054..75dce7c7feb2269fc994cbb8c5efd4b3799e75dd 100644
--- a/tensorflow/core/common_runtime/placer.h
+++ b/tensorflow/core/common_runtime/placer.h
@@ -88,9 +88,9 @@ class Placer {
   void AssignAndLog(int assigned_device, Node* node) const;
   void LogDeviceAssignment(const Node* node) const;
 
-  Graph* const graph_;                           // Not owned.
-  const DeviceSet* const devices_;               // Not owned.
-  const SessionOptions* options_;                // Not owned.
+  Graph* const graph_;              // Not owned.
+  const DeviceSet* const devices_;  // Not owned.
+  const SessionOptions* options_;   // Not owned.
   const bool log_device_placement_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(Placer);
diff --git a/tensorflow/core/common_runtime/placer_test.cc b/tensorflow/core/common_runtime/placer_test.cc
index 5d87b1e279ab0390a642df8f285fd451803ba29a..098024d2195aad8ef651120181ab271be168f92a 100644
--- a/tensorflow/core/common_runtime/placer_test.cc
+++ b/tensorflow/core/common_runtime/placer_test.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/graph_def_builder_util.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -193,7 +194,7 @@ class PlacerTest : public ::testing::Test {
   // Builds the given graph, and (if successful) indexes the node
   // names for use in placement, and later lookup.
   Status BuildGraph(const GraphDefBuilder& builder, Graph* out_graph) {
-    TF_RETURN_IF_ERROR(builder.ToGraph(out_graph));
+    TF_RETURN_IF_ERROR(GraphDefBuilderToGraph(builder, out_graph));
     nodes_by_name_.clear();
     for (Node* node : out_graph->nodes()) {
       nodes_by_name_[node->name()] = node->id();
@@ -619,9 +620,9 @@ TEST_F(PlacerTest, TestReferenceConnectionIgnoreInfeasible) {
     Node* input = ops::SourceOp(
         "TestDevice",
         b.opts().WithName("in").WithDevice("/job:a/task:0/device:fakegpu:0"));
-    Node* var = ops::SourceOp("TestVariable",
-                              b.opts().WithName("var_0").WithDevice(
-                                  "/job:a/task:0/device:fakegpu:0"));
+    Node* var =
+        ops::SourceOp("TestVariable", b.opts().WithName("var_0").WithDevice(
+                                          "/job:a/task:0/device:fakegpu:0"));
 
     // This op is specified on CPU, but in practice will be ignored,
     // because the reference edges forces it on GPU.
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index 142ff2339b90d56381e211c4c7b73009c8134949..e205e34aa0f6afb1363d65bd23403d4b50f056eb 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -30,7 +30,10 @@ ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
     const FunctionLibraryDefinition* lib_def,
     const OptimizerOptions& optimizer_options,
     DistributedFunctionLibraryRuntime* parent)
-    : device_mgr_(device_mgr), lib_def_(lib_def), parent_(parent) {
+    : device_mgr_(device_mgr),
+      lib_def_(lib_def),
+      next_handle_(0),
+      parent_(parent) {
   if (device_mgr == nullptr) {
     flr_map_[nullptr] =
         NewFunctionLibraryRuntime(nullptr, env, nullptr, graph_def_version,
@@ -50,7 +53,10 @@ ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
     const OptimizerOptions& optimizer_options,
     CustomKernelCreator custom_kernel_creator,
     DistributedFunctionLibraryRuntime* parent)
-    : device_mgr_(device_mgr), lib_def_(lib_def), parent_(parent) {
+    : device_mgr_(device_mgr),
+      lib_def_(lib_def),
+      next_handle_(0),
+      parent_(parent) {
   if (device_mgr == nullptr) {
     flr_map_[nullptr] = NewFunctionLibraryRuntime(
         nullptr, env, nullptr, graph_def_version, lib_def, optimizer_options,
@@ -64,33 +70,6 @@ ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
   }
 }
 
-ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
-    const DeviceMgr* device_mgr, Env* env, int graph_def_version,
-    const FunctionLibraryDefinition* lib_def,
-    const OptimizerOptions& optimizer_options)
-    : ProcessFunctionLibraryRuntime(device_mgr, env, graph_def_version, lib_def,
-                                    optimizer_options,
-                                    nullptr /* cluster_flr */) {}
-
-ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
-    const DeviceMgr* device_mgr, Env* env, int graph_def_version,
-    const FunctionLibraryDefinition* lib_def,
-    const OptimizerOptions& optimizer_options,
-    CustomKernelCreator custom_kernel_creator)
-    : ProcessFunctionLibraryRuntime(
-          device_mgr, env, graph_def_version, lib_def, optimizer_options,
-          std::move(custom_kernel_creator), nullptr /* cluster_flr */) {}
-
-/* static */
-string ProcessFunctionLibraryRuntime::ObtainFunctionTarget(
-    const AttrSlice& attrs) {
-  const AttrValue* value;
-  if (!attrs.Find("_target", &value).ok()) {
-    return "";
-  }
-  return DeviceNameUtils::CanonicalizeDeviceName(value->s());
-}
-
 /* static */
 Status ProcessFunctionLibraryRuntime::SendTensors(
     const string& source_device, const string& target_device,
@@ -162,7 +141,7 @@ Status ProcessFunctionLibraryRuntime::GetDeviceContext(
 }
 
 FunctionLibraryRuntime* ProcessFunctionLibraryRuntime::GetFLR(
-    const string& device_name) {
+    const string& device_name) const {
   Device* device = nullptr;
   if (device_name != kDefaultFLRDevice) {
     if (!device_mgr_->LookupDevice(device_name, &device).ok()) {
@@ -185,30 +164,38 @@ FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::AddHandle(
   FunctionLibraryRuntime::Handle h =
       gtl::FindWithDefault(table_, function_key, kInvalidHandle);
   if (h != kInvalidHandle) {
-    return h;
+    if (function_data_.count(h) != 0) return h;
   }
-  h = function_data_.size();
-  function_data_.emplace_back(device_name, local_handle);
+  h = next_handle_;
+  function_data_.insert({h, FunctionData(device_name, local_handle)});
   table_[function_key] = h;
+  next_handle_++;
   return h;
 }
 
 FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::GetHandle(
     const string& function_key) const {
   mutex_lock l(mu_);
-  return gtl::FindWithDefault(table_, function_key, kInvalidHandle);
+  FunctionLibraryRuntime::Handle h =
+      gtl::FindWithDefault(table_, function_key, kInvalidHandle);
+  if (h != kInvalidHandle) {
+    if (function_data_.count(h) == 0) return kInvalidHandle;
+  }
+  return h;
 }
 
 bool ProcessFunctionLibraryRuntime::IsInstantiatedOnDevice(
     const string& device_name, FunctionLibraryRuntime::Handle handle) {
-  return GetHandleOnDevice(device_name, handle) != -1;
+  return GetHandleOnDevice(device_name, handle) != kInvalidHandle;
 }
 
 FunctionLibraryRuntime::LocalHandle
 ProcessFunctionLibraryRuntime::GetHandleOnDevice(
     const string& device_name, FunctionLibraryRuntime::Handle handle) {
   mutex_lock l(mu_);
-  CHECK_LE(handle, function_data_.size());
+  if (function_data_.count(handle) == 0) {
+    return kInvalidLocalHandle;
+  }
   const FunctionData& function_data = function_data_[handle];
   if (function_data.target_device != device_name) {
     return kInvalidLocalHandle;
@@ -219,32 +206,56 @@ ProcessFunctionLibraryRuntime::GetHandleOnDevice(
 string ProcessFunctionLibraryRuntime::GetDeviceName(
     FunctionLibraryRuntime::Handle handle) {
   mutex_lock l(mu_);
-  CHECK_LE(handle, function_data_.size());
+  CHECK_EQ(1, function_data_.count(handle));
   const FunctionData& function_data = function_data_[handle];
   return function_data.target_device;
 }
 
 Status ProcessFunctionLibraryRuntime::Instantiate(
     const string& function_name, AttrSlice attrs,
+    const FunctionLibraryRuntime::InstantiateOptions& options,
     FunctionLibraryRuntime::Handle* handle) {
   *handle = kInvalidHandle;
-  string target = ObtainFunctionTarget(attrs);
-  FunctionLibraryRuntime* flr = GetFLR(target);
+  FunctionLibraryRuntime* flr = GetFLR(options.target);
   if (flr != nullptr) {
-    return flr->Instantiate(function_name, attrs, handle);
+    return flr->Instantiate(function_name, attrs, options, handle);
   }
   if (parent_ == nullptr) {
     return errors::Internal(
-        "Currently don't support instantiating functions on device: ", target);
+        "Currently don't support instantiating functions on device: ",
+        options.target);
   }
   FunctionLibraryRuntime::Handle cluster_handle;
-  TF_RETURN_IF_ERROR(
-      parent_->Instantiate(function_name, *lib_def_, attrs, &cluster_handle));
+  TF_RETURN_IF_ERROR(parent_->Instantiate(function_name, *lib_def_, attrs,
+                                          options, &cluster_handle));
   string function_key = Canonicalize(function_name, attrs);
-  *handle = AddHandle(function_key, target, cluster_handle);
+  *handle = AddHandle(function_key, options.target, cluster_handle);
   return Status::OK();
 }
 
+Status ProcessFunctionLibraryRuntime::RemoveHandle(
+    FunctionLibraryRuntime::Handle handle) {
+  mutex_lock l(mu_);
+  function_data_.erase(handle);
+  return Status::OK();
+}
+
+Status ProcessFunctionLibraryRuntime::ReleaseHandle(
+    FunctionLibraryRuntime::Handle handle) {
+  FunctionLibraryRuntime* flr = nullptr;
+  string target_device;
+  {
+    mutex_lock l(mu_);
+    CHECK_EQ(1, function_data_.count(handle)) << " handle: " << handle;
+    target_device = function_data_[handle].target_device;
+  }
+  flr = GetFLR(target_device);
+  if (flr != nullptr) {
+    return flr->ReleaseHandle(handle);
+  }
+  return errors::InvalidArgument("Handle not found: ", handle);
+}
+
 void ProcessFunctionLibraryRuntime::Run(
     const FunctionLibraryRuntime::Options& opts,
     FunctionLibraryRuntime::Handle handle, gtl::ArraySlice<Tensor> args,
@@ -261,7 +272,10 @@ void ProcessFunctionLibraryRuntime::Run(
   FunctionLibraryRuntime::LocalHandle local_handle;
   {
     mutex_lock l(mu_);
-    CHECK_LE(handle, function_data_.size());
+    if (function_data_.count(handle) == 0) {
+      done(errors::NotFound("Handle: ", handle, " not found."));
+      return;
+    }
     target_device = function_data_[handle].target_device;
     local_handle = function_data_[handle].local_handle;
   }
@@ -319,4 +333,16 @@ void ProcessFunctionLibraryRuntime::Run(
   done(errors::Internal("Could not find device"));
 }
 
+Status ProcessFunctionLibraryRuntime::Clone(
+    Env* env, int graph_def_version, const OptimizerOptions& optimizer_options,
+    CustomKernelCreator custom_kernel_creator,
+    std::unique_ptr<FunctionLibraryDefinition>* out_lib_def,
+    std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr) {
+  out_lib_def->reset(new FunctionLibraryDefinition(*lib_def_));
+  out_pflr->reset(new ProcessFunctionLibraryRuntime(
+      device_mgr_, env, graph_def_version, out_lib_def->get(),
+      optimizer_options, std::move(custom_kernel_creator), parent_));
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h
index a267bc3601f990206f7fb5202f6186543e42eb19..0473e16d242814930a9de17c88d4851d0d73edbe 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.h
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_FUNCTION_LIBRARY_RUNTIME_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_FUNCTION_LIBRARY_RUNTIME_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_FUNCTION_LIBRARY_RUNTIME_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_FUNCTION_LIBRARY_RUNTIME_H_
 
 #include <unordered_map>
 
@@ -29,12 +29,13 @@ class ProcessFunctionLibraryRuntime {
   // Creates FunctionLibraryRuntime objects for each device in the provided
   // DeviceMgr. Caller needs to make sure that device_mgr, lib_def and parent
   // (if provided) outlive this object.
-  ProcessFunctionLibraryRuntime(const DeviceMgr* device_mgr, Env* env,
-                                int graph_def_version,
-                                const FunctionLibraryDefinition* lib_def,
-                                const OptimizerOptions& optimizer_options,
-                                DistributedFunctionLibraryRuntime* parent);
+  ProcessFunctionLibraryRuntime(
+      const DeviceMgr* device_mgr, Env* env, int graph_def_version,
+      const FunctionLibraryDefinition* lib_def,
+      const OptimizerOptions& optimizer_options,
+      DistributedFunctionLibraryRuntime* parent = nullptr);
 
+  // With `custom_kernel_creator`.
   ProcessFunctionLibraryRuntime(const DeviceMgr* device_mgr, Env* env,
                                 int graph_def_version,
                                 const FunctionLibraryDefinition* lib_def,
@@ -42,22 +43,6 @@ class ProcessFunctionLibraryRuntime {
                                 CustomKernelCreator custom_kernel_creator,
                                 DistributedFunctionLibraryRuntime* parent);
 
-  ProcessFunctionLibraryRuntime(const DeviceMgr* device_mgr, Env* env,
-                                int graph_def_version,
-                                const FunctionLibraryDefinition* lib_def,
-                                const OptimizerOptions& optimizer_options);
-
-  ProcessFunctionLibraryRuntime(const DeviceMgr* device_mgr, Env* env,
-                                int graph_def_version,
-                                const FunctionLibraryDefinition* lib_def,
-                                const OptimizerOptions& optimizer_options,
-                                CustomKernelCreator custom_kernel_creator);
-
-  // Given a list of attrs on a function, extracts the "_target" attribute which
-  // indicates which device to run the function on. If it can't find the _target
-  // attribute, returns "". Canonicalizes the device name.
-  static string ObtainFunctionTarget(const AttrSlice& attrs);
-
   // Sends `tensors_to_send` from `source_device` to `target_device` using
   // `rendezvous`. `key_prefix` is used as a prefix for the keys sent to the
   // Rendezvous. `device_context` should be the DeviceContext of the device
@@ -90,7 +75,7 @@ class ProcessFunctionLibraryRuntime {
 
   static const char kDefaultFLRDevice[];
   // Returns the FunctionLibraryRuntime for the corresponding device_name.
-  FunctionLibraryRuntime* GetFLR(const string& device_name);
+  FunctionLibraryRuntime* GetFLR(const string& device_name) const;
 
   // Returns the device incarnation for the given device_name.
   Status GetDeviceIncarnation(const string& device_name, int64* incarnation);
@@ -121,8 +106,15 @@ class ProcessFunctionLibraryRuntime {
   // Allows for function_name to be instantiated on different devices
   // as specified in attrs.
   Status Instantiate(const string& function_name, AttrSlice attrs,
+                     const FunctionLibraryRuntime::InstantiateOptions& options,
                      FunctionLibraryRuntime::Handle* handle);
 
+  // Delegates to the local FLR that owns state corresponding to `handle` and
+  // tells it to release it. If the `handle` isnt' needed at all, the local FLR
+  // might call RemoveHandle on this to get rid of the state owned by the Proc
+  // FLR.
+  Status ReleaseHandle(FunctionLibraryRuntime::Handle handle);
+
   // Runs the function with given `handle`. Function could have been
   // instantiated on any device. More details in framework/function.h
   void Run(const FunctionLibraryRuntime::Options& opts,
@@ -140,6 +132,15 @@ class ProcessFunctionLibraryRuntime {
   // of the device where the function is registered.
   string GetDeviceName(FunctionLibraryRuntime::Handle handle);
 
+  // Removes handle from the state owned by this object.
+  Status RemoveHandle(FunctionLibraryRuntime::Handle handle);
+
+  Status Clone(Env* env, int graph_def_version,
+               const OptimizerOptions& optimizer_options,
+               CustomKernelCreator custom_kernel_creator,
+               std::unique_ptr<FunctionLibraryDefinition>* out_lib_def,
+               std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr);
+
   friend class FunctionLibraryRuntimeImpl;
 
   mutable mutex mu_;
@@ -151,6 +152,7 @@ class ProcessFunctionLibraryRuntime {
     FunctionData(const string& target_device,
                  FunctionLibraryRuntime::LocalHandle local_handle)
         : target_device(target_device), local_handle(local_handle) {}
+    FunctionData() : FunctionData("", -1) {}
   };
 
   const DeviceMgr* const device_mgr_;
@@ -158,11 +160,13 @@ class ProcessFunctionLibraryRuntime {
   // Holds all the function invocations here.
   std::unordered_map<string, FunctionLibraryRuntime::Handle> table_
       GUARDED_BY(mu_);
-  std::vector<FunctionData> function_data_ GUARDED_BY(mu_);
+  std::unordered_map<FunctionLibraryRuntime::Handle, FunctionData>
+      function_data_ GUARDED_BY(mu_);
   std::unordered_map<Device*, std::unique_ptr<FunctionLibraryRuntime>> flr_map_;
+  int next_handle_ GUARDED_BY(mu_);
   DistributedFunctionLibraryRuntime* const parent_;
 };
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_FUNCTION_LIBRARY_RUNTIME_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_FUNCTION_LIBRARY_RUNTIME_H_
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
index 6bc8f980c7ab508f80a7c85a8e557880b8a4ab58..439ba1ce965ebe4addb525cd3d17d794feaecd1f 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
@@ -49,10 +49,12 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
   }
 
   Status Run(const string& name, FunctionLibraryRuntime::Options opts,
-             test::function::Attrs attrs, const std::vector<Tensor>& args,
-             std::vector<Tensor*> rets) {
+             test::function::Attrs attrs,
+             const FunctionLibraryRuntime::InstantiateOptions& instantiate_opts,
+             const std::vector<Tensor>& args, std::vector<Tensor*> rets) {
     FunctionLibraryRuntime::Handle handle;
-    Status status = proc_flr_->Instantiate(name, attrs, &handle);
+    Status status =
+        proc_flr_->Instantiate(name, attrs, instantiate_opts, &handle);
     if (!status.ok()) {
       return status;
     }
@@ -82,6 +84,22 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
 
     EXPECT_GE(call_count, 1);  // Test runner is used.
 
+    // Release the handle and then try running the function. It shouldn't
+    // succeed.
+    status = proc_flr_->ReleaseHandle(handle);
+    if (!status.ok()) {
+      return status;
+    }
+    Notification done2;
+    proc_flr_->Run(opts, handle, args, &out,
+                   [&status, &done2](const Status& s) {
+                     status = s;
+                     done2.Notify();
+                   });
+    done2.WaitForNotification();
+    EXPECT_TRUE(errors::IsNotFound(status));
+    EXPECT_TRUE(StringPiece(status.error_message()).contains("not found."));
+
     return Status::OK();
   }
 
@@ -126,21 +144,6 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, Basic) {
   rendezvous_->Unref();
 }
 
-TEST_F(ProcessFunctionLibraryRuntimeTest, ObtainFunctionTarget) {
-  AttrSlice empty_attrs;
-  string target =
-      ProcessFunctionLibraryRuntime::ObtainFunctionTarget(empty_attrs);
-  EXPECT_EQ("", target);
-
-  AttrValueMap attr_values;
-  AttrValue v;
-  v.set_s("/job:a/replica:0/task:0/cpu:1");
-  AddAttr("_target", v, &attr_values);
-  AttrSlice attrs(&attr_values);
-  target = ProcessFunctionLibraryRuntime::ObtainFunctionTarget(attrs);
-  EXPECT_EQ("/job:a/replica:0/task:0/device:CPU:1", target);
-}
-
 TEST_F(ProcessFunctionLibraryRuntimeTest, GetDeviceIncarnation) {
   Init({});
   int64 incarnation;
@@ -160,12 +163,12 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, SingleCall) {
   opts.source_device = "/job:a/replica:0/task:0/cpu:0";
   opts.rendezvous = rendezvous_;
   opts.remote_execution = true;
+  FunctionLibraryRuntime::InstantiateOptions instantiate_opts;
+  instantiate_opts.target = "/job:a/replica:0/task:0/cpu:0";
   auto x = test::AsTensor<float>({1, 2, 3, 4});
   Tensor y;
   TF_CHECK_OK(
-      Run("XTimesTwo", opts,
-          {{"T", DT_FLOAT}, {"_target", "/job:a/replica:0/task:0/cpu:0"}}, {x},
-          {&y}));
+      Run("XTimesTwo", opts, {{"T", DT_FLOAT}}, instantiate_opts, {x}, {&y}));
   test::ExpectTensorEqual<float>(y, test::AsTensor<float>({2, 4, 6, 8}));
   rendezvous_->Unref();
 }
@@ -176,9 +179,10 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, SingleCallFindDevice) {
   opts.source_device = "/job:a/replica:0/task:0/cpu:0";
   opts.rendezvous = rendezvous_;
   opts.remote_execution = true;
+  FunctionLibraryRuntime::InstantiateOptions instantiate_opts;
+  instantiate_opts.target = "/job:a/replica:0/task:0/cpu:0";
   Tensor y;
-  TF_CHECK_OK(Run("FindDevice", opts,
-                  {{"_target", "/job:a/replica:0/task:0/cpu:0"}}, {}, {&y}));
+  TF_CHECK_OK(Run("FindDevice", opts, {}, instantiate_opts, {}, {&y}));
   test::ExpectTensorEqual<string>(
       y, test::AsTensor<string>({"/job:a/replica:0/task:0/device:CPU:0"},
                                 TensorShape({})));
@@ -192,16 +196,14 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, MultipleCallsSameDeviceXTimes) {
   opts.source_device = "/job:a/replica:0/task:0/cpu:0";
   opts.rendezvous = rendezvous_;
   opts.remote_execution = true;
+  FunctionLibraryRuntime::InstantiateOptions instantiate_opts;
+  instantiate_opts.target = "/job:a/replica:0/task:0/cpu:0";
   Tensor y;
   TF_CHECK_OK(
-      Run("XTimesTwo", opts,
-          {{"T", DT_FLOAT}, {"_target", "/job:a/replica:0/task:0/cpu:0"}}, {x},
-          {&y}));
+      Run("XTimesTwo", opts, {{"T", DT_FLOAT}}, instantiate_opts, {x}, {&y}));
   test::ExpectTensorEqual<float>(y, test::AsTensor<float>({2, 4, 6, 8}));
   TF_CHECK_OK(
-      Run("XTimesFour", opts,
-          {{"T", DT_FLOAT}, {"_target", "/job:a/replica:0/task:0/cpu:0"}}, {x},
-          {&y}));
+      Run("XTimesFour", opts, {{"T", DT_FLOAT}}, instantiate_opts, {x}, {&y}));
   test::ExpectTensorEqual<float>(y, test::AsTensor<float>({4, 8, 12, 16}));
   rendezvous_->Unref();
 }
@@ -212,14 +214,14 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, MultipleCallsSameDeviceFindDevice) {
   opts.source_device = "/job:a/replica:0/task:0/cpu:0";
   opts.rendezvous = rendezvous_;
   opts.remote_execution = true;
+  FunctionLibraryRuntime::InstantiateOptions instantiate_opts;
+  instantiate_opts.target = "/job:a/replica:0/task:0/cpu:1";
   Tensor y;
-  TF_CHECK_OK(Run("FindDevice", opts,
-                  {{"_target", "/job:a/replica:0/task:0/cpu:1"}}, {}, {&y}));
+  TF_CHECK_OK(Run("FindDevice", opts, {}, instantiate_opts, {}, {&y}));
   test::ExpectTensorEqual<string>(
       y, test::AsTensor<string>({"/job:a/replica:0/task:0/device:CPU:1"},
                                 TensorShape({})));
-  TF_CHECK_OK(Run("FindDevice", opts,
-                  {{"_target", "/job:a/replica:0/task:0/cpu:1"}}, {}, {&y}));
+  TF_CHECK_OK(Run("FindDevice", opts, {}, instantiate_opts, {}, {&y}));
   test::ExpectTensorEqual<string>(
       y, test::AsTensor<string>({"/job:a/replica:0/task:0/device:CPU:1"},
                                 TensorShape({})));
@@ -233,11 +235,15 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, MultipleCallsDiffDeviceFindDevice) {
   opts.rendezvous = rendezvous_;
   opts.remote_execution = true;
   Tensor y;
-  TF_CHECK_OK(Run("FindDevice", opts, {{"_target", "/cpu:0"}}, {}, {&y}));
+  FunctionLibraryRuntime::InstantiateOptions instantiate_opts_0;
+  instantiate_opts_0.target = "/job:a/replica:0/task:0/device:CPU:0";
+  TF_CHECK_OK(Run("FindDevice", opts, {}, instantiate_opts_0, {}, {&y}));
   test::ExpectTensorEqual<string>(
       y, test::AsTensor<string>({"/job:a/replica:0/task:0/device:CPU:0"},
                                 TensorShape({})));
-  TF_CHECK_OK(Run("FindDevice", opts, {{"_target", "/cpu:1"}}, {}, {&y}));
+  FunctionLibraryRuntime::InstantiateOptions instantiate_opts_1;
+  instantiate_opts_1.target = "/job:a/replica:0/task:0/device:CPU:1";
+  TF_CHECK_OK(Run("FindDevice", opts, {}, instantiate_opts_1, {}, {&y}));
   test::ExpectTensorEqual<string>(
       y, test::AsTensor<string>({"/job:a/replica:0/task:0/device:CPU:1"},
                                 TensorShape({})));
diff --git a/tensorflow/core/common_runtime/profile_handler.h b/tensorflow/core/common_runtime/profile_handler.h
index 57c83c2e6f3c281c83c2596d3ca83dca221d5965..9d31b1aecbce210e8409db66aeb20a8e9245d9bc 100644
--- a/tensorflow/core/common_runtime/profile_handler.h
+++ b/tensorflow/core/common_runtime/profile_handler.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_PROFILE_HANDLER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_PROFILE_HANDLER_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PROFILE_HANDLER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_PROFILE_HANDLER_H_
 
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/graph/types.h"
@@ -80,4 +80,4 @@ class ProfileHandler {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_PROFILE_HANDLER_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_PROFILE_HANDLER_H_
diff --git a/tensorflow/core/common_runtime/renamed_device.h b/tensorflow/core/common_runtime/renamed_device.h
index c5c204d4faff8c5016cc0a48fec266b06409b668..fe4df1c106c5a86d4a9cdb73bafed7f4431e76a0 100644
--- a/tensorflow/core/common_runtime/renamed_device.h
+++ b/tensorflow/core/common_runtime/renamed_device.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_RENAMED_DEVICE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_RENAMED_DEVICE_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_RENAMED_DEVICE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_RENAMED_DEVICE_H_
 
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/util/device_name_utils.h"
@@ -134,4 +134,4 @@ class RenamedDevice : public Device {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_RENAMED_DEVICE_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_RENAMED_DEVICE_H_
diff --git a/tensorflow/core/common_runtime/rendezvous_util.cc b/tensorflow/core/common_runtime/rendezvous_util.cc
index a1e31016c2bc93aeae76175320255e0d43602265..92dc03812e9941e07500a9dc26baa7c1227430dc 100644
--- a/tensorflow/core/common_runtime/rendezvous_util.cc
+++ b/tensorflow/core/common_runtime/rendezvous_util.cc
@@ -32,6 +32,10 @@ Status SendTensorsToRendezvous(
         "; alloc_attrs.size() = ", alloc_attrs.size());
   }
 
+  if (!rendezvous) {
+    return errors::InvalidArgument("Rendezvous is null.");
+  }
+
   Rendezvous::ParsedKey parsed;
   for (int i = 0; i < keys.size(); ++i) {
     Rendezvous::Args rendez_args;
diff --git a/tensorflow/core/common_runtime/rendezvous_util.h b/tensorflow/core/common_runtime/rendezvous_util.h
index 3b6354603b2925dd7a1d2abe34308e9c8865f6bb..aad910f6d800f0043fba0fbad43801fd3b0ba914 100644
--- a/tensorflow/core/common_runtime/rendezvous_util.h
+++ b/tensorflow/core/common_runtime/rendezvous_util.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_RENDEZVOUS_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_RENDEZVOUS_UTIL_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_RENDEZVOUS_UTIL_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_RENDEZVOUS_UTIL_H_
 
 #include <map>
 
@@ -49,4 +49,4 @@ Status RecvOutputsFromRendezvous(Rendezvous* rendezvous, NamedTensors* out,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_RENDEZVOUS_UTIL_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_RENDEZVOUS_UTIL_H_
diff --git a/tensorflow/core/common_runtime/session_factory.cc b/tensorflow/core/common_runtime/session_factory.cc
index 0234d4c37250d8ed3c645759dd17f94093e57df0..4dbe113e44ee0b7a6eba44ace3c1ff8daa17059f 100644
--- a/tensorflow/core/common_runtime/session_factory.cc
+++ b/tensorflow/core/common_runtime/session_factory.cc
@@ -60,8 +60,8 @@ const string RegisteredFactoriesErrorMessageLocked() {
                          str_util::Join(factory_types, ", "), "}.");
 }
 string SessionOptionsToString(const SessionOptions& options) {
-  return strings::StrCat("target: \"", options.target, "\" config: ",
-                         ProtoShortDebugString(options.config));
+  return strings::StrCat("target: \"", options.target,
+                         "\" config: ", ProtoShortDebugString(options.config));
 }
 }  // namespace
 
diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc
index c82d57694a72f1f66381f957e8654ad1b13db21c..45cdab98e0642a3fbfee3dfa415696b98251600a 100644
--- a/tensorflow/core/common_runtime/shape_refiner.cc
+++ b/tensorflow/core/common_runtime/shape_refiner.cc
@@ -127,7 +127,7 @@ Status InferShapesForFunctionSubNode(const Node* node, ShapeRefiner* refiner,
 //
 // NOTE: Recursive user-defined functions are not supported.
 // Maybe we won't support recursive functions at all in TF, because of
-// other maintanabilty issues.
+// other maintainability issues.
 Status ShapeRefiner::InferShapesForFunction(
     const tensorflow::FunctionDef* function_def, bool keep_nested_shapes,
     ExtendedInferenceContext* outer_context) {
@@ -558,6 +558,13 @@ Status ShapeRefiner::ExtractConstantSubgraph(
     return Status::OK();
   }
 
+  if (target_node->type_string() == "PlaceholderWithDefault") {
+    return Status::OK();
+  }
+
+  // TODO(skyewm): more of the filtering applied in input nodes below should be
+  // applied to target_node here
+
   struct NodeAndRecursed {
     Node* new_node = nullptr;
     bool recursed = false;
@@ -608,6 +615,14 @@ Status ShapeRefiner::ExtractConstantSubgraph(
       return Status::OK();
     }
 
+    // Placeholders should never be constant folded because their outputs are
+    // fed by the user. Note that "Placeholder" nodes have no inputs so are
+    // handled below.
+    if (current_node->type_string() == "PlaceholderWithDefault") {
+      *is_constant_graph = false;
+      return Status::OK();
+    }
+
     // If there is nothing more to recurse down, see if
     // the generator node is a constant.
     if (current_node->num_inputs() == 0) {
diff --git a/tensorflow/core/common_runtime/shape_refiner.h b/tensorflow/core/common_runtime/shape_refiner.h
index da42c30ce949dbc3a953d20d0ff3333b6ba1b1d5..75eb5bf0d2972e6bccdd9c2c265f3494821210cc 100644
--- a/tensorflow/core/common_runtime/shape_refiner.h
+++ b/tensorflow/core/common_runtime/shape_refiner.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_SHAPE_REFINER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_SHAPE_REFINER_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SHAPE_REFINER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_SHAPE_REFINER_H_
 
 #include <vector>
 
@@ -303,4 +303,4 @@ class ShapeRefiner {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_SHAPE_REFINER_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_SHAPE_REFINER_H_
diff --git a/tensorflow/core/common_runtime/shape_refiner_test.cc b/tensorflow/core/common_runtime/shape_refiner_test.cc
index e4eef1dbe28bc79d2838b90ba6595a04ad1e4e2e..adf5a9afff2ebc6848db8811506ebd4a031df2bb 100644
--- a/tensorflow/core/common_runtime/shape_refiner_test.cc
+++ b/tensorflow/core/common_runtime/shape_refiner_test.cc
@@ -724,6 +724,25 @@ TEST_F(ShapeRefinerTest, PropagateRange) {
   EXPECT_EQ("[1,4,7,10]", ctx->DebugString(ctx->output(0)));
 }
 
+// Make sure PlaceholderWithDefaults aren't treated as constants.
+TEST_F(ShapeRefinerTest, NoPropagatePlaceholderWithDefault) {
+  Scope root = Scope::NewRootScope();
+  auto constant = ops::Const<int>(root, 2);
+  auto placeholder =
+      ops::PlaceholderWithDefault(root, constant, PartialTensorShape());
+  Node* shape_data;
+  TF_ASSERT_OK(NodeBuilder("Test", "ShapeData")
+                   .Input(placeholder.node())
+                   .Finalize(root.graph(), &shape_data));
+
+  ShapeRefiner m(TF_GRAPH_DEF_VERSION, OpRegistry::Global());
+  TF_ASSERT_OK(m.AddNode(constant.node()));
+  TF_ASSERT_OK(m.AddNode(placeholder.node()));
+  TF_ASSERT_OK(m.AddNode(shape_data));
+  shape_inference::InferenceContext* ic = m.GetContext(shape_data);
+  EXPECT_EQ(ic->DebugString(ic->output(0)), "?");
+}
+
 TEST_F(ShapeRefinerTest, ConstantValueTwoInputsToSameNode) {
   Scope root = Scope::NewRootScope();
   // This node is used as two inputs to 'range'.
diff --git a/tensorflow/core/common_runtime/stats_publisher_interface.h b/tensorflow/core/common_runtime/stats_publisher_interface.h
index b285420798761d70822f94afd622afbd1c2b5e0e..f063ee5297deed168abf9807792a0342dcf5f963 100644
--- a/tensorflow/core/common_runtime/stats_publisher_interface.h
+++ b/tensorflow/core/common_runtime/stats_publisher_interface.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_STATS_PUBLISHER_INTERFACE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_STATS_PUBLISHER_INTERFACE_H_
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_STATS_PUBLISHER_INTERFACE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_STATS_PUBLISHER_INTERFACE_H_
 
 #include "tensorflow/core/common_runtime/build_graph_options.h"
 #include "tensorflow/core/common_runtime/profile_handler.h"
@@ -61,4 +61,4 @@ std::unique_ptr<StatsPublisherInterface> CreateNoOpStatsPublisher(
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_STATS_PUBLISHER_INTERFACE_H_
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_STATS_PUBLISHER_INTERFACE_H_
diff --git a/tensorflow/core/common_runtime/step_stats_collector.cc b/tensorflow/core/common_runtime/step_stats_collector.cc
index d7e01144c9ef3aa09ddd212947eafe48ccff555b..cb900db10af98496cfdfafa5a38296bfdc4e996b 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.cc
+++ b/tensorflow/core/common_runtime/step_stats_collector.cc
@@ -226,22 +226,23 @@ void StepStatsCollector::BuildCostModel(
       if (node) {
         for (int i = 0; i < stats.output_size(); ++i) {
           const auto& output = stats.output(i);
-          cm->RecordMaxMemorySize(node, i, Bytes(output.tensor_description()
-                                                     .allocation_description()
-                                                     .allocated_bytes()),
+          cm->RecordMaxMemorySize(node, i,
+                                  Bytes(output.tensor_description()
+                                            .allocation_description()
+                                            .allocated_bytes()),
                                   stats.output(i).tensor_description().shape(),
                                   node->output_types()[i]);
-          cm->RecordAllocationId(node, i, output.tensor_description()
-                                              .allocation_description()
-                                              .allocation_id());
+          cm->RecordAllocationId(node, i,
+                                 output.tensor_description()
+                                     .allocation_description()
+                                     .allocation_id());
         }
         cm->RecordMemoryStats(node, stats.memory_stats());
         // Use hardware stats to record the execution time if they're available,
         // otherwise use the regular (less accurate) stats
         string node_name = dev_stats.regular_stats->node_stats(i).node_name();
-        if (dev_stats.hardware_stats &&
-            name_to_hw_node_stats.find(node_name) !=
-                name_to_hw_node_stats.end()) {
+        if (dev_stats.hardware_stats && name_to_hw_node_stats.find(node_name) !=
+                                            name_to_hw_node_stats.end()) {
           const NodeExecStats& hw_stats = name_to_hw_node_stats[node_name];
           cm->RecordMaxExecutionTime(
               node, Microseconds(hw_stats.op_end_rel_micros()));
diff --git a/tensorflow/core/common_runtime/sycl/sycl_allocator.cc b/tensorflow/core/common_runtime/sycl/sycl_allocator.cc
index 65b0db5bf610989f2b6187cb85f8c5e6ecae14d5..02bd8b8f3bc692728ce73176f6268d95f860dc9b 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_allocator.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_allocator.cc
@@ -71,9 +71,16 @@ void SYCLAllocator::GetStats(AllocatorStats* stats) {
   *stats = stats_;
 }
 
+void SYCLAllocator::ClearStats() override {
+  mutex_lock l(mu_);
+  stats_.num_allocs = 0;
+  stats_.max_bytes_in_use = stats_.bytes_in_use;
+  stats_.max_alloc_size = 0;
+}
+
 size_t SYCLAllocator::RequestedSize(void* ptr) {
   mutex_lock lock(mu_);
-  if(!sycl_device_) {
+  if (!sycl_device_) {
     return 0;
   }
   const auto& buffer = sycl_device_->get_sycl_buffer(ptr);
diff --git a/tensorflow/core/common_runtime/sycl/sycl_allocator.h b/tensorflow/core/common_runtime/sycl/sycl_allocator.h
index 3066e0e4426cbe6688307cf2ee9aae6b8c2e7d34..550f1933322420fc97da2bb588c719c73ea5ae4d 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_allocator.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_allocator.h
@@ -20,10 +20,10 @@ limitations under the License.
 #ifndef TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
 #define TENSORFLOW_COMMON_RUNTIME_SYCL_SYCL_ALLOCATOR_H_
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 
@@ -44,6 +44,8 @@ class SYCLAllocator : public Allocator {
   }
   bool Ok() { return sycl_device_ && sycl_device_->ok(); }
   void GetStats(AllocatorStats* stats) override;
+  void ClearStats() override;
+
   // The SYCL buffers keep track of their size, so we already have tracking.
   bool TracksAllocationSizes() override { return true; }
   // Get the size of the corresponding SYCL buffer.
@@ -54,14 +56,13 @@ class SYCLAllocator : public Allocator {
   // Clear the SYCL device used by the Allocator
   void ClearSYCLDevice() {
     mutex_lock lock(mu_);
-    if(sycl_device_) {
+    if (sycl_device_) {
       delete sycl_device_;
       sycl_device_ = nullptr;
     }
   }
 
  private:
-
   mutable mutex mu_;
   Eigen::SyclDevice* sycl_device_ GUARDED_BY(mu_);  // owned
   AllocatorStats stats_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.h b/tensorflow/core/common_runtime/sycl/sycl_device.h
index cc272d156ef67a4f4f93f35603ffe301d154932a..7c09e0b8f194c7dc8a594aa487ec62e00d5b5e39 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_device.h
@@ -187,9 +187,9 @@ class GSYCLInterface {
       type = "Unknown";
     }
 
-    return strings::StrCat("id: ", device_id, ", type: ", type, ", name: ",
-                           name.c_str(), ", vendor: ", vendor.c_str(),
-                           ", profile: ", profile.c_str());
+    return strings::StrCat(
+        "id: ", device_id, ", type: ", type, ", name: ", name.c_str(),
+        ", vendor: ", vendor.c_str(), ", profile: ", profile.c_str());
   }
 };
 
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
index 19c14770dcad7a3ca045ccb4ff68189c943d8cff..14f7727659d91db2373a1ac8ad0e46258cc32fbe 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
+++ b/tensorflow/core/common_runtime/sycl/sycl_device_factory.cc
@@ -26,7 +26,6 @@ class SYCLDeviceFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions &options, const string &name_prefix,
                        std::vector<Device *> *devices) override {
-
     auto syclInterface = GSYCLInterface::instance();
 
     size_t n = 1;
@@ -37,13 +36,11 @@ class SYCLDeviceFactory : public DeviceFactory {
 
     for (int i = 0; i < n; i++) {
       string name = strings::StrCat(name_prefix, "/device:SYCL:", i);
-      devices->push_back(
-          new SYCLDevice(options, name, Bytes(256 << 20), DeviceLocality()
-                         , syclInterface->GetShortDeviceDescription(i)
-                         , syclInterface->GetSYCLAllocator(i)
-                         , syclInterface->GetCPUAllocator(i)
-                         , syclInterface->GetSYCLContext(i))
-                       );
+      devices->push_back(new SYCLDevice(
+          options, name, Bytes(256 << 20), DeviceLocality(),
+          syclInterface->GetShortDeviceDescription(i),
+          syclInterface->GetSYCLAllocator(i), syclInterface->GetCPUAllocator(i),
+          syclInterface->GetSYCLContext(i)));
     }
 
     return Status::OK();
@@ -51,6 +48,6 @@ class SYCLDeviceFactory : public DeviceFactory {
 };
 
 REGISTER_LOCAL_DEVICE_FACTORY("SYCL", SYCLDeviceFactory, 200);
-}
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/common_runtime/sycl/sycl_util.h b/tensorflow/core/common_runtime/sycl/sycl_util.h
index 83016b706a57033bfdaec932f763bc118434db90..3124ed23c92eb542e90e6c077fc703fb84b38a18 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_util.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_util.h
@@ -20,8 +20,8 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_
 #define TENSORFLOW_CORE_COMMON_RUNTIME_SYCL_SYCL_UTIL_H_
 
-#include "tensorflow/core/common_runtime/device.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/common_runtime/device.h"
 // For DMA helper
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/framework/tensor.h"
diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD
index 6d796768de0a33f0f16e6954474ac95b92ec29e4..a32badef6dfdb8b62662da880c99842b1cafd13c 100644
--- a/tensorflow/core/debug/BUILD
+++ b/tensorflow/core/debug/BUILD
@@ -56,6 +56,7 @@ tf_proto_library(
     cc_grpc_version = 1,
     protodeps = [
         ":debugger_event_metadata_proto",
+        "//tensorflow/core/profiler:protos_all",
     ] + tf_additional_all_protos(),
     visibility = ["//tensorflow:__subpackages__"],
 )
@@ -123,6 +124,7 @@ tf_cuda_library(
     deps = [
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:proto_text",
@@ -144,6 +146,7 @@ tf_cuda_library(
         ":debugger_event_metadata_proto_cc",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:proto_text",
diff --git a/tensorflow/core/debug/debug_gateway.cc b/tensorflow/core/debug/debug_gateway.cc
index 616ced3d0f3d9cfed683120e792b40eb9010fe06..2e1aabd1cc8066df6a5f7e6dd0aa27c6a16ef614 100644
--- a/tensorflow/core/debug/debug_gateway.cc
+++ b/tensorflow/core/debug/debug_gateway.cc
@@ -24,31 +24,31 @@ limitations under the License.
 namespace tensorflow {
 
 DebugGateway::DebugGateway(DirectSession* session) : session_(session) {
-  session_->node_outputs_callback_ = [this](
-      const string& node_name, const int output_slot, const Tensor* tensor,
-      const bool is_ref, OpKernelContext* ctx) {
-    if (comp_cb_ != nullptr && output_slot <= 0) {
-      // The node completion callback is invoked once for a node regardless
-      // of whether the node has zero, one or more outputs.
-      // The output_slot can be negative (-1, or kControlSlot) if
-      // node_outputs_callback_ is invoked for a node with no output. If that
-      // is the case, notify the callback that the node in question has no
-      // output.
-      comp_cb_(node_name, output_slot == 0);
-    }
-
-    // Copy tensor values (e.g., from GPU to host) only if the
-    // value callback is not nullptr.
-    if (val_cb_ != nullptr && output_slot >= 0) {
-      CopyTensor(
-          node_name, output_slot, tensor, ctx,
-          [this, node_name, output_slot, is_ref](const Tensor* copied_tensor) {
-            val_cb_(node_name, output_slot, *copied_tensor, is_ref);
-          });
-    }
-
-    return Status::OK();
-  };
+  session_->node_outputs_callback_ =
+      [this](const string& node_name, const int output_slot,
+             const Tensor* tensor, const bool is_ref, OpKernelContext* ctx) {
+        if (comp_cb_ != nullptr && output_slot <= 0) {
+          // The node completion callback is invoked once for a node regardless
+          // of whether the node has zero, one or more outputs.
+          // The output_slot can be negative (-1, or kControlSlot) if
+          // node_outputs_callback_ is invoked for a node with no output. If
+          // that is the case, notify the callback that the node in question has
+          // no output.
+          comp_cb_(node_name, output_slot == 0);
+        }
+
+        // Copy tensor values (e.g., from GPU to host) only if the
+        // value callback is not nullptr.
+        if (val_cb_ != nullptr && output_slot >= 0) {
+          CopyTensor(node_name, output_slot, tensor, ctx,
+                     [this, node_name, output_slot,
+                      is_ref](const Tensor* copied_tensor) {
+                       val_cb_(node_name, output_slot, *copied_tensor, is_ref);
+                     });
+        }
+
+        return Status::OK();
+      };
 }
 
 DebugGateway::~DebugGateway() {
@@ -86,7 +86,8 @@ void DebugGateway::CopyTensor(const string& node_name, const int output_slot,
     // Determine if the tensor is on device (GPU) or host (CPU).
     // The second part of the check is necessary because even an OpKernel on
     // may have output tensors allocated on CPU.
-    if ((device->name().find("GPU:") != string::npos || device->name().find("SYCL:") != string::npos) &&
+    if ((device->name().find("GPU:") != string::npos ||
+         device->name().find("SYCL:") != string::npos) &&
         !ctx->output_alloc_attr(output_slot).on_host()) {
       // GPU tensors: Copy it to host (CPU).
       DeviceContext* device_ctxt = ctx->op_device_context();
diff --git a/tensorflow/core/debug/debug_gateway_test.cc b/tensorflow/core/debug/debug_gateway_test.cc
index 57583349069a0b4deb137cb09564cdbb3909a4b0..b1bbd3f6980b16c13a1e5c9cd3a0f6c4bb8c1217 100644
--- a/tensorflow/core/debug/debug_gateway_test.cc
+++ b/tensorflow/core/debug/debug_gateway_test.cc
@@ -390,9 +390,9 @@ TEST_F(SessionDebugMinusAXTest,
   debug_gateway.SetNodeValueCallback(
       [this, &mu, &val_callback_count, &a_debug_identity_node_name,
        &x_debug_identity_node_name, &y_debug_identity_node_name,
-       &debug_identity_tensor_vals, &callbacks_done, &kConcurrentRuns](
-           const string& node_name, const int output_slot,
-           const Tensor& tensor_value, const bool is_ref) {
+       &debug_identity_tensor_vals, &callbacks_done,
+       &kConcurrentRuns](const string& node_name, const int output_slot,
+                         const Tensor& tensor_value, const bool is_ref) {
         mutex_lock l(mu);
 
         if (node_name == a_debug_identity_node_name && output_slot == 0) {
@@ -560,21 +560,21 @@ TEST_F(SessionDebugOutputSlotWithoutOutgoingEdgeTest,
   Notification callbacks_done;
 
   std::vector<Tensor> debug_identity_tensor_vals;
-  debug_gateway.SetNodeValueCallback([this, &mu, &callbacks_done,
-                                      &debug_identity_node_name,
-                                      &debug_identity_tensor_vals](
-      const string& node_name, const int output_slot,
-      const Tensor& tensor_value, const bool is_ref) {
-    mutex_lock l(mu);
+  debug_gateway.SetNodeValueCallback(
+      [this, &mu, &callbacks_done, &debug_identity_node_name,
+       &debug_identity_tensor_vals](
+          const string& node_name, const int output_slot,
+          const Tensor& tensor_value, const bool is_ref) {
+        mutex_lock l(mu);
 
-    if (node_name == debug_identity_node_name && output_slot == 0) {
-      debug_identity_tensor_vals.push_back(tensor_value);
+        if (node_name == debug_identity_node_name && output_slot == 0) {
+          debug_identity_tensor_vals.push_back(tensor_value);
 
-      if (!callbacks_done.HasBeenNotified()) {
-        callbacks_done.Notify();
-      }
-    }
-  });
+          if (!callbacks_done.HasBeenNotified()) {
+            callbacks_done.Notify();
+          }
+        }
+      });
 
   // Add DebugIdentity watch on c:0, which does not have an outgoing edge.
   RunOptions run_opts;
diff --git a/tensorflow/core/debug/debug_grpc_testlib.cc b/tensorflow/core/debug/debug_grpc_testlib.cc
index a312f789d8444360a0892faa4b3a0f9a0bdf7a32..f70931e926507c72287588da278a3b8d6bb19122 100644
--- a/tensorflow/core/debug/debug_grpc_testlib.cc
+++ b/tensorflow/core/debug/debug_grpc_testlib.cc
@@ -30,7 +30,7 @@ namespace test {
 
 ::grpc::Status TestEventListenerImpl::SendEvents(
     ::grpc::ServerContext* context,
-    ::grpc::ServerReaderWriter< ::tensorflow::EventReply, ::tensorflow::Event>*
+    ::grpc::ServerReaderWriter<::tensorflow::EventReply, ::tensorflow::Event>*
         stream) {
   Event event;
 
diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc
index f81445c20bd2ba56a6d7d3bb4ddefc71f5199784..baa8c08fdf1508cd599d4c9523b06954280a609d 100644
--- a/tensorflow/core/debug/debug_io_utils.cc
+++ b/tensorflow/core/debug/debug_io_utils.cc
@@ -574,8 +574,6 @@ Status DebugIO::CloseDebugURL(const string& debug_url) {
   }
 }
 
-static Status CloseDebugURL(const string& debug_url) { return Status::OK(); }
-
 Status DebugFileIO::DumpTensorToDir(const DebugNodeKey& debug_node_key,
                                     const Tensor& tensor,
                                     const uint64 wall_time_us,
diff --git a/tensorflow/core/debug/debug_io_utils_test.cc b/tensorflow/core/debug/debug_io_utils_test.cc
index 2f83c2415b831cc1a2b90d4e6a2046218e6fe5f6..0807a85b8b39cf8bf479227bd6b6bd581e2ba9b0 100644
--- a/tensorflow/core/debug/debug_io_utils_test.cc
+++ b/tensorflow/core/debug/debug_io_utils_test.cc
@@ -57,7 +57,8 @@ class DebugIOUtilsTest : public ::testing::Test {
 TEST_F(DebugIOUtilsTest, ConstructDebugNodeKey) {
   DebugNodeKey debug_node_key("/job:worker/replica:1/task:0/device:GPU:2",
                               "hidden_1/MatMul", 0, "DebugIdentity");
-  EXPECT_EQ("/job:worker/replica:1/task:0/device:GPU:2", debug_node_key.device_name);
+  EXPECT_EQ("/job:worker/replica:1/task:0/device:GPU:2",
+            debug_node_key.device_name);
   EXPECT_EQ("hidden_1/MatMul", debug_node_key.node_name);
   EXPECT_EQ(0, debug_node_key.output_slot);
   EXPECT_EQ("DebugIdentity", debug_node_key.debug_op);
diff --git a/tensorflow/core/debug/debug_service.proto b/tensorflow/core/debug/debug_service.proto
index 547c0576f08769f9e373a98231caf172a9312937..4bef74dfc5706b0033ff91b5e6cf09bb119d657d 100644
--- a/tensorflow/core/debug/debug_service.proto
+++ b/tensorflow/core/debug/debug_service.proto
@@ -18,6 +18,8 @@ syntax = "proto3";
 package tensorflow;
 
 import "tensorflow/core/framework/tensor.proto";
+import "tensorflow/core/profiler/tfprof_log.proto";
+import "tensorflow/core/protobuf/debug.proto";
 import "tensorflow/core/util/event.proto";
 
 // Reply message from EventListener to the client, i.e., to the source of the
@@ -46,6 +48,38 @@ message EventReply {
   // during debugging.
 }
 
+// Data on the traceback of a debugged call, e.g., a Session.run() call, or the
+// execution of an eager operation.
+message CallTraceback {
+  enum CallType {
+    UNSPECIFIED = 0;
+    GRAPH_EXECUTION = 1;
+    EAGER_EXECUTION = 2;
+  }
+
+  CallType call_type = 1;
+
+  // A key for the call. For example, for graph execution, this is a key
+  // consisting of the names of the fed and fetched tensors.
+  string call_key = 2;
+
+  // Traceback stack for the origin of the call event.
+  // For graph execution, this is the stack of the Session.run() call.
+  // For eager execution, this is the stack of the Python line that invokes
+  // the execution of the eager op.
+  tfprof.CodeDef origin_stack = 3;
+
+  // Keeps track of the mapping from integer IDs in `origin_stack` to actual
+  // string values (e.g., file paths, function names).
+  map<int64, string> origin_id_to_string = 4;
+
+  // Traceback for the graph (if any) involved in the call.
+  tfprof.OpLogProto graph_traceback = 5;
+
+  // Version of the graph in `graph_traceback` (if any).
+  int64 graph_version = 6;
+}
+
 // EventListener: Receives Event protos, e.g., from debugged TensorFlow
 // runtime(s).
 service EventListener {
@@ -57,4 +91,10 @@ service EventListener {
   //      ops that get executed immediately after the beginning of the graph
   //      execution.
   rpc SendEvents(stream Event) returns (stream EventReply);
+
+  // Send the tracebacks of a TensorFlow execution call.
+  rpc SendTracebacks(CallTraceback) returns (EventReply);
+
+  // Send a collection of source code files being debugged.
+  rpc SendSourceFiles(DebuggedSourceFiles) returns (EventReply);
 }
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index 29164bbffe0fa586d0ca7ac72ee94c4fbb91aa3e..9e152aa0823b67fceb7f103cc6e090f00870f88a 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -145,6 +145,7 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:worker_proto_cc",
     ],
 )
 
@@ -269,7 +270,6 @@ cc_library(
     hdrs = ["worker_cache_wrapper.h"],
     deps = [
         ":worker_cache",
-        ":worker_interface",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
     ],
@@ -335,6 +335,7 @@ cc_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:master_proto_cc",
@@ -372,6 +373,7 @@ cc_library(
     deps = [
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:tensorflow_opensource",
     ],
 )
@@ -415,6 +417,7 @@ cc_library(
         ":worker_env",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
@@ -554,3 +557,47 @@ tf_cuda_cc_test(
         "//tensorflow/core/kernels:array",
     ],
 )
+
+cc_library(
+    name = "request_id",
+    srcs = ["request_id.cc"],
+    hdrs = ["request_id.h"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_cc_test(
+    name = "request_id_test",
+    size = "small",
+    srcs = ["request_id_test.cc"],
+    deps = [
+        ":request_id",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "recent_request_ids",
+    srcs = ["recent_request_ids.cc"],
+    hdrs = ["recent_request_ids.h"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:worker_proto_cc",
+    ],
+)
+
+tf_cc_test(
+    name = "recent_request_ids_test",
+    size = "small",
+    srcs = ["recent_request_ids_test.cc"],
+    deps = [
+        ":recent_request_ids",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:worker_proto_cc",
+    ],
+)
diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
index 593fe0e363edc543a74572ed51128777e048a47d..3a8d5912369525253904bd700dfdc6e3eb26e0ae 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
@@ -26,10 +26,10 @@ namespace tensorflow {
 
 /* static */
 Status ClusterFunctionLibraryRuntime::ConstructFunctionGraph(
-    const OpDef& sig, AttrSlice attrs, GraphDef* g,
+    const OpDef& sig, AttrSlice attrs,
+    const FunctionLibraryRuntime::InstantiateOptions& options, GraphDef* g,
     std::vector<string>* send_keys, std::vector<string>* recv_keys) {
-  const string& target =
-      ProcessFunctionLibraryRuntime::ObtainFunctionTarget(attrs);
+  const string& target = options.target;
   // Construct recv nodes for each input argument.
   int i = 0;
   for (const auto& in : sig.input_arg()) {
@@ -105,6 +105,7 @@ Status ClusterFunctionLibraryRuntime::ConstructFunctionGraph(
         Rendezvous::CreateKey(target, 1 /* src_incarnation */, target,
                               out.name(), FrameAndIter(0, 0));
     recv_keys->push_back(key);
+    ++i;
   }
   return Status::OK();
 }
@@ -118,14 +119,17 @@ ClusterFunctionLibraryRuntime::~ClusterFunctionLibraryRuntime() {
 
 Status ClusterFunctionLibraryRuntime::Instantiate(
     const string& function_name, const FunctionLibraryDefinition& lib_def,
-    AttrSlice attrs, FunctionLibraryRuntime::LocalHandle* handle) {
-  const string& target =
-      ProcessFunctionLibraryRuntime::ObtainFunctionTarget(attrs);
-  WorkerInterface* wi = worker_session_->worker_cache->CreateWorker(target);
+    AttrSlice attrs, const FunctionLibraryRuntime::InstantiateOptions& options,
+    FunctionLibraryRuntime::LocalHandle* handle) {
+  WorkerInterface* wi =
+      worker_session_->worker_cache->CreateWorker(options.target);
 
   if (wi == nullptr) {
-    return errors::InvalidArgument("Could not find worker with target: ",
-                                   target);
+    std::vector<string> workers;
+    worker_session_->worker_cache->ListWorkers(&workers);
+    return errors::InvalidArgument(
+        "Could not find worker with target: ", options.target,
+        " Available workers: ", str_util::Join(workers, ", "));
   }
 
   // Make RPC and obtain a graph handle.
@@ -133,8 +137,8 @@ Status ClusterFunctionLibraryRuntime::Instantiate(
   const OpDef& sig = fdef->signature();
   GraphDef gdef;
   std::vector<string> send_keys, recv_keys;
-  TF_RETURN_IF_ERROR(
-      ConstructFunctionGraph(sig, attrs, &gdef, &send_keys, &recv_keys));
+  TF_RETURN_IF_ERROR(ConstructFunctionGraph(sig, attrs, options, &gdef,
+                                            &send_keys, &recv_keys));
   *gdef.mutable_library() = lib_def.ToProto();
 
   RegisterGraphRequest req;
@@ -148,8 +152,8 @@ Status ClusterFunctionLibraryRuntime::Instantiate(
 
   mutex_lock l(mu_);
   *handle = function_data_.size();
-  function_data_.push_back(
-      FunctionData(resp.graph_handle(), target, wi, send_keys, recv_keys));
+  function_data_.push_back(FunctionData(resp.graph_handle(), options.target, wi,
+                                        send_keys, recv_keys));
   return Status::OK();
 }
 
diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h
index dd4ea68f57121d491f6352cbf13ed47b7ecf6bd4..d3ca350e3659ffa9f8248d2be80a1b1f0303addc 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CLUSTER_FUNCTION_LIBRARY_RUNTIME_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CLUSTER_FUNCTION_LIBRARY_RUNTIME_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CLUSTER_FUNCTION_LIBRARY_RUNTIME_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CLUSTER_FUNCTION_LIBRARY_RUNTIME_H_
 
 #include "tensorflow/core/distributed_runtime/worker_interface.h"
 #include "tensorflow/core/distributed_runtime/worker_session.h"
@@ -34,6 +34,7 @@ class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime {
 
   Status Instantiate(const string& function_name,
                      const FunctionLibraryDefinition& lib_def, AttrSlice attrs,
+                     const FunctionLibraryRuntime::InstantiateOptions& options,
                      FunctionLibraryRuntime::LocalHandle* handle) override;
 
   void Run(const FunctionLibraryRuntime::Options& opts,
@@ -42,10 +43,10 @@ class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime {
            FunctionLibraryRuntime::DoneCallback done) override;
 
  private:
-  static Status ConstructFunctionGraph(const OpDef& sig, AttrSlice attrs,
-                                       GraphDef* g,
-                                       std::vector<string>* send_keys,
-                                       std::vector<string>* recv_keys);
+  static Status ConstructFunctionGraph(
+      const OpDef& sig, AttrSlice attrs,
+      const FunctionLibraryRuntime::InstantiateOptions& options, GraphDef* g,
+      std::vector<string>* send_keys, std::vector<string>* recv_keys);
   friend class ClusterFunctionLibraryRuntimeTest;
 
   mutable mutex mu_;
@@ -73,4 +74,4 @@ class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CLUSTER_FUNCTION_LIBRARY_RUNTIME_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CLUSTER_FUNCTION_LIBRARY_RUNTIME_H_
diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc b/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
index 04587dd8ca8638d031d840b0b53b5168bdab63c2..1810996ab8c2a8e4901c007517ae276829b4fc2a 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
@@ -47,30 +47,31 @@ class ClusterFunctionLibraryRuntimeTest : public ::testing::Test {
         new ClusterFunctionLibraryRuntime(worker_session_.get()));
   }
 
-  Status ConstructFunctionGraphHelper(const OpDef& sig,
-                                      test::function::Attrs attrs, GraphDef* g,
-                                      std::vector<string>* send_keys,
-                                      std::vector<string>* recv_keys) {
+  Status ConstructFunctionGraphHelper(
+      const OpDef& sig, test::function::Attrs attrs,
+      const FunctionLibraryRuntime::InstantiateOptions& options, GraphDef* g,
+      std::vector<string>* send_keys, std::vector<string>* recv_keys) {
     return ClusterFunctionLibraryRuntime::ConstructFunctionGraph(
-        sig, attrs, g, send_keys, recv_keys);
+        sig, attrs, options, g, send_keys, recv_keys);
   }
 
   Status Instantiate(const string& function_name,
                      const FunctionLibraryDefinition& lib_def,
                      test::function::Attrs attrs,
+                     const FunctionLibraryRuntime::InstantiateOptions& options,
                      FunctionLibraryRuntime::LocalHandle* local_handle) {
-    return cluster_flr_->Instantiate(function_name, lib_def, attrs,
+    return cluster_flr_->Instantiate(function_name, lib_def, attrs, options,
                                      local_handle);
   }
 
-  Status InstantiateAndRun(const string& function_name,
-                           const FunctionLibraryDefinition& lib_def,
-                           test::function::Attrs attrs,
-                           const std::vector<Tensor>& args,
-                           std::vector<Tensor*> rets) {
+  Status InstantiateAndRun(
+      const string& function_name, const FunctionLibraryDefinition& lib_def,
+      test::function::Attrs attrs,
+      const FunctionLibraryRuntime::InstantiateOptions& options,
+      const std::vector<Tensor>& args, std::vector<Tensor*> rets) {
     FunctionLibraryRuntime::LocalHandle handle;
-    TF_RETURN_IF_ERROR(
-        cluster_flr_->Instantiate(function_name, lib_def, attrs, &handle));
+    TF_RETURN_IF_ERROR(cluster_flr_->Instantiate(function_name, lib_def, attrs,
+                                                 options, &handle));
 
     Notification done;
     FunctionLibraryRuntime::Options opts;
@@ -102,15 +103,15 @@ class ClusterFunctionLibraryRuntimeTest : public ::testing::Test {
 TEST_F(ClusterFunctionLibraryRuntimeTest, ConstructFunctionGraph) {
   GraphDef actual;
   std::vector<string> send_keys, recv_keys;
-  TF_CHECK_OK(ConstructFunctionGraphHelper(
-      test::function::XTimesTwo().signature(),
-      {{"T", DT_FLOAT}, {"_target", "/job:a/replica:0/task:0/cpu:0"}}, &actual,
-      &send_keys, &recv_keys));
-
+  FunctionLibraryRuntime::InstantiateOptions instantiate_opts;
+  instantiate_opts.target = "/job:a/replica:0/task:0/device:CPU:0";
+  TF_CHECK_OK(ConstructFunctionGraphHelper(test::function::Swap().signature(),
+                                           {{"T", DT_FLOAT}}, instantiate_opts,
+                                           &actual, &send_keys, &recv_keys));
   GraphDef expected;
   protobuf::TextFormat::ParseFromString(R"(
 node {
-  name: "_recv_x_0"
+  name: "_recv_i0_0"
   op: "_Recv"
   device: "/job:a/replica:0/task:0/device:CPU:0"
   attr {
@@ -140,7 +141,7 @@ node {
   attr {
     key: "tensor_name"
     value {
-      s: "x"
+      s: "i0"
     }
   }
   attr {
@@ -151,9 +152,51 @@ node {
   }
 }
 node {
-  name: "XTimesTwo"
-  op: "XTimesTwo"
-  input: "_recv_x_0"
+  name: "_recv_i1_1"
+  op: "_Recv"
+  device: "/job:a/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "client_terminated"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:a/replica:0/task:0/device:CPU:0"
+    }
+  }
+  attr {
+    key: "send_device"
+    value {
+      s: "/job:a/replica:0/task:0/device:CPU:0"
+    }
+  }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "i1"
+    }
+  }
+  attr {
+    key: "tensor_type"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "Swap"
+  op: "Swap"
+  input: "_recv_i0_0"
+  input: "_recv_i1_1"
   device: "/job:a/replica:0/task:0/device:CPU:0"
   attr {
     key: "T"
@@ -169,9 +212,51 @@ node {
   }
 }
 node {
-  name: "_send_y_0"
+  name: "_send_o0_0"
+  op: "_Send"
+  input: "Swap"
+  device: "/job:a/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "client_terminated"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:a/replica:0/task:0/device:CPU:0"
+    }
+  }
+  attr {
+    key: "send_device"
+    value {
+      s: "/job:a/replica:0/task:0/device:CPU:0"
+    }
+  }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "o0"
+    }
+  }
+}
+node {
+  name: "_send_o1_1"
   op: "_Send"
-  input: "XTimesTwo"
+  input: "Swap:1"
   device: "/job:a/replica:0/task:0/device:CPU:0"
   attr {
     key: "T"
@@ -206,10 +291,11 @@ node {
   attr {
     key: "tensor_name"
     value {
-      s: "y"
+      s: "o1"
     }
   }
-})",
+}
+)",
                                         &expected);
   TF_EXPECT_GRAPH_EQ(expected, actual);
 }
@@ -222,28 +308,30 @@ TEST_F(ClusterFunctionLibraryRuntimeTest, DISABLED_InstantiateAndRun) {
   FunctionDefLibrary proto;
   *(proto.add_function()) = test::function::XTimesTwoInt32();
   FunctionLibraryDefinition lib_def(OpRegistry::Global(), proto);
+  FunctionLibraryRuntime::InstantiateOptions instantiate_opts;
+  instantiate_opts.target = "/job:localhost/replica:0/task:1/cpu:0";
 
   Tensor y;
   auto x = test::AsTensor<int32>({1, 2, 3, 4});
-  TF_EXPECT_OK(InstantiateAndRun(
-      "XTimesTwoInt32", lib_def,
-      {{"_target", "/job:localhost/replica:0/task:1/cpu:0"}}, {x}, {&y}));
+  TF_EXPECT_OK(InstantiateAndRun("XTimesTwoInt32", lib_def, {},
+                                 instantiate_opts, {x}, {&y}));
   test::ExpectTensorEqual<int32>(y, test::AsTensor<int32>({2, 4, 6, 8}));
 }
 
 TEST_F(ClusterFunctionLibraryRuntimeTest,
        DISABLED_InstantiateAndRunAttrSubstitution) {
   FunctionDefLibrary proto;
-  *(proto.add_function()) = test::function::XTimesTwo();
+  *(proto.add_function()) = test::function::Swap();
   FunctionLibraryDefinition lib_def(OpRegistry::Global(), proto);
-
-  Tensor y;
-  auto x = test::AsTensor<float>({1, 2, 3, 4});
-  TF_EXPECT_OK(InstantiateAndRun(
-      "XTimesTwo", lib_def,
-      {{"T", DT_FLOAT}, {"_target", "/job:localhost/replica:0/task:1/cpu:0"}},
-      {x}, {&y}));
-  test::ExpectTensorEqual<float>(y, test::AsTensor<float>({2, 4, 6, 8}));
+  FunctionLibraryRuntime::InstantiateOptions instantiate_opts;
+  instantiate_opts.target = "/job:localhost/replica:0/task:1/cpu:0";
+  Tensor y1, y2;
+  auto x1 = test::AsTensor<float>({1, 2, 3, 4});
+  auto x2 = test::AsTensor<float>({4, 3, 2, 1});
+  TF_EXPECT_OK(InstantiateAndRun("Swap", lib_def, {{"T", DT_FLOAT}},
+                                 instantiate_opts, {x1, x2}, {&y1, &y2}));
+  test::ExpectTensorEqual<float>(y1, test::AsTensor<float>({4, 3, 2, 1}));
+  test::ExpectTensorEqual<float>(y2, test::AsTensor<float>({1, 2, 3, 4}));
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/executor_test.cc b/tensorflow/core/distributed_runtime/executor_test.cc
index 5b115f9a4d4ea3e9b99228918e16fc354d5a99fe..e34224205bac48a2dba1bf8cb07f9c623cd38281 100644
--- a/tensorflow/core/distributed_runtime/executor_test.cc
+++ b/tensorflow/core/distributed_runtime/executor_test.cc
@@ -57,7 +57,7 @@ class ExecutorTest : public ::testing::Test {
   }
 
   // Resets executor_ with a new executor based on a graph 'gdef'.
-  void Create(const Graph* graph) {
+  void Create(std::unique_ptr<const Graph> graph) {
     const int version = graph->versions().producer();
     LocalExecutorParams params;
     params.device = device_;
@@ -69,7 +69,7 @@ class ExecutorTest : public ::testing::Test {
       DeleteNonCachedKernel(kernel);
     };
     delete exec_;
-    TF_CHECK_OK(NewLocalExecutor(params, graph, &exec_));
+    TF_CHECK_OK(NewLocalExecutor(params, std::move(graph), &exec_));
     runner_ = [this](std::function<void()> fn) { thread_pool_->Schedule(fn); };
     rendez_ = NewLocalRendezvous();
   }
@@ -144,12 +144,12 @@ Rendezvous::ParsedKey Key(const string& sender, const uint64 incarnation,
 
 TEST_F(ExecutorTest, SimpleAdd) {
   // c = a + b
-  Graph* g = new Graph(OpRegistry::Global());
-  auto in0 = test::graph::Recv(g, "a", "float", ALICE, 1, BOB);
-  auto in1 = test::graph::Recv(g, "b", "float", ALICE, 1, BOB);
-  auto tmp = test::graph::Add(g, in0, in1);
-  test::graph::Send(g, tmp, "c", BOB, 1, ALICE);
-  Create(g);
+  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  auto in0 = test::graph::Recv(g.get(), "a", "float", ALICE, 1, BOB);
+  auto in1 = test::graph::Recv(g.get(), "b", "float", ALICE, 1, BOB);
+  auto tmp = test::graph::Add(g.get(), in0, in1);
+  test::graph::Send(g.get(), tmp, "c", BOB, 1, ALICE);
+  Create(std::move(g));
   Rendezvous::Args args;
   TF_ASSERT_OK(rendez_->Send(Key(ALICE, kIncarnation, BOB, "a"), args, V(1.0),
                              false));  // in0 = 1.0
@@ -172,15 +172,15 @@ TEST_F(ExecutorTest, SelfAdd) {
   //
   // b <- v10
   // All nodes are executed by one thread.
-  Graph* g = new Graph(OpRegistry::Global());
-  auto v = test::graph::Recv(g, "a", "float", ALICE, 1, BOB);
+  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  auto v = test::graph::Recv(g.get(), "a", "float", ALICE, 1, BOB);
   const int N = 10;
   for (int i = 1; i <= N; ++i) {
-    v = test::graph::Add(g, v, v);
+    v = test::graph::Add(g.get(), v, v);
   }
   // out <- v10
-  test::graph::Send(g, v, "b", BOB, 1, ALICE);
-  Create(g);
+  test::graph::Send(g.get(), v, "b", BOB, 1, ALICE);
+  Create(std::move(g));
   Rendezvous::Args args;
   // a = 1.0
   TF_ASSERT_OK(
@@ -229,9 +229,9 @@ void BuildTree(int N, Graph* g) {
 }
 
 TEST_F(ExecutorTest, RandomTree) {
-  Graph* g = new Graph(OpRegistry::Global());
-  BuildTree(4096, g);
-  Create(g);
+  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  BuildTree(4096, g.get());
+  Create(std::move(g));
   Rendezvous::Args args;
   TF_ASSERT_OK(
       rendez_->Send(Key(ALICE, kIncarnation, BOB, "a"), args, V(1.0), false));
@@ -262,9 +262,9 @@ void BuildConcurrentAddAssign(Graph* g) {
 
 #ifndef THREAD_SANITIZER
 TEST_F(ExecutorTest, ConcurrentAddAssign) {
-  Graph* g = new Graph(OpRegistry::Global());
-  BuildConcurrentAddAssign(g);
-  Create(g);
+  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  BuildConcurrentAddAssign(g.get());
+  Create(std::move(g));
   for (int iters = 0; iters < 16; ++iters) {
     Rendezvous* rendez = NewLocalRendezvous();
     TF_ASSERT_OK(Run(rendez));
@@ -281,12 +281,12 @@ TEST_F(ExecutorTest, ConcurrentAddAssign) {
 #endif
 
 TEST_F(ExecutorTest, SimpleSwitchLive) {
-  Graph* g = new Graph(OpRegistry::Global());
-  auto in0 = test::graph::Recv(g, "a", "float", ALICE, 1, BOB);
-  auto in1 = test::graph::Constant(g, VB(false));
-  auto tmp = test::graph::Switch(g, in0, in1);
-  test::graph::Send(g, tmp, "c", BOB, 1, ALICE);
-  Create(g);
+  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  auto in0 = test::graph::Recv(g.get(), "a", "float", ALICE, 1, BOB);
+  auto in1 = test::graph::Constant(g.get(), VB(false));
+  auto tmp = test::graph::Switch(g.get(), in0, in1);
+  test::graph::Send(g.get(), tmp, "c", BOB, 1, ALICE);
+  Create(std::move(g));
   Rendezvous::Args args;
   TF_ASSERT_OK(rendez_->Send(Key(ALICE, kIncarnation, BOB, "a"), args, V(1.0),
                              false));  // in0 = 1.0
@@ -300,12 +300,12 @@ TEST_F(ExecutorTest, SimpleSwitchLive) {
 }
 
 TEST_F(ExecutorTest, SimpleSwitchDead) {
-  Graph* g = new Graph(OpRegistry::Global());
-  auto in0 = test::graph::Recv(g, "a", "float", ALICE, 1, BOB);
-  auto in1 = test::graph::Constant(g, VB(true));
-  auto tmp = test::graph::Switch(g, in0, in1);
-  test::graph::Send(g, tmp, "c", BOB, 1, ALICE);
-  Create(g);
+  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  auto in0 = test::graph::Recv(g.get(), "a", "float", ALICE, 1, BOB);
+  auto in1 = test::graph::Constant(g.get(), VB(true));
+  auto tmp = test::graph::Switch(g.get(), in0, in1);
+  test::graph::Send(g.get(), tmp, "c", BOB, 1, ALICE);
+  Create(std::move(g));
   Rendezvous::Args args;
   TF_ASSERT_OK(rendez_->Send(Key(ALICE, kIncarnation, BOB, "a"), args, V(1.0),
                              false));  // in0 = 1.0
@@ -319,16 +319,16 @@ TEST_F(ExecutorTest, SimpleSwitchDead) {
 
 TEST_F(ExecutorTest, Abort) {
   // e = a + b + c + d
-  Graph* g = new Graph(OpRegistry::Global());
-  auto in0 = test::graph::Recv(g, "a", "float", ALICE, 1, BOB);
-  auto in1 = test::graph::Recv(g, "b", "float", ALICE, 1, BOB);
-  auto in2 = test::graph::Recv(g, "c", "float", ALICE, 1, BOB);
-  auto in3 = test::graph::Recv(g, "d", "float", ALICE, 1, BOB);
-  auto add0 = test::graph::Add(g, in0, in1);
-  auto add1 = test::graph::Add(g, in2, in3);
-  auto add2 = test::graph::Add(g, add0, add1);
-  test::graph::Send(g, add2, "e", BOB, 1, ALICE);
-  Create(g);
+  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  auto in0 = test::graph::Recv(g.get(), "a", "float", ALICE, 1, BOB);
+  auto in1 = test::graph::Recv(g.get(), "b", "float", ALICE, 1, BOB);
+  auto in2 = test::graph::Recv(g.get(), "c", "float", ALICE, 1, BOB);
+  auto in3 = test::graph::Recv(g.get(), "d", "float", ALICE, 1, BOB);
+  auto add0 = test::graph::Add(g.get(), in0, in1);
+  auto add1 = test::graph::Add(g.get(), in2, in3);
+  auto add2 = test::graph::Add(g.get(), add0, add1);
+  test::graph::Send(g.get(), add2, "e", BOB, 1, ALICE);
+  Create(std::move(g));
 
   // Needs 4 inputs (recv). One of them is aborted.
   rendez_->Ref();
@@ -371,17 +371,17 @@ TEST_F(ExecutorTest, Abort) {
 }
 
 TEST_F(ExecutorTest, RecvInvalidDtype) {
-  Graph* g = new Graph(OpRegistry::Global());
+  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
   // An input vector of type float of size 1.
-  auto one = test::graph::Recv(g, "one", "float", ALICE, 1, BOB);
+  auto one = test::graph::Recv(g.get(), "one", "float", ALICE, 1, BOB);
   // A floating point variable vector of size 1.
-  auto var = test::graph::Var(g, DT_FLOAT, TensorShape({1}));
+  auto var = test::graph::Var(g.get(), DT_FLOAT, TensorShape({1}));
   // Initialize the variable with input.
-  auto init = test::graph::Assign(g, var, one);
+  auto init = test::graph::Assign(g.get(), var, one);
   // Output
-  auto* two = test::graph::Send(g, var, "two", BOB, 1, ALICE);
+  auto* two = test::graph::Send(g.get(), var, "two", BOB, 1, ALICE);
   g->AddControlEdge(init, two);  // Ensures run after init.
-  Create(g);
+  Create(std::move(g));
   Rendezvous* rendez = NewLocalRendezvous();
   // Send a double instead of float.
   TF_ASSERT_OK(rendez->Send(Key(ALICE, 1, BOB, "one"), Rendezvous::Args(),
@@ -396,11 +396,11 @@ TEST_F(ExecutorTest, RecvInvalidDtype) {
 }
 
 TEST_F(ExecutorTest, RecvInvalidRefDtype) {
-  Graph* g = new Graph(OpRegistry::Global());
+  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
   // A var that always produces as invalid dtype.
-  auto var = test::graph::InvalidRefType(g, DT_FLOAT, DT_DOUBLE);
-  test::graph::Send(g, var, "out", BOB, 1, ALICE);
-  Create(g);
+  auto var = test::graph::InvalidRefType(g.get(), DT_FLOAT, DT_DOUBLE);
+  test::graph::Send(g.get(), var, "out", BOB, 1, ALICE);
+  Create(std::move(g));
   Rendezvous* rendez = NewLocalRendezvous();
   EXPECT_TRUE(errors::IsInternal(Run(rendez)));
   Tensor output;
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.cc b/tensorflow/core/distributed_runtime/graph_mgr.cc
index 60d58af61dad56fbb09df041fb5ca1429fd451ad..7878ebb5f06db0f64e9216250da2a79352274ab3 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.cc
+++ b/tensorflow/core/distributed_runtime/graph_mgr.cc
@@ -228,8 +228,14 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef,
     params.function_library = lib;
     params.create_kernel = [session, lib, opseg](const NodeDef& ndef,
                                                  OpKernel** kernel) {
-      // Caches the kernel only if the node is stateful.
-      if (!lib->IsStateful(ndef.op())) {
+      // We do not share the kernel via the OpSegment if the node is
+      // stateless, or a function.
+      // NOTE(mrry): We must not share function kernels (implemented
+      // using `CallOp`) between subgraphs, because `CallOp::handle_`
+      // is tied to a particular subgraph. Even if the function itself
+      // is stateful, the `CallOp` that invokes it is not.
+      if (!lib->IsStateful(ndef.op()) ||
+          lib->GetFunctionLibraryDefinition()->Find(ndef.op()) != nullptr) {
         return lib->CreateKernel(ndef, kernel);
       }
       auto create_fn = [lib, &ndef](OpKernel** kernel) {
@@ -265,7 +271,7 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef,
       skip_cost_models_ = false;
     }
     TF_RETURN_IF_ERROR(
-        NewLocalExecutor(params, subgraph.release(), &unit->root));
+        NewLocalExecutor(params, std::move(subgraph), &unit->root));
   }
   return Status::OK();
 }
@@ -475,8 +481,18 @@ void GraphMgr::StartParallelExecutors(const string& handle, int64 step_id,
   using std::placeholders::_1;
   // Line below is equivalent to this code, but does one less indirect call:
   //  args.runner = [pool](std::function<void()> fn) { pool->Schedule(fn); };
-  args.runner = std::bind(&thread::ThreadPool::Schedule, pool, _1);
+  auto default_runner = std::bind(&thread::ThreadPool::Schedule, pool, _1);
   for (const auto& unit : item->units) {
+    // TODO(zhengxq): if the device picks its own threadpool, we need to assign
+    //     less threads to the main compute pool by default.
+    thread::ThreadPool* device_thread_pool =
+        unit.device->tensorflow_device_thread_pool();
+    if (!device_thread_pool) {
+      args.runner = default_runner;
+    } else {
+      args.runner =
+          std::bind(&thread::ThreadPool::Schedule, device_thread_pool, _1);
+    }
     unit.root->RunAsync(args, barrier->Get());
   }
 }
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.h b/tensorflow/core/distributed_runtime/graph_mgr.h
index d0ca2a625778ff73c6d40492cc5d02ec81ef3cc6..cc35264b8fe0b6decc325dab793c6a5fe6ad097f 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.h
+++ b/tensorflow/core/distributed_runtime/graph_mgr.h
@@ -140,7 +140,7 @@ class GraphMgr {
     GraphMgr* graph_mgr;
   };
 
-  const WorkerEnv* worker_env_;             // Not owned.
+  const WorkerEnv* worker_env_;  // Not owned.
   DeviceMgr* device_mgr_;
 
   CostModelManager cost_model_manager_;
diff --git a/tensorflow/core/distributed_runtime/local_master.h b/tensorflow/core/distributed_runtime/local_master.h
index 5fc21d3a1e25faa5f6478914c69a3d513b50530c..c20b40329ab1712b3dd0cae673d337481ee40196 100644
--- a/tensorflow/core/distributed_runtime/local_master.h
+++ b/tensorflow/core/distributed_runtime/local_master.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_LOCAL_MASTER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_LOCAL_MASTER_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_LOCAL_MASTER_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_LOCAL_MASTER_H_
 
 #include <memory>
 
@@ -98,4 +98,4 @@ class LocalMaster : public MasterInterface {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_LOCAL_MASTER_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_LOCAL_MASTER_H_
diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc
index d1dc622ce79df1a98c3712e447a66bad3baecba1..1a488303ac73b8628b9d3fe4050ad9144724348e 100644
--- a/tensorflow/core/distributed_runtime/master.cc
+++ b/tensorflow/core/distributed_runtime/master.cc
@@ -528,8 +528,8 @@ void Master::ListDevices(const ListDevicesRequest* req,
       auto session = FindMasterSession(req->session_handle());
       if (session == nullptr) {
         done(errors::InvalidArgument(
-             "Session ", req->session_handle(),
-             " is not found. Possibly, this master has restarted."));
+            "Session ", req->session_handle(),
+            " is not found. Possibly, this master has restarted."));
         return;
       }
       core::ScopedUnref ref(session);
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 03b65d8cba9112e272f52518ca6050ce5f16eb5d..878a1398c9d382a4b2018712ca9f9e48c11a9345 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -446,7 +446,13 @@ class RunManyGraphs {
   // When the index-th call is done, updates the overall status.
   void WhenDone(int index, const Status& s) {
     TRACEPRINTF("Partition %d %s", index, s.ToString().c_str());
-    if (!s.ok()) {
+    auto resp = get(index)->resp.get();
+    if (resp->status_code() != error::Code::OK) {
+      // resp->status_code will only be non-OK if s.ok().
+      mutex_lock l(mu_);
+      UpdateStatusLocked(
+          Status(resp->status_code(), resp->status_error_message()));
+    } else if (!s.ok()) {
       mutex_lock l(mu_);
       UpdateStatusLocked(s);
     }
@@ -539,6 +545,7 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
     c->req->set_graph_handle(part.graph_handle);
     c->req->set_step_id(step_id);
     *c->req->mutable_exec_opts() = exec_opts;
+    c->req->set_store_errors_in_response_body(true);
     // If any feeds are provided, send the feed values together
     // in the RunGraph request.
     // In the partial case, we only want to include feeds provided in the req.
@@ -1441,6 +1448,7 @@ Status MasterSession::DoPartialRun(CallOptions* opts,
     const auto count = run_state->count;
     pss.collect_timeline =
         req.options().trace_level() == RunOptions::FULL_TRACE;
+    pss.collect_rpcs = req.options().trace_level() == RunOptions::FULL_TRACE;
     pss.report_tensor_allocations_upon_oom =
         req.options().report_tensor_allocations_upon_oom();
 
@@ -1603,6 +1611,7 @@ Status MasterSession::DoRunWithLocalExecution(
   TRACEPRINTF("stepid %llu", step_id);
 
   pss.collect_timeline = req.options().trace_level() == RunOptions::FULL_TRACE;
+  pss.collect_rpcs = req.options().trace_level() == RunOptions::FULL_TRACE;
   pss.report_tensor_allocations_upon_oom =
       req.options().report_tensor_allocations_upon_oom();
   // Build the cost model every 'build_cost_model_every' steps after skipping an
diff --git a/tensorflow/core/distributed_runtime/master_test.cc b/tensorflow/core/distributed_runtime/master_test.cc
index 121c58762f10a87fea059ce43b190f70e49e1f64..f2c1f3489c388d6a5fff729b1c8f98136532105c 100644
--- a/tensorflow/core/distributed_runtime/master_test.cc
+++ b/tensorflow/core/distributed_runtime/master_test.cc
@@ -61,7 +61,7 @@ class MasterTest : public ::testing::Test {
   // rpc calls.
 
   Status CreateSession(const GraphDef& def, string* handle,
-                            int64* initial_version) {
+                       int64* initial_version) {
     ::grpc::ClientContext ctx;
     CreateSessionRequest req;
     *(req.mutable_graph_def()) = def;
@@ -77,7 +77,7 @@ class MasterTest : public ::testing::Test {
   }
 
   Status ExtendSession(const string& handle, const GraphDef& def,
-                            int64 current_version, int64* new_version) {
+                       int64 current_version, int64* new_version) {
     ::grpc::ClientContext ctx;
     ExtendSessionRequest req;
     req.set_session_handle(handle);
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.cc b/tensorflow/core/distributed_runtime/message_wrappers.cc
index a4a88e6e3b9ec734c0720b715dc9b3e30850c0ae..66ebb3080af7cd01021362b5ea0c0b54458aebfc 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.cc
+++ b/tensorflow/core/distributed_runtime/message_wrappers.cc
@@ -93,6 +93,15 @@ const RunOptions& InMemoryRunStepRequest::options() const { return options_; }
 
 RunOptions* InMemoryRunStepRequest::mutable_options() { return &options_; }
 
+bool InMemoryRunStepRequest::store_errors_in_response_body() const {
+  return store_errors_in_response_body_;
+}
+
+void InMemoryRunStepRequest::set_store_errors_in_response_body(
+    bool store_errors) {
+  store_errors_in_response_body_ = store_errors;
+}
+
 string InMemoryRunStepRequest::DebugString() const {
   return ToProto().DebugString();
 }
@@ -192,6 +201,15 @@ RunOptions* MutableProtoRunStepRequest::mutable_options() {
   return request_.mutable_options();
 }
 
+bool MutableProtoRunStepRequest::store_errors_in_response_body() const {
+  return request_.store_errors_in_response_body();
+}
+
+void MutableProtoRunStepRequest::set_store_errors_in_response_body(
+    bool store_errors) {
+  request_.set_store_errors_in_response_body(store_errors);
+}
+
 string MutableProtoRunStepRequest::DebugString() const {
   return request_.DebugString();
 }
@@ -250,6 +268,10 @@ const RunOptions& ProtoRunStepRequest::options() const {
   return request_->options();
 }
 
+bool ProtoRunStepRequest::store_errors_in_response_body() const {
+  return request_->store_errors_in_response_body();
+}
+
 string ProtoRunStepRequest::DebugString() const {
   return request_->DebugString();
 }
@@ -329,6 +351,15 @@ void InMemoryRunGraphRequest::set_is_last_partial_run(
   is_last_partial_run_ = is_last_partial_run;
 }
 
+bool InMemoryRunGraphRequest::store_errors_in_response_body() const {
+  return store_errors_in_response_body_;
+}
+
+void InMemoryRunGraphRequest::set_store_errors_in_response_body(
+    bool store_errors) {
+  store_errors_in_response_body_ = store_errors;
+}
+
 const RunGraphRequest& InMemoryRunGraphRequest::ToProto() const {
   if (!proto_version_) {
     proto_version_.reset(new RunGraphRequest);
@@ -437,6 +468,15 @@ void MutableProtoRunGraphRequest::set_is_last_partial_run(
   request_.set_is_last_partial_run(is_last_partial_run);
 }
 
+bool MutableProtoRunGraphRequest::store_errors_in_response_body() const {
+  return request_.store_errors_in_response_body();
+}
+
+void MutableProtoRunGraphRequest::set_store_errors_in_response_body(
+    bool store_errors) {
+  request_.set_store_errors_in_response_body(store_errors);
+}
+
 const RunGraphRequest& MutableProtoRunGraphRequest::ToProto() const {
   return request_;
 }
@@ -486,6 +526,10 @@ bool ProtoRunGraphRequest::is_last_partial_run() const {
   return request_->is_last_partial_run();
 }
 
+bool ProtoRunGraphRequest::store_errors_in_response_body() const {
+  return request_->store_errors_in_response_body();
+}
+
 const RunGraphRequest& ProtoRunGraphRequest::ToProto() const {
   return *request_;
 }
@@ -518,6 +562,18 @@ CostGraphDef* InMemoryRunGraphResponse::mutable_cost_graph() {
   return &cost_graph_;
 }
 
+errors::Code InMemoryRunGraphResponse::status_code() const {
+  return status_.code();
+}
+
+const string& InMemoryRunGraphResponse::status_error_message() const {
+  return status_.error_message();
+}
+
+void InMemoryRunGraphResponse::set_status(const Status& status) {
+  status_ = status;
+}
+
 RunGraphResponse* InMemoryRunGraphResponse::get_proto() {
   LOG(FATAL) << "Cannot get a mutable protobuf for an InMemoryRunGraphResponse";
   return nullptr;
@@ -574,6 +630,19 @@ CostGraphDef* OwnedProtoRunGraphResponse::mutable_cost_graph() {
   return response_.mutable_cost_graph();
 }
 
+errors::Code OwnedProtoRunGraphResponse::status_code() const {
+  return response_.status_code();
+}
+
+const string& OwnedProtoRunGraphResponse::status_error_message() const {
+  return response_.status_error_message();
+}
+
+void OwnedProtoRunGraphResponse::set_status(const Status& status) {
+  response_.set_status_code(status.code());
+  response_.set_status_error_message(status.error_message());
+}
+
 RunGraphResponse* OwnedProtoRunGraphResponse::get_proto() { return &response_; }
 
 size_t OwnedProtoRunGraphResponse::num_partition_graphs() const {
@@ -632,6 +701,19 @@ CostGraphDef* NonOwnedProtoRunGraphResponse::mutable_cost_graph() {
   return response_->mutable_cost_graph();
 }
 
+errors::Code NonOwnedProtoRunGraphResponse::status_code() const {
+  return response_->status_code();
+}
+
+const string& NonOwnedProtoRunGraphResponse::status_error_message() const {
+  return response_->status_error_message();
+}
+
+void NonOwnedProtoRunGraphResponse::set_status(const Status& status) {
+  response_->set_status_code(status.code());
+  response_->set_status_error_message(status.error_message());
+}
+
 RunGraphResponse* NonOwnedProtoRunGraphResponse::get_proto() {
   return response_;
 }
@@ -678,6 +760,18 @@ Status InMemoryRunStepResponse::AddTensorFromRunGraphResponse(
 
 RunMetadata* InMemoryRunStepResponse::mutable_metadata() { return &metadata_; }
 
+errors::Code InMemoryRunStepResponse::status_code() const {
+  return status_.code();
+}
+
+const string& InMemoryRunStepResponse::status_error_message() const {
+  return status_.error_message();
+}
+
+void InMemoryRunStepResponse::set_status(const Status& status) {
+  status_ = status;
+}
+
 RunStepResponse* InMemoryRunStepResponse::get_proto() {
   LOG(FATAL) << "Cannot get a mutable protobuf for an InMemoryRunStepResponse";
   return nullptr;
@@ -716,6 +810,19 @@ RunMetadata* OwnedProtoRunStepResponse::mutable_metadata() {
   return response_.mutable_metadata();
 }
 
+errors::Code OwnedProtoRunStepResponse::status_code() const {
+  return response_.status_code();
+}
+
+const string& OwnedProtoRunStepResponse::status_error_message() const {
+  return response_.status_error_message();
+}
+
+void OwnedProtoRunStepResponse::set_status(const Status& status) {
+  response_.set_status_code(status.code());
+  response_.set_status_error_message(status.error_message());
+}
+
 RunStepResponse* OwnedProtoRunStepResponse::get_proto() { return &response_; }
 
 NonOwnedProtoRunStepResponse::NonOwnedProtoRunStepResponse(
@@ -755,6 +862,19 @@ RunMetadata* NonOwnedProtoRunStepResponse::mutable_metadata() {
   return response_->mutable_metadata();
 }
 
+errors::Code NonOwnedProtoRunStepResponse::status_code() const {
+  return response_->status_code();
+}
+
+const string& NonOwnedProtoRunStepResponse::status_error_message() const {
+  return response_->status_error_message();
+}
+
+void NonOwnedProtoRunStepResponse::set_status(const Status& status) {
+  response_->set_status_code(status.code());
+  response_->set_status_error_message(status.error_message());
+}
+
 RunStepResponse* NonOwnedProtoRunStepResponse::get_proto() { return response_; }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.h b/tensorflow/core/distributed_runtime/message_wrappers.h
index 0e3f5b98cb58bb76f599ca67938a420c9b3ffdce..79fa6f926ea6afb351eacf279d3cf493b6d4713f 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.h
+++ b/tensorflow/core/distributed_runtime/message_wrappers.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MESSAGE_WRAPPERS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MESSAGE_WRAPPERS_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MESSAGE_WRAPPERS_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_MESSAGE_WRAPPERS_H_
 
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/cost_graph.pb.h"
@@ -80,6 +80,13 @@ class RunStepRequestWrapper {
   // Options for the run call.
   virtual const RunOptions& options() const = 0;
 
+  // If true then some errors, e.g., execution errors that have long
+  // error messages, may return an OK RunStepResponse with the actual
+  // error saved in the status_code/status_error_message fields of the
+  // response body. This is a workaround since the RPC subsystem may
+  // truncate long metadata messages.
+  virtual bool store_errors_in_response_body() const = 0;
+
   // Returns a human-readable representation of this message for debugging.
   virtual string DebugString() const = 0;
 
@@ -98,6 +105,7 @@ class MutableRunStepRequestWrapper : public RunStepRequestWrapper {
   virtual void add_fetch(const string& name) = 0;
   virtual void add_target(const string& name) = 0;
   virtual RunOptions* mutable_options() = 0;
+  virtual void set_store_errors_in_response_body(bool store_errors) = 0;
 };
 
 // Specialized (and mutable) wrapper for RunStep requests between a client and
@@ -118,6 +126,7 @@ class InMemoryRunStepRequest : public MutableRunStepRequestWrapper {
   const RunOptions& options() const override;
   string DebugString() const override;
   const RunStepRequest& ToProto() const override;
+  bool store_errors_in_response_body() const override;
 
   // MutableRunStepRequestWrapper methods.
   void set_session_handle(const string& handle) override;
@@ -126,6 +135,7 @@ class InMemoryRunStepRequest : public MutableRunStepRequestWrapper {
   void add_fetch(const string& name) override;
   void add_target(const string& name) override;
   RunOptions* mutable_options() override;
+  void set_store_errors_in_response_body(bool store_errors) override;
 
  private:
   string session_handle_;
@@ -134,6 +144,7 @@ class InMemoryRunStepRequest : public MutableRunStepRequestWrapper {
   gtl::InlinedVector<string, 4> fetches_;
   gtl::InlinedVector<string, 4> targets_;
   RunOptions options_;
+  bool store_errors_in_response_body_ = false;
 
   // Holds a cached and owned representation of the proto
   // representation of this request, if needed, so that `ToProto()`
@@ -165,6 +176,7 @@ class MutableProtoRunStepRequest : public MutableRunStepRequestWrapper {
   const RunOptions& options() const override;
   string DebugString() const override;
   const RunStepRequest& ToProto() const override;
+  bool store_errors_in_response_body() const override;
 
   // MutableRunStepRequestWrapper methods.
   void set_session_handle(const string& handle) override;
@@ -173,6 +185,7 @@ class MutableProtoRunStepRequest : public MutableRunStepRequestWrapper {
   void add_fetch(const string& name) override;
   void add_target(const string& name) override;
   RunOptions* mutable_options() override;
+  void set_store_errors_in_response_body(bool store_errors) override;
 
  private:
   RunStepRequest request_;
@@ -202,6 +215,7 @@ class ProtoRunStepRequest : public RunStepRequestWrapper {
   const RunOptions& options() const override;
   string DebugString() const override;
   const RunStepRequest& ToProto() const override;
+  bool store_errors_in_response_body() const override;
 
  private:
   const RunStepRequest* const request_;  // Not owned.
@@ -262,6 +276,13 @@ class RunGraphRequestWrapper {
   // True if this is the last partial run request in a sequence of requests.
   virtual bool is_last_partial_run() const = 0;
 
+  // If true then some errors, e.g., execution errors that have long
+  // error messages, may return an OK RunStepResponse with the actual
+  // error saved in the status_code/status_error_message fields of the
+  // response body. This is a workaround since the RPC subsystem may
+  // truncate long metadata messages.
+  virtual bool store_errors_in_response_body() const = 0;
+
   // Returns the wrapped data as a protocol buffer message.
   virtual const RunGraphRequest& ToProto() const = 0;
 };
@@ -285,6 +306,7 @@ class MutableRunGraphRequestWrapper : public RunGraphRequestWrapper {
   virtual void add_recv_key(const string& recv_key) = 0;
   virtual void set_is_partial(bool is_partial) = 0;
   virtual void set_is_last_partial_run(bool is_last_partial_run) = 0;
+  virtual void set_store_errors_in_response_body(bool store_errors) = 0;
 };
 
 class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
@@ -302,6 +324,7 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
   bool is_partial() const override;
   bool is_last_partial_run() const override;
   const RunGraphRequest& ToProto() const override;
+  bool store_errors_in_response_body() const override;
 
   // MutableRunGraphRequestWrapper methods.
   void set_session_handle(const string& handle) override;
@@ -314,6 +337,7 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
   void add_recv_key(const string& recv_key) override;
   void set_is_partial(bool is_partial) override;
   void set_is_last_partial_run(bool is_last_partial_run) override;
+  void set_store_errors_in_response_body(bool store_errors) override;
 
  private:
   string session_handle_;
@@ -324,6 +348,7 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
   gtl::InlinedVector<string, 4> recvs_;
   bool is_partial_ = false;
   bool is_last_partial_run_ = false;
+  bool store_errors_in_response_body_ = false;
 
   // Holds a cached and owned representation of the proto
   // representation of this request, if needed, so that `ToProto()`
@@ -349,6 +374,7 @@ class MutableProtoRunGraphRequest : public MutableRunGraphRequestWrapper {
   const string& recv_key(size_t i) const override;
   bool is_partial() const override;
   bool is_last_partial_run() const override;
+  bool store_errors_in_response_body() const override;
   const RunGraphRequest& ToProto() const override;
 
   // MutableRunGraphRequestWrapper methods.
@@ -362,6 +388,7 @@ class MutableProtoRunGraphRequest : public MutableRunGraphRequestWrapper {
   void add_recv_key(const string& recv_key) override;
   void set_is_partial(bool is_partial) override;
   void set_is_last_partial_run(bool is_last_partial_run) override;
+  void set_store_errors_in_response_body(bool store_errors) override;
 
  private:
   RunGraphRequest request_;
@@ -383,6 +410,7 @@ class ProtoRunGraphRequest : public RunGraphRequestWrapper {
   const string& recv_key(size_t i) const override;
   bool is_partial() const override;
   bool is_last_partial_run() const override;
+  bool store_errors_in_response_body() const override;
   const RunGraphRequest& ToProto() const override;
 
  private:
@@ -429,6 +457,11 @@ class MutableRunGraphResponseWrapper {
   virtual GraphDef* mutable_partition_graph(size_t i) = 0;
   virtual void AddPartitionGraph(const GraphDef& partition_graph) = 0;
 
+  // Returned status if requested.
+  virtual errors::Code status_code() const = 0;
+  virtual const string& status_error_message() const = 0;
+  virtual void set_status(const Status& status) = 0;
+
  protected:
   // Returns a mutable protobuf message that represents the contents of
   // this wrapper, for passing to an RPC subsystem that will populate
@@ -458,6 +491,9 @@ class InMemoryRunGraphResponse : public MutableRunGraphResponseWrapper {
   size_t num_partition_graphs() const override;
   GraphDef* mutable_partition_graph(size_t i) override;
   void AddPartitionGraph(const GraphDef& partition_graph) override;
+  errors::Code status_code() const override;
+  const string& status_error_message() const override;
+  void set_status(const Status& status) override;
 
  protected:
   // NOTE: This method is not implemented. See
@@ -469,6 +505,9 @@ class InMemoryRunGraphResponse : public MutableRunGraphResponseWrapper {
   StepStats step_stats_;
   CostGraphDef cost_graph_;
   std::vector<GraphDef> partition_graphs_;
+  // Store the code and message separately so that they can be updated
+  // independently by setters.
+  Status status_;
 };
 
 // Proto-based message wrapper for use on the client side of the RunGraph RPC.
@@ -485,6 +524,9 @@ class OwnedProtoRunGraphResponse : public MutableRunGraphResponseWrapper {
   size_t num_partition_graphs() const override;
   GraphDef* mutable_partition_graph(size_t i) override;
   void AddPartitionGraph(const GraphDef& partition_graph) override;
+  errors::Code status_code() const override;
+  const string& status_error_message() const override;
+  void set_status(const Status& status) override;
 
  protected:
   RunGraphResponse* get_proto() override;
@@ -509,6 +551,9 @@ class NonOwnedProtoRunGraphResponse : public MutableRunGraphResponseWrapper {
   size_t num_partition_graphs() const override;
   GraphDef* mutable_partition_graph(size_t i) override;
   void AddPartitionGraph(const GraphDef& partition_graph) override;
+  errors::Code status_code() const override;
+  const string& status_error_message() const override;
+  void set_status(const Status& status) override;
 
  protected:
   RunGraphResponse* get_proto() override;
@@ -558,6 +603,11 @@ class MutableRunStepResponseWrapper {
   virtual const RunMetadata& metadata() const = 0;
   virtual RunMetadata* mutable_metadata() = 0;
 
+  // Returned status if requested.
+  virtual errors::Code status_code() const = 0;
+  virtual const string& status_error_message() const = 0;
+  virtual void set_status(const Status& status) = 0;
+
  protected:
   // Returns a mutable protobuf message that represents the contents of
   // this wrapper, for passing to an RPC subsystem that will populate
@@ -585,6 +635,9 @@ class InMemoryRunStepResponse : public MutableRunStepResponseWrapper {
       size_t i) override;
   const RunMetadata& metadata() const override;
   RunMetadata* mutable_metadata() override;
+  errors::Code status_code() const override;
+  const string& status_error_message() const override;
+  void set_status(const Status& status) override;
 
  protected:
   // NOTE: This method is not implemented. See
@@ -594,6 +647,9 @@ class InMemoryRunStepResponse : public MutableRunStepResponseWrapper {
  private:
   gtl::InlinedVector<std::pair<string, Tensor>, 4> tensors_;
   RunMetadata metadata_;
+  // Store the code and message separately so that they can be updated
+  // independently by setters.
+  Status status_;
 };
 
 // Proto-based message wrapper for use on the client side of the RunStep RPC.
@@ -608,6 +664,9 @@ class OwnedProtoRunStepResponse : public MutableRunStepResponseWrapper {
       size_t i) override;
   const RunMetadata& metadata() const override;
   RunMetadata* mutable_metadata() override;
+  errors::Code status_code() const override;
+  const string& status_error_message() const override;
+  void set_status(const Status& status) override;
 
  protected:
   RunStepResponse* get_proto() override;
@@ -630,6 +689,9 @@ class NonOwnedProtoRunStepResponse : public MutableRunStepResponseWrapper {
       size_t i) override;
   const RunMetadata& metadata() const override;
   RunMetadata* mutable_metadata() override;
+  errors::Code status_code() const override;
+  const string& status_error_message() const override;
+  void set_status(const Status& status) override;
 
  protected:
   RunStepResponse* get_proto() override;
@@ -640,4 +702,4 @@ class NonOwnedProtoRunStepResponse : public MutableRunStepResponseWrapper {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW
+#endif  // TENSORFLOW
diff --git a/tensorflow/core/distributed_runtime/partial_run_mgr.h b/tensorflow/core/distributed_runtime/partial_run_mgr.h
index af56e723a9a7e6710b06943c3806ca3690667810..e95f4da6c30b14b9766ef43bf8ef231a1db91ca8 100644
--- a/tensorflow/core/distributed_runtime/partial_run_mgr.h
+++ b/tensorflow/core/distributed_runtime/partial_run_mgr.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_PARTIAL_RUN_MGR_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_PARTIAL_RUN_MGR_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_PARTIAL_RUN_MGR_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_PARTIAL_RUN_MGR_H_
 
 #include <unordered_map>
 
@@ -84,4 +84,4 @@ class PartialRunMgr {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_PARTIAL_RUN_MGR_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_PARTIAL_RUN_MGR_H_
diff --git a/tensorflow/core/distributed_runtime/recent_request_ids.cc b/tensorflow/core/distributed_runtime/recent_request_ids.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c30879406c6924aa85ad4bf8279b278eaf5d29fd
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/recent_request_ids.cc
@@ -0,0 +1,57 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/recent_request_ids.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
+RecentRequestIds::RecentRequestIds(int num_tracked_request_ids)
+    : circular_buffer_(num_tracked_request_ids) {
+  set_.reserve(num_tracked_request_ids);
+}
+
+Status RecentRequestIds::TrackUnique(int64 request_id,
+                                     const string& method_name,
+                                     const protobuf::Message& request) {
+  mutex_lock l(mu_);
+  if (request_id == 0) {
+    // For backwards compatibility, allow all requests with request_id 0.
+    return Status::OK();
+  }
+  if (set_.count(request_id) > 0) {
+    // Note: RecentRequestIds is not strict LRU because we don't update
+    // request_id's age in the circular_buffer_ if it's tracked again. Strict
+    // LRU is not useful here because returning this error will close the
+    // current Session.
+    return errors::Aborted("The same ", method_name,
+                           " request was received twice. ",
+                           request.ShortDebugString());
+  }
+
+  // Remove the oldest request_id from the set_. circular_buffer_ is
+  // zero-initialized, and zero is never tracked, so it's safe to do this even
+  // when the buffer is not yet full.
+  set_.erase(circular_buffer_[next_index_]);
+  circular_buffer_[next_index_] = request_id;
+  set_.insert(request_id);
+  next_index_ = (next_index_ + 1) % circular_buffer_.size();
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/recent_request_ids.h b/tensorflow/core/distributed_runtime/recent_request_ids.h
new file mode 100644
index 0000000000000000000000000000000000000000..e8e45331dd5a26e2230bb92e8ce73888d3f28505
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/recent_request_ids.h
@@ -0,0 +1,72 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RECENT_REQUEST_IDS_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RECENT_REQUEST_IDS_H_
+
+#include <vector>
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/worker.pb.h"
+
+namespace tensorflow {
+
+// RecentRequestIds tracks recent 64-bit request_ids. When maximum capacity is
+// reached, the oldest request_id is evicted. Thread safe.
+//
+// Some RPCs like RecvTensor are unsafe to retry. For example, RecvTensor pairs
+// one sender and one receiver, and the receiver waits for the sender's tensor.
+// Retried RecvTensor requests are problematic, because the original RecvTensor
+// request may have consumed the sender's tensor, so a retried request might
+// block forever. RecentRequestIds identifies retried requests, so we can fail
+// them instead of blocking forever.
+//
+// Internally, recent request_ids are stored in two data structures: a set and a
+// circular buffer. The set is used for efficient lookups, and the circular
+// buffer tracks the oldest request_id. When the buffer is full, the new
+// request_id replaces the oldest request_id in the circular buffer, and the
+// oldest request_id is removed from the set.
+class RecentRequestIds {
+ public:
+  // num_tracked_request_ids should be much larger than the number of RPCs that
+  // can be received in a small time window. For example, we observed a peak RPC
+  // rate of ~700 RecvTensor RPC/s when training inception v3 on TPUs, so we
+  // currently set num_tracked_request_ids to 100,000 for RecvTensor.
+  RecentRequestIds(int num_tracked_request_ids);
+
+  // Returns OK iff request_id has not been seen in the last
+  // num_tracked_request_ids insertions. For backwards compatibility, this
+  // always returns OK for request_id 0. The method_name and the request's
+  // ShortDebugString are added to returned errors.
+  Status TrackUnique(int64 request_id, const string& method_name,
+                     const protobuf::Message& request);
+
+ private:
+  mutex mu_;
+  // next_index_ indexes into circular_buffer_, and points to the next storage
+  // space to use. When the buffer is full, next_index_ points at the oldest
+  // request_id.
+  int next_index_ GUARDED_BY(mu_) = 0;
+  std::vector<int64> circular_buffer_ GUARDED_BY(mu_);
+  gtl::FlatSet<int64> set_ GUARDED_BY(mu_);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RECENT_REQUEST_IDS_H_
diff --git a/tensorflow/core/distributed_runtime/recent_request_ids_test.cc b/tensorflow/core/distributed_runtime/recent_request_ids_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9a0facf5404bb4e6d0d57f55bcd1f2a4f4f99dba
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/recent_request_ids_test.cc
@@ -0,0 +1,96 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/recent_request_ids.h"
+
+#include <algorithm>
+
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/worker.pb.h"
+
+namespace tensorflow {
+
+Status TrackUnique(int64 request_id, RecentRequestIds* recent_request_ids) {
+  RecvTensorRequest request;
+  request.set_request_id(request_id);
+  return recent_request_ids->TrackUnique(request_id, "recent_request_ids_test",
+                                         request);
+}
+
+// request_id 0 is always valid.
+TEST(RecentRequestIds, Zero) {
+  RecentRequestIds recent_request_ids(1);
+  EXPECT_TRUE(TrackUnique(0, &recent_request_ids).ok());
+  EXPECT_TRUE(TrackUnique(0, &recent_request_ids).ok());
+  EXPECT_TRUE(TrackUnique(0, &recent_request_ids).ok());
+}
+
+TEST(RecentRequestIds, Unordered) {
+  // Capacity for 6 numbers.
+  RecentRequestIds recent_request_ids(6);
+
+  // Some unordered numbers to insert into request_id_set.
+  std::vector<int64> numbers = {53754,  23351,  164101, 7476,
+                                162432, 130761, 164102};
+
+  // Insert numbers[0..6) and check that all previously inserted numbers remain
+  // in the set.
+  for (int i = 0; i < 6; ++i) {
+    TF_EXPECT_OK(TrackUnique(numbers[i], &recent_request_ids));
+
+    for (int j = 0; j <= i; ++j) {
+      EXPECT_FALSE(TrackUnique(numbers[j], &recent_request_ids).ok())
+          << "i=" << i << " j=" << j;
+    }
+  }
+
+  // Insert numbers[6]. Inserting this 7th number should evict the first number
+  // from the set. The set should only contain numbers[1..7).
+  TF_EXPECT_OK(TrackUnique(numbers[6], &recent_request_ids));
+  for (int i = 1; i < 7; ++i) {
+    EXPECT_FALSE(TrackUnique(numbers[i], &recent_request_ids).ok())
+        << "i=" << i;
+  }
+
+  // Insert numbers[0] again. This should succeed because we just evicted it
+  // from the set.
+  TF_EXPECT_OK(TrackUnique(numbers[0], &recent_request_ids));
+}
+
+// Check that the oldest request_id is evicted.
+void TestOrdered(int num_request_ids) {
+  RecentRequestIds recent_request_ids(num_request_ids);
+
+  // Insert [1..101). The current number and the (num_request_ids - 1) preceding
+  // numbers should still be in the set.
+  for (int i = 1; i < 101; ++i) {
+    TF_EXPECT_OK(TrackUnique(i, &recent_request_ids));
+
+    for (int j = std::max(1, i - num_request_ids + 1); j <= i; ++j) {
+      EXPECT_FALSE(TrackUnique(j, &recent_request_ids).ok())
+          << "i=" << i << " j=" << j;
+    }
+  }
+}
+
+// Test eviction with various numbers of buckets.
+TEST(RecentRequestIds, Ordered2) { TestOrdered(2); }
+TEST(RecentRequestIds, Ordered3) { TestOrdered(3); }
+TEST(RecentRequestIds, Ordered4) { TestOrdered(4); }
+TEST(RecentRequestIds, Ordered5) { TestOrdered(5); }
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/request_id.cc b/tensorflow/core/distributed_runtime/request_id.cc
new file mode 100644
index 0000000000000000000000000000000000000000..230c6f9601355d4f6e904f4c3a762cd9d44f72c9
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/request_id.cc
@@ -0,0 +1,30 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/request_id.h"
+
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+int64 GetUniqueRequestId() {
+  int64 request_id = 0;
+  while (request_id == 0) {
+    request_id = random::New64();
+  }
+  return request_id;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/request_id.h b/tensorflow/core/distributed_runtime/request_id.h
new file mode 100644
index 0000000000000000000000000000000000000000..a882b69ab16bea32c0f0fae394a8cce5dc469d27
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/request_id.h
@@ -0,0 +1,31 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_REQUEST_ID_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_REQUEST_ID_H_
+
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+// Returns a request_id for use with RecentRequestIds. This number will not be
+// zero, and must be unique over RecentRequestIds' window of
+// num_tracked_request_ids. See recent_request_ids.h for more details.
+int64 GetUniqueRequestId();
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_REQUEST_ID_H_
diff --git a/tensorflow/core/distributed_runtime/request_id_test.cc b/tensorflow/core/distributed_runtime/request_id_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e0dc9d934723cfa5bea8ad3bf6377ab47bbe40a0
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/request_id_test.cc
@@ -0,0 +1,29 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/request_id.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+// Try requesting some request_ids and verify that none are zero.
+TEST(GetUniqueRequestId, Basic) {
+  for (int i = 0; i < 1000000; ++i) {
+    EXPECT_NE(GetUniqueRequestId(), 0);
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index 80640c806deedccbe15bdca3216e0c0d195045e1..dade26abc6a3c58f24c759ad863600a156985708 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -186,6 +186,7 @@ tf_cuda_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:worker_proto_cc",
         "//tensorflow/core/distributed_runtime:graph_mgr",
+        "//tensorflow/core/distributed_runtime:recent_request_ids",
         "//tensorflow/core/distributed_runtime:rendezvous_mgr_interface",
         "//tensorflow/core/distributed_runtime:worker",
         "//tensorflow/core/distributed_runtime:worker_cache",
@@ -270,6 +271,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/distributed_runtime:base_rendezvous_mgr",
+        "//tensorflow/core/distributed_runtime:request_id",
         "//tensorflow/core/distributed_runtime:tensor_coding",
         "//tensorflow/core/distributed_runtime:worker_cache",
         "//tensorflow/core/distributed_runtime:worker_env",
diff --git a/tensorflow/core/distributed_runtime/rpc/async_service_interface.h b/tensorflow/core/distributed_runtime/rpc/async_service_interface.h
index 63b0f2272d6aa711c8ce77f00b1f2619efafccc9..b2730a583b1252d8703495782e30caf8f5fa3a46 100644
--- a/tensorflow/core/distributed_runtime/rpc/async_service_interface.h
+++ b/tensorflow/core/distributed_runtime/rpc/async_service_interface.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_ASYNC_SERVICE_INTERFACE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_ASYNC_SERVICE_INTERFACE_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_ASYNC_SERVICE_INTERFACE_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_ASYNC_SERVICE_INTERFACE_H_
 
 namespace tensorflow {
 
@@ -38,4 +38,4 @@ class AsyncServiceInterface {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_ASYNC_SERVICE_INTERFACE_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_ASYNC_SERVICE_INTERFACE_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_call.h b/tensorflow/core/distributed_runtime/rpc/grpc_call.h
index 2ab0a40f333bf995a3847ef9bf35d1381512c16c..ecad1274cc14c7f03eddf6fbb806e886b0c7d0b2 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_call.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_call.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CALL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CALL_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CALL_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CALL_H_
 
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/platform/macros.h"
@@ -265,4 +265,4 @@ class Call : public UntypedCall<Service> {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CALL_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CALL_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
index 7efc0ba6d8510fb0d462df13f7b3ebf68e939313..613188244fcb196a2bca7307d536a652a0f7f551 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
@@ -157,7 +157,7 @@ class MultiGrpcChannelCache : public CachingGrpcChannelCache {
     }
   }
 
-  void ListWorkers(std::vector<string>* workers) const override {
+  void ListWorkers(std::vector<string>* workers) override {
     for (GrpcChannelCache* cache : caches_) {
       cache->ListWorkers(workers);
     }
@@ -216,7 +216,7 @@ class SparseGrpcChannelCache : public CachingGrpcChannelCache {
   }
   ~SparseGrpcChannelCache() override {}
 
-  void ListWorkers(std::vector<string>* workers) const override {
+  void ListWorkers(std::vector<string>* workers) override {
     workers->reserve(workers->size() + host_ports_.size());
     for (const auto& id_host_port : host_ports_) {
       workers->emplace_back(MakeAddress(job_id_, id_host_port.first));
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel.h b/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
index c662cde9be8998b8303b345403620ca920f3ca92..48b9d958aa921b0e758fc17a0f4da7c3a13e6c16 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CHANNEL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CHANNEL_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CHANNEL_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CHANNEL_H_
 
 #include <map>
 #include <memory>
@@ -65,7 +65,7 @@ class GrpcChannelCache {
   // was created to handle.  Worker names are in the format
   //  /job:<job identifier>/task:<task id>
   // e.g. /job:mnist/task:2
-  virtual void ListWorkers(std::vector<string>* workers) const = 0;
+  virtual void ListWorkers(std::vector<string>* workers) = 0;
 
   // If found, returns a gRPC channel that is connected to the remote
   // worker named by 'target'. 'target' is of the following
@@ -93,4 +93,4 @@ Status NewHostPortGrpcChannel(const string& target,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CHANNEL_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CHANNEL_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h b/tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h
index 95c2c935f091abc808a7fb0ee8446ced5e1d184b..d367b83ee7fac5001bd83737531689b64a7e3774 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CLIENT_CQ_TAG_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CLIENT_CQ_TAG_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CLIENT_CQ_TAG_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CLIENT_CQ_TAG_H_
 
 #include "grpc++/grpc++.h"
 
@@ -41,4 +41,4 @@ class GrpcClientCQTag {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CLIENT_CQ_TAG_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CLIENT_CQ_TAG_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
index 41ee81c01d6ebb9085d3271eae86484bb786ecfb..b4d18d8607eaddd75f4e395e71fbd75554645a61 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
@@ -185,15 +185,22 @@ class GrpcMasterService : public AsyncServiceInterface {
     MutableRunStepResponseWrapper* wrapped_response =
         new NonOwnedProtoRunStepResponse(&call->response);
     call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
-    master_impl_->RunStep(call_opts, wrapped_request, wrapped_response,
-                          [call, call_opts, wrapped_request, wrapped_response,
-                           trace](const Status& status) {
-                            call->ClearCancelCallback();
-                            delete call_opts;
-                            delete wrapped_request;
-                            delete trace;
-                            call->SendResponse(ToGrpcStatus(status));
-                          });
+    master_impl_->RunStep(
+        call_opts, wrapped_request, wrapped_response,
+        [call, call_opts, wrapped_request, wrapped_response,
+         trace](const Status& status) {
+          call->ClearCancelCallback();
+          delete call_opts;
+          delete wrapped_request;
+          delete trace;
+          if (call->request.store_errors_in_response_body() && !status.ok()) {
+            call->response.set_status_code(status.code());
+            call->response.set_status_error_message(status.error_message());
+            call->SendResponse(ToGrpcStatus(Status::OK()));
+          } else {
+            call->SendResponse(ToGrpcStatus(status));
+          }
+        });
     ENQUEUE_REQUEST(RunStep, true);
   }
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h
index 8770dcc3ac9bf7f0b6c7544a34ccb6d6fa5966b5..473604f257607456d0fb4dcb6d9189f2f6dba135 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_H_
 
 #include <memory>
 #include "tensorflow/core/platform/types.h"
@@ -34,4 +34,4 @@ AsyncServiceInterface* NewGrpcMasterService(Master* master,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
index 412395c52635d5c3cda95dddea50f7cd2d8c8e4f..6ae94b74417c3fb6c4da1589bb9f532cb6d79930 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_IMPL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_IMPL_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_IMPL_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_IMPL_H_
 
 #include "grpc++/impl/codegen/async_stream.h"
 #include "grpc++/impl/codegen/async_unary_call.h"
@@ -89,9 +89,9 @@ class MasterService final {
     ::grpc::Status ExtendSession(::grpc::ClientContext* context,
                                  const ExtendSessionRequest& request,
                                  ExtendSessionResponse* response) override;
-    ::grpc::Status PartialRunSetup(
-        ::grpc::ClientContext* context, const PartialRunSetupRequest& request,
-        PartialRunSetupResponse* response) override;
+    ::grpc::Status PartialRunSetup(::grpc::ClientContext* context,
+                                   const PartialRunSetupRequest& request,
+                                   PartialRunSetupResponse* response) override;
     ::grpc::Status RunStep(::grpc::ClientContext* context,
                            const RunStepRequest& request,
                            RunStepResponse* response) override;
@@ -186,4 +186,4 @@ class MasterService final {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_IMPL_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_IMPL_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
index 70418f63686843414dca6c5ae4907ee263dc2904..1088e9be66ceb7fbddfaed0691423745f362343f 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
@@ -69,8 +69,7 @@ class GrpcRemoteMaster : public MasterInterface {
     ::grpc::ClientContext ctx;
     auto trace = TraceRpc("RunStep/Client", &ctx);
     return Call(&ctx, call_options, &request->ToProto(),
-                get_proto_from_wrapper(response),
-                &MasterServiceStub::RunStep);
+                get_proto_from_wrapper(response), &MasterServiceStub::RunStep);
   }
 
   Status CloseSession(CallOptions* call_options,
@@ -114,8 +113,9 @@ class GrpcRemoteMaster : public MasterInterface {
   template <typename Request, typename Response>
   Status Call(::grpc::ClientContext* ctx, CallOptions* call_options,
               const Request* request, Response* response,
-              ::grpc::Status (MasterServiceStub::*pfunc)(
-                  ::grpc::ClientContext*, const Request&, Response*)) {
+              ::grpc::Status (MasterServiceStub::*pfunc)(::grpc::ClientContext*,
+                                                         const Request&,
+                                                         Response*)) {
     ctx->set_fail_fast(false);
     SetDeadline(ctx, call_options->GetTimeout());
     return FromGrpcStatus((stub_.get()->*pfunc)(ctx, *request, response));
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h
index d661caaa6029dc29c9eb8983c009f232fb2b3cbf..c80668e899d100edd65649c5588177655d1d0b7e 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_MASTER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_MASTER_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_MASTER_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_MASTER_H_
 
 #include "tensorflow/core/distributed_runtime/master_interface.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
@@ -24,4 +24,4 @@ namespace tensorflow {
 MasterInterface* NewGrpcMaster(const SharedGrpcChannelPtr& channel);
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_MASTER_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_MASTER_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
index 8ad41335409e0a7f7576134ed12b1a233aa341e0..709c3833e7aaa8b61656693e376c1d3060e0bb35 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_WORKER_H_
-#define THIRD_PARTY_TENSORFLOW_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_WORKER_H_
+#ifndef TENSORFLOW_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_WORKER_H_
+#define TENSORFLOW_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_WORKER_H_
 
 #include <memory>
 
@@ -35,4 +35,4 @@ WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_WORKER_H_
+#endif  // TENSORFLOW_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_WORKER_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h b/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
index b35d4843e8482dc15c6013f9cd0486f8feea754a..730124c25e9a3e8d102a9dd39e4c4a17f2ce39d1 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
 
 #include "grpc++/impl/codegen/proto_utils.h"
 #include "grpc++/support/slice.h"
@@ -66,7 +66,7 @@ class GrpcBufferWriter final
     }
     // It's dangerous to keep an inlined grpc_slice as the backup slice, since
     // on a following Next() call, a reference will be returned to this slice
-    // via GRPC_SLICE_START_PTR, which will not be an adddress held by
+    // via GRPC_SLICE_START_PTR, which will not be an address held by
     // slice_buffer_.
     have_backup_ = backup_slice_.refcount != NULL;
     byte_count_ -= count;
@@ -231,4 +231,4 @@ class UnlimitedSizeProtoSerializationTraits {
       : public UnlimitedSizeProtoSerializationTraits<MessageType> {}; \
   }  // namespace grpc
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
index c3f513d4926e9abe59561e4146237d3ced244ea7..8b12ac1461d6b1fa3098197aa7697031a5d3075b 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERVER_LIB_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERVER_LIB_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERVER_LIB_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERVER_LIB_H_
 
 #include <memory>
 
@@ -141,4 +141,4 @@ class GrpcServer : public ServerInterface {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERVER_LIB_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERVER_LIB_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
index 9a08335c1c93c56e8bbd61a76bae211482555e62..120a33f17b0d1f81e50dfbc844f56e3d85def096 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
@@ -190,6 +190,9 @@ Status GrpcSession::RunHelper(
     req->add_feed(it.first, it.second);
   }
 
+  // Support long error messages by storing the error code in the response body.
+  req->set_store_errors_in_response_body(true);
+
   // Build an index from fetch tensor name to first index in
   // output_tensor_names.
   std::unordered_map<string, int> output_name_to_offset;
@@ -207,6 +210,11 @@ Status GrpcSession::RunHelper(
   call_options.SetTimeout(req->options().timeout_in_ms());
   TF_RETURN_IF_ERROR(RunProto(&call_options, req.get(), resp.get()));
 
+  // Look for an extended error returned in the response body.
+  if (resp->status_code() != error::Code::OK) {
+    return Status(resp->status_code(), resp->status_error_message());
+  }
+
   if (!output_tensor_names.empty()) {
     outputs->resize(output_tensor_names.size());
   }
@@ -322,7 +330,7 @@ Status GrpcSession::Close() {
   {
     mutex_lock l(mu_);
     if (handle_.empty()) {
-      return errors::InvalidArgument("A session is not created yet....");
+      return Status::OK();
     }
     req.set_session_handle(handle_);
     handle_.clear();
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.h b/tensorflow/core/distributed_runtime/rpc/grpc_session.h
index 300f7271249d88e4aa2153e64d2b2671a6168b65..d87956a13515fde533e746d2abd04e4a2f4959ae 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SESSION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SESSION_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SESSION_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SESSION_H_
 
 #include <memory>
 #include <string>
@@ -130,4 +130,4 @@ class GrpcSession : public Session {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SESSION_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SESSION_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
index b673f200ccaaccbdab7b0f589af3d3450a6c44b6..335c3febe20e17e5b5ea57dc68c69e616997e14b 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
@@ -572,6 +572,66 @@ TEST(GrpcSessionTest, Error) {
   Env::Default()->SleepForMicroseconds(2000000);
 }
 
+TEST(GrpcSessionTest, LongErrorMessage) {
+  std::unique_ptr<test::TestCluster> cluster;
+  TF_CHECK_OK(test::TestCluster::MakeTestCluster(Devices(1, 0), 2, &cluster));
+  const string& master = cluster->targets()[0];
+  const string& dev_a = cluster->devices()[0].name();
+  const string& dev_b = cluster->devices()[1].name();
+  LOG(INFO) << "master " << master << "dev_a " << dev_a << "dev_b " << dev_b;
+  GraphDef gdef;
+  std::vector<string> fetches;
+  {
+    Graph g(OpRegistry::Global());
+
+    // a2 = a + error(a)
+    //
+    // Subgraph for "a" fails. The master will cancel the subgraph for
+    // "b" and then returns the Session::Run.
+    auto a = test::graph::Constant(&g, Tensor());
+    a->set_assigned_device_name(dev_a);
+    std::vector<char> long_string_buffer(1024 * 1024, 'x');
+    StringPiece long_string(long_string_buffer.data(), 1024 * 1024);
+    string name = strings::StrCat(long_string, "fantasia!");
+    auto a_err = test::graph::Error(&g, a, name);
+    a_err->set_assigned_device_name(dev_a);
+    auto a2 = test::graph::Add(&g, a, a_err);
+    a2->set_assigned_device_name(dev_a);
+    fetches.push_back(a2->name());
+
+    // b2 = b + delay(b)
+    //
+    // Subgraph for "b" sleeps at the node "b_delay". When the sleep
+    // finishes, the subgraph "b" will continue execution till it
+    // notices that it is canceled. Meanwhile, subgraph's executor
+    // and its related state (registered ops) should still be alive.
+    auto b = test::graph::Constant(&g, Tensor());
+    b->set_assigned_device_name(dev_b);
+    auto b_delay = test::graph::Delay(&g, b, Microseconds(1000000));
+    b_delay->set_assigned_device_name(dev_b);
+    auto b2 = test::graph::Add(&g, b, b_delay);
+    b2->set_assigned_device_name(dev_b);
+    fetches.push_back(b2->name());
+    test::graph::ToGraphDef(&g, &gdef);
+  }
+  std::unique_ptr<Session> session(NewRemote(Options(master, 1)));
+  ASSERT_TRUE(session != nullptr);
+
+  TF_CHECK_OK(session->Create(gdef));
+  {
+    Status status = session->Run({}, fetches, {}, nullptr);
+    EXPECT_FALSE(status.ok());
+    EXPECT_NE(status.ToString().find("fantasia!"), string::npos);
+  }
+  // session->Close() shall clean up all states related to the session->
+  // E.g., deregisters subgraph with workers, etc.
+  TF_CHECK_OK(session->Close());
+
+  // Sleep a bit so that most of asynchronous works finishes before
+  // the test process finishes.
+  Env::Default()->SleepForMicroseconds(2000000);
+}
+
 TEST(SessionTest, SharedVar) {
   std::unique_ptr<test::TestCluster> cluster;
   TF_CHECK_OK(test::TestCluster::MakeTestCluster(Devices(1, 0), 1, &cluster));
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_state.h b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
index 3f80bdfb70d0f3054b35a17ee34ec53655ccccc1..0b6f9474dd9e520b21c1915578cd8071a28ac7fd 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_state.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_STATE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_STATE_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_STATE_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_STATE_H_
 
 #include <utility>
 
@@ -96,4 +96,4 @@ class RPCState : public GrpcClientCQTag {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_STATE_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_STATE_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h
index 5e81b90189484053907f9b3f70154d1f2ce25775..4b3a03b1d708744bded25ff4d320979bb7eb38b2 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_TESTLIB_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_TESTLIB_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_TESTLIB_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_TESTLIB_H_
 
 #include <memory>
 #include <string>
@@ -70,4 +70,4 @@ class TestCluster {
 }  // end namespace test
 }  // end namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_TESTLIB_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_TESTLIB_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_testlib_ops.cc b/tensorflow/core/distributed_runtime/rpc/grpc_testlib_ops.cc
index 373eecffcab1dded60de7ffea96ba58208bb692c..5597ee7a76a55f125dd0db82eceb58f5e922ab13 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_testlib_ops.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_testlib_ops.cc
@@ -21,11 +21,8 @@ namespace tensorflow {
 namespace test {
 
 // ErrorOp::Compute returns an error.
-REGISTER_OP("Error")
-    .Input("in: T")
-    .Output("out: T")
-    .Attr("T: type")
-    .Attr("message: string");
+REGISTER_OP("Error").Input("in: T").Output("out: T").Attr("T: type").Attr(
+    "message: string");
 class ErrorOp : public OpKernel {
  public:
   explicit ErrorOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
@@ -66,11 +63,8 @@ REGISTER_KERNEL_BUILDER(Name("InvalidRefType").Device(DEVICE_CPU),
 
 // DelayOp::AsyncCompute sleeps for "micros"-econd and then returns
 // its input.
-REGISTER_OP("Delay")
-    .Input("in: T")
-    .Output("out: T")
-    .Attr("T: type")
-    .Attr("micros: int");
+REGISTER_OP("Delay").Input("in: T").Output("out: T").Attr("T: type").Attr(
+    "micros: int");
 class DelayOp : public AsyncOpKernel {
  public:
   explicit DelayOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util.h b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
index 0ddcd89130b3b1b1209c255b6200d8ce88d4cb7c..d5e7e9f5b39e9f1ab9704de3f8ec7964096ae569 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_UTIL_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_UTIL_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_UTIL_H_
 
 #include <memory>
 
@@ -23,15 +23,36 @@ limitations under the License.
 #include "grpc++/support/byte_buffer.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
 
+constexpr char kStreamRemovedMessage[] = "Stream removed";
+
+// Identify if the given grpc::Status corresponds to an HTTP stream removed
+// error (see chttp2_transport.cc).
+//
+// When auto-reconnecting to a remote TensorFlow worker after it restarts, gRPC
+// can return an UNKNOWN error code with a "Stream removed" error message.
+// This should not be treated as an unrecoverable error.
+//
+// N.B. This is dependent on the error message from grpc remaining consistent.
+inline bool IsStreamRemovedError(const ::grpc::Status& s) {
+  return !s.ok() && s.error_code() == ::grpc::StatusCode::UNKNOWN &&
+         s.error_message() == kStreamRemovedMessage;
+}
+
 inline Status FromGrpcStatus(const ::grpc::Status& s) {
   if (s.ok()) {
     return Status::OK();
   } else {
+    // Convert "UNKNOWN" stream removed errors into unavailable, to allow
+    // for retry upstream.
+    if (IsStreamRemovedError(s)) {
+      return Status(tensorflow::error::UNAVAILABLE, s.error_message());
+    }
     return Status(static_cast<tensorflow::error::Code>(s.error_code()),
                   s.error_message());
   }
@@ -41,6 +62,13 @@ inline ::grpc::Status ToGrpcStatus(const ::tensorflow::Status& s) {
   if (s.ok()) {
     return ::grpc::Status::OK;
   } else {
+    if (s.error_message().size() > 3072 /* 3k bytes */) {
+      // TODO(b/62947679): Remove truncation once the gRPC issue is resolved.
+      string scratch =
+          strings::Printf("%.3072s ... [truncated]", s.error_message().c_str());
+      LOG(ERROR) << "Truncated error message: " << s;
+      return ::grpc::Status(static_cast<::grpc::StatusCode>(s.code()), scratch);
+    }
     return ::grpc::Status(static_cast<::grpc::StatusCode>(s.code()),
                           s.error_message());
   }
@@ -86,4 +114,4 @@ class GrpcByteBufferSource : public ::grpc::protobuf::io::ZeroCopyInputStream {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_UTIL_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_UTIL_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
index a7b93e04607fe2dbb9bd87b372441607b5a19b0c..2ed07e3669a3badd82b8ef27f45bac2b712c8978 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h"
 
+#include <unordered_map>
+
 #include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h"
@@ -23,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/worker_cache_partial.h"
 #include "tensorflow/core/distributed_runtime/worker_interface.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
 
 namespace tensorflow {
 
@@ -30,29 +33,21 @@ namespace {
 
 class GrpcWorkerCache : public WorkerCachePartial {
  public:
+  // TODO(ncteisen): consider adding a config var or flag for this
+  static constexpr const size_t kGrpcWorkerCacheThreadCount = 2;
+
   explicit GrpcWorkerCache(GrpcChannelCache* channel_cache,
                            WorkerInterface* local_worker,
                            const string& local_target)
       : local_target_(local_target),
         local_worker_(local_worker),
-        channel_cache_(channel_cache) {
-    // TODO(mrry): Investigate possible performance improvements by
-    // replacing this thread with a threadpool.
-    polling_thread_ = Env::Default()->StartThread(
-        ThreadOptions(), "grpc_worker_cache", [this]() {
-          void* tag;
-          bool ok;
-          while (completion_queue_.Next(&tag, &ok)) {
-            GrpcClientCQTag* callback_tag = static_cast<GrpcClientCQTag*>(tag);
-            callback_tag->OnCompleted(ok);
-          }
-        });
-  }
+        channel_cache_(channel_cache),
+        threads_(kGrpcWorkerCacheThreadCount),
+        next_round_robin_assignment_(0) {}
 
   // Explicit destructor to control destruction order.
   ~GrpcWorkerCache() override {
-    completion_queue_.Shutdown();
-    delete polling_thread_;  // Blocks until thread exits.
+    threads_.clear();  // Blocks until threads exit.
     delete channel_cache_;
   }
 
@@ -66,7 +61,9 @@ class GrpcWorkerCache : public WorkerCachePartial {
     } else {
       SharedGrpcChannelPtr channel = channel_cache_->FindWorkerChannel(target);
       if (!channel) return nullptr;
-      return NewGrpcRemoteWorker(channel, &completion_queue_, &logger_);
+      return NewGrpcRemoteWorker(
+          channel, threads_[AssignWorkerToThread(target)].completion_queue(),
+          &logger_);
     }
   }
 
@@ -88,12 +85,59 @@ class GrpcWorkerCache : public WorkerCachePartial {
   }
 
  private:
+  // Thread wrapping class that drives work over a single gRPC
+  // CompletionQueue.
+  class GrpcWorkerCacheThread {
+   public:
+    GrpcWorkerCacheThread() {
+      thread_.reset(Env::Default()->StartThread(
+          ThreadOptions(), "grpc_worker_cache", [this]() {
+            void* tag;
+            bool ok;
+            while (completion_queue_.Next(&tag, &ok)) {
+              GrpcClientCQTag* callback_tag =
+                  static_cast<GrpcClientCQTag*>(tag);
+              callback_tag->OnCompleted(ok);
+            }
+          }));
+    }
+
+    ~GrpcWorkerCacheThread() {
+      completion_queue_.Shutdown();
+      thread_.reset();
+    }
+
+    ::grpc::CompletionQueue* completion_queue() { return &completion_queue_; }
+
+   private:
+    ::grpc::CompletionQueue completion_queue_;
+    std::unique_ptr<Thread> thread_;
+  };  // GrpcWorkerCacheThread
+
+  size_t AssignWorkerToThread(const string& target) {
+    // Round-robin target assignment, but keeps the same target on the same
+    // polling thread always, as this is important for gRPC performace
+    mutex_lock lock(assignment_mu_);
+    auto it = target_assignments_.find(target);
+    if (it == target_assignments_.end()) {
+      it = target_assignments_
+               .insert(std::make_pair(
+                   target, (next_round_robin_assignment_++) % threads_.size()))
+               .first;
+    }
+    return it->second;
+  }
+
   const string local_target_;
   WorkerInterface* const local_worker_;  // Not owned.
-  GrpcChannelCache* channel_cache_;  // Owned.
-  ::grpc::CompletionQueue completion_queue_;
-  Thread* polling_thread_;  // Owned.
+  GrpcChannelCache* channel_cache_;      // Owned.
   WorkerCacheLogger logger_;
+  std::vector<GrpcWorkerCacheThread> threads_;
+
+  mutex assignment_mu_;
+  std::unordered_map<std::string, size_t> target_assignments_
+      GUARDED_BY(assignment_mu_);
+  size_t next_round_robin_assignment_ GUARDED_BY(assignment_mu_);
 };
 
 }  // namespace
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h
index 17a307a6d99748c4d5daa689c9967622ed933d87..7a35fdbca08e1f7a79e77418f69efb3e4fa80e0a 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_CACHE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_CACHE_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_CACHE_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_CACHE_H_
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h"
 #include "tensorflow/core/distributed_runtime/worker_cache.h"
@@ -29,4 +29,4 @@ WorkerCacheInterface* NewGrpcWorkerCacheWithLocalWorker(
     const string& local_target);
 
 }  // namespace tensorflow
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_CACHE_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_CACHE_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index eee93ec65726b416fdf8d4fe8a339c0fc3bf2d48..1beb198732ad40ed9e21f66c665ff82a231eebb6 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -51,19 +51,23 @@ namespace tensorflow {
 namespace {
 
 class GrpcWorkerService : public AsyncServiceInterface {
+  // TODO(ncteisen): consider adding a config var or flag for this
+  static constexpr const size_t kGrpcWorkerServiceThreadCount = 2;
+
  public:
   GrpcWorkerService(GrpcWorker* worker, ::grpc::ServerBuilder* builder)
-      : worker_(worker), is_shutdown_(false) {
+      : is_shutdown_(false) {
     builder->RegisterService(&worker_service_);
-    cq_ = builder->AddCompletionQueue();
+    for (int i = 0; i < kGrpcWorkerServiceThreadCount; i++) {
+      threads_.emplace_back(
+          new GrpcWorkerServiceThread(worker, builder, &worker_service_));
+    }
   }
 
-  ~GrpcWorkerService() override { delete shutdown_alarm_; }
-
   void Shutdown() override {
     bool did_shutdown = false;
     {
-      mutex_lock l(shutdown_mu_);
+      mutex_lock l(service_shutdown_mu_);
       if (!is_shutdown_) {
         LOG(INFO) << "Shutting down GrpcWorkerService.";
         is_shutdown_ = true;
@@ -71,11 +75,9 @@ class GrpcWorkerService : public AsyncServiceInterface {
       }
     }
     if (did_shutdown) {
-      // NOTE(mrry): This enqueues a special event (with a null tag)
-      // that causes the completion queue to be shut down on the
-      // polling thread.
-      shutdown_alarm_ =
-          new ::grpc::Alarm(cq_.get(), gpr_now(GPR_CLOCK_MONOTONIC), nullptr);
+      for (auto& worker_thread : threads_) {
+        worker_thread->Shutdown();
+      }
     }
   }
 
@@ -90,227 +92,270 @@ class GrpcWorkerService : public AsyncServiceInterface {
 // The implementation of the request handler for each RPC method
 // must ensure that it calls ENQUEUE_REQUEST() for that RPC method,
 // to keep accepting new requests.
-#define ENQUEUE_REQUEST(method, supports_cancel)                       \
-  do {                                                                 \
-    mutex_lock l(shutdown_mu_);                                        \
-    if (!is_shutdown_) {                                               \
-      Call<GrpcWorkerService, grpc::WorkerService::AsyncService,       \
-           method##Request, method##Response>::                        \
-          EnqueueRequestForMethod(                                     \
-              &worker_service_, cq_.get(),                             \
-              static_cast<int>(GrpcWorkerMethod::k##method),           \
-              &GrpcWorkerService::method##Handler, (supports_cancel)); \
-    }                                                                  \
+#define ENQUEUE_REQUEST(method, supports_cancel)                             \
+  do {                                                                       \
+    mutex_lock l(shutdown_mu_);                                              \
+    if (!is_shutdown_) {                                                     \
+      Call<GrpcWorkerServiceThread, grpc::WorkerService::AsyncService,       \
+           method##Request, method##Response>::                              \
+          EnqueueRequestForMethod(                                           \
+              worker_service_, cq_.get(),                                    \
+              static_cast<int>(GrpcWorkerMethod::k##method),                 \
+              &GrpcWorkerServiceThread::method##Handler, (supports_cancel)); \
+    }                                                                        \
   } while (0)
 
   // This method blocks forever handling requests from the completion queue.
   void HandleRPCsLoop() override {
-    // TODO(mrry): This may require performance engineering. We can
-    // add more threads to service the completion queue, and add more
-    // of various request types if they are short and frequent.
-    // Currently we allow unbounded numbers of pending calls for each
-    // method, by re-enqueuing a request before the previous one
-    // completes, and we may decide to bound some of the request
-    // types.
-    ENQUEUE_REQUEST(GetStatus, false);
-    ENQUEUE_REQUEST(CreateWorkerSession, false);
-    ENQUEUE_REQUEST(DeleteWorkerSession, false);
-    ENQUEUE_REQUEST(CleanupAll, false);
-    ENQUEUE_REQUEST(RegisterGraph, false);
-    ENQUEUE_REQUEST(DeregisterGraph, false);
-
-    // TODO(mrry): Determine a better policy for enqueuing the appropriate
-    // number of each request type.
-    for (int i = 0; i < 1000; ++i) {
-      EnqueueRecvTensorRequestRaw();
+    for (auto& worker_thread : threads_) {
+      worker_thread->Start();
     }
-    for (int i = 0; i < 100; ++i) {
-      ENQUEUE_REQUEST(RunGraph, true);
+    for (auto& worker_thread : threads_) {
+      worker_thread->Join();
     }
-    for (int i = 0; i < 100; ++i) {
-      ENQUEUE_REQUEST(CleanupGraph, false);
+  }
+
+ private:
+  // Thread wrapping class that drives work over a single gRPC
+  // CompletionQueue.
+  class GrpcWorkerServiceThread {
+   public:
+    explicit GrpcWorkerServiceThread(
+        GrpcWorker* worker, ::grpc::ServerBuilder* builder,
+        grpc::WorkerService::AsyncService* worker_service)
+        : worker_(worker),
+          worker_service_(worker_service),
+          is_shutdown_(false) {
+      cq_ = builder->AddCompletionQueue();
     }
 
-    ENQUEUE_REQUEST(Logging, false);
-    ENQUEUE_REQUEST(Tracing, false);
+    void Start() {
+      thread_.reset(worker_->env()->env->StartThread(
+          ThreadOptions(), "grpc_worker_service",
+          [this]() { HandleRPCsLoop(); }));
+    }
 
-    void* tag;
-    bool ok;
+    void Join() { thread_.reset(); }  // Blocks until thread exits
 
-    while (cq_->Next(&tag, &ok)) {
-      UntypedCall<GrpcWorkerService>::Tag* callback_tag =
-          static_cast<UntypedCall<GrpcWorkerService>::Tag*>(tag);
-      if (callback_tag) {
-        callback_tag->OnCompleted(this, ok);
-      } else {
-        // NOTE(mrry): A null `callback_tag` indicates that this is
-        // the shutdown alarm.
-        cq_->Shutdown();
+    void Shutdown() {
+      {
+        mutex_lock lock(shutdown_mu_);
+        is_shutdown_ = true;
       }
+      cq_->Shutdown();
     }
-  }
 
- private:
-  GrpcWorker* worker_ = nullptr;  // Not owned.
-  std::unique_ptr<::grpc::ServerCompletionQueue> cq_;
+   private:
+    void HandleRPCsLoop() {
+      // TODO(ncteisen): This may require performance engineering. We can
+      // change the number of threads, the number of handlers per thread,
+      // or even decide to specialize certain threads to certain methods.
+      ENQUEUE_REQUEST(GetStatus, false);
+      ENQUEUE_REQUEST(CreateWorkerSession, false);
+      ENQUEUE_REQUEST(DeleteWorkerSession, false);
+      ENQUEUE_REQUEST(CleanupAll, false);
+      ENQUEUE_REQUEST(RegisterGraph, false);
+      ENQUEUE_REQUEST(DeregisterGraph, false);
+
+      // TODO(ncteisen): Determine a better policy for enqueuing the
+      // appropriate number of each request type.
+      for (int i = 0; i < 1000; ++i) {
+        EnqueueRecvTensorRequestRaw();
+      }
+      for (int i = 0; i < 100; ++i) {
+        ENQUEUE_REQUEST(RunGraph, true);
+      }
+      for (int i = 0; i < 100; ++i) {
+        ENQUEUE_REQUEST(CleanupGraph, false);
+      }
 
-  grpc::WorkerService::AsyncService worker_service_;
+      ENQUEUE_REQUEST(Logging, false);
+      ENQUEUE_REQUEST(Tracing, false);
 
-  mutex shutdown_mu_;
-  bool is_shutdown_ GUARDED_BY(shutdown_mu_);
-  ::grpc::Alarm* shutdown_alarm_ = nullptr;
+      void* tag;
+      bool ok;
 
-  void Schedule(std::function<void()> f) {
-    worker_->env()->compute_pool->Schedule(std::move(f));
-  }
+      while (cq_->Next(&tag, &ok)) {
+        UntypedCall<GrpcWorkerServiceThread>::Tag* callback_tag =
+            static_cast<UntypedCall<GrpcWorkerServiceThread>::Tag*>(tag);
+        CHECK(callback_tag);
+        callback_tag->OnCompleted(this, ok);
+      }
+    }
 
-  // The following section contains one request handler method per
-  // RPC. The `FooHandler` method is called (indirectly) by
-  // `HandleRPCsLoop()` when the next Foo RPC is received. Each
-  // `FooHandler` call schedules a closure on `worker_->env()->compute_pool`,
-  // and is responsible for requesting the next Foo call by calling
-  // `ENQUEUE_REQUEST(Foo)`.
-
-  template <class RequestMessage, class ResponseMessage>
-  using WorkerCall = Call<GrpcWorkerService, grpc::WorkerService::AsyncService,
-                          RequestMessage, ResponseMessage>;
-
-  void GetStatusHandler(WorkerCall<GetStatusRequest, GetStatusResponse>* call) {
-    Schedule([this, call]() {
-      Status s = worker_->GetStatus(&call->request, &call->response);
-      call->SendResponse(ToGrpcStatus(s));
-    });
-    ENQUEUE_REQUEST(GetStatus, false);
-  }
+   private:
+    void Schedule(std::function<void()> f) {
+      worker_->env()->compute_pool->Schedule(std::move(f));
+    }
 
-  void CreateWorkerSessionHandler(
-      WorkerCall<CreateWorkerSessionRequest, CreateWorkerSessionResponse>*
-          call) {
-    Schedule([this, call]() {
-      Status s = worker_->CreateWorkerSession(&call->request, &call->response);
-      call->SendResponse(ToGrpcStatus(s));
-    });
-    ENQUEUE_REQUEST(CreateWorkerSession, false);
-  }
+    // The following section contains one request handler method per
+    // RPC. The `FooHandler` method is called (indirectly) by
+    // `HandleRPCsLoop()` when the next Foo RPC is received. Each
+    // `FooHandler` call schedules a closure on `worker_->env()->compute_pool`,
+    // and is responsible for requesting the next Foo call by calling
+    // `ENQUEUE_REQUEST(Foo)`.
+
+    template <class RequestMessage, class ResponseMessage>
+    using WorkerCall =
+        Call<GrpcWorkerServiceThread, grpc::WorkerService::AsyncService,
+             RequestMessage, ResponseMessage>;
+
+    void GetStatusHandler(
+        WorkerCall<GetStatusRequest, GetStatusResponse>* call) {
+      Schedule([this, call]() {
+        Status s = worker_->GetStatus(&call->request, &call->response);
+        call->SendResponse(ToGrpcStatus(s));
+      });
+      ENQUEUE_REQUEST(GetStatus, false);
+    }
 
-  void DeleteWorkerSessionHandler(
-      WorkerCall<DeleteWorkerSessionRequest, DeleteWorkerSessionResponse>*
-          call) {
-    Schedule([this, call]() {
-      Status s = worker_->DeleteWorkerSession(&call->request, &call->response);
-      call->SendResponse(ToGrpcStatus(s));
-    });
-    ENQUEUE_REQUEST(DeleteWorkerSession, false);
-  }
+    void CreateWorkerSessionHandler(
+        WorkerCall<CreateWorkerSessionRequest, CreateWorkerSessionResponse>*
+            call) {
+      Schedule([this, call]() {
+        Status s =
+            worker_->CreateWorkerSession(&call->request, &call->response);
+        call->SendResponse(ToGrpcStatus(s));
+      });
+      ENQUEUE_REQUEST(CreateWorkerSession, false);
+    }
 
-  void CleanupAllHandler(
-      WorkerCall<CleanupAllRequest, CleanupAllResponse>* call) {
-    Schedule([this, call]() {
-      Status s = worker_->CleanupAll(&call->request, &call->response);
-      call->SendResponse(ToGrpcStatus(s));
-    });
-    ENQUEUE_REQUEST(CleanupAll, false);
-  }
+    void DeleteWorkerSessionHandler(
+        WorkerCall<DeleteWorkerSessionRequest, DeleteWorkerSessionResponse>*
+            call) {
+      Schedule([this, call]() {
+        Status s =
+            worker_->DeleteWorkerSession(&call->request, &call->response);
+        call->SendResponse(ToGrpcStatus(s));
+      });
+      ENQUEUE_REQUEST(DeleteWorkerSession, false);
+    }
 
-  void RegisterGraphHandler(
-      WorkerCall<RegisterGraphRequest, RegisterGraphResponse>* call) {
-    Schedule([this, call]() {
-      Status s = worker_->RegisterGraph(&call->request, &call->response);
-      call->SendResponse(ToGrpcStatus(s));
-    });
-    ENQUEUE_REQUEST(RegisterGraph, false);
-  }
+    void CleanupAllHandler(
+        WorkerCall<CleanupAllRequest, CleanupAllResponse>* call) {
+      Schedule([this, call]() {
+        Status s = worker_->CleanupAll(&call->request, &call->response);
+        call->SendResponse(ToGrpcStatus(s));
+      });
+      ENQUEUE_REQUEST(CleanupAll, false);
+    }
 
-  void DeregisterGraphHandler(
-      WorkerCall<DeregisterGraphRequest, DeregisterGraphResponse>* call) {
-    Schedule([this, call]() {
-      Status s = worker_->DeregisterGraph(&call->request, &call->response);
-      call->SendResponse(ToGrpcStatus(s));
-    });
-    ENQUEUE_REQUEST(DeregisterGraph, false);
-  }
+    void RegisterGraphHandler(
+        WorkerCall<RegisterGraphRequest, RegisterGraphResponse>* call) {
+      Schedule([this, call]() {
+        Status s = worker_->RegisterGraph(&call->request, &call->response);
+        call->SendResponse(ToGrpcStatus(s));
+      });
+      ENQUEUE_REQUEST(RegisterGraph, false);
+    }
 
-  void RunGraphHandler(WorkerCall<RunGraphRequest, RunGraphResponse>* call) {
-    Schedule([this, call]() {
-      CallOptions* call_opts = new CallOptions;
-      ProtoRunGraphRequest* wrapped_request =
-          new ProtoRunGraphRequest(&call->request);
-      NonOwnedProtoRunGraphResponse* wrapped_response =
-          new NonOwnedProtoRunGraphResponse(&call->response);
-      call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
-      worker_->RunGraphAsync(call_opts, wrapped_request, wrapped_response,
-                             [call, call_opts, wrapped_request,
-                              wrapped_response](const Status& s) {
-                               call->ClearCancelCallback();
-                               delete call_opts;
-                               delete wrapped_request;
-                               delete wrapped_response;
-                               call->SendResponse(ToGrpcStatus(s));
-                             });
-    });
-    ENQUEUE_REQUEST(RunGraph, true);
-  }
+    void DeregisterGraphHandler(
+        WorkerCall<DeregisterGraphRequest, DeregisterGraphResponse>* call) {
+      Schedule([this, call]() {
+        Status s = worker_->DeregisterGraph(&call->request, &call->response);
+        call->SendResponse(ToGrpcStatus(s));
+      });
+      ENQUEUE_REQUEST(DeregisterGraph, false);
+    }
 
-  void RecvTensorHandlerRaw(
-      WorkerCall<RecvTensorRequest, ::grpc::ByteBuffer>* call) {
-    Schedule([this, call]() {
-      CallOptions* call_opts = new CallOptions;
-      call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
-      worker_->GrpcRecvTensorAsync(call_opts, &call->request, &call->response,
-                                   [call, call_opts](const Status& s) {
-                                     call->ClearCancelCallback();
-                                     delete call_opts;
-                                     call->SendResponse(ToGrpcStatus(s));
-                                   });
-    });
-    EnqueueRecvTensorRequestRaw();
-  }
+    void RunGraphHandler(WorkerCall<RunGraphRequest, RunGraphResponse>* call) {
+      Schedule([this, call]() {
+        CallOptions* call_opts = new CallOptions;
+        ProtoRunGraphRequest* wrapped_request =
+            new ProtoRunGraphRequest(&call->request);
+        NonOwnedProtoRunGraphResponse* wrapped_response =
+            new NonOwnedProtoRunGraphResponse(&call->response);
+        call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+        worker_->RunGraphAsync(call_opts, wrapped_request, wrapped_response,
+                               [call, call_opts, wrapped_request,
+                                wrapped_response](const Status& s) {
+                                 call->ClearCancelCallback();
+                                 delete call_opts;
+                                 delete wrapped_request;
+                                 delete wrapped_response;
+                                 call->SendResponse(ToGrpcStatus(s));
+                               });
+      });
+      ENQUEUE_REQUEST(RunGraph, true);
+    }
 
-  void CleanupGraphHandler(
-      WorkerCall<CleanupGraphRequest, CleanupGraphResponse>* call) {
-    Schedule([this, call]() {
-      Status s = worker_->CleanupGraph(&call->request, &call->response);
-      call->SendResponse(ToGrpcStatus(s));
-    });
-    ENQUEUE_REQUEST(CleanupGraph, false);
-  }
+    void RecvTensorHandlerRaw(
+        WorkerCall<RecvTensorRequest, ::grpc::ByteBuffer>* call) {
+      Schedule([this, call]() {
+        CallOptions* call_opts = new CallOptions;
+        call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+        worker_->GrpcRecvTensorAsync(call_opts, &call->request, &call->response,
+                                     [call, call_opts](const Status& s) {
+                                       call->ClearCancelCallback();
+                                       delete call_opts;
+                                       call->SendResponse(ToGrpcStatus(s));
+                                     });
+      });
+      EnqueueRecvTensorRequestRaw();
+    }
 
-  void LoggingHandler(WorkerCall<LoggingRequest, LoggingResponse>* call) {
-    Schedule([this, call]() {
-      Status s = worker_->Logging(&call->request, &call->response);
-      call->SendResponse(ToGrpcStatus(s));
-    });
-    ENQUEUE_REQUEST(Logging, false);
-  }
+    void CleanupGraphHandler(
+        WorkerCall<CleanupGraphRequest, CleanupGraphResponse>* call) {
+      Schedule([this, call]() {
+        Status s = worker_->CleanupGraph(&call->request, &call->response);
+        call->SendResponse(ToGrpcStatus(s));
+      });
+      ENQUEUE_REQUEST(CleanupGraph, false);
+    }
 
-  void TracingHandler(WorkerCall<TracingRequest, TracingResponse>* call) {
-    Schedule([this, call]() {
-      Status s = worker_->Tracing(&call->request, &call->response);
-      call->SendResponse(ToGrpcStatus(s));
-    });
-    ENQUEUE_REQUEST(Tracing, false);
-  }
+    void LoggingHandler(WorkerCall<LoggingRequest, LoggingResponse>* call) {
+      Schedule([this, call]() {
+        Status s = worker_->Logging(&call->request, &call->response);
+        call->SendResponse(ToGrpcStatus(s));
+      });
+      ENQUEUE_REQUEST(Logging, false);
+    }
+
+    void TracingHandler(WorkerCall<TracingRequest, TracingResponse>* call) {
+      Schedule([this, call]() {
+        Status s = worker_->Tracing(&call->request, &call->response);
+        call->SendResponse(ToGrpcStatus(s));
+      });
+      ENQUEUE_REQUEST(Tracing, false);
+    }
 #undef ENQUEUE_REQUEST
 
-  void EnqueueRecvTensorRequestRaw() {
-    mutex_lock l(shutdown_mu_);
-    if (!is_shutdown_) {
-      Call<GrpcWorkerService, grpc::WorkerService::AsyncService,
-           RecvTensorRequest, ::grpc::ByteBuffer>::
-          EnqueueRequestForMethod(
-              &worker_service_, cq_.get(),
-              static_cast<int>(GrpcWorkerMethod::kRecvTensor),
-              &GrpcWorkerService::RecvTensorHandlerRaw,
-              true /* supports cancel*/);
+    void EnqueueRecvTensorRequestRaw() {
+      mutex_lock l(shutdown_mu_);
+      if (!is_shutdown_) {
+        Call<GrpcWorkerServiceThread, grpc::WorkerService::AsyncService,
+             RecvTensorRequest, ::grpc::ByteBuffer>::
+            EnqueueRequestForMethod(
+                worker_service_, cq_.get(),
+                static_cast<int>(GrpcWorkerMethod::kRecvTensor),
+                &GrpcWorkerServiceThread::RecvTensorHandlerRaw,
+                true /* supports cancel*/);
+      }
     }
-  }
+
+    GrpcWorker* const worker_ = nullptr;  // Not owned.
+    std::unique_ptr<::grpc::ServerCompletionQueue> cq_;
+    std::unique_ptr<Thread> thread_;
+    grpc::WorkerService::AsyncService* const worker_service_;
+
+    mutex shutdown_mu_;
+    bool is_shutdown_ GUARDED_BY(shutdown_mu_);
+    TF_DISALLOW_COPY_AND_ASSIGN(GrpcWorkerServiceThread);
+  };  // GrpcWorkerServiceThread
+
+  grpc::WorkerService::AsyncService worker_service_;
+  std::vector<std::unique_ptr<GrpcWorkerServiceThread>> threads_;
+
+  mutex service_shutdown_mu_;
+  bool is_shutdown_ GUARDED_BY(service_shutdown_mu_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(GrpcWorkerService);
 };
 
 }  // namespace
 
-GrpcWorker::GrpcWorker(WorkerEnv* worker_env) : Worker(worker_env) {}
+GrpcWorker::GrpcWorker(WorkerEnv* worker_env)
+    : Worker(worker_env), recv_tensor_recent_request_ids_(100000) {}
 
 // GrpcRecvTensorAsync: unlike the other Worker methods, which use protocol
 // buffers for a response object, to avoid extra protocol buffer serialization
@@ -319,11 +364,18 @@ void GrpcWorker::GrpcRecvTensorAsync(CallOptions* opts,
                                      const RecvTensorRequest* request,
                                      ::grpc::ByteBuffer* response,
                                      StatusCallback done) {
+  Status s = recv_tensor_recent_request_ids_.TrackUnique(
+      request->request_id(), "RecvTensor (GrpcWorker)", *request);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
+
   const int64 step_id = request->step_id();
   const string& key = request->rendezvous_key();
   TRACEPRINTF("RecvTensor: %lld %s", step_id, key.c_str());
   Rendezvous::ParsedKey parsed;
-  Status s = Rendezvous::ParseKey(key, &parsed);
+  s = Rendezvous::ParseKey(key, &parsed);
   Device* src_dev = nullptr;
   if (s.ok()) {
     s = PrepareRecvTensor(parsed, &src_dev);
@@ -392,6 +444,24 @@ void GrpcWorker::GrpcRecvTensorAsync(CallOptions* opts,
       });
 }
 
+void GrpcWorker::LoggingAsync(const LoggingRequest* request,
+                              LoggingResponse* response, StatusCallback done) {
+  auto env = this->env();
+  if (env) {
+    auto session_mgr = (SessionMgr*)env->session_mgr;
+    if (session_mgr) {
+      session_mgr->SetLogging(request->rpc_logging());
+      for (const auto& step_id : request->fetch_step_id()) {
+        session_mgr->RetrieveLogs(step_id, response);
+      }
+      if (request->clear()) {
+        session_mgr->ClearLogs();
+      }
+    }
+  }
+  done(Status::OK());
+}
+
 WorkerEnv* GrpcWorker::env() { return env_; }
 
 std::unique_ptr<GrpcWorker> NewGrpcWorker(WorkerEnv* env) {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
index 64d7c986daf1f78dafdbdf459034fd51db4d699d..fbddbda9e6f9e5561d4db0e035a48ed8db0d8559 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
@@ -13,9 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_
 
+#include "tensorflow/core/distributed_runtime/recent_request_ids.h"
 #include "tensorflow/core/distributed_runtime/worker.h"
 
 namespace grpc {
@@ -39,7 +40,13 @@ class GrpcWorker : public Worker {
                                    ::grpc::ByteBuffer* response,
                                    StatusCallback done);
 
+  virtual void LoggingAsync(const LoggingRequest* request,
+                            LoggingResponse* response, StatusCallback done);
+
   WorkerEnv* env();
+
+ private:
+  RecentRequestIds recv_tensor_recent_request_ids_;
 };
 
 std::unique_ptr<GrpcWorker> NewGrpcWorker(WorkerEnv* worker_env);
@@ -50,4 +57,4 @@ std::unique_ptr<AsyncServiceInterface> NewGrpcWorkerService(
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
index fb23f8631fd17a7533fde01cde9453dc8ea8505a..1a5e2edfb240198c50d3b5d00bec1127fceff725 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_IMPL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_IMPL_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_IMPL_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_IMPL_H_
 
 #include "grpc++/impl/codegen/async_stream.h"
 #include "grpc++/impl/codegen/async_unary_call.h"
@@ -147,4 +147,4 @@ class WorkerService final {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_IMPL_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_IMPL_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
index 72dfe5c062177de7039980ece31778e7cac06592..067dc5dff5bb81f8cc1da883d226ee3cfa5638f2 100644
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/distributed_runtime/request_id.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
 #include "tensorflow/core/distributed_runtime/worker_cache.h"
 #include "tensorflow/core/distributed_runtime/worker_interface.h"
@@ -67,6 +68,7 @@ class RpcRecvTensorCall : public BaseRecvTensorCall {
     done_ = std::move(done);
     req_.set_step_id(step_id);
     req_.set_rendezvous_key(key.data(), key.size());
+    req_.set_request_id(GetUniqueRequestId());
   }
 
   void Reset(WorkerCacheInterface* wc) {
diff --git a/tensorflow/core/distributed_runtime/rpcbench_test.cc b/tensorflow/core/distributed_runtime/rpcbench_test.cc
index b2668fae25a8a6bc60b37ddfaa83b8b523c3a6f5..d3af7417e61105c788b8029c84c222e49a0d2830 100644
--- a/tensorflow/core/distributed_runtime/rpcbench_test.cc
+++ b/tensorflow/core/distributed_runtime/rpcbench_test.cc
@@ -184,8 +184,8 @@ static void BM_Helper(int iters, int width, int num_stages, int tensor_size,
 
   testing::SetLabel(
       strings::StrCat(def.node_size(), " nodes; ",
-             use_multiple_devices ? "Multi device" : "Single device",
-             "; tensor bytes/send: ", tensor_size * sizeof(float)));
+                      use_multiple_devices ? "Multi device" : "Single device",
+                      "; tensor bytes/send: ", tensor_size * sizeof(float)));
 
   std::vector<Tensor> outputs;
 
diff --git a/tensorflow/core/distributed_runtime/scheduler.cc b/tensorflow/core/distributed_runtime/scheduler.cc
index 4766f4c33b654481f7d99ab82939e33e77564771..9dae5b3b926fab14c2b36955436d3956baa29fdd 100644
--- a/tensorflow/core/distributed_runtime/scheduler.cc
+++ b/tensorflow/core/distributed_runtime/scheduler.cc
@@ -17,9 +17,9 @@ limitations under the License.
 
 #include <queue>
 
-#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/util/util.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/distributed_runtime/scheduler.h b/tensorflow/core/distributed_runtime/scheduler.h
index eabcaccdd1e6c1a732f8871bc9da6265bd9a8dd8..ef87b9834dba50cf628a8c29c70b0266661d6227 100644
--- a/tensorflow/core/distributed_runtime/scheduler.h
+++ b/tensorflow/core/distributed_runtime/scheduler.h
@@ -16,15 +16,15 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SCHEDULER_H_
 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SCHEDULER_H_
 
-#include <functional>
 #include <deque>
+#include <functional>
 #include <map>
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/core/graph/costmodel.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/graph/costmodel.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/distributed_runtime/server_lib.h b/tensorflow/core/distributed_runtime/server_lib.h
index a064d20cdb84fe82a53e85a95944301e9761bb03..275f526d311aec571a2f2ffb8a377d952b6ae8dc 100644
--- a/tensorflow/core/distributed_runtime/server_lib.h
+++ b/tensorflow/core/distributed_runtime/server_lib.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SERVER_LIB_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SERVER_LIB_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SERVER_LIB_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SERVER_LIB_H_
 
 #include <memory>
 
@@ -95,4 +95,4 @@ Status NewServer(const ServerDef& server_def,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SERVER_LIB_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SERVER_LIB_H_
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index fabcbd00f5e59a68a8db54c441dcc74377c44617..51b9547f53ba687c863b0fd11647e7bb82d80e03 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -33,17 +33,18 @@ SessionMgr::SessionMgr(
     WorkerCacheFactory worker_cache_factory)
     : worker_env_(worker_env),
       default_worker_cache_(std::move(default_worker_cache)),
-      legacy_session_("", default_worker_name,
-                      std::unique_ptr<WorkerCacheInterface>(
-                          new WorkerCacheWrapper(default_worker_cache_.get())),
-                      std::unique_ptr<DeviceMgr>(worker_env->device_mgr),
-                      std::unique_ptr<GraphMgr>(
-                          new GraphMgr(worker_env, worker_env->device_mgr))),
+      legacy_session_(new WorkerSession(
+          "", default_worker_name,
+          std::unique_ptr<WorkerCacheInterface>(
+              new WorkerCacheWrapper(default_worker_cache_.get())),
+          std::unique_ptr<DeviceMgr>(worker_env->device_mgr),
+          std::unique_ptr<GraphMgr>(
+              new GraphMgr(worker_env, worker_env->device_mgr)))),
       worker_cache_factory_(std::move(worker_cache_factory)) {}
 
 string SessionMgr::WorkerNameFromServerDef(const ServerDef& server_def) {
-  return strings::StrCat("/job:", server_def.job_name(),
-                         "/replica:0/task:", server_def.task_index());
+  return strings::StrCat("/job:", server_def.job_name(), "/replica:0/task:",
+                         server_def.task_index());
 }
 
 Status SessionMgr::CreateSession(const string& session,
@@ -63,8 +64,13 @@ Status SessionMgr::CreateSession(const string& session,
     TF_RETURN_IF_ERROR(worker_cache_factory_(server_def, &worker_cache));
   }
 
+  if (worker_cache != nullptr & default_worker_cache_.get() != nullptr) {
+    worker_cache->SetLogging(this->is_logging_active_);
+  }
+
   CHECK(!worker_env_->local_devices.empty())
       << "The WorkerEnv must have at least one device in `local_devices`.";
+
   std::vector<Device*> renamed_devices;
   for (Device* d : worker_env_->local_devices) {
     renamed_devices.push_back(RenamedDevice::NewRenamedDevice(
@@ -75,7 +81,7 @@ Status SessionMgr::CreateSession(const string& session,
   std::unique_ptr<GraphMgr> graph_mgr(
       new GraphMgr(worker_env_, device_mgr.get()));
 
-  std::unique_ptr<WorkerSession> worker_session(new WorkerSession(
+  std::shared_ptr<WorkerSession> worker_session(new WorkerSession(
       session, worker_name, std::unique_ptr<WorkerCacheInterface>(worker_cache),
       std::move(device_mgr), std::move(graph_mgr)));
 
@@ -92,21 +98,97 @@ Status SessionMgr::DeleteSession(const string& session) {
   return Status::OK();
 }
 
-WorkerSession* SessionMgr::WorkerSessionForSessionUnlocked(
+std::shared_ptr<WorkerSession> SessionMgr::WorkerSessionForSessionUnlocked(
     const string& session) {
   auto it = sessions_.find(session);
   if (it == sessions_.end()) {
-    return &legacy_session_;
+    return legacy_session_;
   } else {
-    return it->second.get();
+    return it->second;
   }
 }
 
-WorkerSession* SessionMgr::WorkerSessionForSession(const string& session) {
+std::shared_ptr<WorkerSession> SessionMgr::WorkerSessionForSession(
+    const string& session) {
   mutex_lock l(mu_);
   return WorkerSessionForSessionUnlocked(session);
 }
 
-WorkerSession* SessionMgr::LegacySession() { return &legacy_session_; }
+std::shared_ptr<WorkerSession> SessionMgr::LegacySession() {
+  return legacy_session_;
+}
+
+void SessionMgr::SetLogging(bool active) {
+  mutex_lock l(mu_);
+  this->is_logging_active_ = active;
+  // Legacy Session
+  if (legacy_session_) {
+    auto* worker_cache = legacy_session_->worker_cache.get();
+    if (worker_cache) {
+      worker_cache->SetLogging(active);
+    }
+  }
+
+  for (const auto& session_kv : sessions_) {
+    auto session = session_kv.second.get();
+    if (session) {
+      auto* worker_cache = session->worker_cache.get();
+      if (worker_cache) {
+        worker_cache->SetLogging(active);
+      }
+    }
+  }
+}
 
+void SessionMgr::RetrieveLogs(tensorflow::int64 step_id,
+                              LoggingResponse* response) {
+  mutex_lock l(mu_);
+  // Legacy Session
+  if (legacy_session_) {
+    auto* worker_cache = legacy_session_->worker_cache.get();
+    if (worker_cache) {
+      auto step_stats = StepStats();
+      if (worker_cache->RetrieveLogs(step_id, &step_stats)) {
+        auto* labeled_step_stats = response->add_step();
+        labeled_step_stats->set_step_id(step_id);
+        labeled_step_stats->mutable_step_stats()->Swap(&step_stats);
+      }
+    }
+  }
+  for (const auto& session_kv : sessions_) {
+    auto session = session_kv.second.get();
+    if (session) {
+      auto* worker_cache = session->worker_cache.get();
+      if (worker_cache) {
+        auto step_stats = StepStats();
+        if (worker_cache->RetrieveLogs(step_id, &step_stats)) {
+          auto* labeled_step_stats = response->add_step();
+          labeled_step_stats->set_step_id(step_id);
+          labeled_step_stats->mutable_step_stats()->Swap(&step_stats);
+        }
+      }
+    }
+  }
+}
+
+void SessionMgr::ClearLogs() {
+  mutex_lock l(mu_);
+  // Legacy Session
+  if (legacy_session_) {
+    auto* worker_cache = legacy_session_->worker_cache.get();
+    if (worker_cache) {
+      worker_cache->ClearLogs();
+    }
+  }
+
+  for (const auto& session_kv : sessions_) {
+    auto session = session_kv.second.get();
+    if (session) {
+      auto* worker_cache = session->worker_cache.get();
+      if (worker_cache) {
+        worker_cache->ClearLogs();
+      }
+    }
+  }
+}
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/session_mgr.h b/tensorflow/core/distributed_runtime/session_mgr.h
index d85b6c305941014fb52c4b4da6d646a707054c3a..4c9702d522cede454d5efd15669eaec2b0c1c1b1 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.h
+++ b/tensorflow/core/distributed_runtime/session_mgr.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SESSION_MGR_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SESSION_MGR_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SESSION_MGR_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SESSION_MGR_H_
 
 #include <functional>
 
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/protobuf/tensorflow_server.pb.h"
+#include "tensorflow/core/protobuf/worker.pb.h"
 
 namespace tensorflow {
 
@@ -49,13 +50,19 @@ class SessionMgr {
                        bool isolate_session_state);
 
   // Locates the worker session for a given session handle
-  WorkerSession* WorkerSessionForSession(const string& session);
-  WorkerSession* LegacySession();
+  std::shared_ptr<WorkerSession> WorkerSessionForSession(const string& session);
+  std::shared_ptr<WorkerSession> LegacySession();
 
   Status DeleteSession(const string& session);
 
   static string WorkerNameFromServerDef(const ServerDef& server_def);
 
+  void SetLogging(bool active);
+
+  void RetrieveLogs(tensorflow::int64 step_id, LoggingResponse* response);
+
+  void ClearLogs();
+
  private:
   const WorkerEnv* const worker_env_;  // Not owned.
 
@@ -73,18 +80,20 @@ class SessionMgr {
   // device_mgr is deleted after WorkerSession's graph_mgr.
 
   std::unique_ptr<WorkerCacheInterface> default_worker_cache_;
-  WorkerSession legacy_session_;
+  std::shared_ptr<WorkerSession> legacy_session_;
+
+  bool is_logging_active_ = false;
 
   const WorkerCacheFactory worker_cache_factory_;
 
-  WorkerSession* WorkerSessionForSessionUnlocked(const string& session)
-      EXCLUSIVE_LOCKS_REQUIRED(mu_);
+  std::shared_ptr<WorkerSession> WorkerSessionForSessionUnlocked(
+      const string& session) EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   mutex mu_;
   // A map from session identifier to internal session structure.
-  std::map<string, std::unique_ptr<WorkerSession>> sessions_ GUARDED_BY(mu_);
+  std::map<string, std::shared_ptr<WorkerSession>> sessions_ GUARDED_BY(mu_);
 };
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SESSION_MGR_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_SESSION_MGR_H_
diff --git a/tensorflow/core/distributed_runtime/session_mgr_test.cc b/tensorflow/core/distributed_runtime/session_mgr_test.cc
index ffe4809f2b10398ca4c7dc503dd82236cbc8dd18..4d028f7f4a9e5eea7cd52b67ac41b03de3f0078f 100644
--- a/tensorflow/core/distributed_runtime/session_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr_test.cc
@@ -59,7 +59,7 @@ class SessionMgrTest : public ::testing::Test {
         return Status::OK();
       };
   SessionMgr mgr_;
-  WorkerSession* legacy_session_;
+  std::shared_ptr<WorkerSession> legacy_session_;
 };
 
 TEST_F(SessionMgrTest, CreateSessionSimple) {
@@ -69,7 +69,7 @@ TEST_F(SessionMgrTest, CreateSessionSimple) {
 
   string session_handle = "test_session_handle";
   TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def, true));
-  WorkerSession* session = mgr_.WorkerSessionForSession(session_handle);
+  auto session = mgr_.WorkerSessionForSession(session_handle);
   EXPECT_NE(nullptr, session) << "Session for " << session_handle << "was null";
   EXPECT_NE(mgr_.LegacySession(), session);
   TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
@@ -81,22 +81,22 @@ TEST_F(SessionMgrTest, CreateSessionIsolateSessionState) {
   server_def.set_task_index(3);
 
   TF_EXPECT_OK(mgr_.CreateSession("handle_1", server_def, false));
-  WorkerSession* session_1 = mgr_.WorkerSessionForSession("handle_1");
+  auto session_1 = mgr_.WorkerSessionForSession("handle_1");
   std::vector<Device*> devices_1 = session_1->device_mgr->ListDevices();
   EXPECT_EQ(1, devices_1.size());
 
   TF_EXPECT_OK(mgr_.CreateSession("handle_2", server_def, false));
-  WorkerSession* session_2 = mgr_.WorkerSessionForSession("handle_2");
+  auto session_2 = mgr_.WorkerSessionForSession("handle_2");
   std::vector<Device*> devices_2 = session_2->device_mgr->ListDevices();
   EXPECT_EQ(1, devices_2.size());
 
   TF_EXPECT_OK(mgr_.CreateSession("handle_3", server_def, true));
-  WorkerSession* session_3 = mgr_.WorkerSessionForSession("handle_3");
+  auto session_3 = mgr_.WorkerSessionForSession("handle_3");
   std::vector<Device*> devices_3 = session_3->device_mgr->ListDevices();
   EXPECT_EQ(1, devices_3.size());
 
   TF_EXPECT_OK(mgr_.CreateSession("handle_4", server_def, true));
-  WorkerSession* session_4 = mgr_.WorkerSessionForSession("handle_4");
+  auto session_4 = mgr_.WorkerSessionForSession("handle_4");
   std::vector<Device*> devices_4 = session_4->device_mgr->ListDevices();
   EXPECT_EQ(1, devices_4.size());
 
@@ -109,7 +109,7 @@ TEST_F(SessionMgrTest, CreateSessionIsolateSessionState) {
 TEST_F(SessionMgrTest, LegacySession) {
   ServerDef server_def;
   string session_handle = "";
-  WorkerSession* session = mgr_.WorkerSessionForSession(session_handle);
+  auto session = mgr_.WorkerSessionForSession(session_handle);
   EXPECT_EQ(mgr_.LegacySession(), session);
 
   TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
diff --git a/tensorflow/core/distributed_runtime/tensor_coding.cc b/tensorflow/core/distributed_runtime/tensor_coding.cc
index fe2d1a12934dde814344b70f52fbc972f74347e0..34a4013547b5feef12b49198bff4e733f1b9e932 100644
--- a/tensorflow/core/distributed_runtime/tensor_coding.cc
+++ b/tensorflow/core/distributed_runtime/tensor_coding.cc
@@ -81,7 +81,7 @@ void TensorResponse::InitPartial(const RecvTensorResponse& response) {
 Status TensorResponse::ParseFrom(Source* source) {
   if (!on_host_) {
     protobuf::io::CodedInputStream input(source->contents());
-    input.SetTotalBytesLimit(INT_MAX, INT_MAX);  // Unlimited
+    input.SetTotalBytesLimit(INT_MAX);  // Unlimited
 
     // Pre-parse into local storage, then delegate to device.
     if (!meta_.ParseFromCodedStream(&input) || !input.ConsumedEntireMessage()) {
@@ -217,7 +217,7 @@ bool TensorResponse::ParseTensorSubmessage(
 
 bool TensorResponse::ParseFast(Source* source) {
   protobuf::io::CodedInputStream input(source->contents());
-  input.SetTotalBytesLimit(INT_MAX, INT_MAX);  // Unlimited
+  input.SetTotalBytesLimit(INT_MAX);  // Unlimited
   while (true) {
     auto p = input.ReadTagWithCutoff(127);
     int tag = GetTagFieldNumber(p.first);
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index 6cd92f5fe7a9edaef1ed7db0926281d1a91cdcf2..63455493671fcd1f4282bc804f8f2a521c056dce 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -59,7 +59,7 @@ void Worker::DeleteWorkerSessionAsync(const DeleteWorkerSessionRequest* request,
 void Worker::RegisterGraphAsync(const RegisterGraphRequest* request,
                                 RegisterGraphResponse* response,
                                 StatusCallback done) {
-  WorkerSession* session =
+  auto session =
       env_->session_mgr->WorkerSessionForSession(request->session_handle());
   Status s = session->graph_mgr->Register(
       request->session_handle(), request->graph_def(), request->graph_options(),
@@ -71,7 +71,7 @@ void Worker::RegisterGraphAsync(const RegisterGraphRequest* request,
 void Worker::DeregisterGraphAsync(const DeregisterGraphRequest* request,
                                   DeregisterGraphResponse* response,
                                   StatusCallback done) {
-  WorkerSession* session =
+  auto session =
       env_->session_mgr->WorkerSessionForSession(request->session_handle());
   Status s = session->graph_mgr->Deregister(request->graph_handle());
 
@@ -109,6 +109,12 @@ Status Worker::PrepareRunGraph(RunGraphRequestWrapper* req,
 void Worker::RunGraphAsync(CallOptions* opts, RunGraphRequestWrapper* request,
                            MutableRunGraphResponseWrapper* response,
                            StatusCallback done) {
+  if (request->store_errors_in_response_body()) {
+    done = [response, done](const Status& status) {
+      response->set_status(status);
+      done(Status::OK());
+    };
+  }
   if (request->is_partial()) {
     DoPartialRunGraph(opts, request, response, std::move(done));
   } else {
@@ -129,7 +135,7 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
                         StatusCallback done) {
   const int64 step_id = request->step_id();
   TRACEPRINTF("RunGraph: %lld", step_id);
-  WorkerSession* session =
+  auto session =
       env_->session_mgr->WorkerSessionForSession(request->session_handle());
   GraphMgr::NamedTensors in;
   GraphMgr::NamedTensors* out = new GraphMgr::NamedTensors;
@@ -167,7 +173,7 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
     }
   }
   session->graph_mgr->ExecuteAsync(
-      request->graph_handle(), step_id, session, request->exec_opts(),
+      request->graph_handle(), step_id, session.get(), request->exec_opts(),
       collector, response, cm, in,
       [this, step_id, response, session, cm, out, token, collector, opts,
        done](Status s) {
@@ -203,7 +209,7 @@ void Worker::DoPartialRunGraph(CallOptions* opts,
   const int64 step_id = request->step_id();
   const string& graph_handle = request->graph_handle();
   TRACEPRINTF("PartialRunGraph: %lld", step_id);
-  WorkerSession* session =
+  auto session =
       env_->session_mgr->WorkerSessionForSession(request->session_handle());
 
   GraphMgr::NamedTensors in;
@@ -239,9 +245,9 @@ void Worker::DoPartialRunGraph(CallOptions* opts,
                                               [cm]() { cm->StartCancel(); });
     }
     session->graph_mgr->ExecuteAsync(
-        graph_handle, step_id, session, request->exec_opts(),
+        graph_handle, step_id, session.get(), request->exec_opts(),
         nullptr /* collector */, nullptr /* response */, cm, in,
-        [this, token, step_id, cm](Status s) {
+        [this, token, step_id, session, cm](Status s) {
           {
             mutex_lock l(mu_);
             cancellation_manager_->DeregisterCallback(token);
diff --git a/tensorflow/core/distributed_runtime/worker.h b/tensorflow/core/distributed_runtime/worker.h
index c62347926fa11c135b6116d17f6545007e9f6115..62fa5f3cf54202c91b27ae03d9d34fc09b8392ec 100644
--- a/tensorflow/core/distributed_runtime/worker.h
+++ b/tensorflow/core/distributed_runtime/worker.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_H_
 
 #include <unordered_map>
 
@@ -120,4 +120,4 @@ class Worker : public WorkerInterface {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_H_
diff --git a/tensorflow/core/distributed_runtime/worker_cache_logger.cc b/tensorflow/core/distributed_runtime/worker_cache_logger.cc
index 702af78c88014d54fe2f72a8266e5e7e43b3cfb9..95ca3c3b4d11fac0d103eb52f19d5b0b2f4ad3ea 100644
--- a/tensorflow/core/distributed_runtime/worker_cache_logger.cc
+++ b/tensorflow/core/distributed_runtime/worker_cache_logger.cc
@@ -97,9 +97,8 @@ void WorkerCacheLogger::RecordDataTransfer(int64 step_id, int64 start_usecs,
                                            const string& tensor_name,
                                            const string& src_device,
                                            const string& dst_device,
-                                           int64 bytes,
-                                           const string& details,
-                                           const string& transfer_method_name){
+                                           int64 bytes, const string& details,
+                                           const string& transfer_method_name) {
   NodeExecStats* ns = new NodeExecStats;
   ns->set_node_name(transfer_method_name);
   if (details.empty()) {
diff --git a/tensorflow/core/distributed_runtime/worker_session.h b/tensorflow/core/distributed_runtime/worker_session.h
index 9da3bb253f838efdf6d4dd97575f7ae48ba95ab1..0fd19ac27f20edbf8a2ed85d1c3d97eaabab3347 100644
--- a/tensorflow/core/distributed_runtime/worker_session.h
+++ b/tensorflow/core/distributed_runtime/worker_session.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_SESSION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_SESSION_H_
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_SESSION_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_SESSION_H_
 
 #include <string>
 
@@ -61,4 +61,4 @@ struct WorkerSession {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_SESSION_H_
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_SESSION_H_
diff --git a/tensorflow/core/example/example_parser_configuration.h b/tensorflow/core/example/example_parser_configuration.h
index 69955ec4cb3deb92587e4ed95382e5eaf9f74eab..3d06bd55e2bdd845c598078438dac79edf7e475e 100644
--- a/tensorflow/core/example/example_parser_configuration.h
+++ b/tensorflow/core/example/example_parser_configuration.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_EXAMPLE_EXAMPLE_PARSER_CONFIGURATION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_EXAMPLE_EXAMPLE_PARSER_CONFIGURATION_H_
+#ifndef TENSORFLOW_CORE_EXAMPLE_EXAMPLE_PARSER_CONFIGURATION_H_
+#define TENSORFLOW_CORE_EXAMPLE_EXAMPLE_PARSER_CONFIGURATION_H_
 
 #include <string>
 #include <vector>
@@ -53,4 +53,4 @@ Status ExampleParserConfigurationProtoToFeatureVectors(
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_EXAMPLE_EXAMPLE_PARSE_CONFIGURATION_H_
+#endif  // TENSORFLOW_CORE_EXAMPLE_EXAMPLE_PARSE_CONFIGURATION_H_
diff --git a/tensorflow/core/example/feature_util.h b/tensorflow/core/example/feature_util.h
index a87c2c9a57c7c80692359dc88be3aca2ce7779b6..d977935b8a392adf1f78c38955f77f6f364502c9 100644
--- a/tensorflow/core/example/feature_util.h
+++ b/tensorflow/core/example/feature_util.h
@@ -33,7 +33,7 @@ limitations under the License.
 //   GetFeatureValues<int64>("tag", &example)->Add(id);
 //
 // Modification of bytes features is slightly different:
-//   auto tag = GetFeatureValues<string>("tag", example);
+//   auto tag = GetFeatureValues<string>("tag", &example);
 //   *tag->Add() = "lorem ipsum";
 //
 // To copy multiple values into a feature:
@@ -56,9 +56,9 @@ limitations under the License.
 //
 // To add values to feature_lists:
 //   AppendFeatureValues({4.0},
-//                       GetFeatureList("movie_ratings", &se)->Add());
+//                       GetFeatureList("images", &se)->Add());
 //   AppendFeatureValues({5.0, 3.0},
-//                       GetFeatureList("movie_ratings", &se)->Add());
+//                       GetFeatureList("images", &se)->Add());
 // This will create a feature list keyed as "images" with two features:
 //   feature_lists {
 //     feature_list {
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index f5dadf76daf8d351e509c4ae538b31abf00d9566..94bf34afa49f586e1bb61c1654865a5abc9abe19 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -106,7 +106,14 @@ class CPUAllocator : public Allocator {
     *stats = stats_;
   }
 
-  size_t AllocatedSizeSlow(void* ptr) override {
+  void ClearStats() override {
+    mutex_lock l(mu_);
+    stats_.num_allocs = 0;
+    stats_.max_bytes_in_use = stats_.bytes_in_use;
+    stats_.max_alloc_size = 0;
+  }
+
+  size_t AllocatedSizeSlow(const void* ptr) override {
     return port::MallocExtension_GetAllocatedSize(ptr);
   }
 
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 5e048a028d2dd9bf60722c3bab6a81330a16d2d8..3ce1b612464291eceb6e08d9b0f2deca70cda27a 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -156,7 +156,7 @@ class Allocator {
   //
   // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
   // allocated by this allocator.
-  virtual size_t RequestedSize(void* ptr) {
+  virtual size_t RequestedSize(const void* ptr) {
     CHECK(false) << "allocator doesn't track sizes";
     return size_t(0);
   }
@@ -169,7 +169,7 @@ class Allocator {
   //
   // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
   // allocated by this allocator.
-  virtual size_t AllocatedSize(void* ptr) { return RequestedSize(ptr); }
+  virtual size_t AllocatedSize(const void* ptr) { return RequestedSize(ptr); }
 
   // Returns either 0 or an identifier assigned to the buffer at 'ptr'
   // when the buffer was returned by AllocateRaw. If non-zero, the
@@ -180,7 +180,7 @@ class Allocator {
   //
   // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
   // allocated by this allocator.
-  virtual int64 AllocationId(void* ptr) { return 0; }
+  virtual int64 AllocationId(const void* ptr) { return 0; }
 
   // Returns the allocated size of the buffer at 'ptr' if known,
   // otherwise returns 0. This method can be called when
@@ -188,7 +188,7 @@ class Allocator {
   //
   // REQUIRES: 'ptr!=nullptr' and points to a buffer previously
   // allocated by this allocator.
-  virtual size_t AllocatedSizeSlow(void* ptr) {
+  virtual size_t AllocatedSizeSlow(const void* ptr) {
     if (TracksAllocationSizes()) {
       return AllocatedSize(ptr);
     }
@@ -198,6 +198,9 @@ class Allocator {
   // Fills in 'stats' with statistics collected by this allocator.
   virtual void GetStats(AllocatorStats* stats) { stats->Clear(); }
 
+  // Clears the internal stats except for the `in_use` field.
+  virtual void ClearStats() {}
+
  private:
   // No constructors or destructors are run for simple types
   template <typename T>
@@ -309,17 +312,19 @@ class AllocatorWrapper : public Allocator {
     return wrapped_->TracksAllocationSizes();
   }
 
-  size_t RequestedSize(void* ptr) override {
+  size_t RequestedSize(const void* ptr) override {
     return wrapped_->RequestedSize(ptr);
   }
 
-  size_t AllocatedSize(void* ptr) override {
+  size_t AllocatedSize(const void* ptr) override {
     return wrapped_->AllocatedSize(ptr);
   }
 
-  int64 AllocationId(void* ptr) override { return wrapped_->AllocationId(ptr); }
+  int64 AllocationId(const void* ptr) override {
+    return wrapped_->AllocationId(ptr);
+  }
 
-  size_t AllocatedSizeSlow(void* ptr) override {
+  size_t AllocatedSizeSlow(const void* ptr) override {
     return wrapped_->AllocatedSizeSlow(ptr);
   }
 
diff --git a/tensorflow/core/framework/allocator_test.cc b/tensorflow/core/framework/allocator_test.cc
index 032aeec161bb6978cb942747d3e0f8cff12f8853..a409cb2de7fbae20f435f464ca07155a36fede4a 100644
--- a/tensorflow/core/framework/allocator_test.cc
+++ b/tensorflow/core/framework/allocator_test.cc
@@ -110,6 +110,8 @@ TEST(CPUAllocatorTest, Simple) {
 
   CheckStats(a, 1025, 0, 1048576 * sizeof(double) + 1024 * sizeof(float),
              1048576 * sizeof(double));
+  a->ClearStats();
+  CheckStats(a, 0, 0, 0, 0);
   EnableCPUAllocatorStats(false);
 }
 
diff --git a/tensorflow/core/framework/attr_value_util.cc b/tensorflow/core/framework/attr_value_util.cc
index 5aba091840ed0cd32bf85980c7d12dc74e7f3fd9..a1c39d2a7a78354239f2cdbb718160906b233ddd 100644
--- a/tensorflow/core/framework/attr_value_util.cc
+++ b/tensorflow/core/framework/attr_value_util.cc
@@ -33,7 +33,19 @@ namespace tensorflow {
 namespace {
 
 string SummarizeString(const string& str) {
-  return strings::StrCat("\"", str_util::CEscape(str), "\"");
+  string escaped = str_util::CEscape(str);
+
+  // If the string is long, replace the middle with ellipses.
+  constexpr int kMaxStringSummarySize = 80;
+  if (escaped.size() >= kMaxStringSummarySize) {
+    StringPiece prefix(escaped);
+    StringPiece suffix = prefix;
+    prefix.remove_suffix(escaped.size() - 10);
+    suffix.remove_prefix(escaped.size() - 10);
+    return strings::StrCat("\"", prefix, "...", suffix, "\"");
+  } else {
+    return strings::StrCat("\"", escaped, "\"");
+  }
 }
 
 string SummarizeTensor(const TensorProto& tensor_proto) {
@@ -74,54 +86,47 @@ string SummarizeAttrValue(const AttrValue& attr_value) {
     case AttrValue::kTensor:
       return SummarizeTensor(attr_value.tensor());
     case AttrValue::kList: {
-      string ret = "[";
+      std::vector<string> pieces;
       if (attr_value.list().s_size() > 0) {
         for (int i = 0; i < attr_value.list().s_size(); ++i) {
-          if (i > 0) strings::StrAppend(&ret, ", ");
-          strings::StrAppend(&ret, SummarizeString(attr_value.list().s(i)));
+          pieces.push_back(SummarizeString(attr_value.list().s(i)));
         }
       } else if (attr_value.list().i_size() > 0) {
         for (int i = 0; i < attr_value.list().i_size(); ++i) {
-          if (i > 0) strings::StrAppend(&ret, ", ");
-          strings::StrAppend(&ret, attr_value.list().i(i));
+          pieces.push_back(strings::StrCat(attr_value.list().i(i)));
         }
       } else if (attr_value.list().f_size() > 0) {
         for (int i = 0; i < attr_value.list().f_size(); ++i) {
-          if (i > 0) strings::StrAppend(&ret, ", ");
-          strings::StrAppend(&ret, attr_value.list().f(i));
+          pieces.push_back(strings::StrCat(attr_value.list().f(i)));
         }
       } else if (attr_value.list().b_size() > 0) {
         for (int i = 0; i < attr_value.list().b_size(); ++i) {
-          if (i > 0) strings::StrAppend(&ret, ", ");
-          strings::StrAppend(&ret, attr_value.list().b(i) ? "true" : "false");
+          pieces.push_back(attr_value.list().b(i) ? "true" : "false");
         }
       } else if (attr_value.list().type_size() > 0) {
         for (int i = 0; i < attr_value.list().type_size(); ++i) {
-          if (i > 0) strings::StrAppend(&ret, ", ");
-          strings::StrAppend(&ret,
-                             EnumName_DataType(attr_value.list().type(i)));
+          pieces.push_back(EnumName_DataType(attr_value.list().type(i)));
         }
       } else if (attr_value.list().shape_size() > 0) {
         for (int i = 0; i < attr_value.list().shape_size(); ++i) {
-          if (i > 0) strings::StrAppend(&ret, ", ");
-          strings::StrAppend(
-              &ret, TensorShape::DebugString(attr_value.list().shape(i)));
+          pieces.push_back(
+              TensorShape::DebugString(attr_value.list().shape(i)));
         }
       } else if (attr_value.list().tensor_size() > 0) {
         for (int i = 0; i < attr_value.list().tensor_size(); ++i) {
-          if (i > 0) strings::StrAppend(&ret, ", ");
-          strings::StrAppend(&ret,
-                             SummarizeTensor(attr_value.list().tensor(i)));
+          pieces.push_back(SummarizeTensor(attr_value.list().tensor(i)));
         }
       } else if (attr_value.list().func_size() > 0) {
         for (int i = 0; i < attr_value.list().func_size(); ++i) {
-          if (i > 0) strings::StrAppend(&ret, ", ");
-          strings::StrAppend(&ret, SummarizeFunc(attr_value.list().func(i)));
+          pieces.push_back(SummarizeFunc(attr_value.list().func(i)));
         }
       }
-
-      strings::StrAppend(&ret, "]");
-      return ret;
+      constexpr int kMaxListSummarySize = 15;
+      if (pieces.size() >= kMaxListSummarySize) {
+        pieces.erase(pieces.begin() + 5, pieces.begin() + (pieces.size() - 6));
+        pieces[5] = "...";
+      }
+      return strings::StrCat("[", str_util::Join(pieces, ", "), "]");
     }
     case AttrValue::kFunc: {
       return SummarizeFunc(attr_value.func());
diff --git a/tensorflow/core/framework/attr_value_util_test.cc b/tensorflow/core/framework/attr_value_util_test.cc
index 1c9a209f05bcab1a0b4304aaddb2d0421e4df45f..e4fad917ffe1d4a0790bf1fd56e3c72f841523d8 100644
--- a/tensorflow/core/framework/attr_value_util_test.cc
+++ b/tensorflow/core/framework/attr_value_util_test.cc
@@ -135,6 +135,38 @@ TEST(AttrValueUtil, DeepAttr) {
             "f[F=f[F=f[F=[f[T=x[]], g[T=x[]]], T=x[]], T=x[]], T=x[]]");
 }
 
+TEST(AttrValueUtil, SummarizeAttrValueDoesNotElideShortStrings) {
+  AttrValue attr_value;
+  SetAttrValue(string(40, '-'), &attr_value);
+  EXPECT_EQ(strings::StrCat("\"", string(40, '-'), "\""),
+            SummarizeAttrValue(attr_value));
+}
+
+TEST(AttrValueUtil, SummarizeAttrValueElidesLongStrings) {
+  AttrValue attr_value;
+  SetAttrValue(string(80, '-'), &attr_value);
+  EXPECT_EQ("\"----------...----------\"", SummarizeAttrValue(attr_value));
+}
+
+TEST(AttrValueUtil, SummarizeAttrValueDoesNotElideShortLists) {
+  std::vector<int> alist(10);
+  std::iota(alist.begin(), alist.end(), 0);
+
+  AttrValue attr_value;
+  SetAttrValue(alist, &attr_value);
+  EXPECT_EQ("[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]", SummarizeAttrValue(attr_value));
+}
+
+TEST(AttrValueUtil, SummarizeAttrValueElidesLongLists) {
+  std::vector<int> alist(30);
+  std::iota(alist.begin(), alist.end(), 0);
+
+  AttrValue attr_value;
+  SetAttrValue(alist, &attr_value);
+  EXPECT_EQ("[0, 1, 2, 3, 4, ..., 25, 26, 27, 28, 29]",
+            SummarizeAttrValue(attr_value));
+}
+
 AttrValue FromText(const string& text) {
   AttrValue attr;
   EXPECT_TRUE(protobuf::TextFormat::MergeFromString(text, &attr));
diff --git a/tensorflow/core/framework/bfloat16.cc b/tensorflow/core/framework/bfloat16.cc
index 0efe43fde2dadd42aa03d3bf2968d2cbfb113e8d..6025be517048d33b20f7af15ef7ad1339adebdf9 100644
--- a/tensorflow/core/framework/bfloat16.cc
+++ b/tensorflow/core/framework/bfloat16.cc
@@ -21,13 +21,13 @@ void FloatToBFloat16(const float* src, bfloat16* dst, int64 size) {
   const uint16_t* p = reinterpret_cast<const uint16_t*>(src);
   uint16_t* q = reinterpret_cast<uint16_t*>(dst);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    for (; size != 0; p += 2, q++, size--) {
-      *q = p[0];
-    }
+  for (; size != 0; p += 2, q++, size--) {
+    *q = p[0];
+  }
 #else
-    for (; size != 0; p += 2, q++, size--) {
-     *q = p[1];
-    }
+  for (; size != 0; p += 2, q++, size--) {
+    *q = p[1];
+  }
 #endif
 }
 
@@ -35,15 +35,15 @@ void BFloat16ToFloat(const bfloat16* src, float* dst, int64 size) {
   const uint16_t* p = reinterpret_cast<const uint16_t*>(src);
   uint16_t* q = reinterpret_cast<uint16_t*>(dst);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    for (; size != 0; p++, q += 2, size--) {
-      q[0] = *p;
-      q[1] = 0;
-    }
+  for (; size != 0; p++, q += 2, size--) {
+    q[0] = *p;
+    q[1] = 0;
+  }
 #else
-    for (; size != 0; p++, q += 2, size--) {
-      q[0] = 0;
-      q[1] = *p;
-    }
+  for (; size != 0; p++, q += 2, size--) {
+    q[0] = 0;
+    q[1] = *p;
+  }
 #endif
 }
 
diff --git a/tensorflow/core/framework/bfloat16_test.cc b/tensorflow/core/framework/bfloat16_test.cc
index 6e4533875160120229877664cff7429cfaf71d43..17e6209f8e5ad5240dfc8ca1def75c178da45c27 100644
--- a/tensorflow/core/framework/bfloat16_test.cc
+++ b/tensorflow/core/framework/bfloat16_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/bfloat16.h"
 
+#include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -104,6 +105,17 @@ TEST(Bfloat16Test, Conversion) {
   }
 }
 
+TEST(Bfloat16Test, Epsilon) {
+  EXPECT_LT(1.0f, static_cast<float>(bfloat16::epsilon() + bfloat16(1.0f)));
+  EXPECT_EQ(1.0f, static_cast<float>((bfloat16::epsilon() / bfloat16(2.0f)) +
+                                     bfloat16(1.0f)));
+}
+
+TEST(Bfloat16Test, Negate) {
+  EXPECT_EQ(-3.0f, static_cast<float>(-bfloat16(3.0f)));
+  EXPECT_EQ(4.5f, static_cast<float>(-bfloat16(-4.5f)));
+}
+
 static void BM_FloatToBFloat16(int iters) {
   testing::StopTiming();
   static const int N = 32 << 20;
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index ea66863bed3f3c9d91587a64370f635766d0794d..623248b6ce6adff8ed323acad7dae300742f8eba 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -49,7 +49,11 @@ Status GetWindowedOutputSizeVerboseV2(int64 input_size, int64 filter_size,
       break;
   }
   if (*output_size < 0) {
-    return errors::InvalidArgument("computed output size would be negative");
+    return errors::InvalidArgument(
+        "Computed output size would be negative: ", *output_size,
+        " [input_size: ", input_size,
+        ", effective_filter_size: ", effective_filter_size,
+        ", stride: ", stride, "]");
   }
   return Status::OK();
 }
@@ -397,6 +401,15 @@ Status Conv2DShape(shape_inference::InferenceContext* c) {
   TF_RETURN_IF_ERROR(
       CheckFormatConstraintsOnShape(data_format, filter_shape, "filter", c));
 
+  std::vector<int32> dilations;
+  TF_RETURN_IF_ERROR(c->GetAttr("dilations", &dilations));
+
+  if (dilations.size() != 4) {
+    return errors::InvalidArgument(
+        "Conv2D requires the dilation attribute to contain 4 values, but got: ",
+        dilations.size());
+  }
+
   std::vector<int32> strides;
   TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides));
 
@@ -410,6 +423,8 @@ Status Conv2DShape(shape_inference::InferenceContext* c) {
 
   const int32 stride_rows = GetTensorDim(strides, data_format, 'H');
   const int32 stride_cols = GetTensorDim(strides, data_format, 'W');
+  const int32 dilation_rows = GetTensorDim(dilations, data_format, 'H');
+  const int32 dilation_cols = GetTensorDim(dilations, data_format, 'W');
 
   DimensionHandle batch_size_dim;
   DimensionHandle input_depth_dim;
@@ -447,12 +462,12 @@ Status Conv2DShape(shape_inference::InferenceContext* c) {
   TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding));
 
   DimensionHandle output_rows, output_cols;
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDims(c, input_spatial_dims[0],
-                                                   filter_rows_dim, stride_rows,
-                                                   padding, &output_rows));
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDims(c, input_spatial_dims[1],
-                                                   filter_cols_dim, stride_cols,
-                                                   padding, &output_cols));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
+      c, input_spatial_dims[0], filter_rows_dim, dilation_rows, stride_rows,
+      padding, &output_rows));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
+      c, input_spatial_dims[1], filter_cols_dim, dilation_cols, stride_cols,
+      padding, &output_cols));
 
   ShapeHandle output_shape;
   TF_RETURN_IF_ERROR(
@@ -1114,16 +1129,20 @@ Status ConcatShapeHelper(InferenceContext* c, int start_value_index,
     for (int i = start_value_index; i < end_value_index; ++i) {
       if (rank == InferenceContext::kUnknownRank) rank = c->Rank(c->input(i));
       if (rank != InferenceContext::kUnknownRank) {
-        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), rank, &unused));
+        break;
       }
     }
     if (rank == InferenceContext::kUnknownRank) {
       c->set_output(0, c->UnknownShape());
       return Status::OK();
-    }
-    if (rank == 0) {
+    } else if (rank == 0) {
       return errors::InvalidArgument(
           "Can't concatenate scalars (use tf.stack instead)");
+    } else {
+      for (int i = start_value_index; i < end_value_index; ++i) {
+        // Check that all the inputs are of the correct rank.
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), rank, &unused));
+      }
     }
     // Build result of <rank> different unknown dims.
     std::vector<DimensionHandle> dims;
@@ -1307,6 +1326,9 @@ Status ValidateSparseTensor(InferenceContext* c, ShapeHandle indices_shape,
 
 Status ScatterNdUpdateShape(InferenceContext* c) {
   ShapeHandle input_shape = c->input(0);
+  if (c->input_handle_shapes_and_types(0) != nullptr) {
+    input_shape = (*c->input_handle_shapes_and_types(0))[0].shape;
+  }
   ShapeHandle indices_shape;
   TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &indices_shape));
   ShapeHandle updates_shape;
@@ -1338,10 +1360,11 @@ Status ScatterNdUpdateShape(InferenceContext* c) {
       Status s = c->Merge(prefix_indices, prefix_updates, &unused);
       if (!s.ok()) {
         return errors::InvalidArgument(
-            "The outer ", num_outer_dims, " dimensions of indices.shape=",
-            c->DebugString(indices_shape), " must match the outer ",
-            num_outer_dims, " dimensions of updates.shape=",
-            c->DebugString(updates_shape), ": ", s.error_message());
+            "The outer ", num_outer_dims,
+            " dimensions of indices.shape=", c->DebugString(indices_shape),
+            " must match the outer ", num_outer_dims,
+            " dimensions of updates.shape=", c->DebugString(updates_shape),
+            ": ", s.error_message());
       }
 
       ShapeHandle input_suffix;
@@ -1361,7 +1384,9 @@ Status ScatterNdUpdateShape(InferenceContext* c) {
     }
   }
 
-  c->set_output(0, input_shape);
+  if (c->input_handle_shapes_and_types(0) == nullptr) {
+    c->set_output(0, input_shape);
+  }
   return Status::OK();
 }
 
diff --git a/tensorflow/core/framework/common_shape_fns.h b/tensorflow/core/framework/common_shape_fns.h
index c0deb473a25cf19b99ae79903c1a2014b6e378f7..293c40e04d6ad9b57aabfda678216b1805a006f4 100644
--- a/tensorflow/core/framework/common_shape_fns.h
+++ b/tensorflow/core/framework/common_shape_fns.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_OPS_COMMON_SHAPE_FNS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_OPS_COMMON_SHAPE_FNS_H_
+#ifndef TENSORFLOW_CORE_OPS_COMMON_SHAPE_FNS_H_
+#define TENSORFLOW_CORE_OPS_COMMON_SHAPE_FNS_H_
 
 #include <array>
 
@@ -287,4 +287,4 @@ Status ExplicitShape(InferenceContext* c);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_OPS_COMMON_SHAPE_FNS_H_
+#endif  // TENSORFLOW_CORE_OPS_COMMON_SHAPE_FNS_H_
diff --git a/tensorflow/core/framework/common_shape_fns_test.cc b/tensorflow/core/framework/common_shape_fns_test.cc
index ec9746b2af1ed0da348fbe7459c5d93d842b25d9..5f3e5ad45731750bfd73181c41cd029f23aab55f 100644
--- a/tensorflow/core/framework/common_shape_fns_test.cc
+++ b/tensorflow/core/framework/common_shape_fns_test.cc
@@ -423,6 +423,15 @@ TEST(CommonShapeFnsTest, Conv2DShapeTest) {
                     .Finalize(&op.node_def));
   };
 
+  // Invalid rank for input
+  INFER_ERROR("must be rank 4", op, "[4,4];[2,1,1,1]");
+  // Invalid rank for filter
+  INFER_ERROR("must be rank 4", op, "[1,4,4,1];[2,1,1]");
+
+  // Invalid value for strides
+  set_op({{1, 1, 0, 1}}, "VALID", "NHWC", "HWIO");
+  INFER_ERROR("must be > 0", op, "[1,2,2,1];[1,1,1,1]");
+
   // 1x1 filter
   set_op({{1, 1, 1, 1}}, "VALID", "NHWC", "HWIO");
   INFER_OK(op, "[1,2,2,1];[1,1,1,1]", "[d0_0,2,2,d1_3]");
@@ -443,11 +452,6 @@ TEST(CommonShapeFnsTest, Conv2DShapeTest) {
   set_op({{1, 1, 2, 1}}, "VALID", "NHWC", "HWIO");
   INFER_OK(op, "[1,4,4,1];[2,1,1,1]", "[d0_0,3,2,d1_3]");
 
-  // Invalid rank for input
-  INFER_ERROR("must be rank 4", op, "[4,4];[2,1,1,1]");
-  // Invalid rank for filter
-  INFER_ERROR("must be rank 4", op, "[1,4,4,1];[2,1,1]");
-
   // Unknown dims in the critical fields lead to partial inference.
   INFER_OK(op, "[1,4,4,1];[2,1,1,1]", "[d0_0,3,2,d1_3]");
   INFER_OK(op, "[1,?,4,1];[2,1,1,1]", "[d0_0,?,2,d1_3]");
@@ -538,6 +542,98 @@ TEST(CommonShapeFnsTest, Conv2DShapeTest) {
   INFER_OK(op, "[1,4,4,?];[?,?,?,?]", "[d0_0,2,2,d1_3]");
 }
 
+TEST(CommonShapeFnsTest, Conv2DDilatedShapeTest) {
+  ShapeInferenceTestOp op("Conv2D");
+  auto set_op = [&op](const std::vector<int32>& dilations,
+                      const std::vector<int32>& strides, const string& padding,
+                      const string& data_format) {
+    TF_CHECK_OK(NodeDefBuilder("test", "Conv2D")
+                    .Input("input", 0, DT_FLOAT)
+                    .Input("filter", 0, DT_FLOAT)
+                    .Attr("dilations", dilations)
+                    .Attr("strides", strides)
+                    .Attr("padding", padding)
+                    .Attr("data_format", data_format)
+                    .Finalize(&op.node_def));
+  };
+
+  // Invalid rank for dilation
+  set_op({{1, 2, 1}}, {{1, 1, 1, 1}}, "VALID", "NHWC");
+  INFER_ERROR("contain 4 values", op, "[1,2,2,1];[1,1,1,1]");
+
+  // Invalid value for dilation
+  set_op({{1, 0, 1, 1}}, {{1, 1, 1, 1}}, "VALID", "NHWC");
+  INFER_ERROR("must be >= 1", op, "[1,2,2,1];[1,1,1,1]");
+
+  // Tests for NHWC
+  // 1x1 filter, 2x1 dilations, 1x1 strides
+  set_op({{1, 2, 1, 1}}, {{1, 1, 1, 1}}, "VALID", "NHWC");
+  INFER_OK(op, "[1,2,2,1];[1,1,1,1]", "[d0_0,2,2,d1_3]");
+
+  // 1x1 filter, 2x1 dilations, 2x1 strides
+  set_op({{1, 2, 1, 1}}, {{1, 2, 1, 1}}, "VALID", "NHWC");
+  INFER_OK(op, "[1,4,4,1];[1,1,1,1]", "[d0_0,2,4,d1_3]");
+
+  // 1x1 filter, 2x1 dilations, 2x2 strides
+  set_op({{1, 2, 1, 1}}, {{1, 2, 2, 1}}, "VALID", "NHWC");
+  INFER_OK(op, "[1,4,4,1];[1,1,1,1]", "[d0_0,2,2,d1_3]");
+
+  // 3x3 filter, 2x1 dilations, 1x1 strides
+  set_op({{1, 2, 1, 1}}, {{1, 1, 1, 1}}, "VALID", "NHWC");
+  INFER_OK(op, "[1,5,5,1];[3,3,1,1]", "[d0_0,1,3,d1_3]");
+
+  // 3x3 filter, 2x1 dilations, 2x1 strides
+  set_op({{1, 2, 1, 1}}, {{1, 2, 1, 1}}, "VALID", "NHWC");
+  INFER_OK(op, "[1,5,5,1];[3,3,1,1]", "[d0_0,1,3,d1_3]");
+
+  // 3x3 filter, 1x2 dilations, 2x2 strides
+  set_op({{1, 1, 2, 1}}, {{1, 2, 2, 1}}, "VALID", "NHWC");
+  INFER_OK(op, "[1,5,5,1];[3,3,1,1]", "[d0_0,2,1,d1_3]");
+
+  // Tests for NCHW
+  // 1x1 filter, 2x1 dilations, 1x1 strides
+  set_op({{1, 1, 2, 1}}, {{1, 1, 1, 1}}, "VALID", "NCHW");
+  INFER_OK(op, "[1,1,2,2];[1,1,1,1]", "[d0_0,d1_3,2,2]");
+
+  // 1x1 filter, 2x1 dilations, 2x1 strides
+  set_op({{1, 1, 2, 1}}, {{1, 1, 2, 1}}, "VALID", "NCHW");
+  INFER_OK(op, "[1,1,4,4];[1,1,1,1]", "[d0_0,d1_3,2,4]");
+
+  // 1x1 filter, 2x1 dilations, 2x2 strides
+  set_op({{1, 1, 2, 1}}, {{1, 1, 2, 2}}, "VALID", "NCHW");
+  INFER_OK(op, "[1,1,4,4];[1,1,1,1]", "[d0_0,d1_3,2,2]");
+
+  // 3x3 filter, 2x1 dilations, 1x1 strides
+  set_op({{1, 1, 2, 1}}, {{1, 1, 1, 1}}, "VALID", "NCHW");
+  INFER_OK(op, "[1,1,5,5];[3,3,1,1]", "[d0_0,d1_3,1,3]");
+
+  // 3x3 filter, 2x1 dilations, 2x1 strides
+  set_op({{1, 1, 2, 1}}, {{1, 1, 2, 1}}, "VALID", "NCHW");
+  INFER_OK(op, "[1,1,5,5];[3,3,1,1]", "[d0_0,d1_3,1,3]");
+
+  // 3x3 filter, 1x2 dilations, 2x2 strides
+  set_op({{1, 1, 1, 2}}, {{1, 1, 2, 2}}, "VALID", "NCHW");
+  INFER_OK(op, "[1,1,5,5];[3,3,1,1]", "[d0_0,d1_3,2,1]");
+
+  // Some tests for "SAME" padding
+
+  // 4x4 input, 1x1 filter, 2x1 dilations, 1x1 stride
+  set_op({{1, 2, 1, 1}}, {{1, 1, 1, 1}}, "SAME", "NHWC");
+  INFER_OK(op, "[1,4,4,1];[1,1,1,1]", "[d0_0,d0_1,d0_2,d1_3]");
+
+  // 3x3 input, 2x2 filter, 2x2 dilations, 1x1 stride
+  set_op({{1, 2, 2, 1}}, {{1, 1, 1, 1}}, "SAME", "NHWC");
+  INFER_OK(op, "[1,3,3,1];[2,2,1,1]", "[d0_0,d0_1,d0_2,d1_3]");
+
+  // 4x4 input, 2x2 filter, 1x2 dilations, 2x2 stride
+  set_op({{1, 1, 2, 1}}, {{1, 2, 2, 1}}, "SAME", "NHWC");
+  INFER_OK(op, "[1,4,4,1];[2,2,1,1]", "[d0_0,2,2,d1_3]");
+
+  // 4x4 input, 2x2 filter, 2x2 dilations, 1x1 stride
+  set_op({{1, 2, 2, 1}}, {{1, 1, 1, 1}}, "SAME", "NHWC");
+  INFER_OK(op, "[1,4,4,1];[2,2,1,1]", "[d0_0,d0_1,d0_2,d1_3]");
+}
+
 TEST(CommonShapeFnsTest, Conv3DShapeTest) {
   ShapeInferenceTestOp op("Conv3D");
   auto set_op = [&op](const std::vector<int32>& strides,
diff --git a/tensorflow/core/framework/cost_graph.proto b/tensorflow/core/framework/cost_graph.proto
index f4837fbfc55dc266bad01c9300e3a8b63c67f1e0..7885b0171a55a408878a127eb1259b65fb9466ea 100644
--- a/tensorflow/core/framework/cost_graph.proto
+++ b/tensorflow/core/framework/cost_graph.proto
@@ -45,10 +45,12 @@ message CostGraphDef {
     // Temporary memory used by this node.
     int64 temporary_memory_size = 6;
 
-    int64 host_temp_memory_size = 10;
-    int64 device_temp_memory_size = 11;
-    int64 host_persistent_memory_size = 12;
-    int64 device_persistent_memory_size = 16;
+    // Persistent memory used by this node.
+    int64 persistent_memory_size = 12;
+
+    int64 host_temp_memory_size = 10 [deprecated = true];
+    int64 device_temp_memory_size = 11 [deprecated = true];
+    int64 device_persistent_memory_size = 16 [deprecated = true];
 
     // Estimate of the computational cost of this node, in microseconds.
     int64 compute_cost = 9;
diff --git a/tensorflow/core/framework/dataset.cc b/tensorflow/core/framework/dataset.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4145ef7bc9d22632db3d0a71f8901a671dd95ee5
--- /dev/null
+++ b/tensorflow/core/framework/dataset.cc
@@ -0,0 +1,272 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
+
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/node_builder.h"
+
+namespace tensorflow {
+
+namespace {
+
+// A wrapper class for storing a `DatasetBase` instance in a DT_VARIANT tensor.
+// Objects of the wrapper class own a reference on an instance of `DatasetBase`,
+// and the wrapper's copy constructor and destructor take care of managing the
+// reference count.
+//
+// NOTE(mrry): This is not a feature-complete implementation of the DT_VARIANT
+// specification. In particular, we cannot currently serialize an arbitrary
+// `DatasetBase` object, so the `Encode()` and `Decode()` methods are not
+// implemented.
+class DatasetVariantWrapper {
+ public:
+  DatasetVariantWrapper() : dataset_(nullptr) {}
+
+  // Transfers ownership of `dataset` to `*this`.
+  explicit DatasetVariantWrapper(DatasetBase* dataset) : dataset_(dataset) {}
+
+  DatasetVariantWrapper(const DatasetVariantWrapper& other)
+      : dataset_(other.dataset_) {
+    if (dataset_) dataset_->Ref();
+  }
+
+  ~DatasetVariantWrapper() {
+    if (dataset_) dataset_->Unref();
+  }
+
+  DatasetBase* get() const { return dataset_; }
+
+  string TypeName() const { return "tensorflow::DatasetVariantWrapper"; }
+  string DebugString() const {
+    if (dataset_) {
+      return dataset_->DebugString();
+    } else {
+      return "<Uninitialized DatasetVariantWrapper>";
+    }
+  }
+  void Encode(VariantTensorData* data) const {
+    LOG(ERROR) << "The Encode() method is not implemented for "
+                  "DatasetVariantWrapper objects.";
+  }
+  bool Decode(const VariantTensorData& data) {
+    LOG(ERROR) << "The Decode() method is not implemented for "
+                  "DatasetVariantWrapper objects.";
+    return false;
+  }
+
+ private:
+  DatasetBase* const dataset_;  // Owns one reference.
+};
+
+}  // namespace
+
+Status GraphDefBuilderWrapper::AddDataset(
+    const GraphDatasetBase* dataset,
+    const std::vector<std::pair<size_t, Node*>>& inputs,
+    const std::vector<std::pair<size_t, gtl::ArraySlice<Node*>>>& list_inputs,
+    const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
+    Node** output) {
+  const string& op_type_name = dataset->op_name();
+  std::unique_ptr<const GraphDefBuilder::Options> opts(
+      new GraphDefBuilder::Options(b_->opts()));
+  // TODO(srbs|mrry): Not all datasets have output_types and output_shapes
+  // attributes defined. It will be nice to have a consistent pattern.
+  bool has_output_types_attr = HasAttr(op_type_name, "output_types");
+  bool has_output_shapes_attr = HasAttr(op_type_name, "output_shapes");
+  if (has_output_shapes_attr) {
+    opts.reset(new GraphDefBuilder::Options(
+        opts->WithAttr("output_shapes", dataset->output_shapes())));
+  }
+  if (has_output_types_attr) {
+    opts.reset(new GraphDefBuilder::Options(
+        opts->WithAttr("output_types", dataset->output_dtypes())));
+  }
+  for (auto attr : attrs) {
+    opts.reset(
+        new GraphDefBuilder::Options(opts->WithAttr(attr.first, attr.second)));
+  }
+  if (opts->HaveError()) {
+    return errors::Internal("AddDataset: Failed to build Options with error ",
+                            opts->StatusToString());
+  }
+  NodeBuilder node_builder(opts->GetNameForOp(op_type_name), op_type_name,
+                           opts->op_registry());
+  {
+    size_t total_size = inputs.size() + list_inputs.size();
+    auto inputs_iter = inputs.begin();
+    auto list_inputs_iter = list_inputs.begin();
+    for (int i = 0; i < total_size; i++) {
+      if (inputs_iter != inputs.end() && inputs_iter->first == i) {
+        node_builder.Input(NodeBuilder::NodeOut(inputs_iter->second));
+        inputs_iter++;
+      } else if (list_inputs_iter != list_inputs.end() &&
+                 list_inputs_iter->first == i) {
+        std::vector<NodeBuilder::NodeOut> nodeout_inputs;
+        nodeout_inputs.reserve(list_inputs_iter->second.size());
+        for (Node* n : list_inputs_iter->second) {
+          nodeout_inputs.emplace_back(n);
+        }
+        node_builder.Input(nodeout_inputs);
+        list_inputs_iter++;
+      } else {
+        return errors::InvalidArgument("No input found for index ", i);
+      }
+    }
+  }
+  *output = opts->FinalizeBuilder(&node_builder);
+  if (*output == nullptr) {
+    return errors::Internal("AddDataset: Failed to build ", op_type_name,
+                            " op with error ", opts->StatusToString());
+  }
+  return Status::OK();
+}
+
+Status GraphDefBuilderWrapper::AddFunction(OpKernelContext* ctx,
+                                           const string& function_name) {
+  if (b_->HasFunction(function_name)) {
+    LOG(INFO) << "Function with name " << function_name << "already exists in"
+              << " the graph. It will not be added again.";
+    return Status::OK();
+  }
+  TF_RETURN_IF_ERROR(EnsureFunctionIsStateless(ctx, function_name));
+  const FunctionLibraryDefinition* flib_def =
+      ctx->function_library()->GetFunctionLibraryDefinition();
+  const FunctionDef* f_def = flib_def->Find(function_name);
+  if (f_def == nullptr) {
+    return errors::InvalidArgument("Unable to find FunctionDef for ",
+                                   function_name, " in the registry.");
+  }
+  FunctionDefLibrary def;
+  *def.add_function() = *f_def;
+  const string gradient_func = flib_def->FindGradient(function_name);
+  if (!gradient_func.empty()) {
+    GradientDef* g_def = def.add_gradient();
+    g_def->set_function_name(function_name);
+    g_def->set_gradient_func(gradient_func);
+  }
+  TF_RETURN_IF_ERROR(b_->AddFunctionLibrary(def));
+
+  // Recursively add functions in inputs of function_name.
+  for (const NodeDef& node_def : f_def->node_def()) {
+    const OpRegistrationData* op_reg_data = nullptr;
+    TF_RETURN_IF_ERROR(flib_def->LookUp(node_def.op(), &op_reg_data));
+    if (op_reg_data->is_function_op) {
+      TF_RETURN_IF_ERROR(AddFunction(ctx, op_reg_data->op_def.name()));
+    }
+    // Recursively add functions in attrs of this NodeDef.
+    for (const auto& pair : node_def.attr()) {
+      TF_RETURN_IF_ERROR(AddAttrFunctions(pair.second, ctx));
+    }
+  }
+
+  // Recursively add functions in attrs of function_name.
+  for (auto iter = f_def->attr().begin(); iter != f_def->attr().end(); iter++) {
+    TF_RETURN_IF_ERROR(AddAttrFunctions(iter->second, ctx));
+  }
+  return Status::OK();
+}
+
+void GraphDefBuilderWrapper::AddTensorInternal(const Tensor& val,
+                                               Node** output) {
+  *output = ops::SourceOp(
+      "Const",
+      b_->opts().WithAttr("dtype", val.dtype()).WithAttr("value", val));
+}
+
+bool GraphDefBuilderWrapper::HasAttr(const string& op_type_name,
+                                     const string& attr_name) const {
+  const OpDef* op_def = nullptr;
+  Status s = b_->opts().op_registry()->LookUpOpDef(op_type_name, &op_def);
+  if (!s.ok() || op_def == nullptr) {
+    return false;
+  }
+  return HasAttr(op_def, attr_name);
+}
+
+Status GraphDatasetBase::Serialize(OpKernelContext* ctx,
+                                   string* serialized_graph_def,
+                                   string* output_node) const {
+  GraphDefBuilder b;
+  DatasetGraphDefBuilder db(&b);
+  Node* node = nullptr;
+  TF_RETURN_IF_ERROR(AsGraphDefInternal(ctx, &db, &node));
+  *output_node = node->name();
+  GraphDef graph_def;
+  TF_RETURN_IF_ERROR(b.ToGraphDef(&graph_def));
+  graph_def.SerializeToString(serialized_graph_def);
+  return Status::OK();
+}
+
+Status GetDatasetFromVariantTensor(const Tensor& tensor,
+                                   DatasetBase** out_dataset) {
+  if (!(tensor.dtype() == DT_VARIANT ||
+        TensorShapeUtils::IsScalar(tensor.shape()))) {
+    return errors::InvalidArgument(
+        "Dataset tensor must be a scalar of dtype DT_VARIANT.");
+  }
+  const Variant& variant = tensor.scalar<Variant>()();
+  const DatasetVariantWrapper* wrapper = variant.get<DatasetVariantWrapper>();
+  if (wrapper == nullptr) {
+    return errors::InvalidArgument("Tensor must be a Dataset object.");
+  }
+  *out_dataset = wrapper->get();
+  if (*out_dataset == nullptr) {
+    return errors::Internal("Read uninitialized Dataset variant.");
+  }
+  return Status::OK();
+}
+
+Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor) {
+  if (!(tensor->dtype() == DT_VARIANT ||
+        TensorShapeUtils::IsScalar(tensor->shape()))) {
+    return errors::InvalidArgument(
+        "Dataset tensor must be a scalar of dtype DT_VARIANT.");
+  }
+  tensor->scalar<Variant>()() = DatasetVariantWrapper(dataset);
+  return Status::OK();
+}
+
+void DatasetOpKernel::Compute(OpKernelContext* ctx) {
+  DatasetBase* dataset = nullptr;
+  MakeDataset(ctx, &dataset);
+  if (ctx->status().ok()) {
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    OP_REQUIRES_OK(ctx, StoreDatasetInVariantTensor(dataset, output));
+  }
+}
+
+void UnaryDatasetOpKernel::MakeDataset(OpKernelContext* ctx,
+                                       DatasetBase** output) {
+  DatasetBase* input;
+  OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &input));
+  MakeDataset(ctx, input, output);
+}
+
+void BinaryDatasetOpKernel::MakeDataset(OpKernelContext* ctx,
+                                        DatasetBase** output) {
+  DatasetBase* input;
+  OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &input));
+  DatasetBase* another_input;
+  OP_REQUIRES_OK(ctx,
+                 GetDatasetFromVariantTensor(ctx->input(1), &another_input));
+  MakeDataset(ctx, input, another_input, output);
+}
+
+const char GraphDatasetBase::kDatasetGraphKey[] = "_DATASET_GRAPH";
+const char GraphDatasetBase::kDatasetGraphOutputNodeKey[] =
+    "_DATASET_GRAPH_OUTPUT_NODE";
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
new file mode 100644
index 0000000000000000000000000000000000000000..6ab23d92a421df8b5fb9bcf637ad805d67577aa1
--- /dev/null
+++ b/tensorflow/core/framework/dataset.h
@@ -0,0 +1,616 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_FRAMEWORK_DATASET_H_
+#define TENSORFLOW_CORE_FRAMEWORK_DATASET_H_
+
+#include <memory>
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/dataset_stateful_op_whitelist.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/tracing.h"
+
+// Polymorphic datasets should support all primitive TensorFlow
+// types. Use this macro to expand `m(T)` once for each primitive type
+// `T`, e.g. to build a `switch` statement.
+#define TF_CALL_DATASET_TYPES(m) TF_CALL_ALL_TYPES(m) TF_CALL_QUANTIZED_TYPES(m)
+
+namespace tensorflow {
+
+// Interface for reading values from a key-value store.
+// Used for restoring iterator state.
+class IteratorStateReader {
+ public:
+  virtual Status ReadScalar(StringPiece key, int64* val) = 0;
+  virtual Status ReadScalar(StringPiece key, string* val) = 0;
+  virtual Status ReadTensor(StringPiece key, Tensor* val) = 0;
+  virtual bool Contains(StringPiece key) = 0;
+
+  virtual ~IteratorStateReader() {}
+};
+
+// Interface for writing values to a key-value store.
+// Used for saving iterator state.
+class IteratorStateWriter {
+ public:
+  virtual Status WriteScalar(StringPiece key, const int64 val) = 0;
+  virtual Status WriteScalar(StringPiece key, const string& val) = 0;
+  virtual Status WriteTensor(StringPiece key, const Tensor& val) = 0;
+
+  virtual ~IteratorStateWriter() {}
+};
+
+// Forward declarations to avoid introducing a dependency on headers in
+// "tensorflow/core/graph/...".
+class GraphDefBuilder;
+class GraphDatasetBase;
+class Node;
+
+// Wrapper around GraphDefBuilder. Used to serialize Dataset graph.
+class GraphDefBuilderWrapper {
+ public:
+  explicit GraphDefBuilderWrapper(GraphDefBuilder* b) : b_(b) {}
+
+  // Adds a Const node with scalar value to the Graph.
+  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
+  // non-null if the method returns with an OK status.
+  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
+  template <typename T>
+  Status AddScalar(const T& val, Node** output) {
+    Tensor val_t = Tensor(DataTypeToEnum<T>::v(), TensorShape({}));
+    val_t.scalar<T>()() = val;
+    AddTensorInternal(val_t, output);
+    if (*output == nullptr) {
+      return errors::Internal("AddScalar: Failed to build Const op.");
+    }
+    return Status::OK();
+  }
+
+  // Adds a Const node with vector value to the Graph.
+  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
+  // non-null if the method returns with an OK status.
+  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
+  // TODO(shivaniagrawal): Consider changing to gtl::ArraySlice?
+  template <typename T>
+  Status AddVector(const std::vector<T>& val, Node** output) {
+    Tensor val_t = Tensor(DataTypeToEnum<T>::v(),
+                          TensorShape({static_cast<int64>(val.size())}));
+    for (int i = 0; i < val.size(); i++) {
+      val_t.flat<T>()(i) = val[i];
+    }
+    AddTensorInternal(val_t, output);
+    if (*output == nullptr) {
+      return errors::Internal("AddVector: Failed to build Const op.");
+    }
+    return Status::OK();
+  }
+
+  // Adds a Const node with Tensor value to the Graph.
+  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
+  // non-null if the method returns with an OK status.
+  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
+  Status AddTensor(const Tensor& val, Node** output) {
+    AddTensorInternal(val, output);
+    if (*output == nullptr) {
+      return errors::Internal("AddTensor: Failed to build Const op.");
+    }
+    return Status::OK();
+  }
+
+  Status AddDataset(const GraphDatasetBase* dataset,
+                    const std::vector<Node*>& inputs, Node** output) {
+    return AddDataset(dataset, inputs, {}, output);
+  }
+
+  // Adds a node corresponding to the `DatasetType` to the Graph.
+  // Return value of `DatasetType::op_name()` is used as the op type for the
+  // node.
+  // Values for the output_types and output_shapes node attributes are also
+  // written if those attributes are defined in the OpDef.
+  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
+  // non-null if the method returns with an OK status.
+  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
+  Status AddDataset(const GraphDatasetBase* dataset,
+                    const std::vector<Node*>& inputs,
+                    const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
+                    Node** output) {
+    std::vector<std::pair<size_t, Node*>> enumerated_inputs(inputs.size());
+    for (int i = 0; i < inputs.size(); i++) {
+      enumerated_inputs[i] = std::make_pair(i, inputs[i]);
+    }
+    return AddDataset(dataset, enumerated_inputs, {}, attrs, output);
+  }
+
+  Status AddDataset(
+      const GraphDatasetBase* dataset,
+      const std::vector<std::pair<size_t, Node*>>& inputs,
+      const std::vector<std::pair<size_t, gtl::ArraySlice<Node*>>>& list_inputs,
+      const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
+      Node** output);
+
+  // Adds a user-defined function with name `function_name` to the graph and
+  // recursively adds all functions it references. If a function with a matching
+  // name has already been added, returns with OK status. If a user-defined with
+  // name `function_name` is not found in the FunctionLibraryDefinition, returns
+  // an InvalidArgumentError. If the function with name `function_name` or any
+  // of its dependent functions are stateful, returns an InvalidArgument error.
+  Status AddFunction(OpKernelContext* ctx, const string& function_name);
+
+  template <typename T>
+  void BuildAttrValue(const T& value, AttrValue* attr) {
+    SetAttrValue(value, attr);
+  }
+
+ private:
+  void AddTensorInternal(const Tensor& val, Node** output);
+
+  Status EnsureFunctionIsStateless(OpKernelContext* ctx,
+                                   const string& function_name) const {
+    const FunctionLibraryDefinition* lib_def =
+        ctx->function_library()->GetFunctionLibraryDefinition();
+    const FunctionDef* function_def = lib_def->Find(function_name);
+    if (!function_def) {
+      return errors::InvalidArgument("Unable to find FunctionDef for ",
+                                     function_name, " in registry.");
+    }
+    for (const NodeDef& node_def : function_def->node_def()) {
+      const OpDef* op_def;
+      TF_RETURN_IF_ERROR(lib_def->LookUpOpDef(node_def.op(), &op_def));
+      // TODO(b/65524810): Hack to allow functions to capture Dataset op
+      // nodes needed for FlatMap. Currently, source datasets nodes have been
+      // marked stateful to avoid constant folding since we do not have a
+      // good way of serializing them.
+      if (IsOpWhitelisted(op_def)) {
+        continue;
+      }
+      if (op_def->is_stateful()) {
+        return errors::InvalidArgument(
+            "Op[name: ", node_def.name(), ", type: ", node_def.op(), "] ",
+            "in function ", function_name, " is stateful. ",
+            "Saving stateful functions is not supported yet.");
+      }
+    }
+    return Status::OK();
+  }
+
+  // Returns whether an op has been whitelisted for use inside map_fns.
+  // Uses a heuristic to whitelist source dataset ops which have been
+  // marked stateful due to b/65524810.
+  // Also looks up the `op_def->name` in the global
+  // `WhitelistedStatefulOpRegistry`.
+  bool IsOpWhitelisted(const OpDef* op_def) const {
+    return (StringPiece(op_def->name()).ends_with("Dataset") &&
+            op_def->output_arg_size() == 1 &&
+            op_def->output_arg(0).type() == DT_VARIANT) ||
+           dataset::WhitelistedStatefulOpRegistry::Global()->Contains(
+               op_def->name());
+  }
+
+  bool HasAttr(const string& op_type_name, const string& attr_name) const;
+
+  bool HasAttr(const OpDef* op_def, const string& attr_name) const {
+    for (auto attr : op_def->attr()) {
+      if (attr.name() == attr_name) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  Status AddAttrFunctions(const AttrValue& attr_value, OpKernelContext* ctx) {
+    if (attr_value.has_func()) {
+      TF_RETURN_IF_ERROR(AddFunction(ctx, attr_value.func().name()));
+    } else if (attr_value.has_list()) {
+      for (const NameAttrList& name_attr_list : attr_value.list().func()) {
+        TF_RETURN_IF_ERROR(AddFunction(ctx, name_attr_list.name()));
+      }
+    }
+    return Status::OK();
+  }
+
+  GraphDefBuilder* b_;
+};
+
+class StatsAggregator;
+
+// A cut-down version of OpKernelContext for running computations in
+// iterators. Note that we cannot simply use OpKernelContext here
+// because we might run computation in an iterator whose lifetime is
+// not nested within the lifetime of a single OpKernelContext
+// (e.g. asynchronous prefetching).
+//
+// TODO(mrry): We will probably need to support more of
+// OpKernelContext here. For example, should allocation be handled by
+// the IteratorContext?
+// TODO(mrry): We're making some daring assumptions about the lifetime
+// of the runner passed in here. A runner will be deleted when the original
+// step ends, but all existing runners only close over session-lifetime (or
+// longer-lived) state, so we can make a copy of the function. There's nothing
+// in the definition of the API from which we took the runner to guarantee that
+// what we are doing is safe. We should formalize the properties here.
+class IteratorContext {
+ public:
+  struct Params {
+    // Interface to operating system functionality.
+    Env* env;
+
+    // Function call support.
+    std::function<void(std::function<void()>)> runner = nullptr;
+
+    // A function that returns the current `StatsAggregator` instance to be
+    // used when recording statistics about the iterator.
+    //
+    // NOTE(mrry): This is somewhat awkward, because (i) the `StatsAggregator`
+    // is a property of the `IteratorResource` (which this class does not know
+    // about), and (ii) it can change after the `IteratorContext` has been
+    // created. Better suggestions are welcome!
+    std::function<std::shared_ptr<StatsAggregator>()> stats_aggregator_getter =
+        nullptr;
+
+    // The FunctionLibraryRuntime object to be used to make function calls.
+    FunctionLibraryRuntime* lib = nullptr;
+    std::shared_ptr<const FunctionLibraryDefinition> function_library = nullptr;
+
+    // The Allocator to be used to allocate the output of an iterator.
+    std::function<Allocator*(AllocatorAttributes)> allocator_getter = nullptr;
+  };
+
+  explicit IteratorContext(Params params) : params_(std::move(params)) {}
+
+  Env* env() const { return params_.env; }
+
+  std::function<void(std::function<void()>)>* runner() {
+    return &params_.runner;
+  }
+
+  std::shared_ptr<StatsAggregator> stats_aggregator() {
+    if (params_.stats_aggregator_getter) {
+      return params_.stats_aggregator_getter();
+    } else {
+      return nullptr;
+    }
+  }
+
+  std::shared_ptr<const FunctionLibraryDefinition> function_library() {
+    return params_.function_library;
+  }
+
+  FunctionLibraryRuntime* lib() { return params_.lib; }
+
+  void set_lib(FunctionLibraryRuntime* lib) { params_.lib = lib; }
+
+  Allocator* allocator(AllocatorAttributes attrs) {
+    return params_.allocator_getter(attrs);
+  }
+
+ private:
+  Params params_;
+};
+
+// Represents the current position in a range of outputs, where the
+// range of outputs is typically represented by an `DatasetBase`,
+// defined below.
+class IteratorBase {
+ public:
+  virtual ~IteratorBase() {}
+
+  // Gets the next output from the range that this iterator is traversing.
+  //
+  // If at least one output remains in this iterator's range, that
+  // output will be stored in `*out_tensors` and `false` will be
+  // stored in `*end_of_sequence`.
+  //
+  // If no more outputs remain in this iterator's range, `true` will
+  // be stored in `*end_of_sequence`, and the content of
+  // `*out_tensors` will be undefined.
+  //
+  // This method is thread-safe.
+  //
+  // TODO(mrry): Define `GetNextAsync()` or `GetNextManyAsync()`, and
+  // potentially remove this method.
+  virtual Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                         bool* end_of_sequence) = 0;
+
+  // Returns a vector of DataType values, representing the respective
+  // element types of each tuple component in the outputs of this
+  // iterator.
+  virtual const DataTypeVector& output_dtypes() const = 0;
+
+  // Returns a vector of tensor shapes, representing the respective
+  // (and possibly partially defined) shapes of each tuple component
+  // in the outputs of this iterator.
+  virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
+
+  // Saves the state of this iterator.
+  virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) {
+    return SaveInternal(writer);
+  }
+
+  // Restores the state of this iterator.
+  virtual Status Restore(IteratorContext* ctx, IteratorStateReader* reader) {
+    return RestoreInternal(ctx, reader);
+  }
+
+ protected:
+  // This is needed so that sub-classes of IteratorBase can call
+  // `SaveInternal` on their parent iterators, e.g., in
+  // `RepeatDataasetOp::Dataset`.
+  Status SaveParent(IteratorStateWriter* writer,
+                    const std::unique_ptr<IteratorBase>& parent) {
+    return parent->SaveInternal(writer);
+  }
+
+  // This is needed so that sub-classes of IteratorBase can call
+  // `RestoreInternal` on their parent iterators, e.g., in
+  // `RepeatDataasetOp::Dataset`.
+  Status RestoreParent(IteratorContext* ctx, IteratorStateReader* reader,
+                       const std::unique_ptr<IteratorBase>& parent) {
+    return parent->RestoreInternal(ctx, reader);
+  }
+
+  // Saves the state of this iterator recursively.
+  virtual Status SaveInternal(IteratorStateWriter* writer) {
+    return errors::Unimplemented("SaveInternal");
+  }
+
+  // Restores the state of this iterator recursively.
+  virtual Status RestoreInternal(IteratorContext* ctx,
+                                 IteratorStateReader* reader) {
+    return errors::Unimplemented("RestoreInternal");
+  }
+};
+
+// Represents a (potentially infinite) range of outputs, where each
+// output is a tuple of tensors.
+class DatasetBase : public core::RefCounted {
+ public:
+  // Returns a new iterator for iterating over the range of elements in
+  // this dataset.
+  //
+  // This method may be called multiple times on the same instance,
+  // and the resulting iterators will have distinct state. Each
+  // iterator will traverse all elements in this dataset from the
+  // start.
+  //
+  // Ownership of the created iterator will be transferred to the caller.
+  //
+  // The prefix identifies the sequence of iterators leading up to the newly
+  // created iterator.
+  virtual std::unique_ptr<IteratorBase> MakeIterator(
+      const string& prefix) const = 0;
+
+  // Returns a vector of DataType values, representing the respective
+  // element types of each tuple component in the outputs of this
+  // dataset.
+  virtual const DataTypeVector& output_dtypes() const = 0;
+
+  // Returns a vector of tensor shapes, representing the respective
+  // (and possibly partially defined) shapes of each tuple component
+  // in the outputs of this dataset.
+  virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
+
+  // A human-readable debug string for this dataset.
+  virtual string DebugString() = 0;
+
+  // Serializes the dataset and writes it to the `writer`.
+  virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) const {
+    return errors::Unimplemented("DatasetBase::Save");
+  }
+
+ protected:
+  // TODO(srbs): Ideally all graph related logic should reside in
+  // GraphDatasetBase. However, that would require Datasets defined in all ops
+  // to derive from GraphDatasetBase. Once that is done we can move
+  // DatasetGraphDefBuilder and AsGraphDefInternal to GraphDatasetBase.
+  class DatasetGraphDefBuilder : public GraphDefBuilderWrapper {
+   public:
+    DatasetGraphDefBuilder(GraphDefBuilder* b) : GraphDefBuilderWrapper(b) {}
+    Status AddParentDataset(OpKernelContext* ctx, const DatasetBase* dataset,
+                            Node** output) {
+      return dataset->AsGraphDefInternal(ctx, this, output);
+    }
+  };
+
+  virtual Status AsGraphDefInternal(OpKernelContext* ctx,
+                                    DatasetGraphDefBuilder* b,
+                                    Node** node) const {
+    return AsGraphDefInternal(b, node);
+  }
+
+  virtual Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+                                    Node** node) const {
+    return errors::Unimplemented("AsGraphDefInternal");
+  }
+};
+
+// Base-class for datasets that are built by ops.
+class GraphDatasetBase : public DatasetBase {
+ public:
+  GraphDatasetBase(OpKernelContext* ctx)
+      : op_name_(ctx->op_kernel().type_string()) {}
+
+  const string op_name() const { return op_name_; }
+
+  Status Save(OpKernelContext* ctx,
+              IteratorStateWriter* writer) const override {
+    string serialized_graph_def;
+    string output_node;
+    TF_RETURN_IF_ERROR(Serialize(ctx, &serialized_graph_def, &output_node));
+    TF_RETURN_IF_ERROR(
+        writer->WriteScalar(kDatasetGraphKey, serialized_graph_def));
+    TF_RETURN_IF_ERROR(
+        writer->WriteScalar(kDatasetGraphOutputNodeKey, output_node));
+    return Status::OK();
+  }
+
+  // Key for storing the Dataset graph in the serialized format.
+  static const char kDatasetGraphKey[];
+
+  // Key for storing the output node of the Dataset graph in the serialized
+  // format.
+  static const char kDatasetGraphOutputNodeKey[];
+
+ private:
+  Status Serialize(OpKernelContext* ctx, string* serialized_graph_def,
+                   string* output_node) const;
+
+  const string op_name_;
+};
+
+// Represents an iterator that is associated with a particular parent dataset.
+template <class DatasetType>
+class DatasetIterator : public IteratorBase {
+ public:
+  struct Params {
+    // Owns one reference on the shared dataset resource.
+    const DatasetType* dataset;
+
+    // Identifies the sequence of iterators leading up to this iterator.
+    const string prefix;
+  };
+
+  explicit DatasetIterator(const Params& params) : params_(params) {
+    params_.dataset->Ref();
+  }
+
+  ~DatasetIterator() override { params_.dataset->Unref(); }
+
+  // The dataset from which this iterator was created.
+  const DatasetType* dataset() const { return params_.dataset; }
+
+  // The sequence of iterators leading up to this iterator.
+  const string prefix() const { return params_.prefix; }
+
+  const DataTypeVector& output_dtypes() const override {
+    return params_.dataset->output_dtypes();
+  }
+
+  const std::vector<PartialTensorShape>& output_shapes() const override {
+    return params_.dataset->output_shapes();
+  }
+
+  Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                 bool* end_of_sequence) final {
+    port::Tracing::TraceMe activity(params_.prefix);
+    Status s = GetNextInternal(ctx, out_tensors, end_of_sequence);
+    if (TF_PREDICT_FALSE(errors::IsOutOfRange(s) && !*end_of_sequence)) {
+      s = errors::Internal(
+          "Iterator \"", params_.prefix,
+          "\" returned OutOfRange without setting `*end_of_sequence`. This "
+          "indicates that an error may have occurred. Original message: ",
+          s.error_message());
+      LOG(ERROR) << s;
+    }
+    return s;
+  }
+
+  Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) final {
+    TF_RETURN_IF_ERROR(dataset()->Save(ctx, writer));
+    return IteratorBase::Save(ctx, writer);
+  }
+
+ protected:
+  // Internal implementation of GetNext that is wrapped in tracing logic.
+  virtual Status GetNextInternal(IteratorContext* ctx,
+                                 std::vector<Tensor>* out_tensors,
+                                 bool* end_of_sequence) = 0;
+
+  string full_name(const string& name) const {
+    return strings::StrCat(prefix(), ":", name);
+  }
+
+ private:
+  Params params_;
+};
+
+// Encapsulates the work required to plug a DatasetBase into the core TensorFlow
+// graph execution engine.
+class DatasetOpKernel : public OpKernel {
+ public:
+  DatasetOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  void Compute(OpKernelContext* ctx) final;
+
+ protected:
+  // Subclasses should implement this method. It will be called during Compute
+  // execution.
+  virtual void MakeDataset(OpKernelContext* ctx, DatasetBase** output) = 0;
+
+  template <typename T>
+  Status ParseScalarArgument(OpKernelContext* ctx,
+                             const StringPiece& argument_name, T* output) {
+    const Tensor* argument_t;
+    TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
+    if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
+      return errors::InvalidArgument(argument_name, " must be a scalar");
+    }
+    *output = argument_t->scalar<T>()();
+    return Status::OK();
+  }
+};
+
+// Encapsulates the work required to plug unary Datasets into the core
+// TensorFlow graph execution engine.
+class UnaryDatasetOpKernel : public DatasetOpKernel {
+ public:
+  UnaryDatasetOpKernel(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {}
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final;
+  virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                           DatasetBase** output) = 0;
+};
+
+// Encapsulates the work required to plug binary Datasets into the core
+// TensorFlow graph execution engine.
+class BinaryDatasetOpKernel : public DatasetOpKernel {
+ public:
+  BinaryDatasetOpKernel(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {}
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final;
+  virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                           DatasetBase* another_input,
+                           DatasetBase** output) = 0;
+};
+
+// Validates and extracts a `DatasetBase` object from `tensor`.
+//
+// `tensor` must have been written by a call to SetVariantTensorToDataset().
+//
+// The retrieved pointer is a borrowed reference to the dataset, which is owned
+// by the tensor. The consumer must either acquire its own reference to the
+// dataset by calling `(*out_dataset)->Ref()`, or ensure that `tensor` is not
+// destroyed or mutated while the retrieved pointer is in use.
+Status GetDatasetFromVariantTensor(const Tensor& tensor,
+                                   DatasetBase** out_dataset);
+
+// Stores a `DatasetBase` object in `tensor`.
+//
+// The ownership of `dataset` is transferred to `tensor`.
+Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_FRAMEWORK_DATASET_H_
diff --git a/tensorflow/core/framework/dataset_stateful_op_whitelist.h b/tensorflow/core/framework/dataset_stateful_op_whitelist.h
new file mode 100644
index 0000000000000000000000000000000000000000..3b48999edb37da4fdf232f2cbcd61df7affb40f2
--- /dev/null
+++ b/tensorflow/core/framework/dataset_stateful_op_whitelist.h
@@ -0,0 +1,77 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_FRAMEWORK_DATASET_STATEFUL_OP_WHITELIST_H_
+#define TENSORFLOW_CORE_FRAMEWORK_DATASET_STATEFUL_OP_WHITELIST_H_
+
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace dataset {
+// Registry for stateful ops that need to be used in dataset functions.
+// See below macro for usage details.
+class WhitelistedStatefulOpRegistry {
+ public:
+  Status Add(StringPiece op_name) {
+    op_names_.insert(op_name);
+    return Status::OK();
+  }
+
+  bool Contains(StringPiece op_name) {
+    return op_names_.find(op_name) != op_names_.end();
+  }
+
+  static WhitelistedStatefulOpRegistry* Global() {
+    static WhitelistedStatefulOpRegistry* reg =
+        new WhitelistedStatefulOpRegistry;
+    return reg;
+  }
+
+ private:
+  WhitelistedStatefulOpRegistry() {}
+  WhitelistedStatefulOpRegistry(WhitelistedStatefulOpRegistry const& copy);
+  WhitelistedStatefulOpRegistry operator=(
+      WhitelistedStatefulOpRegistry const& copy);
+  std::set<StringPiece> op_names_;
+};
+
+}  // namespace dataset
+
+// Use this macro to whitelist an op that is marked stateful but needs to be
+// used inside a map_fn in an input pipeline. This is only needed if you wish
+// to be able to checkpoint the state of the input pipeline. We currently
+// do not allow stateful ops to be defined inside of map_fns since it is not
+// possible to save their state.
+// Note that the state of the whitelisted ops inside functions will not be
+// saved during checkpointing, hence this should only be used if the op is
+// marked stateful for reasons like to avoid constant folding during graph
+// optimiztion but is not stateful.
+// If possible, try to remove the stateful flag on the op first.
+// Example usage:
+//
+//   WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS("LegacyStatefulReader");
+//
+#define WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS(name) \
+  WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ_HELPER(__COUNTER__, name)
+#define WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ_HELPER(ctr, name) \
+  WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ(ctr, name)
+#define WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS_UNIQ(ctr, name)        \
+  static ::tensorflow::Status whitelist_op##ctr TF_ATTRIBUTE_UNUSED =      \
+      ::tensorflow::dataset::WhitelistedStatefulOpRegistry::Global()->Add( \
+          name)
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_FRAMEWORK_DATASET_STATEFUL_OP_WHITELIST_H_
diff --git a/tensorflow/core/framework/device_attributes.proto b/tensorflow/core/framework/device_attributes.proto
index 9983bcb6bec63602c2e624a183a111622f7f2ace..0b3c0d5bdf9f3db858d631dcf67d1120022520f2 100644
--- a/tensorflow/core/framework/device_attributes.proto
+++ b/tensorflow/core/framework/device_attributes.proto
@@ -6,10 +6,26 @@ option java_outer_classname = "DeviceAttributesProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
 
+message InterconnectLink {
+  int32 device_id = 1;
+  string type = 2;
+  int32 strength = 3;
+};
+
+message LocalLinks {
+  repeated InterconnectLink link = 1;
+};
+
 message DeviceLocality {
   // Optional bus locality of device.  Default value of 0 means
   // no specific locality.  Specific localities are indexed from 1.
   int32 bus_id = 1;
+
+  // Optional NUMA locality of device.
+  int32 numa_node = 2;
+
+  // Optional local interconnect links to other devices.
+  LocalLinks links = 3;
 };
 
 message DeviceAttributes {
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index 33bd5d250cd6b5df8c933e3f353efd9a1eee592c..1838a8ad02d2bd5522ce3162fea53e3f5afc0309 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -145,6 +145,12 @@ class DeviceBase {
     return gpu_device_info_;
   }
 
+  // The preferred thread pool for this device. If it is nullptr, the system
+  // automatically assigns a thread pool for execution.
+  virtual thread::ThreadPool* tensorflow_device_thread_pool() {
+    return device_thread_pool_;
+  }
+
   // Does not take ownership.
   void set_eigen_cpu_device(Eigen::ThreadPoolDevice* d) {
     eigen_cpu_device_ = d;
@@ -215,10 +221,17 @@ class DeviceBase {
     return errors::Internal("Device does not implement MakeTensorFromProto()");
   }
 
+ protected:
+  // Does not take ownership.
+  void set_tensorflow_device_thread_pool(thread::ThreadPool* thread_pool) {
+    device_thread_pool_ = thread_pool;
+  }
+
  private:
   Env* const env_;
   CpuWorkerThreads* cpu_worker_threads_ = nullptr;
   GpuDeviceInfo* gpu_device_info_ = nullptr;
+  thread::ThreadPool* device_thread_pool_ = nullptr;
   Eigen::ThreadPoolDevice* eigen_cpu_device_ = nullptr;
 #ifdef TENSORFLOW_USE_SYCL
   Eigen::SyclDevice* eigen_sycl_device_ = nullptr;
diff --git a/tensorflow/core/framework/fake_input.cc b/tensorflow/core/framework/fake_input.cc
index ad301a8aa4ba4be5b7031d00984d8e6febf1583e..70d1e20a17c6cbf75a15d32a97216f6a1354ccf4 100644
--- a/tensorflow/core/framework/fake_input.cc
+++ b/tensorflow/core/framework/fake_input.cc
@@ -104,8 +104,8 @@ Status FakeInputImpl::AddInputToBuilder() {
       Status status = GetNodeAttr(*node_def_, arg_->type_list_attr(), &dts);
       if (!status.ok()) {
         return errors::InvalidArgument(
-            "Could not infer list of types for input '", arg_->name(), "': ",
-            status.error_message());
+            "Could not infer list of types for input '", arg_->name(),
+            "': ", status.error_message());
       }
       SourceList(dts);
       return Status::OK();
@@ -131,8 +131,8 @@ Status FakeInputImpl::GetN(int* n) const {
     Status status = GetNodeAttr(*node_def_, arg_->number_attr(), n);
     if (!status.ok()) {
       return errors::InvalidArgument("Could not infer length of input '",
-                                     arg_->name(), "': ",
-                                     status.error_message());
+                                     arg_->name(),
+                                     "': ", status.error_message());
     }
   }
   return Status::OK();
@@ -153,8 +153,8 @@ Status FakeInputImpl::GetDataType(DataType* dt) const {
         *dt = attr->default_value().type();
       } else {
         return errors::InvalidArgument("Could not infer type for input '",
-                                       arg_->name(), "': ",
-                                       status.error_message());
+                                       arg_->name(),
+                                       "': ", status.error_message());
       }
     }
   } else {
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index d757e962e522f801243a35a362f0c6821814d948..eae8e6c3c10c4b49081aed0e253d9a6f382f562b 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -795,12 +795,25 @@ uint64 FunctionDefHash(const FunctionDef& fdef) {
   return h;
 }
 
-string Canonicalize(const string& funcname, AttrSlice attrs) {
+string Canonicalize(const string& funcname, AttrSlice attrs,
+                    const FunctionLibraryRuntime::InstantiateOptions& options) {
   std::vector<string> entries;
-  entries.reserve(attrs.size());
+  entries.reserve(options.target.empty() ? attrs.size() : (attrs.size() + 1));
   for (auto p : attrs) {
     entries.push_back(strings::StrCat(p.first, "=", Print(p.second)));
   }
+  if (!options.target.empty()) {
+    entries.push_back(
+        strings::StrCat("_target", "=", str_util::CEscape(options.target)));
+  }
+  if (options.overlay_lib) {
+    entries.push_back(strings::StrCat(
+        "_overlay_lib", "=", reinterpret_cast<uintptr_t>(options.overlay_lib)));
+  }
+  if (!options.state_handle.empty()) {
+    entries.push_back(
+        strings::StrCat("_state_handle", "=", options.state_handle));
+  }
   std::sort(entries.begin(), entries.end());
   return strings::StrCat(funcname, "[", str_util::Join(entries, ","), "]");
 }
@@ -1051,26 +1064,36 @@ Status FunctionLibraryDefinition::AddLibrary(
   return Status::OK();
 }
 
-void FunctionLibraryDefinition::RemoveFunction(const string& func) {
+Status FunctionLibraryDefinition::RemoveFunction(const string& func) {
   const auto& i = function_defs_.find(func);
-  DCHECK(i != function_defs_.end());
+  if (i == function_defs_.end()) {
+    return errors::InvalidArgument("Tried to remove non-existent function ",
+                                   func);
+  }
   function_defs_.erase(i);
+  return Status::OK();
 }
 
-void FunctionLibraryDefinition::RemoveGradient(const string& func) {
+Status FunctionLibraryDefinition::RemoveGradient(const string& func) {
   const auto& i = func_grad_.find(func);
-  DCHECK(i != func_grad_.end());
+  if (i == func_grad_.end()) {
+    return errors::InvalidArgument("Tried to remove non-existent gradient ",
+                                   func);
+  }
   func_grad_.erase(i);
+  return Status::OK();
 }
 
 void FunctionLibraryDefinition::Remove(
     const std::vector<string>& funcs,
     const std::vector<string>& funcs_with_grads) {
   for (const string& f : funcs) {
-    RemoveFunction(f);
+    Status s = RemoveFunction(f);
+    DCHECK(s.ok());
   }
   for (const string& f : funcs_with_grads) {
-    RemoveGradient(f);
+    Status s = RemoveGradient(f);
+    DCHECK(s.ok());
   }
 }
 
@@ -1251,8 +1274,8 @@ FunctionDef FunctionDefHelper::Define(const string& name,
     }
     for (const string& a : src.arg) {
       const auto iter = ret_index.find(a);
-      CHECK(iter != ret_index.end()) << "Node input '" << a << "' in '"
-                                     << src.ret[0] << "' of " << name;
+      CHECK(iter != ret_index.end())
+          << "Node input '" << a << "' in '" << src.ret[0] << "' of " << name;
       n->add_input(iter->second);
     }
     for (const string& d : src.dep) {
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 305b140a446171ddc4b249c97967057aa3e00152..e27001133bbb5056abf1a3e1f5b9d69c8e01bc56 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -35,6 +35,7 @@ namespace tensorflow {
 class CancellationManager;
 class GraphDef;
 class OpKernel;
+class ProcessFunctionLibraryRuntime;
 class ResourceMgr;
 class Rendezvous;
 class ScopedStepContainer;
@@ -234,14 +235,16 @@ bool FunctionDefsEqual(const FunctionDef& f1, const FunctionDef& f2);
 // same.
 uint64 FunctionDefHash(const FunctionDef& fdef);
 
-// Returns a canonicalized string for the instantiation of the
-// function of the given "name" and attributes "attrs".
-//
-// The returned string is guaranteed to be stable within one address
-// space. But it may be change as the implementation
-// evolves. Therefore, it should not be persisted or compared across
-// address spaces.
-string Canonicalize(const string& funcname, AttrSlice attrs);
+class CallFrameInterface {
+ public:
+  virtual ~CallFrameInterface() {}
+
+  virtual size_t num_args() const = 0;
+  virtual size_t num_retvals() const = 0;
+
+  virtual Status GetArg(int index, Tensor* val) const = 0;
+  virtual Status SetRetval(int index, const Tensor& val) = 0;
+};
 
 // Represents a function call frame. I.e., the data structure used to
 // pass arguments to a function and retrieve its results.
@@ -249,7 +252,7 @@ string Canonicalize(const string& funcname, AttrSlice attrs);
 // Runtime must arrange accesses to one FunctionCallFrame s.t.
 //   1. SetArgs() happens before any GetArg();
 //   2. GetRetvals happens after all SetRetval();
-class FunctionCallFrame {
+class FunctionCallFrame : public CallFrameInterface {
  public:
   FunctionCallFrame(DataTypeSlice arg_types, DataTypeSlice ret_types);
   ~FunctionCallFrame();
@@ -259,9 +262,12 @@ class FunctionCallFrame {
   Status GetRetvals(std::vector<Tensor>* rets) const;
   Status ConsumeRetvals(std::vector<Tensor>* rets);
 
+  size_t num_args() const override { return arg_types_.size(); }
+  size_t num_retvals() const override { return ret_types_.size(); }
+
   // Callee methods.
-  Status GetArg(int index, Tensor* val) const;
-  Status SetRetval(int index, const Tensor& val);
+  Status GetArg(int index, Tensor* val) const override;
+  Status SetRetval(int index, const Tensor& val) override;
 
  private:
   DataTypeVector arg_types_;
@@ -307,6 +313,14 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   // This operation is atomic.
   Status AddGradientDef(const GradientDef& grad);
 
+  // Remove function `func` from the library. Returns non-OK Status unless
+  // `func` is in the library.
+  Status RemoveFunction(const string& func);
+
+  // Remove gradient of function `func` from the library. Returns non-OK Status
+  // unless `func` has a gradient.
+  Status RemoveGradient(const string& func);
+
   // Adds the functions and gradients in 'other' to this function library.
   // Duplicate functions and gradients are ignored.
   // This operation is atomic.
@@ -349,6 +363,8 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   // Returns a proto representation of the state of this function library.
   FunctionDefLibrary ToProto() const;
 
+  size_t num_functions() const { return function_defs_.size(); }
+
   const OpRegistryInterface* default_registry() const {
     return default_registry_;
   }
@@ -377,13 +393,6 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   // attr from.
   const FunctionDef* GetAttrImpl(const NodeDef& ndef) const;
 
-  // Remove function `func` from the library. `func` must be in the library.
-  void RemoveFunction(const string& func);
-
-  // Remove gradient of function `func` from the library. `func` must have
-  // a gradient.
-  void RemoveGradient(const string& func);
-
   // Remove all functions in `funcs` and all gradients of
   // functions in `funcs_with_grads` from this library.
   void Remove(const std::vector<string>& funcs,
@@ -404,9 +413,47 @@ class FunctionLibraryRuntime {
   //
   // Returns OK and fills in "handle" if the instantiation succeeds.
   // Otherwise returns an error and "handle" is undefined.
+  struct InstantiateOptions {
+    // The canonical device name of the device on which the function
+    // should be instantiated. If empty, the function will be
+    // instantiated on the local device.
+    string target;
+
+    // This interface is EXPERIMENTAL and subject to change.
+    //
+    // If non-null, the runtime will use `overlay_lib` to resolve
+    // function(s) named in `function_name` and `attrs`. Otherwise,
+    // the runtime will use its internal library.
+    // NOTE(mrry): If provided, all functions defined in `overlay_lib`
+    // must be self-contained, and cannot refer to functions defined
+    // in other libraries.
+    // TODO(mrry): Provide a mechanism for sharing core functions
+    // between a set of libraries (e.g. by allowing a
+    // `FunctionLibraryDefinition` to store an `outer_scope` pointer
+    // and implementing name resolution across libraries).
+    const FunctionLibraryDefinition* overlay_lib = nullptr;
+
+    // This interface is EXPERIMENTAL and subject to change.
+    //
+    // If non-empty, the runtime will use `state_handle` to identify
+    // cached state related the instantiated function. Two functions
+    // of the same name and attrs, instantiated with the same
+    // `state_handle` will have the same handle and share the same
+    // state (in stateful kernels); and two functions with different
+    // values for `state_handle` will have independent state.
+    string state_handle;
+  };
   typedef uint64 Handle;
   virtual Status Instantiate(const string& function_name, AttrSlice attrs,
+                             const InstantiateOptions& options,
                              Handle* handle) = 0;
+  Status Instantiate(const string& function_name, AttrSlice attrs,
+                     Handle* handle) {
+    return Instantiate(function_name, attrs, {}, handle);
+  }
+
+  // Releases state associated with the handle.
+  virtual Status ReleaseHandle(Handle handle) = 0;
 
   // Returns the function body for the instantiated function given its
   // handle 'h'. Returns nullptr if "h" is not found.
@@ -453,6 +500,8 @@ class FunctionLibraryRuntime {
   virtual void Run(const Options& opts, Handle handle,
                    gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets,
                    DoneCallback done) = 0;
+  virtual void Run(const Options& opts, Handle handle,
+                   CallFrameInterface* call_frame, DoneCallback done) = 0;
 
   // Creates a "kernel" for the given node def "ndef".
   //
@@ -460,13 +509,19 @@ class FunctionLibraryRuntime {
   // returned "*kernel". Otherwise, returns an error.
   virtual Status CreateKernel(const NodeDef& ndef, OpKernel** kernel) = 0;
 
-  // Returns true iff 'function' is stateful.
+  // Returns true iff the function named `function_name` is stateful.
+  // NOTE(mrry): This method assumes that the runtime is associated with a
+  // default function library, and looks up `function_name` in that library.
+  // It does not support overlay libraries.
   virtual bool IsStateful(const string& function_name) = 0;
 
   // Returns the device on which the function executes.
   virtual Device* device() = 0;
 
   // Returns the function library definition that backs this runtime.
+  // NOTE(mrry): The returned library definition is the default function library
+  // for this runtime. The runtime may instantiate functions from separate
+  // overlay libraries, which are not returned by this function.
   virtual const FunctionLibraryDefinition* GetFunctionLibraryDefinition()
       const = 0;
 
@@ -481,8 +536,25 @@ class FunctionLibraryRuntime {
   virtual int graph_def_version() = 0;
 
   typedef uint64 LocalHandle;
+
+  virtual Status Clone(std::unique_ptr<FunctionLibraryDefinition>* out_lib_def,
+                       std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr,
+                       FunctionLibraryRuntime** out_flr) = 0;
 };
 
+// Returns a canonicalized string for the instantiation of the
+// function of the given "name", attributes "attrs", and "options".
+//
+// The returned string is guaranteed to be stable within one address
+// space. But it may be change as the implementation
+// evolves. Therefore, it should not be persisted or compared across
+// address spaces.
+string Canonicalize(const string& funcname, AttrSlice attrs,
+                    const FunctionLibraryRuntime::InstantiateOptions& options);
+inline string Canonicalize(const string& funcname, AttrSlice attrs) {
+  return Canonicalize(funcname, attrs, {});
+}
+
 const FunctionLibraryRuntime::Handle kInvalidHandle = -1;
 const FunctionLibraryRuntime::LocalHandle kInvalidLocalHandle = -1;
 typedef std::function<Status(FunctionLibraryRuntime*, const NodeDef&,
@@ -495,10 +567,11 @@ class DistributedFunctionLibraryRuntime {
   virtual ~DistributedFunctionLibraryRuntime() {}
 
   // The _target attr in attrs determines where the function is instantiated.
-  virtual Status Instantiate(const string& function_name,
-                             const FunctionLibraryDefinition& lib_def,
-                             AttrSlice attrs,
-                             FunctionLibraryRuntime::LocalHandle* handle) = 0;
+  virtual Status Instantiate(
+      const string& function_name, const FunctionLibraryDefinition& lib_def,
+      AttrSlice attrs,
+      const FunctionLibraryRuntime::InstantiateOptions& options,
+      FunctionLibraryRuntime::LocalHandle* handle) = 0;
 
   // opts.runner isn't used for execution.
   virtual void Run(const FunctionLibraryRuntime::Options& opts,
@@ -589,7 +662,7 @@ bool RegisterOp(const string& op, Creator func);
 // Returns OK the gradient creator for the "op" is found (may be
 // nullptr if REGISTER_OP_NO_GRADIENT is used.
 Status GetOpGradientCreator(const string& op, Creator* creator);
-};
+};  // namespace gradient
 
 // Declare explicit instantiations of GetAttr
 #define GET_ATTR(T)                                          \
diff --git a/tensorflow/core/framework/function_testlib.cc b/tensorflow/core/framework/function_testlib.cc
index f8b456051b76241104febd29d55fe82a9146a239..2b5a0fe1bb897ed2a43785637e873afcb7b3e45d 100644
--- a/tensorflow/core/framework/function_testlib.cc
+++ b/tensorflow/core/framework/function_testlib.cc
@@ -149,33 +149,25 @@ FunctionDef XTimes16() {
       {{"y", "y:y:0"}});
 }
 
-FunctionDef WXPlusB(){return FDH::Define(
-    // Name
-    "WXPlusB",
-    // Args
-    {"w: T", "x: T", "b: T"},
-    // Return values
-    {"y: T"},
-    // Attr def
-    {"T: {float, double}"},
-    // Nodes
-    {
-      {{"mm"},
-       "MatMul",
-       {"w", "x"},
-       {
-           {"T", "$T"}, {"transpose_a", false}, {"transpose_b", false},
-#ifdef INTEL_MKL
-       }},
-#else
+FunctionDef WXPlusB() {
+  return FDH::Define(
+      // Name
+      "WXPlusB",
+      // Args
+      {"w: T", "x: T", "b: T"},
+      // Return values
+      {"y: T"},
+      // Attr def
+      {"T: {float, double}"},
+      // Nodes
+      {{{"mm"},
+        "MatMul",
+        {"w", "x"},
+        {{"T", "$T"},
+         {"transpose_a", false},
+         {"transpose_b", false},
          {"_kernel", "eigen"}}},
-#endif
-      {
-        {"y"}, "Add", {"mm", "b"}, {
-          { "T", "$T" }
-        }
-      }
-    });
+       {{"y"}, "Add", {"mm", "b"}, {{"T", "$T"}}}});
 }
 
 FunctionDef Swap() {
@@ -193,6 +185,23 @@ FunctionDef Swap() {
        {{"o1"}, "Identity", {"i0"}, {{"T", "$T"}}}});
 }
 
+FunctionDef InvalidControlFlow() {
+  return FDH::Create(
+      // Name
+      "InvalidControlFlow",
+      // Args
+      {"i: int32"},
+      // Return values
+      {"o: int32"},
+      // Attr def
+      {},
+      // Nodes
+      {{{"enter"}, "Enter", {"i"}, {{"T", DT_INT32}, {"frame_name", "while"}}},
+       {{"add"}, "Add", {"enter:output", "i"}, {{"T", DT_INT32}}}},
+      // Output mapping
+      {{"o", "add:z"}});
+}
+
 void FunctionTestSchedClosure(std::function<void()> fn) {
   static thread::ThreadPool* w =
       new thread::ThreadPool(Env::Default(), "Test", 8);
diff --git a/tensorflow/core/framework/function_testlib.h b/tensorflow/core/framework/function_testlib.h
index fbf273fa015c9326e01f45d1c603d22ab239fe25..b67c5cb1ab94f9e203f99b2a5982e282c76f942c 100644
--- a/tensorflow/core/framework/function_testlib.h
+++ b/tensorflow/core/framework/function_testlib.h
@@ -81,6 +81,9 @@ FunctionDef NonZero();
 // x:T, y:T -> y:T, x:T
 FunctionDef Swap();
 
+// Contains malformed control flow which can't be run by the executor.
+FunctionDef InvalidControlFlow();
+
 void FunctionTestSchedClosure(std::function<void()> fn);
 
 }  // end namespace function
diff --git a/tensorflow/core/framework/graph_def_util.cc b/tensorflow/core/framework/graph_def_util.cc
index bd018b7243897a5b45aa35d7fb94ca1ee1b12e75..1f670535d575e9bbc4196fb1f1e1c381d33ae204 100644
--- a/tensorflow/core/framework/graph_def_util.cc
+++ b/tensorflow/core/framework/graph_def_util.cc
@@ -35,8 +35,8 @@ namespace tensorflow {
 
 string SummarizeGraphDef(const GraphDef& graph_def) {
   string ret;
-  strings::StrAppend(&ret, "versions = ",
-                     ProtoShortDebugString(graph_def.versions()), ";\n");
+  strings::StrAppend(
+      &ret, "versions = ", ProtoShortDebugString(graph_def.versions()), ";\n");
   for (const NodeDef& node : graph_def.node()) {
     strings::StrAppend(&ret, SummarizeNodeDef(node), ";\n");
   }
@@ -90,9 +90,9 @@ static Status RemoveNewDefaultAttrsFromNodeDef(
           FindAttr(attr.first, *producer_op_def);
       if (producer_attr_def == nullptr) {
         return errors::InvalidArgument(
-            "Attr '", attr.first, "' missing in producer's OpDef: ",
-            SummarizeOpDef(*producer_op_def), " but found in node: ",
-            SummarizeNodeDef(*node_def));
+            "Attr '", attr.first,
+            "' missing in producer's OpDef: ", SummarizeOpDef(*producer_op_def),
+            " but found in node: ", SummarizeNodeDef(*node_def));
       }
       // ...and it has the same value as the default in producer,
       if (producer_attr_def->has_default_value() &&
diff --git a/tensorflow/core/framework/memory_types.cc b/tensorflow/core/framework/memory_types.cc
index 6a2eed94b94971d20faffa1608627290c1109d66..270118bb678e110269be9aa67a3904e36c34c512 100644
--- a/tensorflow/core/framework/memory_types.cc
+++ b/tensorflow/core/framework/memory_types.cc
@@ -61,7 +61,8 @@ void MemoryTypesHelper(const NameRangeMap& name_map,
 }
 
 MemoryType MTypeFromDType(const DataType dtype) {
-  return (dtype == DT_INT32) ? HOST_MEMORY : DEVICE_MEMORY;
+  return (dtype == DT_INT32 || DataTypeAlwaysOnHost(dtype)) ? HOST_MEMORY
+                                                            : DEVICE_MEMORY;
 }
 
 }  // namespace
@@ -118,6 +119,20 @@ Status MemoryTypesForNode(const OpRegistryInterface* op_registry,
         "HostMemory args '", str_util::Join(host_memory_args, "', '"),
         "' not found in OpDef: ", SummarizeOpDef(*op_def));
   }
+  CHECK_LE(inp_mtypes->size(), inp_dtypes.size());
+  CHECK_LE(out_mtypes->size(), out_dtypes.size());
+
+  // Mark e.g. all resource and string types as host memory.
+  for (int i = 0; i < inp_mtypes->size(); ++i) {
+    if (DataTypeAlwaysOnHost(inp_dtypes[i])) {
+      (*inp_mtypes)[i] = HOST_MEMORY;
+    }
+  }
+  for (int i = 0; i < out_mtypes->size(); ++i) {
+    if (DataTypeAlwaysOnHost(out_dtypes[i])) {
+      (*out_mtypes)[i] = HOST_MEMORY;
+    }
+  }
 
   std::vector<int32> hostmem_attr;
   if (GetNodeAttr(ndef, "_input_hostmem", &hostmem_attr).ok()) {
diff --git a/tensorflow/core/framework/memory_types_test.cc b/tensorflow/core/framework/memory_types_test.cc
index 4704da9a119c2b06db5c8b1a3874417a0b1c3617..3126ea8e5f8974cb11f88301de613eb5b920830f 100644
--- a/tensorflow/core/framework/memory_types_test.cc
+++ b/tensorflow/core/framework/memory_types_test.cc
@@ -36,11 +36,13 @@ REGISTER_OP("HostMemoryTest")
     .Input("b: T")
     .Input("c: N * string")
     .Input("d: Tlist")
+    .Input("e: Rlist")
     .Output("o: N * T")
     .Output("p: Tlist")
     .Attr("T: type")
     .Attr("N: int")
-    .Attr("Tlist: list(type)");
+    .Attr("Tlist: list(type)")
+    .Attr("Rlist: list(type)");
 REGISTER_KERNEL_BUILDER(Name("HostMemoryTest").Device(DEVICE_CPU), DummyKernel);
 REGISTER_KERNEL_BUILDER(Name("HostMemoryTest")
                             .Device(DEVICE_GPU)
@@ -57,15 +59,20 @@ TEST(MemoryTypesForNode, Simple) {
                    .Input(FakeInput(DT_BOOL))
                    .Input(FakeInput(3))
                    .Input(FakeInput({DT_INT32, DT_FLOAT, DT_INT32}))
+                   .Input(FakeInput({DT_RESOURCE, DT_STRING, DT_RESOURCE}))
                    .Finalize(&node_def));
   MemoryTypeVector input, output;
 
   TF_EXPECT_OK(MemoryTypesForNode(OpRegistry::Global(), DEVICE_CPU, node_def,
                                   &input, &output));
-  EXPECT_EQ(MemoryTypeVector({DEVICE_MEMORY, DEVICE_MEMORY, DEVICE_MEMORY,
-                              DEVICE_MEMORY, DEVICE_MEMORY, DEVICE_MEMORY,
-                              DEVICE_MEMORY, DEVICE_MEMORY}),
-            input);
+  // a:float, b:bool, c:3*string, d:(int32, float, int32),
+  // e:(resource, string, resource)
+  EXPECT_EQ(
+      MemoryTypeVector({DEVICE_MEMORY, DEVICE_MEMORY, HOST_MEMORY, HOST_MEMORY,
+                        HOST_MEMORY, DEVICE_MEMORY, DEVICE_MEMORY,
+                        DEVICE_MEMORY, HOST_MEMORY, HOST_MEMORY, HOST_MEMORY}),
+      input);
+  // o:3*bool, p:(int32, float, int32)
   EXPECT_EQ(MemoryTypeVector({DEVICE_MEMORY, DEVICE_MEMORY, DEVICE_MEMORY,
                               DEVICE_MEMORY, DEVICE_MEMORY, DEVICE_MEMORY}),
             output);
@@ -74,7 +81,8 @@ TEST(MemoryTypesForNode, Simple) {
                                   &input, &output));
   EXPECT_EQ(
       MemoryTypeVector({HOST_MEMORY, DEVICE_MEMORY, HOST_MEMORY, HOST_MEMORY,
-                        HOST_MEMORY, HOST_MEMORY, HOST_MEMORY, HOST_MEMORY}),
+                        HOST_MEMORY, HOST_MEMORY, HOST_MEMORY, HOST_MEMORY,
+                        HOST_MEMORY, HOST_MEMORY, HOST_MEMORY}),
       input);
   EXPECT_EQ(MemoryTypeVector({HOST_MEMORY, HOST_MEMORY, HOST_MEMORY,
                               DEVICE_MEMORY, DEVICE_MEMORY, DEVICE_MEMORY}),
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
index 477184022df4bb7e4d329cc5ed09572f9dbe9585..95fb3863144e8150d78f5d21722f6bc102c451ea 100644
--- a/tensorflow/core/framework/node_def_util.cc
+++ b/tensorflow/core/framework/node_def_util.cc
@@ -347,6 +347,36 @@ Status AddArgToSig(const NodeDef& node_def, const OpDef::ArgDef& arg_def,
 
 }  // namespace
 
+Status InputTypeForNode(const NodeDef& node_def, const OpDef& op_def,
+                        int input_port, DataType* input_type) {
+  DataTypeVector input_types;
+  for (const auto& arg : op_def.input_arg()) {
+    TF_RETURN_IF_ERROR(AddArgToSig(node_def, arg, &input_types));
+    if (input_types.size() > input_port) {
+      const DataType dtype = input_types[input_port];
+      *input_type = dtype;
+      return Status::OK();
+    }
+  }
+  return errors::InvalidArgument("Input ", input_port, " not found for node ",
+                                 node_def.name());
+}
+
+Status OutputTypeForNode(const NodeDef& node_def, const OpDef& op_def,
+                         int output_port, DataType* output_type) {
+  DataTypeVector output_types;
+  for (const auto& arg : op_def.output_arg()) {
+    TF_RETURN_IF_ERROR(AddArgToSig(node_def, arg, &output_types));
+    if (output_types.size() > output_port) {
+      const DataType dtype = output_types[output_port];
+      *output_type = dtype;
+      return Status::OK();
+    }
+  }
+  return errors::InvalidArgument("Output ", output_port, " not found for node ",
+                                 node_def.name());
+}
+
 Status InOutTypesForNode(const NodeDef& node_def, const OpDef& op_def,
                          DataTypeVector* inputs, DataTypeVector* outputs) {
   for (const auto& arg : op_def.input_arg()) {
diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h
index f6f28aac4811d30b845191735536b389e41bf259..b8a1e84f2e79d6537f58e9ac15ff8e1a22f877c7 100644
--- a/tensorflow/core/framework/node_def_util.h
+++ b/tensorflow/core/framework/node_def_util.h
@@ -23,6 +23,8 @@ limitations under the License.
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
@@ -237,6 +239,14 @@ bool GetNodeAttrSimple(const AttrSlice& attrs, StringPiece attr_name,
 // REQUIRES: Must not use the returned value beyond the lifetime of node_def.
 const string& GetNodeAttrString(const AttrSlice& attrs, StringPiece attr_name);
 
+// Computes the input type for a specific node input.
+// REQUIRES: ValidateOpDef(op_def).ok()
+Status InputTypeForNode(const NodeDef& node_def, const OpDef& op_def,
+                        int input_port, DataType* input_type);
+// Computes the output type for a specific node output.
+// REQUIRES: ValidateOpDef(op_def).ok()
+Status OutputTypeForNode(const NodeDef& node_def, const OpDef& op_def,
+                         int output_port, DataType* output_type);
 // Computes the input and output types for a specific node.
 // REQUIRES: ValidateOpDef(op_def).ok()
 Status InOutTypesForNode(const NodeDef& node_def, const OpDef& op_def,
@@ -253,8 +263,12 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def);
 // corresponding input/output index range.  For example,
 // input "foo" corresponds to input indices
 //   [ (*inputs)["foo"].first, (*inputs)["foo"].second ).
-// TODO(irving): Remove the NodeDef version; keep only the Node version.
-typedef std::unordered_map<string, std::pair<int, int>> NameRangeMap;
+// NOTE(mrry): To reduce allocations when the map is used and save
+// space, the returned `NameRangeMap` objects borrow the input/output
+// argument names from `op_def`. The `op_def` must outlive the
+// returned `NameRangeMap` objects.
+typedef gtl::FlatMap<StringPiece, std::pair<int, int>, hash<StringPiece>>
+    NameRangeMap;
 Status NameRangesForNode(const NodeDef& node_def, const OpDef& op_def,
                          NameRangeMap* inputs, NameRangeMap* outputs);
 Status NameRangesForNode(const Node& node, const OpDef& op_def,
diff --git a/tensorflow/core/framework/node_def_util_test.cc b/tensorflow/core/framework/node_def_util_test.cc
index bfd598a97202e4bcbf1f869b2687f7cbca36b36b..ae3a93eafeefb2be3a85e546c085691a72caf2e1 100644
--- a/tensorflow/core/framework/node_def_util_test.cc
+++ b/tensorflow/core/framework/node_def_util_test.cc
@@ -151,8 +151,9 @@ TEST(NodeDefUtilTest, Out) {
   AddNodeAttr("T", DT_STRING, &bad);
   ExpectFailure(bad, op,
                 "Value for attr 'T' of string is not in the list of allowed "
-                "values: float, double, int64, int32, uint8, uint16, int16, "
-                "int8, complex64, complex128, qint8, quint8, qint32");
+                "values: float, double, int32, uint8, int16, int8, complex64, "
+                "int64, qint8, quint8, qint32, bfloat16, uint16, complex128, "
+                "half, uint32, uint64");
 }
 
 TEST(NodeDefUtilTest, Enum) {
diff --git a/tensorflow/core/framework/numeric_types.h b/tensorflow/core/framework/numeric_types.h
index 2b080e13fdb8308f71c967ab14c6ed71ccd8f357..4c38fbbe591a5d07ba4cbbea00dcbfb41ca2f403 100644
--- a/tensorflow/core/framework/numeric_types.h
+++ b/tensorflow/core/framework/numeric_types.h
@@ -17,7 +17,6 @@ limitations under the License.
 #define TENSORFLOW_FRAMEWORK_NUMERIC_TYPES_H_
 
 #include <complex>
-
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 // Disable clang-format to prevent 'FixedPoint' header from being included
 // before 'Tensor' header on which it depends.
@@ -41,108 +40,77 @@ typedef Eigen::QInt32 qint32;
 typedef Eigen::QInt16 qint16;
 typedef Eigen::QUInt16 quint16;
 
-// see framework/bfloat16.h for description.
-struct bfloat16 {
-  EIGEN_DEVICE_FUNC bfloat16() {}
+}  // namespace tensorflow
 
-  EIGEN_DEVICE_FUNC explicit bfloat16(const float v) {
-    const uint16_t* p = reinterpret_cast<const uint16_t*>(&v);
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    value = p[0];
-#else
-    value = p[1];
-#endif
-  }
 
-  template <class T>
-  explicit EIGEN_DEVICE_FUNC bfloat16(const T& val)
-      : bfloat16(static_cast<float>(val)) {}
 
-  EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
-    float result;
-
-    uint16_t* q = reinterpret_cast<uint16_t*>(&result);
 
+static inline tensorflow::bfloat16 FloatToBFloat16(float float_val) {
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    q[0] = value;
-    q[1] = 0;
+    return *reinterpret_cast<tensorflow::bfloat16*>(
+        reinterpret_cast<uint16_t*>(&float_val));
 #else
-    q[0] = 0;
-    q[1] = value;
+    return *reinterpret_cast<tensorflow::bfloat16*>(
+        &(reinterpret_cast<uint16_t*>(&float_val)[1]));
 #endif
-    return result;
-  }
-
-  EIGEN_DEVICE_FUNC explicit operator bool() const {
-    return static_cast<bool>(float(*this));
-  }
-
-  EIGEN_DEVICE_FUNC explicit operator Eigen::half() const {
-    return static_cast<Eigen::half>(float(*this));
-  }
-
-  EIGEN_DEVICE_FUNC explicit operator short() const {
-    return static_cast<short>(float(*this));
-  }
-
-  EIGEN_DEVICE_FUNC explicit operator int() const {
-    return static_cast<int>(float(*this));
-  }
-
-  EIGEN_DEVICE_FUNC explicit operator char() const {
-    return static_cast<char>(float(*this));
-  }
-
-  EIGEN_DEVICE_FUNC explicit operator signed char() const {
-    return static_cast<signed char>(float(*this));
-  }
-
-  EIGEN_DEVICE_FUNC explicit operator unsigned char() const {
-    return static_cast<unsigned char>(float(*this));
+}
+    
+namespace Eigen {
+// TODO(xpan): We probably need to overwrite more methods to have correct eigen
+// behavior. E.g. epsilon(), dummy_precision, etc. See NumTraits.h in eigen.
+template <>
+struct NumTraits<tensorflow::bfloat16>
+    : GenericNumTraits<tensorflow::bfloat16> {
+  enum {
+    IsInteger = 0,
+    IsSigned = 1,
+    RequireInitialization = 0
+  };
+  static EIGEN_STRONG_INLINE tensorflow::bfloat16 highest() {
+    return FloatToBFloat16(NumTraits<float>::highest());
   }
 
-  EIGEN_DEVICE_FUNC explicit operator unsigned int() const {
-    return static_cast<unsigned int>(float(*this));
+  static EIGEN_STRONG_INLINE tensorflow::bfloat16 lowest() {
+    return FloatToBFloat16(NumTraits<float>::lowest());
   }
 
-  EIGEN_DEVICE_FUNC explicit operator unsigned long() const {
-    return static_cast<unsigned long>(float(*this));
+  static EIGEN_STRONG_INLINE tensorflow::bfloat16 infinity() {
+    return FloatToBFloat16(NumTraits<float>::infinity());
   }
 
-  EIGEN_DEVICE_FUNC explicit operator unsigned long long() const {
-    return static_cast<unsigned long long>(float(*this));
+  static EIGEN_STRONG_INLINE tensorflow::bfloat16 quiet_NaN() {
+    return FloatToBFloat16(NumTraits<float>::quiet_NaN());
   }
+};
 
-  EIGEN_DEVICE_FUNC explicit operator long long() const {
-    return static_cast<long long>(float(*this));
-  }
 
-  EIGEN_DEVICE_FUNC explicit operator double() const {
-    return static_cast<double>(float(*this));
-  }
+using ::tensorflow::operator==;
+using ::tensorflow::operator!=;
 
-  uint16_t value;
-};
+namespace numext {
 
-inline bool operator==(const bfloat16 a, const bfloat16 b) {
-  return a.value == b.value;
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE tensorflow::bfloat16 log(
+    const tensorflow::bfloat16& x) {
+  return static_cast<tensorflow::bfloat16>(::logf(static_cast<float>(x)));
 }
 
-inline bool operator!=(const bfloat16 a, const bfloat16 b) {
-  return a.value != b.value;
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE tensorflow::bfloat16 exp(
+    const tensorflow::bfloat16& x) {
+  return static_cast<tensorflow::bfloat16>(::expf(static_cast<float>(x)));
 }
 
-}  // end namespace tensorflow
-
-namespace Eigen {
 template <>
-struct NumTraits<tensorflow::bfloat16> : GenericNumTraits<uint16_t> {};
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE tensorflow::bfloat16 abs(
+    const tensorflow::bfloat16& x) {
+  return static_cast<tensorflow::bfloat16>(::fabsf(static_cast<float>(x)));
+}
 
-using ::tensorflow::operator==;
-using ::tensorflow::operator!=;
+}  // namespace numext
 }  // namespace Eigen
 
-#ifdef COMPILER_MSVC
+#if defined(COMPILER_MSVC) && !defined(__clang__)
 namespace std {
 template <>
 struct hash<Eigen::half> {
diff --git a/tensorflow/core/framework/op_compatibility_test.cc b/tensorflow/core/framework/op_compatibility_test.cc
index ae2fdae379a21289df2e0eb2dd5cbda0a6d5ed81..b57bdcb841592578de4a2026d70b0e91bae66b02 100644
--- a/tensorflow/core/framework/op_compatibility_test.cc
+++ b/tensorflow/core/framework/op_compatibility_test.cc
@@ -163,6 +163,26 @@ class OpCompatibilityTest : public OpsTestBase {
 
     ExpectIncompatible(old_op_def, *new_op_def, compatibility_error);
   }
+
+  void ExpectDefaultChangeFailure(const OpDef& old_op_def,
+                                  const string& compatibility_error) {
+    // This should be all that is needed to get compatibility.
+    const OpDef* new_op_def = RegisteredOpDef();
+    AddDefaultsToNodeDef(*new_op_def, node_def());
+
+    // Validate that the NodeDef is valid.
+    TF_ASSERT_OK(ValidateNodeDef(*node_def(), *new_op_def));
+
+    Status status = OpDefAttrDefaultsUnchanged(old_op_def, *new_op_def);
+    if (status.ok()) {
+      ADD_FAILURE() << SummarizeOpDef(old_op_def) << " vs. "
+                    << SummarizeOpDef(*new_op_def);
+    } else {
+      EXPECT_TRUE(
+          StringPiece(status.error_message()).contains(compatibility_error))
+          << status << " does not contain " << compatibility_error;
+    }
+  }
 };
 
 // Should be compatible if the Op hasn't changed (sanity check).
@@ -260,40 +280,6 @@ TEST_F(OpCompatibilityTest, AttrOrder) {
   EXPECT_EQ("attr_order = AttrOrder[a=7, b=true]()", Result());
 }
 
-// Should be able to add a default to an attr.
-REGISTER_OP("AddDefault").Output("ndef: string").Attr("a: int = 1234");
-REGISTER_KERNEL_BUILDER(Name("AddDefault").Device(DEVICE_CPU), TestKernel);
-
-TEST_F(OpCompatibilityTest, AddDefault) {
-  OpRegistrationData old_op;
-  TF_ASSERT_OK(OpDefBuilder("AddDefault")
-                   .Output("ndef: string")
-                   .Attr("a: int")
-                   .Finalize(&old_op));
-  TF_ASSERT_OK(NodeDefBuilder("add_default", &old_op.op_def)
-                   .Attr("a", 765)
-                   .Finalize(node_def()));
-  ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("add_default = AddDefault[a=765]()", Result());
-}
-
-// Should be able to remove a default from an attr, *as long as that
-// attr has always existed*.
-REGISTER_OP("RemoveDefault").Output("ndef: string").Attr("a: int");
-REGISTER_KERNEL_BUILDER(Name("RemoveDefault").Device(DEVICE_CPU), TestKernel);
-
-TEST_F(OpCompatibilityTest, RemoveDefault) {
-  OpRegistrationData old_op;
-  TF_ASSERT_OK(OpDefBuilder("RemoveDefault")
-                   .Output("ndef: string")
-                   .Attr("a: int = 91")
-                   .Finalize(&old_op));
-  TF_ASSERT_OK(
-      NodeDefBuilder("remove_default", &old_op.op_def).Finalize(node_def()));
-  ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("remove_default = RemoveDefault[a=91]()", Result());
-}
-
 // Should be able to make an input/output polymorphic.
 // Changing from int32 -> T (where T: type = DT_INT32 by default).
 REGISTER_OP("TypePolymorphic")
@@ -1054,9 +1040,56 @@ TEST_F(OpCompatibilityTest, RenameOutputListFails) {
                       "Output signature mismatch 'old:T' vs. 'new:T'");
 }
 
-// Changing an attr's default is not technically illegal, but should
-// be forbidden if it the attr ever didn't exist since it likely
-// affects semantics.
+// Should not be able to add a default to an attr.
+REGISTER_OP("AddDefault").Output("ndef: string").Attr("a: int = 1234");
+REGISTER_KERNEL_BUILDER(Name("AddDefault").Device(DEVICE_CPU), TestKernel);
+
+TEST_F(OpCompatibilityTest, AddDefault) {
+  OpRegistrationData old_op;
+  TF_ASSERT_OK(OpDefBuilder("AddDefault")
+                   .Output("ndef: string")
+                   .Attr("a: int")
+                   .Finalize(&old_op));
+  TF_ASSERT_OK(NodeDefBuilder("add_default", &old_op.op_def)
+                   .Attr("a", 765)
+                   .Finalize(node_def()));
+  ExpectDefaultChangeFailure(
+      old_op.op_def,
+      "Attr 'a' has added/removed it's default; from no default to 1234");
+}
+
+// Should not be able to remove a default from an attr.
+REGISTER_OP("RemoveDefault").Output("ndef: string").Attr("a: int");
+REGISTER_KERNEL_BUILDER(Name("RemoveDefault").Device(DEVICE_CPU), TestKernel);
+
+TEST_F(OpCompatibilityTest, RemoveDefault) {
+  OpRegistrationData old_op;
+  TF_ASSERT_OK(OpDefBuilder("RemoveDefault")
+                   .Output("ndef: string")
+                   .Attr("a: int = 91")
+                   .Finalize(&old_op));
+  TF_ASSERT_OK(
+      NodeDefBuilder("remove_default", &old_op.op_def).Finalize(node_def()));
+  ExpectDefaultChangeFailure(
+      old_op.op_def,
+      "Attr 'a' has added/removed it's default; from 91 to no default");
+}
+
+// Should not be able to change a default for an attr.
+REGISTER_OP("ChangeDefault").Output("ndef: string").Attr("a: int = 1");
+REGISTER_KERNEL_BUILDER(Name("ChangeDefault").Device(DEVICE_CPU), TestKernel);
+
+TEST_F(OpCompatibilityTest, ChangeDefault) {
+  OpRegistrationData old_op;
+  TF_ASSERT_OK(OpDefBuilder("ChangeDefault")
+                   .Output("ndef: string")
+                   .Attr("a: int = 2")
+                   .Finalize(&old_op));
+  TF_ASSERT_OK(
+      NodeDefBuilder("change_default", &old_op.op_def).Finalize(node_def()));
+  ExpectDefaultChangeFailure(
+      old_op.op_def, "Attr 'a' has changed it's default value; from 2 to 1");
+}
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/op_def_builder_test.cc b/tensorflow/core/framework/op_def_builder_test.cc
index c1511ebe340d99fc67f588596e028cca92e23250..9b24e3aa00425321eda2e196b1e7b243a552c730 100644
--- a/tensorflow/core/framework/op_def_builder_test.cc
+++ b/tensorflow/core/framework/op_def_builder_test.cc
@@ -124,22 +124,23 @@ TEST_F(OpDefBuilderTest, AttrWithRestrictions) {
       "attr: { name: 'a' type: 'type' allowed_values { list { type: "
       "[DT_HALF, DT_FLOAT, DT_DOUBLE, DT_INT64, DT_INT32, DT_UINT8, DT_INT16, "
       "DT_UINT16, DT_INT8, DT_COMPLEX64, DT_COMPLEX128, DT_QINT8, DT_QUINT8, "
-      "DT_QINT32, DT_UINT32, DT_UINT64] } } }");
+      "DT_QINT32, DT_UINT32, DT_UINT64, DT_BFLOAT16] } } }");
   ExpectSuccess(
       b().Attr("a:{numbertype, variant}"),
       "attr: { name: 'a' type: 'type' allowed_values { list { type: "
       "[DT_HALF, DT_FLOAT, DT_DOUBLE, DT_INT64, DT_INT32, DT_UINT8, DT_INT16, "
       "DT_UINT16, DT_INT8, DT_COMPLEX64, DT_COMPLEX128, DT_QINT8, DT_QUINT8, "
-      "DT_QINT32, DT_UINT32, DT_UINT64, DT_VARIANT] } } }");
+      "DT_QINT32, DT_UINT32, DT_UINT64, DT_BFLOAT16, DT_VARIANT] } } }");
   ExpectSuccess(b().Attr("a:realnumbertype"),
                 "attr: { name: 'a' type: 'type' allowed_values { list { type: "
                 "[DT_HALF, DT_FLOAT, DT_DOUBLE, DT_INT64, DT_INT32, DT_UINT8, "
-                "DT_INT16, DT_UINT16, DT_INT8, DT_UINT32, DT_UINT64] } } }");
+                "DT_INT16, DT_UINT16, DT_INT8, DT_UINT32, DT_UINT64, "
+                "DT_BFLOAT16] } } }");
   ExpectSuccess(b().Attr("a:{realnumbertype,  variant , string, }"),
                 "attr: { name: 'a' type: 'type' allowed_values { list { type: "
                 "[DT_HALF, DT_FLOAT, DT_DOUBLE, DT_INT64, DT_INT32, DT_UINT8, "
                 "DT_INT16, DT_UINT16, DT_INT8, DT_UINT32, DT_UINT64, "
-                "DT_VARIANT, DT_STRING] } } }");
+                "DT_BFLOAT16, DT_VARIANT, DT_STRING] } } }");
   ExpectSuccess(b().Attr("a:quantizedtype"),
                 "attr: { name: 'a' type: 'type' allowed_values { list { type: "
                 "[DT_QINT8, DT_QUINT8, DT_QINT32, DT_QINT16, DT_QUINT16]} } }");
@@ -216,12 +217,14 @@ TEST_F(OpDefBuilderTest, AttrListOfRestricted) {
       b().Attr("a:list(realnumbertype)"),
       "attr: { name: 'a' type: 'list(type)' allowed_values { list { type: "
       "[DT_FLOAT, DT_DOUBLE, DT_INT64, DT_INT32, DT_UINT8, DT_INT16, "
-      "DT_UINT16, DT_INT8, DT_HALF, DT_UINT32, DT_UINT64] } } }");
+      "DT_UINT16, DT_INT8, DT_HALF, DT_BFLOAT16, DT_UINT32, DT_UINT64"
+      "] } } }");
   ExpectSuccess(
       b().Attr("a:list({realnumbertype, variant})"),
       "attr: { name: 'a' type: 'list(type)' allowed_values { list { type: "
       "[DT_FLOAT, DT_DOUBLE, DT_INT64, DT_INT32, DT_UINT8, DT_INT16, "
-      "DT_UINT16, DT_INT8, DT_HALF, DT_UINT32, DT_UINT64, DT_VARIANT] } } }");
+      "DT_UINT16, DT_INT8, DT_HALF, DT_BFLOAT16, DT_UINT32, DT_UINT64, "
+      "DT_VARIANT] } } }");
   ExpectSuccess(
       b().Attr("a:list(quantizedtype)"),
       "attr: { name: 'a' type: 'list(type)' allowed_values { list { type: "
diff --git a/tensorflow/core/framework/op_def_util.cc b/tensorflow/core/framework/op_def_util.cc
index 29feda499fd2646a00c1f5bc9fc7223e9f134af9..2d035ab90d0f4493f6b6f572d0dd8550f5098e7e 100644
--- a/tensorflow/core/framework/op_def_util.cc
+++ b/tensorflow/core/framework/op_def_util.cc
@@ -170,20 +170,20 @@ const OpDef::ArgDef* FindInputArg(StringPiece name, const OpDef& op_def) {
   return nullptr;
 }
 
-#define VALIDATE(EXPR, ...)                                          \
-  do {                                                               \
-    if (!(EXPR)) {                                                   \
-      return errors::InvalidArgument(__VA_ARGS__, "; in OpDef: ",    \
-                                     ProtoShortDebugString(op_def)); \
-    }                                                                \
+#define VALIDATE(EXPR, ...)                                            \
+  do {                                                                 \
+    if (!(EXPR)) {                                                     \
+      return errors::InvalidArgument(                                  \
+          __VA_ARGS__, "; in OpDef: ", ProtoShortDebugString(op_def)); \
+    }                                                                  \
   } while (false)
 
 static Status ValidateArg(const OpDef::ArgDef& arg, const OpDef& op_def,
                           bool output, std::set<string>* names) {
   const string suffix = strings::StrCat(
       output ? " for output '" : " for input '", arg.name(), "'");
-  VALIDATE(gtl::InsertIfNotPresent(names, arg.name()), "Duplicate name: ",
-           arg.name());
+  VALIDATE(gtl::InsertIfNotPresent(names, arg.name()),
+           "Duplicate name: ", arg.name());
   VALIDATE(HasAttrStyleType(arg), "Missing type", suffix);
 
   if (!arg.number_attr().empty()) {
@@ -250,8 +250,8 @@ Status ValidateOpDef(const OpDef& op_def) {
   std::set<string> names;  // for detecting duplicate names
   for (const auto& attr : op_def.attr()) {
     // Validate name
-    VALIDATE(gtl::InsertIfNotPresent(&names, attr.name()), "Duplicate name: ",
-             attr.name());
+    VALIDATE(gtl::InsertIfNotPresent(&names, attr.name()),
+             "Duplicate name: ", attr.name());
     DataType dt;
     VALIDATE(!DataTypeFromString(attr.name(), &dt), "Attr can't have name ",
              attr.name(), " that matches a data type");
@@ -449,6 +449,11 @@ string AllowedStr(const OpDef::AttrDef& attr) {
   return SummarizeAttrValue(attr.allowed_values());
 }
 
+string DefaultAttrStr(const OpDef::AttrDef& attr) {
+  if (!attr.has_default_value()) return "no default";
+  return SummarizeAttrValue(attr.default_value());
+}
+
 bool HigherMinimum(const OpDef::AttrDef& old_attr,
                    const OpDef::AttrDef& new_attr) {
   // Anything -> no restriction : not more restrictive.
@@ -675,8 +680,8 @@ Status OpDefAddedDefaultsUnchanged(const OpDef& old_op,
     if (!penultimate_attr.has_default_value() ||
         !new_attr->has_default_value()) {
       return errors::InvalidArgument("Missing default for attr '",
-                                     penultimate_attr.name(), "' in op: ",
-                                     SummarizeOpDef(new_op));
+                                     penultimate_attr.name(),
+                                     "' in op: ", SummarizeOpDef(new_op));
     }
 
     // Actually test that the attr's default value hasn't changed.
@@ -692,6 +697,32 @@ Status OpDefAddedDefaultsUnchanged(const OpDef& old_op,
   return Status::OK();
 }
 
+Status OpDefAttrDefaultsUnchanged(const OpDef& old_op, const OpDef& new_op) {
+  AttrMap new_attrs, old_attrs;
+  FillAttrMap(old_op, &old_attrs);
+  FillAttrMap(new_op, &new_attrs);
+
+  for (const auto& old_attr : old_op.attr()) {
+    const OpDef::AttrDef* new_attr =
+        gtl::FindPtrOrNull(new_attrs, old_attr.name());
+    if (new_attr == nullptr) continue;
+    if (old_attr.has_default_value() != new_attr->has_default_value()) {
+      return errors::InvalidArgument(
+          "Attr '", old_attr.name(), "' has added/removed it's default; ",
+          "from ", DefaultAttrStr(old_attr), " to ", DefaultAttrStr(*new_attr));
+    }
+    if (old_attr.has_default_value() &&
+        !AreAttrValuesEqual(old_attr.default_value(),
+                            new_attr->default_value())) {
+      return errors::InvalidArgument(
+          "Attr '", old_attr.name(), "' has changed it's default value; ",
+          "from ", DefaultAttrStr(old_attr), " to ", DefaultAttrStr(*new_attr));
+    }
+  }
+
+  return Status::OK();
+}
+
 void RemoveNonDeprecationDescriptionsFromOpDef(OpDef* op_def) {
   for (int i = 0; i < op_def->input_arg_size(); ++i) {
     op_def->mutable_input_arg(i)->clear_description();
diff --git a/tensorflow/core/framework/op_def_util.h b/tensorflow/core/framework/op_def_util.h
index f9661dceddc1a3de694024dddb9afce1cae8680c..0ba1325a03b148e0a1c8fe94723e2dc5503773d1 100644
--- a/tensorflow/core/framework/op_def_util.h
+++ b/tensorflow/core/framework/op_def_util.h
@@ -63,6 +63,10 @@ Status OpDefAddedDefaultsUnchanged(const OpDef& old_op,
                                    const OpDef& penultimate_op,
                                    const OpDef& new_op);
 
+// Returns an error if the default value for any attr is added/removed/modified
+// in new_op compared to old_op.
+Status OpDefAttrDefaultsUnchanged(const OpDef& old_op, const OpDef& new_op);
+
 // Remove all docs from *op_def / *op_list.
 void RemoveDescriptionsFromOpDef(OpDef* op_def);
 void RemoveDescriptionsFromOpList(OpList* op_list);
@@ -78,7 +82,7 @@ bool AttrDefEqual(const OpDef::AttrDef& a1, const OpDef::AttrDef& a2);
 uint64 AttrDefHash(const OpDef::AttrDef& a);
 
 // Returns true if all AttrDefs in `a1` equal corresponding AttrDefs in
-// `a2`. Corrspondence is established by name.
+// `a2`. Correspondence is established by name.
 bool RepeatedAttrDefEqual(const protobuf::RepeatedPtrField<OpDef::AttrDef>& a1,
                           const protobuf::RepeatedPtrField<OpDef::AttrDef>& a2);
 
diff --git a/tensorflow/core/framework/op_def_util_test.cc b/tensorflow/core/framework/op_def_util_test.cc
index 28809c11c58704479c9c45b1de96dffef3d575bd..2b9812d4fcbc145540155959b19dd37cf902c1a2 100644
--- a/tensorflow/core/framework/op_def_util_test.cc
+++ b/tensorflow/core/framework/op_def_util_test.cc
@@ -200,10 +200,11 @@ TEST_F(ValidateOpDefTest, BadAttrDefault) {
                           "default_value { list { s: ['foo'] } } }"),
                 "Length for attr 'a' of 1 must be at least minimum 2\n\t in Op "
                 "'BadAttrDef'");
-  ExpectFailure(TestBuilder(OpDefBuilder("GoodAttrDef")
-                                .Attr("a: list(type) >=2 = [DT_STRING]")),
-                "Length for attr 'a' of 1 must be at least minimum 2\n\t in Op "
-                "'GoodAttrDef'");
+  ExpectFailure(
+      TestBuilder(
+          OpDefBuilder("GoodAttrDef").Attr("a: list(type) >=2 = [DT_STRING]")),
+      "Length for attr 'a' of 1 must be at least minimum 2\n\t in Op "
+      "'GoodAttrDef'");
 }
 
 TEST_F(ValidateOpDefTest, NoRefTypes) {
@@ -213,9 +214,10 @@ TEST_F(ValidateOpDefTest, NoRefTypes) {
   ExpectFailure(
       TestBuilder(OpDefBuilder("BadAttrDef").Attr("T: type = DT_INT32_REF")),
       "AttrValue must not have reference type value of int32_ref");
-  ExpectFailure(TestBuilder(OpDefBuilder("BadAttrDef")
-                                .Attr("T: list(type) = [DT_STRING_REF]")),
-                "AttrValue must not have reference type value of string_ref");
+  ExpectFailure(
+      TestBuilder(
+          OpDefBuilder("BadAttrDef").Attr("T: list(type) = [DT_STRING_REF]")),
+      "AttrValue must not have reference type value of string_ref");
 }
 
 TEST_F(ValidateOpDefTest, BadAttrMin) {
@@ -245,9 +247,10 @@ TEST_F(ValidateOpDefTest, BadAttrAllowed) {
   TF_EXPECT_OK(TestBuilder(
       OpDefBuilder("GoodAttrtude").Attr("x: numbertype = DT_INT32")));
   // Not in list of allowed types.
-  ExpectFailure(TestBuilder(OpDefBuilder("BadAttrtude")
-                                .Attr("x: numbertype = DT_STRING")),
-                "attr 'x' of string is not in the list of allowed values");
+  ExpectFailure(
+      TestBuilder(
+          OpDefBuilder("BadAttrtude").Attr("x: numbertype = DT_STRING")),
+      "attr 'x' of string is not in the list of allowed values");
   ExpectFailure(
       TestBuilder(OpDefBuilder("BadAttrtude")
                       .Attr("x: list(realnumbertype) = [DT_COMPLEX64]")),
@@ -260,9 +263,10 @@ TEST_F(ValidateOpDefTest, BadAttrAllowed) {
   TF_EXPECT_OK(TestBuilder(
       OpDefBuilder("GoodAttrtude").Attr("x: {'foo', 'bar'} = 'bar'")));
   // Not in list of allowed strings.
-  ExpectFailure(TestBuilder(OpDefBuilder("BadAttrtude")
-                                .Attr("x: {'foo', 'bar'} = 'baz'")),
-                "attr 'x' of \"baz\" is not in the list of allowed values");
+  ExpectFailure(
+      TestBuilder(
+          OpDefBuilder("BadAttrtude").Attr("x: {'foo', 'bar'} = 'baz'")),
+      "attr 'x' of \"baz\" is not in the list of allowed values");
   ExpectFailure(TestBuilder(OpDefBuilder("BadAttrtude")
                                 .Attr("x: list({'foo', 'bar'}) = ['baz']")),
                 "attr 'x' of \"baz\" is not in the list of allowed values");
diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc
index 95a9b763f92d1688d97cdbafc51d7e32d7875315..5f2eb9d99ab11f9862bd277d93af61c05e2517f4 100644
--- a/tensorflow/core/framework/op_gen_lib.cc
+++ b/tensorflow/core/framework/op_gen_lib.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include <vector>
 #include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/op_gen_overrides.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -246,29 +245,6 @@ string PBTxtFromMultiline(StringPiece multiline_pbtxt) {
   return pbtxt;
 }
 
-OpGenOverrideMap::OpGenOverrideMap() {}
-OpGenOverrideMap::~OpGenOverrideMap() {}
-
-Status OpGenOverrideMap::LoadFileList(Env* env, const string& filenames) {
-  std::vector<string> v = str_util::Split(filenames, ",");
-  for (const string& f : v) {
-    TF_RETURN_IF_ERROR(LoadFile(env, f));
-  }
-  return Status::OK();
-}
-
-Status OpGenOverrideMap::LoadFile(Env* env, const string& filename) {
-  if (filename.empty()) return Status::OK();
-  string contents;
-  TF_RETURN_IF_ERROR(ReadFileToString(env, filename, &contents));
-  OpGenOverrides all;
-  protobuf::TextFormat::ParseFromString(contents, &all);
-  for (const auto& one : all.op()) {
-    map_[one.name()].reset(new OpGenOverride(one));
-  }
-  return Status::OK();
-}
-
 static void StringReplace(const string& from, const string& to, string* s) {
   // Split *s into pieces delimited by `from`.
   std::vector<string> split;
@@ -290,35 +266,6 @@ static void StringReplace(const string& from, const string& to, string* s) {
   *s = str_util::Join(split, to.c_str());
 }
 
-static void RenameInDocs(const string& from, const string& to, OpDef* op_def) {
-  const string from_quoted = strings::StrCat("`", from, "`");
-  const string to_quoted = strings::StrCat("`", to, "`");
-  for (int i = 0; i < op_def->input_arg_size(); ++i) {
-    if (!op_def->input_arg(i).description().empty()) {
-      StringReplace(from_quoted, to_quoted,
-                    op_def->mutable_input_arg(i)->mutable_description());
-    }
-  }
-  for (int i = 0; i < op_def->output_arg_size(); ++i) {
-    if (!op_def->output_arg(i).description().empty()) {
-      StringReplace(from_quoted, to_quoted,
-                    op_def->mutable_output_arg(i)->mutable_description());
-    }
-  }
-  for (int i = 0; i < op_def->attr_size(); ++i) {
-    if (!op_def->attr(i).description().empty()) {
-      StringReplace(from_quoted, to_quoted,
-                    op_def->mutable_attr(i)->mutable_description());
-    }
-  }
-  if (!op_def->summary().empty()) {
-    StringReplace(from_quoted, to_quoted, op_def->mutable_summary());
-  }
-  if (!op_def->description().empty()) {
-    StringReplace(from_quoted, to_quoted, op_def->mutable_description());
-  }
-}
-
 static void RenameInDocs(const string& from, const string& to,
                          ApiDef* api_def) {
   const string from_quoted = strings::StrCat("`", from, "`");
@@ -349,84 +296,6 @@ static void RenameInDocs(const string& from, const string& to,
   }
 }
 
-const OpGenOverride* OpGenOverrideMap::ApplyOverride(OpDef* op_def) const {
-  // Look up
-  const auto iter = map_.find(op_def->name());
-  if (iter == map_.end()) return nullptr;
-  const OpGenOverride& proto = *iter->second;
-
-  // Apply overrides from `proto`.
-  if (!proto.rename_to().empty()) {
-    op_def->set_name(proto.rename_to());
-    RenameInDocs(proto.name(), proto.rename_to(), op_def);
-  }
-  for (const auto& attr_default : proto.attr_default()) {
-    bool found = false;
-    for (int i = 0; i < op_def->attr_size(); ++i) {
-      if (op_def->attr(i).name() == attr_default.name()) {
-        *op_def->mutable_attr(i)->mutable_default_value() =
-            attr_default.value();
-        found = true;
-        break;
-      }
-    }
-    if (!found) {
-      LOG(WARNING) << proto.name() << " can't find attr " << attr_default.name()
-                   << " to override default";
-    }
-  }
-  for (const auto& attr_rename : proto.attr_rename()) {
-    bool found = false;
-    for (int i = 0; i < op_def->attr_size(); ++i) {
-      if (op_def->attr(i).name() == attr_rename.from()) {
-        *op_def->mutable_attr(i)->mutable_name() = attr_rename.to();
-        found = true;
-        break;
-      }
-    }
-    if (found) {
-      RenameInDocs(attr_rename.from(), attr_rename.to(), op_def);
-    } else {
-      LOG(WARNING) << proto.name() << " can't find attr " << attr_rename.from()
-                   << " to rename";
-    }
-  }
-  for (const auto& input_rename : proto.input_rename()) {
-    bool found = false;
-    for (int i = 0; i < op_def->input_arg_size(); ++i) {
-      if (op_def->input_arg(i).name() == input_rename.from()) {
-        *op_def->mutable_input_arg(i)->mutable_name() = input_rename.to();
-        found = true;
-        break;
-      }
-    }
-    if (found) {
-      RenameInDocs(input_rename.from(), input_rename.to(), op_def);
-    } else {
-      LOG(WARNING) << proto.name() << " can't find input "
-                   << input_rename.from() << " to rename";
-    }
-  }
-  for (const auto& output_rename : proto.output_rename()) {
-    bool found = false;
-    for (int i = 0; i < op_def->output_arg_size(); ++i) {
-      if (op_def->output_arg(i).name() == output_rename.from()) {
-        *op_def->mutable_output_arg(i)->mutable_name() = output_rename.to();
-        found = true;
-        break;
-      }
-    }
-    if (found) {
-      RenameInDocs(output_rename.from(), output_rename.to(), op_def);
-    } else {
-      LOG(WARNING) << proto.name() << " can't find output "
-                   << output_rename.from() << " to rename";
-    }
-  }
-
-  return &proto;
-}
-
 namespace {
 
 // Initializes given ApiDef with data in OpDef.
@@ -629,14 +498,11 @@ Status ApiDefMap::LoadApiDef(const string& api_def_file_contents) {
   ApiDefs api_defs;
   protobuf::TextFormat::ParseFromString(contents, &api_defs);
   for (const auto& api_def : api_defs.op()) {
-    // Check if the op definition is already loaded.
+    // Check if the op definition is loaded. If op definition is not
+    // loaded, then we just skip this ApiDef.
     if (map_.find(api_def.graph_op_name()) != map_.end()) {
       // Overwrite current api def with data in api_def.
       TF_RETURN_IF_ERROR(MergeApiDefs(&map_[api_def.graph_op_name()], api_def));
-    } else {
-      return errors::FailedPrecondition(
-          "Unexpected ApiDef override: ", api_def.graph_op_name(),
-          " is not defined in base ApiDef.");
     }
   }
   return Status::OK();
diff --git a/tensorflow/core/framework/op_gen_lib.h b/tensorflow/core/framework/op_gen_lib.h
index 1ede3af8d7cf8f591ba3927f7fc99d646629109d..ff38e4b22141a7f1b7212a516ec5adbd5c7aad79 100644
--- a/tensorflow/core/framework/op_gen_lib.h
+++ b/tensorflow/core/framework/op_gen_lib.h
@@ -28,7 +28,6 @@ namespace tensorflow {
 
 // Forward declare protos so their symbols can be removed from .so exports
 class OpDef;
-class OpGenOverride;
 
 inline string Spaces(int n) { return string(n, ' '); }
 
@@ -48,34 +47,6 @@ string PBTxtToMultiline(StringPiece pbtxt,
                         const std::vector<string>& multi_line_fields);
 string PBTxtFromMultiline(StringPiece multiline_pbtxt);
 
-// Takes a list of files with OpGenOverrides text protos, and allows you to
-// look up the specific override for any given op.
-class OpGenOverrideMap {
- public:
-  OpGenOverrideMap();
-  ~OpGenOverrideMap();
-
-  // `filenames` is a comma-separated list of file names.  If an op
-  // is mentioned in more than one file, the last one takes priority.
-  Status LoadFileList(Env* env, const string& filenames);
-
-  // Load a single file.  If more than one file is loaded, later ones
-  // take priority for any ops in common.
-  Status LoadFile(Env* env, const string& filename);
-
-  // Look up the override for `*op_def` from the loaded files, and
-  // mutate `*op_def` to reflect the requested changes. Does not apply
-  // 'skip', 'hide', or 'alias' overrides. Caller has to deal with
-  // those since they can't be simulated by mutating `*op_def`.
-  // Returns nullptr if op is not in any loaded file. Otherwise, the
-  // pointer must not be referenced beyond the lifetime of *this or
-  // the next file load.
-  const OpGenOverride* ApplyOverride(OpDef* op_def) const;
-
- private:
-  std::unordered_map<string, std::unique_ptr<OpGenOverride>> map_;
-};
-
 // Takes a list of files with ApiDefs text protos, and allows you to
 // look up the specific ApiDef for any given op.
 class ApiDefMap {
diff --git a/tensorflow/core/framework/op_gen_lib_test.cc b/tensorflow/core/framework/op_gen_lib_test.cc
index bbe57bdd622d4339b33d15e2e92252f60ffbbcf8..857b1c8dbcac66899f98bb4f2ef87f65f7442f6b 100644
--- a/tensorflow/core/framework/op_gen_lib_test.cc
+++ b/tensorflow/core/framework/op_gen_lib_test.cc
@@ -410,8 +410,8 @@ op {
 
   ApiDefMap api_map(op_list);
   TF_CHECK_OK(api_map.LoadApiDef(kTestApiDef));
-  auto status = api_map.LoadApiDef(api_def1);
-  ASSERT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code());
+  TF_CHECK_OK(api_map.LoadApiDef(api_def1));
+  ASSERT_EQ(nullptr, api_map.GetApiDef("different_testop"));
 }
 
 TEST(OpGenLibTest, ApiDefInvalidArgOrder) {
diff --git a/tensorflow/core/framework/op_gen_overrides.proto b/tensorflow/core/framework/op_gen_overrides.proto
deleted file mode 100644
index 8e66d39a7c7f4a9ff05c91f46a11446e18bc1aed..0000000000000000000000000000000000000000
--- a/tensorflow/core/framework/op_gen_overrides.proto
+++ /dev/null
@@ -1,67 +0,0 @@
-// Defines the text format for adding per-op overrides for client
-// language op code generators.
-
-syntax = "proto3";
-
-package tensorflow;
-import "tensorflow/core/framework/attr_value.proto";
-
-// Used to override the default API & behavior in the generated code
-// for client languages, from what you would get from the OpDef alone.
-// This is so we can evolve the API while remaining backwards
-// compatible when interpretting old graphs.  Overrides go in an
-// "op_gen_overrides.pbtxt" file with a text-format OpGenOverrides
-// message.  Right now these only apply to the C++ API.
-// TODO(josh11b): In the future there will be a common set of overrides
-// and per-client-language overrides.
-//
-// WARNING: Be *very* careful using these features -- these overrides
-// can change the semantics of existing code.  These changes may need
-// to wait until a major release of TensorFlow to avoid breaking our
-// compatibility promises.
-message OpGenOverride {
-  // Name of the op to apply overrides to.
-  string name = 1;
-
-  // Do not include this op in the generated API.
-  // If `skip` is true, all other overrides are ignored for this op.
-  bool skip = 2;
-
-  // Hide this op by putting it into an internal namespace (or whatever
-  // is appropriate in the target language).
-  bool hide = 3;
-
-  // Use a different name in the API than the op's name. Note that
-  // the op's name in `backticks` will also be replaced in the docs.
-  string rename_to = 4;
-
-  // Create *additional* API endpoints with different names (contrast
-  // with rename_to, which affects the original name).
-  repeated string alias = 5;
-
-  // Map the name of an attr to a new default value to use.  This
-  // default will be used when creating new graphs, as opposed to the
-  // default in the OpDef, which will be used when interpreting old
-  // GraphDefs.  If this attr is also renamed (using attr_rename
-  // below), use the original name of the attr.
-  message AttrDefault {
-    string name = 1;
-    AttrValue value = 2;
-  }
-  repeated AttrDefault attr_default = 6;
-
-  // Change the name used to access attrs/inputs/outputs in the API
-  // from what is used in the GraphDef.  Note that these names in
-  // `backticks` will also be replaced in the docs.
-  message Rename {
-    string from = 1;
-    string to = 2;
-  }
-  repeated Rename attr_rename = 7;
-  repeated Rename input_rename = 8;
-  repeated Rename output_rename = 9;
-}
-
-message OpGenOverrides {
-  repeated OpGenOverride op = 1;
-}
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 4d410809e77bd6ba7cd24f78c0ef2f97fa54e588..8654437059ca449432e6381b9eb3c4ba15e56f48 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
@@ -78,8 +79,14 @@ Status MatchSignatureHelper(const DataTypeSlice expected_inputs,
 
 // OpKernel ------------------------------------------------------------------
 
+// TODO(mrry): Convert to std::make_unique when available.
 OpKernel::OpKernel(OpKernelConstruction* context)
-    : def_(new NodeDef(context->def())),
+    : OpKernel(context,
+               std::unique_ptr<const NodeDef>(new NodeDef(context->def()))) {}
+
+OpKernel::OpKernel(OpKernelConstruction* context,
+                   std::unique_ptr<const NodeDef> node_def)
+    : def_(std::move(node_def)),
       input_types_(context->input_types().begin(),
                    context->input_types().end()),
       input_memory_types_(context->input_memory_types().begin(),
@@ -100,7 +107,8 @@ OpKernel::OpKernel(OpKernelConstruction* context)
 
   // Kernels executing on GPU/SYCL tie very few resources on the CPU where the
   // scheduler runs: we consider them as inexpensive.
-  expensive_ = context->device_type() != DeviceType(DEVICE_GPU) && context->device_type() != DeviceType(DEVICE_SYCL);
+  expensive_ = context->device_type() != DeviceType(DEVICE_GPU) &&
+               context->device_type() != DeviceType(DEVICE_SYCL);
 }
 
 OpKernel::~OpKernel() {}
@@ -112,7 +120,7 @@ const string& OpKernel::requested_input(int i) const { return def_->input(i); }
 
 Status OpKernel::InputRange(StringPiece input_name, int* start,
                             int* stop) const {
-  const auto result = input_name_map_.find(input_name.ToString());
+  const auto result = input_name_map_.find(input_name);
   if (result == input_name_map_.end()) {
     return errors::InvalidArgument("Unknown input name: ", input_name);
   } else {
@@ -124,7 +132,7 @@ Status OpKernel::InputRange(StringPiece input_name, int* start,
 
 Status OpKernel::OutputRange(StringPiece output_name, int* start,
                              int* stop) const {
-  const auto result = output_name_map_.find(output_name.ToString());
+  const auto result = output_name_map_.find(output_name);
   if (result == output_name_map_.end()) {
     return errors::InvalidArgument("Unknown output name: ", output_name);
   } else {
@@ -252,10 +260,8 @@ OpKernelContext::OpKernelContext(Params* params)
 OpKernelContext::OpKernelContext(Params* params, int num_outputs)
     : params_(params),
       outputs_(num_outputs),
-      host_temp_memory_size_(0),
-      device_temp_memory_size_(0),
-      host_persistent_memory_allocated_(0),
-      device_persistent_memory_allocated_(0) {
+      temp_memory_allocated_(0),
+      persistent_memory_allocated_(0) {
   Allocator* eigen_gpu_allocator = get_allocator(AllocatorAttributes());
   params_->ensure_eigen_gpu_device();
   params_->device->ReinitializeGpuDevice(this, params_->eigen_gpu_device,
@@ -468,7 +474,7 @@ std::unique_ptr<Tensor> OpKernelContext::forward_input(
     return nullptr;
   }
   // Check that input and output memory types match, i.e.
-  // that they either both live in host or both live in device memmory.
+  // that they either both live in host or both live in device memory.
   if (input_memory_type(input_index) != output_memory_type) {
     return nullptr;
   }
@@ -663,16 +669,11 @@ Status OpKernelContext::allocate_temp(
     const AllocationAttributes& allocation_attr) {
   Status s =
       allocate_tensor(type, shape, out_temp, allocator_attr, allocation_attr);
-  if (track_allocations() && out_temp->TotalBytes() > 0) {
+  if (track_allocations() && s.ok() && out_temp->TotalBytes() > 0) {
     Allocator* a = get_allocator(allocator_attr);
     if (a->TracksAllocationSizes()) {
-      int64 alloc_size =
-          a->AllocatedSize(const_cast<char*>(out_temp->tensor_data().data()));
-      if (allocate_on_host(allocator_attr)) {
-        record_host_temp_memory_size(alloc_size);
-      } else {
-        record_device_temp_memory_size(alloc_size);
-      }
+      int64 alloc_size = a->AllocatedSize(out_temp->tensor_data().data());
+      record_temp_memory_allocation(alloc_size, *out_temp);
     }
   }
   return s;
@@ -690,6 +691,15 @@ Status OpKernelContext::allocate_persistent(DataType type,
     if (out_tensor) {
       *out_tensor = out_persistent->AccessTensor(this);
     }
+    if (track_allocations()) {
+      Tensor* t = out_persistent->AccessTensor(this);
+      Allocator* a = get_allocator(attr);
+      if (a->TracksAllocationSizes()) {
+        int64 alloc_size = a->AllocatedSize(t->tensor_data().data());
+        int64 alloc_id = a->AllocationId(t->tensor_data().data());
+        record_persistent_memory_allocation(alloc_size, alloc_id);
+      }
+    }
   }
   return s;
 }
@@ -714,6 +724,22 @@ void OpKernelContext::set_output(int index, const Tensor& tensor) {
   DCHECK_EQ(mutable_output(index), nullptr);
   record_tensor_reference(tensor);
   outputs_[index] = TensorValue(new Tensor(tensor));
+  if (track_allocations() && tensor.TotalBytes() > 0) {
+    mutex_lock l(stats_mu_);
+    if (!temp_tensor_buffer_and_size_) {
+      return;
+    }
+    auto it = std::find_if(temp_tensor_buffer_and_size_->begin(),
+                           temp_tensor_buffer_and_size_->end(),
+                           [&tensor](const std::pair<const void*, int64>& e) {
+                             return e.first == static_cast<const void*>(
+                                                   tensor.tensor_data().data());
+                           });
+    if (it != temp_tensor_buffer_and_size_->end()) {
+      temp_memory_allocated_ -= it->second;
+      temp_tensor_buffer_and_size_->erase(it);
+    }
+  }
 }
 
 void OpKernelContext::set_output_ref(int index, mutex* mu,
@@ -791,30 +817,60 @@ Status OpKernelContext::MatchSignature(const DataTypeSlice expected_inputs,
                               outputs);
 }
 
-bool OpKernelContext::allocate_on_host(AllocatorAttributes alloc_attr) const {
-  return alloc_attr.on_host() || device()->attributes().device_type() == "CPU";
+void OpKernelContext::record_temp_memory_allocation(int64 size,
+                                                    const Tensor& t) {
+  mutex_lock l(stats_mu_);
+  temp_memory_allocated_ += size;
+  if (!temp_tensor_buffer_and_size_) {
+    temp_tensor_buffer_and_size_.reset(
+        new gtl::InlinedVector<std::pair<const void*, int64>, 2>());
+  }
+  temp_tensor_buffer_and_size_->emplace_back(
+      static_cast<const void*>(t.tensor_data().data()), size);
 }
 
-void OpKernelContext::record_host_persistent_memory_allocation(int64 size,
-                                                               int64 alloc_id) {
-  host_persistent_memory_allocated_ += size;
-  host_persistent_alloc_ids_.push_back(alloc_id);
+int64 OpKernelContext::temp_memory_allocated() const {
+  mutex_lock l(stats_mu_);
+  return temp_memory_allocated_;
 }
 
-void OpKernelContext::record_device_persistent_memory_allocation(
-    int64 size, int64 alloc_id) {
-  device_persistent_memory_allocated_ += size;
-  device_persistent_alloc_ids_.push_back(alloc_id);
+void OpKernelContext::record_persistent_memory_allocation(int64 size,
+                                                          int64 alloc_id) {
+  mutex_lock l(stats_mu_);
+  persistent_memory_allocated_ += size;
+  if (alloc_id >= 0) {
+    if (!persistent_alloc_ids_) {
+      persistent_alloc_ids_.reset(new gtl::InlinedVector<int64, 2>());
+    }
+    persistent_alloc_ids_->push_back(alloc_id);
+  }
+}
+
+int64 OpKernelContext::persistent_memory_allocated() const {
+  mutex_lock l(stats_mu_);
+  return persistent_memory_allocated_;
 }
 
-std::vector<int64> OpKernelContext::host_persistent_alloc_ids() const {
-  return std::vector<int64>(host_persistent_alloc_ids_.begin(),
-                            host_persistent_alloc_ids_.end());
+std::vector<int64> OpKernelContext::persistent_alloc_ids() const {
+  mutex_lock l(stats_mu_);
+  if (persistent_alloc_ids_) {
+    return std::vector<int64>(persistent_alloc_ids_->begin(),
+                              persistent_alloc_ids_->end());
+  } else {
+    return std::vector<int64>();
+  }
 }
 
-std::vector<int64> OpKernelContext::device_persistent_alloc_ids() const {
-  return std::vector<int64>(device_persistent_alloc_ids_.begin(),
-                            device_persistent_alloc_ids_.end());
+void OpKernelContext::clear_recorded_memory() {
+  mutex_lock l(stats_mu_);
+  temp_memory_allocated_ = 0;
+  persistent_memory_allocated_ = 0;
+  if (temp_tensor_buffer_and_size_) {
+    temp_tensor_buffer_and_size_->clear();
+  }
+  if (persistent_alloc_ids_) {
+    persistent_alloc_ids_->clear();
+  }
 }
 
 // OpKernel registration ------------------------------------------------------
@@ -959,13 +1015,6 @@ Status FindKernelRegistration(const DeviceType& device_type,
   return Status::OK();
 }
 
-Status FindKernelRegistration(const DeviceType& device_type, const Node& node,
-                              const KernelRegistration** reg,
-                              bool* was_attr_mismatch) {
-  return FindKernelRegistration(device_type, node.def(), reg,
-                                was_attr_mismatch);
-}
-
 }  // namespace
 
 // TODO(irving): Change const NodeDef& to const Node&
@@ -1179,24 +1228,51 @@ const Eigen::SyclDevice& OpKernelContext::eigen_device() const {
 }
 #endif
 
-void OpKernelConstruction::CtxFailure(Status s) {
+void OpKernelConstruction::CtxFailure(const Status& s) {
   VLOG(1) << s;
   SetStatus(s);
 }
 
-void OpKernelConstruction::CtxFailureWithWarning(Status s) {
+void OpKernelConstruction::CtxFailureWithWarning(const Status& s) {
   LOG(WARNING) << s;
   SetStatus(s);
 }
 
-void OpKernelContext::CtxFailure(Status s) {
+void OpKernelConstruction::CtxFailure(const char* file, int line,
+                                      const Status& s) {
+  VLOG(1) << "OP_REQUIRES failed at " << io::Basename(file) << ":" << line
+          << " : " << s;
+  SetStatus(s);
+}
+
+void OpKernelConstruction::CtxFailureWithWarning(const char* file, int line,
+                                                 const Status& s) {
+  LOG(WARNING) << "OP_REQUIRES failed at " << io::Basename(file) << ":" << line
+               << " : " << s;
+  SetStatus(s);
+}
+
+void OpKernelContext::CtxFailure(const Status& s) {
   VLOG(1) << s;
   SetStatus(s);
 }
 
-void OpKernelContext::CtxFailureWithWarning(Status s) {
+void OpKernelContext::CtxFailureWithWarning(const Status& s) {
   LOG(WARNING) << s;
   SetStatus(s);
 }
 
+void OpKernelContext::CtxFailure(const char* file, int line, const Status& s) {
+  VLOG(1) << "OP_REQUIRES failed at " << io::Basename(file) << ":" << line
+          << " : " << s;
+  SetStatus(s);
+}
+
+void OpKernelContext::CtxFailureWithWarning(const char* file, int line,
+                                            const Status& s) {
+  LOG(WARNING) << "OP_REQUIRES failed at " << io::Basename(file) << ":" << line
+               << " : " << s;
+  SetStatus(s);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index da0dc549435a35cb1dec25b9e8e5ddbea7b904b3..5ccd45efc980393aa02582595dde873be7426e26 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -61,7 +61,7 @@ class TensorSliceReaderCacheWrapper;
 }  // namespace checkpoint
 
 class AsyncOpKernel;
-class FunctionCallFrame;
+class CallFrameInterface;
 class FunctionLibraryRuntime;
 class OpKernelConstruction;  // declared below
 class OpKernelContext;       // declared below
@@ -75,6 +75,14 @@ class OpKernel {
   // OpKernel won't be instantiated by the scheduler, so you may perform
   // expensive initialization in the descendant's constructor.
   explicit OpKernel(OpKernelConstruction* context);
+
+  // Specialized constructor that enables the descendant to provide a different
+  // `NodeDef` value. For example, this constructor can be used to provide a
+  // stripped-down `NodeDef` that does not contain the full set of attrs (such
+  // as tensor values) if the descendant stores them in a different form.
+  explicit OpKernel(OpKernelConstruction* context,
+                    std::unique_ptr<const NodeDef> node_def);
+
   virtual ~OpKernel();
 
   // An OpKernel's computation can be either synchronous or
@@ -316,8 +324,10 @@ class OpKernelConstruction {
   int graph_def_version() const { return graph_def_version_; }
 
   // Helper routines for the OP_REQUIRES macros
-  void CtxFailure(Status s);
-  void CtxFailureWithWarning(Status s);
+  void CtxFailure(const Status& s);
+  void CtxFailureWithWarning(const Status& s);
+  void CtxFailure(const char* file, int line, const Status& s);
+  void CtxFailureWithWarning(const char* file, int line, const Status& s);
 
   // Unrecommended functions: these are functions that have some
   // current uses but are not recommended for use, and may go away at
@@ -548,7 +558,7 @@ class OpKernelContext {
     FrameAndIter frame_iter;
 
     // Function call supports.
-    FunctionCallFrame* call_frame = nullptr;
+    CallFrameInterface* call_frame = nullptr;
     FunctionLibraryRuntime* function_library = nullptr;
     std::function<void(std::function<void()>)>* runner = nullptr;
     StepStatsCollector* stats_collector = nullptr;
@@ -899,9 +909,13 @@ class OpKernelContext {
   }
 
   AllocatorAttributes input_alloc_attr(int index) const {
-    DCHECK_GE(index, 0);
-    DCHECK_LT(index, params_->input_alloc_attrs->size());
-    return (*params_->input_alloc_attrs)[index];
+    if (params_->input_alloc_attrs == nullptr) {
+      return AllocatorAttributes();
+    } else {
+      DCHECK_GE(index, 0);
+      DCHECK_LT(index, params_->input_alloc_attrs->size());
+      return (*params_->input_alloc_attrs)[index];
+    }
   }
 
   AllocatorAttributes output_alloc_attr(int index) const {
@@ -930,7 +944,7 @@ class OpKernelContext {
   //
   // If this kernel invocation is within a function execution,
   // call_frame() returns the call frame for the function call.
-  FunctionCallFrame* call_frame() const { return params_->call_frame; }
+  CallFrameInterface* call_frame() const { return params_->call_frame; }
 
   // If not nullptr, the kernel invoke functions defined in the
   // library. E.g., CHECK_NOTNULL(function_library())->Run("Foo", ...).
@@ -1014,8 +1028,10 @@ class OpKernelContext {
   }
 
   // Helper routines for the OP_REQUIRES macros
-  void CtxFailure(Status s);
-  void CtxFailureWithWarning(Status s);
+  void CtxFailure(const Status& s);
+  void CtxFailureWithWarning(const Status& s);
+  void CtxFailure(const char* file, int line, const Status& s);
+  void CtxFailureWithWarning(const char* file, int line, const Status& s);
 
   // Unrecommended functions: these are functions that have some
   // current uses but are not recommended for use, and may go away at
@@ -1030,36 +1046,27 @@ class OpKernelContext {
   TensorValue release_output(int index);
 
   bool track_allocations() const { return params_->track_allocations; }
-  bool allocate_on_host(AllocatorAttributes alloc_attr) const;
 
-  // Records temporary memory sizes.
-  void record_host_temp_memory_size(int64 size) {
-    host_temp_memory_size_ += size;
-  }
-  void record_device_temp_memory_size(int64 size) {
-    device_temp_memory_size_ += size;
-  }
+  // Records temp memory allocation. Tensor object is recorded to identify the
+  // case where temp memory is used as output memory.
+  void record_temp_memory_allocation(int64 size, const Tensor& t)
+      LOCKS_EXCLUDED(stats_mu_);
 
   // Returns recorded size of temporary memory;
-  int64 host_temp_memory_size() const { return host_temp_memory_size_; }
-  int64 device_temp_memory_size() const { return device_temp_memory_size_; }
+  int64 temp_memory_allocated() const LOCKS_EXCLUDED(stats_mu_);
 
   // Records persistent memory allocation, size can be negative indicating
   // deallocation.
-  void record_host_persistent_memory_allocation(int64 size,
-                                                int64 alloc_id = -1);
-  void record_device_persistent_memory_allocation(int64 size,
-                                                  int64 alloc_id = -1);
+  void record_persistent_memory_allocation(int64 size, int64 alloc_id = -1)
+      LOCKS_EXCLUDED(stats_mu_);
 
   // Returns recorded size and ids of persistent memory.
-  int64 host_persistent_memory_allocated() const {
-    return host_persistent_memory_allocated_;
-  }
-  int64 device_persistent_memory_allocated() const {
-    return device_persistent_memory_allocated_;
-  }
-  std::vector<int64> host_persistent_alloc_ids() const;
-  std::vector<int64> device_persistent_alloc_ids() const;
+  int64 persistent_memory_allocated() const LOCKS_EXCLUDED(stats_mu_);
+
+  std::vector<int64> persistent_alloc_ids() const LOCKS_EXCLUDED(stats_mu_);
+
+  // Resets counters for temp and persistent memory and recorded ids.
+  void clear_recorded_memory() LOCKS_EXCLUDED(stats_mu_);
 
   bool input_is_ref(int index) const;
 
@@ -1104,12 +1111,15 @@ class OpKernelContext {
 
   bool is_output_dead_ = false;
 
-  int64 host_temp_memory_size_;
-  int64 device_temp_memory_size_;
-  gtl::InlinedVector<int64, 2> host_persistent_alloc_ids_;
-  gtl::InlinedVector<int64, 2> device_persistent_alloc_ids_;
-  int64 host_persistent_memory_allocated_;
-  int64 device_persistent_memory_allocated_;
+  // The following data members are only used when allocation tracking is
+  // enabled.
+  mutable mutex stats_mu_;
+  int64 temp_memory_allocated_ GUARDED_BY(stats_mu_);
+  int64 persistent_memory_allocated_ GUARDED_BY(stats_mu_);
+  std::unique_ptr<gtl::InlinedVector<std::pair<const void*, int64>, 2>>
+      temp_tensor_buffer_and_size_ GUARDED_BY(stats_mu_);
+  std::unique_ptr<gtl::InlinedVector<int64, 2>> persistent_alloc_ids_
+      GUARDED_BY(stats_mu_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(OpKernelContext);
 };
@@ -1491,36 +1501,40 @@ inline void OpOutputList::set_ref(int i, mutex* mu, Tensor* tensor_for_ref) {
 //   ...
 // }
 
-#define OP_REQUIRES(CTX, EXP, STATUS) \
-  if (!TF_PREDICT_TRUE(EXP)) {        \
-    (CTX)->CtxFailure((STATUS));      \
-    return;                           \
-  }
+#define OP_REQUIRES(CTX, EXP, STATUS)                  \
+  do {                                                 \
+    if (!TF_PREDICT_TRUE(EXP)) {                       \
+      (CTX)->CtxFailure(__FILE__, __LINE__, (STATUS)); \
+      return;                                          \
+    }                                                  \
+  } while (0)
 
-#define OP_REQUIRES_OK(CTX, ...)          \
-  do {                                    \
-    ::tensorflow::Status _s(__VA_ARGS__); \
-    if (!TF_PREDICT_TRUE(_s.ok())) {      \
-      (CTX)->CtxFailureWithWarning(_s);   \
-      return;                             \
-    }                                     \
+#define OP_REQUIRES_OK(CTX, ...)                            \
+  do {                                                      \
+    ::tensorflow::Status _s(__VA_ARGS__);                   \
+    if (!TF_PREDICT_TRUE(_s.ok())) {                        \
+      (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \
+      return;                                               \
+    }                                                       \
   } while (0)
 
-#define OP_REQUIRES_ASYNC(CTX, EXP, STATUS, CALLBACK) \
-  if (!TF_PREDICT_TRUE(EXP)) {                        \
-    (CTX)->CtxFailure((STATUS));                      \
-    (CALLBACK)();                                     \
-    return;                                           \
-  }
+#define OP_REQUIRES_ASYNC(CTX, EXP, STATUS, CALLBACK)  \
+  do {                                                 \
+    if (!TF_PREDICT_TRUE(EXP)) {                       \
+      (CTX)->CtxFailure(__FILE__, __LINE__, (STATUS)); \
+      (CALLBACK)();                                    \
+      return;                                          \
+    }                                                  \
+  } while (0)
 
-#define OP_REQUIRES_OK_ASYNC(CTX, STATUS, CALLBACK) \
-  do {                                              \
-    ::tensorflow::Status _s(STATUS);                \
-    if (!TF_PREDICT_TRUE(_s.ok())) {                \
-      (CTX)->CtxFailureWithWarning(_s);             \
-      (CALLBACK)();                                 \
-      return;                                       \
-    }                                               \
+#define OP_REQUIRES_OK_ASYNC(CTX, STATUS, CALLBACK)         \
+  do {                                                      \
+    ::tensorflow::Status _s(STATUS);                        \
+    if (!TF_PREDICT_TRUE(_s.ok())) {                        \
+      (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \
+      (CALLBACK)();                                         \
+      return;                                               \
+    }                                                       \
   } while (0)
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/op_kernel_test.cc b/tensorflow/core/framework/op_kernel_test.cc
index 47523358bed40898cf82c531dc1a89fea0de88a3..b53b877f28d2c80e969fb418aa316ad96c6e2eaa 100644
--- a/tensorflow/core/framework/op_kernel_test.cc
+++ b/tensorflow/core/framework/op_kernel_test.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/public/version.h"
 
 class DummyKernel : public tensorflow::OpKernel {
@@ -509,10 +510,9 @@ TEST_F(OpKernelBuilderTest, BuilderBoth) {
 }
 
 REGISTER_OP("BuildTypeAttr").Attr("T: type");
-REGISTER_KERNEL_BUILDER(Name("BuildTypeAttr")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<float>("T"),
-                        DummyKernel);
+REGISTER_KERNEL_BUILDER(
+    Name("BuildTypeAttr").Device(DEVICE_CPU).TypeConstraint<float>("T"),
+    DummyKernel);
 
 TEST_F(OpKernelBuilderTest, BuilderTypeAttr) {
   ExpectSuccess("BuildTypeAttr", DEVICE_CPU, {"T|type|DT_FLOAT"});
@@ -524,10 +524,9 @@ TEST_F(OpKernelBuilderTest, BuilderTypeAttr) {
 }
 
 REGISTER_OP("BuildTypeListAttr").Attr("T: list(type)");
-REGISTER_KERNEL_BUILDER(Name("BuildTypeListAttr")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<bool>("T"),
-                        DummyKernel);
+REGISTER_KERNEL_BUILDER(
+    Name("BuildTypeListAttr").Device(DEVICE_CPU).TypeConstraint<bool>("T"),
+    DummyKernel);
 
 TEST_F(OpKernelBuilderTest, BuilderTypeListAttr) {
   ExpectSuccess("BuildTypeListAttr", DEVICE_CPU, {"T|list(type)|[]"});
@@ -573,14 +572,12 @@ TEST_F(OpKernelBuilderTest, DuplicateKernel) {
 }
 
 REGISTER_OP("DuplicateKernelForT").Attr("T: type");
-REGISTER_KERNEL_BUILDER(Name("DuplicateKernelForT")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<float>("T"),
-                        DummyKernel);
-REGISTER_KERNEL_BUILDER(Name("DuplicateKernelForT")
-                            .Device(DEVICE_CPU)
-                            .TypeConstraint<float>("T"),
-                        DummyKernel);
+REGISTER_KERNEL_BUILDER(
+    Name("DuplicateKernelForT").Device(DEVICE_CPU).TypeConstraint<float>("T"),
+    DummyKernel);
+REGISTER_KERNEL_BUILDER(
+    Name("DuplicateKernelForT").Device(DEVICE_CPU).TypeConstraint<float>("T"),
+    DummyKernel);
 
 TEST_F(OpKernelBuilderTest, DuplicateKernelForT) {
   const NodeDef ndef =
@@ -898,5 +895,73 @@ TEST_F(LabelTest, Duplicate) {
                 error::INVALID_ARGUMENT);
 }
 
+void BM_InputRangeHelper(int iters, const NodeDef& node_def,
+                         const char* input_name, int expected_start,
+                         int expected_stop) {
+  Status status;
+  std::unique_ptr<DummyDevice> device(new DummyDevice(Env::Default(), false));
+
+  std::unique_ptr<OpKernel> op(CreateOpKernel(DEVICE_CPU, device.get(),
+                                              cpu_allocator(), node_def,
+                                              TF_GRAPH_DEF_VERSION, &status));
+  TF_CHECK_OK(status);
+
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    int start;
+    int stop;
+    TF_CHECK_OK(op->InputRange(input_name, &start, &stop));
+    EXPECT_EQ(expected_start, start);
+    EXPECT_EQ(expected_stop, stop);
+  }
+  testing::StopTiming();
+}
+
+REGISTER_KERNEL_BUILDER(Name("ConcatV2").Device(DEVICE_CPU), DummyKernel);
+REGISTER_KERNEL_BUILDER(Name("Select").Device(DEVICE_CPU), DummyKernel);
+
+void BM_ConcatInputRange(int iters) {
+  testing::StopTiming();
+
+  // Create a ConcatV2 NodeDef with 4 inputs (plus the axis).
+  NodeDef node_def;
+  node_def.set_name("concat-op");
+  node_def.set_op("ConcatV2");
+  AttrValue attr_N;
+  attr_N.set_i(4);
+  AttrValue attr_T;
+  attr_T.set_type(DT_FLOAT);
+  AttrValue attr_Tidx;
+  attr_Tidx.set_type(DT_INT32);
+  node_def.mutable_attr()->insert({"N", attr_N});
+  node_def.mutable_attr()->insert({"T", attr_T});
+  node_def.mutable_attr()->insert({"Tidx", attr_Tidx});
+  for (size_t i = 0; i < 5; ++i) {
+    node_def.add_input(strings::StrCat("a:", i));
+  }
+
+  BM_InputRangeHelper(iters, node_def, "values", 0, 4);
+}
+
+void BM_SelectInputRange(int iters) {
+  testing::StopTiming();
+
+  // Create a Select NodeDef with 3 inputs.
+  NodeDef node_def;
+  node_def.set_name("select-op");
+  node_def.set_op("Select");
+  AttrValue attr_T;
+  attr_T.set_type(DT_FLOAT);
+  node_def.mutable_attr()->insert({"T", attr_T});
+  for (size_t i = 0; i < 3; ++i) {
+    node_def.add_input(strings::StrCat("a:", i));
+  }
+
+  BM_InputRangeHelper(iters, node_def, "condition", 0, 1);
+}
+
+BENCHMARK(BM_ConcatInputRange);
+BENCHMARK(BM_SelectInputRange);
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/reader_base.cc b/tensorflow/core/framework/reader_base.cc
index b8c771a0a1955b29f78478f60972b22d804351b2..f84ef0f953cf23e3fb2af210706586f95cfbb8ad 100644
--- a/tensorflow/core/framework/reader_base.cc
+++ b/tensorflow/core/framework/reader_base.cc
@@ -178,9 +178,9 @@ void ReaderBase::Read(QueueInterface* queue, string* key, string* value,
           " must set *at_end=true, *produced=true, or return an error.");
     }
     if (!status.ok() && produced) {
-      status = errors::Internal("ReadLocked() for ", name(),
-                                " set *produced=true *and* returned an error: ",
-                                status.ToString());
+      status = errors::Internal(
+          "ReadLocked() for ", name(),
+          " set *produced=true *and* returned an error: ", status.ToString());
     }
     if (status.ok() && at_end) {
       status = OnWorkFinishedLocked();
diff --git a/tensorflow/core/framework/register_types.h b/tensorflow/core/framework/register_types.h
index 4bb37e4f6ede54b96f34963890b56ae8774edced..e90596980f840588768c7883031f1ad179628833 100644
--- a/tensorflow/core/framework/register_types.h
+++ b/tensorflow/core/framework/register_types.h
@@ -52,7 +52,8 @@ limitations under the License.
    #undef REGISTER_PARTITION
 */
 
-#if !defined(IS_MOBILE_PLATFORM) || defined(SUPPORT_SELECTIVE_REGISTRATION)
+#if !defined(IS_MOBILE_PLATFORM) || defined(SUPPORT_SELECTIVE_REGISTRATION) || \
+    defined(ANDROID_TEGRA)
 
 // All types are supported, so all macros are invoked.
 //
@@ -155,11 +156,16 @@ limitations under the License.
       TF_CALL_uint8(m) TF_CALL_int8(m)
 
 #define TF_CALL_REAL_NUMBER_TYPES(m) \
+  TF_CALL_INTEGRAL_TYPES(m)          \
+  TF_CALL_half(m) TF_CALL_bfloat16(m) TF_CALL_float(m) TF_CALL_double(m)
+
+#define TF_CALL_REAL_NUMBER_TYPES_NO_BFLOAT16(m) \
   TF_CALL_INTEGRAL_TYPES(m) TF_CALL_half(m) TF_CALL_float(m) TF_CALL_double(m)
 
-#define TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m)                         \
-  TF_CALL_half(m) TF_CALL_float(m) TF_CALL_double(m) TF_CALL_int64(m) \
-      TF_CALL_uint16(m) TF_CALL_int16(m) TF_CALL_uint8(m) TF_CALL_int8(m)
+#define TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m)                              \
+  TF_CALL_half(m) TF_CALL_bfloat16(m) TF_CALL_float(m) TF_CALL_double(m)   \
+      TF_CALL_int64(m) TF_CALL_uint16(m) TF_CALL_int16(m) TF_CALL_uint8(m) \
+          TF_CALL_int8(m)
 
 // Call "m" for all number types, including complex64 and complex128.
 #define TF_CALL_NUMBER_TYPES(m) \
@@ -173,7 +179,7 @@ limitations under the License.
 
 // Call "m" on all types.
 #define TF_CALL_ALL_TYPES(m) \
-  TF_CALL_POD_TYPES(m) TF_CALL_string(m) TF_CALL_resource(m)
+  TF_CALL_POD_TYPES(m) TF_CALL_string(m) TF_CALL_resource(m) TF_CALL_variant(m)
 
 // Call "m" on POD and string types.
 #define TF_CALL_POD_STRING_TYPES(m) TF_CALL_POD_TYPES(m) TF_CALL_string(m)
@@ -194,18 +200,23 @@ limitations under the License.
 #define TF_CALL_QUANTIZED_TYPES(m) \
   TF_CALL_qint8(m) TF_CALL_quint8(m) TF_CALL_qint32(m)
 
+// Types used for save and restore ops.
+#define TF_CALL_SAVE_RESTORE_TYPES(m)                                     \
+  TF_CALL_INTEGRAL_TYPES(m)                                               \
+  TF_CALL_half(m) TF_CALL_float(m) TF_CALL_double(m) TF_CALL_complex64(m) \
+      TF_CALL_complex128(m) TF_CALL_bool(m) TF_CALL_string(m)             \
+          TF_CALL_QUANTIZED_TYPES(m)
+
 #ifdef TENSORFLOW_SYCL_NO_DOUBLE
 #define TF_CALL_SYCL_double(m)
 #else  // TENSORFLOW_SYCL_NO_DOUBLE
 #define TF_CALL_SYCL_double(m) TF_CALL_double(m)
-#endif // TENSORFLOW_SYCL_NO_DOUBLE
+#endif  // TENSORFLOW_SYCL_NO_DOUBLE
 
 #ifdef __ANDROID_TYPES_SLIM__
-#define TF_CALL_SYCL_NUMBER_TYPES(m)  TF_CALL_float(m)
+#define TF_CALL_SYCL_NUMBER_TYPES(m) TF_CALL_float(m)
 #else  // __ANDROID_TYPES_SLIM__
-#define TF_CALL_SYCL_NUMBER_TYPES(m)    \
-    TF_CALL_float(m)                    \
-    TF_CALL_SYCL_double(m)
-#endif // __ANDROID_TYPES_SLIM__
+#define TF_CALL_SYCL_NUMBER_TYPES(m) TF_CALL_float(m) TF_CALL_SYCL_double(m)
+#endif  // __ANDROID_TYPES_SLIM__
 
 #endif  // TENSORFLOW_FRAMEWORK_REGISTER_TYPES_H_
diff --git a/tensorflow/core/framework/register_types_traits.h b/tensorflow/core/framework/register_types_traits.h
index c1fe5517c6986838a07f67c0f2fa5474f89ffa33..ab35c2f0951d21e63fe06e378461c019e45495f1 100644
--- a/tensorflow/core/framework/register_types_traits.h
+++ b/tensorflow/core/framework/register_types_traits.h
@@ -23,7 +23,7 @@ typedef Eigen::GpuDevice GPUDevice;
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/platform/types.h"
@@ -79,7 +79,7 @@ template <>
 struct proxy_type_pod<SYCLDevice, 4> {
   typedef float type;
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 /// If POD we use proxy_type_pod, otherwise this maps to identiy.
 template <typename Device, typename T>
@@ -99,7 +99,7 @@ struct proxy_type {
 #ifdef TENSORFLOW_USE_SYCL
 #define TF_CALL_SYCL_PROXY_TYPES(m) \
   TF_CALL_double(m) TF_CALL_float(m) TF_CALL_int32(m)
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_FRAMEWORK_REGISTER_TYPES_TRAITS_H_
diff --git a/tensorflow/core/framework/rendezvous_test.cc b/tensorflow/core/framework/rendezvous_test.cc
index 32b8ad784d5228a40a073d166f33972def380280..de148f0bd3474421c1361cf7ae4aa681107aa883 100644
--- a/tensorflow/core/framework/rendezvous_test.cc
+++ b/tensorflow/core/framework/rendezvous_test.cc
@@ -69,9 +69,7 @@ class LocalRendezvousTest : public ::testing::Test {
     rendez_ = NewLocalRendezvous();
   }
 
-  ~LocalRendezvousTest() override {
-    rendez_->Unref();
-  }
+  ~LocalRendezvousTest() override { rendez_->Unref(); }
 
   void SchedClosure(std::function<void()> fn) {
     threads_.Schedule(std::move(fn));
@@ -99,8 +97,8 @@ string V(const Tensor& tensor) {
 
 Rendezvous::ParsedKey MakeKey(const string& name) {
   string s = Rendezvous::CreateKey("/job:mnist/replica:1/task:2/CPU:0", 7890,
-                                   "/job:mnist/replica:1/task:2/device:GPU:0", name,
-                                   FrameAndIter(0, 0));
+                                   "/job:mnist/replica:1/task:2/device:GPU:0",
+                                   name, FrameAndIter(0, 0));
   Rendezvous::ParsedKey k;
   TF_EXPECT_OK(Rendezvous::ParseKey(s, &k));
   return k;
diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index ee9192d4a1475776a892f4da9703c6e7a38f9844..641681973a1004f15163217684001c96592731d8 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -168,7 +168,7 @@ Status InferenceContext::Run(
 
 Status InferenceContext::set_output(StringPiece output_name,
                                     const std::vector<ShapeHandle>& shapes) {
-  const auto result = output_name_map_.find(output_name.ToString());
+  auto result = output_name_map_.find(output_name);
   if (result == output_name_map_.end()) {
     return errors::InvalidArgument("Unknown output name: ", output_name);
   } else {
@@ -187,7 +187,7 @@ Status InferenceContext::set_output(StringPiece output_name,
 
 Status InferenceContext::input(StringPiece input_name,
                                std::vector<ShapeHandle>* output) const {
-  const auto result = input_name_map_.find(input_name.ToString());
+  const auto result = input_name_map_.find(input_name);
   if (result == input_name_map_.end()) {
     return errors::InvalidArgument("Unknown input name: ", input_name);
   } else {
@@ -201,7 +201,7 @@ Status InferenceContext::input(StringPiece input_name,
 
 Status InferenceContext::output(StringPiece output_name,
                                 std::vector<ShapeHandle>* output) const {
-  const auto result = output_name_map_.find(output_name.ToString());
+  const auto result = output_name_map_.find(output_name);
   if (result == output_name_map_.end()) {
     return errors::InvalidArgument("Unknown output name: ", output_name);
   } else {
@@ -342,8 +342,8 @@ Status InferenceContext::WithRank(ShapeHandle shape, int64 rank,
     for (int i = 0; i < rank; ++i) {
       dims.push_back(UnknownDim());
     }
-    *out = shape_manager_.MakeShape(dims);
-    return Status::OK();
+    ShapeHandle shp = shape_manager_.MakeShape(dims);
+    return Merge(shape, shp, out);
   }
   *out = nullptr;
 
@@ -357,13 +357,10 @@ Status InferenceContext::WithRankAtLeast(ShapeHandle shape, int64 rank,
     return errors::InvalidArgument("Rank cannot exceed kint32max");
   }
   const int32 existing = Rank(shape);
-  if (existing >= rank) {
+  if (existing >= rank || existing == kUnknownRank) {
     *out = shape;
     return Status::OK();
   }
-  if (existing == kUnknownRank) {
-    return ReturnUnknownShape(out);
-  }
   *out = nullptr;
   return errors::InvalidArgument("Shape must be at least rank ", rank,
                                  " but is rank ", existing);
@@ -375,10 +372,7 @@ Status InferenceContext::WithRankAtMost(ShapeHandle shape, int64 rank,
     return errors::InvalidArgument("Rank cannot exceed kint32max");
   }
   const int32 existing = Rank(shape);
-  if (existing == kUnknownRank) {
-    return ReturnUnknownShape(out);
-  }
-  if (existing <= rank) {
+  if (existing <= rank || existing == kUnknownRank) {
     *out = shape;
     return Status::OK();
   }
@@ -395,8 +389,8 @@ Status InferenceContext::WithValue(DimensionHandle dim, int64 value,
     return Status::OK();
   }
   if (existing == kUnknownDim) {
-    *out = MakeDim(value);
-    return Status::OK();
+    DimensionHandle d = MakeDim(value);
+    return Merge(dim, d, out);
   }
   *out = nullptr;
   return errors::InvalidArgument("Dimension must be ", value, " but is ",
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index 4a4ef12635f867fccb594d50a2c9e8f3059ce337..e3cc848a169bd848b8f3617d552938ba1ced3663 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_H_
+#define TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_H_
 
 #include <vector>
 
@@ -32,7 +32,7 @@ class ShapeRefinerTest;
 namespace grappler {
 class GraphProperties;
 class SymbolicShapeManager;
-}
+}  // namespace grappler
 
 namespace shape_inference {
 
@@ -787,4 +787,4 @@ Status InferenceContext::GetAttr(StringPiece attr_name, T* value) const {
 }  // namespace shape_inference
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_H_
diff --git a/tensorflow/core/framework/shape_inference_test.cc b/tensorflow/core/framework/shape_inference_test.cc
index 68156e63ca77b9506b7549f9eb7c1fc302eee89a..f48a7b9c47df3cfa93434ccf585dda8c5a29a2ba 100644
--- a/tensorflow/core/framework/shape_inference_test.cc
+++ b/tensorflow/core/framework/shape_inference_test.cc
@@ -359,11 +359,11 @@ TEST_F(ShapeInferenceTest, WithRankAtMost) {
   // WithRankAtMost on a shape with unknown dimensionality always succeeds.
   EXPECT_TRUE(c.WithRankAtMost(in0, 1, &s1).ok());
   EXPECT_EQ("?", c.DebugString(s1));
-  EXPECT_FALSE(SameHandle(in0, s1));
+  EXPECT_TRUE(SameHandle(in0, s1));
 
   EXPECT_TRUE(c.WithRankAtMost(in0, 2, &s2).ok());
   EXPECT_EQ("?", c.DebugString(s2));
-  EXPECT_FALSE(SameHandle(s1, s2));
+  EXPECT_TRUE(SameHandle(s1, s2));
 
   // WithRankAtMost on shape with known dimensionality.
   s1 = in1;
@@ -398,11 +398,11 @@ TEST_F(ShapeInferenceTest, WithRankAtLeast) {
   // WithRankAtLeast on a shape with unknown dimensionality always succeeds.
   EXPECT_TRUE(c.WithRankAtLeast(in0, 1, &s1).ok());
   EXPECT_EQ("?", c.DebugString(s1));
-  EXPECT_FALSE(SameHandle(in0, s1));
+  EXPECT_TRUE(SameHandle(in0, s1));
 
   EXPECT_TRUE(c.WithRankAtLeast(in0, 2, &s2).ok());
   EXPECT_EQ("?", c.DebugString(s2));
-  EXPECT_FALSE(SameHandle(s1, s2));
+  EXPECT_TRUE(SameHandle(s1, s2));
 
   // WithRankAtLeast on shape with known dimensionality.
   s1 = in1;
@@ -760,7 +760,10 @@ TEST_F(ShapeInferenceTest, MergePrefix) {
   NodeDef def;
   InferenceContext c(kVersion, &def, MakeOpDef(4, 2),
                      {
-                         Unknown(), S({-1, 2}), S({1, -1, 3}), S({2, 4}),
+                         Unknown(),
+                         S({-1, 2}),
+                         S({1, -1, 3}),
+                         S({2, 4}),
                      },
                      {}, {}, {});
 
diff --git a/tensorflow/core/framework/shape_inference_testutil.h b/tensorflow/core/framework/shape_inference_testutil.h
index fbfd24538bc7a5b1f3ee3805d4a803a0e7239fca..7977841482efa396c8e0797d8c80a40c11b4df56 100644
--- a/tensorflow/core/framework/shape_inference_testutil.h
+++ b/tensorflow/core/framework/shape_inference_testutil.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_TESTUTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_TESTUTIL_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_TESTUTIL_H_
+#define TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_TESTUTIL_H_
 
 #include <vector>
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -98,4 +98,4 @@ class ShapeInferenceTestutil {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_TESTUTIL_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_TESTUTIL_H_
diff --git a/tensorflow/core/framework/step_stats.proto b/tensorflow/core/framework/step_stats.proto
index 99dee2257e0a4ccab4098f5ee49feda9ed21d2cf..65c8089d51141b915db69ef2f562ba911ea2994e 100644
--- a/tensorflow/core/framework/step_stats.proto
+++ b/tensorflow/core/framework/step_stats.proto
@@ -40,12 +40,13 @@ message NodeOutput {
 
 // For memory tracking.
 message MemoryStats {
-  int64 host_temp_memory_size = 1;
-  int64 device_temp_memory_size = 2;
-  int64 host_persistent_memory_size = 3;
-  int64 device_persistent_memory_size = 4;
-  repeated int64 host_persistent_tensor_alloc_ids = 5;
-  repeated int64 device_persistent_tensor_alloc_ids = 6;
+  int64 temp_memory_size = 1;
+  int64 persistent_memory_size = 3;
+  repeated int64 persistent_tensor_alloc_ids = 5;
+
+  int64 device_temp_memory_size = 2 [deprecated = true];
+  int64 device_persistent_memory_size = 4 [deprecated = true];
+  repeated int64 device_persistent_tensor_alloc_ids = 6 [deprecated = true];
 }
 
 // Time/size stats recorded for a single execution of a graph node.
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index 24b7b08ebcb8371dfa5d46c788a3146ca727da3f..5d32b71628263fe89d6f54fd07b2fe18bbb55e53 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -415,18 +415,10 @@ struct ProtoHelper<qint32> {
 
 template <>
 struct ProtoHelper<bfloat16> {
-  typedef Helper<float>::RepeatedFieldType FieldType;
-  static const bfloat16* Begin(const TensorProto& proto) {
-    // TODO: Isn't this wrong, given that int_val is 32 bits long?
-    return reinterpret_cast<const bfloat16*>(proto.int_val().data());
-  }
-  static size_t NumElements(const TensorProto& proto) {
-    return proto.int_val().size();
-  }
   static void Fill(const bfloat16* data, size_t n, TensorProto* proto) {
-    proto->mutable_int_val()->Reserve(n);
+    proto->mutable_half_val()->Reserve(n);
     for (size_t i = 0; i < n; ++i) {
-      proto->mutable_int_val()->AddAlreadyReserved(data[i].value);
+      proto->mutable_half_val()->AddAlreadyReserved(data[i].value);
     }
   }
 };
@@ -529,9 +521,9 @@ TensorBuffer* FromProtoField<Variant>(Allocator* a, const TensorProto& in,
   return buf;
 }
 
-// fp16 is opaque to the protobuf, so we deserialize these identical to uint16
-// but with data stored in half_val instead of int_val (ie., we don't use
-// ProtoHelper<uint16>).
+// fp16 and bfloat16 are opaque to the protobuf, so we deserialize these
+// identical to uint16 but with data stored in half_val instead of int_val (ie.,
+// we don't use ProtoHelper<uint16>).
 template <>
 TensorBuffer* FromProtoField<Eigen::half>(Allocator* a, const TensorProto& in,
                                           int64 n) {
@@ -556,6 +548,30 @@ TensorBuffer* FromProtoField<Eigen::half>(Allocator* a, const TensorProto& in,
   return buf;
 }
 
+template <>
+TensorBuffer* FromProtoField<bfloat16>(Allocator* a, const TensorProto& in,
+                                       int64 n) {
+  CHECK_GT(n, 0);
+  Buffer<bfloat16>* buf = new Buffer<bfloat16>(a, n);
+  uint16* data = buf->template base<uint16>();
+  if (data == nullptr) {
+    buf->Unref();
+    return nullptr;
+  }
+  const int64 in_n = in.half_val().size();
+  auto begin = in.half_val().begin();
+  if (n <= in_n) {
+    std::copy_n(begin, n, data);
+  } else if (in_n > 0) {
+    std::copy_n(begin, in_n, data);
+    const uint16 last = *(data + in_n - 1);
+    std::fill_n(data + in_n, n - in_n, last);
+  } else {
+    std::fill_n(data, n, 0);
+  }
+  return buf;
+}
+
 // Copies T[n] stored in the buffer "in" into the repeated field in
 // "out" corresponding to type T.
 template <typename T>
@@ -599,11 +615,11 @@ void Tensor::CheckType(DataType expected_dtype) const {
 
 void Tensor::CheckTypeAndIsAligned(DataType expected_dtype) const {
   CHECK_EQ(dtype(), expected_dtype);
-  CHECK(IsAligned());
+  CHECK(IsAligned()) << "CheckTypeAndIsAligned";
 }
 
 void Tensor::CheckIsAlignedAndSingleElement() const {
-  CHECK(IsAligned());
+  CHECK(IsAligned()) << "Aligned and single element";
   CHECK_EQ(1, NumElements()) << "Must have a one element tensor";
 }
 
@@ -870,8 +886,9 @@ bool Tensor::CanUseDMA() const {
 namespace {
 // Print from left dim to right dim recursively.
 template <typename T>
-void PrintOneDim(int dim_index, gtl::InlinedVector<int64, 4> shape, int64 limit,
-                 int shape_size, T* data, int64* data_index, string* result) {
+void PrintOneDim(int dim_index, const gtl::InlinedVector<int64, 4>& shape,
+                 int64 limit, int shape_size, const T* data, int64* data_index,
+                 string* result) {
   if (*data_index >= limit) return;
   int64 element_count = shape[dim_index];
   // We have reached the right-most dimension of the tensor.
@@ -1008,9 +1025,8 @@ StringPiece Tensor::tensor_data() const {
 }
 
 bool Tensor::SharesBufferWith(const Tensor& b) const {
-  CHECK_NE(nullptr, buf_);
-  CHECK_NE(nullptr, b.buf_);
-  return buf_->root_buffer() == b.buf_->root_buffer();
+  return buf_ != nullptr && b.buf_ != nullptr &&
+         buf_->root_buffer() == b.buf_->root_buffer();
 }
 
 string Tensor::DebugString() const {
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index c195623b279a4275ab2646483851ec3a65a1f0d4..62c42ba652356a5128d4a337e34a3b449781b445 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -424,7 +424,8 @@ class Tensor {
   typename TTypes<T, NDIMS>::ConstTensor flat_outer_dims() const;
 
   template <typename T, size_t NDIMS = 3>
-  typename TTypes<T, NDIMS>::ConstTensor flat_inner_outer_dims(int64 begin) const;
+  typename TTypes<T, NDIMS>::ConstTensor flat_inner_outer_dims(
+      int64 begin) const;
 
   /// Render the first `max_entries` values in `*this` into a string.
   string SummarizeValue(int64 max_entries) const;
@@ -464,10 +465,6 @@ class Tensor {
   void CheckTypeAndIsAligned(DataType expected_dtype) const;
   void CheckIsAlignedAndSingleElement() const;
   void set_dtype(DataType t) { shape_.set_data_type(t); }
-  template <size_t NDIMS>
-  void FillDimsAndValidateCompatibleShape(
-      gtl::ArraySlice<int64> new_sizes,
-      Eigen::array<Eigen::DenseIndex, NDIMS>* dims) const;
 
   // TensorShape's InlineVector.
   static gtl::InlinedVector<int64, 4> ComputeFlatInnerDims(
@@ -520,8 +517,13 @@ class Tensor {
 
   template <size_t NDIMS>
   void FillDimsAndValidateCompatibleShape(
-      Eigen::array<Eigen::DenseIndex, NDIMS>* dims,
-      gtl::ArraySlice<int64> new_sizes) const;
+      gtl::ArraySlice<int64> new_sizes,
+      Eigen::array<Eigen::DenseIndex, NDIMS>* dims) const;
+
+  template <typename T, size_t NDIMS>
+  void FillDimsAndValidateCompatibleShape(
+      gtl::ArraySlice<int64> new_sizes,
+      Eigen::array<Eigen::DenseIndex, NDIMS>* dims) const;
 };
 
 // Implementation details
@@ -631,12 +633,36 @@ void Tensor::FillDimsAndValidateCompatibleShape(
   CHECK_EQ(new_num_elements, NumElements());
 }
 
+template <typename T, size_t NDIMS>
+void Tensor::FillDimsAndValidateCompatibleShape(
+    gtl::ArraySlice<int64> new_sizes,
+    Eigen::array<Eigen::DenseIndex, NDIMS>* dims) const {
+  CHECK_EQ(NDIMS, new_sizes.size());
+  int64 new_num_elements = 1;
+  for (size_t d = 0; d < NDIMS; d++) {
+    new_num_elements *= new_sizes[d];
+    (*dims)[d] = new_sizes[d];
+  }
+  const int element_size = DataTypeSize(BaseType(dtype()));
+  if (element_size > 0) {
+    CHECK_EQ(new_num_elements * sizeof(T), NumElements() * element_size);
+  } else {
+    // DataTypeSize() returns 0 for some data types. In this case, assume that T
+    // has the same size as the buffer type.
+    // NOTE: If we can be sure that DataTypeSize() does not return 0 for all POD
+    // types, then we should check DataTypeToEnum<T>::v() == dtype(). Or simply
+    // check if `element_size > 0` to err when bit cast is attempted on Tensor
+    // of unknown data type size.
+    CHECK_EQ(new_num_elements, NumElements());
+  }
+}
+
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::Tensor Tensor::shaped(
     gtl::ArraySlice<int64> new_sizes) {
   CheckTypeAndIsAligned(DataTypeToEnum<T>::v());
   Eigen::array<Eigen::DenseIndex, NDIMS> dims;
-  FillDimsAndValidateCompatibleShape<NDIMS>(new_sizes, &dims);
+  FillDimsAndValidateCompatibleShape(new_sizes, &dims);
   return typename TTypes<T, NDIMS>::Tensor(base<T>(), dims);
 }
 
@@ -645,7 +671,7 @@ typename TTypes<T, NDIMS>::Tensor Tensor::bit_casted_shaped(
     gtl::ArraySlice<int64> new_sizes) {
   CHECK(IsAligned());
   Eigen::array<Eigen::DenseIndex, NDIMS> dims;
-  FillDimsAndValidateCompatibleShape<NDIMS>(new_sizes, &dims);
+  FillDimsAndValidateCompatibleShape<T>(new_sizes, &dims);
   return typename TTypes<T, NDIMS>::Tensor(base<T>(), dims);
 }
 
@@ -654,29 +680,17 @@ typename TTypes<T, NDIMS>::UnalignedTensor Tensor::unaligned_shaped(
     gtl::ArraySlice<int64> new_sizes) {
   CheckType(DataTypeToEnum<T>::v());
   Eigen::array<Eigen::DenseIndex, NDIMS> dims;
-  FillDimsAndValidateCompatibleShape<NDIMS>(new_sizes, &dims);
+  FillDimsAndValidateCompatibleShape(new_sizes, &dims);
   return typename TTypes<T, NDIMS>::UnalignedTensor(base<T>(), dims);
 }
 
-template <size_t NDIMS>
-void Tensor::FillDimsAndValidateCompatibleShape(
-    Eigen::array<Eigen::DenseIndex, NDIMS>* dims,
-    gtl::ArraySlice<int64> new_sizes) const {
-  CHECK_EQ(NDIMS, new_sizes.size());
-  int64 new_num_elements = 1;
-  for (size_t d = 0; d < NDIMS; d++) {
-    new_num_elements *= new_sizes[d];
-    (*dims)[d] = new_sizes[d];
-  }
-  CHECK_EQ(new_num_elements, NumElements());
-}
-
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::ConstTensor Tensor::shaped(
     gtl::ArraySlice<int64> new_sizes) const {
-  CheckTypeAndIsAligned(DataTypeToEnum<T>::v());
+  CheckType(DataTypeToEnum<T>::v());
+  CHECK(IsAligned());
   Eigen::array<Eigen::DenseIndex, NDIMS> dims;
-  FillDimsAndValidateCompatibleShape(&dims, new_sizes);
+  FillDimsAndValidateCompatibleShape(new_sizes, &dims);
   return typename TTypes<T, NDIMS>::ConstTensor(base<T>(), dims);
 }
 
@@ -685,7 +699,7 @@ typename TTypes<T, NDIMS>::ConstTensor Tensor::bit_casted_shaped(
     gtl::ArraySlice<int64> new_sizes) const {
   CHECK(IsAligned());
   Eigen::array<Eigen::DenseIndex, NDIMS> dims;
-  FillDimsAndValidateCompatibleShape(&dims, new_sizes);
+  FillDimsAndValidateCompatibleShape<T>(new_sizes, &dims);
   return typename TTypes<T, NDIMS>::ConstTensor(base<T>(), dims);
 }
 
@@ -694,7 +708,7 @@ typename TTypes<T, NDIMS>::UnalignedConstTensor Tensor::unaligned_shaped(
     gtl::ArraySlice<int64> new_sizes) const {
   CheckType(DataTypeToEnum<T>::v());
   Eigen::array<Eigen::DenseIndex, NDIMS> dims;
-  FillDimsAndValidateCompatibleShape(&dims, new_sizes);
+  FillDimsAndValidateCompatibleShape(new_sizes, &dims);
   return typename TTypes<T, NDIMS>::UnalignedConstTensor(base<T>(), dims);
 }
 
@@ -722,8 +736,8 @@ typename TTypes<T, NDIMS>::Tensor Tensor::flat_outer_dims() {
 
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::Tensor Tensor::flat_inner_outer_dims(int64 begin) {
-  gtl::InlinedVector<int64,4> flat_outer = ComputeFlatOuterDims(
-      shape_.dim_sizes(), begin + NDIMS);
+  gtl::InlinedVector<int64, 4> flat_outer =
+      ComputeFlatOuterDims(shape_.dim_sizes(), begin + NDIMS);
   return shaped<T, NDIMS>(ComputeFlatInnerDims(flat_outer, NDIMS));
 }
 
@@ -738,9 +752,10 @@ typename TTypes<T, NDIMS>::ConstTensor Tensor::flat_outer_dims() const {
 }
 
 template <typename T, size_t NDIMS>
-typename TTypes<T, NDIMS>::ConstTensor Tensor::flat_inner_outer_dims(int64 begin) const {
-  gtl::InlinedVector<int64,4> flat_outer = ComputeFlatOuterDims(
-      shape_.dim_sizes(), begin + NDIMS);
+typename TTypes<T, NDIMS>::ConstTensor Tensor::flat_inner_outer_dims(
+    int64 begin) const {
+  gtl::InlinedVector<int64, 4> flat_outer =
+      ComputeFlatOuterDims(shape_.dim_sizes(), begin + NDIMS);
   return shaped<T, NDIMS>(ComputeFlatInnerDims(flat_outer, NDIMS));
 }
 
diff --git a/tensorflow/core/framework/tensor.proto b/tensorflow/core/framework/tensor.proto
index 6dab325969bacbda15552a79eb3c0862dbde20a1..abbf16e8103326011525feb0017922474ff8d2cf 100644
--- a/tensorflow/core/framework/tensor.proto
+++ b/tensorflow/core/framework/tensor.proto
@@ -40,8 +40,8 @@ message TensorProto {
   // be set.  The values hold the flattened representation of the tensor in
   // row major order.
 
-  // DT_HALF. Note that since protobuf has no int16 type, we'll have some
-  // pointless zero padding for each value here.
+  // DT_HALF, DT_BFLOAT16. Note that since protobuf has no int16 type, we'll
+  // have some pointless zero padding for each value here.
   repeated int32 half_val = 13 [packed = true];
 
   // DT_FLOAT.
diff --git a/tensorflow/core/framework/tensor_shape_test.cc b/tensorflow/core/framework/tensor_shape_test.cc
index 06c576c7d41e5bf48f9db6754e5814142632a371..d7517bb311d517351f4dd2a59438780482485dff 100644
--- a/tensorflow/core/framework/tensor_shape_test.cc
+++ b/tensorflow/core/framework/tensor_shape_test.cc
@@ -359,7 +359,8 @@ Status TensorShapeOld::IsValidShape(const TensorShapeProto& proto) {
   for (const auto& d : proto.dim()) {
     if (d.size() < 0) {
       return errors::InvalidArgument("Shape ", DebugString(proto),
-                                     " has negative dimensions");
+                                     " has negative dimensions; ",
+                                     "perhaps an un-fed placeholder?");
     }
     num_elements *= d.size();
     if (num_elements > kMaxElements) {
@@ -581,7 +582,8 @@ TEST(TensorShapeTest, Large) {
 TEST(TensorShapeTest, Overflow) {
   int64 one = 1;
   std::vector<std::vector<int64>> overflows = {
-      {1 << 30, 1 << 30, 1 << 30}, {1 << 5, (one << 60) + 1},
+      {1 << 30, 1 << 30, 1 << 30},
+      {1 << 5, (one << 60) + 1},
   };
   for (const auto& overflow : overflows) {
     TensorShapeProto proto;
diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc
index 47ff29fbe1a4d118e52c8faaa04019f88db0e1ae..b613effd18bbbaf107a56b518859024db1c9bbb2 100644
--- a/tensorflow/core/framework/tensor_test.cc
+++ b/tensorflow/core/framework/tensor_test.cc
@@ -20,12 +20,14 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/variant_encode_decode.h"
 #include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
 namespace tensorflow {
+
 class TensorTestHelper {
  public:
   // This is an operation that can be done by VariableOp.
@@ -33,13 +35,13 @@ class TensorTestHelper {
 };
 
 // To make TestCopies do the right thing.
-inline bool operator==(const ResourceHandle& a, const ResourceHandle& b) {
+bool operator==(const ResourceHandle& a, const ResourceHandle& b) {
   return a.device() == b.device() && a.container() == b.container() &&
          a.name() == b.name() && a.hash_code() == b.hash_code() &&
          a.maybe_type_name() == b.maybe_type_name();
 }
 
-inline bool operator==(const Variant& a, const Variant& b) {
+bool operator==(const Variant& a, const Variant& b) {
   if (a.is_empty()) {
     return b.is_empty();
   }
@@ -72,6 +74,8 @@ inline bool operator==(const Variant& a, const Variant& b) {
   return true;
 }
 
+namespace {
+
 TEST(TensorTest, Default) {
   Tensor t;
   EXPECT_EQ(t.dtype(), DT_FLOAT);
@@ -175,6 +179,28 @@ void TestCopies(const Tensor& t) {
   }
 }
 
+TEST(Tensor_Half, Simple) {
+  Tensor t(DT_HALF, TensorShape({5, 7}));
+  EXPECT_TRUE(t.shape().IsSameSize(TensorShape({5, 7})));
+  for (int64 a = 0; a < t.shape().dim_size(0); a++) {
+    for (int64 b = 0; b < t.shape().dim_size(1); b++) {
+      t.matrix<Eigen::half>()(a, b) = static_cast<Eigen::half>(a * b);
+    }
+  }
+  TestCopies<Eigen::half>(t);
+}
+
+TEST(Tensor_Bfloat16, Simple) {
+  Tensor t(DT_BFLOAT16, TensorShape({5, 7}));
+  EXPECT_TRUE(t.shape().IsSameSize(TensorShape({5, 7})));
+  for (int64 a = 0; a < t.shape().dim_size(0); a++) {
+    for (int64 b = 0; b < t.shape().dim_size(1); b++) {
+      t.matrix<bfloat16>()(a, b) = static_cast<bfloat16>(a * b);
+    }
+  }
+  TestCopies<bfloat16>(t);
+}
+
 TEST(Tensor_Float, Simple) {
   Tensor t(DT_FLOAT, TensorShape({10, 20}));
   EXPECT_TRUE(t.shape().IsSameSize(TensorShape({10, 20})));
@@ -334,41 +360,126 @@ class TensorReshapeTest : public ::testing::Test {
     tensor(0, 0, 0, 0) = 0.01f;
     tensor(1, 2, 3, 4) = 0.02f;
   }
-};
 
-TEST_F(TensorReshapeTest, Reshape) {
-  LOG(INFO) << "shaped";
-  {
-    auto shaped = t.shaped<float, 1>({120});
-    EXPECT_EQ(120, shaped.dimension(0));
-    EXPECT_EQ(shaped(0), 0.01f);
-    EXPECT_EQ(shaped(119), 0.02f);
-  }
-  {
-    auto shaped = t.shaped<float, 2>({6, 20});
-    EXPECT_EQ(6, shaped.dimension(0));
-    EXPECT_EQ(20, shaped.dimension(1));
-    EXPECT_EQ(shaped(0, 0), 0.01f);
-    EXPECT_EQ(shaped(5, 19), 0.02f);
+  template <typename T>
+  using ReshapeFunc = T (Tensor::*)(gtl::ArraySlice<int64>);
+  template <typename T>
+  using ConstReshapeFunc = T (Tensor::*)(gtl::ArraySlice<int64>) const;
+
+  template <typename T, ReshapeFunc<T> Func>
+  void TestReshape(std::initializer_list<int64> sizes) {
+    T shaped = (t.*Func)(sizes);
+    TestReshapeImpl(shaped, sizes);
   }
-  {
-    auto shaped = t.shaped<float, 3>({6, 4, 5});
-    EXPECT_EQ(6, shaped.dimension(0));
-    EXPECT_EQ(4, shaped.dimension(1));
-    EXPECT_EQ(5, shaped.dimension(2));
-    EXPECT_EQ(shaped(0, 0, 0), 0.01f);
-    EXPECT_EQ(shaped(5, 3, 4), 0.02f);
+
+  template <typename T, ConstReshapeFunc<T> Func>
+  void TestReshape(std::initializer_list<int64> sizes) {
+    T shaped = (static_cast<const Tensor&>(t).*Func)(sizes);
+    TestReshapeImpl(shaped, sizes);
   }
-  {
-    auto shaped = t.shaped<float, 4>({2, 3, 4, 5});
-    EXPECT_EQ(2, shaped.dimension(0));
-    EXPECT_EQ(3, shaped.dimension(1));
-    EXPECT_EQ(4, shaped.dimension(2));
-    EXPECT_EQ(5, shaped.dimension(3));
 
-    EXPECT_EQ(shaped(0, 0, 0, 0), 0.01f);
-    EXPECT_EQ(shaped(1, 2, 3, 4), 0.02f);
+  template <typename T>
+  void TestReshapeImpl(T shaped, std::initializer_list<int64> sizes) {
+    auto iter = sizes.begin();
+    for (int i = 0; i < shaped.rank(); ++i, ++iter) {
+      EXPECT_EQ(*iter, shaped.dimension(i));
+    }
+
+    using Index = typename T::Index;
+    using Scalar = typename T::Scalar;
+    constexpr int N = T::NumIndices;
+
+    // To handle the cast when `shaped` is bit casted into a different type.
+    const float expected_first = 0.01f;
+    Eigen::DSizes<Index, N> coord;
+    EXPECT_EQ(shaped(coord), *reinterpret_cast<const Scalar*>(&expected_first));
+
+    for (int i = 0; i < N; ++i) {
+      coord[i] = shaped.dimension(i) - 1;
+    }
+    const float expected_last = 0.02f;
+    constexpr int kNumScalarPerFloat =
+        sizeof(float) / sizeof(Scalar);  // Assuming even divide.
+    EXPECT_EQ(shaped(coord), reinterpret_cast<const Scalar*>(
+                                 &expected_last)[kNumScalarPerFloat - 1]);
   }
+};
+
+TEST_F(TensorReshapeTest, Reshape) {
+  LOG(INFO) << "shaped";
+
+#define TEST_RESHAPE(...)                                                  \
+  {                                                                        \
+    constexpr int N = (sizeof((int[]){__VA_ARGS__}) / sizeof(int));        \
+    TestReshape<TTypes<float, N>::Tensor, &Tensor::shaped<float, N>>(      \
+        {__VA_ARGS__});                                                    \
+    TestReshape<TTypes<float, N>::ConstTensor, &Tensor::shaped<float, N>>( \
+        {__VA_ARGS__});                                                    \
+    TestReshape<TTypes<float, N>::UnalignedTensor,                         \
+                &Tensor::unaligned_shaped<float, N>>({__VA_ARGS__});       \
+    TestReshape<TTypes<float, N>::UnalignedConstTensor,                    \
+                &Tensor::unaligned_shaped<float, N>>({__VA_ARGS__});       \
+    TestReshape<TTypes<float, N>::Tensor,                                  \
+                &Tensor::bit_casted_shaped<float, N>>({__VA_ARGS__});      \
+    TestReshape<TTypes<float, N>::ConstTensor,                             \
+                &Tensor::bit_casted_shaped<float, N>>({__VA_ARGS__});      \
+    TestReshape<TTypes<int32, N>::Tensor,                                  \
+                &Tensor::bit_casted_shaped<int32, N>>({__VA_ARGS__});      \
+    TestReshape<TTypes<int32, N>::ConstTensor,                             \
+                &Tensor::bit_casted_shaped<int32, N>>({__VA_ARGS__});      \
+  }
+
+  TEST_RESHAPE(120);
+  TEST_RESHAPE(6, 20);
+  TEST_RESHAPE(6, 4, 5);
+  TEST_RESHAPE(2, 3, 4, 5);
+#undef TEST_RESHAPE
+}
+
+TEST_F(TensorReshapeTest, BitcastReshapeDifferentSize) {
+#define TEST_BITCAST8_RESHAPE(...)                                    \
+  {                                                                   \
+    constexpr int N = (sizeof((int[]){__VA_ARGS__}) / sizeof(int));   \
+    TestReshape<TTypes<uint8, N>::Tensor,                             \
+                &Tensor::bit_casted_shaped<uint8, N>>({__VA_ARGS__}); \
+  }
+
+  TEST_BITCAST8_RESHAPE(480);
+  TEST_BITCAST8_RESHAPE(24, 20);
+  TEST_BITCAST8_RESHAPE(6, 16, 5);
+  TEST_BITCAST8_RESHAPE(2, 3, 4, 20);
+#undef TEST_BITCAST8_RESHAPE
+#define TEST_BITCAST16_RESHAPE(...)                                   \
+  {                                                                   \
+    constexpr int N = (sizeof((int[]){__VA_ARGS__}) / sizeof(int));   \
+    TestReshape<TTypes<int16, N>::Tensor,                             \
+                &Tensor::bit_casted_shaped<int16, N>>({__VA_ARGS__}); \
+  }
+
+  TEST_BITCAST16_RESHAPE(240);
+  TEST_BITCAST16_RESHAPE(6, 40);
+  TEST_BITCAST16_RESHAPE(12, 4, 5);
+  TEST_BITCAST16_RESHAPE(2, 3, 8, 5);
+  TEST_BITCAST16_RESHAPE(2, 3, 4, 1, 10);
+#undef TEST_BITCAST16_RESHAPE
+}
+
+TEST_F(TensorReshapeTest, ReshapeError) {
+  EXPECT_DEATH((t.shaped<float, 0>({})), "1 vs. 120");
+  EXPECT_DEATH((t.shaped<float, 1>({119})), "119 vs. 120");
+  EXPECT_DEATH((t.shaped<float, 4>({2, 3, 4, 6})), "144 vs. 120");
+
+  EXPECT_DEATH((t.unaligned_shaped<float, 0>({})), "1 vs. 120");
+  EXPECT_DEATH((t.unaligned_shaped<float, 1>({119})), "119 vs. 120");
+  EXPECT_DEATH((t.unaligned_shaped<float, 4>({2, 3, 4, 6})), "144 vs. 120");
+
+  EXPECT_DEATH((t.bit_casted_shaped<float, 0>({})), "4 vs. 480");
+  EXPECT_DEATH((t.bit_casted_shaped<float, 1>({119})), "476 vs. 480");
+  EXPECT_DEATH((t.bit_casted_shaped<float, 4>({2, 3, 4, 6})), "576 vs. 480");
+
+  Tensor string_tensor{DT_STRING, {10}};
+  // Note that the error message compare # of elements, not # of bytes.
+  EXPECT_DEATH((string_tensor.bit_casted_shaped<string, 1>({9})), "9 vs. 10");
 }
 
 TEST_F(TensorReshapeTest, Flat) {
@@ -890,7 +1001,7 @@ TEST(Tensor_Complex, SimpleWithHelper64) {
     // x contains all the 8-th root of unity.
     Tensor x(DT_COMPLEX64, TensorShape({8}));
     for (int i = 0; i < 8; ++i) {
-      x.vec<complex64>()(i) = std::pow(rotate_45, i);
+      x.vec<complex64>()(i) = MathUtil::IPow(rotate_45, i);
     }
 
     // Shift the roots by 45 degree.
@@ -898,7 +1009,7 @@ TEST(Tensor_Complex, SimpleWithHelper64) {
     y.vec<complex64>() = x.vec<complex64>() * rotate_45;
     Tensor y_expected(DT_COMPLEX64, TensorShape({8}));
     for (int i = 0; i < 8; ++i) {
-      y_expected.vec<complex64>()(i) = std::pow(rotate_45, i + 1);
+      y_expected.vec<complex64>()(i) = MathUtil::IPow(rotate_45, i + 1);
     }
     test::ExpectTensorNear<complex64>(y, y_expected, 1e-5);
 
@@ -939,7 +1050,7 @@ TEST(Tensor_Complex, SimpleWithHelper128) {
     // x contains all the 8-th root of unity.
     Tensor x(DT_COMPLEX128, TensorShape({8}));
     for (int i = 0; i < 8; ++i) {
-      x.vec<complex128>()(i) = std::pow(rotate_45, i);
+      x.vec<complex128>()(i) = MathUtil::IPow(rotate_45, i);
     }
 
     // Shift the roots by 45 degree.
@@ -947,7 +1058,7 @@ TEST(Tensor_Complex, SimpleWithHelper128) {
     y.vec<complex128>() = x.vec<complex128>() * rotate_45;
     Tensor y_expected(DT_COMPLEX128, TensorShape({8}));
     for (int i = 0; i < 8; ++i) {
-      y_expected.vec<complex128>()(i) = std::pow(rotate_45, i + 1);
+      y_expected.vec<complex128>()(i) = MathUtil::IPow(rotate_45, i + 1);
     }
     test::ExpectTensorNear<complex128>(y, y_expected, 1e-5);
 
@@ -962,8 +1073,6 @@ TEST(Tensor_Complex, SimpleWithHelper128) {
   }
 }
 
-namespace {
-
 // An allocator that always returns nullptr, for testing
 // failures to allocate.
 class DummyCPUAllocator : public Allocator {
@@ -976,6 +1085,21 @@ class DummyCPUAllocator : public Allocator {
   void DeallocateRaw(void* ptr) override {}
 };
 
+TEST(Tensor, SharesBufferWith) {
+  Tensor a_empty;
+  Tensor b_empty;
+  Tensor a(DT_FLOAT, TensorShape({1}));
+  Tensor b(DT_FLOAT, TensorShape({1}));
+  Tensor copy(a);
+  EXPECT_FALSE(a_empty.SharesBufferWith(a_empty));
+  EXPECT_FALSE(a_empty.SharesBufferWith(b_empty));
+  EXPECT_FALSE(a_empty.SharesBufferWith(a));
+  EXPECT_FALSE(a_empty.SharesBufferWith(copy));
+  EXPECT_TRUE(a.SharesBufferWith(a));
+  EXPECT_FALSE(a.SharesBufferWith(b));
+  EXPECT_TRUE(a.SharesBufferWith(copy));
+}
+
 TEST(Tensor, FailureToAllocate) {
   TensorShape shape({1});
   DummyCPUAllocator allocator;
@@ -1103,7 +1227,6 @@ TEST(Tensor, Slice_Basic) {
   }
 }
 
-namespace {
 template <typename T>
 Tensor MkTensor(DataType dt, const TensorShape& shape,
                 std::vector<T> init_values) {
@@ -1116,7 +1239,6 @@ Tensor MkTensor(DataType dt, const TensorShape& shape,
   }
   return x;
 }
-}  // namespace
 
 TEST(SummarizeValue, Uninitialized) {
   Tensor x(DT_INT32);
@@ -1165,7 +1287,7 @@ TEST(SummarizeValue, STRING) {
   EXPECT_EQ("one two three four five one...", x.SummarizeValue(6));
 }
 
-static void BM_CreateAndDestroy(int iters) {
+void BM_CreateAndDestroy(int iters) {
   TensorShape shape({10, 20});
   while (--iters) {
     Tensor t(DT_FLOAT, shape);
@@ -1173,7 +1295,7 @@ static void BM_CreateAndDestroy(int iters) {
 }
 BENCHMARK(BM_CreateAndDestroy);
 
-static void BM_Assign(int iters) {
+void BM_Assign(int iters) {
   Tensor a(DT_FLOAT, TensorShape({10, 20}));
   Tensor b(DT_FLOAT, TensorShape({10, 20}));
   bool a_to_b = true;
@@ -1195,7 +1317,7 @@ TEST(Tensor, EmptyTensorData) {
 }
 
 // Benchmark create and destroy a tensor, with an allocated buffer.
-static void BM_CreateAndDestroyWithBuf(int iters) {
+void BM_CreateAndDestroyWithBuf(int iters) {
   TensorShape shape({10, 20});
   Allocator* allocator = cpu_allocator();
   while (--iters) {
@@ -1205,7 +1327,7 @@ static void BM_CreateAndDestroyWithBuf(int iters) {
 BENCHMARK(BM_CreateAndDestroyWithBuf);
 
 // Benchmark create+copy a tensor, with an allocated buffer.
-static void BM_CreateAndCopyCtrWithBuf(int iters) {
+void BM_CreateAndCopyCtrWithBuf(int iters) {
   TensorShape shape({10, 20});
   Allocator* allocator = cpu_allocator();
   while (--iters) {
@@ -1216,7 +1338,7 @@ static void BM_CreateAndCopyCtrWithBuf(int iters) {
 BENCHMARK(BM_CreateAndCopyCtrWithBuf);
 
 // Benchmark create+move a tensor, with an allocated buffer.
-static void BM_CreateAndMoveCtrWithBuf(int iters) {
+void BM_CreateAndMoveCtrWithBuf(int iters) {
   TensorShape shape({10, 20});
   Allocator* allocator = cpu_allocator();
   while (--iters) {
diff --git a/tensorflow/core/framework/tensor_testutil.cc b/tensorflow/core/framework/tensor_testutil.cc
index a8d141230093152397c792588a716c00556df77d..8f480d65f25012b858d7d375196b2693d3a533b9 100644
--- a/tensorflow/core/framework/tensor_testutil.cc
+++ b/tensorflow/core/framework/tensor_testutil.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <cmath>
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include <cmath>
 
 namespace tensorflow {
 namespace test {
diff --git a/tensorflow/core/framework/tensor_types.h b/tensorflow/core/framework/tensor_types.h
index 921f88dc0ba09e7904333613b728021751d5425c..a5c1a56bfc06a9785f08c468f78bda5111e15409 100644
--- a/tensorflow/core/framework/tensor_types.h
+++ b/tensorflow/core/framework/tensor_types.h
@@ -25,7 +25,8 @@ template <typename T, int NDIMS = 1, typename IndexType = Eigen::DenseIndex>
 struct TTypes {
   // Rank-<NDIMS> tensor of scalar type T.
   typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>,
-                           Eigen::Aligned> Tensor;
+                           Eigen::Aligned>
+      Tensor;
   typedef Eigen::TensorMap<
       Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>, Eigen::Aligned>
       ConstTensor;
@@ -33,35 +34,42 @@ struct TTypes {
   // Unaligned Rank-<NDIMS> tensor of scalar type T.
   typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType> >
       UnalignedTensor;
-  typedef Eigen::TensorMap<Eigen::Tensor<const T, NDIMS, Eigen::RowMajor,
-                                         IndexType> > UnalignedConstTensor;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType> >
+      UnalignedConstTensor;
 
   typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, int>,
-                           Eigen::Aligned> Tensor32Bit;
+                           Eigen::Aligned>
+      Tensor32Bit;
 
   // Scalar tensor (implemented as a rank-0 tensor) of scalar type T.
   typedef Eigen::TensorMap<
       Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>,
-      Eigen::Aligned> Scalar;
+      Eigen::Aligned>
+      Scalar;
   typedef Eigen::TensorMap<Eigen::TensorFixedSize<const T, Eigen::Sizes<>,
                                                   Eigen::RowMajor, IndexType>,
-                           Eigen::Aligned> ConstScalar;
+                           Eigen::Aligned>
+      ConstScalar;
 
   // Unaligned Scalar tensor of scalar type T.
-  typedef Eigen::TensorMap<Eigen::TensorFixedSize<
-      T, Eigen::Sizes<>, Eigen::RowMajor, IndexType> > UnalignedScalar;
+  typedef Eigen::TensorMap<
+      Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType> >
+      UnalignedScalar;
   typedef Eigen::TensorMap<Eigen::TensorFixedSize<const T, Eigen::Sizes<>,
                                                   Eigen::RowMajor, IndexType> >
       UnalignedConstScalar;
 
   // Rank-1 tensor (vector) of scalar type T.
   typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>,
-                           Eigen::Aligned> Flat;
+                           Eigen::Aligned>
+      Flat;
   typedef Eigen::TensorMap<
       Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned>
       ConstFlat;
   typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>,
-                           Eigen::Aligned> Vec;
+                           Eigen::Aligned>
+      Vec;
   typedef Eigen::TensorMap<
       Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned>
       ConstVec;
@@ -69,16 +77,19 @@ struct TTypes {
   // Unaligned Rank-1 tensor (vector) of scalar type T.
   typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType> >
       UnalignedFlat;
-  typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor,
-                                         IndexType> > UnalignedConstFlat;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType> >
+      UnalignedConstFlat;
   typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType> >
       UnalignedVec;
   typedef Eigen::TensorMap<
-      Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType> > UnalignedConstVec;
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType> >
+      UnalignedConstVec;
 
   // Rank-2 tensor (matrix) of scalar type T.
   typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>,
-                           Eigen::Aligned> Matrix;
+                           Eigen::Aligned>
+      Matrix;
   typedef Eigen::TensorMap<
       Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned>
       ConstMatrix;
@@ -86,8 +97,9 @@ struct TTypes {
   // Unaligned Rank-2 tensor (matrix) of scalar type T.
   typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType> >
       UnalignedMatrix;
-  typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor,
-                                         IndexType> > UnalignedConstMatrix;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType> >
+      UnalignedConstMatrix;
 };
 
 typedef typename TTypes<float, 1>::Tensor32Bit::Index Index32;
diff --git a/tensorflow/core/framework/tracking_allocator.cc b/tensorflow/core/framework/tracking_allocator.cc
index 239dfd13ec2e45acb0a65700f2a8882c61fc03b3..2df402573a58ad3728e03a22d391b32766c49b00 100644
--- a/tensorflow/core/framework/tracking_allocator.cc
+++ b/tensorflow/core/framework/tracking_allocator.cc
@@ -113,7 +113,7 @@ bool TrackingAllocator::TracksAllocationSizes() {
   return track_sizes_locally_ || allocator_->TracksAllocationSizes();
 }
 
-size_t TrackingAllocator::RequestedSize(void* ptr) {
+size_t TrackingAllocator::RequestedSize(const void* ptr) {
   if (track_sizes_locally_) {
     mutex_lock lock(mu_);
     auto it = in_use_.find(ptr);
@@ -126,7 +126,7 @@ size_t TrackingAllocator::RequestedSize(void* ptr) {
   }
 }
 
-size_t TrackingAllocator::AllocatedSize(void* ptr) {
+size_t TrackingAllocator::AllocatedSize(const void* ptr) {
   if (track_sizes_locally_) {
     mutex_lock lock(mu_);
     auto it = in_use_.find(ptr);
@@ -139,7 +139,7 @@ size_t TrackingAllocator::AllocatedSize(void* ptr) {
   }
 }
 
-int64 TrackingAllocator::AllocationId(void* ptr) {
+int64 TrackingAllocator::AllocationId(const void* ptr) {
   if (track_sizes_locally_) {
     mutex_lock lock(mu_);
     auto it = in_use_.find(ptr);
@@ -156,6 +156,8 @@ void TrackingAllocator::GetStats(AllocatorStats* stats) {
   allocator_->GetStats(stats);
 }
 
+void TrackingAllocator::ClearStats() { allocator_->ClearStats(); }
+
 std::tuple<size_t, size_t, size_t> TrackingAllocator::GetSizes() {
   size_t high_watermark;
   size_t total_bytes;
diff --git a/tensorflow/core/framework/tracking_allocator.h b/tensorflow/core/framework/tracking_allocator.h
index a6c26c89e51f1fec01886672b91f863ee36bedc8..f6c3c0b71b951c3b89b0d444c0d2d588a395dadd 100644
--- a/tensorflow/core/framework/tracking_allocator.h
+++ b/tensorflow/core/framework/tracking_allocator.h
@@ -64,10 +64,11 @@ class TrackingAllocator : public Allocator {
                     const AllocationAttributes& allocation_attr) override;
   void DeallocateRaw(void* ptr) override;
   bool TracksAllocationSizes() override;
-  size_t RequestedSize(void* ptr) override;
-  size_t AllocatedSize(void* ptr) override;
-  int64 AllocationId(void* ptr) override;
+  size_t RequestedSize(const void* ptr) override;
+  size_t AllocatedSize(const void* ptr) override;
+  int64 AllocationId(const void* ptr) override;
   void GetStats(AllocatorStats* stats) override;
+  void ClearStats() override;
 
   // If the underlying allocator tracks allocation sizes, this returns
   // a tuple where the first value is the total number of bytes
@@ -124,7 +125,7 @@ class TrackingAllocator : public Allocator {
     size_t allocated_size;
     int64 allocation_id;
   };
-  std::unordered_map<void*, Chunk> in_use_ GUARDED_BY(mu_);
+  std::unordered_map<const void*, Chunk> in_use_ GUARDED_BY(mu_);
   int64 next_allocation_id_ GUARDED_BY(mu_);
 };
 
diff --git a/tensorflow/core/framework/tracking_allocator_test.cc b/tensorflow/core/framework/tracking_allocator_test.cc
index 4e32a907f20f34183abbbc57b93c38197710fa51..2cdc7edd2d1e9f2634a96e85879dc45a53f633cc 100644
--- a/tensorflow/core/framework/tracking_allocator_test.cc
+++ b/tensorflow/core/framework/tracking_allocator_test.cc
@@ -39,7 +39,7 @@ class TestableSizeTrackingAllocator : public Allocator {
     port::Free(ptr);
   }
   bool TracksAllocationSizes() override { return true; }
-  size_t RequestedSize(void* ptr) override {
+  size_t RequestedSize(const void* ptr) override {
     const auto& iter = size_map_.find(ptr);
     EXPECT_NE(size_map_.end(), iter);
     return iter->second;
@@ -47,7 +47,7 @@ class TestableSizeTrackingAllocator : public Allocator {
   void GetStats(AllocatorStats* stats) override { stats->Clear(); }
 
  private:
-  std::unordered_map<void*, size_t> size_map_;
+  std::unordered_map<const void*, size_t> size_map_;
 };
 
 class NoMemoryAllocator : public Allocator {
diff --git a/tensorflow/core/framework/types.cc b/tensorflow/core/framework/types.cc
index faae19585d9dd2bc5f351772af93723daaa3b8be..adf4e1bae307d81d91e7e597fc882caf4c87601f 100644
--- a/tensorflow/core/framework/types.cc
+++ b/tensorflow/core/framework/types.cc
@@ -47,11 +47,8 @@ const std::string DeviceName<Eigen::GpuDevice>::value = DEVICE_GPU;
 const std::string DeviceName<Eigen::SyclDevice>::value = DEVICE_SYCL;
 #endif  // TENSORFLOW_USE_SYCL
 
-string DataTypeString(DataType dtype) {
-  if (IsRefType(dtype)) {
-    DataType non_ref = static_cast<DataType>(dtype - kDataTypeRefOffset);
-    return strings::StrCat(DataTypeString(non_ref), "_ref");
-  }
+namespace {
+string DataTypeStringInternal(DataType dtype) {
   switch (dtype) {
     case DT_INVALID:
       return "INVALID";
@@ -106,6 +103,15 @@ string DataTypeString(DataType dtype) {
       return strings::StrCat("unknown dtype enum (", dtype, ")");
   }
 }
+}  // end namespace
+
+string DataTypeString(DataType dtype) {
+  if (IsRefType(dtype)) {
+    DataType non_ref = static_cast<DataType>(dtype - kDataTypeRefOffset);
+    return strings::StrCat(DataTypeStringInternal(non_ref), "_ref");
+  }
+  return DataTypeStringInternal(dtype);
+}
 
 bool DataTypeFromString(StringPiece sp, DataType* dt) {
   if (sp.ends_with("_ref")) {
@@ -205,142 +211,12 @@ string DataTypeSliceString(const DataTypeSlice types) {
   return out;
 }
 
-DataTypeVector AllTypes() {
-  return {DT_FLOAT,   DT_DOUBLE, DT_INT32,  DT_UINT8,     DT_INT16,
-          DT_UINT16,  DT_INT8,   DT_STRING, DT_COMPLEX64, DT_COMPLEX128,
-          DT_INT64,   DT_BOOL,   DT_QINT8,  DT_QUINT8,    DT_QINT16,
-          DT_QUINT16, DT_QINT32, DT_HALF,   DT_RESOURCE,  DT_VARIANT,
-          DT_UINT32,  DT_UINT64};
-}
-
-#if !defined(IS_MOBILE_PLATFORM) || defined(SUPPORT_SELECTIVE_REGISTRATION)
-
-DataTypeVector RealNumberTypes() {
-  return {DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64,  DT_UINT8, DT_INT16,
-          DT_INT8,  DT_UINT16, DT_HALF,  DT_UINT32, DT_UINT64};
-}
-
-DataTypeVector QuantizedTypes() {
-  return {DT_QINT8, DT_QUINT8, DT_QINT16, DT_QUINT16, DT_QINT32};
-}
-
-DataTypeVector RealAndQuantizedTypes() {
-  return {DT_FLOAT,  DT_DOUBLE,  DT_INT32,  DT_INT64, DT_UINT8,
-          DT_UINT16, DT_UINT16,  DT_INT8,   DT_QINT8, DT_QUINT8,
-          DT_QINT16, DT_QUINT16, DT_QINT32, DT_HALF};
-}
-
-DataTypeVector NumberTypes() {
-  return {DT_FLOAT,     DT_DOUBLE,     DT_INT64,  DT_INT32,
-          DT_UINT8,     DT_UINT16,     DT_INT16,  DT_INT8,
-          DT_COMPLEX64, DT_COMPLEX128, DT_QINT8,  DT_QUINT8,
-          DT_QINT32,    DT_HALF,       DT_UINT32, DT_UINT64};
-}
-
-#elif defined(__ANDROID_TYPES_FULL__)
-
-DataTypeVector RealNumberTypes() {
-  return {DT_FLOAT, DT_INT32, DT_INT64, DT_HALF};
-}
-
-DataTypeVector NumberTypes() {
-  return {DT_FLOAT,  DT_INT32,  DT_INT64, DT_QINT8,
-          DT_QUINT8, DT_QINT32, DT_HALF};
-}
-
-DataTypeVector QuantizedTypes() {
-  return {DT_QINT8, DT_QUINT8, DT_QINT16, DT_QUINT16, DT_QINT32};
-}
-
-DataTypeVector RealAndQuantizedTypes() {
-  return {DT_FLOAT,  DT_INT32,   DT_INT64,  DT_QINT8, DT_QUINT8,
-          DT_QINT16, DT_QUINT16, DT_QINT32, DT_HALF};
-}
-
-#else  // defined(IS_MOBILE_PLATFORM) && !defined(__ANDROID_TYPES_FULL__)
-
-DataTypeVector RealNumberTypes() { return {DT_FLOAT, DT_INT32}; }
-
-DataTypeVector NumberTypes() {
-  return {DT_FLOAT, DT_INT32, DT_QINT8, DT_QUINT8, DT_QINT32};
-}
-
-DataTypeVector QuantizedTypes() {
-  return {DT_QINT8, DT_QUINT8, DT_QINT16, DT_QUINT16, DT_QINT32};
-}
-
-DataTypeVector RealAndQuantizedTypes() {
-  return {DT_FLOAT,  DT_INT32,   DT_QINT8, DT_QUINT8,
-          DT_QINT16, DT_QUINT16, DT_QINT32};
-}
-
-#endif  // defined(IS_MOBILE_PLATFORM)
-
-// TODO(jeff): Maybe unify this with Tensor::CanUseDMA, or the underlying
-// is_simple<T> in tensor.cc (and possible choose a more general name?)
-bool DataTypeCanUseMemcpy(DataType dt) {
+bool DataTypeAlwaysOnHost(DataType dt) {
+  // Includes DT_STRING and DT_RESOURCE.
   switch (dt) {
-    case DT_FLOAT:
-    case DT_DOUBLE:
-    case DT_INT32:
-    case DT_UINT32:
-    case DT_UINT8:
-    case DT_UINT16:
-    case DT_INT16:
-    case DT_INT8:
-    case DT_COMPLEX64:
-    case DT_COMPLEX128:
-    case DT_INT64:
-    case DT_UINT64:
-    case DT_BOOL:
-    case DT_QINT8:
-    case DT_QUINT8:
-    case DT_QINT16:
-    case DT_QUINT16:
-    case DT_QINT32:
-    case DT_BFLOAT16:
-    case DT_HALF:
-      return true;
-    default:
-      return false;
-  }
-}
-
-bool DataTypeIsQuantized(DataType dt) {
-  switch (dt) {
-    case DT_QINT8:
-    case DT_QUINT8:
-    case DT_QINT16:
-    case DT_QUINT16:
-    case DT_QINT32:
-      return true;
-    default:
-      return false;
-  }
-}
-
-bool DataTypeIsInteger(DataType dt) {
-  switch (dt) {
-    case DT_INT8:
-    case DT_UINT8:
-    case DT_INT16:
-    case DT_UINT16:
-    case DT_INT32:
-    case DT_UINT32:
-    case DT_INT64:
-    case DT_UINT64:
-      return true;
-    default:
-      return false;
-  }
-}
-
-bool DataTypeIsUnsigned(DataType dt) {
-  switch (dt) {
-    case DT_UINT8:
-    case DT_UINT16:
-    case DT_UINT32:
-    case DT_UINT64:
+    case DT_STRING:
+    case DT_STRING_REF:
+    case DT_RESOURCE:
       return true;
     default:
       return false;
diff --git a/tensorflow/core/framework/types.h b/tensorflow/core/framework/types.h
index dc53ed41780d90448872b1bd98e97f5e16d49592..ded6aa09918f873b975f537fa33dcd55902090fe 100644
--- a/tensorflow/core/framework/types.h
+++ b/tensorflow/core/framework/types.h
@@ -112,10 +112,127 @@ inline string DataTypeVectorString(const DataTypeVector& dtypes) {
   return DataTypeSliceString(dtypes);
 }
 
+// DataTypeSet represents a set of DataType values as a simple and efficient
+// bit mask.  Note that DataTypeSet cannot represent all DataType values; it
+// cannot represent any of the DT_*_REF values.
+class DataTypeSet {
+ private:
+  const uint32 mask_;
+
+  static constexpr uint32 kNumBits = 32;
+
+ public:
+  constexpr DataTypeSet(const DataTypeSet& other) : mask_(other.mask_) {}
+  explicit constexpr DataTypeSet(uint32 mask) : mask_(mask) {}
+
+  constexpr bool Contains(DataType dt) const {
+    return (static_cast<uint32>(dt) < kNumBits) &&
+           ((mask_ >> static_cast<uint32>(dt)) & 1u) != 0u;
+  }
+
+  class Iterator {
+    const DataTypeSet& set_;
+    uint32 pos_;
+
+   public:
+    Iterator(const DataTypeSet& set, uint32 pos) : set_(set), pos_(pos) {
+      DCHECK_LE(pos, kNumBits);
+    }
+    DataType operator*() const { return static_cast<DataType>(pos_); }
+    Iterator& operator++() {
+      ++pos_;
+      DCHECK_LE(pos_, kNumBits);
+      if (pos_ < kNumBits) {
+        uint32 remaining_mask = set_.mask_ >> pos_;
+        if (remaining_mask != 0u) {
+          pos_ += ctz_uint32(remaining_mask);
+        }
+      }
+      DCHECK_LE(pos_, kNumBits);
+      return *this;
+    }
+    bool operator==(const Iterator& other) const { return pos_ == other.pos_; }
+    bool operator!=(const Iterator& other) const { return !(*this == other); }
+    size_t operator-(const Iterator& other) const {
+      return this->pos_ - other.pos_;
+    }
+  };
+
+  static uint32 ctz_uint32(uint32 x) {
+    DCHECK_NE(x, 0u);
+#ifdef __GNUC__
+    return __builtin_ctz(x);
+#else
+    uint32 n = 0u;
+    while ((x & 1u) == 0u) {
+      x >>= 1;
+      ++n;
+    }
+    return n;
+#endif
+  }
+
+  static uint32 clz_uint32(uint32 x) {
+    DCHECK_NE(x, 0u);
+#ifdef __GNUC__
+    return __builtin_clz(x);
+#else
+    uint32 n = 0u;
+    while ((x >> (kNumBits - 1u)) == 0u) {
+      x <<= 1;
+      ++n;
+    }
+    return n;
+#endif
+  }
+
+  Iterator begin() const {
+    // The begin position is the index of the first bit set to 1 in the entire
+    // bit mask. If there are no bits set to 1, then the index is 0.
+    if (mask_ != 0) {
+      return Iterator(*this, ctz_uint32(mask_));
+    }
+    // The set is empty.
+    return Iterator(*this, 0);
+  }
+
+  Iterator end() const {
+    // The end position is the index of the highest bit that is set, plus 1.
+    // If there are no bits set to 1, then the index is 0.
+    if (mask_ != 0) {
+      return Iterator(*this, kNumBits - clz_uint32(mask_));
+    }
+    // The set is empty.
+    return Iterator(*this, 0);
+  }
+
+  size_t size() const {
+#if defined(__GNUC__)
+    return __builtin_popcount(mask_);
+#else
+    size_t n = 0;
+    uint32 x = mask_;
+    while (x > 0) {
+      n += x & 1u;
+      x >>= 1;
+    }
+    return n;
+#endif
+  }
+
+  constexpr DataTypeSet operator|(const DataTypeSet& other) const {
+    return DataTypeSet(mask_ | other.mask_);
+  }
+};
+
 // If "sp" names a valid type, store it in "*dt" and return true.  Otherwise,
 // return false.
 bool DataTypeFromString(StringPiece sp, DataType* dt);
 
+constexpr inline DataTypeSet ToSet(DataType dt) {
+  return DataTypeSet(1u << static_cast<uint32>(dt));
+}
+
 // DT_FLOAT + kDataTypeRefOffset == DT_FLOAT_REF, etc.
 enum { kDataTypeRefOffset = 100 };
 inline bool IsRefType(DataType dtype) {
@@ -139,17 +256,94 @@ inline bool TypesCompatible(DataType expected, DataType actual) {
 }
 
 // Does not include _ref types.
-DataTypeVector AllTypes();
+constexpr DataTypeSet kAllTypes =
+    ToSet(DT_FLOAT) | ToSet(DT_DOUBLE) | ToSet(DT_INT32) | ToSet(DT_UINT8) |
+    ToSet(DT_INT16) | ToSet(DT_UINT16) | ToSet(DT_INT8) | ToSet(DT_STRING) |
+    ToSet(DT_COMPLEX64) | ToSet(DT_COMPLEX128) | ToSet(DT_INT64) |
+    ToSet(DT_BOOL) | ToSet(DT_QINT8) | ToSet(DT_QUINT8) | ToSet(DT_QINT16) |
+    ToSet(DT_QUINT16) | ToSet(DT_QINT32) | ToSet(DT_HALF) | ToSet(DT_RESOURCE) |
+    ToSet(DT_VARIANT) | ToSet(DT_UINT32) | ToSet(DT_UINT64) |
+    ToSet(DT_BFLOAT16);
+inline const DataTypeSet& AllTypes() { return kAllTypes; }
+
+#if !defined(IS_MOBILE_PLATFORM) || defined(SUPPORT_SELECTIVE_REGISTRATION)
+
+// Types that support '<' and '>'.
+constexpr DataTypeSet kRealNumberTypes =
+    ToSet(DT_FLOAT) | ToSet(DT_DOUBLE) | ToSet(DT_INT32) | ToSet(DT_INT64) |
+    ToSet(DT_UINT8) | ToSet(DT_INT16) | ToSet(DT_INT8) | ToSet(DT_UINT16) |
+    ToSet(DT_HALF) | ToSet(DT_UINT32) | ToSet(DT_UINT64) | ToSet(DT_BFLOAT16);
+inline const DataTypeSet RealNumberTypes() { return kRealNumberTypes; }
 
 // Return the list of all numeric types.
+// Includes complex and quantized types.
 // NOTE: On Android, we only include the float and int32 types for now.
-DataTypeVector RealNumberTypes();  // Types that support '<' and '>'.
-DataTypeVector NumberTypes();      // Includes complex and quantized types.
+const DataTypeSet kNumberTypes =
+    ToSet(DT_FLOAT) | ToSet(DT_DOUBLE) | ToSet(DT_INT64) | ToSet(DT_INT32) |
+    ToSet(DT_UINT8) | ToSet(DT_UINT16) | ToSet(DT_INT16) | ToSet(DT_INT8) |
+    ToSet(DT_COMPLEX64) | ToSet(DT_COMPLEX128) | ToSet(DT_QINT8) |
+    ToSet(DT_QUINT8) | ToSet(DT_QINT32) | ToSet(DT_HALF) | ToSet(DT_UINT32) |
+    ToSet(DT_UINT64) | ToSet(DT_BFLOAT16);
+inline const DataTypeSet& NumberTypes() { return kNumberTypes; }
+
+constexpr DataTypeSet kQuantizedTypes = ToSet(DT_QINT8) | ToSet(DT_QUINT8) |
+                                        ToSet(DT_QINT16) | ToSet(DT_QUINT16) |
+                                        ToSet(DT_QINT32);
+inline const DataTypeSet& QuantizedTypes() { return kQuantizedTypes; }
+
+// Types that support '<' and '>', including quantized types.
+const DataTypeSet kRealAndQuantizedTypes =
+    ToSet(DT_FLOAT) | ToSet(DT_DOUBLE) | ToSet(DT_INT32) | ToSet(DT_INT64) |
+    ToSet(DT_UINT8) | ToSet(DT_UINT16) | ToSet(DT_UINT16) | ToSet(DT_INT8) |
+    ToSet(DT_QINT8) | ToSet(DT_QUINT8) | ToSet(DT_QINT16) | ToSet(DT_QUINT16) |
+    ToSet(DT_QINT32) | ToSet(DT_HALF) | ToSet(DT_BFLOAT16);
+inline const DataTypeSet& RealAndQuantizedTypes() {
+  return kRealAndQuantizedTypes;
+}
+
+#elif defined(__ANDROID_TYPES_FULL__)
+
+constexpr DataTypeSet kRealNumberTypes =
+    ToSet(DT_FLOAT) | ToSet(DT_INT32) | ToSet(DT_INT64) | ToSet(DT_HALF);
+inline DataTypeSet RealNumberTypes() { return kRealNumberTypes; }
+
+constexpr DataTypeSet kNumberTypes =
+    ToSet(DT_FLOAT) | ToSet(DT_INT32) | ToSet(DT_INT64) | ToSet(DT_QINT8) |
+    ToSet(DT_QUINT8) | ToSet(DT_QINT32) | ToSet(DT_HALF);
+inline DataTypeSet NumberTypes() { return kNumberTypes; }
+
+constexpr DataTypeSet kQuantizedTypes = ToSet(DT_QINT8) | ToSet(DT_QUINT8) |
+                                        ToSet(DT_QINT16) | ToSet(DT_QUINT16) |
+                                        ToSet(DT_QINT32);
+inline DataTypeSet QuantizedTypes() { return kQuantizedTypes; }
+
+constexpr DataTypeSet kRealAndQuantizedTypes =
+    ToSet(DT_FLOAT) | ToSet(DT_INT32) | ToSet(DT_INT64) | ToSet(DT_QINT8) |
+    ToSet(DT_QUINT8) | ToSet(DT_QINT16) | ToSet(DT_QUINT16) | ToSet(DT_QINT32) |
+    ToSet(DT_HALF);
+inline DataTypeSet RealAndQuantizedTypes() { return kRealAndQuantizedTypes; }
+
+#else  // defined(IS_MOBILE_PLATFORM) && !defined(__ANDROID_TYPES_FULL__)
+
+constexpr DataTypeSet kRealNumberTypes = ToSet(DT_FLOAT) | ToSet(DT_INT32);
+inline DataTypeSet RealNumberTypes() { return kRealNumberTypes; }
+
+constexpr DataTypeSet kNumberTypes = ToSet(DT_FLOAT) | ToSet(DT_INT32) |
+                                     ToSet(DT_QINT8) | ToSet(DT_QUINT8) |
+                                     ToSet(DT_QINT32);
+inline DataTypeSet NumberTypes() { return kNumberTypes; }
+
+constexpr DataTypeSet kQuantizedTypes = ToSet(DT_QINT8) | ToSet(DT_QUINT8) |
+                                        ToSet(DT_QINT16) | ToSet(DT_QUINT16) |
+                                        ToSet(DT_QINT32);
+inline DataTypeSet QuantizedTypes() { return kQuantizedTypes; }
+
+constexpr DataTypeSet kRealAndQuantizedTypes =
+    ToSet(DT_FLOAT) | ToSet(DT_INT32) | ToSet(DT_QINT8) | ToSet(DT_QUINT8) |
+    ToSet(DT_QINT16) | ToSet(DT_QUINT16) | ToSet(DT_QINT32);
+inline DataTypeSet RealAndQuantizedTypes() { return kRealAndQuantizedTypes; }
 
-DataTypeVector QuantizedTypes();
-DataTypeVector RealAndQuantizedTypes();  // Types that support '<' and
-                                         // '>', including quantized
-                                         // types
+#endif  // defined(IS_MOBILE_PLATFORM)
 
 // Validates type T for whether it is a supported DataType.
 template <class T>
@@ -220,19 +414,67 @@ struct IsValidDataType {
 static_assert(IsValidDataType<int64>::value, "Incorrect impl for int64");
 static_assert(IsValidDataType<int32>::value, "Incorrect impl for int32");
 
-bool DataTypeCanUseMemcpy(DataType dt);
+// TODO(jeff): Maybe unify this with Tensor::CanUseDMA, or the underlying
+// is_simple<T> in tensor.cc (and possible choose a more general name?)
+constexpr DataTypeSet kDataTypesCanUseMemcpy =
+    ToSet(DT_FLOAT) | ToSet(DT_DOUBLE) | ToSet(DT_INT32) | ToSet(DT_UINT32) |
+    ToSet(DT_UINT8) | ToSet(DT_UINT16) | ToSet(DT_INT16) | ToSet(DT_INT8) |
+    ToSet(DT_COMPLEX64) | ToSet(DT_COMPLEX128) | ToSet(DT_INT64) |
+    ToSet(DT_UINT64) | ToSet(DT_BOOL) | ToSet(DT_QINT8) | ToSet(DT_QUINT8) |
+    ToSet(DT_QINT16) | ToSet(DT_QUINT16) | ToSet(DT_QINT32) |
+    ToSet(DT_BFLOAT16) | ToSet(DT_HALF);
+inline bool DataTypeCanUseMemcpy(DataType dt) {
+  return kDataTypesCanUseMemcpy.Contains(dt);
+}
+
+// Returns true iff 'dt' is a real, non-quantized floating point type.
+constexpr DataTypeSet kDataTypeIsFloating =
+    ToSet(DT_HALF) | ToSet(DT_BFLOAT16) | ToSet(DT_FLOAT) | ToSet(DT_DOUBLE);
+inline bool DataTypeIsFloating(DataType dt) {
+  return kDataTypeIsFloating.Contains(dt);
+}
 
-bool DataTypeIsQuantized(DataType dt);
+// Returns true iff 'dt' is a complex type.
+constexpr DataTypeSet kDataTypeIsComplex =
+    ToSet(DT_COMPLEX64) | ToSet(DT_COMPLEX128);
+inline bool DataTypeIsComplex(DataType dt) {
+  return kDataTypeIsComplex.Contains(dt);
+}
+
+inline bool DataTypeIsQuantized(DataType dt) {
+  return kQuantizedTypes.Contains(dt);
+}
 
 // Is the dtype nonquantized integral?
-bool DataTypeIsInteger(DataType dt);
+constexpr DataTypeSet kDataTypeIsInteger =
+    ToSet(DT_INT8) | ToSet(DT_UINT8) | ToSet(DT_INT16) | ToSet(DT_UINT16) |
+    ToSet(DT_INT32) | ToSet(DT_UINT32) | ToSet(DT_INT64) | ToSet(DT_UINT64);
+inline bool DataTypeIsInteger(DataType dt) {
+  return kDataTypeIsInteger.Contains(dt);
+}
+
+// Is the dtype a signed integral type?
+constexpr DataTypeSet kDataTypeIsSigned =
+    ToSet(DT_INT8) | ToSet(DT_INT16) | ToSet(DT_INT32) | ToSet(DT_INT64);
+inline bool DataTypeIsSigned(DataType dt) {
+  return kDataTypeIsSigned.Contains(dt);
+}
 
 // Is the dtype an unsigned integral type?
-bool DataTypeIsUnsigned(DataType dt);
+constexpr DataTypeSet kDataTypeIsUnsigned =
+    ToSet(DT_UINT8) | ToSet(DT_UINT16) | ToSet(DT_UINT32) | ToSet(DT_UINT64);
+inline bool DataTypeIsUnsigned(DataType dt) {
+  return kDataTypeIsUnsigned.Contains(dt);
+}
 
 // Returns a 0 on failure
 int DataTypeSize(DataType dt);
 
+// Types that always sit on host: DT_STRING, DT_STRING_REF, DT_RESOURCE.
+// For DT_RESOURCE, the handle always sits on host (even if the underlying
+// object has device-allocated resources).
+bool DataTypeAlwaysOnHost(DataType dt);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_FRAMEWORK_TYPES_H_
diff --git a/tensorflow/core/framework/types_test.cc b/tensorflow/core/framework/types_test.cc
index bc57740469f96fee28de1cea8920cc0431511db1..60f2b4135a68c4eed618e3efb07758fbab85fa07 100644
--- a/tensorflow/core/framework/types_test.cc
+++ b/tensorflow/core/framework/types_test.cc
@@ -70,8 +70,8 @@ TEST(TypesTest, kDataTypeRefOffset) {
       << "Extra reference enum "
       << enum_descriptor->FindValueByNumber(e_ref)->name()
       << " without corresponding base enum with value " << e;
-  ASSERT_LT(DataType_MAX, e_ref) << "Gap in reference types, missing value for "
-                                 << e_ref;
+  ASSERT_LT(DataType_MAX, e_ref)
+      << "Gap in reference types, missing value for " << e_ref;
 
   // Make sure there are no enums defined after the last regular type before
   // the first reference type.
@@ -130,6 +130,13 @@ TEST(TypesTest, QuantizedTypes) {
   EXPECT_FALSE(DataTypeIsQuantized(DT_BFLOAT16));
 }
 
+TEST(TypesTest, ComplexTypes) {
+  EXPECT_TRUE(DataTypeIsComplex(DT_COMPLEX64));
+  EXPECT_TRUE(DataTypeIsComplex(DT_COMPLEX128));
+  EXPECT_FALSE(DataTypeIsComplex(DT_FLOAT));
+  EXPECT_FALSE(DataTypeIsComplex(DT_DOUBLE));
+}
+
 TEST(TypesTest, IntegerTypes) {
   for (auto dt : AllTypes()) {
     const string name = DataTypeString(dt);
diff --git a/tensorflow/core/framework/variant_encode_decode.h b/tensorflow/core/framework/variant_encode_decode.h
index 09ebf6257bdffc314e09a124db70e33801ae338d..5a84f9d94385a7048a0f4adfe78e1805b367f02d 100644
--- a/tensorflow/core/framework/variant_encode_decode.h
+++ b/tensorflow/core/framework/variant_encode_decode.h
@@ -233,6 +233,7 @@ void EncodeVariant(const T& value, string* buf) {
   VariantTensorData data;
   EncodeVariantImpl(value, TypeResolver<T>(), &data);
   data.set_type_name(TypeNameVariant(value));
+  DCHECK(buf != nullptr);
   data.SerializeToString(buf);
 }
 
diff --git a/tensorflow/core/framework/variant_op_registry.cc b/tensorflow/core/framework/variant_op_registry.cc
index 395329da3bee01cf73c69d52b150b88f34d1b1ff..ee07db1aee15e578c4bcbac22cecf6e75e95b6e2 100644
--- a/tensorflow/core/framework/variant_op_registry.cc
+++ b/tensorflow/core/framework/variant_op_registry.cc
@@ -182,7 +182,7 @@ Status VariantDeviceCopy(
 // Special casing UnaryOpFn per op and per device.
 UnaryVariantOpRegistry::VariantUnaryOpFn* UnaryVariantOpRegistry::GetUnaryOpFn(
     VariantUnaryOp op, StringPiece device, StringPiece type_name) {
-  auto found = unary_op_fns.find(std::make_tuple(op, device, type_name));
+  auto found = unary_op_fns.find({op, device, type_name});
   if (found == unary_op_fns.end()) return nullptr;
   return &found->second;
 }
@@ -195,12 +195,10 @@ void UnaryVariantOpRegistry::RegisterUnaryOpFn(
   CHECK_EQ(existing, nullptr)
       << "Unary VariantUnaryOpFn for type_name: " << type_name
       << " already registered for device type: " << device;
-  unary_op_fns.insert(
-      std::pair<std::tuple<VariantUnaryOp, StringPiece, StringPiece>,
-                VariantUnaryOpFn>(
-          std::make_tuple(op, GetPersistentStringPiece(device),
-                          GetPersistentStringPiece(type_name)),
-          unary_op_fn));
+  unary_op_fns.insert(std::pair<FuncTuple<VariantUnaryOp>, VariantUnaryOpFn>(
+      {op, GetPersistentStringPiece(device),
+       GetPersistentStringPiece(type_name)},
+      unary_op_fn));
 }
 
 namespace {
@@ -229,7 +227,7 @@ REGISTER_VARIANT_ZEROS_LIKE_TYPE(bool);
 UnaryVariantOpRegistry::VariantBinaryOpFn*
 UnaryVariantOpRegistry::GetBinaryOpFn(VariantBinaryOp op, StringPiece device,
                                       StringPiece type_name) {
-  auto found = binary_op_fns.find(std::make_tuple(op, device, type_name));
+  auto found = binary_op_fns.find({op, device, type_name});
   if (found == binary_op_fns.end()) return nullptr;
   return &found->second;
 }
@@ -242,12 +240,10 @@ void UnaryVariantOpRegistry::RegisterBinaryOpFn(
   CHECK_EQ(existing, nullptr)
       << "Unary VariantBinaryOpFn for type_name: " << type_name
       << " already registered for device type: " << device;
-  binary_op_fns.insert(
-      std::pair<std::tuple<VariantBinaryOp, StringPiece, StringPiece>,
-                VariantBinaryOpFn>(
-          std::make_tuple(op, GetPersistentStringPiece(device),
-                          GetPersistentStringPiece(type_name)),
-          add_fn));
+  binary_op_fns.insert(std::pair<FuncTuple<VariantBinaryOp>, VariantBinaryOpFn>(
+      {op, GetPersistentStringPiece(device),
+       GetPersistentStringPiece(type_name)},
+      add_fn));
 }
 
 namespace {
diff --git a/tensorflow/core/framework/variant_op_registry.h b/tensorflow/core/framework/variant_op_registry.h
index 13f6908cae1ed1b1964bf827dce0fcb2bee4e6d1..c9e8dd2217e0dc0225fa38d0739d1551e0ba2433 100644
--- a/tensorflow/core/framework/variant_op_registry.h
+++ b/tensorflow/core/framework/variant_op_registry.h
@@ -166,6 +166,21 @@ class UnaryVariantOpRegistry {
       device_copy_fns;
 
   // Map std::tuple<Op, device, type_name> to function.
+
+  // this breaks by falling victim to "too perfect forwarding"
+  // see https://stackoverflow.com/questions/44475317/variadic-template-issue
+  // and references therein
+  template <typename Op>
+  struct FuncTuple {
+    FuncTuple(const Op& op, const StringPiece& dev, const StringPiece& tname)
+        : op_type_(op), device_(dev), typename_(tname){};
+    Op op_type_;
+    StringPiece device_, typename_;
+  };
+  // friend declaration for operator==
+  // needed for clang
+  template <typename Op>
+  friend bool operator==(const FuncTuple<Op>& l, const FuncTuple<Op>& r);
   struct TupleHash {
     template <typename Op>
     std::size_t operator()(
@@ -176,18 +191,25 @@ class UnaryVariantOpRegistry {
       ret = Hash64Combine(ret, sp_hasher_(std::get<2>(x)));
       return ret;
     }
+
+    template <typename Op>
+    std::size_t operator()(const FuncTuple<Op>& x) const {
+      // The hash of an enum is just its value as a std::size_t.
+      std::size_t ret = static_cast<std::size_t>(x.op_type_);
+      ret = Hash64Combine(ret, sp_hasher_(x.device_));
+      ret = Hash64Combine(ret, sp_hasher_(x.typename_));
+      return ret;
+    }
     StringPieceHasher sp_hasher_;
   };
-  std::unordered_map<std::tuple<VariantUnaryOp, StringPiece, StringPiece>,
-                     VariantUnaryOpFn, TupleHash>
+  std::unordered_map<FuncTuple<VariantUnaryOp>, VariantUnaryOpFn, TupleHash>
       unary_op_fns;
-  std::unordered_map<std::tuple<VariantBinaryOp, StringPiece, StringPiece>,
-                     VariantBinaryOpFn, TupleHash>
+  std::unordered_map<FuncTuple<VariantBinaryOp>, VariantBinaryOpFn, TupleHash>
       binary_op_fns;
 
   // Find or insert a string into a persistent string storage
-  // container; return the StringPiece pointing to the permanent
-  // string location.
+  // container; return the StringPiece pointing to the permanent string
+  // location.
   static StringPiece GetPersistentStringPiece(const string& str) {
     const auto string_storage = PersistentStringStorage();
     auto found = string_storage->find(str);
@@ -199,7 +221,12 @@ class UnaryVariantOpRegistry {
     }
   }
 };
-
+template <typename Op>
+inline bool operator==(const UnaryVariantOpRegistry::FuncTuple<Op>& lhs,
+                       const UnaryVariantOpRegistry::FuncTuple<Op>& rhs) {
+  return (lhs.op_type_ == rhs.op_type_) && (lhs.device_ == rhs.device_) &&
+         (lhs.typename_ == rhs.typename_);
+}
 // Gets a TensorShape from a Tensor containing a scalar Variant.
 // Returns an Internal error if the Variant does not have a registered shape
 // function, or if it's a serialized Variant that cannot be decoded.
@@ -283,8 +310,8 @@ Status BinaryOpVariants(OpKernelContext* ctx, VariantBinaryOp op,
     return errors::Internal(
         "No unary variant binary_op function found for binary variant op "
         "enum: ",
-        op, " Variant type_name: '", a.TypeName(),
-        "' for device type: ", device);
+        op, " Variant type_name: '", a.TypeName(), "' for device type: ",
+        device);
   }
   return (*binary_op_fn)(ctx, a, b, out);
 }
diff --git a/tensorflow/core/framework/variant_tensor_data.cc b/tensorflow/core/framework/variant_tensor_data.cc
index 82479193d2a3464897b0fff6c8feaf6c487a23c4..99712dc114b248ba47ee7427c83cb84d5678e244 100644
--- a/tensorflow/core/framework/variant_tensor_data.cc
+++ b/tensorflow/core/framework/variant_tensor_data.cc
@@ -34,7 +34,9 @@ const Tensor& VariantTensorData::tensors(int index) const {
   return tensors_[index];
 }
 
-std::vector<Tensor> VariantTensorData::tensors() { return tensors_; }
+const std::vector<Tensor>& VariantTensorData::tensors() const {
+  return tensors_;
+}
 
 Tensor* VariantTensorData::add_tensors() {
   tensors_.emplace_back();
diff --git a/tensorflow/core/framework/variant_tensor_data.h b/tensorflow/core/framework/variant_tensor_data.h
index 6e04879494af447e620f6737bc749f68d9e1394d..1d87bc341a4bd268d1e461b3710d006cf99cc685 100644
--- a/tensorflow/core/framework/variant_tensor_data.h
+++ b/tensorflow/core/framework/variant_tensor_data.h
@@ -63,7 +63,7 @@ class VariantTensorData {
   // Tensors contained within objects being serialized.
   int tensors_size() const;
   const Tensor& tensors(int index) const;
-  std::vector<Tensor> tensors();
+  const std::vector<Tensor>& tensors() const;
   Tensor* add_tensors();
 
   // Conversion to and from VariantTensorDataProto
diff --git a/tensorflow/core/graph/algorithm.cc b/tensorflow/core/graph/algorithm.cc
index 6ef51aa7dfcd48f840f80040f068a766a33ff5bf..4652fbe40691a01e0567c7df2fba0ca2ea482fe1 100644
--- a/tensorflow/core/graph/algorithm.cc
+++ b/tensorflow/core/graph/algorithm.cc
@@ -83,13 +83,16 @@ void ReverseDFS(const Graph& g, const std::function<void(Node*)>& enter,
   ReverseDFSFrom(g, {g.sink_node()}, enter, leave, stable_comparator);
 }
 
-void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
-                    const std::function<void(Node*)>& enter,
-                    const std::function<void(Node*)>& leave,
-                    const NodeComparator& stable_comparator) {
+namespace {
+
+template <typename T>
+void ReverseDFSFromHelper(const Graph& g, gtl::ArraySlice<T> start,
+                          const std::function<void(T)>& enter,
+                          const std::function<void(T)>& leave,
+                          const NodeComparator& stable_comparator) {
   // Stack of work to do.
   struct Work {
-    Node* node;
+    T node;
     bool leave;  // Are we entering or leaving n?
   };
   std::vector<Work> stack(start.size());
@@ -102,7 +105,7 @@ void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
     Work w = stack.back();
     stack.pop_back();
 
-    Node* n = w.node;
+    T n = w.node;
     if (w.leave) {
       leave(n);
       continue;
@@ -117,7 +120,7 @@ void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
 
     gtl::iterator_range<NeighborIter> nodes = n->in_nodes();
 
-    auto add_work = [&visited, &stack](Node* out) {
+    auto add_work = [&visited, &stack](T out) {
       if (!visited[out->id()]) {
         // Note; we must not mark as visited until we actually process it.
         stack.push_back(Work{out, false});
@@ -125,22 +128,38 @@ void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
     };
 
     if (stable_comparator) {
-      std::vector<Node*> nodes_sorted;
-      for (Node* in : nodes) {
+      std::vector<T> nodes_sorted;
+      for (T in : nodes) {
         nodes_sorted.emplace_back(in);
       }
       std::sort(nodes_sorted.begin(), nodes_sorted.end(), stable_comparator);
-      for (Node* in : nodes_sorted) {
+      for (T in : nodes_sorted) {
         add_work(in);
       }
     } else {
-      for (Node* in : nodes) {
+      for (T in : nodes) {
         add_work(in);
       }
     }
   }
 }
 
+}  // namespace
+
+void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<const Node*> start,
+                    const std::function<void(const Node*)>& enter,
+                    const std::function<void(const Node*)>& leave,
+                    const NodeComparator& stable_comparator) {
+  ReverseDFSFromHelper(g, start, enter, leave, stable_comparator);
+}
+
+void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
+                    const std::function<void(Node*)>& enter,
+                    const std::function<void(Node*)>& leave,
+                    const NodeComparator& stable_comparator) {
+  ReverseDFSFromHelper(g, start, enter, leave, stable_comparator);
+}
+
 void GetPostOrder(const Graph& g, std::vector<Node*>* order,
                   const NodeComparator& stable_comparator) {
   order->clear();
diff --git a/tensorflow/core/graph/algorithm.h b/tensorflow/core/graph/algorithm.h
index 5bb6041d98b6aebd3036b68fffeed32afda85e50..ac4a099013b67e0d256a9310495e4b585eb40e0a 100644
--- a/tensorflow/core/graph/algorithm.h
+++ b/tensorflow/core/graph/algorithm.h
@@ -69,6 +69,10 @@ extern void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
                            const std::function<void(Node*)>& enter,
                            const std::function<void(Node*)>& leave,
                            const NodeComparator& stable_comparator = {});
+extern void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<const Node*> start,
+                           const std::function<void(const Node*)>& enter,
+                           const std::function<void(const Node*)>& leave,
+                           const NodeComparator& stable_comparator = {});
 
 // Stores in *order the post-order numbering of all nodes
 // in graph found via a depth first search starting at the source node.
diff --git a/tensorflow/core/graph/algorithm_test.cc b/tensorflow/core/graph/algorithm_test.cc
index 0cdcdb66856f0135720277bb7fab23dd24d3dde9..99ced0c0f5daa7c722aa4060e9a954855411010b 100644
--- a/tensorflow/core/graph/algorithm_test.cc
+++ b/tensorflow/core/graph/algorithm_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/graph_def_builder_util.h"
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -81,7 +82,7 @@ TEST(AlgorithmTest, ReversePostOrder) {
   BinaryOp("TestMul", w2, {input, 1}, b.opts().WithName("t3"));
 
   Graph g(OpRegistry::Global());
-  TF_ASSERT_OK(b.ToGraph(&g));
+  TF_ASSERT_OK(GraphDefBuilderToGraph(b, &g));
   std::vector<Node*> order;
 
   // Test reverse post order:
@@ -139,7 +140,7 @@ TEST(AlgorithmTest, ReversePostOrderStable) {
     BinaryOp("TestMul", w1, {input, 1}, b.opts().WithName("t3"));
 
     Graph g(OpRegistry::Global());
-    TF_ASSERT_OK(b.ToGraph(&g));
+    TF_ASSERT_OK(GraphDefBuilderToGraph(b, &g));
     std::vector<Node*> order;
 
     // Test reverse post order generates expected ordering.
diff --git a/tensorflow/core/graph/control_flow.h b/tensorflow/core/graph/control_flow.h
index 22dbb47010729d61547b33db3a6c8b0ad4fefdb4..372044f538f9428e1979ba80bbb18a9742fc014e 100644
--- a/tensorflow/core/graph/control_flow.h
+++ b/tensorflow/core/graph/control_flow.h
@@ -33,6 +33,7 @@ struct ControlFlowInfo {
 // Assign to each node the name of the frame and the level it belongs to.
 // We check the well-formedness of the graph: All inputs to a node must
 // come from the same frame and have the same "static" iteration level.
+// `info` is cleared and populated by this function.
 // NOTE(yuanbyu): For now, we require all sends/recvs have iteration level
 // 0. This essentially means there can't be multiple serial Nexts in
 // an iteration, which all sane front-ends should satisfy.
diff --git a/tensorflow/core/graph/costmodel.cc b/tensorflow/core/graph/costmodel.cc
index 3ed32068ae19b73f93b2b2bd12d77712a1273cfb..4f3a6ec38cb88213c7127df41823bc16e9834d09 100644
--- a/tensorflow/core/graph/costmodel.cc
+++ b/tensorflow/core/graph/costmodel.cc
@@ -57,10 +57,10 @@ void CostModel::MergeFromLocal(const Graph& g, const CostModel& cm) {
     const int local_id = cm.Id(n);
     const int global_id = Id(n);
     if (local_id < 0 || global_id < 0) continue;
-    Ensure(global_id);
+    int num_slots = cm.slot_bytes_[local_id].size();
+    Ensure(global_id, num_slots);
     count_[global_id] += cm.count_[local_id];
     time_[global_id] += cm.time_[local_id];
-    int num_slots = cm.slot_bytes_[local_id].size();
     if (num_slots > 0) {
       if (slot_bytes_[global_id].empty()) {
         slot_bytes_[global_id].resize(num_slots);
@@ -78,11 +78,11 @@ void CostModel::MergeFromGlobal(const CostModel& cm) {
   CHECK(is_global_);
   CHECK_EQ(true, cm.is_global());
   const int num_nodes = cm.count_.size();
-  Ensure(num_nodes);
-  for (int i = 0; i < num_nodes; ++i) {
+  for (int i = num_nodes - 1; i >= 0; --i) {
     count_[i] += cm.count_[i];
     time_[i] += cm.time_[i];
     int num_slots = cm.slot_bytes_[i].size();
+    Ensure(i, num_slots);
     if (num_slots > 0) {
       if (slot_bytes_[i].empty()) {
         slot_bytes_[i].resize(num_slots);
@@ -106,7 +106,7 @@ void CostModel::MergeFromStats(const NodeNameToCostIdMap& map,
       // copy/send/recv nodes, feed/fetch, etc.
       if (iter == map.end()) continue;
       int32 global_id = iter->second;
-      Ensure(global_id);
+      Ensure(global_id, ns.output_size());
       int64 elapsed_micros = ns.op_end_rel_micros() - ns.op_start_rel_micros();
       count_[global_id]++;
       time_[global_id] += elapsed_micros;
@@ -122,7 +122,7 @@ void CostModel::MergeFromStats(const NodeNameToCostIdMap& map,
   }
 }
 
-void CostModel::Ensure(int id) {
+void CostModel::Ensure(int id, int num_outputs) {
   if (slot_bytes_.size() <= static_cast<size_t>(id)) {
     slot_bytes_.resize(id + 1);
     count_.resize(id + 1);
@@ -131,25 +131,37 @@ void CostModel::Ensure(int id) {
     max_exec_time_.resize(id + 1);
     output_port_alloc_ids_.resize(id + 1);
   }
+  if (num_outputs > 0) {
+    auto perslot = &slot_bytes_[id];
+    auto output_port_alloc_ids = &output_port_alloc_ids_[id];
+    auto max_mem_usage = &max_mem_usage_[id];
+
+    CHECK_LE(perslot->size(), num_outputs);
+    DCHECK_EQ(output_port_alloc_ids->size(), perslot->size());
+    DCHECK_EQ(max_mem_usage->output_port_mem.size(), perslot->size());
+    DCHECK_EQ(max_mem_usage->output_port_shape.size(), perslot->size());
+    DCHECK_EQ(max_mem_usage->output_port_type.size(), perslot->size());
+
+    perslot->resize(num_outputs, Bytes(-1));
+    output_port_alloc_ids->resize(num_outputs, -1);
+    max_mem_usage->output_port_mem.resize(num_outputs, Bytes(-1));
+    max_mem_usage->output_port_shape.resize(num_outputs, unknown_shape_);
+    max_mem_usage->output_port_type.resize(num_outputs, DT_INVALID);
+  }
 }
 
 void CostModel::SetNumOutputs(const Node* node, int num_outputs) {
   const int id = Id(node);
   if (id < 0) return;
-  Ensure(id);
+  // Do not resize the number of slots before checking its existing number of
+  // slots.
+  Ensure(id, 0);
   auto perslot = &slot_bytes_[id];
-  auto max_mem_usage = &max_mem_usage_[id];
-  auto output_port_alloc_ids = &output_port_alloc_ids_[id];
   if (!perslot->empty()) {
-    CHECK_EQ(num_outputs, perslot->size()) << "Cannot resize slot_bytes, node="
-                                           << node->name();
-  } else {
-    perslot->resize(num_outputs, Bytes(-1));
-    output_port_alloc_ids->resize(num_outputs, -1);
-    max_mem_usage->output_port_mem.resize(num_outputs, Bytes(-1));
-    max_mem_usage->output_port_shape.resize(num_outputs, unknown_shape_);
-    max_mem_usage->output_port_type.resize(num_outputs, DT_INVALID);
+    CHECK_EQ(num_outputs, perslot->size())
+        << "Cannot resize slot_bytes, node=" << node->name();
   }
+  Ensure(id, num_outputs);
 }
 
 void CostModel::RecordCount(const Node* node, int count) {
@@ -198,7 +210,7 @@ void CostModel::RecordTime(const Node* node, Microseconds time) {
   const int id = Id(node);
   if (id < 0) return;
   DCHECK(node->IsOp()) << node->DebugString();
-  Ensure(id);
+  Ensure(id, node->num_outputs());
   time_[id] += time;
 }
 
@@ -240,7 +252,13 @@ void CostModel::RecordMaxMemorySize(const Node* node, int output_slot,
                                     const DataType& dtype) {
   const int id = Id(node);
   if (id < 0) return;
-  Ensure(id);
+  if (output_slot >= node->num_outputs()) {
+    LOG(ERROR) << "Unexpected output slot for node " << node->DebugString()
+               << ". Got " << output_slot << " but its num_outputs is "
+               << node->num_outputs();
+    return;
+  }
+  Ensure(id, node->num_outputs());
   auto& current_max = max_mem_usage_[id].output_port_mem[output_slot];
   // If the memory allocator doesn't track memory usage, let's infer a lower
   // bound from the tensor shape and its data type.
@@ -291,59 +309,24 @@ Bytes CostModel::TempMemorySize(const Node* node) const {
   return max_mem_usage_[id].temp_memory_size;
 }
 
-Bytes CostModel::HostTempMemorySize(const Node* node) const {
-  const int id = Id(node);
-  if (id < 0) {
-    return Bytes(0);
-  }
-  return max_mem_usage_[id].host_temp_memory_size;
-}
-
-Bytes CostModel::DeviceTempMemorySize(const Node* node) const {
-  const int id = Id(node);
-  if (id < 0) {
-    return Bytes(0);
-  }
-  return max_mem_usage_[id].device_temp_memory_size;
-}
-
-Bytes CostModel::HostPersistentMemorySize(const Node* node) const {
+Bytes CostModel::PersistentMemorySize(const Node* node) const {
   const int id = Id(node);
   if (id < 0) {
     return Bytes(0);
   }
-  return max_mem_usage_[id].host_persistent_memory_size;
-}
-
-Bytes CostModel::DevicePersistentMemorySize(const Node* node) const {
-  const int id = Id(node);
-  if (id < 0) {
-    return Bytes(0);
-  }
-  return max_mem_usage_[id].device_persistent_memory_size;
+  return max_mem_usage_[id].persistent_memory_size;
 }
 
 void CostModel::RecordMemoryStats(const Node* node,
                                   const MemoryStats& memory_stats) {
   const int id = Id(node);
   if (id < 0) return;
-  max_mem_usage_[id].host_temp_memory_size =
-      memory_stats.host_temp_memory_size();
-  max_mem_usage_[id].device_temp_memory_size =
-      memory_stats.device_temp_memory_size();
-  max_mem_usage_[id].host_persistent_memory_size =
-      memory_stats.host_persistent_memory_size();
-  max_mem_usage_[id].device_persistent_memory_size =
-      memory_stats.device_persistent_memory_size();
-  for (int64 alloc_id : memory_stats.host_persistent_tensor_alloc_ids()) {
-    if (alloc_id > 0) {
-      host_persistent_alloc_ids_.insert(alloc_id);
-    }
-  }
-  for (int64 alloc_id : memory_stats.device_persistent_tensor_alloc_ids()) {
+  max_mem_usage_[id].temp_memory_size = memory_stats.temp_memory_size();
+  max_mem_usage_[id].persistent_memory_size =
+      memory_stats.persistent_memory_size();
+  for (int64 alloc_id : memory_stats.persistent_tensor_alloc_ids()) {
     if (alloc_id > 0) {
-      persistent_alloc_ids_by_devices_[node->assigned_device_name()].insert(
-          alloc_id);
+      persistent_alloc_ids_.insert(alloc_id);
     }
   }
 }
@@ -351,7 +334,7 @@ void CostModel::RecordMemoryStats(const Node* node,
 void CostModel::RecordMaxExecutionTime(const Node* node, Microseconds time) {
   const int id = Id(node);
   if (id < 0) return;
-  Ensure(id);
+  Ensure(id, node->num_outputs());
   max_exec_time_[id] = std::max(max_exec_time_[id], time);
 }
 
@@ -367,7 +350,7 @@ void CostModel::RecordAllocationId(const Node* node, int output_slot,
                                    int64 alloc_id) {
   const int id = Id(node);
   if (id < 0) return;
-  Ensure(id);
+  Ensure(id, node->num_outputs());
   output_port_alloc_ids_[id][output_slot] = alloc_id;
 }
 
@@ -381,7 +364,7 @@ int64 CostModel::AllocationId(const Node* node, int slot) const {
 }
 
 bool CostModel::IsPersistentTensor(const Node* node, int64 alloc_id) const {
-  if (host_persistent_alloc_ids_.count(alloc_id) > 0) {
+  if (persistent_alloc_ids_.count(alloc_id) > 0) {
     return true;
   }
   if (persistent_alloc_ids_by_devices_.find(node->assigned_device_name()) ==
@@ -548,11 +531,8 @@ void CostModel::AddToCostGraphDef(const Graph* graph,
       cnode->add_control_input(Id(e->src()));
     }
 
-    cnode->set_host_temp_memory_size(HostTempMemorySize(n).value());
-    cnode->set_device_temp_memory_size(DeviceTempMemorySize(n).value());
-    cnode->set_host_persistent_memory_size(HostPersistentMemorySize(n).value());
-    cnode->set_device_persistent_memory_size(
-        DevicePersistentMemorySize(n).value());
+    cnode->set_temporary_memory_size(TempMemorySize(n).value());
+    cnode->set_persistent_memory_size(PersistentMemorySize(n).value());
 
     cnode->set_compute_cost(MaxExecutionTime(n).value());
 
diff --git a/tensorflow/core/graph/costmodel.h b/tensorflow/core/graph/costmodel.h
index 8afa4971ad054b31eeb63d0dadaa1a2937c47a6e..9b703e46938b3355ed769045cdb3f298b48bb922 100644
--- a/tensorflow/core/graph/costmodel.h
+++ b/tensorflow/core/graph/costmodel.h
@@ -133,13 +133,8 @@ class CostModel {
   // Returns the size in bytes of temporary memory consumed by "node".
   Bytes TempMemorySize(const Node* node) const;
 
-  // Returns the size in bytes of temporary memory consumed by "node".
-  Bytes HostTempMemorySize(const Node* node) const;
-  Bytes DeviceTempMemorySize(const Node* node) const;
-
   // Returns the size of persistent memory allocated by "node".
-  Bytes HostPersistentMemorySize(const Node* node) const;
-  Bytes DevicePersistentMemorySize(const Node* node) const;
+  Bytes PersistentMemorySize(const Node* node) const;
 
   // Records memory stats such as temp momory and persistent memory.
   void RecordMemoryStats(const Node* node, const MemoryStats& memory_stats);
@@ -188,8 +183,8 @@ class CostModel {
 
   const bool is_global_;
 
-  // Resizes vectors so that they are large enough for "id".
-  void Ensure(int id);
+  // Resizes vectors so that they are large enough for "id" and id's outputs.
+  void Ensure(int id, int num_outputs);
 
   // Nodes and Edges whose count is < this value
   // get type/byte estimates of 0.
@@ -203,28 +198,18 @@ class CostModel {
   // Cumulative execution time.
   std::vector<Microseconds> time_;
   // Cumulative Bytes output on each channel.
-  std::vector<gtl::InlinedVector<Bytes, 2> > slot_bytes_;
+  std::vector<gtl::InlinedVector<Bytes, 2>> slot_bytes_;
 
   // Maximum execution time
   std::vector<Microseconds> max_exec_time_;
 
   // Maximum memory usage
   struct MemUsage {
-    MemUsage()
-        : temp_memory_size(-1),
-          host_temp_memory_size(0),
-          device_temp_memory_size(0),
-          host_persistent_memory_size(0),
-          device_persistent_memory_size(0) {}
+    MemUsage() : temp_memory_size(0), persistent_memory_size(0) {}
 
     // TODO(yuefengz): temp_memory_size is not being used, remove it.
     Bytes temp_memory_size;
-
-    Bytes host_temp_memory_size;
-    Bytes device_temp_memory_size;
-
-    Bytes host_persistent_memory_size;
-    Bytes device_persistent_memory_size;
+    Bytes persistent_memory_size;
 
     gtl::InlinedVector<Bytes, 2> output_port_mem;
     gtl::InlinedVector<TensorShapeProto, 2> output_port_shape;
@@ -232,9 +217,9 @@ class CostModel {
   };
   std::vector<MemUsage> max_mem_usage_;
 
-  std::vector<gtl::InlinedVector<int64, 2> > output_port_alloc_ids_;
+  std::vector<gtl::InlinedVector<int64, 2>> output_port_alloc_ids_;
 
-  std::set<int64> host_persistent_alloc_ids_;
+  std::set<int64> persistent_alloc_ids_;
   std::map<string, std::set<int64>> persistent_alloc_ids_by_devices_;
 
   TensorShapeProto unknown_shape_;
diff --git a/tensorflow/core/graph/edgeset.h b/tensorflow/core/graph/edgeset.h
index 8916ccf4d0d051b9b9d5197667a05eda7265db79..0a1ee5a666cbd0d1978c075f75ab688223355f78 100644
--- a/tensorflow/core/graph/edgeset.h
+++ b/tensorflow/core/graph/edgeset.h
@@ -54,7 +54,7 @@ class EdgeSet {
  private:
   // Up to kInline elements are stored directly in ptrs_ (nullptr means none).
   // If ptrs_[0] == this then ptrs_[1] points to a set<const Edge*>.
-  static const int kInline = 2;  // Must be >= 2.
+  static const int kInline = 4;  // Must be >= 2.
   const void* ptrs_[kInline];
 
   std::set<const Edge*>* get_set() const {
diff --git a/tensorflow/core/graph/gradients.h b/tensorflow/core/graph/gradients.h
index 75906e6ce96de3deb5bb603fb4ca06763496bb6d..ddfed084b09c1072aae7ae7838d84c4659188bf4 100644
--- a/tensorflow/core/graph/gradients.h
+++ b/tensorflow/core/graph/gradients.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPH_GRADIENTS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_GRAPH_GRADIENTS_H_
+#ifndef TENSORFLOW_CORE_GRAPH_GRADIENTS_H_
+#define TENSORFLOW_CORE_GRAPH_GRADIENTS_H_
 
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -55,4 +55,4 @@ Status AddSymbolicGradients(gtl::ArraySlice<NodeOut> y_node_outputs,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPH_GRADIENTS_H_
+#endif  // TENSORFLOW_CORE_GRAPH_GRADIENTS_H_
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index fd1b5d33b93d0e2685cd7a909bbcc9909d7d3f87..9b56216f1f97a9598dd7ae8b70786e32bb7e0f4b 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -522,6 +522,12 @@ void Graph::ToGraphDef(GraphDef* graph_def) const {
   ToGraphDefSubRange(graph_def, 0);
 }
 
+GraphDef Graph::ToGraphDefDebug() const {
+  GraphDef ret;
+  ToGraphDef(&ret);
+  return ret;
+}
+
 void Graph::ToGraphDefSubRange(GraphDef* graph_def, int from_node_id) const {
   graph_def->Clear();
   *graph_def->mutable_versions() = versions();
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index b620127d9072a845721f97112f4bad107412b06f..9d96cd4654bbf1fd65c5135d6a8bdc271c6e9443 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -62,8 +62,8 @@ class Node;
 class VersionDef;
 class WhileContext;
 
-class NeighborIter;  // Declared below
-class NodeIter;      // Declared below
+class NeighborIter;    // Declared below
+class NodeIter;        // Declared below
 class NodeProperties;  // Defined in .cc
 
 class Node {
@@ -494,6 +494,13 @@ class Graph {
   // Serialize to a GraphDef.
   void ToGraphDef(GraphDef* graph_def) const;
 
+  // This version can be called from debugger to inspect the graph content.
+  // Use the previous version outside debug context for efficiency reasons.
+  //
+  // Note: We do not expose a DebugString() API, since GraphDef.DebugString() is
+  // not defined in some TensorFlow builds.
+  GraphDef ToGraphDefDebug() const;
+
   // Generate new node name with the specified prefix that is unique
   // across this graph.
   string NewName(StringPiece prefix);
diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index 8890a9fb0f6858866d552d547ac31b7f40d9c8dd..0629ff32d00cf7fad00c39f07810aa4a9d57f14f 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -77,19 +77,22 @@ class GraphConstructor {
                      ? in.prefix
                      : in.prefix + "/"),
           uniquify_names(in.uniquify_names),
+          uniquify_prefix(in.uniquify_prefix),
           input_map(in.input_map),
           skip_mapped_nodes(in.skip_mapped_nodes),
           control_dependencies(in.control_dependencies),
           return_tensors(in.return_tensors),
           return_nodes(in.return_nodes),
           importing(true),
-          validate_colocation_constraints(in.validate_colocation_constraints) {}
+          validate_colocation_constraints(in.validate_colocation_constraints),
+          validate_shape(in.validate_shape) {}
 
     bool allow_internal_ops;
     bool expect_device_spec;
 
     string prefix;
     bool uniquify_names;
+    bool uniquify_prefix;
     std::map<TensorId, TensorId> input_map;
     bool skip_mapped_nodes;
     std::vector<string> control_dependencies;
@@ -106,25 +109,26 @@ class GraphConstructor {
     // remove this.
     bool importing;
     bool validate_colocation_constraints;
+    bool validate_shape = true;
   };
 
   typedef gtl::ArraySlice<const NodeDef*> NodeDefSlice;
 
   // versions and library may be nullptr
-  static Status Construct(const Options& opts, NodeDefSlice node_defs,
-                          const VersionDef* versions,
-                          const FunctionDefLibrary* library, Graph* g,
-                          ShapeRefiner* refiner,
-                          std::vector<std::pair<Node*, int>>* return_tensors,
-                          std::vector<Node*>* return_nodes,
-                          std::vector<TensorId>* unused_input_map_keys) {
+  static Status Construct(
+      const Options& opts, NodeDefSlice node_defs, const VersionDef* versions,
+      const FunctionDefLibrary* library, Graph* g, ShapeRefiner* refiner,
+      std::vector<std::pair<Node*, int>>* return_tensors,
+      std::vector<Node*>* return_nodes,
+      std::vector<TensorId>* missing_unused_input_map_keys) {
     if (versions) {
       TF_RETURN_IF_ERROR(CheckVersions(*versions, TF_GRAPH_DEF_VERSION,
                                        TF_GRAPH_DEF_VERSION_MIN_PRODUCER,
                                        "GraphDef", "graph"));
     }
     GraphConstructor c(opts, node_defs, versions, library, g, refiner,
-                       return_tensors, return_nodes, unused_input_map_keys);
+                       return_tensors, return_nodes,
+                       missing_unused_input_map_keys);
     const Status s = c.TryImport();
     if (!s.ok()) c.Undo();
     return s;
@@ -137,17 +141,18 @@ class GraphConstructor {
                    ShapeRefiner* refiner,
                    std::vector<std::pair<Node*, int>>* return_tensors,
                    std::vector<Node*>* return_nodes,
-                   std::vector<TensorId>* unused_input_map_keys)
+                   std::vector<TensorId>* missing_unused_input_map_keys)
       : opts_(opts),
         node_defs_(node_defs),
         versions_(versions),
         library_(library),
         g_(g),
         original_versions_(g->versions()),
+        prefix_(opts.prefix),
         refiner_(refiner),
         return_tensors_(return_tensors),
         return_nodes_(return_nodes),
-        unused_input_map_keys_(unused_input_map_keys) {}
+        missing_unused_input_map_keys_(missing_unused_input_map_keys) {}
 
   Status TryImport() {
     TF_RETURN_IF_ERROR(EnsureNoNameCollisions());
@@ -159,6 +164,8 @@ class GraphConstructor {
     TF_RETURN_IF_ERROR(UpdateVersionDef());
     TF_RETURN_IF_ERROR(PopulateReturnTensors());
     TF_RETURN_IF_ERROR(PopulateReturnNodes());
+    TF_RETURN_IF_ERROR(PopulateMissingUnusedInputMapKeys());
+    UpdateUniquifiedColocationNames();
     FixupSourceAndSinkEdges(g_);
     return Status::OK();
   }
@@ -172,6 +179,7 @@ class GraphConstructor {
   Status UpdateVersionDef();
   Status PopulateReturnTensors();
   Status PopulateReturnNodes();
+  Status PopulateMissingUnusedInputMapKeys();
 
   void Undo();
 
@@ -201,9 +209,18 @@ class GraphConstructor {
   void UniquifyNames(const std::vector<bool>& input_already_exists,
                      NodeDef* node_def);
 
+  // Updates any constructed nodes' colocation group names if the name has been
+  // updated by UniquifyNames. This is called after all the nodes have been
+  // constructed so all the names have been uniquified if necessary.
+  void UpdateUniquifiedColocationNames();
+
   // Returns true if `name` already exists in `g_` (either as a node name or
   // prefix).
-  bool NameExists(StringPiece name);
+  bool NameExistsInGraph(StringPiece name);
+
+  // Returns true if `name` already exists in the GraphDef being imported
+  // (either as a node name or prefix).
+  bool NameExistsInGraphDef(StringPiece name);
 
   // Returns a unique version of `original_name`, or `original_name` if it's
   // already unique in the graph.
@@ -217,6 +234,9 @@ class GraphConstructor {
   Graph* g_;
   const VersionDef original_versions_;
 
+  // A copy of opts_.prefix, possibly uniquified.
+  string prefix_;
+
   ShapeRefiner* refiner_;
 
   // May be null. Not owned.
@@ -226,9 +246,10 @@ class GraphConstructor {
   std::vector<Node*>* return_nodes_;
 
   // May be null. Not owned.
-  std::vector<TensorId>* unused_input_map_keys_;
+  std::vector<TensorId>* missing_unused_input_map_keys_;
 
-  // Intermediate datastructure used to populate `unused_input_map_keys_`.
+  // Intermediate datastructure used to populate
+  // `missing_unused_input_map_keys_`.
   std::set<TensorId> used_input_map_keys_;
 
   // Mapping from node name to the index within node_defs_.
@@ -243,6 +264,9 @@ class GraphConstructor {
   // alternative implementation of std::unordered_map.
   std::unordered_map<StringPiece, NodeInfo, StringPieceHasher> gdef_nodes_;
 
+  // Prefixes already used in the GraphDef being imported.
+  std::unordered_set<StringPiece, StringPieceHasher> gdef_prefixes_;
+
   // Mapping from node name to the existing node in g_.
   std::unordered_map<StringPiece, Node*, StringPieceHasher> existing_nodes_;
 
@@ -305,6 +329,16 @@ bool NodeNameInValues(const std::vector<string>& control_dependencies,
                    node_name) != control_dependencies.end();
 }
 
+// Adds any prefixes of `node_name` (not including the full name itself) to
+// `prefixes`.
+void AddPrefixes(StringPiece node_name,
+                 std::unordered_set<StringPiece, StringPieceHasher>* prefixes) {
+  size_t idx = -1;
+  while ((idx = node_name.find('/', idx + 1)) != StringPiece::npos) {
+    prefixes->insert(node_name.substr(0, idx));
+  }
+}
+
 Status GraphConstructor::EnsureNoNameCollisions() {
   existing_nodes_.reserve(g_->num_nodes());
   // Populate existing_nodes_ and existing_prefixes_.
@@ -323,34 +357,25 @@ Status GraphConstructor::EnsureNoNameCollisions() {
             n->name(), "'");
       }
     }
-    // Add all of node's prefixes to existing_prefixes_ (if it has any).
-    size_t idx = -1;
-    while ((idx = n->name().find('/', idx + 1)) != string::npos) {
-      StringPiece name(n->name());
-      existing_prefixes_.insert(name.substr(0, idx));
-    }
+    AddPrefixes(n->name(), &existing_prefixes_);
   }
-  if (opts_.prefix.empty() && opts_.importing && !opts_.uniquify_names) {
+  if (prefix_.empty() && opts_.importing && !opts_.uniquify_names) {
     for (const NodeDef* n : node_defs_) {
       const string& name = n->name();
-      if (NameExists(name)) {
+      if (NameExistsInGraph(name)) {
         return errors::InvalidArgument("Node name '", name,
                                        "' already exists in the Graph");
       }
     }
-  } else if (!opts_.prefix.empty()) {
-    StringPiece prefix_no_slash(opts_.prefix);
+  } else if (!prefix_.empty()) {
+    StringPiece prefix_no_slash(prefix_);
     prefix_no_slash.remove_suffix(1);
     if (!IsValidNodeName(prefix_no_slash, false)) {
-      return errors::InvalidArgument("Imported node name prefix '",
-                                     opts_.prefix,
+      return errors::InvalidArgument("Imported node name prefix '", prefix_,
                                      "' would lead to invalid node names");
     }
-    if (NameExists(prefix_no_slash)) {
-      return errors::InvalidArgument("Import node name prefix '",
-                                     prefix_no_slash,
-                                     "' conflicts with "
-                                     "name already used in the graph");
+    if (NameExistsInGraph(prefix_no_slash) && opts_.uniquify_prefix) {
+      prefix_ = strings::StrCat(FindUniqueName(prefix_no_slash), "/");
     }
   }
   return Status::OK();
@@ -384,7 +409,7 @@ Status GraphConstructor::ValidateInputMapAndControlDependencies() {
 }
 
 Status GraphConstructor::BuildNodeIndex() {
-  // Validate the node names and add them to gdef_nodes_.
+  // Validate the node names and add them to gdef_nodes_ and gdef_prefixes_.
   for (int n = 0; n < node_defs_.size(); ++n) {
     const NodeDef& node_def = *node_defs_[n];
     if (!IsValidNodeName(node_def.name(), opts_.allow_internal_ops)) {
@@ -419,6 +444,8 @@ Status GraphConstructor::BuildNodeIndex() {
             "': Control dependencies must come after regular dependencies");
       }
     }
+    // Update gdef_prefixes_.
+    AddPrefixes(node_def.name(), &gdef_prefixes_);
   }
   return Status::OK();
 }
@@ -529,7 +556,7 @@ Status GraphConstructor::MakeNode(const NodeDef& node_def, Node** node) {
 }
 
 Status GraphConstructor::ValidateShape(Node* node) {
-  if (!opts_.importing) return Status::OK();
+  if (!opts_.importing || !opts_.validate_shape) return Status::OK();
   TF_RETURN_IF_ERROR(refiner_->AddNode(node));
   // For nodes with the _output_shapes attribute, override the shape.
   std::vector<TensorShapeProto> shape_attrs;
@@ -720,8 +747,8 @@ void GraphConstructor::AddControlDependencies(
 
 void GraphConstructor::AddPrefixToNodeDef(
     const std::vector<bool>& input_already_exists, NodeDef* node_def) {
-  if (opts_.prefix.empty()) return;
-  node_def->set_name(strings::StrCat(opts_.prefix, node_def->name()));
+  if (prefix_.empty()) return;
+  node_def->set_name(strings::StrCat(prefix_, node_def->name()));
   // Update names of input nodes
   for (int i = 0; i < node_def->input_size(); ++i) {
     StringPiece input(node_def->input(i));
@@ -729,9 +756,9 @@ void GraphConstructor::AddPrefixToNodeDef(
     // imported).
     if (input_already_exists[i]) continue;
     if (input.Consume("^")) {
-      node_def->set_input(i, strings::StrCat("^", opts_.prefix, input));
+      node_def->set_input(i, strings::StrCat("^", prefix_, input));
     } else {
-      node_def->set_input(i, strings::StrCat(opts_.prefix, input));
+      node_def->set_input(i, strings::StrCat(prefix_, input));
     }
   }
   // Update names of colocation groups
@@ -741,8 +768,7 @@ void GraphConstructor::AddPrefixToNodeDef(
     for (int i = 0; i < list->s_size(); ++i) {
       StringPiece v(list->s(i));
       if (v.Consume(kColocationGroupPrefix)) {
-        list->set_s(i,
-                    strings::StrCat(kColocationGroupPrefix, opts_.prefix, v));
+        list->set_s(i, strings::StrCat(kColocationGroupPrefix, prefix_, v));
       }
     }
   }
@@ -750,10 +776,13 @@ void GraphConstructor::AddPrefixToNodeDef(
 
 void GraphConstructor::UniquifyNames(
     const std::vector<bool>& input_already_exists, NodeDef* node_def) {
-  if (NameExists(node_def->name())) {
+  if (NameExistsInGraph(node_def->name())) {
     string old_name = node_def->name();
     node_def->set_name(FindUniqueName(node_def->name()));
     uniquified_names_[old_name] = node_def->name();
+    // Note that we don't have to update gdef_nodes_ or gdef_prefixes_ with
+    // `name` because we guarantee the original NodeDef names are unique,
+    // meaning we won't generate this name again.
   }
   for (int i = 0; i < node_def->input_size(); ++i) {
     // Skip remapped inputs (which already exist in g_ and are not being
@@ -768,31 +797,52 @@ void GraphConstructor::UniquifyNames(
     id.first = iter->second;
     node_def->set_input(i, id.ToString());
   }
-  // Update names of colocation groups
-  if (node_def->attr().find(kColocationAttrName) != node_def->attr().end()) {
-    auto* list =
-        node_def->mutable_attr()->at(kColocationAttrName).mutable_list();
-    for (int i = 0; i < list->s_size(); ++i) {
-      StringPiece v(list->s(i));
-      if (v.Consume(kColocationGroupPrefix)) {
-        auto iter = uniquified_names_.find(v.ToString());
-        if (iter == uniquified_names_.end()) continue;
-        list->set_s(i, strings::StrCat(kColocationGroupPrefix, iter->second));
+}
+
+void GraphConstructor::UpdateUniquifiedColocationNames() {
+  for (const auto& pair : gdef_nodes_) {
+    Node* node = pair.second.node;
+    if (node == nullptr) continue;
+    std::vector<string> coloc_values;
+    Status status =
+        GetNodeAttr(node->attrs(), kColocationAttrName, &coloc_values);
+    if (!status.ok()) continue;
+    bool updated = false;
+    for (int i = 0; i < coloc_values.size(); ++i) {
+      StringPiece val(coloc_values[i]);
+      if (val.Consume(kColocationGroupPrefix)) {
+        const auto& name_pair = uniquified_names_.find(val.ToString());
+        if (name_pair == uniquified_names_.end()) continue;
+        updated = true;
+        coloc_values[i] =
+            strings::StrCat(kColocationGroupPrefix, name_pair->second);
       }
     }
+    if (updated) {
+      node->AddAttr(kColocationAttrName, coloc_values);
+    }
   }
 }
 
-bool GraphConstructor::NameExists(StringPiece name) {
+bool GraphConstructor::NameExistsInGraph(StringPiece name) {
   if (existing_nodes_.find(name) != existing_nodes_.end()) return true;
-  return existing_prefixes_.find(name) != existing_prefixes_.end();
+  if (existing_prefixes_.find(name) != existing_prefixes_.end()) return true;
+  return false;
+}
+
+bool GraphConstructor::NameExistsInGraphDef(StringPiece name) {
+  if (gdef_nodes_.find(name) != gdef_nodes_.end()) return true;
+  if (gdef_prefixes_.find(name) != gdef_prefixes_.end()) return true;
+  return false;
 }
 
 string GraphConstructor::FindUniqueName(StringPiece original_name) {
   string name = original_name.ToString();
-  int count = 1;
-  while (NameExists(name)) {
-    name = strings::StrCat(original_name, "_", count++);
+  int count = 0;
+  // Check that any generated names don't collide with imported NodeDefs (as
+  // well as nodes in g_).
+  while (NameExistsInGraph(name) || (count > 0 && NameExistsInGraphDef(name))) {
+    name = strings::StrCat(original_name, "_", ++count);
   }
   return name;
 }
@@ -931,9 +981,12 @@ Status GraphConstructor::Convert() {
 
     Node* node;
     if (opts_.importing) {
-      if (!opts_.prefix.empty()) {
+      if (!prefix_.empty()) {
         AddPrefixToNodeDef(input_already_exists, &imported_node_def);
-      } else if (opts_.uniquify_names) {
+      }
+      // Note: no need to uniquify names if the prefix already guarantees
+      // uniqueness
+      if (opts_.uniquify_names && (prefix_.empty() || !opts_.uniquify_prefix)) {
         UniquifyNames(input_already_exists, &imported_node_def);
       }
       TF_RETURN_IF_ERROR(ModifyNodeDefForImport(&imported_node_def));
@@ -972,15 +1025,6 @@ Status GraphConstructor::Convert() {
                                    " nodes in a cycle");
   }
 
-  // Update unused_input_map_keys_
-  if (unused_input_map_keys_ != nullptr) {
-    for (const auto& pair : opts_.input_map) {
-      if (used_input_map_keys_.find(pair.first) == used_input_map_keys_.end()) {
-        unused_input_map_keys_->push_back(pair.first);
-      }
-    }
-  }
-
   return Status::OK();
 }
 
@@ -1070,6 +1114,33 @@ Status GraphConstructor::PopulateReturnNodes() {
   return Status::OK();
 }
 
+Status GraphConstructor::PopulateMissingUnusedInputMapKeys() {
+  if (missing_unused_input_map_keys_ == nullptr) return Status::OK();
+  for (const auto& input_map_pair : opts_.input_map) {
+    TensorId key = input_map_pair.first;
+    if (used_input_map_keys_.count(key) > 0) continue;
+
+    auto pair = gdef_nodes_.find(key.first);
+    if (pair == gdef_nodes_.end()) {
+      // key's node doesn't exist in GraphDef
+      missing_unused_input_map_keys_->push_back(key);
+      continue;
+    }
+
+    // Check that key's index is in bounds. Get the number of outputs from the
+    // NodeDef, rather than the imported Node, since the Node may not exist if
+    // opts_.skip_mapped_nodes is true.
+    const NodeDef* node_def = node_defs_[pair->second.gdef_index];
+    const OpDef* op_def;
+    TF_RETURN_IF_ERROR(g_->op_registry()->LookUpOpDef(node_def->op(), &op_def));
+    if (key.second >= op_def->output_arg_size()) {
+      // key's index out of bounds
+      missing_unused_input_map_keys_->push_back(key);
+    }
+  }
+  return Status::OK();
+}
+
 void GraphConstructor::Undo() {
   for (const auto& iter : gdef_nodes_) {
     if (iter.second.node != nullptr) {
@@ -1101,7 +1172,7 @@ Status ConvertGraphDefToGraph(const GraphConstructorOptions& opts,
   return GraphConstructor::Construct(
       opts, gdef.node(), &gdef.versions(), &gdef.library(), g, &refiner,
       /*return_tensors=*/nullptr, /*return_nodes=*/nullptr,
-      /*unused_input_map_keys=*/nullptr);
+      /*missing_unused_input_map_keys=*/nullptr);
 }
 
 Status ConvertNodeDefsToGraph(const GraphConstructorOptions& opts,
@@ -1115,7 +1186,7 @@ Status ConvertNodeDefsToGraph(const GraphConstructorOptions& opts,
   return GraphConstructor::Construct(opts, node_defs, nullptr, nullptr, g,
                                      &refiner, /*return_tensors=*/nullptr,
                                      /*return_nodes=*/nullptr,
-                                     /*unused_input_map_keys=*/nullptr);
+                                     /*missing_unused_input_map_keys=*/nullptr);
 }
 
 Status ImportGraphDef(const ImportGraphDefOptions& opts, const GraphDef& gdef,
@@ -1144,7 +1215,7 @@ Status ImportGraphDef(const ImportGraphDefOptions& opts, const GraphDef& gdef,
 
   if (results != nullptr) {
     if (!results->return_tensors.empty() || !results->return_nodes.empty() ||
-        !results->unused_input_map_keys.empty()) {
+        !results->missing_unused_input_map_keys.empty()) {
       return errors::InvalidArgument(
           "All fields in results argument to ImportGraphDef() must be empty.");
     }
@@ -1187,7 +1258,7 @@ Status ImportGraphDef(const ImportGraphDefOptions& opts, const GraphDef& gdef,
     return GraphConstructor::Construct(
         opts, gdef.node(), &gdef.versions(), &gdef.library(), g, refiner,
         &results->return_tensors, &results->return_nodes,
-        &results->unused_input_map_keys);
+        &results->missing_unused_input_map_keys);
   }
 }
 
diff --git a/tensorflow/core/graph/graph_constructor.h b/tensorflow/core/graph/graph_constructor.h
index 4b418b862290d23f6838f6a1f43345adee467884..b03d655fe6fcd918227c62cbdbc76db6156a55c4 100644
--- a/tensorflow/core/graph/graph_constructor.h
+++ b/tensorflow/core/graph/graph_constructor.h
@@ -54,7 +54,11 @@ extern Status ConvertNodeDefsToGraph(const GraphConstructorOptions& opts,
 
 // Options for calling ImportGraphDef().
 struct ImportGraphDefOptions {
-  ImportGraphDefOptions() : uniquify_names(false), skip_mapped_nodes(false) {}
+  ImportGraphDefOptions()
+      : uniquify_names(false),
+        uniquify_prefix(false),
+        skip_mapped_nodes(false),
+        validate_shape(true) {}
 
   // Name prefix to use for nodes imported from the GraphDef.  For example, if
   // prefix="animals" and GraphDef contains a node "bunny" then the node will be
@@ -68,6 +72,11 @@ struct ImportGraphDefOptions {
   // will guarantee all node names are unique.
   bool uniquify_names;
 
+  // If true, `prefix` will be modified if it already exists as a node name or
+  // prefix in the graph. If false, a conflicting prefix will be treated as an
+  // error. This option has no effect if `prefix` isn't specified.
+  bool uniquify_prefix;
+
   // Maps tensors in `gdef` to existing tensors in `g`. Inputs in `gdef`
   // corresponding to `input_map` keys will be remapped to the nodes in `g`
   // corresponding to the values.
@@ -122,6 +131,9 @@ struct ImportGraphDefOptions {
   // If true, checks that all colocation constraints are nodes in the GraphDef.
   bool validate_colocation_constraints = true;
 
+  // If false skips shape validation.
+  bool validate_shape;
+
   // TODO(ashankar): Enable handling of GraphDefs produced by newer binaries
   // with ops that are not defined in the binary calling ImportGraphDef.
   // Similar to the producer_op_list argument to import_graph_def in the
@@ -140,9 +152,10 @@ struct ImportGraphDefResults {
   // The requested nodes associated with ImportGraphDefOptions::return_nodes.
   std::vector<Node*> return_nodes;
 
-  // Keys in ImportGraphDefOptions::input_map that weren't used as an input to
-  // any node in`gdef`.
-  std::vector<TensorId> unused_input_map_keys;
+  // Keys in ImportGraphDefOptions::input_map that don't appear in `gdef` and
+  // weren't used as an input to any node in `gdef`. These keys are likely due
+  // to typos, and callers may wish to treat their existence as an error.
+  std::vector<TensorId> missing_unused_input_map_keys;
 };
 
 // Adds the graph in GraphDef `gdef` into an existing Graph `*g`.
diff --git a/tensorflow/core/graph/graph_constructor_test.cc b/tensorflow/core/graph/graph_constructor_test.cc
index 0f88c80b85a4b05c21f76713a3406c72354cba0c..963c1dc024b4265e14314c610399fc92331f053c 100644
--- a/tensorflow/core/graph/graph_constructor_test.cc
+++ b/tensorflow/core/graph/graph_constructor_test.cc
@@ -160,9 +160,7 @@ class GraphConstructorTest : public ::testing::Test {
   }
 
   string GraphDebugString() const {
-    GraphDef def;
-    graph_.ToGraphDef(&def);
-    return def.DebugString();
+    return graph_.ToGraphDefDebug().DebugString();
   }
 
   Graph graph_;
@@ -1433,7 +1431,7 @@ TEST_F(GraphConstructorTest, ImportGraphDef_InputMapDuplicateNodeNames) {
       &refiner);
 }
 
-TEST_F(GraphConstructorTest, ImportGraphDef_InputMapUnusedKeys) {
+TEST_F(GraphConstructorTest, ImportGraphDef_InputMapMissingUnusedKeys) {
   ShapeRefiner refiner(TF_GRAPH_DEF_VERSION, graph_.op_registry());
 
   // No input map
@@ -1443,10 +1441,10 @@ TEST_F(GraphConstructorTest, ImportGraphDef_InputMapUnusedKeys) {
       "node { name: 'W1' op: 'TestParams' }"
       "node { name: 'input' op: 'TestInput' }",
       opts, &refiner, &results);
-  EXPECT_TRUE(results.unused_input_map_keys.empty());
+  EXPECT_TRUE(results.missing_unused_input_map_keys.empty());
 
-  // Non-empty unused_input_map_keys
-  results.unused_input_map_keys.push_back(TensorId());
+  // Non-empty missing_unused_input_map_keys
+  results.missing_unused_input_map_keys.push_back(TensorId());
   ExpectError(
       "node { name: 'W2' op: 'TestParams' }", opts,
       {"All fields in results argument to ImportGraphDef() must be empty."},
@@ -1454,13 +1452,16 @@ TEST_F(GraphConstructorTest, ImportGraphDef_InputMapUnusedKeys) {
 
   // Input map with some used, some unused keys
   const int kControlSlot = Graph::kControlSlot;
-  results.unused_input_map_keys.clear();
+  results.missing_unused_input_map_keys.clear();
   opts.input_map[TensorId("W2", kControlSlot)] = TensorId("W1", kControlSlot);
   opts.input_map[TensorId("new_input", 0)] = TensorId("input", 0);
   opts.input_map[TensorId("new_input", 1)] = TensorId("input", 0);
-  opts.input_map[TensorId("new_input", kControlSlot)] =
-      TensorId("input", kControlSlot);
-  opts.input_map[TensorId("t1", 1)] = TensorId("input", 0);
+  // Unused and missing (nonexistent index)
+  opts.input_map[TensorId("new_input", 3)] = TensorId("input", 0);
+  // Unused and missing (nonexistent node)
+  opts.input_map[TensorId("DNE", 0)] = TensorId("input", 0);
+  // Unused but not missing
+  opts.input_map[TensorId("t1", 0)] = TensorId("W1", 0);
   ExpectOK(
       R"EOF(
       node { name: 'W2' op: 'TestParams' }
@@ -1470,9 +1471,36 @@ TEST_F(GraphConstructorTest, ImportGraphDef_InputMapUnusedKeys) {
       )EOF",
       opts, &refiner, &results);
 
-  std::vector<TensorId> expected_unused_keys = {
-      TensorId("new_input", kControlSlot), TensorId("t1", 1)};
-  EXPECT_EQ(results.unused_input_map_keys, expected_unused_keys);
+  std::set<TensorId> expected_unused_keys = {TensorId("new_input", 3),
+                                             TensorId("DNE", 0)};
+  ASSERT_EQ(results.missing_unused_input_map_keys.size(),
+            expected_unused_keys.size());
+
+  std::set<TensorId> actual_unused_keys(
+      results.missing_unused_input_map_keys.begin(),
+      results.missing_unused_input_map_keys.end());
+  EXPECT_EQ(actual_unused_keys, expected_unused_keys);
+
+  // Test edge case: node isn't imported due to skip_mapped_nodes, but we still
+  // have a bad input_map key involving it.
+  opts = ImportGraphDefOptions();
+  opts.input_map[TensorId("new_input", 0)] = TensorId("input", 0);
+  opts.input_map[TensorId("new_input", 1)] = TensorId("input", 1);
+  // Index out of bounds
+  opts.input_map[TensorId("new_input", 2)] = TensorId("input", 1);
+  opts.skip_mapped_nodes = true;
+  opts.prefix = "import";
+  results = ImportGraphDefResults();
+  ExpectOK(
+      R"EOF(
+      node { name: 'W2' op: 'TestParams' }
+      node { name: 'new_input' op: 'TestInput' input: [ '^W2' ] }
+      node { name: 't1' op: 'TestMul' input: [ 'new_input:0', 'new_input:1' ] }
+      )EOF",
+      opts, &refiner, &results);
+
+  ASSERT_EQ(results.missing_unused_input_map_keys.size(), 1);
+  EXPECT_EQ(results.missing_unused_input_map_keys[0], TensorId("new_input", 2));
 }
 
 TEST_F(GraphConstructorTest, ImportGraphDef_InputMapWithUnboundInput) {
@@ -1709,7 +1737,7 @@ TEST_F(GraphConstructorTest, ImportGraphDef_ReturnNodes) {
   // Check return tensors
   ASSERT_EQ(results.return_nodes.size(), 2);
   EXPECT_EQ(results.return_tensors.size(), 0);
-  EXPECT_EQ(results.unused_input_map_keys.size(), 0);
+  EXPECT_EQ(results.missing_unused_input_map_keys.size(), 0);
   EXPECT_EQ(results.return_nodes[0]->name(), "input");
   EXPECT_EQ(results.return_nodes[1]->name(), "t1");
 
@@ -1806,6 +1834,39 @@ TEST_F(GraphConstructorTest, ImportGraphDef_UniquifyNames) {
   EXPECT_EQ(results.return_nodes[1]->name(), "B_2");
   EXPECT_EQ(results.return_nodes[1]->def().input(0), "A_2:0");
 
+  // Import with an already-used prefix and uniquify_prefix = true
+  opts.prefix = "A";
+  opts.uniquify_prefix = true;
+  results = ImportGraphDefResults();
+  ExpectOK(graph_def_str, opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 2);
+  EXPECT_EQ(results.return_nodes[0]->name(), "A_3/A");
+  EXPECT_EQ(results.return_nodes[1]->name(), "A_3/B");
+  EXPECT_EQ(results.return_nodes[1]->def().input(0), "A_3/A");
+
+  // Create B_3 node to keep the A/B numbering in sync
+  ExpectOK("node { name: 'B_3' op: 'TestInput' }");
+
+  // Import with an already-used prefix and uniquify_prefix = false
+  opts.uniquify_prefix = false;
+  results = ImportGraphDefResults();
+  ExpectOK(graph_def_str, opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 2);
+  EXPECT_EQ(results.return_nodes[0]->name(), "A/A");
+  EXPECT_EQ(results.return_nodes[1]->name(), "A/B");
+  EXPECT_EQ(results.return_nodes[1]->def().input(0), "A/A");
+
+  // Repeat the same import
+  results = ImportGraphDefResults();
+  ExpectOK(graph_def_str, opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 2);
+  EXPECT_EQ(results.return_nodes[0]->name(), "A/A_1");
+  EXPECT_EQ(results.return_nodes[1]->name(), "A/B_1");
+  EXPECT_EQ(results.return_nodes[1]->def().input(0), "A/A_1:0");
+
   // Import with existing de-duped node names
   opts = ImportGraphDefOptions();
   opts.uniquify_names = true;
@@ -1822,6 +1883,30 @@ TEST_F(GraphConstructorTest, ImportGraphDef_UniquifyNames) {
   EXPECT_EQ(results.return_nodes[1]->name(), "B_1_1");
   EXPECT_EQ(results.return_nodes[1]->def().input(0), "A_1_1:0");
 
+  // Import with node names that must be de-duped from names and prefixes that
+  // exist in both the existing graph and the GraphDef being imported.
+  opts = ImportGraphDefOptions();
+  opts.uniquify_names = true;
+  opts.return_nodes.push_back("A");
+  opts.return_nodes.push_back("A_4");
+  opts.return_nodes.push_back("B");
+  opts.return_nodes.push_back("B_4/B");
+  results = ImportGraphDefResults();
+  ExpectOK(
+      "node { name: 'A' op: 'TestInput' }"
+      "node { name: 'A_4' op: 'TestInput' }"
+      "node { name: 'B' op: 'TestOneInputTwoOutputs' input: ['A'] }"
+      "node { name: 'B_4/B' op: 'TestOneInputTwoOutputs' input: ['A_4'] }",
+      opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 4);
+  EXPECT_EQ(results.return_nodes[0]->name(), "A_5");
+  EXPECT_EQ(results.return_nodes[1]->name(), "A_4");
+  EXPECT_EQ(results.return_nodes[2]->name(), "B_5");
+  EXPECT_EQ(results.return_nodes[2]->def().input(0), "A_5:0");
+  EXPECT_EQ(results.return_nodes[3]->name(), "B_4/B");
+  EXPECT_EQ(results.return_nodes[3]->def().input(0), "A_4");
+
   // Create node with prefix and then import node with same name
   ExpectOK("node { name: 'foo/abc' op: 'ABC' }");
   opts = ImportGraphDefOptions();
@@ -1871,16 +1956,25 @@ TEST_F(GraphConstructorTest, ImportGraphDef_UniquifyNames) {
   ExpectOK(graph_def_str, opts, &refiner, &results);
 
   ASSERT_EQ(results.return_nodes.size(), 2);
-  EXPECT_EQ(results.return_nodes[0]->name(), "A_3");
-  EXPECT_EQ(results.return_nodes[1]->name(), "B_3");
+  EXPECT_EQ(results.return_nodes[0]->name(), "A_6");
+  EXPECT_EQ(results.return_nodes[1]->name(), "B_6");
   EXPECT_EQ(results.return_nodes[1]->def().input(0), "A:0");
+}
+
+TEST_F(GraphConstructorTest, ImportGraphDef_UniquifyNames_ColocationGroups) {
+  ShapeRefiner refiner(TF_GRAPH_DEF_VERSION, graph_.op_registry());
+
+  // Create nodes 'A' and 'b"
+  ExpectOK(
+      "node { name: 'A' op: 'TestInput' }"
+      "node { name: 'B' op: 'TestOneInputTwoOutputs' input: ['A'] }");
 
   // Check that colocation groups are updated
-  opts = ImportGraphDefOptions();
+  ImportGraphDefOptions opts;
   opts.uniquify_names = true;
   opts.return_nodes.push_back("A");
   opts.return_nodes.push_back("B");
-  results = ImportGraphDefResults();
+  ImportGraphDefResults results;
   ExpectOK(
       "node { name: 'A' op: 'TestInput' }"
       "node { name: 'B' op: 'TestOneInputTwoOutputs' input: ['A:0'] "
@@ -1888,14 +1982,48 @@ TEST_F(GraphConstructorTest, ImportGraphDef_UniquifyNames) {
       opts, &refiner, &results);
 
   ASSERT_EQ(results.return_nodes.size(), 2);
-  EXPECT_EQ(results.return_nodes[0]->name(), "A_4");
-  EXPECT_EQ(results.return_nodes[1]->name(), "B_4");
-  EXPECT_EQ(results.return_nodes[1]->def().input(0), "A_4:0");
+  EXPECT_EQ(results.return_nodes[0]->name(), "A_1");
+  EXPECT_EQ(results.return_nodes[1]->name(), "B_1");
   const AttrValue* class_attr =
       results.return_nodes[1]->attrs().Find(kColocationAttrName);
   ASSERT_TRUE(class_attr != nullptr);
   ASSERT_EQ(class_attr->list().s_size(), 1);
-  EXPECT_EQ(class_attr->list().s(0), "loc:@A_4");
+  EXPECT_EQ(class_attr->list().s(0), "loc:@A_1");
+
+  results = ImportGraphDefResults();
+  ExpectOK(
+      "node { name: 'A' op: 'TestInput' "
+      "       attr { key: '_class' value { list { s:'loc:@B' } } } }"
+      "node { name: 'B' op: 'TestOneInputTwoOutputs' input: ['A:0'] }",
+      opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 2);
+  EXPECT_EQ(results.return_nodes[0]->name(), "A_2");
+  EXPECT_EQ(results.return_nodes[1]->name(), "B_2");
+  class_attr = results.return_nodes[0]->attrs().Find(kColocationAttrName);
+  ASSERT_TRUE(class_attr != nullptr);
+  ASSERT_EQ(class_attr->list().s_size(), 1);
+  EXPECT_EQ(class_attr->list().s(0), "loc:@B_2");
+
+  results = ImportGraphDefResults();
+  ExpectOK(
+      "node { name: 'A' op: 'TestInput' "
+      "       attr { key: '_class' value { list { s:'loc:@B' } } } }"
+      "node { name: 'B' op: 'TestOneInputTwoOutputs' input: ['A:0'] "
+      "       attr { key: '_class' value { list { s:'loc:@B' } } } }",
+      opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 2);
+  EXPECT_EQ(results.return_nodes[0]->name(), "A_3");
+  EXPECT_EQ(results.return_nodes[1]->name(), "B_3");
+  class_attr = results.return_nodes[0]->attrs().Find(kColocationAttrName);
+  ASSERT_TRUE(class_attr != nullptr);
+  ASSERT_EQ(class_attr->list().s_size(), 1);
+  EXPECT_EQ(class_attr->list().s(0), "loc:@B_3");
+  class_attr = results.return_nodes[1]->attrs().Find(kColocationAttrName);
+  ASSERT_TRUE(class_attr != nullptr);
+  ASSERT_EQ(class_attr->list().s_size(), 1);
+  EXPECT_EQ(class_attr->list().s(0), "loc:@B_3");
 }
 
 TEST_F(GraphConstructorTest, ImportGraphDef_WithCycle) {
diff --git a/tensorflow/core/graph/graph_def_builder.cc b/tensorflow/core/graph/graph_def_builder.cc
index 33d2021f3819e7781a0a488a04e7459eaf14a0d7..7a58347bd1ba44d822f5c52d2686e4b3c6e43d9b 100644
--- a/tensorflow/core/graph/graph_def_builder.cc
+++ b/tensorflow/core/graph/graph_def_builder.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include <utility>
 
-#include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/core/errors.h"
 
@@ -72,16 +71,6 @@ Status GraphDefBuilder::ToGraphDef(GraphDef* graph_def) const {
   return status_;
 }
 
-Status GraphDefBuilder::ToGraph(Graph* graph) const {
-  if (status_.ok()) {
-    GraphDef graph_def;
-    graph_.ToGraphDef(&graph_def);
-    GraphConstructorOptions opts;
-    TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(opts, graph_def, graph));
-  }
-  return status_;
-}
-
 string GraphDefBuilder::Options::GetNameForOp(StringPiece op) const {
   if (name_.empty()) return graph_->NewName(op);
   return name_;
diff --git a/tensorflow/core/graph/graph_def_builder.h b/tensorflow/core/graph/graph_def_builder.h
index a2c0c4d553e7229ae7e0f116691d8f717fe77f87..776a74c6d8821e53a26d73399105f55189f227df 100644
--- a/tensorflow/core/graph/graph_def_builder.h
+++ b/tensorflow/core/graph/graph_def_builder.h
@@ -161,14 +161,6 @@ class GraphDefBuilder {
   // successful, and if so fill *graph_def.
   Status ToGraphDef(GraphDef* graph_def) const;
 
-  // Like ToGraphDef(), but converts to a Graph (using the default
-  // GraphConstructorOptions).
-  // TODO(josh11b): Make this faster; right now it converts
-  // Graph->GraphDef->Graph.  This cleans up the graph (e.g. adds
-  // edges from the source and to the sink node, resolves back edges
-  // by name), and makes sure the resulting graph is valid.
-  Status ToGraph(Graph* graph) const;
-
   // Adds the function and gradient definitions in `fdef_lib` to this graph's op
   // registry. Ignores duplicate functions, and returns a bad status if an
   // imported function differs from an existing function or op with the same
diff --git a/tensorflow/core/graph/graph_def_builder_test.cc b/tensorflow/core/graph/graph_def_builder_test.cc
index e85de71ef79988199cd194274f2ef9986e86d350..be3c2be8007a4539e111a6d2375cef87dc5dff8e 100644
--- a/tensorflow/core/graph/graph_def_builder_test.cc
+++ b/tensorflow/core/graph/graph_def_builder_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_def_builder_util.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
@@ -26,7 +27,6 @@ namespace tensorflow {
 namespace {
 
 TEST(GraphDefBuilderTest, Version) {
-
   // Verify that our assertions will be nontrivial
   ASSERT_LT(0, TF_GRAPH_DEF_VERSION);
 
@@ -35,7 +35,7 @@ TEST(GraphDefBuilderTest, Version) {
 
   // Check version when we convert to a Graph
   Graph graph(OpRegistry::Global());
-  TF_EXPECT_OK(builder.ToGraph(&graph));
+  TF_EXPECT_OK(GraphDefBuilderToGraph(builder, &graph));
   ASSERT_EQ(graph.versions().producer(), TF_GRAPH_DEF_VERSION);
   ASSERT_EQ(graph.versions().min_consumer(), TF_GRAPH_DEF_VERSION_MIN_CONSUMER);
 
diff --git a/tensorflow/core/graph/graph_def_builder_util.cc b/tensorflow/core/graph/graph_def_builder_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..102c72185f7c1d1a0f0370fcbad08b0fc473c237
--- /dev/null
+++ b/tensorflow/core/graph/graph_def_builder_util.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/graph/graph_def_builder_util.h"
+
+#include "tensorflow/core/graph/graph_constructor.h"
+
+namespace tensorflow {
+
+Status GraphDefBuilderToGraph(const GraphDefBuilder& builder, Graph* graph) {
+  GraphDef graph_def;
+  TF_RETURN_IF_ERROR(builder.ToGraphDef(&graph_def));
+  GraphConstructorOptions opts;
+  return ConvertGraphDefToGraph(opts, graph_def, graph);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/graph/graph_def_builder_util.h b/tensorflow/core/graph/graph_def_builder_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a157e5b71da48178139ff71da4d707901b955fe
--- /dev/null
+++ b/tensorflow/core/graph/graph_def_builder_util.h
@@ -0,0 +1,35 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_GRAPH_GRAPH_DEF_BUILDER_UTIL_H_
+#define TENSORFLOW_CORE_GRAPH_GRAPH_DEF_BUILDER_UTIL_H_
+
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+class Graph;
+
+// Converts the `GraphDef` being built by `builder` to a `Graph` and
+// stores it in `*graph`.
+// TODO(josh11b): Make this faster; right now it converts
+// Graph->GraphDef->Graph.  This cleans up the graph (e.g. adds
+// edges from the source and to the sink node, resolves back edges
+// by name), and makes sure the resulting graph is valid.
+Status GraphDefBuilderToGraph(const GraphDefBuilder& builder, Graph* graph);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPH_GRAPH_DEF_BUILDER_UTIL_H_
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index 1924c05d3dd3944d0fa14d53c9ddb2ab14be751d..add80eda23d7887fb06902c0b123c03db8f4cccf 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -1152,7 +1152,7 @@ Status Partition(const PartitionOptions& opts, Graph* g,
     // Add control edges from 'ref_control_inputs' to 'ref_recvs'.
     // NOTE(yuanbyu): Adding these control edges should not introduce
     // deadlocks. 'dst' has implicit "read" nodes that, when we split
-    // across devices, are made explicit; Retargettig the dependencies
+    // across devices, are made explicit; Retargeting the dependencies
     // to 'dst' to those nodes would not introduce cycles if there isn't
     // one before the transformation.
     // NOTE(yuanbyu): This may impact performance because it defers the
diff --git a/tensorflow/core/graph/graph_partition_test.cc b/tensorflow/core/graph/graph_partition_test.cc
index 20822ecb1dd3657eb57ee070d3b722703869728d..6841f2914989b22d6aef91831ac6101b0ba6555f 100644
--- a/tensorflow/core/graph/graph_partition_test.cc
+++ b/tensorflow/core/graph/graph_partition_test.cc
@@ -43,8 +43,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-using strings::StrCat;
-
 // from graph_partition.cc
 extern Status TopologicalSortNodesWithTimePriority(
     const GraphDef* gdef, std::vector<std::pair<const NodeDef*, int64>>* nodes,
@@ -52,6 +50,13 @@ extern Status TopologicalSortNodesWithTimePriority(
 
 namespace {
 
+using ops::_Recv;
+using ops::_Send;
+using ops::Const;
+using ops::Identity;
+using ops::LoopCond;
+using ops::NextIteration;
+
 const char gpu_device[] = "/job:a/replica:0/task:0/device:GPU:0";
 
 string SplitByDevice(const Node* node) { return node->assigned_device_name(); }
@@ -63,7 +68,7 @@ string DeviceName(const Node* node) {
   } else {
     const string cpu_prefix = "/job:a/replica:0/task:0/cpu:";
     int index = first - 'A';
-    return StrCat(cpu_prefix, index);
+    return strings::StrCat(cpu_prefix, index);
   }
 }
 
@@ -232,7 +237,6 @@ class GraphPartitionTest : public ::testing::Test {
 };
 
 TEST_F(GraphPartitionTest, SingleDevice) {
-  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
   auto a1 = FloatInput(in_.WithOpName("A1"));
   Combine(in_.WithOpName("A2"), a1, a1);
 
@@ -245,7 +249,6 @@ TEST_F(GraphPartitionTest, SingleDevice) {
 }
 
 TEST_F(GraphPartitionTest, CrossDeviceData) {
-  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
   auto a1 = FloatInput(in_.WithOpName("A1"));
   auto b1 = FloatInput(in_.WithOpName("B1"));
   Combine(in_.WithOpName("B2"), a1, b1);
@@ -267,7 +270,6 @@ TEST_F(GraphPartitionTest, CrossDeviceData) {
 }
 
 TEST_F(GraphPartitionTest, CrossDeviceControl) {
-  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
   auto a1 = FloatInput(in_.WithOpName("A1"));
   auto b1 = FloatInput(in_.WithOpName("B1"));
   Combine(in_.WithOpName("B2").WithControlDependencies(a1), b1, b1);
@@ -291,7 +293,6 @@ TEST_F(GraphPartitionTest, CrossDeviceControl) {
 }
 
 TEST_F(GraphPartitionTest, CrossDeviceData_MultiUse) {
-  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
   auto a1 = FloatInput(in_.WithOpName("A1"));
   auto b1 = FloatInput(in_.WithOpName("B1"));
   Combine(in_.WithOpName("B2"), a1, b1);
@@ -315,7 +316,6 @@ TEST_F(GraphPartitionTest, CrossDeviceData_MultiUse) {
 }
 
 TEST_F(GraphPartitionTest, CrossDeviceControl_MultiUse) {
-  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
   auto a1 = FloatInput(in_.WithOpName("A1"));
   auto b1 = FloatInput(in_.WithOpName("B1"));
   Combine(in_.WithOpName("B2").WithControlDependencies(a1), b1, b1);
@@ -341,7 +341,6 @@ TEST_F(GraphPartitionTest, CrossDeviceControl_MultiUse) {
 }
 
 TEST_F(GraphPartitionTest, CrossDevice_DataControl) {
-  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
   auto a1 = FloatInput(in_.WithOpName("A1"));
   auto b1 = FloatInput(in_.WithOpName("B1"));
   Combine(in_.WithOpName("B2"), a1, b1);
@@ -372,7 +371,6 @@ TEST_F(GraphPartitionTest, CrossDevice_DataControl) {
 }
 
 TEST_F(GraphPartitionTest, CrossDeviceLoopSimple) {
-  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
   auto a1 = BoolInput(in_.WithOpName("A1"));
   auto a2 = ::tensorflow::ops::internal::Enter(in_.WithOpName("A2"), a1, "foo");
   auto a3 = ::tensorflow::ops::Merge(in_.WithOpName("A3"),
@@ -386,7 +384,6 @@ TEST_F(GraphPartitionTest, CrossDeviceLoopSimple) {
 }
 
 TEST_F(GraphPartitionTest, CrossDeviceLoopSimple1) {
-  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
   auto a1 = BoolInput(in_.WithOpName("A1"));
   auto a2 = ::tensorflow::ops::internal::Enter(in_.WithOpName("B2"), a1, "foo");
   auto a3 = ::tensorflow::ops::Merge(in_.WithOpName("A3"),
@@ -493,13 +490,14 @@ TEST_F(GraphPartitionTest, SetIncarnation) {
   attr { key: 'tensor_name' value { s: 'test' } }
 )proto";
   CHECK(protobuf::TextFormat::ParseFromString(
-      StrCat("node { name: 'A/Pi' op: 'Const' ",
-             "  attr { key: 'dtype' value { type: DT_FLOAT } } ",
-             "  attr { key: 'value' value { tensor { ",
-             "    dtype: DT_FLOAT tensor_shape {} float_val: 3.14 } } } }",
-             "node { name: 'A' op: '_Send' input: 'A/Pi' ", kSendRecvAttrs, "}",
-             "node { name: 'B' op: '_Recv' ", kSendRecvAttrs,
-             "  attr { key: 'tensor_type' value { type:DT_FLOAT}}}"),
+      strings::StrCat(
+          "node { name: 'A/Pi' op: 'Const' ",
+          "  attr { key: 'dtype' value { type: DT_FLOAT } } ",
+          "  attr { key: 'value' value { tensor { ",
+          "    dtype: DT_FLOAT tensor_shape {} float_val: 3.14 } } } }",
+          "node { name: 'A' op: '_Send' input: 'A/Pi' ", kSendRecvAttrs, "}",
+          "node { name: 'B' op: '_Recv' ", kSendRecvAttrs,
+          "  attr { key: 'tensor_type' value { type:DT_FLOAT}}}"),
       &gdef));
   gdef.mutable_versions()->set_producer(TF_GRAPH_DEF_VERSION);
   Partition(gdef, &partitions_);
@@ -527,7 +525,8 @@ TEST(TopologicalSortNodesWithTimePriorityTest, NoDependencies) {
   }
   std::vector<ops::Placeholder> placeholders;
   for (int i : indexes) {
-    placeholders.emplace_back(root.WithOpName(StrCat("p", i)), DT_FLOAT);
+    placeholders.emplace_back(root.WithOpName(strings::StrCat("p", i)),
+                              DT_FLOAT);
     placeholders.back().node()->AddAttr("_start_time", i + 1);
   }
 
@@ -540,7 +539,7 @@ TEST(TopologicalSortNodesWithTimePriorityTest, NoDependencies) {
       TopologicalSortNodesWithTimePriority(&gdef, &nodes, &node_to_start_time));
   ASSERT_EQ(nodes.size(), 20);
   for (int i = 0; i < nodes.size(); ++i) {
-    EXPECT_EQ(StrCat("p", i), nodes[i].first->name());
+    EXPECT_EQ(strings::StrCat("p", i), nodes[i].first->name());
     EXPECT_EQ(i + 1, nodes[i].second);
   }
 }
@@ -554,7 +553,7 @@ TEST(TopologicalSortNodesWithTimePriority, Dependencies) {
   const int num_leaves = 20;
   for (int i = 0; i < num_leaves; ++i) {
     indexes.push_back((i + 2001) % num_leaves);
-    placeholders_in_order.emplace_back(root.WithOpName(StrCat("p", i)),
+    placeholders_in_order.emplace_back(root.WithOpName(strings::StrCat("p", i)),
                                        DT_FLOAT);
     placeholders_in_order.back().node()->AddAttr("_start_time", i + 1);
   }
@@ -568,7 +567,8 @@ TEST(TopologicalSortNodesWithTimePriority, Dependencies) {
   // placeholder runs last).
   std::vector<ops::Square> squares;
   for (int i : indexes) {
-    squares.emplace_back(root.WithOpName(StrCat("s", i)), placeholders[i]);
+    squares.emplace_back(root.WithOpName(strings::StrCat("s", i)),
+                         placeholders[i]);
     squares.back().node()->AddAttr("_start_time", 50 - (i + 1));
   }
 
@@ -591,7 +591,7 @@ TEST(TopologicalSortNodesWithTimePriority, Dependencies) {
   ASSERT_EQ(1 + squares.size() + placeholders.size(), nodes.size());
   for (int i = 0; i < placeholders.size(); ++i) {
     const NodeDef* node = nodes[i].first;
-    EXPECT_EQ(StrCat("p", i), node->name());
+    EXPECT_EQ(strings::StrCat("p", i), node->name());
     EXPECT_EQ(i + 1, nodes[i].second);
     EXPECT_EQ(i + 1, node_to_start_time[node]);
   }
@@ -599,7 +599,7 @@ TEST(TopologicalSortNodesWithTimePriority, Dependencies) {
     int node_index = placeholders.size() + i;
     int square_index = num_leaves - 1 - i;
     const NodeDef* node = nodes[node_index].first;
-    EXPECT_EQ(StrCat("s", square_index), node->name());
+    EXPECT_EQ(strings::StrCat("s", square_index), node->name());
     EXPECT_EQ(50 - (square_index + 1), nodes[node_index].second);
     EXPECT_EQ(50 - (square_index + 1), node_to_start_time[node]);
   }
@@ -619,7 +619,7 @@ TEST(TopologicalSortNodesWithTimePriority, WhileLoop) {
   const int num_leaves = 20;
   for (int i = 0; i < num_leaves; ++i) {
     indexes.push_back((i + 2001) % num_leaves);
-    placeholders_in_order.emplace_back(root.WithOpName(StrCat("p", i)),
+    placeholders_in_order.emplace_back(root.WithOpName(strings::StrCat("p", i)),
                                        DT_FLOAT);
     placeholders_in_order.back().node()->AddAttr("_start_time", i + 1);
   }
@@ -633,10 +633,10 @@ TEST(TopologicalSortNodesWithTimePriority, WhileLoop) {
   std::vector<Exit> while_exits;
   const int nodes_per_loop = 8;
   for (int i : indexes) {
-    Scope scope = root.NewSubScope(StrCat("while", i));
+    Scope scope = root.NewSubScope(strings::StrCat("while", i));
     auto dummy = Placeholder(scope, DT_FLOAT);
 
-    Enter enter(scope, placeholders[i], StrCat("frame", i));
+    Enter enter(scope, placeholders[i], strings::StrCat("frame", i));
     Merge merge(scope, std::initializer_list<Input>{enter, dummy});
     auto cv = Const(scope.WithControlDependencies({merge.output}), false);
     LoopCond loop_cond(scope, cv);
@@ -663,7 +663,8 @@ TEST(TopologicalSortNodesWithTimePriority, WhileLoop) {
   std::vector<Square> squares;
   squares.reserve(indexes.size());
   for (int i : indexes) {
-    squares.emplace_back(root.WithOpName(StrCat("s", i)), while_exits[i]);
+    squares.emplace_back(root.WithOpName(strings::StrCat("s", i)),
+                         while_exits[i]);
     squares.back().node()->AddAttr("_start_time", 500 - (i + 1));
   }
 
@@ -680,20 +681,20 @@ TEST(TopologicalSortNodesWithTimePriority, WhileLoop) {
   int node_index = 0;
   for (int i = 0; i < placeholders.size(); ++i, ++node_index) {
     const NodeDef* node = nodes[i].first;
-    EXPECT_EQ(StrCat("p", i), node->name());
+    EXPECT_EQ(strings::StrCat("p", i), node->name());
     EXPECT_EQ(i + 1, nodes[i].second);
     EXPECT_EQ(i + 1, node_to_start_time[node]);
   }
   for (int i = 0; i < while_exits.size(); ++i, node_index += nodes_per_loop) {
     const NodeDef* node = nodes[node_index].first;
-    EXPECT_EQ(StrCat("while", i, "/Enter"), node->name());
+    EXPECT_EQ(strings::StrCat("while", i, "/Enter"), node->name());
     EXPECT_EQ(100 + i * 10, nodes[node_index].second);
     EXPECT_EQ(100 + i * 10, node_to_start_time[node]);
   }
   for (int i = 0; i < squares.size(); ++i, ++node_index) {
     int square_index = num_leaves - 1 - i;
     const NodeDef* node = nodes[node_index].first;
-    EXPECT_EQ(StrCat("s", square_index), node->name());
+    EXPECT_EQ(strings::StrCat("s", square_index), node->name());
     EXPECT_EQ(500 - (square_index + 1), nodes[node_index].second);
     EXPECT_EQ(500 - (square_index + 1), node_to_start_time[node]);
   }
diff --git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h
index 880e4e712ef0a0d9378afefd91acd125351992f7..1b99d54e8e33fd5155913a78ee833343bf92b905 100644
--- a/tensorflow/core/graph/mkl_graph_util.h
+++ b/tensorflow/core/graph/mkl_graph_util.h
@@ -21,108 +21,101 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 
 namespace tensorflow {
-  // Since our ops are going to produce and also consume N addition tensors
-  // (Mkl) for N Tensorflow tensors, we can have following different
-  // orderings among these 2N tensors.
-  //
-  // E.g., for Tensorflow tensors A, B, and C, our ops will produce and
-  // consume A_m, B_m, and C_m additionally.
-  //
-  // INTERLEAVED: in this case 2N tensors are interleaved. So for above
-  //              example, the ordering looks like: A, A_m, B, B_m, C, C_m.
-  //
-  // CONTIGUOUS: in thi case N Tensorflow tensors are contiguous followed
-  //             by N Mkl tensors. So for above example, the ordering looks
-  //             like: A, B, C, A_m, B_m, C_m
-  //
-  // Following APIs map index of original Tensorflow tensors to their
-  // appropriate position based on selected ordering. For contiguous ordering,
-  // we need to know the total number of tensors (parameter total).
-  //
-  typedef enum { TENSORS_INTERLEAVED, TENSORS_CONTIGUOUS } MklTfTensorOrdering;
-  // NOTE: Currently, we use contiguous ordering. If you change this, then you
-  // would need to change Mkl op definitions in nn_ops.cc.
-  static MklTfTensorOrdering kTensorOrdering = TENSORS_CONTIGUOUS;
+// Since our ops are going to produce and also consume N addition tensors
+// (Mkl) for N Tensorflow tensors, we can have following different
+// orderings among these 2N tensors.
+//
+// E.g., for Tensorflow tensors A, B, and C, our ops will produce and
+// consume A_m, B_m, and C_m additionally.
+//
+// INTERLEAVED: in this case 2N tensors are interleaved. So for above
+//              example, the ordering looks like: A, A_m, B, B_m, C, C_m.
+//
+// CONTIGUOUS: in thi case N Tensorflow tensors are contiguous followed
+//             by N Mkl tensors. So for above example, the ordering looks
+//             like: A, B, C, A_m, B_m, C_m
+//
+// Following APIs map index of original Tensorflow tensors to their
+// appropriate position based on selected ordering. For contiguous ordering,
+// we need to know the total number of tensors (parameter total).
+//
+typedef enum { TENSORS_INTERLEAVED, TENSORS_CONTIGUOUS } MklTfTensorOrdering;
+// NOTE: Currently, we use contiguous ordering. If you change this, then you
+// would need to change Mkl op definitions in nn_ops.cc.
+static MklTfTensorOrdering kTensorOrdering = TENSORS_CONTIGUOUS;
 
-  // Get index of MetaData tensor from index 'n' of Data tensor.
-  inline int DataIndexToMetaDataIndex(int n, int total_tensors) {
-    if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
-      // For interleaved ordering, Mkl tensor follows immediately after
-      // Tensorflow tensor.
-      return n + 1;
-    } else {
-      CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
-      // For contiguous ordering, Mkl tensor is n+total_tensors / 2 away.
-      return n + total_tensors / 2;
-    }
+// Get index of MetaData tensor from index 'n' of Data tensor.
+inline int DataIndexToMetaDataIndex(int n, int total_tensors) {
+  if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
+    // For interleaved ordering, Mkl tensor follows immediately after
+    // Tensorflow tensor.
+    return n + 1;
+  } else {
+    CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+    // For contiguous ordering, Mkl tensor is n+total_tensors / 2 away.
+    return n + total_tensors / 2;
   }
+}
 
-  int inline GetTensorDataIndex(int n, int total_tensors) {
-      if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
-        return 2 * n;  // index corresponding to nth input/output tensor
-      } else {
-        CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
-        return n;
-      }
-    }
+int inline GetTensorDataIndex(int n, int total_tensors) {
+  if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
+    return 2 * n;  // index corresponding to nth input/output tensor
+  } else {
+    CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+    return n;
+  }
+}
 
-  int inline GetTensorMetaDataIndex(int n, int total_tensors) {
-      // Get index for TensorData first and then use mapping function
-      // to get TensorMetaData index from TensorData index.
-      int tidx = GetTensorDataIndex(n, total_tensors);
-      return DataIndexToMetaDataIndex(tidx, total_tensors);
-    }
+int inline GetTensorMetaDataIndex(int n, int total_tensors) {
+  // Get index for TensorData first and then use mapping function
+  // to get TensorMetaData index from TensorData index.
+  int tidx = GetTensorDataIndex(n, total_tensors);
+  return DataIndexToMetaDataIndex(tidx, total_tensors);
+}
 
 namespace mkl_op_registry {
-  static const char* kMklOpLabel = "MklOp";
-  static const char* kMklOpLabelPattern = "label='MklOp'";
-
-  // Get the name of Mkl op from original TensorFlow op
-  // We prefix 'Mkl' to the original op to get Mkl op.
-  inline string GetMklOpName(const string& name) {
-    // Prefix that we add to Tensorflow op name to construct Mkl op name.
-    const char* const kMklOpPrefix = "_Mkl";
-    return string(kMklOpPrefix) + name;
-  }
+static const char* kMklOpLabel = "MklOp";
+static const char* kMklOpLabelPattern = "label='MklOp'";
+// Prefix that we add to Tensorflow op name to construct Mkl op name.
+static const char* const kMklOpPrefix = "_Mkl";
 
-  // Check whether opname with type T is registered as MKL-compliant.
-  //
-  // @input: name of the op
-  // @input: T datatype to be used for checking op
-  // @return: true if opname is registered as Mkl op; false otherwise
-  static inline bool IsMklOp(const std::string& op_name, DataType T) {
-    string kernel = KernelsRegisteredForOp(op_name);
-    bool result =
-        kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT);
-    if (result) {
-      VLOG(1) << "mkl_op_registry::" << op_name << " is " << kMklOpLabel;
-    }
-    return result;
-  }
-
-  // Check whether opname with type T is registered as MKL-compliant and
-  // is element-wise.
-  //
-  // @input: name of the op
-  // @input: T datatype to be used for checking op
-  // @return: true if opname is registered as element-wise Mkl op;
-  // false otherwise
-  static inline bool IsMklElementWiseOp(const std::string& op_name,
-    DataType T) {
-    if (!IsMklOp(op_name, T)) {
-      return false;
-    }
+// Get the name of Mkl op from original TensorFlow op
+// We prefix 'Mkl' to the original op to get Mkl op.
+inline string GetMklOpName(const string& name) {
+  return string(kMklOpPrefix) + name;
+}
 
-    bool result = (0 == op_name.compare(GetMklOpName("Add")) ||
-                    0 == op_name.compare(GetMklOpName("Sub")) ||
-                    0 == op_name.compare(GetMklOpName("Mul")) ||
-                    0 == op_name.compare(GetMklOpName("Maximum")) ||
-                    0 == op_name.compare(GetMklOpName("SquaredDifference")));
+// Check whether opname with type T is registered as MKL-compliant.
+//
+// @input: name of the op
+// @input: T datatype to be used for checking op
+// @return: true if opname is registered as Mkl op; false otherwise
+static inline bool IsMklOp(const std::string& op_name, DataType T) {
+  string kernel = KernelsRegisteredForOp(op_name);
+  bool result =
+      kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT);
+  return result;
+}
 
-    VLOG(1) << "mkl_op_registry::" << op_name
-            << " is elementwise MKL op: " << result;
-    return result;
+// Check whether opname with type T is registered as MKL-compliant and
+// is element-wise.
+//
+// @input: name of the op
+// @input: T datatype to be used for checking op
+// @return: true if opname is registered as element-wise Mkl op;
+// false otherwise
+static inline bool IsMklElementWiseOp(const std::string& op_name, DataType T) {
+  if (!IsMklOp(op_name, T)) {
+    return false;
   }
+  bool result = (0 == op_name.compare(GetMklOpName("Add")) ||
+                 0 == op_name.compare(GetMklOpName("Sub")) ||
+                 0 == op_name.compare(GetMklOpName("Mul")) ||
+                 0 == op_name.compare(GetMklOpName("Maximum")) ||
+                 0 == op_name.compare(GetMklOpName("SquaredDifference")));
+
+  return result;
+}
 }  // namespace mkl_op_registry
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 912075aa286042319a93bf60495f52af3f940ec8..7d3be152991351533a6185ea088503032f720b47 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -37,11 +37,13 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/tensor_format.h"
 
-#include "tensorflow/core/graph/mkl_layout_pass.h"
 #include "tensorflow/core/graph/mkl_graph_util.h"
+#include "tensorflow/core/graph/mkl_layout_pass.h"
 
 namespace tensorflow {
 
+#ifdef INTEL_MKL_ML
+
 // This pass implements rewriting of graph to support following scenarios:
 // (A) Merging nodes in the graph
 // (B) Rewriting a node in the graph to a new node
@@ -279,7 +281,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.mkl_conv2d_grad_filter = "_MklConv2DBackpropFilter";
     csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
     csinfo_.mkl_conv2d_with_bias_backprop_bias =
-                                   "_MklConv2DWithBiasBackpropBias";
+        "_MklConv2DWithBiasBackpropBias";
     csinfo_.relu = "Relu";
     csinfo_.relu_grad = "ReluGrad";
     csinfo_.reshape = "Reshape";
@@ -295,10 +297,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     // End - element-wise ops. See note above.
 
     // NOTE: names are alphabetically sorted.
-    rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn), CopyAttrsAddN,
-                      AddNRewrite, nullptr});
-    rinfo_.push_back({csinfo_.add,
-                      mkl_op_registry::GetMklOpName(csinfo_.add),
+    rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn),
+                      CopyAttrsAddN, AddNRewrite, nullptr});
+    rinfo_.push_back({csinfo_.add, mkl_op_registry::GetMklOpName(csinfo_.add),
                       CopyAttrsDataType, AlwaysRewrite, nullptr});
     rinfo_.push_back({csinfo_.avg_pool,
                       mkl_op_registry::GetMklOpName(csinfo_.avg_pool),
@@ -335,14 +336,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.fused_batch_norm,
                       mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm),
                       CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.fused_batch_norm_grad,
-                      mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad),
-                      CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr});
+    rinfo_.push_back(
+        {csinfo_.fused_batch_norm_grad,
+         mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad),
+         CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr});
     rinfo_.push_back({csinfo_.identity,
                       mkl_op_registry::GetMklOpName(csinfo_.identity),
                       CopyAttrsIdentity, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.lrn,
-                      mkl_op_registry::GetMklOpName(csinfo_.lrn),
+    rinfo_.push_back({csinfo_.lrn, mkl_op_registry::GetMklOpName(csinfo_.lrn),
                       CopyAttrsLRN, AlwaysRewrite, nullptr});
     rinfo_.push_back({csinfo_.lrn_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.lrn_grad),
@@ -356,11 +357,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.maximum,
                       mkl_op_registry::GetMklOpName(csinfo_.maximum),
                       CopyAttrsDataType, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.mul,
-                      mkl_op_registry::GetMklOpName(csinfo_.mul),
+    rinfo_.push_back({csinfo_.mul, mkl_op_registry::GetMklOpName(csinfo_.mul),
                       CopyAttrsDataType, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.relu,
-                      mkl_op_registry::GetMklOpName(csinfo_.relu),
+    rinfo_.push_back({csinfo_.relu, mkl_op_registry::GetMklOpName(csinfo_.relu),
                       CopyAttrsDataType, AlwaysRewrite, nullptr});
     rinfo_.push_back({csinfo_.relu_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.relu_grad),
@@ -371,8 +370,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.squared_difference,
                       mkl_op_registry::GetMklOpName(csinfo_.squared_difference),
                       CopyAttrsDataType, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.sub,
-                      mkl_op_registry::GetMklOpName(csinfo_.sub),
+    rinfo_.push_back({csinfo_.sub, mkl_op_registry::GetMklOpName(csinfo_.sub),
                       CopyAttrsDataType, AlwaysRewrite, nullptr});
 
     // Add info about which ops to add workspace edge to and the slots.
@@ -386,9 +384,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     biasaddgrad_matmul_context_ = {csinfo_.bias_add_grad, csinfo_.matmul,
                                    IsBiasAddGradInMatMulContext};
 
-    biasaddgrad_conv2dwithbias_context_ = {csinfo_.bias_add_grad,
-                                   csinfo_.mkl_conv2d_with_bias,
-                                   IsBiasAddGradInConv2DWithBiasContext};
+    biasaddgrad_conv2dwithbias_context_ = {
+        csinfo_.bias_add_grad, csinfo_.mkl_conv2d_with_bias,
+        IsBiasAddGradInConv2DWithBiasContext};
 
     cinfo_.push_back(&biasaddgrad_matmul_context_);
     cinfo_.push_back(&biasaddgrad_conv2dwithbias_context_);
@@ -408,9 +406,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
   /// Structure to specify the context information used in a node rewrite rule
   typedef struct {
-    string node;     // Name of the node to be rewritten
-    string fwd;      // Name of the node in the forward pass that this node
-                     // corresponds to
+    string node;  // Name of the node to be rewritten
+    string fwd;   // Name of the node in the forward pass that this node
+                  // corresponds to
     std::function<bool(const Node*, const Node**, void* c)> context_match_fn;
   } ContextInfo;
 
@@ -613,14 +611,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     std::vector<int32> ksize, strides;
     CHECK_EQ(GetNodeAttr(n->def(), "ksize", &ksize).ok(), true);
     CHECK_EQ(GetNodeAttr(n->def(), "strides", &strides).ok(), true);
-    CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(),
-             true);
+    CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(), true);
     CHECK_EQ(FormatFromString(data_format_str, &data_format), true);
 
     // Condition that specifies non-batch-wise and non-depth-wise pooling.
-    if (GetTensorDim(ksize,   data_format, 'N') == 1 &&
+    if (GetTensorDim(ksize, data_format, 'N') == 1 &&
         GetTensorDim(strides, data_format, 'N') == 1 &&
-        GetTensorDim(ksize,   data_format, 'C') == 1 &&
+        GetTensorDim(ksize, data_format, 'C') == 1 &&
         GetTensorDim(strides, data_format, 'C') == 1) {
       return true;
     }
@@ -783,8 +780,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
       for (const Edge* fe : first_inp_of_filter->out_edges()) {
         if (fe->dst()->type_string() == csinfo_.mkl_conv2d_with_bias &&
             fe->dst_input() == 0) {
-          VLOG(1) << "MklLayoutRewritePass: found "
-                  << fe->dst()->DebugString()
+          VLOG(1) << "MklLayoutRewritePass: found " << fe->dst()->DebugString()
                   << " as the forward node for matching context, backward"
                   << " node is: " << n->DebugString();
           *fwd_node = fe->dst();
@@ -801,13 +797,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   //
   // @return - true (if BiasAddGrad is associated with MatMul);
   //           false otherwise.
-  static bool IsBiasAddGradInMatMulContext(const Node* n,
-                                           const Node** fwd_node,
+  static bool IsBiasAddGradInMatMulContext(const Node* n, const Node** fwd_node,
                                            void* ci) {
     return (!IsBiasAddGradInConv2DWithBiasContext(n, fwd_node, ci));
   }
 
-
   // Rewrite rule that uses context-information for matching,
   // used in scenario 2.
   //
@@ -878,10 +872,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   // @output output_nodes - the list of new nodes creating Mkl tensors
   //
   // @return None
-  void GetNodesProducingMklTensorList(std::unique_ptr<Graph>* g,
-    Node* orig_node, const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
-    int* input_idx, int list_length,
-    std::vector<NodeBuilder::NodeOut>* output_nodes);
+  void GetNodesProducingMklTensorList(
+      std::unique_ptr<Graph>* g, Node* orig_node,
+      const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+      int* input_idx, int list_length,
+      std::vector<NodeBuilder::NodeOut>* output_nodes);
 
   // Get a node that will feed an Mkl tensor to the new
   // node that we are constructing. The output node could be (1) 'n'
@@ -898,7 +893,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   //                                will feed the tensor
   // @return None
   void GetNodeProducingMklTensor(std::unique_ptr<Graph>* g, Node* orig_node,
-    Node* n, int n_output_slot, Node** mkl_node, int* mkl_node_output_slot);
+                                 Node* n, int n_output_slot, Node** mkl_node,
+                                 int* mkl_node_output_slot);
 
   // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
   // in graph 'g'. Original node is input in 'old_node'. Inputs to 'nb' are
@@ -968,9 +964,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
 MklLayoutRewritePass::ConstStringsInfo MklLayoutRewritePass::csinfo_;
 MklLayoutRewritePass::ContextInfo
-  MklLayoutRewritePass::biasaddgrad_conv2dwithbias_context_;
+    MklLayoutRewritePass::biasaddgrad_conv2dwithbias_context_;
 MklLayoutRewritePass::ContextInfo
-  MklLayoutRewritePass::biasaddgrad_matmul_context_;
+    MklLayoutRewritePass::biasaddgrad_matmul_context_;
 std::vector<MklLayoutRewritePass::ContextInfo*> MklLayoutRewritePass::cinfo_;
 
 // We register Mkl rewrite pass for phase 1 in post partitioning group.
@@ -1039,13 +1035,13 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
   TensorShape dummy_shape({8});
   dummy_shape.AsProto(proto.mutable_tensor_shape());
   TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
-               .Attr("value", proto)
-               .Attr("dtype", dt)
-               .Device(orig_node->def().device())  // We place this node on
-                                                   // the same device as the
-                                                   // device of the original
-                                                   // node.
-               .Finalize(&**g, out));
+                  .Attr("value", proto)
+                  .Attr("dtype", dt)
+                  .Device(orig_node->def().device())  // We place this node on
+                                                      // the same device as the
+                                                      // device of the original
+                                                      // node.
+                  .Finalize(&**g, out));
 
   // If number of inputs to the original node is > 0, then we add
   // control dependency between 1st input (index 0) of the original node and
@@ -1058,8 +1054,8 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
   // the same frame.
   if (orig_node->num_inputs() > 0) {
     Node* orig_input0 = nullptr;
-    TF_CHECK_OK(orig_node->input_node(0,
-                                      const_cast<const Node**>(&orig_input0)));
+    TF_CHECK_OK(
+        orig_node->input_node(0, const_cast<const Node**>(&orig_input0)));
     CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out));
   }
 
@@ -1067,11 +1063,9 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
 }
 
 void MklLayoutRewritePass::GetNodesProducingMklTensorList(
-    std::unique_ptr<Graph>* g,
-    Node* orig_node,
-    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
-    int* input_idx, int list_length,
-    std::vector<NodeBuilder::NodeOut>* output_nodes) {
+    std::unique_ptr<Graph>* g, Node* orig_node,
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs, int* input_idx,
+    int list_length, std::vector<NodeBuilder::NodeOut>* output_nodes) {
   CHECK_LT(*input_idx, inputs.size());
   CHECK_GT(list_length, 0);
   CHECK_NOTNULL(output_nodes);
@@ -1088,8 +1082,8 @@ void MklLayoutRewritePass::GetNodesProducingMklTensorList(
     int mkl_node_output_slot = 0;
     GetNodeProducingMklTensor(g, orig_node, n, slot, &mkl_node,
                               &mkl_node_output_slot);
-    output_nodes->push_back(NodeBuilder::NodeOut(mkl_node,
-                                                mkl_node_output_slot));
+    output_nodes->push_back(
+        NodeBuilder::NodeOut(mkl_node, mkl_node_output_slot));
     (*input_idx)++;
     list_length--;
   }
@@ -1099,9 +1093,9 @@ void MklLayoutRewritePass::GetNodesProducingMklTensorList(
 // node that we are constructing. An input node could be (1) 'n'
 // if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor
 // if 'n' is not an Mkl layer.
-void MklLayoutRewritePass::GetNodeProducingMklTensor(std::unique_ptr<Graph>* g,
-    Node* orig_node, Node* n,
-    int n_output_slot, Node** mkl_node, int* mkl_node_output_slot) {
+void MklLayoutRewritePass::GetNodeProducingMklTensor(
+    std::unique_ptr<Graph>* g, Node* orig_node, Node* n, int n_output_slot,
+    Node** mkl_node, int* mkl_node_output_slot) {
   CHECK_NOTNULL(n);
   CHECK_NOTNULL(mkl_node);
   CHECK_NOTNULL(mkl_node_output_slot);
@@ -1232,8 +1226,8 @@ int MklLayoutRewritePass::SetUpContiguousInputs(
     if (ArgIsList(arg)) {
       std::vector<NodeBuilder::NodeOut> new_node_inputs;
       int N = GetTensorListLength(arg, old_node);
-      GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx,
-                                     N, &new_node_inputs);
+      GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx, N,
+                                     &new_node_inputs);
       nb->Input(new_node_inputs);
       nn_slot_idx++;
     } else {
@@ -1334,13 +1328,13 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
   TensorShape dummy_shape({1});
   dummy_shape.AsProto(proto.mutable_tensor_shape());
   TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
-                .Attr("value", proto)
-                .Attr("dtype", dt)
-                .Device(orig_node->def().device())  // We place this node on
-                                                    // same the device as the
-                                                    // device of the original
-                                                    // node.
-                .Finalize(&**g, out));
+                  .Attr("value", proto)
+                  .Attr("dtype", dt)
+                  .Device(orig_node->def().device())  // We place this node on
+                                                      // same the device as the
+                                                      // device of the original
+                                                      // node.
+                  .Finalize(&**g, out));
 
   // If number of inputs to the original node is > 0, then we add
   // control dependency between 1st input (index 0) of the original node and
@@ -1353,8 +1347,8 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
   // the same frame.
   if (orig_node->num_inputs() > 0) {
     Node* orig_input0 = nullptr;
-    TF_CHECK_OK(orig_node->input_node(0,
-                                      const_cast<const Node**>(&orig_input0)));
+    TF_CHECK_OK(
+        orig_node->input_node(0, const_cast<const Node**>(&orig_input0)));
     CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out));
   }
 
@@ -1372,7 +1366,8 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
   for (auto ws : wsinfo_) {
     if (orig_node->type_string() == ws.fwd_op &&
-        mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(orig_node->type_string()), T)) {
+        mkl_op_registry::IsMklOp(
+            mkl_op_registry::GetMklOpName(orig_node->type_string()), T)) {
       // If this op is a fwd op, then we need to check if there is an
       // edge from this node's fwd_slot to bwdop's bwd_slot. If there is
       // an edge, then we just add an attribute on this node for setting
@@ -1398,8 +1393,9 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
         nb->Attr("workspace_enabled", false);
       }
     } else if (orig_node->type_string() == ws.bwd_op &&
-               mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(orig_node->type_string()),
-                                        T)) {
+               mkl_op_registry::IsMklOp(
+                   mkl_op_registry::GetMklOpName(orig_node->type_string()),
+                   T)) {
       // If this op is a bwd op, then we need to add workspace edge and
       // it's Mkl tensor edge between its corresponding fwd op and this
       // op. Corresponding fwd op is specified in 'fwd_op' field of
@@ -1414,7 +1410,8 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
         if (e->src_output() == ws.fwd_slot &&
             // We would have rewritten the forward op, so we need to use
             // GetMklOpName call to get its Mkl name.
-            e->src()->type_string() == mkl_op_registry::GetMklOpName(ws.fwd_op) &&
+            e->src()->type_string() ==
+                mkl_op_registry::GetMklOpName(ws.fwd_op) &&
             e->dst_input() == ws.bwd_slot) {
           nb->Attr("workspace_enabled", true);
           CHECK_NOTNULL(ws_tensors);
@@ -1591,7 +1588,7 @@ void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node,
-                                           NodeBuilder* nb) {
+                                            NodeBuilder* nb) {
   DataType T;
   DataType Tshape;
 
@@ -1867,8 +1864,8 @@ Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* succ,
       if (e->IsControlEdge()) {
         CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
       } else {
-        CHECK_NOTNULL((*g)->AddEdge(new_node, e->src_output(), e->dst(),
-                                  e->dst_input()));
+        CHECK_NOTNULL(
+            (*g)->AddEdge(new_node, e->src_output(), e->dst(), e->dst_input()));
       }
     }
 
@@ -1939,9 +1936,9 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
       // and leave BiasAddGrad as it is. But we check for this condition
       // when we check for node rewrite rule. So we should not even come
       // here for MatMul. So we will fail now.
-        return Status(
-            error::Code::INVALID_ARGUMENT,
-            "No rewrite is required for BiasAddGrad for MatMul context.");
+      return Status(
+          error::Code::INVALID_ARGUMENT,
+          "No rewrite is required for BiasAddGrad for MatMul context.");
     }
   }
 
@@ -2010,9 +2007,10 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
     if (e->IsControlEdge()) {
       CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
     } else {
-      CHECK_NOTNULL((*g)->AddEdge(new_node, GetTensorDataIndex(e->src_output(),
-                            e->src()->num_outputs()),
-                    e->dst(), e->dst_input()));
+      CHECK_NOTNULL((*g)->AddEdge(
+          new_node,
+          GetTensorDataIndex(e->src_output(), e->src()->num_outputs()),
+          e->dst(), e->dst_input()));
     }
   }
 
@@ -2068,7 +2066,8 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
 
   // BiasAddGrad is not an Mkl layer, so we make an exception for it.
   if (n->type_string() != csinfo_.bias_add_grad) {
-    if (!mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()), T)) {
+    if (!mkl_op_registry::IsMklOp(
+            mkl_op_registry::GetMklOpName(n->type_string()), T)) {
       return nullptr;
     }
   }
@@ -2184,8 +2183,7 @@ bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g) {
   return MklLayoutRewritePass().RunPass(g);
 }
 
-Status MklLayoutRewritePass::Run(
-  const GraphOptimizationPassOptions& options) {
+Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) {
   if (options.graph == nullptr && options.partition_graphs == nullptr) {
     return Status::OK();
   }
@@ -2213,6 +2211,2110 @@ Status MklLayoutRewritePass::Run(
   return Status::OK();
 }
 
+#else   // INTEL_MKL_ML
+
+// This pass implements rewriting of graph to support following scenarios:
+// (A) Merging nodes in the graph
+// (B) Rewriting a node in the graph to a new node
+//     Rewrite happens under following scenario:
+//     - Propagating Mkl layout as an additional output tensor
+//        (we will loosely call a tensor that carries Mkl layout as Mkl tensor
+//         henceforth.) from every Mkl supported NN layer.
+//
+// Example of A : Merging nodes in the graph
+// -----------------------------------------
+// Currently, we merge Conv2D+AddBias together. Consider Conv2D and BiasAdd as:
+//
+//           O = Conv2D(A, B)
+//           P = BiasAdd(O, C)
+//
+// We merge them into Conv2DWithBias as:
+//           P = _MklConv2DWithBias(A, A_m, B, B_m, C, C_m)
+//
+// The meaning of A_m, B_m and C_m is explained in B.1.
+//
+// Merge rules:
+//  - The merge for Conv2D and BiasAdd happens when the output of Conv2D _only_
+//    goes to BiasAdd.
+//  - Also, the intersection of attributes of both the nodes must have same
+//    values.
+//  - Both the nodes must have been assigned to same device (if any).
+//
+// Example of B.1 : Rewriting nodes to Mkl nodes
+// ---------------------------------------------
+// Consider a Relu node. Current definition of Relu node looks like:
+//
+//           O = Relu(A)
+//
+// Relu has 1 input (A), and 1 output (O).
+//
+// This rewrite pass will generate a new graph node for Relu (new node is
+// called MklRelu) as:
+//
+//          O, O_m = MklRelu(A, A_m)
+//
+// MklRelu has 2 inputs (A and A_m) and 2 outputs (O and O_m). Here input A is
+// same as input A of Relu; output O is same as output O of Relu. O_m is the
+// additional output tensor that will be set by MklRelu, and it represents
+// Mkl tensor corresponding to O -- in other words, O_m is some kind of
+// metadata for O. A_m is additional input of Relu, and it represents metadata
+// for A - as O_m is metadata for O, A_m is metadata for A. MklRelu receives
+// this metadata from previous node in the graph.
+//
+// When a previous node in the graph is an Mkl node, A_m will represent a valid
+// Mkl tensor. But when a previous node is not an Mkl node, A_m will represent
+// a dummy Mkl tensor.
+//
+// Rewriting rules:
+//  - Selection of a node for rewriting happens by registering the op type of
+//    the node with the rewriting pass. If the op type is not registered, then
+//    all nodes of this op type will not be rewritten.
+//  - Number of inputs after rewriting:
+//      Since for every input Tensorflow tensor, the rewritten node gets Mkl
+//      tensor(s), rewritten node gets 2*N inputs, where N is the number of
+//      inputs for the original node.
+//  - Number of outputs after rewriting:
+//      Since for every output Tensorflow tensor, the rewritten node generates
+//      Mkl tensor(s), the rewritten node generates 2*N outputs, where N is the
+//      number of outputs of the original node.
+//  - Ordering of Tensorflow tensors and Mkl tensors:
+//      Since every rewritten node generates twice the number of inputs and
+//      outputs, one could imagine various orderings among Tensorflow tensors
+//      and Mkl tensors. E.g., assume an op 'Conv2D' that takes (A, B) as
+//      inputs, then the new op '_MklConv2D' can take inputs A, B, A_m and B_m
+//      in A, A_m, B, B_m order or it can also take them in A, B, A_m, B_m
+//      order. Among N inputs one can get N! permutations.
+//
+//      So the question is: which order do we follow? We support 2 types of
+//      orderings: (1) interleaved, and (2) contiguous. Interleaved ordering
+//      follows an intuitive order where an Mkl tensor follows the
+//      corresponding Tensorflow tensor immediately. In the context of the
+//      above example, it will be: A, A_m, B, B_m. Note that the ordering rule
+//      applies to both the inputs and outputs. Contiguous ordering means
+//      all the Tensorflow tensors are contiguous followed by all the Mkl
+//      tensors. We use contiguous ordering as default.
+//
+// Graph rewrite algorithm:
+//      Algorithm: Graph Rewrite
+//      Input: Graph G, Names of the nodes to rewrite and their new names
+//      Output: Modified Graph G' if the nodes are modified, G otherwise.
+//      Start:
+//        N = Topological_Sort(G) // N is a set of nodes in toposort order.
+//        foreach node n in N
+//        do
+//          if (Is_MKL_Op(n))  // Can this node accept an Mkl layout as input.
+//          then
+//            E = set of <incoming edge and its src_output slot> of n
+//            E' = {}   // a new set of edges for rewritten node
+//            foreach <e,s> in E
+//            do
+//              E' U {<e,s>}  // First copy edge which generates Tensorflow
+//                            // tensor as it is
+//              m = Source node of edge e
+//              if Is_Rewritten(m)  // Did we rewrite this node in this pass?
+//              then
+//                E' U {<m,s+1>}    // If yes, then m will generate an Mkl
+//                                  // tensor as an additional output.
+//              else
+//                d = Generate_Dummy_Mkl_Tensor()  // If not, generate a dummy
+//                                                 // Mkl tensor.
+//                E' U {<d,0>}  // The dummy Mkl tensor has only 1 output slot.
+//              fi
+//            done
+//            n' = Build_New_Node(G,new_name,E')
+//            Mark_Rewritten(n')  // Mark the new node as being rewritten.
+//          fi
+//        done
+//
+//      Explanation:
+//        For graph rewrite, we visit nodes of the input graph in the
+//        topological sort order. With this ordering, we visit nodes in the
+//        top-to-bottom fashion. We need this order because while visiting a
+//        node we want that all of its input nodes are visited and rewritten if
+//        applicable. This is because if we need to rewrite a given node
+//        then all of its input nodes need to be fixed (in other words they
+//        cannot be deleted later.)
+//
+//        While visiting a node, we first check if the op type of the node is
+//        an Mkl op. If it is, then we rewrite that node after constructing
+//        new inputs to the node. If the op type of the node is not Mkl op,
+//        then we do not rewrite that node.
+//
+// Handling workspace propagation for certain ops:
+//
+//        Certain backward ops in MKL (MaxPool, LRN and BatchNorm) require
+//        passing of a workspace from their respective forward ops. Workspace
+//        tensors provide memory for storing results of intermediate operations
+//        which are helpful in backward propagation. TensorFlow does not have
+//        a notion of a workspace and as a result does not allow producing
+//        additional outputs from these forward ops. For these ops, we need
+//        to add 2 extra edges between forward ops and their corresponding
+//        backward ops - the first extra edge carries a workspace tensor and
+//        the second one carries an Mkl tensor for the workspace tensor.
+//
+//        Example:
+//
+//        Typical graph for MaxPool and its gradient looks like:
+//
+//        A = MaxPool(T)
+//        B = MaxPoolGrad(X, A, Y)
+//
+//        We will transform this graph to propagate the workspace as:
+//        (with the contiguous ordering)
+//
+//        A, W, A_m, W_m = MklMaxPool(T, T_m)
+//        B, B_m = MklMaxPoolGrad(X, A, Y, W, X_m, A_m, Y_m, W_m)
+//
+//        Here W is the workspace tensor. Transformed tensor names with the
+//        suffix _m are Mkl tensors, and this transformation has been done
+//        using the algorithm discussed earlier. The transformation for
+//        workspace propagation only adds extra outputs (W, W_m) for a forward
+//        op and connects them to the corresponding backward ops.
+//
+//        Terms:
+//
+//        Forward op name = name of the op in the forward pass
+//          where a workspace tensor originates (MaxPool in this example)
+//        Backward op name = name of the op in the backward pass that receives
+//          a workspace tensor from the forward op (MaxPoolGrad in the example)
+//        Slot = Position of the output or input slot that will be
+//               used by the workspace tensor (1 for MklMaxPool as W is the 2nd
+//               output of MaxPool (0 is 1st); 3 for MklMaxPoolGrad)
+//
+//        Question:
+//
+//        How do we associate a backward op to a forward op? There can be more
+//        than one op with the exact same name.
+//
+//        In this example, we associate MaxPoolGrad with MaxPool. But there
+//        could be more than one MaxPool ops. To solve this problem, we look
+//        for _direct_ edge between a forward op and a backward op (tensor A is
+//        flowing along this edge in the example).
+//
+//        How do we transform forward and backward ops when there is no direct
+//        edge between them? In such a case, we generate dummy tensors for
+//        workspace tensors. For the example, transformation of MaxPool will
+//        be exactly same as it would be when there is a direct edge between
+//        the forward and the backward op --- it is just that MaxPool won't
+//        generate any workspace tensor. For MaxPoolGrad, the transformation
+//        will also be same, but instead of connecting W and W_m with the
+//        outputs of MaxPool, we will produce dummy tensors for them, and we
+//        will set workspace_enabled attribute to false.
+//
+class MklLayoutRewritePass : public GraphOptimizationPass {
+ public:
+  MklLayoutRewritePass() {
+    // NOTE: names are alphabetically sorted.
+    csinfo_.addn = "AddN";
+    csinfo_.avg_pool = "AvgPool";
+    csinfo_.avg_pool_grad = "AvgPoolGrad";
+    csinfo_.bias_add = "BiasAdd";
+    csinfo_.bias_add_grad = "BiasAddGrad";
+    csinfo_.concat = "Concat";
+    csinfo_.concatv2 = "ConcatV2";
+    csinfo_.conv2d = "Conv2D";
+    csinfo_.conv2d_with_bias = "__MklDummyConv2DWithBias";
+    csinfo_.conv2d_grad_input = "Conv2DBackpropInput";
+    csinfo_.conv2d_grad_filter = "Conv2DBackpropFilter";
+    csinfo_.conv2d_grad_filter_with_bias =
+        "__MklDummyConv2DBackpropFilterWithBias";
+    csinfo_.fused_batch_norm = "FusedBatchNorm";
+    csinfo_.fused_batch_norm_grad = "FusedBatchNormGrad";
+    csinfo_.identity = "Identity";
+    csinfo_.lrn = "LRN";
+    csinfo_.lrn_grad = "LRNGrad";
+    csinfo_.matmul = "MatMul";
+    csinfo_.max_pool = "MaxPool";
+    csinfo_.max_pool_grad = "MaxPoolGrad";
+    csinfo_.mkl_conv2d = "_MklConv2D";
+    csinfo_.mkl_conv2d_grad_input = "_MklConv2DBackpropInput";
+    csinfo_.mkl_conv2d_grad_filter = "_MklConv2DBackpropFilter";
+    csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
+    csinfo_.mkl_conv2d_grad_filter_with_bias =
+        "_MklConv2DBackpropFilterWithBias";
+    csinfo_.relu = "Relu";
+    csinfo_.relu_grad = "ReluGrad";
+    csinfo_.tanh = "Tanh";
+    csinfo_.tanh_grad = "TanhGrad";
+    csinfo_.reshape = "Reshape";
+    csinfo_.softmax = "Softmax";
+    csinfo_.split = "Split";
+    // Element-wise ops. Ensure you also add any new ops to IsOpElementWise
+    // in the MklUtil.h (IsMklElementWiseOp method) to ensure that the
+    // MklInputConversion op is added before it.
+    csinfo_.add = "Add";
+    csinfo_.maximum = "Maximum";
+    csinfo_.mul = "Mul";
+    csinfo_.squared_difference = "SquaredDifference";
+    csinfo_.sub = "Sub";
+    // End - element-wise ops. See note above.
+
+    // NOTE: names are alphabetically sorted.
+    rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn),
+                      CopyAttrsAddN, AddNRewrite});
+    rinfo_.push_back({csinfo_.add, mkl_op_registry::GetMklOpName(csinfo_.add),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.avg_pool,
+                      mkl_op_registry::GetMklOpName(csinfo_.avg_pool),
+                      CopyAttrsPooling, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.avg_pool_grad,
+                      mkl_op_registry::GetMklOpName(csinfo_.avg_pool_grad),
+                      CopyAttrsPooling, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.concat,
+                      mkl_op_registry::GetMklOpName(csinfo_.concat),
+                      CopyAttrsConcat, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.concatv2,
+                      mkl_op_registry::GetMklOpName(csinfo_.concatv2),
+                      CopyAttrsConcatV2, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.conv2d,
+                      mkl_op_registry::GetMklOpName(csinfo_.conv2d),
+                      CopyAttrsConv2D, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.conv2d_with_bias, csinfo_.mkl_conv2d_with_bias,
+                      CopyAttrsConv2D, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.conv2d_grad_filter,
+                      mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_filter),
+                      CopyAttrsConv2D, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.conv2d_grad_filter_with_bias,
+                      csinfo_.mkl_conv2d_grad_filter_with_bias, CopyAttrsConv2D,
+                      AlwaysRewrite});
+    rinfo_.push_back({csinfo_.conv2d_grad_input,
+                      mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_input),
+                      CopyAttrsConv2D, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.fused_batch_norm,
+                      mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm),
+                      CopyAttrsFusedBatchNorm, AlwaysRewrite});
+    rinfo_.push_back(
+        {csinfo_.fused_batch_norm_grad,
+         mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad),
+         CopyAttrsFusedBatchNorm, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.identity,
+                      mkl_op_registry::GetMklOpName(csinfo_.identity),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.lrn, mkl_op_registry::GetMklOpName(csinfo_.lrn),
+                      CopyAttrsLRN, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.lrn_grad,
+                      mkl_op_registry::GetMklOpName(csinfo_.lrn_grad),
+                      CopyAttrsLRN, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.max_pool,
+                      mkl_op_registry::GetMklOpName(csinfo_.max_pool),
+                      CopyAttrsPooling, NonDepthBatchWisePoolRewrite});
+    rinfo_.push_back({csinfo_.max_pool_grad,
+                      mkl_op_registry::GetMklOpName(csinfo_.max_pool_grad),
+                      CopyAttrsPooling, AlwaysRewrite});
+
+    rinfo_.push_back({csinfo_.maximum,
+                      mkl_op_registry::GetMklOpName(csinfo_.maximum),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.mul,
+                      mkl_op_registry::GetMklOpName(csinfo_.mul),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.relu, mkl_op_registry::GetMklOpName(csinfo_.relu),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.relu_grad,
+                      mkl_op_registry::GetMklOpName(csinfo_.relu_grad),
+                      CopyAttrsDataType, AlwaysRewrite});
+    /*
+    rinfo_.push_back({csinfo_.tanh,
+                      mkl_op_registry::GetMklOpName(csinfo_.tanh),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.tanh_grad,
+                      mkl_op_registry::GetMklOpName(csinfo_.tanh_grad),
+                      CopyAttrsDataType, AlwaysRewrite});
+    */
+    rinfo_.push_back({csinfo_.reshape,
+                      mkl_op_registry::GetMklOpName(csinfo_.reshape),
+                      CopyAttrsReshape, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.softmax,
+                      mkl_op_registry::GetMklOpName(csinfo_.softmax),
+                      CopyAttrsDataType, AlwaysRewrite});
+
+    rinfo_.push_back({csinfo_.squared_difference,
+                      mkl_op_registry::GetMklOpName(csinfo_.squared_difference),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.sub,
+                      mkl_op_registry::GetMklOpName(csinfo_.sub),
+                      CopyAttrsDataType, AlwaysRewrite});
+
+    // Add info about which ops to add workspace edge to and the slots.
+    wsinfo_.push_back({csinfo_.lrn, csinfo_.lrn_grad, 0, 2, 1, 3});
+    wsinfo_.push_back({csinfo_.max_pool, csinfo_.max_pool_grad, 0, 1, 1, 3});
+
+    // Add a rule for merging nodes
+    minfo_.push_back({csinfo_.conv2d, csinfo_.bias_add,
+                      csinfo_.conv2d_with_bias, GetConv2DOrBiasAdd});
+
+    minfo_.push_back({csinfo_.conv2d_grad_filter, csinfo_.bias_add_grad,
+                      csinfo_.conv2d_grad_filter_with_bias,
+                      GetConv2DBackpropFilterOrBiasAddGrad});
+  }
+
+  // Standard interface to run pass
+  Status Run(const GraphOptimizationPassOptions& options);
+
+  // Helper function which does most of heavy lifting for rewriting
+  // Mkl nodes to propagate Mkl tensor as additional output
+  //
+  // Extracts common functionality between Run public interface and
+  // test interface.
+  //
+  // @return true, if and only if graph is mutated; false otherwise.
+  bool RunPass(std::unique_ptr<Graph>* g);
+
+  /// Structure to specify the name of an original node, its new name after
+  /// rewrite, the number of inputs to the original node, the function to
+  /// be used to copy attributes for the op, and the rule (if any) which
+  /// must hold for rewriting the node
+  typedef struct {
+    string name;      // Original name of op of the node in the graph
+    string new_name;  // New name of the op of the node in the graph
+    // A function handler to copy attributes from an old node to a new node.
+    std::function<void(const Node*, NodeBuilder*)> copy_attrs;
+    // A rule under which to rewrite this node
+    std::function<bool(const Node*)> rewrite_rule;
+  } RewriteInfo;
+
+  /// Structure to specify a forward op, a backward op, and the slot numbers
+  /// in the forward and backward ops where we will add a workspace edge.
+  typedef struct {
+    string fwd_op;    // Name of a forward op in the graph
+    string bwd_op;    // Name of a backward op in the graph
+    int fwd_slot;     // Output slot in the forward op node where actual
+                      // output tensor resides
+    int bwd_slot;     // Input slot in the backward op node where actual
+                      // input tensor resides
+    int ws_fwd_slot;  // Output slot in the forward op node where workspace
+                      // edge is added
+    int ws_bwd_slot;  // Input slot in the backward op node where workspace
+                      // edge is added
+  } WorkSpaceInfo;
+
+  /// Structure to specify information used in node merge of 2 operators
+  typedef struct {
+    string op1;       // Node string for one operator.
+    string op2;       // Node string for second operator.
+    string new_node;  // Name of the node after merge
+    // Function that enables user of the node merger to specify how to find
+    // second operator given the first operator.
+    std::function<Node*(const Node*)> get_node_to_be_merged;
+  } MergeInfo;
+
+  /// Structure to store all constant strings
+  /// NOTE: names are alphabetically sorted.
+  typedef struct {
+    string addn;
+    string add;
+    string avg_pool;
+    string avg_pool_grad;
+    string bias_add;
+    string bias_add_grad;
+    string concat;
+    string concatv2;
+    string conv2d;
+    string conv2d_with_bias;
+    string conv2d_grad_input;
+    string conv2d_grad_filter;
+    string conv2d_grad_filter_with_bias;
+    string fused_batch_norm;
+    string fused_batch_norm_grad;
+    string identity;
+    string lrn;
+    string lrn_grad;
+    string matmul;
+    string max_pool;
+    string max_pool_grad;
+    string maximum;
+    string mkl_conv2d;
+    string mkl_conv2d_grad_input;
+    string mkl_conv2d_grad_filter;
+    string mkl_conv2d_grad_filter_with_bias;
+    string mkl_conv2d_with_bias;
+    string mul;
+    string relu;
+    string relu_grad;
+    string tanh;
+    string tanh_grad;
+    string reshape;
+    string softmax;
+    string split;
+    string squared_difference;
+    string sub;
+  } ConstStringsInfo;
+
+ private:
+  /// Maintain info about nodes to rewrite
+  std::vector<RewriteInfo> rinfo_;
+
+  /// Maintain info about nodes to add workspace edge
+  std::vector<WorkSpaceInfo> wsinfo_;
+
+  /// Maintain info about nodes to be merged
+  std::vector<MergeInfo> minfo_;
+
+  /// Maintain structure of constant strings
+  static ConstStringsInfo csinfo_;
+
+ private:
+  // Is OpDef::ArgDef a list type? It could be N * T or list(type).
+  // Refer to opdef.proto for details of list type.
+  inline bool ArgIsList(const OpDef::ArgDef& arg) const {
+    return !arg.type_list_attr().empty() || !arg.number_attr().empty();
+  }
+
+  // Get length of a list in 'n' if 'arg' is of list type. Refer to
+  // description of ArgIsList for definition of list type.
+  inline int GetTensorListLength(const OpDef::ArgDef& arg, Node* n) {
+    CHECK_EQ(ArgIsList(arg), true);
+    int N = 0;
+    const string attr_name = !arg.type_list_attr().empty()
+                                 ? arg.type_list_attr()
+                                 : arg.number_attr();
+    if (!arg.type_list_attr().empty()) {
+      std::vector<DataType> value;
+      TF_CHECK_OK(GetNodeAttr(n->def(), attr_name, &value));
+      N = value.size();
+    } else {
+      TF_CHECK_OK(GetNodeAttr(n->def(), attr_name, &N));
+    }
+    return N;
+  }
+
+  // Can op represented by node 'n' run on DEVICE_CPU?
+  // Op can run on CPU with MKL if the runtime assigned device or the
+  // user requested device contains device CPU, or both are empty.
+  bool CanOpRunOnCPUDevice(const Node* n) {
+    bool result = true;
+    string reason;
+
+    // Substring that should be checked for in device name for CPU device.
+    const char* const kCPUDeviceSubStr = "CPU";
+
+    // If Op has been specifically assigned to a non-CPU device, then No.
+    if (!n->assigned_device_name().empty() &&
+        !StringPiece(n->assigned_device_name()).contains(kCPUDeviceSubStr)) {
+      result = false;
+      reason = "Op has been assigned a runtime device that is not CPU.";
+    }
+
+    // If user has specifically assigned this op to a non-CPU device, then No.
+    if (!n->def().device().empty() &&
+        !StringPiece(n->def().device()).contains(kCPUDeviceSubStr)) {
+      result = false;
+      reason = "User has assigned a device that is not CPU.";
+    }
+
+    if (result == false) {
+      VLOG(1) << "MklLayoutRewritePass: Skipping rewriting of the node "
+              << n->type_string() << ", reason: " << reason;
+    }
+
+    // Otherwise Yes.
+    return result;
+  }
+
+  // Return a node that can be merged with input node 'n'
+  //
+  // @return pointer to the node if we can find such a
+  // node. Otherwise, it returns nullptr.
+  Node* CheckForNodeMerge(const Node* n) const;
+
+  // Merge node 'm' with node 'n'.
+  // Currently, we merge (1) Conv2D with BiasAdd, and (2) BiasAddGrad with
+  // Conv2DBackpropFilter.
+  //
+  // Input nodes m and n may be deleted if the call to
+  // this function is successful. Attempt to use the pointers
+  // after the call to function may result in undefined behaviors.
+  //
+  // @input g - input graph, m - graph node, n - graph node to be merged with m
+  // @return Status::OK(), if merging is successful and supported.
+  //         Returns appropriate Status error code otherwise.
+  //         Graph is updated in case nodes are merged. Otherwise, it is
+  //         not updated.
+  Status MergeNode(std::unique_ptr<Graph>* g, Node* m, Node* n);
+
+  // Helper function to merge different nodes
+  Status MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g, Node* m, Node* n);
+  Status MergeConv2DBackpropFilterWithBiasAddGrad(std::unique_ptr<Graph>* g,
+                                                  Node* m, Node* n);
+
+  // Find BiasAdd or Conv2D node that can be merged with input node 'm'.
+  // If input 'm' is BiasAdd, then check if there exists Conv2D node that can be
+  // merged with 'm'. If input 'm' is Conv2D, then check if there exists BiasAdd
+  // node that can be merged with 'm'.
+  static Node* GetConv2DOrBiasAdd(const Node* m) {
+    CHECK_NOTNULL(m);
+    Node* n = nullptr;
+
+    if (m->type_string() == csinfo_.bias_add) {
+      // If a is BiasAdd, then Conv2D is 0th input of BiasAdd.
+      TF_CHECK_OK(m->input_node(0, &n));
+    } else {
+      CHECK_EQ(m->type_string(), csinfo_.conv2d);
+      // Go over all output edges and search for BiasAdd Node.
+      // 0th input of BiasAdd is Conv2D.
+      for (const Edge* e : m->out_edges()) {
+        if (!e->IsControlEdge() &&
+            e->dst()->type_string() == csinfo_.bias_add &&
+            e->dst_input() == 0) {
+          n = e->dst();
+          break;
+        }
+      }
+    }
+
+    if (n == nullptr) {
+      VLOG(1) << "MklLayoutRewritePass: Could not find matching "
+              << "Conv2D and BiasAdd node for merging. Input node: "
+              << m->DebugString();
+    }
+
+    return n;
+  }
+
+  // Find Conv2DBackpropFilter or BiasAddGrad node that can be merged with input
+  // node 'm'. If input 'm' is Conv2DBackpropFilter, then check if there exists
+  // BiasAddGrad node that can be merged with 'm'. If input 'm' is BiasAddGrad,
+  // then check if there exists Conv2DBackpropFilter node that can be merged
+  // with 'm'.
+  //
+  // Graph that will allow us to connect Conv2DBackpropFilter with BiasAddGrad
+  // would look like:
+  //
+  // _ = Conv2DBackpropFilter(F, _, G)
+  // _ = BiasAddGrad(G)
+  //
+  // So 1st input of BiasAddGrad connects with 3rd input of
+  // Conv2DBackpropFilter and vice versa.
+  static Node* GetConv2DBackpropFilterOrBiasAddGrad(const Node* m) {
+    CHECK_NOTNULL(m);
+    Node* n = nullptr;
+
+    if (m->type_string() == csinfo_.bias_add_grad) {
+      // Get 1st input 'g' of BiasAddGrad.
+      Node* g = nullptr;
+      TF_CHECK_OK(m->input_node(0, &g));
+      // Now traverse all outgoing edges from g that have destination node as
+      // Conv2DBackpropFilter.
+      for (const Edge* e : g->out_edges()) {
+        if (!e->IsControlEdge() &&
+            e->dst()->type_string() == csinfo_.conv2d_grad_filter &&
+            e->dst_input() == 2 /* 3rd input of BackpropFilter */) {
+          n = e->dst();
+          break;
+        }
+      }
+    } else {
+      CHECK_EQ(m->type_string(), csinfo_.conv2d_grad_filter);
+      // Get 3rd input 'g' of Conv2DBackpropFilter.
+      Node* g = nullptr;
+      TF_CHECK_OK(m->input_node(2, &g));
+      // Now traverse all outgoing edges from g that have destination node as
+      // BiasAddGrad.
+      for (const Edge* e : g->out_edges()) {
+        if (!e->IsControlEdge() &&
+            e->dst()->type_string() == csinfo_.bias_add_grad &&
+            e->dst_input() == 0 /* 1st input of BiasAddGrad */) {
+          n = e->dst();
+          break;
+        }
+      }
+    }
+
+    if (n == nullptr) {
+      VLOG(1) << "MklLayoutRewritePass: Could not find matching "
+              << "Conv2DBackpropFilter and BiasAddGrad node for merging. "
+              << "Input node: " << m->DebugString();
+    }
+    return n;
+  }
+
+  // Check if the node 'n' has any applicable rewrite rule
+  // We check for 2 scenarios for rewrite.
+  //
+  // @return RewriteInfo* for the applicable rewrite rule
+  const RewriteInfo* CheckForNodeRewrite(const Node* n) const;
+
+  // Default rewrite rule to be used in scenario 1 for rewrite.
+  // @return - true (since we want to always rewrite)
+  static bool AlwaysRewrite(const Node* n) { return true; }
+
+  // Check if we are performing pooling on depth or batch. If it is, then we
+  // do not rewrite MaxPool node to Mkl version.
+  // @return - true (if it is not a depth/batch wise pooling case);
+  //           false otherwise.
+  static bool NonDepthBatchWisePoolRewrite(const Node* n) {
+    CHECK_NOTNULL(n);
+
+    string data_format_str;
+    TensorFormat data_format;
+    std::vector<int32> ksize, strides;
+    CHECK_EQ(GetNodeAttr(n->def(), "ksize", &ksize).ok(), true);
+    CHECK_EQ(GetNodeAttr(n->def(), "strides", &strides).ok(), true);
+    CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(), true);
+    CHECK_EQ(FormatFromString(data_format_str, &data_format), true);
+
+    // Condition that specifies non-batch-wise and non-depth-wise pooling.
+    if (GetTensorDim(ksize, data_format, 'N') == 1 &&
+        GetTensorDim(strides, data_format, 'N') == 1 &&
+        GetTensorDim(ksize, data_format, 'C') == 1 &&
+        GetTensorDim(strides, data_format, 'C') == 1) {
+      return true;
+    }
+
+    return false;
+  }
+
+  static bool AddNRewrite(const Node* n) {
+    CHECK_NOTNULL(n);
+
+    int num;
+    CHECK_EQ(GetNodeAttr(n->def(), "N", &num).ok(), true);
+
+    // Condition that specifies non-batch-wise and non-depth-wise pooling.
+    if (num == 2) {
+      return true;
+    }
+
+    return false;
+  }
+
+  // Rewrites input node to a new node specified by its matching rewrite info.
+  //
+  // Method first searches matching rewrite info for input node and then
+  // uses that info to rewrite.
+  //
+  // Input node may be deleted in case of rewrite. Attempt to use the node
+  // after the call can result in undefined behaviors.
+  //
+  // @input  g - input graph, n - Node to be rewritten,
+  //         ri - matching rewriteinfo
+  // @return Status::OK(), if the input node is rewritten;
+  //         Returns appropriate Status error code otherwise.
+  //         Graph is updated in case the input node is rewritten.
+  //         Otherwise, it is not updated.
+  Status RewriteNode(std::unique_ptr<Graph>* g, Node* n, const RewriteInfo* ri);
+
+  // Get nodes that will feed a list of TF tensors to the new
+  // node that we are constructing.
+  //
+  // @input g - input graph,
+  // @input inputs - inputs to old node that we are using for constructing
+  //                 new inputs,
+  // @input input_idx - the index in the 'inputs' vector pointing to the
+  //                    current input that we have processed so far
+  // @output input_idx - index will be incremented by the number of nodes
+  //                     from 'inputs' that are processed
+  // @input list_length - The expected length of list of TF tensors
+  // @output output_nodes - the list of new nodes creating TF tensors
+  //
+  // @return None
+  void GetNodesProducingTFTensorList(
+      const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+      int* input_idx, int list_length,
+      std::vector<NodeBuilder::NodeOut>* output_nodes);
+
+  // Get nodes that will feed a list of Mkl tensors to the new
+  // node that we are constructing.
+  //
+  // @input g - input graph,
+  // @input orig_node - Original node that we are rewriting
+  // @input inputs - inputs to old node that we are using for constructing
+  //                 new inputs,
+  // @input input_idx - the index in the 'inputs' vector pointing to the
+  //                    current input that we have processed so far
+  // @output input_idx - index will be incremented by the number of nodes
+  //                     from 'inputs' that are processed
+  // @input list_length - The expected length of list of Mkl tensors
+  // @output output_nodes - the list of new nodes creating Mkl tensors
+  //
+  // @return None
+  void GetNodesProducingMklTensorList(
+      std::unique_ptr<Graph>* g, Node* orig_node,
+      const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+      int* input_idx, int list_length,
+      std::vector<NodeBuilder::NodeOut>* output_nodes);
+
+  // Get a node that will feed an Mkl tensor to the new
+  // node that we are constructing. The output node could be (1) 'n'
+  // if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor
+  // if 'n' is not an Mkl layer.
+  //
+  // @input g - input graph,
+  // @input orig_node - Original node that we are rewriting,
+  // @input n - Node based on which we are creating Mkl node,
+  // @input n_output_slot - the output slot of node 'n'
+  //            which is feeding to the node that we are constructing
+  // @output mkl_node - the new node that will feed Mkl tensor
+  // @output mkl_node_output_slot - the slot number of mkl_node that
+  //                                will feed the tensor
+  // @return None
+  void GetNodeProducingMklTensor(std::unique_ptr<Graph>* g, Node* orig_node,
+                                 Node* n, int n_output_slot, Node** mkl_node,
+                                 int* mkl_node_output_slot);
+
+  // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
+  // in graph 'g'. Original node is input in 'old_node'. Inputs to 'nb' are
+  // set up in contiguous fashion. 'workspace_tensors' carry graph nodes
+  // producing workspace edges if 'are_workspace_tensors_available' is true.
+  // Otherwise, 'workspace_tensors' is empty vector.
+  //
+  // For details, refer to 'Ordering of inputs after rewriting' section in the
+  // documentation above.
+  //
+  // Returns Status::OK() if setting up inputs is successful, otherwise
+  // returns appropriate status code.
+  int SetUpContiguousInputs(
+      std::unique_ptr<Graph>* g,
+      const gtl::InlinedVector<std::pair<Node*, int>, 4>& old_node_inputs,
+      NodeBuilder* nb, Node* old_node,
+      std::vector<NodeBuilder::NodeOut>* workspace_tensors,
+      bool are_workspace_tensors_available);
+
+  // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
+  // in graph 'g'. Original node is input in 'orig_node'.
+  //
+  // For details, refer to 'Ordering of Tensorflow tensors and Mkl tensors'
+  // section in the documentation above.
+  //
+  // Returns Status::OK() if setting up inputs is successful, otherwise
+  // returns appropriate status code.
+  Status SetUpInputs(std::unique_ptr<Graph>* g,
+                     const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+                     NodeBuilder* nb, Node* orig_node);
+
+  // Add workspace edge on the input or output side of Node 'orig_node' by using
+  // NodeBuilder 'nb' for the new node provided. If 'orig_node' does not dictate
+  // adding workspace edge then do not add it. Workspace Tensorflow and Mkl
+  // tensors, if they need to be added, will be set into these tensors.
+  // If we set workspace tensors, then are_ws_tensors_added should be true.
+  void AddWorkSpaceEdgeIfNeeded(std::unique_ptr<Graph>* g, Node* orig_node,
+                                NodeBuilder* nb,
+                                std::vector<NodeBuilder::NodeOut>* ws_tensors,
+                                bool* are_ws_tensors_added);
+
+  // Functions specific to operators to copy attributes
+  // We need operator-specific function to copy attributes because the framework
+  // does not provide any generic function for it.
+  // NOTE: names are alphabetically sorted.
+  static void CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsBiasAddGrad(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsConcat(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsConcatV2(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsConv2D(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsDataType(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsReshape(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsSplit(const Node* orig_node, NodeBuilder* nb);
+
+  // Generate a graph node in graph 'g' representing a dummy Mkl tensor node,
+  // using node for original node 'orig_node' and return it in '*out'.
+  // TODO(nhasabni) We should move this to mkl_util.h
+  void GetDummyMklTensorNode(std::unique_ptr<Graph>* g, Node** out,
+                             Node* orig_node);
+  void GetDummyWorkspaceTensorNode(std::unique_ptr<Graph>* g, Node** out,
+                                   Node* orig_node);
+};
+
+MklLayoutRewritePass::ConstStringsInfo MklLayoutRewritePass::csinfo_;
+
+// We register Mkl rewrite pass for phase 1 in post partitioning group.
+// We register it here so that we get a complete picture of all users of Mkl
+// nodes. Do not change the ordering of the Mkl passes.
+const OptimizationPassRegistry::Grouping kMklLayoutRewritePassGroup =
+    OptimizationPassRegistry::POST_PARTITIONING;
+REGISTER_OPTIMIZATION(kMklLayoutRewritePassGroup, 1, MklLayoutRewritePass);
+
+//////////////////////////////////////////////////////////////////////////
+//           Helper functions for creating new node
+//////////////////////////////////////////////////////////////////////////
+
+static void FillInputs(const Node* n,
+                       gtl::InlinedVector<Node*, 4>* control_edges,
+                       gtl::InlinedVector<std::pair<Node*, int>, 4>* in) {
+  control_edges->clear();
+  for (const Edge* e : n->in_edges()) {
+    if (e->IsControlEdge()) {
+      control_edges->push_back(e->src());
+    } else {
+      (*in)[e->dst_input()] = std::make_pair(e->src(), e->src_output());
+    }
+  }
+  std::sort(control_edges->begin(), control_edges->end());
+  if (n->op_def().is_commutative()) {
+    // For commutative inputs, we sort the input by the input Node*
+    // to get a canonical ordering (so that add(a,b) and add(b, a) will
+    // hash to the same value if is_commutative is true for 'add').
+    std::sort(in->begin(), in->end());
+  }
+}
+
+void MklLayoutRewritePass::GetNodesProducingTFTensorList(
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs, int* input_idx,
+    int list_length, std::vector<NodeBuilder::NodeOut>* output_nodes) {
+  CHECK_LT(*input_idx, inputs.size());
+  CHECK_GT(list_length, 0);
+  CHECK_NOTNULL(output_nodes);
+  output_nodes->reserve(list_length);
+
+  while (list_length != 0) {
+    CHECK_GT(list_length, 0);
+    CHECK_LT(*input_idx, inputs.size());
+    Node* n = inputs[*input_idx].first;
+    int slot = inputs[*input_idx].second;
+    // If input node 'n' is just producing a single tensor at
+    // output slot 'slot' then we just add that single node.
+    output_nodes->push_back(NodeBuilder::NodeOut(n, slot));
+    (*input_idx)++;
+    list_length--;
+  }
+}
+
+// TODO(nhasabni) We should move this to mkl_util.h.
+void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
+                                                 Node** out, Node* orig_node) {
+  // We use a tensor of shape {8} and value 0,0,0,0,0,0,0,0 to represent
+  // dummy Mkl tensor. 8 = 2*size_t.
+  const DataType dt = DataTypeToEnum<uint8>::v();
+  TensorProto proto;
+  proto.set_dtype(dt);
+  uint8 zero[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+  proto.set_tensor_content(const_cast<const void*>(static_cast<void*>(&zero)),
+                           8);
+  TensorShape dummy_shape({8});
+  dummy_shape.AsProto(proto.mutable_tensor_shape());
+  TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
+                  .Attr("value", proto)
+                  .Attr("dtype", dt)
+                  .Device(orig_node->def().device())  // We place this node on
+                                                      // the same device as the
+                                                      // device of the original
+                                                      // node.
+                  .Finalize(&**g, out));
+
+  // If number of inputs to the original node is > 0, then we add
+  // control dependency between 1st input (index 0) of the original node and
+  // the dummy Mkl node. This is needed because control-flow ops such as Enter,
+  // Merge, etc, require frame_name of the dummy Mkl node to be same as the
+  // rewritten node. Adding control edge between 1st input of the original node
+  // and the dummy Mkl node ensures that the dummy node is in the same frame
+  // as the original node. Choosing 1st input is not necessary - any input of
+  // the original node is fine because all the inputs of a node are always in
+  // the same frame.
+  if (orig_node->num_inputs() > 0) {
+    Node* orig_input0 = nullptr;
+    TF_CHECK_OK(
+        orig_node->input_node(0, const_cast<const Node**>(&orig_input0)));
+    // Allow duplicate while adding control edge as it would fail (return
+    // NULL) if we try to add duplicate edge.
+    CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out, true));
+  }
+
+  (*out)->set_assigned_device_name(orig_node->assigned_device_name());
+}
+
+void MklLayoutRewritePass::GetNodesProducingMklTensorList(
+    std::unique_ptr<Graph>* g, Node* orig_node,
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs, int* input_idx,
+    int list_length, std::vector<NodeBuilder::NodeOut>* output_nodes) {
+  CHECK_LT(*input_idx, inputs.size());
+  CHECK_GT(list_length, 0);
+  CHECK_NOTNULL(output_nodes);
+  output_nodes->reserve(list_length);
+
+  while (list_length != 0) {
+    CHECK_GT(list_length, 0);
+    CHECK_LT(*input_idx, inputs.size());
+    Node* n = inputs[*input_idx].first;
+    int slot = inputs[*input_idx].second;
+    // If 'n' is producing a single tensor, then create a single Mkl tensor
+    // node.
+    Node* mkl_node = nullptr;
+    int mkl_node_output_slot = 0;
+    GetNodeProducingMklTensor(g, orig_node, n, slot, &mkl_node,
+                              &mkl_node_output_slot);
+    output_nodes->push_back(
+        NodeBuilder::NodeOut(mkl_node, mkl_node_output_slot));
+    (*input_idx)++;
+    list_length--;
+  }
+}
+
+// Get an input node that will feed Mkl tensor to the new
+// node that we are constructing. An input node could be (1) 'n'
+// if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor
+// if 'n' is not an Mkl layer.
+void MklLayoutRewritePass::GetNodeProducingMklTensor(
+    std::unique_ptr<Graph>* g, Node* orig_node, Node* n, int n_output_slot,
+    Node** mkl_node, int* mkl_node_output_slot) {
+  CHECK_NOTNULL(n);
+  CHECK_NOTNULL(mkl_node);
+  CHECK_NOTNULL(mkl_node_output_slot);
+
+  // If this is an MKL op, then it will create extra output for MKL layout.
+  DataType T;
+  if (GetNodeAttr(n->def(), "T", &T).ok() &&
+      mkl_op_registry::IsMklOp(n->type_string(), T)) {
+    // If this is an MKL op, then it will generate an edge that will receive
+    // Mkl tensor from a node.
+    // output slot number for Mkl tensor would be N+slot number of TensorFlow
+    // tensor, where N is total number of TensorFlow tensors.
+    *mkl_node = n;
+    *mkl_node_output_slot =
+        GetTensorMetaDataIndex(n_output_slot, n->num_outputs());
+  } else {
+    // If we have not visited the node and rewritten it, then we need
+    // to create a dummy node that will feed a dummy Mkl tensor to this node.
+    // DummyMklTensor node has no input and generates only 1 output
+    // (dummy Mkl tensor) as output slot number 0.
+    GetDummyMklTensorNode(g, mkl_node, orig_node);
+    CHECK_NOTNULL(*mkl_node);
+    *mkl_node_output_slot = 0;
+  }
+}
+
+int MklLayoutRewritePass::SetUpContiguousInputs(
+    std::unique_ptr<Graph>* g,
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& old_node_inputs,
+    NodeBuilder* nb, Node* old_node,
+    std::vector<NodeBuilder::NodeOut>* workspace_tensors,
+    bool are_workspace_tensors_available) {
+  CHECK_NOTNULL(workspace_tensors);
+  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+
+  // TODO(nhasabni): Temporary solution to connect filter input of
+  // BackpropInput with the converted filter from Conv2D.
+  bool do_connect_conv2d_backprop_input_filter = false;
+  Node* conv2d_node = nullptr;
+  // Filter node is 2nd input (slot index 1) of Conv2D.
+  int kConv2DFilterInputSlotIdx = 1;
+  int kConv2DBackpropInputFilterInputSlotIdx = 1;
+  int kConv2DFilterOutputSlotIdx = 1;
+  if (old_node->type_string() == csinfo_.conv2d_grad_input) {
+    // We need to find Conv2D node from Conv2DBackpropInput.
+    // For that let's first find filter node that is 2nd input (slot 1)
+    // of BackpropInput.
+    Node* filter_node = nullptr;
+    old_node->input_node(kConv2DBackpropInputFilterInputSlotIdx, &filter_node);
+    CHECK_NOTNULL(filter_node);
+
+    // Now check which nodes receive from filter_node. Filter feeds as
+    // 2nd input (slot 1) of _MklConv2D and _MklConv2DWithBias.
+    for (const Edge* e : filter_node->out_edges()) {
+      if ((e->dst()->type_string() == csinfo_.mkl_conv2d ||
+           e->dst()->type_string() == csinfo_.mkl_conv2d_with_bias) &&
+          e->dst_input() == kConv2DFilterInputSlotIdx
+          /* filter is 2nd input of Conv2D and _MklConv2D. */) {
+        if (conv2d_node != nullptr) {
+          VLOG(1) << "MklLayoutRewritePass: unusual case of same filter"
+                  << " feeding multiple Conv2D nodes: "
+                  << filter_node->DebugString();
+          // We will not connect filter input of Conv2DBackpropInput
+          // to be safe here.
+          do_connect_conv2d_backprop_input_filter = false;
+          break;
+        } else {
+          conv2d_node = e->dst();
+          do_connect_conv2d_backprop_input_filter = true;
+        }
+      }
+    }
+  }
+
+  // Number of input slots to original op
+  // Input slots are represented by .Input() calls in REGISTER_OP.
+  int old_node_input_slots = old_node->op_def().input_arg_size();
+  // Actual number of inputs can be greater than or equal to number
+  // of Input slots because inputs of type list could be unfolded.
+  CHECK_GE(old_node_inputs.size(), old_node_input_slots);
+  int nn_slot_idx = 0;  // slot index for inputs of new node
+
+  // Let's copy all inputs (TF tensors) of original node to new node.
+  int iidx = 0;
+  for (int on_slot_idx = 0; on_slot_idx < old_node_input_slots; on_slot_idx++) {
+    // An input slot could be a single tensor or a list. We need
+    // to handle this case accordingly.
+    CHECK_LT(iidx, old_node_inputs.size());
+    const OpDef::ArgDef& arg = old_node->op_def().input_arg(on_slot_idx);
+    if (ArgIsList(arg)) {
+      std::vector<NodeBuilder::NodeOut> new_node_inputs;
+      int N = GetTensorListLength(arg, old_node);
+      GetNodesProducingTFTensorList(old_node_inputs, &iidx, N,
+                                    &new_node_inputs);
+      nb->Input(new_node_inputs);
+      nn_slot_idx++;
+    } else {
+      // Special case for connecting filter input of Conv2DBackpropInput
+      if (do_connect_conv2d_backprop_input_filter &&
+          iidx == kConv2DBackpropInputFilterInputSlotIdx) {
+        nb->Input(conv2d_node, kConv2DFilterOutputSlotIdx);
+      } else {
+        nb->Input(old_node_inputs[iidx].first, old_node_inputs[iidx].second);
+      }
+      iidx++;
+      nn_slot_idx++;
+    }
+  }
+
+  // If workspace tensors are available for this op and we are using
+  // contiguous ordering then we need to add Tensorflow tensor for
+  // workspace here because Tensorflow tensor for workspace is the
+  // last tensor in the list of Tensorflow tensors.
+  if (are_workspace_tensors_available) {
+    CHECK_EQ(workspace_tensors->size(), 2);
+    // Tensorflow tensor
+    nb->Input((*workspace_tensors)[0].node, (*workspace_tensors)[0].index);
+    nn_slot_idx++;
+  }
+
+  // Let's now setup all Mkl inputs to a new node.
+  // Number of Mkl inputs must be same as number of TF inputs.
+  iidx = 0;
+  for (int on_slot_idx = 0; on_slot_idx < old_node_input_slots; on_slot_idx++) {
+    // An input slot could be a single tensor or a list. We need
+    // to handle this case accordingly.
+    CHECK_LT(iidx, old_node_inputs.size());
+    const OpDef::ArgDef& arg = old_node->op_def().input_arg(on_slot_idx);
+    if (ArgIsList(arg)) {
+      std::vector<NodeBuilder::NodeOut> new_node_inputs;
+      int N = GetTensorListLength(arg, old_node);
+      GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx, N,
+                                     &new_node_inputs);
+      nb->Input(new_node_inputs);
+      nn_slot_idx++;
+    } else {
+      Node* mkl_node = nullptr;
+      int mkl_node_output_slot = 0;
+      // Special case for connecting filter input of Conv2DBackpropInput
+      if (do_connect_conv2d_backprop_input_filter &&
+          iidx == kConv2DBackpropInputFilterInputSlotIdx) {
+        GetNodeProducingMklTensor(g, old_node, conv2d_node,
+                                  kConv2DFilterOutputSlotIdx, &mkl_node,
+                                  &mkl_node_output_slot);
+      } else {
+        GetNodeProducingMklTensor(g, old_node, old_node_inputs[iidx].first,
+                                  old_node_inputs[iidx].second, &mkl_node,
+                                  &mkl_node_output_slot);
+      }
+      nb->Input(mkl_node, mkl_node_output_slot);
+      iidx++;
+      nn_slot_idx++;
+    }
+  }
+
+  // If workspace tensors are available for this op and we are using
+  // contiguous ordering then we need to add Mkl tensor for
+  // workspace here because Mkl tensor for workspace is the
+  // last tensor in the list of Mkl tensors.
+  if (are_workspace_tensors_available) {
+    CHECK_EQ(workspace_tensors->size(), 2);
+    // Mkl tensor
+    nb->Input((*workspace_tensors)[1].node, (*workspace_tensors)[1].index);
+    nn_slot_idx++;
+  }
+
+  return nn_slot_idx;
+}
+
+Status MklLayoutRewritePass::SetUpInputs(
+    std::unique_ptr<Graph>* g,
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& old_node_inputs,
+    NodeBuilder* nb, Node* old_node) {
+  // Let's check if we need to add workspace tensors for this node.
+  // We add workspace edge only for MaxPool, LRN and BatchNorm.
+  std::vector<NodeBuilder::NodeOut> workspace_tensors;
+  bool are_workspace_tensors_available = false;
+  AddWorkSpaceEdgeIfNeeded(g, old_node, nb, &workspace_tensors,
+                           &are_workspace_tensors_available);
+
+  int new_node_input_slots = 0;
+  if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
+    // TODO(nhasabni): implement this function just for same of completion.
+    // We do not use interleaved ordering right now.
+    return Status(
+        error::Code::UNIMPLEMENTED,
+        "Interleaved ordering of tensors is currently not supported.");
+  } else {
+    CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+    new_node_input_slots = SetUpContiguousInputs(
+        g, old_node_inputs, nb, old_node, &workspace_tensors,
+        are_workspace_tensors_available);
+  }
+
+  // Sanity check
+  int old_node_input_slots = old_node->op_def().input_arg_size();
+  if (!are_workspace_tensors_available) {
+    // If we are not adding workspace tensors for this op, then the total
+    // number of input slots to the new node _must_ be 2 times the number
+    // of input slots to the original node: N original Tensorflow tensors and
+    // N for Mkl tensors corresponding to each Tensorflow tensors.
+    CHECK_EQ(new_node_input_slots, old_node_input_slots * 2);
+  } else {
+    // If we are adding workspace tensors for this op, then the total
+    // The total number of input slots to new node _must_ be 2 times the number
+    // of input slots to the original node: N original Tensorflow tensors and
+    // N for Mkl tensors corresponding to each Tensorflow tensors plus 2
+    // (for workspace Tensorflow tensor and workspace Mkl tensor).
+    CHECK_EQ(new_node_input_slots, old_node_input_slots * 2 + 2);
+  }
+
+  return Status::OK();
+}
+
+//////////////////////////////////////////////////////////////////////////
+//           Helper functions related to workspace pass
+//////////////////////////////////////////////////////////////////////////
+
+// TODO(nhasabni) We should move this to mkl_util.h.
+void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
+    std::unique_ptr<Graph>* g, Node** out, Node* orig_node) {
+  // We use a tensor of shape {1} and value 0 to represent
+  // dummy float tensor. We need this as a dummy workspace tensor.
+  // Workspace tensor has type uint8.
+  const DataType dt = DataTypeToEnum<uint8>::v();
+  TensorProto proto;
+  proto.set_dtype(dt);
+  float zero[1] = {0};
+  proto.set_tensor_content(const_cast<const void*>(static_cast<void*>(&zero)),
+                           4);
+  TensorShape dummy_shape({1});
+  dummy_shape.AsProto(proto.mutable_tensor_shape());
+  TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
+                  .Attr("value", proto)
+                  .Attr("dtype", dt)
+                  .Device(orig_node->def().device())  // We place this node on
+                                                      // same the device as the
+                                                      // device of the original
+                                                      // node.
+                  .Finalize(&**g, out));
+
+  // If number of inputs to the original node is > 0, then we add
+  // control dependency between 1st input (index 0) of the original node and
+  // the dummy Mkl node. This is needed because control-flow ops such as Enter,
+  // Merge, etc, require frame_name of the dummy Mkl node to be same as the
+  // rewritten node. Adding control edge between 1st input of the original node
+  // and the dummy Mkl node ensures that the dummy node is in the same frame
+  // as the original node. Choosing 1st input is not necessary - any input of
+  // the original node is fine because all the inputs of a node are always in
+  // the same frame.
+  if (orig_node->num_inputs() > 0) {
+    Node* orig_input0 = nullptr;
+    TF_CHECK_OK(
+        orig_node->input_node(0, const_cast<const Node**>(&orig_input0)));
+    // Allow duplicate while adding control edge as it would fail (return
+    // NULL) if we try to add duplicate edge.
+    CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out, true));
+  }
+
+  (*out)->set_assigned_device_name(orig_node->assigned_device_name());
+}
+
+void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
+    std::unique_ptr<Graph>* g, Node* orig_node, NodeBuilder* nb,
+    std::vector<NodeBuilder::NodeOut>* ws_tensors, bool* are_ws_tensors_added) {
+  bool workspace_edge_added = false;  // Default initializer
+  CHECK_NOTNULL(are_ws_tensors_added);
+  *are_ws_tensors_added = false;  // Default initializer
+
+  DataType T;
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  for (auto ws : wsinfo_) {
+    if (orig_node->type_string() == ws.fwd_op &&
+        mkl_op_registry::IsMklOp(
+            mkl_op_registry::GetMklOpName(orig_node->type_string()), T)) {
+      // If this op is a fwd op, then we need to check if there is an
+      // edge from this node's fwd_slot to bwdop's bwd_slot. If there is
+      // an edge, then we just add an attribute on this node for setting
+      // workspace_passed to true. We don't add actual workspace edge
+      // in this node. Actual workspace edge gets added in the backward
+      // op for this node.
+      for (const Edge* e : orig_node->out_edges()) {
+        if (e->src_output() == ws.fwd_slot &&
+            e->dst()->type_string() == ws.bwd_op &&
+            e->dst_input() == ws.bwd_slot) {
+          nb->Attr("workspace_enabled", true);
+          VLOG(1) << "MklLayoutRewritePass: workspace_enabled for "
+                  << orig_node->type_string();
+          workspace_edge_added = true;
+          // We found the edge that we were looking for, so break.
+          break;
+        }
+      }
+
+      if (!workspace_edge_added) {
+        // If we are here, then we did not find backward operator for this
+        // node.
+        nb->Attr("workspace_enabled", false);
+      }
+    } else if (orig_node->type_string() == ws.bwd_op &&
+               mkl_op_registry::IsMklOp(
+                   mkl_op_registry::GetMklOpName(orig_node->type_string()),
+                   T)) {
+      // If this op is a bwd op, then we need to add workspace edge and
+      // it's Mkl tensor edge between its corresponding fwd op and this
+      // op. Corresponding fwd op is specified in 'fwd_op' field of
+      // workspace info. fwd_slot and bwd_slot in workspace info specify
+      // an edge between which slots connect forward and backward op.
+      // Once all these criteria match, we add a workspace edge between
+      // ws_fwd_slot and ws_bwd_slot. Its corresponding Mkl tensor is
+      // determined by interleaved/contiguous ordering. Function
+      // DataIndexToMetaDataIndex tells us the location of Mkl tensor
+      // from the location of the Tensorflow tensor.
+      for (const Edge* e : orig_node->in_edges()) {
+        if (e->src_output() == ws.fwd_slot &&
+            // We would have rewritten the forward op, so we need to use
+            // GetMklOpName call to get its Mkl name.
+            e->src()->type_string() ==
+                mkl_op_registry::GetMklOpName(ws.fwd_op) &&
+            e->dst_input() == ws.bwd_slot) {
+          nb->Attr("workspace_enabled", true);
+          CHECK_NOTNULL(ws_tensors);
+          // Add workspace edge between fwd op and bwd op.
+          ws_tensors->push_back(NodeBuilder::NodeOut(e->src(), ws.ws_fwd_slot));
+          // Add Mkl tensor edge for workspace edge between fwd op and bwd op.
+          ws_tensors->push_back(NodeBuilder::NodeOut(
+              e->src(), DataIndexToMetaDataIndex(ws.ws_fwd_slot,
+                                                 e->src()->num_outputs())));
+          *are_ws_tensors_added = true;
+          // In terms of input ordering, we add these calls to add Input
+          // here because workspace edge (and its Mkl tensor) is the last
+          // edge in the fwdop and bwdop. So all inputs before workspace
+          // tensor have been added by SetUpInputs function.
+          VLOG(1) << "MklLayoutRewritePass: workspace_enabled for "
+                  << orig_node->type_string();
+          workspace_edge_added = true;
+          // We found the edge that we were looking for, so break.
+          break;
+        }
+      }
+
+      // If we are here means we did not find fwd op that feeds to this
+      // bwd op. So in this case, we need to generate dummy tensors for
+      // workspace input and Mkl tensor for workspace, and set
+      // workspace_enabled to false.
+      if (!workspace_edge_added) {
+        nb->Attr("workspace_enabled", false);
+        Node* dmt_ws = nullptr;      // Dummy tensor for workspace
+        Node* dmt_mkl_ws = nullptr;  // Dummy Mkl tensor for workspace
+        GetDummyWorkspaceTensorNode(g, &dmt_ws, orig_node);
+        GetDummyMklTensorNode(g, &dmt_mkl_ws, orig_node);
+        CHECK_NOTNULL(dmt_ws);
+        CHECK_NOTNULL(dmt_mkl_ws);
+        CHECK_NOTNULL(ws_tensors);
+        // We add dummy tensor as workspace tensor.
+        ws_tensors->push_back(NodeBuilder::NodeOut(dmt_ws, 0));
+        // We add dummy tensor as Mkl tensor for workspace tensor.
+        ws_tensors->push_back(NodeBuilder::NodeOut(dmt_mkl_ws, 0));
+        *are_ws_tensors_added = true;
+        VLOG(1) << "MklLayoutRewritePass: dummy workspace_enabled for "
+                << orig_node->type_string();
+      }
+    } else {
+      // If this node does not match any workspace info, then we do not
+      // do anything special for workspace propagation for it.
+    }
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Op-specific functions to copy attributes from old node to new node
+//////////////////////////////////////////////////////////////////////////
+
+void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orig_node,
+                                           NodeBuilder* nb) {
+  DataType T;
+  string data_format;
+  string padding;
+  std::vector<int32> strides;
+  bool use_cudnn_on_gpu;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+  TF_CHECK_OK(
+      GetNodeAttr(orig_node->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("strides", strides);
+  nb->Attr("padding", padding);
+  nb->Attr("data_format", data_format);
+  nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
+}
+
+void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node,
+                                         NodeBuilder* nb) {
+  DataType T;
+  int N;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("N", N);
+}
+
+void MklLayoutRewritePass::CopyAttrsBiasAddGrad(const Node* orig_node,
+                                                NodeBuilder* nb) {
+  DataType T;
+  string data_format;
+  std::vector<int32> strides;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("strides", strides);
+  nb->Attr("data_format", data_format);
+}
+
+void MklLayoutRewritePass::CopyAttrsLRN(const Node* orig_node,
+                                        NodeBuilder* nb) {
+  DataType T;
+  int depth_radius;
+  float bias;
+  float alpha;
+  float beta;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "depth_radius", &depth_radius));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "bias", &bias));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "alpha", &alpha));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "beta", &beta));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("depth_radius", depth_radius);
+  nb->Attr("bias", bias);
+  nb->Attr("alpha", alpha);
+  nb->Attr("beta", beta);
+}
+
+void MklLayoutRewritePass::CopyAttrsPooling(const Node* orig_node,
+                                            NodeBuilder* nb) {
+  DataType T;
+  string data_format;
+  string padding;
+  std::vector<int32> ksize, strides;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "ksize", &ksize));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("ksize", ksize);
+  nb->Attr("strides", strides);
+  nb->Attr("padding", padding);
+  nb->Attr("data_format", data_format);
+}
+
+void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node,
+                                             NodeBuilder* nb) {
+  DataType T;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+}
+
+void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node,
+                                            NodeBuilder* nb) {
+  DataType T;
+  DataType Tshape;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tshape", &Tshape));
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("Tshape", Tshape);
+}
+
+void MklLayoutRewritePass::CopyAttrsSplit(const Node* orig_node,
+                                          NodeBuilder* nb) {
+  DataType T;
+  string data_format;
+  int num_split;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "num_split", &num_split));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("num_split", num_split);
+  nb->Attr("data_format", data_format);
+}
+
+void MklLayoutRewritePass::CopyAttrsConcat(const Node* orig_node,
+                                           NodeBuilder* nb) {
+  DataType T;
+  int N;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("N", N);
+}
+
+void MklLayoutRewritePass::CopyAttrsConcatV2(const Node* orig_node,
+                                             NodeBuilder* nb) {
+  DataType T;
+  int N;
+  DataType tidx;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tidx", &tidx));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("N", N);
+  nb->Attr("Tidx", tidx);
+}
+
+void MklLayoutRewritePass::CopyAttrsFusedBatchNorm(const Node* orig_node,
+                                                   NodeBuilder* nb) {
+  DataType T;
+  float epsilon;
+  string data_format;
+  bool is_training;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "epsilon", &epsilon));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "is_training", &is_training));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("epsilon", epsilon);
+  nb->Attr("data_format", data_format);
+  nb->Attr("is_training", is_training);
+}
+
+//////////////////////////////////////////////////////////////////////////
+//           Helper functions related to node merge pass
+//////////////////////////////////////////////////////////////////////////
+
+Node* MklLayoutRewritePass::CheckForNodeMerge(const Node* a) const {
+  // TODO(nhasabni) Add check for type of node similar to CheckForNodeRewrite
+  // once we support BiasAddGrad as Mkl layer.
+
+  // Search for all matching mergeinfo.
+  // We allow more than one match for extensibility.
+  std::vector<const MergeInfo*> matching_mi;
+  for (auto mi = minfo_.cbegin(); mi != minfo_.cend(); ++mi) {
+    if (a->type_string() == mi->op1 || a->type_string() == mi->op2) {
+      matching_mi.push_back(&*mi);
+    }
+  }
+
+  for (const MergeInfo* mi : matching_mi) {
+    // Get the operand with which 'a' can be merged.
+    Node* b = nullptr;
+    if ((b = mi->get_node_to_be_merged(a)) == nullptr) {
+      continue;
+    }
+
+    // Get the control edges and input of node
+    const int N_in = a->num_inputs();
+    gtl::InlinedVector<Node*, 4> a_control_edges;
+    gtl::InlinedVector<std::pair<Node*, int>, 4> a_in(N_in);
+    FillInputs(a, &a_control_edges, &a_in);
+
+    const int B_in = b->num_inputs();
+    gtl::InlinedVector<Node*, 4> b_control_edges;
+    gtl::InlinedVector<std::pair<Node*, int>, 4> b_in(B_in);
+    FillInputs(b, &b_control_edges, &b_in);
+
+    // Shouldn't merge if a and b have different control edges.
+    if (a_control_edges != b_control_edges) {
+      continue;
+    } else {
+      // We found a match.
+      return b;
+    }
+  }
+
+  return nullptr;
+}
+
+Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
+                                                    Node* m, Node* n) {
+  CHECK_EQ(((m->type_string() == csinfo_.bias_add &&
+             n->type_string() == csinfo_.conv2d)) ||
+               ((n->type_string() == csinfo_.bias_add &&
+                 m->type_string() == csinfo_.conv2d)),
+           true);
+
+  // If 'm' is BiasAdd, then 'n' is Conv2D. Since Conv2D feeds BiasAdd,
+  // BiasAdd is successor node, and Conv2D predecessor node.
+  Node* pred = m->type_string() == csinfo_.bias_add ? n : m;
+  Node* succ = m->type_string() == csinfo_.bias_add ? m : n;
+
+  // 1. Get all attributes from input nodes.
+  DataType T_pred, T_succ;
+  string padding;
+  std::vector<int32> strides;
+  string data_format_pred, data_format_succ;
+  bool use_cudnn_on_gnu;
+  TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
+  TF_CHECK_OK(GetNodeAttr(pred->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(pred->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(pred->def(), "data_format", &data_format_pred));
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
+  TF_CHECK_OK(GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu));
+  // We check to ensure that data formats of both succ and pred are same.
+  // We expect them to be same, so we can enforce this as assert.
+  // But assert can be too strict, so we enforce this as a check.
+  // If the check fails, then we do not merge two nodes.
+  // We also do same check for devices.
+  if (data_format_pred != data_format_succ || T_pred != T_succ ||
+      pred->assigned_device_name() != succ->assigned_device_name() ||
+      pred->def().device() != succ->def().device()) {
+    return Status(error::Code::INVALID_ARGUMENT,
+                  "data_format or T attribute or devices of Conv2D and "
+                  "BiasAdd do not match. Will skip node merge optimization");
+  }
+
+  const int succ_num = succ->num_inputs();
+  gtl::InlinedVector<Node*, 4> succ_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> succ_in(succ_num);
+  FillInputs(succ, &succ_control_edges, &succ_in);
+
+  const int pred_num = pred->num_inputs();
+  gtl::InlinedVector<Node*, 4> pred_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> pred_in(pred_num);
+  FillInputs(pred, &pred_control_edges, &pred_in);
+
+  // We need to ensure that Conv2D only feeds to BiasAdd (some other operator is
+  // not expecting output of Conv2D). If this is not the case, then we cannot
+  // merge Conv2D with BiasAdd.
+  const int kFirstOutputSlot = 0;
+  for (const Edge* e : pred->out_edges()) {
+    if (e->src_output() == kFirstOutputSlot && e->dst() != succ) {
+      return Status(error::Code::INVALID_ARGUMENT,
+                    "Conv2D does not feed to BiasAdd, or "
+                    "it feeds BiasAdd but has multiple outputs. "
+                    "Will skip node merge optimization");
+    }
+  }
+
+  // 2. Get inputs from both the nodes.
+  // Find the 2 inputs from the conv and the bias from the add Bias.
+  // Get operand 0, 1 of conv2D.
+  CHECK_EQ(pred->in_edges().size(), 2);  // Conv2D must have 2 inputs.
+  // Get operand 1 of add_bias
+  // BiasAdd must have 2 inputs: Conv, bias
+  CHECK_EQ(succ->in_edges().size(), 2);
+
+  // We will use the node name of BiasAdd as the name of new node
+  // Build new node. We use same name as original node, but change the op
+  // name.
+  NodeBuilder nb(succ->name(), csinfo_.conv2d_with_bias);
+  nb.Input(pred_in[0].first, pred_in[0].second);  // In1 of Conv2D
+  // pred_in[1] will be 2nd Tensorflow tensor for Conv2D.
+  nb.Input(pred_in[1].first, pred_in[1].second);  // In2 of Conv2D
+  // In1 of BiasAdd is same as output of Conv2D.
+  nb.Input(succ_in[1].first, succ_in[1].second);  // In2 of BiasAdd
+
+  // Copy attributes from Conv2D to Conv2DWithBias.
+  CopyAttrsConv2D(const_cast<const Node*>(pred), &nb);
+
+  // Copy the device assigned to old node to new node.
+  nb.Device(succ->def().device());
+
+  // Create node.
+  Node* new_node;
+  nb.Finalize(&**g, &new_node);
+  CHECK_NOTNULL(new_node);
+
+  // Incoming data edges from 'pred' node and 'succ' node to new 'new_node'
+  // node are already copied in BuildNode. We handle control edges now.
+  for (const Edge* e : pred->in_edges()) {
+    if (e->IsControlEdge()) {
+      // Allow duplicate while adding control edge as it would fail (return
+      // NULL) if we try to add duplicate edge.
+      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node, true));
+    }
+  }
+  for (const Edge* e : succ->in_edges()) {
+    if (e->IsControlEdge()) {
+      // Allow duplicate while adding control edge as it would fail (return
+      // NULL) if we try to add duplicate edge.
+      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node, true));
+    }
+  }
+
+  // Incoming edges are fixed, we will fix the outgoing edges now.
+  // First, we will fix outgoing control edges from 'pred' node.
+  for (const Edge* e : pred->out_edges()) {
+    if (e->IsControlEdge()) {
+      // Allow duplicate while adding control edge as it would fail (return
+      // NULL) if we try to add duplicate edge.
+      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst(), true));
+    }
+  }
+
+  // Second, we will fix outgoing control and data edges from 'succ' node.
+  for (const Edge* e : succ->out_edges()) {
+    if (e->IsControlEdge()) {
+      // Allow duplicate while adding control edge as it would fail (return
+      // NULL) if we try to add duplicate edge.
+      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst(), true));
+    } else {
+      // BiasAdd has only 1 output (at slot 0) and merged node also has only 1
+      // output (at slot 0).
+      const int kConv2DWithBiasOutputSlot = 0;
+      CHECK_NOTNULL((*g)->AddEdge(new_node, kConv2DWithBiasOutputSlot, e->dst(),
+                                  e->dst_input()));
+    }
+  }
+
+  // Copy device assigned to old node to new node.
+  // It's ok to use pred or succ as we have enforced a check that
+  // both have same device assigned.
+  new_node->set_assigned_device_name(pred->assigned_device_name());
+
+  VLOG(1) << "MklLayoutRewritePass: Merged old node:" << pred->DebugString()
+          << ", and node: " << succ->DebugString()
+          << ", into node:" << new_node->DebugString();
+
+  (*g)->RemoveNode(succ);
+  (*g)->RemoveNode(pred);
+
+  return Status::OK();
+}
+
+Status MklLayoutRewritePass::MergeConv2DBackpropFilterWithBiasAddGrad(
+    std::unique_ptr<Graph>* g, Node* m, Node* n) {
+  CHECK_EQ(((m->type_string() == csinfo_.bias_add_grad &&
+             n->type_string() == csinfo_.conv2d_grad_filter)) ||
+               ((n->type_string() == csinfo_.bias_add_grad &&
+                 m->type_string() == csinfo_.conv2d_grad_filter)),
+           true);
+
+  // If 'm' is BiasAddGrad, then 'n' is BackpropFilter.
+  Node* badd = m->type_string() == csinfo_.bias_add_grad ? m : n;
+  Node* fltr = m->type_string() == csinfo_.bias_add_grad ? n : m;
+
+  // Sanity check for attributes from input nodes.
+  DataType T_b, T_f;
+  string data_format_b, data_format_f;
+  TF_CHECK_OK(GetNodeAttr(badd->def(), "T", &T_b));
+  TF_CHECK_OK(GetNodeAttr(fltr->def(), "T", &T_f));
+  TF_CHECK_OK(GetNodeAttr(badd->def(), "data_format", &data_format_b));
+  TF_CHECK_OK(GetNodeAttr(fltr->def(), "data_format", &data_format_f));
+  if (data_format_b != data_format_f || T_b != T_f ||
+      badd->assigned_device_name() != fltr->assigned_device_name() ||
+      badd->def().device() != fltr->def().device()) {
+    return Status(error::Code::INVALID_ARGUMENT,
+                  "data_format or T attribute or devices of "
+                  "Conv2DBackpropFilter and BiasAddGrad do not match. "
+                  "Will skip node merge optimization");
+  }
+
+  // We will use the node name of Conv2DBackpropFilter as the name of new node.
+  // This is because BackpropFilterWithBias is going to emit bias output also.
+  NodeBuilder nb(fltr->name(), csinfo_.conv2d_grad_filter_with_bias);
+  // Since Conv2DBackpropFilterWithBias has same number of inputs as
+  // Conv2DBackpropFilter, we can just copy input edges directly. We dont need
+  // to copy any data input of BiasAddGrad because that input also goes to
+  // Conv2DBackpropFilter.
+  const int fltr_ins = fltr->num_inputs();
+  gtl::InlinedVector<Node*, 4> fltr_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> fltr_in_edges(fltr_ins);
+  FillInputs(fltr, &fltr_control_edges, &fltr_in_edges);
+  for (int idx = 0; idx < fltr_ins; idx++) {
+    nb.Input(fltr_in_edges[idx].first, fltr_in_edges[idx].second);
+  }
+
+  // Copy attributes from Conv2DBackpropFilter.
+  CopyAttrsConv2D(const_cast<const Node*>(fltr), &nb);
+
+  // Copy the device assigned to old node to new node.
+  nb.Device(fltr->def().device());
+
+  // Create node.
+  Node* new_node;
+  nb.Finalize(&**g, &new_node);
+  CHECK_NOTNULL(new_node);
+
+  // Incoming data edges from BiasAddGrad node and Conv2DBackpropFilter node to
+  // new 'new_node' node are already copied in BuildNode. We handle control
+  // edges now.
+  for (const Edge* e : badd->in_edges()) {
+    if (e->IsControlEdge()) {
+      // Allow duplicate while adding control edge as it would fail (return
+      // NULL) if we try to add duplicate edge.
+      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node, true));
+    }
+  }
+  for (const Edge* e : fltr->in_edges()) {
+    if (e->IsControlEdge()) {
+      // Allow duplicate while adding control edge as it would fail (return
+      // NULL) if we try to add duplicate edge.
+      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node, true));
+    }
+  }
+
+  // Incoming edges are fixed, we will fix the outgoing edges now.
+  // First, we will fix outgoing control edges from 'badd' node.
+  // Conv2DBackpropFilter has 1 output -- filter_grad.
+  // Conv2DBackpropFilterWithBias has 2 outputs -- filter_grad and
+  // bias_grad. But filter_grad is at same slot number (0) in both the
+  // nodes. bias_grad is at slot number 1 in Conv2DBackpropFilterWithBias, while
+  // it is at slot number 0 in BiasAddGrad.
+  const int kMergedNodeFilterGradOutputIdx = 0;
+  const int kMergedNodeBiasGradOutputIdx = 1;
+
+  for (const Edge* e : badd->out_edges()) {
+    if (e->IsControlEdge()) {
+      // Allow duplicate while adding control edge as it would fail (return
+      // NULL) if we try to add duplicate edge.
+      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst(), true));
+    } else {
+      CHECK_NOTNULL((*g)->AddEdge(new_node, kMergedNodeBiasGradOutputIdx,
+                                  e->dst(), e->dst_input()));
+    }
+  }
+
+  // Second, we will fix outgoing control and data edges from 'fltr' node.
+  for (const Edge* e : fltr->out_edges()) {
+    if (e->IsControlEdge()) {
+      // We allow duplicate edge for this case since we already add control
+      // edge from new_node in line 3990. Line below could be adding same
+      // edge to same destination again. In such case, if we do not allow
+      // duplicate edge, then this call will fail.
+      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst(), true));
+    } else {
+      CHECK_NOTNULL((*g)->AddEdge(new_node, kMergedNodeFilterGradOutputIdx,
+                                  e->dst(), e->dst_input()));
+    }
+  }
+
+  // Copy device assigned to old node to new node.
+  // It's ok to use badd or fltr as we have enforced a check that
+  // both have same device assigned.
+  new_node->set_assigned_device_name(badd->assigned_device_name());
+
+  VLOG(1) << "MklLayoutRewritePass: Merged old node:" << badd->DebugString()
+          << ", and node: " << fltr->DebugString()
+          << ", into node:" << new_node->DebugString();
+
+  (*g)->RemoveNode(badd);
+  (*g)->RemoveNode(fltr);
+
+  return Status::OK();
+}
+
+Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* m,
+                                       Node* n) {
+  CHECK_NOTNULL(m);
+  CHECK_NOTNULL(n);
+
+  if (((m->type_string() == csinfo_.bias_add &&
+        n->type_string() == csinfo_.conv2d)) ||
+      ((n->type_string() == csinfo_.bias_add &&
+        m->type_string() == csinfo_.conv2d))) {
+    return this->MergeConv2DWithBiasAdd(g, m, n);
+  }
+
+  if (((m->type_string() == csinfo_.bias_add_grad &&
+        n->type_string() == csinfo_.conv2d_grad_filter)) ||
+      ((n->type_string() == csinfo_.bias_add_grad &&
+        m->type_string() == csinfo_.conv2d_grad_filter))) {
+    return this->MergeConv2DBackpropFilterWithBiasAddGrad(g, m, n);
+  }
+
+  return Status(error::Code::UNIMPLEMENTED,
+                "Unimplemented case for node merge optimization.");
+}
+
+//////////////////////////////////////////////////////////////////////////
+//           Helper functions for node rewrite
+//////////////////////////////////////////////////////////////////////////
+
+Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
+                                         Node* orig_node,
+                                         const RewriteInfo* ri) {
+  CHECK_NOTNULL(ri);
+  CHECK_NOTNULL(orig_node);
+
+  VLOG(1) << "MklLayoutRewritePass: Original node:" << orig_node->DebugString();
+
+  // Get all inputs.
+  int num_inputs = orig_node->in_edges().size();
+
+  // Drop count for control edges from inputs
+  for (const Edge* e : orig_node->in_edges()) {
+    if (e->IsControlEdge()) {
+      num_inputs--;
+    }
+  }
+
+  gtl::InlinedVector<Node*, 4> control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> inputs(num_inputs);
+  FillInputs(orig_node, &control_edges, &inputs);
+
+  // Build new node. We use same name as original node, but change the op name.
+  NodeBuilder nb(orig_node->name().c_str(), ri->new_name.c_str());
+  // Copy user-specified device assigned to original node to new node.
+  nb.Device(orig_node->def().device());
+  // Set up new inputs to the rewritten node.
+  Status s = SetUpInputs(g, inputs, &nb, orig_node);
+  if (s != Status::OK()) {
+    return s;
+  }
+
+  ri->copy_attrs(const_cast<const Node*>(orig_node), &nb);
+  // Set the Mkl layer label for this op.
+  nb.Attr("_kernel", mkl_op_registry::kMklOpLabel);
+
+  // Finalize graph and get new node.
+  Node* new_node = nullptr;
+  TF_CHECK_OK(nb.Finalize(&**g, &new_node));
+  CHECK_NOTNULL(new_node);
+
+  // Incoming data edges from 'orig_node' node to new 'new_node' node are
+  // already copied in BuildNode. We need to handle control edges now.
+  for (const Edge* e : orig_node->in_edges()) {
+    if (e->IsControlEdge()) {
+      // Allow duplicate while adding control edge as it would fail (return
+      // NULL) if we try to add duplicate edge.
+      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node, true));
+    }
+  }
+
+  // Copy outgoing edges from 'orig_node' node to new
+  // 'new_node' node, since the output also follows same ordering among
+  // Tensorflow tensors and Mkl tensors. We need to connect Tensorflow
+  // tensors appropriately. Specifically, nth output of the original node
+  // will become 2*nth output of the Mkl node for the interleaved ordering
+  // of the tensors. For the contiguous ordering of the tensors, it will be n.
+  // GetTensorDataIndex provides this mapping function.
+  for (const Edge* e : orig_node->out_edges()) {
+    if (e->IsControlEdge()) {
+      // Allow duplicate while adding control edge as it would fail (return
+      // NULL) if we try to add duplicate edge.
+      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst(), true));
+    } else {
+      CHECK_NOTNULL((*g)->AddEdge(
+          new_node,
+          GetTensorDataIndex(e->src_output(), e->src()->num_outputs()),
+          e->dst(), e->dst_input()));
+    }
+  }
+
+  // Copy the runtime device assigned from original code to new node.
+  new_node->set_assigned_device_name(orig_node->assigned_device_name());
+
+  // Delete original node and mark new node as rewritten.
+  (*g)->RemoveNode(orig_node);
+
+  VLOG(1) << "MklLayoutRewritePass: New node:" << new_node->DebugString();
+  return Status::OK();
+}
+
+const MklLayoutRewritePass::RewriteInfo*
+MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
+  CHECK_NOTNULL(n);
+
+  // First check if node along with its type is supported by MKL layer.
+  // We do not want to rewrite an op into Mkl op if types are not supported.
+  // E.g., MklRelu does not support INT32. So we cannot rewrite Relu to
+  // MklRelu if type is INT32.
+  DataType T;
+  if (!GetNodeAttr(n->def(), "T", &T).ok()) {
+    return nullptr;
+  }
+
+  // We make an exception for __MklDummyConv2DWithBias and
+  // __MklConv2DBackpropFilterWithBias since their names do not match Mkl node
+  // names.
+  if (n->type_string() != csinfo_.conv2d_with_bias &&
+      n->type_string() != csinfo_.conv2d_grad_filter_with_bias &&
+      !mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()),
+                                T)) {
+    return nullptr;
+  }
+
+  // For elementwise node, we reuse the Eigen implementation and pass the MKL
+  // metadata tensor through so we can avoid conversions. However, if all
+  // incoming edges are in TF format, we don't need all this overhead, so
+  // replace the elementwise node only if at least one of its parents is a MKL
+  // node.
+  //
+  // Identity nodes can also skip replacement if they are not being served by
+  // any MKL nodes.
+  //
+  // TODO(vrane): Add implementation for element-wise ops that doesn't reuse
+  // eigen code to reduce cross-library dependency.
+  VLOG(1) << "ELEMENTWISE: checking op: " << n->type_string();
+  if (mkl_op_registry::IsMklElementWiseOp(
+          mkl_op_registry::GetMklOpName(n->type_string()), T) ||
+      n->type_string().find("Identity") != string::npos) {
+    VLOG(1) << "ELEMENTWISE: op is elementwise: " << n->type_string();
+    bool incoming_mkl_edge = false;
+    int num_parent = 0;
+    for (auto parent : n->in_edges()) {
+      if (mkl_op_registry::IsMklOp(parent->src()->type_string(), T)) {
+        VLOG(1) << "ELEMENTWISE: parent " << num_parent++
+                << " is MKL op: " << parent->src()->type_string();
+        incoming_mkl_edge = true;
+        break;
+      } else {
+        VLOG(1) << "ELEMENTWISE: parent " << num_parent++
+                << " is NON-MKL op: " << parent->src()->type_string();
+      }
+    }
+    if (incoming_mkl_edge == false) {
+      VLOG(1) << "ELEMENTWISE: Skipping replacement of elementwise node which "
+                 "has no MKL "
+                 "parents.";
+      return nullptr;
+    } else {
+      VLOG(1) << "ELEMENTWISE: Replacing elementwise node " << n->type_string()
+              << " which has MKL parents";
+    }
+  }
+
+  // We now check if rewrite rule applies for this op. If rewrite rule passes
+  // for this op, then we rewrite it to Mkl op.
+  // Find matching RewriteInfo and then check that rewrite rule applies.
+  for (auto ri = rinfo_.cbegin(); ri != rinfo_.cend(); ++ri) {
+    if (n->type_string().compare(ri->name) == 0 && ri->rewrite_rule(n)) {
+      return &*ri;
+    }
+  }
+
+  // Else return not found.
+  return nullptr;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//              Run function for the pass
+///////////////////////////////////////////////////////////////////////////////
+
+bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
+  bool result = false;
+  CHECK_NOTNULL(g);
+
+  DumpGraph("Before running MklLayoutRewritePass", &**g);
+
+  std::vector<Node*> order;
+  GetReversePostOrder(**g, &order);  // This will give us topological sort.
+  for (Node* n : order) {
+    // If node is not an op or it cannot run on CPU device, then skip.
+    if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) {
+      continue;
+    }
+
+    Node* m = nullptr;
+    if ((m = CheckForNodeMerge(n)) != nullptr && CanOpRunOnCPUDevice(m)) {
+      // Check if the node 'n' can be merged with any other node. If it can
+      // be 'm' contains the node with which it can be merged.
+      string n1_name = n->name();
+      string n2_name = m->name();
+
+      VLOG(1) << "MklLayoutRewritePass: Scheduled nodes " << n1_name << " and "
+              << n2_name << " for merging";
+
+      if (MergeNode(g, n, m) == Status::OK()) {
+        VLOG(1) << "MklLayoutRewritePass: Merged nodes " << n1_name << " and "
+                << n2_name;
+        result = true;
+      }
+    }
+  }
+
+  DumpGraph("After running MklLayoutRewritePass(NodeMerge)", &**g);
+
+  order.clear();
+  GetReversePostOrder(**g, &order);  // This will give us topological sort.
+  for (Node* n : order) {
+    // If node is not an op or it cannot run on CPU device, then skip.
+    if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) {
+      continue;
+    }
+
+    const RewriteInfo* ri = nullptr;
+    // We will first search if node is to be rewritten.
+    if ((ri = CheckForNodeRewrite(n)) != nullptr) {
+      string node_name = n->name();
+      string op_name = n->type_string();
+
+      VLOG(1) << "MklLayoutRewritePass: Scheduled node " << node_name
+              << " with op " << op_name << " for rewrite using"
+              << " layout optimization.";
+
+      if (RewriteNode(g, n, ri) == Status::OK()) {
+        VLOG(1) << "MklLayoutRewritePass: rewrote node " << node_name
+                << " with op " << op_name << " for Mkl layout optimization.";
+        result = true;
+      }
+    }
+  }
+
+  DumpGraph("After running MklLayoutRewritePass(NodeMerge+Rewrite)", &**g);
+
+  return result;
+}
+
+bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g) {
+  return MklLayoutRewritePass().RunPass(g);
+}
+
+Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) {
+  if (options.graph == nullptr && options.partition_graphs == nullptr) {
+    return Status::OK();
+  }
+
+  auto process_graph = [&](std::unique_ptr<Graph>* g) {
+    // Get the ownership of a graph
+    std::unique_ptr<Graph>* ng = std::move(g);
+    RunPass(ng);
+    // Return the ownership of a graph back
+    g->reset(ng->release());
+  };
+
+  if (kMklLayoutRewritePassGroup !=
+      OptimizationPassRegistry::POST_PARTITIONING) {
+    // For any pre-partitioning phase, a graph is stored in options.graph.
+    process_graph(options.graph);
+  } else {
+    // For post partitioning phase, graphs are stored in
+    // options.partition_graphs.
+    for (auto& pg : *options.partition_graphs) {
+      process_graph(&pg.second);
+    }
+  }
+
+  return Status::OK();
+}
+#endif  // INTEL_MKL_ML
 }  // namespace tensorflow
 
 #endif
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index abc63e4f35aa9fd6f1df127741ae6d10f49024b9..5e2a465e22c7cbe45cbea40ea7a11491e2b2ad24 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -37,6 +37,9 @@ limitations under the License.
 #include "tensorflow/core/platform/test_benchmark.h"
 
 namespace tensorflow {
+
+#ifdef INTEL_MKL_ML
+
 namespace {
 
 const char kCPUDevice[] = "/job:a/replica:0/task:0/device:CPU:0";
@@ -122,8 +125,10 @@ REGISTER_OP("InputList").Output("o: N * float").Attr("N: int").SetIsStateful();
 REGISTER_OP("HalfInput").Output("o: half").SetIsStateful();
 REGISTER_OP("Int32Input").Output("o: int32").SetIsStateful();
 REGISTER_OP("_MklInput").Output("o: uint8").SetIsStateful();
-REGISTER_OP("_MklInput2").Output("o: uint8")
-                        .Output("o1: uint8").SetIsStateful();
+REGISTER_OP("_MklInput2")
+    .Output("o: uint8")
+    .Output("o1: uint8")
+    .SetIsStateful();
 
 /////////////////////////////////////////////////////////////////////
 //  Unit tests related to node merge optiimization
@@ -495,7 +500,6 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative2) {
             "M->I:3;N->D:4;N->G:4;N->I:4;O->D:5;O->G:5;O->I:5");
 }
 
-
 // BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter only
 TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Positive) {
   InitGraph(
@@ -871,11 +875,12 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Basic) {
       " input: ['A', 'B:0', 'B:1']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['C', 'D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);"
-            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;"
-            "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;"
-            "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);"
+      "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;"
+      "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;"
+      "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
 }
 
 // Concat with 2 Mkl layers feeding it
@@ -1270,7 +1275,8 @@ TEST_F(MklLayoutPassTest, MaxPoolLRN_Positive) {
       "node { name: 'H' op: 'Input'}"
       "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['H', 'G'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
       "A(Input);B(_MklLRN);C(_MklMaxPool);D(Input);DMT/_0(Const);DMT/_1(Const);"
       "DMT/_2(Const);E(_MklMaxPoolGrad);F(Input);G(_MklLRNGrad);H(Input);"
       "I(Zeta)|A->B;A:control->DMT/_0:control;B->C;B->E;B->G:2;B:1->G:3;"
@@ -1637,7 +1643,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_DeviceTest) {
       " attr { key: 'padding'          value { s: 'SAME' } }"
       " input: ['A', 'B']}"
       "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['B', 'C'] }", kGPUDevice);
+      " input: ['B', 'C'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Conv2D);D(Zeta)|A->C;B->C:1;B->D;C->D:1");
 }
@@ -1663,7 +1670,8 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) {
       "node { name: 'F' op: 'BiasAddGrad'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }", kGPUDevice);
+      " input: ['E'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
             "E(Zeta);F(BiasAddGrad);M(_MklInput);N(_MklInput);"
@@ -1684,7 +1692,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) {
       " attr { key: 'padding'          value { s: 'SAME' } }"
       " input: ['A', 'B', 'C']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'D'] }", kGPUDevice);
+      " input: ['A', 'D'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Int32Input);C(Input);D(Conv2DBackpropFilter);E(Zeta)|"
             "A->D;A->E;B->D:1;C->D:2;D->E:1");
@@ -1697,7 +1706,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Relu_DeviceTest) {
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " input: ['A'] }"
       "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }", kGPUDevice);
+      " input: ['A', 'B'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Relu);C(Zeta)|A->B;A->C;B->C:1");
 }
@@ -1710,7 +1720,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_DeviceTest) {
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " input: ['A', 'B'] }"
       "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'C'] }", kGPUDevice);
+      " input: ['A', 'C'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(ReluGrad);D(Zeta)|A->C;A->D;B->C:1;C->D:1");
 }
@@ -1726,7 +1737,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_MaxPool_DeviceTest) {
       " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A'] }"
       "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }", kGPUDevice);
+      " input: ['A', 'B'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
 }
@@ -1742,7 +1754,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_DeviceTest) {
       " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
       " input: ['A'] }"
       "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }", kGPUDevice);
+      " input: ['A', 'B'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(AvgPool);C(Zeta)|A->B;A->C;B->C:1");
 }
@@ -1763,7 +1776,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Concat_DeviceTest) {
       " attr { key: 'N'                value { i: 2 } }"
       " input: ['A', 'B:0', 'B:1']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D'] }", kGPUDevice);
+      " input: ['C', 'D'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Const);B(InputList);C(Input);D(Concat);E(Zeta)|A->D;"
             "B->D:1;B:1->D:2;C->E;D->E:1");
@@ -1785,7 +1799,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_DeviceTest) {
       " attr { key: 'N'                value { i: 2 } }"
       " input: ['B:0', 'B:1', 'A']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D'] }", kGPUDevice);
+      " input: ['C', 'D'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Const);B(InputList);C(Input);D(ConcatV2);E(Zeta)|"
             "A->D:2;B->D;B:1->D:1;C->E;D->E:1");
@@ -1805,7 +1820,8 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_DeviceTest) {
       " attr { key: 'is_training'  value { b: true } }"
       " input: ['A', 'B', 'C', 'D', 'E'] }"
       "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'F'] }", kGPUDevice);
+      " input: ['A', 'F'] }",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Input);D(Input);E(Input);"
             "F(FusedBatchNorm);G(Zeta)|A->F;A->G;B->F:1;C->F:2;D->F:3;"
@@ -1834,7 +1850,8 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) {
       "node { name: 'Y' op: 'Input'}"
       "node { name: 'Z' op: 'Zeta'"
       " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['E', 'Y']}", kGPUDevice);
+      " input: ['E', 'Y']}",
+      kGPUDevice);
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(_MklConv2D);D(Input);E(BiasAdd);"
             "M(_MklInput);N(_MklInput);Y(Input);Z(Zeta)|A->C;"
@@ -1881,6 +1898,1642 @@ static void BM_MklLayoutRewritePass(int iters, int op_nodes) {
 BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
 
 }  // namespace
+
+#else  // INTEL_MKL_ML
+
+namespace {
+
+const char kCPUDevice[] = "/job:a/replica:0/task:0/device:CPU:0";
+const char kGPUDevice[] = "/job:a/replica:0/task:0/device:GPU:0";
+
+static void InitGraph(const string& s, Graph* graph,
+                      const string& device = kCPUDevice) {
+  GraphDef graph_def;
+
+  auto parser = protobuf::TextFormat::Parser();
+  //  parser.AllowRelaxedWhitespace(true);
+  CHECK(parser.MergeFromString(s, &graph_def)) << s;
+  GraphConstructorOptions opts;
+  TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, graph));
+
+  for (Node* node : graph->nodes()) {
+    node->set_assigned_device_name(device);
+  }
+}
+
+class MklLayoutPassTest : public ::testing::Test {
+ public:
+  MklLayoutPassTest() : graph_(OpRegistry::Global()) {}
+
+  void InitGraph(const string& s, const string& device = kCPUDevice) {
+    ::tensorflow::InitGraph(s, &graph_, device);
+    original_ = CanonicalGraphString(&graph_);
+  }
+
+  static bool IncludeNode(const Node* n) { return n->IsOp(); }
+
+  static string EdgeId(const Node* n, int index) {
+    if (index == 0) {
+      return n->name();
+    } else if (index == Graph::kControlSlot) {
+      return strings::StrCat(n->name(), ":control");
+    } else {
+      return strings::StrCat(n->name(), ":", index);
+    }
+  }
+
+  string CanonicalGraphString(Graph* g) {
+    std::vector<string> nodes;
+    std::vector<string> edges;
+    for (const Node* n : g->nodes()) {
+      if (IncludeNode(n)) {
+        nodes.push_back(strings::StrCat(n->name(), "(", n->type_string(), ")"));
+      }
+    }
+    for (const Edge* e : g->edges()) {
+      if (IncludeNode(e->src()) && IncludeNode(e->dst())) {
+        edges.push_back(strings::StrCat(EdgeId(e->src(), e->src_output()), "->",
+                                        EdgeId(e->dst(), e->dst_input())));
+      }
+    }
+    // Canonicalize
+    std::sort(nodes.begin(), nodes.end());
+    std::sort(edges.begin(), edges.end());
+    return strings::StrCat(str_util::Join(nodes, ";"), "|",
+                           str_util::Join(edges, ";"));
+  }
+
+  string DoMklLayoutOptimizationPass() {
+    string before = CanonicalGraphString(&graph_);
+    LOG(ERROR) << "Before MKL layout rewrite pass: " << before;
+
+    std::unique_ptr<Graph>* ug = new std::unique_ptr<Graph>(&graph_);
+    RunMklLayoutRewritePass(ug);
+
+    string result = CanonicalGraphString(&graph_);
+    LOG(ERROR) << "After MKL layout rewrite pass:  " << result;
+    return result;
+  }
+
+  const string& OriginalGraph() const { return original_; }
+
+  Graph graph_;
+  string original_;
+};
+
+REGISTER_OP("Input").Output("o: float").SetIsStateful();
+REGISTER_OP("InputList").Output("o: N * float").Attr("N: int").SetIsStateful();
+REGISTER_OP("HalfInput").Output("o: half").SetIsStateful();
+REGISTER_OP("Int32Input").Output("o: int32").SetIsStateful();
+REGISTER_OP("_MklInput").Output("o: uint8").SetIsStateful();
+REGISTER_OP("_MklInput2")
+    .Output("o: uint8")
+    .Output("o1: uint8")
+    .SetIsStateful();
+
+/////////////////////////////////////////////////////////////////////
+//  Unit tests related to node merge optiimization
+/////////////////////////////////////////////////////////////////////
+
+TEST_F(MklLayoutPassTest, Basic) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Zeta);D(Zeta)|"
+            "A->C;A->D;B->C:1;B->D:1");
+}
+
+// Test set 1: Conv2D + AddBias
+
+// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Zeta(E,Y)
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive) {
+  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);E(_MklConv2DWithBias);Y(Input);Z(Zeta)|A->E;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->E:1;D->E:2;DMT/_0->E:3;DMT/_1->E:4;"
+            "DMT/_2->E:5;E->Z;Y->Z:1");
+}
+
+// Graph contains only Conv2D, no AddBias.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_NoAddBias) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);DMT/_0(Const);DMT/_1(Const)|"
+            "A->C;A:control->DMT/_0:control;A:control->DMT/_1:control;B->C:1;"
+            "DMT/_0->C:2;DMT/_1->C:3");
+}
+
+// Conv2D output does not go to BiasAdd.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['D', 'E'] }");  // Output of _MklConv2D does not go to BiasAdd.
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(Input);DMT/_0(Const);"
+            "DMT/_1(Const);E(Input);F(BiasAdd)|A->C;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;D->F;DMT/_0->C:2;DMT/_1->C:3;"
+            "E->F:1");
+}
+
+// Conv2D has two outgoing edges: BiasAdd and some other dummy node (Zeta).
+// Merge should not be done in such case.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['D', 'E'] }"  // Conv2D has two outputs.
+                              // No merge should happen.
+      "node { name: 'G' op: 'Zeta'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['C', 'E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(Input);DMT/_0(Const);"
+            "DMT/_1(Const);E(Input);F(BiasAdd);G(Zeta)|A->C;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;B->C:1;C->G;"
+            "D->F;DMT/_0->C:2;DMT/_1->C:3;E->F:1;E->G:1");
+}
+
+// data_format attribute value mismatch. Merge should not be done
+// in such case.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_AttrMismatch) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHCW' } }"
+      " input: ['C', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(Input);DMT/_0(Const);"
+            "DMT/_1(Const);E(BiasAdd)|A->C;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;C->E;D->E:1;DMT/_0->C:2;"
+            "DMT/_1->C:3");
+}
+
+// Test set 2: BiasAddGrad + Conv2DBackpropFilter fusion tests
+
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C'] }"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);"
+            "D(_MklConv2DBackpropFilterWithBias);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const)|A->D;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;C->D:2;"
+            "DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// BiasAddGrad fusion in the presence of BackpropFilter. But nodes do not match
+// criteria for rewrite. So rewrite should not happen. 3rd input of
+// Conv2DBackpropFilter is different than input to BiasAddGrad.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Negative1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C'] }"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['A'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);"
+            "D(_MklConv2DBackpropFilter);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);E(BiasAddGrad)|A->D;A->E;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;C->D:2;"
+            "DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// BiasAddGrad fusion, but nodes do not match criteria for fusion.
+// Different input formats.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Negative2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C'] }"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " input: ['A'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);"
+            "D(_MklConv2DBackpropFilter);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);E(BiasAddGrad)|A->D;A->E;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;C->D:2;"
+            "DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// BiasAddGrad fusion in the presence of BackpropFilter only. Fusion is done
+// before node rewrite. Check this ordering.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Negative3) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'O' op: '_MklInput'}"
+      "node { name: 'D' op: '_MklConv2DWithBias'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
+      "node { name: 'E' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['D', 'A']}"
+      "node { name: 'F' op: 'Int32Input'}"
+      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['E', 'F', 'A', 'M', 'N', 'O'] }"
+      "node { name: 'H' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
+            "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(BiasAddGrad);"
+            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G:2;B->D:1;"
+            "C->D:2;D->E;E->G;E->H;F->G:1;M->D:3;M->G:3;N->D:4;N->G:4;O->D:5;"
+            "O->G:5");
+}
+
+// C=Conv2D(A,B); E=BiasAdd(C,D); Y=Zeta(E,X);
+// G=Conv2DBackpropInput(F,B,E)
+// This is a case of node rewrite followed by node merge followed by connecting
+// filter output of Conv2DWithBias to filter input of Conv2DBackpropInput.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_ConvBpropInput_FilterFwd) {
+  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'X' op: 'Input'}"
+      "node { name: 'Y' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'X']}"
+      "node { name: 'F' op: 'Int32Input'}"
+      "node { name: 'G' op: 'Conv2DBackpropInput'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['F', 'B', 'E']}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['G', 'X']}");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);E(_MklConv2DWithBias);F(Int32Input);"
+            "G(_MklConv2DBackpropInput);X(Input);Y(Zeta);Z(Zeta)|"
+            "A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->E:1;D->E:2;DMT/_0->E:3;"
+            "DMT/_1->E:4;DMT/_2->E:5;DMT/_3->G:3;E->G:2;E->Y;E:1->G:1;E:2->G:5;"
+            "E:3->G:4;F->G;F:control->DMT/_3:control;G->Z;X->Y:1;X->Z:1");
+}
+
+/////////////////////////////////////////////////////////////////////
+//  Unit tests related to rewriting node to Mkl node
+/////////////////////////////////////////////////////////////////////
+
+// Single Conv2D Op; No Mkl layer on the input and on the output.
+// We will generate dummy Mkl tensor as 2nd input of Conv2D.
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Basic) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(Zeta);DMT/_0(Const);"
+            "DMT/_1(Const)|A->C;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;B->D;C->D:1;DMT/_0->C:2;"
+            "DMT/_1->C:3");
+}
+
+// 2 Conv2D Ops in sequence. Both should get transformed and 1st Conv2D will
+// have 2 outputs, both of which will be inputs to next Conv2D.
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(_MklConv2D);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->C;A->D;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->C:1;C->D:1;C->E;"
+            "C:2->D:3;D->E:1;DMT/_0->C:2;DMT/_1->C:3;DMT/_2->D:2");
+}
+
+// Conv2D with INT32 which is not supported by Mkl
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Negative_UnsupportedType) {
+  InitGraph(
+      "node { name: 'A' op: 'HalfInput'}"
+      "node { name: 'B' op: 'HalfInput'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_HALF } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_HALF } }"
+      " input: ['B', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(HalfInput);B(HalfInput);C(Conv2D);D(Zeta)|"
+            "A->C;B->C:1;B->D;C->D:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropFilter);"
+            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
+            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:4;DMT/_2->D:5");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradInput_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropInput'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['B', 'A', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropInput);"
+            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
+            "A->D:1;A->E;B->D;B:control->DMT/_0:control;"
+            "B:control->DMT/_1:control;B:control->DMT/_2:control;C->D:2;"
+            "D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// Check that we never rewrite BiasAddGrad.
+TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Polygamma'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['C', 'A']}"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Polygamma);D(Zeta);E(BiasAddGrad)|"
+            "A->C;A->D:1;B->C:1;C->D;D->E");
+}
+
+// Check that we never rewrite BiasAddGrad.
+TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'MatMul'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'transpose_a'      value { b: false } }"
+      " attr { key: 'transpose_b'      value { b: false } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['C', 'A']}"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(MatMul);D(Zeta);E(BiasAddGrad)|"
+            "A->C;A->D:1;B->C:1;C->D;D->E");
+}
+
+// Check that we never rewrite BiasAddGrad.
+TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'C' op: '_MklConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'M', 'N']}"
+      "node { name: 'D' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['C', 'A']}"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(Zeta);E(BiasAddGrad);"
+            "M(_MklInput);N(_MklInput)|A->C;A->D:1;B->C:1;C->D;D->E;"
+            "M->C:2;N->C:3");
+}
+
+// Concat Op test: Concat with no Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Basic) {
+  InitGraph(
+      "node { name: 'A' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'B' op: 'InputList'"
+      " attr { key: 'N'                value { i: 2 } }}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Concat'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['A', 'B:0', 'B:1']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }");
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);"
+      "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;"
+      "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;"
+      "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// Concat with 2 Mkl layers feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_Mkl) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'F' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['C', 'D']}"
+      "node { name: 'G' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'H' op: 'Concat'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['G', 'E', 'F']}"
+      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'H'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);"
+            "F(_MklConv2D);G(Const);H(_MklConcat);I(Zeta)|A->E;A->I;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "B->E:1;C->F;C:control->DMT/_0:control;C:control->DMT/_1:control;"
+            "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;"
+            "DMT/_4->H:3;E->H:1;E:2->H:4;F->H:2;F:2->H:5;G->H;"
+            "G:control->DMT/_4:control;H->I:1");
+}
+
+// Concat with 1 Mkl and 1 non-Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_MixedMkl) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D']}"
+      "node { name: 'G' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'H' op: 'Concat'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['G', 'E', 'F']}"
+      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'H'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Zeta);G(Const);"
+            "H(_MklConcat);I(Zeta)|A->E;A->I;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;"
+            "DMT/_1->E:3;DMT/_2->H:3;DMT/_3->H:5;E->H:1;E:2->H:4;F->H:2;"
+            "G->H;G:control->DMT/_2:control;G:control->DMT/_3:control;H->I:1");
+}
+
+// ConcatV2 Op test: ConcatV2 with no Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Basic) {
+  InitGraph(
+      "node { name: 'A' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'B' op: 'InputList'"
+      " attr { key: 'N'                value { i: 2 } }}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'ConcatV2'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['B:0', 'B:1', 'A']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Const);B(InputList);C(Input);D(_MklConcatV2);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D:2;B->D;B:1->D:1;"
+            "B:control->DMT/_0:control;B:control->DMT/_1:control;"
+            "B:control->DMT/_2:control;C->E;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// ConcatV2 with 2 Mkl layers feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_Mkl) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'F' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['C', 'D']}"
+      "node { name: 'G' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'H' op: 'ConcatV2'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['E', 'F', 'G']}"
+      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'H'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);"
+            "F(_MklConv2D);G(Const);H(_MklConcatV2);I(Zeta)|A->E;A->I;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;B->E:1;C->F;"
+            "C:control->DMT/_0:control;C:control->DMT/_1:control;"
+            "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;"
+            "DMT/_4->H:5;E->H;E:2->H:3;E:control->DMT/_4:control;F->H:1;"
+            "F:2->H:4;G->H:2;H->I:1");
+}
+
+// ConcatV2 with 1 Mkl and 1 non-Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_MixedMkl) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D']}"
+      "node { name: 'G' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'H' op: 'ConcatV2'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['E', 'F', 'G']}"
+      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'H'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Zeta);G(Const);"
+            "H(_MklConcatV2);I(Zeta)|A->E;A->I;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;"
+            "DMT/_1->E:3;DMT/_2->H:4;DMT/_3->H:5;E->H;E:2->H:3;"
+            "E:control->DMT/_2:control;E:control->DMT/_3:control;F->H:1;"
+            "G->H:2;H->I:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Relu_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Relu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklRelu);C(Zeta);DMT/_0(Const)|A->B;A->C;"
+            "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'ReluGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklReluGrad);D(Zeta);DMT/_0(Const);"
+            "DMT/_1(Const)|A->C;A->D;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;C->D:1;DMT/_0->C:2;DMT/_1->C:3");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ReluReluGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Relu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'ReluGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklRelu);C(_MklReluGrad);D(Zeta);DMT/_0(Const);"
+            "DMT/_1(Const)|A->B;A->C;A->D;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;"
+            "DMT/_1->C:2");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'AvgPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklAvgPool);C(Zeta);DMT/_0(Const)|A->B;A->C;"
+            "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPoolGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Int32Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'AvgPoolGrad' "
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Int32Input);B(Input);C(_MklAvgPoolGrad);D(Zeta);DMT/_0(Const);"
+            "DMT/_1(Const)|A->C;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;B->D;C->D:1;DMT/_0->C:2;"
+            "DMT/_1->C:3");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPoolAvgPoolGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'I' op: 'Int32Input'}"
+      "node { name: 'B' op: 'AvgPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'AvgPoolGrad' "
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['I', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklAvgPool);C(_MklAvgPoolGrad);D(Zeta);DMT/_0(Const);"
+            "DMT/_1(Const);I(Int32Input)|A->B;A->D;A:control->DMT/_0:control;"
+            "B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;DMT/_1->C:2;I->C;"
+            "I:control->DMT/_1:control");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNormGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: 'FusedBatchNormGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'epsilon'      value { f: 0.0001 } }"
+      " attr { key: 'is_training'  value { b: true } }"
+      " input: ['A', 'B', 'C', 'D', 'E'] }"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'F'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Input);"
+            "F(_MklFusedBatchNormGrad);G(Zeta)|A->F;A->G;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "A:control->DMT/_4:control;B->F:1;C->F:2;D->F:3;"
+            "DMT/_0->F:5;DMT/_1->F:6;DMT/_2->F:7;DMT/_3->F:8;DMT/_4->F:9;"
+            "E->F:4;F->G:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: 'FusedBatchNorm'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'epsilon'      value { f: 0.0001 } }"
+      " attr { key: 'is_training'  value { b: true } }"
+      " input: ['A', 'B', 'C', 'D', 'E'] }"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'F'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Input);"
+            "F(_MklFusedBatchNorm);G(Zeta)|A->F;A->G;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "A:control->DMT/_4:control;B->F:1;C->F:2;D->F:3;"
+            "DMT/_0->F:5;DMT/_1->F:6;DMT/_2->F:7;DMT/_3->F:8;DMT/_4->F:9;"
+            "E->F:4;F->G:1");
+}
+
+/////////////////////////////////////////////////////////////////////
+//  Unit tests related to rewriting node for workspace edges
+/////////////////////////////////////////////////////////////////////
+
+/* Test LRN->MaxPool->MaxPoolGrad->LRNGrad replacement by workspace nodes. */
+TEST_F(MklLayoutPassTest, MaxPoolLRN_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'LRN'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['B'] }"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'MaxPoolGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['B', 'C', 'D'] }"
+      "node { name: 'F' op: 'Input'}"
+      "node { name: 'G' op: 'LRNGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['E', 'F', 'B'] }"
+      "node { name: 'H' op: 'Input'}"
+      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['H', 'G'] }");
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Input);B(_MklLRN);C(_MklMaxPool);D(Input);DMT/_0(Const);DMT/_1(Const);"
+      "DMT/_2(Const);E(_MklMaxPoolGrad);F(Input);G(_MklLRNGrad);H(Input);"
+      "I(Zeta)|A->B;A:control->DMT/_0:control;B->C;B->E;B->G:2;B:1->G:3;"
+      "B:2->C:1;B:2->E:4;B:2->G:6;B:3->G:7;B:control->DMT/_1:control;C->E:1;"
+      "C:1->E:3;C:2->E:5;C:3->E:7;D->E:2;DMT/_0->B:1;DMT/_1->E:6;DMT/_2->G:5;"
+      "E->G;E:1->G:4;E:control->DMT/_2:control;F->G:1;G->I:1;H->I");
+}
+
+/* Test LRN->LRNGrad replacement by workspace nodes. */
+TEST_F(MklLayoutPassTest, LRN_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'LRN'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'LRNGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['C', 'D', 'B'] }"
+      "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklLRN);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);E(_MklLRNGrad);F(Zeta)|"
+            "A->B;A:control->DMT/_0:control;B->E:2;B:1->E:3;B:2->E:6;B:3->E:7;"
+            "C->E;C->F;C:control->DMT/_1:control;C:control->DMT/_2:control;"
+            "D->E:1;DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:5;E->F:1");
+}
+
+/* Test LRN->LRNGrad replacement when only one of them is present. */
+TEST_F(MklLayoutPassTest, LRN_Negative1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'LRN'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklLRN);C(Zeta);DMT/_0(Const)|"
+            "A->B;A->C;A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+/* Test LRN->LRNGrad replacement when only one of them is present. */
+TEST_F(MklLayoutPassTest, LRN_Negative2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'LRNGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['A', 'B', 'C'] }"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklLRNGrad);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Zeta)|"
+            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "A:control->DMT/_4:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:7;DMT/_2->D:4;DMT/_3->D:5;DMT/_4->D:6");
+}
+
+/* Test LRN->LRNGrad negative case, where single LRN feeds
+   2 LRNGrad nodes at different slots. */
+TEST_F(MklLayoutPassTest, LRN_Negative3) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'LRN'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'LRNGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['C', 'D', 'B'] }"
+      "node { name: 'F' op: 'LRNGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['C', 'B', 'D'] }"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['E', 'F'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklLRN);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);DMT/_5(Const);"
+            "DMT/_6(Const);E(_MklLRNGrad);F(_MklLRNGrad);G(Zeta)|A->B;"
+            "A:control->DMT/_0:control;B->E:2;"
+            "B->F:1;B:1->E:3;B:2->E:6;B:2->F:5;B:3->E:7;C->E;C->F;"
+            "C:control->DMT/_1:control;C:control->DMT/_2:control;"
+            "C:control->DMT/_3:control;C:control->DMT/_4:control;"
+            "C:control->DMT/_5:control;C:control->DMT/_6:control;"
+            "D->E:1;D->F:2;DMT/_0->B:1;DMT/_1->F:3;DMT/_2->F:7;DMT/_3->F:4;"
+            "DMT/_4->F:6;DMT/_5->E:4;DMT/_6->E:5;E->G;F->G:1");
+}
+
+/* Test MaxPool->MaxPoolGrad replacement by workspace+rewrite nodes. */
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'MaxPoolGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['C', 'B', 'D'] }"
+      "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklMaxPool);C(Input);D(Input);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(_MklMaxPoolGrad);F(Zeta)|"
+            "A->B;A:control->DMT/_0:control;B->E:1;B:1->E:3;B:2->E:5;B:3->E:7;"
+            "C->E;C->F;C:control->DMT/_1:control;C:control->DMT/_2:control;"
+            "D->E:2;DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:6;E->F:1");
+}
+
+// Test MaxPool>MaxPoolGrad replacement when only one of them is present.
+// In this case, we will rewrite MaxPool node but workspace edges will not
+// be present.
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklMaxPool);C(Zeta);DMT/_0(Const)|"
+            "A->B;A->C;A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+// Test MaxPoolGrad replacement when only one of them is present.
+// In this case, we will rewrite MaxPoolGrad and for workspace tensor and
+// its Mkl part, we will generate dummy tensor.
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'MaxPoolGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A', 'B', 'C'] }"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklMaxPoolGrad);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Zeta)|"
+            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "A:control->DMT/_4:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:7;DMT/_2->D:4;DMT/_3->D:5;DMT/_4->D:6");
+}
+
+// Test MaxPool handling for batch-wise pooling (NCHW)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative3) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 2, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for batch-wise pooling (NCHW)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative4) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 2, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative5) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:2, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NCHW)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative6) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:2, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for batch-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative7) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 2, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for batch-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative8) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 2, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative9) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:2} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative10) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:2} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+/////////////////////////////////////////////////////////////////////
+
+// Single Conv2D Op on GPU device
+// No rewrite should happen
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }",
+      kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Conv2D);D(Zeta)|A->C;B->C:1;B->D;C->D:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'O' op: '_MklInput'}"
+      "node { name: 'D' op: '_MklConv2DWithBias'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
+      "node { name: 'E' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['D', 'A']}"
+      "node { name: 'F' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['E'] }",
+      kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
+            "E(Zeta);F(BiasAddGrad);M(_MklInput);N(_MklInput);"
+            "O(_MklInput)|A->D;A->E:1;B->D:1;C->D:2;D->E;E->F;"
+            "M->D:3;N->D:4;O->D:5");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }",
+      kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(Conv2DBackpropFilter);E(Zeta)|"
+            "A->D;A->E;B->D:1;C->D:2;D->E:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Relu_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Relu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }",
+      kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Relu);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'ReluGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }",
+      kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(ReluGrad);D(Zeta)|A->C;A->D;B->C:1;C->D:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_MaxPool_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }",
+      kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'AvgPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }",
+      kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(AvgPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Concat Op test: Concat with no Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'B' op: 'InputList'"
+      " attr { key: 'N'                value { i: 2 } }}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Concat'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['A', 'B:0', 'B:1']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }",
+      kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Const);B(InputList);C(Input);D(Concat);E(Zeta)|A->D;"
+            "B->D:1;B:1->D:2;C->E;D->E:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'B' op: 'InputList'"
+      " attr { key: 'N'                value { i: 2 } }}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'ConcatV2'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['B:0', 'B:1', 'A']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }",
+      kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Const);B(InputList);C(Input);D(ConcatV2);E(Zeta)|"
+            "A->D:2;B->D;B:1->D:1;C->E;D->E:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: 'FusedBatchNorm'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'epsilon'      value { f: 0.0001 } }"
+      " attr { key: 'is_training'  value { b: true } }"
+      " input: ['A', 'B', 'C', 'D', 'E'] }"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'F'] }",
+      kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);E(Input);"
+            "F(FusedBatchNorm);G(Zeta)|A->F;A->G;B->F:1;C->F:2;D->F:3;"
+            "E->F:4;F->G:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) {
+  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'C' op: '_MklConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'M', 'N']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}",
+      kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(Input);E(BiasAdd);"
+            "M(_MklInput);N(_MklInput);Y(Input);Z(Zeta)|A->C;"
+            "B->C:1;C->E;D->E:1;E->Z;M->C:2;N->C:3;Y->Z:1");
+}
+
+/////////////////////////////////////////////////////////////////////
+
+static void BM_MklLayoutRewritePass(int iters, int op_nodes) {
+  testing::StopTiming();
+  string s;
+  for (int in = 0; in < 10; in++) {
+    s += strings::Printf("node { name: 'in%04d' op: 'Input'}", in);
+  }
+  random::PhiloxRandom philox(301, 17);
+  random::SimplePhilox rnd(&philox);
+  for (int op = 0; op < op_nodes; op++) {
+    s += strings::Printf(
+        "node { name: 'op%04d' op: 'Zeta' attr { key: 'T' value { "
+        "type: DT_FLOAT } } input: ['in%04d', 'in%04d' ] }",
+        op, rnd.Uniform(10), rnd.Uniform(10));
+  }
+
+  bool first = true;
+  while (iters > 0) {
+    Graph* graph = new Graph(OpRegistry::Global());
+    InitGraph(s, graph);
+    int N = graph->num_node_ids();
+    if (first) {
+      testing::SetLabel(strings::StrCat("Per graph node.  Nodes: ", N));
+      first = false;
+    }
+    {
+      testing::StartTiming();
+      std::unique_ptr<Graph> ug(graph);
+      RunMklLayoutRewritePass(&ug);
+      testing::StopTiming();
+    }
+    iters -= N;  // Our benchmark units are individual graph nodes,
+                 // not whole graphs
+    // delete graph;
+  }
+}
+BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
+
+}  // namespace
+
+#endif  // INTEL_MKL_ML
+
 }  // namespace tensorflow
 
 #endif /* INTEL_MKL */
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.cc b/tensorflow/core/graph/mkl_tfconversion_pass.cc
index 599bb88f015bfc035b7666747571a652a954139d..5343e6802d1e75f516925d44ab680b96f4e157da 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass.cc
+++ b/tensorflow/core/graph/mkl_tfconversion_pass.cc
@@ -33,8 +33,8 @@ limitations under the License.
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
 
-#include "tensorflow/core/graph/mkl_tfconversion_pass.h"
 #include "tensorflow/core/graph/mkl_graph_util.h"
+#include "tensorflow/core/graph/mkl_tfconversion_pass.h"
 
 namespace tensorflow {
 
@@ -152,12 +152,12 @@ Status MklToTfConversionPass::InsertConversionNodeOnEdge(
   string data_format;
 
   TF_CHECK_OK(GetNodeAttr(src->def(), "T", &src_datatype));
-  bool dst_dtype_found = GetNodeAttr(dst->def(), "T", &dst_datatype) ==
-                          Status::OK();
+  bool dst_dtype_found =
+      GetNodeAttr(dst->def(), "T", &dst_datatype) == Status::OK();
   // We compare source and destination datatypes only when both are found.
   if (dst_dtype_found && (src_datatype != dst_datatype)) {
-    string err_msg = "T attribute of " + src->name() + " and " +
-                      dst->name() + " do not match. Will not insert" +
+    string err_msg = "T attribute of " + src->name() + " and " + dst->name() +
+                     " do not match. Will not insert" +
                      " MklToTf node in such case.";
     return Status(error::Code::INVALID_ARGUMENT, err_msg.c_str());
   }
@@ -325,12 +325,12 @@ bool MklToTfConversionPass::RunPass(std::unique_ptr<Graph>* g) {
     // may not be Mkl node.
     DataType src_datatype;
     DataType dst_datatype;
-    bool src_is_mkl_op = (GetNodeAttr(src->def(), "T", &src_datatype) ==
-                            Status::OK() &&
-                          IsMklSupportedOp(src->type_string(), src_datatype));
-    bool dst_is_mkl_op = (GetNodeAttr(dst->def(), "T", &dst_datatype) ==
-                            Status::OK() &&
-                          IsMklSupportedOp(dst->type_string(), dst_datatype));
+    bool src_is_mkl_op =
+        (GetNodeAttr(src->def(), "T", &src_datatype) == Status::OK() &&
+         IsMklSupportedOp(src->type_string(), src_datatype));
+    bool dst_is_mkl_op =
+        (GetNodeAttr(dst->def(), "T", &dst_datatype) == Status::OK() &&
+         IsMklSupportedOp(dst->type_string(), dst_datatype));
 
     // Check if src with is Mkl-compliant, while dst is not Mkl-compliant.
     if (src_is_mkl_op && !dst_is_mkl_op) {
diff --git a/tensorflow/core/graph/subgraph_test.cc b/tensorflow/core/graph/subgraph_test.cc
index fde1ea17437e86d01054a1b153055170bda51e8b..7219d9812f3e4a01cffa4b6b17d38781f7d5e2b0 100644
--- a/tensorflow/core/graph/subgraph_test.cc
+++ b/tensorflow/core/graph/subgraph_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/graph_def_builder_util.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -361,7 +362,7 @@ static void BM_SubgraphHelper(int iters, int num_nodes,
         last_node = ops::SourceOp("In", b.opts().WithName(name));
       }
     }
-    TF_CHECK_OK(b.ToGraph(&g));
+    TF_CHECK_OK(GraphDefBuilderToGraph(b, &g));
   }
 
   std::vector<string> fed;
diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc
index 172471e34bc5ce344a4a8db2d404b77b7406c99f..0d88d1ff723b94783693559926c51c6726a2341b 100644
--- a/tensorflow/core/graph/testlib.cc
+++ b/tensorflow/core/graph/testlib.cc
@@ -40,7 +40,7 @@ REGISTER_KERNEL_BUILDER(
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(
     Name("HostConst").Device(DEVICE_SYCL).HostMemory("output"), HostConstantOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Register the HostConst Op
 // Returns a constant tensor on the host.  Useful for writing C++ tests
@@ -273,6 +273,16 @@ Node* Reverse(Graph* g, Node* tensor, Node* axis) {
   return Binary(g, "ReverseV2", tensor, axis);
 }
 
+Node* Roll(Graph* g, Node* input, Node* shift, Node* axis) {
+  Node* ret;
+  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Roll", g->op_registry())
+                  .Input(input)
+                  .Input(shift)
+                  .Input(axis)
+                  .Finalize(g, &ret));
+  return ret;
+}
+
 Node* Error(Graph* g, Node* input, const string& errmsg) {
   Node* ret;
   TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Error")
diff --git a/tensorflow/core/graph/testlib.h b/tensorflow/core/graph/testlib.h
index 06597778bb204c83dae7699e1ffe0e2b196ac160..eb9038d619ed273bbfd2596bce964fda005b4ec1 100644
--- a/tensorflow/core/graph/testlib.h
+++ b/tensorflow/core/graph/testlib.h
@@ -117,6 +117,10 @@ Node* RandomGamma(Graph* g, Node* shape, Node* alpha);
 // Output dtype determined by lam.
 Node* RandomPoisson(Graph* g, Node* shape, Node* lam);
 
+// Rolls tensor by an offset of <shift> along the corresponding
+// <axis> dimensions.
+Node* Roll(Graph* g, Node* input, Node* shift, Node* axis);
+
 // Generates random parameters from the truncated standard normal distribution
 // of the nput shape
 Node* TruncatedNormal(Graph* g, Node* input, DataType dtype);
diff --git a/tensorflow/core/grappler/BUILD b/tensorflow/core/grappler/BUILD
index 99f1318072220d397870794cf3d2643d64b9696e..2ca9b720ee127b892c06230efb3517f5afabea45 100644
--- a/tensorflow/core/grappler/BUILD
+++ b/tensorflow/core/grappler/BUILD
@@ -159,6 +159,7 @@ tf_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
     ],
 )
diff --git a/tensorflow/core/grappler/clusters/BUILD b/tensorflow/core/grappler/clusters/BUILD
index e9ddb86a108c3da0e1a052f547bdc9a40ace904f..5b8ce373bcf87a10875e764ba5cdbec96d58c080 100644
--- a/tensorflow/core/grappler/clusters/BUILD
+++ b/tensorflow/core/grappler/clusters/BUILD
@@ -78,6 +78,8 @@ tf_cc_test(
     srcs = ["virtual_cluster_test.cc"],
     deps = [
         ":virtual_cluster",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -99,7 +101,9 @@ cc_library(
         "//tensorflow/cc:coordinator",
         "//tensorflow/cc:queue_runner",
         "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:direct_session",
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/kernels:ops_util",
diff --git a/tensorflow/core/grappler/clusters/cluster.cc b/tensorflow/core/grappler/clusters/cluster.cc
index e2db47b758f588f0a356bde1c9eacc0d5ff7f335..39bfca244ed2d40544dd2a17a019dadbe50f6d29 100644
--- a/tensorflow/core/grappler/clusters/cluster.cc
+++ b/tensorflow/core/grappler/clusters/cluster.cc
@@ -23,8 +23,7 @@ Cluster::Cluster(int timeout_s) : timeout_s_(timeout_s) {
   DisableDetailedStats(false);
 }
 
-Cluster::~Cluster() {
-}
+Cluster::~Cluster() {}
 
 void Cluster::AllowSoftPlacement(bool soft_placement_state) {
   options_.config.set_allow_soft_placement(soft_placement_state);
@@ -35,6 +34,10 @@ void Cluster::SetNumWarmupSteps(int num_steps) {
       num_steps);
 }
 
+int Cluster::NumWarmupSteps() const {
+  return options_.config.graph_options().build_cost_model_after();
+}
+
 void Cluster::DisableDetailedStats(bool disable) {
   if (disable) {
     options_.config.mutable_graph_options()->set_build_cost_model(0);
diff --git a/tensorflow/core/grappler/clusters/cluster.h b/tensorflow/core/grappler/clusters/cluster.h
index 616ab6ffdcc1e62c4c56f6826a8a5852d51b00d7..5068f72b30d49850ab445318d1f67d0f4e0e618a 100644
--- a/tensorflow/core/grappler/clusters/cluster.h
+++ b/tensorflow/core/grappler/clusters/cluster.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"
 #include "tensorflow/core/public/session_options.h"
 
@@ -38,6 +39,9 @@ class Cluster {
   explicit Cluster(int timeout_s);
   virtual ~Cluster();
 
+  // Returns a string that represent the type of cluster that was instantiated.
+  virtual string type() const = 0;
+
   // Provision the hardware resources needed to run TensorFlow and start a
   // TensorFlow session that can take advantage of these resources.
   // The actual resources that are leveraged depend on the type of cluster
@@ -64,6 +68,9 @@ class Cluster {
   // before Provision().
   void SetNumWarmupSteps(int num_steps);
 
+  // Returns the number of warmup steps.
+  int NumWarmupSteps() const;
+
   // Disable the collection of detailed statistics. Must be called
   // before Provision().
   void DisableDetailedStats(bool disable);
@@ -85,6 +92,21 @@ class Cluster {
   // sorted alphabetically.
   const std::vector<string> GetDeviceNames() const;
 
+  // Enables collecting the allocator stats. Call with enable=true must be made
+  // before Provision().
+  virtual Status EnablePeakMemoryStats(bool enable) {
+    return errors::Unimplemented(strings ::StrCat(
+        "Peak Memory Stats are not supported on ", type(), " clusters"));
+  }
+
+  // Returns peak memory of all devices during the session creation and session
+  // runs.
+  virtual Status GetPeakMemoryUsage(
+      std::unordered_map<string, uint64>* device_peak_memory) const {
+    return errors::Unimplemented(
+        "GetPeakMemoryUsage is not implemented for this type of cluster.");
+  }
+
   // Prepare the session to run the specified grappler item. This include
   // initializing all the model variables.
   virtual Status Initialize(const GrapplerItem& item) = 0;
diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc
index 1a6fad41828c1cc3eaa0d78d12d984dcf5b59692..cc7f418d49816d64ffc51704d2f127a441815d7b 100644
--- a/tensorflow/core/grappler/clusters/single_machine.cc
+++ b/tensorflow/core/grappler/clusters/single_machine.cc
@@ -19,6 +19,8 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/cc/training/queue_runner.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/grappler/clusters/utils.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -26,25 +28,16 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/notification.h"
+#include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session.h"
 
 namespace tensorflow {
 namespace grappler {
 
-static std::atomic<bool> already_created(false);
+static std::atomic<bool> already_provisioned(false);
 
 SingleMachine::SingleMachine(int timeout_s, int num_cpu_cores, int num_gpus)
-    : Cluster(timeout_s),
-      num_gpus_(num_gpus),
-      expected_init_time_s_(0),
-      closing_(false) {
-  // This is really ugly: to avoid leaking variables, we need to reset the tf
-  // session every time we're done processing a grappler item. However,
-  // variables are global, and therefore we can't have more than 1 session alive
-  // at a time. This check detects when more that one cluster is created.
-  CHECK(!already_created);
-  already_created = true;
-
+    : Cluster(timeout_s), expected_init_time_s_(0), closing_(false) {
   VLOG(1) << "Number of CPU cores: " << num_cpu_cores
           << " Number of GPUs: " << num_gpus;
   thread_pool_.reset(new thread::ThreadPool(
@@ -71,26 +64,44 @@ SingleMachine::~SingleMachine() {
   // Reset the thread-pool so that there are no outstanding Session::Run(...)s
   // when we delete the session.
   thread_pool_.reset();
-
-  CHECK(already_created);
-  already_created = false;
 }
 
 Status SingleMachine::Provision() {
-  Status status = ResetSession();
-  if (!status.ok()) {
-    return status;
+  // This is really ugly: to avoid leaking variables, we need to reset the tf
+  // session every time we're done processing a grappler item. However,
+  // variables are global, and therefore we can't have more than 1 session alive
+  // at a time. This check detects when more that one cluster is provisioned.
+  if (already_provisioned) {
+    return errors::Unavailable(
+        "Can't provision more than one single cluster at a time");
   }
 
-  DeviceProperties attr = GetLocalCPUInfo();
-  devices_["/job:localhost/replica:0/task:0/cpu:0"] = GetLocalCPUInfo();
+  TF_RETURN_IF_ERROR(ResetSession());
+
+  std::vector<DeviceAttributes> devices;
+  TF_RETURN_IF_ERROR(session_->ListDevices(&devices));
+  int gpu_id = 0;
+  for (const auto& dev : devices) {
+    DeviceProperties attr;
+    if (dev.device_type() == "CPU") {
+      attr = GetLocalCPUInfo();
+    } else if (dev.device_type() == "GPU") {
+      attr = GetLocalGPUInfo(gpu_id++);
+    } else if (dev.device_type().find("XLA") == string::npos) {
+      // Filter out the fake XLA devices to avoid double counting the actual
+      // hardware resources that are available.
+      attr.set_type(dev.device_type());
+    }
+    // Overwrite the memory size since users might have requested to use only a
+    // fraction of the available device memory.
+    attr.set_memory_size(dev.memory_limit());
+    devices_[dev.name()] = attr;
+  }
+  already_provisioned = true;
 
-  VLOG(1) << "Number of GPUs: " << num_gpus_;
-  for (int i = 0; i < num_gpus_; ++i) {
-    string device_name =
-        strings::StrCat("/job:localhost/replica:0/task:0/device:GPU:", i);
-    VLOG(1) << "Adding GPU device " << device_name;
-    devices_[device_name] = GetLocalGPUInfo(i);
+  // Clear highmark stats of all local allocators.
+  if (cpu_allocator_stats_enabled_) {
+    TF_RETURN_IF_ERROR(ClearAllocatorStats());
   }
   return Status::OK();
 }
@@ -108,27 +119,12 @@ Status SingleMachine::Initialize(const GrapplerItem& item) {
 }
 
 Status SingleMachine::Shutdown() {
-  TF_RETURN_IF_ERROR(CloseSession(true /*use_timeout*/));
+  TF_RETURN_IF_ERROR(ShutdownSession());
+
+  mutex_lock l(this->last_graph_mu_);
+  last_graph_ = nullptr;
+  already_provisioned = false;
 
-  // Delete the threadpool: this ensures that all the pending closures complete
-  // before we return. Note that if TF deadlocked on us, the closures will
-  // never complete, and the call to thread_pool_.reset() will never return:
-  // therefore we need to delete the threadpool with the background thread.
-  // That thread itself will also never complete, so the user should
-  // abort the process to avoid leaking too many resources.
-  auto n = std::make_shared<Notification>();
-  Env::Default()->SchedClosure([this, n]() {
-    thread_pool_.reset();
-    n->Notify();
-  });
-  int64 timeout_us = 1000000ll * timeout_s_;
-  const bool notified = WaitForNotificationWithTimeout(n.get(), timeout_us);
-  if (!notified) {
-    // Let the caller know that we can't shutdown the session properly since
-    // there are calls to Session::Run() still running.
-    return errors::Unavailable("The session is still running graphs after ",
-                               timeout_s_, " seconds");
-  }
   return Status::OK();
 }
 
@@ -196,6 +192,41 @@ Status SingleMachine::Run(const GraphDef& graph_def,
   return Status::OK();
 }
 
+Status SingleMachine::EnablePeakMemoryStats(bool enable) {
+  EnableCPUAllocatorStats(enable);
+  cpu_allocator_stats_enabled_ = enable;
+  // No need to enable GPU allocator stats since its stats are always collected.
+  return Status::OK();
+}
+
+Status SingleMachine::GetPeakMemoryUsage(
+    std::unordered_map<string, uint64>* device_peak_memory) const {
+  // Cpu_allocator->TracksAllocationSizes() returns true doesn't always mean the
+  // the AllocatorStats would be collected.
+  if (!cpu_allocator_stats_enabled_) {
+    return Status(error::INVALID_ARGUMENT,
+                  "Tracking allocation for CPU is not enabled.");
+  }
+
+  const DeviceMgr* device_mgr;
+  TF_RETURN_IF_ERROR(session_->LocalDeviceManager(&device_mgr));
+  std::vector<Device*> devices = device_mgr->ListDevices();
+
+  device_peak_memory->clear();
+  for (Device* device : devices) {
+    AllocatorStats stats;
+    auto* allocator = device->GetAllocator(AllocatorAttributes());
+    if (!allocator->TracksAllocationSizes()) {
+      return Status(error::INVALID_ARGUMENT,
+                    "Tracking allocation is not enabled.");
+    }
+    allocator->GetStats(&stats);
+    (*device_peak_memory)[device->name()] = stats.max_bytes_in_use;
+  }
+
+  return Status::OK();
+}
+
 Status SingleMachine::RunWithTimeout(
     const std::vector<std::pair<string, Tensor>>& feed,
     const std::vector<string>& fetch, RunMetadata* run_metadata) {
@@ -230,7 +261,7 @@ Status SingleMachine::RunWithTimeout(
 }
 
 Status SingleMachine::CloseSession(bool use_timeout) {
-  if (!session_) {
+  if (!session_ || !thread_pool_) {
     return Status::OK();
   }
 
@@ -274,12 +305,38 @@ Status SingleMachine::CloseSession(bool use_timeout) {
   return Status::OK();
 }
 
+Status SingleMachine::ShutdownSession() {
+  TF_RETURN_IF_ERROR(CloseSession(true /*use_timeout*/));
+
+  // Delete the threadpool: this ensures that all the pending closures complete
+  // before we return. Note that if TF deadlocked on us, the closures will
+  // never complete, and the call to thread_pool_.reset() will never return:
+  // therefore we need to delete the threadpool with the background thread.
+  // That thread itself will also never complete, so the user should
+  // abort the process to avoid leaking too many resources.
+  auto n = std::make_shared<Notification>();
+  Env::Default()->SchedClosure([this, n]() {
+    thread_pool_.reset();
+    n->Notify();
+  });
+  int64 timeout_us = 1000000ll * timeout_s_;
+  const bool notified = WaitForNotificationWithTimeout(n.get(), timeout_us);
+  if (!notified) {
+    // Let the caller know that we can't shutdown the session properly since
+    // there are calls to Session::Run() still running.
+    return errors::Unavailable("The session is still running graphs after ",
+                               timeout_s_, " seconds");
+  }
+
+  return Status::OK();
+}
+
 Status SingleMachine::ResetSession() {
   if (session_) {
     LOG(INFO) << "Cleaning up previous session";
 
     // Make sure the session is properly closed
-    TF_RETURN_IF_ERROR(Shutdown());
+    TF_RETURN_IF_ERROR(ShutdownSession());
 
     // Destroying the object deletes all its variables as well. This is only
     // true for DirectSession.
@@ -332,5 +389,29 @@ void SingleMachine::MergeCosts(CostGraphDef* graph_costs,
   }
 }
 
+Status SingleMachine::ClearAllocatorStats() const {
+  // Cpu_allocator->TracksAllocationSizes() returns true doesn't always mean the
+  // the AllocatorStats would be collected.
+  if (!cpu_allocator_stats_enabled_) {
+    return Status(error::INVALID_ARGUMENT,
+                  "Tracking allocation for CPU is not enabled.");
+  }
+
+  const DeviceMgr* device_mgr;
+  TF_RETURN_IF_ERROR(session_->LocalDeviceManager(&device_mgr));
+  std::vector<Device*> devices = device_mgr->ListDevices();
+
+  for (Device* device : devices) {
+    AllocatorStats stats;
+    auto* allocator = device->GetAllocator(AllocatorAttributes());
+    if (!allocator->TracksAllocationSizes()) {
+      return Status(error::INVALID_ARGUMENT,
+                    "Tracking allocation is not enabled.");
+    }
+    allocator->ClearStats();
+  }
+  return Status::OK();
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/clusters/single_machine.h b/tensorflow/core/grappler/clusters/single_machine.h
index d3efbe3c614580d0502874412697cd5719e28be5..90d6a04cab650178db0dc14ac94564690b0d7bbb 100644
--- a/tensorflow/core/grappler/clusters/single_machine.h
+++ b/tensorflow/core/grappler/clusters/single_machine.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_GRAPPLER_CLUSTERS_SINGLE_MACHINE_H_
 
 #include "tensorflow/cc/training/coordinator.h"
+#include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -32,6 +33,8 @@ class SingleMachine : public Cluster {
   SingleMachine(int timeout_s, int num_cpu_cores, int num_gpus);
   ~SingleMachine() override;
 
+  string type() const override { return "single_machine"; }
+
   Status Provision() override;
   Status Shutdown() override;
 
@@ -40,6 +43,12 @@ class SingleMachine : public Cluster {
              const std::vector<std::pair<string, Tensor>>& feed,
              const std::vector<string>& fetch, RunMetadata* metadata) override;
 
+  Status EnablePeakMemoryStats(bool enable) override;
+
+  // It requires EnableAllocatorStats(true) be called before Provision().
+  Status GetPeakMemoryUsage(
+      std::unordered_map<string, uint64>* device_peak_memory) const override;
+
  private:
   Status RunWithTimeout(const std::vector<std::pair<string, Tensor>>& feed,
                         const std::vector<string>& fetch,
@@ -49,10 +58,12 @@ class SingleMachine : public Cluster {
                         RunMetadata* run_metadata, int64 timeout_s);
   Status ResetSession();
   Status CloseSession(bool use_timeout);
+  Status ShutdownSession();
   void MergeCosts(CostGraphDef* graph_costs, const CostGraphDef& init_costs,
                   const CostGraphDef& queue_costs);
 
-  const int num_gpus_;
+  Status ClearAllocatorStats() const;
+
   std::unique_ptr<Session> session_;
   std::vector<QueueRunnerDef> queue_runner_defs_;
   string last_graph_id_;
@@ -67,6 +78,8 @@ class SingleMachine : public Cluster {
 
   mutex close_mu_;
   bool closing_ GUARDED_BY(close_mu_);
+
+  bool cpu_allocator_stats_enabled_ = false;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/clusters/single_machine_test.cc b/tensorflow/core/grappler/clusters/single_machine_test.cc
index f6c325c2a4bb1877f07fbfd034755ff501344f48..c6352c1448bb38ece78530007e2534d475ef7fb6 100644
--- a/tensorflow/core/grappler/clusters/single_machine_test.cc
+++ b/tensorflow/core/grappler/clusters/single_machine_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/protobuf/queue_runner.pb.h"
@@ -44,10 +45,14 @@ class SingleMachineTest : public ::testing::Test {
 #endif
     cluster_.reset(
         new SingleMachine(timeout_s, 3 /* num_cpu_cores */, 0 /* num_gpus */));
+    TF_CHECK_OK(cluster_->EnablePeakMemoryStats(true));
     TF_CHECK_OK(cluster_->Provision());
   }
 
   void TearDown() override {
+    if (cluster_) {
+      TF_CHECK_OK(cluster_->Shutdown());
+    }
     cluster_.reset();
   }
 
@@ -55,6 +60,10 @@ class SingleMachineTest : public ::testing::Test {
   std::unique_ptr<SingleMachine> cluster_;
 };
 
+TEST_F(SingleMachineTest, ClusterType) {
+  CHECK_EQ("single_machine", cluster_->type());
+}
+
 TEST_F(SingleMachineTest, CostModel) {
   TrivialTestGraphInputYielder fake_input(4, 1, 10, false,
                                           cluster_->GetDeviceNames());
@@ -178,8 +187,7 @@ TEST_F(SingleMachineTest, GraphOptimizations) {
   // With optimizations turned on, some nodes could have been optimized away,
   // and the cost model could be partial. Restart the cluster with optimizations
   // disabled and make sure we have all the information we're looking for.
-  cluster_.reset();
-  cluster_.reset(new SingleMachine(5, 3, 0));
+  TF_CHECK_OK(cluster_->Shutdown());
   cluster_->DisableOptimizer(true);
   TF_CHECK_OK(cluster_->Provision());
 
@@ -324,7 +332,7 @@ static void RunInfiniteTFLoop() {
 
 TEST_F(SingleMachineTest, InfiniteLoops) {
   // The RunInfiniteTFLoop function creates its own cluster.
-  cluster_.reset();
+  TF_CHECK_OK(cluster_->Shutdown());
 
   EXPECT_EXIT(RunInfiniteTFLoop(), ::testing::ExitedWithCode(0), ".*");
 }
@@ -459,60 +467,18 @@ TEST_F(SingleMachineTest, PersistentMemory) {
       found_hashtable = true;
       // Persistent memory usage should be 0 since it's recorded as part of the
       // initialize_table op.
-      EXPECT_EQ(0, node.host_persistent_memory_size());
-      EXPECT_EQ(0, node.device_persistent_memory_size());
+      EXPECT_EQ(0, node.persistent_memory_size());
     } else if (node.name() == "initialize_table") {
       found_table_init = true;
       // Persistent memory should hold 2 keys and 2 values.
-      EXPECT_LE(4 * sizeof(int64), node.host_persistent_memory_size());
-      EXPECT_EQ(0, node.device_persistent_memory_size());
+      EXPECT_LE(4 * sizeof(int64), node.persistent_memory_size());
     }
   }
   EXPECT_TRUE(found_table_init);
   EXPECT_TRUE(found_hashtable);
 }
 
-#if defined(PLATFORM_GOOGLE)
-namespace {
-
-SessionOptions GetSessionOption(int num_cpu_cores, int num_gpus) {
-  SessionOptions options;
-  // Copied from single_machine.h
-  (*options.config.mutable_device_count())["CPU"] = 1;
-  if (num_gpus > 0) {
-    (*options.config.mutable_device_count())["GPU"] = num_gpus;
-  }
-  CHECK_GE(num_cpu_cores, 1);
-  options.config.set_intra_op_parallelism_threads(num_cpu_cores);
-  options.config.add_session_inter_op_thread_pool()->set_num_threads(
-      num_cpu_cores);
-  return options;
-}
-
-Status GetDeviceMemoryStats(
-    const SessionOptions& session_option,
-    std::unordered_map<string, AllocatorStats>* allocator_stats_by_device) {
-  std::vector<Device*> devices;
-  TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(session_option,
-                                               "" /* name_prefix */, &devices));
-  allocator_stats_by_device->clear();
-  for (Device* device : devices) {
-    AllocatorStats stats;
-    auto* allocator = device->GetAllocator(AllocatorAttributes());
-    if (!allocator->TracksAllocationSizes()) {
-      return Status(error::INVALID_ARGUMENT,
-                    "Tracking allocation is not enabled.");
-    }
-    allocator->GetStats(&stats);
-    (*allocator_stats_by_device)[device->name()] = stats;
-    delete device;
-  }
-  return Status::OK();
-}
-
-}  // namespace
-
-TEST_F(SingleMachineTest, ReleaseMemoryAfterDestruction) {
+GrapplerItem CreateGrapplerItemWithResourceMemory() {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
   // Add a variable and initializer.
@@ -559,35 +525,88 @@ TEST_F(SingleMachineTest, ReleaseMemoryAfterDestruction) {
   item.fetch.push_back("var_matmul");
   item.fetch.push_back("dequeue");
 
-  // Run the graph
+  return item;
+}
+
+#if defined(PLATFORM_GOOGLE)
+TEST_F(SingleMachineTest, ReleaseMemoryAfterDestruction) {
+  GrapplerItem item = CreateGrapplerItemWithResourceMemory();
   TF_CHECK_OK(cluster_->Initialize(item));
-  EnableCPUAllocatorStats(true);
 
-  SessionOptions options =
-      GetSessionOption(3 /* cpu cores */, 0 /* num gpus */);
-  std::unordered_map<string, AllocatorStats> device_memory_before;
-  TF_CHECK_OK(GetDeviceMemoryStats(options, &device_memory_before));
-  EXPECT_EQ(device_memory_before.size(), 1);
+  std::unordered_map<string, uint64> device_peak_memory_before;
+  TF_CHECK_OK(cluster_->GetPeakMemoryUsage(&device_peak_memory_before));
+  EXPECT_EQ(device_peak_memory_before.size(), 1);
+  // There might be a bit memory used before session's running anything.
+  EXPECT_LT(device_peak_memory_before.begin()->second, 200);
 
   RunMetadata metadata;
   TF_CHECK_OK(cluster_->Run(item.graph, item.feed, item.fetch, &metadata));
 
   // Check there is memory that is not released.
-  std::unordered_map<string, AllocatorStats> device_memory;
-  TF_CHECK_OK(GetDeviceMemoryStats(options, &device_memory));
-  EXPECT_EQ(device_memory.size(), 1);
-  EXPECT_GT(device_memory.begin()->second.bytes_in_use, 0);
+  std::unordered_map<string, uint64> device_peak_memory;
+  TF_CHECK_OK(cluster_->GetPeakMemoryUsage(&device_peak_memory));
+  EXPECT_EQ(device_peak_memory.size(), 1);
+  EXPECT_GT(device_peak_memory.begin()->second, 0);
 
-  // Reset cluster_ would release all memory.
-  cluster_.reset();
-  std::unordered_map<string, AllocatorStats> device_memory_after;
-  TF_CHECK_OK(GetDeviceMemoryStats(options, &device_memory_after));
+  // Reprovisioning the cluster would release all memory.
+  TF_CHECK_OK(cluster_->Shutdown());
+  TF_CHECK_OK(cluster_->Provision());
+  std::unordered_map<string, uint64> device_peak_memory_after;
+  TF_CHECK_OK(cluster_->GetPeakMemoryUsage(&device_peak_memory_after));
+  TF_CHECK_OK(cluster_->Shutdown());
 
   // Check memory used by resources are released after cluster destruction.
-  EXPECT_EQ(device_memory_before.size(), 1);
-  EXPECT_EQ(device_memory_after.size(), 1);
-  EXPECT_EQ(device_memory_before.begin()->second.bytes_in_use, 0);
-  EXPECT_EQ(device_memory_after.begin()->second.bytes_in_use, 0);
+  EXPECT_EQ(device_peak_memory_before.size(), 1);
+  EXPECT_EQ(device_peak_memory_after.size(), 1);
+  EXPECT_LT(device_peak_memory_before.begin()->second, 200);
+  EXPECT_LT(device_peak_memory_after.begin()->second, 200);
+}
+
+TEST_F(SingleMachineTest, PeakMemory) {
+  GrapplerItem item = CreateGrapplerItemWithResourceMemory();
+  TF_CHECK_OK(cluster_->Initialize(item));
+
+  RunMetadata metadata;
+  TF_CHECK_OK(cluster_->Run(item.graph, item.feed, item.fetch, &metadata));
+
+  std::unordered_map<string, uint64> device_peak_memory;
+  TF_CHECK_OK(cluster_->GetPeakMemoryUsage(&device_peak_memory));
+  ASSERT_NE(
+      device_peak_memory.find("/job:localhost/replica:0/task:0/device:CPU:0"),
+      device_peak_memory.end());
+  uint64 cpu_memory =
+      device_peak_memory["/job:localhost/replica:0/task:0/device:CPU:0"];
+  EXPECT_GT(cpu_memory, 0);
+
+  TF_CHECK_OK(cluster_->Shutdown());
+  TF_CHECK_OK(cluster_->Provision());
+  device_peak_memory.clear();
+  TF_CHECK_OK(cluster_->GetPeakMemoryUsage(&device_peak_memory));
+  TF_CHECK_OK(cluster_->Shutdown());
+  ASSERT_NE(
+      device_peak_memory.find("/job:localhost/replica:0/task:0/device:CPU:0"),
+      device_peak_memory.end());
+  cpu_memory =
+      device_peak_memory["/job:localhost/replica:0/task:0/device:CPU:0"];
+  EXPECT_LT(cpu_memory, 100);
+}
+
+TEST_F(SingleMachineTest, PeakMemoryStatsNotEnabled) {
+  GrapplerItem item = CreateGrapplerItemWithResourceMemory();
+
+  TF_CHECK_OK(cluster_->Shutdown());
+  cluster_.reset();
+  SingleMachine cluster(60 /* timout_s */, 3 /* num_cpu_cores */,
+                        0 /* num_gpus */);
+
+  TF_CHECK_OK(cluster.Provision());
+  TF_CHECK_OK(cluster.Initialize(item));
+
+  std::unordered_map<string, uint64> device_peak_memory;
+  Status s = cluster.GetPeakMemoryUsage(&device_peak_memory);
+  TF_CHECK_OK(cluster.Shutdown());
+  ASSERT_FALSE(s.ok());
+  EXPECT_EQ(s.code(), errors::Code::INVALID_ARGUMENT);
 }
 #endif
 
diff --git a/tensorflow/core/grappler/clusters/utils.cc b/tensorflow/core/grappler/clusters/utils.cc
index 592e4b789d0dcb7369e2f0c6db447eb9daa92870..aacd2ccb72df07ac6b31c9bd5b96deca499038e4 100644
--- a/tensorflow/core/grappler/clusters/utils.cc
+++ b/tensorflow/core/grappler/clusters/utils.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/mem.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -48,6 +49,11 @@ DeviceProperties GetLocalCPUInfo() {
   device.set_l2_cache_size(Eigen::l2CacheSize());
   device.set_l3_cache_size(Eigen::l3CacheSize());
 
+  int64 free_mem = port::AvailableRam();
+  if (free_mem < INT64_MAX) {
+    device.set_memory_size(free_mem);
+  }
+
   (*device.mutable_environment())["cpu_instruction_set"] =
       Eigen::SimdInstructionSetsInUse();
 
diff --git a/tensorflow/core/grappler/clusters/virtual_cluster.cc b/tensorflow/core/grappler/clusters/virtual_cluster.cc
index e1f5925f7e56b31babedf0cf274f7bf482883d4c..ae70c9860823dae1a85ba20e00afe15b218cd2b4 100644
--- a/tensorflow/core/grappler/clusters/virtual_cluster.cc
+++ b/tensorflow/core/grappler/clusters/virtual_cluster.cc
@@ -25,14 +25,16 @@ namespace grappler {
 
 VirtualCluster::VirtualCluster(
     const std::unordered_map<string, DeviceProperties>& devices)
-    : Cluster(0), node_estimator_(new OpLevelCostEstimator()) {
+    : Cluster(0),
+      node_estimator_(new OpLevelCostEstimator()),
+      node_manager_(new FirstReadyManager()) {
   devices_ = devices;
 }
 
 VirtualCluster::VirtualCluster(
     const std::unordered_map<string, DeviceProperties>& devices,
-    OpLevelCostEstimator* node_estimator)
-    : Cluster(0), node_estimator_(node_estimator) {
+    OpLevelCostEstimator* node_estimator, ReadyNodeManager* node_manager)
+    : Cluster(0), node_estimator_(node_estimator), node_manager_(node_manager) {
   devices_ = devices;
 }
 VirtualCluster::~VirtualCluster() {}
@@ -54,7 +56,7 @@ Status VirtualCluster::Run(const GraphDef& graph,
   item.graph = graph;
   item.feed = feed;
   item.fetch = fetch;
-  VirtualScheduler scheduler(&item, true, this);
+  VirtualScheduler scheduler(&item, true, this, node_manager_.get());
   TF_RETURN_IF_ERROR(scheduler.Init());
 
   if (metadata) {
@@ -96,6 +98,33 @@ Status VirtualCluster::Run(const GraphDef& graph,
   if (metadata) {
     scheduler.Summary(metadata);
   }
+
+  const std::unordered_map<string, DeviceProperties>& device = GetDevices();
+  std::unordered_map<string, int64> peak_mem_usage =
+      scheduler.GetPeakMemoryUsage();
+  for (const auto& mem_usage : peak_mem_usage) {
+    const string& device_name = mem_usage.first;
+    auto it = device.find(device_name);
+    if (it == device.end()) {
+      // It's probably the fake send/recv device. Eventually we'll need to
+      // remove this fake device to ensure proper memory accounting for
+      // multi-device settings.
+      continue;
+    }
+    const DeviceProperties& dev = it->second;
+    if (dev.memory_size() <= 0) {
+      // Available device memory unknown
+      continue;
+    }
+    int64 peak_mem = mem_usage.second;
+    if (peak_mem >= dev.memory_size()) {
+      return errors::ResourceExhausted(
+          "Graph requires ", peak_mem, " bytes of memory on device ",
+          device_name, " to run ", " but device only has ", dev.memory_size(),
+          " available.");
+    }
+  }
+
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/clusters/virtual_cluster.h b/tensorflow/core/grappler/clusters/virtual_cluster.h
index a74911cb23a3fcdb8f41de624c4e5c9a01602577..dde70bab7a391e7573560b3202e9f0f7a0d69cae 100644
--- a/tensorflow/core/grappler/clusters/virtual_cluster.h
+++ b/tensorflow/core/grappler/clusters/virtual_cluster.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <unordered_map>
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"
+#include "tensorflow/core/grappler/costs/virtual_scheduler.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"
 
 namespace tensorflow {
@@ -31,10 +32,13 @@ class VirtualCluster : public Cluster {
  public:
   VirtualCluster(const std::unordered_map<string, DeviceProperties>& devices);
   VirtualCluster(const std::unordered_map<string, DeviceProperties>& devices,
-                 OpLevelCostEstimator* node_estimator);
+                 OpLevelCostEstimator* node_estimator,
+                 ReadyNodeManager* node_manager);
 
   ~VirtualCluster() override;
 
+  string type() const override { return "virtual"; }
+
   Status Provision() override;
   Status Initialize(const GrapplerItem& item) override;
   Status Run(const GraphDef& item,
@@ -43,6 +47,7 @@ class VirtualCluster : public Cluster {
 
  private:
   std::unique_ptr<OpLevelCostEstimator> node_estimator_;
+  std::unique_ptr<ReadyNodeManager> node_manager_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/clusters/virtual_cluster_test.cc b/tensorflow/core/grappler/clusters/virtual_cluster_test.cc
index ec21f5f4260d86129b63158d0d389052a8d7e82f..357b306b93be936bc81c818ca0f2ecbefdeb351a 100644
--- a/tensorflow/core/grappler/clusters/virtual_cluster_test.cc
+++ b/tensorflow/core/grappler/clusters/virtual_cluster_test.cc
@@ -14,11 +14,14 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/cost_graph.pb.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -37,18 +40,26 @@ class VirtualClusterTest : public ::testing::Test {
     cpu_device.set_l1_cache_size(32 * 1024);
     cpu_device.set_l2_cache_size(256 * 1024);
     cpu_device.set_l3_cache_size(4 * 1024 * 1024);
+    cpu_device.set_memory_size(1024 * 1024);
     std::unordered_map<string, DeviceProperties> devices;
     devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device;
     cluster_.reset(new VirtualCluster(devices));
     TF_CHECK_OK(cluster_->Provision());
   }
 
-  void TearDown() override { cluster_.reset(); }
+  void TearDown() override {
+    TF_CHECK_OK(cluster_->Shutdown());
+    cluster_.reset();
+  }
 
  protected:
   std::unique_ptr<VirtualCluster> cluster_;
 };
 
+TEST_F(VirtualClusterTest, ClusterType) {
+  CHECK_EQ("virtual", cluster_->type());
+}
+
 TEST_F(VirtualClusterTest, CostModel) {
   TrivialTestGraphInputYielder fake_input(4, 1, 10, false,
                                           cluster_->GetDeviceNames());
@@ -91,6 +102,21 @@ TEST_F(VirtualClusterTest, CostModel) {
   }
 }
 
+TEST_F(VirtualClusterTest, OutOfMemory) {
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  // Create a large variable that can't fit in memory.
+  auto zero = ops::Variable(root.WithOpName("zero"), {1024, 1024}, DT_FLOAT);
+  auto identity = ops::Identity(root.WithOpName("i"), zero);
+  auto identity2 = ops::Identity(root.WithOpName("i2"), identity);
+  GrapplerItem item;
+  TF_CHECK_OK(root.ToGraphDef(&item.graph));
+  item.fetch.push_back("i2");
+
+  TF_CHECK_OK(cluster_->Initialize(item));
+  Status s = cluster_->Run(item.graph, item.feed, item.fetch, nullptr);
+  EXPECT_EQ(error::RESOURCE_EXHAUSTED, s.code());
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index f02cb51038a1d34475d9c13b0ca14b7137c41f35..0fe01e9c9e094ebfa7fd1e6200d775ef61775184 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -1,6 +1,10 @@
 licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cuda_library", "tf_cc_test")
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_protos_grappler",
+)
 
 filegroup(
     name = "all_files",
@@ -16,7 +20,10 @@ filegroup(
 
 filegroup(
     name = "graph_properties_testdata",
-    srcs = glob(["graph_properties_testdata/*.pbtxt"]),
+    srcs = glob([
+        "graph_properties_testdata/*.pbtxt",
+        "graph_properties_testdata/*.pbtxt.html",
+    ]),
     visibility = ["//visibility:public"],
 )
 
@@ -34,6 +41,7 @@ tf_proto_library(
     name = "op_performance_data",
     srcs = ["op_performance_data.proto"],
     cc_api_version = 2,
+    default_header = True,
     protodeps = tf_additional_all_protos(),
     visibility = ["//visibility:public"],
 )
@@ -44,14 +52,14 @@ cc_library(
     hdrs = ["graph_properties.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":op_performance_data_cc",
         ":utils",
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
-    ],
+    ] + tf_protos_grappler(),
 )
 
 tf_cc_test(
@@ -131,16 +139,15 @@ tf_cuda_library(
     hdrs = ["utils.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":op_performance_data_cc",
-        "//tensorflow/core:core_cpu_base",
+        "//third_party/eigen3",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:utils",
-        "//third_party/eigen3",
-    ],
+    ] + tf_protos_grappler(),
 )
 
 tf_cc_test(
@@ -203,9 +210,8 @@ cc_library(
     hdrs = ["op_context.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":op_performance_data_cc",
         "//tensorflow/core:protos_all_cc",
-    ],
+    ] + tf_protos_grappler(),
 )
 
 cc_library(
@@ -272,12 +278,11 @@ cc_library(
     deps = [
         ":cost_estimator",
         ":op_context",
-        ":op_performance_data_cc",
+        "//third_party/eigen3",
         "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler/clusters:utils",
-        "//third_party/eigen3",
-    ],
+    ] + tf_protos_grappler(),
 )
 
 tf_cc_test(
@@ -301,15 +306,15 @@ cc_library(
         ":cost_estimator",
         ":graph_properties",
         ":op_level_cost_estimator",
-        ":op_performance_data_cc",
         ":utils",
         ":virtual_placer",
         ":virtual_scheduler",
         "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
-    ],
+    ] + tf_protos_grappler(),
 )
 
 tf_cc_test(
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
index ca66f7c75a5ad7eb6970004da3c9a2f92c85479d..c8ba4dfbdadf50eab22ee2f4af898fe949572c66 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
@@ -34,13 +34,15 @@ AnalyticalCostEstimator::AnalyticalCostEstimator(Cluster* cluster,
                                                  bool use_static_shapes)
     : cluster_(cluster),
       node_estimator_(new OpLevelCostEstimator()),
+      node_manager_(VirtualScheduler::ReadyNodeManagerFactory("FirstReady")),
       use_static_shapes_(use_static_shapes) {}
 
 AnalyticalCostEstimator::AnalyticalCostEstimator(
     Cluster* cluster, OpLevelCostEstimator* node_estimator,
-    bool use_static_shapes)
+    ReadyNodeManager* node_manager, bool use_static_shapes)
     : cluster_(cluster),
       node_estimator_(node_estimator),
+      node_manager_(node_manager),
       use_static_shapes_(use_static_shapes) {}
 
 Status AnalyticalCostEstimator::Initialize(const GrapplerItem& item) {
@@ -61,7 +63,9 @@ Status AnalyticalCostEstimator::PredictCosts(const GraphDef& optimized_graph,
     }
   }
   std::vector<string> inaccurate_nodes;
-  VirtualScheduler scheduler(&item, use_static_shapes_, cluster_);
+  int nodes_executed = 0;
+  VirtualScheduler scheduler(&item, use_static_shapes_, cluster_,
+                             node_manager_.get());
   auto status = scheduler.Init();
   if (!status.ok()) {
     costs->execution_time = Costs::Duration::max();
@@ -70,6 +74,7 @@ Status AnalyticalCostEstimator::PredictCosts(const GraphDef& optimized_graph,
 
   Costs node_costs;
   do {
+    ++nodes_executed;
     OpContext op_context = scheduler.GetCurrNode();
     const string& op_name = op_context.name;
 
@@ -104,8 +109,7 @@ Status AnalyticalCostEstimator::PredictCosts(const GraphDef& optimized_graph,
 
   RunMetadata run_metadata;
   *costs = scheduler.Summary(&run_metadata);
-  VLOG(1) << inaccurate_nodes.size() << " out of "
-          << optimized_graph.node_size()
+  VLOG(1) << inaccurate_nodes.size() << " out of " << nodes_executed
           << " nodes have inaccurate time estimation";
   if (VLOG_IS_ON(3)) {
     for (const auto& node : inaccurate_nodes) {
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator.h b/tensorflow/core/grappler/costs/analytical_cost_estimator.h
index cf9163302c6740e16bbd8675ddebb23a365494ea..dd2738e088023ae387f269152c3ad9d33bcfd645 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/costs/cost_estimator.h"
 #include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"
+#include "tensorflow/core/grappler/costs/virtual_scheduler.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -39,9 +40,10 @@ class AnalyticalCostEstimator : public CostEstimator {
   // Does not take ownership of cluster.
   AnalyticalCostEstimator(Cluster* cluster, bool use_static_shapes);
   // Does not take ownership of the cluster, but takes ownership of the
-  // node_estimator
+  // node_estimator and the node_manager
   AnalyticalCostEstimator(Cluster* cluster,
                           OpLevelCostEstimator* node_estimator,
+                          ReadyNodeManager* node_manager,
                           bool use_static_shapes);
   ~AnalyticalCostEstimator() override {}
 
@@ -59,6 +61,7 @@ class AnalyticalCostEstimator : public CostEstimator {
   Cluster* cluster_;  // Not owned.
   GrapplerItem item_;
   std::unique_ptr<OpLevelCostEstimator> node_estimator_;
+  std::unique_ptr<ReadyNodeManager> node_manager_;
   bool use_static_shapes_;
 };
 
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
index d1f3e36aa8164c4a80537b8affc324503af5488b..f24192247113bfe91884a9c557f46cc29986ff9a 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
@@ -102,8 +102,14 @@ TEST_F(AnalyticalCostEstimatorTest, SimpleTest) {
   Costs summary;
   TF_ASSERT_OK(estimator.PredictCosts(item.graph, &cost_graph, &summary));
 
-  EXPECT_EQ(Costs::NanoSeconds(9156), summary.execution_time);
-  EXPECT_FALSE(summary.inaccurate);
+  EXPECT_EQ(Costs::NanoSeconds(9151), summary.execution_time);
+
+  // Make this estimate accurate:
+  // TODO(http://b/70031255): Accurate estimator for RandomUniform op needed
+  // TODO(http://b/70031363): Accurate estimator for Softmax needed
+  //
+  // Change to EXPECT_FALSE when the above TODOs are done:
+  EXPECT_TRUE(summary.inaccurate);
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/cost_estimator.h b/tensorflow/core/grappler/costs/cost_estimator.h
index cf9fa4fdaf947cba8c38d6eb3ca67d3a43f35d29..9e01ec5ff5b48b9f979695b0a4b7b089245145c0 100644
--- a/tensorflow/core/grappler/costs/cost_estimator.h
+++ b/tensorflow/core/grappler/costs/cost_estimator.h
@@ -40,6 +40,16 @@ struct Costs {
   // Builds a Costs structure with all zero values, rather than unknowns.
   static inline Costs ZeroCosts();
 
+  struct MilliSeconds : std::chrono::milliseconds {
+    MilliSeconds() : std::chrono::milliseconds(0) {}
+    MilliSeconds(double d) : std::chrono::milliseconds(static_cast<int64>(d)) {}
+    MilliSeconds(const std::chrono::milliseconds& d)
+        : std::chrono::milliseconds(d) {}
+    MilliSeconds& operator=(const std::chrono::milliseconds& d) {
+      std::chrono::milliseconds::operator=(d);
+      return *this;
+    }
+  };
   struct MicroSeconds : std::chrono::microseconds {
     MicroSeconds() : std::chrono::microseconds(0) {}
     MicroSeconds(double d) : std::chrono::microseconds(static_cast<int64>(d)) {}
@@ -49,6 +59,9 @@ struct Costs {
       std::chrono::microseconds::operator=(d);
       return *this;
     }
+    MilliSeconds asMilliSeconds() const {
+      return std::chrono::duration_cast<std::chrono::milliseconds>(*this);
+    }
   };
   struct NanoSeconds : std::chrono::nanoseconds {
     NanoSeconds() : std::chrono::nanoseconds(0) {}
@@ -60,9 +73,13 @@ struct Costs {
       return *this;
     }
     MicroSeconds asMicroSeconds() const {
-      std::chrono::microseconds us =
-          std::chrono::duration_cast<std::chrono::microseconds>(*this);
-      return MicroSeconds(us);
+      return std::chrono::duration_cast<std::chrono::microseconds>(*this);
+    }
+    MilliSeconds asMilliSeconds() const {
+      return std::chrono::duration_cast<std::chrono::milliseconds>(*this);
+    }
+    static NanoSeconds infinity() {
+      return NanoSeconds(std::chrono::nanoseconds::max());
     }
   };
 
@@ -71,10 +88,7 @@ struct Costs {
   typedef NanoSeconds Duration;
 
   // Overall cost of running the graph; latency.
-  // Mean
   Duration execution_time;
-  Duration min_execution_time;
-  Duration max_execution_time;
 
   // Computation cost of running the graph.
   Duration compute_time;
@@ -86,6 +100,8 @@ struct Costs {
   // requirements of a graph. For example, it might assume that all activations
   // are live for all of a graph's execution.
   int64 max_memory;  // Maximum main memory requirement in bytes over all ops.
+  int64 persistent_memory;
+  int64 temporary_memory;
 
   // These fields are used for TPU-related estimations. They are per-op
   // maximums, so each op is evaluated independently, but we want the maximum of
@@ -100,6 +116,10 @@ struct Costs {
   std::unordered_map<string, uint64> estimated_max_memory_per_device;
 };
 
+inline std::ostream& operator<<(std::ostream& os, const Costs::MilliSeconds d) {
+  os << d.count() << "ms";
+  return os;
+}
 inline std::ostream& operator<<(std::ostream& os, const Costs::MicroSeconds d) {
   os << d.count() << "us";
   return os;
@@ -114,6 +134,8 @@ Costs::Costs() {
   compute_time = Duration::zero();
   memory_time = Duration::zero();
   max_memory = kMemoryUnknown;
+  persistent_memory = kMemoryUnknown;
+  temporary_memory = kMemoryUnknown;
   max_per_op_buffers = kMemoryUnknown;
   max_per_op_streaming = kMemoryUnknown;
 }
@@ -124,6 +146,8 @@ Costs Costs::ZeroCosts() {
   costs.compute_time = Duration::zero();
   costs.memory_time = Duration::zero();
   costs.max_memory = kZeroMemory;
+  costs.persistent_memory = kZeroMemory;
+  costs.temporary_memory = kZeroMemory;
   costs.max_per_op_buffers = kZeroMemory;
   costs.max_per_op_streaming = kZeroMemory;
   return costs;
diff --git a/tensorflow/core/grappler/costs/graph_memory.cc b/tensorflow/core/grappler/costs/graph_memory.cc
index 6022c47e8f689c6d9f262caae0c5e86f4cf6fb82..3604de392f803b8b2eb65e796848c2c3ec6a90e5 100644
--- a/tensorflow/core/grappler/costs/graph_memory.cc
+++ b/tensorflow/core/grappler/costs/graph_memory.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_memory.h"
 #include <list>
 #include "tensorflow/core/framework/allocation_description.pb.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/framework/tensor_description.pb.h"
@@ -32,7 +33,17 @@ Status GraphMemory::InferStatically(
     const std::unordered_map<string, DeviceProperties>& devices) {
   VirtualCluster cluster(devices);
   TF_RETURN_IF_ERROR(cluster.Provision());
-  return InferDynamically(&cluster);
+  TF_RETURN_IF_ERROR(cluster.Initialize(item_));
+  RunMetadata metadata;
+  Status s = cluster.Run(item_.graph, item_.feed, item_.fetch, &metadata);
+  // The virtual cluster returns the RESOURCE_EXHAUSTED error when it detects
+  // that the model would run out of memory. We still get the metadata we need
+  // out of the simulation, so we just ignore this error.
+  if (!s.ok() && s.code() != error::RESOURCE_EXHAUSTED) {
+    return s;
+  }
+  InferFromTrace(metadata.step_stats());
+  return Status::OK();
 }
 
 Status GraphMemory::InferDynamically(Cluster* cluster) {
@@ -153,6 +164,8 @@ void GraphMemory::InferFromTrace(const StepStats& timeline) {
 
   NodeMap node_map(&item_.graph);
   for (const auto& dev_stats : timeline.dev_stats()) {
+    const string& device_name = dev_stats.device();
+    const bool is_gpu = (device_name.find("GPU:") || device_name.find("gpu:"));
     std::list<LiveTensor>& device_tensors =
         live_tensors_per_device[dev_stats.device()];
     for (const auto& node_stats : dev_stats.node_stats()) {
@@ -184,7 +197,24 @@ void GraphMemory::InferFromTrace(const StepStats& timeline) {
         // graph (e.g _Send/_Recv nodes).
         continue;
       }
-      for (const string& input : node->input()) {
+      std::unordered_set<int> swapped_inputs;
+      if (is_gpu) {
+        auto it = node->attr().find("_swap_to_host");
+        if (it != node->attr().end()) {
+          const AttrValue& val = it->second;
+          for (int port_id : val.list().i()) {
+            swapped_inputs.insert(port_id);
+          }
+        }
+      }
+      for (int i = 0; i < node->input_size(); ++i) {
+        if (swapped_inputs.find(i) != swapped_inputs.end()) {
+          // The memory of swapped inputs will be released as early as possible:
+          // therefore ignore this input when determining the deallocation time
+          // of the tensor.
+          continue;
+        }
+        const string& input = node->input(i);
         int position;
         string input_node = ParseNodeName(input, &position);
         if (position < 0) {
diff --git a/tensorflow/core/grappler/costs/graph_memory_test.cc b/tensorflow/core/grappler/costs/graph_memory_test.cc
index 6f3522b068bdb74eb98d3e6071d4d4b2e21c9ff6..95170ba49b77ef1be629cfa77bc4a333d2315e4f 100644
--- a/tensorflow/core/grappler/costs/graph_memory_test.cc
+++ b/tensorflow/core/grappler/costs/graph_memory_test.cc
@@ -134,6 +134,62 @@ TEST_F(GraphMemoryTest, MultiDevice) {
   EXPECT_EQ(gpu_expected, gpu_tensors);
 }
 
+TEST_F(GraphMemoryTest, GpuSwapping) {
+  TrivialTestGraphInputYielder fake_input(4, 2, 1024 * 1024, false, {"/GPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+  item.feed.clear();
+
+  {
+    // Estimate the max memory usage for the graph.
+    GraphMemory memory(item);
+    Status s = memory.InferStatically(devices_);
+    TF_CHECK_OK(s);
+
+    const GraphMemory::MemoryUsage& gpu_mem =
+        memory.GetPeakMemoryUsage("/GPU:0");
+    EXPECT_EQ(20971520, gpu_mem.used_memory);
+    std::set<string> gpu_tensors;
+    for (const auto& t : gpu_mem.live_tensors) {
+      gpu_tensors.insert(strings::StrCat(t.node, ":", t.output_id));
+    }
+    std::set<string> gpu_expected;
+    gpu_expected.insert("Square:0");
+    gpu_expected.insert("Square_1:0");
+    gpu_expected.insert("AddN:0");
+    gpu_expected.insert("AddN_1:0");
+    gpu_expected.insert("AddN_2:0");
+    EXPECT_EQ(gpu_expected, gpu_tensors);
+  }
+
+  {
+    // Swap the first input to node AddN_1: its fanin (the square nodes) should
+    // not appear in the max cut anymore.
+    for (auto& node : *item.graph.mutable_node()) {
+      if (node.name() == "AddN_1") {
+        (*node.mutable_attr())["_swap_to_host"].mutable_list()->add_i(0);
+      }
+    }
+    GraphMemory memory(item);
+    Status s = memory.InferStatically(devices_);
+    TF_CHECK_OK(s);
+    const GraphMemory::MemoryUsage& new_gpu_mem =
+        memory.GetPeakMemoryUsage("/GPU:0");
+    EXPECT_EQ(20971520, new_gpu_mem.used_memory);
+    std::set<string> new_gpu_tensors;
+    for (const auto& t : new_gpu_mem.live_tensors) {
+      new_gpu_tensors.insert(strings::StrCat(t.node, ":", t.output_id));
+    }
+    std::set<string> new_gpu_expected;
+    new_gpu_expected.insert("AddN:0");
+    new_gpu_expected.insert("AddN_1:0");
+    new_gpu_expected.insert("AddN_2:0");
+    new_gpu_expected.insert("AddN_3:0");
+    new_gpu_expected.insert("AddN_4:0");
+    EXPECT_EQ(new_gpu_expected, new_gpu_tensors);
+  }
+}
+
 TEST_F(GraphMemoryTest, CtrlDependencies) {
   // Build a simple graph with a control dependency.
   Scope s = Scope::NewRootScope();
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index dd389de636088f11da92fcd33ec13c305404ffb8..243ca9121c70d91631b474da62281bc56a476d8a 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/grappler/costs/utils.h"
+#include "tensorflow/core/grappler/utils.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -264,6 +265,87 @@ bool IsEnterWithQueue(const Node& node) {
   return false;
 }
 
+bool HasAnyUnknownDimensions(const TensorShapeProto& proto) {
+  if (proto.unknown_rank()) {
+    return true;
+  }
+  for (const auto& dim : proto.dim()) {
+    if (dim.size() < 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
+void VerboseLogUnknownDimensionSources(
+    const Graph& graph,
+    const std::map<string, std::vector<OpInfo::TensorProperties>>&
+        input_properties_map,
+    const std::map<string, std::vector<OpInfo::TensorProperties>>&
+        output_properties_map) {
+  if (!VLOG_IS_ON(2)) {
+    return;
+  }
+
+  VLOG(2) << "Nodes with known inputs, but with unknown output dimensions:";
+
+  // Find all nodes in the graph for which we
+  // do not have any unknown dimensions in their inputs, but
+  // we have some unknown dimensions in their outputs.
+  std::map<string, int> op_to_count;
+  for (const Node* const node : graph.nodes()) {
+    if (node->num_outputs() == 0) {
+      continue;
+    }
+
+    const auto& input_properties = input_properties_map.at(node->name());
+    const auto& output_properties = output_properties_map.at(node->name());
+
+    bool has_unknown_inputs = false;
+    for (int i = 0; i < node->num_inputs(); ++i) {
+      if (HasAnyUnknownDimensions(input_properties[i].shape())) {
+        has_unknown_inputs = true;
+        break;
+      }
+    }
+
+    if (has_unknown_inputs) {
+      continue;
+    }
+
+    for (int i = 0; i < node->num_outputs(); ++i) {
+      if (HasAnyUnknownDimensions(output_properties[i].shape())) {
+        string inputs = "input_shapes=[";
+        for (int i = 0; i < node->num_inputs(); ++i) {
+          inputs +=
+              PartialTensorShape::DebugString(input_properties[i].shape());
+        }
+        inputs += "]";
+
+        string outputs = "output_shapes=[";
+        for (int i = 0; i < node->num_outputs(); ++i) {
+          outputs +=
+              PartialTensorShape::DebugString(output_properties[i].shape());
+        }
+        outputs += "]";
+
+        VLOG(2) << "Node: " << node->name() << ", Op: " << node->def().op()
+                << ", " << inputs << ", " << outputs;
+
+        op_to_count[node->def().op()]++;
+
+        // don't log again for this node
+        break;
+      }
+    }
+  }
+  VLOG(2) << "Op types with known inputs, but with unknown output dimensions "
+          << "(format: <op_type> (<count>)):";
+  for (const auto& p : op_to_count) {
+    VLOG(2) << p.first << " (" << p.second << ")";
+  }
+}
+
 }  // namespace
 
 // Queue of nodes to process. Nodes can be enqueued in any order, but will be
@@ -290,7 +372,7 @@ class TopoQueue {
   // use their id to ensure they're sorted topologically.
   struct CompareNodes {
     bool operator()(const Node* lhs, const Node* rhs) const {
-      return lhs->id() > rhs->id();
+      return lhs->id() < rhs->id();
     }
   };
   std::set<const Node*, CompareNodes> queue_;
@@ -312,9 +394,15 @@ class SymbolicShapeRefiner {
   Status UpdateNode(const Node* node, bool relax, bool* refined) {
     return shape_refiner_->UpdateNode(node, relax, refined);
   }
-  Status SetShape(const Node* node, int output_port,
-                  shape_inference::ShapeHandle shape) {
-    return shape_refiner_->SetShape(node, output_port, shape);
+  Status SetUnknownShape(const Node* node, int output_port) {
+    shape_inference::ShapeHandle shape =
+        GetUnknownOutputShape(node, output_port);
+    InferenceContext* ctx = GetContext(node);
+    if (ctx == nullptr) {
+      return errors::InvalidArgument("Missing context");
+    }
+    ctx->set_output(output_port, shape);
+    return Status::OK();
   }
 
   struct ShapeId {
@@ -605,6 +693,10 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
   InferenceContext* c = shape_refiner->GetContext(node);
   CHECK_NE(c, nullptr);
 
+  ShapeHandle out1;
+  TF_RETURN_IF_ERROR(c->WithRank(c->output(1), 0, &out1));
+  c->set_output(1, out1);
+
   ShapeHandle out;
   bool out_initialized = false;
   for (const Edge* e : node->in_edges()) {
@@ -639,13 +731,29 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
 
   if (!shape_refiner->EquivalentShapes(out, c->output(0))) {
     c->set_output(0, out);
-    c->set_output(1, c->Scalar());
     new_shapes->push(node);
   }
 
   return Status::OK();
 }
 
+Status GraphProperties::OverwriteFedPorts(
+    SymbolicShapeRefiner* shape_refiner,
+    const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
+    const Node* node, TopoQueue* new_shapes) const {
+  auto it = fed_ports.find(node->name());
+  Status status;
+  if (it != fed_ports.end()) {
+    // It is possible to feed node output ports with tensors of any shape: as a
+    // result, the shape of a fed port is completely unknown.
+    for (const int output_port : it->second) {
+      status.Update(shape_refiner->SetUnknownShape(node, output_port));
+    }
+    new_shapes->push(node);
+  }
+  return status;
+}
+
 // Manually propagate the input shape for Enter nodes and update any Merge node
 // outputs.
 Status GraphProperties::UpdateEnter(SymbolicShapeRefiner* shape_refiner,
@@ -673,9 +781,10 @@ Status GraphProperties::UpdateEnter(SymbolicShapeRefiner* shape_refiner,
   return Status::OK();
 }
 
-Status GraphProperties::UpdateShapes(SymbolicShapeRefiner* shape_refiner,
-                                     bool relax, const Node* n,
-                                     TopoQueue* new_shapes) {
+Status GraphProperties::UpdateShapes(
+    SymbolicShapeRefiner* shape_refiner, bool relax,
+    const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
+    const Node* n, TopoQueue* new_shapes) const {
   if (n->IsEnter()) {
     // The Enter shape function always forwards an UnknownShape, so do the right
     // thing here.
@@ -695,7 +804,9 @@ Status GraphProperties::UpdateShapes(SymbolicShapeRefiner* shape_refiner,
       }
     }
   }
-  return Status::OK();
+  // Nodes can be fed with any shape. The TensorFlow shape inference code can't
+  // handle this properly, so overwrite its behavior here.
+  return OverwriteFedPorts(shape_refiner, fed_ports, n, new_shapes);
 }
 
 // Propagates the shapes in the transitive fan-out of <new_shapes>.
@@ -703,6 +814,7 @@ Status GraphProperties::PropagateShapes(
     SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes,
     const std::unordered_map<const Node*, std::unordered_set<const Node*>>&
         resources,
+    const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
     int num_loops) const {
   // Limit the number of iterations to prevent infinite loops in the presence of
   // incorrect shape functions. The algoritm should converge in at most
@@ -728,8 +840,8 @@ Status GraphProperties::PropagateShapes(
       for (const Edge* e : n->out_edges()) {
         if (!e->IsControlEdge()) {
           const Node* fanout = e->dst();
-          TF_RETURN_IF_ERROR(
-              UpdateShapes(shape_refiner, relax, fanout, new_shapes));
+          TF_RETURN_IF_ERROR(UpdateShapes(shape_refiner, relax, fed_ports,
+                                          fanout, new_shapes));
         }
       }
     }
@@ -744,6 +856,10 @@ Status GraphProperties::PropagateShapes(
   } while (!new_shapes->empty() &&
            num_resource_iterations++ < max_resource_iterations);
 
+  if (!new_shapes->empty()) {
+    return errors::Internal("Shape inference failed to converge");
+  }
+
   return Status::OK();
 }
 
@@ -803,7 +919,7 @@ Status GraphProperties::UpdateResource(
   return Status::OK();
 }
 
-Status GraphProperties::InferStatically() {
+Status GraphProperties::InferStatically(bool assume_valid_feeds) {
   Graph graph(OpRegistry::Global());
   FunctionLibraryDefinition function_library(graph.op_registry(),
                                              item_.graph.library());
@@ -820,11 +936,21 @@ Status GraphProperties::InferStatically() {
   Status s = ImportGraphDef(options, item_.graph, &graph, &shape_refiner);
   TF_RETURN_IF_ERROR(s);
 
+  std::unordered_map<string, std::unordered_set<int>> fed_ports;
+  if (!assume_valid_feeds) {
+    for (const auto& feed : item_.feed) {
+      int port_index = 0;
+      string node_name = ParseNodeName(feed.first, &port_index);
+      fed_ports[node_name].insert(port_index);
+    }
+  }
+
   // List the resources and the nodes using them. Also collect the Enter and
   // Merge nodes.
   std::unordered_map<const Node*, std::unordered_set<const Node*>> resources;
   std::unordered_set<const Node*> enter_nodes;
   std::unordered_set<const Node*> merge_nodes;
+  std::unordered_set<const Node*> fed_nodes;
   int num_loops = 0;
   for (const Node* const node : graph.nodes()) {
     for (int i = 0; i < node->num_inputs(); ++i) {
@@ -841,6 +967,9 @@ Status GraphProperties::InferStatically() {
     } else if (node->IsNextIteration()) {
       ++num_loops;
     }
+    if (fed_ports.find(node->name()) != fed_ports.end()) {
+      fed_nodes.insert(node);
+    }
   }
 
   SymbolicShapeRefiner refiner(&shape_refiner);
@@ -855,15 +984,22 @@ Status GraphProperties::InferStatically() {
     // Force the propagation of shapes of Enter nodes manually (the Enter shape
     // function always forwards an UnknownShape).
     for (const Node* node : enter_nodes) {
-      TF_RETURN_IF_ERROR(UpdateShapes(&refiner, relax, node, &new_shapes));
+      TF_RETURN_IF_ERROR(
+          UpdateShapes(&refiner, relax, fed_ports, node, &new_shapes));
     }
     // Seed the propagation of shapes through merge nodes.
     for (const Node* node : merge_nodes) {
-      TF_RETURN_IF_ERROR(UpdateShapes(&refiner, relax, node, &new_shapes));
+      TF_RETURN_IF_ERROR(
+          UpdateShapes(&refiner, relax, fed_ports, node, &new_shapes));
+    }
+    // Also seed the propagation of shapes in the fanout of fed nodes.
+    for (const Node* node : fed_nodes) {
+      TF_RETURN_IF_ERROR(
+          OverwriteFedPorts(&refiner, fed_ports, node, &new_shapes));
     }
     // Propagate shapes normally.
-    TF_RETURN_IF_ERROR(
-        PropagateShapes(&refiner, relax, &new_shapes, resources, num_loops));
+    TF_RETURN_IF_ERROR(PropagateShapes(&refiner, relax, &new_shapes, resources,
+                                       fed_ports, num_loops));
   }
 
   // Track shapes globally across the graph.
@@ -874,6 +1010,10 @@ Status GraphProperties::InferStatically() {
     if (!node_ctx) {
       continue;
     }
+    // Skip any information that comes from fed nodes.
+    if (fed_ports.find(node->name()) != fed_ports.end()) {
+      continue;
+    }
     for (const auto& merged_shapes : node_ctx->MergedShapes()) {
       if (!shape_manager.Merge(merged_shapes.first, merged_shapes.second)
                .ok()) {
@@ -896,7 +1036,7 @@ Status GraphProperties::InferStatically() {
   }
 
   for (const Node* const node : graph.nodes()) {
-    VLOG(1) << "<Node> " << node->name();
+    VLOG(3) << "Filling in graph properties for node: " << node->name();
     auto ctx = shape_refiner.GetContext(node);
     if (!ctx) {
       continue;
@@ -948,6 +1088,10 @@ Status GraphProperties::InferStatically() {
     }
   }
 
+  // Help trace the unknown dimensions to their origins.
+  VerboseLogUnknownDimensionSources(graph, input_properties_,
+                                    output_properties_);
+
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h
index 95bc5044d0a64d72daccba9a8377ffb73147e649..6fc53a7f2e7da7bae7b6f49c7b32291c981fef53 100644
--- a/tensorflow/core/grappler/costs/graph_properties.h
+++ b/tensorflow/core/grappler/costs/graph_properties.h
@@ -34,12 +34,19 @@ class TopoQueue;
 // nodes, and potentially a set of nodes to feed.
 class GraphProperties {
  public:
-  // Factory method for creating a GrapplerShapes from a MetaGraphDef.
-  // Returns nullptr if the given meta_graph cannot be converted.
   explicit GraphProperties(const GrapplerItem& item) : item_(item) {}
 
-  Status InferStatically();
+  // Infer the shapes through abstract interpretation. Feed information can be
+  // incorrect so it should be discarded to ensure correctness of the analysis.
+  // However, it can help infer shapes in the fanout of fed nodes (even though
+  // the correctness of these shapes can't be guaranteed), so in some cases
+  // (such as simulation or scheduling) it makes sense of keep these shapes.
+  Status InferStatically(bool assume_valid_feeds);
+  // Infer the shape by running the graph on the specified cluster and recording
+  // the shapes of the processed tensors.
   Status InferDynamically(Cluster* cluster);
+  // Extract the properties from a cost graph. For testing only since there is
+  // no way to ensure that the cost graph match the item.
   Status InferFromCostGraph(const CostGraphDef& cost_graph);
 
   // Stores `item_.graph` with the inferred output shapes to `output_graph_def`.
@@ -65,12 +72,6 @@ class GraphProperties {
       OpInfo::TensorProperties*);
 
  private:
-  // Inputs
-  GrapplerItem item_;
-  std::map<string, std::vector<OpInfo::TensorProperties>> input_properties_;
-  std::map<string, std::vector<OpInfo::TensorProperties>> output_properties_;
-  const std::vector<OpInfo::TensorProperties> missing_properties_;
-
   // Merges shapes <shapes_and_types>, determined from an EnqueueV2 node, into
   // <*queue_shapes_and_types>.
   static Status MergeEnqueueShapesAndTypes(
@@ -99,17 +100,31 @@ class GraphProperties {
   static Status UpdateEnter(SymbolicShapeRefiner* shape_refiner,
                             const Node* node, bool relax,
                             TopoQueue* new_shapes);
+  // Process a node that is used to feed the model.
+  Status OverwriteFedPorts(
+      SymbolicShapeRefiner* shape_refiner,
+      const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
+      const Node* node, TopoQueue* new_shapes) const;
   // Update the shapes for node 'n'. If output shapes for n have changed,
   // enqueue its fanout in 'new_shapes'.
-  static Status UpdateShapes(SymbolicShapeRefiner* shape_refiner, bool relax,
-                             const Node* n, TopoQueue* new_shapes);
+  Status UpdateShapes(
+      SymbolicShapeRefiner* shape_refiner, bool relax,
+      const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
+      const Node* n, TopoQueue* new_shapes) const;
   // Propagate the shapes for the nodes enqueued in new_shapes and their
   // transitive fanout until a fixed point is reached.
   Status PropagateShapes(
       SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes,
       const std::unordered_map<const Node*, std::unordered_set<const Node*>>&
           resources,
+      const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
       int num_loops) const;
+
+  // Data members
+  GrapplerItem item_;
+  std::map<string, std::vector<OpInfo::TensorProperties>> input_properties_;
+  std::map<string, std::vector<OpInfo::TensorProperties>> output_properties_;
+  const std::vector<OpInfo::TensorProperties> missing_properties_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index c11af5777a1175eaa9b8c0262808e666f1c056d7..5012069118fbe0b3d90d2e99690b2988c45a2843 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -43,7 +43,10 @@ class GraphPropertiesTest : public ::testing::Test {
     TF_CHECK_OK(cluster_->Provision());
   }
 
-  void TearDown() override { cluster_.reset(); }
+  void TearDown() override {
+    TF_CHECK_OK(cluster_->Shutdown());
+    cluster_.reset();
+  }
 
  protected:
   // Returns a string form of <p>, suitable for comparing type and shape.
@@ -73,7 +76,7 @@ TEST_F(GraphPropertiesTest, StaticProperties) {
   CHECK(fake_input.NextItem(&item));
 
   GraphProperties properties(item);
-  Status s = properties.InferStatically();
+  Status s = properties.InferStatically(true);
   TF_CHECK_OK(s);
 
   for (const auto& node : item.graph.node()) {
@@ -179,7 +182,7 @@ TEST_F(GraphPropertiesTest, Variables) {
 
   {
     GraphProperties static_properties(item);
-    TF_CHECK_OK(static_properties.InferStatically());
+    TF_CHECK_OK(static_properties.InferStatically(false));
 
     const auto props = static_properties.GetOutputProperties("Var");
     EXPECT_EQ(1, props.size());
@@ -219,7 +222,7 @@ TEST_F(GraphPropertiesTest, VarHandles) {
                   .Finalize(item.graph.add_node()));
 
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   const auto props = properties.GetOutputProperties("VarRead");
   EXPECT_EQ(1, props.size());
@@ -286,7 +289,7 @@ TEST_F(GraphPropertiesTest, Queues) {
   TF_CHECK_OK(root.ToGraphDef(&item.graph));
 
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   const auto props1 = properties.GetOutputProperties("Dequeue1");
   ASSERT_EQ(1, props1.size());
@@ -335,7 +338,7 @@ TEST_F(GraphPropertiesTest, MergeWithoutLoops) {
                                  "merge_without_loops.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   std::vector<string> nodes{"cond/Merge", "cond/concat", "cond/concat_1"};
   std::vector<string> expected_outputs{"float: [-1,-1,1]", "float: [2,1,1]",
@@ -377,7 +380,7 @@ TEST_F(GraphPropertiesTest, WhileLoop) {
                                  "while_loop.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   std::vector<string> nodes{"while/Merge_1", "while/NextIteration_1",
                             "while/Exit_1"};
@@ -435,7 +438,7 @@ TEST_F(GraphPropertiesTest, NestedLoop) {
                                  "nested_loop.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   std::vector<string> outer_nodes{"while/Merge_1", "while/NextIteration_1",
                                   "while/Exit_1"};
@@ -498,7 +501,7 @@ TEST_F(GraphPropertiesTest, LoopsAndQueues) {
                                  "loops_and_queues.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   std::vector<string> outer_nodes{"while/Merge_1", "while/NextIteration_1",
                                   "while/Exit_1"};
@@ -556,7 +559,7 @@ TEST_F(GraphPropertiesTest, LoopsAndResourceVars) {
                                  "loops_and_resource_vars.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   std::vector<string> outer_nodes{"while/Merge_1", "while/NextIteration_1",
                                   "while/Exit_1"};
@@ -608,7 +611,7 @@ TEST_F(GraphPropertiesTest, QueuesAndLoops) {
                                  "queues_and_loops.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   std::vector<string> nodes{"while/Merge_1", "while/NextIteration_1",
                             "while/Exit_1"};
@@ -657,7 +660,7 @@ TEST_F(GraphPropertiesTest, InferRestoreOpShape) {
   item.fetch.push_back("init_restore");
 
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   const auto restore_props = properties.GetOutputProperties("restore");
   const OpInfo::TensorProperties& restore_prop = restore_props[0];
@@ -704,7 +707,7 @@ TEST_F(GraphPropertiesTest, InferRestoreOpShape_WithTwoNodesShareSameOutput) {
   item.fetch.push_back("init2");
 
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   const auto props = properties.GetOutputProperties("restore");
   const OpInfo::TensorProperties& prop = props[0];
@@ -732,7 +735,7 @@ TEST_F(GraphPropertiesTest, FunctionStaticShapeInference) {
                                  "simple_function.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
   const auto props = properties.GetOutputProperties("MyAdd_55e046a8_1");
   const OpInfo::TensorProperties& prop = props[0];
   EXPECT_EQ(DT_FLOAT, prop.dtype());
@@ -740,6 +743,10 @@ TEST_F(GraphPropertiesTest, FunctionStaticShapeInference) {
   EXPECT_EQ(2, prop.shape().dim_size());
   EXPECT_EQ(1, prop.shape().dim(0).size());
   EXPECT_EQ(2, prop.shape().dim(1).size());
+
+  PartialTensorShape shape(prop.shape());
+  EXPECT_TRUE(shape.IsFullyDefined());
+  EXPECT_FALSE(shape.unknown_rank());
 }
 
 TEST_F(GraphPropertiesTest, SymbolicShapes) {
@@ -766,7 +773,7 @@ TEST_F(GraphPropertiesTest, SymbolicShapes) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
   const auto shape_a = properties.GetOutputProperties("a").at(0).shape();
   const auto shape_c = properties.GetOutputProperties("c").at(0).shape();
   EXPECT_EQ(2, shape_a.dim_size());
@@ -776,6 +783,10 @@ TEST_F(GraphPropertiesTest, SymbolicShapes) {
   EXPECT_GE(-2, shape_a.dim(1).size());
   EXPECT_EQ(shape_a.dim(1).size(), shape_c.dim(1).size());
 
+  PartialTensorShape shape(shape_a);
+  EXPECT_FALSE(shape.IsFullyDefined());
+  EXPECT_FALSE(shape.unknown_rank());
+
   const auto shape_b = properties.GetOutputProperties("b").at(0).shape();
   const auto shape_d = properties.GetOutputProperties("d").at(0).shape();
   EXPECT_EQ(1, shape_b.dim_size());
@@ -822,7 +833,7 @@ TEST_F(GraphPropertiesTest, DoNotValidateColocationConstraints) {
   GraphProperties properties(item);
   // This function should return OK, since it doesn't validate the colocation
   // constraints internally.
-  TF_EXPECT_OK(properties.InferStatically());
+  TF_EXPECT_OK(properties.InferStatically(false));
 }
 
 TEST_F(GraphPropertiesTest, ShapeTracking) {
@@ -842,7 +853,7 @@ TEST_F(GraphPropertiesTest, ShapeTracking) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
   const auto shape_a = properties.GetOutputProperties("a").at(0).shape();
   const auto shape_b = properties.GetOutputProperties("b").at(0).shape();
   const auto shape_o1 = properties.GetOutputProperties("o1").at(0).shape();
@@ -851,6 +862,76 @@ TEST_F(GraphPropertiesTest, ShapeTracking) {
   EXPECT_EQ(shape_b.DebugString(), shape_o2.DebugString());
 }
 
+TEST_F(GraphPropertiesTest, FedNodes) {
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false,
+                                          cluster_->GetDeviceNames());
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  {
+    // Conservative shape analysis: the shape of fed ports should be unknown
+    GraphProperties properties(item);
+    Status s = properties.InferStatically(false);
+    TF_CHECK_OK(s);
+    for (const auto& node : item.graph.node()) {
+      if (node.op() == "Const") {
+        continue;
+      }
+      const auto in_props = properties.GetInputProperties(node.name());
+      EXPECT_EQ(1, in_props.size());
+      const OpInfo::TensorProperties& in_prop = in_props[0];
+      const auto out_props = properties.GetOutputProperties(node.name());
+      EXPECT_EQ(1, out_props.size());
+      const OpInfo::TensorProperties& out_prop = out_props[0];
+
+      if (node.name() == "x") {
+        // x is fed: its input should have a known shape, while its output
+        // doesn't
+        EXPECT_FALSE(in_prop.shape().unknown_rank());
+        EXPECT_EQ(1, in_prop.shape().dim_size());
+        EXPECT_EQ(2, in_prop.shape().dim(0).size());
+        EXPECT_TRUE(out_prop.shape().unknown_rank());
+      } else if (node.op() == "Square" || node.op() == "AddN") {
+        // These nodes are in the fanout of x: their shapes should be unknown.
+        EXPECT_TRUE(in_prop.shape().unknown_rank());
+        EXPECT_TRUE(out_prop.shape().unknown_rank());
+      }
+    }
+  }
+  {
+    // Optimistic shape analysis: the shape of fed ports should be derived from
+    // the shape of the fanin.
+    GraphProperties properties(item);
+    Status s = properties.InferStatically(true);
+    TF_CHECK_OK(s);
+    for (const auto& node : item.graph.node()) {
+      if (node.op() == "Square" || node.op() == "AddN") {
+        const auto in_props = properties.GetInputProperties(node.name());
+        EXPECT_EQ(1, in_props.size());
+        const OpInfo::TensorProperties& in_prop = in_props[0];
+        EXPECT_EQ(DT_FLOAT, in_prop.dtype());
+        EXPECT_FALSE(in_prop.shape().unknown_rank());
+        EXPECT_EQ(2, in_prop.shape().dim_size());
+        const auto out_props = properties.GetOutputProperties(node.name());
+        EXPECT_EQ(1, out_props.size());
+        const OpInfo::TensorProperties& out_prop = out_props[0];
+        EXPECT_EQ(in_prop.DebugString(), out_prop.DebugString());
+      }
+    }
+  }
+}
+
+TEST_F(GraphPropertiesTest, Performance) {
+  // Load a large graph with many nested loops to make sure we can infer shapes
+  // quickly.
+  GrapplerItem item;
+  string filename = io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataPath,
+                                 "large_graph.pbtxt.html");
+  TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically(false));
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/graph_properties_testdata/large_graph.pbtxt.html b/tensorflow/core/grappler/costs/graph_properties_testdata/large_graph.pbtxt.html
new file mode 100644
index 0000000000000000000000000000000000000000..efc642ed52908f5e28a0aaca34bd32645a2366ff
--- /dev/null
+++ b/tensorflow/core/grappler/costs/graph_properties_testdata/large_graph.pbtxt.html
@@ -0,0 +1,255137 @@
+node {
+  name: "transcript_input"
+  op: "Placeholder"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+      }
+    }
+  }
+}
+node {
+  name: "Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: ""
+      }
+    }
+  }
+}
+node {
+  name: "speaker_input"
+  op: "PlaceholderWithDefault"
+  input: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+      }
+    }
+  }
+}
+node {
+  name: "Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: ""
+      }
+    }
+  }
+}
+node {
+  name: "vui_input"
+  op: "PlaceholderWithDefault"
+  input: "Const_1"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+      }
+    }
+  }
+}
+node {
+  name: "Const_2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "waveform_input"
+  op: "PlaceholderWithDefault"
+  input: "Const_2"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Const_3"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "waveform_length_input"
+  op: "PlaceholderWithDefault"
+  input: "Const_3"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+      }
+    }
+  }
+}
+node {
+  name: "ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "ExpandDims"
+  op: "ExpandDims"
+  input: "transcript_input"
+  input: "ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "transcript_batch_input"
+  op: "PlaceholderWithDefault"
+  input: "ExpandDims"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+      }
+    }
+  }
+}
+node {
+  name: "ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "ExpandDims_1"
+  op: "ExpandDims"
+  input: "speaker_input"
+  input: "ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "speaker_batch_input"
+  op: "PlaceholderWithDefault"
+  input: "ExpandDims_1"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+      }
+    }
+  }
+}
+node {
+  name: "ExpandDims_2/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "ExpandDims_2"
+  op: "ExpandDims"
+  input: "vui_input"
+  input: "ExpandDims_2/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "vui_batch_input"
+  op: "PlaceholderWithDefault"
+  input: "ExpandDims_2"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+      }
+    }
+  }
+}
+node {
+  name: "ExpandDims_3/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "ExpandDims_3"
+  op: "ExpandDims"
+  input: "waveform_input"
+  input: "ExpandDims_3/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "waveform_batch_input"
+  op: "PlaceholderWithDefault"
+  input: "ExpandDims_3"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+        dim {
+          size: -1
+        }
+      }
+    }
+  }
+}
+node {
+  name: "ExpandDims_4/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "ExpandDims_4"
+  op: "ExpandDims"
+  input: "waveform_length_input"
+  input: "ExpandDims_4/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "waveform_length_batch_input"
+  op: "PlaceholderWithDefault"
+  input: "ExpandDims_4"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Shape"
+  op: "Shape"
+  input: "transcript_batch_input"
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Shape_1"
+  op: "Shape"
+  input: "vui_batch_input"
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "assert_equal/Equal"
+  op: "Equal"
+  input: "Shape"
+  input: "Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "assert_equal/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal/All"
+  op: "All"
+  input: "assert_equal/Equal"
+  input: "assert_equal/Const"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "assert_equal/Assert/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: ""
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal/Assert/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "Condition x == y did not hold element-wise:"
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal/Assert/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "x (Shape:0) = "
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal/Assert/Const_3"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "y (Shape_1:0) = "
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal/Assert/Assert/data_0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: ""
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal/Assert/Assert/data_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "Condition x == y did not hold element-wise:"
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal/Assert/Assert/data_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "x (Shape:0) = "
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal/Assert/Assert/data_4"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "y (Shape_1:0) = "
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal/Assert/Assert"
+  op: "Assert"
+  input: "assert_equal/All"
+  input: "assert_equal/Assert/Assert/data_0"
+  input: "assert_equal/Assert/Assert/data_1"
+  input: "assert_equal/Assert/Assert/data_2"
+  input: "Shape"
+  input: "assert_equal/Assert/Assert/data_4"
+  input: "Shape_1"
+  attr {
+    key: "T"
+    value {
+      list {
+        type: DT_STRING
+        type: DT_STRING
+        type: DT_STRING
+        type: DT_INT32
+        type: DT_STRING
+        type: DT_INT32
+      }
+    }
+  }
+  attr {
+    key: "summarize"
+    value {
+      i: 3
+    }
+  }
+}
+node {
+  name: "Shape_2"
+  op: "Shape"
+  input: "transcript_batch_input"
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Shape_3"
+  op: "Shape"
+  input: "waveform_length_batch_input"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "assert_equal_1/Equal"
+  op: "Equal"
+  input: "Shape_2"
+  input: "Shape_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "assert_equal_1/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_1/All"
+  op: "All"
+  input: "assert_equal_1/Equal"
+  input: "assert_equal_1/Const"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "assert_equal_1/Assert/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: ""
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_1/Assert/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "Condition x == y did not hold element-wise:"
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_1/Assert/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "x (Shape_2:0) = "
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_1/Assert/Const_3"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "y (Shape_3:0) = "
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_1/Assert/Assert/data_0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: ""
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_1/Assert/Assert/data_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "Condition x == y did not hold element-wise:"
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_1/Assert/Assert/data_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "x (Shape_2:0) = "
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_1/Assert/Assert/data_4"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "y (Shape_3:0) = "
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_1/Assert/Assert"
+  op: "Assert"
+  input: "assert_equal_1/All"
+  input: "assert_equal_1/Assert/Assert/data_0"
+  input: "assert_equal_1/Assert/Assert/data_1"
+  input: "assert_equal_1/Assert/Assert/data_2"
+  input: "Shape_2"
+  input: "assert_equal_1/Assert/Assert/data_4"
+  input: "Shape_3"
+  attr {
+    key: "T"
+    value {
+      list {
+        type: DT_STRING
+        type: DT_STRING
+        type: DT_STRING
+        type: DT_INT32
+        type: DT_STRING
+        type: DT_INT32
+      }
+    }
+  }
+  attr {
+    key: "summarize"
+    value {
+      i: 3
+    }
+  }
+}
+node {
+  name: "Shape_4"
+  op: "Shape"
+  input: "transcript_batch_input"
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Shape_5"
+  op: "Shape"
+  input: "waveform_batch_input"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice"
+  op: "StridedSlice"
+  input: "Shape_5"
+  input: "strided_slice/stack"
+  input: "strided_slice/stack_1"
+  input: "strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "assert_equal_2/Equal"
+  op: "Equal"
+  input: "Shape_4"
+  input: "strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "assert_equal_2/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_2/All"
+  op: "All"
+  input: "assert_equal_2/Equal"
+  input: "assert_equal_2/Const"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "assert_equal_2/Assert/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: ""
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_2/Assert/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "Condition x == y did not hold element-wise:"
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_2/Assert/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "x (Shape_4:0) = "
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_2/Assert/Const_3"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "y (strided_slice:0) = "
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_2/Assert/Assert/data_0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: ""
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_2/Assert/Assert/data_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "Condition x == y did not hold element-wise:"
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_2/Assert/Assert/data_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "x (Shape_4:0) = "
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_2/Assert/Assert/data_4"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "y (strided_slice:0) = "
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_2/Assert/Assert"
+  op: "Assert"
+  input: "assert_equal_2/All"
+  input: "assert_equal_2/Assert/Assert/data_0"
+  input: "assert_equal_2/Assert/Assert/data_1"
+  input: "assert_equal_2/Assert/Assert/data_2"
+  input: "Shape_4"
+  input: "assert_equal_2/Assert/Assert/data_4"
+  input: "strided_slice"
+  attr {
+    key: "T"
+    value {
+      list {
+        type: DT_STRING
+        type: DT_STRING
+        type: DT_STRING
+        type: DT_INT32
+        type: DT_STRING
+        type: DT_INT32
+      }
+    }
+  }
+  attr {
+    key: "summarize"
+    value {
+      i: 3
+    }
+  }
+}
+node {
+  name: "Shape_6"
+  op: "Shape"
+  input: "transcript_batch_input"
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Shape_7"
+  op: "Shape"
+  input: "speaker_batch_input"
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "assert_equal_3/Equal"
+  op: "Equal"
+  input: "Shape_6"
+  input: "Shape_7"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "assert_equal_3/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_3/All"
+  op: "All"
+  input: "assert_equal_3/Equal"
+  input: "assert_equal_3/Const"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "assert_equal_3/Assert/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: ""
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_3/Assert/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "Condition x == y did not hold element-wise:"
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_3/Assert/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "x (Shape_6:0) = "
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_3/Assert/Const_3"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "y (Shape_7:0) = "
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_3/Assert/Assert/data_0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: ""
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_3/Assert/Assert/data_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "Condition x == y did not hold element-wise:"
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_3/Assert/Assert/data_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "x (Shape_6:0) = "
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_3/Assert/Assert/data_4"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "y (Shape_7:0) = "
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_3/Assert/Assert"
+  op: "Assert"
+  input: "assert_equal_3/All"
+  input: "assert_equal_3/Assert/Assert/data_0"
+  input: "assert_equal_3/Assert/Assert/data_1"
+  input: "assert_equal_3/Assert/Assert/data_2"
+  input: "Shape_6"
+  input: "assert_equal_3/Assert/Assert/data_4"
+  input: "Shape_7"
+  attr {
+    key: "T"
+    value {
+      list {
+        type: DT_STRING
+        type: DT_STRING
+        type: DT_STRING
+        type: DT_INT32
+        type: DT_STRING
+        type: DT_INT32
+      }
+    }
+  }
+  attr {
+    key: "summarize"
+    value {
+      i: 3
+    }
+  }
+}
+node {
+  name: "Identity"
+  op: "Identity"
+  input: "transcript_batch_input"
+  input: "^assert_equal/Assert/Assert"
+  input: "^assert_equal_1/Assert/Assert"
+  input: "^assert_equal_2/Assert/Assert"
+  input: "^assert_equal_3/Assert/Assert"
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+
+}
+node {
+  name: "Const_4"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 400
+      }
+    }
+  }
+}
+node {
+  name: "decoder_output_length"
+  op: "PlaceholderWithDefault"
+  input: "Const_4"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+      }
+    }
+  }
+}
+node {
+  name: "Shape_8"
+  op: "Shape"
+  input: "Identity"
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_1"
+  op: "StridedSlice"
+  input: "Shape_8"
+  input: "strided_slice_1/stack"
+  input: "strided_slice_1/stack_1"
+  input: "strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "key_value_init/keys"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+          dim {
+            size: 40
+          }
+        }
+        string_val: "msf_001"
+        string_val: "msf_002"
+        string_val: "msf_003"
+        string_val: "msf_004"
+        string_val: "msf_005"
+        string_val: "msf_006"
+        string_val: "msf_007"
+        string_val: "msf_008"
+        string_val: "msf_009"
+        string_val: "msf_010"
+        string_val: "msf_011"
+        string_val: "msf_012"
+        string_val: "msf_013"
+        string_val: "msf_014"
+        string_val: "msf_015"
+        string_val: "msf_016"
+        string_val: "msf_017"
+        string_val: "msf_018"
+        string_val: "msf_019"
+        string_val: "msf_020"
+        string_val: "msm_001"
+        string_val: "msm_002"
+        string_val: "msm_003"
+        string_val: "msm_004"
+        string_val: "msm_005"
+        string_val: "msm_006"
+        string_val: "msm_007"
+        string_val: "msm_008"
+        string_val: "msm_009"
+        string_val: "msm_010"
+        string_val: "msm_011"
+        string_val: "msm_012"
+        string_val: "msm_013"
+        string_val: "msm_014"
+        string_val: "msm_015"
+        string_val: "msm_016"
+        string_val: "msm_017"
+        string_val: "msm_018"
+        string_val: "msm_019"
+        string_val: "msm_020"
+      }
+    }
+  }
+}
+node {
+  name: "key_value_init/values"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 40
+          }
+        }
+        tensor_content: "\001\000\000\000\002\000\000\000\003\000\000\000\004\000\000\000\005\000\000\000\006\000\000\000\007\000\000\000\010\000\000\000\t\000\000\000\n\000\000\000\013\000\000\000\014\000\000\000\r\000\000\000\016\000\000\000\017\000\000\000\020\000\000\000\021\000\000\000\022\000\000\000\023\000\000\000\024\000\000\000\025\000\000\000\026\000\000\000\027\000\000\000\030\000\000\000\031\000\000\000\032\000\000\000\033\000\000\000\034\000\000\000\035\000\000\000\036\000\000\000\037\000\000\000 \000\000\000!\000\000\000\"\000\000\000#\000\000\000$\000\000\000%\000\000\000&\000\000\000\'\000\000\000(\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "speaker_lookup_table"
+  op: "HashTableV2"
+
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "key_dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "use_node_name_sharing"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "value_dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "speaker_lookup_table/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "key_value_init"
+  op: "InitializeTableV2"
+  input: "speaker_lookup_table"
+  input: "key_value_init/keys"
+  input: "key_value_init/values"
+  attr {
+    key: "Tkey"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "Tval"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "speaker_lookup_table_Lookup"
+  op: "LookupTableFindV2"
+  input: "speaker_lookup_table"
+  input: "speaker_batch_input"
+  input: "speaker_lookup_table/Const"
+  attr {
+    key: "Tin"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "Tout"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Const_5"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Fill/dims"
+  op: "Pack"
+  input: "strided_slice_1"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Fill"
+  op: "Fill"
+  input: "Fill/dims"
+  input: "Const_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "padding_map_fn/Shape"
+  op: "Shape"
+  input: "Identity"
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "padding_map_fn/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/strided_slice"
+  op: "StridedSlice"
+  input: "padding_map_fn/Shape"
+  input: "padding_map_fn/strided_slice/stack"
+  input: "padding_map_fn/strided_slice/stack_1"
+  input: "padding_map_fn/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "padding_map_fn/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/strided_slice_1"
+  op: "StridedSlice"
+  input: "Identity"
+  input: "padding_map_fn/strided_slice_1/stack"
+  input: "padding_map_fn/strided_slice_1/stack_1"
+  input: "padding_map_fn/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+
+node {
+  name: "padding_map_fn/TokenizeTranscriptV4/cast"
+  op: "Cast"
+  input: "padding_map_fn/strided_slice_1"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_STRING
+    }
+  }
+}
+
+node {
+  name: "padding_map_fn/TokenizeTranscriptV4/shape"
+  op: "Const"
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape { dim { size: 1 } }
+        int_val: 1
+      }
+    }
+  }
+  attr { key: "dtype" value { type: DT_INT32 } }
+}
+
+node {
+  name: "padding_map_fn/TokenizeTranscriptV4/reshape"
+  op: "Reshape"
+  input: "padding_map_fn/TokenizeTranscriptV4/cast"
+  input: "padding_map_fn/TokenizeTranscriptV4/shape"
+
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+
+node {
+  name: "padding_map_fn/TokenizeTranscriptV4"
+  op: "PlaceholderWithDefault"
+  input: "padding_map_fn/TokenizeTranscriptV4/reshape"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/concat/values_0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 54
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/concat/values_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 55
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/concat"
+  op: "ConcatV2"
+  input: "padding_map_fn/concat/values_0"
+  input: "padding_map_fn/TokenizeTranscriptV4"
+  input: "padding_map_fn/concat/values_2"
+  input: "padding_map_fn/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/Shape_1"
+  op: "Shape"
+  input: "padding_map_fn/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "padding_map_fn/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/strided_slice_2"
+  op: "StridedSlice"
+  input: "padding_map_fn/Shape_1"
+  input: "padding_map_fn/strided_slice_2/stack"
+  input: "padding_map_fn/strided_slice_2/stack_1"
+  input: "padding_map_fn/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "padding_map_fn/LogicalNot/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_BOOL
+        tensor_shape {
+        }
+        bool_val: false
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/LogicalNot"
+  op: "LogicalNot"
+  input: "padding_map_fn/LogicalNot/x"
+
+}
+node {
+  name: "padding_map_fn/Shape_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/Shape_3"
+  op: "Shape"
+  input: "padding_map_fn/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "padding_map_fn/Shape_4"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/TensorArray"
+  op: "TensorArrayV3"
+  input: "padding_map_fn/strided_slice"
+
+  attr {
+    key: "clear_after_read"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "dynamic_size"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  attr {
+    key: "identical_element_shapes"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "tensor_array_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "padding_map_fn/TensorArray_1"
+  op: "TensorArrayV3"
+  input: "padding_map_fn/strided_slice"
+
+  attr {
+    key: "clear_after_read"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "dynamic_size"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  attr {
+    key: "identical_element_shapes"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "tensor_array_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "padding_map_fn/TensorArray_2"
+  op: "TensorArrayV3"
+  input: "padding_map_fn/strided_slice"
+
+  attr {
+    key: "clear_after_read"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "dynamic_size"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  attr {
+    key: "identical_element_shapes"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "tensor_array_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "padding_map_fn/TensorArrayWrite/TensorArrayWriteV3/index"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/LogicalNot"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/TensorArrayWrite/TensorArrayWriteV3"
+  op: "TensorArrayWriteV3"
+  input: "padding_map_fn/TensorArray"
+  input: "padding_map_fn/TensorArrayWrite/TensorArrayWriteV3/index"
+  input: "padding_map_fn/LogicalNot"
+  input: "padding_map_fn/TensorArray:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/LogicalNot"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/TensorArrayWrite_1/TensorArrayWriteV3/index"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/concat"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/TensorArrayWrite_1/TensorArrayWriteV3"
+  op: "TensorArrayWriteV3"
+  input: "padding_map_fn/TensorArray_1"
+  input: "padding_map_fn/TensorArrayWrite_1/TensorArrayWriteV3/index"
+  input: "padding_map_fn/concat"
+  input: "padding_map_fn/TensorArray_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/concat"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/TensorArrayWrite_2/TensorArrayWriteV3/index"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/strided_slice_2"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/TensorArrayWrite_2/TensorArrayWriteV3"
+  op: "TensorArrayWriteV3"
+  input: "padding_map_fn/TensorArray_2"
+  input: "padding_map_fn/TensorArrayWrite_2/TensorArrayWriteV3/index"
+  input: "padding_map_fn/strided_slice_2"
+  input: "padding_map_fn/TensorArray_2:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/strided_slice_2"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/Enter"
+  op: "Enter"
+  input: "padding_map_fn/while/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/Enter_1"
+  op: "Enter"
+  input: "padding_map_fn/TensorArrayWrite/TensorArrayWriteV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/Enter_2"
+  op: "Enter"
+  input: "padding_map_fn/TensorArrayWrite_1/TensorArrayWriteV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/Enter_3"
+  op: "Enter"
+  input: "padding_map_fn/TensorArrayWrite_2/TensorArrayWriteV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/Enter_4"
+  op: "Enter"
+  input: "padding_map_fn/Shape_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/Enter_5"
+  op: "Enter"
+  input: "padding_map_fn/Shape_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/Enter_6"
+  op: "Enter"
+  input: "padding_map_fn/Shape_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/Merge"
+  op: "Merge"
+  input: "padding_map_fn/while/Enter"
+  input: "padding_map_fn/while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Merge_1"
+  op: "Merge"
+  input: "padding_map_fn/while/Enter_1"
+  input: "padding_map_fn/while/NextIteration_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Merge_2"
+  op: "Merge"
+  input: "padding_map_fn/while/Enter_2"
+  input: "padding_map_fn/while/NextIteration_2"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Merge_3"
+  op: "Merge"
+  input: "padding_map_fn/while/Enter_3"
+  input: "padding_map_fn/while/NextIteration_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Merge_4"
+  op: "Merge"
+  input: "padding_map_fn/while/Enter_4"
+  input: "padding_map_fn/while/NextIteration_4"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Merge_5"
+  op: "Merge"
+  input: "padding_map_fn/while/Enter_5"
+  input: "padding_map_fn/while/NextIteration_5"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Merge_6"
+  op: "Merge"
+  input: "padding_map_fn/while/Enter_6"
+  input: "padding_map_fn/while/NextIteration_6"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Less"
+  op: "Less"
+  input: "padding_map_fn/while/Merge"
+  input: "padding_map_fn/while/Less/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Less/Enter"
+  op: "Enter"
+  input: "padding_map_fn/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/LoopCond"
+  op: "LoopCond"
+  input: "padding_map_fn/while/Less"
+
+}
+node {
+  name: "padding_map_fn/while/Switch"
+  op: "Switch"
+  input: "padding_map_fn/while/Merge"
+  input: "padding_map_fn/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/while/Merge"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Switch_1"
+  op: "Switch"
+  input: "padding_map_fn/while/Merge_1"
+  input: "padding_map_fn/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/while/Merge_1"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Switch_2"
+  op: "Switch"
+  input: "padding_map_fn/while/Merge_2"
+  input: "padding_map_fn/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/while/Merge_2"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Switch_3"
+  op: "Switch"
+  input: "padding_map_fn/while/Merge_3"
+  input: "padding_map_fn/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/while/Merge_3"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Switch_4"
+  op: "Switch"
+  input: "padding_map_fn/while/Merge_4"
+  input: "padding_map_fn/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/while/Merge_4"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Switch_5"
+  op: "Switch"
+  input: "padding_map_fn/while/Merge_5"
+  input: "padding_map_fn/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/while/Merge_5"
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/Switch_6"
+  op: "Switch"
+  input: "padding_map_fn/while/Merge_6"
+  input: "padding_map_fn/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/while/Merge_6"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Identity"
+  op: "Identity"
+  input: "padding_map_fn/while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Identity_1"
+  op: "Identity"
+  input: "padding_map_fn/while/Switch_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Identity_2"
+  op: "Identity"
+  input: "padding_map_fn/while/Switch_2:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Identity_3"
+  op: "Identity"
+  input: "padding_map_fn/while/Switch_3:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Identity_4"
+  op: "Identity"
+  input: "padding_map_fn/while/Switch_4:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Identity_5"
+  op: "Identity"
+  input: "padding_map_fn/while/Switch_5:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Identity_6"
+  op: "Identity"
+  input: "padding_map_fn/while/Switch_6:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/add/y"
+  op: "Const"
+  input: "^padding_map_fn/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/add"
+  op: "Add"
+  input: "padding_map_fn/while/Identity"
+  input: "padding_map_fn/while/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/strided_slice/stack/1"
+  op: "Const"
+  input: "^padding_map_fn/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/strided_slice/stack"
+  op: "Pack"
+  input: "padding_map_fn/while/Identity"
+  input: "padding_map_fn/while/strided_slice/stack/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/strided_slice/stack_1/1"
+  op: "Const"
+  input: "^padding_map_fn/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/strided_slice/stack_1"
+  op: "Pack"
+  input: "padding_map_fn/while/add"
+  input: "padding_map_fn/while/strided_slice/stack_1/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/strided_slice/stack_2"
+  op: "Const"
+  input: "^padding_map_fn/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/strided_slice"
+  op: "StridedSlice"
+  input: "padding_map_fn/while/strided_slice/Enter"
+  input: "padding_map_fn/while/strided_slice/stack"
+  input: "padding_map_fn/while/strided_slice/stack_1"
+  input: "padding_map_fn/while/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/strided_slice/Enter"
+  op: "Enter"
+  input: "Identity"
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+
+node {
+  name: "padding_map_fn/while/TokenizeTranscriptV4/cast"
+  op: "Cast"
+  input: "padding_map_fn/while/strided_slice"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_STRING
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/TokenizeTranscriptV4/shape"
+  input: "^padding_map_fn/while/TokenizeTranscriptV4/cast"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/TokenizeTranscriptV4/reshape"
+  op: "Reshape"
+  input: "padding_map_fn/while/TokenizeTranscriptV4/cast"
+  input: "padding_map_fn/while/TokenizeTranscriptV4/shape"
+
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/TokenizeTranscriptV4"
+  op: "PlaceholderWithDefault"
+  input: "padding_map_fn/while/TokenizeTranscriptV4/reshape"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/concat/values_0"
+  op: "Const"
+  input: "^padding_map_fn/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 54
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/concat/values_2"
+  op: "Const"
+  input: "^padding_map_fn/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 55
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/concat/axis"
+  op: "Const"
+  input: "^padding_map_fn/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/concat"
+  op: "ConcatV2"
+  input: "padding_map_fn/while/concat/values_0"
+  input: "padding_map_fn/while/TokenizeTranscriptV4"
+  input: "padding_map_fn/while/concat/values_2"
+  input: "padding_map_fn/while/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Shape"
+  op: "Shape"
+  input: "padding_map_fn/while/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/strided_slice_1/stack"
+  op: "Const"
+  input: "^padding_map_fn/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/strided_slice_1/stack_1"
+  op: "Const"
+  input: "^padding_map_fn/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/strided_slice_1/stack_2"
+  op: "Const"
+  input: "^padding_map_fn/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/strided_slice_1"
+  op: "StridedSlice"
+  input: "padding_map_fn/while/Shape"
+  input: "padding_map_fn/while/strided_slice_1/stack"
+  input: "padding_map_fn/while/strided_slice_1/stack_1"
+  input: "padding_map_fn/while/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/LogicalNot/x"
+  op: "Const"
+  input: "^padding_map_fn/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_BOOL
+        tensor_shape {
+        }
+        bool_val: false
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/LogicalNot"
+  op: "LogicalNot"
+  input: "padding_map_fn/while/LogicalNot/x"
+
+}
+node {
+  name: "padding_map_fn/while/TensorArrayWrite/TensorArrayWriteV3"
+  op: "TensorArrayWriteV3"
+  input: "padding_map_fn/while/TensorArrayWrite/TensorArrayWriteV3/Enter"
+  input: "padding_map_fn/while/Identity"
+  input: "padding_map_fn/while/LogicalNot"
+  input: "padding_map_fn/while/Identity_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/LogicalNot"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/TensorArrayWrite/TensorArrayWriteV3/Enter"
+  op: "Enter"
+  input: "padding_map_fn/TensorArray"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/LogicalNot"
+      }
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/TensorArrayWrite_1/TensorArrayWriteV3"
+  op: "TensorArrayWriteV3"
+  input: "padding_map_fn/while/TensorArrayWrite_1/TensorArrayWriteV3/Enter"
+  input: "padding_map_fn/while/Identity"
+  input: "padding_map_fn/while/concat"
+  input: "padding_map_fn/while/Identity_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/concat"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/TensorArrayWrite_1/TensorArrayWriteV3/Enter"
+  op: "Enter"
+  input: "padding_map_fn/TensorArray_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/concat"
+      }
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/TensorArrayWrite_2/TensorArrayWriteV3"
+  op: "TensorArrayWriteV3"
+  input: "padding_map_fn/while/TensorArrayWrite_2/TensorArrayWriteV3/Enter"
+  input: "padding_map_fn/while/Identity"
+  input: "padding_map_fn/while/strided_slice_1"
+  input: "padding_map_fn/while/Identity_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/strided_slice_2"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/TensorArrayWrite_2/TensorArrayWriteV3/Enter"
+  op: "Enter"
+  input: "padding_map_fn/TensorArray_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/strided_slice_2"
+      }
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/Shape_1"
+  op: "Const"
+  input: "^padding_map_fn/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/Maximum"
+  op: "Maximum"
+  input: "padding_map_fn/while/Identity_4"
+  input: "padding_map_fn/while/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Shape_2"
+  op: "Shape"
+  input: "padding_map_fn/while/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/Maximum_1"
+  op: "Maximum"
+  input: "padding_map_fn/while/Identity_5"
+  input: "padding_map_fn/while/Shape_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Shape_3"
+  op: "Const"
+  input: "^padding_map_fn/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/Maximum_2"
+  op: "Maximum"
+  input: "padding_map_fn/while/Identity_6"
+  input: "padding_map_fn/while/Shape_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/add_1/y"
+  op: "Const"
+  input: "^padding_map_fn/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while/add_1"
+  op: "Add"
+  input: "padding_map_fn/while/Identity"
+  input: "padding_map_fn/while/add_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/NextIteration"
+  op: "NextIteration"
+  input: "padding_map_fn/while/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/NextIteration_1"
+  op: "NextIteration"
+  input: "padding_map_fn/while/TensorArrayWrite/TensorArrayWriteV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/NextIteration_2"
+  op: "NextIteration"
+  input: "padding_map_fn/while/TensorArrayWrite_1/TensorArrayWriteV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/NextIteration_3"
+  op: "NextIteration"
+  input: "padding_map_fn/while/TensorArrayWrite_2/TensorArrayWriteV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/NextIteration_4"
+  op: "NextIteration"
+  input: "padding_map_fn/while/Maximum"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/NextIteration_5"
+  op: "NextIteration"
+  input: "padding_map_fn/while/Maximum_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/NextIteration_6"
+  op: "NextIteration"
+  input: "padding_map_fn/while/Maximum_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Exit"
+  op: "Exit"
+  input: "padding_map_fn/while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Exit_1"
+  op: "Exit"
+  input: "padding_map_fn/while/Switch_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Exit_2"
+  op: "Exit"
+  input: "padding_map_fn/while/Switch_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Exit_3"
+  op: "Exit"
+  input: "padding_map_fn/while/Switch_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Exit_4"
+  op: "Exit"
+  input: "padding_map_fn/while/Switch_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Exit_5"
+  op: "Exit"
+  input: "padding_map_fn/while/Switch_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while/Exit_6"
+  op: "Exit"
+  input: "padding_map_fn/while/Switch_6"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/TensorArray_3"
+  op: "TensorArrayV3"
+  input: "padding_map_fn/strided_slice"
+
+  attr {
+    key: "clear_after_read"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "dynamic_size"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  attr {
+    key: "identical_element_shapes"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "tensor_array_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "padding_map_fn/TensorArray_4"
+  op: "TensorArrayV3"
+  input: "padding_map_fn/strided_slice"
+
+  attr {
+    key: "clear_after_read"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "dynamic_size"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  attr {
+    key: "identical_element_shapes"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "tensor_array_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "padding_map_fn/TensorArray_5"
+  op: "TensorArrayV3"
+  input: "padding_map_fn/strided_slice"
+
+  attr {
+    key: "clear_after_read"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "dynamic_size"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  attr {
+    key: "identical_element_shapes"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "tensor_array_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/Enter"
+  op: "Enter"
+  input: "padding_map_fn/while_1/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while_1/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/Enter_1"
+  op: "Enter"
+  input: "padding_map_fn/TensorArray_3:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while_1/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/Enter_2"
+  op: "Enter"
+  input: "padding_map_fn/TensorArray_4:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while_1/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/Enter_3"
+  op: "Enter"
+  input: "padding_map_fn/TensorArray_5:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while_1/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/Merge"
+  op: "Merge"
+  input: "padding_map_fn/while_1/Enter"
+  input: "padding_map_fn/while_1/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/Merge_1"
+  op: "Merge"
+  input: "padding_map_fn/while_1/Enter_1"
+  input: "padding_map_fn/while_1/NextIteration_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/Merge_2"
+  op: "Merge"
+  input: "padding_map_fn/while_1/Enter_2"
+  input: "padding_map_fn/while_1/NextIteration_2"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/Merge_3"
+  op: "Merge"
+  input: "padding_map_fn/while_1/Enter_3"
+  input: "padding_map_fn/while_1/NextIteration_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/Less"
+  op: "Less"
+  input: "padding_map_fn/while_1/Merge"
+  input: "padding_map_fn/while_1/Less/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/Less/Enter"
+  op: "Enter"
+  input: "padding_map_fn/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while_1/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/LoopCond"
+  op: "LoopCond"
+  input: "padding_map_fn/while_1/Less"
+
+}
+node {
+  name: "padding_map_fn/while_1/Switch"
+  op: "Switch"
+  input: "padding_map_fn/while_1/Merge"
+  input: "padding_map_fn/while_1/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/while_1/Merge"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/Switch_1"
+  op: "Switch"
+  input: "padding_map_fn/while_1/Merge_1"
+  input: "padding_map_fn/while_1/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/while_1/Merge_1"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/Switch_2"
+  op: "Switch"
+  input: "padding_map_fn/while_1/Merge_2"
+  input: "padding_map_fn/while_1/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/while_1/Merge_2"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/Switch_3"
+  op: "Switch"
+  input: "padding_map_fn/while_1/Merge_3"
+  input: "padding_map_fn/while_1/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/while_1/Merge_3"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/Identity"
+  op: "Identity"
+  input: "padding_map_fn/while_1/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/Identity_1"
+  op: "Identity"
+  input: "padding_map_fn/while_1/Switch_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/Identity_2"
+  op: "Identity"
+  input: "padding_map_fn/while_1/Switch_2:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/Identity_3"
+  op: "Identity"
+  input: "padding_map_fn/while_1/Switch_3:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/TensorArrayReadV3"
+  op: "TensorArrayReadV3"
+  input: "padding_map_fn/while_1/TensorArrayReadV3/Enter"
+  input: "padding_map_fn/while_1/Identity"
+  input: "padding_map_fn/while_1/TensorArrayReadV3/Enter_1"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_BOOL
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/TensorArrayReadV3/Enter"
+  op: "Enter"
+  input: "padding_map_fn/TensorArray"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while_1/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/TensorArrayReadV3/Enter_1"
+  op: "Enter"
+  input: "padding_map_fn/while/Exit_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while_1/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/Shape"
+  op: "Shape"
+  input: "padding_map_fn/while_1/TensorArrayReadV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/sub"
+  op: "Sub"
+  input: "padding_map_fn/while_1/sub/Enter"
+  input: "padding_map_fn/while_1/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/sub/Enter"
+  op: "Enter"
+  input: "padding_map_fn/while/Exit_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while_1/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/ExpandDims/dim"
+  op: "Const"
+  input: "^padding_map_fn/while_1/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/ExpandDims"
+  op: "ExpandDims"
+  input: "padding_map_fn/while_1/sub"
+  input: "padding_map_fn/while_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/Pad/paddings"
+  op: "Const"
+  input: "^padding_map_fn/while_1/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/Pad"
+  op: "Pad"
+  input: "padding_map_fn/while_1/ExpandDims"
+  input: "padding_map_fn/while_1/Pad/paddings"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/Pad_1"
+  op: "Pad"
+  input: "padding_map_fn/while_1/TensorArrayReadV3"
+  input: "padding_map_fn/while_1/Pad"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/TensorArrayWrite/TensorArrayWriteV3"
+  op: "TensorArrayWriteV3"
+  input: "padding_map_fn/while_1/TensorArrayWrite/TensorArrayWriteV3/Enter"
+  input: "padding_map_fn/while_1/Identity"
+  input: "padding_map_fn/while_1/Pad_1"
+  input: "padding_map_fn/while_1/Identity_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/while_1/Pad_1"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/TensorArrayWrite/TensorArrayWriteV3/Enter"
+  op: "Enter"
+  input: "padding_map_fn/TensorArray_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/while_1/Pad_1"
+      }
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while_1/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/TensorArrayReadV3_1"
+  op: "TensorArrayReadV3"
+  input: "padding_map_fn/while_1/TensorArrayReadV3_1/Enter"
+  input: "padding_map_fn/while_1/Identity"
+  input: "padding_map_fn/while_1/TensorArrayReadV3_1/Enter_1"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/TensorArrayReadV3_1/Enter"
+  op: "Enter"
+  input: "padding_map_fn/TensorArray_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while_1/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/TensorArrayReadV3_1/Enter_1"
+  op: "Enter"
+  input: "padding_map_fn/while/Exit_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while_1/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/Shape_1"
+  op: "Shape"
+  input: "padding_map_fn/while_1/TensorArrayReadV3_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/sub_1"
+  op: "Sub"
+  input: "padding_map_fn/while_1/sub_1/Enter"
+  input: "padding_map_fn/while_1/Shape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/sub_1/Enter"
+  op: "Enter"
+  input: "padding_map_fn/while/Exit_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while_1/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/ExpandDims_1/dim"
+  op: "Const"
+  input: "^padding_map_fn/while_1/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "padding_map_fn/while_1/sub_1"
+  input: "padding_map_fn/while_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/Pad_2/paddings"
+  op: "Const"
+  input: "^padding_map_fn/while_1/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/Pad_2"
+  op: "Pad"
+  input: "padding_map_fn/while_1/ExpandDims_1"
+  input: "padding_map_fn/while_1/Pad_2/paddings"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/Pad_3"
+  op: "Pad"
+  input: "padding_map_fn/while_1/TensorArrayReadV3_1"
+  input: "padding_map_fn/while_1/Pad_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/TensorArrayWrite_1/TensorArrayWriteV3"
+  op: "TensorArrayWriteV3"
+  input: "padding_map_fn/while_1/TensorArrayWrite_1/TensorArrayWriteV3/Enter"
+  input: "padding_map_fn/while_1/Identity"
+  input: "padding_map_fn/while_1/Pad_3"
+  input: "padding_map_fn/while_1/Identity_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/while_1/Pad_3"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/TensorArrayWrite_1/TensorArrayWriteV3/Enter"
+  op: "Enter"
+  input: "padding_map_fn/TensorArray_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/while_1/Pad_3"
+      }
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while_1/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/TensorArrayReadV3_2"
+  op: "TensorArrayReadV3"
+  input: "padding_map_fn/while_1/TensorArrayReadV3_2/Enter"
+  input: "padding_map_fn/while_1/Identity"
+  input: "padding_map_fn/while_1/TensorArrayReadV3_2/Enter_1"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/TensorArrayReadV3_2/Enter"
+  op: "Enter"
+  input: "padding_map_fn/TensorArray_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while_1/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/TensorArrayReadV3_2/Enter_1"
+  op: "Enter"
+  input: "padding_map_fn/while/Exit_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while_1/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/Shape_2"
+  op: "Shape"
+  input: "padding_map_fn/while_1/TensorArrayReadV3_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/sub_2"
+  op: "Sub"
+  input: "padding_map_fn/while_1/sub_2/Enter"
+  input: "padding_map_fn/while_1/Shape_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/sub_2/Enter"
+  op: "Enter"
+  input: "padding_map_fn/while/Exit_6"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while_1/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/ExpandDims_2/dim"
+  op: "Const"
+  input: "^padding_map_fn/while_1/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/ExpandDims_2"
+  op: "ExpandDims"
+  input: "padding_map_fn/while_1/sub_2"
+  input: "padding_map_fn/while_1/ExpandDims_2/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/Pad_4/paddings"
+  op: "Const"
+  input: "^padding_map_fn/while_1/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/Pad_4"
+  op: "Pad"
+  input: "padding_map_fn/while_1/ExpandDims_2"
+  input: "padding_map_fn/while_1/Pad_4/paddings"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/Pad_5"
+  op: "Pad"
+  input: "padding_map_fn/while_1/TensorArrayReadV3_2"
+  input: "padding_map_fn/while_1/Pad_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/TensorArrayWrite_2/TensorArrayWriteV3"
+  op: "TensorArrayWriteV3"
+  input: "padding_map_fn/while_1/TensorArrayWrite_2/TensorArrayWriteV3/Enter"
+  input: "padding_map_fn/while_1/Identity"
+  input: "padding_map_fn/while_1/Pad_5"
+  input: "padding_map_fn/while_1/Identity_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/while_1/Pad_5"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/TensorArrayWrite_2/TensorArrayWriteV3/Enter"
+  op: "Enter"
+  input: "padding_map_fn/TensorArray_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/while_1/Pad_5"
+      }
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "padding_map_fn/while_1/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/add/y"
+  op: "Const"
+  input: "^padding_map_fn/while_1/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/while_1/add"
+  op: "Add"
+  input: "padding_map_fn/while_1/Identity"
+  input: "padding_map_fn/while_1/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/NextIteration"
+  op: "NextIteration"
+  input: "padding_map_fn/while_1/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/NextIteration_1"
+  op: "NextIteration"
+  input: "padding_map_fn/while_1/TensorArrayWrite/TensorArrayWriteV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/NextIteration_2"
+  op: "NextIteration"
+  input: "padding_map_fn/while_1/TensorArrayWrite_1/TensorArrayWriteV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/NextIteration_3"
+  op: "NextIteration"
+  input: "padding_map_fn/while_1/TensorArrayWrite_2/TensorArrayWriteV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/Exit"
+  op: "Exit"
+  input: "padding_map_fn/while_1/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/Exit_1"
+  op: "Exit"
+  input: "padding_map_fn/while_1/Switch_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/Exit_2"
+  op: "Exit"
+  input: "padding_map_fn/while_1/Switch_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/while_1/Exit_3"
+  op: "Exit"
+  input: "padding_map_fn/while_1/Switch_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/TensorArrayStack/TensorArraySizeV3"
+  op: "TensorArraySizeV3"
+  input: "padding_map_fn/TensorArray_3"
+  input: "padding_map_fn/while_1/Exit_1"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/TensorArray_3"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/TensorArrayStack/range/start"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/TensorArray_3"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/TensorArrayStack/range/delta"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/TensorArray_3"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/TensorArrayStack/range"
+  op: "Range"
+  input: "padding_map_fn/TensorArrayStack/range/start"
+  input: "padding_map_fn/TensorArrayStack/TensorArraySizeV3"
+  input: "padding_map_fn/TensorArrayStack/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/TensorArray_3"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/TensorArrayStack/TensorArrayGatherV3"
+  op: "TensorArrayGatherV3"
+  input: "padding_map_fn/TensorArray_3"
+  input: "padding_map_fn/TensorArrayStack/range"
+  input: "padding_map_fn/while_1/Exit_1"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/TensorArray_3"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/TensorArrayStack_1/TensorArraySizeV3"
+  op: "TensorArraySizeV3"
+  input: "padding_map_fn/TensorArray_4"
+  input: "padding_map_fn/while_1/Exit_2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/TensorArray_4"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/TensorArrayStack_1/range/start"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/TensorArray_4"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/TensorArrayStack_1/range/delta"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/TensorArray_4"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/TensorArrayStack_1/range"
+  op: "Range"
+  input: "padding_map_fn/TensorArrayStack_1/range/start"
+  input: "padding_map_fn/TensorArrayStack_1/TensorArraySizeV3"
+  input: "padding_map_fn/TensorArrayStack_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/TensorArray_4"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/TensorArrayStack_1/TensorArrayGatherV3"
+  op: "TensorArrayGatherV3"
+  input: "padding_map_fn/TensorArray_4"
+  input: "padding_map_fn/TensorArrayStack_1/range"
+  input: "padding_map_fn/while_1/Exit_2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/TensorArray_4"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/TensorArrayStack_2/TensorArraySizeV3"
+  op: "TensorArraySizeV3"
+  input: "padding_map_fn/TensorArray_5"
+  input: "padding_map_fn/while_1/Exit_3"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/TensorArray_5"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/TensorArrayStack_2/range/start"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/TensorArray_5"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/TensorArrayStack_2/range/delta"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/TensorArray_5"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "padding_map_fn/TensorArrayStack_2/range"
+  op: "Range"
+  input: "padding_map_fn/TensorArrayStack_2/range/start"
+  input: "padding_map_fn/TensorArrayStack_2/TensorArraySizeV3"
+  input: "padding_map_fn/TensorArrayStack_2/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/TensorArray_5"
+      }
+    }
+  }
+
+}
+node {
+  name: "padding_map_fn/TensorArrayStack_2/TensorArrayGatherV3"
+  op: "TensorArrayGatherV3"
+  input: "padding_map_fn/TensorArray_5"
+  input: "padding_map_fn/TensorArrayStack_2/range"
+  input: "padding_map_fn/while_1/Exit_3"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@padding_map_fn/TensorArray_5"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+}
+node {
+  name: "Shape_9"
+  op: "Shape"
+  input: "padding_map_fn/TensorArrayStack_1/TensorArrayGatherV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_2"
+  op: "StridedSlice"
+  input: "Shape_9"
+  input: "strided_slice_2/stack"
+  input: "strided_slice_2/stack_1"
+  input: "strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask/range"
+  op: "Range"
+  input: "sequence_length_mask/range/start"
+  input: "strided_slice_2"
+  input: "sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "sequence_length_mask/range"
+  input: "sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "sequence_length_mask/Shape"
+  op: "Shape"
+  input: "padding_map_fn/TensorArrayStack_2/TensorArrayGatherV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "sequence_length_mask/Shape"
+  input: "sequence_length_mask/strided_slice/stack"
+  input: "sequence_length_mask/strided_slice/stack_1"
+  input: "sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "sequence_length_mask/strided_slice"
+  input: "sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "sequence_length_mask/Tile"
+  op: "Tile"
+  input: "sequence_length_mask/ExpandDims"
+  input: "sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "padding_map_fn/TensorArrayStack_2/TensorArrayGatherV3"
+  input: "sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "sequence_length_mask/Less"
+  op: "Less"
+  input: "sequence_length_mask/Tile"
+  input: "sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "sequence_length_mask/Cast"
+  op: "Cast"
+  input: "sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "sub/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "sub"
+  op: "Sub"
+  input: "sub/x"
+  input: "sequence_length_mask/Cast"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "mul/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 55
+      }
+    }
+  }
+}
+node {
+  name: "mul"
+  op: "Mul"
+  input: "sub"
+  input: "mul/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "add"
+  op: "Add"
+  input: "padding_map_fn/TensorArrayStack_1/TensorArrayGatherV3"
+  input: "mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Const_6"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "All"
+  op: "All"
+  input: "padding_map_fn/TensorArrayStack/TensorArrayGatherV3"
+  input: "Const_6"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Assert/Assert"
+  op: "Assert"
+  input: "All"
+  input: "Identity"
+  input: "add"
+  input: "padding_map_fn/TensorArrayStack_2/TensorArrayGatherV3"
+  attr {
+    key: "T"
+    value {
+      list {
+        type: DT_STRING
+        type: DT_INT32
+        type: DT_INT32
+      }
+    }
+  }
+  attr {
+    key: "summarize"
+    value {
+      i: 3
+    }
+  }
+}
+node {
+  name: "control_dependency"
+  op: "Identity"
+  input: "add"
+  input: "^Assert/Assert"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@add"
+      }
+    }
+  }
+
+}
+node {
+  name: "Const_7"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 24000.0
+      }
+    }
+  }
+}
+node {
+  name: "Fill_1/dims"
+  op: "Pack"
+  input: "strided_slice_1"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Fill_1"
+  op: "Fill"
+  input: "Fill_1/dims"
+  input: "Const_7"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Const_8"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Abs"
+  op: "Abs"
+  input: "waveform_batch_input"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "Const_9"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Max"
+  op: "Max"
+  input: "Abs"
+  input: "Const_9"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "div"
+  op: "RealDiv"
+  input: "waveform_batch_input"
+  input: "Max"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "pre_emphasis/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "pre_emphasis/ExpandDims"
+  op: "ExpandDims"
+  input: "div"
+  input: "pre_emphasis/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "pre_emphasis/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "pre_emphasis/ExpandDims_1"
+  op: "ExpandDims"
+  input: "pre_emphasis/ExpandDims"
+  input: "pre_emphasis/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "pre_emphasis/Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+        tensor_content: "\354Qx\277\000\000\200?"
+      }
+    }
+  }
+}
+node {
+  name: "pre_emphasis/Conv2D"
+  op: "Conv2D"
+  input: "pre_emphasis/ExpandDims_1"
+  input: "pre_emphasis/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "pre_emphasis/Squeeze"
+  op: "Squeeze"
+  input: "pre_emphasis/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+        i: 3
+      }
+    }
+  }
+}
+node {
+  name: "frame/frame_length"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1200
+      }
+    }
+  }
+}
+node {
+  name: "frame/frame_step"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 300
+      }
+    }
+  }
+}
+node {
+  name: "frame/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "frame/Rank"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "frame/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "frame/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "frame/range"
+  op: "Range"
+  input: "frame/range/start"
+  input: "frame/Rank"
+  input: "frame/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/add/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "frame/add"
+  op: "Add"
+  input: "frame/axis"
+  input: "frame/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/strided_slice/stack"
+  op: "Pack"
+  input: "frame/axis"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "frame/strided_slice/stack_1"
+  op: "Pack"
+  input: "frame/add"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "frame/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "frame/strided_slice"
+  op: "StridedSlice"
+  input: "frame/range"
+  input: "frame/strided_slice/stack"
+  input: "frame/strided_slice/stack_1"
+  input: "frame/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "frame/Shape"
+  op: "Shape"
+  input: "pre_emphasis/Squeeze"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "frame/sub/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "frame/sub"
+  op: "Sub"
+  input: "frame/Rank"
+  input: "frame/sub/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/sub_1"
+  op: "Sub"
+  input: "frame/sub"
+  input: "frame/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/packed/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "frame/packed"
+  op: "Pack"
+  input: "frame/strided_slice"
+  input: "frame/packed/1"
+  input: "frame/sub_1"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "frame/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "frame/split"
+  op: "SplitV"
+  input: "frame/Shape"
+  input: "frame/packed"
+  input: "frame/split/split_dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tlen"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "num_split"
+    value {
+      i: 3
+    }
+  }
+}
+node {
+  name: "frame/Reshape/shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "frame/Reshape"
+  op: "Reshape"
+  input: "frame/split:1"
+  input: "frame/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/Size"
+  op: "Size"
+  input: "frame/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "frame/Size_1"
+  op: "Size"
+  input: "frame/split:2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "frame/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "frame/Neg"
+  op: "Neg"
+  input: "frame/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/floordiv"
+  op: "FloorDiv"
+  input: "frame/Neg"
+  input: "frame/frame_step"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/Neg_1"
+  op: "Neg"
+  input: "frame/floordiv"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/sub_2/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "frame/sub_2"
+  op: "Sub"
+  input: "frame/Neg_1"
+  input: "frame/sub_2/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/mul"
+  op: "Mul"
+  input: "frame/frame_step"
+  input: "frame/sub_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/add_1"
+  op: "Add"
+  input: "frame/frame_length"
+  input: "frame/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/sub_3"
+  op: "Sub"
+  input: "frame/add_1"
+  input: "frame/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/Maximum/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "frame/Maximum"
+  op: "Maximum"
+  input: "frame/Maximum/x"
+  input: "frame/sub_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/zeros/shape/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "frame/zeros/shape"
+  op: "Pack"
+  input: "frame/Size"
+  input: "frame/zeros/shape/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "frame/zeros/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "frame/zeros"
+  op: "Fill"
+  input: "frame/zeros/shape"
+  input: "frame/zeros/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "frame/zeros_1/shape/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "frame/zeros_1/shape"
+  op: "Pack"
+  input: "frame/Size_1"
+  input: "frame/zeros_1/shape/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "frame/zeros_1/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "frame/zeros_1"
+  op: "Fill"
+  input: "frame/zeros_1/shape"
+  input: "frame/zeros_1/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "frame/concat/values_1/0/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "frame/concat/values_1/0"
+  op: "Pack"
+  input: "frame/concat/values_1/0/0"
+  input: "frame/Maximum"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "frame/concat/values_1"
+  op: "Pack"
+  input: "frame/concat/values_1/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "frame/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "frame/concat"
+  op: "ConcatV2"
+  input: "frame/zeros"
+  input: "frame/concat/values_1"
+  input: "frame/zeros_1"
+  input: "frame/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/PadV2"
+  op: "PadV2"
+  input: "pre_emphasis/Squeeze"
+  input: "frame/concat"
+  input: "frame/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/Shape_1"
+  op: "Shape"
+  input: "frame/PadV2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "frame/add_2/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "frame/add_2"
+  op: "Add"
+  input: "frame/strided_slice"
+  input: "frame/add_2/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/strided_slice_1/stack"
+  op: "Pack"
+  input: "frame/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "frame/strided_slice_1/stack_1"
+  op: "Pack"
+  input: "frame/add_2"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "frame/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "frame/strided_slice_1"
+  op: "StridedSlice"
+  input: "frame/Shape_1"
+  input: "frame/strided_slice_1/stack"
+  input: "frame/strided_slice_1/stack_1"
+  input: "frame/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "frame/gcd/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 300
+      }
+    }
+  }
+}
+node {
+  name: "frame/floordiv_1"
+  op: "FloorDiv"
+  input: "frame/frame_length"
+  input: "frame/gcd/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/floordiv_2"
+  op: "FloorDiv"
+  input: "frame/frame_step"
+  input: "frame/gcd/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/floordiv_3"
+  op: "FloorDiv"
+  input: "frame/strided_slice_1"
+  input: "frame/gcd/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/mul_1"
+  op: "Mul"
+  input: "frame/floordiv_3"
+  input: "frame/gcd/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/concat_1/values_1"
+  op: "Pack"
+  input: "frame/mul_1"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "frame/concat_1/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "frame/concat_1"
+  op: "ConcatV2"
+  input: "frame/split"
+  input: "frame/concat_1/values_1"
+  input: "frame/split:2"
+  input: "frame/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/concat_2/values_1"
+  op: "Pack"
+  input: "frame/floordiv_3"
+  input: "frame/gcd/Const"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "frame/concat_2/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "frame/concat_2"
+  op: "ConcatV2"
+  input: "frame/split"
+  input: "frame/concat_2/values_1"
+  input: "frame/split:2"
+  input: "frame/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/zeros_like"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "frame/ones_like/Shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "frame/ones_like/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "frame/ones_like"
+  op: "Fill"
+  input: "frame/ones_like/Shape"
+  input: "frame/ones_like/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "frame/StridedSlice"
+  op: "StridedSlice"
+  input: "frame/PadV2"
+  input: "frame/zeros_like"
+  input: "frame/concat_1"
+  input: "frame/ones_like"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "frame/Reshape_1"
+  op: "Reshape"
+  input: "frame/StridedSlice"
+  input: "frame/concat_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/range_1/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "frame/range_1/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "frame/range_1"
+  op: "Range"
+  input: "frame/range_1/start"
+  input: "frame/Neg_1"
+  input: "frame/range_1/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/mul_2"
+  op: "Mul"
+  input: "frame/range_1"
+  input: "frame/floordiv_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/Reshape_2/shape/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "frame/Reshape_2/shape"
+  op: "Pack"
+  input: "frame/Neg_1"
+  input: "frame/Reshape_2/shape/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "frame/Reshape_2"
+  op: "Reshape"
+  input: "frame/mul_2"
+  input: "frame/Reshape_2/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/range_2/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "frame/range_2/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "frame/range_2"
+  op: "Range"
+  input: "frame/range_2/start"
+  input: "frame/floordiv_1"
+  input: "frame/range_2/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/Reshape_3/shape/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "frame/Reshape_3/shape"
+  op: "Pack"
+  input: "frame/Reshape_3/shape/0"
+  input: "frame/floordiv_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "frame/Reshape_3"
+  op: "Reshape"
+  input: "frame/range_2"
+  input: "frame/Reshape_3/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/add_3"
+  op: "Add"
+  input: "frame/Reshape_2"
+  input: "frame/Reshape_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/GatherV2"
+  op: "GatherV2"
+  input: "frame/Reshape_1"
+  input: "frame/add_3"
+  input: "frame/strided_slice"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "frame/concat_3/values_1"
+  op: "Pack"
+  input: "frame/Neg_1"
+  input: "frame/frame_length"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "frame/concat_3/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "frame/concat_3"
+  op: "ConcatV2"
+  input: "frame/split"
+  input: "frame/concat_3/values_1"
+  input: "frame/split:2"
+  input: "frame/concat_3/axis"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "frame/Reshape_4"
+  op: "Reshape"
+  input: "frame/GatherV2"
+  input: "frame/concat_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Pad/paddings"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000\t\000\000\000\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Pad"
+  op: "Pad"
+  input: "frame/Reshape_4"
+  input: "Pad/paddings"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "add_1/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2700
+      }
+    }
+  }
+}
+node {
+  name: "add_1"
+  op: "Add"
+  input: "waveform_length_batch_input"
+  input: "add_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Neg"
+  op: "Neg"
+  input: "add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "floordiv/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 300
+      }
+    }
+  }
+}
+node {
+  name: "floordiv"
+  op: "FloorDiv"
+  input: "Neg"
+  input: "floordiv/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Neg_1"
+  op: "Neg"
+  input: "floordiv"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "hw/window_length"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1200
+      }
+    }
+  }
+}
+node {
+  name: "hw/periodic"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_BOOL
+        tensor_shape {
+        }
+        bool_val: true
+      }
+    }
+  }
+}
+node {
+  name: "hw/Cast"
+  op: "Cast"
+  input: "hw/periodic"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "hw/FloorMod/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "hw/FloorMod"
+  op: "FloorMod"
+  input: "hw/window_length"
+  input: "hw/FloorMod/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "hw/sub/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "hw/sub"
+  op: "Sub"
+  input: "hw/sub/x"
+  input: "hw/FloorMod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "hw/mul"
+  op: "Mul"
+  input: "hw/Cast"
+  input: "hw/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "hw/add"
+  op: "Add"
+  input: "hw/window_length"
+  input: "hw/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "hw/sub_1/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "hw/sub_1"
+  op: "Sub"
+  input: "hw/add"
+  input: "hw/sub_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "hw/Cast_1"
+  op: "Cast"
+  input: "hw/sub_1"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "hw/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "hw/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "hw/range"
+  op: "Range"
+  input: "hw/range/start"
+  input: "hw/window_length"
+  input: "hw/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hw/Cast_2"
+  op: "Cast"
+  input: "hw/range"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hw/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 6.28318548203
+      }
+    }
+  }
+}
+node {
+  name: "hw/mul_1"
+  op: "Mul"
+  input: "hw/Const"
+  input: "hw/Cast_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hw/truediv"
+  op: "RealDiv"
+  input: "hw/mul_1"
+  input: "hw/Cast_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hw/Cos"
+  op: "Cos"
+  input: "hw/truediv"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hw/mul_2/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.5
+      }
+    }
+  }
+}
+node {
+  name: "hw/mul_2"
+  op: "Mul"
+  input: "hw/mul_2/x"
+  input: "hw/Cos"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hw/sub_2/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.5
+      }
+    }
+  }
+}
+node {
+  name: "hw/sub_2"
+  op: "Sub"
+  input: "hw/sub_2/x"
+  input: "hw/mul_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "mul_1"
+  op: "Mul"
+  input: "Pad"
+  input: "hw/sub_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Shape_10"
+  op: "Shape"
+  input: "Neg_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Fill_2/value"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_BOOL
+        tensor_shape {
+        }
+        bool_val: false
+      }
+    }
+  }
+}
+node {
+  name: "Fill_2"
+  op: "Fill"
+  input: "Shape_10"
+  input: "Fill_2/value"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "LogicalNot"
+  op: "LogicalNot"
+  input: "Fill_2"
+
+}
+node {
+  name: "Const_10"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "All_1"
+  op: "All"
+  input: "LogicalNot"
+  input: "Const_10"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Assert_1/AssertGuard/Switch"
+  op: "Switch"
+  input: "All_1"
+  input: "All_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "Assert_1/AssertGuard/switch_t"
+  op: "Identity"
+  input: "Assert_1/AssertGuard/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "Assert_1/AssertGuard/switch_f"
+  op: "Identity"
+  input: "Assert_1/AssertGuard/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "Assert_1/AssertGuard/pred_id"
+  op: "Identity"
+  input: "All_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "Assert_1/AssertGuard/NoOp"
+  op: "NoOp"
+  input: "^Assert_1/AssertGuard/switch_t"
+}
+node {
+  name: "Assert_1/AssertGuard/control_dependency"
+  op: "Identity"
+  input: "Assert_1/AssertGuard/switch_t"
+  input: "^Assert_1/AssertGuard/NoOp"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@Assert_1/AssertGuard/switch_t"
+      }
+    }
+  }
+
+}
+node {
+  name: "Assert_1/AssertGuard/Assert"
+  op: "Assert"
+  input: "Assert_1/AssertGuard/Assert/Switch"
+  input: "Assert_1/AssertGuard/Assert/Switch_1"
+  attr {
+    key: "T"
+    value {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    key: "summarize"
+    value {
+      i: 3
+    }
+  }
+}
+node {
+  name: "Assert_1/AssertGuard/Assert/Switch"
+  op: "Switch"
+  input: "All_1"
+  input: "Assert_1/AssertGuard/pred_id"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@All_1"
+      }
+    }
+  }
+
+}
+node {
+  name: "Assert_1/AssertGuard/Assert/Switch_1"
+  op: "Switch"
+  input: "waveform_batch_input"
+  input: "Assert_1/AssertGuard/pred_id"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@waveform_batch_input"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Assert_1/AssertGuard/control_dependency_1"
+  op: "Identity"
+  input: "Assert_1/AssertGuard/switch_f"
+  input: "^Assert_1/AssertGuard/Assert"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@Assert_1/AssertGuard/switch_f"
+      }
+    }
+  }
+
+}
+node {
+  name: "Assert_1/AssertGuard/Merge"
+  op: "Merge"
+  input: "Assert_1/AssertGuard/control_dependency_1"
+  input: "Assert_1/AssertGuard/control_dependency"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "control_dependency_1"
+  op: "Identity"
+  input: "mul_1"
+  input: "^Assert_1/AssertGuard/Merge"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@mul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "encoder_transcripts"
+  op: "Identity"
+  input: "Identity"
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+
+}
+node {
+  name: "encoder_speaker_ids"
+  op: "Identity"
+  input: "speaker_lookup_table_Lookup"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "encoder_vuis"
+  op: "Identity"
+  input: "Fill"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "encoder_input"
+  op: "Identity"
+  input: "control_dependency"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "encoder_input_lengths"
+  op: "Identity"
+  input: "padding_map_fn/TensorArrayStack_2/TensorArrayGatherV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Shape_11"
+  op: "Shape"
+  input: "encoder_input"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_3/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_3/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_3/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_3"
+  op: "StridedSlice"
+  input: "Shape_11"
+  input: "strided_slice_3/stack"
+  input: "strided_slice_3/stack_1"
+  input: "strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_1/range"
+  op: "Range"
+  input: "sequence_length_mask_1/range/start"
+  input: "strided_slice_3"
+  input: "sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "sequence_length_mask_1/range"
+  input: "sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "sequence_length_mask_1/Shape"
+  input: "sequence_length_mask_1/strided_slice/stack"
+  input: "sequence_length_mask_1/strided_slice/stack_1"
+  input: "sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "sequence_length_mask_1/strided_slice"
+  input: "sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "sequence_length_mask_1/ExpandDims"
+  input: "sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "sequence_length_mask_1/Less"
+  op: "Less"
+  input: "sequence_length_mask_1/Tile"
+  input: "sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "encoder_input_mask"
+  op: "Identity"
+  input: "sequence_length_mask_1/Cast"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "strided_slice_4/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_4/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_4/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_4"
+  op: "StridedSlice"
+  input: "Fill_1"
+  input: "strided_slice_4/stack"
+  input: "strided_slice_4/stack_1"
+  input: "strided_slice_4/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "assert_equal_4/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 24000.0
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_4/Equal"
+  op: "Equal"
+  input: "assert_equal_4/x"
+  input: "strided_slice_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "assert_equal_4/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_4/All"
+  op: "All"
+  input: "assert_equal_4/Equal"
+  input: "assert_equal_4/Const"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "assert_equal_4/Assert/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "The provided sample_rate does not match output from reader."
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_4/Assert/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "Condition x == y did not hold element-wise:"
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_4/Assert/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "x (assert_equal_4/x:0) = "
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_4/Assert/Const_3"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "y (strided_slice_4:0) = "
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_4/Assert/AssertGuard/Switch"
+  op: "Switch"
+  input: "assert_equal_4/All"
+  input: "assert_equal_4/All"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "assert_equal_4/Assert/AssertGuard/switch_t"
+  op: "Identity"
+  input: "assert_equal_4/Assert/AssertGuard/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "assert_equal_4/Assert/AssertGuard/switch_f"
+  op: "Identity"
+  input: "assert_equal_4/Assert/AssertGuard/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "assert_equal_4/Assert/AssertGuard/pred_id"
+  op: "Identity"
+  input: "assert_equal_4/All"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "assert_equal_4/Assert/AssertGuard/NoOp"
+  op: "NoOp"
+  input: "^assert_equal_4/Assert/AssertGuard/switch_t"
+}
+node {
+  name: "assert_equal_4/Assert/AssertGuard/control_dependency"
+  op: "Identity"
+  input: "assert_equal_4/Assert/AssertGuard/switch_t"
+  input: "^assert_equal_4/Assert/AssertGuard/NoOp"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@assert_equal_4/Assert/AssertGuard/switch_t"
+      }
+    }
+  }
+
+}
+node {
+  name: "assert_equal_4/Assert/AssertGuard/Assert/data_0"
+  op: "Const"
+  input: "^assert_equal_4/Assert/AssertGuard/switch_f"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "The provided sample_rate does not match output from reader."
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_4/Assert/AssertGuard/Assert/data_1"
+  op: "Const"
+  input: "^assert_equal_4/Assert/AssertGuard/switch_f"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "Condition x == y did not hold element-wise:"
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_4/Assert/AssertGuard/Assert/data_2"
+  op: "Const"
+  input: "^assert_equal_4/Assert/AssertGuard/switch_f"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "x (assert_equal_4/x:0) = "
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_4/Assert/AssertGuard/Assert/data_4"
+  op: "Const"
+  input: "^assert_equal_4/Assert/AssertGuard/switch_f"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "y (strided_slice_4:0) = "
+      }
+    }
+  }
+}
+node {
+  name: "assert_equal_4/Assert/AssertGuard/Assert"
+  op: "Assert"
+  input: "assert_equal_4/Assert/AssertGuard/Assert/Switch"
+  input: "assert_equal_4/Assert/AssertGuard/Assert/data_0"
+  input: "assert_equal_4/Assert/AssertGuard/Assert/data_1"
+  input: "assert_equal_4/Assert/AssertGuard/Assert/data_2"
+  input: "assert_equal_4/Assert/AssertGuard/Assert/Switch_1"
+  input: "assert_equal_4/Assert/AssertGuard/Assert/data_4"
+  input: "assert_equal_4/Assert/AssertGuard/Assert/Switch_2"
+  attr {
+    key: "T"
+    value {
+      list {
+        type: DT_STRING
+        type: DT_STRING
+        type: DT_STRING
+        type: DT_FLOAT
+        type: DT_STRING
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    key: "summarize"
+    value {
+      i: 3
+    }
+  }
+}
+node {
+  name: "assert_equal_4/Assert/AssertGuard/Assert/Switch"
+  op: "Switch"
+  input: "assert_equal_4/All"
+  input: "assert_equal_4/Assert/AssertGuard/pred_id"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@assert_equal_4/All"
+      }
+    }
+  }
+
+}
+node {
+  name: "assert_equal_4/Assert/AssertGuard/Assert/Switch_1"
+  op: "Switch"
+  input: "assert_equal_4/x"
+  input: "assert_equal_4/Assert/AssertGuard/pred_id"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@assert_equal_4/x"
+      }
+    }
+  }
+
+}
+node {
+  name: "assert_equal_4/Assert/AssertGuard/Assert/Switch_2"
+  op: "Switch"
+  input: "strided_slice_4"
+  input: "assert_equal_4/Assert/AssertGuard/pred_id"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@strided_slice_4"
+      }
+    }
+  }
+
+}
+node {
+  name: "assert_equal_4/Assert/AssertGuard/control_dependency_1"
+  op: "Identity"
+  input: "assert_equal_4/Assert/AssertGuard/switch_f"
+  input: "^assert_equal_4/Assert/AssertGuard/Assert"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@assert_equal_4/Assert/AssertGuard/switch_f"
+      }
+    }
+  }
+
+}
+node {
+  name: "assert_equal_4/Assert/AssertGuard/Merge"
+  op: "Merge"
+  input: "assert_equal_4/Assert/AssertGuard/control_dependency_1"
+  input: "assert_equal_4/Assert/AssertGuard/control_dependency"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "rfft/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2048
+      }
+    }
+  }
+}
+node {
+  name: "rfft/Pad/paddings"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000P\003\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "rfft/Pad"
+  op: "Pad"
+  input: "control_dependency_1"
+  input: "rfft/Pad/paddings"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 2048
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "rfft"
+  op: "RFFT"
+  input: "rfft/Pad"
+  input: "rfft/Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Abs_1"
+  op: "ComplexAbs"
+  input: "rfft"
+  attr {
+    key: "T"
+    value {
+      type: DT_COMPLEX64
+    }
+  }
+  attr {
+    key: "Tout"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Square"
+  op: "Square"
+  input: "Abs_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/sample_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 24000.0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/lower_edge_hertz"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 80.0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/upper_edge_hertz"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 12000.0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/truediv/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 2.0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/truediv"
+  op: "RealDiv"
+  input: "linear_to_mel_weight_matrix/sample_rate"
+  input: "linear_to_mel_weight_matrix/truediv/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/LinSpace/num"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1025
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/LinSpace"
+  op: "LinSpace"
+  input: "linear_to_mel_weight_matrix/Const"
+  input: "linear_to_mel_weight_matrix/truediv"
+  input: "linear_to_mel_weight_matrix/LinSpace/num"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/strided_slice"
+  op: "StridedSlice"
+  input: "linear_to_mel_weight_matrix/LinSpace"
+  input: "linear_to_mel_weight_matrix/strided_slice/stack"
+  input: "linear_to_mel_weight_matrix/strided_slice/stack_1"
+  input: "linear_to_mel_weight_matrix/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/hertz_to_mel/truediv/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 700.0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/hertz_to_mel/truediv"
+  op: "RealDiv"
+  input: "linear_to_mel_weight_matrix/strided_slice"
+  input: "linear_to_mel_weight_matrix/hertz_to_mel/truediv/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/hertz_to_mel/add/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/hertz_to_mel/add"
+  op: "Add"
+  input: "linear_to_mel_weight_matrix/hertz_to_mel/add/x"
+  input: "linear_to_mel_weight_matrix/hertz_to_mel/truediv"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/hertz_to_mel/Log"
+  op: "Log"
+  input: "linear_to_mel_weight_matrix/hertz_to_mel/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/hertz_to_mel/mul/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 1127.0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/hertz_to_mel/mul"
+  op: "Mul"
+  input: "linear_to_mel_weight_matrix/hertz_to_mel/mul/x"
+  input: "linear_to_mel_weight_matrix/hertz_to_mel/Log"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/ExpandDims"
+  op: "ExpandDims"
+  input: "linear_to_mel_weight_matrix/hertz_to_mel/mul"
+  input: "linear_to_mel_weight_matrix/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/hertz_to_mel_1/truediv/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 700.0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/hertz_to_mel_1/truediv"
+  op: "RealDiv"
+  input: "linear_to_mel_weight_matrix/lower_edge_hertz"
+  input: "linear_to_mel_weight_matrix/hertz_to_mel_1/truediv/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/hertz_to_mel_1/add/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/hertz_to_mel_1/add"
+  op: "Add"
+  input: "linear_to_mel_weight_matrix/hertz_to_mel_1/add/x"
+  input: "linear_to_mel_weight_matrix/hertz_to_mel_1/truediv"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/hertz_to_mel_1/Log"
+  op: "Log"
+  input: "linear_to_mel_weight_matrix/hertz_to_mel_1/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/hertz_to_mel_1/mul/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 1127.0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/hertz_to_mel_1/mul"
+  op: "Mul"
+  input: "linear_to_mel_weight_matrix/hertz_to_mel_1/mul/x"
+  input: "linear_to_mel_weight_matrix/hertz_to_mel_1/Log"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/hertz_to_mel_2/truediv/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 700.0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/hertz_to_mel_2/truediv"
+  op: "RealDiv"
+  input: "linear_to_mel_weight_matrix/upper_edge_hertz"
+  input: "linear_to_mel_weight_matrix/hertz_to_mel_2/truediv/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/hertz_to_mel_2/add/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/hertz_to_mel_2/add"
+  op: "Add"
+  input: "linear_to_mel_weight_matrix/hertz_to_mel_2/add/x"
+  input: "linear_to_mel_weight_matrix/hertz_to_mel_2/truediv"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/hertz_to_mel_2/Log"
+  op: "Log"
+  input: "linear_to_mel_weight_matrix/hertz_to_mel_2/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/hertz_to_mel_2/mul/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_DOUBLE
+        tensor_shape {
+        }
+        double_val: 1127.0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/hertz_to_mel_2/mul"
+  op: "Mul"
+  input: "linear_to_mel_weight_matrix/hertz_to_mel_2/mul/x"
+  input: "linear_to_mel_weight_matrix/hertz_to_mel_2/Log"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/LinSpace_1/num"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 82
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/LinSpace_1"
+  op: "LinSpace"
+  input: "linear_to_mel_weight_matrix/hertz_to_mel_1/mul"
+  input: "linear_to_mel_weight_matrix/hertz_to_mel_2/mul"
+  input: "linear_to_mel_weight_matrix/LinSpace_1/num"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 82
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/frame_length"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/frame_step"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/Rank"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/range"
+  op: "Range"
+  input: "linear_to_mel_weight_matrix/frame/range/start"
+  input: "linear_to_mel_weight_matrix/frame/Rank"
+  input: "linear_to_mel_weight_matrix/frame/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/add/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/add"
+  op: "Add"
+  input: "linear_to_mel_weight_matrix/frame/axis"
+  input: "linear_to_mel_weight_matrix/frame/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/strided_slice/stack"
+  op: "Pack"
+  input: "linear_to_mel_weight_matrix/frame/axis"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/strided_slice/stack_1"
+  op: "Pack"
+  input: "linear_to_mel_weight_matrix/frame/add"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/strided_slice"
+  op: "StridedSlice"
+  input: "linear_to_mel_weight_matrix/frame/range"
+  input: "linear_to_mel_weight_matrix/frame/strided_slice/stack"
+  input: "linear_to_mel_weight_matrix/frame/strided_slice/stack_1"
+  input: "linear_to_mel_weight_matrix/frame/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/Shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 82
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/sub/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/sub"
+  op: "Sub"
+  input: "linear_to_mel_weight_matrix/frame/Rank"
+  input: "linear_to_mel_weight_matrix/frame/sub/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/sub_1"
+  op: "Sub"
+  input: "linear_to_mel_weight_matrix/frame/sub"
+  input: "linear_to_mel_weight_matrix/frame/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/packed/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/packed"
+  op: "Pack"
+  input: "linear_to_mel_weight_matrix/frame/strided_slice"
+  input: "linear_to_mel_weight_matrix/frame/packed/1"
+  input: "linear_to_mel_weight_matrix/frame/sub_1"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/split"
+  op: "SplitV"
+  input: "linear_to_mel_weight_matrix/frame/Shape"
+  input: "linear_to_mel_weight_matrix/frame/packed"
+  input: "linear_to_mel_weight_matrix/frame/split/split_dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tlen"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "num_split"
+    value {
+      i: 3
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/Reshape/shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/Reshape"
+  op: "Reshape"
+  input: "linear_to_mel_weight_matrix/frame/split:1"
+  input: "linear_to_mel_weight_matrix/frame/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/Size"
+  op: "Size"
+  input: "linear_to_mel_weight_matrix/frame/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/Size_1"
+  op: "Size"
+  input: "linear_to_mel_weight_matrix/frame/split:2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/sub_2"
+  op: "Sub"
+  input: "linear_to_mel_weight_matrix/frame/Reshape"
+  input: "linear_to_mel_weight_matrix/frame/frame_length"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/floordiv"
+  op: "FloorDiv"
+  input: "linear_to_mel_weight_matrix/frame/sub_2"
+  input: "linear_to_mel_weight_matrix/frame/frame_step"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/add_1/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/add_1"
+  op: "Add"
+  input: "linear_to_mel_weight_matrix/frame/add_1/x"
+  input: "linear_to_mel_weight_matrix/frame/floordiv"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/Maximum/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/Maximum"
+  op: "Maximum"
+  input: "linear_to_mel_weight_matrix/frame/Maximum/x"
+  input: "linear_to_mel_weight_matrix/frame/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/gcd/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/floordiv_1"
+  op: "FloorDiv"
+  input: "linear_to_mel_weight_matrix/frame/frame_length"
+  input: "linear_to_mel_weight_matrix/frame/gcd/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/floordiv_2"
+  op: "FloorDiv"
+  input: "linear_to_mel_weight_matrix/frame/frame_step"
+  input: "linear_to_mel_weight_matrix/frame/gcd/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/floordiv_3"
+  op: "FloorDiv"
+  input: "linear_to_mel_weight_matrix/frame/Reshape"
+  input: "linear_to_mel_weight_matrix/frame/gcd/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/mul"
+  op: "Mul"
+  input: "linear_to_mel_weight_matrix/frame/floordiv_3"
+  input: "linear_to_mel_weight_matrix/frame/gcd/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/concat/values_1"
+  op: "Pack"
+  input: "linear_to_mel_weight_matrix/frame/mul"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/concat"
+  op: "ConcatV2"
+  input: "linear_to_mel_weight_matrix/frame/split"
+  input: "linear_to_mel_weight_matrix/frame/concat/values_1"
+  input: "linear_to_mel_weight_matrix/frame/split:2"
+  input: "linear_to_mel_weight_matrix/frame/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/concat_1/values_1"
+  op: "Pack"
+  input: "linear_to_mel_weight_matrix/frame/floordiv_3"
+  input: "linear_to_mel_weight_matrix/frame/gcd/Const"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/concat_1/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/concat_1"
+  op: "ConcatV2"
+  input: "linear_to_mel_weight_matrix/frame/split"
+  input: "linear_to_mel_weight_matrix/frame/concat_1/values_1"
+  input: "linear_to_mel_weight_matrix/frame/split:2"
+  input: "linear_to_mel_weight_matrix/frame/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/zeros_like"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/ones_like/Shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/ones_like/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/ones_like"
+  op: "Fill"
+  input: "linear_to_mel_weight_matrix/frame/ones_like/Shape"
+  input: "linear_to_mel_weight_matrix/frame/ones_like/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/StridedSlice"
+  op: "StridedSlice"
+  input: "linear_to_mel_weight_matrix/LinSpace_1"
+  input: "linear_to_mel_weight_matrix/frame/zeros_like"
+  input: "linear_to_mel_weight_matrix/frame/concat"
+  input: "linear_to_mel_weight_matrix/frame/ones_like"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/Reshape_1"
+  op: "Reshape"
+  input: "linear_to_mel_weight_matrix/frame/StridedSlice"
+  input: "linear_to_mel_weight_matrix/frame/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/range_1/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/range_1/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/range_1"
+  op: "Range"
+  input: "linear_to_mel_weight_matrix/frame/range_1/start"
+  input: "linear_to_mel_weight_matrix/frame/Maximum"
+  input: "linear_to_mel_weight_matrix/frame/range_1/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/mul_1"
+  op: "Mul"
+  input: "linear_to_mel_weight_matrix/frame/range_1"
+  input: "linear_to_mel_weight_matrix/frame/floordiv_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/Reshape_2/shape/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/Reshape_2/shape"
+  op: "Pack"
+  input: "linear_to_mel_weight_matrix/frame/Maximum"
+  input: "linear_to_mel_weight_matrix/frame/Reshape_2/shape/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/Reshape_2"
+  op: "Reshape"
+  input: "linear_to_mel_weight_matrix/frame/mul_1"
+  input: "linear_to_mel_weight_matrix/frame/Reshape_2/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/range_2/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/range_2/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/range_2"
+  op: "Range"
+  input: "linear_to_mel_weight_matrix/frame/range_2/start"
+  input: "linear_to_mel_weight_matrix/frame/floordiv_1"
+  input: "linear_to_mel_weight_matrix/frame/range_2/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/Reshape_3/shape/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/Reshape_3/shape"
+  op: "Pack"
+  input: "linear_to_mel_weight_matrix/frame/Reshape_3/shape/0"
+  input: "linear_to_mel_weight_matrix/frame/floordiv_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/Reshape_3"
+  op: "Reshape"
+  input: "linear_to_mel_weight_matrix/frame/range_2"
+  input: "linear_to_mel_weight_matrix/frame/Reshape_3/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/add_2"
+  op: "Add"
+  input: "linear_to_mel_weight_matrix/frame/Reshape_2"
+  input: "linear_to_mel_weight_matrix/frame/Reshape_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/GatherV2"
+  op: "GatherV2"
+  input: "linear_to_mel_weight_matrix/frame/Reshape_1"
+  input: "linear_to_mel_weight_matrix/frame/add_2"
+  input: "linear_to_mel_weight_matrix/frame/strided_slice"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/concat_2/values_1"
+  op: "Pack"
+  input: "linear_to_mel_weight_matrix/frame/Maximum"
+  input: "linear_to_mel_weight_matrix/frame/frame_length"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/concat_2/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/concat_2"
+  op: "ConcatV2"
+  input: "linear_to_mel_weight_matrix/frame/split"
+  input: "linear_to_mel_weight_matrix/frame/concat_2/values_1"
+  input: "linear_to_mel_weight_matrix/frame/split:2"
+  input: "linear_to_mel_weight_matrix/frame/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "linear_to_mel_weight_matrix/frame/Reshape_4"
+  op: "Reshape"
+  input: "linear_to_mel_weight_matrix/frame/GatherV2"
+  input: "linear_to_mel_weight_matrix/frame/concat_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 80
+          }
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/split"
+  op: "Split"
+  input: "linear_to_mel_weight_matrix/split/split_dim"
+  input: "linear_to_mel_weight_matrix/frame/Reshape_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 80
+          }
+          dim {
+            size: 1
+          }
+        }
+        shape {
+          dim {
+            size: 80
+          }
+          dim {
+            size: 1
+          }
+        }
+        shape {
+          dim {
+            size: 80
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 3
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/Reshape/shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000P\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/Reshape"
+  op: "Reshape"
+  input: "linear_to_mel_weight_matrix/split"
+  input: "linear_to_mel_weight_matrix/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/Reshape_1/shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000P\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/Reshape_1"
+  op: "Reshape"
+  input: "linear_to_mel_weight_matrix/split:1"
+  input: "linear_to_mel_weight_matrix/Reshape_1/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/Reshape_2/shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000P\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/Reshape_2"
+  op: "Reshape"
+  input: "linear_to_mel_weight_matrix/split:2"
+  input: "linear_to_mel_weight_matrix/Reshape_2/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/sub"
+  op: "Sub"
+  input: "linear_to_mel_weight_matrix/ExpandDims"
+  input: "linear_to_mel_weight_matrix/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/sub_1"
+  op: "Sub"
+  input: "linear_to_mel_weight_matrix/Reshape_1"
+  input: "linear_to_mel_weight_matrix/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/truediv_1"
+  op: "RealDiv"
+  input: "linear_to_mel_weight_matrix/sub"
+  input: "linear_to_mel_weight_matrix/sub_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/sub_2"
+  op: "Sub"
+  input: "linear_to_mel_weight_matrix/Reshape_2"
+  input: "linear_to_mel_weight_matrix/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/sub_3"
+  op: "Sub"
+  input: "linear_to_mel_weight_matrix/Reshape_2"
+  input: "linear_to_mel_weight_matrix/Reshape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/truediv_2"
+  op: "RealDiv"
+  input: "linear_to_mel_weight_matrix/sub_2"
+  input: "linear_to_mel_weight_matrix/sub_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/Minimum"
+  op: "Minimum"
+  input: "linear_to_mel_weight_matrix/truediv_1"
+  input: "linear_to_mel_weight_matrix/truediv_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/Maximum"
+  op: "Maximum"
+  input: "linear_to_mel_weight_matrix/Const"
+  input: "linear_to_mel_weight_matrix/Minimum"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/Pad/paddings"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix/Pad"
+  op: "Pad"
+  input: "linear_to_mel_weight_matrix/Maximum"
+  input: "linear_to_mel_weight_matrix/Pad/paddings"
+  attr {
+    key: "T"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1025
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "linear_to_mel_weight_matrix"
+  op: "Cast"
+  input: "linear_to_mel_weight_matrix/Pad"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_DOUBLE
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1025
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/range/limit"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/range"
+  op: "Range"
+  input: "Tensordot/range/start"
+  input: "Tensordot/range/limit"
+  input: "Tensordot/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot/range_1/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/range_1/limit"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/range_1/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/range_1"
+  op: "Range"
+  input: "Tensordot/range_1/start"
+  input: "Tensordot/range_1/limit"
+  input: "Tensordot/range_1/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot/Shape"
+  op: "Shape"
+  input: "Square"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Tensordot/Rank"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/GreaterEqual/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/GreaterEqual"
+  op: "GreaterEqual"
+  input: "Tensordot/range"
+  input: "Tensordot/GreaterEqual/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot/Cast"
+  op: "Cast"
+  input: "Tensordot/GreaterEqual"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "Tensordot/mul"
+  op: "Mul"
+  input: "Tensordot/Cast"
+  input: "Tensordot/range"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot/Less/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/Less"
+  op: "Less"
+  input: "Tensordot/range"
+  input: "Tensordot/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot/Cast_1"
+  op: "Cast"
+  input: "Tensordot/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "Tensordot/add"
+  op: "Add"
+  input: "Tensordot/range"
+  input: "Tensordot/Rank"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot/mul_1"
+  op: "Mul"
+  input: "Tensordot/Cast_1"
+  input: "Tensordot/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot/add_1"
+  op: "Add"
+  input: "Tensordot/mul"
+  input: "Tensordot/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot/range_2/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/range_2/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/range_2"
+  op: "Range"
+  input: "Tensordot/range_2/start"
+  input: "Tensordot/Rank"
+  input: "Tensordot/range_2/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/ListDiff"
+  op: "ListDiff"
+  input: "Tensordot/range_2"
+  input: "Tensordot/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_idx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Tensordot/Gather"
+  op: "Gather"
+  input: "Tensordot/Shape"
+  input: "Tensordot/ListDiff"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "Tensordot/Gather_1"
+  op: "Gather"
+  input: "Tensordot/Shape"
+  input: "Tensordot/add_1"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "Tensordot/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/Prod"
+  op: "Prod"
+  input: "Tensordot/Gather"
+  input: "Tensordot/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Tensordot/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/Prod_1"
+  op: "Prod"
+  input: "Tensordot/Gather_1"
+  input: "Tensordot/Const_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Tensordot/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/concat"
+  op: "ConcatV2"
+  input: "Tensordot/Gather_1"
+  input: "Tensordot/Gather"
+  input: "Tensordot/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot/concat_1/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/concat_1"
+  op: "ConcatV2"
+  input: "Tensordot/ListDiff"
+  input: "Tensordot/add_1"
+  input: "Tensordot/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot/stack"
+  op: "Pack"
+  input: "Tensordot/Prod"
+  input: "Tensordot/Prod_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Tensordot/transpose"
+  op: "Transpose"
+  input: "Square"
+  input: "Tensordot/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/Reshape"
+  op: "Reshape"
+  input: "Tensordot/transpose"
+  input: "Tensordot/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot/Shape_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\004\000\000P\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/Rank_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/GreaterEqual_1/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/GreaterEqual_1"
+  op: "GreaterEqual"
+  input: "Tensordot/range_1"
+  input: "Tensordot/GreaterEqual_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot/Cast_2"
+  op: "Cast"
+  input: "Tensordot/GreaterEqual_1"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "Tensordot/mul_2"
+  op: "Mul"
+  input: "Tensordot/Cast_2"
+  input: "Tensordot/range_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot/Less_1/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/Less_1"
+  op: "Less"
+  input: "Tensordot/range_1"
+  input: "Tensordot/Less_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot/Cast_3"
+  op: "Cast"
+  input: "Tensordot/Less_1"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "Tensordot/add_2"
+  op: "Add"
+  input: "Tensordot/range_1"
+  input: "Tensordot/Rank_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot/mul_3"
+  op: "Mul"
+  input: "Tensordot/Cast_3"
+  input: "Tensordot/add_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot/add_3"
+  op: "Add"
+  input: "Tensordot/mul_2"
+  input: "Tensordot/mul_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot/range_3/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/range_3/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/range_3"
+  op: "Range"
+  input: "Tensordot/range_3/start"
+  input: "Tensordot/Rank_1"
+  input: "Tensordot/range_3/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot/ListDiff_1"
+  op: "ListDiff"
+  input: "Tensordot/range_3"
+  input: "Tensordot/add_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_idx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Tensordot/Gather_2"
+  op: "Gather"
+  input: "Tensordot/Shape_1"
+  input: "Tensordot/ListDiff_1"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "Tensordot/Gather_3"
+  op: "Gather"
+  input: "Tensordot/Shape_1"
+  input: "Tensordot/add_3"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "Tensordot/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/Prod_2"
+  op: "Prod"
+  input: "Tensordot/Gather_2"
+  input: "Tensordot/Const_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Tensordot/Const_3"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/Prod_3"
+  op: "Prod"
+  input: "Tensordot/Gather_3"
+  input: "Tensordot/Const_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Tensordot/concat_2/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/concat_2"
+  op: "ConcatV2"
+  input: "Tensordot/Gather_3"
+  input: "Tensordot/Gather_2"
+  input: "Tensordot/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot/concat_3/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/concat_3"
+  op: "ConcatV2"
+  input: "Tensordot/add_3"
+  input: "Tensordot/ListDiff_1"
+  input: "Tensordot/concat_3/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot/stack_1"
+  op: "Pack"
+  input: "Tensordot/Prod_3"
+  input: "Tensordot/Prod_2"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Tensordot/transpose_1"
+  op: "Transpose"
+  input: "linear_to_mel_weight_matrix"
+  input: "Tensordot/concat_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot/Reshape_1"
+  op: "Reshape"
+  input: "Tensordot/transpose_1"
+  input: "Tensordot/stack_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot/MatMul"
+  op: "MatMul"
+  input: "Tensordot/Reshape"
+  input: "Tensordot/Reshape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Tensordot/concat_4/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Tensordot/concat_4"
+  op: "ConcatV2"
+  input: "Tensordot/Gather"
+  input: "Tensordot/Gather_2"
+  input: "Tensordot/concat_4/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Tensordot"
+  op: "Reshape"
+  input: "Tensordot/MatMul"
+  input: "Tensordot/concat_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "pow/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.5
+      }
+    }
+  }
+}
+node {
+  name: "pow"
+  op: "Pow"
+  input: "Tensordot"
+  input: "pow/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "add_2/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 9.99999997475e-07
+      }
+    }
+  }
+}
+node {
+  name: "add_2"
+  op: "Add"
+  input: "pow"
+  input: "add_2/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Log"
+  op: "Log"
+  input: "add_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Maximum/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -8.0
+      }
+    }
+  }
+}
+node {
+  name: "Maximum"
+  op: "Maximum"
+  input: "Log"
+  input: "Maximum/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "control_dependency_2"
+  op: "Identity"
+  input: "Maximum"
+  input: "^assert_equal_4/Assert/AssertGuard/Merge"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@Maximum"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Const_11"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "add_3/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 9.99999997475e-07
+      }
+    }
+  }
+}
+node {
+  name: "add_3"
+  op: "Add"
+  input: "Const_11"
+  input: "add_3/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "Log_1"
+  op: "Log"
+  input: "add_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "Maximum_1/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -8.0
+      }
+    }
+  }
+}
+node {
+  name: "Maximum_1"
+  op: "Maximum"
+  input: "Log_1"
+  input: "Maximum_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "decoder_target"
+  op: "Identity"
+  input: "control_dependency_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "decoder_target_lengths"
+  op: "Identity"
+  input: "Neg_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "strided_slice_5/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_5/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_5/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_5"
+  op: "StridedSlice"
+  input: "Fill_1"
+  input: "strided_slice_5/stack"
+  input: "strided_slice_5/stack_1"
+  input: "strided_slice_5/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "Equal"
+  op: "Equal"
+  input: "Fill_1"
+  input: "strided_slice_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "Const_12"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "All_2"
+  op: "All"
+  input: "Equal"
+  input: "Const_12"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "Assert_2/AssertGuard/Switch"
+  op: "Switch"
+  input: "All_2"
+  input: "All_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "Assert_2/AssertGuard/switch_t"
+  op: "Identity"
+  input: "Assert_2/AssertGuard/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "Assert_2/AssertGuard/switch_f"
+  op: "Identity"
+  input: "Assert_2/AssertGuard/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "Assert_2/AssertGuard/pred_id"
+  op: "Identity"
+  input: "All_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "Assert_2/AssertGuard/NoOp"
+  op: "NoOp"
+  input: "^Assert_2/AssertGuard/switch_t"
+}
+node {
+  name: "Assert_2/AssertGuard/control_dependency"
+  op: "Identity"
+  input: "Assert_2/AssertGuard/switch_t"
+  input: "^Assert_2/AssertGuard/NoOp"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@Assert_2/AssertGuard/switch_t"
+      }
+    }
+  }
+
+}
+node {
+  name: "Assert_2/AssertGuard/Assert"
+  op: "Assert"
+  input: "Assert_2/AssertGuard/Assert/Switch"
+  input: "Assert_2/AssertGuard/Assert/Switch_1"
+  attr {
+    key: "T"
+    value {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    key: "summarize"
+    value {
+      i: 3
+    }
+  }
+}
+node {
+  name: "Assert_2/AssertGuard/Assert/Switch"
+  op: "Switch"
+  input: "All_2"
+  input: "Assert_2/AssertGuard/pred_id"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@All_2"
+      }
+    }
+  }
+
+}
+node {
+  name: "Assert_2/AssertGuard/Assert/Switch_1"
+  op: "Switch"
+  input: "Fill_1"
+  input: "Assert_2/AssertGuard/pred_id"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@Fill_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Assert_2/AssertGuard/control_dependency_1"
+  op: "Identity"
+  input: "Assert_2/AssertGuard/switch_f"
+  input: "^Assert_2/AssertGuard/Assert"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@Assert_2/AssertGuard/switch_f"
+      }
+    }
+  }
+
+}
+node {
+  name: "Assert_2/AssertGuard/Merge"
+  op: "Merge"
+  input: "Assert_2/AssertGuard/control_dependency_1"
+  input: "Assert_2/AssertGuard/control_dependency"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "control_dependency_3"
+  op: "Identity"
+  input: "strided_slice_5"
+  input: "^Assert_2/AssertGuard/Merge"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@strided_slice_5"
+      }
+    }
+  }
+
+}
+node {
+  name: "decoder_target_sample_rate"
+  op: "Identity"
+  input: "control_dependency_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "Shape_12"
+  op: "Shape"
+  input: "decoder_target"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_6/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_6/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_6/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_6"
+  op: "StridedSlice"
+  input: "Shape_12"
+  input: "strided_slice_6/stack"
+  input: "strided_slice_6/stack_1"
+  input: "strided_slice_6/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_2/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_2/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_2/range"
+  op: "Range"
+  input: "sequence_length_mask_2/range/start"
+  input: "strided_slice_6"
+  input: "sequence_length_mask_2/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "sequence_length_mask_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_2/ExpandDims"
+  op: "ExpandDims"
+  input: "sequence_length_mask_2/range"
+  input: "sequence_length_mask_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "sequence_length_mask_2/Shape"
+  op: "Shape"
+  input: "decoder_target_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_2/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_2/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_2/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_2/strided_slice"
+  op: "StridedSlice"
+  input: "sequence_length_mask_2/Shape"
+  input: "sequence_length_mask_2/strided_slice/stack"
+  input: "sequence_length_mask_2/strided_slice/stack_1"
+  input: "sequence_length_mask_2/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_2/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_2/Tile/multiples"
+  op: "Pack"
+  input: "sequence_length_mask_2/strided_slice"
+  input: "sequence_length_mask_2/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_2/Tile"
+  op: "Tile"
+  input: "sequence_length_mask_2/ExpandDims"
+  input: "sequence_length_mask_2/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "sequence_length_mask_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "decoder_target_lengths"
+  input: "sequence_length_mask_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "sequence_length_mask_2/Less"
+  op: "Less"
+  input: "sequence_length_mask_2/Tile"
+  input: "sequence_length_mask_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "sequence_length_mask_2/Cast"
+  op: "Cast"
+  input: "sequence_length_mask_2/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "decoder_target_mask"
+  op: "Identity"
+  input: "sequence_length_mask_2/Cast"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "max_decoder_output_length"
+  op: "Identity"
+  input: "decoder_output_length"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "rfft_1/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2048
+      }
+    }
+  }
+}
+node {
+  name: "rfft_1/Pad/paddings"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000P\003\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "rfft_1/Pad"
+  op: "Pad"
+  input: "control_dependency_1"
+  input: "rfft_1/Pad/paddings"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 2048
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "rfft_1"
+  op: "RFFT"
+  input: "rfft_1/Pad"
+  input: "rfft_1/Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Abs_2"
+  op: "ComplexAbs"
+  input: "rfft_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_COMPLEX64
+    }
+  }
+  attr {
+    key: "Tout"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Square_1"
+  op: "Square"
+  input: "Abs_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "pow_1/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.5
+      }
+    }
+  }
+}
+node {
+  name: "pow_1"
+  op: "Pow"
+  input: "Square_1"
+  input: "pow_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "add_4/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 9.99999997475e-07
+      }
+    }
+  }
+}
+node {
+  name: "add_4"
+  op: "Add"
+  input: "pow_1"
+  input: "add_4/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Log_2"
+  op: "Log"
+  input: "add_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Maximum_2/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -8.0
+      }
+    }
+  }
+}
+node {
+  name: "Maximum_2"
+  op: "Maximum"
+  input: "Log_2"
+  input: "Maximum_2/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "synthesis_target"
+  op: "Identity"
+  input: "Maximum_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "synthesis_target_lengths"
+  op: "Identity"
+  input: "Neg_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Shape_13"
+  op: "Shape"
+  input: "synthesis_target"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "strided_slice_7/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_7/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_7/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_7"
+  op: "StridedSlice"
+  input: "Shape_13"
+  input: "strided_slice_7/stack"
+  input: "strided_slice_7/stack_1"
+  input: "strided_slice_7/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_3/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_3/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_3/range"
+  op: "Range"
+  input: "sequence_length_mask_3/range/start"
+  input: "strided_slice_7"
+  input: "sequence_length_mask_3/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "sequence_length_mask_3/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_3/ExpandDims"
+  op: "ExpandDims"
+  input: "sequence_length_mask_3/range"
+  input: "sequence_length_mask_3/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "sequence_length_mask_3/Shape"
+  op: "Shape"
+  input: "synthesis_target_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_3/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_3/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_3/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_3/strided_slice"
+  op: "StridedSlice"
+  input: "sequence_length_mask_3/Shape"
+  input: "sequence_length_mask_3/strided_slice/stack"
+  input: "sequence_length_mask_3/strided_slice/stack_1"
+  input: "sequence_length_mask_3/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_3/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_3/Tile/multiples"
+  op: "Pack"
+  input: "sequence_length_mask_3/strided_slice"
+  input: "sequence_length_mask_3/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_3/Tile"
+  op: "Tile"
+  input: "sequence_length_mask_3/ExpandDims"
+  input: "sequence_length_mask_3/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "sequence_length_mask_3/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "sequence_length_mask_3/ExpandDims_1"
+  op: "ExpandDims"
+  input: "synthesis_target_lengths"
+  input: "sequence_length_mask_3/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "sequence_length_mask_3/Less"
+  op: "Less"
+  input: "sequence_length_mask_3/Tile"
+  input: "sequence_length_mask_3/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "sequence_length_mask_3/Cast"
+  op: "Cast"
+  input: "sequence_length_mask_3/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "synthesis_target_mask"
+  op: "Identity"
+  input: "decoder_target_mask"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "decoder_input_sample_prob/tags"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "decoder_input_sample_prob"
+      }
+    }
+  }
+}
+node {
+  name: "decoder_input_sample_prob/values"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "decoder_input_sample_prob"
+  op: "ScalarSummary"
+  input: "decoder_input_sample_prob/tags"
+  input: "decoder_input_sample_prob/values"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/speaker_embedding/embedding/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_embedding/embedding"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: ")\000\000\000@\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/speaker_embedding/embedding/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_embedding/embedding"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.239045724273
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/speaker_embedding/embedding/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_embedding/embedding"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.239045724273
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/speaker_embedding/embedding/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/speaker_embedding/embedding/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_embedding/embedding"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 41
+          }
+          dim {
+            size: 64
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/speaker_embedding/embedding/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/speaker_embedding/embedding/Initializer/random_uniform/max"
+  input: "seq2seq/speaker_embedding/embedding/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_embedding/embedding"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/speaker_embedding/embedding/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/speaker_embedding/embedding/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/speaker_embedding/embedding/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_embedding/embedding"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 41
+          }
+          dim {
+            size: 64
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/speaker_embedding/embedding/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/speaker_embedding/embedding/Initializer/random_uniform/mul"
+  input: "seq2seq/speaker_embedding/embedding/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_embedding/embedding"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 41
+          }
+          dim {
+            size: 64
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/speaker_embedding/embedding"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_embedding/embedding"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 41
+          }
+          dim {
+            size: 64
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 41
+        }
+        dim {
+          size: 64
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/speaker_embedding/embedding/Assign"
+  op: "Assign"
+  input: "seq2seq/speaker_embedding/embedding"
+  input: "seq2seq/speaker_embedding/embedding/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_embedding/embedding"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 41
+          }
+          dim {
+            size: 64
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/speaker_embedding/embedding/read"
+  op: "Identity"
+  input: "seq2seq/speaker_embedding/embedding"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_embedding/embedding"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 41
+          }
+          dim {
+            size: 64
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/embedding_lookup"
+  op: "Gather"
+  input: "seq2seq/speaker_embedding/embedding/read"
+  input: "encoder_speaker_ids"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_embedding/embedding"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 64
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "speaker_embedding_lookup"
+  op: "PlaceholderWithDefault"
+  input: "seq2seq/embedding_lookup"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 64
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+        dim {
+          size: 64
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/Mean"
+  op: "Mean"
+  input: "seq2seq/Const"
+  input: "seq2seq/Const_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/embedding/embedding/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/embedding/embedding"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "8\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/embedding/embedding/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/embedding/embedding"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.138675048947
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/embedding/embedding/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/embedding/embedding"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.138675048947
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/embedding/embedding/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/embedding/embedding/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/embedding/embedding"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 56
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/embedding/embedding/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/embedding/embedding/Initializer/random_uniform/max"
+  input: "seq2seq/embedding/embedding/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/embedding/embedding"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/embedding/embedding/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/embedding/embedding/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/embedding/embedding/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/embedding/embedding"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 56
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/embedding/embedding/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/embedding/embedding/Initializer/random_uniform/mul"
+  input: "seq2seq/embedding/embedding/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/embedding/embedding"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 56
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/embedding/embedding"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/embedding/embedding"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 56
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 56
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/embedding/embedding/Assign"
+  op: "Assign"
+  input: "seq2seq/embedding/embedding"
+  input: "seq2seq/embedding/embedding/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/embedding/embedding"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 56
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/embedding/embedding/read"
+  op: "Identity"
+  input: "seq2seq/embedding/embedding"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/embedding/embedding"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 56
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/embedding/token_to_embedding"
+  op: "Gather"
+  input: "seq2seq/embedding/embedding/read"
+  input: "encoder_input"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/embedding/embedding"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\001\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.10825317353
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.10825317353
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/biases"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/biases/read"
+  op: "Identity"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/embedding/token_to_embedding"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Rank"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/axes"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/GreaterEqual/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/GreaterEqual"
+  op: "GreaterEqual"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/GreaterEqual/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/GreaterEqual"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/axes"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Less/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Cast_1"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/add"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Rank"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Cast_1"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/mul"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Rank"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/ListDiff"
+  op: "ListDiff"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/range"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_idx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Gather"
+  op: "Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/ListDiff"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Gather_1"
+  op: "Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/add_1"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Prod"
+  op: "Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Prod_1"
+  op: "Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Gather_1"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Const_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Gather_1"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/concat_1/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/concat_1"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/ListDiff"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/add_1"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/stack"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Prod_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/transpose"
+  op: "Transpose"
+  input: "seq2seq/seq2seq/embedding/token_to_embedding"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Reshape"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/transpose_1/perm"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/transpose_1"
+  op: "Transpose"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/weights/read"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/transpose_1/perm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Reshape_1/shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\001\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Reshape_1"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/transpose_1"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Reshape_1/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Reshape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 256
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/concat_2/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/concat_2"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/Const_2"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot/concat_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Tensordot"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/Dropout/Identity"
+  op: "Identity"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\001\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.125
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.125
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/biases"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/biases/read"
+  op: "Identity"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/Dropout/Identity"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Rank"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/axes"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/GreaterEqual/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/GreaterEqual"
+  op: "GreaterEqual"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/GreaterEqual/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/GreaterEqual"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/axes"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Less/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Cast_1"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/add"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Rank"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Cast_1"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/mul"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Rank"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/ListDiff"
+  op: "ListDiff"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/range"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_idx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Gather"
+  op: "Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/ListDiff"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Gather_1"
+  op: "Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/add_1"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Prod"
+  op: "Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Prod_1"
+  op: "Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Gather_1"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Const_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Gather_1"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/concat_1/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/concat_1"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/ListDiff"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/add_1"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/stack"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Prod_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/transpose"
+  op: "Transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/Dropout/Identity"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Reshape"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/transpose_1/perm"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/transpose_1"
+  op: "Transpose"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/weights/read"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/transpose_1/perm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Reshape_1/shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\001\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Reshape_1"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/transpose_1"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Reshape_1/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Reshape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/concat_2/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/concat_2"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/Const_2"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot/concat_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Tensordot"
+  input: "seq2seq/encoder/pre_enc_rnn_net/fully_connected_1/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/Dropout_1/Identity"
+  op: "Identity"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/fully_connected_1/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/pre_enc_rnn_net/Dropout_1/Identity"
+  input: "seq2seq/seq2seq/encoder/encoder/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\001\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.013505294919
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.013505294919
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 1
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 128
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/Const"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/beta/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/beta"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/beta/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/beta"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/beta/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/beta/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/beta"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_mean/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_mean"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_mean/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_mean"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_mean/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_mean/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_mean"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_variance/Initializer/ones"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_variance"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_variance/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_variance"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_variance/Initializer/ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_variance/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_variance"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/FusedBatchNorm"
+  op: "FusedBatchNorm"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/Const"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/beta/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_mean/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/moving_variance/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "epsilon"
+    value {
+      f: 0.0010000000475
+    }
+  }
+  attr {
+    key: "is_training"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0010000000475
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/BatchNorm/FusedBatchNorm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter0/Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\002\000\000\000\200\000\000\000\001\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.00954968575388
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.00954968575388
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 2
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 1
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 128
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/Const"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/beta/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/beta"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/beta/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/beta"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/beta/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/beta/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/beta"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_mean/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_mean"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_mean/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_mean"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_mean/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_mean/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_mean"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_variance/Initializer/ones"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_variance"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_variance/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_variance"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_variance/Initializer/ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_variance/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_variance"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/FusedBatchNorm"
+  op: "FusedBatchNorm"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/Const"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/beta/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_mean/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/moving_variance/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "epsilon"
+    value {
+      f: 0.0010000000475
+    }
+  }
+  attr {
+    key: "is_training"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0010000000475
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/BatchNorm/FusedBatchNorm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_1/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_2/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_2"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_2/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_3/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_3"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_3/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter1/Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\001\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.00779728591442
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.00779728591442
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 1
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 128
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/Const"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/beta/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/beta"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/beta/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/beta"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/beta/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/beta/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/beta"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_mean/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_mean"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_mean/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_mean"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_mean/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_mean/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_mean"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_variance/Initializer/ones"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_variance"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_variance/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_variance"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_variance/Initializer/ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_variance/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_variance"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/FusedBatchNorm"
+  op: "FusedBatchNorm"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/Const"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/beta/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_mean/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/moving_variance/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "epsilon"
+    value {
+      f: 0.0010000000475
+    }
+  }
+  attr {
+    key: "is_training"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0010000000475
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/BatchNorm/FusedBatchNorm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_2"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_2/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_4/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_4"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_2/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_4/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_5/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_5"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_4"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_5/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_2"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter2/Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\004\000\000\000\200\000\000\000\001\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.00675264745951
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.00675264745951
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 4
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 1
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 128
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/Const"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/beta/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/beta"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/beta/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/beta"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/beta/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/beta/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/beta"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_mean/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_mean"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_mean/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_mean"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_mean/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_mean/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_mean"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_variance/Initializer/ones"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_variance"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_variance/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_variance"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_variance/Initializer/ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_variance/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_variance"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/FusedBatchNorm"
+  op: "FusedBatchNorm"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/Const"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/beta/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_mean/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/moving_variance/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "epsilon"
+    value {
+      f: 0.0010000000475
+    }
+  }
+  attr {
+    key: "is_training"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0010000000475
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/BatchNorm/FusedBatchNorm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_3"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_3/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_3/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_3/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_3"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_3/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_3/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_6/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_6"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_3/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_6/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_7/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_7"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_6"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_7/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_3"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter3/Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_7"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\005\000\000\000\200\000\000\000\001\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.00603975169361
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.00603975169361
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 5
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 5
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 5
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 5
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 5
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 1
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 5
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 5
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 128
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/Const"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/beta/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/beta"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/beta/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/beta"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/beta/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/beta/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/beta"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_mean/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_mean"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_mean/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_mean"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_mean/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_mean/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_mean"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_variance/Initializer/ones"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_variance"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_variance/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_variance"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_variance/Initializer/ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_variance/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_variance"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/FusedBatchNorm"
+  op: "FusedBatchNorm"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/Const"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/beta/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_mean/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/moving_variance/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "epsilon"
+    value {
+      f: 0.0010000000475
+    }
+  }
+  attr {
+    key: "is_training"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0010000000475
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/BatchNorm/FusedBatchNorm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_4"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_4/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_4/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_4/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_4"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_4"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_4/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_4/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_4/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_4"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_8/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_8"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_4/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_8/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_9/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_9"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_8"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_9/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_4"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter4/Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_9"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\006\000\000\000\200\000\000\000\001\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.00551351346076
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.00551351346076
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 6
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 6
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 6
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 6
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 6
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 1
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 6
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 6
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 128
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/Const"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/beta/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/beta"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/beta/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/beta"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/beta/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/beta/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/beta"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_mean/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_mean"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_mean/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_mean"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_mean/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_mean/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_mean"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_variance/Initializer/ones"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_variance"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_variance/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_variance"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_variance/Initializer/ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_variance/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_variance"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/FusedBatchNorm"
+  op: "FusedBatchNorm"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/Const"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/beta/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_mean/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/moving_variance/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "epsilon"
+    value {
+      f: 0.0010000000475
+    }
+  }
+  attr {
+    key: "is_training"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0010000000475
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/BatchNorm/FusedBatchNorm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_5"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_5/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_5/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_5/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_5"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_5"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_5/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_5/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_5/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_5"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_10/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_10"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_5/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_10/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_11/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_11"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_10"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_11/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_5"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter5/Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_11"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\007\000\000\000\200\000\000\000\001\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.00510452175513
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.00510452175513
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 7
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 7
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 7
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 7
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 7
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 1
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 7
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 7
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 128
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/Const"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/beta/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/beta"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/beta/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/beta"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/beta/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/beta/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/beta"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_mean/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_mean"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_mean/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_mean"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_mean/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_mean/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_mean"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_variance/Initializer/ones"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_variance"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_variance/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_variance"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_variance/Initializer/ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_variance/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_variance"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/FusedBatchNorm"
+  op: "FusedBatchNorm"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/Const"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/beta/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_mean/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/moving_variance/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "epsilon"
+    value {
+      f: 0.0010000000475
+    }
+  }
+  attr {
+    key: "is_training"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0010000000475
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/BatchNorm/FusedBatchNorm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_6"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_6/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_6/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_6/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_6"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_6"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_6/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_6/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_6/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_6"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_12/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_12"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_6/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_12/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_13/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_13"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_12"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_13/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_6"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter6/Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_13"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\010\000\000\000\200\000\000\000\001\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.00477484287694
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.00477484287694
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 8
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 8
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 8
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 8
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 8
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 1
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 8
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 8
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 128
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/Const"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/beta/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/beta"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/beta/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/beta"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/beta/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/beta/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/beta"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_mean/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_mean"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_mean/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_mean"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_mean/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_mean/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_mean"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_variance/Initializer/ones"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_variance"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_variance/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_variance"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_variance/Initializer/ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_variance/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_variance"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/FusedBatchNorm"
+  op: "FusedBatchNorm"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/Const"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/beta/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_mean/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/moving_variance/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "epsilon"
+    value {
+      f: 0.0010000000475
+    }
+  }
+  attr {
+    key: "is_training"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0010000000475
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/BatchNorm/FusedBatchNorm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_7"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_7/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_7/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_7/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_7"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_7"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_7/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_7/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_7/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_7"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_14/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_14"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_7/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_14/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_15/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_15"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_14"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_15/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_7"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter7/Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_15"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\t\000\000\000\200\000\000\000\001\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.00450176512823
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.00450176512823
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 9
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 9
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 9
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 9
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 9
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 1
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 9
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 9
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 128
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/Const"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/beta/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/beta"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/beta/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/beta"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/beta/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/beta/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/beta"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_mean/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_mean"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_mean/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_mean"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_mean/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_mean/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_mean"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_variance/Initializer/ones"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_variance"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_variance/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_variance"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_variance/Initializer/ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_variance/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_variance"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/FusedBatchNorm"
+  op: "FusedBatchNorm"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/Const"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/beta/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_mean/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/moving_variance/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "epsilon"
+    value {
+      f: 0.0010000000475
+    }
+  }
+  attr {
+    key: "is_training"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0010000000475
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/BatchNorm/FusedBatchNorm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_8"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_8/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_8/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_8/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_8"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_8"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_8/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_8/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_8/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_8"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_16/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_16"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_8/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_16/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_17/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_17"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_16"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_17/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_8"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter8/Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_17"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\n\000\000\000\200\000\000\000\001\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.00427074916661
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.00427074916661
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 10
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 1
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 128
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/Const"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/beta/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/beta"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/beta/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/beta"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/beta/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/beta/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/beta"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_mean/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_mean"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_mean/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_mean"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_mean/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_mean/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_mean"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_variance/Initializer/ones"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_variance"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_variance/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_variance"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_variance/Initializer/ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_variance/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_variance"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/FusedBatchNorm"
+  op: "FusedBatchNorm"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/Const"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/beta/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_mean/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/moving_variance/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "epsilon"
+    value {
+      f: 0.0010000000475
+    }
+  }
+  attr {
+    key: "is_training"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0010000000475
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/BatchNorm/FusedBatchNorm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_9"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_9/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_9/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_9/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_9"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_9"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_9/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_9/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_9/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_9"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_18/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_18"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_9/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_18/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_19/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_19"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_18"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_19/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_9"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter9/Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_19"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\013\000\000\000\200\000\000\000\001\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.00407199980691
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.00407199980691
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 11
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 11
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 11
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 11
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 11
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 1
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 11
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 11
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 128
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/Const"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/beta/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/beta"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/beta/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/beta"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/beta/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/beta/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/beta"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_mean/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_mean"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_mean/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_mean"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_mean/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_mean/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_mean"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_variance/Initializer/ones"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_variance"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_variance/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_variance"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_variance/Initializer/ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_variance/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_variance"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/FusedBatchNorm"
+  op: "FusedBatchNorm"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/Const"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/beta/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_mean/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/moving_variance/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "epsilon"
+    value {
+      f: 0.0010000000475
+    }
+  }
+  attr {
+    key: "is_training"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0010000000475
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/BatchNorm/FusedBatchNorm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_10"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_10/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_10/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_10/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_10"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_10"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_10/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_10/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_10/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_10"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_20/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_20"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_10/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_20/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_21/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_21"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_20"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_21/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_10"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter10/Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_21"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\014\000\000\000\200\000\000\000\001\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.00389864295721
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.00389864295721
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 12
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 12
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 12
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 12
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 12
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 1
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 12
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 12
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 128
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/Const"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/beta/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/beta"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/beta/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/beta"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/beta/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/beta/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/beta"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_mean/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_mean"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_mean/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_mean"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_mean/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_mean/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_mean"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_variance/Initializer/ones"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_variance"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_variance/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_variance"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_variance/Initializer/ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_variance/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_variance"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/FusedBatchNorm"
+  op: "FusedBatchNorm"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/Const"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/beta/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_mean/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/moving_variance/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "epsilon"
+    value {
+      f: 0.0010000000475
+    }
+  }
+  attr {
+    key: "is_training"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0010000000475
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/BatchNorm/FusedBatchNorm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_11"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_11/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_11/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_11/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_11"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_11"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_11/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_11/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_11/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_11"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_22/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_22"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_11/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_22/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_23/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_23"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_22"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_23/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_11"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter11/Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_23"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\r\000\000\000\200\000\000\000\001\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.00374569487758
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.00374569487758
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 13
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 13
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 13
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 13
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 13
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 1
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 13
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 13
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 128
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/Const"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/beta/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/beta"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/beta/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/beta"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/beta/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/beta/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/beta"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_mean/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_mean"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_mean/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_mean"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_mean/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_mean/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_mean"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_variance/Initializer/ones"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_variance"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_variance/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_variance"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_variance/Initializer/ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_variance/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_variance"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/FusedBatchNorm"
+  op: "FusedBatchNorm"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/Const"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/beta/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_mean/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/moving_variance/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "epsilon"
+    value {
+      f: 0.0010000000475
+    }
+  }
+  attr {
+    key: "is_training"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0010000000475
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/BatchNorm/FusedBatchNorm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_12"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_12/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_12/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_12/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_12"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_12"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_12/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_12/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_12/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_12"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_24/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_24"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_12/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_24/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_25/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_25"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_24"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_25/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_12"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter12/Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_25"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\016\000\000\000\200\000\000\000\001\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.00360944191925
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.00360944191925
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 14
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 14
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 14
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 14
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 14
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 1
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 14
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 14
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 128
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/Const"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/beta/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/beta"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/beta/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/beta"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/beta/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/beta/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/beta"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_mean/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_mean"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_mean/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_mean"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_mean/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_mean/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_mean"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_variance/Initializer/ones"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_variance"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_variance/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_variance"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_variance/Initializer/ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_variance/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_variance"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/FusedBatchNorm"
+  op: "FusedBatchNorm"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/Const"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/beta/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_mean/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/moving_variance/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "epsilon"
+    value {
+      f: 0.0010000000475
+    }
+  }
+  attr {
+    key: "is_training"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0010000000475
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/BatchNorm/FusedBatchNorm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_13"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_13/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_13/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_13/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_13"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_13"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_13/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_13/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_13/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_13"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_26/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_26"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_13/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_26/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_27/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_27"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_26"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_27/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_13"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter13/Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_27"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\017\000\000\000\200\000\000\000\001\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.00348705216311
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.00348705216311
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 15
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 15
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 15
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 15
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 15
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 1
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 15
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 15
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 128
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/Const"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/beta/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/beta"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/beta/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/beta"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/beta/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/beta/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/beta"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_mean/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_mean"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_mean/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_mean"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_mean/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_mean/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_mean"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_variance/Initializer/ones"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_variance"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_variance/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_variance"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_variance/Initializer/ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_variance/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_variance"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/FusedBatchNorm"
+  op: "FusedBatchNorm"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/Const"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/beta/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_mean/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/moving_variance/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "epsilon"
+    value {
+      f: 0.0010000000475
+    }
+  }
+  attr {
+    key: "is_training"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0010000000475
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/BatchNorm/FusedBatchNorm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_14"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_14/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_14/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_14/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_14"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_14"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_14/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_14/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_14/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_14"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_28/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_28"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_14/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_28/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_29/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_29"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_28"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_29/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_14"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter14/Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_29"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\020\000\000\000\200\000\000\000\001\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.00337632372975
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.00337632372975
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 16
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 16
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 16
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 16
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 16
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 1
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 16
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 16
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 128
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/Const"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/beta/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/beta"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/beta/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/beta"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/beta/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/beta/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/beta"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_mean/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_mean"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_mean/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_mean"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_mean/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_mean/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_mean"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_variance/Initializer/ones"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_variance"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_variance/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_variance"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_variance/Initializer/ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_variance/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_variance"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/FusedBatchNorm"
+  op: "FusedBatchNorm"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/Const"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/beta/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_mean/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/moving_variance/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "epsilon"
+    value {
+      f: 0.0010000000475
+    }
+  }
+  attr {
+    key: "is_training"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0010000000475
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/BatchNorm/FusedBatchNorm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_15"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_15/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_15/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_15/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_15"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/Shape_15"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_15/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_15/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_15/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/strided_slice_15"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_30/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_30"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/sequence_length_mask_15/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_30/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_31/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_31"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_30"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_31/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_15"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/conv1d_filter15/Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/ExpandDims_31"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_4"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_5"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_6"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_7"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_8"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_9"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_10"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_11"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_12"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_13"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_14"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/mul_15"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 16
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 16
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/maxpool/MaxPool"
+  op: "MaxPool"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/conv1d_bank/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 16
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "ksize"
+    value {
+      list {
+        i: 1
+        i: 2
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/maxpool/MaxPool"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/Shape_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_1/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_2/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_2"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_2/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_3/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_3"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_3/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/maxpool/MaxPool"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 16
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\003\000\000\000\020\000\000\000\000\001\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.015625
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.015625
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 16
+          }
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 16
+          }
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 16
+          }
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 16
+          }
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 16
+        }
+        dim {
+          size: 256
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 16
+          }
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 16
+          }
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul_1"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 16
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/Const"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/beta/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/beta"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/beta/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/beta"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/beta/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/beta/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/beta"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_mean/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_mean"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_mean/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_mean"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_mean/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_mean/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_mean"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_variance/Initializer/ones"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_variance"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_variance/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_variance"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_variance/Initializer/ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_variance/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_variance"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/FusedBatchNorm"
+  op: "FusedBatchNorm"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/Const"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/beta/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_mean/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/moving_variance/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "epsilon"
+    value {
+      f: 0.0010000000475
+    }
+  }
+  attr {
+    key: "is_training"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0010000000475
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/BatchNorm/FusedBatchNorm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/Shape_2"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/Shape_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_2/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_4/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_4"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_2/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_4/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_5/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_5"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_4"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_5/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul_2"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/fixed_conv1/Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        tensor_content: "\003\000\000\000\001\000\000\000\000\001\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 1
+        }
+        dim {
+          size: 256
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul_2"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/Const"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/beta/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/beta"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/beta/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/beta"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/beta/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/beta/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/beta"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/beta"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_mean/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_mean"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_mean/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_mean"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_mean/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_mean/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_mean"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_mean"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_variance/Initializer/ones"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_variance"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_variance/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_variance"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_variance/Initializer/ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_variance/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_variance"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_variance"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/FusedBatchNorm"
+  op: "FusedBatchNorm"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/Conv2D"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/Const"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/beta/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_mean/read"
+  input: "seq2seq/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/moving_variance/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "epsilon"
+    value {
+      f: 0.0010000000475
+    }
+  }
+  attr {
+    key: "is_training"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0010000000475
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/Shape_3"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/FusedBatchNorm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_3/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_3/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_3/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_3"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/Shape_3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_3/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_3/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/strided_slice_3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_6/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_6"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/sequence_length_mask_3/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_6/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_7/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_7"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_6"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_7/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul_3"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/fixed_conv2/BatchNorm/FusedBatchNorm"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/ExpandDims_7"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 3
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/Squeeze_1"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/mul_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/add"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/Squeeze"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/Squeeze_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "@\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.176776692271
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.176776692271
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 64
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights"
+  input: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/biases"
+  input: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/biases/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/MatMul"
+  op: "MatMul"
+  input: "speaker_embedding_lookup"
+  input: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/MatMul"
+  input: "seq2seq/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/Dropout/Identity"
+  op: "Identity"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/fully_connected/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/combination_0/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/Dropout/Identity"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/combination_0/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/combination_0/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/combination_0/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/combination_0/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/combination_0/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/combination_0/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/combination_0/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/combination_0/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/combination_0/Reshape/shape/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/combination_0/Reshape/shape/2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/combination_0/Reshape/shape"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/combination_0/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/combination_0/Reshape/shape/1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/combination_0/Reshape/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/combination_0/Reshape"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/projection_0/Dropout/Identity"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/combination_0/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/combination_0/add"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/conv1d_maxpool_residual/Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/combination_0/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights"
+  input: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/linear_proj/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/linear_proj/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/biases"
+  input: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/linear_proj/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/biases/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/linear_proj/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/combination_0/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Rank"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/axes"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/GreaterEqual/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/GreaterEqual"
+  op: "GreaterEqual"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/GreaterEqual/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/GreaterEqual"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/axes"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Less/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Cast_1"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/add"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Rank"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Cast_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Rank"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/ListDiff"
+  op: "ListDiff"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_idx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Gather"
+  op: "Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/ListDiff"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Gather_1"
+  op: "Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/add_1"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Prod"
+  op: "Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Prod_1"
+  op: "Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Gather_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Const_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Gather_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/concat_1/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/concat_1"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/ListDiff"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/add_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/stack"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Prod_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/transpose"
+  op: "Transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/speaker_conditioning/cbhg_pre_highway/combination_0/add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Reshape"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/transpose_1/perm"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/transpose_1"
+  op: "Transpose"
+  input: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/weights/read"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/transpose_1/perm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Reshape_1/shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Reshape_1"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/transpose_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Reshape_1/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Reshape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/concat_2/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/concat_2"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/Const_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot/concat_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/Tensordot"
+  input: "seq2seq/encoder/cbhg/hw_mlp/linear_proj/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate0/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate0/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate0/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate0/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate0/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate0/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate0/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate0/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate0/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate0/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate0/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate0/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate0/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate0/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate0/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate0/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate0/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate0/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate0/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate0/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate0/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate0/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate0/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate0/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate0/weights"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate0/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate0/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate0/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate0/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate0/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate0/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate0/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate0/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate0/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate0/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate0/biases"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate0/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate0/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate0/biases/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate0/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate0/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Rank"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/axes"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/GreaterEqual/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/GreaterEqual"
+  op: "GreaterEqual"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/GreaterEqual/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/GreaterEqual"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/axes"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Less/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Cast_1"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/add"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Rank"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Cast_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Rank"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/ListDiff"
+  op: "ListDiff"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_idx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Gather"
+  op: "Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/ListDiff"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Gather_1"
+  op: "Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/add_1"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Prod"
+  op: "Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Prod_1"
+  op: "Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Gather_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Const_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Gather_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/concat_1/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/concat_1"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/ListDiff"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/add_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/stack"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Prod_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/transpose"
+  op: "Transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/BiasAdd"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Reshape"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/transpose_1/perm"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/transpose_1"
+  op: "Transpose"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate0/weights/read"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/transpose_1/perm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Reshape_1/shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Reshape_1"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/transpose_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Reshape_1/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Reshape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/concat_2/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/concat_2"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/Const_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot/concat_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Tensordot"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate0/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden0/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden0/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden0/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden0/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden0/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden0/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden0/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden0/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden0/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden0/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden0/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden0/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden0/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden0/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden0/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden0/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden0/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden0/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden0/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden0/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden0/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden0/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden0/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden0/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden0/weights"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden0/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden0/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden0/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden0/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden0/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden0/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden0/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden0/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden0/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden0/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden0/biases"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden0/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden0/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden0/biases/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden0/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden0/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Rank"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/axes"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/GreaterEqual/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/GreaterEqual"
+  op: "GreaterEqual"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/GreaterEqual/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/GreaterEqual"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/axes"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Less/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Cast_1"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/add"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Rank"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Cast_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Rank"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/ListDiff"
+  op: "ListDiff"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_idx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Gather"
+  op: "Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/ListDiff"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Gather_1"
+  op: "Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/add_1"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Prod"
+  op: "Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Prod_1"
+  op: "Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Gather_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Const_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Gather_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/concat_1/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/concat_1"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/ListDiff"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/add_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/stack"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Prod_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/transpose"
+  op: "Transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/BiasAdd"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Reshape"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/transpose_1/perm"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/transpose_1"
+  op: "Transpose"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden0/weights/read"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/transpose_1/perm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Reshape_1/shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Reshape_1"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/transpose_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Reshape_1/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Reshape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/concat_2/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/concat_2"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/Const_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot/concat_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Tensordot"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden0/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Sigmoid"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden0/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/sub/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/sub/x"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate0/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/sub"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/linear_proj/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/add"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate1/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate1/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate1/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate1/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate1/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate1/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate1/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate1/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate1/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate1/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate1/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate1/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate1/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate1/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate1/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate1/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate1/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate1/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate1/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate1/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate1/weights"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate1/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate1/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate1/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate1/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate1/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate1/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate1/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate1/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate1/biases"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate1/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate1/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate1/biases/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate1/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate1/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Rank"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/axes"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/GreaterEqual/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/GreaterEqual"
+  op: "GreaterEqual"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/GreaterEqual/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/GreaterEqual"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/axes"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Less/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Cast_1"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/add"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Rank"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Cast_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Rank"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/ListDiff"
+  op: "ListDiff"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_idx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Gather"
+  op: "Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/ListDiff"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Gather_1"
+  op: "Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/add_1"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Prod"
+  op: "Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Prod_1"
+  op: "Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Gather_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Const_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Gather_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/concat_1/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/concat_1"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/ListDiff"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/add_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/stack"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Prod_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/transpose"
+  op: "Transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Reshape"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/transpose_1/perm"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/transpose_1"
+  op: "Transpose"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate1/weights/read"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/transpose_1/perm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Reshape_1/shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Reshape_1"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/transpose_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Reshape_1/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Reshape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/concat_2/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/concat_2"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/Const_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot/concat_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Tensordot"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate1/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden1/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden1/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden1/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden1/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden1/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden1/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden1/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden1/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden1/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden1/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden1/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden1/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden1/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden1/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden1/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden1/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden1/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden1/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden1/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden1/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden1/weights"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden1/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden1/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden1/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden1/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden1/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden1/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden1/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden1/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden1/biases"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden1/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden1/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden1/biases/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden1/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden1/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Rank"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/axes"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/GreaterEqual/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/GreaterEqual"
+  op: "GreaterEqual"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/GreaterEqual/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/GreaterEqual"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/axes"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Less/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Cast_1"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/add"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Rank"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Cast_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Rank"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/ListDiff"
+  op: "ListDiff"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_idx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Gather"
+  op: "Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/ListDiff"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Gather_1"
+  op: "Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/add_1"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Prod"
+  op: "Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Prod_1"
+  op: "Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Gather_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Const_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Gather_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/concat_1/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/concat_1"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/ListDiff"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/add_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/stack"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Prod_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/transpose"
+  op: "Transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Reshape"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/transpose_1/perm"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/transpose_1"
+  op: "Transpose"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden1/weights/read"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/transpose_1/perm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Reshape_1/shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Reshape_1"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/transpose_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Reshape_1/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Reshape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/concat_2/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/concat_2"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/Const_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot/concat_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Tensordot"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden1/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/mul_2"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Sigmoid"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden1/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/sub_1/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/sub_1"
+  op: "Sub"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/sub_1/x"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate1/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/mul_3"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/sub_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/mul_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/mul_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate2/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate2/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate2/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate2/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate2/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate2/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate2/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate2/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate2/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate2/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate2/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate2/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate2/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate2/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate2/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate2/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate2/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate2/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate2/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate2/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate2/weights"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate2/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate2/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate2/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate2/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate2/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate2/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate2/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate2/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate2/biases"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate2/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate2/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate2/biases/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate2/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate2/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Rank"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/axes"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/GreaterEqual/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/GreaterEqual"
+  op: "GreaterEqual"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/GreaterEqual/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/GreaterEqual"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/axes"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Less/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Cast_1"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/add"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Rank"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Cast_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Rank"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/ListDiff"
+  op: "ListDiff"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_idx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Gather"
+  op: "Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/ListDiff"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Gather_1"
+  op: "Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/add_1"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Prod"
+  op: "Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Prod_1"
+  op: "Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Gather_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Const_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Gather_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/concat_1/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/concat_1"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/ListDiff"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/add_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/stack"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Prod_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/transpose"
+  op: "Transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/add_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Reshape"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/transpose_1/perm"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/transpose_1"
+  op: "Transpose"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate2/weights/read"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/transpose_1/perm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Reshape_1/shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Reshape_1"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/transpose_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Reshape_1/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Reshape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/concat_2/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/concat_2"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/Const_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot/concat_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Tensordot"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate2/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden2/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden2/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden2/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden2/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden2/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden2/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden2/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden2/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden2/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden2/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden2/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden2/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden2/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden2/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden2/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden2/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden2/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden2/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden2/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden2/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden2/weights"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden2/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden2/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden2/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden2/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden2/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden2/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden2/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden2/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden2/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden2/biases"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden2/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden2/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden2/biases/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden2/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden2/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Rank"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/axes"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/GreaterEqual/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/GreaterEqual"
+  op: "GreaterEqual"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/GreaterEqual/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/GreaterEqual"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/axes"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Less/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Cast_1"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/add"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Rank"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Cast_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Rank"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/ListDiff"
+  op: "ListDiff"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_idx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Gather"
+  op: "Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/ListDiff"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Gather_1"
+  op: "Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/add_1"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Prod"
+  op: "Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Prod_1"
+  op: "Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Gather_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Const_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Gather_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/concat_1/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/concat_1"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/ListDiff"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/add_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/stack"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Prod_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/transpose"
+  op: "Transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/add_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Reshape"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/transpose_1/perm"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/transpose_1"
+  op: "Transpose"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden2/weights/read"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/transpose_1/perm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Reshape_1/shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Reshape_1"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/transpose_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Reshape_1/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Reshape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/concat_2/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/concat_2"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/Const_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot/concat_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Tensordot"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden2/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/mul_4"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Sigmoid"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden2/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/sub_2/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/sub_2"
+  op: "Sub"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/sub_2/x"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate2/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/mul_5"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/sub_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/add_2"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/mul_4"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/mul_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate3/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate3/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate3/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate3/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate3/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate3/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate3/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate3/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate3/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate3/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate3/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate3/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate3/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate3/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate3/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate3/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate3/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate3/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate3/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate3/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate3/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate3/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate3/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate3/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate3/weights"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate3/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate3/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate3/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate3/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate3/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate3/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate3/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate3/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate3/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate3/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate3/biases"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate3/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate3/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/gate3/biases/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate3/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/gate3/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/add_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Rank"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/axes"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/GreaterEqual/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/GreaterEqual"
+  op: "GreaterEqual"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/GreaterEqual/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/GreaterEqual"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/axes"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Less/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Cast_1"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/add"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Rank"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Cast_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Rank"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/ListDiff"
+  op: "ListDiff"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_idx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Gather"
+  op: "Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/ListDiff"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Gather_1"
+  op: "Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/add_1"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Prod"
+  op: "Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Prod_1"
+  op: "Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Gather_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Const_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Gather_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/concat_1/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/concat_1"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/ListDiff"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/add_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/stack"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Prod_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/transpose"
+  op: "Transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/add_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Reshape"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/transpose_1/perm"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/transpose_1"
+  op: "Transpose"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate3/weights/read"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/transpose_1/perm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Reshape_1/shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Reshape_1"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/transpose_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Reshape_1/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Reshape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/concat_2/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/concat_2"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/Const_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot/concat_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Tensordot"
+  input: "seq2seq/encoder/cbhg/hw_mlp/gate3/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden3/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden3/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden3/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden3/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden3/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden3/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden3/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden3/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden3/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden3/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden3/weights/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden3/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden3/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden3/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden3/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden3/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden3/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden3/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden3/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden3/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden3/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden3/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden3/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden3/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden3/weights"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden3/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden3/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden3/weights/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden3/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden3/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden3/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden3/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden3/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden3/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden3/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden3/biases"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden3/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden3/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/hw_mlp/hidden3/biases/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden3/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/hw_mlp/hidden3/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/add_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Rank"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/axes"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/GreaterEqual/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/GreaterEqual"
+  op: "GreaterEqual"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/GreaterEqual/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/GreaterEqual"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/axes"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Less/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Cast_1"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/add"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/axes"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Rank"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Cast_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Rank"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/ListDiff"
+  op: "ListDiff"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_idx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Gather"
+  op: "Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/ListDiff"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Gather_1"
+  op: "Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/add_1"
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "validate_indices"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Prod"
+  op: "Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Prod_1"
+  op: "Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Gather_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Const_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Gather_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/concat_1/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/concat_1"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/ListDiff"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/add_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/stack"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Prod"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Prod_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/transpose"
+  op: "Transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/add_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Reshape"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/transpose_1/perm"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/transpose_1"
+  op: "Transpose"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden3/weights/read"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/transpose_1/perm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Reshape_1/shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Reshape_1"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/transpose_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Reshape_1/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Reshape_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/concat_2/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/concat_2"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Gather"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/Const_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot/concat_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Tensordot"
+  input: "seq2seq/encoder/cbhg/hw_mlp/hidden3/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/mul_6"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Sigmoid"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/hidden3/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/sub_3/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/sub_3"
+  op: "Sub"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/sub_3/x"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/gate3/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/mul_7"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/sub_3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/add_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/add_3"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/mul_6"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/mul_7"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/add_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/Shape_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/strided_slice_1/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/strided_slice_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/ExpandDims_2/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/ExpandDims_2"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/ExpandDims_2/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/hw_mlp/add_3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/ExpandDims_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Rank"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Rank"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/concat/values_0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/concat/values_0"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/transpose"
+  op: "Transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/mul_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/sequence_length"
+  op: "Identity"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/transpose"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/GRUCellZeroState/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/GRUCellZeroState/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/GRUCellZeroState/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/GRUCellZeroState/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/GRUCellZeroState/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/GRUCellZeroState/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/GRUCellZeroState/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/GRUCellZeroState/Const"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/GRUCellZeroState/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/GRUCellZeroState/zeros/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/GRUCellZeroState/zeros"
+  op: "Fill"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/GRUCellZeroState/concat"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/GRUCellZeroState/zeros/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/GRUCellZeroState/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/GRUCellZeroState/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/GRUCellZeroState/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/GRUCellZeroState/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/sequence_length"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/stack"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Equal"
+  op: "Equal"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Shape_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/All"
+  op: "All"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Equal"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Const"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Assert/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "Expected shape for Tensor seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/sequence_length:0 is "
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Assert/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: " but saw shape: "
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Assert/Assert/data_0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "Expected shape for Tensor seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/sequence_length:0 is "
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Assert/Assert/data_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: " but saw shape: "
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Assert/Assert"
+  op: "Assert"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/All"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Assert/Assert/data_0"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Assert/Assert/data_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Shape_1"
+  attr {
+    key: "T"
+    value {
+      list {
+        type: DT_STRING
+        type: DT_INT32
+        type: DT_STRING
+        type: DT_INT32
+      }
+    }
+  }
+  attr {
+    key: "summarize"
+    value {
+      i: 3
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/CheckSeqLen"
+  op: "Identity"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/sequence_length"
+  input: "^seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Assert/Assert"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Shape_2"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/transpose"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Shape_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice_1/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Shape_3"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/transpose"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Shape_3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice_2/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/concat_1/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/concat_1"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Const_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/zeros/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/zeros"
+  op: "Fill"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/concat_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/zeros/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Min"
+  op: "Min"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/CheckSeqLen"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Const_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Const_3"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Max"
+  op: "Max"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/CheckSeqLen"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Const_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/time"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArray"
+  op: "TensorArrayV3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice_1"
+
+  attr {
+    key: "clear_after_read"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "dynamic_size"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "identical_element_shapes"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "tensor_array_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/dynamic_rnn/output_0"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArray_1"
+  op: "TensorArrayV3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice_1"
+
+  attr {
+    key: "clear_after_read"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "dynamic_size"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "identical_element_shapes"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "tensor_array_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/dynamic_rnn/input_0"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayUnstack/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/transpose"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayUnstack/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayUnstack/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayUnstack/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayUnstack/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayUnstack/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayUnstack/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayUnstack/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayUnstack/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayUnstack/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayUnstack/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayUnstack/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayUnstack/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayUnstack/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayUnstack/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3"
+  op: "TensorArrayScatterV3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArray_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayUnstack/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArray_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/transpose"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/iteration_counter"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/iteration_counter"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Enter_1"
+  op: "Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/time"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Enter_2"
+  op: "Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArray:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Enter_3"
+  op: "Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/GRUCellZeroState/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Merge"
+  op: "Merge"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Merge_1"
+  op: "Merge"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Enter_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/NextIteration_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Merge_2"
+  op: "Merge"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Enter_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/NextIteration_2"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Merge_3"
+  op: "Merge"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Enter_3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/NextIteration_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Merge"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Less/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Less/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Less_1"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Merge_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Less/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/LogicalAnd"
+  op: "LogicalAnd"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Less_1"
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/LoopCond"
+  op: "LoopCond"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/LogicalAnd"
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Switch"
+  op: "Switch"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Merge"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Merge"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Switch_1"
+  op: "Switch"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Merge_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Merge_1"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Switch_2"
+  op: "Switch"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Merge_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Merge_2"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Switch_3"
+  op: "Switch"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Merge_3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Merge_3"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Identity"
+  op: "Identity"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Identity_1"
+  op: "Identity"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Switch_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Identity_2"
+  op: "Identity"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Switch_2:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Identity_3"
+  op: "Identity"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Switch_3:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/add/y"
+  op: "Const"
+  input: "^seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/add"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Identity"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/TensorArrayReadV3"
+  op: "TensorArrayReadV3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/TensorArrayReadV3/Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Identity_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/TensorArrayReadV3/Enter_1"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/TensorArrayReadV3/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArray_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/TensorArrayReadV3/Enter_1"
+  op: "Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\001\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.10825317353
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.10825317353
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/bias/Initializer/Const"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/bias"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/bias/Initializer/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/bias/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\001\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.125
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.125
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/bias"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/bias/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/concat/axis"
+  op: "Const"
+  input: "^seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/TensorArrayReadV3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Identity_3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/concat"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/MatMul/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/MatMul/Enter"
+  op: "Enter"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/kernel/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/BiasAdd/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/BiasAdd/Enter"
+  op: "Enter"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/gates/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/Const"
+  op: "Const"
+  input: "^seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/split/split_dim"
+  op: "Const"
+  input: "^seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/split"
+  op: "Split"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/split/split_dim"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/split"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Identity_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/concat_1/axis"
+  op: "Const"
+  input: "^seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/concat_1"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/TensorArrayReadV3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/MatMul_1"
+  op: "MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/concat_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/MatMul_1/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/MatMul_1/Enter"
+  op: "Enter"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/kernel/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/BiasAdd_1"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/MatMul_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/BiasAdd_1/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/BiasAdd_1/Enter"
+  op: "Enter"
+  input: "seq2seq/encoder/cbhg/bi_gru/fw/gru_cell/candidate/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/BiasAdd_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/split:1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Identity_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/sub/x"
+  op: "Const"
+  input: "^seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/sub/x"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/mul_2"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/sub"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/Tanh"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/add"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/mul_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/mul_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/GreaterEqual"
+  op: "GreaterEqual"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Identity_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/GreaterEqual/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/GreaterEqual/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/CheckSeqLen"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Select"
+  op: "Select"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/GreaterEqual"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Select/Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/add"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Select/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/add"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/GreaterEqual_1"
+  op: "GreaterEqual"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Identity_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/GreaterEqual/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Select_1"
+  op: "Select"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/GreaterEqual_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Identity_3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/add"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/TensorArrayWrite/TensorArrayWriteV3"
+  op: "TensorArrayWriteV3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/TensorArrayWrite/TensorArrayWriteV3/Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Identity_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Select"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Identity_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/add"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/TensorArrayWrite/TensorArrayWriteV3/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArray"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/gru_cell/add"
+      }
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/add_1/y"
+  op: "Const"
+  input: "^seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Identity_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/add_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/NextIteration"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/NextIteration_1"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/NextIteration_2"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/TensorArrayWrite/TensorArrayWriteV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/NextIteration_3"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Select_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Exit"
+  op: "Exit"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Exit_1"
+  op: "Exit"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Switch_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Exit_2"
+  op: "Exit"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Switch_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Exit_3"
+  op: "Exit"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Switch_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayStack/TensorArraySizeV3"
+  op: "TensorArraySizeV3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArray"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Exit_2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArray"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayStack/range/start"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArray"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayStack/range/delta"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArray"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayStack/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayStack/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayStack/TensorArraySizeV3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayStack/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArray"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayStack/TensorArrayGatherV3"
+  op: "TensorArrayGatherV3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArray"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayStack/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/while/Exit_2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArray"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Const_4"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Rank_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/range_1/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/range_1/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/range_1"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/range_1/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/Rank_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/range_1/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/concat_2/values_0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/concat_2/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/concat_2"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/concat_2/values_0"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/range_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/transpose_1"
+  op: "Transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/TensorArrayStack/TensorArrayGatherV3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/concat_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/ReverseSequence"
+  op: "ReverseSequence"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/mul_1"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tlen"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "batch_dim"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seq_dim"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Rank"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Rank"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/concat/values_0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/concat/values_0"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/transpose"
+  op: "Transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/ReverseSequence"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/sequence_length"
+  op: "Identity"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/transpose"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/GRUCellZeroState/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/GRUCellZeroState/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/GRUCellZeroState/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/GRUCellZeroState/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/GRUCellZeroState/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/GRUCellZeroState/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/GRUCellZeroState/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/GRUCellZeroState/Const"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/GRUCellZeroState/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/GRUCellZeroState/zeros/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/GRUCellZeroState/zeros"
+  op: "Fill"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/GRUCellZeroState/concat"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/GRUCellZeroState/zeros/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/GRUCellZeroState/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/GRUCellZeroState/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/GRUCellZeroState/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/GRUCellZeroState/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/sequence_length"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/stack"
+  op: "Pack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Equal"
+  op: "Equal"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Shape_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/stack"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/All"
+  op: "All"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Equal"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Const"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Assert/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "Expected shape for Tensor seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/sequence_length:0 is "
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Assert/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: " but saw shape: "
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Assert/Assert/data_0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "Expected shape for Tensor seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/sequence_length:0 is "
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Assert/Assert/data_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: " but saw shape: "
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Assert/Assert"
+  op: "Assert"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/All"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Assert/Assert/data_0"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Assert/Assert/data_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Shape_1"
+  attr {
+    key: "T"
+    value {
+      list {
+        type: DT_STRING
+        type: DT_INT32
+        type: DT_STRING
+        type: DT_INT32
+      }
+    }
+  }
+  attr {
+    key: "summarize"
+    value {
+      i: 3
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/CheckSeqLen"
+  op: "Identity"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/sequence_length"
+  input: "^seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Assert/Assert"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Shape_2"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/transpose"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Shape_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice_1/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Shape_3"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/transpose"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Shape_3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice_2/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/concat_1/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/concat_1"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/ExpandDims"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Const_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/zeros/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/zeros"
+  op: "Fill"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/concat_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/zeros/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Min"
+  op: "Min"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/CheckSeqLen"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Const_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Const_3"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Max"
+  op: "Max"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/CheckSeqLen"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Const_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/time"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArray"
+  op: "TensorArrayV3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice_1"
+
+  attr {
+    key: "clear_after_read"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "dynamic_size"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "identical_element_shapes"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "tensor_array_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/dynamic_rnn/output_0"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArray_1"
+  op: "TensorArrayV3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice_1"
+
+  attr {
+    key: "clear_after_read"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "dynamic_size"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "identical_element_shapes"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "tensor_array_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/dynamic_rnn/input_0"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayUnstack/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/transpose"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayUnstack/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayUnstack/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayUnstack/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayUnstack/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayUnstack/Shape"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayUnstack/strided_slice/stack"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayUnstack/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayUnstack/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayUnstack/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayUnstack/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayUnstack/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayUnstack/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayUnstack/strided_slice"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayUnstack/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3"
+  op: "TensorArrayScatterV3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArray_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayUnstack/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArray_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/transpose"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/iteration_counter"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/iteration_counter"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Enter_1"
+  op: "Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/time"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Enter_2"
+  op: "Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArray:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Enter_3"
+  op: "Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/GRUCellZeroState/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Merge"
+  op: "Merge"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Merge_1"
+  op: "Merge"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Enter_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/NextIteration_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Merge_2"
+  op: "Merge"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Enter_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/NextIteration_2"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Merge_3"
+  op: "Merge"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Enter_3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/NextIteration_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Merge"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Less/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Less/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Less_1"
+  op: "Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Merge_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Less/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/LogicalAnd"
+  op: "LogicalAnd"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Less"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Less_1"
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/LoopCond"
+  op: "LoopCond"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/LogicalAnd"
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Switch"
+  op: "Switch"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Merge"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Merge"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Switch_1"
+  op: "Switch"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Merge_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Merge_1"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Switch_2"
+  op: "Switch"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Merge_2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Merge_2"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Switch_3"
+  op: "Switch"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Merge_3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Merge_3"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Identity"
+  op: "Identity"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Identity_1"
+  op: "Identity"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Switch_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Identity_2"
+  op: "Identity"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Switch_2:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Identity_3"
+  op: "Identity"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Switch_3:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/add/y"
+  op: "Const"
+  input: "^seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/add"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Identity"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/TensorArrayReadV3"
+  op: "TensorArrayReadV3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/TensorArrayReadV3/Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Identity_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/TensorArrayReadV3/Enter_1"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/TensorArrayReadV3/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArray_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/TensorArrayReadV3/Enter_1"
+  op: "Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayUnstack/TensorArrayScatter/TensorArrayScatterV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\001\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.10825317353
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.10825317353
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/bias/Initializer/Const"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/bias"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/bias/Initializer/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/bias/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\001\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.125
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.125
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/bias"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/bias/read"
+  op: "Identity"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/concat/axis"
+  op: "Const"
+  input: "^seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/TensorArrayReadV3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Identity_3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/concat"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/MatMul/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/MatMul/Enter"
+  op: "Enter"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/kernel/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/BiasAdd/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/BiasAdd/Enter"
+  op: "Enter"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/gates/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/Const"
+  op: "Const"
+  input: "^seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/split/split_dim"
+  op: "Const"
+  input: "^seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/split"
+  op: "Split"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/split/split_dim"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/split"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Identity_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/concat_1/axis"
+  op: "Const"
+  input: "^seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/concat_1"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/TensorArrayReadV3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/MatMul_1"
+  op: "MatMul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/concat_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/MatMul_1/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/MatMul_1/Enter"
+  op: "Enter"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/kernel/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/BiasAdd_1"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/MatMul_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/BiasAdd_1/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/BiasAdd_1/Enter"
+  op: "Enter"
+  input: "seq2seq/encoder/cbhg/bi_gru/bw/gru_cell/candidate/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/BiasAdd_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/split:1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Identity_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/sub/x"
+  op: "Const"
+  input: "^seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/sub/x"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/mul_2"
+  op: "Mul"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/sub"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/Tanh"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/add"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/mul_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/mul_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/GreaterEqual"
+  op: "GreaterEqual"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Identity_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/GreaterEqual/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/GreaterEqual/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/CheckSeqLen"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Select"
+  op: "Select"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/GreaterEqual"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Select/Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/add"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Select/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/add"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/GreaterEqual_1"
+  op: "GreaterEqual"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Identity_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/GreaterEqual/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Select_1"
+  op: "Select"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/GreaterEqual_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Identity_3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/add"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/TensorArrayWrite/TensorArrayWriteV3"
+  op: "TensorArrayWriteV3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/TensorArrayWrite/TensorArrayWriteV3/Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Identity_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Select"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Identity_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/add"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/TensorArrayWrite/TensorArrayWriteV3/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArray"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/gru_cell/add"
+      }
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/add_1/y"
+  op: "Const"
+  input: "^seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Identity_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/add_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/NextIteration"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/NextIteration_1"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/NextIteration_2"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/TensorArrayWrite/TensorArrayWriteV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/NextIteration_3"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Select_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Exit"
+  op: "Exit"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Exit_1"
+  op: "Exit"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Switch_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Exit_2"
+  op: "Exit"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Switch_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Exit_3"
+  op: "Exit"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Switch_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayStack/TensorArraySizeV3"
+  op: "TensorArraySizeV3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArray"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Exit_2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArray"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayStack/range/start"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArray"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayStack/range/delta"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArray"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayStack/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayStack/range/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayStack/TensorArraySizeV3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayStack/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArray"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayStack/TensorArrayGatherV3"
+  op: "TensorArrayGatherV3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArray"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayStack/range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/while/Exit_2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArray"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Const_4"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Rank_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/range_1/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/range_1/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/range_1"
+  op: "Range"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/range_1/start"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/Rank_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/range_1/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/concat_2/values_0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/concat_2/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/concat_2"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/concat_2/values_0"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/range_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/transpose_1"
+  op: "Transpose"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/TensorArrayStack/TensorArrayGatherV3"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/concat_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/ReverseSequence"
+  op: "ReverseSequence"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/bw/bw/transpose_1"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tlen"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "batch_dim"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seq_dim"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder/encoder/cbhg/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/bi_gru/fw/fw/transpose_1"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/ReverseSequence"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/encoder_top"
+  op: "Identity"
+  input: "seq2seq/seq2seq/encoder/encoder/cbhg/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "encoder_top_pre_conditioning"
+  op: "Identity"
+  input: "seq2seq/seq2seq/encoder_top"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "@\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.136930644512
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.136930644512
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights/Initializer/random_uniform/max"
+  input: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 64
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights"
+  input: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights/read"
+  op: "Identity"
+  input: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/biases"
+  input: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/biases/read"
+  op: "Identity"
+  input: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/MatMul"
+  op: "MatMul"
+  input: "speaker_embedding_lookup"
+  input: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/MatMul"
+  input: "seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/speaker_conditioning/encoder_top/projection_0/Dropout/Identity"
+  op: "Identity"
+  input: "seq2seq/seq2seq/speaker_conditioning/encoder_top/projection_0/fully_connected/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/speaker_conditioning/encoder_top/combination_0/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/speaker_conditioning/encoder_top/projection_0/Dropout/Identity"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/speaker_conditioning/encoder_top/combination_0/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/speaker_conditioning/encoder_top/combination_0/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/speaker_conditioning/encoder_top/combination_0/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/speaker_conditioning/encoder_top/combination_0/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/speaker_conditioning/encoder_top/combination_0/Shape"
+  input: "seq2seq/seq2seq/speaker_conditioning/encoder_top/combination_0/strided_slice/stack"
+  input: "seq2seq/seq2seq/speaker_conditioning/encoder_top/combination_0/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/speaker_conditioning/encoder_top/combination_0/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/speaker_conditioning/encoder_top/combination_0/Reshape/shape/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/speaker_conditioning/encoder_top/combination_0/Reshape/shape/2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 256
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/speaker_conditioning/encoder_top/combination_0/Reshape/shape"
+  op: "Pack"
+  input: "seq2seq/seq2seq/speaker_conditioning/encoder_top/combination_0/strided_slice"
+  input: "seq2seq/seq2seq/speaker_conditioning/encoder_top/combination_0/Reshape/shape/1"
+  input: "seq2seq/seq2seq/speaker_conditioning/encoder_top/combination_0/Reshape/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/speaker_conditioning/encoder_top/combination_0/Reshape"
+  op: "Reshape"
+  input: "seq2seq/seq2seq/speaker_conditioning/encoder_top/projection_0/Dropout/Identity"
+  input: "seq2seq/seq2seq/speaker_conditioning/encoder_top/combination_0/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/speaker_conditioning/encoder_top/combination_0/add"
+  op: "Add"
+  input: "encoder_top_pre_conditioning"
+  input: "seq2seq/seq2seq/speaker_conditioning/encoder_top/combination_0/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/speaker_conditioning/encoder_top/combination_0/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/Shape"
+  input: "seq2seq/seq2seq/strided_slice/stack"
+  input: "seq2seq/seq2seq/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq/strided_slice"
+  input: "seq2seq/seq2seq/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/sequence_length_mask/range"
+  input: "seq2seq/seq2seq/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "encoder_input_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "encoder_input_lengths"
+  input: "seq2seq/seq2seq/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq/speaker_conditioning/encoder_top/combination_0/add"
+  input: "seq2seq/seq2seq/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_1/Shape"
+  input: "seq2seq/seq2seq_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/sub/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/sub"
+  op: "Sub"
+  input: "encoder_input_mask"
+  input: "seq2seq/seq2seq_1/sub/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/mul/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1000.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/mul/x"
+  input: "seq2seq/seq2seq_1/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_1/GmmAttention/Shape"
+  input: "seq2seq/seq2seq_1/GmmAttention/strided_slice/stack"
+  input: "seq2seq/seq2seq_1/GmmAttention/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_1/GmmAttention/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/ToFloat"
+  op: "Cast"
+  input: "seq2seq/seq2seq_1/GmmAttention/strided_slice"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/DropoutWrapperInit/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/DropoutWrapperInit/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/DropoutWrapperInit/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/range/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_1/GmmAttention/range/start"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/range/Cast_2"
+  op: "Cast"
+  input: "seq2seq/seq2seq_1/GmmAttention/range/delta"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_1/GmmAttention/range/Cast"
+  input: "seq2seq/seq2seq_1/GmmAttention/ToFloat"
+  input: "seq2seq/seq2seq_1/GmmAttention/range/Cast_2"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_1/GmmAttention/range"
+  input: "seq2seq/seq2seq_1/GmmAttention/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  input: "seq2seq/seq2seq_1/GmmAttention/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_1/GmmAttention/ExpandDims"
+  input: "seq2seq/seq2seq_1/GmmAttention/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_1/GmmAttention/Tile"
+  input: "seq2seq/seq2seq_1/GmmAttention/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/strided_slice_1/stack"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/strided_slice_1/stack_1"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/strided_slice_1/stack_2"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq/mul"
+  input: "seq2seq/seq2seq_1/GmmAttention/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_1/GmmAttention/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_1/GmmAttention/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 5
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 5
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/GmmAttention/first_enc_timestep_proj/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/GmmAttention/first_enc_timestep_proj/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\001\000\000\000\002\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/GmmAttention/first_enc_timestep_proj/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/GmmAttention/first_enc_timestep_proj/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0883883461356
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/GmmAttention/first_enc_timestep_proj/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/GmmAttention/first_enc_timestep_proj/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0883883461356
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/GmmAttention/first_enc_timestep_proj/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/GmmAttention/first_enc_timestep_proj/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/GmmAttention/first_enc_timestep_proj/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/GmmAttention/first_enc_timestep_proj/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/GmmAttention/first_enc_timestep_proj/weights/Initializer/random_uniform/max"
+  input: "seq2seq/GmmAttention/first_enc_timestep_proj/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/GmmAttention/first_enc_timestep_proj/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/GmmAttention/first_enc_timestep_proj/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/GmmAttention/first_enc_timestep_proj/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/GmmAttention/first_enc_timestep_proj/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/GmmAttention/first_enc_timestep_proj/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/GmmAttention/first_enc_timestep_proj/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/GmmAttention/first_enc_timestep_proj/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/GmmAttention/first_enc_timestep_proj/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/GmmAttention/first_enc_timestep_proj/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/GmmAttention/first_enc_timestep_proj/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/GmmAttention/first_enc_timestep_proj/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+        dim {
+          size: 512
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/GmmAttention/first_enc_timestep_proj/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/GmmAttention/first_enc_timestep_proj/weights"
+  input: "seq2seq/GmmAttention/first_enc_timestep_proj/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/GmmAttention/first_enc_timestep_proj/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/GmmAttention/first_enc_timestep_proj/weights/read"
+  op: "Identity"
+  input: "seq2seq/GmmAttention/first_enc_timestep_proj/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/GmmAttention/first_enc_timestep_proj/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/GmmAttention/first_enc_timestep_proj/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/GmmAttention/first_enc_timestep_proj/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 512
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/GmmAttention/first_enc_timestep_proj/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/GmmAttention/first_enc_timestep_proj/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 512
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/GmmAttention/first_enc_timestep_proj/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/GmmAttention/first_enc_timestep_proj/biases"
+  input: "seq2seq/GmmAttention/first_enc_timestep_proj/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/GmmAttention/first_enc_timestep_proj/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/GmmAttention/first_enc_timestep_proj/biases/read"
+  op: "Identity"
+  input: "seq2seq/GmmAttention/first_enc_timestep_proj/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/GmmAttention/first_enc_timestep_proj/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/first_enc_timestep_proj/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq_1/GmmAttention/strided_slice_1"
+  input: "seq2seq/GmmAttention/first_enc_timestep_proj/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/first_enc_timestep_proj/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_1/GmmAttention/first_enc_timestep_proj/MatMul"
+  input: "seq2seq/GmmAttention/first_enc_timestep_proj/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_1/GmmAttention/split/split_dim"
+  input: "seq2seq/seq2seq_1/GmmAttention/first_enc_timestep_proj/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/zeros/shape/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 256
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/zeros/shape"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  input: "seq2seq/seq2seq_1/GmmAttention/zeros/shape/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/zeros/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/zeros"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/GmmAttention/zeros/shape"
+  input: "seq2seq/seq2seq_1/GmmAttention/zeros/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/zeros_1/shape/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 5
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/zeros_1/shape"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  input: "seq2seq/seq2seq_1/GmmAttention/zeros_1/shape/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/zeros_1/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/GmmAttention/zeros_1"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/GmmAttention/zeros_1/shape"
+  input: "seq2seq/seq2seq_1/GmmAttention/zeros_1/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/AttentionAggregator/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Shape"
+  op: "Shape"
+  input: "decoder_target"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_1/attention_decoder/Shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice/stack"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT64
+        tensor_shape {
+        }
+        int64_val: 63
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Fill/dims"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Fill"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/Fill/dims"
+  input: "seq2seq/seq2seq_1/attention_decoder/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/assert_type/statically_determined_correct_type"
+  op: "NoOp"
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/transpose/perm"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\000\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/transpose"
+  op: "Transpose"
+  input: "decoder_target"
+  input: "seq2seq/seq2seq_1/attention_decoder/transpose/perm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/transpose_1/perm"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/transpose_1"
+  op: "Transpose"
+  input: "seq2seq/seq2seq_1/attention_decoder/Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/transpose_1/perm"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_1/stack"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_1/stack_1"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_1/stack_2"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\002\000\000\000\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_1/attention_decoder/transpose"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 6
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 7
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\002\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_1/attention_decoder/transpose_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/div/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/div"
+  op: "FloorDiv"
+  input: "decoder_target_lengths"
+  input: "seq2seq/seq2seq_1/attention_decoder/div/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/div_1/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/div_1"
+  op: "FloorDiv"
+  input: "max_decoder_output_length"
+  input: "seq2seq/seq2seq_1/attention_decoder/div_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/DropoutWrapperInit/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/DropoutWrapperInit/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/DropoutWrapperInit/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/DropoutWrapperInit_1/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/DropoutWrapperInit_1/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/DropoutWrapperInit_1/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/zeros"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 80
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/zeros_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/zeros_2"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 64
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 64
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/zeros_3"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT64
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        int64_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Fill_1/dims/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 80
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Fill_1/dims"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  input: "seq2seq/seq2seq_1/attention_decoder/Fill_1/dims/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Fill_1/value"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Fill_1"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/Fill_1/dims"
+  input: "seq2seq/seq2seq_1/attention_decoder/Fill_1/value"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/zeros_4/shape"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/zeros_4/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT64
+        tensor_shape {
+        }
+        int64_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/zeros_4"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/zeros_4/shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/zeros_4/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 256
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/Const"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/zeros/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/zeros"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/concat"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/zeros/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 256
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims_2/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims_2"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims_2/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 256
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/concat_1/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/concat_1"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/Const_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/zeros_1/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/zeros_1"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/concat_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/zeros_1/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims_3/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims_3"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims_3/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/Const_3"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 256
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 256
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/Const"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/zeros/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/zeros"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/concat"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/zeros/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 256
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims_2/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims_2"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims_2/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 256
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/concat_1/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/concat_1"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/Const_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/zeros_1/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/zeros_1"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/concat_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/zeros_1/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims_3/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims_3"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/ExpandDims_3/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/Const_3"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 256
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "@\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.136930644512
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.136930644512
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights/Initializer/random_uniform/max"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 64
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/biases"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/biases/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/MatMul"
+  op: "MatMul"
+  input: "speaker_embedding_lookup"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/MatMul"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_0/Dropout/Identity"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_0/fully_connected/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/combination_0/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/zeros"
+  input: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_0/Dropout/Identity"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "@\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.136930644512
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.136930644512
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights/Initializer/random_uniform/max"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 64
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/biases"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/biases/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/MatMul"
+  op: "MatMul"
+  input: "speaker_embedding_lookup"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/MatMul"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_1/Dropout/Identity"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_1/fully_connected/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/combination_1/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState/DropoutWrapperZeroState/LSTMCellZeroState/zeros_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_1/Dropout/Identity"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "@\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.136930644512
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.136930644512
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights/Initializer/random_uniform/max"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 64
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/biases"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/biases/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/MatMul"
+  op: "MatMul"
+  input: "speaker_embedding_lookup"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/MatMul"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_2/Dropout/Identity"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_2/fully_connected/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/combination_2/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/zeros"
+  input: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_2/Dropout/Identity"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "@\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.136930644512
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.136930644512
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights/Initializer/random_uniform/max"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 64
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/biases"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/biases/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/MatMul"
+  op: "MatMul"
+  input: "speaker_embedding_lookup"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/MatMul"
+  input: "seq2seq/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_3/Dropout/Identity"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_3/fully_connected/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/combination_3/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/MultiRNNCellZeroState/ResidualWrapperZeroState_1/DropoutWrapperZeroState/LSTMCellZeroState/zeros_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/projection_3/Dropout/Identity"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/zeros_5/shape"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/zeros_5/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_BOOL
+        tensor_shape {
+        }
+        bool_val: false
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/zeros_5"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/zeros_5/shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/zeros_5/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/zeros_6"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/zeros_7"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_3/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_3/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_3/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_3"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_1/attention_decoder/Shape_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_3/stack"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_3/stack_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/zeros_8/shape"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_3"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/zeros_8/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/zeros_8"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/zeros_8/shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/zeros_8/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Shape_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\002\000\000\000P\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Shape_3"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Shape_4"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\002\000\000\000@\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Shape_5"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Shape_6"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Shape_7"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Shape_8"
+  op: "Shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/zeros_8"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/Fill/dims"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/Fill/value"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_BOOL
+        tensor_shape {
+        }
+        bool_val: false
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/Fill"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/Fill/dims"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/Fill/value"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat/values_0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat/values_0"
+  input: "seq2seq/seq2seq_1/attention_decoder/Shape_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 2
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_1/values_0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_1/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_1"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_1/values_0"
+  input: "seq2seq/seq2seq_1/attention_decoder/Shape_3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_1/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_1"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_1/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_2/values_0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_2/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_2"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_2/values_0"
+  input: "seq2seq/seq2seq_1/attention_decoder/Shape_4"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_2/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_2"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_2/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 2
+          }
+          dim {
+            size: 64
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_3/values_0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_3/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_3"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_3/values_0"
+  input: "seq2seq/seq2seq_1/attention_decoder/Shape_5"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_3/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_3/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT64
+        tensor_shape {
+        }
+        int64_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_3"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_3/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_4/values_0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_4/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_4"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_4/values_0"
+  input: "seq2seq/seq2seq_1/attention_decoder/Shape_6"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_4/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_4/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_4"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_4"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_4/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_5/values_0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_5/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_5"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_5/values_0"
+  input: "seq2seq/seq2seq_1/attention_decoder/Shape_7"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_5/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_5/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_5"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_5"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_5/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_6/values_0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_6/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_6"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_6/values_0"
+  input: "seq2seq/seq2seq_1/attention_decoder/Shape_8"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_6/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_6/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_6"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_6"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_6/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/LessEqual/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/LessEqual"
+  op: "LessEqual"
+  input: "seq2seq/seq2seq_1/attention_decoder/div_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/LessEqual/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/LogicalOr"
+  op: "LogicalOr"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/LessEqual"
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_like/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/LogicalOr"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_like/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_like"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_like/Shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_like/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray/size"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray"
+  op: "TensorArrayV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray/size"
+
+  attr {
+    key: "clear_after_read"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "dynamic_size"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  attr {
+    key: "identical_element_shapes"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "tensor_array_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_1/size"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_1"
+  op: "TensorArrayV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_1/size"
+
+  attr {
+    key: "clear_after_read"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "dynamic_size"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  attr {
+    key: "identical_element_shapes"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "tensor_array_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_2/size"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_2"
+  op: "TensorArrayV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_2/size"
+
+  attr {
+    key: "clear_after_read"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "dynamic_size"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  attr {
+    key: "identical_element_shapes"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "tensor_array_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_3/size"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_3"
+  op: "TensorArrayV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_3/size"
+
+  attr {
+    key: "clear_after_read"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "dynamic_size"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  attr {
+    key: "identical_element_shapes"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "tensor_array_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_4/size"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_4"
+  op: "TensorArrayV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_4/size"
+
+  attr {
+    key: "clear_after_read"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "dynamic_size"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  attr {
+    key: "identical_element_shapes"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "tensor_array_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_5/size"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_5"
+  op: "TensorArrayV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_5/size"
+
+  attr {
+    key: "clear_after_read"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "dynamic_size"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  attr {
+    key: "identical_element_shapes"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "tensor_array_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_6/size"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_6"
+  op: "TensorArrayV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_6/size"
+
+  attr {
+    key: "clear_after_read"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "dynamic_size"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  attr {
+    key: "identical_element_shapes"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "tensor_array_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_1"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_2"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_3"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_2:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_4"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_3:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_5"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_4:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_6"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_5:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_7"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_6:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_8"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/combination_0/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_9"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/combination_1/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_10"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/combination_2/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_11"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/speaker_conditioning/decoder_state/combination_3/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_12"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/GmmAttention/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_13"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/GmmAttention/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_14"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/GmmAttention/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_15"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/GmmAttention/zeros_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_16"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/AttentionAggregator/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_17"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/zeros_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_18"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/Fill_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_19"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/Fill_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_20"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/zeros_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_21"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/zeros_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_22"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/LogicalOr"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_23"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_like"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_1"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_2"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_2"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_3"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_4"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_4"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_4"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_5"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_5"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_5"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_6"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_6"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_6"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_7"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_7"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_7"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_8"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_8"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_8"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_9"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_9"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_9"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_10"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_10"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_10"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_11"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_11"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_11"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_12"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_12"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_12"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_13"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_13"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_13"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_14"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_14"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_14"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_15"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_15"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_15"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_16"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_16"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_16"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_17"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_17"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_17"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_18"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_18"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_18"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_19"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_19"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_19"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_20"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_20"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_20"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_21"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_21"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_21"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_22"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_22"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_22"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_23"
+  op: "Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Enter_23"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_23"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Const"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/All"
+  op: "All"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_22"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Const"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LogicalNot"
+  op: "LogicalNot"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/All"
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  op: "LoopCond"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LogicalNot"
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_1"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_1"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_2"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_2"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_3"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_3"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_4"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_4"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_4"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_5"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_5"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_5"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_6"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_6"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_6"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_7"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_7"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_7"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_8"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_8"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_8"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_9"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_9"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_9"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_10"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_10"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_10"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_11"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_11"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_11"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_12"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_12"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_12"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_13"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_13"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_13"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_14"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_14"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_14"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_15"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_15"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_15"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_16"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_16"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_16"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_17"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_17"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_17"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_18"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_18"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_18"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_19"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_19"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_19"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_20"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_20"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_20"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_21"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_21"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_21"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_22"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_22"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_22"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_23"
+  op: "Switch"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_23"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Merge_23"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_1"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_2"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_2:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_3"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_3:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_4"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_4:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_5"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_5:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_6"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_6:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_7"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_7:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_8"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_8:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_9"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_9:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_10"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_10:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_11"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_11:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_12"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_12:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_13"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_13:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_14"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_14:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_15"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_15:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_16"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_16:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_17"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_17:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_18"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_18:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_19"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_19:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_20"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_20:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_21"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_21:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_22"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_22:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_23"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_23:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "@\000\000\000P\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.20412415266
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.20412415266
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights/Initializer/random_uniform/max"
+  input: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 64
+        }
+        dim {
+          size: 80
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights"
+  input: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 80
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 80
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/biases"
+  input: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/biases/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/before_prenet/projection_0/fully_connected/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/before_prenet/projection_0/fully_connected/MatMul/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/before_prenet/projection_0/fully_connected/MatMul/Enter_1"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/before_prenet/projection_0/fully_connected/MatMul/Enter"
+  op: "Enter"
+  input: "speaker_embedding_lookup"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 64
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/before_prenet/projection_0/fully_connected/MatMul/Enter_1"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/before_prenet/projection_0/fully_connected/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/before_prenet/projection_0/fully_connected/MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/before_prenet/projection_0/fully_connected/BiasAdd/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/before_prenet/projection_0/fully_connected/BiasAdd/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/speaker_conditioning/before_prenet/projection_0/fully_connected/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/before_prenet/projection_0/Dropout/Identity"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/before_prenet/projection_0/fully_connected/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/before_prenet/combination_0/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_19"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/before_prenet/projection_0/Dropout/Identity"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "P\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.133630618453
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.133630618453
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 80
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights/Initializer/random_uniform/max"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 80
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 80
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 80
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 80
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 80
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 80
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/biases"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/biases/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/fully_connected/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/before_prenet/combination_0/add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/fully_connected/MatMul/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/fully_connected/MatMul/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 80
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/fully_connected/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/fully_connected/MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/fully_connected/BiasAdd/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/fully_connected/BiasAdd/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/fully_connected/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/fully_connected/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/keep_prob"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.5
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/fully_connected/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/random_uniform/min"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/random_uniform/max"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/random_uniform/max"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/random_uniform/RandomUniform"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/random_uniform"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/random_uniform/mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/keep_prob"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/Floor"
+  op: "Floor"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/div"
+  op: "RealDiv"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/fully_connected/Relu"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/keep_prob"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/div"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/Floor"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\001\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.125
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.125
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights/Initializer/random_uniform/max"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/biases"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/biases/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/fully_connected_1/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout/dropout/mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/fully_connected_1/MatMul/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/fully_connected_1/MatMul/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/fully_connected_1/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/fully_connected_1/MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/fully_connected_1/BiasAdd/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/fully_connected_1/BiasAdd/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/pre_dec_rnn_net/fully_connected_1/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/fully_connected_1/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/fully_connected_1/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/keep_prob"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.5
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/fully_connected_1/Relu"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/random_uniform/min"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/random_uniform/max"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/random_uniform/max"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/random_uniform/RandomUniform"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/random_uniform"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/random_uniform/mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/keep_prob"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/Floor"
+  op: "Floor"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/div"
+  op: "RealDiv"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/fully_connected_1/Relu"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/keep_prob"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/div"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/Floor"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/zeros/shape"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/zeros/shape/Enter"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/zeros/shape/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/zeros/Const"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/zeros"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/zeros/shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/zeros/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/concat/axis"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/pre_dec_rnn_net/Dropout_1/dropout/mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_12"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 384
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\200\002\000\000\000\004\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.060048058629
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.060048058629
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 640
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 640
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 640
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 640
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 640
+        }
+        dim {
+          size: 1024
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 640
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 640
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 1024
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1024
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/bias"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/bias/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/concat/axis"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/concat"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_14"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 640
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/concat"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/MatMul/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/MatMul/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/kernel/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 640
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/BiasAdd/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/BiasAdd/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/lstm_cell/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/Const"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 4
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/split/split_dim"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/split/split_dim"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 4
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/add/y"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/split:2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/Sigmoid"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_13"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/Sigmoid_1"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/Sigmoid_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/Tanh"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/clip_by_value/Minimum/y"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 10.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/clip_by_value/Minimum"
+  op: "Minimum"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/add_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/clip_by_value/Minimum/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/clip_by_value/y"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -10.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/clip_by_value"
+  op: "Maximum"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/clip_by_value/Minimum"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/clip_by_value/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/Sigmoid_2"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/split:3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/Tanh_1"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/clip_by_value"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/mul_2"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/Sigmoid_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/Tanh_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Const"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.10000000149
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/zoneout/mul/x"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.899999976158
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/zoneout/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/zoneout/mul/x"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/clip_by_value"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/zoneout/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Const"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_13"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/zoneout/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/zoneout/mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/zoneout/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/zoneout/mul_2/x"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.899999976158
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/zoneout/mul_2"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/zoneout/mul_2/x"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/mul_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/zoneout/mul_3"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Const"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_14"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/zoneout/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/zoneout/mul_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/zoneout/mul_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\001\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.125
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.125
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/bias"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/bias/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/gmm_mlp_hidden/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/mul_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/gmm_mlp_hidden/MatMul/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/gmm_mlp_hidden/MatMul/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/kernel/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/gmm_mlp_hidden/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/gmm_mlp_hidden/MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/gmm_mlp_hidden/BiasAdd/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/gmm_mlp_hidden/BiasAdd/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_hidden/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/gmm_mlp_hidden/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/gmm_mlp_hidden/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\200\000\000\000\017\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.204836621881
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.204836621881
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 15
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 15
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 15
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 15
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+        dim {
+          size: 15
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 15
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 15
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 15
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 15
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 15
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 15
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/bias"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 15
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/bias/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 15
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/gmm_mlp_output/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/gmm_mlp_hidden/Relu"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/gmm_mlp_output/MatMul/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 15
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/gmm_mlp_output/MatMul/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/kernel/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+          dim {
+            size: 15
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/gmm_mlp_output/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/gmm_mlp_output/MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/gmm_mlp_output/BiasAdd/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 15
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/gmm_mlp_output/BiasAdd/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/AttentionAggregator/GmmAttention/gmm_mlp_output/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 15
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Const_1"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/split/split_dim"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/split/split_dim"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/gmm_mlp_output/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 3
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Exp"
+  op: "Exp"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/split:2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Softmax"
+  op: "Softmax"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Exp_1"
+  op: "Exp"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_15"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Exp_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Minimum"
+  op: "Minimum"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Minimum/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Minimum/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/GmmAttention/ToFloat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/ExpandDims/dim"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Softmax"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/ExpandDims_1/dim"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Minimum"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/ExpandDims_2/dim"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/ExpandDims_2"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Exp"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/ExpandDims_2/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/mul/x"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 6.28318548203
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/mul/x"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/ExpandDims_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/add_1/y"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 9.99999993923e-09
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/add_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Rsqrt"
+  op: "Rsqrt"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/ExpandDims"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Rsqrt"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/sub/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/sub/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/GmmAttention/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/pow/y"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 2.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/pow"
+  op: "Pow"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/sub"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/pow/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Neg"
+  op: "Neg"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/pow"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/mul_2/x"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 2.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/mul_2"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/mul_2/x"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/ExpandDims_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/add_2/y"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 9.99999993923e-09
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/add_2"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/mul_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/add_2/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/div"
+  op: "RealDiv"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Neg"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/add_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Exp_2"
+  op: "Exp"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/div"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/mul_3"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/mul_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Exp_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Sum/reduction_indices"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Sum"
+  op: "Sum"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/mul_3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Sum/reduction_indices"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/mul_4"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Sum"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/mul_4/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/mul_4/Enter"
+  op: "Enter"
+  input: "encoder_input_mask"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/ExpandDims_3/dim"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/ExpandDims_3"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/mul_4"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/ExpandDims_3/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/MatMul"
+  op: "BatchMatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/ExpandDims_3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/MatMul/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "adj_x"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "adj_y"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/MatMul/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/MatMul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/concat_1/axis"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/concat_1"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Squeeze"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/lstm_cell/mul_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/zeros/shape"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/zeros/shape/Enter"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/zeros/Const"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/zeros"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/zeros/shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/zeros/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/zeros"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/zeros_1/shape/1"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/zeros_1/shape"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/zeros/shape/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/zeros_1/shape/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/zeros_1/Const"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/zeros_1"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/zeros_1/shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/zeros_1/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/Const"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "@\000\000\000\000\002\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.10206207633
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.10206207633
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights/Initializer/random_uniform/max"
+  input: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 64
+        }
+        dim {
+          size: 512
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights"
+  input: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 512
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 512
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/biases"
+  input: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/biases/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/before_prenet/projection_0/fully_connected/MatMul/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/MatMul/Enter"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/MatMul/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/BiasAdd/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/BiasAdd/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/context_and_cell_output/projection_0/Dropout/Identity"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/context_and_cell_output/projection_0/fully_connected/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/context_and_cell_output/combination_0/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/concat_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/context_and_cell_output/projection_0/Dropout/Identity"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "@\000\000\000\000\002\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.10206207633
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.10206207633
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/max"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 64
+        }
+        dim {
+          size: 512
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 512
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 512
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/biases"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/biases/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/before_prenet/projection_0/fully_connected/MatMul/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/MatMul/Enter"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/MatMul/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/BiasAdd/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/BiasAdd/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/Dropout/Identity"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/fully_connected/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/combination_0/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/context_and_cell_output/combination_0/add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/projection_0/Dropout/Identity"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\003\000\000\000\004\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0578637570143
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0578637570143
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 768
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 768
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 768
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 768
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 768
+        }
+        dim {
+          size: 1024
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 768
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 768
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 1024
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1024
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/bias"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/bias/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/concat/axis"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/combination_0/add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_9"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 768
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/concat"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/MatMul/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/MatMul/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/kernel/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 768
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/BiasAdd/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/BiasAdd/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/Const"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 4
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/split/split_dim"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/split/split_dim"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 4
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/add/y"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/split:2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/Sigmoid"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_8"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/Sigmoid_1"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/Sigmoid_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/Tanh"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/clip_by_value/Minimum/y"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 10.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/clip_by_value/Minimum"
+  op: "Minimum"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/add_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/clip_by_value/Minimum/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/clip_by_value/y"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -10.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/clip_by_value"
+  op: "Maximum"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/clip_by_value/Minimum"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/clip_by_value/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/Sigmoid_2"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/split:3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/Tanh_1"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/clip_by_value"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/mul_2"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/Sigmoid_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/Tanh_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/Const"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.10000000149
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/zoneout/mul/x"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.899999976158
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/zoneout/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/zoneout/mul/x"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/clip_by_value"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/zoneout/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/Const"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_8"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/zoneout/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/zoneout/mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/zoneout/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/zoneout/mul_2/x"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.899999976158
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/zoneout/mul_2"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/zoneout/mul_2/x"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/mul_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/zoneout/mul_3"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/Const"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_9"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/zoneout/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/zoneout/mul_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/zoneout/mul_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\002\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0883883461356
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0883883461356
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights/Initializer/random_uniform/max"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 512
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/biases"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/biases/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/residual_projection/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/speaker_conditioning/combination_0/add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/residual_projection/MatMul/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/residual_projection/MatMul/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/residual_projection/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/residual_projection/MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/residual_projection/BiasAdd/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/residual_projection/BiasAdd/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_0/residual_projection/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/residual_projection/BiasAdd"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/lstm_cell/mul_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "@\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.136930644512
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.136930644512
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/max"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 64
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/biases"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/biases/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/speaker_conditioning/before_prenet/projection_0/fully_connected/MatMul/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/MatMul/Enter"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/MatMul/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 64
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/BiasAdd/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/BiasAdd/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/Dropout/Identity"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/fully_connected/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/combination_0/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/projection_0/Dropout/Identity"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\002\000\000\000\004\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0625
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0625
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 512
+        }
+        dim {
+          size: 1024
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 1024
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1024
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/bias"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/bias/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/concat/axis"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/concat"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/combination_0/add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_11"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 512
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/concat"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/MatMul/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/MatMul/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/kernel/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 512
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/BiasAdd/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/BiasAdd/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1024
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/Const"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 4
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/split/split_dim"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/split/split_dim"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 4
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/add/y"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/split:2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/Sigmoid"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_10"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/Sigmoid_1"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/Sigmoid_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/Tanh"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/clip_by_value/Minimum/y"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 10.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/clip_by_value/Minimum"
+  op: "Minimum"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/add_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/clip_by_value/Minimum/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/clip_by_value/y"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -10.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/clip_by_value"
+  op: "Maximum"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/clip_by_value/Minimum"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/clip_by_value/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/Sigmoid_2"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/split:3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/Tanh_1"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/clip_by_value"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/mul_2"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/Sigmoid_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/Tanh_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/Const"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.10000000149
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/zoneout/mul/x"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.899999976158
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/zoneout/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/zoneout/mul/x"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/clip_by_value"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/zoneout/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/Const"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_10"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/zoneout/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/zoneout/mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/zoneout/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/zoneout/mul_2/x"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.899999976158
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/zoneout/mul_2"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/zoneout/mul_2/x"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/mul_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/zoneout/mul_3"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/Const"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_11"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/zoneout/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/zoneout/mul_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/zoneout/mul_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/speaker_conditioning/combination_0/add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/lstm_cell/mul_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/add/y"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice/stack/1"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice/stack/2"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice/stack"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice/stack/1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice/stack/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice/stack_1/1"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice/stack_1/2"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice/stack_1"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice/stack_1/1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice/stack_1/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice/stack_2"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice/stack"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 6
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 6
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/add_1/y"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/add_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_1/stack/1"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_1/stack"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_1/stack/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_1/stack_1/1"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_1/stack_1"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/add_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_1/stack_1/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_1/stack_2"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_1/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_1/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/strided_slice/stack"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/strided_slice/stack_1"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/strided_slice/stack_2"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/Shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/strided_slice/stack"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/FullyConnected/fully_connected/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/FullyConnected/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\001\000\000\240\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/FullyConnected/fully_connected/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/FullyConnected/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.120096117258
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/FullyConnected/fully_connected/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/FullyConnected/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.120096117258
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/FullyConnected/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/attention_decoder/FullyConnected/fully_connected/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/FullyConnected/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 160
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/FullyConnected/fully_connected/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/attention_decoder/FullyConnected/fully_connected/weights/Initializer/random_uniform/max"
+  input: "seq2seq/attention_decoder/FullyConnected/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/FullyConnected/fully_connected/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/attention_decoder/FullyConnected/fully_connected/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/attention_decoder/FullyConnected/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/attention_decoder/FullyConnected/fully_connected/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/FullyConnected/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 160
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/FullyConnected/fully_connected/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/attention_decoder/FullyConnected/fully_connected/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/attention_decoder/FullyConnected/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/FullyConnected/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 160
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/FullyConnected/fully_connected/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/FullyConnected/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 160
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+        dim {
+          size: 160
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/FullyConnected/fully_connected/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/FullyConnected/fully_connected/weights"
+  input: "seq2seq/attention_decoder/FullyConnected/fully_connected/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/FullyConnected/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 160
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/FullyConnected/fully_connected/weights/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/FullyConnected/fully_connected/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 160
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/FullyConnected/fully_connected/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/FullyConnected/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 160
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 160
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/FullyConnected/fully_connected/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/FullyConnected/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 160
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 160
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/FullyConnected/fully_connected/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/FullyConnected/fully_connected/biases"
+  input: "seq2seq/attention_decoder/FullyConnected/fully_connected/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/FullyConnected/fully_connected/biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 160
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/FullyConnected/fully_connected/biases/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/FullyConnected/fully_connected/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 160
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/fully_connected/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/fully_connected/MatMul/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 160
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/fully_connected/MatMul/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/FullyConnected/fully_connected/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 160
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/fully_connected/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/fully_connected/MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/fully_connected/BiasAdd/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 160
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/fully_connected/BiasAdd/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/FullyConnected/fully_connected/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 160
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/Reshape/shape/1"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/Reshape/shape/2"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 80
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/Reshape/shape"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/strided_slice"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/Reshape/shape/1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/Reshape/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/Reshape"
+  op: "Reshape"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/fully_connected/BiasAdd"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 2
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\001\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.152794972062
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.152794972062
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights/Initializer/random_uniform/max"
+  input: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights/Initializer/random_uniform/mul"
+  input: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+        dim {
+          size: 1
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights"
+  input: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/biases/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/biases"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/biases"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/biases"
+      }
+    }
+  }
+
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/biases/Assign"
+  op: "Assign"
+  input: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/biases"
+  input: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/biases/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/biases"
+      }
+    }
+  }
+
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/biases/read"
+  op: "Identity"
+  input: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/biases"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/EndOfSequenceOutputLayer/fully_connected/MatMul"
+  op: "MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/EndOfSequenceOutputLayer/fully_connected/MatMul/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/EndOfSequenceOutputLayer/fully_connected/MatMul/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/weights/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/EndOfSequenceOutputLayer/fully_connected/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/EndOfSequenceOutputLayer/fully_connected/MatMul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/EndOfSequenceOutputLayer/fully_connected/BiasAdd/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/EndOfSequenceOutputLayer/fully_connected/BiasAdd/Enter"
+  op: "Enter"
+  input: "seq2seq/attention_decoder/EndOfSequenceOutputLayer/fully_connected/biases/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/EndOfSequenceOutputLayer/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/EndOfSequenceOutputLayer/fully_connected/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/EndOfSequenceOutputLayer/Squeeze"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Greater/y"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.990000009537
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Greater"
+  op: "Greater"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Sigmoid"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Greater/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LogicalOr"
+  op: "LogicalOr"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Greater"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_17"
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/zeros/shape/1"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/zeros/shape/2"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 64
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/zeros/shape"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/zeros/shape/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/zeros/shape/1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/zeros/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/zeros/Const"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/zeros"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/zeros/shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/zeros/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 2
+          }
+          dim {
+            size: 64
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/zeros_1/shape/1"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/zeros_1/shape"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/zeros/shape/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/zeros_1/shape/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/zeros_1/Const"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT64
+        tensor_shape {
+        }
+        int64_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/zeros_1"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/zeros_1/shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/zeros_1/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_2/stack"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\000\000\000\000\377\377\377\377\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_2/stack_1"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_2/stack_2"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/Reshape"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 5
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 5
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_3/stack"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\377\377\377\377"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_3/stack_1"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_3/stack_2"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_3"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/zeros_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_3/stack"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_3/stack_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LogicalOr_1"
+  op: "LogicalOr"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LogicalOr"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_22"
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/add_2/y"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/add_2"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/add_2/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/GreaterEqual"
+  op: "GreaterEqual"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/add_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/GreaterEqual/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/GreaterEqual/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/div_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LogicalOr_2"
+  op: "LogicalOr"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LogicalOr_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/GreaterEqual"
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LogicalNot_1"
+  op: "LogicalNot"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_22"
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LogicalAnd"
+  op: "LogicalAnd"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LogicalNot_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LogicalOr_2"
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_23"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/add_3/y"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/add_3"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/add_3/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Fill"
+  op: "Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/add_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select"
+  op: "Select"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LogicalAnd"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Fill"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_23"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_1"
+  op: "Select"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_22"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_1/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/FullyConnected/Reshape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 2
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_1/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 2
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_2"
+  op: "Select"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_22"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_2/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/EndOfSequenceOutputLayer/Squeeze"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_2/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_3"
+  op: "Select"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_22"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_3/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 2
+          }
+          dim {
+            size: 64
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_3/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 2
+          }
+          dim {
+            size: 64
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_4"
+  op: "Select"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_22"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_4/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/zeros_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_4/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_5"
+  op: "Select"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_22"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_5/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_5/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_6"
+  op: "Select"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_22"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_6/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/zeros_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_6/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_7"
+  op: "Select"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_22"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_7/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/mul_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_7/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/zeros_6"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_8"
+  op: "Select"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_22"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_8"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/zoneout/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_9"
+  op: "Select"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_22"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_9"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_0/zoneout/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_10"
+  op: "Select"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_22"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_10"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/zoneout/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_11"
+  op: "Select"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_22"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_11"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/attention_decoder/multi_rnn_cell/cell_1/zoneout/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_12"
+  op: "Select"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_22"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_12"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Squeeze"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_13"
+  op: "Select"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_22"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_13"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/zoneout/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_14"
+  op: "Select"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_22"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_14"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/zoneout/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_15"
+  op: "Select"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_22"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_15"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/GmmAttention/Minimum"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_16"
+  op: "Select"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_22"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_17"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LogicalOr"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite/TensorArrayWriteV3"
+  op: "TensorArrayWriteV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite/TensorArrayWriteV3/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_1"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite/TensorArrayWriteV3/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_1"
+      }
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_1/TensorArrayWriteV3"
+  op: "TensorArrayWriteV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_1/TensorArrayWriteV3/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_2"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_1/TensorArrayWriteV3/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_2"
+      }
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_2/TensorArrayWriteV3"
+  op: "TensorArrayWriteV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_2/TensorArrayWriteV3/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_3"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_2/TensorArrayWriteV3/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_3"
+      }
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_3/TensorArrayWriteV3"
+  op: "TensorArrayWriteV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_3/TensorArrayWriteV3/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_4"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_4"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_3/TensorArrayWriteV3/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_4"
+      }
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_4/TensorArrayWriteV3"
+  op: "TensorArrayWriteV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_4/TensorArrayWriteV3/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_5"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_5"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_4/TensorArrayWriteV3/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_5"
+      }
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_5/TensorArrayWriteV3"
+  op: "TensorArrayWriteV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_5/TensorArrayWriteV3/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_6"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_6"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_6"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_5/TensorArrayWriteV3/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_6"
+      }
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_6/TensorArrayWriteV3"
+  op: "TensorArrayWriteV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_6/TensorArrayWriteV3/Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_7"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity_7"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_7"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_6/TensorArrayWriteV3/Enter"
+  op: "Enter"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_6"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_7"
+      }
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/add_4/y"
+  op: "Const"
+  input: "^seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/add_4"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/add_4/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/add_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_1"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite/TensorArrayWriteV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_2"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_1/TensorArrayWriteV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_3"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_2/TensorArrayWriteV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_4"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_3/TensorArrayWriteV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_5"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_4/TensorArrayWriteV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_6"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_5/TensorArrayWriteV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_7"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/TensorArrayWrite_6/TensorArrayWriteV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_8"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_8"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_9"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_9"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_10"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_10"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_11"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_11"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_12"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_12"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_13"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_13"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_14"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_14"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_15"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_15"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_16"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/AttentionAggregator/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_17"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select_16"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_18"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_19"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_20"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_21"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/strided_slice_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_22"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/LogicalOr_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/NextIteration_23"
+  op: "NextIteration"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Select"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_1"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_2"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_3"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_4"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_5"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_6"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_6"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_7"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_7"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_8"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_8"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_9"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_9"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_10"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_10"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_11"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_11"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_12"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_12"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_13"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_13"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_14"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_14"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_15"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_15"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 5
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_16"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_16"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_17"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_17"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_18"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_18"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_19"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_19"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_20"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_20"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_21"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_21"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_22"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_22"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_23"
+  op: "Exit"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Switch_23"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack/TensorArraySizeV3"
+  op: "TensorArraySizeV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_1"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack/range/start"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack/range/delta"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack/range/start"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack/TensorArraySizeV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack/TensorArrayGatherV3"
+  op: "TensorArrayGatherV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack/range"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_1"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 2
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+        dim {
+          size: 2
+        }
+        dim {
+          size: 80
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_1/TensorArraySizeV3"
+  op: "TensorArraySizeV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_1"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_1/range/start"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_1"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_1/range/delta"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_1"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_1/range/start"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_1/TensorArraySizeV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_1"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_1/TensorArrayGatherV3"
+  op: "TensorArrayGatherV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_1/range"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_1"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_2/TensorArraySizeV3"
+  op: "TensorArraySizeV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_3"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_2"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_2/range/start"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_2"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_2/range/delta"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_2"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_2/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_2/range/start"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_2/TensorArraySizeV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_2/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_2"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_2/TensorArrayGatherV3"
+  op: "TensorArrayGatherV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_2/range"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_3"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_2"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 2
+          }
+          dim {
+            size: 64
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+        dim {
+          size: 2
+        }
+        dim {
+          size: 64
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_3/TensorArraySizeV3"
+  op: "TensorArraySizeV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_4"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_3"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_3/range/start"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_3"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_3/range/delta"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_3"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_3/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_3/range/start"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_3/TensorArraySizeV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_3/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_3"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_3/TensorArrayGatherV3"
+  op: "TensorArrayGatherV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_3/range"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_4"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_3"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+        dim {
+          size: 2
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_4/TensorArraySizeV3"
+  op: "TensorArraySizeV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_4"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_5"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_4"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_4/range/start"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_4"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_4/range/delta"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_4"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_4/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_4/range/start"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_4/TensorArraySizeV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_4/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_4"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_4/TensorArrayGatherV3"
+  op: "TensorArrayGatherV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_4"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_4/range"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_5"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_4"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_5/TensorArraySizeV3"
+  op: "TensorArraySizeV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_5"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_6"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_5"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_5/range/start"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_5"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_5/range/delta"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_5"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_5/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_5/range/start"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_5/TensorArraySizeV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_5/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_5"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_5/TensorArrayGatherV3"
+  op: "TensorArrayGatherV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_5"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_5/range"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_6"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_5"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+        dim {
+          size: 1
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_6/TensorArraySizeV3"
+  op: "TensorArraySizeV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_6"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_7"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_6"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_6/range/start"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_6"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_6/range/delta"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_6"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_6/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_6/range/start"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_6/TensorArraySizeV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_6/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_6"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_6/TensorArrayGatherV3"
+  op: "TensorArrayGatherV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_6"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_6/range"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_7"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArray_6"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+        dim {
+          size: -1
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/Rank"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 4
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range/start"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/Rank"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_7/values_0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_7/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_7"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_7/values_0"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_7/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/transpose"
+  op: "Transpose"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack/TensorArrayGatherV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_7"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 2
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/Rank_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_1/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_1/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_1"
+  op: "Range"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_1/start"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/Rank_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_1/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_8/values_0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_8/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_8"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_8/values_0"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_8/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/transpose_1"
+  op: "Transpose"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_1/TensorArrayGatherV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_8"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/Rank_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 4
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_2/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_2/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_2"
+  op: "Range"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_2/start"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/Rank_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_2/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_9/values_0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_9/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_9"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_9/values_0"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_9/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/transpose_2"
+  op: "Transpose"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_2/TensorArrayGatherV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_9"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 2
+          }
+          dim {
+            size: 64
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/Rank_3"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_3/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_3/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_3"
+  op: "Range"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_3/start"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/Rank_3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_3/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_10/values_0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_10/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_10"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_10/values_0"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_10/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/transpose_3"
+  op: "Transpose"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_3/TensorArrayGatherV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_10"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/Rank_4"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_4/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_4/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_4"
+  op: "Range"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_4/start"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/Rank_4"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_4/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_11/values_0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_11/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_11"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_11/values_0"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_4"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_11/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/transpose_4"
+  op: "Transpose"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_4/TensorArrayGatherV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_11"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/Rank_5"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_5/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_5/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_5"
+  op: "Range"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_5/start"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/Rank_5"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_5/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_12/values_0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_12/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_12"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_12/values_0"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_5"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_12/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/transpose_5"
+  op: "Transpose"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_5/TensorArrayGatherV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_12"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/Rank_6"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_6/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_6/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_6"
+  op: "Range"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_6/start"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/Rank_6"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_6/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_13/values_0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_13/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_13"
+  op: "ConcatV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_13/values_0"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/range_6"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_13/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/transpose_6"
+  op: "Transpose"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/TensorArrayStack_6/TensorArrayGatherV3"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/concat_13"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Const_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Mean"
+  op: "Mean"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/transpose_4"
+  input: "seq2seq/seq2seq_1/attention_decoder/Const_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/comb_weights/tag"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "seq2seq/seq2seq_1/attention_decoder/comb_weights"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/comb_weights"
+  op: "HistogramSummary"
+  input: "seq2seq/seq2seq_1/attention_decoder/comb_weights/tag"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/transpose_6"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Shape_9"
+  op: "Shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/transpose"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 4
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_4/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_4/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_4/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_4"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_1/attention_decoder/Shape_9"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_4/stack"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_4/stack_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_4/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/mul/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_4"
+  input: "seq2seq/seq2seq_1/attention_decoder/mul/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Reshape/shape/2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 80
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Reshape/shape"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/strided_slice"
+  input: "seq2seq/seq2seq_1/attention_decoder/mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/Reshape/shape/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Reshape"
+  op: "Reshape"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/transpose"
+  input: "seq2seq/seq2seq_1/attention_decoder/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/mul_1/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/while/Exit_23"
+  input: "seq2seq/seq2seq_1/attention_decoder/mul_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_1/attention_decoder/mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_1/attention_decoder/mul_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sub/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_1/attention_decoder/sub/x"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask/Cast"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_1/attention_decoder/sub"
+  input: "seq2seq/seq2seq_1/attention_decoder/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/mul_2"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/ExpandDims"
+  input: "Maximum_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/Reshape"
+  input: "seq2seq/seq2seq_1/attention_decoder/mul_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/Shape"
+  op: "Shape"
+  input: "decoder_target"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/Shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/strided_slice/stack"
+  input: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_1/attention_decoder/mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/Maximum/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/Maximum"
+  op: "Maximum"
+  input: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/sub"
+  input: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/Maximum/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/PadV2/paddings/1/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/PadV2/paddings/1"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/PadV2/paddings/1/0"
+  input: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/Maximum"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/PadV2/paddings/0_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/PadV2/paddings/2_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/PadV2/paddings"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/PadV2/paddings/0_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/PadV2/paddings/1"
+  input: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/PadV2/paddings/2_1"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/PadV2"
+  op: "PadV2"
+  input: "decoder_target"
+  input: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/PadV2/paddings"
+  input: "Maximum_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/Slice/begin"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/Slice/size/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/Slice/size/2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/Slice/size"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/Slice/size/0"
+  input: "seq2seq/seq2seq_1/attention_decoder/mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/Slice/size/2"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/Slice"
+  op: "Slice"
+  input: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/PadV2"
+  input: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/Slice/begin"
+  input: "seq2seq/seq2seq_1/attention_decoder/pad_or_truncate_sequence_tensor/Slice/size"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/mul_3/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/mul_3"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/mul_3/x"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq_1/attention_decoder/mul_3"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "decoder_target_lengths"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "decoder_target_lengths"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_5/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_5/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_5/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_5"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_5/stack"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_5/stack_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_5/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sub_1/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 9
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sub_1"
+  op: "Sub"
+  input: "decoder_target_lengths"
+  input: "seq2seq/seq2seq_1/attention_decoder/sub_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/mul_4/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/mul_4"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/mul_4/x"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/range/start"
+  input: "seq2seq/seq2seq_1/attention_decoder/mul_4"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/range"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/sub_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/Shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/strided_slice/stack"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/strided_slice"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/ExpandDims"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_1/attention_decoder/sub_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/Tile"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_6/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_6/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_6/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_6"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_1/attention_decoder/sequence_length_mask_2/Cast"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_6/stack"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_6/stack_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_6/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sub_2/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/sub_2"
+  op: "Sub"
+  input: "seq2seq/seq2seq_1/attention_decoder/sub_2/x"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_6"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_7/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_7/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_7/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_7"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_1/attention_decoder/sub"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_7/stack"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_7/stack_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_7/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/mul_5/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1000.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/mul_5"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_7"
+  input: "seq2seq/seq2seq_1/attention_decoder/mul_5/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/transpose_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/mul_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Shape_10"
+  op: "Shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_8/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_8/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_8/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/strided_slice_8"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_1/attention_decoder/Shape_10"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_8/stack"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_8/stack_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_8/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/logistic_loss/zeros_like"
+  op: "ZerosLike"
+  input: "seq2seq/seq2seq_1/attention_decoder/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/logistic_loss/GreaterEqual"
+  op: "GreaterEqual"
+  input: "seq2seq/seq2seq_1/attention_decoder/add_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/logistic_loss/zeros_like"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/logistic_loss/Select"
+  op: "Select"
+  input: "seq2seq/seq2seq_1/attention_decoder/logistic_loss/GreaterEqual"
+  input: "seq2seq/seq2seq_1/attention_decoder/add_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/logistic_loss/zeros_like"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/logistic_loss/Neg"
+  op: "Neg"
+  input: "seq2seq/seq2seq_1/attention_decoder/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/logistic_loss/Select_1"
+  op: "Select"
+  input: "seq2seq/seq2seq_1/attention_decoder/logistic_loss/GreaterEqual"
+  input: "seq2seq/seq2seq_1/attention_decoder/logistic_loss/Neg"
+  input: "seq2seq/seq2seq_1/attention_decoder/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/logistic_loss/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/add_1"
+  input: "seq2seq/seq2seq_1/attention_decoder/sub_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/logistic_loss/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_1/attention_decoder/logistic_loss/Select"
+  input: "seq2seq/seq2seq_1/attention_decoder/logistic_loss/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/logistic_loss/Exp"
+  op: "Exp"
+  input: "seq2seq/seq2seq_1/attention_decoder/logistic_loss/Select_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/logistic_loss/Log1p"
+  op: "Log1p"
+  input: "seq2seq/seq2seq_1/attention_decoder/logistic_loss/Exp"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/logistic_loss"
+  op: "Add"
+  input: "seq2seq/seq2seq_1/attention_decoder/logistic_loss/sub"
+  input: "seq2seq/seq2seq_1/attention_decoder/logistic_loss/Log1p"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/mul_6"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_5"
+  input: "seq2seq/seq2seq_1/attention_decoder/logistic_loss"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Const_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/Sum"
+  op: "Sum"
+  input: "seq2seq/seq2seq_1/attention_decoder/mul_6"
+  input: "seq2seq/seq2seq_1/attention_decoder/Const_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/ToFloat"
+  op: "Cast"
+  input: "seq2seq/seq2seq_1/attention_decoder/strided_slice_8"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/truediv"
+  op: "RealDiv"
+  input: "seq2seq/seq2seq_1/attention_decoder/Sum"
+  input: "seq2seq/seq2seq_1/attention_decoder/ToFloat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/eos_loss"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/truediv"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/mul_7/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.00999999977648
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_1/attention_decoder/mul_7"
+  op: "Mul"
+  input: "seq2seq/seq2seq_1/attention_decoder/mul_7/x"
+  input: "seq2seq/seq2seq_1/attention_decoder/eos_loss"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient"
+  op: "StopGradient"
+  input: "seq2seq/seq2seq_1/attention_decoder/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  op: "StopGradient"
+  input: "seq2seq/seq2seq_1/attention_decoder/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000P\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.169841557741
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.169841557741
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 80
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 80
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 80
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 80
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 80
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 80
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 80
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 80
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000P\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.169841557741
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.169841557741
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 80
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 80
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 80
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 80
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 80
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 80
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 80
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 80
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/conv1d_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/conv1d_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/conv1d_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/conv1d_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/conv1d_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/conv1d_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/conv1d_2/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/conv1d_2/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/conv1d_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/conv1d_2/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/conv1d_2/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/conv1d_2/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/gated_unit/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/gated_unit/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/gated_unit/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/gated_unit/split/split_dim"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/conv1d/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/gated_unit/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/gated_unit/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/gated_unit/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/gated_unit/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/gated_unit/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/gated_unit/Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/gated_unit/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_residual_in/BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_residual/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/1x1_skip_in/BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/1x1_skip/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/Shape_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/filter_shape"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/stack"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\002\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/stack_1"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/mod"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/dilation_rate"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/mod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/mod_1"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/add_2"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/mod_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/add_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/paddings/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/paddings"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/paddings/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/mod_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/crops/0/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/crops/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/crops/0/0"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/crops"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/crops/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/paddings"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/concat/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/concat"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/required_space_to_batch_paddings/crops"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/concat_1/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/concat_1"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/SpaceToBatchND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/SpaceToBatchND"
+  op: "SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/SpaceToBatchND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/conv1d_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/conv1d_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/conv1d_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/conv1d_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/conv1d_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/conv1d_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/conv1d_2/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/conv1d_2/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/conv1d_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "VALID"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/conv1d_2/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/conv1d_2/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/BatchToSpaceND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/BatchToSpaceND"
+  op: "BatchToSpaceND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/conv1d_2/Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/BatchToSpaceND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tcrops"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/BatchToSpaceND"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/gated_unit/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/gated_unit/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/gated_unit/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/gated_unit/split/split_dim"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/conv1d/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/gated_unit/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/gated_unit/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/gated_unit/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/gated_unit/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/gated_unit/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/gated_unit/Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/gated_unit/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_residual/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_0/mul_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/1x1_skip/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/Shape_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 4
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/filter_shape"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/stack"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\004\000\000\000\004\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/stack_1"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/mod"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/dilation_rate"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/mod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/mod_1"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/add_2"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/mod_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/add_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/paddings/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/paddings"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/paddings/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/mod_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/crops/0/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/crops/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/crops/0/0"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/crops"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/crops/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/paddings"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/concat/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/concat"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/required_space_to_batch_paddings/crops"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/concat_1/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/concat_1"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/SpaceToBatchND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 4
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/SpaceToBatchND"
+  op: "SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/SpaceToBatchND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/conv1d_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/conv1d_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/conv1d_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/conv1d_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/conv1d_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/conv1d_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/conv1d_2/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/conv1d_2/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/conv1d_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "VALID"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/conv1d_2/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/conv1d_2/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/BatchToSpaceND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 4
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/BatchToSpaceND"
+  op: "BatchToSpaceND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/conv1d_2/Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/BatchToSpaceND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tcrops"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/BatchToSpaceND"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/gated_unit/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/gated_unit/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/gated_unit/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/gated_unit/split/split_dim"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/conv1d/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/gated_unit/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/gated_unit/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/gated_unit/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/gated_unit/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/gated_unit/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/gated_unit/Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/gated_unit/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_residual/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_1/mul_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/1x1_skip/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/Shape_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 8
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/filter_shape"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/stack"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\010\000\000\000\010\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/stack_1"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/mod"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/dilation_rate"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/mod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/mod_1"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/add_2"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/mod_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/add_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/paddings/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/paddings"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/paddings/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/mod_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/crops/0/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/crops/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/crops/0/0"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/crops"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/crops/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/paddings"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/concat/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/concat"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/required_space_to_batch_paddings/crops"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/concat_1/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/concat_1"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/SpaceToBatchND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 8
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/SpaceToBatchND"
+  op: "SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/SpaceToBatchND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/conv1d_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/conv1d_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/conv1d_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/conv1d_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/conv1d_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/conv1d_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/conv1d_2/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/conv1d_2/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/conv1d_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "VALID"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/conv1d_2/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/conv1d_2/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/BatchToSpaceND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 8
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/BatchToSpaceND"
+  op: "BatchToSpaceND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/conv1d_2/Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/BatchToSpaceND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tcrops"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/BatchToSpaceND"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/gated_unit/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/gated_unit/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/gated_unit/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/gated_unit/split/split_dim"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/conv1d/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/gated_unit/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/gated_unit/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/gated_unit/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/gated_unit/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/gated_unit/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/gated_unit/Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/gated_unit/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_residual/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_2/mul_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/1x1_skip/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/Shape_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 16
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/filter_shape"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/stack"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\020\000\000\000\020\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/stack_1"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/mod"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/dilation_rate"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/mod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/mod_1"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/add_2"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/mod_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/add_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/paddings/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/paddings"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/paddings/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/mod_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/crops/0/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/crops/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/crops/0/0"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/crops"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/crops/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/paddings"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/concat/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/concat"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/required_space_to_batch_paddings/crops"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/concat_1/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/concat_1"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/SpaceToBatchND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 16
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/SpaceToBatchND"
+  op: "SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/SpaceToBatchND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/conv1d_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/conv1d_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/conv1d_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/conv1d_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/conv1d_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/conv1d_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/conv1d_2/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/conv1d_2/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/conv1d_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "VALID"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/conv1d_2/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/conv1d_2/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/BatchToSpaceND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 16
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/BatchToSpaceND"
+  op: "BatchToSpaceND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/conv1d_2/Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/BatchToSpaceND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tcrops"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/BatchToSpaceND"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/gated_unit/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/gated_unit/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/gated_unit/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/gated_unit/split/split_dim"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/conv1d/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/gated_unit/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/gated_unit/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/gated_unit/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/gated_unit/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/gated_unit/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/gated_unit/Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/gated_unit/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_residual/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_3/mul_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/1x1_skip/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/Shape_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 32
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/filter_shape"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/stack"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: " \000\000\000 \000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/stack_1"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/mod"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/dilation_rate"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/mod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/mod_1"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/add_2"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/mod_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/add_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/paddings/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/paddings"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/paddings/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/mod_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/crops/0/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/crops/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/crops/0/0"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/crops"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/crops/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/paddings"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/concat/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/concat"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/required_space_to_batch_paddings/crops"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/concat_1/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/concat_1"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/SpaceToBatchND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 32
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/SpaceToBatchND"
+  op: "SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/SpaceToBatchND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/conv1d_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/conv1d_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/conv1d_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/conv1d_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/conv1d_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/conv1d_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/conv1d_2/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/conv1d_2/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/conv1d_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "VALID"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/conv1d_2/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/conv1d_2/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/BatchToSpaceND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 32
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/BatchToSpaceND"
+  op: "BatchToSpaceND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/conv1d_2/Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/BatchToSpaceND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tcrops"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/BatchToSpaceND"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/gated_unit/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/gated_unit/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/gated_unit/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/gated_unit/split/split_dim"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/conv1d/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/gated_unit/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/gated_unit/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/gated_unit/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/gated_unit/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/gated_unit/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/gated_unit/Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/gated_unit/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_residual/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_4/mul_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/1x1_skip/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/Shape_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 64
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/filter_shape"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/stack"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "@\000\000\000@\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/stack_1"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/mod"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/dilation_rate"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/mod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/mod_1"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/add_2"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/mod_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/add_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/paddings/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/paddings"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/paddings/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/mod_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/crops/0/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/crops/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/crops/0/0"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/crops"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/crops/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/paddings"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/concat/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/concat"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/required_space_to_batch_paddings/crops"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/concat_1/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/concat_1"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/SpaceToBatchND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 64
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/SpaceToBatchND"
+  op: "SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/SpaceToBatchND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/conv1d_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/conv1d_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/conv1d_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/conv1d_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/conv1d_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/conv1d_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/conv1d_2/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/conv1d_2/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/conv1d_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "VALID"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/conv1d_2/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/conv1d_2/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/BatchToSpaceND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 64
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/BatchToSpaceND"
+  op: "BatchToSpaceND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/conv1d_2/Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/BatchToSpaceND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tcrops"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/BatchToSpaceND"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/gated_unit/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/gated_unit/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/gated_unit/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/gated_unit/split/split_dim"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/conv1d/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/gated_unit/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/gated_unit/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/gated_unit/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/gated_unit/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/gated_unit/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/gated_unit/Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/gated_unit/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_residual/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_5/mul_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/1x1_skip/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/Shape_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/filter_shape"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/stack"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/stack_1"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/mod"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/dilation_rate"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/mod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/mod_1"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/add_2"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/mod_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/add_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/paddings/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/paddings"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/paddings/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/mod_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/crops/0/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/crops/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/crops/0/0"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/crops"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/crops/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/paddings"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/concat/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/concat"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/required_space_to_batch_paddings/crops"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/concat_1/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/concat_1"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/SpaceToBatchND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/SpaceToBatchND"
+  op: "SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/SpaceToBatchND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/conv1d_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/conv1d_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/conv1d_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/conv1d_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/conv1d_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/conv1d_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/conv1d_2/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/conv1d_2/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/conv1d_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "VALID"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/conv1d_2/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/conv1d_2/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/BatchToSpaceND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/BatchToSpaceND"
+  op: "BatchToSpaceND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/conv1d_2/Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/BatchToSpaceND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tcrops"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/BatchToSpaceND"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/gated_unit/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/gated_unit/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/gated_unit/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/gated_unit/split/split_dim"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/conv1d/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/gated_unit/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/gated_unit/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/gated_unit/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/gated_unit/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/gated_unit/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/gated_unit/Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/gated_unit/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_residual/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_6/mul_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/1x1_skip/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/Shape_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 256
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/filter_shape"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/stack"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\001\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/stack_1"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/mod"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/dilation_rate"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/mod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/mod_1"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/add_2"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/mod_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/add_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/paddings/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/paddings"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/paddings/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/mod_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/crops/0/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/crops/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/crops/0/0"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/crops"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/crops/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/paddings"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/concat/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/concat"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/required_space_to_batch_paddings/crops"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/concat_1/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/concat_1"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/SpaceToBatchND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 256
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/SpaceToBatchND"
+  op: "SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/SpaceToBatchND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/conv1d_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/conv1d_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/conv1d_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/conv1d_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/conv1d_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/conv1d_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/conv1d_2/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/conv1d_2/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/conv1d_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "VALID"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/conv1d_2/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/conv1d_2/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/BatchToSpaceND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 256
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/BatchToSpaceND"
+  op: "BatchToSpaceND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/conv1d_2/Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/BatchToSpaceND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tcrops"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/BatchToSpaceND"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/gated_unit/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/gated_unit/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/gated_unit/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/gated_unit/split/split_dim"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/conv1d/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/gated_unit/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/gated_unit/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/gated_unit/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/gated_unit/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/gated_unit/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/gated_unit/Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/gated_unit/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_residual/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_7/mul_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/1x1_skip/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/Shape_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 512
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/filter_shape"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/stack"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\002\000\000\000\002\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/stack_1"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/mod"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/dilation_rate"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/mod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/mod_1"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/add_2"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/mod_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/add_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/paddings/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/paddings"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/paddings/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/mod_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/crops/0/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/crops/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/crops/0/0"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/crops"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/crops/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/paddings"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/concat/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/concat"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/required_space_to_batch_paddings/crops"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/concat_1/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/concat_1"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/SpaceToBatchND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 512
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/SpaceToBatchND"
+  op: "SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/SpaceToBatchND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/conv1d_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/conv1d_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/conv1d_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/conv1d_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/conv1d_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/conv1d_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/conv1d_2/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/conv1d_2/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/conv1d_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "VALID"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/conv1d_2/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/conv1d_2/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/BatchToSpaceND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 512
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/BatchToSpaceND"
+  op: "BatchToSpaceND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/conv1d_2/Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/BatchToSpaceND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tcrops"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/BatchToSpaceND"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/gated_unit/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/gated_unit/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/gated_unit/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/gated_unit/split/split_dim"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/conv1d/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/gated_unit/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/gated_unit/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/gated_unit/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/gated_unit/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/gated_unit/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/gated_unit/Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/gated_unit/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_residual/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_8/mul_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/1x1_skip/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/Shape_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/conv1d_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/conv1d_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/conv1d_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/conv1d_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/conv1d_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/conv1d_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/conv1d_2/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/conv1d_2/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/conv1d_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/conv1d_2/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/conv1d_2/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/conv1d_2/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/gated_unit/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/gated_unit/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/gated_unit/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/gated_unit/split/split_dim"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/conv1d/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/gated_unit/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/gated_unit/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/gated_unit/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/gated_unit/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/gated_unit/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/gated_unit/Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/gated_unit/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_residual/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_9/mul_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/1x1_skip/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/Shape_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/filter_shape"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/stack"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\002\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/stack_1"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/mod"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/dilation_rate"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/mod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/mod_1"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/add_2"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/mod_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/add_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/paddings/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/paddings"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/paddings/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/mod_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/crops/0/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/crops/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/crops/0/0"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/crops"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/crops/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/paddings"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/concat/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/concat"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/required_space_to_batch_paddings/crops"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/concat_1/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/concat_1"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/SpaceToBatchND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/SpaceToBatchND"
+  op: "SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/SpaceToBatchND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/conv1d_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/conv1d_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/conv1d_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/conv1d_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/conv1d_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/conv1d_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/conv1d_2/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/conv1d_2/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/conv1d_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "VALID"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/conv1d_2/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/conv1d_2/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/BatchToSpaceND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/BatchToSpaceND"
+  op: "BatchToSpaceND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/conv1d_2/Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/BatchToSpaceND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tcrops"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/BatchToSpaceND"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/gated_unit/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/gated_unit/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/gated_unit/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/gated_unit/split/split_dim"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/conv1d/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/gated_unit/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/gated_unit/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/gated_unit/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/gated_unit/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/gated_unit/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/gated_unit/Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/gated_unit/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_residual/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_10/mul_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/1x1_skip/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/Shape_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 4
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/filter_shape"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/stack"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\004\000\000\000\004\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/stack_1"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/mod"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/dilation_rate"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/mod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/mod_1"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/add_2"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/mod_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/add_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/paddings/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/paddings"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/paddings/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/mod_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/crops/0/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/crops/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/crops/0/0"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/crops"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/crops/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/paddings"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/concat/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/concat"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/required_space_to_batch_paddings/crops"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/concat_1/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/concat_1"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/SpaceToBatchND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 4
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/SpaceToBatchND"
+  op: "SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/SpaceToBatchND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/conv1d_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/conv1d_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/conv1d_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/conv1d_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/conv1d_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/conv1d_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/conv1d_2/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/conv1d_2/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/conv1d_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "VALID"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/conv1d_2/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/conv1d_2/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/BatchToSpaceND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 4
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/BatchToSpaceND"
+  op: "BatchToSpaceND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/conv1d_2/Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/BatchToSpaceND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tcrops"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/BatchToSpaceND"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/gated_unit/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/gated_unit/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/gated_unit/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/gated_unit/split/split_dim"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/conv1d/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/gated_unit/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/gated_unit/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/gated_unit/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/gated_unit/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/gated_unit/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/gated_unit/Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/gated_unit/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_residual/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_11/mul_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/1x1_skip/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/Shape_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 8
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/filter_shape"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/stack"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\010\000\000\000\010\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/stack_1"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/mod"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/dilation_rate"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/mod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/mod_1"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/add_2"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/mod_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/add_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/paddings/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/paddings"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/paddings/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/mod_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/crops/0/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/crops/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/crops/0/0"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/crops"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/crops/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/paddings"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/concat/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/concat"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/required_space_to_batch_paddings/crops"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/concat_1/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/concat_1"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/SpaceToBatchND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 8
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/SpaceToBatchND"
+  op: "SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/SpaceToBatchND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/conv1d_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/conv1d_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/conv1d_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/conv1d_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/conv1d_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/conv1d_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/conv1d_2/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/conv1d_2/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/conv1d_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "VALID"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/conv1d_2/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/conv1d_2/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/BatchToSpaceND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 8
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/BatchToSpaceND"
+  op: "BatchToSpaceND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/conv1d_2/Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/BatchToSpaceND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tcrops"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/BatchToSpaceND"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/gated_unit/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/gated_unit/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/gated_unit/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/gated_unit/split/split_dim"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/conv1d/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/gated_unit/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/gated_unit/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/gated_unit/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/gated_unit/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/gated_unit/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/gated_unit/Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/gated_unit/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_residual/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_12/mul_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/1x1_skip/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/Shape_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 16
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/filter_shape"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/stack"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\020\000\000\000\020\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/stack_1"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/mod"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/dilation_rate"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/mod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/mod_1"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/add_2"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/mod_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/add_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/paddings/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/paddings"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/paddings/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/mod_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/crops/0/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/crops/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/crops/0/0"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/crops"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/crops/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/paddings"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/concat/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/concat"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/required_space_to_batch_paddings/crops"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/concat_1/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/concat_1"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/SpaceToBatchND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 16
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/SpaceToBatchND"
+  op: "SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/SpaceToBatchND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/conv1d_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/conv1d_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/conv1d_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/conv1d_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/conv1d_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/conv1d_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/conv1d_2/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/conv1d_2/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/conv1d_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "VALID"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/conv1d_2/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/conv1d_2/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/BatchToSpaceND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 16
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/BatchToSpaceND"
+  op: "BatchToSpaceND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/conv1d_2/Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/BatchToSpaceND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tcrops"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/BatchToSpaceND"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/gated_unit/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/gated_unit/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/gated_unit/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/gated_unit/split/split_dim"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/conv1d/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/gated_unit/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/gated_unit/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/gated_unit/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/gated_unit/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/gated_unit/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/gated_unit/Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/gated_unit/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_residual/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_13/mul_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/1x1_skip/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/Shape_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 32
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/filter_shape"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/stack"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: " \000\000\000 \000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/stack_1"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/mod"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/dilation_rate"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/mod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/mod_1"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/add_2"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/mod_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/add_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/paddings/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/paddings"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/paddings/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/mod_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/crops/0/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/crops/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/crops/0/0"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/crops"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/crops/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/paddings"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/concat/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/concat"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/required_space_to_batch_paddings/crops"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/concat_1/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/concat_1"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/SpaceToBatchND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 32
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/SpaceToBatchND"
+  op: "SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/SpaceToBatchND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/conv1d_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/conv1d_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/conv1d_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/conv1d_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/conv1d_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/conv1d_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/conv1d_2/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/conv1d_2/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/conv1d_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "VALID"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/conv1d_2/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/conv1d_2/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/BatchToSpaceND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 32
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/BatchToSpaceND"
+  op: "BatchToSpaceND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/conv1d_2/Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/BatchToSpaceND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tcrops"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/BatchToSpaceND"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/gated_unit/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/gated_unit/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/gated_unit/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/gated_unit/split/split_dim"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/conv1d/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/gated_unit/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/gated_unit/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/gated_unit/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/gated_unit/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/gated_unit/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/gated_unit/Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/gated_unit/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_residual/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_14/mul_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/1x1_skip/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/Shape_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 64
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/filter_shape"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/stack"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "@\000\000\000@\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/stack_1"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/mod"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/dilation_rate"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/mod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/mod_1"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/add_2"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/mod_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/add_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/paddings/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/paddings"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/paddings/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/mod_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/crops/0/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/crops/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/crops/0/0"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/crops"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/crops/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/paddings"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/concat/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/concat"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/required_space_to_batch_paddings/crops"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/concat_1/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/concat_1"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/SpaceToBatchND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 64
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/SpaceToBatchND"
+  op: "SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/SpaceToBatchND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/conv1d_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/conv1d_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/conv1d_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/conv1d_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/conv1d_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/conv1d_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/conv1d_2/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/conv1d_2/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/conv1d_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "VALID"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/conv1d_2/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/conv1d_2/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/BatchToSpaceND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 64
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/BatchToSpaceND"
+  op: "BatchToSpaceND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/conv1d_2/Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/BatchToSpaceND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tcrops"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/BatchToSpaceND"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/gated_unit/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/gated_unit/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/gated_unit/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/gated_unit/split/split_dim"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/conv1d/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/gated_unit/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/gated_unit/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/gated_unit/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/gated_unit/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/gated_unit/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/gated_unit/Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/gated_unit/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_residual/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_15/mul_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/1x1_skip/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/Shape_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/filter_shape"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/stack"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/stack_1"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/mod"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/dilation_rate"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/mod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/mod_1"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/add_2"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/mod_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/add_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/paddings/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/paddings"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/paddings/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/mod_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/crops/0/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/crops/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/crops/0/0"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/crops"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/crops/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/paddings"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/concat/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/concat"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/required_space_to_batch_paddings/crops"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/concat_1/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/concat_1"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/SpaceToBatchND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/SpaceToBatchND"
+  op: "SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/SpaceToBatchND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/conv1d_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/conv1d_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/conv1d_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/conv1d_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/conv1d_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/conv1d_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/conv1d_2/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/conv1d_2/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/conv1d_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "VALID"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/conv1d_2/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/conv1d_2/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/BatchToSpaceND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 128
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/BatchToSpaceND"
+  op: "BatchToSpaceND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/conv1d_2/Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/BatchToSpaceND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tcrops"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/BatchToSpaceND"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/gated_unit/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/gated_unit/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/gated_unit/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/gated_unit/split/split_dim"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/conv1d/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/gated_unit/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/gated_unit/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/gated_unit/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/gated_unit/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/gated_unit/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/gated_unit/Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/gated_unit/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_residual/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_16/mul_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/1x1_skip/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/Shape_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 256
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/filter_shape"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/stack"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\001\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/stack_1"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/mod"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/dilation_rate"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/mod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/mod_1"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/add_2"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/mod_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/add_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/paddings/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/paddings"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/paddings/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/mod_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/crops/0/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/crops/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/crops/0/0"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/crops"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/crops/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/paddings"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/concat/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/concat"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/required_space_to_batch_paddings/crops"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/concat_1/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/concat_1"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/SpaceToBatchND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 256
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/SpaceToBatchND"
+  op: "SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/SpaceToBatchND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/conv1d_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/conv1d_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/conv1d_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/conv1d_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/conv1d_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/conv1d_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/conv1d_2/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/conv1d_2/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/conv1d_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "VALID"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/conv1d_2/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/conv1d_2/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/BatchToSpaceND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 256
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/BatchToSpaceND"
+  op: "BatchToSpaceND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/conv1d_2/Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/BatchToSpaceND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tcrops"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/BatchToSpaceND"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/gated_unit/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/gated_unit/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/gated_unit/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/gated_unit/split/split_dim"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/conv1d/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/gated_unit/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/gated_unit/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/gated_unit/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/gated_unit/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/gated_unit/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/gated_unit/Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/gated_unit/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_residual/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_17/mul_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/1x1_skip/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/Shape_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0721687823534
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 256
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 256
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 512
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/filter_shape"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\003\000\000\000\200\000\000\000\000\001\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/stack"
+  op: "Const"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\002\000\000\000\002\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/stack_1"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/mod"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/dilation_rate"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/mod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/mod_1"
+  op: "FloorMod"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/dilation_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/add_2"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/mod_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/add_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_3/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_3/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/paddings/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_2"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_3"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/paddings"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/paddings/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/mod_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_4/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_4/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/crops/0/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/crops/0"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/crops/0/0"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/strided_slice_4"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/crops"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/crops/0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/paddings"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/concat/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/concat"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice_2"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/required_space_to_batch_paddings/crops"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice_2/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice_2/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/concat_1/concat_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/concat_1"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/SpaceToBatchND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 512
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/SpaceToBatchND"
+  op: "SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/SpaceToBatchND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/conv1d_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/conv1d_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/SpaceToBatchND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/conv1d_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/conv1d_2/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/conv1d_2/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/conv1d_2/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 3
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/conv1d_2/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/conv1d_2/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/conv1d_2/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "VALID"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/conv1d_2/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/conv1d_2/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/BatchToSpaceND/block_shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 512
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/BatchToSpaceND"
+  op: "BatchToSpaceND"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/conv1d_2/Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/BatchToSpaceND/block_shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tblock_shape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tcrops"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/BatchToSpaceND"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 256
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/gated_unit/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/gated_unit/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/gated_unit/split"
+  op: "Split"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/gated_unit/split/split_dim"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/conv1d/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/gated_unit/Tanh"
+  op: "Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/gated_unit/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/gated_unit/Sigmoid"
+  op: "Sigmoid"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/gated_unit/split:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/gated_unit/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/gated_unit/Tanh"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/gated_unit/Sigmoid"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_residual/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/gated_unit/mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/add_1"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_18/mul_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/1x1_skip/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/Shape_1"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/strided_slice_1"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/Shape_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/strided_slice_1/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/strided_slice_1/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/strided_slice_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/sequence_length_mask_1/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/mul_1"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/add_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/dilation_layer_19/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\200\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.153093114495
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 128
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 128
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/Relu"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/1x1_output/BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/Relu"
+  op: "Relu"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_0/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel/Initializer/random_uniform/shape"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+        }
+        tensor_content: "\001\000\000\000\200\000\000\000\001\004\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel/Initializer/random_uniform/min"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -0.0721374824643
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel/Initializer/random_uniform/max"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0721374824643
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel/Initializer/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel/Initializer/random_uniform/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel/Initializer/random_uniform/sub"
+  op: "Sub"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel/Initializer/random_uniform/max"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel"
+      }
+    }
+  }
+
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel/Initializer/random_uniform/mul"
+  op: "Mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel/Initializer/random_uniform/RandomUniform"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel/Initializer/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel/Initializer/random_uniform"
+  op: "Add"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel/Initializer/random_uniform/mul"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel/Initializer/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 128
+        }
+        dim {
+          size: 1025
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel/Initializer/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/bias/Initializer/zeros"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 1025
+          }
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/bias"
+  op: "VariableV2"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: "local"
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1025
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/bias/Assign"
+  op: "Assign"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/bias"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/bias/Initializer/zeros"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "validate_shape"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/bias/read"
+  op: "Identity"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/bias"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/bias"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/dilation_rate"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/conv1d/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/conv1d/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/Relu"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/conv1d/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 128
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/conv1d/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/conv1d/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/kernel/read"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/conv1d/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: 128
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/conv1d/Conv2D"
+  op: "Conv2D"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/conv1d/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/conv1d/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    key: "dilations"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "padding"
+    value {
+      s: "SAME"
+    }
+  }
+  attr {
+    key: "strides"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  attr {
+    key: "use_cudnn_on_gpu"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/conv1d/Squeeze"
+  op: "Squeeze"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/conv1d/Conv2D"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "squeeze_dims"
+    value {
+      list {
+        i: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/BiasAdd"
+  op: "BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/conv1d/Squeeze"
+  input: "seq2seq/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/bias/read"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "data_format"
+    value {
+      s: "NHWC"
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/BiasAdd"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/sequence_length_mask/Cast"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/1x1_output/BiasAdd"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/ExpandDims"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/Shape"
+  input: "seq2seq/seq2seq_2/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/sequence_length_mask/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/sequence_length_mask/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/sequence_length_mask/range"
+  op: "Range"
+  input: "seq2seq/seq2seq_2/sequence_length_mask/range/start"
+  input: "seq2seq/seq2seq_2/strided_slice"
+  input: "seq2seq/seq2seq_2/sequence_length_mask/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/sequence_length_mask/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/sequence_length_mask/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/sequence_length_mask/range"
+  input: "seq2seq/seq2seq_2/sequence_length_mask/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/sequence_length_mask/Shape"
+  op: "Shape"
+  input: "seq2seq/seq2seq_1/attention_decoder/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/sequence_length_mask/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/sequence_length_mask/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/sequence_length_mask/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/sequence_length_mask/strided_slice"
+  op: "StridedSlice"
+  input: "seq2seq/seq2seq_2/sequence_length_mask/Shape"
+  input: "seq2seq/seq2seq_2/sequence_length_mask/strided_slice/stack"
+  input: "seq2seq/seq2seq_2/sequence_length_mask/strided_slice/stack_1"
+  input: "seq2seq/seq2seq_2/sequence_length_mask/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/sequence_length_mask/Tile/multiples/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/sequence_length_mask/Tile/multiples"
+  op: "Pack"
+  input: "seq2seq/seq2seq_2/sequence_length_mask/strided_slice"
+  input: "seq2seq/seq2seq_2/sequence_length_mask/Tile/multiples/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/sequence_length_mask/Tile"
+  op: "Tile"
+  input: "seq2seq/seq2seq_2/sequence_length_mask/ExpandDims"
+  input: "seq2seq/seq2seq_2/sequence_length_mask/Tile/multiples"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/sequence_length_mask/ExpandDims_1/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/sequence_length_mask/ExpandDims_1"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_1/attention_decoder/mul_1"
+  input: "seq2seq/seq2seq_2/sequence_length_mask/ExpandDims_1/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/sequence_length_mask/Less"
+  op: "Less"
+  input: "seq2seq/seq2seq_2/sequence_length_mask/Tile"
+  input: "seq2seq/seq2seq_2/sequence_length_mask/ExpandDims_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/sequence_length_mask/Cast"
+  op: "Cast"
+  input: "seq2seq/seq2seq_2/sequence_length_mask/Less"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/sub/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/sub"
+  op: "Sub"
+  input: "seq2seq/seq2seq_2/sub/x"
+  input: "seq2seq/seq2seq_2/sequence_length_mask/Cast"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "seq2seq/seq2seq_2/ExpandDims/dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/ExpandDims"
+  op: "ExpandDims"
+  input: "seq2seq/seq2seq_2/sub"
+  input: "seq2seq/seq2seq_2/ExpandDims/dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/mul"
+  op: "Mul"
+  input: "seq2seq/seq2seq_2/ExpandDims"
+  input: "Maximum_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "seq2seq/seq2seq_2/add"
+  op: "Add"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/dilated_conv1d_stack/output_layer_1/mul"
+  input: "seq2seq/seq2seq_2/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "attention_matrix"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/transpose_6"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "transcript_attention_comb_weights"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/transpose_6"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "attention_controller_output"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/attention_decoder/transpose_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "decoder_outputs"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 80
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "decoder_output_lengths"
+  op: "Identity"
+  input: "seq2seq/seq2seq_1/attention_decoder/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "postnet_outputs"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "postnet_output_lengths"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "synthesis_outputs"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "synthesis_output_lengths"
+  op: "Identity"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "Exp"
+  op: "Exp"
+  input: "seq2seq/seq2seq_2/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "pow_2/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.20000004768
+      }
+    }
+  }
+}
+node {
+  name: "pow_2"
+  op: "Pow"
+  input: "Exp"
+  input: "pow_2/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/Shape"
+  op: "Shape"
+  input: "pow_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "g_lim/random_uniform/min"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/random_uniform/max"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/random_uniform/RandomUniform"
+  op: "RandomUniform"
+  input: "g_lim/Shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/random_uniform/sub"
+  op: "Sub"
+  input: "g_lim/random_uniform/max"
+  input: "g_lim/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "g_lim/random_uniform/mul"
+  op: "Mul"
+  input: "g_lim/random_uniform/RandomUniform"
+  input: "g_lim/random_uniform/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/random_uniform"
+  op: "Add"
+  input: "g_lim/random_uniform/mul"
+  input: "g_lim/random_uniform/min"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/mul/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 2.0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/mul"
+  op: "Mul"
+  input: "g_lim/mul/x"
+  input: "g_lim/random_uniform"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/sub/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/sub"
+  op: "Sub"
+  input: "g_lim/mul"
+  input: "g_lim/sub/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/mul_1/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 3.14159274101
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/mul_1"
+  op: "Mul"
+  input: "g_lim/mul_1/x"
+  input: "g_lim/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/imag"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/Complex"
+  op: "Complex"
+  input: "pow_2"
+  input: "g_lim/imag"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tout"
+    value {
+      type: DT_COMPLEX64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/real"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/Complex_1"
+  op: "Complex"
+  input: "g_lim/real"
+  input: "g_lim/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tout"
+    value {
+      type: DT_COMPLEX64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/Exp"
+  op: "Exp"
+  input: "g_lim/Complex_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_COMPLEX64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/mul_2"
+  op: "Mul"
+  input: "g_lim/Complex"
+  input: "g_lim/Exp"
+  attr {
+    key: "T"
+    value {
+      type: DT_COMPLEX64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/Enter"
+  op: "Enter"
+  input: "g_lim/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "g_lim/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "g_lim/while/Enter_1"
+  op: "Enter"
+  input: "g_lim/mul_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_COMPLEX64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "g_lim/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "g_lim/while/Merge"
+  op: "Merge"
+  input: "g_lim/while/Enter"
+  input: "g_lim/while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/Merge_1"
+  op: "Merge"
+  input: "g_lim/while/Enter_1"
+  input: "g_lim/while/NextIteration_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_COMPLEX64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/Less/y"
+  op: "Const"
+  input: "^g_lim/while/Merge"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 99
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/Less"
+  op: "Less"
+  input: "g_lim/while/Merge"
+  input: "g_lim/while/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/LoopCond"
+  op: "LoopCond"
+  input: "g_lim/while/Less"
+
+}
+node {
+  name: "g_lim/while/Switch"
+  op: "Switch"
+  input: "g_lim/while/Merge"
+  input: "g_lim/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@g_lim/while/Merge"
+      }
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/Switch_1"
+  op: "Switch"
+  input: "g_lim/while/Merge_1"
+  input: "g_lim/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_COMPLEX64
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@g_lim/while/Merge_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/Identity"
+  op: "Identity"
+  input: "g_lim/while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/Identity_1"
+  op: "Identity"
+  input: "g_lim/while/Switch_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_COMPLEX64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/frame_length"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1200
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/frame_step"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 300
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/fft_length"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2048
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/irfft/packed"
+  op: "Pack"
+  input: "g_lim/while/gl_ifft_ola/fft_length"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/irfft"
+  op: "IRFFT"
+  input: "g_lim/while/Identity_1"
+  input: "g_lim/while/gl_ifft_ola/irfft/packed"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 2048
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/strided_slice/stack"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/strided_slice/stack_1"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\260\004\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/strided_slice/stack_2"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/strided_slice"
+  op: "StridedSlice"
+  input: "g_lim/while/gl_ifft_ola/irfft"
+  input: "g_lim/while/gl_ifft_ola/strided_slice/stack"
+  input: "g_lim/while/gl_ifft_ola/strided_slice/stack_1"
+  input: "g_lim/while/gl_ifft_ola/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/periodic"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_BOOL
+        tensor_shape {
+        }
+        bool_val: true
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/Cast"
+  op: "Cast"
+  input: "g_lim/while/gl_ifft_ola/hw/periodic"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/FloorMod/y"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/FloorMod"
+  op: "FloorMod"
+  input: "g_lim/while/gl_ifft_ola/frame_length"
+  input: "g_lim/while/gl_ifft_ola/hw/FloorMod/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/sub/x"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/sub"
+  op: "Sub"
+  input: "g_lim/while/gl_ifft_ola/hw/sub/x"
+  input: "g_lim/while/gl_ifft_ola/hw/FloorMod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/mul"
+  op: "Mul"
+  input: "g_lim/while/gl_ifft_ola/hw/Cast"
+  input: "g_lim/while/gl_ifft_ola/hw/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/add"
+  op: "Add"
+  input: "g_lim/while/gl_ifft_ola/frame_length"
+  input: "g_lim/while/gl_ifft_ola/hw/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/sub_1/y"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/sub_1"
+  op: "Sub"
+  input: "g_lim/while/gl_ifft_ola/hw/add"
+  input: "g_lim/while/gl_ifft_ola/hw/sub_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/Cast_1"
+  op: "Cast"
+  input: "g_lim/while/gl_ifft_ola/hw/sub_1"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/range/start"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/range/delta"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/range"
+  op: "Range"
+  input: "g_lim/while/gl_ifft_ola/hw/range/start"
+  input: "g_lim/while/gl_ifft_ola/frame_length"
+  input: "g_lim/while/gl_ifft_ola/hw/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/Cast_2"
+  op: "Cast"
+  input: "g_lim/while/gl_ifft_ola/hw/range"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/Const"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 6.28318548203
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/mul_1"
+  op: "Mul"
+  input: "g_lim/while/gl_ifft_ola/hw/Const"
+  input: "g_lim/while/gl_ifft_ola/hw/Cast_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/truediv"
+  op: "RealDiv"
+  input: "g_lim/while/gl_ifft_ola/hw/mul_1"
+  input: "g_lim/while/gl_ifft_ola/hw/Cast_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/Cos"
+  op: "Cos"
+  input: "g_lim/while/gl_ifft_ola/hw/truediv"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/mul_2/x"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.5
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/mul_2"
+  op: "Mul"
+  input: "g_lim/while/gl_ifft_ola/hw/mul_2/x"
+  input: "g_lim/while/gl_ifft_ola/hw/Cos"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/sub_2/x"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.5
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/hw/sub_2"
+  op: "Sub"
+  input: "g_lim/while/gl_ifft_ola/hw/sub_2/x"
+  input: "g_lim/while/gl_ifft_ola/hw/mul_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/mul"
+  op: "Mul"
+  input: "g_lim/while/gl_ifft_ola/strided_slice"
+  input: "g_lim/while/gl_ifft_ola/hw/sub_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/Shape"
+  op: "Shape"
+  input: "g_lim/while/gl_ifft_ola/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice/stack"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice/stack_1"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -2
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice/stack_2"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice"
+  op: "StridedSlice"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/Shape"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice/stack"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice/stack_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/Rank"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice_1/stack"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -2
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice_1/stack_1"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice_1/stack_2"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice_1"
+  op: "StridedSlice"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/Shape"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice_1/stack"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice_1/stack_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice_2/stack"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice_2/stack_1"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice_2/stack_2"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice_2"
+  op: "StridedSlice"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/Shape"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice_2/stack"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice_2/stack_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Enter"
+  op: "Enter"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Enter_1"
+  op: "Enter"
+  input: "g_lim/while/gl_ifft_ola/frame_step"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Merge"
+  op: "Merge"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Enter"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Merge_1"
+  op: "Merge"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Enter_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/NextIteration_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/zeros_like"
+  op: "Const"
+  input: "^g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Merge"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Greater"
+  op: "Greater"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Merge_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/zeros_like"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/LoopCond"
+  op: "LoopCond"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Greater"
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Switch"
+  op: "Switch"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Merge"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Merge"
+      }
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Switch_1"
+  op: "Switch"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Merge_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Merge_1"
+      }
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Identity"
+  op: "Identity"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Identity_1"
+  op: "Identity"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Switch_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/FloorMod"
+  op: "FloorMod"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Identity"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Identity_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/NextIteration"
+  op: "NextIteration"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Identity_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/NextIteration_1"
+  op: "NextIteration"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/FloorMod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Exit"
+  op: "Exit"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Exit_1"
+  op: "Exit"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Switch_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/floordiv"
+  op: "FloorDiv"
+  input: "g_lim/while/gl_ifft_ola/frame_step"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Exit"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/floordiv_1"
+  op: "FloorDiv"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice_2"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Exit"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/sub/y"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/sub"
+  op: "Sub"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/sub/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/mul"
+  op: "Mul"
+  input: "g_lim/while/gl_ifft_ola/frame_step"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/add"
+  op: "Add"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/mul"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/floordiv_2"
+  op: "FloorDiv"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/add"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Exit"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/concat/values_1/0"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/concat/values_1"
+  op: "Pack"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/concat/values_1/0"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/gcd/while/Exit"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/concat/axis"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/concat"
+  op: "ConcatV2"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/concat/values_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/Reshape"
+  op: "Reshape"
+  input: "g_lim/while/gl_ifft_ola/mul"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/k"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/Rank_1"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/range/start"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/range/delta"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/range"
+  op: "Range"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/range/start"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/Rank_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/sub_1"
+  op: "Sub"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/Rank_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/k"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/packed"
+  op: "Pack"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/sub_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/k"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/split/split_dim"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/split"
+  op: "SplitV"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/range"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/packed"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/split/split_dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tlen"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/concat_1/axis"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/concat_1"
+  op: "ConcatV2"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/split:1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/split"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/transpose"
+  op: "Transpose"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/Reshape"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/range_1/start"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/range_1/delta"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/range_1"
+  op: "Range"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/range_1/start"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/floordiv_2"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/range_1/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/axis"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Rank"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/range/start"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/range/delta"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/range"
+  op: "Range"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/range/start"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Rank"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/add/y"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/add"
+  op: "Add"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/axis"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/strided_slice/stack"
+  op: "Pack"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/axis"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/strided_slice/stack_1"
+  op: "Pack"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/add"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/strided_slice/stack_2"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/strided_slice"
+  op: "StridedSlice"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/range"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/strided_slice/stack"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/strided_slice/stack_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Shape"
+  op: "Shape"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/range_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/sub/y"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/sub"
+  op: "Sub"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Rank"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/sub/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/sub_1"
+  op: "Sub"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/sub"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/packed/1"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/packed"
+  op: "Pack"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/strided_slice"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/packed/1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/sub_1"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/split/split_dim"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/split"
+  op: "SplitV"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Shape"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/packed"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/split/split_dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tlen"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "num_split"
+    value {
+      i: 3
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Reshape/shape"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Reshape"
+  op: "Reshape"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/split:1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Size"
+  op: "Size"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Size_1"
+  op: "Size"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/split:2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/sub_2"
+  op: "Sub"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Reshape"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/floordiv_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/floordiv"
+  op: "FloorDiv"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/sub_2"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/floordiv"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/add_1/x"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/add_1"
+  op: "Add"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/add_1/x"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/floordiv"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Maximum/x"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Maximum"
+  op: "Maximum"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Maximum/x"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Enter"
+  op: "Enter"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/floordiv_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Enter_1"
+  op: "Enter"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/floordiv"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Merge"
+  op: "Merge"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Enter"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Merge_1"
+  op: "Merge"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Enter_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/NextIteration_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/zeros_like"
+  op: "Const"
+  input: "^g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Merge"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Greater"
+  op: "Greater"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Merge_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/zeros_like"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/LoopCond"
+  op: "LoopCond"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Greater"
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Switch"
+  op: "Switch"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Merge"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Merge"
+      }
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Switch_1"
+  op: "Switch"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Merge_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Merge_1"
+      }
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Identity"
+  op: "Identity"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Identity_1"
+  op: "Identity"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Switch_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/FloorMod"
+  op: "FloorMod"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Identity"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Identity_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/NextIteration"
+  op: "NextIteration"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Identity_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/NextIteration_1"
+  op: "NextIteration"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/FloorMod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Exit"
+  op: "Exit"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Exit_1"
+  op: "Exit"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Switch_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/floordiv_1"
+  op: "FloorDiv"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/floordiv_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Exit"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/floordiv_2"
+  op: "FloorDiv"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/floordiv"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Exit"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/floordiv_3"
+  op: "FloorDiv"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Reshape"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Exit"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/mul"
+  op: "Mul"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/floordiv_3"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Exit"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/concat/values_1"
+  op: "Pack"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/mul"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/concat/axis"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/concat"
+  op: "ConcatV2"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/split"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/concat/values_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/split:2"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/concat_1/values_1"
+  op: "Pack"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/floordiv_3"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/gcd/while/Exit"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/concat_1/axis"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/concat_1"
+  op: "ConcatV2"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/split"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/concat_1/values_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/split:2"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/zeros_like"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/ones_like/Shape"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/ones_like/Const"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/ones_like"
+  op: "Fill"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/ones_like/Shape"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/ones_like/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/StridedSlice"
+  op: "StridedSlice"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/range_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/zeros_like"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/concat"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/ones_like"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Reshape_1"
+  op: "Reshape"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/StridedSlice"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/range_1/start"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/range_1/delta"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/range_1"
+  op: "Range"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/range_1/start"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Maximum"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/range_1/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/mul_1"
+  op: "Mul"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/range_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/floordiv_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Reshape_2/shape/1"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Reshape_2/shape"
+  op: "Pack"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Maximum"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Reshape_2/shape/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Reshape_2"
+  op: "Reshape"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/mul_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Reshape_2/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/range_2/start"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/range_2/delta"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/range_2"
+  op: "Range"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/range_2/start"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/floordiv_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/range_2/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Reshape_3/shape/0"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Reshape_3/shape"
+  op: "Pack"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Reshape_3/shape/0"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/floordiv_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Reshape_3"
+  op: "Reshape"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/range_2"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Reshape_3/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/add_2"
+  op: "Add"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Reshape_2"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Reshape_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/GatherV2"
+  op: "GatherV2"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Reshape_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/add_2"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/strided_slice"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/concat_2/values_1"
+  op: "Pack"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Maximum"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/floordiv_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/concat_2/axis"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/concat_2"
+  op: "ConcatV2"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/split"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/concat_2/values_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/split:2"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Reshape_4"
+  op: "Reshape"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/GatherV2"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/concat_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/Reshape_1/shape"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/Reshape_1"
+  op: "Reshape"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/frame/Reshape_4"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/Reshape_1/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/UnsortedSegmentSum"
+  op: "UnsortedSegmentSum"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/transpose"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/Reshape_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/floordiv_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tnumsegments"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/concat_2/values_1"
+  op: "Pack"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/add"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/concat_2/axis"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/concat_2"
+  op: "ConcatV2"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/strided_slice"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/concat_2/values_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/sub_2/y"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/sub_2"
+  op: "Sub"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/Rank"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/sub_2/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/Rank_2"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/range_2/start"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/range_2/delta"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/range_2"
+  op: "Range"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/range_2/start"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/Rank_2"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/range_2/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/sub_3"
+  op: "Sub"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/Rank_2"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/sub_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/packed_1"
+  op: "Pack"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/sub_3"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/sub_2"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/split_1/split_dim"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/split_1"
+  op: "SplitV"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/range_2"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/packed_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/split_1/split_dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tlen"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/concat_3/axis"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/concat_3"
+  op: "ConcatV2"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/split_1:1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/split_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/concat_3/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/transpose_1"
+  op: "Transpose"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/UnsortedSegmentSum"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/concat_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_ifft_ola/overlap_and_add/Reshape_2"
+  op: "Reshape"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/transpose_1"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/concat_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame_length"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1200
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame_step"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 300
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/fft_length"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2048
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/axis"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/Rank"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/range/start"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/range/delta"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/range"
+  op: "Range"
+  input: "g_lim/while/gl_stft/frame/range/start"
+  input: "g_lim/while/gl_stft/frame/Rank"
+  input: "g_lim/while/gl_stft/frame/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/add/y"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/add"
+  op: "Add"
+  input: "g_lim/while/gl_stft/frame/axis"
+  input: "g_lim/while/gl_stft/frame/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/strided_slice/stack"
+  op: "Pack"
+  input: "g_lim/while/gl_stft/frame/axis"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/strided_slice/stack_1"
+  op: "Pack"
+  input: "g_lim/while/gl_stft/frame/add"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/strided_slice/stack_2"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/strided_slice"
+  op: "StridedSlice"
+  input: "g_lim/while/gl_stft/frame/range"
+  input: "g_lim/while/gl_stft/frame/strided_slice/stack"
+  input: "g_lim/while/gl_stft/frame/strided_slice/stack_1"
+  input: "g_lim/while/gl_stft/frame/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/Shape"
+  op: "Shape"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/Reshape_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/sub/y"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/sub"
+  op: "Sub"
+  input: "g_lim/while/gl_stft/frame/Rank"
+  input: "g_lim/while/gl_stft/frame/sub/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/sub_1"
+  op: "Sub"
+  input: "g_lim/while/gl_stft/frame/sub"
+  input: "g_lim/while/gl_stft/frame/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/packed/1"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/packed"
+  op: "Pack"
+  input: "g_lim/while/gl_stft/frame/strided_slice"
+  input: "g_lim/while/gl_stft/frame/packed/1"
+  input: "g_lim/while/gl_stft/frame/sub_1"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/split/split_dim"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/split"
+  op: "SplitV"
+  input: "g_lim/while/gl_stft/frame/Shape"
+  input: "g_lim/while/gl_stft/frame/packed"
+  input: "g_lim/while/gl_stft/frame/split/split_dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tlen"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "num_split"
+    value {
+      i: 3
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/Reshape/shape"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/Reshape"
+  op: "Reshape"
+  input: "g_lim/while/gl_stft/frame/split:1"
+  input: "g_lim/while/gl_stft/frame/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/Size"
+  op: "Size"
+  input: "g_lim/while/gl_stft/frame/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/Size_1"
+  op: "Size"
+  input: "g_lim/while/gl_stft/frame/split:2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/sub_2"
+  op: "Sub"
+  input: "g_lim/while/gl_stft/frame/Reshape"
+  input: "g_lim/while/gl_stft/frame_length"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/floordiv"
+  op: "FloorDiv"
+  input: "g_lim/while/gl_stft/frame/sub_2"
+  input: "g_lim/while/gl_stft/frame_step"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/add_1/x"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/add_1"
+  op: "Add"
+  input: "g_lim/while/gl_stft/frame/add_1/x"
+  input: "g_lim/while/gl_stft/frame/floordiv"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/Maximum/x"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/Maximum"
+  op: "Maximum"
+  input: "g_lim/while/gl_stft/frame/Maximum/x"
+  input: "g_lim/while/gl_stft/frame/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/gcd/Const"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 300
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/floordiv_1"
+  op: "FloorDiv"
+  input: "g_lim/while/gl_stft/frame_length"
+  input: "g_lim/while/gl_stft/frame/gcd/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/floordiv_2"
+  op: "FloorDiv"
+  input: "g_lim/while/gl_stft/frame_step"
+  input: "g_lim/while/gl_stft/frame/gcd/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/floordiv_3"
+  op: "FloorDiv"
+  input: "g_lim/while/gl_stft/frame/Reshape"
+  input: "g_lim/while/gl_stft/frame/gcd/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/mul"
+  op: "Mul"
+  input: "g_lim/while/gl_stft/frame/floordiv_3"
+  input: "g_lim/while/gl_stft/frame/gcd/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/concat/values_1"
+  op: "Pack"
+  input: "g_lim/while/gl_stft/frame/mul"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/concat/axis"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/concat"
+  op: "ConcatV2"
+  input: "g_lim/while/gl_stft/frame/split"
+  input: "g_lim/while/gl_stft/frame/concat/values_1"
+  input: "g_lim/while/gl_stft/frame/split:2"
+  input: "g_lim/while/gl_stft/frame/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/concat_1/values_1"
+  op: "Pack"
+  input: "g_lim/while/gl_stft/frame/floordiv_3"
+  input: "g_lim/while/gl_stft/frame/gcd/Const"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/concat_1/axis"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/concat_1"
+  op: "ConcatV2"
+  input: "g_lim/while/gl_stft/frame/split"
+  input: "g_lim/while/gl_stft/frame/concat_1/values_1"
+  input: "g_lim/while/gl_stft/frame/split:2"
+  input: "g_lim/while/gl_stft/frame/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/zeros_like"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/ones_like/Shape"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/ones_like/Const"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/ones_like"
+  op: "Fill"
+  input: "g_lim/while/gl_stft/frame/ones_like/Shape"
+  input: "g_lim/while/gl_stft/frame/ones_like/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/StridedSlice"
+  op: "StridedSlice"
+  input: "g_lim/while/gl_ifft_ola/overlap_and_add/Reshape_2"
+  input: "g_lim/while/gl_stft/frame/zeros_like"
+  input: "g_lim/while/gl_stft/frame/concat"
+  input: "g_lim/while/gl_stft/frame/ones_like"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/Reshape_1"
+  op: "Reshape"
+  input: "g_lim/while/gl_stft/frame/StridedSlice"
+  input: "g_lim/while/gl_stft/frame/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/range_1/start"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/range_1/delta"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/range_1"
+  op: "Range"
+  input: "g_lim/while/gl_stft/frame/range_1/start"
+  input: "g_lim/while/gl_stft/frame/Maximum"
+  input: "g_lim/while/gl_stft/frame/range_1/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/mul_1"
+  op: "Mul"
+  input: "g_lim/while/gl_stft/frame/range_1"
+  input: "g_lim/while/gl_stft/frame/floordiv_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/Reshape_2/shape/1"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/Reshape_2/shape"
+  op: "Pack"
+  input: "g_lim/while/gl_stft/frame/Maximum"
+  input: "g_lim/while/gl_stft/frame/Reshape_2/shape/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/Reshape_2"
+  op: "Reshape"
+  input: "g_lim/while/gl_stft/frame/mul_1"
+  input: "g_lim/while/gl_stft/frame/Reshape_2/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/range_2/start"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/range_2/delta"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/range_2"
+  op: "Range"
+  input: "g_lim/while/gl_stft/frame/range_2/start"
+  input: "g_lim/while/gl_stft/frame/floordiv_1"
+  input: "g_lim/while/gl_stft/frame/range_2/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/Reshape_3/shape/0"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/Reshape_3/shape"
+  op: "Pack"
+  input: "g_lim/while/gl_stft/frame/Reshape_3/shape/0"
+  input: "g_lim/while/gl_stft/frame/floordiv_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/Reshape_3"
+  op: "Reshape"
+  input: "g_lim/while/gl_stft/frame/range_2"
+  input: "g_lim/while/gl_stft/frame/Reshape_3/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/add_2"
+  op: "Add"
+  input: "g_lim/while/gl_stft/frame/Reshape_2"
+  input: "g_lim/while/gl_stft/frame/Reshape_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/GatherV2"
+  op: "GatherV2"
+  input: "g_lim/while/gl_stft/frame/Reshape_1"
+  input: "g_lim/while/gl_stft/frame/add_2"
+  input: "g_lim/while/gl_stft/frame/strided_slice"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/concat_2/values_1"
+  op: "Pack"
+  input: "g_lim/while/gl_stft/frame/Maximum"
+  input: "g_lim/while/gl_stft/frame_length"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/concat_2/axis"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/frame/concat_2"
+  op: "ConcatV2"
+  input: "g_lim/while/gl_stft/frame/split"
+  input: "g_lim/while/gl_stft/frame/concat_2/values_1"
+  input: "g_lim/while/gl_stft/frame/split:2"
+  input: "g_lim/while/gl_stft/frame/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/frame/Reshape_4"
+  op: "Reshape"
+  input: "g_lim/while/gl_stft/frame/GatherV2"
+  input: "g_lim/while/gl_stft/frame/concat_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/hw/periodic"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_BOOL
+        tensor_shape {
+        }
+        bool_val: true
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/hw/Cast"
+  op: "Cast"
+  input: "g_lim/while/gl_stft/hw/periodic"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/hw/FloorMod/y"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/hw/FloorMod"
+  op: "FloorMod"
+  input: "g_lim/while/gl_stft/frame_length"
+  input: "g_lim/while/gl_stft/hw/FloorMod/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/hw/sub/x"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/hw/sub"
+  op: "Sub"
+  input: "g_lim/while/gl_stft/hw/sub/x"
+  input: "g_lim/while/gl_stft/hw/FloorMod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/hw/mul"
+  op: "Mul"
+  input: "g_lim/while/gl_stft/hw/Cast"
+  input: "g_lim/while/gl_stft/hw/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/hw/add"
+  op: "Add"
+  input: "g_lim/while/gl_stft/frame_length"
+  input: "g_lim/while/gl_stft/hw/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/hw/sub_1/y"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/hw/sub_1"
+  op: "Sub"
+  input: "g_lim/while/gl_stft/hw/add"
+  input: "g_lim/while/gl_stft/hw/sub_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/hw/Cast_1"
+  op: "Cast"
+  input: "g_lim/while/gl_stft/hw/sub_1"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/gl_stft/hw/range/start"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/hw/range/delta"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/hw/range"
+  op: "Range"
+  input: "g_lim/while/gl_stft/hw/range/start"
+  input: "g_lim/while/gl_stft/frame_length"
+  input: "g_lim/while/gl_stft/hw/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/hw/Cast_2"
+  op: "Cast"
+  input: "g_lim/while/gl_stft/hw/range"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/hw/Const"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 6.28318548203
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/hw/mul_1"
+  op: "Mul"
+  input: "g_lim/while/gl_stft/hw/Const"
+  input: "g_lim/while/gl_stft/hw/Cast_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/hw/truediv"
+  op: "RealDiv"
+  input: "g_lim/while/gl_stft/hw/mul_1"
+  input: "g_lim/while/gl_stft/hw/Cast_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/hw/Cos"
+  op: "Cos"
+  input: "g_lim/while/gl_stft/hw/truediv"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/hw/mul_2/x"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.5
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/hw/mul_2"
+  op: "Mul"
+  input: "g_lim/while/gl_stft/hw/mul_2/x"
+  input: "g_lim/while/gl_stft/hw/Cos"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/hw/sub_2/x"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.5
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/hw/sub_2"
+  op: "Sub"
+  input: "g_lim/while/gl_stft/hw/sub_2/x"
+  input: "g_lim/while/gl_stft/hw/mul_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/mul"
+  op: "Mul"
+  input: "g_lim/while/gl_stft/frame/Reshape_4"
+  input: "g_lim/while/gl_stft/hw/sub_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/rfft/packed"
+  op: "Pack"
+  input: "g_lim/while/gl_stft/fft_length"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/rfft/Pad/paddings"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 3
+          }
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000P\003\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/rfft/Pad"
+  op: "Pad"
+  input: "g_lim/while/gl_stft/mul"
+  input: "g_lim/while/gl_stft/rfft/Pad/paddings"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tpaddings"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 2048
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/gl_stft/rfft"
+  op: "RFFT"
+  input: "g_lim/while/gl_stft/rfft/Pad"
+  input: "g_lim/while/gl_stft/rfft/packed"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/Imag"
+  op: "Imag"
+  input: "g_lim/while/gl_stft/rfft"
+  attr {
+    key: "T"
+    value {
+      type: DT_COMPLEX64
+    }
+  }
+  attr {
+    key: "Tout"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/Real"
+  op: "Real"
+  input: "g_lim/while/gl_stft/rfft"
+  attr {
+    key: "T"
+    value {
+      type: DT_COMPLEX64
+    }
+  }
+  attr {
+    key: "Tout"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/div"
+  op: "RealDiv"
+  input: "g_lim/while/Imag"
+  input: "g_lim/while/Real"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/Atan"
+  op: "Atan"
+  input: "g_lim/while/atan2/div"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/Less/y"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/Less"
+  op: "Less"
+  input: "g_lim/while/Real"
+  input: "g_lim/while/atan2/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/Equal/y"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/Equal"
+  op: "Equal"
+  input: "g_lim/while/Real"
+  input: "g_lim/while/atan2/Equal/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/Less_1/y"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/Less_1"
+  op: "Less"
+  input: "g_lim/while/Imag"
+  input: "g_lim/while/atan2/Less_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/ones_like/Shape"
+  op: "Shape"
+  input: "g_lim/while/Real"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/ones_like/Const"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/ones_like"
+  op: "Fill"
+  input: "g_lim/while/atan2/ones_like/Shape"
+  input: "g_lim/while/atan2/ones_like/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/zeros_like"
+  op: "ZerosLike"
+  input: "g_lim/while/Real"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/mul/x"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: nan
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/mul"
+  op: "Mul"
+  input: "g_lim/while/atan2/mul/x"
+  input: "g_lim/while/atan2/ones_like"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/Greater/y"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/Greater"
+  op: "Greater"
+  input: "g_lim/while/Real"
+  input: "g_lim/while/atan2/Greater/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/Select"
+  op: "Select"
+  input: "g_lim/while/atan2/Greater"
+  input: "g_lim/while/atan2/Atan"
+  input: "g_lim/while/atan2/zeros_like"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/GreaterEqual/y"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/GreaterEqual"
+  op: "GreaterEqual"
+  input: "g_lim/while/Imag"
+  input: "g_lim/while/atan2/GreaterEqual/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/LogicalAnd"
+  op: "LogicalAnd"
+  input: "g_lim/while/atan2/Less"
+  input: "g_lim/while/atan2/GreaterEqual"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/add/y"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 3.14159274101
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/add"
+  op: "Add"
+  input: "g_lim/while/atan2/Atan"
+  input: "g_lim/while/atan2/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/Select_1"
+  op: "Select"
+  input: "g_lim/while/atan2/LogicalAnd"
+  input: "g_lim/while/atan2/add"
+  input: "g_lim/while/atan2/Select"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/LogicalAnd_1"
+  op: "LogicalAnd"
+  input: "g_lim/while/atan2/Less"
+  input: "g_lim/while/atan2/Less_1"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/sub/y"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 3.14159274101
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/sub"
+  op: "Sub"
+  input: "g_lim/while/atan2/Atan"
+  input: "g_lim/while/atan2/sub/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/Select_2"
+  op: "Select"
+  input: "g_lim/while/atan2/LogicalAnd_1"
+  input: "g_lim/while/atan2/sub"
+  input: "g_lim/while/atan2/Select_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/Greater_1/y"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/Greater_1"
+  op: "Greater"
+  input: "g_lim/while/Imag"
+  input: "g_lim/while/atan2/Greater_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/LogicalAnd_2"
+  op: "LogicalAnd"
+  input: "g_lim/while/atan2/Equal"
+  input: "g_lim/while/atan2/Greater_1"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/mul_1/x"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.57079637051
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/mul_1"
+  op: "Mul"
+  input: "g_lim/while/atan2/mul_1/x"
+  input: "g_lim/while/atan2/ones_like"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/Select_3"
+  op: "Select"
+  input: "g_lim/while/atan2/LogicalAnd_2"
+  input: "g_lim/while/atan2/mul_1"
+  input: "g_lim/while/atan2/Select_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/LogicalAnd_3"
+  op: "LogicalAnd"
+  input: "g_lim/while/atan2/Equal"
+  input: "g_lim/while/atan2/Less_1"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/mul_2/x"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: -1.57079637051
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/mul_2"
+  op: "Mul"
+  input: "g_lim/while/atan2/mul_2/x"
+  input: "g_lim/while/atan2/ones_like"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/Select_4"
+  op: "Select"
+  input: "g_lim/while/atan2/LogicalAnd_3"
+  input: "g_lim/while/atan2/mul_2"
+  input: "g_lim/while/atan2/Select_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/IsNan"
+  op: "IsNan"
+  input: "g_lim/while/Real"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/IsNan_1"
+  op: "IsNan"
+  input: "g_lim/while/Imag"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/LogicalOr"
+  op: "LogicalOr"
+  input: "g_lim/while/atan2/IsNan"
+  input: "g_lim/while/atan2/IsNan_1"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/atan2/Select_5"
+  op: "Select"
+  input: "g_lim/while/atan2/LogicalOr"
+  input: "g_lim/while/atan2/mul"
+  input: "g_lim/while/atan2/Select_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/imag"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/Complex"
+  op: "Complex"
+  input: "g_lim/while/Complex/Enter"
+  input: "g_lim/while/imag"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tout"
+    value {
+      type: DT_COMPLEX64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/Complex/Enter"
+  op: "Enter"
+  input: "pow_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "g_lim/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "g_lim/while/real"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/Complex_1"
+  op: "Complex"
+  input: "g_lim/while/real"
+  input: "g_lim/while/atan2/Select_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tout"
+    value {
+      type: DT_COMPLEX64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/Exp"
+  op: "Exp"
+  input: "g_lim/while/Complex_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_COMPLEX64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/mul"
+  op: "Mul"
+  input: "g_lim/while/Complex"
+  input: "g_lim/while/Exp"
+  attr {
+    key: "T"
+    value {
+      type: DT_COMPLEX64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/add/y"
+  op: "Const"
+  input: "^g_lim/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/add"
+  op: "Add"
+  input: "g_lim/while/Identity"
+  input: "g_lim/while/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/NextIteration"
+  op: "NextIteration"
+  input: "g_lim/while/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/NextIteration_1"
+  op: "NextIteration"
+  input: "g_lim/while/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_COMPLEX64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "g_lim/while/Exit"
+  op: "Exit"
+  input: "g_lim/while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "g_lim/while/Exit_1"
+  op: "Exit"
+  input: "g_lim/while/Switch_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_COMPLEX64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1025
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/frame_length"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1200
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/frame_step"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 300
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/fft_length"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2048
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/irfft/packed"
+  op: "Pack"
+  input: "inverse_stft/fft_length"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "inverse_stft/irfft"
+  op: "IRFFT"
+  input: "g_lim/while/Exit_1"
+  input: "inverse_stft/irfft/packed"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 2048
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\260\004\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/strided_slice"
+  op: "StridedSlice"
+  input: "inverse_stft/irfft"
+  input: "inverse_stft/strided_slice/stack"
+  input: "inverse_stft/strided_slice/stack_1"
+  input: "inverse_stft/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "begin_mask"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "inverse_stft/hw/periodic"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_BOOL
+        tensor_shape {
+        }
+        bool_val: true
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/hw/Cast"
+  op: "Cast"
+  input: "inverse_stft/hw/periodic"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_BOOL
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/hw/FloorMod/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/hw/FloorMod"
+  op: "FloorMod"
+  input: "inverse_stft/frame_length"
+  input: "inverse_stft/hw/FloorMod/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/hw/sub/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/hw/sub"
+  op: "Sub"
+  input: "inverse_stft/hw/sub/x"
+  input: "inverse_stft/hw/FloorMod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/hw/mul"
+  op: "Mul"
+  input: "inverse_stft/hw/Cast"
+  input: "inverse_stft/hw/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/hw/add"
+  op: "Add"
+  input: "inverse_stft/frame_length"
+  input: "inverse_stft/hw/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/hw/sub_1/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/hw/sub_1"
+  op: "Sub"
+  input: "inverse_stft/hw/add"
+  input: "inverse_stft/hw/sub_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/hw/Cast_1"
+  op: "Cast"
+  input: "inverse_stft/hw/sub_1"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/hw/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/hw/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/hw/range"
+  op: "Range"
+  input: "inverse_stft/hw/range/start"
+  input: "inverse_stft/frame_length"
+  input: "inverse_stft/hw/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/hw/Cast_2"
+  op: "Cast"
+  input: "inverse_stft/hw/range"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/hw/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 6.28318548203
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/hw/mul_1"
+  op: "Mul"
+  input: "inverse_stft/hw/Const"
+  input: "inverse_stft/hw/Cast_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/hw/truediv"
+  op: "RealDiv"
+  input: "inverse_stft/hw/mul_1"
+  input: "inverse_stft/hw/Cast_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/hw/Cos"
+  op: "Cos"
+  input: "inverse_stft/hw/truediv"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/hw/mul_2/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.5
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/hw/mul_2"
+  op: "Mul"
+  input: "inverse_stft/hw/mul_2/x"
+  input: "inverse_stft/hw/Cos"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/hw/sub_2/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.5
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/hw/sub_2"
+  op: "Sub"
+  input: "inverse_stft/hw/sub_2/x"
+  input: "inverse_stft/hw/mul_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/mul"
+  op: "Mul"
+  input: "inverse_stft/strided_slice"
+  input: "inverse_stft/hw/sub_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: 1200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/Shape"
+  op: "Shape"
+  input: "inverse_stft/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -2
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/strided_slice"
+  op: "StridedSlice"
+  input: "inverse_stft/overlap_and_add/Shape"
+  input: "inverse_stft/overlap_and_add/strided_slice/stack"
+  input: "inverse_stft/overlap_and_add/strided_slice/stack_1"
+  input: "inverse_stft/overlap_and_add/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/Rank"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -2
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/strided_slice_1"
+  op: "StridedSlice"
+  input: "inverse_stft/overlap_and_add/Shape"
+  input: "inverse_stft/overlap_and_add/strided_slice_1/stack"
+  input: "inverse_stft/overlap_and_add/strided_slice_1/stack_1"
+  input: "inverse_stft/overlap_and_add/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/strided_slice_2"
+  op: "StridedSlice"
+  input: "inverse_stft/overlap_and_add/Shape"
+  input: "inverse_stft/overlap_and_add/strided_slice_2/stack"
+  input: "inverse_stft/overlap_and_add/strided_slice_2/stack_1"
+  input: "inverse_stft/overlap_and_add/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/gcd/while/Enter"
+  op: "Enter"
+  input: "inverse_stft/overlap_and_add/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "inverse_stft/overlap_and_add/gcd/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/gcd/while/Enter_1"
+  op: "Enter"
+  input: "inverse_stft/frame_step"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "inverse_stft/overlap_and_add/gcd/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/gcd/while/Merge"
+  op: "Merge"
+  input: "inverse_stft/overlap_and_add/gcd/while/Enter"
+  input: "inverse_stft/overlap_and_add/gcd/while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/gcd/while/Merge_1"
+  op: "Merge"
+  input: "inverse_stft/overlap_and_add/gcd/while/Enter_1"
+  input: "inverse_stft/overlap_and_add/gcd/while/NextIteration_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/gcd/while/zeros_like"
+  op: "Const"
+  input: "^inverse_stft/overlap_and_add/gcd/while/Merge"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/gcd/while/Greater"
+  op: "Greater"
+  input: "inverse_stft/overlap_and_add/gcd/while/Merge_1"
+  input: "inverse_stft/overlap_and_add/gcd/while/zeros_like"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/gcd/while/LoopCond"
+  op: "LoopCond"
+  input: "inverse_stft/overlap_and_add/gcd/while/Greater"
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/gcd/while/Switch"
+  op: "Switch"
+  input: "inverse_stft/overlap_and_add/gcd/while/Merge"
+  input: "inverse_stft/overlap_and_add/gcd/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@inverse_stft/overlap_and_add/gcd/while/Merge"
+      }
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/gcd/while/Switch_1"
+  op: "Switch"
+  input: "inverse_stft/overlap_and_add/gcd/while/Merge_1"
+  input: "inverse_stft/overlap_and_add/gcd/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@inverse_stft/overlap_and_add/gcd/while/Merge_1"
+      }
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/gcd/while/Identity"
+  op: "Identity"
+  input: "inverse_stft/overlap_and_add/gcd/while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/gcd/while/Identity_1"
+  op: "Identity"
+  input: "inverse_stft/overlap_and_add/gcd/while/Switch_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/gcd/while/FloorMod"
+  op: "FloorMod"
+  input: "inverse_stft/overlap_and_add/gcd/while/Identity"
+  input: "inverse_stft/overlap_and_add/gcd/while/Identity_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/gcd/while/NextIteration"
+  op: "NextIteration"
+  input: "inverse_stft/overlap_and_add/gcd/while/Identity_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/gcd/while/NextIteration_1"
+  op: "NextIteration"
+  input: "inverse_stft/overlap_and_add/gcd/while/FloorMod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/gcd/while/Exit"
+  op: "Exit"
+  input: "inverse_stft/overlap_and_add/gcd/while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/gcd/while/Exit_1"
+  op: "Exit"
+  input: "inverse_stft/overlap_and_add/gcd/while/Switch_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/floordiv"
+  op: "FloorDiv"
+  input: "inverse_stft/frame_step"
+  input: "inverse_stft/overlap_and_add/gcd/while/Exit"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/floordiv_1"
+  op: "FloorDiv"
+  input: "inverse_stft/overlap_and_add/strided_slice_2"
+  input: "inverse_stft/overlap_and_add/gcd/while/Exit"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/sub/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/sub"
+  op: "Sub"
+  input: "inverse_stft/overlap_and_add/strided_slice_1"
+  input: "inverse_stft/overlap_and_add/sub/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/mul"
+  op: "Mul"
+  input: "inverse_stft/frame_step"
+  input: "inverse_stft/overlap_and_add/sub"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/add"
+  op: "Add"
+  input: "inverse_stft/overlap_and_add/mul"
+  input: "inverse_stft/overlap_and_add/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/floordiv_2"
+  op: "FloorDiv"
+  input: "inverse_stft/overlap_and_add/add"
+  input: "inverse_stft/overlap_and_add/gcd/while/Exit"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/concat/values_1/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/concat/values_1"
+  op: "Pack"
+  input: "inverse_stft/overlap_and_add/concat/values_1/0"
+  input: "inverse_stft/overlap_and_add/gcd/while/Exit"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/concat"
+  op: "ConcatV2"
+  input: "inverse_stft/overlap_and_add/strided_slice"
+  input: "inverse_stft/overlap_and_add/concat/values_1"
+  input: "inverse_stft/overlap_and_add/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/Reshape"
+  op: "Reshape"
+  input: "inverse_stft/mul"
+  input: "inverse_stft/overlap_and_add/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/k"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/Rank_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/range"
+  op: "Range"
+  input: "inverse_stft/overlap_and_add/range/start"
+  input: "inverse_stft/overlap_and_add/Rank_1"
+  input: "inverse_stft/overlap_and_add/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/sub_1"
+  op: "Sub"
+  input: "inverse_stft/overlap_and_add/Rank_1"
+  input: "inverse_stft/overlap_and_add/k"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/packed"
+  op: "Pack"
+  input: "inverse_stft/overlap_and_add/sub_1"
+  input: "inverse_stft/overlap_and_add/k"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/split"
+  op: "SplitV"
+  input: "inverse_stft/overlap_and_add/range"
+  input: "inverse_stft/overlap_and_add/packed"
+  input: "inverse_stft/overlap_and_add/split/split_dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tlen"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/concat_1/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/concat_1"
+  op: "ConcatV2"
+  input: "inverse_stft/overlap_and_add/split:1"
+  input: "inverse_stft/overlap_and_add/split"
+  input: "inverse_stft/overlap_and_add/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/transpose"
+  op: "Transpose"
+  input: "inverse_stft/overlap_and_add/Reshape"
+  input: "inverse_stft/overlap_and_add/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/range_1/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/range_1/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/range_1"
+  op: "Range"
+  input: "inverse_stft/overlap_and_add/range_1/start"
+  input: "inverse_stft/overlap_and_add/floordiv_2"
+  input: "inverse_stft/overlap_and_add/range_1/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/Rank"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/range/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/range/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/range"
+  op: "Range"
+  input: "inverse_stft/overlap_and_add/frame/range/start"
+  input: "inverse_stft/overlap_and_add/frame/Rank"
+  input: "inverse_stft/overlap_and_add/frame/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/add/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/add"
+  op: "Add"
+  input: "inverse_stft/overlap_and_add/frame/axis"
+  input: "inverse_stft/overlap_and_add/frame/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/strided_slice/stack"
+  op: "Pack"
+  input: "inverse_stft/overlap_and_add/frame/axis"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/strided_slice/stack_1"
+  op: "Pack"
+  input: "inverse_stft/overlap_and_add/frame/add"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/strided_slice"
+  op: "StridedSlice"
+  input: "inverse_stft/overlap_and_add/frame/range"
+  input: "inverse_stft/overlap_and_add/frame/strided_slice/stack"
+  input: "inverse_stft/overlap_and_add/frame/strided_slice/stack_1"
+  input: "inverse_stft/overlap_and_add/frame/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/Shape"
+  op: "Shape"
+  input: "inverse_stft/overlap_and_add/range_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/sub/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/sub"
+  op: "Sub"
+  input: "inverse_stft/overlap_and_add/frame/Rank"
+  input: "inverse_stft/overlap_and_add/frame/sub/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/sub_1"
+  op: "Sub"
+  input: "inverse_stft/overlap_and_add/frame/sub"
+  input: "inverse_stft/overlap_and_add/frame/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/packed/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/packed"
+  op: "Pack"
+  input: "inverse_stft/overlap_and_add/frame/strided_slice"
+  input: "inverse_stft/overlap_and_add/frame/packed/1"
+  input: "inverse_stft/overlap_and_add/frame/sub_1"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/split/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/split"
+  op: "SplitV"
+  input: "inverse_stft/overlap_and_add/frame/Shape"
+  input: "inverse_stft/overlap_and_add/frame/packed"
+  input: "inverse_stft/overlap_and_add/frame/split/split_dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tlen"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "num_split"
+    value {
+      i: 3
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/Reshape/shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/Reshape"
+  op: "Reshape"
+  input: "inverse_stft/overlap_and_add/frame/split:1"
+  input: "inverse_stft/overlap_and_add/frame/Reshape/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/Size"
+  op: "Size"
+  input: "inverse_stft/overlap_and_add/frame/split"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/Size_1"
+  op: "Size"
+  input: "inverse_stft/overlap_and_add/frame/split:2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/sub_2"
+  op: "Sub"
+  input: "inverse_stft/overlap_and_add/frame/Reshape"
+  input: "inverse_stft/overlap_and_add/floordiv_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/floordiv"
+  op: "FloorDiv"
+  input: "inverse_stft/overlap_and_add/frame/sub_2"
+  input: "inverse_stft/overlap_and_add/floordiv"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/add_1/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/add_1"
+  op: "Add"
+  input: "inverse_stft/overlap_and_add/frame/add_1/x"
+  input: "inverse_stft/overlap_and_add/frame/floordiv"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/Maximum/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/Maximum"
+  op: "Maximum"
+  input: "inverse_stft/overlap_and_add/frame/Maximum/x"
+  input: "inverse_stft/overlap_and_add/frame/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/gcd/while/Enter"
+  op: "Enter"
+  input: "inverse_stft/overlap_and_add/floordiv_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "inverse_stft/overlap_and_add/frame/gcd/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/gcd/while/Enter_1"
+  op: "Enter"
+  input: "inverse_stft/overlap_and_add/floordiv"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "inverse_stft/overlap_and_add/frame/gcd/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/gcd/while/Merge"
+  op: "Merge"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/Enter"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/gcd/while/Merge_1"
+  op: "Merge"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/Enter_1"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/NextIteration_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/gcd/while/zeros_like"
+  op: "Const"
+  input: "^inverse_stft/overlap_and_add/frame/gcd/while/Merge"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/gcd/while/Greater"
+  op: "Greater"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/Merge_1"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/zeros_like"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/gcd/while/LoopCond"
+  op: "LoopCond"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/Greater"
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/gcd/while/Switch"
+  op: "Switch"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/Merge"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@inverse_stft/overlap_and_add/frame/gcd/while/Merge"
+      }
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/gcd/while/Switch_1"
+  op: "Switch"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/Merge_1"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@inverse_stft/overlap_and_add/frame/gcd/while/Merge_1"
+      }
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/gcd/while/Identity"
+  op: "Identity"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/gcd/while/Identity_1"
+  op: "Identity"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/Switch_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/gcd/while/FloorMod"
+  op: "FloorMod"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/Identity"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/Identity_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/gcd/while/NextIteration"
+  op: "NextIteration"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/Identity_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/gcd/while/NextIteration_1"
+  op: "NextIteration"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/FloorMod"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/gcd/while/Exit"
+  op: "Exit"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/gcd/while/Exit_1"
+  op: "Exit"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/Switch_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/floordiv_1"
+  op: "FloorDiv"
+  input: "inverse_stft/overlap_and_add/floordiv_1"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/Exit"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/floordiv_2"
+  op: "FloorDiv"
+  input: "inverse_stft/overlap_and_add/floordiv"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/Exit"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/floordiv_3"
+  op: "FloorDiv"
+  input: "inverse_stft/overlap_and_add/frame/Reshape"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/Exit"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/mul"
+  op: "Mul"
+  input: "inverse_stft/overlap_and_add/frame/floordiv_3"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/Exit"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/concat/values_1"
+  op: "Pack"
+  input: "inverse_stft/overlap_and_add/frame/mul"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/concat/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/concat"
+  op: "ConcatV2"
+  input: "inverse_stft/overlap_and_add/frame/split"
+  input: "inverse_stft/overlap_and_add/frame/concat/values_1"
+  input: "inverse_stft/overlap_and_add/frame/split:2"
+  input: "inverse_stft/overlap_and_add/frame/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/concat_1/values_1"
+  op: "Pack"
+  input: "inverse_stft/overlap_and_add/frame/floordiv_3"
+  input: "inverse_stft/overlap_and_add/frame/gcd/while/Exit"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/concat_1/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/concat_1"
+  op: "ConcatV2"
+  input: "inverse_stft/overlap_and_add/frame/split"
+  input: "inverse_stft/overlap_and_add/frame/concat_1/values_1"
+  input: "inverse_stft/overlap_and_add/frame/split:2"
+  input: "inverse_stft/overlap_and_add/frame/concat_1/axis"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/zeros_like"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/ones_like/Shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/ones_like/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/ones_like"
+  op: "Fill"
+  input: "inverse_stft/overlap_and_add/frame/ones_like/Shape"
+  input: "inverse_stft/overlap_and_add/frame/ones_like/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "index_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/StridedSlice"
+  op: "StridedSlice"
+  input: "inverse_stft/overlap_and_add/range_1"
+  input: "inverse_stft/overlap_and_add/frame/zeros_like"
+  input: "inverse_stft/overlap_and_add/frame/concat"
+  input: "inverse_stft/overlap_and_add/frame/ones_like"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/Reshape_1"
+  op: "Reshape"
+  input: "inverse_stft/overlap_and_add/frame/StridedSlice"
+  input: "inverse_stft/overlap_and_add/frame/concat_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/range_1/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/range_1/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/range_1"
+  op: "Range"
+  input: "inverse_stft/overlap_and_add/frame/range_1/start"
+  input: "inverse_stft/overlap_and_add/frame/Maximum"
+  input: "inverse_stft/overlap_and_add/frame/range_1/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/mul_1"
+  op: "Mul"
+  input: "inverse_stft/overlap_and_add/frame/range_1"
+  input: "inverse_stft/overlap_and_add/frame/floordiv_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/Reshape_2/shape/1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/Reshape_2/shape"
+  op: "Pack"
+  input: "inverse_stft/overlap_and_add/frame/Maximum"
+  input: "inverse_stft/overlap_and_add/frame/Reshape_2/shape/1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/Reshape_2"
+  op: "Reshape"
+  input: "inverse_stft/overlap_and_add/frame/mul_1"
+  input: "inverse_stft/overlap_and_add/frame/Reshape_2/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/range_2/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/range_2/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/range_2"
+  op: "Range"
+  input: "inverse_stft/overlap_and_add/frame/range_2/start"
+  input: "inverse_stft/overlap_and_add/frame/floordiv_1"
+  input: "inverse_stft/overlap_and_add/frame/range_2/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/Reshape_3/shape/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/Reshape_3/shape"
+  op: "Pack"
+  input: "inverse_stft/overlap_and_add/frame/Reshape_3/shape/0"
+  input: "inverse_stft/overlap_and_add/frame/floordiv_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/Reshape_3"
+  op: "Reshape"
+  input: "inverse_stft/overlap_and_add/frame/range_2"
+  input: "inverse_stft/overlap_and_add/frame/Reshape_3/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/add_2"
+  op: "Add"
+  input: "inverse_stft/overlap_and_add/frame/Reshape_2"
+  input: "inverse_stft/overlap_and_add/frame/Reshape_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/GatherV2"
+  op: "GatherV2"
+  input: "inverse_stft/overlap_and_add/frame/Reshape_1"
+  input: "inverse_stft/overlap_and_add/frame/add_2"
+  input: "inverse_stft/overlap_and_add/frame/strided_slice"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/concat_2/values_1"
+  op: "Pack"
+  input: "inverse_stft/overlap_and_add/frame/Maximum"
+  input: "inverse_stft/overlap_and_add/floordiv_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/concat_2/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/concat_2"
+  op: "ConcatV2"
+  input: "inverse_stft/overlap_and_add/frame/split"
+  input: "inverse_stft/overlap_and_add/frame/concat_2/values_1"
+  input: "inverse_stft/overlap_and_add/frame/split:2"
+  input: "inverse_stft/overlap_and_add/frame/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/frame/Reshape_4"
+  op: "Reshape"
+  input: "inverse_stft/overlap_and_add/frame/GatherV2"
+  input: "inverse_stft/overlap_and_add/frame/concat_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/Reshape_1/shape"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/Reshape_1"
+  op: "Reshape"
+  input: "inverse_stft/overlap_and_add/frame/Reshape_4"
+  input: "inverse_stft/overlap_and_add/Reshape_1/shape"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/UnsortedSegmentSum"
+  op: "UnsortedSegmentSum"
+  input: "inverse_stft/overlap_and_add/transpose"
+  input: "inverse_stft/overlap_and_add/Reshape_1"
+  input: "inverse_stft/overlap_and_add/floordiv_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tnumsegments"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/concat_2/values_1"
+  op: "Pack"
+  input: "inverse_stft/overlap_and_add/add"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/concat_2/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/concat_2"
+  op: "ConcatV2"
+  input: "inverse_stft/overlap_and_add/strided_slice"
+  input: "inverse_stft/overlap_and_add/concat_2/values_1"
+  input: "inverse_stft/overlap_and_add/concat_2/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/sub_2/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/sub_2"
+  op: "Sub"
+  input: "inverse_stft/overlap_and_add/Rank"
+  input: "inverse_stft/overlap_and_add/sub_2/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/Rank_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/range_2/start"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/range_2/delta"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/range_2"
+  op: "Range"
+  input: "inverse_stft/overlap_and_add/range_2/start"
+  input: "inverse_stft/overlap_and_add/Rank_2"
+  input: "inverse_stft/overlap_and_add/range_2/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 3
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/sub_3"
+  op: "Sub"
+  input: "inverse_stft/overlap_and_add/Rank_2"
+  input: "inverse_stft/overlap_and_add/sub_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/packed_1"
+  op: "Pack"
+  input: "inverse_stft/overlap_and_add/sub_3"
+  input: "inverse_stft/overlap_and_add/sub_2"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/split_1/split_dim"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/split_1"
+  op: "SplitV"
+  input: "inverse_stft/overlap_and_add/range_2"
+  input: "inverse_stft/overlap_and_add/packed_1"
+  input: "inverse_stft/overlap_and_add/split_1/split_dim"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tlen"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "num_split"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/concat_3/axis"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/concat_3"
+  op: "ConcatV2"
+  input: "inverse_stft/overlap_and_add/split_1:1"
+  input: "inverse_stft/overlap_and_add/split_1"
+  input: "inverse_stft/overlap_and_add/concat_3/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "inverse_stft/overlap_and_add/transpose_1"
+  op: "Transpose"
+  input: "inverse_stft/overlap_and_add/UnsortedSegmentSum"
+  input: "inverse_stft/overlap_and_add/concat_3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "inverse_stft/overlap_and_add/Reshape_2"
+  op: "Reshape"
+  input: "inverse_stft/overlap_and_add/transpose_1"
+  input: "inverse_stft/overlap_and_add/concat_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/Shape"
+  op: "Shape"
+  input: "inverse_stft/overlap_and_add/Reshape_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "out_type"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "de_emphasis/strided_slice/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/strided_slice/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/strided_slice/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/strided_slice"
+  op: "StridedSlice"
+  input: "de_emphasis/Shape"
+  input: "de_emphasis/strided_slice/stack"
+  input: "de_emphasis/strided_slice/stack_1"
+  input: "de_emphasis/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 1
+    }
+  }
+}
+node {
+  name: "de_emphasis/TensorArray"
+  op: "TensorArrayV3"
+  input: "de_emphasis/strided_slice"
+
+  attr {
+    key: "clear_after_read"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "dynamic_size"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  attr {
+    key: "identical_element_shapes"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "tensor_array_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "de_emphasis/strided_slice_1/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/strided_slice_1/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/strided_slice_1/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/strided_slice_1"
+  op: "StridedSlice"
+  input: "inverse_stft/overlap_and_add/Reshape_2"
+  input: "de_emphasis/strided_slice_1/stack"
+  input: "de_emphasis/strided_slice_1/stack_1"
+  input: "de_emphasis/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "de_emphasis/TensorArrayWrite/TensorArrayWriteV3/index"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@de_emphasis/strided_slice_1"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/TensorArrayWrite/TensorArrayWriteV3"
+  op: "TensorArrayWriteV3"
+  input: "de_emphasis/TensorArray"
+  input: "de_emphasis/TensorArrayWrite/TensorArrayWriteV3/index"
+  input: "de_emphasis/strided_slice_1"
+  input: "de_emphasis/TensorArray:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@de_emphasis/strided_slice_1"
+      }
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/Const"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/strided_slice_2/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/strided_slice_2/stack_1"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/strided_slice_2/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/strided_slice_2"
+  op: "StridedSlice"
+  input: "inverse_stft/overlap_and_add/Reshape_2"
+  input: "de_emphasis/strided_slice_2/stack"
+  input: "de_emphasis/strided_slice_2/stack_1"
+  input: "de_emphasis/strided_slice_2/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/Enter"
+  op: "Enter"
+  input: "de_emphasis/Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "de_emphasis/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/Enter_1"
+  op: "Enter"
+  input: "de_emphasis/TensorArrayWrite/TensorArrayWriteV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "de_emphasis/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/Enter_2"
+  op: "Enter"
+  input: "de_emphasis/strided_slice_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "de_emphasis/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/Merge"
+  op: "Merge"
+  input: "de_emphasis/while/Enter"
+  input: "de_emphasis/while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/while/Merge_1"
+  op: "Merge"
+  input: "de_emphasis/while/Enter_1"
+  input: "de_emphasis/while/NextIteration_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/while/Merge_2"
+  op: "Merge"
+  input: "de_emphasis/while/Enter_2"
+  input: "de_emphasis/while/NextIteration_2"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/Less"
+  op: "Less"
+  input: "de_emphasis/while/Merge"
+  input: "de_emphasis/while/Less/Enter"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/while/Less/Enter"
+  op: "Enter"
+  input: "de_emphasis/strided_slice"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "de_emphasis/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/LoopCond"
+  op: "LoopCond"
+  input: "de_emphasis/while/Less"
+
+}
+node {
+  name: "de_emphasis/while/Switch"
+  op: "Switch"
+  input: "de_emphasis/while/Merge"
+  input: "de_emphasis/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@de_emphasis/while/Merge"
+      }
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/while/Switch_1"
+  op: "Switch"
+  input: "de_emphasis/while/Merge_1"
+  input: "de_emphasis/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@de_emphasis/while/Merge_1"
+      }
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/while/Switch_2"
+  op: "Switch"
+  input: "de_emphasis/while/Merge_2"
+  input: "de_emphasis/while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@de_emphasis/while/Merge_2"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/Identity"
+  op: "Identity"
+  input: "de_emphasis/while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/while/Identity_1"
+  op: "Identity"
+  input: "de_emphasis/while/Switch_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/while/Identity_2"
+  op: "Identity"
+  input: "de_emphasis/while/Switch_2:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/while/add/y"
+  op: "Const"
+  input: "^de_emphasis/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/add"
+  op: "Add"
+  input: "de_emphasis/while/Identity"
+  input: "de_emphasis/while/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/while/strided_slice/stack/0"
+  op: "Const"
+  input: "^de_emphasis/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/strided_slice/stack"
+  op: "Pack"
+  input: "de_emphasis/while/strided_slice/stack/0"
+  input: "de_emphasis/while/Identity"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/strided_slice/stack_1/0"
+  op: "Const"
+  input: "^de_emphasis/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/strided_slice/stack_1"
+  op: "Pack"
+  input: "de_emphasis/while/strided_slice/stack_1/0"
+  input: "de_emphasis/while/add"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/strided_slice/stack_2"
+  op: "Const"
+  input: "^de_emphasis/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/strided_slice"
+  op: "StridedSlice"
+  input: "de_emphasis/while/strided_slice/Enter"
+  input: "de_emphasis/while/strided_slice/stack"
+  input: "de_emphasis/while/strided_slice/stack_1"
+  input: "de_emphasis/while/strided_slice/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/strided_slice/Enter"
+  op: "Enter"
+  input: "inverse_stft/overlap_and_add/Reshape_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "de_emphasis/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/mul/x"
+  op: "Const"
+  input: "^de_emphasis/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.97000002861
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/mul"
+  op: "Mul"
+  input: "de_emphasis/while/mul/x"
+  input: "de_emphasis/while/Identity_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/while/add_1"
+  op: "Add"
+  input: "de_emphasis/while/strided_slice"
+  input: "de_emphasis/while/mul"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/while/add_2/y"
+  op: "Const"
+  input: "^de_emphasis/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/add_2"
+  op: "Add"
+  input: "de_emphasis/while/Identity"
+  input: "de_emphasis/while/add_2/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/while/strided_slice_1/stack/0"
+  op: "Const"
+  input: "^de_emphasis/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/strided_slice_1/stack"
+  op: "Pack"
+  input: "de_emphasis/while/strided_slice_1/stack/0"
+  input: "de_emphasis/while/Identity"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/strided_slice_1/stack_1/0"
+  op: "Const"
+  input: "^de_emphasis/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/strided_slice_1/stack_1"
+  op: "Pack"
+  input: "de_emphasis/while/strided_slice_1/stack_1/0"
+  input: "de_emphasis/while/add_2"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/strided_slice_1/stack_2"
+  op: "Const"
+  input: "^de_emphasis/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/strided_slice_1"
+  op: "StridedSlice"
+  input: "de_emphasis/while/strided_slice/Enter"
+  input: "de_emphasis/while/strided_slice_1/stack"
+  input: "de_emphasis/while/strided_slice_1/stack_1"
+  input: "de_emphasis/while/strided_slice_1/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 2
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/mul_1/x"
+  op: "Const"
+  input: "^de_emphasis/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.97000002861
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/mul_1"
+  op: "Mul"
+  input: "de_emphasis/while/mul_1/x"
+  input: "de_emphasis/while/Identity_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/while/add_3"
+  op: "Add"
+  input: "de_emphasis/while/strided_slice_1"
+  input: "de_emphasis/while/mul_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/while/TensorArrayWrite/TensorArrayWriteV3"
+  op: "TensorArrayWriteV3"
+  input: "de_emphasis/while/TensorArrayWrite/TensorArrayWriteV3/Enter"
+  input: "de_emphasis/while/Identity"
+  input: "de_emphasis/while/add_3"
+  input: "de_emphasis/while/Identity_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@de_emphasis/strided_slice_1"
+      }
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/while/TensorArrayWrite/TensorArrayWriteV3/Enter"
+  op: "Enter"
+  input: "de_emphasis/TensorArray"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@de_emphasis/strided_slice_1"
+      }
+    }
+  }
+
+  attr {
+    key: "frame_name"
+    value {
+      s: "de_emphasis/while/while_context"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/add_4/y"
+  op: "Const"
+  input: "^de_emphasis/while/Identity"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/while/add_4"
+  op: "Add"
+  input: "de_emphasis/while/Identity"
+  input: "de_emphasis/while/add_4/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/while/NextIteration"
+  op: "NextIteration"
+  input: "de_emphasis/while/add_4"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/while/NextIteration_1"
+  op: "NextIteration"
+  input: "de_emphasis/while/TensorArrayWrite/TensorArrayWriteV3"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/while/NextIteration_2"
+  op: "NextIteration"
+  input: "de_emphasis/while/add_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/while/Exit"
+  op: "Exit"
+  input: "de_emphasis/while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/while/Exit_1"
+  op: "Exit"
+  input: "de_emphasis/while/Switch_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/while/Exit_2"
+  op: "Exit"
+  input: "de_emphasis/while/Switch_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/TensorArrayStack/TensorArraySizeV3"
+  op: "TensorArraySizeV3"
+  input: "de_emphasis/TensorArray"
+  input: "de_emphasis/while/Exit_1"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@de_emphasis/TensorArray"
+      }
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/TensorArrayStack/range/start"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@de_emphasis/TensorArray"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/TensorArrayStack/range/delta"
+  op: "Const"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@de_emphasis/TensorArray"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/TensorArrayStack/range"
+  op: "Range"
+  input: "de_emphasis/TensorArrayStack/range/start"
+  input: "de_emphasis/TensorArrayStack/TensorArraySizeV3"
+  input: "de_emphasis/TensorArrayStack/range/delta"
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@de_emphasis/TensorArray"
+      }
+    }
+  }
+
+}
+node {
+  name: "de_emphasis/TensorArrayStack/TensorArrayGatherV3"
+  op: "TensorArrayGatherV3"
+  input: "de_emphasis/TensorArray"
+  input: "de_emphasis/TensorArrayStack/range"
+  input: "de_emphasis/while/Exit_1"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@de_emphasis/TensorArray"
+      }
+    }
+  }
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "element_shape"
+    value {
+      shape {
+        dim {
+          size: -1
+        }
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/transpose/perm"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "de_emphasis/transpose"
+  op: "Transpose"
+  input: "de_emphasis/TensorArrayStack/TensorArrayGatherV3"
+  input: "de_emphasis/transpose/perm"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tperm"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "sub_1/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "sub_1"
+  op: "Sub"
+  input: "seq2seq/seq2seq_2/convert_to_lin_specgram/StopGradient_1"
+  input: "sub_1/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "mul_2/y"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 300
+      }
+    }
+  }
+}
+node {
+  name: "mul_2"
+  op: "Mul"
+  input: "sub_1"
+  input: "mul_2/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "add_5/x"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1200
+      }
+    }
+  }
+}
+node {
+  name: "add_5"
+  op: "Add"
+  input: "add_5/x"
+  input: "mul_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+}
+node {
+  name: "decoder_reconstruction_lengths"
+  op: "Identity"
+  input: "add_5"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "Const_13"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Max_1"
+  op: "Max"
+  input: "decoder_reconstruction_lengths"
+  input: "Const_13"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "strided_slice_8/stack"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\000\000\000\000\000\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_8/stack_1/0"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_8/stack_1"
+  op: "Pack"
+  input: "strided_slice_8/stack_1/0"
+  input: "Max_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "strided_slice_8/stack_2"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\001\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "strided_slice_8"
+  op: "StridedSlice"
+  input: "de_emphasis/transpose"
+  input: "strided_slice_8/stack"
+  input: "strided_slice_8/stack_1"
+  input: "strided_slice_8/stack_2"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+  attr {
+    key: "begin_mask"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "ellipsis_mask"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "end_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "new_axis_mask"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shrink_axis_mask"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "decoder_reconstruction"
+  op: "Identity"
+  input: "strided_slice_8"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "decoder_reconstruction_sample_rate"
+  op: "Identity"
+  input: "decoder_target_sample_rate"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "Abs_3"
+  op: "Abs"
+  input: "decoder_reconstruction"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+
+}
+node {
+  name: "Max_2/reduction_indices"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Max_2"
+  op: "Max"
+  input: "Abs_3"
+  input: "Max_2/reduction_indices"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+
+  attr {
+    key: "keep_dims"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "decoder_reconstruction_normalized"
+  op: "RealDiv"
+  input: "decoder_reconstruction"
+  input: "Max_2"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "decoder_reconstruction_1/tag"
+  op: "Const"
+
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+        }
+        string_val: "decoder_reconstruction_1"
+      }
+    }
+  }
+}
+node {
+  name: "decoder_reconstruction_1"
+  op: "AudioSummaryV2"
+  input: "decoder_reconstruction_1/tag"
+  input: "decoder_reconstruction_normalized"
+  input: "decoder_reconstruction_sample_rate"
+
+  attr {
+    key: "max_outputs"
+    value {
+      i: 5
+    }
+  }
+}
+node {
+  name: "Merge/MergeSummary"
+  op: "MergeSummary"
+  input: "decoder_input_sample_prob"
+  input: "seq2seq/seq2seq_1/attention_decoder/comb_weights"
+  input: "decoder_reconstruction_1"
+  attr {
+    key: "N"
+    value {
+      i: 3
+    }
+  }
+}
+
+versions {
+  producer: 24
+}
diff --git a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
index 8fd1801863ad9aadd6e9f1bbde4b90600189d77c..ea4320687af366ccdd82e46cf28adf4ee9c100c0 100644
--- a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
@@ -117,8 +117,6 @@ Status MeasuringCostEstimator::PredictCosts(const GraphDef& optimized_graph,
     LOG(ERROR) << "Failed to measure graph performance: "
                << status.error_message();
     costs->execution_time = Costs::Duration::max();
-    costs->max_execution_time = Costs::Duration::max();
-    costs->min_execution_time = 0;
     return status;
   }
 
@@ -126,8 +124,6 @@ Status MeasuringCostEstimator::PredictCosts(const GraphDef& optimized_graph,
   // to filter out outliers.
   RobustStats stats(times);
   costs->execution_time = Costs::Duration(stats.mean());
-  costs->max_execution_time = Costs::Duration(stats.hi());
-  costs->min_execution_time = Costs::Duration(stats.lo());
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/costs/op_context.h b/tensorflow/core/grappler/costs/op_context.h
index 735a1e68ea6e30adff297d29f6a9c86111ef7507..6391de4a91ead5032013b3c9143ebcfc9f929901 100644
--- a/tensorflow/core/grappler/costs/op_context.h
+++ b/tensorflow/core/grappler/costs/op_context.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_OP_CONTEXT_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_OP_CONTEXT_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_OP_CONTEXT_H_
+#define TENSORFLOW_CORE_GRAPPLER_COSTS_OP_CONTEXT_H_
 
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
@@ -36,4 +36,4 @@ struct OpContext {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_OP_CONTEXT_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_OP_CONTEXT_H_
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index b1e04ceec827763b7c5019e432a3f90aa49d522f..a57cfdd9891b1d654092f9b896af248fa40eb88f 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -25,11 +25,13 @@ namespace tensorflow {
 namespace grappler {
 
 constexpr int kOpsPerMac = 2;
+constexpr char kConst[] = "Const";
 constexpr char kConv2d[] = "Conv2D";
 constexpr char kConv2dBackpropFilter[] = "Conv2DBackpropFilter";
 constexpr char kConv2dBackpropInput[] = "Conv2DBackpropInput";
 constexpr char kMatMul[] = "MatMul";
 constexpr char kSparseMatMul[] = "SparseMatMul";
+constexpr char kPlaceholder[] = "Placeholder";
 constexpr char kIdentity[] = "Identity";
 constexpr char kRefIdentity[] = "RefIdentity";
 constexpr char kNoOp[] = "NoOp";
@@ -45,6 +47,8 @@ constexpr char kSize[] = "Size";
 constexpr char kStopGradient[] = "StopGradient";
 constexpr char kPreventGradient[] = "PreventGradient";
 
+static const Costs::Duration kMinComputeTime(1);
+
 namespace {
 
 string GetDataFormat(const OpInfo& op_features) {
@@ -159,17 +163,23 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
        wrap(&OpLevelCostEstimator::PredictConv2DBackpropInput)},
       {kMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kSparseMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
-      {kIdentity, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kRefIdentity, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kStopGradient, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kPreventGradient, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kNoOp, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kReshape, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kRecv, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kSend, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kVariable, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kVariableV2, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kBatchMatMul, wrap(&OpLevelCostEstimator::PredictBatchMatMul)},
+
+      {kNoOp, wrap(&OpLevelCostEstimator::PredictNoOp)},
+
+      {kPlaceholder, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kIdentity, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kRefIdentity, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kStopGradient, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kPreventGradient, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kReshape, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kRecv, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kSend, wrap(&OpLevelCostEstimator::PredictIdentity)},
+
+      {kConst, wrap(&OpLevelCostEstimator::PredictVariable)},
+      {kVariable, wrap(&OpLevelCostEstimator::PredictVariable)},
+      {kVariableV2, wrap(&OpLevelCostEstimator::PredictVariable)},
+
       {kRank, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kShape, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kSize, wrap(&OpLevelCostEstimator::PredictMetadata)}};
@@ -221,6 +231,8 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
                      Eigen::internal::scalar_square_op<float>>::Cost},
       {"Tanh", Eigen::internal::functor_traits<
                    Eigen::internal::scalar_tanh_op<float>>::Cost},
+      {"Relu", Eigen::internal::functor_traits<
+                   Eigen::internal::scalar_max_op<float>>::Cost},
       {"Sigmoid", Eigen::internal::functor_traits<
                       Eigen::internal::scalar_sigmoid_op<float>>::Cost},
       {"Sign", Eigen::internal::functor_traits<
@@ -283,8 +295,10 @@ Costs OpLevelCostEstimator::PredictCosts(const OpContext& op_context) const {
     if (elementwise_ops_.find(op_features.op()) != elementwise_ops_.end()) {
       return PredictCwiseOp(op_context);
     }
-    VLOG(1) << "Missing implementation for op: " << op_features.op();
-    return DummyExecutionTime(op_context);
+
+    VLOG(1) << "Missing accurate estimator for op: " << op_features.op();
+
+    return PredictCostOfAnUnknownOp(op_context);
   }
 
   std::function<Costs(const OpContext&)> estimator = it->second;
@@ -339,6 +353,9 @@ OpLevelCostEstimator::DeviceInfo OpLevelCostEstimator::GetDeviceInfo(
   VLOG(1) << "Device: " << device.type() << " gflops: " << gflops
           << " gb_per_sec: " << gb_per_sec;
 
+  DCHECK_LT(0, gflops) << device.DebugString();
+  DCHECK_LT(0, gb_per_sec) << device.DebugString();
+
   return {gflops, gb_per_sec};
 }
 
@@ -366,19 +383,27 @@ Costs OpLevelCostEstimator::PredictCwiseOp(const OpContext& op_context) const {
   }
 
   int op_cost = 1;
+  bool is_known_elementwise_op = false;
   auto it = elementwise_ops_.find(op_features.op());
   if (it != elementwise_ops_.end()) {
     op_cost = it->second;
+    is_known_elementwise_op = true;
+  } else {
+    LOG(WARNING) << "Not a cwise op: " << op_features.op();
   }
+
   Costs costs = PredictOpCountBasedCost(op_count * op_cost, op_features);
-  costs.inaccurate = found_unknown_shapes;
+  if (found_unknown_shapes || !is_known_elementwise_op) {
+    costs.inaccurate = true;
+  }
   return costs;
 }
 
-Costs OpLevelCostEstimator::DummyExecutionTime(
+Costs OpLevelCostEstimator::PredictCostOfAnUnknownOp(
     const OpContext& op_context) const {
-  // Use CwiseOp time as an estimation
-  auto costs = PredictCwiseOp(op_context);
+  // Don't assume the operation is cwise, return cost based on input/output size
+  // and admit that it is inaccurate...
+  auto costs = PredictOpCountBasedCost(0, op_context.op_info);
   costs.inaccurate = true;
   return costs;
 }
@@ -386,16 +411,22 @@ Costs OpLevelCostEstimator::DummyExecutionTime(
 Costs OpLevelCostEstimator::PredictOpCountBasedCost(
     double operations, const OpInfo& op_features) const {
   DeviceInfo device_perf = GetDeviceInfo(op_features.device());
+  if (device_perf.gigaops <= 0 || device_perf.gb_per_sec <= 0) {
+    VLOG(1) << "BAD DEVICE. Op:" << op_features.op()
+            << " device type:" << op_features.device().type()
+            << " device model:" << op_features.device().model();
+  }
+
   Costs::NanoSeconds compute_cost(std::ceil(operations / device_perf.gigaops));
   VLOG(1) << "Op:" << op_features.op() << " GOps:" << operations / 1e9
           << " Execution Time (ns):" << compute_cost.count();
 
   bool found_unknown_shapes = false;
-  double total_input_size =
+  const double total_input_size =
       CalculateInputSize(op_features, &found_unknown_shapes);
-  double total_output_size =
+  const double total_output_size =
       CalculateOutputSize(op_features, &found_unknown_shapes);
-  double total_io_size = total_input_size + total_output_size;
+  const double total_io_size = total_input_size + total_output_size;
 
   Costs::NanoSeconds memory_cost(
       std::ceil(total_io_size / device_perf.gb_per_sec));
@@ -411,6 +442,7 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost(
     costs.execution_time = compute_cost + memory_cost;
   }
   costs.inaccurate = found_unknown_shapes;
+  costs.max_memory = total_output_size;
   return costs;
 }
 
@@ -425,10 +457,15 @@ OpLevelCostEstimator::ConvolutionDimensionsFromInputs(
     const TensorShapeProto& original_image_shape,
     const TensorShapeProto& original_filter_shape, const OpInfo& op_features,
     bool* found_unknown_shapes) {
+  VLOG(2) << "op features: " << op_features.DebugString();
+  VLOG(2) << "Original image shape: " << original_image_shape.DebugString();
+  VLOG(2) << "Original filter shape: " << original_filter_shape.DebugString();
   auto image_shape =
       MaybeGetMinimumShape(original_image_shape, 4, found_unknown_shapes);
   auto filter_shape =
       MaybeGetMinimumShape(original_filter_shape, 4, found_unknown_shapes);
+  VLOG(2) << "Image shape: " << image_shape.DebugString();
+  VLOG(2) << "Filter shape: " << filter_shape.DebugString();
 
   int x_index, y_index, channel_index;
   const string& data_format = GetDataFormat(op_features);
@@ -687,18 +724,35 @@ int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
     bool* found_unknown_shapes) const {
   int64 ops = 0;
 
-  if (op_features.op() != kConv2dBackpropInput) {
-    LOG(ERROR) << "Invalid Operation";
+  DCHECK_EQ(kConv2dBackpropInput, op_features.op());
+
+  if (op_features.inputs_size() < 2) {
+    *found_unknown_shapes = true;
     return ops;
   }
 
-  if (op_features.outputs_size() != 1) {
-    // Need _output_shapes for input shape.
-    LOG(ERROR) << "No output shape in Conv2DBackpropInput op.";
-    return ops;
+  TensorShapeProto input_shape;
+  if (op_features.inputs(0).has_value()) {
+    const TensorProto& value = op_features.inputs(0).value();
+    if (value.int64_val_size() > 0) {
+      for (int i = 0; i < value.int64_val_size(); ++i) {
+        input_shape.add_dim()->set_size(value.int64_val(i));
+      }
+    } else {
+      for (int i = 0; i < value.int_val_size(); ++i) {
+        input_shape.add_dim()->set_size(value.int_val(i));
+      }
+    }
+  } else if (op_features.outputs_size() == 1) {
+    input_shape = op_features.outputs(0).shape();
+  } else {
+    // Set the minimum filter size that's feasible.
+    for (int i = 0; i < 4; ++i) {
+      input_shape.add_dim()->set_size(1);
+    }
+    *found_unknown_shapes = true;
   }
 
-  const auto& input_shape = op_features.outputs(0).shape();
   ConvolutionDimensions conv_dims = ConvolutionDimensionsFromInputs(
       input_shape, op_features.inputs(1).shape(), op_features,
       found_unknown_shapes);
@@ -721,18 +775,34 @@ int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations(
     const OpInfo& op_features, ConvolutionDimensions* returned_conv_dims,
     bool* found_unknown_shapes) const {
   int64 ops = 0;
-  if (op_features.op() != kConv2dBackpropFilter) {
-    LOG(ERROR) << "Invalid Operation";
-    return ops;
+  DCHECK_EQ(kConv2dBackpropFilter, op_features.op());
+
+  TensorShapeProto filter_shape;
+  if (op_features.inputs_size() >= 2 && op_features.inputs(1).has_value()) {
+    const TensorProto& value = op_features.inputs(1).value();
+    if (value.int64_val_size() > 0) {
+      for (int i = 0; i < value.int64_val_size(); ++i) {
+        filter_shape.add_dim()->set_size(value.int64_val(i));
+      }
+    } else {
+      for (int i = 0; i < value.int_val_size(); ++i) {
+        filter_shape.add_dim()->set_size(value.int_val(i));
+      }
+    }
+  } else if (op_features.outputs_size() == 1) {
+    filter_shape = op_features.outputs(0).shape();
+  } else {
+    // Set the minimum filter size that's feasible.
+    for (int i = 0; i < 4; ++i) {
+      filter_shape.add_dim()->set_size(1);
+    }
+    *found_unknown_shapes = true;
   }
 
-  if (op_features.outputs_size() != 1) {
-    // Need _output_shapes for input shape.
-    LOG(ERROR) << "No output shape in Conv2DBackpropFilter op.";
+  if (op_features.inputs_size() < 1) {
+    *found_unknown_shapes = true;
     return ops;
   }
-
-  const auto& filter_shape = op_features.outputs(0).shape();
   ConvolutionDimensions conv_dims = ConvolutionDimensionsFromInputs(
       op_features.inputs(0).shape(), filter_shape, op_features,
       found_unknown_shapes);
@@ -867,6 +937,30 @@ Costs OpLevelCostEstimator::PredictNoOp(const OpContext& op_context) const {
   return Costs::ZeroCosts();
 }
 
+Costs OpLevelCostEstimator::PredictIdentity(const OpContext& op_context) const {
+  const auto& op_features = op_context.op_info;
+  VLOG(1) << "Op:" << op_features.op() << " Execution Time 0 (ns)";
+  Costs result = Costs::ZeroCosts();
+  result.max_memory = CalculateOutputSize(op_features, &result.inaccurate);
+  // Assign the minimum amount of time we can represent to the identity op since
+  // it tends to be really cheap.
+  result.compute_time = kMinComputeTime;
+  result.execution_time = result.compute_time;
+  return result;
+}
+
+Costs OpLevelCostEstimator::PredictVariable(const OpContext& op_context) const {
+  const auto& op_features = op_context.op_info;
+  VLOG(1) << "Op:" << op_features.op() << " Execution Time 0 (ns)";
+  Costs result = Costs::ZeroCosts();
+  result.persistent_memory =
+      CalculateOutputSize(op_features, &result.inaccurate);
+
+  result.compute_time = kMinComputeTime;
+  result.execution_time = result.execution_time;
+  return result;
+}
+
 Costs OpLevelCostEstimator::PredictBatchMatMul(
     const OpContext& op_context) const {
   const auto& op_features = op_context.op_info;
@@ -880,13 +974,12 @@ Costs OpLevelCostEstimator::PredictBatchMatMul(
 
 Costs OpLevelCostEstimator::PredictMetadata(const OpContext& op_context) const {
   const auto& op_features = op_context.op_info;
-  Costs costs;
+  Costs costs = Costs::ZeroCosts();
   costs.max_memory = CalculateOutputSize(op_features, &costs.inaccurate);
   // Metadata operations are so cheap we assume they take the minimum amount of
   // time we can represent (1 ns).
-  costs.execution_time = 1;
-  costs.compute_time = 1;
-  costs.memory_time = 0;
+  costs.compute_time = kMinComputeTime;
+  costs.execution_time = costs.compute_time;
 
   return costs;
 }
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index 3a8385dd732d1747eca690339e098d741f68effc..a292e5e97fe52383648d74b08bb7a384b6278446 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -35,7 +35,6 @@ class OpLevelCostEstimator {
 
   virtual Costs PredictCosts(const OpContext& op_context) const;
 
- protected:
   // Basic device performance info, sufficient for roofline estimate.
   struct DeviceInfo {
     double gigaops;     // Billions of operations executed per second.
@@ -45,11 +44,12 @@ class OpLevelCostEstimator {
   // Returns basic device performance info.
   virtual DeviceInfo GetDeviceInfo(const DeviceProperties& device) const;
 
-  // For operations for which we haven't yet built estimates, returns a dummy
-  // value based on input size.
-  Costs DummyExecutionTime(const OpContext& op_context) const;
+ protected:
+  // Predict cost of an op for which no accurate estimator is defined.
+  Costs PredictCostOfAnUnknownOp(const OpContext& op_context) const;
 
-  // Naive cost estimate based on operations divided by device ops/sec.
+  // Naive cost estimate based on operations divided by device ops/sec,
+  // and input/output tensor sizes.
   Costs PredictOpCountBasedCost(double operations,
                                 const OpInfo& op_features) const;
 
@@ -132,6 +132,8 @@ class OpLevelCostEstimator {
   Costs PredictConv2DBackpropFilter(const OpContext& op_context) const;
   Costs PredictMatMul(const OpContext& op_context) const;
   Costs PredictNoOp(const OpContext& op_context) const;
+  Costs PredictIdentity(const OpContext& op_context) const;
+  Costs PredictVariable(const OpContext& op_context) const;
   Costs PredictBatchMatMul(const OpContext& op_context) const;
   Costs PredictMetadata(const OpContext& op_context) const;
 
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
index f19be4a0ee53609fa0196405da4ecb8b94fa39e6..60fc783472d2b6a1d50eb52e912da1fccbe8cf08 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -167,8 +167,8 @@ class OpLevelCostEstimatorTest : public ::testing::Test {
 TEST_F(OpLevelCostEstimatorTest, DummyExecutionTime) {
   auto cost = PredictCosts(DescribeOp("Dummy", 1000, 1));
   EXPECT_EQ(Costs::Duration(2000), cost.memory_time);
-  EXPECT_EQ(Costs::Duration(200), cost.compute_time);
-  EXPECT_EQ(Costs::Duration(2200), cost.execution_time);
+  EXPECT_EQ(Costs::Duration(0), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(2000), cost.execution_time);
   EXPECT_TRUE(cost.inaccurate);
 }
 
@@ -176,7 +176,7 @@ TEST_F(OpLevelCostEstimatorTest, ExecutionTimeSumOrMax) {
   SetComputeMemoryOverlap(true);
   auto cost = PredictCosts(DescribeOp("Dummy", 1000, 1));
   EXPECT_EQ(Costs::Duration(2000), cost.memory_time);
-  EXPECT_EQ(Costs::Duration(200), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(0), cost.compute_time);
   EXPECT_EQ(Costs::Duration(2000), cost.execution_time);  // max(2000, 200)
   EXPECT_TRUE(cost.inaccurate);
   SetComputeMemoryOverlap(false);  // Set it back to default.
diff --git a/tensorflow/core/grappler/costs/op_performance_data.proto b/tensorflow/core/grappler/costs/op_performance_data.proto
index 1a111b71dc5ee82650cd5c772dfce9abcb32931b..37f9ebd6a146c8c0089857c7a41ba863b4c2fb1f 100644
--- a/tensorflow/core/grappler/costs/op_performance_data.proto
+++ b/tensorflow/core/grappler/costs/op_performance_data.proto
@@ -58,11 +58,18 @@ message LogNormalDistribution {
   double sigma = 2;
 }
 
+message SessionInfo {
+  int64 intra_op_parallelism = 1;
+}
+
 // Performance data for tensorflow operations
 message OpPerformance {
   // The op
   OpInfo op = 1;
 
+  // Information about the session configs.
+  SessionInfo session_info = 12;
+
   // The node name (optional). Makes it easier to associate the performance data
   // with a specific graph node.
   string node = 5;
@@ -96,13 +103,12 @@ message OpPerformance {
     // The output information may have memory usage and output shapes.
     repeated int64 output_memory = 1;
 
-    // Temporary memory allocated by this node.
-    int64 host_temp_memory = 2;
-    int64 device_temp_memory = 3;
+    // Temp and persistent memory allocated by this node.
+    int64 temp_memory = 2;
+    int64 persistent_memory = 4;
 
-    // The persisted_memory doesn't include outputs.
-    int64 host_persistent_memory = 4;
-    int64 device_persistent_memory = 5;
+    int64 device_temp_memory = 3 [deprecated = true];
+    int64 device_persistent_memory = 5 [deprecated = true];
   }
   OpMemory op_memory = 9;
 }
diff --git a/tensorflow/core/grappler/costs/utils.cc b/tensorflow/core/grappler/costs/utils.cc
index ade0ad53fb71c45f07b7a87824610b575676847c..602f69f12ea9d24ebd94da73a2a76d1992f3bfb1 100644
--- a/tensorflow/core/grappler/costs/utils.cc
+++ b/tensorflow/core/grappler/costs/utils.cc
@@ -285,14 +285,10 @@ OpPerformanceList CostGraphToOpPerformanceData(const CostGraphDef& cost_graph,
       perf->mutable_op_memory()->add_output_memory(output_info.size());
     }
 
-    perf->mutable_op_memory()->set_host_temp_memory(
-        cost_node->host_temp_memory_size());
-    perf->mutable_op_memory()->set_device_temp_memory(
-        cost_node->device_temp_memory_size());
-    perf->mutable_op_memory()->set_host_persistent_memory(
-        cost_node->host_persistent_memory_size());
-    perf->mutable_op_memory()->set_device_persistent_memory(
-        cost_node->device_persistent_memory_size());
+    perf->mutable_op_memory()->set_temp_memory(
+        cost_node->temporary_memory_size());
+    perf->mutable_op_memory()->set_persistent_memory(
+        cost_node->persistent_memory_size());
   }
   return ret;
 }
diff --git a/tensorflow/core/grappler/costs/virtual_placer.h b/tensorflow/core/grappler/costs/virtual_placer.h
index 7ccb1ebb9999989f17548aeb88d1d64abdcc5341..fee5ce0f510014988656f418b857a73b8d68b807 100644
--- a/tensorflow/core/grappler/costs/virtual_placer.h
+++ b/tensorflow/core/grappler/costs/virtual_placer.h
@@ -41,7 +41,7 @@ class VirtualPlacer {
  private:
   // Converts given device name to Lowercase Fully-Qualified Name (LFQN) string.
   // This helps us disambiguate device names internally and simplify matching.
-  // If device_name couldn't be parsed succesfully, returns empty string.
+  // If device_name couldn't be parsed successfully, returns empty string.
   string to_lfqn_or_empty(const string& device_name) const;
 
   // Map based on the cluster info: cluster device name -> device properties.
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index e5e1ee32926a4a77d7580d9b16812bd0c60ce984..14b4ed7507f6237ea6255f46e060aa3d0f60b34d 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/utils.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
@@ -43,6 +44,9 @@ Costs CombineCosts(const Costs& left, const Costs& right) {
 
   Costs result = left;
   result.execution_time += right.execution_time;
+  if (right.inaccurate) {
+    result.inaccurate = true;
+  }
   if (right.max_memory != kMemoryUnknown) {
     result.max_memory += right.max_memory;
   }
@@ -72,7 +76,7 @@ struct RecvNodeDescriptor {
       : node(node_), port_num(port_num_), device(device_) {}
 };
 
-struct RecvNodeDescritorHash {
+struct RecvNodeDescriptorHash {
   std::size_t operator()(const RecvNodeDescriptor& recv_node) const {
     return std::hash<const NodeDef*>()(recv_node.node) ^
            std::hash<int>()(recv_node.port_num) ^
@@ -88,10 +92,188 @@ struct RecvNodeDescriptorEqual {
 };
 }  // namespace
 
+// ReadyNodeManager
+const NodeDef* LIFOManager::GetCurrNode() {
+  CHECK(!nodes_.empty()) << "GetCurrNode(), but there's no ready node";
+  if (curr_pos_ == nodes_.end()) {
+    curr_pos_ = --(nodes_.rbegin().base());  // Last one in the list.
+  }
+  // Once curr_pos_ is set to a valid entry in the list, we keep using the
+  // cached curr_pos_ until RemoveCurrNode() is called. AddNode() will not
+  // change the GetCurrNode() return value.
+  return *curr_pos_;
+}
+
+void LIFOManager::RemoveCurrNode() {
+  // Make sure we have curr_pos_ ready to be removed.
+  GetCurrNode();
+  // Note curr_pos_ may not be pointing the last element if some nodes are
+  // added.
+  nodes_.erase(curr_pos_);
+
+  curr_pos_ = nodes_.end();  // Reset curr_pos_.
+}
+
+FirstReadyManager::FirstReadyManager() : ReadyNodeManager() {
+  std::make_heap(nodes_.begin(), nodes_.end());
+}
+
+void FirstReadyManager::Init(
+    const std::unordered_map<const NodeDef*, NodeState>* node_state) {
+  // Reset the node state since different instances of the scheduler can reuse
+  // the same node_manager.
+  node_state_ = node_state;
+  nodes_.clear();
+  waiting_queue_.clear();
+  greater_ = [this](const NodeDef* a, const NodeDef* b) -> bool {
+    if (node_state_->at(a).time_ready == node_state_->at(b).time_ready) {
+      // Use Node name as tie-breaker for deterministic node scheduling.
+      return a->name().compare(b->name()) > 0;
+    } else {
+      // Note: we need a node with minimum time_ready, not
+      // maximum; hence, using a > b for comparison function.
+      return node_state_->at(a).time_ready > node_state_->at(b).time_ready;
+    }
+  };
+}
+
+const NodeDef* FirstReadyManager::GetCurrNode() {
+  if (nodes_.empty()) {
+    // Nothing in the node_; probably, the very first call. Move
+    // waiting_queue_ to node_.
+    DrainWaitingQueue();
+    CHECK(!nodes_.empty()) << "GetCurrNode(), but there's no ready node";
+  }
+  return nodes_.front();
+}
+
+void FirstReadyManager::RemoveCurrNode() {
+  if (nodes_.empty()) {
+    // Make sure that there is a node to be removed at the front of nodes_.
+    GetCurrNode();
+  }
+  std::pop_heap(nodes_.begin(), nodes_.end(), greater_);
+  nodes_.pop_back();
+  DrainWaitingQueue();
+}
+
+bool FirstReadyManager::Empty() const {
+  return nodes_.empty() && waiting_queue_.empty();
+}
+
+void FirstReadyManager::DrainWaitingQueue() {
+  for (const auto* node : waiting_queue_) {
+    // push_heap in AddNode() and pop_heap in RemoveCurrNode() guarantees that
+    // the first element is the node with minimum time_ready.
+    nodes_.push_back(node);
+    std::push_heap(nodes_.begin(), nodes_.end(), greater_);
+  }
+  waiting_queue_.clear();
+}
+
+CompositeNodeManager::CompositeNodeManager()
+    : ReadyNodeManager(), send_manager_(), recv_manager_() {}
+
+void CompositeNodeManager::Init(
+    const std::unordered_map<const NodeDef*, NodeState>* node_state) {
+  node_state_ = node_state;
+  send_manager_.Init(node_state);
+  recv_manager_.Init(node_state);
+  curr_node_ = nullptr;
+}
+
+void CompositeNodeManager::AddNode(const NodeDef* node) {
+  if (IsSend(*node)) {
+    send_manager_.AddNode(node);
+  } else if (IsRecv(*node)) {
+    recv_manager_.AddNode(node);
+  } else {
+    const auto& device = node_state_->at(node).device_name;
+    ops_lifo_map_[device].AddNode(node);
+  }
+}
+
+const NodeDef* CompositeNodeManager::GetCurrNode() {
+  if (curr_node_) return curr_node_;
+
+  // Per-device LIFO for normal ops (not _Send / _Recv),
+  // FirstReady for _Send and _Recv (separately),
+  // Globally (among the LIFO-selected ops from each device and _Send and
+  // _Recv) FirstReady,
+  // Priorty order: _Send, _Recv, and then the rest, if time_ready is equal.
+  std::vector<std::pair<const NodeDef*, Costs::Duration>> candidates;
+  for (auto& ops_lifo : ops_lifo_map_) {
+    if (!ops_lifo.second.Empty()) {
+      const auto* op = ops_lifo.second.GetCurrNode();
+      candidates.emplace_back(op, node_state_->at(op).time_ready);
+    }
+  }
+  if (!send_manager_.Empty()) {
+    const auto* send = send_manager_.GetCurrNode();
+    candidates.emplace_back(send, node_state_->at(send).time_ready);
+  }
+  if (!recv_manager_.Empty()) {
+    const auto* recv = recv_manager_.GetCurrNode();
+    candidates.emplace_back(recv, node_state_->at(recv).time_ready);
+  }
+  CHECK(!candidates.empty());
+  auto first_ready = std::min_element(
+      candidates.begin(), candidates.end(),
+      [](const std::pair<const NodeDef*, Costs::Duration>& a,
+         const std::pair<const NodeDef*, Costs::Duration>& b) {
+        if (a.second == b.second) {
+          // Note that there can be only 1 Send and only 1 Recv in candidates,
+          // at most; hence, score is 2 for Send, 1 for Recv, and 0 for a
+          // normap op, and a_score and b_score are equal only if both are
+          // normal ops.
+          int a_score = 2 * IsSend(*a.first) + IsRecv(*a.first);
+          int b_score = 2 * IsSend(*b.first) + IsRecv(*b.first);
+          if (a_score == b_score) {
+            // Both are normal ops; use node name as tie breaker.
+            return a.first->name().compare(b.first->name()) < 0;
+          } else {
+            // Priortize by op type: _Send, _Recv, and normap ops.
+            return a_score > b_score;
+          }
+        } else {
+          return a.second < b.second;
+        }
+      });
+  // Next time we call GetCurrNode(), it just returns the cached one,
+  // curr_node_ until we call RemovCurrNode().
+  curr_node_ = first_ready->first;
+
+  return curr_node_;
+}
+
+void CompositeNodeManager::RemoveCurrNode() {
+  const auto* node = GetCurrNode();
+  if (IsSend(*node)) {
+    send_manager_.RemoveCurrNode();
+  } else if (IsRecv(*node)) {
+    recv_manager_.RemoveCurrNode();
+  } else {
+    const auto device = node_state_->at(node).device_name;
+    ops_lifo_map_[device].RemoveCurrNode();
+  }
+  // Reset curr_node_ so that GetCurrNode() finds another node.
+  curr_node_ = nullptr;
+}
+
+bool CompositeNodeManager::Empty() const {
+  // Empty if all the ready managers are empty.
+  bool empty = true;
+  for (const auto& ops_lifo : ops_lifo_map_) {
+    empty &= ops_lifo.second.Empty();
+  }
+  return empty && send_manager_.Empty() && recv_manager_.Empty();
+}
+
 VirtualScheduler::VirtualScheduler(const GrapplerItem* grappler_item,
                                    const bool use_static_shapes,
-                                   Cluster* cluster)
-    : ready_nodes_(ReadyNodeManagerFactory("FirstReady")),
+                                   Cluster* cluster,
+                                   ReadyNodeManager* ready_nodes)
+    : ready_nodes_(ready_nodes),
       graph_costs_(Costs::ZeroCosts()),
       graph_properties_(*grappler_item),
       cluster_(cluster),
@@ -108,7 +290,9 @@ ReadyNodeManager* VirtualScheduler::ReadyNodeManagerFactory(
   } else if (ready_node_manager == "LIFO") {
     return new LIFOManager();
   } else if (ready_node_manager == "FirstReady") {
-    return new FirstReadyManager(GetNodeStates());
+    return new FirstReadyManager();
+  } else if (ready_node_manager == "Composite") {
+    return new CompositeNodeManager();
   }
   LOG(FATAL) << "Not a valid ready node manager: " << ready_node_manager;
 }
@@ -118,11 +302,11 @@ Status VirtualScheduler::Init() {
   // necessary information for emulating tensorflow op scheduling and
   // construct internal data structures (NodeState and DeviceState) for virtual
   // scheduling.
-
+  ready_nodes_->Init(GetNodeStates());
   // Construct graph properties.
   Status status;
   if (use_static_shapes_) {
-    status = graph_properties_.InferStatically();
+    status = graph_properties_.InferStatically(true);
   } else {
     status = graph_properties_.InferDynamically(cluster_);
   }
@@ -140,8 +324,13 @@ Status VirtualScheduler::Init() {
   }
 
   // Get the nodes that would run to output fetch_nodes.
+  bool ill_formed = false;
   std::vector<const NodeDef*> nodes =
-      ComputeTransitiveFanin(graph, fetch_nodes);
+      ComputeTransitiveFanin(graph, fetch_nodes, &ill_formed);
+  if (ill_formed) {
+    return errors::InvalidArgument(
+        "Ill formed graph or invalid set of fetch nodes specified");
+  }
 
   // TODO(dyoon): this is a bit inefficient as name_to_node is already built in
   // ComputeTransitiveFanin().
@@ -158,14 +347,14 @@ Status VirtualScheduler::Init() {
   // to _Recv as control dependency when creating GrapplerItem.
   std::unordered_map<string, const NodeDef*> name_to_send;
   for (const auto& node : graph.node()) {
-    if (node.op() == "_Send") {
+    if (IsSend(node)) {
       const auto& attr = node.attr();
       name_to_send[attr.at("tensor_name").s()] = &node;
     }
   }
 
   // To reuse _Recv ops.
-  std::unordered_map<RecvNodeDescriptor, const NodeDef*, RecvNodeDescritorHash,
+  std::unordered_map<RecvNodeDescriptor, const NodeDef*, RecvNodeDescriptorHash,
                      RecvNodeDescriptorEqual>
       cached_recv_nodes;
 
@@ -258,13 +447,14 @@ Status VirtualScheduler::Init() {
   }
 
   if (ready_nodes_->Empty()) {
-    return Status(error::UNAVAILABLE, "No ready nodes in the graph.");
+    return errors::InvalidArgument("No ready nodes in the graph.");
   }
 
-  if (!feed_nodes.empty())
-    LOG(ERROR) << "Some feed nodes were not found in the graph: "
-               << str_util::Join(feed_nodes, ",");
-
+  if (!feed_nodes.empty()) {
+    return errors::InvalidArgument(
+        strings::StrCat("Some feed nodes were not found in the graph: ",
+                        str_util::Join(feed_nodes, ",")));
+  }
   initialized_ = true;
   return Status::OK();
 }
@@ -538,7 +728,8 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
   string node_description = GetOpDescription(op_context.op_info);
   op_counts_[node_description] += 1;
   op_costs_[node_description] =
-      node_costs.execution_time.asMicroSeconds().count();
+      std::make_pair(node_costs.execution_time.asMicroSeconds().count(),
+                     !node_costs.inaccurate);
 
   auto& op_cost = FindOrCreateZero(op_name, &op_to_cost_);
   op_cost = CombineCosts(op_cost, node_costs);
@@ -647,8 +838,10 @@ Costs VirtualScheduler::Summary() const {
   for (const auto& op_cost_pair : op_to_cost_) {
     const auto& op = op_cost_pair.first;
     const auto& cost = op_cost_pair.second.execution_time.count();
+    const bool is_op_cost_accurate = !op_cost_pair.second.inaccurate;
     if (cost) {  // Skip printing out zero-cost ops.
-      VLOG(1) << " + " << op << " : " << cost;
+      VLOG(1) << " + " << op << " : " << (is_op_cost_accurate ? "" : "~")
+              << cost;
     }
   }
 
@@ -699,10 +892,16 @@ Costs VirtualScheduler::Summary() const {
           CalculateOutputSize(node_map_.at(node).output_properties, port);
     }
     Costs::NanoSeconds total_compute_time_ns;
+    bool is_total_cost_accurate = true;
     for (const auto& op_cost_pair : state.op_to_cost) {
       const auto& op = op_cost_pair.first;
       const auto& cost = op_cost_pair.second.execution_time.count();
       total_compute_time_ns += op_cost_pair.second.execution_time;
+      const bool is_op_cost_accurate = !op_cost_pair.second.inaccurate;
+      if (!is_op_cost_accurate) {
+        is_total_cost_accurate = false;
+      }
+
       int64 op_mem_usage = 0;
       auto it = op_to_memory.find(op);
       if (it != op_to_memory.end()) {
@@ -714,9 +913,9 @@ Costs VirtualScheduler::Summary() const {
                                : 0.0;
       if (cost || mem_usage_percent > 1.0) {
         // Print out only non-zero cost ops or ops with > 1% memory usage.
-        VLOG(1) << " + " << op << " : " << cost << " ("
-                << strings::HumanReadableNumBytes(op_mem_usage) << " ["
-                << mem_usage_percent << "%] "
+        VLOG(1) << " + " << op << " : " << (is_op_cost_accurate ? "" : "~")
+                << cost << " (" << strings::HumanReadableNumBytes(op_mem_usage)
+                << " [" << mem_usage_percent << "%] "
                 << (persisent_ops.count(op) > 0 ? ": persistent op)" : ")");
       }
     }
@@ -725,8 +924,9 @@ Costs VirtualScheduler::Summary() const {
     if (wall_time_ns.count() > 0) {
       utilization = total_compute_time_ns.count() * 100 / wall_time_ns.count();
     }
-    VLOG(1) << "Device = " << name
-            << ", total_compute_time_ns = " << total_compute_time_ns.count()
+    VLOG(1) << "Device = " << name << ", total_compute_time_ns = "
+            << (is_total_cost_accurate ? "" : "~")
+            << total_compute_time_ns.count()
             << ", utilization = " << utilization << "%";
 
     if (critical_path_costs.execution_time <= state.GetCurrTime()) {
@@ -738,8 +938,11 @@ Costs VirtualScheduler::Summary() const {
     // Also log the op description and their corresponding counts.
     VLOG(2) << "Node description, counts, cost:";
     for (const auto& item : op_counts_) {
+      int cost;
+      bool is_cost_accurate;
+      std::tie(cost, is_cost_accurate) = op_costs_.at(item.first);
       VLOG(2) << "Node: " << item.first << ", Count: " << item.second
-              << ", Individual Cost: " << op_costs_.at(item.first);
+              << ", Individual Cost: " << (is_cost_accurate ? "" : "~") << cost;
     }
   }
 
@@ -788,21 +991,12 @@ Costs VirtualScheduler::Summary(RunMetadata* metadata) {
             nodestate.time_scheduled.asMicroSeconds().count());
         auto* mem_stats = node_stats->mutable_memory_stats();
         // VirtualScheduler does not specify scratch pad memory usage.
-        mem_stats->set_host_temp_memory_size(0);
-        mem_stats->set_device_temp_memory_size(0);
-        int64 host_persistent_memory_size = 0;
-        int64 device_persistent_memory_size = 0;
+        mem_stats->set_temp_memory_size(0);
+        int64 persistent_memory_size = 0;
         if (IsPersistentNode(node_def)) {
-          if (device.first.find("cpu") != string::npos ||
-              device.first.find("CPU") != string::npos) {
-            host_persistent_memory_size = total_output_size;
-          } else {
-            device_persistent_memory_size = total_output_size;
-          }
+          persistent_memory_size = total_output_size;
         }
-        mem_stats->set_host_persistent_memory_size(host_persistent_memory_size);
-        mem_stats->set_device_persistent_memory_size(
-            device_persistent_memory_size);
+        mem_stats->set_persistent_memory_size(persistent_memory_size);
         *device_partition_graph->add_node() = *node_def;
       }
     }
@@ -810,5 +1004,16 @@ Costs VirtualScheduler::Summary(RunMetadata* metadata) {
   return Summary();
 }
 
+const std::unordered_map<string, int64> VirtualScheduler::GetPeakMemoryUsage()
+    const {
+  std::unordered_map<string, int64> result;
+  for (const auto& device : device_) {
+    const string& name = device.first;
+    const DeviceState& state = device.second;
+    result[name] = state.max_memory_usage;
+  }
+  return result;
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index c74d80c2bee9b99afbcd68cfc8a7d4177e3160bc..5116c8183cb4c51dc833988cbeb75a4a184e4c40 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
+#define TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
 
 #include <list>
 #include <memory>
@@ -127,6 +127,8 @@ class ReadyNodeManager {
  public:
   ReadyNodeManager() {}
   virtual ~ReadyNodeManager() {}
+  virtual void Init(
+      const std::unordered_map<const NodeDef*, NodeState>* node_state) {}
   virtual void AddNode(const NodeDef* node) = 0;
   virtual const NodeDef* GetCurrNode() = 0;
   virtual void RemoveCurrNode() = 0;
@@ -137,6 +139,8 @@ class FIFOManager : public ReadyNodeManager {
  public:
   FIFOManager() : ReadyNodeManager() {}
   ~FIFOManager() override {}
+  void Init(const std::unordered_map<const NodeDef*, NodeState>* node_state)
+      override {}
   void AddNode(const NodeDef* node) override { nodes_.push_back(node); }
   const NodeDef* GetCurrNode() override {
     CHECK(!nodes_.empty()) << "GetCurrNode(), but there's no ready node";
@@ -157,26 +161,11 @@ class LIFOManager : public ReadyNodeManager {
  public:
   LIFOManager() : ReadyNodeManager() {}
   ~LIFOManager() override {}
+  void Init(const std::unordered_map<const NodeDef*, NodeState>* node_state)
+      override {}
   void AddNode(const NodeDef* node) override { nodes_.push_back(node); }
-  const NodeDef* GetCurrNode() override {
-    CHECK(!nodes_.empty()) << "GetCurrNode(), but there's no ready node";
-    if (curr_pos_ == nodes_.end()) {
-      curr_pos_ = --(nodes_.rbegin().base());  // Last one in the list.
-    }
-    // Once curr_pos_ is set to a valid entry in the list, we keep using the
-    // cached curr_pos_ until RemoveCurrNode() is called. AddNode() will not
-    // change the GetCurrNode() return value.
-    return *curr_pos_;
-  }
-  void RemoveCurrNode() override {
-    // Make sure we have curr_pos_ ready to be removed.
-    GetCurrNode();
-    // Note curr_pos_ may not be pointing the last element if some nodes are
-    // added.
-    nodes_.erase(curr_pos_);
-
-    curr_pos_ = nodes_.end();  // Reset curr_pos_.
-  }
+  const NodeDef* GetCurrNode() override;
+  void RemoveCurrNode() override;
   bool Empty() const override { return nodes_.empty(); }
 
  private:
@@ -193,55 +182,18 @@ class LIFOManager : public ReadyNodeManager {
 // time_ready value (it depends on C++ STL push_heap and pop_heap).
 class FirstReadyManager : public ReadyNodeManager {
  public:
-  FirstReadyManager(
-      const std::unordered_map<const NodeDef*, NodeState>* node_state)
-      : ReadyNodeManager(), node_state_(node_state) {
-    std::make_heap(nodes_.begin(), nodes_.end());
-    greater_ = [this](const NodeDef* a, const NodeDef* b) -> bool {
-      // Note: we need a node with minimum time_ready, not
-      // maximum; hence, using a > b for comparison function.
-      return node_state_->at(a).time_ready > node_state_->at(b).time_ready;
-    };
-  }
+  FirstReadyManager();
+  void Init(
+      const std::unordered_map<const NodeDef*, NodeState>* node_state) override;
   ~FirstReadyManager() override {}
-
   void AddNode(const NodeDef* node) override { waiting_queue_.push_back(node); }
-
-  const NodeDef* GetCurrNode() override {
-    if (nodes_.empty()) {
-      // Nothing in the node_; probably, the very first call. Move
-      // waiting_queue_ to node_.
-      _DrainWaitingQueue();
-      CHECK(!nodes_.empty()) << "GetCurrNode(), but there's no ready node";
-    }
-    return nodes_.front();
-  }
-
-  void RemoveCurrNode() override {
-    if (nodes_.empty()) {
-      // Make sure that there is a node to be removed at the front of nodes_.
-      GetCurrNode();
-    }
-    std::pop_heap(nodes_.begin(), nodes_.end(), greater_);
-    nodes_.pop_back();
-    _DrainWaitingQueue();
-  }
-
-  bool Empty() const override {
-    return nodes_.empty() && waiting_queue_.empty();
-  }
+  const NodeDef* GetCurrNode() override;
+  void RemoveCurrNode() override;
+  bool Empty() const override;
 
  private:
   // Move all the nodes in the waiting_queue_ to nodes_.
-  void _DrainWaitingQueue() {
-    for (const auto* node : waiting_queue_) {
-      // push_heap in AddNode() and pop_heap in RemoveCurrNode() guarantees that
-      // the first element is the node with minimum time_ready.
-      nodes_.push_back(node);
-      std::push_heap(nodes_.begin(), nodes_.end(), greater_);
-    }
-    waiting_queue_.clear();
-  }
+  void DrainWaitingQueue();
 
   // nodes_ is the main queue, where we construct heap, and the front is the
   // current node.
@@ -259,13 +211,49 @@ class FirstReadyManager : public ReadyNodeManager {
   const std::unordered_map<const NodeDef*, NodeState>* node_state_;
 };
 
+// CompositeNodeManager has a few other NodeManagers: per-device LIFO for normal
+// ops (neither _Send nor _Recv) and FirstyReadyManagers for _Send ops and _Recv
+// ops, and then it chooses FirstReady among the ops chosen from each
+// internal NodeManagers. The objective is to maximize producer-consumer
+// locality within device, while processing nodes across devices, including
+// _Send and _Recv, fairly, in terms of their time_ready.
+class CompositeNodeManager : public ReadyNodeManager {
+ public:
+  CompositeNodeManager();
+  ~CompositeNodeManager() override {}
+
+  void Init(
+      const std::unordered_map<const NodeDef*, NodeState>* node_state) override;
+  void AddNode(const NodeDef* node) override;
+  const NodeDef* GetCurrNode() override;
+  void RemoveCurrNode() override;
+  bool Empty() const override;
+
+ private:
+  // Internal ready node managers:
+  // LIFO for normal ops to maximize producer consumer locality.
+  // One LIFO per device.
+  std::unordered_map<string, LIFOManager> ops_lifo_map_;
+  // FirstReady for send and recv. Handle send and recv separately ensures that
+  // send and recv do not block previously read ops with LIFO schedule.
+  FirstReadyManager send_manager_;
+  FirstReadyManager recv_manager_;
+
+  // NodeState structure from VirtualScheduler to get time_ready of ready nodes.
+  // Not owned by FirstReadyManager.
+  const std::unordered_map<const NodeDef*, NodeState>* node_state_;
+
+  // Cached curr node. Set back to nullptr from RemoveCurrNode().
+  const NodeDef* curr_node_;
+};
+
 // The virtual scheduler emulates execution of nodes in a graph, considering
 // dependencies, device, etc.
 class VirtualScheduler {
  public:
   VirtualScheduler(const GrapplerItem* grappler_item,
-                   const bool use_static_shapes, Cluster* cluster);
-
+                   const bool use_static_shapes, Cluster* cluster,
+                   ReadyNodeManager* ready_nodes);
   // Initializes NodeState and DeviceState from grappler_item_ and
   // graph_properties_.
   Status Init();
@@ -280,6 +268,12 @@ class VirtualScheduler {
   // Like the above, but writes detailed stats to RunMetadata.
   // If metadata is nullptr, then just calls and return Summary().
   Costs Summary(RunMetadata* metadata);
+  // Methods called from constructor.
+  static ReadyNodeManager* ReadyNodeManagerFactory(
+      const string& ready_node_manager);
+
+  // Return per device peak memory usage.
+  const std::unordered_map<string, int64> GetPeakMemoryUsage() const;
 
  protected:
   const std::unordered_map<string, DeviceState>* GetDeviceStates() const {
@@ -302,9 +296,6 @@ class VirtualScheduler {
   const string kAttrDstDevice = "dst_device_";
   const string kChannelDevice = "Channel";
 
-  // Methods called from constructor.
-  ReadyNodeManager* ReadyNodeManagerFactory(const string& ready_node_manager);
-
   // Methods called from Init(). Fails if initialize_ is set.
   void MaybeUpdateInputOutput(const NodeDef* node);
   NodeState& GetNodeStateOrCreateIt(const NodeDef* node);
@@ -321,7 +312,7 @@ class VirtualScheduler {
   bool IsPersistentNode(const NodeDef* node) const;
 
   // Scheduler states:
-  std::unique_ptr<ReadyNodeManager> ready_nodes_;
+  ReadyNodeManager* ready_nodes_;  // Not owned.
   std::unordered_map<const NodeDef*, NodeState> node_map_;
   std::unordered_map<string, DeviceState> device_;
 
@@ -330,8 +321,11 @@ class VirtualScheduler {
 
   // Stats:
   std::map<string, int> op_counts_;  // Op counts with key with input shape.
-  std::map<string, int> op_costs_;   // Individual op costs (with input shapes).
-  Costs graph_costs_;                // Graph cost.
+  // Individual op costs (with input shapes).
+  // Boolean field for whether the cost is accurate.
+  std::map<string, std::pair<int, bool>> op_costs_;
+
+  Costs graph_costs_;                   // Graph cost.
   std::map<string, Costs> op_to_cost_;  // Per-op cost.
 
   // Auxilliary data structures for constructing NodeState and DeviceState.
@@ -348,4 +342,4 @@ class VirtualScheduler {
 }  // namespace grappler
 }  // end namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
index 412b494be730c21bf8b3d8bd791cc42dcbf15794..53dcb497a6453dfa70c1215352e74e96796ebeb7 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
@@ -29,7 +29,8 @@ class TestVirtualScheduler : public VirtualScheduler {
  public:
   TestVirtualScheduler(const GrapplerItem* grappler_item,
                        const bool use_static_shapes, Cluster* cluster)
-      : VirtualScheduler(grappler_item, use_static_shapes, cluster) {}
+      : VirtualScheduler(grappler_item, use_static_shapes, cluster,
+                         &ready_node_manager_) {}
 
   FRIEND_TEST(VirtualSchedulerTest, CalculateOutputSize);
   FRIEND_TEST(VirtualSchedulerTest, MemoryUsage);
@@ -37,6 +38,9 @@ class TestVirtualScheduler : public VirtualScheduler {
   FRIEND_TEST(VirtualSchedulerTest, ComplexDependency);
   FRIEND_TEST(VirtualSchedulerTest, Variable);
   FRIEND_TEST(VirtualSchedulerTest, InterDeviceTransfer);
+
+ protected:
+  FirstReadyManager ready_node_manager_;
 };
 
 class VirtualSchedulerTest : public ::testing::Test {
@@ -44,8 +48,15 @@ class VirtualSchedulerTest : public ::testing::Test {
   NodeDef node1_, node2_, node3_, node4_, node5_, node6_;
   std::unordered_map<const NodeDef*, NodeState> node_states_;
 
+  // Device names:
   const string kCPU0 = "/job:localhost/replica:0/task:0/cpu:0";
   const string kCPU1 = "/job:localhost/replica:0/task:0/cpu:1";
+  const string kChannelFrom0To1 = "Channel from CPU0 to CPU1";
+  const string kChannelFrom1To0 = "Channel from CPU1 to CPU0";
+  // Op names:
+  const string kSend = "_Send";
+  const string kRecv = "_Recv";
+  const string kConv2D = "Conv2D";
 
   DeviceProperties GetDummyCPUDevice() {
     // Create CPU with 2 cores, 4 Ghz freq, 2 GB/s mem bandwidth.
@@ -59,29 +70,26 @@ class VirtualSchedulerTest : public ::testing::Test {
     return cpu_device;
   }
 
+  void NodeSetUp(const string& name, const string& op_name,
+                 const string& device_name, const uint64 time_ready,
+                 NodeDef* node) {
+    node->set_name(name);
+    node->set_op(op_name);
+    node->set_device(device_name);
+
+    node_states_[node] = NodeState();
+    node_states_[node].time_ready = time_ready;
+    node_states_[node].device_name = device_name;
+  }
+
   void SetUp() override {
-    // Initializes nodes for manager
-    node1_.set_name("Node1");
-    node2_.set_name("Node2");
-    node3_.set_name("Node3");
-    node4_.set_name("Node4");
-    node5_.set_name("Node5");
-    node6_.set_name("Node6");
-
-    // Initialize node_states, with time_ready in reverse order.
-    node_states_[&node1_] = NodeState();
-    node_states_[&node2_] = NodeState();
-    node_states_[&node3_] = NodeState();
-    node_states_[&node4_] = NodeState();
-    node_states_[&node5_] = NodeState();
-    node_states_[&node6_] = NodeState();
-
-    node_states_[&node6_].time_ready = 1000;
-    node_states_[&node5_].time_ready = 2000;
-    node_states_[&node4_].time_ready = 3000;
-    node_states_[&node3_].time_ready = 4000;
-    node_states_[&node2_].time_ready = 5000;
-    node_states_[&node1_].time_ready = 6000;
+    // node1_ to node6_ on kCPU0, with time_ready in reverse_order.
+    NodeSetUp("Node1", kConv2D, kCPU0, 6000, &node1_);
+    NodeSetUp("Node2", kConv2D, kCPU0, 5000, &node2_);
+    NodeSetUp("Node3", kConv2D, kCPU0, 4000, &node3_);
+    NodeSetUp("Node4", kConv2D, kCPU0, 3000, &node4_);
+    NodeSetUp("Node5", kConv2D, kCPU0, 2000, &node5_);
+    NodeSetUp("Node6", kConv2D, kCPU0, 1000, &node6_);
 
     // Initializes cluster_ and placer_.
     std::unordered_map<string, DeviceProperties> devices;
@@ -1144,23 +1152,24 @@ TEST_F(VirtualSchedulerTest, AddAndRemoveMultipleLIFOManager) {
 }
 
 TEST_F(VirtualSchedulerTest, GetSingleNodeFirstReadyManager) {
-  FirstReadyManager manager = FirstReadyManager(&node_states_);
+  FirstReadyManager manager;
+  manager.Init(&node_states_);
 
   manager.AddNode(&node1_);
   EXPECT_EQ("Node1", manager.GetCurrNode()->name());
 }
 
 TEST_F(VirtualSchedulerTest, RemoveSingleNodeFirstReadyManager) {
-  FirstReadyManager manager = FirstReadyManager(&node_states_);
-
+  FirstReadyManager manager;
+  manager.Init(&node_states_);
   manager.AddNode(&node1_);
   manager.RemoveCurrNode();
   EXPECT_TRUE(manager.Empty());
 }
 
 TEST_F(VirtualSchedulerTest, GetAndRemoveMultipleFirstReadyManager) {
-  FirstReadyManager manager = FirstReadyManager(&node_states_);
-
+  FirstReadyManager manager;
+  manager.Init(&node_states_);
   // Insert nodes in some random order.
   manager.AddNode(&node2_);
   manager.AddNode(&node1_);
@@ -1187,8 +1196,8 @@ TEST_F(VirtualSchedulerTest, GetAndRemoveMultipleFirstReadyManager) {
 }
 
 TEST_F(VirtualSchedulerTest, GetCurrNodeFirstReadyManager) {
-  FirstReadyManager manager = FirstReadyManager(&node_states_);
-
+  FirstReadyManager manager;
+  manager.Init(&node_states_);
   // Insert nodes in some random order.
   manager.AddNode(&node2_);
   manager.AddNode(&node1_);
@@ -1207,15 +1216,9 @@ TEST_F(VirtualSchedulerTest, GetCurrNodeFirstReadyManager) {
   NodeDef node7;
   NodeDef node8;
   NodeDef node9;
-  node7.set_name("Node7");
-  node8.set_name("Node8");
-  node9.set_name("Node9");
-  node_states_[&node7] = NodeState();
-  node_states_[&node8] = NodeState();
-  node_states_[&node9] = NodeState();
-  node_states_[&node7].time_ready = 5;
-  node_states_[&node8].time_ready = 4;
-  node_states_[&node9].time_ready = 3;
+  NodeSetUp("Node7", kConv2D, kCPU0, 5, &node7);
+  NodeSetUp("Node8", kConv2D, kCPU0, 4, &node8);
+  NodeSetUp("Node9", kConv2D, kCPU0, 3, &node9);
 
   manager.AddNode(&node7);
   EXPECT_EQ("Node6", manager.GetCurrNode()->name());
@@ -1249,6 +1252,296 @@ TEST_F(VirtualSchedulerTest, GetCurrNodeFirstReadyManager) {
   EXPECT_TRUE(manager.Empty());
 }
 
+TEST_F(VirtualSchedulerTest, DeterminismInFirstReadyManager) {
+  FirstReadyManager manager1;
+  manager1.Init(&node_states_);
+  FirstReadyManager manager2;
+  manager2.Init(&node_states_);
+
+  // 6 nodes with same time_ready.
+  NodeDef node7;
+  NodeDef node8;
+  NodeDef node9;
+  NodeDef node10;
+  NodeDef node11;
+  NodeDef node12;
+  NodeSetUp("Node7", kConv2D, kCPU0, 1000, &node7);
+  NodeSetUp("Node8", kConv2D, kCPU0, 1000, &node8);
+  NodeSetUp("Node9", kConv2D, kCPU0, 1000, &node9);
+  NodeSetUp("Node10", kConv2D, kCPU0, 1000, &node10);
+  NodeSetUp("Node11", kConv2D, kCPU0, 1000, &node11);
+  NodeSetUp("Node12", kConv2D, kCPU0, 1000, &node12);
+
+  // Add the above 6 nodes to manager1.
+  manager1.AddNode(&node7);
+  manager1.AddNode(&node8);
+  manager1.AddNode(&node9);
+  manager1.AddNode(&node10);
+  manager1.AddNode(&node11);
+  manager1.AddNode(&node12);
+
+  // Add the above 6 nodes to manager2, but in a different order.
+  manager2.AddNode(&node8);
+  manager2.AddNode(&node11);
+  manager2.AddNode(&node9);
+  manager2.AddNode(&node10);
+  manager2.AddNode(&node7);
+  manager2.AddNode(&node12);
+
+  // Expect both managers return the same nodes for deterministic node
+  // scheduling.
+  EXPECT_EQ(manager1.GetCurrNode()->name(), manager2.GetCurrNode()->name());
+  manager1.RemoveCurrNode();
+  manager2.RemoveCurrNode();
+
+  EXPECT_EQ(manager1.GetCurrNode()->name(), manager2.GetCurrNode()->name());
+  manager1.RemoveCurrNode();
+  manager2.RemoveCurrNode();
+
+  EXPECT_EQ(manager1.GetCurrNode()->name(), manager2.GetCurrNode()->name());
+  manager1.RemoveCurrNode();
+  manager2.RemoveCurrNode();
+
+  EXPECT_EQ(manager1.GetCurrNode()->name(), manager2.GetCurrNode()->name());
+  manager1.RemoveCurrNode();
+  manager2.RemoveCurrNode();
+
+  EXPECT_EQ(manager1.GetCurrNode()->name(), manager2.GetCurrNode()->name());
+  manager1.RemoveCurrNode();
+  manager2.RemoveCurrNode();
+
+  EXPECT_EQ(manager1.GetCurrNode()->name(), manager2.GetCurrNode()->name());
+  manager1.RemoveCurrNode();
+  manager2.RemoveCurrNode();
+
+  EXPECT_TRUE(manager1.Empty());
+  EXPECT_TRUE(manager2.Empty());
+}
+
+TEST_F(VirtualSchedulerTest, RemoveSingleNodeCompositeNodeManager) {
+  CompositeNodeManager manager;
+  manager.Init(&node_states_);
+  manager.AddNode(&node1_);
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+TEST_F(VirtualSchedulerTest, RemoveSingleNodeComopsiteNodeManager) {
+  CompositeNodeManager manager;
+  manager.Init(&node_states_);
+
+  manager.AddNode(&node1_);
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+TEST_F(VirtualSchedulerTest, GetAndRemoveMultipleComopsiteNodeManager) {
+  CompositeNodeManager manager;
+  manager.Init(&node_states_);
+
+  // Add the nodes to LIFOManager.
+  manager.AddNode(&node1_);
+  manager.AddNode(&node2_);
+  manager.AddNode(&node3_);
+  manager.AddNode(&node4_);
+
+  // Keep checking current node as nodes are removed and added.
+  EXPECT_EQ("Node4", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node3", manager.GetCurrNode()->name());
+  manager.AddNode(&node5_);
+  // GetCurrNode()  should return the same node even if some nodes are added,
+  // until RemoveCurrNode() is called.
+  EXPECT_EQ("Node3", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node5", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
+  manager.AddNode(&node6_);
+  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node6", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node1", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+TEST_F(VirtualSchedulerTest, MultiDeviceSendRecvComopsiteNodeManager) {
+  CompositeNodeManager manager;
+  manager.Init(&node_states_);
+  // Additional nodes on kCPU1
+  NodeDef node7;
+  NodeDef node8;
+  NodeDef node9;
+  NodeSetUp("Node7", kConv2D, kCPU1, 1001, &node7);
+  NodeSetUp("Node8", kConv2D, kCPU1, 2001, &node8);
+  NodeSetUp("Node9", kConv2D, kCPU1, 3001, &node9);
+
+  // Send and Recv nodes.
+  NodeDef send1;
+  NodeDef send2;
+  NodeDef recv1;
+  NodeDef recv2;
+  NodeSetUp("Send1", kSend, kChannelFrom0To1, 2002, &send1);
+  NodeSetUp("Send2", kSend, kChannelFrom1To0, 2005, &send2);
+  NodeSetUp("Recv1", kRecv, kCPU0, 2003, &recv1);
+  NodeSetUp("Recv2", kRecv, kCPU1, 2004, &recv2);
+
+  // Insert nodes.
+  manager.AddNode(&node1_);
+  manager.AddNode(&node2_);
+  manager.AddNode(&node3_);
+  manager.AddNode(&node4_);
+  manager.AddNode(&node5_);
+  manager.AddNode(&node6_);
+  manager.AddNode(&node7);
+  manager.AddNode(&node8);
+  manager.AddNode(&node9);
+  manager.AddNode(&send1);
+  manager.AddNode(&send2);
+  manager.AddNode(&recv1);
+  manager.AddNode(&recv2);
+
+  // on kCPU0; last one is node6_, on kCPU1: last one is node9;
+  // so choose one that has earliest time_ready among node6_, node9,
+  // Send1, Send2, Recv1, and Recv2.
+  EXPECT_EQ("Node6", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  // Then, the next one on kCPU0 is node5_; choose the earliest time_ready node
+  // among node5_, node9, Send1, Send2, Recv1, and Recv2.
+  EXPECT_EQ("Node5", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  // Next, choose among node4_, node9, Send1, Send2, Recv1, and Recv2.
+  EXPECT_EQ("Send1", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  // Next, choose among node4_, node9, Sen2, Recv1, and Recv2.
+  EXPECT_EQ("Recv1", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  // Next, choose among node4_, node9, Send2, and Recv2.
+  EXPECT_EQ("Recv2", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  // Next, choose among node4_, node9, and Send2.
+  EXPECT_EQ("Send2", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  // Next, choose between node4_, node9.
+  EXPECT_EQ("Node4", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  // Next, choose between node3_, node9.
+  EXPECT_EQ("Node9", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  // Next, choose between node3_, node8.
+  EXPECT_EQ("Node8", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  // Next, choose between node3_, node7.
+  EXPECT_EQ("Node7", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  // Then, just the nodes on kCPU1 -- LIFO.
+  EXPECT_EQ("Node3", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node1", manager.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
+TEST_F(VirtualSchedulerTest, DeterminismInCompositeNodeManager) {
+  CompositeNodeManager manager;
+  manager.Init(&node_states_);
+  CompositeNodeManager manager2;
+  manager2.Init(&node_states_);
+
+  // 6 nodes with same time_ready.
+  NodeDef node7;
+  NodeDef node8;
+  NodeDef node9;
+  NodeDef node10;
+  NodeDef node11;
+  NodeDef node12;
+  NodeSetUp("Node7", kConv2D, kCPU0, 1000, &node7);
+  NodeSetUp("Node8", kSend, kCPU0, 1000, &node8);
+  NodeSetUp("Node9", kRecv, kCPU0, 1000, &node9);
+  NodeSetUp("Node10", kConv2D, kCPU0, 999, &node10);
+  NodeSetUp("Node11", kRecv, kCPU0, 999, &node11);
+  NodeSetUp("Node12", kConv2D, kCPU1, 1000, &node12);
+
+  // Add Nodes 7 to 9 to manager.
+  manager.AddNode(&node7);
+  manager.AddNode(&node8);
+  manager.AddNode(&node9);
+
+  // It should return _Send, Recv, and the other op order, when the candidate
+  // nodes have same time_ready.
+  EXPECT_EQ("Node8", manager.GetCurrNode()->name());
+  EXPECT_EQ(kSend, manager.GetCurrNode()->op());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node9", manager.GetCurrNode()->name());
+  EXPECT_EQ(kRecv, manager.GetCurrNode()->op());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node7", manager.GetCurrNode()->name());
+  EXPECT_EQ(kConv2D, manager.GetCurrNode()->op());
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+
+  // Add Nodes 7 to 9 to manager, but in a different order.
+  manager.AddNode(&node9);
+  manager.AddNode(&node8);
+  manager.AddNode(&node7);
+
+  // Expect same order (_Send, _Recv, and the other op), regardless of Add
+  // order.
+  EXPECT_EQ("Node8", manager.GetCurrNode()->name());
+  EXPECT_EQ(kSend, manager.GetCurrNode()->op());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node9", manager.GetCurrNode()->name());
+  EXPECT_EQ(kRecv, manager.GetCurrNode()->op());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node7", manager.GetCurrNode()->name());
+  EXPECT_EQ(kConv2D, manager.GetCurrNode()->op());
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+
+  // Conv2D's time_ready < Send's time_ready; Expect Conv2D first.
+  manager.AddNode(&node8);
+  manager.AddNode(&node10);
+  EXPECT_EQ("Node10", manager.GetCurrNode()->name());
+  EXPECT_EQ(kConv2D, manager.GetCurrNode()->op());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node8", manager.GetCurrNode()->name());
+  EXPECT_EQ(kSend, manager.GetCurrNode()->op());
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+
+  // Recv's time_ready < Send' time_ready; Expect Recv first.
+  manager.AddNode(&node11);
+  manager.AddNode(&node8);
+  EXPECT_EQ("Node11", manager.GetCurrNode()->name());
+  EXPECT_EQ(kRecv, manager.GetCurrNode()->op());
+  manager.RemoveCurrNode();
+  EXPECT_EQ("Node8", manager.GetCurrNode()->name());
+  EXPECT_EQ(kSend, manager.GetCurrNode()->op());
+  manager.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+
+  // Node7 and 12 are normal ops with the same time_ready, placed on different
+  // devices. These two nodes are added to manager and manager2, but in
+  // different orders; Expect GetCurrNode() returns the nodes in the same order.
+  manager.AddNode(&node7);
+  manager.AddNode(&node12);
+
+  manager2.AddNode(&node12);
+  manager2.AddNode(&node7);
+
+  EXPECT_EQ(manager.GetCurrNode()->name(), manager2.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  manager2.RemoveCurrNode();
+  EXPECT_EQ(manager.GetCurrNode()->name(), manager2.GetCurrNode()->name());
+  manager.RemoveCurrNode();
+  manager2.RemoveCurrNode();
+  EXPECT_TRUE(manager.Empty());
+}
+
 // Create small graph, run predict costs on it, make sure the costs from the
 // summary match the hand-calculated costs.
 TEST_F(VirtualSchedulerTest, SummaryCostTest) {
@@ -1580,7 +1873,7 @@ TEST_F(VirtualSchedulerTest, WhileLoop) {
   EXPECT_NE(exit_start_micro, exit_1_start_micro);
 
   // Check dependency among the nodes; no matter what scheduling mechanism we
-  // use, the scheduled ops should follow these depedency chains.
+  // use, the scheduled ops should follow these dependency chains.
   // Note that currently, VirtualScheduler executes while/Merge twice; hence,
   // we're not testing dependency chains related to while/Merge.
   // TODO(dyoon): after fixing while loop behavior correctly (run nodes in the
@@ -1634,20 +1927,20 @@ TEST_F(VirtualSchedulerTest, InterDeviceTransfer) {
     const auto& name = x.first;
     const auto& node_info = x.second;
     const auto& op = node_info.op_info.op();
-    if (op == "_Recv") {
+    if (op == kRecv) {
       recv_op_names[get_port_num(name)] = name;
-    } else if (op == "_Send") {
+    } else if (op == kSend) {
       send_op_names[get_port_num(name)] = name;
     }
     op_count[op]++;
   }
 
   // Same number of _Send and _Recv.
-  EXPECT_EQ(op_count.at("_Send"), op_count.at("_Recv"));
+  EXPECT_EQ(op_count.at(kSend), op_count.at(kRecv));
 
   // Expect 4 Send and Recvs each: port 0, 1, and, 2, and control dependency.
-  EXPECT_EQ(op_count.at("_Recv"), 4);
-  EXPECT_EQ(op_count.at("_Send"), 4);
+  EXPECT_EQ(op_count.at(kRecv), 4);
+  EXPECT_EQ(op_count.at(kSend), 4);
 
   // Helper lambda for extracting output Tensor size.
   auto get_output_size = [this, ops_executed](const string& name) -> int64 {
diff --git a/tensorflow/core/grappler/graph_view.cc b/tensorflow/core/grappler/graph_view.cc
index bf8a98a722a1bb87ecf9c3c625a16145d74f9b01..0d3f94854b65cfc06c3d68fc5ac7bc3aa68f9a34 100644
--- a/tensorflow/core/grappler/graph_view.cc
+++ b/tensorflow/core/grappler/graph_view.cc
@@ -24,7 +24,7 @@ GraphView::GraphView(GraphDef* graph) : graph_(graph) {
     auto node = graph_->mutable_node(i);
     auto rslt = nodes_.insert(std::make_pair(node->name(), node));
     // Check that the graph doesn't contain multiple nodes with the same name.
-    CHECK(rslt.second);
+    CHECK(rslt.second) << "Non unique node name detected: " << node->name();
   }
   for (NodeDef& node : *graph_->mutable_node()) {
     for (int i = 0; i < node.input_size(); ++i) {
@@ -38,6 +38,8 @@ GraphView::GraphView(GraphDef* graph) : graph_(graph) {
         input.port_id = -1;
       } else {
         input.port_id = i;
+        num_regular_outputs_[fanin.node] =
+            std::max(num_regular_outputs_[fanin.node], fanin.port_id);
       }
 
       fanouts_[fanin].insert(input);
@@ -80,7 +82,7 @@ GraphView::GetFanout(const GraphView::OutputPort& port) const {
   return it->second;
 }
 
-const std::unordered_set<GraphView::OutputPort, GraphView::HashPort>
+std::unordered_set<GraphView::OutputPort, GraphView::HashPort>
 GraphView::GetFanin(const GraphView::InputPort& port) const {
   std::unordered_set<GraphView::OutputPort, GraphView::HashPort> result;
   if (port.port_id >= 0) {
@@ -118,5 +120,58 @@ const GraphView::OutputPort GraphView::GetRegularFanin(
   return fanin;
 }
 
+std::unordered_set<GraphView::InputPort, GraphView::HashPort>
+GraphView::GetFanouts(const NodeDef& node,
+                      bool include_controlled_nodes) const {
+  std::unordered_set<InputPort, HashPort> result;
+  OutputPort port;
+  port.node = const_cast<NodeDef*>(&node);
+  const int first_port_id = include_controlled_nodes ? -1 : 0;
+  auto it = num_regular_outputs_.find(&node);
+  const int last_port_id = (it != num_regular_outputs_.end()) ? it->second : -1;
+
+  for (int i = first_port_id; i <= last_port_id; ++i) {
+    port.port_id = i;
+    auto it = fanouts_.find(port);
+    if (it != fanouts_.end()) {
+      result.insert(it->second.begin(), it->second.end());
+    }
+  }
+  return result;
+}
+
+std::unordered_set<GraphView::OutputPort, GraphView::HashPort>
+GraphView::GetFanins(const NodeDef& node,
+                     bool include_controlling_nodes) const {
+  std::unordered_set<OutputPort, HashPort> result;
+  for (int i = 0; i < node.input_size(); ++i) {
+    OutputPort fanin;
+    string fanin_name = ParseNodeName(node.input(i), &fanin.port_id);
+    if (fanin.port_id < 0) {
+      if (!include_controlling_nodes) {
+        break;
+      }
+    }
+    auto it = nodes_.find(fanin_name);
+    if (it != nodes_.end()) {
+      fanin.node = it->second;
+      result.insert(fanin);
+    }
+  }
+  return result;
+}
+
+int GraphView::NumFanins(const NodeDef& node,
+                         bool include_controlling_nodes) const {
+  int count = 0;
+  for (const string& input : node.input()) {
+    if (!include_controlling_nodes && IsControlInput(input)) {
+      break;
+    }
+    count += 1;
+  }
+  return count;
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/graph_view.h b/tensorflow/core/grappler/graph_view.h
index a24310ad1a40b7e84e2fa67686c1bf0575ac5881..173ce9c09c2fd98d855a801131ed16a796d9caac 100644
--- a/tensorflow/core/grappler/graph_view.h
+++ b/tensorflow/core/grappler/graph_view.h
@@ -29,8 +29,8 @@ namespace grappler {
 class GraphView {
  public:
   struct Port {
-    NodeDef* node;
-    int port_id;
+    NodeDef* node = nullptr;
+    int port_id = -1;
 
     bool operator==(const Port& other) const {
       return node == other.node && port_id == other.port_id;
@@ -46,24 +46,39 @@ class GraphView {
   };
 
   explicit GraphView(GraphDef* graph);
+  GraphDef* GetGraph() const { return graph_; }
   NodeDef* GetNode(const string& node_name) const;
   // Get the specified input port. Note that the special '-1' port_id can be
   // used to access the controlling nodes (i.e. the nodes connected to node_name
   // through an incoming control dependency).
   InputPort GetInputPort(const string& node_name, int port_id) const;
-  // Get the specified input port. Note that the special '-1' port_id can be
+  // Get the specified output port. Note that the special '-1' port_id can be
   // used to access the controlled nodes (i.e. the nodes connected to node_name
   // through an outgoing control dependency).
-
-  // Special case: regular (i.e. non-control) ports can only have one fanin.
   OutputPort GetOutputPort(const string& node_name, int port_id) const;
 
+  // Get the input (resp. output) port(s) in the immediate fanout (resp. fanin)
+  // of an output (resp. input) port.
   const std::unordered_set<InputPort, HashPort>& GetFanout(
       const OutputPort& port) const;
-  const std::unordered_set<OutputPort, HashPort> GetFanin(
+  std::unordered_set<OutputPort, HashPort> GetFanin(
       const InputPort& port) const;
+  // Special case: regular (i.e. non-control) input ports can only have one
+  // fanin.
   const OutputPort GetRegularFanin(const InputPort& port) const;
 
+  // Get all the input (resp. output) ports in the immediate fanout (resp fanin)
+  // of a node. Include the controlling nodes iff include_controlling_nodes is
+  // true.
+  std::unordered_set<InputPort, HashPort> GetFanouts(
+      const NodeDef& node, bool include_controlled_nodes) const;
+  std::unordered_set<OutputPort, HashPort> GetFanins(
+      const NodeDef& node, bool include_controlling_nodes) const;
+
+  // Get the number of ports in the immediate fanin of a node. Count the
+  // controlling nodes iff include_controlling_nodes is true.
+  int NumFanins(const NodeDef& node, bool include_controlling_nodes) const;
+
  private:
   GraphDef* graph_;
   std::unordered_map<string, NodeDef*> nodes_;
@@ -71,7 +86,7 @@ class GraphView {
   std::unordered_map<OutputPort, std::unordered_set<InputPort, HashPort>,
                      HashPort>
       fanouts_;
-  std::unordered_map<NodeDef*, std::unordered_set<NodeDef*>> controlled_nodes_;
+  std::unordered_map<const NodeDef*, int> num_regular_outputs_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/graph_view_test.cc b/tensorflow/core/grappler/graph_view_test.cc
index 15bed07d017a18d53973da012e7add4085380a74..958eb921fb72c8ff16657c1c7d6269878a5a69ae 100644
--- a/tensorflow/core/grappler/graph_view_test.cc
+++ b/tensorflow/core/grappler/graph_view_test.cc
@@ -58,6 +58,22 @@ TEST_F(GraphViewTest, BasicGraph) {
       EXPECT_FALSE(true);
     }
   }
+
+  const NodeDef* add_node = graph.GetNode("AddN");
+  EXPECT_NE(nullptr, add_node);
+  string fanouts;
+  for (const auto& fo : graph.GetFanouts(*add_node, false)) {
+    strings::StrAppend(&fanouts,
+                       strings::StrCat(fo.node->name(), ":", fo.port_id, " "));
+  }
+  EXPECT_EQ("AddN_2:0 AddN_3:0 ", fanouts);
+
+  string fanins;
+  for (const auto& fi : graph.GetFanins(*add_node, false)) {
+    strings::StrAppend(&fanins,
+                       strings::StrCat(fi.node->name(), ":", fi.port_id, " "));
+  }
+  EXPECT_EQ("Square_1:0 Square:0 ", fanins);
 }
 
 TEST_F(GraphViewTest, ControlDependencies) {
diff --git a/tensorflow/core/grappler/grappler_item.cc b/tensorflow/core/grappler/grappler_item.cc
index 149f6fc7353b3c96e9d780c20697873c15bccaa8..2f8549cf395f6b78154f7a6faf3fea06ea6c56c4 100644
--- a/tensorflow/core/grappler/grappler_item.cc
+++ b/tensorflow/core/grappler/grappler_item.cc
@@ -134,6 +134,7 @@ std::vector<const NodeDef*> ComputeTransitiveFanin(
     const NodeDef* node = name_to_node[NodeName(root)];
     if (!node) {
       *ill_formed = true;
+      VLOG(2) << "ComputeTransitiveFanin: problem with root node: " << root;
       return {};
     }
     queue.push_back(node);
@@ -153,6 +154,7 @@ std::vector<const NodeDef*> ComputeTransitiveFanin(
     for (const string& input : node->input()) {
       const NodeDef* in = name_to_node[NodeName(input)];
       if (!in) {
+        VLOG(2) << "ComputeTransitiveFanin: problem with node: " << input;
         *ill_formed = true;
         return {};
       }
diff --git a/tensorflow/core/grappler/grappler_item_builder.cc b/tensorflow/core/grappler/grappler_item_builder.cc
index 36c7f92c49e74244494af1c4ba5e45c23f6f49b0..7ba498dd06409635d7dfc282ab29f1133e299c9b 100644
--- a/tensorflow/core/grappler/grappler_item_builder.cc
+++ b/tensorflow/core/grappler/grappler_item_builder.cc
@@ -126,9 +126,6 @@ Status OptimizeGraph(const GraphDef& graph_def_arg, GraphDef* output_graph_def,
   graph_ctor_opts.allow_internal_ops = true;
   graph_ctor_opts.expect_device_spec = false;
   std::unique_ptr<Graph> graphptr(new Graph(function_library));
-  // Populate default attrs to the NodeDefs in the GraphDef.
-  TF_RETURN_IF_ERROR(
-      AddDefaultAttrsToGraphDef(&graph_def, *graphptr->op_registry(), 0));
 
   TF_RETURN_IF_ERROR(
       ConvertGraphDefToGraph(graph_ctor_opts, graph_def, graphptr.get()));
@@ -138,7 +135,10 @@ Status OptimizeGraph(const GraphDef& graph_def_arg, GraphDef* output_graph_def,
   optimizer.Optimize(flr, env, devices[0], &graphptr, /*shape_map=*/nullptr);
   graphptr->ToGraphDef(output_graph_def);
 
-  return Status::OK();
+  // The default values of attributes might have been stripped by the optimizer.
+  // Add them back.
+  return AddDefaultAttrsToGraphDef(output_graph_def, *graphptr->op_registry(),
+                                   0);
 }
 
 // Applies the same graph pruning logic to the graph as Session.Run in TF.
@@ -173,7 +173,7 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
                  << ", skipping this input.";
       return nullptr;
     }
-    LOG(INFO) << "Will use feed node " << feed_name;
+    VLOG(1) << "Will use feed node " << feed_name;
     new_item->feed.emplace_back(feed_name, Tensor());
   }
 
@@ -188,7 +188,7 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
                      << ", skipping this input";
           return nullptr;
         }
-        LOG(INFO) << "Will use fetch node " << name;
+        VLOG(1) << "Will use fetch node " << name;
         new_item->fetch.push_back(name);
       }
     }
@@ -449,6 +449,18 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
     new_item->save_restore_loc_tensor = saver.filename_tensor_name();
   }
 
+  // Instantiate all the missing attributes with their default values.
+  Status attr_status = AddDefaultAttrsToGraphDef(
+      &new_item->graph,
+      FunctionLibraryDefinition(OpRegistry::Global(),
+                                new_item->graph.library()),
+      0);
+  if (!attr_status.ok()) {
+    LOG(ERROR) << "Failed to instantiate default attribute values: "
+               << attr_status.error_message();
+    return nullptr;
+  }
+
   // Optimize the graph (function inlining, l1 optimizations, etc).
   VLOG(1) << "Number of nodes in graph before OptimizeGraph: "
           << new_item->graph.node_size();
@@ -498,5 +510,113 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
   return new_item;
 }
 
+std::unique_ptr<GrapplerItem> GrapplerItemFromFunctionDef(
+    const FunctionDef& func,
+    const std::unordered_map<string, AttrValue>& func_attr,
+    const FunctionDefLibrary& library) {
+  if (func.signature().name().empty()) {
+    LOG(ERROR) << "function name must be specified.";
+    return nullptr;
+  }
+  std::unique_ptr<GrapplerItem> new_item(new GrapplerItem());
+  new_item->id = func.signature().name();
+
+  std::unordered_map<string, string> port_map;
+
+  // Add the function inputs as placeholder
+  for (const auto& inp : func.signature().input_arg()) {
+    NodeDef* ph = new_item->graph.add_node();
+    ph->set_name(inp.name());
+    ph->set_op("Placeholder");
+    if (inp.type() != DT_INVALID) {
+      (*ph->mutable_attr())["T"].set_type(inp.type());
+    } else {
+      auto it = func_attr.find(inp.type_attr());
+      if (it == func_attr.end()) {
+        LOG(ERROR) << "Unknown type attribute " << inp.type_attr()
+                   << " for function input " << inp.name();
+        return nullptr;
+      } else {
+        (*ph->mutable_attr())["T"] = it->second;
+      }
+    }
+    port_map[inp.name()] = inp.name();
+  }
+
+  // Add the function body to the graph.
+  FunctionLibraryDefinition func_def(OpRegistry::Global(), library);
+
+  for (const NodeDef& node : func.node_def()) {
+    NodeDef* new_node = new_item->graph.add_node();
+    *new_node = node;
+    // Replace the placeholder attribute values with the specified value.
+    for (auto& attr : *new_node->mutable_attr()) {
+      const string& ph_name = attr.second.placeholder();
+      auto it = func_attr.find(ph_name);
+      if (it != func_attr.end()) {
+        attr.second = it->second;
+      }
+    }
+
+    // Functions use a custom format to encode connectivity. Map these custom
+    // strings to regular ones.
+    const OpRegistrationData* registration;
+    Status status = func_def.LookUp(node.op(), &registration);
+    if (!status.ok()) {
+      LOG(ERROR) << "Op " << node.op() << " not registered: " << status;
+      return nullptr;
+    }
+
+    tensorflow::NameRangeMap inputs;
+    tensorflow::NameRangeMap outputs;
+    status = tensorflow::NameRangesForNode(node, registration->op_def, &inputs,
+                                           &outputs);
+    if (!status.ok()) {
+      LOG(ERROR) << "Op " << node.op() << " invalid: " << status;
+      return nullptr;
+    }
+    for (const auto& name_range : outputs) {
+      string port_prefix =
+          strings::StrCat(node.name(), ":", name_range.first, ":");
+      int index_start = name_range.second.first;
+      int index_end = name_range.second.second;
+      for (int i = index_start; i < index_end; ++i) {
+        string port_id = strings::StrCat(port_prefix, i - index_start);
+        string port_name = strings::StrCat(node.name(), ":", i);
+        port_map[port_id] = port_name;
+      }
+    }
+  }
+
+  for (auto& node : *new_item->graph.mutable_node()) {
+    // Rewrite the inputs to use the normal naming convention.
+    for (int i = 0; i < node.input_size(); ++i) {
+      const string& input = node.input(i);
+      if (IsControlInput(input)) {
+        // No need to remap control dependencies.
+        continue;
+      } else {
+        auto it = port_map.find(input);
+        if (it == port_map.end()) {
+          LOG(ERROR) << "Unknown input: " << input;
+          return nullptr;
+        }
+        node.set_input(i, it->second);
+      }
+    }
+  }
+
+  // Add the function outputs to the list of fetch nodes.
+  for (const auto& out : func.signature().output_arg()) {
+    new_item->fetch.emplace_back(out.name());
+  }
+  // Add the function inputs to the list of feeds.
+  for (const auto& inp : func.signature().input_arg()) {
+    new_item->feed.emplace_back(inp.name(), Tensor());
+  }
+
+  return new_item;
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/grappler_item_builder.h b/tensorflow/core/grappler/grappler_item_builder.h
index 85151aabea107d40d7770da2ec398c1d305355a4..e892a3f556f7e9ccba91d5ce672a12d2eac49f5a 100644
--- a/tensorflow/core/grappler/grappler_item_builder.h
+++ b/tensorflow/core/grappler/grappler_item_builder.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <memory>
 #include <set>
 #include <string>
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 
 namespace tensorflow {
@@ -57,6 +58,13 @@ struct ItemConfig {
 std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
     const string& id, const MetaGraphDef& meta_graph, const ItemConfig& cfg);
 
+// Factory method for creating a GrapplerItem from a FunctionDef.
+// Returns nullptr if the given function def cannot be converted.
+std::unique_ptr<GrapplerItem> GrapplerItemFromFunctionDef(
+    const FunctionDef& func,
+    const std::unordered_map<string, AttrValue>& func_attr,
+    const FunctionDefLibrary& library);
+
 }  // end namespace grappler
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/grappler/grappler_item_builder_test.cc b/tensorflow/core/grappler/grappler_item_builder_test.cc
index 4272179d3cbef35362dc3330b5d1b3076df9bdb1..68437b60419f73419bca4467b409818bc0b11650 100644
--- a/tensorflow/core/grappler/grappler_item_builder_test.cc
+++ b/tensorflow/core/grappler/grappler_item_builder_test.cc
@@ -19,8 +19,10 @@ limitations under the License.
 #include "tensorflow/cc/gradients/grad_testutil.h"
 #include "tensorflow/cc/ops/functional_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
@@ -253,6 +255,230 @@ TEST_F(GrapplerItemBuilderTest, AssetFilepathOverrideTest_FileNotAccessible) {
   ASSERT_TRUE(item == nullptr);
 }
 
+TEST_F(GrapplerItemBuilderTest, GraphWithFunctions) {
+  MetaGraphDef meta_graph;
+  // y = XTimesTwo(x)
+  constexpr char device[] = "/cpu:0";
+  *meta_graph.mutable_graph_def() = test::function::GDef(
+      {test::function::NDef("x", "Const", {}, {{"dtype", DT_FLOAT}}, device),
+       test::function::NDef("y", "XTimesTwo", {"x"}, {{"T", DT_FLOAT}},
+                            device)},
+      // FunctionLib
+      {
+          test::function::XTimesTwo(),
+      });
+
+  CollectionDef train_op;
+  train_op.mutable_node_list()->add_value("y");
+  (*meta_graph.mutable_collection_def())["train_op"] = train_op;
+
+  ItemConfig cfg;
+  cfg.inline_functions = false;
+
+  std::unique_ptr<GrapplerItem> item =
+      GrapplerItemFromMetaGraphDef("0", meta_graph, cfg);
+  ASSERT_TRUE(item != nullptr);
+}
+
+TEST_F(GrapplerItemBuilderTest, FromSimpleFunctionDef) {
+  const Tensor kTwo = test::AsScalar<int64>(2);
+  FunctionDef func = FunctionDefHelper::Define(
+      // Name
+      "XTimesTwo",
+      // Args
+      {"x: T"},
+      // Return values
+      {"y: T"},
+      // Attr def
+      {"T: {float, double, int32, int64}"},
+      // Nodes
+      {
+          {{"two"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_INT64}}},
+          {{"scale"}, "Cast", {"two"}, {{"SrcT", DT_INT64}, {"DstT", "$T"}}},
+          {{"y"}, "Mul", {"x", "scale"}, {{"T", "$T"}}},
+      });
+
+  std::unordered_map<string, AttrValue> func_attr;
+  func_attr["T"].set_type(DT_FLOAT);
+  FunctionDefLibrary library;
+  std::unique_ptr<GrapplerItem> item =
+      GrapplerItemFromFunctionDef(func, func_attr, library);
+  CHECK(item);
+  EXPECT_EQ("XTimesTwo", item->id);
+  EXPECT_EQ(4, item->graph.node_size());
+  EXPECT_EQ(std::vector<string>({"y"}), item->fetch);
+  EXPECT_EQ(1, item->feed.size());
+  EXPECT_EQ("x", item->feed[0].first);
+
+  for (const NodeDef &node : item->graph.node()) {
+    if (node.name() == "x") {
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "two") {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "scale") {
+      EXPECT_EQ("Cast", node.op());
+      EXPECT_EQ(DT_FLOAT, node.attr().at("DstT").type());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("two:0", node.input(0));
+    } else if (node.name() == "y") {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("scale:0", node.input(1));
+    }
+  }
+}
+
+TEST_F(GrapplerItemBuilderTest, FromFunctionDefWithMultiOutputNodes) {
+  // Gradient graph for the Subtract operation
+  std::vector<FunctionDefHelper::Node> nodes = {
+      {{"sx"}, "Shape", {"x"}},
+      {{"sy"}, "Shape", {"y"}},
+      {{"gx"}, "Identity", {"dz"}},
+      {{"gy"}, "Neg", {"dz"}},
+      {{"rx", "ry"}, "BroadcastGradientArgs", {"sx", "sy"}},
+      {{"sum_gx"}, "Sum", {"gx", "rx"}},
+      {{"dx"}, "Reshape", {"sum_gx", "sx"}},
+      {{"sum_gy"}, "Sum", {"gy", "ry"}},
+      {{"dy"}, "Reshape", {"sum_gy", "sy"}},
+  };
+
+  for (auto &n : nodes) {
+    // "BroadcastGradientArgs" doesn't need any attrs.
+    if (n.attr.empty() && n.op != "BroadcastGradientArgs") {
+      n.attr = {{"T", "$T"}};
+    }
+  }
+  FunctionDef func = FunctionDefHelper::Define(
+      // Name
+      "SubGrad",
+      // Arg defs
+      {"x: T", "y: T", "dz: T"},
+      // Ret val defs
+      {"dx: T", "dy: T"},
+      // Attr defs
+      {{"T: {half, float, double}"}},
+      // Nodes
+      nodes);
+
+  std::unordered_map<string, AttrValue> func_attr;
+  func_attr["T"].set_type(DT_FLOAT);
+  FunctionDefLibrary library;
+  std::unique_ptr<GrapplerItem> item =
+      GrapplerItemFromFunctionDef(func, func_attr, library);
+  CHECK(item);
+  EXPECT_EQ("SubGrad", item->id);
+  EXPECT_EQ(12, item->graph.node_size());
+  EXPECT_EQ(std::vector<string>({"dx", "dy"}), item->fetch);
+  EXPECT_EQ(3, item->feed.size());
+  EXPECT_EQ("x", item->feed[0].first);
+  EXPECT_EQ("y", item->feed[1].first);
+  EXPECT_EQ("dz", item->feed[2].first);
+
+  for (const NodeDef &node : item->graph.node()) {
+    if (node.name() == "x" || node.name() == "y" || node.name() == "dz") {
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "rx") {
+      EXPECT_EQ("BroadcastGradientArgs", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("sx:0", node.input(0));
+      EXPECT_EQ("sy:0", node.input(1));
+    } else if (node.name() == "sum_gx") {
+      EXPECT_EQ("Sum", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("gx:0", node.input(0));
+      EXPECT_EQ("rx:0", node.input(1));
+    } else if (node.name() == "sum_gy") {
+      EXPECT_EQ("Sum", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("gy:0", node.input(0));
+      EXPECT_EQ("rx:1", node.input(1));
+    }
+  }
+}
+
+TEST_F(GrapplerItemBuilderTest, FromFunctionDefWithNestedFuncs) {
+  FunctionDefLibrary library;
+  *library.add_function() = FunctionDefHelper::Define(
+      // Name
+      "Swap",
+      // Args
+      {"i0: T", "i1: T"},
+      // Return values
+      {"o0: T", "o1: T"},
+      // Attr def
+      {"T: {float, double}"},
+      // Nodes
+      {{{"o0"}, "Identity", {"i1"}, {{"T", "$T"}}},
+       {{"o1"}, "Identity", {"i0"}, {{"T", "$T"}}}});
+
+  FunctionDef func = FunctionDefHelper::Create(
+      // Name
+      "ManySwapsFirst",
+      // Args
+      {"x: float", "y: float"},
+      // Return values
+      {"o: float"},
+      // attr def
+      {},
+      // Nodes
+      // o = x*x + y*y.  Furthermore, The 1st swap depends on x2, and
+      // y2 depends on the 2nd swap.  The 2nd swap has data dependency
+      // on the 1st swap.
+      {{{"a0"}, "Swap", {"x", "y"}, {{"T", DT_FLOAT}}, {"x2"}},
+       {{"a1"}, "Swap", {"a0:o0:0", "a0:o1:0"}, {{"T", DT_FLOAT}}},
+       {{"x2"}, "Mul", {"x", "x"}, {{"T", DT_FLOAT}}},
+       {{"y2"}, "Mul", {"y", "y"}, {{"T", DT_FLOAT}}, {"a1"}},
+       {{"o"}, "Add", {"x2:z:0", "y2:z:0"}, {{"T", DT_FLOAT}}}},
+      {{"o", "o:z:0"}});
+
+  std::unordered_map<string, AttrValue> func_attr;
+  func_attr["T"].set_type(DT_FLOAT);
+  std::unique_ptr<GrapplerItem> item =
+      GrapplerItemFromFunctionDef(func, func_attr, library);
+
+  for (const NodeDef &node : item->graph.node()) {
+    if (node.name() == "x" || node.name() == "y") {
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "a0") {
+      EXPECT_EQ("Swap", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+      EXPECT_EQ("^x2", node.input(2));
+    } else if (node.name() == "a1") {
+      EXPECT_EQ("Swap", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("a0:0", node.input(0));
+      EXPECT_EQ("a0:1", node.input(1));
+    } else if (node.name() == "x2") {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("x", node.input(1));
+    } else if (node.name() == "y2") {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("y", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+      EXPECT_EQ("^a1", node.input(2));
+    } else if (node.name() == "o") {
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x2:0", node.input(0));
+      EXPECT_EQ("y2:0", node.input(1));
+    }
+  }
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/inputs/file_input_yielder.h b/tensorflow/core/grappler/inputs/file_input_yielder.h
index a17e1c9ff2a5e1521250e604192d21650732e795..b597319261011e2537848a34167f69cf1e3002f0 100644
--- a/tensorflow/core/grappler/inputs/file_input_yielder.h
+++ b/tensorflow/core/grappler/inputs/file_input_yielder.h
@@ -18,8 +18,8 @@ limitations under the License.
 // that may be stored in the checkpoint are not restored in order to speedup the
 // initialization.
 
-#ifndef LEARNING_BRAIN_EXPERIMENTAL_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
-#define LEARNING_BRAIN_EXPERIMENTAL_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
+#ifndef TENSORFLOW_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
+#define TENSORFLOW_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
 
 #include <stddef.h>
 #include <limits>
@@ -53,4 +53,4 @@ class FileInputYielder : public InputYielder {
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // LEARNING_BRAIN_EXPERIMENTAL_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
+#endif  // TENSORFLOW_GRAPPLER_INPUTS_FILE_INPUT_YIELDER_H_
diff --git a/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc b/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc
index 6d25556770d13058ba65045eff787b12c0ca12de..ec54bd5c7598a5acb5bf653bb2902f6c3aba38f6 100644
--- a/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc
+++ b/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc
@@ -31,8 +31,6 @@ namespace {
 GraphDef CreateGraphDef(int num_stages, int width, int tensor_size,
                         bool use_multiple_devices, bool insert_queue,
                         const std::vector<string>& device_names) {
-  CHECK_GE(device_names.size(), width);
-
   using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
 
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -49,13 +47,17 @@ GraphDef CreateGraphDef(int num_stages, int width, int tensor_size,
     std::vector<Output> this_stage;
     for (int j = 0; j < width; j++) {
       if (last_stage.size() == 1) {
-        Output unary_op =
-            Square(s.WithDevice(device_names[use_multiple_devices ? j : 0]),
-                   last_stage[0]);
+        Output unary_op = Square(
+            s.WithDevice(
+                device_names[use_multiple_devices ? j % device_names.size()
+                                                  : 0]),
+            last_stage[0]);
         this_stage.push_back(unary_op);
       } else {
         Output combine =
-            AddN(s.WithDevice(device_names[use_multiple_devices ? j : 0]),
+            AddN(s.WithDevice(
+                     device_names[use_multiple_devices ? j % device_names.size()
+                                                       : 0]),
                  last_stage);
         this_stage.push_back(combine);
       }
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index 15fcaa857e3024bc91a7629fc14d10d57c624c8e..fdf4540540b4b9f3d64ea767240ca4ea0c353d48 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <unordered_set>
 
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/op_types.h"
@@ -25,21 +26,58 @@ namespace tensorflow {
 namespace grappler {
 
 bool IsAdd(const NodeDef& node) {
-  return node.op() == "Add" || node.op() == "AddV2";
+  if (node.op() == "AddV2" || node.op() == "Add") {
+    DataType type = node.attr().at("T").type();
+    return type != DT_STRING;
+  }
+  return false;
 }
 
 bool IsAddN(const NodeDef& node) { return node.op() == "AddN"; }
 
+bool IsAll(const NodeDef& node) { return node.op() == "All"; }
+
+bool IsAngle(const NodeDef& node) { return node.op() == "Angle"; }
+
+bool IsAny(const NodeDef& node) { return node.op() == "Any"; }
+
+bool IsAnyDiv(const NodeDef& node) {
+  return node.op() == "RealDiv" || node.op() == "Div" ||
+         node.op() == "FloorDiv" || node.op() == "TruncateDiv";
+}
+
+bool IsApproximateEqual(const NodeDef& node) {
+  return node.op() == "ApproximateEqual";
+}
+
 bool IsAvgPoolGrad(const NodeDef& node) { return node.op() == "AvgPoolGrad"; }
 
 bool IsAssert(const NodeDef& node) { return node.op() == "Assert"; }
 
+bool IsAtan2(const NodeDef& node) { return node.op() == "Atan2"; }
+
+bool IsBetainc(const NodeDef& node) { return node.op() == "Betainc"; }
+
+bool IsBiasAdd(const NodeDef& node) {
+  return node.op() == "BiasAdd" || node.op() == "BiasAddV1";
+}
+
 bool IsBiasAddGrad(const NodeDef& node) { return node.op() == "BiasAddGrad"; }
 
+bool IsBitcast(const NodeDef& node) { return node.op() == "Bitcast"; }
+
+bool IsCast(const NodeDef& node) { return node.op() == "Cast"; }
+
+bool IsComplex(const NodeDef& node) { return node.op() == "Complex"; }
+
+bool IsComplexAbs(const NodeDef& node) { return node.op() == "ComplexAbs"; }
+
 bool IsConcatOffset(const NodeDef& node) { return node.op() == "ConcatOffset"; }
 
 bool IsConstant(const NodeDef& node) { return node.op() == "Const"; }
 
+bool IsConj(const NodeDef& node) { return node.op() == "Conj"; }
+
 bool IsConv2D(const NodeDef& node) { return node.op() == "Conv2D"; }
 
 bool IsConv2DBackpropFilter(const NodeDef& node) {
@@ -69,20 +107,39 @@ bool IsDequeueOp(const NodeDef& node) {
          op == "QueueDequeueUpToV2" || op == "QueueDequeueUpTo";
 }
 
+bool IsDiv(const NodeDef& node) { return node.op() == "Div"; }
+
+bool IsEluGrad(const NodeDef& node) { return node.op() == "EluGrad"; }
+
 bool IsEnter(const NodeDef& node) {
   const auto& op = node.op();
   return op == "Enter" || op == "RefEnter";
 }
 
+bool IsEqual(const NodeDef& node) { return node.op() == "Equal"; }
+
 bool IsExit(const NodeDef& node) {
   const auto& op = node.op();
   return op == "Exit" || op == "RefExit";
 }
 
+bool IsFill(const NodeDef& node) { return node.op() == "Fill"; }
+
+bool IsFloorDiv(const NodeDef& node) { return node.op() == "FloorDiv"; }
+
 bool IsFloorMod(const NodeDef& node) { return node.op() == "FloorMod"; }
 
-bool IsFusedBatchNormGradV1(const NodeDef& node) {
-  return node.op() == "FusedBatchNormGrad";
+bool IsFusedBatchNormGrad(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "FusedBatchNormGrad" || op == "FusedBatchNormGradV2";
+}
+
+bool IsGreater(const NodeDef& node) { return node.op() == "Greater"; }
+
+bool IsGreaterEqual(const NodeDef& node) { return node.op() == "GreaterEqual"; }
+
+bool IsHistogramSummary(const NodeDef& node) {
+  return node.op() == "HistogramSummary";
 }
 
 bool IsIdentity(const NodeDef& node) {
@@ -90,33 +147,97 @@ bool IsIdentity(const NodeDef& node) {
   return op == "Identity" || op == "RefIdentity";
 }
 
+bool IsIdentityN(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "IdentityN";
+}
+
+bool IsIgamma(const NodeDef& node) { return node.op() == "Igamma"; }
+
+bool IsIgammac(const NodeDef& node) { return node.op() == "Igammac"; }
+
+bool IsImag(const NodeDef& node) { return node.op() == "Imag"; }
+
+bool IsInvGrad(const NodeDef& node) { return node.op() == "InvGrad"; }
+
+bool IsLess(const NodeDef& node) { return node.op() == "Less"; }
+
+bool IsLessEqual(const NodeDef& node) { return node.op() == "LessEqual"; }
+
+bool IsLogicalAnd(const NodeDef& node) { return node.op() == "LogicalAnd"; }
+
+bool IsLogicalNot(const NodeDef& node) { return node.op() == "LogicalNot"; }
+
+bool IsLogicalOr(const NodeDef& node) { return node.op() == "LogicalOr"; }
+
+bool IsMatMul(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "MatMul" || op == "BatchMatMul" || op == "QuantizedMatMul" ||
+         op == "SparseMatMul";
+}
+
+bool IsMax(const NodeDef& node) { return node.op() == "Max"; }
+
+bool IsMaximum(const NodeDef& node) { return node.op() == "Maximum"; }
+
+bool IsMean(const NodeDef& node) { return node.op() == "Mean"; }
+
 bool IsMerge(const NodeDef& node) {
-  const auto op = node.op();
+  const auto& op = node.op();
   return op == "Merge" || op == "RefMerge";
 }
 
+bool IsMin(const NodeDef& node) { return node.op() == "Min"; }
+
+bool IsMinimum(const NodeDef& node) { return node.op() == "Minimum"; }
+
+bool IsMirrorPad(const NodeDef& node) { return node.op() == "MirrorPad"; }
+
+bool IsMirrorPadGrad(const NodeDef& node) {
+  return node.op() == "MirrorPadGrad";
+}
+
+bool IsMod(const NodeDef& node) { return node.op() == "Mod"; }
+
 bool IsMul(const NodeDef& node) { return node.op() == "Mul"; }
 
 bool IsNoOp(const NodeDef& node) { return node.op() == "NoOp"; }
 
+bool IsNotEqual(const NodeDef& node) { return node.op() == "NotEqual"; }
+
 bool IsNextIteration(const NodeDef& node) {
   const auto& op = node.op();
   return op == "NextIteration" || op == "RefNextIteration";
 }
 
-bool IsPad(const NodeDef& node) { return node.op() == "Pad"; }
+bool IsPad(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "Pad" || op == "PadV2";
+}
 
 bool IsPlaceholder(const NodeDef& node) {
-  const auto op = node.op();
+  const auto& op = node.op();
   return op == "Placeholder" || op == "PlaceholderV2" ||
          op == "PlaceholderWithDefault";
 }
 
+bool IsPolygamma(const NodeDef& node) { return node.op() == "Polygamma"; }
+
+bool IsPow(const NodeDef& node) { return node.op() == "Pow"; }
+
+bool IsProd(const NodeDef& node) { return node.op() == "Prod"; }
+
+bool IsReal(const NodeDef& node) { return node.op() == "Real"; }
+
 bool IsRealDiv(const NodeDef& node) { return node.op() == "RealDiv"; }
 
-bool IsReluGrad(const NodeDef& node) { return node.op() == "ReluGrad"; }
+bool IsReciprocalGrad(const NodeDef& node) {
+  return node.op() == "ReciprocalGrad";
+}
 
-bool IsRecv(const NodeDef& node) { return node.op() == "_Recv"; }
+bool IsRecv(const NodeDef& node) {
+  return node.op() == "_Recv" || node.op() == "_HostRecv";
+}
 
 bool IsReduction(const NodeDef& node) {
   const auto& op = node.op();
@@ -124,6 +245,10 @@ bool IsReduction(const NodeDef& node) {
          op == "Mean" || op == "Any" || op == "All";
 }
 
+bool IsReluGrad(const NodeDef& node) { return node.op() == "ReluGrad"; }
+
+bool IsRelu6Grad(const NodeDef& node) { return node.op() == "Relu6Grad"; }
+
 bool IsReshape(const NodeDef& node) { return (node.op() == "Reshape"); }
 
 bool IsRestore(const NodeDef& node) {
@@ -131,12 +256,36 @@ bool IsRestore(const NodeDef& node) {
           node.op() == "RestoreSlice");
 }
 
-bool IsSend(const NodeDef& node) { return node.op() == "_Send"; }
+bool IsReverseV2(const NodeDef& node) { return node.op() == "ReverseV2"; }
+
+bool IsRsqrtGrad(const NodeDef& node) { return node.op() == "RsqrtGrad"; }
+
+bool IsSelect(const NodeDef& node) { return node.op() == "Select"; }
+
+bool IsSeluGrad(const NodeDef& node) { return node.op() == "SeluGrad"; }
+
+bool IsSend(const NodeDef& node) {
+  return node.op() == "_Send" || node.op() == "_HostSend";
+}
+
+bool IsShape(const NodeDef& node) { return node.op() == "Shape"; }
+
+bool IsShapeN(const NodeDef& node) { return node.op() == "ShapeN"; }
+
+bool IsSigmoidGrad(const NodeDef& node) { return node.op() == "SigmoidGrad"; }
 
 bool IsSlice(const NodeDef& node) { return node.op() == "Slice"; }
 
+bool IsSoftplusGrad(const NodeDef& node) { return node.op() == "SoftplusGrad"; }
+
+bool IsSoftsignGrad(const NodeDef& node) { return node.op() == "SoftsignGrad"; }
+
 bool IsSplit(const NodeDef& node) { return node.op() == "Split"; }
 
+bool IsSplitV(const NodeDef& node) { return node.op() == "SplitV"; }
+
+bool IsSqrtGrad(const NodeDef& node) { return node.op() == "SqrtGrad"; }
+
 bool IsSquaredDifference(const NodeDef& node) {
   return node.op() == "SquaredDifference";
 }
@@ -148,6 +297,12 @@ bool IsStopGradient(const NodeDef& node) {
   return op == "StopGradient" || op == "PreventGradient";
 }
 
+bool IsStridedSlice(const NodeDef& node) { return node.op() == "StridedSlice"; }
+
+bool IsStridedSliceGrad(const NodeDef& node) {
+  return node.op() == "StridedSliceGrad";
+}
+
 bool IsSub(const NodeDef& node) { return node.op() == "Sub"; }
 
 bool IsSum(const NodeDef& node) { return node.op() == "Sum"; }
@@ -157,14 +312,34 @@ bool IsSwitch(const NodeDef& node) {
   return op == "Switch" || op == "RefSwitch";
 }
 
+bool IsTanhGrad(const NodeDef& node) { return node.op() == "TanhGrad"; }
+
+bool IsTile(const NodeDef& node) { return node.op() == "Tile"; }
+
 bool IsTranspose(const NodeDef& node) { return node.op() == "Transpose"; }
 
+bool IsTruncateDiv(const NodeDef& node) { return node.op() == "TruncateDiv"; }
+
+bool IsTruncateMod(const NodeDef& node) { return node.op() == "TruncateMod"; }
+
 bool IsVariable(const NodeDef& node) {
   const auto& op = node.op();
   return op == "Variable" || op == "VariableV2" || op == "AutoReloadVariable" ||
          op == "VarHandleOp" || op == "ReadVariableOp";
 }
 
+bool IsZeta(const NodeDef& node) { return node.op() == "Zeta"; }
+
+namespace {
+bool GetBoolAttr(const NodeDef& node, const string& name) {
+  return node.attr().count(name) > 0 && node.attr().at(name).b();
+}
+}  // namespace
+
+bool IsPersistent(const NodeDef& node) {
+  return IsConstant(node) || IsVariable(node);
+}
+
 bool IsFreeOfSideEffect(const NodeDef& node) {
   // Placeholders must be preserved to keep the graph feedable.
   if (IsPlaceholder(node)) {
@@ -184,6 +359,10 @@ bool IsFreeOfSideEffect(const NodeDef& node) {
       return false;
     }
   }
+  // Some nodes do in-place updates on regular tensor inputs.
+  if (GetBoolAttr(node, "in_place") || GetBoolAttr(node, "inplace")) {
+    return false;
+  }
   return true;
 }
 
@@ -224,5 +403,10 @@ bool IsValuePreserving(const NodeDef& node) {
   return value_preserving_ops.count(node.op()) > 0;
 }
 
+bool HasOpDef(const NodeDef& node) {
+  const OpDef* op_def = nullptr;
+  return OpRegistry::Global()->LookUpOpDef(node.op(), &op_def).ok();
+}
+
 }  // namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index b1d81448afb8b2c311d216a9af5d07031ed36b98..9cda40c0a6515caa9754d0c2f4f50a32f9fe8d98 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -24,9 +24,22 @@ namespace grappler {
 
 bool IsAdd(const NodeDef& node);
 bool IsAddN(const NodeDef& node);
+bool IsAll(const NodeDef& node);
+bool IsAngle(const NodeDef& node);
+bool IsAny(const NodeDef& node);
+bool IsAnyDiv(const NodeDef& node);
+bool IsApproximateEqual(const NodeDef& node);
 bool IsAvgPoolGrad(const NodeDef& node);
 bool IsAssert(const NodeDef& node);
+bool IsAtan2(const NodeDef& node);
+bool IsBetainc(const NodeDef& node);
+bool IsBiasAdd(const NodeDef& node);
 bool IsBiasAddGrad(const NodeDef& node);
+bool IsBitcast(const NodeDef& node);
+bool IsCast(const NodeDef& node);
+bool IsComplex(const NodeDef& node);
+bool IsComplexAbs(const NodeDef& node);
+bool IsConj(const NodeDef& node);
 bool IsConcatOffset(const NodeDef& node);
 bool IsConstant(const NodeDef& node);
 bool IsConv2D(const NodeDef& node);
@@ -36,34 +49,86 @@ bool IsDepthwiseConv2dNative(const NodeDef& node);
 bool IsDepthwiseConv2dNativeBackpropFilter(const NodeDef& node);
 bool IsDepthwiseConv2dNativeBackpropInput(const NodeDef& node);
 bool IsDequeueOp(const NodeDef& node);
+bool IsDiv(const NodeDef& node);
+bool IsEluGrad(const NodeDef& node);
 bool IsEnter(const NodeDef& node);
+bool IsEqual(const NodeDef& node);
 bool IsExit(const NodeDef& node);
+bool IsFill(const NodeDef& node);
+bool IsFloorDiv(const NodeDef& node);
 bool IsFloorMod(const NodeDef& node);
-bool IsFusedBatchNormGradV1(const NodeDef& node);
+bool IsFusedBatchNormGrad(const NodeDef& node);
+bool IsGreater(const NodeDef& node);
+bool IsGreaterEqual(const NodeDef& node);
+bool IsHistogramSummary(const NodeDef& node);
 bool IsIdentity(const NodeDef& node);
+bool IsIdentityN(const NodeDef& node);
+bool IsIgamma(const NodeDef& node);
+bool IsIgammac(const NodeDef& node);
+bool IsImag(const NodeDef& node);
+bool IsInvGrad(const NodeDef& node);
+bool IsLess(const NodeDef& node);
+bool IsLessEqual(const NodeDef& node);
+bool IsLogicalAnd(const NodeDef& node);
+bool IsLogicalNot(const NodeDef& node);
+bool IsLogicalOr(const NodeDef& node);
+bool IsMax(const NodeDef& node);
+bool IsMaximum(const NodeDef& node);
+bool IsMean(const NodeDef& node);
 bool IsMerge(const NodeDef& node);
+bool IsMin(const NodeDef& node);
+bool IsMinimum(const NodeDef& node);
+bool IsMirrorPad(const NodeDef& node);
+bool IsMirrorPadGrad(const NodeDef& node);
+bool IsMod(const NodeDef& node);
 bool IsMul(const NodeDef& node);
+bool IsMatMul(const NodeDef& node);
 bool IsNextIteration(const NodeDef& node);
 bool IsPad(const NodeDef& node);
 bool IsNoOp(const NodeDef& node);
+bool IsNotEqual(const NodeDef& node);
 bool IsPlaceholder(const NodeDef& node);
+bool IsPolygamma(const NodeDef& node);
+bool IsProd(const NodeDef& node);
+bool IsPow(const NodeDef& node);
+bool IsReal(const NodeDef& node);
 bool IsRealDiv(const NodeDef& node);
+bool IsRelu6Grad(const NodeDef& node);
 bool IsReluGrad(const NodeDef& node);
+bool IsReciprocalGrad(const NodeDef& node);
 bool IsRecv(const NodeDef& node);
 bool IsReduction(const NodeDef& node);
 bool IsReshape(const NodeDef& node);
 bool IsRestore(const NodeDef& node);
+bool IsReverseV2(const NodeDef& node);
+bool IsRsqrtGrad(const NodeDef& node);
+bool IsSelect(const NodeDef& node);
+bool IsSeluGrad(const NodeDef& node);
 bool IsSend(const NodeDef& node);
 bool IsSlice(const NodeDef& node);
+bool IsShape(const NodeDef& node);
+bool IsShapeN(const NodeDef& node);
+bool IsSigmoidGrad(const NodeDef& node);
+bool IsSoftplusGrad(const NodeDef& node);
+bool IsSoftsignGrad(const NodeDef& node);
 bool IsSplit(const NodeDef& node);
+bool IsSplitV(const NodeDef& node);
+bool IsSqrtGrad(const NodeDef& node);
 bool IsSquaredDifference(const NodeDef& node);
 bool IsSqueeze(const NodeDef& node);
 bool IsStopGradient(const NodeDef& node);
+bool IsStridedSlice(const NodeDef& node);
+bool IsStridedSliceGrad(const NodeDef& node);
 bool IsSub(const NodeDef& node);
 bool IsSum(const NodeDef& node);
 bool IsSwitch(const NodeDef& node);
+bool IsTanhGrad(const NodeDef& node);
+bool IsTile(const NodeDef& node);
 bool IsTranspose(const NodeDef& node);
+bool IsTruncateDiv(const NodeDef& node);
+bool IsTruncateMod(const NodeDef& node);
 bool IsVariable(const NodeDef& node);
+bool IsZeta(const NodeDef& node);
 
 // Return true if the op is an aggregation (e.g. Add, AddN).
 // Returns false if it could not be determined to be so.
@@ -73,6 +138,10 @@ bool IsAggregate(const NodeDef& node);
 // Returns false if it could not be determined to be so.
 bool IsCommutative(const NodeDef& node);
 
+// Returns true if the node is known to use persistent memory to store its
+// value.
+bool IsPersistent(const NodeDef& node);
+
 bool IsFreeOfSideEffect(const NodeDef& node);
 bool ModifiesFrameInfo(const NodeDef& node);
 
@@ -85,6 +154,9 @@ bool IsInvolution(const NodeDef& node);
 // function returns true if the op commutes with all element-wise operations.
 bool IsValuePreserving(const NodeDef& node);
 
+// Returns true if we can find an opdef corresponding to the op of the node.
+bool HasOpDef(const NodeDef& node);
+
 }  // end namespace grappler
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 5d9eb8e0b128188be3cdcf8019cb0bca94c6b1cf..e839630605a96f1528114f98b88e90a7a20b0a3a 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -96,6 +96,7 @@ cc_library(
         ":graph_optimizer",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
@@ -124,6 +125,7 @@ tf_cc_test(
         "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/utils:grappler_test",
     ],
 )
 
@@ -138,6 +140,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
     ],
 )
@@ -212,6 +215,7 @@ cc_library(
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/utils:topological_sort",
     ],
 )
 
@@ -230,6 +234,7 @@ tf_cc_test(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+        "//tensorflow/core/grappler/utils:topological_sort",
     ],
 )
 
@@ -276,14 +281,17 @@ cc_library(
         ":graph_optimizer",
         ":graph_rewriter",
         ":static_schedule",
+        "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:graph_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler/costs:graph_memory",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:topological_sort",
+        "//tensorflow/core/grappler/utils:traversal",
     ],
 )
 
@@ -295,11 +303,13 @@ tf_cc_test(
         "//tensorflow/cc:cc_ops",
         "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensor_testutil",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:virtual_cluster",
+        "//tensorflow/core/grappler/utils:grappler_test",
     ],
 )
 
@@ -332,6 +342,11 @@ tf_cc_test(
     deps = [
         ":layout_optimizer",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/core:all_kernels",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -356,6 +371,7 @@ cc_library(
         ":dependency_optimizer",
         ":graph_optimizer",
         ":layout_optimizer",
+        ":loop_optimizer",
         ":memory_optimizer",
         ":model_pruner",
         "//tensorflow/core:framework",
@@ -365,3 +381,39 @@ cc_library(
         "//tensorflow/core/grappler/utils:topological_sort",
     ],
 )
+
+cc_library(
+    name = "loop_optimizer",
+    srcs = ["loop_optimizer.cc"],
+    hdrs = [
+        "loop_optimizer.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_optimizer",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/costs:graph_properties",
+    ],
+)
+
+tf_cc_test(
+    name = "loop_optimizer_test",
+    size = "small",
+    srcs = ["loop_optimizer_test.cc"],
+    deps = [
+        ":loop_optimizer",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+    ],
+)
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 3cfc4f61e43d51a178054926d97f3faeae7746ce..9c544c82bf7f77760e5a2090ca947fd7185e27b7 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -245,14 +245,34 @@ void SetSourceDataType(DataType dtype, NodeDef* node) {
   SetDataTypeToAttr(dtype, SourceDataTypeAttrName(*node), node);
 }
 
-bool IsNumberType(DataType dtype) {
-  DataTypeVector number_types = NumberTypes();
-  return std::find(number_types.begin(), number_types.end(), dtype) !=
-         number_types.end();
-}
+bool IsNumberType(DataType dtype) { return kNumberTypes.Contains(dtype); }
 
 const char kOutputShapesAttr[] = "_output_shapes";
 
+PartialTensorShape GetInputShape(const string& input, const NodeMap& node_map) {
+  int output_pos;
+  string node_name = ParseNodeName(input, &output_pos);
+  const NodeDef* input_node = node_map.GetNode(node_name);
+  return input_node->attr().at(kOutputShapesAttr).list().shape(output_pos);
+}
+
+bool ShapesEqual(const string& input_x, const string& input_y,
+                 const NodeMap& node_map) {
+  PartialTensorShape x_shape = GetInputShape(input_x, node_map);
+  PartialTensorShape y_shape = GetInputShape(input_y, node_map);
+  if (x_shape.unknown_rank() || y_shape.unknown_rank() ||
+      x_shape.dims() != y_shape.dims()) {
+    return false;
+  }
+  for (int i = 0; i < x_shape.dims(); ++i) {
+    if (x_shape.dim_size(i) == -1 || y_shape.dim_size(i) == -1 ||
+        x_shape.dim_size(i) != y_shape.dim_size(i)) {
+      return false;
+    }
+  }
+  return true;
+}
+
 // Returns whether `reshape` is an identity op. The tensor that `reshape`
 // reshapes is the `output_pos`-th output of node `input`.
 bool ReshapeIsIdentity(const NodeDef& reshape, const NodeDef& input,
@@ -416,23 +436,31 @@ bool UniqueNodes::SameNode(const NodeDef& node1, const NodeDef& node2) const {
   return true;
 }
 
+NodeDef* ArithmeticOptimizer::AddNode(const NodeDef& node, StringPiece suffix,
+                                      bool copy_node) {
+  return AddNode(OptimizedNodeName(node, suffix), copy_node ? &node : nullptr);
+}
+
 NodeDef* ArithmeticOptimizer::AddNode(const string& name,
                                       const NodeDef* node_to_copy) {
   NodeDef* new_node = optimized_graph_->add_node();
-  const string name_with_prefix =
-      AddPrefixToNodeName(name, kArithmeticOptimizer);
-  node_map_->AddNode(NodeName(name_with_prefix), new_node);
+  node_map_->AddNode(NodeName(name), new_node);
   if (node_to_copy != nullptr) {
-    new_node->CopyFrom(*node_to_copy);
+    *new_node = *node_to_copy;
   }
-  new_node->set_name(name_with_prefix);
+  new_node->set_name(name);
   return new_node;
 }
 
-bool ArithmeticOptimizer::OptimizedNodeExists(const string& name) {
-  const string name_with_prefix =
-      AddPrefixToNodeName(name, kArithmeticOptimizer);
-  return node_map_->NodeExists(name_with_prefix);
+string ArithmeticOptimizer::OptimizedNodeName(const NodeDef& node,
+                                              StringPiece suffix) const {
+  return AddPrefixToNodeName(strings::StrCat(node.name(), "_", suffix),
+                             kArithmeticOptimizer);
+}
+
+bool ArithmeticOptimizer::OptimizedNodeExists(const NodeDef& node,
+                                              StringPiece suffix) const {
+  return node_map_->NodeExists(OptimizedNodeName(node, suffix));
 }
 
 bool ArithmeticOptimizer::CanDedup(const NodeDef& node) const {
@@ -494,7 +522,7 @@ void ArithmeticOptimizer::DedupComputations() {
   } while (!stop);
 
   // Delete duplicates
-  if (!duplicates.empty()) {
+  if (fetch_nodes_known_ && !duplicates.empty()) {
     int last = optimized_graph_->node_size() - 1;
     for (auto it = duplicates.rbegin(); it != duplicates.rend(); ++it) {
       int index = *it;
@@ -608,12 +636,11 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     }
 
     // If the reshape is a no-op, forward its input to its consumers. This is
-    // considered aggressive and turned off by default, because users may state
-    // that the placeholder outputs tensors of shape [M, N] while feeding it
-    // with tensors of shape [M*N] (or worse). The reshape nodes are then
-    // necessary to update the tensor metadata to the required shape.
-    if (opt_level_ == RewriterConfig::AGGRESSIVE &&
-        ReshapeIsIdentity(*reshape, *input, output_pos)) {
+    // considered aggressive, because users may state that the placeholder
+    // outputs tensors of shape [M, N] while feeding it with tensors of shape
+    // [M*N] (or worse). The reshape nodes are then necessary to update the
+    // tensor metadata to the required shape.
+    if (ReshapeIsIdentity(*reshape, *input, output_pos)) {
       return reshape->input(0);
     }
   }
@@ -649,17 +676,19 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
         const DataType src_type = GetSourceDataType(*cast);
         const DataType dst_type = GetDestinationDataType(*cast);
         if (IsNumberType(src_type) && IsNumberType(dst_type) &&
-            DataTypeSize(src_type) < DataTypeSize(dst_type)) {
-          NodeDef* new_transpose =
-              AddNode(StrCat(transpose->name(), "_", DataTypeString(src_type)),
-                      transpose);
+            DataTypeSize(src_type) < DataTypeSize(dst_type) &&
+            !OptimizedNodeExists(*cast, DataTypeString(dst_type)) &&
+            !OptimizedNodeExists(*transpose, DataTypeString(src_type))) {
+          NodeDef* new_transpose = AddNode(*transpose, DataTypeString(src_type),
+                                           /*copy_node=*/true);
           (*new_transpose->mutable_attr())["T"].set_type(src_type);
           new_transpose->set_input(0, cast->input(0));
           node_map_->AddOutput(input->name(), new_transpose->name());
           node_map_->AddOutput(NodeName(new_transpose->input(1)),
                                new_transpose->name());
 
-          NodeDef* new_cast = AddNode(StrCat(cast->name(), "_new"), cast);
+          NodeDef* new_cast =
+              AddNode(*cast, DataTypeString(dst_type), /*copy_node=*/true);
           new_cast->set_input(0, new_transpose->name());
           node_map_->AddOutput(new_transpose->name(), new_cast->name());
 
@@ -735,7 +764,8 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     // multiply can be constant-folded. TODO(jingyue): When the weights aren't
     // constant, this should also help performance a bit and memory usage a lot,
     // since the weights tend to be smaller than the activations.
-    if (weights->op() == "Const") {
+    if (weights->op() == "Const" &&
+        !OptimizedNodeExists(*weights, StrCat("scaled_", conv->name()))) {
       const NodeDef* source = node_map_->GetNode(
           GetTailOfValuePreservingChain(*node, *node_map_, nodes_to_preserve_)
               ->input(0));
@@ -754,7 +784,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
               scale_tensor.tensor_shape().dim_size() == 0) {
             // Create new node `scaled_weights`.
             NodeDef* scaled_weights = AddNode(
-                StrCat(weights->name(), "_scaled_", conv->name()), nullptr);
+                *weights, StrCat("scaled_", conv->name()), /*copy_node=*/false);
             scaled_weights->set_op("Mul");
             scaled_weights->set_device(weights->device());
             (*scaled_weights->mutable_attr())["T"] =
@@ -791,9 +821,8 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
   }
 
   if (node->op() == "Mul" && node->input(0) == node->input(1) &&
-      !OptimizedNodeExists(StrCat(node->name(), "_square"))) {
-    NodeDef* new_square_node =
-        AddNode(strings::StrCat(node->name(), "_square"), node);
+      !OptimizedNodeExists(*node, "square")) {
+    NodeDef* new_square_node = AddNode(*node, "square", /*copy_node=*/true);
     new_square_node->set_op("Square");
     for (int i = 1; i < new_square_node->input_size(); ++i) {
       new_square_node->set_input(i - 1, new_square_node->input(i));
@@ -828,8 +857,8 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
         break;
       }
     }
-    const string mul_node_name = StrCat(node->name(), "_mul");
-    if (all_equal && !OptimizedNodeExists(mul_node_name)) {
+    if (all_equal && !OptimizedNodeExists(*node, "const") &&
+        !OptimizedNodeExists(*node, "mul")) {
       // 1. Create constant node with value N.
       const auto type = GetDataTypeFromAttr(*node, "T");
       Tensor t(type, TensorShape({}));
@@ -840,15 +869,14 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
         return "";
       }
       TensorValue value(&t);
-      NodeDef* new_const_node =
-          AddNode(StrCat(node->name(), "_const"), nullptr);
+      NodeDef* new_const_node = AddNode(*node, "const", /*copy_node=*/false);
       *new_const_node =
           ConstantFolding::CreateNodeDef(new_const_node->name(), value);
       new_const_node->set_device(node->device());
       nodes_to_simplify->PushBack(new_const_node);
 
       // 2. Replace the aggregate node with Mul(Const(N), x).
-      NodeDef* new_mul_node = AddNode(mul_node_name, nullptr);
+      NodeDef* new_mul_node = AddNode(*node, "mul", /*copy_node=*/false);
       new_mul_node->set_op("Mul");
       new_mul_node->set_device(node->device());
       SetDataTypeToAttr(type, "T", new_mul_node);
@@ -868,9 +896,13 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
   // multiplication over addition to hoist common factors out of aggregate nodes
   // where all the inputs are Mul nodes. This pattern occurs frequently in
   // regularization terms for the gradients during training.
-  // TODO(rmlarsen): Check shapes and enable for AddN.
-  if (IsAdd(*node) && NumNonControlInputs(*node) > 1 &&
-      !OptimizedNodeExists(StrCat(node->name(), "_hoist_add"))) {
+  // For example, we can rewrite an expression of the form:
+  //   AddN(Mul(x, y1), Mul(y2, x), Mul(x, y3), ... Mul(x, yn))
+  // to the following:
+  //   Mul(x, AddN(y1, y2, y3, ... yn))
+  if (IsAggregate(*node) && NumNonControlInputs(*node) > 1 &&
+      !OptimizedNodeExists(*node, "hoist_add") &&
+      !OptimizedNodeExists(*node, "hoist_mul")) {
     // Determine the set of common factors if the input nodes are all Mul nodes.
     std::set<string> common_factors;
     for (int i = 0; i < node->input_size(); ++i) {
@@ -899,24 +931,15 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     }
     if (common_factors.size() == 1) {
       const string& common_factor = *common_factors.begin();
-      // In this case we have an expression of the form
-      //   AddN(Mul(x, y1), Mul(y2, x), Mul(x, y3), ... Mul(x, yn))
-      // that can be rewritten as
-      //   Mul(x, AddN(y1, y2, y3, ... yn))
-
-      // 1. Use a copy of the first Mul node for the outer multiplication.
-      NodeDef* new_mul_node = AddNode(StrCat(node->name(), "_hoist_mul"),
-                                      node_map_->GetNode(node->input(0)));
-      NodeDef* new_add_node = AddNode(StrCat(node->name(), "_hoist_add"), node);
-      new_mul_node->set_device(node->device());
-      new_mul_node->set_input(0, common_factor);
-      node_map_->AddOutput(common_factor, new_mul_node->name());
-      new_mul_node->set_input(1, new_add_node->name());
-      node_map_->AddOutput(new_add_node->name(), new_mul_node->name());
-
-      // 2. Hoist non-shared factors up into the new AddN node.
-      nodes_to_simplify->PushBack(new_add_node);
-      for (int i = 0; i < node->input_size(); ++i) {
+
+      // Gather up the non-shared factors (the y's in the example).
+      // Unless the aggregation is Add, we have to make sure that all the y's
+      // have the same shape since the other aggregation ops do not support
+      // broadcasting.
+      std::vector<string> unique_factors;
+      unique_factors.reserve(node->input_size());
+      bool shapes_match = true;
+      for (int i = 0; i < node->input_size() && shapes_match; ++i) {
         const string& input = node->input(i);
         if (IsControlInput(input)) {
           break;
@@ -924,22 +947,47 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
         const NodeDef* mul_node = node_map_->GetNode(input);
         const int unique_factor_index =
             mul_node->input(0) == common_factor ? 1 : 0;
-        const string unique_factor = mul_node->input(unique_factor_index);
-        new_add_node->set_input(i, unique_factor);
+        unique_factors.push_back(mul_node->input(unique_factor_index));
+        if (i > 0 && !IsAdd(*node)) {
+          shapes_match = ShapesEqual(unique_factors.front(),
+                                     unique_factors.back(), *node_map_);
+        }
       }
 
-      // 4. Add frame dependencies that the original node might have had.
-      AddFrameControlDeps(node, {new_add_node, new_mul_node}, common_factor,
-                          {new_add_node});
+      if (shapes_match) {
+        // 1. Use a copy of the first Mul node for the outer multiplication.
+        NodeDef* new_mul_node = AddNode(OptimizedNodeName(*node, "hoist_mul"),
+                                        node_map_->GetNode(node->input(0)));
+        NodeDef* new_add_node = AddNode(*node, "hoist_add", /*copy_node=*/true);
+        new_mul_node->set_device(node->device());
+        new_mul_node->set_input(0, common_factor);
+        node_map_->AddOutput(common_factor, new_mul_node->name());
+        new_mul_node->set_input(1, new_add_node->name());
+        node_map_->AddOutput(new_add_node->name(), new_mul_node->name());
+
+        // 2. Hoist non-shared factors up into the new AddN node.
+        nodes_to_simplify->PushBack(new_add_node);
+        for (int i = 0; i < node->input_size(); ++i) {
+          const string& input = node->input(i);
+          if (IsControlInput(input)) {
+            break;
+          }
+          new_add_node->set_input(i, unique_factors[i]);
+        }
 
-      return new_mul_node->name();
+        // 3. Add frame dependencies that the original node might have had.
+        AddFrameControlDeps(node, {new_add_node, new_mul_node}, common_factor,
+                            {new_add_node});
+
+        return new_mul_node->name();
+      }
     }
   }
 
   // Fold Transpose into matrix multiplication.
   if ((node->op() == "MatMul" || node->op() == "SparseMatMul" ||
        node->op() == "BatchMatMul") &&
-      !OptimizedNodeExists(StrCat(node->name(), "_fused"))) {
+      !OptimizedNodeExists(*node, "fused")) {
     const NodeDef* a = node_map_->GetNode(node->input(0));
     const NodeDef* b = node_map_->GetNode(node->input(1));
     bool is_complex = false;
@@ -957,7 +1005,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     const bool b_is_foldable = foldable_transpose_ops.count(b->op()) > 0 &&
                                IsInnerMatrixTransposeNode(*b, node_map_.get());
     if (a_is_foldable || b_is_foldable) {
-      NodeDef* new_op = AddNode(StrCat(node->name(), "_fused"), node);
+      NodeDef* new_op = AddNode(*node, "fused", /*copy_node=*/true);
       if (a_is_foldable) {
         const string attr_a =
             node->op() == "BatchMatMul" ? "adj_x" : "transpose_a";
@@ -982,7 +1030,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
   // Fold Conj into Transpose or ConjugateTranspose.
   if ((node->op() == "Conj" || node->op() == "Transpose" ||
        node->op() == "ConjugateTranspose") &&
-      !OptimizedNodeExists(StrCat(node->name(), "_fused"))) {
+      !OptimizedNodeExists(*node, "fused")) {
     const NodeDef* input = node_map_->GetNode(node->input(0));
     const NodeDef* transpose_op = node->op() == "Conj" ? input : node;
     const NodeDef* conj_op = node->op() == "Conj" ? node : input;
@@ -990,7 +1038,8 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     if ((transpose_op->op() == "Transpose" ||
          transpose_op->op() == "ConjugateTranspose") &&
         conj_op->op() == "Conj") {
-      NodeDef* new_op = AddNode(StrCat(node->name(), "_fused"), transpose_op);
+      NodeDef* new_op =
+          AddNode(OptimizedNodeName(*node, "fused"), transpose_op);
       // Flip the type of transpose op to absorb the conjugation.
       new_op->set_op(transpose_op->op() == "Transpose" ? "ConjugateTranspose"
                                                        : "Transpose");
@@ -1064,13 +1113,10 @@ Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
   int num_frames;
   TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph_, *node_map_,
                                                &frame_map_, &num_frames));
-  if (opt_level_ == RewriterConfig::AGGRESSIVE) {
-    graph_properties_.reset(new GraphProperties(item));
-    // Shapes are only needed in aggressive mode.
-    TF_RETURN_IF_ERROR(graph_properties_->InferStatically());
-    TF_RETURN_IF_ERROR(
-        graph_properties_->AnnotateOutputShapes(optimized_graph_));
-  }
+  // Shapes are only needed in aggressive mode.
+  graph_properties_.reset(new GraphProperties(item));
+  TF_RETURN_IF_ERROR(graph_properties_->InferStatically(false));
+  TF_RETURN_IF_ERROR(graph_properties_->AnnotateOutputShapes(optimized_graph_));
 
   // Perform the optimizations.
   DedupComputations();
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index ec269792386189e5a590a99af020803810f36b1a..afd538db408aa859a108e08b2de9efad635d515c 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -48,7 +48,13 @@ class ArithmeticOptimizer : public GraphOptimizer {
  private:
   // Returns true is a node with given name and the optimizer prefix already
   // exists.
-  bool OptimizedNodeExists(const string& name);
+  string OptimizedNodeName(const NodeDef& node, StringPiece suffix) const;
+  bool OptimizedNodeExists(const NodeDef& node, StringPiece suffix) const;
+
+  // Creates a new node in the graph, with name equal to that of node, prefixed
+  // with "ArithmeticOptimizer/" and the given suffix. Also updates node_map_,
+  // and optionally copies node into the new node if copy_node is true.
+  NodeDef* AddNode(const NodeDef& node, StringPiece suffix, bool copy_node);
 
   // Creates a new node in the graph, prefixed with "ArithmeticOptimizer/",
   // updates node_map_, and optionally copies *node_to_copy into the new
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index e8a18ff9d9f5db8593c725e382229101e389848f..2a82b250586783759608db75bf9e383f4b0322cb 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -32,6 +32,21 @@ string OptimizedName(const string& name) {
   return AddPrefixToNodeName(name, kArithmeticOptimizer);
 }
 
+void VerifyGraphsMatch(const GraphDef& original_graph,
+                       const GraphDef& optimized_graph, int line) {
+  EXPECT_EQ(original_graph.node_size(), optimized_graph.node_size()) << line;
+  for (int i = 0; i < original_graph.node_size(); ++i) {
+    const NodeDef& original = original_graph.node(i);
+    const NodeDef& optimized = optimized_graph.node(i);
+    EXPECT_EQ(original.name(), optimized.name()) << line;
+    EXPECT_EQ(original.op(), optimized.op()) << line;
+    EXPECT_EQ(original.input_size(), optimized.input_size()) << line;
+    for (int j = 0; j < original.input_size(); ++j) {
+      EXPECT_EQ(original.input(j), optimized.input(j)) << line;
+    }
+  }
+}
+
 class ArithmeticOptimizerTest : public ::testing::Test {};
 
 TEST_F(ArithmeticOptimizerTest, NoOp) {
@@ -44,18 +59,7 @@ TEST_F(ArithmeticOptimizerTest, NoOp) {
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
-
-  EXPECT_EQ(item.graph.node_size(), output.node_size());
-  for (int i = 0; i < item.graph.node_size(); ++i) {
-    const NodeDef& original = item.graph.node(i);
-    const NodeDef& optimized = output.node(i);
-    EXPECT_EQ(original.name(), optimized.name());
-    EXPECT_EQ(original.op(), optimized.op());
-    EXPECT_EQ(original.input_size(), optimized.input_size());
-    for (int j = 0; j < original.input_size(); ++j) {
-      EXPECT_EQ(original.input(j), optimized.input(j));
-    }
-  }
+  VerifyGraphsMatch(item.graph, output, __LINE__);
 }
 
 TEST_F(ArithmeticOptimizerTest, OpDedupping) {
@@ -65,6 +69,7 @@ TEST_F(ArithmeticOptimizerTest, OpDedupping) {
   Output div = ops::Div(s.WithOpName("div"), c1, c2);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"div"};
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -98,6 +103,7 @@ TEST_F(ArithmeticOptimizerTest, OpDeduppingAssertAndCheckNumerics) {
                         check1, check2);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"div"};
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -126,6 +132,7 @@ TEST_F(ArithmeticOptimizerTest, OpDedupCommutative) {
   Output div1 = ops::Div(s.WithOpName("div1"), mul1, mul2);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"div"};
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -398,39 +405,51 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
 }
 
 TEST_F(ArithmeticOptimizerTest, HoistFactor) {
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
-  Output y1 = ops::Const(s.WithOpName("y1"), {3.0f, 4.0f}, {1, 2});
-  Output y2 = ops::Const(s.WithOpName("y2"), {5.0f, 6.0f}, {1, 2});
-  Output mul1 = ops::Mul(s.WithOpName("mul1"), x, y1);
-  Output mul2 = ops::Mul(s.WithOpName("mul2"), y2, x);
-  Output add = ops::Add(s.WithOpName("add"), mul1, mul2);
-  Output id = ops::Identity(s.WithOpName("id"), add);
-
-  GrapplerItem item;
-  TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
-  ArithmeticOptimizer optimizer;
-  GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-
-  EXPECT_EQ(9, output.node_size());
-  const NodeDef& new_add = output.node(8);
-  EXPECT_EQ(OptimizedName("add_hoist_add"), new_add.name());
-  EXPECT_EQ("y1", new_add.input(0));
-  EXPECT_EQ("y2", new_add.input(1));
-  const NodeDef& new_mul = output.node(7);
-  EXPECT_EQ(OptimizedName("add_hoist_mul"), new_mul.name());
-  EXPECT_EQ("x", new_mul.input(0));
-  EXPECT_EQ(OptimizedName("add_hoist_add"), new_mul.input(1));
-  const NodeDef& new_id = output.node(6);
-  EXPECT_EQ("id", new_id.name());
-  EXPECT_EQ(OptimizedName("add_hoist_mul"), new_id.input(0));
+  for (bool matching_shapes : {true, false}) {
+    for (bool use_addn : {true, false}) {
+      tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+      Output x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
+      Output y1 = ops::Const(s.WithOpName("y1"), {3.0f, 4.0f}, {1, 2});
+      Output y2 = matching_shapes
+                      ? ops::Const(s.WithOpName("y2"), {5.0f, 6.0f}, {1, 2})
+                      : ops::Const(s.WithOpName("y2"), {5.0f}, {1, 1});
+      Output mul1 = ops::Mul(s.WithOpName("mul1"), x, y1);
+      Output mul2 = ops::Mul(s.WithOpName("mul2"), y2, x);
+      Output id =
+          use_addn ? ops::Identity(s.WithOpName("id"),
+                                   ops::AddN(s.WithOpName("add"), {mul1, mul2}))
+                   : ops::Identity(s.WithOpName("id"),
+                                   ops::Add(s.WithOpName("add"), mul1, mul2));
+
+      GrapplerItem item;
+      TF_CHECK_OK(s.ToGraphDef(&item.graph));
+      ArithmeticOptimizer optimizer;
+      GraphDef output;
+      Status status = optimizer.Optimize(nullptr, item, &output);
+      TF_EXPECT_OK(status);
+      // Run the optimizer twice to make sure the rewrite is idempotent.
+      item.graph.Swap(&output);
+      status = optimizer.Optimize(nullptr, item, &output);
+      TF_EXPECT_OK(status);
+
+      if (use_addn && !matching_shapes) {
+        VerifyGraphsMatch(item.graph, output, __LINE__);
+      } else {
+        EXPECT_EQ(9, output.node_size());
+        const NodeDef& new_add = output.node(8);
+        EXPECT_EQ(OptimizedName("add_hoist_add"), new_add.name());
+        EXPECT_EQ("y1", new_add.input(0));
+        EXPECT_EQ("y2", new_add.input(1));
+        const NodeDef& new_mul = output.node(7);
+        EXPECT_EQ(OptimizedName("add_hoist_mul"), new_mul.name());
+        EXPECT_EQ("x", new_mul.input(0));
+        EXPECT_EQ(OptimizedName("add_hoist_add"), new_mul.input(1));
+        const NodeDef& new_id = output.node(6);
+        EXPECT_EQ("id", new_id.name());
+        EXPECT_EQ(OptimizedName("add_hoist_mul"), new_id.input(0));
+      }
+    }
+  }
 }
 
 TEST_F(ArithmeticOptimizerTest, FuseConjAndTranspose) {
@@ -606,10 +625,9 @@ TEST_F(ArithmeticOptimizerTest, IdentityReshape) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   GraphDef output;
-  TF_EXPECT_OK(ArithmeticOptimizer(RewriterConfig::AGGRESSIVE)
-                   .Optimize(nullptr, item, &output));
+  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
-  item.graph = output;
+  item.graph.Swap(&output);
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   EXPECT_EQ(0, std::count_if(
@@ -631,10 +649,9 @@ TEST_F(ArithmeticOptimizerTest, NotIdentityReshape) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   GraphDef output;
-  TF_EXPECT_OK(ArithmeticOptimizer(RewriterConfig::AGGRESSIVE)
-                   .Optimize(nullptr, item, &output));
+  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
-  item.graph = output;
+  item.graph.Swap(&output);
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   EXPECT_EQ(1, std::count_if(
@@ -654,10 +671,9 @@ TEST_F(ArithmeticOptimizerTest, NotIdentityReshapeTooManyUnknownDimSizes) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   GraphDef output;
-  TF_EXPECT_OK(ArithmeticOptimizer(RewriterConfig::AGGRESSIVE)
-                   .Optimize(nullptr, item, &output));
+  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
-  item.graph = output;
+  item.graph.Swap(&output);
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   EXPECT_EQ(1, std::count_if(
@@ -690,7 +706,7 @@ TEST_F(ArithmeticOptimizerTest, CombineReshapes) {
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
-  item.graph = output;
+  item.graph.Swap(&output);
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   EXPECT_EQ(1, std::count_if(
@@ -714,7 +730,7 @@ TEST_F(ArithmeticOptimizerTest, ReorderTransposeCast) {
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
-  item.graph = output;
+  item.graph.Swap(&output);
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   const NodeDef* transpose_node = nullptr;
@@ -750,7 +766,7 @@ TEST_F(ArithmeticOptimizerTest, NoReorderTransposeCast) {
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
-  item.graph = output;
+  item.graph.Swap(&output);
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   int num_transposes = 0;
@@ -784,7 +800,7 @@ TEST_F(ArithmeticOptimizerTest, RemoveInverseTransposes) {
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
-  item.graph = output;
+  item.graph.Swap(&output);
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   std::set<string> nodes_after_optimization;
@@ -817,7 +833,7 @@ TEST_F(ArithmeticOptimizerTest, RemoveInverseTransposesMultipleOutputs) {
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
-  item.graph = output;
+  item.graph.Swap(&output);
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   for (const NodeDef& node : output.node()) {
@@ -844,7 +860,7 @@ TEST_F(ArithmeticOptimizerTest, RemoveTransposesWithControlDependency) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
-  item.graph = output;
+  item.graph.Swap(&output);
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   NodeMap node_map(&output);
@@ -873,7 +889,7 @@ TEST_F(ArithmeticOptimizerTest, NotRemoveTransposes) {
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
-  item.graph = output;
+  item.graph.Swap(&output);
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   EXPECT_EQ(6, output.node_size());
@@ -904,7 +920,7 @@ TEST_F(ArithmeticOptimizerTest, FoldMulToTransposeConv) {
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
-  item.graph = output;
+  item.graph.Swap(&output);
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   NodeMap node_map(&output);
@@ -946,7 +962,7 @@ TEST_F(ArithmeticOptimizerTest, NotFoldMulAcrossPreservedTranspose) {
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
-  item.graph = output;
+  item.graph.Swap(&output);
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   NodeMap node_map(&output);
@@ -976,7 +992,7 @@ TEST_F(ArithmeticOptimizerTest, FoldMulToConv) {
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
-  item.graph = output;
+  item.graph.Swap(&output);
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   NodeMap node_map(&output);
@@ -1015,11 +1031,15 @@ TEST_F(ArithmeticOptimizerTest, OptimizeCastMulTransposeConv) {
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
-  item.graph = output;
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
+
+  item.graph.Swap(&output);
   TF_EXPECT_OK(
       ConstantFolding(/*cpu_device=*/nullptr).Optimize(nullptr, item, &output));
 
-  item.graph = output;
+  item.graph.Swap(&output);
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   NodeMap node_map(&output);
@@ -1027,7 +1047,7 @@ TEST_F(ArithmeticOptimizerTest, OptimizeCastMulTransposeConv) {
   const NodeDef* transpose_node =
       CHECK_NOTNULL(node_map.GetNode(OptimizedName("Transpose_uint8")));
   const NodeDef* cast_node =
-      CHECK_NOTNULL(node_map.GetNode(OptimizedName("Cast_new")));
+      CHECK_NOTNULL(node_map.GetNode(OptimizedName("Cast_float")));
   const NodeDef* weights_node =
       CHECK_NOTNULL(node_map.GetNode(OptimizedName("weights_scaled_Conv2D")));
   const NodeDef* conv_node = CHECK_NOTNULL(node_map.GetNode("Conv2D"));
@@ -1064,11 +1084,11 @@ TEST_F(ArithmeticOptimizerTest, OptimizeMultipleMulTransposeConv) {
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
-  item.graph = output;
+  item.graph.Swap(&output);
   TF_EXPECT_OK(
       ConstantFolding(/*cpu_device=*/nullptr).Optimize(nullptr, item, &output));
 
-  item.graph = output;
+  item.graph.Swap(&output);
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   NodeMap node_map(&output);
@@ -1097,7 +1117,7 @@ TEST_F(ArithmeticOptimizerTest, CombineBitcasts) {
 
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
-  item.graph = output;
+  item.graph.Swap(&output);
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   EXPECT_EQ(1, std::count_if(
@@ -1117,7 +1137,7 @@ TEST_F(ArithmeticOptimizerTest, CombineAndRemoveBitcasts) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
-  item.graph = output;
+  item.graph.Swap(&output);
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   EXPECT_EQ(0, std::count_if(
@@ -1136,7 +1156,7 @@ TEST_F(ArithmeticOptimizerTest, RemoveRedundantCast) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
-  item.graph = output;
+  item.graph.Swap(&output);
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   EXPECT_EQ(0, std::count_if(
diff --git a/tensorflow/core/grappler/optimizers/auto_parallel.h b/tensorflow/core/grappler/optimizers/auto_parallel.h
index c5d2d47782f0d5515e65e1f99b212315dcc13c0e..8d1098d87755c1257dfebe016a3baf86bfece677 100644
--- a/tensorflow/core/grappler/optimizers/auto_parallel.h
+++ b/tensorflow/core/grappler/optimizers/auto_parallel.h
@@ -16,8 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
 #define TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
 
-#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/framework/variable.pb.h"
+#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index c77b2badf429d8674f60635b172f96891ed84961..1e6f11c8aa06b1115c7b74b25120a9d7b7b4a76c 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -30,13 +30,16 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/tensor_coding.h"
 #include "tensorflow/core/public/version.h"
 #include "tensorflow/core/util/bcast.h"
+#include "tensorflow/core/util/saved_tensor_slice_util.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -95,7 +98,74 @@ class DeviceSimple : public DeviceBase {
   std::unique_ptr<Eigen::ThreadPoolDevice> eigen_device_;
 };
 
+template <typename T>
+bool AllValuesAre(const TensorProto& tensor, const T& value) {
+  // TensorProto represents the content of the tensor in either <type>_val or
+  // tensor_content.
+  typename checkpoint::SaveTypeTraits<T>::RepeatedField* tensor_values =
+      checkpoint::MutableTensorProtoData<T>(const_cast<TensorProto*>(&tensor));
+  if (!tensor_values->empty()) {
+    for (const T& tensor_value : *tensor_values) {
+      if (tensor_value != value) {
+        return false;
+      }
+    }
+    return true;
+  }
+  const auto tensor_content_size = tensor.tensor_content().size();
+  if (tensor_content_size > 0) {
+    CHECK_EQ(0, tensor_content_size % sizeof(T));
+    std::vector<T> raw_values(tensor_content_size / sizeof(T));
+    port::CopyToArray(tensor.tensor_content(),
+                      reinterpret_cast<char*>(raw_values.data()));
+    for (int i = 0; i < tensor_content_size / sizeof(T); ++i) {
+      if (raw_values[i] != value) {
+        return false;
+      }
+    }
+    return true;
+  }
+  return false;
+}
+
+// Add new_input as a control input to node if it does not already depend on it.
+// TODO(rmlarsen): Move the following two utility functions to utils.{h,cc} and
+// clean up code that should be using them.
+bool MaybeAddControlInput(const string& new_input, NodeDef* node,
+                          GraphDef* graph, NodeMap* node_map) {
+  bool already_exists = false;
+  for (const string& input : node->input()) {
+    if (input == new_input || AsControlDependency(input) == new_input) {
+      already_exists = true;
+      break;
+    }
+  }
+  if (!already_exists) {
+    const string ctrl_dep =
+        ConstantFolding::AddControlDependency(new_input, graph, node_map);
+    node->add_input(ctrl_dep);
+    node_map->AddOutput(NodeName(new_input), node->name());
+  }
+  return !already_exists;
+}
+
+// Remove old_input as a control input to node.
+bool MaybeRemoveControlInput(const string& old_input, NodeDef* node,
+                             GraphDef* graph, NodeMap* node_map) {
+  for (int i = 0; i < node->input_size(); ++i) {
+    const string& input = node->input(i);
+    if (IsControlInput(input) && AsControlDependency(old_input) == input) {
+      node->mutable_input()->SwapElements(i, node->input_size() - 1);
+      node->mutable_input()->RemoveLast();
+      node_map->RemoveOutput(NodeName(old_input), node->name());
+      return true;
+    }
+  }
+  return false;
+}
+
 }  // namespace
+
 ConstantFolding::ConstantFolding(RewriterConfig::Toggle opt_level,
                                  DeviceBase* cpu_device)
     : opt_level_(opt_level), cpu_device_(cpu_device) {
@@ -109,6 +179,9 @@ ConstantFolding::ConstantFolding(DeviceBase* cpu_device)
 string ConstantFolding::AddControlDependency(const string& input_name,
                                              GraphDef* graph,
                                              NodeMap* node_map) {
+  if (IsControlInput(input_name)) {
+    return input_name;
+  }
   const NodeDef* node = node_map->GetNode(input_name);
   if (!IsSwitch(*node)) {
     return AsControlDependency(*node);
@@ -120,10 +193,10 @@ string ConstantFolding::AddControlDependency(const string& input_name,
     // We start by looking for an identity node connected to the output of the
     // switch node, and use it to anchor the control dependency.
     auto outputs = node_map->GetOutputs(node->name());
-    for (const NodeDef* node : outputs) {
-      if (IsIdentity(*node)) {
+    for (const NodeDef* output : outputs) {
+      if (IsIdentity(*output)) {
         if (IsSameInput(node->input(0), input_name)) {
-          return AsControlDependency(*node);
+          return AsControlDependency(*output);
         }
       }
     }
@@ -135,15 +208,18 @@ string ConstantFolding::AddControlDependency(const string& input_name,
     ctrl_dep_name = AddPrefixToNodeName(ctrl_dep_name, kConstantFoldingCtrl);
     const DataType output_type = node->attr().at("T").type();
 
-    NodeDef* added_node = graph->add_node();
-    added_node->set_name(ctrl_dep_name);
-    added_node->set_op("Identity");
-    added_node->set_device(node->device());
+    NodeDef* added_node = node_map->GetNode(ctrl_dep_name);
+    if (added_node == nullptr) {
+      added_node = graph->add_node();
+      added_node->set_name(ctrl_dep_name);
+      added_node->set_op("Identity");
+      added_node->set_device(node->device());
 
-    (*added_node->mutable_attr())["T"].set_type(output_type);
-    *added_node->add_input() = input_name;
-    node_map->AddNode(added_node->name(), added_node);
-    node_map->AddOutput(node->name(), added_node->name());
+      (*added_node->mutable_attr())["T"].set_type(output_type);
+      *added_node->add_input() = input_name;
+      node_map->AddNode(added_node->name(), added_node);
+      node_map->AddOutput(node->name(), added_node->name());
+    }
     return AsControlDependency(*added_node);
   }
 }
@@ -190,23 +266,45 @@ Status ConvertShapeToConstant(const string& op, const DataType& type,
   return Status::OK();
 }
 
-Status ConstantFolding::MaterializeShapes(const GrapplerItem& item,
-                                          const GraphProperties& properties) {
+// TODO(rmlarsen): Perhaps we should move this to the GraphOptimizer base class.
+bool ConstantFolding::OptimizedNodeExists(const NodeDef& node,
+                                          StringPiece suffix) const {
+  return node_map_->NodeExists(OptimizedNodeName(node, suffix));
+}
+
+string ConstantFolding::OptimizedNodeName(const NodeDef& node,
+                                          StringPiece suffix) const {
+  return AddPrefixToNodeName(strings::StrCat(node.name(), suffix),
+                             kConstantFoldingConst);
+}
+
+bool ConstantFolding::IsReallyConstant(const NodeDef& node) const {
+  if (!IsConstant(node)) {
+    return false;
+  }
+  // If the node is fed it's not constant anymore.
+  return feed_nodes_.find(node.name()) == feed_nodes_.end();
+}
+
+Status ConstantFolding::MaterializeShapes(const GraphProperties& properties) {
   // We may add some nodes to the graph to encode control dependencies: there is
   // no need to process these, so only iterate over the nodes of the input
   // graph.
-  const int node_count = graph_.node_size();
+  const int node_count = graph_->node_size();
   for (int i = 0; i < node_count; ++i) {
-    NodeDef& node = *graph_.mutable_node(i);
+    NodeDef& node = *graph_->mutable_node(i);
     const string op = node.op();
     if (op != "Shape" && op != "Size" && op != "Rank" && op != "ShapeN") {
       continue;
     }
 
-    std::vector<OpInfo::TensorProperties> output =
+    const std::vector<OpInfo::TensorProperties>& output =
         properties.GetOutputProperties(node.name());
-    std::vector<OpInfo::TensorProperties> input =
+    const std::vector<OpInfo::TensorProperties>& input =
         properties.GetInputProperties(node.name());
+    if (input.empty() || output.empty()) {
+      continue;
+    }
     if (op == "Shape" || op == "Size" || op == "Rank") {
       CHECK_EQ(1, output.size());
       CHECK_EQ(1, input.size());
@@ -241,7 +339,7 @@ Status ConstantFolding::MaterializeShapes(const GrapplerItem& item,
           // cases where the shape/rank/size would have been run in
           // the original graph. Additional inputs are extra control
           string ctrl_dep =
-              AddControlDependency(node.input(0), &graph_, node_map_.get());
+              AddControlDependency(node.input(0), graph_, node_map_.get());
           node.set_input(0, ctrl_dep);
           node_map_->AddOutput(NodeName(ctrl_dep), node.name());
         } else {
@@ -252,11 +350,10 @@ Status ConstantFolding::MaterializeShapes(const GrapplerItem& item,
               string node_name = ParseNodeName(output->input(k), &port);
               if (node_name == node.name() && port == j) {
                 // Create a const node as ShapeN's output if not already.
-                string const_name =
-                    AddPrefixToNodeName(strings::StrCat(node.name(), "-", j),
-                                        kConstantFoldingConst);
+                const string const_name =
+                    OptimizedNodeName(node, strings::StrCat("-matshapes-", j));
                 if (node_map_->GetNode(const_name) == nullptr) {
-                  NodeDef* added_node = graph_.add_node();
+                  NodeDef* added_node = graph_->add_node();
                   added_node->set_name(const_name);
                   added_node->set_op("Const");
                   added_node->set_device(node.device());
@@ -267,7 +364,7 @@ Status ConstantFolding::MaterializeShapes(const GrapplerItem& item,
                   // We add a control dependency to the original ShapeN node,
                   // so that the node will only be run if all inputs of the
                   // original ShapeN node are run.
-                  string ctrl_dep = AddControlDependency(node.name(), &graph_,
+                  string ctrl_dep = AddControlDependency(node.name(), graph_,
                                                          node_map_.get());
                   *added_node->add_input() = ctrl_dep;
                   node_map_->AddOutput(NodeName(ctrl_dep), added_node->name());
@@ -285,6 +382,7 @@ Status ConstantFolding::MaterializeShapes(const GrapplerItem& item,
   return Status::OK();
 }
 
+namespace {
 bool ShapesEqual(const TensorShapeProto& shape1,
                  const TensorShapeProto& shape2) {
   if (shape1.unknown_rank() || shape2.unknown_rank()) {
@@ -297,11 +395,13 @@ bool ShapesEqual(const TensorShapeProto& shape1,
     if (shape1.dim(i).size() != shape2.dim(i).size()) {
       return false;
     }
+    if (shape1.dim(i).size() == -1 || shape2.dim(i).size() == -1) {
+      return false;
+    }
   }
   return true;
 }
 
-namespace {
 bool ExtractShape(const NodeDef& shape_node, const GraphProperties& properties,
                   BCast::Vec* shape, int64* min_id) {
   if (shape_node.op() == "Shape") {
@@ -344,11 +444,12 @@ Status ConstantFolding::MaterializeBroadcastGradientArgs(
   const NodeDef* shape_node1 = node_map_->GetNode(node.input(0));
   const NodeDef* shape_node2 = node_map_->GetNode(node.input(1));
   if (shape_node1 == nullptr ||
-      (shape_node1->op() != "Shape" && shape_node1->op() != "Const") ||
+      (shape_node1->op() != "Shape" && !IsReallyConstant(*shape_node1)) ||
       shape_node2 == nullptr ||
-      (shape_node2->op() != "Shape" && shape_node2->op() != "Const")) {
+      (shape_node2->op() != "Shape" && !IsReallyConstant(*shape_node2))) {
     return Status::OK();
   }
+
   int64 min_id = 0;
   BCast::Vec shape1;
   if (!ExtractShape(*shape_node1, properties, &shape1, &min_id)) {
@@ -371,10 +472,42 @@ Status ConstantFolding::MaterializeBroadcastGradientArgs(
       id = --min_id;
     }
   }
+
+  // Beware: the reduction dimensions computed by the BCast class are valid iff
+  // we assume that two distinct symbolic dimensions can't be equal and a
+  // symbolic dimension can't be equal to 1. This is often but not always true,
+  // so to make this optimization safe we filter out these cases.
+  const int common_dims = std::min(shape1.size(), shape2.size());
+  for (int i = 0; i < common_dims; ++i) {
+    if (shape1[i] >= 0 && shape2[i] >= 0) {
+      continue;
+    }
+    if (shape1[i] != shape2[i]) {
+      // We're either dealing with 2 different symbolic dimensions or a symbolic
+      // and a know dimensions. We can't be sure whether both are equal or not,
+      // so we can't be sure whether we'll be broadcasting or not.
+      return Status::OK();
+    }
+  }
+  // These extra dims could be equal to 1, in which case there is no
+  // broadcasting. It could also be greater than 1, in which case there would
+  // be broadcasting. Since we don't know, we'll just punt.
+  for (int i = common_dims; i < shape1.size(); ++i) {
+    if (shape1[i] < 0) {
+      return Status::OK();
+    }
+  }
+  for (int i = common_dims; i < shape2.size(); ++i) {
+    if (shape2[i] < 0) {
+      return Status::OK();
+    }
+  }
+
   BCast bcast(shape1, shape2);
   if (!bcast.IsValid()) {
     return Status::OK();
   }
+
   BCast::Vec reduce_dims[2];
   reduce_dims[0] = bcast.grad_x_reduce_idx();
   reduce_dims[1] = bcast.grad_y_reduce_idx();
@@ -382,31 +515,32 @@ Status ConstantFolding::MaterializeBroadcastGradientArgs(
   const DataType type = node.attr().at("T").type();
   NodeDef* out[2];
   for (int j = 0; j < 2; ++j) {
-    if (!reduce_dims[j].empty()) {
-      // This is the case when a tensor dimension of 1 is matched against an
-      // unknown dimension. The unknown dimension could also be equal to 1, in
-      // which case there would be no reduction.
-      out[j] = nullptr;
-    } else {
-      string const_name = AddPrefixToNodeName(
-          strings::StrCat(node.name(), "-", j), kConstantFoldingConst);
-      out[j] = node_map_->GetNode(const_name);
-      if (out[j] == nullptr) {
-        out[j] = graph_.add_node();
-        Tensor value(type, TensorShape({0}));
-        *out[j] = CreateNodeDef(const_name, TensorValue(&value));
-        out[j]->set_device(node.device());
-        node_map_->AddNode(const_name, out[j]);
-        string ctrl_dep =
-            AddControlDependency(node.name(), &graph_, node_map_.get());
-        *out[j]->add_input() = ctrl_dep;
-        node_map_->AddOutput(NodeName(ctrl_dep), const_name);
+    int reduction_indices = reduce_dims[j].size();
+    Tensor value(type, TensorShape({reduction_indices}));
+    for (int i = 0; i < reduction_indices; ++i) {
+      if (type == DT_INT32) {
+        value.vec<int32>()(i) = reduce_dims[j][i];
+      } else {
+        value.vec<int64>()(i) = reduce_dims[j][i];
       }
     }
+    string const_name =
+        OptimizedNodeName(node, strings::StrCat("-bcastargs-", j));
+    out[j] = node_map_->GetNode(const_name);
+    if (out[j] == nullptr) {
+      out[j] = graph_->add_node();
+      *out[j] = CreateNodeDef(const_name, TensorValue(&value));
+      out[j]->set_device(node.device());
+      node_map_->AddNode(const_name, out[j]);
+      string ctrl_dep =
+          AddControlDependency(node.name(), graph_, node_map_.get());
+      *out[j]->add_input() = ctrl_dep;
+      node_map_->AddOutput(NodeName(ctrl_dep), const_name);
+    }
   }
 
-  auto outputs = node_map_->GetOutputs(node.name());
-  for (const auto& output : outputs) {
+  const std::set<NodeDef*> outputs = node_map_->GetOutputs(node.name());
+  for (NodeDef* output : outputs) {
     for (int k = 0; k < output->input_size(); ++k) {
       int port;
       string node_name = ParseNodeName(output->input(k), &port);
@@ -426,13 +560,17 @@ Status ConstantFolding::MaterializeReductionIndices(
     return Status::OK();
   }
   const NodeDef* indices = node_map_->GetNode(node->input(1));
-  if (!indices || IsConstant(*indices)) {
+  if (!indices || IsReallyConstant(*indices)) {
     // The reduction indices are already constant, there's nothing to do.
     return Status::OK();
   }
 
-  const OpInfo::TensorProperties& input_prop =
-      properties.GetInputProperties(node->name())[0];
+  const std::vector<OpInfo::TensorProperties>& input_props =
+      properties.GetInputProperties(node->name());
+  if (input_props.size() != 2) {
+    return Status::OK();
+  }
+  const OpInfo::TensorProperties& input_prop = input_props[0];
   if (input_prop.shape().unknown_rank()) {
     // We can't do anything if we don't know the rank of the input.
     return Status::OK();
@@ -442,17 +580,31 @@ Status ConstantFolding::MaterializeReductionIndices(
     // Unexpected graph, don't try to change it.
     return Status::OK();
   }
-  const OpInfo::TensorProperties& output_prop =
-      properties.GetOutputProperties(node->name())[0];
+  const std::vector<OpInfo::TensorProperties>& output_props =
+      properties.GetOutputProperties(node->name());
+  if (output_props.size() != 1) {
+    return Status::OK();
+  }
+  const bool keep_dims =
+      node->attr().count("keep_dims") && node->attr().at("keep_dims").b();
+  const OpInfo::TensorProperties& output_prop = output_props[0];
   PartialTensorShape output_shape(output_prop.shape());
   if (output_shape.num_elements() != 1) {
     bool full_reduction = false;
     for (const NodeDef* fanout : node_map_->GetOutputs(node->name())) {
-      if (!IsReshape(*fanout)) {
-        continue;
+      if (!IsReshape(*fanout) && !keep_dims) {
+        // Depending on how it's setup, a full reduction will generate a tensor
+        // of shape [], [1], [1, 1], [1, 1, ...]. If keep_dims isn't true, we
+        // rely on the existence of a reshape node following the reduction to
+        // ensure that the fanout is fed a scalar of the right shape.
+        return Status::OK();
+      }
+      const std::vector<OpInfo::TensorProperties>& reshape_props =
+          properties.GetOutputProperties(fanout->name());
+      if (reshape_props.size() != 1) {
+        return Status::OK();
       }
-      const OpInfo::TensorProperties& reshape_prop =
-          properties.GetOutputProperties(fanout->name())[0];
+      const OpInfo::TensorProperties& reshape_prop = reshape_props[0];
       PartialTensorShape shape(reshape_prop.shape());
       if (shape.num_elements() != 1) {
         return Status::OK();
@@ -465,21 +617,18 @@ Status ConstantFolding::MaterializeReductionIndices(
     }
   }
 
-  const OpInfo::TensorProperties& reduction_prop =
-      properties.GetInputProperties(node->name())[1];
+  const OpInfo::TensorProperties& reduction_prop = input_props[1];
   DataType dtype = reduction_prop.dtype();
   if (dtype != DT_INT32 && dtype != DT_INT64) {
     return Status::OK();
   }
   // We know it's a full reduction. We can generate the set of indices to
   // reduce.
-  string const_name =
-      AddPrefixToNodeName(strings::StrCat(node->name(), "-reduction_indices"),
-                          kConstantFoldingConst);
+  string const_name = OptimizedNodeName(*node, "-reduction_indices");
   if (node_map_->GetNode(const_name)) {
     return Status::OK();
   }
-  NodeDef* reduction_indices = graph_.add_node();
+  NodeDef* reduction_indices = graph_->add_node();
   Tensor value(dtype, TensorShape({rank}));
   for (int i = 0; i < rank; ++i) {
     if (dtype == DT_INT32) {
@@ -491,7 +640,7 @@ Status ConstantFolding::MaterializeReductionIndices(
   *reduction_indices = CreateNodeDef(const_name, TensorValue(&value));
   reduction_indices->set_device(node->device());
   string ctrl_dep =
-      AddControlDependency(node->input(1), &graph_, node_map_.get());
+      AddControlDependency(node->input(1), graph_, node_map_.get());
   *reduction_indices->add_input() = ctrl_dep;
   node_map_->AddNode(const_name, reduction_indices);
   node_map_->AddOutput(NodeName(ctrl_dep), const_name);
@@ -504,10 +653,10 @@ Status ConstantFolding::MaterializeReductionIndices(
 }
 
 Status ConstantFolding::MaterializeConstants(
-    const GrapplerItem& item, const GraphProperties& properties) {
-  const int node_count = graph_.node_size();
+    const GraphProperties& properties) {
+  const int node_count = graph_->node_size();
   for (int i = 0; i < node_count; ++i) {
-    NodeDef& node = *graph_.mutable_node(i);
+    NodeDef& node = *graph_->mutable_node(i);
     const string& op = node.op();
     if (op == "BroadcastGradientArgs") {
       TF_RETURN_IF_ERROR(MaterializeBroadcastGradientArgs(node, properties));
@@ -523,24 +672,23 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
   if (node.input().empty()) {
     return false;
   }
-
   // Skips nodes that must be preserved except whitelisted nodes.
   if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end() &&
       nodes_whitelist_.find(node.name()) == nodes_whitelist_.end()) {
     return false;
   }
-
-  // Skips ops that don't benefit from folding.
-  const string& op = node.op();
-  // Skip constants, they're already folded
-  if (op == "Const") {
+  // Skip control flow nodes, they can't be folded
+  if (ModifiesFrameInfo(node)) {
     return false;
   }
-  // Skip constrol flow nodes, they can't be folded
-  if (op == "Enter" || op == "RefEnter" || op == "Exit" || op == "RefExit" ||
-      op == "NextIteration" || op == "RefNextIteration") {
+  // Skip constants, they're already folded
+  if (IsConstant(node)) {
     return false;
   }
+
+  // Skips ops that don't benefit from folding.
+  const string& op = node.op();
+
   if (op.find("Placeholder") == 0) {
     return false;
   }
@@ -594,7 +742,7 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
     if (!input_node) {
       return false;
     }
-    bool is_const = IsConstant(*input_node);
+    bool is_const = IsReallyConstant(*input_node);
     if (!is_const && !is_merge) {
       return false;
     }
@@ -612,6 +760,37 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
   return true;
 }
 
+namespace {
+
+#define SET_TENSOR_VAL_CASE(DTYPE, TYPE, NAME)     \
+  case DTYPE:                                      \
+    t->add_##NAME##_val(static_cast<TYPE>(value)); \
+    break;
+
+Status CreateConstantTensorAttrValue(DataType type, double value,
+                                     const TensorShapeProto& shape,
+                                     AttrValue* attr_tensor) {
+  TensorProto* t = attr_tensor->mutable_tensor();
+  t->set_dtype(type);
+  *t->mutable_tensor_shape() = shape;
+  switch (type) {
+    SET_TENSOR_VAL_CASE(DT_FLOAT, float, float);
+    SET_TENSOR_VAL_CASE(DT_DOUBLE, double, double);
+    SET_TENSOR_VAL_CASE(DT_INT64, int64, int64);
+    SET_TENSOR_VAL_CASE(DT_INT32, int32, int);
+    SET_TENSOR_VAL_CASE(DT_INT16, int32, int);
+    SET_TENSOR_VAL_CASE(DT_INT8, int32, int);
+    SET_TENSOR_VAL_CASE(DT_UINT8, int32, int);
+    SET_TENSOR_VAL_CASE(DT_BOOL, bool, bool);
+    default:
+      return errors::InvalidArgument("Unsupported type: ", type);
+  }
+  return Status::OK();
+}
+
+#undef SET_TENSOR_CAL_CASE
+}  // namespace
+
 // static
 NodeDef ConstantFolding::CreateNodeDef(const string& name,
                                        const TensorValue& tensor) {
@@ -629,20 +808,26 @@ NodeDef ConstantFolding::CreateNodeDef(const string& name,
   // Use the packed representation whenever possible to avoid generating large
   // graphdefs. Moreover, avoid repeating the last values if they're equal.
   if (tensor->NumElements() > 4) {
-#define POPULATE_TENSOR_PROTO(tensor, t, TYPE, NAME)         \
-  optimized = true;                                          \
-  TYPE last = tensor->flat<TYPE>()(0);                       \
-  int last_index = 0;                                        \
-  for (int i = 0; i < tensor->NumElements(); ++i) {          \
-    TYPE cur = tensor->flat<TYPE>()(i);                      \
-    t->add_##NAME##_val(cur);                                \
-    if (cur != last) {                                       \
-      last = cur;                                            \
-      last_index = i;                                        \
-    }                                                        \
-  }                                                          \
-  /* Remove all identical trailing values to save memory. */ \
-  t->mutable_##NAME##_val()->Truncate(last_index + 1);
+#define POPULATE_TENSOR_PROTO(tensor, t, TYPE, NAME)                \
+  const TYPE* val_ptr = tensor->flat<TYPE>().data();                \
+  TYPE last = *val_ptr;                                             \
+  int64 last_index = 0;                                             \
+  for (int64 i = 0; i < tensor->NumElements(); ++i) {               \
+    TYPE cur = *val_ptr++;                                          \
+    if (cur != last) {                                              \
+      last = cur;                                                   \
+      last_index = i;                                               \
+    }                                                               \
+  }                                                                 \
+  if (last_index < kint32max) {                                     \
+    optimized = true;                                               \
+    t->mutable_##NAME##_val()->Reserve(last_index + 1);             \
+    t->mutable_##NAME##_val()->AddNAlreadyReserved(last_index + 1); \
+    val_ptr = tensor->flat<TYPE>().data();                          \
+    for (int64 i = 0; i <= last_index; ++i) {                       \
+      t->set_##NAME##_val(i, *val_ptr++);                           \
+    }                                                               \
+  }
 
     if (tensor->dtype() == DT_FLOAT) {
       POPULATE_TENSOR_PROTO(tensor, t, float, float)
@@ -652,6 +837,14 @@ NodeDef ConstantFolding::CreateNodeDef(const string& name,
       POPULATE_TENSOR_PROTO(tensor, t, int64, int64)
     } else if (tensor->dtype() == DT_INT32) {
       POPULATE_TENSOR_PROTO(tensor, t, int32, int)
+    } else if (tensor->dtype() == DT_INT16) {
+      POPULATE_TENSOR_PROTO(tensor, t, int16, int)
+    } else if (tensor->dtype() == DT_INT8) {
+      POPULATE_TENSOR_PROTO(tensor, t, int8, int)
+    } else if (tensor->dtype() == DT_UINT8) {
+      POPULATE_TENSOR_PROTO(tensor, t, uint8, int)
+    } else if (tensor->dtype() == DT_BOOL) {
+      POPULATE_TENSOR_PROTO(tensor, t, bool, bool)
     }
   }
   if (optimized) {
@@ -720,7 +913,7 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
       break;
     }
     const NodeDef* input_node = node_map_->GetNode(input);
-    if (!IsConstant(*input_node)) {
+    if (!IsReallyConstant(*input_node)) {
       return Status(error::INVALID_ARGUMENT,
                     strings::StrCat("Can't fold ", node.name(), ", its ", input,
                                     " isn't constant"));
@@ -737,7 +930,7 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
   }
 
   for (size_t i = 0; i < output_tensors.size(); i++) {
-    string node_name = AddPrefixToNodeName(node.name(), kConstantFoldingConst);
+    string node_name = OptimizedNodeName(node, "-folded");
     if (output_tensors.size() > 1) {
       node_name = strings::StrCat(node_name, "-", i);
     }
@@ -774,7 +967,7 @@ Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph) {
         continue;
       }
       NodeDef* input_node = node_map_->GetNode(input);
-      if (!IsConstant(*input_node)) {
+      if (!IsReallyConstant(*input_node)) {
         continue;
       }
       bool valid_input = true;
@@ -789,16 +982,14 @@ Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph) {
         continue;
       }
 
-      string const_out_name =
-          AddPrefixToNodeName(node->name(), kConstantFoldingConst);
-      string const_index_name = AddPrefixToNodeName(
-          strings::StrCat(node->name(), "_index"), kConstantFoldingConst);
+      string const_out_name = OptimizedNodeName(*node, "_const");
+      string const_index_name = OptimizedNodeName(*node, "_index");
       if (node_map_->GetNode(const_out_name) ||
           node_map_->GetNode(const_index_name)) {
         // Intended name already exists.
         return errors::AlreadyExists(
             strings::StrCat(const_out_name, " or ", const_index_name,
-                            "already present in the graph"));
+                            " already present in the graph"));
       }
 
       NodeDef* const_out = output_graph->add_node();
@@ -896,7 +1087,7 @@ Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph) {
       if (node_map_->GetNode(const_node->name())) {
         // Intended name already exists.
         return errors::AlreadyExists(strings::StrCat(
-            const_node->name(), "already present in the graph"));
+            const_node->name(), " already present in the graph"));
       }
       NodeDef* added_node = output_graph->add_node();
       *added_node = *const_node;
@@ -955,8 +1146,8 @@ Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph) {
 Status ConstantFolding::FoldGraph(GraphDef* output) {
   std::unordered_set<string> processed_nodes;
   std::deque<NodeDef*> queue;
-  for (int i = 0; i < graph_.node_size(); i++) {
-    auto node = graph_.mutable_node(i);
+  for (int i = 0; i < graph_->node_size(); i++) {
+    auto node = graph_->mutable_node(i);
     if (IsFoldable(*node)) {
       queue.push_back(node);
     }
@@ -969,6 +1160,7 @@ Status ConstantFolding::FoldGraph(GraphDef* output) {
     }
     // We need to record a copy of output nodes before FoldNode() modifies it.
     std::set<NodeDef*> outputs = node_map_->GetOutputs(node->name());
+
     Status s = FoldNode(node, output);
     processed_nodes.insert(node->name());
     if (!s.ok()) {
@@ -995,7 +1187,7 @@ Status ConstantFolding::FoldGraph(GraphDef* output) {
   output->mutable_node()->DeleteSubrange(last + 1,
                                          output->node_size() - last - 1);
 
-  for (const auto& node : graph_.node()) {
+  for (const auto& node : graph_->node()) {
     // If no fetch nodes is provided, we conservatively
     // keep all nodes in the original graph in case users need to fetch
     // their values.
@@ -1016,7 +1208,7 @@ bool ConstantFolding::IsSimplifiableReduction(const NodeDef& node) const {
   if (IsReduction(node)) {
     CHECK_LE(2, node.input_size());
     const NodeDef* reductions_indices = node_map_->GetNode(node.input(1));
-    if (IsConstant(*reductions_indices)) {
+    if (IsReallyConstant(*reductions_indices)) {
       TensorVector output;
       Status s = EvaluateNode(*reductions_indices, TensorVector(), &output);
       if (!s.ok()) {
@@ -1040,7 +1232,7 @@ bool ConstantFolding::IsSimplifiableReshape(
   }
   CHECK_LE(2, node.input_size());
   const NodeDef* new_shape = node_map_->GetNode(node.input(1));
-  if (!IsConstant(*new_shape)) {
+  if (!IsReallyConstant(*new_shape)) {
     return false;
   }
   TensorVector outputs;
@@ -1090,49 +1282,369 @@ bool ConstantFolding::IsSimplifiableReshape(
   return shape.IsCompatibleWith(new_dims);
 }
 
+#define IS_VALUE_CASE(DTYPE, VALUE)                   \
+  case DTYPE:                                         \
+    return AllValuesAre<EnumToDataType<DTYPE>::Type>( \
+        node.attr().at("value").tensor(), EnumToDataType<DTYPE>::Type(VALUE))
+
+#define IS_ONES_CASE(TYPE) IS_VALUE_CASE(TYPE, 1)
+#define IS_ZEROS_CASE(TYPE) IS_VALUE_CASE(TYPE, 0)
+
+bool ConstantFolding::IsOnes(const NodeDef& node) const {
+  if (feed_nodes_.find(node.name()) != feed_nodes_.end()) {
+    return false;
+  }
+  if (node.op() == "OnesLike") {
+    return true;
+  }
+  if (node.op() != "Const") {
+    return false;
+  }
+  const auto dtype = node.attr().at("dtype").type();
+  switch (dtype) {
+    // TODO(rmlarsen): Make DT_HALF case compile.
+    //    IS_ONES_CASE(DT_HALF);
+    IS_ONES_CASE(DT_FLOAT);
+    IS_ONES_CASE(DT_DOUBLE);
+    IS_ONES_CASE(DT_UINT8);
+    IS_ONES_CASE(DT_INT8);
+    IS_ONES_CASE(DT_UINT16);
+    IS_ONES_CASE(DT_INT16);
+    IS_ONES_CASE(DT_INT32);
+    IS_ONES_CASE(DT_INT64);
+    IS_ONES_CASE(DT_COMPLEX64);
+    IS_ONES_CASE(DT_COMPLEX128);
+    default:
+      VLOG(1) << "Unsupported type " << DataTypeString(dtype);
+      return false;
+  }
+  return false;
+}
+
+bool ConstantFolding::IsZeros(const NodeDef& node) const {
+  if (feed_nodes_.find(node.name()) != feed_nodes_.end()) {
+    return false;
+  }
+  if (node.op() == "ZerosLike") {
+    return true;
+  }
+  if (!IsConstant(node)) {
+    return false;
+  }
+  const auto dtype = node.attr().at("dtype").type();
+  switch (dtype) {
+    // TODO(rmlarsen): Make DT_HALF case compile.
+    //    IS_ZEROS_CASE(DT_HALF);
+    IS_ZEROS_CASE(DT_FLOAT);
+    IS_ZEROS_CASE(DT_DOUBLE);
+    IS_ZEROS_CASE(DT_UINT8);
+    IS_ZEROS_CASE(DT_INT8);
+    IS_ZEROS_CASE(DT_UINT16);
+    IS_ZEROS_CASE(DT_INT16);
+    IS_ZEROS_CASE(DT_INT32);
+    IS_ZEROS_CASE(DT_INT64);
+    IS_ZEROS_CASE(DT_COMPLEX64);
+    IS_ZEROS_CASE(DT_COMPLEX128);
+    default:
+      VLOG(1) << "Unsupported type " << DataTypeString(dtype);
+      return false;
+  }
+  return false;
+}
+
+void ConstantFolding::ReplaceOperationWithIdentity(int input_to_forward,
+                                                   NodeDef* node,
+                                                   GraphDef* graph) {
+  node->set_op("Identity");
+  DataType dtype = node->attr().at("T").type();
+  node->clear_attr();
+  (*node->mutable_attr())["T"].set_type(dtype);
+
+  // Propagate the designated input through the identity.
+  node->mutable_input()->SwapElements(0, input_to_forward);
+  // Add all other inputs as control dependencies.
+  for (int i = 1; i < node->input_size(); ++i) {
+    if (IsControlInput(node->input(i))) {
+      break;
+    }
+    const string ctrl_dep =
+        AddControlDependency(node->input(i), graph, node_map_.get());
+    node_map_->UpdateInput(node->name(), node->input(i), ctrl_dep);
+    node->set_input(i, ctrl_dep);
+  }
+  graph_modified_ = true;
+}
+
+void ConstantFolding::ReplaceDivisionOfOnesByReciprocal(NodeDef* node,
+                                                        GraphDef* graph) {
+  node->set_op("Reciprocal");
+  node->mutable_input()->SwapElements(0, 1);
+  const string ctrl_dep =
+      AddControlDependency(node->input(1), graph, node_map_.get());
+  node_map_->UpdateInput(node->name(), node->input(1), ctrl_dep);
+  node->set_input(1, ctrl_dep);
+  graph_modified_ = true;
+}
+
+Status ConstantFolding::ReplaceOperationWithConstant(
+    double value, const TensorShapeProto& shape, NodeDef* node,
+    GraphDef* graph) {
+  AttrValue dtype_attr = node->attr().at("T");
+  AttrValue tensor_attr;
+  TF_RETURN_IF_ERROR(CreateConstantTensorAttrValue(dtype_attr.type(), value,
+                                                   shape, &tensor_attr));
+  node->clear_attr();
+  node->mutable_attr()->insert({"dtype", dtype_attr});
+  node->mutable_attr()->insert({"value", tensor_attr});
+  node->set_op("Const");
+  // Convert all inputs to control dependencies.
+  for (int i = 0; i < node->input_size(); ++i) {
+    if (IsControlInput(node->input(i))) {
+      break;
+    }
+    const string ctrl_dep =
+        AddControlDependency(node->input(i), graph, node_map_.get());
+    node_map_->UpdateInput(node->name(), node->input(i), ctrl_dep);
+    node->set_input(i, ctrl_dep);
+  }
+  graph_modified_ = true;
+  return Status::OK();
+}
+
 Status ConstantFolding::SimplifyGraph(GraphDef* output,
-                                      const GraphProperties& properties) {
-  for (auto& node : *output->mutable_node()) {
-    if (IsSimplifiableReduction(node)) {
+                                      const GraphProperties& properties,
+                                      bool use_shape_info) {
+  const bool is_aggressive = opt_level_ == RewriterConfig::AGGRESSIVE;
+  for (int i = 0; i < output->node_size(); ++i) {
+    NodeDef* node = output->mutable_node(i);
+    if (IsSimplifiableReduction(*node)) {
       // Replace the reduction node with an identity node, that can be further
       // optimized by the model pruner.
-      const NodeDef* reductions_indices = node_map_->GetNode(node.input(1));
       DataType output_type;
-      if (node.attr().count("T") > 0) {
-        output_type = node.attr().at("T").type();
+      if (node->attr().count("T") > 0) {
+        output_type = node->attr().at("T").type();
       } else {
         // This is an 'any' or 'all' reduction. The output is always boolean.
         output_type = DT_BOOL;
       }
-      node.set_op("Identity");
-      node.clear_attr();
-      (*node.mutable_attr())["T"].set_type(output_type);
-      if (node.input_size() > 2) {
-        node.mutable_input()->SwapElements(1, node.input_size() - 1);
+      node->set_op("Identity");
+      node->clear_attr();
+      (*node->mutable_attr())["T"].set_type(output_type);
+      *node->mutable_input(1) = AsControlDependency(node->input(1));
+      graph_modified_ = true;
+      continue;
+    }
+    if (use_shape_info && IsSimplifiableReshape(*node, properties)) {
+      DataType output_type = node->attr().at("T").type();
+      node->set_op("Identity");
+      node->clear_attr();
+      (*node->mutable_attr())["T"].set_type(output_type);
+      *node->mutable_input(1) = AsControlDependency(node->input(1));
+      graph_modified_ = true;
+      continue;
+    }
+    const bool safe_to_use_shapes =
+        use_shape_info && (feed_nodes_.empty() || is_aggressive);
+    const bool is_mul = IsMul(*node);
+    const bool is_matmul = IsMatMul(*node);
+    const bool is_add = IsAdd(*node) || IsBiasAdd(*node);
+    const bool is_sub = IsSub(*node);
+    const bool is_any_div = IsAnyDiv(*node);
+    // Simplify arithmetic operations with ones or zeros.
+    if (safe_to_use_shapes &&
+        (is_mul || is_matmul || is_add || is_sub || is_any_div) &&
+        properties.HasInputProperties(node->name()) &&
+        properties.HasOutputProperties(node->name())) {
+      const NodeDef* x = node_map_->GetNode(node->input(0));
+      const NodeDef* y = node_map_->GetNode(node->input(1));
+      if (x == nullptr || y == nullptr) {
+        return errors::InvalidArgument("Invalid inputs to node: ",
+                                       node->DebugString());
       }
-      node.mutable_input()->RemoveLast();
-      for (const auto& input : reductions_indices->input()) {
-        DCHECK(IsControlInput(input));
-        *node.add_input() = input;
+      const TensorShapeProto& output_shape =
+          properties.GetOutputProperties(node->name())[0].shape();
+
+      // Simplify element-wise multiplication by ones or addition/subtraction
+      // of zeros.
+      const TensorShapeProto& y_shape =
+          properties.GetInputProperties(node->name())[1].shape();
+      const bool x_is_zero = IsZeros(*x);
+      const bool x_is_one = IsOnes(*x);
+      const bool y_matches_output_shape = ShapesEqual(output_shape, y_shape);
+      if (y_matches_output_shape &&
+          ((is_mul && x_is_one) || (is_add && x_is_zero))) {
+        // TODO(rmlarsen): Handle subtraction 0 - y.
+        // 1 * y = y or 0 + y = y.
+        ReplaceOperationWithIdentity(1, node, output);
+        continue;
+      }
+
+      // Replace 1 / y with Reciprocal op.
+      if (y_matches_output_shape && is_any_div && x_is_one) {
+        DataType type = node->attr().at("T").type();
+        if (DataTypeIsFloating(type) || DataTypeIsComplex(type)) {
+          ReplaceDivisionOfOnesByReciprocal(node, output);
+          continue;
+        }
+      }
+
+      const TensorShapeProto& x_shape =
+          properties.GetInputProperties(node->name())[0].shape();
+      const bool y_is_zero = IsZeros(*y);
+      const bool y_is_one = IsOnes(*y);
+      const bool x_matches_output_shape = ShapesEqual(output_shape, x_shape);
+      if (x_matches_output_shape &&
+          (((is_mul || is_any_div) && y_is_one) ||
+           ((is_add || is_sub) && y_is_zero && is_aggressive))) {
+        // x * 1 = x or x / 1 = x or x +/- 0 = x
+        ReplaceOperationWithIdentity(0, node, output);
+        continue;
+      }
+
+      // Simplify multiplication and matmul by zeros.
+      // Also optimize zeros divided by a tensor, but only if we are in
+      // aggressive mode, since we might get rid of divisions by zero.
+      bool optimize_zeros_divided_by_y =
+          is_any_div && x_is_zero && is_aggressive;
+      if ((x_is_zero || y_is_zero) &&
+          (is_mul || is_matmul || optimize_zeros_divided_by_y)) {
+        const PartialTensorShape shp(output_shape);
+        if (shp.IsFullyDefined()) {
+          TF_RETURN_IF_ERROR(
+              ReplaceOperationWithConstant(0, output_shape, node, output));
+          continue;
+        }
+        // Even if an input shape is only partially known, we may known that it
+        // matches the output shape and thus forward the corresponding zero
+        // input.
+        if ((is_mul || is_any_div) && x_is_zero && x_matches_output_shape) {
+          ReplaceOperationWithIdentity(0, node, output);
+          continue;
+        } else if (is_mul && y_is_zero && y_matches_output_shape) {
+          ReplaceOperationWithIdentity(1, node, output);
+          continue;
+        }
       }
     }
-    // It's possible to feed a placeholder with a tensor that doesn't have the
-    // proper shape, and reshape this tensor later on. Therefore only remove
-    // reshapes in graphs that don't have placeholders.
-    if (IsSimplifiableReshape(node, properties)) {
-      const NodeDef* new_shape = node_map_->GetNode(node.input(1));
-      DataType output_type = node.attr().at("T").type();
-      node.set_op("Identity");
-      node.clear_attr();
-      (*node.mutable_attr())["T"].set_type(output_type);
-      if (node.input_size() > 2) {
-        node.mutable_input()->SwapElements(1, node.input_size() - 1);
+
+    // Strength reduce floating point division by a constant Div(x, const) to
+    // multiplication by the reciprocal Mul(x, Reciprocal(const)). This in turn
+    // will be constant folded to Mul(x, 1.0/const).
+    if (node->input_size() >= 2 && (IsRealDiv(*node) || IsDiv(*node))) {
+      const string& const_input = node->input(1);
+      const NodeDef* denom = node_map_->GetNode(const_input);
+      CHECK(denom != nullptr);
+      if (!IsReallyConstant(*denom)) {
+        continue;
       }
-      node.mutable_input()->RemoveLast();
-      for (const auto& input : new_shape->input()) {
-        DCHECK(IsControlInput(input));
-        *node.add_input() = input;
+      if (node->attr().count("T") == 0) {
+        continue;
       }
+      DataType type = node->attr().at("T").type();
+      if (IsDiv(*node) &&
+          !(DataTypeIsFloating(type) || DataTypeIsComplex(type))) {
+        continue;
+      }
+      // Insert new reciprocal op and change node from Div to Mul.
+      NodeDef* reciprocal_node = output->add_node();
+      reciprocal_node->set_name(AddPrefixToNodeName(
+          strings::StrCat(node->name(), "_recip"), kConstantFoldingConst));
+      reciprocal_node->set_op("Reciprocal");
+      reciprocal_node->set_device(node->device());
+      node->set_op("Mul");
+      // Re-wire inputs and outputs.
+      reciprocal_node->add_input(const_input);
+      (*reciprocal_node->mutable_attr())["T"].set_type(type);
+      node->set_input(1, reciprocal_node->name());
+      node_map_->AddNode(reciprocal_node->name(), reciprocal_node);
+      node_map_->UpdateOutput(node->name(), const_input,
+                              reciprocal_node->name());
+      graph_modified_ = true;
+    }
+
+    // Consider the transformation
+    //
+    //                      +                +       = parent
+    //                     / \              / \
+    //                    C   +    -- >    X   +     = children
+    //                       / \              / \
+    //                      X   Y            C   Y   = leaves
+    //
+    // where C is constant and X is non-constant, and '+' denotes an
+    // associative and commutative operator like addition or multiplication.
+    // This optimization pushes constants down in the tree to canonicalize it.
+    // Moreoever, in cases where the child node has a second constant input Y
+    // we will create a leaf node that can be folded, e.g.
+    //
+    //    Add(C1, Add(C2, X)) -> Add(X, Add(C1, C2)) -> Add(X, C1 + C2)
+    //
+    // TODO(rmlarsen): Handle non-associative/non-commutative operators like
+    // subtraction and division, as well as mixed subtraction/addition,
+    // division/multiplication.
+    // Don't touch BiasAdd since they can't handle vectors as their first
+    // inputs.
+    if (has_fetch_ && (IsAdd(*node) || is_mul) &&
+        NumNonControlInputs(*node) == 2) {
+      NodeDef* left_child = node_map_->GetNode(node->input(0));
+      NodeDef* right_child = node_map_->GetNode(node->input(1));
+      // One child must be constant, and the other the same op as the parent.
+      if (node->op() != left_child->op() && node->op() != right_child->op()) {
+        continue;
+      }
+      const bool left_child_is_constant = IsReallyConstant(*left_child);
+      const bool right_child_is_constant = IsReallyConstant(*right_child);
+      if (!left_child_is_constant && !right_child_is_constant) {
+        continue;
+      }
+      if (node->device() != left_child->device() ||
+          node->device() != right_child->device()) {
+        continue;
+      }
+      NodeDef* op_child_node =
+          left_child_is_constant ? right_child : left_child;
+      NodeDef* const_child_node =
+          left_child_is_constant ? left_child : right_child;
+      // Make sure that it is safe to change the value of the child node->
+      if (op_child_node->input_size() < 2 ||
+          NumNonControlOutputs(*op_child_node, *node_map_) > 1 ||
+          nodes_to_preserve_.find(op_child_node->name()) !=
+              nodes_to_preserve_.end()) {
+        continue;
+      }
+
+      // Identify the nodes to swap.
+      NodeDef* left_leaf = node_map_->GetNode(op_child_node->input(0));
+      NodeDef* right_leaf = node_map_->GetNode(op_child_node->input(1));
+      const bool left_leaf_is_constant = IsReallyConstant(*left_leaf);
+      const bool right_leaf_is_constant = IsReallyConstant(*right_leaf);
+      if (left_leaf_is_constant && right_leaf_is_constant) {
+        // Child is already foldable, leave it alone.
+        continue;
+      }
+      const int non_const_leaf_input = left_leaf_is_constant ? 1 : 0;
+      const int parent_const_input = left_child_is_constant ? 0 : 1;
+      const auto& child_output = node_map_->GetOutputs(op_child_node->name());
+      if (child_output.find(const_child_node) != child_output.end()) {
+        // If there is a control edge from the child op to C, the transformation
+        // would create a cycle in the graph. We know that it must be a control
+        // edge. We can replace such a control edge with a control edge from A
+        // to C.
+        CHECK(MaybeRemoveControlInput(op_child_node->name(), const_child_node,
+                                      graph_, node_map_.get()));
+        NodeDef* other_leaf = left_leaf_is_constant ? left_leaf : right_leaf;
+        MaybeAddControlInput(other_leaf->name(), const_child_node, graph_,
+                             node_map_.get());
+      }
+
+      // Swap the constant child with a non-constant leaf node.
+      node_map_->UpdateInput(node->name(), node->input(parent_const_input),
+                             op_child_node->input(non_const_leaf_input));
+      node_map_->UpdateInput(op_child_node->name(),
+                             op_child_node->input(non_const_leaf_input),
+                             node->input(parent_const_input));
+      std::swap(*node->mutable_input(parent_const_input),
+                *op_child_node->mutable_input(non_const_leaf_input));
+      graph_modified_ = true;
     }
   }
   return Status::OK();
@@ -1141,7 +1653,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* output,
 Status ConstantFolding::RunOptimizationPass(Cluster* cluster,
                                             const GrapplerItem& item,
                                             GraphDef* output) {
-  node_map_.reset(new NodeMap(&graph_));
+  node_map_.reset(new NodeMap(graph_));
   nodes_whitelist_.clear();
   // Fold fetch nodes iff it has a single fanout. Note that if a fetch node
   // has a single fanout, it would be rewritten as a constant with the same
@@ -1152,42 +1664,36 @@ Status ConstantFolding::RunOptimizationPass(Cluster* cluster,
   // more with the original node name.
   for (const auto& fetch : item.fetch) {
     const NodeDef* fetch_node = node_map_->GetNode(fetch);
-    if (fetch_node && NumOutputs(*fetch_node) == 1) {
+    if (fetch_node && NumOutputs(*fetch_node, graph_) == 1) {
       nodes_whitelist_.insert(fetch_node->name());
     }
   }
 
   GraphProperties properties(item);
-  const bool has_feed = !item.feed.empty();
-  bool needs_shapes = !has_feed || opt_level_ == RewriterConfig::AGGRESSIVE;
-  Status s = errors::Unknown(
-      "The graph properties are needed but were not initialized");
-  if (needs_shapes) {
-    s = properties.InferStatically();
-  }
+  // It's possible to feed a placeholder with a tensor of any shape: make sure
+  // that the shape inference deals with this conservatively unless we're in
+  // aggressive mode.
+  const bool assume_valid_feeds = opt_level_ == RewriterConfig::AGGRESSIVE;
+  Status s = properties.InferStatically(assume_valid_feeds);
+  const bool can_use_shape_info = s.ok();
 
-  if (!has_feed && s.ok()) {
-    // Only use static shape information when there is no feed in the
-    // graph. That's because it's possible to feed a placeholder with a tensor
-    // of any shape, which could make the static information inconsistent with
-    // the shapes actually fed.
-    TF_RETURN_IF_ERROR(MaterializeShapes(item, properties));
-  }
-  if (opt_level_ == RewriterConfig::AGGRESSIVE && s.ok()) {
-    TF_RETURN_IF_ERROR(MaterializeConstants(item, properties));
+  if (can_use_shape_info) {
+    TF_RETURN_IF_ERROR(MaterializeShapes(properties));
+    TF_RETURN_IF_ERROR(MaterializeConstants(properties));
   }
 
   TF_RETURN_IF_ERROR(FoldGraph(output));
-
-  if (!has_feed && s.ok()) {
-    TF_RETURN_IF_ERROR(SimplifyGraph(output, properties));
-  }
+  node_map_.reset(new NodeMap(output));
+  TF_RETURN_IF_ERROR(SimplifyGraph(output, properties, can_use_shape_info));
   return Status::OK();
 }
 
 Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item,
                                  GraphDef* output) {
   nodes_to_preserve_ = item.NodesToPreserve();
+  for (const auto& feed : item.feed) {
+    feed_nodes_.insert(NodeName(feed.first));
+  }
 
   if (cpu_device_ == nullptr) {
     owned_device_.reset(new DeviceSimple());
@@ -1200,13 +1706,13 @@ Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item,
   *output = item.graph;
   int64 node_count;
   do {
-    graph_.Swap(output);
-    item_to_optimize.graph = graph_;
+    graph_modified_ = false;
+    item_to_optimize.graph.Swap(output);
+    graph_ = &item_to_optimize.graph;
     *output = GraphDef();
-    node_count = graph_.node_size();
+    node_count = graph_->node_size();
     TF_RETURN_IF_ERROR(RunOptimizationPass(cluster, item_to_optimize, output));
-  } while (output->node_size() != node_count);
-
+  } while (graph_modified_ || output->node_size() != node_count);
   *output->mutable_library() = item.graph.library();
   *output->mutable_versions() = item.graph.versions();
 
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.h b/tensorflow/core/grappler/optimizers/constant_folding.h
index f04f413c10a7e8e19520cc462f88b2a9a2d0fecd..18acc91e8a18f4bf2eb77c7e5171eaca4ff5bec5 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.h
+++ b/tensorflow/core/grappler/optimizers/constant_folding.h
@@ -51,16 +51,19 @@ class ConstantFolding : public GraphOptimizer {
                 const GraphDef& optimize_output, double result) override;
 
  private:
-  Status MaterializeShapes(const GrapplerItem& item,
-                           const GraphProperties& properties);
+  string OptimizedNodeName(const NodeDef& node, StringPiece suffix) const;
+  bool OptimizedNodeExists(const NodeDef& node, StringPiece suffix) const;
+
+  bool IsReallyConstant(const NodeDef& node) const;
+
+  Status MaterializeShapes(const GraphProperties& properties);
 
   Status MaterializeBroadcastGradientArgs(const NodeDef& node,
                                           const GraphProperties& properties);
   Status MaterializeReductionIndices(NodeDef* node,
                                      const GraphProperties& properties);
 
-  Status MaterializeConstants(const GrapplerItem& item,
-                              const GraphProperties& properties);
+  Status MaterializeConstants(const GraphProperties& properties);
   bool IsFoldable(const NodeDef& node) const;
 
   Status EvaluateNode(const NodeDef& node,
@@ -72,12 +75,21 @@ class ConstantFolding : public GraphOptimizer {
 
   Status FoldNode(NodeDef* node, GraphDef* output_graph);
 
+  bool IsOnes(const NodeDef& node) const;
+  bool IsZeros(const NodeDef& node) const;
+  void ReplaceOperationWithIdentity(int input_to_forward, NodeDef* node,
+                                    GraphDef* graph);
+  Status ReplaceOperationWithConstant(double value,
+                                      const TensorShapeProto& shape,
+                                      NodeDef* node, GraphDef* graph);
+  void ReplaceDivisionOfOnesByReciprocal(NodeDef* node, GraphDef* graph);
   Status FoldGraph(GraphDef* output);
 
   bool IsSimplifiableReduction(const NodeDef& node) const;
   bool IsSimplifiableReshape(const NodeDef& node,
                              const GraphProperties& properties) const;
-  Status SimplifyGraph(GraphDef* output, const GraphProperties& properties);
+  Status SimplifyGraph(GraphDef* output, const GraphProperties& properties,
+                       bool use_shape_info);
 
   Status RunOptimizationPass(Cluster* cluster, const GrapplerItem& item,
                              GraphDef* output);
@@ -88,11 +100,13 @@ class ConstantFolding : public GraphOptimizer {
   std::unique_ptr<DeviceBase> owned_device_;
 
   std::unique_ptr<ResourceMgr> resource_mgr_;
-  GraphDef graph_;
+  GraphDef* graph_;
   std::unique_ptr<NodeMap> node_map_;
   std::unordered_set<string> nodes_to_preserve_;
   std::unordered_set<string> nodes_whitelist_;
+  std::unordered_set<string> feed_nodes_;
   bool has_fetch_;
+  bool graph_modified_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index b2d9b02c68358fc3e22881bba60a34feb3d4211e..46998dcc91c8df2313ff92b056f732379b173661 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -20,30 +20,15 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/test.h"
-#include "tensorflow/core/public/session.h"
 
 namespace tensorflow {
 namespace grappler {
 namespace {
 
-class ConstantFoldingTest : public ::testing::Test {
- protected:
-  std::vector<Tensor> EvaluateNodes(const GraphDef& graph,
-                                    const std::vector<string>& fetch) {
-    SessionOptions options;
-    std::unique_ptr<tensorflow::Session> session(NewSession(options));
-    TF_CHECK_OK(session->Create(graph));
-    RunOptions run_options;
-    std::vector<Tensor> output_tensors;
-    TF_CHECK_OK(
-        session->Run(run_options, {}, fetch, fetch, &output_tensors, nullptr));
-    TF_CHECK_OK(session->Close());
-    return output_tensors;
-  }
-};
+class ConstantFoldingTest : public GrapplerTest {};
 
 TEST_F(ConstantFoldingTest, SimpleFolding) {
   // Build a simple graph with a few trivially prunable ops.
@@ -77,11 +62,496 @@ TEST_F(ConstantFoldingTest, SimpleFolding) {
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
 
+TEST_F(ConstantFoldingTest, AddTree) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  Output c2 = ops::Const(s.WithOpName("c2"), 2.0f, {2});
+  Output c3 = ops::Const(s.WithOpName("c3"), 3.0f, {2});
+  Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                              ops::Placeholder::Shape(TensorShape({2, 2})));
+  Output add_child = ops::Add(s.WithOpName("add_child"), c2, x);
+  Output c1 = ops::Const(s.WithOpName("c1").WithControlDependencies(add_child),
+                         1.0f, {1});
+  Output add_parent = ops::Add(s.WithOpName("add_parent"), c1, add_child);
+
+  Output y = ops::Placeholder(s.WithOpName("y"), DT_FLOAT,
+                              ops::Placeholder::Shape(TensorShape({2, 2})));
+  Output c4 = ops::Const(s.WithOpName("c4"), 4.0f, {2});
+  Output c5 = ops::Const(s.WithOpName("c5"), 5.0f, {2});
+  Output c20 = ops::Const(s.WithOpName("c20"), 20.0f, {2});
+  Output mul_child = ops::Mul(s.WithOpName("mul_child"), c4, y);
+  Output mul_parent = ops::Mul(s.WithOpName("mul_parent"), c5, mul_child);
+  Output addmul_child = ops::Add(s.WithOpName("addmul_child"), c4, x);
+  Output addmul_parent =
+      ops::Mul(s.WithOpName("addmul_parent"), c5, addmul_child);
+
+  GrapplerItem item;
+  item.fetch = {"add_parent", "mul_parent", "addmul_parent"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ConstantFolding fold(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = fold.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  // We expect the following rewrite(s) to occur:
+  //
+  //    +                +             +
+  //   / \              / \           / \
+  // 1.0  +     -->    x   +    -->  x  3.0
+  //     / \              / \
+  //   2.0  x           1.0 2.0
+  //
+  //    *                *             *
+  //   / \              / \           / \
+  // 4.0  *     -->    y   *    -->  y  20.0
+  //     / \              / \
+  //   5.0  y           4.0 5.0
+
+  EXPECT_EQ(11, output.node_size());
+  for (const auto& node : output.node()) {
+    if (node.name() == "add_child") {
+      EXPECT_EQ("Const", node.op());
+      TensorProto t = node.attr().at("value").tensor();
+      EXPECT_EQ(1, t.tensor_shape().dim_size());
+      EXPECT_EQ(2, t.tensor_shape().dim(0).size());
+    } else if (node.name() == "add_parent") {
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("add_child", node.input(1));
+    } else if (node.name() == "mul_child") {
+      EXPECT_EQ("Const", node.op());
+      TensorProto t = node.attr().at("value").tensor();
+      EXPECT_EQ(1, t.tensor_shape().dim_size());
+      EXPECT_EQ(2, t.tensor_shape().dim(0).size());
+    } else if (node.name() == "mul_parent") {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("y", node.input(0));
+      EXPECT_EQ("mul_child", node.input(1));
+    } else if (node.name() == "addmul_child") {
+      // Unchanged.
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("c4", node.input(0));
+      EXPECT_EQ("x", node.input(1));
+    }
+  }
+
+  // Check that the result nodes have the expected value.
+  std::vector<string> fetch = {"c3", "c20"};
+  auto tensor_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(fetch.size(), tensor_expected.size());
+  fetch = {"add_child", "mul_child"};
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(fetch.size(), tensors.size());
+  for (int i = 0; i < fetch.size(); i++) {
+    test::ExpectTensorEqual<float>(tensor_expected[i], tensors[i]);
+  }
+}
+
+TEST_F(ConstantFoldingTest, NeutralElement) {
+  for (bool use_const : {true, false}) {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                                ops::Placeholder::Shape(TensorShape({2, 2})));
+    Output y = ops::Placeholder(s.WithOpName("y"), DT_FLOAT,
+                                ops::Placeholder::Shape(TensorShape({2, 2})));
+    Output a = ops::Placeholder(s.WithOpName("a"), DT_FLOAT,
+                                ops::Placeholder::Shape(TensorShape({3, 2})));
+    Output b = ops::Placeholder(s.WithOpName("b"), DT_FLOAT,
+                                ops::Placeholder::Shape(TensorShape({2, 3})));
+    Output bias = ops::Placeholder(s.WithOpName("bias"), DT_FLOAT,
+                                   ops::Placeholder::Shape(TensorShape({2})));
+    Output zeros = !use_const ? ops::ZerosLike(s.WithOpName("zeros"), x)
+                              : ops::Const(s.WithOpName("zeros"), 0.0f, {2, 2});
+    Output zeros_1d = ops::Const(s.WithOpName("zeros_1d"), 0.0f, {2});
+    Output ones = !use_const ? ops::OnesLike(s.WithOpName("ones"), x)
+                             : ops::Const(s.WithOpName("ones"), 1.0f, {2, 2});
+    Output mul1 = ops::Mul(s.WithOpName("mul1"), x, zeros);
+    Output mul2 = ops::Mul(s.WithOpName("mul2"), zeros, y);
+    Output mul3 = ops::Mul(s.WithOpName("mul3"), x, ones);
+    Output mul4 = ops::Mul(s.WithOpName("mul4"), ones, y);
+    Output mul5 = ops::Mul(s.WithOpName("mul5"), x, zeros_1d);
+    Output mul6 = ops::Mul(s.WithOpName("mul6"), zeros_1d, y);
+    Output div1 = ops::Div(s.WithOpName("div1"), x, ones);
+    Output div2 = ops::Div(s.WithOpName("div2"), ones, y);
+    Output matmul1 = ops::MatMul(s.WithOpName("matmul1"), x, zeros);
+    Output matmul2 = ops::MatMul(s.WithOpName("matmul2"), zeros, y);
+    Output matmul3 = ops::MatMul(s.WithOpName("matmul3"), a, zeros);
+    Output matmul4 = ops::MatMul(s.WithOpName("matmul4"), zeros, b);
+    Output add1 = ops::Add(s.WithOpName("add1"), x, zeros);
+    Output add2 = ops::Add(s.WithOpName("add2"), zeros, y);
+    Output bias_add1 = ops::BiasAdd(s.WithOpName("bias_add1"), x, zeros_1d);
+    Output bias_add2 = ops::BiasAdd(s.WithOpName("bias_add2"), zeros, bias);
+    Output sub1 = ops::Sub(s.WithOpName("sub1"), x, zeros);
+    Output sub2 = ops::Sub(s.WithOpName("sub2"), zeros, y);
+    Output addn =
+        ops::AddN(s.WithOpName("addn"),
+                  {mul1, mul2, mul3, mul4, mul5, mul6, div1, div2, matmul1,
+                   matmul2, add1, add2, bias_add1, bias_add2, sub1, sub2});
+    GrapplerItem item;
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    item.fetch = {"addn", "matmul3", "matmul4"};
+
+    ConstantFolding optimizer(RewriterConfig::AGGRESSIVE,
+                              nullptr /* cpu_device */);
+    GraphDef output;
+    Status status = optimizer.Optimize(nullptr, item, &output);
+    TF_EXPECT_OK(status);
+
+    EXPECT_EQ(27, output.node_size());
+    for (int i = 0; i < output.node_size(); ++i) {
+      const NodeDef& node = output.node(i);
+      const string& name = node.name();
+      if (name == "mul1") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^x", node.input(0));
+        EXPECT_EQ("^zeros", node.input(1));
+      } else if (name == "mul2") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^zeros", node.input(0));
+        EXPECT_EQ("^y", node.input(1));
+      } else if (name == "mul3") {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ("x", node.input(0));
+        EXPECT_EQ("^ones", node.input(1));
+      } else if (name == "mul4") {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ("y", node.input(0));
+        EXPECT_EQ("^ones", node.input(1));
+      } else if (name == "mul5") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^x", node.input(0));
+        EXPECT_EQ("^zeros_1d", node.input(1));
+      } else if (name == "mul6") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^zeros_1d", node.input(0));
+        EXPECT_EQ("^y", node.input(1));
+      } else if (name == "div1") {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ("x", node.input(0));
+        EXPECT_EQ("^ones", node.input(1));
+      } else if (name == "div2") {
+        EXPECT_EQ("Reciprocal", node.op());
+        EXPECT_EQ("y", node.input(0));
+        EXPECT_EQ("^ones", node.input(1));
+      } else if (name == "matmul1") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^x", node.input(0));
+        EXPECT_EQ("^zeros", node.input(1));
+      } else if (name == "matmul2") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^zeros", node.input(0));
+        EXPECT_EQ("^y", node.input(1));
+      } else if (name == "matmul3") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^a", node.input(0));
+        EXPECT_EQ("^zeros", node.input(1));
+        TensorProto t = node.attr().at("value").tensor();
+        EXPECT_EQ(1, t.float_val_size());
+        EXPECT_EQ(0, t.float_val(0));
+        EXPECT_EQ(2, t.tensor_shape().dim_size());
+        EXPECT_EQ(3, t.tensor_shape().dim(0).size());
+        EXPECT_EQ(2, t.tensor_shape().dim(1).size());
+      } else if (name == "matmul4") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^zeros", node.input(0));
+        EXPECT_EQ("^b", node.input(1));
+        TensorProto t = node.attr().at("value").tensor();
+        EXPECT_EQ(1, t.float_val_size());
+        EXPECT_EQ(0, t.float_val(0));
+        EXPECT_EQ(2, t.tensor_shape().dim_size());
+        EXPECT_EQ(2, t.tensor_shape().dim(0).size());
+        EXPECT_EQ(3, t.tensor_shape().dim(1).size());
+      } else if (name == "add1") {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ("x", node.input(0));
+        EXPECT_EQ("^zeros", node.input(1));
+      } else if (name == "add2") {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ("y", node.input(0));
+        EXPECT_EQ("^zeros", node.input(1));
+      } else if (name == "bias_add1") {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ("x", node.input(0));
+        EXPECT_EQ("^zeros_1d", node.input(1));
+      } else if (name == "bias_add2") {
+        // We don't eliminate this one, because it requires broadcasting.
+        EXPECT_EQ("BiasAdd", node.op());
+        EXPECT_EQ("zeros", node.input(0));
+        EXPECT_EQ("bias", node.input(1));
+      } else if (name == "sub1") {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ("x", node.input(0));
+        EXPECT_EQ("^zeros", node.input(1));
+      } else if (name == "sub2") {
+        // We don't handle this case yet.
+        EXPECT_EQ("Sub", node.op());
+        EXPECT_EQ("zeros", node.input(0));
+        EXPECT_EQ("y", node.input(1));
+      }
+      const std::set<string> square_zero_const{"mul1", "mul2",    "mul5",
+                                               "mul6", "matmul1", "matmul2"};
+      if (square_zero_const.count(name) > 0) {
+        TensorProto t = node.attr().at("value").tensor();
+        EXPECT_EQ(1, t.float_val_size());
+        EXPECT_EQ(0, t.float_val(0));
+        EXPECT_EQ(2, t.tensor_shape().dim_size());
+        EXPECT_EQ(2, t.tensor_shape().dim(0).size());
+        EXPECT_EQ(2, t.tensor_shape().dim(1).size());
+      }
+    }
+  }
+}
+
+TEST_F(ConstantFoldingTest, StrengthReduce_Reciprocal) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output cf_half = ops::Const(s.WithOpName("cf_half"), 0.5f, {1});
+  Output xf = ops::Placeholder(s.WithOpName("xf"), DT_FLOAT,
+                               ops::Placeholder::Shape(TensorShape({2, 2})));
+  Output xi = ops::Placeholder(s.WithOpName("xi"), DT_INT32,
+                               ops::Placeholder::Shape(TensorShape({2, 2})));
+  Output ci = ops::Const(s.WithOpName("ci"), 2, {1});
+  Output cf = ops::Const(s.WithOpName("cf"), 2.0f, {1});
+  Output div_i = ops::Div(s.WithOpName("div_i"), xi, ci);
+  Output div_f = ops::Div(s.WithOpName("div_f"), xf, cf);
+  Output realdiv = ops::RealDiv(s.WithOpName("realdiv"), xf, cf);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"div_f", "div_i", "realdiv"};
+  ConstantFolding optimizer(RewriterConfig::AGGRESSIVE,
+                            nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(8, output.node_size());
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    const string& name = node.name();
+    if (name == "div_i") {
+      // Integer division is unchanged.
+      EXPECT_EQ("Div", node.op());
+      EXPECT_EQ("xi", node.input(0));
+      EXPECT_EQ("ci", node.input(1));
+    } else if (name == "div_f") {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ("xf", node.input(0));
+      EXPECT_EQ("ConstantFolding/div_f_recip", node.input(1));
+    } else if (name == "realdiv") {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ("xf", node.input(0));
+      EXPECT_EQ("ConstantFolding/realdiv_recip", node.input(1));
+    } else if (name == "ConstantFolding/div_f_recip") {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(DT_FLOAT, node.attr().at("dtype").type());
+      TensorProto t = node.attr().at("value").tensor();
+      EXPECT_EQ(DT_FLOAT, t.dtype());
+      EXPECT_EQ(1, t.tensor_shape().dim_size());
+      EXPECT_EQ(1, t.tensor_shape().dim(0).size());
+    } else if (name == "ConstantFolding/realdiv_recip") {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(DT_FLOAT, node.attr().at("dtype").type());
+      TensorProto t = node.attr().at("value").tensor();
+      EXPECT_EQ(DT_FLOAT, t.dtype());
+      EXPECT_EQ(1, t.tensor_shape().dim_size());
+      EXPECT_EQ(1, t.tensor_shape().dim(0).size());
+    }
+  }
+
+  // Check that the reciprocals have the expected value.
+  std::vector<string> fetch = {"cf_half"};
+  auto tensor_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(fetch.size(), tensor_expected.size());
+  fetch = {"ConstantFolding/div_f_recip", "ConstantFolding/realdiv_recip"};
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(fetch.size(), tensors.size());
+  for (int i = 0; i < fetch.size(); i++) {
+    test::ExpectTensorEqual<float>(tensor_expected[0], tensors[i]);
+  }
+}
+
+TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_UnknownOutputShape) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x_known =
+      ops::Placeholder(s.WithOpName("x_known"), DT_FLOAT,
+                       ops::Placeholder::Shape(TensorShape({2, 2})));
+  Output x_partially_known =
+      ops::Placeholder(s.WithOpName("x_partially_unknown"), DT_FLOAT,
+                       ops::Placeholder::Shape(PartialTensorShape({-1, -1})));
+  Output x_unknown = ops::Placeholder(s.WithOpName("x_unknown"), DT_FLOAT);
+  Output zeros_known = ops::ZerosLike(s.WithOpName("zeros_known"), x_known);
+  Output zeros_partially_known =
+      ops::ZerosLike(s.WithOpName("zeros_partially_known"), x_partially_known);
+  Output zeros_unknown =
+      ops::ZerosLike(s.WithOpName("zeros_unknown"), x_unknown);
+
+  // Multiplies without any additional ops to supply the output shape.
+  int count = 0;
+  std::vector<Output> muls;
+  std::unordered_set<string> not_converted;
+  std::unordered_set<string> to_const;
+  std::unordered_set<string> to_identity;
+  for (const auto* x : {&x_known, &x_partially_known, &x_unknown}) {
+    for (const auto* zeros :
+         {&zeros_known, &zeros_partially_known, &zeros_unknown}) {
+      const string name = strings::StrCat("mul_", count++);
+      muls.push_back(ops::Mul(s.WithOpName(name), *x, *zeros));
+      if (x == &x_partially_known && zeros == &zeros_partially_known) {
+        to_identity.insert(name);
+      } else if (x == &x_unknown || zeros == &zeros_unknown) {
+        not_converted.insert(name);
+      } else {
+        to_const.insert(name);
+      }
+    }
+  }
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(RewriterConfig::AGGRESSIVE,
+                            nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  LOG(INFO) << output.DebugString();
+
+  EXPECT_EQ(15, output.node_size());
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    const string& name = node.name();
+    if (to_const.count(name) > 0) {
+      EXPECT_EQ("Const", node.op()) << node.name();
+    } else if (to_identity.count(name) > 0) {
+      EXPECT_EQ("Identity", node.op()) << node.name();
+    } else if (not_converted.count(name) > 0) {
+      EXPECT_EQ("Mul", node.op()) << node.name();
+    }
+  }
+}
+
+TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output known_shape = ops::Const(s.WithOpName("known_shape"), 0.0f, {2, 2});
+  Output x_partially_known =
+      ops::Placeholder(s.WithOpName("x_partially_unknown"), DT_FLOAT,
+                       ops::Placeholder::Shape(PartialTensorShape({-1, -1})));
+  Output x_unknown = ops::Placeholder(s.WithOpName("x_unknown"), DT_FLOAT);
+  Output zeros_partially_known =
+      ops::ZerosLike(s.WithOpName("zeros_partially_known"), x_partially_known);
+  Output zeros_unknown =
+      ops::ZerosLike(s.WithOpName("zeros_unknown"), x_unknown);
+
+  // If at least one of the inputs to AddN has a known shape, shape inference
+  // will propagate the shape back to the inputs of AddN, making the
+  // output shapes of all its inputs known
+  std::vector<Output> muls_deduced_output_shape;
+  std::unordered_set<string> to_const;
+  int count = 0;
+  for (const auto& x : {x_partially_known, x_unknown}) {
+    for (const auto& zeros : {zeros_partially_known, zeros_unknown}) {
+      const string name = strings::StrCat("mul_", count++);
+      muls_deduced_output_shape.push_back(
+          ops::Mul(s.WithOpName(name), x, zeros));
+      to_const.insert(name);
+    }
+  }
+  // We add a known shape as input to AddN to propagate it back to the
+  // multiplies above, which means they can all be turned into Const nodes.
+  muls_deduced_output_shape.push_back(known_shape);
+  Output addn1 = ops::AddN(s.WithOpName("addn1"), muls_deduced_output_shape);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(RewriterConfig::AGGRESSIVE,
+                            nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  LOG(INFO) << output.DebugString();
+
+  EXPECT_EQ(10, output.node_size());
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    const string& name = node.name();
+    if (to_const.count(name) > 0) {
+      EXPECT_EQ("Const", node.op()) << node.name();
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_TRUE(IsControlInput(node.input(0)));
+      EXPECT_TRUE(IsControlInput(node.input(1)));
+    }
+  }
+}
+
+TEST_F(ConstantFoldingTest, CreateConstNodes) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+#define MAKE_TEST_GRAPH(TYPE)                                               \
+  Output TYPE##_const =                                                     \
+      ops::Const(s.WithOpName(#TYPE "_const"), static_cast<TYPE>(10), {5}); \
+  Output TYPE##_mul =                                                       \
+      ops::Mul(s.WithOpName(#TYPE "_mul"), TYPE##_const, TYPE##_const);     \
+  Output TYPE##_id = ops::Identity(s.WithOpName(#TYPE "_id"), TYPE##_mul)
+
+  MAKE_TEST_GRAPH(float);
+  MAKE_TEST_GRAPH(double);
+  MAKE_TEST_GRAPH(int64);
+  MAKE_TEST_GRAPH(int32);
+  MAKE_TEST_GRAPH(int16);
+  MAKE_TEST_GRAPH(int8);
+  MAKE_TEST_GRAPH(uint8);
+#undef MAKE_TEST_GRAPH
+
+  Output bool_const = ops::Const(s.WithOpName("bool_const"), true, {5});
+  Output bool_and =
+      ops::LogicalAnd(s.WithOpName("bool_and"), bool_const, bool_const);
+  Output bool_id = ops::Identity(s.WithOpName("bool_id"), bool_and);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  ConstantFolding fold(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = fold.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(24, output.node_size());
+  for (const NodeDef& node : output.node()) {
+#define CHECK_RESULT(TYPE, FIELD)                                             \
+  if (node.name() == #TYPE "_mul") {                                          \
+    EXPECT_EQ(5,                                                              \
+              node.attr().at("value").tensor().tensor_shape().dim(0).size()); \
+    EXPECT_EQ(1, node.attr().at("value").tensor().FIELD##_val_size());        \
+    EXPECT_EQ(10 * 10, node.attr().at("value").tensor().FIELD##_val(0));      \
+  }
+
+    CHECK_RESULT(float, float);
+    CHECK_RESULT(double, double);
+    CHECK_RESULT(int64, int64);
+    CHECK_RESULT(int32, int);
+    CHECK_RESULT(int16, int);
+    CHECK_RESULT(int8, int);
+    CHECK_RESULT(uint8, int);
+#undef CHECK_RESULT
+
+    if (node.name() == "bool_and") {
+      EXPECT_EQ(5,
+                node.attr().at("value").tensor().tensor_shape().dim(0).size());
+      EXPECT_EQ(1, node.attr().at("value").tensor().bool_val_size());
+      EXPECT_EQ(true && true, node.attr().at("value").tensor().bool_val(0));
+    }
+  }
+}
+
 TEST_F(ConstantFoldingTest, FoldingNodeWithTwoOutputs) {
   // Build a simple graph with a few trivially prunable ops.
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
-  Output a = ops::Const(s.WithOpName("a"), 10, {3});
+  Output a = ops::Const(s.WithOpName("a"), 10, {5});
   auto b = ops::Unique(s.WithOpName("b"), {a});
   Output c = ops::Identity(s.WithOpName("c"), {b.y});
   Output d = ops::Identity(s.WithOpName("d"), {b.idx});
@@ -445,8 +915,10 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN) {
   TF_EXPECT_OK(status);
   int found = 0;
   for (const auto& node : output.node()) {
-    EXPECT_NE(AddPrefixToNodeName("s-0", kConstantFoldingConst), node.name());
-    EXPECT_NE(AddPrefixToNodeName("s-1", kConstantFoldingConst), node.name());
+    EXPECT_NE(AddPrefixToNodeName("s-matshapes-0", kConstantFoldingConst),
+              node.name());
+    EXPECT_NE(AddPrefixToNodeName("s-matshapes-1", kConstantFoldingConst),
+              node.name());
     if (node.name() == "i1a" || node.name() == "i1b") {
       ++found;
       EXPECT_EQ("s", node.input(0));
@@ -457,7 +929,7 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN) {
     }
     if (node.name() == "i3a" || node.name() == "i3b") {
       ++found;
-      EXPECT_EQ(AddPrefixToNodeName("s-2", kConstantFoldingConst),
+      EXPECT_EQ(AddPrefixToNodeName("s-matshapes-2", kConstantFoldingConst),
                 node.input(0));
     }
     if (node.name() == "s") {
@@ -467,7 +939,8 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN) {
       EXPECT_EQ("v2", node.input(1));
       EXPECT_EQ("v3", node.input(2));
     }
-    if (node.name() == AddPrefixToNodeName("s-2", kConstantFoldingConst)) {
+    if (node.name() ==
+        AddPrefixToNodeName("s-matshapes-2", kConstantFoldingConst)) {
       ++found;
       EXPECT_EQ("Const", node.op());
       EXPECT_EQ("^s", node.input(0));
@@ -735,7 +1208,7 @@ TEST_F(ConstantFoldingTest, NoOpReduction) {
       EXPECT_EQ("Identity", node.op());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("v", node.input(0));
-      EXPECT_EQ("^v", node.input(1));
+      EXPECT_EQ("^i", node.input(1));
     }
   }
   EXPECT_TRUE(found);
@@ -794,20 +1267,20 @@ TEST_F(ConstantFoldingTest, NoOpReshape) {
       EXPECT_EQ("Identity", node.op());
       ASSERT_EQ(3, node.input_size());
       EXPECT_EQ("v1", node.input(0));
-      EXPECT_EQ("^d1", node.input(1));
-      EXPECT_EQ("^v1", node.input(2));
+      EXPECT_EQ("^i1", node.input(1));
+      EXPECT_EQ("^d1", node.input(2));
     } else if (node.name() == "r3") {
       ++found;
       EXPECT_EQ("Identity", node.op());
       ASSERT_EQ(2, node.input_size());
       EXPECT_EQ("v3", node.input(0));
-      EXPECT_EQ("^v3", node.input(1));
+      EXPECT_EQ("^i3", node.input(1));
     } else if (node.name() == "r4") {
       ++found;
       EXPECT_EQ("Identity", node.op());
       ASSERT_EQ(2, node.input_size());
       EXPECT_EQ("v4", node.input(0));
-      EXPECT_EQ("^v4", node.input(1));
+      EXPECT_EQ("^i4", node.input(1));
     } else if (node.name() == "r2") {
       ++found;
       EXPECT_EQ("Reshape", node.op());
@@ -879,19 +1352,19 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
     if (node.name() == "o1") {
       ++found;
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("ConstantFolding/f-0", node.input(0));
+      EXPECT_EQ("ConstantFolding/f-bcastargs-0", node.input(0));
     } else if (node.name() == "o2") {
       ++found;
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("ConstantFolding/f-1", node.input(0));
-    } else if (node.name() == "ConstantFolding/f-0") {
+      EXPECT_EQ("ConstantFolding/f-bcastargs-1", node.input(0));
+    } else if (node.name() == "ConstantFolding/f-bcastargs-0") {
       ++found;
       EXPECT_EQ("Const", node.op());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("^f", node.input(0));
       EXPECT_EQ(0, TensorShape(node.attr().at("value").tensor().tensor_shape())
                        .num_elements());
-    } else if (node.name() == "ConstantFolding/f-1") {
+    } else if (node.name() == "ConstantFolding/f-bcastargs-1") {
       ++found;
       EXPECT_EQ("Const", node.op());
       EXPECT_EQ(1, node.input_size());
@@ -901,21 +1374,14 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
     } else if (node.name() == "p1") {
       ++found;
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("ConstantFolding/i-0", node.input(0));
+      EXPECT_EQ("i", node.input(0));
     } else if (node.name() == "p2") {
       ++found;
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("i:1", node.input(0));
-    } else if (node.name() == "ConstantFolding/i-0") {
-      ++found;
-      EXPECT_EQ("Const", node.op());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("^i", node.input(0));
-      EXPECT_EQ(0, TensorShape(node.attr().at("value").tensor().tensor_shape())
-                       .num_elements());
     }
   }
-  EXPECT_EQ(7, found);
+  EXPECT_EQ(6, found);
 }
 
 TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
@@ -963,3 +1429,5 @@ TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
+
+//  LocalWords:  NewRootScope
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index bd8a58d81452fbe93e6dfa8e67c939b84803bdac..edb0db65e987318e1e64bf0288b6ef18a7b9d662 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/dependency_optimizer.h"
 
+#include <unordered_map>
 #include <unordered_set>
 
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -23,8 +24,10 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
@@ -33,50 +36,83 @@ namespace grappler {
 
 namespace {
 
-int RemoveInput(NodeDef* node, const string& input, NodeMap* node_map) {
-  int num_removed = 0;
+bool RemoveInput(NodeDef* node, const string& input, NodeMap* node_map) {
+  bool removed_input = false;
   int pos = 0;
   while (pos < node->input_size()) {
     if (node->input(pos) == input) {
       node->mutable_input()->SwapElements(pos, node->input_size() - 1);
       node->mutable_input()->RemoveLast();
-      node_map->RemoveOutput(node->name(), NodeName(input));
+      node_map->RemoveOutput(NodeName(input), node->name());
+      removed_input = true;
     } else {
       ++pos;
     }
-    ++num_removed;
   }
-  return num_removed;
+  return removed_input;
 }
 
-// Remove dulicate control inputs.
-void PruneControlInputs(NodeDef* node) {
-  std::unordered_set<string> inputs;
-  int pos = 0;
-  while (pos < node->input_size()) {
-    const string& input = node->input(pos);
-    // TODO(rmlarsen): Remove control inputs that also appears as a regular
-    // inputs. Currently, doing so breaks testControlFlowStrictness in
-    // python/framework/function_test.
-    //    if (!inputs.insert(NodeName(input)).second && IsControlInput(input)) {
-    if (IsControlInput(input) && !inputs.insert(input).second) {
-      VLOG(1) << "**** Removing duplicate control input: " << input
-              << " from node " << node->DebugString();
-      node->mutable_input()->SwapElements(pos, node->input_size() - 1);
-      node->mutable_input()->RemoveLast();
-    } else {
-      ++pos;
-    }
+void DeleteNodes(const std::set<int>& nodes_to_delete, GraphDef* graph) {
+  int last = graph->node_size() - 1;
+  for (auto it = nodes_to_delete.rbegin(); it != nodes_to_delete.rend(); ++it) {
+    const int index = *it;
+    graph->mutable_node()->SwapElements(index, last);
+    last--;
   }
+  graph->mutable_node()->DeleteSubrange(last + 1, nodes_to_delete.size());
 }
 
 }  // namespace
 
+bool DependencyOptimizer::SafeToRemoveIdentity(const NodeDef& node) {
+  if (!IsIdentity(node)) {
+    return true;
+  }
+
+  if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
+    return false;
+  }
+  if (!fetch_nodes_known_) {
+    // The output values of this node may be needed.
+    return false;
+  }
+  const NodeDef* input = node_map_->GetNode(NodeName(node.input(0)));
+  CHECK(input != nullptr) << "node = " << node.name()
+                          << " input = " << node.input(0);
+  // Don't remove Identity nodes corresponding to Variable reads or following
+  // Recv.
+  if (IsVariable(*input) || IsRecv(*input)) {
+    return false;
+  } else if (IsSwitch(*input)) {
+    // Don't turn Identity nodes following Switch into NoOp or remove them
+    // if it requires anchoring a control dependencies the Switch node, which
+    // is not valid.
+    if (StringPiece(node.name()).starts_with(kConstantFoldingCtrl)) {
+      // TODO(rmlarsen): Try to remove this artificial contraint.
+      return false;
+    }
+  }
+  for (auto consumer : node_map_->GetOutputs(node.name())) {
+    if (node.input_size() > 1 && IsMerge(*consumer)) {
+      return false;
+    }
+    if (IsSwitch(*input)) {
+      for (const string& consumer_input : consumer->input()) {
+        if (consumer_input == AsControlDependency(node.name())) {
+          return false;
+        }
+      }
+    }
+  }
+  return true;
+}
+
 bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) {
   if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
     return false;
   }
   if (!fetch_nodes_known_ || NumNonControlOutputs(node, *node_map_) > 0) {
+    // The output values of this node may be needed.
     return false;
   }
   if (IsMerge(node) || IsSwitch(node)) {
@@ -88,6 +124,9 @@ bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) {
   if (!IsFreeOfSideEffect(node)) {
     return false;
   }
+  if (node.op() == "ControlTrigger") {
+    return false;
+  }
   if (node.op().rfind("Submodel", 0) == 0) {
     return false;
   }
@@ -97,27 +136,69 @@ bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) {
     return false;
   }
 
-  // TODO(rmlarsen): We have to skip Identity nodes to make an obsolete test in
-  // python/training/session_manager_test.py pass. See if we can fix or get rid
-  // of that test.
+  if (!SafeToRemoveIdentity(node)) {
+    return false;
+  }
+
   const std::unordered_set<string> do_not_rewrite_ops{
-      "Assert", "CheckNumerics",         "Identity",    "_Retval",
-      "_Arg",   "_ParallelConcatUpdate", "_TPUExecute", "_TPUCompile"};
+      "Assert",     "CheckNumerics",         "_Retval",
+      "_Arg",       "_ParallelConcatUpdate", "_TPUExecute",
+      "_TPUCompile"};
   return do_not_rewrite_ops.find(node.op()) == do_not_rewrite_ops.end();
 }
 
-string DependencyOptimizer::TryOptimizeDependencies(
-    NodeDef* node, SetVector<NodeDef*>* nodes_to_simplify) {
+void DependencyOptimizer::OptimizeNode(int node_idx,
+                                       SetVector<int>* nodes_to_simplify,
+                                       std::set<int>* nodes_to_delete) {
+  NodeDef* node = optimized_graph_->mutable_node(node_idx);
+  const bool is_noop = IsNoOp(*node);
+  const bool is_identity = IsIdentity(*node);
+  const string node_name = node->name();
+  // Constant nodes with no input control dependency are always executed early,
+  // so we can prune all their output control dependencies.
+  if (IsConstant(*node) && node->input_size() == 0) {
+    const std::set<NodeDef*> output_nodes = node_map_->GetOutputs(node_name);
+    for (NodeDef* fanout : output_nodes) {
+      bool optimize_fanout = false;
+      bool data_connection = false;
+      for (int i = fanout->input_size() - 1; i >= 0; --i) {
+        int pos;
+        string input_name = ParseNodeName(fanout->input(i), &pos);
+        if (input_name == node_name) {
+          if (pos < 0) {
+            fanout->mutable_input()->SwapElements(i, fanout->input_size() - 1);
+            fanout->mutable_input()->RemoveLast();
+            optimize_fanout = true;
+          } else {
+            data_connection = true;
+          }
+        }
+      }
+      if (optimize_fanout) {
+        nodes_to_simplify->PushBack(node_to_idx_[fanout]);
+        if (!data_connection) {
+          node_map_->RemoveOutput(node_name, fanout->name());
+        }
+      }
+    }
+    if (node_map_->GetOutputs(node_name).empty() && fetch_nodes_known_ &&
+        nodes_to_preserve_.find(node_name) == nodes_to_preserve_.end()) {
+      // Mark the node for deletion.
+      nodes_to_delete->insert(node_to_idx_[node]);
+    }
+    return;
+  }
+
   // Change ops that only have control dependencies as outputs to NoOps.
-  if (node->op() != "NoOp" && SafeToConvertToNoOp(*node)) {
-    VLOG(1) << "***** Replacing  " << node->name() << " (" << node->op()
+  if (!is_noop && SafeToConvertToNoOp(*node)) {
+    VLOG(1) << "***** Replacing  " << node_name << " (" << node->op()
             << ") with NoOp.";
     // The outputs of this node are not consumed. Replace its inputs with
     // control dependencies and replace the op itself with the NoOp op.
     std::unordered_set<string> ctrl_inputs;
     int pos = 0;
     while (pos < node->input_size()) {
-      const string& old_input = node->input(pos);
+      const string old_input = node->input(pos);
       if (IsControlInput(old_input)) {
         if (!ctrl_inputs.insert(old_input).second) {
           // We found a duplicate control input. Remove it.
@@ -132,20 +213,21 @@ string DependencyOptimizer::TryOptimizeDependencies(
           old_input, optimized_graph_, node_map_.get());
       if (ctrl_inputs.insert(ctrl_input).second) {
         node->set_input(pos, ctrl_input);
-        node_map_->UpdateInput(node->name(), old_input, ctrl_input);
-        auto old_input_node = node_map_->GetNode(old_input);
-        nodes_to_simplify->PushBack(old_input_node);
+        node_map_->UpdateInput(node_name, old_input, ctrl_input);
+        const NodeDef* old_input_node = node_map_->GetNode(old_input);
+        nodes_to_simplify->PushBack(node_to_idx_[old_input_node]);
       }
       ++pos;
     }
     node->set_op("NoOp");
     node->clear_attr();
-    nodes_to_simplify->PushBack(node);
-    return "";
+    nodes_to_simplify->PushBack(node_to_idx_[node]);
+    return;
   }
 
-  // Remove NoOp nodes if their fan-in or fan-out is less than 2.
-  // The non-trivial rewrites take the following form:
+  // Remove NoOp nodes if the product of their fan-in and fan-out is less than
+  // or equal to the sum of the fan-in and fan-out. The non-trivial rewrites
+  // take the following form:
   //
   // Case a)
   //    x --^> +------+                x --^> +---+
@@ -158,120 +240,341 @@ string DependencyOptimizer::TryOptimizeDependencies(
   //    x --^> | NoOp | --^> b  ==>    | x | --^> b
   //           |      | ...            |   | ...
   //           +------+ --^> c         +---+ --^> c
-  if (node->op() == "NoOp" &&
-      nodes_to_preserve_.find(node->name()) == nodes_to_preserve_.end()) {
-    const auto output_nodes = node_map_->GetOutputs(node->name());
+  // Case c)
+  //           +------+                x ---^> a
+  //    x --^> | NoOp | --^> a  ==>      \/
+  //    y --^> |      | --^> b           /\
+  //           +------+                y ---^> b
+  //
+  // We only apply this optimization if we don't increase the number of control
+  // edges across device boundaries, e.g. in cases a) and b) if NoOp and
+  // a and x, respectively, are on the same device. Control edges across device
+  // boundaries require inter-device communication (Send/Recv pairs to be
+  // inserted in the graph), which is very costly.
+  //
+  // We also remove identity nodes, subject to the same constraints on number of
+  // resulting control edges and device boundary crossings:
+  //
+  // Case a)
+  //          +----------+ ---> a       +---+ ---> a
+  //    x --> | Identity | --^> b  ==>  | x | --^> b
+  //          |          | ...          |   | ...
+  //          +----------+ --^> c       +---+ --^> c
+  //
+  // Case b)
+  //    x ---> +----------+ ---> a      x ---> +---+
+  //    y --^> | Identity |        ==>  y --^> | a |
+  //    ...    |          |               ...  |   |
+  //    z --^> +----------+             z --^> +---+
+  //
+  // Case c)
+  //           +----------+             x ---> +---+
+  //    x ---> | Identity | ---> a ==>   \--^> | a |
+  //    y --^> |          | --^> b       /\    +---+
+  //           +----------+             y --^> b
+
+  if (is_noop || is_identity) {
+    const auto& output_node_set = node_map_->GetOutputs(node_name);
+    const std::vector<NodeDef*> output_nodes(output_node_set.begin(),
+                                             output_node_set.end());
     const int num_outputs = output_nodes.size();
     const int num_inputs = node->input_size();
-    if (num_inputs > 1 && num_outputs > 1) {
-      return "";
+
+    if (num_inputs * num_outputs > num_inputs + num_outputs) {
+      return;
     }
-    VLOG(1) << "***** Rerouting input around  " << node->name();
     std::vector<NodeDef*> input_nodes;
     for (int i = 0; i < num_inputs; ++i) {
-      NodeDef* tmp = node_map_->GetNode(node->input(i));
-      if (tmp != nullptr) {
-        input_nodes.push_back(tmp);
+      NodeDef* input_node = node_map_->GetNode(node->input(i));
+      CHECK_NE(input_node, nullptr);
+      input_nodes.push_back(input_node);
+    }
+
+    // Make sure that we don't increase the number of edges that cross
+    // device boundaries.
+    if ((num_inputs == 1 && num_outputs > 1 &&
+         input_nodes[0]->device() != node->device()) ||
+        (num_inputs > 1 && num_outputs == 1 &&
+         output_nodes[0]->device() != node->device())) {
+      return;
+    }
+    if (num_inputs == 2 && num_outputs == 2) {
+      const string& noop_dev = node->device();
+      const string& in0_dev = input_nodes[0]->device();
+      const string& in1_dev = input_nodes[1]->device();
+      const string& out0_dev = output_nodes[0]->device();
+      const string& out1_dev = output_nodes[1]->device();
+      const int num_cross_before = static_cast<int>(in0_dev != noop_dev) +
+                                   static_cast<int>(in1_dev != noop_dev) +
+                                   static_cast<int>(out0_dev != noop_dev) +
+                                   static_cast<int>(out1_dev != noop_dev);
+      const int num_cross_after = static_cast<int>(in0_dev != out0_dev) +
+                                  static_cast<int>(in0_dev != out1_dev) +
+                                  static_cast<int>(in1_dev != out0_dev) +
+                                  static_cast<int>(in1_dev != out1_dev);
+      if (num_cross_after > num_cross_before) {
+        return;
+      }
+      // To avoid potentially removing Identity nodes following _Recv nodes,
+      // we require that no device crossings occur in that case.
+      // TODO(rmlarsen): See if we can relax this condition.
+      if (is_identity && (num_cross_after > 0 || num_cross_before > 0)) {
+        return;
       }
     }
+    if (is_identity && !SafeToRemoveIdentity(*node)) {
+      return;
+    }
+
+    VLOG(1) << "***** Rerouting input around\n" << node->DebugString();
+    // Now remove the node and re-wire its inputs to its outputs.
     for (auto consumer : output_nodes) {
       bool updated_consumer = false;
-      VLOG(1) << "***** Considering consumer  " << consumer->name() << "\n"
-              << consumer->DebugString();
+      VLOG(1) << "consumer before:\n" << consumer->DebugString();
       for (int i = 0; i < num_inputs; ++i) {
-        const string& input = node->input(i);
+        const NodeDef* input = input_nodes[i];
         // Forward dependency from input to consumer if it doesn't already
         // depend on it.
-        if (node_map_->GetOutputs(NodeName(input)).count(consumer) == 0) {
-          consumer->add_input(input);
+        if (is_identity && i == 0) {
+          // Replace regular input from Identity node.
+          bool found_input = false;
+          string new_input;
+          const string& input_to_forward = node->input(0);
+          CHECK(!IsControlInput(input_to_forward));
+          for (int j = 0; j < consumer->input_size(); ++j) {
+            const string& old_input = consumer->input(j);
+            if (old_input == node_name) {
+              new_input = input_to_forward;
+              node_map_->UpdateInput(consumer->name(), old_input, new_input);
+              consumer->set_input(j, new_input);
+              found_input = true;
+            } else if (old_input == AsControlDependency(NodeName(node_name))) {
+              new_input = AsControlDependency(NodeName(input_to_forward));
+              node_map_->UpdateInput(consumer->name(), old_input, new_input);
+              consumer->set_input(j, new_input);
+              found_input = true;
+            }
+          }
+          CHECK(found_input);
           updated_consumer = true;
-          node_map_->AddOutput(NodeName(input), consumer->name());
-          nodes_to_simplify->PushBack(input_nodes[i]);
+        } else {
+          // Forward dependency from input to consumer if it doesn't already
+          // depend on it.
+          if (node_map_->GetOutputs(input->name()).count(consumer) == 0) {
+            consumer->add_input(AsControlDependency(input->name()));
+            node_map_->AddOutput(input->name(), consumer->name());
+            nodes_to_simplify->PushBack(node_to_idx_[input]);
+            updated_consumer = true;
+          }
         }
       }
       // Remove dependency on node from consumer.
-      updated_consumer |= RemoveInput(
-          consumer, AsControlDependency(node->name()), node_map_.get());
+      updated_consumer |= RemoveInput(consumer, AsControlDependency(node_name),
+                                      node_map_.get());
       if (updated_consumer) {
-        VLOG(1) << "***** Updated consumer  " << consumer->name() << " ("
-                << consumer->op() << ")";
-        nodes_to_simplify->PushBack(consumer);
+        nodes_to_simplify->PushBack(node_to_idx_[consumer]);
       }
+      VLOG(1) << "consumer after:\n" << consumer->DebugString();
     }
+    node_map_->RemoveOutputs(node_name);
+    if (fetch_nodes_known_ &&
+        nodes_to_preserve_.find(node_name) == nodes_to_preserve_.end()) {
+      // Mark the node for deletion.
+      nodes_to_delete->insert(node_idx);
 
-    // Clear all (control) inputs to this NoOp node.
-    if (fetch_nodes_known_) {
-      node_map_->RemoveInputs(node->name());
+      // Disconnect the node from its inputs to enable further optimizations.
+      node_map_->RemoveInputs(node_name);
       node->clear_input();
     }
   }
+}
 
-  return "";
+void DependencyOptimizer::CleanControlInputs() {
+  for (int i = 0; i < optimized_graph_->node_size(); ++i) {
+    DedupControlInputs(optimized_graph_->mutable_node(i));
+  }
 }
 
 Status DependencyOptimizer::OptimizeDependencies() {
-  // TODO(rmlarsen,bsteiner): The following code is similar to the control loop
-  // in the ArithmeticOptimizer. Dedup this.
-  SetVector<NodeDef*> nodes_to_simplify;
+  SetVector<int> nodes_to_simplify;
+  std::set<int> nodes_to_delete;
   for (int i = 0; i < optimized_graph_->node_size(); ++i) {
-    NodeDef* node = optimized_graph_->mutable_node(i);
-    if (node->op() == "NoOp" || SafeToConvertToNoOp(*node)) {
-      PruneControlInputs(node);
-      nodes_to_simplify.PushBack(node);
+    const NodeDef& node = optimized_graph_->node(i);
+    if (IsNoOp(node) || IsIdentity(node) || IsConstant(node) ||
+        SafeToConvertToNoOp(node)) {
+      nodes_to_simplify.PushBack(i);
     }
   }
   while (!nodes_to_simplify.Empty()) {
-    NodeDef* node = nodes_to_simplify.PopBack();
-    const string simplified_tensor =
-        TryOptimizeDependencies(node, &nodes_to_simplify);
-    if (!simplified_tensor.empty() &&
-        NodeName(simplified_tensor) != node->name()) {
-      // Always consider simplified_tensor for further optimizations.
-      NodeDef* simplified_node = node_map_->GetNode(simplified_tensor);
-      if (simplified_node != nullptr) {
-        nodes_to_simplify.PushBack(simplified_node);
+    int node_to_simplify = nodes_to_simplify.PopBack();
+    // Discard nodes that were marked for deletion already.
+    while (nodes_to_delete.find(node_to_simplify) != nodes_to_delete.end()) {
+      node_to_simplify = nodes_to_simplify.PopBack();
+    }
+    OptimizeNode(node_to_simplify, &nodes_to_simplify, &nodes_to_delete);
+  }
+
+  if (fetch_nodes_known_) {
+    VLOG(1) << "Deleted " << nodes_to_delete.size() << " out of "
+            << optimized_graph_->node_size() << " nodes.";
+    DeleteNodes(nodes_to_delete, optimized_graph_);
+    node_map_.reset(new NodeMap(optimized_graph_));
+    BuildNodeToIdx();
+  }
+  return Status::OK();
+}
+
+Status DependencyOptimizer::TransitiveReduction() {
+  // PRECONDITION: optimized_graph_ must be sorted topologically.
+  const int num_nodes = optimized_graph_->node_size();
+  // Set up a compressed version of the graph to save a constant factor in the
+  // expensive algorithm below. Also cache the set of control outputs and the
+  // highest index of a target of any control output from each node.
+  int num_controls = 0;
+  std::vector<gtl::InlinedVector<int, 4>> inputs(num_nodes);
+  std::vector<gtl::InlinedVector<std::pair<int, int>, 2>> control_outputs(
+      num_nodes);
+  for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
+    const NodeDef& node = optimized_graph_->node(node_idx);
+    if (ModifiesFrameInfo(node) || !HasOpDef(node)) {
+      // Ignore function nodes and nodes that modify frame info.
+      continue;
+    }
+    for (int input_slot = 0; input_slot < node.input_size(); ++input_slot) {
+      const string& input = node.input(input_slot);
+      const NodeDef* input_node = node_map_->GetNode(input);
+      if (ModifiesFrameInfo(*input_node) || IsMerge(*input_node)) {
+        // Ignore edges from nodes that modify frame info and from Merge nodes,
+        // because we cannot know which of it's input paths executes.
+        continue;
+      }
+      const int input_node_idx = node_to_idx_[input_node];
+      inputs[node_idx].push_back(input_node_idx);
+      if (IsControlInput(input)) {
+        ++num_controls;
+        control_outputs[input_node_idx].emplace_back(node_idx, input_slot);
       }
-      // When `node` is simplifed to another node rather than in-place, the
-      // consumers of `node` are already redirected to `simplified_tensor`.
-      // Re-push the consumers into `nodes_to_simplify` for further
-      // optimizations.
-      std::set<NodeDef*> consumers = node_map_->GetOutputs(node->name());
-      for (NodeDef* consumer : consumers) {
-        // Update `consumer`'s use of `node` to `input`'s operand.
-        for (int i = 0; i < consumer->input_size(); ++i) {
-          int operand_pos;
-          string operand_node_name =
-              ParseNodeName(consumer->input(i), &operand_pos);
-          if (operand_node_name == node->name()) {
-            *consumer->mutable_input(i) =
-                (operand_pos < 0
-                     ? AsControlDependency(NodeName(simplified_tensor))
-                     : simplified_tensor);
+    }
+  }
+
+  // Run the longest path in DAG algorithm for each source node that has control
+  // outputs. If, for any target node of a control output, there exists a path
+  // of length > 1, we can drop that control dependency.
+  int num_controls_removed = 0;
+  std::vector<int> longest_distance(num_nodes);
+  // Map from target_index -> set of (input_slot, source_index), representing
+  // the control edges to remove. We sort them in reverse order by input slot,
+  // such that when we swap them out so we don't clobber the
+  // node(target).input() repeated field.
+  typedef std::pair<int, int> InputSlotAndSource;
+  std::unordered_map<
+      int, std::set<InputSlotAndSource, std::greater<InputSlotAndSource>>>
+      control_edges_to_remove;
+  for (int source = 0; source < num_nodes; ++source) {
+    int highest_control_target = -1;
+    for (const auto& control_output : control_outputs[source]) {
+      if (control_output.first > highest_control_target) {
+        highest_control_target = control_output.first;
+      }
+    }
+    if (highest_control_target <= source) {
+      continue;
+    }
+    std::fill(longest_distance.begin() + source,
+              longest_distance.begin() + highest_control_target + 1, 0);
+    for (int target = source + 1; target <= highest_control_target; ++target) {
+      for (int input : inputs[target]) {
+        // If the input node is before source in the topo order, no path
+        // source -> input -> target can exits and we can skip it.
+        // Also only extend a path from the source itself or from nodes that
+        // have a path from source, indicated by longest_distance[input] > 0.
+        if (input == source ||
+            (input > source && longest_distance[input] > 0)) {
+          // If source -> input -> target is longer than the longest
+          // path so far from source -> target, update the longest_distance.
+          int candidate_longest_distance = longest_distance[input] + 1;
+          if (candidate_longest_distance > longest_distance[target]) {
+            longest_distance[target] = candidate_longest_distance;
           }
         }
-        node_map_->UpdateInput(consumer->name(), node->name(),
-                               simplified_tensor);
-        nodes_to_simplify.PushBack(consumer);
+      }
+    }
+
+    // If the longest path from source to target of a control dependency is
+    // longer than 1, there exists an alternate path, and we can eliminate the
+    // redundant direct control dependency.
+    for (const auto& control_output : control_outputs[source]) {
+      const int target = control_output.first;
+      if (longest_distance[target] > 1) {
+        const int input_slot = control_output.second;
+        control_edges_to_remove[target].emplace(input_slot, source);
+        //        VLOG(1) << "Removing edge from:\n"
+        //                << optimized_graph_->node(source).DebugString() <<
+        //                "\n\nto:\n\n"
+        //                << optimized_graph_->node(target).DebugString();
       }
     }
   }
-  for (int i = 0; i < optimized_graph_->node_size(); ++i) {
-    NodeDef* node = optimized_graph_->mutable_node(i);
-    PruneControlInputs(node);
+
+  for (const auto& it : control_edges_to_remove) {
+    const int target = it.first;
+    NodeDef* target_node = optimized_graph_->mutable_node(target);
+    for (const InputSlotAndSource& slot_and_source : it.second) {
+      const int input_slot = slot_and_source.first;
+      const int source = slot_and_source.second;
+      const NodeDef& source_node = optimized_graph_->node(source);
+      CHECK_LT(input_slot, target_node->input_size());
+      target_node->mutable_input()->SwapElements(input_slot,
+                                                 target_node->input_size() - 1);
+      node_map_->RemoveOutput(source_node.name(), target_node->name());
+      target_node->mutable_input()->RemoveLast();
+      ++num_controls_removed;
+    }
   }
+  VLOG(1) << "Removed " << num_controls_removed << " out of " << num_controls
+          << " control dependencies";
   return Status::OK();
 }
 
+void DependencyOptimizer::BuildNodeToIdx() {
+  // Set up &node -> index map.
+  node_to_idx_.clear();
+  for (int i = 0; i < optimized_graph_->node_size(); ++i) {
+    const NodeDef& node = optimized_graph_->node(i);
+    node_to_idx_[&node] = i;
+  }
+}
+
 Status DependencyOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                      GraphDef* optimized_graph) {
   optimized_graph_ = optimized_graph;
   *optimized_graph_ = item.graph;
   nodes_to_preserve_ = item.NodesToPreserve();
-  node_map_.reset(new NodeMap(optimized_graph));
   fetch_nodes_known_ = !item.fetch.empty();
-  VLOG(1) << "Graph before optimization:\n" << optimized_graph_->DebugString();
-  TF_RETURN_IF_ERROR(OptimizeDependencies());
-  VLOG(1) << "Graph after optimization:\n" << optimized_graph_->DebugString();
+  CleanControlInputs();
+
+  const int num_iterations = 2;
+  for (int iteration = 0; iteration < num_iterations; ++iteration) {
+    Status topo_sort_status;
+    // Perform topological sort to prepare the graph for transitive reduction.
+    topo_sort_status = TopologicalSort(optimized_graph_);
+    // Set up index-based graph datastructures to speed up analysis steps below.
+    node_map_.reset(new NodeMap(optimized_graph_));
+    BuildNodeToIdx();
+
+    if (topo_sort_status.ok()) {
+      // Remove redundant control dependencies.
+      TF_RETURN_IF_ERROR(TransitiveReduction());
+    } else {
+      LOG(ERROR) << topo_sort_status.error_message();
+    }
+    // Turn nodes with only control outputs into NoOps, prune NoOp and Identity
+    // nodes.
+    TF_RETURN_IF_ERROR(OptimizeDependencies());
+
+    // Dedup control inputs.
+    CleanControlInputs();
+  }
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.h b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
index a9d33227449ac05e74bbd26c5e51c2deac5644fd..61ed15479370614bc79c15b450039f0cbf30908d 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEPENDENCY_OPTIMIZER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEPENDENCY_OPTIMIZER_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEPENDENCY_OPTIMIZER_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEPENDENCY_OPTIMIZER_H_
 
 #include <unordered_set>
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
@@ -43,29 +43,35 @@ class DependencyOptimizer : public GraphOptimizer {
                 const GraphDef& optimized_graph, double result) override;
 
  private:
-  Status OptimizeDependencies();
-
+  // Returns true if node is not an Identity node or if it is an Identity
+  // that is safe to remove.
+  bool SafeToRemoveIdentity(const NodeDef& node);
   // Returns true if it is safe to convert node to NoOp.
   bool SafeToConvertToNoOp(const NodeDef& node);
-
-  // Tries to simplify the expression that roots at `node` and replaces the uses
-  // of `node` to the simplified expression. Returns the name of the simplified
-  // tensor (e.g. "split:1") or an empty string if no simplification is
-  // performed.
-  string TryOptimizeDependencies(NodeDef* node,
-                                 SetVector<NodeDef*>* nodes_to_simplify);
-
-  bool HasOnlyControlOutputs(const NodeDef* node);
+  // Removes all duplicate control dependencies.
+  void CleanControlInputs();
+  // Builds a map from the &optimized_graph_->node(i) to i.
+  void BuildNodeToIdx();
+  // Tries to optimize the node with the given index, possibly additional
+  // optimizations by inserting nodes in nodes_to_simplify, and pruning nodes by
+  // inserting them in nodes_to_delete.
+  void OptimizeNode(int node_idx, SetVector<int>* nodes_to_simplify,
+                    std::set<int>* nodes_to_delete);
+  // Eliminates redundant control dependencies by computing the transitive
+  // reduction of the graph.
+  Status TransitiveReduction();
+  // Main driver of dependency optimizations.
+  Status OptimizeDependencies();
 
   RewriterConfig::Toggle opt_level_;
-
   bool fetch_nodes_known_;
   std::unordered_set<string> nodes_to_preserve_;
   std::unique_ptr<NodeMap> node_map_;
+  std::unordered_map<const NodeDef*, int> node_to_idx_;
   GraphDef* optimized_graph_;  // Not owned.
 };
 
 }  // end namespace grappler
 }  // end namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEPENDENCY_OPTIMIZER_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEPENDENCY_OPTIMIZER_H_
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
index 90f5ec8c3fca8fdb8473f9d3c9868a710fa72b2b..33d6b992d21212fe325c642b87d3c3736185c445 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -59,10 +60,47 @@ TEST_F(DependencyOptimizerTest, NoOp) {
   VerifyGraphsEqual(item.graph, output, __FUNCTION__);
 }
 
-TEST_F(DependencyOptimizerTest, ChangeToNoop) {
+TEST_F(DependencyOptimizerTest, DependenciesDrivenByConstants) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
   Output y = ops::Const(s.WithOpName("y"), {1.0f, 2.0f}, {1, 2});
+  Output z = ops::Const(s.WithOpName("z"), {1.0f, 2.0f}, {1, 2});
+  Output add = ops::Add(s.WithOpName("add"), x, y);
+  Output id1 =
+      ops::Identity(s.WithOpName("id1").WithControlDependencies(x), add);
+  Output id2 = ops::Identity(
+      s.WithOpName("id2").WithControlDependencies(y).WithControlDependencies(z),
+      add);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch.push_back("id1");
+  item.fetch.push_back("id2");
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  // The 'z' node should have been optimized away leaving only 5 nodes.
+  EXPECT_EQ(5, output.node_size());
+
+  for (const NodeDef& node : item.graph.node()) {
+    if (node.name() == "id1" || node.name() == "id2") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("add", node.input(0));
+    }
+  }
+}
+
+TEST_F(DependencyOptimizerTest, ChangeToNoop) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::RandomUniform(s.WithOpName("x"), {1, 2}, DT_FLOAT);
+  Output y = ops::RandomUniform(s.WithOpName("y"), {1, 2}, DT_FLOAT);
   Output add = ops::Add(s.WithOpName("add"), x, y);
   Output id1 =
       ops::Identity(s.WithOpName("id1").WithControlDependencies(add), x);
@@ -85,21 +123,75 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop) {
 
   EXPECT_EQ(item.graph.node_size(), output.node_size());
   for (int i = 0; i < item.graph.node_size(); ++i) {
-    const NodeDef& original = item.graph.node(i);
-    const NodeDef& optimized = output.node(i);
-    EXPECT_EQ(original.name(), optimized.name());
-    if (original.name() == "add") {
-      EXPECT_EQ("NoOp", optimized.op());
-    } else {
-      EXPECT_EQ(original.op(), optimized.op());
+    const NodeDef& node = item.graph.node(i);
+    if (node.name() == "add") {
+      EXPECT_EQ("NoOp", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("^x", node.input(0));
+      EXPECT_EQ("^y", node.input(1));
+    } else if (node.name() == "id1") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("^y", node.input(1));
+    } else if (node.name() == "id2") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("y", node.input(0));
+      EXPECT_EQ("^x", node.input(1));
     }
-    EXPECT_EQ(original.input_size(), optimized.input_size());
-    for (int j = 0; j < original.input_size(); ++j) {
-      if (original.name() == "add") {
-        EXPECT_EQ(AsControlDependency(original.input(j)), optimized.input(j));
-      } else {
-        EXPECT_EQ(original.input(j), optimized.input(j));
-      }
+  }
+}
+
+TEST_F(DependencyOptimizerTest, ChangeToNoop_SwitchIdentity) {
+  // This tests that we don't try to repeatedly add Identity nodes
+  // with names like "ConstantFoldingCtrl/foo/bar/switch_$port" when
+  // multiple nodes reading the same output of a Switch node get
+  // optimized (e.g. constant folded or turned into NoOps).
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  ops::Variable v_in(scope.WithOpName("v_in"), {3}, DT_FLOAT);
+  ops::Variable v_ctrl(scope.WithOpName("v_ctrl"), {}, DT_BOOL);
+  ops::Switch s(scope.WithOpName("switch"), v_in, v_ctrl);
+  // "neg" should be turned into a NoOp with a control dependency from
+  // the existing Identity node "ConstantFoldingCtrl/switch_1" and
+  // subsequently eliminated completely from the graph.
+  Output neg = ops::Neg(scope.WithOpName("neg"), s.output_true);
+  // c1 could be a result of constant folding some node fed by neg.
+  Output c1 = ops::Const(scope.WithOpName("c1").WithControlDependencies(neg),
+                         {1.0f, 2.0f}, {1, 2});
+  Output ctrl_dep_id = ops::Identity(
+      scope.WithOpName("ConstantFoldingCtrl/switch_1"), s.output_true);
+  // c2 could be a result of constant folding a node fed by s, which also
+  // added the ctrl_dep_id node.
+  Output c2 =
+      ops::Const(scope.WithOpName("c2").WithControlDependencies(ctrl_dep_id),
+                 {1.0f, 2.0f}, {1, 2});
+  Output neg1 = ops::Neg(scope.WithOpName("neg1"), s.output_false);
+  Output neg2 = ops::Neg(scope.WithOpName("neg2"), ctrl_dep_id);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+  item.fetch.push_back("c1");
+  item.fetch.push_back("c2");
+  item.fetch.push_back("neg1");
+  item.fetch.push_back("neg2");
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(item.graph.node_size() - 1, output.node_size());
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    // "neg" should be eliminated.
+    EXPECT_NE("neg", node.name());
+    // A control dep from "^ConstantFoldingCtrl/switch_1"
+    // should be attached to "c1".
+    if (node.name() == "c1") {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^ConstantFoldingCtrl/switch_1", node.input(0));
     }
   }
 }
@@ -107,8 +199,8 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop) {
 // TODO(rmlarsen): Add test to make sure we skip Switch and Merge.
 TEST_F(DependencyOptimizerTest, ChangeToNoop_NoFetch) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
-  Output y = ops::Const(s.WithOpName("y"), {1.0f, 2.0f}, {1, 2});
+  Output x = ops::RandomUniform(s.WithOpName("x"), {1, 2}, DT_FLOAT);
+  Output y = ops::RandomUniform(s.WithOpName("y"), {1, 2}, DT_FLOAT);
   Output add = ops::Add(s.WithOpName("add"), x, y);
   Output id1 =
       ops::Identity(s.WithOpName("id1").WithControlDependencies(add), x);
@@ -123,12 +215,13 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop_NoFetch) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
+  TF_CHECK_OK(TopologicalSort(&item.graph));
   VerifyGraphsEqual(item.graph, output, __FUNCTION__);
 }
 
 TEST_F(DependencyOptimizerTest, RemoveNoOps_EmptyInputOrOutput) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output x = ops::Const(s, {1.0f, 2.0f}, {1, 2});
+  Output x = ops::RandomUniform(s, {1, 2}, DT_FLOAT);
   auto noop1 = ops::NoOp(s);
   auto noop2 = ops::NoOp(s.WithControlDependencies(x));
   Output id = ops::Identity(s.WithControlDependencies({noop1.operation}), x);
@@ -152,15 +245,50 @@ TEST_F(DependencyOptimizerTest, RemoveNoOps_EmptyInputOrOutput) {
       EXPECT_EQ(0, node.input_size());
     } else if (node.name() == "Identity") {
       EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("Const", node.input(0));
+      EXPECT_EQ("RandomUniform", node.input(0));
     }
   }
 }
 
+TEST_F(DependencyOptimizerTest, RemoveNoOps_DeviceBoundaries) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::RandomUniform(s.WithOpName("x").WithDevice("/CPU:0"), {1, 2},
+                                DT_FLOAT);
+  Output y = ops::RandomUniform(s.WithOpName("y").WithDevice("/CPU:0"), {1, 2},
+                                DT_FLOAT);
+  // NoOp with a single input- and two output dependencies.
+  auto noop = ops::NoOp(s.WithControlDependencies(x).WithDevice("/CPU:1"));
+  // NoOp with a two input- and a single output dependency.
+  auto noop_1 = ops::NoOp(
+      s.WithControlDependencies(x).WithControlDependencies(y).WithDevice(
+          "/CPU:0"));
+  Output id = ops::Identity(
+      s.WithControlDependencies({noop.operation}).WithDevice("/CPU:1"), x);
+  Output id_1 = ops::Identity(
+      s.WithControlDependencies({noop.operation, noop_1.operation})
+          .WithDevice("/CPU:1"),
+      y);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch.push_back("Identity");
+  item.fetch.push_back("Identity_1");
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  // The optimization should be disabled to prevent increasing the number of
+  // nodes crossing device boundaries.
+  TF_CHECK_OK(TopologicalSort(&item.graph));
+  VerifyGraphsEqual(item.graph, output, __FUNCTION__);
+}
+
 TEST_F(DependencyOptimizerTest, RemoveNoOps_SingleInputOrOutput) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
-  Output y = ops::Const(s.WithOpName("y"), {1.0f, 2.0f}, {1, 2});
+  Output x = ops::RandomUniform(s.WithOpName("x"), {1, 2}, DT_FLOAT);
+  Output y = ops::RandomUniform(s.WithOpName("y"), {1, 2}, DT_FLOAT);
   // NoOp with a single input- and two output dependencies.
   auto noop = ops::NoOp(s.WithControlDependencies(x));
   // NoOp with a two input- and a single output dependency.
@@ -197,6 +325,196 @@ TEST_F(DependencyOptimizerTest, RemoveNoOps_SingleInputOrOutput) {
   }
 }
 
+TEST_F(DependencyOptimizerTest, RemoveIdentity) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::RandomUniform(s.WithOpName("x"), {1, 2}, DT_FLOAT);
+  Output y = ops::RandomUniform(s.WithOpName("y"), {1, 2}, DT_FLOAT);
+  Output z = ops::RandomUniform(s.WithOpName("z"), {1, 2}, DT_FLOAT);
+
+  // Identity nodes to be removed.
+  // Case a) with a single input- and multiple outputs.
+  auto id_a = ops::Identity(s.WithOpName("id_a"), x);
+  // Case b) with multiple inputs and a single output.
+  auto id_b = ops::Identity(
+      s.WithOpName("id_b").WithControlDependencies(y).WithControlDependencies(
+          z),
+      x);
+  // Case c) with two inputs and two outputs.
+  auto id_c = ops::Identity(s.WithOpName("id_c").WithControlDependencies(y), x);
+
+  // Output for Case a.
+  Output a_a = ops::Identity(s.WithOpName("a_a"), id_a);
+  Output a_b = ops::Identity(s.WithOpName("a_b"), id_a);
+  Output a_c =
+      ops::Identity(s.WithOpName("a_c").WithControlDependencies(id_a), z);
+  Output a_d =
+      ops::Identity(s.WithOpName("a_d").WithControlDependencies(id_a), z);
+  // Output for Case b.
+  Output b_a = ops::Identity(s.WithOpName("b_a"), id_b);
+  // Output for Case c.
+  Output c_a = ops::Identity(s.WithOpName("c_a"), id_c);
+  Output c_b =
+      ops::Identity(s.WithOpName("c_b").WithControlDependencies(id_c), z);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"a_a", "a_b", "a_c", "a_d", "b_a", "c_a", "c_b"};
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(item.graph.node_size() - 3, output.node_size());
+  for (const NodeDef& node : output.node()) {
+    EXPECT_NE("id_a", node.name());
+    EXPECT_NE("id_b", node.name());
+    EXPECT_NE("id_c", node.name());
+    if (node.name() == "a_a" || node.name() == "a_b") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+    }
+    if (node.name() == "a_c" || node.name() == "a_d") {
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("z", node.input(0));
+      EXPECT_EQ("^x", node.input(1));
+    }
+    if (node.name() == "b_a") {
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("^y", node.input(1));
+      EXPECT_EQ("^z", node.input(2));
+    }
+    if (node.name() == "c_a") {
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("^y", node.input(1));
+    }
+    if (node.name() == "c_b") {
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("z", node.input(0));
+      EXPECT_EQ("^x", node.input(1));
+      EXPECT_EQ("^y", node.input(2));
+    }
+  }
+}
+
+TEST_F(DependencyOptimizerTest, RemoveIdentity_RepeatedInputs) {
+  // Corner cases with repeated inputs.
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  ops::Variable x(scope.WithOpName("x"), {}, DT_BOOL);
+  ops::Variable y(scope.WithOpName("y"), {}, DT_BOOL);
+  ops::Switch sw(scope.WithOpName("switch"), x, x);
+  // id0 should be removed.
+  Output id0 = ops::Identity(scope.WithOpName("id0"), sw.output_true);
+  // id1 should not be removed, since it would anchor a control dependency
+  // on the switch.
+  Output id1 = ops::Identity(scope.WithOpName("id1"), sw.output_false);
+  Output or0 = ops::LogicalOr(scope.WithOpName("or0"), id0, id0);
+  Output or1 = ops::LogicalOr(scope.WithOpName("or1"), id0, y);
+  Output or2 = ops::LogicalOr(
+      scope.WithOpName("or2").WithControlDependencies(id1), y, y);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+  item.fetch.push_back("or0");
+  item.fetch.push_back("or1");
+  item.fetch.push_back("or2");
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(item.graph.node_size() - 1, output.node_size());
+  for (const NodeDef& node : output.node()) {
+    EXPECT_NE("id0", node.name());
+    if (node.name() == "or0") {
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("switch:1", node.input(0));
+      EXPECT_EQ("switch:1", node.input(1));
+    }
+    if (node.name() == "or1") {
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("switch:1", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+    }
+    if (node.name() == "or2") {
+      // or1 should be unchanged.
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("y", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+      EXPECT_EQ("^id1", node.input(2));
+    }
+  }
+}
+
+TEST_F(DependencyOptimizerTest, Transitive_Reduction_Simple) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output c = ops::Const(s.WithOpName("c"), {1.0f, 2.0f}, {1, 2});
+  Output x = ops::Square(s.WithOpName("x"), c);
+  Output neg1 = ops::Neg(s.WithOpName("neg1"), x);
+  Output neg2 =
+      ops::Neg(s.WithOpName("neg2").WithControlDependencies({x}), neg1);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch.push_back("neg2");
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  EXPECT_EQ(4, output.node_size());
+  EXPECT_EQ("neg2", output.node(3).name());
+  EXPECT_EQ(1, output.node(3).input_size());
+  EXPECT_EQ("neg1", output.node(3).input(0));
+}
+
+TEST_F(DependencyOptimizerTest, ChangeToNoop_Identity) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  ops::Variable v_in(scope.WithOpName("v_in"), {3}, DT_FLOAT);
+  Output id_after_var = ops::Identity(scope.WithOpName("id_after_var"), v_in);
+  ops::Variable v_ctrl(scope.WithOpName("v_ctrl"), {}, DT_BOOL);
+  ops::Switch s(
+      scope.WithOpName("switch").WithControlDependencies(id_after_var), v_in,
+      v_ctrl);
+  Output id0 = ops::Identity(scope.WithOpName("id0"), s.output_true);
+  Output grappler_added_id = ops::Identity(
+      scope.WithOpName("ConstantFoldingCtrl/switch_1"), s.output_true);
+  Output c1 = ops::Const(scope.WithOpName("c1")
+                             .WithControlDependencies(id_after_var)
+                             .WithControlDependencies(grappler_added_id),
+                         {1.0f, 2.0f}, {1, 2});
+  Output id1 = ops::Identity(scope.WithOpName("id1"), c1);
+  Output id2 = ops::Identity(scope.WithOpName("id2"), id0);
+  Output fetch =
+      ops::Identity(scope.WithOpName("fetch").WithControlDependencies(id1), c1);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+  item.fetch.push_back("c1");
+  item.fetch.push_back("id2");
+  item.fetch.push_back("fetch");
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(item.graph.node_size() - 2, output.node_size());
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    // "id0" and "id1" but neither "ConstantFoldingCtrl/switch_1",
+    // "id_after_var, nor "id2"" should be eliminated.
+    EXPECT_NE("id0", node.name());
+    EXPECT_NE("id1", node.name());
+    if (node.name() == "c1") {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^ConstantFoldingCtrl/switch_1", node.input(0));
+    }
+  }
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer.h b/tensorflow/core/grappler/optimizers/graph_optimizer.h
index 55a90dce88f91bf88e6c6ad4ff5f9d2804d539f9..42d9837312d25f3504c85f12883c4ac818157cdd 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer.h
@@ -41,7 +41,7 @@ class GraphOptimizer {
                           GraphDef* optimized_graph) = 0;
 
   // Method invoked by the framework so that it can provide feedback
-  // on how well the "optimize_output" (produced as *output from a
+  // on how well the "optimized_graph" (produced as *optimized_graph from a
   // call to Optimize) performed.  Lower "result" scores are better.
   virtual void Feedback(Cluster* cluster, const GrapplerItem& item,
                         const GraphDef& optimized_graph, double result) = 0;
diff --git a/tensorflow/core/grappler/optimizers/graph_rewriter.cc b/tensorflow/core/grappler/optimizers/graph_rewriter.cc
index 2d47ded156048480f243c01e8a706829578438c5..b45ceb12a7972d8e0fb15c0562d0e4ceeeeeef1c 100644
--- a/tensorflow/core/grappler/optimizers/graph_rewriter.cc
+++ b/tensorflow/core/grappler/optimizers/graph_rewriter.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 
 namespace tensorflow {
@@ -61,10 +62,19 @@ void GraphRewriter::ForwardInputs(
     const NodeDef& original_node,
     const std::unordered_set<const NodeDef*>& nodes_to_delete,
     NodeDef* new_node) {
-  ForwardInputsInternal(original_node, nodes_to_delete, new_node);
+  ForwardInputsInternal(original_node, nodes_to_delete, false, new_node);
   if (!new_node->name().empty()) {
     optimized_nodes_[new_node->name()] = new_node;
   }
+  // Reorder inputs such that control inputs come after regular inputs.
+  int pos = 0;
+  for (int i = 0; i < new_node->input_size(); ++i) {
+    if (!IsControlInput(new_node->input(i))) {
+      new_node->mutable_input()->SwapElements(pos, i);
+      ++pos;
+    }
+  }
+  DedupControlInputs(new_node);
 }
 
 bool GraphRewriter::DrivesControlDependency(const NodeDef& node) const {
@@ -72,6 +82,10 @@ bool GraphRewriter::DrivesControlDependency(const NodeDef& node) const {
          control_dependency_drivers_.end();
 }
 
+bool GraphRewriter::FeedsMerge(const NodeDef& node) const {
+  return merge_feeders_.find(&node) != merge_feeders_.end();
+}
+
 bool GraphRewriter::IsDrivenByControlDependency(const NodeDef& node) const {
   for (const auto& input : node.input()) {
     CHECK(!input.empty());
@@ -94,12 +108,27 @@ bool GraphRewriter::ReceivesRefValue(const NodeDef& node) const {
   return ref_receivers_.find(&node) != ref_receivers_.end();
 }
 
+bool GraphRewriter::IsDrivenBySwitch(const NodeDef& node) const {
+  return switch_receivers_.find(&node) != switch_receivers_.end();
+}
+
+bool GraphRewriter::RemovalIncreasesEdgeCount(const NodeDef& node) const {
+  const int in_degree = node.input_size();
+  auto itr = nodes_.find(node.name());
+  if (itr == nodes_.end()) {
+    return true;
+  }
+  const int out_degree = itr->second->out_degree;
+  return in_degree * out_degree > in_degree + out_degree;
+}
+
 void GraphRewriter::RecordConnectivity(
     const NodeDef& node, const std::unordered_set<string>& function_names) {
   const bool is_function =
       function_names.find(node.op()) != function_names.end();
 
   bool ref_receiver = false;
+  bool switch_receiver = false;
   for (const auto& input : node.input()) {
     int position = 0;
     string input_node_name = ParseNodeName(input, &position);
@@ -107,8 +136,14 @@ void GraphRewriter::RecordConnectivity(
     if (itr == nodes_.end()) {
       continue;
     }
-    const NodeInfo* fanin_info = itr->second.get();
+
+    NodeInfo* fanin_info = itr->second.get();
     const NodeDef* fanin = fanin_info->def;
+    if (IsMerge(node)) {
+      merge_feeders_.insert(fanin);
+    }
+    // Update out_degree of fanin.
+    ++fanin_info->out_degree;
     if (position < 0) {
       // This is a control edge
       control_dependency_drivers_.insert(fanin);
@@ -120,7 +155,9 @@ void GraphRewriter::RecordConnectivity(
       if (is_function) {
         function_neighbors_.insert(fanin);
       }
-
+      if (IsSwitch(*fanin)) {
+        switch_receiver = true;
+      }
       if (position < fanin_info->outputs.size() &&
           IsRefType(fanin_info->outputs[position])) {
         ref_receiver = true;
@@ -134,34 +171,41 @@ void GraphRewriter::RecordConnectivity(
   if (ref_receiver) {
     ref_receivers_.insert(&node);
   }
+  if (switch_receiver) {
+    switch_receivers_.insert(&node);
+  }
 }
 
 void GraphRewriter::ForwardInputsInternal(
     const NodeDef& node,
     const std::unordered_set<const NodeDef*>& nodes_to_delete,
-    NodeDef* new_node) {
+    bool add_as_control, NodeDef* new_node) {
   // To speed things up, use the optimized version of the node if
   // available.
   auto itr = optimized_nodes_.find(node.name());
   if (itr != optimized_nodes_.end()) {
     for (const string& input : itr->second->input()) {
-      *new_node->add_input() = input;
+      *new_node->add_input() =
+          add_as_control ? AsControlDependency(NodeName(input)) : input;
     }
     return;
   }
   for (const auto& input : node.input()) {
-    string input_node_name = NodeName(input);
+    const string input_node_name = NodeName(input);
     auto itr = nodes_.find(input_node_name);
     if (itr == nodes_.end()) {
       // Invalid input, preserve it as is.
-      *new_node->add_input() = input;
+      *new_node->add_input() =
+          add_as_control ? AsControlDependency(NodeName(input)) : input;
       continue;
     }
     const NodeDef* input_node = itr->second->def;
     if (nodes_to_delete.find(input_node) != nodes_to_delete.end()) {
-      ForwardInputsInternal(*input_node, nodes_to_delete, new_node);
+      ForwardInputsInternal(*input_node, nodes_to_delete,
+                            add_as_control || IsControlInput(input), new_node);
     } else {
-      *new_node->add_input() = input;
+      *new_node->add_input() =
+          add_as_control ? AsControlDependency(NodeName(input)) : input;
     }
   }
 }
diff --git a/tensorflow/core/grappler/optimizers/graph_rewriter.h b/tensorflow/core/grappler/optimizers/graph_rewriter.h
index 4b9c9feef8f7a4456183a00c8c64f6a0d0991ad4..3d48d628e203e3d1ab6c8ee3bda9575facbd129f 100644
--- a/tensorflow/core/grappler/optimizers/graph_rewriter.h
+++ b/tensorflow/core/grappler/optimizers/graph_rewriter.h
@@ -58,15 +58,27 @@ class GraphRewriter {
   // Returns true if the node has input from a stateful op.
   bool ReceivesRefValue(const NodeDef& node) const;
 
+  // Returns true if the node is driven by a Switch node.
+  bool IsDrivenBySwitch(const NodeDef& node) const;
+
+  // Returns true if the node feeds a Merge node.
+  bool FeedsMerge(const NodeDef& node) const;
+
+  // Returns true if removal of this degree would increase edge count, i.e. if
+  // in-degree * out-degree > in-degree + out-degree or if the condition could
+  // not be verified.
+  bool RemovalIncreasesEdgeCount(const NodeDef& node) const;
+
  private:
   void RecordConnectivity(const NodeDef& node,
                           const std::unordered_set<string>& function_names);
   void ForwardInputsInternal(
       const NodeDef& original_node,
       const std::unordered_set<const NodeDef*>& nodes_to_delete,
-      NodeDef* new_node);
+      bool add_as_control, NodeDef* new_node);
 
   struct NodeInfo {
+    int out_degree = 0;
     const NodeDef* def;
 
     // These are filled in when the NodeInfo is built, but not that they
@@ -80,6 +92,8 @@ class GraphRewriter {
   std::unordered_set<const NodeDef*> function_neighbors_;
   std::unordered_set<const NodeDef*> cross_device_receivers_;
   std::unordered_set<const NodeDef*> ref_receivers_;
+  std::unordered_set<const NodeDef*> switch_receivers_;
+  std::unordered_set<const NodeDef*> merge_feeders_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index d5563e9d4c62967e7bde2e85d419f3f6725e2b35..a606f972ac9b9423dc722bbfff958e3a028a6c97 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <deque>
 #include <unordered_set>
 
 #include "tensorflow/core/framework/attr_value.pb.h"
@@ -35,17 +36,17 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-const char kConcatConst[] = "LayoutOptimizerConcatConst";
-const char kSplitConst[] = "LayoutOptimizerSplitConst";
-const char kPermNHWCToNCHW[] = "LayoutOptimizerPermConstNHWCToNCHW";
-const char kPermNCHWToNHWC[] = "LayoutOptimizerPermConstNCHWToNHWC";
-const char kGatherAxisConst[] = "LayoutOptimizerGatherAxisConst";
-const char kTransposeNHWCToNCHW[] = "LayoutOptimizerTransposeNHWCToNCHW";
-const char kTransposeNCHWToNHWC[] = "LayoutOptimizerTransposeNCHWToNHWC";
-const char kPermVecNHWCToNCHW[] = "LayoutOptimizerPermVecNHWCToNCHW";
-const char kReshapeNHWCToNCHW[] = "LayoutOptimizerReshapeNHWCToNCHW";
-const char kReshapeConst[] = "LayoutOptimizerReshapeConst";
-const char kReductionConst[] = "LayoutOptimizerReductionConst";
+const char kSuffix[] = "LayoutOptimizer";
+const char kPermNHWCToNCHW[] = "PermConstNHWCToNCHW";
+const char kPermNCHWToNHWC[] = "PermConstNCHWToNHWC";
+const char kTransposeNHWCToNCHW[] = "TransposeNHWCToNCHW";
+const char kTransposeNCHWToNHWC[] = "TransposeNCHWToNHWC";
+const char kDimMapNHWCToNCHW[] = "DimMapNHWCToNCHW";
+const char kDimMapNCHWToNHWC[] = "DimMapNCHWToNHWC";
+const char kVecPermuteNHWCToNCHW[] = "VecPermuteNHWCToNCHW";
+const char kVecPermuteNCHWToNHWC[] = "VecPermuteNCHWToNHWC";
+const char kReshapeNHWCToNCHW[] = "ReshapeNHWCToNCHW";
+const char kReshapeConst[] = "ReshapeConst";
 
 std::set<string> GetOpsFormatSupported() {
   std::set<string> ops_format_supported = {
@@ -60,54 +61,193 @@ std::set<string> GetOpsFormatSupported() {
       "DepthwiseConv2dNativeBackpropInput",
       "DepthwiseConv2dNativeBackpropFilter",
       "FusedBatchNorm",
+      "FusedBatchNormV2",
       "FusedBatchNormGrad",
+      "FusedBatchNormGradV2",
       "FusedConv2DBiasActivation",
       "MaxPool",
+      "MaxPoolV2",
       "MaxPoolGrad",
+      "MaxPoolGradGrad",
+      "MaxPoolGradV2",
+      "MaxPoolGradGradV2",
       "SpaceToDepth",
       "DepthToSpace"};
   return ops_format_supported;
 }
 
 std::set<string> GetOpsFormatAgnostic() {
-  std::set<string> ops_format_agnostic = {"Add",
+  std::set<string> ops_format_agnostic = {"Abs",
+                                          "Add",
                                           "AddN",
+                                          "AddV2",
+                                          "Acos",
+                                          "Acosh",
+                                          "All",
+                                          "Angle",
+                                          "Any",
+                                          "ApproximateEqual",
+                                          "Asin",
+                                          "Asinh",
+                                          "Atan",
+                                          "Atan2",
+                                          "Atanh",
+                                          "Betainc",
+                                          "Bitcast",
+                                          "Cast",
+                                          "Ceil",
+                                          "CheckNumerics",
+                                          "Complex",
+                                          "ComplexAbs",
                                           "Concat",
                                           "ConcatV2",
+                                          "Conj",
+                                          "Cos",
+                                          "Cosh",
+                                          "Digamma",
+                                          "Div",
+                                          "Elu",
+                                          "EluGrad",
+                                          "Enter",
+                                          "Equal",
+                                          "Erf",
+                                          "Erfc",
+                                          "Exit",
+                                          "Exp",
+                                          "Expm1",
+                                          "Fill",
                                           "Floor",
+                                          "FloorDiv",
+                                          "FloorMod",
+                                          "Greater",
+                                          "GreaterEqual",
+                                          "GuaranteeConst",
+                                          "HistogramSummary",
                                           "Identity",
+                                          "IdentityN",
+                                          "Igamma",
+                                          "Igammac",
+                                          "Imag",
+                                          "Inv",
+                                          "InvGrad",
+                                          "IsFinite",
+                                          "IsInf",
+                                          "IsNan",
+                                          "Less",
+                                          "LessEqual",
+                                          "Lgamma",
+                                          "Log",
+                                          "LogicalAnd",
+                                          "LogicalNot",
+                                          "LogicalOr",
+                                          "Log1p",
+                                          "Max",
+                                          "Maximum",
+                                          "Mean",
+                                          "Merge",
+                                          "Min",
+                                          "Minimum",
+                                          "Mod",
                                           "Mul",
                                           "Neg",
+                                          "NextIteration",
+                                          "NotEqual",
+                                          "OnesLike",
                                           "Pad",
+                                          "PreventGradient",
+                                          "Prod",
+                                          "Polygamma",
+                                          "Pow",
+                                          "Real",
                                           "RealDiv",
+                                          "Reciprocal",
+                                          "ReciprocalGrad",
                                           "Relu",
                                           "Relu6",
+                                          "Relu6Grad",
                                           "ReluGrad",
+                                          "Rint",
+                                          "Select",
+                                          "Selu",
+                                          "SeluGrad",
+                                          "Shape",
+                                          "ShapeN",
                                           "Sigmoid",
+                                          "SigmoidGrad",
+                                          "Sign",
+                                          "Sin",
+                                          "Sinh",
                                           "Slice",
+                                          "Snapshot",
+                                          "Softplus",
+                                          "SoftplusGrad",
                                           "Split",
+                                          "SplitV",
+                                          "StridedSlice",
+                                          "StridedSliceGrad",
+                                          "Switch",
+                                          "Tile",
+                                          "TruncateDiv",
+                                          "TruncateMod",
+                                          "ReverseV2",
+                                          "Round",
+                                          "Rsqrt",
+                                          "RsqrtGrad",
+                                          "Sqrt",
+                                          "SqrtGrad",
+                                          "Square",
                                           "SquaredDifference",
                                           "Squeeze",
-                                          "Sub"};
+                                          "StopGradient",
+                                          "Sub",
+                                          "Sum",
+                                          "Tan",
+                                          "Tanh",
+                                          "TanhGrad",
+                                          "ZerosLike",
+                                          "Zeta"};
   return ops_format_agnostic;
 }
 
-bool IsNodeNHWCToNCHW(const string& node_name) {
-  const string transpose_node_prefix = kTransposeNHWCToNCHW;
-  string prefix = node_name.substr(0, transpose_node_prefix.length());
-  if (prefix.compare(transpose_node_prefix) == 0) {
+bool EndWith(const string& str, const string& ending) {
+  if (str.size() < ending.size()) return false;
+  if (str.substr(str.size() - ending.size(), ending.size()) == ending)
     return true;
-  }
   return false;
 }
 
-bool IsNodeNCHWToNHWC(const string& node_name) {
-  const string transpose_node_prefix = kTransposeNCHWToNHWC;
-  string prefix = node_name.substr(0, transpose_node_prefix.length());
-  if (prefix.compare(transpose_node_prefix) == 0) {
-    return true;
-  }
-  return false;
+bool IsNodeByLayoutOptimizer(const string& node_name) {
+  const string suffix = kSuffix;
+  return EndWith(node_name, suffix);
+}
+
+bool IsNodeType(const string& node_name, const string& type) {
+  const string suffix = strings::StrCat(type, "-", kSuffix);
+  return EndWith(node_name, suffix);
+}
+
+bool IsTransposeNHWCToNCHW(const string& node_name) {
+  return IsNodeType(node_name, kTransposeNHWCToNCHW);
+}
+
+bool IsTransposeNCHWToNHWC(const string& node_name) {
+  return IsNodeType(node_name, kTransposeNCHWToNHWC);
+}
+
+bool IsDimMapNHWCToNCHW(const string& node_name) {
+  return IsNodeType(node_name, kDimMapNHWCToNCHW);
+}
+
+bool IsDimMapNCHWToNHWC(const string& node_name) {
+  return IsNodeType(node_name, kDimMapNCHWToNHWC);
+}
+
+bool IsVecPermuteNHWCToNCHW(const string& node_name) {
+  return IsNodeType(node_name, kVecPermuteNHWCToNCHW);
+}
+
+bool IsVecPermuteNCHWToNHWC(const string& node_name) {
+  return IsNodeType(node_name, kVecPermuteNCHWToNHWC);
 }
 
 bool IsConcat(const NodeDef& node) {
@@ -120,17 +260,121 @@ bool IsConcatV1(const NodeDef& node) {
   return op == "Concat";
 }
 
+bool IsMaxPoolV2(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "MaxPoolV2";
+}
+
 bool IsMaxPoolGradV1(const NodeDef& node) {
   const auto& op = node.op();
   return op == "MaxPoolGrad";
 }
 
+bool IsMaxPoolGradV2(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "MaxPoolGradV2";
+}
+
+bool IsMaxPoolGradGradV1(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "MaxPoolGradGrad";
+}
+
+bool IsMaxPoolGradGradV2(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "MaxPoolGradGradV2";
+}
+
+bool IsUnaryGrad(const NodeDef& node) {
+  bool is_unary_grad =
+      IsEluGrad(node) || IsInvGrad(node) || IsReciprocalGrad(node) ||
+      IsRelu6Grad(node) || IsReluGrad(node) || IsRsqrtGrad(node) ||
+      IsSeluGrad(node) || IsSigmoidGrad(node) || IsSoftplusGrad(node) ||
+      IsSoftsignGrad(node) || IsSqrtGrad(node) || IsTanhGrad(node);
+  return is_unary_grad;
+}
+
+bool IsComparisonOp(const NodeDef& node) {
+  bool is_compare = IsApproximateEqual(node) || IsEqual(node) ||
+                    IsGreater(node) || IsGreaterEqual(node) || IsLess(node) ||
+                    IsLessEqual(node) || IsNotEqual(node);
+  return is_compare;
+}
+
+bool IsLogicalOp(const NodeDef& node) {
+  return IsLogicalAnd(node) || IsLogicalNot(node) || IsLogicalOr(node);
+}
+
+bool IsReduceOp(const NodeDef& node) {
+  return IsSum(node) || IsMean(node) || IsProd(node) || IsMax(node) ||
+         IsMin(node) || IsAll(node) || IsAny(node);
+}
+
+bool IsBinaryOp(const NodeDef& node) {
+  bool is_binary =
+      IsAdd(node) || IsAtan2(node) || IsComparisonOp(node) || IsComplex(node) ||
+      IsDiv(node) || IsFloorDiv(node) || IsIgamma(node) || IsIgammac(node) ||
+      IsLogicalAnd(node) || IsLogicalOr(node) || IsMaximum(node) ||
+      IsMinimum(node) || IsMod(node) || IsMul(node) || IsPolygamma(node) ||
+      IsPow(node) || IsRealDiv(node) || IsSquaredDifference(node) ||
+      IsSub(node) || IsTruncateDiv(node) || IsTruncateMod(node) || IsZeta(node);
+  return is_binary;
+}
+
+std::vector<int> NonControlInputs(const NodeDef& node) {
+  std::vector<int> pos;
+  for (int i = 0; i < node.input_size(); i++) {
+    if (!IsControlInput(node.input(i))) {
+      pos.push_back(i);
+    }
+  }
+  return pos;
+}
+
+std::vector<int> DataInputPosConcat(const NodeDef& node) {
+  int n = node.attr().at("N").i();
+  std::vector<int> input_pos;
+  int start = (IsConcatV1(node)) ? 1 : 0;
+  int end = start + n;
+  for (int i = start; i < end; i++) {
+    input_pos.push_back(i);
+  }
+  return input_pos;
+}
+
+std::vector<int> DataInputPos(const NodeDef& node) {
+  if (IsSplit(node) || IsHistogramSummary(node)) {
+    return {1};
+  }
+  if (IsStridedSliceGrad(node)) {
+    return {4};
+  }
+  if (IsBinaryOp(node) || IsUnaryGrad(node)) {
+    return {0, 1};
+  }
+  if (IsBetainc(node) || IsSelect(node)) {
+    return {0, 1, 2};
+  }
+  if (IsShapeN(node) || IsIdentityN(node) || IsAddN(node)) {
+    return NonControlInputs(node);
+  }
+  if (IsConcat(node)) {
+    return DataInputPosConcat(node);
+  }
+  if (node.input_size() > 0 && !IsControlInput(node.input(0))) {
+    return {0};
+  }
+  return {};
+}
+
 class GraphProcessor {
  public:
-  GraphProcessor(const VirtualPlacer& virtual_placer,
+  GraphProcessor(const GraphProperties& graph_properties,
+                 const VirtualPlacer& virtual_placer,
                  const std::unordered_set<string>& nodes_to_preserve,
                  GraphDef* graph, NodeMap* node_map)
-      : virtual_placer_(virtual_placer),
+      : graph_properties_(graph_properties),
+        virtual_placer_(virtual_placer),
         nodes_to_preserve_(nodes_to_preserve),
         graph_(graph),
         node_map_(node_map) {}
@@ -186,33 +430,11 @@ class GraphProcessor {
     return node;
   }
 
-  NodeDef* AddNodeReductionConst(const string& name, const string& device) {
-    NodeDef* node = graph_->add_node();
-    node_map_->AddNode(name, node);
-    node->set_name(name);
-    node->set_op("Const");
-    AttrValue attr_data_type;
-    attr_data_type.set_type(DT_INT32);
-    node->mutable_attr()->insert({"dtype", attr_data_type});
-
-    AttrValue attr_tensor;
-    Tensor tensor(DT_INT32, TensorShape({3}));
-    std::vector<int> axis = {0, 2, 3};
-    for (int i = 0; static_cast<size_t>(i) < axis.size(); i++) {
-      tensor.flat<int>()(i) = axis[i];
-    }
-    tensor.AsProtoTensorContent(attr_tensor.mutable_tensor());
-    node->mutable_attr()->insert({"value", attr_tensor});
-    string device_name;
-    if (device.empty()) {
-      device_name = virtual_placer_.get_canonical_device_name(*node);
-    } else {
-      device_name = device;
-    }
-    node->set_device(device_name);
-    return node;
+  string LayoutOptimizerNode(const string& base_name) {
+    return strings::StrCat(base_name, "-", kSuffix);
   }
 
+  const GraphProperties& graph_properties_;
   const VirtualPlacer& virtual_placer_;
   const std::unordered_set<string>& nodes_to_preserve_;
   GraphDef* graph_;
@@ -221,18 +443,21 @@ class GraphProcessor {
 
 struct OptimizeContext {
   OptimizeContext(GraphDef* graph, NodeDef* node, NodeMap* node_map,
+                  const GraphProperties& graph_properties,
                   const VirtualPlacer& virtual_placer,
                   const std::unordered_set<string>& nodes_to_preserve,
                   bool is_in_frame)
       : graph(graph),
         node(node),
         node_map(node_map),
+        graph_properties(graph_properties),
         virtual_placer(virtual_placer),
         nodes_to_preserve(nodes_to_preserve),
         is_in_frame(is_in_frame) {}
   GraphDef* graph;
   NodeDef* node;
   NodeMap* node_map;
+  const GraphProperties& graph_properties;
   const VirtualPlacer& virtual_placer;
   const std::unordered_set<string>& nodes_to_preserve;
   bool is_in_frame;
@@ -241,8 +466,9 @@ struct OptimizeContext {
 class NodeProcessor : public GraphProcessor {
  public:
   explicit NodeProcessor(const OptimizeContext& opt_cxt)
-      : GraphProcessor(opt_cxt.virtual_placer, opt_cxt.nodes_to_preserve,
-                       opt_cxt.graph, opt_cxt.node_map),
+      : GraphProcessor(opt_cxt.graph_properties, opt_cxt.virtual_placer,
+                       opt_cxt.nodes_to_preserve, opt_cxt.graph,
+                       opt_cxt.node_map),
         node_(opt_cxt.node),
         is_in_frame_(opt_cxt.is_in_frame) {}
   virtual ~NodeProcessor() {}
@@ -260,17 +486,34 @@ class NodeProcessor : public GraphProcessor {
   }
 
  protected:
-  bool IsDimsN(const NodeDef& node, int n) const {
+  bool IsPortDimsN(const NodeDef& node, int port, int n) const {
     if (node.attr().find("_output_shapes") != node.attr().end()) {
-      auto shape = node.attr().at("_output_shapes").list().shape(0);
-      if (shape.dim_size() == n) {
-        return true;
+      if (node.attr().at("_output_shapes").list().shape_size() > port) {
+        auto shape = node.attr().at("_output_shapes").list().shape(port);
+        if (shape.unknown_rank()) {
+          return false;
+        }
+        if (shape.dim_size() == n) {
+          return true;
+        }
       }
     }
     return false;
   }
 
-  bool IsDimsFour(const NodeDef& node) const { return IsDimsN(node, 4); }
+  bool IsPortZeroDimsN(const NodeDef& node, int n) const {
+    return IsPortDimsN(node, 0, n);
+  }
+
+  bool IsPortZeroDimsFour(const NodeDef& node) const {
+    return NodeProcessor::IsPortZeroDimsN(node, 4) ||
+           IsTransposeNCHWToNHWC(node.name());
+  }
+
+  bool IsPortDimsFour(const NodeDef& node, int port) const {
+    return NodeProcessor::IsPortDimsN(node, port, 4) ||
+           IsTransposeNCHWToNHWC(node.name());
+  }
 
   bool IsNHWC() const {
     if (node_->attr().find("data_format") != node_->attr().end()) {
@@ -298,12 +541,7 @@ class NodeProcessor : public GraphProcessor {
     return nodes_to_preserve_.find(node_->name()) != nodes_to_preserve_.end();
   }
 
-  virtual bool ShouldProcess() const {
-    return !MustPreserve() && IsNHWC() && IsDimsFour(*node_) && HasOutputs() &&
-           IsOnGPU();
-  }
-
-  virtual bool IsOnGPU() const {
+  bool IsOnGPU() const {
     string device_name;
     if (node_->device().empty()) {
       device_name = virtual_placer_.get_canonical_device_name(*node_);
@@ -320,33 +558,154 @@ class NodeProcessor : public GraphProcessor {
     return false;
   }
 
-  void UpdateAttrDataFormat() {
-    if (node_->attr().find("data_format") != node_->attr().end()) {
-      if (node_->attr().at("data_format").s().compare("NHWC") == 0) {
-        string* data_format =
-            node_->mutable_attr()->at("data_format").mutable_s();
-        *data_format = "NCHW";
-      }
-    }
+  virtual bool ShouldProcess() const {
+    return !MustPreserve() && IsNHWC() && IsPortZeroDimsFour(*node_) &&
+           HasOutputs() && IsOnGPU();
   }
 
   virtual void UpdateAttrShape() {
     if (node_->attr().find("_output_shapes") != node_->attr().end()) {
-      auto shape = node_->mutable_attr()
-                       ->at("_output_shapes")
-                       .mutable_list()
-                       ->mutable_shape(0);
-      if (shape->dim_size() == 4) {
-        int64 h = shape->dim(1).size();
-        int64 w = shape->dim(2).size();
-        int64 c = shape->dim(3).size();
-        shape->mutable_dim(1)->set_size(c);
-        shape->mutable_dim(2)->set_size(h);
-        shape->mutable_dim(3)->set_size(w);
+      for (const auto& pos : GetOutputPos()) {
+        auto shape = node_->mutable_attr()
+                         ->at("_output_shapes")
+                         .mutable_list()
+                         ->mutable_shape(pos);
+        if (shape->dim_size() == 4) {
+          int64 h = shape->dim(1).size();
+          int64 w = shape->dim(2).size();
+          int64 c = shape->dim(3).size();
+          shape->mutable_dim(1)->set_size(c);
+          shape->mutable_dim(2)->set_size(h);
+          shape->mutable_dim(3)->set_size(w);
+        }
+      }
+    }
+  }
+
+  Status UpdateAttrValueOfInput(int input_index, bool permute) {
+    auto input_node = node_map_->GetNode(node_->input(input_index));
+    // We created a copy of the node, so that we don't modify the original node,
+    // which might be used elsewhere. Note that this copy also copies the
+    // control dependency input in the case this node is inside a loop,
+    // to ensure added_node is in the same frame with node_.
+    NodeDef* added_node = graph_->add_node();
+    *added_node = *input_node;
+    string base_name = strings::StrCat(node_->name(), "-", input_index);
+    string node_name = LayoutOptimizerNode(base_name);
+    added_node->set_name(node_name);
+    *node_->mutable_input(input_index) = node_name;
+    node_map_->AddNode(node_name, added_node);
+    node_map_->AddOutput(node_name, node_->name());
+    return UpdateAttrValue(added_node, permute);
+  }
+
+  virtual std::vector<int> GetInputPos() const { return {0}; }
+
+  virtual std::set<int> GetOutputPos() const {
+    // For most nodes, no need to process control nodes or nodes that use an
+    // output other than the first output: only the first output is of
+    // 4D NCHW/NHWC format and thus relevant here.
+    std::set<int> output_pos = {0};
+    return output_pos;
+  }
+
+  virtual Status AddLayoutTransposeToInputs() {
+    std::vector<int> input_pos = GetInputPos();
+    for (const auto& pos : input_pos) {
+      string node_name = LayoutOptimizerNode(
+          strings::StrCat(node_->name(), "-", pos, "-", kTransposeNHWCToNCHW));
+      DataType dtype =
+          graph_properties_.GetInputProperties(node_->name())[pos].dtype();
+      auto input_node = node_map_->GetNode(node_->input(pos));
+      TF_RETURN_IF_ERROR(HasAttribute(*input_node, "_output_shapes"));
+      string const_name = GetOrAddNodePermNHWCToNCHW(pos);
+      int output_pos;
+      ParseNodeName(node_->input(pos), &output_pos);
+      AddNodeTranspose(
+          node_name, node_->input(pos), const_name, dtype,
+          input_node->attr().at("_output_shapes").list().shape(output_pos),
+          true);
+      node_map_->UpdateOutput(NodeName(node_->input(pos)), node_->name(),
+                              node_name);
+      node_map_->AddOutput(node_name, node_->name());
+      *node_->mutable_input(pos) = node_name;
+    }
+    return Status::OK();
+  }
+
+  Status AddTransformToOutputs(const string& op) {
+    auto outputs = node_map_->GetOutputs(node_->name());
+    string const_name = GetOrAddNodePermNCHWToNHWC();
+    int output_count = 0;
+    for (const auto& output : outputs) {
+      int connections = 0;
+      int connections_removed = 0;
+      for (int i = 0; i < output->input_size(); i++) {
+        auto& input = *output->mutable_input(i);
+        int input_port;
+        string input_name = ParseNodeName(input, &input_port);
+        auto output_pos = GetOutputPos();
+        if (input_name == node_->name()) {
+          connections++;
+          if (output_pos.find(input_port) != output_pos.end()) {
+            connections_removed++;
+            string added_node_base_name =
+                strings::StrCat(node_->name(), "-", output_count, "-", i);
+            string added_node_name;
+            DataType dtype =
+                graph_properties_.GetOutputProperties(node_->name())[input_port]
+                    .dtype();
+            if (op == "Transpose") {
+              added_node_name = LayoutOptimizerNode(strings::StrCat(
+                  added_node_base_name, "-", kTransposeNCHWToNHWC));
+              TF_RETURN_IF_ERROR(HasAttribute(*node_, "_output_shapes"));
+              AddNodeTranspose(
+                  added_node_name, input, const_name, dtype,
+                  node_->attr().at("_output_shapes").list().shape(input_port),
+                  false);
+            } else if (op == "DataFormatVecPermute") {
+              added_node_name = LayoutOptimizerNode(strings::StrCat(
+                  added_node_base_name, "-", kVecPermuteNCHWToNHWC));
+              AddNodeDataFormatOp(added_node_name, input, op, dtype, false);
+            } else {
+              return errors::InvalidArgument("Unsupported op type: ", op);
+            }
+            input = added_node_name;
+            node_map_->AddOutput(node_->name(), added_node_name);
+            node_map_->AddOutput(added_node_name, output->name());
+          }
+        }
+      }
+      if (connections == connections_removed) {
+        node_map_->RemoveOutput(node_->name(), output->name());
       }
+      output_count++;
+    }
+    return Status::OK();
+  }
+
+  virtual Status AddLayoutTransposeToOutputs() {
+    return AddTransformToOutputs("Transpose");
+  }
+
+  virtual Status CustomizedProcessing() { return Status::OK(); }
+
+  Status UpdateOrTransformParamInput(int param_index, const string& op,
+                                     DataType dtype) {
+    auto param_node = node_map_->GetNode(node_->input(param_index));
+    bool permute = (op == "DataFormatVecPermute") ? true : false;
+    if (IsConstant(*param_node)) {
+      TF_RETURN_IF_ERROR(UpdateAttrValueOfInput(param_index, permute));
+    } else {
+      AddDataFormatTranformToParamInput(op, param_index, dtype);
     }
+    return Status::OK();
   }
 
+  NodeDef* node_;
+  bool is_in_frame_;
+
+ private:
   void UpdateAttrKSize() {
     if (node_->attr().find("ksize") != node_->attr().end()) {
       auto list = node_->mutable_attr()->at("ksize").mutable_list();
@@ -361,7 +720,17 @@ class NodeProcessor : public GraphProcessor {
     }
   }
 
-  Status UpdateAttrValue(NodeDef* node) {
+  void UpdateAttrDataFormat() {
+    if (node_->attr().find("data_format") != node_->attr().end()) {
+      if (node_->attr().at("data_format").s().compare("NHWC") == 0) {
+        string* data_format =
+            node_->mutable_attr()->at("data_format").mutable_s();
+        *data_format = "NCHW";
+      }
+    }
+  }
+
+  Status UpdateAttrValue(NodeDef* node, bool permute) {
     TF_RETURN_IF_ERROR(HasAttribute(*node, "value"));
     Tensor tensor;
     auto success =
@@ -369,46 +738,51 @@ class NodeProcessor : public GraphProcessor {
     if (!success) {
       LOG(ERROR) << "Failed to parse TensorProto.";
     }
-    if (tensor.dims() == 1) {
-      int c = tensor.flat<int>()(3);
-      tensor.flat<int>()(3) = tensor.flat<int>()(2);
-      tensor.flat<int>()(2) = tensor.flat<int>()(1);
-      tensor.flat<int>()(1) = c;
-    } else if (tensor.dims() == 2) {
-      for (int i = 0; i < 2; i++) {
-        int c = tensor.matrix<int>()(3, i);
-        tensor.matrix<int>()(3, i) = tensor.matrix<int>()(2, i);
-        tensor.matrix<int>()(2, i) = tensor.matrix<int>()(1, i);
-        tensor.matrix<int>()(1, i) = c;
+
+    if (permute) {
+      if (tensor.dims() == 1) {
+        if (tensor.flat<int>().size() == 4) {
+          int c = tensor.flat<int>()(3);
+          tensor.flat<int>()(3) = tensor.flat<int>()(2);
+          tensor.flat<int>()(2) = tensor.flat<int>()(1);
+          tensor.flat<int>()(1) = c;
+        } else {
+          return Status(error::INVALID_ARGUMENT,
+                        strings::StrCat("Unsupported tensor size: ",
+                                        tensor.flat<int>().size()));
+        }
+      } else if (tensor.dims() == 2) {
+        for (int i = 0; i < 2; i++) {
+          int c = tensor.matrix<int>()(3, i);
+          tensor.matrix<int>()(3, i) = tensor.matrix<int>()(2, i);
+          tensor.matrix<int>()(2, i) = tensor.matrix<int>()(1, i);
+          tensor.matrix<int>()(1, i) = c;
+        }
+      } else {
+        return Status(
+            error::INVALID_ARGUMENT,
+            strings::StrCat("Unsupported dimension size: ", tensor.dims()));
       }
     } else {
-      return Status(
-          error::INVALID_ARGUMENT,
-          strings::StrCat("Unsupported dimension size: ", tensor.dims()));
+      for (int i = 0; i < tensor.flat<int>().size(); i++) {
+        int value = tensor.flat<int>()(i);
+        value = (value >= 0) ? value : value + 4;
+        if (value == 1 || value == 2) {
+          value = value + 1;
+        } else if (value == 3) {
+          value = 1;
+        }
+        tensor.flat<int>()(i) = value;
+      }
     }
-    tensor.AsProtoTensorContent(
-        node->mutable_attr()->at({"value"}).mutable_tensor());
-    return Status::OK();
-  }
 
-  Status UpdateAttrValueOfInput(int input_index) {
-    auto input_node = node_map_->GetNode(node_->input(input_index));
-    // We created a copy of the node, so that we don't modify the original node,
-    // which might be used elsewhere.
-    NodeDef* added_node = graph_->add_node();
-    *added_node = *input_node;
-    string base_name = strings::StrCat(node_->name(), "-", input_node->name());
-    string node_name = AddPrefixToNodeName(base_name, "LayoutOptimizer", "-");
-    added_node->set_name(node_name);
-    *node_->mutable_input(input_index) = node_name;
-    node_map_->AddNode(node_name, added_node);
-    node_map_->AddOutput(node_name, node_->name());
-    return UpdateAttrValue(added_node);
-  }
-
-  virtual std::vector<int> GetInputPos() const {
-    std::vector<int> input_pos = {0};
-    return input_pos;
+    if (tensor.dims() == 0) {
+      tensor.AsProtoField(node->mutable_attr()->at({"value"}).mutable_tensor());
+    } else {
+      tensor.AsProtoTensorContent(
+          node->mutable_attr()->at({"value"}).mutable_tensor());
+    }
+    return Status::OK();
   }
 
   NodeDef* AddNodeTranspose(const string& node_name, const string& input_name,
@@ -447,106 +821,47 @@ class NodeProcessor : public GraphProcessor {
     return node;
   }
 
-  virtual Status AddLayoutTransposeToInputs() {
-    std::vector<int> input_pos = GetInputPos();
-    for (const auto& pos : input_pos) {
-      int output_pos;
-      string input_node_name = ParseNodeName(node_->input(pos), &output_pos);
-      string base_name =
-          strings::StrCat(node_->name(), "-", input_node_name, "-", output_pos);
-      string node_name =
-          AddPrefixToNodeName(base_name, kTransposeNHWCToNCHW, "-");
-      auto input_node = node_map_->GetNode(node_->input(pos));
-      TF_RETURN_IF_ERROR(HasAttribute(*node_, "T"));
-      TF_RETURN_IF_ERROR(HasAttribute(*input_node, "_output_shapes"));
-      string const_name = GetOrAddNodePermNHWCToNCHW(pos);
-      AddNodeTranspose(
-          node_name, node_->input(pos), const_name,
-          node_->attr().at("T").type(),
-          input_node->attr().at("_output_shapes").list().shape(output_pos),
-          true);
-      node_map_->UpdateOutput(node_->input(pos), node_->name(), node_name);
-      node_map_->AddOutput(node_name, node_->name());
-      *node_->mutable_input(pos) = node_name;
-    }
-    return Status::OK();
-  }
-
-  virtual Status AddLayoutTransposeToOutputs() {
-    auto outputs = node_map_->GetOutputs(node_->name());
-    string const_name = GetOrAddNodePermNCHWToNHWC();
-    for (const auto& output : outputs) {
-      string base_name = strings::StrCat(node_->name(), "-", output->name());
-      string node_name =
-          AddPrefixToNodeName(base_name, kTransposeNCHWToNHWC, "-");
-      // TODO(yaozhang): handle the rare case where node A is connected to more
-      // than one input of node B.
-      auto it = std::find_if(output->mutable_input()->begin(),
-                             output->mutable_input()->end(),
-                             [this](const string& input) {
-                               string node_name = NodeName(input);
-                               return node_name.compare(node_->name()) == 0;
-                             });
-      if (it == output->mutable_input()->end()) {
-        return Status(error::INVALID_ARGUMENT,
-                      strings::StrCat("Expect ", node_->name(),
-                                      " to be an input of ", output->name()));
-      }
-      int output_pos = NodePosition(*it);
-      // No need to process control nodes or nodes that use an output
-      // other than the first output: only the first output is of 4D NCHW/NHWC
-      // format and thus relevant here.
-      if (output_pos != 0) {
-        continue;
-      }
-      TF_RETURN_IF_ERROR(HasAttribute(*node_, "T"));
-      TF_RETURN_IF_ERROR(HasAttribute(*node_, "_output_shapes"));
-      AddNodeTranspose(
-          node_name, node_->name(), const_name, node_->attr().at("T").type(),
-          node_->attr().at("_output_shapes").list().shape(0), false);
-      *it = node_name;
-      node_map_->UpdateOutput(node_->name(), output->name(), node_name);
-      node_map_->AddOutput(node_name, output->name());
-    }
-    return Status::OK();
-  }
-
-  virtual Status CustomizedProcessing() { return Status::OK(); }
-
-  NodeDef* AddNodePermNHWCToNCHW(const string& suffix,
+  NodeDef* AddNodePermNHWCToNCHW(const string& base_name,
                                  const string& depended_node,
                                  const string& device) {
-    auto const_node = AddNodePermConst(
-        strings::StrCat(kPermNHWCToNCHW, "-", suffix), device, {0, 3, 1, 2});
+    string name =
+        LayoutOptimizerNode(strings::StrCat(base_name, "-", kPermNHWCToNCHW));
+    auto const_node = AddNodePermConst(name, device, {0, 3, 1, 2});
     // This is to ensure the transpose node and the const node are in the
     // same frame.
     *const_node->add_input() = AsControlDependency(depended_node);
     return const_node;
   }
 
-  NodeDef* AddNodePermNCHWToNHWC(const string& suffix,
+  NodeDef* AddNodePermNCHWToNHWC(const string& base_name,
                                  const string& depended_node,
                                  const string& device) {
     auto const_node = AddNodePermConst(
-        strings::StrCat(kPermNCHWToNHWC, "-", suffix), device, {0, 2, 3, 1});
+        LayoutOptimizerNode(strings::StrCat(base_name, "-", kPermNCHWToNHWC)),
+        device, {0, 2, 3, 1});
     // This is to ensure the transpose node and the const node are in the same
     // frame.
     *const_node->add_input() = AsControlDependency(depended_node);
     return const_node;
   }
 
-  NodeDef* node_;
-  bool is_in_frame_;
-
- private:
   string GetOrAddNodePermNHWCToNCHW(int pos) {
     string const_name;
     if (is_in_frame_) {
-      auto const_node = AddNodePermNHWCToNCHW(
-          node_->input(pos), NodeName(node_->input(pos)), node_->device());
+      string base_name = strings::StrCat(node_->name(), "-", pos);
+      string input = NodeName(node_->input(pos));
+      string depended_node;
+      if (!IsTransposeNCHWToNHWC(input)) {
+        depended_node = input;
+      } else {
+        auto input_node = node_map_->GetNode(input);
+        depended_node = NodeName(input_node->input(0));
+      }
+      auto const_node =
+          AddNodePermNHWCToNCHW(base_name, depended_node, node_->device());
       const_name = const_node->name();
     } else {
-      const_name = kPermNHWCToNCHW;
+      const_name = LayoutOptimizerNode(kPermNHWCToNCHW);
     }
     return const_name;
   }
@@ -558,7 +873,7 @@ class NodeProcessor : public GraphProcessor {
           AddNodePermNCHWToNHWC(node_->name(), node_->name(), node_->device());
       const_name = const_node->name();
     } else {
-      const_name = kPermNCHWToNHWC;
+      const_name = LayoutOptimizerNode(kPermNCHWToNHWC);
     }
     return const_name;
   }
@@ -571,6 +886,42 @@ class NodeProcessor : public GraphProcessor {
     list->set_i(2, h);
     list->set_i(3, w);
   }
+
+  NodeDef* AddNodeDataFormatOp(const string& name, const string& input_name,
+                               const string& op, DataType dtype,
+                               bool nhwc_to_nchw) {
+    NodeDef* added_node = graph_->add_node();
+    added_node->set_name(name);
+    added_node->set_op(op);
+    node_map_->AddNode(added_node->name(), added_node);
+    added_node->set_device(node_->device());
+    AttrValue attr_data_type;
+    attr_data_type.set_type(dtype);
+    added_node->mutable_attr()->insert({"T", attr_data_type});
+    string src_format = (nhwc_to_nchw) ? "NHWC" : "NCHW";
+    string dst_format = (nhwc_to_nchw) ? "NCHW" : "NHWC";
+    AttrValue attr_format;
+    attr_format.set_s(src_format);
+    added_node->mutable_attr()->insert({"src_format", attr_format});
+    attr_format.set_s(dst_format);
+    added_node->mutable_attr()->insert({"dst_format", attr_format});
+    *added_node->add_input() = input_name;
+    return added_node;
+  }
+
+  void AddDataFormatTranformToParamInput(const string& op, int input_pos,
+                                         DataType dtype) {
+    string suffix = (op == "DataFormatVecPermute") ? kVecPermuteNHWCToNCHW
+                                                   : kDimMapNHWCToNCHW;
+    string name = LayoutOptimizerNode(
+        strings::StrCat(node_->name(), "-", input_pos, "-", suffix));
+    auto added_node =
+        AddNodeDataFormatOp(name, node_->input(input_pos), op, dtype, true);
+    *node_->mutable_input(input_pos) = added_node->name();
+    node_map_->UpdateOutput(NodeName(added_node->input(0)), node_->name(),
+                            added_node->name());
+    node_map_->AddOutput(added_node->name(), node_->name());
+  }
 };
 
 class AvgPoolGradProcessor : public NodeProcessor {
@@ -579,11 +930,10 @@ class AvgPoolGradProcessor : public NodeProcessor {
       : NodeProcessor(opt_cxt) {}
 
  protected:
-  std::vector<int> GetInputPos() const override {
-    std::vector<int> input_pos = {1};
-    return input_pos;
+  std::vector<int> GetInputPos() const override { return {1}; }
+  Status CustomizedProcessing() override {
+    return UpdateOrTransformParamInput(0, "DataFormatVecPermute", DT_INT32);
   }
-  Status CustomizedProcessing() override { return UpdateAttrValueOfInput(0); }
 };
 
 class BiasAddGradProcessor : public NodeProcessor {
@@ -601,7 +951,9 @@ class BiasAddGradProcessor : public NodeProcessor {
     }
     auto input = node_map_->GetNode(node_->input(0));
     if (input) {
-      if ((IsNHWC() && IsDimsFour(*input)) || IsNodeNCHWToNHWC(input->name())) {
+      int port;
+      ParseNodeName(node_->input(0), &port);
+      if (IsNHWC() && IsPortDimsFour(*input, port)) {
         return true;
       }
     }
@@ -618,8 +970,8 @@ class Conv2DProcessor : public NodeProcessor {
 
  protected:
   bool ShouldProcess() const override {
-    return !MustPreserve() && IsNHWC() && IsDimsFour(*node_) && HasOutputs() &&
-           (!IsGemmUsed() || no_gemm_) && IsOnGPU();
+    return !MustPreserve() && IsNHWC() && IsPortZeroDimsFour(*node_) &&
+           HasOutputs() && (!IsGemmUsed() || no_gemm_) && IsOnGPU();
   }
 
   TensorShapeProto GetShape(const string& input_name) const {
@@ -692,10 +1044,7 @@ class Conv2DBackpropFilterProcessor : public Conv2DProcessor {
     return Conv2DProcessor::IsGemmUsed(filter_shape, input_shape);
   }
 
-  std::vector<int> GetInputPos() const override {
-    std::vector<int> input_pos = {0, 2};
-    return input_pos;
-  }
+  std::vector<int> GetInputPos() const override { return {0, 2}; }
 
   Status AddLayoutTransposeToOutputs() override { return Status::OK(); }
   // No need to update output shape, as it is always of shape
@@ -716,12 +1065,11 @@ class Conv2DBackpropInputProcessor : public Conv2DProcessor {
     return Conv2DProcessor::IsGemmUsed(filter_shape, input_shape);
   }
 
-  std::vector<int> GetInputPos() const override {
-    std::vector<int> input_pos = {2};
-    return input_pos;
-  }
+  std::vector<int> GetInputPos() const override { return {2}; }
 
-  Status CustomizedProcessing() override { return UpdateAttrValueOfInput(0); }
+  Status CustomizedProcessing() override {
+    return UpdateOrTransformParamInput(0, "DataFormatVecPermute", DT_INT32);
+  }
 };
 
 class FusedBatchNormGradProcessor : public NodeProcessor {
@@ -734,10 +1082,7 @@ class FusedBatchNormGradProcessor : public NodeProcessor {
     return NodeProcessor::ShouldProcess() && IsTraining();
   }
 
-  std::vector<int> GetInputPos() const override {
-    std::vector<int> input_pos = {0, 1};
-    return input_pos;
-  }
+  std::vector<int> GetInputPos() const override { return {0, 1}; }
 
  private:
   bool IsTraining() const {
@@ -756,10 +1101,48 @@ class MaxPoolGradProcessor : public NodeProcessor {
       : NodeProcessor(opt_cxt) {}
 
  protected:
-  std::vector<int> GetInputPos() const override {
-    std::vector<int> input_pos = {0, 1, 2};
-    return input_pos;
-  }
+  std::vector<int> GetInputPos() const override { return {0, 1, 2}; }
+};
+
+class MaxPoolGradV2Processor : public MaxPoolGradProcessor {
+ public:
+  explicit MaxPoolGradV2Processor(const OptimizeContext& opt_cxt)
+      : MaxPoolGradProcessor(opt_cxt) {}
+
+ protected:
+  Status CustomizedProcessing() override {
+    for (int i = 3; i <= 4; i++) {
+      TF_RETURN_IF_ERROR(
+          UpdateOrTransformParamInput(i, "DataFormatVecPermute", DT_INT32));
+    }
+    return Status::OK();
+  }
+};
+
+class MaxPoolV2Processor : public NodeProcessor {
+ public:
+  explicit MaxPoolV2Processor(const OptimizeContext& opt_cxt)
+      : NodeProcessor(opt_cxt) {}
+
+ protected:
+  bool ShouldProcess() const override {
+    // We check data_input's shape instead, because the shape inference of
+    // MaxPoolV2 is not able to infer the shape when ksize or strides is not
+    // constant.
+    auto data_input = node_map_->GetNode(node_->input(0));
+    int port;
+    ParseNodeName(node_->input(0), &port);
+    return !MustPreserve() && IsNHWC() && IsPortDimsFour(*data_input, port) &&
+           HasOutputs() && IsOnGPU();
+  }
+
+  Status CustomizedProcessing() override {
+    for (int i = 1; i <= 2; i++) {
+      TF_RETURN_IF_ERROR(
+          UpdateOrTransformParamInput(i, "DataFormatVecPermute", DT_INT32));
+    }
+    return Status::OK();
+  }
 };
 
 class AgnosticNodeProcessor : public NodeProcessor {
@@ -769,30 +1152,43 @@ class AgnosticNodeProcessor : public NodeProcessor {
 
  protected:
   bool ShouldProcess() const override {
-    return !MustPreserve() && IsDimsFour(*node_) && HasOutputs() &&
+    return !MustPreserve() && IsPortZeroDimsFour(*node_) && HasOutputs() &&
            IsNodeAfterNCHWToNHWC() && IsOnGPU();
   }
 
-  bool IsNodeAfterNCHWToNHWC() const {
+  bool IsNodeAfterNCHWToNHWC(const NodeDef& node) const {
     std::set<string> ops_format_agnostic = GetOpsFormatAgnostic();
-    auto node = node_map_->GetNode(node_->name());
-    while (node->input_size() > 0) {
-      int data_input_pos = 0;
-      if (IsConcatV1(*node) || IsSplit(*node)) {
-        data_input_pos = 1;
-      }
-      node = node_map_->GetNode(node->input(data_input_pos));
-      if (IsNodeNCHWToNHWC(node->name())) {
+    std::deque<NodeDef*> queue;
+    auto data_node_pos = DataInputPos(node);
+    for (const auto& pos : data_node_pos) {
+      auto input_node = node_map_->GetNode(node.input(pos));
+      queue.push_back(input_node);
+    }
+    // The code will exit this while loop in one iteration in most cases, as the
+    // graph is already topologically sorted.
+    while (!queue.empty()) {
+      NodeDef* current_node = queue.front();
+      queue.pop_front();
+      if (IsTransposeNCHWToNHWC(current_node->name()) ||
+          IsDimMapNCHWToNHWC(current_node->name()) ||
+          IsVecPermuteNCHWToNHWC(current_node->name())) {
         return true;
       }
-      bool connected =
-          ops_format_agnostic.find(node->op()) != ops_format_agnostic.end();
-      if (!connected) {
-        return false;
+      // We only continue searching if the path is connected through
+      // format-agnostic nodes.
+      if (ops_format_agnostic.find(current_node->op()) !=
+          ops_format_agnostic.end()) {
+        auto current_node_pos = DataInputPos(*current_node);
+        for (const auto& pos : current_node_pos) {
+          auto input_node = node_map_->GetNode(current_node->input(pos));
+          queue.push_back(input_node);
+        }
       }
     }
     return false;
   }
+
+  bool IsNodeAfterNCHWToNHWC() const { return IsNodeAfterNCHWToNHWC(*node_); }
 };
 
 class AddNProcessor : public AgnosticNodeProcessor {
@@ -802,56 +1198,62 @@ class AddNProcessor : public AgnosticNodeProcessor {
 
  protected:
   std::vector<int> GetInputPos() const override {
-    std::vector<int> input_pos;
-    input_pos.reserve(node_->input_size());
-    for (int i = 0; i < node_->input_size(); i++) {
-      input_pos.push_back(i);
-    }
-    return input_pos;
+    return NonControlInputs(*node_);
   }
 };
 
 class BinaryOpProcessor : public AgnosticNodeProcessor {
  public:
   explicit BinaryOpProcessor(const OptimizeContext& opt_cxt)
-      : AgnosticNodeProcessor(opt_cxt) {
-    is_4d_with_vector_ = Is4DOperateWithVector();
-  }
+      : AgnosticNodeProcessor(opt_cxt) {}
 
  protected:
   bool ShouldProcess() const override {
-    return !MustPreserve() && IsDimsFour(*node_) && HasOutputs() &&
+    return !MustPreserve() && IsPortZeroDimsFour(*node_) && HasOutputs() &&
            IsNodeAfterNCHWToNHWC() &&
-           (Is4DOperateWithND(4) || Is4DOperateWithScalar() ||
-            Is4DOperateWithVector()) &&
+           (IsNDOperateWithMD(4, 0) || IsNDOperateWithMD(4, 1) ||
+            IsNDOperateWithMD(4, 4) || IsNDOperateWithMD(0, 4) ||
+            IsNDOperateWithMD(1, 4)) &&
            IsOnGPU();
   }
 
   std::vector<int> GetInputPos() const override {
-    std::vector<int> input_pos = {0};
-    if (Is4DOperateWithND(4)) {
+    std::vector<int> input_pos;
+    auto input0 = node_map_->GetNode(node_->input(0));
+    auto input1 = node_map_->GetNode(node_->input(1));
+    int input0_port;
+    ParseNodeName(node_->input(0), &input0_port);
+    int input1_port;
+    ParseNodeName(node_->input(1), &input1_port);
+    if (IsPortDimsFour(*input0, input0_port)) {
+      input_pos.push_back(0);
+    }
+    if (IsPortDimsFour(*input1, input1_port)) {
       input_pos.push_back(1);
     }
     return input_pos;
   }
 
-  bool Is4DOperateWithND(int n) const {
+  bool IsNDOperateWithMD(int n, int m) const {
     auto input0 = node_map_->GetNode(node_->input(0));
     auto input1 = node_map_->GetNode(node_->input(1));
+    int input0_port;
+    ParseNodeName(node_->input(0), &input0_port);
+    int input1_port;
+    ParseNodeName(node_->input(1), &input1_port);
+
     if (input0 && input1) {
-      return (IsDimsFour(*input0) || IsNodeNCHWToNHWC(input0->name())) &&
-             ((n == 4)
-                  ? (IsDimsFour(*input1) || IsNodeNCHWToNHWC(input1->name()))
-                  : IsDimsN(*input1, n));
+      bool input0_is_n = (n == 4) ? IsPortDimsFour(*input0, input0_port)
+                                  : IsPortDimsN(*input0, input0_port, n);
+      bool input1_is_m = (m == 4) ? IsPortDimsFour(*input1, input1_port)
+                                  : IsPortDimsN(*input1, input1_port, m);
+      return input0_is_n && input1_is_m;
     }
     return false;
   }
 
-  bool Is4DOperateWithScalar() const { return Is4DOperateWithND(0); }
-
-  bool Is4DOperateWithVector() const { return Is4DOperateWithND(1); }
-
-  NodeDef* AddNodeShapeConst(const string& name, int num_channels) {
+  NodeDef* AddNodeShapeConst(const string& name, int num_channels,
+                             const string& depended_node) {
     NodeDef* node = graph_->add_node();
     node_map_->AddNode(name, node);
     node->set_name(name);
@@ -869,6 +1271,11 @@ class BinaryOpProcessor : public AgnosticNodeProcessor {
     }
     tensor.AsProtoTensorContent(attr_tensor.mutable_tensor());
     node->mutable_attr()->insert({"value", attr_tensor});
+    if (is_in_frame_) {
+      // This is to ensure the transpose node and the const node are in the
+      // same frame.
+      *node->add_input() = AsControlDependency(depended_node);
+    }
     return node;
   }
 
@@ -894,31 +1301,41 @@ class BinaryOpProcessor : public AgnosticNodeProcessor {
   }
 
   Status CustomizedProcessing() override {
-    if (is_4d_with_vector_) {
-      string base_name = strings::StrCat(node_->name(), "-", node_->input(1));
-      string reshape_node_name =
-          AddPrefixToNodeName(base_name, kReshapeNHWCToNCHW, "-");
+    int vector_index = -1;
+    if (IsNDOperateWithMD(4, 1)) {
+      vector_index = 1;
+    } else if (IsNDOperateWithMD(1, 4)) {
+      vector_index = 0;
+    }
+    if (vector_index != -1) {
+      string base_name = strings::StrCat(node_->name(), "-", vector_index);
+      string reshape_node_name = LayoutOptimizerNode(
+          strings::StrCat(base_name, "-", kReshapeNHWCToNCHW));
       string shape_const_node_name =
-          AddPrefixToNodeName(base_name, kReshapeConst, "-");
-      auto input_node = node_map_->GetNode(node_->input(1));
+          LayoutOptimizerNode(strings::StrCat(base_name, "-", kReshapeConst));
+      auto input_node = node_map_->GetNode(node_->input(vector_index));
       TF_RETURN_IF_ERROR(HasAttribute(*input_node, "_output_shapes"));
-      int vector_size =
-          input_node->attr().at("_output_shapes").list().shape(0).dim(0).size();
-      AddNodeShapeConst(shape_const_node_name, vector_size);
+      int port;
+      ParseNodeName(node_->input(vector_index), &port);
+      int vector_size = input_node->attr()
+                            .at("_output_shapes")
+                            .list()
+                            .shape(port)
+                            .dim(0)
+                            .size();
+      AddNodeShapeConst(shape_const_node_name, vector_size,
+                        NodeName(node_->input(vector_index)));
       TF_RETURN_IF_ERROR(HasAttribute(*node_, "T"));
-      AddNodeReshape(reshape_node_name, node_->input(1), shape_const_node_name,
-                     node_->attr().at("T").type());
+      AddNodeReshape(reshape_node_name, node_->input(vector_index),
+                     shape_const_node_name, node_->attr().at("T").type());
       node_map_->AddOutput(shape_const_node_name, reshape_node_name);
-      node_map_->UpdateOutput(node_->input(1), node_->name(),
-                              reshape_node_name);
+      node_map_->UpdateOutput(NodeName(node_->input(vector_index)),
+                              node_->name(), reshape_node_name);
       node_map_->AddOutput(reshape_node_name, node_->name());
-      *node_->mutable_input(1) = reshape_node_name;
+      *node_->mutable_input(vector_index) = reshape_node_name;
     }
     return Status::OK();
   }
-
- private:
-  bool is_4d_with_vector_;
 };
 
 class ConcatProcessor : public AgnosticNodeProcessor {
@@ -926,344 +1343,373 @@ class ConcatProcessor : public AgnosticNodeProcessor {
   explicit ConcatProcessor(const OptimizeContext& opt_cxt)
       : AgnosticNodeProcessor(opt_cxt) {
     // For Concat,  the concat axis is the first input; for ConcatV2,
-    // the last input.
-    axis_node_pos_ = (IsConcatV1(*node_)) ? 0 : (node_->input_size() - 1);
+    // the last input. Note that if with control inputs, the number of inputs
+    // is larger than the integer attribute N.
+    int n = node_->attr().at("N").i();
+    axis_node_pos_ = (IsConcatV1(*node_)) ? 0 : n;
   }
 
  protected:
-  bool ShouldProcess() const override {
-    return !MustPreserve() && IsDimsFour(*node_) && HasOutputs() &&
-           IsNodeAfterNCHWToNHWC() && IsAlongDimC() && IsOnGPU();
-  }
-
   std::vector<int> GetInputPos() const override {
-    std::vector<int> input_pos;
-    int start = (IsConcatV1(*node_)) ? 1 : 0;
-    int end =
-        (IsConcatV1(*node_)) ? node_->input_size() : (node_->input_size() - 1);
-    for (int i = start; i < end; i++) {
-      input_pos.push_back(i);
-    }
-    return input_pos;
+    return DataInputPosConcat(*node_);
   }
 
   Status CustomizedProcessing() override {
-    string concat_const_name = GetOrAddNodeConcatConst();
-    node_map_->AddOutput(concat_const_name, node_->name());
-    *node_->mutable_input(axis_node_pos_) = concat_const_name;
-    return Status::OK();
-  }
-
-  bool IsAlongDimC() const {
-    auto axis_node = node_map_->GetNode(node_->input(axis_node_pos_));
-    if (axis_node->attr().find("value") != axis_node->attr().end()) {
-      return axis_node->attr().at("value").tensor().int_val(0) == 3;
-    }
-    return false;
+    DataType dtype =
+        (IsConcatV1(*node_)) ? DT_INT32 : node_->attr().at("Tidx").type();
+    return UpdateOrTransformParamInput(axis_node_pos_, "DataFormatDimMap",
+                                       dtype);
   }
 
   int axis_node_pos_;
+};
 
- private:
-  NodeDef* AddNodeConcatConst(const string& suffix, const string& depended_node,
-                              const string& device) {
-    auto const_node = AddNodeConstScalar(
-        strings::StrCat(kConcatConst, "-", suffix), device, DT_INT32, 1);
-    // This is to ensure the concat node and the const node are
-    // in the same frame.
-    *const_node->add_input() = AsControlDependency(depended_node);
-    return const_node;
-  }
+class FillProcessor : public AgnosticNodeProcessor {
+ public:
+  explicit FillProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {}
 
-  string GetOrAddNodeConcatConst() {
-    string const_name;
-    if (is_in_frame_) {
-      int value_node_pos = (axis_node_pos_ == 0) ? 1 : 0;
-      auto const_node = AddNodeConcatConst(
-          node_->name(), NodeName(node_->input(value_node_pos)),
-          node_->device());
-      const_name = const_node->name();
-    } else {
-      const_name = kConcatConst;
-    }
-    return const_name;
+ protected:
+  std::vector<int> GetInputPos() const override { return {}; }
+
+  Status CustomizedProcessing() override {
+    DataType dtype = node_->attr().at("index_type").type();
+    return UpdateOrTransformParamInput(0, "DataFormatVecPermute", dtype);
   }
 };
 
-class PadProcessor : public AgnosticNodeProcessor {
+class HistogramSummaryProcessor : public AgnosticNodeProcessor {
  public:
-  explicit PadProcessor(const OptimizeContext& opt_cxt)
+  explicit HistogramSummaryProcessor(const OptimizeContext& opt_cxt)
       : AgnosticNodeProcessor(opt_cxt) {}
 
  protected:
   bool ShouldProcess() const override {
-    return !MustPreserve() && IsDimsFour(*node_) && HasOutputs() &&
-           IsNodeAfterNCHWToNHWC() && PaddingSupported() && IsOnGPU();
+    auto input1 = node_map_->GetNode(node_->input(1));
+    int port;
+    ParseNodeName(node_->input(1), &port);
+    return !MustPreserve() && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
+           IsPortDimsFour(*input1, port) && IsOnGPU();
   }
-  Status CustomizedProcessing() override { return UpdateAttrValueOfInput(1); }
 
- private:
-  bool PaddingSupported() const {
-    auto pad_const = node_map_->GetNode(node_->input(1));
-    bool is_const = IsConstant(*pad_const);
-    bool is_4D = false;
-    if (HasAttribute(*pad_const, "value").ok()) {
-      Tensor tensor;
-      if (tensor.FromProto(pad_const->mutable_attr()->at({"value"}).tensor())) {
-        if (tensor.dims() == 2) {
-          if (tensor.dim_size(0) == 4 && tensor.dim_size(1) == 2) {
-            is_4D = true;
-          }
+  std::vector<int> GetInputPos() const override { return {1}; }
+
+  Status AddLayoutTransposeToOutputs() override { return Status::OK(); }
+};
+
+class IdentityNProcessor : public AgnosticNodeProcessor {
+ public:
+  explicit IdentityNProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {
+    std::set<string> ops_format_agnostic = GetOpsFormatAgnostic();
+    for (int i = 0; i < node_->input_size(); i++) {
+      auto input = node_map_->GetNode(node_->input(i));
+      int port;
+      ParseNodeName(node_->input(i), &port);
+      // Skip control input.
+      if (port != -1) {
+        bool is_agnostic =
+            ops_format_agnostic.find(input->op()) != ops_format_agnostic.end();
+        if (IsPortDimsFour(*input, port) &&
+            ((IsNodeAfterNCHWToNHWC(*input) && is_agnostic) ||
+             IsTransposeNCHWToNHWC(input->name()))) {
+          input_pos_.push_back(i);
         }
       }
     }
-    return is_const && is_4D;
   }
+
+ protected:
+  bool ShouldProcess() const override {
+    return !MustPreserve() && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
+           IsOnGPU();
+  }
+
+  std::vector<int> GetInputPos() const override { return input_pos_; }
+
+  std::set<int> GetOutputPos() const override {
+    std::set<int> output_pos{};
+    for (const auto& input_pos : input_pos_) {
+      output_pos.insert(input_pos);
+    }
+    return output_pos;
+  }
+
+ private:
+  std::vector<int> input_pos_;
 };
 
-class SplitProcessor : public AgnosticNodeProcessor {
+class ShapeProcessor : public IdentityNProcessor {
  public:
-  explicit SplitProcessor(const OptimizeContext& opt_cxt)
+  explicit ShapeProcessor(const OptimizeContext& opt_cxt)
+      : IdentityNProcessor(opt_cxt) {}
+
+ protected:
+  Status AddLayoutTransposeToOutputs() override { return Status::OK(); }
+
+  Status CustomizedProcessing() override {
+    return AddTransformToOutputs("DataFormatVecPermute");
+  }
+};
+
+class MergeProcessor : public AgnosticNodeProcessor {
+ public:
+  explicit MergeProcessor(const OptimizeContext& opt_cxt)
       : AgnosticNodeProcessor(opt_cxt) {}
 
  protected:
   bool ShouldProcess() const override {
-    return AgnosticNodeProcessor::ShouldProcess() && SplitSupported();
+    return !MustPreserve() && IsPortZeroDimsFour(*node_) && HasOutputs() &&
+           IsEveryInputAfterNCHWToNHWC() && IsOnGPU();
   }
 
   std::vector<int> GetInputPos() const override {
-    std::vector<int> input_pos = {1};
+    std::vector<int> input_pos;
+    int n = node_->attr().at("N").i();
+    input_pos.reserve(n);
+    for (int i = 0; i < n; i++) {
+      input_pos.push_back(i);
+    }
     return input_pos;
   }
 
-  Status CustomizedProcessing() override {
-    string split_const_name = AddNodeSplitConst()->name();
-    node_map_->AddOutput(split_const_name, node_->name());
-    *node_->mutable_input(0) = split_const_name;
-    return Status::OK();
-  }
-
  private:
-  bool SplitSupported() const {
-    auto dim_node = node_map_->GetNode(node_->input(0));
-    if (!IsConstant(*dim_node)) {
-      return false;
-    }
-    if (HasAttribute(*dim_node, "value").ok()) {
-      auto tensor = dim_node->attr().at({"value"}).tensor();
-      if (tensor.tensor_shape().dim_size() == 0 && tensor.int_val_size() == 1) {
-        if (tensor.int_val(0) < 4 && tensor.int_val(0) >= -4) {
-          return true;
-        }
+  bool IsEveryInputAfterNCHWToNHWC() const {
+    std::set<string> ops_format_agnostic = GetOpsFormatAgnostic();
+    for (const auto& input : node_->input()) {
+      auto input_node = node_map_->GetNode(input);
+      int port;
+      ParseNodeName(input, &port);
+      bool is_agnostic = ops_format_agnostic.find(input_node->op()) !=
+                         ops_format_agnostic.end();
+      if (IsPortDimsFour(*input_node, port) &&
+          ((IsNodeAfterNCHWToNHWC(*input_node) && is_agnostic) ||
+           IsTransposeNCHWToNHWC(input_node->name()))) {
+        continue;
       }
+      return false;
     }
-    return false;
-  }
-
-  NodeDef* AddNodeSplitConst() {
-    auto dim_node = node_map_->GetNode(node_->input(0));
-    auto tensor = dim_node->attr().at({"value"}).tensor();
-    int value = tensor.int_val(0);
-    value = (value >= 0) ? value : value + 4;
-    if (value == 1 || value == 2) {
-      value = value + 1;
-    } else if (value == 3) {
-      value = 1;
-    }
-    // We created a copy of the node, so that we don't modify the original node,
-    // which might be used elsewhere. Note that this copy also copies the
-    // control dependency input in the case this node is inside a loop,
-    // to ensure added_node is in the same frame with the Split node.
-    NodeDef* added_node = graph_->add_node();
-    *added_node = *dim_node;
-    added_node->set_name(strings::StrCat(kSplitConst, "-", node_->name()));
-    added_node->mutable_attr()->at({"value"}).mutable_tensor()->set_int_val(
-        0, value);
-    return added_node;
+    return true;
   }
 };
 
-class ReluGradProcessor : public AgnosticNodeProcessor {
+class PadProcessor : public AgnosticNodeProcessor {
  public:
-  explicit ReluGradProcessor(const OptimizeContext& opt_cxt)
+  explicit PadProcessor(const OptimizeContext& opt_cxt)
       : AgnosticNodeProcessor(opt_cxt) {}
 
  protected:
-  std::vector<int> GetInputPos() const override {
-    std::vector<int> input_pos = {0, 1};
-    return input_pos;
+  Status CustomizedProcessing() override {
+    DataType dtype = node_->attr().at("Tpaddings").type();
+    return UpdateOrTransformParamInput(1, "DataFormatVecPermute", dtype);
   }
 };
 
-class SliceProcessor : public AgnosticNodeProcessor {
+class ReverseProcessor : public AgnosticNodeProcessor {
  public:
-  explicit SliceProcessor(const OptimizeContext& opt_cxt)
+  explicit ReverseProcessor(const OptimizeContext& opt_cxt)
       : AgnosticNodeProcessor(opt_cxt) {}
 
  protected:
   Status CustomizedProcessing() override {
-    // Skip the first input, which is the data to be sliced.
-    for (int i = 1; i < node_->input_size(); i++) {
-      string base_name = strings::StrCat(node_->name(), "-input", i);
-      string node_name =
-          AddPrefixToNodeName(base_name, kPermVecNHWCToNCHW, "-");
-      TF_RETURN_IF_ERROR(HasAttribute(*node_, "Index"));
-      AddNodePermVec(node_name, node_->input(i),
-                     node_->attr().at("Index").type(), true);
-      node_map_->UpdateOutput(node_->input(i), node_->name(), node_name);
-      node_map_->AddOutput(node_name, node_->name());
-      *node_->mutable_input(i) = node_name;
-    }
-    return Status::OK();
+    DataType dtype = node_->attr().at("Tidx").type();
+    return UpdateOrTransformParamInput(1, "DataFormatDimMap", dtype);
   }
+};
 
- private:
-  NodeDef* AddNodeGatherAxisConst(const string& suffix,
-                                  const string& depended_node,
-                                  const string& device) {
-    auto const_node = AddNodeConstScalar(
-        strings::StrCat(kGatherAxisConst, "-", suffix), device, DT_INT32, 0);
-    // This is to ensure the Slice node and the const node are
-    // in the same frame.
-    *const_node->add_input() = AsControlDependency(depended_node);
-    return const_node;
+class SplitProcessor : public AgnosticNodeProcessor {
+ public:
+  explicit SplitProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {
+    axis_node_pos_ = 0;
   }
 
-  string GetOrAddNodeGatherAxisConst() {
-    string const_name;
-    if (is_in_frame_) {
-      auto const_node = AddNodeGatherAxisConst(
-          node_->name(), NodeName(node_->input(0)), node_->device());
-      const_name = const_node->name();
-    } else {
-      const_name = kGatherAxisConst;
+ protected:
+  std::vector<int> GetInputPos() const override { return {1}; }
+
+  std::set<int> GetOutputPos() const override {
+    std::set<int> output_pos{0};
+    if (HasAttribute(*node_, "num_split").ok()) {
+      for (int i = 1; i < node_->attr().at("num_split").i(); i++) {
+        output_pos.insert(i);
+      }
     }
-    return const_name;
+    return output_pos;
   }
 
-  string GetOrAddNodePermNHWCToNCHW() {
-    string const_name;
-    if (is_in_frame_) {
-      auto const_node = AddNodePermNHWCToNCHW(
-          node_->name(), NodeName(node_->input(0)), node_->device());
-      const_name = const_node->name();
-    } else {
-      const_name = kPermNHWCToNCHW;
-    }
-    return const_name;
+  Status CustomizedProcessing() override {
+    return UpdateOrTransformParamInput(axis_node_pos_, "DataFormatDimMap",
+                                       DT_INT32);
   }
 
-  string GetOrAddNodePermNCHWToNHWC() {
-    string const_name;
-    if (is_in_frame_) {
-      auto const_node = AddNodePermNCHWToNHWC(
-          node_->name(), NodeName(node_->input(0)), node_->device());
-      const_name = const_node->name();
-    } else {
-      const_name = kPermNCHWToNHWC;
-    }
-    return const_name;
+  int axis_node_pos_;
+};
+
+class SplitVProcessor : public SplitProcessor {
+ public:
+  explicit SplitVProcessor(const OptimizeContext& opt_cxt)
+      : SplitProcessor(opt_cxt) {
+    axis_node_pos_ = 2;
   }
 
-  void AddNodePermVec(const string& node_name, const string& input_name,
-                      DataType data_type, bool NHWCToNCHW) {
-    NodeDef* node = graph_->add_node();
-    node_map_->AddNode(node_name, node);
-    node->set_name(node_name);
-    *node->add_input() = input_name;
-    *node->add_input() = NHWCToNCHW ? GetOrAddNodePermNHWCToNCHW()
-                                    : GetOrAddNodePermNCHWToNHWC();
-    *node->add_input() = GetOrAddNodeGatherAxisConst();
-    node->set_op("GatherV2");
+ protected:
+  std::vector<int> GetInputPos() const override { return {0}; }
+};
 
-    AttrValue attr_type_indices;
-    attr_type_indices.set_type(DT_INT32);
-    node->mutable_attr()->insert({"Tindices", attr_type_indices});
+class TernaryOpProcessor : public AgnosticNodeProcessor {
+ public:
+  explicit TernaryOpProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {}
 
-    AttrValue attr_type_axis;
-    attr_type_axis.set_type(DT_INT32);
-    node->mutable_attr()->insert({"Taxis", attr_type_axis});
+ protected:
+  std::vector<int> GetInputPos() const override { return {0, 1, 2}; }
+};
 
-    AttrValue attr_type_params;
-    attr_type_params.set_type(data_type);
-    node->mutable_attr()->insert({"Tparams", attr_type_params});
+class SelectProcessor : public AgnosticNodeProcessor {
+ public:
+  explicit SelectProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {}
 
-    AttrValue attr_validate;
-    attr_validate.set_b(true);
-    node->mutable_attr()->insert({"validate_indices", attr_validate});
+ protected:
+  bool ShouldProcess() const override {
+    auto input0 = node_map_->GetNode(node_->input(0));
+    int input0_port;
+    ParseNodeName(node_->input(0), &input0_port);
+    bool is_input0_scalar_vector_4d = IsPortDimsN(*input0, input0_port, 0) ||
+                                      IsPortDimsN(*input0, input0_port, 1) ||
+                                      IsPortDimsN(*input0, input0_port, 4);
+    return AgnosticNodeProcessor::ShouldProcess() && is_input0_scalar_vector_4d;
+  }
+
+  std::vector<int> GetInputPos() const override {
+    auto input0 = node_map_->GetNode(node_->input(0));
+    int input0_port;
+    ParseNodeName(node_->input(0), &input0_port);
+    // Input 0 could be a scalar, a vector with size matching the first
+    // dimension of input 1 and 2, or must have the same shape as input 1 and 2.
+    if (IsPortDimsFour(*input0, input0_port)) {
+      return {0, 1, 2};
+    } else {
+      return {1, 2};
+    }
   }
 };
 
-// Specialized SliceProcessor, used if the second and third input are const
-// nodes, which could be the case if a constant folding pass is applied
-// before this optimization.
-class SliceProcessorConst : public AgnosticNodeProcessor {
+class UnaryGradProcessor : public AgnosticNodeProcessor {
  public:
-  explicit SliceProcessorConst(const OptimizeContext& opt_cxt)
+  explicit UnaryGradProcessor(const OptimizeContext& opt_cxt)
       : AgnosticNodeProcessor(opt_cxt) {}
 
  protected:
-  Status CustomizedProcessing() override {
+  std::vector<int> GetInputPos() const override { return {0, 1}; }
+};
+
+class SliceProcessor : public AgnosticNodeProcessor {
+ public:
+  explicit SliceProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {
     // Skip the first input, which is the data to be sliced.
-    for (int i = 1; i < node_->input_size(); i++) {
-      TF_RETURN_IF_ERROR(UpdateAttrValueOfInput(i));
+    start_ = 1;
+    // Note that we can't use node_->input_size() here because there
+    // could be control inputs.
+    end_ = 2;
+  }
+
+ protected:
+  Status ProcessInputs() {
+    for (int i = start_; i <= end_; i++) {
+      DataType dtype = node_->attr().at("Index").type();
+      TF_RETURN_IF_ERROR(
+          UpdateOrTransformParamInput(i, "DataFormatVecPermute", dtype));
     }
     return Status::OK();
   }
+
+  Status CustomizedProcessing() override { return ProcessInputs(); }
+
+  int start_;
+  int end_;
 };
 
-// Specialized SliceProcessor, used if the second input is ConcatOffset. An
-// example use case is in the gradient computation of Concat for InceptionV3.
-class SliceProcessorConcatOffset : public AgnosticNodeProcessor {
+class StridedSliceProcessor : public SliceProcessor {
  public:
-  explicit SliceProcessorConcatOffset(const OptimizeContext& opt_cxt)
-      : AgnosticNodeProcessor(opt_cxt) {}
+  explicit StridedSliceProcessor(const OptimizeContext& opt_cxt)
+      : SliceProcessor(opt_cxt) {
+    start_ = 1;
+    end_ = 3;
+  }
 
  protected:
+  bool ShouldProcess() const override {
+    return AgnosticNodeProcessor::ShouldProcess() && IsOnlyBeginEndMask();
+  }
+
   Status CustomizedProcessing() override {
-    auto maybe_concatoffset_node =
-        node_map_->GetNode(NodeName(node_->input(1)));
-    if (IsConcatOffset(*maybe_concatoffset_node)) {
-      auto maybe_axis_node =
-          node_map_->GetNode(maybe_concatoffset_node->input(0));
-      NodeDef* axis_node;
-      if (IsConstant(*maybe_axis_node)) {
-        axis_node = maybe_axis_node;
-        // A FloorMod node might be added between ConcatOffset and the concat
-        // dimension const node to handle a negative dimension index -1, meaning
-        // the last dimension, which is consistent with the python's notation
-        // for negative index.
-      } else if (IsFloorMod(*maybe_axis_node)) {
-        axis_node = node_map_->GetNode(maybe_axis_node->input(0));
-      } else {
-        return Status(error::INVALID_ARGUMENT,
-                      strings::StrCat("Expect either Const or FloorMod for the "
-                                      "input 1 of ConcatOffset"));
-      }
-      // Need to process if the channel is at dimension 3, which indicates the
-      // NHWC format is being used. As multiple Slice nodes may share the same
-      // ConcatOffset node, the NHWC to NCHW conversion may have already
-      // been performed when processing other Slice nodes.
-      TF_RETURN_IF_ERROR(HasAttribute(*axis_node, "value"));
-      int concat_dim = axis_node->attr().at("value").tensor().int_val(0);
-      if (concat_dim == -1 || concat_dim == 3) {
-        // Update the dimension order for shape input nodes. Note that the input
-        // 2 of Slice also shares one of the shape nodes.
-        for (int i = 1; i < maybe_concatoffset_node->input_size(); i++) {
-          auto shape_node =
-              node_map_->GetNode(maybe_concatoffset_node->input(i));
-          TF_RETURN_IF_ERROR(UpdateAttrValue(shape_node));
-        }
-        // Set the channel dimension to 1, as we have converted the vector
-        // element order from NHWC to NCHW.
-        axis_node->mutable_attr()->at("value").mutable_tensor()->set_int_val(0,
-                                                                             1);
-      }
+    TF_RETURN_IF_ERROR(UpdateMask("begin_mask"));
+    TF_RETURN_IF_ERROR(UpdateMask("end_mask"));
+    TF_RETURN_IF_ERROR(ProcessInputs());
+    return Status::OK();
+  }
+
+ private:
+  bool IsMaskZero(const string& mask) const {
+    return node_->attr().at(mask).i() == 0;
+  }
+
+  bool IsOnlyBeginEndMask() const {
+    return IsMaskZero("ellipsis_mask") && IsMaskZero("new_axis_mask") &&
+           IsMaskZero("shrink_axis_mask");
+  }
+
+  Status UpdateMask(const string& mask) {
+    int i = node_->attr().at(mask).i();
+    if (i < 0 || i > 15) {
+      return errors::InvalidArgument("invalid mask value: ", i);
+    }
+    if (i == 0 || i == 1 || i == 14 || i == 15) return Status::OK();
+    switch (i) {
+      case 2:
+      case 3:
+        i += 2;
+        break;
+      case 4:
+      case 5:
+        i += 4;
+        break;
+      case 6:
+      case 7:
+        i += 6;
+        break;
+      case 8:
+      case 9:
+        i -= 6;
+        break;
+      case 10:
+      case 11:
+        i -= 4;
+        break;
+      case 12:
+      case 13:
+        i -= 2;
+        break;
     }
+    node_->mutable_attr()->at(mask).set_i(i);
     return Status::OK();
   }
 };
 
+class StridedSliceGradProcessor : public StridedSliceProcessor {
+ public:
+  explicit StridedSliceGradProcessor(const OptimizeContext& opt_cxt)
+      : StridedSliceProcessor(opt_cxt) {
+    start_ = 0;
+    end_ = 3;
+  }
+
+ protected:
+  std::vector<int> GetInputPos() const override { return {4}; }
+};
+
 class SqueezeProcessor : public AgnosticNodeProcessor {
  public:
   explicit SqueezeProcessor(const OptimizeContext& opt_cxt)
@@ -1271,124 +1717,171 @@ class SqueezeProcessor : public AgnosticNodeProcessor {
 
  protected:
   bool ShouldProcess() const override {
-    return !MustPreserve() && IsDimsN(*node_, 2) && HasOutputs() &&
-           IsNodeAfterNCHWToNHWC() && IsInputConvertible() && IsAlongDimHW() &&
-           IsOnGPU();
+    bool is_dims_supported = (IsPortZeroDimsN(*node_, 2) && IsAlongHW()) ||
+                             (IsPortZeroDimsN(*node_, 1) && IsAlongNHW());
+    return !MustPreserve() && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
+           IsInputConvertible() && is_dims_supported && IsOnGPU();
   }
 
   Status AddLayoutTransposeToOutputs() override { return Status::OK(); }
 
+  Status CustomizedProcessing() override {
+    TF_RETURN_IF_ERROR(HasAttribute(*node_, "squeeze_dims"));
+    auto list = node_->mutable_attr()->at("squeeze_dims").mutable_list();
+    if (list->i_size() == 2) {
+      list->set_i(0, 2);
+      list->set_i(1, 3);
+    } else if (list->i_size() == 3) {
+      list->set_i(1, 2);
+      list->set_i(2, 3);
+    }
+    return Status::OK();
+  }
+
+ private:
   bool IsInputConvertible() const {
+    int input_port;
     auto input = node_map_->GetNode(node_->input(0));
-    if (IsNodeNCHWToNHWC(input->name())) {
-      input = node_map_->GetNode(input->input(0));
-    }
+    ParseNodeName(node_->input(0), &input_port);
     if (input->attr().find("_output_shapes") != input->attr().end()) {
-      auto shape = input->attr().at("_output_shapes").list().shape(0);
+      auto shape = input->attr().at("_output_shapes").list().shape(input_port);
       if (shape.dim_size() != 4) {
         return false;
       }
       if (shape.dim(1).size() == 1 && shape.dim(2).size() == 1) {
         return true;
       }
+      if (shape.dim(0).size() == 1 && shape.dim(1).size() == 1 &&
+          shape.dim(2).size() == 1) {
+        return true;
+      }
     }
     return false;
   }
 
-  bool IsAlongDimHW() const {
+  bool IsAlongAxis(const std::vector<int>& axis) const {
     if (node_->attr().find("squeeze_dims") != node_->attr().end()) {
       auto list = node_->attr().at("squeeze_dims").list();
-      if (list.i(0) == 1 && list.i(1) == 2) {
-        return true;
+      // If list is empty, Squeeze op will squeeze all dimensions of size 1.
+      if (list.i_size() == 0) return true;
+      if (list.i_size() == axis.size()) {
+        bool along_axis = true;
+        for (int i = 0; i < axis.size(); i++) {
+          along_axis = along_axis && (list.i(i) == axis[i]);
+        }
+        if (along_axis) return true;
       }
     }
     return false;
   }
-
-  Status CustomizedProcessing() override {
-    TF_RETURN_IF_ERROR(HasAttribute(*node_, "squeeze_dims"));
-    auto list = node_->mutable_attr()->at("squeeze_dims").mutable_list();
-    list->set_i(0, 2);
-    list->set_i(1, 3);
-    return Status::OK();
-  }
+  bool IsAlongHW() const { return IsAlongAxis({1, 2}); }
+  bool IsAlongNHW() const { return IsAlongAxis({0, 1, 2}); }
 };
 
-class SumProcessor : public AgnosticNodeProcessor {
+class ReduceProcessor : public AgnosticNodeProcessor {
  public:
-  explicit SumProcessor(const OptimizeContext& opt_cxt)
+  explicit ReduceProcessor(const OptimizeContext& opt_cxt)
       : AgnosticNodeProcessor(opt_cxt) {}
 
  protected:
   bool ShouldProcess() const override {
     auto input0 = node_map_->GetNode(node_->input(0));
+    int port;
+    ParseNodeName(node_->input(0), &port);
     return !MustPreserve() && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
-           (IsDimsFour(*input0) || IsNodeNCHWToNHWC(input0->name())) &&
-           IsAlongDimNHW() && IsOnGPU();
+           IsPortDimsFour(*input0, port) && IsReduceAxisSupported() &&
+           IsOnGPU();
   }
 
-  Status AddLayoutTransposeToOutputs() override { return Status::OK(); }
-
   Status CustomizedProcessing() override {
-    node_map_->AddOutput(kReductionConst, node_->name());
-    *node_->mutable_input(1) = GetOrAddNodeReductionConst();
+    if (IsAlongNHW() || IsAlongHW() || IsAlongC()) {
+      DataType dtype = node_->attr().at("Tidx").type();
+      TF_RETURN_IF_ERROR(
+          UpdateOrTransformParamInput(1, "DataFormatDimMap", dtype));
+    }
     return Status::OK();
   }
 
- private:
-  bool IsAlongDimNHW() const {
-    NodeDef* node = node_map_->GetNode(node_->input(1));
-    Tensor tensor;
-    if (node->attr().find({"value"}) == node->attr().end()) {
-      return false;
-    }
-    auto success = tensor.FromProto(node->attr().at({"value"}).tensor());
-    if (!success) {
-      LOG(ERROR) << "Failed to parse TensorProto.";
-      return false;
+  Status AddLayoutTransposeToOutputs() override {
+    if (KeepDims()) {
+      return AddTransformToOutputs("Transpose");
     }
-    if (tensor.flat<int>().size() != 3) {
+    return Status::OK();
+  }
+
+ private:
+  bool IsReduceAxisSupported() const {
+    return KeepDims() || ((IsAlongAllFourDims() || IsAlongHWC() ||
+                           IsAlongNHW() || IsAlongHW() || IsAlongC()) &&
+                          !KeepDims());
+  }
+
+  bool IsAlongAxis(const std::vector<int>& axis) const {
+    auto axis_node = node_map_->GetNode(node_->input(1));
+    if (!IsConstant(*axis_node)) {
       return false;
     }
-    if (tensor.flat<int>()(0) == 0 && tensor.flat<int>()(1) == 1 &&
-        tensor.flat<int>()(2) == 2) {
-      return true;
+    if (HasAttribute(*axis_node, "value").ok()) {
+      Tensor tensor;
+      auto success = tensor.FromProto(axis_node->attr().at({"value"}).tensor());
+      if (!success) {
+        LOG(ERROR) << "Failed to parse TensorProto.";
+      }
+      if (tensor.dims() == 1 && tensor.dim_size(0) == axis.size()) {
+        bool along_axis = true;
+        for (int i = 0; i < axis.size(); i++) {
+          along_axis = along_axis && (tensor.flat<int>()(i) == axis[i]);
+        }
+        if (along_axis) return true;
+      }
     }
     return false;
   }
 
-  NodeDef* AddNodeReductionConst(const string& suffix,
-                                 const string& depended_node,
-                                 const string& device) {
-    auto const_node = GraphProcessor::AddNodeReductionConst(
-        strings::StrCat(kReductionConst, "-", suffix), device);
-    // This is to ensure the Sum node and the const node are in the
-    // same frame.
-    *const_node->add_input() = AsControlDependency(depended_node);
-    return const_node;
-  }
+  bool IsAlongAllFourDims() const { return IsAlongAxis({0, 1, 2, 3}); }
 
-  string GetOrAddNodeReductionConst() {
-    string const_name;
-    if (is_in_frame_) {
-      auto const_node = AddNodeReductionConst(
-          node_->name(), NodeName(node_->input(0)), node_->device());
-      const_name = const_node->name();
-    } else {
-      const_name = kReductionConst;
-    }
-    return const_name;
+  bool IsAlongHWC() const { return IsAlongAxis({1, 2, 3}); }
+
+  bool IsAlongNHW() const { return IsAlongAxis({0, 1, 2}); }
+
+  bool IsAlongHW() const { return IsAlongAxis({1, 2}); }
+
+  bool IsAlongC() const { return IsAlongAxis({3}); }
+
+  bool KeepDims() const { return node_->attr().at("keep_dims").b(); }
+};
+
+class SwitchProcessor : public AgnosticNodeProcessor {
+ public:
+  explicit SwitchProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {}
+
+ protected:
+  std::set<int> GetOutputPos() const override { return {0, 1}; }
+};
+
+class TileProcessor : public AgnosticNodeProcessor {
+ public:
+  explicit TileProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {}
+
+ protected:
+  Status CustomizedProcessing() override {
+    DataType dtype = node_->attr().at("Tmultiples").type();
+    return UpdateOrTransformParamInput(1, "DataFormatVecPermute", dtype);
   }
 };
 
 class DataLayoutOptimizer : GraphProcessor {
  public:
   explicit DataLayoutOptimizer(
+      const GraphProperties& graph_properties,
       const VirtualPlacer& virtual_placer,
       const LayoutOptimizer::TuningConfig& config,
       const std::unordered_set<string>& nodes_to_preserve, GraphDef* graph,
       NodeMap* node_map)
-      : GraphProcessor(virtual_placer, nodes_to_preserve, graph, node_map),
+      : GraphProcessor(graph_properties, virtual_placer, nodes_to_preserve,
+                       graph, node_map),
         config_(config) {}
 
   Status Optimize() {
@@ -1402,23 +1895,13 @@ class DataLayoutOptimizer : GraphProcessor {
 
  private:
   NodeDef* AddNodePermNHWCToNCHW() {
-    return AddNodePermConst(kPermNHWCToNCHW, "", {0, 3, 1, 2});
+    return AddNodePermConst(LayoutOptimizerNode(kPermNHWCToNCHW), "",
+                            {0, 3, 1, 2});
   }
 
   NodeDef* AddNodePermNCHWToNHWC() {
-    return AddNodePermConst(kPermNCHWToNHWC, "", {0, 2, 3, 1});
-  }
-
-  NodeDef* AddNodeConcatConst() {
-    return AddNodeConstScalar(kConcatConst, "", DT_INT32, 1);
-  }
-
-  NodeDef* AddNodeGatherAxisConst() {
-    return AddNodeConstScalar(kGatherAxisConst, "", DT_INT32, 0);
-  }
-
-  NodeDef* AddNodeReductionConst() {
-    return GraphProcessor::AddNodeReductionConst(kReductionConst, "");
+    return AddNodePermConst(LayoutOptimizerNode(kPermNCHWToNHWC), "",
+                            {0, 2, 3, 1});
   }
 
   // Expand all nodes which is in NHWC, but supports NCHW or is layout agnostic.
@@ -1431,12 +1914,17 @@ class DataLayoutOptimizer : GraphProcessor {
     // This is the first pass where we expand the nodes which support NCHW.
     std::set<string> ops_format_supported = GetOpsFormatSupported();
     for (int i = 0; i < node_size_original; i++) {
+      if (IsNodeByLayoutOptimizer(graph_->node(i).name())) {
+        return Status(error::INVALID_ARGUMENT,
+                      "The graph is already optimized by layout optimizer.");
+      }
       if (ops_format_supported.find(graph_->node(i).op()) !=
           ops_format_supported.end()) {
         auto node = graph_->mutable_node(i);
         bool is_in_frame = !frames[node].empty();
-        OptimizeContext opt_cxt(graph_, node, node_map_, virtual_placer_,
-                                nodes_to_preserve_, is_in_frame);
+        OptimizeContext opt_cxt(graph_, node, node_map_, graph_properties_,
+                                virtual_placer_, nodes_to_preserve_,
+                                is_in_frame);
         std::unique_ptr<NodeProcessor> node_processor;
         if (IsAvgPoolGrad(*node)) {
           node_processor.reset(new AvgPoolGradProcessor(opt_cxt));
@@ -1457,10 +1945,14 @@ class DataLayoutOptimizer : GraphProcessor {
               new Conv2DBackpropFilterProcessor(opt_cxt, true));
         } else if (IsDepthwiseConv2dNativeBackpropInput(*node)) {
           node_processor.reset(new Conv2DBackpropInputProcessor(opt_cxt, true));
-        } else if (IsFusedBatchNormGradV1(*node)) {
+        } else if (IsFusedBatchNormGrad(*node)) {
           node_processor.reset(new FusedBatchNormGradProcessor(opt_cxt));
-        } else if (IsMaxPoolGradV1(*node)) {
+        } else if (IsMaxPoolV2(*node)) {
+          node_processor.reset(new MaxPoolV2Processor(opt_cxt));
+        } else if (IsMaxPoolGradV1(*node) || IsMaxPoolGradGradV1(*node)) {
           node_processor.reset(new MaxPoolGradProcessor(opt_cxt));
+        } else if (IsMaxPoolGradV2(*node) || IsMaxPoolGradGradV2(*node)) {
+          node_processor.reset(new MaxPoolGradV2Processor(opt_cxt));
         } else {
           node_processor.reset(new NodeProcessor(opt_cxt));
         }
@@ -1474,45 +1966,61 @@ class DataLayoutOptimizer : GraphProcessor {
     if (graph_->node_size() > node_size_original) {
       NodeDef* n = AddNodePermNHWCToNCHW();
       n = AddNodePermNCHWToNHWC();
-      n = AddNodeConcatConst();
-      n = AddNodeGatherAxisConst();
-      n = AddNodeReductionConst();
       std::set<string> ops_format_agnostic = GetOpsFormatAgnostic();
       for (int i = 0; i < graph_->node_size(); i++) {
         if (ops_format_agnostic.find(graph_->node(i).op()) !=
             ops_format_agnostic.end()) {
           auto node = graph_->mutable_node(i);
           bool is_in_frame = !frames[node].empty();
-          OptimizeContext opt_cxt(graph_, node, node_map_, virtual_placer_,
-                                  nodes_to_preserve_, is_in_frame);
+          OptimizeContext opt_cxt(graph_, node, node_map_, graph_properties_,
+                                  virtual_placer_, nodes_to_preserve_,
+                                  is_in_frame);
           std::unique_ptr<NodeProcessor> node_processor;
           if (IsAddN(*node)) {
             node_processor.reset(new AddNProcessor(opt_cxt));
-          } else if (IsAdd(*node) || IsMul(*node) || IsRealDiv(*node) ||
-                     IsSquaredDifference(*node) || IsSub(*node)) {
+          } else if (IsBetainc(*node)) {
+            node_processor.reset(new TernaryOpProcessor(opt_cxt));
+          } else if (IsBinaryOp(*node)) {
             node_processor.reset(new BinaryOpProcessor(opt_cxt));
           } else if (IsConcat(*node)) {
             node_processor.reset(new ConcatProcessor(opt_cxt));
-          } else if (IsPad(*node)) {
+          } else if (IsFill(*node)) {
+            node_processor.reset(new FillProcessor(opt_cxt));
+          } else if (IsHistogramSummary(*node)) {
+            node_processor.reset(new HistogramSummaryProcessor(opt_cxt));
+          } else if (IsIdentityN(*node)) {
+            node_processor.reset(new IdentityNProcessor(opt_cxt));
+          } else if (IsMerge(*node)) {
+            node_processor.reset(new MergeProcessor(opt_cxt));
+          } else if (IsPad(*node) || IsMirrorPad(*node) ||
+                     IsMirrorPadGrad(*node)) {
             node_processor.reset(new PadProcessor(opt_cxt));
-          } else if (IsReluGrad(*node)) {
-            node_processor.reset(new ReluGradProcessor(opt_cxt));
+          } else if (IsReduceOp(*node)) {
+            node_processor.reset(new ReduceProcessor(opt_cxt));
+          } else if (IsReverseV2(*node)) {
+            node_processor.reset(new ReverseProcessor(opt_cxt));
+          } else if (IsSelect(*node)) {
+            node_processor.reset(new SelectProcessor(opt_cxt));
           } else if (IsSlice(*node)) {
-            auto input1 = node_map_->GetNode(NodeName(node->input(1)));
-            auto input2 = node_map_->GetNode(NodeName(node->input(2)));
-            if (IsConcatOffset(*input1)) {
-              node_processor.reset(new SliceProcessorConcatOffset(opt_cxt));
-            } else if (IsConstant(*input1) && IsConstant(*input2)) {
-              node_processor.reset(new SliceProcessorConst(opt_cxt));
-            } else {
-              node_processor.reset(new SliceProcessor(opt_cxt));
-            }
+            node_processor.reset(new SliceProcessor(opt_cxt));
+          } else if (IsStridedSlice(*node)) {
+            node_processor.reset(new StridedSliceProcessor(opt_cxt));
+          } else if (IsShape(*node) || IsShapeN(*node)) {
+            node_processor.reset(new ShapeProcessor(opt_cxt));
           } else if (IsSplit(*node)) {
             node_processor.reset(new SplitProcessor(opt_cxt));
+          } else if (IsSplitV(*node)) {
+            node_processor.reset(new SplitVProcessor(opt_cxt));
           } else if (IsSqueeze(*node)) {
             node_processor.reset(new SqueezeProcessor(opt_cxt));
-          } else if (IsSum(*node)) {
-            node_processor.reset(new SumProcessor(opt_cxt));
+          } else if (IsStridedSliceGrad(*node)) {
+            node_processor.reset(new StridedSliceGradProcessor(opt_cxt));
+          } else if (IsSwitch(*node)) {
+            node_processor.reset(new SwitchProcessor(opt_cxt));
+          } else if (IsTile(*node)) {
+            node_processor.reset(new TileProcessor(opt_cxt));
+          } else if (IsUnaryGrad(*node)) {
+            node_processor.reset(new UnaryGradProcessor(opt_cxt));
           } else {
             node_processor.reset(new AgnosticNodeProcessor(opt_cxt));
           }
@@ -1530,8 +2038,16 @@ class DataLayoutOptimizer : GraphProcessor {
     for (int i = 0; i < graph_->node_size(); i++) {
       auto node = graph_->mutable_node(i);
       node->mutable_attr()->erase("_output_shapes");
-      if (IsNodeNHWCToNCHW(node->name())) {
-        if (IsNodeNCHWToNHWC(node->input(0))) {
+      if (IsTransposeNHWCToNCHW(node->name()) ||
+          IsDimMapNHWCToNCHW(node->name()) ||
+          IsVecPermuteNHWCToNCHW(node->name())) {
+        bool transpose_pair = IsTransposeNHWCToNCHW(node->name()) &&
+                              IsTransposeNCHWToNHWC(node->input(0));
+        bool dim_map_pair = IsDimMapNHWCToNCHW(node->name()) &&
+                            IsDimMapNCHWToNHWC(node->input(0));
+        bool vec_permute_pair = IsVecPermuteNHWCToNCHW(node->name()) &&
+                                IsVecPermuteNCHWToNHWC(node->input(0));
+        if (transpose_pair || dim_map_pair || vec_permute_pair) {
           const string& trans_first = node->input(0);
           const string& trans_second = node->name();
           auto outputs = node_map_->GetOutputs(trans_second);
@@ -1564,17 +2080,6 @@ class DataLayoutOptimizer : GraphProcessor {
   const LayoutOptimizer::TuningConfig& config_;
 };
 
-int GetNumTranspose(const GraphDef& graph) {
-  int number = 0;
-  for (const auto& node : graph.node()) {
-    if (IsTranspose(node)) {
-      number++;
-    }
-  }
-  VLOG(1) << "Number of Transpose nodes: " << number;
-  return number;
-}
-
 int GetNumGPUs(const Cluster& cluster) {
   auto devices = cluster.GetDevices();
   int num_gpus = 0;
@@ -1599,12 +2104,14 @@ Status LayoutOptimizer::Tune(const GrapplerItem& item,
                              const TuningConfig& config, GraphDef* output) {
   auto status = graph_properties.AnnotateOutputShapes(output);
   if (!status.ok()) {
+    VLOG(1) << "Annotate shape return status: " << status.ToString();
     *output = item.graph;
     return status;
   }
   NodeMap node_map(output);
-  DataLayoutOptimizer layout_optimizer(*virtual_placer_, config,
-                                       nodes_to_preserve_, output, &node_map);
+  DataLayoutOptimizer layout_optimizer(graph_properties, *virtual_placer_,
+                                       config, nodes_to_preserve_, output,
+                                       &node_map);
   status = layout_optimizer.Optimize();
   return status;
 }
@@ -1620,27 +2127,21 @@ Status LayoutOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   virtual_placer_.reset(new VirtualPlacer(cluster));
   nodes_to_preserve_ = item.NodesToPreserve();
   GraphProperties graph_properties(item);
-  auto status = graph_properties.InferStatically();
+  auto status = graph_properties.InferStatically(false);
   if (!status.ok()) {
+    VLOG(1) << "Infer shape return status: " << status.ToString();
     *output = item.graph;
     return status;
   }
 
   TuningConfig config;
-  config.no_gemm = false;
+  config.no_gemm = true;
+  // TODO(yaozhang): Enable tuning with various TuningConfig choices wtih
+  // the measurement-based estimator.
   status = Tune(item, graph_properties, config, output);
-  // This is based on an empirical observation that if the introduced Transpose
-  // nodes is more than 30, not using GEMM implementation would result in better
-  // performance.
-  if (status.ok() && GetNumTranspose(*output) > 30) {
-    config.no_gemm = true;
-    status = Tune(item, graph_properties, config, output);
-  }
-
   if (!status.ok()) {
     *output = item.graph;
   }
-
   return status;
 }
 
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
index 8c89f6744ba583f6a83362c4bf436eb811908796..5cb366df2dccee2260c6f407e992e73296712ccc 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"
 
@@ -44,7 +45,7 @@ class LayoutOptimizerTest : public ::testing::Test {
 
   Output SimpleConv2D(tensorflow::Scope* s, int input_size, int filter_size,
                       const string& padding, const string& device) {
-    int batch_size = 128;
+    int batch_size = 8;
     int input_height = input_size;
     int input_width = input_size;
     int input_depth = 3;
@@ -71,6 +72,12 @@ class LayoutOptimizerTest : public ::testing::Test {
 
   Output SimpleConv2DBackpropInput(tensorflow::Scope* s, int input_size,
                                    int filter_size, const string& padding) {
+    return SimpleConv2DBackpropInput(s, input_size, filter_size, padding, true);
+  }
+
+  Output SimpleConv2DBackpropInput(tensorflow::Scope* s, int input_size,
+                                   int filter_size, const string& padding,
+                                   bool const_input_size) {
     int batch_size = 128;
     int input_height = input_size;
     int input_width = input_size;
@@ -100,11 +107,18 @@ class LayoutOptimizerTest : public ::testing::Test {
     Output output =
         ops::Const(s->WithOpName("Output"), Input::Initializer(output_data));
 
-    Output conv_backprop_input = ops::Conv2DBackpropInput(
-        s->WithOpName("Conv2DBackpropInput"), input_sizes, filter, output,
-        {1, stride, stride, 1}, padding);
-    TensorShape input_shape(
-        {batch_size, input_height, input_width, input_depth});
+    Output conv_backprop_input;
+    Output input_sizes_i =
+        ops::Identity(s->WithOpName("InputSizesIdentity"), input_sizes);
+    if (const_input_size) {
+      conv_backprop_input = ops::Conv2DBackpropInput(
+          s->WithOpName("Conv2DBackpropInput"), input_sizes, filter, output,
+          {1, stride, stride, 1}, padding);
+    } else {
+      conv_backprop_input = ops::Conv2DBackpropInput(
+          s->WithOpName("Conv2DBackpropInput"), input_sizes_i, filter, output,
+          {1, stride, stride, 1}, padding);
+    }
     return conv_backprop_input;
   }
 
@@ -158,8 +172,7 @@ TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
 
   Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
-  string input_name = AddPrefixToNodeName("Conv2DBackpropInput-InputSizes",
-                                          "LayoutOptimizer", "-");
+  string input_name = "Conv2DBackpropInput-0-LayoutOptimizer";
   auto input_sizes_node = node_map.GetNode(input_name);
   CHECK(input_sizes_node);
   auto conv2d_backprop_node = node_map.GetNode("Conv2DBackpropInput");
@@ -171,6 +184,28 @@ TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
   test::ExpectTensorEqual<int>(input_sizes_expected, input_sizes);
 }
 
+TEST_F(LayoutOptimizerTest, Conv2DBackpropInputNonConstInputSizes) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2DBackpropInput(&s, 7, 2, "SAME", false);
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto conv2d_backprop_node = node_map.GetNode("Conv2DBackpropInput");
+  CHECK(conv2d_backprop_node);
+  EXPECT_EQ(conv2d_backprop_node->input(0),
+            "Conv2DBackpropInput-0-VecPermuteNHWCToNCHW-LayoutOptimizer");
+  auto input_sizes_node = node_map.GetNode(
+      "Conv2DBackpropInput-0-VecPermuteNHWCToNCHW-LayoutOptimizer");
+  CHECK(input_sizes_node);
+  EXPECT_EQ(input_sizes_node->input(0), "InputSizesIdentity");
+  EXPECT_EQ(input_sizes_node->op(), "DataFormatVecPermute");
+}
+
 TEST_F(LayoutOptimizerTest, FilterSizeIsOne) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   auto conv = SimpleConv2D(&s, 2, 1, "SAME");
@@ -181,8 +216,7 @@ TEST_F(LayoutOptimizerTest, FilterSizeIsOne) {
   GraphDef output;
   Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
-  EXPECT_FALSE(
-      node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input"));
+  EXPECT_TRUE(node_map.GetNode("Conv2D-0-TransposeNHWCToNCHW-LayoutOptimizer"));
 }
 
 TEST_F(LayoutOptimizerTest, FilterSizeNotOne) {
@@ -195,8 +229,7 @@ TEST_F(LayoutOptimizerTest, FilterSizeNotOne) {
   GraphDef output;
   Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
-  EXPECT_FALSE(
-      node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input"));
+  EXPECT_TRUE(node_map.GetNode("Conv2D-0-TransposeNHWCToNCHW-LayoutOptimizer"));
 }
 
 TEST_F(LayoutOptimizerTest, EqualSizeWithValidPadding) {
@@ -209,8 +242,7 @@ TEST_F(LayoutOptimizerTest, EqualSizeWithValidPadding) {
   GraphDef output;
   Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
-  EXPECT_FALSE(
-      node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input"));
+  EXPECT_TRUE(node_map.GetNode("Conv2D-0-TransposeNHWCToNCHW-LayoutOptimizer"));
 }
 
 TEST_F(LayoutOptimizerTest, EqualSizeWithSamePadding) {
@@ -223,13 +255,12 @@ TEST_F(LayoutOptimizerTest, EqualSizeWithSamePadding) {
   GraphDef output;
   Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
-  EXPECT_TRUE(
-      node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input-0"));
+  EXPECT_TRUE(node_map.GetNode("Conv2D-0-TransposeNHWCToNCHW-LayoutOptimizer"));
 }
 
 TEST_F(LayoutOptimizerTest, NotEqualSizeWithValidPadding) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
   Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
@@ -237,13 +268,12 @@ TEST_F(LayoutOptimizerTest, NotEqualSizeWithValidPadding) {
   GraphDef output;
   Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
-  EXPECT_TRUE(
-      node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input-0"));
+  EXPECT_TRUE(node_map.GetNode("Conv2D-0-TransposeNHWCToNCHW-LayoutOptimizer"));
 }
 
 TEST_F(LayoutOptimizerTest, Pad) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
   auto c = ops::Const(s.WithOpName("c"), {1, 2, 3, 4, 5, 6, 7, 8}, {4, 2});
   auto p = ops::Pad(s.WithOpName("p"), conv, c);
   auto o = ops::Identity(s.WithOpName("o"), p);
@@ -257,7 +287,7 @@ TEST_F(LayoutOptimizerTest, Pad) {
   auto pad = node_map.GetNode("p");
   EXPECT_EQ(pad->input(0), "Conv2D");
 
-  auto pad_const = node_map.GetNode("LayoutOptimizer-p-c");
+  auto pad_const = node_map.GetNode("p-1-LayoutOptimizer");
   EXPECT_TRUE(pad_const);
   EXPECT_TRUE(pad_const->attr().find("value") != pad_const->attr().end());
   Tensor tensor;
@@ -270,7 +300,7 @@ TEST_F(LayoutOptimizerTest, Pad) {
 
 TEST_F(LayoutOptimizerTest, Connectivity) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
   auto i1 = ops::Identity(s.WithOpName("i1"), conv);
   auto i2 = ops::Identity(s.WithOpName("i2"), i1);
   auto i3 = ops::Identity(s.WithOpName("i3"), i2);
@@ -298,9 +328,42 @@ TEST_F(LayoutOptimizerTest, Connectivity) {
   EXPECT_EQ(node_i2_output->input(0), "i1");
 }
 
+TEST_F(LayoutOptimizerTest, ConnectivityBinaryOpWithInputScalarAnd4D) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto i1 = ops::Identity(s.WithOpName("i1"), conv);
+  auto i2 = ops::Identity(s.WithOpName("i2"), i1);
+  auto scalar_sub = ops::Const(s.WithOpName("scalar_sub"), 3.0f, {});
+  auto sub = ops::Sub(s.WithOpName("sub"), scalar_sub, i2);
+  auto i3 = ops::Identity(s.WithOpName("i3"), sub);
+  auto i4 = ops::Identity(s.WithOpName("i4"), i3);
+  auto i5 = ops::Identity(s.WithOpName("i5"), i4);
+  auto scalar_mul = ops::Const(s.WithOpName("scalar_mul"), 3.0f, {});
+  auto mul = ops::Mul(s.WithOpName("mul"), scalar_mul, i5);
+  auto i6 = ops::Identity(s.WithOpName("i6"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  // Make the graph not in topological order to test the handling of multi-hop
+  // connectivity (here we say two nodes are connected if all nodes in the
+  // middle are layout agnostic). If the graph is already in topological order,
+  // the problem is easier, where layout optimizer only needs to check
+  // single-hop connectivity.
+  NodeMap node_map_original(&item.graph);
+  auto node_i1 = node_map_original.GetNode("i1");
+  auto node_mul = node_map_original.GetNode("mul");
+  node_mul->Swap(node_i1);
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map_output(&output);
+  auto mul_node = node_map_output.GetNode("mul");
+  EXPECT_EQ(mul_node->input(0), "scalar_mul");
+  EXPECT_EQ(mul_node->input(1), "i5");
+}
+
 TEST_F(LayoutOptimizerTest, PreserveFetch) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
   auto i = ops::Identity(s.WithOpName("i"), conv);
   GrapplerItem item;
   item.fetch.push_back("Conv2D");
@@ -315,7 +378,7 @@ TEST_F(LayoutOptimizerTest, PreserveFetch) {
 
 TEST_F(LayoutOptimizerTest, EmptyDevice) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
   Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
@@ -330,7 +393,7 @@ TEST_F(LayoutOptimizerTest, EmptyDevice) {
 TEST_F(LayoutOptimizerTest, GPUDevice) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   auto conv =
-      SimpleConv2D(&s, 3, 2, "VALID", "/job:w/replica:0/task:0/device:gpu:0");
+      SimpleConv2D(&s, 4, 2, "VALID", "/job:w/replica:0/task:0/device:gpu:0");
   Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
@@ -345,7 +408,7 @@ TEST_F(LayoutOptimizerTest, GPUDevice) {
 TEST_F(LayoutOptimizerTest, CPUDeviceLowercase) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   auto conv =
-      SimpleConv2D(&s, 3, 2, "VALID", "/job:w/replica:0/task:0/device:cpu:0");
+      SimpleConv2D(&s, 4, 2, "VALID", "/job:w/replica:0/task:0/device:cpu:0");
   Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
@@ -359,7 +422,7 @@ TEST_F(LayoutOptimizerTest, CPUDeviceLowercase) {
 
 TEST_F(LayoutOptimizerTest, CPUDeviceUppercase) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv2D(&s, 3, 2, "VALID", "/CPU:0");
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID", "/CPU:0");
   Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
@@ -401,7 +464,7 @@ TEST_F(LayoutOptimizerTest, FusedBatchNormGradTrainingFalse) {
 
 TEST_F(LayoutOptimizerTest, SplitDimC) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto conv = SimpleConv2D(&s, 5, 2, "VALID");
   auto c = ops::Const(s.WithOpName("c"), 3, {});
   auto split = ops::Split(s.WithOpName("split"), c, conv, 2);
   auto i = ops::Identity(s.WithOpName("i"), split[0]);
@@ -412,16 +475,16 @@ TEST_F(LayoutOptimizerTest, SplitDimC) {
   Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
   auto split_node = node_map.GetNode("split");
-  EXPECT_EQ(split_node->input(0), "LayoutOptimizerSplitConst-split");
+  EXPECT_EQ(split_node->input(0), "split-0-LayoutOptimizer");
   EXPECT_EQ(split_node->input(1), "Conv2D");
-  auto split_const = node_map.GetNode("LayoutOptimizerSplitConst-split");
+  auto split_const = node_map.GetNode("split-0-LayoutOptimizer");
   EXPECT_EQ(split_const->op(), "Const");
   EXPECT_EQ(split_const->attr().at({"value"}).tensor().int_val(0), 1);
 }
 
 TEST_F(LayoutOptimizerTest, SplitDimH) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto conv = SimpleConv2D(&s, 6, 2, "SAME");
   auto c = ops::Const(s.WithOpName("c"), 1, {});
   auto split = ops::Split(s.WithOpName("split"), c, conv, 2);
   auto i = ops::Identity(s.WithOpName("i"), split[0]);
@@ -432,16 +495,16 @@ TEST_F(LayoutOptimizerTest, SplitDimH) {
   Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
   auto split_node = node_map.GetNode("split");
-  EXPECT_EQ(split_node->input(0), "LayoutOptimizerSplitConst-split");
+  EXPECT_EQ(split_node->input(0), "split-0-LayoutOptimizer");
   EXPECT_EQ(split_node->input(1), "Conv2D");
-  auto split_const = node_map.GetNode("LayoutOptimizerSplitConst-split");
+  auto split_const = node_map.GetNode("split-0-LayoutOptimizer");
   EXPECT_EQ(split_const->op(), "Const");
   EXPECT_EQ(split_const->attr().at({"value"}).tensor().int_val(0), 2);
 }
 
 TEST_F(LayoutOptimizerTest, SplitDimW) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto conv = SimpleConv2D(&s, 5, 2, "VALID");
   auto c = ops::Const(s.WithOpName("c"), 2, {});
   auto split = ops::Split(s.WithOpName("split"), c, conv, 2);
   auto i = ops::Identity(s.WithOpName("i"), split[0]);
@@ -452,16 +515,16 @@ TEST_F(LayoutOptimizerTest, SplitDimW) {
   Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
   auto split_node = node_map.GetNode("split");
-  EXPECT_EQ(split_node->input(0), "LayoutOptimizerSplitConst-split");
+  EXPECT_EQ(split_node->input(0), "split-0-LayoutOptimizer");
   EXPECT_EQ(split_node->input(1), "Conv2D");
-  auto split_const = node_map.GetNode("LayoutOptimizerSplitConst-split");
+  auto split_const = node_map.GetNode("split-0-LayoutOptimizer");
   EXPECT_EQ(split_const->op(), "Const");
   EXPECT_EQ(split_const->attr().at({"value"}).tensor().int_val(0), 3);
 }
 
 TEST_F(LayoutOptimizerTest, SplitDimN) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto conv = SimpleConv2D(&s, 5, 2, "VALID");
   auto c = ops::Const(s.WithOpName("c"), 0, {});
   auto split = ops::Split(s.WithOpName("split"), c, conv, 2);
   auto i = ops::Identity(s.WithOpName("i"), split[0]);
@@ -472,16 +535,16 @@ TEST_F(LayoutOptimizerTest, SplitDimN) {
   Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
   auto split_node = node_map.GetNode("split");
-  EXPECT_EQ(split_node->input(0), "LayoutOptimizerSplitConst-split");
+  EXPECT_EQ(split_node->input(0), "split-0-LayoutOptimizer");
   EXPECT_EQ(split_node->input(1), "Conv2D");
-  auto split_const = node_map.GetNode("LayoutOptimizerSplitConst-split");
+  auto split_const = node_map.GetNode("split-0-LayoutOptimizer");
   EXPECT_EQ(split_const->op(), "Const");
   EXPECT_EQ(split_const->attr().at({"value"}).tensor().int_val(0), 0);
 }
 
 TEST_F(LayoutOptimizerTest, SplitNonConstDim) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto conv = SimpleConv2D(&s, 5, 2, "VALID");
   auto c = ops::Const(s.WithOpName("c"), 0, {});
   auto i1 = ops::Identity(s.WithOpName("i1"), c);
   auto split = ops::Split(s.WithOpName("split"), i1, conv, 2);
@@ -493,11 +556,552 @@ TEST_F(LayoutOptimizerTest, SplitNonConstDim) {
   Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
   auto split_node = node_map.GetNode("split");
-  EXPECT_EQ(split_node->input(0), "i1");
-  EXPECT_EQ(split_node->input(1),
-            "LayoutOptimizerTransposeNCHWToNHWC-Conv2D-split");
+  EXPECT_EQ(split_node->input(0), "split-0-DimMapNHWCToNCHW-LayoutOptimizer");
+  EXPECT_EQ(split_node->input(1), "Conv2D");
+  auto map_node = node_map.GetNode("split-0-DimMapNHWCToNCHW-LayoutOptimizer");
+  EXPECT_EQ(map_node->op(), "DataFormatDimMap");
+  EXPECT_EQ(map_node->input(0), "i1");
+}
+
+TEST_F(LayoutOptimizerTest, SplitSamePortToMultipleInputsOfSameNode) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 5, 2, "VALID");
+  auto axis = ops::Const(s.WithOpName("axis"), 3);
+  auto split = ops::Split(s.WithOpName("split"), axis, conv, 2);
+  auto concat =
+      ops::Concat(s.WithOpName("concat"), {split[1], split[1], split[1]}, axis);
+  auto o = ops::Identity(s.WithOpName("o"), concat);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto concat_node = node_map.GetNode("concat");
+  EXPECT_EQ(concat_node->input(0), "split:1");
+  EXPECT_EQ(concat_node->input(1), "split:1");
+  EXPECT_EQ(concat_node->input(2), "split:1");
+  EXPECT_EQ(concat_node->input(3), "concat-3-LayoutOptimizer");
+  auto concat_dim = node_map.GetNode("concat-3-LayoutOptimizer");
+  EXPECT_EQ(concat_dim->attr().at({"value"}).tensor().int_val(0), 1);
+}
+
+TEST_F(LayoutOptimizerTest, ConcatDimH) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "SAME");
+  auto axis = ops::Const(s.WithOpName("axis"), 1);
+  auto split = ops::Split(s.WithOpName("split"), axis, conv, 2);
+  auto concat = ops::Concat(s.WithOpName("concat"), {split[0], split[1]}, axis);
+  auto o = ops::Identity(s.WithOpName("o"), concat);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto concat_node = node_map.GetNode("concat");
+  EXPECT_EQ(concat_node->input(0), "split");
+  EXPECT_EQ(concat_node->input(1), "split:1");
+  EXPECT_EQ(concat_node->input(2), "concat-2-LayoutOptimizer");
+  auto concat_dim = node_map.GetNode("concat-2-LayoutOptimizer");
+  EXPECT_EQ(concat_dim->attr().at({"value"}).tensor().int_val(0), 2);
+}
+
+TEST_F(LayoutOptimizerTest, ConcatNonConst) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "SAME");
+  auto axis = ops::Const(s.WithOpName("axis"), 1);
+  auto i = ops::Identity(s.WithOpName("i"), axis);
+  auto split = ops::Split(s.WithOpName("split"), axis, conv, 2);
+  auto concat = ops::Concat(s.WithOpName("concat"), {split[0], split[1]}, i);
+  auto o = ops::Identity(s.WithOpName("o"), concat);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto concat_node = node_map.GetNode("concat");
+  EXPECT_EQ(concat_node->input(0), "split");
+  EXPECT_EQ(concat_node->input(1), "split:1");
+  EXPECT_EQ(concat_node->input(2), "concat-2-DimMapNHWCToNCHW-LayoutOptimizer");
+  auto concat_dim =
+      node_map.GetNode("concat-2-DimMapNHWCToNCHW-LayoutOptimizer");
+  EXPECT_EQ(concat_dim->op(), "DataFormatDimMap");
+  EXPECT_EQ(concat_dim->input(0), "i");
+}
+
+TEST_F(LayoutOptimizerTest, ConcatDimW) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "SAME");
+  auto axis = ops::Const(s.WithOpName("axis"), 2);
+  auto split = ops::Split(s.WithOpName("split"), axis, conv, 2);
+  auto concat = ops::Concat(s.WithOpName("concat"), {split[0], split[1]}, axis);
+  auto o = ops::Identity(s.WithOpName("o"), concat);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto concat_node = node_map.GetNode("concat");
+  EXPECT_EQ(concat_node->input(0), "split");
+  EXPECT_EQ(concat_node->input(1), "split:1");
+  EXPECT_EQ(concat_node->input(2), "concat-2-LayoutOptimizer");
+  auto concat_dim = node_map.GetNode("concat-2-LayoutOptimizer");
+  EXPECT_EQ(concat_dim->attr().at({"value"}).tensor().int_val(0), 3);
+}
+
+TEST_F(LayoutOptimizerTest, ConcatDimN) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto axis = ops::Const(s.WithOpName("axis"), 0);
+  auto split = ops::Split(s.WithOpName("split"), axis, conv, 2);
+  auto concat = ops::Concat(s.WithOpName("concat"), {split[0], split[1]}, axis);
+  auto o = ops::Identity(s.WithOpName("o"), concat);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto concat_node = node_map.GetNode("concat");
+  EXPECT_EQ(concat_node->input(0), "split");
+  EXPECT_EQ(concat_node->input(1), "split:1");
+  EXPECT_EQ(concat_node->input(2), "concat-2-LayoutOptimizer");
+  auto concat_dim = node_map.GetNode("concat-2-LayoutOptimizer");
+  EXPECT_EQ(concat_dim->attr().at({"value"}).tensor().int_val(0), 0);
+}
+
+TEST_F(LayoutOptimizerTest, ConcatDimC) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto axis = ops::Const(s.WithOpName("axis"), 3);
+  auto split = ops::Split(s.WithOpName("split"), axis, conv, 2);
+  auto concat = ops::Concat(s.WithOpName("concat"), {split[0], split[1]}, axis);
+  auto o = ops::Identity(s.WithOpName("o"), concat);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto concat_node = node_map.GetNode("concat");
+  EXPECT_EQ(concat_node->input(0), "split");
+  EXPECT_EQ(concat_node->input(1), "split:1");
+  EXPECT_EQ(concat_node->input(2), "concat-2-LayoutOptimizer");
+  auto concat_dim = node_map.GetNode("concat-2-LayoutOptimizer");
+  EXPECT_EQ(concat_dim->attr().at({"value"}).tensor().int_val(0), 1);
+}
+
+TEST_F(LayoutOptimizerTest, Sum) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto reduction_indices =
+      ops::Const(s.WithOpName("reduction_indices"), {0, 1, 2}, {3});
+  auto sum = ops::Sum(s.WithOpName("sum"), conv, reduction_indices);
+  auto o = ops::Identity(s.WithOpName("o"), sum);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  // TODO(yaozhang): enable SumProcessor with auto-tuning. Currently disabled
+  // because of the worse performance in some cases.
+  /*
+  NodeMap node_map(&output);
+  auto sum_node = node_map.GetNode("sum");
+  EXPECT_EQ(sum_node->input(0), "Conv2D");
+  EXPECT_EQ(sum_node->input(1), "LayoutOptimizer-sum-reduction_indices");
+  auto sum_const = node_map.GetNode("LayoutOptimizer-sum-reduction_indices");
+  Tensor tensor;
+  EXPECT_TRUE(
+      tensor.FromProto(sum_const->mutable_attr()->at({"value"}).tensor()));
+  Tensor tensor_expected(DT_INT32, {3});
+  test::FillValues<int>(&tensor_expected, {0, 2, 3});
+  test::ExpectTensorEqual<int>(tensor_expected, tensor);
+  */
+}
+
+TEST_F(LayoutOptimizerTest, MulScalarAnd4D) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto scalar = ops::Const(s.WithOpName("scalar"), 3.0f, {});
+  auto mul = ops::Mul(s.WithOpName("mul"), scalar, conv);
+  auto o = ops::Identity(s.WithOpName("o"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto mul_node = node_map.GetNode("mul");
+  EXPECT_EQ(mul_node->input(0), "scalar");
+  EXPECT_EQ(mul_node->input(1), "Conv2D");
+}
+
+TEST_F(LayoutOptimizerTest, Mul4DAndScalar) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto scalar = ops::Const(s.WithOpName("scalar"), 3.0f, {});
+  auto mul = ops::Mul(s.WithOpName("mul"), conv, scalar);
+  auto o = ops::Identity(s.WithOpName("o"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto mul_node = node_map.GetNode("mul");
+  EXPECT_EQ(mul_node->input(0), "Conv2D");
+  EXPECT_EQ(mul_node->input(1), "scalar");
+}
+
+TEST_F(LayoutOptimizerTest, Mul4DAndUnknownRank) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto unknown_rank =
+      ops::Placeholder(s.WithOpName("unknown"), DT_FLOAT,
+                       ops::Placeholder::Shape(PartialTensorShape()));
+  Output c = ops::Const(s.WithOpName("c"), 3.0f, {8, 2, 2, 2});
+  Output mul = ops::Mul(s.WithOpName("mul"), conv, unknown_rank);
+  auto o = ops::AddN(s.WithOpName("o"), {mul, c});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto mul_node = node_map.GetNode("mul");
+  // Node mul should not be processed by layout optimizer, because one of its
+  // inputs is of unknown rank.
+  EXPECT_EQ(mul_node->input(0),
+            "Conv2D-0-0-TransposeNCHWToNHWC-LayoutOptimizer");
+  EXPECT_EQ(mul_node->input(1), "unknown");
 }
 
+TEST_F(LayoutOptimizerTest, Mul4DAnd4D) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto i = ops::Identity(s.WithOpName("i"), conv);
+  auto mul = ops::Mul(s.WithOpName("mul"), conv, i);
+  auto o = ops::Identity(s.WithOpName("o"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto mul_node = node_map.GetNode("mul");
+  EXPECT_EQ(mul_node->input(0), "Conv2D");
+  EXPECT_EQ(mul_node->input(1), "i");
+}
+
+TEST_F(LayoutOptimizerTest, Mul4DAndVector) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto vector = ops::Const(s.WithOpName("vector"), {3.0f, 7.0f}, {2});
+  auto mul = ops::Mul(s.WithOpName("mul"), conv, vector);
+  auto o = ops::Identity(s.WithOpName("o"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto mul_node = node_map.GetNode("mul");
+  EXPECT_EQ(mul_node->input(0), "Conv2D");
+  EXPECT_EQ(mul_node->input(1), "mul-1-ReshapeNHWCToNCHW-LayoutOptimizer");
+  auto mul_const = node_map.GetNode("mul-1-ReshapeConst-LayoutOptimizer");
+  Tensor tensor;
+  EXPECT_TRUE(
+      tensor.FromProto(mul_const->mutable_attr()->at({"value"}).tensor()));
+  Tensor tensor_expected(DT_INT32, {4});
+  test::FillValues<int>(&tensor_expected, {1, 2, 1, 1});
+  test::ExpectTensorEqual<int>(tensor_expected, tensor);
+}
+
+TEST_F(LayoutOptimizerTest, MulVectorAnd4D) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto vector = ops::Const(s.WithOpName("vector"), {3.0f, 7.0f}, {2});
+  auto mul = ops::Mul(s.WithOpName("mul"), vector, conv);
+  auto o = ops::Identity(s.WithOpName("o"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto mul_node = node_map.GetNode("mul");
+  EXPECT_EQ(mul_node->input(0), "mul-0-ReshapeNHWCToNCHW-LayoutOptimizer");
+  EXPECT_EQ(mul_node->input(1), "Conv2D");
+  auto mul_const = node_map.GetNode("mul-0-ReshapeConst-LayoutOptimizer");
+  Tensor tensor;
+  EXPECT_TRUE(
+      tensor.FromProto(mul_const->mutable_attr()->at({"value"}).tensor()));
+  Tensor tensor_expected(DT_INT32, {4});
+  test::FillValues<int>(&tensor_expected, {1, 2, 1, 1});
+  test::ExpectTensorEqual<int>(tensor_expected, tensor);
+}
+
+TEST_F(LayoutOptimizerTest, SliceConst) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 5, 2, "VALID");
+  auto begin = ops::Const(s.WithOpName("begin"), {0, 2, 3, 1}, {4});
+  auto size = ops::Const(s.WithOpName("size"), {4, 1, 2, 4}, {4});
+  auto slice = ops::Slice(s.WithOpName("slice"), conv, begin, size);
+  auto o = ops::Identity(s.WithOpName("o"), slice);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto slice_node = node_map.GetNode("slice");
+  EXPECT_EQ(slice_node->input(0), "Conv2D");
+  EXPECT_EQ(slice_node->input(1), "slice-1-LayoutOptimizer");
+  EXPECT_EQ(slice_node->input(2), "slice-2-LayoutOptimizer");
+
+  auto begin_const = node_map.GetNode("slice-1-LayoutOptimizer");
+  Tensor begin_tensor;
+  EXPECT_TRUE(begin_tensor.FromProto(
+      begin_const->mutable_attr()->at({"value"}).tensor()));
+  Tensor begin_tensor_expected(DT_INT32, {4});
+  test::FillValues<int>(&begin_tensor_expected, {0, 1, 2, 3});
+  test::ExpectTensorEqual<int>(begin_tensor_expected, begin_tensor);
+
+  auto size_const = node_map.GetNode("slice-2-LayoutOptimizer");
+  Tensor size_tensor;
+  EXPECT_TRUE(size_tensor.FromProto(
+      size_const->mutable_attr()->at({"value"}).tensor()));
+  Tensor size_tensor_expected(DT_INT32, {4});
+  test::FillValues<int>(&size_tensor_expected, {4, 4, 1, 2});
+  test::ExpectTensorEqual<int>(size_tensor_expected, size_tensor);
+}
+
+TEST_F(LayoutOptimizerTest, SliceNonConst) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 5, 2, "VALID");
+  auto begin = ops::Const(s.WithOpName("begin"), {0, 2, 3, 1}, {4});
+  auto ibegin = ops::Identity(s.WithOpName("ibegin"), begin);
+  auto size = ops::Const(s.WithOpName("size"), {4, 1, 2, 4}, {4});
+  auto isize = ops::Identity(s.WithOpName("isize"), size);
+  auto slice = ops::Slice(s.WithOpName("slice"), conv, ibegin, isize);
+  auto o = ops::Identity(s.WithOpName("o"), slice);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto slice_node = node_map.GetNode("slice");
+  EXPECT_EQ(slice_node->input(0), "Conv2D");
+  EXPECT_EQ(slice_node->input(1),
+            "slice-1-VecPermuteNHWCToNCHW-LayoutOptimizer");
+  EXPECT_EQ(slice_node->input(2),
+            "slice-2-VecPermuteNHWCToNCHW-LayoutOptimizer");
+  auto perm1 = node_map.GetNode("slice-1-VecPermuteNHWCToNCHW-LayoutOptimizer");
+  EXPECT_EQ(perm1->op(), "DataFormatVecPermute");
+  EXPECT_EQ(perm1->input(0), "ibegin");
+  auto perm2 = node_map.GetNode("slice-2-VecPermuteNHWCToNCHW-LayoutOptimizer");
+  EXPECT_EQ(perm1->op(), "DataFormatVecPermute");
+  EXPECT_EQ(perm2->input(0), "isize");
+}
+
+TEST_F(LayoutOptimizerTest, DoNotApplyOptimizerTwice) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto scalar =
+      ops::Const(s.WithOpName("AlreadyApplied-LayoutOptimizer"), 3.0f, {});
+  auto mul = ops::Mul(s.WithOpName("mul"), scalar, scalar);
+  auto o = ops::Identity(s.WithOpName("o"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  EXPECT_TRUE(errors::IsInvalidArgument(status));
+}
+
+TEST_F(LayoutOptimizerTest, ShapeNWithInputs4DAnd4D) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto shapen = ops::ShapeN(s.WithOpName("shapen"), {conv, conv});
+  auto add = ops::Add(s.WithOpName("add"), shapen[0], shapen[1]);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto shapen_node = node_map.GetNode("shapen");
+  EXPECT_EQ(shapen_node->input(0), "Conv2D");
+  EXPECT_EQ(shapen_node->input(1), "Conv2D");
+  auto add_node = node_map.GetNode("add");
+  EXPECT_EQ(add_node->input(0),
+            "shapen-0-0-VecPermuteNCHWToNHWC-LayoutOptimizer");
+  EXPECT_EQ(add_node->input(1),
+            "shapen-0-1-VecPermuteNCHWToNHWC-LayoutOptimizer");
+  auto vec_permute1 =
+      node_map.GetNode("shapen-0-0-VecPermuteNCHWToNHWC-LayoutOptimizer");
+  EXPECT_EQ(vec_permute1->input(0), "shapen");
+  EXPECT_EQ(vec_permute1->op(), "DataFormatVecPermute");
+  auto vec_permute2 =
+      node_map.GetNode("shapen-0-1-VecPermuteNCHWToNHWC-LayoutOptimizer");
+  EXPECT_EQ(vec_permute2->input(0), "shapen:1");
+  EXPECT_EQ(vec_permute2->op(), "DataFormatVecPermute");
+}
+
+TEST_F(LayoutOptimizerTest, ShapeNWithInputsVectorAnd4D) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto vector = ops::Const(s.WithOpName("vector"), 3.0f, {7});
+  auto shapen = ops::ShapeN(s.WithOpName("shapen"), {vector, conv});
+  auto add = ops::Add(s.WithOpName("add"), shapen[0], shapen[1]);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto shapen_node = node_map.GetNode("shapen");
+  EXPECT_EQ(shapen_node->input(0), "vector");
+  EXPECT_EQ(shapen_node->input(1), "Conv2D");
+  auto add_node = node_map.GetNode("add");
+  EXPECT_EQ(add_node->input(0), "shapen");
+  EXPECT_EQ(add_node->input(1),
+            "shapen-0-1-VecPermuteNCHWToNHWC-LayoutOptimizer");
+  auto vec_permute =
+      node_map.GetNode("shapen-0-1-VecPermuteNCHWToNHWC-LayoutOptimizer");
+  EXPECT_EQ(vec_permute->input(0), "shapen:1");
+  EXPECT_EQ(vec_permute->op(), "DataFormatVecPermute");
+}
+
+TEST_F(LayoutOptimizerTest, ShapeNWithInputs4DAndNoNeedToTransform4D) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto tensor_4d = ops::Const(s.WithOpName("tensor_4d"), 3.0f, {1, 1, 1, 3});
+  auto i1 = ops::Identity(s.WithOpName("i1"), tensor_4d);
+  Output i2 = ops::Identity(s.WithOpName("i2"), i1);
+  auto shapen = ops::ShapeN(s.WithOpName("shapen"), {conv, i2});
+  auto add = ops::Add(s.WithOpName("add"), shapen[0], shapen[1]);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto shapen_node = node_map.GetNode("shapen");
+  EXPECT_EQ(shapen_node->input(0), "Conv2D");
+  EXPECT_EQ(shapen_node->input(1), "i2");
+}
+
+TEST_F(LayoutOptimizerTest, Switch) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  ops::Variable ctrl(s.WithOpName("ctrl"), {}, DT_BOOL);
+  auto sw = ops::Switch(s.WithOpName("switch"), conv, ctrl);
+  auto i1 = ops::Identity(s.WithOpName("i1"), sw.output_true);
+  auto i2 = ops::Identity(s.WithOpName("i2"), sw.output_false);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto switch_node = node_map.GetNode("switch");
+  EXPECT_EQ(switch_node->input(0), "Conv2D");
+  EXPECT_EQ(switch_node->input(1), "ctrl");
+  auto i1_node = node_map.GetNode("i1");
+  auto i2_node = node_map.GetNode("i2");
+  auto trans1 = node_map.GetNode(i1_node->input(0));
+  EXPECT_EQ(trans1->input(0), "switch:1");
+  auto trans2 = node_map.GetNode(i2_node->input(0));
+  EXPECT_EQ(trans2->input(0), "switch");
+}
+
+TEST_F(LayoutOptimizerTest, MergeBothInputsConvertible) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  Output i1 = ops::Identity(s.WithOpName("i1"), conv);
+  auto merge = ops::Merge(s.WithOpName("merge"), {conv, i1});
+  auto i2 = ops::Identity(s.WithOpName("i2"), merge.output);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto merge_node = node_map.GetNode("merge");
+  EXPECT_EQ(merge_node->input(0), "Conv2D");
+  EXPECT_EQ(merge_node->input(1), "i1");
+  auto i2_node = node_map.GetNode("i2");
+  EXPECT_EQ(i2_node->input(0), "merge-0-0-TransposeNCHWToNHWC-LayoutOptimizer");
+  auto transpose =
+      node_map.GetNode("merge-0-0-TransposeNCHWToNHWC-LayoutOptimizer");
+  EXPECT_EQ(transpose->input(0), "merge");
+}
+
+TEST_F(LayoutOptimizerTest, MergeOneInputNotConvertible) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto tensor_4d = ops::Const(s.WithOpName("tensor_4d"), 3.0f, {1, 1, 1, 3});
+  auto merge = ops::Merge(s.WithOpName("merge"), {tensor_4d, conv});
+  auto i2 = ops::Identity(s.WithOpName("i2"), merge.output);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto merge_node = node_map.GetNode("merge");
+  EXPECT_EQ(merge_node->input(0), "tensor_4d");
+  EXPECT_EQ(merge_node->input(1),
+            "Conv2D-0-1-TransposeNCHWToNHWC-LayoutOptimizer");
+}
+
+TEST_F(LayoutOptimizerTest, Complex) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto comp = ops::Complex(s.WithOpName("complex"), conv, conv);
+  auto i = ops::Identity(s.WithOpName("i"), comp);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto merge_node = node_map.GetNode("complex");
+  EXPECT_EQ(merge_node->input(0), "Conv2D");
+  EXPECT_EQ(merge_node->input(1), "Conv2D");
+  auto trans =
+      node_map.GetNode("complex-0-0-TransposeNCHWToNHWC-LayoutOptimizer");
+  EXPECT_EQ(trans->attr().at("T").type(), DT_COMPLEX64);
+}
+
+TEST_F(LayoutOptimizerTest, IdentityNWithInputsVectorAnd4D) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto vector = ops::Const(s.WithOpName("vector"), 3.0f, {2});
+  auto identity_n = ops::IdentityN(s.WithOpName("identity_n"), {vector, conv});
+  auto add = ops::Add(s.WithOpName("add"), identity_n[0], identity_n[1]);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto i = node_map.GetNode("identity_n");
+  EXPECT_EQ(i->input(0), "vector");
+  EXPECT_EQ(i->input(1), "Conv2D");
+  auto trans =
+      node_map.GetNode("identity_n-0-1-TransposeNCHWToNHWC-LayoutOptimizer");
+  EXPECT_EQ(trans->input(0), "identity_n:1");
+  auto add_node = node_map.GetNode("add");
+  EXPECT_EQ(add_node->input(0), "identity_n");
+  EXPECT_EQ(add_node->input(1),
+            "identity_n-0-1-TransposeNCHWToNHWC-LayoutOptimizer");
+}
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.cc b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..102526e22f4742cb90757a1daf55467dd16afc3e
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/loop_optimizer.h"
+
+#include <unordered_map>
+#include <unordered_set>
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace tensorflow {
+namespace grappler {
+
+Status LoopOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
+                               GraphDef* optimized_graph) {
+  *optimized_graph = item.graph;
+
+  return Status::OK();
+}
+
+void LoopOptimizer::Feedback(Cluster* /*cluster*/, const GrapplerItem& /*item*/,
+                             const GraphDef& /*optimized_graph*/,
+                             double /*result*/) {
+  // Nothing to do for LoopOptimizer.
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.h b/tensorflow/core/grappler/optimizers/loop_optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..106d4628ae68f3c92ab597f903f96a6af8a64b8d
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.h
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_LOOP_OPTIMIZER_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_LOOP_OPTIMIZER_H_
+
+#include <unordered_set>
+#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class LoopOptimizer : public GraphOptimizer {
+ public:
+  LoopOptimizer() : opt_level_(RewriterConfig::ON) {}
+  explicit LoopOptimizer(RewriterConfig::Toggle opt_level)
+      : opt_level_(opt_level) {}
+  ~LoopOptimizer() override {}
+
+  string name() const override { return "loop_optimizer"; };
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* optimized_graph) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimized_graph, double result) override;
+
+ private:
+  RewriterConfig::Toggle opt_level_;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_LOOP_OPTIMIZER_H_
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c09434f60916b9bf269b0f5006b8a3732afaa5fc
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
@@ -0,0 +1,62 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/loop_optimizer.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class LoopOptimizerTest : public ::testing::Test {};
+
+void VerifyGraphsEqual(const GraphDef& original_graph,
+                       const GraphDef& optimized_graph, const string& func) {
+  EXPECT_EQ(original_graph.node_size(), optimized_graph.node_size()) << func;
+  for (int i = 0; i < original_graph.node_size(); ++i) {
+    const NodeDef& original = original_graph.node(i);
+    const NodeDef& optimized = optimized_graph.node(i);
+    EXPECT_EQ(original.name(), optimized.name()) << func;
+    EXPECT_EQ(original.op(), optimized.op()) << func;
+    EXPECT_EQ(original.input_size(), optimized.input_size()) << func;
+    for (int j = 0; j < original.input_size(); ++j) {
+      EXPECT_EQ(original.input(j), optimized.input(j)) << func;
+    }
+  }
+}
+
+TEST_F(LoopOptimizerTest, NoOp) {
+  // This trivial graph is so basic there's nothing to optimize.
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  LoopOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  VerifyGraphsEqual(item.graph, output, __FUNCTION__);
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index 7c44ce15c6efee1ca375665976db1dc15dc01096..3057ee5fa14bd209ad4bb6a9ad690d57435601f4 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -23,7 +23,9 @@ limitations under the License.
 
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/costs/graph_memory.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/graph_view.h"
@@ -33,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/static_schedule.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "tensorflow/core/grappler/utils/traversal.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
@@ -412,6 +415,12 @@ void RecomputeSubgraph(
 void RecomputationRewritingPass(RewriterConfig::MemOptType optimization_level,
                                 const string& recomputation_targets_name_prefix,
                                 GraphDef* graph, const GrapplerItem& item) {
+  if (optimization_level != RewriterConfig::RECOMPUTATION_HEURISTICS &&
+      optimization_level != RewriterConfig::HEURISTICS &&
+      optimization_level != RewriterConfig::MANUAL) {
+    // Nothing to do
+    return;
+  }
   // The topological numberings and NodeMap will be stale as soon as we start
   // modifying the graph in RecomputeSubgraph. However, RecomputeSubgraph only
   // looks up nodes which were in the original graph, and preserves the graph
@@ -419,7 +428,7 @@ void RecomputationRewritingPass(RewriterConfig::MemOptType optimization_level,
   // We don't use the results of this topological sort until later, but this
   // call invalidates all NodeDef pointers, so it needs to be done before we
   // start collecting those.
-  TopologicalSort(graph);
+  TF_CHECK_OK(TopologicalSort(graph));
   NodeMap node_map(graph);
   std::vector<RecomputedSubGraph> recomputed_subgraphs;
   // Do not recompute nodes which are fed, since the recomputed node would not
@@ -480,31 +489,257 @@ void RecomputationRewritingPass(RewriterConfig::MemOptType optimization_level,
   }
 }
 
-std::pair<NodeDef*, NodeDef*> BuildSwapPair(NodeDef* node, int input_to_swap,
-                                            GraphDef* graph) {
+bool SchedulingPass(Cluster* cluster, GrapplerItem* item) {
+  // Look for AddN nodes (and equivalent) and record input names.
+  GraphView view(&item->graph);
+
+  std::unordered_map<string, std::unordered_set<NodeDef*>> addn_list;
+  for (NodeDef& node : *item->graph.mutable_node()) {
+    if (!IsAddN(node) && node.op() != "AccumulateNV2") {
+      continue;
+    }
+    // There is nothing to gain by optimizing nodes with 2 or fewer inputs.
+    if (view.NumFanins(node, false) <= 2) {
+      continue;
+    }
+    for (const auto& input : view.GetFanins(node, false)) {
+      if (input.node->device() == node.device()) {
+        string tensor_name =
+            strings::StrCat(input.node->name(), ":", input.port_id);
+        addn_list[tensor_name].insert(&node);
+      }
+    }
+  }
+
+  if (addn_list.empty()) {
+    return false;
+  }
+
+  GraphMemory memory(*item);
+  const std::unordered_map<string, DeviceProperties>& devices =
+      cluster->GetDevices();
+  Status s = memory.InferStatically(devices);
+  if (!s.ok()) {
+    VLOG(1) << "Failed to infer memory usage: " << s.error_message();
+    return false;
+  }
+
+  std::unordered_set<NodeDef*> addn_to_rewrite;
+  for (const auto& device : devices) {
+    const string& name = device.first;
+    const DeviceProperties& prop = device.second;
+    if (prop.memory_size() <= 0) {
+      VLOG(1) << "Available memory unknown for device " << name;
+      continue;
+    }
+    const GraphMemory::MemoryUsage& mem_usage = memory.GetPeakMemoryUsage(name);
+
+    if (mem_usage.used_memory <= prop.memory_size() * 0.8) {
+      continue;
+    }
+
+    for (const auto& live : mem_usage.live_tensors) {
+      string tensor_name = strings::StrCat(live.node, ":", live.output_id);
+      auto it = addn_list.find(tensor_name);
+      if (it != addn_list.end()) {
+        addn_to_rewrite.insert(it->second.begin(), it->second.end());
+      }
+    }
+  }
+
+  if (addn_to_rewrite.empty()) {
+    return false;
+  }
+  GraphProperties properties(*item);
+  s = properties.InferStatically(false);
+  if (!s.ok()) {
+    VLOG(1) << "Failed to infer shapes: " << s.error_message();
+    return false;
+  }
+
+  bool updated_graph = false;
+  // Rewrite the AddN.
+  for (NodeDef* node : addn_to_rewrite) {
+    if (!properties.HasOutputProperties(node->name())) {
+      VLOG(1) << "Missing properties for " << node->name();
+      continue;
+    }
+    const TensorShapeProto& shape =
+        properties.GetOutputProperties(node->name())[0].shape();
+    PartialTensorShape shp(shape);
+    if (!shp.IsFullyDefined()) {
+      VLOG(1) << "Shape not fully known for " << node->name();
+      continue;
+    }
+
+    // Compute a topological ordering for the node fanin.
+    std::unordered_map<NodeDef*, int> topo_order;
+    ReverseDfs(view, {node}, nullptr,
+               [&topo_order](NodeDef* n) {
+                 int topo_index = topo_order.size();
+                 topo_order[n] = topo_index;
+               },
+               nullptr);
+
+    std::vector<int> input_topo_index;
+
+    for (int i = 0; i < node->input_size(); ++i) {
+      const string& input = node->input(i);
+      const string node_name = NodeName(input);
+      NodeDef* node = view.GetNode(node_name);
+      input_topo_index.push_back(topo_order.at(node));
+    }
+    int min_input_topo_index = INT_MAX;
+    int min_input_id = -1;
+    for (int i = 0; i < node->input_size(); ++i) {
+      if (IsControlInput(node->input(i))) {
+        // control inputs are always last.
+        break;
+      }
+      const int current = input_topo_index[i];
+      if (current < min_input_topo_index) {
+        min_input_topo_index = current;
+        min_input_id = i;
+      }
+    }
+    CHECK_LE(0, min_input_id);
+    std::vector<string> pre_ctrl_deps;
+    std::vector<string> post_ctrl_deps;
+    for (int i = node->input_size() - 1; i >= 0; --i) {
+      if (!IsControlInput(node->input(i))) {
+        // control inputs are always last.
+        break;
+      }
+      if (input_topo_index[i] < min_input_topo_index) {
+        // These control dependencies can be executed before the node.
+        pre_ctrl_deps.push_back(node->input(i));
+      } else {
+        // These control dependencies should be executed after the node.
+        post_ctrl_deps.push_back(node->input(i));
+      }
+    }
+
+    DataType dtype = node->attr().at("T").type();
+    const string& device = node->device();
+
+    // Create the temporary variable that will hold intermediate results
+    NodeDef* tmp_var = item->graph.add_node();
+    tmp_var->set_name(strings::StrCat(node->name(), "/tmp_var"));
+    tmp_var->set_op("TemporaryVariable");
+    tmp_var->set_device(device);
+    (*tmp_var->mutable_attr())["dtype"].set_type(dtype);
+    *(*tmp_var->mutable_attr())["shape"].mutable_shape() = shape;
+    (*tmp_var->mutable_attr())["var_name"].set_s(tmp_var->name());
+
+    for (const string& ctrl_dep : pre_ctrl_deps) {
+      *tmp_var->add_input() = ctrl_dep;
+    }
+    *tmp_var->add_input() =
+        AsControlDependency(NodeName(node->input(min_input_id)));
+
+    // Initialize it to zero
+    NodeDef* zeros = item->graph.add_node();
+    zeros->set_name(strings::StrCat(node->name(), "/tmp_var_zeros"));
+    zeros->set_op("ZerosLike");
+    zeros->set_device(device);
+    (*zeros->mutable_attr())["T"].set_type(dtype);
+    *zeros->add_input() = node->input(min_input_id);
+
+    NodeDef* initialize = item->graph.add_node();
+    initialize->set_name(strings::StrCat(node->name(), "/tmp_var_initializer"));
+    initialize->set_op("Assign");
+    initialize->set_device(device);
+    (*initialize->mutable_attr())["T"].set_type(dtype);
+    (*initialize->mutable_attr())["use_locking"].set_b(false);
+    (*initialize->mutable_attr())["validate_shape"].set_b(false);
+    *initialize->add_input() = tmp_var->name();
+    *initialize->add_input() = zeros->name();
+
+    // Add the assignadd nodes
+    std::vector<NodeDef*> accumulates;
+    for (int i = 0; i < node->input_size(); ++i) {
+      const string& input = node->input(i);
+      if (!IsControlInput(input)) {
+        NodeDef* accumulate = item->graph.add_node();
+        accumulate->set_name(
+            strings::StrCat(node->name(), "/tmp_var_accum_", i));
+        accumulate->set_op("AssignAdd");
+        accumulate->set_device(device);
+        (*accumulate->mutable_attr())["T"].set_type(dtype);
+        (*accumulate->mutable_attr())["use_locking"].set_b(true);
+        *accumulate->add_input() = initialize->name();
+        *accumulate->add_input() = input;
+        accumulates.push_back(accumulate);
+      }
+    }
+
+    // Rewrite the AddN node as a DestroyTemporaryVariable ops
+    node->set_op("DestroyTemporaryVariable");
+    node->clear_input();
+    node->clear_attr();
+    (*node->mutable_attr())["T"].set_type(dtype);
+    (*node->mutable_attr())["var_name"].set_s(tmp_var->name());
+    *node->add_input() = initialize->name();
+    for (const NodeDef* accum : accumulates) {
+      *node->add_input() = AsControlDependency(accum->name());
+    }
+    for (const string& ctrl_dep : post_ctrl_deps) {
+      *node->add_input() = ctrl_dep;
+    }
+
+    updated_graph = true;
+  }
+
+  return updated_graph;
+}
+
+Status BuildSwapPair(NodeDef* node, int input_to_swap,
+                     const std::unordered_map<string, const NodeDef*>& name_map,
+                     GraphDef* graph,
+                     std::pair<NodeDef*, NodeDef*>* swap_pair) {
+  const OpDef* op_def;
+  TF_RETURN_IF_ERROR(OpRegistry::Global()->LookUpOpDef(node->op(), &op_def));
+  DataType input_type;
+  TF_RETURN_IF_ERROR(
+      InputTypeForNode(*node, *op_def, input_to_swap, &input_type));
+  if (IsRefType(input_type)) {
+    return errors::InvalidArgument("Can't swap input ", input_to_swap,
+                                   " of node ", node->name(),
+                                   " since it expects a reference");
+  }
+
   string tensor_to_swap = strings::StrCat(node->name(), "_", input_to_swap);
+  string swap_out_name = strings::StrCat("swap_out_", tensor_to_swap);
+  string swap_in_name = strings::StrCat("swap_in_", tensor_to_swap);
+  if (name_map.find(swap_out_name) != name_map.end() ||
+      name_map.find(swap_in_name) != name_map.end()) {
+    return errors::InvalidArgument("Input ", input_to_swap, " of node ",
+                                   node->name(), " is already swapped");
+  }
 
   // Force the tensor to be copied to cpu.
   NodeDef* swap_out_node = graph->add_node();
-  swap_out_node->set_name(strings::StrCat("swap_out_", tensor_to_swap));
+  swap_out_node->set_name(swap_out_name);
   swap_out_node->set_op("Identity");
-  swap_out_node->set_device("/CPU");
+  swap_out_node->set_device("/device:CPU:0");
 
   // Force the tensor to be restored to the device.
   NodeDef* swap_in_node = graph->add_node();
-  swap_in_node->set_name(strings::StrCat("swap_in_", tensor_to_swap));
+  swap_in_node->set_name(swap_in_name);
   swap_in_node->set_op("Identity");
   *swap_in_node->add_input() = swap_out_node->name();
 
   // Colocate the swap_in_ node with the node itself.
+  swap_in_node->set_device(node->device());
   string coloc_group = strings::StrCat("loc@", tensor_to_swap);
   (*swap_in_node->mutable_attr())["_class"].mutable_list()->add_s(coloc_group);
   (*node->mutable_attr())["_class"].mutable_list()->add_s(coloc_group);
 
-  const DataType input_type = node->attr().at("T").type();
   (*swap_in_node->mutable_attr())["T"].set_type(input_type);
   (*swap_out_node->mutable_attr())["T"].set_type(input_type);
-  return std::make_pair(swap_out_node, swap_in_node);
+  *swap_pair = std::make_pair(swap_out_node, swap_in_node);
+
+  return Status::OK();
 }
 
 static int64 EstimateSize(const OpInfo::TensorProperties& t) {
@@ -531,7 +766,7 @@ struct SwapInfo {
   Costs::NanoSeconds time_to_swap = 0;
 };
 
-static const NodeDef* FindSwapTrigger(
+static const NodeDef* FindSwapInTrigger(
     const NodeDef* node, const SwapInfo& swap_info,
     const std::unordered_map<string, const NodeDef*>& name_map,
     const std::unordered_map<const NodeDef*, Costs::NanoSeconds>&
@@ -568,9 +803,12 @@ static const NodeDef* FindSwapTrigger(
   max_trigger_time -= swap_info.time_to_swap;
 
   std::map<Costs::NanoSeconds, const NodeDef*> candidates;
+  std::set<string> already_processed;
+
   while (!possible_inputs.empty()) {
     const string input_node_name = *possible_inputs.begin();
     possible_inputs.erase(possible_inputs.begin());
+    already_processed.insert(input_node_name);
     auto it1 = name_map.find(input_node_name);
     if (it1 == name_map.end()) {
       return nullptr;
@@ -579,7 +817,7 @@ static const NodeDef* FindSwapTrigger(
     // Don't jump over frames, since adding a control dependency from one frame
     // to the next isn't supported. Don't go through branches, since we don't
     // know whether they'll be executed or not.
-    if (IsNextIteration(*input_node) || IsSwitch(*input_node) ||
+    if (ModifiesFrameInfo(*input_node) || IsSwitch(*input_node) ||
         IsMerge(*input_node)) {
       continue;
     }
@@ -591,7 +829,10 @@ static const NodeDef* FindSwapTrigger(
       candidates[it2->second] = input_node;
     } else {
       for (const string& fanin : input_node->input()) {
-        possible_inputs.insert(NodeName(fanin));
+        string name = NodeName(fanin);
+        if (already_processed.find(name) == already_processed.end()) {
+          possible_inputs.insert(name);
+        }
       }
     }
   }
@@ -605,16 +846,113 @@ static const NodeDef* FindSwapTrigger(
   return nullptr;
 }
 
-static void IdentifySwappingCandidates(Cluster* cluster,
-                                       const GrapplerItem& item,
-                                       GraphDef* optimized_graph) {
-  GraphMemory memory(item);
+static bool IsSwappable(const GraphView& graph, GraphView::OutputPort output) {
+  const NodeDef& node = *output.node;
+  // There is no point in swapping out persistent tensors, since the tensor will
+  // continue to use memory.
+  if (IsPersistent(node)) {
+    return false;
+  }
+
+  const OpDef* op_def;
+  if (!OpRegistry::Global()->LookUpOpDef(node.op(), &op_def).ok()) {
+    return false;
+  }
+  DataType dtype;
+  if (!OutputTypeForNode(node, *op_def, output.port_id, &dtype).ok()) {
+    return false;
+  }
+  // References can only refer to persistent memory: therefore the node isn't
+  // swappable.
+  if (IsRefType(dtype)) {
+    return false;
+  }
+
+  if (output.node->op() == "Identity" || output.node->op() == "Reshape") {
+    // If placed on the same device, these nodes are just forwarding references
+    // to their input. Therefore they are swappable iff their fanin is swappable
+    // or it resides on a different device.
+    GraphView::InputPort input;
+    input.node = output.node;
+    input.port_id = 0;
+    GraphView::OutputPort fanin = graph.GetRegularFanin(input);
+    if (fanin.node->device() == node.device()) {
+      return IsSwappable(graph, fanin);
+    }
+  }
+  return true;
+}
+
+static NodeDef* FindSwapOutTrigger(
+    const NodeDef* node, int input_id, const GraphView& view,
+    const std::unordered_map<const NodeDef*, Costs::NanoSeconds>&
+        execution_times) {
+  // Find the output port that generated the tensor to swap.
+  GraphView::InputPort swap;
+  swap.node = const_cast<NodeDef*>(node);
+  swap.port_id = input_id;
+  GraphView::OutputPort generator = view.GetRegularFanin(swap);
+  if (!generator.node) {
+    return nullptr;
+  }
+
+  const std::unordered_set<GraphView::InputPort, GraphView::HashPort>& fanout =
+      view.GetFanout(generator);
+  NodeDef* trigger = nullptr;
+  Costs::NanoSeconds earliest_fanout(Costs::NanoSeconds::infinity());
+
+  for (const auto& port : fanout) {
+    if (port.node == node) {
+      continue;
+    }
+    auto it = execution_times.find(port.node);
+    if (it != execution_times.end() && it->second < earliest_fanout) {
+      earliest_fanout = it->second;
+      trigger = port.node;
+    }
+  }
+
+  return trigger;
+}
+
+static bool IsSwappable(GraphView::InputPort input) {
+  const NodeDef& node = *input.node;
+
+  const OpDef* op_def;
+  if (!OpRegistry::Global()->LookUpOpDef(node.op(), &op_def).ok()) {
+    return false;
+  }
+
+  DataType dtype;
+  if (!InputTypeForNode(node, *op_def, input.port_id, &dtype).ok()) {
+    return false;
+  }
+
+  return !IsRefType(dtype);
+}
+
+struct MemInfo {
+  GraphView::OutputPort port;
+  int64 memory_used;
+  std::vector<GraphView::InputPort> uses_left;
+  double fitness;
+
+  bool operator<(const MemInfo& other) const { return fitness < other.fitness; }
+};
+
+static bool IdentifySwappingCandidates(
+    Cluster* cluster, GrapplerItem* item, std::unordered_set<string>* skip_list,
+    std::unordered_map<NodeDef*, SwapInfo>* nodes_to_swap) {
+  GraphMemory memory(*item);
   const std::unordered_map<string, DeviceProperties>& devices =
       cluster->GetDevices();
-  if (!memory.InferStatically(devices).ok()) {
-    return;
+  Status s = memory.InferStatically(devices);
+  if (!s.ok()) {
+    VLOG(1) << "Failed to infer memory usage: " << s.error_message();
+    return false;
   }
 
+  bool updated_graph = false;
   for (const auto& device : devices) {
     const string& name = device.first;
     const DeviceProperties& prop = device.second;
@@ -622,79 +960,157 @@ static void IdentifySwappingCandidates(Cluster* cluster,
       continue;
     }
     if (prop.memory_size() <= 0) {
+      VLOG(1) << "Peak memory usage unknown for device " << name;
       continue;
     }
     const GraphMemory::MemoryUsage& mem_usage = memory.GetPeakMemoryUsage(name);
+
     if (mem_usage.used_memory <= prop.memory_size()) {
       continue;
     }
     int64 required_savings = mem_usage.used_memory - prop.memory_size();
-    // TODO(bsteiner): sort the tensors by how long they're live.
 
-    std::unordered_map<const NodeDef*, Costs::NanoSeconds> execution_times;
-    if (!EstimateEarliestExecutionTimes(item, cluster, &execution_times).ok()) {
-      return;
+    std::unordered_map<string, Costs::NanoSeconds> op_completion_times;
+    {
+      VirtualCluster vcluster(cluster->GetDevices());
+      if (!vcluster.Provision().ok()) {
+        return false;
+      }
+      if (!vcluster.Initialize(*item).ok()) {
+        return false;
+      }
+      RunMetadata metadata;
+      Status s = vcluster.Run(item->graph, item->feed, item->fetch, &metadata);
+      if (!s.ok() && s.code() != error::RESOURCE_EXHAUSTED) {
+        return false;
+      }
+
+      for (const auto& dev_stats : metadata.step_stats().dev_stats()) {
+        for (const auto& node_stats : dev_stats.node_stats()) {
+          Costs::NanoSeconds exec_time =
+              Costs::NanoSeconds(1) +
+              Costs::MicroSeconds(node_stats.all_start_micros() +
+                                  node_stats.op_end_rel_micros());
+          op_completion_times.emplace(node_stats.node_name(), exec_time);
+        }
+      }
+    }
+
+    Costs::Duration peak_time = -1;
+    for (const auto& live_tensor : mem_usage.live_tensors) {
+      if (live_tensor.allocation_time > peak_time) {
+        peak_time = live_tensor.allocation_time;
+      }
     }
-    GraphView graph(optimized_graph);
+
+    std::vector<MemInfo> mem_state;
+
+    GraphView graph(&item->graph);
     for (const auto& live_tensor : mem_usage.live_tensors) {
+      if (live_tensor.memory_used <= 1024) {
+        // Don't bother with small tensors.
+        continue;
+      }
       if (live_tensor.deallocation_time - live_tensor.allocation_time <=
           Costs::Duration(1e6)) {
         // Not enough time to swap.
+        VLOG(1) << "Not enough time to swap: skipping " << live_tensor.node;
         continue;
       }
-      if (live_tensor.memory_used <= 1024) {
-        // Don't bother with small tensors.
+
+      if (skip_list->find(live_tensor.node) != skip_list->end()) {
         continue;
       }
-      Costs::NanoSeconds execution_time(-1);
-      GraphView::InputPort fanout_to_swap;
       GraphView::OutputPort port =
           graph.GetOutputPort(live_tensor.node, live_tensor.output_id);
+      if (!IsSwappable(graph, port)) {
+        continue;
+      }
+      MemInfo mem_info;
+      mem_info.port = port;
+      mem_info.memory_used = live_tensor.memory_used;
+      Costs::Duration allocation_time = live_tensor.allocation_time;
+      Costs::Duration earliest_use(Costs::Duration::infinity());
+      bool valid = true;
       for (GraphView::InputPort input : graph.GetFanout(port)) {
-        auto it = execution_times.find(input.node);
-        if (it != execution_times.end()) {
-          if (it->second > execution_time) {
-            fanout_to_swap = input;
-            execution_time = it->second;
-          }
+        // Get execution time.
+        auto it = op_completion_times.find(input.node->name());
+        if (it == op_completion_times.end()) {
+          valid = false;
+          break;
         }
-      }
-      // Annotate the fanout to request the tensor to be swapped if it's not
-      // already been done.
-      AttrValue& val = (*fanout_to_swap.node->mutable_attr())["_swap_to_host"];
-      bool found = false;
-      for (int port_id : val.list().i()) {
-        if (port_id == fanout_to_swap.port_id) {
-          found = true;
+        if (it->second <= peak_time) {
+          continue;
+        }
+
+        if (skip_list->find(input.node->name()) != skip_list->end()) {
+          valid = false;
           break;
         }
-      }
-      if (!found) {
-        val.mutable_list()->add_i(fanout_to_swap.port_id);
-        required_savings -= live_tensor.memory_used;
-        if (required_savings < 0) {
+        string input_name =
+            strings::StrCat(input.node->name(), ":", input.port_id);
+        if (skip_list->find(input_name) != skip_list->end()) {
+          valid = false;
           break;
         }
+        if (!IsSwappable(input)) {
+          valid = false;
+          break;
+        }
+
+        // Set earliest use time that's after peak.
+        mem_info.uses_left.emplace_back(input);
+        earliest_use = std::min(earliest_use, it->second);
+      }
+      if (valid && !mem_info.uses_left.empty()) {
+        // Compute the fitness: we need the tensor to be generated way away of
+        // the time of peak memory usage (to ensure there is enough time to swap
+        // it out). We also need to ensure it's used way after the peak time, to
+        // ensure that swapping the tensor back in won't recreate the memory
+        // bottleneck. Last but not least, we want the tensor to have as few
+        // remaining uses as possible.
+        mem_info.fitness = std::pow((earliest_use - peak_time).count(), 2);
+        mem_info.fitness /= std::pow(mem_info.uses_left.size(), 2);
+        mem_info.fitness += std::pow((allocation_time - peak_time).count(), 2);
+        mem_info.fitness = -mem_info.fitness;
+        mem_state.push_back(mem_info);
       }
     }
-  }
-}
 
-Status MemoryOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
-                                 GraphDef* optimized_graph) {
-  *optimized_graph = item.graph;
+    // Sort by fitness
+    std::sort(mem_state.begin(), mem_state.end());
 
-  RecomputationRewritingPass(optimization_level_,
-                             recomputation_targets_name_prefix_,
-                             optimized_graph, item);
+    for (const MemInfo& mem_info : mem_state) {
+      for (const GraphView::InputPort fanout_to_swap : mem_info.uses_left) {
+        VLOG(1) << "Will swap fanout " << fanout_to_swap.node->name() << ":"
+                << fanout_to_swap.port_id << " of tensor "
+                << mem_info.port.node->name() << ":" << mem_info.port.port_id
+                << " of size " << mem_info.memory_used;
 
-  if (optimization_level_ == RewriterConfig::SWAPPING_HEURISTICS) {
-    IdentifySwappingCandidates(cluster, item, optimized_graph);
+        (*nodes_to_swap)[fanout_to_swap.node].inputs_to_swap.push_back(
+            fanout_to_swap.port_id);
+      }
+      required_savings -= mem_info.memory_used;
+      updated_graph = true;
+      if (required_savings < 0) {
+        break;
+      }
+    }
   }
+  return updated_graph;
+}
 
-  // Figure out what needs to be swapped;
+bool SwappingPass(RewriterConfig::MemOptType optimization_level,
+                  Cluster* cluster, GrapplerItem* item,
+                  std::unordered_set<string>* skip_list) {
   std::unordered_map<NodeDef*, SwapInfo> nodes_to_swap;
-  for (auto& node : *optimized_graph->mutable_node()) {
+  if (optimization_level == RewriterConfig::SWAPPING_HEURISTICS ||
+      optimization_level == RewriterConfig::HEURISTICS) {
+    // Use heuristics to figure out what needs to be swapped;
+    IdentifySwappingCandidates(cluster, item, skip_list, &nodes_to_swap);
+  }
+  // Look for manual annotatations in the graph.
+  for (auto& node : *item->graph.mutable_node()) {
     if (node.attr().count("_swap_to_host") != 0) {
       SwapInfo& swap_info = nodes_to_swap[&node];
       const AttrValue& val = node.attr().at("_swap_to_host");
@@ -710,61 +1126,130 @@ Status MemoryOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   }
   if (nodes_to_swap.empty()) {
     // Nothing to do.
-    return Status::OK();
+    return false;
   }
 
-  {
-    // Estimate the size of the data to swap for each node.
-    GraphProperties properties(item);
-    TF_RETURN_IF_ERROR(properties.InferStatically());
-    for (auto& swap : nodes_to_swap) {
-      const NodeDef* node = swap.first;
-      std::vector<OpInfo::TensorProperties> props =
-          properties.GetInputProperties(node->name());
-      SwapInfo& swap_info = swap.second;
-      int64 bytes_to_swap = 0;
-      for (int64 input_id : swap_info.inputs_to_swap) {
-        const OpInfo::TensorProperties& t = props[input_id];
-        bytes_to_swap += EstimateSize(t);
-      }
-      // Let's assume we're going to swap over PCIe running at 16 GBps.
-      swap_info.time_to_swap = bytes_to_swap / 16;
+  // Estimate the size of the data to swap for each node.
+  GraphProperties properties(*item);
+  if (!properties.InferStatically(true).ok()) {
+    return false;
+  }
+  for (auto& swap : nodes_to_swap) {
+    const NodeDef* node = swap.first;
+    const std::vector<OpInfo::TensorProperties>& props =
+        properties.GetInputProperties(node->name());
+    SwapInfo& swap_info = swap.second;
+    int64 bytes_to_swap = 0;
+    for (int64 input_id : swap_info.inputs_to_swap) {
+      const OpInfo::TensorProperties& t = props[input_id];
+      bytes_to_swap += EstimateSize(t);
     }
+    // Let's assume we're going to swap over PCIe running at 16 GBps.
+    swap_info.time_to_swap = bytes_to_swap / 16;
   }
 
   std::unordered_map<const NodeDef*, Costs::NanoSeconds> execution_times;
-  TF_RETURN_IF_ERROR(
-      EstimateEarliestExecutionTimes(item, cluster, &execution_times));
+  if (!EstimateEarliestExecutionTimes(*item, cluster, &execution_times).ok()) {
+    return false;
+  }
 
   std::unordered_map<string, const NodeDef*> name_map;
-  for (const auto& node : item.graph.node()) {
+  for (const auto& node : item->graph.node()) {
     name_map[node.name()] = &node;
   }
+  GraphView view(&item->graph);
+
+  bool updated_graph = false;
 
   for (auto& swap : nodes_to_swap) {
     NodeDef* node = swap.first;
-    SwapInfo& swap_info = swap.second;
+    const SwapInfo& swap_info = swap.second;
+    if (skip_list->find(node->name()) != skip_list->end()) {
+      continue;
+    }
 
     // Make sure the tensor isn't swapped back in right away: look for node that
     // will execute just before we need to swap the data back, and add a control
     // dependency from that node to the swap node.
-    const NodeDef* trigger =
-        FindSwapTrigger(node, swap_info, name_map, execution_times);
-    if (!trigger) {
+    const NodeDef* in_trigger =
+        FindSwapInTrigger(node, swap_info, name_map, execution_times);
+    // If we failed, don't attempt to reprocess this node in a subsequent pass.
+    if (!in_trigger) {
+      skip_list->insert(node->name());
       continue;
     }
+
     // Swap all the tensors that are marked with the 'swap_to_host' attribute.
     for (int input_id : swap_info.inputs_to_swap) {
-      std::pair<NodeDef*, NodeDef*> swap_nodes =
-          BuildSwapPair(node, input_id, optimized_graph);
+      string input_name = strings::StrCat(node->name(), ":", input_id);
+      if (skip_list->find(input_name) != skip_list->end()) {
+        continue;
+      } else {
+        // Don't attempt to reprocess this input in a subsequent pass.
+        skip_list->insert(input_name);
+      }
+
+      // Make sure the tensor is swapped out quickly: look for node that
+      // will execute just after the tensor is generated and add a control
+      // dependency from the swap out node to that node.
+      NodeDef* out_trigger =
+          FindSwapOutTrigger(node, input_id, view, execution_times);
+      if (!out_trigger) {
+        continue;
+      }
+
+      std::pair<NodeDef*, NodeDef*> swap_nodes;
+      if (!BuildSwapPair(node, input_id, name_map, &item->graph, &swap_nodes)
+               .ok()) {
+        continue;
+      }
       *swap_nodes.first->add_input() = node->input(input_id);
       *node->mutable_input(input_id) = swap_nodes.second->name();
 
-      // Add the control dependency needed to delay the execution of the swap.
-      *swap_nodes.second->add_input() = strings::StrCat("^", trigger->name());
+      // Add the control dependencies needed to delay the execution of the swap.
+      out_trigger->add_input(strings::StrCat("^", swap_nodes.first->name()));
+      swap_nodes.second->add_input(strings::StrCat("^", in_trigger->name()));
+
+      // Make sure we won't try to swap the swap nodes in subsequent passes.
+      skip_list->insert(swap_nodes.first->name());
+      skip_list->insert(swap_nodes.second->name());
+    }
+  }
+  return updated_graph;
+}
+
+Status MemoryOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* optimized_graph) {
+  *optimized_graph = item.graph;
+
+  RecomputationRewritingPass(optimization_level_,
+                             recomputation_targets_name_prefix_,
+                             optimized_graph, item);
+
+  GrapplerItem optimized_item(item, std::move(*optimized_graph));
+  std::unordered_set<string> skip_list;
+  // Bound the number of rewrite passes to avoid long processing times on graphs
+  // that simply won't fit in memory.
+  bool updated_graph = true;
+  for (int i = 0; i < 25 && updated_graph; ++i) {
+    updated_graph = false;
+    if ((optimization_level_ == RewriterConfig::DEFAULT_MEM_OPT ||
+         optimization_level_ == RewriterConfig::SCHEDULING_HEURISTICS ||
+         optimization_level_ == RewriterConfig::HEURISTICS) &&
+        cluster != nullptr) {
+      updated_graph |= SchedulingPass(cluster, &optimized_item);
+    }
+
+    if ((optimization_level_ == RewriterConfig::SWAPPING_HEURISTICS ||
+         optimization_level_ == RewriterConfig::HEURISTICS ||
+         optimization_level_ == RewriterConfig::MANUAL) &&
+        cluster != nullptr) {
+      updated_graph |= SwappingPass(optimization_level_, cluster,
+                                    &optimized_item, &skip_list);
     }
   }
 
+  optimized_graph->Swap(&optimized_item.graph);
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
index 6fa4731a863cea9d6124e379641682030ca80bed..5d7913e0c018ecf14cc09ab91d3a71125c720aa5 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
@@ -19,17 +19,18 @@ limitations under the License.
 
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
 namespace grappler {
 namespace {
 
-class RecomputeSubgraphTest : public ::testing::Test {};
+class RecomputeSubgraphTest : public GrapplerTest {};
 
 TEST_F(RecomputeSubgraphTest, SimpleSubgraph) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -193,7 +194,7 @@ TEST_F(RecomputeSubgraphTest, MultiNode) {
   EXPECT_EQ("^gradients/BN1Grad", recompute_trigger_c->input(0));
 }
 
-class MemoryOptimizerTest : public ::testing::Test {
+class MemoryOptimizerTest : public GrapplerTest {
  public:
   static std::unique_ptr<VirtualCluster> CreateVirtualCluster() {
     DeviceProperties cpu_device;
@@ -201,8 +202,17 @@ class MemoryOptimizerTest : public ::testing::Test {
     cpu_device.set_frequency(1000);
     cpu_device.set_num_cores(4);
     cpu_device.set_bandwidth(32);
+    cpu_device.set_memory_size(1024 * 1024);
+    DeviceProperties gpu_device;
+    gpu_device.set_type("GPU");
+    gpu_device.set_frequency(1000);
+    gpu_device.set_num_cores(24);
+    gpu_device.set_bandwidth(128);
+    gpu_device.set_memory_size(1024 * 1024);
+    gpu_device.mutable_environment()->insert({"architecture", "6"});
     std::unordered_map<string, DeviceProperties> devices;
     devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device;
+    devices["/job:localhost/replica:0/task:0/gpu:0"] = gpu_device;
     return std::unique_ptr<VirtualCluster>(new VirtualCluster(devices));
   }
 };
@@ -250,6 +260,145 @@ TEST_F(MemoryOptimizerTest, SimpleSwapping) {
   EXPECT_EQ(NodeName(b.name()), swap_out.input(0));
   EXPECT_EQ(NodeName(swap_out.name()), swap_in.input(0));
   EXPECT_EQ("^c", swap_in.input(1));
+
+  const NodeDef& new_c = output.node(2);
+  EXPECT_EQ(NodeName(c.name()), new_c.name());
+  EXPECT_EQ("^swap_out_e_0", new_c.input(1));
+
+  // Run the optimizer a second time to ensure it's idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(cluster.get(), item, &output);
+  TF_EXPECT_OK(status);
+}
+
+TEST_F(MemoryOptimizerTest, SwappingHeuristics) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output v = ops::Variable(s.WithOpName("v").WithDevice("/gpu:0"),
+                           {128, 128, 8}, DT_FLOAT);
+  Output a = ops::Identity(s.WithOpName("a").WithDevice("/gpu:0"), v);
+  Output b = ops::Square(s.WithOpName("b").WithDevice("/gpu:0"), v);
+  Output c = ops::Sqrt(s.WithOpName("c").WithDevice("/gpu:0"), a);
+  Output d = ops::Identity(s.WithOpName("d").WithDevice("/gpu:0"), b);
+  Output axis = ops::Const(s.WithOpName("axis"), 0);
+  Output e =
+      ops::Concat(s.WithOpName("e").WithDevice("/gpu:0"), {a, b, c, d}, axis);
+  Output f = ops::Square(s.WithOpName("f").WithDevice("/gpu:0"), a);
+  Output g = ops::Sqrt(s.WithOpName("g").WithDevice("/gpu:0"), b);
+  Output h = ops::Exp(s.WithOpName("h").WithDevice("/gpu:0"), c);
+  Output i = ops::Log(s.WithOpName("i").WithDevice("/gpu:0"), d);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"e", "f", "g", "h", "i"};
+
+  std::unique_ptr<VirtualCluster> cluster(CreateVirtualCluster());
+
+  MemoryOptimizer optimizer(RewriterConfig::SWAPPING_HEURISTICS);
+  GraphDef output;
+  Status status = optimizer.Optimize(cluster.get(), item, &output);
+  TF_EXPECT_OK(status);
+
+  for (const auto& node : output.node()) {
+    if (node.name() == "e") {
+      EXPECT_EQ(5, node.input_size());
+      EXPECT_EQ("a", node.input(0));
+      EXPECT_EQ("swap_in_e_1", node.input(1));
+      EXPECT_EQ("swap_in_e_2", node.input(2));
+      EXPECT_EQ("swap_in_e_3", node.input(3));
+      EXPECT_EQ("axis", node.input(4));
+    }
+  }
+}
+
+TEST_F(MemoryOptimizerTest, UnswappableInputs) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output v = ops::Variable(s.WithOpName("v").WithDevice("/gpu:0"),
+                           {128, 128, 8}, DT_FLOAT);
+  Output a = ops::Square(s.WithOpName("a").WithDevice("/gpu:0"), v);
+  Output b = ops::Identity(s.WithOpName("b").WithDevice("/gpu:0"), {a});
+  Output c = ops::Identity(s.WithOpName("c").WithDevice("/gpu:0"), {a});
+  Output index = ops::Const(s.WithOpName("index"), {0});
+  Output indices = ops::Tile(s.WithOpName("indices"), index, {128});
+  Output d =
+      ops::ScatterAdd(s.WithOpName("d").WithDevice("/gpu:0"), v, indices, c);
+  Output axis = ops::Const(s.WithOpName("axis"), 0);
+  Output e =
+      ops::Concat(s.WithOpName("e").WithDevice("/gpu:0"), {b, c, d}, axis);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"e"};
+
+  std::unique_ptr<VirtualCluster> cluster(CreateVirtualCluster());
+
+  MemoryOptimizer optimizer(RewriterConfig::SWAPPING_HEURISTICS);
+  GraphDef output;
+  Status status = optimizer.Optimize(cluster.get(), item, &output);
+  TF_EXPECT_OK(status);
+
+  for (const auto& node : output.node()) {
+    if (node.name() == "e") {
+      // The d node isn't swappable.
+      EXPECT_EQ(5, node.input_size());
+      EXPECT_EQ("d", node.input(2));
+      EXPECT_EQ("^swap_out_d_2", node.input(4));
+    }
+  }
+}
+
+TEST_F(MemoryOptimizerTest, AccumulationRewrites) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a = ops::RandomNormal(s.WithOpName("a").WithDevice("/cpu:0"),
+                               {128, 128, 8}, DT_FLOAT);
+  Output b = ops::RandomNormal(s.WithOpName("b").WithDevice("/cpu:0"),
+                               {128, 128, 8}, DT_FLOAT);
+  Output c = ops::RandomNormal(s.WithOpName("c").WithDevice("/cpu:0"),
+                               {128, 128, 8}, DT_FLOAT);
+  Output d = ops::AddN(s.WithOpName("d").WithDevice("/cpu:0"), {a, b, c});
+  Output e = ops::Square(s.WithOpName("e").WithDevice("/cpu:0"), d);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"e"};
+
+  std::unique_ptr<VirtualCluster> cluster(CreateVirtualCluster());
+  MemoryOptimizer optimizer(RewriterConfig::SCHEDULING_HEURISTICS);
+  GraphDef output;
+  Status status = optimizer.Optimize(cluster.get(), item, &output);
+  TF_EXPECT_OK(status);
+
+  int count = 0;
+  for (const auto& node : output.node()) {
+    if (node.name() == "d") {
+      EXPECT_EQ("DestroyTemporaryVariable", node.op());
+      count++;
+    } else if (node.name() == "d/tmp_var_initializer") {
+      EXPECT_EQ("Assign", node.op());
+      count++;
+    } else if (node.name() == "d/tmp_var") {
+      EXPECT_EQ("TemporaryVariable", node.op());
+      count++;
+    } else if (node.name() == "e") {
+      EXPECT_EQ("Square", node.op());
+      EXPECT_EQ("d", node.input(0));
+      count++;
+    }
+  }
+  EXPECT_EQ(4, count);
+
+  std::vector<string> fetch = {"a", "b", "c", "e"};
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(4, tensors.size());
+
+  for (int i = 0; i < tensors[0].NumElements(); ++i) {
+    float actual = tensors[3].flat<float>()(i);
+    float expected = 0.0f;
+    for (int j = 0; j < 3; ++j) {
+      expected += tensors[j].flat<float>()(i);
+    }
+    expected *= expected;
+    EXPECT_NEAR(actual, expected, 1e-4);
+  }
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index d2df8cacb73ff8cc271373c8d8d5a3947ae18509..e27b9df6206c652e4503bb064366201a2b90f13a 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/dependency_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/loop_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/memory_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
@@ -30,6 +31,23 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+namespace {
+int64 NumEdges(const GraphDef& graph) {
+  int64 num_edges = 0;
+  for (const auto& node : graph.node()) {
+    num_edges += node.input_size();
+  }
+  return num_edges;
+}
+
+string PrintSizesBeforeAfter(const GraphDef& before, const GraphDef& after) {
+  return strings::StrCat("Graph size before: ", before.node_size(), " nodes, ",
+                         NumEdges(before),
+                         " edges. Graph size after: ", after.node_size(),
+                         " nodes, ", NumEdges(after), " edges.");
+}
+}  // namespace
+
 std::unique_ptr<GraphOptimizer> MetaOptimizer::NewOptimizer(
     const string& optimizer) {
   VLOG(1) << "Adding graph optimization pass: " << optimizer;
@@ -58,6 +76,9 @@ std::unique_ptr<GraphOptimizer> MetaOptimizer::NewOptimizer(
     graph_optimizer.reset(
         new DependencyOptimizer(cfg_.dependency_optimization()));
   }
+  if (optimizer == "loop") {
+    graph_optimizer.reset(new LoopOptimizer(cfg_.loop_optimization()));
+  }
   return graph_optimizer;
 }
 
@@ -76,15 +97,19 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       optimizers.push_back(std::unique_ptr<GraphOptimizer>(
           new ArithmeticOptimizer(cfg_.arithmetic_optimization())));
     }
-    if (cfg_.dependency_optimization() == RewriterConfig::ON) {
+    if (cfg_.dependency_optimization() != RewriterConfig::OFF) {
       optimizers.push_back(std::unique_ptr<GraphOptimizer>(
           new DependencyOptimizer(cfg_.dependency_optimization())));
     }
-    if (cfg_.layout_optimizer() == RewriterConfig::ON) {
+    if (cfg_.loop_optimization() != RewriterConfig::OFF) {
+      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
+          new LoopOptimizer(cfg_.loop_optimization())));
+    }
+    if (cfg_.layout_optimizer() != RewriterConfig::OFF) {
       optimizers.push_back(
           std::unique_ptr<GraphOptimizer>(new LayoutOptimizer()));
     }
-    if (cfg_.memory_optimization() > 1) {
+    if (cfg_.memory_optimization() != RewriterConfig::NO_MEM_OPT) {
       if (cfg_.memory_optimizer_target_node_name_prefix().empty()) {
         optimizers.push_back(std::unique_ptr<GraphOptimizer>(
             // Use the default target node name prefix "gradients/"
@@ -102,8 +127,8 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     }
   } else {
     std::set<string> available_optimizers = {
-        "pruning",      "constfold",  "layout",    "memory",
-        "autoparallel", "arithmetic", "dependency"};
+        "pruning",      "constfold",  "layout",     "memory",
+        "autoparallel", "arithmetic", "dependency", "loop"};
     for (const auto& optimizer : cfg_.optimizers()) {
       if (available_optimizers.find(optimizer) != available_optimizers.end()) {
         optimizers.push_back(NewOptimizer(optimizer));
@@ -119,7 +144,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   bool already_optimized = false;
   for (const auto& optimizer : optimizers) {
     if (!already_optimized) {
-      auto status = optimizer->Optimize(cluster, item, optimized_graph);
+      Status status = optimizer->Optimize(cluster, item, optimized_graph);
       string result;
       if (!status.ok()) {
         VLOG(1) << "Not able to apply optimizer " << optimizer->name()
@@ -128,17 +153,14 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       } else {
         already_optimized = true;
         result = strings::StrCat(
-            "OK. "
-            "Graph size before: ",
-            item.graph.node_size(),
-            ". Graph size after: ", optimized_graph->node_size());
+            "OK. ", PrintSizesBeforeAfter(item.graph, *optimized_graph));
       }
       result_.push_back(std::make_pair(optimizer->name(), result));
       VLOG(1) << "Optimizer " << optimizer->name()
               << " return status: " << result;
     } else {
       GrapplerItem optimized_item(item, std::move(*optimized_graph));
-      auto status =
+      Status status =
           optimizer->Optimize(cluster, optimized_item, optimized_graph);
       string result;
       if (!status.ok()) {
@@ -148,10 +170,8 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
         result = status.ToString();
       } else {
         result = strings::StrCat(
-            "OK. "
-            "Graph size before: ",
-            optimized_item.graph.node_size(),
-            ". Graph size after: ", optimized_graph->node_size());
+            "OK. ",
+            PrintSizesBeforeAfter(optimized_item.graph, *optimized_graph));
       }
       result_.push_back(std::make_pair(optimizer->name(), result));
       VLOG(1) << "Optimizer " << optimizer->name()
@@ -160,7 +180,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   }
 
   if (already_optimized) {
-    TopologicalSort(optimized_graph);
+    TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph));
     // Make sure that the optimizers preserved the graph version and library.
     DCHECK_GE(optimized_graph->library().function_size(),
               item.graph.library().function_size());
@@ -189,11 +209,13 @@ void MetaOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item,
 
 bool MetaOptimizerEnabled(const RewriterConfig& cfg) {
   return !cfg.disable_model_pruning() ||
-         cfg.layout_optimizer() == RewriterConfig::ON ||
+         cfg.layout_optimizer() != RewriterConfig::OFF ||
          cfg.constant_folding() != RewriterConfig::OFF ||
-         cfg.dependency_optimization() == RewriterConfig::ON ||
+         cfg.dependency_optimization() != RewriterConfig::OFF ||
+         cfg.loop_optimization() == RewriterConfig::ON ||
          cfg.arithmetic_optimization() != RewriterConfig::OFF ||
-         cfg.auto_parallel().enable() || cfg.memory_optimization() > 1 ||
+         cfg.auto_parallel().enable() ||
+         cfg.memory_optimization() != RewriterConfig::NO_MEM_OPT ||
          !cfg.optimizers().empty();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/model_pruner.cc b/tensorflow/core/grappler/optimizers/model_pruner.cc
index c9bec7890e6af008859d21555fb7ed74451c72c6..f52a2ab86288adacefec6796ceed4cea73d9b632 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner.cc
@@ -26,12 +26,21 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-bool IsTrivialOp(const NodeDef& node) {
+bool IsTrivialOp(const NodeDef& node, const GraphRewriter& rewriter) {
   // Remove the stop gradient nodes since they serve no purpose once the graph
   // is built. Also remove Identity ops.
-  if (IsStopGradient(node) || IsIdentity(node)) {
+  if (IsStopGradient(node)) {
     return true;
   }
+  if (IsIdentity(node)) {
+    if (rewriter.FeedsMerge(node) || rewriter.IsDrivenBySwitch(node) ||
+        rewriter.IsDrivenByControlDependency(node) ||
+        rewriter.DrivesControlDependency(node)) {
+      return false;
+    } else {
+      return true;
+    }
+  }
   if (IsAddN(node) && NumNonControlInputs(node) <= 1) {
     return true;
   }
@@ -41,7 +50,7 @@ bool IsTrivialOp(const NodeDef& node) {
 
 Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item,
                              GraphDef* pruned_graph) {
-  std::unordered_set<string> nodes_to_preserve = item.NodesToPreserve();
+  const std::unordered_set<string>& nodes_to_preserve = item.NodesToPreserve();
 
   // Prune all the nodes that won't be executed, ie all the nodes that aren't in
   // the fanin of a fetch node. If fetch nodes aren't specified, we'll assume
@@ -58,7 +67,7 @@ Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item,
       // let's be conservative and preserve the graph as is.
       return errors::InvalidArgument("Invalid input graph.");
     }
-    // Try to keep the nodes ordored somewhat topologically since this helps
+    // Try to keep the nodes ordered somewhat topologically since this helps
     // further optimizations perform better.
     for (int i = keep.size() - 1; i >= 0; --i) {
       *runnable_item.graph.add_node() = *keep[i];
@@ -72,7 +81,7 @@ Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item,
   // Check if we can further prune the graph, by removing the trivial ops.
   std::unordered_set<const NodeDef*> nodes_to_delete;
   for (auto& node : runnable_item.graph.node()) {
-    if (!IsTrivialOp(node)) {
+    if (!IsTrivialOp(node, rewriter)) {
       continue;
     }
 
@@ -95,8 +104,7 @@ Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item,
     //   converting references to non-references. It is important to preserve
     //   these non-references since the partitioner will avoid sending
     //   non-references across partitions more than once.
-    if (!rewriter.DrivesControlDependency(node) &&
-        !rewriter.IsDrivenByControlDependency(node) &&
+    if (!rewriter.RemovalIncreasesEdgeCount(node) &&
         !rewriter.IsConnectedToFunction(node) &&
         !rewriter.IsDrivenByAnotherDevice(node) &&
         !rewriter.ReceivesRefValue(node)) {
@@ -112,13 +120,16 @@ Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item,
     return Status::OK();
   }
 
+  const bool fetches_are_known = !item.fetch.empty();
   for (auto& node : runnable_item.graph.node()) {
-    NodeDef* new_node = pruned_graph->add_node();
-    *new_node = node;
-    new_node->clear_input();
-    rewriter.ForwardInputs(node, nodes_to_delete, new_node);
+    if (!fetches_are_known ||
+        nodes_to_delete.find(&node) == nodes_to_delete.end()) {
+      NodeDef* new_node = pruned_graph->add_node();
+      *new_node = node;
+      new_node->clear_input();
+      rewriter.ForwardInputs(node, nodes_to_delete, new_node);
+    }
   }
-
   VLOG(1) << "Pruned " << nodes_to_delete.size()
           << " nodes from the graph. The graph now contains "
           << pruned_graph->node_size() << " nodes.";
diff --git a/tensorflow/core/grappler/optimizers/model_pruner_test.cc b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
index ee722f311edbb55fbb19044df57cfdfd0b29b1b8..8480a74572883a4657e11606b4cb8dcd5532ea3a 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner_test.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
@@ -156,47 +156,42 @@ TEST_F(ModelPrunerTest, NoOpPruning) {
   const NodeDef& new_e = output.node(4);
   EXPECT_EQ(NodeName(e.name()), new_e.name());
 
-  EXPECT_EQ(1, new_e.input_size());
-  EXPECT_EQ(NodeName(d.name()), new_e.input(0));
-  EXPECT_EQ(2, new_d.input_size());
-  EXPECT_EQ(NodeName(b.name()), new_d.input(0));
-  EXPECT_EQ(1, new_c.input_size());
-  EXPECT_EQ(NodeName(b.name()), new_c.input(0));
+  for (const auto& new_node : output.node()) {
+    if (new_node.name() != "a") {
+      EXPECT_EQ(1, new_node.input_size());
+      EXPECT_EQ("a", new_node.input(0));
+    }
+  }
 }
 
-TEST_F(ModelPrunerTest, PruningSkipsCtrlDependencies) {
-  // Build a simple graph with a few trivially prunable ops.
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-
-  Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
-  Output b = ops::Sqrt(s.WithOpName("b"), {a});
-  Output c = ops::Identity(s.WithOpName("c"), b);
-  Output d = ops::Identity(s.WithOpName("d"), c);
-  Output e = ops::Sqrt(s.WithOpName("e").WithControlDependencies(c), {d});
+TEST_F(ModelPrunerTest, PreserveIdentities) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  ops::Variable v_in(scope.WithOpName("v_in"), {3}, DT_FLOAT);
+  ops::Variable v_ctrl(scope.WithOpName("v_ctrl"), {}, DT_BOOL);
+  ops::Switch s(scope.WithOpName("switch"), v_in, v_ctrl);
+  // id0 is preserved because it is fed by a Switch and drives a
+  // control dependency.
+  Output id0 = ops::Identity(scope.WithOpName("id0"), s.output_true);
+  // id1 is preserved because it feeds a Merge.
+  Output id1 = ops::Identity(
+      scope.WithOpName("id1").WithControlDependencies(v_ctrl), s.output_false);
+  Output id2 = ops::Identity(scope.WithOpName("id2"), id0);
+  Output id3 =
+      ops::Identity(scope.WithOpName("id3").WithControlDependencies(id0), id1);
+  auto merge = ops::Merge(scope.WithOpName("merge"), {id0, id1});
 
   GrapplerItem item;
-  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+  item.fetch.push_back("id2");
+  item.fetch.push_back("id3");
+  item.fetch.push_back("merge");
 
   ModelPruner pruner;
   GraphDef output;
   Status status = pruner.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
 
-  EXPECT_EQ(5, output.node_size());
-  const NodeDef& new_a = output.node(0);
-  EXPECT_EQ(NodeName(a.name()), new_a.name());
-  const NodeDef& new_b = output.node(1);
-  EXPECT_EQ(NodeName(b.name()), new_b.name());
-  const NodeDef& new_c = output.node(2);
-  EXPECT_EQ(NodeName(c.name()), new_c.name());
-  const NodeDef& new_d = output.node(3);
-  EXPECT_EQ(NodeName(d.name()), new_d.name());
-  const NodeDef& new_e = output.node(4);
-  EXPECT_EQ(NodeName(e.name()), new_e.name());
-
-  EXPECT_EQ(2, new_e.input_size());
-  EXPECT_EQ(NodeName(c.name()), new_e.input(0));
-  EXPECT_EQ("^c", new_e.input(1));
+  TF_EXPECT_OK(status);
+  EXPECT_EQ(item.graph.node_size(), output.node_size());
 }
 
 TEST_F(ModelPrunerTest, PruningSkipsRefOutputs) {
@@ -239,55 +234,53 @@ TEST_F(ModelPrunerTest, PruningSkipsRefOutputs) {
   EXPECT_EQ("b", new_e.input(0));
 }
 
-TEST_F(ModelPrunerTest, PruningPerservesCtrlDependencies) {
+// TODO(rmlarsen): Reenable this test when the issues with
+// //robotics/learning/sensor_predict:utils_multi_sensor_rnn_test
+// have been resolved.
+/*
+TEST_F(ModelPrunerTest, PruningForwardsCtrlDependencies) {
   // Build a simple graph with a few trivially prunable ops.
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
   Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
   Output b = ops::Sqrt(s.WithOpName("b"), {a});
   Output c = ops::Sqrt(s.WithOpName("c"), {a});
-  Output d = ops::Identity(s.WithOpName("d"), c);
-  Output e = ops::Identity(s.WithOpName("e"), d);
-  Output f = ops::Sqrt(s.WithOpName("f"), {e});
+  Output d = ops::Identity(s.WithOpName("d").WithControlDependencies(b), c);
+  Output e = ops::Identity(s.WithOpName("e").WithControlDependencies(c), d);
+  Output f = ops::Sqrt(s.WithOpName("f"), {d});
+  Output g = ops::Sqrt(s.WithOpName("g"), {e});
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
-  // Add a control dependency between b and d and another one between c and e.
-  // They should be properly forwarded.
-  EXPECT_EQ("d", item.graph.node(3).name());
-  EXPECT_EQ("e", item.graph.node(4).name());
-  *item.graph.mutable_node(3)->add_input() = "^b";
-  *item.graph.mutable_node(4)->add_input() = "^c";
+  item.fetch.push_back("f");
+  item.fetch.push_back("g");
 
   ModelPruner pruner;
   GraphDef output;
   Status status = pruner.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
+  LOG(INFO) << "After: " << output.DebugString();
 
-  EXPECT_EQ(6, output.node_size());
-  const NodeDef& new_a = output.node(0);
-  EXPECT_EQ(NodeName(a.name()), new_a.name());
-  const NodeDef& new_b = output.node(1);
-  EXPECT_EQ(NodeName(b.name()), new_b.name());
-  const NodeDef& new_c = output.node(2);
-  EXPECT_EQ(NodeName(c.name()), new_c.name());
-  const NodeDef& new_d = output.node(3);
-  EXPECT_EQ(NodeName(d.name()), new_d.name());
-  const NodeDef& new_e = output.node(4);
-  EXPECT_EQ(NodeName(e.name()), new_e.name());
-  const NodeDef& new_f = output.node(5);
-  EXPECT_EQ(NodeName(f.name()), new_f.name());
-
-  EXPECT_EQ(1, new_f.input_size());
-  EXPECT_EQ(NodeName(e.name()), new_f.input(0));
-  EXPECT_EQ(2, new_e.input_size());
-  EXPECT_EQ(NodeName(d.name()), new_e.input(0));
-  EXPECT_EQ("^c", new_e.input(1));
-  EXPECT_EQ(2, new_d.input_size());
-  EXPECT_EQ(NodeName(c.name()), new_d.input(0));
-  EXPECT_EQ("^b", new_d.input(1));
+  EXPECT_EQ(5, output.node_size());
+  for (const auto& new_node : output.node()) {
+    // "d" and "e" should be removed.
+    EXPECT_NE("d", new_node.name());
+    EXPECT_NE("e", new_node.name());
+    if (new_node.name() == "g") {
+      EXPECT_EQ(2, new_node.input_size());
+      // The input from switch should be forwarded to id3.
+      EXPECT_EQ("c", new_node.input(0));
+      EXPECT_EQ("^b", new_node.input(1));
+    }
+    if (new_node.name() == "f") {
+      EXPECT_EQ(2, new_node.input_size());
+      // The input from switch should be forwarded to id3.
+      EXPECT_EQ("c", new_node.input(0));
+      EXPECT_EQ("^b", new_node.input(1));
+    }
+  }
 }
+*/
 
 TEST_F(ModelPrunerTest, PruningPerservesFetch) {
   // Build a simple graph with a few trivially prunable ops.
@@ -296,6 +289,7 @@ TEST_F(ModelPrunerTest, PruningPerservesFetch) {
   Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
   Output b = ops::Sqrt(s.WithOpName("b"), {a});
   Output c = ops::Identity(s.WithOpName("c"), b);
+  Output d = ops::Identity(s.WithOpName("d"), c);
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
diff --git a/tensorflow/core/grappler/optimizers/static_schedule.cc b/tensorflow/core/grappler/optimizers/static_schedule.cc
index 6ce6deef2ceacdfe44b49659109e432b87739f97..450e85340796fdde9afdfebbd0eb9a724cb9440a 100644
--- a/tensorflow/core/grappler/optimizers/static_schedule.cc
+++ b/tensorflow/core/grappler/optimizers/static_schedule.cc
@@ -86,7 +86,7 @@ Status EstimateEarliestExecutionTimes(
   name_map.clear();
 
   GraphProperties properties(item);
-  TF_RETURN_IF_ERROR(properties.InferStatically());
+  TF_RETURN_IF_ERROR(properties.InferStatically(true));
   OpLevelCostEstimator estimator;
   VirtualPlacer placer(cluster);
 
@@ -154,7 +154,7 @@ Status EstimateRequiredTimes(
     }
   }
   GraphProperties properties(item);
-  TF_RETURN_IF_ERROR(properties.InferStatically());
+  TF_RETURN_IF_ERROR(properties.InferStatically(true));
   OpLevelCostEstimator estimator;
   VirtualPlacer placer(cluster);
 
diff --git a/tensorflow/core/grappler/optimizers/static_schedule.h b/tensorflow/core/grappler/optimizers/static_schedule.h
index aa2726a2bdf95fa6f73d131e36371b8c18de1aaf..678b4d193fb30610820769d5e899322f924da4ad 100644
--- a/tensorflow/core/grappler/optimizers/static_schedule.h
+++ b/tensorflow/core/grappler/optimizers/static_schedule.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_STATIC_SCHEDULE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_STATIC_SCHEDULE_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_STATIC_SCHEDULE_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_STATIC_SCHEDULE_H_
 
 #include <unordered_map>
 
@@ -47,4 +47,4 @@ Status EstimateRequiredTimes(
 }  // namespace grappler
 }  // end namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_STATIC_SCHEDULE_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_STATIC_SCHEDULE_H_
diff --git a/tensorflow/core/grappler/optimizers/static_schedule_test.cc b/tensorflow/core/grappler/optimizers/static_schedule_test.cc
index 5de593358727bf8b1f247c0fb9ec8f52b2819e4c..08580d92842377c2dd999950b2e01bef01e2fee6 100644
--- a/tensorflow/core/grappler/optimizers/static_schedule_test.cc
+++ b/tensorflow/core/grappler/optimizers/static_schedule_test.cc
@@ -64,17 +64,17 @@ TEST_F(StaticScheduleTest, BasicGraph) {
     if (time.first->name() == "Const/Const") {
       EXPECT_EQ(Costs::NanoSeconds(1), time.second);
     } else if (time.first->name() == "x") {
-      EXPECT_EQ(Costs::NanoSeconds(250002), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(250001), time.second);
     } else if (time.first->name() == "Square") {
-      EXPECT_EQ(Costs::NanoSeconds(1500005), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(1500004), time.second);
     } else if (time.first->name() == "Square_1") {
-      EXPECT_EQ(Costs::NanoSeconds(2750008), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(2750007), time.second);
     } else if (time.first->name() == "Square_2") {
-      EXPECT_EQ(Costs::NanoSeconds(4000011), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(4000010), time.second);
     } else if (time.first->name() == "Square_3") {
-      EXPECT_EQ(Costs::NanoSeconds(5250014), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(5250013), time.second);
     } else if (time.first->name() == "y") {
-      EXPECT_EQ(Costs::NanoSeconds(6500017), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(6500013), time.second);
     }
   }
 }
@@ -110,13 +110,13 @@ TEST_F(StaticScheduleTest, BasicGraphWithCtrlDependencies) {
     if (time.first->name() == "a") {
       EXPECT_EQ(Costs::NanoSeconds(1), time.second);
     } else if (time.first->name() == "b") {
-      EXPECT_EQ(Costs::NanoSeconds(12500026), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(12500001), time.second);
     } else if (time.first->name() == "c") {
-      EXPECT_EQ(Costs::NanoSeconds(12500027), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(12500002), time.second);
     } else if (time.first->name() == "d") {
-      EXPECT_EQ(Costs::NanoSeconds(12500028), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(12500003), time.second);
     } else if (time.first->name() == "e") {
-      EXPECT_EQ(Costs::NanoSeconds(25000053), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(25000003), time.second);
     }
   }
 }
@@ -142,17 +142,17 @@ TEST_F(StaticScheduleTest, RequiredTimes) {
 
   for (auto time : required_times) {
     if (time.first->name() == "Const/Const") {
-      EXPECT_EQ(Costs::NanoSeconds(-6500016), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-6500012), time.second);
     } else if (time.first->name() == "x") {
-      EXPECT_EQ(Costs::NanoSeconds(-6250015), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-6250012), time.second);
     } else if (time.first->name() == "Square") {
-      EXPECT_EQ(Costs::NanoSeconds(-5000012), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-5000009), time.second);
     } else if (time.first->name() == "Square_1") {
-      EXPECT_EQ(Costs::NanoSeconds(-3750009), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-3750006), time.second);
     } else if (time.first->name() == "Square_2") {
-      EXPECT_EQ(Costs::NanoSeconds(-2500006), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-2500003), time.second);
     } else if (time.first->name() == "Square_3") {
-      EXPECT_EQ(Costs::NanoSeconds(-1250003), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-1250000), time.second);
     } else if (time.first->name() == "y") {
       EXPECT_EQ(Costs::NanoSeconds(0), time.second);
     }
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index 07cf2cfc05d82f4caaf7a302146a37f94226927a..eb5a2c48dc8b12f7b4090e80c403e238a526e122 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -14,8 +14,10 @@ limitations under the License.
 ==============================================================================*/
 
 #include <memory>
+#include <vector>
 
 #include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/types.h"
@@ -113,41 +115,6 @@ void NodeMap::UpdateOutput(const string& node_name,
   outputs.insert(nodes_[NodeName(new_output_name)]);
 }
 
-OutputMap::OutputMap(GraphDef* graph) : graph_(graph) {
-  for (int i = 0; i < graph_->node_size(); i++) {
-    auto node = graph_->mutable_node(i);
-    auto rslt = nodes_.emplace(node->name(), node);
-    // Check that the graph doesn't contain multiple nodes with the same name.
-    CHECK(rslt.second);
-    for (const auto& input : node->input()) {
-      string input_node = NodeName(input);
-      if (outputs_[input_node].count(node) == 0) {
-        outputs_[input_node].insert(std::make_pair(node, 1));
-      } else {
-        outputs_[input_node][node]++;
-      }
-    }
-  }
-}
-
-NodeDef* OutputMap::GetNode(const string& name) const {
-  string node_name = NodeName(name);
-  auto it = nodes_.find(node_name);
-  if (it == nodes_.end()) {
-    return nullptr;
-  }
-  return it->second;
-}
-
-const std::unordered_map<NodeDef*, int>& OutputMap::GetOutputs(
-    const string& node_name) const {
-  auto it = outputs_.find(node_name);
-  if (it == outputs_.end()) {
-    return empty_map_;
-  }
-  return it->second;
-}
-
 bool IsSameInput(const string& name1, const string& name2) {
   if (name1 == name2) {
     return true;
@@ -165,7 +132,7 @@ string ParseNodeName(const string& name, int* position) {
   strings::Scanner scan(name);
   scan.ZeroOrOneLiteral("^")
       .RestartCapture()
-      .One(strings::Scanner::LETTER_DIGIT_DOT)
+      .One(strings::Scanner::LETTER_DIGIT_DOT_UNDERSCORE)
       .Any(strings::Scanner::LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE);
   StringPiece capture;
   StringPiece remaining;
@@ -241,7 +208,7 @@ string AsControlDependency(const string& node_name) {
              : strings::StrCat("^", node_name);
 }
 
-int NumOutputs(const NodeDef& node) {
+int NumOutputs(const NodeDef& node, GraphDef* graph) {
   int num_outputs = 0;
   const OpDef* op_def = nullptr;
   auto status = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def);
@@ -256,6 +223,12 @@ int NumOutputs(const NodeDef& node) {
         num_outputs++;
       }
     }
+  } else {
+    FunctionLibraryDefinition fdef(OpRegistry::Global(), graph->library());
+    auto status = fdef.LookUpOpDef(node.op(), &op_def);
+    if (status.ok()) {
+      num_outputs = op_def->output_arg_size();
+    }
   }
   return num_outputs;
 }
@@ -317,5 +290,117 @@ NodeDef* GetTailOfChain(const NodeDef& source, const NodeMap& node_map,
   return const_cast<NodeDef*>(current);
 }
 
+// Every permutation is a product of one or more cycles. Iterate over the cycles
+// in the permutation, and convert each of those into a product of
+// transpositions (swaps): https://en.wikipedia.org/wiki/Cyclic_permutation
+void PermuteNodesInPlace(GraphDef* graph, std::vector<int>* permutation,
+                         bool invert_permutation) {
+  CHECK_EQ(graph->node_size(), permutation->size());
+  std::vector<int> inv_perm(permutation->size(), 0);
+  if (invert_permutation) {
+    for (size_t n = 0; n < permutation->size(); ++n) {
+      inv_perm[(*permutation)[n]] = n;
+    }
+    permutation->swap(inv_perm);
+  }
+  for (std::size_t n = 0; n + 1 < permutation->size(); ++n) {
+    while (n != (*permutation)[n]) {
+      std::size_t r = (*permutation)[n];
+      graph->mutable_node()->SwapElements(n, r);
+      std::swap((*permutation)[n], (*permutation)[r]);
+    }
+  }
+}
+
+void DedupControlInputs(NodeDef* node) {
+  std::unordered_set<string> inputs;
+  int pos = 0;
+  while (pos < node->input_size()) {
+    const string& input = node->input(pos);
+    if (!inputs.insert(NodeName(input)).second && IsControlInput(input)) {
+      node->mutable_input()->SwapElements(pos, node->input_size() - 1);
+      node->mutable_input()->RemoveLast();
+    } else {
+      ++pos;
+    }
+  }
+}
+
+namespace {
+template <typename T>
+inline void STLSortAndRemoveDuplicates(T* v) {
+  std::sort(v->begin(), v->end());
+  v->erase(std::unique(v->begin(), v->end()), v->end());
+}
+}  // namespace
+
+Status SimpleGraphView::Initialize(const GraphDef& graph, bool dedup_inputs,
+                                   bool dedup_outputs) {
+  const int num_nodes = graph.node_size();
+  inputs_.clear();
+  inputs_.resize(num_nodes);
+  outputs_.clear();
+  outputs_.resize(num_nodes);
+  name_to_index_.clear();
+  name_to_index_.reserve(num_nodes);
+  index_to_name_.clear();
+  index_to_name_.reserve(num_nodes);
+
+  // Build map from name to index and vice versa.
+  for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
+    const NodeDef& node = graph.node(node_idx);
+    name_to_index_.emplace(node.name(), node_idx);
+    index_to_name_.push_back(node.name());
+  }
+
+  // Build forward and reverse adjacency lists.
+  for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
+    const NodeDef& node = graph.node(node_idx);
+    inputs_[node_idx].reserve(node.input_size());
+    for (const string& input : node.input()) {
+      auto it = name_to_index_.find(NodeName(input));
+      if (it == name_to_index_.end()) {
+        return errors::InvalidArgument("Non-existent input ", input,
+                                       " for node ", node.name());
+      }
+      const int input_idx = it->second;
+      inputs_[node_idx].push_back(input_idx);
+      outputs_[input_idx].push_back(node_idx);
+    }
+    if (dedup_inputs) {
+      // Dedup the input list while it's still hot in cache.
+      STLSortAndRemoveDuplicates(&inputs_[node_idx]);
+    }
+  }
+
+  // Dedup outputs.
+  if (dedup_outputs) {
+    for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
+      STLSortAndRemoveDuplicates(&outputs_[node_idx]);
+    }
+  }
+  return Status::OK();
+}
+
+string SimpleGraphView::PrintToString() const {
+  string str;
+  for (int i = 0; i < num_nodes(); ++i) {
+    strings::StrAppend(&str, "Node ", i, "'", node_name(i), "'\n", "Inputs: [");
+    for (int input : inputs(i)) {
+      strings::StrAppend(&str, input, " '", node_name(input), "', ");
+    }
+    strings::StrAppend(&str, "]\n", "Outputs: [");
+    for (int j = 0; j < outputs(i).size(); ++j) {
+      const int output = outputs(i)[j];
+      if (j > 0) {
+        strings::StrAppend(&str, ", ");
+      }
+      strings::StrAppend(&str, output, " '", node_name(output), "'");
+    }
+    strings::StrAppend(&str, "]\n");
+  }
+  return str;
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index 411e44d487da2acf7e74d1db8669c1f809e592c1..4ecb28f681507f50ad5909f15cf1b408ed6e2979 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -58,22 +59,6 @@ class NodeMap {
   std::unordered_map<string, std::set<NodeDef*>> outputs_;
 };
 
-// A utility class to lookup a node's outputs and the number of times it
-// presents in each output.
-class OutputMap {
- public:
-  explicit OutputMap(GraphDef* graph);
-  NodeDef* GetNode(const string& name) const;
-  const std::unordered_map<NodeDef*, int>& GetOutputs(
-      const string& node_name) const;
-
- private:
-  GraphDef* graph_;
-  std::unordered_map<NodeDef*, int> empty_map_;
-  std::unordered_map<string, NodeDef*> nodes_;
-  std::unordered_map<string, std::unordered_map<NodeDef*, int>> outputs_;
-};
-
 // A vector with a set. The set stores the same elements as the vector, and
 // quickly answers whether a value is in the vector. Duplicated elements are not
 // allowed for now.
@@ -150,7 +135,7 @@ string AsControlDependency(const string& node);
 
 // Returns the number of outputs of a node according to its OpDef. Note that
 // some of the outputs may be unconnected.
-int NumOutputs(const NodeDef& node);
+int NumOutputs(const NodeDef& node, GraphDef* graph);
 
 // Number of connected non-control inputs.
 int NumNonControlInputs(const NodeDef& node);
@@ -158,6 +143,9 @@ int NumNonControlInputs(const NodeDef& node);
 // Number of connected non-control outputs.
 int NumNonControlOutputs(const NodeDef& node, const NodeMap& node_map);
 
+// Removes redundant control inputs from node.
+void DedupControlInputs(NodeDef* node);
+
 // Returns the data type in attribute `attr_name` of `node`. If that attribute
 // doesn't exist, returns DT_INVALID.
 DataType GetDataTypeFromAttr(const NodeDef& node, const string& attr_name);
@@ -175,6 +163,43 @@ NodeDef* GetTailOfChain(const NodeDef& source, const NodeMap& node_map,
                         bool follow_control_input,
                         const std::function<bool(const NodeDef&)>& pred_fn);
 
+// Permute the nodes of graph in place according to the permutation.
+void PermuteNodesInPlace(GraphDef* graph, std::vector<int>* permutation,
+                         bool invert_permutation);
+
+class SimpleGraphView {
+ public:
+  Status Initialize(const GraphDef& graph) {
+    return Initialize(graph, true, true);
+  }
+  Status Initialize(const GraphDef& graph, bool dedup_inputs,
+                    bool dedup_outputs);
+
+  inline int num_nodes() const { return index_to_name_.size(); }
+  inline const int index(const string& node_name) const {
+    const auto& it = name_to_index_.find(node_name);
+    DCHECK(it != name_to_index_.end());
+    return it == name_to_index_.end() ? -1 : it->second;
+  }
+  inline const string& node_name(int node_idx) const {
+    return index_to_name_[node_idx];
+  }
+  inline const gtl::InlinedVector<int, 4>& inputs(int node_idx) const {
+    return inputs_[node_idx];
+  }
+  inline const gtl::InlinedVector<int, 2>& outputs(int node_idx) const {
+    return outputs_[node_idx];
+  }
+
+  string PrintToString() const;
+
+ private:
+  std::vector<string> index_to_name_;
+  std::unordered_map<string, int> name_to_index_;
+  std::vector<gtl::InlinedVector<int, 4>> inputs_;
+  std::vector<gtl::InlinedVector<int, 2>> outputs_;
+};
+
 }  // end namespace grappler
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index 21243833accff6ca3423c505091900564094557d..0a9dbe22cfe3cd01c2c61661adcdd4839a957f03 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -53,6 +53,7 @@ cc_library(
     hdrs = ["topological_sort.h"],
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:op_types",
@@ -98,3 +99,49 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
+
+cc_library(
+    name = "traversal",
+    srcs = ["traversal.cc"],
+    hdrs = ["traversal.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_view",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
+    ],
+)
+
+tf_cc_test(
+    name = "traversal_test",
+    srcs = ["traversal_test.cc"],
+    deps = [
+        ":traversal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "grappler_test",
+    testonly = 1,
+    srcs = [
+        "grappler_test.cc",
+    ],
+    hdrs = ["grappler_test.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:all_kernels",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:direct_session",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core/grappler:utils",
+    ],
+)
diff --git a/tensorflow/core/grappler/utils/frame.h b/tensorflow/core/grappler/utils/frame.h
index be726ae795769609769709746ce7bb74f849e37a..95b72748f4e1f13f1c61d64c4a457287e9d7d46b 100644
--- a/tensorflow/core/grappler/utils/frame.h
+++ b/tensorflow/core/grappler/utils/frame.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_FRAME_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_FRAME_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_FRAME_H_
+#define TENSORFLOW_CORE_GRAPPLER_UTILS_FRAME_H_
 
 #include <unordered_map>
 #include "tensorflow/core/framework/graph.pb.h"
@@ -40,4 +40,4 @@ Status IdentifyFramesWithNodeMap(const GraphDef& graph, const NodeMap& node_map,
 }  // namespace grappler
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_FRAME_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_UTILS_FRAME_H_
diff --git a/tensorflow/core/grappler/utils/grappler_test.cc b/tensorflow/core/grappler/utils/grappler_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..813f65f825759ca22dba2bdfd8433d946b7dd852
--- /dev/null
+++ b/tensorflow/core/grappler/utils/grappler_test.cc
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/grappler_test.h"
+#include <memory>
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+namespace grappler {
+
+std::vector<Tensor> GrapplerTest::EvaluateNodes(
+    const GraphDef& graph, const std::vector<string>& node_names) {
+  SessionOptions options;
+  std::unique_ptr<tensorflow::Session> session(NewSession(options));
+  TF_CHECK_OK(session->Create(graph));
+  RunOptions run_options;
+  std::vector<Tensor> output_tensors;
+  TF_CHECK_OK(session->Run(run_options, {}, node_names, node_names,
+                           &output_tensors, nullptr));
+  TF_CHECK_OK(session->Close());
+  return output_tensors;
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/grappler_test.h b/tensorflow/core/grappler/utils/grappler_test.h
new file mode 100644
index 0000000000000000000000000000000000000000..46ce47c8c3b6bc18b6eac76bbdb8ec1f8a58fab2
--- /dev/null
+++ b/tensorflow/core/grappler/utils/grappler_test.h
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPPLER_GRAPPLER_TEST_H_
+#define TENSORFLOW_GRAPPLER_GRAPPLER_TEST_H_
+
+#include <vector>
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class GrapplerTest : public ::testing::Test {
+ protected:
+  std::vector<Tensor> EvaluateNodes(const GraphDef& graph,
+                                    const std::vector<string>& node_names);
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_GRAPPLER_GRAPPLER_TEST_H_
diff --git a/tensorflow/core/grappler/utils/scc.h b/tensorflow/core/grappler/utils/scc.h
index 4e46169971ac5a92b79370c01d4634cf9e6c1b96..4fb7aab6474c35eaa9d3ebbb93f0a70ab16c5fb4 100644
--- a/tensorflow/core/grappler/utils/scc.h
+++ b/tensorflow/core/grappler/utils/scc.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_SCC_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_SCC_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_SCC_H_
+#define TENSORFLOW_CORE_GRAPPLER_UTILS_SCC_H_
 
 #include <unordered_map>
 #include "tensorflow/core/framework/graph.pb.h"
@@ -43,4 +43,4 @@ int IdentifyLoops(const GraphDef& graph,
 }  // namespace grappler
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_SCC_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_UTILS_SCC_H_
diff --git a/tensorflow/core/grappler/utils/topological_sort.cc b/tensorflow/core/grappler/utils/topological_sort.cc
index 77d4702d21e75b1689875eb17fbd2cda41aa1ba8..8d8ff4da3a8df5a2868f1a3a0ac6a5d0c2fd66ad 100644
--- a/tensorflow/core/grappler/utils/topological_sort.cc
+++ b/tensorflow/core/grappler/utils/topological_sort.cc
@@ -19,61 +19,56 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace grappler {
 
 // Kahn's algorithm is implemented.
 // For details, see https://en.wikipedia.org/wiki/Topological_sorting
-void TopologicalSort(GraphDef* graph) {
-  OutputMap output_map(graph);
-  std::vector<NodeDef*> ready_nodes;
-  ready_nodes.reserve(graph->node_size());
+Status TopologicalSort(GraphDef* graph) {
+  SimpleGraphView graph_view;
+  TF_RETURN_IF_ERROR(graph_view.Initialize(*graph));
+
+  std::vector<int> ready_nodes;
+  ready_nodes.reserve(graph_view.num_nodes());
+
   int front = 0;
   int back = 0;
-  std::unordered_map<const NodeDef*, int> ready_inputs;
-  for (int i = 0; i < graph->node_size(); i++) {
-    auto node = graph->mutable_node(i);
-    if (node->input_size() == 0) {
-      ready_nodes.push_back(node);
+  std::vector<int> num_ready_inputs(graph_view.num_nodes(), 0);
+  for (int i = 0; i < graph_view.num_nodes(); i++) {
+    if (graph_view.inputs(i).empty()) {
+      ready_nodes.push_back(i);
       back++;
     }
-    if (IsMerge(*node)) {
-      ready_inputs[node] = 0;
-      for (const auto& input : node->input()) {
-        if (IsNextIteration(*output_map.GetNode(input))) {
-          ready_inputs[node]++;
+    if (IsMerge(graph->node(i))) {
+      for (int input : graph_view.inputs(i)) {
+        if (IsNextIteration(graph->node(input))) {
+          num_ready_inputs[i]++;
         }
       }
-    } else {
-      ready_inputs[node] = 0;
     }
   }
 
   while (front != back) {
-    auto ready_node = ready_nodes[front];
-    for (const auto& fanout_pair : output_map.GetOutputs(ready_node->name())) {
-      auto fanout = fanout_pair.first;
-      ready_inputs[fanout] += fanout_pair.second;
-      if (ready_inputs[fanout] == fanout->input_size()) {
+    int ready_node = ready_nodes[front];
+    for (int fanout : graph_view.outputs(ready_node)) {
+      ++num_ready_inputs[fanout];
+      if (num_ready_inputs[fanout] == graph_view.inputs(fanout).size()) {
         ready_nodes.push_back(fanout);
-        back++;
+        ++back;
       }
     }
-    front++;
+    ++front;
   }
 
-  if (back == graph->node_size()) {
-    GraphDef new_graph;
-    new_graph.mutable_node()->Reserve(graph->node_size());
-    for (int i = 0; i < graph->node_size(); i++) {
-      auto new_node = new_graph.add_node();
-      new_node->Swap(ready_nodes[i]);
-    }
-    graph->mutable_node()->Swap(new_graph.mutable_node());
-  } else {
-    LOG(ERROR) << "The graph couldn't be sorted in topological order.";
+  if (back != graph_view.num_nodes()) {
+    return errors::InvalidArgument(
+        "The graph couldn't be sorted in topological order.");
   }
+
+  PermuteNodesInPlace(graph, &ready_nodes, /*invert_permutation=*/true);
+  return Status::OK();
 }
 
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/utils/topological_sort.h b/tensorflow/core/grappler/utils/topological_sort.h
index d4d8034ef577a0282dbce161aed8ba440bf248ab..7700fe41e40e6d1111c9e84aabfd2a05968ef882 100644
--- a/tensorflow/core/grappler/utils/topological_sort.h
+++ b/tensorflow/core/grappler/utils/topological_sort.h
@@ -13,18 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
+#define TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
 
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace grappler {
 
 // Sort a graph in topological order.
-void TopologicalSort(GraphDef* graph);
+Status TopologicalSort(GraphDef* graph);
 
 }  // namespace grappler
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
diff --git a/tensorflow/core/grappler/utils/topological_sort_test.cc b/tensorflow/core/grappler/utils/topological_sort_test.cc
index dc99cb1052ce9db3035401a2cd75e838281fb748..c96f15b0e8424d70e8dd1393cf254b52f69200d2 100644
--- a/tensorflow/core/grappler/utils/topological_sort_test.cc
+++ b/tensorflow/core/grappler/utils/topological_sort_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -51,7 +52,7 @@ TEST_F(TopologicalSortTest, NoLoop) {
   *graph.add_node() = CreateNode("5", {});
   *graph.add_node() = CreateNode("4", {});
 
-  TopologicalSort(&graph);
+  TF_EXPECT_OK(TopologicalSort(&graph));
   std::vector<string> order = {"5", "4", "2", "0", "3", "1"};
   for (int i = 0; i < order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
@@ -67,7 +68,7 @@ TEST_F(TopologicalSortTest, WithLoop) {
   *graph.add_node() = CreateNode("5", "NextIteration", {"4"});
   *graph.add_node() = CreateNode("1", {});
 
-  TopologicalSort(&graph);
+  TF_EXPECT_OK(TopologicalSort(&graph));
   std::vector<string> order = {"1", "2", "3", "4", "5"};
   for (int i = 0; i < order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
@@ -82,7 +83,7 @@ TEST_F(TopologicalSortTest, WithIllegalLoop) {
   *graph.add_node() = CreateNode("3", {"2"});
   *graph.add_node() = CreateNode("1", {});
 
-  TopologicalSort(&graph);
+  EXPECT_FALSE(TopologicalSort(&graph).ok());
   std::vector<string> order = {"2", "3", "1"};
   for (int i = 0; i < order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
@@ -94,13 +95,34 @@ TEST_F(TopologicalSortTest, DuplicatedInputs) {
   *graph.add_node() = CreateNode("2", {"1", "1"});
   *graph.add_node() = CreateNode("1", {});
 
-  TopologicalSort(&graph);
+  TF_EXPECT_OK(TopologicalSort(&graph));
   std::vector<string> order = {"1", "2"};
   for (int i = 0; i < order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
   }
 }
 
+TEST_F(TopologicalSortTest, Idempotent) {
+  GraphDef graph;
+  *graph.add_node() = CreateNode("1", {});
+  *graph.add_node() = CreateNode("2", {});
+  *graph.add_node() = CreateNode("3", {"1", "2"});
+  *graph.add_node() = CreateNode("4", {"1", "3"});
+  *graph.add_node() = CreateNode("5", {"2", "3"});
+
+  TF_EXPECT_OK(TopologicalSort(&graph));
+  std::vector<string> order = {"1", "2", "3", "4", "5"};
+  for (int i = 0; i < order.size(); i++) {
+    EXPECT_EQ(graph.node(i).name(), order[i]);
+  }
+
+  // Run topo sort again to verify that it is idenpotent.
+  TF_EXPECT_OK(TopologicalSort(&graph));
+  for (int i = 0; i < order.size(); i++) {
+    EXPECT_EQ(graph.node(i).name(), order[i]);
+  }
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/traversal.cc b/tensorflow/core/grappler/utils/traversal.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f44f53c4e63805544fa480628e805303064edb3d
--- /dev/null
+++ b/tensorflow/core/grappler/utils/traversal.cc
@@ -0,0 +1,80 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/traversal.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+void ReverseDfs(const GraphView& graph_view, const std::vector<NodeDef*>& from,
+                const std::function<void(NodeDef*)>& pre_order,
+                const std::function<void(NodeDef*)>& post_order,
+                const std::function<void(NodeDef*, NodeDef*)>& on_back_edge) {
+  // Stack of work to do.
+  struct StackElem {
+    NodeDef* node;
+    bool children_visited;
+    NodeDef* src;
+  };
+  std::vector<StackElem> stack;
+
+  stack.reserve(from.size());
+  for (NodeDef* node : from) {
+    stack.push_back(StackElem{node, false});
+  }
+
+  enum NodeState { NOT_VISITED = 0, VISITING = 1, DONE = 2 };
+  std::unordered_map<NodeDef*, NodeState> node_state;
+  while (!stack.empty()) {
+    StackElem w = stack.back();
+    stack.pop_back();
+
+    if (w.children_visited) {
+      // We've processed all the children of this node
+      node_state[w.node] = DONE;
+      if (post_order) {
+        post_order(w.node);
+      }
+      continue;
+    }
+
+    auto& rslt = node_state[w.node];
+    if (rslt == DONE) {
+      continue;
+    } else if (rslt == VISITING) {
+      // Loop detected
+      if (on_back_edge) {
+        on_back_edge(w.src, w.node);
+      }
+      continue;
+    }
+    rslt = VISITING;
+    if (pre_order) {
+      pre_order(w.node);
+    }
+
+    // Enqueue the node again with the children_visited flag set to true.
+    stack.push_back(StackElem{w.node, true, w.src});
+
+    // Now enqueu the node children.
+    for (const auto fanin : graph_view.GetFanins(*w.node, true)) {
+      stack.push_back(StackElem{fanin.node, false, w.node});
+    }
+  }
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/traversal.h b/tensorflow/core/grappler/utils/traversal.h
new file mode 100644
index 0000000000000000000000000000000000000000..bb3fa090e8fdaf12ed6dcb18eb1511c55496a125
--- /dev/null
+++ b/tensorflow/core/grappler/utils/traversal.h
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_TRAVERSAL_H_
+#define TENSORFLOW_CORE_GRAPPLER_UTILS_TRAVERSAL_H_
+
+#include <functional>
+#include "tensorflow/core/grappler/graph_view.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Traverse the graph in reverse dfs order, starting from the list of nodes
+// specified in the 'from' argument. The pre_order and post_order functors will
+// be called on each reachable node (including the 'from' nodes) in pre and post
+// order. If loops are found, the on_back_edge functor will be called on the
+// corresponding back edges. Moreover, the pre and post order will assume that
+// these back edges will be cut.
+void ReverseDfs(const GraphView& graph_view, const std::vector<NodeDef*>& from,
+                const std::function<void(NodeDef*)>& pre_order,
+                const std::function<void(NodeDef*)>& post_order,
+                const std::function<void(NodeDef*, NodeDef*)>& on_back_edge);
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_UTILS_TRAVERSAL_H_
diff --git a/tensorflow/core/grappler/utils/traversal_test.cc b/tensorflow/core/grappler/utils/traversal_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cc68bd1a9637cb6f61955e8fa5d495a34f19cb09
--- /dev/null
+++ b/tensorflow/core/grappler/utils/traversal_test.cc
@@ -0,0 +1,101 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/traversal.h"
+//#include "tensorflow/core/framework/node_def.pb.h"
+//#include "tensorflow/core/lib/core/status_test_util.h"
+//#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class TraversalTest : public ::testing::Test {
+ protected:
+  static NodeDef CreateNode(const string& name,
+                            const std::vector<string>& inputs) {
+    return CreateNode(name, "", inputs);
+  }
+  static NodeDef CreateNode(const string& name, const string& op,
+                            const std::vector<string>& inputs) {
+    NodeDef node;
+    node.set_name(name);
+    if (!op.empty()) {
+      node.set_op(op);
+    }
+    for (const string& input : inputs) {
+      node.add_input(input);
+    }
+    return node;
+  }
+};
+
+TEST_F(TraversalTest, ReverseDfsNoLoop) {
+  GraphDef graph;
+  *graph.add_node() = CreateNode("2", {"5"});
+  *graph.add_node() = CreateNode("0", {"5", "4"});
+  *graph.add_node() = CreateNode("1", {"4", "3"});
+  *graph.add_node() = CreateNode("3", {"2"});
+  *graph.add_node() = CreateNode("5", {});
+  *graph.add_node() = CreateNode("4", {});
+
+  std::vector<NodeDef*> start_nodes = {graph.mutable_node(1),
+                                       graph.mutable_node(2)};
+  std::vector<string> pre_order;
+  std::vector<string> post_order;
+  bool found_back_edge = false;
+  ReverseDfs(
+      GraphView(&graph), start_nodes,
+      [&pre_order](NodeDef* n) { pre_order.push_back(n->name()); },
+      [&post_order](NodeDef* n) { post_order.push_back(n->name()); },
+      [&found_back_edge](NodeDef*, NodeDef*) { found_back_edge = true; });
+
+  EXPECT_EQ(std::vector<string>({"1", "4", "3", "2", "5", "0"}), pre_order);
+  EXPECT_EQ(std::vector<string>({"4", "5", "2", "3", "1", "0"}), post_order);
+  EXPECT_FALSE(found_back_edge);
+}
+
+TEST_F(TraversalTest, ReverseDfsWithLoop) {
+  GraphDef graph;
+  // Create a loop
+  *graph.add_node() = CreateNode("2", "Merge", {"1", "5"});
+  *graph.add_node() = CreateNode("3", "Switch", {"2"});
+  *graph.add_node() = CreateNode("4", "Identity", {"3"});
+  *graph.add_node() = CreateNode("5", "NextIteration", {"4"});
+  *graph.add_node() = CreateNode("1", "Enter", {});
+  *graph.add_node() = CreateNode("6", "Exit", {"3"});
+
+  std::vector<NodeDef*> start_nodes = {graph.mutable_node(5)};
+  std::vector<string> pre_order;
+  std::vector<string> post_order;
+  std::vector<string> back_edges;
+  ReverseDfs(
+      GraphView(&graph), start_nodes,
+      [&pre_order](NodeDef* n) { pre_order.push_back(n->name()); },
+      [&post_order](NodeDef* n) { post_order.push_back(n->name()); },
+      [&back_edges](NodeDef* src, NodeDef* dst) {
+        back_edges.push_back(strings::StrCat(src->name(), "->", dst->name()));
+      });
+
+  EXPECT_EQ(std::vector<string>({"6", "3", "2", "1", "5", "4"}), pre_order);
+  EXPECT_EQ(std::vector<string>({"1", "4", "5", "2", "3", "6"}), post_order);
+  EXPECT_EQ(std::vector<string>({"4->3"}), back_edges);
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils_test.cc b/tensorflow/core/grappler/utils_test.cc
index 77371c399e5fc7321f7c2b271aae32ce9655244b..eabce5b5ee7b037b7bc429abfa86ee8735bdbede 100644
--- a/tensorflow/core/grappler/utils_test.cc
+++ b/tensorflow/core/grappler/utils_test.cc
@@ -29,83 +29,84 @@ namespace {
 class UtilsTest : public ::testing::Test {
  protected:
   NodeDef CreateConcatOffsetNode() const {
-    const string gdef_ascii = R"EOF(
-name: "gradients/InceptionV3/Mixed_7c/Branch_1/concat_v2_grad/ConcatOffset"
-op: "ConcatOffset"
-input: "InceptionV3/Mixed_7c/Branch_1/concat_v2/axis"
-input: "gradients/InceptionV3/Mixed_7c/Branch_1/concat_v2_grad/Shape"
-input: "gradients/InceptionV3/Mixed_7c/Branch_1/concat_v2_grad/Shape_1"
-attr {
-  key: "N"
-  value {
-    i: 2
-  }
-}
-    )EOF";
+    const string gdef_ascii =
+        " name: 'gradients/InceptionV3/Mixed_7c/Branch_1/concat_v2_grad/"
+        "ConcatOffset'"
+        " op: 'ConcatOffset'"
+        " input: 'InceptionV3/Mixed_7c/Branch_1/concat_v2/axis'"
+        " input: 'gradients/InceptionV3/Mixed_7c/Branch_1/concat_v2_grad/Shape'"
+        " input: "
+        " 'gradients/InceptionV3/Mixed_7c/Branch_1/concat_v2_grad/Shape_1'"
+        " attr {"
+        "  key: 'N'"
+        "  value {"
+        "    i: 2"
+        "  }"
+        " }";
     NodeDef node;
     CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &node));
     return node;
   }
 
   NodeDef CreateDequeueNode() const {
-    const string gdef_ascii = R"EOF(
-name: "Train/TrainInput/input_producer_Dequeue"
-op: "QueueDequeueV2"
-input: "Train/TrainInput/input_producer"
-attr {
-  key: "component_types"
-  value {
-    list {
-      type: DT_INT32
-    }
-  }
-}
-attr {
-  key: "timeout_ms"
-  value {
-    i: -1
-  }
-}
-    )EOF";
+    const string gdef_ascii =
+        " name: 'Train/TrainInput/input_producer_Dequeue'"
+        " op: 'QueueDequeueV2'"
+        " input: 'Train/TrainInput/input_producer'"
+        " attr {"
+        "  key: 'component_types'"
+        "   value {"
+        "     list {"
+        "       type: DT_INT32"
+        "     }"
+        "   }"
+        " }"
+        " attr {"
+        "   key: 'timeout_ms'"
+        "   value {"
+        "     i: -1"
+        "   }"
+        " }";
+
     NodeDef node;
     CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &node));
     return node;
   }
 
   NodeDef CreateFusedBatchNormNode() const {
-    const string gdef_ascii = R"EOF(
-name: "InceptionV3/Conv2d_1a_3x3/BatchNorm/FusedBatchNorm"
-op: "FusedBatchNorm"
-input: "InceptionV3/Conv2d_1a_3x3/BatchNorm/FusedBatchNorm"
-input: "InceptionV3/Conv2d_1a_3x3/BatchNorm/gamma/read"
-input: "InceptionV3/Conv2d_1a_3x3/BatchNorm/beta/read"
-input: "InceptionV3/Conv2d_1a_3x3/BatchNorm/Const"
-input: "InceptionV3/Conv2d_1a_3x3/BatchNorm/Const_1"
-attr {
-  key: "T"
-  value {
-    type: DT_FLOAT
-  }
-}
-attr {
-  key: "data_format"
-  value {
-    s: "NHWC"
-  }
-}
-attr {
-  key: "epsilon"
-  value {
-    f: 0.001
-  }
-}
-attr {
-  key: "is_training"
-  value {
-    b: true
-  }
-}
-    )EOF";
+    const string gdef_ascii =
+        " name: 'InceptionV3/Conv2d_1a_3x3/BatchNorm/FusedBatchNorm'"
+        " op: 'FusedBatchNorm'"
+        " input: 'InceptionV3/Conv2d_1a_3x3/BatchNorm/FusedBatchNorm'"
+        " input: 'InceptionV3/Conv2d_1a_3x3/BatchNorm/gamma/read'"
+        " input: 'InceptionV3/Conv2d_1a_3x3/BatchNorm/beta/read'"
+        " input: 'InceptionV3/Conv2d_1a_3x3/BatchNorm/Const'"
+        " input: 'InceptionV3/Conv2d_1a_3x3/BatchNorm/Const_1'"
+        " attr {"
+        "   key: 'T'"
+        "   value {"
+        "     type: DT_FLOAT"
+        "   }"
+        " }"
+        " attr {"
+        "   key: 'data_format'"
+        "   value {"
+        "     s: 'NHWC'"
+        "   }"
+        " }"
+        " attr {"
+        "   key: 'epsilon'"
+        "   value {"
+        "     f: 0.001"
+        "   }"
+        " }"
+        " attr {"
+        "   key: 'is_training'"
+        "   value {"
+        "     b: true"
+        "   }"
+        " }";
+
     NodeDef node;
     CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &node));
     return node;
@@ -177,9 +178,10 @@ TEST_F(UtilsTest, ExecuteWithTimeout) {
 }
 
 TEST_F(UtilsTest, NumOutputs) {
-  EXPECT_EQ(2, NumOutputs(CreateConcatOffsetNode()));
-  EXPECT_EQ(5, NumOutputs(CreateFusedBatchNormNode()));
-  EXPECT_EQ(1, NumOutputs(CreateDequeueNode()));
+  GraphDef graph;
+  EXPECT_EQ(2, NumOutputs(CreateConcatOffsetNode(), &graph));
+  EXPECT_EQ(5, NumOutputs(CreateFusedBatchNormNode(), &graph));
+  EXPECT_EQ(1, NumOutputs(CreateDequeueNode(), &graph));
 }
 
 TEST_F(UtilsTest, AsControlDependency) {
@@ -249,6 +251,49 @@ TEST_F(UtilsTest, GetTailOfChain) {
   EXPECT_EQ("noop", tail->name());
 }
 
+TEST_F(UtilsTest, DedupControlInputs) {
+  NodeDef foo;
+  foo.set_name("foo");
+  foo.add_input("bar");
+  DedupControlInputs(&foo);
+  EXPECT_EQ(1, foo.input_size());
+  EXPECT_EQ("bar", foo.input(0));
+
+  foo.set_input(0, "^bar");
+  DedupControlInputs(&foo);
+  EXPECT_EQ(1, foo.input_size());
+  EXPECT_EQ("^bar", foo.input(0));
+
+  foo.set_input(0, "bar");
+  foo.add_input("bar");
+  DedupControlInputs(&foo);
+  EXPECT_EQ(2, foo.input_size());
+  EXPECT_EQ("bar", foo.input(0));
+  EXPECT_EQ("bar", foo.input(1));
+
+  foo.set_input(1, "^bar");
+  DedupControlInputs(&foo);
+  EXPECT_EQ(1, foo.input_size());
+  EXPECT_EQ("bar", foo.input(0));
+
+  foo.set_input(0, "^bar");
+  foo.add_input("^bar");
+  DedupControlInputs(&foo);
+  EXPECT_EQ(1, foo.input_size());
+  EXPECT_EQ("^bar", foo.input(0));
+
+  foo.set_input(0, "bar");
+  foo.add_input("gnu");
+  foo.add_input("^bar");
+  foo.add_input("^gnu");
+  DedupControlInputs(&foo);
+  EXPECT_EQ(2, foo.input_size());
+  EXPECT_EQ("bar", foo.input(0));
+  EXPECT_EQ("gnu", foo.input(1));
+}
+
+TEST_F(UtilsTest, DeleteNodes) {}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 21411097e8432c90dc94ed12c57aac4dab4b3700..523e3956996de2f1cd5a5626b15dfff73022a9d5 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -38,6 +38,7 @@ load(
     "tf_mkl_kernel_library",
     "cc_header_only_library",
     "if_not_windows",
+    "if_override_eigen_strong_inline",
 )
 load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
@@ -194,10 +195,9 @@ cc_library(
     ],
 )
 
-cc_library(
+tf_kernel_library(
     name = "fill_functor",
-    srcs = ["fill_functor.cc"],
-    hdrs = ["fill_functor.h"],
+    prefix = "fill_functor",
     deps = [
         "//tensorflow/core:framework",
         "//third_party/eigen3",
@@ -269,13 +269,11 @@ cc_library(
 cc_library(
     name = "conv_ops_gpu_hdrs",
     hdrs = ["conv_ops_gpu.h"],
-    deps = ["//third_party/eigen3"],
 )
 
 cc_library(
     name = "gpu_util_hdrs",
     hdrs = ["gpu_utils.h"],
-    deps = ["//third_party/eigen3"],
 )
 
 tf_cc_test(
@@ -371,6 +369,22 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "batch_kernels",
+    srcs = ["batch_kernels.cc"],
+    deps = [
+        "//tensorflow/core:batch_ops_op_lib",
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/kernels:concat_lib_hdrs",
+        "//tensorflow/core/kernels:ops_util_hdrs",
+        "//tensorflow/core/kernels:split_lib_hdrs",
+        "//tensorflow/core/kernels/batching_util:periodic_function_dynamic",
+        "//tensorflow/core/kernels/batching_util:shared_batch_scheduler_hdrs",
+    ],
+    alwayslink = 1,
+)
+
 tf_kernel_library(
     name = "record_input_op",
     srcs = [
@@ -589,6 +603,7 @@ cc_library(
         ":extract_image_patches_op",
         ":gather_nd_op",
         ":gather_op",
+        ":guarantee_const_op",
         ":identity_n_op",
         ":identity_op",
         ":inplace_ops",
@@ -606,6 +621,7 @@ cc_library(
         ":reverse_sequence_op",
         ":shape_ops",
         ":slice_op",
+        ":snapshot_op",
         ":split_op",
         ":split_v_op",
         ":strided_slice_op",
@@ -613,6 +629,7 @@ cc_library(
         ":transpose_op",
         ":unique_op",
         ":unpack_op",
+        ":unravel_index_op",
         ":where_op",
     ],
 )
@@ -635,6 +652,12 @@ tf_kernel_library(
     deps = ARRAY_DEPS,
 )
 
+tf_kernel_library(
+    name = "guarantee_const_op",
+    prefix = "guarantee_const_op",
+    deps = ARRAY_DEPS,
+)
+
 tf_kernel_library(
     name = "constant_op",
     prefix = "constant_op",
@@ -796,6 +819,12 @@ tf_kernel_library(
     deps = ARRAY_DEPS + [":strided_slice_op"],
 )
 
+tf_kernel_library(
+    name = "snapshot_op",
+    prefix = "snapshot_op",
+    deps = ARRAY_DEPS,
+)
+
 tf_kernel_library(
     name = "split_op",
     gpu_srcs = ["cuda_device_array.h"],
@@ -855,6 +884,12 @@ tf_kernel_library(
     deps = ARRAY_DEPS + [":split_lib"],
 )
 
+tf_kernel_library(
+    name = "unravel_index_op",
+    prefix = "unravel_index_op",
+    deps = ARRAY_DEPS,
+)
+
 tf_kernel_library(
     name = "where_op",
     srcs = ["where_op.cc"],
@@ -1193,6 +1228,25 @@ tf_cuda_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "guarantee_const_op_test",
+    size = "small",
+    srcs = ["guarantee_const_op_test.cc"],
+    deps = [
+        ":guarantee_const_op",
+        ":ops_testutil",
+        ":ops_util",
+        ":variable_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_cc_test(
     name = "identity_op_test",
     size = "small",
@@ -1836,6 +1890,7 @@ tf_kernel_library(
     srcs = ["resource_variable_ops.cc"],
     deps = [
         ":bounds_check",
+        ":critical_section",
         ":dense_update_functor",
         ":gather_functor",
         ":scatter_functor",
@@ -1849,6 +1904,23 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "list_kernels",
+    srcs = ["list_kernels.cc"],
+    hdrs = ["list_kernels.h"],
+    gpu_srcs = [
+        "list_kernels.cu.cc",
+        "list_kernels.h",
+    ],
+    deps = [
+        ":concat_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:list_ops_op_lib",
+        "//third_party/eigen3",
+    ],
+)
+
 tf_kernel_library(
     name = "fact_op",
     prefix = "fact_op",
@@ -1870,6 +1942,17 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "functional_ops",
+    prefix = "functional_ops",
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:functional_ops_op_lib",
+        "//tensorflow/core:lib",
+        "//third_party/eigen3",
+    ],
+)
+
 cc_library(
     name = "image",
     deps = [
@@ -2298,6 +2381,7 @@ cc_library(
         ":determinant_op",
         ":matrix_exponential_op",
         ":matrix_inverse_op",
+        ":matrix_logarithm_op",
         ":matrix_solve_ls_op",
         ":matrix_solve_op",
         ":matrix_triangular_solve_op",
@@ -2369,6 +2453,12 @@ tf_kernel_library(
     deps = LINALG_DEPS,
 )
 
+tf_kernel_library(
+    name = "matrix_logarithm_op",
+    prefix = "matrix_logarithm_op",
+    deps = LINALG_DEPS,
+)
+
 tf_kernel_library(
     name = "self_adjoint_eig_op",
     prefix = "self_adjoint_eig_op",
@@ -2510,6 +2600,45 @@ tf_cc_tests(
     ],
 )
 
+cc_library(
+    name = "manip",
+    deps = [
+        ":roll_op",
+    ],
+)
+
+MANIP_DEPS = [
+    "//tensorflow/core:framework",
+    "//tensorflow/core:lib",
+    "//tensorflow/core:manip_ops_op_lib",
+    "//third_party/eigen3",
+]
+
+tf_kernel_library(
+    name = "roll_op",
+    prefix = "roll_op",
+    deps = MANIP_DEPS,
+)
+
+tf_cc_test(
+    name = "roll_op_test",
+    size = "small",
+    srcs = ["roll_op_test.cc"],
+    deps = [
+        ":ops_testutil",
+        ":ops_util",
+        ":roll_op",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 MATH_DEPS = [
     ":bounds_check",
     ":fill_functor",
@@ -3012,6 +3141,7 @@ tf_kernel_library(
         "//conditions:default": [],
     }),
     hdrs = [
+        "fill_functor.h",
         "conv_grad_ops.h",
         "deep_conv2d.h",
         "gemm_functors.h",
@@ -3020,6 +3150,10 @@ tf_kernel_library(
         ":xsmm": ["xsmm_conv2d.h"],
         "//conditions:default": [],
     }),
+    # Override EIGEN_STRONG_INLINE to inline when --define=override_eigen_strong_inline=true,
+    # So that it doesn't take 20 minutes to compile conv_grad_ops_3d.cc and conv_ops_3d.cc
+    # on Windows. See https://github.com/tensorflow/tensorflow/issues/10521
+    copts = if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]),
     defines = select({
         ":xsmm": [
             "TENSORFLOW_USE_LIBXSMM",
@@ -3036,6 +3170,7 @@ tf_kernel_library(
         ":conv_2d",
         ":conv_3d",
         ":image_resizer_state",
+        ":fill_functor",
         ":ops_util",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
@@ -3089,6 +3224,7 @@ cc_library(
         ":batch_norm_op",
         ":bias_op",
         ":conv_ops",
+        ":data_format_ops",
         ":depthwise_conv_grad_op",
         ":depthwise_conv_op",
         ":dilation_ops",
@@ -3126,6 +3262,12 @@ tf_kernel_library(
     deps = NN_DEPS,
 )
 
+tf_kernel_library(
+    name = "data_format_ops",
+    prefix = "data_format_ops",
+    deps = NN_DEPS,
+)
+
 tf_kernel_library(
     name = "bias_op",
     prefix = "bias_op",
@@ -3135,7 +3277,9 @@ tf_kernel_library(
 tf_kernel_library(
     name = "fused_batch_norm_op",
     prefix = "fused_batch_norm_op",
-    deps = NN_DEPS,
+    deps = NN_DEPS + [
+        ":fill_functor",
+    ],
 )
 
 tf_kernel_library(
@@ -3332,6 +3476,7 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
     ],
@@ -3457,6 +3602,7 @@ tf_kernel_library(
 cc_library(
     name = "parsing",
     deps = [
+        ":decode_compressed_op",
         ":decode_csv_op",
         ":decode_raw_op",
         ":example_parsing_ops",
@@ -3485,6 +3631,14 @@ tf_kernel_library(
     deps = PARSING_DEPS,
 )
 
+tf_kernel_library(
+    name = "decode_compressed_op",
+    prefix = "decode_compressed_op",
+    deps = [
+        "//tensorflow/core:lib_internal",
+    ] + PARSING_DEPS,
+)
+
 tf_kernel_library(
     name = "example_parsing_ops",
     prefix = "example_parsing_ops",
@@ -3914,6 +4068,8 @@ tf_kernel_library(
         "scatter_nd_op_cpu_impl_3.cc",
         "scatter_nd_op_cpu_impl_4.cc",
         "scatter_nd_op_cpu_impl_5.cc",
+        "scatter_nd_op_cpu_impl_6.cc",
+        "scatter_nd_op_cpu_impl_7.cc",
     ],
     hdrs = [
         "scatter_nd_op.h",
@@ -3923,7 +4079,11 @@ tf_kernel_library(
         "scatter_nd_op.h",
         "scatter_nd_op_gpu.cu.cc",
     ],
-    deps = STATE_DEPS + [":dense_update_functor"],
+    deps = STATE_DEPS + [
+        ":dense_update_functor",
+        ":training_op_helpers",
+        ":variable_ops",
+    ],
 )
 
 tf_kernel_library(
@@ -3932,6 +4092,12 @@ tf_kernel_library(
     deps = STATE_DEPS,
 )
 
+tf_kernel_library(
+    name = "critical_section",
+    prefix = "critical_section",
+    deps = STATE_DEPS + [":captured_function"],
+)
+
 tf_cc_test(
     name = "scatter_op_test",
     size = "small",
@@ -4175,7 +4341,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//third_party/fft2d:fft2d_headers",
-        "@fft2d//:fft2d",
+        "@fft2d",
     ],
 )
 
@@ -4482,6 +4648,8 @@ filegroup(
         "gather_nd_op_cpu_impl_3.cc",
         "gather_nd_op_cpu_impl_4.cc",
         "gather_nd_op_cpu_impl_5.cc",
+        "gather_nd_op_cpu_impl_6.cc",
+        "gather_nd_op_cpu_impl_7.cc",
         "gather_op.cc",
         "identity_n_op.cc",
         "identity_n_op.h",
@@ -4569,6 +4737,7 @@ filegroup(
         "control_flow_ops.h",
         "conv_2d.h",
         "conv_ops.h",
+        "data_format_ops.h",
         "depthtospace_op.h",
         "depthwise_conv_op.h",
         "fake_quant_ops_functor.h",
@@ -4682,6 +4851,7 @@ filegroup(
         "cwise_op_squared_difference.cc",
         "cwise_op_sub.cc",
         "cwise_op_tanh.cc",
+        "data_format_ops.cc",
         "decode_wav_op.cc",
         "deep_conv2d.cc",
         "deep_conv2d.h",
@@ -4874,10 +5044,11 @@ filegroup(
             "summary_interface.*",
             "summary_kernels.*",
             "spectrogram_convert_test_data.cc",
-            "sql_dataset_ops.cc",
             # Excluded due to experimental status:
             "debug_ops.*",
             "scatter_nd_op*",
+            "critical_section.*",
+            "batch_kernels.*",
         ],
     ),
     visibility = ["//visibility:public"],
@@ -4910,8 +5081,8 @@ cc_library(
         "//tensorflow/core:protos_all_cc_impl",
         "//third_party/eigen3",
         "//third_party/fft2d:fft2d_headers",
-        "@fft2d//:fft2d",
-        "@gemmlowp//:gemmlowp",
+        "@fft2d",
+        "@gemmlowp",
         "@protobuf_archive//:protobuf",
     ],
     alwayslink = 1,
@@ -4982,7 +5153,7 @@ tf_kernel_library(
         "//tensorflow/core:math_ops_op_lib",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
-        "@gemmlowp//:gemmlowp",
+        "@gemmlowp",
     ],
 )
 
@@ -5732,6 +5903,23 @@ tf_mkl_kernel_library(
     ]),
 )
 
+tf_mkl_kernel_library(
+    name = "mkl_softmax_op",
+    prefix = "mkl_softmax",
+    deps = [
+        ":bounds_check",
+        ":ops_util",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:nn_ops_op_lib",
+    ] + if_mkl([
+        "//third_party/mkl:intel_binary_blob",
+        "@mkl_dnn//:mkl_dnn",
+    ]),
+)
+
 tf_mkl_kernel_library(
     name = "mkl_fused_batch_norm_op",
     srcs = ["mkl_fused_batch_norm_op.cc"],
@@ -5798,26 +5986,6 @@ tf_mkl_kernel_library(
     ],
 )
 
-cc_library(
-    name = "stats_aggregator",
-    hdrs = ["stats_aggregator.h"],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-    ],
-)
-
-tf_kernel_library(
-    name = "stats_aggregator_ops",
-    srcs = ["stats_aggregator_ops.cc"],
-    deps = [
-        ":stats_aggregator",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
 cc_library(
     name = "batch_util",
     srcs = ["batch_util.cc"],
@@ -5828,509 +5996,36 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "dataset",
-    srcs = ["dataset.cc"],
-    hdrs = ["dataset.h"],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/util/tensor_bundle",
-    ],
-)
-
-cc_library(
-    name = "dataset_utils",
-    srcs = ["dataset_utils.cc"],
-    hdrs = ["dataset_utils.h"],
-    deps = [
-        ":captured_function",
-        ":dataset",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 cc_library(
     name = "captured_function",
-    srcs = ["captured_function.cc"],
     hdrs = ["captured_function.h"],
     deps = [
-        ":dataset",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:proto_text",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:session_options",
-        "//tensorflow/core/kernels:variable_ops",
+        "//tensorflow/core/kernels/data:captured_function",
     ],
 )
 
 cc_library(
-    name = "window_dataset",
-    srcs = ["window_dataset.cc"],
-    hdrs = ["window_dataset.h"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "batch_dataset_op",
-    srcs = ["batch_dataset_op.cc"],
-    deps = [
-        ":batch_util",
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "padded_batch_dataset_op",
-    srcs = ["padded_batch_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "dense_to_sparse_batch_dataset_op",
-    srcs = ["dense_to_sparse_batch_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "group_by_window_dataset_op",
-    srcs = ["group_by_window_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset",
-        ":window_dataset",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "filter_dataset_op",
-    srcs = ["filter_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "map_dataset_op",
-    srcs = ["map_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "map_and_batch_dataset_op",
-    srcs = ["map_and_batch_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset",
-        ":inplace_ops",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "parallel_map_dataset_op",
-    srcs = ["parallel_map_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "scan_dataset_op",
-    srcs = ["scan_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "flat_map_dataset_op",
-    srcs = ["flat_map_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset",
-        ":dataset_utils",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "interleave_dataset_op",
-    srcs = ["interleave_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset",
-        ":dataset_utils",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "parallel_interleave_dataset_op",
-    srcs = ["parallel_interleave_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset",
-        ":dataset_utils",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "prefetch_dataset_op",
-    srcs = ["prefetch_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
-tf_kernel_library(
-    name = "repeat_dataset_op",
-    srcs = ["repeat_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "take_dataset_op",
-    srcs = ["take_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "skip_dataset_op",
-    srcs = ["skip_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "ignore_errors_dataset_op",
-    srcs = ["ignore_errors_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "stats_dataset_ops",
-    srcs = ["stats_dataset_ops.cc"],
-    deps = [
-        ":dataset",
-        ":stats_aggregator",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "range_dataset_op",
-    srcs = ["range_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "shuffle_dataset_op",
-    srcs = ["shuffle_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "sparse_tensor_slice_dataset_op",
-    srcs = ["sparse_tensor_slice_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "tensor_dataset_op",
-    srcs = ["tensor_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "tensor_slice_dataset_op",
-    srcs = ["tensor_slice_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "zip_dataset_op",
-    srcs = ["zip_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "concatenate_dataset_op",
-    srcs = ["concatenate_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "reader_dataset_ops",
-    srcs = ["reader_dataset_ops.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "sql_dataset_ops",
-    srcs = [
-        "sql/driver_manager.cc",
-        "sql/sqlite_query_connection.cc",
-        "sql_dataset_ops.cc",
-    ],
-    hdrs = [
-        "sql/driver_manager.h",
-        "sql/query_connection.h",
-        "sql/sqlite_query_connection.h",
-    ],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core/lib/db:sqlite",
-        "@sqlite_archive//:sqlite",
-    ],
-)
-
-tf_kernel_library(
-    name = "iterator_ops",
-    srcs = ["iterator_ops.cc"],
-    deps = [
-        ":dataset",
-        ":ops_util",
-        ":stats_aggregator",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
-tf_kernel_library(
-    name = "cache_dataset_ops",
-    srcs = ["cache_dataset_ops.cc"],
+    name = "dataset",
+    hdrs = ["dataset.h"],
     deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core/util/tensor_bundle",
+        "//tensorflow/core/kernels/data:dataset",
     ],
 )
 
 tf_kernel_library(
     name = "dataset_ops",
     deps = [
-        ":batch_dataset_op",
-        ":cache_dataset_ops",
-        ":concatenate_dataset_op",
-        ":dense_to_sparse_batch_dataset_op",
-        ":filter_dataset_op",
-        ":flat_map_dataset_op",
-        ":group_by_window_dataset_op",
-        ":ignore_errors_dataset_op",
-        ":interleave_dataset_op",
-        ":iterator_ops",
-        ":map_and_batch_dataset_op",
-        ":map_dataset_op",
-        ":padded_batch_dataset_op",
-        ":parallel_interleave_dataset_op",
-        ":parallel_map_dataset_op",
-        ":prefetch_dataset_op",
-        ":range_dataset_op",
-        ":reader_dataset_ops",
-        ":repeat_dataset_op",
-        ":scan_dataset_op",
-        ":shuffle_dataset_op",
-        ":skip_dataset_op",
-        ":sparse_tensor_slice_dataset_op",
-        ":sql_dataset_ops",
-        ":stats_aggregator_ops",
-        ":stats_dataset_ops",
-        ":take_dataset_op",
-        ":tensor_dataset_op",
-        ":tensor_slice_dataset_op",
-        ":zip_dataset_op",
+        "//tensorflow/core/kernels/data:dataset_ops",
     ],
 )
 
 cc_library(
     name = "summary_interface",
-    srcs = ["summary_interface.cc"],
     hdrs = ["summary_interface.h"],
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:ptr_util",
-    ],
-)
-
-tf_cc_test(
-    name = "summary_interface_test",
-    srcs = ["summary_interface_test.cc"],
-    deps = [
-        ":summary_interface",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
     ],
 )
 
@@ -6338,8 +6033,9 @@ tf_kernel_library(
     name = "summary_kernels",
     srcs = ["summary_kernels.cc"],
     deps = [
-        ":summary_interface",
+        "//tensorflow/contrib/tensorboard/db:schema",
         "//tensorflow/contrib/tensorboard/db:summary_db_writer",
+        "//tensorflow/contrib/tensorboard/db:summary_file_writer",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -6362,3 +6058,31 @@ filegroup(
     ),
     visibility = ["//tensorflow:__subpackages__"],
 )
+
+# Library to link with when compiling the cwise_op kernels directly,
+# e.g. for selective registration.
+# should not be linked by projects that also link the cwise_op library.
+cc_library(
+    name = "cwise_lib",
+    srcs = [
+        "cwise_ops_common.cc",
+        "meta_support.cc",
+        "quantization_utils.cc",
+    ],
+    hdrs = [
+        "cwise_ops.h",
+        "cwise_ops_common.h",
+        "cwise_ops_gpu_common.cu.h",
+        "cwise_ops_gpu_gradients.cu.h",
+        "cwise_ops_gradients.h",
+        "meta_support.h",
+        "quantization_utils.h",
+    ],
+    deps = [
+        ":bounds_check",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//third_party/eigen3",
+        "@gemmlowp",
+    ],
+)
diff --git a/tensorflow/core/kernels/adjust_contrast_op.cc b/tensorflow/core/kernels/adjust_contrast_op.cc
index 37976f71837cb365cd9d232c7c1e102ec5bfe338..72155fd037378fc3d93c02e9b893a6671e9659a6 100644
--- a/tensorflow/core/kernels/adjust_contrast_op.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op.cc
@@ -40,8 +40,8 @@ typedef Eigen::SyclDevice SYCLDevice;
 template <typename Device, typename T>
 class AdjustContrastOp : public OpKernel {
  public:
-  explicit AdjustContrastOp(OpKernelConstruction* context) : OpKernel(context) {
-  }
+  explicit AdjustContrastOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
     const Tensor& input = context->input(0);
diff --git a/tensorflow/core/kernels/adjust_contrast_op_test.cc b/tensorflow/core/kernels/adjust_contrast_op_test.cc
index 0fc03b5a236b2d63fc731f232acebdcbd1ca2532..7522b320400b034aa882efb82efab8d0419d8144 100644
--- a/tensorflow/core/kernels/adjust_contrast_op_test.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op_test.cc
@@ -29,8 +29,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-class AdjustContrastOpTest : public OpsTestBase {
-};
+class AdjustContrastOpTest : public OpsTestBase {};
 
 TEST_F(AdjustContrastOpTest, Simple_1113) {
   TF_EXPECT_OK(NodeDefBuilder("adjust_contrast_op", "AdjustContrastv2")
diff --git a/tensorflow/core/kernels/adjust_hsv_gpu.cu.h b/tensorflow/core/kernels/adjust_hsv_gpu.cu.h
index c160ce2c3349fbd08a1d512e35a424dc00919628..49df5ae296b3e2a213c436d0e4656757c49cb16e 100644
--- a/tensorflow/core/kernels/adjust_hsv_gpu.cu.h
+++ b/tensorflow/core/kernels/adjust_hsv_gpu.cu.h
@@ -11,8 +11,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_ADJUST_HSV_GPU_CU_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_ADJUST_HSV_GPU_CU_H_
+#ifndef TENSORFLOW_CORE_KERNELS_ADJUST_HSV_GPU_CU_H_
+#define TENSORFLOW_CORE_KERNELS_ADJUST_HSV_GPU_CU_H_
 
 #if GOOGLE_CUDA
 
@@ -143,4 +143,4 @@ __global__ void adjust_hsv_nhwc(const int64 number_elements,
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_ADJUST_HSV_GPU_CU_H_
+#endif  // TENSORFLOW_CORE_KERNELS_ADJUST_HSV_GPU_CU_H_
diff --git a/tensorflow/core/kernels/adjust_saturation_op.cc b/tensorflow/core/kernels/adjust_saturation_op.cc
index 4643d4e6efda2157458a557819873c8cb7546e1a..f0c6ae499d4c209ef1556890e87f63085de7ea75 100644
--- a/tensorflow/core/kernels/adjust_saturation_op.cc
+++ b/tensorflow/core/kernels/adjust_saturation_op.cc
@@ -192,8 +192,9 @@ class AdjustSaturationOp<CPUDevice> : public AdjustSaturationOpBase {
     const DeviceBase::CpuWorkerThreads& worker_threads =
         *context->device()->tensorflow_cpu_worker_threads();
     Shard(worker_threads.num_threads, worker_threads.workers, channel_count,
-          kCostPerChannel, [channel_count, &input_data, &output_data, scale_h](
-                               int64 start_channel, int64 end_channel) {
+          kCostPerChannel,
+          [channel_count, &input_data, &output_data, scale_h](
+              int64 start_channel, int64 end_channel) {
             const float* p = input_data.data() + start_channel * kChannelSize;
             float* q = output_data.data() + start_channel * kChannelSize;
             for (int i = start_channel; i < end_channel; i++) {
diff --git a/tensorflow/core/kernels/aggregate_ops_cpu.h b/tensorflow/core/kernels/aggregate_ops_cpu.h
index dfa3fe585e375ada0c5d3d0b3061d05d8a4efabd..aa1cead928aa25e9cf8d9c8d6d43091bf93583ee 100644
--- a/tensorflow/core/kernels/aggregate_ops_cpu.h
+++ b/tensorflow/core/kernels/aggregate_ops_cpu.h
@@ -25,7 +25,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace tensorflow {
 
@@ -201,7 +201,7 @@ struct Add7Functor<SYCLDevice, T> {
                   typename TTypes<T>::ConstFlat in6,
                   typename TTypes<T>::ConstFlat in7) {
     Add7EigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-                                         in7);
+                                          in7);
   }
 };
 
@@ -214,7 +214,7 @@ struct Add8Functor<SYCLDevice, T> {
       typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
       typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
     Add8EigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-                                         in7, in8);
+                                          in7, in8);
   }
 };
 
@@ -227,7 +227,7 @@ struct Add8pFunctor<SYCLDevice, T> {
       typename TTypes<T>::ConstFlat in5, typename TTypes<T>::ConstFlat in6,
       typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8) {
     Add8pEigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-                                          in7, in8);
+                                           in7, in8);
   }
 };
 
@@ -241,10 +241,10 @@ struct Add9Functor<SYCLDevice, T> {
       typename TTypes<T>::ConstFlat in7, typename TTypes<T>::ConstFlat in8,
       typename TTypes<T>::ConstFlat in9) {
     Add9EigenImpl<SYCLDevice, T>::Compute(d, out, in1, in2, in3, in4, in5, in6,
-                                         in7, in8, in9);
+                                          in7, in8, in9);
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 
diff --git a/tensorflow/core/kernels/assign_op.h b/tensorflow/core/kernels/assign_op.h
index 1d2e1c8c9aaeb8a722646916ea691fa4e5c23771..a312e8e8a420f7f909b20b28f84bf55597a58aba 100644
--- a/tensorflow/core/kernels/assign_op.h
+++ b/tensorflow/core/kernels/assign_op.h
@@ -109,6 +109,9 @@ class AssignOp : public OpKernel {
         OP_REQUIRES_OK(
             context, context->allocate_persistent(old_lhs.dtype(), rhs.shape(),
                                                   &copy, &copyTensor, attr));
+        // We track memory of variables in variable ops instead of in this
+        // assign op.
+        context->clear_recorded_memory();
         context->replace_ref_input(0, *copyTensor, /* lock_held */ true);
         if (use_exclusive_lock_) {
           Copy(context, copyTensor, rhs);
diff --git a/tensorflow/core/kernels/attention_ops.cc b/tensorflow/core/kernels/attention_ops.cc
index cc8f122cab357ed0c8243ba990b3b85dd7ddcb2f..ce2fce92e4ee8cbd7bdc578d92103a5bd5da0629 100644
--- a/tensorflow/core/kernels/attention_ops.cc
+++ b/tensorflow/core/kernels/attention_ops.cc
@@ -52,8 +52,9 @@ class ExtractGlimpseOp : public OpKernel {
     const int64 batch_size = input_shape.dim_size(0);
 
     const Tensor& window_size = context->input(1);
-    OP_REQUIRES(context, (window_size.shape().dims() == 1) &&
-                             window_size.shape().dim_size(0) == 2,
+    OP_REQUIRES(context,
+                (window_size.shape().dims() == 1) &&
+                    window_size.shape().dim_size(0) == 2,
                 errors::InvalidArgument(
                     "input must be a vector of size 2 (height, width)",
                     window_size.shape().DebugString()));
diff --git a/tensorflow/core/kernels/avgpooling_op.cc b/tensorflow/core/kernels/avgpooling_op.cc
index f9180236933d04d707eb1744de3993b9396b3dfa..ec9cbc2a9b5d4c1ac6d91913fc015e139fa2a068 100644
--- a/tensorflow/core/kernels/avgpooling_op.cc
+++ b/tensorflow/core/kernels/avgpooling_op.cc
@@ -56,7 +56,9 @@ class AvgPoolingOp : public UnaryOp<T> {
                 errors::InvalidArgument("Invalid data format"));
     OP_REQUIRES(
         context, data_format_ == FORMAT_NHWC,
-        errors::InvalidArgument("Default AvgPoolingOp only supports NHWC."));
+        errors::InvalidArgument("Default AvgPoolingOp only supports NHWC ",
+                                "on device type ",
+                                DeviceTypeString(context->device_type())));
     OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
     OP_REQUIRES(context, ksize_.size() == 4,
                 errors::InvalidArgument("Sliding window ksize field must "
@@ -211,9 +213,11 @@ class AvgPoolingGradOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
-    OP_REQUIRES(context, data_format_ == FORMAT_NHWC,
-                errors::InvalidArgument(
-                    "Default AvgPoolingGradOp only supports NHWC."));
+    OP_REQUIRES(
+        context, data_format_ == FORMAT_NHWC,
+        errors::InvalidArgument("Default AvgPoolingGradOp only supports NHWC ",
+                                "on device type ",
+                                DeviceTypeString(context->device_type())));
     OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
     OP_REQUIRES(context, ksize_.size() == 4,
                 errors::InvalidArgument("Sliding window ksize field must "
diff --git a/tensorflow/core/kernels/avgpooling_op.h b/tensorflow/core/kernels/avgpooling_op.h
index dea2683184a06308bff7ead2b772aab466b90b34..f5e81dbc0930888ab9258d5d5b5d52fdeb0afc01 100644
--- a/tensorflow/core/kernels/avgpooling_op.h
+++ b/tensorflow/core/kernels/avgpooling_op.h
@@ -48,9 +48,8 @@ struct SpatialAvgPooling {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-// Launch a custom GPU kernels from Yanqing for the avgpooling backward operation
-// that works NHWC data formats.
-// Arguments:
+// Launch a custom GPU kernels from Yanqing for the avgpooling backward
+// operation that works NHWC data formats. Arguments:
 //   top_diff: backprop to the output of the pooling layer
 //   num: number of input batches
 //   height: input height
diff --git a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
index 2be330d1427b28a01635cc1db5fd10096f2a8abe..6537b42f1ed8856a5f701023eb5fc55ded278ec8 100644
--- a/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/avgpooling_op_gpu.cu.cc
@@ -71,8 +71,8 @@ __global__ void AvePoolBackwardNHWC(const int nthreads,
         hstart = max(hstart, 0);
         wstart = max(wstart, 0);
         int pool_size = (hend - hstart) * (wend - wstart);
-        gradient +=
-            top_diff_slice[(ph * pooled_width + pw) * channels] / dtype(pool_size);
+        gradient += top_diff_slice[(ph * pooled_width + pw) * channels] /
+                    dtype(pool_size);
       }
     }
     bottom_diff[index] = gradient;
@@ -90,11 +90,11 @@ bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num,
                             const GPUDevice& d) {
   int x_size = num * height * width * channels;
   CudaLaunchConfig config = GetCudaLaunchConfig(x_size, d);
-  AvePoolBackwardNHWC<
-      T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-      config.virtual_thread_count, top_diff, num, height, width, channels,
-      pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w,
-      pad_t, pad_t, bottom_diff);
+  AvePoolBackwardNHWC<T>
+      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+          config.virtual_thread_count, top_diff, num, height, width, channels,
+          pooled_height, pooled_width, kernel_h, kernel_w, stride_h, stride_w,
+          pad_t, pad_t, bottom_diff);
 
   return d.ok();
 }
diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc
index d0bbea9fe27856cc0dedb4570d285bd872741099..944564dfba62f257ae45b3c5c25d0de64fa0b773 100644
--- a/tensorflow/core/kernels/barrier_ops.cc
+++ b/tensorflow/core/kernels/barrier_ops.cc
@@ -111,13 +111,14 @@ class Barrier : public ResourceBase {
       mutex_lock lock(mu_);
       if (closed_) {
         OP_REQUIRES_ASYNC(
-            ctx, !cancel_pending_enqueues_ &&
-                     (num_inserted == 0 || !incomplete_.empty()),
+            ctx,
+            !cancel_pending_enqueues_ &&
+                (num_inserted == 0 || !incomplete_.empty()),
             errors::Cancelled(
                 "Barrier ", name_, " is closed.  Pending enqueues cancelled: ",
-                cancel_pending_enqueues_, ".  Number of new insertions: ",
-                num_inserted, ".  Number of incomplete keys: ",
-                incomplete_.size(), "."),
+                cancel_pending_enqueues_,
+                ".  Number of new insertions: ", num_inserted,
+                ".  Number of incomplete keys: ", incomplete_.size(), "."),
             callback);
       }
 
@@ -128,9 +129,10 @@ class Barrier : public ResourceBase {
 
       for (int i = 0; i < num_inserted; ++i) {
         OP_REQUIRES_OK_ASYNC(
-            ctx, InsertOneLocked<T>(ctx, keys, values, element_shape,
-                                    component_index, i, &ready_tuples,
-                                    &new_elements),
+            ctx,
+            InsertOneLocked<T>(ctx, keys, values, element_shape,
+                               component_index, i, &ready_tuples,
+                               &new_elements),
             callback);
       }
 
@@ -317,8 +319,9 @@ class Barrier : public ResourceBase {
         return errors::Cancelled(
             "Barrier ", name_,
             " is closed, but attempted to insert a brand new key: ",
-            keys_vec(i), ".  Pending enqueues cancelled: ",
-            cancel_pending_enqueues_, ".  Insertion index: ", i,
+            keys_vec(i),
+            ".  Pending enqueues cancelled: ", cancel_pending_enqueues_,
+            ".  Insertion index: ", i,
             ".  Number of incomplete keys: ", incomplete_.size(), ".");
       }
     } else {
@@ -532,13 +535,14 @@ class InsertManyOp : public BarrierOpKernel {
     OP_REQUIRES_ASYNC(
         ctx, component_index_ < barrier->num_components(),
         errors::InvalidArgument("The component ID is out of range ",
-                                component_index_, " > num_components", " (= ",
-                                barrier->num_components(), ")"),
+                                component_index_, " > num_components",
+                                " (= ", barrier->num_components(), ")"),
         callback);
     OP_REQUIRES_OK_ASYNC(
-        ctx, ctx->MatchSignature({DT_STRING_REF, DT_STRING,
-                                  barrier->component_type(component_index_)},
-                                 {}),
+        ctx,
+        ctx->MatchSignature({DT_STRING_REF, DT_STRING,
+                             barrier->component_type(component_index_)},
+                            {}),
         callback);
 
     const Tensor* keys;
diff --git a/tensorflow/contrib/batching/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc
similarity index 98%
rename from tensorflow/contrib/batching/kernels/batch_kernels.cc
rename to tensorflow/core/kernels/batch_kernels.cc
index 6041d8c9b2ca14bd325d1e7ea562bc4bc27d6a51..546e51be53cee1833e8e1d4a15ea9b5be8a31506 100644
--- a/tensorflow/contrib/batching/kernels/batch_kernels.cc
+++ b/tensorflow/core/kernels/batch_kernels.cc
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/batching/shared_batch_scheduler.h"
-#include "tensorflow/contrib/batching/util/periodic_function.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_util.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/batching_util/periodic_function.h"
+#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/split_lib.h"
@@ -207,7 +207,7 @@ Status Split(OpKernelContext* context, const Tensor& input,
 class BatchResource : public ResourceBase {
  public:
   static Status Create(int32 num_batch_threads, int32 max_batch_size,
-                       int32 batch_timeout_micros,
+                       int32 batch_timeout_micros, int32 max_enqueued_batches,
                        const std::vector<int32>& allowed_batch_sizes,
                        std::unique_ptr<BatchResource>* resource) {
     std::unique_ptr<BatchResource> new_resource(new BatchResource);
@@ -218,6 +218,8 @@ class BatchResource : public ResourceBase {
         Batcher::Create(batcher_options, &new_resource->batcher_));
 
     new_resource->batcher_queue_options_.max_batch_size = max_batch_size;
+    new_resource->batcher_queue_options_.max_enqueued_batches =
+        max_enqueued_batches;
     new_resource->batcher_queue_options_.batch_timeout_micros =
         batch_timeout_micros;
 
@@ -513,6 +515,8 @@ class BatchKernel : public AsyncOpKernel {
     OP_REQUIRES_OK(c, c->GetAttr("max_batch_size", &max_batch_size_));
     OP_REQUIRES_OK(c,
                    c->GetAttr("batch_timeout_micros", &batch_timeout_micros_));
+    OP_REQUIRES_OK(c,
+                   c->GetAttr("max_enqueued_batches", &max_enqueued_batches_));
     OP_REQUIRES_OK(c, c->GetAttr("allowed_batch_sizes", &allowed_batch_sizes_));
     OP_REQUIRES_OK(c, ValidateAllowedBatchSizes());
   }
@@ -524,7 +528,7 @@ class BatchKernel : public AsyncOpKernel {
           std::unique_ptr<BatchResource> new_resource;
           TF_RETURN_IF_ERROR(BatchResource::Create(
               num_batch_threads_, max_batch_size_, batch_timeout_micros_,
-              allowed_batch_sizes_, &new_resource));
+              max_enqueued_batches_, allowed_batch_sizes_, &new_resource));
           *r = new_resource.release();
           return Status::OK();
         };
@@ -570,6 +574,7 @@ class BatchKernel : public AsyncOpKernel {
   int32 num_batch_threads_;
   int32 max_batch_size_;
   int32 batch_timeout_micros_;
+  int32 max_enqueued_batches_;
   std::vector<int32> allowed_batch_sizes_;
 };
 
diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h
index 93c391831982c529fb8e270f6eb0cac8063bffbf..43e716c542ac42835baabde057e45534d5442010 100644
--- a/tensorflow/core/kernels/batch_matmul_op_impl.h
+++ b/tensorflow/core/kernels/batch_matmul_op_impl.h
@@ -41,7 +41,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace {
 
@@ -429,14 +429,13 @@ template <typename Scalar>
 struct LaunchBatchMatMul<SYCLDevice, Scalar> {
   static void Launch(OpKernelContext* context, const Tensor& in_x,
                      const Tensor& in_y, bool adj_x, bool adj_y, Tensor* out) {
-
-  // Number of matrix multiplies i.e. size of the batch.
-  const int64 batch_size = in_x.dim_size(0);
-  ParallelMatMulKernelSYCL<Scalar>::Run(context, in_x, in_y, adj_x, adj_y, out,
-                                        0, batch_size);
+    // Number of matrix multiplies i.e. size of the batch.
+    const int64 batch_size = in_x.dim_size(0);
+    ParallelMatMulKernelSYCL<Scalar>::Run(context, in_x, in_y, adj_x, adj_y,
+                                          out, 0, batch_size);
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename Scalar>
 class BatchMatMul : public OpKernel {
@@ -462,10 +461,10 @@ class BatchMatMul : public OpKernel {
     TensorShape out_shape;
     for (int i = 0; i < ndims - 2; ++i) {
       OP_REQUIRES(ctx, in0.dim_size(i) == in1.dim_size(i),
-                  errors::InvalidArgument("In[0].dim(", i, ") and In[1].dim(",
-                                          i, ") must be the same: ",
-                                          in0.shape().DebugString(), " vs ",
-                                          in1.shape().DebugString()));
+                  errors::InvalidArgument(
+                      "In[0].dim(", i, ") and In[1].dim(", i,
+                      ") must be the same: ", in0.shape().DebugString(), " vs ",
+                      in1.shape().DebugString()));
       out_shape.AddDim(in0.dim_size(i));
     }
     auto n = (ndims == 2) ? 1 : out_shape.num_elements();
@@ -507,12 +506,12 @@ class BatchMatMul : public OpKernel {
   bool adj_y_;
 };
 
-#define REGISTER_BATCH_MATMUL_CPU(TYPE)                                              \
+#define REGISTER_BATCH_MATMUL_CPU(TYPE)                                 \
   REGISTER_KERNEL_BUILDER(                                              \
       Name("BatchMatMul").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
       BatchMatMul<CPUDevice, TYPE>)
 
-#define REGISTER_BATCH_MATMUL_GPU(TYPE)                                              \
+#define REGISTER_BATCH_MATMUL_GPU(TYPE)                                 \
   REGISTER_KERNEL_BUILDER(                                              \
       Name("BatchMatMul").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
       BatchMatMul<GPUDevice, TYPE>)
@@ -522,5 +521,5 @@ class BatchMatMul : public OpKernel {
   REGISTER_KERNEL_BUILDER(                                               \
       Name("BatchMatMul").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
       BatchMatMul<SYCLDevice, TYPE>)
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/batch_matmul_op_real.cc b/tensorflow/core/kernels/batch_matmul_op_real.cc
index 8d155ca62b297a4bf59f62159d6b62b01f777721..7e1e2aa4ec135872993f2e7738c7e863416eee87 100644
--- a/tensorflow/core/kernels/batch_matmul_op_real.cc
+++ b/tensorflow/core/kernels/batch_matmul_op_real.cc
@@ -35,5 +35,5 @@ TF_CALL_half(REGISTER_BATCH_MATMUL_GPU);
 #ifdef TENSORFLOW_USE_SYCL
 TF_CALL_float(REGISTER_BATCH_MATMUL_SYCL);
 TF_CALL_double(REGISTER_BATCH_MATMUL_SYCL);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/batch_matmul_op_test.cc b/tensorflow/core/kernels/batch_matmul_op_test.cc
index 7923f34155b57cb79894936cb4ea0f485f92d99b..c3932cd7b9023482316807c73bfd52da3a4a3f7a 100644
--- a/tensorflow/core/kernels/batch_matmul_op_test.cc
+++ b/tensorflow/core/kernels/batch_matmul_op_test.cc
@@ -53,9 +53,10 @@ static Graph* BatchMatmul(int b, int m, int k, int n, bool adjoint_a,
 /* Uncomment to enable benchmarks for double & complex types: */
 // BM_BatchMatmulDev(B, M, K, N, TA, TB, std::complex<float>, DT_COMPLEX64,
 // gpu);
-// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, cpu);                    \
-// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, cpu);  \
-// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, gpu);                    \
+// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, cpu); \
+// BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, cpu);
+// \
+// BM_BatchMatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, gpu); \
 // BM_BatchMatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, gpu);
 
 // Typical fully connected layers
diff --git a/tensorflow/core/kernels/batch_norm_op.cc b/tensorflow/core/kernels/batch_norm_op.cc
index d3ed617f713094cb94c1a87dc0c36c3d44d97918..c34ea14bf6007f6951733990c0a01999ac838b75 100644
--- a/tensorflow/core/kernels/batch_norm_op.cc
+++ b/tensorflow/core/kernels/batch_norm_op.cc
@@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 class BatchNormOp : public OpKernel {
diff --git a/tensorflow/core/kernels/batch_norm_op_test.cc b/tensorflow/core/kernels/batch_norm_op_test.cc
index 5e3fcd2114a12709fb306ebadfd21a56b514e0c0..45ddc8532955578b5fca7ea372703f88b6b84f77 100644
--- a/tensorflow/core/kernels/batch_norm_op_test.cc
+++ b/tensorflow/core/kernels/batch_norm_op_test.cc
@@ -54,7 +54,7 @@ TEST_F(BatchNormOpTest, Simple) {
   Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
   test::FillValues<float>(
       &expected, {-17.86f, -22.00f, -15.87f, -20.59f, -13.87f, -19.18f, -21.86f,
-                  -33.31f, -23.85f, -34.72f, -25.85f, -36.13f });
+                  -33.31f, -23.85f, -34.72f, -25.85f, -36.13f});
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.01);
 }
 
diff --git a/tensorflow/core/kernels/batch_util.cc b/tensorflow/core/kernels/batch_util.cc
index 298e15657961be9b899437373fb5baad28e5c73b..1a45212ad29a7b8a578ce176db20eaf3d2193afd 100644
--- a/tensorflow/core/kernels/batch_util.cc
+++ b/tensorflow/core/kernels/batch_util.cc
@@ -19,12 +19,28 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
 
+#define TF_CALL_DATASET_TYPES(m) TF_CALL_ALL_TYPES(m) TF_CALL_QUANTIZED_TYPES(m)
+
 namespace tensorflow {
 namespace batch_util {
 
 namespace {
 
-// Copies element into the index^th slice of parent (in the 0th dimension).
+Status ValidateInput(const Tensor& parent, const Tensor& element, int64 index) {
+  DCHECK_NE(parent.dim_size(0), 0);
+  DCHECK_GE(index, 0);
+  if (element.NumElements() != (parent.NumElements() / parent.dim_size(0))) {
+    TensorShape chip_shape = parent.shape();
+    chip_shape.RemoveDim(0);
+    return errors::Internal(
+        "ValidateInput Cannot perform copy: number of elements does not match. "
+        " Shapes are: [element]: ",
+        element.shape().DebugString(),
+        ", [parent slice]: ", chip_shape.DebugString());
+  }
+  return Status::OK();
+}
+
 template <typename T>
 Status HandleElementToSlice(Tensor element, Tensor* parent, int64 index,
                             bool /* can_move */) {
@@ -47,18 +63,37 @@ Status HandleElementToSlice<string>(Tensor element, Tensor* parent, int64 index,
   return Status::OK();
 }
 
+template <>
+Status HandleElementToSlice<Variant>(Tensor element, Tensor* parent,
+                                     int64 index, bool can_move) {
+  auto parent_as_matrix = parent->flat_outer_dims<Variant>();
+  auto element_flat = element.flat<Variant>();
+  if (can_move) {
+    for (int64 i = 0; i < element.NumElements(); ++i) {
+      parent_as_matrix(index, i) = std::move(element_flat(i));
+    }
+  } else {
+    parent_as_matrix.chip(index, 0) = element_flat;
+  }
+  return Status::OK();
+}
+
+// TODO(jsimsa): Add HandleElementToSlice<variant> specialization that moves
+// the data when possible.
+
+template <typename T>
+static Status HandleSliceToElement(const Tensor& parent, Tensor* element,
+                                   int64 index) {
+  element->flat<T>() = parent.flat_outer_dims<T>().chip(index, 0);
+  return Status::OK();
+}
+
 }  // namespace
 
+// Copies element into the index^th slice of parent (in the 0th dimension).
 Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index) {
-  if (element.NumElements() != (parent->NumElements() / parent->dim_size(0))) {
-    TensorShape chip_shape = parent->shape();
-    chip_shape.RemoveDim(0);
-    return errors::InvalidArgument(
-        "HandleElementToSlice Cannot copy slice: number of elements does "
-        "not match. Shapes are: [element]: ",
-        element.shape().DebugString(),
-        ", [parent slice]: ", chip_shape.DebugString());
-  }
+  TF_RETURN_IF_ERROR(ValidateInput(*parent, element, index));
+
   bool can_move = element.RefCountIsOne();
 #define HANDLE_TYPE(T)                                                \
   case DataTypeToEnum<T>::value: {                                    \
@@ -76,5 +111,120 @@ Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index) {
   }
 }
 
+// Copies the index^th slice of parent (in the 0th dimension) into element.
+Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index) {
+  TF_RETURN_IF_ERROR(ValidateInput(parent, *element, index));
+
+#define HANDLE_TYPE(T)                                      \
+  case DataTypeToEnum<T>::value: {                          \
+    return HandleSliceToElement<T>(parent, element, index); \
+  }
+
+  switch (parent.dtype()) {
+    TF_CALL_ALL_TYPES(HANDLE_TYPE);
+    TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
+#undef HANDLE_TYPE
+    default:
+      return errors::Unimplemented("CopySliceToElement Unhandled data type: ",
+                                   element->dtype());
+  }
+}
+
+// The following five functions are copied from padding_fifo_queue.cc.
+// TODO(mrry): Reconcile these functions with the similar methods in the
+// queue implementation.
+Status ValidateElementToLargerSlice(const Tensor& element, Tensor* parent) {
+  DCHECK_NE(parent->dim_size(0), 0);
+  if (element.NumElements() > (parent->NumElements() / parent->dim_size(0))) {
+    TensorShape chip_shape = parent->shape();
+    chip_shape.RemoveDim(0);
+    return errors::Internal(
+        "HandleElementToLargerSlice Cannot copy slice: number of entries in "
+        "element is greater than number of elements in parent slice.  ",
+        "Shapes are: [element]: ", element.shape().DebugString(),
+        ", [parent slice]: ", chip_shape.DebugString());
+  }
+  return Status::OK();
+}
+
+template <typename T, int NDIMS>
+Status HandleElementToLargerSlice(const Tensor& element, Tensor* parent,
+                                  int index) {
+  TF_RETURN_IF_ERROR(ValidateElementToLargerSlice(element, parent));
+  if (element.NumElements() == 0) {
+    return Status::OK();
+  }
+  auto element_t = element.tensor<T, NDIMS>();
+  auto parent_t = parent->tensor<T, NDIMS + 1>();
+  Eigen::DSizes<Eigen::DenseIndex, NDIMS + 1> slice_indices;
+  slice_indices[0] = index;
+  Eigen::DSizes<Eigen::DenseIndex, NDIMS + 1> slice_size;
+  slice_size[0] = 1;
+  for (size_t i = 1; i < slice_size.size(); ++i) {
+    slice_size[i] = element_t.dimension(i - 1);
+  }
+  parent_t.slice(slice_indices, slice_size) = element_t.reshape(slice_size);
+  return Status::OK();
+}
+
+template <int NDIMS>
+Status HandleElementToLargerSliceWithRank(const Tensor& element, Tensor* parent,
+                                          int index) {
+#define HANDLE_TYPE(T)                                                   \
+  case DataTypeToEnum<T>::value: {                                       \
+    return HandleElementToLargerSlice<T, NDIMS>(element, parent, index); \
+  }
+
+  switch (element.dtype()) {
+    TF_CALL_DATASET_TYPES(HANDLE_TYPE);
+#undef HANDLE_TYPE
+    default:
+      return errors::Unimplemented(
+          "HandleElementToLargerSliceWithRank Unhandled data type: ",
+          element.dtype());
+  }
+}
+
+Status CopyElementToLargerSlice(const Tensor& element, Tensor* parent,
+                                int index) {
+  if (parent->dims() != element.dims() + 1) {
+    return errors::Internal(
+        "Mismatched ranks.  Element's rank is: ", element.dims(),
+        " but element is meant to be a slice in output Tensor having rank: ",
+        parent->dims(), " (should be: ", element.dims() + 1, ")");
+  }
+
+#define HANDLE_DIMS(NDIMS)                                                  \
+  case NDIMS: {                                                             \
+    TF_RETURN_IF_ERROR(                                                     \
+        HandleElementToLargerSliceWithRank<NDIMS>(element, parent, index)); \
+    return Status::OK();                                                    \
+  }
+
+  switch (element.dims()) {
+    HANDLE_DIMS(0);
+    HANDLE_DIMS(1);
+    HANDLE_DIMS(2);
+    HANDLE_DIMS(3);
+    HANDLE_DIMS(4);
+#undef HANDLE_DIMS
+    default:
+      return errors::Unimplemented("CopyElementToLargerSlice Unhandled rank: ",
+                                   element.dims());
+  }
+}
+
+Status SetElementZero(Tensor* element, const Tensor& padding) {
+#define HANDLE_TYPE(T)                                     \
+  if (element->dtype() == DataTypeToEnum<T>::value) {      \
+    element->flat<T>().setConstant(padding.scalar<T>()()); \
+    return Status::OK();                                   \
+  }
+  TF_CALL_DATASET_TYPES(HANDLE_TYPE);
+#undef HANDLE_TYPE
+  return errors::Unimplemented("SetElementZero Unhandled data type: ",
+                               element->dtype());
+}
+
 }  // namespace batch_util
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/batch_util.h b/tensorflow/core/kernels/batch_util.h
index 065011a699a5264ae06cdec3a42fde19be46e884..a47bf1935db611417cea1d98ed8aff496efbf689 100644
--- a/tensorflow/core/kernels/batch_util.h
+++ b/tensorflow/core/kernels/batch_util.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_
+#ifndef TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_
+#define TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_
 
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -29,7 +29,20 @@ namespace batch_util {
 // for DT_STRING tensors.
 Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index);
 
+// Copies the index^th slice of parent (in the 0th dimension) into element.
+Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index);
+
+// Zero-initializes the tensor `element` using the scalar stored in `padding`.
+// Both `element` and `padding` must have matching `dtype`.
+Status SetElementZero(Tensor* element, const Tensor& padding);
+
+// Copies `element` into a (0th dimension) slice of `parent`, assuming
+// the shape of `element` is strictly not larger along any axis than a
+// slice.
+Status CopyElementToLargerSlice(const Tensor& element, Tensor* parent,
+                                int index);
+
 }  // namespace batch_util
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_
+#endif  // TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_
diff --git a/tensorflow/core/kernels/batching_util/BUILD b/tensorflow/core/kernels/batching_util/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..4397410a5cee839a70bde69f34ca72e31530565f
--- /dev/null
+++ b/tensorflow/core/kernels/batching_util/BUILD
@@ -0,0 +1,186 @@
+# Description: Utilities.
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+            "**/google_*",
+        ],
+    ),
+)
+
+cc_library(
+    name = "periodic_function_dynamic",
+    srcs = ["periodic_function.cc"],
+    hdrs = ["periodic_function.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "periodic_function",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":periodic_function_dynamic",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "periodic_function_test",
+    srcs = ["periodic_function_test.cc"],
+    deps = [
+        ":fake_clock_env",
+        ":periodic_function_dynamic",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "batch_scheduler_hdrs",
+    hdrs = ["batch_scheduler.h"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+    ],
+)
+
+cc_library(
+    name = "batch_scheduler",
+    hdrs = ["batch_scheduler.h"],
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "batch_scheduler_test",
+    srcs = ["batch_scheduler_test.cc"],
+    deps = [
+        ":batch_scheduler",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "shared_batch_scheduler_hdrs",
+    hdrs = ["shared_batch_scheduler.h"],
+    deps = [
+        ":batch_scheduler_hdrs",
+        ":periodic_function_dynamic",
+        "//tensorflow/core:framework_headers_lib",
+    ],
+)
+
+cc_library(
+    name = "shared_batch_scheduler",
+    hdrs = ["shared_batch_scheduler.h"],
+    deps = [
+        ":batch_scheduler",
+        ":periodic_function_dynamic",
+        "//tensorflow/core:lib",
+    ],
+    alwayslink = 1,
+)
+
+tf_cc_test(
+    name = "shared_batch_scheduler_test",
+    srcs = ["shared_batch_scheduler_test.cc"],
+    deps = [
+        ":fake_clock_env",
+        ":shared_batch_scheduler",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "adaptive_shared_batch_scheduler",
+    hdrs = ["adaptive_shared_batch_scheduler.h"],
+    deps = [
+        ":batch_scheduler",
+        ":periodic_function_dynamic",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "adaptive_shared_batch_scheduler_test",
+    srcs = ["adaptive_shared_batch_scheduler_test.cc"],
+    tags = [
+        "local",
+        "manual",
+    ],
+    deps = [
+        ":adaptive_shared_batch_scheduler",
+        ":fake_clock_env",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "basic_batch_scheduler",
+    hdrs = ["basic_batch_scheduler.h"],
+    deps = [
+        ":shared_batch_scheduler",
+    ],
+)
+
+tf_cc_test(
+    name = "basic_batch_scheduler_test",
+    srcs = ["basic_batch_scheduler_test.cc"],
+    deps = [
+        ":basic_batch_scheduler",
+        ":batch_scheduler",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+tf_cc_test(
+    name = "basic_batch_scheduler_benchmark",
+    srcs = ["basic_batch_scheduler_benchmark_test.cc"],
+    tags = [
+        "local",
+        "manual",
+    ],
+    deps = [
+        ":basic_batch_scheduler",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core:test",
+    ],
+)
+
+cc_library(
+    name = "fake_clock_env",
+    testonly = 1,
+    srcs = ["fake_clock_env.cc"],
+    hdrs = ["fake_clock_env.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:tensorflow",
+    ],
+)
diff --git a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
new file mode 100644
index 0000000000000000000000000000000000000000..25c5f9cf424fdb286922548ea7ab0a35157a3502
--- /dev/null
+++ b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
@@ -0,0 +1,659 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
+#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
+
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <queue>
+#include <random>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/kernels/batching_util/batch_scheduler.h"
+#include "tensorflow/core/kernels/batching_util/periodic_function.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace serving {
+namespace internal {
+template <typename TaskType>
+class ASBSBatch;
+
+template <typename TaskType>
+class ASBSQueue;
+}  // namespace internal
+
+// EXPERIMENTAL: API MAY BE SUBJECTED TO SUDDEN CHANGES.
+//
+// Shared batch scheduler designed to minimize latency. The scheduler keeps
+// track of a number of queues (one per model or model version) which are
+// continuously enqueuing requests. The scheduler groups the requests into
+// batches which it periodically sends off for processing (see
+// shared_batch_scheduler.h for more details). The AdaptiveSharedBatchScheduler
+// prioritizes batches by age (i.e. the batch's oldest request) irrespective of
+// queue or batch size.
+//
+// The scheduling decision currently exists in two flavors, controlled by the
+// option use_in_flight_batches_implementation. It is expected that setting this
+// option to true will give universally better results; after a period of
+// testing to confirm, the old implementation will be removed.
+//
+// If use_in_flight_batches_implementation is set to true, the scheduler
+// limits the number of batches which can be processed concurrently.  If a new
+// batch is created, and the number of in flight batches is below the limit,
+// the next (i.e. oldest) batch is immediately scheduled.  Similarly, when a
+// batch finishes processing, the limit is rechecked, and another batch may be
+// scheduled.  To avoid the need to carefully tune the limit for workload,
+// model type, platform, etc, it is dynamically adjusted in order to provide the
+// lowest latency.
+//
+// If use_in_flight_batches_implementation is set to false, the scheduler will
+// process the oldest batch at an adjustable rate, regardless of batch size.
+// The user can provide feedback to help set this rate to achieve some goal
+// (i.e. minimize overall latency, limit cpu usage, etc). The rate (or rather,
+// the corresponding period) is adjusted each time a batch is processed, using
+// an exponentially weighted moving average to smooth noisy feedback:
+// ewma_feedback = ((N - 1) * ewma_feedback + feedback()) / N
+// period *= (1 + K * emwa_feedback)
+//
+// Some potential use cases:
+// Hardware Accelerators (GPUs & TPUs) - If some phase of batch processing
+//   involves serial processing by a device, from a latency perspective it is
+//   desirable to keep the device evenly loaded, avoiding the need to wait for
+//   the device to process prior batches.
+//   feedback = num_pending_on_device() - desired_pending.
+// CPU utilization - If the batch processing is cpu dominated, you can reap
+//   latency gains when underutilized by increasing the processing rate, but
+//   back the rate off when the load increases to avoid overload.
+//   feedback = cpu_rate() - desired_cpu_rate.
+
+template <typename TaskType>
+class AdaptiveSharedBatchScheduler
+    : public std::enable_shared_from_this<
+          AdaptiveSharedBatchScheduler<TaskType>> {
+ public:
+  ~AdaptiveSharedBatchScheduler() {
+    // Finish processing batches before destorying other class members.
+    batch_thread_pool_.reset();
+  }
+
+  struct Options {
+    // The name to use for the pool of batch threads.
+    string thread_pool_name = {"batch_threads"};
+    // Number of batch processing threads; equivalently the maximum number of
+    // concurrently running batches.
+    int64 num_batch_threads = port::NumSchedulableCPUs();
+    // The environment to use (typically only overridden by test code).
+    Env* env = Env::Default();
+    // Which implementation to use (described in class comments above).
+    bool use_in_flight_batches_implementation = false;
+    // Initial limit for number of batches being concurrently processed.
+    // Non-integer values correspond to probabilistic limits - i.e. a value of
+    // 3.2 results in an actual cap of 3 80% of the time, and 4 20% of the time.
+    double initial_in_flight_batches_limit = 3;
+    // Number of batches between adjustments of in_flight_batches_limit.  Larger
+    // numbers will give less noisy latency measurements, but will be less
+    // responsive to changes in workload.
+    int64 batches_to_average_over = 1000;
+
+    // TODO(kte): remove the rate based implementation and corresponding options
+    // below once testing confirms the superiority of the in flight batches
+    // implementation.
+    // Initial batch scheduling period in microseconds. Will be altered for
+    // non-zero rate_feedback.
+    double initial_scheduling_period_micros = 500;
+    // Minimum batch scheduling period in microseconds. Recommend setting this
+    // value greater than 0, otherwise it may take a while to recover from a
+    // sustained time of negative scheduling_period_feedback (which may occur
+    // under low load).
+    double min_scheduling_period_micros = 100;
+    // Maximum batch scheduling period in microseconds.
+    double max_scheduling_period_micros = 10000;
+    // Feedback function used to modify the scheduling period each time a batch
+    // is scheduled.  Should return values roughly O(1), with positive values
+    // resulting in an increased period.
+    std::function<double()> scheduling_period_feedback{[] { return 0.; }};
+    // To handle potentially noisy scheduling_period_feedback, the period is
+    // adjusted using an exponentially weighted moving average over the previous
+    // feedback_smoothing_batches batches.  Must be greater than 0.
+    int64 feedback_smoothing_batches = 10;
+  };
+
+  // Ownership is shared between the caller of Create() and any queues created
+  // via AddQueue().
+  static Status Create(
+      const Options& options,
+      std::shared_ptr<AdaptiveSharedBatchScheduler<TaskType>>* scheduler);
+
+  struct QueueOptions {
+    // Maximum size of each batch.
+    int max_batch_size = 1000;
+    // Maximum number of enqueued (i.e. non-scheduled) batches.
+    int max_enqueued_batches = 10;
+  };
+
+  using BatchProcessor = std::function<void(std::unique_ptr<Batch<TaskType>>)>;
+
+  // Adds queue (and its callback) to be managed by this scheduler.
+  Status AddQueue(const QueueOptions& options,
+                  BatchProcessor process_batch_callback,
+                  std::unique_ptr<BatchScheduler<TaskType>>* queue);
+
+  double in_flight_batches_limit() {
+    mutex_lock l(mu_);
+    return in_flight_batches_limit_;
+  }
+
+ private:
+  // access to AddBatch, RemoveQueue, GetEnv.
+  friend class internal::ASBSQueue<TaskType>;
+
+  explicit AdaptiveSharedBatchScheduler(const Options& options);
+
+  // Batch scheduling function which runs every scheduling_period_ microseconds.
+  // Only used when options_.use_in_flight_batches_implementation == false.
+  void ProcessOneBatch();
+
+  // Tracks processing latency and adjusts in_flight_batches_limit to minimize.
+  // Only used when options_.use_in_flight_batches_implementation == true.
+  void CallbackWrapper(const internal::ASBSBatch<TaskType>* batch,
+                       BatchProcessor callback);
+
+  // Schedules batch if in_flight_batches_limit_ is not met.
+  // Only used when options_.use_in_flight_batches_implementation == true.
+  void MaybeScheduleNextBatch() EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // Notifies scheduler of non-empty batch which is eligible for processing.
+  void AddBatch(const internal::ASBSBatch<TaskType>* batch);
+
+  // Removes queue from scheduler.
+  void RemoveQueue(const internal::ASBSQueue<TaskType>* queue);
+
+  Env* GetEnv() const { return options_.env; }
+
+  const Options options_;
+
+  struct BatchCompare {
+    bool operator()(const internal::ASBSBatch<TaskType>* a,
+                    const internal::ASBSBatch<TaskType>* b);
+  };
+
+  // Collection of batches added by AddBatch, ordered by age. Owned by scheduler
+  // until they are released for processing.
+  std::priority_queue<const internal::ASBSBatch<TaskType>*,
+                      std::vector<const internal::ASBSBatch<TaskType>*>,
+                      BatchCompare>
+      batches_ GUARDED_BY(mu_);
+
+  // Unowned queues and callbacks added by AddQueue.
+  std::unordered_map<const internal::ASBSQueue<TaskType>*, BatchProcessor>
+      queues_and_callbacks_ GUARDED_BY(mu_);
+
+  mutex mu_;
+
+  // Responsible for running ProcessOneBatch. PeriodicFunction was used in order
+  // to check for deletion so that the thread can be shut down.
+  // Only used when options_.use_in_flight_batches_implementation == false.
+  std::unique_ptr<PeriodicFunction> scheduling_thread_;
+
+  // Responsible for running the batch processing callbacks.
+  std::unique_ptr<thread::ThreadPool> batch_thread_pool_;
+
+  // Time interval in microseconds between successive ProcessOneBatch calls.
+  // Only used when options_.use_in_flight_batches_implementation == false.
+  double scheduling_period_;
+
+  // Exponentially weighted moving average of
+  // options_.scheduling_period_feedback() evaluated in each ProcessOneBatch
+  // call.
+  // Only used when options_.use_in_flight_batches_implementation == false.
+  double ewma_feedback_ = 0;
+
+  // Limit on number of batches which can be concurrently processed.
+  // Non-integer values correspond to probabilistic limits - i.e. a value of 3.2
+  // results in an actual cap of 3 80% of the time, and 4 20% of the time.
+  // Only used when options_.use_in_flight_batches_implementation == true.
+  double in_flight_batches_limit_ GUARDED_BY(mu_);
+
+  // Number of batches currently being processed.
+  // Only used when options_.use_in_flight_batches_implementation == true.
+  int64 in_flight_batches_ GUARDED_BY(mu_) = 0;
+
+  // RNG engine and distribution.
+  // Only used when options_.use_in_flight_batches_implementation == true.
+  std::default_random_engine rand_engine_;
+  std::uniform_real_distribution<double> rand_double_;
+
+  // Fields controlling the dynamic adjustment of in_flight_batches_limit_.
+  // Only used when options_.use_in_flight_batches_implementation == true.
+  // Number of batches since the last in_flight_batches_limit_ adjustment.
+  int64 batch_count_ GUARDED_BY(mu_) = 0;
+  // Sum of processing latency for batches counted by batch_count_.
+  int64 batch_latency_sum_ GUARDED_BY(mu_) = 0;
+  // Average batch latency for previous value of in_flight_batches_limit_.
+  double last_avg_latency_ms_ GUARDED_BY(mu_) = 0;
+  // Did last_avg_latency_ms_ decrease from the previous last_avg_latency_ms_?
+  bool last_latency_decreased_ GUARDED_BY(mu_) = false;
+  // Current direction (+-) to adjust in_flight_batches_limit_
+  int step_direction_ GUARDED_BY(mu_) = 1;
+  // Max adjustment size (as a fraction of in_flight_batches_limit_).
+  constexpr static double kMaxStepSizeMultiplier = 0.125;  // 1/8;
+  // Min adjustment size (as a fraction of in_flight_batches_limit_).
+  constexpr static double kMinStepSizeMultiplier = 0.0078125;  // 1/128
+  // Current adjustment size (as a fraction of in_flight_batches_limit_).
+  double step_size_multiplier_ GUARDED_BY(mu_) = kMaxStepSizeMultiplier;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(AdaptiveSharedBatchScheduler);
+};
+
+//////////////////////////////////////////////////////////
+// Implementation details follow. API users need not read.
+
+namespace internal {
+// Consolidates tasks into batches, passing them off to the
+// AdaptiveSharedBatchScheduler for processing.
+template <typename TaskType>
+class ASBSQueue : public BatchScheduler<TaskType> {
+ public:
+  using QueueOptions =
+      typename AdaptiveSharedBatchScheduler<TaskType>::QueueOptions;
+
+  ASBSQueue(std::shared_ptr<AdaptiveSharedBatchScheduler<TaskType>> scheduler,
+            const QueueOptions& options);
+
+  ~ASBSQueue() override;
+
+  // Adds task to current batch. Fails if the task size is larger than the batch
+  // size or if the current batch is full and this queue's number of outstanding
+  // batches is at its maximum.
+  Status Schedule(std::unique_ptr<TaskType>* task) override;
+
+  // Number of tasks waiting to be scheduled.
+  size_t NumEnqueuedTasks() const override;
+
+  // Number of size 1 tasks which could currently be scheduled without failing.
+  size_t SchedulingCapacity() const override;
+
+  // Notifies queue that a batch is about to be scheduled; the queue should not
+  // place any more tasks in this batch.
+  void ReleaseBatch(const ASBSBatch<TaskType>* batch);
+
+  size_t max_task_size() const override { return options_.max_batch_size; }
+
+ private:
+  std::shared_ptr<AdaptiveSharedBatchScheduler<TaskType>> scheduler_;
+  const QueueOptions options_;
+  // Owned by scheduler_.
+  ASBSBatch<TaskType>* current_batch_ GUARDED_BY(mu_) = nullptr;
+  int64 num_enqueued_batches_ GUARDED_BY(mu_) = 0;
+  int64 num_enqueued_tasks_ GUARDED_BY(mu_) = 0;
+  mutable mutex mu_;
+  TF_DISALLOW_COPY_AND_ASSIGN(ASBSQueue);
+};
+
+// Batch which remembers when and by whom it was created.
+template <typename TaskType>
+class ASBSBatch : public Batch<TaskType> {
+ public:
+  ASBSBatch(ASBSQueue<TaskType>* queue, int64 creation_time_micros)
+      : queue_(queue), creation_time_micros_(creation_time_micros) {}
+
+  ~ASBSBatch() override {}
+
+  ASBSQueue<TaskType>* queue() const { return queue_; }
+
+  int64 creation_time_micros() const { return creation_time_micros_; }
+
+ private:
+  ASBSQueue<TaskType>* queue_;
+  const int64 creation_time_micros_;
+  TF_DISALLOW_COPY_AND_ASSIGN(ASBSBatch);
+};
+}  // namespace internal
+
+// ---------------- AdaptiveSharedBatchScheduler ----------------
+
+template <typename TaskType>
+constexpr double AdaptiveSharedBatchScheduler<TaskType>::kMaxStepSizeMultiplier;
+
+template <typename TaskType>
+constexpr double AdaptiveSharedBatchScheduler<TaskType>::kMinStepSizeMultiplier;
+
+template <typename TaskType>
+Status AdaptiveSharedBatchScheduler<TaskType>::Create(
+    const Options& options,
+    std::shared_ptr<AdaptiveSharedBatchScheduler<TaskType>>* scheduler) {
+  if (options.num_batch_threads < 1) {
+    return errors::InvalidArgument("num_batch_threads must be positive; was ",
+                                   options.num_batch_threads);
+  }
+  if (options.min_scheduling_period_micros < 0) {
+    return errors::InvalidArgument(
+        "min_scheduling_period_micros must be >= 0; was ",
+        options.min_scheduling_period_micros);
+  }
+  if (options.min_scheduling_period_micros >
+      options.initial_scheduling_period_micros) {
+    return errors::InvalidArgument(
+        "initial_scheduling_period_micros (",
+        options.initial_scheduling_period_micros,
+        ") must be >= min_scheduling_period_micros (",
+        options.min_scheduling_period_micros, ")");
+  }
+  if (options.initial_scheduling_period_micros >
+      options.max_scheduling_period_micros) {
+    return errors::InvalidArgument(
+        "initial_scheduling_period_micros (",
+        options.initial_scheduling_period_micros,
+        ") must be <= max_scheduling_period_micros (",
+        options.max_scheduling_period_micros, ")");
+  }
+  if (options.feedback_smoothing_batches < 1) {
+    return errors::InvalidArgument(
+        "feedback_smoothing_batches must be positive; was ",
+        options.feedback_smoothing_batches);
+  }
+  if (options.initial_in_flight_batches_limit > options.num_batch_threads) {
+    return errors::InvalidArgument(
+        "initial_in_flight_batches_limit (",
+        options.initial_in_flight_batches_limit,
+        ") should not be larger than num_batch_threads (",
+        options.num_batch_threads, ")");
+  }
+  if (options.initial_in_flight_batches_limit < 1) {
+    return errors::InvalidArgument(
+        "initial_in_flight_batches_limit should be "
+        "greater than or equal to 1; was ",
+        options.initial_in_flight_batches_limit);
+  }
+  if (options.batches_to_average_over < 1) {
+    return errors::InvalidArgument(
+        "batches_to_average_over should be "
+        "greater than or equal to 1; was ",
+        options.batches_to_average_over);
+  }
+  scheduler->reset(new AdaptiveSharedBatchScheduler<TaskType>(options));
+  return Status::OK();
+}
+
+template <typename TaskType>
+AdaptiveSharedBatchScheduler<TaskType>::AdaptiveSharedBatchScheduler(
+    const Options& options)
+    : options_(options),
+      scheduling_period_(options.initial_scheduling_period_micros),
+      in_flight_batches_limit_(options.initial_in_flight_batches_limit),
+      rand_double_(0.0, 1.0) {
+  std::random_device device;
+  rand_engine_.seed(device());
+  PeriodicFunction::Options opts;
+  opts.thread_name_prefix = "scheduling_thread";
+  opts.env = GetEnv();
+  batch_thread_pool_.reset(new thread::ThreadPool(
+      GetEnv(), options.thread_pool_name, options.num_batch_threads));
+  if (!options.use_in_flight_batches_implementation) {
+    scheduling_thread_.reset(
+        new PeriodicFunction([this] { ProcessOneBatch(); }, 0, opts));
+  }
+}
+
+template <typename TaskType>
+Status AdaptiveSharedBatchScheduler<TaskType>::AddQueue(
+    const QueueOptions& options, BatchProcessor process_batch_callback,
+    std::unique_ptr<BatchScheduler<TaskType>>* queue) {
+  if (options.max_batch_size <= 0) {
+    return errors::InvalidArgument("max_batch_size must be positive; was ",
+                                   options.max_batch_size);
+  }
+  if (options.max_enqueued_batches <= 0) {
+    return errors::InvalidArgument(
+        "max_enqueued_batches must be positive; was ",
+        options.max_enqueued_batches);
+  }
+  internal::ASBSQueue<TaskType>* asbs_queue_raw;
+  queue->reset(asbs_queue_raw = new internal::ASBSQueue<TaskType>(
+                   this->shared_from_this(), options));
+  mutex_lock l(mu_);
+  queues_and_callbacks_[asbs_queue_raw] = process_batch_callback;
+  return Status::OK();
+}
+
+template <typename TaskType>
+void AdaptiveSharedBatchScheduler<TaskType>::AddBatch(
+    const internal::ASBSBatch<TaskType>* batch) {
+  mutex_lock l(mu_);
+  batches_.push(batch);
+  if (options_.use_in_flight_batches_implementation) {
+    MaybeScheduleNextBatch();
+  }
+}
+
+template <typename TaskType>
+void AdaptiveSharedBatchScheduler<TaskType>::RemoveQueue(
+    const internal::ASBSQueue<TaskType>* queue) {
+  mutex_lock l(mu_);
+  queues_and_callbacks_.erase(queue);
+}
+
+template <typename TaskType>
+void AdaptiveSharedBatchScheduler<TaskType>::MaybeScheduleNextBatch() {
+  if (batches_.empty() || in_flight_batches_ >= in_flight_batches_limit_)
+    return;
+  // Non-integer limit handled probabilistially.
+  if (in_flight_batches_limit_ - in_flight_batches_ < 1 &&
+      rand_double_(rand_engine_) >
+          (in_flight_batches_limit_ - in_flight_batches_))
+    return;
+  const internal::ASBSBatch<TaskType>* batch = batches_.top();
+  batches_.pop();
+  // Queue may destroy itself after ReleaseBatch is called.
+  batch->queue()->ReleaseBatch(batch);
+  batch_thread_pool_->Schedule(
+      std::bind(&AdaptiveSharedBatchScheduler<TaskType>::CallbackWrapper, this,
+                batch, queues_and_callbacks_[batch->queue()]));
+  in_flight_batches_++;
+}
+
+template <typename TaskType>
+void AdaptiveSharedBatchScheduler<TaskType>::CallbackWrapper(
+    const internal::ASBSBatch<TaskType>* batch,
+    AdaptiveSharedBatchScheduler<TaskType>::BatchProcessor callback) {
+  int64 start_time = batch->creation_time_micros();
+  callback(std::unique_ptr<Batch<TaskType>>(
+      const_cast<internal::ASBSBatch<TaskType>*>(batch)));
+  int64 end_time = GetEnv()->NowMicros();
+  mutex_lock l(mu_);
+  in_flight_batches_--;
+  batch_count_++;
+  batch_latency_sum_ += end_time - start_time;
+  // Occasionally adjust in_flight_batches_limit_ to minimize average latency.
+  // Although the optimal value may depend on the workload, the latency should
+  // be a simple convex function of in_flight_batches_limit_, allowing us to
+  // locate the global minimum relatively quickly.
+  if (batch_count_ == options_.batches_to_average_over) {
+    double current_avg_latency_ms = (batch_latency_sum_ / 1000.) / batch_count_;
+    bool current_latency_decreased =
+        current_avg_latency_ms < last_avg_latency_ms_;
+    if (current_latency_decreased) {
+      // If latency improvement was because we're moving in the correct
+      // direction, increase step_size so that we can get to the minimum faster.
+      // If latency improvement was due to backtracking from a previous failure,
+      // decrease step_size in order to refine our location.
+      step_size_multiplier_ *= (last_latency_decreased_ ? 2 : 0.5);
+      step_size_multiplier_ =
+          std::min(step_size_multiplier_, kMaxStepSizeMultiplier);
+      step_size_multiplier_ =
+          std::max(step_size_multiplier_, kMinStepSizeMultiplier);
+    } else {
+      // Return (nearly) to previous position and confirm that latency is better
+      // there before decreasing step size.
+      step_direction_ = -step_direction_;
+    }
+    in_flight_batches_limit_ +=
+        step_direction_ * in_flight_batches_limit_ * step_size_multiplier_;
+    in_flight_batches_limit_ =
+        std::min(in_flight_batches_limit_,
+                 static_cast<double>(options_.num_batch_threads));
+    in_flight_batches_limit_ = std::max(in_flight_batches_limit_, 1.0);
+    last_avg_latency_ms_ = current_avg_latency_ms;
+    last_latency_decreased_ = current_latency_decreased;
+    batch_count_ = 0;
+    batch_latency_sum_ = 0;
+  }
+  MaybeScheduleNextBatch();
+}
+
+template <typename TaskType>
+void AdaptiveSharedBatchScheduler<TaskType>::ProcessOneBatch() {
+  static const double kFeedbackMultiplier = .001;
+  const internal::ASBSBatch<TaskType>* batch = nullptr;
+  BatchProcessor callback;
+  const int64 start_time_micros = GetEnv()->NowMicros();
+  {
+    mutex_lock l(mu_);
+    if (!batches_.empty()) {
+      batch = batches_.top();
+      batches_.pop();
+      callback = queues_and_callbacks_[batch->queue()];
+    }
+  }
+  if (batch != nullptr) {
+    double feedback = options_.scheduling_period_feedback();
+    const int64 N = options_.feedback_smoothing_batches;
+    ewma_feedback_ = ((N - 1) * ewma_feedback_ + feedback) / N;
+    scheduling_period_ *= (1 + kFeedbackMultiplier * ewma_feedback_);
+    if (scheduling_period_ < options_.min_scheduling_period_micros) {
+      scheduling_period_ = options_.min_scheduling_period_micros;
+    } else if (scheduling_period_ > options_.max_scheduling_period_micros) {
+      scheduling_period_ = options_.max_scheduling_period_micros;
+    }
+    // Queue may destroy itself after ReleaseBatch is called.
+    batch->queue()->ReleaseBatch(batch);
+    batch_thread_pool_->Schedule([callback, batch] {
+      callback(std::unique_ptr<Batch<TaskType>>(
+          const_cast<internal::ASBSBatch<TaskType>*>(batch)));
+    });
+  }
+  const int64 sleep_time =
+      scheduling_period_ - (GetEnv()->NowMicros() - start_time_micros);
+  if (sleep_time > 0) {
+    GetEnv()->SleepForMicroseconds(sleep_time);
+  }
+}
+
+template <typename TaskType>
+bool AdaptiveSharedBatchScheduler<TaskType>::BatchCompare::operator()(
+    const internal::ASBSBatch<TaskType>* a,
+    const internal::ASBSBatch<TaskType>* b) {
+  return a->creation_time_micros() > b->creation_time_micros();
+}
+
+// ---------------- ASBSQueue ----------------
+
+namespace internal {
+template <typename TaskType>
+ASBSQueue<TaskType>::ASBSQueue(
+    std::shared_ptr<AdaptiveSharedBatchScheduler<TaskType>> scheduler,
+    const QueueOptions& options)
+    : scheduler_(scheduler), options_(options) {}
+
+template <typename TaskType>
+ASBSQueue<TaskType>::~ASBSQueue() {
+  // Wait until last batch has been scheduled.
+  const int kSleepMicros = 1000;
+  for (;;) {
+    {
+      mutex_lock l(mu_);
+      if (num_enqueued_batches_ == 0) {
+        break;
+      }
+    }
+    scheduler_->GetEnv()->SleepForMicroseconds(kSleepMicros);
+  }
+  scheduler_->RemoveQueue(this);
+}
+
+template <typename TaskType>
+Status ASBSQueue<TaskType>::Schedule(std::unique_ptr<TaskType>* task) {
+  ASBSBatch<TaskType>* new_batch = nullptr;
+  size_t size = (*task)->size();
+  if (size > options_.max_batch_size) {
+    return errors::InvalidArgument("Task size ", size,
+                                   " is larger than maximum batch size ",
+                                   options_.max_batch_size);
+  }
+  {
+    mutex_lock l(mu_);
+    // Current batch is full, create another if allowed.
+    if (current_batch_ &&
+        current_batch_->size() + size > options_.max_batch_size) {
+      if (num_enqueued_batches_ >= options_.max_enqueued_batches) {
+        return errors::Unavailable("The batch scheduling queue is full");
+      }
+      current_batch_->Close();
+      current_batch_ = nullptr;
+    }
+    if (!current_batch_) {
+      num_enqueued_batches_++;
+      current_batch_ = new_batch =
+          new ASBSBatch<TaskType>(this, scheduler_->GetEnv()->NowMicros());
+    }
+    current_batch_->AddTask(std::move(*task));
+    num_enqueued_tasks_++;
+  }
+  // AddBatch must be called outside of lock, since it may call ReleaseBatch.
+  if (new_batch != nullptr) scheduler_->AddBatch(new_batch);
+  return Status::OK();
+}
+
+template <typename TaskType>
+void ASBSQueue<TaskType>::ReleaseBatch(const ASBSBatch<TaskType>* batch) {
+  mutex_lock l(mu_);
+  num_enqueued_batches_--;
+  num_enqueued_tasks_ -= batch->num_tasks();
+  if (batch == current_batch_) {
+    current_batch_->Close();
+    current_batch_ = nullptr;
+  }
+}
+
+template <typename TaskType>
+size_t ASBSQueue<TaskType>::NumEnqueuedTasks() const {
+  mutex_lock l(mu_);
+  return num_enqueued_tasks_;
+}
+
+template <typename TaskType>
+size_t ASBSQueue<TaskType>::SchedulingCapacity() const {
+  mutex_lock l(mu_);
+  const int current_batch_capacity =
+      current_batch_ ? options_.max_batch_size - current_batch_->size() : 0;
+  const int spare_batches =
+      options_.max_enqueued_batches - num_enqueued_batches_;
+  return spare_batches * options_.max_batch_size + current_batch_capacity;
+}
+}  // namespace internal
+}  // namespace serving
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_ADAPTIVE_SHARED_BATCH_SCHEDULER_H_
diff --git a/tensorflow/contrib/batching/adaptive_shared_batch_scheduler_test.cc b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler_test.cc
similarity index 78%
rename from tensorflow/contrib/batching/adaptive_shared_batch_scheduler_test.cc
rename to tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler_test.cc
index a07cd6d834fa28904bf7748b16972cca217503c1..8ae8ca02eca20b5d1184e6e588f013d59d10464a 100644
--- a/tensorflow/contrib/batching/adaptive_shared_batch_scheduler_test.cc
+++ b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler_test.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/batching/adaptive_shared_batch_scheduler.h"
+#include "tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h"
 
-#include "tensorflow/contrib/batching/test_util/fake_clock_env.h"
+#include "tensorflow/core/kernels/batching_util/fake_clock_env.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/macros.h"
@@ -141,6 +141,16 @@ TEST(AdaptiveSharedBatchSchedulerTest, BadOptions) {
   options = Scheduler::Options();
   options.feedback_smoothing_batches = 0;
   EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
+  options = Scheduler::Options();
+  options.initial_in_flight_batches_limit = 0.5;
+  EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
+  options = Scheduler::Options();
+  options.num_batch_threads = 5;
+  options.initial_in_flight_batches_limit = 8;
+  EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
+  options = Scheduler::Options();
+  options.batches_to_average_over = -5;
+  EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
 }
 
 TEST(AdaptiveSharedBatchSchedulerTest, ObeysQueueOptions) {
@@ -186,6 +196,7 @@ TEST(AdaptiveSharedBatchSchedulerTest, ObeysQueueOptions) {
     queue_options.max_enqueued_batches = 2;
     TF_ASSERT_OK(
         scheduler->AddQueue(queue_options, queue_0_callback, &queue_0));
+    EXPECT_EQ(10, queue_0->max_task_size());
     queue_options.max_batch_size = 0;
     // Queue must have max_batch_size > 0.
     EXPECT_FALSE(
@@ -433,6 +444,106 @@ TEST(AdaptiveSharedBatchSchedulerTest, QueueCapacityInfo) {
   }
   stop_teardown.Notify();
 }
+
+TEST(AdaptiveSharedBatchSchedulerTest, InFlightBatchesImplementation) {
+  AdaptiveSharedBatchScheduler<FakeTask>::Options options;
+  options.use_in_flight_batches_implementation = true;
+  options.initial_in_flight_batches_limit = 2;
+  options.batches_to_average_over = 1000;
+  mutex mu;
+  int processed_batches = 0;
+  Notification finish_processing;
+  auto queue_callback = [&mu, &processed_batches, &finish_processing](
+                            std::unique_ptr<Batch<FakeTask>> batch) {
+    ASSERT_TRUE(batch->IsClosed());
+    EXPECT_GT(batch->num_tasks(), 0);
+    mu.lock();
+    int batch_num = ++processed_batches;
+    mu.unlock();
+    if (batch_num == 2) {
+      // Give third batch a chance to process if it's going to.
+      Env::Default()->SleepForMicroseconds(1000);
+      finish_processing.Notify();
+    }
+    if (batch_num == 3) {
+      ASSERT_TRUE(finish_processing.HasBeenNotified());
+    }
+    finish_processing.WaitForNotification();
+  };
+  std::shared_ptr<AdaptiveSharedBatchScheduler<FakeTask>> scheduler;
+  TF_ASSERT_OK(
+      AdaptiveSharedBatchScheduler<FakeTask>::Create(options, &scheduler));
+  std::unique_ptr<BatchScheduler<FakeTask>> queue;
+  TF_ASSERT_OK(scheduler->AddQueue({}, queue_callback, &queue));
+
+  // Enqueue 3 batches.
+  for (int i = 0; i < 3; i++) {
+    TF_ASSERT_OK(ScheduleTask(100, queue.get()));
+  }
+}
+
+TEST(AdaptiveSharedBatchSchedulerTest, InFlightBatchesLimitTuning) {
+  test_util::FakeClockEnv env(Env::Default());
+  Notification start_teardown, stop_teardown;
+  std::unique_ptr<Thread> teardown_thread =
+      CreateFakeClockAdvancerThread(&env, &start_teardown, &stop_teardown);
+  {
+    AdaptiveSharedBatchScheduler<FakeTask>::Options options;
+    options.env = &env;
+    options.use_in_flight_batches_implementation = true;
+    options.initial_in_flight_batches_limit = 2;
+    options.batches_to_average_over = 1;
+    auto queue_callback = [&env](std::unique_ptr<Batch<FakeTask>> batch) {
+      ASSERT_TRUE(batch->IsClosed());
+      switch (batch->size()) {
+        case 0:
+          env.AdvanceByMicroseconds(10);
+          break;
+        case 1:
+          env.AdvanceByMicroseconds(15);
+          break;
+        case 2:
+          env.AdvanceByMicroseconds(10);
+          break;
+        case 3:
+          env.AdvanceByMicroseconds(11);
+          break;
+      }
+    };
+    std::shared_ptr<AdaptiveSharedBatchScheduler<FakeTask>> scheduler;
+    TF_ASSERT_OK(
+        AdaptiveSharedBatchScheduler<FakeTask>::Create(options, &scheduler));
+    std::unique_ptr<BatchScheduler<FakeTask>> queue;
+    TF_ASSERT_OK(scheduler->AddQueue({}, queue_callback, &queue));
+
+    TF_ASSERT_OK(ScheduleTask(0, queue.get()));
+    double in_flight_batches_limit = 2;
+    while (scheduler->in_flight_batches_limit() == in_flight_batches_limit) {
+    }
+    // Initial direction will be negative.
+    EXPECT_LT(scheduler->in_flight_batches_limit(), in_flight_batches_limit);
+    in_flight_batches_limit = scheduler->in_flight_batches_limit();
+    TF_ASSERT_OK(ScheduleTask(1, queue.get()));
+    while (scheduler->in_flight_batches_limit() == in_flight_batches_limit) {
+    }
+    // Latency increased -> change direction.
+    EXPECT_GT(scheduler->in_flight_batches_limit(), in_flight_batches_limit);
+    in_flight_batches_limit = scheduler->in_flight_batches_limit();
+    TF_ASSERT_OK(ScheduleTask(2, queue.get()));
+    while (scheduler->in_flight_batches_limit() == in_flight_batches_limit) {
+    }
+    // Latency decreased -> keep going in same direction.
+    EXPECT_GT(scheduler->in_flight_batches_limit(), in_flight_batches_limit);
+    in_flight_batches_limit = scheduler->in_flight_batches_limit();
+    TF_ASSERT_OK(ScheduleTask(3, queue.get()));
+    while (scheduler->in_flight_batches_limit() == in_flight_batches_limit) {
+    }
+    // Latency increased -> change direction.
+    EXPECT_LT(scheduler->in_flight_batches_limit(), in_flight_batches_limit);
+    start_teardown.Notify();
+  }
+  stop_teardown.Notify();
+}
 }  // namespace anonymous
 }  // namespace serving
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/batching_util/basic_batch_scheduler.h b/tensorflow/core/kernels/batching_util/basic_batch_scheduler.h
new file mode 100644
index 0000000000000000000000000000000000000000..2b5a991caf2fc3fdb1068070946f29d26c6a55ff
--- /dev/null
+++ b/tensorflow/core/kernels/batching_util/basic_batch_scheduler.h
@@ -0,0 +1,268 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BASIC_BATCH_SCHEDULER_H_
+#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BASIC_BATCH_SCHEDULER_H_
+
+#include <stddef.h>
+#include <cstddef>
+#include <functional>
+#include <memory>
+#include <string>
+
+#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
+
+namespace tensorflow {
+namespace serving {
+
+// A BatchScheduler implementation geared toward handling a single request type
+// running on a specific set of hardware resources. A typical scenario is one in
+// which all requests invoke the same machine-learned model on one GPU.
+//
+// If there are, say, two GPUs and two models each bound to one of the GPUs, one
+// could use two BasicBatchScheduler instances to schedule the two model/GPU
+// combinations independently. If multiple models must share a given GPU or
+// other hardware resource, consider using SharedBatchScheduler instead.
+//
+//
+// PARAMETERS AND BEHAVIOR:
+//
+// BasicBatchScheduler runs a fixed pool of threads, which it uses to process
+// batches of tasks. It enforces a maximum batch size, and enqueues a bounded
+// number of tasks. If the queue is nearly empty, such that a full batch cannot
+// be formed, when a thread becomes free, it anyway schedules a batch
+// immediately if a task has been in the queue for longer than a given timeout
+// parameter. If the timeout parameter is set to 0, then the batch threads will
+// always be kept busy (unless there are zero tasks waiting to be processed).
+//
+// For online serving, it is recommended to set the maximum number of enqueued
+// batches worth of tasks equal to the number of batch threads, which allows
+// enqueuing of enough tasks s.t. if every thread becomes available it can be
+// kept busy, but no more. For bulk processing jobs and throughput-oriented
+// benchmarks, you may want to set it much higher.
+//
+// When Schedule() is called, if the queue is full the call will fail with an
+// UNAVAILABLE error (after which the client may retry again later). If the call
+// succeeds, the maximum time the task will spend in the queue before being
+// placed in a batch and assigned to a thread for processing, is the greater of:
+//  - the maximum time to process ceil(max_enqueued_batches/num_batch_threads)
+//    (1 in the recommended configuration) batches of previously-submitted tasks
+//  - the configured timeout parameter (which can be 0, as mentioned above)
+//
+// Unlike StreamingBatchScheduler, when BasicBatchScheduler assigns a batch to a
+// thread, it closes the batch. The process-batch callback may assume that every
+// batch it receives is closed at the outset.
+//
+//
+// RECOMMENDED USE-CASES:
+//
+// BasicBatchScheduler is suitable for use-cases that feature a single kind of
+// request (e.g. a server performing inference with a single machine-learned
+// model, possibly evolving over time), with loose versioning semantics.
+// Concretely, the following conditions should hold:
+//
+//  A. All requests batched onto a given resource (e.g. a hardware accelerator,
+//     or a pool accelerators) are of the same type. For example, they all
+//     invoke the same machine-learned model.
+//
+//     These variations are permitted:
+//      - The model may reside in a single servable, or it may be spread across
+//        multiple servables that are used in unison (e.g. a vocabulary lookup
+//        table servable and a tensorflow session servable).
+//      - The model's servable(s) may be static, or they may evolve over time
+//        (successive servable versions).
+//      - Zero or more of the servables are used in the request thread; the rest
+//        are used in the batch thread. In our running example, the vocabulary
+//        lookups and tensorflow runs may both be performed in the batch thread,
+//        or alternatively the vocabulary lookup may occur in the request thread
+//        with only the tensorflow run performed in the batch thread.
+//
+//     In contrast, BasicBatchScheduler is not a good fit if the server
+//     hosts multiple distinct models running on a pool accelerators, with each
+//     request specifying which model it wants to use. BasicBatchScheduler
+//     has no facility to time-multiplex the batch threads across multiple
+//     models in a principled way. More basically, it cannot ensure that a given
+//     batch doesn't contain a mixture of requests for different models.
+//
+//  B. Requests do not specify a particular version of the servable(s) that must
+//     be used. Instead, each request is content to use the "latest" version.
+//
+//     BasicBatchScheduler does not constrain which requests get grouped
+//     together into a batch, so using this scheduler there is no way to achieve
+//     cohesion of versioned requests to version-specific batches.
+//
+//  C. No servable version coordination needs to be performed between the
+//     request threads and the batch threads. Often, servables are only used in
+//     the batch threads, in which case this condition trivially holds. If
+//     servables are used in both threads, then the use-case must tolerate
+//     version skew across the servables used in the two kinds of threads.
+//
+//
+// EXAMPLE USE-CASE FLOW:
+//
+// For such use-cases, request processing via BasicBatchScheduler generally
+// follows this flow (given for illustration; variations are possible):
+//  1. Optionally perform some pre-processing on each request in the request
+//     threads.
+//  2. Route the requests to the batch scheduler, as batching::Task objects.
+//     (Since all requests are of the same type and are not versioned, the
+//     scheduler is free to group them into batches arbitrarily.)
+//  3. Merge the requests into a single batched representation B.
+//  4. Obtain handles to the servable(s) needed to process B. The simplest
+//     approach is to obtain the latest version of each servable. Alternatively,
+//     if cross-servable consistency is required (e.g. the vocabulary lookup
+//     table's version number must match that of the tensorflow session),
+//     identify an appropriate version number and obtain the servable handles
+//     accordingly.
+//  5. Process B using the obtained servable handles, and split the result into
+//     individual per-request units.
+//  6. Perform any post-processing in the batch thread and/or request thread.
+//
+//
+// PERFORMANCE TUNING: See README.md.
+//
+template <typename TaskType>
+class BasicBatchScheduler : public BatchScheduler<TaskType> {
+ public:
+  // TODO(b/25089730): Tune defaults based on best practices as they develop.
+  // (Keep them mirrored to the ones in SharedBatchScheduler::QueueOptions and
+  // SharedBatchScheduler::Options.)
+  struct Options {
+    // The maximum size of each batch.
+    //
+    // The scheduler may form batches of any size between 1 and this number
+    // (inclusive). If there is a need to quantize the batch sizes, i.e. only
+    // submit batches whose size is in a small set of allowed sizes, that can be
+    // done by adding padding in the process-batch callback.
+    int max_batch_size = 1000;
+
+    // If a task has been enqueued for this amount of time (in microseconds),
+    // and a thread is available, the scheduler will immediately form a batch
+    // from enqueued tasks and assign the batch to the thread for processing,
+    // even if the batch's size is below 'max_batch_size'.
+    //
+    // This parameter offers a way to bound queue latency, so that a task isn't
+    // stuck in the queue indefinitely waiting for enough tasks to arrive to
+    // make a full batch. (The latency bound is given in the class documentation
+    // above.)
+    //
+    // The goal is to smooth out batch sizes under low request rates, and thus
+    // avoid latency spikes.
+    int64 batch_timeout_micros = 0;
+
+    // The name to use for the pool of batch threads.
+    string thread_pool_name = {"batch_threads"};
+
+    // The number of threads to use to process batches.
+    // Must be >= 1, and should be tuned carefully.
+    int num_batch_threads = port::NumSchedulableCPUs();
+
+    // The maximum allowable number of enqueued (accepted by Schedule() but
+    // not yet being processed on a batch thread) tasks in terms of batches.
+    // If this limit is reached, Schedule() will return an UNAVAILABLE error.
+    // See the class documentation above for guidelines on how to tune this
+    // parameter.
+    int max_enqueued_batches = 10;
+
+    // The following options are typically only overridden by test code.
+
+    // The environment to use.
+    Env* env = Env::Default();
+  };
+  static Status Create(const Options& options,
+                       std::function<void(std::unique_ptr<Batch<TaskType>>)>
+                           process_batch_callback,
+                       std::unique_ptr<BasicBatchScheduler>* scheduler);
+
+  ~BasicBatchScheduler() override = default;
+
+  Status Schedule(std::unique_ptr<TaskType>* task) override;
+  size_t NumEnqueuedTasks() const override;
+  size_t SchedulingCapacity() const override;
+
+  size_t max_task_size() const override {
+    return shared_scheduler_queue_->max_task_size();
+  }
+
+ private:
+  explicit BasicBatchScheduler(
+      std::unique_ptr<BatchScheduler<TaskType>> shared_scheduler_queue);
+
+  // This class is merely a thin wrapper around a SharedBatchScheduler with a
+  // single queue.
+  std::unique_ptr<BatchScheduler<TaskType>> shared_scheduler_queue_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(BasicBatchScheduler);
+};
+
+//////////
+// Implementation details follow. API users need not read.
+
+template <typename TaskType>
+Status BasicBatchScheduler<TaskType>::Create(
+    const Options& options,
+    std::function<void(std::unique_ptr<Batch<TaskType>>)>
+        process_batch_callback,
+    std::unique_ptr<BasicBatchScheduler>* scheduler) {
+  typename SharedBatchScheduler<TaskType>::Options shared_scheduler_options;
+  shared_scheduler_options.thread_pool_name = options.thread_pool_name;
+  shared_scheduler_options.num_batch_threads = options.num_batch_threads;
+  shared_scheduler_options.env = options.env;
+  std::shared_ptr<SharedBatchScheduler<TaskType>> shared_scheduler;
+  TF_RETURN_IF_ERROR(SharedBatchScheduler<TaskType>::Create(
+      shared_scheduler_options, &shared_scheduler));
+
+  typename SharedBatchScheduler<TaskType>::QueueOptions
+      shared_scheduler_queue_options;
+  shared_scheduler_queue_options.max_batch_size = options.max_batch_size;
+  shared_scheduler_queue_options.batch_timeout_micros =
+      options.batch_timeout_micros;
+  shared_scheduler_queue_options.max_enqueued_batches =
+      options.max_enqueued_batches;
+  std::unique_ptr<BatchScheduler<TaskType>> shared_scheduler_queue;
+  TF_RETURN_IF_ERROR(shared_scheduler->AddQueue(shared_scheduler_queue_options,
+                                                process_batch_callback,
+                                                &shared_scheduler_queue));
+
+  scheduler->reset(
+      new BasicBatchScheduler<TaskType>(std::move(shared_scheduler_queue)));
+  return Status::OK();
+}
+
+template <typename TaskType>
+Status BasicBatchScheduler<TaskType>::Schedule(
+    std::unique_ptr<TaskType>* task) {
+  return shared_scheduler_queue_->Schedule(task);
+}
+
+template <typename TaskType>
+size_t BasicBatchScheduler<TaskType>::NumEnqueuedTasks() const {
+  return shared_scheduler_queue_->NumEnqueuedTasks();
+}
+
+template <typename TaskType>
+size_t BasicBatchScheduler<TaskType>::SchedulingCapacity() const {
+  return shared_scheduler_queue_->SchedulingCapacity();
+}
+
+template <typename TaskType>
+BasicBatchScheduler<TaskType>::BasicBatchScheduler(
+    std::unique_ptr<BatchScheduler<TaskType>> shared_scheduler_queue)
+    : shared_scheduler_queue_(std::move(shared_scheduler_queue)) {}
+
+}  // namespace serving
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BASIC_BATCH_SCHEDULER_H_
diff --git a/tensorflow/contrib/batching/basic_batch_scheduler_benchmark.cc b/tensorflow/core/kernels/batching_util/basic_batch_scheduler_benchmark_test.cc
similarity index 99%
rename from tensorflow/contrib/batching/basic_batch_scheduler_benchmark.cc
rename to tensorflow/core/kernels/batching_util/basic_batch_scheduler_benchmark_test.cc
index ab6c81043359cd10d90668fcf88d61a5e0ea7ee0..65c9c00da57b7a5a163e590df8b52cd130be6fe1 100644
--- a/tensorflow/contrib/batching/basic_batch_scheduler_benchmark.cc
+++ b/tensorflow/core/kernels/batching_util/basic_batch_scheduler_benchmark_test.cc
@@ -16,7 +16,7 @@ limitations under the License.
 // Benchmarks for performance (throughput and latency) of BasicBatchScheduler
 // under various rates of task injection.
 
-#include "tensorflow/contrib/batching/basic_batch_scheduler.h"
+#include "tensorflow/core/kernels/batching_util/basic_batch_scheduler.h"
 #include "tensorflow/core/lib/histogram/histogram.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/contrib/batching/basic_batch_scheduler_test.cc b/tensorflow/core/kernels/batching_util/basic_batch_scheduler_test.cc
similarity index 94%
rename from tensorflow/contrib/batching/basic_batch_scheduler_test.cc
rename to tensorflow/core/kernels/batching_util/basic_batch_scheduler_test.cc
index e020301795c7dadee2815c0e0d727e53e5fb9e6e..494ba0c74c3efcdcc414aa49334a596dd625052c 100644
--- a/tensorflow/contrib/batching/basic_batch_scheduler_test.cc
+++ b/tensorflow/core/kernels/batching_util/basic_batch_scheduler_test.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/batching/basic_batch_scheduler.h"
+#include "tensorflow/core/kernels/batching_util/basic_batch_scheduler.h"
 
 #include <utility>
 
-#include "tensorflow/contrib/batching/batch_scheduler.h"
+#include "tensorflow/core/kernels/batching_util/batch_scheduler.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/macros.h"
@@ -73,6 +73,7 @@ TEST(BasicBatchSchedulerTest, Basic) {
     std::unique_ptr<BasicBatchScheduler<FakeTask>> scheduler;
     TF_ASSERT_OK(
         BasicBatchScheduler<FakeTask>::Create(options, callback, &scheduler));
+    EXPECT_EQ(10, scheduler->max_task_size());
     EXPECT_EQ(0, scheduler->NumEnqueuedTasks());
     EXPECT_EQ(3 * 10, scheduler->SchedulingCapacity());
     TF_ASSERT_OK(ScheduleTask(3, scheduler.get()));
diff --git a/tensorflow/core/kernels/batching_util/batch_scheduler.h b/tensorflow/core/kernels/batching_util/batch_scheduler.h
new file mode 100644
index 0000000000000000000000000000000000000000..f6d9a8f0c8824188d83124d857ca9def7224bc99
--- /dev/null
+++ b/tensorflow/core/kernels/batching_util/batch_scheduler.h
@@ -0,0 +1,281 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Abstractions for processing small tasks in a batched fashion, to reduce
+// processing times and costs that can be amortized across multiple tasks.
+//
+// The core class is BatchScheduler, which groups tasks into batches.
+//
+// BatchScheduler encapsulates logic for aggregating multiple tasks into a
+// batch, and kicking off processing of a batch on a thread pool it manages.
+//
+// This file defines an abstract BatchScheduler class.
+
+#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_SCHEDULER_H_
+#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_SCHEDULER_H_
+
+#include <stddef.h>
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace serving {
+
+// The abstract superclass for a unit of work to be done as part of a batch.
+//
+// An implementing subclass typically contains (or points to):
+//  (a) input data;
+//  (b) a thread-safe completion signal (e.g. a Notification);
+//  (c) a place to store the outcome (success, or some error), upon completion;
+//  (d) a place to store the output data, upon success.
+//
+// Items (b), (c) and (d) are typically non-owned pointers to data homed
+// elsewhere, because a task's ownership gets transferred to a BatchScheduler
+// (see below) and it may be deleted as soon as it is done executing.
+class BatchTask {
+ public:
+  virtual ~BatchTask() = default;
+
+  // Returns the size of the task, in terms of how much it contributes to the
+  // size of a batch. (A batch's size is the sum of its task sizes.)
+  virtual size_t size() const = 0;
+};
+
+// A thread-safe collection of BatchTasks, to be executed together in some
+// fashion.
+//
+// At a given time, a batch is either "open" or "closed": an open batch can
+// accept new tasks; a closed one cannot. A batch is monotonic: initially it is
+// open and tasks can be added to it; then it is closed and its set of tasks
+// remains fixed for the remainder of its life. A closed batch cannot be re-
+// opened. Tasks can never be removed from a batch.
+//
+// Type parameter TaskType must be a subclass of BatchTask.
+template <typename TaskType>
+class Batch {
+ public:
+  Batch() = default;
+  virtual ~Batch();  // Blocks until the batch is closed.
+
+  // Appends 'task' to the batch. After calling AddTask(), the newly-added task
+  // can be accessed via task(num_tasks()-1) or mutable_task(num_tasks()-1).
+  // Dies if the batch is closed.
+  void AddTask(std::unique_ptr<TaskType> task);
+
+  // Removes the most recently added task. Returns nullptr if the batch is
+  // empty.
+  std::unique_ptr<TaskType> RemoveTask();
+
+  // Returns the number of tasks in the batch.
+  int num_tasks() const;
+
+  // Returns true iff the batch contains 0 tasks.
+  bool empty() const;
+
+  // Returns a reference to the ith task (in terms of insertion order).
+  const TaskType& task(int i) const;
+
+  // Returns a pointer to the ith task (in terms of insertion order).
+  TaskType* mutable_task(int i);
+
+  // Returns the sum of the task sizes.
+  size_t size() const;
+
+  // Returns true iff the batch is currently closed.
+  bool IsClosed() const;
+
+  // Blocks until the batch is closed.
+  void WaitUntilClosed() const;
+
+  // Marks the batch as closed. Dies if called more than once.
+  void Close();
+
+ private:
+  mutable mutex mu_;
+
+  // The tasks in the batch.
+  std::vector<std::unique_ptr<TaskType>> tasks_ GUARDED_BY(mu_);
+
+  // The sum of the sizes of the tasks in 'tasks_'.
+  size_t size_ GUARDED_BY(mu_) = 0;
+
+  // Whether the batch has been closed.
+  Notification closed_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(Batch);
+};
+
+// An abstract batch scheduler class. Collects individual tasks into batches,
+// and processes each batch on a pool of "batch threads" that it manages. The
+// actual logic for processing a batch is accomplished via a callback.
+//
+// Type parameter TaskType must be a subclass of BatchTask.
+template <typename TaskType>
+class BatchScheduler {
+ public:
+  virtual ~BatchScheduler() = default;
+
+  // Submits a task to be processed as part of a batch.
+  //
+  // Ownership of '*task' is transferred to the callee iff the method returns
+  // Status::OK. In that case, '*task' is left as nullptr. Otherwise, '*task' is
+  // left as-is.
+  //
+  // If no batch processing capacity is available to process this task at the
+  // present time, and any task queue maintained by the implementing subclass is
+  // full, this method returns an UNAVAILABLE error code. The client may retry
+  // later.
+  //
+  // Other problems, such as the task size being larger than the maximum batch
+  // size, yield other, permanent error types.
+  //
+  // In all cases, this method returns "quickly" without blocking for any
+  // substantial amount of time. If the method returns Status::OK, the task is
+  // processed asynchronously, and any errors that occur during the processing
+  // of the batch that includes the task can be reported to 'task'.
+  virtual Status Schedule(std::unique_ptr<TaskType>* task) = 0;
+
+  // Returns the number of tasks that have been scheduled (i.e. accepted by
+  // Schedule()), but have yet to be handed to a thread for execution as part of
+  // a batch. Note that this returns the number of tasks, not the aggregate task
+  // size (so if there is one task of size 3 and one task of size 5, this method
+  // returns 2 rather than 8).
+  virtual size_t NumEnqueuedTasks() const = 0;
+
+  // Returns a guaranteed number of size 1 tasks that can be Schedule()d without
+  // getting an UNAVAILABLE error. In a typical implementation, returns the
+  // available space on a queue.
+  //
+  // There are two important caveats:
+  //  1. The guarantee does not extend to varying-size tasks due to possible
+  //     internal fragmentation of batches.
+  //  2. The guarantee only holds in a single-thread environment or critical
+  //     section, i.e. if an intervening thread cannot call Schedule().
+  //
+  // This method is useful for monitoring, or for guaranteeing a future slot in
+  // the schedule (but being mindful about the caveats listed above).
+  virtual size_t SchedulingCapacity() const = 0;
+
+  // Returns the maximum allowed size of tasks submitted to the scheduler. (This
+  // is typically equal to a configured maximum batch size.)
+  virtual size_t max_task_size() const = 0;
+};
+
+//////////
+// Implementation details follow. API users need not read.
+
+template <typename TaskType>
+Batch<TaskType>::~Batch() {
+  WaitUntilClosed();
+}
+
+template <typename TaskType>
+void Batch<TaskType>::AddTask(std::unique_ptr<TaskType> task) {
+  DCHECK(!IsClosed());
+  {
+    mutex_lock l(mu_);
+    size_ += task->size();
+    tasks_.push_back(std::move(task));
+  }
+}
+
+template <typename TaskType>
+std::unique_ptr<TaskType> Batch<TaskType>::RemoveTask() {
+  {
+    mutex_lock l(mu_);
+    if (tasks_.empty()) {
+      return nullptr;
+    }
+    std::unique_ptr<TaskType> task = std::move(tasks_.back());
+    size_ -= task->size();
+    tasks_.pop_back();
+    return task;
+  }
+}
+
+template <typename TaskType>
+int Batch<TaskType>::num_tasks() const {
+  {
+    mutex_lock l(mu_);
+    return tasks_.size();
+  }
+}
+
+template <typename TaskType>
+bool Batch<TaskType>::empty() const {
+  {
+    mutex_lock l(mu_);
+    return tasks_.empty();
+  }
+}
+
+template <typename TaskType>
+const TaskType& Batch<TaskType>::task(int i) const {
+  DCHECK_GE(i, 0);
+  {
+    mutex_lock l(mu_);
+    DCHECK_LT(i, tasks_.size());
+    return *tasks_[i].get();
+  }
+}
+
+template <typename TaskType>
+TaskType* Batch<TaskType>::mutable_task(int i) {
+  DCHECK_GE(i, 0);
+  {
+    mutex_lock l(mu_);
+    DCHECK_LT(i, tasks_.size());
+    return tasks_[i].get();
+  }
+}
+
+template <typename TaskType>
+size_t Batch<TaskType>::size() const {
+  {
+    mutex_lock l(mu_);
+    return size_;
+  }
+}
+
+template <typename TaskType>
+bool Batch<TaskType>::IsClosed() const {
+  return const_cast<Notification*>(&closed_)->HasBeenNotified();
+}
+
+template <typename TaskType>
+void Batch<TaskType>::WaitUntilClosed() const {
+  const_cast<Notification*>(&closed_)->WaitForNotification();
+}
+
+template <typename TaskType>
+void Batch<TaskType>::Close() {
+  closed_.Notify();
+}
+
+}  // namespace serving
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_BATCH_SCHEDULER_H_
diff --git a/tensorflow/contrib/batching/batch_scheduler_test.cc b/tensorflow/core/kernels/batching_util/batch_scheduler_test.cc
similarity index 96%
rename from tensorflow/contrib/batching/batch_scheduler_test.cc
rename to tensorflow/core/kernels/batching_util/batch_scheduler_test.cc
index f15d8cc8e57300dddc06dcffb24ec98920e193ef..2357a320a8a477460dce8ba3f30973a0af4a369f 100644
--- a/tensorflow/contrib/batching/batch_scheduler_test.cc
+++ b/tensorflow/core/kernels/batching_util/batch_scheduler_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/batching/batch_scheduler.h"
+#include "tensorflow/core/kernels/batching_util/batch_scheduler.h"
 
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/macros.h"
@@ -74,7 +74,9 @@ TEST(BatchTest, Basic) {
   EXPECT_EQ(task1->size(), batch.task(1).size());
 
   EXPECT_EQ(7, batch.RemoveTask()->size());
+  EXPECT_EQ(3, batch.size());
   EXPECT_EQ(3, batch.RemoveTask()->size());
+  EXPECT_EQ(0, batch.size());
   EXPECT_TRUE(batch.empty());
 }
 
diff --git a/tensorflow/contrib/batching/test_util/fake_clock_env.cc b/tensorflow/core/kernels/batching_util/fake_clock_env.cc
similarity index 97%
rename from tensorflow/contrib/batching/test_util/fake_clock_env.cc
rename to tensorflow/core/kernels/batching_util/fake_clock_env.cc
index 166d6703bde1054a4a44842ecea382b5a1fb79e7..6a757d871681051432ae737e710655b3285f2d24 100644
--- a/tensorflow/contrib/batching/test_util/fake_clock_env.cc
+++ b/tensorflow/core/kernels/batching_util/fake_clock_env.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/batching/test_util/fake_clock_env.h"
+#include "tensorflow/core/kernels/batching_util/fake_clock_env.h"
 
 #include <string>
 
diff --git a/tensorflow/core/kernels/batching_util/fake_clock_env.h b/tensorflow/core/kernels/batching_util/fake_clock_env.h
new file mode 100644
index 0000000000000000000000000000000000000000..60f1cbe7bd4d3bb73abfab413cdddaecf5de6c68
--- /dev/null
+++ b/tensorflow/core/kernels/batching_util/fake_clock_env.h
@@ -0,0 +1,76 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_FAKE_CLOCK_ENV_H_
+#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_FAKE_CLOCK_ENV_H_
+
+#include <functional>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace serving {
+namespace test_util {
+
+// An Env implementation with a fake clock for NowMicros() and
+// SleepForMicroseconds(). The clock doesn't advance on its own; it advances via
+// an explicit Advance() method.
+// All other Env virtual methods pass through to a wrapped Env.
+class FakeClockEnv : public EnvWrapper {
+ public:
+  explicit FakeClockEnv(Env* wrapped);
+  ~FakeClockEnv() override = default;
+
+  // Advance the clock by a certain number of microseconds.
+  void AdvanceByMicroseconds(int micros);
+
+  // Blocks until there is a sleeping thread that is scheduled to wake up at
+  // the given (absolute) time.
+  void BlockUntilSleepingThread(uint64 wake_time);
+
+  // Blocks until there are at least num_threads sleeping.
+  void BlockUntilThreadsAsleep(int num_threads);
+
+  // Methods that this class implements.
+  uint64 NowMicros() override;
+  void SleepForMicroseconds(int64 micros) override;
+
+ private:
+  mutex mu_;
+
+  uint64 current_time_ GUARDED_BY(mu_) = 0;
+
+  struct SleepingThread {
+    uint64 wake_time;
+    Notification* wake_notification;
+  };
+  std::vector<SleepingThread> sleeping_threads_ GUARDED_BY(mu_);
+
+  TF_DISALLOW_COPY_AND_ASSIGN(FakeClockEnv);
+};
+
+}  // namespace test_util
+}  // namespace serving
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_FAKE_CLOCK_ENV_H_
diff --git a/tensorflow/contrib/batching/util/periodic_function.cc b/tensorflow/core/kernels/batching_util/periodic_function.cc
similarity index 98%
rename from tensorflow/contrib/batching/util/periodic_function.cc
rename to tensorflow/core/kernels/batching_util/periodic_function.cc
index b7e4838da50c2daf70a5b2c7b7f630caa0be96fa..9726d04ac27bb4b2258275426e9cbb6ce5994435 100644
--- a/tensorflow/contrib/batching/util/periodic_function.cc
+++ b/tensorflow/core/kernels/batching_util/periodic_function.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/batching/util/periodic_function.h"
+#include "tensorflow/core/kernels/batching_util/periodic_function.h"
 
 #include <algorithm>
 
diff --git a/tensorflow/core/kernels/batching_util/periodic_function.h b/tensorflow/core/kernels/batching_util/periodic_function.h
new file mode 100644
index 0000000000000000000000000000000000000000..36a4019002aa55c26fb5419c7a4d17562a367de8
--- /dev/null
+++ b/tensorflow/core/kernels/batching_util/periodic_function.h
@@ -0,0 +1,134 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// PeriodicFunction will periodically call the given function with a specified
+// period in a background thread.  After Start() returns, the thread is
+// guaranteed to have started. The destruction of the class causes the
+// background thread to be destroyed as well.  Start() should not be called more
+// than once.
+//
+// PeriodicFunction runs the function as soon as any previous run both is
+// complete and was started more than "interval_micros" earlier.  Thus, runs are
+// both serialized, and normally have a period of "interval_micros" if no run
+// exceeds the time.
+//
+// Note that, if the function takes longer than two interval_micross to finish,
+// then PeriodicFunction will "skip" at least one call to the function.  For
+// instance, if the period is 50ms and the function starts runs at time 0 for
+// 150ms, then the function will immediately start executing again at time 150,
+// but there will be no function runs corresponding to times 50 or 100.  This is
+// especially important to remember when using an environment with a simulated
+// clock: advancing simulated time atomically over N interval_micross will not
+// cause the function to be called N times.
+//
+// This object is thread-safe.
+//
+// Example:
+//
+//   class Foo {
+//    public:
+//     Foo() : periodic_function_([this]() { Bar(); },
+//                               1000 /* 1000us == 1ms*/) {
+//     }
+//
+//    private:
+//     void Bar() { ... }
+//
+//     PeriodicFunction periodic_function_;
+//   };
+
+#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_PERIODIC_FUNCTION_H_
+#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_PERIODIC_FUNCTION_H_
+
+#include "tensorflow/core/kernels/batching_util/periodic_function.h"
+
+#include <functional>
+#include <memory>
+#include <string>
+
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace serving {
+
+namespace internal {
+class PeriodicFunctionTestAccess;
+}
+
+class PeriodicFunction {
+ public:
+  // Provides the ability to customize several aspects of the PeriodicFunction.
+  // Passed to constructor of PeriodicFunction.
+  struct Options {
+    Options() {}
+
+    // Any standard thread options, such as stack size, should
+    // be passed via "thread_options".
+    ThreadOptions thread_options;
+
+    // Specifies the thread name prefix (see the description in class
+    // Thread).
+    string thread_name_prefix = "periodic_function";
+
+    // The environment to use. Does not take ownership, but must remain alive
+    // for as long as the PeriodicFunction exists.
+    Env* env = Env::Default();
+
+    // Specifies the length of sleep before the first invocation of the
+    // function.
+    // This can be used for adding a random jitter to avoid synchronous behavior
+    // across multiple periodic functions.
+    int64 startup_delay_micros = 0;
+  };
+
+  // Also starts the background thread which will be calling the function.
+  PeriodicFunction(const std::function<void()>& function, int64 interval_micros,
+                   const Options& options = Options());
+
+  ~PeriodicFunction();
+
+ private:
+  friend class internal::PeriodicFunctionTestAccess;
+
+  // Notifies the background thread to stop.
+  void NotifyStop();
+
+  // (Blocking.) Loops forever calling "function_" every "interval_micros_".
+  void RunLoop(int64 start) LOCKS_EXCLUDED(mutex_);
+
+  const std::function<void()> function_;  // Actual client function
+  const int64 interval_micros_;           // Interval between calls.
+  const Options options_;
+
+  // Protects state below.
+  mutable mutex mutex_;
+  // Used to notify the thread to stop.
+  Notification stop_thread_;
+
+  // Thread for running "function_"
+  std::unique_ptr<Thread> thread_ GUARDED_BY(mutex_) = nullptr;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(PeriodicFunction);
+};
+
+}  // namespace serving
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_PERIODIC_FUNCTION_H_
diff --git a/tensorflow/contrib/batching/util/periodic_function_test.cc b/tensorflow/core/kernels/batching_util/periodic_function_test.cc
similarity index 98%
rename from tensorflow/contrib/batching/util/periodic_function_test.cc
rename to tensorflow/core/kernels/batching_util/periodic_function_test.cc
index 15179611160e1962bbd28b03ddbaa2eec35eb8ea..7682f0e1dd1506d27cefbf9d3f913c4ac5cdb7cc 100644
--- a/tensorflow/contrib/batching/util/periodic_function_test.cc
+++ b/tensorflow/core/kernels/batching_util/periodic_function_test.cc
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/batching/util/periodic_function.h"
+#include "tensorflow/core/kernels/batching_util/periodic_function.h"
 
 #include <memory>
 #include <string>
 
-#include "tensorflow/contrib/batching/test_util/fake_clock_env.h"
+#include "tensorflow/core/kernels/batching_util/fake_clock_env.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
new file mode 100644
index 0000000000000000000000000000000000000000..b77289aded437b2e6955ced3f7eca2aa5bd182dd
--- /dev/null
+++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
@@ -0,0 +1,705 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_SHARED_BATCH_SCHEDULER_H_
+#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_SHARED_BATCH_SCHEDULER_H_
+
+#include <stddef.h>
+#include <deque>
+#include <functional>
+#include <list>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/core/kernels/batching_util/batch_scheduler.h"
+#include "tensorflow/core/kernels/batching_util/periodic_function.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace serving {
+namespace internal {
+template <typename TaskType>
+class Queue;
+}  // namespace internal
+}  // namespace serving
+}  // namespace tensorflow
+
+namespace tensorflow {
+namespace serving {
+
+// A batch scheduler for server instances that service multiple request types
+// (e.g. multiple machine-learned models, or multiple versions of a model served
+// concurrently), or even multiple distinct tasks for a given request. The
+// scheduler multiplexes batches of different kinds of tasks onto a fixed-size
+// thread pool (each batch contains tasks of a single type), in a carefully
+// controlled manner. A common configuration is to set the number of threads
+// equal to the number of hardware accelerator units, in which case the
+// scheduler takes care of multiplexing the task types onto the shared hardware,
+// in a manner that is both fair and efficient.
+//
+// Semantically, SharedBatchScheduler behaves like having N instances of
+// BasicBatchScheduler (see basic_batch_scheduler.h), one per task type. The
+// difference is that under the covers there is a single shared thread pool,
+// instead of N independent ones, with their sharing deliberately coordinated.
+//
+// SharedBatchScheduler does not implement the BatchScheduler API; rather, it
+// presents an abstraction of "queues", where each queue corresponds to one type
+// of task. Tasks submitted to a given queue are placed in their own batches,
+// and cannot be mixed with other tasks. Queues can be added and deleted
+// dynamically, to accommodate e.g. versions of a model being brought up and
+// down over the lifetime of a server.
+//
+// The batch thread pool round-robins through the queues, running one batch
+// from a queue and then moving to the next queue. Each queue behaves like a
+// BasicBatchScheduler instance, in the sense that it has maximum batch size and
+// timeout parameters, which govern when a batch is eligible to be processed.
+//
+// Each queue is independently configured with a maximum size (in terms of the
+// maximum number of batches worth of enqueued tasks). For online serving, it is
+// recommended that the queue sizes be configured such that the sum of the sizes
+// of the active queues roughly equal the number of batch threads. (The idea is
+// that if all threads become available at roughly the same time, there will be
+// enough enqueued work for them to take on, but no more.)
+//
+// If queue sizes are configured in the manner suggested above, the maximum time
+// a task can spend in a queue before being placed in a batch and assigned to a
+// thread for processing, is the greater of:
+//  - the maximum time to process one batch of tasks from any active queue
+//  - the configured timeout parameter for the task's queue (which can be 0)
+//
+// For bulk processing jobs and throughput-oriented benchmarks, you may want to
+// set the maximum queue size to a large value.
+//
+// TODO(b/26539183): Support queue servicing policies other than round-robin.
+// E.g. let each queue specify a "share" (an int >= 1), so e.g. with queues A
+// and B having shares 1 and 2 respectively, the servicing pattern is ABBABB...
+//
+//
+// PERFORMANCE TUNING: See README.md.
+//
+template <typename TaskType>
+class SharedBatchScheduler
+    : public std::enable_shared_from_this<SharedBatchScheduler<TaskType>> {
+ public:
+  // TODO(b/25089730): Tune defaults based on best practices as they develop.
+  struct Options {
+    // The name to use for the pool of batch threads.
+    string thread_pool_name = {"batch_threads"};
+
+    // The number of threads to use to process batches.
+    // Must be >= 1, and should be tuned carefully.
+    int num_batch_threads = port::NumSchedulableCPUs();
+
+    // The environment to use.
+    // (Typically only overridden by test code.)
+    Env* env = Env::Default();
+  };
+  // Ownership is shared between the caller of Create() and any queues created
+  // via AddQueue().
+  static Status Create(
+      const Options& options,
+      std::shared_ptr<SharedBatchScheduler<TaskType>>* scheduler);
+
+  ~SharedBatchScheduler();
+
+  // Adds a queue to which tasks may be submitted. The returned queue implements
+  // the BatchScheduler API. Each queue has its own set of scheduling options,
+  // and its own callback to process batches of tasks submitted to the queue.
+  //
+  // The returned queue's destructor blocks until all tasks submitted to it have
+  // been processed.
+  struct QueueOptions {
+    // The maximum size of each batch.
+    //
+    // The scheduler may form batches of any size between 1 and this number
+    // (inclusive). If there is a need to quantize the batch sizes, i.e. only
+    // submit batches whose size is in a small set of allowed sizes, that can be
+    // done by adding padding in the process-batch callback.
+    int max_batch_size = 1000;
+
+    // If a task has been enqueued for this amount of time (in microseconds),
+    // and a thread is available, the scheduler will immediately form a batch
+    // from enqueued tasks and assign the batch to the thread for processing,
+    // even if the batch's size is below 'max_batch_size'.
+    //
+    // This parameter offers a way to bound queue latency, so that a task isn't
+    // stuck in the queue indefinitely waiting for enough tasks to arrive to
+    // make a full batch. (The latency bound is given in the class documentation
+    // above.)
+    //
+    // The goal is to smooth out batch sizes under low request rates, and thus
+    // avoid latency spikes.
+    int64 batch_timeout_micros = 0;
+
+    // The maximum allowable number of enqueued (accepted by Schedule() but
+    // not yet being processed on a batch thread) tasks in terms of batches.
+    // If this limit is reached, Schedule() will return an UNAVAILABLE error.
+    // See the class documentation above for guidelines on how to tune this
+    // parameter.
+    int max_enqueued_batches = 10;
+  };
+  Status AddQueue(const QueueOptions& options,
+                  std::function<void(std::unique_ptr<Batch<TaskType>>)>
+                      process_batch_callback,
+                  std::unique_ptr<BatchScheduler<TaskType>>* queue);
+
+ private:
+  explicit SharedBatchScheduler(const Options& options);
+
+  // The code executed in 'batch_threads_'. Obtains a batch to process from the
+  // queue pointed to by 'next_queue_to_schedule_', and processes it. If that
+  // queue declines to provide a batch to process, moves onto the next queue. If
+  // no queues provide a batch to process, just sleeps briefly and exits.
+  void ThreadLogic();
+
+  const Options options_;
+
+  mutex mu_;
+
+  // A list of queues. (We use std::list instead of std::vector to ensure that
+  // iterators are not invalidated by adding/removing elements. It also offers
+  // efficient removal of elements from the middle.)
+  using QueueList = std::list<std::unique_ptr<internal::Queue<TaskType>>>;
+
+  // All "active" queues, i.e. ones that either:
+  //  - have not been removed, or
+  //  - have been removed but are not yet empty.
+  QueueList queues_ GUARDED_BY(mu_);
+
+  // An iterator over 'queues_', pointing to the queue from which the next
+  // available batch thread should grab work.
+  typename QueueList::iterator next_queue_to_schedule_ GUARDED_BY(mu_);
+
+  // Used by idle batch threads to wait for work to enter the system. Notified
+  // whenever a batch becomes schedulable.
+  condition_variable schedulable_batch_cv_;
+
+  // Threads that process batches obtained from the queues.
+  std::vector<std::unique_ptr<PeriodicFunction>> batch_threads_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(SharedBatchScheduler);
+};
+
+//////////
+// Implementation details follow. API users need not read.
+
+namespace internal {
+
+// A task queue for SharedBatchScheduler. Accepts tasks and accumulates them
+// into batches, and dispenses those batches to be processed via a "pull"
+// interface. The queue's behavior is governed by maximum batch size, timeout
+// and maximum queue length parameters; see their documentation in
+// SharedBatchScheduler.
+//
+// The queue is implemented as a deque of batches, with these invariants:
+//  - The number of batches is between 1 and 'options_.max_enqueued_batches'.
+//  - The back-most batch is open; the rest are closed.
+//
+// Submitted tasks are added to the open batch. If that batch doesn't have room
+// but the queue isn't full, then that batch is closed and a new open batch is
+// started.
+//
+// Batch pull requests are handled by dequeuing the front-most batch if it is
+// closed. If the front-most batch is open (i.e. the queue contains only one
+// batch) and has reached the timeout, it is immediately closed and returned;
+// otherwise no batch is returned for the request.
+template <typename TaskType>
+class Queue {
+ public:
+  using ProcessBatchCallback =
+      std::function<void(std::unique_ptr<Batch<TaskType>>)>;
+  using SchedulableBatchCallback = std::function<void()>;
+  Queue(const typename SharedBatchScheduler<TaskType>::QueueOptions& options,
+        Env* env, ProcessBatchCallback process_batch_callback,
+        SchedulableBatchCallback schdulable_batch_callback);
+
+  // Illegal to destruct unless the queue is empty.
+  ~Queue();
+
+  // Submits a task to the queue, with the same semantics as
+  // BatchScheduler::Schedule().
+  Status Schedule(std::unique_ptr<TaskType>* task);
+
+  // Returns the number of enqueued tasks, with the same semantics as
+  // BatchScheduler::NumEnqueuedTasks().
+  size_t NumEnqueuedTasks() const;
+
+  // Returns the queue capacity, with the same semantics as
+  // BatchScheduler::SchedulingCapacity().
+  size_t SchedulingCapacity() const;
+
+  // Returns the maximum allowed size of tasks submitted to the queue.
+  size_t max_task_size() const { return options_.max_batch_size; }
+
+  // Called by a thread that is ready to process a batch, to request one from
+  // this queue. Either returns a batch that is ready to be processed, or
+  // nullptr if the queue declines to schedule a batch at this time. If it
+  // returns a batch, the batch is guaranteed to be closed.
+  std::unique_ptr<Batch<TaskType>> ScheduleBatch();
+
+  // Processes a batch that has been returned earlier by ScheduleBatch().
+  void ProcessBatch(std::unique_ptr<Batch<TaskType>> batch);
+
+  // Determines whether the queue is empty, i.e. has no tasks waiting or being
+  // processed.
+  bool IsEmpty() const;
+
+  // Marks the queue closed, and waits until it is empty.
+  void CloseAndWaitUntilEmpty();
+
+  bool closed() const {
+    mutex_lock l(mu_);
+    return closed_;
+  }
+
+ private:
+  // Same as IsEmpty(), but assumes the caller already holds a lock on 'mu_'.
+  bool IsEmptyInternal() const EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // Closes the open batch residing at the back of 'batches_', and inserts a
+  // fresh open batch behind it.
+  void StartNewBatch() EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // Determines whether the open batch residing at the back of 'batches_' is
+  // currently schedulable.
+  bool IsOpenBatchSchedulable() const EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  const typename SharedBatchScheduler<TaskType>::QueueOptions options_;
+
+  // The environment to use.
+  Env* env_;
+
+  // A callback invoked to processes a batch of work units. Always invoked from
+  // a batch thread.
+  ProcessBatchCallback process_batch_callback_;
+
+  // A callback invoked to notify the scheduler that a new batch has become
+  // schedulable.
+  SchedulableBatchCallback schedulable_batch_callback_;
+
+  mutable mutex mu_;
+
+  // Whether this queue can accept new tasks. This variable is monotonic: it
+  // starts as false, and then at some point gets set to true and remains true
+  // for the duration of this object's life.
+  bool closed_ GUARDED_BY(mu_) = false;
+
+  // The enqueued batches. See the invariants in the class comments above.
+  std::deque<std::unique_ptr<Batch<TaskType>>> batches_ GUARDED_BY(mu_);
+
+  // The time at which the first task was added to the open (back-most) batch
+  // in 'batches_'. Valid iff that batch contains at least one task.
+  uint64 open_batch_start_time_micros_ GUARDED_BY(mu_);
+
+  // Whether this queue contains a batch that is eligible to be scheduled. Used
+  // to keep track of when to call 'schedulable_batch_callback_'.
+  bool schedulable_batch_ GUARDED_BY(mu_) = false;
+
+  // The number of batches currently being processed by batch threads.
+  // Incremented in ScheduleBatch() and decremented in ProcessBatch().
+  int num_batches_being_processed_ GUARDED_BY(mu_) = 0;
+
+  // Used by CloseAndWaitUntilEmpty() to wait until the queue is empty, for the
+  // case in which the queue is not empty when CloseAndWaitUntilEmpty() starts.
+  // When ProcessBatch() dequeues the last batch and makes the queue empty, if
+  // 'empty_notification_' is non-null it calls 'empty_notification_->Notify()'.
+  Notification* empty_notification_ GUARDED_BY(mu_) = nullptr;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(Queue);
+};
+
+// A RAII-style object that points to a Queue and implements
+// the BatchScheduler API. To be handed out to clients who call AddQueue().
+template <typename TaskType>
+class QueueHandle : public BatchScheduler<TaskType> {
+ public:
+  QueueHandle(std::shared_ptr<SharedBatchScheduler<TaskType>> scheduler,
+              Queue<TaskType>* queue);
+  ~QueueHandle() override;
+
+  Status Schedule(std::unique_ptr<TaskType>* task) override;
+  size_t NumEnqueuedTasks() const override;
+  size_t SchedulingCapacity() const override;
+
+  size_t max_task_size() const override { return queue_->max_task_size(); }
+
+ private:
+  // The scheduler that owns 'queue_'.
+  std::shared_ptr<SharedBatchScheduler<TaskType>> scheduler_;
+
+  // The queue this handle wraps. Owned by 'scheduler_', which keeps it alive at
+  // least until this class's destructor closes it.
+  Queue<TaskType>* queue_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(QueueHandle);
+};
+
+}  // namespace internal
+
+template <typename TaskType>
+Status SharedBatchScheduler<TaskType>::Create(
+    const Options& options,
+    std::shared_ptr<SharedBatchScheduler<TaskType>>* scheduler) {
+  if (options.num_batch_threads < 1) {
+    return errors::InvalidArgument("num_batch_threads must be positive; was ",
+                                   options.num_batch_threads);
+  }
+  scheduler->reset(new SharedBatchScheduler<TaskType>(options));
+  return Status::OK();
+}
+
+template <typename TaskType>
+SharedBatchScheduler<TaskType>::~SharedBatchScheduler() {
+  // Wait until the batch threads finish clearing out and deleting the closed
+  // queues.
+  for (;;) {
+    {
+      mutex_lock l(mu_);
+      if (queues_.empty()) {
+        break;
+      }
+    }
+    const int64 kSleepTimeMicros = 100;
+    options_.env->SleepForMicroseconds(kSleepTimeMicros);
+  }
+  // Delete the batch threads before allowing state the threads may access (e.g.
+  // 'mu_') to be deleted.
+  batch_threads_.clear();
+}
+
+template <typename TaskType>
+Status SharedBatchScheduler<TaskType>::AddQueue(
+    const QueueOptions& options,
+    std::function<void(std::unique_ptr<Batch<TaskType>>)>
+        process_batch_callback,
+    std::unique_ptr<BatchScheduler<TaskType>>* queue) {
+  if (options.max_batch_size <= 0) {
+    return errors::InvalidArgument("max_batch_size must be positive; was ",
+                                   options.max_batch_size);
+  }
+  if (options.batch_timeout_micros < 0) {
+    return errors::InvalidArgument(
+        "batch_timeout_micros must be non-negative; was ",
+        options.batch_timeout_micros);
+  }
+  if (options.max_enqueued_batches < 0) {
+    return errors::InvalidArgument(
+        "max_enqueued_batches must be non-negative; was ",
+        options.max_enqueued_batches);
+  }
+
+  auto schedulable_batch_callback = [this] {
+    mutex_lock l(mu_);
+    schedulable_batch_cv_.notify_one();
+  };
+  auto internal_queue =
+      std::unique_ptr<internal::Queue<TaskType>>(new internal::Queue<TaskType>(
+          options, options_.env, process_batch_callback,
+          schedulable_batch_callback));
+  auto handle = std::unique_ptr<BatchScheduler<TaskType>>(
+      new internal::QueueHandle<TaskType>(this->shared_from_this(),
+                                          internal_queue.get()));
+  {
+    mutex_lock l(mu_);
+    queues_.push_back(std::move(internal_queue));
+    if (next_queue_to_schedule_ == queues_.end()) {
+      next_queue_to_schedule_ = queues_.begin();
+    }
+  }
+  *queue = std::move(handle);
+  return Status::OK();
+}
+
+template <typename TaskType>
+SharedBatchScheduler<TaskType>::SharedBatchScheduler(const Options& options)
+    : options_(options), next_queue_to_schedule_(queues_.end()) {
+  // Kick off the batch threads.
+  PeriodicFunction::Options periodic_fn_options;
+  periodic_fn_options.thread_name_prefix =
+      strings::StrCat(options.thread_pool_name, "_");
+  for (int i = 0; i < options.num_batch_threads; ++i) {
+    std::unique_ptr<PeriodicFunction> thread(new PeriodicFunction(
+        [this] { this->ThreadLogic(); },
+        0 /* function invocation interval time */, periodic_fn_options));
+    batch_threads_.push_back(std::move(thread));
+  }
+}
+
+template <typename TaskType>
+void SharedBatchScheduler<TaskType>::ThreadLogic() {
+  // A batch to process next (or nullptr if no work to do).
+  std::unique_ptr<Batch<TaskType>> batch_to_process;
+  // The queue with which 'batch_to_process' is associated.
+  internal::Queue<TaskType>* queue_for_batch = nullptr;
+  {
+    mutex_lock l(mu_);
+
+    const int num_queues = queues_.size();
+    for (int num_queues_tried = 0;
+         batch_to_process == nullptr && num_queues_tried < num_queues;
+         ++num_queues_tried) {
+      DCHECK(next_queue_to_schedule_ != queues_.end());
+
+      // If a closed queue responds to ScheduleBatch() with nullptr, the queue
+      // will never yield any further batches so we can drop it. To avoid a
+      // race, we take a snapshot of the queue's closedness state *before*
+      // calling ScheduleBatch().
+      const bool queue_closed = (*next_queue_to_schedule_)->closed();
+
+      // Ask '*next_queue_to_schedule_' if it wants us to process a batch.
+      batch_to_process = (*next_queue_to_schedule_)->ScheduleBatch();
+      if (batch_to_process != nullptr) {
+        queue_for_batch = next_queue_to_schedule_->get();
+      }
+
+      // Advance 'next_queue_to_schedule_'.
+      if (queue_closed && (*next_queue_to_schedule_)->IsEmpty() &&
+          batch_to_process == nullptr) {
+        // We've encountered a closed queue with no work to do. Drop it.
+        DCHECK_NE(queue_for_batch, next_queue_to_schedule_->get());
+        next_queue_to_schedule_ = queues_.erase(next_queue_to_schedule_);
+      } else {
+        ++next_queue_to_schedule_;
+      }
+      if (next_queue_to_schedule_ == queues_.end() && !queues_.empty()) {
+        // We've hit the end. Wrap to the first queue.
+        next_queue_to_schedule_ = queues_.begin();
+      }
+    }
+
+    if (batch_to_process == nullptr) {
+      // We couldn't find any work to do. Wait until a new batch becomes
+      // schedulable, or some time has elapsed, before checking again.
+      const int64 kTimeoutMillis = 1;  // The smallest accepted granule of time.
+      WaitForMilliseconds(&l, &schedulable_batch_cv_, kTimeoutMillis);
+      return;
+    }
+  }
+
+  queue_for_batch->ProcessBatch(std::move(batch_to_process));
+}
+
+namespace internal {
+
+template <typename TaskType>
+Queue<TaskType>::Queue(
+    const typename SharedBatchScheduler<TaskType>::QueueOptions& options,
+    Env* env, ProcessBatchCallback process_batch_callback,
+    SchedulableBatchCallback schedulable_batch_callback)
+    : options_(options),
+      env_(env),
+      process_batch_callback_(process_batch_callback),
+      schedulable_batch_callback_(schedulable_batch_callback) {
+  // Create an initial, open batch.
+  batches_.emplace_back(new Batch<TaskType>);
+}
+
+template <typename TaskType>
+Queue<TaskType>::~Queue() {
+  mutex_lock l(mu_);
+  DCHECK(IsEmptyInternal());
+
+  // Close the (empty) open batch, so its destructor doesn't block.
+  batches_.back()->Close();
+}
+
+template <typename TaskType>
+Status Queue<TaskType>::Schedule(std::unique_ptr<TaskType>* task) {
+  if ((*task)->size() > options_.max_batch_size) {
+    return errors::InvalidArgument("Task size ", (*task)->size(),
+                                   " is larger than maximum batch size ",
+                                   options_.max_batch_size);
+  }
+
+  bool notify_of_schedulable_batch = false;
+  {
+    mutex_lock l(mu_);
+
+    DCHECK(!closed_);
+
+    if (batches_.back()->size() + (*task)->size() > options_.max_batch_size) {
+      if (batches_.size() >= options_.max_enqueued_batches) {
+        return errors::Unavailable(
+            "The batch scheduling queue to which this task was submitted is "
+            "full");
+      }
+      StartNewBatch();
+    }
+    if (batches_.back()->empty()) {
+      open_batch_start_time_micros_ = env_->NowMicros();
+    }
+    batches_.back()->AddTask(std::move(*task));
+
+    if (!schedulable_batch_) {
+      if (batches_.size() > 1 || IsOpenBatchSchedulable()) {
+        schedulable_batch_ = true;
+        notify_of_schedulable_batch = true;
+      }
+    }
+  }
+
+  if (notify_of_schedulable_batch) {
+    schedulable_batch_callback_();
+  }
+
+  return Status::OK();
+}
+
+template <typename TaskType>
+size_t Queue<TaskType>::NumEnqueuedTasks() const {
+  mutex_lock l(mu_);
+  size_t num_enqueued_tasks = 0;
+  for (const auto& batch : batches_) {
+    num_enqueued_tasks += batch->num_tasks();
+  }
+  return num_enqueued_tasks;
+}
+
+template <typename TaskType>
+size_t Queue<TaskType>::SchedulingCapacity() const {
+  mutex_lock l(mu_);
+  const int num_new_batches_schedulable =
+      options_.max_enqueued_batches - batches_.size();
+  const int open_batch_capacity =
+      options_.max_batch_size - batches_.back()->size();
+  return (num_new_batches_schedulable * options_.max_batch_size) +
+         open_batch_capacity;
+}
+
+template <typename TaskType>
+std::unique_ptr<Batch<TaskType>> Queue<TaskType>::ScheduleBatch() {
+  // The batch to schedule, which we may populate below. (If left as nullptr,
+  // that means we are electing not to schedule a batch at this time.)
+  std::unique_ptr<Batch<TaskType>> batch_to_schedule;
+
+  {
+    mutex_lock l(mu_);
+
+    // Consider closing the open batch at this time, to schedule it.
+    if (batches_.size() == 1 && IsOpenBatchSchedulable()) {
+      StartNewBatch();
+    }
+
+    if (batches_.size() >= 2) {
+      // There is at least one closed batch that is ready to be scheduled.
+      ++num_batches_being_processed_;
+      batch_to_schedule = std::move(batches_.front());
+      batches_.pop_front();
+    } else {
+      schedulable_batch_ = false;
+    }
+  }
+
+  return batch_to_schedule;
+}
+
+template <typename TaskType>
+void Queue<TaskType>::ProcessBatch(std::unique_ptr<Batch<TaskType>> batch) {
+  process_batch_callback_(std::move(batch));
+
+  {
+    mutex_lock l(mu_);
+    --num_batches_being_processed_;
+    if (empty_notification_ != nullptr && IsEmptyInternal()) {
+      empty_notification_->Notify();
+    }
+  }
+}
+
+template <typename TaskType>
+bool Queue<TaskType>::IsEmpty() const {
+  mutex_lock l(mu_);
+  return IsEmptyInternal();
+}
+
+template <typename TaskType>
+void Queue<TaskType>::CloseAndWaitUntilEmpty() {
+  Notification empty;
+  {
+    mutex_lock l(mu_);
+    closed_ = true;
+    if (IsEmptyInternal()) {
+      empty.Notify();
+    } else {
+      // Arrange for ProcessBatch() to notify when the queue becomes empty.
+      empty_notification_ = &empty;
+    }
+  }
+  empty.WaitForNotification();
+}
+
+template <typename TaskType>
+bool Queue<TaskType>::IsEmptyInternal() const {
+  return num_batches_being_processed_ == 0 && batches_.size() == 1 &&
+         batches_.back()->empty();
+}
+
+template <typename TaskType>
+void Queue<TaskType>::StartNewBatch() {
+  batches_.back()->Close();
+  batches_.emplace_back(new Batch<TaskType>);
+}
+
+template <typename TaskType>
+bool Queue<TaskType>::IsOpenBatchSchedulable() const {
+  Batch<TaskType>* open_batch = batches_.back().get();
+  if (open_batch->empty()) {
+    return false;
+  }
+  return closed_ || open_batch->size() >= options_.max_batch_size ||
+         env_->NowMicros() >=
+             open_batch_start_time_micros_ + options_.batch_timeout_micros;
+}
+
+template <typename TaskType>
+QueueHandle<TaskType>::QueueHandle(
+    std::shared_ptr<SharedBatchScheduler<TaskType>> scheduler,
+    Queue<TaskType>* queue)
+    : scheduler_(scheduler), queue_(queue) {}
+
+template <typename TaskType>
+QueueHandle<TaskType>::~QueueHandle() {
+  queue_->CloseAndWaitUntilEmpty();
+}
+
+template <typename TaskType>
+Status QueueHandle<TaskType>::Schedule(std::unique_ptr<TaskType>* task) {
+  return queue_->Schedule(task);
+}
+
+template <typename TaskType>
+size_t QueueHandle<TaskType>::NumEnqueuedTasks() const {
+  return queue_->NumEnqueuedTasks();
+}
+
+template <typename TaskType>
+size_t QueueHandle<TaskType>::SchedulingCapacity() const {
+  return queue_->SchedulingCapacity();
+}
+
+}  // namespace internal
+
+}  // namespace serving
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_SHARED_BATCH_SCHEDULER_H_
diff --git a/tensorflow/contrib/batching/shared_batch_scheduler_test.cc b/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc
similarity index 96%
rename from tensorflow/contrib/batching/shared_batch_scheduler_test.cc
rename to tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc
index 3e924ae5f13519b4fe9a3f4b510773ca2bddaf23..d5ea2b648f35efd03c04d00abc838edadd37570e 100644
--- a/tensorflow/contrib/batching/shared_batch_scheduler_test.cc
+++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/batching/shared_batch_scheduler.h"
+#include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
 
-#include "tensorflow/contrib/batching/test_util/fake_clock_env.h"
+#include "tensorflow/core/kernels/batching_util/fake_clock_env.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -55,15 +55,14 @@ Status ScheduleTask(size_t task_size, BatchScheduler<FakeTask>* scheduler) {
 // use the clock to be destroyed.
 std::unique_ptr<Thread> CreateFakeClockAdvancerThread(
     test_util::FakeClockEnv* env, Notification* start, Notification* stop) {
-  return std::unique_ptr<Thread>(
-      Env::Default()->StartThread({}, "FakeClockAdvancerThread",
-                                  [env, start, stop] {
-                                    start->WaitForNotification();
-                                    while (!stop->HasBeenNotified()) {
-                                      env->AdvanceByMicroseconds(10);
-                                      Env::Default()->SleepForMicroseconds(10);
-                                    }
-                                  }));
+  return std::unique_ptr<Thread>(Env::Default()->StartThread(
+      {}, "FakeClockAdvancerThread", [env, start, stop] {
+        start->WaitForNotification();
+        while (!stop->HasBeenNotified()) {
+          env->AdvanceByMicroseconds(10);
+          Env::Default()->SleepForMicroseconds(10);
+        }
+      }));
 }
 
 TEST(SharedBatchSchedulerTest, Basic) {
@@ -258,7 +257,7 @@ TEST(SharedBatchSchedulerTest, ObeysTimeout) {
 TEST(SharedBatchSchedulerTest, ObeysTimeoutWithRealClock) {
   Notification first_batch_processed, second_batch_processed;
   auto callback = [&first_batch_processed, &second_batch_processed](
-      std::unique_ptr<Batch<FakeTask>> batch) {
+                      std::unique_ptr<Batch<FakeTask>> batch) {
     ASSERT_TRUE(batch->IsClosed());
     if (batch->size() == 1) {
       first_batch_processed.Notify();
@@ -301,7 +300,7 @@ TEST(SharedBatchSchedulerTest,
   {
     Notification first_batch_processed, second_batch_processed;
     auto callback = [&first_batch_processed, &second_batch_processed](
-        std::unique_ptr<Batch<FakeTask>> batch) {
+                        std::unique_ptr<Batch<FakeTask>> batch) {
       ASSERT_TRUE(batch->IsClosed());
       if (batch->size() == 1) {
         first_batch_processed.Notify();
@@ -349,7 +348,7 @@ TEST(SharedBatchSchedulerTest, Fairness) {
     auto queue_0_callback = [&queue_0_first_batch_scheduled,
                              &queue_0_first_batch_proceed,
                              &queue_0_second_batch_scheduled](
-        std::unique_ptr<Batch<FakeTask>> batch) {
+                                std::unique_ptr<Batch<FakeTask>> batch) {
       if (!queue_0_first_batch_scheduled.HasBeenNotified()) {
         queue_0_first_batch_scheduled.Notify();
         queue_0_first_batch_proceed.WaitForNotification();
@@ -429,6 +428,7 @@ TEST(SharedBatchSchedulerTest, ConstMethods) {
     queue_options.max_enqueued_batches = max_enqueued_batches;
     std::unique_ptr<BatchScheduler<FakeTask>> queue;
     TF_ASSERT_OK(scheduler->AddQueue(queue_options, callback, &queue));
+    EXPECT_EQ(2, queue->max_task_size());
     EXPECT_EQ(0, queue->NumEnqueuedTasks());
     EXPECT_EQ(max_enqueued_batches * 2, queue->SchedulingCapacity());
 
@@ -466,7 +466,7 @@ TEST(SharedBatchSchedulerTest, ConstMethods) {
 TEST(SharedBatchSchedulerTest, OneFullQueueDoesntBlockOtherQueues) {
   Notification queue_0_processing, queue_0_proceed;
   auto queue_0_callback = [&queue_0_processing, &queue_0_proceed](
-      std::unique_ptr<Batch<FakeTask>> batch) {
+                              std::unique_ptr<Batch<FakeTask>> batch) {
     if (!queue_0_processing.HasBeenNotified()) {
       queue_0_processing.Notify();
       queue_0_proceed.WaitForNotification();
diff --git a/tensorflow/core/kernels/batchtospace_op.cc b/tensorflow/core/kernels/batchtospace_op.cc
index c1c0d6d329206088acaa009b3ffe695661527e44..b07c5fd718daea802a08650f97ccff393914e208 100644
--- a/tensorflow/core/kernels/batchtospace_op.cc
+++ b/tensorflow/core/kernels/batchtospace_op.cc
@@ -56,9 +56,10 @@ static void BatchToSpaceOpCompute(OpKernelContext* context,
       errors::InvalidArgument("input rank should be >= ", 1 + block_dims,
                               " instead of ", orig_input_tensor.dims()));
 
-  OP_REQUIRES(context, TensorShapeUtils::IsMatrix(orig_crops.shape()) &&
-                           block_dims == orig_crops.dim_size(0) &&
-                           2 == orig_crops.dim_size(1),
+  OP_REQUIRES(context,
+              TensorShapeUtils::IsMatrix(orig_crops.shape()) &&
+                  block_dims == orig_crops.dim_size(0) &&
+                  2 == orig_crops.dim_size(1),
               errors::InvalidArgument("crops should have shape [", block_dims,
                                       ", 2] instead of ",
                                       orig_crops.shape().DebugString()));
diff --git a/tensorflow/core/kernels/bcast_ops.cc b/tensorflow/core/kernels/bcast_ops.cc
index 2ad2c4163649cb97f5a0b03dfeffa1b5fd53e208..8e4f08e473060b50d387d53aab89c10d0a26b93a 100644
--- a/tensorflow/core/kernels/bcast_ops.cc
+++ b/tensorflow/core/kernels/bcast_ops.cc
@@ -13,20 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/util/bcast.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/bcast.h"
 
 namespace tensorflow {
 
 // Given shapes of two tensors, computes the broadcast shape.
+template <typename T>
 class BCastArgsOp : public OpKernel {
  public:
-  explicit BCastArgsOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->MatchSignature({DT_INT32, DT_INT32}, {DT_INT32}));
-  }
+  explicit BCastArgsOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
 
   void Compute(OpKernelContext* ctx) override {
     OP_REQUIRES(
@@ -40,7 +39,7 @@ class BCastArgsOp : public OpKernel {
                                           in.shape().DebugString()));
       BCast::Vec vec;
       for (int64 i = 0; i < in.NumElements(); ++i) {
-        vec.push_back(in.vec<int32>()(i));
+        vec.push_back(in.vec<T>()(i));
       }
       shapes.push_back(vec);
     }
@@ -60,7 +59,7 @@ class BCastArgsOp : public OpKernel {
     Tensor* o = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(idx, TensorShape({len}), &o));
     for (int64 i = 0; i < len; ++i) {
-      o->flat<int32>()(i) = static_cast<int32>(v[i]);
+      o->flat<T>()(i) = static_cast<T>(v[i]);
     }
   }
 
@@ -72,12 +71,10 @@ class BCastArgsOp : public OpKernel {
 //
 // TODO(zhifengc):
 //   1. Adds support for n-ary (n >= 2).
+template <typename T>
 class BCastGradArgsOp : public OpKernel {
  public:
-  explicit BCastGradArgsOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
-    OP_REQUIRES_OK(
-        ctx, ctx->MatchSignature({DT_INT32, DT_INT32}, {DT_INT32, DT_INT32}));
-  }
+  explicit BCastGradArgsOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
 
   void Compute(OpKernelContext* ctx) override {
     OP_REQUIRES(
@@ -91,7 +88,7 @@ class BCastGradArgsOp : public OpKernel {
                                           in.shape().DebugString()));
       BCast::Vec vec;
       for (int64 i = 0; i < in.NumElements(); ++i) {
-        vec.push_back(in.vec<int32>()(i));
+        vec.push_back(in.vec<T>()(i));
       }
       shapes.push_back(vec);
     }
@@ -112,7 +109,7 @@ class BCastGradArgsOp : public OpKernel {
     Tensor* o = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(idx, TensorShape({len}), &o));
     for (int64 i = 0; i < len; ++i) {
-      o->flat<int32>()(i) = static_cast<int32>(v[i]);
+      o->flat<T>()(i) = static_cast<T>(v[i]);
     }
   }
 
@@ -125,14 +122,28 @@ REGISTER_KERNEL_BUILDER(Name("BroadcastArgs")
                             .HostMemory("s0")
                             .HostMemory("s1")
                             .HostMemory("r0"),
-                        BCastArgsOp);
+                        BCastArgsOp<int32>);
+REGISTER_KERNEL_BUILDER(Name("BroadcastArgs")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<int64>("T")
+                            .HostMemory("s0")
+                            .HostMemory("s1")
+                            .HostMemory("r0"),
+                        BCastArgsOp<int64>);
 REGISTER_KERNEL_BUILDER(Name("BroadcastArgs")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<int32>("T")
                             .HostMemory("s0")
                             .HostMemory("s1")
                             .HostMemory("r0"),
-                        BCastArgsOp);
+                        BCastArgsOp<int32>);
+REGISTER_KERNEL_BUILDER(Name("BroadcastArgs")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<int64>("T")
+                            .HostMemory("s0")
+                            .HostMemory("s1")
+                            .HostMemory("r0"),
+                        BCastArgsOp<int64>);
 
 #if TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("BroadcastArgs")
@@ -141,7 +152,14 @@ REGISTER_KERNEL_BUILDER(Name("BroadcastArgs")
                             .HostMemory("s0")
                             .HostMemory("s1")
                             .HostMemory("r0"),
-                        BCastArgsOp);
+                        BCastArgsOp<int32>);
+REGISTER_KERNEL_BUILDER(Name("BroadcastArgs")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int64>("T")
+                            .HostMemory("s0")
+                            .HostMemory("s1")
+                            .HostMemory("r0"),
+                        BCastArgsOp<int32>);
 #endif
 
 REGISTER_KERNEL_BUILDER(Name("BroadcastGradientArgs")
@@ -151,7 +169,15 @@ REGISTER_KERNEL_BUILDER(Name("BroadcastGradientArgs")
                             .HostMemory("s1")
                             .HostMemory("r0")
                             .HostMemory("r1"),
-                        BCastGradArgsOp);
+                        BCastGradArgsOp<int32>);
+REGISTER_KERNEL_BUILDER(Name("BroadcastGradientArgs")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<int64>("T")
+                            .HostMemory("s0")
+                            .HostMemory("s1")
+                            .HostMemory("r0")
+                            .HostMemory("r1"),
+                        BCastGradArgsOp<int64>);
 REGISTER_KERNEL_BUILDER(Name("BroadcastGradientArgs")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<int32>("T")
@@ -159,7 +185,15 @@ REGISTER_KERNEL_BUILDER(Name("BroadcastGradientArgs")
                             .HostMemory("s1")
                             .HostMemory("r0")
                             .HostMemory("r1"),
-                        BCastGradArgsOp);
+                        BCastGradArgsOp<int32>);
+REGISTER_KERNEL_BUILDER(Name("BroadcastGradientArgs")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<int64>("T")
+                            .HostMemory("s0")
+                            .HostMemory("s1")
+                            .HostMemory("r0")
+                            .HostMemory("r1"),
+                        BCastGradArgsOp<int64>);
 
 #if TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("BroadcastGradientArgs")
@@ -169,6 +203,14 @@ REGISTER_KERNEL_BUILDER(Name("BroadcastGradientArgs")
                             .HostMemory("s1")
                             .HostMemory("r0")
                             .HostMemory("r1"),
-                        BCastGradArgsOp);
+                        BCastGradArgsOp<int32>);
+REGISTER_KERNEL_BUILDER(Name("BroadcastGradientArgs")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int64>("T")
+                            .HostMemory("s0")
+                            .HostMemory("s1")
+                            .HostMemory("r0")
+                            .HostMemory("r1"),
+                        BCastGradArgsOp<int64>);
 #endif
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc
index 42f3db1d79d4e0b0406f8c5c9abb423c03f30ab6..754b93b073a36d0925a0339956b8224878b849e1 100644
--- a/tensorflow/core/kernels/bias_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc
@@ -77,14 +77,14 @@ void BiasGPU<T>::compute(const GPUDevice& d, const T* input, const T* bias,
   }
   CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
   if (data_format == FORMAT_NHWC) {
-    BiasNHWCKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, input, bias, output, bias_size);
+    BiasNHWCKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, input, bias, output, bias_size);
   } else {
-    BiasNCHWKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, input, bias, output, bias_size,
-        image_size);
+    BiasNCHWKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, input, bias, output, bias_size,
+            image_size);
   }
 }
 
@@ -173,19 +173,13 @@ __global__ void BiasGradNCHW_SharedAtomics(const T* output_backprop,
   // Accumulate the results in the shared memory into the first element.
   // No syncthreads is needed since this is only in the same warp.
   int32 thread_index = threadIdx.x;
-  if (thread_index < 16) {
-    s_data[thread_index] += s_data[thread_index + 16];
-    __syncwarp(0xFFFF);
-    if (thread_index < 8) s_data[thread_index] += s_data[thread_index + 8];
-    __syncwarp(0xFF);
-    if (thread_index < 4) s_data[thread_index] += s_data[thread_index + 4];
-    __syncwarp(0xF);
-    if (thread_index < 2) s_data[thread_index] += s_data[thread_index + 2];
-    __syncwarp(0x3);
+  if (thread_index < 32) {
+    AccT data = s_data[thread_index];
+    for (int32 delta = warpSize / 2; delta > 0; delta /= 2) {
+      data += CudaShuffleXorSync(kCudaWarpAll, data, delta);
+    }
     if (thread_index == 0) {
-      T val = T(s_data[0] + s_data[1]);
-      // The first thread writes out the accumulated result to global location.
-      CudaAtomicAdd(bias_backprop + bias_index, val);
+      CudaAtomicAdd(bias_backprop + bias_index, T(data));
     }
   }
 }
@@ -212,10 +206,10 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
   // Check if we have enough shared memory.
   if (shared_memory_size <= max_shared_memory_size) {
     if (data_format == FORMAT_NHWC) {
-      BiasGradNHWC_SharedAtomics<
-          T><<<config.block_count, config.thread_per_block, shared_memory_size,
-               d.stream()>>>(total_count, output_backprop, bias_backprop,
-                             bias_size);
+      BiasGradNHWC_SharedAtomics<T>
+          <<<config.block_count, config.thread_per_block, shared_memory_size,
+             d.stream()>>>(total_count, output_backprop, bias_backprop,
+                           bias_size);
     } else {
       // Round up the block count to multiple of bias_size.
       int group_size = (config.block_count + bias_size - 1) / bias_size;
@@ -223,23 +217,24 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
       if (config.thread_per_block < kWarpSize) {
         config.thread_per_block = kWarpSize;
       }
-      BiasGradNCHW_SharedAtomics<
-          T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          output_backprop, bias_backprop, batch, bias_size, image_size,
-          group_size);
+      BiasGradNCHW_SharedAtomics<T>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              output_backprop, bias_backprop, batch, bias_size, image_size,
+              group_size);
     }
   } else {
     // Note that even if we don't have enough shared memory to fit the entire
     // output block, it is possible to process one group of elements at a time.
     // But for now, we simply fall back to the naive implementation.
     if (data_format == FORMAT_NHWC) {
-      BiasGradNHWC_Naive<
-          T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          total_count, output_backprop, bias_backprop, bias_size);
+      BiasGradNHWC_Naive<T>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              total_count, output_backprop, bias_backprop, bias_size);
     } else {
-      BiasGradNCHW_Naive<
-          T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-          total_count, output_backprop, bias_backprop, bias_size, image_size);
+      BiasGradNCHW_Naive<T>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              total_count, output_backprop, bias_backprop, bias_size,
+              image_size);
     }
   }
 }
diff --git a/tensorflow/core/kernels/bitcast_op.h b/tensorflow/core/kernels/bitcast_op.h
index 0413569e795bcc0911d95a7a946e172579b4ef3a..900ab6f35c15e908a415849784b612da2b6d7c22 100644
--- a/tensorflow/core/kernels/bitcast_op.h
+++ b/tensorflow/core/kernels/bitcast_op.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // See docs in ../ops/array_ops.cc.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BITCAST_OP_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BITCAST_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_BITCAST_OP_H_
+#define TENSORFLOW_CORE_KERNELS_BITCAST_OP_H_
 
 #include <string.h>  // for memcpy
 
@@ -27,4 +27,4 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/casts.h"
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BITCAST_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_BITCAST_OP_H_
diff --git a/tensorflow/core/kernels/bounds_check.h b/tensorflow/core/kernels/bounds_check.h
index e35f42ad4173348f63445030aef6c6de2b1de9a7..c8c60c55241ab2b1b3a426560959fed7ea893129 100644
--- a/tensorflow/core/kernels/bounds_check.h
+++ b/tensorflow/core/kernels/bounds_check.h
@@ -48,7 +48,7 @@ EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC const T SubtleMustCopy(const T &x) {
   auto *to_x = reinterpret_cast<const volatile T *>(&x);
   return *to_x;
 }
-}  // namespace tensorflow::internal
+}  // namespace internal
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_UTIL_BOUNDS_CHECK_H_
diff --git a/tensorflow/core/kernels/bucketize_op.cc b/tensorflow/core/kernels/bucketize_op.cc
index c1693de53894228865af675746f8da13073574f8..4e4b6d52154cd1bacc621535f7dd9c56045a3c57 100644
--- a/tensorflow/core/kernels/bucketize_op.cc
+++ b/tensorflow/core/kernels/bucketize_op.cc
@@ -25,10 +25,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-using thread::ThreadPool;
-
-typedef Eigen::ThreadPoolDevice CPUDevice;
-typedef Eigen::GpuDevice GPUDevice;
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
 
 namespace functor {
 
@@ -49,6 +47,7 @@ struct BucketizeFunctor<CPUDevice, T> {
     return Status::OK();
   }
 };
+
 }  // namespace functor
 
 template <typename Device, typename T>
diff --git a/tensorflow/core/kernels/bucketize_op_gpu.cu.cc b/tensorflow/core/kernels/bucketize_op_gpu.cu.cc
index 325dee793b3eef4e045e2b3d5ad2f96dbf3943d8..551d77f4950d08e869c49cbc245c564a1050c047 100644
--- a/tensorflow/core/kernels/bucketize_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/bucketize_op_gpu.cu.cc
@@ -33,11 +33,28 @@ namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-template <typename T>
+template <typename T, bool useSharedMem>
 __global__ void BucketizeCustomKernel(
     const int32 size_in, const T* in, const int32 size_boundaries,
     CudaDeviceArrayStruct<float> boundaries_array, int32* out) {
   const float* boundaries = GetCudaDeviceArrayOnDevice(&boundaries_array);
+
+  extern __shared__ __align__(sizeof(float)) unsigned char shared_mem[];
+  float* shared_mem_boundaries = reinterpret_cast<float*>(shared_mem);
+
+  if (useSharedMem) {
+    int32 lidx = threadIdx.y * blockDim.x + threadIdx.x;
+    int32 blockSize = blockDim.x * blockDim.y;
+
+    for (int32 i = lidx; i < size_boundaries; i += blockSize) {
+      shared_mem_boundaries[i] = boundaries[i];
+    }
+
+    __syncthreads();
+
+    boundaries = shared_mem_boundaries;
+  }
+
   CUDA_1D_KERNEL_LOOP(i, size_in) {
     T value = in[i];
     int32 bucket = 0;
@@ -77,11 +94,20 @@ struct BucketizeFunctor<GPUDevice, T> {
     TF_RETURN_IF_ERROR(boundaries_array.Finalize());
 
     CudaLaunchConfig config = GetCudaLaunchConfig(input.size(), d);
-    BucketizeCustomKernel<T>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            input.size(), input.data(), boundaries_vector.size(),
-            boundaries_array.data(), output.data());
-
+    int32 shared_mem_size = sizeof(float) * boundaries_vector.size();
+    const int32 kMaxSharedMemBytes = 16384;
+    if (shared_mem_size < d.sharedMemPerBlock() &&
+        shared_mem_size < kMaxSharedMemBytes) {
+      BucketizeCustomKernel<T, true>
+          <<<config.block_count, config.thread_per_block, shared_mem_size,
+             d.stream()>>>(input.size(), input.data(), boundaries_vector.size(),
+                           boundaries_array.data(), output.data());
+    } else {
+      BucketizeCustomKernel<T, false>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              input.size(), input.data(), boundaries_vector.size(),
+              boundaries_array.data(), output.data());
+    }
     return Status::OK();
   }
 };
diff --git a/tensorflow/core/kernels/candidate_sampler_ops.cc b/tensorflow/core/kernels/candidate_sampler_ops.cc
index e937c4f11ba34e16d319b7b4dec317e81b6b8b2c..654d99301af5f528e4360d70edf4cadd4165382d 100644
--- a/tensorflow/core/kernels/candidate_sampler_ops.cc
+++ b/tensorflow/core/kernels/candidate_sampler_ops.cc
@@ -126,13 +126,13 @@ REGISTER_KERNEL_BUILDER(Name("UniformCandidateSampler").Device(DEVICE_CPU),
 REGISTER_KERNEL_BUILDER(Name("LogUniformCandidateSampler").Device(DEVICE_CPU),
                         SimpleCandidateSamplerOp<LogUniformSampler>);
 
-REGISTER_KERNEL_BUILDER(Name("LearnedUnigramCandidateSampler")
-                            .Device(DEVICE_CPU),
-                        SimpleCandidateSamplerOp<UnigramSampler>);
+REGISTER_KERNEL_BUILDER(
+    Name("LearnedUnigramCandidateSampler").Device(DEVICE_CPU),
+    SimpleCandidateSamplerOp<UnigramSampler>);
 
-REGISTER_KERNEL_BUILDER(Name("ThreadUnsafeUnigramCandidateSampler")
-                            .Device(DEVICE_CPU),
-                        SimpleCandidateSamplerOp<ThreadUnsafeUnigramSampler>);
+REGISTER_KERNEL_BUILDER(
+    Name("ThreadUnsafeUnigramCandidateSampler").Device(DEVICE_CPU),
+    SimpleCandidateSamplerOp<ThreadUnsafeUnigramSampler>);
 
 class AllCandidateSamplerOp : public BaseCandidateSamplerOp {
  public:
@@ -197,8 +197,9 @@ class ComputeAccidentalHitsOp : public OpKernel {
   void Compute(OpKernelContext* context) override {
     const Tensor& in_true_candidates = context->input(0);
     const TensorShape& in_true_candidates_shape = in_true_candidates.shape();
-    OP_REQUIRES(context, TensorShapeUtils::IsMatrix(in_true_candidates_shape) &&
-                             in_true_candidates_shape.dim_size(1) == num_true_,
+    OP_REQUIRES(context,
+                TensorShapeUtils::IsMatrix(in_true_candidates_shape) &&
+                    in_true_candidates_shape.dim_size(1) == num_true_,
                 errors::InvalidArgument(
                     "true_candidates must be a batch_size * num_true matrix"));
 
diff --git a/tensorflow/core/kernels/captured_function.cc b/tensorflow/core/kernels/captured_function.cc
deleted file mode 100644
index 00cdc1eff2d3003cb55e868389033f8504e01588..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/captured_function.cc
+++ /dev/null
@@ -1,196 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/core/kernels/captured_function.h"
-
-#include <utility>
-
-#include "tensorflow/core/common_runtime/threadpool_device.h"
-#include "tensorflow/core/framework/allocator.h"
-#include "tensorflow/core/framework/device_attributes.pb.h"
-#include "tensorflow/core/framework/lookup_interface.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/queue_interface.h"
-#include "tensorflow/core/framework/reader_interface.h"
-#include "tensorflow/core/framework/resource_handle.pb_text.h"
-#include "tensorflow/core/kernels/dataset.h"
-#include "tensorflow/core/kernels/variable_ops.h"
-#include "tensorflow/core/platform/notification.h"
-#include "tensorflow/core/public/session_options.h"
-
-
-namespace tensorflow {
-
-/* static */
-Status CapturedFunction::Create(
-    OpKernelContext* ctx, const NameAttrList& func, int graph_def_version,
-    std::vector<Tensor> captured_inputs,
-    std::unique_ptr<CapturedFunction>* out_function) {
-  // NOTE(mrry): We need to assign a name to the device, and we choose
-  // the same name as the calling context's device so that we do not
-  // need to rewrite resource handles that are found in `captured_inputs`.
-  Device* device =
-      new ThreadPoolDevice(SessionOptions(), ctx->device()->attributes().name(),
-                           Bytes(256 << 20), DeviceLocality(), cpu_allocator());
-
-// TODO(mrry): Handle arbitrary resource types, which might require a
-// redesign (or opening up access to `ResourceMgr::DoLookup()` and
-// `ResourceMgr::DoCreate()` to this code).
-#define HANDLE_RESOURCE_TYPE(ResourceType)                                     \
-  if (input_handle.hash_code() == MakeTypeIndex<ResourceType>().hash_code()) { \
-    ResourceType* resource;                                                    \
-    Status s = LookupResource(ctx, input_handle, &resource);                   \
-    if (errors::IsNotFound(s)) {                                               \
-      return errors::FailedPrecondition(                                       \
-          "Failed to capture resource named \"", input_handle.name(),          \
-          "\" in a dataset function. You may need to initialize it "           \
-          "explicitly before initializing an iterator that uses it.");         \
-    } else if (!s.ok()) {                                                      \
-      return s;                                                                \
-    }                                                                          \
-    ResourceType* already_created_resource;                                    \
-    /* Look up the resource in the this function's resource manager, in case   \
-     * it has already been created. */                                         \
-    s = device->resource_manager()->Lookup(input_handle.container(),           \
-                                           input_handle.name(),                \
-                                           &already_created_resource);         \
-    if (s.ok()) {                                                              \
-      CHECK_EQ(resource, already_created_resource);                            \
-      resource->Unref();                                                       \
-      already_created_resource->Unref();                                       \
-    } else {                                                                   \
-      if (errors::IsNotFound(s)) {                                             \
-        TF_RETURN_IF_ERROR(device->resource_manager()->Create(                 \
-            input_handle.container(), input_handle.name(), resource));         \
-      } else {                                                                 \
-        return s;                                                              \
-      }                                                                        \
-    }                                                                          \
-    continue;                                                                  \
-  }
-
-  for (size_t i = 0; i < captured_inputs.size(); ++i) {
-    if (captured_inputs[i].dtype() == DT_RESOURCE) {
-      // Extract the resource from `ctx->resource_manager()` and
-      // insert it into `device->resource_manager()` so that it can be
-      // used when the function executes.
-      ResourceHandle input_handle =
-          captured_inputs[i].scalar<ResourceHandle>()();
-      HANDLE_RESOURCE_TYPE(lookup::LookupInterface);
-      HANDLE_RESOURCE_TYPE(QueueInterface);
-      HANDLE_RESOURCE_TYPE(Var);
-      return errors::Unimplemented(
-          "Cannot currently capture resource '",
-          ProtoDebugString(input_handle),
-          "' in a dataset function (type not supported).");
-    }
-  }
-#undef HANDLE_RESOURCE_TYPE
-
-  std::unique_ptr<DeviceMgr> device_mgr(new DeviceMgr({device}));
-  std::unique_ptr<FunctionLibraryDefinition> flib_def(
-      new FunctionLibraryDefinition(
-          *ctx->function_library()->GetFunctionLibraryDefinition()));
-  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(
-      new ProcessFunctionLibraryRuntime(device_mgr.get(), ctx->env(),
-                                        graph_def_version, flib_def.get(),
-                                        {} /* TODO(mrry): OptimizerOptions? */,
-                                        nullptr /* TODO(mrry): ClusterFLR */));
-
-  FunctionLibraryRuntime* lib = pflr->GetFLR(device->name());
-
-  FunctionLibraryRuntime::Handle f_handle;
-  TF_RETURN_IF_ERROR(
-      lib->Instantiate(func.name(), AttrSlice(&func.attr()), &f_handle));
-
-  out_function->reset(new CapturedFunction(
-      device, std::move(device_mgr), std::move(flib_def), std::move(pflr), lib,
-      f_handle, std::move(captured_inputs)));
-  return Status::OK();
-}
-
-Status CapturedFunction::Run(FunctionLibraryRuntime::Options f_opts,
-                             gtl::ArraySlice<Tensor> args,
-                             std::vector<Tensor>* rets) {
-  Notification n;
-  Status s;
-  auto done_callback = [&n, &s](Status func_status) {
-    s.Update(func_status);
-    n.Notify();
-  };
-  // TODO(mrry): Add cancellation manager support to IteratorContext
-  // so that we can cancel running map functions. The local
-  // cancellation manager here is created so that we can run kernels
-  // (such as queue kernels) that depend on the non-nullness
-  // `OpKernelContext::cancellation_manager()`, but additional effort
-  // will be required to plumb it through the `IteratorContext`.
-  CancellationManager c_mgr;
-  f_opts.cancellation_manager = &c_mgr;
-  RunHelper(std::move(f_opts), args, rets, std::move(done_callback));
-  n.WaitForNotification();
-  return s;
-}
-
-void CapturedFunction::RunAsync(FunctionLibraryRuntime::Options f_opts,
-                                gtl::ArraySlice<Tensor> args,
-                                std::vector<Tensor>* rets,
-                                FunctionLibraryRuntime::DoneCallback done) {
-  auto c_mgr = new CancellationManager;
-  f_opts.cancellation_manager = c_mgr;
-  FunctionLibraryRuntime::DoneCallback wrapped_done = std::bind(
-      [c_mgr](FunctionLibraryRuntime::DoneCallback done,
-              // Begin unbound arguments.
-              Status s) {
-        delete c_mgr;
-        done(s);
-      },
-      std::move(done), std::placeholders::_1);
-  RunHelper(std::move(f_opts), args, rets, std::move(wrapped_done));
-}
-
-void CapturedFunction::RunHelper(FunctionLibraryRuntime::Options f_opts,
-                                 gtl::ArraySlice<Tensor> args,
-                                 std::vector<Tensor>* rets,
-                                 FunctionLibraryRuntime::DoneCallback done) {
-  // TODO(mrry): Implement a synchronous version of
-  // FunctionLibraryRuntime::Run() that avoids a context switch for small
-  // functions.
-  if (captured_inputs_.empty()) {
-    lib_->Run(f_opts, f_handle_, args, rets, std::move(done));
-  } else {
-    std::vector<Tensor> args_with_captured;
-    args_with_captured.reserve(args.size() + captured_inputs_.size());
-    args_with_captured.insert(args_with_captured.end(), args.begin(),
-                              args.end());
-    args_with_captured.insert(args_with_captured.end(),
-                              captured_inputs_.begin(), captured_inputs_.end());
-    lib_->Run(f_opts, f_handle_, args_with_captured, rets, std::move(done));
-  }
-}
-
-CapturedFunction::CapturedFunction(
-    Device* device, std::unique_ptr<DeviceMgr> device_mgr,
-    std::unique_ptr<FunctionLibraryDefinition> flib_def,
-    std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
-    FunctionLibraryRuntime* lib, FunctionLibraryRuntime::Handle f_handle,
-    std::vector<Tensor> captured_inputs)
-    : device_(device),
-      device_mgr_(std::move(device_mgr)),
-      flib_def_(std::move(flib_def)),
-      pflr_(std::move(pflr)),
-      lib_(lib),
-      f_handle_(f_handle),
-      captured_inputs_(std::move(captured_inputs)) {}
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/captured_function.h b/tensorflow/core/kernels/captured_function.h
index 9430127600a26df6cafd14022aa271e9e18ed78a..2d2d87134e786139386509c6e5f353bb88882915 100644
--- a/tensorflow/core/kernels/captured_function.h
+++ b/tensorflow/core/kernels/captured_function.h
@@ -12,99 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_KERNELS_CAPTURED_FUNCTION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_KERNELS_CAPTURED_FUNCTION_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CAPTURED_FUNCTION_H_
+#define TENSORFLOW_CORE_KERNELS_CAPTURED_FUNCTION_H_
 
-#include <memory>
-#include <vector>
+#include "tensorflow/core/kernels/data/captured_function.h"
 
-#include "tensorflow/core/common_runtime/function.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/random/random.h"
-#include "tensorflow/core/platform/macros.h"
-
-namespace tensorflow {
-
-class Device;
-class OpKernelContext;
-class ResourceMgr;
-
-// A `CapturedFunction` encapsulates a TensorFlow function and all of
-// the runtime support required to execute it.
-//
-// The `Dataset`-related classes use `CapturedFunction` to execute
-// TensorFlow functions outside a the normal `OpKernel::Compute()`
-// context.
-//
-// NOTE(mrry): Here we are taking a conservative approach to dealing with
-// ownership of the various framework and runtime objects that are needed
-// to execute functions. We copy the function library *definition* (i.e.
-// a set of FunctionDefs) out of this kernel's context's function library
-// *runtime*, then we use that together with a specially-created
-// ThreadPoolDevice to build a new FunctionLibraryRuntime for the Dataset.
-//
-// We need to do this (or refactor the ownership of framework components
-// in each of the session implementations) to make it possible to close
-// down a ParallelMapDataset::Iterator when its session is closed.
-//
-// TODO(mrry): Clean this up. Investigate whether it would be possible to
-// reuse the session's FunctionLibraryRuntime(s) or Device(s).
-class CapturedFunction {
- public:
-  // NOTE(mrry): The `captured_inputs` are passed by value. For
-  // efficiency, you are recommended to move this argument into the call.
-  static Status Create(OpKernelContext* ctx, const NameAttrList& func,
-                       int graph_def_version,
-                       std::vector<Tensor> captured_inputs,
-                       std::unique_ptr<CapturedFunction>* out_function);
-
-  Status Run(FunctionLibraryRuntime::Options f_opts,
-             gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets);
-
-  void RunAsync(FunctionLibraryRuntime::Options f_opts,
-                gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets,
-                FunctionLibraryRuntime::DoneCallback done);
-
-  const Device* device() const { return device_; }
-
-  ResourceMgr* resource_manager() const { return device_->resource_manager(); }
-
-  const std::vector<Tensor>& captured_inputs() { return captured_inputs_; }
-
-  static int64 generate_step_id() {
-    // Choose a step ID that is guaranteed not to clash with any
-    // Session-generated step ID. DirectSession only generates
-    // non-negative step IDs (contiguous, starting from 0), and
-    // MasterSession generates 56-bit random step IDs whose MSB is
-    // always 0, so a negative random step ID should suffice.
-    return -std::abs(static_cast<int64>(random::New64()));
-  }
-
- private:
-  CapturedFunction(Device* device, std::unique_ptr<DeviceMgr> device_mgr,
-                   std::unique_ptr<FunctionLibraryDefinition> flib_def,
-                   std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
-                   FunctionLibraryRuntime* lib,
-                   FunctionLibraryRuntime::Handle f_handle,
-                   std::vector<Tensor> captured_inputs);
-
-  void RunHelper(FunctionLibraryRuntime::Options f_opts,
-                 gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets,
-                 FunctionLibraryRuntime::DoneCallback done);
-
-  Device* const device_;  // owned by device_mgr_.
-  const std::unique_ptr<DeviceMgr> device_mgr_;
-  const std::unique_ptr<FunctionLibraryDefinition> flib_def_;
-  const std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
-  FunctionLibraryRuntime* const lib_;  // owned by pflr_.
-  const FunctionLibraryRuntime::Handle f_handle_;
-  const std::vector<Tensor> captured_inputs_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(CapturedFunction);
-};
-
-}  // namespace tensorflow
-
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_KERNELS_CAPTURED_FUNCTION_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CAPTURED_FUNCTION_H_
diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc
index f16abb2b79fe24bfbe2711de03c7dfd0847b3003..626db9131aee28be13391ff9c1c92bf9f2d35dd0 100644
--- a/tensorflow/core/kernels/cast_op.cc
+++ b/tensorflow/core/kernels/cast_op.cc
@@ -36,7 +36,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define CURRY_TYPES2(FN, arg0)   \
   FN(arg0, bool);                \
@@ -223,11 +223,11 @@ class SyclCastOp : public CastOpBase {
   }
 };
 
-#define REGISTER_CAST_SYCL(srctype, dsttype)                    \
-  REGISTER_KERNEL_BUILDER(Name("Cast")                          \
-                              .TypeConstraint<srctype>("SrcT")  \
-                              .TypeConstraint<dsttype>("DstT")  \
-                              .Device(DEVICE_SYCL),             \
+#define REGISTER_CAST_SYCL(srctype, dsttype)                   \
+  REGISTER_KERNEL_BUILDER(Name("Cast")                         \
+                              .TypeConstraint<srctype>("SrcT") \
+                              .TypeConstraint<dsttype>("DstT") \
+                              .Device(DEVICE_SYCL),            \
                           SyclCastOp)
 CURRY_TYPES2(REGISTER_CAST_SYCL, bool);
 CURRY_TYPES2(REGISTER_CAST_SYCL, int32);
@@ -237,7 +237,7 @@ CURRY_TYPES2(REGISTER_CAST_SYCL, double);
 
 #undef REGISTER_CAST_SYCL
 
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #undef CURRY_TYPES2
 
@@ -250,6 +250,5 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("_HostCast").Device(DEVICE_SYCL).HostMemory("x").HostMemory("y"),
     CpuCastOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // end namespace tensorflow
-
diff --git a/tensorflow/core/kernels/cast_op.h b/tensorflow/core/kernels/cast_op.h
index 8fedf2c271c2caf60a83fb1f4146dd94821c4643..fd4e75d26f02dc75e13c8781049c904587d10afd 100644
--- a/tensorflow/core/kernels/cast_op.h
+++ b/tensorflow/core/kernels/cast_op.h
@@ -131,7 +131,8 @@ struct scalar_cast_op<::tensorflow::bfloat16, float> {
     p[0] = a.value;
     p[1] = 0;
 #else
-    static_assert(::tensorflow::port::kLittleEndian, "Not a little endian system!");
+    static_assert(::tensorflow::port::kLittleEndian,
+                  "Not a little endian system!");
     p[0] = 0;
     p[1] = a.value;
 #endif
diff --git a/tensorflow/core/kernels/cast_op_impl.h b/tensorflow/core/kernels/cast_op_impl.h
index 6309e4a4dc6f3ae094e5a310ca237474afeeca14..3ae9f2ab4d9c102941927215441b4c02625387f0 100644
--- a/tensorflow/core/kernels/cast_op_impl.h
+++ b/tensorflow/core/kernels/cast_op_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CAST_OP_IMPL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CAST_OP_IMPL_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CAST_OP_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_CAST_OP_IMPL_H_
 
 #define EIGEN_USE_THREADS
 
@@ -41,25 +41,25 @@ struct CastFunctor<Eigen::SyclDevice, O, I> {
     o.device(d) = i.template cast<O>();
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 
-#define CURRY_TYPES3_NO_HALF(FN, arg0, arg1)   \
-  FN(arg0, arg1, bool);                        \
-  FN(arg0, arg1, uint8);                       \
-  FN(arg0, arg1, int8);                        \
-  FN(arg0, arg1, uint16);                      \
-  FN(arg0, arg1, int16);                       \
-  FN(arg0, arg1, int32);                       \
-  FN(arg0, arg1, int64);                       \
-  FN(arg0, arg1, float);                       \
-  FN(arg0, arg1, double);                      \
-  FN(arg0, arg1, std::complex<float>);         \
+#define CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \
+  FN(arg0, arg1, bool);                      \
+  FN(arg0, arg1, uint8);                     \
+  FN(arg0, arg1, int8);                      \
+  FN(arg0, arg1, uint16);                    \
+  FN(arg0, arg1, int16);                     \
+  FN(arg0, arg1, int32);                     \
+  FN(arg0, arg1, int64);                     \
+  FN(arg0, arg1, float);                     \
+  FN(arg0, arg1, double);                    \
+  FN(arg0, arg1, std::complex<float>);       \
   FN(arg0, arg1, std::complex<double>)
 
-#define CURRY_TYPES3(FN, arg0, arg1)           \
-  CURRY_TYPES3_NO_HALF(FN, arg0, arg1)         \
+#define CURRY_TYPES3(FN, arg0, arg1)   \
+  CURRY_TYPES3_NO_HALF(FN, arg0, arg1) \
   FN(arg0, arg1, Eigen::half);
 
 #define CAST_CASE(DEVICE, IN, OUT)                                         \
@@ -181,4 +181,4 @@ GetSyclCastFromDouble(DataType dst_dtype);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CAST_OP_IMPL_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CAST_OP_IMPL_H_
diff --git a/tensorflow/core/kernels/cast_op_test.cc b/tensorflow/core/kernels/cast_op_test.cc
index a106f287c1845a108f596b960b65a6392c35b071..057e209a71903ad24e2d4f757e4d2a3bc4357a76 100644
--- a/tensorflow/core/kernels/cast_op_test.cc
+++ b/tensorflow/core/kernels/cast_op_test.cc
@@ -107,10 +107,10 @@ static void BM_gpu_float_int64(int iters, int num) {
   testing::UseRealTime();
 #if GOOGLE_CUDA
   test::Benchmark("gpu", Cast<float, int64>(num)).Run(iters);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
   test::Benchmark("sycl", Cast<float, int64>(num)).Run(iters);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }
 BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20);
 
@@ -130,10 +130,10 @@ static void BM_gpu_bool_float(int iters, int num) {
   testing::UseRealTime();
 #if GOOGLE_CUDA
   test::Benchmark("gpu", Cast<bool, float>(num)).Run(iters);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
   test::Benchmark("sycl", Cast<bool, float>(num)).Run(iters);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }
 BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20);
 
@@ -180,7 +180,7 @@ static void BM_gpu_float_half(int iters, int num) {
   testing::UseRealTime();
 #if GOOGLE_CUDA
   test::Benchmark("gpu", Cast<float, Eigen::half>(num)).Run(iters);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 }
 BENCHMARK(BM_gpu_float_half)->Arg(64 << 10)->Arg(32 << 20);
 
@@ -191,7 +191,7 @@ static void BM_gpu_half_float(int iters, int num) {
   testing::UseRealTime();
 #if GOOGLE_CUDA
   test::Benchmark("gpu", Cast<Eigen::half, float>(num)).Run(iters);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 }
 BENCHMARK(BM_gpu_half_float)->Arg(64 << 10)->Arg(32 << 20);
 
diff --git a/tensorflow/core/kernels/colorspace_op.cc b/tensorflow/core/kernels/colorspace_op.cc
index ba100b32e7d8cfcd6a0138a09062910743d6d2eb..9cc2e67bbe1f6919d581def55eb4315f7b908ca3 100644
--- a/tensorflow/core/kernels/colorspace_op.cc
+++ b/tensorflow/core/kernels/colorspace_op.cc
@@ -107,14 +107,14 @@ class HSVToRGBOp : public OpKernel {
   }
 };
 
-#define REGISTER_CPU(T)                                       \
-  REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_CPU) \
-                              .TypeConstraint<T>("T"),        \
-                          RGBToHSVOp<CPUDevice, T>);          \
-  template class RGBToHSVOp<CPUDevice, T>;                    \
-  REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_CPU) \
-                              .TypeConstraint<T>("T"),        \
-                          HSVToRGBOp<CPUDevice, T>);          \
+#define REGISTER_CPU(T)                                           \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("RGBToHSV").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      RGBToHSVOp<CPUDevice, T>);                                  \
+  template class RGBToHSVOp<CPUDevice, T>;                        \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("HSVToRGB").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      HSVToRGBOp<CPUDevice, T>);                                  \
   template class HSVToRGBOp<CPUDevice, T>;
 TF_CALL_float(REGISTER_CPU);
 TF_CALL_double(REGISTER_CPU);
@@ -123,40 +123,39 @@ TF_CALL_double(REGISTER_CPU);
 // Forward declarations of the function specializations for GPU (to prevent
 // building the GPU versions here, they will be built compiling _gpu.cu.cc).
 namespace functor {
-#define DECLARE_GPU(T)                                        \
-  template <>                                                 \
-  void RGBToHSV<GPUDevice, T>::operator()(const GPUDevice& d, \
-      TTypes<T, 2>::ConstTensor input_data,                   \
-      TTypes<T, 1>::Tensor range,                             \
-      TTypes<T, 2>::Tensor output_data);                      \
-  extern template struct RGBToHSV<GPUDevice, T>;              \
-  template <>                                                 \
-  void HSVToRGB<GPUDevice, T>::operator()(const GPUDevice& d, \
-      TTypes<T, 2>::ConstTensor input_data,                   \
-      TTypes<T, 2>::Tensor output_data);                      \
+#define DECLARE_GPU(T)                                               \
+  template <>                                                        \
+  void RGBToHSV<GPUDevice, T>::operator()(                           \
+      const GPUDevice& d, TTypes<T, 2>::ConstTensor input_data,      \
+      TTypes<T, 1>::Tensor range, TTypes<T, 2>::Tensor output_data); \
+  extern template struct RGBToHSV<GPUDevice, T>;                     \
+  template <>                                                        \
+  void HSVToRGB<GPUDevice, T>::operator()(                           \
+      const GPUDevice& d, TTypes<T, 2>::ConstTensor input_data,      \
+      TTypes<T, 2>::Tensor output_data);                             \
   extern template struct HSVToRGB<GPUDevice, T>;
 TF_CALL_float(DECLARE_GPU);
 TF_CALL_double(DECLARE_GPU);
 }  // namespace functor
-#define REGISTER_GPU(T)                                       \
-  REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_GPU) \
-                              .TypeConstraint<T>("T"),        \
-                          RGBToHSVOp<GPUDevice, T>);          \
-  REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_GPU) \
-                              .TypeConstraint<T>("T"),        \
-                          HSVToRGBOp<GPUDevice, T>);
+#define REGISTER_GPU(T)                                           \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("RGBToHSV").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      RGBToHSVOp<GPUDevice, T>);                                  \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("HSVToRGB").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      HSVToRGBOp<GPUDevice, T>);
 TF_CALL_float(REGISTER_GPU);
 TF_CALL_double(REGISTER_GPU);
 #endif
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL(T)                                       \
-  REGISTER_KERNEL_BUILDER(Name("RGBToHSV").Device(DEVICE_SYCL) \
-                              .TypeConstraint<T>("T"),         \
-                          RGBToHSVOp<SYCLDevice, T>);          \
-  REGISTER_KERNEL_BUILDER(Name("HSVToRGB").Device(DEVICE_SYCL) \
-                              .TypeConstraint<T>("T"),         \
-                          HSVToRGBOp<SYCLDevice, T>);
+#define REGISTER_SYCL(T)                                           \
+  REGISTER_KERNEL_BUILDER(                                         \
+      Name("RGBToHSV").Device(DEVICE_SYCL).TypeConstraint<T>("T"), \
+      RGBToHSVOp<SYCLDevice, T>);                                  \
+  REGISTER_KERNEL_BUILDER(                                         \
+      Name("HSVToRGB").Device(DEVICE_SYCL).TypeConstraint<T>("T"), \
+      HSVToRGBOp<SYCLDevice, T>);
 TF_CALL_float(REGISTER_SYCL);
 TF_CALL_double(REGISTER_SYCL);
 #endif
diff --git a/tensorflow/core/kernels/colorspace_op.h b/tensorflow/core/kernels/colorspace_op.h
index c5721ef6dd067e9df0b1c23ac471667edee06fb3..90bfce14194bb04a3ebe8418fcc4d1beaab4fc2b 100644
--- a/tensorflow/core/kernels/colorspace_op.h
+++ b/tensorflow/core/kernels/colorspace_op.h
@@ -54,10 +54,9 @@ struct RGBToHSV {
     // TODO(wicke): all these assignments are only necessary because a combined
     // expression is larger than kernel parameter space. A custom kernel is
     // probably in order.
-    H.device(d) = (R == V).select(norm * (G - B),
-                                  (G == V).select(
-                                      norm * (B - R) + T(2) / T(6),
-                                      norm * (R - G) + T(4) / T(6)));
+    H.device(d) = (R == V).select(
+        norm * (G - B), (G == V).select(norm * (B - R) + T(2) / T(6),
+                                        norm * (R - G) + T(4) / T(6)));
     H.device(d) = (range > T(0)).select(H, H.constant(T(0)));
     H.device(d) = (H < T(0)).select(H + T(1), H);
   }
diff --git a/tensorflow/core/kernels/colorspace_op_gpu.cu.cc b/tensorflow/core/kernels/colorspace_op_gpu.cu.cc
index e19d0b14d5df5c125c3fb071ea6ae6580fba8c6a..61f9ba44c46f1cee87a72349f8e4ebdd6d2e750f 100644
--- a/tensorflow/core/kernels/colorspace_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/colorspace_op_gpu.cu.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/colorspace_op.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/colorspace_op.h"
 
 namespace tensorflow {
 
@@ -29,6 +29,6 @@ typedef Eigen::GpuDevice GPUDevice;
   template class functor::HSVToRGB<GPUDevice, T>;
 TF_CALL_float(INSTANTIATE_GPU);
 TF_CALL_double(INSTANTIATE_GPU);
-}
+}  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/colorspace_op_test.cc b/tensorflow/core/kernels/colorspace_op_test.cc
index 8c6fb732abf40c52c0a9e9a5c338de859c669838..bd82826770f192acd50ca4212a475881fe5c34fc 100644
--- a/tensorflow/core/kernels/colorspace_op_test.cc
+++ b/tensorflow/core/kernels/colorspace_op_test.cc
@@ -224,34 +224,34 @@ class HSVToRGBOpTest : public OpsTestBase {
   }
 };
 
-#define TEST_COLORSPACE(test, dt)                               \
-  TEST_F(test, CheckBlack) {                                    \
-    MakeOp(dt);                                                 \
-    CheckBlack(dt);                                             \
-  }                                                             \
-  TEST_F(test, CheckGray) {                                     \
-    MakeOp(dt);                                                 \
-    CheckGray(dt);                                              \
-  }                                                             \
-  TEST_F(test, CheckWhite) {                                    \
-    MakeOp(dt);                                                 \
-    CheckWhite(dt);                                             \
-  }                                                             \
-  TEST_F(test, CheckRedMax) {                                   \
-    MakeOp(dt);                                                 \
-    CheckRedMax(dt);                                            \
-  }                                                             \
-  TEST_F(test, CheckGreenMax) {                                 \
-    MakeOp(dt);                                                 \
-    CheckGreenMax(dt);                                          \
-  }                                                             \
-  TEST_F(test, CheckBlueMax) {                                  \
-    MakeOp(dt);                                                 \
-    CheckBlueMax(dt);                                           \
-  }                                                             \
-  TEST_F(test, CheckNegativeDifference) {                       \
-    MakeOp(dt);                                                 \
-    CheckNegativeDifference(dt);                                \
+#define TEST_COLORSPACE(test, dt)         \
+  TEST_F(test, CheckBlack) {              \
+    MakeOp(dt);                           \
+    CheckBlack(dt);                       \
+  }                                       \
+  TEST_F(test, CheckGray) {               \
+    MakeOp(dt);                           \
+    CheckGray(dt);                        \
+  }                                       \
+  TEST_F(test, CheckWhite) {              \
+    MakeOp(dt);                           \
+    CheckWhite(dt);                       \
+  }                                       \
+  TEST_F(test, CheckRedMax) {             \
+    MakeOp(dt);                           \
+    CheckRedMax(dt);                      \
+  }                                       \
+  TEST_F(test, CheckGreenMax) {           \
+    MakeOp(dt);                           \
+    CheckGreenMax(dt);                    \
+  }                                       \
+  TEST_F(test, CheckBlueMax) {            \
+    MakeOp(dt);                           \
+    CheckBlueMax(dt);                     \
+  }                                       \
+  TEST_F(test, CheckNegativeDifference) { \
+    MakeOp(dt);                           \
+    CheckNegativeDifference(dt);          \
   }
 
 typedef RGBToHSVOpTest<float> rgb_to_hsv_float;
diff --git a/tensorflow/core/kernels/compare_and_bitpack_op.cc b/tensorflow/core/kernels/compare_and_bitpack_op.cc
index 9f626a274a4d36b568cc6e25af2e572a35ae3694..224fe534e3392f29e4fab2caa640883d055cb341 100644
--- a/tensorflow/core/kernels/compare_and_bitpack_op.cc
+++ b/tensorflow/core/kernels/compare_and_bitpack_op.cc
@@ -110,7 +110,19 @@ struct ComputeShard<T,
       typename TTypes<bool>::ConstMatrix input,
       typename TTypes<uint8>::Matrix output, bool /*thresh*/, int64 start,
       int64 limit) {
-    // NOTE(ebrevdo): This assumes memory is little-endian.
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    for (int64 i = start; i < limit; ++i) {
+      uint8* out = output.data() + i;
+      const int64 block = *reinterpret_cast<const int64*>(input.data() + 8 * i);
+      *out = ((((block & (1LL << (7 * 8))) >> (7 * 8 - 7))) |
+              (((block & (1LL << (6 * 8))) >> (6 * 8 - 6))) |
+              (((block & (1LL << (5 * 8))) >> (5 * 8 - 5))) |
+              (((block & (1LL << (4 * 8))) >> (4 * 8 - 4))) |
+              (((block & (1LL << (3 * 8))) >> (3 * 8 - 3))) |
+              (((block & (1LL << (2 * 8))) >> (2 * 8 - 2))) |
+              (((block & (1LL << 8)) >> (1 * 8 - 1))) | (((block & (1LL)))));
+    }
+#else
     for (int64 i = start; i < limit; ++i) {
       uint8* out = output.data() + i;
       const int64 block = *reinterpret_cast<const int64*>(input.data() + 8 * i);
@@ -123,6 +135,7 @@ struct ComputeShard<T,
            (((block & (1LL << (2 * 8))) >> (2 * 8 - 5))) |
            (((block & (1LL << 8)) >> (1 * 8 - 6))) | (((block & (1LL)) << 7)));
     }
+#endif
   }
 };
 
diff --git a/tensorflow/core/kernels/compare_and_bitpack_op.h b/tensorflow/core/kernels/compare_and_bitpack_op.h
index 8e020249c106f28a8aada2cef6c31c6796b6d332..af8566c7ce200004bc6e0b5fe82afb239ad9cfad 100644
--- a/tensorflow/core/kernels/compare_and_bitpack_op.h
+++ b/tensorflow/core/kernels/compare_and_bitpack_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_COMPARE_AND_BITPACK_OP_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_COMPARE_AND_BITPACK_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_COMPARE_AND_BITPACK_OP_H_
+#define TENSORFLOW_CORE_KERNELS_COMPARE_AND_BITPACK_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -39,4 +39,4 @@ struct CompareAndBitpack {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_COMPARE_AND_BITPACK_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_COMPARE_AND_BITPACK_OP_H_
diff --git a/tensorflow/core/kernels/concat_lib.h b/tensorflow/core/kernels/concat_lib.h
index 14e6e1bc32455fd169cd71d74b42ab8f159738ab..16784c4770eb8626c11dc47104fea3af6c5edc07 100644
--- a/tensorflow/core/kernels/concat_lib.h
+++ b/tensorflow/core/kernels/concat_lib.h
@@ -23,12 +23,29 @@ limitations under the License.
 
 namespace tensorflow {
 
+// Functors to concatenate tensors. These always take a rank-2 tensor (i.e a
+// matrix) and concatenate it along the axis 1 ("putting them next to each
+// other" as opposed to "putting them on top of one another").
+//
+// Any concatenation of n-dimensional tensors across any axis can be reduced to
+// a concatenation of two-dimensional tensors across the axis 1 by first
+// partitioning the axes of the original tensors into those less than the axis
+// to be concatenated across and the rest. Then reshape the tensors into a
+// two-dimensional tensor by collapsing these two sets of axes and concatenate
+// the resulting matrices across the axis 1, finally reshaping the result to
+// have the proper shape.
+//
+// So, for example, when stacking N tensors, reshape each to have shape
+// {1, Numelements} and reshape the result matrix to have shape
+// {1, N * NumElements} before passing it to this functor.
+
 // Assumes all inputs are nonempty
 template <typename T>
-void ConcatCPU(DeviceBase* d,
-               const std::vector<
-                   std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
-               typename TTypes<T, 2>::Matrix* output);
+void ConcatCPU(
+    DeviceBase* d,
+    const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+        inputs,
+    typename TTypes<T, 2>::Matrix* output);
 #if GOOGLE_CUDA
 template <typename T>
 void ConcatGPU(
@@ -41,11 +58,12 @@ void ConcatGPU(
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
-void ConcatSYCL(const Eigen::SyclDevice& d,
-               const std::vector<
-                   std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
-               typename TTypes<T, 2>::Matrix* output);
-#endif // TENSORFLOW_USE_SYCL
+void ConcatSYCL(
+    const Eigen::SyclDevice& d,
+    const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+        inputs,
+    typename TTypes<T, 2>::Matrix* output);
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_CONCAT_LIB_H_
diff --git a/tensorflow/core/kernels/concat_lib_cpu.cc b/tensorflow/core/kernels/concat_lib_cpu.cc
index b0bec0c5dcd30f4a630cd927e6ea922105249676..547a7b40b9245d4b10c12830a0189b09c9dacc76 100644
--- a/tensorflow/core/kernels/concat_lib_cpu.cc
+++ b/tensorflow/core/kernels/concat_lib_cpu.cc
@@ -48,10 +48,11 @@ struct MemCpyCopier<ResourceHandle> {
 }  // namespace
 
 template <typename T>
-void ConcatCPU(DeviceBase* d,
-               const std::vector<
-                   std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
-               typename TTypes<T, 2>::Matrix* output) {
+void ConcatCPU(
+    DeviceBase* d,
+    const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+        inputs,
+    typename TTypes<T, 2>::Matrix* output) {
   if (std::is_same<T, string>::value) {
     // use a large cost here to force strings to be handled by separate threads
     ConcatCPUImpl<T>(d, inputs, 100000, MemCpyCopier<T>(), output);
@@ -72,34 +73,35 @@ REGISTER(qint8)
 REGISTER(quint16)
 REGISTER(qint16)
 REGISTER(qint32)
-REGISTER(bfloat16)
 
 #if defined(IS_MOBILE_PLATFORM) && !defined(SUPPORT_SELECTIVE_REGISTRATION) && \
     !defined(__ANDROID_TYPES_FULL__)
-// Primarily used for SavedModel support on mobile. Registering it here only if
-// __ANDROID_TYPES_FULL__ is not defined, as that already register strings
-REGISTER(string);
+    // Primarily used for SavedModel support on mobile. Registering it here only
+    // if __ANDROID_TYPES_FULL__ is not defined (which already registers string)
+    // to avoid duplicate registration.
+    REGISTER(string);
 #endif  // defined(IS_MOBILE_PLATFORM) &&
         // !defined(SUPPORT_SELECTIVE_REGISTRATION) &&
         // !defined(__ANDROID_TYPES_FULL__)
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
-void ConcatSYCL(const Eigen::SyclDevice& d,
-               const std::vector<
-                   std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>& inputs,
-               typename TTypes<T, 2>::Matrix* output) {
+void ConcatSYCL(
+    const Eigen::SyclDevice& d,
+    const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
+        inputs,
+    typename TTypes<T, 2>::Matrix* output) {
   ConcatSYCLImpl<T>(d, inputs, sizeof(T) /* cost_per_unit */, MemCpyCopier<T>(),
-                   output);
+                    output);
 }
-#define REGISTER_SYCL(T)                                                      \
- template void ConcatSYCL<T>(                                                 \
-     const Eigen::SyclDevice&,                                                \
-     const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&, \
-     typename TTypes<T, 2>::Matrix* output);
+#define REGISTER_SYCL(T)                                                       \
+  template void ConcatSYCL<T>(                                                 \
+      const Eigen::SyclDevice&,                                                \
+      const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&, \
+      typename TTypes<T, 2>::Matrix* output);
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL)
 
 #undef REGISTER_SYCL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_lib_cpu.h b/tensorflow/core/kernels/concat_lib_cpu.h
index 6a933efde4b6ababf35c83c94d233e4aa2552d84..720b5065377b49859fdecc2634d14fe308432fe3 100644
--- a/tensorflow/core/kernels/concat_lib_cpu.h
+++ b/tensorflow/core/kernels/concat_lib_cpu.h
@@ -15,9 +15,9 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "tensorflow/core/kernels/concat_lib.h"
 #include <vector>
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
@@ -73,7 +73,7 @@ void ConcatCPUImpl(
 
   // Sharded mode.
   auto work = [&row_size, &sizes, &inputs, &output, &copier, &num_inputs](
-      int64 start, int64 end) {
+                  int64 start, int64 end) {
     int64 skipped_rows = start / row_size;
     T* out = output->data() + skipped_rows * row_size;
     T* out_start = output->data() + start;
@@ -160,5 +160,5 @@ void ConcatSYCLImpl(
     }
   }
 }
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_lib_gpu.cc b/tensorflow/core/kernels/concat_lib_gpu.cc
index 319ead49efd709932bed20e1e76a73749b1c4f19..d8643c0b2fb2633f6b640b4f54dc2f8c92da654d 100644
--- a/tensorflow/core/kernels/concat_lib_gpu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu.cc
@@ -116,8 +116,8 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER);
 TF_CALL_complex64(REGISTER);
 TF_CALL_complex128(REGISTER);
 TF_CALL_int64(REGISTER);
-REGISTER(bfloat16);
-REGISTER(bool);
+TF_CALL_bfloat16(REGISTER);
+TF_CALL_bool(REGISTER);
 
 #undef REGISTER
 
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
index 8e480aa99524cd57bfe4dda2383d03bcd243b79f..7011550f7e161c9727b8d31eff0917964b09044e 100644
--- a/tensorflow/core/kernels/concat_op.cc
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -37,7 +37,7 @@ typedef Eigen::GpuDevice GPUDevice;
 #endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM };
 
@@ -71,8 +71,9 @@ class ConcatBaseOp : public OpKernel {
     const TensorShape& input_shape = values[0].shape();
 
     int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim;
-    OP_REQUIRES(c, (0 <= axis && axis < input_dims) ||
-                       (allow_legacy_scalars() && concat_dim == 0),
+    OP_REQUIRES(c,
+                (0 <= axis && axis < input_dims) ||
+                    (allow_legacy_scalars() && concat_dim == 0),
                 errors::InvalidArgument(
                     "ConcatOp : Expected concatenating dimensions in the range "
                     "[",
@@ -97,8 +98,8 @@ class ConcatBaseOp : public OpKernel {
           c, in.dims() == input_dims || (input_is_scalar && in_is_scalar),
           errors::InvalidArgument(
               "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
-              input_shape.DebugString(), " vs. shape[", i, "] = ",
-              in.shape().DebugString()));
+              input_shape.DebugString(), " vs. shape[", i,
+              "] = ", in.shape().DebugString()));
       for (int j = 0; j < input_dims; ++j) {
         if (j == axis) {
           continue;
@@ -107,8 +108,8 @@ class ConcatBaseOp : public OpKernel {
             c, in.dim_size(j) == input_shape.dim_size(j),
             errors::InvalidArgument(
                 "ConcatOp : Dimensions of inputs should match: shape[0] = ",
-                input_shape.DebugString(), " vs. shape[", i, "] = ",
-                in.shape().DebugString()));
+                input_shape.DebugString(), " vs. shape[", i,
+                "] = ", in.shape().DebugString()));
       }
       if (in.NumElements() > 0) {
         int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0;
@@ -142,7 +143,7 @@ class ConcatBaseOp : public OpKernel {
         ConcatSYCL<T>(c->eigen_sycl_device(), inputs_flat, &output_flat);
         return;
       }
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
       ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
     }
   }
@@ -172,7 +173,6 @@ REGISTER_CONCAT(qint8);
 REGISTER_CONCAT(quint16);
 REGISTER_CONCAT(qint16);
 REGISTER_CONCAT(qint32);
-REGISTER_CONCAT(bfloat16);
 
 #undef REGISTER_CONCAT
 
@@ -253,7 +253,7 @@ REGISTER_KERNEL_BUILDER(Name("ConcatV2")
                         ConcatV2Op<CPUDevice, int32>);
 
 #undef REGISTER_SYCL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class ConcatOffsetOp : public OpKernel {
  public:
@@ -348,5 +348,5 @@ REGISTER_KERNEL_BUILDER(Name("ConcatOffset")
                             .HostMemory("shape")
                             .HostMemory("offset"),
                         ConcatOffsetOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_op_test.cc b/tensorflow/core/kernels/concat_op_test.cc
index c5bded9dafcdaf7264649e00db2ea2766db8eea9..e3ba8ae9f691c8ec9be79952d7f97801552b2a56 100644
--- a/tensorflow/core/kernels/concat_op_test.cc
+++ b/tensorflow/core/kernels/concat_op_test.cc
@@ -157,7 +157,8 @@ BENCHMARK(BM_MemcpyAlternativeDim0)->Arg(1000)->Arg(100000)->Arg(1000000);
 BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000);
 
 typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>,
-                         Eigen::Unaligned> EigenMap;
+                         Eigen::Unaligned>
+    EigenMap;
 static void MemcpyManyAlternative1(int iters, int dim2) {
   testing::StopTiming();
 
diff --git a/tensorflow/core/kernels/conditional_accumulator_base.h b/tensorflow/core/kernels/conditional_accumulator_base.h
index 05ee855daee8a7ffe4730ec4a18c65a7bd91733a..c7c7c983691c6f5257622940d183d06304ee74f1 100644
--- a/tensorflow/core/kernels/conditional_accumulator_base.h
+++ b/tensorflow/core/kernels/conditional_accumulator_base.h
@@ -160,20 +160,22 @@ class ConditionalAccumulatorBase : public ResourceBase {
  * Modifications to convenience macros defined in core/framework/op_kernel.h.
  * The below macros return a boolean if the test fails, so that the calling
  * function can get an indication that a failure has occurred.
-*/
-#define OP_REQUIRES_BOOLEAN(CTX, EXP, STATUS) \
-  if (!TF_PREDICT_TRUE(EXP)) {                \
-    (CTX)->CtxFailure((STATUS));              \
-    return false;                             \
-  }
+ */
+#define OP_REQUIRES_BOOLEAN(CTX, EXP, STATUS)          \
+  do {                                                 \
+    if (!TF_PREDICT_TRUE(EXP)) {                       \
+      (CTX)->CtxFailure(__FILE__, __LINE__, (STATUS)); \
+      return false;                                    \
+    }                                                  \
+  } while (0)
 
-#define OP_REQUIRES_OK_BOOLEAN(CTX, STATUS) \
-  do {                                      \
-    ::tensorflow::Status _s(STATUS);        \
-    if (!TF_PREDICT_TRUE(_s.ok())) {        \
-      (CTX)->CtxFailureWithWarning(_s);     \
-      return false;                         \
-    }                                       \
+#define OP_REQUIRES_OK_BOOLEAN(CTX, STATUS)                 \
+  do {                                                      \
+    ::tensorflow::Status _s(STATUS);                        \
+    if (!TF_PREDICT_TRUE(_s.ok())) {                        \
+      (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \
+      return false;                                         \
+    }                                                       \
   } while (0)
 
 /*
diff --git a/tensorflow/core/kernels/conditional_accumulator_op.cc b/tensorflow/core/kernels/conditional_accumulator_op.cc
index fa37916eaba4106fe8067b739e77e7f91631b1e9..e13bf8a4c63ebe86fbf3fcf2fdd50f928298d01b 100644
--- a/tensorflow/core/kernels/conditional_accumulator_op.cc
+++ b/tensorflow/core/kernels/conditional_accumulator_op.cc
@@ -99,9 +99,10 @@ class AccumulatorTakeGradientOp
                       ConditionalAccumulatorBase* accumulator,
                       DoneCallback callback) override {
     // Check signature
-    OP_REQUIRES_OK_ASYNC(ctx, ctx->MatchSignature({DT_STRING_REF, DT_INT32},
-                                                  {accumulator->dtype()}),
-                         callback);
+    OP_REQUIRES_OK_ASYNC(
+        ctx,
+        ctx->MatchSignature({DT_STRING_REF, DT_INT32}, {accumulator->dtype()}),
+        callback);
   }
 
  private:
@@ -111,5 +112,4 @@ class AccumulatorTakeGradientOp
 REGISTER_KERNEL_BUILDER(Name("AccumulatorTakeGradient").Device(DEVICE_CPU),
                         AccumulatorTakeGradientOp);
 
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index 72132574a4ccee474734425233ff687e955022ef..4ab6fdbca1a3415937213d46fac3058097130f55 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/constant_op.h"
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor.pb.h"
@@ -41,8 +42,33 @@ limitations under the License.
 
 namespace tensorflow {
 
+namespace {
+
+std::unique_ptr<const NodeDef> StripTensorDataFromNodeDef(
+    OpKernelConstruction* ctx) {
+#ifndef __ANDROID__
+  DCHECK_EQ(NodeDef::descriptor()->field_count(), 5)
+      << "The NodeDef format has changed, and the attr-stripping code may need "
+      << "to be updated.";
+#endif
+  const NodeDef& original = ctx->def();
+  NodeDef* ret = new NodeDef;
+  ret->set_name(original.name());
+  ret->set_op(original.op());
+  ret->set_device(original.device());
+  // Strip the "value" attr from the returned NodeDef.
+  // NOTE(mrry): The present implementation of `OpKernel::OpKernel()` only uses
+  // attrs that affect the cardinality of list-typed inputs and outputs, so it
+  // is safe to drop other attrs from the NodeDef.
+  AddNodeAttr("dtype", ctx->output_type(0), ret);
+  return std::unique_ptr<const NodeDef>(ret);
+}
+
+}  // namespace
+
 ConstantOp::ConstantOp(OpKernelConstruction* ctx)
-    : OpKernel(ctx), tensor_(ctx->output_type(0)) {
+    : OpKernel(ctx, StripTensorDataFromNodeDef(ctx)),
+      tensor_(ctx->output_type(0)) {
   const TensorProto* proto = nullptr;
   OP_REQUIRES_OK(ctx, ctx->GetAttr("value", &proto));
   OP_REQUIRES_OK(ctx, ctx->device()->MakeTensorFromProto(
@@ -57,12 +83,7 @@ ConstantOp::ConstantOp(OpKernelConstruction* ctx)
 void ConstantOp::Compute(OpKernelContext* ctx) {
   ctx->set_output(0, tensor_);
   if (TF_PREDICT_FALSE(ctx->track_allocations())) {
-    AllocatorAttributes attr;
-    if (ctx->allocate_on_host(attr)) {
-      ctx->record_host_persistent_memory_allocation(tensor_.AllocatedBytes());
-    } else {
-      ctx->record_device_persistent_memory_allocation(tensor_.AllocatedBytes());
-    }
+    ctx->record_persistent_memory_allocation(tensor_.AllocatedBytes());
   }
 }
 
@@ -151,38 +172,24 @@ typedef Eigen::GpuDevice GPUDevice;
 typedef Eigen::SyclDevice SYCLDevice;
 #endif  // TENSORFLOW_USE_SYCL
 
-namespace functor {
-
-// Partial specialization of FillFunctor<Device=CPUDevice, T>.
-template <typename T>
-struct FillFunctor<CPUDevice, T> {
-  void operator()(const CPUDevice& d, typename TTypes<T>::Flat out,
-                  typename TTypes<T>::ConstScalar in) {
-    out.device(d) = out.constant(in());
-  }
-};
-
-}  // end namespace functor
-
-template <typename Device, typename T>
+template <typename Device, typename T, typename Index>
 class FillOp : public OpKernel {
  public:
   explicit FillOp(OpKernelConstruction* context) : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
     const Tensor& Tdims = context->input(0);
-    OP_REQUIRES(
-        context, IsLegacyVector(Tdims.shape()),
-        errors::InvalidArgument("dims must be a vector of int32, got shape ",
-                                Tdims.shape().DebugString()));
+    OP_REQUIRES(context, IsLegacyVector(Tdims.shape()),
+                errors::InvalidArgument("dims must be a vector, got shape ",
+                                        Tdims.shape().DebugString()));
     const Tensor& Tvalue = context->input(1);
     OP_REQUIRES(context, IsLegacyScalar(Tvalue.shape()),
                 errors::InvalidArgument("value must be a scalar, got shape ",
                                         Tvalue.shape().DebugString()));
-    auto dims = Tdims.flat<int32>();
+    auto dims = Tdims.flat<Index>();
     TensorShape shape;
     OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
-                                reinterpret_cast<const int32*>(dims.data()),
+                                reinterpret_cast<const Index*>(dims.data()),
                                 dims.size(), &shape));
     Tensor* out = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(0, shape, &out));
@@ -192,34 +199,19 @@ class FillOp : public OpKernel {
   }
 };
 
-#ifdef TENSORFLOW_USE_SYCL
-
-namespace functor {
-// Partial specialization of FillFunctor<Device=SYCLDevice, T>.
-template <typename T>
-struct FillFunctor<SYCLDevice, T> {
-  void operator()(const SYCLDevice& d, typename TTypes<T>::Flat out,
-                  typename TTypes<T>::ConstScalar in) {
-#if !defined(EIGEN_HAS_INDEX_LIST)
-    Eigen::array<int, 1> rank1{1};
-#else
-    Eigen::IndexList<Eigen::type2index<1> > rank1;
-#endif
-    const int size = out.dimension(0);
-    Eigen::array<int, 1> broadcast_dims{size};
-
-    To32Bit(out).device(d) = in.reshape(rank1).broadcast(broadcast_dims);
-  }
-};
-}  // namespace functor
-#endif  // TENSORFLOW_USE_SYCL
-
-#define REGISTER_KERNEL(D, TYPE)                         \
-  REGISTER_KERNEL_BUILDER(Name("Fill")                   \
-                              .Device(DEVICE_##D)        \
-                              .TypeConstraint<TYPE>("T") \
-                              .HostMemory("dims"),       \
-                          FillOp<D##Device, TYPE>);
+#define REGISTER_KERNEL(D, TYPE)                                   \
+  REGISTER_KERNEL_BUILDER(Name("Fill")                             \
+                              .Device(DEVICE_##D)                  \
+                              .TypeConstraint<TYPE>("T")           \
+                              .TypeConstraint<int32>("index_type") \
+                              .HostMemory("dims"),                 \
+                          FillOp<D##Device, TYPE, int32>);         \
+  REGISTER_KERNEL_BUILDER(Name("Fill")                             \
+                              .Device(DEVICE_##D)                  \
+                              .TypeConstraint<TYPE>("T")           \
+                              .TypeConstraint<int64>("index_type") \
+                              .HostMemory("dims"),                 \
+                          FillOp<D##Device, TYPE, int64>);
 
 #define REGISTER_CPU_KERNEL(TYPE) REGISTER_KERNEL(CPU, TYPE)
 TF_CALL_ALL_TYPES(REGISTER_CPU_KERNEL);
@@ -241,15 +233,17 @@ REGISTER_KERNEL(SYCL, int64);
 REGISTER_KERNEL_BUILDER(Name("Fill")
                             .Device(DEVICE_SYCL)
                             .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32>("index_type")
                             .HostMemory("dims")
                             .HostMemory("value")
                             .HostMemory("output"),
-                        FillOp<CPUDevice, int32>);
+                        FillOp<CPUDevice, int32, int32>);
 #undef REGISTER_KERNEL_SYCL
 #endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 REGISTER_KERNEL(GPU, Eigen::half);
+REGISTER_KERNEL(GPU, bfloat16);
 REGISTER_KERNEL(GPU, float);
 REGISTER_KERNEL(GPU, double);
 REGISTER_KERNEL(GPU, uint8);
@@ -266,10 +260,11 @@ REGISTER_KERNEL(GPU, bool);
 REGISTER_KERNEL_BUILDER(Name("Fill")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32>("index_type")
                             .HostMemory("dims")
                             .HostMemory("value")
                             .HostMemory("output"),
-                        FillOp<CPUDevice, int32>);
+                        FillOp<CPUDevice, int32, int32>);
 #endif
 
 #undef REGISTER_KERNEL
@@ -328,6 +323,7 @@ REGISTER_KERNEL_BUILDER(Name("ZerosLike")
 #if GOOGLE_CUDA
 REGISTER_KERNEL(bool, GPU);
 REGISTER_KERNEL(Eigen::half, GPU);
+REGISTER_KERNEL(bfloat16, GPU);
 REGISTER_KERNEL(float, GPU);
 REGISTER_KERNEL(double, GPU);
 REGISTER_KERNEL(complex64, GPU);
@@ -380,6 +376,7 @@ REGISTER_KERNEL_BUILDER(Name("OnesLike")
 #if GOOGLE_CUDA
 REGISTER_KERNEL(bool, GPU);
 REGISTER_KERNEL(Eigen::half, GPU);
+REGISTER_KERNEL(bfloat16, GPU);
 REGISTER_KERNEL(float, GPU);
 REGISTER_KERNEL(double, GPU);
 REGISTER_KERNEL(complex64, GPU);
diff --git a/tensorflow/core/kernels/constant_op_gpu.cu.cc b/tensorflow/core/kernels/constant_op_gpu.cu.cc
index d1a1e34ec365da444a8465b34dd67f8865d29f5e..3487606778eabde386335f8450d627b7bf74ad42 100644
--- a/tensorflow/core/kernels/constant_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/constant_op_gpu.cu.cc
@@ -77,7 +77,7 @@ struct FillFunctor<GPUDevice, T> {
 
 #define DEFINE_FILL_GPU(T) template struct FillFunctor<GPUDevice, T>;
 TF_CALL_REAL_NUMBER_TYPES(DEFINE_FILL_GPU);
-DEFINE_FILL_GPU(bool);
+TF_CALL_bool(DEFINE_FILL_GPU);
 #undef DEFINE_FILL_GPU
 
 // Partial specialization of FillFunctor<Device=GPUDevice, T>.
@@ -88,15 +88,9 @@ struct SetZeroFunctor<GPUDevice, T> {
   }
 };
 
-#define DEFINE_SETZERO_GPU(T) template struct SetZeroFunctor<GPUDevice, T>
-DEFINE_SETZERO_GPU(bool);
-DEFINE_SETZERO_GPU(Eigen::half);
-DEFINE_SETZERO_GPU(float);
-DEFINE_SETZERO_GPU(double);
-DEFINE_SETZERO_GPU(complex64);
-DEFINE_SETZERO_GPU(complex128);
-DEFINE_SETZERO_GPU(int32);
-DEFINE_SETZERO_GPU(int64);
+#define DEFINE_SETZERO_GPU(T) template struct SetZeroFunctor<GPUDevice, T>;
+TF_CALL_NUMBER_TYPES(DEFINE_SETZERO_GPU);
+TF_CALL_bool(DEFINE_SETZERO_GPU);
 #undef DEFINE_SETZERO_GPU
 
 // Partial specialization of FillFunctor<Device=GPUDevice, T>.
@@ -107,15 +101,9 @@ struct SetOneFunctor<GPUDevice, T> {
   }
 };
 
-#define DEFINE_SETONE_GPU(T) template struct SetOneFunctor<GPUDevice, T>
-DEFINE_SETONE_GPU(bool);
-DEFINE_SETONE_GPU(Eigen::half);
-DEFINE_SETONE_GPU(float);
-DEFINE_SETONE_GPU(double);
-DEFINE_SETONE_GPU(complex64);
-DEFINE_SETONE_GPU(complex128);
-DEFINE_SETONE_GPU(int32);
-DEFINE_SETONE_GPU(int64);
+#define DEFINE_SETONE_GPU(T) template struct SetOneFunctor<GPUDevice, T>;
+TF_CALL_NUMBER_TYPES(DEFINE_SETONE_GPU);
+TF_CALL_bool(DEFINE_SETONE_GPU);
 #undef DEFINE_SETONE_GPU
 
 }  // end namespace functor
diff --git a/tensorflow/core/kernels/constant_op_test.cc b/tensorflow/core/kernels/constant_op_test.cc
index 62cc67c7360ad55d9f2c487e8e6d3ae37d9a47ed..a6baae73d876d511f1e8d81792fe4cecea160bfd 100644
--- a/tensorflow/core/kernels/constant_op_test.cc
+++ b/tensorflow/core/kernels/constant_op_test.cc
@@ -72,12 +72,12 @@ void ConstantOpTest::PersistentMemoryTrackingTest(bool on_gpu) {
   TF_EXPECT_OK(ctx.status());
 
   if (on_gpu) {
-    EXPECT_EQ(ctx.device_persistent_memory_allocated(), 512);
+    EXPECT_EQ(ctx.persistent_memory_allocated(), 512);
   } else {
-    EXPECT_EQ(ctx.host_persistent_memory_allocated(), 480);
+    EXPECT_EQ(ctx.persistent_memory_allocated(), 480);
   }
 
-  // Remove memry leak errors.
+  // Remove memory leak errors.
   for (auto allocator_pair : ctx.wrapped_allocators()) {
     allocator_pair.second->GetRecordsAndUnRef();
   }
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
index 8fe82d118a702ec6809d6f4f4385fa3dc0949037..7d5d54e5bece7d448e7c11c6061109e9e8554008 100644
--- a/tensorflow/core/kernels/control_flow_ops.cc
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -113,47 +113,47 @@ REGISTER_GPU_HOST_REF_KERNEL(string);
 #undef REGISTER_GPU_HOST_REF_KERNEL
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_SWITCH(type)                       \
-  REGISTER_KERNEL_BUILDER(Name("Switch")                 \
-                              .Device(DEVICE_SYCL)       \
-                              .HostMemory("pred")        \
-                              .TypeConstraint<type>("T"),\
+#define REGISTER_SYCL_SWITCH(type)                        \
+  REGISTER_KERNEL_BUILDER(Name("Switch")                  \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("pred")         \
+                              .TypeConstraint<type>("T"), \
                           SwitchOp)
 TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_SWITCH);
 
-#define REGISTER_SYCL_REF_SWITCH(type)                     \
-  REGISTER_KERNEL_BUILDER(Name("RefSwitch")                \
-                              .Device(DEVICE_SYCL)         \
-                              .HostMemory("pred")          \
-                              .TypeConstraint<type>("T"),  \
+#define REGISTER_SYCL_REF_SWITCH(type)                    \
+  REGISTER_KERNEL_BUILDER(Name("RefSwitch")               \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("pred")         \
+                              .TypeConstraint<type>("T"), \
                           SwitchOp)
 TF_CALL_REAL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_SWITCH);
 
 #undef REGISTER_SYCL_SWITCH
 #undef REGISTER_SYCL_REF_SWITCH
 
-#define REGISTER_SYCL_HOST_KERNEL(type)                  \
-  REGISTER_KERNEL_BUILDER(Name("Switch")                 \
-                              .Device(DEVICE_SYCL)       \
-                              .HostMemory("data")        \
-                              .HostMemory("pred")        \
-                              .HostMemory("output_false")\
-                              .HostMemory("output_true") \
-                              .TypeConstraint<type>("T"),\
+#define REGISTER_SYCL_HOST_KERNEL(type)                   \
+  REGISTER_KERNEL_BUILDER(Name("Switch")                  \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("data")         \
+                              .HostMemory("pred")         \
+                              .HostMemory("output_false") \
+                              .HostMemory("output_true")  \
+                              .TypeConstraint<type>("T"), \
                           SwitchOp)
 
 REGISTER_SYCL_HOST_KERNEL(bool);
 REGISTER_SYCL_HOST_KERNEL(string);
 REGISTER_SYCL_HOST_KERNEL(int32);
 
-#define REGISTER_SYCL_HOST_REF_KERNEL(type)                \
-  REGISTER_KERNEL_BUILDER(Name("RefSwitch")                \
-                              .Device(DEVICE_SYCL)         \
-                              .HostMemory("data")          \
-                              .HostMemory("pred")          \
-                              .HostMemory("output_false")  \
-                              .HostMemory("output_true")   \
-                              .TypeConstraint<type>("T"),  \
+#define REGISTER_SYCL_HOST_REF_KERNEL(type)               \
+  REGISTER_KERNEL_BUILDER(Name("RefSwitch")               \
+                              .Device(DEVICE_SYCL)        \
+                              .HostMemory("data")         \
+                              .HostMemory("pred")         \
+                              .HostMemory("output_false") \
+                              .HostMemory("output_true")  \
+                              .TypeConstraint<type>("T"), \
                           SwitchOp)
 
 REGISTER_SYCL_HOST_REF_KERNEL(int32);
@@ -162,7 +162,7 @@ REGISTER_SYCL_HOST_REF_KERNEL(string);
 
 #undef REGISTER_SYCL_HOST_KERNEL
 #undef REGISTER_SYCL_HOST_REF_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class RefSelectOp : public OpKernel {
  public:
@@ -282,7 +282,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
 
 #undef REGISTER_SYCL_KERNEL
 #undef REGISTER_SYCL_REF_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Special GPU kernels for int32 and string.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
@@ -331,7 +331,7 @@ REGISTER_SYCL_HOST_KERNEL(string);
 REGISTER_SYCL_HOST_KERNEL(ResourceHandle);
 
 #undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 void EnterOp::Compute(OpKernelContext* context) {
   if (IsRefType(context->input_dtype(0))) {
@@ -360,14 +360,14 @@ REGISTER_GPU_REF_KERNEL(bool);
 #undef REGISTER_GPU_REF_KERNEL
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)  \
-  REGISTER_KERNEL_BUILDER(          \
+#define REGISTER_SYCL_KERNEL(type) \
+  REGISTER_KERNEL_BUILDER(         \
       Name("Enter").Device(DEVICE_SYCL).TypeConstraint<type>("T"), EnterOp)
 REGISTER_SYCL_KERNEL(bool);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
 
-#define REGISTER_SYCL_REF_KERNEL(type)  \
-  REGISTER_KERNEL_BUILDER(              \
+#define REGISTER_SYCL_REF_KERNEL(type) \
+  REGISTER_KERNEL_BUILDER(             \
       Name("RefEnter").Device(DEVICE_SYCL).TypeConstraint<type>("T"), EnterOp)
 REGISTER_SYCL_REF_KERNEL(bool);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_REF_KERNEL);
@@ -398,7 +398,7 @@ REGISTER_SYCL_HOST_KERNEL(ResourceHandle);
 
 #undef REGISTER_SYCL_HOST_KERNEL
 #undef REGISTER_SYCL_HOST_REF_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Special GPU kernels for int32 and string.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
@@ -455,10 +455,10 @@ REGISTER_GPU_REF_KERNEL(bool);
 #undef REGISTER_GPU_REF_KERNEL
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)                                           \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("Exit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp);   \
-  REGISTER_KERNEL_BUILDER(                                                   \
+#define REGISTER_SYCL_KERNEL(type)                                         \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("Exit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp); \
+  REGISTER_KERNEL_BUILDER(                                                 \
       Name("RefExit").Device(DEVICE_SYCL).TypeConstraint<type>("T"), ExitOp);
 REGISTER_SYCL_KERNEL(bool);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
@@ -483,7 +483,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
 REGISTER_SYCL_HOST_KERNEL(int32);
 REGISTER_SYCL_HOST_KERNEL(string);
 #undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Special GPU kernels for int32 and string.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
@@ -556,12 +556,12 @@ REGISTER_GPU_HOST_KERNEL(string);
 #undef REGISTER_GPU_HOST_KERNEL
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)                                           \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"),   \
-      NextIterationOp);                                                      \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("RefNextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
+#define REGISTER_SYCL_KERNEL(type)                                            \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("NextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"),    \
+      NextIterationOp);                                                       \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("RefNextIteration").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
       NextIterationOp)
 REGISTER_SYCL_KERNEL(bool);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
@@ -585,7 +585,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
 REGISTER_SYCL_HOST_KERNEL(int32);
 REGISTER_SYCL_HOST_KERNEL(string);
 #undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // A LoopCond op has one input and one output. The input is a boolean
 // scalar representing the taken branches of the "pivot" Switch that
@@ -619,7 +619,7 @@ REGISTER_KERNEL_BUILDER(Name("LoopCond")
                             .HostMemory("input")
                             .HostMemory("output"),
                         LoopCondOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // ControlTrigger kernels
 REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_CPU),
@@ -631,7 +631,7 @@ REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_GPU),
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE_SYCL),
                         ControlTriggerOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // When called, abort op will abort the current process. This can be used to
 // abort remote PSs when needed.
diff --git a/tensorflow/core/kernels/control_flow_ops_test.cc b/tensorflow/core/kernels/control_flow_ops_test.cc
index affa0e8ca6b9d053702f8b203321d6ee2954878e..a2f7bd406929ec516d67dfc76767532cf2bac28c 100644
--- a/tensorflow/core/kernels/control_flow_ops_test.cc
+++ b/tensorflow/core/kernels/control_flow_ops_test.cc
@@ -91,6 +91,7 @@ class KilledBySignal {
  public:
   explicit KilledBySignal(int signum) : signum_(signum) {}
   bool operator()(int exit_status) const { return exit_status == signum_; }
+
  private:
   const int signum_;
 };
diff --git a/tensorflow/core/kernels/conv_2d.h b/tensorflow/core/kernels/conv_2d.h
index f78a162a8efbd7aeae16d59665afda50d2868b40..2142207b0d89a4b2f02c7f7b5d320c3b4b48462c 100644
--- a/tensorflow/core/kernels/conv_2d.h
+++ b/tensorflow/core/kernels/conv_2d.h
@@ -91,27 +91,25 @@ struct SpatialConvolutionBackwardInput {
   void operator()(const Device& d, typename TTypes<T, 4>::Tensor input_backward,
                   typename TTypes<T, 4>::ConstTensor kernel,
                   typename TTypes<T, 4>::ConstTensor output_backward,
-                  int input_rows, int input_cols, int row_stride,
-                  int col_stride) {
+                  int row_stride, int col_stride) {
     // Need to swap row/col when calling Eigen.
     input_backward.device(d) = Eigen::SpatialConvolutionBackwardInput(
-        kernel, output_backward, input_cols, input_rows, col_stride,
-        row_stride);
+        kernel, output_backward, input_backward.dimension(2),
+        input_backward.dimension(1), col_stride, row_stride);
   }
 };
 
 template <typename Device, typename T>
-struct SpatialConvolutionBackwardKernel {
+struct SpatialConvolutionBackwardFilter {
   void operator()(const Device& d,
                   typename TTypes<T, 4>::Tensor kernel_backward,
                   typename TTypes<T, 4>::ConstTensor input,
                   typename TTypes<T, 4>::ConstTensor output_backward,
-                  int kernel_rows, int kernel_cols, int row_stride,
-                  int col_stride) {
+                  int row_stride, int col_stride) {
     // Need to swap row/col when calling Eigen.
     kernel_backward.device(d) = Eigen::SpatialConvolutionBackwardKernel(
-        input, output_backward, kernel_cols, kernel_rows, col_stride,
-        row_stride);
+        input, output_backward, kernel_backward.dimension(1),
+        kernel_backward.dimension(0), col_stride, row_stride);
   }
 };
 
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index 3d2bb57aff6b7c4a1de2f9221aea4b384fea45c3..512bcc6c01bf3eb4aed92f90eebb060abda8a7fc 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
 #include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/fill_functor.h"
 #ifdef TENSORFLOW_USE_LIBXSMM
 #include "tensorflow/core/kernels/xsmm_conv2d.h"
 #endif
@@ -92,16 +93,15 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
 template <typename T>
-struct LaunchConv2DBackpropInputOp<CPUDevice, T> {
+struct LaunchConv2DBackpropFilterOp<CPUDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& out_backprop, const Tensor& input,
                   int row_stride, int col_stride, const Padding& padding,
                   Tensor* filter_backprop, TensorFormat data_format) {
     const CPUDevice& d = ctx->eigen_device<CPUDevice>();
-    functor::SpatialConvolutionBackwardInput<CPUDevice, T>()(
+    functor::SpatialConvolutionBackwardFilter<CPUDevice, T>()(
         d, filter_backprop->tensor<T, 4>(), input.tensor<T, 4>(),
-        out_backprop.tensor<T, 4>(), filter_backprop->dim_size(0),
-        filter_backprop->dim_size(1), row_stride, col_stride);
+        out_backprop.tensor<T, 4>(), row_stride, col_stride);
   }
 };
 
@@ -194,7 +194,23 @@ class Conv2DFastBackpropFilterOp : public OpKernel {
         context, (strides_[0] == 1 && strides_[3] == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(context, strides_[1] > 0 && strides_[2] > 0,
+                errors::InvalidArgument(
+                    "Row and column strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, (dilations_[0] == 1 && dilations_[3] == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    // TODO(yangzihao): Add a CPU implementation for dilated convolution.
+    OP_REQUIRES(context, (dilations_[1] == 1 && dilations_[2] == 1),
+                errors::InvalidArgument(
+                    "Current Eigen and libxsmm implementations do not "
+                    "yet support dilation rates larger than 1."));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -256,12 +272,13 @@ class Conv2DFastBackpropFilterOp : public OpKernel {
     }
 #endif
 
-    LaunchConv2DBackpropInputOp<Device, T>()(
+    LaunchConv2DBackpropFilterOp<Device, T>()(
         context, false, false, out_backprop, input, dims.spatial_dims[0].stride,
         dims.spatial_dims[1].stride, padding_, filter_backprop, data_format_);
   }
 
  private:
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_;
@@ -290,7 +307,23 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
         context, (strides_[0] == 1 && strides_[3] == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(context, strides_[1] > 0 && strides_[2] > 0,
+                errors::InvalidArgument(
+                    "Row and column strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, (dilations_[0] == 1 && dilations_[3] == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    // TODO(yangzihao): Add a CPU implementation for dilated convolution.
+    OP_REQUIRES(context, (dilations_[1] == 1 && dilations_[2] == 1),
+                errors::InvalidArgument(
+                    "Current libxsmm and customized CPU implementations do "
+                    "not yet support dilation rates larger than 1."));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -459,6 +492,7 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_;
@@ -510,10 +544,30 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
     int stride_n = GetTensorDim(strides_, data_format_, 'N');
     int stride_c = GetTensorDim(strides_, data_format_, 'C');
+    int stride_h = GetTensorDim(strides_, data_format_, 'H');
+    int stride_w = GetTensorDim(strides_, data_format_, 'W');
     OP_REQUIRES(
         context, (stride_n == 1 && stride_c == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(context, stride_h > 0 && stride_w > 0,
+                errors::InvalidArgument(
+                    "Row and column strides should be larger than 0."));
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
+    int dilation_n = GetTensorDim(dilations_, data_format_, 'N');
+    int dilation_c = GetTensorDim(dilations_, data_format_, 'C');
+    int dilation_h = GetTensorDim(dilations_, data_format_, 'H');
+    int dilation_w = GetTensorDim(dilations_, data_format_, 'W');
+    OP_REQUIRES(context, dilation_n == 1 && dilation_c == 1,
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context, dilation_h > 0 && dilation_w > 0,
+        errors::InvalidArgument("Dilated rates should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
     use_cudnn_ &= CanUseCudnn();
     cudnn_use_autotune_ = CudnnUseAutotune();
@@ -541,18 +595,27 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
     if (filter_shape.num_elements() == 0) {
       return;
     }
+    // If input is empty, set gradients to zero.
+    if (input.shape().num_elements() == 0) {
+      functor::SetZeroFunctor<Device, T> f;
+      f(context->eigen_device<Device>(), filter_backprop->flat<T>());
+      return;
+    }
 
     // For now we take the stride from the second and third dimensions only (we
     // do not support striding on the batch or depth dimension).
     const int stride_rows = GetTensorDim(strides_, data_format_, 'H');
     const int stride_cols = GetTensorDim(strides_, data_format_, 'W');
+    const int dilation_rows = GetTensorDim(dilations_, data_format_, 'H');
+    const int dilation_cols = GetTensorDim(dilations_, data_format_, 'W');
 
     launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop, input,
-              stride_rows, stride_cols, padding_, filter_backprop,
-              data_format_);
+              dilation_rows, dilation_cols, stride_rows, stride_cols, padding_,
+              filter_backprop, data_format_);
   }
 
  private:
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   bool use_cudnn_;
@@ -566,38 +629,46 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
 template <typename T>
 void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
     OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
-    const Tensor& out_backprop, const Tensor& input, int row_stride,
-    int col_stride, const Padding& padding, Tensor* filter_backprop,
-    TensorFormat data_format) {
+    const Tensor& out_backprop, const Tensor& input, int row_dilation,
+    int col_dilation, int row_stride, int col_stride, const Padding& padding,
+    Tensor* filter_backprop, TensorFormat data_format) {
   using perftools::gputools::dnn::AlgorithmConfig;
   using perftools::gputools::dnn::AlgorithmDesc;
   using perftools::gputools::dnn::ProfileResult;
 
+  std::vector<int32> dilations(4, 1);
+  dilations[GetTensorDimIndex(data_format, 'H')] = row_dilation;
+  dilations[GetTensorDimIndex(data_format, 'W')] = col_dilation;
+
   std::vector<int32> strides(4, 1);
   strides[GetTensorDimIndex(data_format, 'H')] = row_stride;
   strides[GetTensorDimIndex(data_format, 'W')] = col_stride;
   TensorShape filter_shape = filter_backprop->shape();
 
   ConvBackpropDimensions dims;
-  OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensions(
+  OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensionsV2(
                           "Conv2DSlowBackpropFilter", /*num_spatial_dims=*/2,
                           input.shape(), filter_shape, out_backprop.shape(),
-                          strides, padding, data_format, &dims));
+                          dilations, strides, padding, data_format, &dims));
 
+  // TODO(yangzihao): The padding computations should be done in
+  // GetWindowedOutputSize() functions.
   const int padding_rows =
       (padding == VALID)
           ? 0
           : std::max<int>(0, (dims.spatial_dims[0].output_size - 1) *
                                      dims.spatial_dims[0].stride +
-                                 dims.spatial_dims[0].filter_size -
-                                 dims.spatial_dims[0].input_size);
+                                 (dims.spatial_dims[0].filter_size - 1) *
+                                     dims.spatial_dims[0].dilation +
+                                 1 - dims.spatial_dims[0].input_size);
   const int padding_cols =
       (padding == VALID)
           ? 0
           : std::max<int>(0, (dims.spatial_dims[1].output_size - 1) *
                                      dims.spatial_dims[1].stride +
-                                 dims.spatial_dims[1].filter_size -
-                                 dims.spatial_dims[1].input_size);
+                                 (dims.spatial_dims[1].filter_size - 1) *
+                                     dims.spatial_dims[1].dilation +
+                                 1 - dims.spatial_dims[1].input_size);
 
   // TODO(zhengxq): cuDNN only supports equal padding on both sides, so only
   // calling it when that is true. Remove this check when (if?) cuDNN starts
@@ -730,7 +801,9 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
       .set_input_feature_map_count(dims.in_depth)
       .set_output_feature_map_count(dims.out_depth);
   perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
-  conv_desc.set_vertical_filter_stride(dims.spatial_dims[0].stride)
+  conv_desc.set_vertical_dilation_rate(dims.spatial_dims[0].dilation)
+      .set_horizontal_dilation_rate(dims.spatial_dims[1].dilation)
+      .set_vertical_filter_stride(dims.spatial_dims[0].stride)
       .set_horizontal_filter_stride(dims.spatial_dims[1].stride)
       .set_zero_padding_height(padding_rows / 2)
       .set_zero_padding_width(padding_cols / 2);
@@ -821,6 +894,8 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
       dims.out_depth,                        // out_depths
       {{dims.spatial_dims[0].filter_size,    // filter_rows
         dims.spatial_dims[1].filter_size}},  // filter_cols
+      {{dims.spatial_dims[0].dilation,       // dilation_rows
+        dims.spatial_dims[1].dilation}},     // dilation_cols
       {{dims.spatial_dims[0].stride,         // stride_rows
         dims.spatial_dims[1].stride}},       // stride_cols
       {{padding_rows,                        // padding_rows
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index d28f6b4d107647f8e2dc232dc5477cd7ee37f696..0356ff4c0f4240ec806d1e337546cfce6771d92f 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -106,8 +106,7 @@ struct LaunchConv2DBackpropInputOp<CPUDevice, T> {
     const CPUDevice& d = ctx->eigen_device<CPUDevice>();
     functor::SpatialConvolutionBackwardInput<CPUDevice, T>()(
         d, in_backprop->tensor<T, 4>(), filter.tensor<T, 4>(),
-        out_backprop.tensor<T, 4>(), in_backprop->dim_size(1),
-        in_backprop->dim_size(2), row_stride, col_stride);
+        out_backprop.tensor<T, 4>(), row_stride, col_stride);
   }
 };
 
@@ -198,7 +197,23 @@ class Conv2DFastBackpropInputOp : public OpKernel {
         context, (strides_[0] == 1 && strides_[3] == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(context, strides_[1] > 0 && strides_[2] > 0,
+                errors::InvalidArgument(
+                    "Row and column strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, (dilations_[0] && dilations_[3]),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    // TODO(yangzihao): Add a CPU implementation for dilated convolution.
+    OP_REQUIRES(context, (dilations_[1] == 1 && dilations_[2] == 1),
+                errors::InvalidArgument(
+                    "Current Eigen and libxsmm implementations do not "
+                    "yet support dilation rates larger than 1."));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -268,6 +283,7 @@ class Conv2DFastBackpropInputOp : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_;
@@ -296,7 +312,23 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
         context, (strides_[0] == 1 && strides_[3] == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(context, strides_[1] > 0 && strides_[2] > 0,
+                errors::InvalidArgument(
+                    "Row and column strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, (dilations_[0] == 1 && dilations_[3] == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    // TODO(yangzihao): Add a CPU implementation for dilated convolution.
+    OP_REQUIRES(context, (dilations_[1] == 1 && dilations_[2] == 1),
+                errors::InvalidArgument(
+                    "Current libxsmm and customized CPU implementations do "
+                    "not yet support dilation rates larger than 1."));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -532,6 +564,7 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_;
@@ -586,10 +619,30 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
                                         "specify 4 dimensions"));
     int stride_n = GetTensorDim(strides_, data_format_, 'N');
     int stride_c = GetTensorDim(strides_, data_format_, 'C');
+    int stride_h = GetTensorDim(strides_, data_format_, 'H');
+    int stride_w = GetTensorDim(strides_, data_format_, 'W');
     OP_REQUIRES(
         context, (stride_n == 1 && stride_c == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(context, stride_h > 0 && stride_w > 0,
+                errors::InvalidArgument(
+                    "Row and column strides should be larger than 0."));
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
+    int dilation_n = GetTensorDim(dilations_, data_format_, 'N');
+    int dilation_c = GetTensorDim(dilations_, data_format_, 'C');
+    int dilation_h = GetTensorDim(dilations_, data_format_, 'H');
+    int dilation_w = GetTensorDim(dilations_, data_format_, 'W');
+    OP_REQUIRES(context, (dilation_n == 1 && dilation_c == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context, dilation_h > 0 && dilation_w > 0,
+        errors::InvalidArgument("Dilated rates should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
     use_cudnn_ &= CanUseCudnn();
     cudnn_use_autotune_ = CudnnUseAutotune();
@@ -622,12 +675,16 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
     // do not support striding on the batch or depth dimension).
     const int stride_rows = GetTensorDim(strides_, data_format_, 'H');
     const int stride_cols = GetTensorDim(strides_, data_format_, 'W');
+    const int dilation_rows = GetTensorDim(dilations_, data_format_, 'H');
+    const int dilation_cols = GetTensorDim(dilations_, data_format_, 'W');
 
     launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop, filter,
-              stride_rows, stride_cols, padding_, in_backprop, data_format_);
+              dilation_rows, dilation_cols, stride_rows, stride_cols, padding_,
+              in_backprop, data_format_);
   }
 
  private:
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   bool use_cudnn_;
@@ -641,39 +698,48 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
 template <typename T>
 void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
-    const Tensor& out_backprop, const Tensor& filter, int row_stride,
-    int col_stride, const Padding& padding, Tensor* in_backprop,
-    TensorFormat data_format) {
+    const Tensor& out_backprop, const Tensor& filter, int row_dilation,
+    int col_dilation, int row_stride, int col_stride, const Padding& padding,
+    Tensor* in_backprop, TensorFormat data_format) {
   using perftools::gputools::dnn::AlgorithmConfig;
   using perftools::gputools::dnn::AlgorithmDesc;
   using perftools::gputools::dnn::ProfileResult;
 
   std::vector<int32> strides(4, 1);
-  strides[GetTensorDimIndex(data_format, 'H')] = row_stride;
-  strides[GetTensorDimIndex(data_format, 'W')] = col_stride;
+  std::vector<int32> dilations(4, 1);
+  auto input_h = GetTensorDimIndex(data_format, 'H');
+  auto input_w = GetTensorDimIndex(data_format, 'W');
+  strides[input_h] = row_stride;
+  strides[input_w] = col_stride;
+  dilations[input_h] = row_dilation;
+  dilations[input_w] = col_dilation;
   TensorShape input_shape = in_backprop->shape();
 
   const TensorShape& filter_shape = filter.shape();
   ConvBackpropDimensions dims;
-  OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensions(
+  OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensionsV2(
                           "Conv2DSlowBackpropInput", /*num_spatial_dims=*/2,
                           input_shape, filter_shape, out_backprop.shape(),
-                          strides, padding, data_format, &dims));
+                          dilations, strides, padding, data_format, &dims));
 
+  // TODO(yangzihao): The padding computations should be done in
+  // GetWindowedOutputSize() functions.
   const int padding_rows =
       (padding == VALID)
           ? 0
           : std::max<int>(0, (dims.spatial_dims[0].output_size - 1) *
                                      dims.spatial_dims[0].stride +
-                                 dims.spatial_dims[0].filter_size -
-                                 dims.spatial_dims[0].input_size);
+                                 (dims.spatial_dims[0].filter_size - 1) *
+                                     dims.spatial_dims[0].dilation +
+                                 1 - dims.spatial_dims[0].input_size);
   const int padding_cols =
       (padding == VALID)
           ? 0
           : std::max<int>(0, (dims.spatial_dims[1].output_size - 1) *
                                      dims.spatial_dims[1].stride +
-                                 dims.spatial_dims[1].filter_size -
-                                 dims.spatial_dims[1].input_size);
+                                 (dims.spatial_dims[1].filter_size - 1) *
+                                     dims.spatial_dims[1].dilation +
+                                 1 - dims.spatial_dims[1].input_size);
 
   // TODO(keveman): cuDNN only supports equal padding on both sides, so only
   // calling it when that is true. Remove this check when (if?) cuDNN starts
@@ -789,7 +855,9 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
       .set_input_feature_map_count(dims.in_depth)
       .set_output_feature_map_count(dims.out_depth);
   perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
-  conv_desc.set_vertical_filter_stride(dims.spatial_dims[0].stride)
+  conv_desc.set_vertical_dilation_rate(dims.spatial_dims[0].dilation)
+      .set_horizontal_dilation_rate(dims.spatial_dims[1].dilation)
+      .set_vertical_filter_stride(dims.spatial_dims[0].stride)
       .set_horizontal_filter_stride(dims.spatial_dims[1].stride)
       .set_zero_padding_height(padding_rows / 2)
       .set_zero_padding_width(padding_cols / 2);
@@ -875,6 +943,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
       dims.out_depth,                        // out_depths
       {{dims.spatial_dims[0].filter_size,    // filter_rows
         dims.spatial_dims[1].filter_size}},  // filter_cols
+      {{dims.spatial_dims[0].dilation,       // dilation_rows
+        dims.spatial_dims[1].dilation}},     // dilation_cols
       {{dims.spatial_dims[0].stride,         // stride_rows
         dims.spatial_dims[1].stride}},       // stride_cols
       {{padding_rows,                        // padding_rows
diff --git a/tensorflow/core/kernels/conv_grad_ops.h b/tensorflow/core/kernels/conv_grad_ops.h
index e068fb86848f93a4c826e1b19fc85790ab2500a4..535586d53ac916808a22a6ea55577b3be43321f9 100644
--- a/tensorflow/core/kernels/conv_grad_ops.h
+++ b/tensorflow/core/kernels/conv_grad_ops.h
@@ -175,15 +175,17 @@ template <typename Device, typename T>
 struct LaunchConv2DBackpropInputOp {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& out_backprop, const Tensor& filter,
-                  int row_stride, int col_stride, const Padding& padding,
-                  Tensor* in_backprop, TensorFormat data_format);
+                  int row_dilation, int col_dilation, int row_stride,
+                  int col_stride, const Padding& padding, Tensor* in_backprop,
+                  TensorFormat data_format);
 };
 
 template <typename Device, typename T>
 struct LaunchConv2DBackpropFilterOp {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& out_backprop, const Tensor& input,
-                  int row_stride, int col_stride, const Padding& padding,
+                  int row_dilation, int col_dilation, int row_stride,
+                  int col_stride, const Padding& padding,
                   Tensor* filter_backprop, TensorFormat data_format);
 };
 
@@ -191,8 +193,9 @@ struct LaunchConv2DBackpropFilterOp {
 template <typename T>
 struct LaunchConv2DBackpropInputOp<Eigen::GpuDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
-                  const Tensor& input, const Tensor& filter, int row_stride,
-                  int col_stride, const Padding& padding, Tensor* output,
+                  const Tensor& input, const Tensor& filter, int row_dilation,
+                  int col_dilation, int row_stride, int col_stride,
+                  const Padding& padding, Tensor* output,
                   TensorFormat data_format);
 };
 
@@ -200,7 +203,8 @@ template <typename T>
 struct LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& out_backprop, const Tensor& input,
-                  int row_stride, int col_stride, const Padding& padding,
+                  int row_dilation, int col_dilation, int row_stride,
+                  int col_stride, const Padding& padding,
                   Tensor* filter_backprop, TensorFormat data_format);
 };
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index c2d24d1f1208961af73901ddee432b98302090f3..3650ab53b2533e3c95a764ead2d1318c4006c9e7 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -645,6 +645,9 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
         {{input_size[0], input_size[1], input_size[2]}},
         out_depth,
         {{filter_size[0], filter_size[1], filter_size[2]}},
+        // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
+        // conv is supported.
+        /*dilation=*/{{1, 1, 1}},
         {{strides[0], strides[1], strides[2]}},
         {{padding_planes, padding_rows, padding_cols}},
         dtype,
@@ -1011,6 +1014,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
         {{input_size[0], input_size[1], input_size[2]}},
         out_depth,
         {{filter_size[0], filter_size[1], filter_size[2]}},
+        {{1, 1, 1}},
         {{strides[0], strides[1], strides[2]}},
         {{padding_planes, padding_rows, padding_cols}},
         dtype,
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index bb67113fb003ea58e2fb12ae6d79f02251cd3c3d..dbddaf3dc640dcf2cad8f6ba7dd00aaa33a30e0c 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -112,8 +112,9 @@ struct LaunchGeneric {
 template <typename T>
 struct LaunchConv2DOp<CPUDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
-                  const Tensor& input, const Tensor& filter, int row_stride,
-                  int col_stride, const Padding& padding, Tensor* output,
+                  const Tensor& input, const Tensor& filter, int row_dilation,
+                  int col_dilation, int row_stride, int col_stride,
+                  const Padding& padding, Tensor* output,
                   TensorFormat data_format) {
     if (data_format != FORMAT_NHWC) {
       ctx->SetStatus(
@@ -121,6 +122,13 @@ struct LaunchConv2DOp<CPUDevice, T> {
                                 "NHWC tensor format for now."));
       return;
     }
+    // TODO(yangzihao): Add the CPU implementation of dilated conv 2D.
+    if (row_dilation > 1 || col_dilation > 1) {
+      ctx->SetStatus(
+          errors::Unimplemented("Generic conv implementation only supports "
+                                "dilated rate of 1 for now."));
+      return;
+    }
     LaunchGeneric<CPUDevice, T>()(ctx, input, filter, row_stride, col_stride,
                                   padding, output, data_format);
   }
@@ -133,8 +141,10 @@ class LaunchDeepConvOp {
                   const Tensor& filter, int batch, int input_rows,
                   int input_cols, int in_depth, int filter_rows,
                   int filter_cols, int pad_rows, int pad_cols, int out_rows,
-                  int out_cols, int out_depth, int stride_rows, int stride_cols,
-                  Tensor* output, TensorFormat data_format) {
+                  int /*out_cols*/, int /*out_depth*/, int /*dilation_rows*/,
+                  int /*dilation_cols*/, int /*stride_rows*/,
+                  int /*stride_cols*/, Tensor* /*output*/,
+                  TensorFormat /*data_format*/) {
     return false;
   }
 };
@@ -147,9 +157,11 @@ class LaunchDeepConvOp<CPUDevice, float> {
                   const Tensor& filter, int batch, int input_rows,
                   int input_cols, int in_depth, int filter_rows,
                   int filter_cols, int pad_rows, int pad_cols, int out_rows,
-                  int out_cols, int out_depth, int stride_rows, int stride_cols,
+                  int out_cols, int out_depth, int dilation_rows,
+                  int dilation_cols, int stride_rows, int stride_cols,
                   Tensor* output, TensorFormat data_format) {
-    if (data_format != FORMAT_NHWC ||
+    if (data_format != FORMAT_NHWC || dilation_rows != 1 ||
+        dilation_cols != 1 ||
         !CanUseDeepConv2D(stride_rows, stride_cols, filter_rows, filter_cols,
                           in_depth, out_depth, out_rows, out_cols)) {
       return false;
@@ -187,7 +199,8 @@ class LaunchXsmmConvOp {
                   int input_cols, int in_depth, int filter_rows,
                   int filter_cols, int pad_rows, int pad_cols, int out_rows,
                   int out_cols, int out_depth, int stride_rows, int stride_cols,
-                  Tensor* output, TensorFormat data_format) {
+                  int dilation_rows, int dilation_cols, Tensor* output,
+                  TensorFormat data_format) {
     return false;
   }
 };
@@ -199,7 +212,8 @@ class LaunchXsmmConvOp<CPUDevice, float> {
                   const Tensor& filter, int batch, int input_rows,
                   int input_cols, int in_depth, int filter_rows,
                   int filter_cols, int pad_rows, int pad_cols, int out_rows,
-                  int out_cols, int out_depth, int stride_rows, int stride_cols,
+                  int out_cols, int out_depth, int dilation_rows,
+                  int dilation_cols, int stride_rows, int stride_cols,
                   Tensor* output, TensorFormat data_format) {
     auto num_threads =
         ctx->device()->tensorflow_cpu_worker_threads()->num_threads;
@@ -228,11 +242,8 @@ class LaunchXsmmConvOp<CPUDevice, float> {
     desc.options = LIBXSMM_DNN_CONV_OPTION_WU_EXT_FILTER_REDUCE_OVERWRITE;
     desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
 
-    if (!CanUseXsmmConv2D(desc, data_format)) {
-      return false;
-    }
-
-    if (!CanUseXsmmConv2D(desc, data_format)) {
+    if (dilation_rows != 1 || dilation_cols != 1 ||
+        !CanUseXsmmConv2D(desc, data_format)) {
       return false;
     }
 
@@ -251,6 +262,7 @@ template <typename Device, typename T>
 class Conv2DOp : public BinaryOp<T> {
  public:
   explicit Conv2DOp(OpKernelConstruction* context) : BinaryOp<T>(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
     string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
@@ -259,15 +271,35 @@ class Conv2DOp : public BinaryOp<T> {
     OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
     use_cudnn_ &= CanUseCudnn();
     cudnn_use_autotune_ = CudnnUseAutotune();
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
     OP_REQUIRES(context, strides_.size() == 4,
                 errors::InvalidArgument("Sliding window strides field must "
                                         "specify 4 dimensions"));
     const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
     const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');
+    const int64 stride_h = GetTensorDim(strides_, data_format_, 'H');
+    const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
     OP_REQUIRES(
         context, stride_n == 1 && stride_c == 1,
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(context, stride_h > 0 && stride_w > 0,
+                errors::InvalidArgument(
+                    "Row and column strides should be larger than 0."));
+
+    const int64 dilation_n = GetTensorDim(dilations_, data_format_, 'N');
+    const int64 dilation_c = GetTensorDim(dilations_, data_format_, 'C');
+    const int64 dilation_h = GetTensorDim(dilations_, data_format_, 'H');
+    const int64 dilation_w = GetTensorDim(dilations_, data_format_, 'W');
+    OP_REQUIRES(context, dilation_n == 1 && dilation_c == 1,
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context, dilation_h > 0 && dilation_w > 0,
+        errors::InvalidArgument("Dilated rates should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
   }
 
@@ -334,18 +366,22 @@ class Conv2DOp : public BinaryOp<T> {
                 errors::InvalidArgument("batch is too large"));
     const int batch = static_cast<int>(batch_raw);
 
-    // For now we take the stride from the second and third dimensions only (we
-    // do not support striding on the batch or depth dimension).
+    // For now we take the stride and dilation from the second and third
+    // dimensions only (we do not support striding or dilation on the batch or
+    // depth dimension).
     const int stride_rows = GetTensorDim(strides_, data_format_, 'H');
     const int stride_cols = GetTensorDim(strides_, data_format_, 'W');
 
+    const int dilation_rows = GetTensorDim(dilations_, data_format_, 'H');
+    const int dilation_cols = GetTensorDim(dilations_, data_format_, 'W');
+
     int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
-    OP_REQUIRES_OK(context,
-                   GetWindowedOutputSize(input_rows, filter_rows, stride_rows,
-                                         padding_, &out_rows, &pad_rows));
-    OP_REQUIRES_OK(context,
-                   GetWindowedOutputSize(input_cols, filter_cols, stride_cols,
-                                         padding_, &out_cols, &pad_cols));
+    OP_REQUIRES_OK(context, GetWindowedOutputSizeV2(
+                                input_rows, filter_rows, dilation_rows,
+                                stride_rows, padding_, &out_rows, &pad_rows));
+    OP_REQUIRES_OK(context, GetWindowedOutputSizeV2(
+                                input_cols, filter_cols, dilation_cols,
+                                stride_cols, padding_, &out_cols, &pad_cols));
     TensorShape out_shape =
         ShapeFromFormat(data_format_, batch, out_rows, out_cols, out_depth);
 
@@ -361,6 +397,8 @@ class Conv2DOp : public BinaryOp<T> {
             << ", filter_rows = " << filter_rows
             << ", stride_rows = " << stride_rows
             << ", stride_cols = " << stride_cols
+            << ", dilation_rows = " << dilation_rows
+            << ", dilation_cols = " << dilation_cols
             << ", out_depth = " << out_depth;
 
     // If there is nothing to compute, return.
@@ -372,7 +410,8 @@ class Conv2DOp : public BinaryOp<T> {
     if (LaunchXsmmConvOp<Device, T>::Run(
             context, input, filter, batch, input_rows, input_cols, in_depth,
             filter_rows, filter_cols, pad_rows, pad_cols, out_rows, out_cols,
-            out_depth, stride_rows, stride_cols, output, data_format_)) {
+            out_depth, dilation_rows, dilation_cols, stride_rows, stride_cols,
+            output, data_format_)) {
       return;
     }
 #endif
@@ -380,15 +419,18 @@ class Conv2DOp : public BinaryOp<T> {
     if (LaunchDeepConvOp<Device, T>::Run(
             context, input, filter, batch, input_rows, input_cols, in_depth,
             filter_rows, filter_cols, pad_rows, pad_cols, out_rows, out_cols,
-            out_depth, stride_rows, stride_cols, output, data_format_)) {
+            out_depth, dilation_rows, dilation_cols, stride_rows, stride_cols,
+            output, data_format_)) {
       return;
     }
 
     launcher_(context, use_cudnn_, cudnn_use_autotune_, input, filter,
-              stride_rows, stride_cols, padding_, output, data_format_);
+              dilation_rows, dilation_cols, stride_rows, stride_cols, padding_,
+              output, data_format_);
   }
 
  private:
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   bool use_cudnn_;
   Padding padding_;
@@ -443,9 +485,9 @@ typedef AutoTuneSingleton<ConvAutoTuneGroup, ConvParameters,
 template <typename T>
 void LaunchConv2DOp<GPUDevice, T>::operator()(
     OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
-    const Tensor& input_param, const Tensor& filter, int row_stride,
-    int col_stride, const Padding& padding, Tensor* output,
-    TensorFormat data_format) {
+    const Tensor& input_param, const Tensor& filter, int row_dilation,
+    int col_dilation, int row_stride, int col_stride, const Padding& padding,
+    Tensor* output, TensorFormat data_format) {
   using perftools::gputools::dnn::AlgorithmConfig;
   using perftools::gputools::dnn::AlgorithmDesc;
   using perftools::gputools::dnn::ProfileResult;
@@ -461,8 +503,9 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
 
   Tensor input = input_param;
 
-  if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 && row_stride == 1 &&
-      col_stride == 1 && data_format == FORMAT_NHWC) {
+  if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 && row_dilation == 1 &&
+      col_dilation == 1 && row_stride == 1 && col_stride == 1 &&
+      data_format == FORMAT_NHWC) {
     // 1x1 filter, so call cublas directly.
     const uint64 m = input.dim_size(0) * input.dim_size(1) * input.dim_size(2);
     const uint64 k = filter.dim_size(2);
@@ -487,7 +530,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     }
     return;
   } else if (filter.dim_size(0) == input.dim_size(1) &&
-             filter.dim_size(1) == input.dim_size(2) && padding == VALID &&
+             filter.dim_size(1) == input.dim_size(2) && row_dilation == 1 &&
+             col_dilation == 1 && padding == VALID &&
              data_format == FORMAT_NHWC) {
     // The input data and filter have the same height/width, so call cublas
     // directly.
@@ -530,17 +574,19 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
   const int64 patch_cols = filter.dim_size(1);
   if (padding == SAME) {
     // Total padding on rows and cols is
-    // Pr = (R' - 1) * S + Kr - R
-    // Pc = (C' - 1) * S + Kc - C
+    // Pr = (R' - 1) * S + (Kr - 1) * Dr + 1 - R
+    // Pc = (C' - 1) * S + (Kc - 1) * Dc + 1 - C
     // where (R', C') are output dimensions, (R, C) are input dimensions, S
-    // is stride, (Kr, Kc) are filter dimensions.
+    // is stride, (Dr, Dc) are dilations, (Kr, Kc) are filter dimensions.
     // We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top
     // and Pc - Pc/2 on the bottom.  When Pr or Pc is odd, this means
     // we pad more on the right and bottom than on the top and left.
     padding_rows =
-        std::max<int>(0, (out_rows - 1) * row_stride + patch_rows - in_rows);
+        std::max<int>(0, (out_rows - 1) * row_stride +
+                             (patch_rows - 1) * row_dilation + 1 - in_rows);
     padding_cols =
-        std::max<int>(0, (out_cols - 1) * col_stride + patch_cols - in_cols);
+        std::max<int>(0, (out_cols - 1) * col_stride +
+                             (patch_cols - 1) * col_dilation + 1 - in_cols);
     const bool rows_odd = (padding_rows % 2 != 0);
     const bool cols_odd = (padding_cols % 2 != 0);
     if (rows_odd || cols_odd) {
@@ -605,7 +651,9 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
       .set_input_feature_map_count(filter.dim_size(2))
       .set_output_feature_map_count(filter.dim_size(3));
   perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
-  conv_desc.set_vertical_filter_stride(row_stride)
+  conv_desc.set_vertical_dilation_rate(row_dilation)
+      .set_horizontal_dilation_rate(col_dilation)
+      .set_vertical_filter_stride(row_stride)
       .set_horizontal_filter_stride(col_stride)
       .set_zero_padding_height(padding_rows / 2)
       .set_zero_padding_width(padding_cols / 2);
@@ -640,7 +688,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
   static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit(
       // default value is in bytes despite the name of the environment variable
       "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
-      );
+  );
 
   int device_id = stream->parent()->device_ordinal();
   DataType dtype = input.dtype();
@@ -652,6 +700,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
       out_depths,        // out_depths
       {{patch_rows,      // filter_rows
         patch_cols}},    // filter_cols
+      {{row_dilation,    // dilation_rows
+        col_dilation}},  // dilation_cols
       {{row_stride,      // stride_rows
         col_stride}},    // stride_cols
       {{padding_rows,    // padding_rows
diff --git a/tensorflow/core/kernels/conv_ops.h b/tensorflow/core/kernels/conv_ops.h
index e29271dff278afbc1ff2c947c161824615640b66..09a3b78776c8bf114ccd42866bc7aded92c463b5 100644
--- a/tensorflow/core/kernels/conv_ops.h
+++ b/tensorflow/core/kernels/conv_ops.h
@@ -34,8 +34,9 @@ class OpKernelContext;
 template <typename Device, typename T>
 struct LaunchConv2DOp {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
-                  const Tensor& input, const Tensor& filter, int row_stride,
-                  int col_stride, const Padding& padding, Tensor* output,
+                  const Tensor& input, const Tensor& filter, int row_dilation,
+                  int col_dilation, int row_stride, int col_stride,
+                  const Padding& padding, Tensor* output,
                   TensorFormat data_format);
 };
 
@@ -43,8 +44,9 @@ struct LaunchConv2DOp {
 template <typename T>
 struct LaunchConv2DOp<Eigen::GpuDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
-                  const Tensor& input, const Tensor& filter, int row_stride,
-                  int col_stride, const Padding& padding, Tensor* output,
+                  const Tensor& input, const Tensor& filter, int row_dilation,
+                  int col_dilation, int row_stride, int col_stride,
+                  const Padding& padding, Tensor* output,
                   TensorFormat data_format);
 };
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 37cb67bc51112d42feaca25c37b3939775b66888..21c84b2a0ed15eaada88e308e1761dcb58cb07b3 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -377,6 +377,9 @@ struct LaunchConvOp<GPUDevice, T> {
         {{in_planes, in_rows, in_cols}},
         out_depth,
         {{filter_planes, filter_rows, filter_cols}},
+        // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
+        // conv is supported.
+        /*dilation=*/{{1, 1, 1}},
         {{strides[0], strides[1], strides[2]}},
         {{pad_planes, pad_rows, pad_cols}},
         dtype,
diff --git a/tensorflow/core/kernels/conv_ops_fused.cc b/tensorflow/core/kernels/conv_ops_fused.cc
index 291ebf2298762d25e2d44aa5b82ffd495ea92c0e..1b40ad81f413a726d14c5496f669923ab9254dce 100644
--- a/tensorflow/core/kernels/conv_ops_fused.cc
+++ b/tensorflow/core/kernels/conv_ops_fused.cc
@@ -679,8 +679,9 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
 
     const int dims = resized_shape.dims();
     OP_REQUIRES(
-        context, TensorShapeUtils::IsMatrix(paddings.shape()) &&
-                     paddings.dim_size(1) == 2,
+        context,
+        TensorShapeUtils::IsMatrix(paddings.shape()) &&
+            paddings.dim_size(1) == 2,
         errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
                                 paddings.shape().DebugString()));
     const int fixed_dims =
@@ -715,20 +716,22 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
       const int32 after =
           paddings_matrix(d, 1);  // Pad after existing elements.
       OP_REQUIRES(context, before >= 0 && after >= 0,
-                  errors::InvalidArgument("paddings must be non-negative: ",
-                                          before, " ", after));
+                  errors::InvalidArgument(
+                      "paddings must be non-negative: ", before, " ", after));
       if (offset_ == 0) {  // SYMMETRIC mode.
         OP_REQUIRES(
-            context, before <= resized_shape.dim_size(d) &&
-                         after <= resized_shape.dim_size(d),
+            context,
+            before <= resized_shape.dim_size(d) &&
+                after <= resized_shape.dim_size(d),
             errors::InvalidArgument("paddings must be no greater "
                                     "than the dimension size: ",
                                     before, ", ", after, " greater than ",
                                     resized_shape.dim_size(d)));
       } else if (offset_ == 1) {  // REFLECT mode.
         OP_REQUIRES(
-            context, before < resized_shape.dim_size(d) &&
-                         after < resized_shape.dim_size(d),
+            context,
+            before < resized_shape.dim_size(d) &&
+                after < resized_shape.dim_size(d),
             errors::InvalidArgument("paddings must be less than"
                                     " the dimension size: ",
                                     before, ", ", after, " not less than ",
@@ -767,18 +770,19 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
     // We only check the first three dims, since the depth is accessed as an
     // int64 below.
     for (int i = 0; i < 3; i++) {
-      OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i),
-                                           std::numeric_limits<int>::max()),
-                  errors::InvalidArgument("filter too large"));
+      OP_REQUIRES(
+          context,
+          FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
+          errors::InvalidArgument("filter too large"));
     }
 
     // The last dimension for input is in_depth. It must be the same as the
     // filter's in_depth.
     const int64 in_depth = padded_shape.dim_size(3);
-    OP_REQUIRES(
-        context, in_depth == filter.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                in_depth, " vs ", filter.dim_size(2)));
+    OP_REQUIRES(context, in_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", in_depth,
+                    " vs ", filter.dim_size(2)));
 
     // The last dimension for filter is out_depth.
     const int out_depth = static_cast<int>(filter.dim_size(3));
@@ -786,9 +790,10 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
     // The second dimension for input is rows/height.
     // The first dimension for filter is rows/height.
     const int64 padded_rows_raw = padded_shape.dim_size(1);
-    OP_REQUIRES(context, FastBoundsCheck(padded_rows_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input rows too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(padded_rows_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input rows too large"));
     const int padded_rows = static_cast<int>(padded_rows_raw);
     const int filter_rows = static_cast<int>(filter.dim_size(0));
     const int resized_rows = static_cast<int>(resized_shape.dim_size(1));
@@ -796,9 +801,10 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
     // The third dimension for input is columns/width.
     // The second dimension for filter is columns/width.
     const int64 padded_cols_raw = padded_shape.dim_size(2);
-    OP_REQUIRES(context, FastBoundsCheck(padded_cols_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input cols too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(padded_cols_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input cols too large"));
     const int padded_cols = static_cast<int>(padded_cols_raw);
     const int filter_cols = static_cast<int>(filter.dim_size(1));
     const int resized_cols = static_cast<int>(resized_shape.dim_size(2));
@@ -864,24 +870,26 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(FusedResizeConv2DUsingGemmOp);
 };
 
-#define REGISTER_FUSED(T)                                                    \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("FusedResizeAndPadConv2D")                                        \
-          .Device(DEVICE_CPU)                                                \
-          .TypeConstraint<T>("T"),                                           \
-      FusedResizeConv2DUsingGemmOp<                                          \
-          T, FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
-                                          BILINEAR>,                         \
+#define REGISTER_FUSED(T)                                                 \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("FusedResizeAndPadConv2D")                                     \
+          .Device(DEVICE_CPU)                                             \
+          .TypeConstraint<T>("T"),                                        \
+      FusedResizeConv2DUsingGemmOp<                                       \
+          T,                                                              \
+          FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
+                                       BILINEAR>,                         \
           true>);
 
 TF_CALL_float(REGISTER_FUSED);
 
-#define REGISTER_PAD_ONLY_FUSED(T)                                           \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"),      \
-      FusedResizeConv2DUsingGemmOp<                                          \
-          T, FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
-                                          NEAREST>,                          \
+#define REGISTER_PAD_ONLY_FUSED(T)                                        \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"),   \
+      FusedResizeConv2DUsingGemmOp<                                       \
+          T,                                                              \
+          FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
+                                       NEAREST>,                          \
           false>);
 
 TF_CALL_float(REGISTER_PAD_ONLY_FUSED);
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index c852dc9991c2e879c8fa6a64b2bd8b5141606409..f0085be3a53b71af85d4c5f4bbcc6b07cd982ca8 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -27,7 +27,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-
 // Get the Cudnn workspace limit from the environment variable, which is in MB.
 // Return the workspace memory limit in bytes. If no value is set, return the
 // default value.
@@ -91,13 +90,14 @@ class ConvParameters {
   using SpatialArray = gtl::InlinedVector<int64, 3>;
   ConvParameters(int64 batch, int64 in_depths, const SpatialArray& in,
                  int64 out_depths, const SpatialArray& filter,
-                 const SpatialArray& stride, const SpatialArray& padding,
-                 DataType dtype, int device_id)
+                 const SpatialArray& dilation, const SpatialArray& stride,
+                 const SpatialArray& padding, DataType dtype, int device_id)
       : batch_(batch),
         in_depths_(in_depths),
         out_depths_(out_depths),
         in_(in),
         filter_(filter),
+        dilation_(dilation),
         stride_(stride),
         padding_(padding),
         dtype_(dtype),
@@ -107,6 +107,7 @@ class ConvParameters {
     for (int64 val : in) hash_code_ = Hash64Combine(hash_code_, val);
     hash_code_ = Hash64Combine(hash_code_, out_depths);
     for (int64 val : filter) hash_code_ = Hash64Combine(hash_code_, val);
+    for (int64 val : dilation) hash_code_ = Hash64Combine(hash_code_, val);
     for (int64 val : stride) hash_code_ = Hash64Combine(hash_code_, val);
     for (int64 val : padding) hash_code_ = Hash64Combine(hash_code_, val);
     hash_code_ = Hash64Combine(hash_code_, dtype);
@@ -128,6 +129,7 @@ class ConvParameters {
         "(", str_util::Join(in_, ", "), "), ",
         out_depths_, ", ",
         "(", str_util::Join(filter_, ", "), "), ",
+        "(", str_util::Join(dilation_, ", "), "), ",
         "(", str_util::Join(stride_, ", "), "), ",
         "(", str_util::Join(padding_, ", "), "), ",
         dtype_, ", ",
@@ -143,7 +145,7 @@ class ConvParameters {
     int64 total_size = 16 * std::ceil(batch_ / 16.0) *
                        std::max(in_depths_, out_depths_) * in_[0] * in_[1] *
                        sizeof(T);
-    int64 threshold = 1L << 31;
+    int64 threshold = 1LL << 31;
     if (total_size >= threshold) {
       return false;
     } else {
@@ -154,11 +156,11 @@ class ConvParameters {
  protected:
   using ParameterDataType =
       std::tuple<int64, int64, SpatialArray, int64, SpatialArray, SpatialArray,
-                 SpatialArray, DataType, int>;
+                 SpatialArray, SpatialArray, DataType, int>;
 
   ParameterDataType get_data_as_tuple() const {
     return std::make_tuple(batch_, in_depths_, in_, out_depths_, filter_,
-                           stride_, padding_, dtype_, device_id_);
+                           dilation_, stride_, padding_, dtype_, device_id_);
   }
 
   uint64 hash_code_;
@@ -169,6 +171,7 @@ class ConvParameters {
   int64 out_depths_;
   SpatialArray in_;
   SpatialArray filter_;
+  SpatialArray dilation_;
   SpatialArray stride_;
   SpatialArray padding_;
   DataType dtype_;
diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
index 9a00a091bd29ca6bb3150c65e24833d6d99b2ffd..a376534badc73065e3ec01972dde85da7bbdb0f8 100644
--- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
+++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
@@ -19,10 +19,13 @@ limitations under the License.
 
 #include <algorithm>
 #include <array>
+#include <limits>
+#include <utility>
 
 #include "cuda/include/cuda.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 #include "tensorflow/core/util/tensor_format.h"
 
@@ -223,31 +226,28 @@ __global__ void SwapDimension1And2InTensor3Simple(int nthreads, const T* input,
 // Use shared memory tiles to swap dimension-1 and dimension-2 of a 3D tensor,
 // where dimensions are zero-based: output[i][j][k] = input[i][k][j].
 //
-// Each thread block operates on a single tile, a square of dimensions TileSize
-// x TileSize.  We require that the thread block's X dimension equals TileSize,
-// and its Y dimension equals NumSubTiles.
+// Each thread block operates on a single tile, a rectangle of dimensions
+// TileSizeI x TileSizeJ.
 //
-// For best performance, you should probably set TileSize equal to the number of
-// threads in a warp (32 in nvidia GPUs).  With a TileSize of 32, NumSubTiles ==
-// 4 or 8 seems to get the best performance on K40 GPUs.
-template <typename T, int TileSize, int NumSubTiles, bool conjugate = false>
-__global__ void SwapDimension1And2InTensor3UsingTiles(const T* input,
-                                                      Dimension<3> input_dims,
-                                                      T* output) {
-  // One extra line in the inner dimension to avoid share memory bank conflict.
-  __shared__ T shared_memory_tile[TileSize][TileSize + 1];
-
-  static_assert(TileSize % NumSubTiles == 0,
-                "TileSize must be divisible by NumSubTiles");
-  eigen_assert(blockDim.x == TileSize);
-  eigen_assert(blockDim.y == NumSubTiles);
+// In general, for best performance, you should probably set TileSizeI,
+// TileSizeJ equal to the number of threads in a warp (32 in nvidia GPUs).
+// With a TileSizeI, TileSizeJ of 32, NumThreads of 128 or 256 seems to get
+// the best performance on K40 GPUs.
+template <typename T, int NumThreads, int TileSizeI, int TileSizeJ,
+          bool conjugate = false>
+__global__ void SwapDimension1And2InTensor3UsingTiles(
+    const T* __restrict__ input, Dimension<3> input_dims,
+    T* __restrict__ output) {
+  eigen_assert(blockDim.x == NumThreads);
+  eigen_assert(blockDim.y == 1);
   eigen_assert(blockDim.z == 1);
   eigen_assert(gridDim.y == 1);
   eigen_assert(gridDim.z == 1);
 
-  // We break down the tile into NumSubTiles groups, so each thread processes
-  // kSubTileSize elements (except at the edges of the input).
-  const int kSubTileSize = TileSize / NumSubTiles;
+  constexpr int ReadRowPerPass = NumThreads / TileSizeJ;
+  constexpr int WriteRowPerPass = NumThreads / TileSizeI;
+  // One extra line in the inner dimension to avoid share memory bank conflict.
+  __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1];
 
   int x = threadIdx.x;
 
@@ -259,8 +259,8 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(const T* input,
 
   Dimension<3> input_dims_in_tiles = {
       input_dims[0],
-      (input_dims[1] + TileSize - 1) / TileSize,
-      (input_dims[2] + TileSize - 1) / TileSize,
+      (input_dims[1] + TileSizeI - 1) / TileSizeI,
+      (input_dims[2] + TileSizeJ - 1) / TileSizeJ,
   };
 
   Index<3> input_tile_index =
@@ -268,32 +268,58 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(const T* input,
 
   Index<3> input_tile_origin = {
       input_tile_index[0],
-      input_tile_index[1] * TileSize,
-      input_tile_index[2] * TileSize,
+      input_tile_index[1] * TileSizeI,
+      input_tile_index[2] * TileSizeJ,
   };
 
   int input_origin_flat_index =
       TensorIndexToFlat(input_tile_origin, input_dims);
 
-  int tile_width = TileSize;
+  bool full_tile = true;
+  int tile_width = TileSizeJ;
+
   // Only the last row or column may not have the full size.
   if (input_tile_index[2] == input_dims_in_tiles[2] - 1) {
-    tile_width = input_dims[2] - (input_dims_in_tiles[2] - 1) * TileSize;
+    tile_width = input_dims[2] - (input_dims_in_tiles[2] - 1) * TileSizeJ;
+    full_tile &= false;
   }
-  int tile_height = TileSize;
+
+  int tile_height = TileSizeI;
+
   if (input_tile_index[1] == input_dims_in_tiles[1] - 1) {
-    tile_height = input_dims[1] - (input_dims_in_tiles[1] - 1) * TileSize;
+    tile_height = input_dims[1] - (input_dims_in_tiles[1] - 1) * TileSizeI;
+    full_tile &= false;
   }
 
-  int input_flat_index = input_origin_flat_index + x;
-  int y_start = static_cast<int>(threadIdx.y) * kSubTileSize;
-
-  // Load the data from input memory to the shared memory tile.
-  if (x < tile_width) {
-    int y_end = min(y_start + kSubTileSize, tile_height);
-    for (int y = y_start; y < y_end; y++) {
-      shared_memory_tile[y][x] = maybe_conj<T, conjugate>::run(
-          input[input_flat_index + y * input_dims[2]]);
+  // Calculate effective thread number. This ensures that we use the largest
+  // number of threads available to form a regular thread block with no
+  // trailing incomplete lines.
+  constexpr int in_effective_thread_num = NumThreads / TileSizeJ * TileSizeJ;
+
+  if (x < in_effective_thread_num) {
+    // Orient the logical thread block with respect to the input array.
+    // ie. align the contiguous dimension of thread blocks with the contiguous
+    // dimension of the input array.
+    int ti = x / TileSizeJ;
+    int tj = x % TileSizeJ;
+    int input_index = input_origin_flat_index + ti * input_dims[2] + tj;
+    int input_increment = ReadRowPerPass * input_dims[2];
+
+    if (full_tile) {
+#pragma unroll
+      for (int i_loc = ti; i_loc < (TileSizeI); i_loc += ReadRowPerPass) {
+        shared_memory_tile[i_loc][tj] =
+            maybe_conj<T, conjugate>::run(input[input_index]);
+        input_index += input_increment;
+      }
+    } else {
+      if (tj < tile_width) {
+        for (int i_loc = ti; i_loc < (tile_height); i_loc += ReadRowPerPass) {
+          shared_memory_tile[i_loc][tj] =
+              maybe_conj<T, conjugate>::run(input[input_index]);
+          input_index += input_increment;
+        }
+      }
     }
   }
 
@@ -307,101 +333,36 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(const T* input,
 
   Index<3> output_tile_origin = {
       output_tile_index[0],
-      output_tile_index[1] * TileSize,
-      output_tile_index[2] * TileSize,
+      output_tile_index[1] * TileSizeJ,
+      output_tile_index[2] * TileSizeI,
   };
 
   int output_origin_flat_index =
       TensorIndexToFlat(output_tile_origin, output_dims);
 
-  int output_flat_index = output_origin_flat_index + x;
-
-  // Load the data from the shared memory tile to the output memory.
-  if (x < tile_height) {
-    int y_end = min(y_start + kSubTileSize, tile_width);
-    for (int y = y_start; y < y_end; y++) {
-      output[output_flat_index + y * output_dims[2]] = shared_memory_tile[x][y];
-    }
-  }
-}
-
-// Use shared memory tiles to swap dimension-1 and dimension-2 of a 3D tensor
-// when only one of the dimension sizes is smaller than 16,
-// where dimensions are zero-based: output[i][j][k] = input[i][k][j].
-//
-// small_dim = the_smaller_dimension_size
-// large_dim = the_larger_dimension_size
-// tile_num_per_block = blockDim.x
-// kTileLength = small_dim
-//
-// Each thread block operates on a single rectangle tile, where its width is
-// kTileLength (we currently set it to 64) and its height is small_dim,
-// We set the thread block's X dimension to be tile_num_per_block, and its Y
-// and Z to be one.
-template <typename T, int ShmemSize, bool SmallDim2, bool conjugate = false>
-__global__ void SwapDimension1And2InTensor3SmallDim(const T* input,
-                                                    int batch_per_block,
-                                                    Dimension<3> input_dims,
-                                                    T* output) {
-  // TODO(yangzihao) avoid share memory bank conflict.
-  __shared__ T shared_memory_tile[ShmemSize];
-
-  eigen_assert(blockDim.y == 1);
-  eigen_assert(blockDim.z == 1);
-  eigen_assert(gridDim.z == 1);
-
-  int block_offset = blockIdx.x * blockDim.x;
-
-  int x = threadIdx.x;
-  int tile_height = blockDim.x;
-
-  // Get tile height, width, and thread/block origin indices.
-  int small_dim = SmallDim2 ? input_dims[2] : input_dims[1];
-  int large_dim = SmallDim2 ? input_dims[1] : input_dims[2];
-
-  int global_offset = small_dim * large_dim * (blockIdx.y * batch_per_block) +
-                      (SmallDim2 ? block_offset * small_dim : block_offset);
-  if (global_offset >= (input_dims[0] * input_dims[1] * input_dims[2])) return;
-
-  for (int batch = 0; batch < batch_per_block; ++batch) {
-    int block_origin_idx =
-        small_dim * large_dim * (blockIdx.y * batch_per_block + batch);
-    int thread_origin_idx =
-        block_origin_idx +
-        (SmallDim2 ? block_offset * small_dim : block_offset) + x;
-
-    if (block_offset + blockDim.x > large_dim) {
-      tile_height = large_dim - block_offset;
-    }
-
-    __syncthreads();
-
-    // Load a continuous memory region to shared memory tile.
-    if (x < tile_height) {
-      for (int y = 0; y < small_dim; y++) {
-        int shmem_index =
-            SmallDim2 ? (x + y * tile_height) : (x * small_dim + y);
-        shared_memory_tile[shmem_index] = maybe_conj<T, conjugate>::run(
-            ldg(input + thread_origin_idx +
-                y * (SmallDim2 ? tile_height : large_dim)));
+  constexpr int out_effective_thread_num = NumThreads / TileSizeI * TileSizeI;
+
+  if (x < out_effective_thread_num) {
+    // Re-orient the logical thread block with respect to the output array.
+    // ie. align the contiguous dimension of thread blocks with contiguous
+    // dimension of the output array.
+    int ti = x / TileSizeI;
+    int tj = x % TileSizeI;
+    int output_index = output_origin_flat_index + ti * output_dims[2] + tj;
+    int output_increment = WriteRowPerPass * output_dims[2];
+
+    if (full_tile) {
+#pragma unroll
+      for (int i_loc = ti; i_loc < (TileSizeJ); i_loc += WriteRowPerPass) {
+        output[output_index] = shared_memory_tile[tj][i_loc];
+        output_index += output_increment;
       }
-    }
-
-    __syncthreads();
-
-    // Get block origin index for output array.
-    int output_block_offset = block_origin_idx;
-    int output_block_idx = SmallDim2 ? block_offset : block_offset * small_dim;
-    int output_block_origin_idx = output_block_offset + output_block_idx;
-
-    // Store the transposed memory region in shared memory to device.
-    if (x < tile_height) {
-      for (int y = 0; y < small_dim; y++) {
-        int output_idx = output_block_origin_idx + x +
-                         y * (SmallDim2 ? large_dim : tile_height);
-        int shmem_index =
-            SmallDim2 ? (x * small_dim + y) : (x + y * tile_height);
-        output[output_idx] = shared_memory_tile[shmem_index];
+    } else {
+      if (tj < tile_height) {
+        for (int i_loc = ti; i_loc < (tile_width); i_loc += WriteRowPerPass) {
+          output[output_index] = shared_memory_tile[tj][i_loc];
+          output_index += output_increment;
+        }
       }
     }
   }
@@ -548,6 +509,386 @@ struct PadInput<GPUDevice, T, int, NDIMS> {
   }
 };
 
+// We want std::equal_to and std::greater, but they're not constexpr until
+// C++14.
+struct EqualTo {
+  constexpr bool operator()(int a, int b) const { return a == b; }
+};
+
+struct GreaterThan {
+  constexpr bool operator()(int a, int b) const { return a > b; }
+};
+
+// For each data type, the tile size possibility frontier denotes the tile size
+// combinations that consume the most computational resources constrained by
+// - number of threads per SM limit,
+// - limit on size of the short dimension (<=15) due to the definition of
+//   narrow matrix,
+// - shared memory limit and
+// - some experimentally determined, type-specific constraint on the product of
+//   two side lengths to increase grid-level parallelism.
+//
+// A tile size combination lies on the frontier if and only if one or more
+// constraint mentioned above is hit. Tile size combinations lying outside this
+// frontier are either not possible, or are slower than the alternatives.
+//
+// It is instrumental to consider, for each data type, two subsets of the
+// corresponding frontier:
+// - long side frontier: the union of the biggest tile size combination for
+//   each legal long side len.
+// - non long side frontier: the frontier set minus the long side frontier.
+//
+// TileSizePossibilityFrontierCheck defines the frontier using only the long
+// side frontier tile size combinations (since one can easily extrapolate
+// the entire frontier from this subset). It serves as a utility function
+// to help us determine where a tile size combination of interest lies with
+// resepect to the frontier.
+template <typename Op>
+constexpr bool TileSizePossibilityFrontierCheck(int TileLongSide,
+                                                int TileShortSide,
+                                                int size_of_t, Op op) {
+  // clang-format off
+
+  return (size_of_t == 16 && ((TileLongSide == 32   && op(TileShortSide, 4))  ||
+                             (TileLongSide == 64   && op(TileShortSide, 4))  ||
+                             (TileLongSide == 128  && op(TileShortSide, 4))  ||
+                             (TileLongSide == 256  && op(TileShortSide, 2)))) ||
+          (size_of_t == 8 && ((TileLongSide == 32   && op(TileShortSide, 15)) ||
+                             (TileLongSide == 64   && op(TileShortSide, 15)) ||
+                             (TileLongSide == 128  && op(TileShortSide, 8))  ||
+                             (TileLongSide == 256  && op(TileShortSide, 4))  ||
+                             (TileLongSide == 512  && op(TileShortSide, 2)))) ||
+          (size_of_t == 4 && ((TileLongSide == 32   && op(TileShortSide, 15)) ||
+                             (TileLongSide == 64   && op(TileShortSide, 15)) ||
+                             (TileLongSide == 128  && op(TileShortSide, 15)) ||
+                             (TileLongSide == 256  && op(TileShortSide, 8))  ||
+                             (TileLongSide == 512  && op(TileShortSide, 4))  ||
+                             (TileLongSide == 1024 && op(TileShortSide, 2)))) ||
+          (size_of_t == 2 && ((TileLongSide == 32   && op(TileShortSide, 15)) ||
+                             (TileLongSide == 64   && op(TileShortSide, 15)) ||
+                             (TileLongSide == 128  && op(TileShortSide, 15)) ||
+                             (TileLongSide == 256  && op(TileShortSide, 8))  ||
+                             (TileLongSide == 512  && op(TileShortSide, 4))  ||
+                             (TileLongSide == 1024 && op(TileShortSide, 2)))) ||
+          (size_of_t == 1 && ((TileLongSide == 32   && op(TileShortSide, 15)) ||
+                             (TileLongSide == 64   && op(TileShortSide, 15)) ||
+                             (TileLongSide == 128  && op(TileShortSide, 15)) ||
+                             (TileLongSide == 256  && op(TileShortSide, 8))  ||
+                             (TileLongSide == 512  && op(TileShortSide, 4))  ||
+                             (TileLongSide == 1024 && op(TileShortSide, 2))));
+
+  // clang-format on
+}
+
+constexpr bool TileSizeOnLongSideFrontier(int TileLongSide, int TileShortSide,
+                                          int size_of_t) {
+  return TileSizePossibilityFrontierCheck(TileLongSide, TileShortSide,
+                                          size_of_t, EqualTo());
+}
+constexpr bool TileSizeOutsideFrontier(int TileLongSide, int TileShortSide,
+                                       int size_of_t) {
+  return TileSizePossibilityFrontierCheck(TileLongSide, TileShortSide,
+                                          size_of_t, GreaterThan());
+}
+constexpr bool TileSizeOnNonLongSideFrontier(int TileLongSide,
+                                             int TileShortSide, int size_of_t) {
+  // For a tile size combination (longside, shortside), lying on the frontier
+  // implies that (longside, shortside) is on or within the frontier but
+  // (longside*2, shortside) or (longside, shortside+1) is not. With the above
+  // critereon, we simply need to use !TileSizeOnLongSideFrontier to ensure that
+  // it is not on the long side frontier.
+  return !TileSizeOutsideFrontier(TileLongSide, TileShortSide, size_of_t) &&
+         (TileSizeOutsideFrontier(TileLongSide * 2, TileShortSide, size_of_t) ||
+          TileSizeOutsideFrontier(TileLongSide, TileShortSide + 1,
+                                  size_of_t)) &&
+         !TileSizeOnLongSideFrontier(TileLongSide, TileShortSide, size_of_t);
+}
+
+// Helper function to launch a batch narrow matirx transpose kernel.
+template <typename T, int TileLongSide, int TileShortSide>
+void LaunchBatchNarrowMatrixTransposeKernel(
+    const GPUDevice& d, int tile_size_i, int tile_size_j, int total_tiles_count,
+    const T* input, const Dimension<3>& input_dims, T* output) {
+  constexpr int NumThreads = TileLongSide;
+  if (tile_size_i <= TileLongSide && tile_size_j <= TileShortSide) {
+    SwapDimension1And2InTensor3UsingTiles<T, NumThreads, TileLongSide,
+                                          TileShortSide>
+        <<<total_tiles_count, NumThreads, 0, d.stream()>>>(input, input_dims,
+                                                           output);
+  } else {
+    SwapDimension1And2InTensor3UsingTiles<T, NumThreads, TileShortSide,
+                                          TileLongSide>
+        <<<total_tiles_count, NumThreads, 0, d.stream()>>>(input, input_dims,
+                                                           output);
+  }
+}
+
+// Recursive template function to search, in a trial-and-error manner, for the
+// minimum tile size configuration satisfying the requested tile side lengths.
+// An important invariant of this search procedure is that for an unsatisfied
+// request, we always try doubling the long side len first, and only after
+// the request is satisfied for the long side len do we begin incrementing
+// the short side len.
+//
+// We have three specializations of this search function depending on where the
+// current tile size combination lies with respect to the frontier.
+// - It lies within the frontier. If request is not satisfied, for the next tile
+// size combination, we first try doubling the long side len and if that does
+// not work, we then increment the short side len.
+// - It lies on the non long side frontier. If the request is not satisfied, we
+// can only increment the short side len.
+// - It lies on the long side frontier. We launch the kernel without checking if
+// the request is satisfied or not.
+template <typename T, int TileLongSide, int TileShortSide,
+          typename dummy = void>
+struct BatchNarrowMatrixTransposeDispatcher {
+  static void DoIt(const GPUDevice& d, int tile_size_i, int tile_size_j,
+                   int total_tiles_count, const T* input,
+                   const Dimension<3>& input_dims, T* output) {
+    static_assert(
+        (TileLongSide & (TileLongSide - 1)) == 0,
+        "The length of the longer side of the tile is always a power of 2.");
+    bool request_satisfied =
+        std::max(tile_size_i, tile_size_j) <= TileLongSide &&
+        std::min(tile_size_i, tile_size_j) <= TileShortSide;
+
+    if (request_satisfied) {
+      LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide>(
+          d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
+          output);
+      return;
+    }
+
+    // If the execution reaches here, then the kernel was not launched; we then
+    // determine whether it is the long side or the short side that falls short
+    // of the request and increase that parameter accordingly.
+    const bool long_side_request_not_satisfied =
+        std::max(tile_size_i, tile_size_j) > TileLongSide;
+
+    if (long_side_request_not_satisfied) {
+      BatchNarrowMatrixTransposeDispatcher<
+          T, TileLongSide * 2, TileShortSide>::DoIt(d, tile_size_i, tile_size_j,
+                                                    total_tiles_count, input,
+                                                    input_dims, output);
+    } else {
+      BatchNarrowMatrixTransposeDispatcher<
+          T, TileLongSide, TileShortSide + 1>::DoIt(d, tile_size_i, tile_size_j,
+                                                    total_tiles_count, input,
+                                                    input_dims, output);
+    }
+  }
+};
+
+template <typename T, int TileLongSide, int TileShortSide>
+struct BatchNarrowMatrixTransposeDispatcher<
+    T, TileLongSide, TileShortSide,
+    typename std::enable_if<TileSizeOnNonLongSideFrontier(
+                                TileLongSide, TileShortSide, sizeof(T)),
+                            void>::type> {
+  static void DoIt(const GPUDevice& d, int tile_size_i, int tile_size_j,
+                   int total_tiles_count, const T* input,
+                   const Dimension<3>& input_dims, T* output) {
+    static_assert(
+        (TileLongSide & (TileLongSide - 1)) == 0,
+        "The length of the longer side of the tile is always a power of 2.");
+    bool request_satisfied =
+        std::max(tile_size_i, tile_size_j) <= TileLongSide &&
+        std::min(tile_size_i, tile_size_j) <= TileShortSide;
+
+    if (request_satisfied) {
+      LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide>(
+          d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
+          output);
+      return;
+    }
+
+    // If the execution reaches here, then the kernel was not launched; since
+    // we are on the non long side frontier, we increment the short dimension
+    // and try again.
+    BatchNarrowMatrixTransposeDispatcher<
+        T, TileLongSide, TileShortSide + 1>::DoIt(d, tile_size_i, tile_size_j,
+                                                  total_tiles_count, input,
+                                                  input_dims, output);
+  }
+};
+
+template <typename T, int TileLongSide, int TileShortSide>
+struct BatchNarrowMatrixTransposeDispatcher<
+    T, TileLongSide, TileShortSide,
+    typename std::enable_if<TileSizeOnLongSideFrontier(
+                                TileLongSide, TileShortSide, sizeof(T)),
+                            void>::type> {
+  static void DoIt(const GPUDevice& d, int tile_size_i, int tile_size_j,
+                   int total_tiles_count, const T* input,
+                   const Dimension<3>& input_dims, T* output) {
+    static_assert(
+        (TileLongSide & (TileLongSide - 1)) == 0,
+        "The length of the longer side of the tile is always a power of 2.");
+
+    LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide>(
+        d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
+        output);
+  }
+};
+
+// This function tries to recover, in a brute force way, the frontier defined in
+// TileSizePossibilityFrontierCheck as a vector of tile size combinations lying
+// on the long side frontier. This vector is sufficient to determine the entire
+// frontier.
+//
+// Note that if one changes the frontier definition in
+// TileSizePossibilityFrontierCheck and forgets to set the largest short
+// side len of the largest legal long side len to 2, this function will fail
+// and crash the program.
+template <int SizeOfT>
+const std::vector<std::pair<int, int>>& GetTileSizesFrontier() {
+  static_assert(
+      SizeOfT <= 16,
+      "Currently, only data types of sizes 16 bytes or less are supported.");
+  static_assert((SizeOfT & (SizeOfT - 1)) == 0,
+                "Data types must have sizes that are powers of 2.");
+
+  // Expensive work to populate sizes, lazily run in a thread-safe
+  // manner the first time GetTileSizesFrontier<N> is called.
+  static auto* frontier = [] {
+    auto* frontier = new std::vector<std::pair<int, int>>();
+    const int kMaxLongSideLen = 1024;
+    const int kMaxShortSideLen = 15;
+    for (int long_side = 32; long_side <= kMaxLongSideLen; long_side *= 2) {
+      for (int short_side = 2; short_side <= kMaxShortSideLen;
+           short_side += 1) {
+        if (TileSizeOnLongSideFrontier(long_side, short_side, SizeOfT)) {
+          // The current combination lies on the frontier, thus we
+          // add it to the frontier definition.
+          frontier->push_back(std::make_pair(long_side, short_side));
+
+          // The long side length is the largest one allowed iff its
+          // corresponding short side length is 2.
+          if (short_side == 2) return frontier;
+
+          // We have exhausted all the possibilities in the frontier
+          // with the given long side length.
+          break;
+        }
+      }
+    }
+    LOG(FATAL)
+        << "The corresponding short side length of the largest long side "
+           "length has to be 2.";
+  }();
+  return *frontier;
+}
+
+// Helper structs to help determine which data type to use given the size of
+// the matrix data type. A transpose of elements of size N will use a kernel
+// which operates on an array of TransposeElemType<N>::type.
+template <int ElemBytes>
+struct TransposeElemType;
+template <>
+struct TransposeElemType<1> {
+  using type = uint8;
+};
+template <>
+struct TransposeElemType<2> {
+  using type = uint16;
+};
+template <>
+struct TransposeElemType<4> {
+  using type = uint32;
+};
+template <>
+struct TransposeElemType<8> {
+  using type = uint64;
+};
+template <>
+struct TransposeElemType<16> {
+  using type = float4;
+};
+
+// A helper function to make RunSwapDimension1And2InTensor3 concise. This
+// helper function looks at the data type and input matrix sizes and decides
+// the thread numbers and tile sizes to use.
+template <typename T, bool conjugate = false>
+void SwapDimension1And2InTensor3WithNarrowMatrices(
+    const GPUDevice& d, const T* input, const Dimension<3>& input_dims,
+    T* output, const int kMinDimensionToUseTiles) {
+  // Get available tile sizes here for the data type requested:
+  const auto& tile_spec = GetTileSizesFrontier<sizeof(T)>();
+
+  int tile_long_side_len = 0;
+  int tile_short_side_len = 0;
+  float lowest_cost = std::numeric_limits<float>::max();
+  int data_long_side = std::max(input_dims[1], input_dims[2]);
+
+  for (auto tile_size_pair : tile_spec) {
+    int proposed_tile_long_side_len = tile_size_pair.first;
+
+    // Number of threads that will not be doing anything useful when reading
+    // the matrix because the thread block size is bigger than the data block
+    // size.
+    int num_wasted_threads =
+        data_long_side - MathUtil::FloorOfRatio<int>(
+                             data_long_side, proposed_tile_long_side_len) *
+                             proposed_tile_long_side_len;
+
+    int num_full_tiles = MathUtil::FloorOfRatio<int>(
+        data_long_side, proposed_tile_long_side_len);
+
+    float cost = 0;
+
+    // However, if we can execute two or more full tiles, then we gladly
+    // accept any number of wasted threads and ignore its cost.
+    if (num_full_tiles <= 1) cost = num_wasted_threads;
+
+    // Using less than or equal to here because given the same cost, we
+    // would like to launch as many threads as possible.
+    if (cost <= lowest_cost) {
+      tile_long_side_len = proposed_tile_long_side_len;
+      tile_short_side_len = tile_size_pair.second;
+      lowest_cost = cost;
+    }
+  }
+
+  // Request tile sizes such that the longer side of threadblock aligns with
+  // the longer side of input data block to maximize read throughput.
+  // The ideal tile shape is one where the length of the shorter side of the
+  // tile is equal to the length of the shorter side of the input matrix.
+  int requested_tile_size_i = input_dims[1] >= kMinDimensionToUseTiles
+                                  ? tile_long_side_len
+                                  : input_dims[1];
+  int requested_tile_size_j = input_dims[1] >= kMinDimensionToUseTiles
+                                  ? input_dims[2]
+                                  : tile_long_side_len;
+
+  // Truncate the shorter size requested according to the manual limit set in
+  // tile_spec to make sure that we do not launch configurations violating
+  // hardware limits.
+  requested_tile_size_i =
+      requested_tile_size_i == tile_long_side_len
+          ? tile_long_side_len
+          : std::min(requested_tile_size_i, tile_short_side_len);
+  requested_tile_size_j =
+      requested_tile_size_j == tile_long_side_len
+          ? tile_long_side_len
+          : std::min(requested_tile_size_j, tile_short_side_len);
+
+  Dimension<3> input_dims_in_tiles = {
+      input_dims[0],
+      MathUtil::CeilOfRatio<int>(input_dims[1], requested_tile_size_i),
+      MathUtil::CeilOfRatio<int>(input_dims[2], requested_tile_size_j),
+  };
+
+  int total_tiles_count =
+      input_dims_in_tiles[0] * input_dims_in_tiles[1] * input_dims_in_tiles[2];
+
+  using ElemType = typename TransposeElemType<sizeof(T)>::type;
+  static_assert(alignof(T) >= alignof(ElemType), "Unexpected data alignment.");
+  BatchNarrowMatrixTransposeDispatcher<ElemType, 32, 2>::DoIt(
+      d, requested_tile_size_i, requested_tile_size_j, total_tiles_count,
+      reinterpret_cast<const ElemType*>(input), input_dims,
+      reinterpret_cast<ElemType*>(output));
+}
+
 // Launch the GPU kernel that would swap dimension-1 and dimension-2 in a
 // 3D tensor. It looks at the shape of the incoming data, and decides the best
 // strategy to launch.
@@ -558,60 +899,35 @@ void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input,
   // If one dimension is trivial, use SmallDim kernel for swapping.
   // Otherwise, the trivial swapping relying on the ldg cache is more efficient.
   static const int kMinDimensionToUseTiles = 16;
-  bool use_tiles = (input_dims[1] >= kMinDimensionToUseTiles &&
-                    input_dims[2] >= kMinDimensionToUseTiles);
-  bool use_small_dim = ((input_dims[1] >= kMinDimensionToUseTiles &&
-                         input_dims[2] < kMinDimensionToUseTiles)) ||
-                       ((input_dims[1] < kMinDimensionToUseTiles &&
-                         input_dims[2] >= kMinDimensionToUseTiles));
-  static const int NumSubTiles = 8;
-
-  if (use_tiles) {
-    static const int TileSize = 32;
+  static const int kMinDimensionToUseRectTiles = 96;
+
+  bool large_matrix = input_dims[1] >= kMinDimensionToUseTiles &&
+                      input_dims[2] >= kMinDimensionToUseTiles;
+  bool narrow_matrix = input_dims[1] >= kMinDimensionToUseRectTiles ||
+                       input_dims[2] >= kMinDimensionToUseRectTiles;
+  if (large_matrix) {
+    // We get best performance when kTileSize is the number of threads in a warp
+    // (32 on our GPUs) and NumSubTiles is 8, so our block size is 8 * 32 = 256
+    // threads.
+    constexpr int kTileSize = 32;
+    constexpr int kNumThreads = 256;
+
     Dimension<3> input_dims_in_tiles = {
         input_dims[0],
-        (input_dims[1] + TileSize - 1) / TileSize,
-        (input_dims[2] + TileSize - 1) / TileSize,
+        MathUtil::CeilOfRatio<int>(input_dims[1], kTileSize),
+        MathUtil::CeilOfRatio<int>(input_dims[2], kTileSize),
     };
+
     int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] *
                             input_dims_in_tiles[2];
-    // We get best performance when TileSize is the number of threads in a warp
-    // (32 on our GPUs) and NumSubTiles is 8, so our block size is 8 * 32 = 256
-    // threads.
-    SwapDimension1And2InTensor3UsingTiles<T, TileSize, NumSubTiles, conjugate>
-        <<<total_tiles_count, dim3(TileSize, NumSubTiles), 0, d.stream()>>>(
-            input, input_dims, output);
-  } else if (use_small_dim) {
-    // When only one of the dimensions is smaller than kMinDimensionToUseTiles,
-    // we use one block to process a rectangle region with the size of
-    // kTileLength * small_dim. We found that when set kTileLength to 64 on
-    // TitanX Maxwell GPU, it achieves the best performance.
-    //              large_dim
-    //            +---------------...--------+
-    //            |            |        |    |
-    // small_dim  |            |  ...   |    |
-    //            |            |        |    |
-    //            +--------------...---------+
-    //            \----- ------/         \- -/
-    //                  V                  V
-    //    kTileLength(tile_height)    tile_height
-    static const int kTileLength = 64;
-    static const int kGridDimY = 65535;
-    int large_dim = std::max(input_dims[2], input_dims[1]);
-    int tile_num_per_block = (large_dim + kTileLength - 1) / kTileLength;
-    int grid_dim_y = std::min(input_dims[0], kGridDimY);
-    int batch_per_block = (input_dims[0] + grid_dim_y - 1) / grid_dim_y;
-    if (input_dims[2] < input_dims[1]) {
-      SwapDimension1And2InTensor3SmallDim<
-          T, kTileLength * kMinDimensionToUseTiles, true, conjugate>
-          <<<dim3(tile_num_per_block, grid_dim_y), kTileLength, 0,
-             d.stream()>>>(input, batch_per_block, input_dims, output);
-    } else {
-      SwapDimension1And2InTensor3SmallDim<
-          T, kTileLength * kMinDimensionToUseTiles, false, conjugate>
-          <<<dim3(tile_num_per_block, grid_dim_y), kTileLength, 0,
-             d.stream()>>>(input, batch_per_block, input_dims, output);
-    }
+    SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize, kTileSize,
+                                          conjugate>
+        <<<total_tiles_count, kNumThreads, 0, d.stream()>>>(input, input_dims,
+                                                            output);
+
+  } else if (narrow_matrix) {
+    SwapDimension1And2InTensor3WithNarrowMatrices<T, conjugate>(
+        d, input, input_dims, output, kMinDimensionToUseTiles);
   } else {
     int total_element_count = input_dims[0] * input_dims[1] * input_dims[2];
     CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d);
diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc
index ea54d6cf6cbfb6f2d38ae10644fed348980ab622..666bca265c95febf3753e71bf010a7caf95c0541 100644
--- a/tensorflow/core/kernels/conv_ops_test.cc
+++ b/tensorflow/core/kernels/conv_ops_test.cc
@@ -43,6 +43,8 @@ TEST(ConvParameters, WinogradNonfusedAlgoSize) {
       128,       // out_depths
       {{3,       // filter_rows
         3}},     // filter_cols
+      {{1,       // dilation_rows
+        1}},     // dilation_cols
       {{1,       // stride_rows
         1}},     // stride_cols
       {{0,       // padding_rows
@@ -60,6 +62,8 @@ TEST(ConvParameters, WinogradNonfusedAlgoSize) {
       768,       // out_depths
       {{3,       // filter_rows
         3}},     // filter_cols
+      {{1,       // dilation_rows
+        1}},     // dilation_cols
       {{1,       // stride_rows
         1}},     // stride_cols
       {{0,       // padding_rows
diff --git a/tensorflow/core/kernels/conv_ops_using_gemm.cc b/tensorflow/core/kernels/conv_ops_using_gemm.cc
index 20da77c36f64173f2dd40fe8e4a608e39c128447..af0a9fa82ee5778fa9e18cea59cf759fa468224f 100644
--- a/tensorflow/core/kernels/conv_ops_using_gemm.cc
+++ b/tensorflow/core/kernels/conv_ops_using_gemm.cc
@@ -468,18 +468,19 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
                                         filter.shape().DebugString()));
 
     for (int i = 0; i < 3; i++) {
-      OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i),
-                                           std::numeric_limits<int>::max()),
-                  errors::InvalidArgument("filter too large"));
+      OP_REQUIRES(
+          context,
+          FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
+          errors::InvalidArgument("filter too large"));
     }
 
     // The last dimension for input is in_depth. It must be the same as the
     // filter's in_depth.
     const int64 in_depth = GetTensorDim(input, data_format_, 'C');
-    OP_REQUIRES(
-        context, in_depth == filter.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                in_depth, " vs ", filter.dim_size(2)));
+    OP_REQUIRES(context, in_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", in_depth,
+                    " vs ", filter.dim_size(2)));
 
     // The last dimension for filter is out_depth.
     const int out_depth = static_cast<int>(filter.dim_size(3));
@@ -487,18 +488,20 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
     // The second dimension for input is rows/height.
     // The first dimension for filter is rows/height.
     const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H');
-    OP_REQUIRES(context, FastBoundsCheck(input_rows_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input rows too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input_rows_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input rows too large"));
     const int input_rows = static_cast<int>(input_rows_raw);
     const int filter_rows = static_cast<int>(filter.dim_size(0));
 
     // The third dimension for input is columns/width.
     // The second dimension for filter is columns/width.
     const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W');
-    OP_REQUIRES(context, FastBoundsCheck(input_cols_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input cols too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input_cols_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input cols too large"));
     const int input_cols = static_cast<int>(input_cols_raw);
     const int filter_cols = static_cast<int>(filter.dim_size(1));
 
diff --git a/tensorflow/core/kernels/critical_section.cc b/tensorflow/core/kernels/critical_section.cc
new file mode 100644
index 0000000000000000000000000000000000000000..30a9abf4ee78cdb336e4c25c217239daf89bae11
--- /dev/null
+++ b/tensorflow/core/kernels/critical_section.cc
@@ -0,0 +1,246 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include <deque>
+#include <utility>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/kernels/captured_function.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+class CriticalSection : public ResourceBase {
+ public:
+  explicit CriticalSection() : is_locked_(false) {}
+  ~CriticalSection() override {
+    // Wait for all closures to finish running.
+    mutex_lock lock(mu_);
+    while (!closures_.empty()) {
+      queue_empty_cv_.wait(lock);
+    }
+  }
+
+ private:
+  friend class ExecuteInCriticalSectionOp;
+
+  void Acquire(std::function<void()> closure) {
+    std::function<void()> next;
+    {
+      mutex_lock ml(mu_);
+      if (is_locked_) {
+        closures_.push_back(std::move(closure));
+      } else {
+        // This branch is the common case.  Avoid the queue.
+        is_locked_ = true;
+        next = std::move(closure);
+      }
+    }
+    if (next) {
+      next();
+    }
+  }
+
+  void Release() {
+    std::function<void()> next;
+    {
+      mutex_lock ml(mu_);
+      CHECK(is_locked_);
+      if (!closures_.empty()) {
+        // if queue is not empty, start the next entry off the queue.
+        std::swap(next, closures_.front());
+        closures_.pop_front();
+      } else {
+        is_locked_ = false;
+        queue_empty_cv_.notify_all();
+      }
+    }
+    if (next) {
+      next();
+    }
+  }
+
+  string DebugString() override {
+    tf_shared_lock ml(mu_);
+    return strings::StrCat("CriticalSection(locked: ", is_locked_,
+                           " queue_size: ", closures_.size(), ")");
+  }
+
+ private:
+  mutex mu_;
+  std::deque<std::function<void()>> closures_ GUARDED_BY(mu_);
+  bool is_locked_ GUARDED_BY(mu_);
+  condition_variable queue_empty_cv_ GUARDED_BY(mu_);
+};
+
+class ExecuteInCriticalSectionOp : public AsyncOpKernel {
+ public:
+  explicit ExecuteInCriticalSectionOp(OpKernelConstruction* c)
+      : AsyncOpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("f", &func_));
+  }
+
+ public:
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    CriticalSection* critical_section = nullptr;
+    OP_REQUIRES_OK_ASYNC(c,
+                         LookupOrCreateResource<CriticalSection>(
+                             c, HandleFromInput(c, 0), &critical_section,
+                             [this, c](CriticalSection** ptr) {
+                               *ptr = new CriticalSection;
+                               return Status::OK();
+                             }),
+                         done);
+    // No need to Unref critical_section; the Closure below will take
+    // care of the Unref associated with this execution.
+
+    auto* execution = new Closure{std::move(done), c, critical_section, &func_};
+    execution->Start();
+  }
+
+ private:
+  class Closure {
+   public:
+    AsyncOpKernel::DoneCallback done_;
+    OpKernelContext* ctx_;
+    CriticalSection* cs_;
+    FunctionLibraryRuntime::Handle handle_;
+    FunctionLibraryRuntime::Options opts_;
+    std::vector<Tensor> arguments_t_;
+    std::vector<Tensor> output_t_;
+    NameAttrList* func_;
+
+    explicit Closure(AsyncOpKernel::DoneCallback done, OpKernelContext* ctx,
+                     CriticalSection* critical_section, NameAttrList* func)
+        : done_(std::move(done)),
+          ctx_(ctx),
+          cs_(critical_section),
+          handle_(-1),
+          func_(func) {}
+
+    ~Closure();
+
+    void Start() {
+      // Perform ExecuteFunction isnide a separate thread to avoid
+      // having lightweight Functions be inlined in this thread.
+      // That inlining would in turn inline DoneAndDelete inside the
+      // same thread.  Since DoneAndDelete can call the next
+      // ExecuteFunction in the CriticalSection, this can cause a
+      // stack overflow.
+      cs_->Acquire(
+          [this]() { (*ctx_->runner())([this]() { ExecuteFunction(); }); });
+    }
+
+   private:
+    void ExecuteFunction();
+    void DoneAndDelete(const Status& status);
+  };
+
+  NameAttrList func_;
+};
+
+void ExecuteInCriticalSectionOp::Closure::ExecuteFunction() {
+  // Arguments to a Function are in the order:
+  //   concat(<formal arguments>, <captured arguments>)
+  OpInputList arguments;
+  Status s = ctx_->input_list("arguments", &arguments);
+  if (!s.ok()) {
+    DoneAndDelete(s);
+    return;
+  }
+
+  arguments_t_.reserve(arguments.size());
+  for (const Tensor& t : arguments) {
+    arguments_t_.push_back(t);
+  }
+
+  auto* function_library = ctx_->function_library();
+  s = function_library->Instantiate(func_->name(), AttrSlice(&func_->attr()),
+                                    &handle_);
+  if (!s.ok()) {
+    DoneAndDelete(s);
+    return;
+  }
+
+  opts_.step_id = CapturedFunction::generate_step_id();
+  auto* step_container =
+      new ScopedStepContainer(opts_.step_id, [this](const string& name) {
+        ctx_->resource_manager()->Cleanup(name).IgnoreError();
+      });
+  opts_.cancellation_manager = ctx_->cancellation_manager();
+  opts_.step_container = step_container;
+  opts_.runner = ctx_->runner();
+
+  function_library->Run(opts_, handle_, arguments_t_, &output_t_,
+                        [this](const Status& s) { DoneAndDelete(s); });
+}
+
+void ExecuteInCriticalSectionOp::Closure::DoneAndDelete(const Status& status) {
+  cs_->Release();
+
+  if (!status.ok()) {
+    ctx_->SetStatus(status);
+  } else {
+    OpOutputList output;
+    const Status s = ctx_->output_list("outputs", &output);
+    if (!s.ok()) {
+      ctx_->SetStatus(s);
+    } else if (output_t_.size() != output.size()) {
+      ctx_->SetStatus(errors::Internal(
+          "Could not set all outputs.  Expected output size is ", output.size(),
+          " but function set ", output_t_.size(), " output values."));
+    } else {
+      for (int i = 0; i < output_t_.size(); ++i) {
+        output.set(i, output_t_[i]);
+      }
+    }
+  }
+
+  delete opts_.step_container;
+  opts_.step_container = nullptr;
+  done_();
+  cs_->Unref();
+  delete this;
+}
+
+ExecuteInCriticalSectionOp::Closure::~Closure() {
+  CHECK(!opts_.step_container)
+      << "Initialized closure destroyed without calling Done";
+}
+
+REGISTER_KERNEL_BUILDER(Name("ExecuteInCriticalSection").Device(DEVICE_CPU),
+                        ExecuteInCriticalSectionOp);
+
+REGISTER_KERNEL_BUILDER(Name("CriticalSectionOp").Device(DEVICE_CPU),
+                        ResourceHandleOp<CriticalSection>);
+
+// TODO(ebrevdo): Re-enable once the cross-device function execution works.
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("ExecuteInCriticalSection")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("critical_section"),
+                        ExecuteInCriticalSectionOp);
+REGISTER_KERNEL_BUILDER(
+    Name("CriticalSectionOp").Device(DEVICE_GPU).HostMemory("resource"),
+    ResourceHandleOp<CriticalSection>);
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cross_op.cc b/tensorflow/core/kernels/cross_op.cc
index 05a33a97b413d6e6484eed50532f359e22af017c..b29524f1f9e5d2c2aaefab957a4c54756e662033 100644
--- a/tensorflow/core/kernels/cross_op.cc
+++ b/tensorflow/core/kernels/cross_op.cc
@@ -105,6 +105,7 @@ TF_CALL_REAL_NUMBER_TYPES(DECLARE_GPU_KERNEL);
   REGISTER_KERNEL_BUILDER(                                        \
       Name("Cross").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
       CrossOp<GPUDevice, type>);
+
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 #endif
diff --git a/tensorflow/core/kernels/cross_op_gpu.cu.cc b/tensorflow/core/kernels/cross_op_gpu.cu.cc
index 7ea0b3be0ca6b8c7df1ba5c311c7949f3672bda1..4a37f6cfbbc4c60e0a2e3cbf280b09acccc0a98c 100644
--- a/tensorflow/core/kernels/cross_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/cross_op_gpu.cu.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/cross_op.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/cross_op.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/ctc_decoder_ops.cc b/tensorflow/core/kernels/ctc_decoder_ops.cc
index 73ee3106048f1435f65d435405282574aa0cffda..96bdb6a241b1d88c7b14f22fc618ea9c95fb7642 100644
--- a/tensorflow/core/kernels/ctc_decoder_ops.cc
+++ b/tensorflow/core/kernels/ctc_decoder_ops.cc
@@ -19,13 +19,13 @@ limitations under the License.
 
 #include <limits>
 
-#include "tensorflow/core/util/ctc/ctc_beam_search.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/util/ctc/ctc_beam_search.h"
 #include "tensorflow/core/util/sparse/sparse_tensor.h"
 
 namespace tensorflow {
@@ -80,16 +80,17 @@ class CTCDecodeHelper {
 
     if (!(batch_size == (*seq_len)->dim_size(0))) {
       return errors::FailedPrecondition(
-          "len(sequence_length) != batch_size.  ", "len(sequence_length):  ",
-          (*seq_len)->dim_size(0), " batch_size: ", batch_size);
+          "len(sequence_length) != batch_size.  ",
+          "len(sequence_length):  ", (*seq_len)->dim_size(0),
+          " batch_size: ", batch_size);
     }
 
     auto seq_len_t = (*seq_len)->vec<int32>();
 
     for (int b = 0; b < batch_size; ++b) {
       if (!(seq_len_t(b) <= max_time)) {
-        return errors::FailedPrecondition("sequence_length(", b, ") <= ",
-                                          max_time);
+        return errors::FailedPrecondition("sequence_length(", b,
+                                          ") <= ", max_time);
       }
     }
 
diff --git a/tensorflow/core/kernels/ctc_loss_op.cc b/tensorflow/core/kernels/ctc_loss_op.cc
index fb03adb7a5336919c85c4685f4cc7e7a8180892d..b38d838bf1ebdabad85ee3c70a936844f96f106a 100644
--- a/tensorflow/core/kernels/ctc_loss_op.cc
+++ b/tensorflow/core/kernels/ctc_loss_op.cc
@@ -113,8 +113,8 @@ class CTCLossOp : public OpKernel {
       const int64 batch_indices = g.group()[0];
       OP_REQUIRES(ctx, FastBoundsCheck(batch_indices, batch_size),
                   errors::InvalidArgument("labels batch index must be between ",
-                                          0, " and ", batch_size, " but saw: ",
-                                          batch_indices));
+                                          0, " and ", batch_size,
+                                          " but saw: ", batch_indices));
 
       auto values = g.values<int32>();
       std::vector<int>* b_values = &labels_t[batch_indices];
diff --git a/tensorflow/core/kernels/cuda_device_array.h b/tensorflow/core/kernels/cuda_device_array.h
index a570993cf866a23ff205fb8d79c9db8badf27685..e7a5db0683eba48295dca96c6c7599126e436536 100644
--- a/tensorflow/core/kernels/cuda_device_array.h
+++ b/tensorflow/core/kernels/cuda_device_array.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_H_
+#define TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_H_
 
 #if GOOGLE_CUDA
 
@@ -117,4 +117,4 @@ class CudaDeviceArrayOnHost {
 
 #endif  // GOOGLE_CUDA
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_H_
diff --git a/tensorflow/core/kernels/cuda_device_array_gpu.h b/tensorflow/core/kernels/cuda_device_array_gpu.h
index 220f7626368852aa8b19ad18285606ed775f80b5..64fa3cb806bc7454bc6d9893e560201a620df43a 100644
--- a/tensorflow/core/kernels/cuda_device_array_gpu.h
+++ b/tensorflow/core/kernels/cuda_device_array_gpu.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Contains structs and functions to be included in device code.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_GPU_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_GPU_H_
+#ifndef TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_GPU_H_
 
 #if GOOGLE_CUDA
 
@@ -47,4 +47,4 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ValueType* GetCudaDeviceArrayOnDevice(
 
 #endif  // GOOGLE_CUDA
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_GPU_H_
+#endif  // TENSORFLOW_CORE_KERNELS_CUDA_DEVICE_ARRAY_GPU_H_
diff --git a/tensorflow/core/kernels/cuda_solvers.cc b/tensorflow/core/kernels/cuda_solvers.cc
index a83671a471c35d5db238616652224b0c3830da9f..6cec032f9492020fa55c468fcd6a5b09effb0e81 100644
--- a/tensorflow/core/kernels/cuda_solvers.cc
+++ b/tensorflow/core/kernels/cuda_solvers.cc
@@ -314,6 +314,11 @@ Status CudaSolver::forward_input_or_allocate_scoped_tensor(
 // are sometimes inaccurate, e.g., are missing 'const' on pointers
 // to immutable arguments, while the actual headers have them as expected.
 // Check the actual declarations in the cusolver_api.h header file.
+//
+// NOTE: The cuSolver functions called below appear not to be threadsafe.
+// so we put a global lock around the calls. Since these functions only put a
+// kernel on the shared stream, it is not a big performance hit.
+// TODO(rmlarsen): Investigate if the locking is still needed in Cuda 9.
 //=============================================================================
 
 template <typename Scalar, typename SolverFnT>
@@ -324,6 +329,7 @@ static inline Status GeamImpl(SolverFnT solver, cublasHandle_t cublas_handle,
                               const Scalar* A, int lda,
                               const Scalar* beta, /* host or device pointer */
                               const Scalar* B, int ldb, Scalar* C, int ldc) {
+  mutex_lock lock(handle_map_mutex);
   using CudaScalar = typename CUDAComplexT<Scalar>::type;
   TF_RETURN_IF_CUBLAS_ERROR(solver(cublas_handle, transa, transb, m, n,
                                    reinterpret_cast<const CudaScalar*>(alpha),
@@ -355,6 +361,7 @@ static inline Status PotrfImpl(BufSizeFnT bufsize, SolverFnT solver,
                                cusolverDnHandle_t cusolver_dn_handle,
                                cublasFillMode_t uplo, int n, Scalar* A, int lda,
                                int* dev_lapack_info) {
+  mutex_lock lock(handle_map_mutex);
   /* Get amount of workspace memory required. */
   int lwork;
   TF_RETURN_IF_CUSOLVER_ERROR(
@@ -387,6 +394,7 @@ static inline Status GetrfImpl(BufSizeFnT bufsize, SolverFnT solver,
                                cusolverDnHandle_t cusolver_dn_handle, int m,
                                int n, Scalar* A, int lda, int* dev_pivots,
                                int* dev_lapack_info) {
+  mutex_lock lock(handle_map_mutex);
   /* Get amount of workspace memory required. */
   int lwork;
   TF_RETURN_IF_CUSOLVER_ERROR(
@@ -419,9 +427,6 @@ static inline Status GetrsImpl(SolverFnT solver, OpKernelContext* context,
                                cublasOperation_t trans, int n, int nrhs,
                                const Scalar* A, int lda, const int* pivots,
                                Scalar* B, int ldb, int* dev_lapack_info) {
-  // Note: The cuSolver functions called here appear not to be threadsafe.
-  // so we put a global lock around it. Since this function only puts a
-  // kernel on the stream, it is not a big performance hit.
   mutex_lock lock(handle_map_mutex);
   /* Launch the solver kernel. */
   TF_RETURN_IF_CUSOLVER_ERROR(solver(cusolver_dn_handle, trans, n, nrhs,
@@ -449,6 +454,7 @@ static inline Status GeqrfImpl(BufSizeFnT bufsize, SolverFnT solver,
                                cusolverDnHandle_t cusolver_dn_handle, int m,
                                int n, Scalar* A, int lda, Scalar* tau,
                                int* dev_lapack_info) {
+  mutex_lock lock(handle_map_mutex);
   /* Get amount of workspace memory required. */
   int lwork;
   TF_RETURN_IF_CUSOLVER_ERROR(
@@ -483,6 +489,7 @@ static inline Status UnmqrImpl(BufSizeFnT bufsize, SolverFnT solver,
                                int m, int n, int k, const Scalar* dev_a,
                                int lda, const Scalar* dev_tau, Scalar* dev_c,
                                int ldc, int* dev_lapack_info) {
+  mutex_lock lock(handle_map_mutex);
   /* Get amount of workspace memory required. */
   int lwork;
   TF_RETURN_IF_CUSOLVER_ERROR(
@@ -526,6 +533,7 @@ static inline Status UngqrImpl(BufSizeFnT bufsize, SolverFnT solver,
                                cusolverDnHandle_t cusolver_dn_handle, int m,
                                int n, int k, Scalar* dev_a, int lda,
                                const Scalar* dev_tau, int* dev_lapack_info) {
+  mutex_lock lock(handle_map_mutex);
   /* Get amount of workspace memory required. */
   int lwork;
   TF_RETURN_IF_CUSOLVER_ERROR(bufsize(cusolver_dn_handle, m, n, k,
@@ -606,17 +614,13 @@ static inline Status GesvdImpl(
     OpKernelContext* context, cusolverDnHandle_t cusolver_dn_handle,
     signed char jobu, signed char jobvt, int m, int n, Scalar* A, int lda,
     Scalar* S, Scalar* U, int ldu, Scalar* VT, int ldvt, int* dev_lapack_info) {
+  mutex_lock lock(handle_map_mutex);
   /* Get amount of workspace memory required. */
   int lwork;
   TF_RETURN_IF_CUSOLVER_ERROR(bufsize(cusolver_dn_handle, m, n, &lwork));
   /* Allocate device memory for workspace. */
   auto dev_workspace =
       cuda_solver->GetScratchSpace<Scalar>(lwork, "", /* on_host */ false);
-  // Note: The cuSolver functions called here appear not to be threadsafe.
-  // so we put a global lock around it. Since this function only puts a
-  // kernel on the stream, it is not a big performance hit.
-  mutex_lock lock(handle_map_mutex);
-  /* Launch the solver kernel. */
   TF_RETURN_IF_CUSOLVER_ERROR(solver(cusolver_dn_handle, jobu, jobvt, m, n,
                                      CUDAComplex(A), lda, S, CUDAComplex(U),
                                      ldu, CUDAComplex(VT), ldvt,
@@ -655,6 +659,7 @@ static inline Status GetrfBatchedImpl(SolverFnT solver, CudaSolver* cuda_solver,
                                       int lda, int* dev_pivots,
                                       DeviceLapackInfo* dev_lapack_info,
                                       int batch_size) {
+  mutex_lock lock(handle_map_mutex);
   using CudaScalar = typename CUDAComplexT<Scalar>::type;
   ScratchSpace<uint8> dev_a_dev_ptrs =
       cuda_solver->GetScratchSpace<uint8>(sizeof(CudaScalar*) * batch_size, "",
@@ -689,6 +694,7 @@ static inline Status GetrsBatchedImpl(
     const Scalar* const host_a_dev_ptrs[], int lda, const int* dev_pivots,
     const Scalar* const host_b_dev_ptrs[], int ldb,
     DeviceLapackInfo* dev_lapack_info, int batch_size) {
+  mutex_lock lock(handle_map_mutex);
   using CudaScalar = typename CUDAComplexT<Scalar>::type;
   ScratchSpace<uint8> dev_a_dev_ptrs =
       cuda_solver->GetScratchSpace<uint8>(sizeof(CudaScalar*) * batch_size, "",
@@ -734,6 +740,7 @@ static inline Status GetriBatchedImpl(
     cublasHandle_t cublas_handle, int n, const Scalar* const host_a_dev_ptrs[],
     int lda, const int* dev_pivots, const Scalar* const host_a_inv_dev_ptrs[],
     int ldainv, DeviceLapackInfo* dev_lapack_info, int batch_size) {
+  mutex_lock lock(handle_map_mutex);
   using CudaScalar = typename CUDAComplexT<Scalar>::type;
   ScratchSpace<uint8> dev_a_dev_ptrs =
       cuda_solver->GetScratchSpace<uint8>(sizeof(CudaScalar*) * batch_size, "",
@@ -776,6 +783,7 @@ static inline Status MatInvBatchedImpl(
     cublasHandle_t cublas_handle, int n, const Scalar* const host_a_dev_ptrs[],
     int lda, const Scalar* const host_a_inv_dev_ptrs[], int ldainv,
     DeviceLapackInfo* dev_lapack_info, int batch_size) {
+  mutex_lock lock(handle_map_mutex);
   using CudaScalar = typename CUDAComplexT<Scalar>::type;
   ScratchSpace<uint8> dev_a_dev_ptrs =
       cuda_solver->GetScratchSpace<uint8>(sizeof(CudaScalar*) * batch_size, "",
diff --git a/tensorflow/core/kernels/cuda_solvers.h b/tensorflow/core/kernels/cuda_solvers.h
index 3c389a82ab4070d5fb1bf3a091a4c85a6309eda9..ecfa23750c213361bc2d0be8df0091ed6ea26dd9 100644
--- a/tensorflow/core/kernels/cuda_solvers.h
+++ b/tensorflow/core/kernels/cuda_solvers.h
@@ -427,7 +427,7 @@ inline DeviceLapackInfo CudaSolver::GetDeviceLapackInfo(
     int64 size, const string& debug_info) {
   DeviceLapackInfo new_dev_info(context_, size, debug_info);
   scratch_tensor_refs_.emplace_back(new_dev_info.tensor());
-  return std::move(new_dev_info);
+  return new_dev_info;
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_abs.cc b/tensorflow/core/kernels/cwise_op_abs.cc
index 5fd38d9dc25c13e20766d1fed86c3f7af9912905..1466f24202fea4200f752985d620f1fbea61d35a 100644
--- a/tensorflow/core/kernels/cwise_op_abs.cc
+++ b/tensorflow/core/kernels/cwise_op_abs.cc
@@ -45,5 +45,5 @@ REGISTER_KERNEL_BUILDER(Name("Abs")
                             .HostMemory("y")
                             .TypeConstraint<int32>("T"),
                         UnaryOp<CPUDevice, functor::abs<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_acos.cc b/tensorflow/core/kernels/cwise_op_acos.cc
index 12cc6c8bdd43b64aa1be2860b54e90aaf5e4c05e..4919122607426f719c660b23baf3a8c7cc38e076 100644
--- a/tensorflow/core/kernels/cwise_op_acos.cc
+++ b/tensorflow/core/kernels/cwise_op_acos.cc
@@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Acos", functor::acos, float, double);
 
 #if TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Acos", functor::acos, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_acosh.cc b/tensorflow/core/kernels/cwise_op_acosh.cc
index 39c8814073382566bc3551fdf6d5afc7f1ef0012..c2b355ab7f4fb11cdc89d8f98a8ca1e293818966 100644
--- a/tensorflow/core/kernels/cwise_op_acosh.cc
+++ b/tensorflow/core/kernels/cwise_op_acosh.cc
@@ -17,12 +17,12 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_gradients.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double, complex64,
+          complex128);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Acosh", functor::acosh, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Acosh", functor::acosh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_add_1.cc b/tensorflow/core/kernels/cwise_op_add_1.cc
index 608a6dce3d223d522776c59a3a1b2ad0d0c14147..bf32c8a54b34586e43d34cf8890ed37fe64b8c34 100644
--- a/tensorflow/core/kernels/cwise_op_add_1.cc
+++ b/tensorflow/core/kernels/cwise_op_add_1.cc
@@ -44,7 +44,6 @@ REGISTER_KERNEL_BUILDER(Name("AddV2")
                         BinaryOp<CPUDevice, functor::add<int32>>);
 #endif
 
-
 #if TENSORFLOW_USE_SYCL
 #define REGISTER_KERNEL(type)                          \
   REGISTER(BinaryOp, SYCL, "Add", functor::add, type); \
@@ -66,5 +65,5 @@ REGISTER_KERNEL_BUILDER(Name("AddV2")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::add<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_add_2.cc b/tensorflow/core/kernels/cwise_op_add_2.cc
index ac21ca06c929662271ad99b3756b8a22fc62a0cf..e8acbac28533ae36a5af8ce527529927f5fe4129 100644
--- a/tensorflow/core/kernels/cwise_op_add_2.cc
+++ b/tensorflow/core/kernels/cwise_op_add_2.cc
@@ -22,8 +22,8 @@ namespace tensorflow {
 // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
 #if !defined(__ANDROID_TYPES_SLIM__)
 
-REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64,
-          uint8, complex128, string);
+REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64, uint8,
+          complex128, string);
 // Notice: String is excluded to allow marking AddV2 is_commutative and
 // is_aggregate.
 REGISTER5(BinaryOp, CPU, "AddV2", functor::add, int8, int16, complex64, uint8,
diff --git a/tensorflow/core/kernels/cwise_op_asin.cc b/tensorflow/core/kernels/cwise_op_asin.cc
index c28e27d95ae661bdc02a905bb6efd5bdd79f23e5..fe8dfea1173ca6ec6727f2fb475c011176cacad4 100644
--- a/tensorflow/core/kernels/cwise_op_asin.cc
+++ b/tensorflow/core/kernels/cwise_op_asin.cc
@@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Asin", functor::asin, float, double);
 
 #if TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Asin", functor::asin, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_asinh.cc b/tensorflow/core/kernels/cwise_op_asinh.cc
index e6e1b83b30750e28d84c458236221e5f7749b5a0..7cf0405f5244a1a5a7e7e09719da25d0e714a7da 100644
--- a/tensorflow/core/kernels/cwise_op_asinh.cc
+++ b/tensorflow/core/kernels/cwise_op_asinh.cc
@@ -1,10 +1,10 @@
-  /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
-    http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -17,8 +17,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_gradients.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double, complex64,
+          complex128);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Asinh", functor::asinh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_atan.cc b/tensorflow/core/kernels/cwise_op_atan.cc
index 7d73de48102189f5c0d92ce811fa639ce6ba2cf4..09f0448874f7dc2bc7140e03cbe38d42246c3087 100644
--- a/tensorflow/core/kernels/cwise_op_atan.cc
+++ b/tensorflow/core/kernels/cwise_op_atan.cc
@@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Atan", functor::atan, float, double);
 
 #if TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Atan", functor::atan, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_atanh.cc b/tensorflow/core/kernels/cwise_op_atanh.cc
index 7b688db4c585b0f8d92f289cae598a78df7e379c..6170683fa64bdd50c00c8c774d6a1f137e60fa71 100644
--- a/tensorflow/core/kernels/cwise_op_atanh.cc
+++ b/tensorflow/core/kernels/cwise_op_atanh.cc
@@ -17,8 +17,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_gradients.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double, complex64,
+          complex128);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Atanh", functor::atanh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_bitwise_and.cc b/tensorflow/core/kernels/cwise_op_bitwise_and.cc
index 017a2182dcff0f0121dd6343f1c012802cdf28d1..5a6cf4bad1609cebc0fded4d212e50fb19d22558 100644
--- a/tensorflow/core/kernels/cwise_op_bitwise_and.cc
+++ b/tensorflow/core/kernels/cwise_op_bitwise_and.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER6(BinaryOp, CPU, "BitwiseAnd", functor::bitwise_and, int8, int16, int32,
-          int64, uint8, uint16);
+REGISTER8(BinaryOp, CPU, "BitwiseAnd", functor::bitwise_and, int8, int16, int32,
+          int64, uint8, uint16, uint32, uint64);
 
 #if TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(TYPE)                                      \
@@ -30,13 +30,15 @@ REGISTER_SYCL_KERNEL(int32);
 REGISTER_SYCL_KERNEL(int64);
 REGISTER_SYCL_KERNEL(uint8);
 REGISTER_SYCL_KERNEL(uint16);
+REGISTER_SYCL_KERNEL(uint32);
+REGISTER_SYCL_KERNEL(uint64);
 #undef REGISTER_SYCL_KERNEL
 
 #endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
-REGISTER6(BinaryOp, GPU, "BitwiseAnd", functor::bitwise_and, int8, int16, int32,
-          int64, uint8, uint16);
+REGISTER8(BinaryOp, GPU, "BitwiseAnd", functor::bitwise_and, int8, int16, int32,
+          int64, uint8, uint16, uint32, uint64);
 #endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_bitwise_or.cc b/tensorflow/core/kernels/cwise_op_bitwise_or.cc
index 36f45fe92dfce44c68a778b6c719c45d24bcaa90..201a10198a629b26429393c5c04404175399df73 100644
--- a/tensorflow/core/kernels/cwise_op_bitwise_or.cc
+++ b/tensorflow/core/kernels/cwise_op_bitwise_or.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER6(BinaryOp, CPU, "BitwiseOr", functor::bitwise_or, int8, int16, int32,
-          int64, uint8, uint16);
+REGISTER8(BinaryOp, CPU, "BitwiseOr", functor::bitwise_or, int8, int16, int32,
+          int64, uint8, uint16, uint32, uint64);
 
 #if TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(TYPE)                                     \
@@ -30,13 +30,15 @@ REGISTER_SYCL_KERNEL(int32);
 REGISTER_SYCL_KERNEL(int64);
 REGISTER_SYCL_KERNEL(uint8);
 REGISTER_SYCL_KERNEL(uint16);
+REGISTER_SYCL_KERNEL(uint32);
+REGISTER_SYCL_KERNEL(uint64);
 #undef REGISTER_SYCL_KERNEL
 
 #endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
-REGISTER6(BinaryOp, GPU, "BitwiseOr", functor::bitwise_or, int8, int16, int32,
-          int64, uint8, uint16);
+REGISTER8(BinaryOp, GPU, "BitwiseOr", functor::bitwise_or, int8, int16, int32,
+          int64, uint8, uint16, uint32, uint64);
 #endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_bitwise_xor.cc b/tensorflow/core/kernels/cwise_op_bitwise_xor.cc
index 36432d851d99f20706b7e7f8535e6ac241b00937..2a7cd2699596a7ace6afd5ce688ff2e186650336 100644
--- a/tensorflow/core/kernels/cwise_op_bitwise_xor.cc
+++ b/tensorflow/core/kernels/cwise_op_bitwise_xor.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER6(BinaryOp, CPU, "BitwiseXor", functor::bitwise_xor, int8, int16, int32,
-          int64, uint8, uint16);
+REGISTER8(BinaryOp, CPU, "BitwiseXor", functor::bitwise_xor, int8, int16, int32,
+          int64, uint8, uint16, uint32, uint64);
 
 #if TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(TYPE)                                      \
@@ -30,13 +30,15 @@ REGISTER_SYCL_KERNEL(int32);
 REGISTER_SYCL_KERNEL(int64);
 REGISTER_SYCL_KERNEL(uint8);
 REGISTER_SYCL_KERNEL(uint16);
+REGISTER_SYCL_KERNEL(uint32);
+REGISTER_SYCL_KERNEL(uint64);
 #undef REGISTER_SYCL_KERNEL
 
 #endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
-REGISTER6(BinaryOp, GPU, "BitwiseXor", functor::bitwise_xor, int8, int16, int32,
-          int64, uint8, uint16);
+REGISTER8(BinaryOp, GPU, "BitwiseXor", functor::bitwise_xor, int8, int16, int32,
+          int64, uint8, uint16, uint32, uint64);
 #endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_ceil.cc b/tensorflow/core/kernels/cwise_op_ceil.cc
index 0111e9d5fd18f1d94e8d39c5e67d16e04f21e854..816eadc80eb802de46ad4bb22521cbe6a7adf6b2 100644
--- a/tensorflow/core/kernels/cwise_op_ceil.cc
+++ b/tensorflow/core/kernels/cwise_op_ceil.cc
@@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "Ceil", functor::ceil, float, Eigen::half, double);
 
 #if TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Ceil", functor::ceil, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_cos.cc b/tensorflow/core/kernels/cwise_op_cos.cc
index d4b3b0e3935deeded3a0e07bd04056476c4cc29c..71ad0ff0dc2e3031df6177e4d067ad905c23169f 100644
--- a/tensorflow/core/kernels/cwise_op_cos.cc
+++ b/tensorflow/core/kernels/cwise_op_cos.cc
@@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Cos", functor::cos, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Cos", functor::cos, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_cosh.cc b/tensorflow/core/kernels/cwise_op_cosh.cc
index bca99a4f897d1cc601a082cc17ca6725929942a2..31b4bb3cadd9b2df5d0ae35b2c8ea4a155278a32 100644
--- a/tensorflow/core/kernels/cwise_op_cosh.cc
+++ b/tensorflow/core/kernels/cwise_op_cosh.cc
@@ -16,20 +16,18 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double, complex64,
+          complex128);
 
 #if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Cosh")                                \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::cosh<TYPE>>);
+#define REGISTER_SYCL_KERNEL(TYPE)                                \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("Cosh").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
+      UnaryOp<SYCLDevice, functor::cosh<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
 REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Cosh", functor::cosh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc
index d44c1bf473e2e778a7d31890a25359e782e1dc94..c71c756e4461d4ed36628ea8a4f8a0922896302c 100644
--- a/tensorflow/core/kernels/cwise_op_div.cc
+++ b/tensorflow/core/kernels/cwise_op_div.cc
@@ -54,5 +54,5 @@ REGISTER_KERNEL_BUILDER(Name("Div")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::safe_div<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_exp.cc b/tensorflow/core/kernels/cwise_op_exp.cc
index 9d4d65442762b88bb418bc0266b41ae37259e43f..8f4ac98016cb252c9c952bbc3c67eb2ea3a92f21 100644
--- a/tensorflow/core/kernels/cwise_op_exp.cc
+++ b/tensorflow/core/kernels/cwise_op_exp.cc
@@ -20,10 +20,11 @@ REGISTER5(UnaryOp, CPU, "Exp", functor::exp, float, Eigen::half, double,
           complex64, complex128);
 
 #if GOOGLE_CUDA
-REGISTER3(UnaryOp, GPU, "Exp", functor::exp, float, Eigen::half, double);
+REGISTER5(UnaryOp, GPU, "Exp", functor::exp, float, Eigen::half, double,
+          complex64, complex128);
 #endif
 
 #if TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Exp", functor::exp, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_expm1.cc b/tensorflow/core/kernels/cwise_op_expm1.cc
index 4f723080060041f1223dbd86aa95f1cc64f5452c..ce03ad5de6285cfa64b56e3e5357e8c916f8baf3 100644
--- a/tensorflow/core/kernels/cwise_op_expm1.cc
+++ b/tensorflow/core/kernels/cwise_op_expm1.cc
@@ -23,5 +23,5 @@ REGISTER3(UnaryOp, GPU, "Expm1", functor::expm1, float, Eigen::half, double);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Expm1", functor::expm1, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_floor.cc b/tensorflow/core/kernels/cwise_op_floor.cc
index 5a142b9ce9f8a32fe0569a78452cf710b2317760..d554d41c412bca4a8415852427190fb16f7f8f82 100644
--- a/tensorflow/core/kernels/cwise_op_floor.cc
+++ b/tensorflow/core/kernels/cwise_op_floor.cc
@@ -23,5 +23,5 @@ REGISTER3(UnaryOp, GPU, "Floor", functor::floor, float, Eigen::half, double);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Floor", functor::floor, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_floor_div.cc b/tensorflow/core/kernels/cwise_op_floor_div.cc
index fa81ef0872d4ed6545c312b865e305ee430fdccb..fecbf859897bd1560da00f54756d4a1ffb7660d4 100644
--- a/tensorflow/core/kernels/cwise_op_floor_div.cc
+++ b/tensorflow/core/kernels/cwise_op_floor_div.cc
@@ -49,5 +49,5 @@ REGISTER_KERNEL_BUILDER(Name("FloorDiv")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::safe_floor_div<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_floor_mod.cc b/tensorflow/core/kernels/cwise_op_floor_mod.cc
index 55f8a30461f16ebd52f27792f2d3b4a05fbf6977..29340b88506147eb9535893939cf28842c671cd9 100644
--- a/tensorflow/core/kernels/cwise_op_floor_mod.cc
+++ b/tensorflow/core/kernels/cwise_op_floor_mod.cc
@@ -40,5 +40,5 @@ REGISTER_KERNEL_BUILDER(Name("FloorMod")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::safe_floor_mod<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_gpu_bitwise_and.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_bitwise_and.cu.cc
index 27f973c90d73a1d7828ce180254363a0b7b4be76..3fbf69c114d3c546eafb9f6c504568a649c52e59 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_bitwise_and.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_bitwise_and.cu.cc
@@ -19,7 +19,8 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-DEFINE_BINARY6(bitwise_and, int8, int16, int32, int64, uint8, uint16);
+DEFINE_BINARY8(bitwise_and, int8, int16, int32, int64, uint8, uint16, uint32,
+               uint64);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_gpu_bitwise_or.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_bitwise_or.cu.cc
index a34c3a52cd6253527c67d2d1f8c1498756ff5be8..8bcb82266a2d3567c0f8d79b2fdccd5916b2ecbb 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_bitwise_or.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_bitwise_or.cu.cc
@@ -19,7 +19,8 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-DEFINE_BINARY6(bitwise_or, int8, int16, int32, int64, uint8, uint16);
+DEFINE_BINARY8(bitwise_or, int8, int16, int32, int64, uint8, uint16, uint32,
+               uint64);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_gpu_bitwise_xor.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_bitwise_xor.cu.cc
index a4531ab7c6f283f8e732dbc87b3c64d93a8a5bef..e62a87aba44eea0fc5b1cf13a74ddfed2ef294b6 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_bitwise_xor.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_bitwise_xor.cu.cc
@@ -19,7 +19,8 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-DEFINE_BINARY6(bitwise_xor, int8, int16, int32, int64, uint8, uint16);
+DEFINE_BINARY8(bitwise_xor, int8, int16, int32, int64, uint8, uint16, uint32,
+               uint64);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc
index e7dff5d0ac521cbe6d80efd1f591a9f23a0c650d..77723b3169fa137f0059ffd80a27e84115cb94ca 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-  DEFINE_UNARY1(conj, complex64);
-  DEFINE_UNARY1(conj, complex128);
+DEFINE_UNARY1(conj, complex64);
+DEFINE_UNARY1(conj, complex128);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc
index 3675398126f3ce13722e41b43f382c7fa1eaf111..26748ef0e724903c95f6665a5d7c00bdbd298a28 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_equal_to.cu.cc
@@ -20,7 +20,7 @@ limitations under the License.
 namespace tensorflow {
 namespace functor {
 DEFINE_BINARY10(equal_to, float, Eigen::half, double, uint8, int8, int16, int64,
-               complex64, complex128, bool);
+                complex64, complex128, bool);
 DEFINE_APPROXIMATE_EQUAL2(float, double);
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_gpu_exp.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_exp.cu.cc
index 0f492917bd54cc5b518e7fe76a8dd08b3934d1da..417e5da7588221b190d11092b6e03787a0dd15d4 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_exp.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_exp.cu.cc
@@ -19,7 +19,7 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-DEFINE_UNARY3(exp, Eigen::half, float, double);
+DEFINE_UNARY5(exp, Eigen::half, float, double, complex64, complex128);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_gpu_invert.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_invert.cu.cc
index 62f33612db079377729d8d0edde0c37d43fb9cfb..1072ef3aa687ac75dde4d1bacb60897775e74021 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_invert.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_invert.cu.cc
@@ -19,7 +19,7 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-DEFINE_UNARY6(invert, int8, int16, int32, int64, uint8, uint16);
+DEFINE_UNARY8(invert, int8, int16, int32, int64, uint8, uint16, uint32, uint64);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
index a54dbdfc247dfcbba370852f525f0ca686b6c1b4..627ecc8c802a2bbd428f9cc2160bec379d7b654b 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
@@ -15,8 +15,10 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 
-#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+#define EIGEN_USE_GPU
+
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
 
 namespace tensorflow {
 namespace functor {
@@ -38,19 +40,17 @@ struct SelectScalarFunctor<GPUDevice, T> {
                   typename TTypes<bool>::ConstScalar cond,
                   typename TTypes<T>::ConstFlat then_flat,
                   typename TTypes<T>::ConstFlat else_flat) {
-
 #if !defined(EIGEN_HAS_INDEX_LIST)
-  Eigen::array<int, 1> rank1{1};
+    Eigen::array<int, 1> rank1{1};
 #else
-  Eigen::IndexList<Eigen::type2index<1>> rank1;
+    Eigen::IndexList<Eigen::type2index<1> > rank1;
 #endif
-  const int size  = then_flat.dimension(0);
-  Eigen::array<int, 1> broadcast_dims{size};
-
-  To32Bit(out).device(d) = cond.reshape(rank1)
-                               .broadcast(broadcast_dims)
-                               .select(then_flat, else_flat);
+    const int size = then_flat.dimension(0);
+    Eigen::array<int, 1> broadcast_dims{size};
 
+    To32Bit(out).device(d) = cond.reshape(rank1)
+                                 .broadcast(broadcast_dims)
+                                 .select(then_flat, else_flat);
   }
 };
 
@@ -89,8 +89,8 @@ struct BatchSelectFunctor<GPUDevice, T> {
   }
 };
 
-#define SELECT_FUNCTOR(T)                      \
-  template struct SelectFunctor<GPUDevice, T>; \
+#define SELECT_FUNCTOR(T)                            \
+  template struct SelectFunctor<GPUDevice, T>;       \
   template struct SelectScalarFunctor<GPUDevice, T>; \
   template struct BatchSelectFunctor<GPUDevice, T>;
 
diff --git a/tensorflow/core/kernels/cwise_op_greater.cc b/tensorflow/core/kernels/cwise_op_greater.cc
index ba89899fb323c58f0a0045f3ef32a897f5f2680a..a4ea40883694540903ac80683d3a7151fac4a583 100644
--- a/tensorflow/core/kernels/cwise_op_greater.cc
+++ b/tensorflow/core/kernels/cwise_op_greater.cc
@@ -43,5 +43,5 @@ REGISTER_KERNEL_BUILDER(Name("Greater")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::greater<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_greater_equal.cc b/tensorflow/core/kernels/cwise_op_greater_equal.cc
index 8f0c483aecd7f84bbb8ac47e4c8b5877b40335d4..3f34d6269ef4a1ab0da3dae1d08da037c5507bdd 100644
--- a/tensorflow/core/kernels/cwise_op_greater_equal.cc
+++ b/tensorflow/core/kernels/cwise_op_greater_equal.cc
@@ -35,7 +35,8 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
 #endif
 
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float, double);
+REGISTER2(BinaryOp, SYCL, "GreaterEqual", functor::greater_equal, float,
+          double);
 
 REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
                             .Device(DEVICE_SYCL)
@@ -44,5 +45,5 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::greater_equal<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_invert.cc b/tensorflow/core/kernels/cwise_op_invert.cc
index df2c02e42e17f5bbcb74b637adcfb1dbd5cac3c1..98c8d7e9b2e7b727e4662d1fce2efee12a1d7663 100644
--- a/tensorflow/core/kernels/cwise_op_invert.cc
+++ b/tensorflow/core/kernels/cwise_op_invert.cc
@@ -16,17 +16,17 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER6(UnaryOp, CPU, "Invert", functor::invert, int8, int16, int32, int64,
-          uint8, uint16);
+REGISTER8(UnaryOp, CPU, "Invert", functor::invert, int8, int16, int32, int64,
+          uint8, uint16, uint32, uint64);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER6(UnaryOp, SYCL, "Invert", functor::invert, int8, int16, int32, int64,
-         uint8, uint16);
+          uint8, uint16, uint32, uint64);
 #endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
-REGISTER6(UnaryOp, GPU, "Invert", functor::invert, int8, int16, int32, int64,
-          uint8, uint16);
+REGISTER8(UnaryOp, GPU, "Invert", functor::invert, int8, int16, int32, int64,
+          uint8, uint16, uint32, uint64);
 #endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_isfinite.cc b/tensorflow/core/kernels/cwise_op_isfinite.cc
index 53ec1c1c63f17a03218535c974e591b4eec62a72..ae1e590d24290a397096cbdfdf08b7e2d348f362 100644
--- a/tensorflow/core/kernels/cwise_op_isfinite.cc
+++ b/tensorflow/core/kernels/cwise_op_isfinite.cc
@@ -26,5 +26,5 @@ REGISTER3(UnaryOp, GPU, "IsFinite", functor::isfinite, float, Eigen::half,
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "IsFinite", functor::isfinite, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_isinf.cc b/tensorflow/core/kernels/cwise_op_isinf.cc
index 4b34744304f6c856fb98d39fbadc1e1958c84238..f22ca21e1ca425978b23910c27881eed626626e4 100644
--- a/tensorflow/core/kernels/cwise_op_isinf.cc
+++ b/tensorflow/core/kernels/cwise_op_isinf.cc
@@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "IsInf", functor::isinf, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "IsInf", functor::isinf, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_isnan.cc b/tensorflow/core/kernels/cwise_op_isnan.cc
index ad2dd3f722cebba926dd04748ca146c2ecfc0848..aa180c247e7d01ef0f2898b4a50a71c3c3bc6941 100644
--- a/tensorflow/core/kernels/cwise_op_isnan.cc
+++ b/tensorflow/core/kernels/cwise_op_isnan.cc
@@ -24,5 +24,5 @@ REGISTER3(UnaryOp, GPU, "IsNan", functor::isnan, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "IsNan", functor::isnan, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_less.cc b/tensorflow/core/kernels/cwise_op_less.cc
index 136c3666dfc351fa0485eeff060a6ea3a7d48c08..00cdecdbd184b84b6601eda76dd5dfded5aa1e1b 100644
--- a/tensorflow/core/kernels/cwise_op_less.cc
+++ b/tensorflow/core/kernels/cwise_op_less.cc
@@ -42,5 +42,5 @@ REGISTER_KERNEL_BUILDER(Name("Less")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::less<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_less_equal.cc b/tensorflow/core/kernels/cwise_op_less_equal.cc
index 97a2508d1290c5afe758db9ff54a22a22b6dcac0..11806c5fc774dc3a37abc733127e4b6660f27f9c 100644
--- a/tensorflow/core/kernels/cwise_op_less_equal.cc
+++ b/tensorflow/core/kernels/cwise_op_less_equal.cc
@@ -44,5 +44,5 @@ REGISTER_KERNEL_BUILDER(Name("LessEqual")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::less_equal<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_log.cc b/tensorflow/core/kernels/cwise_op_log.cc
index 7fdfdff0e38ea2bfe18acac86b148a4e1e944117..98936e0f960f1f407c2187746ca80d3db0a93412 100644
--- a/tensorflow/core/kernels/cwise_op_log.cc
+++ b/tensorflow/core/kernels/cwise_op_log.cc
@@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Log", functor::log, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Log", functor::log, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_log1p.cc b/tensorflow/core/kernels/cwise_op_log1p.cc
index 25ad7b24bb1cee3a09c4ea81cccf79b6a4dabeb9..162ca9e07cdc862e04276aca0dce0ad2f4cfc70e 100644
--- a/tensorflow/core/kernels/cwise_op_log1p.cc
+++ b/tensorflow/core/kernels/cwise_op_log1p.cc
@@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Log1p", functor::log1p, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Log1p", functor::log1p, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_maximum.cc b/tensorflow/core/kernels/cwise_op_maximum.cc
index 87d54e380b4b923f72aff1eb33d56dd7d8a0dd11..e8a58eea80e611d29886af773be5f1ee061d6f66 100644
--- a/tensorflow/core/kernels/cwise_op_maximum.cc
+++ b/tensorflow/core/kernels/cwise_op_maximum.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(BinaryOp, CPU, "Maximum", functor::maximum, float, Eigen::half,
-          double, int32, int64);
+REGISTER6(BinaryOp, CPU, "Maximum", functor::maximum, float, Eigen::half,
+          bfloat16, double, int32, int64);
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "Maximum", functor::maximum, float, Eigen::half,
           double, int64);
@@ -43,5 +43,5 @@ REGISTER_KERNEL_BUILDER(Name("Maximum")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::maximum<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_minimum.cc b/tensorflow/core/kernels/cwise_op_minimum.cc
index 442171193bfeb41e8594bf708590fc4d52291685..dff83df828f076a076a8f220d04974344d8ffafc 100644
--- a/tensorflow/core/kernels/cwise_op_minimum.cc
+++ b/tensorflow/core/kernels/cwise_op_minimum.cc
@@ -43,6 +43,6 @@ REGISTER_KERNEL_BUILDER(Name("Minimum")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::minimum<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_mul_1.cc b/tensorflow/core/kernels/cwise_op_mul_1.cc
index 023eb07ca3f52f49c95b5b6450e3417b7cbeabe4..0e8d2e37350dbbb942bd5ed6b16392b6288313fe 100644
--- a/tensorflow/core/kernels/cwise_op_mul_1.cc
+++ b/tensorflow/core/kernels/cwise_op_mul_1.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double,
-          uint8, int32);
+REGISTER5(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, uint8,
+          int32);
 #if defined(__ANDROID_TYPES_SLIM__)
 // We only register the first type when we have multi-argument calls in the
 // case where we're trying to reduce executable size, but it turns out that the
@@ -28,7 +28,7 @@ REGISTER(BinaryOp, CPU, "Mul", functor::mul, int32);
 
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "Mul", functor::mul, float, Eigen::half, double,
-           uint8);
+          uint8);
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
 // registration requires all int32 inputs and outputs to be in host memory.
@@ -50,5 +50,5 @@ REGISTER_KERNEL_BUILDER(Name("Mul")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::mul<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_mul_2.cc b/tensorflow/core/kernels/cwise_op_mul_2.cc
index 7be5857cc06d0f6755d3f4cba2ca67f009740d46..6aa8f8836406ab4f350bc7b6cc1e88bd612ad933 100644
--- a/tensorflow/core/kernels/cwise_op_mul_2.cc
+++ b/tensorflow/core/kernels/cwise_op_mul_2.cc
@@ -22,11 +22,11 @@ namespace tensorflow {
 // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
 #if !defined(__ANDROID_TYPES_SLIM__)
 
-REGISTER6(BinaryOp, CPU, "Mul", functor::mul,
-          int8, uint16, int16, int64, complex64, complex128);
+REGISTER6(BinaryOp, CPU, "Mul", functor::mul, int8, uint16, int16, int64,
+          complex64, complex128);
 #if GOOGLE_CUDA
 REGISTER6(BinaryOp, GPU, "Mul", functor::mul, int8, uint16, int16, int64,
-           complex64, complex128);
+          complex64, complex128);
 
 #endif  // GOOGLE_CUDA
 
diff --git a/tensorflow/core/kernels/cwise_op_neg.cc b/tensorflow/core/kernels/cwise_op_neg.cc
index 536891b548f043cb25726d70bfdd362ed0294512..a136769b912718a5749273050a2226da3fa9e3cf 100644
--- a/tensorflow/core/kernels/cwise_op_neg.cc
+++ b/tensorflow/core/kernels/cwise_op_neg.cc
@@ -27,7 +27,7 @@ REGISTER_KERNEL_BUILDER(Name("Neg")
                             .HostMemory("y")
                             .TypeConstraint<int32>("T"),
                         UnaryOp<CPUDevice, functor::neg<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 REGISTER6(UnaryOp, GPU, "Neg", functor::neg, float, Eigen::half, double, int64,
diff --git a/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc b/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc
index 7bd81ee12719618181a75907ce547815b1076b84..02cd298745795294bfb8117a24ba930a7f471788 100644
--- a/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc
+++ b/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 namespace tensorflow {
 REGISTER6(BinaryOp, CPU, "NotEqual", functor::not_equal_to, float, Eigen::half,
-           double, uint8, int8, int16);
+          double, uint8, int8, int16);
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "NotEqual", functor::not_equal_to, float, Eigen::half,
           double, uint8);
diff --git a/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc b/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc
index 7d4ecec59f1564c90c11bb05d6e96c7e1b52a60d..05bdea66367c6d525469dd9cdc28b56d3e4c2adc 100644
--- a/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc
+++ b/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc
@@ -30,5 +30,5 @@ REGISTER6(BinaryOp, GPU, "NotEqual", functor::not_equal_to, int8, int16, int64,
 
 #endif  // GOOGLE_CUDA
 
-#endif   // !defined(__ANDROID_TYPES_SLIM__)
+#endif  // !defined(__ANDROID_TYPES_SLIM__)
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_pow.cc b/tensorflow/core/kernels/cwise_op_pow.cc
index 5fb0735ac19ba9eb057dd68c7f2d849c65d5edaa..cf86478b0fe43c777563e62e6b3fea9c7d2e6575 100644
--- a/tensorflow/core/kernels/cwise_op_pow.cc
+++ b/tensorflow/core/kernels/cwise_op_pow.cc
@@ -16,8 +16,9 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER7(BinaryOp, CPU, "Pow", functor::pow, float, Eigen::half, double, int32,
-          int64, complex64, complex128);
+REGISTER5(BinaryOp, CPU, "Pow", functor::pow, float, Eigen::half, double,
+          complex64, complex128);
+REGISTER2(BinaryOp, CPU, "Pow", functor::safe_pow, int32, int64);
 
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "Pow", functor::pow, float, Eigen::half, double,
@@ -25,5 +26,5 @@ REGISTER4(BinaryOp, GPU, "Pow", functor::pow, float, Eigen::half, double,
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(BinaryOp, SYCL, "Pow", functor::pow, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_reciprocal.cc b/tensorflow/core/kernels/cwise_op_reciprocal.cc
index 8c0e21f9cf3535dd5f62657de165150f9efcae2e..aee25747b866c910a799b76e3b00b699bef41566 100644
--- a/tensorflow/core/kernels/cwise_op_reciprocal.cc
+++ b/tensorflow/core/kernels/cwise_op_reciprocal.cc
@@ -38,7 +38,7 @@ REGISTER4(UnaryOp, GPU, "Reciprocal", functor::inverse, float, Eigen::half,
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER(UnaryOp, SYCL, "Reciprocal", functor::inverse, float);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER5(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float,
           Eigen::half, double, complex64, complex128);
@@ -48,5 +48,5 @@ REGISTER3(SimpleBinaryOp, GPU, "ReciprocalGrad", functor::inverse_grad, float,
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER(SimpleBinaryOp, SYCL, "ReciprocalGrad", functor::inverse_grad, float);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc
index 3dd9de8d897479456c462ea068c5eda6354b199b..e259daaba47e2d0ab434e47b39376f7b723bdc9d 100644
--- a/tensorflow/core/kernels/cwise_op_select.cc
+++ b/tensorflow/core/kernels/cwise_op_select.cc
@@ -30,7 +30,7 @@ typedef Eigen::GpuDevice GPUDevice;
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 class SelectOp : public OpKernel {
@@ -185,7 +185,7 @@ REGISTER_SELECT_SYCL(double);
 REGISTER_SELECT_SYCL(int32);
 REGISTER_SELECT_SYCL(int64);
 #undef REGISTER_SELECT_SYCL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace functor {
 
@@ -201,13 +201,11 @@ struct SelectFunctorBase {
 };
 
 template <typename T>
-struct SelectFunctor<CPUDevice, T>
-        : SelectFunctorBase<CPUDevice, T> {};
+struct SelectFunctor<CPUDevice, T> : SelectFunctorBase<CPUDevice, T> {};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
-struct SelectFunctor<SYCLDevice, T>
-        : SelectFunctorBase<SYCLDevice, T> {};
-#endif // TENSORFLOW_USE_SYCL
+struct SelectFunctor<SYCLDevice, T> : SelectFunctorBase<SYCLDevice, T> {};
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 struct SelectScalarFunctorBase {
@@ -222,12 +220,12 @@ struct SelectScalarFunctorBase {
 // CPU Specializations of Select functors with scalar
 template <typename T>
 struct SelectScalarFunctor<CPUDevice, T>
-        : SelectScalarFunctorBase<CPUDevice, T> {};
+    : SelectScalarFunctorBase<CPUDevice, T> {};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct SelectScalarFunctor<SYCLDevice, T>
-        : SelectScalarFunctorBase<SYCLDevice, T> {};
-#endif // TENSORFLOW_USE_SYCL
+    : SelectScalarFunctorBase<SYCLDevice, T> {};
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 struct BatchSelectFunctorBase {
@@ -240,8 +238,8 @@ struct BatchSelectFunctorBase {
     const Eigen::DenseIndex all_but_batch = then_flat_outer_dims.dimension(1);
 
 #if !defined(EIGEN_HAS_INDEX_LIST)
-    Eigen::array<Eigen::DenseIndex, 2> broadcast_dims{{ 1, all_but_batch }};
-    Eigen::Tensor<Eigen::DenseIndex, 2>::Dimensions reshape_dims{{ batch, 1 }};
+    Eigen::array<Eigen::DenseIndex, 2> broadcast_dims{{1, all_but_batch}};
+    Eigen::Tensor<Eigen::DenseIndex, 2>::Dimensions reshape_dims{{batch, 1}};
 #else
     Eigen::IndexList<Eigen::type2index<1>, Eigen::DenseIndex> broadcast_dims;
     broadcast_dims.set(1, all_but_batch);
@@ -257,13 +255,13 @@ struct BatchSelectFunctorBase {
 };
 
 template <typename T>
-struct BatchSelectFunctor<CPUDevice, T>
-        : BatchSelectFunctorBase<CPUDevice, T> {};
+struct BatchSelectFunctor<CPUDevice, T> : BatchSelectFunctorBase<CPUDevice, T> {
+};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct BatchSelectFunctor<SYCLDevice, T>
-        : BatchSelectFunctorBase<SYCLDevice, T> {};
-#endif // TENSORFLOW_USE_SYCL
+    : BatchSelectFunctorBase<SYCLDevice, T> {};
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 
diff --git a/tensorflow/core/kernels/cwise_op_sigmoid.cc b/tensorflow/core/kernels/cwise_op_sigmoid.cc
index a76a088ac8f762a1aa980170ba4617b0c66c6e47..c132fdb63f2b8669294de63ec6cb8567002e9bdd 100644
--- a/tensorflow/core/kernels/cwise_op_sigmoid.cc
+++ b/tensorflow/core/kernels/cwise_op_sigmoid.cc
@@ -25,7 +25,7 @@ REGISTER3(UnaryOp, GPU, "Sigmoid", functor::sigmoid, float, Eigen::half,
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER(UnaryOp, SYCL, "Sigmoid", functor::sigmoid, float);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER5(SimpleBinaryOp, CPU, "SigmoidGrad", functor::sigmoid_grad, float,
           Eigen::half, double, complex64, complex128);
@@ -35,6 +35,6 @@ REGISTER3(SimpleBinaryOp, GPU, "SigmoidGrad", functor::sigmoid_grad, float,
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER(SimpleBinaryOp, SYCL, "SigmoidGrad", functor::sigmoid_grad, float);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sign.cc b/tensorflow/core/kernels/cwise_op_sign.cc
index a4084d5ad1796f5af1ce1a62e76c9dc6b473586d..02915ff4ce4547516e6e12bc250b605135d70521 100644
--- a/tensorflow/core/kernels/cwise_op_sign.cc
+++ b/tensorflow/core/kernels/cwise_op_sign.cc
@@ -41,6 +41,6 @@ REGISTER_KERNEL_BUILDER(Name("Sign")
                             .HostMemory("y")
                             .TypeConstraint<int32>("T"),
                         UnaryOp<CPUDevice, functor::sign<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sin.cc b/tensorflow/core/kernels/cwise_op_sin.cc
index b91ff1ac30ba8e7259223e011aa1e70b0a05f623..16c6057864073596592b62f4463cfd1229d3a415 100644
--- a/tensorflow/core/kernels/cwise_op_sin.cc
+++ b/tensorflow/core/kernels/cwise_op_sin.cc
@@ -25,5 +25,5 @@ REGISTER3(UnaryOp, GPU, "Sin", functor::sin, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Sin", functor::sin, float, double);
-#endif // TENSORFLOW_USE_SYC
+#endif  // TENSORFLOW_USE_SYC
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sinh.cc b/tensorflow/core/kernels/cwise_op_sinh.cc
index 055f0b12e14b1e1059600b968584a2ff9924237f..26b7a940aa8dd4fd6ce439eac17b6fd44d0fe3fd 100644
--- a/tensorflow/core/kernels/cwise_op_sinh.cc
+++ b/tensorflow/core/kernels/cwise_op_sinh.cc
@@ -16,20 +16,18 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double,
-          complex64, complex128);
+REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double, complex64,
+          complex128);
 
 #if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Sinh")                                \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::sinh<TYPE>>);
+#define REGISTER_SYCL_KERNEL(TYPE)                                \
+  REGISTER_KERNEL_BUILDER(                                        \
+      Name("Sinh").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
+      UnaryOp<SYCLDevice, functor::sinh<TYPE>>);
 REGISTER_SYCL_KERNEL(float);
 REGISTER_SYCL_KERNEL(double);
 #undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYC
+#endif  // TENSORFLOW_USE_SYC
 
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Sinh", functor::sinh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_sqrt.cc b/tensorflow/core/kernels/cwise_op_sqrt.cc
index 00efbb00f1501669b221682c565b4843c0497128..497756133d05249141823481e6ef43b73a84660b 100644
--- a/tensorflow/core/kernels/cwise_op_sqrt.cc
+++ b/tensorflow/core/kernels/cwise_op_sqrt.cc
@@ -25,7 +25,7 @@ REGISTER3(UnaryOp, GPU, "Sqrt", functor::sqrt, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Sqrt", functor::sqrt, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER5(SimpleBinaryOp, CPU, "SqrtGrad", functor::sqrt_grad, float,
           Eigen::half, double, complex64, complex128);
@@ -36,5 +36,5 @@ REGISTER3(SimpleBinaryOp, GPU, "SqrtGrad", functor::sqrt_grad, float,
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(SimpleBinaryOp, SYCL, "SqrtGrad", functor::sqrt_grad, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_square.cc b/tensorflow/core/kernels/cwise_op_square.cc
index 07a4b0b084d804c46a8a4a0bc272f78b22d7e845..7fc2f6bf08b2c825f471123e1ab58bd060f6070a 100644
--- a/tensorflow/core/kernels/cwise_op_square.cc
+++ b/tensorflow/core/kernels/cwise_op_square.cc
@@ -42,5 +42,5 @@ REGISTER_KERNEL_BUILDER(Name("Square")
                             .HostMemory("y")
                             .TypeConstraint<int32>("T"),
                         UnaryOp<CPUDevice, functor::square<int32>>);
-#endif // TENSORFLOW_USE_SYC
+#endif  // TENSORFLOW_USE_SYC
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sub.cc b/tensorflow/core/kernels/cwise_op_sub.cc
index 6adaecba04bfcf1b42a760d712eece493131ade2..025041946ac71f0e8f4724f9432d5e2901e348cc 100644
--- a/tensorflow/core/kernels/cwise_op_sub.cc
+++ b/tensorflow/core/kernels/cwise_op_sub.cc
@@ -53,5 +53,5 @@ REGISTER_KERNEL_BUILDER(Name("Sub")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::sub<int32>>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_tan.cc b/tensorflow/core/kernels/cwise_op_tan.cc
index 7891b1183dd56b9809ef7f5dc76c3f04fe605b02..c1a25767d3146abc43442cc25b48378c74f8e984 100644
--- a/tensorflow/core/kernels/cwise_op_tan.cc
+++ b/tensorflow/core/kernels/cwise_op_tan.cc
@@ -24,5 +24,5 @@ REGISTER2(UnaryOp, GPU, "Tan", functor::tan, float, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Tan", functor::tan, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_tanh.cc b/tensorflow/core/kernels/cwise_op_tanh.cc
index 8b3900892c300ee266b1a7fb066ef79c88c3d087..c5005f5ea8aa3e0b392bd038983d1658c8c56520 100644
--- a/tensorflow/core/kernels/cwise_op_tanh.cc
+++ b/tensorflow/core/kernels/cwise_op_tanh.cc
@@ -26,7 +26,7 @@ REGISTER3(UnaryOp, GPU, "Tanh", functor::tanh, float, Eigen::half, double);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER2(UnaryOp, SYCL, "Tanh", functor::tanh, float, double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER5(SimpleBinaryOp, CPU, "TanhGrad", functor::tanh_grad, float,
           Eigen::half, double, complex64, complex128);
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index d32185b6bf48f7b6d49f355c0653004310bde533..06918075a42648a3cf7135376d728fa466e7c469 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -21,29 +21,34 @@ limitations under the License.
 #include <type_traits>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
 #include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 
 namespace Eigen {
-namespace internal {
+namespace numext {
+#if GOOGLE_CUDA
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<float> exp(
+    const std::complex<float>& x) {
+  auto com = ::expf(x.real());
+  auto res_real = com * ::cosf(x.imag());
+  auto res_imag = com * ::sinf(x.imag());
+  return std::complex<float>(res_real, res_imag);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<double> exp(
+    const std::complex<double>& x) {
+  auto com = ::exp(x.real());
+  auto res_real = com * ::cos(x.imag());
+  auto res_imag = com * ::sin(x.imag());
+  return std::complex<double>(res_real, res_imag);
+}
+#endif
+}  // namespace numext
 
-// TODO(rmlarsen): Get rid of fmod2 once fmod is upstreamed to Eigen.
-template <typename T>
-struct scalar_fmod2_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod2_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a,
-                                                           const T& b) const {
-    return std::fmod(a, b);
-  }
-};
-template <typename T>
-struct functor_traits<scalar_fmod2_op<T>> {
-  enum {
-    Cost = 13,  // Reciprocal throughput of FPREM on Haswell.
-    PacketAccess = false,
-  };
-};
+namespace internal {
 
 template <typename T>
 struct scalar_asinh_op {
@@ -111,6 +116,35 @@ struct functor_traits<scalar_binary_pow_op_google<Scalar, Exponent>> {
   enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
 };
 
+template <typename Scalar, typename Exponent>
+struct safe_scalar_binary_pow_op {
+  static_assert(std::is_integral<Scalar>::value, "Integer type expected");
+  static_assert(std::is_integral<Exponent>::value &&
+                    std::is_signed<Exponent>::value,
+                "Signed integer type expected");
+
+  bool* const error;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE safe_scalar_binary_pow_op(bool* error)
+      : error(error) {}
+
+  EIGEN_DEVICE_FUNC inline Scalar operator()(const Scalar& a,
+                                             const Exponent& b) const {
+    const Exponent safe_b = tensorflow::internal::SubtleMustCopy(b);
+    if (TF_PREDICT_TRUE(safe_b >= 0)) {
+      return numext::pow(a, safe_b);
+    } else {
+      *error = true;
+      return 0;
+    }
+  }
+};
+
+template <typename Scalar, typename Exponent>
+struct functor_traits<safe_scalar_binary_pow_op<Scalar, Exponent>> {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+
 template <typename T, typename DivOrMod>
 struct safe_div_or_mod_op {
   static_assert(std::is_integral<T>::value, "Integer type expected");
@@ -702,7 +736,7 @@ struct safe_div : base<T, Eigen::internal::safe_div_or_mod_op<
 };
 
 template <typename T>
-struct fmod : base<T, Eigen::internal::scalar_fmod2_op<T>> {};
+struct fmod : base<T, Eigen::internal::scalar_fmod_op<T>> {};
 
 template <typename T>
 struct mod : base<T, Eigen::internal::scalar_mod2_op<T>> {};
@@ -737,6 +771,11 @@ struct floor_div_real : base<T, Eigen::internal::google_floor_div_real<T>> {};
 template <typename T>
 struct pow : base<T, Eigen::internal::scalar_binary_pow_op_google<T, T>> {};
 
+template <typename T>
+struct safe_pow : base<T, Eigen::internal::safe_scalar_binary_pow_op<T, T>> {
+  static const bool has_errors = true;
+};
+
 template <typename T>
 struct maximum : base<T, Eigen::internal::scalar_max_op<T>> {};
 
diff --git a/tensorflow/core/kernels/cwise_ops_common.cc b/tensorflow/core/kernels/cwise_ops_common.cc
index 693c6467ac592e3357e5b06a620a64b3829bc938..980edffceb35ee3f3d7f3557093baec1487a9b5a 100644
--- a/tensorflow/core/kernels/cwise_ops_common.cc
+++ b/tensorflow/core/kernels/cwise_ops_common.cc
@@ -40,6 +40,11 @@ void BinaryOpShared::SetComputeError(OpKernelContext* ctx) {
   if ((op == "Div" || op == "Mod" || op == "FloorMod" || op == "FloorDiv") &&
       DataTypeIsInteger(ctx->op_kernel().input_type(0))) {
     ctx->CtxFailure(errors::InvalidArgument("Integer division by zero"));
+  } else if ((op == "Pow") &&
+             DataTypeIsInteger(ctx->op_kernel().input_type(0)) &&
+             DataTypeIsSigned(ctx->op_kernel().input_type(1))) {
+    ctx->CtxFailure(errors::InvalidArgument(
+        "Integers to negative integer powers are not allowed"));
   } else {
     ctx->CtxFailure(
         errors::Internal("Unexpected error in binary operator "
@@ -52,9 +57,9 @@ BinaryOpShared::BinaryOpState::BinaryOpState(OpKernelContext* ctx)
       in1(ctx->input(1)),
       bcast(BCast::FromShape(in0.shape()), BCast::FromShape(in1.shape())) {
   if (!bcast.IsValid()) {
-    ctx->SetStatus(errors::InvalidArgument("Incompatible shapes: ",
-                                           in0.shape().DebugString(), " vs. ",
-                                           in1.shape().DebugString()));
+    ctx->SetStatus(errors::InvalidArgument(
+        "Incompatible shapes: ", in0.shape().DebugString(), " vs. ",
+        in1.shape().DebugString()));
     return;
   }
   const TensorShape output_shape = BCast::ToShape(bcast.output_shape());
diff --git a/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
index 6dd108f7226ab5a64b8c074afa9ab219f045158a..965e42dcce1b24460d28e24cd33c520598ecfc41 100644
--- a/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
+++ b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
@@ -136,6 +136,9 @@ struct ApproximateEqual<GPUDevice, T> {
 #define DEFINE_UNARY7(F, T0, T1, T2, T3, T4, T5, T6) \
   DEFINE_UNARY2(F, T0, T1);                          \
   DEFINE_UNARY5(F, T2, T3, T4, T5, T6)
+#define DEFINE_UNARY8(F, T0, T1, T2, T3, T4, T5, T6, T7) \
+  DEFINE_UNARY4(F, T0, T1, T2, T3);                      \
+  DEFINE_UNARY4(F, T4, T5, T6, T7)
 
 // Macros to explicitly instantiate kernels on GPU for multiple types
 // (T0, T1, etc.) for BinaryFunctor.
diff --git a/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h b/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h
index 439477070893d37a9fcb7b662e379cce2955b07a..e81b840a509ada73e62a763b203763d9e4e65363 100644
--- a/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h
+++ b/tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h
@@ -50,16 +50,16 @@ struct SimpleBinaryFunctor<GPUDevice, Functor> {
 
 // Macros to explicitly instantiate kernels on GPU for multiple types
 // (T0, T1, etc.) for SimpleBiaryFunctor (e.g., functor::tanh_grad).
-#define DEFINE_SIMPLE_BINARY1(F, T)                  \
+#define DEFINE_SIMPLE_BINARY1(F, T) \
   template struct SimpleBinaryFunctor<GPUDevice, F<T> >
-#define DEFINE_SIMPLE_BINARY2(F, T0, T1)             \
-  DEFINE_SIMPLE_BINARY1(F, T0);                      \
+#define DEFINE_SIMPLE_BINARY2(F, T0, T1) \
+  DEFINE_SIMPLE_BINARY1(F, T0);          \
   DEFINE_SIMPLE_BINARY1(F, T1)
-#define DEFINE_SIMPLE_BINARY3(F, T0, T1, T2)         \
-  DEFINE_SIMPLE_BINARY2(F, T0, T1);                  \
+#define DEFINE_SIMPLE_BINARY3(F, T0, T1, T2) \
+  DEFINE_SIMPLE_BINARY2(F, T0, T1);          \
   DEFINE_SIMPLE_BINARY1(F, T2)
-#define DEFINE_SIMPLE_BINARY4(F, T0, T1, T2, T3)     \
-  DEFINE_SIMPLE_BINARY2(F, T0, T1);                  \
+#define DEFINE_SIMPLE_BINARY4(F, T0, T1, T2, T3) \
+  DEFINE_SIMPLE_BINARY2(F, T0, T1);              \
   DEFINE_SIMPLE_BINARY2(F, T2, T3)
 #define DEFINE_SIMPLE_BINARY5(F, T0, T1, T2, T3, T4) \
   DEFINE_SIMPLE_BINARY2(F, T0, T1);                  \
diff --git a/tensorflow/core/kernels/cwise_ops_gradients.h b/tensorflow/core/kernels/cwise_ops_gradients.h
index 77b330f5899815d5784659515e43ee497bdca58e..82cdae9a348aaf3625e1e4cf9f80ea7768694062 100644
--- a/tensorflow/core/kernels/cwise_ops_gradients.h
+++ b/tensorflow/core/kernels/cwise_ops_gradients.h
@@ -171,7 +171,6 @@ struct SimpleBinaryFunctor<CPUDevice, Functor> {
   }
 };
 
-
 #ifdef TENSORFLOW_USE_SYCL
 // Partial specialization of BinaryFunctor for SYCL devices
 typedef Eigen::SyclDevice SYCLDevice;
@@ -184,7 +183,7 @@ struct SimpleBinaryFunctor<SYCLDevice, Functor> {
   }
 };
 
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename T>
 struct tanh_grad : base<T, Eigen::internal::scalar_tanh_gradient_op<T>> {};
diff --git a/tensorflow/core/kernels/cwise_ops_sycl_common.h b/tensorflow/core/kernels/cwise_ops_sycl_common.h
index 3f6ff7303d627ca64abd0f93658bf1b40ce4d71e..3e107cee04c787d71326bbe4799565f8609f6f4e 100644
--- a/tensorflow/core/kernels/cwise_ops_sycl_common.h
+++ b/tensorflow/core/kernels/cwise_ops_sycl_common.h
@@ -51,7 +51,8 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
   void operator()(const SYCLDevice& d, typename Functor::tout_type out,
                   typename Functor::tin_type in0,
                   typename Functor::tin_type in1, bool* error) {
-    To32Bit(out).device(d) = To32Bit(in0).binaryExpr(To32Bit(in1), typename Functor::func());
+    To32Bit(out).device(d) =
+        To32Bit(in0).binaryExpr(To32Bit(in1), typename Functor::func());
   }
 
   void Left(const SYCLDevice& d, typename Functor::tout_type out,
@@ -61,7 +62,9 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
     constexpr int NumDims = Functor::tin_type::NumDimensions;
     static_assert(NumDims == 1, "Unexpected size");
     Eigen::Sizes<1> scalar_dim;
-    out.device(d) = scalar.reshape(scalar_dim).broadcast(in.dimensions()).binaryExpr(in, Binary());
+    out.device(d) = scalar.reshape(scalar_dim)
+                        .broadcast(in.dimensions())
+                        .binaryExpr(in, Binary());
   }
 
   void Right(const SYCLDevice& d, typename Functor::tout_type out,
@@ -71,7 +74,8 @@ struct BinaryFunctor<SYCLDevice, Functor, NDIMS, has_errors> {
     constexpr int NumDims = Functor::tin_type::NumDimensions;
     static_assert(NumDims == 1, "Unexpected size");
     Eigen::Sizes<1> scalar_dim;
-    out.device(d) = in.binaryExpr(scalar.reshape(scalar_dim).broadcast(in.dimensions()), Binary());
+    out.device(d) = in.binaryExpr(
+        scalar.reshape(scalar_dim).broadcast(in.dimensions()), Binary());
   }
 
   void BCast(const SYCLDevice& d,
diff --git a/tensorflow/core/kernels/cwise_ops_test.cc b/tensorflow/core/kernels/cwise_ops_test.cc
index bca0f1004d5f41fd3c8fd8b4eebd44c981053520..39f497e71612fc08a085e410edae73669fc9993a 100644
--- a/tensorflow/core/kernels/cwise_ops_test.cc
+++ b/tensorflow/core/kernels/cwise_ops_test.cc
@@ -54,36 +54,36 @@ int ColsFromArg(int arg) { return (arg % kRows); }
 BM_UNARY(cpu, Floor, float, DT_FLOAT);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Floor, float, DT_FLOAT);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_UNARY(sycl, Floor, float, DT_FLOAT);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 BM_UNARY(cpu, Floor, double, DT_DOUBLE);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Floor, double, DT_DOUBLE);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_UNARY(sycl, Floor, double, DT_DOUBLE);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 BM_UNARY(cpu, Conj, std::complex<float>, DT_COMPLEX64);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Conj, std::complex<float>, DT_COMPLEX64);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 BM_UNARY(cpu, Conj, std::complex<double>, DT_COMPLEX128);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Conj, std::complex<double>, DT_COMPLEX128);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 
 BM_UNARY(cpu, Rint, double, DT_DOUBLE);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Rint, double, DT_DOUBLE);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 BM_UNARY(cpu, Rint, float, DT_FLOAT);
 #if GOOGLE_CUDA
 BM_UNARY(gpu, Rint, float, DT_FLOAT);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 
 // data func scalar.
 Graph* BinaryScalar(int num, const string& func) {
@@ -113,18 +113,18 @@ Graph* BinaryScalar(int num, const string& func) {
 BM_BINARY_SCALAR(cpu, Less);
 #if GOOGLE_CUDA
 BM_BINARY_SCALAR(gpu, Less);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_BINARY_SCALAR(sycl, Less);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 BM_BINARY_SCALAR(cpu, Add);
 #if GOOGLE_CUDA
 BM_BINARY_SCALAR(gpu, Add);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_BINARY_SCALAR(sycl, Add);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 #undef BM_BINARY_SCALAR
 
 template <class T>
@@ -163,11 +163,11 @@ using Eigen::half;
 BM_BIAS_ADD_ALL(cpu, float, DT_FLOAT);
 #if GOOGLE_CUDA
 BM_BIAS_ADD_ALL(gpu, float, DT_FLOAT);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 BM_BIAS_ADD_ALL(cpu, half, DT_HALF);
 #if GOOGLE_CUDA
 BM_BIAS_ADD_ALL(gpu, half, DT_HALF);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #undef BM_BIAS_ADD_ALL
 #undef BM_BIAS_ADD
 
@@ -217,15 +217,15 @@ using Eigen::half;
 #if GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, float, DT_FLOAT);
 BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, half, DT_HALF);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, float, DT_FLOAT);
 #if GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, float, DT_FLOAT);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, half, DT_HALF);
 #if GOOGLE_CUDA
 BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, half, DT_HALF);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #undef BM_BIAS_ADD_GRAD_ALL
 #undef BM_BIAS_ADD_GRAD
 
@@ -265,10 +265,10 @@ Graph* BcastAdd(int rows, int cols, int dim) {
 BM_BCAST_ADD_ROW_ALL(cpu);
 #if GOOGLE_CUDA
 BM_BCAST_ADD_ROW_ALL(gpu);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_BCAST_ADD_ROW_ALL(sycl);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 #undef BM_BCAST_ADD_ROW_ALL
 #undef BM_BCAST_ADD_ROW
 
@@ -291,10 +291,10 @@ BM_BCAST_ADD_ROW_ALL(sycl);
 BM_BCAST_ADD_COL_ALL(cpu);
 #if GOOGLE_CUDA
 BM_BCAST_ADD_COL_ALL(gpu);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 BM_BCAST_ADD_COL_ALL(sycl);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 #undef BM_BCAST_ADD_COL_ALL
 #undef BM_BCAST_ADD_COL
 
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..1e3b0c231f35c12d2e9e23d8d503b3a7492ab676
--- /dev/null
+++ b/tensorflow/core/kernels/data/BUILD
@@ -0,0 +1,549 @@
+# Description:
+#   OpKernels for tf.data
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_kernel_library",
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+cc_library(
+    name = "stats_aggregator",
+    hdrs = ["stats_aggregator.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_kernel_library(
+    name = "stats_aggregator_ops",
+    srcs = ["stats_aggregator_ops.cc"],
+    deps = [
+        ":stats_aggregator",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+# TODO(mrry): Remove this empty forwarding library.
+cc_library(
+    name = "dataset",
+    srcs = [],
+    hdrs = ["dataset.h"],
+    deps = [
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "dataset_utils",
+    srcs = ["dataset_utils.cc"],
+    hdrs = ["dataset_utils.h"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+cc_library(
+    name = "captured_function",
+    srcs = ["captured_function.cc"],
+    hdrs = ["captured_function.h"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/kernels:variable_ops",
+    ],
+)
+
+cc_library(
+    name = "window_dataset",
+    srcs = ["window_dataset.cc"],
+    hdrs = ["window_dataset.h"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "batch_dataset_op",
+    srcs = ["batch_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:batch_util",
+    ],
+)
+
+tf_kernel_library(
+    name = "padded_batch_dataset_op",
+    srcs = ["padded_batch_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:batch_util",
+    ],
+)
+
+tf_kernel_library(
+    name = "dense_to_sparse_batch_dataset_op",
+    srcs = ["dense_to_sparse_batch_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "group_by_window_dataset_op",
+    srcs = ["group_by_window_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        ":window_dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "filter_dataset_op",
+    srcs = ["filter_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "map_dataset_op",
+    srcs = ["map_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "map_and_batch_dataset_op",
+    srcs = ["map_and_batch_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:inplace_ops",
+    ],
+)
+
+tf_kernel_library(
+    name = "parallel_map_dataset_op",
+    srcs = ["parallel_map_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "scan_dataset_op",
+    srcs = ["scan_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "flat_map_dataset_op",
+    srcs = ["flat_map_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        ":dataset_utils",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "interleave_dataset_op",
+    srcs = ["interleave_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        ":dataset_utils",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "parallel_interleave_dataset_op",
+    srcs = ["parallel_interleave_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        ":dataset_utils",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "prefetch_dataset_op",
+    srcs = ["prefetch_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "repeat_dataset_op",
+    srcs = ["repeat_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "take_dataset_op",
+    srcs = ["take_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "skip_dataset_op",
+    srcs = ["skip_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "stats_dataset_ops",
+    srcs = ["stats_dataset_ops.cc"],
+    deps = [
+        ":dataset",
+        ":stats_aggregator",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "random_dataset_op",
+    srcs = ["random_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "range_dataset_op",
+    srcs = ["range_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "shuffle_dataset_op",
+    srcs = ["shuffle_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "sparse_tensor_slice_dataset_op",
+    srcs = ["sparse_tensor_slice_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "tensor_dataset_op",
+    srcs = ["tensor_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "tensor_queue_dataset_op",
+    srcs = ["tensor_queue_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:batch_util",
+    ],
+)
+
+tf_kernel_library(
+    name = "tensor_slice_dataset_op",
+    srcs = ["tensor_slice_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:batch_util",
+    ],
+)
+
+tf_kernel_library(
+    name = "zip_dataset_op",
+    srcs = ["zip_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "concatenate_dataset_op",
+    srcs = ["concatenate_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "reader_dataset_ops",
+    srcs = ["reader_dataset_ops.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "sql_dataset_ops",
+    srcs = [
+        "sql_dataset_ops.cc",
+    ],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data/sql",
+    ],
+)
+
+tf_kernel_library(
+    name = "iterator_ops",
+    srcs = ["iterator_ops.cc"],
+    deps = [
+        ":dataset",
+        ":stats_aggregator",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:session_options",
+        "//tensorflow/core/kernels:ops_util",
+    ],
+)
+
+tf_kernel_library(
+    name = "cache_dataset_ops",
+    srcs = ["cache_dataset_ops.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/util/tensor_bundle",
+    ],
+)
+
+tf_kernel_library(
+    name = "unique_dataset_op",
+    srcs = ["unique_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "dataset_ops",
+    deps = [
+        ":batch_dataset_op",
+        ":cache_dataset_ops",
+        ":concatenate_dataset_op",
+        ":dense_to_sparse_batch_dataset_op",
+        ":filter_dataset_op",
+        ":flat_map_dataset_op",
+        ":group_by_window_dataset_op",
+        ":interleave_dataset_op",
+        ":iterator_ops",
+        ":map_and_batch_dataset_op",
+        ":map_dataset_op",
+        ":padded_batch_dataset_op",
+        ":parallel_interleave_dataset_op",
+        ":parallel_map_dataset_op",
+        ":prefetch_dataset_op",
+        ":random_dataset_op",
+        ":range_dataset_op",
+        ":reader_dataset_ops",
+        ":repeat_dataset_op",
+        ":scan_dataset_op",
+        ":shuffle_dataset_op",
+        ":skip_dataset_op",
+        ":sparse_tensor_slice_dataset_op",
+        ":sql_dataset_ops",
+        ":stats_aggregator_ops",
+        ":stats_dataset_ops",
+        ":take_dataset_op",
+        ":tensor_dataset_op",
+        ":tensor_queue_dataset_op",
+        ":tensor_slice_dataset_op",
+        ":unique_dataset_op",
+        ":zip_dataset_op",
+    ],
+)
diff --git a/tensorflow/core/kernels/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc
similarity index 91%
rename from tensorflow/core/kernels/batch_dataset_op.cc
rename to tensorflow/core/kernels/data/batch_dataset_op.cc
index 3dec4f71d8a6823d15f6173d139fd9e60e9df29d..7fa67efb9e22e6877b97524150b9024521619dbc 100644
--- a/tensorflow/core/kernels/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/batch_dataset_op.cc
@@ -12,11 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/batch_util.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
@@ -93,7 +92,6 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
     }
 
    private:
-
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
@@ -146,11 +144,21 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
           const Tensor& first_element = batch_elements[0][component_index];
           TensorShape batch_component_shape({num_batch_elements});
           batch_component_shape.AppendShape(first_element.shape());
-          Tensor batch_component(cpu_allocator(), first_element.dtype(),
+          Tensor batch_component(ctx->allocator({}), first_element.dtype(),
                                  batch_component_shape);
           // Build the output tuple component by copying one slice
           // from each input element in the batch.
           for (size_t i = 0; i < num_batch_elements; ++i) {
+            if (batch_elements[i][component_index].shape() !=
+                first_element.shape()) {
+              return errors::InvalidArgument(
+                  "Cannot batch tensors with different shapes in component ",
+                  component_index, ". First element had shape ",
+                  first_element.shape().DebugString(), " and element ", i,
+                  " had shape ",
+                  batch_elements[i][component_index].shape().DebugString(),
+                  ".");
+            }
             TF_RETURN_IF_ERROR(batch_util::CopyElementToSlice(
                 std::move(batch_elements[i][component_index]), &batch_component,
                 i));
@@ -173,7 +181,7 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         if (!reader->Contains(full_name("input_impl_empty"))) {
diff --git a/tensorflow/core/kernels/cache_dataset_ops.cc b/tensorflow/core/kernels/data/cache_dataset_ops.cc
similarity index 99%
rename from tensorflow/core/kernels/cache_dataset_ops.cc
rename to tensorflow/core/kernels/data/cache_dataset_ops.cc
index 137002b9d77a18fbd5660eb06bcf69d0c4ad3f13..f0a2192826e051586e4999d729c24ed5495be0ea 100644
--- a/tensorflow/core/kernels/cache_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/cache_dataset_ops.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/util/tensor_bundle/tensor_bundle.h"
diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c4aa9ec26545a2792c1e741af69f61a292fcc216
--- /dev/null
+++ b/tensorflow/core/kernels/data/captured_function.cc
@@ -0,0 +1,318 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/data/captured_function.h"
+
+#include <utility>
+
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/cancellation.h"
+#include "tensorflow/core/lib/gtl/optional.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/platform/notification.h"
+
+namespace tensorflow {
+
+/* static */
+Status CapturedFunction::Create(
+    const NameAttrList& func, std::vector<Tensor> captured_inputs,
+    std::unique_ptr<CapturedFunction>* out_function) {
+  out_function->reset(new CapturedFunction(func, std::move(captured_inputs)));
+  return Status::OK();
+}
+
+CapturedFunction::~CapturedFunction() {
+  if (lib_ != nullptr && f_handle_ != kInvalidHandle) {
+    lib_->ReleaseHandle(f_handle_).IgnoreError();
+  }
+}
+
+namespace {
+class CallFrameBase : public CallFrameInterface {
+ public:
+  explicit CallFrameBase(DataTypeSlice ret_types)
+      : ret_types_(ret_types), retvals_(ret_types.size()) {}
+
+  // Caller methods.
+  Status ConsumeRetvals(std::vector<Tensor>* retvals) {
+    retvals->reserve(retvals_.size());
+    int i = 0;
+    for (auto&& val : retvals_) {
+      if (!val) {
+        return errors::Internal("No return value for index ", i, ".");
+      }
+      retvals->emplace_back(std::move(val.value()));
+      ++i;
+    }
+    return Status::OK();
+  }
+
+  size_t num_retvals() const override { return retvals_.size(); }
+
+  // Callee methods.
+  Status SetRetval(int index, const Tensor& val) override {
+    if (index < retvals_.size() && val.dtype() == ret_types_[index] &&
+        !retvals_[index]) {
+      retvals_[index] = val;
+      return Status::OK();
+    } else if (index >= retvals_.size()) {
+      return errors::InvalidArgument("Return value ", index,
+                                     " is out of range.");
+    } else if (val.dtype() != ret_types_[index]) {
+      return errors::InvalidArgument("Expected type ",
+                                     DataTypeString(ret_types_[index]),
+                                     " for return value ", index, " but got ",
+                                     DataTypeString(val.dtype()), ".");
+    } else {
+      return errors::Internal("Attempted to set return value ", index,
+                              " more than once.");
+    }
+  }
+
+ private:
+  DataTypeSlice ret_types_;
+  std::vector<gtl::optional<Tensor>> retvals_;
+  TF_DISALLOW_COPY_AND_ASSIGN(CallFrameBase);
+};
+
+class OwnedArgsCallFrame : public CallFrameBase {
+ public:
+  OwnedArgsCallFrame(std::vector<Tensor>&& args,
+                     const std::vector<Tensor>* captured_inputs,
+                     DataTypeSlice ret_types)
+      : CallFrameBase(ret_types),
+        args_(std::move(args)),
+        captured_inputs_(captured_inputs) {}
+
+  size_t num_args() const override {
+    return args_.size() + captured_inputs_->size();
+  }
+
+  // Callee methods.
+  Status GetArg(int index, Tensor* val) const override {
+    if (index < args_.size() && args_[index].IsInitialized()) {
+      // TODO(mrry): Consider making `CallFrameInterface::GetArg` non-const in
+      // order to be able to `std::move(args_[index])` into `*val`.
+      *val = args_[index];
+      return Status::OK();
+    } else if (index < args_.size() + captured_inputs_->size()) {
+      *val = (*captured_inputs_)[index - args_.size()];
+      return Status::OK();
+    } else if (index >= args_.size() + captured_inputs_->size()) {
+      return errors::InvalidArgument("Argument ", index, " is out of range.");
+    } else {
+      return errors::Internal("Attempted to get argument ", index,
+                              " more than once.");
+    }
+  }
+
+ private:
+  std::vector<Tensor> args_;
+  const std::vector<Tensor>* const captured_inputs_;  // Not owned.
+};
+
+class BorrowedArgsCallFrame : public CallFrameBase {
+ public:
+  BorrowedArgsCallFrame(const std::vector<Tensor>& args,
+                        const std::vector<Tensor>* captured_inputs,
+                        DataTypeSlice ret_types)
+      : CallFrameBase(ret_types),
+        args_(args),
+        captured_inputs_(captured_inputs) {}
+
+  size_t num_args() const override {
+    return args_.size() + captured_inputs_->size();
+  }
+
+  // Callee methods.
+  Status GetArg(int index, Tensor* val) const override {
+    if (index < args_.size() && args_[index].IsInitialized()) {
+      *val = args_[index];
+      return Status::OK();
+    } else if (index < args_.size() + captured_inputs_->size()) {
+      *val = (*captured_inputs_)[index - args_.size()];
+      return Status::OK();
+    } else if (index >= args_.size() + captured_inputs_->size()) {
+      return errors::InvalidArgument("Argument ", index, " is out of range.");
+    } else {
+      return errors::Internal("Attempted to get argument ", index,
+                              " more than once.");
+    }
+  }
+
+ private:
+  const std::vector<Tensor>& args_;                   // Not owned.
+  const std::vector<Tensor>* const captured_inputs_;  // Not owned.
+};
+
+}  // namespace
+
+Status CapturedFunction::MaybeInstantiate(
+    IteratorContext* ctx, FunctionLibraryRuntime::Handle* out_handle) {
+  mutex_lock l(mu_);
+  if (lib_ == nullptr) {
+    // The context's runtime will be used for all subsequent calls.
+    lib_ = ctx->lib();
+    DCHECK(f_handle_ == kInvalidHandle);
+    FunctionLibraryRuntime::InstantiateOptions inst_opts;
+    inst_opts.overlay_lib = ctx->function_library().get();
+    inst_opts.state_handle = std::to_string(random::New64());
+    TF_RETURN_IF_ERROR(lib_->Instantiate(func_.name(), AttrSlice(&func_.attr()),
+                                         inst_opts, &f_handle_));
+    const FunctionBody* fbody = lib_->GetFunctionBody(f_handle_);
+    if (fbody == nullptr) {
+      return errors::Internal("Failed to instantiate function body.");
+    }
+    ret_types_ = fbody->ret_types;
+  } else {
+    // TODO(mrry): Consider moving this under a shared lock, as it is
+    // the common case.
+    if (ctx->lib() != lib_) {
+      return errors::Internal(
+          "Captured function was called with a different "
+          "FunctionLibraryRuntime*, which is not permitted.");
+    }
+  }
+  *out_handle = f_handle_;
+  return Status::OK();
+}
+
+Status CapturedFunction::Run(IteratorContext* ctx, std::vector<Tensor>&& args,
+                             std::vector<Tensor>* rets) {
+  FunctionLibraryRuntime::Handle handle;
+  TF_RETURN_IF_ERROR(MaybeInstantiate(ctx, &handle));
+
+  FunctionLibraryRuntime::Options f_opts;
+  f_opts.step_id = CapturedFunction::generate_step_id();
+  ScopedStepContainer step_container(f_opts.step_id, [ctx](const string& name) {
+    ctx->lib()->device()->resource_manager()->Cleanup(name).IgnoreError();
+  });
+  f_opts.step_container = &step_container;
+  f_opts.runner = ctx->runner();
+  // TODO(mrry): Add cancellation manager support to IteratorContext
+  // so that we can cancel running map functions. The local
+  // cancellation manager here is created so that we can run kernels
+  // (such as queue kernels) that depend on the non-nullness of
+  // `OpKernelContext::cancellation_manager()`, but additional effort
+  // will be required to plumb it through the `IteratorContext`.
+  CancellationManager c_mgr;
+  f_opts.cancellation_manager = &c_mgr;
+
+  OwnedArgsCallFrame frame(std::move(args), &captured_inputs_, ret_types_);
+  Notification n;
+  Status s;
+  ctx->lib()->Run(f_opts, handle, &frame, [&n, &s](Status func_status) {
+    s.Update(func_status);
+    n.Notify();
+  });
+  n.WaitForNotification();
+  TF_RETURN_IF_ERROR(s);
+  return frame.ConsumeRetvals(rets);
+}
+
+Status CapturedFunction::RunWithBorrowedArgs(IteratorContext* ctx,
+                                             const std::vector<Tensor>& args,
+                                             std::vector<Tensor>* rets) {
+  FunctionLibraryRuntime::Handle handle;
+  TF_RETURN_IF_ERROR(MaybeInstantiate(ctx, &handle));
+
+  FunctionLibraryRuntime::Options f_opts;
+  f_opts.step_id = CapturedFunction::generate_step_id();
+  ScopedStepContainer step_container(f_opts.step_id, [ctx](const string& name) {
+    ctx->lib()->device()->resource_manager()->Cleanup(name).IgnoreError();
+  });
+  f_opts.step_container = &step_container;
+  f_opts.runner = ctx->runner();
+  // TODO(mrry): Add cancellation manager support to IteratorContext
+  // so that we can cancel running map functions. The local
+  // cancellation manager here is created so that we can run kernels
+  // (such as queue kernels) that depend on the non-nullness of
+  // `OpKernelContext::cancellation_manager()`, but additional effort
+  // will be required to plumb it through the `IteratorContext`.
+  CancellationManager c_mgr;
+  f_opts.cancellation_manager = &c_mgr;
+
+  BorrowedArgsCallFrame frame(args, &captured_inputs_, ret_types_);
+  Notification n;
+  Status s;
+
+  ctx->lib()->Run(f_opts, handle, &frame, [&n, &s](Status func_status) {
+    s.Update(func_status);
+    n.Notify();
+  });
+  n.WaitForNotification();
+  TF_RETURN_IF_ERROR(s);
+  return frame.ConsumeRetvals(rets);
+}
+
+void CapturedFunction::RunAsync(IteratorContext* ctx,
+                                std::vector<Tensor>&& args,
+                                std::vector<Tensor>* rets,
+                                FunctionLibraryRuntime::DoneCallback done) {
+  // NOTE(mrry): This method does not transfer ownership of `ctx`, and it may
+  // be deleted before `done` is called. Take care not to capture `ctx` in any
+  // code that may execute asynchronously in this function.
+  FunctionLibraryRuntime::Handle handle;
+  Status s = MaybeInstantiate(ctx, &handle);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
+  auto frame =
+      new OwnedArgsCallFrame(std::move(args), &captured_inputs_, ret_types_);
+
+  FunctionLibraryRuntime::Options f_opts;
+  f_opts.step_id = CapturedFunction::generate_step_id();
+  ResourceMgr* resource_mgr = ctx->lib()->device()->resource_manager();
+  auto step_container = new ScopedStepContainer(
+      f_opts.step_id, [resource_mgr](const string& name) {
+        resource_mgr->Cleanup(name).IgnoreError();
+      });
+  f_opts.step_container = step_container;
+  f_opts.runner = ctx->runner();
+  // TODO(mrry): Add cancellation manager support to IteratorContext
+  // so that we can cancel running map functions. The local
+  // cancellation manager here is created so that we can run kernels
+  // (such as queue kernels) that depend on the non-nullness of
+  // `OpKernelContext::cancellation_manager()`, but additional effort
+  // will be required to plumb it through the `IteratorContext`.
+  auto c_mgr = new CancellationManager;
+  f_opts.cancellation_manager = c_mgr;
+
+  tf_shared_lock l(mu_);
+  ctx->lib()->Run(f_opts, handle, frame,
+                  std::bind(
+                      [rets, step_container, c_mgr, frame](
+                          FunctionLibraryRuntime::DoneCallback done,
+                          // Begin unbound arguments.
+                          Status s) {
+                        delete step_container;
+                        delete c_mgr;
+                        if (s.ok()) {
+                          s = frame->ConsumeRetvals(rets);
+                        }
+                        delete frame;
+                        done(s);
+                      },
+                      std::move(done), std::placeholders::_1));
+}
+
+CapturedFunction::CapturedFunction(const NameAttrList& func,
+                                   std::vector<Tensor> captured_inputs)
+    : func_(func),
+      lib_(nullptr),
+      f_handle_(kInvalidHandle),
+      captured_inputs_(std::move(captured_inputs)) {}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/captured_function.h b/tensorflow/core/kernels/data/captured_function.h
new file mode 100644
index 0000000000000000000000000000000000000000..32d2bc3aaebf440584934231a8555199026074ae
--- /dev/null
+++ b/tensorflow/core/kernels/data/captured_function.h
@@ -0,0 +1,108 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_CAPTURED_FUNCTION_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_CAPTURED_FUNCTION_H_
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+
+class Device;
+class OpKernelContext;
+class ResourceMgr;
+
+// A `CapturedFunction` encapsulates a TensorFlow function and all of
+// the runtime support required to execute it.
+//
+// The `Dataset`-related classes use `CapturedFunction` to execute
+// TensorFlow functions outside a the normal `OpKernel::Compute()`
+// context.
+class CapturedFunction {
+ public:
+  // NOTE(mrry): The `captured_inputs` are passed by value. For
+  // efficiency, you are recommended to move this argument into the call.
+  static Status Create(const NameAttrList& func,
+                       std::vector<Tensor> captured_inputs,
+                       std::unique_ptr<CapturedFunction>* out_function);
+
+  ~CapturedFunction();
+
+  // Runs the "Captured function" using the given FLR and caches the lib and
+  // handle generated during instantiation. If Run is called with a different
+  // lib afterwards, generates an error. This method takes ownership of the
+  // tensors in `args`, in order to be able to deallocate them as early as
+  // possible. Use `RunWithBorrowedArgs()` if the caller needs to retain
+  // ownership of the `args`.
+  Status Run(IteratorContext* ctx, std::vector<Tensor>&& args,
+             std::vector<Tensor>* rets);
+
+  // Synchronously runs the captured function on the given `args`, and stores
+  // the results in `*rets`. Prefer to use `Run()` or `RunAsync()` when
+  // possible.
+  Status RunWithBorrowedArgs(IteratorContext* ctx,
+                             const std::vector<Tensor>& args,
+                             std::vector<Tensor>* rets);
+
+  // Asynchronously runs the captured function on the given `args`, stores
+  // the results in `*rets`, and calls the given `done` callback when the
+  // function returns. This method takes ownership of the tensors in `args`,
+  // in order to be able to deallocate them as early as possible.
+  void RunAsync(IteratorContext* ctx, std::vector<Tensor>&& args,
+                std::vector<Tensor>* rets,
+                FunctionLibraryRuntime::DoneCallback done);
+
+  // Returns that additional captured inputs that will be passed to the function
+  // when `Run*()` is called.
+  const std::vector<Tensor>& captured_inputs() { return captured_inputs_; }
+
+  // Returns a step ID for use when running a `CapturedFunction`.
+  static int64 generate_step_id() {
+    // Choose a step ID that is guaranteed not to clash with any
+    // Session-generated step ID. DirectSession only generates
+    // non-negative step IDs (contiguous, starting from 0), and
+    // MasterSession generates 56-bit random step IDs whose MSB is
+    // always 0, so a negative random step ID should suffice.
+    return -std::abs(static_cast<int64>(random::New64()));
+  }
+
+ private:
+  CapturedFunction(const NameAttrList& func,
+                   std::vector<Tensor> captured_inputs);
+
+  Status MaybeInstantiate(IteratorContext* ctx,
+                          FunctionLibraryRuntime::Handle* out_handle);
+
+  mutex mu_;
+  const NameAttrList func_;
+  FunctionLibraryRuntime* lib_ GUARDED_BY(mu_);
+  FunctionLibraryRuntime::Handle f_handle_ GUARDED_BY(mu_);
+  const std::vector<Tensor> captured_inputs_;
+  DataTypeSlice ret_types_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(CapturedFunction);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_CAPTURED_FUNCTION_H_
diff --git a/tensorflow/core/kernels/concatenate_dataset_op.cc b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
similarity index 93%
rename from tensorflow/core/kernels/concatenate_dataset_op.cc
rename to tensorflow/core/kernels/data/concatenate_dataset_op.cc
index ad78ba01869a862d496d66b8dcac1243cf09fe84..f11abc62a67a6937cfa7891022a1643c93439e97 100644
--- a/tensorflow/core/kernels/concatenate_dataset_op.cc
+++ b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
@@ -128,14 +127,23 @@ class ConcatenateDatasetOp : public BinaryDatasetOpKernel {
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_));
-        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        if (input_impl_) {
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        } else {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impl_uninitialized"), ""));
+        }
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("i"), &i_));
+        if (reader->Contains(full_name("input_impl_uninitialized"))) {
+          input_impl_.reset();
+          return Status::OK();
+        }
         if (!TF_PREDICT_TRUE(i_ >= 0 && i_ <= 2))
           return errors::InvalidArgument("i_ must be in range [0, 2].");
         if (i_ == 1) {
diff --git a/tensorflow/core/kernels/data/dataset.h b/tensorflow/core/kernels/data/dataset.h
new file mode 100644
index 0000000000000000000000000000000000000000..2c6fc8d5b4f607c026e683b3086ef0cf5e9e8e76
--- /dev/null
+++ b/tensorflow/core/kernels/data/dataset.h
@@ -0,0 +1,20 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_DATASET_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_DATASET_H_
+
+#include "tensorflow/core/framework/dataset.h"
+
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_DATASET_H_
diff --git a/tensorflow/core/kernels/dataset_utils.cc b/tensorflow/core/kernels/data/dataset_utils.cc
similarity index 68%
rename from tensorflow/core/kernels/dataset_utils.cc
rename to tensorflow/core/kernels/data/dataset_utils.cc
index cd58c8091211ae75265f6cfecb65746965f98d2f..e3a3601ee847148c459ab33decb8528f8b96521d 100644
--- a/tensorflow/core/kernels/dataset_utils.cc
+++ b/tensorflow/core/kernels/data/dataset_utils.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/dataset_utils.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 
 namespace tensorflow {
 
@@ -23,21 +23,10 @@ Status MakeIteratorFromInputElement(
     IteratorContext* ctx, const std::vector<Tensor>& input_element,
     int64 thread_index, CapturedFunction* captured_func, StringPiece prefix,
     std::unique_ptr<IteratorBase>* out_iterator) {
-  FunctionLibraryRuntime::Options opts;
-  opts.runner = ctx->runner();
-  // Choose a step ID that is guaranteed not to clash with any
-  // Session-generated step ID. DirectSession only generates
-  // non-negative step IDs (contiguous, starting from 0), and
-  // MasterSession generates 56-bit random step IDs whose MSB
-  // is always 0, so a negative random step ID should suffice.
-  opts.step_id = CapturedFunction::generate_step_id();
-  ScopedStepContainer step_container(
-      opts.step_id, [captured_func, ctx](const string& name) {
-        captured_func->resource_manager()->Cleanup(name).IgnoreError();
-      });
-  opts.step_container = &step_container;
   std::vector<Tensor> return_values;
-  TF_RETURN_IF_ERROR(captured_func->Run(opts, input_element, &return_values));
+
+  TF_RETURN_IF_ERROR(
+      captured_func->RunWithBorrowedArgs(ctx, input_element, &return_values));
 
   if (!(return_values.size() == 1 && return_values[0].dtype() == DT_VARIANT &&
         TensorShapeUtils::IsScalar(return_values[0].shape()))) {
diff --git a/tensorflow/core/kernels/dataset_utils.h b/tensorflow/core/kernels/data/dataset_utils.h
similarity index 78%
rename from tensorflow/core/kernels/dataset_utils.h
rename to tensorflow/core/kernels/data/dataset_utils.h
index eea2b8802b813808f752659a469c3818a52162d2..6c4191c2be6c55bfde7c5e8bd2e3b1e92edbaf27 100644
--- a/tensorflow/core/kernels/dataset_utils.h
+++ b/tensorflow/core/kernels/data/dataset_utils.h
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATASET_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATASET_UTILS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_DATASET_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_DATASET_UTILS_H_
 
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/captured_function.h"
-#include "tensorflow/core/kernels/dataset.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
@@ -32,4 +32,4 @@ Status MakeIteratorFromInputElement(
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATASET_UTILS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_DATASET_UTILS_H_
diff --git a/tensorflow/core/kernels/dense_to_sparse_batch_dataset_op.cc b/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
similarity index 75%
rename from tensorflow/core/kernels/dense_to_sparse_batch_dataset_op.cc
rename to tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
index e80d11eaea1640c54c21a7b94a2f043099c790f3..132808a5f140a31fc3c1852cb83e5cd8579b6d95 100644
--- a/tensorflow/core/kernels/dense_to_sparse_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
@@ -56,10 +56,10 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
 
     *output = nullptr;
 
-#define HANDLE_TYPE(T)                                      \
-  case DataTypeToEnum<T>::value: {                          \
-    *output = new Dataset<T>(batch_size, row_shape, input); \
-    break;                                                  \
+#define HANDLE_TYPE(T)                                           \
+  case DataTypeToEnum<T>::value: {                               \
+    *output = new Dataset<T>(ctx, batch_size, row_shape, input); \
+    break;                                                       \
   }
 
     switch (input->output_dtypes()[0]) {
@@ -76,18 +76,20 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
  private:
   // TODO(mrry): Push the templated code down to the raw copying routine.
   template <class T>
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    Dataset(int64 batch_size, const PartialTensorShape& row_shape,
-            const DatasetBase* input)
-        : batch_size_(batch_size), row_shape_(row_shape), input_(input) {
+    Dataset(OpKernelContext* ctx, int64 batch_size,
+            const PartialTensorShape& row_shape, const DatasetBase* input)
+        : GraphDatasetBase(ctx),
+          batch_size_(batch_size),
+          row_shape_(row_shape),
+          input_(input) {
       input_->Ref();
 
-      output_shapes_.reserve(3);
-      // Outputs represent a SparseTensor as (indices, values, dense_shape).
-      output_shapes_.push_back({-1, row_shape_.dims() + 1});
-      output_shapes_.push_back({-1});
-      output_shapes_.push_back({row_shape_.dims() + 1});
+      output_shapes_.reserve(1);
+      PartialTensorShape output_shape({-1});
+      output_shape.AppendShape(row_shape_);
+      output_shapes_.push_back(output_shape);
     }
 
     ~Dataset() override { input_->Unref(); }
@@ -99,8 +101,7 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
     }
 
     const DataTypeVector& output_dtypes() const override {
-      static DataTypeVector* output_dtypes_ =
-          new DataTypeVector({DT_INT64, DataTypeToEnum<T>::value, DT_INT64});
+      static DataTypeVector* output_dtypes_ = new DataTypeVector({DT_VARIANT});
       return *output_dtypes_;
     }
 
@@ -113,6 +114,25 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
                              ")::Dataset");
     }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_node;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_node));
+      Node* batch_size_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size_node));
+      Node* row_shape_node;
+      std::vector<int64> row_shape;
+      row_shape.reserve(
+          row_shape_.dims());  // not an unknown rank PartialTensorShape
+      for (int i = 0; i < row_shape_.dims(); i++)
+        row_shape.emplace_back(row_shape_.dim_size(i));
+      TF_RETURN_IF_ERROR(b->AddVector(row_shape, &row_shape_node));
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {input_node, batch_size_node, row_shape_node}, output));
+      return Status::OK();
+    }
+
    private:
     class Iterator : public DatasetIterator<Dataset<T>> {
      public:
@@ -135,7 +155,7 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
 
         // Determine the size of the output tensors:
         // * dense_shape will be [`row_shape + 1`].
-        Tensor dense_shape(cpu_allocator(), DT_INT64, {row_ndims + 1});
+        Tensor dense_shape(ctx->allocator({}), DT_INT64, {row_ndims + 1});
         auto dense_shape_vec = dense_shape.vec<int64>();
         for (size_t i = 0; i < row_ndims; ++i) {
           if (row_shape.dim_size(i) == -1) {
@@ -195,11 +215,11 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
 
         // * indices will be [`total_elements`, `row_shape + 1`].
         // * values will be [`total_elements`].
-        Tensor indices(cpu_allocator(), DT_INT64,
+        Tensor indices(ctx->allocator({}), DT_INT64,
                        {total_elements, row_ndims + 1});
         Tensor values(
-            cpu_allocator(),
-            DatasetIterator<Dataset<T>>::dataset()->output_dtypes()[1],
+            ctx->allocator({}),
+            DatasetIterator<Dataset<T>>::dataset()->input_->output_dtypes()[0],
             {total_elements});
         auto indices_matrix = indices.matrix<int64>();
         auto values_flat = values.flat<T>();
@@ -235,14 +255,31 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
 
         dense_shape_vec(0) = batch_elements.size();
 
-        out_tensors->push_back(std::move(indices));
-        out_tensors->push_back(std::move(values));
-        out_tensors->push_back(std::move(dense_shape));
+        Tensor serialized_sparse(DT_VARIANT, TensorShape({3}));
+        auto serialized_sparse_t = serialized_sparse.vec<Variant>();
+        serialized_sparse_t(0) = std::move(indices);
+        serialized_sparse_t(1) = std::move(values);
+        serialized_sparse_t(2) = std::move(dense_shape);
+        out_tensors->push_back(std::move(serialized_sparse));
 
         *end_of_sequence = false;
         return Status::OK();
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(Iterator::SaveParent(writer, input_impl_));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(Iterator::RestoreParent(ctx, reader, input_impl_));
+        return Status::OK();
+      }
+
      private:
       mutex mu_;
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/kernels/filter_dataset_op.cc b/tensorflow/core/kernels/data/filter_dataset_op.cc
similarity index 84%
rename from tensorflow/core/kernels/filter_dataset_op.cc
rename to tensorflow/core/kernels/data/filter_dataset_op.cc
index e4d80e4ce3a0d2070b9165dff598b11b190139eb..d16b5b7d416b85695287ccbab4bc4398a222c139 100644
--- a/tensorflow/core/kernels/filter_dataset_op.cc
+++ b/tensorflow/core/kernels/data/filter_dataset_op.cc
@@ -12,15 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/random/random.h"
 
-#include "tensorflow/core/kernels/captured_function.h"
-
 namespace tensorflow {
 
 namespace {
@@ -47,9 +45,8 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
     }
 
     std::unique_ptr<CapturedFunction> captured_func;
-    OP_REQUIRES_OK(ctx, CapturedFunction::Create(ctx, func_, graph_def_version_,
-                                                 std::move(other_arguments),
-                                                 &captured_func));
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(
+                            func_, std::move(other_arguments), &captured_func));
 
     *output = new Dataset(ctx, input, func_, std::move(captured_func));
   }
@@ -95,7 +92,7 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
 
       DataTypeVector other_arguments_types;
       other_arguments_types.reserve(captured_func_->captured_inputs().size());
-      std::vector<NodeBuilder::NodeOut> other_arguments;
+      std::vector<Node*> other_arguments;
       other_arguments.reserve(captured_func_->captured_inputs().size());
       for (const Tensor& t : captured_func_->captured_inputs()) {
         Node* node;
@@ -146,28 +143,14 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
             return Status::OK();
           }
 
-          FunctionLibraryRuntime::Options opts;
-          opts.step_id = CapturedFunction::generate_step_id();
-          ScopedStepContainer step_container(
-              opts.step_id, [this, ctx](const string& name) {
-                dataset()
-                    ->captured_func_->resource_manager()
-                    ->Cleanup(name)
-                    .IgnoreError();
-              });
-          opts.step_container = &step_container;
-          opts.runner = ctx->runner();
           // TODO(mrry): Avoid blocking a threadpool thread. We will need to
           // stack-rip the iterators and use async kernels.
-          Notification n;
-          Status ret;
           std::vector<Tensor> result;
-          ret = dataset()->captured_func_->Run(opts, *out_tensors, &result);
+          TF_RETURN_IF_ERROR(dataset()->captured_func_->RunWithBorrowedArgs(
+              ctx, *out_tensors, &result));
 
-          if (!ret.ok()) {
-            return ret;
-          } else if (result.size() != 1 || result[0].dtype() != DT_BOOL ||
-                     result[0].NumElements() != 1) {
+          if (result.size() != 1 || result[0].dtype() != DT_BOOL ||
+              result[0].NumElements() != 1) {
             return errors::InvalidArgument(
                 "Filter predicate `f` must return a scalar bool.");
           }
@@ -192,7 +175,7 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         if (reader->Contains(full_name("input_impls_empty")))
diff --git a/tensorflow/core/kernels/flat_map_dataset_op.cc b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
similarity index 95%
rename from tensorflow/core/kernels/flat_map_dataset_op.cc
rename to tensorflow/core/kernels/data/flat_map_dataset_op.cc
index ac1689e5bf19b350c1baf486e060019aa9d17c2c..77a48a2aa9b0a2be22ef9112cf985964457d65bf 100644
--- a/tensorflow/core/kernels/flat_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
@@ -12,16 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/random/random.h"
 
-#include "tensorflow/core/kernels/captured_function.h"
-#include "tensorflow/core/kernels/dataset_utils.h"
-
 namespace tensorflow {
 
 namespace {
@@ -50,9 +48,8 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
     }
 
     std::unique_ptr<CapturedFunction> captured_func;
-    OP_REQUIRES_OK(ctx, CapturedFunction::Create(ctx, func_, graph_def_version_,
-                                                 std::move(other_arguments),
-                                                 &captured_func));
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(
+                            func_, std::move(other_arguments), &captured_func));
 
     *output = new Dataset(ctx, input, func_, std::move(captured_func),
                           output_types_, output_shapes_);
@@ -102,7 +99,7 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
 
       DataTypeVector other_arguments_types;
       other_arguments_types.reserve(captured_func_->captured_inputs().size());
-      std::vector<NodeBuilder::NodeOut> other_arguments;
+      std::vector<Node*> other_arguments;
       other_arguments.reserve(captured_func_->captured_inputs().size());
       for (const Tensor& t : captured_func_->captured_inputs()) {
         Node* node;
@@ -197,7 +194,7 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         input_impl_.reset();
@@ -252,6 +249,7 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
         IteratorContext::Params params;
         params.env = ctx->env();
         params.runner = *(ctx->runner());
+        params.lib = ctx->function_library();
         IteratorContext iter_ctx(std::move(params));
         return BuildCurrentElementIteratorLocked(&iter_ctx);
       }
diff --git a/tensorflow/core/kernels/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
similarity index 50%
rename from tensorflow/core/kernels/group_by_window_dataset_op.cc
rename to tensorflow/core/kernels/data/group_by_window_dataset_op.cc
index 8644bcf9b509b7aaf335791b583ad8e82073f471..834c06bb930d1c723c5b3f880dcc13a892bb44f7 100644
--- a/tensorflow/core/kernels/group_by_window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
@@ -17,14 +17,12 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/window_dataset.h"
 #include "tensorflow/core/lib/random/random.h"
 
-#include "tensorflow/core/kernels/captured_function.h"
-#include "tensorflow/core/kernels/dataset.h"
-#include "tensorflow/core/kernels/window_dataset.h"
-
 namespace tensorflow {
-
 namespace {
 
 // See documentation in ../ops/dataset_ops.cc for a high-level
@@ -74,36 +72,42 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
     // TODO(mrry): Refactor CapturedFunction to share the runtime
     // state between multiple functions?
     std::unique_ptr<CapturedFunction> captured_key_func;
-    OP_REQUIRES_OK(ctx,
-                   CapturedFunction::Create(ctx, key_func_, graph_def_version_,
-                                            std::move(key_func_other_arguments),
-                                            &captured_key_func));
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(
+                            key_func_, std::move(key_func_other_arguments),
+                            &captured_key_func));
     std::unique_ptr<CapturedFunction> captured_reduce_func;
     OP_REQUIRES_OK(
-        ctx, CapturedFunction::Create(ctx, reduce_func_, graph_def_version_,
+        ctx, CapturedFunction::Create(reduce_func_,
                                       std::move(reduce_func_other_arguments),
                                       &captured_reduce_func));
     std::unique_ptr<CapturedFunction> captured_window_size_func;
-    OP_REQUIRES_OK(ctx, CapturedFunction::Create(
-                            ctx, window_size_func_, graph_def_version_,
-                            std::move(window_size_func_other_arguments),
-                            &captured_window_size_func));
+    OP_REQUIRES_OK(
+        ctx, CapturedFunction::Create(
+                 window_size_func_, std::move(window_size_func_other_arguments),
+                 &captured_window_size_func));
 
     *output = new Dataset(
-        input, std::move(captured_key_func), std::move(captured_reduce_func),
+        ctx, input, key_func_, reduce_func_, window_size_func_,
+        std::move(captured_key_func), std::move(captured_reduce_func),
         std::move(captured_window_size_func), output_types_, output_shapes_);
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    Dataset(const DatasetBase* input,
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& key_func, const NameAttrList& reduce_func,
+            const NameAttrList& window_size_func,
             std::unique_ptr<CapturedFunction> captured_key_func,
             std::unique_ptr<CapturedFunction> captured_reduce_func,
             std::unique_ptr<CapturedFunction> captured_window_size_func,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
-        : input_(input),
+        : GraphDatasetBase(ctx),
+          input_(input),
+          key_func_(key_func),
+          reduce_func_(reduce_func),
+          window_size_func_(window_size_func),
           captured_key_func_(std::move(captured_key_func)),
           captured_reduce_func_(std::move(captured_reduce_func)),
           captured_window_size_func_(std::move(captured_window_size_func)),
@@ -129,6 +133,67 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() override { return "GroupByWindowDatasetOp::Dataset"; }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, key_func_.name()));
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, reduce_func_.name()));
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, window_size_func_.name()));
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+
+      std::vector<Node*> key_func_other_arguments_node;
+      DataTypeVector key_func_other_arguments_types;
+      TF_RETURN_IF_ERROR(OtherArgumentsNodeAndType(
+          b, captured_key_func_, &key_func_other_arguments_node,
+          &key_func_other_arguments_types));
+
+      std::vector<Node*> reduce_func_other_arguments_node;
+      DataTypeVector reduce_func_other_arguments_types;
+      TF_RETURN_IF_ERROR(OtherArgumentsNodeAndType(
+          b, captured_reduce_func_, &reduce_func_other_arguments_node,
+          &reduce_func_other_arguments_types));
+
+      std::vector<Node*> window_size_func_other_arguments_node;
+      DataTypeVector window_size_func_other_arguments_types;
+      TF_RETURN_IF_ERROR(OtherArgumentsNodeAndType(
+          b, captured_window_size_func_, &window_size_func_other_arguments_node,
+          &window_size_func_other_arguments_types));
+
+      AttrValue key_func;
+      b->BuildAttrValue(key_func_, &key_func);
+      AttrValue reduce_func;
+      b->BuildAttrValue(reduce_func_, &reduce_func);
+      AttrValue window_size_func;
+      b->BuildAttrValue(window_size_func_, &window_size_func);
+
+      AttrValue key_func_other_arguments_types_attr;
+      b->BuildAttrValue(key_func_other_arguments_types,
+                        &key_func_other_arguments_types_attr);
+      AttrValue reduce_func_other_arguments_types_attr;
+      b->BuildAttrValue(reduce_func_other_arguments_types,
+                        &reduce_func_other_arguments_types_attr);
+      AttrValue window_size_func_other_arguments_types_attr;
+      b->BuildAttrValue(window_size_func_other_arguments_types,
+                        &window_size_func_other_arguments_types_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {{0, input_graph_node}},
+          {{1, key_func_other_arguments_node},
+           {2, reduce_func_other_arguments_node},
+           {3, window_size_func_other_arguments_node}},
+          {{"key_func", key_func},
+           {"reduce_func", reduce_func},
+           {"window_size_func", window_size_func},
+           {"Tkey_func_other_arguments", key_func_other_arguments_types_attr},
+           {"Treduce_func_other_arguments",
+            reduce_func_other_arguments_types_attr},
+           {"Twindow_size_func_other_arguments",
+            window_size_func_other_arguments_types_attr}},
+          output));
+      return Status::OK();
+    }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
@@ -155,6 +220,7 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
             // We have reached the end of the current group, so maybe move on
             // to the next group.
             current_group_iterator_.reset();
+            groups_.erase(current_key_);
           }
 
           // Iterate through the input dataset until we get a full
@@ -165,23 +231,12 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
                 input_impl_->GetNext(ctx, &next_input_element, &end_of_input_));
 
             if (!end_of_input_) {
-              FunctionLibraryRuntime::Options opts;
-              opts.step_id = CapturedFunction::generate_step_id();
-              opts.runner = ctx->runner();
-              ScopedStepContainer step_container(
-                  opts.step_id, [this, ctx](const string& name) {
-                    dataset()
-                        ->captured_key_func_->resource_manager()
-                        ->Cleanup(name)
-                        .IgnoreError();
-                  });
-              opts.step_container = &step_container;
-
               // Run the key function on the input element to identify its
               // group.
               std::vector<Tensor> key_func_output;
-              TF_RETURN_IF_ERROR(dataset()->captured_key_func_->Run(
-                  opts, next_input_element, &key_func_output));
+              TF_RETURN_IF_ERROR(
+                  dataset()->captured_key_func_->RunWithBorrowedArgs(
+                      ctx, next_input_element, &key_func_output));
 
               if (key_func_output.size() != 1 ||
                   key_func_output[0].dtype() != DT_INT64 ||
@@ -193,24 +248,11 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
               const int64 key = key_func_output[0].scalar<int64>()();
 
               if (window_sizes_.find(key) == window_sizes_.end()) {
-                // Run window_size function
-                FunctionLibraryRuntime::Options opts2;
-                opts2.step_id = CapturedFunction::generate_step_id();
-                opts2.runner = ctx->runner();
-                ScopedStepContainer step_container2(
-                    opts2.step_id, [this, ctx](const string& name) {
-                      dataset()
-                          ->captured_window_size_func_->resource_manager()
-                          ->Cleanup(name)
-                          .IgnoreError();
-                    });
-                opts2.step_container = &step_container2;
-
                 // Run the window size function on the key to identify its
                 // window size.
                 std::vector<Tensor> window_size_func_output;
                 TF_RETURN_IF_ERROR(dataset()->captured_window_size_func_->Run(
-                    opts2, key_func_output, &window_size_func_output));
+                    ctx, std::move(key_func_output), &window_size_func_output));
 
                 if (window_size_func_output.size() != 1 ||
                     window_size_func_output[0].dtype() != DT_INT64 ||
@@ -230,6 +272,7 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
               group.push_back(std::move(next_input_element));
 
               if (group.size() == window_size) {
+                current_key_ = key;
                 TF_RETURN_IF_ERROR(StartFlushingGroup(ctx, key));
                 break;
               }
@@ -240,6 +283,7 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
             if (!groups_.empty()) {
               // We have consumed all of the input, so flush an
               // arbitrarily chosen group.
+              current_key_ = groups_.begin()->first;
               TF_RETURN_IF_ERROR(
                   StartFlushingGroup(ctx, groups_.begin()->first));
             }
@@ -250,26 +294,162 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+
+        if (end_of_input_) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("end_of_input"), ""));
+        }
+
+        // Saving groups_
+        if (!groups_.empty()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("groups_size"), groups_.size()));
+          int idx = 0;
+          for (auto it = groups_.begin(); it != groups_.end(); it++) {
+            int64 key = it->first;
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                full_name(strings::StrCat("groups_[", idx, "]->key")), key));
+            TF_RETURN_IF_ERROR(SaveGroup(
+                writer, full_name(strings::StrCat("groups_[", idx, "]")),
+                it->second));
+            idx++;
+          }
+        }
+
+        // Saving window_sizes_
+        if (!window_sizes_.empty()) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("window_sizes_size"),
+                                                 window_sizes_.size()));
+          int idx = 0;
+          for (auto it = window_sizes_.begin(); it != window_sizes_.end();
+               it++) {
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                full_name(strings::StrCat("window_sizes_[", idx, "]->key")),
+                it->first));
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                full_name(strings::StrCat("window_sizes_[", idx, "]->value")),
+                it->second));
+            idx++;
+          }
+        }
+
+        if (current_group_iterator_) {
+          TF_RETURN_IF_ERROR(SaveParent(writer, current_group_iterator_));
+
+          // Saving current_key_
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("current_key"), current_key_));
+        } else {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name("current_iterator_not_initialized"), ""));
+        }
+
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+
+        if (reader->Contains(full_name("end_of_input"))) end_of_input_ = true;
+
+        // Restoring groups
+        if (reader->Contains(full_name("groups_size"))) {
+          int64 size;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("groups_size"), &size));
+          for (int idx = 0; idx < size; idx++) {
+            int64 key;
+            TF_RETURN_IF_ERROR(reader->ReadScalar(
+                full_name(strings::StrCat("groups_[", idx, "]->key")), &key));
+            std::vector<std::vector<Tensor>> group;
+            TF_RETURN_IF_ERROR(RestoreGroup(
+                reader, full_name(strings::StrCat("groups_[", idx, "]")),
+                &group));
+            groups_[key] = group;
+          }
+        }
+
+        // Restoring Windows
+        if (reader->Contains(full_name("window_sizes_size"))) {
+          int64 size;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("window_sizes_size"), &size));
+          for (int idx = 0; idx < size; idx++) {
+            int64 key;
+            TF_RETURN_IF_ERROR(reader->ReadScalar(
+                full_name(strings::StrCat("window_sizes_[", idx, "]->key")),
+                &key));
+            TF_RETURN_IF_ERROR(reader->ReadScalar(
+                full_name(strings::StrCat("window_sizes_[", idx, "]->value")),
+                &window_sizes_[key]));
+          }
+        }
+
+        if (reader->Contains(full_name("current_iterator_not_initialized"))) {
+          current_group_iterator_.reset();
+        } else {
+          // Restore current_key_
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("current_key"), &current_key_));
+
+          // Initialize current_group_iterator_
+          TF_RETURN_IF_ERROR(StartFlushingGroup(ctx, current_key_));
+          // Restore current_group_iterator_ state
+          TF_RETURN_IF_ERROR(
+              RestoreParent(ctx, reader, current_group_iterator_));
+        }
+        return Status::OK();
+      }
+
      private:
-      Status StartFlushingGroup(IteratorContext* ctx, int64 key)
+      Status SaveGroup(IteratorStateWriter* writer, const string& name,
+                       const std::vector<std::vector<Tensor>>& group)
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        FunctionLibraryRuntime::Options opts;
-        opts.step_id = CapturedFunction::generate_step_id();
-        opts.runner = ctx->runner();
-        ScopedStepContainer step_container(
-            opts.step_id, [this, ctx](const string& name) {
-              dataset()
-                  ->captured_reduce_func_->resource_manager()
-                  ->Cleanup(name)
-                  .IgnoreError();
-            });
-        opts.step_container = &step_container;
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(strings::StrCat(name, "_size"), group.size()));
+        for (int i = 0; i < group.size(); i++) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              strings::StrCat(name, "[", i, "]_size"), group[i].size()));
+          for (int j = 0; j < group[i].size(); j++) {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                strings::StrCat(name, "[", i, "][", j, "]"), group[i][j]));
+          }
+        }
+        return Status::OK();
+      }
 
+      Status RestoreGroup(IteratorStateReader* reader, const string& name,
+                          std::vector<std::vector<Tensor>>* group)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        int64 group_size;
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(strings::StrCat(name, "_size"), &group_size));
+        group->resize(group_size);
+        for (int i = 0; i < group_size; i++) {
+          int64 vector_size;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              strings::StrCat(name, "[", i, "]_size"), &vector_size));
+          group->at(i).resize(vector_size);
+          for (int j = 0; j < vector_size; j++) {
+            TF_RETURN_IF_ERROR(reader->ReadTensor(
+                strings::StrCat(name, "[", i, "][", j, "]"), &group->at(i)[j]));
+          }
+        }
+        return Status::OK();
+      }
+
+      Status StartFlushingGroup(IteratorContext* ctx, int64 key)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         DatasetBase* group_dataset;
         TF_RETURN_IF_ERROR(NewWindowDataset(
-            std::move(groups_[key]), dataset()->input_->output_dtypes(),
+            groups_[key], dataset()->input_->output_dtypes(),
             dataset()->input_->output_shapes(), &group_dataset));
-        groups_.erase(key);
 
         Tensor key_arg(DT_INT64, TensorShape({}));
         key_arg.scalar<int64>()() = key;
@@ -281,9 +461,8 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
         std::vector<Tensor> args(
             {std::move(key_arg), std::move(group_dataset_arg)});
         std::vector<Tensor> return_values;
-
-        TF_RETURN_IF_ERROR(
-            dataset()->captured_reduce_func_->Run(opts, args, &return_values));
+        TF_RETURN_IF_ERROR(dataset()->captured_reduce_func_->Run(
+            ctx, std::move(args), &return_values));
 
         if (!(return_values.size() == 1 &&
               return_values[0].dtype() == DT_VARIANT &&
@@ -304,20 +483,36 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      const std::unique_ptr<IteratorBase> input_impl_;
       mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
       // TODO(mrry): Optimize for dense key space if appropriate.
       bool end_of_input_ GUARDED_BY(mu_) = false;
+      int64 current_key_ GUARDED_BY(mu_);
       std::map<int64, std::vector<std::vector<Tensor>>> groups_ GUARDED_BY(mu_);
       std::unique_ptr<IteratorBase> current_group_iterator_ GUARDED_BY(mu_);
       std::map<int64, int64> window_sizes_ GUARDED_BY(mu_);
     };
 
-    // A resource name for the temporary window dataset that is
-    // created as the input to the reduce function.
-    static constexpr const char* kWindowResourceName = "__window_dataset";
+    Status OtherArgumentsNodeAndType(
+        DatasetGraphDefBuilder* b,
+        const std::unique_ptr<CapturedFunction>& captured_func,
+        std::vector<Node*>* other_arguments_node,
+        DataTypeVector* other_arguments_types) const {
+      other_arguments_node->reserve(captured_func->captured_inputs().size());
+      other_arguments_types->reserve(captured_func->captured_inputs().size());
+      for (const Tensor& t : captured_func->captured_inputs()) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        other_arguments_node->emplace_back(node);
+        other_arguments_types->emplace_back(t.dtype());
+      }
+      return Status::OK();
+    }
 
     const DatasetBase* const input_;
+    const NameAttrList key_func_;
+    const NameAttrList reduce_func_;
+    const NameAttrList window_size_func_;
     const std::unique_ptr<CapturedFunction> captured_key_func_;
     const std::unique_ptr<CapturedFunction> captured_reduce_func_;
     const std::unique_ptr<CapturedFunction> captured_window_size_func_;
@@ -337,5 +532,4 @@ REGISTER_KERNEL_BUILDER(Name("GroupByWindowDataset").Device(DEVICE_CPU),
                         GroupByWindowDatasetOp);
 
 }  // namespace
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/interleave_dataset_op.cc b/tensorflow/core/kernels/data/interleave_dataset_op.cc
similarity index 93%
rename from tensorflow/core/kernels/interleave_dataset_op.cc
rename to tensorflow/core/kernels/data/interleave_dataset_op.cc
index cbee68b2dbed807ca101e1a887daebe289646da0..bce3f28d62bf898e5137568c4241aff4392db65b 100644
--- a/tensorflow/core/kernels/interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/interleave_dataset_op.cc
@@ -13,16 +13,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/random/random.h"
 
-#include "tensorflow/core/kernels/captured_function.h"
-#include "tensorflow/core/kernels/dataset_utils.h"
-
 namespace tensorflow {
 
 namespace {
@@ -69,9 +67,8 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
         errors::InvalidArgument("block_length must be greater than zero."));
 
     std::unique_ptr<CapturedFunction> captured_func;
-    OP_REQUIRES_OK(ctx, CapturedFunction::Create(ctx, func_, graph_def_version_,
-                                                 std::move(other_arguments),
-                                                 &captured_func));
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(
+                            func_, std::move(other_arguments), &captured_func));
 
     *output =
         new Dataset(ctx, input, func_, std::move(captured_func), cycle_length,
@@ -126,7 +123,7 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
       TF_RETURN_IF_ERROR(b->AddScalar(block_length_, &block_length_node));
       DataTypeVector other_arguments_types;
       other_arguments_types.reserve(captured_func_->captured_inputs().size());
-      std::vector<NodeBuilder::NodeOut> other_arguments;
+      std::vector<Node*> other_arguments;
       other_arguments.reserve(captured_func_->captured_inputs().size());
       for (const Tensor& t : captured_func_->captured_inputs()) {
         Node* node;
@@ -230,7 +227,7 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
@@ -268,13 +265,9 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      Status RestoreCurrentElements(OpKernelContext* ctx,
+      Status RestoreCurrentElements(IteratorContext* ctx,
                                     IteratorStateReader* reader)
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        IteratorContext::Params params;
-        params.env = ctx->env();
-        params.runner = *(ctx->runner());
-        IteratorContext iter_ctx(std::move(params));
         for (int idx = 0; idx < current_elements_.size(); idx++) {
           if (reader->Contains(
                   full_name(strings::StrCat("args_size[", idx, "]")))) {
@@ -289,9 +282,8 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
                   &args_list_[idx][i]));
             }
             TF_RETURN_IF_ERROR(dataset::MakeIteratorFromInputElement(
-                &iter_ctx, args_list_[idx], idx,
-                dataset()->captured_func_.get(), prefix(),
-                &current_elements_[idx]));
+                ctx, args_list_[idx], idx, dataset()->captured_func_.get(),
+                prefix(), &current_elements_[idx]));
             TF_RETURN_IF_ERROR(
                 RestoreParent(ctx, reader, current_elements_[idx]));
           } else {
diff --git a/tensorflow/core/kernels/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
similarity index 72%
rename from tensorflow/core/kernels/iterator_ops.cc
rename to tensorflow/core/kernels/data/iterator_ops.cc
index 439775157bc936d44845e7b175e62c2fc088e6cf..d7d4ad5cf7f6d5a3386be524c7a227006da0b3f4 100644
--- a/tensorflow/core/kernels/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -14,21 +14,24 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/graph_runner.h"
+#include "tensorflow/core/common_runtime/renamed_device.h"
+#include "tensorflow/core/common_runtime/threadpool_device.h"
 #include "tensorflow/core/framework/iterator.pb.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/resource_op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/graph/graph_constructor.h"
-#include "tensorflow/core/kernels/dataset.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/stats_aggregator.h"
 #include "tensorflow/core/kernels/ops_util.h"
-#include "tensorflow/core/kernels/stats_aggregator.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
 
@@ -80,16 +83,26 @@ class IteratorResource : public ResourceBase {
  public:
   IteratorResource(const DataTypeVector& output_dtypes,
                    const std::vector<PartialTensorShape>& output_shapes,
-                   const int graph_def_version)
-      : iterator_(nullptr),
+                   const int /*unused: graph_def_version*/,
+                   std::unique_ptr<DeviceMgr> device_mgr,
+                   std::unique_ptr<FunctionLibraryDefinition> flib_def,
+                   std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
+                   FunctionLibraryRuntime* lib)
+      : device_mgr_(std::move(device_mgr)),
+        flib_def_(std::move(flib_def)),
+        pflr_(std::move(pflr)),
+        lib_(lib),
+        iterator_(nullptr),
         output_dtypes_(output_dtypes),
-        output_shapes_(output_shapes),
-        graph_def_version_(graph_def_version) {}
+        output_shapes_(output_shapes) {}
 
   Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
                  bool* end_of_sequence) {
     std::shared_ptr<IteratorBase> captured_iterator(iterator_);
     if (captured_iterator) {
+      if (lib_ != nullptr) {
+        ctx->set_lib(lib_);
+      }
       return captured_iterator->GetNext(ctx, out_tensors, end_of_sequence);
     } else {
       return errors::FailedPrecondition(
@@ -129,26 +142,34 @@ class IteratorResource : public ResourceBase {
     GraphRunner graph_runner(ctx->env());
 
     // Build a new FLR that knows about the functions in the graph.
-    std::unique_ptr<FunctionLibraryDefinition> flib_def(
+    std::shared_ptr<FunctionLibraryDefinition> flib_def(
         new FunctionLibraryDefinition(
             *ctx->function_library()->GetFunctionLibraryDefinition()));
     TF_RETURN_IF_ERROR(flib_def->AddLibrary(graph_def.library()));
-    std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(
-        new ProcessFunctionLibraryRuntime(nullptr, ctx->env(),
-                                          graph_def_version_, flib_def.get(),
-                                          {}, nullptr));
-    FunctionLibraryRuntime* lib =
-        pflr->GetFLR(ProcessFunctionLibraryRuntime::kDefaultFLRDevice);
 
     TF_RETURN_IF_ERROR(
-        graph_runner.Run(&graph, lib, {}, {output_node}, &outputs));
+        graph_runner.Run(&graph, lib_, {}, {output_node}, &outputs));
     TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(outputs[0], &dataset));
 
     TF_RETURN_IF_ERROR(set_iterator(dataset->MakeIterator("Iterator")));
     std::shared_ptr<IteratorBase> captured_iterator(iterator_);
 
     if (captured_iterator) {
-      return captured_iterator->Restore(ctx, reader);
+      IteratorContext::Params params;
+      params.env = ctx->env();
+      params.runner = *(ctx->runner());
+      params.function_library = flib_def;
+      params.lib = lib_;
+      DeviceBase* device = lib_->device();
+      params.allocator_getter = [device](AllocatorAttributes attrs) {
+        return device->GetAllocator(attrs);
+      };
+      IteratorContext iter_ctx(std::move(params));
+
+      TF_RETURN_IF_ERROR(captured_iterator->Restore(&iter_ctx, reader));
+      mutex_lock l(mu_);
+      lib_def_ = std::move(flib_def);
+      return Status::OK();
     } else {
       return errors::FailedPrecondition(
           "Failed to restore iterator. Make sure the checkpoint ",
@@ -157,6 +178,11 @@ class IteratorResource : public ResourceBase {
     }
   }
 
+  std::shared_ptr<const FunctionLibraryDefinition> function_library() {
+    tf_shared_lock l(mu_);
+    return lib_def_;
+  }
+
   // Transfers ownership of iterator to this. This method is thread-safe.
   Status set_iterator(std::unique_ptr<IteratorBase> iterator) {
     if (iterator) {
@@ -188,12 +214,19 @@ class IteratorResource : public ResourceBase {
   }
 
  private:
+  // The following (device_mgr_, flib_def_, pflr_) are only used when the
+  // IteratorResource is shared between sessions and in that case we create
+  // a new FLR. Otherwise these are set to null.
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<FunctionLibraryDefinition> flib_def_;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+  FunctionLibraryRuntime* lib_ = nullptr;  // not owned.
   std::shared_ptr<IteratorBase> iterator_;
   mutex mu_;
   std::shared_ptr<StatsAggregator> stats_aggregator_ GUARDED_BY(mu_);
+  std::shared_ptr<const FunctionLibraryDefinition> lib_def_ GUARDED_BY(mu_);
   const DataTypeVector output_dtypes_;
   const std::vector<PartialTensorShape> output_shapes_;
-  const int graph_def_version_;
 };
 
 // Helper class for reading data from a VariantTensorData object.
@@ -400,25 +433,90 @@ class IteratorStateVariant {
 REGISTER_UNARY_VARIANT_DECODE_FUNCTION(IteratorStateVariant,
                                        kIteratorVariantTypeName);
 
-// TODO(mrry): Can we simply use the template kernel here?
-class IteratorHandleOp : public ResourceOpKernel<IteratorResource> {
+class IteratorHandleOp : public OpKernel {
  public:
   explicit IteratorHandleOp(OpKernelConstruction* ctx)
-      : ResourceOpKernel<IteratorResource>(ctx),
-        graph_def_version_(ctx->graph_def_version()) {
+      : OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_));
   }
 
- private:
-  Status CreateResource(IteratorResource** ret) override
-      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    *ret = new IteratorResource(output_dtypes_, output_shapes_,
-                                graph_def_version_);
-    return Status::OK();
+  // The resource is deleted from the resource manager only when it is private
+  // to kernel. Ideally the resource should be deleted when it is no longer held
+  // by anyone, but it would break backward compatibility.
+  ~IteratorHandleOp() override {
+    if (resource_ != nullptr) {
+      resource_->Unref();
+      if (cinfo_.resource_is_private_to_kernel()) {
+        if (!cinfo_.resource_manager()
+                 ->template Delete<IteratorResource>(cinfo_.container(),
+                                                     cinfo_.name())
+                 .ok()) {
+          // Do nothing; the resource can have been deleted by session resets.
+        }
+      }
+    }
   }
 
-  Status VerifyResource(IteratorResource* resource) override {
+  void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) {
+    {
+      mutex_lock l(mu_);
+      if (resource_ == nullptr) {
+        FunctionLibraryRuntime* lib;
+        std::unique_ptr<DeviceMgr> device_mgr(nullptr);
+        std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
+        std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
+        // If the iterator is shared then we construct a new FLR, and pass that
+        // in. NOTE(mrry,rohanj): In this case it is not possible to call remote
+        // functions from the iterator. We may add this functionality if there
+        // is sufficient demand, but it will require a significant refactoring.
+        if (!name_.empty()) {
+          lib = CreatePrivateFLR(context, &device_mgr, &flib_def, &pflr);
+        } else {
+          OP_REQUIRES_OK(context, context->function_library()->Clone(
+                                      &flib_def, &pflr, &lib));
+        }
+
+        ResourceMgr* mgr = context->resource_manager();
+        OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
+
+        IteratorResource* resource;
+        OP_REQUIRES_OK(
+            context,
+            mgr->LookupOrCreate<IteratorResource>(
+                cinfo_.container(), cinfo_.name(), &resource,
+                [lib, &device_mgr, &flib_def, &pflr,
+                 this](IteratorResource** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                  *ret = new IteratorResource(
+                      output_dtypes_, output_shapes_, graph_def_version_,
+                      std::move(device_mgr), std::move(flib_def),
+                      std::move(pflr), lib);
+                  return Status::OK();
+                }));
+
+        Status s = VerifyResource(resource);
+        if (TF_PREDICT_FALSE(!s.ok())) {
+          resource->Unref();
+          context->SetStatus(s);
+          return;
+        }
+
+        resource_ = resource;
+      }
+    }
+    OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
+                                context, 0, cinfo_.container(), cinfo_.name(),
+                                MakeTypeIndex<IteratorResource>()));
+  }
+
+ private:
+  // During the first Compute(), resource is either created or looked up using
+  // shared_name. In the latter case, the resource found should be verified if
+  // it is compatible with this op's configuration. The verification may fail in
+  // cases such as two graphs asking queues of the same shared name to have
+  // inconsistent capacities.
+  Status VerifyResource(IteratorResource* resource) {
     TF_RETURN_IF_ERROR(
         VerifyTypesMatch(output_dtypes_, resource->output_dtypes()));
     TF_RETURN_IF_ERROR(
@@ -426,10 +524,49 @@ class IteratorHandleOp : public ResourceOpKernel<IteratorResource> {
     return Status::OK();
   }
 
- private:
+  template <typename To, typename From>  // use like this: down_cast<T*>(foo);
+  static inline To down_cast(From* f) {  // so we only accept pointers
+    static_assert(
+        (std::is_base_of<From, typename std::remove_pointer<To>::type>::value),
+        "target type not derived from source type");
+
+    // We skip the assert and hence the dynamic_cast if RTTI is disabled.
+#if !defined(__GNUC__) || defined(__GXX_RTTI)
+    // Uses RTTI in dbg and fastbuild. asserts are disabled in opt builds.
+    assert(f == nullptr || dynamic_cast<To>(f) != nullptr);
+#endif  // !defined(__GNUC__) || defined(__GXX_RTTI)
+    return static_cast<To>(f);
+  }
+
+  FunctionLibraryRuntime* CreatePrivateFLR(
+      OpKernelContext* ctx, std::unique_ptr<DeviceMgr>* device_mgr,
+      std::unique_ptr<FunctionLibraryDefinition>* flib_def,
+      std::unique_ptr<ProcessFunctionLibraryRuntime>* pflr) {
+    // Wrap the existing device in order to see any captured resources
+    // in its resource manager. The existing device will outlive the
+    // IteratorResource, because we are storing the IteratorResource
+    // in that device's resource manager.
+    Device* wrapped_device = RenamedDevice::NewRenamedDevice(
+        ctx->device()->name(), down_cast<Device*>(ctx->device()),
+        false /* owns_underlying */, false /* isolate_session_state */);
+    device_mgr->reset(new DeviceMgr({wrapped_device}));
+    flib_def->reset(new FunctionLibraryDefinition(
+        *ctx->function_library()->GetFunctionLibraryDefinition()));
+    pflr->reset(new ProcessFunctionLibraryRuntime(
+        device_mgr->get(), ctx->env(), graph_def_version_, flib_def->get(),
+        {} /* TODO(mrry): OptimizerOptions? */,
+        nullptr /* TODO(mrry): ClusterFLR */));
+
+    return (*pflr)->GetFLR(ctx->device()->name());
+  }
+
+  mutex mu_;
+  ContainerInfo cinfo_;  // Written once under mu_ then constant afterwards.
+  IteratorResource* resource_ GUARDED_BY(mu_) = nullptr;
   DataTypeVector output_dtypes_;
   std::vector<PartialTensorShape> output_shapes_;
   const int graph_def_version_;
+  string name_;
 };
 
 class MakeIteratorOp : public OpKernel {
@@ -448,40 +585,66 @@ class MakeIteratorOp : public OpKernel {
   }
 };
 
-class ToSingleElementOp : public OpKernel {
+class ToSingleElementOp : public AsyncOpKernel {
  public:
-  explicit ToSingleElementOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  explicit ToSingleElementOp(OpKernelConstruction* ctx)
+      : AsyncOpKernel(ctx),
+        thread_pool_(new thread::ThreadPool(
+            ctx->env(), ThreadOptions(),
+            strings::StrCat("to_single_element_op_thread_",
+                            SanitizeThreadSuffix(name())),
+            1 /* num_threads */, false /* low_latency_hint */)) {}
 
-  void Compute(OpKernelContext* ctx) override {
-    DatasetBase* dataset;
-    OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset));
-    auto iterator = dataset->MakeIterator("SingleElementIterator");
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    // The call to `iterator->GetNext()` may block and depend on an
+    // inter-op thread pool thread, so we issue the call from the
+    // owned thread pool.
+    thread_pool_->Schedule([ctx, done]() {
+      DatasetBase* dataset;
+      OP_REQUIRES_OK_ASYNC(
+          ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
+      auto iterator = dataset->MakeIterator("SingleElementIterator");
 
-    IteratorContext::Params params;
-    params.env = ctx->env();
-    params.runner = *(ctx->runner());
-    IteratorContext iter_ctx(std::move(params));
+      IteratorContext::Params params;
+      params.env = ctx->env();
+      params.runner = *(ctx->runner());
+      params.lib = ctx->function_library();
+      DeviceBase* device = ctx->function_library()->device();
+      params.allocator_getter = [device](AllocatorAttributes attrs) {
+        return device->GetAllocator(attrs);
+      };
 
-    std::vector<Tensor> components;
-    components.reserve(dataset->output_dtypes().size());
-    bool end_of_sequence;
+      IteratorContext iter_ctx(std::move(params));
 
-    OP_REQUIRES_OK(ctx,
-                   iterator->GetNext(&iter_ctx, &components, &end_of_sequence));
-    OP_REQUIRES(ctx, !end_of_sequence,
-                errors::InvalidArgument("Dataset was empty."));
+      std::vector<Tensor> components;
+      components.reserve(dataset->output_dtypes().size());
+      bool end_of_sequence;
 
-    for (int i = 0; i < components.size(); ++i) {
-      // TODO(mrry): Check that the shapes match the shape attrs.
-      ctx->set_output(i, components[i]);
-    }
+      OP_REQUIRES_OK_ASYNC(
+          ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence),
+          done);
+      OP_REQUIRES_ASYNC(ctx, !end_of_sequence,
+                        errors::InvalidArgument("Dataset was empty."), done);
 
-    components.clear();
-    OP_REQUIRES_OK(ctx,
-                   iterator->GetNext(&iter_ctx, &components, &end_of_sequence));
-    OP_REQUIRES(ctx, end_of_sequence,
-                errors::InvalidArgument("Dataset had more than one element."));
+      for (int i = 0; i < components.size(); ++i) {
+        // TODO(mrry): Check that the shapes match the shape attrs.
+        ctx->set_output(i, components[i]);
+      }
+
+      components.clear();
+      OP_REQUIRES_OK_ASYNC(
+          ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence),
+          done);
+      OP_REQUIRES_ASYNC(
+          ctx, end_of_sequence,
+          errors::InvalidArgument("Dataset had more than one element."), done);
+
+      done();
+    });
   }
+
+ private:
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
 };
 
 class OneShotIteratorOp : public AsyncOpKernel {
@@ -572,15 +735,22 @@ class OneShotIteratorOp : public AsyncOpKernel {
                  ContainerInfo* cinfo) {
     TF_RETURN_IF_ERROR(cinfo->Init(ctx->resource_manager(), def()));
 
+    FunctionLibraryRuntime* lib;
+    std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
+    std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
+    TF_RETURN_IF_ERROR(ctx->function_library()->Clone(&flib_def, &pflr, &lib));
+
     // Create an IteratorResource that will hold the iterator for this op.
     TF_RETURN_IF_ERROR(
         ctx->resource_manager()->LookupOrCreate<IteratorResource>(
             cinfo->container(), cinfo->name(), iterator,
-            [this](IteratorResource** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-              *ret = new IteratorResource(output_dtypes_, output_shapes_,
-                                          graph_def_version_);
-              return Status::OK();
-            }));
+            [lib, this, &flib_def, &pflr](IteratorResource** ret)
+                EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                  *ret = new IteratorResource(
+                      output_dtypes_, output_shapes_, graph_def_version_,
+                      nullptr, std::move(flib_def), std::move(pflr), lib);
+                  return Status::OK();
+                }));
 
     core::ScopedUnref unref_iterator(*iterator);
 
@@ -683,43 +853,86 @@ class IteratorGetNextOp : public AsyncOpKernel {
 
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     IteratorResource* iterator;
-    OP_REQUIRES_OK(ctx,
-                   LookupResource(ctx, HandleFromInput(ctx, 0), &iterator));
-
+    OP_REQUIRES_OK_ASYNC(
+        ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator), done);
     // The call to `iterator->GetNext()` may block and depend on an
     // inter-op thread pool thread, so we issue the call from the
     // owned thread pool.
-    thread_pool_->Schedule([this, ctx, iterator, done]() {
-      core::ScopedUnref unref_iterator(iterator);
+    thread_pool_->Schedule(std::bind(
+        [this, ctx, iterator](DoneCallback done) {
+          core::ScopedUnref unref_iterator(iterator);
+
+          std::vector<Tensor> components;
+          bool end_of_sequence = false;
+
+          IteratorContext::Params params;
+          params.env = ctx->env();
+          params.stats_aggregator_getter = [iterator]() {
+            return iterator->stats_aggregator();
+          };
+          params.runner = *(ctx->runner());
+          params.function_library = iterator->function_library();
+          DeviceBase* device = ctx->function_library()->device();
+          params.allocator_getter = [device](AllocatorAttributes attrs) {
+            return device->GetAllocator(attrs);
+          };
+          IteratorContext iter_ctx(std::move(params));
+
+          OP_REQUIRES_OK_ASYNC(
+              ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence),
+              done);
+          OP_REQUIRES_ASYNC(ctx, !end_of_sequence,
+                            errors::OutOfRange("End of sequence"), done);
+
+          for (int i = 0; i < components.size(); ++i) {
+            // TODO(mrry): Check that the shapes match the shape attrs.
+            ctx->set_output(i, components[i]);
+          }
+
+          done();
+        },
+        std::move(done)));
+  }
 
-      std::vector<Tensor> components;
-      bool end_of_sequence = false;
+ private:
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
+};
 
-      IteratorContext::Params params;
-      params.env = ctx->env();
-      params.stats_aggregator_getter = [iterator]() {
-        return iterator->stats_aggregator();
-      };
-      params.runner = *(ctx->runner());
-      IteratorContext iter_ctx(std::move(params));
+class IteratorGetNextSyncOp : public OpKernel {
+ public:
+  explicit IteratorGetNextSyncOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
 
-      OP_REQUIRES_OK_ASYNC(
-          ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence),
-          done);
-      OP_REQUIRES_ASYNC(ctx, !end_of_sequence,
-                        errors::OutOfRange("End of sequence"), done);
+  void Compute(OpKernelContext* ctx) override {
+    IteratorResource* iterator;
+    OP_REQUIRES_OK(ctx,
+                   LookupResource(ctx, HandleFromInput(ctx, 0), &iterator));
+    core::ScopedUnref unref_iterator(iterator);
 
-      for (int i = 0; i < components.size(); ++i) {
-        // TODO(mrry): Check that the shapes match the shape attrs.
-        ctx->set_output(i, components[i]);
-      }
+    std::vector<Tensor> components;
+    bool end_of_sequence = false;
 
-      done();
-    });
-  }
+    IteratorContext::Params params;
+    params.env = ctx->env();
+    params.stats_aggregator_getter = [iterator]() {
+      return iterator->stats_aggregator();
+    };
+    params.runner = *(ctx->runner());
+    params.function_library = iterator->function_library();
+    DeviceBase* device = ctx->function_library()->device();
+    params.allocator_getter = [device](AllocatorAttributes attrs) {
+      return device->GetAllocator(attrs);
+    };
+    IteratorContext iter_ctx(std::move(params));
 
- private:
-  std::unique_ptr<thread::ThreadPool> thread_pool_;
+    OP_REQUIRES_OK(ctx,
+                   iterator->GetNext(&iter_ctx, &components, &end_of_sequence));
+    OP_REQUIRES(ctx, !end_of_sequence, errors::OutOfRange("End of sequence"));
+
+    for (int i = 0; i < components.size(); ++i) {
+      // TODO(mrry): Check that the shapes match the shape attrs.
+      ctx->set_output(i, components[i]);
+    }
+  }
 };
 
 class IteratorToStringHandleOp : public OpKernel {
@@ -885,6 +1098,8 @@ REGISTER_KERNEL_BUILDER(Name("OneShotIterator").Device(DEVICE_CPU),
                         OneShotIteratorOp);
 REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE_CPU),
                         IteratorGetNextOp);
+REGISTER_KERNEL_BUILDER(Name("IteratorGetNextSync").Device(DEVICE_CPU),
+                        IteratorGetNextSyncOp);
 REGISTER_KERNEL_BUILDER(Name("IteratorToStringHandle").Device(DEVICE_CPU),
                         IteratorToStringHandleOp);
 REGISTER_KERNEL_BUILDER(Name("IteratorFromStringHandle").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
similarity index 64%
rename from tensorflow/core/kernels/map_and_batch_dataset_op.cc
rename to tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index ad1e356dbd32b89e7d59f0908f4ea634ac476e6b..9ce263732f6e6c907dfdc89692455daa5dca86d1 100644
--- a/tensorflow/core/kernels/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -14,13 +14,13 @@ limitations under the License.
 ==============================================================================*/
 #define EIGEN_USE_THREADS
 
-#include "tensorflow/core/lib/core/blocking_counter.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/captured_function.h"
-#include "tensorflow/core/kernels/dataset.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/kernels/inplace_ops_functor.h"
+#include "tensorflow/core/lib/core/blocking_counter.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/tracing.h"
@@ -67,9 +67,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
                     "num_parallel_batches must be greater than zero."));
 
     std::unique_ptr<CapturedFunction> captured_func;
-    OP_REQUIRES_OK(ctx, CapturedFunction::Create(ctx, func_, graph_def_version_,
-                                                 std::move(other_arguments),
-                                                 &captured_func));
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(
+                            func_, std::move(other_arguments), &captured_func));
 
     *output = new Dataset(input, batch_size, num_parallel_batches,
                           output_types_, output_shapes_,
@@ -132,7 +131,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         if (current_batch_index_ != -1) {
           for (size_t batch_index = 0;
                batch_index < dataset()->num_parallel_batches_; ++batch_index) {
-            WaitForBatch(batch_index).IgnoreError();
+            int64 num_elements;
+            WaitForBatch(batch_index, &num_elements).IgnoreError();
             // Deallocate tensors allocated for the output.
             batch_results_[batch_index].output.clear();
           }
@@ -166,17 +166,35 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
           }
         }
 
-        if (end_of_input_) {
+        int64 num_elements = 0;
+        Status status = WaitForBatch(current_batch_index_, &num_elements);
+        if (num_elements == 0) {
           *end_of_sequence = true;
           return Status::OK();
         }
-
-        Status status = WaitForBatch(current_batch_index_);
         if (!status.ok()) {
           // Deallocate tensors allocated for the output.
           batch_results_[current_batch_index_].output.clear();
         } else {
-          *out_tensors = std::move(batch_results_[current_batch_index_].output);
+          if (num_elements < dataset()->batch_size_) {
+            const std::vector<Tensor>& output =
+                batch_results_[current_batch_index_].output;
+            for (size_t i = 0; i < output.size(); ++i) {
+              TensorShape component_shape(
+                  batch_results_[current_batch_index_].output[i].shape());
+              component_shape.set_dim(0, num_elements);
+              Tensor component(ctx->allocator({}), output[i].dtype(),
+                               component_shape);
+              TF_RETURN_IF_ERROR(
+                  CopyPartialBatch(&component, output[i], num_elements));
+              out_tensors->emplace_back(std::move(component));
+            }
+            // Deallocate tensors allocated for the output.
+            batch_results_[current_batch_index_].output.clear();
+          } else {
+            *out_tensors =
+                std::move(batch_results_[current_batch_index_].output);
+          }
           *end_of_sequence = false;
         }
         StartInvocationBatch(ctx, current_batch_index_);
@@ -195,6 +213,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
       struct InvocationResult {
         Status status;
+        bool end_of_input;
         std::vector<Tensor> return_values;
       };
 
@@ -202,7 +221,31 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         return batch_index * dataset()->batch_size_ + offset;
       }
 
-      void EnsureOutputAllocated(BatchResult* batch_result,
+      Status CopyPartialBatch(Tensor* output, const Tensor& value,
+                              int64 num_elements) {
+        switch (value.dtype()) {
+#define CASE(type)                                                \
+  case DataTypeToEnum<type>::value: {                             \
+    auto output_t = output->flat_outer_dims<type>();              \
+    auto value_t = value.flat_outer_dims<type>();                 \
+    for (size_t i = 0; i < num_elements; i++) {                   \
+      output_t.template chip<0>(i) = value_t.template chip<0>(i); \
+    }                                                             \
+    return Status::OK();                                          \
+  }
+          TF_CALL_NUMBER_TYPES(CASE);
+          TF_CALL_string(CASE);
+          TF_CALL_variant(CASE);
+#undef CASE
+          default:
+            return errors::InvalidArgument("Unsupported data type: ",
+                                           value.dtype());
+        }
+        return Status::OK();
+      }
+
+      void EnsureOutputAllocated(IteratorContext* ctx,
+                                 BatchResult* batch_result,
                                  const std::vector<Tensor>& return_values) {
         mutex_lock l(batch_result->mu);
         if (batch_result->output_allocated) {
@@ -212,7 +255,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         for (size_t i = 0; i < num_components; ++i) {
           TensorShape component_shape({dataset()->batch_size_});
           component_shape.AppendShape(return_values[i].shape());
-          Tensor component(cpu_allocator(), return_values[i].dtype(),
+          Tensor component(ctx->allocator({}), return_values[i].dtype(),
                            component_shape);
           batch_result->output.emplace_back(std::move(component));
         }
@@ -228,8 +271,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         // Get the next input element.
         std::vector<Tensor> input_element;
         result->status =
-            input_impl_->GetNext(ctx, &input_element, &end_of_input_);
-        if (end_of_input_ || !result->status.ok()) {
+            input_impl_->GetNext(ctx, &input_element, &result->end_of_input);
+        if (result->end_of_input || !result->status.ok()) {
           batch_result->counter->DecrementCount();
           return;
         }
@@ -237,58 +280,55 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         // Call `captured_func_(input_element)`, store the result in
         // `result->return_values`, and notify `batch_result->counter`
         // to unblock a consumer.
-        FunctionLibraryRuntime::Options opts;
-        opts.step_id = CapturedFunction::generate_step_id();
-        ScopedStepContainer* step_container = new ScopedStepContainer(
-            opts.step_id, [this, ctx](const string& name) {
-              dataset()
-                  ->captured_func_->resource_manager()
-                  ->Cleanup(name)
-                  .IgnoreError();
-            });
-        opts.step_container = step_container;
-        opts.runner = ctx->runner();
-        dataset()->captured_func_->RunAsync(
-            opts, input_element, &result->return_values,
-            [this, result, step_container, batch_result,
-             offset](Status ret_status) {
-              delete step_container;
-              result->status.Update(ret_status);
-              if (ret_status.ok()) {
-                EnsureOutputAllocated(batch_result, result->return_values);
-                const size_t num_components = result->return_values.size();
-                for (size_t i = 0; i < num_components; ++i) {
-                  const Tensor& tensor = result->return_values[i];
-                  Tensor* batch = &(batch_result->output)[i];
-                  if (tensor.NumElements() !=
-                      (batch->NumElements() / batch->dim_size(0))) {
-                    TensorShape batch_shape = batch->shape();
-                    batch_shape.RemoveDim(0);
-                    result->status.Update(errors::InvalidArgument(
-                        "Cannot add tensor to the batch: number of "
-                        "elements does not match. Shapes are: [tensor]: ",
-                        tensor.shape().DebugString(),
-                        ", [batch]: ", batch_shape.DebugString()));
-                    break;
-                  }
-                  // TODO(mrry): Add a version of DoParallelConcat that allows
-                  // us to move `tensor` where possible, to speed up string
-                  // tensor batching.
-                  Status copy_status = ::tensorflow::functor::DoParallelConcat(
-                      *dataset()->device_, tensor, offset, batch);
-                  if (!copy_status.ok()) {
-                    result->status.Update(copy_status);
-                    break;
-                  }
-                }
-              }
-              // NOTE(mrry): We clear the return values here to release any
-              // memory associated with them and to paralellize the destruction
-              // of the tensors (which can be surprisingly expensive for
-              // map functions with large numbers of return values).
-              result->return_values.clear();
-              batch_result->counter->DecrementCount();
-            });
+        (*ctx->runner())(std::bind(
+            [this, result, batch_result, offset](
+                IteratorContext* ctx, std::vector<Tensor> input_element) {
+              dataset()->captured_func_->RunAsync(
+                  ctx, std::move(input_element), &result->return_values,
+                  [this, ctx, result, batch_result, offset](Status ret_status) {
+                    result->status.Update(ret_status);
+                    if (ret_status.ok()) {
+                      EnsureOutputAllocated(ctx, batch_result,
+                                            result->return_values);
+                      const size_t num_components =
+                          result->return_values.size();
+                      for (size_t i = 0; i < num_components; ++i) {
+                        const Tensor& tensor = result->return_values[i];
+                        Tensor* batch = &(batch_result->output)[i];
+                        if (tensor.NumElements() !=
+                            (batch->NumElements() / batch->dim_size(0))) {
+                          TensorShape batch_shape = batch->shape();
+                          batch_shape.RemoveDim(0);
+                          result->status.Update(errors::InvalidArgument(
+                              "Cannot add tensor to the batch: number of "
+                              "elements does not match. Shapes are: [tensor]: ",
+                              tensor.shape().DebugString(),
+                              ", [batch]: ", batch_shape.DebugString()));
+                          break;
+                        }
+                        // TODO(mrry): Add a version of DoParallelConcat that
+                        // allows us to move `tensor` where possible, to speed
+                        // up string tensor batching.
+                        Status copy_status =
+                            ::tensorflow::functor::DoParallelConcat(
+                                *dataset()->device_, tensor, offset, batch);
+                        if (!copy_status.ok()) {
+                          result->status.Update(copy_status);
+                          break;
+                        }
+                      }
+                    }
+                    delete ctx;
+                    // NOTE(mrry): We clear the return values here to release
+                    // any memory associated with them and to paralellize the
+                    // destruction of the tensors (which can be surprisingly
+                    // expensive for map functions with large numbers of return
+                    // values).
+                    result->return_values.clear();
+                    batch_result->counter->DecrementCount();
+                  });
+            },
+            new IteratorContext(*ctx), std::move(input_element)));
       }
 
       void StartInvocationBatch(IteratorContext* ctx, int64 batch_index)
@@ -305,9 +345,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         for (size_t i = 0; i < dataset()->batch_size_; ++i) {
           size_t index = ComputeInvocationIndex(batch_index, i);
           InvocationResult* result = &invocation_results_[index];
-          // Reset the state of `result`.
-          // NOTE(mrry): `result->return_values` were cleared when the previous
-          // invocation completed.
+          // Reset the state of `result`; `result->return_values` was cleared
+          // when the previous invocation completed.
+          result->end_of_input = false;
           result->status = Status::OK();
         }
         // Start individual invocations.
@@ -316,13 +356,18 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         }
       }
 
-      Status WaitForBatch(int64 batch_index) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      Status WaitForBatch(int64 batch_index, int64* num_elements)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         port::Tracing::TraceMe activity(strings::StrCat(prefix(), "::Wait"));
         batch_results_[batch_index].counter->Wait();
         Status status = Status::OK();
-        for (size_t i = 0; i < dataset()->batch_size_; ++i) {
+        for (size_t i = 0; i < dataset()->batch_size_; ++i, ++*num_elements) {
           size_t index = ComputeInvocationIndex(batch_index, i);
           InvocationResult* result = &invocation_results_[index];
+          if (result->end_of_input) {
+            VLOG(3) << "end of input encountered at element[" << i << "]: ";
+            return Status::OK();
+          }
           if (!result->status.ok()) {
             VLOG(3) << "failed to process element[" << i
                     << "]: " << result->status;
@@ -337,7 +382,6 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
       std::vector<InvocationResult> invocation_results_ GUARDED_BY(mu_);
       std::vector<BatchResult> batch_results_ GUARDED_BY(mu_);
-      bool end_of_input_ GUARDED_BY(mu_) = false;
     };
 
     const DatasetBase* const input_;
diff --git a/tensorflow/core/kernels/map_dataset_op.cc b/tensorflow/core/kernels/data/map_dataset_op.cc
similarity index 85%
rename from tensorflow/core/kernels/map_dataset_op.cc
rename to tensorflow/core/kernels/data/map_dataset_op.cc
index 4ba09bc335e9682eef2a0c2042aa98e9b428d562..89360d1cd95e896ebf284a0058edb122c7f82d09 100644
--- a/tensorflow/core/kernels/map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_dataset_op.cc
@@ -12,15 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/random/random.h"
 
-#include "tensorflow/core/kernels/captured_function.h"
-
 namespace tensorflow {
 
 namespace {
@@ -49,9 +47,8 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
     }
 
     std::unique_ptr<CapturedFunction> captured_func;
-    OP_REQUIRES_OK(ctx, CapturedFunction::Create(ctx, func_, graph_def_version_,
-                                                 std::move(other_arguments),
-                                                 &captured_func));
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(
+                            func_, std::move(other_arguments), &captured_func));
 
     *output = new Dataset(ctx, input, func_, std::move(captured_func),
                           output_types_, output_shapes_);
@@ -98,10 +95,10 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
       Node* input_graph_node = nullptr;
       TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
 
-      DataTypeVector other_arguments_types(
-          captured_func_->captured_inputs().size());
-      std::vector<NodeBuilder::NodeOut> other_arguments(
-          captured_func_->captured_inputs().size());
+      DataTypeVector other_arguments_types;
+      other_arguments_types.reserve(captured_func_->captured_inputs().size());
+      std::vector<Node*> other_arguments;
+      other_arguments.reserve(captured_func_->captured_inputs().size());
       for (const Tensor& t : captured_func_->captured_inputs()) {
         Node* node;
         TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
@@ -143,20 +140,10 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
           return Status::OK();
         }
 
-        FunctionLibraryRuntime::Options opts;
-        opts.step_id = CapturedFunction::generate_step_id();
-        ScopedStepContainer step_container(
-            opts.step_id, [this, ctx](const string& name) {
-              dataset()
-                  ->captured_func_->resource_manager()
-                  ->Cleanup(name)
-                  .IgnoreError();
-            });
-        opts.step_container = &step_container;
-        opts.runner = ctx->runner();
         // TODO(mrry): Avoid blocking a threadpool thread. We will need to
         // stack-rip the iterators and use async kernels.
-        Status s = dataset()->captured_func_->Run(opts, args, out_tensors);
+        Status s =
+            dataset()->captured_func_->Run(ctx, std::move(args), out_tensors);
         if (errors::IsOutOfRange(s)) {
           // `f` may deliberately raise `errors::OutOfRange` to indicate
           // that we should terminate the iteration early.
@@ -173,7 +160,7 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
         return Status::OK();
diff --git a/tensorflow/core/kernels/padded_batch_dataset_op.cc b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
similarity index 76%
rename from tensorflow/core/kernels/padded_batch_dataset_op.cc
rename to tensorflow/core/kernels/data/padded_batch_dataset_op.cc
index 7c28d955e1a2ce129110f112fe87a9bce05a14a0..cfb4efda9a56fde04994201f509cf3d9fb45ea82 100644
--- a/tensorflow/core/kernels/padded_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_util.h"
+#include "tensorflow/core/kernels/batch_util.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
@@ -25,102 +25,6 @@ namespace {
 // See documentation in ../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
-// The following five functions are copied from padding_fifo_queue.cc.
-// TODO(mrry): Reconcile these functions with the similar methods in the
-// queue implementation.
-Status ValidateElementToLargerSlice(const Tensor& element, Tensor* parent) {
-  DCHECK_NE(parent->dim_size(0), 0);
-  if (element.NumElements() > (parent->NumElements() / parent->dim_size(0))) {
-    TensorShape chip_shape = parent->shape();
-    chip_shape.RemoveDim(0);
-    return errors::Internal(
-        "HandleElementToLargerSlice Cannot copy slice: number of entries in "
-        "element is greater than number of elements in parent slice.  ",
-        "Shapes are: [element]: ", element.shape().DebugString(),
-        ", [parent slice]: ", chip_shape.DebugString());
-  }
-  return Status::OK();
-}
-
-template <typename T, int NDIMS>
-Status HandleElementToLargerSlice(const Tensor& element, Tensor* parent,
-                                  int index) {
-  TF_RETURN_IF_ERROR(ValidateElementToLargerSlice(element, parent));
-  if (element.NumElements() == 0) {
-    return Status::OK();
-  }
-  auto element_t = element.tensor<T, NDIMS>();
-  auto parent_t = parent->tensor<T, NDIMS + 1>();
-  Eigen::DSizes<Eigen::DenseIndex, NDIMS + 1> slice_indices;
-  slice_indices[0] = index;
-  Eigen::DSizes<Eigen::DenseIndex, NDIMS + 1> slice_size;
-  slice_size[0] = 1;
-  for (size_t i = 1; i < slice_size.size(); ++i) {
-    slice_size[i] = element_t.dimension(i - 1);
-  }
-  parent_t.slice(slice_indices, slice_size) = element_t.reshape(slice_size);
-  return Status::OK();
-}
-
-template <int NDIMS>
-Status HandleElementToLargerSliceWithRank(const Tensor& element, Tensor* parent,
-                                          int index) {
-#define HANDLE_TYPE(T)                                                   \
-  case DataTypeToEnum<T>::value: {                                       \
-    return HandleElementToLargerSlice<T, NDIMS>(element, parent, index); \
-  }
-
-  switch (element.dtype()) {
-    TF_CALL_DATASET_TYPES(HANDLE_TYPE);
-#undef HANDLE_TYPE
-    default:
-      return errors::Unimplemented(
-          "HandleElementToLargerSliceWithRank Unhandled data type: ",
-          element.dtype());
-  }
-}
-
-Status CopyElementToLargerSlice(const Tensor& element, Tensor* parent,
-                                int index) {
-  if (parent->dims() != element.dims() + 1) {
-    return errors::Internal(
-        "Mismatched ranks.  Element's rank is: ", element.dims(),
-        " but element is meant to be a slice in output Tensor having rank: ",
-        parent->dims(), " (should be: ", element.dims() + 1, ")");
-  }
-
-#define HANDLE_DIMS(NDIMS)                                                  \
-  case NDIMS: {                                                             \
-    TF_RETURN_IF_ERROR(                                                     \
-        HandleElementToLargerSliceWithRank<NDIMS>(element, parent, index)); \
-    return Status::OK();                                                    \
-  }
-
-  switch (element.dims()) {
-    HANDLE_DIMS(0);
-    HANDLE_DIMS(1);
-    HANDLE_DIMS(2);
-    HANDLE_DIMS(3);
-    HANDLE_DIMS(4);
-#undef HANDLE_DIMS
-    default:
-      return errors::Unimplemented("CopyElementToLargerSlice Unhandled rank: ",
-                                   element.dims());
-  }
-}
-
-Status SetElementZero(Tensor* element, const Tensor& padding) {
-#define HANDLE_TYPE(T)                                     \
-  if (element->dtype() == DataTypeToEnum<T>::value) {      \
-    element->flat<T>().setConstant(padding.scalar<T>()()); \
-    return Status::OK();                                   \
-  }
-  TF_CALL_DATASET_TYPES(HANDLE_TYPE);
-#undef HANDLE_TYPE
-  return errors::Unimplemented("SetElementZero Unhandled data type: ",
-                               element->dtype());
-}
-
 class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit PaddedBatchDatasetOp(OpKernelConstruction* ctx)
@@ -242,7 +146,7 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
       Node* batch_size = nullptr;
       TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size));
 
-      std::vector<NodeBuilder::NodeOut> padded_shapes;
+      std::vector<Node*> padded_shapes;
       padded_shapes.reserve(padded_shapes_.size());
       for (int i = 0; i < padded_shapes_.size(); i++) {
         Node* node;
@@ -254,7 +158,7 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
         padded_shapes.emplace_back(node);
       }
 
-      std::vector<NodeBuilder::NodeOut> padding_values;
+      std::vector<Node*> padding_values;
       padding_values.reserve(padding_values_.size());
       for (const Tensor& t : padding_values_) {
         Node* node;
@@ -377,20 +281,27 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
 
           // 2. Copy each batch element to the appropriate location in
           // the output component tensor.
-          Tensor batch_component(cpu_allocator(),
+          Tensor batch_component(ctx->allocator({}),
                                  output_dtypes()[component_index],
                                  batch_component_shape);
-          TF_RETURN_IF_ERROR(SetElementZero(
+          TF_RETURN_IF_ERROR(batch_util::SetElementZero(
               &batch_component, dataset()->padding_values_[component_index]));
 
           // Build the output tuple component by copying one slice
           // from each input element in the batch.
+          TensorShape component_shape({});
+          for (int i = 1; i < batch_component_shape.dims(); ++i) {
+            component_shape.AddDim(batch_component_shape.dim_size(i));
+          }
           for (int64 i = 0; i < num_batch_elements; ++i) {
-            TF_RETURN_IF_ERROR(ValidateElementToLargerSlice(
-                batch_elements[i][component_index], &batch_component));
-
-            TF_RETURN_IF_ERROR(CopyElementToLargerSlice(
-                batch_elements[i][component_index], &batch_component, i));
+            // Take the fast path if possible.
+            if (batch_elements[i][component_index].shape() == component_shape) {
+              TF_RETURN_IF_ERROR(batch_util::CopyElementToSlice(
+                  batch_elements[i][component_index], &batch_component, i));
+            } else {
+              TF_RETURN_IF_ERROR(batch_util::CopyElementToLargerSlice(
+                  batch_elements[i][component_index], &batch_component, i));
+            }
           }
           out_tensors->push_back(std::move(batch_component));
         }
@@ -408,7 +319,7 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         if (reader->Contains(full_name("exhausted"))) {
diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3f88d6dee80ea7b07ef1ce88ee76edba65cddcde
--- /dev/null
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
@@ -0,0 +1,503 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <deque>
+
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/random/random.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit ParallelInterleaveDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx),
+        graph_def_version_(ctx->graph_def_version()) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    OpInputList inputs;
+    OP_REQUIRES_OK(ctx, ctx->input_list("other_arguments", &inputs));
+    std::vector<Tensor> other_arguments;
+    other_arguments.reserve(inputs.size());
+    for (const Tensor& t : inputs) {
+      other_arguments.push_back(t);
+    }
+
+    int64 cycle_length = 0;
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument(ctx, "cycle_length", &cycle_length));
+    OP_REQUIRES(ctx, cycle_length > 0,
+                errors::InvalidArgument("`cycle_length` must be > 0"));
+
+    int64 block_length = 0;
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument(ctx, "block_length", &block_length));
+    OP_REQUIRES(ctx, block_length > 0,
+                errors::InvalidArgument("`block_length` must be > 0"));
+
+    bool sloppy = false;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "sloppy", &sloppy));
+
+    int64 buffer_output_elements = 0;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "buffer_output_elements",
+                                            &buffer_output_elements));
+    OP_REQUIRES(
+        ctx, buffer_output_elements > 0,
+        errors::InvalidArgument("`buffer_output_elements` must be > 0"));
+
+    int64 prefetch_input_elements = 0;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "prefetch_input_elements",
+                                            &prefetch_input_elements));
+    OP_REQUIRES(
+        ctx, prefetch_input_elements >= 0,
+        errors::InvalidArgument("`prefetch_input_elements` must be >= 0"));
+
+    std::unique_ptr<CapturedFunction> captured_func;
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(
+                            func_, std::move(other_arguments), &captured_func));
+
+    *output =
+        new Dataset(input, std::move(captured_func), cycle_length, block_length,
+                    sloppy, buffer_output_elements, prefetch_input_elements,
+                    output_types_, output_shapes_);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(const DatasetBase* input,
+            std::unique_ptr<CapturedFunction> captured_func, int64 cycle_length,
+            int64 block_length, bool sloppy, int64 buffer_output_elements,
+            int64 prefetch_input_elements, const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes)
+        : input_(input),
+          captured_func_(std::move(captured_func)),
+          cycle_length_(cycle_length),
+          block_length_(block_length),
+          sloppy_(sloppy),
+          buffer_output_elements_(buffer_output_elements),
+          prefetch_input_elements_(prefetch_input_elements),
+          output_types_(output_types),
+          output_shapes_(output_shapes) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          {this, strings::StrCat(prefix, "::ParallelInterleave")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() override {
+      return "ParallelInterleaveDatasetOp::Dataset";
+    }
+
+   private:
+    int64 num_threads() const {
+      return cycle_length_ + prefetch_input_elements_;
+    }
+
+    // Parallel interleave's implementation is designed around a few principles:
+    //  1. Thread creation is relatively expensive. (Not reusing
+    //     threads causes a number of indirect costs such as poorer tcmalloc
+    //     performance due to thread-local caches, etc.) We allocate a fixed
+    //     number of threads at the start and never change. This is why we've
+    //     fused functionality that is theoretically orthogonal (i.e.
+    //     .prefetch()) into the implementation.
+    //  2. Drop-in replacement for standard interleave. The goal will be to
+    //     auto-opt people into an optimized implementation without any work
+    //     on the customer's part. We thus go through great pains to maintain
+    //     identical iteration orders, full determinism (disabled only via a
+    //     flag, etc.)
+    //  3. Performance across a variety of environments and I/O envelopes.
+    //
+    // The actual implementation centers around a collection of worker threads
+    // and their corresponding worker state (tracked in the `workers_` vector).
+    // Worker threads repeatedly receive a vector of Tensors that are used as
+    // input to the flat-map function (`captured_func_`). The output of this
+    // function must be a dataset. The worker thread then repeatedly calls
+    // `GetNext()`, maintaining a buffer of elements to minimize the likelihood
+    // that a caller will block waiting for an element to be produced.
+    //
+    // Pointers to these worker states are kept in 2 disjoint data structures:
+    //  1. `interleave_` is a vector containing pointers to `WorkerState`s that
+    //  we
+    //     are interleaving. Worker threads backing these WorkerStates should
+    //     be regularly producing values.
+    //  2. `staging_` is a deque containing pointers to WorkerStates that we
+    //     will move to `interleave_` when an iterator in `interleave_` is
+    //     exhausted.
+    //
+    // The client calls `GetNext[Internal]()` to retrieve an output element. The
+    // internal implementation updates the state of `interleave_` and `staging_`
+    // as output iterators (run by the worker threads) are exhausted.
+    //
+    // `input_impl_` is the input iterator that generates arguments for the
+    // flat-map function (`captured_func_`). It is set to an iterator at
+    // Iterator construction, and is fixed until we consume all input elements.
+    // Once it is exhausted, we reset the unique_ptr to eagerly deallocate
+    // memory.
+    //
+    // A few invariants are maintained:
+    //  1. No element in interleave_ should be a nullptr unless `staging_` is
+    //     empty and `input_impl_` is empty.
+    //  2. Every `worker_` element is pointed to by at most one element of the
+    //     union of `interleave_` and `staging_`.
+    //  3. Unless `input_impl_` is empty, every `worker_` must be pointed to by
+    //     an element in `interleave_` or `staging_`.
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
+            workers_(dataset()->num_threads()) {}
+
+      ~Iterator() override {
+        mutex_lock l(mu_);
+        cancelled_ = true;
+        // Notify all workers in case they are blocked.
+        for (auto& worker : workers_) {
+          worker.cond_var.notify_all();
+        }
+      }
+
+      // It is implemented so that it matches the deterministic interleave
+      // unless getting the next element would block and we are allowed to be
+      // sloppy.
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(EnsureWorkerThreadsStarted(ctx));
+        while (!cancelled_) {
+          // Wait for an item to become available, blocking if necessary. If we
+          // are allowed to be sloppy, we can skip over input datasets that do
+          // not have an item readily available.
+          bool can_produce_elements = false;
+          bool must_wait_for_input = true;
+          for (int64 i = 0; i < interleave_.size(); ++i) {
+            int64 index = (next_index_ + i) % interleave_.size();
+            WorkerState* current_worker = interleave_[index];
+            if (!current_worker) continue;  // Empty interleave elements.
+            can_produce_elements |= current_worker->MayHaveElements();
+            if (!current_worker->outputs.empty()) {
+              // We have an element!
+              next_index_ = index;
+              if (i == 0) {
+                block_count_++;
+                if (block_count_ == dataset()->block_length_) {
+                  next_index_ = (index + 1) % interleave_.size();
+                  block_count_ = 0;
+                }
+              } else {
+                block_count_ = 0;
+              }
+              *end_of_sequence = false;
+              Status s = current_worker->outputs.front().status;
+              current_worker->outputs.front().output.swap(*out_tensors);
+              current_worker->outputs.pop_front();
+              current_worker->cond_var.notify_one();
+              return s;
+            } else if (current_worker->is_producing && !dataset()->sloppy_) {
+              // current_worker.outputs.empty(), and we must wait for this
+              // iterator.
+              if (next_index_ != index) {
+                // We have advanced to a new iterator; reset block counts.
+                next_index_ = index;
+                block_count_ = 0;
+              }
+              break;
+            } else if (!current_worker->is_producing) {
+              // This iterator has reached end of input.
+              interleave_[index] = nullptr;
+              if (input_impl_) {
+                // Start prefetching a new iterator.
+                std::vector<Tensor> args;
+                bool end_of_input = false;
+                Status s = input_impl_->GetNext(ctx, &args, &end_of_input);
+                if (end_of_input) {
+                  input_impl_.reset();
+                } else {
+                  current_worker->SetInputs(s, std::move(args));
+                  staging_.emplace_back(current_worker);
+                }
+              }
+
+              if (!staging_.empty()) {
+                // Move a worker from `staging_` to `interleave_`.
+                interleave_[index] = staging_.front();
+                staging_.pop_front();
+
+                next_index_ = (index + 1) % interleave_.size();
+                block_count_ = 0;
+                // Restart the inner [for] loop
+                can_produce_elements = true;
+                must_wait_for_input = false;
+                break;
+              }
+            }
+          }
+
+          if (!can_produce_elements && !input_impl_) {
+            // No potential for future values.
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+
+          if (must_wait_for_input) {
+            // Wait for elements to become available.
+            if (dataset()->sloppy_) {
+              sloppy_cond_var_.wait(l);
+            } else {
+              interleave_[next_index_]->cond_var.wait(l);
+            }
+          }
+        }
+        return errors::Cancelled(
+            "ParallelInterleaveDatasetOp::Dataset::Iterator::GetNext");
+      }
+
+     private:
+      // OutputElem contains the information from a call to GetNext by an output
+      // iterator.
+      struct OutputElem {
+        // The output iterator sets `status` if getting the output element
+        // fails.
+        Status status;
+        // The buffered data element.
+        std::vector<Tensor> output;
+
+        explicit OutputElem(const Status& s) : status(s) {}
+      };
+
+      // Worker threads operate on their relevant WorkerState structs.
+      //
+      // WorkerState's fields are all protected by mu_;
+      struct WorkerState {
+        // The arguments to be used to construct an output iterator.
+        std::vector<Tensor> input;
+        // The buffered output elements.
+        std::deque<OutputElem> outputs;
+        // Set to true iff the worker thread expects to append more elements to
+        // outputs. is_producing can be false despite !outputs.empty().
+        // Concretely, all output elements will have been consumed only when:
+        // is_producing == false && outputs.empty();
+        bool is_producing = false;
+        // Condition variable used to coordinate between threads. The worker
+        // thread waits on this condition variable when it is either (1) waiting
+        // for the main thread to add arguments to `input`, or (2) waiting for
+        // the main thread to consume an element of `outputs`. The main thread
+        // waits on cond_var if it is waiting for the worker thread to produce
+        // an element into `outputs` (this implies sloppy_==false).
+        condition_variable cond_var;
+
+        inline bool MayHaveElements() const {
+          return is_producing || !outputs.empty();
+        }
+
+        // Sets inputs for a worker thread and notifies it to start processing.
+        void SetInputs(const Status& s, std::vector<Tensor> input_arguments) {
+          if (s.ok()) {
+            DCHECK(!MayHaveElements())
+                << "Tried to start inputs, despite already producing!";
+            input = std::move(input_arguments);
+            is_producing = true;
+            cond_var.notify_one();
+          } else {
+            outputs.emplace_back(s);
+          }
+        }
+      };
+
+      Status EnsureWorkerThreadsStarted(IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        if (worker_threads_.empty()) {
+          worker_threads_.reserve(dataset()->num_threads());
+          for (int64 i = 0; i < dataset()->num_threads(); ++i) {
+            std::vector<Tensor> args;
+            bool end_of_input = false;
+            Status s = input_impl_->GetNext(ctx, &args, &end_of_input);
+            if (end_of_input) {
+              input_impl_.reset();
+              return Status::OK();
+            }
+            workers_[i].SetInputs(s, std::move(args));
+            worker_threads_.emplace_back(ctx->env()->StartThread(
+                {}, "worker_thread",
+                std::bind(&Iterator::WorkerThread, this,
+                          new IteratorContext(*ctx), i)));
+            if (i < dataset()->cycle_length_) {
+              interleave_.push_back(&workers_[i]);
+            } else {
+              staging_.push_back(&workers_[i]);
+            }
+          }
+          DCHECK(interleave_.size() == dataset()->cycle_length_);
+          DCHECK(staging_.size() == dataset()->prefetch_input_elements_);
+        }
+        return Status::OK();
+      }
+
+      // Produces elements into the worker's output buffers.
+      void WorkerThread(IteratorContext* ctx_ptr, const int64 thread_index) {
+        // std::function arguments are copy-constructable, so we pass raw
+        // pointers, and then immediately wrap them to ensure correct ownership.
+        std::unique_ptr<IteratorContext> ctx(ctx_ptr);
+        auto cleanup = gtl::MakeCleanup([this, thread_index] {
+          mutex_lock l(mu_);
+          workers_[thread_index].cond_var.notify_all();
+        });
+
+        while (true) {
+          // 1. Wait for input.
+          std::vector<Tensor> input;
+          {
+            mutex_lock l(mu_);
+            while (!cancelled_ && !workers_[thread_index].is_producing) {
+              workers_[thread_index].cond_var.wait(l);
+            }
+            if (cancelled_) return;
+            input.swap(workers_[thread_index].input);
+          }
+
+          // 2. Run the user defined function to produce a new iterator.
+          std::unique_ptr<IteratorBase> iterator;
+          Status s = dataset::MakeIteratorFromInputElement(
+              ctx.get(), input, thread_index, dataset()->captured_func_.get(),
+              prefix(), &iterator);
+          input.clear();  // Release memory as early as possible.
+
+          if (!s.ok()) {
+            mutex_lock l(mu_);
+            workers_[thread_index].outputs.emplace_back(s);
+            workers_[thread_index].is_producing = false;
+            workers_[thread_index].cond_var.notify_one();
+          } else {
+            // 3. Produce elements
+            bool end_of_sequence = false;
+            while (!end_of_sequence) {
+              // 3.a Produce an element!
+              std::vector<Tensor> output_elem;
+              s = iterator->GetNext(ctx.get(), &output_elem, &end_of_sequence);
+
+              // 3.b Make it available to the client.
+              {
+                mutex_lock l(mu_);
+
+                // Wait for space in the prefetch queue.
+                while (!cancelled_ && workers_[thread_index].outputs.size() ==
+                                          dataset()->buffer_output_elements_) {
+                  workers_[thread_index].cond_var.wait(l);
+                }
+                if (cancelled_) return;
+
+                // Output the element.
+                workers_[thread_index].is_producing = !end_of_sequence;
+                if (!end_of_sequence) {
+                  workers_[thread_index].outputs.emplace_back(s);
+                  workers_[thread_index].outputs.back().output.swap(
+                      output_elem);
+                }
+                if (dataset()->sloppy_) {
+                  sloppy_cond_var_.notify_one();
+                } else {
+                  workers_[thread_index].cond_var.notify_one();
+                }
+              }
+            }
+          }
+        }
+      }
+
+      // Mutex & condition variable to guard mutable iterator internals and
+      // coordinate among worker threads and client thread[s].
+      mutex mu_;
+      // The main thread waits on this condition variable if running in sloppy
+      // mode and no values are available.
+      condition_variable sloppy_cond_var_;
+
+      // The iterator producing elements which are converted to datasets by
+      // the dataset()->captured_func_ then interleaved together.
+      // input_impl_ is reset when we have exhausted its input.
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+
+      // The WorkerState structs the worker threads operate on.
+      // workers_ elements are in at most one of interleave_ and staging_.
+      std::vector<WorkerState> workers_ GUARDED_BY(mu_);
+
+      // The iterators to interleave
+      std::vector<WorkerState*> interleave_ GUARDED_BY(mu_);
+      // Prefetched iterators
+      std::deque<WorkerState*> staging_ GUARDED_BY(mu_);
+
+      // The index into output_elements_ for next element to produce.
+      size_t next_index_ GUARDED_BY(mu_) = 0;
+      // The number of items produced so far within the block
+      size_t block_count_ GUARDED_BY(mu_) = 0;
+      // Flag to instruct the worker threads to exit.
+      bool cancelled_ GUARDED_BY(mu_) = false;
+      // The worker threads. This must be last to ensure the
+      // threads have exited before any other members are deallocated.
+      // TODO(b/65178177): Avoid allocating additional threads.
+      std::vector<std::unique_ptr<Thread>> worker_threads_ GUARDED_BY(mu_);
+    };
+
+    const DatasetBase* const input_;
+    const std::unique_ptr<CapturedFunction> captured_func_;
+    const int64 cycle_length_;
+    const int64 block_length_;
+    const bool sloppy_;
+    const int64 buffer_output_elements_;
+    const int64 prefetch_input_elements_;
+    const DataTypeVector output_types_;
+    const std::vector<PartialTensorShape> output_shapes_;
+  };
+
+  const int graph_def_version_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+  NameAttrList func_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("ParallelInterleaveDataset").Device(DEVICE_CPU),
+                        ParallelInterleaveDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bc4426a9fdbab971a4e49d57ffcea6896fc037a7
--- /dev/null
+++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
@@ -0,0 +1,403 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <deque>
+
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/random/random.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit ParallelMapDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx),
+        graph_def_version_(ctx->graph_def_version()) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    OpInputList inputs;
+    OP_REQUIRES_OK(ctx, ctx->input_list("other_arguments", &inputs));
+    std::vector<Tensor> other_arguments;
+    other_arguments.reserve(inputs.size());
+    for (const Tensor& t : inputs) {
+      other_arguments.push_back(t);
+    }
+
+    int32 num_parallel_calls;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls",
+                                            &num_parallel_calls));
+    OP_REQUIRES(ctx, num_parallel_calls > 0,
+                errors::InvalidArgument(
+                    "num_parallel_calls must be greater than zero."));
+
+    std::unique_ptr<CapturedFunction> captured_func;
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(
+                            func_, std::move(other_arguments), &captured_func));
+
+    *output = new Dataset(ctx, input, func_, num_parallel_calls, output_types_,
+                          output_shapes_, std::move(captured_func));
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& func, int32 num_parallel_calls,
+            const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes,
+            std::unique_ptr<CapturedFunction> captured_func)
+        : GraphDatasetBase(ctx),
+          input_(input),
+          func_(func),
+          num_parallel_calls_(num_parallel_calls),
+          output_types_(output_types),
+          output_shapes_(output_shapes),
+          captured_func_(std::move(captured_func)) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::ParallelMap")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() override { return "ParallelMapDatasetOp::Dataset"; }
+
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      // Input: input_dataset
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+
+      // Input: other_arguments
+      DataTypeVector other_arguments_types;
+      other_arguments_types.reserve(captured_func_->captured_inputs().size());
+      std::vector<Node*> other_arguments;
+      other_arguments.reserve(captured_func_->captured_inputs().size());
+      for (const Tensor& t : captured_func_->captured_inputs()) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        other_arguments.emplace_back(node);
+        other_arguments_types.emplace_back(t.dtype());
+      }
+
+      // Input: num_parallel_calls
+      Node* num_parallel_calls = nullptr;
+      TF_RETURN_IF_ERROR(
+          b->AddScalar(num_parallel_calls_, &num_parallel_calls));
+
+      // Attr: f
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
+      AttrValue f;
+      b->BuildAttrValue(func_, &f);
+
+      // Attr: Targuments
+      AttrValue other_arguments_types_attr;
+      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this,
+          {std::make_pair(0, input_graph_node),
+           std::make_pair(2, num_parallel_calls)},  // Single tensor inputs.
+          {std::make_pair(1, other_arguments)},     // Tensor list inputs.
+          {std::make_pair("f", f),
+           std::make_pair("Targuments", other_arguments_types_attr)},  // Attrs
+          output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
+            invocation_results_(params.dataset->num_parallel_calls_) {}
+
+      ~Iterator() override {
+        // TODO(mrry): Replace this cancellation logic with a
+        // CancellationManager. The syntax would be more heavyweight,
+        // but it would be possible to thread a cancellation manager
+        // through the IteratorContext to upstream,
+        // potentially-blocking iterators, when we add these.
+        {
+          mutex_lock l(mu_);
+          for (size_t i = 0; i < dataset()->num_parallel_calls_; ++i) {
+            if (invocation_results_[i].notification) {
+              invocation_results_[i].notification->WaitForNotification();
+            }
+          }
+        }
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+
+        // Ensure that there are `dataset()->num_parallel_calls_`
+        // invocations of `func_` outstanding at once.
+        while (input_impl_ && (num_inputs_consumed_ - num_outputs_consumed_ <
+                               dataset()->num_parallel_calls_)) {
+          InvokeFunctionLocked(ctx);
+        }
+
+        if (!input_impl_ && num_inputs_consumed_ == num_outputs_consumed_) {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
+
+        // Read the next result out of `invocation_results_`, which
+        // acts as a circular buffer.
+        const size_t result_index =
+            num_outputs_consumed_ % dataset()->num_parallel_calls_;
+        InvocationResult* result = &invocation_results_[result_index];
+        *end_of_sequence = false;
+        if (result->notification) {
+          result->notification->WaitForNotification();
+          if (result->status.ok()) {
+            std::swap(*out_tensors, result->return_values);
+          }
+        }
+        ++num_outputs_consumed_;
+        return result->status;
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (input_impl_) {
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        } else {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("end_of_input"), ""));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("num_inputs_consumed"),
+                                               num_inputs_consumed_));
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name("num_outputs_consumed"), num_outputs_consumed_));
+
+        for (size_t i = 0; i < dataset()->num_parallel_calls_; i++) {
+          if (invocation_results_[i].notification) {
+            invocation_results_[i].notification->WaitForNotification();
+            TF_RETURN_IF_ERROR(
+                WriteStatusLocked(writer, i, invocation_results_[i].status));
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                full_name(strings::StrCat("invocation_results[", i, "].size")),
+                invocation_results_[i].return_values.size()));
+            for (size_t j = 0; j < invocation_results_[i].return_values.size();
+                 j++) {
+              TF_RETURN_IF_ERROR(writer->WriteTensor(
+                  full_name(
+                      strings::StrCat("invocation_results[", i, "][", j, "]")),
+                  invocation_results_[i].return_values[j]));
+            }
+          } else {
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                full_name(strings::StrCat("invocation_results[", i, "]_empty")),
+                ""));
+          }
+        }
+
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (reader->Contains(full_name("end_of_input"))) {
+          input_impl_.reset();
+        } else {
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        }
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("num_inputs_consumed"),
+                                              &num_inputs_consumed_));
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("num_outputs_consumed"),
+                                              &num_outputs_consumed_));
+        for (size_t i = 0; i < dataset()->num_parallel_calls_; i++) {
+          InvocationResult* result = &invocation_results_[i];
+          *result = InvocationResult();
+          if (!reader->Contains(full_name(
+                  strings::StrCat("invocation_results[", i, "]_empty")))) {
+            result->notification.reset(new Notification);
+            result->notification->Notify();
+            TF_RETURN_IF_ERROR(ReadStatusLocked(reader, i, &result->status));
+            size_t num_return_values;
+            {
+              int64 size;
+              TF_RETURN_IF_ERROR(
+                  reader->ReadScalar(full_name(strings::StrCat(
+                                         "invocation_results[", i, "].size")),
+                                     &size));
+              num_return_values = static_cast<size_t>(size);
+              if (num_return_values != size) {
+                return errors::InvalidArgument(strings::StrCat(
+                    full_name(
+                        strings::StrCat("invocation_results[", i, "].size")),
+                    ": ", size, " is not a valid value of type size_t."));
+              }
+            }
+            result->return_values.reserve(num_return_values);
+            for (size_t j = 0; j < num_return_values; j++) {
+              result->return_values.emplace_back();
+              TF_RETURN_IF_ERROR(reader->ReadTensor(
+                  full_name(
+                      strings::StrCat("invocation_results[", i, "][", j, "]")),
+                  &result->return_values.back()));
+            }
+          }
+        }
+        return Status::OK();
+      }
+
+     private:
+      struct InvocationResult {
+        Status status;
+        std::unique_ptr<Notification> notification;
+        std::vector<Tensor> return_values;
+      };
+
+      void InvokeFunctionLocked(IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        DCHECK(input_impl_);
+        DCHECK(num_inputs_consumed_ - num_outputs_consumed_ <
+               dataset()->num_parallel_calls_);
+
+        // The result of invoking the function will be written into the next
+        // slot in `invocation_results_`, which acts as a circular buffer.
+        const size_t result_index =
+            num_inputs_consumed_ % dataset()->num_parallel_calls_;
+        InvocationResult* result = &invocation_results_[result_index];
+        *result = InvocationResult();
+
+        // Get the next input element.
+        std::vector<Tensor> input_element;
+        bool end_of_input;
+        result->status =
+            input_impl_->GetNext(ctx, &input_element, &end_of_input);
+        if (end_of_input) {
+          input_impl_.reset();
+          result->status = errors::OutOfRange("");
+        } else {
+          ++num_inputs_consumed_;
+        }
+
+        if (result->status.ok()) {
+          // Call `func_(input_element)`, store the result in
+          // `result->return_values`, and notify `result->notification`
+          // to unblock a consumer.
+          result->notification.reset(new Notification);
+          dataset()->captured_func_->RunAsync(
+              ctx, std::move(input_element), &result->return_values,
+              [result, result_index](Status ret_status) {
+                result->status.Update(ret_status);
+                result->notification->Notify();
+              });
+        }
+      }
+
+      Status WriteStatusLocked(IteratorStateWriter* writer, size_t index,
+                               const Status& status)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            CodeKey(index), static_cast<int64>(status.code())));
+        if (!status.ok()) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(ErrorMessageKey(index),
+                                                 status.error_message()));
+        }
+        return Status::OK();
+      }
+
+      Status ReadStatusLocked(IteratorStateReader* reader, size_t index,
+                              Status* status) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        int64 code_int;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(CodeKey(index), &code_int));
+        error::Code code = static_cast<error::Code>(code_int);
+
+        if (code != error::Code::OK) {
+          string error_message;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(ErrorMessageKey(index), &error_message));
+          *status = Status(code, error_message);
+        } else {
+          *status = Status::OK();
+        }
+        return Status::OK();
+      }
+
+      string CodeKey(size_t index) {
+        return full_name(
+            strings::StrCat("invocation_results[", index, "].code"));
+      }
+
+      string ErrorMessageKey(size_t index) {
+        return full_name(
+            strings::StrCat("invocation_results[", index, "].error_message"));
+      }
+
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      std::vector<InvocationResult> invocation_results_ GUARDED_BY(mu_);
+      int64 num_inputs_consumed_ GUARDED_BY(mu_) = 0;
+      int64 num_outputs_consumed_ GUARDED_BY(mu_) = 0;
+    };
+
+    const DatasetBase* const input_;
+    const NameAttrList func_;
+    const int32 num_parallel_calls_;
+    const DataTypeVector output_types_;
+    const std::vector<PartialTensorShape> output_shapes_;
+    const std::unique_ptr<CapturedFunction> captured_func_;
+  };
+
+  const int graph_def_version_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+  NameAttrList func_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("ParallelMapDataset").Device(DEVICE_CPU),
+                        ParallelMapDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
similarity index 98%
rename from tensorflow/core/kernels/prefetch_dataset_op.cc
rename to tensorflow/core/kernels/data/prefetch_dataset_op.cc
index b02269f525a8bec3b6ddb01a5039316a7c47a309..1c548a30d2c8e7f33db85000d0f480b3151d6ecf 100644
--- a/tensorflow/core/kernels/prefetch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
@@ -16,7 +16,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/dataset.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 
 namespace tensorflow {
@@ -164,7 +164,7 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
                 buffer_element.value.size()));
             for (size_t j = 0; j < buffer_element.value.size(); j++) {
               TF_RETURN_IF_ERROR(writer->WriteTensor(
-                  strings::StrCat("buffer[", i, "][", j, "]"),
+                  full_name(strings::StrCat("buffer[", i, "][", j, "]")),
                   buffer_element.value[j]));
             }
           }
@@ -172,7 +172,7 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock parent_l(parent_mu_);
         mutex_lock l(mu_);
@@ -201,7 +201,7 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
             for (size_t j = 0; j < value_size; j++) {
               buffer_element.value.emplace_back();
               TF_RETURN_IF_ERROR(reader->ReadTensor(
-                  strings::StrCat("buffer[", i, "][", j, "]"),
+                  full_name(strings::StrCat("buffer[", i, "][", j, "]")),
                   &buffer_element.value.back()));
             }
           }
diff --git a/tensorflow/core/kernels/data/random_dataset_op.cc b/tensorflow/core/kernels/data/random_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..210b9ad1b84eeb0c106b0ee538b4957aba7ce1b2
--- /dev/null
+++ b/tensorflow/core/kernels/data/random_dataset_op.cc
@@ -0,0 +1,154 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class RandomDatasetOp : public DatasetOpKernel {
+ public:
+  explicit RandomDatasetOp(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
+    int64 seed;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "seed", &seed));
+
+    int64 seed2;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "seed2", &seed2));
+
+    // By TensorFlow convention, passing 0 for both seeds indicates
+    // that the shuffling should be seeded non-deterministically.
+    if (seed == 0 && seed2 == 0) {
+      seed = random::New64();
+      seed2 = random::New64();
+    }
+
+    *output = new Dataset(ctx, seed, seed2);
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, int64 seed, int64 seed2)
+        : GraphDatasetBase(ctx), seed_(seed), seed2_(seed2) {}
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::Random")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      static DataTypeVector* dtypes = new DataTypeVector({DT_INT64});
+      return *dtypes;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      static std::vector<PartialTensorShape>* shapes =
+          new std::vector<PartialTensorShape>({{}});
+      return *shapes;
+    }
+
+    string DebugString() override {
+      return strings::StrCat("RandomDatasetOp(", seed_, ", ", seed2_,
+                             ")::Dataset");
+    }
+
+   protected:
+    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* seed = nullptr;
+      Node* seed2 = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(seed_, &seed));
+      TF_RETURN_IF_ERROR(b->AddScalar(seed2_, &seed2));
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {seed, seed2}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            parent_generator_(dataset()->seed_, dataset()->seed2_),
+            generator_(&parent_generator_) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        Tensor value_tensor(ctx->allocator({}), DT_INT64, {});
+        value_tensor.scalar<int64>()() = Random();
+        out_tensors->emplace_back(std::move(value_tensor));
+        *end_of_sequence = false;
+        return Status::OK();
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("num_random_samples"),
+                                               num_random_samples_));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("num_random_samples"),
+                                              &num_random_samples_));
+        parent_generator_ =
+            random::PhiloxRandom(dataset()->seed_, dataset()->seed2_);
+        generator_ = random::SingleSampleAdapter<random::PhiloxRandom>(
+            &parent_generator_);
+        generator_.Skip(num_random_samples_);
+        return Status::OK();
+      }
+
+     private:
+      random::SingleSampleAdapter<random::PhiloxRandom>::ResultType Random()
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        num_random_samples_++;
+        auto out = generator_();
+        return out;
+      }
+      mutex mu_;
+      random::PhiloxRandom parent_generator_ GUARDED_BY(mu_);
+      random::SingleSampleAdapter<random::PhiloxRandom> generator_
+          GUARDED_BY(mu_);
+      int64 num_random_samples_ GUARDED_BY(mu_) = 0;
+    };
+
+    const int64 seed_;
+    const int64 seed2_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("RandomDataset").Device(DEVICE_CPU),
+                        RandomDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/range_dataset_op.cc b/tensorflow/core/kernels/data/range_dataset_op.cc
similarity index 96%
rename from tensorflow/core/kernels/range_dataset_op.cc
rename to tensorflow/core/kernels/data/range_dataset_op.cc
index e7ae840fc7d023cda8c11ecd1f7cde3842a9da00..b57518e678ed185a183e0413d6e90f2a9f85e9fc 100644
--- a/tensorflow/core/kernels/range_dataset_op.cc
+++ b/tensorflow/core/kernels/data/range_dataset_op.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
@@ -101,7 +100,7 @@ class RangeDatasetOp : public DatasetOpKernel {
           *end_of_sequence = true;
           return Status::OK();
         }
-        Tensor value_tensor(cpu_allocator(), DT_INT64, {});
+        Tensor value_tensor(ctx->allocator({}), DT_INT64, {});
         value_tensor.scalar<int64>()() = next_;
         out_tensors->emplace_back(std::move(value_tensor));
         *end_of_sequence = false;
@@ -117,7 +116,7 @@ class RangeDatasetOp : public DatasetOpKernel {
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("next"), &next_));
diff --git a/tensorflow/core/kernels/reader_dataset_ops.cc b/tensorflow/core/kernels/data/reader_dataset_ops.cc
similarity index 96%
rename from tensorflow/core/kernels/reader_dataset_ops.cc
rename to tensorflow/core/kernels/data/reader_dataset_ops.cc
index d942ddc4a7b9042038c6b7a2a52e46c1bf45b2a9..34d7d9f914d7a726135febabb1fbe35b0146977c 100644
--- a/tensorflow/core/kernels/reader_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/reader_dataset_ops.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/io/buffered_inputstream.h"
 #include "tensorflow/core/lib/io/inputbuffer.h"
 #include "tensorflow/core/lib/io/random_inputstream.h"
@@ -142,7 +141,7 @@ class TextLineDatasetOp : public DatasetOpKernel {
 
             if (s.ok()) {
               // Produce the line as output.
-              Tensor line_tensor(cpu_allocator(), DT_STRING, {});
+              Tensor line_tensor(ctx->allocator({}), DT_STRING, {});
               line_tensor.scalar<string>()() = line_contents;
               out_tensors->emplace_back(std::move(line_tensor));
               *end_of_sequence = false;
@@ -183,7 +182,7 @@ class TextLineDatasetOp : public DatasetOpKernel {
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         ResetStreamsLocked();
@@ -385,7 +384,7 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
               TF_RETURN_IF_ERROR(
                   input_buffer_->ReadNBytes(dataset()->record_bytes_, &record));
               // Produce the record as output.
-              Tensor record_tensor(cpu_allocator(), DT_STRING, {});
+              Tensor record_tensor(ctx->allocator({}), DT_STRING, {});
               record_tensor.scalar<string>()() = record;
               out_tensors->emplace_back(std::move(record_tensor));
               *end_of_sequence = false;
@@ -410,6 +409,20 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
           TF_RETURN_IF_ERROR(ctx->env()->GetFileSize(
               dataset()->filenames_[current_file_index_], &file_size));
           file_pos_limit_ = file_size - dataset()->footer_bytes_;
+
+          uint64 body_size =
+              file_size - (dataset()->header_bytes_ + dataset()->footer_bytes_);
+
+          if (body_size % dataset()->record_bytes_ != 0) {
+            return errors::InvalidArgument(
+                "Excluding the header (", dataset()->header_bytes_,
+                " bytes) and footer (", dataset()->footer_bytes_,
+                " bytes), input file \"",
+                dataset()->filenames_[current_file_index_],
+                "\" has body length ", body_size,
+                " bytes, which is not an exact multiple of the record length (",
+                dataset()->record_bytes_, " bytes).");
+          }
           TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile(
               dataset()->filenames_[current_file_index_], &file_));
           input_buffer_.reset(
@@ -434,7 +447,7 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         int64 current_file_index;
@@ -576,7 +589,7 @@ class TFRecordDatasetOp : public DatasetOpKernel {
         do {
           // We are currently processing a file, so try to read the next record.
           if (reader_) {
-            Tensor result_tensor(cpu_allocator(), DT_STRING, {});
+            Tensor result_tensor(ctx->allocator({}), DT_STRING, {});
             Status s = reader_->ReadRecord(&result_tensor.scalar<string>()());
             if (s.ok()) {
               out_tensors->emplace_back(std::move(result_tensor));
@@ -615,7 +628,7 @@ class TFRecordDatasetOp : public DatasetOpKernel {
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         ResetStreamsLocked();
diff --git a/tensorflow/core/kernels/repeat_dataset_op.cc b/tensorflow/core/kernels/data/repeat_dataset_op.cc
similarity index 88%
rename from tensorflow/core/kernels/repeat_dataset_op.cc
rename to tensorflow/core/kernels/data/repeat_dataset_op.cc
index 3d977a0fa38be77ac812cb12aade2af20b871fb8..1cb533158bb5b8bd4b950192ce67e17c0f9d5447 100644
--- a/tensorflow/core/kernels/repeat_dataset_op.cc
+++ b/tensorflow/core/kernels/data/repeat_dataset_op.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
@@ -100,7 +99,7 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
       Status SaveInternal(IteratorStateWriter* writer) override {
         return Status::OK();
       }
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         return Status::OK();
       }
@@ -148,7 +147,7 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("i"), &i_));
@@ -176,30 +175,25 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
                              bool* end_of_sequence) override {
         mutex_lock l(mu_);  // TODO(mrry): Make locking less conservative.
         do {
+          bool first_call = false;
           if (!input_impl_) {
+            first_call = true;
             input_impl_ = dataset()->input_->MakeIterator(prefix());
-            TF_RETURN_IF_ERROR(
-                input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
-            // If the first call to GetNext() fails because the end of
-            // sequence has been reached, we return an OutOfRange
-            // error to terminate the iteration. (Otherwise, this
-            // iterator would loop infinitely and never produce a
-            // value.)
-            if (!*end_of_sequence) {
-              return Status::OK();
-            } else {
-              input_impl_.reset();
+          }
+          TF_RETURN_IF_ERROR(
+              input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
+          if (!*end_of_sequence) {
+            return Status::OK();
+          } else {
+            input_impl_.reset();
+            if (first_call) {
+              // If the first call to GetNext() fails because the end of
+              // sequence has been reached, we return an OutOfRange error to
+              // terminate the iteration. (Otherwise, this iterator would loop
+              // infinitely and never produce a value.)
               return errors::OutOfRange(
                   "Attempted to repeat an empty dataset infinitely.");
             }
-          } else {
-            TF_RETURN_IF_ERROR(
-                input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
-            if (!*end_of_sequence) {
-              return Status::OK();
-            } else {
-              input_impl_.reset();
-            }
           }
         } while (true);
       }
@@ -215,7 +209,7 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         if (reader->Contains(full_name("uninitialized"))) {
diff --git a/tensorflow/core/kernels/scan_dataset_op.cc b/tensorflow/core/kernels/data/scan_dataset_op.cc
similarity index 65%
rename from tensorflow/core/kernels/scan_dataset_op.cc
rename to tensorflow/core/kernels/data/scan_dataset_op.cc
index 76c219f1ae6352f047035b1bfd3231689d0d3771..5dd6ff848eb4836dd9cbc51b9408d01a652241f0 100644
--- a/tensorflow/core/kernels/scan_dataset_op.cc
+++ b/tensorflow/core/kernels/data/scan_dataset_op.cc
@@ -18,8 +18,8 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/captured_function.h"
-#include "tensorflow/core/kernels/dataset.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
@@ -60,24 +60,26 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
     }
 
     std::unique_ptr<CapturedFunction> captured_func;
-    OP_REQUIRES_OK(ctx, CapturedFunction::Create(ctx, func_, graph_def_version_,
-                                                 std::move(other_arguments),
-                                                 &captured_func));
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(
+                            func_, std::move(other_arguments), &captured_func));
 
-    *output =
-        new Dataset(input, std::move(initial_state), std::move(captured_func),
-                    state_types_, output_types_, output_shapes_);
+    *output = new Dataset(ctx, input, func_, std::move(initial_state),
+                          std::move(captured_func), state_types_, output_types_,
+                          output_shapes_);
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    Dataset(const DatasetBase* input, std::vector<Tensor> initial_state,
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& func, std::vector<Tensor> initial_state,
             std::unique_ptr<CapturedFunction> captured_func,
             const DataTypeVector& state_types,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
-        : input_(input),
+        : GraphDatasetBase(ctx),
+          input_(input),
+          func_(func),
           initial_state_(std::move(initial_state)),
           captured_func_(std::move(captured_func)),
           state_types_(state_types),
@@ -103,6 +105,45 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() override { return "ScanDatasetOp::Dataset"; }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
+      Node* input_node;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_node));
+      std::vector<Node*> initial_state_nodes;
+      initial_state_nodes.reserve(initial_state_.size());
+      for (const Tensor& t : initial_state_) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        initial_state_nodes.emplace_back(node);
+      }
+      std::vector<Node*> other_arguments;
+      other_arguments.reserve(captured_func_->captured_inputs().size());
+      DataTypeVector other_arguments_types;
+      other_arguments_types.reserve(captured_func_->captured_inputs().size());
+      for (const Tensor& t : captured_func_->captured_inputs()) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        other_arguments.emplace_back(node);
+        other_arguments_types.emplace_back(t.dtype());
+      }
+      AttrValue f;
+      b->BuildAttrValue(func_, &f);
+      AttrValue state_types;
+      b->BuildAttrValue(state_types_, &state_types);
+      AttrValue other_arguments_types_attr;
+      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+      TF_RETURN_IF_ERROR(
+          b->AddDataset(this, {{0, input_node}},
+                        {{1, initial_state_nodes}, {2, other_arguments}},
+                        {{"f", f},
+                         {"Tstate", state_types},
+                         {"Targuments", other_arguments_types_attr}},
+                        output));
+      return Status::OK();
+    }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
@@ -129,22 +170,12 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
         std::copy(next_element.begin(), next_element.end(),
                   std::back_inserter(args));
 
-        FunctionLibraryRuntime::Options opts;
-        opts.step_id = CapturedFunction::generate_step_id();
-        ScopedStepContainer step_container(
-            opts.step_id, [this, ctx](const string& name) {
-              dataset()
-                  ->captured_func_->resource_manager()
-                  ->Cleanup(name)
-                  .IgnoreError();
-            });
-        opts.step_container = &step_container;
-        opts.runner = ctx->runner();
         std::vector<Tensor> state_and_output;
         state_and_output.reserve(dataset()->state_types_.size() +
                                  output_dtypes().size());
-        Status s =
-            dataset()->captured_func_->Run(opts, args, &state_and_output);
+
+        Status s = dataset()->captured_func_->Run(ctx, std::move(args),
+                                                  &state_and_output);
         if (s.ok()) {
           state_.clear();
           size_t i = 0;
@@ -185,6 +216,38 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
         return s;
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        if (!state_.empty()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("state_size"), state_.size()));
+          for (int idx = 0; idx < state_.size(); idx++) {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                full_name(strings::StrCat("state[", idx, "]")), state_[idx]));
+          }
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        if (reader->Contains(full_name("state_size"))) {
+          int64 size;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("state_size"), &size));
+          state_.resize(size);
+          for (int idx = 0; idx < size; idx++) {
+            TF_RETURN_IF_ERROR(reader->ReadTensor(
+                full_name(strings::StrCat("state[", idx, "]")), &state_[idx]));
+          }
+        }
+        return Status::OK();
+      }
+
      private:
       mutex mu_;
       const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
@@ -192,6 +255,7 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
     };
 
     const DatasetBase* const input_;
+    const NameAttrList func_;
     const std::vector<Tensor> initial_state_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const DataTypeVector state_types_;
diff --git a/tensorflow/core/kernels/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
similarity index 50%
rename from tensorflow/core/kernels/shuffle_dataset_op.cc
rename to tensorflow/core/kernels/data/shuffle_dataset_op.cc
index 72facb3a0d0cc13a559b3d8005592e19b97fed6f..1dde236c1711afd794ff397859631a48984b5ba8 100644
--- a/tensorflow/core/kernels/shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
@@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
+
+#include <deque>
+#include <vector>
 
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/random/philox_random.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/random/random_distributions.h"
@@ -29,50 +32,21 @@ const int64 kLogIntervalMicros = 10 * 1000000;  // 10 seconds.
 // See documentation in ../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
-class ShuffleDatasetOp : public UnaryDatasetOpKernel {
+class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
  public:
-  explicit ShuffleDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("reshuffle_each_iteration",
-                                     &reshuffle_each_iteration_));
-  }
-
-  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
-                   DatasetBase** output) override {
-    int64 buffer_size;
-    OP_REQUIRES_OK(
-        ctx, ParseScalarArgument<int64>(ctx, "buffer_size", &buffer_size));
-    OP_REQUIRES(
-        ctx, buffer_size > 0,
-        errors::InvalidArgument("buffer_size must be greater than zero."));
-
-    int64 seed;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "seed", &seed));
-
-    int64 seed2;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "seed2", &seed2));
-
-    // By TensorFlow convention, passing 0 for both seeds indicates
-    // that the shuffling should be seeded non-deterministically.
-    if (seed == 0 && seed2 == 0) {
-      seed = random::New64();
-      seed2 = random::New64();
-    }
-
-    if (reshuffle_each_iteration_) {
-      *output = new ReshufflingDataset(ctx, input, buffer_size, seed, seed2);
-    } else {
-      *output = new FixedSeedDataset(ctx, input, buffer_size, seed, seed2);
-    }
-  }
+  explicit ShuffleDatasetOpBase(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
 
- private:
+ protected:
   // Abstract base dataset that implements a shuffling iterator.
   class ShuffleDatasetBase : public GraphDatasetBase {
    public:
     ShuffleDatasetBase(OpKernelContext* ctx, const DatasetBase* input,
-                       int64 buffer_size)
-        : GraphDatasetBase(ctx), input_(input), buffer_size_(buffer_size) {
+                       int64 buffer_size, int64 count)
+        : GraphDatasetBase(ctx),
+          input_(input),
+          buffer_size_(buffer_size),
+          count_(count) {
       input_->Ref();
     }
 
@@ -91,12 +65,15 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
      public:
       explicit Iterator(const Params& params, int64 seed, int64 seed2)
           : DatasetIterator<ShuffleDatasetBase>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
+            input_impl_(nullptr),
             seed_(seed),
             seed2_(seed2),
+            epoch_(0),
+            num_elements_(0),
             parent_generator_(seed, seed2),
             generator_(&parent_generator_) {
-        buffer_.reserve(params.dataset->buffer_size_);
+        buffer_.reset(new std::vector<Tensor>[params.dataset->buffer_size_]);
+        slices_.emplace_back(new Slice{0, 0});
       }
 
       Status GetNextInternal(IteratorContext* ctx,
@@ -105,19 +82,46 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
         mutex_lock l(mu_);
         int64 start_micros = ctx->env()->NowMicros();
         int64 num_log_entries = 0;
-        while (input_impl_ && buffer_.size() < dataset()->buffer_size_) {
+        bool first_call = false;
+        if (!input_impl_ && epoch_ == 0) {
+          first_call = true;
+          input_impl_ = dataset()->input_->MakeIterator(prefix());
+        }
+        while (input_impl_ && num_elements_ < dataset()->buffer_size_) {
           if (ctx->env()->NowMicros() >
               ((num_log_entries + 1) * kLogIntervalMicros) + start_micros) {
             num_log_entries++;
             LOG(INFO) << "Filling up shuffle buffer (this may take a while): "
-                      << buffer_.size() << " of " << dataset()->buffer_size_;
+                      << num_elements_ << " of " << dataset()->buffer_size_;
           }
           std::vector<Tensor> input_element;
-          bool end_of_input_sequence;
-          TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &input_element,
-                                                  &end_of_input_sequence));
+          bool end_of_input_sequence = false;
+          while (dataset()->count_ == -1 || epoch_ < dataset()->count_) {
+            TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &input_element,
+                                                    &end_of_input_sequence));
+            if (!end_of_input_sequence) {
+              first_call = false;
+              break;
+            }
+            if (first_call && dataset()->count_ == -1) {
+              // If the first call to GetNext() fails because the end of
+              // sequence has been reached, we return an OutOfRange error to
+              // terminate the iteration. (Otherwise, this iterator may loop
+              // infinitely and never produce a value.)
+              *end_of_sequence = true;
+              return errors::OutOfRange(
+                  "Attempted to repeat an empty dataset infinitely.");
+            }
+            epoch_++;
+            int64 n = slices_.back()->end;
+            slices_.emplace_back(new Slice{n, n});
+            input_impl_ = dataset()->input_->MakeIterator(prefix());
+          }
           if (!end_of_input_sequence) {
-            buffer_.emplace_back(std::move(input_element));
+            buffer_[slices_.back()->end % dataset()->buffer_size_] =
+                std::move(input_element);
+            num_elements_++;
+            slices_.back()->end++;
           } else {
             input_impl_.reset();
           }
@@ -126,14 +130,25 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
           LOG(INFO) << "Shuffle buffer filled.";
         }
 
-        if (!buffer_.empty()) {
+        if (num_elements_ > 0) {
           *end_of_sequence = false;
-          // Choose an element to produce uniformly at random, and
-          // swap the last element into its place in the buffer.
-          int64 index = Random() % buffer_.size();
+          // Garbage collect all empty slices.
+          while (!slices_.empty() &&
+                 slices_.front()->start == slices_.front()->end) {
+            slices_.pop_front();
+          }
+          DCHECK(!slices_.empty());
+          // Choose an element to produce uniformly at random from the first
+          // slice, and then remove the element from the slice.
+          int64 offset =
+              Random() % (slices_.front()->end - slices_.front()->start);
+          int64 index =
+              (slices_.front()->start + offset) % dataset()->buffer_size_;
           *out_tensors = std::move(buffer_[index]);
-          std::swap(buffer_[index], buffer_.back());
-          buffer_.pop_back();
+          std::swap(buffer_[index],
+                    buffer_[slices_.front()->start % dataset()->buffer_size_]);
+          slices_.front()->start++;
+          num_elements_--;
         } else {
           DCHECK(input_impl_ == nullptr);
           *end_of_sequence = true;
@@ -145,20 +160,6 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
 
-        // Save the tensors in the buffer.
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(full_name("buffer_size"), buffer_.size()));
-        for (size_t i = 0; i < buffer_.size(); i++) {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat("buffer_", i, "_size")),
-              buffer_[i].size()));
-          for (size_t j = 0; j < buffer_[i].size(); j++) {
-            TF_RETURN_IF_ERROR(writer->WriteTensor(
-                full_name(strings::StrCat("buffer_", i, "_", j)),
-                buffer_[i][j]));
-          }
-        }
-
         // Save state needed to restore the random number generators.
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("num_random_samples"),
                                                num_random_samples_));
@@ -171,34 +172,38 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
         } else {
           TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
         }
+
+        // Save the epoch counter, buffer, and buffer slices.
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("epoch"), epoch_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("num_elements"), num_elements_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("slices_size"), slices_.size()));
+        for (size_t i = 0; i < slices_.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat("slices_start_", i)),
+              slices_[i]->start));
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat("slices_end_", i)), slices_[i]->end));
+          for (size_t j = slices_[i]->start; j < slices_[i]->end; ++j) {
+            size_t index = j % dataset()->buffer_size_;
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                full_name(strings::StrCat("buffer_", index, "_size")),
+                buffer_[index].size()));
+            for (size_t k = 0; k < buffer_[index].size(); ++k) {
+              TF_RETURN_IF_ERROR(writer->WriteTensor(
+                  full_name(strings::StrCat("buffer_", index, "_", k)),
+                  buffer_[index][k]));
+            }
+          }
+        }
+
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
-        buffer_.clear();
-
-        // Restore the buffer.
-        size_t buffer_size;
-        {
-          int64 temp;
-          TF_RETURN_IF_ERROR(
-              reader->ReadScalar(full_name("buffer_size"), &temp));
-          buffer_size = static_cast<size_t>(temp);
-        }
-        buffer_.reserve(buffer_size);
-        for (size_t i = 0; i < buffer_size; i++) {
-          int64 list_size;
-          TF_RETURN_IF_ERROR(reader->ReadScalar(
-              full_name(strings::StrCat("buffer_", i, "_size")), &list_size));
-          buffer_.emplace_back(std::vector<Tensor>(list_size));
-          for (int j = 0; j < list_size; j++) {
-            TF_RETURN_IF_ERROR(reader->ReadTensor(
-                full_name(strings::StrCat("buffer_", i, "_", j)),
-                &buffer_[i][j]));
-          }
-        }
 
         // Restore the random number generators.
         TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("num_random_samples"),
@@ -212,10 +217,58 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
         } else {
           input_impl_.reset();
         }
+
+        // Restore the epoch counter, buffer, and buffer slices.
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("epoch"), &epoch_));
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("num_elements"), &num_elements_));
+        size_t slices_size;
+        {
+          int64 temp;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("slices_size"), &temp));
+          slices_size = static_cast<size_t>(temp);
+        }
+        buffer_.reset(new std::vector<Tensor>[dataset()->buffer_size_]);
+        for (size_t i = 0; i < slices_size; ++i) {
+          int64 start;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat("slices_start_", i)), &start));
+          int64 end;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat("slices_end_", i)), &end));
+          slices_.emplace_back(new Slice{start, end});
+          for (size_t j = start; j < end; ++j) {
+            size_t index = j % dataset()->buffer_size_;
+            int64 list_size;
+            TF_RETURN_IF_ERROR(reader->ReadScalar(
+                full_name(strings::StrCat("buffer_", index, "_size")),
+                &list_size));
+            buffer_[index] = std::vector<Tensor>(list_size);
+            for (int k = 0; k < list_size; ++k) {
+              TF_RETURN_IF_ERROR(reader->ReadTensor(
+                  full_name(strings::StrCat("buffer_", index, "_", k)),
+                  &buffer_[index][k]));
+            }
+          }
+        }
+
         return Status::OK();
       }
 
      private:
+      // Used to represent slices of `buffer_` that belong to different epochs.
+      // The invariant maintained by the implementation is: `start` <= `end`.
+      // When using `start` and `end` to index into `buffer_`, their values
+      // should be taken modulo the size of `buffer_` as their absolute value
+      // can be greater than the range of `buffer_`.
+      struct Slice {
+        Slice(int64 start, int64 end) : start(start), end(end) {}
+
+        int64 start;
+        int64 end;
+      };
+
       random::SingleSampleAdapter<random::PhiloxRandom>::ResultType Random()
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         num_random_samples_++;
@@ -232,10 +285,13 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
       }
 
       mutex mu_;
-      std::vector<std::vector<Tensor>> buffer_ GUARDED_BY(mu_);
+      std::unique_ptr<std::vector<Tensor>[]> buffer_ GUARDED_BY(mu_);
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
       const int64 seed_ GUARDED_BY(mu_);
       const int64 seed2_ GUARDED_BY(mu_);
+      int64 epoch_ GUARDED_BY(mu_);
+      int64 num_elements_ GUARDED_BY(mu_);
+      std::deque<std::unique_ptr<Slice>> slices_ GUARDED_BY(mu_);
       random::PhiloxRandom parent_generator_ GUARDED_BY(mu_);
       random::SingleSampleAdapter<random::PhiloxRandom> generator_
           GUARDED_BY(mu_);
@@ -244,15 +300,58 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
 
     const DatasetBase* const input_;
     const int64 buffer_size_;
+    const int64 count_;
   };
+};
+
+class ShuffleDatasetOp : public ShuffleDatasetOpBase {
+ public:
+  explicit ShuffleDatasetOp(OpKernelConstruction* ctx)
+      : ShuffleDatasetOpBase(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("reshuffle_each_iteration",
+                                     &reshuffle_each_iteration_));
+  }
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 buffer_size;
+    OP_REQUIRES_OK(
+        ctx, ParseScalarArgument<int64>(ctx, "buffer_size", &buffer_size));
+    OP_REQUIRES(
+        ctx, buffer_size > 0,
+        errors::InvalidArgument("buffer_size must be greater than zero."));
+
+    int64 seed;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "seed", &seed));
+
+    int64 seed2;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "seed2", &seed2));
+
+    // By TensorFlow convention, passing 0 for both seeds indicates
+    // that the shuffling should be seeded non-deterministically.
+    if (seed == 0 && seed2 == 0) {
+      seed = random::New64();
+      seed2 = random::New64();
+    }
+
+    int64 count = 1;
+    if (reshuffle_each_iteration_) {
+      *output =
+          new ReshufflingDataset(ctx, input, buffer_size, seed, seed2, count);
+    } else {
+      *output =
+          new FixedSeedDataset(ctx, input, buffer_size, seed, seed2, count);
+    }
+  }
 
+ private:
   // A dataset that uses a pseduorandom sequence of seeds for the iterators
   // created from it. Used when `reshuffle_each_iteration` is true.
   class ReshufflingDataset : public ShuffleDatasetBase {
    public:
     ReshufflingDataset(OpKernelContext* ctx, const DatasetBase* input,
-                       int64 buffer_size, int64 seed, int64 seed2)
-        : ShuffleDatasetBase(ctx, input, buffer_size),
+                       int64 buffer_size, int64 seed, int64 seed2, int64 count)
+        : ShuffleDatasetBase(ctx, input, buffer_size, count),
           seed_(seed),
           seed2_(seed2),
           parent_generator_(seed, seed2),
@@ -291,8 +390,8 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
   class FixedSeedDataset : public ShuffleDatasetBase {
    public:
     FixedSeedDataset(OpKernelContext* ctx, const DatasetBase* input,
-                     int64 buffer_size, int64 seed, int64 seed2)
-        : ShuffleDatasetBase(ctx, input, buffer_size),
+                     int64 buffer_size, int64 seed, int64 seed2, int64 count)
+        : ShuffleDatasetBase(ctx, input, buffer_size, count),
           seed_(seed),
           seed2_(seed) {}
 
@@ -337,9 +436,93 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
   bool reshuffle_each_iteration_;
 };
 
+class ShuffleAndRepeatDatasetOp : public ShuffleDatasetOpBase {
+ public:
+  explicit ShuffleAndRepeatDatasetOp(OpKernelConstruction* ctx)
+      : ShuffleDatasetOpBase(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 buffer_size;
+    OP_REQUIRES_OK(
+        ctx, ParseScalarArgument<int64>(ctx, "buffer_size", &buffer_size));
+    OP_REQUIRES(
+        ctx, buffer_size > 0,
+        errors::InvalidArgument("buffer_size must be greater than zero."));
+
+    int64 seed;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "seed", &seed));
+
+    int64 seed2;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "seed2", &seed2));
+
+    int64 count;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "count", &count));
+
+    // By TensorFlow convention, if both seeds are 0, then shuffling should be
+    // seeded non-deterministically.
+    if (seed == 0 && seed2 == 0) {
+      seed = random::New64();
+      seed2 = random::New64();
+    }
+
+    *output = new Dataset(ctx, input, buffer_size, seed, seed2, count);
+  }
+
+ private:
+  class Dataset : public ShuffleDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 buffer_size,
+            int64 seed, int64 seed2, int64 count)
+        : ShuffleDatasetBase(ctx, input, buffer_size, count),
+          seed_(seed),
+          seed2_(seed2) {}
+
+    string DebugString() override {
+      return strings::StrCat("ShuffleAndRepeatDatasetOp(", buffer_size_, ", ",
+                             seed_, ", ", seed2_, ", ", count_, ")::Dataset");
+    }
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new ShuffleDatasetBase::Iterator(
+          {this, strings::StrCat(prefix, "::ShuffleAndRepeat")}, seed_,
+          seed2_));
+    }
+
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      Node* buffer_size = nullptr;
+      Node* seed = nullptr;
+      Node* seed2 = nullptr;
+      Node* count = nullptr;
+
+      TF_RETURN_IF_ERROR(b->AddScalar(buffer_size_, &buffer_size));
+      TF_RETURN_IF_ERROR(b->AddScalar(seed_, &seed));
+      TF_RETURN_IF_ERROR(b->AddScalar(seed2_, &seed2));
+      TF_RETURN_IF_ERROR(b->AddScalar(count_, &count));
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {input_graph_node, buffer_size, seed, seed2, count},  // Inputs
+          {},                                                         // Attrs
+          output));
+      return Status::OK();
+    }
+
+   private:
+    const int64 seed_;
+    const int64 seed2_;
+  };
+};
+
 REGISTER_KERNEL_BUILDER(Name("ShuffleDataset").Device(DEVICE_CPU),
                         ShuffleDatasetOp);
 
+REGISTER_KERNEL_BUILDER(Name("ShuffleAndRepeatDataset").Device(DEVICE_CPU),
+                        ShuffleAndRepeatDatasetOp);
+
 }  // namespace
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/skip_dataset_op.cc b/tensorflow/core/kernels/data/skip_dataset_op.cc
similarity index 91%
rename from tensorflow/core/kernels/skip_dataset_op.cc
rename to tensorflow/core/kernels/data/skip_dataset_op.cc
index 1fe49271e299f042b9dc88a30d88d3d26a9e65f2..d636c37afe2aa0566df7d4a38a8d393c34fd0195 100644
--- a/tensorflow/core/kernels/skip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/skip_dataset_op.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
@@ -100,7 +99,7 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         return Status::OK();
       }
@@ -129,8 +128,8 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
         while (i_ < dataset()->count_) {
           // Fetch and throw away Tensors.
           std::vector<Tensor> dummy_out_tensors;
-          TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &dummy_out_tensors,
-                                                  end_of_sequence));
+          TF_RETURN_IF_ERROR(
+              input_impl_->GetNext(ctx, &dummy_out_tensors, end_of_sequence));
           if (*end_of_sequence) {
             // We reached the end before the count was reached.
             input_impl_.reset();
@@ -141,8 +140,8 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
         }
 
         // Return GetNext() on the underlying iterator.
-        TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, out_tensors,
-                                                end_of_sequence));
+        TF_RETURN_IF_ERROR(
+            input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
         if (*end_of_sequence) {
           input_impl_.reset();
         }
@@ -162,7 +161,7 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("i"), &i_));
@@ -185,8 +184,7 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("SkipDataset").Device(DEVICE_CPU),
-                        SkipDatasetOp);
+REGISTER_KERNEL_BUILDER(Name("SkipDataset").Device(DEVICE_CPU), SkipDatasetOp);
 
 }  // namespace
 
diff --git a/tensorflow/core/kernels/sparse_tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
similarity index 99%
rename from tensorflow/core/kernels/sparse_tensor_slice_dataset_op.cc
rename to tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
index de5ab1a3678b981a95de533dc2f59cc16dd7705c..fcf17ad68bb1bb5fca7fd7767e12fe9fbc50e0ab 100644
--- a/tensorflow/core/kernels/sparse_tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
@@ -14,11 +14,10 @@ limitations under the License.
 ==============================================================================*/
 #include <numeric>
 
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/util/sparse/sparse_tensor.h"
 
 namespace tensorflow {
@@ -168,7 +167,7 @@ class Dataset : public GraphDatasetBase {
       return Status::OK();
     }
 
-    Status RestoreInternal(OpKernelContext* ctx,
+    Status RestoreInternal(IteratorContext* ctx,
                            IteratorStateReader* reader) override {
       mutex_lock l(mu_);
       TF_RETURN_IF_ERROR(reader->ReadScalar(Iterator::full_name("i"), &i_));
diff --git a/tensorflow/core/kernels/data/sql/BUILD b/tensorflow/core/kernels/data/sql/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f4698bdaf7ae9767e068e49dad61d2a3d9f739a8
--- /dev/null
+++ b/tensorflow/core/kernels/data/sql/BUILD
@@ -0,0 +1,39 @@
+# Description:
+#   SQL library.
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        include = ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+cc_library(
+    name = "sql",
+    srcs = [
+        "driver_manager.cc",
+        "sqlite_query_connection.cc",
+    ],
+    hdrs = [
+        "driver_manager.h",
+        "query_connection.h",
+        "sqlite_query_connection.h",
+    ],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/kernels/data:dataset",
+        "//tensorflow/core/lib/db:sqlite",
+    ],
+)
diff --git a/tensorflow/core/kernels/sql/driver_manager.cc b/tensorflow/core/kernels/data/sql/driver_manager.cc
similarity index 89%
rename from tensorflow/core/kernels/sql/driver_manager.cc
rename to tensorflow/core/kernels/data/sql/driver_manager.cc
index 9a5d5aa853c438ef4e893fac2322af17ae863fa8..ffabda1a8a1fe8bce629ed34590c058a231f3cfc 100644
--- a/tensorflow/core/kernels/sql/driver_manager.cc
+++ b/tensorflow/core/kernels/data/sql/driver_manager.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/sql/driver_manager.h"
-#include "tensorflow/core/kernels/sql/sqlite_query_connection.h"
+#include "tensorflow/core/kernels/data/sql/driver_manager.h"
+#include "tensorflow/core/kernels/data/sql/sqlite_query_connection.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/sql/driver_manager.h b/tensorflow/core/kernels/data/sql/driver_manager.h
similarity index 83%
rename from tensorflow/core/kernels/sql/driver_manager.h
rename to tensorflow/core/kernels/data/sql/driver_manager.h
index 53350268d30f4f7215eb543a28ae3fedf837ac0d..a34691b5a2f43034feaf55241d0a445456c23bc3 100644
--- a/tensorflow/core/kernels/sql/driver_manager.h
+++ b/tensorflow/core/kernels/data/sql/driver_manager.h
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SQL_DRIVER_MANAGER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SQL_DRIVER_MANAGER_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_SQL_DRIVER_MANAGER_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_SQL_DRIVER_MANAGER_H_
 
-#include "tensorflow/core/kernels/sql/query_connection.h"
+#include "tensorflow/core/kernels/data/sql/query_connection.h"
 
 namespace tensorflow {
 
@@ -38,4 +38,4 @@ class DriverManager {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SQL_DRIVER_MANAGER_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_SQL_DRIVER_MANAGER_H_
diff --git a/tensorflow/core/kernels/sql/query_connection.h b/tensorflow/core/kernels/data/sql/query_connection.h
similarity index 90%
rename from tensorflow/core/kernels/sql/query_connection.h
rename to tensorflow/core/kernels/data/sql/query_connection.h
index f9945aee7dc6ac59df8cc9063ab5c4d9aedf4018..e9ffca202ff32f0c0130427c2699ce0449a0903a 100644
--- a/tensorflow/core/kernels/sql/query_connection.h
+++ b/tensorflow/core/kernels/data/sql/query_connection.h
@@ -12,13 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SQL_QUERY_CONNECTION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SQL_QUERY_CONNECTION_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_SQL_QUERY_CONNECTION_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_SQL_QUERY_CONNECTION_H_
 
 #include "tensorflow/core/framework/tensor.h"
 
 namespace tensorflow {
 
+class IteratorContext;
+
 namespace sql {
 // This interface allows a user to connect to a database, execute a query, and
 // iterate over the result set, putting the results into an output tensor.
@@ -56,7 +58,7 @@ class QueryConnection {
   // If there are no more rows in the result set, then instead `true` will be
   // stored in `*end_of_sequence`, and the content of `*out_tensors` will be
   // undefined.
-  virtual Status GetNext(std::vector<Tensor>* out_tensors,
+  virtual Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
                          bool* end_of_sequence) = 0;
 };
 
@@ -64,4 +66,4 @@ class QueryConnection {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SQL_QUERY_CONNECTION_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_SQL_QUERY_CONNECTION_H_
diff --git a/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc b/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7cd07bd8eca160bfc62e15adc568742c84711779
--- /dev/null
+++ b/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc
@@ -0,0 +1,119 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/data/sql/sqlite_query_connection.h"
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+
+namespace tensorflow {
+
+namespace sql {
+
+SqliteQueryConnection::SqliteQueryConnection() {}
+
+SqliteQueryConnection::~SqliteQueryConnection() {
+  if (db_ != nullptr) db_->Unref();
+}
+
+Status SqliteQueryConnection::Open(const string& data_source_name,
+                                   const string& query,
+                                   const DataTypeVector& output_types) {
+  if (db_ != nullptr) {
+    return errors::FailedPrecondition(
+        "Failed to open query connection: Connection already opened.");
+  }
+  TF_RETURN_IF_ERROR(Sqlite::Open(
+      data_source_name, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, &db_));
+  query_ = query;
+  output_types_ = output_types;
+  return Status::OK();
+}
+
+Status SqliteQueryConnection::Close() {
+  stmt_ = SqliteStatement();
+  db_->Unref();
+  db_ = nullptr;
+  return Status::OK();
+}
+
+Status SqliteQueryConnection::GetNext(IteratorContext* ctx,
+                                      std::vector<Tensor>* out_tensors,
+                                      bool* end_of_sequence) {
+  if (!stmt_) TF_RETURN_IF_ERROR(PrepareQuery());
+  TF_RETURN_IF_ERROR(stmt_.Step(end_of_sequence));
+  if (!*end_of_sequence) {
+    for (int i = 0; i < column_count_; i++) {
+      DataType dt = output_types_[i];
+      // TODO(mrry): Pass in the `IteratorContext::allocator()`.
+      Tensor tensor(ctx->allocator({}), dt, {});
+      FillTensorWithResultSetEntry(dt, i, &tensor);
+      out_tensors->emplace_back(std::move(tensor));
+    }
+  }
+  return Status::OK();
+}
+
+Status SqliteQueryConnection::PrepareQuery() {
+  TF_RETURN_IF_ERROR(db_->Prepare(query_, &stmt_));
+  int column_count = stmt_.ColumnCount();
+  if (column_count != output_types_.size()) {
+    stmt_ = SqliteStatement();
+    return errors::InvalidArgument(tensorflow::strings::Printf(
+        "The number of columns in query (%d) must match the number of "
+        "elements in output_types (%zu).",
+        column_count, output_types_.size()));
+  }
+  column_count_ = column_count;
+  return Status::OK();
+}
+
+void SqliteQueryConnection::FillTensorWithResultSetEntry(
+    const DataType& data_type, int column_index, Tensor* tensor) {
+#define CASE(T, M)                                                 \
+  case DataTypeToEnum<T>::value:                                   \
+    tensor->scalar<T>()() = static_cast<T>(stmt_.M(column_index)); \
+    break;
+#define INT_CASE(T) CASE(T, ColumnInt)
+#define DOUBLE_CASE(T) CASE(T, ColumnDouble)
+#define STRING_CASE(T) CASE(T, ColumnString)
+  // clang-format off
+  switch (data_type) {
+    TF_CALL_int8(INT_CASE)
+    TF_CALL_uint8(INT_CASE)
+    TF_CALL_int16(INT_CASE)
+    TF_CALL_uint16(INT_CASE)
+    TF_CALL_int32(INT_CASE)
+    TF_CALL_uint32(INT_CASE)
+    TF_CALL_int64(INT_CASE)
+    TF_CALL_uint64(INT_CASE)
+    TF_CALL_float(DOUBLE_CASE)
+    TF_CALL_double(DOUBLE_CASE)
+    TF_CALL_string(STRING_CASE)
+    case DT_BOOL:
+      tensor->scalar<bool>()() = stmt_.ColumnInt(column_index) != 0;
+      break;
+    // Error preemptively thrown by SqlDatasetOp::MakeDataset in this case.
+    default:
+      LOG(FATAL)
+          << "Use of unsupported TensorFlow data type by 'SqlQueryConnection': "
+          << DataTypeString(data_type) << ".";
+  }
+  // clang-format on
+}
+
+}  // namespace sql
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/sql/sqlite_query_connection.h b/tensorflow/core/kernels/data/sql/sqlite_query_connection.h
similarity index 80%
rename from tensorflow/core/kernels/sql/sqlite_query_connection.h
rename to tensorflow/core/kernels/data/sql/sqlite_query_connection.h
index 435dd8e234ca7a8fb9a3ef6ffeef0ca4dda7a221..81b19530b7d5964e17bde996de9fa7766af318b7 100644
--- a/tensorflow/core/kernels/sql/sqlite_query_connection.h
+++ b/tensorflow/core/kernels/data/sql/sqlite_query_connection.h
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SQL_SQLITE_QUERY_CONNECTION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SQL_SQLITE_QUERY_CONNECTION_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_SQL_SQLITE_QUERY_CONNECTION_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_SQL_SQLITE_QUERY_CONNECTION_H_
 
 #include <memory>
 
-#include "tensorflow/core/kernels/sql/query_connection.h"
+#include "tensorflow/core/kernels/data/sql/query_connection.h"
 #include "tensorflow/core/lib/db/sqlite.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -32,7 +32,7 @@ class SqliteQueryConnection : public QueryConnection {
   Status Open(const string& data_source_name, const string& query,
               const DataTypeVector& output_types) override;
   Status Close() override;
-  Status GetNext(std::vector<Tensor>* out_tensors,
+  Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
                  bool* end_of_sequence) override;
 
  private:
@@ -42,7 +42,7 @@ class SqliteQueryConnection : public QueryConnection {
   // `stmt_`.
   void FillTensorWithResultSetEntry(const DataType& data_type, int column_index,
                                     Tensor* tensor);
-  std::shared_ptr<Sqlite> db_ = nullptr;
+  Sqlite* db_ = nullptr;
   SqliteStatement stmt_;
   int column_count_ = 0;
   string query_;
@@ -53,4 +53,4 @@ class SqliteQueryConnection : public QueryConnection {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SQL_SQLITE_QUERY_CONNECTION_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_SQL_SQLITE_QUERY_CONNECTION_H_
diff --git a/tensorflow/core/kernels/sql_dataset_ops.cc b/tensorflow/core/kernels/data/sql_dataset_ops.cc
similarity index 95%
rename from tensorflow/core/kernels/sql_dataset_ops.cc
rename to tensorflow/core/kernels/data/sql_dataset_ops.cc
index 23846d65bb8426ad8e5c3343047f72d24653c101..d50e9c9cf9739044379c7bbe753fc4acc2de311e 100644
--- a/tensorflow/core/kernels/sql_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/sql_dataset_ops.cc
@@ -16,9 +16,9 @@ limitations under the License.
 
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/dataset.h"
-#include "tensorflow/core/kernels/sql/driver_manager.h"
-#include "tensorflow/core/kernels/sql/query_connection.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/sql/driver_manager.h"
+#include "tensorflow/core/kernels/data/sql/query_connection.h"
 #include "tensorflow/core/lib/io/inputbuffer.h"
 #include "tensorflow/core/lib/io/record_reader.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@@ -116,7 +116,7 @@ class SqlDatasetOp : public DatasetOpKernel {
         }
       }
 
-      Status GetNextInternal(IteratorContext* /*ctx*/,
+      Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
         mutex_lock l(mu_);
@@ -132,7 +132,7 @@ class SqlDatasetOp : public DatasetOpKernel {
             return s;
           }
         }
-        return query_connection_->GetNext(out_tensors, end_of_sequence);
+        return query_connection_->GetNext(ctx, out_tensors, end_of_sequence);
       }
 
      private:
diff --git a/tensorflow/core/kernels/stats_aggregator.h b/tensorflow/core/kernels/data/stats_aggregator.h
similarity index 94%
rename from tensorflow/core/kernels/stats_aggregator.h
rename to tensorflow/core/kernels/data/stats_aggregator.h
index 5f602c5f3bf4dc275538ae7884f9f552c71fc65a..076a56b0bf100161fe2cf4384e6be0809eb251fe 100644
--- a/tensorflow/core/kernels/stats_aggregator.h
+++ b/tensorflow/core/kernels/data/stats_aggregator.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_STATS_AGGREGATOR_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_STATS_AGGREGATOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_STATS_AGGREGATOR_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_STATS_AGGREGATOR_H_
 
 #include <memory>
 #include <string>
@@ -81,4 +81,4 @@ class StatsAggregatorResource : public ResourceBase {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_STATS_AGGREGATOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_STATS_AGGREGATOR_H_
diff --git a/tensorflow/core/kernels/stats_aggregator_ops.cc b/tensorflow/core/kernels/data/stats_aggregator_ops.cc
similarity index 98%
rename from tensorflow/core/kernels/stats_aggregator_ops.cc
rename to tensorflow/core/kernels/data/stats_aggregator_ops.cc
index 037ec64a83b58fd0f32789cd7560317959529225..5a2dd9c43dbcbf5250d4dcd4bd803ed4979999e0 100644
--- a/tensorflow/core/kernels/stats_aggregator_ops.cc
+++ b/tensorflow/core/kernels/data/stats_aggregator_ops.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/stats_aggregator.h"
+#include "tensorflow/core/kernels/data/stats_aggregator.h"
 
 #include <memory>
 
diff --git a/tensorflow/core/kernels/stats_dataset_ops.cc b/tensorflow/core/kernels/data/stats_dataset_ops.cc
similarity index 68%
rename from tensorflow/core/kernels/stats_dataset_ops.cc
rename to tensorflow/core/kernels/data/stats_dataset_ops.cc
index 7b1853aba61d7eeabceeebe76187535567509252..4dc1343e21faf947afc4e49539a45cdd1b38c0e9 100644
--- a/tensorflow/core/kernels/stats_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/stats_dataset_ops.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/dataset.h"
-#include "tensorflow/core/kernels/stats_aggregator.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/stats_aggregator.h"
 #include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
@@ -43,14 +43,14 @@ class LatencyStatsDatasetOp : public UnaryDatasetOpKernel {
                    DatasetBase** output) override {
     string tag;
     OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "tag", &tag));
-    *output = new Dataset(input, std::move(tag));
+    *output = new Dataset(ctx, input, std::move(tag));
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    explicit Dataset(const DatasetBase* input, string tag)
-        : input_(input), tag_(std::move(tag)) {
+    explicit Dataset(OpKernelContext* ctx, const DatasetBase* input, string tag)
+        : GraphDatasetBase(ctx), input_(input), tag_(std::move(tag)) {
       input_->Ref();
     }
 
@@ -71,6 +71,17 @@ class LatencyStatsDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() override { return "LatencyStatsDatasetOp::Dataset"; }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_node;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_node));
+      Node* tag_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(tag_, &tag_node));
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {input_node, tag_node}, output));
+      return Status::OK();
+    }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
@@ -81,6 +92,7 @@ class LatencyStatsDatasetOp : public UnaryDatasetOpKernel {
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
+        tf_shared_lock l(mu_);
         uint64 start = ctx->env()->NowMicros();
         Status s = input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
         uint64 end = ctx->env()->NowMicros();
@@ -92,8 +104,23 @@ class LatencyStatsDatasetOp : public UnaryDatasetOpKernel {
         return s;
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        return Status::OK();
+      }
+
      private:
-      const std::unique_ptr<IteratorBase> input_impl_;
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
     };
 
     const DatasetBase* const input_;
@@ -110,14 +137,14 @@ class BytesProducedStatsDatasetOp : public UnaryDatasetOpKernel {
                    DatasetBase** output) override {
     string tag;
     OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "tag", &tag));
-    *output = new Dataset(input, std::move(tag));
+    *output = new Dataset(ctx, input, std::move(tag));
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    explicit Dataset(const DatasetBase* input, string tag)
-        : input_(input), tag_(std::move(tag)) {
+    explicit Dataset(OpKernelContext* ctx, const DatasetBase* input, string tag)
+        : GraphDatasetBase(ctx), input_(input), tag_(std::move(tag)) {
       input_->Ref();
     }
 
@@ -140,6 +167,17 @@ class BytesProducedStatsDatasetOp : public UnaryDatasetOpKernel {
       return "BytesProducedStatsDatasetOp::Dataset";
     }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_node;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_node));
+      Node* tag_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(tag_, &tag_node));
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {input_node, tag_node}, output));
+      return Status::OK();
+    }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
@@ -150,6 +188,7 @@ class BytesProducedStatsDatasetOp : public UnaryDatasetOpKernel {
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
+        tf_shared_lock l(mu_);
         Status s = input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
         auto stats_aggregator = ctx->stats_aggregator();
         if (stats_aggregator && s.ok() && !*end_of_sequence) {
@@ -163,8 +202,23 @@ class BytesProducedStatsDatasetOp : public UnaryDatasetOpKernel {
         return s;
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        return Status::OK();
+      }
+
      private:
-      const std::unique_ptr<IteratorBase> input_impl_;
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
     };
 
     const DatasetBase* const input_;
diff --git a/tensorflow/core/kernels/take_dataset_op.cc b/tensorflow/core/kernels/data/take_dataset_op.cc
similarity index 97%
rename from tensorflow/core/kernels/take_dataset_op.cc
rename to tensorflow/core/kernels/data/take_dataset_op.cc
index 7a6d20d6c7cb5a9bc5142e877c5c0c5285c1fd90..3bea46a747e002633a0db269434b26bad761a771 100644
--- a/tensorflow/core/kernels/take_dataset_op.cc
+++ b/tensorflow/core/kernels/data/take_dataset_op.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
@@ -101,7 +100,7 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         return Status::OK();
       }
@@ -149,7 +148,7 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("i"), &i_));
diff --git a/tensorflow/core/kernels/tensor_dataset_op.cc b/tensorflow/core/kernels/data/tensor_dataset_op.cc
similarity index 96%
rename from tensorflow/core/kernels/tensor_dataset_op.cc
rename to tensorflow/core/kernels/data/tensor_dataset_op.cc
index fe53434d176d77c0064574a044a18db05146e62d..8c8994b1c3f470532cc7c45dabde4639e841dc4b 100644
--- a/tensorflow/core/kernels/tensor_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_dataset_op.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
@@ -70,7 +69,7 @@ class TensorDatasetOp : public DatasetOpKernel {
    protected:
     Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      std::vector<NodeBuilder::NodeOut> components;
+      std::vector<Node*> components;
       components.reserve(tensors_.size());
       for (const Tensor& t : tensors_) {
         Node* node;
@@ -113,7 +112,7 @@ class TensorDatasetOp : public DatasetOpKernel {
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         produced_ = reader->Contains(full_name("produced"));
diff --git a/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc b/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ff412a4671bd0307e4975027ebd1e098353de238
--- /dev/null
+++ b/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
@@ -0,0 +1,646 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <deque>
+
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
+#include "tensorflow/core/kernels/batch_util.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+
+namespace tensorflow {
+
+namespace {
+
+bool IsGreaterEqualToOrCompatibleWith(const PartialTensorShape& a,
+                                      const PartialTensorShape& b) {
+  // Returns true if dims[a] >= dims[b], or are compatible.
+  if (a.unknown_rank()) return true;
+  if (a.dims() != b.dims()) return false;
+  for (int d = 0; d < a.dims(); ++d) {
+    if (a.dim_size(d) == -1 || b.dim_size(d) == -1) continue;
+    if (a.dim_size(d) < b.dim_size(d)) return false;
+  }
+  return true;
+}
+
+DataTypeVector PrependQueueType(const DataTypeVector& dtypes) {
+  DataTypeVector out;
+  out.reserve(dtypes.size() + 1);
+  out.push_back(DT_VARIANT);  // The queue component.
+  for (const DataType& d : dtypes) out.push_back(d);
+  return out;
+}
+
+std::vector<PartialTensorShape> PrependQueueShapeWithBatch(
+    const std::vector<PartialTensorShape>& shapes) {
+  std::vector<PartialTensorShape> out;
+  out.reserve(shapes.size() + 1);
+  out.emplace_back(PartialTensorShape({-1}));  // The queue component.
+  for (PartialTensorShape s : shapes) {
+    s.InsertDim(0, -1);  // Unknown batch size.
+    out.push_back(std::move(s));
+  }
+  return out;
+}
+
+class EnqueueInQueueDatasetOp;
+
+class PrependFromQueueAndPaddedBatchDataset : public GraphDatasetBase {
+ public:
+  PrependFromQueueAndPaddedBatchDataset(
+      OpKernelContext* ctx, const int64 batch_size, const DatasetBase* input,
+      const DataTypeVector& dtypes,
+      const std::vector<PartialTensorShape>& shapes,
+      std::vector<Tensor> padding_values)
+      : GraphDatasetBase(ctx),
+        batch_size_(batch_size),
+        input_(input),
+        dtypes_(dtypes),
+        shapes_(shapes),
+        padding_values_(std::move(padding_values)),
+        dtypes_with_queue_(PrependQueueType(dtypes)),
+        batched_shapes_with_queue_(PrependQueueShapeWithBatch(shapes)) {
+    input_->Ref();
+  }
+
+  ~PrependFromQueueAndPaddedBatchDataset() override { input_->Unref(); }
+
+  std::unique_ptr<IteratorBase> MakeIterator(
+      const string& prefix) const override {
+    return std::unique_ptr<IteratorBase>(new Iterator(
+        {this, strings::StrCat(prefix, "::PrependFromQueueAndPaddedBatch")}));
+  }
+
+  const DataTypeVector& output_dtypes() const override {
+    return dtypes_with_queue_;
+  }
+  const std::vector<PartialTensorShape>& output_shapes() const override {
+    return batched_shapes_with_queue_;
+  }
+
+  string DebugString() override {
+    return "PrependFromQueueAndPaddedBatchDatasetOp::Dataset";
+  }
+
+ protected:
+  Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                            Node** output) const override {
+    Node* input_graph = nullptr;
+    TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph));
+    Node* batch_size = nullptr;
+    TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size));
+
+    std::vector<Node*> padded_shapes;
+    padded_shapes.reserve(shapes_.size());
+    for (int i = 0; i < shapes_.size(); i++) {
+      Node* node;
+      Tensor t(DT_INT64, TensorShape({shapes_[i].dims()}));
+      for (int j = 0; j < shapes_[i].dims(); j++) {
+        t.vec<int64>()(j) = shapes_[i].dim_size(j);
+      }
+      TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+      padded_shapes.emplace_back(node);
+    }
+
+    std::vector<Node*> padding_values;
+    padding_values.reserve(padding_values_.size());
+    for (const Tensor& t : padding_values_) {
+      Node* node;
+      TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+      padding_values.emplace_back(node);
+    }
+
+    AttrValue output_types;
+    b->BuildAttrValue(dtypes_, &output_types);
+
+    AttrValue output_shapes;
+    b->BuildAttrValue(batched_shapes_with_queue_, &output_shapes);
+
+    AttrValue N;
+    b->BuildAttrValue<int64>(shapes_.size(), &N);
+
+    TF_RETURN_IF_ERROR(b->AddDataset(this, {{0, input_graph}, {1, batch_size}},
+                                     {{2, padded_shapes}, {3, padding_values}},
+                                     {{"Toutput_types", output_types},
+                                      {"output_shapes", output_shapes},
+                                      {"N", N}},
+                                     output));
+
+    return Status::OK();
+  }
+
+ private:
+  friend class EnqueueInQueueDatasetOp;
+
+  class Iterator
+      : public DatasetIterator<PrependFromQueueAndPaddedBatchDataset> {
+   public:
+    explicit Iterator(const Params& params)
+        : DatasetIterator<PrependFromQueueAndPaddedBatchDataset>(params),
+          queue_(new TensorQueue(/*input_impl*/
+                                 params.dataset->input_->MakeIterator(
+                                     params.prefix),
+                                 params.dataset->dtypes_,
+                                 params.dataset->shapes_)) {}
+
+    ~Iterator() override { queue_->Unref(); }
+
+    Status GetNextInternal(IteratorContext* ctx,
+                           std::vector<Tensor>* out_tensors,
+                           bool* end_of_sequence) override {
+      std::vector<std::vector<Tensor>> batch;
+      TF_RETURN_IF_ERROR(queue_->GetNext(ctx, dataset()->batch_size_, &batch,
+                                         end_of_sequence));
+      const auto& dtypes = dataset()->dtypes_;
+      const auto& shapes = dataset()->shapes_;
+      const auto& input_shapes = dataset()->input_->output_shapes();
+      const auto& padding_values = dataset()->padding_values_;
+      const int64 batch_size = batch.size();
+      out_tensors->reserve(dtypes.size());
+
+      std::vector<TensorShape> max_shapes;  // Of non-queue components.
+      for (int i = 0; i < dtypes.size(); ++i) {
+        const PartialTensorShape& shape = shapes[i];
+        TensorShape out_shape({batch_size});
+        for (int r = 0; r < shape.dims(); ++r) {
+          if (shape.dim_size(r) >= 0) {
+            // padded_shape[r] is known.
+            out_shape.AddDim(shape.dim_size(r));
+          } else {
+            // padded_shape[r] is unknown, find the maximum across
+            // the batch.
+            int64 dim = 0;
+            for (int b = 0; b < batch.size(); ++b) {
+              dim = std::max(dim, batch[b][i].dim_size(r));
+            }
+            out_shape.AddDim(dim);
+          }
+        }
+        max_shapes.push_back(std::move(out_shape));
+      }
+
+      Tensor queues_t(cpu_allocator(), DT_VARIANT, TensorShape({batch_size}));
+      if (!batch.empty()) {
+        auto queues = queues_t.flat<Variant>();
+        Variant& queue_inserter = queues(0);
+        queue_inserter = TensorQueueInserter();
+        queue_inserter.get<TensorQueueInserter>()->set_queue(queue_);
+        for (int b = 1; b < batch.size(); ++b) {
+          // Copy the TensorQueueInserter.  Each copy increments the
+          // Ref on the queue_.
+          queues(b) = queues(0);
+        }
+      }
+      out_tensors->push_back(std::move(queues_t));
+
+      for (int i = 0; i < max_shapes.size(); ++i) {
+        Tensor component(cpu_allocator(), dtypes[i], max_shapes[i]);
+        // Try hard to take the fast path.
+        if (shapes[i].IsFullyDefined() &&
+            shapes[i].IsIdenticalTo(input_shapes[i])) {
+          // Take the fast path if we know all the shapes statically.
+          for (int64 b = 0; b < batch.size(); ++b) {
+            TF_RETURN_IF_ERROR(
+                batch_util::CopyElementToSlice(batch[b][i], &component, b));
+          }
+        } else {
+          TF_RETURN_IF_ERROR(
+              batch_util::SetElementZero(&component, padding_values[i]));
+          for (int64 b = 0; b < batch.size(); ++b) {
+            if (batch[b][i].shape() == max_shapes[i]) {
+              TF_RETURN_IF_ERROR(
+                  batch_util::CopyElementToSlice(batch[b][i], &component, b));
+            } else {
+              TF_RETURN_IF_ERROR(batch_util::CopyElementToLargerSlice(
+                  batch[b][i], &component, b));
+            }
+          }
+        }
+        out_tensors->push_back(std::move(component));
+      }
+
+      // end_of_sequence was set before we populated out_tensors, so
+      // it's ok to return now.
+      return Status::OK();
+    }
+
+   protected:
+    // Work around bug in MSVC that disallows access to protected
+    // members of Iterator from within TensorQueue.
+    class TensorQueue;
+    friend class TensorQueue;
+
+    class TensorQueue : public core::RefCounted {
+     public:
+      TensorQueue(std::unique_ptr<IteratorBase> input_impl,
+                  const DataTypeVector& dtypes,
+                  const std::vector<PartialTensorShape>& shapes)
+          : dtypes_(dtypes),
+            shapes_(shapes),
+            input_impl_(std::move(input_impl)) {}
+
+      void MaybeWaitForNotificationLocked(mutex_lock* lock)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        // This essentially just releases the lock and immediately relocks.
+        cv_.wait_for(*lock, std::chrono::milliseconds(0));
+      }
+
+      void NotifyLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) { cv_.notify_all(); }
+
+      Status GetNext(IteratorContext* ctx, const int64 batch_size,
+                     std::vector<std::vector<Tensor>>* batch,
+                     bool* end_of_sequence) {
+        mutex_lock lock(mu_);
+
+        *end_of_sequence = false;
+
+        for (int64 b = 0; b < batch_size;) {
+          if (!entries_.empty()) {
+            batch->push_back(std::move(entries_.front()));
+            entries_.pop_front();
+            ++b;
+            continue;
+          } else {
+            if (input_impl_) {
+              // There's still input coming in.
+              std::vector<Tensor> tensors;
+              bool input_end;
+              TF_RETURN_IF_ERROR(
+                  input_impl_->GetNext(ctx, &tensors, &input_end));
+              if (!input_end) {
+                batch->push_back(std::move(tensors));
+                ++b;
+                continue;
+              } else {
+                input_impl_.reset();
+              }
+            }
+            if (!input_impl_) {
+              // There's no more input coming in.
+              if (RefCountIsOne()) {
+                // No TensorQueueInserters in the wild.
+                if (batch->empty()) {
+                  *end_of_sequence = true;
+                }
+                break;
+              } else {
+                MaybeWaitForNotificationLocked(&lock);
+                // If there's data available, try to add entries again.
+                // Otherwise return a smaller batch and hope the next
+                // iterator request has a non-empty or unused queue_.
+                if (entries_.empty()) {
+                  break;
+                }
+              }
+            }
+          }
+        }  // for (int64 b = ... batch_size)
+        return Status::OK();
+      }
+
+      Status Insert(const std::vector<Tensor>& tensors) {
+        if (tensors.size() != dtypes_.size()) {
+          return errors::InvalidArgument(
+              "TensorQueue::Insert: mismatched number of tensors.  Queue "
+              "expects ",
+              dtypes_.size(), " tensors but tried to insert ", tensors.size());
+        }
+        for (int i = 0; i < tensors.size(); ++i) {
+          if (tensors[i].dtype() != dtypes_[i]) {
+            return errors::InvalidArgument(
+                "TensorQueue::Insert: mismatched dtypes at component ", i,
+                ".  Attempted "
+                "to insert tensor of type ",
+                DataTypeString(tensors[i].dtype()),
+                " but queue expected type: ", DataTypeString(dtypes_[i]));
+          }
+          if (!shapes_[i].IsCompatibleWith(tensors[i].shape())) {
+            return errors::InvalidArgument(
+                "TensorQueue::Insert: mismatched shapes at component ", i,
+                ".  Attempted "
+                "to insert tensor with shape ",
+                tensors[i].shape().DebugString(),
+                " but queue expected shape: ", shapes_[i].DebugString());
+          }
+        }
+        mutex_lock lock(mu_);
+        entries_.push_back(tensors);
+        NotifyLocked();
+        return Status::OK();
+      }
+
+      Status Save(Iterator* iter, IteratorStateWriter* writer) {
+        mutex_lock lock(mu_);
+        if (input_impl_) {
+          TF_RETURN_IF_ERROR(iter->SaveParent(writer, input_impl_));
+        } else {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(iter->full_name("input_exhausted"), ""));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(iter->full_name("entries_size"),
+                                               entries_.size()));
+        for (int64 b = 0; b < entries_.size(); ++b) {
+          for (int i = 0; i < dtypes_.size(); ++i) {
+            TF_RETURN_IF_ERROR(
+                writer->WriteTensor(strings::StrCat(iter->full_name("entries"),
+                                                    "[", b, "][", i, "]"),
+                                    entries_[b][i]));
+          }
+        }
+        return Status::OK();
+      }
+
+      Status Restore(Iterator* iter, IteratorContext* ctx,
+                     IteratorStateReader* reader) {
+        mutex_lock l(mu_);
+        if (reader->Contains(iter->full_name("input_exhausted"))) {
+          input_impl_.reset();
+        } else {
+          input_impl_ = iter->dataset_input()->MakeIterator(iter->prefix());
+          TF_RETURN_IF_ERROR(iter->RestoreParent(ctx, reader, input_impl_));
+        }
+        entries_.clear();
+        int64 entries_size = -1;
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(iter->full_name("entries_size"), &entries_size));
+        if (entries_size < 0) {
+          return errors::DataLoss(
+              "Expected entries_size key '", iter->full_name("entries_size"),
+              "' to have nonnegative value, but saw: ", entries_size);
+        }
+        for (int64 b = 0; b < entries_size; ++b) {
+          std::vector<Tensor> entry;
+          for (int i = 0; i < dtypes_.size(); ++i) {
+            Tensor value;
+            TF_RETURN_IF_ERROR(
+                reader->ReadTensor(strings::StrCat(iter->full_name("entries"),
+                                                   "[", b, "][", i, "]"),
+                                   &value));
+            entry.push_back(std::move(value));
+          }
+          entries_.push_back(std::move(entry));
+        }
+        return Status::OK();
+      }
+
+      mutex* mu() { return &mu_; }
+
+     private:
+      DataTypeVector dtypes_;
+      std::vector<PartialTensorShape> shapes_;
+
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      std::deque<std::vector<Tensor>> entries_ GUARDED_BY(mu_);
+      condition_variable cv_ GUARDED_BY(mu_);
+    };
+
+    const DatasetBase* dataset_input() const { return dataset()->input_; }
+
+    Status SaveInternal(IteratorStateWriter* writer) override {
+      return queue_->Save(this, writer);
+    }
+
+    Status RestoreInternal(IteratorContext* ctx,
+                           IteratorStateReader* reader) override {
+      return queue_->Restore(this, ctx, reader);
+    }
+
+   public:
+    class TensorQueueInserter {
+     public:
+      TensorQueueInserter() : queue_(nullptr) {}
+
+      void set_queue(TensorQueue* queue) {
+        queue_ = queue;
+        queue_->Ref();
+      }
+
+      TensorQueueInserter(const TensorQueueInserter& rhs) {
+        queue_ = rhs.queue_;
+        queue_->Ref();
+      };
+
+      TensorQueueInserter(TensorQueueInserter&& rhs) {
+        queue_ = rhs.queue_;
+        rhs.queue_ = nullptr;
+      }
+
+      TensorQueueInserter& operator=(const TensorQueueInserter& rhs) = delete;
+
+      string TypeName() const { return "tensorflow::TensorQueueInserter"; }
+      string DebugString() const { return TypeName(); }
+
+      void Encode(VariantTensorData*) const {}
+      bool Decode(const VariantTensorData&) { return false; }
+
+      ~TensorQueueInserter() {
+        if (queue_) {
+          mutex_lock lock(*queue_->mu());
+          queue_->Unref();
+          queue_->NotifyLocked();
+          queue_ = nullptr;
+        }
+      }
+
+      Status Insert(const std::vector<Tensor>& tensors) const {
+        CHECK(queue_);
+        return queue_->Insert(tensors);
+      }
+
+     private:
+      mutable TensorQueue* queue_;
+    };
+
+   private:
+    TensorQueue* const queue_;
+  };
+
+ private:
+  const int64 batch_size_;
+  const DatasetBase* input_;
+  const DataTypeVector dtypes_;
+  const std::vector<PartialTensorShape> shapes_;
+  const std::vector<Tensor> padding_values_;
+  const DataTypeVector dtypes_with_queue_;
+  const std::vector<PartialTensorShape> batched_shapes_with_queue_;
+};
+
+class PrependFromQueueAndPaddedBatchDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit PrependFromQueueAndPaddedBatchDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("Toutput_types", &output_types_));
+  }
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 batch_size = 0;
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument<int64>(ctx, "batch_size", &batch_size));
+    OP_REQUIRES(
+        ctx, batch_size > 0,
+        errors::InvalidArgument("Batch size must be greater than zero."));
+
+    OpInputList padded_shape_tensors;
+    OP_REQUIRES_OK(ctx,
+                   ctx->input_list("padded_shapes", &padded_shape_tensors));
+    std::vector<PartialTensorShape> padded_shapes;
+    padded_shapes.reserve(padded_shape_tensors.size());
+    OP_REQUIRES(ctx,
+                padded_shape_tensors.size() == input->output_shapes().size(),
+                errors::InvalidArgument("Number of padded shapes (",
+                                        padded_shape_tensors.size(),
+                                        ") must match the number of components "
+                                        "in the input dataset's elements (",
+                                        input->output_shapes().size(), ")"));
+    for (const Tensor& padded_shape_t : padded_shape_tensors) {
+      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(padded_shape_t.shape()),
+                  errors::InvalidArgument("All padded shapes must be vectors"));
+      PartialTensorShape padded_shape;
+      OP_REQUIRES_OK(ctx, PartialTensorShape::MakePartialShape(
+                              padded_shape_t.vec<int64>().data(),
+                              padded_shape_t.NumElements(), &padded_shape));
+      padded_shapes.push_back(std::move(padded_shape));
+    }
+
+    OP_REQUIRES(
+        ctx, input->output_dtypes() == output_types_,
+        errors::InvalidArgument("Input dataset and this dataset "
+                                "have different output_types: ",
+                                DataTypeVectorString(input->output_dtypes()),
+                                " and ", DataTypeVectorString(output_types_)));
+
+    for (int i = 0; i < input->output_shapes().size(); ++i) {
+      // Exclude the queue from the tensor_shapes calculation.
+      const PartialTensorShape& tensor_shape = padded_shapes[i];
+      OP_REQUIRES(
+          ctx,
+          IsGreaterEqualToOrCompatibleWith(tensor_shape,
+                                           input->output_shapes()[i]),
+          errors::InvalidArgument("Incompatible input shapes at component ", i,
+                                  " between input dataset this dataset: ",
+                                  input->output_shapes()[i].DebugString(),
+                                  " vs. ", tensor_shape.DebugString()));
+    }
+
+    OpInputList padding_values_list;
+    OP_REQUIRES_OK(ctx,
+                   ctx->input_list("padding_values", &padding_values_list));
+    std::vector<Tensor> padding_values;
+    OP_REQUIRES(ctx,
+                padding_values_list.size() == input->output_shapes().size(),
+                errors::InvalidArgument(
+                    "Number of padding values (", padding_values_list.size(),
+                    ") must match the number of components in the input "
+                    "dataset's elements (",
+                    input->output_shapes().size(), ")"));
+    for (int i = 0; i < padding_values_list.size(); ++i) {
+      const Tensor& padding_value_t = padding_values_list[i];
+      OP_REQUIRES(
+          ctx, TensorShapeUtils::IsScalar(padding_value_t.shape()),
+          errors::InvalidArgument(
+              "All padding values must be scalars; but at component ", i,
+              " saw shape: ", padding_value_t.shape().DebugString()));
+      OP_REQUIRES(ctx, padding_value_t.dtype() == input->output_dtypes()[i],
+                  errors::InvalidArgument(
+                      "Mismatched type between padding value ", i,
+                      " and input dataset's component ", i, ": ",
+                      DataTypeString(padding_value_t.dtype()), " vs. ",
+                      DataTypeString(input->output_dtypes()[i])));
+      padding_values.push_back(padding_value_t);
+    }
+
+    *output = new PrependFromQueueAndPaddedBatchDataset(
+        ctx, batch_size, input, output_types_, padded_shapes,
+        std::move(padding_values));
+  }
+
+ private:
+  DataTypeVector output_types_;
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("PrependFromQueueAndPaddedBatchDataset").Device(DEVICE_CPU),
+    PrependFromQueueAndPaddedBatchDatasetOp);
+
+class EnqueueInQueueDatasetOp : public OpKernel {
+ public:
+  explicit EnqueueInQueueDatasetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  void Compute(OpKernelContext* ctx) override {
+    using TensorQueueInserter =
+        PrependFromQueueAndPaddedBatchDataset::Iterator::TensorQueueInserter;
+
+    // TODO(ebrevdo): accept list of sequence lengths to do proper
+    // sub-slicing of tensors for placement into the queue?
+    const Tensor& tensor_queue_t = ctx->input(0);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(tensor_queue_t.shape()),
+                errors::InvalidArgument("queue must be a vector, saw shape: ",
+                                        tensor_queue_t.shape().DebugString()));
+    std::vector<const TensorQueueInserter*> inserters;
+    const int64 batch_size = tensor_queue_t.NumElements();
+    inserters.reserve(batch_size);
+    const Variant* variants = tensor_queue_t.flat<Variant>().data();
+    for (int i = 0; i < batch_size; ++i) {
+      const auto* inserter = variants[i].get<TensorQueueInserter>();
+      OP_REQUIRES(ctx, inserter != nullptr,
+                  errors::InvalidArgument(
+                      "Could not access TensorQueueInserter from queue[", i,
+                      "].  Received variant: ", variants[i].DebugString()));
+      inserters.push_back(inserter);
+    }
+
+    OpInputList components;
+    OP_REQUIRES_OK(ctx, ctx->input_list("components", &components));
+    for (int i = 0; i < components.size(); ++i) {
+      OP_REQUIRES(
+          ctx,
+          components[i].dims() > 0 && components[i].dim_size(0) == batch_size,
+          errors::InvalidArgument(
+              "Expected component ", i, " to have batched shape [", batch_size,
+              ",...], but saw shape: ", components[i].shape().DebugString()));
+    }
+    std::vector<TensorShape> element_shapes;
+    for (int i = 0; i < components.size(); ++i) {
+      TensorShape element_shape = components[i].shape();
+      element_shape.RemoveDim(0);
+      element_shapes.push_back(std::move(element_shape));
+    }
+    for (int64 b = 0; b < batch_size; ++b) {
+      std::vector<Tensor> tensors;
+      tensors.reserve(components.size());
+      for (int i = 0; i < components.size(); ++i) {
+        Tensor t(components[i].dtype(), element_shapes[i]);
+        OP_REQUIRES_OK(ctx,
+                       batch_util::CopySliceToElement(components[i], &t, b));
+        tensors.push_back(std::move(t));
+      }
+      // TODO(ebrevdo): Acquire the lock once for all inserters with
+      // the same underlying queue?  Add InsertLocked?
+      OP_REQUIRES_OK(ctx, inserters[b]->Insert(tensors));
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("EnqueueInQueueDataset").Device(DEVICE_CPU),
+                        EnqueueInQueueDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
similarity index 77%
rename from tensorflow/core/kernels/tensor_slice_dataset_op.cc
rename to tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
index e85f59b584720cae0f00cf45a265862e688b157c..d5be4c778074e406122dc3a1a9c23681fca491d0 100644
--- a/tensorflow/core/kernels/tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/batch_util.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
@@ -86,7 +86,7 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
    protected:
     Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      std::vector<NodeBuilder::NodeOut> components;
+      std::vector<Node*> components;
       components.reserve(tensors_.size());
       for (const Tensor& t : tensors_) {
         Node* node;
@@ -101,41 +101,6 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
     }
 
    private:
-    template <typename T>
-    static Status HandleSliceToElement(const Tensor& parent, Tensor* element,
-                                       int64 index) {
-      DCHECK_NE(parent.dim_size(0), 0);
-      DCHECK_GE(index, 0);
-      if (element->NumElements() !=
-          (parent.NumElements() / parent.dim_size(0))) {
-        TensorShape chip_shape = parent.shape();
-        chip_shape.RemoveDim(0);
-        return errors::Internal(
-            "HandleSliceToElement Cannot copy slice: number of elements does "
-            "not match.  Shapes are: [element]: ",
-            element->shape().DebugString(), ", [parent slice]: ",
-            chip_shape.DebugString());
-      }
-      auto parent_as_matrix = parent.flat_outer_dims<T>();
-      element->flat<T>() = parent_as_matrix.chip(index, 0);
-      return Status::OK();
-    }
-
-    static Status CopySliceToElement(const Tensor& parent, Tensor* element,
-                                     int64 index) {
-#define HANDLE_TYPE(T)                                      \
-  case DataTypeToEnum<T>::value: {                          \
-    return HandleSliceToElement<T>(parent, element, index); \
-  }
-
-      switch (parent.dtype()) {
-        TF_CALL_DATASET_TYPES(HANDLE_TYPE);
-        default:
-          return errors::Unimplemented(
-              "CopySliceToElement Unhandled data type: ", element->dtype());
-      }
-    }
-
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
@@ -152,9 +117,9 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
           out_tensors->reserve(dataset()->tensors_.size());
           for (int i = 0; i < dataset()->tensors_.size(); ++i) {
             const Tensor& t = dataset()->tensors_[i];
-            Tensor t_slice(cpu_allocator(), t.dtype(),
+            Tensor t_slice(ctx->allocator({}), t.dtype(),
                            TensorShape(dataset()->shapes_[i].dim_sizes()));
-            TF_RETURN_IF_ERROR(CopySliceToElement(t, &t_slice, i_));
+            TF_RETURN_IF_ERROR(batch_util::CopySliceToElement(t, &t_slice, i_));
             out_tensors->emplace_back(std::move(t_slice));
           }
           ++i_;
@@ -172,7 +137,7 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("i"), &i_));
diff --git a/tensorflow/core/kernels/data/unique_dataset_op.cc b/tensorflow/core/kernels/data/unique_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7726ee0edf71b34cb65fe5fceb2b60dd30bb58e2
--- /dev/null
+++ b/tensorflow/core/kernels/data/unique_dataset_op.cc
@@ -0,0 +1,219 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/lib/hash/hash.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class UniqueDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit UniqueDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    OP_REQUIRES(ctx, input->output_dtypes().size() == 1,
+                errors::InvalidArgument("UniqueDataset only supports "
+                                        "inputs with a single component."));
+
+    DataType input_dtype = input->output_dtypes()[0];
+    OP_REQUIRES(ctx,
+                input_dtype == DT_INT32 || input_dtype == DT_INT64 ||
+                    input_dtype == DT_STRING,
+                errors::InvalidArgument(
+                    "UniqueDataset only supports inputs with a single "
+                    "`tf.int32`, `tf.int64`, or `tf.string` component."));
+
+    *output = new Dataset(ctx, input);
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input)
+        : GraphDatasetBase(ctx), input_(input) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::Unique")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() override {
+      return strings::StrCat("UniqueDatasetOp::Dataset");
+    }
+
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {input_graph_node}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const typename Iterator::Params& params)
+          : DatasetIterator<Dataset>(params),
+            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        bool saw_new_value;
+        do {
+          saw_new_value = false;
+          out_tensors->clear();
+          TF_RETURN_IF_ERROR(
+              input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
+          if (*end_of_sequence) {
+            break;
+          }
+          DCHECK_EQ(1, out_tensors->size());
+          saw_new_value = unique_elements_.insert((*out_tensors)[0]).second;
+        } while (!saw_new_value);
+        return Status::OK();
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (input_impl_) {
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        } else {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impl_empty"), ""));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name("unique_elements_size"), unique_elements_.size()));
+        size_t i = 0;
+        for (const Tensor& t : unique_elements_) {
+          TF_RETURN_IF_ERROR(writer->WriteTensor(
+              full_name(strings::StrCat("unique_elements[", i++, "]")), t));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (!reader->Contains(full_name("input_impl_empty"))) {
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        } else {
+          input_impl_.reset();
+        }
+        int64 num_unique_elements;
+        unique_elements_.clear();
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("unique_elements_size"),
+                                              &num_unique_elements));
+        for (int64 i = 0; i < num_unique_elements; ++i) {
+          Tensor unique_element;
+          TF_RETURN_IF_ERROR(reader->ReadTensor(
+              full_name(strings::StrCat("unique_elements[", i, "]")),
+              &unique_element));
+          auto insert_result = unique_elements_.insert(unique_element);
+          if (!insert_result.second) {
+            return errors::InvalidArgument(
+                "Checkpoint contained two unique elements with the same "
+                "value.");
+          }
+        }
+        return Status::OK();
+      }
+
+     private:
+      struct TensorHash {
+        size_t operator()(const Tensor& t) const {
+          if (t.dtype() == DT_INT32 || t.dtype() == DT_INT64) {
+            return Hash64(t.tensor_data().data(), t.tensor_data().size());
+          } else {
+            DCHECK_EQ(DT_STRING, t.dtype());
+            auto flat_t = t.flat<string>();
+            uint64 hash = 0;
+            for (int64 i = 0; i < t.NumElements(); ++i) {
+              hash = Hash64Combine(hash, Hash64(flat_t(i)));
+            }
+            return static_cast<size_t>(hash);
+          }
+        }
+      };
+
+      struct TensorKeyEqual {
+        bool operator()(const Tensor& lhs, const Tensor& rhs) const {
+          if (lhs.shape() != rhs.shape() || lhs.dtype() != rhs.dtype()) {
+            return false;
+          }
+          switch (lhs.dtype()) {
+#define HANDLE_TYPE(T)                                     \
+  case T:                                                  \
+    do {                                                   \
+      auto lhs_flat = lhs.flat<EnumToDataType<T>::Type>(); \
+      auto rhs_flat = rhs.flat<EnumToDataType<T>::Type>(); \
+      for (int64 i = 0; i < lhs.NumElements(); ++i) {      \
+        if (lhs_flat(i) != rhs_flat(i)) {                  \
+          return false;                                    \
+        }                                                  \
+      }                                                    \
+      return true;                                         \
+    } while (0)
+
+            HANDLE_TYPE(DT_INT32);
+            HANDLE_TYPE(DT_INT64);
+            HANDLE_TYPE(DT_STRING);
+            default:
+              LOG(FATAL) << "UniqueDataset unhandled data type: "
+                         << DataTypeString(lhs.dtype());
+          }
+        }
+      };
+
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      std::unordered_set<Tensor, TensorHash, TensorKeyEqual> unique_elements_
+          GUARDED_BY(mu_);
+    };
+
+    const DatasetBase* const input_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("UniqueDataset").Device(DEVICE_CPU),
+                        UniqueDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/window_dataset.cc b/tensorflow/core/kernels/data/window_dataset.cc
similarity index 84%
rename from tensorflow/core/kernels/window_dataset.cc
rename to tensorflow/core/kernels/data/window_dataset.cc
index 77345fd3dfb7e39184605ed1bb4cab3251a62ea1..e24bdea4ac70b76edb926419fa9180f13cf51fb0 100644
--- a/tensorflow/core/kernels/window_dataset.cc
+++ b/tensorflow/core/kernels/data/window_dataset.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/window_dataset.h"
+#include "tensorflow/core/kernels/data/window_dataset.h"
 
 namespace tensorflow {
 namespace {
@@ -59,6 +59,21 @@ class WindowDataset : public DatasetBase {
       return Status::OK();
     }
 
+    Status SaveInternal(IteratorStateWriter* writer) override {
+      mutex_lock l(mu_);
+      TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_));
+      return Status::OK();
+    }
+
+    Status RestoreInternal(IteratorContext* ctx,
+                           IteratorStateReader* reader) override {
+      mutex_lock l(mu_);
+      int64 i;
+      TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("i"), &i));
+      i_ = size_t(i);
+      return Status::OK();
+    }
+
     mutex mu_;
     size_t i_ GUARDED_BY(mu_) = 0;
   };
diff --git a/tensorflow/core/kernels/window_dataset.h b/tensorflow/core/kernels/data/window_dataset.h
similarity index 88%
rename from tensorflow/core/kernels/window_dataset.h
rename to tensorflow/core/kernels/data/window_dataset.h
index a4fccf17b4c7cc064c1aec57554bb88bb7b59578..97c31668acba8869f1f5947acbbb4069c4adccb0 100644
--- a/tensorflow/core/kernels/window_dataset.h
+++ b/tensorflow/core/kernels/data/window_dataset.h
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_WINDOW_DATASET_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_WINDOW_DATASET_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_WINDOW_DATASET_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_WINDOW_DATASET_H_
 
 #include <vector>
 
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/dataset.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
@@ -45,4 +45,4 @@ Status NewWindowDataset(std::vector<std::vector<Tensor>> elements,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_WINDOW_DATASET_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_WINDOW_DATASET_H_
diff --git a/tensorflow/core/kernels/zip_dataset_op.cc b/tensorflow/core/kernels/data/zip_dataset_op.cc
similarity index 97%
rename from tensorflow/core/kernels/zip_dataset_op.cc
rename to tensorflow/core/kernels/data/zip_dataset_op.cc
index 9381915ae9894a91a7418ade2c8648e407b8735b..0f79eac94710fafd3cbf5686876f629dac7bac09 100644
--- a/tensorflow/core/kernels/zip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/zip_dataset_op.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 
@@ -80,7 +79,7 @@ class ZipDatasetOp : public DatasetOpKernel {
    protected:
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      std::vector<NodeBuilder::NodeOut> input_graph_nodes;
+      std::vector<Node*> input_graph_nodes;
       input_graph_nodes.reserve(inputs_.size());
       for (const auto& input : inputs_) {
         Node* input_node;
@@ -145,7 +144,7 @@ class ZipDatasetOp : public DatasetOpKernel {
         return Status::OK();
       }
 
-      Status RestoreInternal(OpKernelContext* ctx,
+      Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         if (reader->Contains(full_name("input_impls_empty"))) {
diff --git a/tensorflow/core/kernels/data_format_ops.cc b/tensorflow/core/kernels/data_format_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fa67545a0dad0332cce55c173fc39ba25c055902
--- /dev/null
+++ b/tensorflow/core/kernels/data_format_ops.cc
@@ -0,0 +1,176 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/data_format_ops.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class DataFormatDimMapOp : public OpKernel {
+ public:
+  explicit DataFormatDimMapOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string src_format;
+    OP_REQUIRES_OK(context, context->GetAttr("src_format", &src_format));
+    string dst_format;
+    OP_REQUIRES_OK(context, context->GetAttr("dst_format", &dst_format));
+    OP_REQUIRES(
+        context, src_format == "NHWC",
+        errors::InvalidArgument(strings::StrCat(
+            "Current implementation doesn't support source data format ",
+            src_format)));
+    OP_REQUIRES(context, dst_format == "NCHW",
+                errors::InvalidArgument(strings::StrCat(
+                    "Current implementation doesn't support dst data format ",
+                    dst_format)));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input.shape(), &output));
+    functor::DataFormatDimMap<Device, T>()(context->eigen_device<Device>(),
+                                           input.flat<T>(), output->flat<T>());
+  }
+};
+
+template <typename Device, typename T>
+class DataFormatVecPermuteOp : public OpKernel {
+ public:
+  explicit DataFormatVecPermuteOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string src_format;
+    OP_REQUIRES_OK(context, context->GetAttr("src_format", &src_format));
+    string dst_format;
+    OP_REQUIRES_OK(context, context->GetAttr("dst_format", &dst_format));
+    OP_REQUIRES(context,
+                (src_format == "NHWC" && dst_format == "NCHW") ||
+                    (src_format == "NCHW" && dst_format == "NHWC"),
+                errors::InvalidArgument(strings::StrCat(
+                    "Current implementation only supports NCHW-to-NHWC and "
+                    "NHWC-to-NCHW format conversion; got source format ",
+                    src_format, " and destination format ", dst_format)));
+    nhwc_to_nchw_ = (src_format == "NHWC") ? true : false;
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+    OP_REQUIRES(context, input.dims() == 1 || input.dims() == 2,
+                errors::InvalidArgument(
+                    "input must be a vector or 2D tensor, but got shape ",
+                    input.shape().DebugString()));
+    if (input.dims() == 1) {
+      OP_REQUIRES(
+          context, input.NumElements() == 4,
+          errors::InvalidArgument("1D input must be of size 4, but got shape ",
+                                  input.shape().DebugString()));
+    } else if (input.dims() == 2) {
+      OP_REQUIRES(
+          context, input.dim_size(0) == 4,
+          errors::InvalidArgument(
+              "First dimension of 2D input must be of size 4, but got shape ",
+              input.shape().DebugString()));
+      OP_REQUIRES(
+          context, input.dim_size(1) == 2,
+          errors::InvalidArgument(
+              "Second dimension of 2D input must be of size 2, but got shape ",
+              input.shape().DebugString()));
+    }
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input.shape(), &output));
+    functor::DataFormatVecPermute<Device, T>()(
+        context->eigen_device<Device>(), input.flat<T>(), output->flat<T>(),
+        nhwc_to_nchw_);
+  }
+
+ private:
+  bool nhwc_to_nchw_;
+};
+
+#define REGISTER_KERNEL(T)                                                \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("DataFormatDimMap").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      DataFormatDimMapOp<CPUDevice, T>);
+TF_CALL_int32(REGISTER_KERNEL);
+TF_CALL_int64(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+
+#define REGISTER_KERNEL(T)                                                    \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("DataFormatVecPermute").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      DataFormatVecPermuteOp<CPUDevice, T>);
+TF_CALL_int32(REGISTER_KERNEL);
+TF_CALL_int64(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                \
+  template <>                                              \
+  void DataFormatDimMap<GPUDevice, T>::operator()(         \
+      const GPUDevice& d, typename TTypes<T>::ConstFlat x, \
+      typename TTypes<T>::Flat y);                         \
+  extern template struct DataFormatDimMap<GPUDevice, T>;
+#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);
+TF_CALL_int32(DECLARE_GPU_SPECS);
+TF_CALL_int64(DECLARE_GPU_SPECS);
+#undef DECLARE_GPU_SPEC
+
+#define DECLARE_GPU_SPEC(T)                                \
+  template <>                                              \
+  void DataFormatVecPermute<GPUDevice, T>::operator()(     \
+      const GPUDevice& d, typename TTypes<T>::ConstFlat x, \
+      typename TTypes<T>::Vec y, bool nhwc_to_nchw);       \
+  extern template struct DataFormatVecPermute<GPUDevice, T>;
+#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);
+TF_CALL_int32(DECLARE_GPU_SPECS);
+TF_CALL_int64(DECLARE_GPU_SPECS);
+#undef DECLARE_GPU_SPEC
+}  // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_GPU_KERNEL(T)                                            \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("DataFormatDimMap").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      DataFormatDimMapOp<GPUDevice, T>);
+TF_CALL_int32(REGISTER_GPU_KERNEL);
+TF_CALL_int64(REGISTER_GPU_KERNEL);
+#undef REGISTER_GPU_KERNEL
+
+#define REGISTER_GPU_KERNEL(T)                                                \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("DataFormatVecPermute").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      DataFormatVecPermuteOp<GPUDevice, T>);
+TF_CALL_int32(REGISTER_GPU_KERNEL);
+TF_CALL_int64(REGISTER_GPU_KERNEL);
+#undef REGISTER_GPU_KERNEL
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data_format_ops.h b/tensorflow/core/kernels/data_format_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf704cc35cf2ff18b38202db5d192b460b415fbb
--- /dev/null
+++ b/tensorflow/core/kernels/data_format_ops.h
@@ -0,0 +1,116 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_DATA_FORMAT_OPS_H_
+#define TENSORFLOW_KERNELS_DATA_FORMAT_OPS_H_
+// Functor definition for data format dim mapping ops, must be compilable
+// by nvcc.
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by DataFormatDimMapOP to do the computations.
+template <typename Device, typename T>
+struct DataFormatDimMap {
+  void operator()(const Device& d, typename TTypes<T>::ConstFlat x,
+                  typename TTypes<T>::Flat y) {
+    auto zero = x.constant(0);
+    auto one = x.constant(1);
+    auto three = x.constant(3);
+    auto four = x.constant(4);
+    auto x_mod = (x + four) % 4;
+    auto is_zero = (x_mod == zero);
+    auto is_three = (x_mod == three);
+    y.device(d) = is_zero.select(zero, is_three.select(one, x_mod + one));
+  }
+};
+
+template <typename T>
+struct VecPermuteNHWCToNCHW {
+  Eigen::DSizes<Eigen::DenseIndex, 1> dimensions(
+      typename TTypes<T>::ConstFlat input) const {
+    Eigen::DSizes<Eigen::DenseIndex, 1> result;
+    result[0] = input.dimension(0);
+    return result;
+  }
+  template <typename Output, typename Device>
+  void eval(typename TTypes<T>::ConstFlat input, Output& output,
+            const Device& d) const {
+    if (input.size() == 8) {
+      output.template chip<0>(0).device(d) = input.template chip<0>(0);
+      output.template chip<0>(1).device(d) = input.template chip<0>(1);
+      output.template chip<0>(2).device(d) = input.template chip<0>(6);
+      output.template chip<0>(3).device(d) = input.template chip<0>(7);
+      output.template chip<0>(4).device(d) = input.template chip<0>(2);
+      output.template chip<0>(5).device(d) = input.template chip<0>(3);
+      output.template chip<0>(6).device(d) = input.template chip<0>(4);
+      output.template chip<0>(7).device(d) = input.template chip<0>(5);
+    } else {
+      output.template chip<0>(0).device(d) = input.template chip<0>(0);
+      output.template chip<0>(1).device(d) = input.template chip<0>(3);
+      output.template chip<0>(2).device(d) = input.template chip<0>(1);
+      output.template chip<0>(3).device(d) = input.template chip<0>(2);
+    }
+  }
+};
+
+template <typename T>
+struct VecPermuteNCHWToNHWC {
+  Eigen::DSizes<Eigen::DenseIndex, 1> dimensions(
+      typename TTypes<T>::ConstFlat input) const {
+    Eigen::DSizes<Eigen::DenseIndex, 1> result;
+    result[0] = input.dimension(0);
+    return result;
+  }
+  template <typename Output, typename Device>
+  void eval(typename TTypes<T>::ConstFlat input, Output& output,
+            const Device& d) const {
+    if (input.size() == 8) {
+      output.template chip<0>(0).device(d) = input.template chip<0>(0);
+      output.template chip<0>(1).device(d) = input.template chip<0>(1);
+      output.template chip<0>(2).device(d) = input.template chip<0>(4);
+      output.template chip<0>(3).device(d) = input.template chip<0>(5);
+      output.template chip<0>(4).device(d) = input.template chip<0>(6);
+      output.template chip<0>(5).device(d) = input.template chip<0>(7);
+      output.template chip<0>(6).device(d) = input.template chip<0>(2);
+      output.template chip<0>(7).device(d) = input.template chip<0>(3);
+    } else {
+      output.template chip<0>(0).device(d) = input.template chip<0>(0);
+      output.template chip<0>(1).device(d) = input.template chip<0>(2);
+      output.template chip<0>(2).device(d) = input.template chip<0>(3);
+      output.template chip<0>(3).device(d) = input.template chip<0>(1);
+    }
+  }
+};
+
+// Functor used by DataFormatVecPermuteOp to do the computations.
+template <typename Device, typename T>
+struct DataFormatVecPermute {
+  void operator()(const Device& d, typename TTypes<T>::ConstFlat x,
+                  typename TTypes<T>::Flat y, bool nhwc_to_nchw) {
+    if (nhwc_to_nchw) {
+      y.device(d) = x.customOp(VecPermuteNHWCToNCHW<T>());
+    } else {
+      y.device(d) = x.customOp(VecPermuteNCHWToNHWC<T>());
+    }
+  }
+};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_DATA_FORMAT_OPS_H_
diff --git a/tensorflow/core/kernels/data_format_ops_gpu.cu.cc b/tensorflow/core/kernels/data_format_ops_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..38ce7c28fea662cea7004c47a46c0031875e3c36
--- /dev/null
+++ b/tensorflow/core/kernels/data_format_ops_gpu.cu.cc
@@ -0,0 +1,33 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/data_format_ops.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+template struct functor::DataFormatDimMap<GPUDevice, int32>;
+template struct functor::DataFormatDimMap<GPUDevice, int64>;
+template struct functor::DataFormatVecPermute<GPUDevice, int32>;
+template struct functor::DataFormatVecPermute<GPUDevice, int64>;
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/dataset.cc b/tensorflow/core/kernels/dataset.cc
deleted file mode 100644
index fcfa2956f782fc9617448ad75e53b7c36963d222..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/dataset.cc
+++ /dev/null
@@ -1,133 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/kernels/dataset.h"
-
-namespace tensorflow {
-
-namespace {
-
-// A wrapper class for storing a `DatasetBase` instance in a DT_VARIANT tensor.
-// Objects of the wrapper class own a reference on an instance of `DatasetBase`,
-// and the wrapper's copy constructor and destructor take care of managing the
-// reference count.
-//
-// NOTE(mrry): This is not a feature-complete implementation of the DT_VARIANT
-// specification. In particular, we cannot currently serialize an arbitrary
-// `DatasetBase` object, so the `Encode()` and `Decode()` methods are not
-// implemented.
-class DatasetVariantWrapper {
- public:
-  DatasetVariantWrapper() : dataset_(nullptr) {}
-
-  // Transfers ownership of `dataset` to `*this`.
-  explicit DatasetVariantWrapper(DatasetBase* dataset) : dataset_(dataset) {}
-
-  DatasetVariantWrapper(const DatasetVariantWrapper& other)
-      : dataset_(other.dataset_) {
-    if (dataset_) dataset_->Ref();
-  }
-
-  ~DatasetVariantWrapper() {
-    if (dataset_) dataset_->Unref();
-  }
-
-  DatasetBase* get() const { return dataset_; }
-
-  string TypeName() const { return "tensorflow::DatasetVariantWrapper"; }
-  string DebugString() const {
-    if (dataset_) {
-      return dataset_->DebugString();
-    } else {
-      return "<Uninitialized DatasetVariantWrapper>";
-    }
-  }
-  void Encode(VariantTensorData* data) const {
-    LOG(ERROR) << "The Encode() method is not implemented for "
-                  "DatasetVariantWrapper objects.";
-  }
-  bool Decode(const VariantTensorData& data) {
-    LOG(ERROR) << "The Decode() method is not implemented for "
-                  "DatasetVariantWrapper objects.";
-    return false;
-  }
-
- private:
-  DatasetBase* const dataset_;  // Owns one reference.
-};
-
-}  // namespace
-
-Status GetDatasetFromVariantTensor(const Tensor& tensor,
-                                   DatasetBase** out_dataset) {
-  if (!(tensor.dtype() == DT_VARIANT ||
-        TensorShapeUtils::IsScalar(tensor.shape()))) {
-    return errors::InvalidArgument(
-        "Dataset tensor must be a scalar of dtype DT_VARIANT.");
-  }
-  const Variant& variant = tensor.scalar<Variant>()();
-  const DatasetVariantWrapper* wrapper = variant.get<DatasetVariantWrapper>();
-  if (wrapper == nullptr) {
-    return errors::InvalidArgument("Tensor must be a Dataset object.");
-  }
-  *out_dataset = wrapper->get();
-  if (*out_dataset == nullptr) {
-    return errors::Internal("Read uninitialized Dataset variant.");
-  }
-  return Status::OK();
-}
-
-Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor) {
-  if (!(tensor->dtype() == DT_VARIANT ||
-        TensorShapeUtils::IsScalar(tensor->shape()))) {
-    return errors::InvalidArgument(
-        "Dataset tensor must be a scalar of dtype DT_VARIANT.");
-  }
-  tensor->scalar<Variant>()() = DatasetVariantWrapper(dataset);
-  return Status::OK();
-}
-
-void DatasetOpKernel::Compute(OpKernelContext* ctx) {
-  DatasetBase* dataset = nullptr;
-  MakeDataset(ctx, &dataset);
-  if (ctx->status().ok()) {
-    Tensor* output = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
-    OP_REQUIRES_OK(ctx, StoreDatasetInVariantTensor(dataset, output));
-  }
-}
-
-void UnaryDatasetOpKernel::MakeDataset(OpKernelContext* ctx,
-                                       DatasetBase** output) {
-  DatasetBase* input;
-  OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &input));
-  MakeDataset(ctx, input, output);
-}
-
-void BinaryDatasetOpKernel::MakeDataset(OpKernelContext* ctx,
-                                        DatasetBase** output) {
-  DatasetBase* input;
-  OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &input));
-  DatasetBase* another_input;
-  OP_REQUIRES_OK(ctx,
-                 GetDatasetFromVariantTensor(ctx->input(1), &another_input));
-  MakeDataset(ctx, input, another_input, output);
-}
-
-const char GraphDatasetBase::kDatasetGraphKey[] = "_DATASET_GRAPH";
-const char GraphDatasetBase::kDatasetGraphOutputNodeKey[] =
-    "_DATASET_GRAPH_OUTPUT_NODE";
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/dataset.h b/tensorflow/core/kernels/dataset.h
index afbebb0692d0a9bd246a77be7dc0ba2eae06b112..69ab78d6355dc2e22c7d77b62123fc0bd2359fc4 100644
--- a/tensorflow/core/kernels/dataset.h
+++ b/tensorflow/core/kernels/dataset.h
@@ -12,680 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATASET_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATASET_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATASET_H_
+#define TENSORFLOW_CORE_KERNELS_DATASET_H_
 
-#include <memory>
+#include "tensorflow/core/kernels/data/dataset.h"
 
-#include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/attr_value_util.h"
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/variant_encode_decode.h"
-#include "tensorflow/core/framework/variant_tensor_data.h"
-#include "tensorflow/core/graph/graph_def_builder.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/tracing.h"
-
-// Polymorphic datasets should support all primitive TensorFlow
-// types. Use this macro to expand `m(T)` once for each primitive type
-// `T`, e.g. to build a `switch` statement.
-#define TF_CALL_DATASET_TYPES(m) TF_CALL_ALL_TYPES(m) TF_CALL_QUANTIZED_TYPES(m)
-
-namespace tensorflow {
-
-// Interface for reading values from a key-value store.
-// Used for restoring iterator state.
-class IteratorStateReader {
- public:
-  virtual Status ReadScalar(StringPiece key, int64* val) = 0;
-  virtual Status ReadScalar(StringPiece key, string* val) = 0;
-  virtual Status ReadTensor(StringPiece key, Tensor* val) = 0;
-  virtual bool Contains(StringPiece key) = 0;
-
-  virtual ~IteratorStateReader() {}
-};
-
-// Interface for writing values to a key-value store.
-// Used for saving iterator state.
-class IteratorStateWriter {
- public:
-  virtual Status WriteScalar(StringPiece key, const int64 val) = 0;
-  virtual Status WriteScalar(StringPiece key, const string& val) = 0;
-  virtual Status WriteTensor(StringPiece key, const Tensor& val) = 0;
-
-  virtual ~IteratorStateWriter() {}
-};
-
-// Wrapper around GraphDefBuilder. Used to serialize Dataset graph.
-class GraphDefBuilderWrapper {
- public:
-  explicit GraphDefBuilderWrapper(GraphDefBuilder* b) : b_(b) {}
-
-  // Adds a Const node with scalar value to the Graph.
-  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
-  // non-null if the method returns with an OK status.
-  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
-  template <typename T>
-  Status AddScalar(const T& val, Node** output) {
-    Tensor val_t = Tensor(DataTypeToEnum<T>::v(), TensorShape({}));
-    val_t.scalar<T>()() = val;
-    AddTensorInternal(val_t, output);
-    if (*output == nullptr) {
-      return errors::Internal("AddScalar: Failed to build Const op.");
-    }
-    return Status::OK();
-  }
-
-  // Adds a Const node with vector value to the Graph.
-  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
-  // non-null if the method returns with an OK status.
-  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
-  // TODO(shivaniagrawal): Consider changing to gtl::ArraySlice?
-  template <typename T>
-  Status AddVector(const std::vector<T>& val, Node** output) {
-    Tensor val_t = Tensor(DataTypeToEnum<T>::v(),
-                          TensorShape({static_cast<int64>(val.size())}));
-    for (int i = 0; i < val.size(); i++) {
-      val_t.flat<T>()(i) = val[i];
-    }
-    AddTensorInternal(val_t, output);
-    if (*output == nullptr) {
-      return errors::Internal("AddVector: Failed to build Const op.");
-    }
-    return Status::OK();
-  }
-
-  // Adds a Const node with Tensor value to the Graph.
-  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
-  // non-null if the method returns with an OK status.
-  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
-  Status AddTensor(const Tensor& val, Node** output) {
-    AddTensorInternal(val, output);
-    if (*output == nullptr) {
-      return errors::Internal("AddTesor: Failed to build Const op.");
-    }
-    return Status::OK();
-  }
-
-  template <class DatasetType>
-  Status AddDataset(const DatasetType* dataset,
-                    const std::vector<NodeBuilder::NodeOut>& inputs,
-                    Node** output) {
-    return AddDataset(dataset, inputs, {}, output);
-  }
-
-  // Adds a node corresponding to the `DatasetType` to the Graph.
-  // Return value of `DatasetType::op_name()` is used as the op type for the
-  // node.
-  // Values for the output_types and output_shapes node attributes are also
-  // written if those attributes are defined in the OpDef.
-  // `*output` contains a pointer to the output `Node`. It is guaranteed to be
-  // non-null if the method returns with an OK status.
-  // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
-  template <class DatasetType>
-  Status AddDataset(const DatasetType* dataset,
-                    const std::vector<NodeBuilder::NodeOut>& inputs,
-                    const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
-                    Node** output) {
-    std::vector<std::pair<size_t, NodeBuilder::NodeOut>> enumerated_inputs(
-        inputs.size());
-    for (int i = 0; i < inputs.size(); i++) {
-      enumerated_inputs[i] = std::make_pair(i, inputs[i]);
-    }
-    return AddDataset(dataset, enumerated_inputs, {}, attrs, output);
-  }
-
-  template <class DatasetType>
-  Status AddDataset(
-      const DatasetType* dataset,
-      const std::vector<std::pair<size_t, NodeBuilder::NodeOut>>& inputs,
-      const std::vector<
-          std::pair<size_t, gtl::ArraySlice<NodeBuilder::NodeOut>>>&
-          list_inputs,
-      const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
-      Node** output) {
-    const string& op_type_name = dataset->op_name();
-    std::unique_ptr<const GraphDefBuilder::Options> opts(
-        new GraphDefBuilder::Options(b_->opts()));
-    // TODO(srbs|mrry): Not all datasets have output_types and output_shapes
-    // attributes defined. It will be nice to have a consistent pattern.
-    bool has_output_types_attr = HasAttr(op_type_name, "output_types");
-    bool has_output_shapes_attr = HasAttr(op_type_name, "output_shapes");
-    if (has_output_shapes_attr) {
-      opts.reset(new GraphDefBuilder::Options(
-          opts->WithAttr("output_shapes", dataset->output_shapes())));
-    }
-    if (has_output_types_attr) {
-      opts.reset(new GraphDefBuilder::Options(
-          opts->WithAttr("output_types", dataset->output_dtypes())));
-    }
-    for (auto attr : attrs) {
-      opts.reset(new GraphDefBuilder::Options(
-          opts->WithAttr(attr.first, attr.second)));
-    }
-    if (opts->HaveError()) {
-      return errors::Internal("AddDataset: Failed to build Options with error ",
-                              opts->StatusToString());
-    }
-    NodeBuilder node_builder(opts->GetNameForOp(op_type_name), op_type_name,
-                             opts->op_registry());
-    {
-      size_t total_size = inputs.size() + list_inputs.size();
-      auto inputs_iter = inputs.begin();
-      auto list_inputs_iter = list_inputs.begin();
-      for (int i = 0; i < total_size; i++) {
-        if (inputs_iter != inputs.end() && inputs_iter->first == i) {
-          node_builder.Input(inputs_iter->second);
-          inputs_iter++;
-        } else if (list_inputs_iter != list_inputs.end() &&
-                   list_inputs_iter->first == i) {
-          node_builder.Input(list_inputs_iter->second);
-          list_inputs_iter++;
-        } else {
-          return errors::InvalidArgument("No input found for index ", i);
-        }
-      }
-    }
-    *output = opts->FinalizeBuilder(&node_builder);
-    if (*output == nullptr) {
-      return errors::Internal("AddDataset: Failed to build ", op_type_name,
-                              " op with error ", opts->StatusToString());
-    }
-    return Status::OK();
-  }
-
-  // Adds a user-defined function with name `function_name` to the graph and
-  // recursively adds all functions it references. If a function with a matching
-  // name has already been added, returns with OK status. If a user-defined with
-  // name `function_name` is not found in the FunctionLibraryDefinition, returns
-  // an InvalidArgumentError. If the function with name `function_name` or any
-  // of its dependent functions are stateful, returns an InvalidArgument error.
-  Status AddFunction(OpKernelContext* ctx, const string& function_name) {
-    if (b_->HasFunction(function_name)) {
-      LOG(INFO) << "Function with name " << function_name << "already exists in"
-                << " the graph. It will not be added again.";
-      return Status::OK();
-    }
-    TF_RETURN_IF_ERROR(EnsureFunctionIsStateless(ctx, function_name));
-    const FunctionLibraryDefinition* flib_def =
-        ctx->function_library()->GetFunctionLibraryDefinition();
-    const FunctionDef* f_def = flib_def->Find(function_name);
-    if (f_def == nullptr) {
-      return errors::InvalidArgument("Unable to find FunctionDef for ",
-                                     function_name, " in the registry.");
-    }
-    FunctionDefLibrary def;
-    *def.add_function() = *f_def;
-    const string gradient_func = flib_def->FindGradient(function_name);
-    if (!gradient_func.empty()) {
-      GradientDef* g_def = def.add_gradient();
-      g_def->set_function_name(function_name);
-      g_def->set_gradient_func(gradient_func);
-    }
-    TF_RETURN_IF_ERROR(b_->AddFunctionLibrary(def));
-
-    // Recursively add functions in inputs of function_name.
-    for (const NodeDef& node_def : f_def->node_def()) {
-      const OpRegistrationData* op_reg_data = nullptr;
-      TF_RETURN_IF_ERROR(flib_def->LookUp(node_def.op(), &op_reg_data));
-      if (op_reg_data->is_function_op) {
-        TF_RETURN_IF_ERROR(AddFunction(ctx, op_reg_data->op_def.name()));
-      }
-      // Recursively add functions in attrs of this NodeDef.
-      for (const auto& pair : node_def.attr()) {
-        TF_RETURN_IF_ERROR(AddAttrFunctions(pair.second, ctx));
-      }
-    }
-
-    // Recursively add functions in attrs of function_name.
-    for (auto iter = f_def->attr().begin(); iter != f_def->attr().end();
-         iter++) {
-      TF_RETURN_IF_ERROR(AddAttrFunctions(iter->second, ctx));
-    }
-    return Status::OK();
-  }
-
-  template <typename T>
-  void BuildAttrValue(const T& value, AttrValue* attr) {
-    SetAttrValue(value, attr);
-  }
-
- private:
-  void AddTensorInternal(const Tensor& val, Node** output) {
-    *output = ops::SourceOp(
-        "Const",
-        b_->opts().WithAttr("dtype", val.dtype()).WithAttr("value", val));
-  }
-
-  Status EnsureFunctionIsStateless(OpKernelContext* ctx,
-                                   const string& function_name) const {
-    const FunctionLibraryDefinition* lib_def =
-        ctx->function_library()->GetFunctionLibraryDefinition();
-    const FunctionDef* function_def = lib_def->Find(function_name);
-    if (!function_def) {
-      return errors::InvalidArgument("Unable to find FunctionDef for ",
-                                     function_name, " in registry.");
-    }
-    for (const NodeDef& node_def : function_def->node_def()) {
-      const OpDef* op_def;
-      TF_RETURN_IF_ERROR(lib_def->LookUpOpDef(node_def.op(), &op_def));
-      // TODO(b/65524810): Hack to allow functions to capture Dataset op
-      // nodes needed for FlatMap. Currently, source datasets nodes have been
-      // marked stateful to avoid constant folding since we do not have a
-      // good way of serializing them.
-      if (IsOpWhitelisted(op_def)) {
-        continue;
-      }
-      if (op_def->is_stateful()) {
-        return errors::InvalidArgument(
-            "Op[name: ", node_def.name(), ", type: ", node_def.op(), "] ",
-            "in function ", function_name, " is stateful. ",
-            "Saving stateful functions is not supported yet.");
-      }
-    }
-    return Status::OK();
-  }
-
-  bool IsOpWhitelisted(const OpDef* op_def) const {
-    return StringPiece(op_def->name()).ends_with("Dataset") &&
-           HasAttr(op_def, "output_shapes");
-  }
-
-  bool HasAttr(const string& op_type_name, const string& attr_name) const {
-    const OpDef* op_def = nullptr;
-    Status s = b_->opts().op_registry()->LookUpOpDef(op_type_name, &op_def);
-    if (!s.ok() || op_def == nullptr) {
-      return false;
-    }
-    return HasAttr(op_def, attr_name);
-  }
-
-  bool HasAttr(const OpDef* op_def, const string& attr_name) const {
-    for (auto attr : op_def->attr()) {
-      if (attr.name() == attr_name) {
-        return true;
-      }
-    }
-    return false;
-  }
-
-  Status AddAttrFunctions(const AttrValue& attr_value, OpKernelContext* ctx) {
-    if (attr_value.has_func()) {
-      TF_RETURN_IF_ERROR(AddFunction(ctx, attr_value.func().name()));
-    } else if (attr_value.has_list()) {
-      for (const NameAttrList& name_attr_list : attr_value.list().func()) {
-        TF_RETURN_IF_ERROR(AddFunction(ctx, name_attr_list.name()));
-      }
-    }
-    return Status::OK();
-  }
-
-  GraphDefBuilder* b_;
-};
-
-class StatsAggregator;
-
-// A cut-down version of OpKernelContext for running computations in
-// iterators. Note that we cannot simply use OpKernelContext here
-// because we might run computation in an iterator whose lifetime is
-// not nested within the lifetime of a single OpKernelContext
-// (e.g. asynchronous prefetching).
-//
-// TODO(mrry): We will probably need to support more of
-// OpKernelContext here. For example, should allocation be handled by
-// the IteratorContext?
-// TODO(mrry): We're making some daring assumptions about the lifetime
-// of the runner passed in here. A runner will be deleted when the original
-// step ends, but all existing runners only close over session-lifetime (or
-// longer-lived) state, so we can make a copy of the function. There's nothing
-// in the definition of the API from which we took the runner to guarantee that
-// what we are doing is safe. We should formalize the properties here.
-class IteratorContext {
- public:
-  struct Params {
-    // Interface to operating system functionality.
-    Env* env;
-
-    // Function call support.
-    std::function<void(std::function<void()>)> runner = nullptr;
-
-    // A function that returns the current `StatsAggregator` instance to be
-    // used when recording statistics about the iterator.
-    //
-    // NOTE(mrry): This is somewhat awkward, because (i) the `StatsAggregator`
-    // is a property of the `IteratorResource` (which this class does not know
-    // about), and (ii) it can change after the `IteratorContext` has been
-    // created. Better suggestions are welcome!
-    std::function<std::shared_ptr<StatsAggregator>()> stats_aggregator_getter =
-        nullptr;
-  };
-
-  explicit IteratorContext(Params params) : params_(std::move(params)) {}
-
-  Env* env() const { return params_.env; }
-
-  std::function<void(std::function<void()>)>* runner() {
-    return &params_.runner;
-  }
-
-  std::shared_ptr<StatsAggregator> stats_aggregator() {
-    if (params_.stats_aggregator_getter) {
-      return params_.stats_aggregator_getter();
-    } else {
-      return nullptr;
-    }
-  }
-
- private:
-  Params params_;
-};
-
-// Represents the current position in a range of outputs, where the
-// range of outputs is typically represented by an `DatasetBase`,
-// defined below.
-class IteratorBase {
- public:
-  virtual ~IteratorBase() {}
-
-  // Gets the next output from the range that this iterator is traversing.
-  //
-  // If at least one output remains in this iterator's range, that
-  // output will be stored in `*out_tensors` and `false` will be
-  // stored in `*end_of_sequence`.
-  //
-  // If no more outputs remain in this iterator's range, `true` will
-  // be stored in `*end_of_sequence`, and the content of
-  // `*out_tensors` will be undefined.
-  //
-  // This method is thread-safe.
-  //
-  // TODO(mrry): Define `GetNextAsync()` or `GetNextManyAsync()`, and
-  // potentially remove this method.
-  virtual Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
-                         bool* end_of_sequence) = 0;
-
-  // Returns a vector of DataType values, representing the respective
-  // element types of each tuple component in the outputs of this
-  // iterator.
-  virtual const DataTypeVector& output_dtypes() const = 0;
-
-  // Returns a vector of tensor shapes, representing the respective
-  // (and possibly partially defined) shapes of each tuple component
-  // in the outputs of this iterator.
-  virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
-
-  // Saves the state of this iterator.
-  virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) {
-    return SaveInternal(writer);
-  }
-
-  // Restores the state of this iterator.
-  virtual Status Restore(OpKernelContext* ctx, IteratorStateReader* reader) {
-    return RestoreInternal(ctx, reader);
-  }
-
- protected:
-  // This is needed so that sub-classes of IteratorBase can call
-  // `SaveInternal` on their parent iterators, e.g., in
-  // `RepeatDataasetOp::Dataset`.
-  Status SaveParent(IteratorStateWriter* writer,
-                    const std::unique_ptr<IteratorBase>& parent) {
-    return parent->SaveInternal(writer);
-  }
-
-  // This is needed so that sub-classes of IteratorBase can call
-  // `RestoreInternal` on their parent iterators, e.g., in
-  // `RepeatDataasetOp::Dataset`.
-  Status RestoreParent(OpKernelContext* ctx, IteratorStateReader* reader,
-                       const std::unique_ptr<IteratorBase>& parent) {
-    return parent->RestoreInternal(ctx, reader);
-  }
-
-  // Saves the state of this iterator recursively.
-  virtual Status SaveInternal(IteratorStateWriter* writer) {
-    return errors::Unimplemented("SaveInternal");
-  }
-
-  // Restores the state of this iterator recursively.
-  virtual Status RestoreInternal(OpKernelContext* ctx,
-                                 IteratorStateReader* reader) {
-    return errors::Unimplemented("RestoreInternal");
-  }
-};
-
-// Represents a (potentially infinite) range of outputs, where each
-// output is a tuple of tensors.
-class DatasetBase : public core::RefCounted {
- public:
-  // Returns a new iterator for iterating over the range of elements in
-  // this dataset.
-  //
-  // This method may be called multiple times on the same instance,
-  // and the resulting iterators will have distinct state. Each
-  // iterator will traverse all elements in this dataset from the
-  // start.
-  //
-  // Ownership of the created iterator will be transferred to the caller.
-  //
-  // The prefix identifies the sequence of iterators leading up to the newly
-  // created iterator.
-  virtual std::unique_ptr<IteratorBase> MakeIterator(
-      const string& prefix) const = 0;
-
-  // Returns a vector of DataType values, representing the respective
-  // element types of each tuple component in the outputs of this
-  // dataset.
-  virtual const DataTypeVector& output_dtypes() const = 0;
-
-  // Returns a vector of tensor shapes, representing the respective
-  // (and possibly partially defined) shapes of each tuple component
-  // in the outputs of this dataset.
-  virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
-
-  // A human-readable debug string for this dataset.
-  virtual string DebugString() = 0;
-
-  // Serializes the dataset and writes it to the `writer`.
-  virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) const {
-    return errors::Unimplemented("DatasetBase::Save");
-  }
-
- protected:
-  // TODO(srbs): Ideally all graph related logic should reside in
-  // GraphDatasetBase. However, that would require Datasets defined in all ops
-  // to derive from GraphDatasetBase. Once that is done we can move
-  // DatasetGraphDefBuilder and AsGraphDefInternal to GraphDatasetBase.
-  class DatasetGraphDefBuilder : public GraphDefBuilderWrapper {
-   public:
-    DatasetGraphDefBuilder(GraphDefBuilder* b) : GraphDefBuilderWrapper(b) {}
-    Status AddParentDataset(OpKernelContext* ctx, const DatasetBase* dataset,
-                            Node** output) {
-      return dataset->AsGraphDefInternal(ctx, this, output);
-    }
-  };
-
-  virtual Status AsGraphDefInternal(OpKernelContext* ctx,
-                                    DatasetGraphDefBuilder* b,
-                                    Node** node) const {
-    return AsGraphDefInternal(b, node);
-  }
-
-  virtual Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
-                                    Node** node) const {
-    return errors::Unimplemented("AsGraphDefInternal");
-  }
-};
-
-// Base-class for datasets that are built by ops.
-class GraphDatasetBase : public DatasetBase {
- public:
-  GraphDatasetBase(OpKernelContext* ctx)
-      : op_name_(ctx->op_kernel().type_string()) {}
-
-  const string op_name() const { return op_name_; }
-
-  Status Save(OpKernelContext* ctx,
-              IteratorStateWriter* writer) const override {
-    string serialized_graph_def;
-    string output_node;
-    TF_RETURN_IF_ERROR(Serialize(ctx, &serialized_graph_def, &output_node));
-    TF_RETURN_IF_ERROR(
-        writer->WriteScalar(kDatasetGraphKey, serialized_graph_def));
-    TF_RETURN_IF_ERROR(
-        writer->WriteScalar(kDatasetGraphOutputNodeKey, output_node));
-    return Status::OK();
-  }
-
-  // Key for storing the Dataset graph in the serialized format.
-  static const char kDatasetGraphKey[];
-
-  // Key for storing the output node of the Dataset graph in the serialized
-  // format.
-  static const char kDatasetGraphOutputNodeKey[];
-
- private:
-  Status Serialize(OpKernelContext* ctx, string* serialized_graph_def,
-                   string* output_node) const {
-    GraphDefBuilder b;
-    DatasetGraphDefBuilder db(&b);
-    Node* node = nullptr;
-    TF_RETURN_IF_ERROR(AsGraphDefInternal(ctx, &db, &node));
-    *output_node = node->name();
-    GraphDef graph_def;
-    TF_RETURN_IF_ERROR(b.ToGraphDef(&graph_def));
-    graph_def.SerializeToString(serialized_graph_def);
-    return Status::OK();
-  }
-
-  const string op_name_;
-};
-
-// Represents an iterator that is associated with a particular parent dataset.
-template <class DatasetType>
-class DatasetIterator : public IteratorBase {
- public:
-  struct Params {
-    // Owns one reference on the shared dataset resource.
-    const DatasetType* dataset;
-
-    // Identifies the sequence of iterators leading up to this iterator.
-    const string prefix;
-  };
-
-  explicit DatasetIterator(const Params& params) : params_(params) {
-    params_.dataset->Ref();
-  }
-
-  ~DatasetIterator() override { params_.dataset->Unref(); }
-
-  // The dataset from which this iterator was created.
-  const DatasetType* dataset() const { return params_.dataset; }
-
-  // The sequence of iterators leading up to this iterator.
-  const string prefix() const { return params_.prefix; }
-
-  const DataTypeVector& output_dtypes() const override {
-    return params_.dataset->output_dtypes();
-  }
-
-  const std::vector<PartialTensorShape>& output_shapes() const override {
-    return params_.dataset->output_shapes();
-  }
-
-  Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
-                 bool* end_of_sequence) final {
-    port::Tracing::TraceMe activity(params_.prefix);
-    return GetNextInternal(ctx, out_tensors, end_of_sequence);
-  }
-
-  Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) final {
-    TF_RETURN_IF_ERROR(dataset()->Save(ctx, writer));
-    return IteratorBase::Save(ctx, writer);
-  }
-
- protected:
-  // Internal implementation of GetNext that is wrapped in tracing logic.
-  virtual Status GetNextInternal(IteratorContext* ctx,
-                                 std::vector<Tensor>* out_tensors,
-                                 bool* end_of_sequence) = 0;
-
-  string full_name(const string& name) const {
-    return strings::StrCat(prefix(), ":", name);
-  }
-
- private:
-  Params params_;
-};
-
-// Encapsulates the work required to plug a DatasetBase into the core TensorFlow
-// graph execution engine.
-class DatasetOpKernel : public OpKernel {
- public:
-  DatasetOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-  void Compute(OpKernelContext* ctx) final;
-
- protected:
-  // Subclasses should implement this method. It will be called during Compute
-  // execution.
-  virtual void MakeDataset(OpKernelContext* ctx, DatasetBase** output) = 0;
-
-  template <typename T>
-  Status ParseScalarArgument(OpKernelContext* ctx,
-                             const StringPiece& argument_name, T* output) {
-    const Tensor* argument_t;
-    TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
-    if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
-      return errors::InvalidArgument(argument_name, " must be a scalar");
-    }
-    *output = argument_t->scalar<T>()();
-    return Status::OK();
-  }
-};
-
-// Encapsulates the work required to plug unary Datasets into the core
-// TensorFlow graph execution engine.
-class UnaryDatasetOpKernel : public DatasetOpKernel {
- public:
-  UnaryDatasetOpKernel(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {}
-
- protected:
-  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final;
-  virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
-                           DatasetBase** output) = 0;
-};
-
-// Encapsulates the work required to plug binary Datasets into the core
-// TensorFlow graph execution engine.
-class BinaryDatasetOpKernel : public DatasetOpKernel {
- public:
-  BinaryDatasetOpKernel(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {}
-
- protected:
-  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) final;
-  virtual void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
-                           DatasetBase* another_input,
-                           DatasetBase** output) = 0;
-};
-
-// Validates and extracts a `DatasetBase` object from `tensor`.
-//
-// `tensor` must have been written by a call to SetVariantTensorToDataset().
-//
-// The retrieved pointer is a borrowed reference to the dataset, which is owned
-// by the tensor. The consumer must either acquire its own reference to the
-// dataset by calling `(*out_dataset)->Ref()`, or ensure that `tensor` is not
-// destroyed or mutated while the retrieved pointer is in use.
-Status GetDatasetFromVariantTensor(const Tensor& tensor,
-                                   DatasetBase** out_dataset);
-
-// Stores a `DatasetBase` object in `tensor`.
-//
-// The ownership of `dataset` is transferred to `tensor`.
-Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor);
-
-}  // namespace tensorflow
-
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATASET_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATASET_H_
diff --git a/tensorflow/core/kernels/debug_ops.cc b/tensorflow/core/kernels/debug_ops.cc
index 965a60c7e05297d7aa7125bfcb7eed062af7a058..1b94ea05440516ff458c1785edd27589d18ffe61 100644
--- a/tensorflow/core/kernels/debug_ops.cc
+++ b/tensorflow/core/kernels/debug_ops.cc
@@ -46,7 +46,7 @@ REGISTER_KERNEL_BUILDER(Name("CopyHost")
                             .HostMemory("input")
                             .HostMemory("output"),
                         CopyOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Register debug identity (non-ref and ref) ops.
 REGISTER_KERNEL_BUILDER(Name("DebugIdentity").Device(DEVICE_CPU),
@@ -66,7 +66,7 @@ REGISTER_KERNEL_BUILDER(Name("DebugIdentity")
                             .HostMemory("input")
                             .HostMemory("output"),
                         DebugIdentityOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Register debug NaN-counter (non-ref and ref) ops.
 #define REGISTER_DEBUG_NAN_COUNT(type)                                    \
@@ -98,7 +98,7 @@ REGISTER_GPU_DEBUG_NAN_COUNT(double);
                           DebugNanCountOp<type>);
 REGISTER_GPU_DEBUG_NAN_COUNT(float);
 REGISTER_GPU_DEBUG_NAN_COUNT(double);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Register debug numeric summary ops.
 #define REGISTER_DEBUG_NUMERIC_SUMMARY_COUNT(type)        \
diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h
index 381add3fb3bd57ebf068212cdd32a640bf60dd9b..53a23b130609f8b1f4d2dd9f7665d02154f47364 100644
--- a/tensorflow/core/kernels/debug_ops.h
+++ b/tensorflow/core/kernels/debug_ops.h
@@ -21,7 +21,7 @@ limitations under the License.
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 #include "tensorflow/core/common_runtime/sycl/sycl_util.h"
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 #include "tensorflow/core/debug/debug_io_utils.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -91,7 +91,7 @@ class CopyOp : public OpKernel {
       Device* device = static_cast<Device*>(context->device());
       // Determine if the input tensor is not on CPU (e.g., on GPU).
       const bool off_host_input = device->device_type() == DEVICE_SYCL &&
-                            !context->input_alloc_attr(0).on_host();
+                                  !context->input_alloc_attr(0).on_host();
 
       if (off_host_input) {
         SYCLmemcpy(context->eigen_sycl_device(), src_tensor, copied_tensor);
diff --git a/tensorflow/core/kernels/decode_bmp_op.cc b/tensorflow/core/kernels/decode_bmp_op.cc
index c778278e8fbbec67a0255ea7d257c19da4f3612f..b7d120a617849b2c1a48b38b959f9941eb8503ac 100644
--- a/tensorflow/core/kernels/decode_bmp_op.cc
+++ b/tensorflow/core/kernels/decode_bmp_op.cc
@@ -39,6 +39,13 @@ class DecodeBmpOp : public OpKernel {
         errors::InvalidArgument("channels must be 0, 1, 3 or 4, got ",
                                 channels_));
   }
+  inline int32 ByteSwapInt32ForBigEndian(int32 x) {
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return le32toh(x);
+#else
+    return x;
+#endif
+  }
 
   void Compute(OpKernelContext* context) override {
     const Tensor& contents = context->input(0);
@@ -56,14 +63,18 @@ class DecodeBmpOp : public OpKernel {
                                         input.size(), " bytes"));
 
     const uint8* img_bytes = reinterpret_cast<const uint8*>(input.data());
-    const int32 header_size = internal::SubtleMustCopy(
+    int32 header_size_ = internal::SubtleMustCopy(
         *(reinterpret_cast<const int32*>(img_bytes + 10)));
-    const int32 width = internal::SubtleMustCopy(
+    const int32 header_size = ByteSwapInt32ForBigEndian(header_size_);
+    int32 width_ = internal::SubtleMustCopy(
         *(reinterpret_cast<const int32*>(img_bytes + 18)));
-    const int32 height = internal::SubtleMustCopy(
+    const int32 width = ByteSwapInt32ForBigEndian(width_);
+    int32 height_ = internal::SubtleMustCopy(
         *(reinterpret_cast<const int32*>(img_bytes + 22)));
-    const int32 bpp = internal::SubtleMustCopy(
+    const int32 height = ByteSwapInt32ForBigEndian(height_);
+    int32 bpp_ = internal::SubtleMustCopy(
         *(reinterpret_cast<const int32*>(img_bytes + 28)));
+    const int32 bpp = ByteSwapInt32ForBigEndian(bpp_);
 
     if (channels_) {
       OP_REQUIRES(context, (channels_ == bpp / 8),
diff --git a/tensorflow/core/kernels/decode_compressed_op.cc b/tensorflow/core/kernels/decode_compressed_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3c3d49e1f8f187c6d6a880c386b2348246117faa
--- /dev/null
+++ b/tensorflow/core/kernels/decode_compressed_op.cc
@@ -0,0 +1,125 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/parse_ops.cc.
+
+#include <algorithm>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/io/zlib_compression_options.h"
+#include "tensorflow/core/lib/io/zlib_inputstream.h"
+
+namespace tensorflow {
+namespace {
+// Wrap memory buffer into InputStreamInterface
+class MemoryInputStream : public io::InputStreamInterface {
+ public:
+  explicit MemoryInputStream(const char* buffer, size_t length)
+      : buf_(buffer), len_(length), pos_(0) {}
+
+  ~MemoryInputStream() override {}
+
+  Status ReadNBytes(int64 bytes_to_read, string* result) override {
+    result->clear();
+    if (bytes_to_read < 0) {
+      return errors::InvalidArgument("Can't read a negative number of bytes: ",
+                                     bytes_to_read);
+    }
+    int64 bytes = bytes_to_read;
+    Status s = Status::OK();
+    if (pos_ + bytes_to_read > len_) {
+      bytes = len_ - pos_;
+      s = errors::OutOfRange("reached end of file");
+    }
+    if (bytes > 0) {
+      result->resize(bytes);
+      memcpy(&(*result)[0], &buf_[pos_], bytes);
+      pos_ += bytes;
+    }
+    return s;
+  }
+
+  int64 Tell() const override { return pos_; }
+
+  Status Reset() override {
+    pos_ = 0;
+    return Status::OK();
+  }
+
+ private:
+  const char* buf_;  // Not owned.
+  int64 len_;
+  int64 pos_ = 0;  // Tracks where we are in the file.
+};
+}  // namespace
+
+class DecodeCompressedOp : public OpKernel {
+ public:
+  explicit DecodeCompressedOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("compression_type", &compression_type_));
+    OP_REQUIRES(context,
+                (compression_type_.empty() || compression_type_ == "ZLIB" ||
+                 compression_type_ == "GZIP"),
+                errors::InvalidArgument(
+                    "Only ZLIB, GZIP or NONE are supported compressions"));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor* bytes_tensor;
+    OP_REQUIRES_OK(context, context->input("bytes", &bytes_tensor));
+    const auto& bytes_flat = bytes_tensor->flat<string>();
+
+    Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output("output", bytes_tensor->shape(),
+                                            &output_tensor));
+    auto output_flat = output_tensor->flat<string>();
+    if (compression_type_.empty()) {
+      for (int64 i = 0; i < bytes_flat.size(); i++) {
+        output_flat(i) = bytes_flat(i);
+      }
+    } else {
+      const io::ZlibCompressionOptions zlib_options =
+          compression_type_ == "ZLIB" ? io::ZlibCompressionOptions::DEFAULT()
+                                      : io::ZlibCompressionOptions::GZIP();
+      for (int64 i = 0; i < bytes_flat.size(); i++) {
+        std::unique_ptr<MemoryInputStream> input_stream(
+            new MemoryInputStream(bytes_flat(i).data(), bytes_flat(i).size()));
+        std::unique_ptr<io::ZlibInputStream> zlib_stream(
+            new io::ZlibInputStream(
+                input_stream.get(), static_cast<size_t>(kBufferSize),
+                static_cast<size_t>(kBufferSize), zlib_options));
+        string output_string;
+        Status s = zlib_stream->ReadNBytes(INT_MAX, &output_string);
+        OP_REQUIRES(context, (s.ok() || errors::IsOutOfRange(s)), s);
+        output_flat(i) = output_string;
+      }
+    }
+  }
+
+ private:
+  enum { kBufferSize = 256 << 10 /* 256 kB */ };
+  string compression_type_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("DecodeCompressed").Device(DEVICE_CPU),
+                        DecodeCompressedOp)
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_csv_op.cc b/tensorflow/core/kernels/decode_csv_op.cc
index c4555db453ba1549601cbf9a4bbf096fc3db22b2..0c42f632521dd86760e791626c8978c0b1e82709 100644
--- a/tensorflow/core/kernels/decode_csv_op.cc
+++ b/tensorflow/core/kernels/decode_csv_op.cc
@@ -91,9 +91,9 @@ class DecodeCSVOp : public OpKernel {
             } else {
               int32 value;
               OP_REQUIRES(ctx, strings::safe_strto32(fields[f], &value),
-                          errors::InvalidArgument("Field ", f, " in record ", i,
-                                                  " is not a valid int32: ",
-                                                  fields[f]));
+                          errors::InvalidArgument(
+                              "Field ", f, " in record ", i,
+                              " is not a valid int32: ", fields[f]));
               output[f]->flat<int32>()(i) = value;
             }
             break;
@@ -111,9 +111,9 @@ class DecodeCSVOp : public OpKernel {
             } else {
               int64 value;
               OP_REQUIRES(ctx, strings::safe_strto64(fields[f], &value),
-                          errors::InvalidArgument("Field ", f, " in record ", i,
-                                                  " is not a valid int64: ",
-                                                  fields[f]));
+                          errors::InvalidArgument(
+                              "Field ", f, " in record ", i,
+                              " is not a valid int64: ", fields[f]));
               output[f]->flat<int64>()(i) = value;
             }
             break;
@@ -130,9 +130,9 @@ class DecodeCSVOp : public OpKernel {
             } else {
               float value;
               OP_REQUIRES(ctx, strings::safe_strtof(fields[f].c_str(), &value),
-                          errors::InvalidArgument("Field ", f, " in record ", i,
-                                                  " is not a valid float: ",
-                                                  fields[f]));
+                          errors::InvalidArgument(
+                              "Field ", f, " in record ", i,
+                              " is not a valid float: ", fields[f]));
               output[f]->flat<float>()(i) = value;
             }
             break;
@@ -150,9 +150,9 @@ class DecodeCSVOp : public OpKernel {
             } else {
               double value;
               OP_REQUIRES(ctx, strings::safe_strtod(fields[f].c_str(), &value),
-                          errors::InvalidArgument("Field ", f, " in record ", i,
-                                                  " is not a valid double: ",
-                                                  fields[f]));
+                          errors::InvalidArgument(
+                              "Field ", f, " in record ", i,
+                              " is not a valid double: ", fields[f]));
               output[f]->flat<double>()(i) = value;
             }
             break;
@@ -208,9 +208,10 @@ class DecodeCSVOp : public OpKernel {
         if (!quoted) {
           while (static_cast<size_t>(current_idx) < input.size() &&
                  input[current_idx] != delim_) {
-            OP_REQUIRES(ctx, (!use_quote_delim_ || input[current_idx] != '"') &&
-                                 input[current_idx] != '\n' &&
-                                 input[current_idx] != '\r',
+            OP_REQUIRES(ctx,
+                        (!use_quote_delim_ || input[current_idx] != '"') &&
+                            input[current_idx] != '\n' &&
+                            input[current_idx] != '\r',
                         errors::InvalidArgument(
                             "Unquoted fields cannot have quotes/CRLFs inside"));
             field += input[current_idx];
@@ -238,10 +239,11 @@ class DecodeCSVOp : public OpKernel {
           }
 
           OP_REQUIRES(
-              ctx, (static_cast<size_t>(current_idx) < input.size() &&
-                    input[current_idx] == '"' &&
-                    (static_cast<size_t>(current_idx) == input.size() - 1 ||
-                     input[current_idx + 1] == delim_)),
+              ctx,
+              (static_cast<size_t>(current_idx) < input.size() &&
+               input[current_idx] == '"' &&
+               (static_cast<size_t>(current_idx) == input.size() - 1 ||
+                input[current_idx + 1] == delim_)),
               errors::InvalidArgument("Quoted field has to end with quote "
                                       "followed by delim or end"));
 
diff --git a/tensorflow/core/kernels/decode_image_op.cc b/tensorflow/core/kernels/decode_image_op.cc
index ceb152c3f00fe8923429eaa5f8cff026254803a5..912d04c1536600348e8263f03709f2305607d11f 100644
--- a/tensorflow/core/kernels/decode_image_op.cc
+++ b/tensorflow/core/kernels/decode_image_op.cc
@@ -294,6 +294,7 @@ class DecodeImageOp : public OpKernel {
 
     // Decode GIF, allocating tensor once the size is known.
     Tensor* output = nullptr;
+    string error_string;
     OP_REQUIRES(
         context,
         gif::Decode(input.data(), input.size(),
@@ -320,8 +321,10 @@ class DecodeImageOp : public OpKernel {
                         return nullptr;
                       }
                       return output->flat<uint8>().data();
-                    }),
-        errors::InvalidArgument("Invalid GIF data, size ", input.size()));
+                    },
+                    &error_string),
+        errors::InvalidArgument("Invalid GIF data (size ", input.size(), "), ",
+                                error_string));
   }
 
  private:
diff --git a/tensorflow/core/kernels/decode_raw_op.cc b/tensorflow/core/kernels/decode_raw_op.cc
index 1c0085cfeab3498acfe388b6727ad4baa6c6c44e..bacacb94ae4384151bc4282960dd810cbf1299a0 100644
--- a/tensorflow/core/kernels/decode_raw_op.cc
+++ b/tensorflow/core/kernels/decode_raw_op.cc
@@ -51,7 +51,7 @@ class DecodeRawOp : public OpKernel {
     }
     TensorShape out_shape = input.shape();
     if (str_size == -1 || str_size == 0) {  // Empty input
-      out_shape.AddDim(1);
+      out_shape.AddDim(0);
       Tensor* output_tensor = nullptr;
       OP_REQUIRES_OK(context, context->allocate_output("output", out_shape,
                                                        &output_tensor));
diff --git a/tensorflow/core/kernels/decode_wav_op_test.cc b/tensorflow/core/kernels/decode_wav_op_test.cc
index fc323a5e04205b81bc64e2335df4b9fcee5db8b7..84dc649dabacd021ca19b277ad5f271ee12b9745 100644
--- a/tensorflow/core/kernels/decode_wav_op_test.cc
+++ b/tensorflow/core/kernels/decode_wav_op_test.cc
@@ -32,8 +32,8 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
-
-using namespace ops;  // NOLINT(build/namespaces)
+namespace ops {
+namespace {
 
 TEST(DecodeWavOpTest, DecodeWavTest) {
   Scope root = Scope::NewRootScope();
@@ -121,4 +121,6 @@ TEST(DecodeWavOpTest, DecodeWav_ShapeFn) {
   INFER_ERROR("channels must be non-negative, got -2", op, "[]");
 }
 
+}  // namespace
+}  // namespace ops
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/deep_conv2d.cc b/tensorflow/core/kernels/deep_conv2d.cc
index 8e9b8a7e2e7be8e55deeacd4de3f77033499387f..829155fb313bd354d28432be6212af0760630c44 100644
--- a/tensorflow/core/kernels/deep_conv2d.cc
+++ b/tensorflow/core/kernels/deep_conv2d.cc
@@ -120,9 +120,9 @@ bool CanUseDeepConv2D(int stride_rows, int stride_cols, int filter_rows,
 
   VLOG(2) << "CanUseDeepConv2D"
           << " deep_conv_cost: " << deep_conv_cost
-          << " direct_conv_cost: " << direct_conv_cost
-          << " deep_direct_ratio: " << (static_cast<float>(deep_conv_cost) /
-                                        static_cast<float>(direct_conv_cost))
+          << " direct_conv_cost: " << direct_conv_cost << " deep_direct_ratio: "
+          << (static_cast<float>(deep_conv_cost) /
+              static_cast<float>(direct_conv_cost))
           << " use_deep_conv: " << (deep_conv_cost < direct_conv_cost);
   return deep_conv_cost < direct_conv_cost;
 }
diff --git a/tensorflow/core/kernels/deep_conv2d.h b/tensorflow/core/kernels/deep_conv2d.h
index c3f6f66dc9ba6fcf3e29c139eec0030cc7a0be57..17a0230516e27a7121fd632479b9eb8227f83283 100644
--- a/tensorflow/core/kernels/deep_conv2d.h
+++ b/tensorflow/core/kernels/deep_conv2d.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DEEP_CONV2D_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DEEP_CONV2D_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DEEP_CONV2D_H_
+#define TENSORFLOW_CORE_KERNELS_DEEP_CONV2D_H_
 
 #include "tensorflow/core/framework/types.h"
 
@@ -114,4 +114,4 @@ struct DeepConv2D {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DEEP_CONV2D_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DEEP_CONV2D_H_
diff --git a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
index 208401cb24e9c7ebf28e42ccb2762764474a5377..c9c97dc072c93e3ab840a8a9c9d81eadd2adaa3c 100644
--- a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
@@ -62,6 +62,8 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
 #define DEFINE_GPU_KERNELS(T) \
   template struct functor::DenseUpdate<GPUDevice, T, ASSIGN>;
 TF_CALL_GPU_ALL_TYPES(DEFINE_GPU_KERNELS);
+TF_CALL_int32(DEFINE_GPU_KERNELS);
+TF_CALL_int64(DEFINE_GPU_KERNELS);
 #undef DEFINE_GPU_KERNELS
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc
index 6d44a92fa3c2d22ade6293d30b4f008a62eb8e0f..6497c8f3719737ede2d261decd16f01c9854a7eb 100644
--- a/tensorflow/core/kernels/dense_update_ops.cc
+++ b/tensorflow/core/kernels/dense_update_ops.cc
@@ -89,7 +89,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define REGISTER_KERNELS(type)                                     \
   REGISTER_KERNEL_BUILDER(                                         \
@@ -113,14 +113,14 @@ TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS);
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNELS(type)                                \
-REGISTER_KERNEL_BUILDER(                                           \
-    Name("Assign").Device(DEVICE_SYCL).TypeConstraint<type>("T"),  \
-    AssignOpT<SYCLDevice, type>);
+#define REGISTER_SYCL_KERNELS(type)                                 \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("Assign").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+      AssignOpT<SYCLDevice, type>);
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS);
 #undef REGISTER_SYCL_KERNELS
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define REGISTER_KERNELS(type)                                        \
   REGISTER_KERNEL_BUILDER(                                            \
@@ -146,7 +146,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 #endif  // end GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNELS(type)                                         \
+#define REGISTER_SYCL_KERNELS(type)                                    \
   REGISTER_KERNEL_BUILDER(                                             \
       Name("AssignAdd").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
       DenseUpdateOp<SYCLDevice, type, DenseUpdateType::ADD>);          \
@@ -156,5 +156,5 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS);
 #undef REGISTER_SYCL_KERNELS
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
index 9347978d515b9244dde2b50b2fcfaa3c91ab9c94..91a9587174be4c047f8a21ea9222219def42d5f1 100644
--- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
@@ -400,7 +400,7 @@ struct LaunchDepthwiseConvBackpropInputOp<CPUDevice, T> {
 
     // Computes one shard of depthwise conv2d backprop input.
     auto shard = [&ctx, &args, &out_backprop, &filter_data, &in_backprop](
-        int64 start, int64 limit) {
+                     int64 start, int64 limit) {
       static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
 
       const int64 input_image_size =
@@ -750,7 +750,7 @@ struct LaunchDepthwiseConvBackpropFilterOp<CPUDevice, T> {
 
     // Computes one shard of depthwise conv2d backprop filter.
     auto shard = [&ctx, &args, &out_backprop, &input, &output_buffer_data](
-        int64 start, int64 limit) {
+                     int64 start, int64 limit) {
       static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
       const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
       const int64 padded_out_depth_size =
diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc
index 2759ecb2f1157b037b700cc5b4662a35b175c08c..c060b2e14d2f03f990af5267260bd88fa01a2c81 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -308,10 +308,10 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
 
     // in_depth for input and filter must match.
     const int64 in_depth = GetTensorDim(input, data_format_, 'C');
-    OP_REQUIRES(
-        context, in_depth == filter.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                in_depth, " vs ", filter.dim_size(2)));
+    OP_REQUIRES(context, in_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", in_depth,
+                    " vs ", filter.dim_size(2)));
 
     // The last dimension for filter is depth multiplier.
     const int32 depth_multiplier = filter.dim_size(3);
@@ -373,8 +373,11 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
     // If in_depth==1, this operation is just a standard convolution, so
     // invoke that op.
     if (std::is_same<T, float>::value && in_depth == 1) {
+      // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
+      // conv is supported.
       launcher_(context, use_cudnn_, cudnn_use_autotune_, input, filter,
-                stride_, stride_, padding_, output, data_format_);
+                /*row_dilation=*/1, /*col_dilation=*/1, stride_, stride_,
+                padding_, output, data_format_);
       return;
     }
 
@@ -427,9 +430,10 @@ TF_CALL_double(REGISTER_CPU_KERNEL);
 #endif
 
 #if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(
-    Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
-    DepthwiseConv2dNativeOp<GPUDevice, Eigen::half>);
+REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<Eigen::half>("T"),
+                        DepthwiseConv2dNativeOp<GPUDevice, Eigen::half>);
 
 REGISTER_KERNEL_BUILDER(
     Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint<float>("T"),
diff --git a/tensorflow/core/kernels/depthwise_conv_op.h b/tensorflow/core/kernels/depthwise_conv_op.h
index 097a9f5bfad4f1cf0232b0bb31cf6f88fdb5696b..b2d5898891370321f7e97f19f2382eb1d55985f7 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.h
+++ b/tensorflow/core/kernels/depthwise_conv_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_
+#define TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/types.h"
@@ -83,7 +83,7 @@ struct LaunchDepthwiseConvBackpropFilterOp {
 #if GOOGLE_CUDA
 template <typename T>
 struct LaunchDepthwiseConvOp<Eigen::GpuDevice, T> {
-  void operator()(OpKernelContext* ctx, const DepthwiseArgs args,
+  void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
                   const T* input, const T* filter, T* output,
                   TensorFormat data_format);
 };
@@ -284,4 +284,4 @@ struct DepthwiseInputCopyOp {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index 903aac5d68baeb8c37b009a54863a084dcb75147..505d33046ecf1ab676899cf2c22415fddb07bf95 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -17,19 +17,19 @@ limitations under the License.
 #define EIGEN_USE_GPU
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "external/cub_archive/cub/util_ptx.cuh"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/kernels/depthwise_conv_op.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 #include "tensorflow/core/util/tensor_format.h"
-#include "external/cub_archive/cub/util_ptx.cuh"
 
-#if !defined(_MSC_VER)
-#define UNROLL _Pragma("unroll")
-#define NOUNROLL _Pragma("nounroll")
-#else
+#if defined(_MSC_VER) && !defined(__clang__)
 #define UNROLL
 #define NOUNROLL
+#else
+#define UNROLL _Pragma("unroll")
+#define NOUNROLL _Pragma("nounroll")
 #endif
 
 namespace tensorflow {
@@ -39,7 +39,7 @@ using Eigen::GpuDevice;
 // Returns whether depthwise convolution forward or backward input pass can be
 // performed using the faster ('Small') variant of the kernel.
 EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dGPUSmall(
-    const DepthwiseArgs args) {
+    const DepthwiseArgs& args) {
   return args.depth_multiplier == 1 && args.stride == 1 && args.in_rows <= 32 &&
          args.in_cols <= 32 && args.in_rows == args.out_rows &&
          args.in_cols == args.out_cols && args.pad_rows >= 0 &&
@@ -52,13 +52,13 @@ EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dGPUSmall(
 // Returns whether depthwise convolution backward filter pass can be performed
 // using the faster ('Small') variant of the kernel.
 EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(
-    const DepthwiseArgs args, const int block_rows) {
+    const DepthwiseArgs& args, const int block_height) {
   return args.depth_multiplier == 1 && args.stride == 1 && args.in_rows <= 32 &&
          args.in_cols <= 32 && args.in_rows == args.out_rows &&
          args.in_cols == args.out_cols && args.pad_rows >= 0 &&
          args.pad_rows < args.filter_rows && args.pad_cols >= 0 &&
-         args.pad_cols < args.filter_cols && block_rows <= args.in_rows &&
-         args.filter_rows * args.filter_cols <= args.in_cols * block_rows;
+         args.pad_cols < args.filter_cols && block_height <= args.in_rows &&
+         args.filter_rows * args.filter_cols <= args.in_cols * block_height;
 }
 
 // The DepthwiseConv2dGPUKernels perform either forward or backprop input
@@ -72,72 +72,81 @@ template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
 __global__ void __launch_bounds__(1024, 2)
     DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args, const T* input,
                                  const T* filter, T* output, int num_outputs) {
-  const int in_rows = args.in_rows;
-  const int in_cols = args.in_cols;
+  const int in_height = args.in_rows;
+  const int in_width = args.in_cols;
   const int in_depth = args.in_depth;
-  const int filter_rows =
+  const int filter_height =
       kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
-  const int filter_cols =
+  const int filter_width =
       kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
   const int depth_multiplier =
       kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
   const int stride = args.stride;
-  const int pad_rows = args.pad_rows;
-  const int pad_cols = args.pad_cols;
-  const int out_rows = args.out_rows;
-  const int out_cols = args.out_cols;
+  const int pad_height = args.pad_rows;
+  const int pad_width = args.pad_cols;
+  const int out_height = args.out_rows;
+  const int out_width = args.out_cols;
   const int out_depth = args.out_depth;
 
   CUDA_1D_KERNEL_LOOP(thread_id, num_outputs) {
     // Compute the indexes of this thread in the output.
-    const int OD = thread_id % out_depth;
-    const int OC = (thread_id / out_depth) % out_cols;
-    const int OR = (thread_id / out_depth / out_cols) % out_rows;
-    const int OB = thread_id / out_depth / out_cols / out_rows;
+    const int out_channel = thread_id % out_depth;
+    const int out_col = (thread_id / out_depth) % out_width;
+    const int out_row = (thread_id / out_depth / out_width) % out_height;
+    const int batch = thread_id / out_depth / out_width / out_height;
     // Compute the input depth and the index of depth multiplier.
-    const int in_d = OD / depth_multiplier;
-    const int multiplier = OD % depth_multiplier;
+    const int in_channel = out_channel / depth_multiplier;
+    const int multiplier = out_channel % depth_multiplier;
 
     // Decide if all input is valid, if yes, we can skip the boundary checks
     // for each input.
-    const int input_row_start = OR * stride - pad_rows;
-    const int input_col_start = OC * stride - pad_cols;
-    const int input_row_end = input_row_start + filter_rows;
-    const int input_col_end = input_col_start + filter_cols;
+    const int input_row_start = out_row * stride - pad_height;
+    const int input_col_start = out_col * stride - pad_width;
+    const int input_row_end = input_row_start + filter_height;
+    const int input_col_end = input_col_start + filter_width;
 
     T sum = static_cast<T>(0);
 
-    const int input_offset_temp = in_rows * OB;
+    const int input_offset_temp = in_height * batch;
     if (input_row_start >= 0 && input_col_start >= 0 &&
-        input_row_end < in_rows && input_col_end < in_cols) {
-      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
-        const int in_r = input_row_start + f_r;
-        const int filter_offset_temp = filter_cols * f_r;
-        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
-          const int in_c = input_col_start + f_c;
+        input_row_end < in_height && input_col_end < in_width) {
+      UNROLL for (int filter_row = 0; filter_row < filter_height;
+                  ++filter_row) {
+        const int in_row = input_row_start + filter_row;
+        const int filter_offset_temp = filter_width * filter_row;
+        UNROLL for (int filter_col = 0; filter_col < filter_width;
+                    ++filter_col) {
+          const int in_col = input_col_start + filter_col;
 
           const int input_offset =
-              in_d + in_depth * (in_c + in_cols * (in_r + input_offset_temp));
+              in_channel +
+              in_depth * (in_col + in_width * (in_row + input_offset_temp));
           const int filter_offset =
               multiplier +
-              depth_multiplier * (in_d + in_depth * (f_c + filter_offset_temp));
+              depth_multiplier *
+                  (in_channel + in_depth * (filter_col + filter_offset_temp));
           sum += ldg(input + input_offset) * ldg(filter + filter_offset);
         }
       }
     } else {
-      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
-        const int in_r = input_row_start + f_r;
-        const int filter_offset_temp = filter_cols * f_r;
-        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
-          const int in_c = input_col_start + f_c;
-          if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols) {
-            const int in_c = input_col_start + f_c;
+      UNROLL for (int filter_row = 0; filter_row < filter_height;
+                  ++filter_row) {
+        const int in_row = input_row_start + filter_row;
+        const int filter_offset_temp = filter_width * filter_row;
+        UNROLL for (int filter_col = 0; filter_col < filter_width;
+                    ++filter_col) {
+          const int in_col = input_col_start + filter_col;
+          if (in_row >= 0 && in_row < in_height && in_col >= 0 &&
+              in_col < in_width) {
+            const int in_col = input_col_start + filter_col;
 
             const int input_offset =
-                in_d + in_depth * (in_c + in_cols * (in_r + input_offset_temp));
+                in_channel +
+                in_depth * (in_col + in_width * (in_row + input_offset_temp));
             const int filter_offset =
-                multiplier + depth_multiplier *
-                                 (in_d + in_depth * (f_c + filter_offset_temp));
+                multiplier +
+                depth_multiplier *
+                    (in_channel + in_depth * (filter_col + filter_offset_temp));
             sum += ldg(input + input_offset) * ldg(filter + filter_offset);
           }
         }
@@ -157,8 +166,8 @@ __global__ void __launch_bounds__(1024, 2)
 // Backprop input direction is the same as forward direction with the filter
 // rotated by 180°.
 template <typename T, DepthwiseConv2dDirection kDirection,
-          int kKnownFilterWidth, int kKnownFilterHeight, int kBlockSlices,
-          bool kKnownEvenRows>
+          int kKnownFilterWidth, int kKnownFilterHeight, int kBlockDepth,
+          bool kKnownEvenHeight>
 __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
     const DepthwiseArgs args, const T* input, const T* filter, T* output) {
   assert(CanLaunchDepthwiseConv2dGPUSmall(args));
@@ -166,45 +175,45 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
   extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
   T* const shared_data = reinterpret_cast<T*>(shared_memory);
 
-  const int batches = args.batch;
-  const int in_rows = args.in_rows;
-  const int in_cols = args.in_cols;
+  const int num_batches = args.batch;
+  const int in_height = args.in_rows;
+  const int in_width = args.in_cols;
   const int in_depth = args.in_depth;
-  const int filter_rows =
+  const int filter_height =
       kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
-  const int filter_cols =
+  const int filter_width =
       kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
-  const int pad_rows = args.pad_rows;
-  const int pad_cols = args.pad_cols;
+  const int pad_height = args.pad_rows;
+  const int pad_width = args.pad_cols;
 
-  const int block_rows = blockDim.z;
+  const int block_height = blockDim.z;
 
   // These values are the same for all threads and could
   // be precomputed on the CPU.
-  const int block_size = block_rows * in_cols * kBlockSlices;
-  const int in_row_size = in_cols * in_depth;
-  const int in_size = in_rows * in_row_size;
-  const int in_increment = (in_cols - 1) * kBlockSlices;
-  const int filter_pixels = filter_rows * filter_cols;
-  const int tile_cols = in_cols + filter_cols - 1;
-  const int even_rows = kKnownEvenRows || (1 & ~in_rows);
-  const int tile_rows = in_rows + filter_rows - even_rows;
-  const int tile_row_size = tile_cols * kBlockSlices;
-  const int tile_size = tile_rows * tile_row_size;
-  const int tile_offset = block_rows * tile_row_size;
-  const int pad_offset = pad_rows * tile_cols + pad_cols;
-  const int batch_blocks = (in_depth + kBlockSlices - 1) / kBlockSlices;
-  const int in_blocks = batch_blocks * batches;
+  const int block_size = block_height * in_width * kBlockDepth;
+  const int in_row_size = in_width * in_depth;
+  const int in_size = in_height * in_row_size;
+  const int in_increment = (in_width - 1) * kBlockDepth;
+  const int filter_pixels = filter_height * filter_width;
+  const int tile_width = in_width + filter_width - 1;
+  const int even_height = kKnownEvenHeight || (1 & ~in_height);
+  const int tile_height = in_height + filter_height - even_height;
+  const int tile_row_size = tile_width * kBlockDepth;
+  const int tile_size = tile_height * tile_row_size;
+  const int tile_offset = block_height * tile_row_size;
+  const int pad_offset = pad_height * tile_width + pad_width;
+  const int batch_blocks = (in_depth + kBlockDepth - 1) / kBlockDepth;
+  const int in_blocks = batch_blocks * num_batches;
   const int tensor_offset =
-      kKnownEvenRows ? in_size / 2 : block_rows * in_row_size;
+      kKnownEvenHeight ? in_size / 2 : block_height * in_row_size;
 
   const int thread_depth = threadIdx.x;
   const int thread_col = threadIdx.y;
   const int thread_row = threadIdx.z;
 
   // Position in block.
-  const int thread_pix = thread_row * in_cols + thread_col;
-  const int thread_idx = thread_pix * kBlockSlices + thread_depth;
+  const int thread_pix = thread_row * in_width + thread_col;
+  const int thread_idx = thread_pix * kBlockDepth + thread_depth;
 
   // Initialize tile, in particular the padding.
   for (int i = thread_idx; i < tile_size; i += block_size) {
@@ -216,32 +225,32 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
   const int tensor_idx = thread_pix * in_depth + thread_depth;
 
   // Position in (padded) shared memory.
-  const int data_pix = thread_row * tile_cols + thread_col;
-  const int data_idx = data_pix * kBlockSlices + thread_depth;
+  const int data_pix = thread_row * tile_width + thread_col;
+  const int data_idx = data_pix * kBlockDepth + thread_depth;
 
-  // Position in shared memory, offset by pad_rows / pad_cols.
+  // Position in shared memory, offset by pad_height / pad_width.
   const int tile_pix = data_pix + pad_offset;
-  const int tile_idx = tile_pix * kBlockSlices + thread_depth;
+  const int tile_idx = tile_pix * kBlockDepth + thread_depth;
 
-  const int max_depth = in_depth - thread_depth;
+  const int max_channel = in_depth - thread_depth;
   const int filter_write_offset =
       thread_pix < filter_pixels ? tile_size + thread_idx : 0;
   const int filter_read_offset =
       tile_size + thread_depth +
-      (kDirection == DIRECTION_FORWARD ? 0 : filter_pixels * kBlockSlices);
+      (kDirection == DIRECTION_FORWARD ? 0 : filter_pixels * kBlockDepth);
   const bool skip_second =
-      !kKnownEvenRows && thread_row + (in_rows & 1) == block_rows;
+      !kKnownEvenHeight && thread_row + (in_height & 1) == block_height;
 
   for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) {
     const int batch = b / batch_blocks;
-    const int stack = b - batch * batch_blocks;
+    const int block = b - batch * batch_blocks;
 
-    const int start_depth = stack * kBlockSlices;
-    const int filter_offset = tensor_idx + start_depth;
+    const int start_channel = block * kBlockDepth;
+    const int filter_offset = tensor_idx + start_channel;
     const int inout_offset = batch * in_size + filter_offset;
-    const bool depth_in_range = start_depth < max_depth;
+    const bool channel_in_range = start_channel < max_channel;
 
-    if (depth_in_range) {
+    if (channel_in_range) {
       const T* const in_ptr = inout_offset + input;
       T* const tile_ptr = tile_idx + shared_data;
       tile_ptr[0] = ldg(in_ptr);
@@ -257,23 +266,23 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
     // Note: the condition to reach this is uniform across the entire block.
     __syncthreads();
 
-    if (depth_in_range) {
+    if (channel_in_range) {
       T sum1 = static_cast<T>(0);
       T sum2 = static_cast<T>(0);
       int shared_offset = data_idx;
       const T* filter_ptr = filter_read_offset + shared_data;
-      UNROLL for (int r = 0; r < filter_rows; ++r) {
-        UNROLL for (int c = 0; c < filter_cols; ++c) {
+      UNROLL for (int r = 0; r < filter_height; ++r) {
+        UNROLL for (int c = 0; c < filter_width; ++c) {
           if (kDirection == DIRECTION_BACKWARD) {
-            filter_ptr -= kBlockSlices;
+            filter_ptr -= kBlockDepth;
           }
           const T filter_value = *filter_ptr;
           const T* const tile_ptr = shared_offset + shared_data;
           sum1 += filter_value * tile_ptr[0];
           sum2 += filter_value * tile_ptr[tile_offset];
-          shared_offset += kBlockSlices;
+          shared_offset += kBlockDepth;
           if (kDirection == DIRECTION_FORWARD) {
-            filter_ptr += kBlockSlices;
+            filter_ptr += kBlockDepth;
           }
         }
         shared_offset += in_increment;
@@ -297,20 +306,20 @@ template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
 __global__ void __launch_bounds__(1024, 2)
     DepthwiseConv2dGPUKernelNCHW(const DepthwiseArgs args, const T* input,
                                  const T* filter, T* output, int num_outputs) {
-  const int in_rows = args.in_rows;
-  const int in_cols = args.in_cols;
+  const int in_height = args.in_rows;
+  const int in_width = args.in_cols;
   const int in_depth = args.in_depth;
-  const int filter_rows =
+  const int filter_height =
       kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
-  const int filter_cols =
+  const int filter_width =
       kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
   const int depth_multiplier =
       kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
   const int stride = args.stride;
-  const int pad_rows = args.pad_rows;
-  const int pad_cols = args.pad_cols;
-  const int out_rows = args.out_rows;
-  const int out_cols = args.out_cols;
+  const int pad_height = args.pad_rows;
+  const int pad_width = args.pad_cols;
+  const int out_height = args.out_rows;
+  const int out_width = args.out_cols;
   const int out_depth = args.out_depth;
 
   CUDA_1D_KERNEL_LOOP(thread_id, num_outputs) {
@@ -321,16 +330,16 @@ __global__ void __launch_bounds__(1024, 2)
     //
     // THIS IS PROBABLY WRONG, we are not doing coalesced reads
     // into the input, because of the depth multiplier division...
-    const int OC = thread_id % out_cols;
-    const int OR = (thread_id / out_cols) % out_rows;
-    const int OD = (thread_id / out_cols / out_rows) % out_depth;
-    const int OB = thread_id / out_cols / out_rows / out_depth;
+    const int out_col = thread_id % out_width;
+    const int out_row = (thread_id / out_width) % out_height;
+    const int out_channel = (thread_id / out_width / out_height) % out_depth;
+    const int batch = thread_id / out_width / out_height / out_depth;
 
     // Compute the input depth and the index of depth multiplier
     // based off the output depth index that this thread is
     // computing n.
-    const int in_d = OD / depth_multiplier;
-    const int multiplier = OD % depth_multiplier;
+    const int in_channel = out_channel / depth_multiplier;
+    const int multiplier = out_channel % depth_multiplier;
 
     // Data is stored in the following format (let's assume we
     // flatten the height and width into one contiguous dimension
@@ -339,7 +348,7 @@ __global__ void __launch_bounds__(1024, 2)
     // B1C1P1 B1C1P2 ..... B1C2P1 B1C2P2 ....
     // B2C1P1 B2C1P2 ..... B2C2P1 B2C2P2 ....
     //
-    // Each row contains in_depth * in_rows * in_cols values
+    // Each row contains in_depth * in_height * in_width values
     // for each sample in the batch.
     //
     // We can further flatten it into:
@@ -356,7 +365,8 @@ __global__ void __launch_bounds__(1024, 2)
     // patch.
     //
     // We can compute the index into the patch once right here.
-    const int input_offset_temp = (OB * in_depth + in_d) * (in_rows * in_cols);
+    const int input_offset_temp =
+        (batch * in_depth + in_channel) * (in_height * in_width);
 
     // Finally, we can iterate over the spatial dimensions and perform the
     // convolution, writing into the output at the end.
@@ -364,49 +374,56 @@ __global__ void __launch_bounds__(1024, 2)
     // We perform an additional optimization, where we can determine
     // whether the patch fits within the image indices statically, and
     // avoid boundary checking within the loop.
-    const int input_row_start = OR * stride - pad_rows;
-    const int input_col_start = OC * stride - pad_cols;
-    const int input_row_end = input_row_start + filter_rows;
-    const int input_col_end = input_col_start + filter_cols;
+    const int input_row_start = out_row * stride - pad_height;
+    const int input_col_start = out_col * stride - pad_width;
+    const int input_row_end = input_row_start + filter_height;
+    const int input_col_end = input_col_start + filter_width;
 
     T sum = static_cast<T>(0);
     if (input_row_start >= 0 && input_col_start >= 0 &&
-        input_row_end < in_rows && input_col_end < in_cols) {
+        input_row_end < in_height && input_col_end < in_width) {
       // Loop that doesn't need to check for boundary conditions.
-      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
-        const int in_r = input_row_start + f_r;
-        const int filter_offset_temp = filter_cols * f_r;
-        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
-          const int in_c = input_col_start + f_c;
+      UNROLL for (int filter_row = 0; filter_row < filter_height;
+                  ++filter_row) {
+        const int in_row = input_row_start + filter_row;
+        const int filter_offset_temp = filter_width * filter_row;
+        UNROLL for (int filter_col = 0; filter_col < filter_width;
+                    ++filter_col) {
+          const int in_col = input_col_start + filter_col;
 
           const int input_offset =
-              (input_offset_temp) + (in_r * in_cols) + in_c;
+              (input_offset_temp) + (in_row * in_width) + in_col;
           const int filter_offset =
               multiplier +
-              depth_multiplier * (in_d + in_depth * (f_c + filter_offset_temp));
+              depth_multiplier *
+                  (in_channel + in_depth * (filter_col + filter_offset_temp));
           sum += ldg(input + input_offset) * ldg(filter + filter_offset);
         }
       }
     } else {
       // Loop that needs to check for boundary conditions.
-      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
-        const int in_r = input_row_start + f_r;
-        const int filter_offset_temp = filter_cols * f_r;
-        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
-          const int in_c = input_col_start + f_c;
-          // TODO(vrv): the in_r check can be done outside of this loop;
+      UNROLL for (int filter_row = 0; filter_row < filter_height;
+                  ++filter_row) {
+        const int in_row = input_row_start + filter_row;
+        const int filter_offset_temp = filter_width * filter_row;
+        UNROLL for (int filter_col = 0; filter_col < filter_width;
+                    ++filter_col) {
+          const int in_col = input_col_start + filter_col;
+          // TODO(vrv): the in_row check can be done outside of this loop;
           // benchmark both methods to determine the better decision.
-          if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols) {
-            const int in_c = input_col_start + f_c;
+          if (in_row >= 0 && in_row < in_height && in_col >= 0 &&
+              in_col < in_width) {
+            const int in_col = input_col_start + filter_col;
 
             // input_offset_temp indexes into the start of memory
             // where the spatial data starts.
             const int input_offset =
-                (input_offset_temp) + (in_r * in_cols) + in_c;
+                (input_offset_temp) + (in_row * in_width) + in_col;
 
             const int filter_offset =
-                multiplier + depth_multiplier *
-                                 (in_d + in_depth * (f_c + filter_offset_temp));
+                multiplier +
+                depth_multiplier *
+                    (in_channel + in_depth * (filter_col + filter_offset_temp));
             sum += ldg(input + input_offset) * ldg(filter + filter_offset);
           }
         }
@@ -427,8 +444,8 @@ __global__ void __launch_bounds__(1024, 2)
 // Backprop input direction is the same as forward direction with the filter
 // rotated by 180°.
 template <typename T, DepthwiseConv2dDirection kDirection,
-          int kKnownFilterWidth, int kKnownFilterHeight, int kBlockSlices,
-          bool kKnownEvenRows>
+          int kKnownFilterWidth, int kKnownFilterHeight, int kBlockDepth,
+          bool kKnownEvenHeight>
 __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
     const DepthwiseArgs args, const T* input, const T* filter, T* output) {
   assert(CanLaunchDepthwiseConv2dGPUSmall(args));
@@ -436,43 +453,43 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
   extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
   T* const shared_data = reinterpret_cast<T*>(shared_memory);
 
-  const int batches = args.batch;
-  const int in_rows = args.in_rows;
-  const int in_cols = args.in_cols;
+  const int num_batches = args.batch;
+  const int in_height = args.in_rows;
+  const int in_width = args.in_cols;
   const int in_depth = args.in_depth;
-  const int filter_rows =
+  const int filter_height =
       kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
-  const int filter_cols =
+  const int filter_width =
       kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
-  const int pad_rows = args.pad_rows;
-  const int pad_cols = args.pad_cols;
+  const int pad_height = args.pad_rows;
+  const int pad_width = args.pad_cols;
 
   // Fixed blockDim.z, tailored for maximum grid size for images of size 16x16.
-  const int block_rows = blockDim.y;
+  const int block_height = blockDim.y;
 
   // These values are the same for all threads and could
   // be precomputed on the CPU.
-  const int block_pixels = in_cols * block_rows;
-  const int block_size = block_pixels * kBlockSlices;
-  const int in_pixels = in_cols * in_rows;
-  const int in_increment = in_cols - 1;
-  const int filter_pixels = filter_rows * filter_cols;
-  const int tile_cols = in_cols + filter_cols - 1;
-  const int even_rows = kKnownEvenRows || (1 & ~in_rows);
-  const int tile_rows = in_rows + filter_rows - even_rows;
-  const int tile_pixels = tile_cols * tile_rows;
-  const int tile_size = tile_pixels * kBlockSlices;
-  const int tile_offset = block_rows * tile_cols;
-  const int pad_offset = pad_rows * tile_cols + pad_cols;
-  const int in_slices = in_depth * batches;
-  const int in_blocks = (in_slices + kBlockSlices - 1) / kBlockSlices;
+  const int block_pixels = in_width * block_height;
+  const int block_size = block_pixels * kBlockDepth;
+  const int in_pixels = in_width * in_height;
+  const int in_increment = in_width - 1;
+  const int filter_pixels = filter_height * filter_width;
+  const int tile_width = in_width + filter_width - 1;
+  const int even_height = kKnownEvenHeight || (1 & ~in_height);
+  const int tile_height = in_height + filter_height - even_height;
+  const int tile_pixels = tile_width * tile_height;
+  const int tile_size = tile_pixels * kBlockDepth;
+  const int tile_offset = block_height * tile_width;
+  const int pad_offset = pad_height * tile_width + pad_width;
+  const int in_total_depth = in_depth * num_batches;
+  const int in_blocks = (in_total_depth + kBlockDepth - 1) / kBlockDepth;
 
   const int thread_col = threadIdx.x;
   const int thread_row = threadIdx.y;
   const int thread_depth = threadIdx.z;
 
   // Position in block.
-  const int thread_pix = thread_row * in_cols + thread_col;
+  const int thread_pix = thread_row * in_width + thread_col;
   const int thread_idx = thread_depth * block_pixels + thread_pix;
 
   // Initialize tile, in particular the padding.
@@ -485,33 +502,33 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
   const int tensor_idx = thread_depth * in_pixels + thread_pix;
 
   // Position in (padded) shared memory.
-  const int data_pix = thread_row * tile_cols + thread_col;
+  const int data_pix = thread_row * tile_width + thread_col;
   const int data_idx = thread_depth * tile_pixels + data_pix;
 
-  // Position in shared memory, offset by pad_rows / pad_cols.
+  // Position in shared memory, offset by pad_height / pad_width.
   const int tile_idx = data_idx + pad_offset;
 
   // Filter is always in HWCK format, irrespective of the input/output format.
-  const int filter_pix = thread_idx / kBlockSlices;
-  const int filter_depth = thread_idx % kBlockSlices;
+  const int filter_pix = thread_idx / kBlockDepth;
+  const int filter_channel = thread_idx % kBlockDepth;
   const int filter_idx = filter_pix * in_depth;
 
-  const int max_slice = in_slices - thread_depth;
+  const int max_channel = in_total_depth - thread_depth;
   const int filter_write_offset =
       filter_pix < filter_pixels ? tile_size + thread_idx : 0;
   const int filter_read_offset =
       tile_size + thread_depth +
-      (kDirection == DIRECTION_FORWARD ? 0 : filter_pixels * kBlockSlices);
+      (kDirection == DIRECTION_FORWARD ? 0 : filter_pixels * kBlockDepth);
   const bool skip_second =
-      !kKnownEvenRows && thread_row + (in_rows & 1) == block_rows;
+      !kKnownEvenHeight && thread_row + (in_height & 1) == block_height;
 
   for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) {
-    const int slice = b * kBlockSlices;
+    const int channel = b * kBlockDepth;
 
-    const int inout_offset = slice * in_pixels + tensor_idx;
-    const bool slice_in_range = slice < max_slice;
+    const int inout_offset = channel * in_pixels + tensor_idx;
+    const bool channel_in_range = channel < max_channel;
 
-    if (slice_in_range) {
+    if (channel_in_range) {
       const T* const in_ptr = inout_offset + input;
       T* const tile_ptr = tile_idx + shared_data;
       tile_ptr[0] = ldg(in_ptr);
@@ -521,22 +538,23 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
     }
 
     if (filter_write_offset != 0) {
-      const int filter_offset = filter_idx + (slice + filter_depth) % in_depth;
+      const int filter_offset =
+          filter_idx + (channel + filter_channel) % in_depth;
       shared_data[filter_write_offset] = ldg(filter_offset + filter);
     }
 
     // Note: the condition to reach this is uniform across the entire block.
     __syncthreads();
 
-    if (slice_in_range) {
+    if (channel_in_range) {
       T sum1 = static_cast<T>(0);
       T sum2 = static_cast<T>(0);
       int shared_offset = data_idx;
       const T* filter_ptr = filter_read_offset + shared_data;
-      UNROLL for (int r = 0; r < filter_rows; ++r) {
-        UNROLL for (int c = 0; c < filter_cols; ++c) {
+      UNROLL for (int r = 0; r < filter_height; ++r) {
+        UNROLL for (int c = 0; c < filter_width; ++c) {
           if (kDirection == DIRECTION_BACKWARD) {
-            filter_ptr -= kBlockSlices;
+            filter_ptr -= kBlockDepth;
           }
           const T filter_value = *filter_ptr;
           const T* const tile_ptr = shared_offset + shared_data;
@@ -544,7 +562,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
           sum2 += filter_value * tile_ptr[tile_offset];
           ++shared_offset;
           if (kDirection == DIRECTION_FORWARD) {
-            filter_ptr += kBlockSlices;
+            filter_ptr += kBlockDepth;
           }
         }
         shared_offset += in_increment;
@@ -562,86 +580,90 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
 }
 
 template <typename T, DepthwiseConv2dDirection kDirection,
-          int kKnownFilterWidth, int kKnownFilterHeight, int kBlockSlices,
-          bool kKnownEvenRows>
-void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d, const DepthwiseArgs args,
-                                   const T* input, const T* filter, T* output,
+          int kKnownFilterWidth, int kKnownFilterHeight, int kBlockDepth,
+          bool kKnownEvenHeight>
+void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& device,
+                                   const DepthwiseArgs& args, const T* input,
+                                   const T* filter, T* output,
                                    TensorFormat data_format) {
-  const int block_rows = (args.in_rows + 1) / 2;
+  const int block_height = (args.in_rows + 1) / 2;
   dim3 block_dim;
   void (*kernel)(const DepthwiseArgs, const T*, const T*, T*);
   if (data_format == FORMAT_NHWC) {
-    block_dim = dim3(kBlockSlices, args.in_cols, block_rows);
+    block_dim = dim3(kBlockDepth, args.in_cols, block_height);
     kernel = DepthwiseConv2dGPUKernelNHWCSmall<T, kDirection, kKnownFilterWidth,
-                                               kKnownFilterHeight, kBlockSlices,
-                                               kKnownEvenRows>;
+                                               kKnownFilterHeight, kBlockDepth,
+                                               kKnownEvenHeight>;
   } else if (data_format == FORMAT_NCHW) {
-    block_dim = dim3(args.in_cols, block_rows, kBlockSlices);
+    block_dim = dim3(args.in_cols, block_height, kBlockDepth);
     kernel = DepthwiseConv2dGPUKernelNCHWSmall<T, kDirection, kKnownFilterWidth,
-                                               kKnownFilterHeight, kBlockSlices,
-                                               kKnownEvenRows>;
+                                               kKnownFilterHeight, kBlockDepth,
+                                               kKnownEvenHeight>;
   } else {
     assert(false && "Incorrect data format");
     return;
   }
-  const int tile_cols = args.in_cols + args.filter_cols - 1;
-  const int tile_rows = block_rows * 2 + args.filter_rows - 1;
-  const int tile_pixels = tile_rows * tile_cols;
+  const int tile_width = args.in_cols + args.filter_cols - 1;
+  const int tile_height = block_height * 2 + args.filter_rows - 1;
+  const int tile_pixels = tile_height * tile_width;
   const int filter_pixels = args.filter_rows * args.filter_cols;
   const int shared_memory_size =
-      kBlockSlices * (tile_pixels + filter_pixels) * sizeof(T);
+      kBlockDepth * (tile_pixels + filter_pixels) * sizeof(T);
   const int num_outputs =
       args.batch * args.out_rows * args.out_cols * args.out_depth;
   CudaLaunchConfig config =
-      GetCudaLaunchConfig(num_outputs, d, kernel, shared_memory_size,
+      GetCudaLaunchConfig(num_outputs, device, kernel, shared_memory_size,
                           block_dim.x * block_dim.y * block_dim.z);
-  kernel<<<config.block_count, block_dim, shared_memory_size, d.stream()>>>(
-      args, input, filter, output);
+  kernel<<<config.block_count, block_dim, shared_memory_size,
+           device.stream()>>>(args, input, filter, output);
 }
 
 template <typename T, DepthwiseConv2dDirection kDirection,
-          int kKnownFilterWidth, int kKnownFilterHeight, int kBlockSlices>
-void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d, const DepthwiseArgs args,
-                                   const T* input, const T* filter, T* output,
+          int kKnownFilterWidth, int kKnownFilterHeight, int kBlockDepth>
+void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& device,
+                                   const DepthwiseArgs& args, const T* input,
+                                   const T* filter, T* output,
                                    TensorFormat data_format) {
   if (args.in_rows & 1) {
     LaunchDepthwiseConv2dGPUSmall<T, kDirection, kKnownFilterWidth,
-                                  kKnownFilterHeight, kBlockSlices, false>(
-        d, args, input, filter, output, data_format);
+                                  kKnownFilterHeight, kBlockDepth, false>(
+        device, args, input, filter, output, data_format);
   } else {
     LaunchDepthwiseConv2dGPUSmall<T, kDirection, kKnownFilterWidth,
-                                  kKnownFilterHeight, kBlockSlices, true>(
-        d, args, input, filter, output, data_format);
+                                  kKnownFilterHeight, kBlockDepth, true>(
+        device, args, input, filter, output, data_format);
   }
 }
 
 template <typename T, DepthwiseConv2dDirection kDirection,
           int kKnownFilterWidth, int kKnownFilterHeight>
-void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d, const DepthwiseArgs args,
-                                   const T* input, const T* filter, T* output,
+void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& device,
+                                   const DepthwiseArgs& args, const T* input,
+                                   const T* filter, T* output,
                                    TensorFormat data_format) {
-  // Maximize (power of two) kBlockSlices while keeping a block within 1024
+  // Maximize (power of two) kBlockDepth while keeping a block within 1024
   // threads (2 pixels per thread).
   const int block_pixels = (args.in_rows + 1) / 2 * args.in_cols;
   if (block_pixels > 256) {
     LaunchDepthwiseConv2dGPUSmall<T, kDirection, kKnownFilterWidth,
-                                  kKnownFilterHeight, 2>(d, args, input, filter,
-                                                         output, data_format);
+                                  kKnownFilterHeight, 2>(
+        device, args, input, filter, output, data_format);
   } else if (block_pixels > 128) {
     LaunchDepthwiseConv2dGPUSmall<T, kDirection, kKnownFilterWidth,
-                                  kKnownFilterHeight, 4>(d, args, input, filter,
-                                                         output, data_format);
+                                  kKnownFilterHeight, 4>(
+        device, args, input, filter, output, data_format);
   } else {
     LaunchDepthwiseConv2dGPUSmall<T, kDirection, kKnownFilterWidth,
-                                  kKnownFilterHeight, 8>(d, args, input, filter,
-                                                         output, data_format);
+                                  kKnownFilterHeight, 8>(
+        device, args, input, filter, output, data_format);
   }
 }
 
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
           int kKnownDepthMultiplier>
-void LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args,
-                              const T* input, const T* filter, T* output,
+void LaunchDepthwiseConv2dGPU(const GpuDevice& device,
+                              const DepthwiseArgs& args, const T* input,
+                              const T* filter, T* output,
                               TensorFormat data_format) {
   void (*kernel)(const DepthwiseArgs, const T*, const T*, T*, int);
   if (data_format == FORMAT_NHWC) {
@@ -658,50 +680,52 @@ void LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args,
   }
   const int num_outputs =
       args.batch * args.out_rows * args.out_cols * args.out_depth;
-  CudaLaunchConfig config = GetCudaLaunchConfig(num_outputs, d, kernel, 0, 0);
+  CudaLaunchConfig config =
+      GetCudaLaunchConfig(num_outputs, device, kernel, 0, 0);
   // The compile-time constant version runs faster with a single block.
   const int max_block_count = kKnownFilterWidth < 0 || kKnownFilterHeight < 0 ||
                                       kKnownDepthMultiplier < 0
                                   ? std::numeric_limits<int>::max()
-                                  : d.getNumCudaMultiProcessors();
+                                  : device.getNumCudaMultiProcessors();
   kernel<<<std::min(max_block_count, config.block_count),
-           config.thread_per_block, 0, d.stream()>>>(args, input, filter,
-                                                     output, num_outputs);
+           config.thread_per_block, 0, device.stream()>>>(args, input, filter,
+                                                          output, num_outputs);
 }
 
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
-void LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args,
-                              const T* input, const T* filter, T* output,
+void LaunchDepthwiseConv2dGPU(const GpuDevice& device,
+                              const DepthwiseArgs& args, const T* input,
+                              const T* filter, T* output,
                               TensorFormat data_format) {
   if (args.depth_multiplier == 1) {
     if (CanLaunchDepthwiseConv2dGPUSmall(args)) {
       LaunchDepthwiseConv2dGPUSmall<T, DIRECTION_FORWARD, kKnownFilterWidth,
-                                    kKnownFilterHeight>(d, args, input, filter,
-                                                        output, data_format);
+                                    kKnownFilterHeight>(
+          device, args, input, filter, output, data_format);
       return;
     }
 
     LaunchDepthwiseConv2dGPU<T, kKnownFilterWidth, kKnownFilterHeight, 1>(
-        d, args, input, filter, output, data_format);
+        device, args, input, filter, output, data_format);
   } else {
     LaunchDepthwiseConv2dGPU<T, kKnownFilterWidth, kKnownFilterHeight, -1>(
-        d, args, input, filter, output, data_format);
+        device, args, input, filter, output, data_format);
   }
 }
 
 // A simple launch pad to launch the Cuda kernel for depthwise convolution.
 template <typename T>
-void LaunchDepthwiseConvOp<GPUDevice, T>::operator()(OpKernelContext* ctx,
-                                                     const DepthwiseArgs args,
+void LaunchDepthwiseConvOp<GpuDevice, T>::operator()(OpKernelContext* ctx,
+                                                     const DepthwiseArgs& args,
                                                      const T* input,
                                                      const T* filter, T* output,
                                                      TensorFormat data_format) {
-  const GPUDevice& d = ctx->eigen_device<GPUDevice>();
+  const GpuDevice& device = ctx->eigen_device<GpuDevice>();
   if (args.filter_rows == 3 && args.filter_cols == 3) {
-    LaunchDepthwiseConv2dGPU<T, 3, 3>(d, args, input, filter, output,
+    LaunchDepthwiseConv2dGPU<T, 3, 3>(device, args, input, filter, output,
                                       data_format);
   } else {
-    LaunchDepthwiseConv2dGPU<T, -1, -1>(d, args, input, filter, output,
+    LaunchDepthwiseConv2dGPU<T, -1, -1>(device, args, input, filter, output,
                                         data_format);
   }
   auto stream = ctx->op_device_context()->stream();
@@ -710,9 +734,9 @@ void LaunchDepthwiseConvOp<GPUDevice, T>::operator()(OpKernelContext* ctx,
                   "Launch of gpu kernel for DepthwiseConv2dGPULaunch failed"));
 }
 
-template struct LaunchDepthwiseConvOp<GPUDevice, Eigen::half>;
-template struct LaunchDepthwiseConvOp<GPUDevice, float>;
-template struct LaunchDepthwiseConvOp<GPUDevice, double>;
+template struct LaunchDepthwiseConvOp<GpuDevice, Eigen::half>;
+template struct LaunchDepthwiseConvOp<GpuDevice, float>;
+template struct LaunchDepthwiseConvOp<GpuDevice, double>;
 
 // A Cuda kernel to compute the depthwise convolution backprop w.r.t. input.
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
@@ -722,59 +746,65 @@ __global__ void __launch_bounds__(640, 2)
                                               const T* out_backprop,
                                               const T* filter, T* in_backprop,
                                               int num_in_backprop) {
-  const int in_rows = args.in_rows;
-  const int in_cols = args.in_cols;
+  const int in_height = args.in_rows;
+  const int in_width = args.in_cols;
   const int in_depth = args.in_depth;
-  const int filter_rows =
+  const int filter_height =
       kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
-  const int filter_cols =
+  const int filter_width =
       kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
   const int depth_multiplier =
       kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
   const int stride = args.stride;
-  const int pad_rows = args.pad_rows;
-  const int pad_cols = args.pad_cols;
-  const int out_rows = args.out_rows;
-  const int out_cols = args.out_cols;
+  const int pad_height = args.pad_rows;
+  const int pad_width = args.pad_cols;
+  const int out_height = args.out_rows;
+  const int out_width = args.out_cols;
   const int out_depth = args.out_depth;
 
   CUDA_1D_KERNEL_LOOP(thread_id, num_in_backprop) {
     // Compute the indexes of this thread in the output.
-    const int in_d = thread_id % in_depth;
-    const int in_c = (thread_id / in_depth) % in_cols;
-    const int in_r = (thread_id / in_depth / in_cols) % in_rows;
-    const int b = thread_id / in_depth / in_cols / in_rows;
+    const int in_channel = thread_id % in_depth;
+    const int in_col = (thread_id / in_depth) % in_width;
+    const int in_row = (thread_id / in_depth / in_width) % in_height;
+    const int batch = thread_id / in_depth / in_width / in_height;
 
     T sum = static_cast<T>(0);
 
-    const int out_r_start =
-        tf_max<int>(0, (in_r - filter_rows + pad_rows + stride) / stride);
-    const int out_r_end = tf_min(out_rows - 1, (in_r + pad_rows) / stride);
-    const int out_c_start =
-        tf_max(0, (in_c - filter_cols + pad_cols + stride) / stride);
-    const int out_c_end = tf_min(out_cols - 1, (in_c + pad_cols) / stride);
-
-    NOUNROLL for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
-      const int f_r = in_r + pad_rows - out_r * stride;
+    const int out_row_start =
+        tf_max<int>(0, (in_row - filter_height + pad_height + stride) / stride);
+    const int out_row_end =
+        tf_min(out_height - 1, (in_row + pad_height) / stride);
+    const int out_col_start =
+        tf_max(0, (in_col - filter_width + pad_width + stride) / stride);
+    const int out_col_end =
+        tf_min(out_width - 1, (in_col + pad_width) / stride);
+
+    NOUNROLL for (int out_row = out_row_start; out_row <= out_row_end;
+                  ++out_row) {
+      const int filter_row = in_row + pad_height - out_row * stride;
       const int temp_out_backprop_offset =
-          out_depth * out_cols * (out_r + out_rows * b);
-      const int temp_filter_offset = filter_cols * f_r;
-      NOUNROLL for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
-        const int f_c = in_c + pad_cols - out_c * stride;
+          out_depth * out_width * (out_row + out_height * batch);
+      const int temp_filter_offset = filter_width * filter_row;
+      NOUNROLL for (int out_col = out_col_start; out_col <= out_col_end;
+                    ++out_col) {
+        const int filter_col = in_col + pad_width - out_col * stride;
         int filter_offset =
-            depth_multiplier * (in_d + in_depth * (f_c + temp_filter_offset));
+            depth_multiplier *
+            (in_channel + in_depth * (filter_col + temp_filter_offset));
         const int out_backprop_offset =
-            out_depth * out_c + temp_out_backprop_offset;
+            out_depth * out_col + temp_out_backprop_offset;
 #pragma unroll 6
         for (int i = 0; i < depth_multiplier; ++i) {
           sum += ldg(out_backprop + out_backprop_offset +
-                     in_d * depth_multiplier + i) *
+                     in_channel * depth_multiplier + i) *
                  ldg(filter + filter_offset + i);
         }
       }
     }
     const int in_backprop_offset =
-        in_d + in_depth * (in_c + in_cols * (in_r + in_rows * b));
+        in_channel +
+        in_depth * (in_col + in_width * (in_row + in_height * batch));
     in_backprop[in_backprop_offset] = sum;
   }
 }
@@ -786,74 +816,80 @@ __global__ void __launch_bounds__(640, 2)
                                               const T* out_backprop,
                                               const T* filter, T* in_backprop,
                                               int num_in_backprop) {
-  const int in_rows = args.in_rows;
-  const int in_cols = args.in_cols;
+  const int in_height = args.in_rows;
+  const int in_width = args.in_cols;
   const int in_depth = args.in_depth;
-  const int filter_rows =
+  const int filter_height =
       kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
-  const int filter_cols =
+  const int filter_width =
       kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
   const int depth_multiplier =
       kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
   const int stride = args.stride;
-  const int pad_rows = args.pad_rows;
-  const int pad_cols = args.pad_cols;
-  const int out_rows = args.out_rows;
-  const int out_cols = args.out_cols;
+  const int pad_height = args.pad_rows;
+  const int pad_width = args.pad_cols;
+  const int out_height = args.out_rows;
+  const int out_width = args.out_cols;
   const int out_depth = args.out_depth;
 
   // TODO(vrv): Consider assigning threads to output and using
   // atomics for accumulation, similar to the filter case.
   CUDA_1D_KERNEL_LOOP(thread_id, num_in_backprop) {
     // Compute the indexes of this thread in the input.
-    const int in_c = thread_id % in_cols;
-    const int in_r = (thread_id / in_cols) % in_rows;
-    const int in_d = (thread_id / in_cols / in_rows) % in_depth;
-    const int b = thread_id / in_depth / in_cols / in_rows;
+    const int in_col = thread_id % in_width;
+    const int in_row = (thread_id / in_width) % in_height;
+    const int in_channel = (thread_id / in_width / in_height) % in_depth;
+    const int batch = thread_id / in_depth / in_width / in_height;
 
     T sum = static_cast<T>(0);
-    const int out_d_start = in_d * depth_multiplier;
-    const int out_d_end = out_d_start + depth_multiplier;
-
-    const int out_r_start =
-        tf_max<int>(0, (in_r - filter_rows + pad_rows + stride) / stride);
-    const int out_r_end = tf_min(out_rows - 1, (in_r + pad_rows) / stride);
-    const int out_c_start =
-        tf_max(0, (in_c - filter_cols + pad_cols + stride) / stride);
-    const int out_c_end = tf_min(out_cols - 1, (in_c + pad_cols) / stride);
-
-    UNROLL for (int out_d = out_d_start; out_d < out_d_end; ++out_d) {
-      UNROLL for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
-        const int f_r = in_r + pad_rows - out_r * stride;
-        const int filter_dm = out_d - out_d_start;
-
-        const int temp_filter_offset = filter_cols * f_r;
-        for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
-          const int f_c = in_c + pad_cols - out_c * stride;
+    const int out_channel_start = in_channel * depth_multiplier;
+    const int out_channel_end = out_channel_start + depth_multiplier;
+
+    const int out_row_start =
+        tf_max<int>(0, (in_row - filter_height + pad_height + stride) / stride);
+    const int out_row_end =
+        tf_min(out_height - 1, (in_row + pad_height) / stride);
+    const int out_col_start =
+        tf_max(0, (in_col - filter_width + pad_width + stride) / stride);
+    const int out_col_end =
+        tf_min(out_width - 1, (in_col + pad_width) / stride);
+
+    UNROLL for (int out_channel = out_channel_start;
+                out_channel < out_channel_end; ++out_channel) {
+      UNROLL for (int out_row = out_row_start; out_row <= out_row_end;
+                  ++out_row) {
+        const int filter_row = in_row + pad_height - out_row * stride;
+        const int filter_dm = out_channel - out_channel_start;
+
+        const int temp_filter_offset = filter_width * filter_row;
+        for (int out_col = out_col_start; out_col <= out_col_end; ++out_col) {
+          const int filter_col = in_col + pad_width - out_col * stride;
           const int filter_offset =
-              filter_dm + args.depth_multiplier *
-                              (in_d + in_depth * (f_c + temp_filter_offset));
+              filter_dm +
+              args.depth_multiplier *
+                  (in_channel + in_depth * (filter_col + temp_filter_offset));
 
           const int out_backprop_offset =
-              (b * out_depth * out_rows * out_cols) +
-              (out_d * out_rows * out_cols) + (out_r * out_cols) + (out_c);
+              (batch * out_depth * out_height * out_width) +
+              (out_channel * out_height * out_width) + (out_row * out_width) +
+              (out_col);
 
           sum += ldg(out_backprop + out_backprop_offset) *
                  ldg(filter + filter_offset);
         }
       }
     }
-    const int in_backprop_offset = (b * in_rows * in_cols * in_depth) +
-                                   (in_d * in_rows * in_cols) +
-                                   (in_r * in_cols) + (in_c);
+    const int in_backprop_offset = (batch * in_height * in_width * in_depth) +
+                                   (in_channel * in_height * in_width) +
+                                   (in_row * in_width) + (in_col);
     in_backprop[in_backprop_offset] = sum;
   }
 }
 
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
           int kKnownDepthMultiplier>
-void LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& d,
-                                           const DepthwiseArgs args,
+void LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& device,
+                                           const DepthwiseArgs& args,
                                            const T* out_backprop,
                                            const T* filter, T* in_backprop,
                                            TensorFormat data_format) {
@@ -871,14 +907,14 @@ void LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& d,
   const int num_in_backprop =
       args.batch * args.in_rows * args.in_cols * args.in_depth;
   CudaLaunchConfig config =
-      GetCudaLaunchConfig(num_in_backprop, d, kernel, 0, 0);
-  kernel<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+      GetCudaLaunchConfig(num_in_backprop, device, kernel, 0, 0);
+  kernel<<<config.block_count, config.thread_per_block, 0, device.stream()>>>(
       args, out_backprop, filter, in_backprop, num_in_backprop);
 }
 
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
-void LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& d,
-                                           const DepthwiseArgs args,
+void LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& device,
+                                           const DepthwiseArgs& args,
                                            const T* out_backprop,
                                            const T* filter, T* in_backprop,
                                            TensorFormat data_format) {
@@ -886,32 +922,32 @@ void LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& d,
     if (CanLaunchDepthwiseConv2dGPUSmall(args)) {
       LaunchDepthwiseConv2dGPUSmall<T, DIRECTION_BACKWARD, kKnownFilterWidth,
                                     kKnownFilterHeight>(
-          d, args, out_backprop, filter, in_backprop, data_format);
+          device, args, out_backprop, filter, in_backprop, data_format);
       return;
     }
 
     LaunchDepthwiseConv2dBackpropInputGPU<T, kKnownFilterWidth,
                                           kKnownFilterHeight, 1>(
-        d, args, out_backprop, filter, in_backprop, data_format);
+        device, args, out_backprop, filter, in_backprop, data_format);
   } else {
     LaunchDepthwiseConv2dBackpropInputGPU<T, kKnownFilterWidth,
                                           kKnownFilterHeight, -1>(
-        d, args, out_backprop, filter, in_backprop, data_format);
+        device, args, out_backprop, filter, in_backprop, data_format);
   }
 }
 
 // A simple launch pad to launch the Cuda kernel for depthwise convolution.
 template <typename T>
-void LaunchDepthwiseConvBackpropInputOp<GPUDevice, T>::operator()(
+void LaunchDepthwiseConvBackpropInputOp<GpuDevice, T>::operator()(
     OpKernelContext* ctx, const DepthwiseArgs& args, const T* out_backprop,
     const T* filter, T* in_backprop, TensorFormat data_format) {
-  const GPUDevice& d = ctx->eigen_device<GPUDevice>();
+  const GpuDevice& device = ctx->eigen_device<GpuDevice>();
   if (args.filter_rows == 3 && args.filter_cols == 3) {
     LaunchDepthwiseConv2dBackpropInputGPU<T, 3, 3>(
-        d, args, out_backprop, filter, in_backprop, data_format);
+        device, args, out_backprop, filter, in_backprop, data_format);
   } else {
     LaunchDepthwiseConv2dBackpropInputGPU<T, -1, -1>(
-        d, args, out_backprop, filter, in_backprop, data_format);
+        device, args, out_backprop, filter, in_backprop, data_format);
   }
   auto stream = ctx->op_device_context()->stream();
   OP_REQUIRES(ctx, stream->ok(),
@@ -920,9 +956,9 @@ void LaunchDepthwiseConvBackpropInputOp<GPUDevice, T>::operator()(
                                "utGPULaunch failed"));
 }
 
-template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, Eigen::half>;
-template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>;
-template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>;
+template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, Eigen::half>;
+template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, float>;
+template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, double>;
 
 // A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
@@ -933,75 +969,85 @@ __global__ void __launch_bounds__(640, 2)
                                                const T* input,
                                                T* filter_backprop,
                                                int num_out_backprop) {
-  const int in_rows = args.in_rows;
-  const int in_cols = args.in_cols;
+  const int in_height = args.in_rows;
+  const int in_width = args.in_cols;
   const int in_depth = args.in_depth;
-  const int filter_rows =
+  const int filter_height =
       kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
-  const int filter_cols =
+  const int filter_width =
       kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
   const int depth_multiplier =
       kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
   const int stride = args.stride;
-  const int pad_rows = args.pad_rows;
-  const int pad_cols = args.pad_cols;
-  const int out_rows = args.out_rows;
-  const int out_cols = args.out_cols;
+  const int pad_height = args.pad_rows;
+  const int pad_width = args.pad_cols;
+  const int out_height = args.out_rows;
+  const int out_width = args.out_cols;
   const int out_depth = args.out_depth;
 
   CUDA_1D_KERNEL_LOOP(thread_id, num_out_backprop) {
     // Compute the indexes of this thread in the output.
-    const int out_d = thread_id % out_depth;
-    const int out_c = (thread_id / out_depth) % out_cols;
-    const int out_r = (thread_id / out_depth / out_cols) % out_rows;
-    const int b = thread_id / out_depth / out_cols / out_rows;
+    const int out_channel = thread_id % out_depth;
+    const int out_col = (thread_id / out_depth) % out_width;
+    const int out_row = (thread_id / out_depth / out_width) % out_height;
+    const int batch = thread_id / out_depth / out_width / out_height;
     // Compute the input depth and the index of depth multiplier.
-    const int in_d = out_d / depth_multiplier;
-    const int dm = out_d % depth_multiplier;
+    const int in_channel = out_channel / depth_multiplier;
+    const int dm = out_channel % depth_multiplier;
 
     // Decide if all input is valid, if yes, we can skip the boundary checks
     // for each input.
-    const int in_r_start = out_r * stride - pad_rows;
-    const int in_c_start = out_c * stride - pad_cols;
-    const int in_r_end = in_r_start + filter_rows;
-    const int in_c_end = in_c_start + filter_cols;
+    const int in_row_start = out_row * stride - pad_height;
+    const int in_col_start = out_col * stride - pad_width;
+    const int in_row_end = in_row_start + filter_height;
+    const int in_col_end = in_col_start + filter_width;
 
     const int out_backprop_offset =
-        out_d + out_depth * (out_c + out_cols * (out_r + out_rows * b));
+        out_channel +
+        out_depth * (out_col + out_width * (out_row + out_height * batch));
     const T out_bp = ldg(out_backprop + out_backprop_offset);
-    if (in_r_start >= 0 && in_c_start >= 0 && in_r_end < in_rows &&
-        in_c_end < in_cols) {
-      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
-        const int in_r = in_r_start + f_r;
+    if (in_row_start >= 0 && in_col_start >= 0 && in_row_end < in_height &&
+        in_col_end < in_width) {
+      UNROLL for (int filter_row = 0; filter_row < filter_height;
+                  ++filter_row) {
+        const int in_row = in_row_start + filter_row;
         // Avoid repeated computation.
-        const int input_offset_temp = in_cols * (in_r + in_rows * b);
-        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
-          const int in_c = in_c_start + f_c;
+        const int input_offset_temp = in_width * (in_row + in_height * batch);
+        UNROLL for (int filter_col = 0; filter_col < filter_width;
+                    ++filter_col) {
+          const int in_col = in_col_start + filter_col;
 
-          const int input_offset = in_d + in_depth * (in_c + input_offset_temp);
+          const int input_offset =
+              in_channel + in_depth * (in_col + input_offset_temp);
           T partial_sum = ldg(input + input_offset) * out_bp;
-          T* addr = filter_backprop +
-                    (dm + depth_multiplier *
-                              (in_d + in_depth * (f_c + filter_cols * f_r)));
+          T* addr =
+              filter_backprop +
+              (dm + depth_multiplier *
+                        (in_channel +
+                         in_depth * (filter_col + filter_width * filter_row)));
           CudaAtomicAdd(addr, partial_sum);
         }
       }
     } else {
-      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
-        const int in_r = in_r_start + f_r;
+      UNROLL for (int filter_row = 0; filter_row < filter_height;
+                  ++filter_row) {
+        const int in_row = in_row_start + filter_row;
         // Avoid repeated computation.
-        const int input_offset_temp = in_cols * (in_r + in_rows * b);
-        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
-          const int in_c = in_c_start + f_c;
-          const int addr_temp = filter_cols * f_r;
-
-          if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols) {
+        const int input_offset_temp = in_width * (in_row + in_height * batch);
+        UNROLL for (int filter_col = 0; filter_col < filter_width;
+                    ++filter_col) {
+          const int in_col = in_col_start + filter_col;
+          const int addr_temp = filter_width * filter_row;
+
+          if (in_row >= 0 && in_row < in_height && in_col >= 0 &&
+              in_col < in_width) {
             const int input_offset =
-                in_d + in_depth * (in_c + input_offset_temp);
+                in_channel + in_depth * (in_col + input_offset_temp);
             T partial_sum = ldg(input + input_offset) * out_bp;
             T* addr =
                 filter_backprop +
-                (dm + depth_multiplier * (in_d + in_depth * (f_c + addr_temp)));
+                (dm + depth_multiplier *
+                          (in_channel + in_depth * (filter_col + addr_temp)));
             // Potentially many threads can add to the same address so we have
             // to use atomic add here.
             // TODO(jmchen): If atomic add turns out to be slow, we can:
@@ -1020,7 +1066,7 @@ __global__ void __launch_bounds__(640, 2)
 
 // Device function to compute sub-warp sum reduction for a power-of-two group of
 // neighboring threads.
-template<int kWidth, typename T>
+template <int kWidth, typename T>
 __device__ __forceinline__ T WarpSumReduce(T val) {
   // support only power-of-two widths.
   assert(__popc(kWidth) == 1);
@@ -1028,7 +1074,7 @@ __device__ __forceinline__ T WarpSumReduce(T val) {
   int zeros = sub_warp * kWidth;
   unsigned mask = ((1UL << kWidth) - 1) << zeros;
   for (int delta = kWidth / 2; delta > 0; delta /= 2) {
-    val += CudaShuffleXor(mask, val, delta);
+    val += CudaShuffleXorSync(mask, val, delta);
   }
   return val;
 }
@@ -1045,9 +1091,9 @@ __device__ __forceinline__ T WarpSumReduce(T val) {
 // memory are warp-accumulated (in chunks of kAccumPixels elements) and summed
 // up in global memory using atomics.
 // Requirements: threads per block must be multiple of 32 and <= launch_bounds,
-// kAccumPixels * 64 >= args.in_rows * args.in_cols * kBlockSlices.
+// kAccumPixels * 64 >= args.in_rows * args.in_cols * kBlockDepth.
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
-          int kBlockSlices, int kAccumPixels>
+          int kBlockDepth, int kAccumPixels>
 __global__
 __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
     const DepthwiseArgs args, const T* output, const T* input, T* filter) {
@@ -1056,40 +1102,40 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
   extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
   T* const shared_data = reinterpret_cast<T*>(shared_memory);
 
-  const int batches = args.batch;
-  const int in_rows = args.in_rows;
-  const int in_cols = blockDim.y;  // slower (see b/62280718): args.in_cols;
+  const int num_batches = args.batch;
+  const int in_height = args.in_rows;
+  const int in_width = blockDim.y;  // slower (see b/62280718): args.in_cols;
   const int in_depth = args.in_depth;
-  const int filter_rows =
+  const int filter_height =
       kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
-  const int filter_cols =
+  const int filter_width =
       kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
-  const int pad_rows = args.pad_rows;
-  const int pad_cols = args.pad_cols;
+  const int pad_height = args.pad_rows;
+  const int pad_width = args.pad_cols;
 
-  const int block_rows = blockDim.z;
+  const int block_height = blockDim.z;
 
   // These values are the same for all threads and could
   // be precomputed on the CPU.
-  const int block_size = block_rows * in_cols * kBlockSlices;
+  const int block_size = block_height * in_width * kBlockDepth;
   assert((block_size & 31) == 0);
-  const int in_row_size = in_cols * in_depth;
-  const int in_size = in_rows * in_row_size;
-  const int in_increment = (in_cols - 1) * kBlockSlices;
-  const int filter_pixels = filter_rows * filter_cols;
-  const int tile_cols = in_cols + filter_cols - 1;
-  const int tile_rows = 2 * block_rows + filter_rows - 1;
-  const int tile_row_size = tile_cols * kBlockSlices;
-  const int tile_size = tile_rows * tile_row_size;
-  const int tile_offset = block_rows * tile_row_size;
-  const int pad_offset = pad_rows * tile_cols + pad_cols;
-  const int batch_blocks = (in_depth + kBlockSlices - 1) / kBlockSlices;
-  const int in_blocks = batch_blocks * batches;
-  const int tensor_offset = block_rows * in_row_size;
+  const int in_row_size = in_width * in_depth;
+  const int in_size = in_height * in_row_size;
+  const int in_increment = (in_width - 1) * kBlockDepth;
+  const int filter_pixels = filter_height * filter_width;
+  const int tile_width = in_width + filter_width - 1;
+  const int tile_height = 2 * block_height + filter_height - 1;
+  const int tile_row_size = tile_width * kBlockDepth;
+  const int tile_size = tile_height * tile_row_size;
+  const int tile_offset = block_height * tile_row_size;
+  const int pad_offset = pad_height * tile_width + pad_width;
+  const int batch_blocks = (in_depth + kBlockDepth - 1) / kBlockDepth;
+  const int in_blocks = batch_blocks * num_batches;
+  const int tensor_offset = block_height * in_row_size;
   // The accumulator has a fixed number of pixels that can be reduced by one
-  // warp. Pixels beyond ceil(in_pixels * kBlockSlices / 64) are never written.
-  assert(kAccumPixels * 64 >= in_rows * in_cols * kBlockSlices);
-  const int accum_increment = kAccumPixels * kBlockSlices;
+  // warp. Pixels beyond ceil(in_pixels * kBlockDepth / 64) are never written.
+  assert(kAccumPixels * 64 >= in_height * in_width * kBlockDepth);
+  const int accum_increment = kAccumPixels * kBlockDepth;
   const int accum_size = filter_pixels * accum_increment;
 
   const int thread_depth = threadIdx.x;
@@ -1097,8 +1143,8 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
   const int thread_row = threadIdx.z;
 
   // Position in block.
-  const int thread_pix = thread_row * in_cols + thread_col;
-  const int thread_idx = thread_pix * kBlockSlices + thread_depth;
+  const int thread_pix = thread_row * in_width + thread_col;
+  const int thread_idx = thread_pix * kBlockDepth + thread_depth;
 
   // Initialize tile, in particular the padding and accumulator.
   for (int i = thread_idx; i < tile_size + accum_size; i += block_size) {
@@ -1110,31 +1156,31 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
   const int tensor_idx = thread_pix * in_depth + thread_depth;
 
   // Position in (padded) shared memory.
-  const int data_pix = thread_row * tile_cols + thread_col;
-  const int data_idx = data_pix * kBlockSlices + thread_depth;
+  const int data_pix = thread_row * tile_width + thread_col;
+  const int data_idx = data_pix * kBlockDepth + thread_depth;
 
-  // Position in shared memory, offset by pad_rows / pad_cols.
+  // Position in shared memory, offset by pad_height / pad_width.
   const int tile_pix = data_pix + pad_offset;
-  const int tile_idx = tile_pix * kBlockSlices + thread_depth;
+  const int tile_idx = tile_pix * kBlockDepth + thread_depth;
 
-  // Position in accumulator (kBlockSlices per warp, depth major).
-  const int accum_pix = thread_pix / (32 / kBlockSlices);
+  // Position in accumulator (kBlockDepth per warp, depth major).
+  const int accum_pix = thread_pix / (32 / kBlockDepth);
   const int accum_idx = thread_depth * kAccumPixels + accum_pix;
 
-  const int max_depth = in_depth - thread_depth;
+  const int max_channel = in_depth - thread_depth;
   const int accum_offset = tile_size + accum_idx;
-  const bool skip_second = block_rows + thread_row >= in_rows;
+  const bool skip_second = block_height + thread_row >= in_height;
 
   for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) {
     const int batch = b / batch_blocks;
-    const int stack = b - batch * batch_blocks;
+    const int block = b - batch * batch_blocks;
 
-    const int start_depth = stack * kBlockSlices;
-    const int filter_offset = tensor_idx + start_depth;
+    const int start_channel = block * kBlockDepth;
+    const int filter_offset = tensor_idx + start_channel;
     const int inout_offset = batch * in_size + filter_offset;
-    const bool depth_in_range = start_depth < max_depth;
+    const bool channel_in_range = start_channel < max_channel;
 
-    if (depth_in_range) {
+    if (channel_in_range) {
       const T* const in_ptr = inout_offset + input;
       T* const tile_ptr = tile_idx + shared_data;
       tile_ptr[0] = ldg(in_ptr);
@@ -1145,26 +1191,26 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
 
     // Note: the condition to reach this is uniform across the entire block.
     __syncthreads();
-    unsigned active_threads = CudaBallot(CUDA_WARP_ALL, depth_in_range);
+    unsigned active_threads = CudaBallotSync(kCudaWarpAll, channel_in_range);
 
-    if (depth_in_range) {
+    if (channel_in_range) {
       const T* const out_ptr = inout_offset + output;
       const T out1 = ldg(out_ptr);
       const T out2 = skip_second ? T(0) : ldg(tensor_offset + out_ptr);
       int shared_offset = data_idx;
       T* accum_ptr = accum_offset + shared_data;
-      UNROLL for (int r = 0; r < filter_rows; ++r) {
-        UNROLL for (int c = 0; c < filter_cols; ++c) {
+      UNROLL for (int r = 0; r < filter_height; ++r) {
+        UNROLL for (int c = 0; c < filter_width; ++c) {
           const T* const tile_ptr = shared_offset + shared_data;
           T val = out1 * tile_ptr[0] + out2 * tile_ptr[tile_offset];
           // Warp-accumulate pixels of the same depth and write to accumulator.
-          for (int delta = 16; delta >= kBlockSlices; delta /= 2) {
-            val += CudaShuffleDown(active_threads, val, delta);
+          for (int delta = 16; delta >= kBlockDepth; delta /= 2) {
+            val += CudaShuffleXorSync(active_threads, val, delta);
           }
-          if (!(thread_idx & 32 - kBlockSlices) /* lane_idx < kBlockSlices */) {
+          if (!(thread_idx & 32 - kBlockDepth) /* lane_idx < kBlockDepth */) {
             *accum_ptr = val;
           }
-          shared_offset += kBlockSlices;
+          shared_offset += kBlockDepth;
           accum_ptr += accum_increment;
         }
         shared_offset += in_increment;
@@ -1177,10 +1223,10 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
     const T* const accum_data = tile_size + shared_data;
     for (int i = thread_idx; i < accum_size; i += block_size) {
       const int filter_idx = i / kAccumPixels;
-      const int filter_pix = filter_idx / kBlockSlices;
-      const int filter_depth = filter_idx % kBlockSlices + start_depth;
-      const int filter_offset = filter_pix * in_depth + filter_depth;
-      if (filter_depth < in_depth) {
+      const int filter_pix = filter_idx / kBlockDepth;
+      const int filter_channel = filter_idx % kBlockDepth + start_channel;
+      const int filter_offset = filter_pix * in_depth + filter_channel;
+      if (filter_channel < in_depth) {
         T val = accum_data[i];
         // Warp-accumulate the pixels of the same depth from the accumulator.
         val = WarpSumReduce<kAccumPixels>(val);
@@ -1201,81 +1247,90 @@ __global__ void __launch_bounds__(640, 2)
                                                const T* input,
                                                T* filter_backprop,
                                                int num_out_backprop) {
-  const int in_rows = args.in_rows;
-  const int in_cols = args.in_cols;
+  const int in_height = args.in_rows;
+  const int in_width = args.in_cols;
   const int in_depth = args.in_depth;
-  const int filter_rows =
+  const int filter_height =
       kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
-  const int filter_cols =
+  const int filter_width =
       kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
   const int depth_multiplier =
       kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
   const int stride = args.stride;
-  const int pad_rows = args.pad_rows;
-  const int pad_cols = args.pad_cols;
-  const int out_rows = args.out_rows;
-  const int out_cols = args.out_cols;
+  const int pad_height = args.pad_rows;
+  const int pad_width = args.pad_cols;
+  const int out_height = args.out_rows;
+  const int out_width = args.out_cols;
   const int out_depth = args.out_depth;
 
   CUDA_1D_KERNEL_LOOP(thread_id, num_out_backprop) {
     // Compute the indexes of this thread in the output.
-    const int out_c = thread_id % out_cols;
-    const int out_r = (thread_id / out_cols) % out_rows;
-    const int out_d = (thread_id / out_cols / out_rows) % out_depth;
+    const int out_col = thread_id % out_width;
+    const int out_row = (thread_id / out_width) % out_height;
+    const int out_channel = (thread_id / out_width / out_height) % out_depth;
 
-    const int b = thread_id / out_depth / out_cols / out_rows;
+    const int batch = thread_id / out_depth / out_width / out_height;
     // Compute the input depth and the index of depth multiplier.
-    const int in_d = out_d / depth_multiplier;
-    const int dm = out_d % depth_multiplier;
+    const int in_channel = out_channel / depth_multiplier;
+    const int dm = out_channel % depth_multiplier;
 
     // Decide if all input is valid, if yes, we can skip the boundary checks
     // for each input.
-    const int in_r_start = out_r * stride - pad_rows;
-    const int in_c_start = out_c * stride - pad_cols;
-    const int in_r_end = in_r_start + filter_rows;
-    const int in_c_end = in_c_start + filter_cols;
+    const int in_row_start = out_row * stride - pad_height;
+    const int in_col_start = out_col * stride - pad_width;
+    const int in_row_end = in_row_start + filter_height;
+    const int in_col_end = in_col_start + filter_width;
 
-    const int out_backprop_offset = (b * out_depth * out_rows * out_cols) +
-                                    (out_d * out_rows * out_cols) +
-                                    (out_r * out_cols) + (out_c);
+    const int out_backprop_offset =
+        (batch * out_depth * out_height * out_width) +
+        (out_channel * out_height * out_width) + (out_row * out_width) +
+        (out_col);
 
     const T out_bp = ldg(out_backprop + out_backprop_offset);
-    if (in_r_start >= 0 && in_c_start >= 0 && in_r_end < in_rows &&
-        in_c_end < in_cols) {
-      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
-        const int in_r = in_r_start + f_r;
+    if (in_row_start >= 0 && in_col_start >= 0 && in_row_end < in_height &&
+        in_col_end < in_width) {
+      UNROLL for (int filter_row = 0; filter_row < filter_height;
+                  ++filter_row) {
+        const int in_row = in_row_start + filter_row;
         // Avoid repeated computation.
-        const int input_offset_temp = (b * in_depth * in_rows * in_cols) +
-                                      (in_d * in_rows * in_cols) +
-                                      (in_r * in_cols);
-
-        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
-          const int in_c = in_c_start + f_c;
-          const int input_offset = input_offset_temp + in_c;
+        const int input_offset_temp =
+            (batch * in_depth * in_height * in_width) +
+            (in_channel * in_height * in_width) + (in_row * in_width);
+
+        UNROLL for (int filter_col = 0; filter_col < filter_width;
+                    ++filter_col) {
+          const int in_col = in_col_start + filter_col;
+          const int input_offset = input_offset_temp + in_col;
           T partial_sum = ldg(input + input_offset) * out_bp;
-          T* addr = filter_backprop +
-                    (dm + depth_multiplier *
-                              (in_d + in_depth * (f_c + filter_cols * f_r)));
+          T* addr =
+              filter_backprop +
+              (dm + depth_multiplier *
+                        (in_channel +
+                         in_depth * (filter_col + filter_width * filter_row)));
           CudaAtomicAdd(addr, partial_sum);
         }
       }
     } else {
-      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
-        const int in_r = in_r_start + f_r;
+      UNROLL for (int filter_row = 0; filter_row < filter_height;
+                  ++filter_row) {
+        const int in_row = in_row_start + filter_row;
         // Avoid repeated computation.
-        const int input_offset_temp = (b * in_depth * in_rows * in_cols) +
-                                      (in_d * in_rows * in_cols) +
-                                      (in_r * in_cols);
-        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
-          const int in_c = in_c_start + f_c;
-          const int addr_temp = filter_cols * f_r;
-
-          if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols) {
-            const int input_offset = input_offset_temp + in_c;
+        const int input_offset_temp =
+            (batch * in_depth * in_height * in_width) +
+            (in_channel * in_height * in_width) + (in_row * in_width);
+        UNROLL for (int filter_col = 0; filter_col < filter_width;
+                    ++filter_col) {
+          const int in_col = in_col_start + filter_col;
+          const int addr_temp = filter_width * filter_row;
+
+          if (in_row >= 0 && in_row < in_height && in_col >= 0 &&
+              in_col < in_width) {
+            const int input_offset = input_offset_temp + in_col;
             T partial_sum = ldg(input + input_offset) * out_bp;
             T* addr =
                 filter_backprop +
-                (dm + depth_multiplier * (in_d + in_depth * (f_c + addr_temp)));
+                (dm + depth_multiplier *
+                          (in_channel + in_depth * (filter_col + addr_temp)));
             // Potentially many threads can add to the same address so we have
             // to use atomic add here.
             // TODO(jmchen): If atomic add turns out to be slow, we can:
@@ -1304,9 +1359,9 @@ __global__ void __launch_bounds__(640, 2)
 // memory are warp-accumulated (in chunks of kAccumPixels elements) and summed
 // up in global memory using atomics.
 // Requirements: threads per block must be multiple of 32 and <= launch_bounds,
-// kAccumPixels * 64 >= args.in_rows * args.in_cols * kBlockSlices.
+// kAccumPixels * 64 >= args.in_rows * args.in_cols * kBlockDepth.
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
-          int kBlockSlices, int kAccumPixels>
+          int kBlockDepth, int kAccumPixels>
 __global__
 __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
     const DepthwiseArgs args, const T* output, const T* input, T* filter) {
@@ -1315,39 +1370,39 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
   extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
   T* const shared_data = reinterpret_cast<T*>(shared_memory);
 
-  const int batches = args.batch;
-  const int in_rows = args.in_rows;
-  const int in_cols = blockDim.x;  // slower (see b/62280718): args.in_cols;
+  const int num_batches = args.batch;
+  const int in_height = args.in_rows;
+  const int in_width = blockDim.x;  // slower (see b/62280718): args.in_cols;
   const int in_depth = args.in_depth;
-  const int filter_rows =
+  const int filter_height =
       kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
-  const int filter_cols =
+  const int filter_width =
       kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
-  const int pad_rows = args.pad_rows;
-  const int pad_cols = args.pad_cols;
+  const int pad_height = args.pad_rows;
+  const int pad_width = args.pad_cols;
 
-  const int block_rows = blockDim.y;
+  const int block_height = blockDim.y;
 
   // These values are the same for all threads and could
   // be precomputed on the CPU.
-  const int block_pixels = in_cols * block_rows;
-  const int block_size = block_pixels * kBlockSlices;
+  const int block_pixels = in_width * block_height;
+  const int block_size = block_pixels * kBlockDepth;
   assert((block_size & 31) == 0);
-  const int in_pixels = in_cols * in_rows;
-  const int in_increment = in_cols - 1;
-  const int filter_pixels = filter_rows * filter_cols;
-  const int tile_cols = in_cols + filter_cols - 1;
-  const int tile_rows = 2 * block_rows + filter_rows - 1;
-  const int tile_pixels = tile_cols * tile_rows;
-  const int tile_size = tile_pixels * kBlockSlices;
-  const int tile_offset = block_rows * tile_cols;
-  const int pad_offset = pad_rows * tile_cols + pad_cols;
-  const int in_slices = in_depth * batches;
-  const int in_blocks = (in_slices + kBlockSlices - 1) / kBlockSlices;
+  const int in_pixels = in_width * in_height;
+  const int in_increment = in_width - 1;
+  const int filter_pixels = filter_height * filter_width;
+  const int tile_width = in_width + filter_width - 1;
+  const int tile_height = 2 * block_height + filter_height - 1;
+  const int tile_pixels = tile_width * tile_height;
+  const int tile_size = tile_pixels * kBlockDepth;
+  const int tile_offset = block_height * tile_width;
+  const int pad_offset = pad_height * tile_width + pad_width;
+  const int in_total_depth = in_depth * num_batches;
+  const int in_blocks = (in_total_depth + kBlockDepth - 1) / kBlockDepth;
   // The accumulator has a fixed number of pixels that can be reduced by one
-  // warp. Pixels beyond ceil(in_pixels * kBlockSlices / 64) are never written.
-  assert(kAccumPixels * 64 >= in_rows * in_cols * kBlockSlices);
-  const int accum_increment = kAccumPixels * kBlockSlices;
+  // warp. Pixels beyond ceil(in_pixels * kBlockDepth / 64) are never written.
+  assert(kAccumPixels * 64 >= in_height * in_width * kBlockDepth);
+  const int accum_increment = kAccumPixels * kBlockDepth;
   const int accum_size = filter_pixels * accum_increment;
 
   const int thread_col = threadIdx.x;
@@ -1355,7 +1410,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
   const int thread_depth = threadIdx.z;
 
   // Position in block.
-  const int thread_pix = thread_row * in_cols + thread_col;
+  const int thread_pix = thread_row * in_width + thread_col;
   const int thread_idx = thread_depth * block_pixels + thread_pix;
 
   // Initialize tile, in particular the padding and accumulator.
@@ -1368,27 +1423,27 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
   const int tensor_idx = thread_depth * in_pixels + thread_pix;
 
   // Position in (padded) shared memory.
-  const int data_pix = thread_row * tile_cols + thread_col;
+  const int data_pix = thread_row * tile_width + thread_col;
   const int data_idx = thread_depth * tile_pixels + data_pix;
 
-  // Position in shared memory, offset by pad_rows / pad_cols.
+  // Position in shared memory, offset by pad_height / pad_width.
   const int tile_idx = data_idx + pad_offset;
 
-  // Position in accumulator (kBlockSlices per warp, depth major).
-  const int accum_pix = thread_pix / (32 / kBlockSlices);
+  // Position in accumulator (kBlockDepth per warp, depth major).
+  const int accum_pix = thread_pix / (32 / kBlockDepth);
   const int accum_idx = thread_depth * kAccumPixels + accum_pix;
 
-  const int max_slice = in_slices - thread_depth;
+  const int max_channel = in_total_depth - thread_depth;
   const int accum_offset = tile_size + accum_idx;
-  const bool skip_second = block_rows + thread_row >= in_rows;
+  const bool skip_second = block_height + thread_row >= in_height;
 
   for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) {
-    const int slice = b * kBlockSlices;
+    const int channel = b * kBlockDepth;
 
-    const int inout_offset = slice * in_pixels + tensor_idx;
-    const bool slice_in_range = slice < max_slice;
+    const int inout_offset = channel * in_pixels + tensor_idx;
+    const bool channel_in_range = channel < max_channel;
 
-    if (slice_in_range) {
+    if (channel_in_range) {
       const T* const in_ptr = inout_offset + input;
       T* const tile_ptr = tile_idx + shared_data;
       tile_ptr[0] = ldg(in_ptr);
@@ -1399,24 +1454,24 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
 
     // Note: the condition to reach this is uniform across the entire block.
     __syncthreads();
-    unsigned active_threads = CudaBallot(CUDA_WARP_ALL, slice_in_range);
+    unsigned active_threads = CudaBallotSync(kCudaWarpAll, channel_in_range);
 
-    if (slice_in_range) {
+    if (channel_in_range) {
       const T* const out_ptr = inout_offset + output;
       const T out1 = ldg(out_ptr);
       const T out2 = skip_second ? T(0) : ldg(block_pixels + out_ptr);
       int shared_offset = data_idx;
       T* accum_ptr = accum_offset + shared_data;
-      UNROLL for (int r = 0; r < filter_rows; ++r) {
-        UNROLL for (int c = 0; c < filter_cols; ++c) {
+      UNROLL for (int r = 0; r < filter_height; ++r) {
+        UNROLL for (int c = 0; c < filter_width; ++c) {
           const T* const tile_ptr = shared_offset + shared_data;
           T val = out1 * tile_ptr[0] + out2 * tile_ptr[tile_offset];
           // Warp-accumulate pixels of the same depth and write to accumulator.
-          for (int delta = 16 / kBlockSlices; delta > 0; delta /= 2) {
-            val += CudaShuffleDown(active_threads, val, delta);
+          for (int delta = 16 / kBlockDepth; delta > 0; delta /= 2) {
+            val += CudaShuffleXorSync(active_threads, val, delta);
           }
-          if (!(thread_idx & 32 / kBlockSlices - 1)) {
-            *accum_ptr = val;
+          if (!(thread_idx & 32 / kBlockDepth - 1)) {
+            *accum_ptr = val;  // kBlockDepth threads per warp.
           }
           ++shared_offset;
           accum_ptr += accum_increment;
@@ -1431,10 +1486,11 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
     const T* const accum_data = tile_size + shared_data;
     for (int i = thread_idx; i < accum_size; i += block_size) {
       const int filter_idx = i / kAccumPixels;
-      const int filter_pix = filter_idx / kBlockSlices;
-      const int filter_depth = (slice + filter_idx % kBlockSlices) % in_depth;
-      const int filter_offset = filter_pix * in_depth + filter_depth;
-      if (filter_depth < in_depth) {
+      const int filter_pix = filter_idx / kBlockDepth;
+      const int filter_channel =
+          (channel + filter_idx % kBlockDepth) % in_depth;
+      const int filter_offset = filter_pix * in_depth + filter_channel;
+      if (filter_channel < in_depth) {
         T val = accum_data[i];
         // Warp-accumulate pixels of the same depth from the accumulator.
         val = WarpSumReduce<kAccumPixels>(val);
@@ -1447,31 +1503,31 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
 }
 
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
-          int kBlockSlices, int kAccumPixels>
+          int kBlockDepth, int kAccumPixels>
 bool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
-    const GpuDevice& d, const DepthwiseArgs args, const int block_rows,
+    const GpuDevice& device, const DepthwiseArgs& args, const int block_height,
     const T* out_backprop, const T* input, T* filter_backprop,
     TensorFormat data_format) {
-  const int tile_cols = args.in_cols + args.filter_cols - 1;
-  const int tile_rows = block_rows * 2 + args.filter_rows - 1;
-  const int tile_pixels = tile_rows * tile_cols;
+  const int tile_width = args.in_cols + args.filter_cols - 1;
+  const int tile_height = block_height * 2 + args.filter_rows - 1;
+  const int tile_pixels = tile_height * tile_width;
   const int filter_pixels = args.filter_rows * args.filter_cols;
   const int shared_memory_size =
-      kBlockSlices * (tile_pixels + filter_pixels * kAccumPixels) * sizeof(T);
-  if (shared_memory_size > d.sharedMemPerBlock()) {
+      kBlockDepth * (tile_pixels + filter_pixels * kAccumPixels) * sizeof(T);
+  if (shared_memory_size > device.sharedMemPerBlock()) {
     return false;
   }
 
   dim3 block_dim;
   void (*kernel)(const DepthwiseArgs, const T*, const T*, T*);
   if (data_format == FORMAT_NHWC) {
-    block_dim = dim3(kBlockSlices, args.in_cols, block_rows);
+    block_dim = dim3(kBlockDepth, args.in_cols, block_height);
     kernel = DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall<
-        T, kKnownFilterWidth, kKnownFilterHeight, kBlockSlices, kAccumPixels>;
+        T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, kAccumPixels>;
   } else if (data_format == FORMAT_NCHW) {
-    block_dim = dim3(args.in_cols, block_rows, kBlockSlices);
+    block_dim = dim3(args.in_cols, block_height, kBlockDepth);
     kernel = DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall<
-        T, kKnownFilterWidth, kKnownFilterHeight, kBlockSlices, kAccumPixels>;
+        T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, kAccumPixels>;
   } else {
     assert(false && "Incorrect data format");
     return false;
@@ -1479,77 +1535,80 @@ bool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
   const int num_out_backprop =
       args.batch * args.out_rows * args.out_cols * args.out_depth;
   CudaLaunchConfig config =
-      GetCudaLaunchConfig(num_out_backprop, d, kernel, shared_memory_size,
+      GetCudaLaunchConfig(num_out_backprop, device, kernel, shared_memory_size,
                           block_dim.x * block_dim.y * block_dim.z);
-  kernel<<<config.block_count, block_dim, shared_memory_size, d.stream()>>>(
-      args, out_backprop, input, filter_backprop);
+  kernel<<<config.block_count, block_dim, shared_memory_size,
+           device.stream()>>>(args, out_backprop, input, filter_backprop);
   return true;
 }
 
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
-          int kBlockSlices>
+          int kBlockDepth>
 bool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
-    const GpuDevice& d, const DepthwiseArgs args, const int block_rows,
+    const GpuDevice& device, const DepthwiseArgs& args, const int block_height,
     const T* out_backprop, const T* input, T* filter_backprop,
     TensorFormat data_format) {
   // Minimize (power of two) kAccumPixels, while satisfying
-  // kAccumPixels * 32 >= block_rows * in_cols * kBlockSlices.
-  const int block_pixels = block_rows * args.in_cols * kBlockSlices;
+  // kAccumPixels * 32 >= block_height * in_width * kBlockDepth.
+  const int block_pixels = block_height * args.in_cols * kBlockDepth;
   if (block_pixels > 512) {
     return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<
-        T, kKnownFilterWidth, kKnownFilterHeight, kBlockSlices, 32>(
-        d, args, block_rows, out_backprop, input, filter_backprop, data_format);
+        T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, 32>(
+        device, args, block_height, out_backprop, input, filter_backprop,
+        data_format);
   } else if (block_pixels > 256) {
     return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<
-        T, kKnownFilterWidth, kKnownFilterHeight, kBlockSlices, 16>(
-        d, args, block_rows, out_backprop, input, filter_backprop, data_format);
+        T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, 16>(
+        device, args, block_height, out_backprop, input, filter_backprop,
+        data_format);
   } else {
     return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<
-        T, kKnownFilterWidth, kKnownFilterHeight, kBlockSlices, 8>(
-        d, args, block_rows, out_backprop, input, filter_backprop, data_format);
+        T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, 8>(
+        device, args, block_height, out_backprop, input, filter_backprop,
+        data_format);
   }
 }
 
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
 bool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
-    const GpuDevice& d, const DepthwiseArgs args, const T* out_backprop,
+    const GpuDevice& device, const DepthwiseArgs& args, const T* out_backprop,
     const T* input, T* filter_backprop, TensorFormat data_format) {
-  // Maximize (power of two) kBlockSlices while keeping a block within 1024
+  // Maximize (power of two) kBlockDepth while keeping a block within 1024
   // threads (2 pixels per thread).
-  int block_slices = 8;
-  int block_rows = (args.in_rows + 1) / 2;
+  int block_depth = 8;
+  int block_height = (args.in_rows + 1) / 2;
   int round_mask = 1;
-  for (; block_slices > 1; block_slices /= 2) {
-    // args.in_cols * block_rows * kBlockSlices must be multiple of 32.
-    for (; block_rows * args.in_cols * block_slices & 31;
+  for (; block_depth > 1; block_depth /= 2) {
+    // args.in_cols * block_height * kBlockDepth must be multiple of 32.
+    for (; block_height * args.in_cols * block_depth & 31;
          round_mask = round_mask * 2 + 1) {
-      block_rows = block_rows + round_mask & ~round_mask;
+      block_height = block_height + round_mask & ~round_mask;
     }
-    int block_size = block_rows * args.in_cols * block_slices;
+    int block_size = block_height * args.in_cols * block_depth;
     if (block_size <= 1024) {
       break;
     }
   }
 
-  if (!CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, block_rows)) {
+  if (!CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, block_height)) {
     return false;
   }
 
-  switch (block_slices) {
+  switch (block_depth) {
     case 8:
       return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<
           T, kKnownFilterWidth, kKnownFilterHeight, 8>(
-          d, args, block_rows, out_backprop, input, filter_backprop,
+          device, args, block_height, out_backprop, input, filter_backprop,
           data_format);
     case 4:
       return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<
           T, kKnownFilterWidth, kKnownFilterHeight, 4>(
-          d, args, block_rows, out_backprop, input, filter_backprop,
+          device, args, block_height, out_backprop, input, filter_backprop,
           data_format);
     case 2:
       return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<
           T, kKnownFilterWidth, kKnownFilterHeight, 2>(
-          d, args, block_rows, out_backprop, input, filter_backprop,
+          device, args, block_height, out_backprop, input, filter_backprop,
           data_format);
     default:
       return false;
@@ -1558,8 +1617,8 @@ bool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
 
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
           int kKnownDepthMultiplier>
-void LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& d,
-                                            const DepthwiseArgs args,
+void LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& device,
+                                            const DepthwiseArgs& args,
                                             const T* out_backprop,
                                             const T* input, T* filter_backprop,
                                             TensorFormat data_format) {
@@ -1577,40 +1636,40 @@ void LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& d,
   const int num_out_backprop =
       args.batch * args.out_rows * args.out_cols * args.out_depth;
   CudaLaunchConfig config =
-      GetCudaLaunchConfig(num_out_backprop, d, kernel, 0, 0);
-  kernel<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+      GetCudaLaunchConfig(num_out_backprop, device, kernel, 0, 0);
+  kernel<<<config.block_count, config.thread_per_block, 0, device.stream()>>>(
       args, out_backprop, input, filter_backprop, num_out_backprop);
 }
 
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
-void LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& d,
-                                            const DepthwiseArgs args,
+void LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& device,
+                                            const DepthwiseArgs& args,
                                             const T* out_backprop,
                                             const T* input, T* filter_backprop,
                                             TensorFormat data_format) {
   if (args.depth_multiplier == 1) {
     if (TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<T, kKnownFilterWidth,
                                                        kKnownFilterHeight>(
-            d, args, out_backprop, input, filter_backprop, data_format)) {
+            device, args, out_backprop, input, filter_backprop, data_format)) {
       return;
     }
 
     LaunchDepthwiseConv2dBackpropFilterGPU<T, kKnownFilterWidth,
                                            kKnownFilterHeight, 1>(
-        d, args, out_backprop, input, filter_backprop, data_format);
+        device, args, out_backprop, input, filter_backprop, data_format);
   } else {
     LaunchDepthwiseConv2dBackpropFilterGPU<T, kKnownFilterWidth,
                                            kKnownFilterHeight, -1>(
-        d, args, out_backprop, input, filter_backprop, data_format);
+        device, args, out_backprop, input, filter_backprop, data_format);
   }
 }
 
 // A simple launch pad to launch the Cuda kernel for depthwise convolution.
 template <typename T>
-void LaunchDepthwiseConvBackpropFilterOp<GPUDevice, T>::operator()(
+void LaunchDepthwiseConvBackpropFilterOp<GpuDevice, T>::operator()(
     OpKernelContext* ctx, const DepthwiseArgs& args, const T* out_backprop,
     const T* input, T* filter_backprop, TensorFormat data_format) {
-  const GPUDevice& d = ctx->eigen_device<GPUDevice>();
+  const GpuDevice& device = ctx->eigen_device<GpuDevice>();
   auto stream = ctx->op_device_context()->stream();
 
   // Initialize the results to 0.
@@ -1622,10 +1681,10 @@ void LaunchDepthwiseConvBackpropFilterOp<GPUDevice, T>::operator()(
 
   if (args.filter_rows == 3 && args.filter_cols == 3) {
     LaunchDepthwiseConv2dBackpropFilterGPU<T, 3, 3>(
-        d, args, out_backprop, input, filter_backprop, data_format);
+        device, args, out_backprop, input, filter_backprop, data_format);
   } else {
     LaunchDepthwiseConv2dBackpropFilterGPU<T, -1, -1>(
-        d, args, out_backprop, input, filter_backprop, data_format);
+        device, args, out_backprop, input, filter_backprop, data_format);
   }
   OP_REQUIRES(ctx, stream->ok(),
               errors::Internal("Launch of gpu kernel for "
@@ -1633,8 +1692,8 @@ void LaunchDepthwiseConvBackpropFilterOp<GPUDevice, T>::operator()(
                                "terGPULaunch failed"));
 }
 
-template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, Eigen::half>;
-template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>;
-template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>;
+template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, Eigen::half>;
+template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, float>;
+template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, double>;
 }  // namespace tensorflow
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/determinant_op.h b/tensorflow/core/kernels/determinant_op.h
index e931e328e4bbb2e29f3f3ff4fbaf3dfb76fb1ea7..eefdfe0ae40bca1713f9667bf9fced934a412acb 100644
--- a/tensorflow/core/kernels/determinant_op.h
+++ b/tensorflow/core/kernels/determinant_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DETERMINANT_OP_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DETERMINANT_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DETERMINANT_OP_H_
+#define TENSORFLOW_CORE_KERNELS_DETERMINANT_OP_H_
 
 #include "tensorflow/core/framework/tensor_types.h"
 
@@ -44,4 +44,4 @@ struct LogDeterminantFromPivotedLUFunctor {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DETERMINANT_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DETERMINANT_OP_H_
diff --git a/tensorflow/core/kernels/diag_op.cc b/tensorflow/core/kernels/diag_op.cc
index 86fa7dce36afff121dc6ff0642f45c809bc63a3d..d228153d4c76dedd74a4b1db1059bc25ff0a6f77 100644
--- a/tensorflow/core/kernels/diag_op.cc
+++ b/tensorflow/core/kernels/diag_op.cc
@@ -29,8 +29,8 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
@@ -47,8 +47,9 @@ class DiagOp : public OpKernel {
   void Compute(OpKernelContext* context) override {
     const Tensor& diagonal = context->input(0);
     const int num_dims = diagonal.dims();
-    OP_REQUIRES(context, 0 != num_dims, errors::InvalidArgument(
-        "Input must be at least rank 1, got 0"));
+    OP_REQUIRES(
+        context, 0 != num_dims,
+        errors::InvalidArgument("Input must be at least rank 1, got 0"));
     TensorShape out_shape;
     for (int i = 0; i < num_dims; ++i) {
       out_shape.AddDim(diagonal.dim_size(i));
@@ -60,10 +61,9 @@ class DiagOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, out_shape, &output_tensor));
     functor::DiagFunctor<Device, T> diagFunc;
-    Status s = diagFunc(context,
-                        diagonal.NumElements(),
-                        diagonal.flat<T>().data(),
-                        output_tensor->flat<T>().data());
+    Status s =
+        diagFunc(context, diagonal.NumElements(), diagonal.flat<T>().data(),
+                 output_tensor->flat<T>().data());
     OP_REQUIRES_OK(context, s);
   }
 };
@@ -82,12 +82,12 @@ class DiagPartOp : public OpKernel {
                 errors::InvalidArgument("The rank of the tensor should be \
                                          even and positive, got shape ",
                                         tensor.shape().DebugString()));
-    for (int i = 0; i < out_dims; i++){
-      OP_REQUIRES(context, tensor.dim_size(i) == tensor.dim_size(i + out_dims),
-                  errors::InvalidArgument(
-                    "Invalid shape ", tensor.shape().DebugString(),
-                    ": dimensions ", i, " and ", i + out_dims, " do not match.")
-                  );
+    for (int i = 0; i < out_dims; i++) {
+      OP_REQUIRES(
+          context, tensor.dim_size(i) == tensor.dim_size(i + out_dims),
+          errors::InvalidArgument("Invalid shape ",
+                                  tensor.shape().DebugString(), ": dimensions ",
+                                  i, " and ", i + out_dims, " do not match."));
     }
 
     TensorShape out_shape;
@@ -96,13 +96,10 @@ class DiagPartOp : public OpKernel {
     }
 
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, out_shape, &output));
+    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
     functor::DiagPartFunctor<Device, T> diagPartFunc;
-    Status s = diagPartFunc(context,
-                            out_shape.num_elements(),
-                            tensor.flat<T>().data(),
-                            output->flat<T>().data());
+    Status s = diagPartFunc(context, out_shape.num_elements(),
+                            tensor.flat<T>().data(), output->flat<T>().data());
     OP_REQUIRES_OK(context, s);
   }
 };
@@ -129,9 +126,8 @@ class DiagPartOp : public OpKernel {
 namespace functor {
 template <typename T>
 struct DiagFunctor<CPUDevice, T> {
-  EIGEN_ALWAYS_INLINE Status
-  operator() (OpKernelContext* context, const int64 size,
-              const T* in, T* out) {
+  EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+                                        const int64 size, const T* in, T* out) {
     // This subprocess is responsible for writing values in index range
     // [start*size, limit*size)
     auto subDiag = [in, out, size](int64 start, int64 limit) {
@@ -143,17 +139,16 @@ struct DiagFunctor<CPUDevice, T> {
 
     // Here, 5 is a empirical factor of cost_per_unit.
     auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
-    Shard(worker_threads.num_threads, worker_threads.workers, size,
-        5 * size, subDiag);
+    Shard(worker_threads.num_threads, worker_threads.workers, size, 5 * size,
+          subDiag);
     return Status::OK();
   }
 };
 
 template <typename T>
 struct DiagPartFunctor<CPUDevice, T> {
-  EIGEN_ALWAYS_INLINE Status
-  operator() (OpKernelContext* context, const int64 size,
-              const T* in, T* out) {
+  EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+                                        const int64 size, const T* in, T* out) {
     // This subprocess is responsible for extracting values in index range
     // [start, limit)
     auto subDiagPart = [in, out, size](int64 start, int64 limit) {
@@ -164,14 +159,13 @@ struct DiagPartFunctor<CPUDevice, T> {
 
     // Here, 5 is a empirical factor of cost_per_unit.
     auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
-    Shard(worker_threads.num_threads, worker_threads.workers, size,
-        5, subDiagPart);
+    Shard(worker_threads.num_threads, worker_threads.workers, size, 5,
+          subDiagPart);
     return Status::OK();
   }
 };
 }  // namespace functor
 
-
 // Register the CPU kernels.
 #define REGISTER_DIAGOP(T)                                    \
   REGISTER_KERNEL_BUILDER(                                    \
@@ -250,6 +244,4 @@ TF_CALL_complex128(REGISTER_DIAGPARTOP_GPU);
 
 #endif  // GOOGLE_CUDA
 
-
 }  // namespace tensorflow
-
diff --git a/tensorflow/core/kernels/diag_op.h b/tensorflow/core/kernels/diag_op.h
index c6ca6a2047455649b5197da27a58cb068476e928..baf16ddb4b987fa09de113c0316ec0014c884980 100644
--- a/tensorflow/core/kernels/diag_op.h
+++ b/tensorflow/core/kernels/diag_op.h
@@ -26,14 +26,14 @@ namespace functor {
 
 template <typename Device, typename T>
 struct DiagFunctor {
-  Status operator() (OpKernelContext* context, const int64 size,
-                     const T* in, T* out);
+  Status operator()(OpKernelContext* context, const int64 size, const T* in,
+                    T* out);
 };
 
 template <typename Device, typename T>
 struct DiagPartFunctor {
-  Status operator() (OpKernelContext* context, const int64 size,
-                     const T* in, T* out);
+  Status operator()(OpKernelContext* context, const int64 size, const T* in,
+                    T* out);
 };
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/diag_op_gpu.cu.cc b/tensorflow/core/kernels/diag_op_gpu.cu.cc
index d3c529d784e3a9ba4a793cd98cff9eb5e74d6090..910f3093b2307526e36bdfad9ac6746dd861d4fd 100644
--- a/tensorflow/core/kernels/diag_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/diag_op_gpu.cu.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 #include <complex>
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
 #include "tensorflow/core/kernels/diag_op.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
 namespace functor {
@@ -28,10 +28,8 @@ namespace functor {
 typedef Eigen::GpuDevice GPUDevice;
 
 template <typename T>
-__global__ void DiagCudaKernel(const int num_threads,
-                               const int64 size,
-                               const T* in,
-                               T* out) {
+__global__ void DiagCudaKernel(const int num_threads, const int64 size,
+                               const T* in, T* out) {
   CUDA_1D_KERNEL_LOOP(index, num_threads) {
     // Fill the diagonal elements or set to zero in other place.
     if (index % (1 + size) == 0) {
@@ -44,9 +42,8 @@ __global__ void DiagCudaKernel(const int num_threads,
 
 template <typename T>
 struct DiagFunctor<GPUDevice, T> {
-  EIGEN_ALWAYS_INLINE Status
-  operator() (OpKernelContext* context, const int64 size,
-              const T* in, T* out) {
+  EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+                                        const int64 size, const T* in, T* out) {
     // Empty tensor couldn't launch the kernel.
     if (size == 0) {
       return Status::OK();
@@ -56,25 +53,22 @@ struct DiagFunctor<GPUDevice, T> {
     // so this may overflow for `size*size` in extreme cases,
     // here is checking the multiplication overflow for integer.
     if (size && (int(size * size) / size) != size) {
-      return errors::Internal(
-          "DiagOp got input size too large.");
+      return errors::Internal("DiagOp got input size too large.");
     }
     int virtual_thread_count = int(size * size);
 
     // Launch the GPU kernel.
     const GPUDevice& device = context->eigen_device<GPUDevice>();
-    CudaLaunchConfig diag_config = GetCudaLaunchConfig(
-        virtual_thread_count, device);
-    DiagCudaKernel<<<diag_config.block_count,
-                     diag_config.thread_per_block,
-                     0, device.stream()>>>(
-        diag_config.virtual_thread_count, size, in, out);
+    CudaLaunchConfig diag_config =
+        GetCudaLaunchConfig(virtual_thread_count, device);
+    DiagCudaKernel<<<diag_config.block_count, diag_config.thread_per_block, 0,
+                     device.stream()>>>(diag_config.virtual_thread_count, size,
+                                        in, out);
 
     auto err = cudaGetLastError();
     if (err != cudaSuccess) {
       return errors::Internal(
-          "Could not launch DiagOp kernel: ",
-          cudaGetErrorString(err), ".");
+          "Could not launch DiagOp kernel: ", cudaGetErrorString(err), ".");
     }
     return Status::OK();
   }
@@ -87,12 +81,9 @@ template struct DiagFunctor<GPUDevice, int64>;
 template struct DiagFunctor<GPUDevice, complex64>;
 template struct DiagFunctor<GPUDevice, complex128>;
 
-
 template <typename T>
-__global__ void DiagPartCudaKernel(const int num_threads,
-                                   const int64 size,
-                                   const T* in,
-                                   T* out) {
+__global__ void DiagPartCudaKernel(const int num_threads, const int64 size,
+                                   const T* in, T* out) {
   CUDA_1D_KERNEL_LOOP(index, num_threads) {
     out[index] = in[(1 + size) * index];
   }
@@ -100,9 +91,8 @@ __global__ void DiagPartCudaKernel(const int num_threads,
 
 template <typename T>
 struct DiagPartFunctor<GPUDevice, T> {
-  EIGEN_ALWAYS_INLINE Status
-  operator() (OpKernelContext* context, const int64 size,
-              const T* in, T* out) {
+  EIGEN_ALWAYS_INLINE Status operator()(OpKernelContext* context,
+                                        const int64 size, const T* in, T* out) {
     // Empty tensor couldn't launch the kernel.
     if (size == 0) {
       return Status::OK();
@@ -111,16 +101,14 @@ struct DiagPartFunctor<GPUDevice, T> {
 
     // Extract the diagonal elements.
     CudaLaunchConfig diag_config = GetCudaLaunchConfig(size, device);
-    DiagPartCudaKernel<<<diag_config.block_count,
-                     diag_config.thread_per_block,
-                     0, device.stream()>>>(
-        diag_config.virtual_thread_count, size, in, out);
+    DiagPartCudaKernel<<<diag_config.block_count, diag_config.thread_per_block,
+                         0, device.stream()>>>(diag_config.virtual_thread_count,
+                                               size, in, out);
 
     auto err = cudaGetLastError();
     if (err != cudaSuccess) {
       return errors::Internal(
-          "Could not launch DiagPartOp kernel: ",
-          cudaGetErrorString(err), ".");
+          "Could not launch DiagPartOp kernel: ", cudaGetErrorString(err), ".");
     }
     return Status::OK();
   }
diff --git a/tensorflow/core/kernels/diag_op_test.cc b/tensorflow/core/kernels/diag_op_test.cc
index 2d1417854cc06a138a803169495196ac70e70e5d..a708e53dd016d9a004a0cd2ddcdc285b0e6ad6fd 100644
--- a/tensorflow/core/kernels/diag_op_test.cc
+++ b/tensorflow/core/kernels/diag_op_test.cc
@@ -30,8 +30,8 @@ static Graph* Diag(int n, DataType type) {
   return g;
 }
 
-#define BM_DiagDev(N, T, TFTYPE, DEVICE)                           \
-  static void BM_Diag##_##N##_##TFTYPE##_##DEVICE(int iters) {   \
+#define BM_DiagDev(N, T, TFTYPE, DEVICE)                        \
+  static void BM_Diag##_##N##_##TFTYPE##_##DEVICE(int iters) {  \
     testing::UseRealTime();                                     \
     testing::ItemsProcessed(static_cast<int64>(iters) * N * N); \
     test::Benchmark(#DEVICE, Diag<T>(N, TFTYPE)).Run(iters);    \
@@ -51,4 +51,3 @@ BM_Diag(128);
 BM_Diag(512);
 
 }  // end namespace tensorflow
-
diff --git a/tensorflow/core/kernels/dilation_ops.cc b/tensorflow/core/kernels/dilation_ops.cc
index 6f5c0e91569eb5d44069a452632ad108e5df7d0d..441a63465c8246e09a8e70535f4b95a94d7acdb3 100644
--- a/tensorflow/core/kernels/dilation_ops.cc
+++ b/tensorflow/core/kernels/dilation_ops.cc
@@ -91,10 +91,10 @@ void ParseSizes(OpKernelContext* context, const std::vector<int32>& strides,
                                       filter.shape().DebugString()));
   const int filter_rows = filter.dim_size(0);
   const int filter_cols = filter.dim_size(1);
-  OP_REQUIRES(
-      context, depth == filter.dim_size(2),
-      errors::InvalidArgument("input and filter must have the same depth: ",
-                              depth, " vs ", filter.dim_size(2)));
+  OP_REQUIRES(context, depth == filter.dim_size(2),
+              errors::InvalidArgument(
+                  "input and filter must have the same depth: ", depth, " vs ",
+                  filter.dim_size(2)));
 
   // Effective filter size, after introducing rate - 1 zeros between each
   // non-zero filter element.
@@ -234,10 +234,11 @@ class DilationBackpropInputOp : public OpKernel {
     // [ batch, out_rows, out_cols, depth ]
     const int batch = input.dim_size(0);
     const int depth = input.dim_size(3);
-    OP_REQUIRES(context, batch == out_backprop.dim_size(0) &&
-                             out_rows == out_backprop.dim_size(1) &&
-                             out_cols == out_backprop.dim_size(2) &&
-                             depth == out_backprop.dim_size(3),
+    OP_REQUIRES(context,
+                batch == out_backprop.dim_size(0) &&
+                    out_rows == out_backprop.dim_size(1) &&
+                    out_cols == out_backprop.dim_size(2) &&
+                    depth == out_backprop.dim_size(3),
                 errors::InvalidArgument("out_backprop has incompatible size."));
 
     // The computed in_backprop has the same dimensions as the input:
@@ -353,10 +354,11 @@ class DilationBackpropFilterOp : public OpKernel {
     // [ batch, out_rows, out_cols, depth ]
     const int batch = input.dim_size(0);
     const int depth = input.dim_size(3);
-    OP_REQUIRES(context, batch == out_backprop.dim_size(0) &&
-                             out_rows == out_backprop.dim_size(1) &&
-                             out_cols == out_backprop.dim_size(2) &&
-                             depth == out_backprop.dim_size(3),
+    OP_REQUIRES(context,
+                batch == out_backprop.dim_size(0) &&
+                    out_rows == out_backprop.dim_size(1) &&
+                    out_cols == out_backprop.dim_size(2) &&
+                    depth == out_backprop.dim_size(3),
                 errors::InvalidArgument("out_backprop has incompatible size."));
 
     // The computed filter_backprop has the same dimensions as the filter:
diff --git a/tensorflow/core/kernels/dilation_ops_gpu.cu.cc b/tensorflow/core/kernels/dilation_ops_gpu.cu.cc
index ac0775fbefe601e53aaa6c67529cf9a67a0562c2..c63806a7f68c6981dd0e83373c6bfd598788e338 100644
--- a/tensorflow/core/kernels/dilation_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/dilation_ops_gpu.cu.cc
@@ -61,9 +61,8 @@ __global__ void DilationKernel(const int32 nthreads, const T* input_ptr,
           const int w_in = w_beg + w * rate_cols;
           if (w_in >= 0 && w_in < input_cols) {
             const T val =
-                input_ptr[d +
-                          depth *
-                              (w_in + input_cols * (h_in + input_rows * b))] +
+                input_ptr[d + depth * (w_in +
+                                       input_cols * (h_in + input_rows * b))] +
                 filter_ptr[d + depth * (w + filter_cols * h)];
             if (val > cur_val) {
               cur_val = val;
@@ -106,9 +105,8 @@ __global__ void DilationBackpropInputKernel(
           const int w_in = w_beg + w * rate_cols;
           if (w_in >= 0 && w_in < input_cols) {
             const T val =
-                input_ptr[d +
-                          depth *
-                              (w_in + input_cols * (h_in + input_rows * b))] +
+                input_ptr[d + depth * (w_in +
+                                       input_cols * (h_in + input_rows * b))] +
                 filter_ptr[d + depth * (w + filter_cols * h)];
             if (val > cur_val) {
               cur_val = val;
@@ -156,9 +154,8 @@ __global__ void DilationBackpropFilterKernel(
           const int w_in = w_beg + w * rate_cols;
           if (w_in >= 0 && w_in < input_cols) {
             const T val =
-                input_ptr[d +
-                          depth *
-                              (w_in + input_cols * (h_in + input_rows * b))] +
+                input_ptr[d + depth * (w_in +
+                                       input_cols * (h_in + input_rows * b))] +
                 filter_ptr[d + depth * (w + filter_cols * h)];
             if (val > cur_val) {
               cur_val = val;
diff --git a/tensorflow/core/kernels/draw_bounding_box_op.cc b/tensorflow/core/kernels/draw_bounding_box_op.cc
index a8818b7385d9d5253588ec40f425b85180c79006..b5d5b880bbbacab07c51fc395b86b4fbbb343d36 100644
--- a/tensorflow/core/kernels/draw_bounding_box_op.cc
+++ b/tensorflow/core/kernels/draw_bounding_box_op.cc
@@ -29,8 +29,7 @@ template <class T>
 class DrawBoundingBoxesOp : public OpKernel {
  public:
   explicit DrawBoundingBoxesOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-  }
+      : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
     const Tensor& images = context->input(0);
@@ -94,35 +93,28 @@ class DrawBoundingBoxesOp : public OpKernel {
         int64 color_index = bb % color_table_length;
         const int64 min_box_row =
             static_cast<float>(tboxes(b, bb, 0)) * (height - 1);
-        const int64 min_box_row_clamp =
-            std::max<int64>(min_box_row, 0);
+        const int64 min_box_row_clamp = std::max<int64>(min_box_row, 0);
         const int64 max_box_row =
             static_cast<float>(tboxes(b, bb, 2)) * (height - 1);
         const int64 max_box_row_clamp =
             std::min<int64>(max_box_row, height - 1);
         const int64 min_box_col =
             static_cast<float>(tboxes(b, bb, 1)) * (width - 1);
-        const int64 min_box_col_clamp =
-            std::max<int64>(min_box_col, 0);
+        const int64 min_box_col_clamp = std::max<int64>(min_box_col, 0);
         const int64 max_box_col =
             static_cast<float>(tboxes(b, bb, 3)) * (width - 1);
-        const int64 max_box_col_clamp =
-            std::min<int64>(max_box_col, width - 1);
+        const int64 max_box_col_clamp = std::min<int64>(max_box_col, width - 1);
 
         if (min_box_row > max_box_row || min_box_col > max_box_col) {
-          LOG(WARNING) << "Bounding box (" << min_box_row
-                       << "," << min_box_col
-                       << "," << max_box_row
-                       << "," << max_box_col
+          LOG(WARNING) << "Bounding box (" << min_box_row << "," << min_box_col
+                       << "," << max_box_row << "," << max_box_col
                        << ") is inverted and will not be drawn.";
           continue;
         }
-        if (min_box_row >= height || max_box_row < 0 ||
-            min_box_col >= width || max_box_col < 0) {
-          LOG(WARNING) << "Bounding box (" << min_box_row
-                       << "," << min_box_col
-                       << "," << max_box_row
-                       << "," << max_box_col
+        if (min_box_row >= height || max_box_row < 0 || min_box_col >= width ||
+            max_box_col < 0) {
+          LOG(WARNING) << "Bounding box (" << min_box_row << "," << min_box_col
+                       << "," << max_box_row << "," << max_box_col
                        << ") is completely outside the image"
                        << " and will not be drawn.";
           continue;
diff --git a/tensorflow/core/kernels/dynamic_partition_op.cc b/tensorflow/core/kernels/dynamic_partition_op.cc
index 861e16b2fd02001e913f548a5b48ca6b7497a8f2..3c988db5e618b976b5b2d45a9bfc386485249826 100644
--- a/tensorflow/core/kernels/dynamic_partition_op.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op.cc
@@ -103,7 +103,8 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared {
       // Walk through data and copy the data to the appropriate output tensor
       const auto data_flat = data->flat<T>();
       std::vector<Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>,
-                                   Eigen::Aligned> > out_vec;
+                                   Eigen::Aligned> >
+          out_vec;
       out_vec.reserve(num_partitions_);
       for (int p = 0; p < num_partitions_; p++) {
         out_vec.push_back(outputs[p]->vec<T>());
@@ -124,7 +125,8 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared {
     } else {
       // If data has extra dimensions, use Eigen slices
       std::vector<Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>,
-                                   Eigen::Aligned> > out_flat;
+                                   Eigen::Aligned> >
+          out_flat;
       out_flat.reserve(num_partitions_);
       for (int p = 0; p < num_partitions_; p++) {
         out_flat.push_back(outputs[p]->flat_outer_dims<T>());
diff --git a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
index 872921efa581bec64e98623a8fe3d955cd3cf3f7..9dfeccff0e8d2488fec5a1dc7b93f83d2cfedca5 100644
--- a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
@@ -79,9 +79,9 @@ template <typename T>
 void RangeInit(const GPUDevice& d, const T start, const T delta,
                const int32 size, typename TTypes<T>::Flat out) {
   CudaLaunchConfig config = GetCudaLaunchConfig(size, d);
-  RangeInitKernel<
-      T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-      start, delta, size, out.data());
+  RangeInitKernel<T>
+      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+          start, delta, size, out.data());
 }
 
 // Given *num_runs pairs (key, value), this function moves the value
@@ -103,11 +103,10 @@ void CallGatherKernel(const GPUDevice& d, const T* params, const int32* indices,
                       T* out, int64 gather_dim_size, int64 indices_size,
                       int64 slice_size, int64 out_size) {
   CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d);
-  GatherOpKernel<
-      T, int32,
-      true><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-      params, indices, out, gather_dim_size, indices_size, slice_size,
-      out_size);
+  GatherOpKernel<T, int32, true>
+      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+          params, indices, out, gather_dim_size, indices_size, slice_size,
+          out_size);
 }
 
 struct IdentityOp {
@@ -130,7 +129,7 @@ class BoundedOutputIterator
     // Constructor
     __host__ __device__ __forceinline__
     BoundedReference(int32* ptr, int32* base, IdentityOp op, int32 limit)
-        : Reference(ptr, op), base(base), limit(limit) {}
+        : Reference(ptr, op), limit(limit), base(base) {}
 
     // Assignment
     __host__ __device__ __forceinline__ int32 operator=(int32 val) {
@@ -146,11 +145,11 @@ class BoundedOutputIterator
   __host__ __device__ __forceinline__ BoundedOutputIterator(int32* ptr,
                                                             IdentityOp op,
                                                             int32 size)
-      : TransformOutputIterator(ptr, op), base(ptr), limit(size) {}
+      : TransformOutputIterator(ptr, op), limit(size), base(ptr) {}
 
   __host__ __device__ __forceinline__
   BoundedOutputIterator(int32* ptr, int32* base, IdentityOp op, int32 size)
-      : TransformOutputIterator(ptr, op), base(base), limit(size) {}
+      : TransformOutputIterator(ptr, op), limit(size), base(base) {}
 
   // Indirection
   __host__ __device__ __forceinline__ reference operator*() const {
@@ -231,10 +230,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
 
     OP_REQUIRES_ASYNC(
         c, TensorShapeUtils::StartsWith(data.shape(), partitions.shape()),
-        errors::InvalidArgument("data.shape must start with partitions.shape, ",
-                                "got data.shape = ", data.shape().DebugString(),
-                                ", partitions.shape = ",
-                                partitions.shape().DebugString()),
+        errors::InvalidArgument(
+            "data.shape must start with partitions.shape, ",
+            "got data.shape = ", data.shape().DebugString(),
+            ", partitions.shape = ", partitions.shape().DebugString()),
         done);
 
     Tensor partition_count;
@@ -245,8 +244,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
       AllocatorAttributes alloc_attr;
       alloc_attr.set_on_host(true);
       OP_REQUIRES_OK_ASYNC(
-          c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
-                              &partition_count, alloc_attr),
+          c,
+          c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
+                           &partition_count, alloc_attr),
           done);
       auto e_part_count = partition_count.flat<int32>();
       for (int i = 0; i < num_partitions_; i++) e_part_count(i) = 0;
@@ -259,8 +259,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
 
     // Prepare for counting.
     OP_REQUIRES_OK_ASYNC(
-        c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
-                            &partition_count),
+        c,
+        c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
+                         &partition_count),
         done);
     Tensor indices_out;
     // Count how many times each partition index occurs.
@@ -280,8 +281,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
     alloc_attr.set_on_host(true);
     alloc_attr.set_gpu_compatible(true);
     OP_REQUIRES_OK_ASYNC(
-        c, c->allocate_temp(partition_count.dtype(), partition_count.shape(),
-                            &cpu_tensor, alloc_attr),
+        c,
+        c->allocate_temp(partition_count.dtype(), partition_count.shape(),
+                         &cpu_tensor, alloc_attr),
         done);
     perftools::gputools::DeviceMemoryBase wrapped(
         partition_count.flat<int32>().data(), num_partitions_ * sizeof(int32));
@@ -340,9 +342,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
         indices_in_ptr, indices_out_ptr, N, 0, sizeof(int32) * 8, cu_stream);
     // Allocate temporary storage.
     OP_REQUIRES_OK_ASYNC(
-        c, c->allocate_temp(
-               DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
-               &cub_temp_storage),
+        c,
+        c->allocate_temp(DT_INT8,
+                         TensorShape({static_cast<int64>(temp_storage_bytes)}),
+                         &cub_temp_storage),
         done);
     // Radix-sort the partition information.
     cub::DeviceRadixSort::SortPairs(
@@ -376,8 +379,9 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
     zero_functor(device, partition_count->flat<int32>());
     // Allocate memory for aggregates_out.
     OP_REQUIRES_OK_ASYNC(
-        c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
-                            &aggregates_out),
+        c,
+        c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
+                         &aggregates_out),
         done);
     // Obtain the pointers to inner buffers.
     int32* keys_in_ptr = partitions_out.flat<int32>().data();
@@ -408,9 +412,10 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
                                    num_runs_ptr, reduction_op, N, cu_stream);
     // Allocate temporary storage.
     OP_REQUIRES_OK_ASYNC(
-        c, c->allocate_temp(
-               DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
-               &cub_temp_storage),
+        c,
+        c->allocate_temp(DT_INT8,
+                         TensorShape({static_cast<int64>(temp_storage_bytes)}),
+                         &cub_temp_storage),
         done);
     // Run reduce-by-key. The effect is that we count how many times
     // each index appears in partitions. The distinct indices are stored
diff --git a/tensorflow/core/kernels/eigen_activations.h b/tensorflow/core/kernels/eigen_activations.h
index 57c8157b878f6b46ca5a57857747e899fddbebb2..302033e47c59db2d87483a8e2f1e70d0572b21f9 100644
--- a/tensorflow/core/kernels/eigen_activations.h
+++ b/tensorflow/core/kernels/eigen_activations.h
@@ -13,21 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_ACTIVATIONS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_ACTIVATIONS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_ACTIVATIONS_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_ACTIVATIONS_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace Eigen {
 
 /** scalar_sigmoid_fast_derivative_op
-  * \ingroup CXX11_NeuralNetworks_Module
-  * \brief Template functor to compute the fast derivative of a sigmoid
-  *
-  * Input should be the backpropagated gradient.
-  *
-  * \sa class CwiseUnaryOp, Cwise::sigmoid_fast_derivative()
-  */
+ * \ingroup CXX11_NeuralNetworks_Module
+ * \brief Template functor to compute the fast derivative of a sigmoid
+ *
+ * Input should be the backpropagated gradient.
+ *
+ * \sa class CwiseUnaryOp, Cwise::sigmoid_fast_derivative()
+ */
 template <typename T>
 struct scalar_sigmoid_fast_derivative_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_fast_derivative_op)
@@ -55,13 +55,13 @@ struct functor_traits<scalar_sigmoid_fast_derivative_op<T> > {
 }  // namespace internal
 
 /** scalar_tanh_fast_derivative_op
-  * \ingroup CXX11_NeuralNetworks_Module
-  * \brief Template functor to compute the fast derivative of a tanh
-  *
-  * Input should be the backpropagated gradient.
-  *
-  * \sa class CwiseUnaryOp, Cwise::tanh_fast_derivative()
-  */
+ * \ingroup CXX11_NeuralNetworks_Module
+ * \brief Template functor to compute the fast derivative of a tanh
+ *
+ * Input should be the backpropagated gradient.
+ *
+ * \sa class CwiseUnaryOp, Cwise::tanh_fast_derivative()
+ */
 template <typename T>
 struct scalar_tanh_fast_derivative_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_fast_derivative_op)
@@ -89,11 +89,11 @@ struct functor_traits<scalar_tanh_fast_derivative_op<T> > {
 }  // namespace internal
 
 /**
-  * \ingroup CXX11_NeuralNetworks_Module
-  * \brief Template functor to clip the magnitude of the first scalar.
-  *
-  * \sa class CwiseBinaryOp, MatrixBase::Clip
-  */
+ * \ingroup CXX11_NeuralNetworks_Module
+ * \brief Template functor to clip the magnitude of the first scalar.
+ *
+ * \sa class CwiseBinaryOp, MatrixBase::Clip
+ */
 template <typename Scalar>
 struct scalar_clip_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_clip_op)
@@ -122,4 +122,4 @@ struct functor_traits<scalar_clip_op<Scalar> > {
 
 }  // end namespace Eigen
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_ACTIVATIONS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_ACTIVATIONS_H_
diff --git a/tensorflow/core/kernels/eigen_activations_test.cc b/tensorflow/core/kernels/eigen_activations_test.cc
index 907233103d8244749410c3198f0ca92ad44769b8..34952f5abb8526f0317ba8a674948fada4dc0ce7 100644
--- a/tensorflow/core/kernels/eigen_activations_test.cc
+++ b/tensorflow/core/kernels/eigen_activations_test.cc
@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenBackwardSpatialConvolutionsTest, SigmoidFastDerivative) {
   const ptrdiff_t depth = 3;
diff --git a/tensorflow/core/kernels/eigen_attention.h b/tensorflow/core/kernels/eigen_attention.h
index f4c42372b1840e0c46b57b133745670a07a8c46c..4d86f9deb9902a64764e29ca0371bb68ad4f3370 100644
--- a/tensorflow/core/kernels/eigen_attention.h
+++ b/tensorflow/core/kernels/eigen_attention.h
@@ -13,43 +13,55 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_ATTENTION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_ATTENTION_H_
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_ATTENTION_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_ATTENTION_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace Eigen {
 
 /** ExtractGlimpses
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Extract glimpses from an input tensor.
-  *
-  * The input parameter is expected to be a col-major tensor with a rank of 4 (depth, x, y, and batch).
-  * The width and height parameters specify the extension of the returned glimpses.
-  * The offsets parameter specifies the x, y locations of the center of the glimpses relative to the center of the input image. The vector is expected to contain one IndexPair for each image in the batch dimension.
-  * The normalized boolean indicates if incoming coordinates are normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each height and width dimension.
-  * The centered boolean indicates if incoming coordinates are centered relative to the image, in which case -1.0 and 1.0 correspond to minimum and maximum of each dimension while 0.0 corresponds to the center.
-  *
-  * The result can be assigned to a tensor of rank equal to that of the input. The result will be laid out in col-major order (depth, x, y, batch).
-  * The dimensions of the result will be equal to the dimensions of the input except for width and height which will be equal to the requested glimpse size.
-  */
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Extract glimpses from an input tensor.
+ *
+ * The input parameter is expected to be a col-major tensor with a rank of 4
+ * (depth, x, y, and batch). The width and height parameters specify the
+ * extension of the returned glimpses. The offsets parameter specifies the x, y
+ * locations of the center of the glimpses relative to the center of the input
+ * image. The vector is expected to contain one IndexPair for each image in the
+ * batch dimension. The normalized boolean indicates if incoming coordinates are
+ * normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each
+ * height and width dimension. The centered boolean indicates if incoming
+ * coordinates are centered relative to the image, in which case -1.0 and 1.0
+ * correspond to minimum and maximum of each dimension while 0.0 corresponds to
+ * the center.
+ *
+ * The result can be assigned to a tensor of rank equal to that of the input.
+ * The result will be laid out in col-major order (depth, x, y, batch). The
+ * dimensions of the result will be equal to the dimensions of the input except
+ * for width and height which will be equal to the requested glimpse size.
+ */
 namespace {
 template <typename Index>
 struct GlimpseExtractionOp {
   GlimpseExtractionOp(const Index width, const Index height,
                       const std::vector<IndexPair<float> >& offsets,
-                      const bool normalized,
-                      const bool centered,
-                      const bool uniform_noise) :
-      width_(width), height_(height), offsets_(offsets),
-      normalized_(normalized), centered_(centered), uniform_noise_(uniform_noise) { }
+                      const bool normalized, const bool centered,
+                      const bool uniform_noise)
+      : width_(width),
+        height_(height),
+        offsets_(offsets),
+        normalized_(normalized),
+        centered_(centered),
+        uniform_noise_(uniform_noise) {}
 
   template <typename Input>
   DSizes<Index, 4> dimensions(const Input& input) const {
     typedef typename internal::traits<Input>::Index IndexType;
     typedef TensorRef<Tensor<typename internal::traits<Input>::Scalar, 4,
-                             internal::traits<Input>::Layout, IndexType> > Ref;
+                             internal::traits<Input>::Layout, IndexType> >
+        Ref;
     Ref in(input);
 
     DSizes<Index, 4> dims = in.dimensions();
@@ -62,12 +74,12 @@ struct GlimpseExtractionOp {
   }
 
   template <typename Input, typename Output, typename Device>
-  EIGEN_DEVICE_FUNC
-  void eval(const Input& input, Output& output, const Device& device) const
-  {
+  EIGEN_DEVICE_FUNC void eval(const Input& input, Output& output,
+                              const Device& device) const {
     typedef typename internal::traits<Input>::Index IndexType;
     typedef TensorRef<Tensor<typename internal::traits<Input>::Scalar, 4,
-                             internal::traits<Input>::Layout, IndexType> > Ref;
+                             internal::traits<Input>::Layout, IndexType> >
+        Ref;
     Ref in(input);
     const Index num_channels = in.dimension(0);
     const Index input_width = in.dimension(1);
@@ -97,8 +109,8 @@ struct GlimpseExtractionOp {
       x -= width_ / 2.0f;
       y -= height_ / 2.0f;
 
-      const Index offset_x = (Index) x;
-      const Index offset_y = (Index) y;
+      const Index offset_x = (Index)x;
+      const Index offset_y = (Index)y;
       Index glimpse_width = width_;
       Index glimpse_height = height_;
       bool partial_overlap = false;
@@ -135,7 +147,7 @@ struct GlimpseExtractionOp {
         if (uniform_noise_) {
           // Initialize the glimpse with uniform noise.
           typedef typename internal::remove_const<
-            typename internal::traits<Input>::Scalar>::type Scalar;
+              typename internal::traits<Input>::Scalar>::type Scalar;
           TensorFixedSize<Scalar, Sizes<> > mini;
           mini.device(device) = input.template chip<3>(i).minimum();
           TensorFixedSize<float, Sizes<> > range;
@@ -215,21 +227,22 @@ struct GlimpseExtractionOp {
   const bool centered_;
   const bool uniform_noise_;
 };
-}
-
+}  // namespace
 
 template <typename Input>
-EIGEN_ALWAYS_INLINE
-static const TensorCustomUnaryOp<const GlimpseExtractionOp<typename internal::traits<Input>::Index>, const Input>
+EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp<
+    const GlimpseExtractionOp<typename internal::traits<Input>::Index>,
+    const Input>
 ExtractGlimpses(const Input& input,
                 const typename internal::traits<Input>::Index width,
                 const typename internal::traits<Input>::Index height,
                 const std::vector<IndexPair<float> >& offsets,
                 const bool normalized = true, const bool centered = true,
-                const bool uniform_noise = true)
-{
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
+                const bool uniform_noise = true) {
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
 
   typedef typename internal::traits<Input>::Index Index;
   const GlimpseExtractionOp<Index> op(width, height, offsets, normalized,
@@ -237,6 +250,6 @@ ExtractGlimpses(const Input& input,
   return input.customOp(op);
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_ATTENTION_H_
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_ATTENTION_H_
diff --git a/tensorflow/core/kernels/eigen_attention_test.cc b/tensorflow/core/kernels/eigen_attention_test.cc
index 3a2eeb05959e8844903eb3b910a893760bb02e74..08f61877182cce36316752b7dd17dee3bd2efaac 100644
--- a/tensorflow/core/kernels/eigen_attention_test.cc
+++ b/tensorflow/core/kernels/eigen_attention_test.cc
@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenAttentionTest, Simple) {
   const ptrdiff_t depth = 3;
diff --git a/tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h b/tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h
index a44e7197a9412926fc30eecbc8128fe08829d21e..e13e548f863bcdcb5e8853ea19532e8e787e4571 100644
--- a/tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h
+++ b/tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_CUBOID_CONVOLUTIONS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_CUBOID_CONVOLUTIONS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_CUBOID_CONVOLUTIONS_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_CUBOID_CONVOLUTIONS_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/kernels/eigen_volume_patch.h"
@@ -617,4 +617,4 @@ CuboidConvolutionBackwardKernel(
 
 }  // end namespace Eigen
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_CUBOID_CONVOLUTIONS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_CUBOID_CONVOLUTIONS_H_
diff --git a/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h b/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h
index d172de8e18d89b4e006c0093b603b7d3f305494f..099696105b61c19b7fcc9694fe1d7a3021cb97dc 100644
--- a/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_backward_spatial_convolutions.h
@@ -13,37 +13,37 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_SPATIAL_CONVOLUTIONS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_SPATIAL_CONVOLUTIONS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_SPATIAL_CONVOLUTIONS_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_BACKWARD_SPATIAL_CONVOLUTIONS_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace Eigen {
 
 /** SpatialConvolutionBackwardInput
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Computes the backprop for the input of a 2D convolution.
-  *
-  * The output_backward parameter is expected to be a tensor with a rank of 3 or
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Computes the backprop for the input of a 2D convolution.
+ *
+ * The output_backward parameter is expected to be a tensor with a rank of 3 or
  * more (channels, height, width, and optionally others)
-  * The kernel parameter is expected to be a 4D tensor (filters, channels,
+ * The kernel parameter is expected to be a 4D tensor (filters, channels,
  * kernel_height, kernel_width)
-  * The output_backward and the kernel must both be in col-major layout. The
+ * The output_backward and the kernel must both be in col-major layout. The
  * result will also be in col-major layout.
-  *
-  * If row_in_stride, col_in_stride > 1, then applies convolution with holes
+ *
+ * If row_in_stride, col_in_stride > 1, then applies convolution with holes
  * (aka atrous convolution), sampling every row_in_stride, col_in_stride input
  * pixels.
-  *
-  * The result can be assigned to a tensor of rank equal to the rank of the
+ *
+ * The result can be assigned to a tensor of rank equal to the rank of the
  * output_backward. The dimensions of the result will be filters, height, width
  * (and others if applicable).
-  *
-  * It is possible to swap the order of the width and height dimensions provided
+ *
+ * It is possible to swap the order of the width and height dimensions provided
  * that the same order is used in the input, the kernel, and the output.
-  *
-  */
+ *
+ */
 #ifdef EIGEN_HAS_INDEX_LIST
 typedef IndexList<type2index<0>, type2index<0>, type2index<1>, type2index<1> >
     ReverseColMajor;
@@ -293,29 +293,29 @@ SpatialConvolutionBackwardInput(
 }
 
 /** SpatialConvolutionBackwardKernel
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Computes the backprop for the filter of a 2D convolution.
-  *
-  * The output_backward parameter is expected to be a tensor with a rank of 3 or
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Computes the backprop for the filter of a 2D convolution.
+ *
+ * The output_backward parameter is expected to be a tensor with a rank of 3 or
  * more (channels, height, width, and optionally others)
-  * The kernel parameter is expected to be a 4D tensor (filters, channels,
+ * The kernel parameter is expected to be a 4D tensor (filters, channels,
  * kernel_height, kernel_width)
-  * The output_backward and the kernel must both be in col-major layout. The
+ * The output_backward and the kernel must both be in col-major layout. The
  * result will also be in col-major layout.
-  *
-  * If row_in_stride, col_stride > 1, then applies convolution with holes (aka
+ *
+ * If row_in_stride, col_stride > 1, then applies convolution with holes (aka
  * atrous convolution), sampling every row_in_stride, col_in_stride input
  * pixels.
-  *
-  * The result can be assigned to a tensor of rank equal to the rank of the
+ *
+ * The result can be assigned to a tensor of rank equal to the rank of the
  * output_backward. The dimensions of the result will be filters, height, width
  * (and others if applicable).
-  *
-  * It is possible to swap the order of the width and height dimensions provided
+ *
+ * It is possible to swap the order of the width and height dimensions provided
  * that the same order is used in the input, the kernel, and the output.
-  *
-  */
+ *
+ */
 
 template <typename OutputBackward, typename Input>
 EIGEN_ALWAYS_INLINE static const typename internal::conditional<
diff --git a/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc b/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc
index 1758067829e5b577477c1d86f9cdb4396b46e047..2229ec9659472daee3158c593252907f288d829f 100644
--- a/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc
+++ b/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc
@@ -25,7 +25,7 @@ void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
 static int ceil_div(int a, int b) { return (a + b - 1) / b; }
-}
+}  // namespace
 
 TEST(EigenBackwardSpatialConvolutionsTest,
      test_simple_spatial_convolution_backward_input_valid) {
diff --git a/tensorflow/core/kernels/eigen_cuboid_convolution.h b/tensorflow/core/kernels/eigen_cuboid_convolution.h
index 2dca664a86d6715e8e9d90842058d6ecc89f569a..62e9f9123dd4101d0e8466fb2f4f90fcb6da73c2 100644
--- a/tensorflow/core/kernels/eigen_cuboid_convolution.h
+++ b/tensorflow/core/kernels/eigen_cuboid_convolution.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_CUBOID_CONVOLUTION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_CUBOID_CONVOLUTION_H_
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_CUBOID_CONVOLUTION_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_CUBOID_CONVOLUTION_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/kernels/eigen_volume_patch.h"
@@ -224,4 +224,4 @@ CuboidConvolution(const Input& input, const Kernel& kernel,
 
 }  // end namespace Eigen
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_CUBOID_CONVOLUTION_H_
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_CUBOID_CONVOLUTION_H_
diff --git a/tensorflow/core/kernels/eigen_pooling.h b/tensorflow/core/kernels/eigen_pooling.h
index 94100d71ec30b07e47fafb826d5e428c2bde7bcb..896c9957616037da4ead2dbda8cb2393eaea226f 100644
--- a/tensorflow/core/kernels/eigen_pooling.h
+++ b/tensorflow/core/kernels/eigen_pooling.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_POOLING_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_POOLING_H_
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_POOLING_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_POOLING_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/kernels/eigen_volume_patch.h"
@@ -309,10 +309,10 @@ struct AvgPoolMeanReducer {
   _mm512_castsi512_ps( \
       _mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a, b, _CMP_EQ_UQ), -1))
 
-// The ternarylogic function immediate determines the values in the result
-// In the case below, 0xd8 implies (false_mask) ? (b) : (a)
-// For details, refer to the vpternlogd instruction table at
-// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-2c-manual.pdf
+  // The ternarylogic function immediate determines the values in the result
+  // In the case below, 0xd8 implies (false_mask) ? (b) : (a)
+  // For details, refer to the vpternlogd instruction table at
+  // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-2c-manual.pdf
 
 #define psel(a, b, false_mask)                        \
   _mm512_castsi512_ps(_mm512_ternarylogic_epi32(      \
@@ -610,4 +610,4 @@ CuboidAvgPooling(const Input& input, DenseIndex patchPlanes,
 
 }  // end namespace Eigen
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_POOLING_H_
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_POOLING_H_
diff --git a/tensorflow/core/kernels/eigen_pooling_test.cc b/tensorflow/core/kernels/eigen_pooling_test.cc
index 9383972b9fff39deb130d5cecac6f0c7abec5566..47b6665e680268793df18d50395d0b6c6aca0ad0 100644
--- a/tensorflow/core/kernels/eigen_pooling_test.cc
+++ b/tensorflow/core/kernels/eigen_pooling_test.cc
@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenPoolingTest, Simple) {
   const int depth = 10;
diff --git a/tensorflow/core/kernels/eigen_softmax.h b/tensorflow/core/kernels/eigen_softmax.h
index 20bb8a44dd9041b6c704447d7f14979bf0da0efb..12148c54b364bbc5ef1dff9b9645303534e7ea12 100644
--- a/tensorflow/core/kernels/eigen_softmax.h
+++ b/tensorflow/core/kernels/eigen_softmax.h
@@ -13,27 +13,29 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_SOFTMAX_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_SOFTMAX_H_
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_SOFTMAX_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_SOFTMAX_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace Eigen {
 
 /** SoftMax
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Applies a softmax
-  *
-  * The input parameter is expected to be a col-major tensor with a rank of 2 (depth and other).
-  *
-  * The result can be assigned to a tensor of rank and dimensions equal to that of the input. The result will be laid out in col-major order.
-  *
-*/
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Applies a softmax
+ *
+ * The input parameter is expected to be a col-major tensor with a rank of 2
+ * (depth and other).
+ *
+ * The result can be assigned to a tensor of rank and dimensions equal to that
+ * of the input. The result will be laid out in col-major order.
+ *
+ */
 
 namespace {
 struct SoftmaxOp {
-  SoftmaxOp(const float beta) : beta_(beta) { }
+  SoftmaxOp(const float beta) : beta_(beta) {}
 
   template <typename Input>
   typename Input::Dimensions dimensions(const Input& input) const {
@@ -41,8 +43,7 @@ struct SoftmaxOp {
   }
 
   template <typename Input, typename Output, typename Device>
-  void eval(const Input& input, Output& output, const Device& device) const
-  {
+  void eval(const Input& input, Output& output, const Device& device) const {
 #if !defined(EIGEN_HAS_INDEX_LIST)
     // nvcc doesn't support cxx11
     Eigen::array<typename internal::traits<Input>::Index, 1> depth_dim;
@@ -56,35 +57,43 @@ struct SoftmaxOp {
 #else
     // Take advantage of cxx11 to give the compiler information it can use to
     // optimize the code.
-    Eigen::IndexList<Eigen::type2index<0>> depth_dim;
-    Eigen::IndexList<int, Eigen::type2index<1>> bcast;
+    Eigen::IndexList<Eigen::type2index<0> > depth_dim;
+    Eigen::IndexList<int, Eigen::type2index<1> > bcast;
     bcast.set(0, dimensions(input)[0]);
-    Eigen::IndexList<Eigen::type2index<1>, typename internal::traits<Input>::Index> dims2d;
+    Eigen::IndexList<Eigen::type2index<1>,
+                     typename internal::traits<Input>::Index>
+        dims2d;
     dims2d.set(1, dimensions(input)[1]);
 #endif
 
-    output.device(device) = ((input - input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) * beta_).exp();
-    output.device(device) = output / (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
+    output.device(device) =
+        ((input -
+          input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) *
+         beta_)
+            .exp();
+    output.device(device) =
+        output /
+        (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
   }
 
  private:
   const float beta_;
 };
-}
-
+}  // namespace
 
 template <typename Input>
-EIGEN_ALWAYS_INLINE
-static const TensorCustomUnaryOp<const SoftmaxOp, const Input>
-SoftMax(const Input& input, const float beta)
-{
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
+EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp<const SoftmaxOp,
+                                                     const Input>
+SoftMax(const Input& input, const float beta) {
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 2,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE);
 
   const SoftmaxOp op(beta);
   return input.customOp(op);
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_SOFTMAX_H_
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_SOFTMAX_H_
diff --git a/tensorflow/core/kernels/eigen_softmax_test.cc b/tensorflow/core/kernels/eigen_softmax_test.cc
index ba681d68ab0d416cd2c7bae9065df9b95638a3e8..7f985d71366487e0426e25e064764c196979b114 100644
--- a/tensorflow/core/kernels/eigen_softmax_test.cc
+++ b/tensorflow/core/kernels/eigen_softmax_test.cc
@@ -23,7 +23,7 @@ namespace {
 void EigenApprox(float a, float b) {
   ASSERT_TRUE(std::abs(a - b) <= std::min(std::abs(a), std::abs(b)) * 1e-3);
 }
-}
+}  // namespace
 
 TEST(EigenSoftmaxTest, Simple) {
   const int depth = 1024;
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h
index 7702f3e70a806a3edda48a8a86e3a65571e8ba7e..1acbe3a658070222e99ff874815db9a6b07d4565 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
@@ -877,29 +877,29 @@ struct gemm_pack_rhs<
 }  // end namespace internal
 
 /** SpatialConvolution
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Applies a 2D convolution over a multichannel input image.
-  *
-  * The input parameter is expected to be a tensor with a rank of 3 or more
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Applies a 2D convolution over a multichannel input image.
+ *
+ * The input parameter is expected to be a tensor with a rank of 3 or more
  * (channels, height, width, and optionally others)
-  * The kernel parameter is expected to be a 4D tensor (filters, channels,
+ * The kernel parameter is expected to be a 4D tensor (filters, channels,
  * kernel_height, kernel_width)
-  * The input and the kernel must both be in col-major layout. The result will
+ * The input and the kernel must both be in col-major layout. The result will
  * also be in col-major layout.
-  *
-  * If col_in_stride, row_in_stride > 1, then applies convolution with holes
+ *
+ * If col_in_stride, row_in_stride > 1, then applies convolution with holes
  * (aka atrous convolution), sampling every col_in_stride, row_in_stride input
  * pixels.
-  *
-  * The result can be assigned to a tensor of rank equal to the rank of the
+ *
+ * The result can be assigned to a tensor of rank equal to the rank of the
  * input. The dimensions of the result will be filters, height, width (and
  * others if applicable).
-  *
-  * It is possible to swap the order of the width and height dimensions provided
+ *
+ * It is possible to swap the order of the width and height dimensions provided
  * that the same order is used in the input, the kernel, and the output.
-  *
-  */
+ *
+ */
 template <typename Input, typename Kernel>
 EIGEN_DEVICE_FUNC
     EIGEN_ALWAYS_INLINE static const typename internal::conditional<
@@ -993,7 +993,7 @@ EIGEN_DEVICE_FUNC
     default:
       // Initialize unused variables to avoid a compiler warning
       out_height = 0;
-      out_width  = 0;
+      out_width = 0;
       eigen_assert(false && "unexpected padding");
   }
 
@@ -1069,4 +1069,4 @@ EIGEN_DEVICE_FUNC
 
 }  // end namespace Eigen
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_
diff --git a/tensorflow/core/kernels/eigen_volume_patch.h b/tensorflow/core/kernels/eigen_volume_patch.h
index afd5f37e352a5b5e4c2f77666bc3b18be914b1b2..a3d795813de19c9571ffeec705a6e4cb19f6b641 100644
--- a/tensorflow/core/kernels/eigen_volume_patch.h
+++ b/tensorflow/core/kernels/eigen_volume_patch.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_VOLUME_PATCH_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_VOLUME_PATCH_H_
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_VOLUME_PATCH_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_VOLUME_PATCH_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
@@ -653,4 +653,4 @@ OVERRIDE_EVALUATOR(Eigen::DefaultDevice);
 
 };  // namespace Eigen
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EIGEN_VOLUME_PATCH_H_
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_VOLUME_PATCH_H_
diff --git a/tensorflow/core/kernels/encode_jpeg_op.cc b/tensorflow/core/kernels/encode_jpeg_op.cc
index 4fcae25aa6eac8b31f78e1d5ae964aed427fc0f4..1a5b0f2b675a85ba2c1dbf0356c3e42b03db22b4 100644
--- a/tensorflow/core/kernels/encode_jpeg_op.cc
+++ b/tensorflow/core/kernels/encode_jpeg_op.cc
@@ -80,10 +80,11 @@ class EncodeJpegOp : public OpKernel {
                 errors::InvalidArgument("image must be 3-dimensional",
                                         image.shape().DebugString()));
 
-    OP_REQUIRES(context, FastBoundsCheck(image.NumElements(),
-                                         std::numeric_limits<int32>::max()),
-                errors::InvalidArgument(
-                    "Cannot encode images with >= max int32 elements"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(image.NumElements(), std::numeric_limits<int32>::max()),
+        errors::InvalidArgument(
+            "Cannot encode images with >= max int32 elements"));
 
     const int32 dim_size0 = static_cast<int32>(image.dim_size(0));
     const int32 dim_size1 = static_cast<int32>(image.dim_size(1));
@@ -100,9 +101,10 @@ class EncodeJpegOp : public OpKernel {
       } else if (channels == 3) {
         adjusted_flags.format = jpeg::FORMAT_RGB;
       } else {
-        OP_REQUIRES(context, false, errors::InvalidArgument(
-                                        "image must have 1 or 3 channels, got ",
-                                        image.shape().DebugString()));
+        OP_REQUIRES(
+            context, false,
+            errors::InvalidArgument("image must have 1 or 3 channels, got ",
+                                    image.shape().DebugString()));
       }
     } else {
       if (flags_.format == jpeg::FORMAT_GRAYSCALE) {
diff --git a/tensorflow/core/kernels/encode_wav_op_test.cc b/tensorflow/core/kernels/encode_wav_op_test.cc
index 34138ac9a04fc7d233a3ec30383cb0b96c0126e6..b3c61e2c995b01dbea9c1080cb353d6108d87672 100644
--- a/tensorflow/core/kernels/encode_wav_op_test.cc
+++ b/tensorflow/core/kernels/encode_wav_op_test.cc
@@ -31,8 +31,8 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
-
-using namespace ops;  // NOLINT(build/namespaces)
+namespace ops {
+namespace {
 
 TEST(EncodeWavOpTest, EncodeWavTest) {
   Scope root = Scope::DisabledShapeInferenceScope();
@@ -77,4 +77,6 @@ TEST(EncodeWavOpTest, EncodeWavTest) {
   EXPECT_EQ(44100, sample_rate);
 }
 
+}  // namespace
+}  // namespace ops
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/example_parsing_ops.cc b/tensorflow/core/kernels/example_parsing_ops.cc
index 2db844e410cea679291aec67748ed15297a0e36a..83cd0e9b47e5480cd562452213aa81c7a4a64a95 100644
--- a/tensorflow/core/kernels/example_parsing_ops.cc
+++ b/tensorflow/core/kernels/example_parsing_ops.cc
@@ -34,9 +34,9 @@ limitations under the License.
 
 namespace tensorflow {
 
-class ExampleParserOp : public OpKernel {
+class ParseExampleOp : public OpKernel {
  public:
-  explicit ExampleParserOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+  explicit ParseExampleOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, attrs_.Init(ctx));
   }
 
@@ -162,11 +162,107 @@ class ExampleParserOp : public OpKernel {
   }
 
  protected:
-  ParseSingleExampleAttrs attrs_;
+  ParseExampleAttrs attrs_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("ParseExample").Device(DEVICE_CPU),
-                        ExampleParserOp);
+                        ParseExampleOp);
+
+class ParseSingleExampleOp : public OpKernel {
+ public:
+  explicit ParseSingleExampleOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, attrs_.Init(ctx));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* serialized;
+    OpInputList dense_defaults;
+
+    // Grab the input list arguments.
+    OP_REQUIRES_OK(ctx, ctx->input("serialized", &serialized));
+    OP_REQUIRES_OK(ctx, ctx->input_list("dense_defaults", &dense_defaults));
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(serialized->shape()),
+                errors::InvalidArgument(
+                    "Expected serialized to be a scalar, got shape: ",
+                    serialized->shape().DebugString()));
+    OP_REQUIRES(ctx, dense_defaults.size() == attrs_.dense_keys.size(),
+                errors::InvalidArgument(
+                    "Expected len(dense_defaults) == len(dense_keys) but got: ",
+                    dense_defaults.size(), " vs. ", attrs_.dense_keys.size()));
+
+    for (size_t d = 0; d < attrs_.dense_keys.size(); ++d) {
+      const Tensor& def_value = dense_defaults[d];
+      if (attrs_.variable_length[d]) {
+        OP_REQUIRES(ctx, def_value.NumElements() == 1,
+                    errors::InvalidArgument(
+                        "dense_shape[", d, "] is a variable length shape: ",
+                        attrs_.dense_shapes[d].DebugString(),
+                        ", therefore "
+                        "def_value[",
+                        d,
+                        "] must contain a single element ("
+                        "the padding element).  But its shape is: ",
+                        def_value.shape().DebugString()));
+      } else if (def_value.NumElements() > 0) {
+        OP_REQUIRES(ctx,
+                    attrs_.dense_shapes[d].IsCompatibleWith(def_value.shape()),
+                    errors::InvalidArgument(
+                        "def_value[", d,
+                        "].shape() == ", def_value.shape().DebugString(),
+                        " is not compatible with dense_shapes_[", d,
+                        "] == ", attrs_.dense_shapes[d].DebugString()));
+      }
+      OP_REQUIRES(ctx, def_value.dtype() == attrs_.dense_types[d],
+                  errors::InvalidArgument(
+                      "dense_defaults[", d, "].dtype() == ",
+                      DataTypeString(def_value.dtype()), " != dense_types_[", d,
+                      "] == ", DataTypeString(attrs_.dense_types[d])));
+    }
+
+    example::Result result;
+
+    // TODO(mrry): Build the configuration once and cache it.
+    example::FastParseExampleConfig config;
+    for (int d = 0; d < attrs_.dense_keys.size(); ++d) {
+      config.dense.push_back({attrs_.dense_keys[d], attrs_.dense_types[d],
+                              attrs_.dense_shapes[d], dense_defaults[d],
+                              attrs_.variable_length[d],
+                              attrs_.elements_per_stride[d]});
+    }
+    for (int d = 0; d < attrs_.sparse_keys.size(); ++d) {
+      config.sparse.push_back({attrs_.sparse_keys[d], attrs_.sparse_types[d]});
+    }
+
+    const string& serialized_proto = serialized->scalar<string>()();
+
+    OP_REQUIRES_OK(ctx,
+                   FastParseSingleExample(config, serialized_proto, &result));
+
+    OpOutputList dense_values;
+    OpOutputList sparse_indices;
+    OpOutputList sparse_values;
+    OpOutputList sparse_shapes;
+    OP_REQUIRES_OK(ctx, ctx->output_list("dense_values", &dense_values));
+    OP_REQUIRES_OK(ctx, ctx->output_list("sparse_indices", &sparse_indices));
+    OP_REQUIRES_OK(ctx, ctx->output_list("sparse_values", &sparse_values));
+    OP_REQUIRES_OK(ctx, ctx->output_list("sparse_shapes", &sparse_shapes));
+    for (int d = 0; d < attrs_.dense_keys.size(); ++d) {
+      dense_values.set(d, result.dense_values[d]);
+    }
+    for (int d = 0; d < attrs_.sparse_keys.size(); ++d) {
+      sparse_indices.set(d, result.sparse_indices[d]);
+      sparse_values.set(d, result.sparse_values[d]);
+      sparse_shapes.set(d, result.sparse_shapes[d]);
+    }
+  }
+
+ protected:
+  ParseSingleExampleAttrs attrs_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("ParseSingleExample").Device(DEVICE_CPU),
+                        ParseSingleExampleOp);
 
 class SingleSequenceExampleParserOp : public OpKernel {
  public:
@@ -250,8 +346,9 @@ class SingleSequenceExampleParserOp : public OpKernel {
           feature_list_sparse_keys[di].scalar<string>()();
     }
     OP_REQUIRES(
-        ctx, TensorShapeUtils::IsVector(
-                 feature_list_dense_missing_assumed_empty->shape()),
+        ctx,
+        TensorShapeUtils::IsVector(
+            feature_list_dense_missing_assumed_empty->shape()),
         errors::InvalidArgument(
             "Expected feature_list_dense_missing_assumed_empty ",
             "to be a vector, got shape: ",
@@ -290,12 +387,12 @@ class SingleSequenceExampleParserOp : public OpKernel {
       required[d] = (def_value.NumElements() == 0);  // No default provided.
 
       if (def_value.NumElements() > 0) {
-        OP_REQUIRES(
-            ctx, def_value.shape() == attrs_.context_dense_shapes[d],
-            errors::InvalidArgument(
-                "def_value[", d, "].shape() == ",
-                def_value.shape().DebugString(), " != context_dense_shapes_[",
-                d, "] == ", attrs_.context_dense_shapes[d].DebugString()));
+        OP_REQUIRES(ctx, def_value.shape() == attrs_.context_dense_shapes[d],
+                    errors::InvalidArgument(
+                        "def_value[", d,
+                        "].shape() == ", def_value.shape().DebugString(),
+                        " != context_dense_shapes_[", d,
+                        "] == ", attrs_.context_dense_shapes[d].DebugString()));
         OP_REQUIRES(
             ctx, def_value.dtype() == attrs_.context_dense_types[d],
             errors::InvalidArgument(
@@ -480,12 +577,12 @@ class SingleSequenceExampleParserOp : public OpKernel {
         const Feature& f = fl.feature(t);
         bool types_match;
         OP_REQUIRES_OK(ctx, CheckTypesMatch(f, dtype, &types_match));
-        OP_REQUIRES(
-            ctx, types_match,
-            errors::InvalidArgument(
-                "Name: ", name, ", Feature list: ", key, ", Index: ", t,
-                ".  Data types don't match. ", "Expected type: ",
-                DataTypeString(dtype), "  Feature is: ", ProtoDebugString(f)));
+        OP_REQUIRES(ctx, types_match,
+                    errors::InvalidArgument(
+                        "Name: ", name, ", Feature list: ", key, ", Index: ", t,
+                        ".  Data types don't match. ",
+                        "Expected type: ", DataTypeString(dtype),
+                        "  Feature is: ", ProtoDebugString(f)));
         OP_REQUIRES_OK(ctx, FeatureDenseCopy(t, name, key, dtype, shape, f,
                                              feature_list_dense_values[d]));
       }
diff --git a/tensorflow/core/kernels/example_parsing_ops_test.cc b/tensorflow/core/kernels/example_parsing_ops_test.cc
index 29dbfd3b1bdd07ba362094609c0965ffeb6f7225..5d06eda79e7544951ea7ee10179c8e76dcbb58af 100644
--- a/tensorflow/core/kernels/example_parsing_ops_test.cc
+++ b/tensorflow/core/kernels/example_parsing_ops_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <mutex>
 #include <unordered_map>
 
 #include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
@@ -80,6 +81,26 @@ class FloatFiller {
 
 template <typename T>
 struct ExampleStore {
+ private:
+  static ExampleTensorMap serialized_example;
+  static std::once_flag flags_init;
+
+ public:
+  static ExampleTensorMap& GetSerializedExample() {
+    std::call_once(flags_init, [] {
+      AddExample(&serialized_example, 10, 1, 1);
+      AddExample(&serialized_example, 100, 1, 1);
+      AddExample(&serialized_example, 1000, 1, 1);
+      AddExample(&serialized_example, 10, 128, 1);
+      AddExample(&serialized_example, 100, 128, 1);
+      AddExample(&serialized_example, 1000, 128, 1);
+      AddExample(&serialized_example, 10, 512, 1);
+      AddExample(&serialized_example, 100, 512, 1);
+      AddExample(&serialized_example, 1000, 512, 1);
+      AddExample(&serialized_example, 1, 1, 1000000);
+    });
+    return serialized_example;
+  }
   typedef T Filler;
   static void AddExample(ExampleTensorMap* examples, int num_keys,
                          int batch_size, int feature_size) {
@@ -101,31 +122,15 @@ struct ExampleStore {
     (*examples)[std::make_tuple(batch_size, num_keys, feature_size)] =
         record_string;
   }
-  static ExampleTensorMap GetSerializedExamples() {
-    ExampleTensorMap examples;
-    AddExample(&examples, 10, 128, 1);
-    AddExample(&examples, 100, 128, 1);
-    AddExample(&examples, 1000, 128, 1);
-    AddExample(&examples, 10, 512, 1);
-    AddExample(&examples, 100, 512, 1);
-    AddExample(&examples, 1000, 512, 1);
-    AddExample(&examples, 1, 1, 1000000);
-    return examples;
-  }
-  static ExampleTensorMap serialized_example;
 };
+template <typename T>
+ExampleTensorMap ExampleStore<T>::serialized_example;
+template <typename T>
+std::once_flag ExampleStore<T>::flags_init;
 
-template <>
-ExampleTensorMap ExampleStore<BytesFiller>::serialized_example =
-    ExampleStore<BytesFiller>::GetSerializedExamples();
-
-template <>
-ExampleTensorMap ExampleStore<Int64Filler>::serialized_example =
-    ExampleStore<Int64Filler>::GetSerializedExamples();
-
-template <>
-ExampleTensorMap ExampleStore<FloatFiller>::serialized_example =
-    ExampleStore<FloatFiller>::GetSerializedExamples();
+template class ExampleStore<BytesFiller>;
+template class ExampleStore<Int64Filler>;
+template class ExampleStore<FloatFiller>;
 
 enum BenchmarkType { kDense, kSparse, kVarLenDense };
 
@@ -139,7 +144,7 @@ struct BenchmarkOptions {
 template <typename Options>
 static Graph* ParseExample(int batch_size, int num_keys, int feature_size) {
   Graph* g = new Graph(OpRegistry::Global());
-  Tensor& serialized = Options::Store::serialized_example[std::make_tuple(
+  Tensor& serialized = Options::Store::GetSerializedExample()[std::make_tuple(
       batch_size, num_keys, feature_size)];
   Tensor names(DT_STRING, TensorShape({batch_size}));
 
@@ -186,6 +191,56 @@ static Graph* ParseExample(int batch_size, int num_keys, int feature_size) {
   return g;
 }
 
+template <typename Options>
+static Graph* ParseSingleExample(int num_keys, int feature_size) {
+  Graph* g = new Graph(OpRegistry::Global());
+  Tensor& serialized_batch_1 =
+      Options::Store::GetSerializedExample()[std::make_tuple(1, num_keys,
+                                                             feature_size)];
+  Tensor serialized(DT_STRING, TensorShape());
+  serialized.scalar<string>()() = serialized_batch_1.vec<string>()(0);
+
+  std::vector<string> sparse_keys;
+  std::vector<string> dense_keys;
+  std::vector<NodeBuilder::NodeOut> dense_defaults;
+  std::vector<DataType> sparse_types;
+  std::vector<PartialTensorShape> dense_shapes;
+  Options opt;
+  for (int i = 0; i < num_keys; ++i) {
+    string key = strings::Printf("feature_%d", i);
+    switch (opt.benchmark_type) {
+      case kDense:
+        dense_keys.push_back(key),
+            dense_defaults.emplace_back(test::graph::Constant(
+                g, opt.filler.make_dense_default(feature_size)));
+        dense_shapes.push_back(PartialTensorShape({feature_size}));
+        break;
+      case kVarLenDense:
+        dense_keys.push_back(key),
+            dense_defaults.emplace_back(
+                test::graph::Constant(g, opt.filler.make_dense_default(1)));
+        dense_shapes.push_back(PartialTensorShape({-1}));
+        break;
+      case kSparse:
+        sparse_keys.push_back(key), sparse_types.push_back(opt.filler.dtype);
+        break;
+    }
+  }
+
+  Node* ret;
+  TF_EXPECT_OK(NodeBuilder(g->NewName("n"), "ParseSingleExample")
+                   .Input(test::graph::Constant(g, serialized))
+                   .Input(dense_defaults)
+                   .Attr<int64>("num_sparse", sparse_keys.size())
+                   .Attr("sparse_keys", sparse_keys)
+                   .Attr("sparse_types", sparse_types)
+                   .Attr("dense_keys", dense_keys)
+                   .Attr("dense_shapes", dense_shapes)
+                   .Finalize(g, &ret));
+
+  return g;
+}
+
 // Benchmark settings (Sparse, Dense) X (Bytes, Int64, Float)
 typedef BenchmarkOptions<ExampleStore<BytesFiller>, kSparse> SparseString;
 typedef BenchmarkOptions<ExampleStore<BytesFiller>, kDense> DenseString;
@@ -212,10 +267,13 @@ typedef BenchmarkOptions<ExampleStore<FloatFiller>, kVarLenDense>
   BENCHMARK(BM_ParseExample##_##TYPE##_##B##_##K##_##F);
 
 #define BM_AllParseExample(Type)       \
+  BM_ParseExample(Type, 1, 10, 1);     \
   BM_ParseExample(Type, 128, 10, 1);   \
   BM_ParseExample(Type, 512, 10, 1);   \
+  BM_ParseExample(Type, 1, 100, 1);    \
   BM_ParseExample(Type, 128, 100, 1);  \
   BM_ParseExample(Type, 512, 100, 1);  \
+  BM_ParseExample(Type, 1, 1000, 1);   \
   BM_ParseExample(Type, 128, 1000, 1); \
   BM_ParseExample(Type, 512, 1000, 1); \
   BM_ParseExample(Type, 1, 1, 1000000);
@@ -230,4 +288,31 @@ BM_AllParseExample(SparseFloat);
 BM_AllParseExample(DenseFloat);
 BM_AllParseExample(VarLenDenseFloat);
 
+// K == num_keys. F == feature_size.
+// K must be one of 10, 100, 1000
+#define BM_ParseSingleExample(TYPE, K, F)                                \
+  static void BM_ParseSingleExample##_##TYPE##_1_##K##_##F(int iters) {  \
+    int64 items_per_iter = K * F;                                        \
+    testing::UseRealTime();                                              \
+    testing::ItemsProcessed(static_cast<int64>(iters) * items_per_iter); \
+    test::Benchmark("cpu", ParseSingleExample<TYPE>(K, F)).Run(iters);   \
+  }                                                                      \
+  BENCHMARK(BM_ParseSingleExample##_##TYPE##_1_##K##_##F);
+
+#define BM_AllParseSingleExample(Type)  \
+  BM_ParseSingleExample(Type, 10, 1);   \
+  BM_ParseSingleExample(Type, 100, 1);  \
+  BM_ParseSingleExample(Type, 1000, 1); \
+  BM_ParseSingleExample(Type, 1, 1000000);
+
+BM_AllParseSingleExample(SparseString);
+BM_AllParseSingleExample(DenseString);
+BM_AllParseSingleExample(VarLenDenseString);
+BM_AllParseSingleExample(SparseInt64);
+BM_AllParseSingleExample(DenseInt64);
+BM_AllParseSingleExample(VarLenDenseInt64);
+BM_AllParseSingleExample(SparseFloat);
+BM_AllParseSingleExample(DenseFloat);
+BM_AllParseSingleExample(VarLenDenseFloat);
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/eye_functor.h b/tensorflow/core/kernels/eye_functor.h
index 70f093f81366e017f3a07614e319435e1bf5aca2..3799cfba9aea54a603af56c5ade9197f53f96dd1 100644
--- a/tensorflow/core/kernels/eye_functor.h
+++ b/tensorflow/core/kernels/eye_functor.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EYE_FUNCTOR_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EYE_FUNCTOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_EYE_FUNCTOR_H_
+#define TENSORFLOW_CORE_KERNELS_EYE_FUNCTOR_H_
 
 #include "tensorflow/core/framework/tensor_types.h"
 
@@ -29,4 +29,4 @@ struct EyeFunctor {
 }  // namespace functor
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_EYE_FUNCTOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_EYE_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/fact_op.cc b/tensorflow/core/kernels/fact_op.cc
index 4fbf76d2d0d0470c0529353003eb7e086451d57f..4a1aa433bc94e5f190ce75c1b991eaf91210eedf 100644
--- a/tensorflow/core/kernels/fact_op.cc
+++ b/tensorflow/core/kernels/fact_op.cc
@@ -122,13 +122,9 @@ static string D(const char* s) {
   return ret;
 }
 
-REGISTER_KERNEL_BUILDER(Name("Fact")
-                            .Device(DEVICE_CPU)
-                            .Label(D("Yoxmos").c_str()),
-                        FactOpKernel2);
-REGISTER_KERNEL_BUILDER(Name("Fact")
-                            .Device(DEVICE_CPU)
-                            .Label(D("yoxmos").c_str()),
-                        FactOpKernel2);
+REGISTER_KERNEL_BUILDER(
+    Name("Fact").Device(DEVICE_CPU).Label(D("Yoxmos").c_str()), FactOpKernel2);
+REGISTER_KERNEL_BUILDER(
+    Name("Fact").Device(DEVICE_CPU).Label(D("yoxmos").c_str()), FactOpKernel2);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/fake_quant_ops.cc b/tensorflow/core/kernels/fake_quant_ops.cc
index 68762af8cf1e76211c0229163d9dce44fc0ad153..f5e279eca4c6d3492419a507c7d070613e169b64 100644
--- a/tensorflow/core/kernels/fake_quant_ops.cc
+++ b/tensorflow/core/kernels/fake_quant_ops.cc
@@ -45,7 +45,7 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
 namespace {
-bool IsNumBitsValid(int num_bits) { return num_bits >= 2 && num_bits <= 8; }
+bool IsNumBitsValid(int num_bits) { return num_bits >= 2 && num_bits <= 16; }
 }  // namespace
 
 // -----------------------------------------------------------------------------
@@ -65,8 +65,9 @@ class FakeQuantWithMinMaxArgsOp
                                 " >= ", max_));
     int num_bits;
     OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
-    OP_REQUIRES(context, IsNumBitsValid(num_bits),
-                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    OP_REQUIRES(
+        context, IsNumBitsValid(num_bits),
+        InvalidArgument("num_bits must be between 2 and 16, inclusive"));
     bool narrow_range;
     OP_REQUIRES_OK(context, context->GetAttr("narrow_range", &narrow_range));
     quant_min_ = narrow_range ? 1 : 0;
@@ -104,8 +105,9 @@ class FakeQuantWithMinMaxArgsGradientOp
                                 " >= ", max_));
     int num_bits;
     OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
-    OP_REQUIRES(context, IsNumBitsValid(num_bits),
-                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    OP_REQUIRES(
+        context, IsNumBitsValid(num_bits),
+        InvalidArgument("num_bits must be between 2 and 16, inclusive"));
     bool narrow_range;
     OP_REQUIRES_OK(context, context->GetAttr("narrow_range", &narrow_range));
     quant_min_ = narrow_range ? 1 : 0;
@@ -175,8 +177,9 @@ class FakeQuantWithMinMaxVarsOp : public OpKernel {
       : OpKernel::OpKernel(context) {
     int num_bits;
     OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
-    OP_REQUIRES(context, IsNumBitsValid(num_bits),
-                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    OP_REQUIRES(
+        context, IsNumBitsValid(num_bits),
+        InvalidArgument("num_bits must be between 2 and 16, inclusive"));
     bool narrow_range;
     OP_REQUIRES_OK(context, context->GetAttr("narrow_range", &narrow_range));
     quant_min_ = narrow_range ? 1 : 0;
@@ -213,8 +216,9 @@ class FakeQuantWithMinMaxVarsGradientOp : public OpKernel {
       : OpKernel::OpKernel(context) {
     int num_bits;
     OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
-    OP_REQUIRES(context, IsNumBitsValid(num_bits),
-                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    OP_REQUIRES(
+        context, IsNumBitsValid(num_bits),
+        InvalidArgument("num_bits must be between 2 and 16, inclusive"));
     bool narrow_range;
     OP_REQUIRES_OK(context, context->GetAttr("narrow_range", &narrow_range));
     quant_min_ = narrow_range ? 1 : 0;
@@ -302,8 +306,9 @@ class FakeQuantWithMinMaxVarsPerChannelOp : public OpKernel {
       : OpKernel::OpKernel(context) {
     int num_bits;
     OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
-    OP_REQUIRES(context, IsNumBitsValid(num_bits),
-                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    OP_REQUIRES(
+        context, IsNumBitsValid(num_bits),
+        InvalidArgument("num_bits must be between 2 and 16, inclusive"));
     bool narrow_range;
     OP_REQUIRES_OK(context, context->GetAttr("narrow_range", &narrow_range));
     quant_min_ = narrow_range ? 1 : 0;
@@ -348,8 +353,9 @@ class FakeQuantWithMinMaxVarsPerChannelGradientOp : public OpKernel {
       : OpKernel::OpKernel(context) {
     int num_bits;
     OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
-    OP_REQUIRES(context, IsNumBitsValid(num_bits),
-                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    OP_REQUIRES(
+        context, IsNumBitsValid(num_bits),
+        InvalidArgument("num_bits must be between 2 and 16, inclusive"));
     bool narrow_range;
     OP_REQUIRES_OK(context, context->GetAttr("narrow_range", &narrow_range));
     quant_min_ = narrow_range ? 1 : 0;
diff --git a/tensorflow/core/kernels/fake_quant_ops_functor.h b/tensorflow/core/kernels/fake_quant_ops_functor.h
index 7aaad6e6c7a48617d1a6cbc679eebc2297828f75..d51acc38ef7e5a865f51ac319a3ad16198714dd9 100644
--- a/tensorflow/core/kernels/fake_quant_ops_functor.h
+++ b/tensorflow/core/kernels/fake_quant_ops_functor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_FAKE_QUANT_FUNCTOR_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_FAKE_QUANT_FUNCTOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_FAKE_QUANT_FUNCTOR_H_
+#define TENSORFLOW_CORE_KERNELS_FAKE_QUANT_FUNCTOR_H_
 
 #include <tuple>
 
@@ -45,16 +45,16 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void Nudge(
   const float quant_max_float = static_cast<float>(quant_max);
   *scale = (max - min) / (quant_max_float - quant_min_float);
   const float zero_point_from_min = quant_min_float - min / *scale;
-  const uint8 nudged_zero_point = [zero_point_from_min, quant_min,
-                                   quant_min_float, quant_max,
-                                   quant_max_float] {
+  const uint16 nudged_zero_point = [zero_point_from_min, quant_min,
+                                    quant_min_float, quant_max,
+                                    quant_max_float] {
     if (zero_point_from_min < quant_min_float) {
-      return static_cast<uint8>(quant_min);
+      return static_cast<uint16>(quant_min);
     }
     if (zero_point_from_min > quant_max_float) {
-      return static_cast<uint8>(quant_max);
+      return static_cast<uint16>(quant_max);
     }
-    return static_cast<uint8>(StdRound(zero_point_from_min));
+    return static_cast<uint16>(StdRound(zero_point_from_min));
   }();
   *nudged_min = (quant_min_float - nudged_zero_point) * (*scale);
   *nudged_max = (quant_max_float - nudged_zero_point) * (*scale);
@@ -277,4 +277,4 @@ struct FakeQuantWithMinMaxVarsPerChannelGradientFunctor {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_FAKE_QUANT_FUNCTOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_FAKE_QUANT_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/fake_quant_ops_test.cc b/tensorflow/core/kernels/fake_quant_ops_test.cc
index 5953db14768fd4e8d6c8537a2bea91c2ca211b17..af3a42135d1fe99da87c1cfafbc2b8eb932a7d2c 100644
--- a/tensorflow/core/kernels/fake_quant_ops_test.cc
+++ b/tensorflow/core/kernels/fake_quant_ops_test.cc
@@ -378,9 +378,8 @@ TEST_F(QuantOpsTest, WithArgsGradient_RegularRange) {
   Tensor* output = GetOutput(0);
   auto input_flat = GetInput(0).flat<float>();
   Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
-  FillValues<float>(&expected,
-                    {0.0f, input_flat(1), input_flat(2),
-                     input_flat(3), input_flat(4), 0.0f});
+  FillValues<float>(&expected, {0.0f, input_flat(1), input_flat(2),
+                                input_flat(3), input_flat(4), 0.0f});
   ExpectClose(expected, *output);
 }
 
@@ -2167,21 +2166,19 @@ TEST_F(QuantOpsTest,
   Tensor* output_bprop_wrt_input = GetOutput(0);
   Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
   auto grad_flat = GetInput(0).flat<float>();
-  FillValues<float>(&expected_bprop_wrt_input,
-                    {0.0f, grad_flat(1), grad_flat(2),
-                     grad_flat(3), grad_flat(4), 0.0f});
+  FillValues<float>(
+      &expected_bprop_wrt_input,
+      {0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f});
   ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
 
   Tensor* output_bprop_wrt_min = GetOutput(1);
   Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3}));
-  FillValues<float>(&expected_bprop_wrt_min,
-                    {grad_flat(0), 0.0f, 0.0f});
+  FillValues<float>(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f});
   ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
 
   Tensor* output_bprop_wrt_max = GetOutput(2);
   Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3}));
-  FillValues<float>(&expected_bprop_wrt_max,
-                    {0.0f, 0.0f, grad_flat(5)});
+  FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)});
   ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
@@ -2215,21 +2212,19 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedUp_4Bits_NarrowRange) {
   Tensor* output_bprop_wrt_input = GetOutput(0);
   Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
   auto grad_flat = GetInput(0).flat<float>();
-  FillValues<float>(&expected_bprop_wrt_input,
-                    {0.0f, grad_flat(1), grad_flat(2),
-                     grad_flat(3), grad_flat(4), 0.0f});
+  FillValues<float>(
+      &expected_bprop_wrt_input,
+      {0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f});
   ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
 
   Tensor* output_bprop_wrt_min = GetOutput(1);
   Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3}));
-  FillValues<float>(&expected_bprop_wrt_min,
-                    {grad_flat(0), 0.0f, 0.0f});
+  FillValues<float>(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f});
   ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
 
   Tensor* output_bprop_wrt_max = GetOutput(2);
   Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3}));
-  FillValues<float>(&expected_bprop_wrt_max,
-                    {0.0f, 0.0f, grad_flat(5)});
+  FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)});
   ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
@@ -2270,14 +2265,13 @@ TEST_F(QuantOpsTest,
   Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT,
                                   TensorShape({1, 2, 3, 4}));
   auto grad_flat = GetInput(0).flat<float>();
-  FillValues<float>(
-      &expected_bprop_wrt_input,
-      {0.0f, grad_flat(1), grad_flat(2), 0.0f,
-       0.0f, grad_flat(5), grad_flat(6), 0.0f,
-       0.0f, grad_flat(9), grad_flat(10), 0.0f,
-       0.0f, grad_flat(13), grad_flat(14), 0.0f,
-       0.0f, grad_flat(17), grad_flat(18), 0.0f,
-       0.0f, grad_flat(21), grad_flat(22), 0.0f});
+  FillValues<float>(&expected_bprop_wrt_input,
+                    {0.0f, grad_flat(1),  grad_flat(2),  0.0f,
+                     0.0f, grad_flat(5),  grad_flat(6),  0.0f,
+                     0.0f, grad_flat(9),  grad_flat(10), 0.0f,
+                     0.0f, grad_flat(13), grad_flat(14), 0.0f,
+                     0.0f, grad_flat(17), grad_flat(18), 0.0f,
+                     0.0f, grad_flat(21), grad_flat(22), 0.0f});
   ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
 
   Tensor* output_bprop_wrt_min = GetOutput(1);
diff --git a/tensorflow/core/kernels/fifo_queue.cc b/tensorflow/core/kernels/fifo_queue.cc
index 9fd82e2168383917b9002d43a894a2da064c2a34..479f7be4b506e4f8721216fb00ea0eff7e0394c2 100644
--- a/tensorflow/core/kernels/fifo_queue.cc
+++ b/tensorflow/core/kernels/fifo_queue.cc
@@ -95,7 +95,7 @@ Status FIFOQueue::GetElementComponentFromBatch(const FIFOQueue::Tuple& tuple,
   TF_RETURN_IF_ERROR(ctx->allocate_persistent(
       tuple[component].dtype(), element_shape, out_tensor, &element_access));
   TF_RETURN_IF_ERROR(
-      CopySliceToElement(tuple[component], element_access, index));
+      batch_util::CopySliceToElement(tuple[component], element_access, index));
   return Status::OK();
 }
 
@@ -255,97 +255,96 @@ void FIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
       // TODO(josh11b): This makes two copies of callback, avoid this if possible.
       dequeue_attempts_.emplace_back(
           num_elements, [callback]() { callback(Tuple()); }, ctx, cm, token,
-          [callback, allow_small_batch, this](Attempt* attempt)
-              EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-                int64 queue_size = queues_[0].size();
+          [callback, allow_small_batch,
+           this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+            int64 queue_size = queues_[0].size();
 
-                if (closed_ && queue_size < attempt->elements_requested) {
-                  // If we don't have enough for a full dequeue, we have
-                  // to reset the attempt tuple.
-                  if (!attempt->tuple.empty()) {
-                    // Restore already-dequeued elements to the front of the
-                    // queue.
-                    for (int64 i = attempt->tuple[0].dim_size(0) -
-                                   attempt->elements_requested - 1;
-                         i >= 0; --i) {
-                      for (int j = 0; j < num_components(); ++j) {
-                        PersistentTensor element;
-                        Status s = GetElementComponentFromBatch(
-                            attempt->tuple, i, j, attempt->context, &element);
-                        if (!s.ok()) {
-                          attempt->context->SetStatus(
-                              errors::DataLoss("Failed to restore element from "
-                                               "partially-dequeued batch "
-                                               "to FIFOQueue: ",
-                                               s.error_message()));
-                        }
-                        queues_[j].push_front(element);
-                      }
-                    }
-                  }
-                  if (allow_small_batch && !queues_[0].empty()) {
-                    // Request all remaining elements in the queue.
-                    queue_size = queues_[0].size();
-                    attempt->tuple.clear();
-                    attempt->elements_requested = queue_size;
-                  } else {
-                    if (allow_small_batch) {
-                      // There may be some other attempts containing
-                      // values.  If so, we'll yield and wait for them
-                      // to add elements to the queue.
-                      if (!enqueue_attempts_.empty()) return kProgress;
-                    }
-                    if (attempt->context->status().ok()) {
-                      attempt->context->SetStatus(errors::OutOfRange(
-                          "FIFOQueue '", name_, "' is closed and has ",
-                          "insufficient elements (requested ",
-                          attempt->elements_requested, ", current size ",
-                          queue_size, ")"));
+            if (closed_ && queue_size < attempt->elements_requested) {
+              // If we don't have enough for a full dequeue, we have
+              // to reset the attempt tuple.
+              if (!attempt->tuple.empty()) {
+                // Restore already-dequeued elements to the front of the
+                // queue.
+                for (int64 i = attempt->tuple[0].dim_size(0) -
+                               attempt->elements_requested - 1;
+                     i >= 0; --i) {
+                  for (int j = 0; j < num_components(); ++j) {
+                    PersistentTensor element;
+                    Status s = GetElementComponentFromBatch(
+                        attempt->tuple, i, j, attempt->context, &element);
+                    if (!s.ok()) {
+                      attempt->context->SetStatus(
+                          errors::DataLoss("Failed to restore element from "
+                                           "partially-dequeued batch "
+                                           "to FIFOQueue: ",
+                                           s.error_message()));
                     }
-                    return kComplete;
+                    queues_[j].push_front(element);
                   }
                 }
+              }
+              if (allow_small_batch && !queues_[0].empty()) {
+                // Request all remaining elements in the queue.
+                queue_size = queues_[0].size();
+                attempt->tuple.clear();
+                attempt->elements_requested = queue_size;
+              } else {
+                if (allow_small_batch) {
+                  // There may be some other attempts containing
+                  // values.  If so, we'll yield and wait for them
+                  // to add elements to the queue.
+                  if (!enqueue_attempts_.empty()) return kProgress;
+                }
+                if (attempt->context->status().ok()) {
+                  attempt->context->SetStatus(errors::OutOfRange(
+                      "FIFOQueue '", name_, "' is closed and has ",
+                      "insufficient elements (requested ",
+                      attempt->elements_requested, ", current size ",
+                      queue_size, ")"));
+                }
+                return kComplete;
+              }
+            }
 
-                RunResult result = kNoProgress;
-                for (; queue_size > 0; --queue_size) {
-                  if (attempt->tuple.empty()) {
-                    // Only allocate tuple when we have something to dequeue
-                    // so we don't use excessive memory when there are many
-                    // blocked dequeue attempts waiting.
-                    attempt->tuple.reserve(num_components());
-                    for (int i = 0; i < num_components(); ++i) {
-                      const TensorShape shape =
-                          ManyOutShape(i, attempt->elements_requested);
-                      Tensor element;
-                      attempt->context->SetStatus(
-                          attempt->context->allocate_temp(component_dtypes_[i],
-                                                          shape, &element));
-                      if (!attempt->context->status().ok()) return kComplete;
-                      attempt->tuple.emplace_back(element);
-                    }
-                  }
-                  result = kProgress;
-                  Tuple tuple;
-                  DequeueLocked(attempt->context, &tuple);
-                  const int64 index = attempt->tuple[0].dim_size(0) -
-                                      attempt->elements_requested;
-                  for (int i = 0; i < num_components(); ++i) {
-                    attempt->context->SetStatus(batch_util::CopyElementToSlice(
-                        std::move(tuple[i]), &attempt->tuple[i], index));
-                    if (!attempt->context->status().ok()) return kComplete;
-                  }
-                  tuple.clear();
-                  --attempt->elements_requested;
-                  if (attempt->elements_requested == 0) {
-                    tuple = attempt->tuple;
-                    attempt->done_callback = [callback, tuple]() {
-                      callback(tuple);
-                    };
-                    return kComplete;
-                  }
+            RunResult result = kNoProgress;
+            for (; queue_size > 0; --queue_size) {
+              if (attempt->tuple.empty()) {
+                // Only allocate tuple when we have something to dequeue
+                // so we don't use excessive memory when there are many
+                // blocked dequeue attempts waiting.
+                attempt->tuple.reserve(num_components());
+                for (int i = 0; i < num_components(); ++i) {
+                  const TensorShape shape =
+                      ManyOutShape(i, attempt->elements_requested);
+                  Tensor element;
+                  attempt->context->SetStatus(attempt->context->allocate_temp(
+                      component_dtypes_[i], shape, &element));
+                  if (!attempt->context->status().ok()) return kComplete;
+                  attempt->tuple.emplace_back(element);
                 }
-                return result;
-              });
+              }
+              result = kProgress;
+              Tuple tuple;
+              DequeueLocked(attempt->context, &tuple);
+              const int64 index =
+                  attempt->tuple[0].dim_size(0) - attempt->elements_requested;
+              for (int i = 0; i < num_components(); ++i) {
+                attempt->context->SetStatus(batch_util::CopyElementToSlice(
+                    std::move(tuple[i]), &attempt->tuple[i], index));
+                if (!attempt->context->status().ok()) return kComplete;
+              }
+              tuple.clear();
+              --attempt->elements_requested;
+              if (attempt->elements_requested == 0) {
+                tuple = attempt->tuple;
+                attempt->done_callback = [callback, tuple]() {
+                  callback(tuple);
+                };
+                return kComplete;
+              }
+            }
+            return result;
+          });
     }
   }
   if (!already_cancelled) {
diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc
index ea0cc139f3da4fb7a5fc7d092cff6c5b0be792d5..7090417dfdb2d7e433025b1a0f1cdeb5eece10a8 100644
--- a/tensorflow/core/kernels/fill_functor.cc
+++ b/tensorflow/core/kernels/fill_functor.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/variant_encode_decode.h"
@@ -41,6 +42,7 @@ void SetZeroFunctor<Eigen::ThreadPoolDevice, string>::operator()(
   template struct SetZeroFunctor<Eigen::ThreadPoolDevice, T>;
 DEFINE_SETZERO_CPU(bool);
 DEFINE_SETZERO_CPU(Eigen::half);
+DEFINE_SETZERO_CPU(bfloat16);
 DEFINE_SETZERO_CPU(float);
 DEFINE_SETZERO_CPU(double);
 DEFINE_SETZERO_CPU(uint8);
@@ -58,7 +60,7 @@ DEFINE_SETZERO_CPU(Variant);
 template <typename T>
 void SetZeroFunctor<Eigen::SyclDevice, T>::operator()(
     const Eigen::SyclDevice& d, typename TTypes<T>::Flat out) {
-      To32Bit(out).device(d) = To32Bit(out).constant(T(0));
+  To32Bit(out).device(d) = To32Bit(out).constant(T(0));
 }
 
 #define DEFINE_SETZERO_SYCL(T) \
@@ -74,6 +76,7 @@ DEFINE_SETZERO_SYCL(int32);
 DEFINE_SETZERO_SYCL(int64);
 #undef DEFINE_SETZERO_SYCL
 #endif  // TENSORFLOW_USE_SYCL
+
 template <typename T>
 void SetOneFunctor<Eigen::ThreadPoolDevice, T>::operator()(
     const Eigen::ThreadPoolDevice& d, typename TTypes<T>::Flat out) {
@@ -85,6 +88,7 @@ void SetOneFunctor<Eigen::ThreadPoolDevice, T>::operator()(
   template struct SetOneFunctor<Eigen::ThreadPoolDevice, T>;
 DEFINE_SETONE_CPU(bool);
 DEFINE_SETONE_CPU(Eigen::half);
+DEFINE_SETONE_CPU(bfloat16);
 DEFINE_SETONE_CPU(float);
 DEFINE_SETONE_CPU(double);
 DEFINE_SETONE_CPU(uint8);
@@ -112,5 +116,47 @@ DEFINE_SETONE_SYCL(double);
 #undef DEFINE_SETONE_SYCL
 #endif  // TENSORFLOW_USE_SYCL
 
+template <typename T>
+struct FillFunctor<Eigen::ThreadPoolDevice, T> {
+  void operator()(const Eigen::ThreadPoolDevice& d,
+                  typename TTypes<T>::Flat out,
+                  typename TTypes<T>::ConstScalar in) {
+    out.device(d) = out.constant(in());
+  }
+};
+
+// Explicit instantiations.
+#define DEFINE_FILL_CPU(T) \
+  template struct FillFunctor<Eigen::ThreadPoolDevice, T>;
+
+TF_CALL_ALL_TYPES(DEFINE_FILL_CPU);
+DEFINE_FILL_CPU(quint8);
+DEFINE_FILL_CPU(quint16);
+#undef DEFINE_FILL_CPU
+
+#ifdef TENSORFLOW_USE_SYCL
+template <typename T>
+struct FillFunctor<Eigen::SyclDevice, T> {
+  void operator()(const Eigen::SyclDevice& d, typename TTypes<T>::Flat out,
+                  typename TTypes<T>::ConstScalar in) {
+#if !defined(EIGEN_HAS_INDEX_LIST)
+    Eigen::array<int, 1> rank1{1};
+#else
+    Eigen::IndexList<Eigen::type2index<1> > rank1;
+#endif
+    const int size = out.dimension(0);
+    Eigen::array<int, 1> broadcast_dims{size};
+
+    To32Bit(out).device(d) = in.reshape(rank1).broadcast(broadcast_dims);
+  }
+};
+
+#define DEFINE_FILL_SYCL(T) template struct FillFunctor<Eigen::SyclDevice, T>;
+DEFINE_FILL_SYCL(float);
+DEFINE_FILL_SYCL(double);
+TF_CALL_INTEGRAL_TYPES(DEFINE_FILL_SYCL)
+#undef DEFINE_FILL_SYCL
+#endif  // TENSORFLOW_USE_SYCL
+
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/fill_functor.cu.cc b/tensorflow/core/kernels/fill_functor.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3487606778eabde386335f8450d627b7bf74ad42
--- /dev/null
+++ b/tensorflow/core/kernels/fill_functor.cu.cc
@@ -0,0 +1,112 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/fill_functor.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace Eigen {
+namespace internal {
+
+template <typename T>
+struct scalar_const_op {
+  typedef typename packet_traits<T>::type Packet;
+
+  const T* val;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  scalar_const_op(const scalar_const_op& x)
+      : val(x.val) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_const_op(const T* v) : val(v) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()() const {
+    return *val;
+  }
+
+  template <typename PacketType = Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetOp() const {
+    return internal::pset1<PacketType>(*val);
+  }
+};
+
+template <typename T>
+struct functor_traits<scalar_const_op<T> > {
+  enum {
+    Cost = 1,
+    PacketAccess = packet_traits<T>::Vectorizable,
+    IsRepeatable = true
+  };
+};
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+namespace tensorflow {
+
+namespace functor {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// Partial specialization FillFunctor<Device=GPUDevice, T>
+template <typename T>
+struct FillFunctor<GPUDevice, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::Flat out,
+                  typename TTypes<T>::ConstScalar in) {
+    Eigen::internal::scalar_const_op<T> f(in.data());
+    To32Bit(out).device(d) = To32Bit(out).nullaryExpr(f);
+  }
+};
+
+#define DEFINE_FILL_GPU(T) template struct FillFunctor<GPUDevice, T>;
+TF_CALL_REAL_NUMBER_TYPES(DEFINE_FILL_GPU);
+TF_CALL_bool(DEFINE_FILL_GPU);
+#undef DEFINE_FILL_GPU
+
+// Partial specialization of FillFunctor<Device=GPUDevice, T>.
+template <typename T>
+struct SetZeroFunctor<GPUDevice, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::Flat out) {
+    To32Bit(out).device(d) = To32Bit(out).constant(T(0));
+  }
+};
+
+#define DEFINE_SETZERO_GPU(T) template struct SetZeroFunctor<GPUDevice, T>;
+TF_CALL_NUMBER_TYPES(DEFINE_SETZERO_GPU);
+TF_CALL_bool(DEFINE_SETZERO_GPU);
+#undef DEFINE_SETZERO_GPU
+
+// Partial specialization of FillFunctor<Device=GPUDevice, T>.
+template <typename T>
+struct SetOneFunctor<GPUDevice, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::Flat out) {
+    To32Bit(out).device(d) = To32Bit(out).constant(T(1));
+  }
+};
+
+#define DEFINE_SETONE_GPU(T) template struct SetOneFunctor<GPUDevice, T>;
+TF_CALL_NUMBER_TYPES(DEFINE_SETONE_GPU);
+TF_CALL_bool(DEFINE_SETONE_GPU);
+#undef DEFINE_SETONE_GPU
+
+}  // end namespace functor
+}  // end namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/fractional_avg_pool_op.cc b/tensorflow/core/kernels/fractional_avg_pool_op.cc
index bfdb7b4a1e4cc9af9745896c5ff1341f00efdffe..135d0023458b1ef393ab0bc296dc07310347e7ff 100644
--- a/tensorflow/core/kernels/fractional_avg_pool_op.cc
+++ b/tensorflow/core/kernels/fractional_avg_pool_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/util/guarded_philox_random.h"
@@ -47,9 +48,20 @@ class FractionalAvgPoolOp : public OpKernel {
         errors::Unimplemented("Fractional average pooling is not yet "
                               "supported on the batch nor channel dimension."));
     OP_REQUIRES_OK(context, context->GetAttr("deterministic", &deterministic_));
-    pooling_region_generated_ = false;
-    // Initialize philox random generator.
-    OP_REQUIRES_OK(context, generator_.Init(context));
+    OP_REQUIRES_OK(context, context->GetAttr("seed", &seed_));
+    OP_REQUIRES_OK(context, context->GetAttr("seed2", &seed2_));
+    if (deterministic_) {
+      // If both seeds are not set when deterministic_ is true, force set seeds.
+      if ((seed_ == 0) && (seed2_ == 0)) {
+        seed_ = random::New64();
+        seed2_ = random::New64();
+      }
+    } else {
+      OP_REQUIRES(
+          context, (seed_ == 0) && (seed2_ == 0),
+          errors::InvalidArgument(
+              "Both seed and seed2 should be 0 if deterministic is false."));
+    }
   }
 
   void Compute(OpKernelContext* context) override {
@@ -64,47 +76,35 @@ class FractionalAvgPoolOp : public OpKernel {
     OP_REQUIRES(context, tensor_in.dims() == tensor_in_and_out_dims,
                 errors::InvalidArgument("tensor_in must be 4-dimensional"));
 
+    std::vector<int> input_size(tensor_in_and_out_dims);
+    std::vector<int> output_size(tensor_in_and_out_dims);
     for (int i = 0; i < tensor_in_and_out_dims; ++i) {
-      input_size_.push_back(tensor_in.dim_size(i));
+      input_size[i] = tensor_in.dim_size(i);
     }
     // Output size.
     for (int i = 0; i < tensor_in_and_out_dims; ++i) {
-      output_size_.push_back(
-          static_cast<int>(floor(input_size_[i] / pooling_ratio_[i])));
-      DCHECK_GT(output_size_[i], 0);
+      output_size[i] =
+          static_cast<int>(floor(input_size[i] / pooling_ratio_[i]));
+      DCHECK_GT(output_size[i], 0);
     }
 
     // Generate pooling sequence.
     std::vector<int64> row_cum_seq;
     std::vector<int64> col_cum_seq;
-    if (deterministic_) {
-      if (pooling_region_generated_) {
-        row_cum_seq = row_cum_seq_;
-        col_cum_seq = col_cum_seq_;
-      } else {
-        row_cum_seq = GeneratePoolingSequence(input_size_[1], output_size_[1],
-                                              &generator_, pseudo_random_);
-        col_cum_seq = GeneratePoolingSequence(input_size_[2], output_size_[2],
-                                              &generator_, pseudo_random_);
-        mutex_lock lock(mu_);
-        row_cum_seq_ = row_cum_seq;
-        col_cum_seq_ = col_cum_seq;
-        pooling_region_generated_ = true;
-      }
-    } else {
-      row_cum_seq = GeneratePoolingSequence(input_size_[1], output_size_[1],
-                                            &generator_, pseudo_random_);
-      col_cum_seq = GeneratePoolingSequence(input_size_[2], output_size_[2],
-                                            &generator_, pseudo_random_);
-    }
+    GuardedPhiloxRandom generator;
+    generator.Init(seed_, seed2_);
+    row_cum_seq = GeneratePoolingSequence(input_size[1], output_size[1],
+                                          &generator, pseudo_random_);
+    col_cum_seq = GeneratePoolingSequence(input_size[2], output_size[2],
+                                          &generator, pseudo_random_);
 
     // Prepare output.
     Tensor* output_tensor = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(
-                       0, TensorShape({output_size_[0], output_size_[1],
-                                       output_size_[2], output_size_[3]}),
-                       &output_tensor));
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                0,
+                                TensorShape({output_size[0], output_size[1],
+                                             output_size[2], output_size[3]}),
+                                &output_tensor));
     Tensor* output_row_seq_tensor = nullptr;
     OP_REQUIRES_OK(context,
                    context->allocate_output(
@@ -116,12 +116,11 @@ class FractionalAvgPoolOp : public OpKernel {
                        2, TensorShape({static_cast<int64>(col_cum_seq.size())}),
                        &output_col_seq_tensor));
 
-    ConstEigenMatrixMap in_mat(
-        tensor_in.flat<T>().data(), input_size_[3],
-        input_size_[2] * input_size_[1] * input_size_[0]);
+    ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), input_size[3],
+                               input_size[2] * input_size[1] * input_size[0]);
 
-    EigenMatrixMap out_mat(output_tensor->flat<T>().data(), output_size_[3],
-                           output_size_[2] * output_size_[1] * output_size_[0]);
+    EigenMatrixMap out_mat(output_tensor->flat<T>().data(), output_size[3],
+                           output_size[2] * output_size[1] * output_size[0]);
     // out_count corresponds to number of elements in each pooling cell.
     Eigen::Matrix<T, Eigen::Dynamic, 1> out_count(out_mat.cols());
 
@@ -146,9 +145,9 @@ class FractionalAvgPoolOp : public OpKernel {
     // 1: row / row
     // 2: col / col
     // 3: depth / channel
-    const int64 row_max = input_size_[1] - 1;
-    const int64 col_max = input_size_[2] - 1;
-    for (int64 b = 0; b < input_size_[0]; ++b) {
+    const int64 row_max = input_size[1] - 1;
+    const int64 col_max = input_size[2] - 1;
+    for (int64 b = 0; b < input_size[0]; ++b) {
       // row sequence.
       for (int64 hs = 0; hs < row_cum_seq.size() - 1; ++hs) {
         // row start and end.
@@ -160,7 +159,7 @@ class FractionalAvgPoolOp : public OpKernel {
         // col sequence.
         for (int64 ws = 0; ws < col_cum_seq.size() - 1; ++ws) {
           const int64 out_offset =
-              (b * output_size_[1] + hs) * output_size_[2] + ws;
+              (b * output_size[1] + hs) * output_size[2] + ws;
           // col start and end.
           const int64 col_start = col_cum_seq[ws];
           int64 col_end =
@@ -169,7 +168,7 @@ class FractionalAvgPoolOp : public OpKernel {
           for (int64 h = row_start; h <= row_end; ++h) {
             for (int64 w = col_start; w <= col_end; ++w) {
               const int64 in_offset =
-                  (b * input_size_[1] + h) * input_size_[2] + w;
+                  (b * input_size[1] + h) * input_size[2] + w;
               out_mat.col(out_offset) += in_mat.col(in_offset);
               out_count(out_offset)++;
             }
@@ -183,18 +182,11 @@ class FractionalAvgPoolOp : public OpKernel {
 
  private:
   bool deterministic_;
-  // meaningful only when deterministic_ is true.
-  mutex mu_;
-  std::vector<int64> row_cum_seq_;
-  std::vector<int64> col_cum_seq_;
-  bool pooling_region_generated_;
-
-  std::vector<int32> input_size_;
-  std::vector<int32> output_size_;
+  int64 seed_;
+  int64 seed2_;
   std::vector<float> pooling_ratio_;
   bool pseudo_random_;
   bool overlapping_;
-  GuardedPhiloxRandom generator_;
 };
 
 #define REGISTER_FRACTIONALAVGPOOL(type)                                      \
@@ -240,8 +232,9 @@ class FractionalAvgPoolGradOp : public OpKernel {
 
     // Grab the inputs.
     const Tensor& orig_input_tensor_shape = context->input(0);
-    OP_REQUIRES(context, orig_input_tensor_shape.dims() == 1 &&
-                             orig_input_tensor_shape.NumElements() == 4,
+    OP_REQUIRES(context,
+                orig_input_tensor_shape.dims() == 1 &&
+                    orig_input_tensor_shape.NumElements() == 4,
                 errors::InvalidArgument("original input tensor shape must be"
                                         "1-dimensional and 4 elements"));
     const Tensor& out_backprop = context->input(1);
diff --git a/tensorflow/core/kernels/fractional_max_pool_op.cc b/tensorflow/core/kernels/fractional_max_pool_op.cc
index 33d73c84776341cf08243d828ee372456554e2cf..cf580adab256bf055f206f44a5996c1e5487540a 100644
--- a/tensorflow/core/kernels/fractional_max_pool_op.cc
+++ b/tensorflow/core/kernels/fractional_max_pool_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/util/guarded_philox_random.h"
@@ -50,9 +51,20 @@ class FractionalMaxPoolOp : public OpKernel {
                               "supported on the batch nor channel dimension."));
 
     OP_REQUIRES_OK(context, context->GetAttr("deterministic", &deterministic_));
-    pooling_region_generated_ = false;
-    // Initialize philox random generator.
-    OP_REQUIRES_OK(context, generator_.Init(context));
+    OP_REQUIRES_OK(context, context->GetAttr("seed", &seed_));
+    OP_REQUIRES_OK(context, context->GetAttr("seed2", &seed2_));
+    if (deterministic_) {
+      // If both seeds are not set when deterministic_ is true, force set seeds.
+      if ((seed_ == 0) && (seed2_ == 0)) {
+        seed_ = random::New64();
+        seed2_ = random::New64();
+      }
+    } else {
+      OP_REQUIRES(
+          context, (seed_ == 0) && (seed2_ == 0),
+          errors::InvalidArgument(
+              "Both seed and seed2 should be 0 if deterministic is false."));
+    }
   }
 
   void Compute(OpKernelContext* context) override {
@@ -67,49 +79,37 @@ class FractionalMaxPoolOp : public OpKernel {
     OP_REQUIRES(context, tensor_in.dims() == tensor_in_and_out_dims,
                 errors::InvalidArgument("tensor_in must be 4-dimensional"));
 
+    std::vector<int> input_size(tensor_in_and_out_dims);
+    std::vector<int> output_size(tensor_in_and_out_dims);
     for (int i = 0; i < tensor_in_and_out_dims; ++i) {
-      input_size_.push_back(tensor_in.dim_size(i));
+      input_size[i] = tensor_in.dim_size(i);
     }
     // Output size.
     for (int i = 0; i < tensor_in_and_out_dims; ++i) {
       // This must match the same logic in the shape function in
       // core/ops/nn_ops.cc.
-      output_size_.push_back(
-          static_cast<int>(floor(input_size_[i] / pooling_ratio_[i])));
-      DCHECK_GT(output_size_[i], 0);
+      output_size[i] =
+          static_cast<int>(floor(input_size[i] / pooling_ratio_[i]));
+      DCHECK_GT(output_size[i], 0);
     }
 
     // Generate pooling sequence.
     std::vector<int64> height_cum_seq;
     std::vector<int64> width_cum_seq;
-    if (deterministic_) {
-      if (pooling_region_generated_) {
-        height_cum_seq = height_cum_seq_;
-        width_cum_seq = width_cum_seq_;
-      } else {
-        height_cum_seq = GeneratePoolingSequence(
-            input_size_[1], output_size_[1], &generator_, pseudo_random_);
-        width_cum_seq = GeneratePoolingSequence(input_size_[2], output_size_[2],
-                                                &generator_, pseudo_random_);
-        mutex_lock lock(mu_);
-        height_cum_seq_ = height_cum_seq;
-        width_cum_seq_ = width_cum_seq;
-        pooling_region_generated_ = true;
-      }
-    } else {
-      height_cum_seq = GeneratePoolingSequence(input_size_[1], output_size_[1],
-                                               &generator_, pseudo_random_);
-      width_cum_seq = GeneratePoolingSequence(input_size_[2], output_size_[2],
-                                              &generator_, pseudo_random_);
-    }
+    GuardedPhiloxRandom generator;
+    generator.Init(seed_, seed2_);
+    height_cum_seq = GeneratePoolingSequence(input_size[1], output_size[1],
+                                             &generator, pseudo_random_);
+    width_cum_seq = GeneratePoolingSequence(input_size[2], output_size[2],
+                                            &generator, pseudo_random_);
 
     // Prepare output.
     Tensor* output_tensor = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(
-                       0, TensorShape({output_size_[0], output_size_[1],
-                                       output_size_[2], output_size_[3]}),
-                       &output_tensor));
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                0,
+                                TensorShape({output_size[0], output_size[1],
+                                             output_size[2], output_size[3]}),
+                                &output_tensor));
     Tensor* output_height_seq_tensor = nullptr;
     OP_REQUIRES_OK(
         context,
@@ -122,12 +122,11 @@ class FractionalMaxPoolOp : public OpKernel {
                      2, TensorShape({static_cast<int64>(width_cum_seq.size())}),
                      &output_width_seq_tensor));
 
-    ConstEigenMatrixMap in_mat(
-        tensor_in.flat<T>().data(), input_size_[3],
-        input_size_[2] * input_size_[1] * input_size_[0]);
+    ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), input_size[3],
+                               input_size[2] * input_size[1] * input_size[0]);
 
-    EigenMatrixMap out_mat(output_tensor->flat<T>().data(), output_size_[3],
-                           output_size_[2] * output_size_[1] * output_size_[0]);
+    EigenMatrixMap out_mat(output_tensor->flat<T>().data(), output_size[3],
+                           output_size[2] * output_size[1] * output_size[0]);
 
     // Initializes the output tensor with MIN<T>.
     output_tensor->flat<T>().setConstant(Eigen::NumTraits<T>::lowest());
@@ -149,9 +148,9 @@ class FractionalMaxPoolOp : public OpKernel {
     // 1: height / row
     // 2: width / col
     // 3: depth / channel
-    const int64 height_max = input_size_[1] - 1;
-    const int64 width_max = input_size_[2] - 1;
-    for (int64 b = 0; b < input_size_[0]; ++b) {
+    const int64 height_max = input_size[1] - 1;
+    const int64 width_max = input_size[2] - 1;
+    for (int64 b = 0; b < input_size[0]; ++b) {
       // height sequence.
       for (int64 hs = 0; hs < height_cum_seq.size() - 1; ++hs) {
         // height start and end.
@@ -163,7 +162,7 @@ class FractionalMaxPoolOp : public OpKernel {
         // width sequence.
         for (int64 ws = 0; ws < width_cum_seq.size() - 1; ++ws) {
           const int64 out_offset =
-              (b * output_size_[1] + hs) * output_size_[2] + ws;
+              (b * output_size[1] + hs) * output_size[2] + ws;
           // width start and end.
           const int64 width_start = width_cum_seq[ws];
           int64 width_end =
@@ -172,7 +171,7 @@ class FractionalMaxPoolOp : public OpKernel {
           for (int64 h = height_start; h <= height_end; ++h) {
             for (int64 w = width_start; w <= width_end; ++w) {
               const int64 in_offset =
-                  (b * input_size_[1] + h) * input_size_[2] + w;
+                  (b * input_size[1] + h) * input_size[2] + w;
               out_mat.col(out_offset) =
                   out_mat.col(out_offset).cwiseMax(in_mat.col(in_offset));
             }
@@ -184,18 +183,11 @@ class FractionalMaxPoolOp : public OpKernel {
 
  private:
   bool deterministic_;
-  // meaningful only when deterministic_ is true.
-  mutex mu_;
-  std::vector<int64> height_cum_seq_;
-  std::vector<int64> width_cum_seq_;
-  bool pooling_region_generated_;
-
-  std::vector<int32> input_size_;
-  std::vector<int32> output_size_;
+  int64 seed_;
+  int64 seed2_;
   std::vector<float> pooling_ratio_;
   bool pseudo_random_;
   bool overlapping_;
-  GuardedPhiloxRandom generator_;
 };
 
 #define REGISTER_FRACTIONALMAXPOOL(type)                                      \
@@ -243,15 +235,13 @@ class FractionalMaxPoolGradOp : public OpKernel {
 
     // Just to make it similar to FractionalMaxPoolOp.
     constexpr int tensor_in_and_out_dims = 4;
-    std::vector<int64> input_size;
-    std::vector<int64> output_size;
-    input_size.reserve(tensor_in_and_out_dims);
+    std::vector<int64> input_size(tensor_in_and_out_dims);
+    std::vector<int64> output_size(tensor_in_and_out_dims);
     for (int i = 0; i < tensor_in_and_out_dims; ++i) {
-      input_size.push_back(tensor_in.dim_size(i));
+      input_size[i] = tensor_in.dim_size(i);
     }
-    output_size.reserve(tensor_in_and_out_dims);
     for (int i = 0; i < tensor_in_and_out_dims; ++i) {
-      output_size.push_back(tensor_out.dim_size(i));
+      output_size[i] = tensor_out.dim_size(i);
     }
 
     // ---------
diff --git a/tensorflow/core/kernels/fractional_pool_common.h b/tensorflow/core/kernels/fractional_pool_common.h
index df0bbbfa066bca4705ff371d1823f789a1c4e9ef..2d7a230fc00613d91d147d4927403ba270a4d562 100644
--- a/tensorflow/core/kernels/fractional_pool_common.h
+++ b/tensorflow/core/kernels/fractional_pool_common.h
@@ -57,7 +57,7 @@ static inline void RandomShuffle(Iter first, Iter last, const Random& uniform) {
 //     * sum(generated_diff_pooling_sequence) = input_length
 //     * Let's define floor(input_length / output_length) = K, then
 //       K <= generated_diff_pooling_sequence[i] <= K+1
-// For example, when input_length = 10, output_length = 6, the followings are
+// For example, when input_length = 10, output_length = 6, the following are
 // valid pooling sequence:
 //     * [1, 2, 2, 1, 2, 2]
 //     * [1, 1, 2, 2, 2, 2]
diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index f2290e87a5fdac44629ed6b81c8661cf74c2054e..9d4bc35ba890c251b0800f266e7845e411e7a835 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -53,6 +53,8 @@ class ArgOp : public OpKernel {
     ctx->set_output(0, val);
   }
 
+  bool IsExpensive() override { return false; }
+
  private:
   int index_;
   DataType dtype_;
@@ -78,6 +80,8 @@ class RetvalOp : public OpKernel {
     OP_REQUIRES_OK(ctx, frame->SetRetval(index_, val));
   }
 
+  bool IsExpensive() override { return false; }
+
  private:
   int index_;
   DataType dtype_;
@@ -249,22 +253,21 @@ class SymbolicGradientOp : public AsyncOpKernel {
       args.push_back(ctx->input(i));
     }
     std::vector<Tensor>* rets = new std::vector<Tensor>;
-    lib->Run(
-        opts, handle, args, rets, [ctx, done, rets](const Status& status) {
-          if (!status.ok()) {
-            ctx->SetStatus(status);
-          } else if (rets->size() != ctx->num_outputs()) {
-            ctx->SetStatus(errors::InvalidArgument(
-                "SymGrad expects to return ", ctx->num_outputs(),
-                " tensor(s), but get ", rets->size(), " tensor(s) instead."));
-          } else {
-            for (size_t i = 0; i < rets->size(); ++i) {
-              ctx->set_output(i, (*rets)[i]);
-            }
-          }
-          delete rets;
-          done();
-        });
+    lib->Run(opts, handle, args, rets, [ctx, done, rets](const Status& status) {
+      if (!status.ok()) {
+        ctx->SetStatus(status);
+      } else if (rets->size() != ctx->num_outputs()) {
+        ctx->SetStatus(errors::InvalidArgument(
+            "SymGrad expects to return ", ctx->num_outputs(),
+            " tensor(s), but get ", rets->size(), " tensor(s) instead."));
+      } else {
+        for (size_t i = 0; i < rets->size(); ++i) {
+          ctx->set_output(i, (*rets)[i]);
+        }
+      }
+      delete rets;
+      done();
+    });
   }
 
  private:
@@ -292,21 +295,21 @@ class RemoteCallOp : public AsyncOpKernel {
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     const Tensor* target;
     OP_REQUIRES_OK_ASYNC(ctx, ctx->input("target", &target), done);
-    AttrValueMap attr_values = func_.attr();
-    AttrValue v;
     const string& target_device =
         DeviceNameUtils::CanonicalizeDeviceName(target->scalar<string>()());
-    v.set_s(target_device);
-    AddAttr("_target", v, &attr_values);
 
     FunctionLibraryRuntime* lib = ctx->function_library();
     OP_REQUIRES_ASYNC(ctx, lib != nullptr,
                       errors::Internal("No function library is provided."),
                       done);
+    AttrValueMap attr_values = func_.attr();
+    FunctionLibraryRuntime::InstantiateOptions instantiate_opts;
+    instantiate_opts.target = target_device;
     FunctionLibraryRuntime::Handle handle;
-    OP_REQUIRES_OK_ASYNC(
-        ctx, lib->Instantiate(func_.name(), AttrSlice(&attr_values), &handle),
-        done);
+    OP_REQUIRES_OK_ASYNC(ctx,
+                         lib->Instantiate(func_.name(), AttrSlice(&attr_values),
+                                          instantiate_opts, &handle),
+                         done);
 
     OpInputList arguments;
     OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("args", &arguments), done);
@@ -318,7 +321,7 @@ class RemoteCallOp : public AsyncOpKernel {
     if (opts.source_device != target_device) {
       opts.remote_execution = true;
     }
-    opts.rendezvous = ctx->rendezvous();
+    opts.create_rendezvous = true;
     std::vector<Tensor> args;
     args.reserve(arguments.size());
     for (const Tensor& argument : arguments) {
diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b687088db16a31d8ecb74a7a483c35d2c65a74f9
--- /dev/null
+++ b/tensorflow/core/kernels/functional_ops.cc
@@ -0,0 +1,322 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef FunctionLibraryRuntime::Handle FHandle;
+typedef std::vector<Tensor> TensorVec;
+
+namespace {
+
+// Helper to instantiate function "func" in the library "lib".
+Status Instantiate(FunctionLibraryRuntime* lib, const NameAttrList& func,
+                   FunctionLibraryRuntime::Handle* handle) {
+  return lib->Instantiate(func.name(), AttrSlice(&func.attr()), handle);
+}
+
+// If "t" is a scalar of a supported type, returns t != 0 in "*v".
+Status ToBool(gtl::ArraySlice<Tensor> t, bool* v) {
+  if (t.size() != 1) {
+    return errors::InvalidArgument(
+        "Expected a single scalar which can be converted to a boolean, got ",
+        t.size(), " tensors.");
+  }
+  if (TensorShapeUtils::IsScalar(t[0].shape())) {
+    switch (t[0].dtype()) {
+#define CASE(T)                   \
+  case DataTypeToEnum<T>::value:  \
+    *v = t[0].scalar<T>()() != 0; \
+    break;
+
+      CASE(float);
+      CASE(double);
+      CASE(int32);
+      CASE(uint8);
+      CASE(int16);
+      CASE(int8);
+      CASE(int64);
+#undef CASE
+      case DT_BOOL:
+        *v = t[0].scalar<bool>()();
+        break;
+      case DT_STRING:
+        *v = !t[0].scalar<string>()().empty();
+        break;
+      default:
+        return errors::InvalidArgument(DataTypeString(t[0].dtype()),
+                                       " cannot be converted to a boolean");
+    }
+  } else {
+    *v = t[0].NumElements() > 0;
+  }
+  return Status::OK();
+}
+
+// Sets "rets" to be the output of "ctx". Validates rets' types based
+// on "kernel".
+Status SetOutputs(const OpKernel* kernel, OpKernelContext* ctx,
+                  gtl::ArraySlice<Tensor> rets) {
+  if (rets.size() != ctx->num_outputs()) {
+    return errors::Internal("Expect to produce ", ctx->num_outputs(),
+                            " tensors, but only get ", rets.size());
+  }
+  for (int i = 0; i < rets.size(); ++i) {
+    if (rets[i].dtype() != kernel->output_type(i)) {
+      return errors::Internal("Expect ", i, "-th output is of type ",
+                              DataTypeString(kernel->output_type(i)),
+                              " but get ", DataTypeString(rets[i].dtype()));
+    }
+    ctx->set_output(i, rets[i]);
+  }
+  return Status::OK();
+}
+
+void SetRunOptions(OpKernelContext* ctx, FunctionLibraryRuntime::Options* opts,
+                   bool always_collect_stats) {
+  opts->step_id = ctx->step_id();
+  opts->rendezvous = ctx->rendezvous();
+  opts->cancellation_manager = ctx->cancellation_manager();
+  if (always_collect_stats) {
+    opts->stats_collector = ctx->stats_collector();
+  }
+  opts->runner = ctx->runner();
+}
+
+}  // end namespace
+
+class FunctionalIf : public AsyncOpKernel {
+ public:
+  explicit FunctionalIf(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+    auto lib = ctx->function_library();
+    OP_REQUIRES(ctx, lib != nullptr, errors::Internal("No function library"));
+    const NameAttrList* func;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("then_branch", &func));
+    OP_REQUIRES_OK(ctx, Instantiate(lib, *func, &then_handle_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("else_branch", &func));
+    OP_REQUIRES_OK(ctx, Instantiate(lib, *func, &else_handle_));
+  }
+
+  ~FunctionalIf() override {}
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    bool cond;
+    OP_REQUIRES_OK(ctx, ToBool({ctx->input(0)}, &cond));
+    (new State(this, ctx, cond, done))->Start();
+  }
+
+ private:
+  FHandle then_handle_;
+  FHandle else_handle_;
+
+  class State {
+   public:
+    State(FunctionalIf* kernel, OpKernelContext* ctx, bool cond,
+          DoneCallback done)
+        : kernel_(kernel),
+          ctx_(ctx),
+          cond_(cond),
+          done_(done),
+          lib_(CHECK_NOTNULL(ctx_->function_library())) {
+      SetRunOptions(ctx_, &opts_, true /* always_collect_stats */);
+      for (int i = 1; i < ctx_->num_inputs(); ++i) {
+        args_.push_back(ctx_->input(i));
+      }
+    }
+
+    ~State() {}
+
+    void Start() {
+      FHandle handle = cond_ ? kernel_->then_handle_ : kernel_->else_handle_;
+      rets_.clear();
+      lib_->Run(
+          // Evaluate one of the branch.
+          opts_, handle, args_, &rets_,
+          // Done callback
+          [this](Status s) {
+            if (s.ok()) {
+              s = SetOutputs(kernel_, ctx_, rets_);
+            }
+            ctx_->SetStatus(s);
+            auto done = done_;
+            delete this;
+            done();
+          });
+    }
+
+   private:
+    FunctionalIf* const kernel_;
+    OpKernelContext* const ctx_;
+    const bool cond_;
+    const DoneCallback done_;
+    FunctionLibraryRuntime* const lib_;
+    FunctionLibraryRuntime::Options opts_;
+    TensorVec args_;
+    TensorVec rets_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_CPU), FunctionalIf);
+REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_GPU).HostMemory("cond"),
+                        FunctionalIf);
+
+class FunctionalWhile : public AsyncOpKernel {
+ public:
+  explicit FunctionalWhile(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("cond", &cond_func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("body", &body_func_));
+  }
+
+  ~FunctionalWhile() override {}
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    auto lib = ctx->function_library();
+    OP_REQUIRES_ASYNC(ctx, lib != nullptr,
+                      errors::Internal("No function library"), done);
+
+    // TODO(b/37549631): Because this op has `SetIsStateful()` in its
+    // op registration, this kernel may be shared by multiple
+    // subgraphs, which have different associated
+    // `FunctionLibraryRuntime` objects and hence different `FHandle`
+    // namespaces. We currently work around this by caching the map
+    // from `FunctionLibraryRuntime*` to `FHandle` pairs for the two
+    // functions this op uses.
+    FHandle cond_handle;
+    FHandle body_handle;
+    {
+      mutex_lock l(mu_);
+      const auto iter = handles_.find(lib);
+      if (iter == handles_.end()) {
+        OP_REQUIRES_OK_ASYNC(ctx, Instantiate(lib, cond_func_, &cond_handle),
+                             done);
+        OP_REQUIRES_OK_ASYNC(ctx, Instantiate(lib, body_func_, &body_handle),
+                             done);
+        handles_[lib] = {cond_handle, body_handle};
+      } else {
+        cond_handle = iter->second.first;
+        body_handle = iter->second.second;
+      }
+    }
+
+    (new State(this, ctx, cond_handle, body_handle, done))->Start();
+  }
+
+ private:
+  NameAttrList cond_func_;
+  NameAttrList body_func_;
+
+  mutex mu_;
+  std::unordered_map<FunctionLibraryRuntime*, std::pair<FHandle, FHandle>>
+      handles_ GUARDED_BY(mu_);
+
+  class State {
+   public:
+    State(FunctionalWhile* kernel, OpKernelContext* ctx, FHandle cond_handle,
+          FHandle body_handle, DoneCallback done)
+        : kernel_(kernel),
+          ctx_(ctx),
+          cond_handle_(cond_handle),
+          body_handle_(body_handle),
+          done_(done),
+          lib_(CHECK_NOTNULL(ctx_->function_library())) {
+      SetRunOptions(ctx_, &opts_, false /* always_collect_stats */);
+      for (int i = 0; i < ctx_->num_inputs(); ++i) {
+        args_.push_back(ctx_->input(i));
+      }
+    }
+
+    ~State() {}
+
+    void Start() { EvalCond(); }
+
+   private:
+    FunctionalWhile* const kernel_;
+    OpKernelContext* const ctx_;
+    const FHandle cond_handle_;
+    const FHandle body_handle_;
+    const DoneCallback done_;
+    FunctionLibraryRuntime* const lib_;
+    FunctionLibraryRuntime::Options opts_;
+    TensorVec args_;
+    TensorVec rets_;
+
+    void EvalCond() {
+      lib_->Run(
+          // Evaluate the condition.
+          opts_, cond_handle_, args_, &rets_,
+          // Done cb.
+          [this](const Status& s) {
+            if (!s.ok()) {
+              return Finish(s);
+            }
+            StartBody();
+          });
+    }
+
+    void StartBody() {
+      bool cond;
+      Status s = ToBool(rets_, &cond);
+      if (!s.ok()) {
+        return Finish(s);
+      }
+      if (!cond) {
+        return Finish(Status::OK());
+      }
+      rets_.clear();
+      lib_->Run(
+          // Evaluate the body.
+          opts_, body_handle_, args_, &rets_,
+          // Done callback
+          [this](const Status& s) {
+            if (!s.ok()) {
+              return Finish(s);
+            }
+            if (args_.size() != rets_.size()) {
+              return Finish(errors::InvalidArgument(
+                  "While loop body returned ", rets_.size(),
+                  " arguments. Expected: ", args_.size()));
+            }
+            args_.clear();
+            using std::swap;
+            swap(args_, rets_);
+            EvalCond();
+          });
+    }
+
+    void Finish(Status s) {
+      if (s.ok()) {
+        s = SetOutputs(kernel_, ctx_, args_);
+      }
+      ctx_->SetStatus(s);
+      done_();
+      delete this;
+    }
+  };
+};
+REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_CPU), FunctionalWhile);
+REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_GPU), FunctionalWhile);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc
index 1688674eb784369ae8fbb2622695561cb5bebcae..9b4dca851138235d7b4a95906f3c8a0e5d592aa7 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/kernels/fused_batch_norm_op.h"
 #include "tensorflow/core/util/tensor_format.h"
 
@@ -239,6 +240,14 @@ struct FusedBatchNorm<GPUDevice, T, U> {
             << " offset shape: " << offset.shape().DebugString()
             << " tensor format: " << tensor_format;
 
+    // If input is empty, return NaN mean/variance
+    if (x.shape().num_elements() == 0) {
+      functor::SetNanFunctor<U> f;
+      f(context->eigen_device<GPUDevice>(), batch_mean->flat<U>());
+      f(context->eigen_device<GPUDevice>(), batch_var->flat<U>());
+      return;
+    }
+
     Tensor x_maybe_transformed = x;
     Tensor x_transformed;
     Tensor y_transformed;
@@ -623,14 +632,26 @@ class FusedBatchNormGradOp : public OpKernel {
     Tensor* offset_backprop = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(2, scale_offset_shape,
                                                      &offset_backprop));
-    // two placeholders for estimated_mean and estimated_variance, which are
+    // Two placeholders for estimated_mean and estimated_variance, which are
     // used for inference and thus not needed here for gradient computation.
+    // They are filled with zeros so as to avoid NaN outputs.
     Tensor* placeholder_1 = nullptr;
     OP_REQUIRES_OK(
         context, context->allocate_output(3, TensorShape({}), &placeholder_1));
+    functor::SetZeroFunctor<Device, float> f;
+    f(context->eigen_device<Device>(), placeholder_1->flat<U>());
     Tensor* placeholder_2 = nullptr;
     OP_REQUIRES_OK(
         context, context->allocate_output(4, TensorShape({}), &placeholder_2));
+    f(context->eigen_device<Device>(), placeholder_2->flat<U>());
+
+    // If input is empty, set gradients w.r.t scale/offset to zero.
+    if (x.shape().num_elements() == 0) {
+      functor::SetZeroFunctor<Device, U> f;
+      f(context->eigen_device<Device>(), scale_backprop->flat<U>());
+      f(context->eigen_device<Device>(), offset_backprop->flat<U>());
+      return;
+    }
 
     if (is_training_) {
       functor::FusedBatchNormGrad<Device, T, U>()(
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc
index dc956066ecffe2ad3a38506fb6e76dd402def5b3..4a67b2b3a30463448ac97aff96402f6500eeb19a 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cu.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cu.cc
@@ -65,8 +65,16 @@ void InvVarianceToVariance<T>::operator()(const Eigen::GpuDevice& d,
                                               epsilon, sample_size, variance);
 }
 
+template <class T>
+void SetNanFunctor<T>::operator()(const Eigen::GpuDevice& d,
+                                  typename TTypes<T>::Flat out) {
+  To32Bit(out).device(d) =
+      To32Bit(out).constant(Eigen::NumTraits<T>::quiet_NaN());
+}
+
 template class VarianceToInvVariance<float>;
 template class InvVarianceToVariance<float>;
+template class SetNanFunctor<float>;
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.h b/tensorflow/core/kernels/fused_batch_norm_op.h
index 3af104bf954257b260215d6a79b0a365227d7b23..d6c68df986117df0ab4f8c24fb1a713901b468f7 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.h
+++ b/tensorflow/core/kernels/fused_batch_norm_op.h
@@ -49,6 +49,12 @@ struct InvVarianceToVariance {
                   int channels, T* variance);
 };
 
+// This function sets a GPU tensor to NaNs.
+template <class T>
+struct SetNanFunctor {
+  void operator()(const Eigen::GpuDevice& d, typename TTypes<T>::Flat out);
+};
+
 #endif  // GOOGLE_CUDA
 
 // Functor used by FusedBatchNormGradOp to do the computations when
diff --git a/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc
index 6d4a9dfdef4609a45d3a38e49a32492408043617..37edd1ce0f95d7f6d6a366f5b0d83bac7f6159d5 100644
--- a/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/decode_base64_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc
index b084a972049cc2b1997df64a2f43a6d79b6b4e6d..f3b24b2341e590adfbeac1a18b6a65fbfd34f598 100644
--- a/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc
index 9dd795b94e82c48ad037df67f3218ed62feb722e..e9ffad178616a7b0872d461653cb01c40b292d88 100644
--- a/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/decode_json_example_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc
index 4a68a5b5803f363ab93bf280df54fa8f14206a84..020f18b1895c480748cafbfb8f7f267887db1fba 100644
--- a/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/decode_png_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc b/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc
index 2d6c82826cf9dad1ca67d6e5ee1d13a059f9c8ea..a8f07f4bad3a7e7ccff4ebefd4c56c695d0b2573 100644
--- a/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc b/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc
index 81b6e491248fda37f602c0365c1e90d4b08f7c2a..f5dd47a052cd098937d66394ed04c66831ee5972 100644
--- a/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc b/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
index d91a351c5969e71385348b76376202c14e86daac..4d736a21602b34b560ea1c8d9ede4645d806ca29 100644
--- a/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/fuzz_session.h b/tensorflow/core/kernels/fuzzing/fuzz_session.h
index 0c0e548a909a0c87c622449c8ac6f66db29b5b8d..f1f3f199df137b83193c4d1e974dfb401d9ec9ff 100644
--- a/tensorflow/core/kernels/fuzzing/fuzz_session.h
+++ b/tensorflow/core/kernels/fuzzing/fuzz_session.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef LEARNING_BRAIN_KERNELS_FUZZING_FUZZ_SESSION_H_
-#define LEARNING_BRAIN_KERNELS_FUZZING_FUZZ_SESSION_H_
+#ifndef TENSORFLOW_CORE_KERNELS_FUZZING_FUZZ_SESSION_H_
+#define TENSORFLOW_CORE_KERNELS_FUZZING_FUZZ_SESSION_H_
 
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/core/graph/graph.h"
@@ -153,4 +153,4 @@ class FuzzStringInputOp : public FuzzSession {
 }  // end namespace fuzzing
 }  // end namespace tensorflow
 
-#endif  // LEARNING_BRAIN_KERNELS_FUZZING_FUZZ_SESSION_H_
+#endif  // TENSORFLOW_CORE_KERNELS_FUZZING_FUZZ_SESSION_H_
diff --git a/tensorflow/core/kernels/fuzzing/identity_fuzz.cc b/tensorflow/core/kernels/fuzzing/identity_fuzz.cc
index ac3a12aa399a3efe532c71c49a092b6cecd6059b..5c3fc4a2795430d1f8f269f42131e882106db7b0 100644
--- a/tensorflow/core/kernels/fuzzing/identity_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/identity_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc b/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
index 978fcd102822a6a2690478eaca473eabc6ae83ab..c90ad2cfeb7222f4c75e718fcaea6955567f3a4a 100644
--- a/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
index 7d1aa1fbf3a149d25e82b454543a5add522145af..738d78e99a0081a2b9f0f59c94433372acec19e2 100644
--- a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc b/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc
index 94255d215e5292bf77ab1104eb1d36c0cc1d661c..e98363ffbf166782649f3fa12dc2ab70024908cf 100644
--- a/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/string_to_number_fuzz.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {
diff --git a/tensorflow/core/kernels/gather_functor.cc b/tensorflow/core/kernels/gather_functor.cc
index dde08b37eacb9edada92f98c5115f694015aad34..e6fefe643b72bd5a169f0c152ac2fee2568462aa 100644
--- a/tensorflow/core/kernels/gather_functor.cc
+++ b/tensorflow/core/kernels/gather_functor.cc
@@ -25,12 +25,12 @@ typedef Eigen::GpuDevice GPUDevice;
 namespace functor {
 
 // Forward declarations of the functor specializations for GPU.
-#define DECLARE_GPU_SPECS_INDEX(T, Index)                             \
-  template <>                                                         \
-  int64 GatherFunctor<GPUDevice, T, Index>::operator()(               \
+#define DECLARE_GPU_SPECS_INDEX(T, Index)                               \
+  template <>                                                           \
+  int64 GatherFunctor<GPUDevice, T, Index>::operator()(                 \
       OpKernelContext* ctx, typename TTypes<T, 3>::ConstTensor Tparams, \
-      typename TTypes<Index>::ConstFlat Tindices,                     \
-      typename TTypes<T, 3>::Tensor Tout);                            \
+      typename TTypes<Index>::ConstFlat Tindices,                       \
+      typename TTypes<T, 3>::Tensor Tout);                              \
   extern template struct GatherFunctor<GPUDevice, T, Index>;
 
 #define DECLARE_GPU_SPECS(T)         \
diff --git a/tensorflow/core/kernels/gather_functor.h b/tensorflow/core/kernels/gather_functor.h
index 1e429a037e8b16f5e01766125e1d10ec7567d78d..16ccb03b8502dd626c0dc4f0c10fcfe50224c7b8 100644
--- a/tensorflow/core/kernels/gather_functor.h
+++ b/tensorflow/core/kernels/gather_functor.h
@@ -18,12 +18,12 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/prefetch.h"
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
@@ -52,21 +52,23 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
   const size_t slice_bytes = slice_elems * sizeof(T);
   auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
   mutex mu;
-  // Store the value of invalidate index for printing error information, it's a shared variable.
+  // Store the value of invalidate index for printing error information, it's a
+  // shared variable.
   SliceIndex result = -1;
-  auto work = [&] (int64 start, int64 end) {
+  auto work = [&](int64 start, int64 end) {
     SliceIndex batch_idx = static_cast<SliceIndex>(start / indices_size);
     SliceIndex indices_idx = static_cast<SliceIndex>(start % indices_size);
     SliceIndex batch_idx_end = static_cast<SliceIndex>(end / indices_size);
     SliceIndex indices_idx_end = static_cast<SliceIndex>(end % indices_size);
 
     while ((batch_idx < batch_idx_end) ||
-            (batch_idx == batch_idx_end && indices_idx < indices_idx_end)) {
+           (batch_idx == batch_idx_end && indices_idx < indices_idx_end)) {
       SliceIndex i_next = indices_idx + 1;
       SliceIndex b_next = batch_idx + 1;
       if ((batch_idx == batch_idx_end && i_next < indices_idx_end) ||
-              (i_next < indices_size)) {
-        port::prefetch<port::PREFETCH_HINT_T0>(&params(batch_idx, indices(i_next), 0));
+          (i_next < indices_size)) {
+        port::prefetch<port::PREFETCH_HINT_T0>(
+            &params(batch_idx, indices(i_next), 0));
         port::prefetch<port::PREFETCH_HINT_T0>(&out(batch_idx, i_next, 0));
         b_next = batch_idx;
       } else if (b_next <= batch_idx_end) {
@@ -85,11 +87,12 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
       // ahead-of-time compilation binary size).
       if (is_simple_type<T>::value) {
         // Avoid auto-promotion to Index from SliceIndex by casting.
-        memcpy(out_base + (batch_idx * indices_size + indices_idx) * slice_elems,
-               params_base + (batch_idx * static_cast<SliceIndex>(limit) +
-                              static_cast<SliceIndex>(index)) *
-                             slice_elems,
-               slice_bytes);
+        memcpy(
+            out_base + (batch_idx * indices_size + indices_idx) * slice_elems,
+            params_base + (batch_idx * static_cast<SliceIndex>(limit) +
+                           static_cast<SliceIndex>(index)) *
+                              slice_elems,
+            slice_bytes);
       } else {
         // For non-"simple" types (e.g. strings).
         out.template chip<1>(indices_idx) = params.template chip<1>(index);
@@ -99,8 +102,8 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
     }
   };
 
-  Shard(worker_threads->num_threads, worker_threads->workers, batch_size*indices_size,
-        slice_elems * sizeof(T), work);
+  Shard(worker_threads->num_threads, worker_threads->workers,
+        batch_size * indices_size, slice_elems * sizeof(T), work);
   return result;
 }
 
@@ -117,16 +120,16 @@ struct GatherFunctorCPU {
     bool use_large = (slice_size > std::numeric_limits<int32>::max() ||
                       params.size() > std::numeric_limits<int32>::max() ||
                       N > std::numeric_limits<int32>::max());
-#define CALL(elems)                                                        \
-  do {                                                                     \
-    if (use_large) {                                                       \
-      bad_i = HandleCopies<T, Index, int64, elems>(ctx, params, indices,   \
-                                                   slice_size, out);       \
-    } else {                                                               \
-      const int32 small_slice = static_cast<int32>(slice_size);            \
-      bad_i = HandleCopies<T, Index, int32, elems>(ctx, params, indices,   \
-                                                   small_slice, out);      \
-    }                                                                      \
+#define CALL(elems)                                                      \
+  do {                                                                   \
+    if (use_large) {                                                     \
+      bad_i = HandleCopies<T, Index, int64, elems>(ctx, params, indices, \
+                                                   slice_size, out);     \
+    } else {                                                             \
+      const int32 small_slice = static_cast<int32>(slice_size);          \
+      bad_i = HandleCopies<T, Index, int32, elems>(ctx, params, indices, \
+                                                   small_slice, out);    \
+    }                                                                    \
   } while (0)
 
     if (slice_size == 10)
@@ -143,7 +146,8 @@ struct GatherFunctorCPU {
 
 template <typename Device, typename T, typename Index>
 struct GatherFunctor {
-  int64 operator()(OpKernelContext* ctx, typename TTypes<T, 3>::ConstTensor params,
+  int64 operator()(OpKernelContext* ctx,
+                   typename TTypes<T, 3>::ConstTensor params,
                    typename TTypes<Index>::ConstFlat indices,
                    typename TTypes<T, 3>::Tensor out);
 };
diff --git a/tensorflow/core/kernels/gather_functor_gpu.cu.h b/tensorflow/core/kernels/gather_functor_gpu.cu.h
index a50b51b54b1d8e23b4082ba7b6bee8db2cc28382..11ea63d730aa69509edaacf127e62b4bbeb5740f 100644
--- a/tensorflow/core/kernels/gather_functor_gpu.cu.h
+++ b/tensorflow/core/kernels/gather_functor_gpu.cu.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_GPU_CU_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_GPU_CU_H_
+#ifndef TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_GPU_CU_H_
+#define TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_GPU_CU_H_
 
 #if GOOGLE_CUDA
 
@@ -118,4 +118,4 @@ struct GatherFunctor<GPUDevice, T, Index> {
 
 #endif  // GOOGLE_CUDA
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_GPU_CU_H_
+#endif  // TENSORFLOW_CORE_KERNELS_GATHER_FUNCTOR_GPU_CU_H_
diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc
index 5dc74d720ab22e2f2f10baf8309b59661740184f..7e5a9e1ec5aac26706d95646a29539dd0f4be2ed 100644
--- a/tensorflow/core/kernels/gather_nd_op.cc
+++ b/tensorflow/core/kernels/gather_nd_op.cc
@@ -176,10 +176,12 @@ Status DoGatherNd(OpKernelContext* c, const Tensor& params,
       PARAMS_CASE(3);
       PARAMS_CASE(4);
       PARAMS_CASE(5);
+      PARAMS_CASE(6);
+      PARAMS_CASE(7);
 #undef PARAMS_CASE
       default:
         return errors::InvalidArgument(
-            "Only indices.shape[-1] values between 1 and 5 "
+            "Only indices.shape[-1] values between 1 and 7 "
             "are currently supported.  Requested rank: ",
             indices_nd);
     }
@@ -218,7 +220,9 @@ namespace functor {
   DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 2); \
   DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 3); \
   DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 4); \
-  DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 5);
+  DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 5); \
+  DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 6); \
+  DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 7);
 
 #define DECLARE_GPU_SPECS(T)         \
   DECLARE_GPU_SPECS_INDEX(T, int32); \
diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl_6.cc b/tensorflow/core/kernels/gather_nd_op_cpu_impl_6.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2aec872448ec02581faf95e30844e5e1e80cd277
--- /dev/null
+++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl_6.cc
@@ -0,0 +1,18 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define CPU_PROVIDED_IXDIM 6
+#include "tensorflow/core/kernels/gather_nd_op_cpu_impl.h"
+#undef CPU_PROVIDED_IXDIM
diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl_7.cc b/tensorflow/core/kernels/gather_nd_op_cpu_impl_7.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9222cb07695cb1c05b12da59b0c0bbc96bebb388
--- /dev/null
+++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl_7.cc
@@ -0,0 +1,18 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define CPU_PROVIDED_IXDIM 7
+#include "tensorflow/core/kernels/gather_nd_op_cpu_impl.h"
+#undef CPU_PROVIDED_IXDIM
diff --git a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc
index ed5240c20abd247404cb926dd9a455af901c0d7c..b03efc684ffca4abde99b31952983aad5f805ee3 100644
--- a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc
@@ -111,7 +111,9 @@ struct GatherNdSlice<GPUDevice, T, Index, IXDIM> {
   DEFINE_GPU_SPECS_INDEX_NDIM(T, Index, 2); \
   DEFINE_GPU_SPECS_INDEX_NDIM(T, Index, 3); \
   DEFINE_GPU_SPECS_INDEX_NDIM(T, Index, 4); \
-  DEFINE_GPU_SPECS_INDEX_NDIM(T, Index, 5);
+  DEFINE_GPU_SPECS_INDEX_NDIM(T, Index, 5); \
+  DEFINE_GPU_SPECS_INDEX_NDIM(T, Index, 6); \
+  DEFINE_GPU_SPECS_INDEX_NDIM(T, Index, 7);
 
 #define DEFINE_GPU_SPECS(T)         \
   DEFINE_GPU_SPECS_INDEX(T, int32); \
diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc
index 239d5d2e990a88bbc8ca5949a07a2aa2a75de2ba..08adf4badbcd9c8baf664b13098f23dfb0584e24 100644
--- a/tensorflow/core/kernels/gather_op.cc
+++ b/tensorflow/core/kernels/gather_op.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/gather_functor.h"
 #include "tensorflow/core/platform/mem.h"
@@ -106,8 +108,7 @@ class GatherOp : public OpKernel {
       auto out_flat = out->shaped<T, 3>({outer_size, N, inner_size});
 
       functor::GatherFunctor<Device, T, Index> functor;
-      int64 bad_i = functor(c, params_flat,
-                            indices_flat, out_flat);
+      int64 bad_i = functor(c, params_flat, indices_flat, out_flat);
 
       OP_REQUIRES(
           c, bad_i < 0,
diff --git a/tensorflow/core/kernels/gpu_utils.h b/tensorflow/core/kernels/gpu_utils.h
index 366877bcf5f57139a5600c4e198a7862d8ed9ef7..ffc733e6bb6b45ab463f319de39dfd175e83e5c1 100644
--- a/tensorflow/core/kernels/gpu_utils.h
+++ b/tensorflow/core/kernels/gpu_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
 
 #if GOOGLE_CUDA
 
@@ -162,4 +162,4 @@ class AutoTuneSingleton {
 
 #endif  // GOOGLE_CUDA
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
diff --git a/tensorflow/core/kernels/guarantee_const_op.cc b/tensorflow/core/kernels/guarantee_const_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..de3a2a1148b7e7922a08cfce159fb05ccdb9fe30
--- /dev/null
+++ b/tensorflow/core/kernels/guarantee_const_op.cc
@@ -0,0 +1,47 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace {
+
+// Refer to the Op description for detailed comments.
+class GuaranteeConstOp : public OpKernel {
+ public:
+  explicit GuaranteeConstOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const DataType input_dtype = ctx->input_dtype(0);
+    OP_REQUIRES(ctx, input_dtype != DT_RESOURCE,
+                errors::InvalidArgument(
+                    "Input tensor cannot be a resource variable handle."));
+    const Tensor& input_tensor = ctx->input(0);
+    Tensor* output = nullptr;
+    if (!ctx->forward_input_to_output_with_shape(0, 0, input_tensor.shape(),
+                                                 &output)) {
+      ctx->set_output(0, input_tensor);
+    }
+  }
+
+  bool IsExpensive() override { return false; }
+};
+
+REGISTER_KERNEL_BUILDER(Name("GuaranteeConst").Device(DEVICE_CPU),
+                        GuaranteeConstOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/guarantee_const_op_test.cc b/tensorflow/core/kernels/guarantee_const_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..01461fbb8c22a2bfb9669bef680759ecab324a61
--- /dev/null
+++ b/tensorflow/core/kernels/guarantee_const_op_test.cc
@@ -0,0 +1,75 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/variable_ops.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+class GuaranteeConstOpTest : public OpsTestBase {
+ protected:
+  Status Init(DataType input_type) {
+    TF_CHECK_OK(NodeDefBuilder("op", "GuaranteeConst")
+                    .Input(FakeInput(input_type))
+                    .Finalize(node_def()));
+    return InitOp();
+  }
+};
+
+TEST_F(GuaranteeConstOpTest, Int32Success_6) {
+  TF_ASSERT_OK(Init(DT_INT32));
+  AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_INT32, TensorShape({6}));
+  test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+  test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+}
+
+TEST_F(GuaranteeConstOpTest, Int32Success_2_3) {
+  TF_ASSERT_OK(Init(DT_INT32));
+  AddInputFromArray<int32>(TensorShape({2, 3}), {1, 2, 3, 4, 5, 6});
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_INT32, TensorShape({2, 3}));
+  test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+  test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+}
+
+TEST_F(GuaranteeConstOpTest, StringSuccess) {
+  TF_ASSERT_OK(Init(DT_STRING));
+  AddInputFromArray<string>(TensorShape({6}), {"A", "b", "C", "d", "E", "f"});
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_STRING, TensorShape({6}));
+  test::FillValues<string>(&expected, {"A", "b", "C", "d", "E", "f"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(GuaranteeConstOpTest, ResourceInputError) {
+  TF_ASSERT_OK(Init(DT_RESOURCE));
+  AddResourceInput("", "resource", new Var(DT_INT32));
+  const auto status = RunOpKernel();
+  ASSERT_EQ(error::INVALID_ARGUMENT, status.code());
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc b/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
index f0d7c670a62bf0a520cb37f01beda530d157d5c7..4040bf52bffe638d601f954f9a81d9eda78346a6 100644
--- a/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
@@ -46,7 +46,7 @@ GraphTransferUtils::GetTopNFloatResults(const float* const data,
       GetTopNFloatResults(data, labels, element_count);
   LOG(INFO) << "=== Dump ranking ===";
   for (int i = 0; i < top_n; ++i) {
-    const std::tuple<float, int, string> &entry = queue.top();
+    const std::tuple<float, int, string>& entry = queue.top();
     LOG(INFO) << i << ": " << std::get<1>(entry) << ", " << std::get<2>(entry)
               << ", " << std::get<0>(entry);
     queue.pop();
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.h b/tensorflow/core/kernels/hexagon/graph_transferer.h
index 125d1fd200719de195da2ac3339576decde1ba46..0d43d028cdbea02b820d8ac0c48378524e875e78 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer.h
+++ b/tensorflow/core/kernels/hexagon/graph_transferer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_GRAPH_TRANSFERER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_GRAPH_TRANSFERER_H_
+#ifndef TENSORFLOW_CORE_KERNELS_HEXAGON_GRAPH_TRANSFERER_H_
+#define TENSORFLOW_CORE_KERNELS_HEXAGON_GRAPH_TRANSFERER_H_
 
 #include <array>
 #include <unordered_map>
@@ -181,8 +181,8 @@ class GraphTransferer {
   void AppendNodeInputParams(const int id, const Node& node,
                              const std::vector<int>& extra_inputs);
 
-  void AppendNodeOutputParams(const ShapeRefiner& shape_refiner,
-                              const int id, const Node& node);
+  void AppendNodeOutputParams(const ShapeRefiner& shape_refiner, const int id,
+                              const Node& node);
 
   static std::array<int64, SHAPE_ARRAY_SIZE> BuildShapeArray(
       const shape_inference::ShapeHandle& shape_handle,
@@ -225,4 +225,4 @@ class GraphTransferer {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_GRAPH_TRANSFERER_H
+#endif  // TENSORFLOW_CORE_KERNELS_HEXAGON_GRAPH_TRANSFERER_H
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
index 536d295506c9669b0434059e26094cb70a4f1e87..20b09f144bab5482f2cf1bfa86cf22f0b7ff815e 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
@@ -42,8 +42,7 @@ constexpr float VALUE_TOLERANCE_FLOAT = 1e-8f;
 
 class GraphTransfererTest : public ::testing::Test {
  protected:
-  void SetUp() final {
-  }
+  void SetUp() final {}
 
   GraphTransferer gt_;
 };
@@ -61,7 +60,7 @@ class TestGraphTransferOpsDefinitions : public IRemoteFusedGraphOpsDefinitions {
       }
     }
     return -1;
-}
+  }
 
  private:
   const std::vector<string> op_types_{"INPUT",   "OUTPUT",  "Conv2D",
diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
index 8eb3995fc4f7974e382eb1370e05bec4a2f4a3f2..dca1f94a9b156bc9199064a72efc69b34956e59f 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
+++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_
+#ifndef TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_
+#define TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_
 
 #include <unordered_map>
 #include <vector>
@@ -88,4 +88,4 @@ class HexagonControlWrapper final : public IRemoteFusedGraphExecutor {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_
+#endif  // TENSORFLOW_CORE_KERNELS_HEXAGON_CONTROL_WRAPPER_H_
diff --git a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
index 71bc4187b74cd6501d203aa3779c6d01e01f0d38..3f794dfb1a04cfdd6f7c114e0b2c7c0aac319a61 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
@@ -420,7 +420,7 @@ TEST(GraphTransferer,
       false,  // is_text_proto
       false,  // shape_inference_for_unknown_shape
       true    // dry_run_for_unknown_shape
-      );
+  );
   ASSERT_TRUE(status.ok()) << status;
   prof.Stop();
   prof.DumpStatistics("LoadGraphFromProtoFile");
@@ -487,7 +487,7 @@ TEST(GraphTransferer,
       false,  // is_text_proto
       true,   // shape_inference_for_unknown_shape
       false   // dry_run_for_unknown_shape
-      );
+  );
   ASSERT_TRUE(status.ok()) << status;
   prof.Stop();
   prof.DumpStatistics("LoadGraphFromProtoFile");
@@ -556,7 +556,7 @@ TEST(GraphTransferer, DISABLED_CheckShapeInferencePerformance) {
       false,  // is_text_proto
       false,  // shape_inference_for_unknown_shape
       true    // dry_run_for_unknown_shape
-      );
+  );
   const GraphTransferInfo& gfi0 = gt0.GetGraphTransferInfo();
 
   ASSERT_TRUE(status.ok());
@@ -576,7 +576,7 @@ TEST(GraphTransferer, DISABLED_CheckShapeInferencePerformance) {
       false,  // is_text_proto
       true,   // shape_inference_for_unknown_shape
       false   // dry_run_for_unknown_shape
-      );
+  );
   const GraphTransferInfo& gfi1 = gt1.GetGraphTransferInfo();
 
   ASSERT_TRUE(status.ok());
diff --git a/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h b/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h
index 993a5f9a3a81d1bfc00b59ec1364209d11ceeaa7..b9328c8e0e891cf637d467e7fcbbac331d84e12c 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h
+++ b/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_HEXAGON_OPS_DEFINITIONS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_HEXAGON_OPS_DEFINITIONS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_HEXAGON_HEXAGON_OPS_DEFINITIONS_H_
+#define TENSORFLOW_CORE_KERNELS_HEXAGON_HEXAGON_OPS_DEFINITIONS_H_
 
 #include <unordered_map>
 
@@ -55,4 +55,4 @@ class HexagonOpsDefinitions final : public IRemoteFusedGraphOpsDefinitions {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_HEXAGON_OPS_DEFINITIONS_H
+#endif  // TENSORFLOW_CORE_KERNELS_HEXAGON_HEXAGON_OPS_DEFINITIONS_H
diff --git a/tensorflow/core/kernels/hinge-loss.h b/tensorflow/core/kernels/hinge-loss.h
index 789a7ce7a3d8ec9e5d918dd75fce8d644a3b5682..d303e9c877e7b7be05205003c26cf66ef8273416 100644
--- a/tensorflow/core/kernels/hinge-loss.h
+++ b/tensorflow/core/kernels/hinge-loss.h
@@ -50,9 +50,8 @@ class HingeLossUpdater : public DualLossUpdater {
     // valid value for new dual = 0
     // c. new optimal value > 1.0. Then new optimal value should be set to 1.0.
     const double candidate_optimal_dual =
-        current_dual +
-        (label - wx) /
-            (num_loss_partitions * example_weight * weighted_example_norm);
+        current_dual + (label - wx) / (num_loss_partitions * example_weight *
+                                       weighted_example_norm);
     if (label * candidate_optimal_dual < 0) {
       return 0.0;
     }
diff --git a/tensorflow/core/kernels/histogram_op_gpu.cu.cc b/tensorflow/core/kernels/histogram_op_gpu.cu.cc
index c2bb958be8b29c4a6df99cf5533748d7db73179c..a88e9b0ddcdda660cf34a88253ef7c8d1e28029c 100644
--- a/tensorflow/core/kernels/histogram_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/histogram_op_gpu.cu.cc
@@ -17,16 +17,16 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/histogram_op.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "external/cub_archive/cub/device/device_histogram.cuh"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/histogram_op.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 
@@ -104,8 +104,8 @@ struct HistogramFixedWidthFunctor<GPUDevice, T, Tout> {
         /* num_samples */ num_samples,
         /* stream */ stream);
     if (err != cudaSuccess) {
-      return errors::Internal("Could not launch HistogramRange: ",
-                              cudaGetErrorString(err), ".");
+      return errors::Internal(
+          "Could not launch HistogramRange: ", cudaGetErrorString(err), ".");
     }
 
     return Status::OK();
diff --git a/tensorflow/core/kernels/i_remote_fused_graph_executor.h b/tensorflow/core/kernels/i_remote_fused_graph_executor.h
index 05b76172b203673917f65f048f8132c2fb0de173..eb6b64da583f0ea9e4bb462925ebdf1bcf8dc1e3 100644
--- a/tensorflow/core/kernels/i_remote_fused_graph_executor.h
+++ b/tensorflow/core/kernels/i_remote_fused_graph_executor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_
+#ifndef TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_
+#define TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_
 
 #include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -72,4 +72,4 @@ class IRemoteFusedGraphExecutor {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_
+#endif  // TENSORFLOW_CORE_KERNELS_I_REMOTE_GRAPH_EXECUTOR_H_
diff --git a/tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.h b/tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.h
index 7d3329f490713c243cb23d2e3232d6e343c55187..9e51c9f51f4c75a7ccd635a0261f633b675326bf 100644
--- a/tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.h
+++ b/tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_I_REMOTE_FUSED_GRAPH_OPS_DEFINITIONS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_I_REMOTE_FUSED_GRAPH_OPS_DEFINITIONS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_I_REMOTE_FUSED_GRAPH_OPS_DEFINITIONS_H_
+#define TENSORFLOW_CORE_KERNELS_I_REMOTE_FUSED_GRAPH_OPS_DEFINITIONS_H_
 
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/macros.h"
@@ -43,4 +43,4 @@ class IRemoteFusedGraphOpsDefinitions {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_I_REMOTE_FUSED_GRAPH_OPS_DEFINITIONS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_I_REMOTE_FUSED_GRAPH_OPS_DEFINITIONS_H_
diff --git a/tensorflow/core/kernels/identity_op.cc b/tensorflow/core/kernels/identity_op.cc
index 7b8abf5494d23ddf6a7b590c58ae0c73c05d516d..a18a72c66dc659ffd372c231524dbf038df6ac22 100644
--- a/tensorflow/core/kernels/identity_op.cc
+++ b/tensorflow/core/kernels/identity_op.cc
@@ -42,6 +42,8 @@ REGISTER_KERNEL_BUILDER(Name("RefIdentity").Device(DEVICE_CPU), IdentityOp);
 // Python).
 REGISTER_KERNEL_BUILDER(Name("DebugGradientIdentity").Device(DEVICE_CPU),
                         IdentityOp);
+REGISTER_KERNEL_BUILDER(Name("DebugGradientRefIdentity").Device(DEVICE_CPU),
+                        IdentityOp);
 
 #if TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(type)                                           \
@@ -102,7 +104,6 @@ REGISTER_SYCL_HOST_KERNEL(bool);
                           IdentityOp)
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
-REGISTER_GPU_KERNEL(bfloat16);
 REGISTER_GPU_KERNEL(Variant);
 
 #undef REGISTER_GPU_KERNEL
@@ -127,6 +128,7 @@ REGISTER_GPU_KERNEL(Variant);
 
 REGISTER_GPU_HOST_KERNEL(int32);
 REGISTER_GPU_HOST_KERNEL(bool);
+REGISTER_GPU_HOST_KERNEL(string);
 
 #undef REGISTER_GPU_HOST_KERNEL
 
diff --git a/tensorflow/core/kernels/image_resizer_state.h b/tensorflow/core/kernels/image_resizer_state.h
index f088315ff538e821666aa95d9a4c4ed49f7c0b59..faf997be05cccc366bcab618c99c8d39ff25e18b 100644
--- a/tensorflow/core/kernels/image_resizer_state.h
+++ b/tensorflow/core/kernels/image_resizer_state.h
@@ -109,8 +109,9 @@ struct ImageResizerState {
     ValidateAndCalculateOutputSize(context, input);
     if (!context->status().ok()) return;
     OP_REQUIRES_OK(context, context->allocate_output(
-                                0, TensorShape({input.dim_size(0), out_height,
-                                                out_width, input.dim_size(3)}),
+                                0,
+                                TensorShape({input.dim_size(0), out_height,
+                                             out_width, input.dim_size(3)}),
                                 &output));
   }
 
@@ -168,8 +169,9 @@ struct ImageResizerGradientState {
         CalculateResizeScale(original_width, resized_width, align_corners_);
     output = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(
-                                0, TensorShape({batch_size, original_height,
-                                                original_width, channels}),
+                                0,
+                                TensorShape({batch_size, original_height,
+                                             original_width, channels}),
                                 &output));
   }
 
diff --git a/tensorflow/core/kernels/in_topk_op.cc b/tensorflow/core/kernels/in_topk_op.cc
index e2861ae090ccd48c0408b83a7bc7c0230bf2c1a5..c37055239c28e0ab243ea30b05b2c8af0905766c 100644
--- a/tensorflow/core/kernels/in_topk_op.cc
+++ b/tensorflow/core/kernels/in_topk_op.cc
@@ -17,11 +17,11 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/bounds_check.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 
@@ -98,36 +98,36 @@ class InTopK : public OpKernel {
   int k_;
 };
 
-REGISTER_KERNEL_BUILDER(
-    Name("InTopK").Device(DEVICE_CPU)
-    .HostMemory("predictions")
-    .HostMemory("targets")
-    .HostMemory("precision")
-    .TypeConstraint<int32>("T"),
-    InTopK<float, int32>);
-REGISTER_KERNEL_BUILDER(
-    Name("InTopK").Device(DEVICE_CPU)
-    .HostMemory("predictions")
-    .HostMemory("targets")
-    .HostMemory("precision")
-    .TypeConstraint<int64>("T"),
-    InTopK<float, int64>);
-
-REGISTER_KERNEL_BUILDER(
-    Name("InTopKV2").Device(DEVICE_CPU)
-    .HostMemory("predictions")
-    .HostMemory("targets")
-    .HostMemory("k")
-    .HostMemory("precision")
-    .TypeConstraint<int32>("T"),
-    InTopK<float, int32>);
-REGISTER_KERNEL_BUILDER(
-    Name("InTopKV2").Device(DEVICE_CPU)
-    .HostMemory("predictions")
-    .HostMemory("targets")
-    .HostMemory("k")
-    .HostMemory("precision")
-    .TypeConstraint<int64>("T"),
-    InTopK<float, int64>);
+REGISTER_KERNEL_BUILDER(Name("InTopK")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("predictions")
+                            .HostMemory("targets")
+                            .HostMemory("precision")
+                            .TypeConstraint<int32>("T"),
+                        InTopK<float, int32>);
+REGISTER_KERNEL_BUILDER(Name("InTopK")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("predictions")
+                            .HostMemory("targets")
+                            .HostMemory("precision")
+                            .TypeConstraint<int64>("T"),
+                        InTopK<float, int64>);
+
+REGISTER_KERNEL_BUILDER(Name("InTopKV2")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("predictions")
+                            .HostMemory("targets")
+                            .HostMemory("k")
+                            .HostMemory("precision")
+                            .TypeConstraint<int32>("T"),
+                        InTopK<float, int32>);
+REGISTER_KERNEL_BUILDER(Name("InTopKV2")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("predictions")
+                            .HostMemory("targets")
+                            .HostMemory("k")
+                            .HostMemory("precision")
+                            .TypeConstraint<int64>("T"),
+                        InTopK<float, int64>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/inplace_ops.cc b/tensorflow/core/kernels/inplace_ops.cc
index 01ae5a83c1eec9eb4ccb74841555b5bb1b6cd60f..a71d047ed1a381bfc0311f86987f585f51b02536 100644
--- a/tensorflow/core/kernels/inplace_ops.cc
+++ b/tensorflow/core/kernels/inplace_ops.cc
@@ -27,13 +27,13 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SyclDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace functor {
 
 template <typename Device, typename T>
-Status DoParallelConcatUpdate(const Device& d, const Tensor& value,
-                              int32 loc, Tensor* output) {
+Status DoParallelConcatUpdate(const Device& d, const Tensor& value, int32 loc,
+                              Tensor* output) {
   auto Tvalue = value.shaped<T, 2>({1, value.NumElements()});
   auto Toutput = output->flat_outer_dims<T>();
   auto nrows = Toutput.dimension(0);
@@ -52,6 +52,7 @@ Status DoParallelConcat(const CPUDevice& d, const Tensor& value, int32 loc,
     return DoParallelConcatUpdate<CPUDevice, type>(d, value, loc, output);
     TF_CALL_NUMBER_TYPES(CASE);
     TF_CALL_string(CASE);
+    TF_CALL_variant(CASE);
 #undef CASE
     default:
       return errors::InvalidArgument("Unsupported data type: ", value.dtype());
@@ -73,7 +74,7 @@ Status DoParallelConcat(const SyclDevice& d, const Tensor& value, int32 loc,
       return errors::InvalidArgument("Unsupported data type: ", value.dtype());
   }
 }
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // end namespace functor
 
@@ -206,7 +207,7 @@ REGISTER_KERNEL_BUILDER(Name("_ParallelConcatUpdate")
                             .HostMemory("output")
                             .TypeConstraint<int32>("T"),
                         ParallelConcatUpdate<CPUDevice>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 
diff --git a/tensorflow/core/kernels/l2loss_op.cc b/tensorflow/core/kernels/l2loss_op.cc
index f8ed9351579ff8cbeeb5f45030e8ff278fa75101..f561287f7a142f4cbcf74225c3f2fde3986c169a 100644
--- a/tensorflow/core/kernels/l2loss_op.cc
+++ b/tensorflow/core/kernels/l2loss_op.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/kernels/l2loss_op.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/core/kernels/linalg_ops_common.cc b/tensorflow/core/kernels/linalg_ops_common.cc
index 36907fb5716fcde3b0efc28cc4edca543432c8f4..b58bcf583480cb50ee7a6be13465e6c6d301295b 100644
--- a/tensorflow/core/kernels/linalg_ops_common.cc
+++ b/tensorflow/core/kernels/linalg_ops_common.cc
@@ -108,7 +108,6 @@ void LinearAlgebraOp<Scalar>::Compute(OpKernelContext* context) {
   auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
   Shard(worker_threads.num_threads, worker_threads.workers,
         batch_shape.num_elements(), GetCostPerUnit(input_matrix_shapes), shard);
-
 }
 
 template <typename Scalar>
diff --git a/tensorflow/core/kernels/list_kernels.cc b/tensorflow/core/kernels/list_kernels.cc
new file mode 100644
index 0000000000000000000000000000000000000000..baf0a4abe48ea0c5a5fed5d7ef3e53925e393b10
--- /dev/null
+++ b/tensorflow/core/kernels/list_kernels.cc
@@ -0,0 +1,518 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <limits>
+
+#define EIGEN_USE_THREADS
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif  // GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/list_kernels.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
+#include "tensorflow/core/kernels/concat_lib.h"
+#include "tensorflow/core/lib/core/coding.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/util.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+// Variant compatible type for a list of tensors. This is mutable but instances
+// should never be mutated after stored in a variant tensor.
+TensorList::TensorList(const TensorList& other)
+    : tensors(other.tensors),
+      element_shape(other.element_shape),
+      element_dtype(other.element_dtype) {}
+
+void TensorList::Encode(VariantTensorData* data) const {
+  data->set_type_name(TypeName());
+  for (const Tensor& t : tensors) {
+    *data->add_tensors() = t;
+  }
+  string metadata;
+  core::PutVarint64(&metadata, static_cast<uint64>(element_dtype));
+  if (!element_shape.unknown_rank()) {
+    for (TensorShapeDim dim : element_shape) {
+      if (dim.size > 0) {
+        core::PutVarint64(&metadata, dim.size);
+      } else {
+        core::PutVarint64(&metadata, std::numeric_limits<uint64>::max());
+      }
+    }
+  }
+  data->set_metadata(metadata);
+}
+
+static Status TensorListDeviceCopy(
+    const TensorList& from, TensorList* to,
+    const UnaryVariantOpRegistry::AsyncTensorDeviceCopyFn& copy) {
+  to->element_shape = from.element_shape;
+  to->element_dtype = from.element_dtype;
+  to->tensors.reserve(from.tensors.size());
+  for (const Tensor& t : from.tensors) {
+    Tensor tmp(t.dtype());
+    TF_RETURN_IF_ERROR(copy(t, &tmp));
+    to->tensors.push_back(tmp);
+  }
+  return Status::OK();
+}
+
+#define REGISTER_LIST_COPY(DIRECTION)                   \
+  INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION( \
+      TensorList, DIRECTION, TensorList::kTypeName, TensorListDeviceCopy)
+
+REGISTER_LIST_COPY(VariantDeviceCopyDirection::HOST_TO_DEVICE);
+REGISTER_LIST_COPY(VariantDeviceCopyDirection::DEVICE_TO_HOST);
+REGISTER_LIST_COPY(VariantDeviceCopyDirection::DEVICE_TO_DEVICE);
+
+REGISTER_UNARY_VARIANT_DECODE_FUNCTION(TensorList, TensorList::kTypeName);
+
+Status TensorListShape(const TensorList& t, TensorShape* s) {
+  *s = TensorShape({});
+  return Status::OK();
+}
+
+REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(TensorList, TensorList::kTypeName,
+                                      TensorListShape);
+
+bool TensorList::Decode(const VariantTensorData& data) {
+  tensors = data.tensors();
+  string metadata;
+  data.get_metadata(&metadata);
+  uint64 scratch;
+  StringPiece iter(metadata);
+  core::GetVarint64(&iter, &scratch);
+  element_dtype = static_cast<DataType>(scratch);
+  std::vector<int64> dims;
+  while (!iter.empty()) {
+    core::GetVarint64(&iter, &scratch);
+    if (scratch == std::numeric_limits<uint64>::max()) {
+      dims.push_back(-1);
+    } else {
+      dims.push_back(scratch);
+    }
+  }
+  return true;
+}
+
+Status TensorShapeFromTensor(const Tensor& t, PartialTensorShape* out) {
+  if (t.shape() == TensorShape({})) {
+    if ((t.dtype() == DT_INT32 && t.scalar<int32>()() == -1) ||
+        (t.dtype() == DT_INT64 && t.scalar<int64>()() == -1)) {
+      return Status::OK();
+    }
+    return errors::InvalidArgument(
+        "The only valid scalar shape tensor is the fully unknown shape "
+        "specified as -1.");
+  }
+  if (t.dtype() == DT_INT32) {
+    return PartialTensorShape::MakePartialShape(t.vec<int32>().data(),
+                                                t.NumElements(), out);
+  } else if (t.dtype() == DT_INT64) {
+    return PartialTensorShape::MakePartialShape(t.vec<int64>().data(),
+                                                t.NumElements(), out);
+  }
+  return errors::InvalidArgument(
+      "Expected an int32 or int64 shape tensor; found ",
+      DataTypeString(t.dtype()));
+}
+
+class EmptyTensorList : public OpKernel {
+ public:
+  explicit EmptyTensorList(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &element_dtype_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    Tensor* result;
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape{}, &result, attr));
+    TensorList empty;
+    empty.element_dtype = element_dtype_;
+    PartialTensorShape element_shape;
+    OP_REQUIRES_OK(ctx, TensorShapeFromTensor(ctx->input(0), &element_shape));
+    empty.element_shape = element_shape;
+    result->scalar<Variant>()() = std::move(empty);
+  }
+
+ private:
+  DataType element_dtype_;
+};
+
+const char TensorList::kTypeName[] = "tensorflow::TensorList";
+
+REGISTER_KERNEL_BUILDER(Name("EmptyTensorList").Device(DEVICE_CPU),
+                        EmptyTensorList);
+
+#if GOOGLE_CUDA
+
+REGISTER_KERNEL_BUILDER(
+    Name("EmptyTensorList").Device(DEVICE_GPU).HostMemory("element_shape"),
+    EmptyTensorList);
+
+#endif  // GOOGLE_CUDA
+
+class TensorListPushBack : public OpKernel {
+ public:
+  explicit TensorListPushBack(OpKernelConstruction* c) : OpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+  }
+
+  ~TensorListPushBack() override {}
+
+  void Compute(OpKernelContext* c) override {
+    const Tensor& input = c->input(1);
+    OP_REQUIRES(c, element_dtype_ == input.dtype(),
+                errors::InvalidArgument("Invalid data types; list elements ",
+                                        DataTypeString(element_dtype_),
+                                        " but tried to append ",
+                                        DataTypeString(input.dtype())));
+
+    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
+    OP_REQUIRES(c, l != nullptr,
+                errors::InvalidArgument(
+                    "Input handle is not a list. Saw: '",
+                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    OP_REQUIRES(c, l->element_shape.IsCompatibleWith(input.shape()),
+                errors::InvalidArgument(
+                    "Tried to append a tensor with incompatible shape to a "
+                    "list. Op element shape: ",
+                    input.shape().DebugString(),
+                    " list shape: ", l->element_shape.DebugString()));
+    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
+                errors::InvalidArgument("Invalid data types; op elements ",
+                                        DataTypeString(element_dtype_),
+                                        " but list elements ",
+                                        DataTypeString(l->element_dtype)));
+
+    TensorList output;
+    output = *l;
+    output.tensors.push_back(input);
+    Tensor* result;
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+    OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
+    result->scalar<Variant>()() = std::move(output);
+  }
+
+ private:
+  DataType element_dtype_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("TensorListPushBack").Device(DEVICE_CPU),
+                        TensorListPushBack);
+
+#if GOOGLE_CUDA
+
+REGISTER_KERNEL_BUILDER(Name("TensorListPushBack").Device(DEVICE_GPU),
+                        TensorListPushBack);
+
+#endif  // GOOGLE_CUDA
+
+class TensorListLength : public OpKernel {
+ public:
+  explicit TensorListLength(OpKernelConstruction* c) : OpKernel(c) {}
+  ~TensorListLength() override {}
+
+  void Compute(OpKernelContext* c) override {
+    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
+    OP_REQUIRES(
+        c, l != nullptr,
+        errors::InvalidArgument(
+            "TensorListLength received a variant which is not a list. Saw: '",
+            c->input(0).scalar<Variant>()().DebugString(), "'"));
+    Tensor* result;
+    OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result));
+    result->scalar<int32>()() = l->tensors.size();
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("TensorListLength").Device(DEVICE_CPU),
+                        TensorListLength);
+
+#if GOOGLE_CUDA
+
+REGISTER_KERNEL_BUILDER(
+    Name("TensorListLength").Device(DEVICE_GPU).HostMemory("length"),
+    TensorListLength);
+
+#endif  // GOOGLE_CUDA
+
+class TensorListElementShape : public OpKernel {
+ public:
+  explicit TensorListElementShape(OpKernelConstruction* c) : OpKernel(c) {}
+
+  void Compute(OpKernelContext* c) override {
+    OP_REQUIRES(
+        c, c->input(0).shape().num_elements() == 1,
+        errors::InvalidArgument("List tensors are supposed to be scalars."));
+    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
+    OP_REQUIRES(c, l != nullptr,
+                errors::InvalidArgument(
+                    "TensorListElementShape received a variant which is not a "
+                    "list. Saw: '",
+                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    Tensor* result;
+    OP_REQUIRES_OK(c, c->allocate_output(
+                          0, TensorShape{l->element_shape.dims()}, &result));
+    for (int i = 0; i < l->element_shape.dims(); ++i) {
+      if (result->dtype() == DT_INT32) {
+        result->flat<int32>()(i) = l->element_shape.dim_size(i);
+      } else {
+        result->flat<int64>()(i) = l->element_shape.dim_size(i);
+      }
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("TensorListElementShape").Device(DEVICE_CPU),
+                        TensorListElementShape);
+
+#if GOOGLE_CUDA
+
+REGISTER_KERNEL_BUILDER(Name("TensorListElementShape")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("element_shape"),
+                        TensorListElementShape);
+
+#endif  // GOOGLE_CUDA
+
+class TensorListPopBack : public OpKernel {
+ public:
+  explicit TensorListPopBack(OpKernelConstruction* c) : OpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+  }
+
+  ~TensorListPopBack() override {}
+
+  void Compute(OpKernelContext* c) override {
+    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
+    OP_REQUIRES(c, l != nullptr,
+                errors::InvalidArgument(
+                    "Input handle is not a list. Saw: '",
+                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
+                errors::InvalidArgument("Invalid data types; op elements ",
+                                        DataTypeString(element_dtype_),
+                                        " but list elements ",
+                                        DataTypeString(l->element_dtype)));
+
+    OP_REQUIRES(c, !l->tensors.empty(),
+                errors::InvalidArgument("Trying to pop from an empty list."));
+
+    c->set_output(1, l->tensors.back());
+    TensorList output;
+    output = *l;
+    output.tensors.pop_back();
+    Tensor* result;
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+    OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
+    result->scalar<Variant>()() = std::move(output);
+  }
+
+ private:
+  DataType element_dtype_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("TensorListPopBack").Device(DEVICE_CPU),
+                        TensorListPopBack);
+
+#if GOOGLE_CUDA
+
+REGISTER_KERNEL_BUILDER(Name("TensorListPopBack").Device(DEVICE_GPU),
+                        TensorListPopBack);
+
+#endif  // GOOGLE_CUDA
+
+class TensorListReserve : public OpKernel {
+ public:
+  explicit TensorListReserve(OpKernelConstruction* c) : OpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+  }
+
+  void Compute(OpKernelContext* c) override {
+    PartialTensorShape element_shape;
+    OP_REQUIRES_OK(c, TensorShapeFromTensor(c->input(0), &element_shape));
+    int32 num_elements = c->input(1).scalar<int32>()();
+    TensorList output;
+    output.element_shape = element_shape;
+    output.element_dtype = element_dtype_;
+    output.tensors.resize(num_elements, Tensor(DT_INVALID));
+    Tensor* result;
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+    OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
+    result->scalar<Variant>()() = std::move(output);
+  }
+
+ private:
+  DataType element_dtype_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("TensorListReserve").Device(DEVICE_CPU),
+                        TensorListReserve);
+
+#if GOOGLE_CUDA
+
+REGISTER_KERNEL_BUILDER(Name("TensorListReserve")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("element_shape")
+                            .HostMemory("num_elements"),
+                        TensorListReserve);
+
+#endif  // GOOGLE_CUDA
+
+class TensorListGetItem : public OpKernel {
+ public:
+  explicit TensorListGetItem(OpKernelConstruction* c) : OpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+  }
+
+  void Compute(OpKernelContext* c) override {
+    OP_REQUIRES(
+        c, c->input(0).shape().num_elements() == 1,
+        errors::InvalidArgument("List tensors are supposed to be scalars."));
+    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
+    OP_REQUIRES(c, l != nullptr,
+                errors::InvalidArgument(
+                    "Input handle is not a list. Saw: '",
+                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
+                errors::InvalidArgument("Invalid data types; op elements ",
+                                        DataTypeString(element_dtype_),
+                                        " but list elements ",
+                                        DataTypeString(l->element_dtype)));
+    int32 index = c->input(1).scalar<int32>()();
+    OP_REQUIRES(c, index < l->tensors.size(),
+                errors::InvalidArgument("Trying to access element ", index,
+                                        " in a list with ", l->tensors.size(),
+                                        " elements."));
+    c->set_output(0, l->tensors[index]);
+  }
+
+ private:
+  DataType element_dtype_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("TensorListGetItem").Device(DEVICE_CPU),
+                        TensorListGetItem);
+
+#if GOOGLE_CUDA
+
+REGISTER_KERNEL_BUILDER(
+    Name("TensorListGetItem").Device(DEVICE_GPU).HostMemory("index"),
+    TensorListGetItem);
+
+#endif  // GOOGLE_CUDA
+
+class TensorListSetItem : public OpKernel {
+ public:
+  explicit TensorListSetItem(OpKernelConstruction* c) : OpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+  }
+
+  void Compute(OpKernelContext* c) override {
+    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
+    OP_REQUIRES(c, l != nullptr,
+                errors::InvalidArgument(
+                    "Input handle is not a list. Saw: '",
+                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
+                errors::InvalidArgument("Invalid data types; op elements ",
+                                        DataTypeString(element_dtype_),
+                                        " but list elements ",
+                                        DataTypeString(l->element_dtype)));
+    int32 index = c->input(1).scalar<int32>()();
+    OP_REQUIRES(c, index < l->tensors.size(),
+                errors::InvalidArgument("Trying to modify element ", index,
+                                        " in a list with ", l->tensors.size(),
+                                        " elements."));
+    TensorList output;
+    output = *l;
+    output.tensors[index] = c->input(2);
+    Tensor* result;
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+    OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
+    result->scalar<Variant>()() = std::move(output);
+  }
+
+ private:
+  DataType element_dtype_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("TensorListSetItem").Device(DEVICE_CPU),
+                        TensorListSetItem);
+
+#if GOOGLE_CUDA
+
+REGISTER_KERNEL_BUILDER(
+    Name("TensorListSetItem").Device(DEVICE_GPU).HostMemory("index"),
+    TensorListSetItem);
+
+#endif  // GOOGLE_CUDA
+
+#define REGISTER_TENSOR_LIST_STACK_CPU(T)                         \
+  REGISTER_KERNEL_BUILDER(Name("TensorListStack")                 \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_CPU),                \
+                          TensorListStack<CPUDevice, T>)
+
+TF_CALL_POD_STRING_TYPES(REGISTER_TENSOR_LIST_STACK_CPU);
+REGISTER_TENSOR_LIST_STACK_CPU(quint8);
+REGISTER_TENSOR_LIST_STACK_CPU(qint8);
+REGISTER_TENSOR_LIST_STACK_CPU(quint16);
+REGISTER_TENSOR_LIST_STACK_CPU(qint16);
+REGISTER_TENSOR_LIST_STACK_CPU(qint32);
+REGISTER_TENSOR_LIST_STACK_CPU(bfloat16);
+
+#undef REGISTER_TENSOR_LIST_STACK_CPU
+
+#define REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(T)                   \
+  REGISTER_KERNEL_BUILDER(Name("TensorListFromTensor")            \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_CPU),                \
+                          TensorListFromTensor<CPUDevice, T>)
+
+TF_CALL_POD_STRING_TYPES(REGISTER_TENSOR_LIST_FROM_TENSOR_CPU);
+REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(quint8);
+REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(qint8);
+REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(quint16);
+REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(qint16);
+REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(qint32);
+REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(bfloat16);
+
+#undef REGISTER_TENSOR_LIST_FROM_TENSOR_CPU
+
+REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(ADD_VARIANT_BINARY_OP, DEVICE_CPU,
+                                          TensorList, TensorList::kTypeName,
+                                          TensorListBinaryAdd<CPUDevice>);
+
+REGISTER_UNARY_VARIANT_UNARY_OP_FUNCTION(ZEROS_LIKE_VARIANT_UNARY_OP,
+                                         DEVICE_CPU, TensorList,
+                                         TensorList::kTypeName,
+                                         TensorListZerosLike<CPUDevice>);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/list_kernels.cu.cc b/tensorflow/core/kernels/list_kernels.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..935f892dd0515025e97e02c8e941b96f21ed3b3e
--- /dev/null
+++ b/tensorflow/core/kernels/list_kernels.cu.cc
@@ -0,0 +1,79 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <limits>
+
+#define EIGEN_USE_THREADS
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/list_kernels.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
+#include "tensorflow/core/kernels/concat_lib.h"
+#include "tensorflow/core/lib/core/coding.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/util.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define REGISTER_TENSOR_LIST_STACK_GPU(T)                         \
+  REGISTER_KERNEL_BUILDER(Name("TensorListStack")                 \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_GPU),                \
+                          TensorListStack<GPUDevice, T>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_STACK_GPU);
+REGISTER_TENSOR_LIST_STACK_GPU(bfloat16);
+TF_CALL_complex64(REGISTER_TENSOR_LIST_STACK_GPU);
+TF_CALL_complex128(REGISTER_TENSOR_LIST_STACK_GPU);
+TF_CALL_int64(REGISTER_TENSOR_LIST_STACK_GPU);
+REGISTER_TENSOR_LIST_STACK_GPU(bool);
+
+#undef REGISTER_TENSOR_LIST_STACK_GPU
+
+#define REGISTER_TENSOR_LIST_FROM_TENSOR_GPU(T)                   \
+  REGISTER_KERNEL_BUILDER(Name("TensorListFromTensor")            \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_GPU)                 \
+                              .HostMemory("element_shape"),       \
+                          TensorListFromTensor<GPUDevice, T>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_FROM_TENSOR_GPU);
+REGISTER_TENSOR_LIST_FROM_TENSOR_GPU(bfloat16);
+TF_CALL_complex64(REGISTER_TENSOR_LIST_FROM_TENSOR_GPU);
+TF_CALL_complex128(REGISTER_TENSOR_LIST_FROM_TENSOR_GPU);
+TF_CALL_int64(REGISTER_TENSOR_LIST_FROM_TENSOR_GPU);
+REGISTER_TENSOR_LIST_FROM_TENSOR_GPU(bool);
+
+#undef REGISTER_TENSOR_LIST_FROM_TENSOR_GPU
+
+REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(ADD_VARIANT_BINARY_OP, DEVICE_GPU,
+                                          TensorList, TensorList::kTypeName,
+                                          TensorListBinaryAdd<GPUDevice>);
+REGISTER_UNARY_VARIANT_UNARY_OP_FUNCTION(ZEROS_LIKE_VARIANT_UNARY_OP,
+                                         DEVICE_GPU, TensorList,
+                                         TensorList::kTypeName,
+                                         TensorListZerosLike<GPUDevice>);
+
+}  // namespace tensorflow
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..9733883001d4ce7888b4893ecb43047b621a3eba
--- /dev/null
+++ b/tensorflow/core/kernels/list_kernels.h
@@ -0,0 +1,273 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_KERNELS_LIST_KERNELS_H_
+#define TENSORFLOW_CORE_KERNELS_LIST_KERNELS_H_
+
+#define EIGEN_USE_THREADS
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif  // GOOGLE_CUDA
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
+#include "tensorflow/core/kernels/concat_lib.h"
+#include "tensorflow/core/lib/core/coding.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/util.h"
+
+namespace tensorflow {
+
+// Variant compatible type for a list of tensors. This is mutable but instances
+// should never be mutated after stored in a variant tensor.
+struct TensorList {
+ public:
+  TensorList() {}
+  TensorList(const TensorList& other);
+
+  static const char kTypeName[];
+  string TypeName() const { return kTypeName; }
+
+  void Encode(VariantTensorData* data) const;
+
+  bool Decode(const VariantTensorData& data);
+
+  // TODO(apassos) fill this out
+  string DebugString() const { return "TensorList"; }
+
+  std::vector<Tensor> tensors;
+  PartialTensorShape element_shape;
+  DataType element_dtype;
+};
+
+Status TensorShapeFromTensor(const Tensor& t, PartialTensorShape* out);
+
+template <typename Device, typename T>
+class TensorListStack : public OpKernel {
+ public:
+  typedef std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>
+      ConstMatrixVector;
+  explicit TensorListStack(OpKernelConstruction* c) : OpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+    OP_REQUIRES_OK(c, c->GetAttr("num_elements", &num_elements_));
+  }
+
+  ~TensorListStack() {}
+
+  void Compute(OpKernelContext* c) override {
+    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
+    OP_REQUIRES(c, l != nullptr,
+                errors::InvalidArgument(
+                    "Input handle is not a list. Saw: '",
+                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
+                errors::InvalidArgument("Invalid data types; op elements ",
+                                        DataTypeString(element_dtype_),
+                                        " but list elements ",
+                                        DataTypeString(l->element_dtype)));
+    OP_REQUIRES(c, l->element_shape.IsFullyDefined(),
+                errors::InvalidArgument("Tried to stack elements from a list "
+                                        "with non-fully-defined shape."));
+    if (num_elements_ != -1) {
+      OP_REQUIRES(c, l->tensors.size() == num_elements_,
+                  errors::InvalidArgument("Operation expected a list with ",
+                                          num_elements_,
+                                          " elements but got a list with ",
+                                          l->tensors.size(), " elements."));
+    }
+    TensorShape resulting_shape;
+    resulting_shape.AddDim(l->tensors.size());
+    for (TensorShapeDim s : l->element_shape) {
+      resulting_shape.AddDim(s.size);
+    }
+    Tensor* output;
+    OP_REQUIRES_OK(c, c->allocate_output(0, resulting_shape, &output));
+    if (output->NumElements() == 0) {
+      return;
+    }
+
+    ConstMatrixVector inputs_flat;
+    inputs_flat.reserve(l->tensors.size());
+    for (const auto& t : l->tensors) {
+      OP_REQUIRES(
+          c, l->element_shape.IsCompatibleWith(t.shape()),
+          errors::InvalidArgument(
+              "Tensor with invalid shape in list. List element shape shape: ",
+              l->element_shape.DebugString(),
+              " and tensor shape: ", t.shape().DebugString()));
+      inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+          t.shaped<T, 2>({1, t.NumElements()})));
+    }
+    auto output_flat = output->shaped<T, 2>({1, output->NumElements()});
+
+#if GOOGLE_CUDA
+    if (std::is_same<Device, Eigen::GpuDevice>::value) {
+      ConcatGPU<T>(c, inputs_flat, output, &output_flat);
+      return;
+    }
+#endif  // GOOGLE_CUDA
+    ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
+  }
+
+ private:
+  int num_elements_;
+  DataType element_dtype_;
+};
+
+template <typename Device, typename T>
+class TensorListFromTensor : public OpKernel {
+ public:
+  TensorListFromTensor(OpKernelConstruction* c) : OpKernel(c) {}
+
+  void Compute(OpKernelContext* c) override {
+    Tensor* output_tensor;
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+    OP_REQUIRES_OK(c, c->allocate_output(0, {}, &output_tensor, attr));
+    PartialTensorShape element_shape;
+    OP_REQUIRES_OK(c, TensorShapeFromTensor(c->input(1), &element_shape));
+    TensorList output_list;
+    const Tensor& t = c->input(0);
+    output_list.element_dtype = t.dtype();
+    TensorShape output_shape(t.shape());
+    output_shape.RemoveDim(0);
+    OP_REQUIRES(c, element_shape.IsCompatibleWith(output_shape),
+                errors::InvalidArgument(
+                    "Specified a list with shape ", element_shape.DebugString(),
+                    " from a tensor with shape ", output_shape.DebugString()));
+    output_list.element_shape = element_shape;
+    output_list.tensors.reserve(t.shape().dim_size(0));
+    for (int i = 0; i < t.shape().dim_size(0); ++i) {
+      Tensor tmp = t.Slice(i, i + 1);
+      TensorShape tmp_shape = tmp.shape();
+      tmp_shape.RemoveDim(0);
+      OP_REQUIRES(c, tmp.CopyFrom(tmp, tmp_shape),
+                  errors::Unknown("Unexpected shape error."));
+      if (tmp.IsAligned() || !DataTypeCanUseMemcpy(DataTypeToEnum<T>::value)) {
+        output_list.tensors.push_back(tmp);
+      } else {
+        Tensor aligned;
+        OP_REQUIRES_OK(c, c->allocate_temp(tmp.dtype(), tmp.shape(), &aligned));
+        aligned.flat<T>().device(c->eigen_device<Device>()) =
+            tmp.unaligned_flat<T>();
+        output_list.tensors.push_back(aligned);
+      }
+    }
+    output_tensor->scalar<Variant>()() = std::move(output_list);
+  }
+};
+
+template <typename Device>
+Status TensorListBinaryAdd(OpKernelContext* c, const TensorList& a,
+                           const TensorList& b, TensorList* out) {
+  if (a.element_dtype != b.element_dtype) {
+    return errors::InvalidArgument(
+        "Trying to add two lists of tensors of different dtypes. One is ",
+        DataTypeString(a.element_dtype), " and the other is ",
+        DataTypeString(b.element_dtype));
+  }
+  out->element_dtype = a.element_dtype;
+  if (!a.element_shape.IsCompatibleWith(b.element_shape)) {
+    return errors::InvalidArgument(
+        "Trying to add two lists of tensors with incompatible element shapes. "
+        "One is ",
+        a.element_shape.DebugString(), " and the other is ",
+        b.element_shape.DebugString());
+  }
+
+  TF_RETURN_IF_ERROR(
+      a.element_shape.MergeWith(b.element_shape, &out->element_shape));
+  if (a.tensors.size() != b.tensors.size()) {
+    return errors::InvalidArgument(
+        "Trying to add two lists of tensors with different lengths. One is ",
+        a.tensors.size(), " and the other is ", b.tensors.size());
+  }
+  out->tensors.reserve(a.tensors.size());
+  for (int i = 0; i < a.tensors.size(); ++i) {
+    const Tensor& a_tensor = a.tensors[i];
+    const Tensor& b_tensor = b.tensors[i];
+    if (a_tensor.dtype() == DT_INVALID) {
+      out->tensors.push_back(b_tensor);
+      continue;
+    }
+    if (b_tensor.dtype() == DT_INVALID) {
+      out->tensors.push_back(a_tensor);
+      continue;
+    }
+    if (a_tensor.shape() != b_tensor.shape()) {
+      // TODO(apassos) support broadcasting additions here?
+      return errors::InvalidArgument(
+          "Trying to add two tensors with incompatible element shapes. "
+          "One is ",
+          a_tensor.shape().DebugString(), " and the other is ",
+          b_tensor.shape().DebugString(), " in position ", i);
+    }
+    Tensor out_tensor;
+    TF_RETURN_IF_ERROR(
+        c->allocate_temp(a_tensor.dtype(), a_tensor.shape(), &out_tensor));
+    out->tensors.push_back(out_tensor);
+    switch (out_tensor.dtype()) {
+#define DTYPE_CASE(dtype)                                        \
+  case DataTypeToEnum<dtype>::value:                             \
+    out_tensor.flat<dtype>().device(c->eigen_device<Device>()) = \
+        a_tensor.flat<dtype>() + b_tensor.flat<dtype>();         \
+    break;
+
+      TF_CALL_NUMBER_TYPES(DTYPE_CASE)
+
+#undef DTYPE_CASE
+      default:
+        return errors::InvalidArgument("Trying to add unsupported dtype ",
+                                       out_tensor.dtype());
+    }
+  }
+  return Status::OK();
+}
+
+template <typename Device>
+Status TensorListZerosLike(OpKernelContext* c, const TensorList& x,
+                           TensorList* y) {
+  y->element_dtype = x.element_dtype;
+  y->element_shape = x.element_shape;
+  y->tensors.reserve(x.tensors.size());
+  for (const Tensor& t : x.tensors) {
+    Tensor out_tensor;
+    TF_RETURN_IF_ERROR(c->allocate_temp(t.dtype(), t.shape(), &out_tensor));
+    switch (out_tensor.dtype()) {
+#define DTYPE_CASE(dtype)                                        \
+  case DataTypeToEnum<dtype>::value:                             \
+    out_tensor.flat<dtype>().device(c->eigen_device<Device>()) = \
+        out_tensor.flat<dtype>().constant(dtype(0));             \
+    break;
+
+      TF_CALL_NUMBER_TYPES(DTYPE_CASE)
+
+#undef DTYPE_CASE
+      default:
+        return errors::InvalidArgument(
+            "Trying to compute zeros_like for unsupported dtype",
+            out_tensor.dtype());
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_LIST_KERNELS_H_
diff --git a/tensorflow/core/kernels/lmdb_reader_op.cc b/tensorflow/core/kernels/lmdb_reader_op.cc
index 31a427f2c90ad8a321d6004bf7ef85772d8e951f..2474fe4d564b37a7de36a85a6af3820e2bc4ac65 100755
--- a/tensorflow/core/kernels/lmdb_reader_op.cc
+++ b/tensorflow/core/kernels/lmdb_reader_op.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/framework/reader_op_kernel.h"
 #include "tensorflow/core/framework/reader_base.h"
+#include "tensorflow/core/framework/reader_op_kernel.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 #include <sys/stat.h>
@@ -26,9 +26,8 @@ namespace tensorflow {
 
 class LMDBReader : public ReaderBase {
  public:
-  LMDBReader(const string& node_name, Env* env)
+  LMDBReader(const string& node_name, Env* /*unused*/)
       : ReaderBase(strings::StrCat("LMDBReader '", node_name, "'")),
-        env_(env),
         mdb_env_(nullptr),
         mdb_dbi_(0),
         mdb_txn_(nullptr),
@@ -77,15 +76,13 @@ class LMDBReader : public ReaderBase {
         *at_end = true;
         return Status::OK();
       }
-    }
-    else {
+    } else {
       if (Seek(MDB_NEXT) == false) {
         *at_end = true;
         return Status::OK();
       }
     }
-    *key = string(static_cast<const char*>(mdb_key_.mv_data),
-                  mdb_key_.mv_size);
+    *key = string(static_cast<const char*>(mdb_key_.mv_data), mdb_key_.mv_size);
     *value = string(static_cast<const char*>(mdb_value_.mv_data),
                     mdb_value_.mv_size);
     *produced = true;
@@ -109,7 +106,6 @@ class LMDBReader : public ReaderBase {
     }
   }
 
-  Env* const env_;
   MDB_env* mdb_env_;
   MDB_dbi mdb_dbi_;
 
@@ -123,13 +119,10 @@ class LMDBReaderOp : public ReaderOpKernel {
   explicit LMDBReaderOp(OpKernelConstruction* context)
       : ReaderOpKernel(context) {
     Env* env = context->env();
-    SetReaderFactory([this, env]() {
-      return new LMDBReader(name(), env);
-    });
+    SetReaderFactory([this, env]() { return new LMDBReader(name(), env); });
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("LMDBReader").Device(DEVICE_CPU),
-                        LMDBReaderOp);
+REGISTER_KERNEL_BUILDER(Name("LMDBReader").Device(DEVICE_CPU), LMDBReaderOp);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/logging_ops.cc b/tensorflow/core/kernels/logging_ops.cc
index 67d603dd0ae9851d1135e0d031efc16ca612f680..bacf3e77408a12a8a95bf7e7ab8f3a580e675675 100644
--- a/tensorflow/core/kernels/logging_ops.cc
+++ b/tensorflow/core/kernels/logging_ops.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <iostream>
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -76,7 +77,7 @@ class PrintOp : public OpKernel {
       strings::StrAppend(&msg, "[", ctx->input(i).SummarizeValue(summarize_),
                          "]");
     }
-    LOG(INFO) << msg;
+    std::cerr << msg << std::endl;
   }
 
  private:
diff --git a/tensorflow/core/kernels/logistic-loss.h b/tensorflow/core/kernels/logistic-loss.h
index 2765f42bbdc2d3bf3b9ec42f9f225166218fa9d0..6479e6f5dc3795451babd5675f1decc05b670251 100644
--- a/tensorflow/core/kernels/logistic-loss.h
+++ b/tensorflow/core/kernels/logistic-loss.h
@@ -122,10 +122,9 @@ class LogisticLossUpdater : public DualLossUpdater {
                              num_loss_partitions * weighted_example_norm *
                                  example_weight *
                                  (0.5 * (1 + tanhx) / label - current_dual);
-    const double denominator = -2 * label -
-                               num_loss_partitions * weighted_example_norm *
-                                   example_weight * (1 - tanhx * tanhx) * 0.5 /
-                                   label;
+    const double denominator =
+        -2 * label - num_loss_partitions * weighted_example_norm *
+                         example_weight * (1 - tanhx * tanhx) * 0.5 / label;
     return x - numerator / denominator;
   }
 };
diff --git a/tensorflow/core/kernels/lookup_table_init_op.cc b/tensorflow/core/kernels/lookup_table_init_op.cc
index 38adcada6d21bf78aee4c080bfa7fb68efeb1bf8..b352dd257ce9e60edc35ae6c142207d6f19495f7 100644
--- a/tensorflow/core/kernels/lookup_table_init_op.cc
+++ b/tensorflow/core/kernels/lookup_table_init_op.cc
@@ -82,8 +82,8 @@ class InitializeTableOp : public OpKernel {
     }
     OP_REQUIRES_OK(ctx, table->Initialize(iter));
     if (ctx->track_allocations()) {
-      ctx->record_host_persistent_memory_allocation(table->MemoryUsed() -
-                                                    memory_used_before);
+      ctx->record_persistent_memory_allocation(table->MemoryUsed() -
+                                               memory_used_before);
     }
   }
 
@@ -144,8 +144,8 @@ class InitializeTableFromTextFileOp : public OpKernel {
                             vocab_filename, vocab_size_, delimiter_, key_index_,
                             value_index_, ctx->env(), table));
     if (ctx->track_allocations()) {
-      ctx->record_host_persistent_memory_allocation(table->MemoryUsed() -
-                                                    memory_used_before);
+      ctx->record_persistent_memory_allocation(table->MemoryUsed() -
+                                               memory_used_before);
     }
   }
 
diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc
index e774c771b8e28c1a3c19cfafb6e7597c81e4eb5c..e3872fee0edcae543b9193e0dcf6850d194ef067 100644
--- a/tensorflow/core/kernels/lookup_table_op.cc
+++ b/tensorflow/core/kernels/lookup_table_op.cc
@@ -709,8 +709,8 @@ class LookupTableInsertOp : public OpKernel {
     }
     OP_REQUIRES_OK(ctx, table->Insert(ctx, keys, values));
     if (ctx->track_allocations()) {
-      ctx->record_host_persistent_memory_allocation(table->MemoryUsed() -
-                                                    memory_used_before);
+      ctx->record_persistent_memory_allocation(table->MemoryUsed() -
+                                               memory_used_before);
     }
   }
 };
@@ -786,8 +786,8 @@ class LookupTableImportOp : public OpKernel {
     }
     OP_REQUIRES_OK(ctx, table->ImportValues(ctx, keys, values));
     if (ctx->track_allocations()) {
-      ctx->record_host_persistent_memory_allocation(table->MemoryUsed() -
-                                                    memory_used_before);
+      ctx->record_persistent_memory_allocation(table->MemoryUsed() -
+                                               memory_used_before);
     }
   }
 };
@@ -823,6 +823,7 @@ REGISTER_KERNEL(int64, int64);
 REGISTER_KERNEL(int64, float);
 REGISTER_KERNEL(string, string);
 REGISTER_KERNEL(string, bool);
+REGISTER_KERNEL(int32, int32);
 
 #undef REGISTER_KERNEL
 
diff --git a/tensorflow/core/kernels/lookup_table_op.h b/tensorflow/core/kernels/lookup_table_op.h
index ff23a09a24f3c291aaec546577ead757e3eaa422..5ba9b936e4ea309ceda645f63e9630f01a99c985 100644
--- a/tensorflow/core/kernels/lookup_table_op.h
+++ b/tensorflow/core/kernels/lookup_table_op.h
@@ -64,7 +64,7 @@ class LookupTableOp : public OpKernel {
         return ctx->status();
       }
       if (ctx->track_allocations()) {
-        ctx->record_host_persistent_memory_allocation(
+        ctx->record_persistent_memory_allocation(
             container->MemoryUsed() + table_handle_.AllocatedBytes());
       }
       *ret = container;
diff --git a/tensorflow/core/kernels/lookup_util.cc b/tensorflow/core/kernels/lookup_util.cc
index e87a72f210c1d9476b2b5a68a94d2751ebddafc9..c7ce1c3747ea9f329f96d62af27708b0c9f4eb68 100644
--- a/tensorflow/core/kernels/lookup_util.cc
+++ b/tensorflow/core/kernels/lookup_util.cc
@@ -359,8 +359,8 @@ Status InitializeTableFromTextFile(const string& filename, int64 vocab_size,
   // time.
   Status s = table->Initialize(iter);
   if (errors::IsFailedPrecondition(s) && table->is_initialized()) {
-    LOG(WARNING) << "Table trying to initialize from file " << filename
-                 << " is already initialized.";
+    LOG(INFO) << "Table trying to initialize from file " << filename
+              << " is already initialized.";
     return Status::OK();
   }
   return s;
diff --git a/tensorflow/core/kernels/loss_test.cc b/tensorflow/core/kernels/loss_test.cc
index 89f0677e1f5a7a0301c2d85700ee9954869c50bb..460d65c5c270c43aae4cb8b26b5258c7d4dd9a5f 100644
--- a/tensorflow/core/kernels/loss_test.cc
+++ b/tensorflow/core/kernels/loss_test.cc
@@ -32,14 +32,17 @@ namespace {
 
 TEST(LogisticLoss, ComputePrimalLoss) {
   LogisticLossUpdater loss_updater;
-  EXPECT_NEAR(0.693147, loss_updater.ComputePrimalLoss(
-                            0 /* wx */, 1 /* label */, 1 /* example weight */),
+  EXPECT_NEAR(0.693147,
+              loss_updater.ComputePrimalLoss(0 /* wx */, 1 /* label */,
+                                             1 /* example weight */),
               1e-3);
-  EXPECT_NEAR(0.0, loss_updater.ComputePrimalLoss(70 /* wx */, 1 /* label */,
-                                                  1 /* example weight */),
+  EXPECT_NEAR(0.0,
+              loss_updater.ComputePrimalLoss(70 /* wx */, 1 /* label */,
+                                             1 /* example weight */),
               1e-3);
-  EXPECT_NEAR(0.0, loss_updater.ComputePrimalLoss(-70 /* wx */, -1 /* label */,
-                                                  1 /* example weight */),
+  EXPECT_NEAR(0.0,
+              loss_updater.ComputePrimalLoss(-70 /* wx */, -1 /* label */,
+                                             1 /* example weight */),
               1e-3);
 }
 
@@ -53,31 +56,35 @@ TEST(LogisticLoss, ComputeDualLoss) {
               loss_updater.ComputeDualLoss(1 /* current dual */, 1 /* label */,
                                            1 /* example weight */),
               1e-3);
-  EXPECT_NEAR(-0.693147, loss_updater.ComputeDualLoss(0.5 /* current dual */,
-                                                      1 /* label */,
-                                                      1 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      -0.693147,
+      loss_updater.ComputeDualLoss(0.5 /* current dual */, 1 /* label */,
+                                   1 /* example weight */),
+      1e-3);
 }
 
 TEST(LogisticLoss, ComputeUpdatedDual) {
   LogisticLossUpdater loss_updater;
-  EXPECT_NEAR(0.479, loss_updater.ComputeUpdatedDual(
-                         1 /* num partitions */, 1.0 /* label */,
-                         1.0 /* example weight */, 0.5 /* current_dual */,
-                         0.3 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(0.479,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, 1.0 /* label */,
+                  1.0 /* example weight */, 0.5 /* current_dual */,
+                  0.3 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 
-  EXPECT_NEAR(-0.031, loss_updater.ComputeUpdatedDual(
-                          2 /* num partitions */, -1.0 /* label */,
-                          1.0 /* example weight */, 0.1 /* current_dual */,
-                          -0.8 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(-0.031,
+              loss_updater.ComputeUpdatedDual(
+                  2 /* num partitions */, -1.0 /* label */,
+                  1.0 /* example weight */, 0.1 /* current_dual */,
+                  -0.8 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 }
 
 TEST(SquaredLoss, ComputePrimalLoss) {
   SquaredLossUpdater loss_updater;
-  EXPECT_NEAR(0.5, loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
-                                                  1.0 /* example weight */),
+  EXPECT_NEAR(0.5,
+              loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
+                                             1.0 /* example weight */),
               1e-3);
   EXPECT_NEAR(40.5,
               loss_updater.ComputePrimalLoss(10.0 /* wx */, 1.0 /* label */,
@@ -95,43 +102,50 @@ TEST(SquaredLoss, ComputePrimalLoss) {
 
 TEST(SquaredLoss, ComputeDualLoss) {
   SquaredLossUpdater loss_updater;
-  EXPECT_NEAR(0.0, loss_updater.ComputeDualLoss(0.0 /* current dual */,
-                                                -1.0 /* label */,
-                                                1.0 /* example weight */),
-              1e-3);
-  EXPECT_NEAR(0.66, loss_updater.ComputeDualLoss(0.2 /* current dual */,
-                                                 -1.0 /* label */,
-                                                 3.0 /* example weight */),
-              1e-3);
-  EXPECT_NEAR(-0.375, loss_updater.ComputeDualLoss(1.5 /* current dual */,
-                                                   1.0 /* label */,
-                                                   1.0 /* example weight */),
-              1e-3);
-  EXPECT_NEAR(-1.125, loss_updater.ComputeDualLoss(0.5 /* current dual */,
-                                                   1.0 /* label */,
-                                                   3.0 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      0.0,
+      loss_updater.ComputeDualLoss(0.0 /* current dual */, -1.0 /* label */,
+                                   1.0 /* example weight */),
+      1e-3);
+  EXPECT_NEAR(
+      0.66,
+      loss_updater.ComputeDualLoss(0.2 /* current dual */, -1.0 /* label */,
+                                   3.0 /* example weight */),
+      1e-3);
+  EXPECT_NEAR(
+      -0.375,
+      loss_updater.ComputeDualLoss(1.5 /* current dual */, 1.0 /* label */,
+                                   1.0 /* example weight */),
+      1e-3);
+  EXPECT_NEAR(
+      -1.125,
+      loss_updater.ComputeDualLoss(0.5 /* current dual */, 1.0 /* label */,
+                                   3.0 /* example weight */),
+      1e-3);
 }
 
 TEST(SquaredLoss, ComputeUpdatedDual) {
   SquaredLossUpdater loss_updater;
-  EXPECT_NEAR(0.336, loss_updater.ComputeUpdatedDual(
-                         1 /* num partitions */, 1.0 /* label */,
-                         1.0 /* example weight */, 0.3 /* current_dual */,
-                         0.3 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(0.336,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, 1.0 /* label */,
+                  1.0 /* example weight */, 0.3 /* current_dual */,
+                  0.3 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 
-  EXPECT_NEAR(-0.427, loss_updater.ComputeUpdatedDual(
-                          5 /* num partitions */, -1.0 /* label */,
-                          1.0 /* example weight */, -0.4 /* current_dual */,
-                          0.8 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(-0.427,
+              loss_updater.ComputeUpdatedDual(
+                  5 /* num partitions */, -1.0 /* label */,
+                  1.0 /* example weight */, -0.4 /* current_dual */,
+                  0.8 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 }
 
 TEST(HingeLoss, ComputePrimalLoss) {
   HingeLossUpdater loss_updater;
-  EXPECT_NEAR(1.0, loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
-                                                  1.0 /* example weight */),
+  EXPECT_NEAR(1.0,
+              loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
+                                             1.0 /* example weight */),
               1e-3);
   EXPECT_NEAR(0.0,
               loss_updater.ComputePrimalLoss(10.0 /* wx */, 1.0 /* label */,
@@ -149,10 +163,11 @@ TEST(HingeLoss, ComputePrimalLoss) {
 
 TEST(HingeLoss, ComputeDualLoss) {
   HingeLossUpdater loss_updater;
-  EXPECT_NEAR(0.0, loss_updater.ComputeDualLoss(0.0 /* current dual */,
-                                                -1.0 /* label */,
-                                                1.0 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      0.0,
+      loss_updater.ComputeDualLoss(0.0 /* current dual */, -1.0 /* label */,
+                                   1.0 /* example weight */),
+      1e-3);
   EXPECT_NEAR(
       std::numeric_limits<double>::max(),
       loss_updater.ComputeDualLoss(0.2 /* current dual */, -1.0 /* label */,
@@ -163,10 +178,11 @@ TEST(HingeLoss, ComputeDualLoss) {
       loss_updater.ComputeDualLoss(1.5 /* current dual */, 1.0 /* label */,
                                    1.0 /* example weight */),
       1e-3);
-  EXPECT_NEAR(-1.5, loss_updater.ComputeDualLoss(0.5 /* current dual */,
-                                                 1.0 /* label */,
-                                                 3.0 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      -1.5,
+      loss_updater.ComputeDualLoss(0.5 /* current dual */, 1.0 /* label */,
+                                   3.0 /* example weight */),
+      1e-3);
 }
 
 TEST(HingeLoss, ConvertLabel) {
@@ -195,28 +211,31 @@ TEST(HingeLoss, ComputeUpdatedDual) {
   // weighted_example_norm=100.0, it turns out that the optimal value to update
   // the dual to is 0.507 which is within the permitted range and thus should be
   // the value returned.
-  EXPECT_NEAR(0.507, loss_updater.ComputeUpdatedDual(
-                         1 /* num partitions */, 1.0 /* label */,
-                         1.0 /* example weight */, 0.5 /* current_dual */,
-                         0.3 /* wx */, 100.0 /* weighted_example_norm */),
+  EXPECT_NEAR(0.507,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, 1.0 /* label */,
+                  1.0 /* example weight */, 0.5 /* current_dual */,
+                  0.3 /* wx */, 100.0 /* weighted_example_norm */),
               1e-3);
   // When label=-1.0, example_weight=1.0, current_dual=0.4, wx=0.6,
   // weighted_example_norm=10.0 and num_loss_partitions=10, it turns out that
   // the optimal value to update the dual to is 0.384 which is within the
   // permitted range and thus should be the value returned.
-  EXPECT_NEAR(-0.416, loss_updater.ComputeUpdatedDual(
-                          10 /* num partitions */, -1.0 /* label */,
-                          1.0 /* example weight */, -0.4 /* current_dual */,
-                          0.6 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(-0.416,
+              loss_updater.ComputeUpdatedDual(
+                  10 /* num partitions */, -1.0 /* label */,
+                  1.0 /* example weight */, -0.4 /* current_dual */,
+                  0.6 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
   // When label=1.0, example_weight=1.0, current_dual=-0.5, wx=0.3 and
   // weighted_example_norm=10.0, it turns out that the optimal value to update
   // the dual to is -0.43. However, this is outside the allowed [0.0, 1.0] range
   // and hence the closest permitted value (0.0) should be returned instead.
-  EXPECT_NEAR(0.0, loss_updater.ComputeUpdatedDual(
-                       1 /* num partitions */, 1.0 /* label */,
-                       1.0 /* example weight */, -0.5 /* current_dual */,
-                       0.3 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(0.0,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, 1.0 /* label */,
+                  1.0 /* example weight */, -0.5 /* current_dual */,
+                  0.3 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 
   // When label=-1.0, example_weight=2.0, current_dual=-1.0, wx=0.3 and
@@ -224,17 +243,19 @@ TEST(HingeLoss, ComputeUpdatedDual) {
   // the dual to is -1.065. However, this is outside the allowed [-1.0, 0.0]
   // range and hence the closest permitted value (-1.0) should be returned
   // instead.
-  EXPECT_NEAR(-1.0, loss_updater.ComputeUpdatedDual(
-                        1 /* num partitions */, -1.0 /* label */,
-                        2.0 /* example weight */, -1.0 /* current_dual */,
-                        0.3 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(-1.0,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, -1.0 /* label */,
+                  2.0 /* example weight */, -1.0 /* current_dual */,
+                  0.3 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 }
 
 TEST(SmoothHingeLoss, ComputePrimalLoss) {
   SmoothHingeLossUpdater loss_updater;
-  EXPECT_NEAR(0.5, loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
-                                                  1.0 /* example weight */),
+  EXPECT_NEAR(0.5,
+              loss_updater.ComputePrimalLoss(0.0 /* wx */, 1.0 /* label */,
+                                             1.0 /* example weight */),
               1e-3);
   EXPECT_NEAR(0.0,
               loss_updater.ComputePrimalLoss(10.0 /* wx */, 1.0 /* label */,
@@ -252,10 +273,11 @@ TEST(SmoothHingeLoss, ComputePrimalLoss) {
 
 TEST(SmoothHingeLoss, ComputeDualLoss) {
   SmoothHingeLossUpdater loss_updater;
-  EXPECT_NEAR(0.0, loss_updater.ComputeDualLoss(0.0 /* current dual */,
-                                                -1.0 /* label */,
-                                                1.0 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      0.0,
+      loss_updater.ComputeDualLoss(0.0 /* current dual */, -1.0 /* label */,
+                                   1.0 /* example weight */),
+      1e-3);
   EXPECT_NEAR(
       std::numeric_limits<double>::max(),
       loss_updater.ComputeDualLoss(0.2 /* current dual */, -1.0 /* label */,
@@ -266,24 +288,27 @@ TEST(SmoothHingeLoss, ComputeDualLoss) {
       loss_updater.ComputeDualLoss(1.5 /* current dual */, 1.0 /* label */,
                                    1.0 /* example weight */),
       1e-3);
-  EXPECT_NEAR(-1.125, loss_updater.ComputeDualLoss(0.5 /* current dual */,
-                                                   1.0 /* label */,
-                                                   3.0 /* example weight */),
-              1e-3);
+  EXPECT_NEAR(
+      -1.125,
+      loss_updater.ComputeDualLoss(0.5 /* current dual */, 1.0 /* label */,
+                                   3.0 /* example weight */),
+      1e-3);
 }
 
 TEST(SmoothHingeLoss, ComputeUpdatedDual) {
   SmoothHingeLossUpdater loss_updater;
-  EXPECT_NEAR(0.336, loss_updater.ComputeUpdatedDual(
-                         1 /* num partitions */, 1.0 /* label */,
-                         1.0 /* example weight */, 0.3 /* current_dual */,
-                         0.3 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(0.336,
+              loss_updater.ComputeUpdatedDual(
+                  1 /* num partitions */, 1.0 /* label */,
+                  1.0 /* example weight */, 0.3 /* current_dual */,
+                  0.3 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 
-  EXPECT_NEAR(-0.427, loss_updater.ComputeUpdatedDual(
-                          5 /* num partitions */, -1.0 /* label */,
-                          1.0 /* example weight */, -0.4 /* current_dual */,
-                          0.8 /* wx */, 10.0 /* weighted_example_norm */),
+  EXPECT_NEAR(-0.427,
+              loss_updater.ComputeUpdatedDual(
+                  5 /* num partitions */, -1.0 /* label */,
+                  1.0 /* example weight */, -0.4 /* current_dual */,
+                  0.8 /* wx */, 10.0 /* weighted_example_norm */),
               1e-3);
 }
 
diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc
index c905ebc84a6e9251a5e30be19b086d3fae215cad..c3a59c95762ad03f217768a9b14e31d6f501d789 100644
--- a/tensorflow/core/kernels/lrn_op.cc
+++ b/tensorflow/core/kernels/lrn_op.cc
@@ -229,10 +229,11 @@ class LRNOp : public OpKernel {
   explicit LRNOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<int>(depth_radius64);
     float tmp;
     OP_REQUIRES_OK(context, context->GetAttr("bias", &tmp));
@@ -247,9 +248,10 @@ class LRNOp : public OpKernel {
     const Tensor& in = context->input(0);
     OP_REQUIRES(context, in.dims() == 4,
                 errors::InvalidArgument("in must be 4-dimensional"));
-    OP_REQUIRES(context, FastBoundsCheck(in.NumElements(),
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("argument to LRN too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(in.NumElements(), std::numeric_limits<int>::max()),
+        errors::InvalidArgument("argument to LRN too large"));
     // Cast to platform-specific int to avoid conversion warnings.
     const int batch = static_cast<int>(in.dim_size(0));
     const int rows = static_cast<int>(in.dim_size(1));
@@ -448,10 +450,11 @@ class LRNGradOp : public OpKernel {
   explicit LRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<int>(depth_radius64);
     float tmp;
     OP_REQUIRES_OK(context, context->GetAttr("bias", &tmp));
diff --git a/tensorflow/core/kernels/matching_files_op.cc b/tensorflow/core/kernels/matching_files_op.cc
index 5eb060f6641d1565417dd074a95bf72e2a81e472..cdff7bad5fe222b6f0824a742caa0a4e5d939f71 100644
--- a/tensorflow/core/kernels/matching_files_op.cc
+++ b/tensorflow/core/kernels/matching_files_op.cc
@@ -45,15 +45,14 @@ class MatchingFilesOp : public OpKernel {
     int num_files = 0;
     std::vector<std::vector<string>> all_fnames(num_patterns);
     for (int i = 0; i < num_patterns; i++) {
-      OP_REQUIRES_OK(
-          context,
-          context->env()->GetMatchingPaths(patterns(i), &all_fnames[i]));
+      OP_REQUIRES_OK(context, context->env()->GetMatchingPaths(patterns(i),
+                                                               &all_fnames[i]));
       num_files += all_fnames[i].size();
     }
     Tensor* output_t = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(
-                       "filenames", TensorShape({num_files}), &output_t));
+    OP_REQUIRES_OK(
+        context, context->allocate_output("filenames", TensorShape({num_files}),
+                                          &output_t));
     auto output = output_t->vec<string>();
     int index = 0;
     for (int i = 0; i < num_patterns; ++i) {
diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc
index 12d02a10c7a2b439475c1840b4e777bdd6809856..f499ce6519d097c7fea05e8175d08d102880f7fd 100644
--- a/tensorflow/core/kernels/matmul_op.cc
+++ b/tensorflow/core/kernels/matmul_op.cc
@@ -261,12 +261,12 @@ struct LaunchMatMul<GPUDevice, T, true /* USE_CUBLAS */> {
       std::vector<int64>* algorithms, bool use_autotune, Tensor* out) {
     using perftools::gputools::blas::AlgorithmConfig;
     using perftools::gputools::blas::ComputationType;
-    using perftools::gputools::blas::ProfileResult;
-    using perftools::gputools::blas::Transpose;
     using perftools::gputools::blas::kDefaultAlgorithm;
     using perftools::gputools::blas::kDefaultBlasGemm;
     using perftools::gputools::blas::kDefaultBlasGemv;
     using perftools::gputools::blas::kNoAlgorithm;
+    using perftools::gputools::blas::ProfileResult;
+    using perftools::gputools::blas::Transpose;
     Transpose trans[] = {Transpose::kNoTranspose, Transpose::kTranspose};
     const uint64 m = a.dim_size(1 - dim_pair[0].first);
     const uint64 k = a.dim_size(dim_pair[0].first);
@@ -535,13 +535,16 @@ struct MatMulFunctor<SYCLDevice, T> {
 
 }  // end namespace functor
 
-#define REGISTER_CPU(T)                                                        \
-  REGISTER_KERNEL_BUILDER(                                                     \
-      Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T"),                \
-      MatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>);            \
+#define REGISTER_CPU_EIGEN(T)                                                  \
   REGISTER_KERNEL_BUILDER(                                                     \
       Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T").Label("eigen"), \
-      MatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>)
+      MatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>);
+
+#define REGISTER_CPU(T)                                             \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T"),     \
+      MatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>); \
+  REGISTER_CPU_EIGEN(T);
 
 #define REGISTER_GPU(T)                                            \
   REGISTER_KERNEL_BUILDER(                                         \
@@ -556,9 +559,14 @@ struct MatMulFunctor<SYCLDevice, T> {
 #if defined(INTEL_MKL)
 // MKL does not support half and int32 types for matrix-multiplication, so
 // register the kernel to use default Eigen based implementations for these
-// types
+// types. Registration for NO-LABEL version is in mkl_matmul_op.cc
+TF_CALL_float(REGISTER_CPU_EIGEN);
+TF_CALL_double(REGISTER_CPU_EIGEN);
 TF_CALL_half(REGISTER_CPU);
+
 TF_CALL_int32(REGISTER_CPU);
+TF_CALL_complex64(REGISTER_CPU_EIGEN);
+TF_CALL_complex128(REGISTER_CPU_EIGEN);
 #else
 TF_CALL_float(REGISTER_CPU);
 TF_CALL_double(REGISTER_CPU);
diff --git a/tensorflow/core/kernels/matmul_op.h b/tensorflow/core/kernels/matmul_op.h
index 6398da2fb959b0bded9afad8c92be923e44c755c..628895ca86f9c86c5bda987dcade9a4a7af753d8 100644
--- a/tensorflow/core/kernels/matmul_op.h
+++ b/tensorflow/core/kernels/matmul_op.h
@@ -30,7 +30,8 @@ struct MatMulTypes {
   typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned>
       out_type;
   typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor>,
-                           Eigen::Aligned> in_type;
+                           Eigen::Aligned>
+      in_type;
 };
 
 template <typename Device, typename In0, typename In1, typename Out,
diff --git a/tensorflow/core/kernels/matrix_band_part_op.cc b/tensorflow/core/kernels/matrix_band_part_op.cc
index d7fff4bb0c2b03bdfa2845f3ff89d938e07466e1..1439141f6493943c94516e6f0f9c05e8314401d5 100644
--- a/tensorflow/core/kernels/matrix_band_part_op.cc
+++ b/tensorflow/core/kernels/matrix_band_part_op.cc
@@ -62,7 +62,15 @@ class MatrixBandPartOp : public OpKernel {
     OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_lower_in.shape()),
                 errors::InvalidArgument("num_lower must be scalar, got shape ",
                                         num_lower_in.shape().DebugString()));
-    const int64 num_lower = num_lower_in.scalar<int64>()();
+
+    auto as_int64_scalar = [](const Tensor& tensor) -> int64 {
+      if (tensor.dtype() == DT_INT32) {
+        return tensor.scalar<int32>()();
+      } else {
+        return tensor.scalar<int64>()();
+      }
+    };
+    const int64 num_lower = as_int64_scalar(num_lower_in);
     OP_REQUIRES(
         context, num_lower <= input_reshaped.dimension(1),
         errors::InvalidArgument(
@@ -73,7 +81,7 @@ class MatrixBandPartOp : public OpKernel {
     OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_upper_in.shape()),
                 errors::InvalidArgument("num_upper must be scalar, got shape ",
                                         num_upper_in.shape().DebugString()));
-    const int64 num_upper = num_upper_in.scalar<int64>()();
+    const int64 num_upper = as_int64_scalar(num_upper_in);
     OP_REQUIRES(context, num_upper <= input_reshaped.dimension(2),
                 errors::InvalidArgument("num_upper must be negative or less or "
                                         "equal to number of columns (",
diff --git a/tensorflow/core/kernels/matrix_exponential_op.cc b/tensorflow/core/kernels/matrix_exponential_op.cc
index 4cc3f32f7e4a727fa2d9ec3c21a3750111f46392..99db898301378f7ad55f75b3a403a09a5f59bb3b 100644
--- a/tensorflow/core/kernels/matrix_exponential_op.cc
+++ b/tensorflow/core/kernels/matrix_exponential_op.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
-
 namespace tensorflow {
 
 template <class Scalar>
@@ -40,7 +39,8 @@ class MatrixExponentialOp : public LinearAlgebraOp<Scalar> {
                      MatrixMaps* outputs) final {
     const ConstMatrixMap& input = inputs[0];
     if (input.rows() == 0) return;
-    using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    using Matrix =
+        Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
     Matrix tmp = input;
     outputs->at(0) = tmp.exp();
   }
@@ -51,9 +51,9 @@ class MatrixExponentialOp : public LinearAlgebraOp<Scalar> {
 
 REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<float>), float);
 REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<double>), double);
-REGISTER_LINALG_OP("MatrixExponential",
-                   (MatrixExponentialOp<complex64>), complex64);
-REGISTER_LINALG_OP("MatrixExponential",
-                   (MatrixExponentialOp<complex128>), complex128);
+REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<complex64>),
+                   complex64);
+REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<complex128>),
+                   complex128);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/matrix_inverse_op.cc b/tensorflow/core/kernels/matrix_inverse_op.cc
index c61a091c7b7b82dbcb6e7b7f016e9cd2361f3f51..52afdd15ba6c2e25f3d03973c1226404a6723f87 100644
--- a/tensorflow/core/kernels/matrix_inverse_op.cc
+++ b/tensorflow/core/kernels/matrix_inverse_op.cc
@@ -210,7 +210,7 @@ class MatrixInverseOpGpu : public AsyncOpKernel {
             done);
       }
     } else {
-      // For large matrices, we wompute the inverse of each matrix in the batch
+      // For large matrices, we compute the inverse of each matrix in the batch
       // sequentially. Here we use the cuSolver methods GETRF/GETRS because they
       // are MUCH faster than their batched cuBlas equivalents for large
       // matrices.
diff --git a/tensorflow/core/kernels/matrix_logarithm_op.cc b/tensorflow/core/kernels/matrix_logarithm_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..22ca094e2432723a49afab8a255339fc8ac2512e
--- /dev/null
+++ b/tensorflow/core/kernels/matrix_logarithm_op.cc
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/linalg_ops.cc.
+
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/MatrixFunctions"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/linalg_ops_common.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+template <class Scalar>
+class MatrixLogarithmOp : public LinearAlgebraOp<Scalar> {
+ public:
+  INHERIT_LINALG_TYPEDEFS(Scalar);
+
+  explicit MatrixLogarithmOp(OpKernelConstruction* context) : Base(context) {}
+
+  void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs,
+                     MatrixMaps* outputs) final {
+    const ConstMatrixMap& input = inputs[0];
+    if (input.rows() == 0) return;
+    using Matrix =
+        Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    Matrix tmp = input;
+    outputs->at(0) = tmp.log();
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(MatrixLogarithmOp);
+};
+
+// For real-valued matrices, this Op would return the real part of the matrix
+// logarithm. If all eigenvalues are positive, then this returns the correct
+// logarithm, however checking for positive definiteness adds significant
+// overhead. Therefore at present we only register this Op for complex types.
+REGISTER_LINALG_OP("MatrixLogarithm", (MatrixLogarithmOp<complex64>),
+                   complex64);
+REGISTER_LINALG_OP("MatrixLogarithm", (MatrixLogarithmOp<complex128>),
+                   complex128);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/matrix_set_diag_op.cc b/tensorflow/core/kernels/matrix_set_diag_op.cc
index 9dd665392bc33e1559d46d0e7be2277e8c22a20a..502d593474e06cc495854706a1d4d90014ea8f96 100644
--- a/tensorflow/core/kernels/matrix_set_diag_op.cc
+++ b/tensorflow/core/kernels/matrix_set_diag_op.cc
@@ -69,8 +69,8 @@ class MatrixSetDiagOp : public OpKernel {
                 errors::InvalidArgument(
                     "must have diagonal.shape == input.shape[:-2] + "
                     "min(input.shape[-2:]), but received input shape: ",
-                    input_shape.DebugString(), " and diagonal shape: ",
-                    diag_shape.DebugString()));
+                    input_shape.DebugString(),
+                    " and diagonal shape: ", diag_shape.DebugString()));
 
     if (input.NumElements() == 0) {
       // This is a no-op.
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index 2eefadad4949fd8d78f6a27533ce0385c38d9c69..9be7408012bb81e80c73c29a6ee9bb6763c04490 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/maxpooling_op.h"
 
 #include <vector>
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -37,7 +38,6 @@ limitations under the License.
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
 #include "tensorflow/core/util/use_cudnn.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 #if GOOGLE_CUDA
 #include "tensorflow/core/kernels/maxpooling_op_gpu.h"
@@ -89,7 +89,6 @@ static void SpatialMaxPoolWithArgMaxHelper(
   //    max value.
   auto shard = [&params, &in_mat, &out_mat, &out_arg_max_mat, &input_backprop,
                 &output_arg_max, &out_backprop](int64 start, int64 limit) {
-
     const int32 depth = params.depth;
     const int32 in_rows = params.tensor_in_rows;
     const int32 in_cols = params.tensor_in_cols;
@@ -180,7 +179,6 @@ static void SpatialMaxPoolWithArgMaxHelper(
         input_backprop_flat(input_backprop_index) += out_backprop_flat(index);
       }
     }
-
   };
 
   const int64 shard_cost = params.tensor_in_rows * params.tensor_in_cols *
@@ -567,7 +565,7 @@ class MaxPoolingGradGradOp : public OpKernel {
     //    tensor_out_as_matrix with the corresponding values in
     //    top_diff_as_matrix.
     auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
-        int64 start, int64 limit) {
+                     int64 start, int64 limit) {
       const int32 depth = params.depth;
       const int32 in_rows = params.tensor_in_rows;
       const int32 in_cols = params.tensor_in_cols;
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
index f8daaca4c94aada5dbae5e5582f0da075b7222d5..0c7a236b2ff0f0b5c6287d1dffb1e8ef9bac7cc0 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
@@ -450,10 +450,10 @@ bool MaxPoolBackwardWithArgmax<T>::operator()(
     T* bottom_diff, const Eigen::GpuDevice& d) {
   const int kThreadsPerBlock = 1024;
   SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
-    kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
+            kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
   MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
                     kThreadsPerBlock, 0, d.stream()>>>(
-                                        output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff);
+      output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff);
   return d.ok();
 }
 
diff --git a/tensorflow/core/kernels/meta_support.cc b/tensorflow/core/kernels/meta_support.cc
index 9fed01189fc3bfde4ad1e23ea8fda0c76311b3bc..39e60c9fcef174a4f9e2271600ed847f4e769625 100644
--- a/tensorflow/core/kernels/meta_support.cc
+++ b/tensorflow/core/kernels/meta_support.cc
@@ -98,9 +98,9 @@ typedef gemmlowp::meta::SimpleContext<gemmlowp::WorkersPool> LocalContext;
 template <typename Context, typename Params>
 void MultiThreadGemm(Context* context, const Params& params) {
   if (params.m <= 4) {
-      gemmlowp::meta::MultiThreadGemm<
-          Context, gemmlowp::meta::GemmExecutorPackLHSCacheFriendly<>, Params,
-          1, 8, 8>(context, params);
+    gemmlowp::meta::MultiThreadGemm<
+        Context, gemmlowp::meta::GemmExecutorPackLHSCacheFriendly<>, Params, 1,
+        8, 8>(context, params);
   } else {
     if (params.m >= params.n) {
       gemmlowp::meta::MultiThreadGemm<
diff --git a/tensorflow/core/kernels/meta_support.h b/tensorflow/core/kernels/meta_support.h
index 53aece78e87c17cac76866a84c930f3024d38cae..97f39eb598367b83d4e74d2b0cafadec62bb4cea 100644
--- a/tensorflow/core/kernels/meta_support.h
+++ b/tensorflow/core/kernels/meta_support.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_QUANTIZATION_KERNELS_META_SUPPORT_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_QUANTIZATION_KERNELS_META_SUPPORT_H_
+#ifndef TENSORFLOW_CONTRIB_QUANTIZATION_KERNELS_META_SUPPORT_H_
+#define TENSORFLOW_CONTRIB_QUANTIZATION_KERNELS_META_SUPPORT_H_
 
 #include "meta/multi_thread_gemm.h"
 #include "meta/multi_thread_transform.h"
@@ -109,4 +109,4 @@ void Clamp(OpKernelContext* context, const quint8* input, int input_count,
 }  // namespace meta
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_QUANTIZATION_KERNELS_META_SUPPORT_H_
+#endif  // TENSORFLOW_CONTRIB_QUANTIZATION_KERNELS_META_SUPPORT_H_
diff --git a/tensorflow/core/kernels/mfcc.cc b/tensorflow/core/kernels/mfcc.cc
index 2793005aa2678b4017dc7a562b8362470e43b8ed..8c755e0df87546ab5f85c3ac5ce2d895d020de78 100644
--- a/tensorflow/core/kernels/mfcc.cc
+++ b/tensorflow/core/kernels/mfcc.cc
@@ -27,21 +27,19 @@ const double kFilterbankFloor = 1e-12;
 const int kDefaultFilterbankChannelCount = 40;
 const int kDefaultDCTCoefficientCount = 13;
 
-Mfcc::Mfcc() : initialized_(false),
-               lower_frequency_limit_(kDefaultLowerFrequencyLimit),
-               upper_frequency_limit_(kDefaultUpperFrequencyLimit),
-               filterbank_channel_count_(kDefaultFilterbankChannelCount),
-               dct_coefficient_count_(kDefaultDCTCoefficientCount) { }
+Mfcc::Mfcc()
+    : initialized_(false),
+      lower_frequency_limit_(kDefaultLowerFrequencyLimit),
+      upper_frequency_limit_(kDefaultUpperFrequencyLimit),
+      filterbank_channel_count_(kDefaultFilterbankChannelCount),
+      dct_coefficient_count_(kDefaultDCTCoefficientCount) {}
 
-bool Mfcc::Initialize(int input_length,
-                      double input_sample_rate) {
-  bool initialized = mel_filterbank_.Initialize(input_length,
-                                                input_sample_rate,
-                                                filterbank_channel_count_,
-                                                lower_frequency_limit_,
-                                                upper_frequency_limit_);
-  initialized &= dct_.Initialize(filterbank_channel_count_,
-                                 dct_coefficient_count_);
+bool Mfcc::Initialize(int input_length, double input_sample_rate) {
+  bool initialized = mel_filterbank_.Initialize(
+      input_length, input_sample_rate, filterbank_channel_count_,
+      lower_frequency_limit_, upper_frequency_limit_);
+  initialized &=
+      dct_.Initialize(filterbank_channel_count_, dct_coefficient_count_);
   initialized_ = initialized;
   return initialized;
 }
diff --git a/tensorflow/core/kernels/mfcc.h b/tensorflow/core/kernels/mfcc.h
index c39f10499091f0b5c6c74a3e70a812169b84c807..8eee76f7f0cadad45cb223ab9fbb990e4c365a44 100644
--- a/tensorflow/core/kernels/mfcc.h
+++ b/tensorflow/core/kernels/mfcc.h
@@ -15,28 +15,28 @@ limitations under the License.
 
 // Basic class for computing MFCCs from spectrogram slices.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_H_
+#ifndef TENSORFLOW_CORE_KERNELS_MFCC_H_
+#define TENSORFLOW_CORE_KERNELS_MFCC_H_
 
 #include <vector>
 
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/kernels/mfcc_dct.h"
 #include "tensorflow/core/kernels/mfcc_mel_filterbank.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/framework/op_kernel.h"
 
 namespace tensorflow {
 
 class Mfcc {
  public:
   Mfcc();
-  bool Initialize(int input_length,
-                  double input_sample_rate);
+  bool Initialize(int input_length, double input_sample_rate);
 
-  // Input is a single magnitude spectrogram frame. The input spectrum
-  // is filtered into bands using a triangular mel filterbank and a
-  // discrete cosine transform (DCT) of the values is taken. Output is
-  // populated with the lowest dct_coefficient_count of these values.
+  // Input is a single squared-magnitude spectrogram frame. The input spectrum
+  // is converted to linear magnitude and weighted into bands using a
+  // triangular mel filterbank, and a discrete cosine transform (DCT) of the
+  // values is taken. Output is populated with the lowest dct_coefficient_count
+  // of these values.
   void Compute(const std::vector<double>& spectrogram_frame,
                std::vector<double>* output) const;
 
@@ -73,4 +73,4 @@ class Mfcc {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_H_
+#endif  // TENSORFLOW_CORE_KERNELS_MFCC_H_
diff --git a/tensorflow/core/kernels/mfcc_dct.h b/tensorflow/core/kernels/mfcc_dct.h
index 4fa3c01628d7f4888e6dd2c9cb5a1ef664e42723..888b8e8df8c45067981ef7ea27ddf568035dd3ae 100644
--- a/tensorflow/core/kernels/mfcc_dct.h
+++ b/tensorflow/core/kernels/mfcc_dct.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Basic minimal DCT class for MFCC speech processing.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_DCT_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_DCT_H_
+#ifndef TENSORFLOW_CORE_KERNELS_MFCC_DCT_H_
+#define TENSORFLOW_CORE_KERNELS_MFCC_DCT_H_
 
 #include <vector>
 
@@ -41,4 +41,4 @@ class MfccDct {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_DCT_H_
+#endif  // TENSORFLOW_CORE_KERNELS_MFCC_DCT_H_
diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank.cc b/tensorflow/core/kernels/mfcc_mel_filterbank.cc
index 630de8a5a3362b77306ac76b70bbb63416d561d0..3db3b51e8b665f6e28ccb2bf8f3850785c7561fb 100644
--- a/tensorflow/core/kernels/mfcc_mel_filterbank.cc
+++ b/tensorflow/core/kernels/mfcc_mel_filterbank.cc
@@ -38,13 +38,12 @@ namespace tensorflow {
 
 MfccMelFilterbank::MfccMelFilterbank() : initialized_(false) {}
 
-bool MfccMelFilterbank::Initialize(int input_length,
-                               double input_sample_rate,
-                               int output_channel_count,
-                               double lower_frequency_limit,
-                               double upper_frequency_limit) {
+bool MfccMelFilterbank::Initialize(int input_length, double input_sample_rate,
+                                   int output_channel_count,
+                                   double lower_frequency_limit,
+                                   double upper_frequency_limit) {
   num_channels_ = output_channel_count;
-  sample_rate_  = input_sample_rate;
+  sample_rate_ = input_sample_rate;
   input_length_ = input_length;
 
   if (num_channels_ < 1) {
@@ -85,10 +84,9 @@ bool MfccMelFilterbank::Initialize(int input_length,
   }
 
   // Always exclude DC; emulate HTK.
-  const double hz_per_sbin = 0.5 * sample_rate_ /
-      static_cast<double>(input_length_ - 1);
-  start_index_ = static_cast<int>(1.5 + (lower_frequency_limit /
-                                           hz_per_sbin));
+  const double hz_per_sbin =
+      0.5 * sample_rate_ / static_cast<double>(input_length_ - 1);
+  start_index_ = static_cast<int>(1.5 + (lower_frequency_limit / hz_per_sbin));
   end_index_ = static_cast<int>(upper_frequency_limit / hz_per_sbin);
 
   // Maps the input spectrum bin indices to filter bank channels/indices. For
@@ -121,12 +119,12 @@ bool MfccMelFilterbank::Initialize(int input_length,
       weights_[i] = 0.0;
     } else {
       if (channel >= 0) {
-        weights_[i] = (center_frequencies_[channel + 1] -
-                       FreqToMel(i * hz_per_sbin)) /
+        weights_[i] =
+            (center_frequencies_[channel + 1] - FreqToMel(i * hz_per_sbin)) /
             (center_frequencies_[channel + 1] - center_frequencies_[channel]);
       } else {
         weights_[i] = (center_frequencies_[0] - FreqToMel(i * hz_per_sbin)) /
-            (center_frequencies_[0] - mel_low);
+                      (center_frequencies_[0] - mel_low);
       }
     }
   }
@@ -152,16 +150,16 @@ bool MfccMelFilterbank::Initialize(int input_length,
     }
   }
   if (!bad_channels.empty()) {
-    LOG(ERROR) << "Missing " << bad_channels.size() << " bands " <<
-        " starting at " << bad_channels[0] <<
-        " in mel-frequency design. " <<
-        "Perhaps too many channels or " <<
-        "not enough frequency resolution in spectrum. (" <<
-        "input_length: " << input_length <<
-        " input_sample_rate: " << input_sample_rate <<
-        " output_channel_count: " << output_channel_count <<
-        " lower_frequency_limit: " << lower_frequency_limit <<
-        " upper_frequency_limit: " << upper_frequency_limit;
+    LOG(ERROR) << "Missing " << bad_channels.size() << " bands "
+               << " starting at " << bad_channels[0]
+               << " in mel-frequency design. "
+               << "Perhaps too many channels or "
+               << "not enough frequency resolution in spectrum. ("
+               << "input_length: " << input_length
+               << " input_sample_rate: " << input_sample_rate
+               << " output_channel_count: " << output_channel_count
+               << " lower_frequency_limit: " << lower_frequency_limit
+               << " upper_frequency_limit: " << upper_frequency_limit;
   }
   initialized_ = true;
   return true;
@@ -171,7 +169,7 @@ bool MfccMelFilterbank::Initialize(int input_length,
 // square root, then summing FFT magnitudes under triangular integration windows
 // whose widths increase with frequency.
 void MfccMelFilterbank::Compute(const std::vector<double> &input,
-                            std::vector<double> *output) const {
+                                std::vector<double> *output) const {
   if (!initialized_) {
     LOG(ERROR) << "Mel Filterbank not initialized.";
     return;
diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank.h b/tensorflow/core/kernels/mfcc_mel_filterbank.h
index 33ea1bdb5bc3e2a2326913c99f2f6713bd82f096..37c3936e80d893a3c12b153ea92749ec4b73f872 100644
--- a/tensorflow/core/kernels/mfcc_mel_filterbank.h
+++ b/tensorflow/core/kernels/mfcc_mel_filterbank.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// Basic class for applying a mel-scale filterbank to an input.
+// Basic class for applying a mel-scale mapping to a power spectrum.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_MEL_FILTERBANK_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_MEL_FILTERBANK_H_
+#ifndef TENSORFLOW_CORE_KERNELS_MFCC_MEL_FILTERBANK_H_
+#define TENSORFLOW_CORE_KERNELS_MFCC_MEL_FILTERBANK_H_
 
 #include <vector>
 #include "tensorflow/core/framework/op_kernel.h"
@@ -27,13 +27,12 @@ class MfccMelFilterbank {
  public:
   MfccMelFilterbank();
   bool Initialize(int input_length,  // Number of unique FFT bins fftsize/2+1.
-                  double input_sample_rate,
-                  int output_channel_count,
-                  double lower_frequency_limit,
-                  double upper_frequency_limit);
+                  double input_sample_rate, int output_channel_count,
+                  double lower_frequency_limit, double upper_frequency_limit);
 
-  // Takes a magnitude spectrogram slice as input, computes a
-  // traingular mel filterbank and places the result in output.
+  // Takes a squared-magnitude spectrogram slice as input, computes a
+  // triangular-mel-weighted linear-magnitude filterbank, and places the result
+  // in output.
   void Compute(const std::vector<double>& input,
                std::vector<double>* output) const;
 
@@ -55,11 +54,11 @@ class MfccMelFilterbank {
   // FFT bin i contributes to the upper side of mel channel band_mapper_[i]
   std::vector<int> band_mapper_;
   int start_index_;  // Lowest FFT bin used to calculate mel spectrum.
-  int end_index_;  // Highest FFT bin used to calculate mel spectrum.
+  int end_index_;    // Highest FFT bin used to calculate mel spectrum.
 
   TF_DISALLOW_COPY_AND_ASSIGN(MfccMelFilterbank);
 };
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_MEL_FILTERBANK_H_
+#endif  // TENSORFLOW_CORE_KERNELS_MFCC_MEL_FILTERBANK_H_
diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc b/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc
index 602dfeb4e5400143a10232219f02c8e5d8154a04..54f31e1699ef1843d942f952f540b2d657b2d063 100644
--- a/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc
+++ b/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc
@@ -34,11 +34,9 @@ TEST(MfccMelFilterbankTest, AgreesWithPythonGoldenValues) {
     input.push_back(i + 1);
   }
   const int kChannelCount = 20;
-  filterbank.Initialize(input.size(),
-                        22050 /* sample rate */,
-                        kChannelCount /* channels */,
-                        20.0 /*  lower frequency limit */,
-                        4000.0 /* upper frequency limit */);
+  filterbank.Initialize(
+      input.size(), 22050 /* sample rate */, kChannelCount /* channels */,
+      20.0 /*  lower frequency limit */, 4000.0 /* upper frequency limit */);
 
   std::vector<double> output;
   filterbank.Compute(input, &output);
@@ -65,13 +63,10 @@ TEST(MfccMelFilterbankTest, IgnoresExistingContentOfOutputVector) {
   std::vector<double> input;
   std::vector<double> output;
 
-  filterbank.Initialize(kSampleCount,
-                        22050 /* sample rate */,
-                        20 /* channels */,
-                        20.0 /*  lower frequency limit */,
+  filterbank.Initialize(kSampleCount, 22050 /* sample rate */,
+                        20 /* channels */, 20.0 /*  lower frequency limit */,
                         4000.0 /* upper frequency limit */);
 
-
   // First call with nonzero input value, and an empty output vector,
   // will resize the output and fill it with the correct, nonzero outputs.
   input.assign(kSampleCount, 1.0);
diff --git a/tensorflow/core/kernels/mfcc_op_test.cc b/tensorflow/core/kernels/mfcc_op_test.cc
index 57391128f9e1471d863b566bebf6f061dd68a415..43e2a4594f0d9ffa15fd072396c29afac2488029 100644
--- a/tensorflow/core/kernels/mfcc_op_test.cc
+++ b/tensorflow/core/kernels/mfcc_op_test.cc
@@ -31,8 +31,8 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
-
-using namespace ops;  // NOLINT(build/namespaces)
+namespace ops {
+namespace {
 
 TEST(MfccOpTest, SimpleTest) {
   Scope root = Scope::DisabledShapeInferenceScope();
@@ -74,4 +74,6 @@ TEST(MfccOpTest, SimpleTest) {
       1e-3);
 }
 
+}  // namespace
+}  // namespace ops
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mfcc_test.cc b/tensorflow/core/kernels/mfcc_test.cc
index cb32df8811ed04363fd61490e3253dd31539460d..72c1d331d6e7bd91385aa268d7b59bbd786859b4 100644
--- a/tensorflow/core/kernels/mfcc_test.cc
+++ b/tensorflow/core/kernels/mfcc_test.cc
@@ -36,11 +36,10 @@ TEST(MfccTest, AgreesWithPythonGoldenValues) {
   std::vector<double> output;
   mfcc.Compute(input, &output);
 
-  std::vector<double> expected = {29.13970072, -6.41568601, -0.61903012,
-                             -0.96778652, -0.26819878, -0.40907028,
-                             -0.15614748, -0.23203119, -0.10481487,
-                             -0.1543029,  -0.0769791,  -0.10806114,
-                             -0.06047613};
+  std::vector<double> expected = {
+      29.13970072, -6.41568601, -0.61903012, -0.96778652, -0.26819878,
+      -0.40907028, -0.15614748, -0.23203119, -0.10481487, -0.1543029,
+      -0.0769791,  -0.10806114, -0.06047613};
 
   ASSERT_EQ(expected.size(), output.size());
   for (int i = 0; i < output.size(); ++i) {
diff --git a/tensorflow/core/kernels/mirror_pad_op.cc b/tensorflow/core/kernels/mirror_pad_op.cc
index fbdeaf43ebbfdcf6b76f97046130f40cf8c8efd1..26e1082989f317a35d55826a466cb8d9ef306c4c 100644
--- a/tensorflow/core/kernels/mirror_pad_op.cc
+++ b/tensorflow/core/kernels/mirror_pad_op.cc
@@ -87,8 +87,8 @@ class MirrorPadOp : public OpKernel {
       const Tpaddings before = paddings(d, 0);  // Pad before existing elements.
       const Tpaddings after = paddings(d, 1);   // Pad after existing elements.
       OP_REQUIRES(context, before >= 0 && after >= 0,
-                  errors::InvalidArgument("paddings must be non-negative: ",
-                                          before, " ", after));
+                  errors::InvalidArgument(
+                      "paddings must be non-negative: ", before, " ", after));
       if (offset_ == 0) {  // SYMMETRIC mode.
         OP_REQUIRES(context,
                     before <= in0.dim_size(d) && after <= in0.dim_size(d),
@@ -296,8 +296,8 @@ class MirrorPadGradOp : public OpKernel {
       const Tpaddings before = paddings(d, 0);  // Pad before existing elements.
       const Tpaddings after = paddings(d, 1);   // Pad after existing elements.
       OP_REQUIRES(context, before >= 0 && after >= 0,
-                  errors::InvalidArgument("Paddings must be non-negative: ",
-                                          before, ", ", after));
+                  errors::InvalidArgument(
+                      "Paddings must be non-negative: ", before, ", ", after));
 
       const int64 out_size = in0.dim_size(d) - (before + after);
       if (offset_ == 0) {  // SYMMETRIC mode.
diff --git a/tensorflow/core/kernels/mirror_pad_op_cpu_impl.h b/tensorflow/core/kernels/mirror_pad_op_cpu_impl.h
index bb22b2aa918dad379b80931ba0893feb9366489b..6716a26fac2c77ee1ee5306cc26cf802585dcfc4 100644
--- a/tensorflow/core/kernels/mirror_pad_op_cpu_impl.h
+++ b/tensorflow/core/kernels/mirror_pad_op_cpu_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_MIRROR_PAD_OP_CPU_IMPL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_MIRROR_PAD_OP_CPU_IMPL_H_
+#ifndef TENSORFLOW_CORE_MIRROR_PAD_OP_CPU_IMPL_H_
+#define TENSORFLOW_CORE_MIRROR_PAD_OP_CPU_IMPL_H_
 
 #define EIGEN_USE_THREADS
 
@@ -41,4 +41,4 @@ TF_CALL_NUMBER_TYPES(DEFINE_CPU_SPECS);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_MIRROR_PAD_OP_CPU_IMPL_H_
+#endif  // TENSORFLOW_CORE_MIRROR_PAD_OP_CPU_IMPL_H_
diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc
index 935eb81dd05897b49446cc285222a946be3d2931..b539b00009eb5cdc383aa557881e32782dce5193 100644
--- a/tensorflow/core/kernels/mkl_aggregate_ops.cc
+++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include <numeric>
-
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -29,10 +28,17 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-namespace tensorflow {
+#ifndef INTEL_MKL_ML
+#include "mkldnn.hpp"
+using mkldnn::stream;
+using mkldnn::sum;
+#endif
 
+namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
+#ifdef INTEL_MKL_ML
+
 template <typename Device, typename T>
 class MklAddNOp : public OpKernel {
  public:
@@ -41,40 +47,49 @@ class MklAddNOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     const int num = ctx->num_inputs();
     OP_REQUIRES(ctx, num / 2 == 2,
-                errors::InvalidArgument("Only additions of two arguments "
+                errors::InvalidArgument("Only additions of two tensors "
                                         "supported by MKL. Num inputs: ",
                                         num));
 
     MklAddNOpContext mkl_context;
-    const Tensor& input0 = MklGetInput(ctx, 0);
-    GetMklShape(ctx, 0, &(mkl_context.input1_shape));
+    size_t src1_idx = 0, src2_idx = 1;
+    const Tensor& input0 = MklGetInput(ctx, src1_idx);
+    GetMklShape(ctx, src1_idx, &(mkl_context.input1_shape));
     bool input1_in_mkl_format = mkl_context.input1_shape.IsMklTensor();
 
-    const Tensor& input1 = MklGetInput(ctx, 1);
-    GetMklShape(ctx, 1, &(mkl_context.input2_shape));
+    const Tensor& input1 = MklGetInput(ctx, src2_idx);
+    GetMklShape(ctx, src2_idx, &(mkl_context.input2_shape));
     bool input2_in_mkl_format = mkl_context.input2_shape.IsMklTensor();
 
+    // if the shapes of two tensors are not same raise op error
+    TensorShape src1_shape, src2_shape;
+    src1_shape = input0.shape();
+    src2_shape = input1.shape();
+    if (!src1_shape.IsSameSize(src2_shape)) {
+      ctx->SetStatus(errors::InvalidArgument(
+          "Inputs to operation ", this->name(), " of type ",
+          this->type_string(), " must have the same size and shape.  Input 0: ",
+          src1_shape.DebugString(), " != input 1: ", src2_shape.DebugString()));
+    }
     // handle the case of a scalar
     if (!input1_in_mkl_format && input0.dims() == 0) {
       const TensorShape& o_shape = input0.shape();
       Tensor* out_tensor = nullptr;
       mkl_context.output_shape.SetMklTensor(false);
-      AllocateOutputSetMklShape(ctx, 0, &out_tensor, o_shape,
+      AllocateOutputSetMklShape(ctx, src1_idx, &out_tensor, o_shape,
                                 mkl_context.output_shape);
       float user_i1 = (input0.scalar<T>()());
-      ;
       float user_i2 = (input1.scalar<T>()());
-      ;
       out_tensor->scalar<T>()() = std::plus<float>{}(user_i1, user_i2);
       return;
     }
 
     mkl_context.in_dims = input1_in_mkl_format
-        ? mkl_context.input1_shape.GetDimension()
-        : input0.dims();
+                              ? mkl_context.input1_shape.GetDimension()
+                              : input0.dims();
     mkl_context.in_dims = input2_in_mkl_format
-        ? mkl_context.input2_shape.GetDimension()
-        : input1.dims();
+                              ? mkl_context.input2_shape.GetDimension()
+                              : input1.dims();
 
     // If there is nothing to compute, return.
     if (!input1_in_mkl_format && !input2_in_mkl_format) {
@@ -82,7 +97,7 @@ class MklAddNOp : public OpKernel {
       if (o_shape.num_elements() == 0) {
         Tensor* out_tensor = nullptr;
         mkl_context.output_shape.SetMklTensor(false);
-        AllocateOutputSetMklShape(ctx, 0, &out_tensor, o_shape,
+        AllocateOutputSetMklShape(ctx, src1_idx, &out_tensor, o_shape,
                                   mkl_context.output_shape);
         return;
       }
@@ -110,7 +125,6 @@ class MklAddNOp : public OpKernel {
             mkl_context.in_strides[i - 1] * mkl_context.in_sizes[i - 1];
       }
     }
-
     std::vector<float> coeff(2, 1.0);
     mkl_context.MklCreateInputLayouts(ctx);
     CHECK_EQ(dnnSumCreate_F32(&mkl_context.Eltwise, mkl_context.attributes, 2,
@@ -119,32 +133,33 @@ class MklAddNOp : public OpKernel {
 
     Tensor mkl_tmp_input1_buf_tensor, mkl_tmp_input2_buf_tensor;
     mkl_context.MklPrepareAddNInputs(ctx, &mkl_tmp_input1_buf_tensor,
-    &mkl_tmp_input2_buf_tensor);
+                                     &mkl_tmp_input2_buf_tensor);
     Tensor* output = nullptr;
     if (input1_in_mkl_format || input2_in_mkl_format) {
-     TensorShape tf_shape;
-     mkl_context.output_shape.SetMklTensor(true);
-     mkl_context.output_shape.SetMklLayout(mkl_context.Eltwise, dnnResourceDst);
-
-     mkl_context.output_shape.SetTfLayout(
-         mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
-     if (input1_in_mkl_format == true) {
-      mkl_context.output_shape.SetTfDimOrder(mkl_context.in_dims,
-      mkl_context.input1_shape.GetTfToMklDimMap());
-     } else {
-      mkl_context.output_shape.SetTfDimOrder(mkl_context.in_dims,
-      mkl_context.input2_shape.GetTfToMklDimMap());
-     }
-     tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                        mkl_context.output_shape.GetMklLayout())) /
-                    sizeof(T));
-
-     AllocateOutputSetMklShape(ctx, 0, &output, tf_shape,
-                              mkl_context.output_shape);
+      TensorShape tf_shape;
+      mkl_context.output_shape.SetMklTensor(true);
+      mkl_context.output_shape.SetMklLayout(mkl_context.Eltwise,
+                                            dnnResourceDst);
+
+      mkl_context.output_shape.SetTfLayout(
+          mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
+      if (input1_in_mkl_format == true) {
+        mkl_context.output_shape.SetTfDimOrder(
+            mkl_context.in_dims, mkl_context.input1_shape.GetTfToMklDimMap());
+      } else {
+        mkl_context.output_shape.SetTfDimOrder(
+            mkl_context.in_dims, mkl_context.input2_shape.GetTfToMklDimMap());
+      }
+      tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
+                          mkl_context.output_shape.GetMklLayout())) /
+                      sizeof(T));
+
+      AllocateOutputSetMklShape(ctx, src1_idx, &output, tf_shape,
+                                mkl_context.output_shape);
     } else {
-     const TensorShape& o_shape = input1.shape();
-     mkl_context.output_shape.SetMklTensor(false);
-     AllocateOutputSetMklShape(ctx, 0, &output, o_shape,
+      const TensorShape& o_shape = input1.shape();
+      mkl_context.output_shape.SetMklTensor(false);
+      AllocateOutputSetMklShape(ctx, src1_idx, &output, o_shape,
                                 mkl_context.output_shape);
     }
 
@@ -261,15 +276,224 @@ class MklAddNOp : public OpKernel {
         delete[] in_strides;
       }
       if (!input1_in_mkl_format) {
-         dnnLayoutDelete_F32(lt_input1);
+        dnnLayoutDelete_F32(lt_input1);
       }
       if (!input2_in_mkl_format) {
-         dnnLayoutDelete_F32(lt_input2);
+        dnnLayoutDelete_F32(lt_input2);
       }
     }
   } MklAddNOpContext;
 };
 
+#else  // INTEL_MKL_ML
+template <typename Device, typename T>
+class MklAddNOp : public OpKernel {
+ public:
+  ~MklAddNOp() {}
+  explicit MklAddNOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const int num = ctx->num_inputs();
+    // Only additions of 2 input tensors is supported now
+    OP_REQUIRES(ctx, num / 2 == 2,
+                errors::InvalidArgument("Only additions of two tensors "
+                                        "supported by MKL. Num inputs: ",
+                                        num));
+
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      size_t src1_idx = 0, src2_idx = 1, output_idx = 0;
+      const Tensor& src1_tensor = MklGetInput(ctx, src1_idx);
+      const Tensor& src2_tensor = MklGetInput(ctx, src2_idx);
+
+      MklDnnShape src1_mkl_shape, src2_mkl_shape;
+      GetMklShape(ctx, src1_idx, &src1_mkl_shape);
+      GetMklShape(ctx, src2_idx, &src2_mkl_shape);
+      bool input1_in_mkl_format = src1_mkl_shape.IsMklTensor();
+      bool input2_in_mkl_format = src2_mkl_shape.IsMklTensor();
+      int src1_dims_size = input1_in_mkl_format ? src1_mkl_shape.GetDimension()
+                                                : src1_tensor.dims();
+      int src2_dims_size = input2_in_mkl_format ? src2_mkl_shape.GetDimension()
+                                                : src2_tensor.dims();
+      // if the shapes of two tensors are not same raise op error
+      TensorShape src1_shape, src2_shape;
+      src1_shape = input1_in_mkl_format ? src1_mkl_shape.GetTfShape()
+                                        : src1_tensor.shape();
+      src2_shape = input2_in_mkl_format ? src2_mkl_shape.GetTfShape()
+                                        : src2_tensor.shape();
+
+      if (!src1_shape.IsSameSize(src2_shape)) {
+        ctx->SetStatus(errors::InvalidArgument(
+            "Inputs to operation ", this->name(), " of type ",
+            this->type_string(),
+            " must have the same size and shape.  Input 0: ",
+            src1_shape.DebugString(),
+            " != input 1: ", src2_shape.DebugString()));
+      }
+
+      if (!input1_in_mkl_format && src1_dims_size == 0) {
+        Tensor* dst_tensor = nullptr;
+        MklShape mkl_shape_dst;
+        mkl_shape_dst.SetMklTensor(false);
+        AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor,
+                                  src1_tensor.shape(), mkl_shape_dst);
+        float user_i1 = (src1_tensor.scalar<T>()());
+        float user_i2 = (src2_tensor.scalar<T>()());
+        dst_tensor->scalar<T>()() = std::plus<float>{}(user_i1, user_i2);
+        return;
+      }
+
+      // If there is nothing to compute, return.
+      if (!input1_in_mkl_format && !input2_in_mkl_format) {
+        if (src1_tensor.shape().num_elements() == 0) {
+          Tensor* dst_tensor = nullptr;
+          MklShape mkl_shape_dst;
+          mkl_shape_dst.SetMklTensor(false);
+          AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor,
+                                    src1_tensor.shape(), mkl_shape_dst);
+          return;
+        }
+      }
+
+      std::vector<double> coeff(2, 1.0);
+      MklDnnData<T> src1(&cpu_engine);
+      MklDnnData<T> src2(&cpu_engine);
+      MklDnnData<T> dst(&cpu_engine);
+
+      int tmp_size = input1_in_mkl_format ? src2_dims_size : src1_dims_size;
+      memory::dims dims(tmp_size);
+      memory::dims strides(tmp_size);
+      memory::desc md1({}, memory::data_undef, memory::format_undef);
+      memory::desc md2({}, memory::data_undef, memory::format_undef);
+
+      // For creating Sum primitive, we need to ensure that all inputs are in
+      // same format. What that means is if we have a mixed input case - where
+      // one input is in Tensorflow format and one input is in MKL format -,
+      // then we need to ensure that all inputs are in same format for
+      // primitive construction. For performance reason, we say that all inputs
+      // are in MKL format in such case, and insert reorder for input that is
+      // in Tensorflow format into MKL format. On the other hand, if both the
+      // inputs are in MKL format or both are in Tensorflow format, then we
+      // dont need reorder.
+      if (!input1_in_mkl_format && !input2_in_mkl_format) {
+        // If both the inputs are in Tensorflow format, we create blocked memory
+        // descriptor.
+        dims = TFShapeToMklDnnDims(src1_tensor.shape());
+        strides = CalculateTFStrides(dims);
+        md1 = MklDnnData<T>::CreateBlockedMemDesc(dims, strides);
+        md2 = md1;
+      } else if (input1_in_mkl_format && !input2_in_mkl_format) {
+        // If one input is in MKL format and other is in Tensorflow, then
+        // create respective descriptors describing the actual case. For input
+        // in Mkl format, we just get Mkl layout from MklDnnShape. For input in
+        // Tensorflow format, we create memory descriptor using data format.
+        md1 = src1_mkl_shape.GetMklLayout();
+
+        memory::format src1_mkl_data_format = src1_mkl_shape.GetTfDataFormat();
+        auto src1_tf_data_format =
+            MklDnnDataFormatToTFDataFormat(src1_mkl_data_format);
+        auto src2_dims =
+            TFShapeToMklDnnDimsInNCHW(src2_tensor.shape(), src1_tf_data_format);
+        md2 = memory::desc(src2_dims, MklDnnType<T>(), src1_mkl_data_format);
+      } else if (input2_in_mkl_format && !input1_in_mkl_format) {
+        // Same comment as above.
+        memory::format src2_mkl_data_format = src2_mkl_shape.GetTfDataFormat();
+        auto src2_tf_data_format =
+            MklDnnDataFormatToTFDataFormat(src2_mkl_data_format);
+        auto src1_dims =
+            TFShapeToMklDnnDimsInNCHW(src1_tensor.shape(), src2_tf_data_format);
+        md1 = memory::desc(src1_dims, MklDnnType<T>(), src2_mkl_data_format);
+
+        md2 = src2_mkl_shape.GetMklLayout();
+      } else {
+        // If both the inputs are in MKL format, we use Mkl layout of the input
+        // tensors.
+        md1 = src1_mkl_shape.GetMklLayout();
+        md2 = src2_mkl_shape.GetMklLayout();
+      }
+      src1.SetUsrMem(md1, &src1_tensor);
+      src2.SetUsrMem(md2, &src2_tensor);
+
+      // As per comment above, we tell MKLDNN that both the inputs are in same
+      // format. So we set common memory descriptor in MKL format, if any of the
+      // inputs are in MKL format. Let's get memory descriptor that we will use
+      // for both the inputs.
+      // We set output memory descriptor in MKL format, if any of the
+      // inputs are in MKL format.
+      memory::desc common_md({}, memory::data_undef, memory::format_undef);
+      if (input1_in_mkl_format || input2_in_mkl_format) {
+        common_md = input1_in_mkl_format ? md1 : md2;
+        dst.SetUsrMem(common_md);
+      } else {
+        // Since both the inputs are in Tensorflow format, and have
+        // same shape, we can get memory descriptor from any input.
+        common_md = md1;
+        dst.SetUsrMem(common_md);
+      }
+
+      std::vector<memory::primitive_desc> srcs_pd;
+      // Memory descriptor for 1st input
+      srcs_pd.push_back(memory::primitive_desc(common_md, cpu_engine));
+      // Memory descriptor for 2nd input
+      srcs_pd.push_back(memory::primitive_desc(common_md, cpu_engine));
+      auto sum_pd = sum::primitive_desc(dst.GetUsrMemDesc(), coeff, srcs_pd);
+
+      // Now we setup resources for primitive execution.
+      // First, we need to check if any of the inputs need to be reordered as
+      // per the logic described above. Since output will be in MKL format if
+      // atleast one input is in MKL format, we choose output descriptor for
+      // reorder.
+      std::vector<primitive::at> inputs;
+      std::vector<primitive> net;
+      // Check if actual input format of the tensor is different than common_pd
+      // we told MKLDNN. In that case, we will need reorder.
+      src1.CheckReorderToOpMem(srcs_pd[0], &net);
+      src2.CheckReorderToOpMem(srcs_pd[1], &net);
+      inputs.push_back(src1.GetOpMem());
+      inputs.push_back(src2.GetOpMem());
+
+      // Allocate output tensor now.
+      Tensor* dst_tensor = nullptr;
+      MklDnnShape output_mkl_shape;
+      TensorShape output_tf_shape;
+
+      if (input2_in_mkl_format || input1_in_mkl_format) {
+        output_mkl_shape.SetMklTensor(true);
+        auto output_pd = dst.GetUsrMemPrimDesc();
+        output_mkl_shape.SetMklLayout(&output_pd);
+        output_mkl_shape.SetElemType(MklDnnType<T>());
+        if (input1_in_mkl_format) {
+          output_mkl_shape.SetTfLayout(src1_dims_size,
+                                       src1_mkl_shape.GetSizesAsMklDnnDims(),
+                                       src1_mkl_shape.GetTfDataFormat());
+        } else {
+          output_mkl_shape.SetTfLayout(src2_dims_size,
+                                       src2_mkl_shape.GetSizesAsMklDnnDims(),
+                                       src2_mkl_shape.GetTfDataFormat());
+        }
+        output_tf_shape.AddDim((output_pd.get_size() / sizeof(T)));
+      } else {
+        output_mkl_shape.SetMklTensor(false);
+        output_tf_shape = src1_tensor.shape();
+      }
+      AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor, output_tf_shape,
+                                output_mkl_shape);
+      dst.SetUsrMemDataHandle(dst_tensor);
+
+      // Create Sum op, and submit net for execution.
+      net.push_back(sum(sum_pd, inputs, dst.GetOpMem()));
+      stream(stream::kind::eager).submit(net).wait();
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          ctx, errors::Aborted("Operation received an exception:", error_msg));
+    }
+  }
+};
+
+#endif
 #define REGISTER_MKL_CPU(T)                                         \
   REGISTER_KERNEL_BUILDER(Name("_MklAddN")                          \
                               .Device(DEVICE_CPU)                   \
diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc
index d90baee069c17e9b25169dcb2650681f6103f9b1..d545d34fdfd8682b2e5b856d321579f675696e2f 100644
--- a/tensorflow/core/kernels/mkl_avgpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc
@@ -24,10 +24,24 @@
 
 #include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
 
+#ifndef INTEL_MKL_ML
+#include "mkldnn.hpp"
+using mkldnn::algorithm;
+using mkldnn::engine;
+using mkldnn::error;
+using mkldnn::memory;
+using mkldnn::padding_kind;
+using mkldnn::pooling_backward;
+using mkldnn::pooling_forward;
+using mkldnn::prop_kind;
+#endif
+
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
+#ifdef INTEL_MKL_ML
+
 template <typename Device, typename T>
 class MklAvgPoolingOp : public OpKernel {
  public:
@@ -132,7 +146,7 @@ class MklAvgPoolingOp : public OpKernel {
         E_SUCCESS);
 
     mkl_context.MklCleanup();
-  }
+  }  // Compute
 
  private:
   typedef struct {
@@ -343,10 +357,11 @@ class MklAvgPoolingGradOp : public OpKernel {
       if (!outbackprop_in_mkl_format) {
         // For avgpooling, tensor_in_shape should have 1 dimension, and 4
         // elements.
-        OP_REQUIRES(context, tensor_in_shape.dims() == 1 &&
-                                 tensor_in_shape.NumElements() == 4,
-                    errors::InvalidArgument("original input shape must be "
-                                            "1-dimensional and 4 elements"));
+        OP_REQUIRES(
+            context,
+            tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4,
+            errors::InvalidArgument("original input shape must be "
+                                    "1-dimensional and 4 elements"));
 
         // For avgpooling, out_backprop should have 4 dimensions.
         OP_REQUIRES(context, out_backprop.dims() == 4,
@@ -411,7 +426,280 @@ class MklAvgPoolingGradOp : public OpKernel {
   std::vector<int32> stride_;
   Padding padding_;
   TensorFormat data_format_;
-};
+};  // MklAvgPoolingGradOp
+
+#else
+
+template <typename Device, typename T>
+class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
+ public:
+  explicit MklAvgPoolingOp(OpKernelConstruction* context)
+      : MklPoolingForwardOpBase<T>(context) {
+    // Workspace is an MKLDNN construct that is only used in Max Pooling.
+    // So set workspace_enabled_ to false.
+    this->workspace_enabled_ = false;
+  }
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      const Tensor& input_tensor =
+          MklGetInput(context, this->kInputTensorIndexInput);
+      MklDnnShape dnn_shape_input;
+      GetMklShape(context, this->kInputTensorIndexInput, &dnn_shape_input);
+      this->SanityCheckInput(context, input_tensor, dnn_shape_input);
+      if (!context->status().ok()) return;
+
+      MklDnnData<T> dnn_data_input(&cpu_engine);
+      MklDnnData<T> dnn_data_output(&cpu_engine);
+
+      // initialize variables for the pooling op
+      MklPoolParameters pool_params;
+      // Get the input tensor and initialize the pooling parameters
+      this->ConfigureInput(context, dnn_shape_input, input_tensor, &pool_params,
+                           &dnn_data_input);
+      OP_REQUIRES_OK(context, context->status());
+
+      // Declare output tensor
+      Tensor* output_tensor = nullptr;
+      memory::dims output_dims_mkl_order;
+      this->GetOutputDims(pool_params, &output_dims_mkl_order);
+
+      // If input is an empty tensor, allocate an empty output tensor and return
+      if (input_tensor.NumElements() == 0) {
+        MklDnnShape output_mkl_shape;
+        output_mkl_shape.SetMklTensor(false);
+        TensorShape output_tf_shape;
+        if (pool_params.data_format == TensorFormat::FORMAT_NCHW) {
+          output_tf_shape = MklDnnDimsToTFShape(output_dims_mkl_order);
+        } else {
+          memory::dims output_dims_NHWC_order;
+          output_dims_NHWC_order = {pool_params.tensor_in_batch,
+                                    static_cast<int>(pool_params.out_height),
+                                    static_cast<int>(pool_params.out_width),
+                                    pool_params.out_depth};
+          output_tf_shape = MklDnnDimsToTFShape(output_dims_NHWC_order);
+        }
+        const int kOutputIndex = 0;
+        AllocateOutputSetMklShape(context, kOutputIndex, &output_tensor,
+                                  output_tf_shape, output_mkl_shape);
+        CHECK_NOTNULL(output_tensor);
+        return;
+      }
+
+      // If input is in Mkl layout, then just get the memory format from it
+      // directly, instead of using input data_format to AvgPool.
+      if (dnn_shape_input.IsMklTensor()) {
+        dnn_data_output.SetUsrMem(
+            output_dims_mkl_order,
+            static_cast<memory::format>(
+                dnn_data_input.GetUsrMemDesc().data.format));
+
+      } else {
+        dnn_data_output.SetUsrMem(output_dims_mkl_order,
+                                  this->data_format_mkldnn_);
+      }
+
+      // describe the memory layout
+      dnn_data_output.SetOpMemDesc(output_dims_mkl_order, memory::format::any);
+
+      // 3. create a pooling primitive descriptor
+      auto pool_desc = pooling_forward::desc(
+          prop_kind::forward, algorithm::pooling_avg_exclude_padding,
+          dnn_data_input.GetUsrMemDesc(), dnn_data_output.GetUsrMemDesc(),
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_prim_desc =
+          pooling_forward::primitive_desc(pool_desc, cpu_engine);
+
+      this->AllocateOutputTensor(context, pool_prim_desc, output_dims_mkl_order,
+                                 this->data_format_mkldnn_, &output_tensor);
+      CHECK_NOTNULL(output_tensor);
+
+      OP_REQUIRES_OK(context, context->status());
+      dnn_data_output.SetUsrMemDataHandle(output_tensor);
+
+      this->PrepareAndExecuteNet(pool_prim_desc, &dnn_data_input,
+                                 &dnn_data_output);
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
+    }
+  }  // Compute
+};   // MklAvgPoolingOp
+
+//-----------------------------------------------------------------------------
+
+template <class Device, class T>
+class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
+ public:
+  explicit MklAvgPoolingGradOp(OpKernelConstruction* context)
+      : MklPoolingBackwardOpBase<T>(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      MklDnnShape original_input_mkl_shape, input_gradient_mkl_shape;
+      const Tensor& tensor_in_shape =
+          MklGetInput(context, kInputTensorIndexInputShape);
+      const Tensor& input_gradient_tensor =
+          MklGetInput(context, kInputTensorIndexInputGradient);
+      GetMklShape(context, kInputTensorIndexInputShape,
+                  &original_input_mkl_shape);
+      GetMklShape(context, kInputTensorIndexInputGradient,
+                  &input_gradient_mkl_shape);
+
+      SanityCheckInputs(context, tensor_in_shape, input_gradient_tensor,
+                        original_input_mkl_shape, input_gradient_mkl_shape);
+      if (!context->status().ok()) return;
+
+      // Used to allocate output_diff_src/diff_src
+      // and create pool_fwd mdm desc
+      // 0. Input("orig_input_shape: int32") //NOT a T Tensor!
+      // 1. Input("grad: T")
+
+      MklDnnData<T> input_gradient_diff_dst(&cpu_engine);
+      MklDnnData<T> output_diff_src(&cpu_engine);
+      Tensor* output_tensor_diff_src = nullptr;
+      TensorShape original_input_shape;
+      MklPoolParameters pool_params;
+      memory::dims output_dims_mkl_order, original_input_dims_nchw;
+      // Configure the original input memory descriptor
+      memory::desc original_input_md = ConfigureOriginalInput(
+          context, tensor_in_shape, original_input_mkl_shape,
+          &original_input_dims_nchw, &pool_params, &original_input_shape);
+
+      // configure the original output memory descriptor
+      // by definition, the shape of the original output is the same
+      // as the shape of the gradient diff_dst
+      memory::desc original_output_md = this->ConfigureOriginalOutput(
+          pool_params, input_gradient_mkl_shape, output_dims_mkl_order);
+
+      memory::desc target_diff_dst_md = this->ConfigureInputGradient(
+          input_gradient_mkl_shape, input_gradient_tensor,
+          &input_gradient_diff_dst, original_output_md);
+      // The shape of the output diff src needs to be the same shape as the
+      // original input. But we will set its format to be same as the format of
+      // input gradient. We won't use format of original input since it will
+      // always be in Tensorflow layout (given that AvgPoolGrad gets shape of
+      // the input rather than actual input).
+      output_diff_src.SetUsrMem(
+          original_input_dims_nchw,
+          static_cast<memory::format>(target_diff_dst_md.data.format));
+
+      // Create the forward pooling primitive descriptor so we can reference it
+      // in the backward pooling primitive descriptor
+      auto pool_fwd_desc = pooling_forward::desc(
+          prop_kind::forward, algorithm::pooling_avg_exclude_padding,
+          original_input_md, original_output_md,
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_fwd_prim_desc =
+          pooling_forward::primitive_desc(pool_fwd_desc, cpu_engine);
+
+      auto pool_bkwd_desc = pooling_backward::desc(
+          algorithm::pooling_avg_exclude_padding,
+          output_diff_src.GetUsrMemDesc(), target_diff_dst_md,
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_bkwd_prim_desc = pooling_backward::primitive_desc(
+          pool_bkwd_desc, cpu_engine, pool_fwd_prim_desc);
+      this->AllocateOutputTensor(
+          context, pool_bkwd_prim_desc, original_input_dims_nchw,
+          this->data_format_mkldnn_, &output_tensor_diff_src);
+
+      output_diff_src.SetUsrMemDataHandle(output_tensor_diff_src);
+
+      this->PrepareAndExecuteNet(
+          pool_bkwd_prim_desc, &input_gradient_diff_dst, &output_diff_src,
+          memory::primitive_desc(target_diff_dst_md, cpu_engine));
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:",
+                                              error_msg));
+    }
+  }  // Compute
+
+ private:
+  // 0. Input("orig_input_shape: int32")
+  // 1. Input("grad: T")
+  const int kInputTensorIndexInputShape = 0;
+  const int kInputTensorIndexInputGradient = 1;
+
+  memory::desc ConfigureOriginalInput(
+      OpKernelContext* context, const Tensor& tensor_original_input_shape,
+      const MklDnnShape& original_input_mkl_shape,
+      memory::dims* original_input_dims_mkl_order,
+      MklPoolParameters* pool_params, TensorShape* input_tensor_shape) {
+    CHECK_NOTNULL(original_input_dims_mkl_order);
+    CHECK_NOTNULL(pool_params);
+    CHECK_NOTNULL(input_tensor_shape);
+    // For AvgPoolGrad, we only get the size of the original input because
+    // The original data is irrelvant.
+    auto shape_vec = tensor_original_input_shape.vec<int32>();
+    for (int64 i = 0; i < tensor_original_input_shape.NumElements(); ++i) {
+      input_tensor_shape->AddDim(shape_vec(i));
+    }
+
+    return MklPoolingBackwardOpBase<T>::ConfigureOriginalInput(
+        context, tensor_original_input_shape, original_input_mkl_shape,
+        original_input_dims_mkl_order, pool_params, *input_tensor_shape);
+  }
+
+  void SanityCheckInputs(OpKernelContext* context,
+                         const Tensor& tensor_in_shape,
+                         const Tensor& input_gradient_tensor,
+                         const MklDnnShape& original_input_mkl_shape,
+                         const MklDnnShape& input_gradient_mkl_shape) {
+    if (!original_input_mkl_shape.IsMklTensor()) {
+      OP_REQUIRES(
+          context,
+          tensor_in_shape.dims() == 1 && tensor_in_shape.NumElements() == 4,
+          errors::InvalidArgument("original input shape must be "
+                                  "1-dimensional and 4 elements"));
+    } else {
+      OP_REQUIRES(context,
+                  original_input_mkl_shape.GetDimension() == 1 &&
+                      original_input_mkl_shape.DimSize(0) == 4,
+                  errors::InvalidArgument("original input shape must be "
+                                          "1-dimensional and 4 elements"));
+    }
+
+    if (!input_gradient_mkl_shape.IsMklTensor()) {
+      // For avgpooling, input_gradient_diff_dst should have 4 dimensions.
+      OP_REQUIRES(context, input_gradient_tensor.dims() == 4,
+                  errors::InvalidArgument("Gradient shape must be "
+                                          "4-dimensional"));
+    } else {
+      OP_REQUIRES(context, input_gradient_mkl_shape.GetDimension() == 4,
+                  errors::InvalidArgument("Gradient shape must be "
+                                          "4-dimensional"));
+    }
+  }
+};  // MklAvgPoolingGradOp
+
+#endif  // INTEL_MKL_ML
 
 REGISTER_KERNEL_BUILDER(Name("_MklAvgPool")
                             .Device(DEVICE_CPU)
diff --git a/tensorflow/core/kernels/mkl_batch_matmul_op.cc b/tensorflow/core/kernels/mkl_batch_matmul_op.cc
index 138acdf29885cbd526086ce092e930b0c24aac13..d9713075be6e20b77ea681a0e71baa21b7b9eea9 100644
--- a/tensorflow/core/kernels/mkl_batch_matmul_op.cc
+++ b/tensorflow/core/kernels/mkl_batch_matmul_op.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #if defined(INTEL_MKL)
 #include <vector>
 #include "mkl_cblas.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -39,7 +40,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 #define MKL_Complex8 tensorflow::complex64
 #define MKL_Complex16 tensorflow::complex128
@@ -72,10 +72,10 @@ class BatchMatMulMkl : public OpKernel {
     TensorShape out_shape;
     for (int i = 0; i < ndims - 2; ++i) {
       OP_REQUIRES(ctx, lhs.dim_size(i) == rhs.dim_size(i),
-                  errors::InvalidArgument("lhs.dim(", i, ") and rhs.dim(", i,
-                                          ") must be the same: ",
-                                          lhs.shape().DebugString(), " vs ",
-                                          rhs.shape().DebugString()));
+                  errors::InvalidArgument(
+                      "lhs.dim(", i, ") and rhs.dim(", i,
+                      ") must be the same: ", lhs.shape().DebugString(), " vs ",
+                      rhs.shape().DebugString()));
       out_shape.AddDim(lhs.dim_size(i));
     }
     auto batch_size = (ndims == 2) ? 1 : out_shape.num_elements();
@@ -109,7 +109,7 @@ class BatchMatMulMkl : public OpKernel {
     const uint64 M = lhs_reshaped.dimension(adj_x_ ? 2 : 1);
     const uint64 K = lhs_reshaped.dimension(adj_x_ ? 1 : 2);
     const uint64 N = rhs_reshaped.dimension(adj_y_ ? 1 : 2);
-    
+
     std::vector<MKL_INT> m_array(batch_size, M);
     std::vector<MKL_INT> n_array(batch_size, N);
     std::vector<MKL_INT> k_array(batch_size, K);
@@ -128,7 +128,7 @@ class BatchMatMulMkl : public OpKernel {
       b_array.push_back(&rhs_reshaped(i, 0, 0));
       c_array.push_back(&out_reshaped(i, 0, 0));
     }
-    
+
     MklCblasGemmBatch(CblasRowMajor, adj_x_, adj_y_, &m_array[0], &n_array[0],
                       &k_array[0], &a_array[0], &lda_array[0], &b_array[0],
                       &ldb_array[0], &c_array[0], &ldc_array[0], 1,
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index e6673b2ffb7dc4a2e0127c363b4402c98a023b17..f1f267e849aa39b43c153b857493160e0d103970 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -1,11 +1,8 @@
 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -33,9 +30,19 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
+#ifndef INTEL_MKL_ML
+#include "mkldnn.hpp"
+
+using mkldnn::concat;
+using mkldnn::stream;
+#endif
+
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
+// List of TensorShape objects. Used in Concat/Split layers.
+typedef std::vector<TensorShape> TensorShapeList;
+
 enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM };
 
 // TODO(intelft) Check if we can reuse existing EigenConcatOp using Mutable
@@ -55,6 +62,8 @@ class EigenConcatBaseOp : public OpKernel {
   // we need to have empty Compute because Compute is pure virtual function.
   void Compute(OpKernelContext* c) {}
 
+#ifdef INTEL_MKL_ML
+
   void Compute(OpKernelContext* c, const std::vector<Tensor>& values) {
     const Tensor* concat_dim_tensor;
     const char* axis_attribute_name =
@@ -139,8 +148,90 @@ class EigenConcatBaseOp : public OpKernel {
       ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
     }
   }
+
+#else  // MKL_DNN
+
+  void Compute(OpKernelContext* c, const std::vector<Tensor>& values,
+               const TensorShapeList& input_shapes) {
+    const Tensor* concat_dim_tensor;
+    const char* axis_attribute_name =
+        AxisArgName == NAME_IS_AXIS
+            ? "axis"
+            : AxisArgName == NAME_IS_CONCAT_DIM ? "concat_dim" : "<invalid>";
+    OP_REQUIRES_OK(c, c->input(axis_attribute_name, &concat_dim_tensor));
+    OP_REQUIRES(c, IsLegacyScalar(concat_dim_tensor->shape()),
+                errors::InvalidArgument(
+                    axis_attribute_name,
+                    " tensor should be a scalar integer, but got shape ",
+                    concat_dim_tensor->shape().DebugString()));
+    const int32 concat_dim =
+        internal::SubtleMustCopy(concat_dim_tensor->scalar<int32>()());
+    // Instead of accessing values from context, we use input to Compute.
+    const int N = values.size();
+    const int input_dims = input_shapes[0].dims();
+    const TensorShape& input_shape = input_shapes[0];
+
+    int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim;
+    OP_REQUIRES(c,
+                (0 <= axis && axis < input_dims) ||
+                    (allow_legacy_scalars() && concat_dim == 0),
+                errors::InvalidArgument(
+                    "ConcatOp : Expected concatenating dimensions in the range "
+                    "[",
+                    -input_dims, ", ", input_dims, "), but got ", concat_dim));
+    // Note that we reduce the concat of n-dimensional tensors into a two
+    // dimensional concat. Assuming the dimensions of any input/output
+    // tensor are {x0, x1,...,xn-1, y0, y1,...,ym-1}, where the concat is along
+    // the dimension indicated with size y0, we flatten it to {x, y}, where y =
+    // Prod_i(yi) and x = ((n > 0) ? Prod_i(xi) : 1).
+    ConstMatrixVector inputs_flat;
+    inputs_flat.reserve(N);
+    int64 inputs_flat_dim0 = 1;
+    for (int d = 0; d < axis; ++d) {
+      inputs_flat_dim0 *= input_shape.dim_size(d);
+    }
+    int64 output_concat_dim = 0;
+    const bool input_is_scalar = IsLegacyScalar(input_shape);
+    for (int i = 0; i < N; ++i) {
+      const auto in = values[i];
+      const bool in_is_scalar = IsLegacyScalar(input_shapes[i]);
+      OP_REQUIRES(
+          c,
+          (input_shapes[i].dims() == input_dims) ||
+              (input_is_scalar && in_is_scalar),
+          errors::InvalidArgument(
+              "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
+              input_shape.DebugString(), " vs. shape[", i,
+              "] = ", input_shapes[i].DebugString()));
+      if (in.NumElements() > 0) {
+        int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0;
+        inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+            in.shaped<T, 2>({inputs_flat_dim0, inputs_flat_dim1})));
+      }
+      output_concat_dim +=
+          input_shapes[i].dims() > 0 ? input_shapes[i].dim_size(axis) : 1;
+    }
+
+    TensorShape output_shape(input_shape);
+    if (output_shape.dims() == 0) {
+      output_shape.AddDim(output_concat_dim);
+    } else {
+      output_shape.set_dim(axis, output_concat_dim);
+    }
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output));
+    if (output->NumElements() > 0) {
+      int64 output_dim1 = output->NumElements() / inputs_flat_dim0;
+      auto output_flat = output->shaped<T, 2>({inputs_flat_dim0, output_dim1});
+      ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
+    }
+  }
+
+#endif
 };
 
+#ifdef INTEL_MKL_ML
+
 // --------------------------------------------------------------------------
 //                      Mkl Concat Op
 // --------------------------------------------------------------------------
@@ -435,8 +526,289 @@ class MklConcatOp : public OpKernel {
         mkl_tensor->flat<uint8>().data(),
         mkl_tensor->flat<uint8>().size() * sizeof(uint8));
   }
+
+  // overloading methods with input shapes as a list of TensorShape's
+  void CallEigenVersion(OpKernelContext* context, const OpInputList& values,
+                        const TensorShapeList& input_shapes) {
+    CHECK_EQ(values.size(), input_shapes.size());
+
+    std::vector<Tensor> converted_values;
+    for (int i = 0; i < input_shapes.size(); i++) {
+      converted_values.push_back(values[i]);
+    }
+
+    // Call Eigen concat.
+    eigen_concat_op_.Compute(context, converted_values);
+
+    // Set dummy Mkl tensor as output Mkl tensor for this op.
+    MklShape mkl_tensor_mkl_shape;
+    mkl_tensor_mkl_shape.SetMklTensor(false);
+    mkl_tensor_mkl_shape.SetDimensions(4);
+    Tensor* mkl_tensor = nullptr;
+    TensorShape mkl_tensor_tf_shape;
+    mkl_tensor_tf_shape.AddDim(
+        SIZE_OF_MKL_SERIAL_DATA(mkl_tensor_mkl_shape.GetDimension()));
+    int tf_output_index = 0;
+    context->allocate_output(
+        GetTensorMetaDataIndex(tf_output_index, context->num_outputs()),
+        mkl_tensor_tf_shape, &mkl_tensor);
+    mkl_tensor_mkl_shape.SerializeMklShape(
+        mkl_tensor->flat<uint8>().data(),
+        mkl_tensor->flat<uint8>().size() * sizeof(uint8));
+  }
 };
 
+#else
+
+// --------------------------------------------------------------------------
+//                      Mkl Concat Op
+// --------------------------------------------------------------------------
+
+template <typename Device, typename T, AxisArgumentName AxisArgName>
+class MklConcatOp : public OpKernel {
+ private:
+  TensorFormat data_format_;
+  EigenConcatBaseOp<Device, T, AxisArgName> eigen_concat_op_;
+
+ public:
+  typedef std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>
+      ConstMatrixVector;
+
+  explicit MklConcatOp(OpKernelConstruction* c)
+      : OpKernel(c), eigen_concat_op_(c) {}
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      OpInputList input_tensors;
+      GetMklInputList(context, "values", &input_tensors);
+      const int N = input_tensors.size();
+
+      // Get Tensor shapes.
+      std::vector<MklDnnShape> input_shapes(N);
+      GetMklShapeList(context, "values", &input_shapes);
+
+      const Tensor& concat_dim_tensor = (AxisArgName == NAME_IS_CONCAT_DIM)
+                                            ? MklGetInput(context, 0)
+                                            : MklGetInput(context, N);
+      // Sanity checks
+      OP_REQUIRES(
+          context, IsLegacyScalar(concat_dim_tensor.shape()),
+          errors::InvalidArgument(
+              "Concat dim tensor should be a scalar integer, but got shape ",
+              concat_dim_tensor.shape().DebugString()));
+      int32 concat_dim =
+          internal::SubtleMustCopy(concat_dim_tensor.scalar<int32>()());
+
+      // check that ranks of all tensors match
+      // and that their shapes match except for concat_dim.
+      int i = 0;
+      bool invoke_eigen = false;
+      bool are_all_mkl_inputs = true, are_all_tf_inputs = true;
+      const TensorShape expected_shape = input_shapes[0].IsMklTensor()
+                                             ? input_shapes[0].GetTfShape()
+                                             : input_tensors[0].shape();
+      size_t expected_dims = expected_shape.dims();
+
+      if (concat_dim < 0) concat_dim = expected_dims + concat_dim;
+
+      for (auto& s : input_shapes) {
+        if (s == expected_shape) {
+          ++i;
+          continue;
+        }
+
+        TensorShape s_shape =
+            s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape();
+        size_t s_dims = s_shape.dims();
+
+        OP_REQUIRES(
+            context, s_dims == expected_dims,
+            errors::InvalidArgument(
+                "_MklConcatOp : Ranks of all input tensors should match:"
+                " input dimensions = ",
+                s_dims, " vs. expected rank = ", expected_dims));
+
+        for (int d = 0; d < expected_dims; ++d) {
+          if (d == concat_dim) continue;
+
+          size_t expected_size = expected_shape.dim_size(d);
+          size_t s_size = s_shape.dim_size(d);
+          OP_REQUIRES(
+              context, expected_size == s_size,
+              errors::InvalidArgument("_MklConcatOp : Dimensions of inputs "
+                                      "should match: shape[0][",
+                                      d, "]= ", expected_size, " vs. shape[", i,
+                                      "][", d, "] = ", s_size));
+        }
+
+        if (s.IsMklTensor())
+          are_all_tf_inputs = false;
+        else
+          are_all_mkl_inputs = false;
+
+        if (s_dims != 4) invoke_eigen = true;
+        ++i;
+      }
+
+      // All inputs are not in one format (TF or MKL). This is mixed input case.
+      // We can potentially optimize this case by converting all TF inputs
+      // to Mkl format. But currently, we fall to Eigen for this case.
+      // It may be possible to convert inputs that in TF format to Mkl
+      // format and avoid calling eigen version.
+      if (!are_all_tf_inputs && !are_all_mkl_inputs) invoke_eigen = true;
+
+      // Call Eigen library
+      if (invoke_eigen) {
+        TensorShapeList tf_input_shapes;
+        i = 0;
+        for (auto& s : input_shapes) {
+          TensorShape s_shape =
+              s.IsMklTensor() ? s.GetTfShape() : input_tensors[i].shape();
+          tf_input_shapes.push_back(s_shape);
+          ++i;
+        }
+        CallEigenVersion(context, input_tensors, tf_input_shapes);
+        return;
+      }
+
+      memory::dims dst_dims;
+      if (are_all_mkl_inputs)
+        dst_dims = TFShapeToMklDnnDims(input_shapes[0].GetTfShape());
+      else
+        // When all the inputs are in Tensorflow format, we don't know
+        // what is the input data format. In that case, we just use
+        // output format that is same as input formats.
+        dst_dims = TFShapeToMklDnnDims(input_tensors[0].shape());
+
+      std::vector<memory::primitive_desc> srcs_pd;
+      std::vector<MklDnnData<T>> srcs(N, MklDnnData<T>(&cpu_engine));
+      int64 dst_concat_dim_size = 0;
+      for (int k = 0; k < N; k++) {
+        bool is_mkl_tensor = input_shapes[k].IsMklTensor();
+        memory::dims src_dims;
+
+        // Same comment as dst_dims for src_dims.
+        src_dims = (is_mkl_tensor)
+                       ? TFShapeToMklDnnDims(input_shapes[k].GetTfShape())
+                       : TFShapeToMklDnnDims(input_tensors[k].shape());
+
+        dst_concat_dim_size += src_dims[concat_dim];
+        auto src_md =
+            is_mkl_tensor ? input_shapes[k].GetMklLayout() :
+                          // It does not matter what data format we use here
+                          // (NHWC or NCHW). We just need to ensure that output
+                          // of Concat uses same data format as input.
+                memory::desc(src_dims, MklDnnType<T>(), memory::format::nchw);
+
+        srcs[k].SetUsrMem(src_md, &input_tensors[k]);
+        auto src_mpd = srcs[k].GetUsrMemPrimDesc();
+        srcs_pd.push_back(src_mpd);
+      }
+      dst_dims[concat_dim] = dst_concat_dim_size;
+
+      MklDnnData<T> dst(&cpu_engine);
+      memory::desc dst_md({}, memory::data_undef, memory::format_undef);
+      memory::dims dst_dims_in_nchw;
+      if (are_all_mkl_inputs) {
+        // Since we are passing a specific format for destination,
+        // we need to have dst_dims in MklDnn order (NCHW).
+        auto orig_tf_format = input_shapes[0].GetTfDataFormat();
+        dst_dims_in_nchw = MklDnnDimsInNCHW(
+            dst_dims, MklDnnDataFormatToTFDataFormat(orig_tf_format));
+        // We will set the output in the same format as input to avoid layout
+        // conversions.
+        // Currently we are setting dst format same as input format.
+        // See if we can make this choice in a better way.
+        dst_md = memory::desc(
+            dst_dims_in_nchw, MklDnnType<T>(),
+            (memory::format)input_shapes[0].GetMklLayout().data.format);
+      } else {
+        // Again, format does not matter here. We just need to make it same as
+        // input format.
+        dst_md = memory::desc(dst_dims, MklDnnType<T>(), memory::format::nchw);
+      }
+
+      std::vector<primitive::at> inputs;
+      for (int k = 0; k < input_tensors.size(); k++)
+        inputs.push_back(srcs[k].GetOpMem());
+
+      // If all inputs are in MKL format, then meaning of concat_dim needs to
+      // change. Value of concat_dim is tied to input Tensorflow data format
+      // (NHWC or NCHW). MklDnn dimensions are in NCHW order. So if Tensorflow
+      // tensors are in NCHW order, then concat_dim semantics is preserved.
+      // But ifinput tensors are in NHWC order, then semantics need to change.
+      // E.g., if we are concatinating over Channel (dimension 3 for NHWC),
+      // then since MklDnn order is NCHW, concat_dim needs to be 1.
+      if (are_all_mkl_inputs) concat_dim = input_shapes[0].TfDimIdx(concat_dim);
+
+      auto concat_pd = concat::primitive_desc(dst_md, concat_dim, srcs_pd);
+
+      MklDnnShape dnn_shape_dst;
+      TensorShape tf_shape_dst;
+      Tensor* dst_tensor = nullptr;
+      if (are_all_mkl_inputs) {
+        dnn_shape_dst.SetMklTensor(true);
+        auto dst_pd = concat_pd.dst_primitive_desc();
+        dnn_shape_dst.SetMklLayout(&dst_pd);
+        dnn_shape_dst.SetElemType(MklDnnType<T>());
+        dnn_shape_dst.SetTfLayout(dst_dims.size(), dst_dims_in_nchw,
+                                  input_shapes[0].GetTfDataFormat());
+        tf_shape_dst.AddDim((dst_pd.get_size() / sizeof(T)));
+      } else {
+        dnn_shape_dst.SetMklTensor(false);
+        tf_shape_dst = MklDnnDimsToTFShape(dst_dims);
+      }
+      AllocateOutputSetMklShape(context, 0, &dst_tensor, tf_shape_dst,
+                                dnn_shape_dst);
+      CHECK_NOTNULL(dst_tensor);
+
+      dst_md =
+          dnn_shape_dst.IsMklTensor() ? dnn_shape_dst.GetMklLayout() : dst_md;
+      dst.SetUsrMem(dst_md, dst_tensor);
+
+      auto concat_op = concat(concat_pd, inputs, dst.GetOpMem());
+      std::vector<primitive> net;
+      net.push_back(concat_op);
+      stream(stream::kind::eager).submit(net).wait();
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
+    }
+  }
+
+  void CallEigenVersion(OpKernelContext* context, const OpInputList& values,
+                        const TensorShapeList& input_shapes) {
+    CHECK_EQ(values.size(), input_shapes.size());
+
+    std::vector<Tensor> converted_values;
+    for (int i = 0; i < input_shapes.size(); i++)
+      converted_values.push_back(values[i]);
+
+    // Call Eigen concat.
+    eigen_concat_op_.Compute(context, converted_values, input_shapes);
+
+    // Set output Mkl tensor for this op.
+    MklDnnShape dnn_shape_output;
+    dnn_shape_output.SetMklTensor(false);
+    dnn_shape_output.SetDimensions(4);
+    Tensor* output_tensor = nullptr;
+    TensorShape tf_shape_output;
+    tf_shape_output.AddDim(dnn_shape_output.GetSerializeBufferSize());
+    context->allocate_output(GetTensorMetaDataIndex(0, context->num_outputs()),
+                             tf_shape_output, &output_tensor);
+    dnn_shape_output.SerializeMklDnnShape(
+        output_tensor->flat<uint8>().data(),
+        output_tensor->flat<uint8>().size() * sizeof(uint8));
+  }
+};
+
+#endif
+
 /* Use optimized concat for float type only */
 #define REGISTER_MKL_CPU(type)                                              \
   REGISTER_KERNEL_BUILDER(Name("_MklConcat")                                \
diff --git a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
index 0f1a218fe62dd91160320254342828811e3aa458..25c2573741265d4d33c9c91474792be241dd3b32 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
@@ -38,9 +38,9 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
-#include "tensorflow/core/util/mkl_util.h"
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
+#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index f291281108d36465ef670cb990714dbb8a0a5715..1401bc65a45bd80ed78230840cf0b9958b1f012e 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -38,27 +38,24 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
-#include "tensorflow/core/util/mkl_util.h"
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
+#include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
-using mkldnn::prop_kind;
-
-using mkldnn::convolution_forward;
 using mkldnn::convolution_backward_weights;
-using mkldnn::convolution_direct;
-
+using mkldnn::memory;
+using mkldnn::prop_kind;
+using mkldnn::stream;
 #endif
 
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, class T>
 class MklConv2DCustomBackpropFilterOp : public OpKernel {
@@ -363,8 +360,8 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
           (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
 
       const Tensor& out_backprop = MklGetInput(context, 2);
-      void* mkl_buf_out_backprop = const_cast<void*>(static_cast<const void*>(
-                                      out_backprop.flat<T>().data()));
+      void* mkl_buf_out_backprop = const_cast<void*>(
+          static_cast<const void*>(out_backprop.flat<T>().data()));
 
       CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_out_backprop,
                                                 prim_conv_bwdfilter,
@@ -374,10 +371,11 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
           !dnnLayoutCompare_F32(mkl_lt_internal_out_backprop, lt_out_backprop);
       if (mkl_convert_out_backprop) {
         CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_out_backprop,
-                      lt_out_backprop, mkl_lt_internal_out_backprop),
+                                         lt_out_backprop,
+                                         mkl_lt_internal_out_backprop),
                  E_SUCCESS);
         AllocTmpBuffer(context, mkl_tmp_out_backprop_buf_tensor,
-            lt_out_backprop, &mkl_buf_convert_out_backprop);
+                       lt_out_backprop, &mkl_buf_convert_out_backprop);
         CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_out_backprop,
                                           mkl_buf_out_backprop,
                                           mkl_buf_convert_out_backprop),
@@ -426,183 +424,239 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
   TensorFormat data_format_;
 };
 
+#define REGISTER_MKL_FILTER_KERNELS(T)                              \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter")          \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklConv2DCustomBackpropFilterOp<CPUDevice, T>);
+TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
+#undef REGISTER_MKL_FILTER_KERNELS
+
 #else
 
-template <typename Device, class T>
-class MklConv2DCustomBackpropFilterOp : public OpKernel {
+template <typename Device, class T, bool biasEnabled>
+class MklConv2DCustomBackpropFilterOp
+    : public MklConv2DBackpropCommonOp<Device, T> {
  public:
   explicit MklConv2DCustomBackpropFilterOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    string data_format;
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
-    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
-                errors::InvalidArgument("Invalid data format"));
+      : MklConv2DBackpropCommonOp<Device, T>(context) {}
+  ~MklConv2DCustomBackpropFilterOp() {}
 
-    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
-    int stride_n = GetTensorDim(strides_, data_format_, 'N');
-    int stride_c = GetTensorDim(strides_, data_format_, 'C');
-    OP_REQUIRES(
-        context, (stride_n == 1 && stride_c == 1),
-        errors::InvalidArgument("Current implementation does not yet support "
-                                "strides in the batch and depth dimensions."));
-    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ private:
+  void ValidateMklShapes(const MklDnnShape& input_mkl_shape,
+                         const MklDnnShape& filter_mkl_shape,
+                         const MklDnnShape& obp_mkl_shape) {
+    CHECK(!filter_mkl_shape.IsMklTensor())
+        << "Conv2DBackpropFilter: filter should not be in MKL Layout";
   }
 
-  void Compute(OpKernelContext* context) override {
-    try {
-      auto cpu_engine = engine(engine::cpu, 0);
+  size_t GetInputTensorIndexWithSizes() { return 1; /* filter index */ }
 
-      MklDnnData<T> input(&cpu_engine);
-      MklDnnData<T> outbackprop(&cpu_engine);
-      MklDnnData<T> output(&cpu_engine);
+  TensorShape MakeInputTfShape(OpKernelContext* context,
+                               const Tensor& input_tensor) {
+    size_t input_idx = 0;
+    return GetTfShape(context, input_idx);
+  }
 
-      // Input tensors
-      const Tensor& input_tensor = MklGetInput(context, 0);
-      const Tensor& filter_tensor = MklGetInput(context, 1);
-      const Tensor& obp_tensor = MklGetInput(context, 2);  // Outbackprop
+  TensorShape MakeFilterTfShape(OpKernelContext* context,
+                                const Tensor& filter_tensor) {
+    TensorShape filter_tf_shape;
+    CHECK_EQ(TensorShapeUtils::IsVector(filter_tensor.shape()), true);
+    CHECK_EQ(TensorShapeUtils::MakeShape(filter_tensor.vec<int32>(),
+                                         &filter_tf_shape)
+                 .ok(),
+             true);
+    return filter_tf_shape;
+  }
 
-      // Generate input shapes.
-      TensorShape filter_shape;
-      OP_REQUIRES(context, TensorShapeUtils::IsVector(filter_tensor.shape()),
-        errors::InvalidArgument(
-              "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
-              filter_tensor.dims()));
-      OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
-                        filter_tensor.vec<int32>(), &filter_shape));
-      TensorShape input_shape = input_tensor.shape();
-      TensorShape obp_shape = obp_tensor.shape();
-
-      // By default, all dims are in MKL order. Only dims in TF order
-      // are those with prefix tf_order.
-      memory::dims obp_dims, fwd_input_dims, fwd_filter_dims;
-      memory::dims padding_l, padding_r, strides, fwd_output_dims;
-      memory::dims fwd_output_dims_tf_order;
-
-      // Get forward convolution parameters.
-      MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
-      conv_utl.GetConvFwdSizesInMklOrder(input_shape, filter_shape,
-                                         &fwd_input_dims, &fwd_filter_dims,
-                                         &strides,
-                                         &fwd_output_dims_tf_order,
-                                         &fwd_output_dims,
-                                         &padding_l, &padding_r);
-      if (!context->status().ok()) return;
-
-      // Create Convolution forward descriptor since Convolution backward
-      // API needs it. For that, we first need to create input, filter
-      // and output memory descriptors.
-      auto mkl_data_format = TFDataFormatToMklDnnDataFormat(data_format_);
-      auto fwd_src_md = memory::desc(fwd_input_dims, MklDnnType<T>(),
-                                     mkl_data_format);
-      auto fwd_filter_md = memory::desc(fwd_filter_dims, MklDnnType<T>(),
-                                        memory::format::hwio);
-      auto fwd_out_md = memory::desc(fwd_output_dims, MklDnnType<T>(),
-                                     mkl_data_format);
-      auto fwd_desc = convolution_forward::desc(prop_kind::forward,
-            convolution_direct, fwd_src_md, fwd_filter_md, fwd_out_md,
-            strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
-      auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine);
-
-      // Allocate output tensor and shape
-      // TODO(nhasabni): Update this when support for MKL layout is added.
-      // Shape of output of Conv2DBackpropInput is same as 'input' of Conv2D.
-      TensorShape tf_output_shape(filter_shape);
-      MklShape mkl_output_mkl_shape;
-      mkl_output_mkl_shape.SetMklTensor(false);
-      Tensor* output_tensor = nullptr;
-      AllocateOutputSetMklShape(context, 0, &output_tensor, tf_output_shape,
-                                mkl_output_mkl_shape);
-
-      // Create memory for user data.
-      // Describe how the inputs and outputs of Convolution look like. Also
-      // specify buffers containing actual input and output data.
-      // Although input shape required is in MKL-DNN order, the layout is
-      // Tensorflow's layout (NHWC or NCHW depending on data format).
-      input.SetUsrMem(fwd_input_dims, mkl_data_format, &input_tensor);
-      // Outbackprop shape is NHWC or NCHW depending on data format. Since
-      // GetInputSizeInMklOrder function returns size in that order we just use
-      // use that function directly.
-      conv_utl.GetInputSizeInMklOrder(obp_shape, &obp_dims);
-      if (!context->status().ok()) return;
-      outbackprop.SetUsrMem(obp_dims, mkl_data_format, &obp_tensor);
-      // Although output shape required is in MKL-DNN order,
-      // layout is Tensorflow's filter layout (HWIO)
-      // Shape of output of Conv2DBackpropInput is same as shape of filter.
-      memory::dims bwd_output_dims = fwd_filter_dims;
-      output.SetUsrMem(bwd_output_dims, memory::format::hwio, output_tensor);
-
-      // Create memory descriptors for convolution data w/ no specified format.
-      input.SetOpMemDesc(fwd_input_dims, memory::format::any);
-      outbackprop.SetOpMemDesc(obp_dims, memory::format::any);
-      output.SetOpMemDesc(bwd_output_dims, memory::format::any);
-
-      // Create convolution backward weights primitive.
-      auto bwd_desc = convolution_backward_weights::desc(convolution_direct,
-                          input.GetOpMemDesc(), output.GetOpMemDesc(),
-                          outbackprop.GetOpMemDesc(), strides, padding_l,
-                          padding_r, TFPaddingToMklDnnPadding(padding_));
-
-      auto bwd_pd = convolution_backward_weights::primitive_desc(bwd_desc,
-                                                              cpu_engine,
-                                                              fwd_pd);
-
-      PrepareAndExecutePrimitive(bwd_pd, &input, &outbackprop, &output);
-    } catch (mkldnn::error &e) {
-     string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + string(e.message) +
-                       ", in file " + string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
-     OP_REQUIRES_OK(context, errors::Aborted("Operation received an exception:",
-                                            error_msg));
+  TensorShape GetOutputTfShape(const TensorShape& input_shape,
+                               const TensorShape& filter_shape,
+                               const TensorShape& outbprop_shape) {
+    // Shape of output of Conv2DBackpropFilter is same as shape of filter.
+    return filter_shape;
+  }
+
+  const memory::dims& GetOutputDims(const memory::dims& fwd_input_dims,
+                                    const memory::dims& fwd_filter_dims) {
+    // Shape of output of Conv2DBackpropFilter is same as shape of filter.
+    return fwd_filter_dims;
+  }
+
+  memory::format GetOutputFormat(const memory::format data_format) {
+    // Output layout is Tensorflow's filter layout (HWIO).
+    return memory::format::hwio;
+  }
+
+  void CreatePrimitive(OpKernelContext* context, const engine& cpu_engine,
+                       const convolution_forward::primitive_desc& conv_fwd_pd,
+                       MklDnnData<T>* input, MklDnnData<T>* filter,
+                       MklDnnData<T>* outbackprop, MklDnnData<T>* output,
+                       Tensor** output_tensor, const memory::dims& strides,
+                       const memory::dims& padding_l,
+                       const memory::dims& padding_r, padding_kind padding,
+                       const memory::dims& bwd_output_dims,
+                       memory::format bwd_output_format) {
+    CHECK_NOTNULL(context);
+    CHECK_NOTNULL(input);
+    CHECK_NOTNULL(filter);
+    CHECK_NOTNULL(outbackprop);
+    CHECK_NOTNULL(output);
+    CHECK_NOTNULL(output_tensor);
+
+    MklDnnData<T>* bias_grad = nullptr;
+    int depth = 0;
+    if (biasEnabled) {
+      // Data structure for bias_grad
+      bias_grad = new MklDnnData<T>(&cpu_engine);
+      TensorShape obp_tf_shape = GetTfShape(context, 2);
+      depth = (MklConv2DBackpropCommonOp<Device, T>::GetTFDataFormat() ==
+               FORMAT_NCHW)
+                  ? obp_tf_shape.dim_size(1)
+                  : obp_tf_shape.dim_size(3);
+      memory::dims bias_grad_dims = {depth};
+      bias_grad->SetOpMemDesc(bias_grad_dims, memory::format::x);
+    }
+
+    // Create convolution backward weights primitive.
+    auto bwd_desc =
+        (biasEnabled && (bias_grad != nullptr))
+            ? convolution_backward_weights::desc(
+                  convolution_direct, input->GetOpMemDesc(),
+                  output->GetOpMemDesc(), bias_grad->GetOpMemDesc(),
+                  outbackprop->GetOpMemDesc(), strides, padding_l, padding_r,
+                  padding)
+            : convolution_backward_weights::desc(
+                  convolution_direct, input->GetOpMemDesc(),
+                  output->GetOpMemDesc(), outbackprop->GetOpMemDesc(), strides,
+                  padding_l, padding_r, padding);
+
+    auto bwd_pd = convolution_backward_weights::primitive_desc(
+        bwd_desc, cpu_engine, conv_fwd_pd);
+
+    // Allocate output tensor.
+    AllocateOutputTensor(context, bwd_pd, bwd_output_dims, bwd_output_format,
+                         output_tensor);
+
+    CHECK_NOTNULL(*output_tensor);
+    // Set buffer handle using allocated output tensor.
+    output->SetUsrMemDataHandle(*output_tensor);
+
+    if (biasEnabled && (bias_grad != nullptr)) {
+      // Allocate bias_grad tensor
+      TensorShape bias_grad_shape({depth});
+      Tensor* bias_grad_tensor = nullptr;
+      AllocateBiasGradTensor(context, bias_grad_shape, &bias_grad_tensor);
+      memory::dims bias_grad_dims = {depth};
+      // Since Bias is 1D, we use format::x from MKLDNN to represent it.
+      auto bias_grad_md =
+          memory::desc({bias_grad_dims}, MklDnnType<T>(), memory::format::x);
+      bias_grad->SetUsrMem(bias_grad_md, bias_grad_tensor);
+      bias_grad->SetUsrMemDataHandle(bias_grad_tensor);
+    }
+
+    if (biasEnabled && (bias_grad != nullptr)) {
+      PrepareAndExecutePrimitive(bwd_pd, input, outbackprop, output, bias_grad);
+    } else {
+      PrepareAndExecutePrimitive(bwd_pd, input, outbackprop, output);
     }
   }
 
- private:
-  std::vector<int32> strides_;
-  Padding padding_;
-  TensorFormat data_format_;
+  // Allocate output tensor.
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const convolution_backward_weights::primitive_desc& conv_pd,
+      const memory::dims& output_dims_mkl_order,
+      memory::format output_tf_format, Tensor** output_tensor) {
+    CHECK_NOTNULL(output_tensor);
+
+    // For BackpropFilter, we convert the output tensor back in Tensorflow
+    // layout. Because typically, BackpropFilter is the last operator in the
+    // graph that emit filter gradient that is provided to ApplyGradient
+    // method to update the filter. But it may be possible to eliminate this
+    // by forwarding filter in MKL layout if we support ApplyGradient method
+    // for MKL layout propagation.
+    MklDnnShape output_mkl_shape;
+    output_mkl_shape.SetMklTensor(false);
+    // output_dims_mkl_order is in OIHW format.
+    // Allocate shape of TF tensor in HWIO format.
+    TensorShape output_tf_shape({output_dims_mkl_order[MklDnnDims::Dim_H],
+                                 output_dims_mkl_order[MklDnnDims::Dim_W],
+                                 output_dims_mkl_order[MklDnnDims::Dim_I],
+                                 output_dims_mkl_order[MklDnnDims::Dim_O]});
+    AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape,
+                              output_mkl_shape);
+  }
+
+  // Allocate tensor for bias grad
+  void AllocateBiasGradTensor(OpKernelContext* context,
+                              const TensorShape& bias_grad_shape,
+                              Tensor** bias_grad_tensor) {
+    CHECK_NOTNULL(bias_grad_tensor);
+
+    MklDnnShape bias_grad_mkl_shape;
+    bias_grad_mkl_shape.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, 1, bias_grad_tensor, bias_grad_shape,
+                              bias_grad_mkl_shape);
+  }
 
   // Prepare and execute net - checks for input and output reorders.
   void PrepareAndExecutePrimitive(
-                  const convolution_backward_weights::primitive_desc& conv_pd,
-                  MklDnnData<T>* input, MklDnnData<T>* obp,
-                  MklDnnData<T>* output) {
+      const convolution_backward_weights::primitive_desc& conv_pd,
+      MklDnnData<T>* input, MklDnnData<T>* obp, MklDnnData<T>* output,
+      MklDnnData<T>* bias_grad = nullptr) {
     // Create reorders between user layout and MKL layout if it is needed and
     // add it to the net before convolution.
     std::vector<primitive> net;
     input->CheckReorderToOpMem(conv_pd.src_primitive_desc(), &net);
     obp->CheckReorderToOpMem(conv_pd.diff_dst_primitive_desc(), &net);
 
-    // Memory for output of convolution. Since we may need reorder on the
-    // output side, we will prepare reorder primitive in case output
-    // reorder to user memory is required.
+    // For BackpropFilter, we convert the output tensor back in Tensorflow
+    // layout.
     bool output_reorder_required = output->PrepareReorderToUserMemIfReq(
-                                      conv_pd.diff_weights_primitive_desc());
+        conv_pd.diff_weights_primitive_desc());
 
-    net.push_back(convolution_backward_weights(conv_pd, input->GetOpMem(),
-                                    obp->GetOpMem(), output->GetOpMem()));
+    if (biasEnabled && (bias_grad != nullptr)) {
+      net.push_back(convolution_backward_weights(
+          conv_pd, input->GetOpMem(), obp->GetOpMem(), output->GetOpMem(),
+          bias_grad->GetOpMem()));
+    } else {
+      net.push_back(convolution_backward_weights(
+          conv_pd, input->GetOpMem(), obp->GetOpMem(), output->GetOpMem()));
+    }
 
-    // Insert reorder primitive in the net for output reorder if reorder is
-    // required.
     if (output_reorder_required) {
       output->InsertReorderToUserMem(&net);
     }
 
-    // Handle output reorder
     stream(stream::kind::eager).submit(net).wait();
   }
 };
-#endif
 
-#define REGISTER_MKL_FILTER_KERNELS(T)                              \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter")          \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-                          MklConv2DCustomBackpropFilterOp<CPUDevice, T>);
+#define REGISTER_MKL_FILTER_KERNELS(T)                                   \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("_MklConv2DBackpropFilter")                                   \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<T>("T")                                        \
+          .Label(mkl_op_registry::kMklOpLabel),                          \
+      MklConv2DCustomBackpropFilterOp<CPUDevice, T, false>);             \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("_MklConv2DBackpropFilterWithBias")                           \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<T>("T")                                        \
+          .Label(mkl_op_registry::kMklOpLabel),                          \
+      MklConv2DCustomBackpropFilterOp<CPUDevice, T, true>);              \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias") \
+                              .Device(DEVICE_CPU)                        \
+                              .TypeConstraint<T>("T")                    \
+                              .Label(mkl_op_registry::kMklOpLabel),      \
+                          MklDummyOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
 #undef REGISTER_MKL_FILTER_KERNELS
+
+#endif  // INTEL_MKL_ML
+
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index 4a47d0463ef778430d59fed32202bff02233a9e9..eeed0095310280997ebb2ec3e848451df378c4fa 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -23,6 +23,8 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 #include <algorithm>
 #include <vector>
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -41,25 +43,20 @@ limitations under the License.
 #include "tensorflow/core/util/tensor_format.h"
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
-using mkldnn::prop_kind;
-
-using mkldnn::convolution_forward;
-using mkldnn::convolution_direct;
 using mkldnn::convolution_backward_data;
+using mkldnn::prop_kind;
+using mkldnn::stream;
 #endif
 
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifndef INTEL_MKL_DNN
+#ifdef INTEL_MKL_ML
 
 template <typename Device, class T>
 class MklConv2DCustomBackpropInputOp : public OpKernel {
@@ -362,176 +359,141 @@ class MklConv2DCustomBackpropInputOp : public OpKernel {
 #else
 
 template <typename Device, class T>
-class MklConv2DCustomBackpropInputOp : public OpKernel {
+class MklConv2DCustomBackpropInputOp
+    : public MklConv2DBackpropCommonOp<Device, T> {
  public:
-  ~MklConv2DCustomBackpropInputOp() {}
   explicit MklConv2DCustomBackpropInputOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    string data_format_str;
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
-    OP_REQUIRES(context, FormatFromString(data_format_str, &data_format_),
-                errors::InvalidArgument("Invalid data format"));
-    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
-    int stride_n = GetTensorDim(strides_, data_format_, 'N');
-    int stride_c = GetTensorDim(strides_, data_format_, 'C');
-    OP_REQUIRES(
-        context, (stride_n == 1 && stride_c == 1),
-        errors::InvalidArgument("Current implementation does not yet support "
-                                "strides in the batch and depth dimensions."));
+      : MklConv2DBackpropCommonOp<Device, T>(context) {}
+  ~MklConv2DCustomBackpropInputOp() {}
 
-    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ private:
+  const int kInputIndex_Filter = 1, kInputIndex_InputSizes = 0,
+            kInputIndex_OutBackProp = 2;
+  void ValidateMklShapes(const MklDnnShape& input_mkl_shape,
+                         const MklDnnShape& filter_mkl_shape,
+                         const MklDnnShape& obp_mkl_shape) {
+    // Tensor that feeds to 'Input' slot of BackpropInput is always just a shape
+    // of the Tensor and never an actual tensor. So it will never be in MKL
+    // layout.
+    CHECK(!input_mkl_shape.IsMklTensor())
+        << "Conv2DBackpropInput: input should not be in MKL Layout";
   }
 
-  void Compute(OpKernelContext* context) override {
-    try {
-      auto cpu_engine = engine(engine::cpu, 0);
+  size_t GetInputTensorIndexWithSizes() { return kInputIndex_InputSizes; }
 
-      MklDnnData<T> filter(&cpu_engine);
-      MklDnnData<T> outbackprop(&cpu_engine);
-      MklDnnData<T> output(&cpu_engine);
+  TensorShape MakeInputTfShape(OpKernelContext* context,
+                               const Tensor& input_tensor) {
+    TensorShape input_tf_shape;
+    CHECK_EQ(TensorShapeUtils::IsVector(input_tensor.shape()), true);
+    CHECK_EQ(
+        TensorShapeUtils::MakeShape(input_tensor.vec<int32>(), &input_tf_shape)
+            .ok(),
+        true);
+    return input_tf_shape;
+  }
 
-      // Input tensors
-      const Tensor& input_tensor = MklGetInput(context, 0);
-      const Tensor& filter_tensor = MklGetInput(context, 1);
-      const Tensor& obp_tensor = MklGetInput(context, 2);  // Outbackprop
+  TensorShape MakeFilterTfShape(OpKernelContext* context,
+                                const Tensor& filter_tensor) {
+    return GetTfShape(context, kInputIndex_Filter);
+  }
 
-      // Generate input shape.
-      TensorShape input_shape;
-      OP_REQUIRES(context, TensorShapeUtils::IsVector(input_tensor.shape()),
-        errors::InvalidArgument(
-              "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
-              input_tensor.dims()));
-      OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
-                        input_tensor.vec<int32>(), &input_shape));
-      TensorShape filter_shape = filter_tensor.shape();
-      TensorShape obp_shape = obp_tensor.shape();
-
-      // By default, all dims are in MKL order. Only dims in TF order
-      // are those with prefix tf_order.
-      memory::dims obp_dims, fwd_input_dims, fwd_filter_dims;
-      memory::dims padding_l, padding_r, strides, fwd_output_dims;
-      memory::dims fwd_output_dims_tf_order;
-
-      // Get forward convolution parameters.
-      MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
-      conv_utl.GetConvFwdSizesInMklOrder(input_shape, filter_shape,
-                                         &fwd_input_dims, &fwd_filter_dims,
-                                         &strides,
-                                         &fwd_output_dims_tf_order,
-                                         &fwd_output_dims,
-                                         &padding_l, &padding_r);
-      if (!context->status().ok()) return;
-
-      // Create Convolution forward descriptor since Convolution backward
-      // API needs it. For that, we first need to create input, filter
-      // and output memory descriptors.
-      auto mkl_data_format = TFDataFormatToMklDnnDataFormat(data_format_);
-      auto fwd_src_md = memory::desc(fwd_input_dims, MklDnnType<T>(),
-                                     mkl_data_format);
-      auto fwd_filter_md = memory::desc(fwd_filter_dims, MklDnnType<T>(),
-                                        memory::format::hwio);
-      auto fwd_out_md = memory::desc(fwd_output_dims, MklDnnType<T>(),
-                                     mkl_data_format);
-      auto fwd_desc = convolution_forward::desc(prop_kind::forward,
-            convolution_direct, fwd_src_md, fwd_filter_md, fwd_out_md,
-            strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
-      auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine);
-
-      // Allocate output tensor and shape
-      // TODO(nhasabni): Update this when support for MKL layout is added.
-      // Shape of output of Conv2DBackpropInput is same as 'input' of Conv2D.
-      TensorShape tf_output_shape(input_shape);
-      MklShape mkl_output_mkl_shape;
-      mkl_output_mkl_shape.SetMklTensor(false);
-      Tensor* output_tensor = nullptr;
-      AllocateOutputSetMklShape(context, 0, &output_tensor, tf_output_shape,
-                                mkl_output_mkl_shape);
-
-      // Create memory for user data.
-      // Describe how the inputs and outputs of Convolution look like. Also
-      // specify buffers containing actual input and output data.
-      // Although input shape required is in MKL-DNN order, the layout is
-      // Tensorflow's layout (NHWC or NCHW depending on data format).
-      // Although filter shape (filter_dims) required is in MKL-DNN order,
-      // the layout is Tensorflow's layout (HWIO).
-      // Shape of Conv2DBackpropInput's filter is same as that of Conv2D filter.
-      filter.SetUsrMem(fwd_filter_dims, memory::format::hwio, &filter_tensor);
-      // Outbackprop shape is NHWC or NCHW depending on data format. Since
-      // GetInputSizeInMklOrder function returns size in that order we just use
-      // use that function directly.
-      conv_utl.GetInputSizeInMklOrder(obp_shape, &obp_dims);
-      if (!context->status().ok()) return;
-      outbackprop.SetUsrMem(obp_dims, mkl_data_format, &obp_tensor);
-      // Although output shape required is in MKL-DNN order,
-      // layout is Tensorflow's layout (NHWC or NCHW depending on data format).
-      // Shape of output of Conv2DBackpropInput is same as shape of 'input'
-      // of Conv2D.
-      memory::dims bwd_output_dims = fwd_input_dims;
-      output.SetUsrMem(bwd_output_dims, mkl_data_format, output_tensor);
-
-      // Create memory descriptors for convolution data w/ no specified format.
-      filter.SetOpMemDesc(fwd_filter_dims, memory::format::any);
-      outbackprop.SetOpMemDesc(obp_dims, memory::format::any);
-      output.SetOpMemDesc(bwd_output_dims, memory::format::any);
-
-      // Create convolution backward data primitive.
-      auto bwd_desc = convolution_backward_data::desc(convolution_direct,
-                          output.GetOpMemDesc(), filter.GetOpMemDesc(),
-                          outbackprop.GetOpMemDesc(), strides, padding_l,
-                          padding_r, TFPaddingToMklDnnPadding(padding_));
-
-      auto bwd_pd = convolution_backward_data::primitive_desc(bwd_desc,
-                                                              cpu_engine,
-                                                              fwd_pd);
-
-      PrepareAndExecutePrimitive(bwd_pd, &filter, &outbackprop, &output);
-    } catch (mkldnn::error &e) {
-     string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + string(e.message) +
-                       ", in file " + string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
-     OP_REQUIRES_OK(context, errors::Aborted("Operation received an exception:",
-                                            error_msg));
-    }
+  TensorShape GetOutputTfShape(const TensorShape& input_shape,
+                               const TensorShape& filter_shape,
+                               const TensorShape& outbprop_shape) {
+    // Output Shape of Conv2DBackpropInput is same as shape of Conv2D 'input'.
+    return input_shape;
   }
 
- private:
-  std::vector<int32> strides_;
-  Padding padding_;
-  TensorFormat data_format_;
+  const memory::dims& GetOutputDims(const memory::dims& fwd_input_dims,
+                                    const memory::dims& fwd_filter_dims) {
+    // Output Shape of Conv2DBackpropInput is same as shape of Conv2D 'input'.
+    return fwd_input_dims;
+  }
+
+  memory::format GetOutputFormat(const memory::format data_format) {
+    // Output layout is Tensorflow's layout in data format order.
+    return data_format;
+  }
+
+  void CreatePrimitive(OpKernelContext* context, const engine& cpu_engine,
+                       const convolution_forward::primitive_desc& conv_fwd_pd,
+                       MklDnnData<T>* input, MklDnnData<T>* filter,
+                       MklDnnData<T>* outbackprop, MklDnnData<T>* output,
+                       Tensor** output_tensor, const memory::dims& strides,
+                       const memory::dims& padding_l,
+                       const memory::dims& padding_r, padding_kind padding,
+                       const memory::dims& bwd_output_dims,
+                       memory::format bwd_output_format) {
+    CHECK_NOTNULL(context);
+    CHECK_NOTNULL(input);
+    CHECK_NOTNULL(filter);
+    CHECK_NOTNULL(outbackprop);
+    CHECK_NOTNULL(output);
+    CHECK_NOTNULL(output_tensor);
+
+    // Create convolution backward data primitive.
+    auto bwd_desc = convolution_backward_data::desc(
+        convolution_direct, output->GetOpMemDesc(), filter->GetOpMemDesc(),
+        outbackprop->GetOpMemDesc(), strides, padding_l, padding_r, padding);
+
+    auto bwd_pd = convolution_backward_data::primitive_desc(
+        bwd_desc, cpu_engine, conv_fwd_pd);
+
+    // Allocate output tensor in TensorFlow and MKL layout.
+    AllocateOutputTensor(context, bwd_pd, bwd_output_dims, bwd_output_format,
+                         output_tensor);
+    CHECK_NOTNULL(*output_tensor);
+    // Set buffer handle using allocated output tensor.
+    output->SetUsrMemDataHandle(*output_tensor);
+
+    PrepareAndExecutePrimitive(bwd_pd, filter, outbackprop, output);
+  }
+
+  // Allocate output tensor.
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const convolution_backward_data::primitive_desc& conv_pd,
+      const memory::dims& output_dims_mkl_order,
+      memory::format output_tf_format, Tensor** output_tensor) {
+    CHECK_NOTNULL(output_tensor);
+
+    // Output primitive descriptor for backward data is diff_src.
+    auto dst_pd = conv_pd.diff_src_primitive_desc();
+
+    // Allocate shape of Mkl tensor.
+    MklDnnShape output_mkl_shape;
+    output_mkl_shape.SetMklTensor(true);
+    output_mkl_shape.SetMklLayout(&dst_pd);
+    output_mkl_shape.SetElemType(MklDnnType<T>());
+    output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+                                 output_dims_mkl_order, output_tf_format);
+
+    // Allocate shape of TF tensor.
+    TensorShape output_tf_shape;
+    output_tf_shape.AddDim(dst_pd.get_size() / sizeof(T));
+
+    AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape,
+                              output_mkl_shape);
+  }
 
   // Prepare and execute net - checks for input and output reorders.
   void PrepareAndExecutePrimitive(
-                  const convolution_backward_data::primitive_desc& conv_pd,
-                  MklDnnData<T>* filter, MklDnnData<T>* obp,
-                  MklDnnData<T>* output) {
+      const convolution_backward_data::primitive_desc& conv_pd,
+      MklDnnData<T>* filter, MklDnnData<T>* obp, MklDnnData<T>* output) {
     // Create reorders between user layout and MKL layout if it is needed and
     // add it to the net before convolution.
     std::vector<primitive> net;
     filter->CheckReorderToOpMem(conv_pd.weights_primitive_desc(), &net);
     obp->CheckReorderToOpMem(conv_pd.diff_dst_primitive_desc(), &net);
 
-    // Memory for output of convolution. Since we may need reorder on the
-    // output side, we will prepare reorder primitive in case output
-    // reorder to user memory is required.
-    bool output_reorder_required = output->PrepareReorderToUserMemIfReq(
-                                      conv_pd.diff_src_primitive_desc());
-
-    net.push_back(convolution_backward_data(conv_pd, obp->GetOpMem(),
-                                    filter->GetOpMem(), output->GetOpMem()));
-
-    // Insert reorder primitive in the net for output reorder if reorder is
-    // required.
-    if (output_reorder_required) {
-      output->InsertReorderToUserMem(&net);
-    }
+    net.push_back(convolution_backward_data(
+        conv_pd, obp->GetOpMem(), filter->GetOpMem(), output->GetOpMem()));
 
-    // Handle output reorder
     stream(stream::kind::eager).submit(net).wait();
   }
 };
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 #define REGISTER_MKL_CPU_KERNELS(T)                                 \
   REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput")           \
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index a9872b8d6d3ea89da0a73017af19cabbc25f78ce..2953426d5824064952858124882126c154fe6725 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -18,8 +18,8 @@ limitations under the License.
 
 #include <string.h>
 #include <map>
-#include <vector>
 #include <string>
+#include <vector>
 
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -40,25 +40,27 @@ limitations under the License.
 #include "tensorflow/core/util/tensor_format.h"
 
 #include "tensorflow/core/util/mkl_util.h"
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
+
 #include "mkldnn.hpp"
 
-using mkldnn::stream;
 using mkldnn::prop_kind;
+using mkldnn::stream;
 
-using mkldnn::convolution_forward;
 using mkldnn::convolution_direct;
+using mkldnn::convolution_forward;
+#else
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 #endif
 
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-// For now, MKL-ML is default. So making MKL-DNN not a default choice.
-#ifndef INTEL_MKL_DNN
+// MKL-DNN is now default. MKL-ML must be specified explicitly.
+#ifdef INTEL_MKL_ML
 
 template <typename Device, typename T, bool biasEnabled>
 class MklConv2DOp : public OpKernel {
@@ -114,18 +116,19 @@ class MklConv2DOp : public OpKernel {
                                         filter.shape().DebugString()));
 
     for (int i = 0; i < 3; i++) {
-      OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i),
-                                           std::numeric_limits<int>::max()),
-                  errors::InvalidArgument("filter too large"));
+      OP_REQUIRES(
+          context,
+          FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
+          errors::InvalidArgument("filter too large"));
     }
 
     const int64 input_depth =
         input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'C')
                             : GetTensorDim(input, data_format_, 'C');
-    OP_REQUIRES(
-        context, input_depth == filter.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                input_depth, " vs ", filter.dim_size(2)));
+    OP_REQUIRES(context, input_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", input_depth,
+                    " vs ", filter.dim_size(2)));
     // The last dimension for filter is out_depth.
     const int out_depth = static_cast<int>(filter.dim_size(3));
 
@@ -134,9 +137,10 @@ class MklConv2DOp : public OpKernel {
     const int64 input_rows_raw =
         input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'H')
                             : GetTensorDim(input, data_format_, 'H');
-    OP_REQUIRES(context, FastBoundsCheck(input_rows_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input rows too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input_rows_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input rows too large"));
     const int input_rows = static_cast<int>(input_rows_raw);
     const int filter_rows = static_cast<int>(filter.dim_size(0));
 
@@ -145,9 +149,10 @@ class MklConv2DOp : public OpKernel {
     const int64 input_cols_raw =
         input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'W')
                             : GetTensorDim(input, data_format_, 'W');
-    OP_REQUIRES(context, FastBoundsCheck(input_cols_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("Input cols too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input_cols_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input cols too large"));
     const int input_cols = static_cast<int>(input_cols_raw);
     const int filter_cols = static_cast<int>(filter.dim_size(1));
 
@@ -155,9 +160,10 @@ class MklConv2DOp : public OpKernel {
     const int64 input_batch_raw =
         input_in_mkl_format ? GetMklTensorDim(mkl_context.input_shape, 'N')
                             : GetTensorDim(input, data_format_, 'N');
-    OP_REQUIRES(context, FastBoundsCheck(input_batch_raw,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("batch is too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input_batch_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("batch is too large"));
     const int batch = static_cast<int>(input_batch_raw);
 
     // For now we take the stride from the second and third dimensions only (we
@@ -288,10 +294,8 @@ class MklConv2DOp : public OpKernel {
     mkl_filter_output_mkl_shape.SetMklLayout(mkl_context.prim_fwd,
                                              dnnResourceFilter);
 
-    size_t filter_sizes[4] = {static_cast<size_t>(filter.dim_size(0)),
-                              static_cast<size_t>(filter.dim_size(1)),
-                              static_cast<size_t>(filter.dim_size(2)),
-                              static_cast<size_t>(filter.dim_size(3))};
+    size_t filter_sizes[4] = {filter.dim_size(0), filter.dim_size(1),
+                              filter.dim_size(2), filter.dim_size(3)};
     mkl_filter_output_mkl_shape.SetTfLayout(filter.dims(), filter_sizes,
                                             mkl_context.filter_strides);
 
@@ -313,8 +317,7 @@ class MklConv2DOp : public OpKernel {
     // Temp tensor used to allocate tmp buffers
     Tensor mkl_tmp_input_buf_tensor, mkl_tmp_filter_buf_tensor,
         mkl_tmp_bias_buf_tensor;
-    mkl_context.MklPrepareConvolutionInputs(context,
-                                            &mkl_tmp_input_buf_tensor,
+    mkl_context.MklPrepareConvolutionInputs(context, &mkl_tmp_input_buf_tensor,
                                             &mkl_tmp_filter_buf_tensor,
                                             &mkl_tmp_bias_buf_tensor);
 
@@ -398,8 +401,9 @@ class MklConv2DOp : public OpKernel {
       mkl_convert_input =
           !dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input);
       if (mkl_convert_input) {
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input,
-                 lt_input, mkl_lt_internal_input), E_SUCCESS);
+        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, lt_input,
+                                         mkl_lt_internal_input),
+                 E_SUCCESS);
         AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
                        &mkl_buf_convert_input);
         CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
@@ -510,9 +514,15 @@ class MklConv2DOp : public OpKernel {
       auto cpu_engine = engine(engine::cpu, 0);
 
       // Input tensors
-      size_t src_idx = 0, filter_idx = 1;
-      const Tensor& src_tensor = MklGetInput(context, src_idx);
-      const Tensor& filter_tensor = MklGetInput(context, filter_idx);
+      const Tensor& src_tensor = MklGetInput(context, kInputIndex_Src);
+      const Tensor& filter_tensor = MklGetInput(context, kInputIndex_Filter);
+
+      MklDnnShape src_mkl_shape, filter_mkl_shape;
+      GetMklShape(context, kInputIndex_Src, &src_mkl_shape);
+      GetMklShape(context, kInputIndex_Filter, &filter_mkl_shape);
+      OP_REQUIRES(context, filter_mkl_shape.IsMklTensor() == false,
+                  errors::InvalidArgument("Filter should not be in "
+                                          "Mkl Layout"));
 
       MklDnnData<T> src(&cpu_engine);
       MklDnnData<T> filter(&cpu_engine);
@@ -523,67 +533,61 @@ class MklConv2DOp : public OpKernel {
 
       // Get shapes of input tensors in MKL-DNN order
       MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
-      conv_utl.GetConvFwdSizesInMklOrder(src_tensor.shape(),
-                                         filter_tensor.shape(),
-                                         &src_dims, &filter_dims, &strides,
-                                         &output_dims_tf_order,
-                                         &output_dims_mkl_order, &padding_l,
-                                         &padding_r);
+      auto src_tf_shape = GetTfShape(context, kInputIndex_Src);
+      auto filter_tf_shape = GetTfShape(context, kInputIndex_Filter);
+      conv_utl.GetConvFwdSizesInMklOrder(
+          src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides,
+          &output_dims_tf_order, &output_dims_mkl_order, &padding_l,
+          &padding_r);
       if (!context->status().ok()) return;
 
       // Check for corner case - if there is nothing to compute, return.
-      TensorShape tf_output_shape({output_dims_tf_order[0],
-                                output_dims_tf_order[1],
-                                output_dims_tf_order[2],
-                                output_dims_tf_order[3]});
-      Tensor* output_tensor = nullptr;
-      MklShape mkl_output_mkl_shape;
-      mkl_output_mkl_shape.SetMklTensor(false);
-      AllocateOutputSetMklShape(context, 0, &output_tensor, tf_output_shape,
-                                mkl_output_mkl_shape);
+      TensorShape output_tf_shape = MklDnnDimsToTFShape(output_dims_tf_order);
 
-      // Forward filter in TF format from input at index 1 to output at index 1.
-      ForwardTfTensorInToOut(context, 1, 1);
-
-      if (tf_output_shape.num_elements() == 0) {
+      // Corner cases: output with 0 elements and 0 batch size.
+      Tensor* output_tensor = nullptr;
+      if (output_tf_shape.num_elements() == 0 || output_dims_tf_order[0] == 0) {
         // TODO(jbobba): Verify correctness here
         //               Need semantics for Null MKL tensor
+        MklDnnShape output_mkl_shape;
+        output_mkl_shape.SetMklTensor(false);
+        AllocateOutputSetMklShape(context, kOutputIndex_Dst, &output_tensor,
+                                  src_tf_shape, output_mkl_shape);
+
+        // MklConv2D also outputs converted filter as 2nd output of Conv2D.
+        filter_mkl_shape.SetMklTensor(false);
+        Tensor* output_filter_tensor = nullptr;
+        AllocateOutputSetMklShape(context, kOutputIndex_Filter,
+                                  &output_filter_tensor, filter_tf_shape,
+                                  filter_mkl_shape);
         return;
       }
 
-      // Corner case to handle 0 batch size.
-      if (output_dims_tf_order[0] == 0) {
-        // Nothing to do, allocate output tensor and return
-        // TODO(nhasabni): remove this code later once serialization
-        // in MKL-DNN is supported.
-        AllocateOutputSetMklShape(context, 0, &output_tensor,
-                                  src_tensor.shape(), mkl_output_mkl_shape);
-        return;
-      } else {
-        // Otherwise regular output tensor allocation
-        // Allocate output tensor.
-      }
-      CHECK_NOTNULL(output_tensor);
-
       // Create memory for user data.
       // Describe how the inputs and outputs of Convolution look like. Also
       // specify buffers containing actual input and output data.
-      // Although input shape (src_dims) required is in MKL-DNN order,
-      // the layout is Tensorflow's layout (NHWC or NCHW depending on data
-      // format).
-      src.SetUsrMem(src_dims, TFDataFormatToMklDnnDataFormat(data_format_),
-                    const_cast<void*>(static_cast<const void*>(
-                    src_tensor.flat<T>().data())));
+      auto tf_fmt = TFDataFormatToMklDnnDataFormat(data_format_);
+      // If input is in MKL layout, then simply grab input layout; otherwise,
+      // construct input Tf layout. For TF layout, although input shape
+      // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's
+      // layout (NHWC or NCHW depending on data format).
+      auto src_md = src_mkl_shape.IsMklTensor()
+                        ? src_mkl_shape.GetMklLayout()
+                        : memory::desc(src_dims, MklDnnType<T>(), tf_fmt);
+      src.SetUsrMem(src_md, &src_tensor);
       // Although filter shape (filter_dims) required is in MKL-DNN order,
       // the layout is Tensorflow's layout (HWIO).
-      filter.SetUsrMem(filter_dims, memory::format::hwio,
-                       const_cast<void*>(static_cast<const void*>(
-                       filter_tensor.flat<T>().data())));
-      // Although output shape (output_dims) required is in MKL-DNN order,
-      // layout is Tensorflow's layout (NHWC or NCHW depending on data format).
-      output.SetUsrMem(output_dims_mkl_order,
-                       TFDataFormatToMklDnnDataFormat(data_format_),
-                       output_tensor->flat<T>().data());
+      auto filter_md = filter_mkl_shape.IsMklTensor()  // Should NEVER be true
+                           ? filter_mkl_shape.GetMklLayout()
+                           : memory::desc(filter_dims, MklDnnType<T>(),
+                                          memory::format::hwio);
+      filter.SetUsrMem(filter_md, &filter_tensor);
+
+      // Set output shape (output_dims) required in MKL-DNN order.
+      // Currently, we set output layout as Tensorflow's layout (NHWC or NCHW
+      // depending on data format). But later we propagate Mkl layout of the
+      // output to the next op directly.
+      output.SetUsrMem(output_dims_mkl_order, tf_fmt);
 
       // Create memory descriptors for convolution data w/ no specified format.
       src.SetOpMemDesc(src_dims, memory::format::any);
@@ -594,40 +598,59 @@ class MklConv2DOp : public OpKernel {
       if (biasEnabled) {
         MklDnnData<T> bias(&cpu_engine);
         memory::dims bias_size;
-        conv_utl.GetBiasSizeInMklOrder(2 /* bias idx */, &bias_size);
-        const Tensor& bias_tensor = MklGetInput(context, 2);
-        bias.SetUsrMem(bias_size, memory::format::x,
-                       const_cast<void*>(static_cast<const void*>(
-                       bias_tensor.flat<T>().data())));
+        conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_size);
+        const Tensor& bias_tensor = MklGetInput(context, kInputIndex_Bias);
+        bias.SetUsrMem(bias_size, memory::format::x, &bias_tensor);
         bias.SetOpMemDesc(bias_size, memory::format::any);
 
         // Create convolution primitive with Bias.
-        auto conv_desc = convolution_forward::desc(prop_kind::forward,
-            convolution_direct, src.GetOpMemDesc(), filter.GetOpMemDesc(),
-            bias.GetOpMemDesc(), output.GetOpMemDesc(), strides,
-            padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
-
-        auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
-                                                                cpu_engine);
-        PrepareAndExecuteNet(conv_prim_desc, &src, &filter, &bias, &output);
+        auto conv_desc = convolution_forward::desc(
+            prop_kind::forward, convolution_direct, src.GetOpMemDesc(),
+            filter.GetOpMemDesc(), bias.GetOpMemDesc(), output.GetOpMemDesc(),
+            strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
+
+        auto conv_prim_desc =
+            convolution_forward::primitive_desc(conv_desc, cpu_engine);
+        AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order,
+                             tf_fmt, &output_tensor);
+        // Set data handle for output.
+        output.SetUsrMemDataHandle(output_tensor);
+
+        Tensor* filter_out_tensor = nullptr;
+        AllocateFilterOutputTensor(context, conv_prim_desc,
+                                   TFShapeToMklDnnDims(filter_tf_shape),
+                                   &filter_out_tensor);
+
+        PrepareAndExecuteNet(conv_prim_desc, &src, &filter, &bias, &output,
+                             filter_out_tensor);
       } else {
         // Create convolution primitive without Bias.
-        auto conv_desc = convolution_forward::desc(prop_kind::forward,
-            convolution_direct, src.GetOpMemDesc(), filter.GetOpMemDesc(),
-            output.GetOpMemDesc(), strides, padding_l, padding_r,
-            TFPaddingToMklDnnPadding(padding_));
-
-        auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
-                                                                cpu_engine);
-        PrepareAndExecuteNet(conv_prim_desc, &src, &filter, nullptr, &output);
+        auto conv_desc = convolution_forward::desc(
+            prop_kind::forward, convolution_direct, src.GetOpMemDesc(),
+            filter.GetOpMemDesc(), output.GetOpMemDesc(), strides, padding_l,
+            padding_r, TFPaddingToMklDnnPadding(padding_));
+
+        auto conv_prim_desc =
+            convolution_forward::primitive_desc(conv_desc, cpu_engine);
+        AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order,
+                             tf_fmt, &output_tensor);
+        // Set data handle for output.
+        output.SetUsrMemDataHandle(output_tensor);
+
+        Tensor* filter_out_tensor = nullptr;
+        AllocateFilterOutputTensor(context, conv_prim_desc,
+                                   TFShapeToMklDnnDims(filter_tf_shape),
+                                   &filter_out_tensor);
+        PrepareAndExecuteNet(conv_prim_desc, &src, &filter, nullptr, &output,
+                             filter_out_tensor);
       }
-    } catch (mkldnn::error &e) {
+    } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + std::string(e.message) +
-                       ", in file " + std::string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-        errors::Aborted("Operation received an exception:", error_msg));
+                         ", message: " + std::string(e.message) + ", in file " +
+                         std::string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
@@ -635,43 +658,94 @@ class MklConv2DOp : public OpKernel {
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_;
+  const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2;
+  const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1;
+
+  // Allocate output tensor.
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const convolution_forward::primitive_desc& conv_prim_desc,
+      const memory::dims& output_dims_mkl_order,
+      memory::format output_tf_format, Tensor** output_tensor) {
+    CHECK_NOTNULL(output_tensor);
+    auto dst_pd = conv_prim_desc.dst_primitive_desc();
+
+    // Allocate shape of Mkl tensor.
+    MklDnnShape output_mkl_shape;
+    output_mkl_shape.SetMklTensor(true);
+    output_mkl_shape.SetMklLayout(&dst_pd);
+    output_mkl_shape.SetElemType(MklDnnType<T>());
+    output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+                                 output_dims_mkl_order, output_tf_format);
+
+    // Allocate shape of TF tensor.
+    TensorShape output_tf_shape;
+    output_tf_shape.AddDim((dst_pd.get_size() / sizeof(T)));
+
+    AllocateOutputSetMklShape(context, kOutputIndex_Dst, output_tensor,
+                              output_tf_shape, output_mkl_shape);
+  }
+
+  // Allocate output tensor.
+  void AllocateFilterOutputTensor(
+      OpKernelContext* context,
+      const convolution_forward::primitive_desc& conv_prim_desc,
+      const memory::dims& filter_dims_tf_order, Tensor** filter_tensor) {
+    CHECK_NOTNULL(filter_tensor);
+    auto filter_pd = conv_prim_desc.weights_primitive_desc();
+
+    // Allocate shape of Mkl tensor.
+    MklDnnShape filter_mkl_shape;
+    filter_mkl_shape.SetMklTensor(true);
+    filter_mkl_shape.SetMklLayout(&filter_pd);
+    filter_mkl_shape.SetElemType(MklDnnType<T>());
+
+    // The format of the filter is actually OIhw8i8o, but TF doesn't support
+    // this format. Just use format::blocked for now because the layout
+    // is stored in the MKL data.
+    filter_mkl_shape.SetTfLayout(filter_dims_tf_order.size(),
+                                 filter_dims_tf_order, memory::format::blocked);
+
+    // Allocate the data space for the filter to propagate as TF tensor.
+    TensorShape filter_tf_shape;
+    filter_tf_shape.AddDim((filter_pd.get_size() / sizeof(T)));
+
+    AllocateOutputSetMklShape(context, kOutputIndex_Filter, filter_tensor,
+                              filter_tf_shape, filter_mkl_shape);
+  }
 
   // Prepare and execute net - checks for input and output reorders.
   void PrepareAndExecuteNet(
-                  const convolution_forward::primitive_desc& conv_prim_desc,
-                  MklDnnData<T>* src, MklDnnData<T>* filter,
-                  MklDnnData<T>* bias, MklDnnData<T>* output) {
+      const convolution_forward::primitive_desc& conv_prim_desc,
+      MklDnnData<T>* src, MklDnnData<T>* filter, MklDnnData<T>* bias,
+      MklDnnData<T>* output, Tensor* filter_out_tensor) {
+    CHECK_NOTNULL(filter_out_tensor);
+
     // Create reorders between user layout and MKL layout if it is needed and
-    // add it to the net before convolution.
+    // add it to the net before convolution. No need to check for output
+    // reorder as we propagate output layout to the next layer.
     std::vector<primitive> net;
     src->CheckReorderToOpMem(conv_prim_desc.src_primitive_desc(), &net);
-    filter->CheckReorderToOpMem(conv_prim_desc.weights_primitive_desc(), &net);
 
-    // Memory for output of convolution. Since we may need reorder on the
-    // output side, we will prepare reorder primitive in case output
-    // reorder to user memory is required.
-    bool output_reorder_required = output->PrepareReorderToUserMemIfReq(
-                                      conv_prim_desc.dst_primitive_desc());
+    // rather than re-order to a temp buffer, reorder directly to the
+    // filter output tensor
+    filter->CheckReorderToOpMem(conv_prim_desc.weights_primitive_desc(),
+                                filter->GetTensorBuffer(filter_out_tensor),
+                                &net);
 
     // Create convolution primitive and add it to net.
     if (bias) {
       CHECK_EQ(biasEnabled, true);
       net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(),
-                                    filter->GetOpMem(), bias->GetOpMem(),
-                                    output->GetOpMem()));
+                                        filter->GetOpMem(), bias->GetOpMem(),
+                                        output->GetOpMem()));
     } else {
       CHECK_EQ(biasEnabled, false);
       net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(),
-                                    filter->GetOpMem(), output->GetOpMem()));
+                                        filter->GetOpMem(),
+                                        output->GetOpMem()));
     }
 
-    // Insert reorder primitive in the net for output reorder if reorder is
-    // required.
-    if (output_reorder_required) {
-      output->InsertReorderToUserMem(&net);
-    }
-
-    // Handle output reorder
     stream(stream::kind::eager).submit(net).wait();
   }
 };
@@ -688,7 +762,12 @@ class MklConv2DOp : public OpKernel {
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
-                          MklConv2DOp<CPUDevice, T, true>);
+                          MklConv2DOp<CPUDevice, T, true>);         \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias")          \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklDummyOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_MKL_CPU);
 
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index f0cb37f8a42c19cad183af2e0de7db2931cf299a..9dd88221a84671e1f69df13cca1b62b2ce65bb4e 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -16,8 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_
 #define TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_
 
-#include <vector>
 #include <limits>
+#include <string>
+#include <vector>
 
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -26,8 +27,8 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
 #include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/conv_grad_ops.h"
+#include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/numbers.h"
@@ -39,13 +40,19 @@ limitations under the License.
 
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
+
+using mkldnn::prop_kind;
+using mkldnn::stream;
+
+using mkldnn::convolution_direct;
+using mkldnn::convolution_forward;
 #endif
 
 namespace tensorflow {
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 class MklDnnConvUtil {
  protected:
@@ -56,13 +63,13 @@ class MklDnnConvUtil {
 
  public:
   MklDnnConvUtil(OpKernelContext* context, const std::vector<int32>& strides,
-                 Padding pad, TensorFormat fm) : context_(context),
-    strides_(strides), padding_(pad), data_format_(fm) {}
+                 Padding pad, TensorFormat fm)
+      : context_(context), strides_(strides), padding_(pad), data_format_(fm) {}
 
   virtual ~MklDnnConvUtil() { context_ = nullptr; }
 
   // Calculate Convolution strides
-  virtual inline void GetStridesInMklOrder(memory::dims *strides) {
+  virtual inline void GetStridesInMklOrder(memory::dims* strides) {
     // For now we take the stride from the second and third dimensions only
     // (we do not support striding on the batch or depth dimension).
     CHECK_NOTNULL(strides);
@@ -75,14 +82,14 @@ class MklDnnConvUtil {
   // requires input in NCHW format. Function does not return anything.
   // But errors arising from sanity checks are returned in context's
   // status.
-  virtual inline void
-  GetInputSizeInMklOrder(const TensorShape& input_shape,
-                         memory::dims *input_dims) {
-  #define CHECK_BOUNDS(val, err_msg) do {                     \
-    OP_REQUIRES(context_, FastBoundsCheck(val,                \
-                            std::numeric_limits<int>::max()), \
-                errors::InvalidArgument(err_msg));            \
-  }while(0)
+  virtual inline void GetInputSizeInMklOrder(const TensorShape& input_shape,
+                                             memory::dims* input_dims) {
+#define CHECK_BOUNDS(val, err_msg)                                     \
+  do {                                                                 \
+    OP_REQUIRES(context_,                                              \
+                FastBoundsCheck(val, std::numeric_limits<int>::max()), \
+                errors::InvalidArgument(err_msg));                     \
+  } while (0)
 
     CHECK_NOTNULL(input_dims);
 
@@ -105,10 +112,16 @@ class MklDnnConvUtil {
     CHECK_BOUNDS(input_batch_raw, "Input batch too large");
     int input_batch = static_cast<int>(input_batch_raw);
 
-  #undef CHECK_BOUNDS
+#undef CHECK_BOUNDS
 
     // MKL-DNN always requires input in NCHW format.
-    *input_dims = {input_batch, input_depth, input_rows, input_cols};
+    std::vector<int> mkldnn_sizes(4, -1);
+    mkldnn_sizes[MklDnnDims::Dim_N] = input_batch;
+    mkldnn_sizes[MklDnnDims::Dim_C] = input_depth;
+    mkldnn_sizes[MklDnnDims::Dim_H] = input_rows;
+    mkldnn_sizes[MklDnnDims::Dim_W] = input_cols;
+
+    *input_dims = mkldnn_sizes;
   }
 
   // Calculate Convolution filter size in MKL-DNN order. MKL-DNN
@@ -125,10 +138,9 @@ class MklDnnConvUtil {
   // forward gets actual tensor as input).
   //
   // TODO(nhasabni): Add similar function for input and filter in MklShape.
-  virtual inline void
-  GetFilterSizeInMklOrder(const TensorShape& input_shape,
-                          const TensorShape& filter_shape,
-                          memory::dims *filter_dims) {
+  virtual inline void GetFilterSizeInMklOrder(const TensorShape& input_shape,
+                                              const TensorShape& filter_shape,
+                                              memory::dims* filter_dims) {
     CHECK_NOTNULL(filter_dims);
 
     OP_REQUIRES(context_, filter_shape.dims() == 4,
@@ -136,17 +148,18 @@ class MklDnnConvUtil {
                                         filter_shape.DebugString()));
 
     for (int i = 0; i < 3; i++) {
-      OP_REQUIRES(context_, FastBoundsCheck(filter_shape.dim_size(i),
-                                           std::numeric_limits<int>::max()),
-                errors::InvalidArgument("filter too large"));
+      OP_REQUIRES(context_,
+                  FastBoundsCheck(filter_shape.dim_size(i),
+                                  std::numeric_limits<int>::max()),
+                  errors::InvalidArgument("filter too large"));
     }
 
     int input_depth = GetTensorDim(input_shape, data_format_, 'C');
 
-    OP_REQUIRES(
-        context_, input_depth == filter_shape.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                input_depth, " vs ", filter_shape.dim_size(2)));
+    OP_REQUIRES(context_, input_depth == filter_shape.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", input_depth,
+                    " vs ", filter_shape.dim_size(2)));
 
     // TF filter is always in (rows, cols, in_depth, out_depth) order.
     int filter_rows = static_cast<int>(filter_shape.dim_size(0));
@@ -156,32 +169,37 @@ class MklDnnConvUtil {
 
     // MKL-DNN always needs filter in OIHW format.
     // OIHW = (out_depth, in_depth, rows, cols)
-    *filter_dims = {out_depth, in_depth, filter_rows, filter_cols};
+    std::vector<int> mkldnn_sizes(4, -1);
+    mkldnn_sizes[MklDnnDims::Dim_O] = out_depth;
+    mkldnn_sizes[MklDnnDims::Dim_I] = in_depth;
+    mkldnn_sizes[MklDnnDims::Dim_H] = filter_rows;
+    mkldnn_sizes[MklDnnDims::Dim_W] = filter_cols;
+
+    *filter_dims = mkldnn_sizes;
   }
 
   // Calculate Convolution filter size in MKL-DNN order. MKL-DNN
   // requires filter in OIHW format. Function does not return anything.
   // But errors arising from sanity checks are returned in context's
   // status.
-  virtual inline void
-  GetFilterSizeInMklOrder(size_t src_index, size_t filter_index,
-                          memory::dims *filter_dims) {
+  virtual inline void GetFilterSizeInMklOrder(size_t src_index,
+                                              size_t filter_index,
+                                              memory::dims* filter_dims) {
     CHECK_NOTNULL(filter_dims);
-    const Tensor& input = MklGetInput(context_, src_index);
-    const Tensor& filter = MklGetInput(context_, filter_index);
-    GetFilterSizeInMklOrder(input.shape(), filter.shape(), filter_dims);
+    GetFilterSizeInMklOrder(GetTfShape(context_, src_index),
+                            GetTfShape(context_, filter_index), filter_dims);
   }
 
   // Calculate Bias size for 2D Convolution. Function does not return
   // anything, but sets error in context status.
-  virtual inline void
-  GetBiasSizeInMklOrder(size_t bias_index, memory::dims *bias_dims) {
+  virtual inline void GetBiasSizeInMklOrder(size_t bias_index,
+                                            memory::dims* bias_dims) {
     const Tensor& bias = MklGetInput(context_, bias_index);
     OP_REQUIRES(context_, bias.dims() == 1,
                 errors::InvalidArgument("bias must be 1-dimensional: ",
                                         bias.shape().DebugString()));
 
-    *bias_dims = { static_cast<int>(bias.dim_size(0)) };
+    *bias_dims = {static_cast<int>(bias.dim_size(0))};
   }
 
   // Function to calculate output and padding size for 2D convolution.
@@ -193,13 +211,11 @@ class MklDnnConvUtil {
   // status is returned via context status.
   //
   // TODO(nhasabni): Add similar function for input and filter in MklShape.
-  virtual inline void
-  GetOutputAndPadSizeInMklOrder(const TensorShape& input_shape,
-                                const TensorShape& filter_shape,
-                                const memory::dims& strides,
-                                memory::dims *output_dims_tf_order,
-                                memory::dims *output_dims_mkl_order,
-                                memory::dims *pad_l, memory::dims *pad_r) {
+  virtual inline void GetOutputAndPadSizeInMklOrder(
+      const TensorShape& input_shape, const TensorShape& filter_shape,
+      const memory::dims& strides, memory::dims* output_dims_tf_order,
+      memory::dims* output_dims_mkl_order, memory::dims* pad_l,
+      memory::dims* pad_r) {
     CHECK_NOTNULL(output_dims_tf_order);
     CHECK_NOTNULL(output_dims_mkl_order);
     CHECK_NOTNULL(pad_l);
@@ -225,21 +241,25 @@ class MklDnnConvUtil {
     int64 out_rows = 0, out_cols = 0;
     int64 pad_top = 0, pad_bottom = 0, pad_left, pad_right;
 
-    OP_REQUIRES_OK(context_,
-            GetWindowedOutputSizeVerbose(input_rows, filter_rows, stride_rows,
-                                 padding_, &out_rows, &pad_top, &pad_bottom));
-    OP_REQUIRES_OK(context_,
-            GetWindowedOutputSizeVerbose(input_cols, filter_cols, stride_cols,
-                                 padding_, &out_cols, &pad_left, &pad_right));
+    OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
+                                 input_rows, filter_rows, stride_rows, padding_,
+                                 &out_rows, &pad_top, &pad_bottom));
+    OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
+                                 input_cols, filter_cols, stride_cols, padding_,
+                                 &out_cols, &pad_left, &pad_right));
 
     // Tensorflow output is in data_format order. (NHWC or NCHW)
-    TensorShape out_shape = ShapeFromFormat(data_format_, out_batch,
-                                            out_rows, out_cols, out_depth);
+    TensorShape out_shape =
+        ShapeFromFormat(data_format_, out_batch, out_rows, out_cols, out_depth);
     *output_dims_tf_order = TFShapeToMklDnnDims(out_shape);
 
     // MKL-DNN always needs output in NCHW format.
-    *output_dims_mkl_order = {out_batch, out_depth, static_cast<int>(out_rows),
-                   static_cast<int>(out_cols)};
+    std::vector<int> mkldnn_sizes(4, -1);
+    mkldnn_sizes[MklDnnDims::Dim_N] = out_batch;
+    mkldnn_sizes[MklDnnDims::Dim_C] = out_depth;
+    mkldnn_sizes[MklDnnDims::Dim_H] = static_cast<int>(out_rows);
+    mkldnn_sizes[MklDnnDims::Dim_W] = static_cast<int>(out_cols);
+    *output_dims_mkl_order = mkldnn_sizes;
 
     // Now handle padding. MKL-DNN uses asymetric padding.
     *pad_l = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
@@ -250,27 +270,25 @@ class MklDnnConvUtil {
   // See comment on GetConvOutputAndPadSizeInMklOrder for parameters.
   //
   // Function does not return anything, but sets error in context status.
-  inline void
-  GetOutputAndPadSizeInMklOrder(size_t src_index, size_t filter_index,
-                                const memory::dims& strides,
-                                memory::dims *output_dims_tf_order,
-                                memory::dims *output_dims_mkl_order,
-                                memory::dims *pad_l, memory::dims *pad_r) {
+  inline void GetOutputAndPadSizeInMklOrder(
+      size_t src_index, size_t filter_index, const memory::dims& strides,
+      memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order,
+      memory::dims* pad_l, memory::dims* pad_r) {
     CHECK_NOTNULL(output_dims_tf_order);
     CHECK_NOTNULL(output_dims_mkl_order);
     CHECK_NOTNULL(pad_l);
     CHECK_NOTNULL(pad_r);
 
-    const Tensor& input = MklGetInput(context_, src_index);
-    const Tensor& filter = MklGetInput(context_, filter_index);
+    auto input_tf_shape = GetTfShape(context_, src_index);
+    auto filter_tf_shape = GetTfShape(context_, filter_index);
 
-    OP_REQUIRES(context_, input.dims() == 4,
+    OP_REQUIRES(context_, input_tf_shape.dims() == 4,
                 errors::InvalidArgument("input must be 4-dimensional",
-                                          input.shape().DebugString()));
+                                        input_tf_shape.DebugString()));
 
-    GetOutputAndPadSizeInMklOrder(input.shape(), filter.shape(),
-                                  strides, output_dims_tf_order,
-                                  output_dims_mkl_order, pad_l, pad_r);
+    GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape, strides,
+                                  output_dims_tf_order, output_dims_mkl_order,
+                                  pad_l, pad_r);
   }
 
   // Wrapper function to calculate input, filter, and output sizes of
@@ -279,15 +297,12 @@ class MklDnnConvUtil {
   // also calculates strides and paddings for 2D Convolution.
   //
   // Function does not return anything, but sets error in context status.
-  inline void GetConvFwdSizesInMklOrder(const TensorShape& input_shape,
-                                        const TensorShape& filter_shape,
-                                        memory::dims *input_dims,
-                                        memory::dims *filter_dims,
-                                        memory::dims *strides,
-                                        memory::dims *output_dims_tf_order,
-                                        memory::dims *output_dims_mkl_order,
-                                        memory::dims *pad_l,
-                                        memory::dims *pad_r) {
+  inline void GetConvFwdSizesInMklOrder(
+      const TensorShape& input_shape, const TensorShape& filter_shape,
+      memory::dims* input_dims, memory::dims* filter_dims,
+      memory::dims* strides, memory::dims* output_dims_tf_order,
+      memory::dims* output_dims_mkl_order, memory::dims* pad_l,
+      memory::dims* pad_r) {
     CHECK_NOTNULL(input_dims);
     CHECK_NOTNULL(filter_dims);
     CHECK_NOTNULL(strides);
@@ -302,14 +317,262 @@ class MklDnnConvUtil {
     if (!context_->status().ok()) return;
     GetStridesInMklOrder(strides);
     GetOutputAndPadSizeInMklOrder(input_shape, filter_shape, *strides,
-                                  output_dims_tf_order,
-                                  output_dims_mkl_order,
+                                  output_dims_tf_order, output_dims_mkl_order,
                                   pad_l, pad_r);
     if (!context_->status().ok()) return;
   }
 };
 
-#endif  // INTEL_MKL_DNN
+/////////////////////////////////////////////////////////////////////
+///  Common class that implements Conv2DBackpropFilter and Input
+/////////////////////////////////////////////////////////////////////
+
+template <typename Device, class T>
+class MklConv2DBackpropCommonOp : public OpKernel {
+ public:
+  ~MklConv2DBackpropCommonOp() {}
+  explicit MklConv2DBackpropCommonOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string data_format_str;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES(context, FormatFromString(data_format_str, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+    int stride_n = GetTensorDim(strides_, data_format_, 'N');
+    int stride_c = GetTensorDim(strides_, data_format_, 'C');
+    OP_REQUIRES(
+        context, (stride_n == 1 && stride_c == 1),
+        errors::InvalidArgument("Current implementation does not yet support "
+                                "strides in the batch and depth dimensions."));
+
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+
+      // Prepare common tensors for Conv2DBackpropInput and
+      // Conv2DBackpropFilter.
+      MklDnnData<T> input(&cpu_engine);
+      MklDnnData<T> filter(&cpu_engine);
+      MklDnnData<T> outbackprop(&cpu_engine);
+      MklDnnData<T> output(&cpu_engine);
+
+      // Input tensors
+      const int kInputIdx = 0, kFilterIdx = 1, kOutbpropIdx = 2;
+      const Tensor& input_tensor = MklGetInput(context, kInputIdx);
+      const Tensor& filter_tensor = MklGetInput(context, kFilterIdx);
+      const Tensor& outbprop_tensor = MklGetInput(context, kOutbpropIdx);
+
+      MklDnnShape input_mkl_shape, filter_mkl_shape, outbprop_mkl_shape;
+      GetMklShape(context, kInputIdx, &input_mkl_shape);
+      GetMklShape(context, kFilterIdx, &filter_mkl_shape);
+      GetMklShape(context, kOutbpropIdx, &outbprop_mkl_shape);
+      // Allow operator-specific sanity checking of shapes.
+      ValidateMklShapes(input_mkl_shape, filter_mkl_shape, outbprop_mkl_shape);
+
+      // Allow operator-specific generation of shapes.
+      // E.g., Conv2DBackpropFilter gets filter as filter_sizes. It is a
+      // tensor containing shape of filter. So filter.shape() is not
+      // a correct way to get filter shape. These operator-specific calls
+      // allow this class to handle this case.
+      TensorShape input_tf_shape = MakeInputTfShape(context, input_tensor);
+      TensorShape filter_tf_shape = MakeFilterTfShape(context, filter_tensor);
+      TensorShape outbprop_tf_shape = GetTfShape(context, kOutbpropIdx);
+
+      // Corner cases: output with 0 elements and 0 batch size.
+      Tensor* output_tensor = nullptr;
+      if (input_tf_shape.num_elements() == 0 ||
+          filter_tf_shape.num_elements() == 0 ||
+          outbprop_tf_shape.num_elements() == 0) {
+        MklDnnShape output_mkl_shape;
+        output_mkl_shape.SetMklTensor(false);
+        TensorShape output_tf_shape = GetOutputTfShape(
+            input_tf_shape, filter_tf_shape, outbprop_tf_shape);
+        const int kOutputIdx = 0;
+        AllocateOutputSetMklShape(context, kOutputIdx, &output_tensor,
+                                  output_tf_shape, output_mkl_shape);
+        CHECK_NOTNULL(output_tensor);
+
+        // if output tensor has more than 0 elements, we need to 0 them out.
+        for (size_t i = 0; i < output_tf_shape.num_elements(); ++i) {
+          output_tensor->flat<T>().data()[i] = 0;
+        }
+
+        return;
+      }
+
+      // By default, all dims are in MKL order. Only dims in TF order
+      // are those with prefix tf_order.
+      memory::dims outbprop_dims, fwd_input_dims, fwd_filter_dims;
+      memory::dims padding_l, padding_r, strides, fwd_output_dims;
+      memory::dims fwd_output_dims_tf_order;
+
+      // Get forward convolution parameters.
+      MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
+      conv_utl.GetConvFwdSizesInMklOrder(
+          input_tf_shape, filter_tf_shape, &fwd_input_dims, &fwd_filter_dims,
+          &strides, &fwd_output_dims_tf_order, &fwd_output_dims, &padding_l,
+          &padding_r);
+      if (!context->status().ok()) return;
+
+      // Create Convolution forward descriptor since Convolution backward
+      // API needs it. For that, we first need to create input, filter
+      // and output memory descriptors.
+      auto tf_fmt = TFDataFormatToMklDnnDataFormat(data_format_);
+      // If input is in MKL layout, then simply grab input layout; otherwise,
+      // construct input TF layout. For TF layout, although input shape
+      // required is in MKL-DNN order, the layout is Tensorflow's layout
+      // (NHWC or NCHW depending on data format).
+      auto fwd_input_md =
+          input_mkl_shape.IsMklTensor()
+              ? input_mkl_shape.GetMklLayout()
+              : memory::desc(fwd_input_dims, MklDnnType<T>(), tf_fmt);
+      // If filter is in MKL layout, then simply grab filter layout; otherwise
+      // construct filter in TF layout. For TF layout, filter is in HWIO format.
+      auto fwd_filter_md = filter_mkl_shape.IsMklTensor()
+                               ? filter_mkl_shape.GetMklLayout()
+                               : memory::desc(fwd_filter_dims, MklDnnType<T>(),
+                                              memory::format::hwio);
+      // Tensorflow Output of Conv2D is in data_format order.
+      auto fwd_out_md = memory::desc(fwd_output_dims, MklDnnType<T>(), tf_fmt);
+      auto fwd_desc = convolution_forward::desc(
+          prop_kind::forward, convolution_direct, fwd_input_md, fwd_filter_md,
+          fwd_out_md, strides, padding_l, padding_r,
+          TFPaddingToMklDnnPadding(padding_));
+      auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine);
+
+      // Create memory for user data. Describe how the inputs and outputs of
+      // Convolution look like. Also specify buffers containing actual input
+      // and output data.
+
+      // Since this is a common class for both Conv2DBackpropFilter and
+      // Conv2DBackpropInput, we skip SetUsrMem call for input tensor (for
+      // Conv2DBackpropInput) and for filter tensor (for
+      // conv2DBackpropFilter) depending on which tensor is int32 type.
+      size_t input_with_sizes = GetInputTensorIndexWithSizes();
+      if (input_with_sizes != kInputIdx) {
+        // Shape of Conv2DBackpropFilter's input is same as Conv2D input.
+        input.SetUsrMem(fwd_input_md, &input_tensor);
+      } else if (input_with_sizes != kFilterIdx) {
+        // Shape of Conv2DBackpropInput's filter is same as Conv2D filter.
+        filter.SetUsrMem(fwd_filter_md, &filter_tensor);
+      }
+
+      conv_utl.GetInputSizeInMklOrder(outbprop_tf_shape, &outbprop_dims);
+      if (!context->status().ok()) return;
+      if (outbprop_mkl_shape.IsMklTensor()) {
+        // If outbackprop is in Mkl layout, then simply grab it.
+        auto outbprop_md = outbprop_mkl_shape.GetMklLayout();
+        outbackprop.SetUsrMem(outbprop_md, &outbprop_tensor);
+      } else {
+        // If outbackprop is in TensorFlow layout, then we need to create memory
+        // descriptor for it. Outbackprop shape is data format order.
+        outbackprop.SetUsrMem(outbprop_dims, tf_fmt, &outbprop_tensor);
+      }
+
+      // Operator specific call to get output shape and data_format.
+      auto bwd_output_dims = GetOutputDims(fwd_input_dims, fwd_filter_dims);
+      auto bwd_output_format = GetOutputFormat(tf_fmt);
+      output.SetUsrMem(bwd_output_dims, bwd_output_format);
+
+      // Create memory descriptors for convolution data w/ no specified format.
+      input.SetOpMemDesc(fwd_input_dims, memory::format::any);
+      filter.SetOpMemDesc(fwd_filter_dims, memory::format::any);
+      outbackprop.SetOpMemDesc(outbprop_dims, memory::format::any);
+      output.SetOpMemDesc(bwd_output_dims, memory::format::any);
+
+      // Operator-specific call to create and execute primitive.
+      CreatePrimitive(context, cpu_engine, fwd_pd, &input, &filter,
+                      &outbackprop, &output, &output_tensor, strides, padding_l,
+                      padding_r, TFPaddingToMklDnnPadding(padding_),
+                      bwd_output_dims, bwd_output_format);
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
+    }
+  }
+
+  /// Pure virtual function to allow operator to check for validity of input
+  /// shapes. Function asserts that input shapes are valid.
+  virtual void ValidateMklShapes(const MklDnnShape& input_mkl_shape,
+                                 const MklDnnShape& filter_mkl_shape,
+                                 const MklDnnShape& outbprop_mkl_shape) = 0;
+
+  /// Operator-specific function that returns index of input that is
+  /// representing input sizes. For Conv2DBackpropFilter it returns 1 since
+  /// filter for this operator is filter shape. For Conv2DBackpropInput it
+  /// returns 0 (for input).
+  virtual size_t GetInputTensorIndexWithSizes() = 0;
+
+  /// Get TensorFlow shape of input tensor.
+  virtual TensorShape MakeInputTfShape(OpKernelContext* context,
+                                       const Tensor& input_tensor) = 0;
+
+  /// Get TensorFlow shape of filter tensor.
+  virtual TensorShape MakeFilterTfShape(OpKernelContext* context,
+                                        const Tensor& filter_tensor) = 0;
+
+  /// Get the TensorFlow shape of output tensor.
+  virtual TensorShape GetOutputTfShape(const TensorShape& input_shape,
+                                       const TensorShape& filter_shape,
+                                       const TensorShape& outbprop_shape) = 0;
+
+  /// Get shape of output in MKL-DNN order. Computes shape of output from
+  /// input shape (fwd_input_dims) and filter shape (fwd_filter_dims).
+  virtual const memory::dims& GetOutputDims(
+      const memory::dims& fwd_input_dims,
+      const memory::dims& fwd_filter_dims) = 0;
+
+  /// Get data_format of output in MKL-DNN order. If output data format is
+  /// same as input data format, then it simply returns value of data_format
+  /// parameter as it is.
+  virtual memory::format GetOutputFormat(const memory::format data_format) = 0;
+
+  /// Create and execute the primitive storing output in the output_tensor.
+  virtual void CreatePrimitive(
+      OpKernelContext* context, const engine& cpu_engine,
+      const convolution_forward::primitive_desc& conv_fwd_pd,
+      MklDnnData<T>* input, MklDnnData<T>* filter, MklDnnData<T>* outbackprop,
+      MklDnnData<T>* output, Tensor** output_tensor,
+      const memory::dims& strides, const memory::dims& padding_l,
+      const memory::dims& padding_r, padding_kind padding,
+      const memory::dims& bwd_output_dims,
+      memory::format bwd_output_format) = 0;
+
+  // Get the data_format {NCHW, NHWC}
+  TensorFormat GetTFDataFormat() { return data_format_; }
+
+ private:
+  std::vector<int32> strides_;
+  Padding padding_;
+  TensorFormat data_format_;
+};
+#endif  // INTEL_MKL_ML
+
+/////////////////////////////////////////////////////////////////////
+///  Dummy Mkl op that is just used for operators that are intermediate
+///  output of node fusion in the graph
+/////////////////////////////////////////////////////////////////////
+
+template <typename Device, typename T>
+class MklDummyOp : public OpKernel {
+ public:
+  ~MklDummyOp() {}
+
+  explicit MklDummyOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    TF_CHECK_OK(
+        errors::Unimplemented("This is a dummy op."
+                              "It should not have been invoked."));
+  }
+};
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/mkl_cwise_ops_common.cc b/tensorflow/core/kernels/mkl_cwise_ops_common.cc
index c065724e0dbbe091d253eb2315c9a5f3c041d695..58f0c30f32b0eebd7ceff856b2e3bd881b28121c 100644
--- a/tensorflow/core/kernels/mkl_cwise_ops_common.cc
+++ b/tensorflow/core/kernels/mkl_cwise_ops_common.cc
@@ -1,4 +1,4 @@
-﻿/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0(the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
index bc9e906c39a9a7f5f4b2ae83afc6774aecb38c48..8313224d7fe3e2d307d3642ced5b277b95c85cdb 100644
--- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -25,10 +25,24 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
+#ifndef INTEL_MKL_ML
+#include "mkldnn.hpp"
+
+using mkldnn::batch_normalization_backward;
+using mkldnn::batch_normalization_forward;
+using mkldnn::prop_kind;
+using mkldnn::stream;
+using mkldnn::use_global_stats;
+using mkldnn::use_scale_shift;
+#endif
+
 // TODO(inteltf) Address comments from PR 8968.
 
 namespace tensorflow {
 using CPUDevice = Eigen::ThreadPoolDevice;
+
+#ifdef INTEL_MKL_ML
+
 template <typename Device, typename T>
 class MklFusedBatchNormOp : public OpKernel {
  public:
@@ -46,7 +60,6 @@ class MklFusedBatchNormOp : public OpKernel {
 
   void Compute(OpKernelContext* context) override {
     MklFusedBatchNormOpContext mkl_context;
-
     const Tensor& input = MklGetInput(context, 0);
     const Tensor& scale = MklGetInput(context, 1);
     const Tensor& shift = MklGetInput(context, 2);
@@ -55,6 +68,7 @@ class MklFusedBatchNormOp : public OpKernel {
 
     GetMklShape(context, 0, &(mkl_context.mkl_shape_input_shape));
     bool input_in_mkl_format = mkl_context.mkl_shape_input_shape.IsMklTensor();
+
     if (!input_in_mkl_format) {
       OP_REQUIRES(context, input.dims() == 4,
                   errors::InvalidArgument("input must be 4-dimensional",
@@ -69,10 +83,12 @@ class MklFusedBatchNormOp : public OpKernel {
     OP_REQUIRES(context, est_mean.dims() == 1,
                 errors::InvalidArgument("estimated_mean must be 1-dimensional",
                                         est_mean.shape().DebugString()));
+
     OP_REQUIRES(
         context, est_variance.dims() == 1,
         errors::InvalidArgument("estimated_variance must be 1-dimensional",
                                 est_variance.shape().DebugString()));
+
     if (is_training_) {
       OP_REQUIRES(context, est_mean.dim_size(0) == 0,
                   errors::InvalidArgument("estimated_mean empty for training",
@@ -258,7 +274,6 @@ class MklFusedBatchNormOp : public OpKernel {
             E_SUCCESS);
       }
     }
-
     void MklPrepareContextInputs(OpKernelContext* context,
                                  Tensor* mkl_tmp_input_buf_tensor,
                                  Tensor* mkl_tmp_scale_shift_buf_tensor) {
@@ -325,15 +340,6 @@ class MklFusedBatchNormOp : public OpKernel {
   } MklFusedBatchNormOpContext;
 };
 
-#define REGISTER_MKL_CPU(T)                                         \
-  REGISTER_KERNEL_BUILDER(Name("_MklFusedBatchNorm")                \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-                          MklFusedBatchNormOp<CPUDevice, T>);
-TF_CALL_float(REGISTER_MKL_CPU);
-#undef REGISTER_MKL_CPU
-
 template <typename Device, typename T>
 class MklFusedBatchNormGradOp : public OpKernel {
  public:
@@ -675,6 +681,684 @@ class MklFusedBatchNormGradOp : public OpKernel {
     }
   } MklFusedBatchNormGradOpContext;
 };
+#endif
+
+#ifndef INTEL_MKL_ML
+
+template <typename Device, typename T>
+class MklFusedBatchNormOp : public OpKernel {
+ public:
+  explicit MklFusedBatchNormOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    float epsilon;
+    OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon));
+    epsilon_ = T(epsilon);
+    string tensor_format;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &tensor_format));
+    OP_REQUIRES(context, FormatFromString(tensor_format, &tensor_format_),
+                errors::InvalidArgument("Invalid data format"));
+    OP_REQUIRES_OK(context, context->GetAttr("is_training", &is_training_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      const size_t kSrcIndex = 0;       // index of src input tensor
+      const size_t kScaleIndex = 1;     // index of scale tensor
+      const size_t kShiftIndex = 2;     // index of shift tensor
+      const size_t kMeanIndex = 3;      // index of est_mean tensor
+      const size_t kVarianceIndex = 4;  // index of est_variance tensor
+
+      const Tensor& src_tensor = MklGetInput(context, kSrcIndex);
+      const Tensor& scale_tensor = MklGetInput(context, kScaleIndex);
+      const Tensor& shift_tensor = MklGetInput(context, kShiftIndex);
+      const Tensor& est_mean_tensor = MklGetInput(context, kMeanIndex);
+      const Tensor& est_variance_tensor = MklGetInput(context, kVarianceIndex);
+
+      TensorShape tf_shape_src;
+      MklDnnShape dnn_shape_src;
+      GetMklShape(context, kSrcIndex, &dnn_shape_src);
+
+      if (dnn_shape_src.IsMklTensor()) {
+        tf_shape_src = dnn_shape_src.GetTfShape();
+        OP_REQUIRES(context, dnn_shape_src.GetDimension() == 4,
+                    errors::InvalidArgument("input must be 4-dimensional",
+                                            src_tensor.shape().DebugString()));
+      } else {
+        tf_shape_src = src_tensor.shape();
+        OP_REQUIRES(context, src_tensor.dims() == 4,
+                    errors::InvalidArgument("input must be 4-dimensional",
+                                            src_tensor.shape().DebugString()));
+      }
+      OP_REQUIRES(context, scale_tensor.dims() == 1,
+                  errors::InvalidArgument("scale must be 1-dimensional",
+                                          scale_tensor.shape().DebugString()));
+      OP_REQUIRES(context, shift_tensor.dims() == 1,
+                  errors::InvalidArgument("offset must be 1-dimensional",
+                                          shift_tensor.shape().DebugString()));
+      OP_REQUIRES(
+          context, est_mean_tensor.dims() == 1,
+          errors::InvalidArgument("estimated_mean must be 1-dimensional",
+                                  est_mean_tensor.shape().DebugString()));
+      OP_REQUIRES(
+          context, est_variance_tensor.dims() == 1,
+          errors::InvalidArgument("estimated_variance must be 1-dimensional",
+                                  est_variance_tensor.shape().DebugString()));
+
+      if (is_training_) {
+        OP_REQUIRES(
+            context, est_mean_tensor.dim_size(0) == 0,
+            errors::InvalidArgument("estimated_mean must be empty for training",
+                                    est_mean_tensor.shape().DebugString()));
+        OP_REQUIRES(context, est_variance_tensor.dim_size(0) == 0,
+                    errors::InvalidArgument(
+                        "estimated_variance must be empty for training",
+                        est_variance_tensor.shape().DebugString()));
+      }
+
+      // special case: input with 0 element and 0 batch size
+      Tensor* dst_tensor = nullptr;
+      if (tf_shape_src.num_elements() == 0) {
+        HandleEmptyInput(context, tf_shape_src, scale_tensor.shape(),
+                         &dst_tensor);
+        return;
+      }
+
+      if (dnn_shape_src.IsMklTensor())
+        depth_ = dnn_shape_src.DimSize(MklDnnDims::Dim_C);
+      else
+        ExtractParams(context);
+
+      // Indices of output tensors
+      const size_t kDstIndex = 0;
+
+      // allocate 4 output TF tensors
+      Tensor* batch_mean_tensor = nullptr;
+      Tensor* batch_variance_tensor = nullptr;
+      Tensor* saved_mean_tensor = nullptr;
+      Tensor* saved_variance_tensor = nullptr;
+      AllocateTFOutputs(context, scale_tensor.shape(), &batch_mean_tensor,
+                        &batch_variance_tensor, &saved_mean_tensor,
+                        &saved_variance_tensor);
+
+      if (is_training_)
+        SetMeanVariance(*batch_mean_tensor, *batch_variance_tensor);
+      else
+        SetMeanVariance(est_mean_tensor, est_variance_tensor);
+
+      MklDnnData<T> src(&cpu_engine);
+      MklDnnData<T> dst(&cpu_engine);
+
+      memory::format format_m;
+      if (dnn_shape_src.IsMklTensor()) {
+        if (dnn_shape_src.IsTensorInNCHWFormat()) {
+          format_m = memory::format::nchw;
+        } else {
+          format_m = memory::format::nhwc;
+        }
+      } else {
+        format_m = TFDataFormatToMklDnnDataFormat(tensor_format_);
+      }
+
+      // set src primitive
+      memory::dims src_dims;
+      if (dnn_shape_src.IsMklTensor()) {
+        src_dims = TFShapeToMklDnnDimsInNCHW(dnn_shape_src.GetTfShape(),
+                                             tensor_format_);
+      } else {
+        src_dims =
+            TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), tensor_format_);
+      }
+
+      auto src_md = dnn_shape_src.IsMklTensor()
+                        ? dnn_shape_src.GetMklLayout()
+                        : memory::desc(src_dims, MklDnnType<T>(), format_m);
+      src.SetUsrMem(src_md, &src_tensor);
+
+      // set weights primitive
+      // MKL-DNN packs scale & shift as "weights":
+      // <scale>...<scale><shift>...<shift>
+      auto weights_desc =
+          memory::desc({2, depth_}, MklDnnType<T>(), memory::format::nc);
+      auto weights_pd = memory::primitive_desc(weights_desc, cpu_engine);
+      auto weights_m = memory(weights_pd);
+      T* weights_data = reinterpret_cast<T*>(weights_m.get_data_handle());
+      T* scale_tf =
+          reinterpret_cast<T*>(const_cast<T*>(scale_tensor.flat<T>().data()));
+      T* shift_tf =
+          reinterpret_cast<T*>(const_cast<T*>(shift_tensor.flat<T>().data()));
+
+      for (int k = 0; k < depth_; k++) {
+        weights_data[k] = scale_tf[k];
+        weights_data[k + depth_] = shift_tf[k];
+      }
+
+      // set mean primitive
+      auto mean_desc =
+          memory::desc({1, depth_}, MklDnnType<T>(), memory::format::nc);
+      auto mean_pd = memory::primitive_desc(mean_desc, cpu_engine);
+      char* saved_mean_data_tf =
+          reinterpret_cast<char*>(saved_mean_tensor->flat<T>().data());
+      std::memcpy(saved_mean_data_tf, reinterpret_cast<char*>(mean_values_),
+                  depth_ * sizeof(T));
+      auto mean_m =
+          memory(mean_pd, reinterpret_cast<void*>(saved_mean_data_tf));
+
+      // set variance primitive
+      auto variance_desc =
+          memory::desc({1, depth_}, MklDnnType<T>(), memory::format::nc);
+      auto variance_pd = memory::primitive_desc(variance_desc, cpu_engine);
+      char* saved_variance_data_tf =
+          reinterpret_cast<char*>(saved_variance_tensor->flat<T>().data());
+      std::memcpy(saved_variance_data_tf,
+                  reinterpret_cast<char*>(variance_values_),
+                  depth_ * sizeof(T));
+      auto variance_m = memory(variance_pd, saved_variance_data_tf);
+
+      prop_kind pk = (is_training_) ? prop_kind::forward_training
+                                    : prop_kind::forward_scoring;
+      auto bnrm_fwd_desc = batch_normalization_forward::desc(
+          pk, src.GetUsrMemDesc(), epsilon_,
+          is_training_ ? use_scale_shift
+                       : (use_scale_shift | use_global_stats));
+      auto bnrm_fwd_pd = batch_normalization_forward::primitive_desc(
+          bnrm_fwd_desc, cpu_engine);
+
+      // allocate dst tensor
+      MklDnnShape dnn_shape_dst;
+      TensorShape tf_shape_dst;
+      if (dnn_shape_src.IsMklTensor()) {
+        dnn_shape_dst.SetMklTensor(true);
+        auto dst_pd = bnrm_fwd_pd.dst_primitive_desc();
+        dnn_shape_dst.SetMklLayout(&dst_pd);
+        dnn_shape_dst.SetElemType(MklDnnType<T>());
+        dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(), src_dims,
+                                  format_m);
+        tf_shape_dst.AddDim(dst_pd.get_size() / sizeof(T));
+      } else {
+        dnn_shape_dst.SetMklTensor(false);
+        tf_shape_dst = src_tensor.shape();
+      }
+      AllocateOutputSetMklShape(context, kDstIndex, &dst_tensor, tf_shape_dst,
+                                dnn_shape_dst);
+
+      // Output of batchnorm has same shape as input.
+      dst.SetUsrMem(src_md, dst_tensor);
+
+      primitive bnrm_fwd_op;
+      if (is_training_) {
+        bnrm_fwd_op =
+            batch_normalization_forward(bnrm_fwd_pd, src.GetOpMem(), weights_m,
+                                        dst.GetOpMem(), mean_m, variance_m);
+      } else {
+        bnrm_fwd_op = batch_normalization_forward(
+            bnrm_fwd_pd, src.GetOpMem(), mean_m, variance_m,
+            (const primitive::at)weights_m, dst.GetOpMem());
+      }
+      std::vector<primitive> net;
+      net.push_back(bnrm_fwd_op);
+      stream(stream::kind::eager).submit(net).wait();
+
+      // copy batch_mean data
+      T* batch_mean_data_tf =
+          reinterpret_cast<T*>(batch_mean_tensor->flat<T>().data());
+      std::memcpy(reinterpret_cast<char*>(batch_mean_data_tf),
+                  reinterpret_cast<char*>(mean_m.get_data_handle()),
+                  depth_ * sizeof(T));
+
+      // copy batch_variance data with Bessel's correction
+      // if training mode is on
+      float adjust_factor = 1.0;
+      if (is_training_) {
+        size_t orig_size = src_dims[0] * src_dims[2] * src_dims[3];
+        size_t adjust_size = orig_size - 1;
+        adjust_factor = (static_cast<float>(orig_size)) / adjust_size;
+      }
+      for (int k = 0; k < depth_; k++)
+        batch_variance_tensor->flat<T>().data()[k] =
+            (reinterpret_cast<T*>(variance_m.get_data_handle()))[k] *
+            adjust_factor;
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
+    }
+  }
+
+ private:
+  T epsilon_;
+  TensorFormat tensor_format_;
+  bool is_training_;
+  T* mean_values_;
+  T* variance_values_;
+  size_t depth_;  // batch normalization is done for per channel.
+
+  void ExtractParams(OpKernelContext* context) {
+    const Tensor& input = MklGetInput(context, 0);
+    depth_ = static_cast<int>(GetTensorDim(input, tensor_format_, 'C'));
+  }
+
+  void SetMeanVariance(const Tensor& mean, const Tensor& variance) {
+    mean_values_ = reinterpret_cast<T*>(const_cast<T*>(mean.flat<T>().data()));
+    variance_values_ =
+        reinterpret_cast<T*>(const_cast<T*>(variance.flat<T>().data()));
+  }
+
+  void HandleEmptyInput(OpKernelContext* context, TensorShape tf_shape_src,
+                        TensorShape tf_shape_scale, Tensor** dst_tensor) {
+    CHECK_NOTNULL(dst_tensor);
+
+    const size_t kDstIndex = 0;
+    MklDnnShape dnn_shape_dst;
+    dnn_shape_dst.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, kDstIndex, dst_tensor, tf_shape_src,
+                              dnn_shape_dst);
+    CHECK_NOTNULL(*dst_tensor);
+    memset(const_cast<char*>((*dst_tensor)->tensor_data().data()), 0,
+           (*dst_tensor)->tensor_data().size());
+
+    Tensor* batch_mean_tensor = nullptr;
+    Tensor* batch_variance_tensor = nullptr;
+    Tensor* saved_mean_tensor = nullptr;
+    Tensor* saved_variance_tensor = nullptr;
+    AllocateTFOutputs(context, tf_shape_scale, &batch_mean_tensor,
+                      &batch_variance_tensor, &saved_mean_tensor,
+                      &saved_variance_tensor);
+  }
+
+  void AllocateTFOutputs(OpKernelContext* context, TensorShape tf_shape_scale,
+                         Tensor** batch_mean_tensor,
+                         Tensor** batch_variance_tensor,
+                         Tensor** saved_mean_tensor,
+                         Tensor** saved_variance_tensor) {
+    CHECK_NOTNULL(batch_mean_tensor);
+    CHECK_NOTNULL(batch_variance_tensor);
+    CHECK_NOTNULL(saved_mean_tensor);
+    CHECK_NOTNULL(saved_variance_tensor);
+
+    const size_t kBatchMeanIndex = 1;
+    const size_t kBatchVarianceIndex = 2;
+    const size_t kSavedMeanIndex = 3;
+    const size_t kSavedVarianceIndex = 4;
+
+    // allocate batch mean output tensor
+    MklDnnShape mkl_shape_batch_mean;
+    mkl_shape_batch_mean.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, kBatchMeanIndex, batch_mean_tensor,
+                              tf_shape_scale, mkl_shape_batch_mean);
+    CHECK_NOTNULL(*batch_mean_tensor);
+    // set NAN mean value in case of empty input tensor
+    for (int k = 0; k < tf_shape_scale.num_elements(); k++)
+      (*batch_mean_tensor)->flat<T>().data()[k] = NAN;
+
+    // allocate batch variance output tensor
+    MklDnnShape mkl_shape_batch_variance;
+    mkl_shape_batch_variance.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, kBatchVarianceIndex,
+                              batch_variance_tensor, tf_shape_scale,
+                              mkl_shape_batch_variance);
+    CHECK_NOTNULL(*batch_variance_tensor);
+    // set NAN variance value in case of empty input tensor
+    for (int k = 0; k < tf_shape_scale.num_elements(); k++)
+      (*batch_variance_tensor)->flat<T>().data()[k] = NAN;
+
+    // Mean and variance (without Bessel's correction) saved for backward
+    // computation to serve as pre-computed mean and variance.
+    MklDnnShape mkl_shape_saved_mean;
+    mkl_shape_saved_mean.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, kSavedMeanIndex, saved_mean_tensor,
+                              tf_shape_scale, mkl_shape_saved_mean);
+    CHECK_NOTNULL(*saved_mean_tensor);
+    // set NAN mean value in case of empty input tensor
+    for (int k = 0; k < tf_shape_scale.num_elements(); k++)
+      (*saved_mean_tensor)->flat<T>().data()[k] = NAN;
+
+    MklDnnShape mkl_shape_saved_variance;
+    mkl_shape_saved_variance.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, kSavedVarianceIndex,
+                              saved_variance_tensor, tf_shape_scale,
+                              mkl_shape_saved_variance);
+    CHECK_NOTNULL(*saved_variance_tensor);
+    // set NAN variance value in case of empty input tensor
+    for (int k = 0; k < tf_shape_scale.num_elements(); k++)
+      (*saved_variance_tensor)->flat<T>().data()[k] = NAN;
+  }
+};
+
+template <typename Device, typename T>
+class MklFusedBatchNormGradOp : public OpKernel {
+ public:
+  explicit MklFusedBatchNormGradOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    float epsilon;
+    OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon));
+    epsilon_ = T(epsilon);
+    string tensor_format;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &tensor_format));
+    OP_REQUIRES(context, FormatFromString(tensor_format, &tensor_format_),
+                errors::InvalidArgument("Invalid data format"));
+    OP_REQUIRES_OK(context, context->GetAttr("is_training", &is_training_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      const size_t kDiffDstIndex = 0;   // index of diff_dst tensor
+      const size_t kSrcIndex = 1;       // index of src input tensor
+      const size_t kScaleIndex = 2;     // index of scale tensor
+      const size_t kMeanIndex = 3;      // index of saved_mean tensor
+      const size_t kVarianceIndex = 4;  // index of saved_variance tensor
+      const Tensor& diff_dst_tensor = MklGetInput(context, kDiffDstIndex);
+      const Tensor& src_tensor = MklGetInput(context, kSrcIndex);
+      const Tensor& scale_tensor = MklGetInput(context, kScaleIndex);
+      const Tensor& saved_mean_tensor = MklGetInput(context, kMeanIndex);
+      const Tensor& saved_variance_tensor =
+          MklGetInput(context, kVarianceIndex);
+
+      MklDnnShape dnn_shape_src, dnn_shape_diff_dst;
+      GetMklShape(context, kSrcIndex, &dnn_shape_src);
+      GetMklShape(context, kDiffDstIndex, &dnn_shape_diff_dst);
+      TensorShape tf_shape_src, tf_shape_diff_dst;
+
+      if (dnn_shape_diff_dst.IsMklTensor()) {
+        tf_shape_diff_dst = dnn_shape_diff_dst.GetTfShape();
+        OP_REQUIRES(
+            context, dnn_shape_diff_dst.GetDimension() == 4,
+            errors::InvalidArgument("input must be 4-dimensional",
+                                    diff_dst_tensor.shape().DebugString()));
+      } else {
+        tf_shape_diff_dst = diff_dst_tensor.shape();
+        OP_REQUIRES(
+            context, diff_dst_tensor.dims() == 4,
+            errors::InvalidArgument("input must be 4-dimensional",
+                                    diff_dst_tensor.shape().DebugString()));
+      }
+
+      if (dnn_shape_src.IsMklTensor()) {
+        tf_shape_src = dnn_shape_src.GetTfShape();
+        OP_REQUIRES(context, dnn_shape_src.GetDimension() == 4,
+                    errors::InvalidArgument("input must be 4-dimensional",
+                                            src_tensor.shape().DebugString()));
+      } else {
+        tf_shape_src = src_tensor.shape();
+        OP_REQUIRES(context, src_tensor.dims() == 4,
+                    errors::InvalidArgument("input must be 4-dimensional",
+                                            src_tensor.shape().DebugString()));
+      }
+
+      OP_REQUIRES(context, scale_tensor.dims() == 1,
+                  errors::InvalidArgument("scale must be 1-dimensional",
+                                          scale_tensor.shape().DebugString()));
+      OP_REQUIRES(
+          context, saved_mean_tensor.dims() == 1,
+          errors::InvalidArgument("saved mean must be 1-dimensional",
+                                  saved_mean_tensor.shape().DebugString()));
+
+      OP_REQUIRES(
+          context, saved_variance_tensor.dims() == 1,
+          errors::InvalidArgument("saved variance must be 1-dimensional",
+                                  saved_variance_tensor.shape().DebugString()));
+
+      Tensor* diff_src_tensor = nullptr;
+      if (tf_shape_src.num_elements() == 0 ||
+          tf_shape_diff_dst.num_elements() == 0) {
+        HandleEmptyInput(context, tf_shape_src, scale_tensor.shape(),
+                         &diff_src_tensor);
+        return;
+      }
+
+      if (dnn_shape_src.IsMklTensor())
+        depth_ = dnn_shape_src.DimSize(MklDnnDims::Dim_C);
+      else
+        ExtractParams(context);
+
+      memory::format format_m;
+      if (dnn_shape_src.IsMklTensor()) {
+        if (dnn_shape_src.IsTensorInNCHWFormat())
+          format_m = memory::format::nchw;
+        else
+          format_m = memory::format::nhwc;
+      } else {
+        format_m = TFDataFormatToMklDnnDataFormat(tensor_format_);
+      }
+
+      MklDnnData<T> src(&cpu_engine);
+      MklDnnData<T> mean(&cpu_engine);
+      MklDnnData<T> variance(&cpu_engine);
+      MklDnnData<T> diff_dst(&cpu_engine);
+      MklDnnData<T> diff_src(&cpu_engine);
+
+      memory::dims src_dims, diff_dst_dims;
+      if (dnn_shape_src.IsMklTensor())
+        src_dims = TFShapeToMklDnnDimsInNCHW(dnn_shape_src.GetTfShape(),
+                                             tensor_format_);
+      else
+        src_dims =
+            TFShapeToMklDnnDimsInNCHW(src_tensor.shape(), tensor_format_);
+
+      if (dnn_shape_diff_dst.IsMklTensor())
+        diff_dst_dims = TFShapeToMklDnnDimsInNCHW(
+            dnn_shape_diff_dst.GetTfShape(), tensor_format_);
+      else
+        diff_dst_dims =
+            TFShapeToMklDnnDimsInNCHW(diff_dst_tensor.shape(), tensor_format_);
+
+      // set src and diff_dst primitives
+      memory::desc src_md({}, memory::data_undef, memory::format_undef);
+      memory::desc diff_dst_md({}, memory::data_undef, memory::format_undef);
+      if (dnn_shape_src.IsMklTensor() || dnn_shape_diff_dst.IsMklTensor()) {
+        if (dnn_shape_src.IsMklTensor()) {
+          src_md = dnn_shape_src.GetMklLayout();
+          diff_dst_md = src_md;
+        } else {
+          diff_dst_md = dnn_shape_diff_dst.GetMklLayout();
+          src_md = diff_dst_md;
+        }
+      } else {
+        src_md = memory::desc(src_dims, MklDnnType<T>(), format_m);
+        diff_dst_md = src_md;
+      }
+      src.SetUsrMem(src_md, &src_tensor);
+      diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
+
+      // weights -- DNN packs scales/shifts as weights in order of
+      // scale, ..., scale, shift, ..., shift
+      auto weights_desc =
+          memory::desc({2, depth_}, MklDnnType<T>(), memory::format::nc);
+      auto weights_pd = memory::primitive_desc(weights_desc, cpu_engine);
+      auto weights_m = memory(weights_pd);
+      T* weights_data = reinterpret_cast<T*>(weights_m.get_data_handle());
+      T* scale_tf =
+          reinterpret_cast<T*>(const_cast<T*>(scale_tensor.flat<T>().data()));
+      for (int k = 0; k < depth_; k++) {
+        weights_data[k] = scale_tf[k];
+        weights_data[k + depth_] = 0;
+      }
+
+      // set mean primitive
+      memory::dims mv_dims = GetMeanVarianceDims();
+      mean.SetUsrMem(mv_dims, memory::format::nc,
+                     const_cast<void*>(static_cast<const void*>(
+                         saved_mean_tensor.flat<T>().data())));
+      mean.SetOpMemDesc(mv_dims, memory::format::nc);
+
+      // set variance primitive
+      variance.SetUsrMem(mv_dims, memory::format::nc,
+                         const_cast<void*>(static_cast<const void*>(
+                             saved_variance_tensor.flat<T>().data())));
+      variance.SetOpMemDesc(mv_dims, memory::format::nc);
+
+      // set diff_weight primitive
+      auto diff_weights_desc =
+          memory::desc({2, depth_}, MklDnnType<T>(), memory::format::nc);
+      auto diff_weights_pd =
+          memory::primitive_desc(diff_weights_desc, cpu_engine);
+      auto diff_weights_m = memory(diff_weights_pd);
+
+      auto bnrm_fwd_desc = batch_normalization_forward::desc(
+          prop_kind::forward_training, src.GetUsrMemDesc(), epsilon_,
+          is_training_ ? use_scale_shift
+                       : (use_scale_shift | use_global_stats));
+      auto bnrm_fwd_pd = batch_normalization_forward::primitive_desc(
+          bnrm_fwd_desc, cpu_engine);
+
+      // Indices of output tensors
+      const size_t kDiffSrcIndex = 0;  // index of diff_src tensor
+
+      // allocate diff_src tensor
+      MklDnnShape dnn_shape_diff_src;
+      TensorShape tf_shape_diff_src;
+      if (dnn_shape_src.IsMklTensor()) {
+        dnn_shape_diff_src.SetMklTensor(true);
+        auto diff_src_pd = bnrm_fwd_pd.dst_primitive_desc();
+        dnn_shape_diff_src.SetMklLayout(&diff_src_pd);
+        dnn_shape_diff_src.SetElemType(MklDnnType<T>());
+        dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(), src_dims,
+                                       format_m);
+        dnn_shape_diff_src.SetTfDimOrder(dnn_shape_src.GetDimension(),
+                                         tensor_format_);
+        tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T));
+      } else {
+        dnn_shape_diff_src.SetMklTensor(false);
+        tf_shape_diff_src = src_tensor.shape();
+      }
+      AllocateOutputSetMklShape(context, kDiffSrcIndex, &diff_src_tensor,
+                                tf_shape_diff_src, dnn_shape_diff_src);
+
+      diff_src.SetUsrMem(src_md, diff_src_tensor);
+
+      prop_kind pk = prop_kind::backward;
+      auto bnrm_bwd_desc = batch_normalization_backward::desc(
+          pk, diff_src.GetUsrMemDesc(), src.GetUsrMemDesc(), epsilon_,
+          /* for inference, specify use_global_stats
+             1. on fwd prop, use mean and variance
+                provided as inputs
+             2. on bwd prop, mean and variance are
+                considered as constants. Thus,
+                reduce the amout of MKL computations
+          */
+          is_training_ ? use_scale_shift
+                       : (use_scale_shift | use_global_stats));
+      auto bnrm_bwd_pd = batch_normalization_backward::primitive_desc(
+          bnrm_bwd_desc, cpu_engine, bnrm_fwd_pd);
+
+      auto bnrm_bwd_op = batch_normalization_backward(
+          bnrm_bwd_pd, src.GetOpMem(), mean.GetOpMem(), variance.GetOpMem(),
+          diff_dst.GetOpMem(), weights_m, diff_src.GetOpMem(), diff_weights_m);
+
+      std::vector<primitive> net;
+      net.push_back(bnrm_bwd_op);
+      stream(stream::kind::eager).submit(net).wait();
+
+      // allocate 4 output TF tensors
+      Tensor* diff_scale_tensor = nullptr;
+      Tensor* diff_shift_tensor = nullptr;
+      AllocateTFOutputs(context, scale_tensor.shape(), &diff_scale_tensor,
+                        &diff_shift_tensor);
+
+      // copy data: diff_scale and diff_shift
+      T* diff_weights_data_dnn =
+          reinterpret_cast<T*>(diff_weights_m.get_data_handle());
+      for (int i = 0; i < depth_; i++) {
+        diff_scale_tensor->flat<T>().data()[i] = diff_weights_data_dnn[i];
+        diff_shift_tensor->flat<T>().data()[i] =
+            diff_weights_data_dnn[i + depth_];
+      }
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
+    }
+  }
+
+ private:
+  T epsilon_;
+  TensorFormat tensor_format_;
+  int depth_;  // batch normalization is done for per channel.
+  bool is_training_;
+
+  void ExtractParams(OpKernelContext* context) {
+    const Tensor& input = MklGetInput(context, 0);
+    depth_ = static_cast<int>(GetTensorDim(input, tensor_format_, 'C'));
+  }
+
+  void HandleEmptyInput(OpKernelContext* context, TensorShape tf_shape_src,
+                        TensorShape tf_shape_scale_shift,
+                        Tensor** diff_src_tensor) {
+    const size_t kDiffSrcIndex = 0;
+
+    MklDnnShape dnn_shape_diff_src;
+    dnn_shape_diff_src.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, kDiffSrcIndex, diff_src_tensor,
+                              tf_shape_src, dnn_shape_diff_src);
+    for (size_t i = 0; i < (*diff_src_tensor)->shape().num_elements(); i++)
+      (*diff_src_tensor)->flat<T>().data()[i] = 0;
+
+    Tensor* diff_scale_tensor = nullptr;
+    Tensor* diff_shift_tensor = nullptr;
+    AllocateTFOutputs(context, tf_shape_scale_shift, &diff_scale_tensor,
+                      &diff_shift_tensor);
+  }
+
+  void AllocateTFOutputs(OpKernelContext* context,
+                         TensorShape tf_shape_scale_shift,
+                         Tensor** diff_scale_tensor,
+                         Tensor** diff_shift_tensor) {
+    CHECK_NOTNULL(diff_scale_tensor);
+    CHECK_NOTNULL(diff_shift_tensor);
+
+    const size_t kDiffScaleIndex = 1;
+    const size_t kDiffShiftIndex = 2;
+    const size_t kP1Index = 3;
+    const size_t kP2Index = 4;
+
+    // separate out scale and shift grad and copy to individual tensors
+    MklDnnShape mkl_shape_diff_scale;
+    mkl_shape_diff_scale.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, kDiffScaleIndex, diff_scale_tensor,
+                              tf_shape_scale_shift, mkl_shape_diff_scale);
+    CHECK_NOTNULL(*diff_scale_tensor);
+    for (size_t i = 0; i < (*diff_scale_tensor)->shape().num_elements(); i++)
+      (*diff_scale_tensor)->flat<T>().data()[i] = 0;
+
+    MklDnnShape mkl_shape_diff_shift;
+    mkl_shape_diff_shift.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, kDiffShiftIndex, diff_shift_tensor,
+                              tf_shape_scale_shift, mkl_shape_diff_shift);
+    CHECK_NOTNULL(*diff_shift_tensor);
+    for (size_t i = 0; i < (*diff_shift_tensor)->shape().num_elements(); i++)
+      (*diff_shift_tensor)->flat<T>().data()[i] = 0;
+
+    // Placeholders for estimated_mean and estimated_variance, which are
+    // used for inference and thus not needed here for gradient computation.
+    Tensor *p1_tensor = nullptr, *p2_tensor = nullptr;
+    MklDnnShape mkl_shape_p;
+    mkl_shape_p.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, kP1Index, &p1_tensor, TensorShape({}),
+                              mkl_shape_p);
+    AllocateOutputSetMklShape(context, kP2Index, &p2_tensor, TensorShape({}),
+                              mkl_shape_p);
+  }
+
+  memory::dims GetMeanVarianceDims() { return memory::dims({1, depth_}); }
+};
+
+#endif
+
+#define REGISTER_MKL_CPU(T)                                         \
+  REGISTER_KERNEL_BUILDER(Name("_MklFusedBatchNorm")                \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklFusedBatchNormOp<CPUDevice, T>);
+TF_CALL_float(REGISTER_MKL_CPU);
+#undef REGISTER_MKL_CPU
 
 #define REGISTER_MKL_CPU(T)                                         \
   REGISTER_KERNEL_BUILDER(Name("_MklFusedBatchNormGrad")            \
diff --git a/tensorflow/core/kernels/mkl_identity_op.cc b/tensorflow/core/kernels/mkl_identity_op.cc
index f31e7afd46873a02c10277283862a7e5e2384803..6c027f8e728b8660d18a70ae58995fa104f0b375 100644
--- a/tensorflow/core/kernels/mkl_identity_op.cc
+++ b/tensorflow/core/kernels/mkl_identity_op.cc
@@ -28,8 +28,15 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
+#ifndef INTEL_MKL_ML
+#include "mkldnn.hpp"
+#endif
+
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
+
+#ifdef INTEL_MKL_ML
+
 template <typename Device, typename T>
 class MklIdentityOp : public OpKernel {
  public:
@@ -50,6 +57,32 @@ class MklIdentityOp : public OpKernel {
   bool IsExpensive() override { return false; }
 };
 
+#else
+
+template <typename Device, typename T>
+class MklIdentityOp : public OpKernel {
+ public:
+  explicit MklIdentityOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    MklDnnShape dnn_shape_input;
+    const int kInputIdx = 0, kOutputIdx = 0;
+    GetMklShape(context, kInputIdx, &dnn_shape_input);
+
+    if (dnn_shape_input.IsMklTensor()) {
+      ForwardMklTensorInToOut(context, kInputIdx, kOutputIdx);
+    } else {
+      ForwardTfTensorInToOut(context, kInputIdx, kOutputIdx);
+    }
+  }
+
+  // TensorFlow's IdentityOp has the following member function, so kept it
+  // as it is.
+  bool IsExpensive() override { return false; }
+};
+
+#endif
+
 #define REGISTER_MKL_CPU(T)                                         \
   REGISTER_KERNEL_BUILDER(Name("_MklIdentity")                      \
                               .Device(DEVICE_CPU)                   \
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index b58e44e39800c8c047d5557ab3c84113bb78d3ca..5a8799ae93c1bb3a53f19036c7bb13874a80d7fa 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -31,6 +31,12 @@ limitations under the License.
 #include "tensorflow/core/kernels/mkl_tfconv_op.h"
 #include "tensorflow/core/util/mkl_util.h"
 
+#ifndef INTEL_MKL_ML
+#include "mkldnn.hpp"
+
+using mkldnn::stream;
+#endif
+
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
@@ -44,15 +50,16 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 // else if both inputs are in mkl format:
 //   if both have the same shape:
 //     pass the inputs through to the output
-// 	else:
-// 		convert both to TF
+//   else:
+//     convert both to TF
 // else if one is TF and one is MKL:
-// 	if broadcast is needed:
-// 		convert the MKL format input to TF format
-// 	else:
-// 		convert the TF format input to MKL format
+//   if broadcast is needed:
+//     convert the MKL format input to TF format
+//   else:
+//     convert the TF format input to MKL format
 ///////////////////////////////////////////////////////////
 
+#ifdef INTEL_MKL_ML
 template <typename Device, typename T>
 class MklInputConversionOp : public OpKernel {
  public:
@@ -242,6 +249,243 @@ class MklInputConversionOp : public OpKernel {
   bool has_avx512f_ = false;
 };
 
+#else
+
+template <typename Device, typename T>
+class MklInputConversionOp : public OpKernel {
+ public:
+  explicit MklInputConversionOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES_OK(context, context->GetAttr("T", &op_data_type));
+    has_avx512f_ = port::TestCPUFeature(port::CPUFeature::AVX512F);
+  }
+
+ private:
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input_tensor_0 = MklGetInput(context, 0);
+    MklDnnShape input_shape_0;
+    GetMklShape(context, 0, &input_shape_0);
+
+    const Tensor& input_tensor_1 = MklGetInput(context, 1);
+    MklDnnShape input_shape_1;
+    GetMklShape(context, 1, &input_shape_1);
+
+    bool tf_shapes_are_same =
+        context->input(0).shape() == context->input(1).shape();
+
+    VLOG(1) << "MklInputConversionOp: Input shapes are "
+            << (tf_shapes_are_same ? "*same*" : "*different*") << ": "
+            << context->input(0).shape().DebugString() << " and "
+            << context->input(1).shape().DebugString();
+
+    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    // if both inputs are in TF format, just copy input tensors to output.
+    if (!input_shape_0.IsMklTensor() && !input_shape_1.IsMklTensor()) {
+      VLOG(1) << "MklInputConversionOp: No conversion needed, "
+              << "copying TF inputs to output";
+
+      ForwardTfTensorInToOut(context, 0, 0);
+      ForwardTfTensorInToOut(context, 1, 1);
+      return;
+    }
+
+    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    // If both inputs are in MKL format
+    if (input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
+      if (tf_shapes_are_same) {
+        auto input0_md = input_shape_0.GetMklLayout();
+        auto input1_md = input_shape_1.GetMklLayout();
+
+        // If both have the same shape and same format, pass them through
+        if (input0_md.data.format == input1_md.data.format) {
+          VLOG(1) << "MklInputConversionOp: No conversion needed, "
+                  << "copying MKL inputs with identical shapes to output";
+
+          ForwardMklTensorInToOut(context, 0, 0);
+          ForwardMklTensorInToOut(context, 1, 1);
+          return;
+        } else {
+          VLOG(1) << "MklInputConversionOp: Shape is same, but format is "
+                     "different, "
+                  << "need to convert to same format";
+
+          // Convert input0, and keep input1 unchanged
+          // Create MklDnnShape for output mkl tensor based on input0
+          Tensor* tensor_out;
+          MklDnnShape mkl_output_mkl_shape;
+          mkl_output_mkl_shape.SetMklTensor(true);
+          mkl_output_mkl_shape.SetElemType(MklDnnType<T>());
+          mkl_output_mkl_shape.SetTfLayout(input_shape_0.GetDimension(),
+                                           input_shape_0.GetSizesAsMklDnnDims(),
+                                           input_shape_0.GetTfDataFormat());
+
+          // Get MKL layout from input1 as destination layout
+          mkl_output_mkl_shape.SetMklLayout(&input1_md);
+
+          // Create output Mkl tensor for index 0
+          AllocateOutputSetMklShape(context, 0, &tensor_out,
+                                    input_tensor_0.shape(),
+                                    mkl_output_mkl_shape);
+
+          // Create MklDnnData object for input0 tesnsor
+          auto cpu_engine = engine(engine::cpu, 0);
+          MklDnnData<T> input(&cpu_engine);
+          input.SetUsrMem(input0_md, &input_tensor_0);
+
+          // Create reorder from input0's layout to input1's layout
+          std::vector<primitive> net;
+          CHECK_EQ(input.CheckReorderToOpMem(
+                       memory::primitive_desc(input1_md, cpu_engine),
+                       tensor_out, &net),
+                   true);
+          stream(stream::kind::eager).submit(net).wait();
+
+          // Input1 will be passed through
+          ForwardMklTensorInToOut(context, 1, 1);
+          return;
+        }
+      }
+
+      // Sanity check
+      bool mkl_shapes_are_same = input_shape_0 == input_shape_1;
+      if (mkl_shapes_are_same) {
+        CHECK(false) << "MklInputConversionOp: Unexpected: TF shapes are "
+                        "different but MKL shapes are same";
+      }
+
+      // Both have different shapes, so broadcast will be necessary.
+      // Convert to TF and pass both tensors through (we can't do broadcast
+      // with MKL tensors)
+      VLOG(1) << "MklInputConversionOp: Broadcast needed, "
+              << "converted MKL inputs to TF format";
+
+      MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
+                                           op_data_type, has_avx512f_, 0);
+      MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
+                                           op_data_type, has_avx512f_, 1);
+      SetDummyMklShapeOutput(context, 0);
+      SetDummyMklShapeOutput(context, 1);
+      return;
+    }
+
+    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    // One input is MKL and one is TF. If no broadcast is needed, convert
+    // the TF tensor to MKL, otherwise convert the MKL tensor to TF format
+    VLOG(1) << "MklInputConversionOp: Inputs in different formats (MKL/TF)";
+
+    const Tensor* mkl_tensor;
+    const MklDnnShape* mkl_shape;
+    const Tensor* tf_tensor;
+    MklDnnShape* tf_mkl_shape;
+    uint mkl_tensor_index;
+    uint tf_tensor_index;
+    if (input_shape_0.IsMklTensor() && !input_shape_1.IsMklTensor()) {
+      mkl_tensor = &input_tensor_0;
+      mkl_shape = &input_shape_0;
+      mkl_tensor_index = 0;
+      tf_tensor = &input_tensor_1;
+      tf_mkl_shape = &input_shape_1;
+      tf_tensor_index = 1;
+    } else if (!input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
+      mkl_tensor = &input_tensor_1;
+      mkl_shape = &input_shape_1;
+      mkl_tensor_index = 1;
+      tf_tensor = &input_tensor_0;
+      tf_mkl_shape = &input_shape_0;
+      tf_tensor_index = 0;
+    } else {
+      CHECK(false) << "MklInputConversionOp: Unexpected combination of input "
+                      "shapes for MKL "
+                   << "element-wise op";
+    }
+
+    // Broadcast is needed if the shapes are not the same
+    bool broadcast_needed;
+
+    size_t in0_size = 1;
+    for (size_t i = 0; i < mkl_shape->GetDimension(); ++i)
+      in0_size *= mkl_shape->TfDimSize(i);
+
+    size_t in1_size = 1;
+    for (size_t i = 0; i < tf_tensor->shape().dims(); ++i)
+      in1_size *= tf_tensor->shape().dim_size(i);
+
+    broadcast_needed = (in0_size != in1_size);
+
+    if (!broadcast_needed) {
+      // Both shapes are same, convert the TF input to MKL
+      VLOG(1) << "MklInputConversionOp: No broadcast needed.";
+      VLOG(1) << "MklInputConversionOp: Converting input " << tf_tensor_index
+              << " to MKL format";
+
+      // Create MklDnnShape for output Mkl tensor.
+      Tensor* tensor_out;
+      MklDnnShape mkl_output_mkl_shape;
+      mkl_output_mkl_shape.SetMklTensor(true);
+      mkl_output_mkl_shape.SetElemType(MklDnnType<T>());
+      mkl_output_mkl_shape.SetTfLayout(mkl_shape->GetDimension(),
+                                       mkl_shape->GetSizesAsMklDnnDims(),
+                                       mkl_shape->GetTfDataFormat());
+      // ** Temporarily borrow the layout from the MKL input **
+      auto output_mkl_md = mkl_shape->GetMklLayout();
+      mkl_output_mkl_shape.SetMklLayout(&output_mkl_md);
+
+      // Create output Mkl tensor
+      AllocateOutputSetMklShape(context, tf_tensor_index, &tensor_out,
+                                mkl_tensor->shape(), mkl_output_mkl_shape);
+
+      // Create MklDnnData object for input tensor. Input tensor is in
+      // Tensorflow layout.
+      auto cpu_engine = engine(engine::cpu, 0);
+      MklDnnData<T> tf_input(&cpu_engine);
+      auto input_tf_md = mkl_output_mkl_shape.GetTfLayout();
+      tf_input.SetUsrMem(input_tf_md, tf_tensor);
+
+      // Create reorder between tensorflow layout and Mkl layout.
+      std::vector<primitive> net;
+      CHECK_EQ(tf_input.CheckReorderToOpMem(
+                   memory::primitive_desc(output_mkl_md, cpu_engine),
+                   tensor_out, &net),
+               true);
+      stream(stream::kind::eager).submit(net).wait();
+
+      // -- The tensor in MKL format passes through --
+      ForwardMklTensorInToOut(context, mkl_tensor_index, mkl_tensor_index);
+    } else {
+      // Broadcast is needed, so convert the MKL input to TF
+      VLOG(1) << "MklInputConversionOp: Broadcast needed.";
+      VLOG(1) << "MklInputConversionOp: Converting input " << mkl_tensor_index
+              << " to TF format";
+      MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
+                                           op_data_type, has_avx512f_,
+                                           mkl_tensor_index);
+      SetDummyMklShapeOutput(context, mkl_tensor_index);
+
+      // The tensor in TF format passes through
+      ForwardTfTensorInToOut(context, tf_tensor_index, tf_tensor_index);
+    }
+
+    VLOG(1) << "MklInputConversionOp: Shapes (output): "
+            << context->mutable_output(0)->shape().DebugString() << " and "
+            << context->mutable_output(1)->shape().DebugString();
+
+    VLOG(1) << "MklInputConversion completed successfully.";
+  }
+
+ private:
+  /// Data format of the operation
+  string data_format_str;
+
+  /// Data type of the operation
+  DataType op_data_type;
+
+  /// CPUIDInfo
+  bool has_avx512f_ = false;
+};
+
+#endif
+
 ///////////////////////////////////////////////////////////
 //               Register kernel
 ///////////////////////////////////////////////////////////
@@ -253,7 +497,10 @@ class MklInputConversionOp : public OpKernel {
                               .Label(mkl_op_registry::kMklOpLabel), \
                           MklInputConversionOp<CPUDevice, T>);
 
-TF_CALL_NUMBER_TYPES(REGISTER_CPU);
+// TODO(nhasabni): We cannot support all number types since MklDnn does
+// not support types.
+// TF_CALL_NUMBER_TYPES(REGISTER_CPU);
+TF_CALL_float(REGISTER_CPU);
 #undef REGISTER_CPU
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index aa08e93924c588cfb5b4a22a20055e5c74a43b3a..5f0a12a1fb9bff3086e05918e23b8396196eb389 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -22,6 +22,9 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 #include <vector>
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -30,14 +33,20 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/tensor_format.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
 
 #if !defined(IS_MOBILE_PLATFORM)
 #include "tensorflow/core/util/work_sharder.h"
 #endif
 
+#ifndef INTEL_MKL_ML
+#include "mkldnn.hpp"
+using mkldnn::lrn_across_channels;
+using mkldnn::lrn_backward;
+using mkldnn::lrn_forward;
+using mkldnn::prop_kind;
+using mkldnn::stream;
+#endif
+
 namespace tensorflow {
 
 namespace {
@@ -58,6 +67,8 @@ void GetBandMatrix(int depth, int depth_radius,
 
 }  // namespace
 
+#ifdef INTEL_MKL_ML
+
 template <typename T>
 class MklLRNOp : public OpKernel {
  public:
@@ -66,10 +77,11 @@ class MklLRNOp : public OpKernel {
   explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<size_t>(depth_radius64);
 
     OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
@@ -92,9 +104,10 @@ class MklLRNOp : public OpKernel {
                               : input.dims();
     OP_REQUIRES(context, mkl_context.in_dims == 4,
                 errors::InvalidArgument("input must be 4-dimensional"));
-    OP_REQUIRES(context, FastBoundsCheck(input.NumElements(),
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("argument to LRN too large"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input.NumElements(), std::numeric_limits<int>::max()),
+        errors::InvalidArgument("argument to LRN too large"));
 
     if (!input_in_mkl_format) {
       mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
@@ -334,10 +347,11 @@ class MklLRNGradOp : public OpKernel {
   explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
     int64 depth_radius64;
     OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(context, FastBoundsCheck(depth_radius64,
-                                         std::numeric_limits<int>::max()),
-                errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                        " larger than int max"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
     depth_radius_ = static_cast<int>(depth_radius64);
     OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
     OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
@@ -648,6 +662,7 @@ class MklLRNGradOp : public OpKernel {
       const auto nodes = cols * rows;
 
       auto grads_shaped = in_grads.shaped<T, 2>({nodes * batch, depth});
+
       auto in_shaped = in_image.shaped<T, 2>({nodes * batch, depth});
       auto activations = out_image.shaped<T, 2>({nodes * batch, depth});
 
@@ -717,6 +732,619 @@ class MklLRNGradOp : public OpKernel {
   float beta_;
 };
 
+#else
+
+template <typename T>
+class MklLRNOp : public OpKernel {
+ public:
+  ~MklLRNOp() {}
+
+  explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) {
+    int64 depth_radius64;
+    OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
+    depth_radius_ = static_cast<size_t>(depth_radius64);
+
+    OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
+    OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
+    OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
+    workspace_enabled_ = false;
+    context->GetAttr("workspace_enabled", &workspace_enabled_);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      SanityCheckInputs(context);
+      if (!context->status().ok()) return;
+
+      auto cpu_engine = engine(engine::cpu, 0);
+      const Tensor& src_tensor = MklGetInput(context, kIdxInput);
+      MklDnnShape src_dnn_shape;
+      GetMklShape(context, kIdxInput, &src_dnn_shape);
+
+      // MKL-DNN has a notion of kernel_size and not depth_radius.
+      int kernel_size = 2 * depth_radius_ + 1;
+      float new_alpha = alpha_ * kernel_size;
+
+      // if the input tensor is not an MKL Tensor, or if the last
+      // dimension is not channel, then just use Eigen.
+      // MKL only support normalization over the channel dimension.
+      if (!src_dnn_shape.IsMklTensor()) {
+        MklDefaultToEigen(context, src_tensor);
+        return;
+      } else if (!src_dnn_shape.IsMklChannelDim(src_dnn_shape.GetDimension() -
+                                                1)) {
+        Tensor converted_tensor =
+            ConvertMklToTF<T>(context, src_tensor, src_dnn_shape);
+        MklDefaultToEigen(context, converted_tensor);
+        return;
+      }
+      // At this point, we can assume that the src is an MklTensor
+      // and we can enable the workspace
+      workspace_enabled_ = true;
+
+      MklDnnData<T> src_dnn_data(&cpu_engine);
+      MklDnnData<T> dst_dnn_data(&cpu_engine);
+      MklDnnData<uint8> workspace_dnn_data(&cpu_engine);
+
+      TensorShape tf_output_shape = src_tensor.shape();
+
+      memory::desc src_md = src_dnn_shape.GetCurLayout();
+      memory::dims input_dims = src_dnn_shape.GetSizesAsMklDnnDims();
+
+      // Create memory for user input.
+      // Since Tensorflow always performs normalization over last dimension,
+      // and MKL-DNN performs normalization over Channel, we tell MKL-DNN
+      // that input is in NHWC layout with Channel being the last dimension.
+      src_dnn_data.SetUsrMem(src_md, &src_tensor);
+      src_dnn_data.SetOpMemDesc(input_dims, memory::format::nhwc);
+
+      // output_dnn_data and workspace both have the same shape as input
+      dst_dnn_data.SetUsrMem(src_md);
+      dst_dnn_data.SetOpMemDesc(input_dims, memory::format::nhwc);
+
+      // Create LRN primitive descriptor.
+      // Tensorflow's normalization semantics is across channels.
+      // MKL-DNN also supports normalization within channel.
+      auto lrn_desc = lrn_forward::desc(prop_kind::forward, lrn_across_channels,
+                                        src_dnn_data.GetUsrMemDesc(),
+                                        kernel_size, new_alpha, beta_, bias_);
+      auto lrn_prim_desc = lrn_forward::primitive_desc(lrn_desc, cpu_engine);
+
+      // Allocate output_dnn_data tensor.
+      Tensor* output_tensor = nullptr;
+      memory::format input_format = src_dnn_shape.GetTfDataFormat();
+      AllocateOutputTensor(context, lrn_prim_desc, input_dims, input_format,
+                           &output_tensor);
+      OP_REQUIRES_OK(context, context->status());
+      CHECK_NOTNULL(output_tensor);
+      dst_dnn_data.SetUsrMemDataHandle(output_tensor);
+
+      // Handle workspace required for MKL-DNN.
+      AllocateWorkspaceTensor(context, lrn_prim_desc, &workspace_dnn_data);
+      OP_REQUIRES_OK(context, context->status());
+
+      PrepareAndExecuteNet(lrn_prim_desc, &src_dnn_data, &dst_dnn_data,
+                           &workspace_dnn_data);
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
+    }
+  }
+
+ private:
+  void PrepareAndExecuteNet(const lrn_forward::primitive_desc& lrn_fwd_desc,
+                            MklDnnData<T>* src_dnn_data,
+                            MklDnnData<T>* dst_dnn_data,
+                            MklDnnData<uint8>* wksp_dnn_data = nullptr) {
+    std::vector<primitive> net;
+
+    // Check for input reorder
+    src_dnn_data->CheckReorderToOpMem(lrn_fwd_desc.src_primitive_desc(), &net);
+
+    // Create pooling primitive and add it to net
+    if (wksp_dnn_data != nullptr) {
+      net.push_back(lrn_forward(lrn_fwd_desc, src_dnn_data->GetOpMem(),
+                                wksp_dnn_data->GetOpMem(),
+                                dst_dnn_data->GetOpMem()));
+    } else {
+      net.push_back(lrn_forward(lrn_fwd_desc, src_dnn_data->GetOpMem(),
+                                dst_dnn_data->GetOpMem()));
+    }
+    stream(stream::kind::eager).submit(net).wait();
+  }
+
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const lrn_forward::primitive_desc& lrn_fwd_prim_desc,
+      const memory::dims output_dims_mkl_order,
+      const memory::format& output_tf_format, Tensor** output_tensor) {
+    CHECK_NOTNULL(output_tensor);
+    memory::primitive_desc dst_pd = lrn_fwd_prim_desc.dst_primitive_desc();
+
+    MklDnnShape output_mkl_shape;
+    // We only handle the case when the inputs and output are in Mkl format
+    // Any other case is handled by Eigen
+    output_mkl_shape.SetMklTensor(true);
+    output_mkl_shape.SetMklLayout(&dst_pd);
+    output_mkl_shape.SetElemType(MklDnnType<T>());
+    output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+                                 output_dims_mkl_order, output_tf_format);
+    TensorShape output_tf_shape;
+    // only allocate enough space for the elements we need.
+    size_t num_bytes = dst_pd.get_size();
+    CHECK_EQ(num_bytes % sizeof(T), 0);
+    output_tf_shape.AddDim(num_bytes / sizeof(T));
+    AllocateOutputSetMklShape(context, kIdxOutput, output_tensor,
+                              output_tf_shape, output_mkl_shape);
+  }
+
+  // Fallback implementation - Taken from lrn_op.cc
+  // TODO(inteltf) Check if we can use EigenLRNOp directly instead of making a
+  // copy.
+  void MklDefaultToEigen(OpKernelContext* context, const Tensor& input) {
+    const int batch = static_cast<int>(input.dim_size(0));
+    const int rows = static_cast<int>(input.dim_size(1));
+    const int cols = static_cast<int>(input.dim_size(2));
+    const int depth = static_cast<int>(input.dim_size(3));
+    const int nodes = cols * rows;
+
+    auto in_shaped = input.shaped<T, 2>({nodes * batch, depth});
+    // Multiplying the input with the band matrix has the effect of reducing
+    // the
+    // correct patch along the depth.
+    Eigen::Tensor<T, 2, Eigen::RowMajor> multiplier(depth, depth);
+    GetBandMatrix<T>(depth, depth_radius_, &multiplier);
+
+    Tensor* output_dnn_data = nullptr;
+    MklDnnShape mkl_output_mkl_shape;
+    mkl_output_mkl_shape.SetMklTensor(false);
+    mkl_output_mkl_shape.SetDimensions(4);
+    AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data,
+                              input.shape(), mkl_output_mkl_shape);
+    CHECK_NOTNULL(output_dnn_data);
+
+    Tensor* workspace_tensor = nullptr;
+    MklDnnShape workspace_mkl_shape;
+    workspace_mkl_shape.SetMklTensor(false);
+    TensorShape workspace_tf_shape;
+    workspace_tf_shape.AddDim(0);
+    AllocateOutputSetMklShape(context, kIdxWorkspace, &workspace_tensor,
+                              workspace_tf_shape, workspace_mkl_shape);
+    CHECK_NOTNULL(workspace_tensor);
+
+    auto out_shaped = output_dnn_data->shaped<T, 2>({nodes * batch, depth});
+    Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+    auto tmp = in_shaped.square().contract(multiplier, dims) * alpha_ + bias_;
+    if (beta_ == T(1)) {
+      out_shaped.device(context->eigen_cpu_device()) =
+          in_shaped * tmp.inverse();
+    } else if (beta_ == T(0.5)) {
+      out_shaped.device(context->eigen_cpu_device()) = in_shaped * tmp.rsqrt();
+    } else {
+      out_shaped.device(context->eigen_cpu_device()) =
+          in_shaped * (tmp.log() * -beta_).exp();
+    }
+  }
+
+  void AllocateWorkspaceTensor(
+      OpKernelContext* context,
+      const lrn_forward::primitive_desc& lrn_fwd_prim_desc,
+      MklDnnData<uint8>* dnn_data_wksp) {
+    CHECK_NOTNULL(dnn_data_wksp);
+    Tensor* workspace_tensor = nullptr;
+    memory::primitive_desc workspace_pd =
+        lrn_fwd_prim_desc.workspace_primitive_desc();
+    size_t workspace_bytes = workspace_pd.get_size();
+    MklDnnShape workspace_mkl_shape;
+    // the workspace tensor is a uint8 tensor that has
+    // exactly the number of bytes necessary
+    workspace_mkl_shape.SetMklTensor(false);
+    TensorShape workspace_tf_shape;
+    workspace_tf_shape.AddDim(workspace_bytes);
+    AllocateOutputSetMklShape(context, kIdxWorkspace, &workspace_tensor,
+                              workspace_tf_shape, workspace_mkl_shape);
+    CHECK_NOTNULL(workspace_tensor);
+    dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor);
+  }
+
+  void SanityCheckInputs(OpKernelContext* context) {
+    const Tensor& src_tensor = MklGetInput(context, kIdxInput);
+    MklDnnShape src_dnn_shape;
+    GetMklShape(context, kIdxInput, &src_dnn_shape);
+    if (src_dnn_shape.IsMklTensor()) {
+      OP_REQUIRES(context, src_dnn_shape.GetDimension() == 4,
+                  errors::InvalidArgument("input must be 4-dimensional"));
+      OP_REQUIRES(context,
+                  FastBoundsCheck(src_tensor.NumElements(),
+                                  std::numeric_limits<int>::max()),
+                  errors::InvalidArgument("argument to LRN too large"));
+    } else {
+      OP_REQUIRES(context, src_tensor.dims() == 4,
+                  errors::InvalidArgument("input must be 4-dimensional"));
+      OP_REQUIRES(context,
+                  FastBoundsCheck(src_tensor.NumElements(),
+                                  std::numeric_limits<int>::max()),
+                  errors::InvalidArgument("argument to LRN too large"));
+    }
+  }
+  const int kIdxInput = 0, kIdxOutput = 0, kIdxWorkspace = 1;
+
+  typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair;
+  bool workspace_enabled_;
+  int depth_radius_;
+  float bias_;
+  float alpha_;
+  float beta_;
+};
+
+template <typename T>
+class MklLRNGradOp : public OpKernel {
+ public:
+  explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
+    int64 depth_radius64;
+    OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
+    depth_radius_ = static_cast<int>(depth_radius64);
+    OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
+    OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
+    OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
+    workspace_enabled_ = false;
+    context->GetAttr("workspace_enabled", &workspace_enabled_);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      SanityCheckInputs(context);
+      if (!context->status().ok()) return;
+
+      auto cpu_engine = engine(engine::cpu, 0);
+      MklDnnData<T> input_grad_dnn_data(&cpu_engine);
+      MklDnnData<T> orig_input_dnn_data(&cpu_engine);
+      MklDnnData<T> orig_output_dnn_data(&cpu_engine);
+      MklDnnData<T> output_dnn_data(&cpu_engine);
+
+      MklDnnShape input_grad_dnn_shape, orig_input_dnn_shape,
+          orig_output_dnn_shape;
+      GetMklShape(context, kIdxGradient, &input_grad_dnn_shape);
+      GetMklShape(context, kIdxOrigInput, &orig_input_dnn_shape);
+      GetMklShape(context, kIdxOrigOutput, &orig_output_dnn_shape);
+
+      // We only use MKLDNN if all of the necessary inputs are present
+      // in mkldnn format, and Channel is the last dimension
+      bool can_use_mkldnn = workspace_enabled_ &&
+                            input_grad_dnn_shape.IsMklTensor() &&
+                            orig_input_dnn_shape.IsMklTensor() &&
+                            orig_output_dnn_shape.IsMklTensor() &&
+                            input_grad_dnn_shape.IsMklChannelDim(
+                                input_grad_dnn_shape.GetDimension() - 1) &&
+                            orig_input_dnn_shape.IsMklChannelDim(
+                                orig_input_dnn_shape.GetDimension() - 1) &&
+                            orig_output_dnn_shape.IsMklChannelDim(
+                                orig_output_dnn_shape.GetDimension() - 1);
+
+      if (!can_use_mkldnn) {
+        // Fallback to eigen
+        MklDefaultToEigen(context);
+        return;
+      }
+      // At this point, we have the all clear to use MklDnn constructs
+      // Naming: diff_dst is input_gradient_tensor; src is orig_input_tensor.
+      const Tensor& input_grad_tensor = MklGetInput(context, kIdxGradient);
+      const Tensor& orig_input_tensor = MklGetInput(context, kIdxOrigInput);
+      const Tensor& orig_output_tensor = MklGetInput(context, kIdxOrigOutput);
+
+      // Get input sizes in MKL-DNN required NCHW format.
+      // LRN does not have data_format attribute. But by default it has
+      // NHWC format.
+      memory::desc original_output_md = orig_output_dnn_shape.GetCurLayout();
+      memory::desc target_diff_dst_md = ConfigureInputGradient(
+          input_grad_tensor, input_grad_dnn_shape, &input_grad_dnn_data);
+
+      memory::desc orig_input_md = orig_input_dnn_shape.GetCurLayout();
+      memory::dims orig_input_dims =
+          orig_input_dnn_shape.GetSizesAsMklDnnDims();
+      orig_input_dnn_data.SetUsrMem(orig_input_md, &orig_input_tensor);
+      orig_input_dnn_data.SetOpMemDesc(orig_input_dims, memory::format::nhwc);
+
+      // output_dnn_data has the same shape as original input
+      output_dnn_data.SetUsrMem(orig_input_md);
+      output_dnn_data.SetOpMemDesc(orig_input_dims, memory::format::nhwc);
+
+      // MKL-DNN has a notion of kernel_size and not depth_radius.
+      int kernel_size = 2 * depth_radius_ + 1;
+      float new_alpha = alpha_ * kernel_size;
+
+      // Create LRN backward primitive descriptor. It requires LRN forward
+      // primitive descriptor also.
+      auto lrn_fwd_desc = lrn_forward::desc(
+          prop_kind::forward, lrn_across_channels, orig_input_md, kernel_size,
+          new_alpha, beta_, bias_);
+      auto lrn_fwd_prim_desc =
+          lrn_forward::primitive_desc(lrn_fwd_desc, cpu_engine);
+      auto lrn_bwd_desc = lrn_backward::desc(
+          lrn_across_channels, original_output_md, target_diff_dst_md,
+          kernel_size, new_alpha, beta_, bias_);
+      auto lrn_bwd_prim_desc = lrn_backward::primitive_desc(
+          lrn_bwd_desc, cpu_engine, lrn_fwd_prim_desc);
+
+      Tensor* output_tensor = nullptr;
+      memory::format orig_input_format = orig_input_dnn_shape.GetTfDataFormat();
+      AllocateOutputTensor(context, lrn_bwd_prim_desc, orig_input_dims,
+                           orig_input_format, &output_tensor);
+      OP_REQUIRES_OK(context, context->status());
+      CHECK_NOTNULL(output_tensor);
+      output_dnn_data.SetUsrMemDataHandle(output_tensor);
+
+      // Create LRN primitive and add it to the net
+      // At this point, workspace is enabled, so we don't need
+      // to check. Pass input workspace to LRN backward primitive.
+      const Tensor& workspace_tensor = MklGetInput(context, kIdxWorkspace);
+      MklDnnData<uint8> workspace_dnn_data(&cpu_engine);
+      ConfigureWorkspace(workspace_tensor,
+                         lrn_fwd_prim_desc.workspace_primitive_desc(),
+                         &workspace_dnn_data);
+
+      PrepareAndExecuteNet(
+          lrn_bwd_prim_desc, lrn_fwd_prim_desc, &orig_input_dnn_data,
+          &input_grad_dnn_data, &output_dnn_data,
+          memory::primitive_desc(target_diff_dst_md, cpu_engine),
+          &workspace_dnn_data);
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
+    }
+  }
+
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const lrn_backward::primitive_desc& lrn_bkwd_prim_desc,
+      const memory::dims output_dims_mkl_order,
+      const memory::format& output_tf_format, Tensor** output_tensor) {
+    CHECK_NOTNULL(output_tensor);
+    memory::primitive_desc dst_pd =
+        lrn_bkwd_prim_desc.diff_src_primitive_desc();
+    MklDnnShape output_mkl_shape;
+
+    // We assume that all outputs at this point are MKL Tensors
+    output_mkl_shape.SetMklTensor(true);
+    output_mkl_shape.SetMklLayout(&dst_pd);
+    output_mkl_shape.SetElemType(MklDnnType<T>());
+    output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+                                 output_dims_mkl_order, output_tf_format);
+
+    TensorShape output_tf_shape;
+    size_t num_bytes = dst_pd.get_size();
+    CHECK_EQ(num_bytes % sizeof(T), 0);
+    output_tf_shape.AddDim(num_bytes / sizeof(T));
+    AllocateOutputSetMklShape(context, kIdxOutput, output_tensor,
+                              output_tf_shape, output_mkl_shape);
+  }
+
+  memory::desc ConfigureInputGradient(const Tensor& input_grad_tensor,
+                                      const MklDnnShape& input_grad_dnn_shape,
+                                      MklDnnData<T>* input_grad_dnn_data) {
+    CHECK_NOTNULL(input_grad_dnn_data);
+    // This shouldn't be necessary at this point, but just in case
+    CHECK_EQ(input_grad_dnn_shape.IsMklTensor(), true);
+
+    memory::desc input_grad_md = input_grad_dnn_shape.GetCurLayout();
+    memory::dims orig_input_dims = input_grad_dnn_shape.GetSizesAsMklDnnDims();
+    input_grad_dnn_data->SetUsrMem(input_grad_md, &input_grad_tensor);
+    input_grad_dnn_data->SetOpMemDesc(orig_input_dims, memory::format::nhwc);
+    return input_grad_md;
+  }
+
+  void PrepareAndExecuteNet(
+      const lrn_backward::primitive_desc& lrn_bkwd_desc,
+      const lrn_forward::primitive_desc& lrn_fwd_desc,
+      MklDnnData<T>* src_dnn_data, MklDnnData<T>* input_gradient_diff_dst,
+      MklDnnData<T>* output_diff_src,
+      const memory::primitive_desc& target_diff_dst_pd,
+      const MklDnnData<uint8>* workspace_dnn_data = nullptr) {
+    std::vector<primitive> net;
+
+    // Check for input reordering on the diff dst input
+    input_gradient_diff_dst->CheckReorderToOpMem(
+        lrn_bkwd_desc.diff_dst_primitive_desc(), &net);
+
+    // Check for input reordering on the original input
+    src_dnn_data->CheckReorderToOpMem(lrn_fwd_desc.src_primitive_desc(), &net);
+    // Create pooling primitive and add it to net
+    if (nullptr == workspace_dnn_data) {
+      net.push_back(lrn_backward(lrn_bkwd_desc, src_dnn_data->GetOpMem(),
+                                 input_gradient_diff_dst->GetOpMem(),
+                                 output_diff_src->GetOpMem()));
+    } else {
+      net.push_back(lrn_backward(lrn_bkwd_desc, src_dnn_data->GetOpMem(),
+                                 input_gradient_diff_dst->GetOpMem(),
+                                 workspace_dnn_data->GetOpMem(),
+                                 output_diff_src->GetOpMem()));
+    }
+    stream(stream::kind::eager).submit(net).wait();
+  }
+
+  void ConfigureWorkspace(const Tensor& workspace_tensor,
+                          memory::primitive_desc workspace_pd,
+                          MklDnnData<uint8>* workspace_dnn_data) {
+    CHECK_NOTNULL(workspace_dnn_data);
+
+    workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor);
+  }
+
+  // Fallback implementation - Taken from lrn_op.cc
+  // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a
+  // copy.
+  void MklDefaultToEigen(OpKernelContext* context) {
+    Tensor input_gradient_tensor;
+    Tensor orig_input_tensor;
+    Tensor orig_output_tensor;
+
+    MklDnnShape input_grad_dnn_shape, orig_input_dnn_shape,
+        orig_output_dnn_shape;
+    GetMklShape(context, kIdxGradient, &input_grad_dnn_shape);
+    GetMklShape(context, kIdxOrigInput, &orig_input_dnn_shape);
+    GetMklShape(context, kIdxOrigOutput, &orig_output_dnn_shape);
+
+    if (input_grad_dnn_shape.IsMklTensor()) {
+      input_gradient_tensor = ConvertMklToTF<T>(
+          context, MklGetInput(context, kIdxGradient), input_grad_dnn_shape);
+    } else {
+      input_gradient_tensor = MklGetInput(context, kIdxGradient);
+    }
+
+    if (orig_input_dnn_shape.IsMklTensor()) {
+      orig_input_tensor = ConvertMklToTF<T>(
+          context, MklGetInput(context, kIdxOrigInput), orig_input_dnn_shape);
+    } else {
+      orig_input_tensor = MklGetInput(context, kIdxOrigInput);
+    }
+
+    if (orig_output_dnn_shape.IsMklTensor()) {
+      orig_output_tensor = ConvertMklToTF<T>(
+          context, MklGetInput(context, kIdxOrigOutput), orig_output_dnn_shape);
+    } else {
+      orig_output_tensor = MklGetInput(context, kIdxOrigOutput);
+    }
+
+    const int64 batch = static_cast<int64>(input_gradient_tensor.dim_size(0));
+    const int64 rows = static_cast<int64>(input_gradient_tensor.dim_size(1));
+    const int64 cols = static_cast<int64>(input_gradient_tensor.dim_size(2));
+    const int64 depth = static_cast<int64>(input_gradient_tensor.dim_size(3));
+    const auto nodes = cols * rows;
+
+    auto grads_shaped =
+        input_gradient_tensor.shaped<T, 2>({nodes * batch, depth});
+
+    auto in_shaped = orig_input_tensor.shaped<T, 2>({nodes * batch, depth});
+    auto activations = orig_output_tensor.shaped<T, 2>({nodes * batch, depth});
+
+    Tensor* output_dnn_data;
+    MklShape mkl_output_mkl_shape;
+    mkl_output_mkl_shape.SetMklTensor(false);
+    mkl_output_mkl_shape.SetDimensions(4);
+    AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data,
+                              input_gradient_tensor.shape(),
+                              mkl_output_mkl_shape);
+
+    auto out_shaped = output_dnn_data->shaped<T, 2>({nodes * batch, depth});
+    out_shaped.setZero();
+    auto shard = [this, activations, in_shaped, grads_shaped, out_shaped,
+                  depth](int64 begin, int64 end) {
+      for (int64 i = begin; i < end; ++i) {
+        for (int64 j = 0; j < depth; ++j) {
+          int64 depth_begin = std::max<int64>(0, j - depth_radius_);
+          int64 depth_end = std::min<int64>(depth, j + depth_radius_ + 1);
+
+          T norm(0);
+          for (int64 k = depth_begin; k < depth_end; ++k) {
+            norm += in_shaped(i, k) * in_shaped(i, k);
+          }
+          norm = alpha_ * norm + bias_;
+          DCHECK_GT(norm, T(1e-6));
+          for (int64 k = depth_begin; k < depth_end; ++k) {
+            T dyi = T(-2) * alpha_ * beta_ * in_shaped(i, k) *
+                    activations(i, j) / norm;
+            if (k == j) {
+              dyi += Eigen::numext::pow(norm, -beta_);
+            }
+            dyi *= grads_shaped(i, j);
+            const_cast<typename TTypes<T, 2>::Tensor&>(out_shaped)(i, k) += dyi;
+          }
+        }
+      }
+    };
+    auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
+    Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch,
+          depth * depth, shard);
+  }
+
+  void SanityCheckInputs(OpKernelContext* context) {
+    const Tensor& input_gradient_tensor = MklGetInput(context, kIdxGradient);
+    const Tensor& orig_input_tensor = MklGetInput(context, kIdxOrigInput);
+    const Tensor& orig_output_tensor = MklGetInput(context, kIdxOrigOutput);
+    const Tensor& workspace_tensor = MklGetInput(context, kIdxWorkspace);
+    MklDnnShape in_grads_dnn_shape, in_image_dnn_shape, out_image_dnn_shape,
+        workspace_dnn_shape;
+    GetMklShape(context, kIdxGradient, &in_grads_dnn_shape);
+    GetMklShape(context, kIdxOrigInput, &in_image_dnn_shape);
+    GetMklShape(context, kIdxOrigOutput, &out_image_dnn_shape);
+    GetMklShape(context, kIdxWorkspace, &workspace_dnn_shape);
+    if (in_grads_dnn_shape.IsMklTensor()) {
+      OP_REQUIRES(context, in_grads_dnn_shape.GetDimension() == 4,
+                  errors::InvalidArgument("Input gradient must be "
+                                          "4-dimensional"));
+    } else {
+      OP_REQUIRES(
+          context, input_gradient_tensor.dims() == 4,
+          errors::InvalidArgument("input gradient must be 4-dimensional"));
+    }
+
+    if (in_image_dnn_shape.IsMklTensor()) {
+      OP_REQUIRES(context, in_image_dnn_shape.GetDimension() == 4,
+                  errors::InvalidArgument("input images must be "
+                                          "4-dimensional"));
+    } else {
+      OP_REQUIRES(context, orig_input_tensor.dims() == 4,
+                  errors::InvalidArgument("input images must be "
+                                          "4-dimensional"));
+    }
+
+    if (out_image_dnn_shape.IsMklTensor()) {
+      OP_REQUIRES(context, out_image_dnn_shape.GetDimension() == 4,
+                  errors::InvalidArgument("Output image must be "
+                                          "4-dimensional"));
+    } else {
+      OP_REQUIRES(
+          context, orig_output_tensor.dims() == 4,
+          errors::InvalidArgument("Output image must be 4-dimensional"));
+    }
+
+    if (workspace_enabled_) {
+      if (workspace_dnn_shape.IsMklTensor()) {
+        OP_REQUIRES(
+            context, workspace_dnn_shape.IsMklTensor() == false,
+            errors::InvalidArgument("Workspace should not be MKL Tensor."));
+      } else {
+        OP_REQUIRES(context, workspace_tensor.dims() == 1,
+                    errors::InvalidArgument("Workspace must be 1-dimensional"));
+      }
+    }
+  }
+
+  // Input("input_grads: T")
+  // Input("input_image: T")
+  // Input("output_image: T")
+  // Input("workspace: uint8")
+  const int kIdxGradient = 0, kIdxOrigInput = 1, kIdxOrigOutput = 2,
+            kIdxWorkspace = 3, kIdxOutput = 0;
+
+  typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair;
+  bool workspace_enabled_;
+  int depth_radius_;
+  float bias_;
+  float alpha_;
+  float beta_;
+};
+
+#endif  // INTEL_MKL_ML
+
 #define REGISTER_MKL_LRN_CPU(T)                                     \
   REGISTER_KERNEL_BUILDER(Name("_MklLRN")                           \
                               .Device(DEVICE_CPU)                   \
diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc
index 846bb5710ded92c303567e4078c49a56b3746706..14607f26e0ccd1028dd62343000d90ac8451d7bb 100644
--- a/tensorflow/core/kernels/mkl_maxpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc
@@ -16,17 +16,32 @@ limitations under the License.
 // See docs in ../ops/nn_ops.cc.
 #ifdef INTEL_MKL
 #define EIGEN_USE_THREADS
-
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/padding.h"
 
+#ifndef INTEL_MKL_ML
+#include <algorithm>
+#include "mkldnn.hpp"
+using mkldnn::algorithm;
+using mkldnn::engine;
+using mkldnn::error;
+using mkldnn::memory;
+using mkldnn::padding_kind;
+using mkldnn::pooling_backward;
+using mkldnn::pooling_forward;
+using mkldnn::prop_kind;
+#endif
+
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
+// MKL-DNN is now default. MKL-ML must be specified explicitly.
+#ifdef INTEL_MKL_ML
+
 // An implementation of MaxPooling (forward).
 template <typename Device, typename T>
 class MklMaxPoolingOp : public OpKernel {
@@ -382,18 +397,19 @@ class MklMaxPoolingGradOp : public OpKernel {
       if (workspace_enabled == false) {
         if (convert_input != nullptr) {
           if (input_in_mkl_format == false) {
-            CHECK_EQ(
-                dnnConversionExecute_F32(
-                    convert_input, const_cast<void*>(static_cast<const void*>(
-                                       tensor_in.flat<T>().data())),
-                    input_buf),
-                E_SUCCESS);
+            CHECK_EQ(dnnConversionExecute_F32(
+                         convert_input,
+                         const_cast<void*>(static_cast<const void*>(
+                             tensor_in.flat<T>().data())),
+                         input_buf),
+                     E_SUCCESS);
             CHECK_EQ(dnnDelete_F32(convert_input), E_SUCCESS);
             convert_input = nullptr;
           } else {
             input_shape.GetConvertedFlatData(
-                lt_input_prim, const_cast<void*>(static_cast<const void*>(
-                                   tensor_in.flat<T>().data())),
+                lt_input_prim,
+                const_cast<void*>(
+                    static_cast<const void*>(tensor_in.flat<T>().data())),
                 input_buf);
           }
           pooling_resfwd[dnnResourceSrc] = input_buf;
@@ -438,8 +454,9 @@ class MklMaxPoolingGradOp : public OpKernel {
           CHECK_EQ(dnnDelete_F32(convert_outbackprop), E_SUCCESS);
         } else {
           output_backprop_shape.GetConvertedFlatData(
-              lt_outbackprop_prim, const_cast<void*>(static_cast<const void*>(
-                                       out_backprop.flat<T>().data())),
+              lt_outbackprop_prim,
+              const_cast<void*>(
+                  static_cast<const void*>(out_backprop.flat<T>().data())),
               outbackprop_buf);
         }
         pooling_res[dnnResourceDiffDst] = outbackprop_buf;
@@ -475,8 +492,309 @@ class MklMaxPoolingGradOp : public OpKernel {
   TensorFormat data_format_;
 
   bool workspace_enabled_;
+};  // MklMaxPoolingGradOp
+
+#else
+
+// An implementation of MaxPooling (forward).
+template <typename Device, typename T>
+class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
+ public:
+  explicit MklMaxPoolingOp(OpKernelConstruction* context)
+      : MklPoolingForwardOpBase<T>(context) {
+    // In Max Pooling, MKLDNN does not allow passing workspace as NULL.
+    // So we set workspace_enabled_ to true.
+    this->workspace_enabled_ = true;
+  }
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      const Tensor& input_tensor =
+          MklGetInput(context, this->kInputTensorIndexInput);
+      MklDnnShape dnn_shape_input;
+      GetMklShape(context, this->kInputTensorIndexInput, &dnn_shape_input);
+      this->SanityCheckInput(context, input_tensor, dnn_shape_input);
+      if (!context->status().ok()) return;
+
+      MklDnnData<T> dnn_data_input(&cpu_engine);
+      MklDnnData<T> dnn_data_output(&cpu_engine);
+      MklDnnData<uint8> dnn_data_wksp(&cpu_engine);
+
+      // initialize variables for the pooling op
+      MklPoolParameters pool_params;
+      // Get the input tensor and initialize the pooling parameters
+      this->ConfigureInput(context, dnn_shape_input, input_tensor, &pool_params,
+                           &dnn_data_input);
+      OP_REQUIRES_OK(context, context->status());
+
+      // Declare output tensor
+      Tensor* output_tensor = nullptr;
+      memory::dims output_dims_mkl_order;
+      this->GetOutputDims(pool_params, &output_dims_mkl_order);
+
+      // If input is in Mkl layout, then just get the memory format from it
+      // directly, instead of using input data_format to MaxPool.
+      if (dnn_shape_input.IsMklTensor()) {
+        dnn_data_output.SetUsrMem(
+            output_dims_mkl_order,
+            static_cast<memory::format>(
+                dnn_data_input.GetUsrMemDesc().data.format));
+      } else {
+        dnn_data_output.SetUsrMem(output_dims_mkl_order,
+                                  this->data_format_mkldnn_);
+      }
+
+      // describe the memory layout; let mkl-dnn choose the best for the op
+      dnn_data_output.SetOpMemDesc(output_dims_mkl_order, memory::format::any);
+
+      auto pool_desc = pooling_forward::desc(
+          prop_kind::forward, algorithm::pooling_max,
+          dnn_data_input.GetUsrMemDesc(), dnn_data_output.GetUsrMemDesc(),
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_fwd_desc =
+          pooling_forward::primitive_desc(pool_desc, cpu_engine);
+
+      this->AllocateOutputTensor(context, pool_fwd_desc, output_dims_mkl_order,
+                                 this->data_format_mkldnn_, &output_tensor);
+      OP_REQUIRES_OK(context, context->status());
+      dnn_data_output.SetUsrMemDataHandle(output_tensor);
+
+      AllocateWorkspaceTensor(context, pool_fwd_desc, &dnn_data_wksp);
+      OP_REQUIRES_OK(context, context->status());
+
+      this->PrepareAndExecuteNet(pool_fwd_desc, &dnn_data_input,
+                                 &dnn_data_output, &dnn_data_wksp);
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:",
+                                              error_msg));
+    }
+  }  // Compute
+
+ private:
+  const int kOutputTensorIndexWorkspace = 1;
+
+  void AllocateWorkspaceTensor(
+      OpKernelContext* context,
+      const pooling_forward::primitive_desc& pool_fwd_prim_desc,
+      MklDnnData<uint8>* dnn_data_wksp) {
+    CHECK_NOTNULL(dnn_data_wksp);
+    Tensor* workspace_tensor = nullptr;
+    memory::primitive_desc workspace_pd =
+        pool_fwd_prim_desc.workspace_primitive_desc();
+    size_t workspace_bytes = workspace_pd.get_size();
+    MklDnnShape workspace_mkl_shape;
+    workspace_mkl_shape.SetMklTensor(false);
+    TensorShape workspace_tf_shape;
+    workspace_tf_shape.AddDim(workspace_bytes);
+    AllocateOutputSetMklShape(context, kOutputTensorIndexWorkspace,
+                              &workspace_tensor, workspace_tf_shape,
+                              workspace_mkl_shape);
+    CHECK_NOTNULL(workspace_tensor);
+    dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor);
+  }
 };
 
+// The operation to compute MaxPool gradients.
+// It takes three inputs:
+//   - The original input tensor
+//   - The original output tensor
+//   - Backprop tensor for output
+// It produces one output: backprop tensor for input.
+template <class Device, class T>
+class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase<T> {
+ public:
+  explicit MklMaxPoolingGradOp(OpKernelConstruction* context)
+      : MklPoolingBackwardOpBase<T>(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      const Tensor& orig_input_tensor =
+          MklGetInput(context, kInputTensorIndexOrigInput);
+      const Tensor& orig_output_tensor =
+          MklGetInput(context, kInputTensorIndexOrigOutput);
+      const Tensor& grad_tensor =
+          MklGetInput(context, kInputTensorIndexGradient);
+      const Tensor& workspace_tensor =
+          MklGetInput(context, kInputTensorIndexWorkspace);
+      MklDnnShape orig_input_mkl_shape, orig_output_mkl_shape, grad_mkl_shape,
+          workspace_mkl_shape;
+      GetMklShape(context, kInputTensorIndexOrigInput, &orig_input_mkl_shape);
+      GetMklShape(context, kInputTensorIndexOrigOutput, &orig_output_mkl_shape);
+      GetMklShape(context, kInputTensorIndexGradient, &grad_mkl_shape);
+      GetMklShape(context, kInputTensorIndexWorkspace, &workspace_mkl_shape);
+
+      SanityCheckInputs(context, orig_input_tensor, orig_output_tensor,
+                        grad_tensor, workspace_tensor, orig_input_mkl_shape,
+                        orig_output_mkl_shape, grad_mkl_shape,
+                        workspace_mkl_shape);
+      if (!context->status().ok()) return;
+
+      MklDnnData<T> grad_dnn_data(&cpu_engine);
+      MklDnnData<uint8> workspace_dnn_data(&cpu_engine);
+      MklDnnData<T> output_dnn_data(&cpu_engine);
+      Tensor* output_tensor = nullptr;
+      MklPoolParameters pool_params;
+      TensorShape orig_input_shape;
+      memory::dims output_dims_mkl_order, orig_input_dims_mkl_order;
+      memory::desc original_input_md = ConfigureOriginalInput(
+          context, orig_input_tensor, orig_input_mkl_shape,
+          &orig_input_dims_mkl_order, &pool_params, &orig_input_shape);
+
+      memory::desc original_output_md = this->ConfigureOriginalOutput(
+          pool_params, orig_output_mkl_shape, output_dims_mkl_order);
+
+      memory::desc target_diff_dst_md = this->ConfigureInputGradient(
+          grad_mkl_shape, grad_tensor, &grad_dnn_data, original_output_md);
+
+      output_dnn_data.SetUsrMem(original_input_md);
+
+      // Create the forward pooling primitive descriptor so we can
+      // pass it as a hint to the backward pooling primitive descriptor
+      auto pool_fwd_desc = pooling_forward::desc(
+          prop_kind::forward, algorithm::pooling_max, original_input_md,
+          original_output_md,
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_fwd_prim_desc =
+          pooling_forward::primitive_desc(pool_fwd_desc, cpu_engine);
+
+      auto pool_bkwd_desc = pooling_backward::desc(
+          algorithm::pooling_max, output_dnn_data.GetUsrMemDesc(),
+          target_diff_dst_md,
+          memory::dims({pool_params.row_stride, pool_params.col_stride}),
+          memory::dims({pool_params.window_rows, pool_params.window_cols}),
+          memory::dims({static_cast<int>(pool_params.pad_top),
+                        static_cast<int>(pool_params.pad_left)}),
+          memory::dims({static_cast<int>(pool_params.pad_bottom),
+                        static_cast<int>(pool_params.pad_right)}),
+          TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_bkwd_prim_desc = pooling_backward::primitive_desc(
+          pool_bkwd_desc, cpu_engine, pool_fwd_prim_desc);
+
+      this->AllocateOutputTensor(context, pool_bkwd_prim_desc,
+                                 orig_input_dims_mkl_order,
+                                 this->data_format_mkldnn_, &output_tensor);
+      output_dnn_data.SetUsrMemDataHandle(output_tensor);
+
+      ConfigureWorkspace(workspace_tensor,
+                         pool_fwd_prim_desc.workspace_primitive_desc(),
+                         &workspace_dnn_data);
+      this->PrepareAndExecuteNet(
+          pool_bkwd_prim_desc, &grad_dnn_data, &output_dnn_data,
+          memory::primitive_desc(target_diff_dst_md, cpu_engine),
+          &workspace_dnn_data);
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:",
+                                              error_msg));
+    }
+  }  // Compute
+
+ private:
+  // .Input("orig_input: T")
+  // .Input("orig_output: T")
+  // .Input("grad: T")
+  // .Input("workspace: T")
+  const int kInputTensorIndexOrigInput = 0;
+  const int kInputTensorIndexOrigOutput = 1;
+  const int kInputTensorIndexGradient = 2;
+  const int kInputTensorIndexWorkspace = 3;
+  //  Output("output: T") in Base Class
+
+  memory::desc ConfigureOriginalInput(
+      OpKernelContext* context, const Tensor& tensor_original_input,
+      const MklDnnShape& original_input_mkl_shape,
+      memory::dims* original_input_dims_mkl_order,
+      MklPoolParameters* pool_params, TensorShape* input_tensor_shape) {
+    *input_tensor_shape = tensor_original_input.shape();
+    return MklPoolingBackwardOpBase<T>::ConfigureOriginalInput(
+        context, tensor_original_input, original_input_mkl_shape,
+        original_input_dims_mkl_order, pool_params, *input_tensor_shape);
+  }
+
+  void ConfigureWorkspace(const Tensor& workspace_tensor,
+                          memory::primitive_desc workspace_pd,
+                          MklDnnData<uint8>* workspace_dnn_data) {
+    CHECK_NOTNULL(workspace_dnn_data);
+
+    workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor);
+  }
+
+  void SanityCheckInputs(OpKernelContext* context,
+                         const Tensor& orig_input_tensor,
+                         const Tensor& orig_output_tensor,
+                         const Tensor& grad_tensor,
+                         const Tensor& workspace_tensor,
+                         const MklDnnShape& orig_input_mkl_shape,
+                         const MklDnnShape& orig_output_mkl_shape,
+                         const MklDnnShape& grad_mkl_shape,
+                         const MklDnnShape& workspace_mkl_shape) {
+    if (!orig_input_mkl_shape.IsMklTensor()) {
+      OP_REQUIRES(context, orig_input_tensor.dims() == 4,
+                  errors::InvalidArgument("Original input shape must be "
+                                          "4-dimensional"));
+    } else {
+      OP_REQUIRES(context, orig_input_mkl_shape.GetDimension() == 4,
+                  errors::InvalidArgument("Original input shape must be "
+                                          "4-dimensional"));
+    }
+    if (!orig_output_mkl_shape.IsMklTensor()) {
+      OP_REQUIRES(context, orig_output_tensor.dims() == 4,
+                  errors::InvalidArgument("Original output must be "
+                                          "4-dimensional"));
+    } else {
+      OP_REQUIRES(context, orig_output_mkl_shape.GetDimension() == 4,
+                  errors::InvalidArgument("Original output must be "
+                                          "4-dimensional"));
+    }
+    if (!grad_mkl_shape.IsMklTensor()) {
+      OP_REQUIRES(context, grad_tensor.dims() == 4,
+                  errors::InvalidArgument("Gradient must be 4-dimensional"));
+    } else {
+      OP_REQUIRES(context, grad_mkl_shape.GetDimension() == 4,
+                  errors::InvalidArgument("Gradient must be "
+                                          "4-dimensional"));
+    }
+    if (this->workspace_enabled_) {
+      // The workspace should not be an MKL tensor
+      OP_REQUIRES(context, workspace_mkl_shape.IsMklTensor() == false,
+                  errors::InvalidArgument("Workspace tensor should not"
+                                          " be an MKL Tensor."));
+      // It should only have one dimension
+      OP_REQUIRES(context, workspace_tensor.dims() == 1,
+                  errors::InvalidArgument("Workspace tensor must be "
+                                          "1-dimensional"));
+    } else {
+      OP_REQUIRES(
+          context, this->workspace_enabled_,
+          errors::Unimplemented("MKL-DNN Max Pooling does not "
+                                "yet support the use case "
+                                "where MaxPoolGrad is called without first"
+                                " calling MaxPool."));
+    }
+  }
+};  // MklMaxPoolingGradOp
+
+#endif  // INTEL_MKL_ML
+
 REGISTER_KERNEL_BUILDER(Name("_MklMaxPool")
                             .Device(DEVICE_CPU)
                             .TypeConstraint<float>("T")
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
index 65e8852cfb11a2dd78395860a7ca7b2cc550be34..5ef6ce2a5789034b338fe7308a6eca02f135befa 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
@@ -14,10 +14,13 @@ limitations under the License.
 ==============================================================================*/
 
 #ifdef INTEL_MKL
-#include <vector>
+
 #include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
+#include <limits>
+#include <vector>
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 
@@ -39,6 +42,7 @@ void MklPoolParameters::Init(OpKernelContext* context,
   Init(context, ksize, stride, padding, data_format);
 }
 
+#ifdef INTEL_MKL_ML
 // Initialization for MKL format
 void MklPoolParameters::Init(OpKernelContext* context,
                              const std::vector<int32>& ksize,
@@ -53,7 +57,22 @@ void MklPoolParameters::Init(OpKernelContext* context,
 
   Init(context, ksize, stride, padding, data_format);
 }
+#else
+// Initialization for MKL format
+void MklPoolParameters::Init(OpKernelContext* context,
+                             const std::vector<int32>& ksize,
+                             const std::vector<int32>& stride, Padding padding,
+                             TensorFormat data_format,
+                             const MklDnnShape* mklInputShape) {
+  // Get the input sizes
+  depth = mklInputShape->GetDimension('C');
+  tensor_in_cols = mklInputShape->GetDimension('W');
+  tensor_in_rows = mklInputShape->GetDimension('H');
+  tensor_in_batch = mklInputShape->GetDimension('N');
 
+  Init(context, ksize, stride, padding, data_format);
+}
+#endif  // INTEL_MKL_ML
 // Common Initialization for TensorFlow and MKL formats
 void MklPoolParameters::Init(OpKernelContext* context,
                              const std::vector<int32>& ksize,
@@ -80,7 +99,7 @@ void MklPoolParameters::Init(OpKernelContext* context,
                   "MaxPooling supports exactly one of pooling across depth "
                   "or pooling across width/height."));
 
-  if (depth_window == 1) {
+  if (depth_window == 1) {  // we are pooling in the H and W
     OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
                                 tensor_in_rows, window_rows, row_stride,
                                 padding, &out_height, &pad_top, &pad_bottom));
@@ -88,7 +107,21 @@ void MklPoolParameters::Init(OpKernelContext* context,
     OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
                                 tensor_in_cols, window_cols, col_stride,
                                 padding, &out_width, &pad_left, &pad_right));
-  } else {
+#ifndef INTEL_MKL_ML
+    // TF can work with int64, but mkldnn only supports int32
+    // Fail if the height or width are greater than MAX_INT
+
+    OP_REQUIRES(context,
+                FastBoundsCheck(out_height, std::numeric_limits<int>::max()),
+                errors::InvalidArgument("output height is too large"));
+
+    OP_REQUIRES(context,
+                FastBoundsCheck(out_width, std::numeric_limits<int>::max()),
+                errors::InvalidArgument("output width is too large"));
+
+#endif
+    out_depth = depth;  // output will have the same depth as the input
+  } else {              // we are pooling in the depth dimension
     // Our current version of depthwise max pooling does not support
     // any padding, and expects the depth_window to equal the depth
     // stride (no overlapping).
@@ -109,7 +142,6 @@ void MklPoolParameters::Init(OpKernelContext* context,
                 errors::Unimplemented("Depthwise max pooling is currently "
                                       "only implemented for CPU devices."));
 
-    pad_depth = 0;
     out_depth = depth / depth_window;
   }
 }
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h
index 92ea2beb25aa1fd4cab7fd787b04c4d086ca1b05..279167aba24863441774b0665e9793e52d84ccfa 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.h
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h
@@ -17,10 +17,19 @@ limitations under the License.
 #define TENSORFLOW_CORE_KERNELS_MKL_POOLING_OPS_COMMON_H_
 
 #ifdef INTEL_MKL
+#include <string>
 #include <vector>
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/padding.h"
 
+#ifndef INTEL_MKL_ML
+#include "mkldnn.hpp"
+using mkldnn::memory;
+using mkldnn::pooling_backward;
+using mkldnn::pooling_forward;
+using mkldnn::stream;
+#endif
+
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -51,14 +60,40 @@ struct MklPoolParameters {
   int pad_depth;
 
   TensorFormat data_format;
+  MklPoolParameters()
+      : depth(0),
+        tensor_in_cols(0),
+        tensor_in_rows(0),
+        tensor_in_batch(0),
+        window_rows(0),
+        window_cols(0),
+        depth_window(0),
+        row_stride(0),
+        col_stride(0),
+        depth_stride(0),
+        out_height(0),
+        out_width(0),
+        out_depth(0),
+        pad_left(0),
+        pad_right(0),
+        pad_top(0),
+        pad_bottom(0),
+        pad_depth(0),
+        data_format(TensorFormat::FORMAT_NCHW) {}
 
   // Updates context->status if there is an invalid input.
   void Init(OpKernelContext* context, const std::vector<int32>& ksize,
             const std::vector<int32>& stride, Padding padding,
             TensorFormat data_format, const TensorShape& tensor_in_shape);
+#ifdef INTEL_MKL_ML
   void Init(OpKernelContext* context, const std::vector<int32>& ksize,
             const std::vector<int32>& stride, Padding padding,
             TensorFormat data_format, const MklShape* mkl_in_shape);
+#else
+  void Init(OpKernelContext* context, const std::vector<int32>& ksize,
+            const std::vector<int32>& stride, Padding padding,
+            TensorFormat data_format, const MklDnnShape* mkl_in_shape);
+#endif
 
  private:
   // Common initialization for TensorFlow and MKL formats
@@ -67,6 +102,301 @@ struct MklPoolParameters {
             TensorFormat data_format);
 };
 
+#ifndef INTEL_MKL_ML
+
+template <class T>
+class MklPoolingOpBase : public OpKernel {
+ public:
+  explicit MklPoolingOpBase(OpKernelConstruction* context)
+      : OpKernel(context), workspace_enabled_(false) {
+    string data_format;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+    OP_REQUIRES(context, FormatFromString(data_format, &this->data_format_tf_),
+                errors::InvalidArgument("Invalid data format"));
+    this->data_format_mkldnn_ =
+        TFDataFormatToMklDnnDataFormat(this->data_format_tf_);
+    OP_REQUIRES_OK(context, context->GetAttr("ksize", &this->ksize_));
+    OP_REQUIRES(context, this->ksize_.size() == 4,
+                errors::InvalidArgument("Sliding window ksize field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES_OK(context, context->GetAttr("strides", &this->stride_));
+    OP_REQUIRES(context, this->stride_.size() == 4,
+                errors::InvalidArgument("Sliding window strides field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &this->padding_));
+    OP_REQUIRES(context, this->ksize_[0] == 1 && this->stride_[0] == 1,
+                errors::Unimplemented("Pooling is not yet supported on the "
+                                      "batch dimension."));
+
+    // We may not get this attribute for this node if it does not go through
+    // graph rewrite pass. So we do not check for error while retrieving this
+    // attribute value.
+    context->GetAttr("workspace_enabled", &this->workspace_enabled_);
+  }
+  void Compute(OpKernelContext* context) override = 0;
+
+ protected:
+  // Calculate output shape of pooling op in MKL-DNN and TensorFlow order.
+  // MKL-DNN uses NCHW for output order. But TensorFlow output will be in
+  // NHWC or NCHW format depending on data format. Function expects
+  // output height and output width to have already been int32
+  // bounds-checked
+  void GetOutputDims(const MklPoolParameters& mkl_pool_params,
+                     memory::dims* output_dims_mkl_order) {
+    // MKL-DNN always needs output in NCHW format.
+    *output_dims_mkl_order = {mkl_pool_params.tensor_in_batch,
+                              mkl_pool_params.out_depth,
+                              static_cast<int>(mkl_pool_params.out_height),
+                              static_cast<int>(mkl_pool_params.out_width)};
+  }
+
+  void InitMklPoolParameters(OpKernelContext* context,
+                             MklPoolParameters* pool_params,
+                             const MklDnnShape& original_input_mkl_shape,
+                             const TensorShape& input_tensor_shape) {
+    if (!original_input_mkl_shape.IsMklTensor()) {
+      pool_params->Init(context, this->ksize_, this->stride_, this->padding_,
+                        this->data_format_tf_, input_tensor_shape);
+    } else {
+      pool_params->Init(context, this->ksize_, this->stride_, this->padding_,
+                        this->data_format_tf_, &original_input_mkl_shape);
+    }
+  }
+
+  // Checks to make sure that the memory we need to allocate
+  // is a multiple of sizeof(T)
+  // returns the number of elements
+  size_t GetNumTElements(const memory::primitive_desc& pd) {
+    size_t num_bytes = pd.get_size();
+    size_t ret_val = num_bytes / sizeof(T);
+    if (num_bytes % sizeof(T) != 0) {
+      ret_val++;
+    }
+    return ret_val;
+  }
+
+  std::vector<int32> ksize_;
+  std::vector<int32> stride_;
+  Padding padding_;
+  TensorFormat data_format_tf_;
+  memory::format data_format_mkldnn_;
+  bool workspace_enabled_;
+};
+
+template <class T>
+class MklPoolingForwardOpBase : public MklPoolingOpBase<T> {
+ public:
+  explicit MklPoolingForwardOpBase<T>(OpKernelConstruction* context)
+      : MklPoolingOpBase<T>(context) {}
+  void Compute(OpKernelContext* context) override = 0;
+
+ protected:
+  void ConfigureInput(OpKernelContext* context,
+                      const MklDnnShape& input_mkl_shape,
+                      const Tensor& input_tensor,
+                      MklPoolParameters* pool_params,
+                      MklDnnData<T>* dnn_data_input) {
+    CHECK_NOTNULL(pool_params);
+    CHECK_NOTNULL(dnn_data_input);
+    TensorShape input_tensor_shape = input_tensor.shape();
+    memory::desc input_md =
+        input_mkl_shape.IsMklTensor()
+            ? input_mkl_shape.GetMklLayout()
+            : memory::desc(TFShapeToMklDnnDimsInNCHW(input_tensor_shape,
+                                                     this->data_format_tf_),
+                           MklDnnType<T>(), this->data_format_mkldnn_);
+    dnn_data_input->SetUsrMem(input_md, &input_tensor);
+    this->InitMklPoolParameters(context, pool_params, input_mkl_shape,
+                                input_tensor_shape);
+  }
+
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const pooling_forward::primitive_desc& pool_fwd_prim_desc,
+      const memory::dims output_dims_mkl_order,
+      const memory::format& output_tf_format, Tensor** output_tensor) {
+    CHECK_NOTNULL(output_tensor);
+    memory::primitive_desc dst_pd = pool_fwd_prim_desc.dst_primitive_desc();
+
+    MklDnnShape output_mkl_shape;
+    output_mkl_shape.SetMklTensor(true);
+    output_mkl_shape.SetMklLayout(&dst_pd);
+    output_mkl_shape.SetElemType(MklDnnType<T>());
+    output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+                                 output_dims_mkl_order, output_tf_format);
+    TensorShape output_tf_shape;
+
+    // only allocate enough space for the elements we need.
+    output_tf_shape.AddDim(this->GetNumTElements(dst_pd));
+    AllocateOutputSetMklShape(context, kOutputTensorIndexOutput, output_tensor,
+                              output_tf_shape, output_mkl_shape);
+    CHECK_NOTNULL(*output_tensor);
+  }
+
+  void PrepareAndExecuteNet(
+      const pooling_forward::primitive_desc& pool_fwd_desc,
+      const MklDnnData<T>* src, MklDnnData<T>* dst,
+      MklDnnData<uint8>* wksp = nullptr) {
+    std::vector<primitive> net;
+
+    // Create pooling primitive and add it to net
+    if (wksp != nullptr) {
+      net.push_back(pooling_forward(pool_fwd_desc, src->GetOpMem(),
+                                    dst->GetOpMem(), wksp->GetOpMem()));
+    } else {
+      net.push_back(
+          pooling_forward(pool_fwd_desc, src->GetOpMem(), dst->GetOpMem()));
+    }
+    stream(stream::kind::eager).submit(net).wait();
+  }
+
+  void SanityCheckInput(OpKernelContext* context, const Tensor& input_tensor,
+                        const MklDnnShape& input_mkl_shape) {
+    if (!input_mkl_shape.IsMklTensor()) {
+      OP_REQUIRES(context, input_tensor.dims() == 4,
+                  errors::InvalidArgument("Input must be 4-dimensional"));
+    } else {
+      OP_REQUIRES(context, input_mkl_shape.GetDimension() == 4,
+                  errors::InvalidArgument("Input shape must be "
+                                          "4-dimensional"));
+    }
+  }
+  // .Input("value: T")
+  // .Output("output: T")
+  const int kInputTensorIndexInput = 0;
+  const int kOutputTensorIndexOutput = 0;
+};  // MklPoolingForwardBaseOp
+
+template <class T>
+class MklPoolingBackwardOpBase : public MklPoolingOpBase<T> {
+ public:
+  explicit MklPoolingBackwardOpBase<T>(OpKernelConstruction* context)
+      : MklPoolingOpBase<T>(context) {}
+  void Compute(OpKernelContext* context) override = 0;
+
+ protected:
+  const int kOutputTensorIndexOutput = 0;
+
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const pooling_backward::primitive_desc& pool_bkwd_prim_desc,
+      const memory::dims output_dims_mkl_order,
+      const memory::format& output_tf_format, Tensor** output_tensor) {
+    CHECK_NOTNULL(output_tensor);
+    memory::primitive_desc dst_pd =
+        pool_bkwd_prim_desc.diff_src_primitive_desc();
+    MklDnnShape output_mkl_shape;
+    output_mkl_shape.SetMklTensor(true);
+    output_mkl_shape.SetMklLayout(&dst_pd);
+    output_mkl_shape.SetElemType(MklDnnType<T>());
+    output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+                                 output_dims_mkl_order, output_tf_format);
+
+    TensorShape output_tf_shape;
+    output_tf_shape.AddDim(this->GetNumTElements(dst_pd));
+    AllocateOutputSetMklShape(context, kOutputTensorIndexOutput, output_tensor,
+                              output_tf_shape, output_mkl_shape);
+    CHECK_NOTNULL(*output_tensor);
+  }
+
+  void PrepareAndExecuteNet(
+      const pooling_backward::primitive_desc& pool_bkwd_desc,
+      MklDnnData<T>* input_gradient_diff_dst, MklDnnData<T>* output_diff_src,
+      const memory::primitive_desc& target_diff_dst_pd,
+      const MklDnnData<uint8>* workspace = nullptr) {
+    std::vector<primitive> net;
+
+    // If the input gradient isn't in the same format as the output
+    // reorder it to the same format as the output
+    input_gradient_diff_dst->CheckReorderToOpMem(target_diff_dst_pd, &net);
+
+    // Create pooling primitive and add it to net
+    if (nullptr == workspace) {
+      net.push_back(pooling_backward(pool_bkwd_desc,
+                                     input_gradient_diff_dst->GetOpMem(),
+                                     output_diff_src->GetOpMem()));
+    } else {
+      net.push_back(
+          pooling_backward(pool_bkwd_desc, input_gradient_diff_dst->GetOpMem(),
+                           workspace->GetOpMem(), output_diff_src->GetOpMem()));
+    }
+    stream(stream::kind::eager).submit(net).wait();
+  }
+
+  // Max Pooling and Avg Pooling have slightly different implementations
+  // Takes the Tensor containing original input data and the original
+  // mkl Dnn Shape and populates other data
+  memory::desc ConfigureOriginalInput(
+      OpKernelContext* context, const Tensor& tensor_original_input_shape,
+      const MklDnnShape& original_input_mkl_shape,
+      memory::dims* original_input_dims_nchw, MklPoolParameters* pool_params,
+      const TensorShape& input_tensor_shape) {
+    CHECK_NOTNULL(original_input_dims_nchw);
+    CHECK_NOTNULL(pool_params);
+    this->InitMklPoolParameters(context, pool_params, original_input_mkl_shape,
+                                input_tensor_shape);
+
+    *original_input_dims_nchw =
+        original_input_mkl_shape.IsMklTensor()
+            ? original_input_mkl_shape.GetSizesAsMklDnnDims()
+            : TFShapeToMklDnnDimsInNCHW(input_tensor_shape,
+                                        this->data_format_tf_);
+
+    return original_input_mkl_shape.IsMklTensor()
+               ? original_input_mkl_shape.GetMklLayout()
+               : memory::desc(*original_input_dims_nchw, MklDnnType<T>(),
+                              this->data_format_mkldnn_);
+  }
+
+  memory::desc ConfigureOriginalOutput(
+      const MklPoolParameters& pool_params,
+      const MklDnnShape& original_output_mkl_shape,
+      memory::dims output_dims_mkl_order) {
+    this->GetOutputDims(pool_params, &output_dims_mkl_order);
+
+    return original_output_mkl_shape.IsMklTensor()
+               ? original_output_mkl_shape.GetMklLayout()
+               : memory::desc(output_dims_mkl_order, MklDnnType<T>(),
+                              this->data_format_mkldnn_);
+  }
+
+  memory::desc ConfigureInputGradient(
+      const MklDnnShape& input_gradient_mkl_shape,
+      const Tensor& input_gradient_tensor,
+      MklDnnData<T>* input_gradient_dnn_data,
+      const memory::desc& original_output_md) {
+    // Configure the gradient as is
+    memory::desc original_input_grad_md =
+        input_gradient_mkl_shape.IsMklTensor()
+            ? input_gradient_mkl_shape.GetMklLayout()
+            : memory::desc(
+                  TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(),
+                                            this->data_format_tf_),
+                  MklDnnType<T>(), this->data_format_mkldnn_);
+
+    input_gradient_dnn_data->SetUsrMem(original_input_grad_md,
+                                       &input_gradient_tensor);
+
+    // Check to see if input grad diff dst is in the right format
+    // Create a new memory descriptor with the same shape as the
+    // original, but the format of the other tensors.
+    memory::format original_output_format =
+        static_cast<memory::format>(original_output_md.data.format);
+    bool grad_reorder_needed =
+        input_gradient_dnn_data->IsReorderNeeded(original_output_format);
+    memory::dims diff_dst_dims =
+        input_gradient_mkl_shape.IsMklTensor()
+            ? input_gradient_mkl_shape.GetSizesAsMklDnnDims()
+            : TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(),
+                                        this->data_format_tf_);
+    memory::desc target_diff_dst_md =
+        memory::desc(diff_dst_dims, MklDnnType<T>(), original_output_format);
+
+    return grad_reorder_needed ? target_diff_dst_md : original_input_grad_md;
+  }
+};
+#endif  // INTEL_MKL_ML
+
 //-------------------------------------------------------------------
 // Utility functions
 
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index 86a77d769a52d7592d15627b504ae60278b45058..51db3991e2a24f087771f571cd91fc9fbb26040b 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -16,17 +16,30 @@ limitations under the License.
 // See docs in ../ops/nn_ops.cc.
 #ifdef INTEL_MKL
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
-#include "tensorflow/core/platform/default/logging.h"
-#include "tensorflow/core/util/mkl_util.h"
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
+#include "tensorflow/core/platform/default/logging.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+#ifndef INTEL_MKL_ML
+#include "mkldnn.hpp"
+
+using mkldnn::algorithm;
+using mkldnn::eltwise_elu;
+using mkldnn::eltwise_relu;
+using mkldnn::eltwise_tanh;
+using mkldnn::prop_kind;
+using mkldnn::relu_backward;
+using mkldnn::relu_forward;
+using mkldnn::stream;
+#endif
 
 namespace tensorflow {
 
@@ -45,6 +58,8 @@ struct MklReluHelpers {
   }
 };
 
+#ifdef INTEL_MKL_ML
+
 template <typename Device, typename T>
 class MklReluOp : public OpKernel {
  public:
@@ -59,6 +74,7 @@ class MklReluOp : public OpKernel {
     GetMklShape(context, 0, &mkl_context.input_shape);
     void* user_i = static_cast<void*>(const_cast<T*>(input.flat<T>().data()));
     bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
+
     if (!input_in_mkl_format && !input.dims()) {  // handle the case of a scalar
       const TensorShape& o_shape = input.shape();
       Tensor* out_tensor = nullptr;
@@ -189,15 +205,16 @@ class MklReluGradOp : public OpKernel {
       const Tensor& a = MklGetInput(context, 1);
       void* buf_input = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
       void* mkl_buffer_convert = nullptr;
+
       dnnPrimitive_t cv_input_to_grad = nullptr;
 
-      // if input and grad are not in the same layout, do a conversion between
-      // them.
+      // if input and grad are not in the same layout,
+      // do a conversion between them.
       if (!dnnLayoutCompare_F32(lt_input, lt_grad)) {
         AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_grad,
                        &mkl_buffer_convert);
-        CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input,
-                   lt_grad), E_SUCCESS);
+        CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input, lt_grad),
+                 E_SUCCESS);
         CHECK_EQ(dnnConversionExecute_F32(cv_input_to_grad, buf_input,
                                           mkl_buffer_convert),
                  E_SUCCESS);
@@ -246,7 +263,6 @@ class MklReluGradOp : public OpKernel {
 };
 
 template <typename Device, typename T>
-
 void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
   MklReluGradOpContext mkl_context;
   const Tensor& g = MklGetInput(context, 0);
@@ -264,20 +280,21 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
       !MklReluHelpers::ValidateSameSize(context, g, a))
     return;
   Tensor* output = nullptr;
-  if (!input_is_mkl && !grad_is_mkl &&
-      !a.dims()) {  // handle the case of a scalar
-    // Allocate space for g and
+
+  if (!input_is_mkl && !grad_is_mkl && !a.dims()) {
+    // handle the scalar case
     const TensorShape& g_shape = g.shape();
     mkl_context.output_shape.SetMklTensor(false);
     AllocateOutputSetMklShape(context, 0, &output, g_shape,
                               mkl_context.output_shape);
+
     void* out_o = static_cast<void*>(output->flat<T>().data());
     (static_cast<T*>(out_o))[0] =
         (static_cast<T*>(user_g))[0] * ((static_cast<T*>(user_i))[0] > 0);
     return;
   }
 
-  // Generate size, stride for input if input/grad is in MKL format.
+  // generate size, stride for input if input/grad is in mkl format.
   if (grad_is_mkl || input_is_mkl) {
     const MklShape* tmp_mkl_shape =
         (grad_is_mkl) ? &mkl_context.grad_shape : &mkl_context.input_shape;
@@ -314,15 +331,15 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
   mkl_context.MklPrepareReluGradInputs(context, &mkl_tmp_input_buf_tensor);
 
   if (input_is_mkl ||
-      grad_is_mkl) { /*if  grad or input are MKL leave it in MKL*/
+      grad_is_mkl) { /*if  grad or input are mkl leave it in mkl*/
     TensorShape tf_shape;
     mkl_context.output_shape.SetMklTensor(true);
     mkl_context.output_shape.SetMklLayout(mkl_context.prim_relu_bwd,
                                           dnnResourceDiffSrc);
     mkl_context.output_shape.SetTfLayout(
         mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
-    // If input_is_mkl or grad_is_mkl, then we copy strides and sizes from Mkl
-    // shape of one that is in MKL layout.
+    // if input_is_mkl or grad_is_mkl, then we copy strides and sizes from mkl
+    // shape of one that is in mkl layout.
     if (grad_is_mkl == true) {
       mkl_context.output_shape.SetTfDimOrder(
           mkl_context.in_dims, mkl_context.grad_shape.GetTfToMklDimMap());
@@ -336,7 +353,6 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
                     sizeof(T));
     AllocateOutputSetMklShape(context, 0, &output, tf_shape,
                               mkl_context.output_shape);
-
   } else {
     const TensorShape& o_shape = g.shape();
     mkl_context.output_shape.SetMklTensor(false);
@@ -352,8 +368,474 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
   mkl_context.MklCleanup();
 }
 
-/* Register DNN kernels for supported operations and supported types - right now
- * it is only Relu and f32*/
+#else  // INTEL_MKL_ML
+
+template <typename Device, typename T, algorithm alg_kind>
+class MklReluOpBase : public OpKernel {
+ public:
+  ~MklReluOpBase() {}
+
+  explicit MklReluOpBase(OpKernelConstruction* context) : OpKernel(context) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) = 0;
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      const size_t src_index = 0;  // index of src input tensor
+      const size_t dst_index = 0;  // index of dst output tensor
+      const Tensor& src_tensor = MklGetInput(context, src_index);
+      MklDnnShape dnn_shape_src;
+      GetMklShape(context, src_index, &dnn_shape_src);
+
+      Tensor* dst_tensor = nullptr;
+      if (src_tensor.dims() == 0) {
+        Compute_Scalar(context);
+        return;
+      }
+
+      // Create relu primitive.
+      MklDnnData<T> src(&cpu_engine);
+      MklDnnData<T> dst(&cpu_engine);
+
+      // Set DNN primitive - src
+      memory::desc src_md({}, memory::data_undef, memory::format_undef);
+      if (dnn_shape_src.IsMklTensor()) {
+        src_md = dnn_shape_src.GetMklLayout();
+      } else {
+        auto src_dims = TFShapeToMklDnnDims(src_tensor.shape());
+        auto src_strides = CalculateTFStrides(src_dims);
+        // Create blocked memory descriptor
+        src_md = MklDnnData<T>::CreateBlockedMemDesc(src_dims, src_strides);
+      }
+      src.SetUsrMem(src_md, &src_tensor);
+
+      T alpha = 0, beta = 0;
+      std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
+      auto relu_fwd_desc = relu_forward::desc(
+          prop_kind::forward_training,
+          // Operator memory descriptor is same as user memory descriptor.
+          alg_kind, src.GetUsrMemDesc(), alpha, beta);
+      relu_fwd_pd.reset(
+          new relu_forward::primitive_desc(relu_fwd_desc, cpu_engine));
+
+      // allocate dst tensor
+      MklDnnShape dnn_shape_dst;
+      TensorShape tf_shape_dst;
+      if (dnn_shape_src.IsMklTensor()) {
+        dnn_shape_dst.SetMklTensor(true);
+        auto dst_pd = relu_fwd_pd->dst_primitive_desc();
+        dnn_shape_dst.SetMklLayout(&dst_pd);
+        dnn_shape_dst.SetElemType(MklDnnType<T>());
+        dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(),
+                                  dnn_shape_src.GetSizesAsMklDnnDims(),
+                                  dnn_shape_src.GetTfDataFormat());
+        tf_shape_dst.AddDim(dst_pd.get_size() / sizeof(T));
+      } else {
+        dnn_shape_dst.SetMklTensor(false);
+        tf_shape_dst = src_tensor.shape();
+      }
+      AllocateOutputSetMklShape(context, dst_index, &dst_tensor, tf_shape_dst,
+                                dnn_shape_dst);
+
+      // Destination memory descriptor is same as source memory descriptor.
+      auto dst_md = src_md;
+      dst.SetUsrMem(dst_md, dst_tensor);
+
+      // execute net
+      std::vector<primitive> net;
+      auto relu_fwd =
+          relu_forward(*relu_fwd_pd, src.GetOpMem(), dst.GetOpMem());
+      net.push_back(relu_fwd);
+      stream(stream::kind::eager).submit(net).wait();
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
+    }
+  }
+};
+
+template <typename Device, typename T, algorithm alg_kind>
+class MklReluGradOpBase : public OpKernel {
+ public:
+  ~MklReluGradOpBase() {}
+
+  explicit MklReluGradOpBase(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) = 0;
+
+  void Compute(OpKernelContext* context) {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      MklDnnData<T> src(&cpu_engine);
+      MklDnnData<T> diff_dst(&cpu_engine);
+      MklDnnData<T> diff_src(&cpu_engine);
+
+      const size_t diff_dst_index = 0;  // index of diff_dst input tensor
+      const size_t src_index = 1;       // index of src input tensor
+      const size_t diff_src_index = 0;  // index of diff_src output tensor
+
+      const Tensor& src_tensor = MklGetInput(context, src_index);
+      const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
+      Tensor* diff_src_tensor = nullptr;
+
+      MklDnnShape dnn_shape_src, dnn_shape_diff_dst;
+      GetMklShape(context, src_index, &dnn_shape_src);
+      GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+
+      int src_dims_size = src_tensor.dims();
+      if (src_dims_size == 0) {
+        Compute_Scalar(context);
+        return;
+      }
+
+      // Set DNN primitives for src & diff_dst
+      memory::desc src_md({}, memory::data_undef, memory::format_undef);
+      memory::desc diff_dst_md({}, memory::data_undef, memory::format_undef);
+
+      // For creating Sum primitive, we need to ensure that all inputs are in
+      // same format. What that means is if we have a mixed input case - where
+      // one input is in Tensorflow format and one input is in MKL format -,
+      // then we need to ensure that all inputs are in same format for
+      // primitive construction. For performance reason, we say that all inputs
+      // are in MKL format in such case, and insert reorder for input that is
+      // in Tensorflow format into MKL format. On the other hand, if both the
+      // inputs are in MKL format or both are in Tensorflow format, then we
+      // dont need reorder.
+      if (!dnn_shape_src.IsMklTensor() && !dnn_shape_diff_dst.IsMklTensor()) {
+        // If both the inputs are in Tensorflow format, we create blocked memory
+        // descriptor.
+        auto src_dims = TFShapeToMklDnnDims(src_tensor.shape());
+        auto src_strides = CalculateTFStrides(src_dims);
+        src_md = MklDnnData<T>::CreateBlockedMemDesc(src_dims, src_strides);
+        diff_dst_md = src_md;
+      } else if (dnn_shape_src.IsMklTensor() &&
+                 !dnn_shape_diff_dst.IsMklTensor()) {
+        // If one input is in MKL format and other is in Tensorflow, then
+        // create respective descriptors describing the actual case. For input
+        // in Mkl format, we just get Mkl layout from MklDnnShape. For input in
+        // Tensorflow format, we create memory descriptor using data format.
+        src_md = dnn_shape_src.GetMklLayout();
+
+        memory::format src_mkl_data_format = dnn_shape_src.GetTfDataFormat();
+        auto src_tf_data_format =
+            MklDnnDataFormatToTFDataFormat(src_mkl_data_format);
+        auto diff_dst_dims = TFShapeToMklDnnDimsInNCHW(diff_dst_tensor.shape(),
+                                                       src_tf_data_format);
+        diff_dst_md =
+            memory::desc(diff_dst_dims, MklDnnType<T>(), src_mkl_data_format);
+      } else if (!dnn_shape_src.IsMklTensor() &&
+                 dnn_shape_diff_dst.IsMklTensor()) {
+        // Same comment as above.
+        diff_dst_md = dnn_shape_diff_dst.GetMklLayout();
+
+        memory::format diff_dst_mkl_data_format =
+            dnn_shape_diff_dst.GetTfDataFormat();
+        auto diff_dst_tf_data_format =
+            MklDnnDataFormatToTFDataFormat(diff_dst_mkl_data_format);
+        auto src_dims = TFShapeToMklDnnDimsInNCHW(src_tensor.shape(),
+                                                  diff_dst_tf_data_format);
+        src_md =
+            memory::desc(src_dims, MklDnnType<T>(), diff_dst_mkl_data_format);
+      } else {
+        // If both the inputs are in MKL format, we use Mkl layout of the input
+        // tensors.
+        src_md = dnn_shape_src.GetMklLayout();
+        diff_dst_md = dnn_shape_diff_dst.GetMklLayout();
+      }
+
+      src.SetUsrMem(src_md, &src_tensor);
+      diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
+
+      // As per comment above, we tell MKLDNN that both the inputs are in same
+      // format. So we set common memory descriptor in MKL format, if any of the
+      // inputs are in MKL format. Let's get memory descriptor that we will use
+      // for both the inputs.
+      memory::desc common_md({}, memory::data_undef, memory::format_undef);
+      if (dnn_shape_src.IsMklTensor() || dnn_shape_diff_dst.IsMklTensor()) {
+        common_md = dnn_shape_src.IsMklTensor() ? src_md : diff_dst_md;
+      } else {
+        // Since both the inputs are in Tensorflow format, and have
+        // same shape, we can get memory descriptor from any input.
+        common_md = src_md;
+      }
+
+      T alpha = 0, beta = 0;
+      std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
+      auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training,
+                                              alg_kind, src_md, alpha, beta);
+      relu_fwd_pd.reset(
+          new relu_forward::primitive_desc(relu_fwd_desc, cpu_engine));
+      auto relu_bwd_desc =
+          relu_backward::desc(alg_kind, common_md, common_md, alpha, beta);
+      auto relu_bwd_pd = relu_backward::primitive_desc(
+          relu_bwd_desc, cpu_engine, *relu_fwd_pd);
+
+      // allocate diff_src tensor
+      MklDnnShape dnn_shape_diff_src;
+      TensorShape tf_shape_diff_src;
+      if (dnn_shape_src.IsMklTensor()) {
+        dnn_shape_diff_src.SetMklTensor(true);
+        auto diff_src_pd = relu_bwd_pd.diff_src_primitive_desc();
+        dnn_shape_diff_src.SetMklLayout(&diff_src_pd);
+        dnn_shape_diff_src.SetElemType(MklDnnType<T>());
+        dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(),
+                                       dnn_shape_src.GetSizesAsMklDnnDims(),
+                                       dnn_shape_src.GetTfDataFormat());
+        tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T));
+      } else {
+        dnn_shape_diff_src.SetMklTensor(false);
+        tf_shape_diff_src = src_tensor.shape();
+      }
+      AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+                                tf_shape_diff_src, dnn_shape_diff_src);
+
+      // diff_src memory descriptor is same as memory descriptor for both
+      // inputs.
+      diff_src.SetUsrMem(common_md, diff_src_tensor);
+
+      PrepareAndExecuteNet(relu_bwd_pd, &src, &diff_src, &diff_dst);
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
+    }
+  }
+
+  void PrepareAndExecuteNet(const relu_backward::primitive_desc& relu_prim_desc,
+                            MklDnnData<T>* src, MklDnnData<T>* diff_src,
+                            MklDnnData<T>* diff_dst) {
+    std::vector<primitive> net;
+
+    // Check if we need to reorder original input tensors into common_md layout
+    // that we set for primitive creation. diff_src_primitive_desc is same as
+    // common_md.
+    src->CheckReorderToOpMem(relu_prim_desc.diff_src_primitive_desc(), &net);
+    diff_dst->CheckReorderToOpMem(relu_prim_desc.diff_src_primitive_desc(),
+                                  &net);
+
+    net.push_back(relu_backward(relu_prim_desc, src->GetOpMem(),
+                                diff_dst->GetOpMem(), diff_src->GetOpMem()));
+    stream(stream::kind::eager).submit(net).wait();
+  }
+};
+
+template <typename Device, typename T>
+class MklReluOp : public MklReluOpBase<Device, T, eltwise_relu> {
+ public:
+  ~MklReluOp() {}
+
+  explicit MklReluOp(OpKernelConstruction* context)
+      : MklReluOpBase<Device, T, eltwise_relu>(context) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t src_index = 0;  // index of src input tensor
+    const size_t dst_index = 0;  // index of dst output tensor
+    const Tensor& src_tensor = MklGetInput(context, src_index);
+    MklDnnShape dnn_shape_src;
+    GetMklShape(context, src_index, &dnn_shape_src);
+
+    Tensor* dst_tensor = nullptr;
+    void* user_i =
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+    MklDnnShape dnn_shape_dst;
+    dnn_shape_dst.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
+                              src_tensor.shape(), dnn_shape_dst);
+    void* out_o = static_cast<void*>(dst_tensor->flat<T>().data());
+    (static_cast<T*>(out_o))[0] =
+        std::max((static_cast<T*>(user_i))[0], static_cast<T>(0));
+    return;
+  }
+};
+
+template <typename Device, typename T>
+class MklReluGradOp : public MklReluGradOpBase<Device, T, eltwise_relu> {
+ public:
+  ~MklReluGradOp() {}
+
+  explicit MklReluGradOp(OpKernelConstruction* context)
+      : MklReluGradOpBase<Device, T, eltwise_relu>(context) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t diff_dst_index = 0;  // index of diff_dst input tensor
+    const size_t src_index = 1;       // index of src input tensor
+    const size_t diff_src_index = 0;  // index of diff_src output tensor
+    const Tensor& src_tensor = MklGetInput(context, src_index);
+    const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
+    Tensor* diff_src_tensor = nullptr;
+
+    MklDnnShape dnn_shape_diff_dst;
+    GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+
+    MklDnnShape dnn_shape_diff_src;
+    dnn_shape_diff_src.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+                              diff_dst_tensor.shape(), dnn_shape_diff_src);
+    void* out_o = static_cast<void*>(diff_src_tensor->flat<T>().data());
+    void* user_i =
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+    void* user_g =
+        static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
+    (static_cast<T*>(out_o))[0] =
+        (static_cast<T*>(user_g))[0] * ((static_cast<T*>(user_i))[0] > 0);
+    return;
+  }
+};
+
+template <typename Device, typename T>
+class MklEluOp : public MklReluOpBase<Device, T, eltwise_elu> {
+ public:
+  ~MklEluOp() {}
+
+  explicit MklEluOp(OpKernelConstruction* context)
+      : MklReluOpBase<Device, T, eltwise_elu>(context) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t src_index = 0;  // index of src input tensor
+    const size_t dst_index = 0;  // index of dst output tensor
+    const Tensor& src_tensor = MklGetInput(context, src_index);
+    MklDnnShape dnn_shape_src;
+    GetMklShape(context, src_index, &dnn_shape_src);
+
+    Tensor* dst_tensor = nullptr;
+    void* user_i =
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+    MklDnnShape dnn_shape_dst;
+    dnn_shape_dst.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
+                              src_tensor.shape(), dnn_shape_dst);
+    void* out_o = static_cast<void*>(dst_tensor->flat<T>().data());
+    // return exp(feature) - 1 if feature > 0; feature otherwise
+    T feature = (static_cast<T*>(user_i))[0];
+    if (feature < 0)
+      (static_cast<T*>(out_o))[0] = std::exp(feature);
+    else
+      (static_cast<T*>(out_o))[0] = feature;
+    return;
+  }
+};
+
+template <typename Device, typename T>
+class MklEluGradOp : public MklReluGradOpBase<Device, T, eltwise_elu> {
+ public:
+  ~MklEluGradOp() {}
+
+  explicit MklEluGradOp(OpKernelConstruction* context)
+      : MklReluGradOpBase<Device, T, eltwise_elu>(context) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t diff_dst_index = 0;  // index of diff_dst input tensor
+    const size_t src_index = 1;       // index of src input tensor
+    const size_t diff_src_index = 0;  // index of diff_src output tensor
+    const Tensor& src_tensor = MklGetInput(context, src_index);
+    const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
+    Tensor* diff_src_tensor = nullptr;
+
+    MklDnnShape dnn_shape_diff_dst;
+    GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+
+    MklDnnShape dnn_shape_diff_src;
+    dnn_shape_diff_src.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+                              diff_dst_tensor.shape(), dnn_shape_diff_src);
+    void* out_o = static_cast<void*>(diff_src_tensor->flat<T>().data());
+    void* user_i =
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+    void* user_g =
+        static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
+    // gradient of elu(x) = 1 if x > 0; elu(x) + 1 otherwise
+    T feature = (static_cast<T*>(user_i))[0];
+    if (feature > 0) {
+      (static_cast<T*>(out_o))[0] = (static_cast<T*>(user_g))[0];
+    } else {
+      T elu = std::exp(feature) - 1;
+      (static_cast<T*>(out_o))[0] = (static_cast<T*>(user_g))[0] * (elu + 1);
+    }
+  }
+};
+
+template <typename Device, typename T>
+class MklTanhOp : public MklReluOpBase<Device, T, eltwise_tanh> {
+ public:
+  ~MklTanhOp() {}
+
+  explicit MklTanhOp(OpKernelConstruction* context)
+      : MklReluOpBase<Device, T, eltwise_tanh>(context) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t src_index = 0;  // index of src input tensor
+    const size_t dst_index = 0;  // index of dst output tensor
+    const Tensor& src_tensor = MklGetInput(context, src_index);
+    MklDnnShape dnn_shape_src;
+    GetMklShape(context, src_index, &dnn_shape_src);
+
+    Tensor* dst_tensor = nullptr;
+    void* user_i =
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+    MklDnnShape dnn_shape_dst;
+    dnn_shape_dst.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
+                              src_tensor.shape(), dnn_shape_dst);
+    void* out_o = static_cast<void*>(dst_tensor->flat<T>().data());
+    // tanh(x) = (e^x - e^(-x))/ (e^x + e^(-x))
+    T feature = (static_cast<T*>(user_i))[0];
+    T e1 = std::exp(feature);
+    T e2 = std::exp(-feature);
+    (static_cast<T*>(out_o))[0] = (e1 - e2) / (e1 + e2);
+    return;
+  }
+};
+
+template <typename Device, typename T>
+class MklTanhGradOp : public MklReluGradOpBase<Device, T, eltwise_tanh> {
+ public:
+  ~MklTanhGradOp() {}
+
+  explicit MklTanhGradOp(OpKernelConstruction* context)
+      : MklReluGradOpBase<Device, T, eltwise_tanh>(context) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t diff_dst_index = 0;  // index of diff_dst input tensor
+    const size_t src_index = 1;       // index of src input tensor
+    const size_t diff_src_index = 0;  // index of diff_src output tensor
+    const Tensor& src_tensor = MklGetInput(context, src_index);
+    const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
+    Tensor* diff_src_tensor = nullptr;
+
+    MklDnnShape dnn_shape_diff_dst;
+    GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+
+    MklDnnShape dnn_shape_diff_src;
+    dnn_shape_diff_src.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+                              diff_dst_tensor.shape(), dnn_shape_diff_src);
+    void* out_o = static_cast<void*>(diff_src_tensor->flat<T>().data());
+    void* user_i =
+        static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+    // gradient of tanh(x) = 1 - tanh(x)^2
+    T feature = (static_cast<T*>(user_i))[0];
+    T e1 = std::exp(feature);
+    T e2 = std::exp(-feature);
+    T tanh = (e1 - e2) / (e1 + e2);
+    void* user_g =
+        static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
+    (static_cast<T*>(out_o))[0] =
+        (static_cast<T*>(user_g))[0] * (1 - tanh * tanh);
+  }
+};
+
+#endif
+
+// register dnn kernels for supported operations and supported types
 #define REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES(type)             \
   REGISTER_KERNEL_BUILDER(Name("_MklRelu")                          \
                               .Device(DEVICE_CPU)                   \
@@ -367,6 +849,37 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
                           MklReluGradOp<CPUDevice, type>);
 TF_CALL_float(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES);
 
+#ifndef INTEL_MKL_ML
+
+// register dnn kernels for supported operations and supported types
+#define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type)              \
+  REGISTER_KERNEL_BUILDER(Name("_MklElu")                           \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklEluOp<CPUDevice, type>);               \
+  REGISTER_KERNEL_BUILDER(Name("_MklEluGrad")                       \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklEluGradOp<CPUDevice, type>);
+TF_CALL_float(REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES);
+
+#define REGISTER_TANH_MKL_SUPPORTED_KERNELS_TYPES(type)             \
+  REGISTER_KERNEL_BUILDER(Name("_MklTanh")                          \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklTanhOp<CPUDevice, type>);              \
+  REGISTER_KERNEL_BUILDER(Name("_MklTanhGrad")                      \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklTanhGradOp<CPUDevice, type>);
+TF_CALL_float(REGISTER_TANH_MKL_SUPPORTED_KERNELS_TYPES);
+
+#endif
+
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc
index 5e985824750befb702f8fa7a59d699f853f40267..5dbc4a2709e2bc379ae3b9aa68ed14f3d6893e7c 100644
--- a/tensorflow/core/kernels/mkl_reshape_op.cc
+++ b/tensorflow/core/kernels/mkl_reshape_op.cc
@@ -28,6 +28,11 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
+#ifndef INTEL_MKL_ML
+#include "mkldnn.hpp"
+using mkldnn::stream;
+#endif
+
 namespace tensorflow {
 using CPUDevice = Eigen::ThreadPoolDevice;
 template <typename Device, typename T>
@@ -35,6 +40,7 @@ class MklReshapeOp : public OpKernel {
  public:
   explicit MklReshapeOp(OpKernelConstruction* context) : OpKernel(context) {}
 
+#ifdef INTEL_MKL_ML
   void Compute(OpKernelContext* context) override {
     const Tensor& input = MklGetInput(context, 0);
     const Tensor& sizes = MklGetInput(context, 1);
@@ -129,7 +135,189 @@ class MklReshapeOp : public OpKernel {
     }
   }
 
+#else
+
+ private:
+  // When the input tensor is in MKL layout and we are reshaping the tensor to a
+  // different shape than its actual shape, then we use MKLDNN reorder primitive
+  // to put tensor back in Tensorflow layout. But we can skip this reordering
+  // some times. This function checks for all such cases.
+  bool SkipReorder(const MklDnnShape& mkl_shape_input,
+                   const TensorShape& reshape_to) {
+    CHECK_EQ(mkl_shape_input.IsMklTensor(), true);
+    bool ret = false;
+
+    // If Tensorflow's data format and the underlying format maintained by
+    // MKLDNN are equivalent (both are NHWC or both are NCHW), then we can
+    // safely return true.
+    auto input_mkl_md = mkl_shape_input.GetMklLayout();
+    if (mkl_shape_input.GetTfDataFormat() == input_mkl_md.data.format) {
+      ret = true;
+    }
+
+    return ret;
+  }
+
+ public:
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input_tensor = MklGetInput(context, 0);
+    const Tensor& sizes = MklGetInput(context, 1);
+
+    MklDnnShape mkl_shape_input;
+    GetMklShape(context, kInputSlotIdx, &mkl_shape_input);
+    bool input_in_mkl_format = mkl_shape_input.IsMklTensor();
+    const int64 nelems = input_in_mkl_format
+                             ? mkl_shape_input.GetTfShape().num_elements()
+                             : input_tensor.NumElements();
+
+    // Preliminary validation of sizes.
+    OP_REQUIRES(context, IsLegacyVector(sizes.shape()),
+                errors::InvalidArgument("sizes input must be 1-D, not shape ",
+                                        sizes.shape().DebugString()));
+
+    // Compute the output shape.  Determine product of specified
+    // dimensions, and find the index of the unspecified one.
+    TensorShape shape;
+    int64 product = 1;
+    int unknown_index = -1;
+    switch (sizes.dtype()) {
+      case DT_INT32:
+        OP_REQUIRES_OK(context, ValidateSizes<int32>(sizes, &product,
+                                                     &unknown_index, &shape));
+        break;
+      case DT_INT64:
+        OP_REQUIRES_OK(context, ValidateSizes<int64>(sizes, &product,
+                                                     &unknown_index, &shape));
+        break;
+      default:
+        context->CtxFailure(errors::InvalidArgument(
+            "desired shape must be a DT_INT32 or DT_INT64 vector, not a ",
+            DataTypeString(sizes.dtype())));
+        return;
+    }
+    if (unknown_index != -1) {
+      OP_REQUIRES(
+          context, product > 0,
+          errors::InvalidArgument("Reshape cannot infer the missing input size "
+                                  "for an empty tensor unless all specified "
+                                  "input sizes are non-zero"));
+      const int64 missing = nelems / product;
+      OP_REQUIRES(
+          context, product * missing == nelems,
+          errors::InvalidArgument(
+              "Input to reshape is a tensor with ", nelems,
+              " values, but the requested shape requires a multiple of ",
+              product));
+      shape.set_dim(unknown_index, missing);
+    }
+    OP_REQUIRES(
+        context, shape.num_elements() == nelems,
+        errors::InvalidArgument("Input to reshape is a tensor with ", nelems,
+                                " values, but the requested shape has ",
+                                shape.num_elements()));
+
+    if (input_in_mkl_format) {
+      TensorShape& shape_to = shape;
+      TensorShape shape_from = mkl_shape_input.GetTfShape();
+      if (shape_from == shape_to) {
+        CopyMklTensorInToOut(context, kInputSlotIdx, kOutputSlotIdx);
+        return;
+      } else {
+        try {
+          auto cpu_engine = engine(engine::cpu, 0);
+          MklDnnData<T> dnn_data_input(&cpu_engine);
+          // Reshape is just a logical view change operation for a tensor.
+          // It does not change underlying layout. But MKLDNN may maintain
+          // tensor data in different layout than that specified by Tensorflow.
+          // If MKLDNN maintains input tensor in different layout than that
+          // specified by Tensorflow, we will need to reorder tensor and then
+          // put it in the shape expected by Tensorflow. But if MKLDNN has
+          // maintained input tensor in the same layout as it is expected by
+          // Tensorflow, we don't need to reorder tensor contents, we just
+          // need to update MklDnnShape object associated with the input
+          // tensor to reflect the shape change expected by reshape.
+          if (!SkipReorder(mkl_shape_input, shape_to)) {
+            // If dimensions that are being expanded or collapsed are not
+            // maintained contiguously by MKLDNN, then we use reorder.
+
+            // Get Mkl layout of input tensor.
+            auto input_mkl_md = mkl_shape_input.GetMklLayout();
+            // Set input Mkl layout as the user layout.
+            dnn_data_input.SetUsrMem(input_mkl_md, &input_tensor);
+            // Get expected Tensorflow layout of input tensor.
+            auto output_tf_md = mkl_shape_input.GetTfLayout();
+            auto output_tf_pd =
+                memory::primitive_desc(output_tf_md, cpu_engine);
+
+            Tensor* output_tensor = nullptr;
+            MklShape mkl_shape_output;
+            mkl_shape_output.SetMklTensor(false);
+            // We allocate output tensor in the shape expected by Reshape.
+            AllocateOutputSetMklShape(context, kOutputSlotIdx, &output_tensor,
+                                      shape_to, mkl_shape_output);
+
+            // Insert reorder between Mkl layout and TensorFlow layout if
+            // needed. If reorder is not needed but reshape is needed (since
+            // shape_from != shape_to), then we just copy input tensor to
+            // output tensor with target shape (we cannot forward Mkl layout
+            // in such case because shape has changed.)
+            std::vector<primitive> net;
+            if (dnn_data_input.CheckReorderToOpMem(output_tf_pd, output_tensor,
+                                                   &net)) {
+              stream(stream::kind::eager).submit(net).wait();
+            } else {
+              output_tensor->CopyFrom(input_tensor, shape_to);
+            }
+            return;
+          } else {
+            // If dimensions that are being expanded or collapsed are
+            // maintained contiguously by MKLDNN, then we skip reorder, just
+            // update MklDnnShape object for the tensorflow tensor, and forward
+            // Tensorflow tensor as it is to the output.
+            auto output_dims = TFShapeToMklDnnDims(shape_to);
+            auto output_strides = CalculateTFStrides(output_dims);
+            auto output_tf_md = MklDnnData<T>::CreateBlockedMemDesc(
+                output_dims, output_strides);
+            auto output_tf_pd =
+                memory::primitive_desc(output_tf_md, cpu_engine);
+
+            // Set MklDnnShape
+            MklDnnShape mkl_shape_output;
+            mkl_shape_output.SetMklTensor(true);
+            mkl_shape_output.SetMklLayout(&output_tf_pd);
+            mkl_shape_output.SetElemType(MklDnnType<T>());
+            mkl_shape_output.SetTfLayout(output_dims.size(), output_dims,
+                                         memory::format::blocked);
+
+            // We now simply forward input Mkl tensor to output and change its
+            // output MklDnnShape object.
+            ForwardMklTensorInToOutWithMklShape(
+                context, kInputSlotIdx, kOutputSlotIdx, mkl_shape_output);
+            return;
+          }
+        } catch (mkldnn::error& e) {
+          string error_msg = "Status: " + std::to_string(e.status) +
+                             ", message: " + string(e.message) + ", in file " +
+                             string(__FILE__) + ":" + std::to_string(__LINE__);
+          OP_REQUIRES_OK(
+              context,
+              errors::Aborted("Operation received an exception:", error_msg));
+        }
+      }
+    } else {
+      // If input tensor is not in Mkl format, then just copy Tensorflow tensor
+      // to output with specified shape.
+      CopyTfTensorInToOutWithShape(context, kInputSlotIdx, kOutputSlotIdx,
+                                   shape);
+    }
+  }
+
+#endif  // INTEL_MKL_ML
+
  private:
+  const int kInputSlotIdx = 0;
+  const int kOutputSlotIdx = 0;
+
   template <typename Tshape>
   Status ValidateSizes(const Tensor& sizes, int64* product, int* unknown_index,
                        TensorShape* shape) {
diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..aceef1e234eff3660b33f5a091a2cd10e25ea2f9
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_softmax_op.cc
@@ -0,0 +1,160 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/nn_ops.cc.
+#ifdef INTEL_MKL
+#ifndef INTEL_MKL_ML
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+#include "mkldnn.h"
+#include "mkldnn_types.h"
+#include "tensorflow/core/platform/default/logging.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+#include "mkldnn.hpp"
+using mkldnn::prop_kind;
+using mkldnn::softmax_forward;
+using mkldnn::stream;
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, typename T>
+class MklSoftmaxOp : public OpKernel {
+ public:
+  ~MklSoftmaxOp() {}
+
+  explicit MklSoftmaxOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+
+      // src_tensor now points to the 0-th input of global data struct "context"
+      size_t src_idx = 0;
+      const Tensor& src_tensor = MklGetInput(context, src_idx);
+
+      // Add: get MklShape
+      MklDnnShape src_mkl_shape;
+      GetMklShape(context, src_idx, &src_mkl_shape);
+
+      // src_dims is the dimenstion of src_tensor
+      // dim of the dst will also be same as src_dims
+      auto src_tf_shape = src_mkl_shape.IsMklTensor()
+                              ? src_mkl_shape.GetTfShape()
+                              : src_tensor.shape();
+      auto src_dims = TFShapeToMklDnnDims(src_tf_shape);
+      auto output_dims = src_dims;
+
+      // Create softmax memory for src, dst: both are defined in mkl_util.h,
+      // they are wrapper
+      MklDnnData<T> src(&cpu_engine);
+      MklDnnData<T> dst(&cpu_engine);
+
+      // If input is in MKL layout, then simply grab input layout; otherwise,
+      // construct input Tf layout. For TF layout, although input shape
+      // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's
+      // layout
+      auto src_md =
+          src_mkl_shape.IsMklTensor()
+              ? src_mkl_shape.GetMklLayout()
+              : memory::desc(src_dims, MklDnnType<T>(), memory::format::nc);
+
+      // src: setting memory descriptor and op memory descriptor
+      // Basically following two functions maps the TF "src_tensor" to mkl
+      // tensor object "src"
+      // following functions are in mkl_util.h
+      // data format is "nc" for src and dst; since the src and dst buffer is
+      // always in 2D shape
+      src.SetUsrMem(src_md, &src_tensor);
+      src.SetOpMemDesc(src_dims, memory::format::nc);
+
+      // creating a memory descriptor
+      int axis = 1;  // axis to which softmax will be applied
+      auto softmax_fwd_desc = softmax_forward::desc(prop_kind::forward_scoring,
+                                                    src.GetOpMemDesc(), axis);
+      auto softmax_fwd_pd =
+          softmax_forward::primitive_desc(softmax_fwd_desc, cpu_engine);
+
+      // add: output
+      Tensor* output_tensor = nullptr;
+      MklDnnShape output_mkl_shape;
+      TensorShape output_tf_shape;  // shape of output TF tensor.
+      // Softmax MklDnn output layout is same as input layout.
+      auto dst_pd = src.GetUsrMemPrimDesc();
+
+      // if input is MKL shape, ouput is also MKL shape.
+      // if input is TF shape, output is also TF shape
+      if (src_mkl_shape.IsMklTensor()) {
+        output_mkl_shape.SetMklTensor(true);
+        output_mkl_shape.SetMklLayout(&dst_pd);
+        output_mkl_shape.SetElemType(MklDnnType<T>());
+        output_mkl_shape.SetTfLayout(output_dims.size(), output_dims,
+                                     memory::format::nc);
+        output_tf_shape.AddDim((dst_pd.get_size() / sizeof(T)));
+      } else {  // then output is also TF shape
+        output_mkl_shape.SetMklTensor(false);
+        output_tf_shape = MklDnnDimsToTFShape(output_dims);
+      }
+      // Allocate output shape (MKL or TF based on the above)
+      AllocateOutputSetMklShape(context, 0, &output_tensor, output_tf_shape,
+                                output_mkl_shape);
+
+      // Output_dims and input_dims are same
+      dst.SetUsrMem(src_md, output_tensor);
+
+      // finally creating the "softmax op" using the primitive descriptor, src
+      // and dst
+      auto softmax_fwd =
+          softmax_forward(softmax_fwd_pd, src.GetOpMem(), dst.GetOpMem());
+
+      // execute net (pushing to the stream)
+      // following 3 are common for all mkl dnn ops
+      std::vector<primitive> net;
+      net.push_back(softmax_fwd);
+      stream(stream::kind::eager).submit(net).wait();
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
+    }
+  }
+};
+
+/* Register DNN kernels for supported operations and supported types - right now
+ * it is only Softmax and f32 */
+#define REGISTER_SOFTMAX_MKL_SUPPORTED_KERNELS_TYPES(type)          \
+  REGISTER_KERNEL_BUILDER(Name("_MklSoftmax")                       \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklSoftmaxOp<CPUDevice, type>);
+TF_CALL_float(REGISTER_SOFTMAX_MKL_SUPPORTED_KERNELS_TYPES);
+
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL_ML
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.h b/tensorflow/core/kernels/mkl_tfconv_op.h
index c4d5a45d3caff0f59b1ecc61f95dd26fe16fd06b..5fafa14b5dbf49d0c9902af4e38653b48d1f179b 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.h
+++ b/tensorflow/core/kernels/mkl_tfconv_op.h
@@ -35,7 +35,7 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 using mkldnn::stream;
 #endif
 
@@ -61,7 +61,7 @@ class MklToTfOp : public OpKernel {
     VLOG(1) << "MKLToTFConversion complete successfully.";
   }
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
   static void ConvertMklToTf(OpKernel* op_kernel, OpKernelContext* context,
                              string data_format_str, DataType op_data_type,
                              bool has_avx512f, uint input_number) {
diff --git a/tensorflow/core/kernels/multinomial_op.cc b/tensorflow/core/kernels/multinomial_op.cc
index 8c0109f5c87ce5f73621a1683471bbcb8a936ea4..d086abb24760f1ab946605fd422a4fd0d5fc866d 100644
--- a/tensorflow/core/kernels/multinomial_op.cc
+++ b/tensorflow/core/kernels/multinomial_op.cc
@@ -40,7 +40,7 @@ typedef Eigen::GpuDevice GPUDevice;
 
 namespace functor {
 
-template <typename Device, typename T>
+template <typename Device, typename T, typename OutputType>
 struct MultinomialFunctor {
   void operator()(OpKernelContext* ctx, const Device& d,
                   typename TTypes<T>::ConstMatrix logits,
@@ -49,11 +49,11 @@ struct MultinomialFunctor {
                   typename TTypes<float>::Flat scratch, int batch_size,
                   int num_classes, int num_samples,
                   const random::PhiloxRandom& gen,
-                  typename TTypes<int64>::Matrix output);
+                  typename TTypes<OutputType>::Matrix output);
 };
 
-template <typename T>
-struct MultinomialFunctor<CPUDevice, T> {
+template <typename T, typename OutputType>
+struct MultinomialFunctor<CPUDevice, T, OutputType> {
   void operator()(OpKernelContext* ctx, const CPUDevice& d,
                   typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<float>::Flat /* noises */,
@@ -61,7 +61,7 @@ struct MultinomialFunctor<CPUDevice, T> {
                   typename TTypes<float>::Flat /* scratch */, int batch_size,
                   int num_classes, int num_samples,
                   const random::PhiloxRandom& gen,
-                  typename TTypes<int64>::Matrix output) {
+                  typename TTypes<OutputType>::Matrix output) {
     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
 
     // The implementation only parallelizes by batch.
@@ -128,7 +128,7 @@ struct MultinomialFunctor<CPUDevice, T> {
 }  // namespace functor
 
 // Samples from a multinomial distribution.
-template <typename Device, typename T>
+template <typename Device, typename T, typename OutputType>
 class MultinomialOp : public OpKernel {
  public:
   explicit MultinomialOp(OpKernelConstruction* context) : OpKernel(context) {
@@ -195,11 +195,11 @@ class MultinomialOp : public OpKernel {
       if (std::is_same<Device, CPUDevice>::value) num_samples_ceil_4 *= 2;
       auto rng =
           generator_.ReserveRandomOutputs(batch_size * num_samples_ceil_4, 256);
-      functor::MultinomialFunctor<Device, T>()(
+      functor::MultinomialFunctor<Device, T, OutputType>()(
           ctx, ctx->eigen_device<Device>(), logits_t.matrix<T>(),
           noises.flat<float>(), scores.flat<float>(), scratch.flat<float>(),
           batch_size, num_classes, num_samples, rng,
-          samples_t->matrix<int64>());
+          samples_t->matrix<OutputType>());
     }
   }
 
@@ -209,10 +209,17 @@ class MultinomialOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(MultinomialOp);
 };
 
-#define REGISTER(TYPE)                                                  \
-  REGISTER_KERNEL_BUILDER(                                              \
-      Name("Multinomial").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
-      MultinomialOp<CPUDevice, TYPE>);
+#define REGISTER(TYPE)                                                   \
+  REGISTER_KERNEL_BUILDER(Name("Multinomial")                            \
+                              .Device(DEVICE_CPU)                        \
+                              .TypeConstraint<TYPE>("T")                 \
+                              .TypeConstraint("output_dtype", DT_INT32), \
+                          MultinomialOp<CPUDevice, TYPE, int32>);        \
+  REGISTER_KERNEL_BUILDER(Name("Multinomial")                            \
+                              .Device(DEVICE_CPU)                        \
+                              .TypeConstraint<TYPE>("T")                 \
+                              .TypeConstraint("output_dtype", DT_INT64), \
+                          MultinomialOp<CPUDevice, TYPE, int64>);
 
 TF_CALL_half(REGISTER);
 TF_CALL_float(REGISTER);
@@ -220,12 +227,20 @@ TF_CALL_double(REGISTER);
 #undef REGISTER
 
 #if GOOGLE_CUDA
-#define REGISTER(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(Name("Multinomial")             \
-                              .Device(DEVICE_GPU)         \
-                              .HostMemory("num_samples")  \
-                              .TypeConstraint<TYPE>("T"), \
-                          MultinomialOp<GPUDevice, TYPE>)
+#define REGISTER(TYPE)                                                   \
+  REGISTER_KERNEL_BUILDER(Name("Multinomial")                            \
+                              .Device(DEVICE_GPU)                        \
+                              .HostMemory("num_samples")                 \
+                              .TypeConstraint<TYPE>("T")                 \
+                              .TypeConstraint("output_dtype", DT_INT32), \
+                          MultinomialOp<GPUDevice, TYPE, int32>)         \
+  REGISTER_KERNEL_BUILDER(Name("Multinomial")                            \
+                              .Device(DEVICE_GPU)                        \
+                              .HostMemory("num_samples")                 \
+                              .TypeConstraint<TYPE>("T")                 \
+                              .TypeConstraint("output_dtype", DT_INT64), \
+                          MultinomialOp<GPUDevice, TYPE, int64>)
+
 TF_CALL_half(REGISTER);
 TF_CALL_float(REGISTER);
 TF_CALL_double(REGISTER);
diff --git a/tensorflow/core/kernels/multinomial_op.h b/tensorflow/core/kernels/multinomial_op.h
index af5e81f219c802857fd6d5eb27e4962cc890a058..6e41060aa414b0611dd7dca31374444f8dd364ec 100644
--- a/tensorflow/core/kernels/multinomial_op.h
+++ b/tensorflow/core/kernels/multinomial_op.h
@@ -21,7 +21,7 @@ namespace tensorflow {
 namespace functor {
 
 // Generic helper functor for the Multinomial Op.
-template <typename Device, typename T>
+template <typename Device, typename T, typename OutputType>
 struct MultinomialFunctor;
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/multinomial_op_gpu.cu.cc b/tensorflow/core/kernels/multinomial_op_gpu.cu.cc
index 19b4f3ca559f56d93fae203df77f0ef35718db1b..5cc5877cceb19320023423d35a352c5ba3db13e2 100644
--- a/tensorflow/core/kernels/multinomial_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/multinomial_op_gpu.cu.cc
@@ -37,20 +37,22 @@ using GPUDevice = Eigen::GpuDevice;
 
 // Kernel for Multinomial op.  Data is interpreted to have the following shapes:
 //   scores: [B, S, C];  maxima: [B, S];  output: [B, S].
+template <typename OutputType>
 __global__ void MultinomialKernel(int32 nthreads, const int32 num_classes,
                                   const int32 num_samples, const float* scores,
-                                  const float* maxima, int64* output) {
+                                  const float* maxima, OutputType* output) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     const int maxima_idx = index / num_classes;
     if (ldg(maxima + maxima_idx) == ldg(scores + index)) {
-      CudaAtomicMax(reinterpret_cast<uint64*>(output + maxima_idx),
-                    static_cast<uint64>(index % num_classes));
+      using UnsignedOutputType = typename std::make_unsigned<OutputType>::type;
+      CudaAtomicMax(reinterpret_cast<UnsignedOutputType*>(output + maxima_idx),
+                    static_cast<UnsignedOutputType>(index % num_classes));
     }
   }
 }
 
-template <typename T>
-struct MultinomialFunctor<GPUDevice, T> {
+template <typename T, typename OutputType>
+struct MultinomialFunctor<GPUDevice, T, OutputType> {
   void operator()(OpKernelContext* ctx, const GPUDevice& d,
                   typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<float>::Flat noises,
@@ -58,7 +60,7 @@ struct MultinomialFunctor<GPUDevice, T> {
                   typename TTypes<float>::Flat maxima, int batch_size,
                   int num_classes, int num_samples,
                   const random::PhiloxRandom& gen,
-                  typename TTypes<int64>::Matrix output) {
+                  typename TTypes<OutputType>::Matrix output) {
     // Uniform, [0, 1).
     typedef random::UniformDistribution<random::PhiloxRandom, float> Dist;
     functor::FillPhiloxRandom<GPUDevice, Dist>()(ctx, d, gen, noises.data(),
@@ -111,11 +113,17 @@ struct MultinomialFunctor<GPUDevice, T> {
 };
 
 // Explicit instantiation of the GPU functors.
-template struct MultinomialFunctor<GPUDevice, Eigen::half>;
-template struct MultinomialFunctor<GPUDevice, float>;
-template struct MultinomialFunctor<GPUDevice, double>;
-template struct MultinomialFunctor<GPUDevice, int32>;
-template struct MultinomialFunctor<GPUDevice, int64>;
+template struct MultinomialFunctor<GPUDevice, Eigen::half, int32>;
+template struct MultinomialFunctor<GPUDevice, float, int32>;
+template struct MultinomialFunctor<GPUDevice, double, int32>;
+template struct MultinomialFunctor<GPUDevice, int32, int32>;
+template struct MultinomialFunctor<GPUDevice, int64, int32>;
+
+template struct MultinomialFunctor<GPUDevice, Eigen::half, int64>;
+template struct MultinomialFunctor<GPUDevice, float, int64>;
+template struct MultinomialFunctor<GPUDevice, double, int64>;
+template struct MultinomialFunctor<GPUDevice, int32, int64>;
+template struct MultinomialFunctor<GPUDevice, int64, int64>;
 
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/neon/BUILD b/tensorflow/core/kernels/neon/BUILD
index 536b2bdc03c5dc91e8e3e25dd9fbba82cd29fd5b..c3d24e50effb3fe5184e264064393a7f339105f0 100644
--- a/tensorflow/core/kernels/neon/BUILD
+++ b/tensorflow/core/kernels/neon/BUILD
@@ -39,6 +39,6 @@ tf_kernel_library(
         "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core/kernels:bounds_check",
         "//tensorflow/core/kernels:ops_util",
-        "@gemmlowp//:gemmlowp",
+        "@gemmlowp",
     ],
 )
diff --git a/tensorflow/core/kernels/neon/depthwiseconv_float.h b/tensorflow/core/kernels/neon/depthwiseconv_float.h
index acd58a644f3b0b0b578778f8c017efff30771efa..11f5be7c03dcd3c03014a40b4901ef9fef1b892b 100644
--- a/tensorflow/core/kernels/neon/depthwiseconv_float.h
+++ b/tensorflow/core/kernels/neon/depthwiseconv_float.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NEON_DEPTHWISECONV_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NEON_DEPTHWISECONV_H_
+#ifndef TENSORFLOW_CORE_KERNELS_NEON_DEPTHWISECONV_H_
+#define TENSORFLOW_CORE_KERNELS_NEON_DEPTHWISECONV_H_
 
 #include "public/gemmlowp.h"
 #include "tensorflow/core/kernels/neon/types.h"
@@ -722,4 +722,4 @@ void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
 }  // end namespace neon
 }  // end namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NEON_DEPTHWISECONV_H_
+#endif  // TENSORFLOW_CORE_KERNELS_NEON_DEPTHWISECONV_H_
diff --git a/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc b/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc
index 17f2af550f248a6924bb3d1e7546eca84d4c1e51..0e820bbb6208ae9c13ac2fb33f67590b9e66ba7e 100644
--- a/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc
@@ -71,10 +71,10 @@ class NeonDepthwiseConv2dNativeOp : public BinaryOp<float> {
                                         filter.shape().DebugString()));
 
     const int32 in_depth = input.dim_size(3);
-    OP_REQUIRES(
-        context, in_depth == filter.dim_size(2),
-        errors::InvalidArgument("input and filter must have the same depth: ",
-                                in_depth, " vs ", filter.dim_size(2)));
+    OP_REQUIRES(context, in_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", in_depth,
+                    " vs ", filter.dim_size(2)));
     const int32 batch = input.dim_size(0);
     const int32 input_rows = input.dim_size(1);
     const int32 input_cols = input.dim_size(2);
diff --git a/tensorflow/core/kernels/neon/types.h b/tensorflow/core/kernels/neon/types.h
index 4ece22f015954a1867dd2a4a5365cc93c1eaee5d..05ff1bcc6cdbe7bf26766fc0b11909e3da8de71f 100644
--- a/tensorflow/core/kernels/neon/types.h
+++ b/tensorflow/core/kernels/neon/types.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NEON_TYPES_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NEON_TYPES_H_
+#ifndef TENSORFLOW_CORE_KERNELS_NEON_TYPES_H_
+#define TENSORFLOW_CORE_KERNELS_NEON_TYPES_H_
 
 #include "tensorflow/core/platform/logging.h"
 
@@ -70,4 +70,4 @@ inline int RequiredBufferSizeForDims(const Dims<4>& dims) {
 }  // end namespace neon
 }  // end namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NEON_TYPES_H_
+#endif  // TENSORFLOW_CORE_KERNELS_NEON_TYPES_H_
diff --git a/tensorflow/core/kernels/nn_ops_test.cc b/tensorflow/core/kernels/nn_ops_test.cc
index 0db7c63b8b6a25f1d495dd937d49ec9d0615ab0a..a841291ddd7d4f64b0ab2b611c59307f4d11150f 100644
--- a/tensorflow/core/kernels/nn_ops_test.cc
+++ b/tensorflow/core/kernels/nn_ops_test.cc
@@ -653,6 +653,8 @@ BM_ConvFloatDepthwiseFwd(32, 7, 7, 1024, 1, 1024, 3, 3, 1, SAME, conv6);
 // Benchmarks with different stride and padding options.
 BM_ConvFloatDepthwiseFwd(32, 112, 112, 3, 8, 24, 3, 3, 2, SAME, conv7);
 BM_ConvFloatDepthwiseFwd(32, 112, 112, 3, 8, 24, 3, 3, 2, VALID, conv8);
+BM_ConvFloatDepthwiseFwd(1, 100, 100, 72, 1, 72, 3, 3, 1, SAME, conv9);
+BM_ConvFloatDepthwiseFwd(1, 100, 100, 72, 1, 72, 5, 5, 1, SAME, conv10);
 
 #define BM_ConvFloatDepthwiseBk(BS, R, C, ID, DM, OD, KR, KC, STR, PAD, LABEL) \
   static void BM_ConvFloatDepthwiseBkInCPU1_##LABEL(int iters) {               \
diff --git a/tensorflow/core/kernels/non_max_suppression_op.cc b/tensorflow/core/kernels/non_max_suppression_op.cc
index 64bdef0008f20f3947e990e30e2af7b93a69d50c..5d28b87e6bb8c0f51653fc005a2f62734a44d321 100644
--- a/tensorflow/core/kernels/non_max_suppression_op.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op.cc
@@ -92,13 +92,11 @@ static inline bool IOUGreaterThanThreshold(
   return iou > iou_threshold;
 }
 
-void DoNonMaxSuppressionOp(OpKernelContext* context,
-                           const Tensor& boxes,
-                           const Tensor& scores,
-                           const Tensor& max_output_size,
+void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& boxes,
+                           const Tensor& scores, const Tensor& max_output_size,
                            const float iou_threshold) {
   OP_REQUIRES(context, iou_threshold >= 0 && iou_threshold <= 1,
-      errors::InvalidArgument("iou_threshold must be in [0, 1]"));
+              errors::InvalidArgument("iou_threshold must be in [0, 1]"));
 
   int num_boxes = 0;
   ParseAndCheckBoxSizes(context, boxes, scores, &num_boxes);
@@ -106,10 +104,8 @@ void DoNonMaxSuppressionOp(OpKernelContext* context,
     return;
   }
 
-  const int output_size =
-      std::min(max_output_size.scalar<int>()(), num_boxes);
-  typename TTypes<float, 2>::ConstTensor boxes_data =
-      boxes.tensor<float, 2>();
+  const int output_size = std::min(max_output_size.scalar<int>()(), num_boxes);
+  typename TTypes<float, 2>::ConstTensor boxes_data = boxes.tensor<float, 2>();
 
   std::vector<float> scores_data(num_boxes);
   std::copy_n(scores.flat<float>().data(), num_boxes, scores_data.begin());
@@ -181,8 +177,7 @@ template <typename Device>
 class NonMaxSuppressionV2Op : public OpKernel {
  public:
   explicit NonMaxSuppressionV2Op(OpKernelConstruction* context)
-      : OpKernel(context) {
-  }
+      : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
     // boxes: [num_boxes, 4]
@@ -197,10 +192,9 @@ class NonMaxSuppressionV2Op : public OpKernel {
                                 max_output_size.shape().DebugString()));
     // iou_threshold: scalar
     const Tensor& iou_threshold = context->input(3);
-    OP_REQUIRES(
-        context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
-        errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
-                                iou_threshold.shape().DebugString()));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
+                errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
+                                        iou_threshold.shape().DebugString()));
 
     const float iou_threshold_val = iou_threshold.scalar<float>()();
 
diff --git a/tensorflow/core/kernels/non_max_suppression_op_test.cc b/tensorflow/core/kernels/non_max_suppression_op_test.cc
index fdbcf05b89ddf122eee9e0133651355edbb1ba5a..67d9217b9502a30f5727b6a91fbf36da872ab972 100644
--- a/tensorflow/core/kernels/non_max_suppression_op_test.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op_test.cc
@@ -43,9 +43,10 @@ class NonMaxSuppressionOpTest : public OpsTestBase {
 
 TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClusters) {
   MakeOp(.5);
-  AddInputFromArray<float>(TensorShape({6, 4}),
-                           {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
-                            0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,  1, 101});
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
   AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
   AddInputFromArray<int>(TensorShape({}), {3});
   TF_ASSERT_OK(RunOpKernel());
@@ -58,7 +59,7 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClusters) {
 TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClustersFlippedCoordinates) {
   MakeOp(.5);
   AddInputFromArray<float>(TensorShape({6, 4}),
-                           {1, 1,  0, 0,  0, 0.1f,  1, 1.1f,  0, .9f,  1, -0.1f,
+                           {1, 1,  0, 0,  0, 0.1f,  1, 1.1f,  0, .9f, 1, -0.1f,
                             0, 10, 1, 11, 1, 10.1f, 0, 11.1f, 1, 101, 0, 100});
   AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
   AddInputFromArray<int>(TensorShape({}), {3});
@@ -71,9 +72,10 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromThreeClustersFlippedCoordinates) {
 
 TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostTwoBoxesFromThreeClusters) {
   MakeOp(.5);
-  AddInputFromArray<float>(TensorShape({6, 4}),
-                           {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
-                            0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,  1, 101});
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
   AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
   AddInputFromArray<int>(TensorShape({}), {2});
   TF_ASSERT_OK(RunOpKernel());
@@ -85,9 +87,10 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostTwoBoxesFromThreeClusters) {
 
 TEST_F(NonMaxSuppressionOpTest, TestSelectAtMostThirtyBoxesFromThreeClusters) {
   MakeOp(.5);
-  AddInputFromArray<float>(TensorShape({6, 4}),
-                           {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
-                            0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,  1, 101});
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
   AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
   AddInputFromArray<int>(TensorShape({}), {30});
   TF_ASSERT_OK(RunOpKernel());
@@ -134,9 +137,10 @@ TEST_F(NonMaxSuppressionOpTest, TestSelectFromTenIdenticalBoxes) {
 
 TEST_F(NonMaxSuppressionOpTest, TestInconsistentBoxAndScoreShapes) {
   MakeOp(.5);
-  AddInputFromArray<float>(TensorShape({6, 4}),
-                           {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
-                            0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,  1, 101});
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
   AddInputFromArray<float>(TensorShape({5}), {.9f, .75f, .6f, .95f, .5f});
   AddInputFromArray<int>(TensorShape({}), {30});
   Status s = RunOpKernel();
diff --git a/tensorflow/core/kernels/nth_element_op.cc b/tensorflow/core/kernels/nth_element_op.cc
index da825e408c24617862e8613c6b63ed1a51944041..7f12eb953a31ec667a5f3cee379bd3d1970b3a56 100644
--- a/tensorflow/core/kernels/nth_element_op.cc
+++ b/tensorflow/core/kernels/nth_element_op.cc
@@ -16,15 +16,15 @@ limitations under the License.
 // See docs in ../ops/nn_ops.cc.
 #include "tensorflow/core/kernels/nth_element_op.h"
 
+#include <algorithm>
+#include <iostream>
+#include <vector>
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/work_sharder.h"
-#include <vector>
-#include <algorithm>
-#include <iostream>
 
 namespace tensorflow {
 
@@ -54,8 +54,9 @@ class NthElementOp : public OpKernel {
                 errors::InvalidArgument("Input must be >= 1-D, got shape ",
                                         input_in.shape().DebugString()));
     // The last dimension of input tensor must be greater than N.
-    OP_REQUIRES(context, input_in.dim_size(num_dims-1) > n,
-                errors::InvalidArgument("Input must have at least n+1 columns"));
+    OP_REQUIRES(
+        context, input_in.dim_size(num_dims - 1) > n,
+        errors::InvalidArgument("Input must have at least n+1 columns"));
 
     // std::nth_element only support the nth-smallest selection.
     if (reverse_) {
@@ -64,7 +65,7 @@ class NthElementOp : public OpKernel {
 
     // Assume input_shape is [d1,d2,...dk], and output_shape is [d1,d2...dk-1].
     TensorShape out_shape;
-    for (int i = 0; i < num_dims-1; ++i) {
+    for (int i = 0; i < num_dims - 1; ++i) {
       out_shape.AddDim(input_in.dim_size(i));
     }
     Tensor* output_tensor = nullptr;
@@ -83,32 +84,28 @@ namespace functor {
 
 template <typename T>
 struct NthElementFunctor<CPUDevice, T> {
-  void operator() (OpKernelContext* context,
-                   const Tensor& input_tensor,
-                   Tensor& output_tensor,
-                   int n,
-                   bool reverse) {
+  void operator()(OpKernelContext* context, const Tensor& input_tensor,
+                  Tensor& output_tensor, int n, bool reverse) {
     const T* input = input_tensor.flat<T>().data();
     T* output = output_tensor.flat<T>().data();
 
     // Assume input_shape is [d1,d2,...dk], and output_shape is [d1,d2...dk-1],
     // then num_rows = d1*d2...dk-1, last_dim = dk.
     const int num_rows = output_tensor.NumElements();
-    const int last_dim = input_tensor.dim_size(input_tensor.dims()-1);
+    const int last_dim = input_tensor.dim_size(input_tensor.dims() - 1);
 
     // Allocate each row to different shard.
-    auto SubNthElement = [&, input, output, last_dim, n](int start,
-                                                         int limit) {
+    auto SubNthElement = [&, input, output, last_dim, n](int start, int limit) {
       // std::nth_element would rearrange the array, so we need a new buffer.
       std::vector<T> buf(last_dim);
 
       for (int b = start; b < limit; ++b) {
         // Copy from one row of elements to buffer
         const T* input_start = input + b * last_dim;
-        const T* input_end = input + (b+1) * last_dim;
+        const T* input_end = input + (b + 1) * last_dim;
         std::copy(input_start, input_end, buf.begin());
 
-        std::nth_element(buf.begin(), buf.begin()+n, buf.end());
+        std::nth_element(buf.begin(), buf.begin() + n, buf.end());
         // The element placed in the nth position is exactly the element that
         // would occur in this position if the range was fully sorted.
         output[b] = buf[n];
@@ -116,9 +113,9 @@ struct NthElementFunctor<CPUDevice, T> {
     };
 
     auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
-    // The average time complexity of partition-based nth_element (BFPRT) is O(n),
-    // althought the worst time complexity could be O(n^2).
-    // Here, 20 is a empirical factor of cost_per_unit.
+    // The average time complexity of partition-based nth_element (BFPRT) is
+    // O(n), althought the worst time complexity could be O(n^2). Here, 20 is a
+    // empirical factor of cost_per_unit.
     Shard(worker_threads.num_threads, worker_threads.workers, num_rows,
           20 * last_dim, SubNthElement);
   }
@@ -126,7 +123,6 @@ struct NthElementFunctor<CPUDevice, T> {
 
 }  // namespace functor
 
-
 #define REGISTER_NTHOP(T)                                           \
   REGISTER_KERNEL_BUILDER(                                          \
       Name("NthElement").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
@@ -136,4 +132,3 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_NTHOP);
 #undef REGISTER_NTHOP
 
 }  // end namespace tensorflow
-
diff --git a/tensorflow/core/kernels/nth_element_op.h b/tensorflow/core/kernels/nth_element_op.h
index 11a6c996b093fa7255a230122f64eb1054789453..e7d25daecc74a6d7b178034d5d78776a390ffe04 100644
--- a/tensorflow/core/kernels/nth_element_op.h
+++ b/tensorflow/core/kernels/nth_element_op.h
@@ -26,10 +26,8 @@ namespace functor {
 
 template <typename Device, typename T>
 struct NthElementFunctor {
-  void operator() (OpKernelContext* context,
-                   const Tensor& input_tensor,
-                   Tensor& output_tensor,
-                   int n);
+  void operator()(OpKernelContext* context, const Tensor& input_tensor,
+                  Tensor& output_tensor, int n);
 };
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/one_hot_op_gpu.cu.cc b/tensorflow/core/kernels/one_hot_op_gpu.cu.cc
index 49fd4bdebad420d8e848b0491a764d976f4557cd..647515ae38ab5530b69fa135257584eea531d46c 100644
--- a/tensorflow/core/kernels/one_hot_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/one_hot_op_gpu.cu.cc
@@ -19,16 +19,16 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/one_hot_op.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/one_hot_op.h"
 
 namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-#define DEFINE_GPU_SPEC_INDEX(T, TI)                   \
-  template class generator::OneGenerator<T, TI>;       \
+#define DEFINE_GPU_SPEC_INDEX(T, TI)             \
+  template class generator::OneGenerator<T, TI>; \
   template struct functor::OneHot<GPUDevice, T, TI>;
 
 #define DEFINE_GPU_SPEC(T)         \
diff --git a/tensorflow/core/kernels/ops_util_test.cc b/tensorflow/core/kernels/ops_util_test.cc
index 9d53882deef89230bd39d8318f11d84269406f20..13427d71ff6841a85c31d3bf42c038f6413c1fe6 100644
--- a/tensorflow/core/kernels/ops_util_test.cc
+++ b/tensorflow/core/kernels/ops_util_test.cc
@@ -218,7 +218,8 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_1_2) {
 // in_size = 3, ksize = 3, stride = 2, pad_size = 0
 TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_0) {
   bcast_struct bcast[] = {
-      {{0, 3, 3, 2, 0}, {0, 3}}, {{1, 3, 3, 2, 0}, {2, 1}},
+      {{0, 3, 3, 2, 0}, {0, 3}},
+      {{1, 3, 3, 2, 0}, {2, 1}},
   };
   for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
     VerifyBcastValues(bcast[i]);
@@ -228,7 +229,8 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_0) {
 // in_size = 3, ksize = 3, stride = 2, pad_size = 1
 TEST_F(OpsUtilTest, GetBroadcastTest3_3_2_1) {
   bcast_struct bcast[] = {
-      {{0, 3, 3, 2, 1}, {0, 2}}, {{1, 3, 3, 2, 1}, {1, 2}},
+      {{0, 3, 3, 2, 1}, {0, 2}},
+      {{1, 3, 3, 2, 1}, {1, 2}},
   };
   for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
     VerifyBcastValues(bcast[i]);
@@ -258,7 +260,8 @@ TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_0) {
 // in_size = 3, ksize = 3, stride = 3, pad_size = 1
 TEST_F(OpsUtilTest, GetBroadcastTest3_3_3_1) {
   bcast_struct bcast[] = {
-      {{0, 3, 3, 3, 1}, {0, 2}}, {{1, 3, 3, 3, 1}, {2, 1}},
+      {{0, 3, 3, 3, 1}, {0, 2}},
+      {{1, 3, 3, 3, 1}, {2, 1}},
   };
   for (size_t i = 0; i < sizeof(bcast) / sizeof(bcast[0]); ++i) {
     VerifyBcastValues(bcast[i]);
@@ -348,8 +351,8 @@ TEST_F(OpsUtilTest, Misaligned1DSlice) {
 
 TEST_F(OpsUtilTest, Aligned2DSliceOfDim0) {
 #if EIGEN_MAX_ALIGN_BYTES == 0
-  // When EIGEN_MAX_ALIGN_BYTES is 0 and the size of the first dimension is nonzero,
-  // a multidimensional tensor is always aligned.
+  // When EIGEN_MAX_ALIGN_BYTES is 0 and the size of the first dimension is
+  // nonzero, a multidimensional tensor is always aligned.
   Tensor t(DT_FLOAT, TensorShape({3, 4}));
   int64 start = 1;
   int64 end = 2;
diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc
index 814128d99ac2acb4a10cfcb2907edb735eaca382..5645275cfa98eb820b7d1e885b18894bfab17e49 100644
--- a/tensorflow/core/kernels/pack_op.cc
+++ b/tensorflow/core/kernels/pack_op.cc
@@ -36,7 +36,7 @@ typedef Eigen::GpuDevice GPUDevice;
 #endif  // GOOGLE_CUDA
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // --------------------------------------------------------------------------
 template <typename Device, typename T>
@@ -123,7 +123,7 @@ class PackOp : public OpKernel {
         ConcatSYCL<T>(c->eigen_sycl_device(), inputs_flat, &output_flat);
         return;
       }
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
       ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
     }
   }
@@ -139,7 +139,6 @@ class PackOp : public OpKernel {
 
 TF_CALL_ALL_TYPES(REGISTER_PACK);
 TF_CALL_QUANTIZED_TYPES(REGISTER_PACK);
-TF_CALL_bfloat16(REGISTER_PACK);
 
 #if defined(IS_MOBILE_PLATFORM) && !defined(SUPPORT_SELECTIVE_REGISTRATION)
 // Primarily used for SavedModel support on mobile.
@@ -157,6 +156,7 @@ REGISTER_PACK(string);
       PackOp<GPUDevice, type>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+TF_CALL_bfloat16(REGISTER_GPU);
 TF_CALL_int64(REGISTER_GPU);
 REGISTER_GPU(bool);
 #undef REGISTER_GPU
diff --git a/tensorflow/core/kernels/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/parallel_interleave_dataset_op.cc
deleted file mode 100644
index 56942a5c01f3c2be5617aa1a9e1eadea12857911..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/parallel_interleave_dataset_op.cc
+++ /dev/null
@@ -1,402 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
-#include "tensorflow/core/common_runtime/function.h"
-#include "tensorflow/core/framework/partial_tensor_shape.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/captured_function.h"
-#include "tensorflow/core/kernels/dataset_utils.h"
-#include "tensorflow/core/lib/gtl/cleanup.h"
-#include "tensorflow/core/lib/random/random.h"
-
-namespace tensorflow {
-
-namespace {
-
-// See documentation in ../ops/dataset_ops.cc for a high-level
-// description of the following op.
-
-class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
- public:
-  explicit ParallelInterleaveDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx),
-        graph_def_version_(ctx->graph_def_version()) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-  }
-
-  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
-                   DatasetBase** output) override {
-    OpInputList inputs;
-    OP_REQUIRES_OK(ctx, ctx->input_list("other_arguments", &inputs));
-    std::vector<Tensor> other_arguments;
-    other_arguments.reserve(inputs.size());
-    for (const Tensor& t : inputs) {
-      other_arguments.push_back(t);
-    }
-
-    int64 cycle_length;
-    OP_REQUIRES_OK(ctx,
-                   ParseScalarArgument(ctx, "cycle_length", &cycle_length));
-    OP_REQUIRES(ctx, cycle_length > 0,
-                errors::InvalidArgument("`cycle_length` must be > 0"));
-
-    int64 block_length;
-    OP_REQUIRES_OK(ctx,
-                   ParseScalarArgument(ctx, "block_length", &block_length));
-    OP_REQUIRES(ctx, block_length > 0,
-                errors::InvalidArgument("`block_length` must be > 0"));
-
-    bool sloppy;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "sloppy", &sloppy));
-
-    std::unique_ptr<CapturedFunction> captured_func;
-    OP_REQUIRES_OK(ctx, CapturedFunction::Create(ctx, func_, graph_def_version_,
-                                                 std::move(other_arguments),
-                                                 &captured_func));
-
-    *output = new Dataset(input, std::move(captured_func), cycle_length,
-                          block_length, sloppy, output_types_, output_shapes_);
-  }
-
- private:
-  class Dataset : public DatasetBase {
-   public:
-    Dataset(const DatasetBase* input,
-            std::unique_ptr<CapturedFunction> captured_func, int64 cycle_length,
-            int64 block_length, bool sloppy, const DataTypeVector& output_types,
-            const std::vector<PartialTensorShape>& output_shapes)
-        : input_(input),
-          captured_func_(std::move(captured_func)),
-          cycle_length_(cycle_length),
-          block_length_(block_length),
-          sloppy_(sloppy),
-          output_types_(output_types),
-          output_shapes_(output_shapes) {
-      input_->Ref();
-    }
-
-    ~Dataset() override { input_->Unref(); }
-
-    std::unique_ptr<IteratorBase> MakeIterator(
-        const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::ParallelInterleave")}));
-    }
-
-    const DataTypeVector& output_dtypes() const override {
-      return output_types_;
-    }
-    const std::vector<PartialTensorShape>& output_shapes() const override {
-      return output_shapes_;
-    }
-
-    string DebugString() override {
-      return "ParallelInterleaveDatasetOp::Dataset";
-    }
-
-   private:
-    class Iterator : public DatasetIterator<Dataset> {
-     public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
-            output_elements_(params.dataset->cycle_length_) {}
-
-      ~Iterator() override {
-        mutex_lock l(mu_);
-        cancelled_ = true;
-        // Notify all workers in case they are blocked.
-        for (int64 i = 0; i < dataset()->cycle_length_; ++i) {
-          output_elements_[i].cond_var.notify_all();
-        }
-      }
-
-      // It is implemented so that it matches the deterministic interleave
-      // unless we would block waiting for an element, at which point it skips
-      // along to the next available value.
-      Status GetNextInternal(IteratorContext* ctx,
-                             std::vector<Tensor>* out_tensors,
-                             bool* end_of_sequence) override {
-        mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(EnsureWorkerThreadsStarted(ctx));
-        const int64 num_workers = worker_threads_.size();
-        if (num_workers == 0) {
-          *end_of_sequence = true;
-          return Status::OK();
-        }
-        while (!cancelled_) {
-          // Wait for an item to become available, blocking if necessary. If we
-          // are allowed to be sloppy, we can skip over input datasets that do
-          // not have an item readily available.
-          const int64 n = dataset()->sloppy_ ? num_workers : 1LL;
-          for (int64 i = 0; i < n; ++i) {
-            int64 index = (next_index_ + i) % num_workers;
-            if (output_elements_[index].is_produced) {
-              next_index_ = index;
-              if (i == 0) {
-                block_count_++;
-                if (block_count_ == dataset()->block_length_) {
-                  next_index_ = (index + 1) % num_workers;
-                  block_count_ = 0;
-                }
-              } else {
-                block_count_ = 0;
-              }
-              // If we encounter an EoF, advance to the next iterator
-              if (output_elements_[index].end_of_sequence) {
-                output_elements_[index].is_produced = false;
-                output_elements_[index].cond_var.notify_one();
-                next_index_ = (index + 1) % num_workers;
-                block_count_ = 0;
-                i = -1;  // Restart the inner loop
-                continue;
-              }
-              *end_of_sequence = false;
-              if (output_elements_[index].output_status.ok()) {
-                output_elements_[index].output_value.swap(*out_tensors);
-              }
-              output_elements_[index].is_produced = false;
-              output_elements_[index].cond_var.notify_one();
-              return output_elements_[index].output_status;
-            }
-          }
-
-          if (num_active_threads_ == 0) {
-            // No potential for future values.
-            //
-            // Note: this condition check must occur after checking the output
-            // buffer, as its possible for there to be values in the output
-            // buffer, even if the number of live threads is zero.
-            *end_of_sequence = true;
-            return Status::OK();
-          }
-
-          // If we are not allowed to be sloppy and
-          // `worker_threads_[next_index]` has finished, advance `next_index`.
-          if (!dataset()->sloppy_ && worker_threads_[next_index_].finished) {
-            next_index_ = (next_index_ + 1) % num_workers;
-            continue;
-          }
-
-          // No values available; wait until woken up.
-          // TODO(jsimsa): Use slot-specific condition variable for
-          // coordination of elements consumption.
-          cond_var_.wait(l);
-        }
-        return errors::Cancelled(
-            "ParallelInterleaveDatasetOp::Dataset::Iterator::GetNext");
-      }
-
-     private:
-      // Internal structure to manage thread coordination. All values are
-      // guarded by the enclosing Iterator's mu_.
-      struct OutputBufferElement {
-        // The producer must set `is_produced` to `true` after
-        // `output_status` or `output_value` has been written.
-        bool is_produced = false;
-        // The producer sets `output_status` if either getting the input element
-        // or applying the function to it fails.
-        Status output_status;
-        // Reached end of sequence for the underlying iterator.
-        bool end_of_sequence = false;
-        // The output data element.
-        std::vector<Tensor> output_value;
-        // The producer thread waits on this condition variable after having
-        // produced an element. The reader thread notifies this condition
-        // variable after reading the value.
-        condition_variable cond_var;
-      };
-
-      struct ThreadStatus {
-        // The underlying thread uses `finished` to communicate to the producer
-        // that it has finished.
-        bool finished = false;
-        // The underlying thread object.
-        std::unique_ptr<Thread> thread;
-
-        explicit ThreadStatus(Thread* thread) : thread(thread) {}
-      };
-
-      Status EnsureWorkerThreadsStarted(IteratorContext* ctx)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        if (worker_threads_.empty()) {
-          for (int64 i = 0; i < dataset()->cycle_length_; ++i) {
-            // Serialize the creation of the workers and their corresponding
-            // input elements to ensure we match the standard interleave when
-            // the underlying iterators induce no delay.
-            std::vector<Tensor> args;
-            TF_RETURN_IF_ERROR(
-                input_impl_->GetNext(ctx, &args, &end_of_input_));
-            if (end_of_input_) {
-              LOG(WARNING) << "Input iterator exhausted after " << i
-                           << " elements; cannot start all "
-                           << dataset()->cycle_length_ << " worker threads.";
-              return Status::OK();
-            }
-            std::unique_ptr<IteratorBase> itr;
-            TF_RETURN_IF_ERROR(dataset::MakeIteratorFromInputElement(
-                ctx, args, i, dataset()->captured_func_.get(), prefix(), &itr));
-            worker_threads_.emplace_back(ctx->env()->StartThread(
-                {}, "worker_thread",
-                std::bind(&Iterator::WorkerThread, this,
-                          new IteratorContext(*ctx), i, itr.release())));
-            num_active_threads_ = i + 1;
-          }
-        }
-        return Status::OK();
-      }
-
-      void BlockAndUpdateOutputBuffer(mutex_lock* l, const int64 thread_index,
-                                      const Status& status,
-                                      bool end_of_sequence,
-                                      std::vector<Tensor>* out_tensors)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        // We have produced an element; push it into the output buffer
-        // when space is available.
-        while (!cancelled_ && output_elements_[thread_index].is_produced) {
-          output_elements_[thread_index].cond_var.wait(*l);
-        }
-        if (cancelled_) {
-          return;
-        }
-        output_elements_[thread_index].is_produced = true;
-        output_elements_[thread_index].output_status = status;
-        output_elements_[thread_index].end_of_sequence = end_of_sequence;
-        if (status.ok()) {
-          output_elements_[thread_index].output_value.swap(*out_tensors);
-        } else {
-          output_elements_[thread_index].output_value.clear();
-        }
-        cond_var_.notify_one();
-      }
-
-      // Races to produce elements into the output queue buffers.
-      void WorkerThread(IteratorContext* ctx_ptr, const int64 thread_index,
-                        IteratorBase* out_iterator_ptr) {
-        // std::function arguments are copy-constructable, so we pass raw
-        // pointers, and then immediately wrap them to ensure correct ownership.
-        std::unique_ptr<IteratorContext> ctx(ctx_ptr);
-        std::unique_ptr<IteratorBase> out_iterator(out_iterator_ptr);
-        auto cleanup = gtl::MakeCleanup([this, thread_index] {
-          mutex_lock l(mu_);
-          worker_threads_[thread_index].finished = true;
-          num_active_threads_--;
-          cond_var_.notify_all();
-        });
-        while (true) {
-          // Attempt to produce an element.
-          bool end_of_out_itr_input = false;
-          std::vector<Tensor> out_tensors;
-          Status element_status = out_iterator->GetNext(ctx.get(), &out_tensors,
-                                                        &end_of_out_itr_input);
-          // Handle output.
-          {
-            mutex_lock l(mu_);
-            BlockAndUpdateOutputBuffer(&l, thread_index, element_status,
-                                       end_of_out_itr_input, &out_tensors);
-            if (end_of_out_itr_input) {
-              // We have exhausted our current iterator; get a new iterator;
-              // loop to handle errors.
-              while (!cancelled_) {
-                if (end_of_input_) {
-                  // No more iterator inputs; we're done!
-                  return;
-                }
-                std::vector<Tensor> args;
-                // BlockAndUpdateOutputBuffer() sequences calls to
-                // input_impl_->GetNext when the out_iterator doesn't cause
-                // slopping.
-                Status input_status =
-                    input_impl_->GetNext(ctx.get(), &args, &end_of_input_);
-                if (end_of_input_) {
-                  // No more elements to produce, stop the worker thread.
-                  return;
-                }
-                if (input_status.ok()) {
-                  input_status = dataset::MakeIteratorFromInputElement(
-                      ctx.get(), args, thread_index,
-                      dataset()->captured_func_.get(), prefix(), &out_iterator);
-                }
-                if (input_status.ok()) {
-                  // Successfully have a new out_iterator; restart the outer
-                  // loop to produce an element.
-                  break;
-                }
-
-                // We encountered an error; push the error to the output buffer.
-                BlockAndUpdateOutputBuffer(&l, thread_index, input_status,
-                                           /* end_of_sequence = */ false,
-                                           &out_tensors);
-              }
-            }
-
-            // Check if we should exit.
-            if (cancelled_) {
-              return;
-            }
-          }
-        }
-      }
-
-      // Mutex & condition variable to guard mutable iterator internals and
-      // coordinate among worker threads and client thread[s].
-      mutex mu_;
-      condition_variable cond_var_;
-      // The iterator producing elements which are converted to datasets by
-      // the dataset()->captured_func_ then interleaved together.
-      const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
-      // Whether the input_impl_ can produce future elements.
-      bool end_of_input_ GUARDED_BY(mu_) = false;
-      // The buffer of elements to be produced. Each worker thread operates
-      // on a single OutputBufferElement.
-      std::vector<OutputBufferElement> output_elements_ GUARDED_BY(mu_);
-      // The index into output_elements_ for next element to produce.
-      size_t next_index_ GUARDED_BY(mu_) = 0;
-      // The number of items produced so far within the block
-      size_t block_count_ GUARDED_BY(mu_) = 0;
-      // Number of active threads.
-      size_t num_active_threads_ GUARDED_BY(mu_) = 0;
-      // Flag to instruct the worker threads to exit.
-      bool cancelled_ GUARDED_BY(mu_) = false;
-      // Pointers to the worker threads. This must be last to ensure the
-      // threads have exited before any other members are deallocated.
-      // TODO(b/65178177): Avoid allocating additional threads.
-      std::vector<ThreadStatus> worker_threads_ GUARDED_BY(mu_);
-    };
-
-    const DatasetBase* const input_;
-    const std::unique_ptr<CapturedFunction> captured_func_;
-    const int64 cycle_length_;
-    const int64 block_length_;
-    const bool sloppy_;
-    const DataTypeVector output_types_;
-    const std::vector<PartialTensorShape> output_shapes_;
-  };
-
-  const int graph_def_version_;
-  DataTypeVector output_types_;
-  std::vector<PartialTensorShape> output_shapes_;
-  NameAttrList func_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("ParallelInterleaveDataset").Device(DEVICE_CPU),
-                        ParallelInterleaveDatasetOp);
-
-}  // namespace
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/parallel_map_dataset_op.cc b/tensorflow/core/kernels/parallel_map_dataset_op.cc
deleted file mode 100644
index 2be87f4bde6f28596213433fe287d351ccf0c721..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/parallel_map_dataset_op.cc
+++ /dev/null
@@ -1,244 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <deque>
-
-#include "tensorflow/core/kernels/dataset.h"
-
-#include "tensorflow/core/common_runtime/function.h"
-#include "tensorflow/core/framework/partial_tensor_shape.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/lib/random/random.h"
-
-#include "tensorflow/core/kernels/captured_function.h"
-
-namespace tensorflow {
-
-namespace {
-
-// See documentation in ../ops/dataset_ops.cc for a high-level
-// description of the following op.
-
-class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
- public:
-  explicit ParallelMapDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx),
-        graph_def_version_(ctx->graph_def_version()) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-  }
-
- protected:
-  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
-                   DatasetBase** output) override {
-    OpInputList inputs;
-    OP_REQUIRES_OK(ctx, ctx->input_list("other_arguments", &inputs));
-    std::vector<Tensor> other_arguments;
-    other_arguments.reserve(inputs.size());
-    for (const Tensor& t : inputs) {
-      other_arguments.push_back(t);
-    }
-
-    int32 num_parallel_calls;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls",
-                                            &num_parallel_calls));
-    OP_REQUIRES(ctx, num_parallel_calls > 0,
-                errors::InvalidArgument(
-                    "num_parallel_calls must be greater than zero."));
-
-    std::unique_ptr<CapturedFunction> captured_func;
-    OP_REQUIRES_OK(ctx, CapturedFunction::Create(ctx, func_, graph_def_version_,
-                                                 std::move(other_arguments),
-                                                 &captured_func));
-
-    *output = new Dataset(input, num_parallel_calls, output_types_,
-                          output_shapes_, std::move(captured_func));
-  }
-
- private:
-  class Dataset : public DatasetBase {
-   public:
-    Dataset(const DatasetBase* input, int32 num_parallel_calls,
-            const DataTypeVector& output_types,
-            const std::vector<PartialTensorShape>& output_shapes,
-            std::unique_ptr<CapturedFunction> captured_func)
-        : input_(input),
-          num_parallel_calls_(num_parallel_calls),
-          output_types_(output_types),
-          output_shapes_(output_shapes),
-          captured_func_(std::move(captured_func)) {
-      input_->Ref();
-    }
-
-    ~Dataset() override { input_->Unref(); }
-
-    std::unique_ptr<IteratorBase> MakeIterator(
-        const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::ParallelMap")}));
-    }
-
-    const DataTypeVector& output_dtypes() const override {
-      return output_types_;
-    }
-
-    const std::vector<PartialTensorShape>& output_shapes() const override {
-      return output_shapes_;
-    }
-
-    string DebugString() override { return "ParallelMapDatasetOp::Dataset"; }
-
-   private:
-    class Iterator : public DatasetIterator<Dataset> {
-     public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
-            invocation_results_(params.dataset->num_parallel_calls_) {}
-
-      ~Iterator() override {
-        // TODO(mrry): Replace this cancellation logic with a
-        // CancellationManager. The syntax would be more heavyweight,
-        // but it would be possible to thread a cancellation manager
-        // through the IteratorContext to upstream,
-        // potentially-blocking iterators, when we add these.
-        {
-          mutex_lock l(mu_);
-          for (size_t i = 0; i < dataset()->num_parallel_calls_; ++i) {
-            if (invocation_results_[i].notification) {
-              invocation_results_[i].notification->WaitForNotification();
-            }
-          }
-        }
-      }
-
-      Status GetNextInternal(IteratorContext* ctx,
-                             std::vector<Tensor>* out_tensors,
-                             bool* end_of_sequence) override {
-        mutex_lock l(mu_);
-
-        // Ensure that there are `dataset()->num_parallel_calls_`
-        // invocations of `func_` outstanding at once.
-        while (!end_of_input_ && (num_inputs_consumed_ - num_outputs_consumed_ <
-                                  dataset()->num_parallel_calls_)) {
-          InvokeFunctionLocked(ctx);
-        }
-
-        if (end_of_input_ && num_inputs_consumed_ == num_outputs_consumed_) {
-          *end_of_sequence = true;
-          return Status::OK();
-        }
-
-        // Read the next result out of `invocation_results_`, which
-        // acts as a circular buffer.
-        const size_t result_index =
-            num_outputs_consumed_ % dataset()->num_parallel_calls_;
-        InvocationResult* result = &invocation_results_[result_index];
-        *end_of_sequence = false;
-        if (result->notification) {
-          result->notification->WaitForNotification();
-          if (result->status.ok()) {
-            std::swap(*out_tensors, result->return_values);
-          }
-        }
-        ++num_outputs_consumed_;
-        return result->status;
-      }
-
-     private:
-      struct InvocationResult {
-        Status status;
-        std::unique_ptr<Notification> notification;
-        std::vector<Tensor> return_values;
-      };
-
-      void InvokeFunctionLocked(IteratorContext* ctx)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        DCHECK(!end_of_input_);
-        DCHECK(num_inputs_consumed_ - num_outputs_consumed_ <
-               dataset()->num_parallel_calls_);
-
-        // The result of invoking the function will be written into the next
-        // slot in `invocation_results_`, which acts as a circular buffer.
-        const size_t result_index =
-            num_inputs_consumed_ % dataset()->num_parallel_calls_;
-        InvocationResult* result = &invocation_results_[result_index];
-        *result = InvocationResult();
-
-        // Get the next input element.
-        std::vector<Tensor> input_element;
-        result->status =
-            input_impl_->GetNext(ctx, &input_element, &end_of_input_);
-        if (end_of_input_) {
-          result->status = errors::OutOfRange("");
-        } else {
-          ++num_inputs_consumed_;
-        }
-
-        if (result->status.ok()) {
-          // Call `func_(input_element)`, store the result in
-          // `result->return_values`, and notify `result->notification`
-          // to unblock a consumer.
-          result->notification.reset(new Notification);
-
-          FunctionLibraryRuntime::Options opts;
-          opts.step_id = CapturedFunction::generate_step_id();
-          ScopedStepContainer* step_container = new ScopedStepContainer(
-              opts.step_id, [this, ctx](const string& name) {
-                dataset()
-                    ->captured_func_->resource_manager()
-                    ->Cleanup(name)
-                    .IgnoreError();
-              });
-          opts.step_container = step_container;
-          opts.runner = ctx->runner();
-          dataset()->captured_func_->RunAsync(
-              opts, input_element, &result->return_values,
-              [result, step_container, result_index](Status ret_status) {
-                delete step_container;
-                result->status.Update(ret_status);
-                result->notification->Notify();
-              });
-        }
-      }
-
-      mutex mu_;
-      const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
-      std::vector<InvocationResult> invocation_results_ GUARDED_BY(mu_);
-      bool end_of_input_ GUARDED_BY(mu_) = false;
-      int64 num_inputs_consumed_ GUARDED_BY(mu_) = 0;
-      int64 num_outputs_consumed_ GUARDED_BY(mu_) = 0;
-    };
-
-    const DatasetBase* const input_;
-    const NameAttrList func_;
-    const int32 num_parallel_calls_;
-    const DataTypeVector output_types_;
-    const std::vector<PartialTensorShape> output_shapes_;
-    const std::unique_ptr<CapturedFunction> captured_func_;
-  };
-
-  const int graph_def_version_;
-  DataTypeVector output_types_;
-  std::vector<PartialTensorShape> output_shapes_;
-  NameAttrList func_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("ParallelMapDataset").Device(DEVICE_CPU),
-                        ParallelMapDatasetOp);
-
-}  // namespace
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
index b232ba16a76877b4d9f0e8c24e7ccd17a9bc0856..0ab9ff9f650e137017b49d5d279f1a28ff45fa29 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
@@ -95,9 +95,10 @@ struct TruncatedNormalFunctor<CPUDevice, T> {
         int64 sample = b * samples_per_batch;
 
         // On GPU, this check will just fill samples with NAN if it fails.
-        OP_REQUIRES(ctx, stddev > T(0) && minval < maxval &&
-                             (Eigen::numext::isfinite(minval) ||
-                              Eigen::numext::isfinite(maxval)),
+        OP_REQUIRES(ctx,
+                    stddev > T(0) && minval < maxval &&
+                        (Eigen::numext::isfinite(minval) ||
+                         Eigen::numext::isfinite(maxval)),
                     errors::InvalidArgument("Invalid parameters"));
 
         int numIterations = 0;
@@ -118,8 +119,9 @@ struct TruncatedNormalFunctor<CPUDevice, T> {
         // Determine the method to use.
         const T sqrtFactor = Eigen::numext::sqrt((normMin * normMin) + T(4));
         const T cutoff =
-            T(2) * Eigen::numext::exp(
-                       T(0.5) + (normMin * (normMin - sqrtFactor)) / T(4)) /
+            T(2) *
+            Eigen::numext::exp(T(0.5) +
+                               (normMin * (normMin - sqrtFactor)) / T(4)) /
             (normMin + sqrtFactor);
         const T diff = normMax - normMin;
         if (diff < cutoff) {
@@ -309,30 +311,34 @@ class ParameterizedTruncatedNormalOp : public OpKernel {
     } else {
       // Parameters must be broadcastable to the shape [num_batches].
       OP_REQUIRES(
-          ctx, TensorShapeUtils::IsScalar(means_tensor.shape()) ||
-                   means_tensor.dim_size(0) == 1 ||
-                   means_tensor.dim_size(0) == num_batches,
+          ctx,
+          TensorShapeUtils::IsScalar(means_tensor.shape()) ||
+              means_tensor.dim_size(0) == 1 ||
+              means_tensor.dim_size(0) == num_batches,
           errors::InvalidArgument(
               "Input means should have length 1 or shape[0], got shape: ",
               means_tensor.shape().DebugString()));
       OP_REQUIRES(
-          ctx, TensorShapeUtils::IsScalar(stddevs_tensor.shape()) ||
-                   stddevs_tensor.dim_size(0) == 1 ||
-                   stddevs_tensor.dim_size(0) == num_batches,
+          ctx,
+          TensorShapeUtils::IsScalar(stddevs_tensor.shape()) ||
+              stddevs_tensor.dim_size(0) == 1 ||
+              stddevs_tensor.dim_size(0) == num_batches,
           errors::InvalidArgument(
               "Input stddevs should have length 1 or shape[0], got shape: ",
               stddevs_tensor.shape().DebugString()));
       OP_REQUIRES(
-          ctx, TensorShapeUtils::IsScalar(minvals_tensor.shape()) ||
-                   minvals_tensor.dim_size(0) == 1 ||
-                   minvals_tensor.dim_size(0) == num_batches,
+          ctx,
+          TensorShapeUtils::IsScalar(minvals_tensor.shape()) ||
+              minvals_tensor.dim_size(0) == 1 ||
+              minvals_tensor.dim_size(0) == num_batches,
           errors::InvalidArgument(
               "Input minvals should have length 1 or shape[0], got shape: ",
               minvals_tensor.shape().DebugString()));
       OP_REQUIRES(
-          ctx, TensorShapeUtils::IsScalar(maxvals_tensor.shape()) ||
-                   maxvals_tensor.dim_size(0) == 1 ||
-                   maxvals_tensor.dim_size(0) == num_batches,
+          ctx,
+          TensorShapeUtils::IsScalar(maxvals_tensor.shape()) ||
+              maxvals_tensor.dim_size(0) == 1 ||
+              maxvals_tensor.dim_size(0) == num_batches,
           errors::InvalidArgument(
               "Input maxvals should have length 1 or shape[0], got shape: ",
               maxvals_tensor.shape().DebugString()));
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
index 933de65c15a772154ce439cc54489c4a29c42ea5..661d47d925d1143d88b88d73b4ca51c654b43498 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
@@ -29,7 +29,7 @@ limitations under the License.
 #include "tensorflow/core/lib/random/random_distributions.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 
-#ifdef COMPILER_MSVC
+#if defined(_MSC_VER) && !defined(__clang__)
 // msvc does not support unroll. One could try the loop pragma but we need to
 // take a closer look if this generates better code in this case. For now let
 // the compiler take care of it.
@@ -202,12 +202,13 @@ struct TruncatedNormalFunctor<GPUDevice, T> {
                   typename TTypes<T>::Flat output) {
     const auto config = GetCudaLaunchConfig(num_elements, d);
 
-    TruncatedNormalKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        gen, output.data(), num_batches, samples_per_batch, num_elements,
-        means.data(), means.dimension(0) == 1, stddevs.data(),
-        stddevs.dimension(0) == 1, minvals.data(), minvals.dimension(0) == 1,
-        maxvals.data(), maxvals.dimension(0) == 1, kMaxIterations);
+    TruncatedNormalKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            gen, output.data(), num_batches, samples_per_batch, num_elements,
+            means.data(), means.dimension(0) == 1, stddevs.data(),
+            stddevs.dimension(0) == 1, minvals.data(),
+            minvals.dimension(0) == 1, maxvals.data(),
+            maxvals.dimension(0) == 1, kMaxIterations);
   };
 };
 
diff --git a/tensorflow/core/kernels/parse_tensor_op.cc b/tensorflow/core/kernels/parse_tensor_op.cc
index 6b599612ad7fde0bac44282521be26581aa752b8..8e175fe8d4b4fa203809e5871bfd301188c985da 100644
--- a/tensorflow/core/kernels/parse_tensor_op.cc
+++ b/tensorflow/core/kernels/parse_tensor_op.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/framework/register_types.h"
 
 namespace tensorflow {
 
@@ -92,7 +91,6 @@ class SerializeTensorOp : public OpKernel {
       Name("SerializeTensor").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
       SerializeTensorOp<T>);
 TF_CALL_ALL_TYPES(REGISTER)
-TF_CALL_variant(REGISTER)
 #undef REGISTER
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/pooling_ops_3d.cc b/tensorflow/core/kernels/pooling_ops_3d.cc
index a406317213f51d557d7b5a9942260156c0fe6369..01bcfede1e8d1f1a71059c5171f8a4d7290d7a5b 100644
--- a/tensorflow/core/kernels/pooling_ops_3d.cc
+++ b/tensorflow/core/kernels/pooling_ops_3d.cc
@@ -258,7 +258,7 @@ struct LaunchMaxPooling3dGradOp<CPUDevice, T> {
           Eigen::array<int, 5> bcast = {1, csize, rsize, psize, 1};
 #else
           Eigen::IndexList<Eigen::type2index<1>, int, int, int,
-                           Eigen::type2index<1> >
+                           Eigen::type2index<1>>
               bcast;
           bcast.set(1, csize);
           bcast.set(2, rsize);
@@ -431,7 +431,7 @@ struct LaunchAvgPooling3dGradOp<CPUDevice, T> {
           Eigen::array<int, 5> bcast = {1, csize, rsize, psize, 1};
 #else
           Eigen::IndexList<Eigen::type2index<1>, int, int, int,
-                           Eigen::type2index<1> >
+                           Eigen::type2index<1>>
               bcast;
           bcast.set(1, csize);
           bcast.set(2, rsize);
@@ -833,7 +833,7 @@ TF_CALL_float(REGISTER_GPU_KERNELS) TF_CALL_half(REGISTER_GPU_KERNELS)
 
 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T)
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS)
+    TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS)
 #undef REGISTER_SYCL_KERNELS
 #endif  // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/pooling_ops_3d_sycl.h b/tensorflow/core/kernels/pooling_ops_3d_sycl.h
index c1bc5af4986ee7102929af3e9b37a7301830de0e..b4bead2456d58c636301678d8a81864b25e3e85b 100644
--- a/tensorflow/core/kernels/pooling_ops_3d_sycl.h
+++ b/tensorflow/core/kernels/pooling_ops_3d_sycl.h
@@ -281,12 +281,11 @@ class MaxPool3DGradSYCL {
 
     const T* input_data_n =
         input_data + n * p_.in_planes_ * p_.in_cols_ * p_.in_rows_ * p_.depth_;
-    const T* output_data_n =
-        output_data +
-        n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_;
-    const T* input_backprop_n =
-        input_backprop +
-        n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_;
+    const T* output_data_n = output_data + n * p_.out_planes_ * p_.out_cols_ *
+                                               p_.out_rows_ * p_.depth_;
+    const T* input_backprop_n = input_backprop + n * p_.out_planes_ *
+                                                     p_.out_cols_ *
+                                                     p_.out_rows_ * p_.depth_;
     for (int poolp = poolpstart; poolp < poolpend; ++poolp) {
       int pstart = poolp * p_.stride_planes_ - p_.pad_planes_;
       const int pend = std::min(pstart + p_.window_planes_, p_.in_planes_);
@@ -678,9 +677,9 @@ class AvgPool3DGradSYCL {
     n /= p_.in_planes_;
 
     T gradient = T(0);
-    const T* input_backprop_n =
-        input_backprop +
-        n * p_.out_planes_ * p_.out_cols_ * p_.out_rows_ * p_.depth_;
+    const T* input_backprop_n = input_backprop + n * p_.out_planes_ *
+                                                     p_.out_cols_ *
+                                                     p_.out_rows_ * p_.depth_;
     for (int poolp = poolpstart; poolp < poolpend; ++poolp) {
       int pstart = poolp * p_.stride_planes_ - p_.pad_planes_;
       const int pend = std::min(pstart + p_.window_planes_, p_.in_planes_);
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
index ac90f67ce0bb8d9acffc3868acffc1cdfbe0f492..d4241b58090e4e4c1300fdcdc0e46411aa5a88f3 100644
--- a/tensorflow/core/kernels/pooling_ops_common.cc
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -147,6 +147,9 @@ void DnnPoolingOp<T>::Compute(
   Tensor* tensor_out = nullptr;
   OP_REQUIRES_OK(context,
                  context->allocate_output(0, tensor_out_shape, &tensor_out));
+  if (tensor_in.shape().num_elements() == 0) {
+    return;
+  }
 
   PoolParameters params{context, size,        stride,
                         padding, data_format, tensor_in.shape()};
@@ -219,7 +222,7 @@ void DnnPoolingOp<T>::Compute(
                                       output_desc, &output_data)
                     .ok();
   OP_REQUIRES(context, status,
-              errors::Internal("cudnn PoolBackward launch failed"));
+              errors::Internal("cudnn PoolForward launch failed"));
 
   if (data_format == FORMAT_NHWC) {
     /// Transform the output data from NCHW back to NHWC
@@ -247,6 +250,9 @@ void DnnPoolingGradOp<T>::Compute(
   Tensor* input_backprop = nullptr;
   OP_REQUIRES_OK(context,
                  context->allocate_output(0, tensor_in_shape, &input_backprop));
+  if (tensor_in_shape.num_elements() == 0) {
+    return;
+  }
 
   PoolParameters params{context, size,        stride,
                         padding, data_format, tensor_in_shape};
diff --git a/tensorflow/core/kernels/pooling_ops_common.h b/tensorflow/core/kernels/pooling_ops_common.h
index 75a6fc371b4585695def1f15e7983be37417acf6..fc7cb437b8f583a811427deaf52a94d9ef996f37 100644
--- a/tensorflow/core/kernels/pooling_ops_common.h
+++ b/tensorflow/core/kernels/pooling_ops_common.h
@@ -86,7 +86,9 @@ class MaxPoolingOp : public OpKernel {
                   errors::InvalidArgument("Invalid data format"));
       OP_REQUIRES(
           context, data_format_ == FORMAT_NHWC,
-          errors::InvalidArgument("Default MaxPoolingOp only supports NHWC."));
+          errors::InvalidArgument("Default MaxPoolingOp only supports NHWC ",
+                                  "on device type ",
+                                  DeviceTypeString(context->device_type())));
     } else {
       data_format_ = FORMAT_NHWC;
     }
@@ -193,7 +195,6 @@ class MaxPoolingOp : public OpKernel {
       //    and updates the corresponding column(s) in output_as_matrix with the
       //    max value.
       auto shard = [&params, &in_mat, &out_mat](int64 start, int64 limit) {
-
         const int32 in_rows = params.tensor_in_rows;
         const int32 in_cols = params.tensor_in_cols;
         const int32 pad_rows = params.pad_rows;
@@ -441,7 +442,6 @@ class MaxPoolingV2Op : public OpKernel {
       //    and updates the corresponding column(s) in output_as_matrix with the
       //    max value.
       auto shard = [&params, &in_mat, &out_mat](int64 start, int64 limit) {
-
         const int32 in_rows = params.tensor_in_rows;
         const int32 in_cols = params.tensor_in_cols;
         const int32 pad_rows = params.pad_rows;
diff --git a/tensorflow/core/kernels/population_count_op.h b/tensorflow/core/kernels/population_count_op.h
index de89582e139b03de48719749ef29a0d3bb638e0e..2c9812967366d8b943715f08caf07ce5804877ca 100644
--- a/tensorflow/core/kernels/population_count_op.h
+++ b/tensorflow/core/kernels/population_count_op.h
@@ -14,8 +14,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_
+#ifndef TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_
+#define TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -35,4 +35,4 @@ struct PopulationCount {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_
+#endif  // TENSORFLOW_CORE_KERNELS_POPULATION_COUNT_OP_H_
diff --git a/tensorflow/core/kernels/priority_queue.cc b/tensorflow/core/kernels/priority_queue.cc
index 5c487edbe3357753d44c21432765dce7b6d29e60..bab94f7f0ad1fd7609761aaabc4f76ae6eafeb7b 100644
--- a/tensorflow/core/kernels/priority_queue.cc
+++ b/tensorflow/core/kernels/priority_queue.cc
@@ -123,7 +123,7 @@ Status PriorityQueue::GetElementComponentFromBatch(
   TF_RETURN_IF_ERROR(ctx->allocate_persistent(
       tuple[component].dtype(), element_shape, out_tensor, &element_access));
   TF_RETURN_IF_ERROR(
-      CopySliceToElement(tuple[component], element_access, index));
+      batch_util::CopySliceToElement(tuple[component], element_access, index));
   return Status::OK();
 }
 
diff --git a/tensorflow/core/kernels/quantization_utils.h b/tensorflow/core/kernels/quantization_utils.h
index 7c18496357c468322313b7b9064cfd7b3a22661a..9fafe6bb65406a1dbcbcb63624fe58019f9e83a3 100644
--- a/tensorflow/core/kernels/quantization_utils.h
+++ b/tensorflow/core/kernels/quantization_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_QUANTIZATION_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_QUANTIZATION_UTILS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_QUANTIZATION_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_QUANTIZATION_UTILS_H_
 
 #define EIGEN_USE_THREADS
 
@@ -956,4 +956,4 @@ class TensorflowGemmContext : public gemmlowp::MultiThreadGemmContextBase {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_QUANTIZATION_UTILS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_QUANTIZATION_UTILS_H_
diff --git a/tensorflow/core/kernels/quantization_utils_test.cc b/tensorflow/core/kernels/quantization_utils_test.cc
index a73581fbbc1e9db4af621b109496088ba2c7c7de..176720c22cc54ea8d9b79dacfc77f6cd2532f93a 100644
--- a/tensorflow/core/kernels/quantization_utils_test.cc
+++ b/tensorflow/core/kernels/quantization_utils_test.cc
@@ -385,8 +385,12 @@ void TestQuantizedToFloatInPlaceUsingEigen(
   // These are the float values we're going to test the conversions on.
   typedef std::pair<float, float> FPair;
   for (FPair min_and_max : std::vector<FPair>{
-           FPair(-255.0f, 255.0f), FPair(-1.0f, 1.0f), FPair(-1.0f, 255.0f),
-           FPair(0.0f, 1e6), FPair(0.0f, 1.0f), FPair(-31.0f, 13.0f),
+           FPair(-255.0f, 255.0f),
+           FPair(-1.0f, 1.0f),
+           FPair(-1.0f, 255.0f),
+           FPair(0.0f, 1e6),
+           FPair(0.0f, 1.0f),
+           FPair(-31.0f, 13.0f),
            FPair(-5.89505e+08, 5.89505e+08),
        }) {
     const float f_min = min_and_max.first;
@@ -743,7 +747,8 @@ template <int POW>
 void TestDivide64x2Pow(int64 val, int64 ref) {
   const int64x2_t val_64x2 = vmovq_n_s64(val);
   const int64x2_t ret = Divide64x2Pow<POW>(val_64x2);
-  int64 rets[2];
+  // TODO(b/70947959) Change back to int64 when possible
+  int64_t rets[2];
   vst1q_s64(rets, ret);
   EXPECT_EQ(rets[0], ref);
   EXPECT_EQ(rets[1], ref);
@@ -754,7 +759,8 @@ template <int POW>
 void TestDivide64x2PowRound(int64 val, int64 ref) {
   const int64x2_t val_64x2 = vmovq_n_s64(val);
   const int64x2_t shifted = Divide64x2PowRound<POW>(val_64x2);
-  int64 rets[2];
+  // TODO(b/70947959) Change back to int64 when possible
+  int64_t rets[2];
   vst1q_s64(rets, shifted);
   EXPECT_EQ(rets[0], ref) << "in = " << val << ", " << POW
                           << ", act = " << rets[0] << ", ref = " << ref;
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op.h b/tensorflow/core/kernels/quantize_and_dequantize_op.h
index 1363c7e325b6a251d97039df3de271e92f59f6c0..3b09ea2527d8b401941c6ef0951c620edd0c5217 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op.h
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op.h
@@ -71,7 +71,8 @@ struct QuantizeAndDequantizeOneScaleImpl {
         out.device(d) =
             ((input.cwiseMin(max_range).cwiseMax(min_range) - min_range) *
                  scale +
-             T(0.5)).floor() *
+             T(0.5))
+                    .floor() *
                 inverse_scale +
             min_range;
       } else {
diff --git a/tensorflow/core/kernels/quantize_op_test.cc b/tensorflow/core/kernels/quantize_op_test.cc
index d2cc55a94ddd7b3e31a5cfc841de25519abe2746..57982bdf76e3969b31f4ee73cbf47c564b2b53e6 100644
--- a/tensorflow/core/kernels/quantize_op_test.cc
+++ b/tensorflow/core/kernels/quantize_op_test.cc
@@ -250,7 +250,8 @@ TEST_F(QuantizedOpTest, QuantizeV2_32Bit) {
   Tensor expected(allocator(), DT_QINT32, TensorShape({element_count}));
   test::FillValues<qint32>(&expected,
                            {
-                               std::numeric_limits<int32>::min(), 0,
+                               std::numeric_limits<int32>::min(),
+                               0,
                                static_cast<int32>(1.0f * (1 << 23)),
                                static_cast<int32>(1.25f * (1 << 23)),
                                static_cast<int32>(1.75f * (1 << 23)),
diff --git a/tensorflow/core/kernels/quantized_add_op_test.cc b/tensorflow/core/kernels/quantized_add_op_test.cc
index 90bd145ad0c9b1da8805ecac7c49bd94c1db22ed..376fe34c4b5448ff46f3e657fead29753fb3c129 100644
--- a/tensorflow/core/kernels/quantized_add_op_test.cc
+++ b/tensorflow/core/kernels/quantized_add_op_test.cc
@@ -32,9 +32,7 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
-
-using namespace ops;  // NOLINT(build/namespaces)
-
+namespace ops {
 namespace {
 
 void TestAdd(const std::vector<int64>& x_shape,
@@ -184,8 +182,6 @@ void TimeAdd(const std::vector<int64>& x_shape,
             << ", total_duration=" << total_duration;
 }
 
-}  // namespace
-
 void TestManualScalar() {
   TestAdd(
       {10}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f}, 0.0f,
@@ -276,10 +272,12 @@ void BenchmarkVectorPlusTensor() {
   TimeAdd({100000, 100}, {100}, 1);
 }
 
-}  // end namespace tensorflow
+}  // namespace
+}  // namespace ops
+}  // namespace tensorflow
 
 #define RUN_TEST(t) \
-  TEST(QuantizedAddOpTest, t) { tensorflow::t(); }
+  TEST(QuantizedAddOpTest, t) { tensorflow::ops::t(); }
 
 RUN_TEST(TestManualScalar);
 RUN_TEST(TestManualVector);
diff --git a/tensorflow/core/kernels/quantized_batch_norm_op.cc b/tensorflow/core/kernels/quantized_batch_norm_op.cc
index 18d83b414940504fcb4e031f3304412da3baf51b..b03da7ad17fab45086438691a1013b2acf54ee87 100644
--- a/tensorflow/core/kernels/quantized_batch_norm_op.cc
+++ b/tensorflow/core/kernels/quantized_batch_norm_op.cc
@@ -16,11 +16,11 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/kernels/quantization_utils.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/quantization_utils.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/quantized_concat_op.cc b/tensorflow/core/kernels/quantized_concat_op.cc
index ee573f1bb805107299fed89df211275a1e81c35d..b03ac8e87dac8fabe0d45d8685ec4fa5fd642519 100644
--- a/tensorflow/core/kernels/quantized_concat_op.cc
+++ b/tensorflow/core/kernels/quantized_concat_op.cc
@@ -135,8 +135,8 @@ class QuantizedConcatOp : public OpKernel {
           context, in.dims() == input_dims || (input_is_scalar && in_is_scalar),
           errors::InvalidArgument(
               "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
-              input_shape.DebugString(), " vs. shape[", i, "] = ",
-              in.shape().DebugString()));
+              input_shape.DebugString(), " vs. shape[", i,
+              "] = ", in.shape().DebugString()));
       for (int j = 0; j < input_dims; ++j) {
         if (j == concat_dim) {
           continue;
@@ -145,8 +145,8 @@ class QuantizedConcatOp : public OpKernel {
             context, in.dim_size(j) == input_shape.dim_size(j),
             errors::InvalidArgument(
                 "ConcatOp : Dimensions of inputs should match: shape[0] = ",
-                input_shape.DebugString(), " vs. shape[", i, "] = ",
-                in.shape().DebugString()));
+                input_shape.DebugString(), " vs. shape[", i,
+                "] = ", in.shape().DebugString()));
       }
       if (in.NumElements() > 0) {
         int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0;
@@ -174,13 +174,13 @@ class QuantizedConcatOp : public OpKernel {
     OP_REQUIRES(context, (input_mins.size() == N),
                 errors::InvalidArgument(
                     "QuantizedConcatOp : Expected mins input list length ",
-                    input_mins.size(), " to equal values length ", N))
+                    input_mins.size(), " to equal values length ", N));
     OpInputList input_maxes;
     OP_REQUIRES_OK(context, context->input_list("input_maxes", &input_maxes));
     OP_REQUIRES(context, (input_maxes.size() == N),
                 errors::InvalidArgument(
                     "QuantizedConcatOp : Expected maxes input list length ",
-                    input_maxes.size(), " to equal values length ", N))
+                    input_maxes.size(), " to equal values length ", N));
     const int input_dims = values[0].dims();
     const TensorShape& input_shape = values[0].shape();
     OP_REQUIRES(
diff --git a/tensorflow/core/kernels/quantized_conv_ops.cc b/tensorflow/core/kernels/quantized_conv_ops.cc
index 3b0764bb9bf9ff00c71173c53cdb78b6ab3ac6ca..5b3570edff5fee4b77d02684ef3da2af1d5f14b1 100644
--- a/tensorflow/core/kernels/quantized_conv_ops.cc
+++ b/tensorflow/core/kernels/quantized_conv_ops.cc
@@ -268,13 +268,19 @@ class Im2ColConvFunctor {
     Im2ColBufferResource<T1, chunk_value_count>* im2col_buffer_resource;
     std::function<Status(Im2ColBufferResource<T1, chunk_value_count>**)>
         creator = [](Im2ColBufferResource<T1, chunk_value_count>** resource) {
+#ifdef _MSC_VER
+          // MSVC complains about the capture of chunk_value_count which oddly
+          // works fine in conv_ops_using_gemm.cc for example.
+          // Define chunk_value_count inside the lambda for now.
+          const int64 chunk_value_count =
+              (kMaxChunkSize + (sizeof(T1) - 1)) / sizeof(T1);
+#endif
           *resource = new Im2ColBufferResource<T1, chunk_value_count>();
           return Status::OK();
         };
-    OP_REQUIRES_OK(
-        context,
-        context->resource_manager()->LookupOrCreate(
-            "Conv2d", "im2col_buffer", &im2col_buffer_resource, creator));
+    OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate(
+                                "Conv2d", "im2col_buffer",
+                                &im2col_buffer_resource, creator));
     // This means that multiple ops can't be run simultaneously on different
     // threads, because we have a single shared resource. The platforms this is
     // aimed at have intra-op parallelism as their focus though, so it shouldn't
@@ -457,6 +463,19 @@ class QuantizedConv2DOp : public OpKernel {
         context, (strides_[0] == 1 && strides_[3] == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    std::vector<int32> dilations;
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations));
+    OP_REQUIRES(context, dilations.size() == 4,
+                errors::InvalidArgument("Dilations field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, dilations[1] == 1 && dilations[2] == 1,
+                errors::InvalidArgument(
+                    "Current implementation only supports dilated rate as 1 "
+                    "in the row and column dimensions."));
+    OP_REQUIRES(context, (dilations[0] == 1 && dilations[3] == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
   }
 
diff --git a/tensorflow/core/kernels/quantized_instance_norm.cc b/tensorflow/core/kernels/quantized_instance_norm.cc
index c29f534f31b524f6e1d9ec09750b6de265ec10f8..d62094cc9fad85536edba8bb3854e71870df217c 100644
--- a/tensorflow/core/kernels/quantized_instance_norm.cc
+++ b/tensorflow/core/kernels/quantized_instance_norm.cc
@@ -278,10 +278,10 @@ class QuantizedInstanceNorm : public OpKernel {
     float input_max = context->input(2).flat<float>()(0);
     float input_scale = (input_max - input_min) / 255.0f;
 
-    OP_REQUIRES(
-        context, input_min < input_max,
-        errors::InvalidArgument("input_min must be less than input_max : ",
-                                input_min, " >= ", input_max));
+    OP_REQUIRES(context, input_min < input_max,
+                errors::InvalidArgument(
+                    "input_min must be less than input_max : ", input_min,
+                    " >= ", input_max));
 
     auto input_tensor = input.tensor<quint8, 4>();
     auto N = input_tensor.dimension(0);
diff --git a/tensorflow/core/kernels/quantized_instance_norm_test.cc b/tensorflow/core/kernels/quantized_instance_norm_test.cc
index d2b15ee20bb89a28c9d7f8398435352107eb4d79..896fe046e7ef2a99e8f854340c4c786095679a6e 100644
--- a/tensorflow/core/kernels/quantized_instance_norm_test.cc
+++ b/tensorflow/core/kernels/quantized_instance_norm_test.cc
@@ -22,6 +22,8 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_testutil.h"
 
 namespace tensorflow {
+namespace ops {
+namespace {
 
 void ReferenceImpl(const quint8* inp, float inp_min, float inp_max,
                    const TensorShape& shape, float var_eps, float* out) {
@@ -78,10 +80,6 @@ void ReferenceImpl(const quint8* inp, float inp_min, float inp_max,
   }
 }
 
-using namespace ops;  // NOLINT(build/namespaces)
-
-namespace {
-
 void Expect(const Tensor& input, float x_min, float x_max,
             bool output_range_given, float give_y_min, float given_y_max) {
   Scope root = Scope::NewRootScope();
@@ -123,8 +121,6 @@ void Expect(const Tensor& input, float x_min, float x_max,
   LOG(INFO) << "max diff " << max_diff();
 }
 
-}  // end namespace
-
 void TestBasic() {
   Tensor input_tensor(DT_QUINT8, {1, 4, 4, 32});
   auto input = input_tensor.flat<quint8>();
@@ -173,10 +169,12 @@ void TestClamp() {
   Expect(input_tensor, -10.0f, 10.0f, true, 0.0f, 1.0f);
 }
 
-}  // end namespace tensorflow
+}  // namespace
+}  // namespace ops
+}  // namespace tensorflow
 
 #define RUN_TEST(t) \
-  TEST(QuantizedAddOpTest, t) { tensorflow::t(); }
+  TEST(QuantizedInstanceNormTest, t) { tensorflow::ops::t(); }
 
 RUN_TEST(TestBasic);
 RUN_TEST(TestZeroInput);
diff --git a/tensorflow/core/kernels/quantized_matmul_op.cc b/tensorflow/core/kernels/quantized_matmul_op.cc
index afb30d5f627feab1a009ec84c5f0bb9f851766e0..da8c46dc5162f30ea129e71fb5a1c81ee594718d 100644
--- a/tensorflow/core/kernels/quantized_matmul_op.cc
+++ b/tensorflow/core/kernels/quantized_matmul_op.cc
@@ -104,9 +104,9 @@ class QuantizedMatMulOp : public OpKernel {
 
     OP_REQUIRES(context,
                 a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second),
-                errors::InvalidArgument("Matrix size-compatible: In[0]: ",
-                                        a.shape().DebugString(), ", In[1]: ",
-                                        b.shape().DebugString()));
+                errors::InvalidArgument(
+                    "Matrix size-compatible: In[0]: ", a.shape().DebugString(),
+                    ", In[1]: ", b.shape().DebugString()));
 
     OP_REQUIRES(context, ((shift_c >= 0) && (shift_c <= 31)),
                 errors::InvalidArgument("shift_c must be between 0 and 31, "
diff --git a/tensorflow/core/kernels/quantized_matmul_op_test.cc b/tensorflow/core/kernels/quantized_matmul_op_test.cc
index 535b5115c34e61333a0e7e1fdbfbe2b35571bf6c..c9f05dbc10bb8bcd3acae2d2ca0c149ac620bb79 100644
--- a/tensorflow/core/kernels/quantized_matmul_op_test.cc
+++ b/tensorflow/core/kernels/quantized_matmul_op_test.cc
@@ -206,17 +206,32 @@ TEST_F(QuantizedMatMulTest, Small_WithParams) {
   // We have set the transpose_a flag to true, so the matrix is transposed, and
   // for filling the values the in-memory storage order is effectively
   // column major, rather than the default row-major.
-  AddInputFromArray<quint8>(TensorShape({a_rows, a_cols}),
-                            {
-                                11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
-                            });
+  AddInputFromArray<quint8>(TensorShape({a_rows, a_cols}), {
+                                                               11,
+                                                               10,
+                                                               9,
+                                                               8,
+                                                               7,
+                                                               6,
+                                                               5,
+                                                               4,
+                                                               3,
+                                                               2,
+                                                               1,
+                                                               0,
+                                                           });
 
   // The B matrix is:
   // |   1 |   4|
   // |   2 |   5|
   // |   3 |   6|
   AddInputFromArray<quint8>(TensorShape({b_rows, b_cols}), {
-                                                               1, 4, 2, 5, 3, 6,
+                                                               1,
+                                                               4,
+                                                               2,
+                                                               5,
+                                                               3,
+                                                               6,
                                                            });
   AddInputFromArray<float>(TensorShape({1}), {-12.0f});
   AddInputFromArray<float>(TensorShape({1}), {243.0f});
@@ -238,10 +253,16 @@ TEST_F(QuantizedMatMulTest, Small_WithParams) {
   // |  -50 | -113 |
   // |  -56 | -128 |
   Tensor expected(allocator(), DT_QINT32, TensorShape({a_cols, b_cols}));
-  test::FillValues<qint32>(&expected,
-                           {
-                               -38, -83, -44, -98, -50, -113, -56, -128,
-                           });
+  test::FillValues<qint32>(&expected, {
+                                          -38,
+                                          -83,
+                                          -44,
+                                          -98,
+                                          -50,
+                                          -113,
+                                          -56,
+                                          -128,
+                                      });
   test::ExpectTensorEqual<qint32>(expected, *GetOutput(0));
 }
 
diff --git a/tensorflow/core/kernels/quantized_mul_op.cc b/tensorflow/core/kernels/quantized_mul_op.cc
index eaa5e667f7d5681e886a5de9e64a055ec175cf1e..3c7536e037396c338663ce0136832acb87bef401 100644
--- a/tensorflow/core/kernels/quantized_mul_op.cc
+++ b/tensorflow/core/kernels/quantized_mul_op.cc
@@ -298,9 +298,8 @@ class QuantizedMulOp : public OpKernel {
       return;
     }
     Tensor* z;
-    OP_REQUIRES_OK(
-        context,
-        context->allocate_output(0, BCast::ToShape(bcast.output_shape()), &z));
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                0, BCast::ToShape(bcast.output_shape()), &z));
 
     // Make sure that we have valid quantization ranges for the input buffers.
     // If the difference between the min and max is negative or zero, it makes
diff --git a/tensorflow/core/kernels/quantized_mul_op_test.cc b/tensorflow/core/kernels/quantized_mul_op_test.cc
index 5f858eb8ce03be7d130649f814db5f1f9c68f18c..a4e407c7a94c9c2e11808eeb4533be5c346fb6f4 100644
--- a/tensorflow/core/kernels/quantized_mul_op_test.cc
+++ b/tensorflow/core/kernels/quantized_mul_op_test.cc
@@ -32,9 +32,7 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
-
-using namespace ops;  // NOLINT(build/namespaces)
-
+namespace ops {
 namespace {
 
 void TestMul(const std::vector<int64>& x_shape,
@@ -184,19 +182,18 @@ void TimeMul(const std::vector<int64>& x_shape,
             << ", total_duration=" << total_duration;
 }
 
-}  // namespace
-
 void TestManualScalar() {
   TestMul(
       {10}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f}, 0.0f,
       10.0f, {1}, {10.0f}, -100.0f, 100.0f, {10},
       {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f},
       3.0f);
-  TestMul({1}, {10.0f}, -100.0f, 100.0f, {10},
-          {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f}, 0.0f,
-          10.0f, {10}, {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f,
-                        90.0f, 100.0f},
-          3.0f);
+  TestMul(
+      {1}, {10.0f}, -100.0f, 100.0f, {10},
+      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f}, 0.0f,
+      10.0f, {10},
+      {10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f, 90.0f, 100.0f},
+      3.0f);
 }
 
 void TestScalar() {
@@ -276,10 +273,12 @@ void BenchmarkVectorTimesTensor() {
   TimeMul({100000, 100}, {100}, 100);
 }
 
-}  // end namespace tensorflow
+}  // namespace
+}  // namespace ops
+}  // namespace tensorflow
 
 #define RUN_TEST(t) \
-  TEST(QuantizedAddOpTest, t) { tensorflow::t(); }
+  TEST(QuantizedAddOpTest, t) { tensorflow::ops::t(); }
 
 RUN_TEST(TestManualScalar);
 RUN_TEST(TestManualVector);
diff --git a/tensorflow/core/kernels/queue_base.cc b/tensorflow/core/kernels/queue_base.cc
index 6c91d0cd94c8aee6b90115fabd82f99ddd29263e..de495c19cba300fbd034cda01adfd0518548ce68 100644
--- a/tensorflow/core/kernels/queue_base.cc
+++ b/tensorflow/core/kernels/queue_base.cc
@@ -39,8 +39,8 @@ Status HandleSliceToElement(const Tensor& parent, Tensor* element,
     return errors::Internal(
         "HandleSliceToElement Cannot copy slice: number of elements does not "
         "match.  Shapes are: [element]: ",
-        element->shape().DebugString(), ", [parent slice]: ",
-        chip_shape.DebugString());
+        element->shape().DebugString(),
+        ", [parent slice]: ", chip_shape.DebugString());
   }
   auto parent_as_matrix = parent.flat_outer_dims<T>();
   element->flat<T>() = parent_as_matrix.chip(index, 0);
@@ -336,32 +336,7 @@ void QueueBase::FlushUnlocked() {
 
 Status QueueBase::CopySliceToElement(const Tensor& parent, Tensor* element,
                                      int64 index) {
-#define HANDLE_TYPE(DT)                                                   \
-  if (parent.dtype() == DT) {                                             \
-    TF_RETURN_IF_ERROR(HandleSliceToElement<DT>(parent, element, index)); \
-    return Status::OK();                                                  \
-  }
-  HANDLE_TYPE(DT_FLOAT);
-  HANDLE_TYPE(DT_HALF);
-  HANDLE_TYPE(DT_DOUBLE);
-  HANDLE_TYPE(DT_INT32);
-  HANDLE_TYPE(DT_UINT8);
-  HANDLE_TYPE(DT_INT16);
-  HANDLE_TYPE(DT_INT8);
-  HANDLE_TYPE(DT_STRING);
-  HANDLE_TYPE(DT_COMPLEX64);
-  HANDLE_TYPE(DT_COMPLEX128);
-  HANDLE_TYPE(DT_INT64);
-  HANDLE_TYPE(DT_BOOL);
-  HANDLE_TYPE(DT_QINT8);
-  HANDLE_TYPE(DT_QUINT8);
-  HANDLE_TYPE(DT_QINT32);
-  HANDLE_TYPE(DT_QINT16);
-  HANDLE_TYPE(DT_QUINT16);
-  HANDLE_TYPE(DT_UINT16);
-#undef HANDLE_TYPE
-  return errors::Unimplemented("CopySliceToElement Unhandled data type: ",
-                               parent.dtype());
+  return batch_util::CopySliceToElement(parent, element, index);
 }
 
 /* static */
diff --git a/tensorflow/core/kernels/queue_op.h b/tensorflow/core/kernels/queue_op.h
index 2d68ac7a298f1835b41750152f4ebff922cf019c..ad606803ee7017380b33819dca7718023daa3900 100644
--- a/tensorflow/core/kernels/queue_op.h
+++ b/tensorflow/core/kernels/queue_op.h
@@ -44,8 +44,7 @@ class QueueOp : public ResourceOpKernel<QueueInterface> {
   void Compute(OpKernelContext* context) override {
     ResourceOpKernel<QueueInterface>::Compute(context);
     if (resource_ && context->track_allocations()) {
-      context->record_host_persistent_memory_allocation(
-          resource_->MemoryUsed());
+      context->record_persistent_memory_allocation(resource_->MemoryUsed());
     }
   }
 
diff --git a/tensorflow/core/kernels/queue_ops.cc b/tensorflow/core/kernels/queue_ops.cc
index 17831b74370bcd21cf7772f0ea6809ee840511c3..46a02854d732d6da657414a4e42b535f72ea7b64 100644
--- a/tensorflow/core/kernels/queue_ops.cc
+++ b/tensorflow/core/kernels/queue_ops.cc
@@ -428,13 +428,14 @@ REGISTER_KERNEL_BUILDER(Name("QueueSizeV2").Device(DEVICE_CPU), QueueSizeOp);
 class QueueIsClosedOp : public QueueOpKernel {
  public:
   explicit QueueIsClosedOp(OpKernelConstruction* context)
-     : QueueOpKernel(context) {}
+      : QueueOpKernel(context) {}
 
  protected:
   void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
                     DoneCallback callback) override {
     Tensor* Tqueue_is_closed = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &Tqueue_is_closed));
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_output(0, TensorShape({}), &Tqueue_is_closed));
     Tqueue_is_closed->flat<bool>().setConstant(queue->is_closed());
     callback();
   }
@@ -443,8 +444,10 @@ class QueueIsClosedOp : public QueueOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(QueueIsClosedOp);
 };
 
-REGISTER_KERNEL_BUILDER(Name("QueueIsClosed").Device(DEVICE_CPU), QueueIsClosedOp);
-REGISTER_KERNEL_BUILDER(Name("QueueIsClosedV2").Device(DEVICE_CPU), QueueIsClosedOp);
+REGISTER_KERNEL_BUILDER(Name("QueueIsClosed").Device(DEVICE_CPU),
+                        QueueIsClosedOp);
+REGISTER_KERNEL_BUILDER(Name("QueueIsClosedV2").Device(DEVICE_CPU),
+                        QueueIsClosedOp);
 
 class FakeQueueOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/random_crop_op.cc b/tensorflow/core/kernels/random_crop_op.cc
index ba94d6be5caff7245e08ca22b5f057e81f30db74..554909760aa8a6bebe7e2988cd995f9373e1cc33 100644
--- a/tensorflow/core/kernels/random_crop_op.cc
+++ b/tensorflow/core/kernels/random_crop_op.cc
@@ -68,10 +68,10 @@ class RandomCropOp : public OpKernel {
     // Edge case. The target dimensions are larger then the image, so
     // zero-pad the image. This guarantees that the image will *always*
     // be [target_height, target_width] in size.
-    OP_REQUIRES(
-        context, width >= target_width,
-        errors::FailedPrecondition("width must be >= target_width: width = ",
-                                   width, ", target_width = ", target_width));
+    OP_REQUIRES(context, width >= target_width,
+                errors::FailedPrecondition(
+                    "width must be >= target_width: width = ", width,
+                    ", target_width = ", target_width));
     OP_REQUIRES(context, height >= target_height,
                 errors::FailedPrecondition(
                     "height must be >= target_height: height = ", height,
diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc
index 55a8b9c9b67455483689a135306017bed8974ade..78ff7948fbf1b6406b2faca1d94acd7ea3325437 100644
--- a/tensorflow/core/kernels/random_op.cc
+++ b/tensorflow/core/kernels/random_op.cc
@@ -50,7 +50,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace functor {
 using random::PhiloxRandom;
@@ -271,9 +271,10 @@ class RandomGammaOp : public OpKernel {
     const Tensor& shape_t = ctx->input(0);
     const Tensor& alpha_t = ctx->input(1);
 
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(shape_t.shape()) &&
-                         (shape_t.dtype() == DataType::DT_INT32 ||
-                          shape_t.dtype() == DataType::DT_INT64),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(shape_t.shape()) &&
+                    (shape_t.dtype() == DataType::DT_INT32 ||
+                     shape_t.dtype() == DataType::DT_INT64),
                 errors::InvalidArgument(
                     "shape must be a vector of {int32,int64}, got shape: ",
                     shape_t.DebugString()));
@@ -325,7 +326,7 @@ class RandomGammaOp : public OpKernel {
     // avoid a couple flops which can be done on a per-alpha basis.
 
     auto DoWork = [num_samples, num_alphas, &rng, samples_flat, alpha_flat](
-        int start_output, int limit_output) {
+                      int start_output, int limit_output) {
       using Eigen::numext::exp;
       using Eigen::numext::log;
       using Eigen::numext::pow;
@@ -448,40 +449,40 @@ class RandomGammaOp : public OpKernel {
 
 }  // namespace
 
-#define REGISTER(TYPE)                                                      \
-  template struct functor::FillPhiloxRandom<                                \
-      CPUDevice, random::UniformDistribution<random::PhiloxRandom, TYPE> >; \
-  template struct functor::FillPhiloxRandom<                                \
-      CPUDevice, random::NormalDistribution<random::PhiloxRandom, TYPE> >;  \
-  template struct functor::FillPhiloxRandom<                                \
-      CPUDevice,                                                            \
-      random::TruncatedNormalDistribution<                                  \
-          random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >;       \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("RandomUniform")                                                 \
-          .Device(DEVICE_CPU)                                               \
-          .HostMemory("shape")                                              \
-          .TypeConstraint<TYPE>("dtype"),                                   \
-      PhiloxRandomOp<CPUDevice, random::UniformDistribution<                \
-                                    random::PhiloxRandom, TYPE> >);         \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("RandomStandardNormal")                                          \
-          .Device(DEVICE_CPU)                                               \
-          .HostMemory("shape")                                              \
-          .TypeConstraint<TYPE>("dtype"),                                   \
-      PhiloxRandomOp<CPUDevice, random::NormalDistribution<                 \
-                                    random::PhiloxRandom, TYPE> >);         \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("TruncatedNormal")                                               \
-          .Device(DEVICE_CPU)                                               \
-          .HostMemory("shape")                                              \
-          .TypeConstraint<TYPE>("dtype"),                                   \
-      PhiloxRandomOp<                                                       \
-          CPUDevice,                                                        \
-          random::TruncatedNormalDistribution<                              \
-              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >);  \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("RandomGamma").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"),     \
+#define REGISTER(TYPE)                                                         \
+  template struct functor::FillPhiloxRandom<                                   \
+      CPUDevice, random::UniformDistribution<random::PhiloxRandom, TYPE>>;     \
+  template struct functor::FillPhiloxRandom<                                   \
+      CPUDevice, random::NormalDistribution<random::PhiloxRandom, TYPE>>;      \
+  template struct functor::FillPhiloxRandom<                                   \
+      CPUDevice,                                                               \
+      random::TruncatedNormalDistribution<                                     \
+          random::SingleSampleAdapter<random::PhiloxRandom>, TYPE>>;           \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomUniform")                                                    \
+          .Device(DEVICE_CPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<CPUDevice, random::UniformDistribution<                   \
+                                    random::PhiloxRandom, TYPE>>);             \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomStandardNormal")                                             \
+          .Device(DEVICE_CPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<CPUDevice,                                                \
+                     random::NormalDistribution<random::PhiloxRandom, TYPE>>); \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("TruncatedNormal")                                                  \
+          .Device(DEVICE_CPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<                                                          \
+          CPUDevice,                                                           \
+          random::TruncatedNormalDistribution<                                 \
+              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE>>);      \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomGamma").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"),        \
       RandomGammaOp<TYPE>)
 
 #define REGISTER_INT(IntType)                                   \
@@ -504,33 +505,33 @@ TF_CALL_int64(REGISTER_INT);
 
 #if GOOGLE_CUDA
 
-#define REGISTER(TYPE)                                              \
-  REGISTER_KERNEL_BUILDER(                                          \
-      Name("RandomUniform")                                         \
-          .Device(DEVICE_GPU)                                       \
-          .HostMemory("shape")                                      \
-          .TypeConstraint<int32>("T")                               \
-          .TypeConstraint<TYPE>("dtype"),                           \
-      PhiloxRandomOp<GPUDevice, random::UniformDistribution<        \
-                                    random::PhiloxRandom, TYPE> >); \
-  REGISTER_KERNEL_BUILDER(                                          \
-      Name("RandomStandardNormal")                                  \
-          .Device(DEVICE_GPU)                                       \
-          .HostMemory("shape")                                      \
-          .TypeConstraint<int32>("T")                               \
-          .TypeConstraint<TYPE>("dtype"),                           \
-      PhiloxRandomOp<GPUDevice, random::NormalDistribution<         \
-                                    random::PhiloxRandom, TYPE> >); \
-  REGISTER_KERNEL_BUILDER(                                          \
-      Name("TruncatedNormal")                                       \
-          .Device(DEVICE_GPU)                                       \
-          .HostMemory("shape")                                      \
-          .TypeConstraint<int32>("T")                               \
-          .TypeConstraint<TYPE>("dtype"),                           \
-      PhiloxRandomOp<                                               \
-          GPUDevice,                                                \
-          random::TruncatedNormalDistribution<                      \
-              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >);
+#define REGISTER(TYPE)                                                         \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomUniform")                                                    \
+          .Device(DEVICE_GPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<int32>("T")                                          \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<GPUDevice, random::UniformDistribution<                   \
+                                    random::PhiloxRandom, TYPE>>);             \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomStandardNormal")                                             \
+          .Device(DEVICE_GPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<int32>("T")                                          \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<GPUDevice,                                                \
+                     random::NormalDistribution<random::PhiloxRandom, TYPE>>); \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("TruncatedNormal")                                                  \
+          .Device(DEVICE_GPU)                                                  \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<int32>("T")                                          \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<                                                          \
+          GPUDevice,                                                           \
+          random::TruncatedNormalDistribution<                                 \
+              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE>>);
 
 #define REGISTER_INT(IntType)                                   \
   REGISTER_KERNEL_BUILDER(Name("RandomUniformInt")              \
@@ -565,13 +566,12 @@ struct FillPhiloxRandomKernel;
 template <class Distribution>
 struct FillPhiloxRandomKernel<Distribution, false> {
   typedef typename Distribution::ResultElementType T;
-  using write_accessor = sycl::accessor<uint8_t, 1, sycl::access::mode::write, sycl::access::target::global_buffer>;
+  using write_accessor = sycl::accessor<uint8_t, 1, sycl::access::mode::write,
+                                        sycl::access::target::global_buffer>;
 
-  FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen, Distribution& dist)
-      : data_(data),
-        gen_(gen),
-        dist_(dist) {
-  }
+  FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen,
+                         Distribution& dist)
+      : data_(data), gen_(gen), dist_(dist) {}
 
   void operator()(sycl::nd_item<1> item) {
     const size_t kGroupSize = Distribution::kResultElementCount;
@@ -597,7 +597,7 @@ struct FillPhiloxRandomKernel<Distribution, false> {
     const typename Distribution::ResultType samples = dist_(&gen_);
     for (size_t i = 0; i < kGroupSize; ++i) {
       if (offset >= size) {
-          return;
+        return;
       }
       data[offset] = samples[i];
       ++offset;
@@ -610,17 +610,15 @@ struct FillPhiloxRandomKernel<Distribution, false> {
   Distribution dist_;
 };
 
-
 template <class Distribution>
 struct FillPhiloxRandomKernel<Distribution, true> {
   typedef typename Distribution::ResultElementType T;
-  using write_accessor = sycl::accessor<uint8_t, 1, sycl::access::mode::write, sycl::access::target::global_buffer>;
+  using write_accessor = sycl::accessor<uint8_t, 1, sycl::access::mode::write,
+                                        sycl::access::target::global_buffer>;
 
-  FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen, Distribution& dist)
-      : data_(data),
-        gen_(gen),
-        dist_(dist) {
-  }
+  FillPhiloxRandomKernel(write_accessor& data, random::PhiloxRandom& gen,
+                         Distribution& dist)
+      : data_(data), gen_(gen), dist_(dist) {}
 
   void operator()(sycl::nd_item<1> item) {
     using random::PhiloxRandom;
@@ -628,9 +626,9 @@ struct FillPhiloxRandomKernel<Distribution, true> {
 
     const size_t kReservedSamplesPerOutput = 256;
     const size_t kGroupSize = Distribution::kResultElementCount;
-    const size_t kGeneratorSkipPerOutputGroup = kGroupSize *
-                                                kReservedSamplesPerOutput /
-                                                PhiloxRandom::kResultElementCount;
+    const size_t kGeneratorSkipPerOutputGroup =
+        kGroupSize * kReservedSamplesPerOutput /
+        PhiloxRandom::kResultElementCount;
 
     const size_t item_id = item.get_global(0);
     const size_t total_item_count = item.get_global_range();
@@ -674,10 +672,9 @@ class FillRandomKernel;
 // It splits the work into several tasks and run them in parallel
 template <class Distribution>
 void FillPhiloxRandom<SYCLDevice, Distribution>::operator()(
-    OpKernelContext* context, const SYCLDevice& device, random::PhiloxRandom gen,
-    typename Distribution::ResultElementType* data, int64 size,
-    Distribution dist) {
-
+    OpKernelContext* context, const SYCLDevice& device,
+    random::PhiloxRandom gen, typename Distribution::ResultElementType* data,
+    int64 size, Distribution dist) {
   const size_t group_size = device.maxSyclThreadsPerBlock();
   const size_t group_count = (size + group_size - 1) / group_size;
 
@@ -686,50 +683,52 @@ void FillPhiloxRandom<SYCLDevice, Distribution>::operator()(
   device.sycl_queue().submit([&](sycl::handler& cgh) {
     auto access = buffer.template get_access<sycl::access::mode::write>(cgh);
 
-    FillPhiloxRandomKernel<Distribution, Distribution::kVariableSamplesPerOutput> task(access, gen, dist);
+    FillPhiloxRandomKernel<Distribution,
+                           Distribution::kVariableSamplesPerOutput>
+        task(access, gen, dist);
     cgh.parallel_for<class FillRandomKernel<Distribution>>(
-      sycl::nd_range<1>(sycl::range<1>(group_count * group_size), sycl::range<1>(group_size)),
-      task
-    );
+        sycl::nd_range<1>(sycl::range<1>(group_count * group_size),
+                          sycl::range<1>(group_size)),
+        task);
   });
 }
 
-}
+}  // namespace functor
+
+#define REGISTER(TYPE)                                                         \
+  template struct functor::FillPhiloxRandom<                                   \
+      SYCLDevice, random::UniformDistribution<random::PhiloxRandom, TYPE>>;    \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomUniform")                                                    \
+          .Device(DEVICE_SYCL)                                                 \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<SYCLDevice, random::UniformDistribution<                  \
+                                     random::PhiloxRandom, TYPE>>);            \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("RandomStandardNormal")                                             \
+          .Device(DEVICE_SYCL)                                                 \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<SYCLDevice,                                               \
+                     random::NormalDistribution<random::PhiloxRandom, TYPE>>); \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("TruncatedNormal")                                                  \
+          .Device(DEVICE_SYCL)                                                 \
+          .HostMemory("shape")                                                 \
+          .TypeConstraint<TYPE>("dtype"),                                      \
+      PhiloxRandomOp<                                                          \
+          SYCLDevice,                                                          \
+          random::TruncatedNormalDistribution<                                 \
+              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE>>);
 
-#define REGISTER(TYPE)                                                       \
-  template struct functor::FillPhiloxRandom<                                 \
-      SYCLDevice, random::UniformDistribution<random::PhiloxRandom, TYPE> >; \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("RandomUniform")                                                  \
-          .Device(DEVICE_SYCL)                                               \
-          .HostMemory("shape")                                               \
-          .TypeConstraint<TYPE>("dtype"),                                    \
-      PhiloxRandomOp<SYCLDevice, random::UniformDistribution<                \
-                                    random::PhiloxRandom, TYPE> >);          \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("RandomStandardNormal")                                           \
-          .Device(DEVICE_SYCL)                                               \
-          .HostMemory("shape")                                               \
-          .TypeConstraint<TYPE>("dtype"),                                    \
-      PhiloxRandomOp<SYCLDevice, random::NormalDistribution<                 \
-                                    random::PhiloxRandom, TYPE> >);          \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("TruncatedNormal")                                                \
-          .Device(DEVICE_SYCL)                                               \
-          .HostMemory("shape")                                               \
-          .TypeConstraint<TYPE>("dtype"),                                    \
-      PhiloxRandomOp<                                                        \
-          SYCLDevice,                                                        \
-          random::TruncatedNormalDistribution<                               \
-              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >);
-
-#define REGISTER_INT(IntType)                                    \
-  REGISTER_KERNEL_BUILDER(Name("RandomUniformInt")               \
-                              .Device(DEVICE_SYCL)               \
-                              .HostMemory("shape")               \
-                              .HostMemory("minval")              \
-                              .HostMemory("maxval")              \
-                              .TypeConstraint<IntType>("Tout"),  \
+#define REGISTER_INT(IntType)                                   \
+  REGISTER_KERNEL_BUILDER(Name("RandomUniformInt")              \
+                              .Device(DEVICE_SYCL)              \
+                              .HostMemory("shape")              \
+                              .HostMemory("minval")             \
+                              .HostMemory("maxval")             \
+                              .TypeConstraint<IntType>("Tout"), \
                           RandomUniformIntOp<SYCLDevice, IntType>);
 
 TF_CALL_float(REGISTER);
@@ -740,6 +739,6 @@ TF_CALL_int64(REGISTER_INT);
 #undef REGISTER
 #undef REGISTER_INT
 
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/random_op_gpu.cu.cc b/tensorflow/core/kernels/random_op_gpu.cu.cc
index 7afa6974c6a9389782fbbcd39ddede2a97ecd566..3393b39faf4a25791b48af99a5e474f3e9bfbfce 100644
--- a/tensorflow/core/kernels/random_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/random_op_gpu.cu.cc
@@ -222,9 +222,8 @@ void FillPhiloxRandom<GPUDevice, Distribution>::operator()(
       (d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) /
       block_size;
 
-  FillPhiloxRandomKernelLaunch<
-      Distribution><<<num_blocks, block_size, 0, d.stream()>>>(gen, data, size,
-                                                               dist);
+  FillPhiloxRandomKernelLaunch<Distribution>
+      <<<num_blocks, block_size, 0, d.stream()>>>(gen, data, size, dist);
 };
 
 // Explicit instantiation of the GPU distributions functors
diff --git a/tensorflow/core/kernels/random_op_test.cc b/tensorflow/core/kernels/random_op_test.cc
index f93a0d130d8ebb3835637a35d67a99bda75b1941..47d94ad902852c26382ffe5a10daa44be4787751 100644
--- a/tensorflow/core/kernels/random_op_test.cc
+++ b/tensorflow/core/kernels/random_op_test.cc
@@ -17,11 +17,13 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/lib/random/philox_random.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
 namespace tensorflow {
+namespace {
 
 Tensor VecShape(int64 v) {
   if (v >= std::numeric_limits<int32>::max()) {
@@ -57,7 +59,7 @@ Graph* TruncatedNormal(int64 n) {
 }
 
 #define BM_RNG(DEVICE, RNG)                                   \
-  static void BM_##DEVICE##_##RNG(int iters, int arg) {       \
+  void BM_##DEVICE##_##RNG(int iters, int arg) {              \
     testing::ItemsProcessed(static_cast<int64>(iters) * arg); \
     test::Benchmark(#DEVICE, RNG(arg)).Run(iters);            \
   }                                                           \
@@ -76,12 +78,13 @@ Tensor VecAlphas(int64 n) {
   for (int i = 0; i < n; i++) {
     // Alternate back and forth between small-and-growing (.25) and
     // large-and-shrinking (26.67) alpha.
-    alphas.vec<double>()(i) = 0.25 + std::pow(1.1, i % 2 == 0 ? i : n - i);
+    alphas.vec<double>()(i) =
+        0.25 + MathUtil::IPow(1.1, i % 2 == 0 ? i : n - i);
   }
   return alphas;
 }
 
-static void BM_cpu_RandomGamma(int iters, int nsamp, int nalpha) {
+void BM_cpu_RandomGamma(int iters, int nsamp, int nalpha) {
   testing::ItemsProcessed(static_cast<int64>(iters) * nsamp * nalpha);
   Graph* g = new Graph(OpRegistry::Global());
   test::graph::RandomGamma(g, test::graph::Constant(g, VecShape(nsamp)),
@@ -90,7 +93,7 @@ static void BM_cpu_RandomGamma(int iters, int nsamp, int nalpha) {
 }
 BENCHMARK(BM_cpu_RandomGamma)->RangePair(1 << 14, 4 << 15, 2, 50);
 
-static void BM_PhiloxRandom(int iters) {
+void BM_PhiloxRandom(int iters) {
   // Fill 2M random numbers
   int count = 2 << 20;
 
@@ -114,7 +117,7 @@ static void BM_PhiloxRandom(int iters) {
 }
 BENCHMARK(BM_PhiloxRandom);
 
-static void BM_StdMTRandom(int iters) {
+void BM_StdMTRandom(int iters) {
   // Fill 2M random numbers
   int count = 2 << 20;
 
@@ -138,4 +141,5 @@ static void BM_StdMTRandom(int iters) {
 }
 BENCHMARK(BM_StdMTRandom);
 
-}  // end namespace tensorflow
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/random_poisson_op.cc b/tensorflow/core/kernels/random_poisson_op.cc
index bf1d83ec7517d1bcfa9b88b482b983e6a2d3f7c4..64fb4a5c22848009743af6a577c719f206f022bb 100644
--- a/tensorflow/core/kernels/random_poisson_op.cc
+++ b/tensorflow/core/kernels/random_poisson_op.cc
@@ -103,7 +103,7 @@ struct PoissonFunctor<CPUDevice, T, U> {
     typedef random::UniformDistribution<random::PhiloxRandom, CT> Uniform;
 
     auto DoWork = [num_samples, num_rate, &rng, samples_flat, rate_flat](
-        int start_output, int limit_output) {
+                      int start_output, int limit_output) {
       // Capturing "rng" by value would only make a copy for the _shared_
       // lambda.  Since we want to let each worker have its own copy, we pass
       // "rng" by reference and explicitly do a copy assignment.
diff --git a/tensorflow/core/kernels/random_shuffle_queue_op.cc b/tensorflow/core/kernels/random_shuffle_queue_op.cc
index 7a40e9ddf20023152a50363dfdc540a4f15823ac..87fc94333162c4b721fa3608f282bf9d28fc792e 100644
--- a/tensorflow/core/kernels/random_shuffle_queue_op.cc
+++ b/tensorflow/core/kernels/random_shuffle_queue_op.cc
@@ -171,7 +171,7 @@ Status RandomShuffleQueue::GetElementComponentFromBatch(
   TF_RETURN_IF_ERROR(ctx->allocate_persistent(
       tuple[component].dtype(), element_shape, out_tensor, &element_access));
   TF_RETURN_IF_ERROR(
-      CopySliceToElement(tuple[component], element_access, index));
+      batch_util::CopySliceToElement(tuple[component], element_access, index));
   return Status::OK();
 }
 
@@ -334,96 +334,95 @@ void RandomShuffleQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
       // TODO(josh11b): This makes two copies of callback, avoid this if possible.
       dequeue_attempts_.emplace_back(
           num_elements, [callback]() { callback(Tuple()); }, ctx, cm, token,
-          [callback, allow_small_batch, this](Attempt* attempt)
-              EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-                int32 queue_size = queues_[0].size();
-                if (closed_ && queue_size < attempt->elements_requested) {
-                  // If we don't have enough for a full dequeue, we have
-                  // to reset the attempt tuple.
-                  if (!attempt->tuple.empty()) {
-                    // Restore already-dequeued elements to the queue.
-                    for (int64 i = attempt->tuple[0].dim_size(0) -
-                                   attempt->elements_requested - 1;
-                         i >= 0; --i) {
-                      for (int j = 0; j < num_components(); ++j) {
-                        PersistentTensor element;
-                        Status s = GetElementComponentFromBatch(
-                            attempt->tuple, i, j, attempt->context, &element);
-                        if (!s.ok()) {
-                          attempt->context->SetStatus(
-                              errors::DataLoss("Failed to restore element from "
-                                               "partially-dequeued batch "
-                                               "to RandomShuffleQueue: ",
-                                               s.error_message()));
-                        }
-                        queues_[j].push_back(element);
-                      }
-                    }
-                  }
-                  if (allow_small_batch && !queues_[0].empty()) {
-                    // Request all remaining elements in the queue.
-                    queue_size = queues_[0].size();
-                    attempt->tuple.clear();
-                    attempt->elements_requested = queue_size;
-                  } else {
-                    if (allow_small_batch) {
-                      // There may be some other attempts containing
-                      // values.  If so, we'll yield and wait for them
-                      // to add elements to the queue.
-                      if (!enqueue_attempts_.empty()) return kProgress;
-                    }
-                    if (attempt->context->status().ok()) {
-                      attempt->context->SetStatus(errors::OutOfRange(
-                          "RandomShuffleQueue '", name_, "' is closed and has ",
-                          "insufficient elements (requested ",
-                          attempt->elements_requested, ", current size ",
-                          queue_size, ")"));
+          [callback, allow_small_batch,
+           this](Attempt* attempt) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+            int32 queue_size = queues_[0].size();
+            if (closed_ && queue_size < attempt->elements_requested) {
+              // If we don't have enough for a full dequeue, we have
+              // to reset the attempt tuple.
+              if (!attempt->tuple.empty()) {
+                // Restore already-dequeued elements to the queue.
+                for (int64 i = attempt->tuple[0].dim_size(0) -
+                               attempt->elements_requested - 1;
+                     i >= 0; --i) {
+                  for (int j = 0; j < num_components(); ++j) {
+                    PersistentTensor element;
+                    Status s = GetElementComponentFromBatch(
+                        attempt->tuple, i, j, attempt->context, &element);
+                    if (!s.ok()) {
+                      attempt->context->SetStatus(
+                          errors::DataLoss("Failed to restore element from "
+                                           "partially-dequeued batch "
+                                           "to RandomShuffleQueue: ",
+                                           s.error_message()));
                     }
-                    return kComplete;
+                    queues_[j].push_back(element);
                   }
                 }
+              }
+              if (allow_small_batch && !queues_[0].empty()) {
+                // Request all remaining elements in the queue.
+                queue_size = queues_[0].size();
+                attempt->tuple.clear();
+                attempt->elements_requested = queue_size;
+              } else {
+                if (allow_small_batch) {
+                  // There may be some other attempts containing
+                  // values.  If so, we'll yield and wait for them
+                  // to add elements to the queue.
+                  if (!enqueue_attempts_.empty()) return kProgress;
+                }
+                if (attempt->context->status().ok()) {
+                  attempt->context->SetStatus(errors::OutOfRange(
+                      "RandomShuffleQueue '", name_, "' is closed and has ",
+                      "insufficient elements (requested ",
+                      attempt->elements_requested, ", current size ",
+                      queue_size, ")"));
+                }
+                return kComplete;
+              }
+            }
 
-                RunResult result = kNoProgress;
-                if (!closed_) queue_size -= min_after_dequeue_;
-                for (; queue_size > 0; --queue_size) {
-                  if (attempt->tuple.empty()) {
-                    // Only allocate tuple when we have something to dequeue
-                    // so we don't use excessive memory when there are many
-                    // blocked dequeue attempts waiting.
-                    attempt->tuple.reserve(num_components());
-                    for (int i = 0; i < num_components(); ++i) {
-                      const TensorShape shape =
-                          ManyOutShape(i, attempt->elements_requested);
-                      Tensor element;
-                      attempt->context->SetStatus(
-                          attempt->context->allocate_temp(component_dtypes_[i],
-                                                          shape, &element));
-                      if (!attempt->context->status().ok()) return kComplete;
-                      attempt->tuple.emplace_back(element);
-                    }
-                  }
-                  result = kProgress;
-                  Tuple tuple;
-                  DequeueLocked(attempt->context, &tuple);
-                  const int index = attempt->tuple[0].dim_size(0) -
-                                    attempt->elements_requested;
-                  for (int i = 0; i < num_components(); ++i) {
-                    attempt->context->SetStatus(batch_util::CopyElementToSlice(
-                        std::move(tuple[i]), &attempt->tuple[i], index));
-                    if (!attempt->context->status().ok()) return kComplete;
-                  }
-                  tuple.clear();
-                  --attempt->elements_requested;
-                  if (attempt->elements_requested == 0) {
-                    tuple = attempt->tuple;
-                    attempt->done_callback = [callback, tuple]() {
-                      callback(tuple);
-                    };
-                    return kComplete;
-                  }
+            RunResult result = kNoProgress;
+            if (!closed_) queue_size -= min_after_dequeue_;
+            for (; queue_size > 0; --queue_size) {
+              if (attempt->tuple.empty()) {
+                // Only allocate tuple when we have something to dequeue
+                // so we don't use excessive memory when there are many
+                // blocked dequeue attempts waiting.
+                attempt->tuple.reserve(num_components());
+                for (int i = 0; i < num_components(); ++i) {
+                  const TensorShape shape =
+                      ManyOutShape(i, attempt->elements_requested);
+                  Tensor element;
+                  attempt->context->SetStatus(attempt->context->allocate_temp(
+                      component_dtypes_[i], shape, &element));
+                  if (!attempt->context->status().ok()) return kComplete;
+                  attempt->tuple.emplace_back(element);
                 }
-                return result;
-              });
+              }
+              result = kProgress;
+              Tuple tuple;
+              DequeueLocked(attempt->context, &tuple);
+              const int index =
+                  attempt->tuple[0].dim_size(0) - attempt->elements_requested;
+              for (int i = 0; i < num_components(); ++i) {
+                attempt->context->SetStatus(batch_util::CopyElementToSlice(
+                    std::move(tuple[i]), &attempt->tuple[i], index));
+                if (!attempt->context->status().ok()) return kComplete;
+              }
+              tuple.clear();
+              --attempt->elements_requested;
+              if (attempt->elements_requested == 0) {
+                tuple = attempt->tuple;
+                attempt->done_callback = [callback, tuple]() {
+                  callback(tuple);
+                };
+                return kComplete;
+              }
+            }
+            return result;
+          });
     }
   }
   if (!already_cancelled) {
diff --git a/tensorflow/core/kernels/record_input_op.cc b/tensorflow/core/kernels/record_input_op.cc
index 878996c9d6a9923404791d4e8995b817ecdf9799..841f9dc4b8e08b5c2a5346e8c2abd585ebd0cb39 100644
--- a/tensorflow/core/kernels/record_input_op.cc
+++ b/tensorflow/core/kernels/record_input_op.cc
@@ -36,14 +36,18 @@ class RecordInputOp : public OpKernel {
     GETATTR(int64, file_buffer_size);
     GETATTR(int64, file_parallelism);
     GETATTR(int64, batch_size);
+    GETATTR(string, compression_type);
 #undef GETATTR
 
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("compression_type", &compression_type));
+
     RecordYielder::Options yopts;
     yopts.file_pattern = file_pattern;
     yopts.seed = file_random_seed;
     yopts.bufsize = file_buffer_size;
     yopts.file_shuffle_shift_ratio = file_shuffle_shift_ratio;
     yopts.parallelism = file_parallelism;
+    yopts.compression_type = compression_type;
     yielder_ = std::unique_ptr<RecordYielder>(new RecordYielder(ctx, yopts));
 
     batch_size_ = batch_size;
diff --git a/tensorflow/core/kernels/record_yielder.cc b/tensorflow/core/kernels/record_yielder.cc
index e4fa0ed322df57789f95efe584fe91a3efe561ec..3fd9bf9defe4aeedde1f0456638e60ea1e5e2cdb 100644
--- a/tensorflow/core/kernels/record_yielder.cc
+++ b/tensorflow/core/kernels/record_yielder.cc
@@ -206,7 +206,10 @@ void RecordYielder::ShardLoop(Shard* shard) {
       shard->status = errors::InvalidArgument("Can't open ", filename);
       break;
     }
-    io::RecordReader rdr(file.get());
+    io::RecordReaderOptions options =
+        io::RecordReaderOptions::CreateRecordReaderOptions(
+            opts_.compression_type);
+    io::RecordReader rdr(file.get(), options);
     uint64 offset = 0;
     string record;
     while (true) {
diff --git a/tensorflow/core/kernels/record_yielder.h b/tensorflow/core/kernels/record_yielder.h
index c6301812213bf569d47c1fd3b7deba3c57a31ae5..34817ad51b6e4f21e6b6b0f516c438a845b30e3b 100644
--- a/tensorflow/core/kernels/record_yielder.h
+++ b/tensorflow/core/kernels/record_yielder.h
@@ -78,6 +78,8 @@ class RecordYielder {
     // Uses these many concurrent tfrecord iterators to iterate through
     // tfrecords.
     int32 parallelism = 1;
+
+    string compression_type;
   };
 
   explicit RecordYielder(OpKernelConstruction* context,
diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
index 36ca7f834f7b4fe7db1e2591189b1359231c7307..15ae4c1fc53b2b9bfe1d6085d2ecbc3659705b47 100644
--- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
+++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
@@ -312,8 +312,7 @@ __global__ void ColumnReduceKernel(
   int col = blockIdx.x * 32 + threadIdx.x;
 
   value_type sum = initVal;
-  if (row < num_rows && col < num_cols)
-    sum = in[row * num_cols + col];
+  if (row < num_rows && col < num_cols) sum = in[row * num_cols + col];
 
   // 1D array necessary due to bug in CUDA 9 compiler.
   // TODO(nluehr) revert to 2D array when compiler is ready.
@@ -366,8 +365,7 @@ __global__ void CleanupSegments(
   const int tid = threadIdx.x + blockIdx.x * blockDim.x;
 
   value_type val = initVal;
-  if (tid < segment_size * num_cols)
-    val = partial_sums[tid];
+  if (tid < segment_size * num_cols) val = partial_sums[tid];
 
   typedef cub::WarpReduce<value_type> WarpReduce;
 
diff --git a/tensorflow/core/kernels/reduction_ops_common.h b/tensorflow/core/kernels/reduction_ops_common.h
index 9da992ccd18d7bf107a1bc2a7b91ec9fb1a85fd5..03d6e82e018a55214e3ce66d64f49b0a7eb42e11 100644
--- a/tensorflow/core/kernels/reduction_ops_common.h
+++ b/tensorflow/core/kernels/reduction_ops_common.h
@@ -239,16 +239,6 @@ class ReductionOp : public OpKernel {
     if (!out.CopyFrom(tmp_out, helper.out_shape())) {
       ctx->SetStatus(errors::Internal("Error during reduction copy."));
     }
-    if (ctx->track_allocations()) {
-      // The temporary memory becomes the output memory.
-      if (ctx->allocate_on_host(alloc_attr)) {
-        ctx->record_host_temp_memory_size(
-            -static_cast<int64>(out.AllocatedBytes()));
-      } else {
-        ctx->record_device_temp_memory_size(
-            -static_cast<int64>(out.AllocatedBytes()));
-      }
-    }
     ctx->set_output(0, out);
   }
 
diff --git a/tensorflow/core/kernels/reduction_ops_min.cc b/tensorflow/core/kernels/reduction_ops_min.cc
index 807ac0a4567790ef3fb95b4c12a91a1562f83fa7..5c537c5b9c75afef2b8f4ea5446f3d4012ed0cbb 100644
--- a/tensorflow/core/kernels/reduction_ops_min.cc
+++ b/tensorflow/core/kernels/reduction_ops_min.cc
@@ -50,6 +50,7 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
           .TypeConstraint<int64>("Tidx")                                       \
           .HostMemory("reduction_indices"),                                    \
       ReductionOp<GPUDevice, type, int64, Eigen::internal::MinReducer<type>>);
+REGISTER_GPU_KERNELS(Eigen::half);
 REGISTER_GPU_KERNELS(float);
 REGISTER_GPU_KERNELS(double);
 
diff --git a/tensorflow/core/kernels/reduction_ops_test.cc b/tensorflow/core/kernels/reduction_ops_test.cc
index 9bbe993a2f93e522688738abaf41a518e95ef871..fe8ea59f1be521166d0e42295e79d1bb5a242750 100644
--- a/tensorflow/core/kernels/reduction_ops_test.cc
+++ b/tensorflow/core/kernels/reduction_ops_test.cc
@@ -174,6 +174,11 @@ static void BM_Min2DToScalarGPU(int iters, int num_x, int num_y) {
 }
 BENCHMARK(BM_Min2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
 
+static void BM_Min2DToScalarGPUHalf(int iters, int num_x, int num_y) {
+  ReduceToScalar<Eigen::half>(iters, "gpu", "Min", num_x, num_y);
+}
+BENCHMARK(BM_Min2DToScalarGPUHalf)->RangePair(2048, 8192, 2048, 8192);
+
 static void BM_Bool2DToScalarGPU(int iters, int num_x, int num_y) {
   ReduceToScalar<bool>(iters, "gpu", "All", num_x, num_y);
 }
diff --git a/tensorflow/core/kernels/reference_gemm.h b/tensorflow/core/kernels/reference_gemm.h
index bb2a21720f337c61c38e91688fa99360d1270652..c9cc04ed1b7387b9a4a2f335a14100d7c691d507 100644
--- a/tensorflow/core/kernels/reference_gemm.h
+++ b/tensorflow/core/kernels/reference_gemm.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REFERENCE_GEMM_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REFERENCE_GEMM_H_
+#ifndef TENSORFLOW_CORE_KERNELS_REFERENCE_GEMM_H_
+#define TENSORFLOW_CORE_KERNELS_REFERENCE_GEMM_H_
 
 #include <stdlib.h>
 
@@ -92,4 +92,4 @@ void ReferenceGemm(bool transpose_a, bool transpose_b, bool transpose_c,
 }
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REFERENCE_GEMM_H_
+#endif  // TENSORFLOW_CORE_KERNELS_REFERENCE_GEMM_H_
diff --git a/tensorflow/core/kernels/relu_op.cc b/tensorflow/core/kernels/relu_op.cc
index afad288cc00e0c3934318834d8dae8c181541212..d52358737fd121398ff2a4c95e417fd9b80987ab 100644
--- a/tensorflow/core/kernels/relu_op.cc
+++ b/tensorflow/core/kernels/relu_op.cc
@@ -31,7 +31,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define REGISTER_RELU_KERNELS(type)                                   \
   REGISTER_KERNEL_BUILDER(                                            \
@@ -113,8 +113,7 @@ namespace functor {
                                                                                \
   template <>                                                                  \
   void Selu<GPUDevice, T>::operator()(                                         \
-      const GPUDevice& d,                                                      \
-      typename TTypes<T>::ConstTensor features,                                \
+      const GPUDevice& d, typename TTypes<T>::ConstTensor features,            \
       typename TTypes<T>::Tensor activations);                                 \
   extern template struct Selu<GPUDevice, T>;                                   \
                                                                                \
@@ -125,8 +124,6 @@ namespace functor {
       typename TTypes<T>::Tensor backprops);                                   \
   extern template struct SeluGrad<GPUDevice, T>;
 
-
-
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
 }  // namespace functor
 
@@ -157,8 +154,6 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
       Name("SeluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"),  \
       SeluGradOp<GPUDevice, type>)
 
-
-
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
 
@@ -192,10 +187,8 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
       Name("SeluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"),  \
       SeluGradOp<SYCLDevice, type>)
 
-
-
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS);
 #undef REGISTER_SYCL_KERNELS
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/relu_op_functor.h b/tensorflow/core/kernels/relu_op_functor.h
index 24b789c5437c78a76c708a6637b60376d5087682..3bc5ba8a50de22156aa631ee6404ddfe04b3a105 100644
--- a/tensorflow/core/kernels/relu_op_functor.h
+++ b/tensorflow/core/kernels/relu_op_functor.h
@@ -85,10 +85,9 @@ struct Relu6Grad {
     // make sure not to propagate the associated gradient
     // value. This allows "features" to be either the input or the output of
     // the relu6.
-    backprops.device(d) =
-        gradients *
-        ((features > static_cast<T>(0)) * (features < static_cast<T>(6)))
-            .template cast<T>();
+    backprops.device(d) = gradients * ((features > static_cast<T>(0)) *
+                                       (features < static_cast<T>(6)))
+                                          .template cast<T>();
   }
 };
 
@@ -161,8 +160,8 @@ struct SeluGrad {
     const auto scale = static_cast<T>(1.0507009873554804934193349852946);
     const auto scale_alpha = static_cast<T>(1.7580993408473768599402175208123);
     backprops.device(d) =
-        (activations < static_cast<T>(0)).select(
-            gradients * (activations + scale_alpha), gradients * scale);
+        (activations < static_cast<T>(0))
+            .select(gradients * (activations + scale_alpha), gradients * scale);
   }
 };
 
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h b/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h
index 3fa052108ec2d466caead1cb3c14e2ecc00a45f9..7de45eaaa16030b6e80c427b2db8ebd7280aed00 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_OP_TEST_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_OP_TEST_UTILS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_OP_TEST_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_OP_TEST_UTILS_H_
 
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
@@ -86,4 +86,4 @@ class TestRemoteFusedGraphExecutor final : public IRemoteFusedGraphExecutor {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_OP_TEST_UTILS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_OP_TEST_UTILS_H_
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils.h b/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
index 541c26baaf999d6ad7b34aaf65bf43cb788da582..f0471442781de7c901e1c1cec69b840186015ce3 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_UTILS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_UTILS_H_
 
 #include <unordered_map>
 #include <unordered_set>
@@ -312,4 +312,4 @@ class RemoteFusedGraphExecuteUtils {
 };
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_UTILS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_UTILS_H_
diff --git a/tensorflow/core/kernels/reshape_op.cc b/tensorflow/core/kernels/reshape_op.cc
index 18ebf70c1738747ab64545f7770309a3e0865f1a..33c63e70500971cbcfb847d03239e0721d4871ff 100644
--- a/tensorflow/core/kernels/reshape_op.cc
+++ b/tensorflow/core/kernels/reshape_op.cc
@@ -43,7 +43,7 @@ REGISTER_KERNEL_BUILDER(Name("Reshape")
                               .TypeConstraint<int64>("Tshape"), \
                           ReshapeOp);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
-REGISTER_GPU_KERNEL(bool);
+TF_CALL_bool(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
 #ifdef TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/reshape_util.h b/tensorflow/core/kernels/reshape_util.h
index ed583afd13824eff789ea556045507fb4cff44e6..6777748b63b299b450e4fdc09376f18127c8ab85 100644
--- a/tensorflow/core/kernels/reshape_util.h
+++ b/tensorflow/core/kernels/reshape_util.h
@@ -13,8 +13,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_RESHAPE_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_RESHAPE_UTIL_H_
+#ifndef TENSORFLOW_CORE_KERNELS_RESHAPE_UTIL_H_
+#define TENSORFLOW_CORE_KERNELS_RESHAPE_UTIL_H_
 
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -28,4 +28,4 @@ void Reshape(OpKernelContext *context, const Tensor &input_indices_in,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_RESHAPE_UTIL_H_
+#endif  // TENSORFLOW_CORE_KERNELS_RESHAPE_UTIL_H_
diff --git a/tensorflow/core/kernels/resize_area_op_test.cc b/tensorflow/core/kernels/resize_area_op_test.cc
index cc5244d3a07031a843f3bb77e0d409cf9d64b4f2..a7e06ef15a1dd15c4c1428f44dbcd5e560b5e993 100644
--- a/tensorflow/core/kernels/resize_area_op_test.cc
+++ b/tensorflow/core/kernels/resize_area_op_test.cc
@@ -41,7 +41,7 @@ class ResizeAreaOpTest : public OpsTestBase {
     bool is_ref = IsRefType(input_types_[inputs_.size()]);
     Tensor* input = new Tensor(device_->GetAllocator(AllocatorAttributes()),
                                DataTypeToEnum<float>::v(), shape);
-    input->flat<float>().setZero();
+    input->flat<float>().setRandom();
     tensors_.push_back(input);
     if (is_ref) {
       CHECK_EQ(RemoveRefType(input_types_[inputs_.size()]),
diff --git a/tensorflow/core/kernels/resize_bicubic_op.cc b/tensorflow/core/kernels/resize_bicubic_op.cc
index 1a9cf4c6406d85bf26b43e0b9b855760a4888a4c..86e61bbcefc1ad2b103552101c17a05c3c3ede6e 100644
--- a/tensorflow/core/kernels/resize_bicubic_op.cc
+++ b/tensorflow/core/kernels/resize_bicubic_op.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <algorithm>
 #include <array>
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -28,7 +29,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/image_resizer_state.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/core/kernels/resize_bicubic_op_test.cc b/tensorflow/core/kernels/resize_bicubic_op_test.cc
index 9e10fec42321023d95f3ae8d32a5a1c8f2c7a94e..25a37d5e1af5835d56dedb50922967704500ad46 100644
--- a/tensorflow/core/kernels/resize_bicubic_op_test.cc
+++ b/tensorflow/core/kernels/resize_bicubic_op_test.cc
@@ -286,13 +286,14 @@ BM_ResizeBicubicDev(32, 128, 3);
 BM_ResizeBicubicDev(32, 512, 3);
 BM_ResizeBicubicDev(32, 1024, 3);
 
-#define BM_ResizeBicubicExpand(BATCH, SIZE, CHANNELS)                          \
-  static void BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS(int iters) { \
-    testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * SIZE * SIZE *  \
-                            CHANNELS * 8 * 8);                                 \
-    test::Benchmark("cpu", ResizeBicubic(BATCH, SIZE, CHANNELS, 8, 8))         \
-        .Run(iters);                                                           \
-  }                                                                            \
+#define BM_ResizeBicubicExpand(BATCH, SIZE, CHANNELS)                         \
+  static void BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS(         \
+      int iters) {                                                            \
+    testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * SIZE * SIZE * \
+                            CHANNELS * 8 * 8);                                \
+    test::Benchmark("cpu", ResizeBicubic(BATCH, SIZE, CHANNELS, 8, 8))        \
+        .Run(iters);                                                          \
+  }                                                                           \
   BENCHMARK(BM_ResizeBicubicExpand##_##BATCH##_##SIZE##_##CHANNELS);
 
 BM_ResizeBicubicExpand(12, 48, 1);
diff --git a/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc b/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc
index a7da7a0777d0cb35ade6a04dfff4edf604c1a169..f82c3fcd9ff45e26d2f44408890fa760c64477e4 100644
--- a/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/resize_bilinear_op_gpu.cu.cc
@@ -164,11 +164,11 @@ struct ResizeBilinear<GPUDevice, T> {
     if (total_count == 0) return;
 
     CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);
-    ResizeBilinearKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, images.data(), height_scale, width_scale,
-        batch, in_height, in_width, channels, out_height, out_width,
-        output.data());
+    ResizeBilinearKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, images.data(), height_scale,
+            width_scale, batch, in_height, in_width, channels, out_height,
+            out_width, output.data());
   }
 };
 
@@ -200,11 +200,11 @@ struct ResizeBilinearGrad<GPUDevice, T> {
     // Accumulate.
     total_count = batch * resized_height * resized_width * channels;
     config = GetCudaLaunchConfig(total_count, d);
-    ResizeBilinearGradKernel<
-        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, input_grad.data(), height_scale,
-        width_scale, batch, original_height, original_width, channels,
-        resized_height, resized_width, output_grad.data());
+    ResizeBilinearGradKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, input_grad.data(), height_scale,
+            width_scale, batch, original_height, original_width, channels,
+            resized_height, resized_width, output_grad.data());
   }
 };
 
diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index 0ae8a8fdbc14af81650fb756fdd20bb0d983e71e..5b4aad3cdd83905716df0fd67cec4817e04a1ee1 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -55,6 +55,7 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/gather_functor.h"
@@ -82,7 +83,7 @@ class ReadVariableOp : public OpKernel {
     ResourceHandle handle = HandleFromInput(ctx, 0);
     const auto status = LookupResource(ctx, handle, &variable);
     OP_REQUIRES(ctx, status.ok(),
-                errors::NotFound(
+                errors::FailedPrecondition(
                     "Error while reading resource variable ", handle.name(),
                     " from Container: ", handle.container(),
                     ". This could mean that the variable was uninitialized. ",
@@ -110,7 +111,6 @@ REGISTER_KERNEL_BUILDER(Name("ReadVariableOp").Device(DEVICE_CPU),
                         ReadVariableOp);
 
 #if GOOGLE_CUDA
-
 REGISTER_KERNEL_BUILDER(
     Name("ReadVariableOp").Device(DEVICE_GPU).HostMemory("resource"),
     ReadVariableOp);
@@ -130,6 +130,7 @@ REGISTER_KERNEL_BUILDER(
                           ResourceHandleOp<Var>)
 
 TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS);
+TF_CALL_variant(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
 #endif  // GOOGLE_CUDA
 
@@ -275,6 +276,64 @@ class AssignVariableOp : public OpKernel {
   DataType dtype_;
 };
 
+template <typename Device>
+Status VariantCopyFn(OpKernelContext* context, const Tensor& from, Tensor* to);
+
+#define CPU_DENSE_COPY(T)                                                \
+  case DataTypeToEnum<T>::value: {                                       \
+    functor::DenseUpdate<CPUDevice, T, ASSIGN> copy_functor_;            \
+    copy_functor_(context->eigen_device<CPUDevice>(), tensor->flat<T>(), \
+                  from.flat<T>());                                       \
+    break;                                                               \
+  }
+
+#define INSTANTIATE_GET_VARIANT_COPY_FN(Device, TYPE_CALLER, TYPE_DENSE_COPY) \
+  template <>                                                                 \
+  Status VariantCopyFn<Device>(OpKernelContext * context, const Tensor& from, \
+                               Tensor* to) {                                  \
+    PersistentTensor tmp;                                                     \
+    Tensor* tensor;                                                           \
+    AllocatorAttributes attr;                                                 \
+    attr.set_gpu_compatible(true);                                            \
+    attr.set_nic_compatible(true);                                            \
+    TF_RETURN_IF_ERROR(context->allocate_persistent(                          \
+        from.dtype(), from.shape(), &tmp, &tensor, attr));                    \
+    switch (from.dtype()) {                                                   \
+      TYPE_CALLER(TYPE_DENSE_COPY);                                           \
+      default:                                                                \
+        return errors::InvalidArgument(                                       \
+            "VariantCopyFn: Could not perform a deep copy of variant "        \
+            "element of type: ",                                              \
+            DataTypeString(from.dtype()),                                     \
+            " using device: ", context->device()->name());                    \
+    }                                                                         \
+    *to = *tensor;                                                            \
+    return Status::OK();                                                      \
+  }
+
+INSTANTIATE_GET_VARIANT_COPY_FN(CPUDevice, TF_CALL_ALL_TYPES, CPU_DENSE_COPY);
+
+#if GOOGLE_CUDA
+#define GPU_DENSE_COPY(T)                                                \
+  case DataTypeToEnum<T>::value: {                                       \
+    functor::DenseUpdate<GPUDevice, T, ASSIGN> copy_functor_;            \
+    copy_functor_(context->eigen_device<GPUDevice>(), tensor->flat<T>(), \
+                  from.flat<T>());                                       \
+    break;                                                               \
+  }
+#define TF_CALL_GPU_AND_ADDITIONAL_TYPES(T) \
+  TF_CALL_GPU_ALL_TYPES(T);                 \
+  TF_CALL_int32(T);                         \
+  TF_CALL_int64(T);
+INSTANTIATE_GET_VARIANT_COPY_FN(GPUDevice, TF_CALL_GPU_AND_ADDITIONAL_TYPES,
+                                GPU_DENSE_COPY);
+#undef TF_CALL_GPU_AND_ADDITIONAL_TYPES
+#undef GPU_DENSE_COPY
+#endif  // GOOGLE_CUDA
+
+#undef CPU_DENSE_COPY
+#undef INSTANTIATE_GET_VARIANT_COPY_FN
+
 template <typename Device>
 class AssignVariableOp<Device, Variant> : public OpKernel {
  public:
@@ -287,21 +346,15 @@ class AssignVariableOp<Device, Variant> : public OpKernel {
 
   void Compute(OpKernelContext* context) override {
     const Tensor& value = context->input(1);
-    OP_REQUIRES(context, dtype_ == value.dtype(),
-                errors::InvalidArgument(
-                    "Variable and value dtypes don't match; respectively, ",
-                    dtype_, " and ", context->input(1).dtype()));
-
     Var* variable = nullptr;
     OP_REQUIRES_OK(context, LookupOrCreateResource<Var>(
                                 context, HandleFromInput(context, 0), &variable,
                                 [this, context](Var** ptr) {
-                                  *ptr = new Var(dtype_);
-                                  // Create an empty new Variant tensor.
+                                  // Created on host.
+                                  *ptr = new Var(DT_VARIANT);
                                   return Status::OK();
                                 }));
     core::ScopedUnref s(variable);
-
     OP_REQUIRES(context, variable->tensor()->dtype() == DT_VARIANT,
                 errors::InvalidArgument(
                     "Trying to assign variable with wrong dtype. Expected ",
@@ -309,16 +362,17 @@ class AssignVariableOp<Device, Variant> : public OpKernel {
                     DataTypeString(DT_VARIANT)));
 
     mutex_lock ml(*variable->mu());
-    // TODO(ebrevdo): Add a proper Variant deep copy / assign registry
-    // entry and use that here.  For now, use a serialization
-    // roundtrip to perform the copy on CPU.  This is OK because this
-    // op is not registered for GPU.
-    *variable->tensor() = Tensor();
-    TensorProto tmp;
-    value.AsProtoTensorContent(&tmp);
-    OP_REQUIRES(context, variable->tensor()->FromProto(tmp),
-                errors::Internal("Could not properly reserialize values "
-                                 "Variant.  Check logs for more details."));
+
+    *variable->tensor() = Tensor(DT_VARIANT, value.shape());
+    const auto elements_in = value.flat<Variant>();
+    auto elements_out = variable->tensor()->flat<Variant>();
+    auto copy_fn = std::bind(&VariantCopyFn<Device>, context,
+                             std::placeholders::_1, std::placeholders::_2);
+    for (int64 i = 0; i < elements_in.size(); ++i) {
+      OP_REQUIRES_OK(context, VariantDeviceCopy(
+                                  VariantDeviceCopyDirection::DEVICE_TO_DEVICE,
+                                  elements_in(i), &elements_out(i), copy_fn));
+    };
   }
 
  private:
@@ -333,7 +387,6 @@ class AssignVariableOp<Device, Variant> : public OpKernel {
 
 TF_CALL_ALL_TYPES(REGISTER_KERNELS);
 TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS);
-TF_CALL_variant(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 
 #if GOOGLE_CUDA
@@ -345,6 +398,7 @@ TF_CALL_variant(REGISTER_KERNELS);
                           AssignVariableOp<GPUDevice, type>);
 
 TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS);
+TF_CALL_variant(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
 #endif  // GOOGLE_CUDA
 
@@ -464,8 +518,7 @@ class ResourceGatherOp : public OpKernel {
       auto out_flat = out->shaped<T, 3>({1, N, out->NumElements() / N});
 
       functor::GatherFunctor<Device, T, Index> functor;
-      int64 bad_i = functor(c, params_flat,
-                            indices_flat, out_flat);
+      int64 bad_i = functor(c, params_flat, indices_flat, out_flat);
 
       OP_REQUIRES(
           c, bad_i < 0,
@@ -581,6 +634,9 @@ class ResourceScatterUpdateOp : public OpKernel {
 
 TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ARITHEMTIC_CPU);
 
+REGISTER_SCATTER_KERNEL(string, CPU, "ResourceScatterUpdate",
+                        scatter_op::UpdateOp::ASSIGN);
+
 // Registers GPU kernels.
 #if GOOGLE_CUDA
 #define REGISTER_SCATTER_ARITHEMTIC_GPU(type) \
diff --git a/tensorflow/core/kernels/restore_op.cc b/tensorflow/core/kernels/restore_op.cc
index 0593a07b80cfb043ee2ea3c99932cc12d9334cc5..d9bbcb14ab3ccf436e8575d48507be0daef66d1b 100644
--- a/tensorflow/core/kernels/restore_op.cc
+++ b/tensorflow/core/kernels/restore_op.cc
@@ -41,7 +41,7 @@ class RestoreOp : public OpKernel {
   }
   void Compute(OpKernelContext* context) override {
     RestoreTensor(context, &checkpoint::OpenTableTensorSliceReader,
-                  preferred_shard_, false);
+                  preferred_shard_, false, 0);
   }
 
  private:
@@ -67,7 +67,7 @@ class RestoreSliceOp : public OpKernel {
   }
   void Compute(OpKernelContext* context) override {
     RestoreTensor(context, &checkpoint::OpenTableTensorSliceReader,
-                  preferred_shard_, true);
+                  preferred_shard_, true, 0);
   }
 
  private:
diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc
index 7ac34d1c62376f40f9d30397cad71233db9468dc..bb96c42f10c498d0ec3d6a726728cb1e7bc8f111 100644
--- a/tensorflow/core/kernels/reverse_op.cc
+++ b/tensorflow/core/kernels/reverse_op.cc
@@ -182,9 +182,9 @@ class ReverseOp : public OpKernel {
       OP_REQUIRES_OK(context,
                      context->allocate_output(0, input.shape(), &output));
 
-#define HANDLE_REVERSE(NDIMS)                                                 \
-  case NDIMS:                                                                 \
-    HandleReverseCase<Device, T, NDIMS>(context, dims.vec<bool>(), output);   \
+#define HANDLE_REVERSE(NDIMS)                                               \
+  case NDIMS:                                                               \
+    HandleReverseCase<Device, T, NDIMS>(context, dims.vec<bool>(), output); \
     return;
 
       switch (input_dims) {
@@ -228,7 +228,7 @@ void HandleReverseV2Case(OpKernelContext* context,
                                        result->tensor<T, NDIMS>());
 }
 
-template <typename Device, typename T>
+template <typename Device, typename T, typename Tidx>
 class ReverseV2Op : public OpKernel {
  public:
   explicit ReverseV2Op(OpKernelConstruction* context) : OpKernel(context) {}
@@ -242,15 +242,15 @@ class ReverseV2Op : public OpKernel {
     } else {
       const int input_dims = input.dims();
       const TensorShape& sparse_dims_shape = sparse_dims.shape();
-      const auto& axes_sparse_flat = sparse_dims.flat<int32>();
+      const auto& axes_sparse_flat = sparse_dims.flat<Tidx>();
 
       OP_REQUIRES(context, TensorShapeUtils::IsVector(sparse_dims_shape),
                   errors::InvalidArgument("'dims' must be 1-dimension, not ",
                                           sparse_dims.dims()));
       gtl::InlinedVector<bool, 8> axes_dense(input_dims, false);
       for (int dummy = 0; dummy < axes_sparse_flat.size(); dummy++) {
-        int32 axis = internal::SubtleMustCopy<int32>(axes_sparse_flat(dummy));
-        int32 canonical_axis = axis < 0 ? input_dims + axis : axis;
+        Tidx axis = internal::SubtleMustCopy<Tidx>(axes_sparse_flat(dummy));
+        Tidx canonical_axis = axis < 0 ? input_dims + axis : axis;
         OP_REQUIRES(context, canonical_axis >= 0 && canonical_axis < input_dims,
                     errors::InvalidArgument("'axis'[", dummy, "] = ", axis,
                                             " is out of valid range [", 0, ", ",
@@ -269,10 +269,10 @@ class ReverseV2Op : public OpKernel {
       OP_REQUIRES_OK(context,
                      context->allocate_output(0, input.shape(), &output));
 
-// TODO(cwhipkey): we can do dimension folding to reduce, e.g., a reverse of
-// a single dimension to the dims=3 or dims=2 case, regardless of the number
-// of dimensions in the tensor. This would let some ops use faster
-// lower-dimension code (and use optimized versions).
+      // TODO(cwhipkey): we can do dimension folding to reduce, e.g., a reverse
+      // of a single dimension to the dims=3 or dims=2 case, regardless of the
+      // number of dimensions in the tensor. This would let some ops use faster
+      // lower-dimension code (and use optimized versions).
 
 #define HANDLE_REVERSE(NDIMS)                                           \
   case NDIMS:                                                           \
@@ -306,7 +306,13 @@ class ReverseV2Op : public OpKernel {
                               .TypeConstraint<T>("T")        \
                               .TypeConstraint<int32>("Tidx") \
                               .HostMemory("axis"),           \
-                          ReverseV2Op<CPUDevice, T>)
+                          ReverseV2Op<CPUDevice, T, int32>)  \
+  REGISTER_KERNEL_BUILDER(Name("ReverseV2")                  \
+                              .Device(DEVICE_CPU)            \
+                              .TypeConstraint<T>("T")        \
+                              .TypeConstraint<int64>("Tidx") \
+                              .HostMemory("axis"),           \
+                          ReverseV2Op<CPUDevice, T, int64>)
 TF_CALL_POD_TYPES(REGISTER_KERNELS);
 TF_CALL_string(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
@@ -358,7 +364,13 @@ TF_CALL_complex128(DECLARE_GPU_SPEC);
                               .TypeConstraint<T>("T")        \
                               .TypeConstraint<int32>("Tidx") \
                               .HostMemory("axis"),           \
-                          ReverseV2Op<GPUDevice, T>)
+                          ReverseV2Op<GPUDevice, T, int32>)  \
+  REGISTER_KERNEL_BUILDER(Name("ReverseV2")                  \
+                              .Device(DEVICE_GPU)            \
+                              .TypeConstraint<T>("T")        \
+                              .TypeConstraint<int64>("Tidx") \
+                              .HostMemory("axis"),           \
+                          ReverseV2Op<GPUDevice, T, int64>)
 TF_CALL_uint8(REGISTER_GPU_KERNELS);
 TF_CALL_int8(REGISTER_GPU_KERNELS);
 // TODO decide whether we want to enable the bool kernel.
@@ -387,7 +399,15 @@ REGISTER_KERNEL_BUILDER(Name("ReverseV2")
                             .HostMemory("tensor")
                             .HostMemory("axis")
                             .HostMemory("output"),
-                        ReverseV2Op<CPUDevice, int32>);
+                        ReverseV2Op<CPUDevice, int32, int32>);
+REGISTER_KERNEL_BUILDER(Name("ReverseV2")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int64>("Tidx")
+                            .HostMemory("tensor")
+                            .HostMemory("axis")
+                            .HostMemory("output"),
+                        ReverseV2Op<CPUDevice, int32, int64>);
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
@@ -402,7 +422,13 @@ REGISTER_KERNEL_BUILDER(Name("ReverseV2")
                               .TypeConstraint<T>("T")        \
                               .TypeConstraint<int32>("Tidx") \
                               .HostMemory("axis"),           \
-                          ReverseV2Op<SYCLDevice, T>)
+                          ReverseV2Op<SYCLDevice, T, int32>) \
+  REGISTER_KERNEL_BUILDER(Name("ReverseV2")                  \
+                              .Device(DEVICE_SYCL)           \
+                              .TypeConstraint<T>("T")        \
+                              .TypeConstraint<int64>("Tidx") \
+                              .HostMemory("axis"),           \
+                          ReverseV2Op<SYCLDevice, T, int64>)
 TF_CALL_uint8(REGISTER_SYCL_KERNELS);
 TF_CALL_int8(REGISTER_SYCL_KERNELS);
 TF_CALL_float(REGISTER_SYCL_KERNELS);
@@ -422,6 +448,14 @@ REGISTER_KERNEL_BUILDER(Name("ReverseV2")
                             .HostMemory("tensor")
                             .HostMemory("axis")
                             .HostMemory("output"),
-                        ReverseV2Op<CPUDevice, int32>);
-#endif // TENSORFLOW_USE_SYCL
+                        ReverseV2Op<CPUDevice, int32, int32>);
+REGISTER_KERNEL_BUILDER(Name("ReverseV2")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int64>("Tidx")
+                            .HostMemory("tensor")
+                            .HostMemory("axis")
+                            .HostMemory("output"),
+                        ReverseV2Op<CPUDevice, int32, int64>);
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/reverse_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_op_gpu.cu.cc
index b05a7c5550438c6937745df5e58e81630361d64a..3ee49db669faaa85f2eff7a7f119725fc7170dea 100644
--- a/tensorflow/core/kernels/reverse_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/reverse_op_gpu.cu.cc
@@ -28,14 +28,14 @@ typedef Eigen::GpuDevice GPUDevice;
 #define DEFINE_REVERSE(T, DIM) \
   template struct functor::Reverse<GPUDevice, T, DIM>;
 #define DEFINE_REVERSE_ALL_DIMS(T) \
-  DEFINE_REVERSE(T, 0) \
-  DEFINE_REVERSE(T, 1) \
-  DEFINE_REVERSE(T, 2) \
-  DEFINE_REVERSE(T, 3) \
-  DEFINE_REVERSE(T, 4) \
-  DEFINE_REVERSE(T, 5) \
-  DEFINE_REVERSE(T, 6) \
-  DEFINE_REVERSE(T, 7) \
+  DEFINE_REVERSE(T, 0)             \
+  DEFINE_REVERSE(T, 1)             \
+  DEFINE_REVERSE(T, 2)             \
+  DEFINE_REVERSE(T, 3)             \
+  DEFINE_REVERSE(T, 4)             \
+  DEFINE_REVERSE(T, 5)             \
+  DEFINE_REVERSE(T, 6)             \
+  DEFINE_REVERSE(T, 7)             \
   DEFINE_REVERSE(T, 8)
 
 TF_CALL_uint8(DEFINE_REVERSE_ALL_DIMS);
diff --git a/tensorflow/core/kernels/reverse_sequence_op.cc b/tensorflow/core/kernels/reverse_sequence_op.cc
index d1980d4b652ecb507d8745bf64be2395d14920bb..15a707a9c6609e2ac5b790ea519f6c8e523067b1 100644
--- a/tensorflow/core/kernels/reverse_sequence_op.cc
+++ b/tensorflow/core/kernels/reverse_sequence_op.cc
@@ -51,8 +51,7 @@ void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) {
 
   // Copy seq_len info down for validity checks
   context->eigen_device<Device>().memcpyDeviceToHost(
-      seq_lens_vec.data(), seq_lens_t.data(),
-      sizeof(Tlen) * seq_lens_t.size());
+      seq_lens_vec.data(), seq_lens_t.data(), sizeof(Tlen) * seq_lens_t.size());
 
   OP_REQUIRES(context, batch_dim != seq_dim,
               errors::InvalidArgument("batch_dim == seq_dim == ", seq_dim));
@@ -76,8 +75,7 @@ void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) {
   }
 }
 
-void CheckErrorsGPU(OpKernelContext* context, int batch_dim,
-                            int seq_dim) {
+void CheckErrorsGPU(OpKernelContext* context, int batch_dim, int seq_dim) {
   const Tensor& input = context->input(0);
   const Tensor& seq_lens = context->input(1);
 
@@ -98,13 +96,13 @@ void CheckErrorsGPU(OpKernelContext* context, int batch_dim,
 
 template <>
 void CheckErrors<GPUDevice, int32>(OpKernelContext* context, int batch_dim,
-                            int seq_dim) {
+                                   int seq_dim) {
   CheckErrorsGPU(context, batch_dim, seq_dim);
 }
 
 template <>
 void CheckErrors<GPUDevice, int64>(OpKernelContext* context, int batch_dim,
-                            int seq_dim) {
+                                   int seq_dim) {
   CheckErrorsGPU(context, batch_dim, seq_dim);
 }
 
@@ -164,14 +162,15 @@ class ReverseSequenceOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(ReverseSequenceOp);
 };
 
-#define REGISTER_REVERSE_SEQUENCE(type, len_type)                           \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("ReverseSequence").Device(DEVICE_CPU).TypeConstraint<type>("T"). \
-      TypeConstraint<len_type>("Tlen"),                                     \
-      ReverseSequenceOp<CPUDevice, type, len_type>);
+#define REGISTER_REVERSE_SEQUENCE(type, len_type)                \
+  REGISTER_KERNEL_BUILDER(Name("ReverseSequence")                \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<len_type>("Tlen"), \
+                          ReverseSequenceOp<CPUDevice, type, len_type>);
 
-#define REGISTER_REVERSE_SEQUENCE_LEN(type)    \
-  REGISTER_REVERSE_SEQUENCE(type, int32);      \
+#define REGISTER_REVERSE_SEQUENCE_LEN(type) \
+  REGISTER_REVERSE_SEQUENCE(type, int32);   \
   REGISTER_REVERSE_SEQUENCE(type, int64);
 
 TF_CALL_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE_LEN);
@@ -181,23 +180,23 @@ TF_CALL_bool(REGISTER_REVERSE_SEQUENCE_LEN);
 
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T, Tlen, Dims)                                     \
-  template <>                                                               \
-  void ReverseSequence<GPUDevice, T, Tlen, Dims>::Compute(                  \
-      const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input,      \
-      int32 batch_dim, int32 seq_dim,                                       \
-      typename TTypes<Tlen>::ConstVec seq_lens,                             \
-      typename TTypes<T, Dims>::Tensor output);                             \
+#define DECLARE_GPU_SPEC(T, Tlen, Dims)                                \
+  template <>                                                          \
+  void ReverseSequence<GPUDevice, T, Tlen, Dims>::Compute(             \
+      const GPUDevice& d, typename TTypes<T, Dims>::ConstTensor input, \
+      int32 batch_dim, int32 seq_dim,                                  \
+      typename TTypes<Tlen>::ConstVec seq_lens,                        \
+      typename TTypes<T, Dims>::Tensor output);                        \
   extern template struct ReverseSequence<GPUDevice, T, Tlen, Dims>;
 
-#define DECLARE_GPU_SPEC_LEN(T, Dims)    \
-  DECLARE_GPU_SPEC(T, int32, Dims);      \
+#define DECLARE_GPU_SPEC_LEN(T, Dims) \
+  DECLARE_GPU_SPEC(T, int32, Dims);   \
   DECLARE_GPU_SPEC(T, int64, Dims);
 
-#define DECLARE_GPU_SPECS(T)     \
-  DECLARE_GPU_SPEC_LEN(T, 2);    \
-  DECLARE_GPU_SPEC_LEN(T, 3);    \
-  DECLARE_GPU_SPEC_LEN(T, 4);    \
+#define DECLARE_GPU_SPECS(T)  \
+  DECLARE_GPU_SPEC_LEN(T, 2); \
+  DECLARE_GPU_SPEC_LEN(T, 3); \
+  DECLARE_GPU_SPEC_LEN(T, 4); \
   DECLARE_GPU_SPEC_LEN(T, 5);
 
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
@@ -206,14 +205,15 @@ TF_CALL_bool(DECLARE_GPU_SPECS);
 }  // namespace functor
 
 // Registration of the GPU implementations.
-#define REGISTER_REVERSE_SEQUENCE_GPU(type, len_type)                       \
-  REGISTER_KERNEL_BUILDER(                                                  \
-      Name("ReverseSequence").Device(DEVICE_GPU).TypeConstraint<type>("T"). \
-      TypeConstraint<len_type>("Tlen"),                                     \
-      ReverseSequenceOp<GPUDevice, type, len_type>);
-
-#define REGISTER_REVERSE_SEQUENCE_GPU_LEN(type)   \
-  REGISTER_REVERSE_SEQUENCE_GPU(type, int32);     \
+#define REGISTER_REVERSE_SEQUENCE_GPU(type, len_type)            \
+  REGISTER_KERNEL_BUILDER(Name("ReverseSequence")                \
+                              .Device(DEVICE_GPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<len_type>("Tlen"), \
+                          ReverseSequenceOp<GPUDevice, type, len_type>);
+
+#define REGISTER_REVERSE_SEQUENCE_GPU_LEN(type) \
+  REGISTER_REVERSE_SEQUENCE_GPU(type, int32);   \
   REGISTER_REVERSE_SEQUENCE_GPU(type, int64);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_REVERSE_SEQUENCE_GPU_LEN);
diff --git a/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc b/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc
index cb49f14525a3c54ea46df47fb2edeaa9277dc2d3..4a2136a2cd37f4d549c62396d5e30616a306f84f 100644
--- a/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/reverse_sequence_op_gpu.cu.cc
@@ -28,14 +28,14 @@ typedef Eigen::GpuDevice GPUDevice;
   template class generator::ReverseGenerator<T, Tlen, dims>; \
   template struct functor::ReverseSequence<GPUDevice, T, Tlen, dims>;
 
-#define DEFINE_GPU_SPEC_LEN(T, dims)  \
-  DEFINE_GPU_SPEC(T, int32, dims);    \
+#define DEFINE_GPU_SPEC_LEN(T, dims) \
+  DEFINE_GPU_SPEC(T, int32, dims);   \
   DEFINE_GPU_SPEC(T, int64, dims);
 
-#define DEFINE_GPU_SPECS(T) \
-  DEFINE_GPU_SPEC_LEN(T, 2);    \
-  DEFINE_GPU_SPEC_LEN(T, 3);    \
-  DEFINE_GPU_SPEC_LEN(T, 4);    \
+#define DEFINE_GPU_SPECS(T)  \
+  DEFINE_GPU_SPEC_LEN(T, 2); \
+  DEFINE_GPU_SPEC_LEN(T, 3); \
+  DEFINE_GPU_SPEC_LEN(T, 4); \
   DEFINE_GPU_SPEC_LEN(T, 5);
 
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
diff --git a/tensorflow/core/kernels/roll_op.cc b/tensorflow/core/kernels/roll_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bcbdbee058b4fdb587f2099c54545b8a6aec8ca9
--- /dev/null
+++ b/tensorflow/core/kernels/roll_op.cc
@@ -0,0 +1,334 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/register_types_traits.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+#define EIGEN_USE_THREADS
+using CPUDevice = Eigen::ThreadPoolDevice;
+
+// dim_size - the size of each dimension
+// dim_range - the number of indices over in the flattened tensor
+//    you need to skip in order to make it over from one side of a dimension
+//    to the other. Used to make the shifts wrap around after a threshold.
+// threshold - the index for each dimension that the roll starts to wrap
+//    back to the front
+template <typename T>
+void DoRoll(OpKernelContext* context, const int64 num_elements,
+            const int num_dims, const gtl::ArraySlice<int>& dim_size,
+            const T* input, T* output, const gtl::ArraySlice<int>& threshold,
+            const gtl::ArraySlice<int64>& dim_range) {
+  auto work = [input, output, num_dims, &dim_size, &threshold, &dim_range](
+                  int64 start, int64 end) {
+    // array of indices for each dimension
+    gtl::InlinedVector<int, 4> indices(num_dims);
+    int offset = 0;  // the shift along the flattened tensor for current element
+    // initialize indices and offset
+    for (int i = 0; i < num_dims; i++) {
+      // stride is the number of indices over in the flattened tensor
+      // you need to skip in order to make it over to an adjacent element
+      // along a dimension. dim_size[i] != 0 because we set it to max(dim, 1)
+      const int64 stride = dim_range[i] / dim_size[i];
+      const int shift = dim_size[i] - threshold[i];
+      const int indx = (start / stride) % dim_size[i];
+      indices[i] = indx;
+      // calculate dimension index after the shift
+      const int shifted_indx = (indx + shift) % dim_size[i];
+      offset += (shifted_indx - indx) * stride;
+    }
+
+    for (int64 i = start; i < end; i++) {
+      output[i + offset] = input[i];
+      // create next combination of indices
+      // while at it adjust offset if needed
+      for (int j = num_dims - 1; j >= 0; j--) {
+        const int indx = (indices[j] + 1) % dim_size[j];
+        indices[j] = indx;
+        if (indx != 0) {
+          if (indx == threshold[j]) {  // we've reached the threshold
+            // dim_range[j] = threshold[j] + shift[j]
+            // offset = shift[j] + ... other offsets
+            // offset - dim_range[j] = -threshold[j] + ... other offsets
+            // thus we undo our previous offset as well as add a new offset of
+            // -threshold[j] in one operation
+            offset -= dim_range[j];  // now wraps around
+          }
+          break;                         // indx != 0 don't need to carry
+        } else if (threshold[j] != 0) {  // if threshold is 0 shift is 0
+          offset += dim_range[j];        // indx became 0 so reverse wrap around
+        }
+      }
+    }
+  };
+  // Shard
+  auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
+  // 15 - expiramentally determined with float and bool types
+  const int cost_per_element = 15 * sizeof(T);  // rough esitmate
+  Shard(worker_threads->num_threads, worker_threads->workers, num_elements,
+        cost_per_element, std::move(work));
+}
+
+// dim_size - the size of each dimension
+// dim_range - the number of indices over in the flattened tensor
+//    you need to skip in order to make it over from one side of a dimension
+//    to the other. Used to make the shifts wrap around after a threshold.
+// threshold - the index for each dimension that the roll starts to wrap
+//    back to the front
+// isd - inner shift dimension
+template <typename T>
+// Use memcpy to copy memory in groups when the data type supports memcpy
+void DoRollWithMemcpy(OpKernelContext* context, const int64 num_elements,
+                      const int num_dims, const gtl::ArraySlice<int>& dim_size,
+                      const T* input, T* output,
+                      const gtl::ArraySlice<int>& threshold,
+                      const gtl::ArraySlice<int64>& dim_range,
+                      const int64 isd) {
+  auto work = [input, output, num_dims, &dim_size, &threshold, &dim_range, isd](
+                  int64 start, int64 end) {
+    // the number of indices over in the flattened tensor you need to skip in
+    // order to make it over from one side of the isd to the other
+    const int64 isd_range = std::max<int>(dim_range[isd], 1);
+    // the distance along the flattend tensor to the next element in the isd
+    const int64 isd_stride = isd_range / std::max<int>(dim_size[isd], 1);
+
+    // start and end represent the i-th group currently so we will convert
+    // them into numbers representing the i-th elements.
+    // there are 2 groups per isd one for all elements before threshold[isd]
+    // and another for all elements after threshold[isd].
+    const int64 start_remainder = (start % 2) * threshold[isd] * isd_stride;
+    const int64 end_remainder = (end % 2) * threshold[isd] * isd_stride;
+    start = (start / 2) * isd_range + start_remainder;
+    end = (end / 2) * isd_range + end_remainder;
+
+    const T* in_ptr = &input[0];
+    T* out_ptr = &output[0];
+    in_ptr += start;
+    out_ptr += start;
+
+    // array of indices for each dimension
+    // indicies = [i, j, k, l, m, n]
+    gtl::InlinedVector<int, 4> indicies(num_dims);
+    // the offset needed to make all inner non-shifting dimensions become 0
+    int64 remainder_offset = 0;
+    // initialize indicies
+    for (int i = 0; i < num_dims; i++) {
+      // stride is the number of indices over in the flattened tensor
+      // you need to skip in order to make it over to an adjacent element
+      // along a dimension. dim_size[i] != 0 because we set it to max(dim, 1)
+      const int64 stride = dim_range[i] / dim_size[i];
+      const int shift = dim_size[i] - threshold[i];
+      const int indx = (start / stride) % dim_size[i];
+      indicies[i] = indx;
+      // calculate dimension index after the shift
+      int out_indx = (indx + shift) % dim_size[i];
+      if (i > isd) {
+        // trailing zeroes for indices after the inner shifted dimension
+        out_indx = 0;
+        remainder_offset += (out_indx - indx) * stride;
+      }
+      out_ptr += (out_indx - indx) * stride;
+    }
+    // set trailing zeroes for indices after the inner shifted dimension
+    for (int i = num_dims - 1; i > isd; i--) indicies[i] = 0;
+
+    // the number of indices in the isd dimension the next group will skip
+    // to make it to the next threshold or end point
+    int isd_indx_skip = 0;
+    // the size of the next group
+    int64 group_size = 0;
+    // initialize isd_indx_skip and group_size
+    if (indicies[isd] < threshold[isd]) {
+      isd_indx_skip = threshold[isd] - indicies[isd];
+      group_size = isd_indx_skip * isd_stride + remainder_offset;
+    } else {
+      isd_indx_skip = dim_size[isd] - indicies[isd];
+      group_size = isd_indx_skip * isd_stride + remainder_offset;
+    }
+
+    int64 i = start;
+    while (i < end) {
+      // copy group of elements
+      memcpy(out_ptr, in_ptr, group_size * sizeof(T));
+
+      // shift i and the pointers over to the next group position
+      i += group_size;
+      out_ptr += group_size;
+      in_ptr += group_size;
+
+      // produce next combination of indices and adjust the out_ptr position
+      // to fix the offset if necessary
+      // the isd (inner shift dim) should skip to next threshold or endpoint
+      // all dimensions to the left increment by 1 when a digit is carried
+      // all dimensions to the right remain set to 0
+      //            +1 +1 +1 +isd_indx_skip
+      // indicies = [i, j, k, l, 0, 0]
+      //                      ^isd
+      for (int j = isd; j >= 0; j--) {
+        int inc = 1;
+        if (j == isd) inc = isd_indx_skip;
+        const int indx = (indicies[j] + inc) % dim_size[j];
+        indicies[j] = indx;
+        if (indx != 0) {
+          if (indx == threshold[j]) {
+            out_ptr -= dim_range[j];  // now wraps around
+          }
+          break;                         // indx != 0 don't need to carry
+        } else if (threshold[j] != 0) {  // if threshold is 0 shift is 0
+          out_ptr += dim_range[j];       // indx became 0 so reverse wrap around
+        }
+      }
+
+      // set isd_indx_skip and group_size for next iteration
+      if (indicies[isd] < threshold[isd]) {
+        isd_indx_skip = threshold[isd] - indicies[isd];
+        group_size = isd_indx_skip * isd_stride;
+      } else {
+        isd_indx_skip = dim_size[isd] - indicies[isd];
+        group_size = isd_indx_skip * isd_stride;
+      }
+    }
+  };
+  // Shard
+  auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
+  const int64 ave_group_size = dim_range[isd] / 2;
+  const int total_work = 2 * num_elements / std::max<int>(dim_range[isd], 1);
+  // 25000 - expiramentally determined with float and bool types
+  const int cost_per_group = 25000 * sizeof(T) * ave_group_size;
+  Shard(worker_threads->num_threads, worker_threads->workers, total_work,
+        cost_per_group, std::move(work));
+}
+
+template <typename Device, typename T, typename Tshift, typename Taxis>
+class RollOp : public OpKernel {
+ public:
+  explicit RollOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    // Grab the input tensor
+    const Tensor& input = context->input(0);
+    const Tensor& shift = context->input(1);
+    const Tensor& axis = context->input(2);
+
+    auto shift_flat = shift.flat<Tshift>();
+    auto axis_flat = axis.flat<Taxis>();
+
+    OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(input.shape()),
+                errors::InvalidArgument("input must be 1-D or higher"));
+    OP_REQUIRES(context, shift.shape().dims() <= 1,
+                errors::InvalidArgument(
+                    "shift must be a scalar or a 1-D vector. Found: ",
+                    shift.shape().DebugString()));
+    OP_REQUIRES(context, axis.shape().dims() <= 1,
+                errors::InvalidArgument(
+                    "axis must be a scalar or a 1-D vector. Found: ",
+                    axis.shape().DebugString()));
+    OP_REQUIRES(
+        context, shift.shape() == axis.shape(),
+        errors::InvalidArgument("shift and axis must have the same size"));
+    const int64 num_elements = input.NumElements();
+    const int num_shifts = static_cast<int>(shift_flat.size());
+    const int num_dims = input.dims();
+
+    // if there are any duplicate axes, shift_mod_sum will have the
+    // total modulo sum of shifts for each dimension
+    gtl::InlinedVector<int, 4> shift_mod_sum(num_dims, 0);
+    for (int i = 0; i < num_shifts; i++) {
+      const int axis = axis_flat(i);
+      OP_REQUIRES(context, axis < num_dims,
+                  errors::InvalidArgument("axis ", axis, " is out of range"));
+      const int ds = std::max<int>(static_cast<int>(input.dim_size(axis)), 1);
+      const int sum = shift_mod_sum[axis] + static_cast<int>(shift_flat(i));
+      // modulo that works with negatives: ((x % y) + y) % y
+      shift_mod_sum[axis] = (sum % ds + ds) % ds;
+    }
+    // the size of each dimension
+    gtl::InlinedVector<int, 4> dim_size(num_dims);
+    // threshold[i] is the index that the roll starts to wrap back to the front
+    gtl::InlinedVector<int, 4> threshold(num_dims);
+    // dim_range is the number of indices over in the flattened tensor
+    // you need to skip in order to make it over from one side of a dimension
+    // to the other. Used to make the shifts wrap around after a threshold.
+    gtl::InlinedVector<int64, 4> dim_range(num_dims);
+    int64 dim_size_prod = 1;  // dimension size product
+    // inner shift dimension (inner most shifted dimension)
+    int64 isd = 0;
+    for (int i = num_dims - 1; i >= 0; i--) {
+      if (isd == 0 && shift_mod_sum[i] != 0) isd = i;
+      const int ds = std::max<int>(static_cast<int>(input.dim_size(i)), 1);
+      dim_size[i] = ds;
+      threshold[i] = (ds - shift_mod_sum[i]) % ds;
+      dim_size_prod *= static_cast<int64>(input.dim_size(i));
+      dim_range[i] = dim_size_prod;
+    }
+
+    Tensor* output = NULL;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input.shape(), &output));
+    auto input_flat = input.flat<T>().data();
+    auto output_flat = output->flat<T>().data();
+
+    if (std::is_same<Device, CPUDevice>::value) {
+      if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
+        // V2 copies memory in groups instead of element by element
+        DoRollWithMemcpy<T>(context, num_elements, num_dims, dim_size,
+                            input_flat, output_flat, threshold, dim_range, isd);
+      } else {
+        // incase memcpy does not work for current data type
+        DoRoll<T>(context, num_elements, num_dims, dim_size, input_flat,
+                  output_flat, threshold, dim_range);
+      }
+    }
+  }
+};
+
+// Register the CPU kernels.
+#define REGISTER_CPU(type)                                       \
+  REGISTER_KERNEL_BUILDER(Name("Roll")                           \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int32>("Tshift")   \
+                              .TypeConstraint<int32>("Taxis"),   \
+                          RollOp<CPUDevice, type, int32, int32>) \
+  REGISTER_KERNEL_BUILDER(Name("Roll")                           \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int64>("Tshift")   \
+                              .TypeConstraint<int32>("Taxis"),   \
+                          RollOp<CPUDevice, type, int64, int32>) \
+  REGISTER_KERNEL_BUILDER(Name("Roll")                           \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int32>("Tshift")   \
+                              .TypeConstraint<int64>("Taxis"),   \
+                          RollOp<CPUDevice, type, int32, int64>) \
+  REGISTER_KERNEL_BUILDER(Name("Roll")                           \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int64>("Tshift")   \
+                              .TypeConstraint<int64>("Taxis"),   \
+                          RollOp<CPUDevice, type, int64, int64>)
+
+TF_CALL_ALL_TYPES(REGISTER_CPU);
+#undef REGISTER_CPU
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/roll_op_test.cc b/tensorflow/core/kernels/roll_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..90b6f8d0f3094224ca694b59c851c14bb424d120
--- /dev/null
+++ b/tensorflow/core/kernels/roll_op_test.cc
@@ -0,0 +1,484 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <functional>
+#include <memory>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+namespace {
+
+class RollOpTest : public OpsTestBase {
+ protected:
+  void MakeOp(DataType data_type, DataType index_type) {
+    TF_ASSERT_OK(NodeDefBuilder("myop", "Roll")
+                     .Input(FakeInput(data_type))
+                     .Input(FakeInput(index_type))
+                     .Input(FakeInput(index_type))
+                     .Finalize(node_def()));
+    TF_ASSERT_OK(InitOp());
+  }
+};
+
+TEST_F(RollOpTest, ScalarIndices) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({5}), {0, 1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({}), {3});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({5}));
+  test::FillValues<float>(&expected, {2, 3, 4, 0, 1});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ScalarIndices_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({5}), {"a", "b", "c", "d", "e"});
+  AddInputFromArray<int32>(TensorShape({}), {3});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({5}));
+  test::FillValues<string>(&expected, {"c", "d", "e", "a", "b"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ScalarIndices_Complex) {
+  MakeOp(DT_COMPLEX64, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<std::complex<float>>(
+      TensorShape({5}), {std::complex<float>(0, 10), std::complex<float>(1, 11),
+                         std::complex<float>(2, 12), std::complex<float>(3, 13),
+                         std::complex<float>(4, 14)});
+  AddInputFromArray<int32>(TensorShape({}), {3});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_COMPLEX64, TensorShape({5}));
+  test::FillValues<std::complex<float>>(
+      &expected, {std::complex<float>(2, 12), std::complex<float>(3, 13),
+                  std::complex<float>(4, 14), std::complex<float>(0, 10),
+                  std::complex<float>(1, 11)});
+  test::ExpectTensorEqual<std::complex<float>>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_TwoD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({3, 5}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+  AddInputFromArray<int32>(TensorShape({2}), {2, -1});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({3, 5}));
+  test::FillValues<float>(&expected,
+                          {6, 7, 8, 9, 5, 11, 12, 13, 14, 10, 1, 2, 3, 4, 0});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_TwoD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({3, 5}),
+                            {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
+                             "k", "l", "m", "n", "o"});
+  AddInputFromArray<int32>(TensorShape({2}), {2, -1});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({3, 5}));
+  test::FillValues<string>(&expected, {"g", "h", "i", "j", "f", "l", "m", "n",
+                                       "o", "k", "b", "c", "d", "e", "a"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_ThreeD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({2, 2, 3}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
+  AddInputFromArray<int32>(TensorShape({3}), {1, -1, -1});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 3}));
+  test::FillValues<float>(&expected, {10, 11, 9, 7, 8, 6, 4, 5, 3, 1, 2, 0});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_ThreeD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(
+      TensorShape({2, 2, 3}),
+      {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"});
+  AddInputFromArray<int32>(TensorShape({3}), {1, -1, -1});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({2, 2, 3}));
+  test::FillValues<string>(
+      &expected, {"k", "l", "j", "h", "i", "g", "e", "f", "d", "b", "c", "a"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_TwoD64) {
+  MakeOp(DT_FLOAT, DT_INT64);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({5, 3}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+  AddInputFromArray<int64>(TensorShape({2}), {-1, 4});
+  AddInputFromArray<int64>(TensorShape({2}), {0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 3}));
+  test::FillValues<float>(&expected,
+                          {5, 3, 4, 8, 6, 7, 11, 9, 10, 14, 12, 13, 2, 0, 1});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_TwoD64_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT64);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({5, 3}),
+                            {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
+                             "k", "l", "m", "n", "o"});
+  AddInputFromArray<int64>(TensorShape({2}), {-1, 4});
+  AddInputFromArray<int64>(TensorShape({2}), {0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({5, 3}));
+  test::FillValues<string>(&expected, {"f", "d", "e", "i", "g", "h", "l", "j",
+                                       "k", "o", "m", "n", "c", "a", "b"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_ThreeD64) {
+  MakeOp(DT_FLOAT, DT_INT64);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({4, 1, 3}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
+  AddInputFromArray<int64>(TensorShape({3}), {4, 3, 2});
+  AddInputFromArray<int64>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({4, 1, 3}));
+  test::FillValues<float>(&expected, {1, 2, 0, 4, 5, 3, 7, 8, 6, 10, 11, 9});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Simple_ThreeD64_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT64);
+
+  // Feed and run
+  AddInputFromArray<string>(
+      TensorShape({4, 1, 3}),
+      {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"});
+  AddInputFromArray<int64>(TensorShape({3}), {4, 3, 2});
+  AddInputFromArray<int64>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({4, 1, 3}));
+  test::FillValues<string>(
+      &expected, {"b", "c", "a", "e", "f", "d", "h", "i", "g", "k", "l", "j"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ZeroShift_ThreeD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({2, 2, 3}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 0, 0});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 3}));
+  test::FillValues<float>(&expected, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ZeroShift_ThreeD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(
+      TensorShape({2, 2, 3}),
+      {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 0, 0});
+  AddInputFromArray<int32>(TensorShape({3}), {0, 1, 2});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({2, 2, 3}));
+  test::FillValues<string>(
+      &expected, {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ZeroSize_ThreeD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({5, 0, 0}), {});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({5, 0, 0}));
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, ZeroSize_ThreeD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({5, 0, 0}), {});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({5, 0, 0}));
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, OneSize_ThreeD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({1, 1, 1}), {5});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1}));
+  test::FillValues<float>(&expected, {5});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, OneSize_ThreeD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({1, 1, 1}), {"a"});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({1, 1, 1}));
+  test::FillValues<string>(&expected, {"a"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, MultiShifts_TwoD32) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({3, 5}),
+                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
+  AddInputFromArray<int32>(TensorShape({4}), {-2, 2, -1, 1});
+  AddInputFromArray<int32>(TensorShape({4}), {1, 0, 0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({3, 5}));
+  test::FillValues<float>(&expected,
+                          {11, 12, 13, 14, 10, 1, 2, 3, 4, 0, 6, 7, 8, 9, 5});
+  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, MultiShifts_TwoD32_NoMemcpy) {
+  MakeOp(DT_STRING, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<string>(TensorShape({3, 5}),
+                            {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
+                             "k", "l", "m", "n", "o"});
+  AddInputFromArray<int32>(TensorShape({4}), {-2, 2, -1, 1});
+  AddInputFromArray<int32>(TensorShape({4}), {1, 0, 0, 1});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_STRING, TensorShape({3, 5}));
+  test::FillValues<string>(&expected, {"l", "m", "n", "o", "k", "b", "c", "d",
+                                       "e", "a", "g", "h", "i", "j", "f"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(RollOpTest, Error_InputMustBeVectorOrHigher) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({}), {7});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  Status s = RunOpKernel();
+  EXPECT_TRUE(StringPiece(s.ToString()).contains("input must be 1-D or higher"))
+      << s;
+}
+
+TEST_F(RollOpTest, Error_AxisMustBeScalarOrVector) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({2, 2}), {1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({1, 2}), {0, 1});
+  Status s = RunOpKernel();
+  EXPECT_TRUE(StringPiece(s.ToString())
+                  .contains("axis must be a scalar or a 1-D vector"))
+      << s;
+}
+
+TEST_F(RollOpTest, Error_ShiftMustBeScalarOrVector) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({2, 2}), {1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({1, 2}), {0, 1});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  Status s = RunOpKernel();
+  EXPECT_TRUE(StringPiece(s.ToString())
+                  .contains("shift must be a scalar or a 1-D vector"))
+      << s;
+}
+
+TEST_F(RollOpTest, Error_ShiftAndAxisMustBeSameSize) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({2, 2}), {1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({1}), {1});
+  AddInputFromArray<int32>(TensorShape({2}), {0, 1});
+  Status s = RunOpKernel();
+  EXPECT_TRUE(StringPiece(s.ToString())
+                  .contains("shift and axis must have the same size"))
+      << s;
+}
+
+TEST_F(RollOpTest, Error_AxisOutOfRange) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<float>(TensorShape({4}), {1, 2, 3, 4});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  Status s = RunOpKernel();
+  EXPECT_TRUE(StringPiece(s.ToString()).contains("is out of range")) << s;
+}
+
+// isd - (inner shift dimension) The inner most dimension to be shifted.
+//    All outer dimensions will also be shifted for testing.
+static Graph* RollGraph(const TensorShape& shape, int isd) {
+  Graph* g = new Graph(OpRegistry::Global());
+  Tensor input(DT_FLOAT, shape);
+  input.flat<float>().setRandom();
+  const int dims = static_cast<int>(input.dims());
+  Tensor shift(DT_INT32, TensorShape({dims}));
+  for (int i = 0; i < dims; i++) {
+    // shift the inner shift dimension and all outer dimensions
+    shift.flat<int32>()(i) = (i <= isd) ? 2 : 0;
+  }
+  Tensor axis(DT_INT32, TensorShape({dims}));
+  for (int i = 0; i < dims; i++) {
+    axis.flat<int32>()(i) = i;
+  }
+  test::graph::Roll(g, test::graph::Constant(g, input),
+                    test::graph::Constant(g, shift),
+                    test::graph::Constant(g, axis));
+  return g;
+}
+
+#define BM_ROLL_OUTER(DEVICE)                                                 \
+  static void BM_##DEVICE##_roll_outer(int iters, int rows, int columns) {    \
+    TensorShape shape{rows, columns};                                         \
+    const int64 num_items = static_cast<int64>(iters) * shape.num_elements(); \
+    testing::ItemsProcessed(num_items);                                       \
+    testing::BytesProcessed(num_items * sizeof(float));                       \
+    testing::UseRealTime();                                                   \
+    test::Benchmark(#DEVICE, RollGraph(shape, 0)).Run(iters);                 \
+  }                                                                           \
+  BENCHMARK(BM_##DEVICE##_roll_outer)                                         \
+      ->ArgPair(256, 256)                                                     \
+      ->ArgPair(512, 512)                                                     \
+      ->ArgPair(1024, 1024)                                                   \
+      ->ArgPair(2048, 2048)
+
+#define BM_ROLL_ALL(DEVICE)                                                   \
+  static void BM_##DEVICE##_roll_all(int iters, int rows, int columns) {      \
+    TensorShape shape{rows, columns};                                         \
+    const int64 num_items = static_cast<int64>(iters) * shape.num_elements(); \
+    testing::ItemsProcessed(num_items);                                       \
+    testing::BytesProcessed(num_items * sizeof(float));                       \
+    testing::UseRealTime();                                                   \
+    test::Benchmark(#DEVICE, RollGraph(shape, 1)).Run(iters);                 \
+  }                                                                           \
+  BENCHMARK(BM_##DEVICE##_roll_all)                                           \
+      ->ArgPair(256, 256)                                                     \
+      ->ArgPair(512, 512)                                                     \
+      ->ArgPair(1024, 1024)                                                   \
+      ->ArgPair(2048, 2048)
+
+BM_ROLL_OUTER(cpu);
+BM_ROLL_ALL(cpu);
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/save_restore_tensor.cc b/tensorflow/core/kernels/save_restore_tensor.cc
index 6b06cf650a849d3ff606b62b00f437ac9accb013..990bd2bff94ac9cf18dd6f6316503890bb31884d 100644
--- a/tensorflow/core/kernels/save_restore_tensor.cc
+++ b/tensorflow/core/kernels/save_restore_tensor.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/kernels/save_restore_tensor.h"
+#include <numeric>
 #include <unordered_map>
-
 #include <utility>
 #include <vector>
-#include "tensorflow/core/kernels/save_restore_tensor.h"
 
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -85,7 +85,17 @@ void SaveTensors(
   Status s;
   auto tensor_names_flat = tensor_names_t.flat<string>();
 
-  for (int i = 0; i < N; ++i) {
+  // Process tensors in sorted name order.  This allows us to avoid seeking
+  // during restoration in the common case where we are restoring a full
+  // checkpoint.
+  std::vector<size_t> sorted_name_idx(tensor_names_flat.size());
+  std::iota(sorted_name_idx.begin(), sorted_name_idx.end(), 0);
+  std::sort(sorted_name_idx.begin(), sorted_name_idx.end(),
+            [&tensor_names_flat](size_t a, size_t b) {
+              return tensor_names_flat(a) < tensor_names_flat(b);
+            });
+
+  for (size_t i : sorted_name_idx) {
     const string& name = tensor_names_flat(i);
     const Tensor& input = context->input(i + kFixedInputs);
     TensorShape shape(input.shape());
@@ -96,11 +106,11 @@ void SaveTensors(
       OP_REQUIRES_OK(context, checkpoint::ParseShapeAndSlice(
                                   shape_spec, &shape, &slice, &slice_shape));
       OP_REQUIRES(context, slice_shape.IsSameSize(input.shape()),
-                  errors::InvalidArgument("Slice in shape_and_slice "
-                                          "specification does not match the "
-                                          "shape of the tensor to  save: ",
-                                          shape_spec, ", tensor: ",
-                                          input.shape().DebugString()));
+                  errors::InvalidArgument(
+                      "Slice in shape_and_slice "
+                      "specification does not match the "
+                      "shape of the tensor to  save: ",
+                      shape_spec, ", tensor: ", input.shape().DebugString()));
     }
 
 #define WRITER_ADD(T)                                           \
@@ -109,8 +119,7 @@ void SaveTensors(
     break;
 
     switch (input.dtype()) {
-      TF_CALL_POD_STRING_TYPES(WRITER_ADD)
-      TF_CALL_QUANTIZED_TYPES(WRITER_ADD)
+      TF_CALL_SAVE_RESTORE_TYPES(WRITER_ADD)
       default:
         context->SetStatus(errors::Unimplemented("Saving data type ",
                                                  DataTypeString(input.dtype()),
@@ -132,7 +141,7 @@ void SaveTensors(
 
 void RestoreTensor(OpKernelContext* context,
                    checkpoint::TensorSliceReader::OpenTableFunction open_func,
-                   int preferred_shard, bool restore_slice) {
+                   int preferred_shard, bool restore_slice, int restore_index) {
   const Tensor& file_pattern_t = context->input(0);
   {
     const int64 size = file_pattern_t.NumElements();
@@ -145,26 +154,7 @@ void RestoreTensor(OpKernelContext* context,
   const string& file_pattern = file_pattern_t.flat<string>()(0);
 
   const Tensor& tensor_name_t = context->input(1);
-  {
-    const int64 size = tensor_name_t.NumElements();
-    OP_REQUIRES(
-        context, size == 1,
-        errors::InvalidArgument(
-            "Input 1 (tensor_name) must be a string scalar; got a tensor of ",
-            size, "elements"));
-  }
-  const string& tensor_name = tensor_name_t.flat<string>()(0);
-
-  const string* tensor_shape_and_slice_ptr = nullptr;
-  if (restore_slice) {
-    const Tensor& tensor_shape_and_slice_t = context->input(2);
-    OP_REQUIRES(
-        context, tensor_shape_and_slice_t.NumElements() == 1,
-        errors::InvalidArgument("Expected 1 element for the tensor "
-                                "shape and slice but got ",
-                                tensor_shape_and_slice_t.NumElements()));
-    tensor_shape_and_slice_ptr = tensor_shape_and_slice_t.flat<string>().data();
-  }
+  const string& tensor_name = tensor_name_t.flat<string>()(restore_index);
 
   // If we cannot find a cached reader we will allocate our own.
   std::unique_ptr<checkpoint::TensorSliceReader> allocated_reader;
@@ -187,7 +177,7 @@ void RestoreTensor(OpKernelContext* context,
       errors::NotFound("Tensor name \"", tensor_name,
                        "\" not found in checkpoint files ", file_pattern));
   OP_REQUIRES(
-      context, type == context->expected_output_dtype(0),
+      context, type == context->expected_output_dtype(restore_index),
       errors::InvalidArgument("Expected to restore a tensor of type ",
                               DataTypeString(context->expected_output_dtype(0)),
                               ", got a tensor of type ", DataTypeString(type),
@@ -196,23 +186,26 @@ void RestoreTensor(OpKernelContext* context,
   // Shape of the output and slice to load.
   TensorShape output_shape(saved_shape);
   TensorSlice slice_to_load(saved_shape.dims());
-  if (restore_slice && !tensor_shape_and_slice_ptr[0].empty()) {
-    const string& shape_spec = tensor_shape_and_slice_ptr[0];
-    TensorShape parsed_shape;
-    OP_REQUIRES_OK(
-        context, checkpoint::ParseShapeAndSlice(shape_spec, &parsed_shape,
-                                                &slice_to_load, &output_shape));
-    OP_REQUIRES(
-        context, parsed_shape.IsSameSize(saved_shape),
-        errors::InvalidArgument(
-            "Shape in shape_and_slice spec does not match the shape in the "
-            "save file: ",
-            parsed_shape.DebugString(), ", save file shape: ",
-            saved_shape.DebugString()));
+  if (restore_slice) {
+    const string& shape_spec = context->input(2).flat<string>()(restore_index);
+    if (!shape_spec.empty()) {
+      TensorShape parsed_shape;
+      OP_REQUIRES_OK(context, checkpoint::ParseShapeAndSlice(
+                                  shape_spec, &parsed_shape, &slice_to_load,
+                                  &output_shape));
+      OP_REQUIRES(
+          context, parsed_shape.IsSameSize(saved_shape),
+          errors::InvalidArgument(
+              "Shape in shape_and_slice spec does not match the shape in the "
+              "save file: ",
+              parsed_shape.DebugString(),
+              ", save file shape: ", saved_shape.DebugString()));
+    }
   }
 
   Tensor* t = nullptr;
-  OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &t));
+  OP_REQUIRES_OK(context,
+                 context->allocate_output(restore_index, output_shape, &t));
 
   if (output_shape.num_elements() == 0) return;
 
@@ -225,8 +218,7 @@ void RestoreTensor(OpKernelContext* context,
     break;
 
   switch (type) {
-    TF_CALL_POD_STRING_TYPES(READER_COPY)
-    TF_CALL_QUANTIZED_TYPES(READER_COPY)
+    TF_CALL_SAVE_RESTORE_TYPES(READER_COPY)
     default:
       context->SetStatus(errors::Unimplemented(
           "Restoring data type ", DataTypeString(type), " not yet supported"));
@@ -239,9 +231,18 @@ Status RestoreTensorsV2(OpKernelContext* context, const Tensor& prefix,
                         const Tensor& shape_and_slices,
                         gtl::ArraySlice<DataType> dtypes) {
   const string& prefix_string = prefix.scalar<string>()();
+
   const auto& tensor_names_flat = tensor_names.flat<string>();
   const auto& shape_and_slices_flat = shape_and_slices.flat<string>();
 
+  // Sort lookup keys to improve locality when reading multiple tensors.
+  std::vector<size_t> sorted_name_idx(tensor_names_flat.size());
+  std::iota(sorted_name_idx.begin(), sorted_name_idx.end(), 0);
+  std::sort(sorted_name_idx.begin(), sorted_name_idx.end(),
+            [&tensor_names_flat](size_t a, size_t b) {
+              return tensor_names_flat(a) < tensor_names_flat(b);
+            });
+
   BundleReader reader(Env::Default(), prefix_string);
   TF_RETURN_IF_ERROR(reader.status());
 
@@ -250,9 +251,10 @@ Status RestoreTensorsV2(OpKernelContext* context, const Tensor& prefix,
   // within a fixed memory budget.
   TensorShape restored_full_shape;
   Tensor* restored_tensor = nullptr;
-  for (size_t i = 0; i < tensor_names_flat.size(); ++i) {
+  for (auto i : sorted_name_idx) {
     const string& tensor_name = tensor_names_flat(i);
     const string& shape_and_slice = shape_and_slices_flat(i);
+
     TF_RETURN_IF_ERROR(
         reader.LookupTensorShape(tensor_name, &restored_full_shape));
 
diff --git a/tensorflow/core/kernels/save_restore_tensor.h b/tensorflow/core/kernels/save_restore_tensor.h
index 1e87e5c30b75754777f5b03ef58dd6c3102e27ec..5b74b586e84f5b33c179c986bc8aeacf65835f61 100644
--- a/tensorflow/core/kernels/save_restore_tensor.h
+++ b/tensorflow/core/kernels/save_restore_tensor.h
@@ -37,18 +37,21 @@ void SaveTensors(
     checkpoint::TensorSliceWriter::CreateBuilderFunction builder_func,
     bool save_slices);
 
-// Reads a tensor from the reader built from open_func() and produces it as
-// context->output(0).  "preferred_shard" is the same the TensorSliceReader
-// preferred_shard parameter.
+// Reads a single tensor from the reader built from open_func() and produces
+// it as context->output(restore_index).  "preferred_shard" is the same the
+// TensorSliceReader preferred_shard parameter.
 //
 // context must have the following inputs:
 //  0: a single element string tensor that contains the file name.
-//  1: a single element string tensor that names the output to be restored.
+//  1: string tensor that names the outputs to be restored.
 // If restore_slice is true:
-//  2: shape and slice specification of the tensor to restore.
+//  2: shape and slice specification of the tensors to restore.
+//
+// restore_index indicates the variable name and slice to lookup
+// in context(1) and (2).
 void RestoreTensor(OpKernelContext* context,
                    checkpoint::TensorSliceReader::OpenTableFunction open_func,
-                   int preferred_shard, bool restore_slice);
+                   int preferred_shard, bool restore_slice, int restore_index);
 
 // V2 checkpoint format.
 
diff --git a/tensorflow/core/kernels/save_restore_v2_ops.cc b/tensorflow/core/kernels/save_restore_v2_ops.cc
index c665bc5b03ca741abfa868a4a089d19e97f47536..3acf290ea209923c53333a4233301568e3874219 100644
--- a/tensorflow/core/kernels/save_restore_v2_ops.cc
+++ b/tensorflow/core/kernels/save_restore_v2_ops.cc
@@ -169,8 +169,14 @@ class RestoreV2 : public OpKernel {
         paths.empty()) {
       // Cannot find V2's metadata file, so "prefix_string" does not point to a
       // V2 checkpoint.  Invokes the V1 read path instead.
-      RestoreTensor(context, &checkpoint::OpenTableTensorSliceReader,
-                    /* preferred_shard */ -1, /* restore_slice */ true);
+      for (size_t i = 0; i < tensor_names.NumElements(); ++i) {
+        RestoreTensor(context, &checkpoint::OpenTableTensorSliceReader,
+                      /* preferred_shard */ -1, /* restore_slice */ true,
+                      /* restore_index */ i);
+        if (!context->status().ok()) {
+          return;
+        }
+      }
       return;
     }
     // If found, invokes the V2 reader.
diff --git a/tensorflow/core/kernels/scatter_functor.h b/tensorflow/core/kernels/scatter_functor.h
index c6e35fe329e1c1b7acb62daedeeb2f1a92444b78..079f15e101308867389745ee42146086af91c47c 100644
--- a/tensorflow/core/kernels/scatter_functor.h
+++ b/tensorflow/core/kernels/scatter_functor.h
@@ -29,7 +29,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace scatter_op {
 
@@ -117,7 +117,7 @@ struct AssignSYCL<scatter_op::UpdateOp::DIV> {
     p.device(d) = p / u;
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace internal
 }  // namespace scatter_op
@@ -156,7 +156,7 @@ struct ScatterFunctorBase {
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T, typename Index, scatter_op::UpdateOp op>
-struct ScatterFunctorBase <SYCLDevice, T, Index, op> {
+struct ScatterFunctorBase<SYCLDevice, T, Index, op> {
   Index operator()(OpKernelContext* c, const SYCLDevice& d,
                    typename TTypes<T>::Matrix params,
                    typename TTypes<T>::ConstMatrix updates,
@@ -171,13 +171,13 @@ struct ScatterFunctorBase <SYCLDevice, T, Index, op> {
       const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
       if (!FastBoundsCheck(index, limit)) return i;
       // Copy last Ndim-1 dimensions of updates[i] to params[index]
-      scatter_op::internal::AssignSYCL<op>::Run(d, params.template chip<0>(index),
-                                            updates.template chip<0>(i));
+      scatter_op::internal::AssignSYCL<op>::Run(
+          d, params.template chip<0>(index), updates.template chip<0>(i));
     }
     return -1;
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename T, typename Index>
 struct ScatterFunctorBase<CPUDevice, T, Index, scatter_op::UpdateOp::ASSIGN> {
@@ -217,7 +217,7 @@ struct ScatterFunctorBase<CPUDevice, T, Index, scatter_op::UpdateOp::ASSIGN> {
 
 template <typename T, typename Index, scatter_op::UpdateOp op>
 struct ScatterFunctor<CPUDevice, T, Index, op>
-        : ScatterFunctorBase<CPUDevice, T, Index, op>{};
+    : ScatterFunctorBase<CPUDevice, T, Index, op> {};
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T, typename Index, scatter_op::UpdateOp op>
@@ -239,7 +239,7 @@ struct ScatterFunctorSYCL {
     return -1;
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/scatter_functor_gpu.cu.h b/tensorflow/core/kernels/scatter_functor_gpu.cu.h
index e116077d3cfc37871009ee3fede633590d269681..be18658543ea330e3196d0f372154df32e4e1dfc 100644
--- a/tensorflow/core/kernels/scatter_functor_gpu.cu.h
+++ b/tensorflow/core/kernels/scatter_functor_gpu.cu.h
@@ -30,9 +30,10 @@ namespace tensorflow {
 typedef Eigen::GpuDevice GPUDevice;
 
 template <typename T, typename Index, scatter_op::UpdateOp op>
-__global__ void ScatterOpCustomKernel(
-    T* params, const T* updates, const Index* indices,
-    Index first_dim_size, Index updates_size, Index indices_size) {
+__global__ void ScatterOpCustomKernel(T* params, const T* updates,
+                                      const Index* indices,
+                                      Index first_dim_size, Index updates_size,
+                                      Index indices_size) {
   Index update_block = updates_size / indices_size;
   CUDA_1D_KERNEL_LOOP(i, updates_size) {
     int indices_i = i / update_block;
@@ -85,8 +86,8 @@ struct ScatterFunctor<GPUDevice, T, Index, op> {
     CudaLaunchConfig config = GetCudaLaunchConfig(updates_size, d);
     ScatterOpCustomKernel<T, Index, op>
         <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            params.data(), updates.data(), indices.data(),
-            first_dim_size, updates_size, indices_size);
+            params.data(), updates.data(), indices.data(), first_dim_size,
+            updates_size, indices_size);
     return -1;
   }
 };
diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc
index 484932ab0157dee1685b2b90a6c013c11dac061d..3a95dd1773398509e81a514f07fd79f5cb9a0928 100644
--- a/tensorflow/core/kernels/scatter_nd_op.cc
+++ b/tensorflow/core/kernels/scatter_nd_op.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #endif  // GOOGLE_CUDA
 
 #include "tensorflow/core/kernels/scatter_nd_op.h"
+
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -28,6 +29,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/fill_functor.h"
+#include "tensorflow/core/kernels/training_op_helpers.h"
+#include "tensorflow/core/kernels/variable_ops.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
@@ -83,7 +86,10 @@ class ScatterNdUpdateOp : public OpKernel {
     const DataType dt = DataTypeToEnum<T>::v();
     const DataType dt_ref = DataTypeToEnum<T>::ref();
     const DataType index_t = DataTypeToEnum<Index>::v();
-    if (IsRefType(c->input_type(0))) {
+    dtype_ = c->input_type(0);
+    if (c->input_type(0) == DT_RESOURCE) {
+      // TODO(apassos): what to validate here?
+    } else if (IsRefType(c->input_type(0))) {
       OP_REQUIRES_OK(c, c->MatchSignature({dt_ref, index_t, dt}, {dt_ref}));
       OP_REQUIRES_OK(c, c->GetAttr("use_locking", &use_exclusive_lock_));
     } else {
@@ -93,7 +99,16 @@ class ScatterNdUpdateOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* c) override {
-    if (use_exclusive_lock_) {
+    if (dtype_ == DT_RESOURCE) {
+      if (use_exclusive_lock_) {
+        Var* v;
+        OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &v));
+        mutex_lock m(*v->mu());
+        DoCompute(c);
+      } else {
+        DoCompute(c);
+      }
+    } else if (use_exclusive_lock_) {
       // If we're here, it means the input type is a ref.
       DCHECK(IsRefType(c->input_dtype(0)));
       // Hold mutex while we apply updates
@@ -105,6 +120,7 @@ class ScatterNdUpdateOp : public OpKernel {
   }
 
  private:
+  DataType dtype_;
   bool use_exclusive_lock_;
 
   void DoCompute(OpKernelContext* c) {
@@ -113,7 +129,20 @@ class ScatterNdUpdateOp : public OpKernel {
     Tensor params;
     TensorShape params_shape;
 
-    if (IsRefType(c->input_dtype(0))) {
+    if (dtype_ == DT_RESOURCE) {
+      Var* v;
+      OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &v));
+      Tensor* t = v->tensor();
+      if (!use_exclusive_lock_) {
+        // We're not holding the lock in the outer scope so need it here.
+        mutex_lock m(*v->mu());
+        OP_REQUIRES_OK(c, PrepareToUpdateVariable<Device, T>(c, t));
+      } else {
+        OP_REQUIRES_OK(c, PrepareToUpdateVariable<Device, T>(c, t));
+      }
+      params = *t;
+      params_shape = params.shape();
+    } else if (IsRefType(c->input_dtype(0))) {
       params = c->mutable_input(0, use_exclusive_lock_);
       params_shape = params.shape();
       c->forward_ref_input_to_ref_output(0, 0);
@@ -159,6 +188,16 @@ class ScatterNdUpdateOp : public OpKernel {
           .TypeConstraint<index_type>("Tindices"),                           \
       ScatterNdUpdateOp<dev##Device, type, index_type, op>)
 
+#define REGISTER_RESOURCE_SCATTER_ND_UPDATE_KERNEL_INDEX(type, index_type, \
+                                                         dev, name, op)    \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name(name)                                                           \
+          .Device(DEVICE_##dev)                                            \
+          .TypeConstraint<type>("T")                                       \
+          .TypeConstraint<index_type>("Tindices")                          \
+          .HostMemory("ref"),                                              \
+      ScatterNdUpdateOp<dev##Device, type, index_type, op>)
+
 #define REGISTER_SCATTER_ND_KERNEL(type, dev, name)         \
   REGISTER_SCATTER_ND_KERNEL_INDEX(type, int32, dev, name); \
   REGISTER_SCATTER_ND_KERNEL_INDEX(type, int64, dev, name)
@@ -167,6 +206,11 @@ class ScatterNdUpdateOp : public OpKernel {
   REGISTER_SCATTER_ND_UPDATE_KERNEL_INDEX(type, int32, dev, name, op); \
   REGISTER_SCATTER_ND_UPDATE_KERNEL_INDEX(type, int64, dev, name, op)
 
+#define REGISTER_RESOURCE_SCATTER_ND_UPDATE_KERNEL(type, dev, name, op)    \
+  REGISTER_RESOURCE_SCATTER_ND_UPDATE_KERNEL_INDEX(type, int32, dev, name, \
+                                                   op);                    \
+  REGISTER_RESOURCE_SCATTER_ND_UPDATE_KERNEL_INDEX(type, int64, dev, name, op)
+
 #define REGISTER_SCATTER_ND_ADD_SUB(type, dev)                            \
   REGISTER_SCATTER_ND_UPDATE_KERNEL(type, dev, "ScatterNdAdd",            \
                                     scatter_nd_op::UpdateOp::ADD);        \
@@ -178,9 +222,11 @@ class ScatterNdUpdateOp : public OpKernel {
 #define REGISTER_SCATTER_ND(type, dev) \
   REGISTER_SCATTER_ND_KERNEL(type, dev, "ScatterNd");
 
-#define REGISTER_SCATTER_ND_UPDATE(type, dev)                     \
-  REGISTER_SCATTER_ND_UPDATE_KERNEL(type, dev, "ScatterNdUpdate", \
-                                    scatter_nd_op::UpdateOp::ASSIGN);
+#define REGISTER_SCATTER_ND_UPDATE(type, dev)                         \
+  REGISTER_SCATTER_ND_UPDATE_KERNEL(type, dev, "ScatterNdUpdate",     \
+                                    scatter_nd_op::UpdateOp::ASSIGN); \
+  REGISTER_RESOURCE_SCATTER_ND_UPDATE_KERNEL(                         \
+      type, dev, "ResourceScatterNdUpdate", scatter_nd_op::UpdateOp::ASSIGN);
 
 // Registers CPU kernels.
 #define REGISTER_SCATTER_ND_ADD_SUB_CPU(type) \
@@ -281,8 +327,7 @@ Status ValidateUpdateShape(const TensorShape& params_shape,
 }
 
 template <typename Index>
-Status PrepareAndValidateInputs(OpKernelContext* c,
-                                const TensorShape& params_shape,
+Status PrepareAndValidateInputs(const TensorShape& params_shape,
                                 const Tensor& indices, const Tensor& updates,
                                 int64* slice_dim, Index* num_updates,
                                 Index* slice_size) {
@@ -396,7 +441,7 @@ Status DoScatterNd(OpKernelContext* c, const Tensor& indices,
   Index num_updates;
   Index slice_size;
   TF_RETURN_IF_ERROR(PrepareAndValidateInputs<Index>(
-      c, shape, indices, updates, &slice_dim, &num_updates, &slice_size));
+      shape, indices, updates, &slice_dim, &num_updates, &slice_size));
 
   IndexFlattener<Device, Index> index_flattener;
   auto indices_flat = index_flattener(c, indices);
@@ -442,6 +487,8 @@ Status DoScatterNd(OpKernelContext* c, const Tensor& indices,
       PARAMS_CASE(3);
       PARAMS_CASE(4);
       PARAMS_CASE(5);
+      PARAMS_CASE(6);
+      PARAMS_CASE(7);
 #undef PARAMS_CASE
       default:
         return errors::InvalidArgument(
@@ -480,7 +527,9 @@ namespace functor {
   DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 2); \
   DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 3); \
   DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 4); \
-  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 5);
+  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 5); \
+  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 6); \
+  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 7);
 
 #define DECLARE_GPU_SPECS_INDEX(T, Index)                                \
   DECLARE_GPU_SPECS_INDEX_OP(T, Index, scatter_nd_op::UpdateOp::ASSIGN); \
diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
index cffc326174b274e5e42ee5676a6addad7d7c9203..e82660dcc1dcf9dbb7d531c0223e211ce46a8635 100644
--- a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
+++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SCATTER_ND_OP_CPU_IMPL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SCATTER_ND_OP_CPU_IMPL_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SCATTER_ND_OP_CPU_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_SCATTER_ND_OP_CPU_IMPL_H_
 
 // Functor definitions for ScatterND ops, must be compilable by nvcc.
 
@@ -40,7 +40,7 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class OpKernelContext;
 
@@ -251,10 +251,10 @@ REGISTER_SCATTER_ND_MATH_SYCL(int32);
 #undef REGISTER_SCATTER_ND_INDEX_SYCL
 #undef REGISTER_SCATTER_ND_FULL_SYCL
 
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SCATTER_ND_OP_CPU_IMPL_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SCATTER_ND_OP_CPU_IMPL_H_
diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl_6.cc b/tensorflow/core/kernels/scatter_nd_op_cpu_impl_6.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d98412e2551b5eacb9190838b922cadd26d7aaf2
--- /dev/null
+++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl_6.cc
@@ -0,0 +1,18 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define CPU_PROVIDED_IXDIM 6
+#include "tensorflow/core/kernels/scatter_nd_op_cpu_impl.h"
+#undef CPU_PROVIDED_IXDIM
diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl_7.cc b/tensorflow/core/kernels/scatter_nd_op_cpu_impl_7.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a008b55603c060953015a463cf49f5768bde637a
--- /dev/null
+++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl_7.cc
@@ -0,0 +1,19 @@
+
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define CPU_PROVIDED_IXDIM 7
+#include "tensorflow/core/kernels/scatter_nd_op_cpu_impl.h"
+#undef CPU_PROVIDED_IXDIM
diff --git a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
index 0eb3cf32dd33705cffe4c37dbe91eb0ffc31563a..a3c21edc15f684e51c7f1806aeeeeead679ea22e 100644
--- a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
@@ -55,6 +55,27 @@ struct LeftUpdate<T, scatter_nd_op::UpdateOp::SUB> {
   }
 };
 
+// Specializations for std::complex, updating real and imaginary part
+// individually. Even though this is not an atomic op anymore, it is safe
+// because there is only one type of op per kernel.
+template <typename T>
+struct LeftUpdate<std::complex<T>, scatter_nd_op::UpdateOp::ADD> {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void operator()(
+      std::complex<T>* out, const std::complex<T>& val) {
+    T* ptr = reinterpret_cast<T*>(out);
+    CudaAtomicAdd(ptr, val.real());
+    CudaAtomicAdd(ptr, val.imag());
+  }
+};
+
+template <typename T>
+struct LeftUpdate<std::complex<T>, scatter_nd_op::UpdateOp::SUB> {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void operator()(
+      std::complex<T>* out, const std::complex<T>& val) {
+    LeftUpdate<std::complex<T>, scatter_nd_op::UpdateOp::ADD>()(out, -val);
+  }
+};
+
 }  // namespace
 
 template <typename T, typename Index, scatter_nd_op::UpdateOp op, int IXDIM>
@@ -136,7 +157,9 @@ struct ScatterNdFunctor<GPUDevice, T, Index, op, IXDIM> {
   DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 2); \
   DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 3); \
   DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 4); \
-  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 5);
+  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 5); \
+  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 6); \
+  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 7);
 
 #define DECLARE_GPU_SPECS_INDEX(T, Index)                                \
   DECLARE_GPU_SPECS_INDEX_OP(T, Index, scatter_nd_op::UpdateOp::ASSIGN); \
diff --git a/tensorflow/core/kernels/scatter_op.cc b/tensorflow/core/kernels/scatter_op.cc
index 8607c7f95af79c8f581768cfc698bad9fe085188..282165349f316144d261859d5a3a992f047e0df3 100644
--- a/tensorflow/core/kernels/scatter_op.cc
+++ b/tensorflow/core/kernels/scatter_op.cc
@@ -25,7 +25,7 @@ limitations under the License.
 
 #ifdef TENSORFLOW_USE_SYCL
 #include "tensorflow/core/common_runtime/sycl/sycl_util.h"
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace tensorflow {
 
@@ -33,7 +33,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Check whether updates.shape = indices.shape + params.shape[1:]
 static bool ValidShapes(const Tensor& params, const Tensor& updates,
@@ -102,11 +102,12 @@ class ScatterUpdateOp : public OpKernel {
 
     // Check that we have enough index space
     const int64 N_big = indices.NumElements();
-    OP_REQUIRES(c, N_big <= std::numeric_limits<Index>::max(),
-                errors::InvalidArgument(
-                    "indices has too many elements for ",
-                    DataTypeString(DataTypeToEnum<Index>::v()), " indexing: ",
-                    N_big, " > ", std::numeric_limits<Index>::max()));
+    OP_REQUIRES(
+        c, N_big <= std::numeric_limits<Index>::max(),
+        errors::InvalidArgument("indices has too many elements for ",
+                                DataTypeString(DataTypeToEnum<Index>::v()),
+                                " indexing: ", N_big, " > ",
+                                std::numeric_limits<Index>::max()));
     const Index N = static_cast<Index>(indices.NumElements());
     OP_REQUIRES(
         c, params.dim_size(0) <= std::numeric_limits<Index>::max(),
@@ -137,7 +138,7 @@ class ScatterUpdateOp : public OpKernel {
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T, typename Index, scatter_op::UpdateOp op>
-class ScatterUpdateOp <SYCLDevice, T, Index, op> : public OpKernel {
+class ScatterUpdateOp<SYCLDevice, T, Index, op> : public OpKernel {
  public:
   explicit ScatterUpdateOp(OpKernelConstruction* c) : OpKernel(c) {
     OP_REQUIRES_OK(c, c->GetAttr("use_locking", &use_exclusive_lock_));
@@ -165,11 +166,12 @@ class ScatterUpdateOp <SYCLDevice, T, Index, op> : public OpKernel {
 
     // Check that we have enough index space
     const int64 N_big = indices.NumElements();
-    OP_REQUIRES(c, N_big <= std::numeric_limits<Index>::max(),
-                errors::InvalidArgument(
-                    "indices has too many elements for ",
-                    DataTypeString(DataTypeToEnum<Index>::v()), " indexing: ",
-                    N_big, " > ", std::numeric_limits<Index>::max()));
+    OP_REQUIRES(
+        c, N_big <= std::numeric_limits<Index>::max(),
+        errors::InvalidArgument("indices has too many elements for ",
+                                DataTypeString(DataTypeToEnum<Index>::v()),
+                                " indexing: ", N_big, " > ",
+                                std::numeric_limits<Index>::max()));
     const Index N = static_cast<Index>(indices.NumElements());
     OP_REQUIRES(
         c, params.dim_size(0) <= std::numeric_limits<Index>::max(),
@@ -206,7 +208,7 @@ class ScatterUpdateOp <SYCLDevice, T, Index, op> : public OpKernel {
     }
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define REGISTER_SCATTER_KERNEL_INDEX(type, index_type, dev, name, op) \
   REGISTER_KERNEL_BUILDER(Name(name)                                   \
diff --git a/tensorflow/core/kernels/sdca_internal.cc b/tensorflow/core/kernels/sdca_internal.cc
index 5042cfafc0ebd942508df92c25d3720a8a7f1b72..066a4b80a2bc6976a6c95ced2c5efecbef13eeba 100644
--- a/tensorflow/core/kernels/sdca_internal.cc
+++ b/tensorflow/core/kernels/sdca_internal.cc
@@ -21,10 +21,10 @@ limitations under the License.
 #include <random>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
 
 namespace tensorflow {
-
 namespace sdca {
 
 using UnalignedFloatVector = TTypes<const float>::UnalignedConstVec;
@@ -37,9 +37,8 @@ void FeatureWeightsDenseStorage::UpdateDenseDeltaWeights(
   const size_t num_weight_vectors = normalized_bounded_dual_delta.size();
   if (num_weight_vectors == 1) {
     deltas_.device(device) =
-        deltas_ +
-        dense_vector.RowAsMatrix() *
-            deltas_.constant(normalized_bounded_dual_delta[0]);
+        deltas_ + dense_vector.RowAsMatrix() *
+                      deltas_.constant(normalized_bounded_dual_delta[0]);
   } else {
     // Transform the dual vector into a column matrix.
     const Eigen::TensorMap<Eigen::Tensor<const double, 2, Eigen::RowMajor>>
@@ -61,9 +60,8 @@ void FeatureWeightsSparseStorage::UpdateSparseDeltaWeights(
     const Example::SparseFeatures& sparse_features,
     const std::vector<double>& normalized_bounded_dual_delta) {
   for (int64 k = 0; k < sparse_features.indices->size(); ++k) {
-    const double feature_value = sparse_features.values == nullptr
-                                     ? 1.0
-                                     : (*sparse_features.values)(k);
+    const double feature_value =
+        sparse_features.values == nullptr ? 1.0 : (*sparse_features.values)(k);
     auto it = indices_to_id_.find((*sparse_features.indices)(k));
     for (size_t l = 0; l < normalized_bounded_dual_delta.size(); ++l) {
       deltas_(l, it->second) +=
@@ -122,23 +120,24 @@ Status ModelWeights::Initialize(OpKernelContext* const context) {
   }
 
   // Reads in the weights, and allocates and initializes the delta weights.
-  const auto initialize_weights = [&](
-      const OpInputList& weight_inputs, OpOutputList* const weight_outputs,
-      std::vector<FeatureWeightsDenseStorage>* const feature_weights) {
-    for (int i = 0; i < weight_inputs.size(); ++i) {
-      Tensor* delta_t;
-      TF_RETURN_IF_ERROR(
-          weight_outputs->allocate(i, weight_inputs[i].shape(), &delta_t));
-      // Convert the input vector to a row matrix in internal representation.
-      auto deltas = delta_t->shaped<float, 2>({1, delta_t->NumElements()});
-      deltas.setZero();
-      feature_weights->emplace_back(
-          FeatureWeightsDenseStorage{weight_inputs[i].shaped<float, 2>(
-                                         {1, weight_inputs[i].NumElements()}),
-                                     deltas});
-    }
-    return Status::OK();
-  };
+  const auto initialize_weights =
+      [&](const OpInputList& weight_inputs, OpOutputList* const weight_outputs,
+          std::vector<FeatureWeightsDenseStorage>* const feature_weights) {
+        for (int i = 0; i < weight_inputs.size(); ++i) {
+          Tensor* delta_t;
+          TF_RETURN_IF_ERROR(
+              weight_outputs->allocate(i, weight_inputs[i].shape(), &delta_t));
+          // Convert the input vector to a row matrix in internal
+          // representation.
+          auto deltas = delta_t->shaped<float, 2>({1, delta_t->NumElements()});
+          deltas.setZero();
+          feature_weights->emplace_back(FeatureWeightsDenseStorage{
+              weight_inputs[i].shaped<float, 2>(
+                  {1, weight_inputs[i].NumElements()}),
+              deltas});
+        }
+        return Status::OK();
+      };
 
   return initialize_weights(dense_weights_inputs, &dense_weights_outputs,
                             &dense_weights_);
@@ -278,7 +277,7 @@ Status Examples::SampleAdaptativeProbabilities(
   int num_retries = 0;
   while (id < num_examples() && num_retries < num_examples()) {
     int picked_id = sampler.Sample(&random);
-    if (dis(gen) > std::pow(0.1, sampled_count_[picked_id])) {
+    if (dis(gen) > MathUtil::IPow(0.1, sampled_count_[picked_id])) {
       num_retries++;
       continue;
     }
@@ -520,5 +519,4 @@ void Examples::ComputeSquaredNormPerExample(
 }
 
 }  // namespace sdca
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/sdca_internal.h b/tensorflow/core/kernels/sdca_internal.h
index 9f072700754320700024be57ebe3c4ca780a1ae9..45915693ac6f0b4ad2d5f2aacebcd4aa34c03439 100644
--- a/tensorflow/core/kernels/sdca_internal.h
+++ b/tensorflow/core/kernels/sdca_internal.h
@@ -149,7 +149,8 @@ class Example {
   // 1.0f.
   struct SparseFeatures {
     std::unique_ptr<TTypes<const int64>::UnalignedConstVec> indices;
-    std::unique_ptr<TTypes<const float>::UnalignedConstVec> values;  // nullptr encodes optional.
+    std::unique_ptr<TTypes<const float>::UnalignedConstVec>
+        values;  // nullptr encodes optional.
   };
 
   // A dense vector which is a row-slice of the underlying matrix.
diff --git a/tensorflow/core/kernels/sdca_ops.cc b/tensorflow/core/kernels/sdca_ops.cc
index 0f5c2424b38aeed5912287bba7a218575a107073..dbe0177dda337a271433cd3bb4257026dc702364 100644
--- a/tensorflow/core/kernels/sdca_ops.cc
+++ b/tensorflow/core/kernels/sdca_ops.cc
@@ -57,11 +57,11 @@ namespace tensorflow {
 
 namespace {
 
-using sdca::Regularizations;
 using sdca::Example;
 using sdca::Examples;
 using sdca::ExampleStatistics;
 using sdca::ModelWeights;
+using sdca::Regularizations;
 
 struct ComputeOptions {
   explicit ComputeOptions(OpKernelConstruction* const context) {
@@ -76,8 +76,9 @@ struct ComputeOptions {
     } else if (loss_type == "smooth_hinge_loss") {
       loss_updater.reset(new SmoothHingeLossUpdater);
     } else {
-      OP_REQUIRES(context, false, errors::InvalidArgument(
-                                      "Unsupported loss type: ", loss_type));
+      OP_REQUIRES(
+          context, false,
+          errors::InvalidArgument("Unsupported loss type: ", loss_type));
     }
     OP_REQUIRES_OK(context, context->GetAttr("adaptative", &adaptative));
     OP_REQUIRES_OK(
@@ -90,9 +91,10 @@ struct ComputeOptions {
         context, num_sparse_features + num_dense_features > 0,
         errors::InvalidArgument("Requires at least one feature to train."));
 
-    OP_REQUIRES(context, static_cast<int64>(num_sparse_features) +
-                                 static_cast<int64>(num_dense_features) <=
-                             std::numeric_limits<int>::max(),
+    OP_REQUIRES(context,
+                static_cast<int64>(num_sparse_features) +
+                        static_cast<int64>(num_dense_features) <=
+                    std::numeric_limits<int>::max(),
                 errors::InvalidArgument(
                     strings::Printf("Too many feature groups: %lld > %d",
                                     static_cast<int64>(num_sparse_features) +
diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc
index 2334e50f1dcb08e9662615e83d721f8b08568102..6c4685a50a4139b9f33d22b409059f7c03fa2812 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops.cc
@@ -20,10 +20,10 @@ limitations under the License.
 #define EIGEN_USE_GPU
 #endif  // GOOGLE_CUDA
 
-#include "tensorflow/core/kernels/segment_reduction_ops.h"
-#include <vector>
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/segment_reduction_ops.h"
+#include <vector>
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -115,7 +115,7 @@ class SegmentReductionOp : public OpKernel {
     Eigen::DSizes<Eigen::DenseIndex, 1> dims_to_reduce;
     dims_to_reduce[0] = 0;
 #else
-    Eigen::IndexList<Eigen::type2index<0>> dims_to_reduce;
+    Eigen::IndexList<Eigen::type2index<0> > dims_to_reduce;
 #endif
     Index start = 0, end = 1;
 
@@ -356,158 +356,180 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_SORTED_KERNELS_ALL);
 #undef REGISTER_GPU_SORTED_KERNELS_ALL
 #endif  // GOOGLE_CUDA
 
+// ____________________________________________________________________________
+// Unsorted segment reduction ops.
+
 namespace functor {
 
-// UnsortedSegmentSumFunctor implementation for CPUDevice.
-// todo: Remove duplicate code in UnsortedSegmentSumFunctor and UnsortedSegmentMaxFunctor.
-template <typename T, typename Index>
-struct UnsortedSegmentSumFunctor<CPUDevice, T, Index>
-    : UnsortedSegmentBaseFunctor<CPUDevice, T, Index> {
-  void operator()(OpKernelContext* ctx, const CPUDevice& d,
-                  const Index output_rows, const TensorShape& segment_ids_shape,
+// The ReductionFunctor implementation for CPU.
+template <typename T, typename Index, typename InitialValueF,
+          typename ReductionF>
+struct UnsortedSegmentFunctor<CPUDevice, T, Index, InitialValueF, ReductionF> {
+  void operator()(OpKernelContext* ctx, const Index num_segments,
+                  const TensorShape& segment_ids_shape,
                   typename TTypes<Index>::ConstFlat segment_ids,
                   const Index data_size, const T* data,
-                  typename TTypes<T, 2>::Tensor output) override {
-    output.setZero();
+                  typename TTypes<T, 2>::Tensor output) {
+    output.setConstant(InitialValueF()());
     if (data_size == 0) {
       return;
     }
     const int64 N = segment_ids.dimension(0);
+    ReductionF reduction;
     auto data_flat = typename TTypes<T, 2>::ConstTensor(data, N, data_size / N);
     for (int64 i = 0; i < N; ++i) {
       Index j = internal::SubtleMustCopy(segment_ids(i));
       if (j < 0) {
         continue;
       }
-      OP_REQUIRES(ctx, FastBoundsCheck(j, output_rows),
+      OP_REQUIRES(ctx, FastBoundsCheck(j, num_segments),
                   errors::InvalidArgument(
                       "segment_ids", SliceDebugString(segment_ids_shape, i),
-                      " = ", j, " is out of range [0, ", output_rows, ")"));
-      output.template chip<0>(j) += data_flat.template chip<0>(i);
+                      " = ", j, " is out of range [0, ", num_segments, ")"));
+      reduction(data_flat.template chip<0>(i), output.template chip<0>(j));
     }
   }
 };
-// UnsortedSegmentMaxFunctor implementation for CPUDevice.
-template <typename T, typename Index>
-struct UnsortedSegmentMaxFunctor<CPUDevice, T, Index>
-    : UnsortedSegmentBaseFunctor<CPUDevice, T, Index> {
-  void operator()(OpKernelContext* ctx, const CPUDevice& d,
-                  const Index output_rows, const TensorShape& segment_ids_shape,
-                  typename TTypes<Index>::ConstFlat segment_ids,
-                  const Index data_size, const T* data,
-                  typename TTypes<T, 2>::Tensor output) override {
-    output.setConstant(std::numeric_limits<T>::lowest());
-    if (data_size == 0) {
-      return;
-    }
-    const int64 N = segment_ids.dimension(0);
-    auto data_flat = typename TTypes<T, 2>::ConstTensor(data, N, data_size / N);
-    for (int64 i = 0; i < N; ++i) {
-      Index j = internal::SubtleMustCopy(segment_ids(i));
-      OP_REQUIRES(ctx, FastBoundsCheck(j, output_rows),
-                  errors::InvalidArgument(
-                      "segment_ids", SliceDebugString(segment_ids_shape, i),
-                      " = ", j, " is out of range [0, ", output_rows, ")"));
-      output.template chip<0>(j) =
-          data_flat.template chip<0>(i).cwiseMax(output.template chip<0>(j));
-    }
+
+template <typename T>
+using MatrixChip = Eigen::TensorChippingOp<0l, typename TTypes<T, 2>::Matrix>;
+
+template <typename T>
+using constMatrixChip =
+    Eigen::TensorChippingOp<0l, const typename TTypes<T, 2>::ConstMatrix>;
+
+// reduction functors
+template <typename T>
+struct SumOp {
+  void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
+    output += data;
+  }
+};
+
+template <typename T>
+struct MaxOp {
+  void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
+    output = data.cwiseMax(output);
+  }
+};
+
+template <typename T>
+struct MinOp {
+  void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
+    output = data.cwiseMin(output);
+  }
+};
+
+template <typename T>
+struct ProdOp {
+  void operator()(const constMatrixChip<T> data, MatrixChip<T> output) {
+    output *= data;
   }
 };
 }  // namespace functor
 
-// Base class for SegmentReductionOps that can handle unsorted segment
-// definitions
-// and specifying the size of the output in addition to a reduction function
-template <typename Device, class T, class Index>
-class UnsortedSegmentBaseOp : public OpKernel {
+// Static check routines not in the templated class to reduce code size
+static void UnsortedSegmentReductionValidation(OpKernel* op_kernel,
+                                               OpKernelContext* context,
+                                               const Tensor& data,
+                                               const Tensor& segment_ids,
+                                               const Tensor& num_segments) {
+  OP_REQUIRES(
+      context, op_kernel->IsLegacyScalar(num_segments.shape()),
+      errors::InvalidArgument("num_segments should be a scalar, not shape ",
+                              num_segments.shape().DebugString()));
+  OP_REQUIRES(
+      context, TensorShapeUtils::StartsWith(data.shape(), segment_ids.shape()),
+      errors::InvalidArgument("data.shape = ", data.shape().DebugString(),
+                              " does not start with segment_ids.shape = ",
+                              segment_ids.shape().DebugString()));
+}
+
+static bool UnsortedSegmentReductionDoValidation(OpKernel* op_kernel,
+                                                 OpKernelContext* context,
+                                                 const Tensor& data,
+                                                 const Tensor& segment_ids,
+                                                 const Tensor& num_segments) {
+  UnsortedSegmentReductionValidation(op_kernel, context, data, segment_ids,
+                                     num_segments);
+  return context->status().ok();
+}
+
+// The UnsortedSegmentReduction OpKernel. The DeviceReductionFunctor
+// is the device specific implementation of the reduction. These device
+// specific implementations are templated themselves with the corresponding
+// initial value functors and reduction functors.
+template <typename T, typename Index, typename DeviceReductionFunctor>
+class UnsortedSegmentReductionOp : public OpKernel {
  public:
-  explicit UnsortedSegmentBaseOp(
-      OpKernelConstruction* context,
-      functor::UnsortedSegmentBaseFunctor<Device, T, Index>& functor)
-      : OpKernel(context), reduction_functor_(functor) {}
+  explicit UnsortedSegmentReductionOp(OpKernelConstruction* context)
+      : OpKernel(context), reduction_functor_(DeviceReductionFunctor()) {}
 
   void Compute(OpKernelContext* context) override {
     const Tensor& data = context->input(0);
     const Tensor& segment_ids = context->input(1);
     const Tensor& num_segments = context->input(2);
-
-    OP_REQUIRES(
-        context, IsLegacyScalar(num_segments.shape()),
-        errors::InvalidArgument("num_segments should be a scalar, not shape ",
-                                num_segments.shape().DebugString()));
-    OP_REQUIRES(
-        context,
-        TensorShapeUtils::StartsWith(data.shape(), segment_ids.shape()),
-        errors::InvalidArgument("data.shape = ", data.shape().DebugString(),
-                                " does not start with segment_ids.shape = ",
-                                segment_ids.shape().DebugString()));
-
+    if (!UnsortedSegmentReductionDoValidation(this, context, data, segment_ids,
+                                              num_segments)) {
+      return;
+    }
     const auto segment_flat = segment_ids.flat<Index>();
     const Index output_rows =
         internal::SubtleMustCopy(num_segments.scalar<int32>()());
     OP_REQUIRES(context, output_rows >= 0,
                 errors::InvalidArgument("Input num_segments == ", output_rows,
                                         " must not be negative."));
-
     TensorShape output_shape;
     output_shape.AddDim(output_rows);
     for (int i = segment_ids.dims(); i < data.dims(); i++) {
       output_shape.AddDim(data.dim_size(i));
     }
-
     Tensor* output = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
     auto output_flat = output->flat_outer_dims<T>();
-
     auto data_ptr = data.template flat<T>().data();
-    reduction_functor_(context, context->template eigen_device<Device>(),
-                     output_rows, segment_ids.shape(), segment_flat,
-                     data.NumElements(), data_ptr, output_flat);
+    reduction_functor_(context, output_rows, segment_ids.shape(), segment_flat,
+                       data.NumElements(), data_ptr, output_flat);
   }
- private:
-  functor::UnsortedSegmentBaseFunctor<Device, T, Index>& reduction_functor_;
-};
 
-template <typename Device, class T, class Index>
-class UnsortedSegmentSumOp : public UnsortedSegmentBaseOp<Device, T, Index> {
- public:
-  explicit UnsortedSegmentSumOp(OpKernelConstruction* context)
-      : UnsortedSegmentBaseOp<Device, T, Index>(
-            context,
-            sum_functor_) {}
- private:
-    functor::UnsortedSegmentSumFunctor<Device, T, Index> sum_functor_;
+ protected:
+  DeviceReductionFunctor reduction_functor_;
 };
 
-template <typename Device, class T, class Index>
-class UnsortedSegmentMaxOp : public UnsortedSegmentBaseOp<Device, T, Index> {
- public:
-  explicit UnsortedSegmentMaxOp(OpKernelConstruction* context)
-      : UnsortedSegmentBaseOp<Device, T, Index>(
-            context,
-            max_functor_) {}
- private:
-    functor::UnsortedSegmentMaxFunctor<Device, T, Index> max_functor_;
-};
-
-#define REGISTER_REAL_CPU_UNSORTED_KERNELS(type, index_type)                  \
-  REGISTER_KERNEL_BUILDER(Name("UnsortedSegmentSum")                          \
-                              .Device(DEVICE_CPU)                             \
-                              .TypeConstraint<type>("T")                      \
-                              .TypeConstraint<index_type>("Tindices"),        \
-                          UnsortedSegmentSumOp<CPUDevice, type, index_type>); \
-  REGISTER_KERNEL_BUILDER(Name("UnsortedSegmentMax")                          \
-                              .Device(DEVICE_CPU)                             \
-                              .TypeConstraint<type>("T")                      \
-                              .TypeConstraint<index_type>("Tindices"),        \
-                          UnsortedSegmentMaxOp<CPUDevice, type, index_type>);
-
-#define REGISTER_COMPLEX_CPU_UNSORTED_KERNELS(type, index_type)        \
-  REGISTER_KERNEL_BUILDER(Name("UnsortedSegmentSum")                   \
-                              .Device(DEVICE_CPU)                      \
-                              .TypeConstraint<type>("T")               \
-                              .TypeConstraint<index_type>("Tindices"), \
-                          UnsortedSegmentSumOp<CPUDevice, type, index_type>);
+#define REGISTER_CPU_KERNEL_UNSORTEDSEGMENT(                           \
+    name, type, index_type, initial_value_functor, reduction_functor)  \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name(name)                                                       \
+          .Device(DEVICE_CPU)                                          \
+          .TypeConstraint<type>("T")                                   \
+          .TypeConstraint<index_type>("Tindices"),                     \
+      UnsortedSegmentReductionOp<                                      \
+          type, index_type,                                            \
+          functor::UnsortedSegmentFunctor<CPUDevice, type, index_type, \
+                                          initial_value_functor,       \
+                                          reduction_functor> >)
+
+#define REGISTER_REAL_CPU_UNSORTED_KERNELS(type, index_type)                   \
+  REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentSum", type, index_type,  \
+                                      functor::Zero<type>,                     \
+                                      functor::SumOp<type>);                   \
+  REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentMax", type, index_type,  \
+                                      functor::Lowest<type>,                   \
+                                      functor::MaxOp<type>);                   \
+  REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentMin", type, index_type,  \
+                                      functor::Highest<type>,                  \
+                                      functor::MinOp<type>);                   \
+  REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentProd", type, index_type, \
+                                      functor::One<type>,                      \
+                                      functor::ProdOp<type>);
+
+#define REGISTER_COMPLEX_CPU_UNSORTED_KERNELS(type, index_type)                \
+  REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentSum", type, index_type,  \
+                                      functor::Zero<type>,                     \
+                                      functor::SumOp<type>);                   \
+  REGISTER_CPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentProd", type, index_type, \
+                                      functor::One<type>,                      \
+                                      functor::ProdOp<type>)
 
 #define REGISTER_REAL_CPU_UNSORTED_KERNELS_ALL(type) \
   REGISTER_REAL_CPU_UNSORTED_KERNELS(type, int32);   \
@@ -520,31 +542,72 @@ class UnsortedSegmentMaxOp : public UnsortedSegmentBaseOp<Device, T, Index> {
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_REAL_CPU_UNSORTED_KERNELS_ALL);
 REGISTER_COMPLEX_CPU_UNSORTED_KERNELS_ALL(complex64);
 REGISTER_COMPLEX_CPU_UNSORTED_KERNELS_ALL(complex128);
+
 #undef REGISTER_REAL_CPU_UNSORTED_KERNELS
+#undef REGISTER_CPU_KERNEL_UNSORTEDSEGMENT
 #undef REGISTER_COMPLEX_CPU_UNSORTED_KERNELS
 #undef REGISTER_COMPLEX_CPU_UNSORTED_KERNELS_ALL
 #undef REGISTER_REAL_CPU_UNSORTED_KERNELS_ALL
 
 #if GOOGLE_CUDA
-#define REGISTER_GPU_UNSORTED_KERNELS(type, index_type)                \
-  REGISTER_KERNEL_BUILDER(Name("UnsortedSegmentSum")                   \
-                              .Device(DEVICE_GPU)                      \
-                              .HostMemory("num_segments")              \
-                              .TypeConstraint<type>("T")               \
-                              .TypeConstraint<index_type>("Tindices"), \
-                          UnsortedSegmentSumOp<GPUDevice, type, index_type>);
-
-#define REGISTER_GPU_UNSORTED_KERNELS_ALL(type) \
-  REGISTER_GPU_UNSORTED_KERNELS(type, int32);   \
-  REGISTER_GPU_UNSORTED_KERNELS(type, int64);
+#define REGISTER_GPU_KERNEL_UNSORTEDSEGMENT(                                 \
+    name, type, index_type, initial_value_functor, reduction_kernel_functor) \
+  REGISTER_KERNEL_BUILDER(                                                   \
+      Name(name)                                                             \
+          .Device(DEVICE_GPU)                                                \
+          .HostMemory("num_segments")                                        \
+          .TypeConstraint<type>("T")                                         \
+          .TypeConstraint<index_type>("Tindices"),                           \
+      UnsortedSegmentReductionOp<                                            \
+          type, index_type,                                                  \
+          functor::UnsortedSegmentFunctor<GPUDevice, type, index_type,       \
+                                          initial_value_functor,             \
+                                          reduction_kernel_functor> >)
+
+// sum is the only op that supports all input types currently
+#define REGISTER_REAL_GPU_UNSORTED_KERNELS(type, index_type)                   \
+  REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentMax", type, index_type,  \
+                                      functor::Lowest<type>,                   \
+                                      functor::MaxOpGpu<type>);                \
+  REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentMin", type, index_type,  \
+                                      functor::Highest<type>,                  \
+                                      functor::MinOpGpu<type>);                \
+  REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentProd", type, index_type, \
+                                      functor::One<type>,                      \
+                                      functor::ProdOpGpu<type>);
+
+#define REGISTER_SUM_GPU_UNSORTED_KERNELS(type, index_type)                   \
+  REGISTER_GPU_KERNEL_UNSORTEDSEGMENT("UnsortedSegmentSum", type, index_type, \
+                                      functor::Zero<type>,                    \
+                                      functor::SumOpGpu<type>);
+
+#define REGISTER_REAL_GPU_UNSORTED_KERNELS_ALL(type) \
+  REGISTER_REAL_GPU_UNSORTED_KERNELS(type, int32);   \
+  REGISTER_REAL_GPU_UNSORTED_KERNELS(type, int64);
+
+#define REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL(type) \
+  REGISTER_SUM_GPU_UNSORTED_KERNELS(type, int32);   \
+  REGISTER_SUM_GPU_UNSORTED_KERNELS(type, int64);
+
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_REAL_GPU_UNSORTED_KERNELS_ALL);
+TF_CALL_int32(REGISTER_REAL_GPU_UNSORTED_KERNELS_ALL);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL);
+TF_CALL_int32(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL);
+TF_CALL_complex64(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL);
+TF_CALL_complex128(REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL);
+
+#undef REGISTER_GPU_KERNEL_UNSORTEDSEGMENT
+#undef REGISTER_REAL_GPU_UNSORTED_KERNELS
+#undef REGISTER_SUM_GPU_UNSORTED_KERNELS
+#undef REGISTER_REAL_GPU_UNSORTED_KERNELS_ALL
+#undef REGISTER_SUM_GPU_UNSORTED_KERNELS_ALL
 
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_UNSORTED_KERNELS_ALL);
-TF_CALL_complex64(REGISTER_GPU_UNSORTED_KERNELS_ALL);
-TF_CALL_complex128(REGISTER_GPU_UNSORTED_KERNELS_ALL);
-#undef REGISTER_GPU_UNSORTED_KERNELS
-#undef REGISTER_GPU_UNSORTED_KERNELS_ALL
 #endif  // GOOGLE_CUDA
 
+// ____________________________________________________________________________
+// Sparse segment reduction ops.
+
 // Same as SegmentReductionOp but takes as input a "sparse" tensor, represented
 // by two dense tensors, one containing the data, and the other containing
 // indices into the data.
@@ -553,10 +616,11 @@ class SparseSegmentReductionOpBase : public OpKernel {
  public:
   explicit SparseSegmentReductionOpBase(OpKernelConstruction* context,
                                         bool is_mean, bool is_sqrtn,
-                                        T default_value)
+                                        bool has_num_segments, T default_value)
       : OpKernel(context),
         is_mean_(is_mean),
         is_sqrtn_(is_sqrtn),
+        has_num_segments_(has_num_segments),
         default_value_(default_value) {}
 
   void Compute(OpKernelContext* context) override {
@@ -564,6 +628,19 @@ class SparseSegmentReductionOpBase : public OpKernel {
     const Tensor& indices = context->input(1);
     const Tensor& segment_ids = context->input(2);
 
+    Index output_rows = -1;
+    if (has_num_segments_) {
+      const Tensor& num_segments = context->input(3);
+
+      OP_REQUIRES(
+          context, num_segments.shape().dims() == 0,
+          errors::InvalidArgument("num_segments should be a scalar, not shape ",
+                                  num_segments.shape().DebugString()));
+      output_rows = internal::SubtleMustCopy(num_segments.scalar<int32>()());
+      OP_REQUIRES(context, output_rows >= 0,
+                  errors::InvalidArgument("segment ids must be >= 0"));
+    }
+
     OP_REQUIRES(context, TensorShapeUtils::IsVector(indices.shape()),
                 errors::InvalidArgument("indices should be a vector."));
     OP_REQUIRES(context, TensorShapeUtils::IsVector(segment_ids.shape()),
@@ -581,10 +658,17 @@ class SparseSegmentReductionOpBase : public OpKernel {
     const auto segment_vec = segment_ids.vec<OutputRow>();
     // Note that the current implementation assumes that segment_vec values are
     // sorted.
-    const OutputRow output_rows =
+    const OutputRow last_segment_id_plus_one =
         num_indices > 0
             ? internal::SubtleMustCopy(segment_vec(num_indices - 1)) + 1
             : 0;
+    if (has_num_segments_) {
+      OP_REQUIRES(
+          context, output_rows >= last_segment_id_plus_one,
+          errors::InvalidArgument("segment ids must be < num_segments"));
+    } else {
+      output_rows = last_segment_id_plus_one;
+    }
     OP_REQUIRES(context, output_rows >= 0,
                 errors::InvalidArgument("segment ids must be >= 0"));
 
@@ -642,15 +726,24 @@ class SparseSegmentReductionOpBase : public OpKernel {
           Reduce(input_flat, indices_vec, start, end - start, out);
       OP_REQUIRES(context, bad_offset < 0,
                   errors::InvalidArgument(
-                      "Bad: indices[", start + bad_offset, "] == ",
-                      indices_vec(start + bad_offset), " out of range [0, ",
-                      input_flat.dimension(0), ")"));
+                      "Bad: indices[", start + bad_offset,
+                      "] == ", indices_vec(start + bad_offset),
+                      " out of range [0, ", input_flat.dimension(0), ")"));
 
-      if (end >= num_indices) break;
       start = end;
       ++end;
       uninitialized_index = out_index + 1;
       out_index = next_index;
+      if (end > num_indices) break;
+    }
+
+    // Fill the gap at the end with the default value.
+    if (uninitialized_index < output_rows) {
+      Eigen::DSizes<Eigen::DenseIndex, 2> gap_slice_shape(
+          output_rows - uninitialized_index, num_col);
+      Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, Eigen::Unaligned>
+          gap_slice(&output_flat(uninitialized_index, 0), gap_slice_shape);
+      gap_slice.setConstant(default_value_);
     }
   }
 
@@ -786,6 +879,7 @@ class SparseSegmentReductionOpBase : public OpKernel {
 
   const bool is_mean_;
   const bool is_sqrtn_;
+  const bool has_num_segments_;
   const T default_value_;
 };
 
@@ -794,9 +888,20 @@ class SparseSegmentReductionMeanOp
     : public SparseSegmentReductionOpBase<Device, T> {
  public:
   explicit SparseSegmentReductionMeanOp(OpKernelConstruction* context)
-      : SparseSegmentReductionOpBase<Device, T>(context, true /*is_mean*/,
-                                                false /*is_sqrtn*/,
-                                                T(0) /* default_value */) {}
+      : SparseSegmentReductionOpBase<Device, T>(
+            context, true /*is_mean*/, false /*is_sqrtn*/,
+            false /* has_num_segments */, T(0) /* default_value */) {}
+};
+
+template <typename Device, class T>
+class SparseSegmentReductionMeanWithNumSegmentsOp
+    : public SparseSegmentReductionOpBase<Device, T> {
+ public:
+  explicit SparseSegmentReductionMeanWithNumSegmentsOp(
+      OpKernelConstruction* context)
+      : SparseSegmentReductionOpBase<Device, T>(
+            context, true /*is_mean*/, false /*is_sqrtn*/,
+            true /* has_num_segments */, T(0) /* default_value */) {}
 };
 
 template <typename Device, class T>
@@ -804,9 +909,20 @@ class SparseSegmentReductionSqrtNOp
     : public SparseSegmentReductionOpBase<Device, T> {
  public:
   explicit SparseSegmentReductionSqrtNOp(OpKernelConstruction* context)
-      : SparseSegmentReductionOpBase<Device, T>(context, false /*is_mean*/,
-                                                true /*is_sqrtn*/,
-                                                T(0) /* default_value */) {}
+      : SparseSegmentReductionOpBase<Device, T>(
+            context, false /*is_mean*/, true /*is_sqrtn*/,
+            false /* has_num_segments */, T(0) /* default_value */) {}
+};
+
+template <typename Device, class T>
+class SparseSegmentReductionSqrtNWithNumSegmentsOp
+    : public SparseSegmentReductionOpBase<Device, T> {
+ public:
+  explicit SparseSegmentReductionSqrtNWithNumSegmentsOp(
+      OpKernelConstruction* context)
+      : SparseSegmentReductionOpBase<Device, T>(
+            context, false /*is_mean*/, true /*is_sqrtn*/,
+            true /* has_num_segments */, T(0) /* default_value */) {}
 };
 
 template <typename Device, class T>
@@ -814,37 +930,65 @@ class SparseSegmentReductionSumOp
     : public SparseSegmentReductionOpBase<Device, T> {
  public:
   explicit SparseSegmentReductionSumOp(OpKernelConstruction* context)
-      : SparseSegmentReductionOpBase<Device, T>(context, false /*is_mean*/,
-                                                false /*is_sqrtn*/,
-                                                T(0) /* default_value */) {}
+      : SparseSegmentReductionOpBase<Device, T>(
+            context, false /*is_mean*/, false /*is_sqrtn*/,
+            false /* has_num_segments */, T(0) /* default_value */) {}
 };
 
-#define REGISTER_CPU_SPARSE_KERNELS(type)                     \
-  REGISTER_KERNEL_BUILDER(Name("SparseSegmentSum")            \
-                              .Device(DEVICE_CPU)             \
-                              .TypeConstraint<type>("T")      \
-                              .TypeConstraint<int32>("Tidx"), \
-                          SparseSegmentReductionSumOp<CPUDevice, type>);
+template <typename Device, class T>
+class SparseSegmentReductionSumWithNumSegmentsOp
+    : public SparseSegmentReductionOpBase<Device, T> {
+ public:
+  explicit SparseSegmentReductionSumWithNumSegmentsOp(
+      OpKernelConstruction* context)
+      : SparseSegmentReductionOpBase<Device, T>(
+            context, false /*is_mean*/, false /*is_sqrtn*/,
+            true /* has_num_segments */, T(0) /* default_value */) {}
+};
 
+#define REGISTER_CPU_SPARSE_KERNELS(type)                                \
+  REGISTER_KERNEL_BUILDER(Name("SparseSegmentSum")                       \
+                              .Device(DEVICE_CPU)                        \
+                              .TypeConstraint<type>("T")                 \
+                              .TypeConstraint<int32>("Tidx"),            \
+                          SparseSegmentReductionSumOp<CPUDevice, type>); \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("SparseSegmentSumWithNumSegments")                            \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<type>("T")                                     \
+          .TypeConstraint<int32>("Tidx"),                                \
+      SparseSegmentReductionSumWithNumSegmentsOp<CPUDevice, type>);
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_SPARSE_KERNELS);
 #undef REGISTER_CPU_SPARSE_KERNELS
 
-#define REGISTER_CPU_SPARSE_KERNELS(type)                     \
-  REGISTER_KERNEL_BUILDER(Name("SparseSegmentMean")           \
-                              .Device(DEVICE_CPU)             \
-                              .TypeConstraint<type>("T")      \
-                              .TypeConstraint<int32>("Tidx"), \
-                          SparseSegmentReductionMeanOp<CPUDevice, type>);
+#define REGISTER_CPU_SPARSE_KERNELS(type)                                 \
+  REGISTER_KERNEL_BUILDER(Name("SparseSegmentMean")                       \
+                              .Device(DEVICE_CPU)                         \
+                              .TypeConstraint<type>("T")                  \
+                              .TypeConstraint<int32>("Tidx"),             \
+                          SparseSegmentReductionMeanOp<CPUDevice, type>); \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("SparseSegmentMeanWithNumSegments")                            \
+          .Device(DEVICE_CPU)                                             \
+          .TypeConstraint<type>("T")                                      \
+          .TypeConstraint<int32>("Tidx"),                                 \
+      SparseSegmentReductionMeanWithNumSegmentsOp<CPUDevice, type>);
 REGISTER_CPU_SPARSE_KERNELS(float);
 REGISTER_CPU_SPARSE_KERNELS(double);
 #undef REGISTER_CPU_SPARSE_KERNELS
 
-#define REGISTER_CPU_SPARSE_KERNELS(type)                     \
-  REGISTER_KERNEL_BUILDER(Name("SparseSegmentSqrtN")          \
-                              .Device(DEVICE_CPU)             \
-                              .TypeConstraint<type>("T")      \
-                              .TypeConstraint<int32>("Tidx"), \
-                          SparseSegmentReductionSqrtNOp<CPUDevice, type>);
+#define REGISTER_CPU_SPARSE_KERNELS(type)                                  \
+  REGISTER_KERNEL_BUILDER(Name("SparseSegmentSqrtN")                       \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<type>("T")                   \
+                              .TypeConstraint<int32>("Tidx"),              \
+                          SparseSegmentReductionSqrtNOp<CPUDevice, type>); \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("SparseSegmentSqrtNWithNumSegments")                            \
+          .Device(DEVICE_CPU)                                              \
+          .TypeConstraint<type>("T")                                       \
+          .TypeConstraint<int32>("Tidx"),                                  \
+      SparseSegmentReductionSqrtNWithNumSegmentsOp<CPUDevice, type>);
 REGISTER_CPU_SPARSE_KERNELS(float);
 REGISTER_CPU_SPARSE_KERNELS(double);
 #undef REGISTER_CPU_SPARSE_KERNELS
@@ -889,9 +1033,10 @@ class SparseSegmentGradOpBase : public OpKernel {
 
     // Note that similar to SparseSegmentMean, we assume that segment_vec is
     // already sorted and has non-negative values.
-    const SegmentId num_segments =
+    const SegmentId num_segments = input.dim_size(0);
+    const SegmentId last_segment_id_plus_one =
         internal::SubtleMustCopy(segment_vec(N - 1)) + 1;
-    OP_REQUIRES(context, input.dim_size(0) == num_segments,
+    OP_REQUIRES(context, last_segment_id_plus_one <= num_segments,
                 errors::InvalidArgument("Invalid number of segments"));
 
     // Compute scaling factors for input.
diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h
index b10bea72ba89e7089e0668389995c629644b534d..51814273b305bfa35bca0ddce0376658064ea56a 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.h
+++ b/tensorflow/core/kernels/segment_reduction_ops.h
@@ -46,55 +46,80 @@ struct SegmentSumFunctor {
                   const Index data_size, const T* data,
                   typename TTypes<T, 2>::Tensor output);
 };
+
 #endif
 
-// BaseFunctor for definition of UnsorteSegmentReductionOp
-// for usage without templates.
-template <typename Device, typename T, typename Index>
-struct UnsortedSegmentBaseFunctor{
-  virtual ~UnsortedSegmentBaseFunctor(){}
-  virtual void operator()(OpKernelContext* ctx, const Device& d,
-                  const Index output_rows, const TensorShape& segment_ids_shape,
+template <typename Device, typename T, typename Index, typename InitialValueF,
+          typename ReductionF>
+struct UnsortedSegmentFunctor {
+  void operator()(OpKernelContext* ctx, const Index num_segments,
+                  const TensorShape& segment_ids_shape,
                   typename TTypes<Index>::ConstFlat segment_ids,
                   const Index data_size, const T* data,
-                  typename TTypes<T, 2>::Tensor output){};
+                  typename TTypes<T, 2>::Tensor output);
 };
 
-// Functor for UnsortedSegmentSumOp.
-// output_rows: the number of output segments (unique segment ids in
-//                'segment_ids').
-// segment_ids_shape: shape of 'segment_ids' tensor.
-// segment_ids: unsorted map from input to output segment ids at which to
-//                perform segment sum operation.
-// data_size: size of input data tensor.
-// data: input data tensor.
-// output: output reshaped to {output_rows, output.size/output_rows}
-template <typename Device, typename T, typename Index>
-struct UnsortedSegmentSumFunctor: public UnsortedSegmentBaseFunctor<Device, T, Index> {
-  void operator()(OpKernelContext* ctx, const Device& d,
-                  const Index output_rows, const TensorShape& segment_ids_shape,
-                  typename TTypes<Index>::ConstFlat segment_ids,
-                  const Index data_size, const T* data,
-                  typename TTypes<T, 2>::Tensor output);
+#ifdef GOOGLE_CUDA
+// reduction functors for the gpu
+template <typename T>
+struct SumOpGpu {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
+                                                        const T& value) {
+    CudaAtomicAdd(dest, value);
+  }
 };
 
-// Functor for UnsortedSegmentMaxOp.
-// output_rows: the number of output segments (unique segment ids in
-//                'segment_ids').
-// segment_ids_shape: shape of 'segment_ids' tensor.
-// segment_ids: unsorted map from input to output segment ids at which to
-//                perform segment sum operation.
-// data_size: size of input data tensor.
-// data: input data tensor.
-// output: output reshaped to {output_rows, output.size/output_rows}
-template <typename Device, typename T, typename Index>
-struct UnsortedSegmentMaxFunctor: public UnsortedSegmentBaseFunctor<Device, T, Index> {
-  void operator()(OpKernelContext* ctx, const Device& d,
-                  const Index output_rows, const TensorShape& segment_ids_shape,
-                  typename TTypes<Index>::ConstFlat segment_ids,
-                  const Index data_size, const T* data,
-                  typename TTypes<T, 2>::Tensor output);
+template <typename T>
+struct ProdOpGpu {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
+                                                        const T& value) {
+    CudaAtomicMul(dest, value);
+  }
+};
+
+template <typename T>
+struct MaxOpGpu {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
+                                                        const T& value) {
+    CudaAtomicMax(dest, value);
+  }
+};
+
+template <typename T>
+struct MinOpGpu {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(T* dest,
+                                                        const T& value) {
+    CudaAtomicMin(dest, value);
+  }
 };
+
+#endif  // GOOGLE_CUDA
+
+// initial value functors
+template <typename T>
+struct Zero {
+  EIGEN_STRONG_INLINE T operator()() const { return T(0); }
+};
+
+template <typename T>
+struct One {
+  EIGEN_STRONG_INLINE T operator()() const { return T(1); }
+};
+
+template <typename T>
+struct Lowest {
+  EIGEN_STRONG_INLINE T operator()() const {
+    return Eigen::NumTraits<T>::lowest();
+  }
+};
+
+template <typename T>
+struct Highest {
+  EIGEN_STRONG_INLINE T operator()() const {
+    return Eigen::NumTraits<T>::highest();
+  }
+};
+
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
index 159fada621bd88de259e9b044491f3ecebf10b19..ba979e6bb216b649ff4fc3cefa7099ac9cbc1b91 100644
--- a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
@@ -18,42 +18,15 @@ limitations under the License.
 #define EIGEN_USE_GPU
 
 #include "tensorflow/core/kernels/segment_reduction_ops.h"
-
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/util/cuda_device_functions.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 
+
 namespace tensorflow {
 
 using GPUDevice = Eigen::GpuDevice;
 
-// Helper for UnusortedSegmentSumCustomKernel that adds value into dest
-// atomically.
-template <typename T>
-static __device__ __forceinline__ void AccumulateInto(T* dest, const T& value) {
-  CudaAtomicAdd(dest, value);
-}
-
-// Specializations of AccumulateInto for complex types, which CudaAtomicAdd does
-// not support. We treat a std::complex<T>* as a T* (the C++ standard section
-// 26.4.4 allows this explicitly) and atomic add the real and imaginary
-// components individually. The operation as a whole is not atomic, but we can
-// safely treat the components independently for the purpose of accumulating.
-template <>
-__device__ __forceinline__ void AccumulateInto(
-    std::complex<float>* dest, const std::complex<float>& value) {
-  auto dest_scalar = reinterpret_cast<float*>(dest);
-  CudaAtomicAdd(dest_scalar, value.real());
-  CudaAtomicAdd(dest_scalar + 1, value.imag());
-}
-
-template <>
-__device__ __forceinline__ void AccumulateInto(
-    std::complex<double>* dest, const std::complex<double>& value) {
-  auto dest_scalar = reinterpret_cast<double*>(dest);
-  CudaAtomicAdd(dest_scalar, value.real());
-  CudaAtomicAdd(dest_scalar + 1, value.imag());
-}
-
 // SortedSegmentSumFunctor kernel reduces input data just as
 // UnsortedSegmentSumCustomKernel does except that input data
 // is partitioned along the outer reduction dimension. This is
@@ -81,7 +54,7 @@ __global__ void SortedSegmentSumCustomKernel(const Index input_outer_dim_size,
                                              const Index* segment_ids,
                                              const T* input, T* output,
                                              const Index total_stripe_count) {
-  CUDA_1D_KERNEL_LOOP(stripe_index, total_stripe_count) {
+  for (int stripe_index : CudaGridRangeX(total_stripe_count)) {
     const Index segment_offset = stripe_index % inner_dim_size;
     const Index input_outer_dim_index_base =
         stripe_index / inner_dim_size * Index(OuterDimTileSize);
@@ -106,7 +79,7 @@ __global__ void SortedSegmentSumCustomKernel(const Index input_outer_dim_size,
         // decide whether to write result to global memory using atomic
         // operations
         if (last_output_segment_id == first_segment_id) {
-          AccumulateInto<T>(output + output_index, sum);
+          CudaAtomicAdd(output + output_index, sum);
         } else {
           *(output + output_index) = sum;
         }
@@ -121,31 +94,31 @@ __global__ void SortedSegmentSumCustomKernel(const Index input_outer_dim_size,
     // the following strip.
     const Index output_index =
         last_output_segment_id * inner_dim_size + segment_offset;
-    AccumulateInto<T>(output + output_index, sum);
+    CudaAtomicAdd(output + output_index, sum);
   }
 }
 
-// UnsortedSegmentSumFunctor kernel processes 'input_total_size' elements.
+// UnsortedSegmentSumKernel processes 'input_total_size' elements.
 // Each element is mapped from input to output by a combination of its
 // 'segment_ids' mapping and 'inner_dim_size'.
-template <typename T, typename Index>
-__global__ void UnsortedSegmentSumCustomKernel(
-    const Index input_outer_dim_size, const Index inner_dim_size,
-    const Index output_outer_dim_size, const Index* segment_ids, const T* input,
-    T* output) {
+template <typename T, typename Index, typename KernelReductionFunctor>
+__global__ void UnsortedSegmentCustomKernel(const Index input_outer_dim_size,
+                                            const Index inner_dim_size,
+                                            const Index output_outer_dim_size,
+                                            const Index* segment_ids,
+                                            const T* input, T* output) {
   const Index input_total_size = input_outer_dim_size * inner_dim_size;
   const Index output_total_size = output_outer_dim_size * inner_dim_size;
-  CUDA_1D_KERNEL_LOOP(input_index, input_total_size) {
+  for (int input_index : CudaGridRangeX(input_total_size)) {
     const Index input_segment_index = input_index / inner_dim_size;
     const Index segment_offset = input_index % inner_dim_size;
     const Index output_segment_index = segment_ids[input_segment_index];
-
     if (output_segment_index < 0 || output_segment_index >= output_total_size) {
       continue;
     }
     const Index output_index =
         output_segment_index * inner_dim_size + segment_offset;
-    AccumulateInto<T>(output + output_index, ldg(input + input_index));
+    KernelReductionFunctor()(output + output_index, ldg(input + input_index));
   }
 }
 
@@ -190,42 +163,40 @@ void SegmentSumFunctor<T, Index>::operator()(
       <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
           input_outer_dim_size, input_inner_dim_size, output_rows,
           segment_ids.data(), data, output.data(), total_stripe_count);
-};
+}
 
-// UnsortedSegmentSumFunctor implementation for GPUDevice.
-template <typename T, typename Index>
-struct UnsortedSegmentSumFunctor<GPUDevice, T, Index>: UnsortedSegmentBaseFunctor<GPUDevice, T, Index> {
-  void operator()(OpKernelContext* ctx, const GPUDevice& d,
-                  const Index output_rows, const TensorShape& segment_ids_shape,
+template <typename T, typename Index, typename InitialValueF,
+          typename ReductionF>
+struct UnsortedSegmentFunctor<GPUDevice, T, Index, InitialValueF, ReductionF> {
+  void operator()(OpKernelContext* ctx, const Index num_segments,
+                  const TensorShape& segment_ids_shape,
                   typename TTypes<Index>::ConstFlat segment_ids,
                   const Index data_size, const T* data,
-                  typename TTypes<T, 2>::Tensor output) override {
+                  typename TTypes<T, 2>::Tensor output) {
     if (output.size() == 0) {
       return;
     }
-    // Set 'output' to zeros.
+    // Set 'output' to initial value.
+    GPUDevice d = ctx->template eigen_device<GPUDevice>();
     CudaLaunchConfig config = GetCudaLaunchConfig(output.size(), d);
-    SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        output.size(), output.data());
+    SetToValue<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+        output.size(), output.data(), InitialValueF()());
     if (data_size == 0 || segment_ids_shape.num_elements() == 0) {
       return;
     }
-
-    // Launch kernel to compute unsorted segment sum.
+    // Launch kernel to compute unsorted segment reduction.
     // Notes:
-    // *) 'input_total_size' is the total number of elements to process.
+    // *) 'data_size' is the total number of elements to process.
     // *) 'segment_ids.shape' is a prefix of data's shape.
     // *) 'input_outer_dim_size' is the total number of segments to process.
-    const Index input_total_size = data_size;
     const Index input_outer_dim_size = segment_ids.dimension(0);
-    const Index input_inner_dim_size = input_total_size / input_outer_dim_size;
-
-    config = GetCudaLaunchConfig(input_total_size, d);
-    UnsortedSegmentSumCustomKernel<
-        T,
-        Index><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        input_outer_dim_size, input_inner_dim_size, output_rows,
-        segment_ids.data(), data, output.data());
+    const Index input_inner_dim_size = data_size / input_outer_dim_size;
+    config = GetCudaLaunchConfig(data_size, d);
+
+    UnsortedSegmentCustomKernel<T, Index, ReductionF>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            input_outer_dim_size, input_inner_dim_size, num_segments,
+            segment_ids.data(), data, output.data());
   }
 };
 
@@ -238,19 +209,40 @@ struct UnsortedSegmentSumFunctor<GPUDevice, T, Index>: UnsortedSegmentBaseFuncto
 
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_SORTED_GPU_SPECS);
 
-#define DEFINE_GPU_SPECS_INDEX(T, Index) \
-  template struct UnsortedSegmentSumFunctor<GPUDevice, T, Index>
-
-#define DEFINE_GPU_SPECS(T)         \
-  DEFINE_GPU_SPECS_INDEX(T, int32); \
-  DEFINE_GPU_SPECS_INDEX(T, int64);
-
-TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
-TF_CALL_complex64(DEFINE_GPU_SPECS);
-TF_CALL_complex128(DEFINE_GPU_SPECS);
-
-#undef DEFINE_GPU_SPECS
-#undef DEFINE_GPU_SPECS_INDEX
+#define DEFINE_REAL_UNSORTED_GPU_SPECS_INDEX(T, Index)                         \
+  template struct UnsortedSegmentFunctor<                                      \
+      GPUDevice, T, Index, functor::Lowest<T>, functor::MaxOpGpu<T>>;          \
+  template struct UnsortedSegmentFunctor<                                      \
+      GPUDevice, T, Index, functor::Highest<T>, functor::MinOpGpu<T>>;         \
+  template struct UnsortedSegmentFunctor<GPUDevice, T, Index, functor::One<T>, \
+                                         functor::ProdOpGpu<T>>;
+
+// sum is the only op that supports all input types currently
+#define DEFINE_SUM_UNSORTED_GPU_SPECS_INDEX(T, Index) \
+  template struct UnsortedSegmentFunctor<             \
+      GPUDevice, T, Index, functor::Zero<T>, functor::SumOpGpu<T>>;
+
+#define DEFINE_REAL_GPU_SPECS(T)                  \
+  DEFINE_REAL_UNSORTED_GPU_SPECS_INDEX(T, int32); \
+  DEFINE_REAL_UNSORTED_GPU_SPECS_INDEX(T, int64);
+
+#define DEFINE_SUM_GPU_SPECS(T)                  \
+  DEFINE_SUM_UNSORTED_GPU_SPECS_INDEX(T, int32); \
+  DEFINE_SUM_UNSORTED_GPU_SPECS_INDEX(T, int64);
+
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_REAL_GPU_SPECS);
+TF_CALL_int32(DEFINE_REAL_GPU_SPECS);
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_SUM_GPU_SPECS);
+TF_CALL_int32(DEFINE_SUM_GPU_SPECS);
+TF_CALL_complex64(DEFINE_SUM_GPU_SPECS);
+TF_CALL_complex128(DEFINE_SUM_GPU_SPECS);
+
+#undef DEFINE_SORTED_GPU_SPECS_INDEX
+#undef DEFINE_SORTED_GPU_SPECS
+#undef DEFINE_REAL_UNSORTED_GPU_SPECS_INDEX
+#undef DEFINE_SUM_UNSORTED_GPU_SPECS_INDEX
+#undef DEFINE_REAL_GPU_SPECS
+#undef DEFINE_SUM_GPU_SPECS
 
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/self_adjoint_eig_op.cc b/tensorflow/core/kernels/self_adjoint_eig_op.cc
index 97657807268d30d66a01573bc3df09e318ce1d51..bcd88773902824c6e88db4226af43993d5649007 100644
--- a/tensorflow/core/kernels/self_adjoint_eig_op.cc
+++ b/tensorflow/core/kernels/self_adjoint_eig_op.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
-
 namespace tensorflow {
 
 template <class Scalar>
diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc
index 206fd40fa68c3158fa60b7651d40121ab1344bbd..688e61fcadc3ad01b579f8dfc712af2d8032ee35 100644
--- a/tensorflow/core/kernels/sendrecv_ops.cc
+++ b/tensorflow/core/kernels/sendrecv_ops.cc
@@ -114,7 +114,7 @@ REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_GPU), SendOp);
 REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_SYCL), SendOp);
 REGISTER_KERNEL_BUILDER(
     Name("_HostSend").Device(DEVICE_SYCL).HostMemory("tensor"), SendOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER_KERNEL_BUILDER(Name("_HostSend").Device(DEVICE_CPU), SendOp);
 REGISTER_KERNEL_BUILDER(
@@ -198,7 +198,7 @@ REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_GPU), RecvOp);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_SYCL), RecvOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 REGISTER_KERNEL_BUILDER(Name("_HostRecv").Device(DEVICE_CPU), RecvOp);
 REGISTER_KERNEL_BUILDER(
@@ -207,6 +207,6 @@ REGISTER_KERNEL_BUILDER(
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(
     Name("_HostRecv").Device(DEVICE_SYCL).HostMemory("tensor"), RecvOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc
index e2e3758d87e49702ebc48f78c022affe49a3b7e4..9db0bd4d98bdb9964cb561d96d91782ba3615a7f 100644
--- a/tensorflow/core/kernels/sequence_ops.cc
+++ b/tensorflow/core/kernels/sequence_ops.cc
@@ -53,13 +53,13 @@ class RangeOp : public OpKernel {
     if (delta > 0) {
       OP_REQUIRES(
           context, start <= limit,
-          errors::InvalidArgument("Requires start <= limit when delta > 0: ",
-                                  start, "/", limit));
+          errors::InvalidArgument(
+              "Requires start <= limit when delta > 0: ", start, "/", limit));
     } else {
       OP_REQUIRES(
           context, start >= limit,
-          errors::InvalidArgument("Requires start >= limit when delta < 0: ",
-                                  start, "/", limit));
+          errors::InvalidArgument(
+              "Requires start >= limit when delta < 0: ", start, "/", limit));
     }
     int64 size = (std::is_integral<T>::value
                       ? ((std::abs(limit - start) + std::abs(delta) - 1) /
diff --git a/tensorflow/core/kernels/serialize_sparse_op.cc b/tensorflow/core/kernels/serialize_sparse_op.cc
index cfb86904d573cd7577fc7dca0d48a6d146ceb058..799c574d1542c345c606c276b0cc24fe61a47bba 100644
--- a/tensorflow/core/kernels/serialize_sparse_op.cc
+++ b/tensorflow/core/kernels/serialize_sparse_op.cc
@@ -27,23 +27,31 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_util.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
 #include "tensorflow/core/kernels/reshape_util.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/util/sparse/sparse_tensor.h"
 
 namespace tensorflow {
 
 using sparse::SparseTensor;
 
+template <typename T>
 class SerializeSparseOp : public OpKernel {
  public:
   explicit SerializeSparseOp(OpKernelConstruction* context)
       : OpKernel(context) {}
 
+  Status Initialize(Tensor* result);
+  Status Serialize(const Tensor& input, T* result);
+
   void Compute(OpKernelContext* context) override {
     const Tensor* input_indices;
     const Tensor* input_values;
     const Tensor* input_shape;
+
     OP_REQUIRES_OK(context, context->input("sparse_indices", &input_indices));
     OP_REQUIRES_OK(context, context->input("sparse_values", &input_values));
     OP_REQUIRES_OK(context, context->input("sparse_shape", &input_shape));
@@ -62,34 +70,75 @@ class SerializeSparseOp : public OpKernel {
                     "Input shape should be a vector but received shape ",
                     input_shape->shape().DebugString()));
 
-    TensorProto proto_indices;
-    TensorProto proto_values;
-    TensorProto proto_shape;
-
-    input_indices->AsProtoTensorContent(&proto_indices);
-    input_values->AsProtoTensorContent(&proto_values);
-    input_shape->AsProtoTensorContent(&proto_shape);
+    Tensor serialized_sparse;
+    OP_REQUIRES_OK(context, Initialize(&serialized_sparse));
 
-    Tensor serialized_sparse(DT_STRING, TensorShape({3}));
-    auto serialized_sparse_t = serialized_sparse.vec<string>();
-
-    serialized_sparse_t(0) = proto_indices.SerializeAsString();
-    serialized_sparse_t(1) = proto_values.SerializeAsString();
-    serialized_sparse_t(2) = proto_shape.SerializeAsString();
+    auto serialized_sparse_t = serialized_sparse.vec<T>();
+    OP_REQUIRES_OK(context, Serialize(*input_indices, &serialized_sparse_t(0)));
+    OP_REQUIRES_OK(context, Serialize(*input_values, &serialized_sparse_t(1)));
+    OP_REQUIRES_OK(context, Serialize(*input_shape, &serialized_sparse_t(2)));
 
     context->set_output(0, serialized_sparse);
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("SerializeSparse").Device(DEVICE_CPU),
-                        SerializeSparseOp);
+template <>
+Status SerializeSparseOp<string>::Initialize(Tensor* result) {
+  *result = Tensor(DT_STRING, TensorShape({3}));
+  return Status::OK();
+}
+
+template <>
+Status SerializeSparseOp<string>::Serialize(const Tensor& input,
+                                            string* result) {
+  TensorProto proto;
+  input.AsProtoTensorContent(&proto);
+  *result = proto.SerializeAsString();
+  return Status::OK();
+}
+
+REGISTER_KERNEL_BUILDER(Name("SerializeSparse")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<string>("out_type"),
+                        SerializeSparseOp<string>);
+
+template <>
+Status SerializeSparseOp<Variant>::Initialize(Tensor* result) {
+  *result = Tensor(DT_VARIANT, TensorShape({3}));
+  return Status::OK();
+}
+
+template <>
+Status SerializeSparseOp<Variant>::Serialize(const Tensor& input,
+                                             Variant* result) {
+  *result = input;
+  return Status::OK();
+}
+
+REGISTER_KERNEL_BUILDER(Name("SerializeSparse")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<Variant>("out_type"),
+                        SerializeSparseOp<Variant>);
 
 template <typename T>
-class SerializeManySparseOp : public OpKernel {
+class SerializeManySparseOpBase : public OpKernel {
  public:
-  explicit SerializeManySparseOp(OpKernelConstruction* context)
+  explicit SerializeManySparseOpBase(OpKernelConstruction* context)
       : OpKernel(context) {}
 
+  void Compute(OpKernelContext* context) override {}
+
+ protected:
+  Status Initialize(const int64 n, Tensor* result);
+  Status Serialize(const Tensor& input, T* result);
+};
+
+template <typename T, typename U>
+class SerializeManySparseOp : public SerializeManySparseOpBase<U> {
+ public:
+  explicit SerializeManySparseOp(OpKernelConstruction* context)
+      : SerializeManySparseOpBase<U>(context) {}
+
   void Compute(OpKernelContext* context) override {
     const Tensor* input_indices;
     const Tensor* input_values;
@@ -127,37 +176,31 @@ class SerializeManySparseOp : public OpKernel {
 
     auto input_shape_t = input_shape->vec<int64>();
     const int64 N = input_shape_t(0);
-
-    Tensor serialized_sparse(DT_STRING, TensorShape({N, 3}));
-    auto serialized_sparse_t = serialized_sparse.matrix<string>();
+    Tensor serialized_sparse;
+    OP_REQUIRES_OK(context, this->Initialize(N, &serialized_sparse));
+    auto serialized_sparse_t = serialized_sparse.matrix<U>();
 
     OP_REQUIRES_OK(context, input_st.IndicesValid());
 
-    // We can generate the output shape proto string now, for all
-    // minibatch entries.
-    Tensor output_shape(DT_INT64, {rank - 1});
-    auto output_shape_t = output_shape.vec<int64>();
-    for (int d = 1; d < rank; d++) output_shape_t(d - 1) = input_shape_t(d);
-    TensorProto proto_shape;
-    output_shape.AsProtoTensorContent(&proto_shape);
-    const string proto_shape_string = proto_shape.SerializeAsString();
-
+    // Initialize output with empty values and the proper shapes.
     Tensor output_blank_indices(DT_INT64, {0, rank - 1});
-    Tensor output_blank_values(DataTypeToEnum<T>::value, {0});
-    TensorProto proto_blank_indices;
-    TensorProto proto_blank_values;
-    output_blank_indices.AsProtoTensorContent(&proto_blank_indices);
-    output_blank_values.AsProtoTensorContent(&proto_blank_values);
+    U serialized_indices;
+    OP_REQUIRES_OK(context,
+                   this->Serialize(output_blank_indices, &serialized_indices));
+    serialized_sparse_t.template chip<1>(0).setConstant(serialized_indices);
 
-    const string proto_blank_indices_string =
-        proto_blank_indices.SerializeAsString();
-    const string proto_blank_values_string =
-        proto_blank_values.SerializeAsString();
+    Tensor output_blank_values(DataTypeToEnum<T>::value, {0});
+    U serialized_values;
+    OP_REQUIRES_OK(context,
+                   this->Serialize(output_blank_values, &serialized_values));
+    serialized_sparse_t.template chip<1>(1).setConstant(serialized_values);
 
-    // Initialize output with empty values and the proper shapes.
-    serialized_sparse_t.chip<1>(0).setConstant(proto_blank_indices_string);
-    serialized_sparse_t.chip<1>(1).setConstant(proto_blank_values_string);
-    serialized_sparse_t.chip<1>(2).setConstant(proto_shape_string);
+    Tensor output_shape(DT_INT64, {rank - 1});
+    auto output_shape_t = output_shape.vec<int64>();
+    for (int d = 1; d < rank; d++) output_shape_t(d - 1) = input_shape_t(d);
+    U serialized_shape;
+    OP_REQUIRES_OK(context, this->Serialize(output_shape, &serialized_shape));
+    serialized_sparse_t.template chip<1>(2).setConstant(serialized_shape);
 
     // Get groups by minibatch dimension
     sparse::GroupIterable minibatch = input_st.group({0});
@@ -186,24 +229,62 @@ class SerializeManySparseOp : public OpKernel {
         output_values_t(i) = values(i);
       }
 
-      TensorProto proto_indices;
-      TensorProto proto_values;
-      output_indices.AsProtoTensorContent(&proto_indices);
-      output_values.AsProtoTensorContent(&proto_values);
-
-      serialized_sparse_t(b, 0) = proto_indices.SerializeAsString();
-      serialized_sparse_t(b, 1) = proto_values.SerializeAsString();
+      OP_REQUIRES_OK(
+          context, this->Serialize(output_indices, &serialized_sparse_t(b, 0)));
+      OP_REQUIRES_OK(
+          context, this->Serialize(output_values, &serialized_sparse_t(b, 1)));
     }
 
     context->set_output(0, serialized_sparse);
   }
 };
 
-#define REGISTER_KERNELS(type)                            \
-  REGISTER_KERNEL_BUILDER(Name("SerializeManySparse")     \
-                              .Device(DEVICE_CPU)         \
-                              .TypeConstraint<type>("T"), \
-                          SerializeManySparseOp<type>)
+template <>
+Status SerializeManySparseOpBase<string>::Initialize(const int64 n,
+                                                     Tensor* result) {
+  *result = Tensor(DT_STRING, TensorShape({n, 3}));
+  return Status::OK();
+}
+
+template <>
+Status SerializeManySparseOpBase<string>::Serialize(const Tensor& input,
+                                                    string* result) {
+  TensorProto proto;
+  input.AsProtoTensorContent(&proto);
+  *result = proto.SerializeAsString();
+  return Status::OK();
+}
+
+#define REGISTER_KERNELS(type)                                     \
+  REGISTER_KERNEL_BUILDER(Name("SerializeManySparse")              \
+                              .Device(DEVICE_CPU)                  \
+                              .TypeConstraint<type>("T")           \
+                              .TypeConstraint<string>("out_type"), \
+                          SerializeManySparseOp<type, string>)
+
+TF_CALL_ALL_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+template <>
+Status SerializeManySparseOpBase<Variant>::Initialize(const int64 n,
+                                                      Tensor* result) {
+  *result = Tensor(DT_VARIANT, TensorShape({n, 3}));
+  return Status::OK();
+}
+
+template <>
+Status SerializeManySparseOpBase<Variant>::Serialize(const Tensor& input,
+                                                     Variant* result) {
+  *result = input;
+  return Status::OK();
+}
+
+#define REGISTER_KERNELS(type)                                      \
+  REGISTER_KERNEL_BUILDER(Name("SerializeManySparse")               \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .TypeConstraint<Variant>("out_type"), \
+                          SerializeManySparseOp<type, Variant>)
 
 TF_CALL_ALL_TYPES(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
@@ -212,7 +293,9 @@ template <typename T>
 class DeserializeSparseOp : public OpKernel {
  public:
   explicit DeserializeSparseOp(OpKernelConstruction* context)
-      : OpKernel(context) {}
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
+  }
 
   void Compute(OpKernelContext* context) override {
     const Tensor& serialized_sparse = context->input(0);
@@ -240,86 +323,43 @@ class DeserializeSparseOp : public OpKernel {
             "but has a zero dimension ",
             serialized_sparse.shape().DebugString()));
 
+    if (num_sparse_tensors == 0 && serialized_sparse.shape().dims() == 1) {
+      // Special case with a single sparse tensor. We can avoid data
+      // motion in the Concat and Reshape.
+      const auto& serialized_sparse_t = serialized_sparse.vec<T>();
+
+      Tensor output_indices;
+      Tensor output_values;
+      Tensor output_shape;
+      OP_REQUIRES_OK(context,
+                     this->GetAndValidateSparseTensor(
+                         serialized_sparse_t(0), serialized_sparse_t(1),
+                         serialized_sparse_t(2), dtype_, 0 /* index */,
+                         &output_indices, &output_values, &output_shape));
+      context->set_output(0, output_indices);
+      context->set_output(1, output_values);
+      context->set_output(2, output_shape);
+      return;
+    }
+
     std::vector<Tensor> indices;
     std::vector<Tensor> values;
     TensorShape shape;
     indices.reserve(num_sparse_tensors);
     values.reserve(num_sparse_tensors);
 
-    const auto& serialized_sparse_t =
-        serialized_sparse.flat_inner_dims<string, 2>();
-
+    const auto& serialized_sparse_t = serialized_sparse.flat_inner_dims<T, 2>();
     for (int i = 0; i < num_sparse_tensors; ++i) {
-      Tensor output_indices(DT_INT64);
-      Tensor output_values(DataTypeToEnum<T>::value);
-      Tensor output_shape(DT_INT64);
-      TensorProto proto_indices;
-      TensorProto proto_values;
-      TensorProto proto_shape;
-
-      OP_REQUIRES(
-          context,
-          ParseProtoUnlimited(&proto_indices, serialized_sparse_t(i, 0)),
-          errors::InvalidArgument("Could not parse serialized_sparse[", i,
-                                  ", 0]"));
-      OP_REQUIRES(context,
-                  ParseProtoUnlimited(&proto_values, serialized_sparse_t(i, 1)),
-                  errors::InvalidArgument("Could not parse serialized_sparse[",
-                                          i, ", 1]"));
-      OP_REQUIRES(context,
-                  ParseProtoUnlimited(&proto_shape, serialized_sparse_t(i, 2)),
-                  errors::InvalidArgument("Could not parse serialized_sparse[",
-                                          i, ", 2]"));
-
-      OP_REQUIRES(context, output_indices.FromProto(proto_indices),
-                  errors::InvalidArgument(
-                      "Could not construct Tensor serialized_sparse[", i,
-                      ", 0] (indices)"));
-      OP_REQUIRES(context, TensorShapeUtils::IsMatrix(output_indices.shape()),
-                  errors::InvalidArgument(
-                      "Expected serialized_sparse[", i,
-                      ", 0] to represent an index matrix but received shape ",
-                      output_indices.shape().DebugString()));
-      OP_REQUIRES(context, output_values.FromProto(proto_values),
-                  errors::InvalidArgument(
-                      "Could not construct Tensor serialized_sparse[", i,
-                      ", 1] (values)"));
-      OP_REQUIRES(context, TensorShapeUtils::IsVector(output_values.shape()),
-                  errors::InvalidArgument(
-                      "Expected serialized_sparse[", i,
-                      ", 1] to represent a values vector but received shape ",
-                      output_values.shape().DebugString()));
-      OP_REQUIRES(context, output_shape.FromProto(proto_shape),
-                  errors::InvalidArgument(
-                      "Could not construct Tensor serialized_sparse[", i,
-                      ", 2] (shape)"));
-      OP_REQUIRES(
-          context, TensorShapeUtils::IsVector(output_shape.shape()),
-          errors::InvalidArgument("Expected serialized_sparse[", i,
-                                  ", 1] to be a shape vector but its shape is ",
-                                  output_shape.shape().DebugString()));
-
-      OP_REQUIRES(
-          context, DataTypeToEnum<T>::value == output_values.dtype(),
-          errors::InvalidArgument(
-              "Requested SparseTensor of type ",
-              DataTypeString(DataTypeToEnum<T>::value), " but SparseTensor[", i,
-              "].values.dtype() == ", DataTypeString(output_values.dtype())));
-
+      Tensor output_indices;
+      Tensor output_values;
+      Tensor output_shape;
+      OP_REQUIRES_OK(context,
+                     this->GetAndValidateSparseTensor(
+                         serialized_sparse_t(i, 0), serialized_sparse_t(i, 1),
+                         serialized_sparse_t(i, 2), dtype_, i, &output_indices,
+                         &output_values, &output_shape));
       int64 num_entries = output_indices.dim_size(0);
-      OP_REQUIRES(context, num_entries == output_values.dim_size(0),
-                  errors::InvalidArgument(
-                      "Expected row counts of SparseTensor[", i,
-                      "].indices and SparseTensor[", i,
-                      "].values to match but they do not: ", num_entries,
-                      " vs. ", output_values.dim_size(0)));
       int rank = output_indices.dim_size(1);
-      OP_REQUIRES(
-          context, rank == output_shape.dim_size(0),
-          errors::InvalidArgument("Expected column counts of SparseTensor[", i,
-                                  "].indices to match size of SparseTensor[", i,
-                                  "].shape but they do not: ", rank, " vs. ",
-                                  output_shape.dim_size(0)));
 
       // Now we expand each SparseTensors' indices and shape by
       // prefixing a dimension
@@ -376,7 +416,24 @@ class DeserializeSparseOp : public OpKernel {
       tensors.emplace_back(indices[i], values[i], shape, std_order);
     }
 
-    SparseTensor output = SparseTensor::Concat<T>(tensors);
+    gtl::optional<SparseTensor> maybe_output;
+#define HANDLE_TYPE(T)                               \
+  case DataTypeToEnum<T>::value: {                   \
+    maybe_output = SparseTensor::Concat<T>(tensors); \
+    break;                                           \
+  }
+
+    switch (dtype_) {
+      TF_CALL_ALL_TYPES(HANDLE_TYPE);
+      TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
+#undef HANDLE_TYPE
+      default:
+        OP_REQUIRES(context, false,
+                    errors::Unimplemented(
+                        "DeserializeSparse Unhandled data type: ", dtype_));
+    }
+    DCHECK(maybe_output);
+    SparseTensor& output = maybe_output.value();
 
     // Compute the input shape for the reshape operation.
     Tensor input_shape(DT_INT64, TensorShape({output.dims()}));
@@ -398,198 +455,101 @@ class DeserializeSparseOp : public OpKernel {
             0 /* output indices index */, 2 /* output shape index */);
     context->set_output(1, output.values());
   }
-};
-
-#define REGISTER_KERNELS(type)                                \
-  REGISTER_KERNEL_BUILDER(Name("DeserializeSparse")           \
-                              .Device(DEVICE_CPU)             \
-                              .TypeConstraint<type>("dtype"), \
-                          DeserializeSparseOp<type>)
-
-TF_CALL_ALL_TYPES(REGISTER_KERNELS);
-#undef REGISTER_KERNELS
-
-template <typename T>
-class DeserializeManySparseOp : public OpKernel {
- public:
-  explicit DeserializeManySparseOp(OpKernelConstruction* context)
-      : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor& serialized_sparse = context->input(0);
-    OP_REQUIRES(context, TensorShapeUtils::IsMatrix(serialized_sparse.shape()),
-                errors::InvalidArgument(
-                    "Serialized sparse should be a matrix but received shape ",
-                    serialized_sparse.shape().DebugString()));
-    OP_REQUIRES(
-        context, serialized_sparse.shape().dim_size(1) == 3,
-        errors::InvalidArgument(
-            "Serialized sparse should have 3 columns but received shape ",
-            serialized_sparse.shape().DebugString()));
-
-    int num_sparse_tensors = serialized_sparse.shape().dim_size(0);
-
-    OP_REQUIRES(
-        context, num_sparse_tensors > 0,
-        errors::InvalidArgument("Must have at least 1 serialized SparseTensor, "
-                                "but input matrix has 0 rows"));
-
-    std::vector<Tensor> indices_to_concat;
-    std::vector<Tensor> values_to_concat;
-    std::vector<TensorShape> shapes_to_concat;
-
-    const auto& serialized_sparse_t = serialized_sparse.matrix<string>();
-
-    for (int i = 0; i < num_sparse_tensors; ++i) {
-      Tensor output_indices(DT_INT64);
-      Tensor output_values(DataTypeToEnum<T>::value);
-      Tensor output_shape(DT_INT64);
-      TensorProto proto_indices;
-      TensorProto proto_values;
-      TensorProto proto_shape;
-
-      OP_REQUIRES(
-          context,
-          ParseProtoUnlimited(&proto_indices, serialized_sparse_t(i, 0)),
-          errors::InvalidArgument("Could not parse serialized_sparse[", i,
-                                  ", 0]"));
-      OP_REQUIRES(context,
-                  ParseProtoUnlimited(&proto_values, serialized_sparse_t(i, 1)),
-                  errors::InvalidArgument("Could not parse serialized_sparse[",
-                                          i, ", 1]"));
-      OP_REQUIRES(context,
-                  ParseProtoUnlimited(&proto_shape, serialized_sparse_t(i, 2)),
-                  errors::InvalidArgument("Could not parse serialized_sparse[",
-                                          i, ", 2]"));
-
-      OP_REQUIRES(context, output_indices.FromProto(proto_indices),
-                  errors::InvalidArgument(
-                      "Could not construct Tensor serialized_sparse[", i,
-                      ", 0] (indices)"));
-      OP_REQUIRES(context, TensorShapeUtils::IsMatrix(output_indices.shape()),
-                  errors::InvalidArgument(
-                      "Expected serialized_sparse[", i,
-                      ", 0] to represent an index matrix but received shape ",
-                      output_indices.shape().DebugString()));
-      OP_REQUIRES(context, output_values.FromProto(proto_values),
-                  errors::InvalidArgument(
-                      "Could not construct Tensor serialized_sparse[", i,
-                      ", 1] (values)"));
-      OP_REQUIRES(context, TensorShapeUtils::IsVector(output_values.shape()),
-                  errors::InvalidArgument(
-                      "Expected serialized_sparse[", i,
-                      ", 1] to represent a values vector but received shape ",
-                      output_values.shape().DebugString()));
-      OP_REQUIRES(context, output_shape.FromProto(proto_shape),
-                  errors::InvalidArgument(
-                      "Could not construct Tensor serialized_sparse[", i,
-                      ", 2] (shape)"));
-      OP_REQUIRES(
-          context, TensorShapeUtils::IsVector(output_shape.shape()),
-          errors::InvalidArgument("Expected serialized_sparse[", i,
-                                  ", 1] to be a shape vector but its shape is ",
-                                  output_shape.shape().DebugString()));
-
-      OP_REQUIRES(
-          context, DataTypeToEnum<T>::value == output_values.dtype(),
-          errors::InvalidArgument(
-              "Requested SparseTensor of type ",
-              DataTypeString(DataTypeToEnum<T>::value), " but SparseTensor[", i,
-              "].values.dtype() == ", DataTypeString(output_values.dtype())));
-
-      int64 num_entries = output_indices.dim_size(0);
-      OP_REQUIRES(context, num_entries == output_values.dim_size(0),
-                  errors::InvalidArgument(
-                      "Expected row counts of SparseTensor[", i,
-                      "].indices and SparseTensor[", i,
-                      "].values to match but they do not: ", num_entries,
-                      " vs. ", output_values.dim_size(0)));
-      int rank = output_indices.dim_size(1);
-      OP_REQUIRES(
-          context, rank == output_shape.dim_size(0),
-          errors::InvalidArgument("Expected column counts of SparseTensor[", i,
-                                  "].indices to match size of SparseTensor[", i,
-                                  "].shape "
-                                  "but they do not: ",
-                                  rank, " vs. ", output_shape.dim_size(0)));
 
-      // Now we expand each SparseTensors' indices and shape by
-      // prefixing a dimension
-      Tensor expanded_indices(
-          DT_INT64, TensorShape({num_entries, 1 + output_indices.dim_size(1)}));
-      Tensor expanded_shape(DT_INT64,
-                            TensorShape({1 + output_shape.dim_size(0)}));
-      const auto& output_indices_t = output_indices.matrix<int64>();
-      const auto& output_shape_t = output_shape.vec<int64>();
-      auto expanded_indices_t = expanded_indices.matrix<int64>();
-      auto expanded_shape_t = expanded_shape.vec<int64>();
-      expanded_indices_t.chip<1>(0).setZero();
-      Eigen::DSizes<Eigen::DenseIndex, 2> indices_start(0, 1);
-      Eigen::DSizes<Eigen::DenseIndex, 2> indices_sizes(num_entries, rank);
-      expanded_indices_t.slice(indices_start, indices_sizes) = output_indices_t;
-      expanded_shape_t(0) = 1;
-      std::copy_n(&output_shape_t(0), rank, &expanded_shape_t(1));
-
-      TensorShape expanded_tensor_shape(expanded_shape.vec<int64>());
-
-      indices_to_concat.push_back(expanded_indices);
-      values_to_concat.push_back(output_values);
-      shapes_to_concat.push_back(expanded_tensor_shape);
+ protected:
+  Status Deserialize(const T& serialized, Tensor* result);
+
+  Status GetAndValidateSparseTensor(
+      const T& serialized_indices, const T& serialized_values,
+      const T& serialized_shape, DataType values_dtype, int index,
+      Tensor* output_indices, Tensor* output_values, Tensor* output_shape) {
+    // Deserialize and validate the indices.
+    TF_RETURN_IF_ERROR(this->Deserialize(serialized_indices, output_indices));
+    if (!TensorShapeUtils::IsMatrix(output_indices->shape())) {
+      return errors::InvalidArgument(
+          "Expected serialized_sparse[", index,
+          ", 0] to represent an index matrix but received shape ",
+          output_indices->shape().DebugString());
     }
-
-    int rank = -1;
-    for (int i = 0; i < num_sparse_tensors; ++i) {
-      if (rank < 0) rank = shapes_to_concat[i].dims();
-      OP_REQUIRES(context, rank == shapes_to_concat[i].dims(),
-                  errors::InvalidArgument(
-                      "Inconsistent rank across SparseTensors: rank prior to "
-                      "SparseTensor[",
-                      i, "] was: ", rank, " but rank of SparseTensor[", i,
-                      "] is: ", shapes_to_concat[i].dims()));
+    int64 num_entries = output_indices->dim_size(0);
+    int rank = output_indices->dim_size(1);
+
+    // Deserialize and validate the values.
+    TF_RETURN_IF_ERROR(this->Deserialize(serialized_values, output_values));
+    if (!TensorShapeUtils::IsVector(output_values->shape())) {
+      return errors::InvalidArgument(
+          "Expected serialized_sparse[", index,
+          ", 1] to represent a values vector but received shape ",
+          output_values->shape().DebugString());
     }
-
-    // SparseTensor::Concat requires consistent shape for all but the
-    // primary order dimension (dimension 0 in this case).  So we get
-    // the maximum value across all the input SparseTensors for each
-    // dimension and use that.
-    TensorShape preconcat_shape(shapes_to_concat[0]);
-    for (int i = 0; i < num_sparse_tensors; ++i) {
-      for (int d = 0; d < rank; ++d) {
-        preconcat_shape.set_dim(d, std::max(preconcat_shape.dim_size(d),
-                                            shapes_to_concat[i].dim_size(d)));
-      }
+    if (values_dtype != output_values->dtype()) {
+      return errors::InvalidArgument(
+          "Requested SparseTensor of type ", DataTypeString(values_dtype),
+          " but SparseTensor[", index,
+          "].values.dtype() == ", DataTypeString(output_values->dtype()));
     }
-
-    // Dimension 0 is the primary dimension.
-    gtl::InlinedVector<int64, 8> std_order(rank);
-    std::iota(std_order.begin(), std_order.end(), 0);
-
-    std::vector<SparseTensor> tensors_to_concat;
-    tensors_to_concat.reserve(num_sparse_tensors);
-    for (int i = 0; i < num_sparse_tensors; ++i) {
-      tensors_to_concat.emplace_back(indices_to_concat[i], values_to_concat[i],
-                                     preconcat_shape, std_order);
+    if (num_entries != output_values->dim_size(0)) {
+      return errors::InvalidArgument(
+          "Expected row counts of SparseTensor[", index,
+          "].indices and SparseTensor[", index,
+          "].values to match but they do not: ", num_entries, " vs. ",
+          output_values->dim_size(0));
     }
 
-    SparseTensor output = SparseTensor::Concat<T>(tensors_to_concat);
-
-    Tensor final_output_shape(DT_INT64, TensorShape({output.dims()}));
-
-    std::copy_n(output.shape().data(), output.dims(),
-                final_output_shape.vec<int64>().data());
-
-    context->set_output(0, output.indices());
-    context->set_output(1, output.values());
-    context->set_output(2, final_output_shape);
+    // Deserialize and validate the shape.
+    TF_RETURN_IF_ERROR(this->Deserialize(serialized_shape, output_shape));
+    if (!TensorShapeUtils::IsVector(output_shape->shape())) {
+      return errors::InvalidArgument(
+          "Expected serialized_sparse[", index,
+          ", 1] to be a shape vector but its shape is ",
+          output_shape->shape().DebugString());
+    }
+    if (rank != output_shape->dim_size(0)) {
+      return errors::InvalidArgument("Expected column counts of SparseTensor[",
+                                     index,
+                                     "].indices to match size of SparseTensor[",
+                                     index, "].shape but they do not: ", rank,
+                                     " vs. ", output_shape->dim_size(0));
+    }
+    return Status::OK();
   }
+
+  DataType dtype_;
 };
 
-#define REGISTER_KERNELS(type)                                \
-  REGISTER_KERNEL_BUILDER(Name("DeserializeManySparse")       \
-                              .Device(DEVICE_CPU)             \
-                              .TypeConstraint<type>("dtype"), \
-                          DeserializeManySparseOp<type>)
+template <>
+Status DeserializeSparseOp<string>::Deserialize(const string& serialized,
+                                                Tensor* result) {
+  TensorProto proto;
+  if (!ParseProtoUnlimited(&proto, serialized)) {
+    return errors::InvalidArgument("Could not parse serialized proto");
+  }
+  Tensor tensor;
+  if (!tensor.FromProto(proto)) {
+    return errors::InvalidArgument("Could not construct tensor from proto");
+  }
+  *result = tensor;
+  return Status::OK();
+}
+
+REGISTER_KERNEL_BUILDER(Name("DeserializeSparse")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<string>("Tserialized"),
+                        DeserializeSparseOp<string>)
+
+REGISTER_KERNEL_BUILDER(Name("DeserializeManySparse").Device(DEVICE_CPU),
+                        DeserializeSparseOp<string>)
+
+template <>
+Status DeserializeSparseOp<Variant>::Deserialize(const Variant& serialized,
+                                                 Tensor* result) {
+  *result = *serialized.get<Tensor>();
+  return Status::OK();
+}
+
+REGISTER_KERNEL_BUILDER(Name("DeserializeSparse")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<Variant>("Tserialized"),
+                        DeserializeSparseOp<Variant>)
 
-TF_CALL_ALL_TYPES(REGISTER_KERNELS);
-#undef REGISTER_KERNELS
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/session_ops.cc b/tensorflow/core/kernels/session_ops.cc
index 185c5b248fca8f5a4e8edf6d46e9447f8a0b4750..f2dd2812b53e2c974efac3d3e1aef1052d907da6 100644
--- a/tensorflow/core/kernels/session_ops.cc
+++ b/tensorflow/core/kernels/session_ops.cc
@@ -144,7 +144,7 @@ REGISTER_GPU_KERNEL(bool);
 TF_CALL_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
 REGISTER_SYCL_KERNEL(bool);
 #undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class DeleteSessionTensorOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/set_kernels.cc b/tensorflow/core/kernels/set_kernels.cc
index 5a2b18b41ca4160327645f8655c2c70adb4b427d..e836c764acf859ed728f760d2e8e9c57ea86080f 100644
--- a/tensorflow/core/kernels/set_kernels.cc
+++ b/tensorflow/core/kernels/set_kernels.cc
@@ -216,7 +216,7 @@ void PopulateFromDenseGroup(OpKernelContext* ctx, const Tensor& input_tensor,
   result->clear();
   auto input_flat = input_tensor.flat<T>();
   const auto start = std::inner_product(
-      group_indices.begin(), group_indices.end(), input_strides.begin(), 0L);
+      group_indices.begin(), group_indices.end(), input_strides.begin(), 0LL);
   const TensorShape& input_shape = input_tensor.shape();
   const auto end = start + input_shape.dim_size(input_shape.dims() - 1);
   for (int64 i = start; i < end; ++i) {
@@ -279,7 +279,7 @@ void SetSizeOp<T>::Compute(OpKernelContext* ctx) {
 
     const auto group_key = group.group();
     const auto output_index = std::inner_product(
-        group_key.begin(), group_key.end(), output_strides.begin(), 0L);
+        group_key.begin(), group_key.end(), output_strides.begin(), 0LL);
     out(output_index) = group_set.size();
   }
 }
diff --git a/tensorflow/core/kernels/shape_ops.h b/tensorflow/core/kernels/shape_ops.h
index 8d9d0ea84612b51bdcd597698b89e3b8ffb8a915..55be308901b2b1233090c097944f441a17938125 100644
--- a/tensorflow/core/kernels/shape_ops.h
+++ b/tensorflow/core/kernels/shape_ops.h
@@ -235,10 +235,10 @@ class SqueezeOp : public OpKernel {
       if (!wrapped_squeeze_dims.empty()) {
         if (wrapped_squeeze_dims.count(i) > 0) {
           OP_REQUIRES(ctx, existing_dim == 1,
-                      errors::InvalidArgument("Tried to explicitly squeeze "
-                                              "dimension ",
-                                              i, " but dimension was not 1: ",
-                                              existing_dim));
+                      errors::InvalidArgument(
+                          "Tried to explicitly squeeze "
+                          "dimension ",
+                          i, " but dimension was not 1: ", existing_dim));
         } else {
           // This dimension is not being squeezed.
           new_shape.push_back(existing_dim);
diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc
index 28a379774be5222bb15865c3642d9467659c3d1e..79369fd4a9cc1668bc12cfdb466ad2ec2bbe8d11 100644
--- a/tensorflow/core/kernels/slice_op.cc
+++ b/tensorflow/core/kernels/slice_op.cc
@@ -58,7 +58,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Shared code that is not dependent on the type of T.  We do this to reduce
 // code size by not duplicating all this for all T (float, double, int32, etc.)
@@ -72,10 +72,11 @@ static void SharedValidation(OpKernelContext* context,
   const Tensor& size_tensor = context->input(2);
 
   OP_REQUIRES(
-      context, context->op_kernel().IsLegacyVector(begin_tensor.shape()) &&
-                   context->op_kernel().IsLegacyVector(size_tensor.shape()) &&
-                   begin_tensor.NumElements() == input.dims() &&
-                   size_tensor.NumElements() == input.dims(),
+      context,
+      context->op_kernel().IsLegacyVector(begin_tensor.shape()) &&
+          context->op_kernel().IsLegacyVector(size_tensor.shape()) &&
+          begin_tensor.NumElements() == input.dims() &&
+          size_tensor.NumElements() == input.dims(),
       errors::InvalidArgument(
           "Expected begin and size arguments to be 1-D tensors of size ",
           input.dims(), ", but got shapes ", begin_tensor.shape().DebugString(),
@@ -125,8 +126,7 @@ static void SharedSliceCommonCases(OpKernelContext* context,
                                    TensorShape* output_shape,
                                    gtl::InlinedVector<int64, 4>* begin,
                                    gtl::InlinedVector<int64, 4>* size,
-                                   Tensor** result,
-                                   bool* done) {
+                                   Tensor** result, bool* done) {
   bool is_identity = true;
   bool slice_dim0 = true;
   *done = false;
@@ -142,8 +142,8 @@ static void SharedSliceCommonCases(OpKernelContext* context,
     return;
   }
 
-  if (slice_dim0 && IsDim0SliceAligned<T>(input.shape(), (*begin)[0],
-                                          (*size)[0])) {
+  if (slice_dim0 &&
+      IsDim0SliceAligned<T>(input.shape(), (*begin)[0], (*size)[0])) {
     VLOG(1) << "Slice dim 0: " << input.shape().DebugString();
     CHECK_GE(input.dims(), 1);  // Otherwise, is_identity should be true.
     context->set_output(0, input.Slice((*begin)[0], (*begin)[0] + (*size)[0]));
@@ -154,7 +154,6 @@ static void SharedSliceCommonCases(OpKernelContext* context,
   OP_REQUIRES_OK(context, context->allocate_output(0, *output_shape, result));
 }
 
-
 template <typename Device, typename T>
 class SliceOp : public OpKernel {
  public:
@@ -190,26 +189,43 @@ class SliceOp : public OpKernel {
         }
         return;
       }
-#define HANDLE_DIM(NDIM)                                              \
-  if (input_dims == NDIM) {                                           \
-    functor::Slice<Device, T, NDIM>()(                                \
-        context->eigen_device<Device>(), result, input, begin, size); \
-    return;                                                           \
+#define HANDLE_DIM(NDIM)                            \
+  if (input_dims == NDIM) {                         \
+    HandleCase<NDIM>(context, begin, size, result); \
+    return;                                         \
   }
+
       HANDLE_DIM(1);
       HANDLE_DIM(2);
       HANDLE_DIM(3);
       HANDLE_DIM(4);
       HANDLE_DIM(5);
       HANDLE_DIM(6);
+      HANDLE_DIM(7);
 
 #undef HANDLE_DIM
 
-      // handle cases which dim >= 7
-      functor::Slice<Device, T, 7>()(
-          context->eigen_device<Device>(), result, input, begin, size);
+      OP_REQUIRES(
+          context, false,
+          errors::Unimplemented("SliceOp : Unhandled input dimensions"));
     }
   }
+
+ private:
+  template <int NDIM>
+  void HandleCase(OpKernelContext* context, const gtl::ArraySlice<int64>& begin,
+                  const gtl::ArraySlice<int64>& size, Tensor* result) {
+    Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
+    Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes;
+    for (int i = 0; i < NDIM; ++i) {
+      indices[i] = begin[i];
+      sizes[i] = size[i];
+    }
+
+    functor::Slice<Device, T, NDIM>()(
+        context->eigen_device<Device>(), result->tensor<T, NDIM>(),
+        context->input(0).tensor<T, NDIM>(), indices, sizes);
+  }
 };
 
 #ifdef INTEL_MKL
@@ -248,16 +264,11 @@ class MklSliceOp : public OpKernel {
         }
         return;
       }
-      // Special case for handling 4-D tensor slice.
-      if (input_dims == 4) {
-        HandleCase4D(context, begin, size, result);
-      } else {
-#define HANDLE_DIM(NDIM)                                                  \
-      if (input_dims == NDIM) {                                           \
-        functor::Slice<Device, T, NDIM>()(                                \
-            context->eigen_device<Device>(), result, input, begin, size); \
-            return;                                                       \
-      }
+#define HANDLE_DIM(NDIM)                            \
+  if (input_dims == NDIM) {                         \
+    HandleCase<NDIM>(context, begin, size, result); \
+    return;                                         \
+  }
 
       HANDLE_DIM(1);
       HANDLE_DIM(2);
@@ -265,13 +276,13 @@ class MklSliceOp : public OpKernel {
       HANDLE_DIM(4);
       HANDLE_DIM(5);
       HANDLE_DIM(6);
+      HANDLE_DIM(7);
 
 #undef HANDLE_DIM
 
-        // handle cases which dim >= 7
-        functor::Slice<Device, T, 7>()(
-          context->eigen_device<Device>(), result, input, begin, size);
-      }
+      OP_REQUIRES(
+          context, false,
+          errors::Unimplemented("SliceOp : Unhandled input dimensions"));
     }
   }
 
@@ -282,9 +293,9 @@ class MklSliceOp : public OpKernel {
   // as the sizes of all the dimensions of the input except slice_dim, then
   // returns True. Otherwise, returns False.
   bool DoesSliceShapeDifferInOnly1DHelper(const TensorShape& input_shape,
-                          const gtl::ArraySlice<int64>& begin,
-                          const gtl::ArraySlice<int64>& size,
-                          int slice_dim) {
+                                          const gtl::ArraySlice<int64>& begin,
+                                          const gtl::ArraySlice<int64>& size,
+                                          int slice_dim) {
     for (int dim = 0; dim < 4; dim++) {
       if (dim != slice_dim &&
           (begin[dim] != 0 || size[dim] != input_shape.dim_size(dim))) {
@@ -306,9 +317,9 @@ class MklSliceOp : public OpKernel {
   // Returns True if Slicing over a single dimension, and sets slice_dim
   // to the number of the dimension that satisfies criteria.
   bool DoesSliceShapeDifferInOnly1D(const TensorShape& input_shape,
-                          const gtl::ArraySlice<int64>& begin,
-                          const gtl::ArraySlice<int64>& size,
-                          int* slice_dim) {
+                                    const gtl::ArraySlice<int64>& begin,
+                                    const gtl::ArraySlice<int64>& size,
+                                    int* slice_dim) {
     for (int dim = 0; dim < 4; dim++) {
       if (DoesSliceShapeDifferInOnly1DHelper(input_shape, begin, size, dim)) {
         *slice_dim = dim;
@@ -318,8 +329,8 @@ class MklSliceOp : public OpKernel {
     return false;
   }
 
-  void HandleCase4D(OpKernelContext* context,
-                  const gtl::ArraySlice<int64>& begin,
+  template <int NDIM>
+  void HandleCase(OpKernelContext* context, const gtl::ArraySlice<int64>& begin,
                   const gtl::ArraySlice<int64>& size, Tensor* result) {
     int slice_dim = -1;
     TensorShape in_shape = context->input(0).shape();
@@ -327,72 +338,77 @@ class MklSliceOp : public OpKernel {
     // differs from the input tensor in only 1 out of 4 dimensions.
     // This case arises in the context of Slice of 4-D tensor in NHWC or NCHW
     // format over channel dimension.
-    if (DoesSliceShapeDifferInOnly1D(in_shape, begin, size, &slice_dim)) {
-        size_t in_strides[4] = { (size_t) in_shape.dim_size(1) *
-                                          in_shape.dim_size(2) *
-                                          in_shape.dim_size(3),
-                                 (size_t) in_shape.dim_size(2) *
-                                          in_shape.dim_size(3),
-                                 (size_t) in_shape.dim_size(3),
-                                 (size_t) 1
-                               };
-
-        size_t out_strides[4] = { (size_t) size[1] * size[2] * size[3],
-                                  (size_t) size[2] * size[3],
-                                  (size_t) size[3],
-                                  (size_t) 1 };
-
-        T *in_buf = const_cast<T*>(const_cast<const T*>(
-                    context->input(0).flat<T>().data()));
-        T *op_buf = result->flat<T>().data();
-
-        if (slice_dim == 1) {
-          /* data format = NCHW */
-
-          #pragma omp parallel for
-          for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
-              T *ip  = in_buf + (d0 * in_strides[0]);
-              T *op  = op_buf + ((d0 - begin[0]) * out_strides[0]);
-            #pragma omp parallel for
-            for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
-              T *ip1 = ip + (d1 * in_strides[1]);
-              T *op1 = op + ((d1 - begin[1]) * out_strides[1]);
-              // For NCHW, H and W will be contiguous. So we can copy
-              // both with one memcpy.
-              memcpy(static_cast<void*>(op1), static_cast<void*>(ip1),
-                     sizeof(T) * in_strides[1]);
-            }
+    if (NDIM == 4 &&
+        DoesSliceShapeDifferInOnly1D(in_shape, begin, size, &slice_dim)) {
+      size_t in_strides[4] = {
+          (size_t)in_shape.dim_size(1) * in_shape.dim_size(2) *
+              in_shape.dim_size(3),
+          (size_t)in_shape.dim_size(2) * in_shape.dim_size(3),
+          (size_t)in_shape.dim_size(3), (size_t)1};
+
+      size_t out_strides[4] = {(size_t)size[1] * size[2] * size[3],
+                               (size_t)size[2] * size[3], (size_t)size[3],
+                               (size_t)1};
+
+      T* in_buf = const_cast<T*>(
+          const_cast<const T*>(context->input(0).flat<T>().data()));
+      T* op_buf = result->flat<T>().data();
+
+      if (slice_dim == 1) {
+        /* data format = NCHW */
+
+#pragma omp parallel for
+        for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
+          T* ip = in_buf + (d0 * in_strides[0]);
+          T* op = op_buf + ((d0 - begin[0]) * out_strides[0]);
+#pragma omp parallel for
+          for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
+            T* ip1 = ip + (d1 * in_strides[1]);
+            T* op1 = op + ((d1 - begin[1]) * out_strides[1]);
+            // For NCHW, H and W will be contiguous. So we can copy
+            // both with one memcpy.
+            memcpy(static_cast<void*>(op1), static_cast<void*>(ip1),
+                   sizeof(T) * in_strides[1]);
           }
-          return;
-        } else if (slice_dim == 3) {
-          /* data_format = NHWC */
-
-          #pragma omp parallel for
-          for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
-              T *ip = in_buf + (d0 * in_strides[0]);
-              T *op = op_buf + ((d0 - begin[0]) * out_strides[0]);
-            #pragma omp parallel for
-            for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
-              T *ip1 = ip + (d1 * in_strides[1]);
-              T *op1 = op + ((d1 - begin[1]) * out_strides[1]);
-              #pragma omp parallel for
-              for (size_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) {
-                T *ip2 = ip1 + (d2 * in_strides[2]);
-                T *ip3 = ip2 + begin[3];
-                T *op2 = op1 + ((d2 - begin[2]) * out_strides[2]);
-                T *op3 = op2;
-                memcpy(static_cast<void*>(op3), static_cast<void*>(ip3),
-                       sizeof(T) * size[3]);
-              }
+        }
+        return;
+      } else if (slice_dim == 3) {
+        /* data_format = NHWC */
+
+#pragma omp parallel for
+        for (size_t d0 = begin[0]; d0 < begin[0] + size[0]; d0++) {
+          T* ip = in_buf + (d0 * in_strides[0]);
+          T* op = op_buf + ((d0 - begin[0]) * out_strides[0]);
+#pragma omp parallel for
+          for (size_t d1 = begin[1]; d1 < begin[1] + size[1]; d1++) {
+            T* ip1 = ip + (d1 * in_strides[1]);
+            T* op1 = op + ((d1 - begin[1]) * out_strides[1]);
+#pragma omp parallel for
+            for (size_t d2 = begin[2]; d2 < begin[2] + size[2]; d2++) {
+              T* ip2 = ip1 + (d2 * in_strides[2]);
+              T* ip3 = ip2 + begin[3];
+              T* op2 = op1 + ((d2 - begin[2]) * out_strides[2]);
+              T* op3 = op2;
+              memcpy(static_cast<void*>(op3), static_cast<void*>(ip3),
+                     sizeof(T) * size[3]);
             }
           }
-          return;
         }
-        // slice_dim is not 1 or 3, then we fallback to Eigen implementation.
+        return;
+      }
+      // slice_dim is not 1 or 3, then we fallback to Eigen implementation.
+    }
+
+    Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
+    Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes;
+    for (int i = 0; i < NDIM; ++i) {
+      indices[i] = begin[i];
+      sizes[i] = size[i];
     }
 
-    functor::Slice<Device, T, 4>()(
-        context->eigen_device<Device>(), result, context->input(0), begin, size);
+    functor::Slice<Device, T, NDIM>()(
+        context->eigen_device<Device>(), result->tensor<T, NDIM>(),
+        context->input(0).tensor<T, NDIM>(), indices, sizes);
   }
 };
 #endif
@@ -400,13 +416,13 @@ class MklSliceOp : public OpKernel {
 // Forward declarations of the functor specializations for declared in the
 // sharded source files.
 namespace functor {
-#define DECLARE_CPU_SPEC(T, NDIM)                        \
-  template <>                                            \
-  void Slice<CPUDevice, T, NDIM>::operator()(            \
-      const CPUDevice& d, Tensor* output,                \
-      const Tensor& input,                               \
-      const gtl::ArraySlice<int64>& slice_indices,       \
-      const gtl::ArraySlice<int64>& slice_sizes);        \
+#define DECLARE_CPU_SPEC(T, NDIM)                                  \
+  template <>                                                      \
+  void Slice<CPUDevice, T, NDIM>::operator()(                      \
+      const CPUDevice& d, typename TTypes<T, NDIM>::Tensor output, \
+      typename TTypes<T, NDIM>::ConstTensor input,                 \
+      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices,       \
+      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes);        \
   extern template struct Slice<CPUDevice, T, NDIM>;
 
 #define DECLARE_FOR_N(T)  \
@@ -419,7 +435,6 @@ namespace functor {
   DECLARE_CPU_SPEC(T, 7);
 
 TF_CALL_ALL_TYPES(DECLARE_FOR_N);
-DECLARE_FOR_N(bfloat16);
 
 #undef DECLARE_FOR_N
 #undef DECLARE_CPU_SPEC
@@ -436,7 +451,6 @@ DECLARE_FOR_N(bfloat16);
 
 TF_CALL_POD_STRING_TYPES(REGISTER_SLICE);
 TF_CALL_QUANTIZED_TYPES(REGISTER_SLICE);
-REGISTER_SLICE(bfloat16);
 #undef REGISTER_SLICE
 #else
 #define REGISTER_SLICE(type)                             \
@@ -449,21 +463,19 @@ REGISTER_SLICE(bfloat16);
 
 TF_CALL_POD_STRING_TYPES(REGISTER_SLICE);
 TF_CALL_QUANTIZED_TYPES(REGISTER_SLICE);
-REGISTER_SLICE(bfloat16);
 #undef REGISTER_SLICE
 #endif  // INTEL_MKL
 
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T, NDIM)                        \
-  template <>                                            \
-  void Slice<GPUDevice, T, NDIM>::operator()(            \
-      const GPUDevice& d,                                \
-      Tensor* output,                                    \
-      const Tensor& input,                               \
-      const gtl::ArraySlice<int64>& slice_indices,       \
-      const gtl::ArraySlice<int64>& slice_sizes);        \
+#define DECLARE_GPU_SPEC(T, NDIM)                                  \
+  template <>                                                      \
+  void Slice<GPUDevice, T, NDIM>::operator()(                      \
+      const GPUDevice& d, typename TTypes<T, NDIM>::Tensor output, \
+      typename TTypes<T, NDIM>::ConstTensor input,                 \
+      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices,       \
+      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes);        \
   extern template struct Slice<GPUDevice, T, NDIM>;
 
 #define DECLARE_FOR_N(T)  \
@@ -478,6 +490,7 @@ namespace functor {
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N);
 TF_CALL_complex64(DECLARE_FOR_N);
 TF_CALL_complex128(DECLARE_FOR_N);
+TF_CALL_bfloat16(DECLARE_FOR_N);
 DECLARE_FOR_N(int32);
 
 #undef DECLARE_FOR_N
@@ -496,6 +509,7 @@ DECLARE_FOR_N(int32);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
 TF_CALL_complex64(REGISTER_GPU);
 TF_CALL_complex128(REGISTER_GPU);
+TF_CALL_bfloat16(REGISTER_GPU);
 
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
@@ -517,14 +531,13 @@ REGISTER_KERNEL_BUILDER(Name("Slice")
 #ifdef TENSORFLOW_USE_SYCL
 // Forward declarations of the functor specializations for SYCL.
 namespace functor {
-#define DECLARE_SYCL_SPEC(T, NDIM)                       \
-  template <>                                            \
-  void Slice<SYCLDevice, T, NDIM>::operator()(           \
-      const SYCLDevice& d,                               \
-      Tensor* output,                                    \
-      const Tensor& input,                               \
-      const gtl::ArraySlice<int64>& slice_indices,       \
-      const gtl::ArraySlice<int64>& slice_sizes);        \
+#define DECLARE_SYCL_SPEC(T, NDIM)                                  \
+  template <>                                                       \
+  void Slice<SYCLDevice, T, NDIM>::operator()(                      \
+      const SYCLDevice& d, typename TTypes<T, NDIM>::Tensor output, \
+      typename TTypes<T, NDIM>::ConstTensor input,                  \
+      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices,        \
+      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes);         \
   extern template struct Slice<SYCLDevice, T, NDIM>;
 
 #define DECLARE_FOR_N(T)   \
diff --git a/tensorflow/core/kernels/slice_op.h b/tensorflow/core/kernels/slice_op.h
index 5fd6ce4067a60c4a3446abc98bf58d6c12a75124..db7eded745eb0d3c880dc46d164aad31b2531829 100644
--- a/tensorflow/core/kernels/slice_op.h
+++ b/tensorflow/core/kernels/slice_op.h
@@ -19,104 +19,31 @@ limitations under the License.
 // Functor definition for SliceOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/ops_util.h"
 
 namespace tensorflow {
-
-namespace internal {
-
-template <typename Device, typename T>
-void SliceSimple(const Device& d, Tensor* out, const Tensor& in,
-                 const gtl::ArraySlice<int64>& slice_indices);
-template <typename Device, typename T>
-void SliceSimpleGpu(const Device& d, Tensor* out, const Tensor& in,
-                 const gtl::ArraySlice<int64>& slice_indices);
-
-template <typename Device, typename T>
-void SliceSimple(const Device& d, Tensor* out, const Tensor& in,
-                 const gtl::ArraySlice<int64>& slice_indices) {
-  const int ndims = in.dims();
-  const int64 nelem = out->NumElements();
-  const gtl::InlinedVector<int64, 8> in_strides = ComputeStride<int64>(in.shape());
-  const gtl::InlinedVector<int64, 8> out_strides = ComputeStride<int64>(out->shape());
-  const T* p = in.flat<T>().data();
-  T* q = out->flat<T>().data();
-
-  std::vector<int64> i_idx(nelem, 0);
-  std::vector<int64> t(nelem, 0);
-
-  for (int64 o_idx = 0; o_idx < nelem; ++o_idx) {
-    t[o_idx] = o_idx;
-  }
-  for (int i = 0; i < ndims; ++i) {
-    int64 n = (nelem + 7) / 8;
-    int64 o_idx = 0;
-    switch (nelem % 8) {
-#define CALC_INPUT_IDX                                                            \
-  i_idx[o_idx] += (t[o_idx] / out_strides[i] + slice_indices[i]) * in_strides[i]; \
-  t[o_idx] %= out_strides[i];                                                     \
-  ++o_idx;
-      case 0: do { CALC_INPUT_IDX;
-      case 7:      CALC_INPUT_IDX;
-      case 6:      CALC_INPUT_IDX;
-      case 5:      CALC_INPUT_IDX;
-      case 4:      CALC_INPUT_IDX;
-      case 3:      CALC_INPUT_IDX;
-      case 2:      CALC_INPUT_IDX;
-      case 1:      CALC_INPUT_IDX;
-#undef CALC_INPUT_IDX
-              } while (--n > 0);
-    }
-  }
-  for (int64 o_idx = 0; o_idx < nelem; ++o_idx) {
-    q[o_idx] = p[i_idx[o_idx]];
-  }
-}
-
-template <typename Device, typename T, int NDIMS>
-void SliceUsingEigen(const Device& d, Tensor* out, const Tensor& in,
-                 const gtl::ArraySlice<int64>& slice_indices,
-                 const gtl::ArraySlice<int64>& slice_sizes) {
-  auto input = in.tensor<T, NDIMS>();
-  auto output = out->tensor<T, NDIMS>();
-  Eigen::DSizes<int, NDIMS> indices;
-  for (int i = 0; i < NDIMS; ++i) {
-    indices[i] = slice_indices[i];
-  }
-  Eigen::DSizes<int, NDIMS> sizes;
-  for (int i = 0; i < NDIMS; ++i) {
-    sizes[i] = slice_sizes[i];
-  }
-  const bool use_64bit = input.size() > Eigen::NumTraits<int>::highest();
-  if (!use_64bit &&
-      Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
-    To32Bit(output).device(d) = To32Bit(input).slice(indices, sizes);
-  } else {
-    output.device(d) = input.slice(indices, sizes);
-  }
-}
-
-} // namespace internal
-
 namespace functor {
 
-// Template parameter NDIM is not neccesary here. The aim of keeping it
-// is to compile struct slice separately which minimizes the compiling time.
-template <typename Device, typename T, int NDIM>
+template <typename Device, typename T, int NDIMS>
 struct Slice {
-  void operator()(const Device& d, Tensor* out, const Tensor& in,
-                  const gtl::ArraySlice<int64>& slice_indices,
-                  const gtl::ArraySlice<int64>& slice_sizes) {
-    if (in.dims() == NDIM) {
-        internal::SliceUsingEigen<Device, T, NDIM>(d, out, in, slice_indices, slice_sizes);
+  void operator()(const Device& d, typename TTypes<T, NDIMS>::Tensor output,
+                  typename TTypes<T, NDIMS>::ConstTensor input,
+                  const Eigen::DSizes<Eigen::DenseIndex, NDIMS>& slice_indices,
+                  const Eigen::DSizes<Eigen::DenseIndex, NDIMS>& slice_sizes) {
+    bool use_64bit = (input.size() > Eigen::NumTraits<int>::highest());
+    if (!use_64bit &&
+        Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
+      Eigen::DSizes<int, NDIMS> indices;
+      for (int i = 0; i < NDIMS; ++i) {
+        indices[i] = slice_indices[i];
+      }
+      Eigen::DSizes<int, NDIMS> sizes;
+      for (int i = 0; i < NDIMS; ++i) {
+        sizes[i] = slice_sizes[i];
+      }
+      To32Bit(output).device(d) = To32Bit(input).slice(indices, sizes);
     } else {
-        if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
-          internal::SliceSimpleGpu<Device, T>(d, out, in, slice_indices);
-        } else {
-          internal::SliceSimple<Device, T>(d, out, in, slice_indices);
-        }
+      output.device(d) = input.slice(slice_indices, slice_sizes);
     }
   }
 };
diff --git a/tensorflow/core/kernels/slice_op_cpu_impl.h b/tensorflow/core/kernels/slice_op_cpu_impl.h
index a70805658e8134ded229aa44ed86bb63762ab8b0..64b6948190a23b554582975d38dae8be638840fa 100644
--- a/tensorflow/core/kernels/slice_op_cpu_impl.h
+++ b/tensorflow/core/kernels/slice_op_cpu_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SLICE_OP_CPU_IMPL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SLICE_OP_CPU_IMPL_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SLICE_OP_CPU_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_SLICE_OP_CPU_IMPL_H_
 
 #define EIGEN_USE_THREADS
 
@@ -30,7 +30,6 @@ using CpuDevice = Eigen::ThreadPoolDevice;
   template struct functor::Slice<CpuDevice, T, CPU_PROVIDED_IXDIM>;
 
 TF_CALL_ALL_TYPES(DEFINE_CPU_KERNELS);
-DEFINE_CPU_KERNELS(bfloat16);
 
 #undef DEFINE_CPU_KERNELS
 
@@ -44,8 +43,8 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_SYCL_KERNELS);
 DEFINE_SYCL_KERNELS(int32);
 
 #undef DEFINE_SYCL_KERNELS
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SLICE_OP_CPU_IMPL_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SLICE_OP_CPU_IMPL_H_
diff --git a/tensorflow/core/kernels/slice_op_gpu.cu.cc b/tensorflow/core/kernels/slice_op_gpu.cu.cc
index 3039b3d777f543e2f24c8ce9e138aa8ebd843090..9d51f8978c0a24afb2f98845a4de4e8b51a29aeb 100644
--- a/tensorflow/core/kernels/slice_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/slice_op_gpu.cu.cc
@@ -21,65 +21,9 @@ limitations under the License.
 
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
-namespace internal {
-
-template <typename T>
-__global__ void SliceKernel(int nthreads, const T* src, const int32* buf,
-                            const int32 ndims, T* dst) {
-  const int32* in_strides = buf;
-  const int32* out_strides = buf + ndims;
-  const int32* slice_indices = buf + ndims * 2;
-  CUDA_1D_KERNEL_LOOP(o_idx, nthreads) {
-    int32 i_idx = 0;
-    int32 t = o_idx;
-    for (int i = 0; i < ndims; ++i) {
-      i_idx += (t / out_strides[i] + slice_indices[i]) * in_strides[i];
-      t %= out_strides[i];
-    }
-    dst[o_idx] = ldg(src + i_idx);
-  }
-}
-
-template <typename Device, typename T>
-void SliceSimpleGpu(const Device& d, Tensor* out, const Tensor& in,
-                 const gtl::ArraySlice<int64>& slice_indices) {
-  // Ensures we can use 32-bit index.
-  const int64 in_nelem = in.NumElements();
-  CHECK_LT(in_nelem, kint32max) << "Tensor too large to transpose on GPU";
-  const int64 out_nelem = out->NumElements();
-  CHECK_LT(out_nelem, kint32max) << "Tensor too large to transpose on GPU";
-  // Pack strides and slice indices sizes into one buffer.
-  const int32 ndims = in.dims();
-  gtl::InlinedVector<int32, 24> host_buf(ndims * 3);
-  gtl::InlinedVector<int32, 8> in_strides = ComputeStride<int32>(in.shape());
-  gtl::InlinedVector<int32, 8> out_strides = ComputeStride<int32>(out->shape());
-  for (int i = 0; i < ndims; ++i) {
-    host_buf[i] = in_strides[i];
-    host_buf[ndims + i] = out_strides[i];
-    host_buf[ndims * 2 + i] = slice_indices[i];
-  }
-  auto num_bytes = sizeof(int64) * host_buf.size();
-  auto dev_buf = d.allocate(num_bytes);
-  // NOTE: host_buf is not allocated by CudaHostAllocator, and
-  // therefore we are doing a sync copy effectively.
-  d.memcpyHostToDevice(dev_buf, host_buf.data(), num_bytes);
-  // Launch kernel to q[...] = p[...].
-  const T* p = in.flat<T>().data();
-  T* q = out->flat<T>().data();
-  CudaLaunchConfig cfg = GetCudaLaunchConfig(out_nelem, d);
-  SliceKernel<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
-      cfg.virtual_thread_count, p, reinterpret_cast<const int32*>(dev_buf),
-      ndims, q);
-  // Safe to deallocate immediately after the kernel launch.
-  d.deallocate(dev_buf);
-}
-
-} // namespace internal
 
 typedef Eigen::GpuDevice GPUDevice;
 
@@ -95,6 +39,7 @@ typedef Eigen::GpuDevice GPUDevice;
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
 TF_CALL_complex64(DEFINE_GPU_KERNELS);
 TF_CALL_complex128(DEFINE_GPU_KERNELS);
+TF_CALL_bfloat16(DEFINE_GPU_KERNELS);
 DEFINE_GPU_KERNELS(int32);
 
 #undef DEFINE_GPU_KERNELS
diff --git a/tensorflow/core/kernels/snapshot_op.cc b/tensorflow/core/kernels/snapshot_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..50157d5d48f93bfe61cbac95246123ef0a7d446e
--- /dev/null
+++ b/tensorflow/core/kernels/snapshot_op.cc
@@ -0,0 +1,46 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/array_ops.cc.
+#include "tensorflow/core/kernels/snapshot_op.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+
+namespace tensorflow {
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+#define REGISTER_KERNEL(TYPE)                                        \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("Snapshot").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
+      SnapshotOp<CPUDevice, TYPE>);
+
+TF_CALL_POD_TYPES(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+
+#if TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SyclDevice;
+#define REGISTER_SYCL_KERNEL(TYPE)                                    \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("Snapshot").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
+      SnapshotOp<SyclDevice, TYPE>);
+
+TF_CALL_POD_TYPES(REGISTER_SYCL_KERNEL);
+
+#undef REGISTER_SYCL_KERNEL
+#endif  // TENSORFLOW_USE_SYCL
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/snapshot_op.h b/tensorflow/core/kernels/snapshot_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..2c79893b49661519515a7b4a537ff3caeceba2be
--- /dev/null
+++ b/tensorflow/core/kernels/snapshot_op.h
@@ -0,0 +1,49 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_SNAPSHOT_OP_H_
+#define TENSORFLOW_KERNELS_SNAPSHOT_OP_H_
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+template <typename Device, typename Scalar>
+class SnapshotOp : public OpKernel {
+ public:
+  explicit SnapshotOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input.shape(), &output));
+    const Device& device = context->eigen_device<Device>();
+    device.memcpy(output->template flat<Scalar>().data(),
+                  input.template flat<Scalar>().data(),
+                  input.NumElements() * sizeof(Scalar));
+  }
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_SNAPSHOT_OP_H_
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.h b/tensorflow/core/kernels/snapshot_op_gpu.cu.cc
similarity index 52%
rename from tensorflow/compiler/tf2xla/kernels/gather_op.h
rename to tensorflow/core/kernels/snapshot_op_gpu.cu.cc
index df86e1fcdd1a4860ed7ee0c5017d25ccf9d227ea..52070be838d65d21813dfe097db9c395ef5a8448 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op.h
+++ b/tensorflow/core/kernels/snapshot_op_gpu.cu.cc
@@ -12,30 +12,26 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#if GOOGLE_CUDA
 
-// Declaration of the Gather Op using the XLA dynamic slice implementation.
+// See docs in ../ops/array_ops.cc.
+#include "tensorflow/core/kernels/snapshot_op.h"
 
-#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_GATHER_OP_H_
-#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_GATHER_OP_H_
-
-#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
-#include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/util/bcast.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
 
 namespace tensorflow {
+typedef Eigen::GpuDevice GPUDevice;
 
-class GatherOpDynamicSlice : public XlaOpKernel {
- public:
-  explicit GatherOpDynamicSlice(OpKernelConstruction* context);
-
-  void Compile(XlaOpKernelContext* context) override;
+#define REGISTER_KERNEL(TYPE)                                        \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("Snapshot").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
+      SnapshotOp<GPUDevice, TYPE>);
 
- private:
-  TF_DISALLOW_COPY_AND_ASSIGN(GatherOpDynamicSlice);
-};
+TF_CALL_POD_TYPES(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMPILER_TF2XLA_KERNELS_GATHER_OP_H_
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/softmax_op.cc b/tensorflow/core/kernels/softmax_op.cc
index 590f01c4691f479cbf90971b368656ff3c78c91a..e1712ac239d6be2d51b0c0598a799959a8b53a94 100644
--- a/tensorflow/core/kernels/softmax_op.cc
+++ b/tensorflow/core/kernels/softmax_op.cc
@@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // Partial specialization for a CPUDevice, that uses the Eigen implementation
 // from SoftmaxEigenImpl.
@@ -48,7 +48,7 @@ struct SoftmaxFunctor<CPUDevice, T> : SoftmaxFunctorBase<CPUDevice, T> {};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct SoftmaxFunctor<SYCLDevice, T> : SoftmaxFunctorBase<SYCLDevice, T> {};
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace functor
 
 template <typename Device, typename T>
@@ -100,5 +100,5 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("Softmax").Device(DEVICE_SYCL).TypeConstraint<double>("T"),
     SoftmaxOp<SYCLDevice, double>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/softmax_op_functor.h b/tensorflow/core/kernels/softmax_op_functor.h
index 1f38bdce8c3a8f70e89efe62ad6c6f385bb5dfc0..d3a267ed877eedf8ed3845ebd11255f0690b3106 100644
--- a/tensorflow/core/kernels/softmax_op_functor.h
+++ b/tensorflow/core/kernels/softmax_op_functor.h
@@ -64,23 +64,21 @@ struct SoftmaxEigenImpl {
     one_by_class.set(1, num_classes);
 #endif
     // shifted_logits = logits - max(logits along classes);
-    auto shifted_logits = (logits -
-                           logits.maximum(along_class)
-                               .eval()
-                               .reshape(batch_by_one)
-                               .broadcast(one_by_class));
+    auto shifted_logits = (logits - logits.maximum(along_class)
+                                        .eval()
+                                        .reshape(batch_by_one)
+                                        .broadcast(one_by_class));
     if (log) {
       // Calculate the log of the softmax
       // softmax = logits - max(logits along classes);
       softmax.device(d) = shifted_logits;
       // softmax = softmax - log(sum(exp(softmax along classes)));
-      softmax.device(d) = (softmax -
-                           softmax.exp()
-                               .sum(along_class)
-                               .eval()
-                               .reshape(batch_by_one)
-                               .log()
-                               .broadcast(one_by_class));
+      softmax.device(d) = (softmax - softmax.exp()
+                                         .sum(along_class)
+                                         .log()
+                                         .eval()
+                                         .reshape(batch_by_one)
+                                         .broadcast(one_by_class));
     } else {
       // NOTE(touts): If you modify this implementation please run
       // the BM_ImageNetSoftmaxFwd benchmark in nn_ops_test.cc.
@@ -88,12 +86,11 @@ struct SoftmaxEigenImpl {
       // softmax = exp(logits - max(logits along classes));
       softmax.device(d) = shifted_logits.exp();
       // softmax = softmax * (1 / sum(softmax along classes));
-      softmax.device(d) = (softmax *
-                           softmax.sum(along_class)
-                               .inverse()
-                               .eval()
-                               .reshape(batch_by_one)
-                               .broadcast(one_by_class));
+      softmax.device(d) = (softmax * softmax.sum(along_class)
+                                         .inverse()
+                                         .eval()
+                                         .reshape(batch_by_one)
+                                         .broadcast(one_by_class));
     }
   }
 };
diff --git a/tensorflow/core/kernels/spacetobatch_benchmark_test.cc b/tensorflow/core/kernels/spacetobatch_benchmark_test.cc
index c25ce2d8bb5ee5fe50034e74f0362fd6b0e79589..92ddf8edbfbe5e3c8fbc2c3b5ddeddd847838814 100644
--- a/tensorflow/core/kernels/spacetobatch_benchmark_test.cc
+++ b/tensorflow/core/kernels/spacetobatch_benchmark_test.cc
@@ -70,7 +70,7 @@ static Graph* ConstructSpaceToBatchGraph(
   }                                                                                                     \
   BENCHMARK(                                                                                            \
       BM_##OP##_##DEVICE##_##DTYPE##_##B##_##H##_##W##_##D##_bs##BS##_pad##P00##_##P01##_##P10##_##P11);
-#define BM_SpaceToBatch(OP, ...)                      \
+#define BM_SpaceToBatch(OP, ...)                                 \
   BM_Expand(BM_SpaceToBatchDev(OP, cpu, DT_FLOAT, __VA_ARGS__)); \
   BM_Expand(BM_SpaceToBatchDev(OP, gpu, DT_FLOAT, __VA_ARGS__)); \
   BM_Expand(BM_SpaceToBatchDev(OP, cpu, DT_HALF, __VA_ARGS__));  \
diff --git a/tensorflow/core/kernels/spacetobatch_functor.cc b/tensorflow/core/kernels/spacetobatch_functor.cc
index 23d8a5f9ed4483c0e7d5c15108db6cbbdbe0890a..4c374b8d99444023c14fcb4ed770a5c263535be0 100644
--- a/tensorflow/core/kernels/spacetobatch_functor.cc
+++ b/tensorflow/core/kernels/spacetobatch_functor.cc
@@ -154,7 +154,7 @@ struct SpaceToBatchFunctor<CPUDevice, T, NUM_BLOCK_DIMS, B2S> {
 #define INSTANTIATE(NUM_BLOCK_DIMS, T)                                      \
   template struct SpaceToBatchFunctor<CPUDevice, T, NUM_BLOCK_DIMS, false>; \
   template struct SpaceToBatchFunctor<CPUDevice, T, NUM_BLOCK_DIMS, true>;  \
-/**/
+  /**/
 
 #define INSTANTIATE_FOR_T(T) \
   TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS(INSTANTIATE, T)
diff --git a/tensorflow/core/kernels/spacetobatch_functor.h b/tensorflow/core/kernels/spacetobatch_functor.h
index 06813650c08ec26a38edfe2ba01440a2fb8066fc..f46a84da1e951113382e4d44b44463c2a621ca10 100644
--- a/tensorflow/core/kernels/spacetobatch_functor.h
+++ b/tensorflow/core/kernels/spacetobatch_functor.h
@@ -44,7 +44,7 @@ constexpr int kMaxSpaceToBatchBlockDims = 4;
   MACRO(2 /**/, ##__VA_ARGS__)                              \
   MACRO(3 /**/, ##__VA_ARGS__)                              \
   MACRO(4 /**/, ##__VA_ARGS__)                              \
-/**/
+  /**/
 
 namespace internal {
 namespace spacetobatch {
diff --git a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc
index db8d419c38ff5f8a06a1aafde14076b55b7c75e6..5687141c9eaeec11498c1d2cc954155bd9e05856 100644
--- a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc
@@ -141,10 +141,10 @@ struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, B2S> {
     }
     CudaLaunchConfig config =
         GetCudaLaunchConfig(static_cast<int32>(total_count), d);
-    S2B<T, NUM_BLOCK_DIMS,
-        B2S><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-        config.virtual_thread_count, const_cast<T*>(space_tensor.data()), args,
-        const_cast<T*>(batch_tensor.data()));
+    S2B<T, NUM_BLOCK_DIMS, B2S>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, const_cast<T*>(space_tensor.data()),
+            args, const_cast<T*>(batch_tensor.data()));
     return Status::OK();
   }
 };
@@ -153,7 +153,7 @@ struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, B2S> {
 #define INSTANTIATE(NUM_BLOCK_DIMS, T)                                      \
   template struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, false>; \
   template struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, true>;  \
-/**/
+  /**/
 
 #define INSTANTIATE_FOR_T(T) \
   TF_SPACETOBATCH_FOR_EACH_NUM_BLOCK_DIMS(INSTANTIATE, T)
diff --git a/tensorflow/core/kernels/spacetobatch_op.cc b/tensorflow/core/kernels/spacetobatch_op.cc
index 95c1f5e7e8ca978fda334396538de0cf4ed5b774..fdc08ec8e3bfd128a3e341efab8e5ba319c90e4f 100644
--- a/tensorflow/core/kernels/spacetobatch_op.cc
+++ b/tensorflow/core/kernels/spacetobatch_op.cc
@@ -58,9 +58,10 @@ void SpaceToBatchOpCompute(OpKernelContext* context,
       errors::InvalidArgument("input rank should be >= ", 1 + block_dims,
                               " instead of ", orig_input_tensor.dims()));
 
-  OP_REQUIRES(context, TensorShapeUtils::IsMatrix(orig_paddings.shape()) &&
-                           block_dims == orig_paddings.dim_size(0) &&
-                           2 == orig_paddings.dim_size(1),
+  OP_REQUIRES(context,
+              TensorShapeUtils::IsMatrix(orig_paddings.shape()) &&
+                  block_dims == orig_paddings.dim_size(0) &&
+                  2 == orig_paddings.dim_size(1),
               errors::InvalidArgument("paddings should have shape [",
                                       block_dims, ", 2] instead of ",
                                       orig_paddings.shape().DebugString()));
diff --git a/tensorflow/core/kernels/sparse_add_grad_op.cc b/tensorflow/core/kernels/sparse_add_grad_op.cc
index d8ed0c6f0c20d13d5e7870159ed1569514333c5e..8597f3a8f7307584d27a265bc8df8949f20898b6 100644
--- a/tensorflow/core/kernels/sparse_add_grad_op.cc
+++ b/tensorflow/core/kernels/sparse_add_grad_op.cc
@@ -35,9 +35,10 @@ class SparseAddGradOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->input("b_indices", &b_indices));
     OP_REQUIRES_OK(ctx, ctx->input("sum_indices", &sum_indices));
 
-    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_indices->shape()) &&
-                         TensorShapeUtils::IsMatrix(b_indices->shape()) &&
-                         TensorShapeUtils::IsMatrix(sum_indices->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsMatrix(a_indices->shape()) &&
+                    TensorShapeUtils::IsMatrix(b_indices->shape()) &&
+                    TensorShapeUtils::IsMatrix(sum_indices->shape()),
                 errors::InvalidArgument(
                     "Input indices should be matrices but received shapes: ",
                     a_indices->shape().DebugString(), " and ",
@@ -49,8 +50,9 @@ class SparseAddGradOp : public OpKernel {
             "Input backprop_val_grad should be a vector but received shape: ",
             backprop_val_grad->shape().DebugString()));
     OP_REQUIRES(
-        ctx, a_indices->dim_size(1) == b_indices->dim_size(1) &&
-                 b_indices->dim_size(1) == sum_indices->dim_size(1),
+        ctx,
+        a_indices->dim_size(1) == b_indices->dim_size(1) &&
+            b_indices->dim_size(1) == sum_indices->dim_size(1),
         errors::InvalidArgument("The densified operands should have the same "
                                 "ndims; for A, B, sum got: ",
                                 a_indices->dim_size(1), b_indices->dim_size(1),
diff --git a/tensorflow/core/kernels/sparse_add_op.cc b/tensorflow/core/kernels/sparse_add_op.cc
index bd91dfdce64cbfc697345e0f0c7278de938ecc5b..d16317af671dd6592d3e30ac52941508c4ffd088 100644
--- a/tensorflow/core/kernels/sparse_add_op.cc
+++ b/tensorflow/core/kernels/sparse_add_op.cc
@@ -34,8 +34,9 @@ class SparseAddOp : public OpKernel {
 
     OP_REQUIRES_OK(ctx, ctx->input("a_indices", &a_indices));
     OP_REQUIRES_OK(ctx, ctx->input("b_indices", &b_indices));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_indices->shape()) &&
-                         TensorShapeUtils::IsMatrix(b_indices->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsMatrix(a_indices->shape()) &&
+                    TensorShapeUtils::IsMatrix(b_indices->shape()),
                 errors::InvalidArgument(
                     "Input indices should be matrices but received shapes: ",
                     a_indices->shape().DebugString(), " and ",
@@ -46,8 +47,9 @@ class SparseAddOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->input("a_values", &a_values_t));
     OP_REQUIRES_OK(ctx, ctx->input("b_values", &b_values_t));
 
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_values_t->shape()) &&
-                         TensorShapeUtils::IsVector(b_values_t->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(a_values_t->shape()) &&
+                    TensorShapeUtils::IsVector(b_values_t->shape()),
                 errors::InvalidArgument(
                     "Input values should be vectors but received shapes: ",
                     a_values_t->shape().DebugString(), " and ",
@@ -62,8 +64,9 @@ class SparseAddOp : public OpKernel {
 
     OP_REQUIRES_OK(ctx, ctx->input("a_shape", &a_shape));
     OP_REQUIRES_OK(ctx, ctx->input("b_shape", &b_shape));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_shape->shape()) &&
-                         TensorShapeUtils::IsVector(b_shape->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(a_shape->shape()) &&
+                    TensorShapeUtils::IsVector(b_shape->shape()),
                 errors::InvalidArgument(
                     "Input shapes should be a vector but received shapes ",
                     a_shape->shape().DebugString(), " and ",
diff --git a/tensorflow/core/kernels/sparse_add_op_test.cc b/tensorflow/core/kernels/sparse_add_op_test.cc
index 4cad02bbee8dd20328bac3ec24074c22493009b8..1f08e6c5ce2e8a40cf464760434f9161015b643c 100644
--- a/tensorflow/core/kernels/sparse_add_op_test.cc
+++ b/tensorflow/core/kernels/sparse_add_op_test.cc
@@ -61,9 +61,9 @@ TEST_F(SparseAddOpTest, TwoD_AddSparseTensorWithSelf) {
   // [3   4]
 
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
 #define ADD_TENSOR_INPUT()                                  \
diff --git a/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc b/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc
index c122616cf15b8567494a604337951c8d278f5ead..80bc1f19344dffadaae864f64c98d1f15addd1fb 100644
--- a/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc
+++ b/tensorflow/core/kernels/sparse_conditional_accumulator_op.cc
@@ -103,8 +103,9 @@ class SparseAccumulatorTakeGradientOp
                       DoneCallback callback) override {
     // Check signature
     OP_REQUIRES_OK_ASYNC(
-        ctx, ctx->MatchSignature({DT_STRING_REF, DT_INT32},
-                                 {DT_INT64, accumulator->dtype(), DT_INT64}),
+        ctx,
+        ctx->MatchSignature({DT_STRING_REF, DT_INT32},
+                            {DT_INT64, accumulator->dtype(), DT_INT64}),
         callback);
   }
 
diff --git a/tensorflow/core/kernels/sparse_cross_op.cc b/tensorflow/core/kernels/sparse_cross_op.cc
index 07d935d55fe06150309736ba0fec88091ed007c6..7cd4532ad63812d905ceb6b96291aa50293070ef 100644
--- a/tensorflow/core/kernels/sparse_cross_op.cc
+++ b/tensorflow/core/kernels/sparse_cross_op.cc
@@ -288,8 +288,7 @@ struct CrossTraits<true, int64> {
 template <bool HASHED_OUTPUT, typename InternalType>
 class SparseCrossOp : public OpKernel {
  public:
-  explicit SparseCrossOp(OpKernelConstruction* context)
-      : OpKernel(context) {
+  explicit SparseCrossOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("num_buckets", &num_buckets_));
     // Read signed_hash_key_ as int64 since uint64 attributes are not
     // supported by REGISTER_OP.
@@ -316,8 +315,8 @@ class SparseCrossOp : public OpKernel {
         GenerateColumnsFromInput(indices_list_in, values_list_in,
                                  shapes_list_in, dense_list_in);
 
-    typename CrossTraits<HASHED_OUTPUT, InternalType>::Crosser
-        crosser(columns, num_buckets_, hash_key_);
+    typename CrossTraits<HASHED_OUTPUT, InternalType>::Crosser crosser(
+        columns, num_buckets_, hash_key_);
     Tensor* indices_out;
     Tensor* values_out;
     Tensor* shape_out;
@@ -326,8 +325,8 @@ class SparseCrossOp : public OpKernel {
     CreateOutputTensors(columns, batch_size, context, &indices_out, &values_out,
                         &shape_out, &output_start_indices);
 
-    typename CrossTraits<HASHED_OUTPUT, InternalType>::Updater
-        updater(output_start_indices, indices_out, values_out);
+    typename CrossTraits<HASHED_OUTPUT, InternalType>::Updater updater(
+        output_start_indices, indices_out, values_out);
     auto do_work = [this, &columns, crosser, updater](int64 begin, int64 end) {
       for (int b = begin; b < end; b++) {
         ProductIterator<InternalType> product_iterator(columns, b);
@@ -381,8 +380,9 @@ class SparseCrossOp : public OpKernel {
               "Input values should be a std::vector but received shape ",
               values_list_in[i].shape().DebugString(), " at position ", i));
       OP_REQUIRES(
-          context, indices_list_in[i].shape().dim_size(0) ==
-                       values_list_in[i].shape().dim_size(0),
+          context,
+          indices_list_in[i].shape().dim_size(0) ==
+              values_list_in[i].shape().dim_size(0),
           errors::InvalidArgument(
               "Expected size of values to be ",
               indices_list_in[i].shape().dim_size(0), " got ",
diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
index cc0f86ce05e613767b22d51875f90e8391504bdb..ac48202ada2204ea36478257630f20f7892be50b 100644
--- a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
+++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
@@ -70,8 +70,9 @@ class SparseDenseBinaryOpShared : public OpKernel {
                 errors::InvalidArgument(
                     "Input sp_indices should be a matrix but received shape: ",
                     indices_t->shape().DebugString()));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(values_t->shape()) &&
-                         TensorShapeUtils::IsVector(shape_t->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(values_t->shape()) &&
+                    TensorShapeUtils::IsVector(shape_t->shape()),
                 errors::InvalidArgument(
                     "Inputs sp_values and sp_shape should be vectors "
                     "but received shapes: ",
@@ -150,8 +151,9 @@ class SparseDenseBinaryOpShared : public OpKernel {
       CASE(4);
       CASE(5);
       default:
-        OP_REQUIRES(ctx, false, errors::InvalidArgument(
-                                    "Only tensors with ranks between 1 and 5 "
+        OP_REQUIRES(
+            ctx, false,
+            errors::InvalidArgument("Only tensors with ranks between 1 and 5 "
                                     "are currently supported.  Tensor rank: ",
                                     ndims));
 #undef CASE
diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
index eaf1884243ec19689af783e29adaee886e7498d6..fe198af7e6c131ab19daf877063a2a6838d1f2c7 100644
--- a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
+++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
@@ -96,9 +96,9 @@ TEST_F(SparseDenseCDivTest, SameShape) {
   // [2    ]  cdiv [dense: same shape, all 1's]
   // [3   4]
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   // Tensor dense(DT_FLOAT, TensorShape({3, 1}));
@@ -125,9 +125,9 @@ TEST_F(SparseDenseCDivTest, BroadcastDenseSameDims) {
   // [2    ]  cdiv [dense: shape [3,1], all 1's]
   // [3   4]
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   Tensor dense(DT_FLOAT, TensorShape({3, 1}));
@@ -152,9 +152,9 @@ TEST_F(SparseDenseCDivTest, BroadcastDenseFewerDims) {
   // [2    ]  cdiv [dense: shape [2]]
   // [3   4]
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   Tensor dense(DT_FLOAT, TensorShape({2}));
@@ -184,9 +184,9 @@ TEST_F(SparseDenseCMulTest, BroadcastDense) {
   // [1   ?]  where ? remains implicitly zero.
   // [1.5 0]
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   Tensor dense(DT_FLOAT, TensorShape({2}));
diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc
index 8ab23b64d3d94c604ae027bbfd75357a4e2e284b..a1f9667b783ca5f455523874bc4e342f1368d4f3 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op.cc
@@ -159,8 +159,8 @@ struct SparseSlice {
 
 template <typename T>
 template <bool Transpose>
-void SparseSlice<T>::Initialize(const typename SparseSlice<T>::ConstMatrixMap& mat,
-                                int col_offset) {
+void SparseSlice<T>::Initialize(
+    const typename SparseSlice<T>::ConstMatrixMap& mat, int col_offset) {
   const int mat_rows = Transpose ? mat.dimension(1) : mat.dimension(0);
   const int mat_cols = Transpose ? mat.dimension(0) : mat.dimension(1);
   DCHECK_LE(num_rows, mat_rows);
@@ -278,9 +278,9 @@ ALWAYS_INLINE float ConvertBfloat16ToFloat(const bfloat16* src) {
   float out = 0;
   auto tmp = reinterpret_cast<bfloat16*>(&out);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    tmp[0] = *src;
+  tmp[0] = *src;
 #else
-    tmp[1] = *src;
+  tmp[1] = *src;
 #endif
   return out;
 }
@@ -970,9 +970,9 @@ class SparseMatMulOp : public OpKernel {
     const int k2 = transpose_b_ ? b.dim_size(1) : b.dim_size(0);
 
     OP_REQUIRES(ctx, k == k2,
-                errors::InvalidArgument("Matrix size incompatible: a: ",
-                                        a.shape().DebugString(), ", b: ",
-                                        b.shape().DebugString()));
+                errors::InvalidArgument(
+                    "Matrix size incompatible: a: ", a.shape().DebugString(),
+                    ", b: ", b.shape().DebugString()));
     Tensor* output = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({m, n}), &output));
 
@@ -1224,8 +1224,9 @@ ALWAYS_INLINE void CopyAndMayBeInterleave(void* dst, const void* src,
 
 template <typename TL, typename TR>
 inline BlockingCounter* SparseMatMul<TL, TR>::ShuffleMatrix(
-    const typename SparseMatMul<TL, TR>::ConstMatrixMapR& mat, int slice_row_start,
-    int slice_num_rows, int slice_col_start, int slice_num_cols, const int N,
+    const typename SparseMatMul<TL, TR>::ConstMatrixMapR& mat,
+    int slice_row_start, int slice_num_rows, int slice_col_start,
+    int slice_num_cols, const int N,
     const DeviceBase::CpuWorkerThreads* thread_pool, MatrixR* buffer) {
   DCHECK_EQ(N % 2, 0);
   DCHECK_LE(kNumOperands * sizeof(float) / sizeof(TR), N);
@@ -1306,8 +1307,9 @@ inline std::unique_ptr<BlockingCounter> SparseMatMul<TL, TR>::CreateDenseSlices(
 template <typename TL, typename TR>
 inline void SparseMatMul<TL, TR>::ComputeBlockSizes(
     const typename SparseMatMul<TL, TR>::ConstMatrixMapL& left,
-    const typename SparseMatMul<TL, TR>::ConstMatrixMapR& right, bool transpose_left,
-    int num_threads, int* KR, int* NR, int* KL, int* JB, int* IB) {
+    const typename SparseMatMul<TL, TR>::ConstMatrixMapR& right,
+    bool transpose_left, int num_threads, int* KR, int* NR, int* KL, int* JB,
+    int* IB) {
   // Heuristics for calculating block sizes
   // Assume two hyperthreads per core.
   const int est_num_cores = std::max(1, (num_threads + 1) / 2);
diff --git a/tensorflow/core/kernels/sparse_matmul_op.h b/tensorflow/core/kernels/sparse_matmul_op.h
index cca52558ae25a7a0840d8551440f68ccc5ec2277..14ef2ed7044a796dff67e287230d955e32ca62cd 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.h
+++ b/tensorflow/core/kernels/sparse_matmul_op.h
@@ -159,25 +159,25 @@ EIGEN_STRONG_INLINE Packet4f pload2bf16<Packet4f>(const float* from) {
 // Return a packet with the first value of the input Packet replicated
 template <>
 EIGEN_STRONG_INLINE Packet4f pbroadcast_first<Packet4f>(const Packet4f& a) {
-  return vec_splat (a, 0);
+  return vec_splat(a, 0);
 }
 
 // Return a packet with the second value of the input Packet replicated
 template <>
 EIGEN_STRONG_INLINE Packet4f pbroadcast_second<Packet4f>(const Packet4f& a) {
-  return vec_splat (a, 1);
+  return vec_splat(a, 1);
 }
 
 // Return a packet with the third value of the input Packet replicated
 template <>
 EIGEN_STRONG_INLINE Packet4f pbroadcast_third<Packet4f>(const Packet4f& a) {
-  return vec_splat (a, 2);
+  return vec_splat(a, 2);
 }
 
 // Return a packet with the fourth value of the input Packet replicated
 template <>
 EIGEN_STRONG_INLINE Packet4f pbroadcast_fourth<Packet4f>(const Packet4f& a) {
-  return vec_splat (a, 3);
+  return vec_splat(a, 3);
 }
 #endif
 
diff --git a/tensorflow/core/kernels/sparse_matmul_op_test.cc b/tensorflow/core/kernels/sparse_matmul_op_test.cc
index f815ca9e344664c4c95befccb88e750eb99d0eaf..ebc6d8fa4ec5422925e57c25856e0007702299b1 100644
--- a/tensorflow/core/kernels/sparse_matmul_op_test.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op_test.cc
@@ -284,11 +284,11 @@ class SparseMatmulOpTest : public ::testing::Test {
       uint16_t* data3_bfloat16_p =
           reinterpret_cast<uint16_t*>(data3_bfloat16) + i;
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-            data3_p[1] = 0;
-            data3_bfloat16_p[0] = data3_p[0];
+      data3_p[1] = 0;
+      data3_bfloat16_p[0] = data3_p[0];
 #else
-            data3_p[0] = 0;
-            data3_bfloat16_p[0] = data3_p[1];
+      data3_p[0] = 0;
+      data3_bfloat16_p[0] = data3_p[1];
 #endif
     }
   }
diff --git a/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc b/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc
index 110376be42573fe31cc1a13306c80e5050477f03..96246c7a71272bf638523fafb548b7e802f09039 100644
--- a/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc
+++ b/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc
@@ -51,9 +51,9 @@ TEST_F(SparseReduceSumOpTest, SimpleReduce) {
   // [3   4]
 
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   AddInputFromArray<int64>(indices_shape, indices);
@@ -93,9 +93,9 @@ TEST_F(SparseReduceSumSparseOpTest, SimpleReduce) {
   // [3   4]
 
   const auto indices_shape = TensorShape({4, 2});
-  std::initializer_list<int64> in{ 0, 1, 1, 0, 2, 0, 2, 1 };
+  std::initializer_list<int64> in{0, 1, 1, 0, 2, 0, 2, 1};
   const gtl::ArraySlice<int64> indices(in);
-  std::initializer_list<int64> sh{ 3, 2 };
+  std::initializer_list<int64> sh{3, 2};
   const gtl::ArraySlice<int64> shape(sh);
 
   AddInputFromArray<int64>(indices_shape, indices);
diff --git a/tensorflow/core/kernels/sparse_softmax_op.cc b/tensorflow/core/kernels/sparse_softmax_op.cc
index 327a94b8a12e1d8568c5ca79263cc6eb78501d15..444a5f657a969290d9cc67d88c500a49a0971282 100644
--- a/tensorflow/core/kernels/sparse_softmax_op.cc
+++ b/tensorflow/core/kernels/sparse_softmax_op.cc
@@ -50,8 +50,9 @@ class SparseSoftmaxOp : public OpKernel {
                 errors::InvalidArgument(
                     "Input sp_indices should be a matrix but received shape: ",
                     indices_t->shape().DebugString()));
-    OP_REQUIRES(context, TensorShapeUtils::IsVector(values_t->shape()) &&
-                             TensorShapeUtils::IsVector(shape_t->shape()),
+    OP_REQUIRES(context,
+                TensorShapeUtils::IsVector(values_t->shape()) &&
+                    TensorShapeUtils::IsVector(shape_t->shape()),
                 errors::InvalidArgument(
                     "Inputs sp_values and sp_shape should be vectors "
                     "but received shapes: ",
diff --git a/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc b/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc
index b027adba6b384c63d119387b5b13122fb1c25b12..09cb2a6a71c7c0f0ebc9cbc2e7b1951705890a41 100644
--- a/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc
+++ b/tensorflow/core/kernels/sparse_sparse_binary_op_shared.cc
@@ -132,14 +132,16 @@ class SparseSparseBinaryOpShared : public OpKernel {
 
     // Validations.
     OP_REQUIRES(
-        ctx, TensorShapeUtils::IsMatrix(a_indices_t->shape()) &&
-                 TensorShapeUtils::IsMatrix(b_indices_t->shape()),
+        ctx,
+        TensorShapeUtils::IsMatrix(a_indices_t->shape()) &&
+            TensorShapeUtils::IsMatrix(b_indices_t->shape()),
         errors::InvalidArgument("Inputs a_indices and b_indices should be "
                                 "matrices but received shapes: ",
                                 a_indices_t->shape().DebugString(), ", ",
                                 b_indices_t->shape().DebugString()));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_values_t->shape()) &&
-                         TensorShapeUtils::IsVector(b_values_t->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(a_values_t->shape()) &&
+                    TensorShapeUtils::IsVector(b_values_t->shape()),
                 errors::InvalidArgument(
                     "Inputs a_values and b_values should be vectors "
                     "but received shapes: ",
@@ -157,8 +159,9 @@ class SparseSparseBinaryOpShared : public OpKernel {
                                 " non-empty input values, got ",
                                 a_values.size(), " and ", b_values.size()));
 
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(a_shape_t->shape()) &&
-                         TensorShapeUtils::IsVector(b_shape_t->shape()),
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(a_shape_t->shape()) &&
+                    TensorShapeUtils::IsVector(b_shape_t->shape()),
                 errors::InvalidArgument(
                     "Input shapes should be a vector but received shapes ",
                     a_shape_t->shape().DebugString(), " and ",
diff --git a/tensorflow/core/kernels/sparse_split_op.cc b/tensorflow/core/kernels/sparse_split_op.cc
index 6171b532aa243e6a3d8b42e5c8856aaa1c7ad207..67dcf05a6ced17fa2dbd44fb03dca21a032bcc5b 100644
--- a/tensorflow/core/kernels/sparse_split_op.cc
+++ b/tensorflow/core/kernels/sparse_split_op.cc
@@ -48,18 +48,20 @@ class SparseSplitOp : public OpKernel {
                     "Input shape should be a vector but received shape ",
                     input_shape.shape().DebugString()));
 
-    OP_REQUIRES(context, input_shape.dim_size(0) &&
-                             split_dim < input_shape.vec<int64>().size(),
-                errors::InvalidArgument(
-                    "Input split_dim should be between 0 and rank (",
-                    input_shape.vec<int64>().size(), "), got ", split_dim));
-
-    OP_REQUIRES(context, num_split_ >= 1 &&
-                             num_split_ <= input_shape.vec<int64>()(split_dim),
-                errors::InvalidArgument("Input num_split should be between 1 "
-                                        "and the splitting dimension size (",
-                                        input_shape.vec<int64>()(split_dim),
-                                        "), got ", num_split_));
+    OP_REQUIRES(
+        context,
+        input_shape.dim_size(0) && split_dim < input_shape.vec<int64>().size(),
+        errors::InvalidArgument(
+            "Input split_dim should be between 0 and rank (",
+            input_shape.vec<int64>().size(), "), got ", split_dim));
+
+    OP_REQUIRES(
+        context,
+        num_split_ >= 1 && num_split_ <= input_shape.vec<int64>()(split_dim),
+        errors::InvalidArgument("Input num_split should be between 1 "
+                                "and the splitting dimension size (",
+                                input_shape.vec<int64>()(split_dim), "), got ",
+                                num_split_));
 
     sparse::SparseTensor sparse_tensor(input_indices, input_values,
                                        TensorShape(input_shape.vec<int64>()));
diff --git a/tensorflow/core/kernels/sparse_to_dense_op.cc b/tensorflow/core/kernels/sparse_to_dense_op.cc
index 6a6cc3d81382a783aa9e34c841cb7be650dd7c87..ba3da21a4331562354e7dfce3348954fda3d46ad 100644
--- a/tensorflow/core/kernels/sparse_to_dense_op.cc
+++ b/tensorflow/core/kernels/sparse_to_dense_op.cc
@@ -73,8 +73,9 @@ class SparseToDense : public OpKernel {
     // sparse_values
     const Tensor& sparse_values = c->input(2);
     const int64 num_values = sparse_values.NumElements();
-    OP_REQUIRES(c, sparse_values.dims() == 0 ||
-                       (sparse_values.dims() == 1 && num_values == num_elems),
+    OP_REQUIRES(c,
+                sparse_values.dims() == 0 ||
+                    (sparse_values.dims() == 1 && num_values == num_elems),
                 errors::InvalidArgument("sparse_values has incorrect shape ",
                                         sparse_values.shape().DebugString(),
                                         ", should be [] or [", num_elems, "]"));
diff --git a/tensorflow/core/kernels/sparse_to_dense_op_test.cc b/tensorflow/core/kernels/sparse_to_dense_op_test.cc
index f0d19da8046e7cb3c243f1e4e6c3266a0f96d921..d8b0f93082453bab574fe5fd5edbb78041efad54 100644
--- a/tensorflow/core/kernels/sparse_to_dense_op_test.cc
+++ b/tensorflow/core/kernels/sparse_to_dense_op_test.cc
@@ -38,7 +38,6 @@ namespace {
 
 class SparseToDenseTest : public OpsTestBase {
  protected:
-
   void MakeOp(int dim, DataType index_type, DataType value_type) {
     TF_ASSERT_OK(NodeDefBuilder("sparsetodense", "SparseToDense")
                      .Input(FakeInput(index_type))
diff --git a/tensorflow/core/kernels/sparse_xent_op.cc b/tensorflow/core/kernels/sparse_xent_op.cc
index c35ba42db2915216fe74a1f82d403e9b6803f63a..f84ffd53238f7753c1b4562268be9058c6c03e6d 100644
--- a/tensorflow/core/kernels/sparse_xent_op.cc
+++ b/tensorflow/core/kernels/sparse_xent_op.cc
@@ -39,10 +39,10 @@ Status CheckInvalidLabelIndex(const Tensor& labels, int64 max_index) {
   if (*min_max_dim_value.first < 0 || *min_max_dim_value.second >= max_index) {
     bad_index = (*min_max_dim_value.first < 0) ? *min_max_dim_value.first
                                                : *min_max_dim_value.second;
-    return errors::InvalidArgument("Received a label value of ", bad_index,
-                                   " which is outside the valid range of [0, ",
-                                   max_index, ").  Label values: ",
-                                   labels.SummarizeValue(labels.NumElements()));
+    return errors::InvalidArgument(
+        "Received a label value of ", bad_index,
+        " which is outside the valid range of [0, ", max_index,
+        ").  Label values: ", labels.SummarizeValue(labels.NumElements()));
   }
   return Status::OK();
 }
diff --git a/tensorflow/core/kernels/sparse_xent_op_test.cc b/tensorflow/core/kernels/sparse_xent_op_test.cc
index b8ea0d2d7e279bc089aeb5574fc58c1af1686ca9..afb0bf76267f24ba1e2142954abfdcb41356cb96 100644
--- a/tensorflow/core/kernels/sparse_xent_op_test.cc
+++ b/tensorflow/core/kernels/sparse_xent_op_test.cc
@@ -41,10 +41,10 @@ static Graph* SparseXent(int batch_size, int num_classes) {
   return g;
 }
 
-#define BM_SparseXentDev(BATCH, CLASS, DEVICE)                                \
-  static void BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE(int iters) {       \
+#define BM_SparseXentDev(BATCH, CLASS, DEVICE)                          \
+  static void BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE(int iters) { \
     testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * CLASS); \
-    test::Benchmark(#DEVICE, SparseXent(BATCH, CLASS)).Run(iters);            \
+    test::Benchmark(#DEVICE, SparseXent(BATCH, CLASS)).Run(iters);      \
   }                                                                     \
   BENCHMARK(BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE);
 
diff --git a/tensorflow/core/kernels/spectrogram.h b/tensorflow/core/kernels/spectrogram.h
index 5476a0a961859c3953eb3d4e8e841ead1f154202..fef0e64942816b5ffb53ac5b879159ab31b009cd 100644
--- a/tensorflow/core/kernels/spectrogram.h
+++ b/tensorflow/core/kernels/spectrogram.h
@@ -28,8 +28,8 @@ limitations under the License.
 // window = hann(window_length_samples, 'periodic');
 // S = abs(spectrogram(audio, window, overlap_samples)).^2;
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SPECTROGRAM_H_
+#define TENSORFLOW_CORE_KERNELS_SPECTROGRAM_H_
 
 #include <complex>
 #include <deque>
@@ -109,4 +109,4 @@ class Spectrogram {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SPECTROGRAM_H_
diff --git a/tensorflow/core/kernels/spectrogram_op_test.cc b/tensorflow/core/kernels/spectrogram_op_test.cc
index 5c3cbeeeb93fb37c7718cd522d16fc582ff8ba13..d34a7c99ecbffc18d819f4182389c98635418934 100644
--- a/tensorflow/core/kernels/spectrogram_op_test.cc
+++ b/tensorflow/core/kernels/spectrogram_op_test.cc
@@ -31,8 +31,8 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
-
-using namespace ops;  // NOLINT(build/namespaces)
+namespace ops {
+namespace {
 
 TEST(SpectrogramOpTest, SimpleTest) {
   Scope root = Scope::NewRootScope();
@@ -101,4 +101,6 @@ TEST(SpectrogramOpTest, SquaredTest) {
       test::AsTensor<float>({0, 1, 4, 1, 0}, TensorShape({1, 1, 5})), 1e-3);
 }
 
+}  // namespace
+}  // namespace ops
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/spectrogram_test_utils.cc b/tensorflow/core/kernels/spectrogram_test_utils.cc
index 046f6344dfed44069cf27f1b6d923db10498c98c..872a6e9d1bcce09765d1531c5f2898b2badc66a7 100644
--- a/tensorflow/core/kernels/spectrogram_test_utils.cc
+++ b/tensorflow/core/kernels/spectrogram_test_utils.cc
@@ -70,10 +70,24 @@ bool ReadRawFloatFileToComplexVector(
   int offset = 0;
   const int end = data_string.size();
   while (offset < end) {
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    char arr[4];
+    for (int i = 0; i < kBytesPerValue; ++i) {
+      arr[3 - i] = *(data_string.data() + offset + i);
+    }
+    memcpy(&real_out, arr, kBytesPerValue);
+    offset += kBytesPerValue;
+    for (int i = 0; i < kBytesPerValue; ++i) {
+      arr[3 - i] = *(data_string.data() + offset + i);
+    }
+    memcpy(&imag_out, arr, kBytesPerValue);
+    offset += kBytesPerValue;
+#else
     memcpy(&real_out, data_string.data() + offset, kBytesPerValue);
     offset += kBytesPerValue;
     memcpy(&imag_out, data_string.data() + offset, kBytesPerValue);
     offset += kBytesPerValue;
+#endif
     if (row_counter >= row_length) {
       data->push_back(data_row);
       data_row.clear();
diff --git a/tensorflow/core/kernels/spectrogram_test_utils.h b/tensorflow/core/kernels/spectrogram_test_utils.h
index 59a903549e853b0d270ba8cd565830f1310b677e..d4187076e748af5454e6dd03d05e49d923f1e9d2 100644
--- a/tensorflow/core/kernels/spectrogram_test_utils.h
+++ b/tensorflow/core/kernels/spectrogram_test_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_TEST_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_TEST_UTILS_H_
+#ifndef TENSORFLOW_CORE_KERNELS_SPECTROGRAM_TEST_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_SPECTROGRAM_TEST_UTILS_H_
 
 #include <complex>
 #include <string>
@@ -78,4 +78,4 @@ void SineWave(int sample_rate, float frequency, float duration_seconds,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_TEST_UTILS_H_
+#endif  // TENSORFLOW_CORE_KERNELS_SPECTROGRAM_TEST_UTILS_H_
diff --git a/tensorflow/core/kernels/split_lib.h b/tensorflow/core/kernels/split_lib.h
index ff92ffeeb38a964dcd068b54f9558ca8da7c969e..a08949e626cc8e5d4c3707b75a902d82b46c3376 100644
--- a/tensorflow/core/kernels/split_lib.h
+++ b/tensorflow/core/kernels/split_lib.h
@@ -57,7 +57,7 @@ struct Split<Eigen::SyclDevice, T> {
                   const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_indices,
                   const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_sizes);
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/split_lib_cpu.cc b/tensorflow/core/kernels/split_lib_cpu.cc
index 6583f96a9172e0bd79fdc463ad249c71c99ffef9..771c633b156edf7c7d9944fe95703a0e0cd9e981 100644
--- a/tensorflow/core/kernels/split_lib_cpu.cc
+++ b/tensorflow/core/kernels/split_lib_cpu.cc
@@ -41,7 +41,6 @@ void Split<Eigen::ThreadPoolDevice, T>::operator()(
 
 TF_CALL_ALL_TYPES(DEFINE_CPU_KERNELS)
 DEFINE_CPU_KERNELS(quint8)
-DEFINE_CPU_KERNELS(bfloat16)
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
@@ -50,13 +49,13 @@ void Split<Eigen::SyclDevice, T>::operator()(
     typename TTypes<T, 3>::ConstTensor input,
     const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_indices,
     const Eigen::DSizes<Eigen::DenseIndex, 3>& slice_sizes) {
-    output.device(d) = input.slice(slice_indices, slice_sizes);
+  output.device(d) = input.slice(slice_indices, slice_sizes);
 }
 
 #define DEFINE_SYCL_KERNELS(T) template struct Split<Eigen::SyclDevice, T>;
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DEFINE_SYCL_KERNELS);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/split_lib_gpu.cu.cc b/tensorflow/core/kernels/split_lib_gpu.cu.cc
index dd6fc6115f7b5bce60f5373c8556e7b1642afd6a..9f234fc0935be0662b0d8df1a6bd1c109ab24fd9 100644
--- a/tensorflow/core/kernels/split_lib_gpu.cu.cc
+++ b/tensorflow/core/kernels/split_lib_gpu.cu.cc
@@ -52,7 +52,7 @@ void SplitCustom<Device, T>::operator()(
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
 TF_CALL_complex64(DEFINE_GPU_KERNELS);
 TF_CALL_complex128(DEFINE_GPU_KERNELS);
-DEFINE_GPU_KERNELS(bfloat16);
+TF_CALL_bfloat16(DEFINE_GPU_KERNELS);
 
 #undef DEFINE_GPU_KERNELS
 #define DEFINE_GPU_KERNELS(T) template struct SplitCustom<Eigen::GpuDevice, T>;
@@ -60,7 +60,7 @@ DEFINE_GPU_KERNELS(bfloat16);
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
 TF_CALL_complex64(DEFINE_GPU_KERNELS);
 TF_CALL_complex128(DEFINE_GPU_KERNELS);
-DEFINE_GPU_KERNELS(bfloat16);
+TF_CALL_bfloat16(DEFINE_GPU_KERNELS);
 
 #undef DEFINE_GPU_KERNELS
 
@@ -243,6 +243,7 @@ struct SplitVOpGPULaunch {
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
 TF_CALL_complex64(REGISTER_GPU_KERNEL);
 TF_CALL_complex128(REGISTER_GPU_KERNEL);
+TF_CALL_bfloat16(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 #define REGISTER_GPU_KERNEL(T)                 \
   template struct SplitVOpGPULaunch<T, int32>; \
@@ -251,7 +252,7 @@ TF_CALL_complex128(REGISTER_GPU_KERNEL);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
 TF_CALL_complex64(REGISTER_GPU_KERNEL);
 TF_CALL_complex128(REGISTER_GPU_KERNEL);
-REGISTER_GPU_KERNEL(bfloat16);
+TF_CALL_bfloat16(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc
index 58e1a73be61cf04aba05ebadb8d8e49f6aacef6b..85f529326dbf5d9d5ae72156da05f08f805d1271 100644
--- a/tensorflow/core/kernels/split_op.cc
+++ b/tensorflow/core/kernels/split_op.cc
@@ -39,7 +39,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 class SplitOpBase : public OpKernel {
@@ -142,8 +142,9 @@ class SplitOpCPU : public SplitOpBase<CPUDevice, T> {
 
     // Android also uses int32 indexing, so check here also.
     OP_REQUIRES(
-        context, FastBoundsCheck(input.NumElements(),
-                                 std::numeric_limits<Eigen::DenseIndex>::max()),
+        context,
+        FastBoundsCheck(input.NumElements(),
+                        std::numeric_limits<Eigen::DenseIndex>::max()),
         errors::InvalidArgument("Split requires input size < ",
                                 std::numeric_limits<Eigen::DenseIndex>::max()));
 
@@ -245,10 +246,11 @@ class SplitOpGPU : public SplitOpBase<GPUDevice, T> {
     const int32 split_dim =
         split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;
     const int32 num_split = Base::num_outputs();
-    OP_REQUIRES(context, FastBoundsCheck(input.NumElements(),
-                                         std::numeric_limits<int32>::max()),
-                errors::InvalidArgument("Split on GPU requires input size "
-                                        "< max int32"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input.NumElements(), std::numeric_limits<int32>::max()),
+        errors::InvalidArgument("Split on GPU requires input size "
+                                "< max int32"));
     int32 prefix_dim_size;
     int32 split_dim_size;
     int32 suffix_dim_size;
@@ -304,8 +306,9 @@ class SplitOpSYCL : public SplitOpBase<SYCLDevice, T> {
 
     // Android also uses int32 indexing, so check here also.
     OP_REQUIRES(
-        context, FastBoundsCheck(input.NumElements(),
-                                 std::numeric_limits<Eigen::DenseIndex>::max()),
+        context,
+        FastBoundsCheck(input.NumElements(),
+                        std::numeric_limits<Eigen::DenseIndex>::max()),
         errors::InvalidArgument("Split requires input size < ",
                                 std::numeric_limits<Eigen::DenseIndex>::max()));
 
@@ -342,14 +345,14 @@ class SplitOpSYCL : public SplitOpBase<SYCLDevice, T> {
             {prefix_dim_size, split_dim_output_size, suffix_dim_size});
 
         functor::Split<SYCLDevice, T>()(context->eigen_device<SYCLDevice>(),
-                                       result_shaped, input_reshaped,
-                                       slice_indices, slice_sizes);
+                                        result_shaped, input_reshaped,
+                                        slice_indices, slice_sizes);
       }
       indices[1] += split_dim_output_size;
     }
   }
 };
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #define REGISTER_SPLIT(type)                             \
   REGISTER_KERNEL_BUILDER(Name("Split")                  \
@@ -375,16 +378,17 @@ REGISTER_SPLIT(quint8);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
 TF_CALL_complex64(REGISTER_GPU);
 TF_CALL_complex128(REGISTER_GPU);
+REGISTER_GPU(bfloat16);
 #undef REGISTER_GPU
 
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL(type)                               \
-  REGISTER_KERNEL_BUILDER(Name("Split")                   \
-                              .Device(DEVICE_SYCL)        \
-                              .TypeConstraint<type>("T")  \
-                              .HostMemory("split_dim"),   \
+#define REGISTER_SYCL(type)                              \
+  REGISTER_KERNEL_BUILDER(Name("Split")                  \
+                              .Device(DEVICE_SYCL)       \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("split_dim"),  \
                           SplitOpSYCL<type>)
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL);
diff --git a/tensorflow/core/kernels/split_v_op.cc b/tensorflow/core/kernels/split_v_op.cc
index 3316e5fcc920166a8bd4f49f4ce1752b4c8910cb..7ff5df47d70fa8e47aabfb24e82874c146708ef1 100644
--- a/tensorflow/core/kernels/split_v_op.cc
+++ b/tensorflow/core/kernels/split_v_op.cc
@@ -197,8 +197,9 @@ class SplitVOpCPU : public SplitVOpBase<CPUDevice, T, Tlen> {
 
     // Android also uses int32 indexing, so check here also.
     OP_REQUIRES(
-        context, FastBoundsCheck(input.NumElements(),
-                                 std::numeric_limits<Eigen::DenseIndex>::max()),
+        context,
+        FastBoundsCheck(input.NumElements(),
+                        std::numeric_limits<Eigen::DenseIndex>::max()),
         errors::InvalidArgument("Split requires input size < ",
                                 std::numeric_limits<Eigen::DenseIndex>::max()));
 
@@ -305,10 +306,11 @@ class SplitVOpGPU : public SplitVOpBase<GPUDevice, T, Tlen> {
     const int32 split_dim_orig = context->input(2).flat<int32>()(0);
     const int32 split_dim =
         split_dim_orig < 0 ? split_dim_orig + input.dims() : split_dim_orig;
-    OP_REQUIRES(context, FastBoundsCheck(input.NumElements(),
-                                         std::numeric_limits<int32>::max()),
-                errors::InvalidArgument("Split on GPU requires input size "
-                                        "< max int32"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input.NumElements(), std::numeric_limits<int32>::max()),
+        errors::InvalidArgument("Split on GPU requires input size "
+                                "< max int32"));
 
     int32 prefix_dim_size;
     int32 split_dim_size;
@@ -406,7 +408,6 @@ class SplitVOpGPU : public SplitVOpBase<GPUDevice, T, Tlen> {
   REGISTER_SPLIT(type, int64);
 
 TF_CALL_ALL_TYPES(REGISTER_SPLIT_LEN);
-REGISTER_SPLIT_LEN(bfloat16);
 
 #undef REGISTER_SPLIT_LEN
 #undef REGISTER_SPLIT
diff --git a/tensorflow/core/kernels/sql/sqlite_query_connection.cc b/tensorflow/core/kernels/sql/sqlite_query_connection.cc
deleted file mode 100644
index 1330506d28ca96b4a9e668219dc67cbb1c3b796d..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/sql/sqlite_query_connection.cc
+++ /dev/null
@@ -1,135 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/core/kernels/sql/sqlite_query_connection.h"
-
-#include "tensorflow/core/lib/strings/stringprintf.h"
-
-namespace tensorflow {
-
-namespace sql {
-
-SqliteQueryConnection::SqliteQueryConnection() {}
-SqliteQueryConnection::~SqliteQueryConnection() {}
-
-Status SqliteQueryConnection::Open(const string& data_source_name,
-                                   const string& query,
-                                   const DataTypeVector& output_types) {
-  if (db_ != nullptr) {
-    return errors::FailedPrecondition(
-        "Failed to open query connection: Connection already opeend.");
-  }
-  auto s = Sqlite::Open(data_source_name);
-  if (s.ok()) {
-    db_ = std::move(s.ValueOrDie());
-    query_ = query;
-    output_types_ = output_types;
-  }
-  return s.status();
-}
-
-Status SqliteQueryConnection::Close() {
-  Status s;
-  s.Update(stmt_.Close());
-  s.Update(db_->Close());
-  return s;
-}
-
-Status SqliteQueryConnection::GetNext(std::vector<Tensor>* out_tensors,
-                                      bool* end_of_sequence) {
-  if (!stmt_) {
-    Status s = PrepareQuery();
-    if (!s.ok()) {
-      return s;
-    }
-  }
-  Status s = stmt_.Step(end_of_sequence);
-  if (!*end_of_sequence) {
-    for (int i = 0; i < column_count_; i++) {
-      DataType dt = output_types_[i];
-      Tensor tensor(cpu_allocator(), dt, {});
-      FillTensorWithResultSetEntry(dt, i, &tensor);
-      out_tensors->emplace_back(std::move(tensor));
-    }
-  }
-  return s;
-}
-
-Status SqliteQueryConnection::PrepareQuery() {
-  stmt_ = db_->Prepare(query_);
-  Status s = stmt_.status();
-  if (s.ok()) {
-    int column_count = stmt_.ColumnCount();
-    if (column_count != output_types_.size()) {
-      return errors::InvalidArgument(tensorflow::strings::Printf(
-          "The number of columns in query (%d) must match the number of "
-          "elements in output_types (%zu).",
-          column_count, output_types_.size()));
-    }
-    column_count_ = column_count;
-  }
-  return s;
-}
-
-void SqliteQueryConnection::FillTensorWithResultSetEntry(
-    const DataType& data_type, int column_index, Tensor* tensor) {
-  switch (data_type) {
-    case DT_STRING:
-      tensor->scalar<string>()() = stmt_.ColumnString(column_index);
-      break;
-    case DT_INT8:
-      tensor->scalar<int8>()() =
-          static_cast<int8>(stmt_.ColumnInt(column_index));
-      break;
-    case DT_INT16:
-      tensor->scalar<int16>()() =
-          static_cast<int16>(stmt_.ColumnInt(column_index));
-      break;
-    case DT_INT32:
-      tensor->scalar<int32>()() =
-          static_cast<int32>(stmt_.ColumnInt(column_index));
-      break;
-    case DT_INT64:
-      tensor->scalar<int64>()() = stmt_.ColumnInt(column_index);
-      break;
-    case DT_UINT8:
-      tensor->scalar<uint8>()() =
-          static_cast<uint8>(stmt_.ColumnInt(column_index));
-      break;
-    case DT_UINT16:
-      tensor->scalar<uint16>()() =
-          static_cast<uint16>(stmt_.ColumnInt(column_index));
-      break;
-    case DT_BOOL:
-      tensor->scalar<bool>()() = stmt_.ColumnInt(column_index) != 0;
-      break;
-    case DT_FLOAT:
-      tensor->scalar<float>()() =
-          static_cast<float>(stmt_.ColumnDouble(column_index));
-      break;
-    case DT_DOUBLE:
-      tensor->scalar<double>()() = stmt_.ColumnDouble(column_index);
-      break;
-      // Error preemptively thrown by SqlDatasetOp::MakeDataset in this case.
-    default: {
-      LOG(FATAL)
-          << "Use of unsupported TensorFlow data type by 'SqlQueryConnection': "
-          << DataTypeString(data_type) << ".";
-    }
-  }
-}
-
-}  // namespace sql
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/stack_ops.cc b/tensorflow/core/kernels/stack_ops.cc
index affe81a55567d1ef304c7161c65c201021da1363..65296f61fd180e2f57855d4cee1566bf827dd46a 100644
--- a/tensorflow/core/kernels/stack_ops.cc
+++ b/tensorflow/core/kernels/stack_ops.cc
@@ -42,7 +42,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class Stack : public ResourceBase {
  public:
@@ -242,7 +242,7 @@ REGISTER_KERNEL_BUILDER(Name("StackV2")
                             .HostMemory("max_size")
                             .HostMemory("handle"),
                         StackOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device>
 class StackPushOp : public AsyncOpKernel {
@@ -274,11 +274,11 @@ class StackPushOp : public AsyncOpKernel {
     static constexpr int kCopyThreshold = 2048;
     static constexpr double kOccupancy = 0.7;
     if (swap_memory_ && !alloc_attrs.on_host() &&
-        ( std::is_same<Device, GPUDevice>::value
+        (std::is_same<Device, GPUDevice>::value
 #ifdef TENSORFLOW_USE_SYCL
-          || std::is_same<Device, SYCLDevice>::value
-#endif // TENSORFLOW_USE_SYCL
-        ) &&
+         || std::is_same<Device, SYCLDevice>::value
+#endif  // TENSORFLOW_USE_SYCL
+         ) &&
         tensor.TotalBytes() > kCopyThreshold && stack->IsUsefulToSwap(tensor)) {
       DeviceContext* device_ctxt = ctx->op_device_context();
       auto device = static_cast<tensorflow::Device*>(ctx->device());
@@ -391,7 +391,7 @@ REGISTER_SYCL_HOST_KERNEL(int32);
 REGISTER_SYCL_HOST_KERNEL(bool);
 #undef REGISTER_SYCL_KERNEL
 #undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class StackPopOp : public AsyncOpKernel {
  public:
@@ -498,7 +498,7 @@ REGISTER_SYCL_HOST_KERNEL(bool);
 
 #undef REGISTER_SYCL_KERNEL
 #undef REGISTER_SYCL_HOST_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class StackCloseOp : public OpKernel {
  public:
@@ -526,6 +526,6 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("StackCloseV2").Device(DEVICE_SYCL).HostMemory("handle"),
     StackCloseOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc
index 0fae46dea61d361bd4ead0afc0fa33711407fc9b..03fc4467a1dcf9d70c90c19809690934b0a7c2f4 100644
--- a/tensorflow/core/kernels/stage_op.cc
+++ b/tensorflow/core/kernels/stage_op.cc
@@ -70,12 +70,11 @@ class Buffer : public ResourceBase {
     return bytes + current_bytes_ > memory_limit_;
   }
 
-  std::size_t GetTupleBytes(const Tuple & tuple)
-  {
+  std::size_t GetTupleBytes(const Tuple& tuple) {
     return std::accumulate(tuple.begin(), tuple.end(), 0,
-      [](const std::size_t & lhs, const Tensor & rhs) {
-        return lhs + rhs.TotalBytes();
-    });
+                           [](const std::size_t& lhs, const Tensor& rhs) {
+                             return lhs + rhs.TotalBytes();
+                           });
   }
 
  public:
@@ -90,19 +89,22 @@ class Buffer : public ResourceBase {
     std::size_t tuple_bytes = GetTupleBytes(*tuple);
 
     // Sanity check so that we don't block for ever below
-    if(memory_limit_ > 0 && tuple_bytes > memory_limit_) {
-      return Status(errors::ResourceExhausted("Attempted to insert "
-        "tensors with combined size of '", tuple_bytes, "' bytes into "
-        "Staging Area with a memory limit of '", memory_limit_, "'."));
+    if (memory_limit_ > 0 && tuple_bytes > memory_limit_) {
+      return Status(
+          errors::ResourceExhausted("Attempted to insert "
+                                    "tensors with combined size of '",
+                                    tuple_bytes,
+                                    "' bytes into "
+                                    "Staging Area with a memory limit of '",
+                                    memory_limit_, "'."));
     }
 
-
     // If buffer capacity is bounded wait until elements have been removed
-    if(IsBounded()) {
+    if (IsBounded()) {
       full_cond_var_.wait(lock, [tuple_bytes, this]() {
         // If there's a memory limit, check if there's space for insertion
-        bool memory_limit_valid = memory_limit_ > 0 ?
-            !WouldExceedMemoryLimit(tuple_bytes) : true;
+        bool memory_limit_valid =
+            memory_limit_ > 0 ? !WouldExceedMemoryLimit(tuple_bytes) : true;
         // If we're configured for capacity check if there's space for insertion
         bool capacity_valid = capacity_ > 0 ? !IsCapacityFull() : true;
 
@@ -186,8 +188,7 @@ Status GetBuffer(OpKernelContext* ctx, const NodeDef& ndef, Buffer** buf) {
   ContainerInfo cinfo;
 
   // Lambda for creating the Staging Area
-  auto create_fn = [&ndef](Buffer** ret) -> Status
-  {
+  auto create_fn = [&ndef](Buffer** ret) -> Status {
     int64 capacity;
     int64 memory_limit;
     TF_RETURN_IF_ERROR(GetNodeAttr(ndef, "capacity", &capacity));
@@ -196,7 +197,6 @@ Status GetBuffer(OpKernelContext* ctx, const NodeDef& ndef, Buffer** buf) {
     return Status::OK();
   };
 
-
   TF_RETURN_IF_ERROR(cinfo.Init(rm, ndef, true /* use name() */));
   TF_RETURN_IF_ERROR(rm->LookupOrCreate<Buffer>(cinfo.container(), cinfo.name(),
                                                 buf, create_fn));
@@ -228,7 +228,7 @@ REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_GPU), StageOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("Stage").Device(DEVICE_SYCL), StageOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class UnstageOp : public OpKernel {
  public:
@@ -244,7 +244,8 @@ class UnstageOp : public OpKernel {
 
     buf->Get(&tuple);
 
-    OP_REQUIRES(ctx, tuple.size() == (size_t)ctx->num_outputs(),
+    OP_REQUIRES(
+        ctx, tuple.size() == (size_t)ctx->num_outputs(),
         errors::InvalidArgument("Mismatch stage/unstage: ", tuple.size(),
                                 " vs. ", ctx->num_outputs()));
 
@@ -260,7 +261,7 @@ REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_GPU), UnstageOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_SYCL), UnstageOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class StagePeekOp : public OpKernel {
  public:
@@ -278,7 +279,8 @@ class StagePeekOp : public OpKernel {
 
     OP_REQUIRES_OK(ctx, buf->Peek(index, &tuple));
 
-    OP_REQUIRES(ctx, tuple.size() == (size_t)ctx->num_outputs(),
+    OP_REQUIRES(
+        ctx, tuple.size() == (size_t)ctx->num_outputs(),
         errors::InvalidArgument("Mismatch stage/unstage: ", tuple.size(),
                                 " vs. ", ctx->num_outputs()));
 
@@ -288,17 +290,15 @@ class StagePeekOp : public OpKernel {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("StagePeek").Device(DEVICE_CPU),
-                                              StagePeekOp);
+REGISTER_KERNEL_BUILDER(Name("StagePeek").Device(DEVICE_CPU), StagePeekOp);
 #if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(Name("StagePeek").HostMemory("index").
-                            Device(DEVICE_GPU), StagePeekOp);
+REGISTER_KERNEL_BUILDER(
+    Name("StagePeek").HostMemory("index").Device(DEVICE_GPU), StagePeekOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("StagePeek").HostMemory("index")
-                          .Device(DEVICE_SYCL), StagePeekOp);
-#endif // TENSORFLOW_USE_SYCL
-
+REGISTER_KERNEL_BUILDER(
+    Name("StagePeek").HostMemory("index").Device(DEVICE_SYCL), StagePeekOp);
+#endif  // TENSORFLOW_USE_SYCL
 
 class StageSizeOp : public OpKernel {
  public:
@@ -312,9 +312,8 @@ class StageSizeOp : public OpKernel {
     core::ScopedUnref scope(buf);
 
     // Allocate size output tensor
-    Tensor * size = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}),
-                                                     &size));
+    Tensor* size = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &size));
 
     // Set it to the actual size
     size->scalar<int32>().setConstant(buf->Size());
@@ -323,13 +322,13 @@ class StageSizeOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("StageSize").Device(DEVICE_CPU), StageSizeOp);
 #if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size")
-                        .Device(DEVICE_GPU), StageSizeOp);
+REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size").Device(DEVICE_GPU),
+                        StageSizeOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size")
-                        .Device(DEVICE_SYCL), StageSizeOp);
-#endif // TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(
+    Name("StageSize").HostMemory("size").Device(DEVICE_SYCL), StageSizeOp);
+#endif  // TENSORFLOW_USE_SYCL
 
 class StageClearOp : public OpKernel {
  public:
@@ -352,7 +351,6 @@ REGISTER_KERNEL_BUILDER(Name("StageClear").Device(DEVICE_GPU), StageClearOp);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("StageClear").Device(DEVICE_SYCL), StageClearOp);
-#endif // TENSORFLOW_USE_SYCL
-
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 8fc40db3cc22060eb18b64c2246188925626b8bf..7745effe2abe94ba73a2f0d761210e07c62e499c 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -294,6 +294,11 @@ class StridedSliceAssignOp : public OpKernel {
       OP_REQUIRES_OK(context,
                      LookupResource(context, HandleFromInput(context, 0), &v));
       old_lhs = *v->tensor();
+      OP_REQUIRES(context, old_lhs.dtype() == DataTypeToEnum<T>::value,
+                  errors::InvalidArgument(
+                      "l-value dtype ", DataTypeString(old_lhs.dtype()),
+                      " does not match r-value dtype ",
+                      DataTypeString(DataTypeToEnum<T>::value)));
     } else {
       context->forward_ref_input_to_ref_output(0, 0);
       old_lhs = context->mutable_input(0, true);
@@ -386,7 +391,6 @@ class StridedSliceAssignOp : public OpKernel {
                           StridedSliceAssignOp<CPUDevice, type>)
 
 TF_CALL_ALL_TYPES(REGISTER_STRIDED_SLICE);
-REGISTER_STRIDED_SLICE(bfloat16);
 
 #undef REGISTER_STRIDED_SLICE
 
@@ -427,6 +431,7 @@ REGISTER_STRIDED_SLICE(bfloat16);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
 TF_CALL_complex64(REGISTER_GPU);
 TF_CALL_complex128(REGISTER_GPU);
+TF_CALL_int64(REGISTER_GPU);
 
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
@@ -541,5 +546,5 @@ REGISTER_KERNEL_BUILDER(Name("ResourceStridedSliceAssign")
                             .HostMemory("strides"),
                         StridedSliceAssignOp<CPUDevice, int32>)
 #undef REGISTER_SYCL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/strided_slice_op.h b/tensorflow/core/kernels/strided_slice_op.h
index 0f72c4b771025458a1403ce13842787249a2718f..2b5863229860c256e1c74f1fe11bf57ed502008e 100644
--- a/tensorflow/core/kernels/strided_slice_op.h
+++ b/tensorflow/core/kernels/strided_slice_op.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/resource_handle.h"
 #include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc b/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc
index a8487f49f4488269e058c6b7ee94d0f82aeb5270..8ca27e3b920e7c0cd36343d0c9db5a6098b6bede 100644
--- a/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc
@@ -53,6 +53,7 @@ typedef Eigen::GpuDevice GPUDevice;
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
 TF_CALL_complex64(DEFINE_GPU_KERNELS);
 TF_CALL_complex128(DEFINE_GPU_KERNELS);
+TF_CALL_int64(DEFINE_GPU_KERNELS);
 DEFINE_GPU_KERNELS(int32);
 
 #undef DEFINE_GPU_KERNELS
diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h
index 7d4288742644be26d7e91e730b611a165989063c..1c4472bb1ab4e6b9d09a1f1464577172056c6fbe 100644
--- a/tensorflow/core/kernels/strided_slice_op_impl.h
+++ b/tensorflow/core/kernels/strided_slice_op_impl.h
@@ -26,6 +26,8 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/register_types_traits.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -84,16 +86,16 @@ void HandleStridedSliceCase(OpKernelContext* context,
 
   gtl::InlinedVector<int64, 4> processing_dims = processing_shape.dim_sizes();
   if (is_simple_slice) {
-    gtl::InlinedVector<int64, 4> sizes(begin.size());
+    Eigen::DSizes<Eigen::DenseIndex, NDIM> begin_di;
+    Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes_di;
     for (int i = 0; i < NDIM; ++i) {
-      sizes[i] = end[i] - begin[i];
+      begin_di[i] = begin[i];
+      sizes_di[i] = end[i] - begin[i];
     }
-    const TensorShape final_shape = result->shape();
-    CHECK(result->CopyFrom(*result, processing_shape));
-    const Tensor input = context->input(0);
-    functor::Slice<Device, T, NDIM>()(
-        context->eigen_device<Device>(), result, input, begin, sizes);
-    CHECK(result->CopyFrom(*result, final_shape));
+    functor::Slice<Device, Proxy, NDIM>()(
+        context->eigen_device<Device>(),
+        result->bit_casted_shaped<Proxy, NDIM>(processing_dims),
+        context->input(0).bit_casted_tensor<Proxy, NDIM>(), begin_di, sizes_di);
   } else {
     Eigen::DSizes<Eigen::DenseIndex, NDIM> begin_di;
     Eigen::DSizes<Eigen::DenseIndex, NDIM> end_di;
@@ -196,9 +198,10 @@ class HandleStridedSliceAssignCase<Device, T, 0> {
   extern template struct StridedSlice<GPUDevice, T, NDIM>;         \
   template <>                                                      \
   void Slice<GPUDevice, T, NDIM>::operator()(                      \
-      const GPUDevice& d, Tensor* output, const Tensor& input,     \
-      const gtl::ArraySlice<int64>& slice_indices,                 \
-      const gtl::ArraySlice<int64>& slice_sizes);                  \
+      const GPUDevice& d, typename TTypes<T, NDIM>::Tensor output, \
+      typename TTypes<T, NDIM>::ConstTensor input,                 \
+      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices,       \
+      const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes);        \
   extern template struct Slice<GPUDevice, T, NDIM>;                \
   template <>                                                      \
   void StridedSliceGrad<GPUDevice, T, NDIM>::operator()(           \
@@ -283,10 +286,10 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N_GPU);
 TF_CALL_complex64(DECLARE_FOR_N_GPU);
 TF_CALL_complex128(DECLARE_FOR_N_GPU);
 DECLARE_FOR_N_GPU(int32);
+DECLARE_FOR_N_GPU(int64);
 #endif  // END GOOGLE_CUDA
 
 TF_CALL_ALL_TYPES(DECLARE_FOR_N_CPU);
-DECLARE_FOR_N_CPU(bfloat16);
 
 #ifdef TENSORFLOW_USE_SYCL
 #define PREVENT_FOR_N_SYCL(T) \
@@ -298,9 +301,10 @@ DECLARE_FOR_N_CPU(bfloat16);
 TF_CALL_SYCL_PROXY_TYPES(PREVENT_FOR_N_SYCL);
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DECLARE_FOR_N_SYCL);
 DECLARE_FOR_N_SYCL(int32);
+DECLARE_FOR_N_SYCL(int64);
 
 #undef DECLARE_FOR_N_SYCL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #undef INSTANTIATE
 #undef DECLARE_FOR_N_CPU
diff --git a/tensorflow/core/kernels/strided_slice_op_test.cc b/tensorflow/core/kernels/strided_slice_op_test.cc
index 78bb15463c2ae4bb1b2b00a810223ab00b3aee70..281ca0f58fe8148d8ad5ba959b88fbe16950c31d 100644
--- a/tensorflow/core/kernels/strided_slice_op_test.cc
+++ b/tensorflow/core/kernels/strided_slice_op_test.cc
@@ -76,69 +76,20 @@ static void SliceHelper(int iters, int size) {
   testing::UseRealTime();
 }
 
-template <typename T>
-static void Dim8SliceHelper(int iters, int size) {
-  testing::StopTiming();
-  Graph* g = new Graph(OpRegistry::Global());
-  DataType dt = DataTypeToEnum<T>::v();
-  int kDim = 100;
-  int kMaxSize = 15000;
-  CHECK_LT(size, kMaxSize);
-
-  Tensor begin(DT_INT32, TensorShape({8}));
-  begin.flat<int32>()(10) = 10;
-  for (int i = 1; i < 7; ++i) {
-    begin.flat<int32>()(i) = 0;
-  }
-  begin.flat<int32>()(7) = 10;
-
-  Tensor end(DT_INT32, TensorShape({8}));
-  end.flat<int32>()(0) = 10 + kDim;
-  for (int i = 1; i < 7; ++i) {
-    end.flat<int32>()(i) = 1;
-  }
-  end.flat<int32>()(7) = 10 + size;
-
-  Tensor strides(DT_INT32, TensorShape({8}));
-  for (int i = 0; i < 8; ++i) {
-    strides.flat<int32>()(i) = 1;
-  }
-
-  Tensor input(dt, TensorShape({2*kDim, 1, 1, 1, 1, 1, 1, kMaxSize}));
-  input.flat<T>().setRandom();
-
-  Node* node;
-  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "StridedSlice")
-                  .Input(test::graph::Constant(g, input))
-                  .Input(test::graph::Constant(g, begin))
-                  .Input(test::graph::Constant(g, end))
-                  .Input(test::graph::Constant(g, strides))
-                  .Attr("T", dt)
-                  .Finalize(g, &node));
-
-  testing::BytesProcessed(static_cast<int64>(iters) * kDim * size * sizeof(T));
-  testing::StartTiming();
-  test::Benchmark("cpu", g).Run(iters);
-  testing::UseRealTime();
-}
-
 static void BM_SliceFloat(int iters, int dim2) {
   SliceHelper<float>(iters, dim2);
-  Dim8SliceHelper<float>(iters, dim2);
 }
 
 BENCHMARK(BM_SliceFloat)->Arg(100)->Arg(1000)->Arg(10000);
 
 static void BM_SliceComplex64(int iters, int dim2) {
   SliceHelper<std::complex<float>>(iters, dim2);
-  Dim8SliceHelper<std::complex<float>>(iters, dim2);
 }
 
 BENCHMARK(BM_SliceComplex64)->Arg(100)->Arg(1000)->Arg(10000);
 
 static void BM_SliceBFloat16(int iters, int dim2) {
   SliceHelper<bfloat16>(iters, dim2);
-  Dim8SliceHelper<bfloat16>(iters, dim2);
 }
 
 BENCHMARK(BM_SliceBFloat16)->Arg(100)->Arg(1000)->Arg(10000);
diff --git a/tensorflow/core/kernels/string_join_op.cc b/tensorflow/core/kernels/string_join_op.cc
index 721702bec68efa24d4dafef1e9aaa0c5f1b4c849..28cca9f44849b39647ba08c54d9e1f3c108f91fd 100644
--- a/tensorflow/core/kernels/string_join_op.cc
+++ b/tensorflow/core/kernels/string_join_op.cc
@@ -50,9 +50,9 @@ class StringJoinOp : public OpKernel {
         } else {
           OP_REQUIRES(
               context, input_shape == input.shape(),
-              errors::InvalidArgument("Input shapes do not match: ",
-                                      input_shape.DebugString(), " vs. ",
-                                      input.shape().DebugString()));
+              errors::InvalidArgument(
+                  "Input shapes do not match: ", input_shape.DebugString(),
+                  " vs. ", input.shape().DebugString()));
         }
       }
     }
diff --git a/tensorflow/core/kernels/string_to_number_op.cc b/tensorflow/core/kernels/string_to_number_op.cc
index d583e4e6bba27d76ac2795eb8b7d11147282a04d..70dbd15c46cb341d8ad6ed6013b5b9ff8a5d61da 100644
--- a/tensorflow/core/kernels/string_to_number_op.cc
+++ b/tensorflow/core/kernels/string_to_number_op.cc
@@ -49,43 +49,15 @@ class StringToNumberOp : public OpKernel {
     auto output_flat = output_tensor->flat<OutputType>();
 
     for (int i = 0; i < input_flat.size(); ++i) {
-      Convert(input_flat(i), &output_flat(i), context);
+      OP_REQUIRES(
+          context,
+          strings::SafeStringToNumeric<OutputType>(input_flat(i).c_str(),
+                                                   &output_flat(i)),
+          errors::InvalidArgument(kErrorMessage, input_flat(i).c_str()));
     }
   }
-
- private:
-  void Convert(const string& s, OutputType* output_data,
-               OpKernelContext* context);
 };
 
-template <>
-void StringToNumberOp<float>::Convert(const string& s, float* output_data,
-                                      OpKernelContext* context) {
-  OP_REQUIRES(context, strings::safe_strtof(s.c_str(), output_data),
-              errors::InvalidArgument(kErrorMessage, s));
-}
-
-template <>
-void StringToNumberOp<double>::Convert(const string& s, double* output_data,
-                                       OpKernelContext* context) {
-  OP_REQUIRES(context, strings::safe_strtod(s.c_str(), output_data),
-              errors::InvalidArgument(kErrorMessage, s));
-}
-
-template <>
-void StringToNumberOp<int32>::Convert(const string& s, int32* output_data,
-                                      OpKernelContext* context) {
-  OP_REQUIRES(context, strings::safe_strto32(s, output_data),
-              errors::InvalidArgument(kErrorMessage, s));
-}
-
-template <>
-void StringToNumberOp<int64>::Convert(const string& s, int64* output_data,
-                                      OpKernelContext* context) {
-  OP_REQUIRES(context, strings::safe_strto64(s, output_data),
-              errors::InvalidArgument(kErrorMessage, s));
-}
-
 // Registers the currently supported output types.
 #define REGISTER(type)                                           \
   REGISTER_KERNEL_BUILDER(Name("StringToNumber")                 \
diff --git a/tensorflow/core/kernels/substr_op.cc b/tensorflow/core/kernels/substr_op.cc
index 743f11315042af94cfe41cecf52d145ae69f8209..e29f67297f9ce4a99898b256deda46ba95362904 100644
--- a/tensorflow/core/kernels/substr_op.cc
+++ b/tensorflow/core/kernels/substr_op.cc
@@ -95,9 +95,9 @@ class SubstrOp : public OpKernel {
       // Create BCast helper with shape of input and pos/len
       BCast bcast(BCast::FromShape(input_shape), BCast::FromShape(pos_shape));
       OP_REQUIRES(context, bcast.IsValid(),
-                  errors::InvalidArgument("Incompatible shapes: ",
-                                          input_shape.DebugString(), " vs. ",
-                                          pos_shape.DebugString()));
+                  errors::InvalidArgument(
+                      "Incompatible shapes: ", input_shape.DebugString(),
+                      " vs. ", pos_shape.DebugString()));
       TensorShape output_shape = BCast::ToShape(bcast.result_shape());
       int ndims = output_shape.dims();
       Tensor* output_tensor = nullptr;
diff --git a/tensorflow/core/kernels/summary_image_op.cc b/tensorflow/core/kernels/summary_image_op.cc
index 233b824bcc3bab74d70c990c44389e6df7b10f02..29b21ee7353fe03ce87bc03dad72b05ca8fd4311 100644
--- a/tensorflow/core/kernels/summary_image_op.cc
+++ b/tensorflow/core/kernels/summary_image_op.cc
@@ -54,18 +54,20 @@ class SummaryImageOp : public OpKernel {
     const Tensor& tensor = c->input(1);
     OP_REQUIRES(c, IsLegacyScalar(tags.shape()),
                 errors::InvalidArgument("Tags must be a scalar"));
-    OP_REQUIRES(c, tensor.dims() == 4 &&
-                       (tensor.dim_size(3) == 1 || tensor.dim_size(3) == 3 ||
-                        tensor.dim_size(3) == 4),
+    OP_REQUIRES(c,
+                tensor.dims() == 4 &&
+                    (tensor.dim_size(3) == 1 || tensor.dim_size(3) == 3 ||
+                     tensor.dim_size(3) == 4),
                 errors::InvalidArgument(
                     "Tensor must be 4-D with last dim 1, 3, or 4, not ",
                     tensor.shape().DebugString()));
     const string& base_tag = tags.scalar<string>()();
 
-    OP_REQUIRES(c, tensor.dim_size(0) < (1LL << 31) &&
-                       tensor.dim_size(1) < (1LL << 31) &&
-                       tensor.dim_size(2) < (1LL << 31) &&
-                       (tensor.dim_size(1) * tensor.dim_size(2)) < (1LL << 29),
+    OP_REQUIRES(c,
+                tensor.dim_size(0) < (1LL << 31) &&
+                    tensor.dim_size(1) < (1LL << 31) &&
+                    tensor.dim_size(2) < (1LL << 31) &&
+                    (tensor.dim_size(1) * tensor.dim_size(2)) < (1LL << 29),
                 errors::InvalidArgument("Tensor too large for summary ",
                                         tensor.shape().DebugString()));
 
diff --git a/tensorflow/core/kernels/summary_interface.cc b/tensorflow/core/kernels/summary_interface.cc
deleted file mode 100644
index 97c0c2c099cfceaa98a577d9642710020621e7e6..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/summary_interface.cc
+++ /dev/null
@@ -1,462 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/core/kernels/summary_interface.h"
-
-#include <utility>
-
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/resource_mgr.h"
-#include "tensorflow/core/framework/summary.pb.h"
-#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/lib/histogram/histogram.h"
-#include "tensorflow/core/lib/io/path.h"
-#include "tensorflow/core/lib/png/png_io.h"
-#include "tensorflow/core/lib/wav/wav_io.h"
-#include "tensorflow/core/util/events_writer.h"
-#include "tensorflow/core/util/ptr_util.h"
-
-namespace tensorflow {
-namespace {
-template <typename T>
-Status TensorValueAt(Tensor t, int64 index, T* out) {
-  switch (t.dtype()) {
-    case DT_FLOAT:
-      *out = t.flat<float>()(index);
-      break;
-    case DT_DOUBLE:
-      *out = t.flat<double>()(index);
-      break;
-    case DT_HALF:
-      *out = T(t.flat<Eigen::half>()(index));
-      break;
-    case DT_INT32:
-      *out = t.flat<int32>()(index);
-      break;
-    case DT_UINT8:
-      *out = t.flat<uint8>()(index);
-      break;
-    case DT_INT16:
-      *out = t.flat<int16>()(index);
-      break;
-    case DT_INT8:
-      *out = t.flat<int8>()(index);
-      break;
-    case DT_BOOL:
-      *out = t.flat<bool>()(index);
-      break;
-    case DT_INT64:
-      *out = t.flat<int64>()(index);
-      break;
-    default:
-      return errors::Unimplemented("Scalar summary for dtype ",
-                                   DataTypeString(t.dtype()),
-                                   " is not supported.");
-  }
-  return Status::OK();
-}
-
-typedef Eigen::Tensor<uint8, 2, Eigen::RowMajor> Uint8Image;
-
-// Add the sequence of images specified by ith_image to the summary.
-//
-// Factoring this loop out into a helper function lets ith_image behave
-// differently in the float and uint8 cases: the float case needs a temporary
-// buffer which can be shared across calls to ith_image, but the uint8 case
-// does not.
-Status AddImages(const string& tag, int max_images, int batch_size, int w,
-                 int h, int depth,
-                 const std::function<Uint8Image(int)>& ith_image, Summary* s) {
-  const int N = std::min<int>(max_images, batch_size);
-  for (int i = 0; i < N; ++i) {
-    Summary::Value* v = s->add_value();
-    // The tag depends on the number of requested images (not the number
-    // produced.)
-    //
-    // Note that later on avisu uses "/" to figure out a consistent naming
-    // convention for display, so we append "/image" to guarantee that the
-    // image(s) won't be displayed in the global scope with no name.
-    if (max_images > 1) {
-      v->set_tag(strings::StrCat(tag, "/image/", i));
-    } else {
-      v->set_tag(strings::StrCat(tag, "/image"));
-    }
-
-    const auto image = ith_image(i);
-    Summary::Image* si = v->mutable_image();
-    si->set_height(h);
-    si->set_width(w);
-    si->set_colorspace(depth);
-    const int channel_bits = 8;
-    const int compression = -1;  // Use zlib default
-    if (!png::WriteImageToBuffer(image.data(), w, h, w * depth, depth,
-                                 channel_bits, compression,
-                                 si->mutable_encoded_image_string(), nullptr)) {
-      return errors::Internal("PNG encoding failed");
-    }
-  }
-  return Status::OK();
-}
-
-template <class T>
-void NormalizeFloatImage(int hw, int depth,
-                         typename TTypes<T>::ConstMatrix values,
-                         typename TTypes<uint8>::ConstVec bad_color,
-                         Uint8Image* image) {
-  if (!image->size()) return;  // Nothing to do for empty images
-
-  // Rescale the image to uint8 range.
-  //
-  // We are trying to generate an RGB image from a float/half tensor.  We do
-  // not have any info about the expected range of values in the tensor
-  // but the generated image needs to have all RGB values within [0, 255].
-  //
-  // We use two different algorithms to generate these values.  If the
-  // tensor has only positive values we scale them all by 255/max(values).
-  // If the tensor has both negative and positive values we scale them by
-  // the max of their absolute values and center them around 127.
-  //
-  // This works for most cases, but does not respect the relative dynamic
-  // range across different instances of the tensor.
-
-  // Compute min and max ignoring nonfinite pixels
-  float image_min = std::numeric_limits<float>::infinity();
-  float image_max = -image_min;
-  for (int i = 0; i < hw; i++) {
-    bool finite = true;
-    for (int j = 0; j < depth; j++) {
-      if (!Eigen::numext::isfinite(values(i, j))) {
-        finite = false;
-        break;
-      }
-    }
-    if (finite) {
-      for (int j = 0; j < depth; j++) {
-        float value(values(i, j));
-        image_min = std::min(image_min, value);
-        image_max = std::max(image_max, value);
-      }
-    }
-  }
-
-  // Pick an affine transform into uint8
-  const float kZeroThreshold = 1e-6;
-  T scale, offset;
-  if (image_min < 0) {
-    const float max_val = std::max(std::abs(image_min), std::abs(image_max));
-    scale = T(max_val < kZeroThreshold ? 0.0f : 127.0f / max_val);
-    offset = T(128.0f);
-  } else {
-    scale = T(image_max < kZeroThreshold ? 0.0f : 255.0f / image_max);
-    offset = T(0.0f);
-  }
-
-  // Transform image, turning nonfinite values to bad_color
-  for (int i = 0; i < hw; i++) {
-    bool finite = true;
-    for (int j = 0; j < depth; j++) {
-      if (!Eigen::numext::isfinite(values(i, j))) {
-        finite = false;
-        break;
-      }
-    }
-    if (finite) {
-      image->chip<0>(i) =
-          (values.template chip<0>(i) * scale + offset).template cast<uint8>();
-    } else {
-      image->chip<0>(i) = bad_color;
-    }
-  }
-}
-
-template <class T>
-Status NormalizeAndAddImages(const Tensor& tensor, int max_images, int h, int w,
-                             int hw, int depth, int batch_size,
-                             const string& base_tag, Tensor bad_color_tensor,
-                             Summary* s) {
-  // For float and half images, nans and infs are replaced with bad_color.
-  if (bad_color_tensor.dim_size(0) < depth) {
-    return errors::InvalidArgument(
-        "expected depth <= bad_color.size, got depth = ", depth,
-        ", bad_color.size = ", bad_color_tensor.dim_size(0));
-  }
-  auto bad_color_full = bad_color_tensor.vec<uint8>();
-  typename TTypes<uint8>::ConstVec bad_color(bad_color_full.data(), depth);
-
-  // Float images must be scaled and translated.
-  Uint8Image image(hw, depth);
-  auto ith_image = [&tensor, &image, bad_color, batch_size, hw, depth](int i) {
-    auto tensor_eigen = tensor.template shaped<T, 3>({batch_size, hw, depth});
-    typename TTypes<T>::ConstMatrix values(
-        &tensor_eigen(i, 0, 0), Eigen::DSizes<Eigen::DenseIndex, 2>(hw, depth));
-    NormalizeFloatImage<T>(hw, depth, values, bad_color, &image);
-    return image;
-  };
-  return AddImages(base_tag, max_images, batch_size, w, h, depth, ith_image, s);
-}
-
-}  // namespace
-
-class SummaryWriterImpl : public SummaryWriterInterface {
- public:
-  SummaryWriterImpl(int max_queue, int flush_millis, Env* env)
-      : SummaryWriterInterface(),
-        is_initialized_(false),
-        max_queue_(max_queue),
-        flush_millis_(flush_millis),
-        env_(env) {}
-
-  Status Initialize(const string& logdir, const string& filename_suffix) {
-    const Status is_dir = env_->IsDirectory(logdir);
-    if (!is_dir.ok()) {
-      if (is_dir.code() != tensorflow::error::NOT_FOUND) {
-        return is_dir;
-      }
-      TF_RETURN_IF_ERROR(env_->CreateDir(logdir));
-    }
-    mutex_lock ml(mu_);
-    events_writer_ =
-        tensorflow::MakeUnique<EventsWriter>(io::JoinPath(logdir, "events"));
-    if (!events_writer_->InitWithSuffix(filename_suffix)) {
-      return errors::Unknown("Could not initialize events writer.");
-    }
-    last_flush_ = env_->NowMicros();
-    is_initialized_ = true;
-    return Status::OK();
-  }
-
-  Status Flush() override {
-    mutex_lock ml(mu_);
-    if (!is_initialized_) {
-      return errors::FailedPrecondition("Class was not properly initialized.");
-    }
-    return InternalFlush();
-  }
-
-  ~SummaryWriterImpl() override {
-    (void)Flush();  // Ignore errors.
-  }
-
-  Status WriteTensor(int64 global_step, Tensor t, const string& tag,
-                     const string& serialized_metadata) override {
-    std::unique_ptr<Event> e{new Event};
-    e->set_step(global_step);
-    e->set_wall_time(GetWallTime());
-    Summary::Value* v = e->mutable_summary()->add_value();
-    t.AsProtoTensorContent(v->mutable_tensor());
-    v->set_tag(tag);
-    if (!serialized_metadata.empty()) {
-      v->mutable_metadata()->ParseFromString(serialized_metadata);
-    }
-    return WriteEvent(std::move(e));
-  }
-
-  Status WriteScalar(int64 global_step, Tensor t, const string& tag) override {
-    std::unique_ptr<Event> e{new Event};
-    e->set_step(global_step);
-    e->set_wall_time(GetWallTime());
-    Summary::Value* v = e->mutable_summary()->add_value();
-    v->set_tag(tag);
-    float value;
-    TF_RETURN_IF_ERROR(TensorValueAt<float>(t, 0, &value));
-    v->set_simple_value(value);
-    return WriteEvent(std::move(e));
-  }
-
-  Status WriteHistogram(int64 global_step, Tensor t,
-                        const string& tag) override {
-    std::unique_ptr<Event> e{new Event};
-    e->set_step(global_step);
-    e->set_wall_time(GetWallTime());
-    Summary::Value* v = e->mutable_summary()->add_value();
-    v->set_tag(tag);
-    histogram::Histogram histo;
-    for (int64 i = 0; i < t.NumElements(); i++) {
-      double double_val;
-      TF_RETURN_IF_ERROR(TensorValueAt<double>(t, i, &double_val));
-      if (Eigen::numext::isnan(double_val)) {
-        return errors::InvalidArgument("Nan in summary histogram for: ", tag);
-      } else if (Eigen::numext::isinf(double_val)) {
-        return errors::InvalidArgument("Infinity in summary histogram for: ",
-                                       tag);
-      }
-      histo.Add(double_val);
-    }
-
-    histo.EncodeToProto(v->mutable_histo(), false /* Drop zero buckets */);
-    return WriteEvent(std::move(e));
-  }
-
-  Status WriteImage(int64 global_step, Tensor tensor, const string& tag,
-                    int max_images, Tensor bad_color) override {
-    if (!(tensor.dims() == 4 &&
-          (tensor.dim_size(3) == 1 || tensor.dim_size(3) == 3 ||
-           tensor.dim_size(3) == 4))) {
-      return errors::InvalidArgument(
-          "Tensor must be 4-D with last dim 1, 3, or 4, not ",
-          tensor.shape().DebugString());
-    }
-    if (!(tensor.dim_size(0) < (1LL << 31) &&
-          tensor.dim_size(1) < (1LL << 31) &&
-          tensor.dim_size(2) < (1LL << 31) &&
-          (tensor.dim_size(1) * tensor.dim_size(2)) < (1LL << 29))) {
-      return errors::InvalidArgument("Tensor too large for summary ",
-                                     tensor.shape().DebugString());
-    }
-    std::unique_ptr<Event> e{new Event};
-    e->set_step(global_step);
-    e->set_wall_time(GetWallTime());
-    Summary* s = e->mutable_summary();
-    // The casts and h * w cannot overflow because of the limits above.
-    const int batch_size = static_cast<int>(tensor.dim_size(0));
-    const int h = static_cast<int>(tensor.dim_size(1));
-    const int w = static_cast<int>(tensor.dim_size(2));
-    const int hw = h * w;  // Compact these two dims for simplicity
-    const int depth = static_cast<int>(tensor.dim_size(3));
-    if (tensor.dtype() == DT_UINT8) {
-      // For uint8 input, no normalization is necessary
-      auto ith_image = [&tensor, batch_size, hw, depth](int i) {
-        auto values = tensor.shaped<uint8, 3>({batch_size, hw, depth});
-        return typename TTypes<uint8>::ConstMatrix(
-            &values(i, 0, 0), Eigen::DSizes<Eigen::DenseIndex, 2>(hw, depth));
-      };
-      TF_RETURN_IF_ERROR(
-          AddImages(tag, max_images, batch_size, w, h, depth, ith_image, s));
-    } else if (tensor.dtype() == DT_HALF) {
-      TF_RETURN_IF_ERROR(NormalizeAndAddImages<Eigen::half>(
-          tensor, max_images, h, w, hw, depth, batch_size, tag, bad_color, s));
-    } else if (tensor.dtype() == DT_FLOAT) {
-      TF_RETURN_IF_ERROR(NormalizeAndAddImages<float>(
-          tensor, max_images, h, w, hw, depth, batch_size, tag, bad_color, s));
-    } else {
-      return errors::InvalidArgument(
-          "Only DT_INT8, DT_HALF, and DT_FLOAT images are supported. Got ",
-          DataTypeString(tensor.dtype()));
-    }
-
-    return WriteEvent(std::move(e));
-  }
-
-  Status WriteAudio(int64 global_step, Tensor tensor, const string& tag,
-                    int max_outputs, float sample_rate) override {
-    if (sample_rate <= 0.0f) {
-      return errors::InvalidArgument("sample_rate must be > 0");
-    }
-    const int batch_size = tensor.dim_size(0);
-    const int64 length_frames = tensor.dim_size(1);
-    const int64 num_channels =
-        tensor.dims() == 2 ? 1 : tensor.dim_size(tensor.dims() - 1);
-    std::unique_ptr<Event> e{new Event};
-    e->set_step(global_step);
-    e->set_wall_time(GetWallTime());
-    Summary* s = e->mutable_summary();
-    const int N = std::min<int>(max_outputs, batch_size);
-    for (int i = 0; i < N; ++i) {
-      Summary::Value* v = s->add_value();
-      if (max_outputs > 1) {
-        v->set_tag(strings::StrCat(tag, "/audio/", i));
-      } else {
-        v->set_tag(strings::StrCat(tag, "/audio"));
-      }
-
-      Summary::Audio* sa = v->mutable_audio();
-      sa->set_sample_rate(sample_rate);
-      sa->set_num_channels(num_channels);
-      sa->set_length_frames(length_frames);
-      sa->set_content_type("audio/wav");
-
-      auto values =
-          tensor.shaped<float, 3>({batch_size, length_frames, num_channels});
-      auto channels_by_frames = typename TTypes<float>::ConstMatrix(
-          &values(i, 0, 0),
-          Eigen::DSizes<Eigen::DenseIndex, 2>(length_frames, num_channels));
-      size_t sample_rate_truncated = lrintf(sample_rate);
-      if (sample_rate_truncated == 0) {
-        sample_rate_truncated = 1;
-      }
-      TF_RETURN_IF_ERROR(wav::EncodeAudioAsS16LEWav(
-          channels_by_frames.data(), sample_rate_truncated, num_channels,
-          length_frames, sa->mutable_encoded_audio_string()));
-    }
-    return WriteEvent(std::move(e));
-  }
-
-  Status WriteGraph(int64 global_step,
-                    std::unique_ptr<GraphDef> graph) override {
-    std::unique_ptr<Event> e{new Event};
-    e->set_step(global_step);
-    e->set_wall_time(GetWallTime());
-    graph->SerializeToString(e->mutable_graph_def());
-    return WriteEvent(std::move(e));
-  }
-
-  Status WriteEvent(std::unique_ptr<Event> event) override {
-    mutex_lock ml(mu_);
-    queue_.emplace_back(std::move(event));
-    if (queue_.size() >= max_queue_ ||
-        env_->NowMicros() - last_flush_ > 1000 * flush_millis_) {
-      return InternalFlush();
-    }
-    return Status::OK();
-  }
-
-  string DebugString() override { return "SummaryWriterImpl"; }
-
- private:
-  double GetWallTime() {
-    return static_cast<double>(env_->NowMicros()) / 1.0e6;
-  }
-
-  Status InternalFlush() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    for (const std::unique_ptr<Event>& e : queue_) {
-      events_writer_->WriteEvent(*e);
-    }
-    queue_.clear();
-    if (!events_writer_->Flush()) {
-      return errors::InvalidArgument("Could not flush events file.");
-    }
-    last_flush_ = env_->NowMicros();
-    return Status::OK();
-  }
-
-  bool is_initialized_;
-  const int max_queue_;
-  const int flush_millis_;
-  uint64 last_flush_;
-  Env* env_;
-  mutex mu_;
-  std::vector<std::unique_ptr<Event>> queue_ GUARDED_BY(mu_);
-  // A pointer to allow deferred construction.
-  std::unique_ptr<EventsWriter> events_writer_ GUARDED_BY(mu_);
-  std::vector<std::pair<string, SummaryMetadata>> registered_summaries_
-      GUARDED_BY(mu_);
-};
-
-Status CreateSummaryWriter(int max_queue, int flush_millis,
-                           const string& logdir, const string& filename_suffix,
-                           Env* env, SummaryWriterInterface** result) {
-  SummaryWriterImpl* w = new SummaryWriterImpl(max_queue, flush_millis, env);
-  const Status s = w->Initialize(logdir, filename_suffix);
-  if (!s.ok()) {
-    w->Unref();
-    *result = nullptr;
-    return s;
-  }
-  *result = w;
-  return Status::OK();
-}
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/summary_interface.h b/tensorflow/core/kernels/summary_interface.h
index da1c28709fb35372b1f0b28faba757a23bcd9ac4..02391e967a84b2d2ff015d541969163807b9adc2 100644
--- a/tensorflow/core/kernels/summary_interface.h
+++ b/tensorflow/core/kernels/summary_interface.h
@@ -19,6 +19,8 @@ limitations under the License.
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/event.pb.h"
 
 namespace tensorflow {
@@ -53,16 +55,6 @@ class SummaryWriterInterface : public ResourceBase {
   virtual Status WriteEvent(std::unique_ptr<Event> e) = 0;
 };
 
-// Creates a SummaryWriterInterface instance which writes to a file. It will
-// enqueue up to max_queue summaries, and flush at least every flush_millis
-// milliseconds. The summaries will be written to the directory specified by
-// logdir and with the filename suffixed by filename_suffix. The caller owns a
-// reference to result if the returned status is ok. The Env object must not
-// be destroyed until after the returned writer.
-Status CreateSummaryWriter(int max_queue, int flush_millis,
-                           const string& logdir, const string& filename_suffix,
-                           Env* env, SummaryWriterInterface** result);
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_KERNELS_SUMMARY_INTERFACE_H_
diff --git a/tensorflow/core/kernels/summary_kernels.cc b/tensorflow/core/kernels/summary_kernels.cc
index 7487e70acc22634edafd69b9b8d0a06481bcc4ed..d317a8d33db5b69a84a0d193cb6322afaa53dff6 100644
--- a/tensorflow/core/kernels/summary_kernels.cc
+++ b/tensorflow/core/kernels/summary_kernels.cc
@@ -13,11 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/contrib/tensorboard/db/schema.h"
 #include "tensorflow/contrib/tensorboard/db/summary_db_writer.h"
+#include "tensorflow/contrib/tensorboard/db/summary_file_writer.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
-#include "tensorflow/core/kernels/summary_interface.h"
 #include "tensorflow/core/lib/db/sqlite.h"
 #include "tensorflow/core/platform/protobuf.h"
 
@@ -41,10 +42,16 @@ class CreateSummaryFileWriterOp : public OpKernel {
     const int32 flush_millis = tmp->scalar<int32>()();
     OP_REQUIRES_OK(ctx, ctx->input("filename_suffix", &tmp));
     const string filename_suffix = tmp->scalar<string>()();
-    SummaryWriterInterface* s;
-    OP_REQUIRES_OK(ctx, CreateSummaryWriter(max_queue, flush_millis, logdir,
-                                            filename_suffix, ctx->env(), &s));
-    OP_REQUIRES_OK(ctx, CreateResource(ctx, HandleFromInput(ctx, 0), s));
+
+    SummaryWriterInterface* s = nullptr;
+    OP_REQUIRES_OK(ctx, LookupOrCreateResource<SummaryWriterInterface>(
+                            ctx, HandleFromInput(ctx, 0), &s,
+                            [max_queue, flush_millis, logdir, filename_suffix,
+                             ctx](SummaryWriterInterface** s) {
+                              return CreateSummaryFileWriter(
+                                  max_queue, flush_millis, logdir,
+                                  filename_suffix, ctx->env(), s);
+                            }));
   }
 };
 REGISTER_KERNEL_BUILDER(Name("CreateSummaryFileWriter").Device(DEVICE_CPU),
@@ -64,13 +71,23 @@ class CreateSummaryDbWriterOp : public OpKernel {
     const string run_name = tmp->scalar<string>()();
     OP_REQUIRES_OK(ctx, ctx->input("user_name", &tmp));
     const string user_name = tmp->scalar<string>()();
-    SummaryWriterInterface* s;
-    auto db = Sqlite::Open(db_uri);
-    OP_REQUIRES_OK(ctx, db.status());
+
+    SummaryWriterInterface* s = nullptr;
     OP_REQUIRES_OK(
-        ctx, CreateSummaryDbWriter(std::move(db.ValueOrDie()), experiment_name,
-                                   run_name, user_name, ctx->env(), &s));
-    OP_REQUIRES_OK(ctx, CreateResource(ctx, HandleFromInput(ctx, 0), s));
+        ctx,
+        LookupOrCreateResource<SummaryWriterInterface>(
+            ctx, HandleFromInput(ctx, 0), &s,
+            [db_uri, experiment_name, run_name, user_name,
+             ctx](SummaryWriterInterface** s) {
+              Sqlite* db;
+              TF_RETURN_IF_ERROR(Sqlite::Open(
+                  db_uri, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, &db));
+              core::ScopedUnref unref(db);
+              TF_RETURN_IF_ERROR(SetupTensorboardSqliteDb(db));
+              TF_RETURN_IF_ERROR(CreateSummaryDbWriter(
+                  db, experiment_name, run_name, user_name, ctx->env(), s));
+              return Status::OK();
+            }));
   }
 };
 REGISTER_KERNEL_BUILDER(Name("CreateSummaryDbWriter").Device(DEVICE_CPU),
@@ -261,8 +278,6 @@ class WriteAudioSummaryOp : public OpKernel {
 
  private:
   int max_outputs_;
-  bool has_sample_rate_attr_;
-  float sample_rate_attr_;
 };
 REGISTER_KERNEL_BUILDER(Name("WriteAudioSummary").Device(DEVICE_CPU),
                         WriteAudioSummaryOp);
diff --git a/tensorflow/core/kernels/summary_op.cc b/tensorflow/core/kernels/summary_op.cc
index b818724ec2e895d3995fe19b811327ed0ba112c2..1f4e3418f4826dee789002d4aa688f8ce14e17d2 100644
--- a/tensorflow/core/kernels/summary_op.cc
+++ b/tensorflow/core/kernels/summary_op.cc
@@ -41,11 +41,12 @@ class SummaryScalarOp : public OpKernel {
     const Tensor& values = c->input(1);
 
     OP_REQUIRES(
-        c, tags.IsSameSize(values) ||
-               (IsLegacyScalar(tags.shape()) && IsLegacyScalar(values.shape())),
-        errors::InvalidArgument("tags and values not the same shape: ",
-                                tags.shape().DebugString(), " != ",
-                                values.shape().DebugString(), SingleTag(tags)));
+        c,
+        tags.IsSameSize(values) ||
+            (IsLegacyScalar(tags.shape()) && IsLegacyScalar(values.shape())),
+        errors::InvalidArgument(
+            "tags and values not the same shape: ", tags.shape().DebugString(),
+            " != ", values.shape().DebugString(), SingleTag(tags)));
     auto Ttags = tags.flat<string>();
     auto Tvalues = values.flat<T>();
     Summary s;
diff --git a/tensorflow/core/kernels/svd_op_gpu.cu.cc b/tensorflow/core/kernels/svd_op_gpu.cu.cc
index dedc2da60bab0d0c0613630c384c2f23ddae31e3..8c3a58b108abe66f2b61b5153923bee192246cd1 100644
--- a/tensorflow/core/kernels/svd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/svd_op_gpu.cu.cc
@@ -63,8 +63,8 @@ __global__ void ComputeValueOfVKernel(Cuda2DLaunchConfig config, int64 m,
                                       int64 ldu, const Scalar* M,
                                       const Scalar* U, const Scalar* S,
                                       Scalar* V) {
-  CUDA_AXIS_KERNEL_LOOP(batch, config.virtual_thread_count, x) {
-    CUDA_AXIS_KERNEL_LOOP(i, config.virtual_thread_count, y) {
+  CUDA_AXIS_KERNEL_LOOP(batch, config.virtual_thread_count.x, X) {
+    CUDA_AXIS_KERNEL_LOOP(i, config.virtual_thread_count.y, Y) {
       Scalar v = M[i + m * batch] * U[ldu * (i + m * batch)] * S[batch];
       CudaAtomicAdd(V + batch, v);
     }
diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc
index cca6d0e35f2ee11d2a97f68581dd6f8dc87d929d..af93d814ec06ff86c6c7eb3312d97224dee485f2 100644
--- a/tensorflow/core/kernels/tensor_array_ops.cc
+++ b/tensorflow/core/kernels/tensor_array_ops.cc
@@ -336,8 +336,7 @@ class TensorArrayGradOp : public TensorArrayCreationOp {
           tensor_array->HasIdenticalElementShapes(), false /* dynamic_size */,
           true /* multiple_writes_aggregate */, true /* is_grad */,
           marked_size /* marked_size */, true /* close_after_read */);
-      TF_RETURN_IF_ERROR((*ret)->CopyShapesFrom(tensor_array));
-      return Status::OK();
+      return (*ret)->CopyShapesFrom(tensor_array);
     };
 
     Status s = rm->LookupOrCreate<TensorArray>(
@@ -709,7 +708,6 @@ TF_CALL_POD_STRING_TYPES(REGISTER_GATHER_AND_PACK);
 REGISTER_GATHER_AND_PACK(quint8);
 REGISTER_GATHER_AND_PACK(qint8);
 REGISTER_GATHER_AND_PACK(qint32);
-REGISTER_GATHER_AND_PACK(bfloat16);
 
 #undef REGISTER_GATHER_AND_PACK
 
@@ -940,7 +938,6 @@ TF_CALL_POD_STRING_TYPES(REGISTER_CONCAT);
 REGISTER_CONCAT(quint8);
 REGISTER_CONCAT(qint8);
 REGISTER_CONCAT(qint32);
-REGISTER_CONCAT(bfloat16);
 
 #undef REGISTER_CONCAT
 
diff --git a/tensorflow/core/kernels/tile_functor_cpu.cc b/tensorflow/core/kernels/tile_functor_cpu.cc
index b2fd669541d32406512c4618fac77604baefedbe..f8144867014eccf04c892d0ce90a2aa280dfd764 100644
--- a/tensorflow/core/kernels/tile_functor_cpu.cc
+++ b/tensorflow/core/kernels/tile_functor_cpu.cc
@@ -15,10 +15,10 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "tensorflow/core/kernels/tile_functor.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/tile_functor.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/tile_ops_cpu_impl.h b/tensorflow/core/kernels/tile_ops_cpu_impl.h
index a6eed4935d5c4a2aaa8618bab88998d4ce060ecb..df6a666cd441d9c1306d950bbe0e79bf3dae28d9 100644
--- a/tensorflow/core/kernels/tile_ops_cpu_impl.h
+++ b/tensorflow/core/kernels/tile_ops_cpu_impl.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_TILE_OPS_CPU_IMPL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_TILE_OPS_CPU_IMPL_H_
+#ifndef TENSORFLOW_CORE_KERNELS_TILE_OPS_CPU_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_TILE_OPS_CPU_IMPL_H_
 
 #define EIGEN_USE_THREADS
 
@@ -63,9 +63,9 @@ TF_CALL_int64(DEFINE_TYPE);
 
 #undef DEFINE_DIM
 #undef DEFINE_TYPE
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // end namespace functor
 }  // end namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_TILE_OPS_CPU_IMPL_H_
+#endif  // TENSORFLOW_CORE_KERNELS_TILE_OPS_CPU_IMPL_H_
diff --git a/tensorflow/core/kernels/tile_ops_gpu_impl.h b/tensorflow/core/kernels/tile_ops_gpu_impl.h
index 592f99e9b7b5c928c7e522b734186ab0225cd1d0..8da337dabd2e7fc021ec92df97091d15fa39aeab 100644
--- a/tensorflow/core/kernels/tile_ops_gpu_impl.h
+++ b/tensorflow/core/kernels/tile_ops_gpu_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_TILE_OPS_GPU_IMPL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_TILE_OPS_GPU_IMPL_H_
+#ifndef TENSORFLOW_CORE_KERNELS_TILE_OPS_GPU_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_TILE_OPS_GPU_IMPL_H_
 
 // Header used to split up compilation of GPU tile ops.  For each type you want
 // to have tile ops, create a .cu.cc file containing
@@ -56,4 +56,4 @@ limitations under the License.
   }                             \
   }
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_TILE_OPS_GPU_IMPL_H_
+#endif  // TENSORFLOW_CORE_KERNELS_TILE_OPS_GPU_IMPL_H_
diff --git a/tensorflow/core/kernels/topk_op.cc b/tensorflow/core/kernels/topk_op.cc
index 7648536c43d5a14effde0e530711de4dbee430e3..7fdce6cb7190ffa5f799853e27d18b9e33f2971a 100644
--- a/tensorflow/core/kernels/topk_op.cc
+++ b/tensorflow/core/kernels/topk_op.cc
@@ -64,7 +64,9 @@ class TopK : public OpKernel {
                 errors::InvalidArgument("input must be >= 1-D, got shape ",
                                         input_in.shape().DebugString()));
     OP_REQUIRES(context, input_in.dim_size(input_in.dims() - 1) >= k,
-                errors::InvalidArgument("input must have at least k columns"));
+                errors::InvalidArgument(
+                    "input must have at least k columns. Had ",
+                    input_in.dim_size(input_in.dims() - 1), ", needed ", k));
 
     const auto& input = input_in.flat_inner_dims<T>();
 
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index b8d601389ba18b9d37d1adb23c3d7dec9614346f..07befa27bc54631d30e413a15972c560655418e0 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -536,8 +536,9 @@ class ApplyAdadeltaOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    if (use_exclusive_lock_) {
-      mutex_lock l1(*GetTrainingVariableMutex(ctx, 0));
+    mutex* mu = GetTrainingVariableMutex(ctx, 0);
+    if (use_exclusive_lock_ && mu != nullptr) {
+      mutex_lock l1(*mu);
       // Don't try to acquire a lock on the second ref as they share the same
       // mutex.
       //
@@ -682,15 +683,21 @@ class SparseApplyAdadeltaOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
   }
 
-  void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    mutex* mu_var = GetTrainingVariableMutex(ctx, 0);
+  void Compute(OpKernelContext* ctx) override {
+    mutex* mu = GetTrainingVariableMutex(ctx, 0);
     // mu_accum is actually the same mutex as mu_var since currently we use a
     // global mutex.
     //
     // mutex* mu_accum = ctx->input_ref_mutex(1);
-    if (use_exclusive_lock_) {
-      mu_var->lock();
+    if (use_exclusive_lock_ && mu != nullptr) {
+      mutex_lock ml(*mu);
+      DoCompute(ctx);
+    } else {
+      DoCompute(ctx);
     }
+  }
+
+  void DoCompute(OpKernelContext* ctx) {
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
                             ctx, 0, use_exclusive_lock_, true, &var));
@@ -791,9 +798,6 @@ class SparseApplyAdadeltaOp : public OpKernel {
             update.square() * update.constant(static_cast<T>(1) - rho_scalar);
       }
     }
-    if (use_exclusive_lock_) {
-      mu_var->unlock();
-    }
 
     MaybeForwardRefInputToRefOutput(ctx, 0, 0);
   }
@@ -3275,7 +3279,6 @@ REGISTER_KERNELS(double, int64);
 
 #undef REGISTER_KERNELS
 
-
 template <typename Device, typename T>
 class ApplyAddSignOp : public OpKernel {
  public:
@@ -3358,17 +3361,15 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T)                                               \
-  template <>                                                             \
-  void ApplyAddSign<GPUDevice, T>::operator()(                            \
-      const GPUDevice& d,                                                 \
-      typename TTypes<T>::Flat var,                                       \
-      typename TTypes<T>::Flat m,                                         \
-      typename TTypes<T>::ConstScalar lr,                                 \
-      typename TTypes<T>::ConstScalar alpha,                              \
-      typename TTypes<T>::ConstScalar sign_decay,                         \
-      typename TTypes<T>::ConstScalar beta,                               \
-      typename TTypes<T>::ConstFlat grad);                                \
+#define DECLARE_GPU_SPEC(T)                                           \
+  template <>                                                         \
+  void ApplyAddSign<GPUDevice, T>::operator()(                        \
+      const GPUDevice& d, typename TTypes<T>::Flat var,               \
+      typename TTypes<T>::Flat m, typename TTypes<T>::ConstScalar lr, \
+      typename TTypes<T>::ConstScalar alpha,                          \
+      typename TTypes<T>::ConstScalar sign_decay,                     \
+      typename TTypes<T>::ConstScalar beta,                           \
+      typename TTypes<T>::ConstFlat grad);                            \
   extern template struct ApplyAddSign<GPUDevice, T>;
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
@@ -3383,7 +3384,6 @@ REGISTER_KERNELS(GPU, double);
 #undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
-
 template <typename Device, typename T>
 class ApplyPowerSignOp : public OpKernel {
  public:
@@ -3466,17 +3466,15 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T)                                               \
-  template <>                                                             \
-  void ApplyPowerSign<GPUDevice, T>::operator()(                          \
-      const GPUDevice& d,                                                 \
-      typename TTypes<T>::Flat var,                                       \
-      typename TTypes<T>::Flat m,                                         \
-      typename TTypes<T>::ConstScalar lr,                                 \
-      typename TTypes<T>::ConstScalar logbase,                            \
-      typename TTypes<T>::ConstScalar sign_decay,                         \
-      typename TTypes<T>::ConstScalar beta,                               \
-      typename TTypes<T>::ConstFlat grad);                                \
+#define DECLARE_GPU_SPEC(T)                                           \
+  template <>                                                         \
+  void ApplyPowerSign<GPUDevice, T>::operator()(                      \
+      const GPUDevice& d, typename TTypes<T>::Flat var,               \
+      typename TTypes<T>::Flat m, typename TTypes<T>::ConstScalar lr, \
+      typename TTypes<T>::ConstScalar logbase,                        \
+      typename TTypes<T>::ConstScalar sign_decay,                     \
+      typename TTypes<T>::ConstScalar beta,                           \
+      typename TTypes<T>::ConstFlat grad);                            \
   extern template struct ApplyPowerSign<GPUDevice, T>;
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
index d443a6b3c1d0b548e915216adbc05549a66eaeda..0376a3b2c602c13b3082b7762cf61a2b30552199 100644
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/training_ops.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/training_ops.h"
 
 namespace tensorflow {
 
@@ -115,13 +115,11 @@ struct ApplyAdam<GPUDevice, T> {
     Eigen::Sizes<1> single;
     const auto one = static_cast<T>(1.0);
     m.device(d) =
-        m +
-        (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) *
-            (grad - m);
+        m + (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) *
+                (grad - m);
     v.device(d) =
-        v +
-        (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) *
-            (grad.square() - v);
+        v + (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) *
+                (grad.square() - v);
 
     if (use_nesterov) {
       var.device(d) -=
@@ -157,9 +155,9 @@ struct ApplyRMSProp<GPUDevice, T> {
     bcast[0] = grad.dimension(0);
     Eigen::Sizes<1> single;
     const auto one = static_cast<T>(1.0);
-    ms.device(d) = ms +
-                   (rho.constant(one) - rho).reshape(single).broadcast(bcast) *
-                       (grad.square() - ms);
+    ms.device(d) =
+        ms + (rho.constant(one) - rho).reshape(single).broadcast(bcast) *
+                 (grad.square() - ms);
     mom.device(d) =
         mom * momentum.reshape(single).broadcast(bcast) +
         lr.reshape(single).broadcast(bcast) * grad /
@@ -212,7 +210,7 @@ struct ApplyAddSign<GPUDevice, T> {
     auto beta_bcast = beta.reshape(single).broadcast(bcast);
     auto one_minus_beta =
         (beta.constant(one) - beta).reshape(single).broadcast(bcast);
-    m.device(d) =  m * beta_bcast + grad * one_minus_beta;
+    m.device(d) = m * beta_bcast + grad * one_minus_beta;
 
     // The following is the GPU equivalent of the CPU version:
     // var.device(d) -= lr() * (alpha() + sign_decay() * sign_gm) * grad;
@@ -244,7 +242,7 @@ struct ApplyPowerSign<GPUDevice, T> {
     auto beta_bcast = beta.reshape(single).broadcast(bcast);
     auto one_minus_beta =
         (beta.constant(one) - beta).reshape(single).broadcast(bcast);
-    m.device(d) =  m * beta_bcast + grad * one_minus_beta;
+    m.device(d) = m * beta_bcast + grad * one_minus_beta;
 
     // The following is the GPU equivalent of the CPU version:
     // auto grad_scale = (logbase() * sign_decay() * sign_gm).exp();
@@ -253,7 +251,7 @@ struct ApplyPowerSign<GPUDevice, T> {
     auto lr_bcast = lr.reshape(single).broadcast(bcast);
     auto logbase_bcast = logbase.reshape(single).broadcast(bcast);
     auto sign_decay_bcast = sign_decay.reshape(single).broadcast(bcast);
-    auto grad_scale =  (logbase_bcast * sign_decay_bcast * sign_gm).exp();
+    auto grad_scale = (logbase_bcast * sign_decay_bcast * sign_gm).exp();
     var.device(d) -= lr_bcast * grad_scale * grad;
   }
 };
diff --git a/tensorflow/core/kernels/training_ops_test.cc b/tensorflow/core/kernels/training_ops_test.cc
index ffa7f87c9efda0e3288b9fb06d0c9d1a3dcba277..2dcc4a500e6c64753c6fde4f88582f914a50089e 100644
--- a/tensorflow/core/kernels/training_ops_test.cc
+++ b/tensorflow/core/kernels/training_ops_test.cc
@@ -176,8 +176,9 @@ static void Adam(int32 n, Graph** init_g, Graph** train_g) {
     auto beta2 = Scalar(g, 0.99);
     auto epsilon = Scalar(g, 1e-8);
     auto grad = Random(g, n);
-    test::graph::Multi(g, "ApplyAdam", {var, m, v, beta1_power, beta2_power, lr,
-                                        beta1, beta2, epsilon, grad});
+    test::graph::Multi(
+        g, "ApplyAdam",
+        {var, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad});
     *train_g = g;
   }
 }
diff --git a/tensorflow/core/kernels/transpose_functor_cpu.cc b/tensorflow/core/kernels/transpose_functor_cpu.cc
index 41b73fdaf4aced13070164afb81825592637f8c4..5198df7e16e020f0ee19baa387ccae899e21499a 100644
--- a/tensorflow/core/kernels/transpose_functor_cpu.cc
+++ b/tensorflow/core/kernels/transpose_functor_cpu.cc
@@ -88,6 +88,18 @@ struct Transpose<CPUDevice, T, conjugate> {
         internal::TransposeUsingEigen<CPUDevice, T, 5>(d, in, perm, conjugate,
                                                        out);
         break;
+      case 6:
+        internal::TransposeUsingEigen<CPUDevice, T, 6>(d, in, perm, conjugate,
+                                                       out);
+        break;
+      case 7:
+        internal::TransposeUsingEigen<CPUDevice, T, 7>(d, in, perm, conjugate,
+                                                       out);
+        break;
+      case 8:
+        internal::TransposeUsingEigen<CPUDevice, T, 8>(d, in, perm, conjugate,
+                                                       out);
+        break;
       default:
         TransposeSimple<T, conjugate>(d, in, perm, out);
         break;
diff --git a/tensorflow/core/kernels/transpose_functor_gpu.cu.cc b/tensorflow/core/kernels/transpose_functor_gpu.cu.cc
index 493dac9a7ca5a57dba10a3c155299d78e3a69f38..d6a237d6c183cbacf2b5bbbd5f5e9034e84c73af 100644
--- a/tensorflow/core/kernels/transpose_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/transpose_functor_gpu.cu.cc
@@ -201,6 +201,27 @@ struct Transpose<GPUDevice, T, conjugate> {
                                                          out);
         }
         break;
+      case 6:
+        if (!internal::TransposeUsingTile<T, conjugate>::run(d, in, perm,
+                                                             out)) {
+          internal::TransposeUsingEigen<GPUDevice, T, 6>(d, in, perm, conjugate,
+                                                         out);
+        }
+        break;
+      case 7:
+        if (!internal::TransposeUsingTile<T, conjugate>::run(d, in, perm,
+                                                             out)) {
+          internal::TransposeUsingEigen<GPUDevice, T, 7>(d, in, perm, conjugate,
+                                                         out);
+        }
+        break;
+      case 8:
+        if (!internal::TransposeUsingTile<T, conjugate>::run(d, in, perm,
+                                                             out)) {
+          internal::TransposeUsingEigen<GPUDevice, T, 8>(d, in, perm, conjugate,
+                                                         out);
+        }
+        break;
       default:
         internal::TransposeSimple<T, conjugate>(d, in, perm, out);
         break;
diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc
index 96c051c636e54b671fec259d38218dcf7cc0837c..7177ad78884cae85a847a283017511dcad2e4878 100644
--- a/tensorflow/core/kernels/transpose_op.cc
+++ b/tensorflow/core/kernels/transpose_op.cc
@@ -176,9 +176,10 @@ void TransposeOp::Compute(OpKernelContext* ctx) {
     }
   }
   for (int i = 0; i < dims; ++i) {
-    OP_REQUIRES(ctx, bits[i], errors::InvalidArgument(
-                                  i, " is missing from {",
-                                  str_util::Join(permutation, ","), "}."));
+    OP_REQUIRES(
+        ctx, bits[i],
+        errors::InvalidArgument(i, " is missing from {",
+                                str_util::Join(permutation, ","), "}."));
   }
 
   // 0-D, 1-D, and identity transposes do nothing.
@@ -230,7 +231,6 @@ Status ConjugateTransposeCpuOp::DoTranspose(OpKernelContext* ctx,
                               .HostMemory("perm"),    \
                           MklConjugateTransposeCpuOp);
 TF_CALL_ALL_TYPES(REGISTER);
-REGISTER(bfloat16);
 #undef REGISTER
 
 #else  // INTEL_MKL
@@ -247,7 +247,6 @@ REGISTER(bfloat16);
                               .HostMemory("perm"),    \
                           ConjugateTransposeCpuOp);
 TF_CALL_ALL_TYPES(REGISTER)
-REGISTER(bfloat16);
 #undef REGISTER
 #endif  // INTEL_MKL
 
diff --git a/tensorflow/core/kernels/typed_queue.h b/tensorflow/core/kernels/typed_queue.h
index 0d608d9b8799d561141ac2d3378a0f0e3507acfd..43dcb4cef74c568a6bc31abc8c460cff241fc6fa 100644
--- a/tensorflow/core/kernels/typed_queue.h
+++ b/tensorflow/core/kernels/typed_queue.h
@@ -58,9 +58,9 @@ Status TypedQueue<SubQueue>::Initialize() {
   if (!component_shapes_.empty() &&
       component_dtypes_.size() != component_shapes_.size()) {
     return errors::InvalidArgument(
-        "Different number of component types.  ", "Types: ",
-        DataTypeSliceString(component_dtypes_), ", Shapes: ",
-        ShapeListString(component_shapes_));
+        "Different number of component types.  ",
+        "Types: ", DataTypeSliceString(component_dtypes_),
+        ", Shapes: ", ShapeListString(component_shapes_));
   }
 
   mutex_lock lock(mu_);
diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc
index d087784c8a0bd2a53438af4582754b2d47620545..0ef8724b10e492373c7663a58420bfe236be7df7 100644
--- a/tensorflow/core/kernels/unique_op.cc
+++ b/tensorflow/core/kernels/unique_op.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/hash/hash.h"
 
@@ -63,8 +64,17 @@ class UniqueOp : public OpKernel {
         OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()),
                     errors::InvalidArgument("unique expects a 1D vector."));
       } else {
-        auto axis_vec = axis_tensor.vec<int64>();
-        axis = axis_vec(0);
+        OP_REQUIRES(context,
+                    (axis_tensor.dtype() == DT_INT32 ||
+                     axis_tensor.dtype() == DT_INT64),
+                    errors::InvalidArgument(
+                        "axis tensor should be int32 or int64, but got ",
+                        axis_tensor.dtype()));
+        if (axis_tensor.dtype() == DT_INT32) {
+          axis = internal::SubtleMustCopy(axis_tensor.scalar<int32>()());
+        } else {
+          axis = internal::SubtleMustCopy(axis_tensor.scalar<int64>()());
+        }
         axis = axis < 0 ? axis + input.dims() : axis;
         OP_REQUIRES(context, 0 <= axis && axis < input.dims(),
                     errors::InvalidArgument("axis has to be between [0, ",
@@ -83,69 +93,100 @@ class UniqueOp : public OpKernel {
       }
     }
 
-    auto Tin = input.shaped<T, 3>(new_sizes);
-
     Tensor* idx = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(
-                                1, TensorShape({Tin.dimension(1)}), &idx));
+                                1, TensorShape({new_sizes[1]}), &idx));
     auto idx_vec = idx->template vec<TIndex>();
 
-    auto hash_fn = [&Tin](const int64& key) -> unsigned long {
-      size_t h = 0;
-      for (int64 i = 0; i < Tin.dimension(0); i++) {
-        for (int64 j = 0; j < Tin.dimension(2); j++) {
-          h = Hash64Combine(h, hash<T>{}(Tin(i, key, j)));
+    int64 uniq_size;
+    if (new_sizes[0] == 1 && new_sizes[2] == 1) {
+      // Specialized and faster implementation when unique is run over single
+      // elements. Here we put T directly into the map rather than ints pointing
+      // to them as in the general case.
+      auto Tin = input.flat<T>();
+      const int64 N = static_cast<int64>(Tin.size());
+
+      std::unordered_map<T, TIndex> uniq;
+      uniq.reserve(2 * N);
+      for (int64 i = 0, j = 0; i < N; ++i) {
+        auto it = uniq.insert(std::make_pair(Tin(i), j));
+        idx_vec(i) = it.first->second;
+        if (it.second) {
+          ++j;
         }
       }
-      return h;
-    };
 
-    auto equal_to_fn = [&Tin](const int64& lhs, const int64& rhs) {
-      for (int64 i = 0; i < Tin.dimension(0); i++) {
-        for (int64 j = 0; j < Tin.dimension(2); j++) {
-          if (Tin(i, lhs, j) != Tin(i, rhs, j)) {
-            return false;
+      uniq_size = static_cast<int64>(uniq.size());
+      TensorShape output_shape(input.shape());
+      output_shape.set_dim(axis, uniq_size);
+      Tensor* output = nullptr;
+      OP_REQUIRES_OK(context,
+                     context->allocate_output(0, output_shape, &output));
+      auto Tout = output->flat<T>();
+
+      for (auto it : uniq) {
+        Tout(it.second) = it.first;
+      }
+    } else {
+      // General implementation when unique is run over multiple elements.
+      auto Tin = input.shaped<T, 3>(new_sizes);
+
+      auto hash_fn = [&Tin](const int64& key) {
+        size_t h = 0;
+        for (int64 i = 0; i < Tin.dimension(0); i++) {
+          for (int64 j = 0; j < Tin.dimension(2); j++) {
+            h = Hash64Combine(h, hash<T>{}(Tin(i, key, j)));
           }
         }
-      }
-      return true;
-    };
+        return h;
+      };
+
+      auto equal_to_fn = [&Tin](const int64& lhs, const int64& rhs) {
+        for (int64 i = 0; i < Tin.dimension(0); i++) {
+          for (int64 j = 0; j < Tin.dimension(2); j++) {
+            if (Tin(i, lhs, j) != Tin(i, rhs, j)) {
+              return false;
+            }
+          }
+        }
+        return true;
+      };
 
-    std::unordered_map<int64, int64, decltype(hash_fn), decltype(equal_to_fn)>
-        uniq(0, hash_fn, equal_to_fn);
+      std::unordered_map<int64, int64, decltype(hash_fn), decltype(equal_to_fn)>
+          uniq(0, hash_fn, equal_to_fn);
 
-    uniq.reserve(2 * Tin.dimension(1));
+      uniq.reserve(2 * Tin.dimension(1));
 
-    for (int64 i = 0, j = 0; i < Tin.dimension(1); ++i) {
-      auto it = uniq.insert(std::make_pair(i, j));
-      idx_vec(i) = it.first->second;
-      if (it.second) {
-        ++j;
+      for (int64 i = 0, j = 0; i < Tin.dimension(1); ++i) {
+        auto it = uniq.insert(std::make_pair(i, j));
+        idx_vec(i) = it.first->second;
+        if (it.second) {
+          ++j;
+        }
       }
-    }
 
-    int64 uniq_size = static_cast<int64>(uniq.size());
-    new_sizes[1] = uniq_size;
-    TensorShape output_shape(input.shape());
-    output_shape.set_dim(axis, uniq_size);
-    Tensor* output = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
-    auto Tout = output->shaped<T, 3>(new_sizes);
+      uniq_size = static_cast<int64>(uniq.size());
+      new_sizes[1] = uniq_size;
+      TensorShape output_shape(input.shape());
+      output_shape.set_dim(axis, uniq_size);
+      Tensor* output = nullptr;
+      OP_REQUIRES_OK(context,
+                     context->allocate_output(0, output_shape, &output));
+      auto Tout = output->shaped<T, 3>(new_sizes);
 
-    for (auto it : uniq) {
-      for (int64 i = 0; i < Tin.dimension(0); i++) {
-        for (int64 j = 0; j < Tin.dimension(2); j++) {
-          Tout(i, it.second, j) = Tin(i, it.first, j);
-        }
+      for (auto it : uniq) {
+        Tout.chip(it.second, 1) = Tin.chip(it.first, 1);
       }
     }
 
     if (num_outputs() > 2) {
+      Tensor* output = nullptr;
       OP_REQUIRES_OK(context, context->allocate_output(
                                   2, TensorShape({uniq_size}), &output));
       auto count_output_vec = output->template vec<TIndex>();
       count_output_vec.setZero();
-      for (int64 i = 0; i < Tin.dimension(1); ++i) {
+      const int N = idx_vec.size();
+      for (int64 i = 0; i < N; ++i) {
         count_output_vec(idx_vec(i))++;
       }
     }
diff --git a/tensorflow/core/kernels/unpack_op.cc b/tensorflow/core/kernels/unpack_op.cc
index 7fd1def1fe02e8418882bc4cb19c4318779c5282..764b6a252adf09c13511a01f95332857f46eee96 100644
--- a/tensorflow/core/kernels/unpack_op.cc
+++ b/tensorflow/core/kernels/unpack_op.cc
@@ -34,7 +34,7 @@ typedef Eigen::GpuDevice GPUDevice;
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 class UnpackOp : public OpKernel {
@@ -65,8 +65,9 @@ class UnpackOp : public OpKernel {
     output_shape.RemoveDim(axis);
     const int64 output_size = output_shape.num_elements();
     OP_REQUIRES(
-        context, FastBoundsCheck(output_size,
-                                 std::numeric_limits<Eigen::DenseIndex>::max()),
+        context,
+        FastBoundsCheck(output_size,
+                        std::numeric_limits<Eigen::DenseIndex>::max()),
         errors::InvalidArgument("output size must fit in Eigen DenseIndex"));
 
 // This optimization is currently not applicable for SYCL devices
@@ -142,6 +143,7 @@ TF_CALL_ALL_TYPES(REGISTER_UNPACK);
       UnpackOp<GPUDevice, type>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+TF_CALL_bfloat16(REGISTER_GPU);
 #undef REGISTER_GPU
 
 // A special GPU kernel for int32.
@@ -153,6 +155,12 @@ REGISTER_KERNEL_BUILDER(Name("Unpack")
                             .HostMemory("output")
                             .TypeConstraint<int32>("T"),
                         UnpackOp<CPUDevice, int32>);
+REGISTER_KERNEL_BUILDER(Name("Unpack")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("value")
+                            .HostMemory("output")
+                            .TypeConstraint<int64>("T"),
+                        UnpackOp<CPUDevice, int64>);
 
 #endif  // GOOGLE_CUDA
 
@@ -170,6 +178,13 @@ REGISTER_KERNEL_BUILDER(Name("Unpack")
                             .HostMemory("output")
                             .TypeConstraint<int32>("T"),
                         UnpackOp<CPUDevice, int32>);
+
+REGISTER_KERNEL_BUILDER(Name("Unpack")
+                            .Device(DEVICE_SYCL)
+                            .HostMemory("value")
+                            .HostMemory("output")
+                            .TypeConstraint<int64>("T"),
+                        UnpackOp<CPUDevice, int64>);
 #undef REGISTER_SYCL
 #endif  // TENSORFLOW_USE_SYCL
 
diff --git a/tensorflow/core/kernels/unravel_index_op.cc b/tensorflow/core/kernels/unravel_index_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..62e814ff773ccb2ee3d7e9445966f5d805817802
--- /dev/null
+++ b/tensorflow/core/kernels/unravel_index_op.cc
@@ -0,0 +1,122 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+namespace tensorflow {
+
+namespace {
+template <typename T>
+struct mod_op {
+  const T operator()(const T& a, const T& b) const { return a % b; }
+};
+}  // namespace
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Tidx>
+class UnravelIndexOp : public OpKernel {
+ public:
+  explicit UnravelIndexOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& indices_tensor = ctx->input(0);
+    OP_REQUIRES(ctx,
+                TensorShapeUtils::IsVector(indices_tensor.shape()) ||
+                    TensorShapeUtils::IsScalar(indices_tensor.shape()),
+                errors::InvalidArgument(
+                    "The indices can only be scalar or vector, got \"",
+                    indices_tensor.shape().DebugString(), "\""));
+
+    const Tensor& dims_tensor = ctx->input(1);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsVector(dims_tensor.shape()),
+        errors::InvalidArgument("The indices can only be 1-D, got \"",
+                                dims_tensor.shape().DebugString(), "\""));
+
+    auto dims = dims_tensor.vec<Tidx>();
+
+    Eigen::array<bool, 1> reverse({true});
+
+    Tensor strides_tensor;
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_temp(DataTypeToEnum<Tidx>::value,
+                                      TensorShape({dims_tensor.NumElements()}),
+                                      &strides_tensor));
+
+    auto strides = strides_tensor.vec<Tidx>();
+    strides = dims.reverse(reverse)
+                  .scan(0, Eigen::internal::ProdReducer<Tidx>(), false)
+                  .reverse(reverse);
+
+    Tensor strides_shifted_tensor;
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_temp(DataTypeToEnum<Tidx>::value,
+                                      TensorShape({dims_tensor.NumElements()}),
+                                      &strides_shifted_tensor));
+
+    auto strides_shifted = strides_shifted_tensor.vec<Tidx>();
+    strides_shifted = dims.reverse(reverse)
+                          .scan(0, Eigen::internal::ProdReducer<Tidx>(), true)
+                          .reverse(reverse);
+
+    Tensor* output_tensor = nullptr;
+    if (TensorShapeUtils::IsScalar(indices_tensor.shape())) {
+      OP_REQUIRES_OK(
+          ctx, ctx->allocate_output(0, TensorShape({dims_tensor.NumElements()}),
+                                    &output_tensor));
+
+      auto output = output_tensor->vec<Tidx>();
+
+      output = output.constant(indices_tensor.scalar<Tidx>()());
+      output = output.binaryExpr(strides, mod_op<Tidx>()) / strides_shifted;
+    } else {
+      OP_REQUIRES_OK(
+          ctx, ctx->allocate_output(0,
+                                    TensorShape({dims_tensor.NumElements(),
+                                                 indices_tensor.NumElements()}),
+                                    &output_tensor));
+
+      auto output = output_tensor->matrix<Tidx>();
+
+      Eigen::array<int64, 2> reshape{{dims_tensor.NumElements(), 1}};
+      Eigen::array<int64, 2> bcast({1, indices_tensor.NumElements()});
+      Eigen::array<int64, 2> indices_reshape{{1, indices_tensor.NumElements()}};
+      Eigen::array<int64, 2> indices_bcast({dims_tensor.NumElements(), 1});
+
+      output = indices_tensor.vec<Tidx>()
+                   .reshape(indices_reshape)
+                   .broadcast(indices_bcast);
+      output = output.binaryExpr(strides.reshape(reshape).broadcast(bcast),
+                                 mod_op<Tidx>()) /
+               strides_shifted.reshape(reshape).broadcast(bcast);
+    }
+  }
+};
+
+#define REGISTER_KERNEL(type)                                               \
+  REGISTER_KERNEL_BUILDER(                                                  \
+      Name("UnravelIndex").Device(DEVICE_CPU).TypeConstraint<type>("Tidx"), \
+      UnravelIndexOp<type>);
+TF_CALL_int32(REGISTER_KERNEL) TF_CALL_int64(REGISTER_KERNEL)
+#undef REGISTER_KERNEL
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc
index 36b8ff09d7381a0b8bbb8b6f8d71b14e47fa4663..10ccc85b7cd63db7f8d329a4253784abed7174cf 100644
--- a/tensorflow/core/kernels/variable_ops.cc
+++ b/tensorflow/core/kernels/variable_ops.cc
@@ -23,6 +23,160 @@ limitations under the License.
 
 namespace tensorflow {
 
+// Resource stored by variables in the resource manager
+// (legacy, ref-style version).
+class LegacyVar : public ResourceBase {
+ public:
+  explicit LegacyVar(DataType dtype) : tensor_(dtype) {}
+  // Not copyable or movable.
+  LegacyVar(const LegacyVar&) = delete;
+  LegacyVar& operator=(const LegacyVar&) = delete;
+
+  mutex* mu() { return &mu_; }
+  Tensor* tensor() { return &tensor_; }
+
+  string DebugString() override {
+    return strings::StrCat(DataTypeString(tensor_.dtype()), "/",
+                           tensor_.shape().DebugString());
+  }
+
+ private:
+  mutex mu_;
+  Tensor tensor_;
+
+  ~LegacyVar() override {}
+};
+
+VariableOp::VariableOp(OpKernelConstruction* context) : OpKernel(context) {
+  OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_));
+  dtype_ = RemoveRefType(context->output_type(0));
+}
+
+void VariableOp::Compute(OpKernelContext* ctx) {
+  mutex_lock l(init_mu_);
+  if (!initialized_) {
+    OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def(),
+                                    true /* use name() */));
+    initialized_ = true;
+  }
+  auto creator = [this](LegacyVar** var) {
+    *var = new LegacyVar(dtype_);
+    (*var)->tensor()->set_shape(shape_);
+    return Status::OK();
+  };
+  LegacyVar* var;
+  OP_REQUIRES_OK(ctx, cinfo_.resource_manager()->LookupOrCreate<LegacyVar>(
+                          cinfo_.container(), cinfo_.name(), &var, creator));
+  // Output a reference to our tensor, so it may be updated.
+  //
+  // As long as the resource manager hasn't been cleared the ref we return
+  // here is valid because it owns a ref on var.
+  ctx->set_output_ref(0, var->mu(), var->tensor());
+  if (ctx->track_allocations() && var->tensor()->IsInitialized()) {
+    AllocatorAttributes attr;
+    attr.set_gpu_compatible(true);
+    attr.set_nic_compatible(true);
+    ctx->record_persistent_memory_allocation(var->tensor()->AllocatedBytes());
+  }
+  var->Unref();
+}
+
+class TemporaryVariableOp : public OpKernel {
+ public:
+  explicit TemporaryVariableOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_));
+    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
+    OP_REQUIRES_OK(context, context->GetAttr("var_name", &var_name_));
+    // Variable name defaults to op name if not specified explicitly.
+    if (var_name_.empty()) var_name_ = name();
+  }
+
+  void Compute(OpKernelContext* context) override {
+    Status s;
+    ResourceMgr* rm = context->resource_manager();
+    OP_REQUIRES(context, rm, errors::Internal("No per-step resource manager."));
+    auto* tmp_var = new TmpVar;
+    OP_REQUIRES(context, tmp_var,
+                errors::ResourceExhausted("Could not allocate TmpVar."));
+    tmp_var->name = var_name_;
+    s = context->allocate_temp(dtype_, shape_, &tmp_var->val);
+    if (!s.ok()) tmp_var->Unref();
+    OP_REQUIRES_OK(context, s);
+    OP_REQUIRES_OK(context, rm->Create(context->step_container()->name(),
+                                       var_name_, tmp_var));
+    context->set_output_ref(0, &tmp_var->mu, &tmp_var->val);
+    if (context->track_allocations()) {
+      context->record_persistent_memory_allocation(
+          tmp_var->val.AllocatedBytes());
+    }
+  }
+
+ private:
+  // Refcounted temporary variable resource.
+  friend class DestroyTemporaryVariableOp;
+  struct TmpVar : public ResourceBase {
+    mutex mu;
+    Tensor val;
+    string name;
+    string DebugString() override { return name; }
+    ~TmpVar() override { VLOG(3) << "TmpVar " << name << " deleted"; }
+  };
+
+  TensorShape shape_;
+  DataType dtype_;
+  string var_name_;
+};
+
+class DestroyTemporaryVariableOp : public OpKernel {
+ public:
+  explicit DestroyTemporaryVariableOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES(context, IsRefType(context->input_type(0)),
+                errors::InvalidArgument("lhs input needs to be a ref type"));
+    OP_REQUIRES_OK(context, context->GetAttr("var_name", &var_name_));
+    OP_REQUIRES(context, !var_name_.empty(),
+                errors::InvalidArgument("Missing var_name attribute"));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    // NOTE(pbar): All other mutators of the Tensor Ref *must* have completed
+    // their execution before this DestroyTemporaryVariable op executes.
+    // This is typically achieved using control dependencies.
+    CHECK(IsRefType(context->input_dtype(0)));
+    Tensor tmpvar = context->mutable_input(0, false);
+    context->set_output(0, tmpvar);
+    ResourceMgr* rm = context->resource_manager();
+    OP_REQUIRES(context, rm, errors::Internal("No per-step resource manager."));
+    OP_REQUIRES_OK(context, rm->Delete<TemporaryVariableOp::TmpVar>(
+                                context->step_container()->name(), var_name_));
+    if (context->track_allocations()) {
+      context->record_persistent_memory_allocation(
+          -static_cast<int64>(tmpvar.AllocatedBytes()));
+    }
+  }
+
+ private:
+  string var_name_;
+};
+
+class IsVariableInitializedOp : public OpKernel {
+ public:
+  explicit IsVariableInitializedOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    // Get a mutable input tensor of the Ref input.
+    const Tensor& input_tensor = context->mutable_input(0, false);
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, TensorShape({}), &output));
+    auto output_tensor = output->tensor<bool, 0>();
+    bool result = input_tensor.IsInitialized();
+    output_tensor() = result;
+  }
+};
+
 REGISTER_KERNEL_BUILDER(Name("Variable").Device(DEVICE_CPU), VariableOp);
 REGISTER_KERNEL_BUILDER(Name("VariableV2").Device(DEVICE_CPU), VariableOp);
 REGISTER_KERNEL_BUILDER(Name("TemporaryVariable").Device(DEVICE_CPU),
@@ -33,30 +187,30 @@ REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized").Device(DEVICE_CPU),
                         IsVariableInitializedOp);
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)                                         \
-  REGISTER_KERNEL_BUILDER(                                                 \
-      Name("Variable").Device(DEVICE_SYCL).TypeConstraint<type>("dtype"),  \
-      VariableOp);                                                         \
-  REGISTER_KERNEL_BUILDER(                                                 \
-      Name("VariableV2").Device(DEVICE_SYCL).TypeConstraint<type>("dtype"),\
-      VariableOp);                                                         \
-  REGISTER_KERNEL_BUILDER(Name("TemporaryVariable")                        \
-                              .Device(DEVICE_SYCL)                         \
-                              .TypeConstraint<type>("dtype"),              \
-                          TemporaryVariableOp);                            \
-  REGISTER_KERNEL_BUILDER(Name("DestroyTemporaryVariable")                 \
-                              .Device(DEVICE_SYCL)                         \
-                              .TypeConstraint<type>("T"),                  \
-                          DestroyTemporaryVariableOp);                     \
-  REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized")                    \
-                              .Device(DEVICE_SYCL)                         \
-                              .TypeConstraint<type>("dtype")               \
-                              .HostMemory("is_initialized"),               \
+#define REGISTER_SYCL_KERNEL(type)                                          \
+  REGISTER_KERNEL_BUILDER(                                                  \
+      Name("Variable").Device(DEVICE_SYCL).TypeConstraint<type>("dtype"),   \
+      VariableOp);                                                          \
+  REGISTER_KERNEL_BUILDER(                                                  \
+      Name("VariableV2").Device(DEVICE_SYCL).TypeConstraint<type>("dtype"), \
+      VariableOp);                                                          \
+  REGISTER_KERNEL_BUILDER(Name("TemporaryVariable")                         \
+                              .Device(DEVICE_SYCL)                          \
+                              .TypeConstraint<type>("dtype"),               \
+                          TemporaryVariableOp);                             \
+  REGISTER_KERNEL_BUILDER(Name("DestroyTemporaryVariable")                  \
+                              .Device(DEVICE_SYCL)                          \
+                              .TypeConstraint<type>("T"),                   \
+                          DestroyTemporaryVariableOp);                      \
+  REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized")                     \
+                              .Device(DEVICE_SYCL)                          \
+                              .TypeConstraint<type>("dtype")                \
+                              .HostMemory("is_initialized"),                \
                           IsVariableInitializedOp);
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNEL);
 #undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 // Only register 'Variable' on GPU for the subset of types also supported by
diff --git a/tensorflow/core/kernels/variable_ops.h b/tensorflow/core/kernels/variable_ops.h
index 355140d44c5c53c8496d5bd2b3028e9ae9b3940b..83134bad378bfef18c3e93be5cc3c6b70ab4f523 100644
--- a/tensorflow/core/kernels/variable_ops.h
+++ b/tensorflow/core/kernels/variable_ops.h
@@ -27,10 +27,16 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Resource stored by variables in the resource manager.
+// Resource stored by variables in the resource manager
+// (new, resource-style version).
 class Var : public ResourceBase {
  public:
   explicit Var(DataType dtype) : tensor_(dtype) {}
+  // Not copyable or movable.
+  Var(const Var&) = delete;
+  Var& operator=(const Var&) = delete;
+
+  // TODO(ebrevdo): Use LockSet instead of exposing mu.
   mutex* mu() { return &mu_; }
   Tensor* tensor() { return &tensor_; }
 
@@ -44,52 +50,12 @@ class Var : public ResourceBase {
   Tensor tensor_;
 
   ~Var() override {}
-  TF_DISALLOW_COPY_AND_ASSIGN(Var);
 };
 
 class VariableOp : public OpKernel {
  public:
-  explicit VariableOp(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_));
-    dtype_ = RemoveRefType(context->output_type(0));
-  }
-
-  void Compute(OpKernelContext* ctx) override {
-    mutex_lock l(init_mu_);
-    if (!initialized_) {
-      OP_REQUIRES_OK(
-          ctx,
-          cinfo_.Init(ctx->resource_manager(), def(), true /* use name() */));
-      initialized_ = true;
-    }
-    auto creator = [this](Var** var) {
-      *var = new Var(dtype_);
-      (*var)->tensor()->set_shape(shape_);
-      return Status::OK();
-    };
-    Var* var;
-    OP_REQUIRES_OK(ctx,
-                   cinfo_.resource_manager()->LookupOrCreate<Var>(
-                       cinfo_.container(), cinfo_.name(), &var, creator));
-    // Output a reference to our tensor, so it may be updated.
-    //
-    // As long as the resource manager hasn't been cleared the ref we return
-    // here is valid because it owns a ref on var.
-    ctx->set_output_ref(0, var->mu(), var->tensor());
-    if (ctx->track_allocations() && var->tensor()->IsInitialized()) {
-      AllocatorAttributes attr;
-      attr.set_gpu_compatible(true);
-      attr.set_nic_compatible(true);
-      if (ctx->allocate_on_host(attr)) {
-        ctx->record_host_persistent_memory_allocation(
-            var->tensor()->AllocatedBytes());
-      } else {
-        ctx->record_device_persistent_memory_allocation(
-            var->tensor()->AllocatedBytes());
-      }
-    }
-    var->Unref();
-  }
+  explicit VariableOp(OpKernelConstruction* context);
+  void Compute(OpKernelContext* ctx) override;
 
  private:
   DataType dtype_;
@@ -102,112 +68,6 @@ class VariableOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(VariableOp);
 };
 
-class TemporaryVariableOp : public OpKernel {
- public:
-  explicit TemporaryVariableOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_));
-    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
-    OP_REQUIRES_OK(context, context->GetAttr("var_name", &var_name_));
-    // Variable name defaults to op name if not specified explicitly.
-    if (var_name_ == "") var_name_ = name();
-  }
-
-  void Compute(OpKernelContext* context) override {
-    Status s;
-    ResourceMgr* rm = context->resource_manager();
-    OP_REQUIRES(context, rm, errors::Internal("No per-step resource manager."));
-    auto* tmp_var = new TmpVar;
-    OP_REQUIRES(context, tmp_var,
-                errors::ResourceExhausted("Could not allocate TmpVar."));
-    tmp_var->name = var_name_;
-    s = context->allocate_temp(dtype_, shape_, &tmp_var->val);
-    if (!s.ok()) tmp_var->Unref();
-    OP_REQUIRES_OK(context, s);
-    OP_REQUIRES_OK(context, rm->Create(context->step_container()->name(),
-                                       var_name_, tmp_var));
-    context->set_output_ref(0, &tmp_var->mu, &tmp_var->val);
-    if (context->track_allocations()) {
-      AllocatorAttributes attr;
-      if (context->allocate_on_host(attr)) {
-        context->record_host_persistent_memory_allocation(
-            tmp_var->val.AllocatedBytes());
-      } else {
-        context->record_device_persistent_memory_allocation(
-            tmp_var->val.AllocatedBytes());
-      }
-    }
-  }
-
- private:
-  // Refcounted temporary variable resource.
-  friend class DestroyTemporaryVariableOp;
-  struct TmpVar : public ResourceBase {
-    mutex mu;
-    Tensor val;
-    string name;
-    string DebugString() override { return name; }
-    ~TmpVar() override { VLOG(3) << "TmpVar " << name << " deleted"; }
-  };
-
-  TensorShape shape_;
-  DataType dtype_;
-  string var_name_;
-};
-
-class DestroyTemporaryVariableOp : public OpKernel {
- public:
-  explicit DestroyTemporaryVariableOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES(context, IsRefType(context->input_type(0)),
-                errors::InvalidArgument("lhs input needs to be a ref type"))
-    OP_REQUIRES_OK(context, context->GetAttr("var_name", &var_name_));
-    OP_REQUIRES(context, var_name_ != "",
-                errors::InvalidArgument("Missing var_name attribute"));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    // NOTE(pbar): All other mutators of the Tensor Ref *must* have completed
-    // their execution before this DestroyTemporaryVariable op executes.
-    // This is typically achieved using control dependencies.
-    CHECK(IsRefType(context->input_dtype(0)));
-    Tensor tmpvar = context->mutable_input(0, false);
-    context->set_output(0, tmpvar);
-    ResourceMgr* rm = context->resource_manager();
-    OP_REQUIRES(context, rm, errors::Internal("No per-step resource manager."));
-    OP_REQUIRES_OK(context, rm->Delete<TemporaryVariableOp::TmpVar>(
-                                context->step_container()->name(), var_name_));
-    if (context->track_allocations()) {
-      if (context->allocate_on_host(AllocatorAttributes())) {
-        context->record_host_persistent_memory_allocation(
-            -static_cast<int64>(tmpvar.AllocatedBytes()));
-      } else {
-        context->record_device_persistent_memory_allocation(
-            -static_cast<int64>(tmpvar.AllocatedBytes()));
-      }
-    }
-  }
-
- private:
-  string var_name_;
-};
-
-class IsVariableInitializedOp : public OpKernel {
- public:
-  IsVariableInitializedOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    // Get a mutable input tensor of the Ref input.
-    const Tensor& input_tensor = context->mutable_input(0, false);
-    Tensor* output = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, TensorShape({}), &output));
-    auto output_tensor = output->tensor<bool, 0>();
-    bool result = input_tensor.IsInitialized();
-    output_tensor() = result;
-  }
-};
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_VARIABLE_OPS_H_
diff --git a/tensorflow/core/kernels/where_op.cc b/tensorflow/core/kernels/where_op.cc
index 42d1365e64592c6609c6daf83678f7dbd056a23f..f92c4ed17af501eaf79523bc6977e614b8168720 100644
--- a/tensorflow/core/kernels/where_op.cc
+++ b/tensorflow/core/kernels/where_op.cc
@@ -55,14 +55,14 @@ namespace functor {
 namespace {
 template <typename T>
 int64 CountAccumulator(const T* begin, const T* end) {
-  return std::accumulate(begin, end, 0L, [](int64 accum, const T& val) {
+  return std::accumulate(begin, end, 0LL, [](int64 accum, const T& val) {
     return accum + (val != T(0));
   });
 }
 
 template <>
 int64 CountAccumulator<bool>(const bool* begin, const bool* end) {
-  return std::accumulate(begin, end, 0L);
+  return std::accumulate(begin, end, 0LL);
 }
 
 }  // namespace
@@ -131,7 +131,7 @@ class WhereCPUOp : public OpKernel {
     OP_REQUIRES(
         context, input.dtype() != DT_HALF,
         errors::Unimplemented("No WhereOp available for float16/half type on "
-                              "GPU; dying in CPU WhereOp to avoid silently "
+                              "CPU; dying in CPU WhereOp to avoid silently "
                               "creating costly copies from device."));
 
     const int input_dims = input.dims();
diff --git a/tensorflow/core/kernels/winograd_transform.h b/tensorflow/core/kernels/winograd_transform.h
index 5caee9fdc14ddeeae5adbf9fa22cfc04ac53b58a..d22710e503285150cf270f5c4e32796f275171a0 100644
--- a/tensorflow/core/kernels/winograd_transform.h
+++ b/tensorflow/core/kernels/winograd_transform.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_WINOGRAD_TRANSFORM_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_WINOGRAD_TRANSFORM_H_
+#ifndef TENSORFLOW_CORE_KERNELS_WINOGRAD_TRANSFORM_H_
+#define TENSORFLOW_CORE_KERNELS_WINOGRAD_TRANSFORM_H_
 
 #include "tensorflow/core/kernels/deep_conv2d.h"
 
@@ -374,4 +374,4 @@ void WinogradTransform<T>::GetOutputTransformMatrix(const int64 rows,
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_WINOGRAD_TRANSFORM_H_
+#endif  // TENSORFLOW_CORE_KERNELS_WINOGRAD_TRANSFORM_H_
diff --git a/tensorflow/core/kernels/word2vec_kernels.cc b/tensorflow/core/kernels/word2vec_kernels.cc
index 2d05d72bff162f98e8d13e8a3208e4dd00a48fa4..3477445197a961b275e3efb8ce09d5b075342f9e 100644
--- a/tensorflow/core/kernels/word2vec_kernels.cc
+++ b/tensorflow/core/kernels/word2vec_kernels.cc
@@ -188,9 +188,9 @@ class SkipgramOp : public OpKernel {
       ++corpus_size_;
     }
     if (corpus_size_ < window_size_ * 10) {
-      return errors::InvalidArgument("The text file ", filename,
-                                     " contains too little data: ",
-                                     corpus_size_, " words");
+      return errors::InvalidArgument(
+          "The text file ", filename,
+          " contains too little data: ", corpus_size_, " words");
     }
     typedef std::pair<string, int32> WordFreq;
     std::vector<WordFreq> ordered;
diff --git a/tensorflow/core/kernels/xent_op.cc b/tensorflow/core/kernels/xent_op.cc
index dc21cee3a8a5a76d8fe5d0d88eae03e7cede3f58..a6a71fdfaf126410b26766954c0c2fc5b86d003a 100644
--- a/tensorflow/core/kernels/xent_op.cc
+++ b/tensorflow/core/kernels/xent_op.cc
@@ -30,7 +30,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <typename Device, typename T>
 class SoftmaxXentWithLogitsOp : public OpKernel {
@@ -44,8 +44,8 @@ class SoftmaxXentWithLogitsOp : public OpKernel {
     OP_REQUIRES(context, logits_in.IsSameSize(labels_in),
                 errors::InvalidArgument(
                     "logits and labels must be same size: logits_size=",
-                    logits_in.shape().DebugString(), " labels_size=",
-                    labels_in.shape().DebugString()));
+                    logits_in.shape().DebugString(),
+                    " labels_size=", labels_in.shape().DebugString()));
     OP_REQUIRES(context, TensorShapeUtils::IsMatrix(logits_in.shape()),
                 errors::InvalidArgument("logits must be 2-dimensional"));
     // As we already tested that both inputs have the same shape no need to
@@ -67,10 +67,12 @@ class SoftmaxXentWithLogitsOp : public OpKernel {
     // Try to reuse the logits_in buffer for the backprop output.
     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
                                 {0}, 1, logits_in.shape(), &back_out));
-    functor::XentFunctor<Device, T> functor;
-    functor(context->eigen_device<Device>(), logits_in.matrix<T>(),
-            labels_in.matrix<T>(), scratch.matrix<T>(), loss_out->vec<T>(),
-            back_out->matrix<T>());
+    if (logits_in.dim_size(0) > 0) {
+      functor::XentFunctor<Device, T> functor;
+      functor(context->eigen_device<Device>(), logits_in.matrix<T>(),
+              labels_in.matrix<T>(), scratch.matrix<T>(), loss_out->vec<T>(),
+              back_out->matrix<T>());
+    }
   }
 };
 
@@ -85,7 +87,7 @@ struct XentFunctorBase {
                   typename TTypes<T>::Vec loss,
                   typename TTypes<T>::Matrix backprop) {
     XentEigenImpl<Device, T>::Compute(d, logits, labels, scratch, loss,
-                                         backprop);
+                                      backprop);
   }
 };
 
@@ -95,7 +97,7 @@ struct XentFunctor<CPUDevice, T> : XentFunctorBase<CPUDevice, T> {};
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
 struct XentFunctor<SYCLDevice, T> : XentFunctorBase<SYCLDevice, T> {};
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace functor
 
 #define REGISTER_CPU(T)                                         \
@@ -127,6 +129,6 @@ REGISTER_KERNEL_BUILDER(Name("SoftmaxCrossEntropyWithLogits")
                             .Device(DEVICE_SYCL)
                             .TypeConstraint<float>("T"),
                         SoftmaxXentWithLogitsOp<SYCLDevice, float>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/xsmm_conv2d.h b/tensorflow/core/kernels/xsmm_conv2d.h
index b439511dc78b46dc90eb8523b98b42d9ba1de45a..003291329a8d3c4062aee00c5b5e1ab8e0ebf8c2 100644
--- a/tensorflow/core/kernels/xsmm_conv2d.h
+++ b/tensorflow/core/kernels/xsmm_conv2d.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_XSMM_CONV2D_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_XSMM_CONV2D_H_
+#ifndef TENSORFLOW_CORE_KERNELS_XSMM_CONV2D_H_
+#define TENSORFLOW_CORE_KERNELS_XSMM_CONV2D_H_
 
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/util/tensor_format.h"
@@ -57,4 +57,4 @@ struct XsmmBkwFilterConv2D {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_XSMM_CONV2D_H_
+#endif  // TENSORFLOW_CORE_KERNELS_XSMM_CONV2D_H_
diff --git a/tensorflow/core/kernels/xsmm_conv2d_test.cc b/tensorflow/core/kernels/xsmm_conv2d_test.cc
index e29470124674636a0e125a5cd1b856a467f4c6f0..481f3b7ba46bac42a276d46e60c11f34bc163e3b 100644
--- a/tensorflow/core/kernels/xsmm_conv2d_test.cc
+++ b/tensorflow/core/kernels/xsmm_conv2d_test.cc
@@ -13,18 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/kernels/conv_ops.h"
-#include "tensorflow/core/platform/test.h"
+#include "include/libxsmm.h"
+#include "tensorflow/core/framework/fake_input.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/kernels/conv_ops.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
-#include "include/libxsmm.h"
-#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
 namespace {
 
-
 typedef struct {
   int nImg;
   int nIfm;
@@ -49,45 +48,41 @@ typedef struct {
   int stride_w;
 } naive_conv_t;
 
-
-LIBXSMM_INLINE void naive_copy_NCHW_to_NHWC(const float* nchw, Tensor &nhwc, int N, int H, int W, int C)
-{
-  LIBXSMM_VLA_DECL(4, const float,  input, nchw, C, H, W);
+LIBXSMM_INLINE void naive_copy_NCHW_to_NHWC(const float* nchw, Tensor& nhwc,
+                                            int N, int H, int W, int C) {
+  LIBXSMM_VLA_DECL(4, const float, input, nchw, C, H, W);
   int n, h, w, c;
-  auto output =  nhwc.flat<float>();
-  for ( n = 0; n < N; n++ ) {
-    for ( h = 0; h < H; h++ ) {
-      for ( w = 0; w < W; w++ ) {
-        for ( c = 0; c < C; c++ ) {
-          output(n*H*W*C + h*W*C +w*C + c)  =
-          LIBXSMM_VLA_ACCESS(4,  input, n, c, h, w, C, H, W);
+  auto output = nhwc.flat<float>();
+  for (n = 0; n < N; n++) {
+    for (h = 0; h < H; h++) {
+      for (w = 0; w < W; w++) {
+        for (c = 0; c < C; c++) {
+          output(n * H * W * C + h * W * C + w * C + c) =
+              LIBXSMM_VLA_ACCESS(4, input, n, c, h, w, C, H, W);
         }
       }
     }
   }
 }
 
-
-LIBXSMM_INLINE void naive_copy_KCRS_to_RSCK(const float* kcrs, Tensor  &rsck, int R, int S, int C, int K)
-{
-  LIBXSMM_VLA_DECL(4, const float,  input, kcrs, C, R, S);
+LIBXSMM_INLINE void naive_copy_KCRS_to_RSCK(const float* kcrs, Tensor& rsck,
+                                            int R, int S, int C, int K) {
+  LIBXSMM_VLA_DECL(4, const float, input, kcrs, C, R, S);
   int r, s, c, k;
-  auto output =  rsck.flat<float>();
-
-  for ( r = 0; r < R; r++ ) {
-    for ( s = 0; s < S; s++ ) {
-      for ( c = 0; c < C; c++ ) {
-        for ( k = 0; k < K; k++ ) {
-          output(r*S*C*K + s*C*K + c*K + k) =
-          LIBXSMM_VLA_ACCESS(4,  input, k, c, r, s, C, R, S);
+  auto output = rsck.flat<float>();
+
+  for (r = 0; r < R; r++) {
+    for (s = 0; s < S; s++) {
+      for (c = 0; c < C; c++) {
+        for (k = 0; k < K; k++) {
+          output(r * S * C * K + s * C * K + c * K + k) =
+              LIBXSMM_VLA_ACCESS(4, input, k, c, r, s, C, R, S);
         }
       }
     }
   }
 }
 
-
-
 LIBXSMM_INLINE void zero_buf(float* buf, long size) {
   int i;
   for (i = 0; i < size; ++i) {
@@ -95,52 +90,53 @@ LIBXSMM_INLINE void zero_buf(float* buf, long size) {
   }
 }
 
-LIBXSMM_INLINE void copy_buf(Tensor &dst,float *src,long size) {
-  long  i;
-  auto output =  dst.flat<float>();
-  for (i = 0; i < size; ++i)
-          output(i) = src[i];
+LIBXSMM_INLINE void copy_buf(Tensor& dst, float* src, long size) {
+  long i;
+  auto output = dst.flat<float>();
+  for (i = 0; i < size; ++i) output(i) = src[i];
 }
 
-LIBXSMM_INLINE void init_buf(float* buf, long size, int initPos, int initOne)
-{
+LIBXSMM_INLINE void init_buf(float* buf, long size, int initPos, int initOne) {
   int i;
   zero_buf(buf, size);
   for (i = 0; i < size; ++i) {
-    buf[i] = (float)((initOne != 0) ? 1.0 : ((initPos != 0) ? drand48() : (0.05 - drand48()/10.0)));
+    buf[i] =
+        (float)((initOne != 0)
+                    ? 1.0
+                    : ((initPos != 0) ? drand48() : (0.05 - drand48() / 10.0)));
   }
 }
 
-
-
-LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float* output, const float* filter)
-{
-  int nImg      = param->nImg;
-  int nIfm      = param->nIfm;
-  int nOfm      = param->nOfm;
-  int ifhp      = param->ifhp;
-  int ifwp      = param->ifwp;
-  int ofhp      = param->ofhp;
-  int ofwp      = param->ofwp;
-  int ifh       = param->ifh;
-  int ifw       = param->ifw;
-  int ofh       = param->ofh;
-  int ofw       = param->ofw;
-  int pad_h     = param->pad_h;
-  int pad_w     = param->pad_w;
-  int pad_h_in  = param->pad_h_in;
-  int pad_w_in  = param->pad_w_in;
+LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input,
+                                  float* output, const float* filter) {
+  int nImg = param->nImg;
+  int nIfm = param->nIfm;
+  int nOfm = param->nOfm;
+  int ifhp = param->ifhp;
+  int ifwp = param->ifwp;
+  int ofhp = param->ofhp;
+  int ofwp = param->ofwp;
+  int ifh = param->ifh;
+  int ifw = param->ifw;
+  int ofh = param->ofh;
+  int ofw = param->ofw;
+  int pad_h = param->pad_h;
+  int pad_w = param->pad_w;
+  int pad_h_in = param->pad_h_in;
+  int pad_w_in = param->pad_w_in;
   int pad_h_out = param->pad_h_out;
   int pad_w_out = param->pad_w_out;
-  int kh        = param->kh;
-  int kw        = param->kw;
-  int stride_h  = param->stride_h;
-  int stride_w  = param->stride_w;
+  int kh = param->kh;
+  int kw = param->kw;
+  int stride_h = param->stride_h;
+  int stride_w = param->stride_w;
   /* loop counters */
   int img, ofm, ifm, oj, oi, ij, ii, kj, ki;
 
-  LIBXSMM_VLA_DECL(4,       float, output_t, output + (pad_w_out * ofwp + pad_h_out), nOfm, ofhp, ofwp);
-  LIBXSMM_VLA_DECL(4, const float,  input_t,  input + (pad_w_in * ifwp + pad_h_in), nIfm, ifhp, ifwp);
+  LIBXSMM_VLA_DECL(4, float, output_t, output + (pad_w_out * ofwp + pad_h_out),
+                   nOfm, ofhp, ofwp);
+  LIBXSMM_VLA_DECL(4, const float, input_t,
+                   input + (pad_w_in * ifwp + pad_h_in), nIfm, ifhp, ifwp);
   LIBXSMM_VLA_DECL(4, const float, filter_t, filter, nIfm, kh, kw);
 
   for (img = 0; img < nImg; ++img) {
@@ -151,12 +147,15 @@ LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float
           for (oi = 0; oi < ofw; ++oi) {
             ii = oi * stride_w - pad_w;
             for (kj = 0; kj < kh; ++kj) {
-              if(ij+kj < 0 || ij+kj >= ifh) continue;
+              if (ij + kj < 0 || ij + kj >= ifh) continue;
               for (ki = 0; ki < kw; ++ki) {
-                if(ii+ki < 0 || ii+ki >= ifw) continue;
-                LIBXSMM_VLA_ACCESS(  4, output_t, img, ofm, oj, oi, nOfm, ofhp, ofwp) +=
-                  LIBXSMM_VLA_ACCESS(4,  input_t, img, ifm, ij + kj, ii + ki, nIfm, ifhp, ifwp)
-                * LIBXSMM_VLA_ACCESS(4, filter_t, ofm, ifm, kj, ki, nIfm, kh, kw);
+                if (ii + ki < 0 || ii + ki >= ifw) continue;
+                LIBXSMM_VLA_ACCESS(4, output_t, img, ofm, oj, oi, nOfm, ofhp,
+                                   ofwp) +=
+                    LIBXSMM_VLA_ACCESS(4, input_t, img, ifm, ij + kj, ii + ki,
+                                       nIfm, ifhp, ifwp) *
+                    LIBXSMM_VLA_ACCESS(4, filter_t, ofm, ifm, kj, ki, nIfm, kh,
+                                       kw);
               }
             }
           }
@@ -168,134 +167,118 @@ LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float
 
 void RunXsmmVsGeneric() {}
 
-
 class XsmmConv2DTest : public OpsTestBase {
  protected:
   void MakeOp(int stride) {
-
     TF_CHECK_OK(NodeDefBuilder("xsmm", "Conv2D")
-                      .Input(FakeInput(DT_FLOAT))
-                      .Input(FakeInput(DT_FLOAT))
-                      .Attr("strides", {1, stride,stride, 1})
-                      .Attr("padding", "VALID" )
-                      .Finalize(node_def()));
-
+                    .Input(FakeInput(DT_FLOAT))
+                    .Input(FakeInput(DT_FLOAT))
+                    .Attr("strides", {1, stride, stride, 1})
+                    .Attr("padding", "VALID")
+                    .Finalize(node_def()));
 
     TF_ASSERT_OK(InitOp());
   }
 };
 
 TEST_F(XsmmConv2DTest, Basic) {
-     MakeOp(1);
+  MakeOp(1);
 
-     // setup scoped allocator, which uses cpu_allocator() for this scope
-     const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator;
+  // setup scoped allocator, which uses cpu_allocator() for this scope
+  const libxsmm_tf_allocator<libxsmm_scratch_allocator> tf_allocator;
 
-     int ifw = 14;           /* input width, "W" */
-     int ifh = 14;           /* input height, "H" */
-     int nImg = 32;          /* mini-batch size, "N" */
-     int nIfm = 64;         /* number of input feature maps, "C" */
-     int nOfm = 64;         /* number of output feature maps, "K" */
-     int kh = 3;             /* filter height, "R" */
-     int kw = 3;             /* filter width, "S" */
-     int pad = 0;            /* padding in output */
-     int stride = 1;         /* stride when accessing inputs */
+  int ifw = 14;   /* input width, "W" */
+  int ifh = 14;   /* input height, "H" */
+  int nImg = 32;  /* mini-batch size, "N" */
+  int nIfm = 64;  /* number of input feature maps, "C" */
+  int nOfm = 64;  /* number of output feature maps, "K" */
+  int kh = 3;     /* filter height, "R" */
+  int kw = 3;     /* filter width, "S" */
+  int pad = 0;    /* padding in output */
+  int stride = 1; /* stride when accessing inputs */
 
+  int stride_w = stride;
+  int stride_h = stride;
+  int pad_h = pad;
+  int pad_w = pad;
 
-     int stride_w = stride;
-     int stride_h = stride;
-     int pad_h = pad;
-     int pad_w = pad;
+  int pad_h_in = pad_h;
+  int pad_w_in = pad_w;
 
-     int pad_h_in = pad_h;
-     int pad_w_in = pad_w;
-
-     int pad_h_out = 0;
-     int pad_w_out = 0;
+  int pad_h_out = 0;
+  int pad_w_out = 0;
 
   /* deriving some values for naive code */
-     int ofh = (ifh + 2 * pad_h - kh) / stride_h + 1;
-     int ofw = (ifw + 2 * pad_w - kw) / stride_w + 1;
-     int ifhp = ifh + 2 * pad_h_in;
-     int ifwp = ifw + 2 * pad_w_in;
-     int ofhp = ofh + 2 * pad_h_out;
-     int ofwp = ofw + 2 * pad_w_out;
-
-
-    //Initialization of Filter and Image
-
-    /* allocate data */
-     float *naive_input           = (float*)libxsmm_aligned_scratch( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152);
-     float *naive_output          = (float*)libxsmm_aligned_scratch( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152);
-     float *naive_filter          = (float*)libxsmm_aligned_scratch( nOfm*nIfm*kh*kw*    sizeof(float), 2097152);
-     /* initialize data */
-     init_buf(naive_input,          nImg*nIfm*ifhp*ifwp, 0, 0);
-     zero_buf(naive_output,         nImg*nOfm*ofhp*ofwp);
-     init_buf(naive_filter,         nOfm*nIfm*kh*kw, 0, 0);
-
-
-     Tensor image(DT_FLOAT,
-                 {nImg, ifhp, ifwp, nIfm});
-
-
-     Tensor filter(DT_FLOAT, {kh,kw,nIfm,nOfm});
-
-
-     naive_copy_NCHW_to_NHWC(naive_input, image, nImg, ifhp, ifwp, nIfm);
-     naive_copy_KCRS_to_RSCK(naive_filter, filter, kh, kw, nIfm, nOfm);
-
-
-    //Run naive convolution
-
-     naive_conv_t naive_param;
-
-     naive_param.nImg = nImg;
-     naive_param.nIfm = nIfm;
-     naive_param.nOfm = nOfm;
-     naive_param.ifhp = ifhp;
-     naive_param.ifwp = ifwp;
-     naive_param.ofhp = ofhp;
-     naive_param.ofwp = ofwp;
-     naive_param.ifh = ifh;
-     naive_param.ifw = ifw;
-     naive_param.ofh = ofh;
-     naive_param.ofw = ofw;
-     naive_param.pad_h = pad_h;
-     naive_param.pad_w = pad_w;
-     naive_param.pad_h_in = pad_h_in;
-     naive_param.pad_w_in = pad_w_in;
-     naive_param.pad_h_out = pad_h_out;
-     naive_param.pad_w_out = pad_w_out;
-     naive_param.kh = kh;
-     naive_param.kw = kw;
-     naive_param.stride_h = stride_h;
-     naive_param.stride_w = stride_w;
-
-
-     naive_conv_fp(&naive_param, naive_input, naive_output, naive_filter);
-
-
-
-     AddInputFromArray<float>(image.shape(), image.flat<float>());
-     AddInputFromArray<float>(filter.shape(), filter.flat<float>());
-
-
-
-     //Run Op (TF)
-     TF_ASSERT_OK(RunOpKernel());
-
-     // Check the output.
-     Tensor expected(DT_FLOAT, {nImg,ofhp,ofwp, nOfm});
-     naive_copy_NCHW_to_NHWC(naive_output, expected, nImg, ofhp, ofwp, nOfm);
-
-
-     test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
-     libxsmm_free(naive_input);
-     libxsmm_free(naive_output);
-     libxsmm_free(naive_filter);
-
-
-
+  int ofh = (ifh + 2 * pad_h - kh) / stride_h + 1;
+  int ofw = (ifw + 2 * pad_w - kw) / stride_w + 1;
+  int ifhp = ifh + 2 * pad_h_in;
+  int ifwp = ifw + 2 * pad_w_in;
+  int ofhp = ofh + 2 * pad_h_out;
+  int ofwp = ofw + 2 * pad_w_out;
+
+  // Initialization of Filter and Image
+
+  /* allocate data */
+  float* naive_input = (float*)libxsmm_aligned_scratch(
+      nImg * nIfm * ifhp * ifwp * sizeof(float), 2097152);
+  float* naive_output = (float*)libxsmm_aligned_scratch(
+      nImg * nOfm * ofhp * ofwp * sizeof(float), 2097152);
+  float* naive_filter = (float*)libxsmm_aligned_scratch(
+      nOfm * nIfm * kh * kw * sizeof(float), 2097152);
+  /* initialize data */
+  init_buf(naive_input, nImg * nIfm * ifhp * ifwp, 0, 0);
+  zero_buf(naive_output, nImg * nOfm * ofhp * ofwp);
+  init_buf(naive_filter, nOfm * nIfm * kh * kw, 0, 0);
+
+  Tensor image(DT_FLOAT, {nImg, ifhp, ifwp, nIfm});
+
+  Tensor filter(DT_FLOAT, {kh, kw, nIfm, nOfm});
+
+  naive_copy_NCHW_to_NHWC(naive_input, image, nImg, ifhp, ifwp, nIfm);
+  naive_copy_KCRS_to_RSCK(naive_filter, filter, kh, kw, nIfm, nOfm);
+
+  // Run naive convolution
+
+  naive_conv_t naive_param;
+
+  naive_param.nImg = nImg;
+  naive_param.nIfm = nIfm;
+  naive_param.nOfm = nOfm;
+  naive_param.ifhp = ifhp;
+  naive_param.ifwp = ifwp;
+  naive_param.ofhp = ofhp;
+  naive_param.ofwp = ofwp;
+  naive_param.ifh = ifh;
+  naive_param.ifw = ifw;
+  naive_param.ofh = ofh;
+  naive_param.ofw = ofw;
+  naive_param.pad_h = pad_h;
+  naive_param.pad_w = pad_w;
+  naive_param.pad_h_in = pad_h_in;
+  naive_param.pad_w_in = pad_w_in;
+  naive_param.pad_h_out = pad_h_out;
+  naive_param.pad_w_out = pad_w_out;
+  naive_param.kh = kh;
+  naive_param.kw = kw;
+  naive_param.stride_h = stride_h;
+  naive_param.stride_w = stride_w;
+
+  naive_conv_fp(&naive_param, naive_input, naive_output, naive_filter);
+
+  AddInputFromArray<float>(image.shape(), image.flat<float>());
+  AddInputFromArray<float>(filter.shape(), filter.flat<float>());
+
+  // Run Op (TF)
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(DT_FLOAT, {nImg, ofhp, ofwp, nOfm});
+  naive_copy_NCHW_to_NHWC(naive_output, expected, nImg, ofhp, ofwp, nOfm);
+
+  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
+  libxsmm_free(naive_input);
+  libxsmm_free(naive_output);
+  libxsmm_free(naive_filter);
 }
 
 /*
@@ -325,7 +308,8 @@ TEST(XsmmConv2DTest, Basic) {
     desc.threads = num_threads;
     desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT;
     desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC;
-    desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;//LIBXSMM_DNN_TENSOR_FORMAT_RSCK;
+    desc.filter_format =
+LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;//LIBXSMM_DNN_TENSOR_FORMAT_RSCK;
     desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
     desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
     desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.cc b/tensorflow/core/lib/bfloat16/bfloat16.cc
similarity index 51%
rename from tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.cc
rename to tensorflow/core/lib/bfloat16/bfloat16.cc
index 181deedde71bab3cb9ef1820a88de557131b9311..a591717fd1abfc3d959d219d9ce2bde1272fd8ea 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.cc
+++ b/tensorflow/core/lib/bfloat16/bfloat16.cc
@@ -13,31 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h"
-
-#define EIGEN_USE_THREADS
+#include "tensorflow/core/lib/bfloat16/bfloat16.h"
 
 #include "third_party/eigen3/Eigen/Core"
 
-#ifdef __AVX__
-xla::cpu::runtime::V8F32AVX __xla_cpu_runtime_ExpV8F32AVX(
-    xla::cpu::runtime::V8F32AVX x) {
-  return Eigen::internal::pexp(x);
-}
+namespace tensorflow {
 
-xla::cpu::runtime::V8F32AVX __xla_cpu_runtime_LogV8F32AVX(
-    xla::cpu::runtime::V8F32AVX x) {
-  return Eigen::internal::plog(x);
+B16_DEVICE_FUNC bfloat16::operator Eigen::half() const {
+  return static_cast<Eigen::half>(float(*this));
 }
-#endif  // __AVX__
-
-namespace xla {
-namespace cpu {
-namespace runtime {
-
-const char *const kExpV8F32AVXSymbolName = "__xla_cpu_runtime_ExpV8F32AVX";
-const char *const kLogV8F32AVXSymbolName = "__xla_cpu_runtime_LogV8F32AVX";
-
-}  // namespace runtime
-}  // namespace cpu
-}  // namespace xla
+}  // end namespace tensorflow
diff --git a/tensorflow/core/lib/bfloat16/bfloat16.h b/tensorflow/core/lib/bfloat16/bfloat16.h
new file mode 100644
index 0000000000000000000000000000000000000000..f9cca0ef2ab90c677e47d979a4636b3fc25ec919
--- /dev/null
+++ b/tensorflow/core/lib/bfloat16/bfloat16.h
@@ -0,0 +1,276 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_LIB_BFLOAT16_BFLOAT16_H_
+#define TENSORFLOW_CORE_LIB_BFLOAT16_BFLOAT16_H_
+
+#include <complex>
+
+#ifdef __CUDACC__
+// All functions callable from CUDA code must be qualified with __device__
+#define B16_DEVICE_FUNC __host__ __device__
+
+#else
+#define B16_DEVICE_FUNC
+
+#endif
+
+namespace Eigen {
+struct half;
+}
+
+namespace tensorflow {
+
+// Single precision complex.
+typedef std::complex<float> complex64;
+// Double precision complex.
+typedef std::complex<double> complex128;
+
+// see framework/bfloat16.h for description.
+struct bfloat16 {
+  B16_DEVICE_FUNC bfloat16() {}
+
+  B16_DEVICE_FUNC explicit bfloat16(const float v) {
+    if (float_isnan(v)) {
+      value = NAN_VALUE;
+      return;
+    }
+    const uint16_t* p = reinterpret_cast<const uint16_t*>(&v);
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    value = p[0];
+#else
+    value = p[1];
+#endif
+  }
+
+  B16_DEVICE_FUNC explicit bfloat16(const double val)
+      : bfloat16(static_cast<float>(val)) {}
+  // Following the convention of numpy, converting between complex and
+  // float will lead to loss of imag value.
+  B16_DEVICE_FUNC explicit bfloat16(const complex64& val)
+      : bfloat16(val.real()) {}
+
+  B16_DEVICE_FUNC explicit bfloat16(const complex128& val)
+      : bfloat16(static_cast<float>(val.real())) {}
+
+  B16_DEVICE_FUNC explicit bfloat16(const unsigned short val)
+      : bfloat16(static_cast<float>(val)) {}
+
+  B16_DEVICE_FUNC explicit bfloat16(const unsigned int val)
+      : bfloat16(static_cast<float>(val)) {}
+
+  B16_DEVICE_FUNC explicit bfloat16(const int val)
+      : bfloat16(static_cast<float>(val)) {}
+
+  B16_DEVICE_FUNC explicit bfloat16(const long val)
+      : bfloat16(static_cast<float>(val)) {}
+
+  B16_DEVICE_FUNC explicit bfloat16(const long long val)
+      : bfloat16(static_cast<float>(val)) {}
+
+  template <class T>
+  B16_DEVICE_FUNC explicit bfloat16(const T& val)
+      : bfloat16(static_cast<float>(val)) {}
+
+  B16_DEVICE_FUNC explicit operator float() const {
+    float result;
+
+    uint16_t* q = reinterpret_cast<uint16_t*>(&result);
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    q[0] = value;
+    q[1] = 0;
+#else
+    q[0] = 0;
+    q[1] = value;
+#endif
+    return result;
+  }
+
+  B16_DEVICE_FUNC explicit operator bool() const {
+    return static_cast<bool>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator Eigen::half() const;
+
+  B16_DEVICE_FUNC explicit operator short() const {
+    return static_cast<short>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator int() const {
+    return static_cast<int>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator long() const {
+    return static_cast<long>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator char() const {
+    return static_cast<char>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator signed char() const {
+    return static_cast<signed char>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator unsigned char() const {
+    return static_cast<unsigned char>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator unsigned short() const {
+    return static_cast<unsigned short>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator unsigned int() const {
+    return static_cast<unsigned int>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator unsigned long() const {
+    return static_cast<unsigned long>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator unsigned long long() const {
+    return static_cast<unsigned long long>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator long long() const {
+    return static_cast<long long>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator double() const {
+    return static_cast<double>(float(*this));
+  }
+
+  B16_DEVICE_FUNC explicit operator complex64() const {
+    return complex64(float(*this), float(0.0));
+  }
+
+  B16_DEVICE_FUNC explicit operator complex128() const {
+    return complex128(double(*this), double(0.0));
+  }
+
+  static bfloat16 epsilon() {
+    bfloat16 x;
+    x.value = 0x3c00;  // 0x1.0p-7
+    return x;
+  }
+
+  uint16_t value;
+
+  // A value that represents "not a number".
+  static const uint16_t NAN_VALUE = 0x7FC0;
+
+ private:
+  B16_DEVICE_FUNC bool float_isnan(const float& x) {
+#ifdef __CUDA_ARCH__
+    return ::isnan(x);
+#else
+    return std::isnan(x);
+#endif
+  }
+};
+
+B16_DEVICE_FUNC inline std::ostream& operator<<(std::ostream& os,
+                                                const bfloat16& dt) {
+  os << static_cast<float>(dt);
+  return os;
+}
+
+B16_DEVICE_FUNC inline bfloat16 operator+(bfloat16 a, bfloat16 b) {
+  return bfloat16(static_cast<float>(a) + static_cast<float>(b));
+}
+B16_DEVICE_FUNC inline bfloat16 operator+(bfloat16 a, int b) {
+  return bfloat16(static_cast<float>(a) + static_cast<float>(b));
+}
+B16_DEVICE_FUNC inline bfloat16 operator+(int a, bfloat16 b) {
+  return bfloat16(static_cast<float>(a) + static_cast<float>(b));
+}
+B16_DEVICE_FUNC inline bfloat16 operator-(bfloat16 a, bfloat16 b) {
+  return bfloat16(static_cast<float>(a) - static_cast<float>(b));
+}
+B16_DEVICE_FUNC inline bfloat16 operator*(bfloat16 a, bfloat16 b) {
+  return bfloat16(static_cast<float>(a) * static_cast<float>(b));
+}
+B16_DEVICE_FUNC inline bfloat16 operator/(bfloat16 a, bfloat16 b) {
+  return bfloat16(static_cast<float>(a) / static_cast<float>(b));
+}
+B16_DEVICE_FUNC inline bfloat16 operator-(bfloat16 a) {
+  a.value ^= 0x8000;
+  return a;
+}
+B16_DEVICE_FUNC inline bool operator<(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) < static_cast<float>(b);
+}
+B16_DEVICE_FUNC inline bool operator<=(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) <= static_cast<float>(b);
+}
+B16_DEVICE_FUNC inline bool operator==(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) == static_cast<float>(b);
+}
+B16_DEVICE_FUNC inline bool operator!=(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) != static_cast<float>(b);
+}
+B16_DEVICE_FUNC inline bool operator>(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) > static_cast<float>(b);
+}
+B16_DEVICE_FUNC inline bool operator>=(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) >= static_cast<float>(b);
+}
+B16_DEVICE_FUNC inline bfloat16& operator+=(bfloat16& a, bfloat16 b) {
+  a = a + b;
+  return a;
+}
+B16_DEVICE_FUNC inline bfloat16& operator-=(bfloat16& a, bfloat16 b) {
+  a = a - b;
+  return a;
+}
+B16_DEVICE_FUNC inline bfloat16 operator++(bfloat16& a) {
+  a += bfloat16(1);
+  return a;
+}
+B16_DEVICE_FUNC inline bfloat16 operator--(bfloat16& a) {
+  a -= bfloat16(1);
+  return a;
+}
+B16_DEVICE_FUNC inline bfloat16 operator++(bfloat16& a, int) {
+  bfloat16 original_value = a;
+  ++a;
+  return original_value;
+}
+B16_DEVICE_FUNC inline bfloat16 operator--(bfloat16& a, int) {
+  bfloat16 original_value = a;
+  --a;
+  return original_value;
+}
+B16_DEVICE_FUNC inline bfloat16& operator*=(bfloat16& a, bfloat16 b) {
+  a = a * b;
+  return a;
+}
+B16_DEVICE_FUNC inline bfloat16& operator/=(bfloat16& a, bfloat16 b) {
+  a = a / b;
+  return a;
+}
+}  // end namespace tensorflow
+
+namespace std {
+template <>
+struct hash<tensorflow::bfloat16> {
+  size_t operator()(const tensorflow::bfloat16& v) const {
+    return hash<float>()(static_cast<float>(v));
+  }
+};
+}  // namespace std
+
+#endif  // TENSORFLOW_CORE_LIB_BFLOAT16_BFLOAT16_H_
diff --git a/tensorflow/core/lib/core/arena.cc b/tensorflow/core/lib/core/arena.cc
index 2a04f7bd39df98a97ec7ed0f82dfdfbd8222a2da..55e481d0e60a004f2baebdcac444dd7e7cf93e66 100644
--- a/tensorflow/core/lib/core/arena.cc
+++ b/tensorflow/core/lib/core/arena.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include <algorithm>
 #include <vector>
 
+#include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mem.h"
@@ -113,24 +114,11 @@ void Arena::MakeNewBlock(const uint32 alignment) {
   CHECK(SatisfyAlignment(alignment));
 }
 
-// The following simple numeric routines also exist in util/math/mathutil.h
-// but we don't want to depend on that library.
-
-// Euclid's algorithm for Greatest Common Denominator.
-static uint32 GCD(uint32 x, uint32 y) {
-  while (y != 0) {
-    uint32 r = x % y;
-    x = y;
-    y = r;
-  }
-  return x;
-}
-
 static uint32 LeastCommonMultiple(uint32 a, uint32 b) {
   if (a > b) {
-    return (a / GCD(a, b)) * b;
+    return (a / MathUtil::GCD<uint32>(a, b)) * b;
   } else if (a < b) {
-    return (b / GCD(b, a)) * a;
+    return (b / MathUtil::GCD<uint32>(b, a)) * a;
   } else {
     return a;
   }
diff --git a/tensorflow/core/lib/core/bitmap.h b/tensorflow/core/lib/core/bitmap.h
index b30479fa1bbec58697d50a6bb85d6f430454e5e9..8ff1e666b4ffcdc09353b57b949584404be4aeed 100644
--- a/tensorflow/core/lib/core/bitmap.h
+++ b/tensorflow/core/lib/core/bitmap.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_CORE_BITMAP_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_CORE_BITMAP_H_
+#ifndef TENSORFLOW_CORE_LIB_CORE_BITMAP_H_
+#define TENSORFLOW_CORE_LIB_CORE_BITMAP_H_
 
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
@@ -103,4 +103,4 @@ inline void Bitmap::clear(size_t i) {
 }  // namespace core
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_CORE_BITMAP_H_
+#endif  // TENSORFLOW_CORE_LIB_CORE_BITMAP_H_
diff --git a/tensorflow/core/lib/core/status.h b/tensorflow/core/lib/core/status.h
index 3b8a322854f562c0b066e6175e23697ca6445633..49f74ff47fbc839c84465ba86e85b38cb3bd38ec 100644
--- a/tensorflow/core/lib/core/status.h
+++ b/tensorflow/core/lib/core/status.h
@@ -127,11 +127,11 @@ inline tensorflow::string* TfCheckOpHelper(::tensorflow::Status v,
   return TfCheckOpHelperOutOfLine(v, msg);
 }
 
-#define TF_DO_CHECK_OK(val, level)                  \
-  while (auto _result = TfCheckOpHelper(val, #val)) \
-    LOG(level) << *(_result)
+#define TF_DO_CHECK_OK(val, level)                                \
+  while (auto _result = ::tensorflow::TfCheckOpHelper(val, #val)) \
+  LOG(level) << *(_result)
 
-#define TF_CHECK_OK(val)  TF_DO_CHECK_OK(val, FATAL)
+#define TF_CHECK_OK(val) TF_DO_CHECK_OK(val, FATAL)
 #define TF_QCHECK_OK(val) TF_DO_CHECK_OK(val, QFATAL)
 
 // DEBUG only version of TF_CHECK_OK.  Compiler still parses 'val' even in opt
diff --git a/tensorflow/core/lib/core/stringpiece.h b/tensorflow/core/lib/core/stringpiece.h
index 89a1e26b812bf559c5e5413a58da7f5ed8947c7f..caa9642774bebec05a28b7a0c2ea71d18d6ebd1a 100644
--- a/tensorflow/core/lib/core/stringpiece.h
+++ b/tensorflow/core/lib/core/stringpiece.h
@@ -42,7 +42,7 @@ class StringPiece {
   typedef size_t size_type;
 
   // Create an empty slice.
-  StringPiece() : data_(""), size_(0) {}
+  StringPiece() : data_(nullptr), size_(0) {}
 
   // Create a slice that refers to d[0,n-1].
   StringPiece(const char* d, size_t n) : data_(d), size_(n) {}
diff --git a/tensorflow/core/lib/core/threadpool.cc b/tensorflow/core/lib/core/threadpool.cc
index 2b10ebeaf7cbed4a8466a69898d6d4d6660ed5cb..e55ed79d36cd2db7a6f6b19f3579f47e73b4b2d9 100644
--- a/tensorflow/core/lib/core/threadpool.cc
+++ b/tensorflow/core/lib/core/threadpool.cc
@@ -66,7 +66,9 @@ struct EigenEnvironment {
     }
     return Task{
         std::unique_ptr<TaskImpl>(new TaskImpl{
-            std::move(f), Context(ContextKind::kThread), id,
+            std::move(f),
+            Context(ContextKind::kThread),
+            id,
         }),
     };
   }
diff --git a/tensorflow/core/lib/core/threadpool_test.cc b/tensorflow/core/lib/core/threadpool_test.cc
index 49ddb16645c32a82d90eafa5f550b8887ac84b79..627ef5a892a35ec43d0c31220dcf046b4b8eda55 100644
--- a/tensorflow/core/lib/core/threadpool_test.cc
+++ b/tensorflow/core/lib/core/threadpool_test.cc
@@ -97,8 +97,8 @@ TEST(ThreadPool, ParallelForWithWorkerId) {
     }
     pool.ParallelForWithWorkerId(
         kWorkItems, kHugeCost,
-        [&threads_running, &work, num_threads](
-            int64 begin, int64 end, int64 id) {
+        [&threads_running, &work, num_threads](int64 begin, int64 end,
+                                               int64 id) {
           // Store true for the current thread, and assert that another thread
           // is not running with the same id.
           ASSERT_LE(0, id);
diff --git a/tensorflow/core/lib/db/BUILD b/tensorflow/core/lib/db/BUILD
index 41b7af1b6993d967370e54f080fcd63a4483d4b6..9ff87e8d66d2575966c703a896ac9ff0bc51661a 100644
--- a/tensorflow/core/lib/db/BUILD
+++ b/tensorflow/core/lib/db/BUILD
@@ -5,21 +5,35 @@ package(default_visibility = ["//tensorflow:internal"])
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_copts")
 
 cc_library(
     name = "sqlite",
     srcs = ["sqlite.cc"],
     hdrs = ["sqlite.h"],
+    copts = tf_copts(),
     deps = [
-        "//tensorflow/compiler/xla:statusor",
+        ":snapfn",
         "//tensorflow/core:lib",
-        "@sqlite_archive//:sqlite",
+        "//tensorflow/core:lib_internal",
+        "@org_sqlite",
+    ],
+)
+
+cc_library(
+    name = "snapfn",
+    srcs = ["snapfn.cc"],
+    copts = tf_copts() + ["-DSQLITE_OMIT_LOAD_EXTENSION"],
+    linkstatic = 1,
+    deps = [
+        "@org_sqlite",
+        "@snappy",
     ],
 )
 
 tf_cc_test(
     name = "sqlite_test",
+    size = "small",
     srcs = ["sqlite_test.cc"],
     deps = [
         ":sqlite",
diff --git a/tensorflow/core/lib/db/snapfn.cc b/tensorflow/core/lib/db/snapfn.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4a659f41ed99ff50ebd0d0498f70a57dd715f49e
--- /dev/null
+++ b/tensorflow/core/lib/db/snapfn.cc
@@ -0,0 +1,253 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \brief SQLite extension for Snappy compression
+///
+/// Snappy a compression library that trades ratio for speed, almost going a
+/// tenth as fast as memcpy().
+///
+/// FUNCTIONS
+///
+/// - snap(value: BLOB|TEXT) -> BLOB
+/// - snap(value: NULL|INT|REAL) -> value
+///
+///   Applies Snappy compression. If value is TEXT or BLOB, then it is
+///   compressed and a BLOB is returned with a byte prepended to indicate the
+///   original type. Other types are returned as-is.
+///
+/// - unsnap(value: BLOB) -> TEXT|BLOB
+/// - unsnap(value: TEXT) -> SQLITE_MISMATCH
+/// - unsnap(value: NULL|INT|REAL) -> value
+///
+///   Decompresses value created by snap(). If value is empty, then an empty
+///   blob is returned. Otherwise the original type is restored from the first
+///   byte and the remaining ones are decompressed. TEXT is not allowed as an
+///   input type. Remaining types are returned as-is.
+///
+/// PERFORMANCE CONSIDERATIONS
+///
+/// These functions are deterministic. This means SQLite ≥3.8.3 will factor
+/// them out of inner loops when constant arguments are provided. In SQLite
+/// ≥3.15.0 they can be used in the WHERE clause of partial indexes. Currently
+/// there is no support for common sub-expression elimination.
+///
+/// SQLite environments that aren't universally UTF8 will work, but should
+/// encounter superfluous charset transcodings; as this implementation encodes
+/// only UTF8 TEXT for the sake of simplicity. Contributions are welcome that
+/// register multiple sister functions for the various charsets, which use the
+/// higher order bits of the type byte to indicate encoding.
+///
+/// SUPPORT MATRIX
+///
+/// - 3.20.0 (2016-05-18) What FOSS TensorFlow uses
+/// - 3.13.0 (2016-05-18) What Google uses c. 2017-12
+/// - 3.8.2  (2013-12-06) Used by Ubuntu 14.04
+///
+/// MANUAL COMPILATION
+///
+/// $ sudo apt-get install libsqlite3-dev libsnappy-dev
+/// $ c++ -shared --std=c++11 -o libsnapfn.so -fPIC snapfn.cc -lsnappy
+///
+/// $ sqlite3
+/// sqlite> .load libsnapfn.so
+/// sqlite> select hex(snap('aaaaaaaaaaaaaaaaa'));
+/// 031100613E0100
+/// sqlite> select unsnap(x'031100613E0100');
+/// aaaaaaaaaaaaaaaaa
+///
+/// $ python
+/// >>> import sqlite3
+/// >>> db = sqlite3.connect(':memory:')
+/// >>> db.enable_load_extension(True)
+/// >>> db.execute('select load_extension("libsnapfn.so")')
+/// >>> db.enable_load_extension(False)
+/// >>> db.execute('select hex(snap("aaaaaaaaaaaaaaaaa"))').fetchone()[0]
+/// u'031100613E0100'
+
+#include "sqlite3ext.h"
+#include "snappy.h"
+
+SQLITE_EXTENSION_INIT1
+
+static void snap(sqlite3_context* ctx, int /*argc*/, sqlite3_value** argv) {
+  const char* data;
+  int type = sqlite3_value_type(argv[0]);
+  switch (type) {
+    case SQLITE_NULL:
+      return;
+    case SQLITE_INTEGER:
+      sqlite3_result_int64(ctx, sqlite3_value_int64(argv[0]));
+      return;
+    case SQLITE_FLOAT:
+      sqlite3_result_double(ctx, sqlite3_value_double(argv[0]));
+      return;
+    case SQLITE_BLOB:
+      data = reinterpret_cast<const char*>(sqlite3_value_blob(argv[0]));
+      break;
+    case SQLITE_TEXT:
+      data = reinterpret_cast<const char*>(sqlite3_value_text(argv[0]));
+      break;
+    default:
+      sqlite3_result_error(ctx, "snap() invalid type", -1);
+      sqlite3_result_error_code(ctx, SQLITE_MISMATCH);
+      return;
+  }
+  int size = sqlite3_value_bytes(argv[0]);
+  if (size <= 0) {
+    char result[] = {static_cast<char>(type)};
+    sqlite3_result_blob(ctx, result, sizeof(result), SQLITE_TRANSIENT);
+    return;
+  }
+  size_t output_size =
+      snappy::MaxCompressedLength(static_cast<size_t>(size)) + 1;
+  if (output_size >
+      static_cast<size_t>(sqlite3_limit(sqlite3_context_db_handle(ctx),
+                                        SQLITE_LIMIT_LENGTH, -1))) {
+    sqlite3_result_error_toobig(ctx);
+    return;
+  }
+  auto output =
+      static_cast<char*>(sqlite3_malloc(static_cast<int>(output_size)));
+  if (output == nullptr) {
+    sqlite3_result_error_nomem(ctx);
+    return;
+  }
+  *output++ = static_cast<char>(type), --output_size;
+  snappy::RawCompress(data, static_cast<size_t>(size), output, &output_size);
+  sqlite3_result_blob(ctx, output - 1, static_cast<int>(output_size + 1),
+                      sqlite3_free);
+}
+
+static void unsnap(sqlite3_context* ctx, int /*argc*/, sqlite3_value** argv) {
+  int type = sqlite3_value_type(argv[0]);
+  switch (type) {
+    case SQLITE_NULL:
+      return;
+    case SQLITE_INTEGER:
+      sqlite3_result_int64(ctx, sqlite3_value_int64(argv[0]));
+      return;
+    case SQLITE_FLOAT:
+      sqlite3_result_double(ctx, sqlite3_value_double(argv[0]));
+      return;
+    case SQLITE_BLOB:
+      break;
+    default:
+      sqlite3_result_error(ctx, "unsnap() invalid type", -1);
+      sqlite3_result_error_code(ctx, SQLITE_MISMATCH);
+      return;
+  }
+  int size = sqlite3_value_bytes(argv[0]);
+  auto blob = reinterpret_cast<const char*>(sqlite3_value_blob(argv[0]));
+  if (size <= 0) {
+    sqlite3_result_zeroblob(ctx, 0);
+    return;
+  }
+  type = static_cast<int>(*blob++), --size;
+  if (type != SQLITE_BLOB && type != SQLITE_TEXT) {
+    sqlite3_result_error(ctx, "unsnap() first byte is invalid type", -1);
+    sqlite3_result_error_code(ctx, SQLITE_CORRUPT);
+    return;
+  }
+  if (size == 0) {
+    if (type == SQLITE_TEXT) {
+      sqlite3_result_text(ctx, "", 0, SQLITE_STATIC);
+    } else {
+      sqlite3_result_zeroblob(ctx, 0);
+    }
+    return;
+  }
+  size_t output_size;
+  if (!snappy::GetUncompressedLength(blob, static_cast<size_t>(size),
+                                     &output_size)) {
+    sqlite3_result_error(ctx, "snappy parse error", -1);
+    sqlite3_result_error_code(ctx, SQLITE_CORRUPT);
+    return;
+  }
+  if (output_size >
+      static_cast<size_t>(sqlite3_limit(sqlite3_context_db_handle(ctx),
+                                        SQLITE_LIMIT_LENGTH, -1))) {
+    sqlite3_result_error_toobig(ctx);
+    return;
+  }
+  auto output =
+      static_cast<char*>(sqlite3_malloc(static_cast<int>(output_size)));
+  if (output == nullptr) {
+    sqlite3_result_error_nomem(ctx);
+    return;
+  }
+  if (!snappy::RawUncompress(blob, static_cast<size_t>(size), output)) {
+    sqlite3_result_error(ctx, "snappy message corruption", -1);
+    sqlite3_result_error_code(ctx, SQLITE_CORRUPT);
+    sqlite3_free(output);
+    return;
+  }
+  if (type == SQLITE_TEXT) {
+    sqlite3_result_text(ctx, output, static_cast<int>(output_size),
+                        sqlite3_free);
+  } else {
+    sqlite3_result_blob(ctx, output, static_cast<int>(output_size),
+                        sqlite3_free);
+  }
+}
+
+extern "C" {
+
+#ifndef SQLITE_DETERMINISTIC
+#define SQLITE_DETERMINISTIC 0
+#endif
+
+#ifndef SQLITE_CALLBACK
+#define SQLITE_CALLBACK
+#endif
+
+SQLITE_CALLBACK int sqlite3_snapfn_init(sqlite3* db, const char** /*pzErrMsg*/,
+                                        const sqlite3_api_routines* pApi) {
+  SQLITE_EXTENSION_INIT2(pApi);
+  int rc;
+
+  rc = sqlite3_create_function_v2(
+      db,
+      "snap",                              // zFunctionName
+      1,                                   // nArg
+      SQLITE_UTF8 | SQLITE_DETERMINISTIC,  // eTextRep
+      nullptr,                             // pApp
+      snap,                                // xFunc
+      nullptr,                             // xStep
+      nullptr,                             // xFinal
+      nullptr                              // xDestroy
+  );
+  if (rc != SQLITE_OK) {
+    return rc;
+  }
+
+  rc = sqlite3_create_function_v2(
+      db,
+      "unsnap",                            // zFunctionName
+      1,                                   // nArg
+      SQLITE_UTF8 | SQLITE_DETERMINISTIC,  // eTextRep
+      nullptr,                             // pApp
+      unsnap,                              // xFunc
+      nullptr,                             // xStep
+      nullptr,                             // xFinal
+      nullptr                              // xDestroy
+  );
+  if (rc != SQLITE_OK) {
+    return rc;
+  }
+
+  return SQLITE_OK;
+}
+
+}  // extern "C"
diff --git a/tensorflow/core/lib/db/sqlite.cc b/tensorflow/core/lib/db/sqlite.cc
index 701655f622a7ec0288f1cb53818877e65839643e..cb6943379d4ebe38c79ba9097d4c3183c7b8c205 100644
--- a/tensorflow/core/lib/db/sqlite.cc
+++ b/tensorflow/core/lib/db/sqlite.cc
@@ -14,176 +14,268 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/lib/db/sqlite.h"
 
-#include "tensorflow/core/lib/io/record_reader.h"
-#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 
-namespace tensorflow {
+extern "C" int sqlite3_snapfn_init(sqlite3*, const char**, const void*);
 
-/* static */
-xla::StatusOr<std::shared_ptr<Sqlite>> Sqlite::Open(const string& uri) {
-  sqlite3* sqlite = nullptr;
-  Status s = MakeStatus(sqlite3_open(uri.c_str(), &sqlite));
-  if (s.ok()) {
-    return std::shared_ptr<Sqlite>(new Sqlite(sqlite));
-  }
-  return s;
-}
+namespace tensorflow {
+namespace {
 
-/* static */ Status Sqlite::MakeStatus(int resultCode) {
+error::Code GetTfErrorCode(int code) {
   // See: https://sqlite.org/rescode.html
-  switch (resultCode & 0xff) {
-    case SQLITE_OK:
-    case SQLITE_ROW:   // sqlite3_step() has another row ready
-    case SQLITE_DONE:  // sqlite3_step() has finished executing
-      return Status::OK();
+  switch (code & 0xff) {
+    case SQLITE_OK:    // Successful result
+    case SQLITE_ROW:   // Step has another row ready
+    case SQLITE_DONE:  // Step has finished executing
+      return error::OK;
     case SQLITE_ABORT:  // Callback routine requested an abort
-      return errors::Aborted(sqlite3_errstr(resultCode));
+      return error::ABORTED;
     case SQLITE_READONLY:  // Attempt to write a readonly database
     case SQLITE_MISMATCH:  // Data type mismatch
-      return errors::FailedPrecondition(sqlite3_errstr(resultCode));
+      return error::FAILED_PRECONDITION;
     case SQLITE_MISUSE:    // Library used incorrectly
     case SQLITE_INTERNAL:  // Internal logic error in SQLite
-      return errors::Internal(sqlite3_errstr(resultCode));
+      return error::INTERNAL;
     case SQLITE_RANGE:  // 2nd parameter to sqlite3_bind out of range
-      return errors::OutOfRange(sqlite3_errstr(resultCode));
+      return error::OUT_OF_RANGE;
     case SQLITE_CANTOPEN:    // Unable to open the database file
     case SQLITE_CONSTRAINT:  // Abort due to constraint violation
     case SQLITE_NOTFOUND:    // Unknown opcode or statement parameter name
     case SQLITE_NOTADB:      // File opened that is not a database file
-      return errors::InvalidArgument(sqlite3_errstr(resultCode));
+      return error::INVALID_ARGUMENT;
     case SQLITE_CORRUPT:  // The database disk image is malformed
-      return errors::DataLoss(sqlite3_errstr(resultCode));
+      return error::DATA_LOSS;
     case SQLITE_AUTH:  // Authorization denied
     case SQLITE_PERM:  // Access permission denied
-      return errors::PermissionDenied(sqlite3_errstr(resultCode));
+      return error::PERMISSION_DENIED;
     case SQLITE_FULL:    // Insertion failed because database is full
     case SQLITE_TOOBIG:  // String or BLOB exceeds size limit
     case SQLITE_NOLFS:   // Uses OS features not supported on host
-      return errors::ResourceExhausted(sqlite3_errstr(resultCode));
+      return error::RESOURCE_EXHAUSTED;
     case SQLITE_BUSY:      // The database file is locked
     case SQLITE_LOCKED:    // A table in the database is locked
     case SQLITE_PROTOCOL:  // Database lock protocol error
-    case SQLITE_NOMEM:     // A malloc() failed
-      return errors::Unavailable(sqlite3_errstr(resultCode));
+    case SQLITE_NOMEM:     // Out of heap or perhaps lookaside memory
+      return error::UNAVAILABLE;
     case SQLITE_INTERRUPT:  // Operation terminated by sqlite3_interrupt
-      return errors::Cancelled(sqlite3_errstr(resultCode));
+      return error::CANCELLED;
     case SQLITE_ERROR:   // SQL error or missing database
     case SQLITE_IOERR:   // Some kind of disk I/O error occurred
     case SQLITE_SCHEMA:  // The database schema changed
     default:
-      return errors::Unknown(sqlite3_errstr(resultCode));
+      return error::UNKNOWN;
   }
 }
 
-Sqlite::Sqlite(sqlite3* db) : db_(db) {}
-
-Sqlite::~Sqlite() {
-  // close_v2 doesn't care if a stmt hasn't been GC'd yet
-  int rc = sqlite3_close_v2(db_);
-  if (rc != SQLITE_OK) {
-    LOG(ERROR) << "destruct sqlite3: " << MakeStatus(rc);
-  }
-}
-
-Status Sqlite::Close() {
-  if (db_ == nullptr) {
-    return Status::OK();
-  }
-  // If Close is explicitly called, ordering must be correct.
-  Status s = MakeStatus(sqlite3_close(db_));
-  if (s.ok()) {
-    db_ = nullptr;
-  }
-  return s;
+template <typename... Args>
+Status PrintfStatus(int rc, const char* fmt, Args&&... args) {
+  return {GetTfErrorCode(rc),
+          strings::Printf(fmt, std::forward<Args>(args)...)};
 }
 
-SqliteStatement Sqlite::Prepare(const string& sql) {
+sqlite3_stmt* PrepareRawOrDie(sqlite3* db, const char* sql) {
   sqlite3_stmt* stmt = nullptr;
-  int rc = sqlite3_prepare_v2(db_, sql.c_str(), sql.size() + 1, &stmt, nullptr);
-  if (rc == SQLITE_OK) {
-    return {stmt, SQLITE_OK, std::unique_ptr<string>(nullptr)};
-  } else {
-    return {nullptr, rc, std::unique_ptr<string>(new string(sql))};
-  }
+  int rc = sqlite3_prepare_v2(db, sql, -1, &stmt, nullptr);
+  CHECK_EQ(SQLITE_OK, rc) << sql;
+  return stmt;
 }
 
-Status SqliteStatement::status() const {
-  Status s = Sqlite::MakeStatus(error_);
-  if (!s.ok()) {
-    if (stmt_ != nullptr) {
-      errors::AppendToMessage(&s, sqlite3_sql(stmt_));
-    } else {
-      errors::AppendToMessage(&s, *prepare_error_sql_);
+Status SetPragma(Sqlite* db, const char* pragma, const StringPiece& value) {
+  if (value.empty()) return Status::OK();
+  for (auto p = value.begin(); p < value.end(); ++p) {
+    if (!(('0' <= *p && *p <= '9') || ('A' <= *p && *p <= 'Z') ||
+          ('a' <= *p && *p <= 'z') || *p == '-')) {
+      return errors::InvalidArgument("Illegal pragma character");
     }
   }
-  return s;
+  SqliteStatement stmt;
+  TF_RETURN_IF_ERROR(  // We can't use Bind*() pragma statements.
+      db->Prepare(strings::StrCat("PRAGMA ", pragma, "=", value), &stmt));
+  bool unused_done;
+  return stmt.Step(&unused_done);
 }
 
-void SqliteStatement::CloseOrLog() {
-  if (stmt_ != nullptr) {
-    int rc = sqlite3_finalize(stmt_);
-    if (rc != SQLITE_OK) {
-      LOG(ERROR) << "destruct sqlite3_stmt: " << Sqlite::MakeStatus(rc);
-    }
-    stmt_ = nullptr;
-  }
+const StringPiece GetEnv(const char* var) {
+  const char* val = std::getenv(var);
+  return (val == nullptr) ? StringPiece() : StringPiece(val);
+}
+
+Status EnvPragma(Sqlite* db, const char* pragma, const char* var) {
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(SetPragma(db, pragma, GetEnv(var)), "getenv(",
+                                  var, ")");
+  return Status::OK();
 }
 
-Status SqliteStatement::Close() {
-  if (stmt_ == nullptr) {
-    return Status::OK();
+}  // namespace
+
+/* static */
+Status Sqlite::Open(const string& path, int flags, Sqlite** db) {
+  flags |= SQLITE_OPEN_PRIVATECACHE;
+  sqlite3* sqlite = nullptr;
+  int rc = sqlite3_open_v2(path.c_str(), &sqlite, flags, nullptr);
+  if (rc != SQLITE_OK) {
+    *db = nullptr;
+    return PrintfStatus(rc, "Sqlite::Open(%s) failed: %s", path.c_str(),
+                        sqlite3_errstr(rc));
   }
-  int rc = sqlite3_finalize(stmt_);
-  if (rc == SQLITE_OK) {
-    stmt_ = nullptr;
+  CHECK_EQ(SQLITE_OK, sqlite3_extended_result_codes(sqlite, 1));
+  CHECK_EQ(SQLITE_OK, sqlite3_snapfn_init(sqlite, nullptr, nullptr));
+  // Prepare these tiny privileged statements for SqliteTransaction
+  // so it can do less work, particularly in its constructor, per
+  // Google C++ Style.
+  sqlite3_stmt* begin = PrepareRawOrDie(sqlite, "BEGIN");
+  sqlite3_stmt* commit = PrepareRawOrDie(sqlite, "COMMIT");
+  sqlite3_stmt* rollback = PrepareRawOrDie(sqlite, "ROLLBACK");
+  *db = new Sqlite(sqlite, begin, commit, rollback);
+  Status s = Status::OK();
+  // Up until 2016 the default SQLite page_size was 1024. This ensures
+  // the new default regardless of linkage unless configured otherwise.
+  s.Update(SetPragma(*db, "page_size", "4096"));
+  // TensorFlow is designed to work well in all SQLite modes. However
+  // users might find tuning some these pragmas rewarding, depending on
+  // various considerations. Pragmas are set on a best-effort basis and
+  // might be ignored.
+  s.Update(EnvPragma(*db, "secure_delete", "TF_SQLITE_SECURE_DELETE"));
+  s.Update(EnvPragma(*db, "page_size", "TF_SQLITE_PAGE_SIZE"));
+  s.Update(EnvPragma(*db, "journal_mode", "TF_SQLITE_JOURNAL_MODE"));
+  s.Update(EnvPragma(*db, "synchronous", "TF_SQLITE_SYNCHRONOUS"));
+  s.Update(EnvPragma(*db, "mmap_size", "TF_SQLITE_MMAP_SIZE"));
+  s.Update(EnvPragma(*db, "locking_mode", "TF_SQLITE_LOCKING_MODE"));
+  s.Update(EnvPragma(*db, "cache_size", "TF_SQLITE_CACHE_SIZE"));
+  s.Update(EnvPragma(*db, "auto_vacuum", "TF_SQLITE_AUTO_VACUUM"));
+  DCHECK((*db)->RefCountIsOne());
+  if (!s.ok()) {
+    (*db)->Unref();
+    *db = nullptr;
   }
-  Update(rc);
-  return status();
+  return s;
 }
 
-void SqliteStatement::Reset() {
-  if (TF_PREDICT_TRUE(stmt_ != nullptr)) {
-    sqlite3_reset(stmt_);
-    sqlite3_clear_bindings(stmt_);  // not nullptr friendly
+Sqlite::~Sqlite() {
+  sqlite3_finalize(rollback_);
+  sqlite3_finalize(commit_);
+  sqlite3_finalize(begin_);
+  CHECK_EQ(SQLITE_OK, sqlite3_close(db_));
+}
+
+Status Sqlite::Prepare(const StringPiece& sql, SqliteStatement* stmt) {
+  SqliteLock lock(*this);
+  sqlite3_stmt* ps = nullptr;
+  int rc = sqlite3_prepare_v2(db_, sql.data(), static_cast<int>(sql.size()),
+                              &ps, nullptr);
+  if (rc != SQLITE_OK) {
+    *stmt = SqliteStatement();
+    return PrintfStatus(rc, "Prepare() failed: [%d] %s: %.*s", rc, errmsg(),
+                        sql.size(), sql.data());
   }
-  error_ = SQLITE_OK;
+  *stmt = SqliteStatement(this, ps);
+  return Status::OK();
 }
 
-Status SqliteStatement::Step(bool* isDone) {
-  if (TF_PREDICT_FALSE(error_ != SQLITE_OK)) {
-    *isDone = true;
-    return status();
+Status SqliteStatement::Step(bool* is_done) {
+  DCHECK(stmt_ != nullptr);
+  if (TF_PREDICT_FALSE(bind_error_ != SQLITE_OK)) {
+    *is_done = true;
+    return PrintfStatus(bind_error_, "Bind(%d) failed: %s: %s",
+                        bind_error_parameter_, sqlite3_errstr(bind_error_),
+                        sql());
   }
+  SqliteLock lock(*db_);
   int rc = sqlite3_step(stmt_);
   switch (rc) {
     case SQLITE_ROW:
-      *isDone = false;
+      *is_done = false;
       return Status::OK();
     case SQLITE_DONE:
-      *isDone = true;
+      *is_done = true;
       return Status::OK();
     default:
-      *isDone = true;
-      error_ = rc;
-      return status();
+      *is_done = true;
+      return PrintfStatus(rc, "Step() failed: [%d] %s: %s", rc, db_->errmsg(),
+                          sql());
   }
 }
 
-Status SqliteStatement::StepAndReset() {
-  if (TF_PREDICT_FALSE(error_ != SQLITE_OK)) {
-    return status();
+bool SqliteStatement::StepOrDie() {
+  bool is_done;
+  TF_CHECK_OK(Step(&is_done));
+  return !is_done;
+}
+
+Status SqliteStatement::StepOnce() {
+  bool is_done;
+  TF_RETURN_IF_ERROR(Step(&is_done));
+  if (TF_PREDICT_FALSE(is_done)) {
+    return errors::Internal("No rows returned: ", sql());
   }
-  Status s;
-  int rc = sqlite3_step(stmt_);
-  if (rc != SQLITE_DONE) {
-    if (rc == SQLITE_ROW) {
-      s.Update(errors::Internal("unexpected sqlite row"));
-    } else {
-      s.Update(Sqlite::MakeStatus(rc));
-    }
+  return Status::OK();
+}
+
+const SqliteStatement& SqliteStatement::StepOnceOrDie() {
+  TF_CHECK_OK(StepOnce());
+  return *this;
+}
+
+Status SqliteStatement::StepAndReset() {
+  bool is_done;
+  Status s = Step(&is_done);
+  if (TF_PREDICT_FALSE(s.ok() && !is_done)) {
+    s = errors::Internal("Unexpected row: ", sql());
   }
   Reset();
   return s;
 }
 
+void SqliteStatement::StepAndResetOrDie() { TF_CHECK_OK(StepAndReset()); }
+
+void SqliteStatement::Reset() {
+  if (TF_PREDICT_TRUE(stmt_ != nullptr)) {
+    sqlite3_reset(stmt_);
+    sqlite3_clear_bindings(stmt_);
+  }
+  bind_error_ = SQLITE_OK;
+  size_ = 0;
+}
+
+SqliteTransaction::SqliteTransaction(Sqlite& db) : db_(&db) {
+  sqlite3_mutex_enter(sqlite3_db_mutex(db_->db_));
+  CHECK(!db_->is_in_transaction_);
+  db_->is_in_transaction_ = true;
+  Begin();
+}
+
+SqliteTransaction::~SqliteTransaction() {
+  // Rollback should only return an error if there's no transaction.
+  // Since the API performs auto-rollbacks in some cases, we ignore.
+  sqlite3_step(db_->rollback_);
+  sqlite3_reset(db_->rollback_);
+  sqlite3_reset(db_->begin_);
+  db_->is_in_transaction_ = false;
+  sqlite3_mutex_leave(sqlite3_db_mutex(db_->db_));
+}
+
+void SqliteTransaction::Begin() {
+  // This shouldn't allocate memory or perform I/O. All it does is
+  // execute OP_AutoCommit(0, 0) a.k.a. BEGIN DEFERRED which flips
+  // the sqlite3::autoCommit bit.
+  if (sqlite3_step(db_->begin_) != SQLITE_DONE) {
+    // It shouldn't be possible for this to fail since we already
+    // performed the reentrancy check.
+    LOG(FATAL) << "BEGIN failed: " << sqlite3_errmsg(db_->db_);
+  }
+}
+
+Status SqliteTransaction::Commit() {
+  int rc = sqlite3_step(db_->commit_);
+  if (rc != SQLITE_DONE) {
+    return PrintfStatus(rc, "COMMIT failed: [%d] %s", rc,
+                        sqlite3_errmsg(db_->db_));
+  }
+  sqlite3_reset(db_->commit_);
+  sqlite3_reset(db_->begin_);
+  Begin();
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/db/sqlite.h b/tensorflow/core/lib/db/sqlite.h
index 774852efea7b494406c89960654b1acdca1f4ac9..efe97f78d259199a74bf5e830f70de657d1cd679 100644
--- a/tensorflow/core/lib/db/sqlite.h
+++ b/tensorflow/core/lib/db/sqlite.h
@@ -15,149 +15,208 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_LIB_DB_SQLITE_H_
 #define TENSORFLOW_CORE_LIB_DB_SQLITE_H_
 
-#include <stddef.h>
-#include <memory>
-#include <utility>
+#include <mutex>
 
 #include "sqlite3.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
 
+/// TensorFlow SQLite Veneer
+///
+/// - Memory safety
+/// - Less boilerplate
+/// - Removes deprecated stuff
+/// - Pretends UTF16 doesn't exist
+/// - Transaction compile-time safety
+/// - Statically loads our native extensions
+/// - Error reporting via tensorflow::Status et al.
+///
+/// SQLite>=3.8.2 needs to be supported until April 2019, which is when
+/// Ubuntu 14.04 LTS becomes EOL.
+
 namespace tensorflow {
 
+class SqliteLock;
 class SqliteStatement;
+class SqliteTransaction;
 
 /// \brief SQLite connection object.
 ///
-/// This class is a thin wrapper around `sqlite3` that makes it easier
-/// and safer to use SQLite in the TensorFlow C++ codebase. It removes
-/// deprecated APIs, improves the safety of others, adds helpers, and
-/// pretends UTF16 doesn't exist.
+/// The SQLite connection is closed automatically by the destructor.
+/// Reference counting ensures that happens after its statements are
+/// destructed.
 ///
-/// Instances are thread safe, with the exception of Close().
-class Sqlite {
+/// Instances are reference counted and can be shared between threads.
+/// This class offers the same thread safety behaviors as the SQLite
+/// API itself.
+///
+/// This veneer uses auto-commit mode by default, which means a 4ms
+/// fsync() happens after every write unless a SqliteTransaction is
+/// used or WAL mode is enabled beforehand.
+class LOCKABLE Sqlite : public core::RefCounted {
  public:
+  /// \brief Closes SQLite connection, which can take milliseconds.
+  virtual ~Sqlite();
+
   /// \brief Opens SQLite database file.
   ///
-  /// The `uri` parameter can be a filename, or a proper URI like
-  /// `file:/tmp/tf.sqlite?mode=ro&cache=private`. It can also be
-  /// `file::memory:` for testing.
+  /// Most users will want to set flags to SQLITE_OPEN_READWRITE |
+  /// SQLITE_OPEN_CREATE. There are many other open flags; here are
+  /// notes on a few of them:
   ///
-  /// See https://sqlite.org/c3ref/open.html
-  static xla::StatusOr<std::shared_ptr<Sqlite>> Open(const string& uri);
-
-  /// \brief Makes tensorflow::Status for SQLite result code.
+  /// - SQLITE_OPEN_READONLY: Allowed if no WAL journal is active.
+  /// - SQLITE_OPEN_SHAREDCACHE: Will be ignored because this veneer
+  ///   doesn't support the unlock notify API.
+  /// - SQLITE_OPEN_NOMUTEX: Means access to this connection MUST be
+  ///   serialized by the caller in accordance with the same contracts
+  ///   implemented by this API.
   ///
-  /// See https://sqlite.org/rescode.html
-  static Status MakeStatus(int resultCode);
+  /// This function sets PRAGMA values from TF_SQLITE_* environment
+  /// variables. See sqlite.cc to learn more.
+  static Status Open(const string& path, int flags, Sqlite** db);
 
-  /// \brief Destroys object and frees resources.
+  /// \brief Creates SQLite statement.
   ///
-  /// This will free the underlying object if Close was not called. If
-  /// an error code is returned then it will be logged.
+  /// This routine should never fail if sql is valid and does not
+  /// reference tables. When tables are referenced, system calls are
+  /// needed which can take microseconds. When the schema changes, this
+  /// routine will retry automatically and then possibly fail.
   ///
-  /// Note: Unlike Close() this destructor maps to sqlite3_close_v2(),
-  /// which is lax about ordering and GC friendly.
-  ~Sqlite();
+  /// The returned statement holds a reference to this object.
+  Status Prepare(const StringPiece& sql, SqliteStatement* stmt);
+  SqliteStatement PrepareOrDie(const StringPiece& sql);
 
-  /// \brief Frees underlying SQLite object.
+  /// \brief Returns extended result code of last error.
   ///
-  /// Unlike the destructor, all SqliteStatement objects must be closed
-  /// beforehand. This is a no-op if already closed
-  Status Close();
+  /// If the most recent API call was successful, the result is
+  /// undefined. The legacy result code can be obtained by saying
+  /// errcode() & 0xff.
+  int errcode() const EXCLUSIVE_LOCKS_REQUIRED(this) {
+    return sqlite3_extended_errcode(db_);
+  }
 
-  /// \brief Creates SQLite statement.
-  ///
-  /// Call result.status() to determine whether or not this operation
-  /// failed. It is also possible to punt the error checking to after
-  /// the values have been binded and Step() or ExecuteWriteQuery() is
-  /// called.
-  SqliteStatement Prepare(const string& sql);
+  /// \brief Returns pointer to current error message state.
+  const char* errmsg() const EXCLUSIVE_LOCKS_REQUIRED(this) {
+    return sqlite3_errmsg(db_);
+  }
+
+  /// \brief Returns rowid assigned to last successful insert.
+  int64 last_insert_rowid() const EXCLUSIVE_LOCKS_REQUIRED(this) {
+    return sqlite3_last_insert_rowid(db_);
+  }
+
+  /// \brief Returns number of rows directly changed by last write.
+  int64 changes() const EXCLUSIVE_LOCKS_REQUIRED(this) {
+    return sqlite3_changes(db_);
+  }
 
  private:
-  explicit Sqlite(sqlite3* db);
-  sqlite3* db_;
+  friend class SqliteLock;
+  friend class SqliteStatement;
+  friend class SqliteTransaction;
+
+  Sqlite(sqlite3* db, sqlite3_stmt* begin, sqlite3_stmt* commit,
+         sqlite3_stmt* rollback) noexcept
+      : db_(db), begin_(begin), commit_(commit), rollback_(rollback) {}
+
+  sqlite3* const db_;
+  sqlite3_stmt* const begin_;
+  sqlite3_stmt* const commit_;
+  sqlite3_stmt* const rollback_;
+  bool is_in_transaction_ = false;
+
   TF_DISALLOW_COPY_AND_ASSIGN(Sqlite);
 };
 
-/// \brief SQLite prepared statement cursor object.
+/// \brief SQLite prepared statement.
 ///
-/// This class tracks error state internally, like Status::Update.
+/// Instances can only be shared between threads if caller serializes
+/// access from first Bind*() to *Reset().
 ///
-/// Instances of this class are not thread safe.
+/// When reusing a statement in a loop, be certain to not have jumps
+/// betwixt Bind*() and *Reset().
 class SqliteStatement {
  public:
-  /// \brief Constructs empty statement that should be assigned later.
-  SqliteStatement() : stmt_(nullptr), error_(SQLITE_OK) {}
+  /// \brief Initializes an empty statement to be assigned later.
+  SqliteStatement() noexcept = default;
 
-  /// \brief Empties object and finalizes statement if needed.
-  ~SqliteStatement() { CloseOrLog(); }
+  /// \brief Finalizes statement.
+  ///
+  /// This can take milliseconds if it was blocking the Sqlite
+  /// connection object from being freed.
+  ~SqliteStatement() {
+    sqlite3_finalize(stmt_);
+    if (db_ != nullptr) db_->Unref();
+  }
 
-  /// \brief Move constructor, after which <other> should not be used.
-  SqliteStatement(SqliteStatement&& other);
+  /// \brief Returns true if statement is initialized.
+  explicit operator bool() const { return stmt_ != nullptr; }
 
-  /// \brief Move assignment, after which <other> should not be used.
-  SqliteStatement& operator=(SqliteStatement&& other);
+  /// \brief Returns SQL text from when this query was prepared.
+  const char* sql() const { return sqlite3_sql(stmt_); }
 
-  /// \brief Returns true if statement is not empty.
-  operator bool() const { return stmt_ != nullptr; }
+  /// \brief Number of bytes bound since last *Reset().
+  uint64 size() { return size_; }
 
-  /// \brief Returns SQLite result code state.
+  /// \brief Executes query for fetching arbitrary rows.
   ///
-  /// This will be SQLITE_OK unless an error happened. If multiple
-  /// errors happened, only the first error code will be returned.
-  int error() const { return error_; }
-
-  /// \brief Returns error() as a tensorflow::Status.
-  Status status() const;
-
-  /// \brief Finalize statement object.
+  /// `is_done` will always be set to true unless SQLITE_ROW is
+  /// returned by the underlying API. If status() is already in an
+  /// error state, then this method is a no-op and the existing status
+  /// is returned.
+  ///
+  /// The OrDie version returns `!is_done` which, if true, indicates a
+  /// row is available.
   ///
-  /// Please note that the destructor can also do this. This method is
-  /// a no-op if already closed.
-  Status Close();
+  /// This statement should be Reset() or destructed when when finished
+  /// with the result.
+  Status Step(bool* is_done);
+  bool StepOrDie() TF_MUST_USE_RESULT;
 
-  /// \brief Executes query and/or fetches next row.
+  /// \brief Executes query when only one row is desired.
   ///
-  /// `isDone` will always be set to true unless SQLITE_ROW is returned
-  /// by the underlying API. If status() is already in an error state,
-  /// then this method is a no-op and the existing status is returned.
-  Status Step(bool* isDone);
+  /// If a row isn't returned, an internal error Status is returned
+  /// that won't be reflected in the connection error state.
+  ///
+  /// This statement should be Reset() or destructed when when finished
+  /// with the result.
+  Status StepOnce();
+  const SqliteStatement& StepOnceOrDie();
 
-  /// \brief Executes query that returns no data.
+  /// \brief Executes query, ensures zero rows returned, then Reset().
   ///
-  /// This helper calls Step(), ensures SQLITE_DONE was returned, then
-  /// resets the statement and clears the bindings. If status() is
-  /// already in an error state, then this method is a no-op and the
-  /// existing status is returned.
+  /// If a row is returned, an internal error Status is returned that
+  /// won't be reflected in the connection error state.
   Status StepAndReset();
+  void StepAndResetOrDie();
 
   /// \brief Resets statement so it can be executed again.
   ///
-  /// - Resets the prepared statement
-  /// - Sets all Bind*() values to NULL
-  ///
-  /// Support for calling sqlite3_reset() and sqlite3_clear_bindings()
-  /// independently may be added in the future if a compelling use case
-  /// can be demonstrated.
+  /// Implementation note: This method diverges from canonical API
+  /// behavior by calling sqlite3_clear_bindings() in addition to
+  /// sqlite3_reset(). That makes the veneer safer; we haven't found a
+  /// super compelling reason yet to call them independently.
   void Reset();
 
   /// \brief Binds signed 64-bit integer to 1-indexed query parameter.
   void BindInt(int parameter, int64 value) {
-    Update(sqlite3_bind_int64(stmt_, parameter, value));
+    Update(sqlite3_bind_int64(stmt_, parameter, value), parameter);
+    size_ += sizeof(int64);
   }
-  void BindInt(const string& parameter, int64 value) {
+  void BindInt(const char* parameter, int64 value) {
     BindInt(GetParameterIndex(parameter), value);
   }
 
   /// \brief Binds double to 1-indexed query parameter.
   void BindDouble(int parameter, double value) {
-    Update(sqlite3_bind_double(stmt_, parameter, value));
+    Update(sqlite3_bind_double(stmt_, parameter, value), parameter);
+    size_ += sizeof(double);
   }
-  void BindDouble(const string& parameter, double value) {
+  void BindDouble(const char* parameter, double value) {
     BindDouble(GetParameterIndex(parameter), value);
   }
 
@@ -166,69 +225,71 @@ class SqliteStatement {
   /// If NUL characters are present, they will still go in the DB and
   /// be successfully retrieved by ColumnString(); however, the
   /// behavior of these values with SQLite functions is undefined.
-  void BindText(int parameter, const string& text) {
+  ///
+  /// When using the unsafe methods, the data must not be changed or
+  /// freed until this statement is Reset() or finalized.
+  void BindText(int parameter, const StringPiece& text) {
     Update(sqlite3_bind_text64(stmt_, parameter, text.data(), text.size(),
-                               SQLITE_TRANSIENT, SQLITE_UTF8));
+                               SQLITE_TRANSIENT, SQLITE_UTF8),
+           parameter);
+    size_ += text.size();
   }
-  void BindText(const string& parameter, const string& text) {
+  void BindText(const char* parameter, const StringPiece& text) {
     BindText(GetParameterIndex(parameter), text);
   }
-
-  /// \brief Copies binary data to 1-indexed query parameter.
-  void BindBlob(int parameter, const string& blob) {
-    Update(sqlite3_bind_blob64(stmt_, parameter, blob.data(), blob.size(),
-                               SQLITE_TRANSIENT));
-  }
-  void BindBlob(const string& parameter, const string& blob) {
-    BindBlob(GetParameterIndex(parameter), blob);
-  }
-
-  /// \brief Binds UTF-8 text to 1-indexed query parameter.
-  ///
-  /// The contents of `text` must not be changed or freed until Reset()
-  /// or Close() is called.
-  ///
-  /// If NUL characters are present, they will still go in the DB and
-  /// be successfully retrieved by ColumnString(); however, the
-  /// behavior of these values with SQLite functions is undefined.
-  void BindTextUnsafe(int parameter, const string& text) {
+  void BindTextUnsafe(int parameter, const StringPiece& text) {
     Update(sqlite3_bind_text64(stmt_, parameter, text.data(), text.size(),
-                               SQLITE_STATIC, SQLITE_UTF8));
+                               SQLITE_STATIC, SQLITE_UTF8),
+           parameter);
+    size_ += text.size();
   }
-  void BindTextUnsafe(const string& parameter, const string& text) {
+  void BindTextUnsafe(const char* parameter, const StringPiece& text) {
     BindTextUnsafe(GetParameterIndex(parameter), text);
   }
 
-  /// \brief Binds binary data to 1-indexed query parameter.
+  /// \brief Copies binary data to 1-indexed query parameter.
   ///
-  /// The contents of `blob` must not be changed or freed until Reset()
-  /// or Close() is called.
-  void BindBlobUnsafe(int parameter, const string& blob) {
+  /// When using the unsafe methods, the data must not be changed or
+  /// freed until this statement is Reset() or finalized.
+  void BindBlob(int parameter, const StringPiece& blob) {
+    Update(sqlite3_bind_blob64(stmt_, parameter, blob.data(), blob.size(),
+                               SQLITE_TRANSIENT),
+           parameter);
+    size_ += blob.size();
+  }
+  void BindBlob(const char* parameter, const StringPiece& blob) {
+    BindBlob(GetParameterIndex(parameter), blob);
+  }
+  void BindBlobUnsafe(int parameter, const StringPiece& blob) {
     Update(sqlite3_bind_blob64(stmt_, parameter, blob.data(), blob.size(),
-                               SQLITE_STATIC));
+                               SQLITE_STATIC),
+           parameter);
+    size_ += blob.size();
   }
-  void BindBlobUnsafe(const string& parameter, const string& text) {
+  void BindBlobUnsafe(const char* parameter, const StringPiece& text) {
     BindBlobUnsafe(GetParameterIndex(parameter), text);
   }
 
   /// \brief Returns number of columns in result set.
-  int ColumnCount() TF_MUST_USE_RESULT { return sqlite3_column_count(stmt_); }
+  int ColumnCount() const TF_MUST_USE_RESULT {
+    return sqlite3_column_count(stmt_);
+  }
 
   /// \brief Returns type of 0-indexed column value in row data.
   ///
   /// Please note that SQLite is dynamically typed and the type of a
   /// particular column can vary from row to row.
-  int ColumnType(int column) TF_MUST_USE_RESULT {
+  int ColumnType(int column) const TF_MUST_USE_RESULT {
     return sqlite3_column_type(stmt_, column);
   }
 
   /// \brief Returns 0-indexed column from row result coerced as an integer.
-  int64 ColumnInt(int column) TF_MUST_USE_RESULT {
+  int64 ColumnInt(int column) const TF_MUST_USE_RESULT {
     return sqlite3_column_int64(stmt_, column);
   }
 
   /// \brief Returns 0-indexed column from row result coerced as a double.
-  double ColumnDouble(int column) TF_MUST_USE_RESULT {
+  double ColumnDouble(int column) const TF_MUST_USE_RESULT {
     return sqlite3_column_double(stmt_, column);
   }
 
@@ -236,80 +297,151 @@ class SqliteStatement {
   ///
   /// NULL values are returned as empty string. This method should be
   /// used for both BLOB and TEXT columns. See also: ColumnType().
-  string ColumnString(int column) TF_MUST_USE_RESULT {
+  string ColumnString(int column) const TF_MUST_USE_RESULT {
     auto data = sqlite3_column_blob(stmt_, column);
-    if (data == nullptr) {
-      return "";
-    }
+    if (data == nullptr) return "";
     return {static_cast<const char*>(data),
             static_cast<size_t>(ColumnSize(column))};
   }
 
   /// \brief Returns pointer to binary data at 0-indexed column.
   ///
-  /// The returned memory will be mutated or freed the next time
-  /// Step() or Reset() is called. No NUL terminator is added. See
-  /// ColumnSize(). Please note that an empty BLOB is NULL.
-  const char* ColumnStringUnsafe(int column) TF_MUST_USE_RESULT {
-    return static_cast<const char*>(sqlite3_column_blob(stmt_, column));
+  /// Empty values are returned as NULL. The returned memory will no
+  /// longer be valid the next time Step() or Reset() is called. No NUL
+  /// terminator is added.
+  StringPiece ColumnStringUnsafe(int column) const TF_MUST_USE_RESULT {
+    return {static_cast<const char*>(sqlite3_column_blob(stmt_, column)),
+            static_cast<size_t>(ColumnSize(column))};
   }
 
   /// \brief Returns number of bytes stored at 0-indexed column.
-  int ColumnSize(int column) TF_MUST_USE_RESULT {
+  int ColumnSize(int column) const TF_MUST_USE_RESULT {
     return sqlite3_column_bytes(stmt_, column);
   }
 
+  /// \brief Move constructor, after which <other> is reset to empty.
+  SqliteStatement(SqliteStatement&& other) noexcept
+      : db_(other.db_), stmt_(other.stmt_), bind_error_(other.bind_error_) {
+    other.db_ = nullptr;
+    other.stmt_ = nullptr;
+    other.bind_error_ = SQLITE_OK;
+  }
+
+  /// \brief Move assignment, after which <other> is reset to empty.
+  SqliteStatement& operator=(SqliteStatement&& other) noexcept {
+    if (&other != this) {
+      if (db_ != nullptr) db_->Unref();
+      if (stmt_ != nullptr) sqlite3_finalize(stmt_);
+      db_ = other.db_;
+      stmt_ = other.stmt_;
+      bind_error_ = other.bind_error_;
+      size_ = other.size_;
+      other.db_ = nullptr;
+      other.stmt_ = nullptr;
+      other.bind_error_ = SQLITE_OK;
+      other.size_ = 0;
+    }
+    return *this;
+  }
+
  private:
-  friend Sqlite;
-  SqliteStatement(sqlite3_stmt* stmt, int error,
-                  std::unique_ptr<string> prepare_error_sql)
-      : stmt_(stmt),
-        error_(error),
-        prepare_error_sql_(std::move(prepare_error_sql)) {}
-  void CloseOrLog();
-
-  void Update(int rc) {
+  friend class Sqlite;
+
+  SqliteStatement(Sqlite* db, sqlite3_stmt* stmt) noexcept
+      : db_(db), stmt_(stmt) {
+    db_->Ref();
+  }
+
+  void Update(int rc, int parameter) {
+    // Binding strings can fail if they exceed length limit.
     if (TF_PREDICT_FALSE(rc != SQLITE_OK)) {
-      if (error_ == SQLITE_OK) {
-        error_ = rc;
+      if (bind_error_ == SQLITE_OK) {
+        bind_error_ = rc;
+        bind_error_parameter_ = parameter;
       }
     }
   }
 
-  int GetParameterIndex(const string& parameter) {
-    // Each call to this function requires O(n) strncmp().
-    int index = sqlite3_bind_parameter_index(stmt_, parameter.c_str());
-    if (TF_PREDICT_FALSE(index == 0)) {
-      Update(SQLITE_NOTFOUND);
-    }
+  int GetParameterIndex(const char* parameter) {
+    int index = sqlite3_bind_parameter_index(stmt_, parameter);
+    DCHECK(index > 0);  // OK to compile away since it'll fail again
     return index;
   }
 
-  sqlite3_stmt* stmt_;
-  int error_;
-  std::unique_ptr<string> prepare_error_sql_;
+  Sqlite* db_ = nullptr;
+  sqlite3_stmt* stmt_ = nullptr;
+  int bind_error_ = SQLITE_OK;
+  int bind_error_parameter_ = 0;
+  uint64 size_ = 0;
 
   TF_DISALLOW_COPY_AND_ASSIGN(SqliteStatement);
 };
 
-inline SqliteStatement::SqliteStatement(SqliteStatement&& other)
-    : stmt_(other.stmt_),
-      error_(other.error_),
-      prepare_error_sql_(std::move(other.prepare_error_sql_)) {
-  other.stmt_ = nullptr;
-  other.error_ = SQLITE_OK;
-}
-
-inline SqliteStatement& SqliteStatement::operator=(SqliteStatement&& other) {
-  if (&other != this) {
-    CloseOrLog();
-    stmt_ = other.stmt_;
-    error_ = other.error_;
-    prepare_error_sql_ = std::move(other.prepare_error_sql_);
-    other.stmt_ = nullptr;
-    other.error_ = SQLITE_OK;
+/// \brief Reentrant SQLite connection object lock
+///
+/// This is a no-op if SQLITE_OPEN_NOMUTEX was used.
+class SCOPED_LOCKABLE SqliteLock {
+ public:
+  explicit SqliteLock(Sqlite& db) EXCLUSIVE_LOCK_FUNCTION(db)
+      : mutex_(sqlite3_db_mutex(db.db_)) {
+    sqlite3_mutex_enter(mutex_);
+  }
+  SqliteLock(Sqlite& db, std::try_to_lock_t) EXCLUSIVE_LOCK_FUNCTION(db)
+      : mutex_(sqlite3_db_mutex(db.db_)) {
+    if (TF_PREDICT_FALSE(sqlite3_mutex_try(mutex_) != SQLITE_OK)) {
+      is_locked_ = false;
+    }
   }
-  return *this;
+  ~SqliteLock() UNLOCK_FUNCTION() {
+    if (is_locked_) sqlite3_mutex_leave(mutex_);
+  }
+  explicit operator bool() const { return is_locked_; }
+
+ private:
+  sqlite3_mutex* const mutex_;
+  bool is_locked_ = true;
+  TF_DISALLOW_COPY_AND_ASSIGN(SqliteLock);
+};
+#define SqliteLock(x) static_assert(0, "sqlite_lock_decl_missing_name");
+
+/// \brief SQLite transaction scope.
+///
+/// This class acquires an exclusive lock on the connection object (if
+/// mutexes weren't disabled) and runs BEGIN / ROLLBACK automatically.
+/// Unlike SqliteLock this scope is non-reentrant. To avoid program
+/// crashes, business logic should use the EXCLUSIVE_LOCK_FUNCTION and
+/// LOCKS_EXCLUDED annotations as much as possible.
+class SCOPED_LOCKABLE SqliteTransaction {
+ public:
+  /// \brief Locks db and begins deferred transaction.
+  ///
+  /// This will crash if a transaction is already active.
+  explicit SqliteTransaction(Sqlite& db) EXCLUSIVE_LOCK_FUNCTION(db);
+
+  /// \brief Runs ROLLBACK and unlocks.
+  ~SqliteTransaction() UNLOCK_FUNCTION();
+
+  /// \brief Commits transaction.
+  ///
+  /// If this is successful, a new transaction will be started, which
+  /// is rolled back when exiting the scope.
+  Status Commit();
+
+ private:
+  void Begin();
+  Sqlite* const db_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(SqliteTransaction);
+};
+
+#define SQLITE_EXCLUSIVE_TRANSACTIONS_REQUIRED(...) \
+  EXCLUSIVE_LOCKS_REQUIRED(__VA_ARGS__)
+#define SQLITE_TRANSACTIONS_EXCLUDED(...) LOCKS_EXCLUDED(__VA_ARGS__)
+
+inline SqliteStatement Sqlite::PrepareOrDie(const StringPiece& sql) {
+  SqliteStatement stmt;
+  TF_CHECK_OK(Prepare(sql, &stmt));
+  return stmt;
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/db/sqlite_test.cc b/tensorflow/core/lib/db/sqlite_test.cc
index ba045274adc605fbbaece7736537e8157e27cbc7..1e88323d017bec4b2705c6dbb19005efb8adbaa9 100644
--- a/tensorflow/core/lib/db/sqlite_test.cc
+++ b/tensorflow/core/lib/db/sqlite_test.cc
@@ -14,13 +14,13 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/lib/db/sqlite.h"
 
-#include <limits.h>
 #include <array>
+#include <climits>
 
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/io/path.h"
-#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -29,23 +29,25 @@ namespace {
 class SqliteTest : public ::testing::Test {
  protected:
   void SetUp() override {
-    db_ = Sqlite::Open(":memory:").ValueOrDie();
-    auto stmt = db_->Prepare("CREATE TABLE T (a BLOB, b BLOB)");
-    TF_ASSERT_OK(stmt.StepAndReset());
+    TF_ASSERT_OK(Sqlite::Open(":memory:", SQLITE_OPEN_READWRITE, &db_));
+    db_->PrepareOrDie("CREATE TABLE T (a BLOB, b BLOB)").StepAndResetOrDie();
   }
-  std::shared_ptr<Sqlite> db_;
+
+  void TearDown() override { db_->Unref(); }
+
+  Sqlite* db_;
   bool is_done_;
 };
 
 TEST_F(SqliteTest, InsertAndSelectInt) {
-  auto stmt = db_->Prepare("INSERT INTO T (a, b) VALUES (?, ?)");
+  auto stmt = db_->PrepareOrDie("INSERT INTO T (a, b) VALUES (?, ?)");
   stmt.BindInt(1, 3);
   stmt.BindInt(2, -7);
   TF_ASSERT_OK(stmt.StepAndReset());
   stmt.BindInt(1, 123);
   stmt.BindInt(2, -123);
   TF_ASSERT_OK(stmt.StepAndReset());
-  stmt = db_->Prepare("SELECT a, b FROM T ORDER BY b");
+  stmt = db_->PrepareOrDie("SELECT a, b FROM T ORDER BY b");
   TF_ASSERT_OK(stmt.Step(&is_done_));
   ASSERT_FALSE(is_done_);
   EXPECT_EQ(123, stmt.ColumnInt(0));
@@ -59,11 +61,11 @@ TEST_F(SqliteTest, InsertAndSelectInt) {
 }
 
 TEST_F(SqliteTest, InsertAndSelectDouble) {
-  auto stmt = db_->Prepare("INSERT INTO T (a, b) VALUES (?, ?)");
+  auto stmt = db_->PrepareOrDie("INSERT INTO T (a, b) VALUES (?, ?)");
   stmt.BindDouble(1, 6.28318530);
   stmt.BindDouble(2, 1.61803399);
   TF_ASSERT_OK(stmt.StepAndReset());
-  stmt = db_->Prepare("SELECT a, b FROM T");
+  stmt = db_->PrepareOrDie("SELECT a, b FROM T");
   TF_ASSERT_OK(stmt.Step(&is_done_));
   EXPECT_EQ(6.28318530, stmt.ColumnDouble(0));
   EXPECT_EQ(1.61803399, stmt.ColumnDouble(1));
@@ -74,11 +76,11 @@ TEST_F(SqliteTest, InsertAndSelectDouble) {
 TEST_F(SqliteTest, NulCharsInString) {
   string s;  // XXX: Want to write {2, '\0'} but not sure why not.
   s.append(static_cast<size_t>(2), '\0');
-  auto stmt = db_->Prepare("INSERT INTO T (a, b) VALUES (?, ?)");
+  auto stmt = db_->PrepareOrDie("INSERT INTO T (a, b) VALUES (?, ?)");
   stmt.BindBlob(1, s);
   stmt.BindText(2, s);
   TF_ASSERT_OK(stmt.StepAndReset());
-  stmt = db_->Prepare("SELECT a, b FROM T");
+  stmt = db_->PrepareOrDie("SELECT a, b FROM T");
   TF_ASSERT_OK(stmt.Step(&is_done_));
   EXPECT_EQ(2, stmt.ColumnSize(0));
   EXPECT_EQ(2, stmt.ColumnString(0).size());
@@ -92,58 +94,38 @@ TEST_F(SqliteTest, NulCharsInString) {
 
 TEST_F(SqliteTest, Unicode) {
   string s = "要依法治国是赞美那些谁是公义的和惩罚恶人。 - 韩非";
-  auto stmt = db_->Prepare("INSERT INTO T (a, b) VALUES (?, ?)");
+  auto stmt = db_->PrepareOrDie("INSERT INTO T (a, b) VALUES (?, ?)");
   stmt.BindBlob(1, s);
   stmt.BindText(2, s);
   TF_ASSERT_OK(stmt.StepAndReset());
-  stmt = db_->Prepare("SELECT a, b FROM T");
+  stmt = db_->PrepareOrDie("SELECT a, b FROM T");
   TF_ASSERT_OK(stmt.Step(&is_done_));
   EXPECT_EQ(s, stmt.ColumnString(0));
   EXPECT_EQ(s, stmt.ColumnString(1));
 }
 
 TEST_F(SqliteTest, StepAndResetClearsBindings) {
-  auto stmt = db_->Prepare("INSERT INTO T (a, b) VALUES (?, ?)");
+  auto stmt = db_->PrepareOrDie("INSERT INTO T (a, b) VALUES (?, ?)");
   stmt.BindInt(1, 1);
   stmt.BindInt(2, 123);
   TF_ASSERT_OK(stmt.StepAndReset());
   stmt.BindInt(1, 2);
   TF_ASSERT_OK(stmt.StepAndReset());
-  stmt = db_->Prepare("SELECT b FROM T ORDER BY a");
+  stmt = db_->PrepareOrDie("SELECT b FROM T ORDER BY a");
   TF_ASSERT_OK(stmt.Step(&is_done_));
   EXPECT_EQ(123, stmt.ColumnInt(0));
   TF_ASSERT_OK(stmt.Step(&is_done_));
   EXPECT_EQ(SQLITE_NULL, stmt.ColumnType(0));
 }
 
-TEST_F(SqliteTest, CloseBeforeFinalizeFails) {
-  auto stmt = db_->Prepare("INSERT INTO T (a, b) VALUES (?, ?)");
-  Status s = db_->Close();
-  EXPECT_FALSE(s.ok());
-}
-
-// Rather than bothering to check the status code of creating a
-// statement and every single bind call afterwards, SqliteStatement
-// is designed to carry the first error state forward to Step().
-TEST_F(SqliteTest, ErrorPuntingDoesNotReportLibraryAbuse) {
-  auto stmt = db_->Prepare("lol cat");
-  EXPECT_FALSE(stmt.status().ok());
-  EXPECT_EQ(SQLITE_ERROR, stmt.error());
-  stmt.BindInt(1, 1);
-  stmt.BindInt(2, 2);
-  Status s = stmt.Step(&is_done_);
-  EXPECT_EQ(SQLITE_ERROR, stmt.error());  // first error of several
-  EXPECT_FALSE(s.ok());
-}
-
 TEST_F(SqliteTest, SafeBind) {
   string s = "hello";
-  auto stmt = db_->Prepare("INSERT INTO T (a, b) VALUES (?, ?)");
+  auto stmt = db_->PrepareOrDie("INSERT INTO T (a, b) VALUES (?, ?)");
   stmt.BindBlob(1, s);
   stmt.BindText(2, s);
   s.at(0) = 'y';
   TF_ASSERT_OK(stmt.StepAndReset());
-  stmt = db_->Prepare("SELECT a, b FROM T");
+  stmt = db_->PrepareOrDie("SELECT a, b FROM T");
   TF_ASSERT_OK(stmt.Step(&is_done_));
   EXPECT_EQ("hello", stmt.ColumnString(0));
   EXPECT_EQ("hello", stmt.ColumnString(1));
@@ -151,42 +133,42 @@ TEST_F(SqliteTest, SafeBind) {
 
 TEST_F(SqliteTest, UnsafeBind) {
   string s = "hello";
-  auto stmt = db_->Prepare("INSERT INTO T (a, b) VALUES (?, ?)");
+  auto stmt = db_->PrepareOrDie("INSERT INTO T (a, b) VALUES (?, ?)");
   stmt.BindBlobUnsafe(1, s);
   stmt.BindTextUnsafe(2, s);
   s.at(0) = 'y';
   TF_ASSERT_OK(stmt.StepAndReset());
-  stmt = db_->Prepare("SELECT a, b FROM T");
+  stmt = db_->PrepareOrDie("SELECT a, b FROM T");
   TF_ASSERT_OK(stmt.Step(&is_done_));
   EXPECT_EQ("yello", stmt.ColumnString(0));
   EXPECT_EQ("yello", stmt.ColumnString(1));
 }
 
 TEST_F(SqliteTest, UnsafeColumn) {
-  auto stmt = db_->Prepare("INSERT INTO T (a, b) VALUES (?, ?)");
+  auto stmt = db_->PrepareOrDie("INSERT INTO T (a, b) VALUES (?, ?)");
   stmt.BindInt(1, 1);
   stmt.BindText(2, "hello");
   TF_ASSERT_OK(stmt.StepAndReset());
   stmt.BindInt(1, 2);
   stmt.BindText(2, "there");
   TF_ASSERT_OK(stmt.StepAndReset());
-  stmt = db_->Prepare("SELECT b FROM T ORDER BY a");
+  stmt = db_->PrepareOrDie("SELECT b FROM T ORDER BY a");
   TF_ASSERT_OK(stmt.Step(&is_done_));
-  const char* p = stmt.ColumnStringUnsafe(0);
-  EXPECT_EQ('h', *p);
+  StringPiece p = stmt.ColumnStringUnsafe(0);
+  EXPECT_EQ('h', *p.data());
   TF_ASSERT_OK(stmt.Step(&is_done_));
   // This will actually happen, but it's not safe to test this behavior.
-  // EXPECT_EQ('t', *p);
+  // EXPECT_EQ('t', *p.data());
 }
 
 TEST_F(SqliteTest, NamedParameterBind) {
-  auto stmt = db_->Prepare("INSERT INTO T (a) VALUES (:a)");
+  auto stmt = db_->PrepareOrDie("INSERT INTO T (a) VALUES (:a)");
   stmt.BindText(":a", "lol");
   TF_ASSERT_OK(stmt.StepAndReset());
-  stmt = db_->Prepare("SELECT COUNT(*) FROM T");
+  stmt = db_->PrepareOrDie("SELECT COUNT(*) FROM T");
   TF_ASSERT_OK(stmt.Step(&is_done_));
   EXPECT_EQ(1, stmt.ColumnInt(0));
-  stmt = db_->Prepare("SELECT a FROM T");
+  stmt = db_->PrepareOrDie("SELECT a FROM T");
   TF_ASSERT_OK(stmt.Step(&is_done_));
   EXPECT_FALSE(is_done_);
   EXPECT_EQ("lol", stmt.ColumnString(0));
@@ -195,40 +177,111 @@ TEST_F(SqliteTest, NamedParameterBind) {
 TEST_F(SqliteTest, Statement_DefaultConstructor) {
   SqliteStatement stmt;
   EXPECT_FALSE(stmt);
-  EXPECT_FALSE(stmt.StepAndReset().ok());
-  stmt = db_->Prepare("INSERT INTO T (a) VALUES (1)");
+  stmt = db_->PrepareOrDie("INSERT INTO T (a) VALUES (1)");
   EXPECT_TRUE(stmt);
   EXPECT_TRUE(stmt.StepAndReset().ok());
 }
 
 TEST_F(SqliteTest, Statement_MoveConstructor) {
-  SqliteStatement stmt{db_->Prepare("INSERT INTO T (a) VALUES (1)")};
+  SqliteStatement stmt{db_->PrepareOrDie("INSERT INTO T (a) VALUES (1)")};
   EXPECT_TRUE(stmt.StepAndReset().ok());
 }
 
 TEST_F(SqliteTest, Statement_MoveAssignment) {
-  SqliteStatement stmt1 = db_->Prepare("INSERT INTO T (a) VALUES (1)");
+  SqliteStatement stmt1 = db_->PrepareOrDie("INSERT INTO T (a) VALUES (1)");
   SqliteStatement stmt2;
   EXPECT_TRUE(stmt1.StepAndReset().ok());
-  EXPECT_FALSE(stmt2.StepAndReset().ok());
+  EXPECT_FALSE(stmt2);
   stmt2 = std::move(stmt1);
   EXPECT_TRUE(stmt2.StepAndReset().ok());
 }
 
 TEST_F(SqliteTest, PrepareFailed) {
-  SqliteStatement s = db_->Prepare("SELECT");
-  EXPECT_FALSE(s.status().ok());
-  EXPECT_NE(string::npos, s.status().error_message().find("SELECT"));
+  SqliteLock lock(*db_);
+  SqliteStatement stmt;
+  Status s = db_->Prepare("SELECT", &stmt);
+  ASSERT_FALSE(s.ok());
+  EXPECT_NE(string::npos, s.error_message().find("SELECT"));
+  EXPECT_EQ(SQLITE_ERROR, db_->errcode());
 }
 
 TEST_F(SqliteTest, BindFailed) {
-  SqliteStatement s = db_->Prepare("INSERT INTO T (a) VALUES (123)");
-  EXPECT_TRUE(s.status().ok());
-  EXPECT_EQ("", s.status().error_message());
-  s.BindInt(1, 123);
-  EXPECT_FALSE(s.status().ok());
+  auto stmt = db_->PrepareOrDie("INSERT INTO T (a) VALUES (123)");
+  stmt.BindInt(1, 123);
+  Status s = stmt.StepOnce();
   EXPECT_NE(string::npos,
-            s.status().error_message().find("INSERT INTO T (a) VALUES (123)"));
+            s.error_message().find("INSERT INTO T (a) VALUES (123)"))
+      << s.error_message();
+}
+
+TEST_F(SqliteTest, SnappyExtension) {
+  auto stmt = db_->PrepareOrDie("SELECT UNSNAP(SNAP(?))");
+  stmt.BindText(1, "hello");
+  EXPECT_EQ("hello", stmt.StepOnceOrDie().ColumnString(0));
+}
+
+TEST_F(SqliteTest, SnappyBinaryCompatibility) {
+  EXPECT_EQ(
+      "today is the end of the republic",
+      db_->PrepareOrDie("SELECT UNSNAP(X'03207C746F6461792069732074686520656E64"
+                        "206F66207468652072657075626C6963')")
+          .StepOnceOrDie()
+          .ColumnString(0));
+}
+
+TEST(SqliteOpenTest, CloseConnectionBeforeStatement_KeepsConnectionOpen) {
+  Sqlite* db;
+  TF_ASSERT_OK(Sqlite::Open(":memory:", SQLITE_OPEN_READWRITE, &db));
+  SqliteStatement stmt = db->PrepareOrDie("SELECT ? + ?");
+  db->Unref();
+  stmt.BindInt(1, 7);
+  stmt.BindInt(2, 3);
+  EXPECT_EQ(10, stmt.StepOnceOrDie().ColumnInt(0));
+}
+
+TEST_F(SqliteTest, TransactionRollback) {
+  {
+    SqliteTransaction txn(*db_);
+    auto stmt = db_->PrepareOrDie("INSERT INTO T (a, b) VALUES (?, ?)");
+    stmt.BindDouble(1, 6.28318530);
+    stmt.BindDouble(2, 1.61803399);
+    TF_ASSERT_OK(stmt.StepAndReset());
+  }
+  EXPECT_EQ(
+      0,
+      db_->PrepareOrDie("SELECT COUNT(*) FROM T").StepOnceOrDie().ColumnInt(0));
+}
+
+TEST_F(SqliteTest, TransactionCommit) {
+  {
+    SqliteTransaction txn(*db_);
+    auto stmt = db_->PrepareOrDie("INSERT INTO T (a, b) VALUES (?, ?)");
+    stmt.BindDouble(1, 6.28318530);
+    stmt.BindDouble(2, 1.61803399);
+    TF_ASSERT_OK(stmt.StepAndReset());
+    TF_ASSERT_OK(txn.Commit());
+  }
+  EXPECT_EQ(
+      1,
+      db_->PrepareOrDie("SELECT COUNT(*) FROM T").StepOnceOrDie().ColumnInt(0));
+}
+
+TEST_F(SqliteTest, TransactionCommitMultipleTimes) {
+  {
+    SqliteTransaction txn(*db_);
+    auto stmt = db_->PrepareOrDie("INSERT INTO T (a, b) VALUES (?, ?)");
+    stmt.BindDouble(1, 6.28318530);
+    stmt.BindDouble(2, 1.61803399);
+    TF_ASSERT_OK(stmt.StepAndReset());
+    TF_ASSERT_OK(txn.Commit());
+    stmt.BindDouble(1, 6.28318530);
+    stmt.BindDouble(2, 1.61803399);
+    TF_ASSERT_OK(stmt.StepAndReset());
+    TF_ASSERT_OK(txn.Commit());
+  }
+  EXPECT_EQ(
+      2,
+      db_->PrepareOrDie("SELECT COUNT(*) FROM T").StepOnceOrDie().ColumnInt(0));
 }
 
 }  // namespace
diff --git a/tensorflow/core/lib/gif/gif_io.cc b/tensorflow/core/lib/gif/gif_io.cc
index b5c0d9f621dd2e6fa8c5fd64d71f886fcfb3fd1e..9a5215320f58d10c22872c2837e882bed82f5b52 100644
--- a/tensorflow/core/lib/gif/gif_io.cc
+++ b/tensorflow/core/lib/gif/gif_io.cc
@@ -16,7 +16,9 @@ limitations under the License.
 // Functions to read images in GIF format.
 
 #include "tensorflow/core/lib/gif/gif_io.h"
+#include <algorithm>
 #include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/gif.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
@@ -43,8 +45,17 @@ int input_callback(GifFileType* gif_file, GifByteType* buf, int size) {
   return 0;
 }
 
+static const char* GifErrorStringNonNull(int error_code) {
+  const char* error_string = GifErrorString(error_code);
+  if (error_string == nullptr) {
+    return "Unknown error";
+  }
+  return error_string;
+}
+
 uint8* Decode(const void* srcdata, int datasize,
-              std::function<uint8*(int, int, int, int)> allocate_output) {
+              const std::function<uint8*(int, int, int, int)>& allocate_output,
+              string* error_string) {
   int error_code = D_GIF_SUCCEEDED;
   InputBufferInfo info = {reinterpret_cast<const uint8*>(srcdata), datasize};
   GifFileType* gif_file =
@@ -53,21 +64,21 @@ uint8* Decode(const void* srcdata, int datasize,
     int error_code = D_GIF_SUCCEEDED;
     if (gif_file && DGifCloseFile(gif_file, &error_code) != GIF_OK) {
       LOG(WARNING) << "Fail to close gif file, reason: "
-                   << GifErrorString(error_code);
+                   << GifErrorStringNonNull(error_code);
     }
   });
   if (error_code != D_GIF_SUCCEEDED) {
-    LOG(ERROR) << "Fail to open gif file, reason: "
-               << GifErrorString(error_code);
+    *error_string = strings::StrCat("failed to open gif file: ",
+                                    GifErrorStringNonNull(error_code));
     return nullptr;
   }
   if (DGifSlurp(gif_file) != GIF_OK) {
-    LOG(ERROR) << "Fail to slurp gif file, reason: "
-               << GifErrorString(gif_file->Error);
+    *error_string = strings::StrCat("failed to slurp gif file: ",
+                                    GifErrorStringNonNull(gif_file->Error));
     return nullptr;
   }
   if (gif_file->ImageCount <= 0) {
-    LOG(ERROR) << "Gif file does not contain any image";
+    *error_string = strings::StrCat("gif file does not contain any image");
     return nullptr;
   }
 
@@ -79,23 +90,52 @@ uint8* Decode(const void* srcdata, int datasize,
   uint8* const dstdata = allocate_output(num_frames, width, height, channel);
   if (!dstdata) return nullptr;
   for (int k = 0; k < num_frames; k++) {
+    uint8* this_dst = dstdata + k * width * channel * height;
+
     SavedImage* this_image = &gif_file->SavedImages[k];
     GifImageDesc* img_desc = &this_image->ImageDesc;
+
+    int imgLeft = img_desc->Left;
+    int imgTop = img_desc->Top;
+    int imgRight = img_desc->Left + img_desc->Width;
+    int imgBottom = img_desc->Top + img_desc->Height;
+
     if (img_desc->Left != 0 || img_desc->Top != 0 || img_desc->Width != width ||
         img_desc->Height != height) {
-      LOG(ERROR) << "Can't process optimized gif.";
-      return nullptr;
+      // If the first frame does not fill the entire canvas then return error.
+      if (k == 0) {
+        *error_string =
+            strings::StrCat("the first frame does not fill the canvas");
+        return nullptr;
+      }
+      // Otherwise previous frame will be reused to fill the unoccupied canvas.
+      imgLeft = std::max(imgLeft, 0);
+      imgTop = std::max(imgTop, 0);
+      imgRight = std::min(imgRight, width);
+      imgBottom = std::min(imgBottom, height);
+
+      uint8* last_dst = dstdata + (k - 1) * width * channel * height;
+      for (int i = 0; i < height; ++i) {
+        uint8* p_dst = this_dst + i * width * channel;
+        uint8* l_dst = last_dst + i * width * channel;
+        for (int j = 0; j < width; ++j) {
+          p_dst[j * channel + 0] = l_dst[j * channel + 0];
+          p_dst[j * channel + 1] = l_dst[j * channel + 1];
+          p_dst[j * channel + 2] = l_dst[j * channel + 2];
+        }
+      }
     }
 
     ColorMapObject* color_map = this_image->ImageDesc.ColorMap
                                     ? this_image->ImageDesc.ColorMap
                                     : gif_file->SColorMap;
 
-    uint8* this_dst = dstdata + k * width * channel * height;
-    for (int i = 0; i < height; ++i) {
+    for (int i = imgTop; i < imgBottom; ++i) {
       uint8* p_dst = this_dst + i * width * channel;
-      for (int j = 0; j < width; ++j) {
-        GifByteType color_index = this_image->RasterBits[i * width + j];
+      for (int j = imgLeft; j < imgRight; ++j) {
+        GifByteType color_index =
+            this_image->RasterBits[(i - img_desc->Top) * (img_desc->Width) +
+                                   (j - img_desc->Left)];
         const GifColorType& gif_color = color_map->Colors[color_index];
         p_dst[j * channel + 0] = gif_color.Red;
         p_dst[j * channel + 1] = gif_color.Green;
diff --git a/tensorflow/core/lib/gif/gif_io.h b/tensorflow/core/lib/gif/gif_io.h
index 5399e6a53812b70ac25d33dc5c8acd93a8a82f04..0a7967a5a1534ea61e6adab67492802882a02c5c 100644
--- a/tensorflow/core/lib/gif/gif_io.h
+++ b/tensorflow/core/lib/gif/gif_io.h
@@ -43,7 +43,8 @@ namespace tensorflow {
 namespace gif {
 
 uint8* Decode(const void* srcdata, int datasize,
-              std::function<uint8*(int, int, int, int)> allocate_output);
+              const std::function<uint8*(int, int, int, int)>& allocate_output,
+              string* error_string);
 
 }  // namespace gif
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/gtl/cleanup.h b/tensorflow/core/lib/gtl/cleanup.h
index 6053e986402598568299d1756d23068693c193c8..6bd60ca482430cf13f4f076badf460cf2e1d593b 100644
--- a/tensorflow/core/lib/gtl/cleanup.h
+++ b/tensorflow/core/lib/gtl/cleanup.h
@@ -55,22 +55,21 @@ namespace gtl {
 template <typename F>
 class Cleanup {
  public:
-  Cleanup()
-      : released_(true), f_() {}
+  Cleanup() : released_(true), f_() {}
 
   template <typename G>
-  explicit Cleanup(G&& f)  // NOLINT
+  explicit Cleanup(G&& f)          // NOLINT
       : f_(std::forward<G>(f)) {}  // NOLINT(build/c++11)
 
   Cleanup(Cleanup&& src)  // NOLINT
-      : released_(src.is_released()), f_(src.release()) { }
+      : released_(src.is_released()), f_(src.release()) {}
 
   // Implicitly move-constructible from any compatible Cleanup<G>.
   // The source will be released as if src.release() were called.
   // A moved-from Cleanup can be safely destroyed or reassigned.
   template <typename G>
   Cleanup(Cleanup<G>&& src)  // NOLINT
-      : released_(src.is_released()), f_(src.release()) { }
+      : released_(src.is_released()), f_(src.release()) {}
 
   // Assignment to a Cleanup object behaves like destroying it
   // and making a new one in its place, analogous to unique_ptr
@@ -102,8 +101,8 @@ class Cleanup {
   F f_;
 };
 
-template <int&... ExplicitParameterBarrier,
-          typename F, typename DecayF = typename std::decay<F>::type>
+template <int&... ExplicitParameterBarrier, typename F,
+          typename DecayF = typename std::decay<F>::type>
 TF_MUST_USE_RESULT Cleanup<DecayF> MakeCleanup(F&& f) {
   return Cleanup<DecayF>(std::forward<F>(f));
 }
diff --git a/tensorflow/core/lib/gtl/cleanup_test.cc b/tensorflow/core/lib/gtl/cleanup_test.cc
index bd151cb2ab1c8a830eb1bd9546ab452d05c6c20c..a86ffd5fe284485f15fa824026e8d79f5191a384 100644
--- a/tensorflow/core/lib/gtl/cleanup_test.cc
+++ b/tensorflow/core/lib/gtl/cleanup_test.cc
@@ -65,15 +65,14 @@ TEST(CleanupTest, Release) {
 TEST(FinallyTest, TypeErasedWithoutFactory) {
   string s = "active";
   {
-    AnyCleanup s_cleaner([&s]{ s.append(" clean"); });
+    AnyCleanup s_cleaner([&s] { s.append(" clean"); });
     EXPECT_EQ("active", s);
   }
   EXPECT_EQ("active clean", s);
 }
 
 struct Appender {
-  Appender(string* s, const string& msg)
-      : s_(s), msg_(msg) {}
+  Appender(string* s, const string& msg) : s_(s), msg_(msg) {}
   void operator()() const { s_->append(msg_); }
   string* s_;
   string msg_;
@@ -163,7 +162,12 @@ class CleanupReferenceTest : public ::testing::Test {
     int* i;
     F(int* cp, int* i) : cp(cp), i(i) {}
     F(const F& o) : cp(o.cp), i(o.i) { ++*cp; }
-    F& operator=(const F& o) { cp = o.cp; i = o.i; ++*cp; return *this; }
+    F& operator=(const F& o) {
+      cp = o.cp;
+      i = o.i;
+      ++*cp;
+      return *this;
+    }
     F(F&&) = default;
     F& operator=(F&&) = default;
     void operator()() const { ++*i; }
@@ -279,7 +283,7 @@ BENCHMARK(BM_AnyCleanup);
 
 void BM_AnyCleanupNoFactory(int iters) {
   while (iters--) {
-    AnyCleanup fin([]{Incr();});
+    AnyCleanup fin([] { Incr(); });
   }
 }
 BENCHMARK(BM_AnyCleanupNoFactory);
diff --git a/tensorflow/core/lib/gtl/compactptrset.h b/tensorflow/core/lib/gtl/compactptrset.h
index 1d4d6cc8d2de035f345c4fef8121041b091c24d7..d3d23b94aa26471f7b0d178296c7112c5084f8cf 100644
--- a/tensorflow/core/lib/gtl/compactptrset.h
+++ b/tensorflow/core/lib/gtl/compactptrset.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_COMPACTPTRSET_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_COMPACTPTRSET_H_
+#ifndef TENSORFLOW_CORE_LIB_GTL_COMPACTPTRSET_H_
+#define TENSORFLOW_CORE_LIB_GTL_COMPACTPTRSET_H_
 
 #include <type_traits>
 #include "tensorflow/core/lib/gtl/flatset.h"
@@ -205,4 +205,4 @@ class CompactPointerSet {
 }  // namespace gtl
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_COMPACTPTRSET_H_
+#endif  // TENSORFLOW_CORE_LIB_GTL_COMPACTPTRSET_H_
diff --git a/tensorflow/core/lib/gtl/flatmap.h b/tensorflow/core/lib/gtl/flatmap.h
index 6dd67ad2ea56a2691d27cfe4b9a11c0aafa05d01..889d2ddaa6be36332a3b810c0aefef6ecb684e40 100644
--- a/tensorflow/core/lib/gtl/flatmap.h
+++ b/tensorflow/core/lib/gtl/flatmap.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATMAP_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATMAP_H_
+#ifndef TENSORFLOW_CORE_LIB_GTL_FLATMAP_H_
+#define TENSORFLOW_CORE_LIB_GTL_FLATMAP_H_
 
 #include <stddef.h>
 #include <functional>
@@ -379,4 +379,4 @@ class FlatMap {
 }  // namespace gtl
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATMAP_H_
+#endif  // TENSORFLOW_CORE_LIB_GTL_FLATMAP_H_
diff --git a/tensorflow/core/lib/gtl/flatrep.h b/tensorflow/core/lib/gtl/flatrep.h
index bb405b327aa86983a171727b76a63109d7028431..0d7e7487fc33353603bf3c4d56d8d04466e326a1 100644
--- a/tensorflow/core/lib/gtl/flatrep.h
+++ b/tensorflow/core/lib/gtl/flatrep.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATREP_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATREP_H_
+#ifndef TENSORFLOW_CORE_LIB_GTL_FLATREP_H_
+#define TENSORFLOW_CORE_LIB_GTL_FLATREP_H_
 
 #include <string.h>
 #include <utility>
@@ -328,4 +328,4 @@ class FlatRep {
 }  // namespace gtl
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATREP_H_
+#endif  // TENSORFLOW_CORE_LIB_GTL_FLATREP_H_
diff --git a/tensorflow/core/lib/gtl/flatset.h b/tensorflow/core/lib/gtl/flatset.h
index 2b7f31ab224f3d70fe5e69ced17f54cc1e742453..f31e3abe4115887ed1f2ed3bec52c73b2622715c 100644
--- a/tensorflow/core/lib/gtl/flatset.h
+++ b/tensorflow/core/lib/gtl/flatset.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATSET_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATSET_H_
+#ifndef TENSORFLOW_CORE_LIB_GTL_FLATSET_H_
+#define TENSORFLOW_CORE_LIB_GTL_FLATSET_H_
 
 #include <stddef.h>
 #include <functional>
@@ -278,4 +278,4 @@ class FlatSet {
 }  // namespace gtl
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_GTL_FLATSET_H_
+#endif  // TENSORFLOW_CORE_LIB_GTL_FLATSET_H_
diff --git a/tensorflow/core/lib/gtl/inlined_vector.h b/tensorflow/core/lib/gtl/inlined_vector.h
index d6e5d9effa794c46b7aa98691bb993dbd7e764c8..6e3cb2206d9658a3b0bc24b506049f503ae304ed 100644
--- a/tensorflow/core/lib/gtl/inlined_vector.h
+++ b/tensorflow/core/lib/gtl/inlined_vector.h
@@ -31,12 +31,12 @@ limitations under the License.
 #ifndef TENSORFLOW_LIB_GTL_INLINED_VECTOR_H_
 #define TENSORFLOW_LIB_GTL_INLINED_VECTOR_H_
 
-#include <cstddef>
 #include <stddef.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
 #include <algorithm>
+#include <cstddef>
 #include <iterator>
 #include <memory>
 #include <type_traits>
@@ -407,7 +407,7 @@ class InlinedVector {
   };
   // 2) Construct a T with args at not-yet-initialized memory pointed by dst.
   struct Construct {
-    template<class... Args>
+    template <class... Args>
     void operator()(T* dst, Args&&... args) const {
       new (dst) T(std::forward<Args>(args)...);
     }
diff --git a/tensorflow/core/lib/gtl/int_type.h b/tensorflow/core/lib/gtl/int_type.h
index 647fc81aa7e4925d1d2b74b82146b18b0c17a4a9..af3e50ad78ff9d07bc0e8e79a5ff7cb3d1aacbfe 100644
--- a/tensorflow/core/lib/gtl/int_type.h
+++ b/tensorflow/core/lib/gtl/int_type.h
@@ -255,13 +255,13 @@ class IntType {
     value_ op arg_value;                             \
     return *this;                                    \
   }
-  INT_TYPE_ASSIGNMENT_OP(+= );
-  INT_TYPE_ASSIGNMENT_OP(-= );
-  INT_TYPE_ASSIGNMENT_OP(*= );
-  INT_TYPE_ASSIGNMENT_OP(/= );
-  INT_TYPE_ASSIGNMENT_OP(<<= );  // NOLINT
-  INT_TYPE_ASSIGNMENT_OP(>>= );  // NOLINT
-  INT_TYPE_ASSIGNMENT_OP(%= );
+  INT_TYPE_ASSIGNMENT_OP(+=);
+  INT_TYPE_ASSIGNMENT_OP(-=);
+  INT_TYPE_ASSIGNMENT_OP(*=);
+  INT_TYPE_ASSIGNMENT_OP(/=);
+  INT_TYPE_ASSIGNMENT_OP(<<=);  // NOLINT
+  INT_TYPE_ASSIGNMENT_OP(>>=);  // NOLINT
+  INT_TYPE_ASSIGNMENT_OP(%=);
 #undef INT_TYPE_ASSIGNMENT_OP
 
   ThisType& operator=(ValueType arg_value) {
@@ -314,10 +314,10 @@ std::ostream& operator<<(std::ostream& os,  // NOLINT
 INT_TYPE_ARITHMETIC_OP(+);
 INT_TYPE_ARITHMETIC_OP(-);
 INT_TYPE_ARITHMETIC_OP(*);
-INT_TYPE_ARITHMETIC_OP(/ );
-INT_TYPE_ARITHMETIC_OP(<< );  // NOLINT
-INT_TYPE_ARITHMETIC_OP(>> );  // NOLINT
-INT_TYPE_ARITHMETIC_OP(% );
+INT_TYPE_ARITHMETIC_OP(/);
+INT_TYPE_ARITHMETIC_OP(<<);  // NOLINT
+INT_TYPE_ARITHMETIC_OP(>>);  // NOLINT
+INT_TYPE_ARITHMETIC_OP(%);
 #undef INT_TYPE_ARITHMETIC_OP
 
 // -- NON-MEMBER COMPARISON OPERATORS ------------------------------------------
@@ -345,12 +345,12 @@ INT_TYPE_ARITHMETIC_OP(% );
       IntType<IntTypeName, ValueType> id) {                      \
     return val op id.value();                                    \
   }
-INT_TYPE_COMPARISON_OP(== );  // NOLINT
-INT_TYPE_COMPARISON_OP(!= );  // NOLINT
-INT_TYPE_COMPARISON_OP(< );   // NOLINT
-INT_TYPE_COMPARISON_OP(<= );  // NOLINT
-INT_TYPE_COMPARISON_OP(> );   // NOLINT
-INT_TYPE_COMPARISON_OP(>= );  // NOLINT
+INT_TYPE_COMPARISON_OP(==);  // NOLINT
+INT_TYPE_COMPARISON_OP(!=);  // NOLINT
+INT_TYPE_COMPARISON_OP(<);   // NOLINT
+INT_TYPE_COMPARISON_OP(<=);  // NOLINT
+INT_TYPE_COMPARISON_OP(>);   // NOLINT
+INT_TYPE_COMPARISON_OP(>=);  // NOLINT
 #undef INT_TYPE_COMPARISON_OP
 
 }  // namespace gtl
diff --git a/tensorflow/core/lib/gtl/int_type_test.cc b/tensorflow/core/lib/gtl/int_type_test.cc
index d3c405d9acdb221f465e98d957ba55ba6bc63f57..61d364017cb90933e8e9af7e800db4a6988d8442 100644
--- a/tensorflow/core/lib/gtl/int_type_test.cc
+++ b/tensorflow/core/lib/gtl/int_type_test.cc
@@ -42,7 +42,8 @@ class IntTypeTest : public ::testing::Test {
 
 // All tests below will be executed on all supported IntTypes.
 typedef ::testing::Types<Int8_IT, UInt8_IT, Int16_IT, UInt16_IT, Int32_IT,
-                         Int64_IT, UInt64_IT, Long_IT> SupportedIntTypes;
+                         Int64_IT, UInt64_IT, Long_IT>
+    SupportedIntTypes;
 
 TYPED_TEST_CASE(IntTypeTest, SupportedIntTypes);
 
@@ -232,7 +233,8 @@ TYPED_TEST(IntTypeTest, TestOperators) {
 
 TYPED_TEST(IntTypeTest, TestHashFunctor) {
   std::unordered_map<typename TestFixture::T, char,
-                     typename TestFixture::T::Hasher> map;
+                     typename TestFixture::T::Hasher>
+      map;
   typename TestFixture::T a(0);
   map[a] = 'c';
   EXPECT_EQ('c', map[a]);
diff --git a/tensorflow/core/lib/gtl/iterator_range.h b/tensorflow/core/lib/gtl/iterator_range.h
index e7fea7579db6e3bd8f6f2ce6f5f8c53a40dd3d20..0ba4587fde65f9d396716acb6a7e4f491ff51e32 100644
--- a/tensorflow/core/lib/gtl/iterator_range.h
+++ b/tensorflow/core/lib/gtl/iterator_range.h
@@ -37,6 +37,10 @@ namespace gtl {
 template <typename IteratorT>
 class iterator_range {
  public:
+  using value_type = decltype(*std::declval<IteratorT>());
+  using iterator = IteratorT;
+  using const_iterator = IteratorT;
+
   iterator_range() : begin_iterator_(), end_iterator_() {}
   iterator_range(IteratorT begin_iterator, IteratorT end_iterator)
       : begin_iterator_(std::move(begin_iterator)),
diff --git a/tensorflow/core/lib/gtl/optional.h b/tensorflow/core/lib/gtl/optional.h
index 2ff8b9c7d1adbbc206e0429142389e9730efa33c..4ee3f88d186562e5d3261bc634952fb53b4f5774 100644
--- a/tensorflow/core/lib/gtl/optional.h
+++ b/tensorflow/core/lib/gtl/optional.h
@@ -478,7 +478,7 @@ class optional : private internal_optional::optional_data<T>,
     return *this;
   }
 
-  // Copy assigment, standard semantics.
+  // Copy assignment, standard semantics.
   optional& operator=(const optional& src) = default;
 
   // Move assignment, standard semantics.
@@ -593,12 +593,12 @@ class optional : private internal_optional::optional_data<T>,
     assert(this->engaged_);
     return this->pointer();
   }
-  constexpr const T& operator*() const & { return reference(); }
+  constexpr const T& operator*() const& { return reference(); }
   T& operator*() & {
     assert(this->engaged_);
     return reference();
   }
-  constexpr const T&& operator*() const && { return std::move(reference()); }
+  constexpr const T&& operator*() const&& { return std::move(reference()); }
   T&& operator*() && {
     assert(this->engaged_);
     return std::move(reference());
@@ -621,7 +621,7 @@ class optional : private internal_optional::optional_data<T>,
   // Use `opt.value()` to get a reference to underlying value.  The constness
   // and lvalue/rvalue-ness of `opt` is preserved to the view of the T
   // subobject.
-  const T& value() const & {
+  const T& value() const& {
     CHECK(*this) << "Bad optional access";
     return reference();
   }
@@ -633,7 +633,7 @@ class optional : private internal_optional::optional_data<T>,
     CHECK(*this) << "Bad optional access";
     return std::move(reference());
   }
-  const T&& value() const && {  // NOLINT(build/c++11)
+  const T&& value() const&& {  // NOLINT(build/c++11)
     CHECK(*this) << "Bad optional access";
     return std::move(reference());
   }
@@ -641,7 +641,7 @@ class optional : private internal_optional::optional_data<T>,
   // Use `opt.value_or(val)` to get either the value of T or the given default
   // `val` in the empty case.
   template <class U>
-  constexpr T value_or(U&& v) const & {
+  constexpr T value_or(U&& v) const& {
     return static_cast<bool>(*this) ? **this
                                     : static_cast<T>(std::forward<U>(v));
   }
@@ -656,8 +656,8 @@ class optional : private internal_optional::optional_data<T>,
   constexpr const T& reference() const { return *this->pointer(); }
   T& reference() { return *(this->pointer()); }
 
-  // T constraint checks.  You can't have an optional of nullopt_t, in_place_t or
-  // a reference.
+  // T constraint checks.  You can't have an optional of nullopt_t, in_place_t
+  // or a reference.
   static_assert(
       !std::is_same<nullopt_t, typename std::remove_cv<T>::type>::value,
       "optional<nullopt_t> is not allowed.");
diff --git a/tensorflow/core/lib/gtl/optional_test.cc b/tensorflow/core/lib/gtl/optional_test.cc
index 547bee7b75f3d05e290ec7d53d889ff7e82794a9..12b5bbc60be9961a5f852210c42479b2cd48ea92 100644
--- a/tensorflow/core/lib/gtl/optional_test.cc
+++ b/tensorflow/core/lib/gtl/optional_test.cc
@@ -24,17 +24,29 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-using tensorflow::gtl::optional;
-using tensorflow::gtl::nullopt;
-using tensorflow::gtl::nullopt_t;
 using tensorflow::gtl::in_place;
 using tensorflow::gtl::in_place_t;
 using tensorflow::gtl::make_optional;
+using tensorflow::gtl::nullopt;
+using tensorflow::gtl::nullopt_t;
+using tensorflow::gtl::optional;
 
-template <typename T> string TypeQuals(T&) { return "&"; }
-template <typename T> string TypeQuals(T&&) { return "&&"; }
-template <typename T> string TypeQuals(const T&) { return "c&"; }
-template <typename T> string TypeQuals(const T&&) { return "c&&"; }
+template <typename T>
+string TypeQuals(T&) {
+  return "&";
+}
+template <typename T>
+string TypeQuals(T&&) {
+  return "&&";
+}
+template <typename T>
+string TypeQuals(const T&) {
+  return "c&";
+}
+template <typename T>
+string TypeQuals(const T&&) {
+  return "c&&";
+}
 
 struct StructorListener {
   int construct0 = 0;
diff --git a/tensorflow/core/lib/gtl/top_n_test.cc b/tensorflow/core/lib/gtl/top_n_test.cc
index fae85570dc071568a53abcb72fea6ffc22a465ea..ba30c072a9033073a7439f60dbfa3402dbfc5923 100644
--- a/tensorflow/core/lib/gtl/top_n_test.cc
+++ b/tensorflow/core/lib/gtl/top_n_test.cc
@@ -28,10 +28,10 @@ limitations under the License.
 
 namespace {
 
+using tensorflow::string;
 using tensorflow::gtl::TopN;
 using tensorflow::random::PhiloxRandom;
 using tensorflow::random::SimplePhilox;
-using tensorflow::string;
 
 // Move the contents from an owned raw pointer, returning by value.
 // Objects are easier to manage by value.
diff --git a/tensorflow/core/lib/hash/hash.h b/tensorflow/core/lib/hash/hash.h
index 0fb12966afeb98bf3365e0b1df8381bc900d9765..4d312ab7e830963671a8be9d4622a5b83488d295 100644
--- a/tensorflow/core/lib/hash/hash.h
+++ b/tensorflow/core/lib/hash/hash.h
@@ -64,6 +64,13 @@ struct hash<T*> {
   }
 };
 
+template <>
+struct hash<bfloat16> {
+  size_t operator()(const bfloat16& t) const {
+    return std::hash<float>()(static_cast<float>(t));
+  }
+};
+
 template <>
 struct hash<string> {
   size_t operator()(const string& s) const {
diff --git a/tensorflow/core/lib/io/buffered_inputstream.h b/tensorflow/core/lib/io/buffered_inputstream.h
index 2b824f35f80de47f951477a9352bedeca1290848..924619f40f23152e8155651c72538ef5da98e611 100644
--- a/tensorflow/core/lib/io/buffered_inputstream.h
+++ b/tensorflow/core/lib/io/buffered_inputstream.h
@@ -104,4 +104,4 @@ class BufferedInputStream : public InputStreamInterface {
 }  // namespace io
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_LIB_IO_BUFFERED_INPUTSTREAM_H_
+#endif  // TENSORFLOW_LIB_IO_BUFFERED_INPUTSTREAM_H_
diff --git a/tensorflow/core/lib/io/compression.cc b/tensorflow/core/lib/io/compression.cc
index c12de98e40105907460f74f967e20aa41bdb0ceb..0d25bca9eccf2b28800a288858ffbc0caeb2dbd3 100644
--- a/tensorflow/core/lib/io/compression.cc
+++ b/tensorflow/core/lib/io/compression.cc
@@ -22,6 +22,6 @@ namespace compression {
 const char kNone[] = "";
 const char kGzip[] = "GZIP";
 
-}
-}
-}
+}  // namespace compression
+}  // namespace io
+}  // namespace tensorflow
diff --git a/tensorflow/core/lib/io/compression.h b/tensorflow/core/lib/io/compression.h
index 7a0c5c12a7461546a7511ccc967237336a61b744..4d8e7788cad823e0e79a4e9567c6f17a3d9259cf 100644
--- a/tensorflow/core/lib/io/compression.h
+++ b/tensorflow/core/lib/io/compression.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_IO_COMPRESSION_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_IO_COMPRESSION_H_
+#ifndef TENSORFLOW_CORE_LIB_IO_COMPRESSION_H_
+#define TENSORFLOW_CORE_LIB_IO_COMPRESSION_H_
 
 namespace tensorflow {
 namespace io {
@@ -23,8 +23,8 @@ namespace compression {
 extern const char kNone[];
 extern const char kGzip[];
 
-}
-}
-}
+}  // namespace compression
+}  // namespace io
+}  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_IO_COMPRESSION_H_
+#endif  // TENSORFLOW_CORE_LIB_IO_COMPRESSION_H_
diff --git a/tensorflow/core/lib/io/inputstream_interface.h b/tensorflow/core/lib/io/inputstream_interface.h
index 096248693bb83cb4e4ede64fb3e9aac2bee42c7a..3083d20776f8a85d03a07756954980fd7e100141 100644
--- a/tensorflow/core/lib/io/inputstream_interface.h
+++ b/tensorflow/core/lib/io/inputstream_interface.h
@@ -54,4 +54,4 @@ class InputStreamInterface {
 }  // namespace io
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_IO_INPUTSTREAM_INTERFACE_H_
+#endif  // TENSORFLOW_CORE_LIB_IO_INPUTSTREAM_INTERFACE_H_
diff --git a/tensorflow/core/lib/io/random_inputstream.cc b/tensorflow/core/lib/io/random_inputstream.cc
index 8b8c1392a1dce339a56b718af036248f22ba0b59..09336e79cda67b324299d78c65217e6a7b40dc21 100644
--- a/tensorflow/core/lib/io/random_inputstream.cc
+++ b/tensorflow/core/lib/io/random_inputstream.cc
@@ -57,6 +57,43 @@ Status RandomAccessInputStream::ReadNBytes(int64 bytes_to_read,
   return Status::OK();
 }
 
+// To limit memory usage, the default implementation of SkipNBytes() only reads
+// 8MB at a time.
+static constexpr int64 kMaxSkipSize = 8 * 1024 * 1024;
+
+Status RandomAccessInputStream::SkipNBytes(int64 bytes_to_skip) {
+  if (bytes_to_skip < 0) {
+    return errors::InvalidArgument("Can't skip a negative number of bytes");
+  }
+  std::unique_ptr<char[]> scratch(new char[kMaxSkipSize]);
+  // Try to read 1 bytes first, if we could complete the read then EOF is
+  // not reached yet and we could return.
+  if (bytes_to_skip > 0) {
+    StringPiece data;
+    Status s = file_->Read(pos_ + bytes_to_skip - 1, 1, &data, scratch.get());
+    if ((s.ok() || errors::IsOutOfRange(s)) && data.size() == 1) {
+      pos_ += bytes_to_skip;
+      return Status::OK();
+    }
+  }
+  // Read kDefaultSkipSize at a time till bytes_to_skip.
+  while (bytes_to_skip > 0) {
+    int64 bytes_to_read = std::min<int64>(kMaxSkipSize, bytes_to_skip);
+    StringPiece data;
+    Status s = file_->Read(pos_, bytes_to_read, &data, scratch.get());
+    if (s.ok() || errors::IsOutOfRange(s)) {
+      pos_ += data.size();
+    } else {
+      return s;
+    }
+    if (data.size() < bytes_to_read) {
+      return errors::OutOfRange("reached end of file");
+    }
+    bytes_to_skip -= bytes_to_read;
+  }
+  return Status::OK();
+}
+
 int64 RandomAccessInputStream::Tell() const { return pos_; }
 
 }  // namespace io
diff --git a/tensorflow/core/lib/io/random_inputstream.h b/tensorflow/core/lib/io/random_inputstream.h
index 09ebe9ba49e741945457c82cf0c64b3c1268a694..bdbdbd71ff914cfaf1690b2813ddbab070a9f99a 100644
--- a/tensorflow/core/lib/io/random_inputstream.h
+++ b/tensorflow/core/lib/io/random_inputstream.h
@@ -34,6 +34,8 @@ class RandomAccessInputStream : public InputStreamInterface {
 
   Status ReadNBytes(int64 bytes_to_read, string* result) override;
 
+  Status SkipNBytes(int64 bytes_to_skip) override;
+
   int64 Tell() const override;
 
   Status Seek(int64 position) {
diff --git a/tensorflow/core/lib/io/record_reader.cc b/tensorflow/core/lib/io/record_reader.cc
index 403c82818ef3293a1dc027d362eb766906d0e94a..254fdf115da132343b8e6f176e67672a11281cd0 100644
--- a/tensorflow/core/lib/io/record_reader.cc
+++ b/tensorflow/core/lib/io/record_reader.cc
@@ -49,7 +49,7 @@ RecordReaderOptions RecordReaderOptions::CreateRecordReaderOptions(
 #endif  // IS_SLIM_BUILD
   } else if (compression_type != compression::kNone) {
     LOG(ERROR) << "Unsupported compression_type:" << compression_type
-               << ". No comprression will be used.";
+               << ". No compression will be used.";
   }
   return options;
 }
@@ -207,7 +207,7 @@ Status RecordReader::SkipNBytes(uint64 offset) {
     }
   }
   return Status::OK();
-}
+}  // namespace io
 
 SequentialRecordReader::SequentialRecordReader(
     RandomAccessFile* file, const RecordReaderOptions& options)
diff --git a/tensorflow/core/lib/io/recordio_test.cc b/tensorflow/core/lib/io/recordio_test.cc
index 507c26a63ff587809e80739f8d015d1adcc3b21d..b7e51256a22b0d84e734e2a036a184b3adc3e547 100644
--- a/tensorflow/core/lib/io/recordio_test.cc
+++ b/tensorflow/core/lib/io/recordio_test.cc
@@ -218,8 +218,8 @@ TEST_F(RecordioTest, RandomRead) {
 
 // Tests of all the error paths in log_reader.cc follow:
 static void AssertHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(StringPiece(s).contains(expected)) << s << " does not contain "
-                                                 << expected;
+  EXPECT_TRUE(StringPiece(s).contains(expected))
+      << s << " does not contain " << expected;
 }
 
 TEST_F(RecordioTest, ReadError) {
diff --git a/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc b/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc
index be1fa22c69c27a5c57e3c397076a66dfe05eb035..3c310167326721e8f569ab6148622517aaf82ce5 100644
--- a/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc
+++ b/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc
@@ -161,7 +161,7 @@ Status SnappyOutputBuffer::Deflate() {
   }
 
   // Write length of compressed block to output buffer.
-  char* compressed_length_array = new char[4];
+  char compressed_length_array[4];
   std::fill(compressed_length_array, compressed_length_array + 4, 0);
   for (int i = 0; i < 4; i++) {
     // Little endian.
@@ -173,7 +173,6 @@ Status SnappyOutputBuffer::Deflate() {
   TF_RETURN_IF_ERROR(AddToOutputBuffer(output.data(), output.size()));
   next_in_ += avail_in_;
   avail_in_ = 0;
-  delete[] compressed_length_array;
 
   return Status::OK();
 }
diff --git a/tensorflow/core/lib/io/snappy/snappy_outputbuffer.h b/tensorflow/core/lib/io/snappy/snappy_outputbuffer.h
index 5d330a2c5a3d97456495893d3bb87c376beeeb1f..5aea503846df7c1b0f3c3f140a820dc0cd951726 100644
--- a/tensorflow/core/lib/io/snappy/snappy_outputbuffer.h
+++ b/tensorflow/core/lib/io/snappy/snappy_outputbuffer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_IO_SNAPPY_OUTPUTBUFFER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_IO_SNAPPY_OUTPUTBUFFER_H_
+#ifndef TENSORFLOW_CORE_LIB_IO_SNAPPY_OUTPUTBUFFER_H_
+#define TENSORFLOW_CORE_LIB_IO_SNAPPY_OUTPUTBUFFER_H_
 
 #include <string>
 #include "tensorflow/core/lib/core/status.h"
@@ -117,4 +117,4 @@ class SnappyOutputBuffer {
 }  // namespace io
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_IO_SNAPPY_OUTPUTBUFFER_H_
+#endif  // TENSORFLOW_CORE_LIB_IO_SNAPPY_OUTPUTBUFFER_H_
diff --git a/tensorflow/core/lib/io/zlib_outputbuffer.h b/tensorflow/core/lib/io/zlib_outputbuffer.h
index 5cad2e945705701662d845315c86acbf70f1f1d3..3d86d89a99204c1c8a80081b299e28837141b33d 100644
--- a/tensorflow/core/lib/io/zlib_outputbuffer.h
+++ b/tensorflow/core/lib/io/zlib_outputbuffer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_IO_COMPRESSED_OUTPUTBUFFER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_IO_COMPRESSED_OUTPUTBUFFER_H_
+#ifndef TENSORFLOW_CORE_LIB_IO_COMPRESSED_OUTPUTBUFFER_H_
+#define TENSORFLOW_CORE_LIB_IO_COMPRESSED_OUTPUTBUFFER_H_
 
 #include <zlib.h>
 
@@ -143,4 +143,4 @@ class ZlibOutputBuffer : public WritableFile {
 }  // namespace io
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_IO_COMPRESSED_OUTPUTBUFFER_H_
+#endif  // TENSORFLOW_CORE_LIB_IO_COMPRESSED_OUTPUTBUFFER_H_
diff --git a/tensorflow/core/lib/math/math_util.h b/tensorflow/core/lib/math/math_util.h
index 6f279865e7b361d7b0d2c402747c7b3476e63448..41d486f2bd142954d288f1ccdcf30d960fa2c6a7 100644
--- a/tensorflow/core/lib/math/math_util.h
+++ b/tensorflow/core/lib/math/math_util.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_LIB_MATH_MATH_UTIL_H_
 #define TENSORFLOW_LIB_MATH_MATH_UTIL_H_
 
+#include <type_traits>
+
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -59,6 +61,29 @@ class MathUtil {
   template <typename IntegralType, bool ceil>
   static IntegralType CeilOrFloorOfRatio(IntegralType numerator,
                                          IntegralType denominator);
+
+  template <typename IntegralType>
+  static IntegralType GCD(IntegralType x, IntegralType y);
+
+  // ----------------------------------------------------------------------
+  // IPow<T>
+  //   Computes the result of raising a number to a non-negative integral power.
+  //
+  //  * T: An integral type, floating-point type, or user-defined type for which
+  //    operator*= is defined.
+  //  * base: the base "v" of the operation
+  //  * exp: the exponent "i" of the operation; must be non-negative.
+  //
+  // Computes v^i, in a way that is faster than std::pow (which supports
+  // arbitrary real exponents).
+  //
+  // When T is a floating point type, this has the same semantics as std::pow,
+  // but it is much faster. When T is an integral type, computations are
+  // performed in the value domain of T, and overflow semantics are those of T.
+  //
+  // Input validity is DCHECKed.
+  template <typename T>
+  static T IPow(T base, int exp);
 };
 
 // ---- CeilOrFloorOfRatio ----
@@ -107,6 +132,32 @@ IntegralType MathUtil::CeilOrFloorOfRatio(IntegralType numerator,
   }
 }
 
+template <typename IntegralType>
+IntegralType MathUtil::GCD(IntegralType a, IntegralType b) {
+  static_assert(std::is_unsigned<IntegralType>::value,
+                "signed GCD not supported!");
+  while (b != 0) {
+    IntegralType r = a % b;
+    a = b;
+    b = r;
+  }
+  return a;
+}
+
+// ---- IPow ----
+// Implemented with the squared exponentiation method (a.k.a. double-and-add).
+//
+// Note that "exp >>= 1" is faster than "exp /= 2" on at least one platform.
+template <typename T>
+T MathUtil::IPow(T base, int exp) {
+  DCHECK_GE(exp, 0);
+  for (T result(1);; base *= base) {
+    if ((exp & 1) != 0) result *= base;
+    exp >>= 1;
+    if (exp == 0) return result;
+  }
+}
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_LIB_MATH_MATH_UTIL_H_
diff --git a/tensorflow/core/lib/math/math_util_test.cc b/tensorflow/core/lib/math/math_util_test.cc
index eaf8c31a431728d6f728abeb50e14c443bce6d85..cad5d0d8993b5c61e82489ca942744608f7fd37a 100644
--- a/tensorflow/core/lib/math/math_util_test.cc
+++ b/tensorflow/core/lib/math/math_util_test.cc
@@ -15,12 +15,17 @@ limitations under the License.
 
 #include "tensorflow/core/lib/math/math_util.h"
 
+#include <cmath>
+#include <limits>
 #include <vector>
+
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
+namespace {
 
 // Number of arguments for each test of the CeilOrRatio method
 const int kNumTestArguments = 4;
@@ -195,4 +200,141 @@ TEST(MathUtil, CeilOfRatio) {
 #endif
 }
 
+struct GCDTestCase {
+  unsigned int x;
+  unsigned int y;
+  unsigned int gcd;
+};
+
+TEST(MathUtil, GCD) {
+  std::vector<GCDTestCase> testcases({
+      {10, 20, 10},  //
+      {27, 8, 1},    //
+      {4, 3, 1},     //
+      {6, 8, 2},     //
+      {5, 0, 5},     //
+      {5, 5, 5},     //
+      {0, 0, 0}      //
+  });
+
+  for (const auto& tc : testcases) {
+    EXPECT_EQ(tc.gcd, MathUtil::GCD<uint32>(tc.x, tc.y));
+    EXPECT_EQ(tc.gcd, MathUtil::GCD<uint32>(tc.y, tc.x));
+    EXPECT_EQ(tc.gcd, MathUtil::GCD<uint64>(tc.x, tc.y));
+    EXPECT_EQ(tc.gcd, MathUtil::GCD<uint64>(tc.y, tc.x));
+  }
+
+  const uint64 biggish_prime = 1666666667;
+  EXPECT_EQ(biggish_prime,
+            MathUtil::GCD<uint64>(biggish_prime * 3, biggish_prime * 4));
+}
+
+template <typename T>
+void TestOneIPowN() {
+  const T one{1};
+  for (int i = 0; i < 1024; ++i) {
+    // Computations are exact.
+    EXPECT_EQ(MathUtil::IPow(one, i), one);
+  }
+}
+
+template <typename T>
+void TestTwoIPowN() {
+  int limit = std::is_integral<T>::value ? std::numeric_limits<T>::digits : 63;
+  for (int i = 0; i < limit; ++i) {
+    // Computations are exact.
+    EXPECT_EQ(MathUtil::IPow(T{2}, i), static_cast<T>(1ull << i));
+  }
+}
+
+template <typename T>
+void TestFloatIPow(const int max_exponent, const T start, const T end,
+                   const T step) {
+  for (T f = start; f < end; f += step) {
+    for (int i = 0; i < max_exponent; ++i) {
+      EXPECT_FLOAT_EQ(MathUtil::IPow(f, i), pow(f, i));
+    }
+  }
+}
+
+TEST(MathUtil, IPow) {
+  TestOneIPowN<double>();
+  TestOneIPowN<float>();
+  TestOneIPowN<int>();
+  TestOneIPowN<int64>();
+  TestTwoIPowN<double>();
+  TestTwoIPowN<float>();
+  TestTwoIPowN<int>();
+  TestTwoIPowN<int64>();
+
+  EXPECT_EQ(MathUtil::IPow(3, 0), 1);
+  EXPECT_EQ(MathUtil::IPow(3, 1), 3);
+  EXPECT_EQ(MathUtil::IPow(3, 2), 9);
+  EXPECT_EQ(MathUtil::IPow(3, 3), 27);
+  EXPECT_EQ(MathUtil::IPow(3, 4), 81);
+  EXPECT_EQ(MathUtil::IPow(3, 5), 243);
+
+  TestFloatIPow<float>(13, -16.0f, 16.0f, 1.0f / 8);
+  TestFloatIPow<double>(13, -16.0, 16.0, 1.0 / 8);
+
+  TestFloatIPow<float>(13, -1.0f / (1 << 12), -1.0f / (1 << 12),
+                       1.0f / (1 << 16));
+  TestFloatIPow<double>(13, -1.0 / (1 << 12), -1.0 / (1 << 12),
+                        1.0 / (1 << 16));
+}
+
+TEST(MathUtil, IPowEdgeCases) {
+  constexpr const double kInf = std::numeric_limits<double>::infinity();
+
+  EXPECT_EQ(MathUtil::IPow(-12345.0, 79), -kInf);
+  EXPECT_EQ(MathUtil::IPow(-12345.0, 80), +kInf);
+
+  // The semantics of the edge cases that follow  are defined in the standard:
+  // http://en.cppreference.com/w/cpp/numeric/math/pow for a summary.
+
+  // 1 - These edge cases apply.
+  // pow(+0, exp), where exp is a positive odd integer, returns +0
+  EXPECT_EQ(MathUtil::IPow(+0.0, 3), +0.0);
+  // pow(-0, exp), where exp is a positive odd integer, returns -0
+  EXPECT_EQ(MathUtil::IPow(-0.0, 3), -0.0);
+  // pow(±0, exp), where exp is positive non-integer or a positive even integer,
+  // returns +0
+  EXPECT_EQ(MathUtil::IPow(+0.0, 42), +0.0);
+  EXPECT_EQ(MathUtil::IPow(-0.0, 42), +0.0);
+  // pow(base, ±0) returns 1 for any base, even when base is NaN
+  EXPECT_EQ(MathUtil::IPow(-kInf, 0.0), 1.0);
+  EXPECT_EQ(MathUtil::IPow(-2.0, 0.0), 1.0);
+  EXPECT_EQ(MathUtil::IPow(-1.0, 0.0), 1.0);
+  EXPECT_EQ(MathUtil::IPow(-0.0, 0.0), 1.0);
+  EXPECT_EQ(MathUtil::IPow(+0.0, 0.0), 1.0);
+  EXPECT_EQ(MathUtil::IPow(+1.0, 0.0), 1.0);
+  EXPECT_EQ(MathUtil::IPow(+2.0, 0.0), 1.0);
+  EXPECT_EQ(MathUtil::IPow(+kInf, 0.0), 1.0);
+  EXPECT_EQ(MathUtil::IPow(std::numeric_limits<double>::quiet_NaN(), 0.0), 1.0);
+  // pow(-∞, exp) returns -∞ if exp is a positive odd integer
+  EXPECT_EQ(MathUtil::IPow(-kInf, 43), -kInf);
+  // pow(-∞, exp) returns +∞ if exp is a positive non-integer or even integer
+  EXPECT_EQ(MathUtil::IPow(-kInf, 42), +kInf);
+  // pow(+∞, exp) returns +∞ for any positive exp
+  EXPECT_EQ(MathUtil::IPow(+kInf, 42), +kInf);
+  EXPECT_EQ(MathUtil::IPow(+kInf, 43), +kInf);
+
+  // 2 - These do not apply due to the restricted exp range.
+  // pow(+0, exp), where exp is a negative odd integer, returns +∞ and raises
+  // FE_DIVBYZERO pow(-0, exp), where exp is a negative odd integer, returns -∞
+  // and raises FE_DIVBYZERO pow(±0, exp), where exp is negative, finite, and is
+  // an even integer or a non-integer, returns +∞ and raises FE_DIVBYZERO
+  // pow(-1, ±∞) returns 1
+  // pow(+1, exp) returns 1 for any exp, even when exp is NaN
+  // pow(±0, -∞) returns +∞ and may raise FE_DIVBYZERO
+  // pow(base, exp) returns NaN and raises FE_INVALID if base is finite and
+  // negative and exp is finite and non-integer. pow(base, -∞) returns +∞ for
+  // any |base|<1 pow(base, -∞) returns +0 for any |base|>1 pow(base, +∞)
+  // returns +0 for any |base|<1 pow(base, +∞) returns +∞ for any |base|>1
+  // pow(-∞, exp) returns -0 if exp is a negative odd integer
+  // pow(-∞, exp) returns +0 if exp is a negative non-integer or even integer
+  // pow(+∞, exp) returns +0 for any negative exp
+}
+
+}  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/monitoring/collected_metrics.h b/tensorflow/core/lib/monitoring/collected_metrics.h
index fbef25619fd4f9ad6dc6927c43d2b8750ac51804..e2009816097804c228a094575d05e732c08b4b90 100644
--- a/tensorflow/core/lib/monitoring/collected_metrics.h
+++ b/tensorflow/core/lib/monitoring/collected_metrics.h
@@ -17,8 +17,8 @@ limitations under the License.
 // These are to be used only by the CollectionRegistry and exporters which
 // collect metrics using the CollectionRegistry.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_COLLECTED_METRICS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_COLLECTED_METRICS_H_
+#ifndef TENSORFLOW_CORE_LIB_MONITORING_COLLECTED_METRICS_H_
+#define TENSORFLOW_CORE_LIB_MONITORING_COLLECTED_METRICS_H_
 
 #include <map>
 #include <memory>
@@ -88,6 +88,7 @@ struct Point {
   ValueType value_type;
   int64 int64_value;
   string string_value;
+  bool bool_value;
   HistogramProto histogram_value;
 
   // start_timestamp and end_timestamp indicate the time period over which this
@@ -150,4 +151,4 @@ struct CollectedMetrics {
 }  // namespace monitoring
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_COLLECTED_METRICS_H_
+#endif  // TENSORFLOW_CORE_LIB_MONITORING_COLLECTED_METRICS_H_
diff --git a/tensorflow/core/lib/monitoring/collection_registry.h b/tensorflow/core/lib/monitoring/collection_registry.h
index 113d37e07d89f08ee725c7308122fee7d5031556..63cc0f550df79c4c6821f4618a4d8324969577b2 100644
--- a/tensorflow/core/lib/monitoring/collection_registry.h
+++ b/tensorflow/core/lib/monitoring/collection_registry.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_COLLECTION_REGISTRY_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_COLLECTION_REGISTRY_H_
+#ifndef TENSORFLOW_CORE_LIB_MONITORING_COLLECTION_REGISTRY_H_
+#define TENSORFLOW_CORE_LIB_MONITORING_COLLECTION_REGISTRY_H_
 
 #include <map>
 #include <memory>
@@ -224,6 +224,12 @@ inline void CollectValue(const string& value, Point* const point) {
   point->string_value = value;
 }
 
+template <>
+inline void CollectValue(const bool& value, Point* const point) {
+  point->value_type = ValueType::kBool;
+  point->bool_value = value;
+}
+
 template <>
 inline void CollectValue(const HistogramProto& value, Point* const point) {
   point->value_type = ValueType::kHistogram;
@@ -350,4 +356,4 @@ MetricCollector<metric_kind, Value, NumLabels> MetricCollectorGetter::Get(
 }  // namespace monitoring
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_COLLECTION_REGISTRY_H_
+#endif  // TENSORFLOW_CORE_LIB_MONITORING_COLLECTION_REGISTRY_H_
diff --git a/tensorflow/core/lib/monitoring/counter.h b/tensorflow/core/lib/monitoring/counter.h
index 7240348a9b764e3092f71da4bce9a953c08e7900..8ff810db41d98024eb9e6be1e1c2a10a4b792a75 100644
--- a/tensorflow/core/lib/monitoring/counter.h
+++ b/tensorflow/core/lib/monitoring/counter.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_COUNTER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_COUNTER_H_
+#ifndef TENSORFLOW_CORE_LIB_MONITORING_COUNTER_H_
+#define TENSORFLOW_CORE_LIB_MONITORING_COUNTER_H_
 
 // We replace this implementation with a null implementation for mobile
 // platforms.
@@ -172,4 +172,4 @@ CounterCell* Counter<NumLabels>::GetCell(const Labels&... labels)
 }  // namespace tensorflow
 
 #endif  // IS_MOBILE_PLATFORM
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_COUNTER_H_
+#endif  // TENSORFLOW_CORE_LIB_MONITORING_COUNTER_H_
diff --git a/tensorflow/core/lib/monitoring/gauge.h b/tensorflow/core/lib/monitoring/gauge.h
index 75471cfb22956deac0b0a5841fdde8ee538da30e..ee9a862f40a8266b1f3fa35150a7209f1b61819b 100644
--- a/tensorflow/core/lib/monitoring/gauge.h
+++ b/tensorflow/core/lib/monitoring/gauge.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_GAUGE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_GAUGE_H_
+#ifndef TENSORFLOW_CORE_LIB_MONITORING_GAUGE_H_
+#define TENSORFLOW_CORE_LIB_MONITORING_GAUGE_H_
 
 // We replace this implementation with a null implementation for mobile
 // platforms.
@@ -86,8 +86,29 @@ class GaugeCell<int64> {
   TF_DISALLOW_COPY_AND_ASSIGN(GaugeCell);
 };
 
+// Explicit specialization of GaugeCell<bool>. Compared to the primary
+// template, it uses atomic values as opposed to mutex. This class is
+// thread-safe.
+template <>
+class GaugeCell<bool> {
+ public:
+  explicit GaugeCell(bool value) : value_(value) {}
+  ~GaugeCell() {}
+
+  // Atomically sets the value.
+  void Set(bool value);
+
+  // Retrieves the current value.
+  bool value() const;
+
+ private:
+  std::atomic<bool> value_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GaugeCell);
+};
+
 // A stateful class for updating a gauge-like metric. Allowed ValueType are
-// int64 and string.
+// int64, string and bool.
 //
 // This class encapsulates a set of values (or a single value for a label-less
 // metric). Each value is identified by a tuple of labels. The class allows the
@@ -117,6 +138,9 @@ class Gauge {
   //
   // auto* integer_gauge = Gauge<int64, 0>::New("/tensorflow/integer_gauge",
   //   "Integer gauge")
+  //
+  // auto* bool_gauge = Gauge<bool, 0>::New("/tensorflow/bool_gauge",
+  //   "Bool gauge")
   template <typename... MetricDefArgs>
   static Gauge* New(MetricDefArgs&&... metric_def_args);
 
@@ -172,12 +196,17 @@ inline void GaugeCell<int64>::Set(int64 value) { value_ = value; }
 
 inline int64 GaugeCell<int64>::value() const { return value_; }
 
+inline void GaugeCell<bool>::Set(bool value) { value_ = value; }
+
+inline bool GaugeCell<bool>::value() const { return value_; }
+
 template <typename ValueType, int NumLabels>
 template <typename... MetricDefArgs>
 Gauge<ValueType, NumLabels>* Gauge<ValueType, NumLabels>::New(
     MetricDefArgs&&... metric_def_args) {
   static_assert(std::is_same<ValueType, int64>::value ||
-                    std::is_same<ValueType, string>::value,
+                    std::is_same<ValueType, string>::value ||
+                    std::is_same<ValueType, bool>::value,
                 "Gauge only allows int64 and string types.");
   return new Gauge<ValueType, NumLabels>(
       MetricDef<MetricKind::kGauge, ValueType, NumLabels>(
@@ -212,4 +241,4 @@ GaugeCell<ValueType>* Gauge<ValueType, NumLabels>::GetCell(
 }  // namespace tensorflow
 
 #endif  // IS_MOBILE_PLATFORM
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_GAUGE_H_
+#endif  // TENSORFLOW_CORE_LIB_MONITORING_GAUGE_H_
diff --git a/tensorflow/core/lib/monitoring/gauge_test.cc b/tensorflow/core/lib/monitoring/gauge_test.cc
index f98cfe2a3b34cfb0630865e2fd0eeef6ea4f734d..c8f673db38928b96bd4f97cbb72c1007fdc9e9bb 100644
--- a/tensorflow/core/lib/monitoring/gauge_test.cc
+++ b/tensorflow/core/lib/monitoring/gauge_test.cc
@@ -87,6 +87,28 @@ TEST(GaugeOfStringValue, GetCell) {
   EXPECT_EQ("bar", same_cell->value());
 }
 
+auto* bool_gauge =
+    Gauge<bool, 0>::New("/tensorflow/test/bool_gauge", "Gauge of bool value.");
+
+TEST(GaugeOfBoolValue, InitializedWithFalseValue) {
+  EXPECT_EQ(false, bool_gauge->GetCell()->value());
+}
+
+TEST(GaugeOfBoolValue, GetCell) {
+  auto* cell = bool_gauge->GetCell();
+  EXPECT_EQ(false, cell->value());
+
+  cell->Set(true);
+  EXPECT_EQ(true, cell->value());
+
+  auto* same_cell = bool_gauge->GetCell();
+  EXPECT_EQ(true, cell->value());
+
+  same_cell->Set(false);
+  EXPECT_EQ(false, cell->value());
+  EXPECT_EQ(false, same_cell->value());
+}
+
 }  // namespace
 }  // namespace monitoring
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/monitoring/metric_def.h b/tensorflow/core/lib/monitoring/metric_def.h
index a7f14f9c94e67d7543382d59308ec0bd4445c190..5ecadcc4272581a5e4e2c934cd605bd1a1110fcd 100644
--- a/tensorflow/core/lib/monitoring/metric_def.h
+++ b/tensorflow/core/lib/monitoring/metric_def.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_METRIC_DEF_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_METRIC_DEF_H_
+#ifndef TENSORFLOW_CORE_LIB_MONITORING_METRIC_DEF_H_
+#define TENSORFLOW_CORE_LIB_MONITORING_METRIC_DEF_H_
 
 #include <array>
 #include <vector>
@@ -28,16 +28,16 @@ namespace monitoring {
 // The different metric kinds available.
 //
 // Gauge indicates that the metric's values are instantaneous measurements of a
-// (typically) continuously varying quantity or a string value. Examples: a
-// process's current heap size, a queue's current length, the name of the binary
-// used by a process.
+// (typically) continuously varying value. Examples: a process's current heap
+// size, a queue's current length, the name of the binary used by a process,
+// whether a task is complete.
 //
 // Cumulative indicates that the metric's values represent non-negative changes
 // over specified time periods. Example: the number of rpc calls to a service.
 enum class MetricKind : int { kGauge = 0, kCumulative };
 
 // The type of the metric values.
-enum class ValueType : int { kInt64 = 0, kHistogram, kString };
+enum class ValueType : int { kInt64 = 0, kHistogram, kString, kBool };
 
 // Everything in the internal namespace is implementation details. Do not depend
 // on this.
@@ -61,6 +61,11 @@ inline ValueType GetValueType<string>() {
   return ValueType::kString;
 }
 
+template <>
+inline ValueType GetValueType<bool>() {
+  return ValueType::kBool;
+}
+
 }  // namespace internal
 
 // Abstract base class for a metric definition.
@@ -134,4 +139,4 @@ class MetricDef : public AbstractMetricDef {
 }  // namespace monitoring
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_METRIC_DEF_H_
+#endif  // TENSORFLOW_CORE_LIB_MONITORING_METRIC_DEF_H_
diff --git a/tensorflow/core/lib/monitoring/mobile_counter.h b/tensorflow/core/lib/monitoring/mobile_counter.h
index c30bfe026f15922213312c68af0236e3d07d9380..c297d843d2fa7fb487b315fc8870e62fd5ec930d 100644
--- a/tensorflow/core/lib/monitoring/mobile_counter.h
+++ b/tensorflow/core/lib/monitoring/mobile_counter.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Null implementation of the Counter metric for mobile platforms.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_MOBILE_COUNTER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_MOBILE_COUNTER_H_
+#ifndef TENSORFLOW_CORE_LIB_MONITORING_MOBILE_COUNTER_H_
+#define TENSORFLOW_CORE_LIB_MONITORING_MOBILE_COUNTER_H_
 
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
@@ -64,4 +64,4 @@ class Counter {
 }  // namespace monitoring
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_MOBILE_COUNTER_H_
+#endif  // TENSORFLOW_CORE_LIB_MONITORING_MOBILE_COUNTER_H_
diff --git a/tensorflow/core/lib/monitoring/mobile_gauge.h b/tensorflow/core/lib/monitoring/mobile_gauge.h
index ac13ad35c020a45770e8acd7cd0820cbc2ac8cf4..a03b41aef334901eec206ce2ebfcf28251f4e28e 100644
--- a/tensorflow/core/lib/monitoring/mobile_gauge.h
+++ b/tensorflow/core/lib/monitoring/mobile_gauge.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Null implementation of the Gauge metric for mobile platforms.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_MOBILE_GAUGE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_MOBILE_GAUGE_H_
+#ifndef TENSORFLOW_CORE_LIB_MONITORING_MOBILE_GAUGE_H_
+#define TENSORFLOW_CORE_LIB_MONITORING_MOBILE_GAUGE_H_
 
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
@@ -69,4 +69,4 @@ class Gauge {
 }  // namespace monitoring
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_MOBILE_GAUGE_H_
+#endif  // TENSORFLOW_CORE_LIB_MONITORING_MOBILE_GAUGE_H_
diff --git a/tensorflow/core/lib/monitoring/mobile_sampler.h b/tensorflow/core/lib/monitoring/mobile_sampler.h
index cf390e5c7f67723e017b991cd7d0cd15266e24d9..77310dd619fd886c65b3ae3bf7c12d050d82c9d8 100644
--- a/tensorflow/core/lib/monitoring/mobile_sampler.h
+++ b/tensorflow/core/lib/monitoring/mobile_sampler.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Null implementation of the Sampler metric for mobile platforms.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_MOBILE_SAMPLER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_MOBILE_SAMPLER_H_
+#ifndef TENSORFLOW_CORE_LIB_MONITORING_MOBILE_SAMPLER_H_
+#define TENSORFLOW_CORE_LIB_MONITORING_MOBILE_SAMPLER_H_
 
 #include <memory>
 
@@ -98,4 +98,4 @@ class Sampler {
 }  // namespace monitoring
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_MOBILE_SAMPLER_H_
+#endif  // TENSORFLOW_CORE_LIB_MONITORING_MOBILE_SAMPLER_H_
diff --git a/tensorflow/core/lib/monitoring/sampler.h b/tensorflow/core/lib/monitoring/sampler.h
index c7a05428e2dced68ce3dc165616837084916f49d..a4f397f5566a7425b197e5de91aed811ec08e564 100644
--- a/tensorflow/core/lib/monitoring/sampler.h
+++ b/tensorflow/core/lib/monitoring/sampler.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_SAMPLER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_SAMPLER_H_
+#ifndef TENSORFLOW_CORE_LIB_MONITORING_SAMPLER_H_
+#define TENSORFLOW_CORE_LIB_MONITORING_SAMPLER_H_
 
 // We replace this implementation with a null implementation for mobile
 // platforms.
@@ -215,4 +215,4 @@ SamplerCell* Sampler<NumLabels>::GetCell(const Labels&... labels)
 }  // namespace tensorflow
 
 #endif  // IS_MOBILE_PLATFORM
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_SAMPLER_H_
+#endif  // TENSORFLOW_CORE_LIB_MONITORING_SAMPLER_H_
diff --git a/tensorflow/core/lib/png/png_io.cc b/tensorflow/core/lib/png/png_io.cc
index 354c819b090ce5e04047f13d2ff19441a499d770..cba473927dd1fce30bbe690b4bfda1e382ca12c0 100644
--- a/tensorflow/core/lib/png/png_io.cc
+++ b/tensorflow/core/lib/png/png_io.cc
@@ -90,11 +90,8 @@ void WarningHandler(png_structp png_ptr, png_const_charp msg) {
 void StringReader(png_structp png_ptr, png_bytep data, png_size_t length) {
   DecodeContext* const ctx = bit_cast<DecodeContext*>(png_get_io_ptr(png_ptr));
   if (static_cast<png_size_t>(ctx->data_left) < length) {
-    if (!ctx->error_condition) {
-      VLOG(1) << "PNG read decoding error";
-      ctx->error_condition = true;
-    }
     memset(data, 0, length);
+    png_error(png_ptr, "More bytes requested to read than available");
   } else {
     memcpy(data, ctx->data, length);
     ctx->data += length;
@@ -197,8 +194,8 @@ bool CommonInitDecode(StringPiece png_string, int desired_channels,
                       int desired_channel_bits, DecodeContext* context) {
   CHECK(desired_channel_bits == 8 || desired_channel_bits == 16)
       << "desired_channel_bits = " << desired_channel_bits;
-  CHECK(0 <= desired_channels && desired_channels <= 4) << "desired_channels = "
-                                                        << desired_channels;
+  CHECK(0 <= desired_channels && desired_channels <= 4)
+      << "desired_channels = " << desired_channels;
   context->error_condition = false;
   context->channels = desired_channels;
   context->png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, context,
diff --git a/tensorflow/core/lib/random/philox_random_test_utils.h b/tensorflow/core/lib/random/philox_random_test_utils.h
index f4bb087e107e10f90196a807c03ed2407d9d1ad6..6c29ae6b6a224d9c0369172bbf21af465ad53a19 100644
--- a/tensorflow/core/lib/random/philox_random_test_utils.h
+++ b/tensorflow/core/lib/random/philox_random_test_utils.h
@@ -35,8 +35,8 @@ void FillRandoms(PhiloxRandom gen, typename Distribution::ResultElementType* p,
                  int64 size) {
   const int granularity = Distribution::kResultElementCount;
 
-  CHECK(size % granularity == 0) << " size: " << size
-                                 << " granularity: " << granularity;
+  CHECK(size % granularity == 0)
+      << " size: " << size << " granularity: " << granularity;
 
   Distribution dist;
   for (int i = 0; i < size; i += granularity) {
diff --git a/tensorflow/core/lib/random/random_distributions.h b/tensorflow/core/lib/random/random_distributions.h
index 0e281403f8748ffbb7dbfac888cd2303c0a7253f..3fe1f9bc6cf06158df4811eaa177988b60890006 100644
--- a/tensorflow/core/lib/random/random_distributions.h
+++ b/tensorflow/core/lib/random/random_distributions.h
@@ -17,8 +17,8 @@ limitations under the License.
 #define TENSORFLOW_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
 
 #define _USE_MATH_DEFINES
-#include <cmath>
 #include <math.h>
+#include <cmath>
 #undef _USE_MATH_DEFINES
 
 #include <string.h>
@@ -27,7 +27,6 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/lib/random/philox_random.h"
 
-
 namespace tensorflow {
 namespace random {
 
diff --git a/tensorflow/core/lib/random/random_distributions_test.cc b/tensorflow/core/lib/random/random_distributions_test.cc
index bd574cba2f38ee23aca3dda68b9def6025bdd36e..85d68f456e1e27b7a62315f2b0a962843da87d52 100644
--- a/tensorflow/core/lib/random/random_distributions_test.cc
+++ b/tensorflow/core/lib/random/random_distributions_test.cc
@@ -18,9 +18,11 @@ limitations under the License.
 #include <math.h>
 #include <algorithm>
 #include <functional>
+#include <numeric>
 #include <unordered_map>
 #include <vector>
 
+#include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/lib/random/philox_random.h"
 #include "tensorflow/core/lib/random/philox_random_test_utils.h"
 #include "tensorflow/core/lib/random/random.h"
@@ -43,8 +45,8 @@ void FillRandomsWithSingles(PhiloxRandom gen,
                             int64 size) {
   int granularity = Distribution::kResultElementCount;
 
-  CHECK(size % granularity == 0) << " size: " << size
-                                 << " granularity: " << granularity;
+  CHECK(size % granularity == 0)
+      << " size: " << size << " granularity: " << granularity;
 
   SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
 
@@ -104,12 +106,12 @@ bool CheckSamplesMoments(const std::vector<T>& samples,
 
   for (int i = 1; i <= max_moments; ++i) {
     // Calculate the theoretical mean and variance
-    const double moments_i_mean = (stride == 0)
-                                      ? theoretical_moments(i)
-                                      : std::pow(theoretical_moments(1), i);
-    const double moments_i_squared = (stride == 0)
-                                         ? theoretical_moments(2 * i)
-                                         : std::pow(theoretical_moments(2), i);
+    const double moments_i_mean =
+        (stride == 0) ? theoretical_moments(i)
+                      : MathUtil::IPow(theoretical_moments(1), i);
+    const double moments_i_squared =
+        (stride == 0) ? theoretical_moments(2 * i)
+                      : MathUtil::IPow(theoretical_moments(2), i);
     const double moments_i_var =
         moments_i_squared - moments_i_mean * moments_i_mean;
 
@@ -150,8 +152,8 @@ void UniformMomentsTest(int count, int max_moments,
   PhiloxRandom gen(seed);
   FillRandoms<UniformDistribution<PhiloxRandom, T> >(gen, &v1[0], v1.size());
   for (int stride : strides) {
-    bool status = CheckSamplesMoments<T>(v1, uniform_moments, max_moments,
-                                         stride, z_limit);
+    bool status =
+        CheckSamplesMoments(v1, uniform_moments, max_moments, stride, z_limit);
     ASSERT_TRUE(status) << " UniformMomentsTest failing. seed: " << seed;
   }
 }
@@ -182,8 +184,8 @@ void NormalMomentsTest(int count, int max_moments,
   FillRandoms<NormalDistribution<PhiloxRandom, T> >(gen, &v1[0], v1.size());
 
   for (int stride : strides) {
-    bool status = CheckSamplesMoments<T>(v1, normal_moments, max_moments,
-                                         stride, z_limit);
+    bool status =
+        CheckSamplesMoments(v1, normal_moments, max_moments, stride, z_limit);
     ASSERT_TRUE(status) << " NormalMomentsTest failing. seed: " << seed;
   }
 }
@@ -213,7 +215,7 @@ class TruncatedNormalMoments {
     }
 
     // The real computation of the moment.
-    double bias = 2.0 * std::pow(kV, n - 1) * kFV / (2.0 * kPhiV - 1.0);
+    double bias = 2.0 * MathUtil::IPow(kV, n - 1) * kFV / (2.0 * kPhiV - 1.0);
     double moment_n_minus_2 = (*this)(n - 2);
     double moment_n = (n - 1) * moment_n_minus_2 - bias;
 
@@ -244,8 +246,8 @@ void RandomParametersMomentsTest(int count, int max_moments,
       gen, &v1[0], v1.size());
 
   for (int stride : strides) {
-    bool status = CheckSamplesMoments<T>(v1, TruncatedNormalMoments(),
-                                         max_moments, stride, z_limit);
+    bool status = CheckSamplesMoments(v1, TruncatedNormalMoments(), max_moments,
+                                      stride, z_limit);
     ASSERT_TRUE(status) << " NormalMomentsTest failing. seed: " << seed;
   }
 }
diff --git a/tensorflow/core/lib/strings/numbers.cc b/tensorflow/core/lib/strings/numbers.cc
index b3cca504e1d0f04ccf56bd517426e8434b57e3b6..f5822fad8e3d3b8559d19c79ee2885e580ea3e11 100644
--- a/tensorflow/core/lib/strings/numbers.cc
+++ b/tensorflow/core/lib/strings/numbers.cc
@@ -23,9 +23,6 @@ limitations under the License.
 #include <locale>
 #include <unordered_map>
 
-#include "double-conversion/double-conversion.h"
-
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
@@ -35,15 +32,74 @@ namespace tensorflow {
 
 namespace {
 
-static inline const double_conversion::StringToDoubleConverter& StringToFloatConverter() {
-    const static double_conversion::StringToDoubleConverter converter(
-        double_conversion::StringToDoubleConverter::ALLOW_LEADING_SPACES
-        | double_conversion::StringToDoubleConverter::ALLOW_HEX
-        | double_conversion::StringToDoubleConverter::ALLOW_TRAILING_SPACES
-        | double_conversion::StringToDoubleConverter::ALLOW_CASE_INSENSIBILITY,
-        0., 0., "inf", "nan"
-    );
-    return converter;
+template <typename T>
+T locale_independent_strtonum(const char* str, const char** endptr) {
+  static const std::unordered_map<string, T> special_nums = {
+      {"inf", std::numeric_limits<T>::infinity()},
+      {"+inf", std::numeric_limits<T>::infinity()},
+      {"-inf", -std::numeric_limits<T>::infinity()},
+      {"infinity", std::numeric_limits<T>::infinity()},
+      {"+infinity", std::numeric_limits<T>::infinity()},
+      {"-infinity", -std::numeric_limits<T>::infinity()},
+      {"nan", std::numeric_limits<T>::quiet_NaN()},
+      {"+nan", std::numeric_limits<T>::quiet_NaN()},
+      {"-nan", -std::numeric_limits<T>::quiet_NaN()},
+  };
+  std::stringstream s(str);
+
+  // Check if str is one of the special numbers.
+  string special_num_str;
+  s >> special_num_str;
+
+  for (int i = 0; i < special_num_str.length(); ++i) {
+    special_num_str[i] =
+        std::tolower(special_num_str[i], std::locale::classic());
+  }
+
+  auto entry = special_nums.find(special_num_str);
+  if (entry != special_nums.end()) {
+    *endptr = str + (s.eof() ? static_cast<std::iostream::pos_type>(strlen(str))
+                             : s.tellg());
+    return entry->second;
+  } else {
+    // Perhaps it's a hex number
+    if (special_num_str.compare(0, 2, "0x") == 0 ||
+        special_num_str.compare(0, 3, "-0x") == 0) {
+      return strtol(str, const_cast<char**>(endptr), 16);
+    }
+  }
+  // Reset the stream
+  s.str(str);
+  s.clear();
+  // Use the "C" locale
+  s.imbue(std::locale::classic());
+
+  T result;
+  s >> result;
+
+  // Set to result to what strto{f,d} functions would have returned. If the
+  // number was outside the range, the stringstream sets the fail flag, but
+  // returns the +/-max() value, whereas strto{f,d} functions return +/-INF.
+  if (s.fail()) {
+    if (result == std::numeric_limits<T>::max() ||
+        result == std::numeric_limits<T>::infinity()) {
+      result = std::numeric_limits<T>::infinity();
+      s.clear(s.rdstate() & ~std::ios::failbit);
+    } else if (result == -std::numeric_limits<T>::max() ||
+               result == -std::numeric_limits<T>::infinity()) {
+      result = -std::numeric_limits<T>::infinity();
+      s.clear(s.rdstate() & ~std::ios::failbit);
+    }
+  }
+
+  if (endptr) {
+    *endptr =
+        str +
+        (s.fail() ? static_cast<std::iostream::pos_type>(0)
+                  : (s.eof() ? static_cast<std::iostream::pos_type>(strlen(str))
+                             : s.tellg()));
+  }
+  return result;
 }
 
 }  // namespace
@@ -111,8 +167,8 @@ char* DoubleToBuffer(double value, char* buffer) {
     // larger than the precision we asked for.
     DCHECK(snprintf_result > 0 && snprintf_result < kFastToBufferSize);
 
-    auto parsed_value = double{};
-    full_precision_needed = !safe_strtod(buffer, &parsed_value) || parsed_value != value;
+    full_precision_needed =
+        locale_independent_strtonum<double>(buffer, nullptr) != value;
   }
 
   if (full_precision_needed) {
@@ -248,23 +304,25 @@ bool safe_strtou32(StringPiece str, uint32* value) {
 }
 
 bool safe_strtof(const char* str, float* value) {
-  int processed_characters_count = -1;
-  auto len = str_util::Strnlen(str, kFastToBufferSize);
-  *value = StringToFloatConverter().StringToFloat(
-      str,
-      len,
-      &processed_characters_count);
-  return processed_characters_count > 0;
+  const char* endptr;
+  *value = locale_independent_strtonum<float>(str, &endptr);
+  while (isspace(*endptr)) ++endptr;
+  // Ignore range errors from strtod/strtof.
+  // The values it returns on underflow and
+  // overflow are the right fallback in a
+  // robust setting.
+  return *str != '\0' && *endptr == '\0';
 }
 
 bool safe_strtod(const char* str, double* value) {
-  int processed_characters_count = -1;
-  auto len = str_util::Strnlen(str, kFastToBufferSize);
-  *value = StringToFloatConverter().StringToDouble(
-      str,
-      len,
-      &processed_characters_count);
-  return processed_characters_count > 0;
+  const char* endptr;
+  *value = locale_independent_strtonum<double>(str, &endptr);
+  while (isspace(*endptr)) ++endptr;
+  // Ignore range errors from strtod/strtof.
+  // The values it returns on underflow and
+  // overflow are the right fallback in a
+  // robust setting.
+  return *str != '\0' && *endptr == '\0';
 }
 
 char* FloatToBuffer(float value, char* buffer) {
diff --git a/tensorflow/core/lib/strings/numbers.h b/tensorflow/core/lib/strings/numbers.h
index 31b6abbac682bf682c8043caafce0d38348b8f1a..3c45b9027401999ba4e6c32005456312970cccba 100644
--- a/tensorflow/core/lib/strings/numbers.h
+++ b/tensorflow/core/lib/strings/numbers.h
@@ -122,6 +122,38 @@ bool safe_strtof(const char* str, float* value);
 // Values may be rounded on over- and underflow.
 bool safe_strtod(const char* str, double* value);
 
+inline bool ProtoParseNumeric(StringPiece s, int32* value) {
+  return safe_strto32(s, value);
+}
+
+inline bool ProtoParseNumeric(StringPiece s, uint32* value) {
+  return safe_strtou32(s, value);
+}
+
+inline bool ProtoParseNumeric(StringPiece s, int64* value) {
+  return safe_strto64(s, value);
+}
+
+inline bool ProtoParseNumeric(StringPiece s, uint64* value) {
+  return safe_strtou64(s, value);
+}
+
+inline bool ProtoParseNumeric(StringPiece s, float* value) {
+  return safe_strtof(s.ToString().c_str(), value);
+}
+
+inline bool ProtoParseNumeric(StringPiece s, double* value) {
+  return safe_strtod(s.ToString().c_str(), value);
+}
+
+// Convert strings to number of type T.
+// Leading and trailing spaces are allowed.
+// Values may be rounded on over- and underflow.
+template <typename T>
+bool SafeStringToNumeric(StringPiece s, T* value) {
+  return ProtoParseNumeric(s, value);
+}
+
 // Converts from an int64 to a human readable string representing the
 // same number, using decimal powers.  e.g. 1200000 -> "1.20M".
 string HumanReadableNum(int64 value);
diff --git a/tensorflow/core/lib/strings/numbers_test.cc b/tensorflow/core/lib/strings/numbers_test.cc
index df395c301e04217fab42219570557a6905722292..e15161de66c75ced0c9cbc9ccb2a6900dc8c7d02 100644
--- a/tensorflow/core/lib/strings/numbers_test.cc
+++ b/tensorflow/core/lib/strings/numbers_test.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/numbers.h"
 
 #include <string>
-#include <cmath>
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -278,40 +277,7 @@ TEST(safe_strtof, Float) {
   EXPECT_TRUE(safe_strtof("-0x2A", &result));
   EXPECT_EQ(-42.0f, result);
 
-  EXPECT_TRUE(safe_strtof(" -0x2", &result));
-  EXPECT_EQ(-2.0f, result);
-
-  EXPECT_TRUE(safe_strtof("8 \t", &result));
-  EXPECT_EQ(8.0f, result);
-
-  EXPECT_TRUE(safe_strtof("\t20.0\t ", &result));
-  EXPECT_EQ(20.0f, result);
-
   EXPECT_FALSE(safe_strtof("-infinity is awesome", &result));
-
-  EXPECT_TRUE(safe_strtof("-inf", &result));
-  EXPECT_EQ(-std::numeric_limits<float>::infinity(), result);
-
-  EXPECT_TRUE(safe_strtof("+inf", &result));
-  EXPECT_EQ(std::numeric_limits<float>::infinity(), result);
-
-  EXPECT_TRUE(safe_strtof("InF", &result));
-  EXPECT_EQ(std::numeric_limits<float>::infinity(), result);
-
-  EXPECT_TRUE(safe_strtof("-INF", &result));
-  EXPECT_EQ(-std::numeric_limits<float>::infinity(), result);
-
-  EXPECT_TRUE(safe_strtof("nan", &result));
-  EXPECT_TRUE(std::isnan(result));
-
-  EXPECT_TRUE(safe_strtof("-nan", &result));
-  EXPECT_TRUE(std::isnan(result));
-
-  EXPECT_TRUE(safe_strtof("-NaN", &result));
-  EXPECT_TRUE(std::isnan(result));
-
-  EXPECT_TRUE(safe_strtof("+NAN", &result));
-  EXPECT_TRUE(std::isnan(result));
 }
 
 TEST(safe_strtod, Double) {
@@ -330,41 +296,6 @@ TEST(safe_strtod, Double) {
 
   EXPECT_TRUE(safe_strtod("1e-325", &result));
   EXPECT_EQ(0, result);
-
-  EXPECT_TRUE(safe_strtod(" -0x1c", &result));
-  EXPECT_EQ(-28.0, result);
-
-  EXPECT_TRUE(safe_strtod("50 \t", &result));
-  EXPECT_EQ(50.0, result);
-
-  EXPECT_TRUE(safe_strtod("\t82.0\t ", &result));
-  EXPECT_EQ(82.0, result);
-
-  EXPECT_FALSE(safe_strtod("infinity", &result));
-
-  EXPECT_TRUE(safe_strtod("-inf", &result));
-  EXPECT_EQ(-std::numeric_limits<double>::infinity(), result);
-
-  EXPECT_TRUE(safe_strtod("+inf", &result));
-  EXPECT_EQ(std::numeric_limits<double>::infinity(), result);
-
-  EXPECT_TRUE(safe_strtod("InF", &result));
-  EXPECT_EQ(std::numeric_limits<double>::infinity(), result);
-
-  EXPECT_TRUE(safe_strtod("-INF", &result));
-  EXPECT_EQ(-std::numeric_limits<double>::infinity(), result);
-
-  EXPECT_TRUE(safe_strtod("nan", &result));
-  EXPECT_TRUE(std::isnan(result));
-
-  EXPECT_TRUE(safe_strtod("-nan", &result));
-  EXPECT_TRUE(std::isnan(result));
-
-  EXPECT_TRUE(safe_strtod("-NaN", &result));
-  EXPECT_TRUE(std::isnan(result));
-
-  EXPECT_TRUE(safe_strtod("+NAN", &result));
-  EXPECT_TRUE(std::isnan(result));
 }
 
 }  // namespace strings
diff --git a/tensorflow/core/lib/strings/ordered_code.cc b/tensorflow/core/lib/strings/ordered_code.cc
index af9a15125948d8ed390e5873f3677527ebddea8e..ef90050b4f628ab65c1dd939ba358fec714c95b5 100644
--- a/tensorflow/core/lib/strings/ordered_code.cc
+++ b/tensorflow/core/lib/strings/ordered_code.cc
@@ -472,7 +472,8 @@ void OrderedCode::WriteSignedNumIncreasing(string* dest, int64 val) {
   // buf = val in network byte order, sign extended to 10 bytes
   const char sign_byte = val < 0 ? '\xff' : '\0';
   char buf[10] = {
-      sign_byte, sign_byte,
+      sign_byte,
+      sign_byte,
   };
   StoreBigEndian64(buf + 2, val);
   static_assert(sizeof(buf) == kMaxSigned64Length, "max length size mismatch");
diff --git a/tensorflow/core/lib/strings/proto_text_util.h b/tensorflow/core/lib/strings/proto_text_util.h
index 3d0c6e4a376268e03c84270a869a4ec73b7c731d..05dbda6e152b7a3b820e36f7c1b56094e2dc04fa 100644
--- a/tensorflow/core/lib/strings/proto_text_util.h
+++ b/tensorflow/core/lib/strings/proto_text_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_STRINGS_PROTO_TEXT_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_LIB_STRINGS_PROTO_TEXT_UTIL_H_
+#ifndef TENSORFLOW_CORE_LIB_STRINGS_PROTO_TEXT_UTIL_H_
+#define TENSORFLOW_CORE_LIB_STRINGS_PROTO_TEXT_UTIL_H_
 
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/scanner.h"
@@ -118,30 +118,6 @@ class ProtoTextOutput {
   TF_DISALLOW_COPY_AND_ASSIGN(ProtoTextOutput);
 };
 
-inline bool ProtoParseNumeric(StringPiece s, int32* value) {
-  return ::tensorflow::strings::safe_strto32(s, value);
-}
-
-inline bool ProtoParseNumeric(StringPiece s, uint32* value) {
-  return ::tensorflow::strings::safe_strtou32(s, value);
-}
-
-inline bool ProtoParseNumeric(StringPiece s, int64* value) {
-  return ::tensorflow::strings::safe_strto64(s, value);
-}
-
-inline bool ProtoParseNumeric(StringPiece s, uint64* value) {
-  return ::tensorflow::strings::safe_strtou64(s, value);
-}
-
-inline bool ProtoParseNumeric(StringPiece s, float* value) {
-  return ::tensorflow::strings::safe_strtof(s.ToString().c_str(), value);
-}
-
-inline bool ProtoParseNumeric(StringPiece s, double* value) {
-  return ::tensorflow::strings::safe_strtod(s.ToString().c_str(), value);
-}
-
 inline void ProtoSpaceAndComments(Scanner* scanner) {
   for (;;) {
     scanner->AnySpace();
@@ -174,7 +150,7 @@ bool ProtoParseNumericFromScanner(Scanner* scanner, T* value) {
   }
 
   ProtoSpaceAndComments(scanner);
-  return ProtoParseNumeric(numeric_str, value);
+  return SafeStringToNumeric<T>(numeric_str, value);
 }
 
 // Parse the next boolean value from <scanner>, returning false if parsing
@@ -188,4 +164,4 @@ bool ProtoParseStringLiteralFromScanner(Scanner* scanner, string* value);
 }  // namespace strings
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_STRINGS_PROTO_TEXT_UTIL_H_
+#endif  // TENSORFLOW_CORE_LIB_STRINGS_PROTO_TEXT_UTIL_H_
diff --git a/tensorflow/core/lib/strings/str_util.cc b/tensorflow/core/lib/strings/str_util.cc
index 0ae6c66080a4686127237101151ed66779d3b38b..d28857803d7ef1edd66ae6c1a6b81a7ed1dbce85 100644
--- a/tensorflow/core/lib/strings/str_util.cc
+++ b/tensorflow/core/lib/strings/str_util.cc
@@ -452,13 +452,5 @@ bool SplitAndParseAsFloats(StringPiece text, char delim,
                                     result);
 }
 
-size_t Strnlen(const char* str, const size_t string_max_len) {
-  size_t len = 0;
-  while (len < string_max_len && str[len] != '\0') {
-    ++len;
-  }
-  return len;
-}
-
 }  // namespace str_util
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/strings/str_util.h b/tensorflow/core/lib/strings/str_util.h
index b0d774a05ce445b794cf631776970ad273fda0d5..44c52850fa99f7688fb496784a18b651c147bb8b 100644
--- a/tensorflow/core/lib/strings/str_util.h
+++ b/tensorflow/core/lib/strings/str_util.h
@@ -83,7 +83,7 @@ string Uppercase(StringPiece s);
 
 // Converts "^2ILoveYou!" to "i_love_you_". More specifically:
 // - converts all non-alphanumeric characters to underscores
-// - replaces each occurence of a capital letter (except the very
+// - replaces each occurrence of a capital letter (except the very
 //   first character and if there is already an '_' before it) with '_'
 //   followed by this letter in lower case
 // - Skips leading non-alpha characters
@@ -209,11 +209,6 @@ std::vector<string> Split(StringPiece text, char delims, Predicate p) {
   return Split(text, StringPiece(&delims, 1), p);
 }
 
-// Returns the length of the given null-terminated byte string 'str'.
-// Returns 'string_max_len' if the null character was not found in the first
-// 'string_max_len' bytes of 'str'.
-size_t Strnlen(const char* str, const size_t string_max_len);
-
 }  // namespace str_util
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/lib/strings/str_util_test.cc b/tensorflow/core/lib/strings/str_util_test.cc
index 3a8de7c96b5fd01be24c981edadfd92a3f7d44b7..6d461241f7e9c5a29064c015991039d5bf95a80f 100644
--- a/tensorflow/core/lib/strings/str_util_test.cc
+++ b/tensorflow/core/lib/strings/str_util_test.cc
@@ -305,7 +305,7 @@ TEST(SplitAndParseAsInts, Int64) {
   EXPECT_EQ(nums[0], 134);
   EXPECT_EQ(nums[1], 2);
   EXPECT_EQ(nums[2], 13);
-  EXPECT_EQ(nums[3], -4000000000);
+  EXPECT_EQ(nums[3], static_cast<int64>(-4000000000ull));
 
   EXPECT_FALSE(str_util::SplitAndParseAsInts("abc", ',', &nums));
 
@@ -430,12 +430,4 @@ TEST(StringReplace, EmptyStringReplaceAll) {
   EXPECT_EQ("", str_util::StringReplace("", "a", "X", /*replace_all=*/true));
 }
 
-TEST(Strnlen, Basic) {
-  EXPECT_EQ(0, str_util::Strnlen("ab", 0));
-  EXPECT_EQ(1, str_util::Strnlen("a", 1));
-  EXPECT_EQ(2, str_util::Strnlen("abcd", 2));
-  EXPECT_EQ(3, str_util::Strnlen("abc", 10));
-  EXPECT_EQ(4, str_util::Strnlen("a \t\n", 10));
-}
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/strings/strcat.h b/tensorflow/core/lib/strings/strcat.h
index 8e35549ed4bdd9afa497011c1f10504b59a0f350..2bc14945cd0413751003c03c7f5255c300790321 100644
--- a/tensorflow/core/lib/strings/strcat.h
+++ b/tensorflow/core/lib/strings/strcat.h
@@ -119,11 +119,14 @@ class AlphaNum {
 
   AlphaNum(float f)  // NOLINT(runtime/explicit)
       : piece_(digits_, strlen(FloatToBuffer(f, digits_))) {}
+  AlphaNum(bfloat16 f)  // NOLINT(runtime/explicit)
+      : piece_(digits_, strlen(FloatToBuffer(static_cast<float>(f), digits_))) {
+  }
   AlphaNum(double f)  // NOLINT(runtime/explicit)
       : piece_(digits_, strlen(DoubleToBuffer(f, digits_))) {}
 
   AlphaNum(const Eigen::half &f);  // NOLINT(runtime/explicit)
-  AlphaNum(Hex hex);  // NOLINT(runtime/explicit)
+  AlphaNum(Hex hex);               // NOLINT(runtime/explicit)
 
   AlphaNum(const char *c_str) : piece_(c_str) {}   // NOLINT(runtime/explicit)
   AlphaNum(const StringPiece &pc) : piece_(pc) {}  // NOLINT(runtime/explicit)
diff --git a/tensorflow/core/lib/strings/strcat_test.cc b/tensorflow/core/lib/strings/strcat_test.cc
index c556b1f676b24caefdc1ad9eb9cbcaa08e943a8e..7cb186e6375fae4d8a7140dd2f9ee6e7e64ddd1a 100644
--- a/tensorflow/core/lib/strings/strcat_test.cc
+++ b/tensorflow/core/lib/strings/strcat_test.cc
@@ -46,19 +46,19 @@ TEST(StrCat, Ints) {
   const intptr_t intptr = -12;
   const uintptr_t uintptr = 13;
   string answer;
-  answer = StrCat(s, us);
+  answer = tensorflow::strings::StrCat(s, us);
   EXPECT_EQ(answer, "-12");
-  answer = StrCat(i, ui);
+  answer = tensorflow::strings::StrCat(i, ui);
   EXPECT_EQ(answer, "-34");
-  answer = StrCat(l, ul);
+  answer = tensorflow::strings::StrCat(l, ul);
   EXPECT_EQ(answer, "-56");
-  answer = StrCat(ll, ull);
+  answer = tensorflow::strings::StrCat(ll, ull);
   EXPECT_EQ(answer, "-78");
-  answer = StrCat(ptrdiff, size);
+  answer = tensorflow::strings::StrCat(ptrdiff, size);
   EXPECT_EQ(answer, "-910");
-  answer = StrCat(ssize, intptr);
+  answer = tensorflow::strings::StrCat(ssize, intptr);
   EXPECT_EQ(answer, "-11-12");
-  answer = StrCat(uintptr, 0);
+  answer = tensorflow::strings::StrCat(uintptr, 0);
   EXPECT_EQ(answer, "130");
 }
 
@@ -74,118 +74,137 @@ TEST(StrCat, Basics) {
   int32 i32s[] = {'H', 'C', 'W'};
   uint64 ui64s[] = {12345678910LL, 10987654321LL};
 
-  result = StrCat(false, true, 2, 3);
+  result = tensorflow::strings::StrCat(false, true, 2, 3);
   EXPECT_EQ(result, "0123");
 
-  result = StrCat(-1);
+  result = tensorflow::strings::StrCat(-1);
   EXPECT_EQ(result, "-1");
 
-  result = StrCat(0.5);
+  result = tensorflow::strings::StrCat(0.5);
   EXPECT_EQ(result, "0.5");
 
-  result = StrCat(strs[1], pieces[2]);
+  result = tensorflow::strings::StrCat(strs[1], pieces[2]);
   EXPECT_EQ(result, "CruelWorld");
 
-  result = StrCat(strs[0], ", ", pieces[2]);
+  result = tensorflow::strings::StrCat(strs[0], ", ", pieces[2]);
   EXPECT_EQ(result, "Hello, World");
 
-  result = StrCat(strs[0], ", ", strs[1], " ", strs[2], "!");
+  result =
+      tensorflow::strings::StrCat(strs[0], ", ", strs[1], " ", strs[2], "!");
   EXPECT_EQ(result, "Hello, Cruel World!");
 
-  result = StrCat(pieces[0], ", ", pieces[1], " ", pieces[2]);
+  result =
+      tensorflow::strings::StrCat(pieces[0], ", ", pieces[1], " ", pieces[2]);
   EXPECT_EQ(result, "Hello, Cruel World");
 
-  result = StrCat(c_strs[0], ", ", c_strs[1], " ", c_strs[2]);
+  result =
+      tensorflow::strings::StrCat(c_strs[0], ", ", c_strs[1], " ", c_strs[2]);
   EXPECT_EQ(result, "Hello, Cruel World");
 
-  result = StrCat("ASCII ", i32s[0], ", ", i32s[1], " ", i32s[2], "!");
+  result = tensorflow::strings::StrCat("ASCII ", i32s[0], ", ", i32s[1], " ",
+                                       i32s[2], "!");
   EXPECT_EQ(result, "ASCII 72, 67 87!");
 
-  result = StrCat(ui64s[0], ", ", ui64s[1], "!");
+  result = tensorflow::strings::StrCat(ui64s[0], ", ", ui64s[1], "!");
   EXPECT_EQ(result, "12345678910, 10987654321!");
 
   string one = "1";  // Actually, it's the size of this string that we want; a
                      // 64-bit build distinguishes between size_t and uint64,
                      // even though they're both unsigned 64-bit values.
-  result = StrCat("And a ", one.size(), " and a ", &result[2] - &result[0],
-                  " and a ", one, " 2 3 4", "!");
+  result = tensorflow::strings::StrCat("And a ", one.size(), " and a ",
+                                       &result[2] - &result[0], " and a ", one,
+                                       " 2 3 4", "!");
   EXPECT_EQ(result, "And a 1 and a 2 and a 1 2 3 4!");
 
   // result = StrCat("Single chars won't compile", '!');
   // result = StrCat("Neither will NULLs", NULL);
-  result = StrCat("To output a char by ASCII/numeric value, use +: ", '!' + 0);
+  result = tensorflow::strings::StrCat(
+      "To output a char by ASCII/numeric value, use +: ", '!' + 0);
   EXPECT_EQ(result, "To output a char by ASCII/numeric value, use +: 33");
 
   float f = 100000.5;
-  result = StrCat("A hundred K and a half is ", f);
+  result = tensorflow::strings::StrCat("A hundred K and a half is ", f);
   EXPECT_EQ(result, "A hundred K and a half is 100000.5");
 
   double d = f;
   d *= d;
-  result = StrCat("A hundred K and a half squared is ", d);
+  result = tensorflow::strings::StrCat("A hundred K and a half squared is ", d);
   EXPECT_EQ(result, "A hundred K and a half squared is 10000100000.25");
 
   Eigen::half h(10007.0f);
-  result = StrCat("Ten thousand seven is approximately ", h);
+  result =
+      tensorflow::strings::StrCat("Ten thousand seven is approximately ", h);
   EXPECT_EQ(result, "Ten thousand seven is approximately 10008");
 
-  result = StrCat(1, 2, 333, 4444, 55555, 666666, 7777777, 88888888, 999999999);
+  result = tensorflow::strings::StrCat(1, 2, 333, 4444, 55555, 666666, 7777777,
+                                       88888888, 999999999);
   EXPECT_EQ(result, "12333444455555666666777777788888888999999999");
 }
 
 TEST(StrCat, MaxArgs) {
   string result;
   // Test 10 up to 26 arguments, the current maximum
-  result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a");
+  result = tensorflow::strings::StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a");
   EXPECT_EQ(result, "123456789a");
-  result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b");
+  result = tensorflow::strings::StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b");
   EXPECT_EQ(result, "123456789ab");
-  result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c");
+  result =
+      tensorflow::strings::StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c");
   EXPECT_EQ(result, "123456789abc");
-  result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d");
+  result = tensorflow::strings::StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c",
+                                       "d");
   EXPECT_EQ(result, "123456789abcd");
-  result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e");
+  result = tensorflow::strings::StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c",
+                                       "d", "e");
   EXPECT_EQ(result, "123456789abcde");
-  result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f");
+  result = tensorflow::strings::StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c",
+                                       "d", "e", "f");
   EXPECT_EQ(result, "123456789abcdef");
-  result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f", "g");
+  result = tensorflow::strings::StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c",
+                                       "d", "e", "f", "g");
   EXPECT_EQ(result, "123456789abcdefg");
-  result =
-      StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f", "g", "h");
+  result = tensorflow::strings::StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c",
+                                       "d", "e", "f", "g", "h");
   EXPECT_EQ(result, "123456789abcdefgh");
-  result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f", "g",
-                  "h", "i");
+  result = tensorflow::strings::StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c",
+                                       "d", "e", "f", "g", "h", "i");
   EXPECT_EQ(result, "123456789abcdefghi");
-  result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f", "g",
-                  "h", "i", "j");
+  result = tensorflow::strings::StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c",
+                                       "d", "e", "f", "g", "h", "i", "j");
   EXPECT_EQ(result, "123456789abcdefghij");
-  result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f", "g",
-                  "h", "i", "j", "k");
+  result = tensorflow::strings::StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c",
+                                       "d", "e", "f", "g", "h", "i", "j", "k");
   EXPECT_EQ(result, "123456789abcdefghijk");
-  result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f", "g",
-                  "h", "i", "j", "k", "l");
+  result =
+      tensorflow::strings::StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d",
+                                  "e", "f", "g", "h", "i", "j", "k", "l");
   EXPECT_EQ(result, "123456789abcdefghijkl");
-  result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f", "g",
-                  "h", "i", "j", "k", "l", "m");
+  result =
+      tensorflow::strings::StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d",
+                                  "e", "f", "g", "h", "i", "j", "k", "l", "m");
   EXPECT_EQ(result, "123456789abcdefghijklm");
-  result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f", "g",
-                  "h", "i", "j", "k", "l", "m", "n");
+  result = tensorflow::strings::StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c",
+                                       "d", "e", "f", "g", "h", "i", "j", "k",
+                                       "l", "m", "n");
   EXPECT_EQ(result, "123456789abcdefghijklmn");
-  result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f", "g",
-                  "h", "i", "j", "k", "l", "m", "n", "o");
+  result = tensorflow::strings::StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c",
+                                       "d", "e", "f", "g", "h", "i", "j", "k",
+                                       "l", "m", "n", "o");
   EXPECT_EQ(result, "123456789abcdefghijklmno");
-  result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f", "g",
-                  "h", "i", "j", "k", "l", "m", "n", "o", "p");
+  result = tensorflow::strings::StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c",
+                                       "d", "e", "f", "g", "h", "i", "j", "k",
+                                       "l", "m", "n", "o", "p");
   EXPECT_EQ(result, "123456789abcdefghijklmnop");
-  result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c", "d", "e", "f", "g",
-                  "h", "i", "j", "k", "l", "m", "n", "o", "p", "q");
+  result = tensorflow::strings::StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, "a", "b", "c",
+                                       "d", "e", "f", "g", "h", "i", "j", "k",
+                                       "l", "m", "n", "o", "p", "q");
   EXPECT_EQ(result, "123456789abcdefghijklmnopq");
   // No limit thanks to C++11's variadic templates
-  result = StrCat(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, "a", "b", "c", "d", "e", "f",
-                  "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r",
-                  "s", "t", "u", "v", "w", "x", "y", "z", "A", "B", "C", "D",
-                  "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P",
-                  "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z");
+  result = tensorflow::strings::StrCat(
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, "a", "b", "c", "d", "e", "f", "g", "h",
+      "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w",
+      "x", "y", "z", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L",
+      "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z");
   EXPECT_EQ(result,
             "12345678910abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ");
 }
@@ -203,78 +222,85 @@ TEST(StrAppend, Basics) {
   uint64 ui64s[] = {12345678910LL, 10987654321LL};
 
   string::size_type old_size = result.size();
-  StrAppend(&result, strs[0]);
+  tensorflow::strings::StrAppend(&result, strs[0]);
   EXPECT_EQ(result.substr(old_size), "Hello");
 
   old_size = result.size();
-  StrAppend(&result, strs[1], pieces[2]);
+  tensorflow::strings::StrAppend(&result, strs[1], pieces[2]);
   EXPECT_EQ(result.substr(old_size), "CruelWorld");
 
   old_size = result.size();
-  StrAppend(&result, strs[0], ", ", pieces[2]);
+  tensorflow::strings::StrAppend(&result, strs[0], ", ", pieces[2]);
   EXPECT_EQ(result.substr(old_size), "Hello, World");
 
   old_size = result.size();
-  StrAppend(&result, strs[0], ", ", strs[1], " ", strs[2], "!");
+  tensorflow::strings::StrAppend(&result, strs[0], ", ", strs[1], " ", strs[2],
+                                 "!");
   EXPECT_EQ(result.substr(old_size), "Hello, Cruel World!");
 
   old_size = result.size();
-  StrAppend(&result, pieces[0], ", ", pieces[1], " ", pieces[2]);
+  tensorflow::strings::StrAppend(&result, pieces[0], ", ", pieces[1], " ",
+                                 pieces[2]);
   EXPECT_EQ(result.substr(old_size), "Hello, Cruel World");
 
   old_size = result.size();
-  StrAppend(&result, c_strs[0], ", ", c_strs[1], " ", c_strs[2]);
+  tensorflow::strings::StrAppend(&result, c_strs[0], ", ", c_strs[1], " ",
+                                 c_strs[2]);
   EXPECT_EQ(result.substr(old_size), "Hello, Cruel World");
 
   old_size = result.size();
-  StrAppend(&result, "ASCII ", i32s[0], ", ", i32s[1], " ", i32s[2], "!");
+  tensorflow::strings::StrAppend(&result, "ASCII ", i32s[0], ", ", i32s[1], " ",
+                                 i32s[2], "!");
   EXPECT_EQ(result.substr(old_size), "ASCII 72, 67 87!");
 
   old_size = result.size();
-  StrAppend(&result, ui64s[0], ", ", ui64s[1], "!");
+  tensorflow::strings::StrAppend(&result, ui64s[0], ", ", ui64s[1], "!");
   EXPECT_EQ(result.substr(old_size), "12345678910, 10987654321!");
 
   string one = "1";  // Actually, it's the size of this string that we want; a
                      // 64-bit build distinguishes between size_t and uint64,
                      // even though they're both unsigned 64-bit values.
   old_size = result.size();
-  StrAppend(&result, "And a ", one.size(), " and a ", &result[2] - &result[0],
-            " and a ", one, " 2 3 4", "!");
+  tensorflow::strings::StrAppend(&result, "And a ", one.size(), " and a ",
+                                 &result[2] - &result[0], " and a ", one,
+                                 " 2 3 4", "!");
   EXPECT_EQ(result.substr(old_size), "And a 1 and a 2 and a 1 2 3 4!");
 
   // result = StrCat("Single chars won't compile", '!');
   // result = StrCat("Neither will NULLs", NULL);
   old_size = result.size();
-  StrAppend(&result, "To output a char by ASCII/numeric value, use +: ",
-            '!' + 0);
+  tensorflow::strings::StrAppend(
+      &result, "To output a char by ASCII/numeric value, use +: ", '!' + 0);
   EXPECT_EQ(result.substr(old_size),
             "To output a char by ASCII/numeric value, use +: 33");
 
   float f = 100000.5;
   old_size = result.size();
-  StrAppend(&result, "A hundred K and a half is ", f);
+  tensorflow::strings::StrAppend(&result, "A hundred K and a half is ", f);
   EXPECT_EQ(result.substr(old_size), "A hundred K and a half is 100000.5");
 
   double d = f;
   d *= d;
   old_size = result.size();
-  StrAppend(&result, "A hundred K and a half squared is ", d);
+  tensorflow::strings::StrAppend(&result, "A hundred K and a half squared is ",
+                                 d);
   EXPECT_EQ(result.substr(old_size),
             "A hundred K and a half squared is 10000100000.25");
 
   // Test 9 arguments, the old maximum
   old_size = result.size();
-  StrAppend(&result, 1, 22, 333, 4444, 55555, 666666, 7777777, 88888888, 9);
+  tensorflow::strings::StrAppend(&result, 1, 22, 333, 4444, 55555, 666666,
+                                 7777777, 88888888, 9);
   EXPECT_EQ(result.substr(old_size), "1223334444555556666667777777888888889");
 
   // No limit thanks to C++11's variadic templates
   old_size = result.size();
-  StrAppend(&result, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, "a", "b", "c", "d", "e",
-            "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r",
-            "s", "t", "u", "v", "w", "x", "y", "z", "A", "B", "C", "D", "E",
-            "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R",
-            "S", "T", "U", "V", "W", "X", "Y", "Z",
-            "No limit thanks to C++11's variadic templates");
+  tensorflow::strings::StrAppend(
+      &result, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, "a", "b", "c", "d", "e", "f", "g",
+      "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v",
+      "w", "x", "y", "z", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K",
+      "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
+      "No limit thanks to C++11's variadic templates");
   EXPECT_EQ(result.substr(old_size),
             "12345678910abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
             "No limit thanks to C++11's variadic templates");
@@ -282,8 +308,8 @@ TEST(StrAppend, Basics) {
 
 TEST(StrAppend, Death) {
   string s = "self";
-  EXPECT_DEBUG_DEATH(StrAppend(&s, s.c_str() + 1), "Check failed:");
-  EXPECT_DEBUG_DEATH(StrAppend(&s, s), "Check failed:");
+  EXPECT_DEBUG_DEATH(strings::StrAppend(&s, s.c_str() + 1), "Check failed:");
+  EXPECT_DEBUG_DEATH(strings::StrAppend(&s, s), "Check failed:");
 }
 
 static void CheckHex64(uint64 v) {
diff --git a/tensorflow/core/ops/array_grad.cc b/tensorflow/core/ops/array_grad.cc
index 325dbc48835d2f975ecd2530486be239fdcf96c6..38bd851da89357238360dcb3dd465b5e4f6a5fdd 100644
--- a/tensorflow/core/ops/array_grad.cc
+++ b/tensorflow/core/ops/array_grad.cc
@@ -333,6 +333,25 @@ Status TransposeGrad(const AttrSlice& attrs, FunctionDef* g) {
 }
 REGISTER_OP_GRADIENT("Transpose", TransposeGrad);
 
+Status ConjugateTransposeGrad(const AttrSlice& attrs, FunctionDef* g) {
+  *g = FDH::Define(
+      // Arg defs
+      {"x: T", "p: int32", "dy: T"},
+      // Ret val defs
+      {"dx: T", "dp: int32"},
+      // Attr defs
+      {"T: type"},
+      // Nodes
+      {
+          {{"q"}, "InvertPermutation", {"p"}, {}},
+          {{"dx"}, "ConjugateTranspose", {"dy", "q"}, {{"T", "$T"}}},
+          {{"dp"}, "ZerosLike", {"p"}, {{"T", DT_INT32}}},
+      });
+  VLOG(1) << "ConjugateTransposeGrad " << DebugString(*g);
+  return Status::OK();
+}
+REGISTER_OP_GRADIENT("ConjugateTranspose", ConjugateTransposeGrad);
+
 Status ReverseGrad(const AttrSlice& attrs, FunctionDef* g) {
   *g = FDH::Define(
       // Arg defs
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 9fa6423d59d37dfb2a1086305c7e7dc7e5b2ebd2..267ce88440080399aae783903503f0bbd025d8b4 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -261,33 +261,7 @@ REGISTER_OP("ParallelConcat")
       c->set_output(0, passed_shape);
 
       return Status::OK();
-    })
-    .Doc(R"doc(
-Concatenates a list of `N` tensors along the first dimension.
-
-The input tensors are all required to have size 1 in the first dimension.
-
-For example:
-
-```
-# 'x' is [[1, 4]]
-# 'y' is [[2, 5]]
-# 'z' is [[3, 6]]
-parallel_concat([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
-```
-
-The difference between concat and parallel_concat is that concat requires all
-of the inputs be computed before the operation will begin but doesn't require
-that the input shapes be known during graph construction.  Parallel concat
-will copy pieces of the input into the output as they become available, in
-some situations this can provide a performance benefit.
-
-values: Tensors to be concatenated. All must have size 1 in the first dimension
- and same shape.
-output: The concatenated tensor.
-shape: the final shape of the result; should be equal to the shapes of any input
- but with the number of input values in the first dimension.
-)doc");
+    });
 
 REGISTER_OP("Pack")
     .Input("values: N * T")
@@ -323,35 +297,7 @@ REGISTER_OP("Pack")
 
       c->set_output(0, c->MakeShape(dims));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Packs a list of `N` rank-`R` tensors into one rank-`(R+1)` tensor.
-
-Packs the `N` tensors in `values` into a tensor with rank one higher than each
-tensor in `values`, by packing them along the `axis` dimension.
-Given a list of tensors of shape `(A, B, C)`;
-
-if `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`.
-if `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`.
-Etc.
-
-For example:
-
-```
-# 'x' is [1, 4]
-# 'y' is [2, 5]
-# 'z' is [3, 6]
-pack([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
-pack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]]
-```
-
-This is the opposite of `unpack`.
-
-values: Must be of same shape and type.
-axis: Dimension along which to pack.  Negative values wrap around, so the
-  valid range is `[-(R+1), R+1)`.
-output: The packed tensor.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("Unpack")
@@ -387,28 +333,14 @@ REGISTER_OP("Unpack")
       }
       for (int i = 0; i < c->num_outputs(); ++i) c->set_output(i, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
-
-Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
-For example, given a tensor of shape `(A, B, C, D)`;
-
-If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
-  and each tensor in `output` will have shape `(B, C, D)`. (Note that the
-  dimension unpacked along is gone, unlike `split`).
+    });
 
-If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
-  and each tensor in `output` will have shape `(A, C, D)`.
-Etc.
-
-This is the opposite of `pack`.
-
-value: 1-D or higher, with `axis` dimension size equal to `num`.
-axis: Dimension along which to unpack.  Negative values wrap around, so the
-  valid range is `[-R, R)`.
-output: The list of tensors unpacked from `value`.
-)doc");
+REGISTER_OP("UnravelIndex")
+    .Input("indices: Tidx")
+    .Input("dims: Tidx")
+    .Output("output: Tidx")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .SetShapeFn([](InferenceContext* c) { return Status::OK(); });
 
 // --------------------------------------------------------------------------
 // TODO(josh11b): Remove the >= 2 constraint, once we can rewrite the graph
@@ -421,18 +353,7 @@ REGISTER_OP("Concat")
     .Attr("T: type")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::ConcatShape(c, c->num_inputs() - 1);
-    })
-    .Doc(R"doc(
-Concatenates tensors along one dimension.
-
-concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
-  range [0, rank(values)).
-values: The `N` Tensors to concatenate. Their ranks and types must match,
-  and their sizes must match in all dimensions except `concat_dim`.
-output: A `Tensor` with the concatenation of values stacked along the
-  `concat_dim` dimension.  This tensor's shape matches that of `values` except
-  in `concat_dim` where it has the sum of the sizes.
-)doc");
+    });
 
 REGISTER_OP("ConcatV2")
     .Input("values: N * T")
@@ -441,18 +362,7 @@ REGISTER_OP("ConcatV2")
     .Attr("N: int >= 2")
     .Attr("T: type")
     .Attr("Tidx: {int32, int64} = DT_INT32")
-    .SetShapeFn(shape_inference::ConcatV2Shape)
-    .Doc(R"doc(
-Concatenates tensors along one dimension.
-
-values: List of `N` Tensors to concatenate. Their ranks and types must match,
-  and their sizes must match in all dimensions except `concat_dim`.
-axis: 0-D.  The dimension along which to concatenate.  Must be in the
-  range [-rank(values), rank(values)).
-output: A `Tensor` with the concatenation of values stacked along the
-  `concat_dim` dimension.  This tensor's shape matches that of `values` except
-  in `concat_dim` where it has the sum of the sizes.
-)doc");
+    .SetShapeFn(shape_inference::ConcatV2Shape);
 
 // TODO(vivek.v.rane@intel.com): Prefix the op names with underscore if the ops
 // are not to be made user-accessible.
@@ -486,26 +396,7 @@ REGISTER_OP("ConcatOffset")
         c->set_output(i - 1, c->input(i));
       }
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes offsets of concat inputs within its output.
-
-For example:
-
-```
-# 'x' is [2, 2, 7]
-# 'y' is [2, 3, 7]
-# 'z' is [2, 5, 7]
-concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
-```
-
-This is typically used by gradient computations for a concat operation.
-
-concat_dim: The dimension along which to concatenate.
-shape: The `N` int32 vectors representing shape of tensors being concatenated.
-offset: The `N` int32 vectors representing the starting offset
-        of input tensors within the concatenated output.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("Split")
@@ -540,19 +431,7 @@ REGISTER_OP("Split")
       }
       for (int i = 0; i < num_split; ++i) c->set_output(i, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Splits a tensor into `num_split` tensors along one dimension.
-
-split_dim: 0-D.  The dimension along which to split.  Must be in the range
-  `[-rank(value), rank(value))`.
-num_split: The number of ways to split.  Must evenly divide
-  `value.shape[split_dim]`.
-value: The tensor to split.
-output: They are identically shaped tensors, whose shape matches that of `value`
-  except along `split_dim`, where their sizes are
-  `values.shape[split_dim] / num_split`.
-)doc");
+    });
 
 REGISTER_OP("SplitV")
     .Input("value: T")
@@ -647,20 +526,7 @@ REGISTER_OP("SplitV")
       }
 
       return Status::OK();
-    })
-    .Doc(R"doc(
-Splits a tensor into `num_split` tensors along one dimension.
-
-value: The tensor to split.
-size_splits: list containing the sizes of each output tensor along the split
-             dimension. Must sum to the dimension of value along split_dim.
-             Can contain one -1 indicating that dimension is to be inferred.
-split_dim: 0-D.  The dimension along which to split.  Must be in the range
-  `[-rank(value), rank(value))`.
-output: Tensors whose shape matches that of `value`
-  except along `split_dim`, where their sizes are
-  `size_splits[i]`.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("Const")
@@ -679,12 +545,7 @@ REGISTER_OP("Const")
       }
       c->set_output(0, c->MakeShape(dims));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Returns a constant tensor.
-
-value: Attr `value` is the tensor to return.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 // TODO(mgubin): Update the doc when the freeze_graph script supports converting
@@ -694,51 +555,39 @@ REGISTER_OP("ImmutableConst")
     .Attr("shape: shape")
     .Attr("memory_region_name: string")
     .Output("tensor: dtype")
-    .SetShapeFn(shape_inference::ExplicitShape)
-    .Doc(R"doc(
-Returns immutable tensor from memory region.
-
-The current implementation memmaps the tensor from a file.
+    .SetShapeFn(shape_inference::ExplicitShape);
 
-dtype: Type of the returned tensor.
-shape: Shape of the returned tensor.
-memory_region_name: Name of readonly memory region used by the tensor, see
-  NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
-)doc");
+REGISTER_OP("GuaranteeConst")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      return UnchangedShape(c);
+    })
+    // We don't want this to be optimized away.
+    .SetIsStateful();
 
 // --------------------------------------------------------------------------
 REGISTER_OP("ZerosLike")
     .Input("x: T")
     .Output("y: T")
     .Attr("T: type")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Returns a tensor of zeros with the same shape and type as x.
-
-x: a tensor of type T.
-y: a tensor of the same shape and type as x but filled with zeros.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("OnesLike")
     .Input("x: T")
     .Output("y: T")
     .Attr(
-        "T: {float, double, int8, uint8, int16, uint16, int32, int64, "
-        "complex64, complex128, bool}")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Returns a tensor of ones with the same shape and type as x.
-
-x: a tensor of type T.
-y: a tensor of the same shape and type as x but filled with ones.
-)doc");
+        "T: {bfloat16, float, double, int8, uint8, int16, uint16, int32, "
+        "int64, complex64, complex128, bool}")
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("Diag")
     .Input("diagonal: T")
     .Output("output: T")
-    .Attr("T: {float, double, int32, int64, complex64, complex128}")
+    .Attr("T: {bfloat16, float, double, int32, int64, complex64, complex128}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle in = c->input(0);
       TF_RETURN_IF_ERROR(c->WithRankAtLeast(in, 1, &in));
@@ -747,36 +596,13 @@ REGISTER_OP("Diag")
       TF_RETURN_IF_ERROR(c->Concatenate(in, in, &out));
       c->set_output(0, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Returns a diagonal tensor with a given diagonal values.
-
-Given a `diagonal`, this operation returns a tensor with the `diagonal` and
-everything else padded with zeros. The diagonal is computed as follows:
-
-Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
-rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
-
-`output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
-
-For example:
-
-```
-# 'diagonal' is [1, 2, 3, 4]
-tf.diag(diagonal) ==> [[1, 0, 0, 0]
-                       [0, 2, 0, 0]
-                       [0, 0, 3, 0]
-                       [0, 0, 0, 4]]
-```
-
-diagonal: Rank k tensor where k is at most 1.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("DiagPart")
     .Input("input: T")
     .Output("diagonal: T")
-    .Attr("T: {float, double, int32, int64, complex64, complex128}")
+    .Attr("T: {bfloat16, float, double, int32, int64, complex64, complex128}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle in = c->input(0);
       if (!c->RankKnown(in)) {
@@ -799,33 +625,7 @@ REGISTER_OP("DiagPart")
       }
       c->set_output(0, c->MakeShape(dims));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Returns the diagonal part of the tensor.
-
-This operation returns a tensor with the `diagonal` part
-of the `input`. The `diagonal` part is computed as follows:
-
-Assume `input` has dimensions `[D1,..., Dk, D1,..., Dk]`, then the output is a
-tensor of rank `k` with dimensions `[D1,..., Dk]` where:
-
-`diagonal[i1,..., ik] = input[i1, ..., ik, i1,..., ik]`.
-
-For example:
-
-```
-# 'input' is [[1, 0, 0, 0]
-              [0, 2, 0, 0]
-              [0, 0, 3, 0]
-              [0, 0, 0, 4]]
-
-tf.diag_part(input) ==> [1, 2, 3, 4]
-```
-
-input: Rank k tensor where k is even and not zero.
-diagonal: The extracted diagonal.
-
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("MatrixDiag")
@@ -845,40 +645,7 @@ REGISTER_OP("MatrixDiag")
           c->Concatenate(in, c->Vector(c->Dim(in, rank - 1)), &out));
       c->set_output(0, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Returns a batched diagonal tensor with a given batched diagonal values.
-
-Given a `diagonal`, this operation returns a tensor with the `diagonal` and
-everything else padded with zeros. The diagonal is computed as follows:
-
-Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a
-tensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:
-
-`output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.
-
-For example:
-
-```
-# 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]]
-
-and diagonal.shape = (2, 4)
-
-tf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0]
-                                     [0, 2, 0, 0]
-                                     [0, 0, 3, 0]
-                                     [0, 0, 0, 4]],
-                                    [[5, 0, 0, 0]
-                                     [0, 6, 0, 0]
-                                     [0, 0, 7, 0]
-                                     [0, 0, 0, 8]]]
-
-which has shape (2, 4, 4)
-```
-
-diagonal: Rank `k`, where `k >= 1`.
-output: Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("MatrixSetDiag")
@@ -911,27 +678,7 @@ REGISTER_OP("MatrixSetDiag")
       }
       c->set_output(0, output);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Returns a batched matrix tensor with new batched diagonal values.
-
-Given `input` and `diagonal`, this operation returns a tensor with the
-same shape and values as `input`, except for the main diagonal of the
-innermost matrices.  These will be overwritten by the values in `diagonal`.
-
-The output is computed as follows:
-
-Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has
-`k` dimensions `[I, J, K, ..., min(M, N)]`.  Then the output is a
-tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where:
-
-  * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`.
-  * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`.
-
-input: Rank `k+1`, where `k >= 1`.
-diagonal: Rank `k`, where `k >= 1`.
-output: Rank `k+1`, with `output.shape = input.shape`.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("MatrixDiagPart")
@@ -956,102 +703,17 @@ REGISTER_OP("MatrixDiagPart")
       dims.push_back(min_dim);
       c->set_output(0, c->MakeShape(dims));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Returns the batched diagonal part of a batched tensor.
-
-This operation returns a tensor with the `diagonal` part
-of the batched `input`. The `diagonal` part is computed as follows:
-
-Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-tensor of rank `k - 1` with dimensions `[I, J, K, ..., min(M, N)]` where:
-
-`diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`.
-
-The input must be at least a matrix.
-
-For example:
-
-```
-# 'input' is [[[1, 0, 0, 0]
-               [0, 2, 0, 0]
-               [0, 0, 3, 0]
-               [0, 0, 0, 4]],
-              [[5, 0, 0, 0]
-               [0, 6, 0, 0]
-               [0, 0, 7, 0]
-               [0, 0, 0, 8]]]
-
-and input.shape = (2, 4, 4)
-
-tf.matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]]
-
-which has shape (2, 4)
-```
-
-input: Rank `k` tensor where `k >= 2`.
-diagonal: The extracted diagonal(s) having shape
-  `diagonal.shape = input.shape[:-2] + [min(input.shape[-2:])]`.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("MatrixBandPart")
     .Input("input: T")
-    .Input("num_lower: int64")
-    .Input("num_upper: int64")
+    .Input("num_lower: Tindex")
+    .Input("num_upper: Tindex")
     .Output("band: T")
     .Attr("T: type")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Copy a tensor setting everything outside a central band in each innermost matrix
-to zero.
-
-The `band` part is computed as follows:
-Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-tensor with the same shape where
-
-`band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
-
-The indicator function
-
-`in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
-                 (num_upper < 0 || (n-m) <= num_upper)`.
-
-For example:
-
-```
-# if 'input' is [[ 0,  1,  2, 3]
-                 [-1,  0,  1, 2]
-                 [-2, -1,  0, 1]
-                 [-3, -2, -1, 0]],
-
-tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
-                                       [-1,  0,  1, 2]
-                                       [ 0, -1,  0, 1]
-                                       [ 0,  0, -1, 0]],
-
-tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
-                                      [-1,  0,  1, 0]
-                                      [-2, -1,  0, 1]
-                                      [ 0, -2, -1, 0]]
-```
-
-Useful special cases:
-
-```
- tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
- tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
- tf.matrix_band_part(input, 0, 0) ==> Diagonal.
-```
-
-input: Rank `k` tensor.
-num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
-           lower triangle.
-num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
-           entire upper triangle.
-band: Rank `k` tensor of the same shape as input. The extracted banded tensor.
-
-)doc");
+    .Attr("Tindex: {int32, int64} = DT_INT64")
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("Reverse")
@@ -1059,9 +721,8 @@ REGISTER_OP("Reverse")
     .Input("dims: bool")
     .Output("output: T")
     .Attr(
-        "T: {uint8, int8, uint16, int16, int32, int64, bool, half, float, "
-        "double, complex64, "
-        "complex128, string}")
+        "T: {uint8, int8, uint16, int16, int32, int64, bool, half, "
+        "float, double, complex64, complex128, string}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
       ShapeHandle dims;
@@ -1076,59 +737,7 @@ REGISTER_OP("Reverse")
       }
       c->set_output(0, input);
       return Status::OK();
-    })
-    .Doc(R"Doc(
-Reverses specific dimensions of a tensor.
-
-Given a `tensor`, and a `bool` tensor `dims` representing the dimensions
-of `tensor`, this operation reverses each dimension i of `tensor` where
-`dims[i]` is `True`.
-
-`tensor` can have up to 8 dimensions. The number of dimensions
-of `tensor` must equal the number of elements in `dims`. In other words:
-
-`rank(tensor) = size(dims)`
-
-For example:
-
-```
-# tensor 't' is [[[[ 0,  1,  2,  3],
-#                  [ 4,  5,  6,  7],
-#                  [ 8,  9, 10, 11]],
-#                 [[12, 13, 14, 15],
-#                  [16, 17, 18, 19],
-#                  [20, 21, 22, 23]]]]
-# tensor 't' shape is [1, 2, 3, 4]
-
-# 'dims' is [False, False, False, True]
-reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
-                        [ 7,  6,  5,  4],
-                        [ 11, 10, 9, 8]],
-                       [[15, 14, 13, 12],
-                        [19, 18, 17, 16],
-                        [23, 22, 21, 20]]]]
-
-# 'dims' is [False, True, False, False]
-reverse(t, dims) ==> [[[[12, 13, 14, 15],
-                        [16, 17, 18, 19],
-                        [20, 21, 22, 23]
-                       [[ 0,  1,  2,  3],
-                        [ 4,  5,  6,  7],
-                        [ 8,  9, 10, 11]]]]
-
-# 'dims' is [False, False, True, False]
-reverse(t, dims) ==> [[[[8, 9, 10, 11],
-                        [4, 5, 6, 7],
-                        [0, 1, 2, 3]]
-                       [[20, 21, 22, 23],
-                        [16, 17, 18, 19],
-                        [12, 13, 14, 15]]]]
-```
-
-tensor: Up to 8-D.
-dims: 1-D. The dimensions to reverse.
-output: The same shape as `tensor`.
-)Doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("ReverseV2")
@@ -1137,9 +746,8 @@ REGISTER_OP("ReverseV2")
     .Output("output: T")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .Attr(
-        "T: {uint8, int8, uint16, int16, int32, int64, bool, half, float, "
-        "double, complex64, "
-        "complex128, string}")
+        "T: {uint8, int8, uint16, int16, int32, int64, bool, half, bfloat16, "
+        "float, double, complex64, complex128, string}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
       ShapeHandle axis;
@@ -1151,62 +759,7 @@ REGISTER_OP("ReverseV2")
       }
       c->set_output(0, input);
       return Status::OK();
-    })
-    .Doc(R"Doc(
-Reverses specific dimensions of a tensor.
-
-NOTE `tf.reverse` has now changed behavior in preparation for 1.0.
-`tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0.
-
-Given a `tensor`, and a `int32` tensor `axis` representing the set of
-dimensions of `tensor` to reverse. This operation reverses each dimension
-`i` for which there exists `j` s.t. `axis[j] == i`.
-
-`tensor` can have up to 8 dimensions. The number of dimensions specified
-in `axis` may be 0 or more entries. If an index is specified more than
-once, a InvalidArgument error is raised.
-
-For example:
-
-```
-# tensor 't' is [[[[ 0,  1,  2,  3],
-#                  [ 4,  5,  6,  7],
-#                  [ 8,  9, 10, 11]],
-#                 [[12, 13, 14, 15],
-#                  [16, 17, 18, 19],
-#                  [20, 21, 22, 23]]]]
-# tensor 't' shape is [1, 2, 3, 4]
-
-# 'dims' is [3] or 'dims' is [-1]
-reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
-                        [ 7,  6,  5,  4],
-                        [ 11, 10, 9, 8]],
-                       [[15, 14, 13, 12],
-                        [19, 18, 17, 16],
-                        [23, 22, 21, 20]]]]
-
-# 'dims' is '[1]' (or 'dims' is '[-3]')
-reverse(t, dims) ==> [[[[12, 13, 14, 15],
-                        [16, 17, 18, 19],
-                        [20, 21, 22, 23]
-                       [[ 0,  1,  2,  3],
-                        [ 4,  5,  6,  7],
-                        [ 8,  9, 10, 11]]]]
-
-# 'dims' is '[2]' (or 'dims' is '[-2]')
-reverse(t, dims) ==> [[[[8, 9, 10, 11],
-                        [4, 5, 6, 7],
-                        [0, 1, 2, 3]]
-                       [[20, 21, 22, 23],
-                        [16, 17, 18, 19],
-                        [12, 13, 14, 15]]]]
-```
-
-tensor: Up to 8-D.
-axis: 1-D. The indices of the dimensions to reverse. Must be in the range
-  `[-rank(tensor), rank(tensor))`.
-output: The same shape as `tensor`.
-)Doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("EditDistance")
@@ -1248,73 +801,21 @@ REGISTER_OP("EditDistance")
 
       c->set_output(0, c->MakeShape(dims));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the (possibly normalized) Levenshtein Edit Distance.
-
-The inputs are variable-length sequences provided by SparseTensors
-  (hypothesis_indices, hypothesis_values, hypothesis_shape)
-and
-  (truth_indices, truth_values, truth_shape).
-
-The inputs are:
-
-hypothesis_indices: The indices of the hypothesis list SparseTensor.
-  This is an N x R int64 matrix.
-hypothesis_values: The values of the hypothesis list SparseTensor.
-  This is an N-length vector.
-hypothesis_shape: The shape of the hypothesis list SparseTensor.
-  This is an R-length vector.
-truth_indices: The indices of the truth list SparseTensor.
-  This is an M x R int64 matrix.
-truth_values: The values of the truth list SparseTensor.
-  This is an M-length vector.
-truth_shape: The shape of the truth list SparseTensor.
-  This is an R-length vector.
-truth_shape: truth indices, vector.
-normalize: boolean (if true, edit distances are normalized by length of truth).
-
-The output is:
-
-output: A dense float tensor with rank R - 1.
-
-For the example input:
-
-    // hypothesis represents a 2x1 matrix with variable-length values:
-    //   (0,0) = ["a"]
-    //   (1,0) = ["b"]
-    hypothesis_indices = [[0, 0, 0],
-                          [1, 0, 0]]
-    hypothesis_values = ["a", "b"]
-    hypothesis_shape = [2, 1, 1]
-
-    // truth represents a 2x2 matrix with variable-length values:
-    //   (0,0) = []
-    //   (0,1) = ["a"]
-    //   (1,0) = ["b", "c"]
-    //   (1,1) = ["a"]
-    truth_indices = [[0, 1, 0],
-                     [1, 0, 0],
-                     [1, 0, 1],
-                     [1, 1, 0]]
-    truth_values = ["a", "b", "c", "a"]
-    truth_shape = [2, 2, 2]
-    normalize = true
-
-The output will be:
-
-    // output is a 2x2 matrix with edit distances normalized by truth lengths.
-    output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
-              [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("Fill")
-    .Input("dims: int32")
+    .Input("dims: index_type")
     .Input("value: T")
     .Output("output: T")
     .Attr("T: type")
+    .Attr("index_type: {int32, int64} = DT_INT32")
     .SetShapeFn([](InferenceContext* c) {
+      DataType index_type = DT_INT32;
+      Status s = c->GetAttr("index_type", &index_type);
+      if (!s.ok() && s.code() != error::NOT_FOUND) {
+        return s;
+      }
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
@@ -1322,7 +823,8 @@ REGISTER_OP("Fill")
       const Tensor* t = c->input_tensor(0);
       if (t != nullptr) {
         for (int i = 0; i < t->NumElements(); ++i) {
-          if (t->vec<int32>()(i) < 0) {
+          if ((index_type == DT_INT32 && t->vec<int32>()(i) < 0) ||
+              (index_type == DT_INT64 && t->vec<int64>()(i) < 0)) {
             return errors::InvalidArgument("Fill dimensions must be >= 0");
           }
         }
@@ -1332,27 +834,7 @@ REGISTER_OP("Fill")
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &out));
       c->set_output(0, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Creates a tensor filled with a scalar value.
-
-This operation creates a tensor of shape `dims` and fills it with `value`.
-
-For example:
-
-```
-# Output tensor has shape [2, 3].
-fill([2, 3], 9) ==> [[9, 9, 9]
-                     [9, 9, 9]]
-```
-
-dims: 1-D. Represents the shape of the output tensor.
-value: 0-D (scalar). Value to fill the returned tensor.
-
-@compatibility(numpy)
-Equivalent to np.full
-@end_compatibility
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("_ParallelConcatStart")
@@ -1414,36 +896,7 @@ REGISTER_OP("Gather")
       TF_RETURN_IF_ERROR(c->Concatenate(indices_shape, params_subshape, &out));
       c->set_output(0, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Gather slices from `params` according to `indices`.
-
-`indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
-
-```python
-    # Scalar indices
-    output[:, ..., :] = params[indices, :, ... :]
-
-    # Vector indices
-    output[i, :, ..., :] = params[indices[i], :, ... :]
-
-    # Higher rank indices
-    output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-```
-
-If `indices` is a permutation and `len(indices) == params.shape[0]` then
-this operation will permute `params` accordingly.
-
-`validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in
-`indices` are always validated to be within range. If assigned to GPU,
-out-of-bound indices result in safe but unspecified behavior, which may include
-raising an error.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
-</div>
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("GatherV2")
@@ -1509,40 +962,7 @@ REGISTER_OP("GatherV2")
 
       c->set_output(0, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Gather slices from `params` axis `axis` according to `indices`.
-
-`indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-Produces an output tensor with shape `params.shape[:axis] + indices.shape +
-params.shape[axis + 1:]` where:
-
-```python
-    # Scalar indices (output is rank(params) - 1).
-    output[a_0, ..., a_n, b_0, ..., b_n] =
-      params[a_0, ..., a_n, indices, b_0, ..., b_n]
-
-    # Vector indices (output is rank(params)).
-    output[a_0, ..., a_n, i, b_0, ..., b_n] =
-      params[a_0, ..., a_n, indices[i], b_0, ..., b_n]
-
-    # Higher rank indices (output is rank(params) + rank(indices) - 1).
-    output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =
-      params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]
-```
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
-</div>
-
-params: The tensor from which to gather values. Must be at least rank
-  `axis + 1`.
-indices: Index tensor. Must be in range `[0, params.shape[axis])`.
-axis: The axis in `params` to gather `indices` from. Defaults to the first
-  dimension. Supports negative indexes.
-output: Values from `params` gathered from indices given by `indices`, with
-  shape `params.shape[:axis] + indices.shape + params.shape[axis + 1:]`.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("GatherNd")
@@ -1565,8 +985,8 @@ REGISTER_OP("GatherNd")
       if (c->Value(r_dim) > c->Rank(params)) {
         return errors::InvalidArgument(
             "indices.shape[-1] must be <= params.rank, but saw indices shape: ",
-            c->DebugString(indices), " and params shape: ",
-            c->DebugString(params));
+            c->DebugString(indices),
+            " and params shape: ", c->DebugString(params));
       }
 
       // Remove r_dim from indices to get output.
@@ -1578,114 +998,7 @@ REGISTER_OP("GatherNd")
       TF_RETURN_IF_ERROR(c->Concatenate(indices_slice, params_slice, &out));
       c->set_output(0, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Gather slices from `params` into a Tensor with shape specified by `indices`.
-
-`indices` is an K-dimensional integer tensor, best thought of as a
-(K-1)-dimensional tensor of indices into `params`, where each element defines a
-slice of `params`:
-
-    output[i_0, ..., i_{K-2}] = params[indices[i0, ..., i_{K-2}]]
-
-Whereas in @{tf.gather} `indices` defines slices into the first
-dimension of `params`, in `tf.gather_nd`, `indices` defines slices into the
-first `N` dimensions of `params`, where `N = indices.shape[-1]`.
-
-The last dimension of `indices` can be at most the rank of
-`params`:
-
-    indices.shape[-1] <= params.rank
-
-The last dimension of `indices` corresponds to elements
-(if `indices.shape[-1] == params.rank`) or slices
-(if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`
-of `params`.  The output tensor has shape
-
-    indices.shape[:-1] + params.shape[indices.shape[-1]:]
-
-Some examples below.
-
-Simple indexing into a matrix:
-
-```python
-    indices = [[0, 0], [1, 1]]
-    params = [['a', 'b'], ['c', 'd']]
-    output = ['a', 'd']
-```
-
-Slice indexing into a matrix:
-
-```python
-    indices = [[1], [0]]
-    params = [['a', 'b'], ['c', 'd']]
-    output = [['c', 'd'], ['a', 'b']]
-```
-
-Indexing into a 3-tensor:
-
-```python
-    indices = [[1]]
-    params = [[['a0', 'b0'], ['c0', 'd0']],
-              [['a1', 'b1'], ['c1', 'd1']]]
-    output = [[['a1', 'b1'], ['c1', 'd1']]]
-
-
-    indices = [[0, 1], [1, 0]]
-    params = [[['a0', 'b0'], ['c0', 'd0']],
-              [['a1', 'b1'], ['c1', 'd1']]]
-    output = [['c0', 'd0'], ['a1', 'b1']]
-
-
-    indices = [[0, 0, 1], [1, 0, 1]]
-    params = [[['a0', 'b0'], ['c0', 'd0']],
-              [['a1', 'b1'], ['c1', 'd1']]]
-    output = ['b0', 'b1']
-```
-
-Batched indexing into a matrix:
-
-```python
-    indices = [[[0, 0]], [[0, 1]]]
-    params = [['a', 'b'], ['c', 'd']]
-    output = [['a'], ['b']]
-```
-
-Batched slice indexing into a matrix:
-
-```python
-    indices = [[[1]], [[0]]]
-    params = [['a', 'b'], ['c', 'd']]
-    output = [[['c', 'd']], [['a', 'b']]]
-```
-
-Batched indexing into a 3-tensor:
-
-```python
-    indices = [[[1]], [[0]]]
-    params = [[['a0', 'b0'], ['c0', 'd0']],
-              [['a1', 'b1'], ['c1', 'd1']]]
-    output = [[[['a1', 'b1'], ['c1', 'd1']]],
-              [[['a0', 'b0'], ['c0', 'd0']]]]
-
-    indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]]
-    params = [[['a0', 'b0'], ['c0', 'd0']],
-              [['a1', 'b1'], ['c1', 'd1']]]
-    output = [[['c0', 'd0'], ['a1', 'b1']],
-              [['a0', 'b0'], ['c1', 'd1']]]
-
-
-    indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]]
-    params = [[['a0', 'b0'], ['c0', 'd0']],
-              [['a1', 'b1'], ['c1', 'd1']]]
-    output = [['b0', 'b1'], ['d0', 'c1']]
-```
-
-params: The tensor from which to gather values.
-indices: Index tensor.
-output: Values from `params` gathered from indices given by `indices`, with
-  shape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("Identity")
@@ -1699,10 +1012,20 @@ REGISTER_OP("Identity")
         c->set_output_handle_shapes_and_types(0, *handle_data);
       }
       return Status::OK();
-    })
-    .Doc(R"Doc(
-Return a tensor with the same shape and contents as the input tensor or value.
-)Doc");
+    });
+
+REGISTER_OP("Snapshot")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->input(0));
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr) {
+        c->set_output_handle_shapes_and_types(0, *handle_data);
+      }
+      return Status::OK();
+    });
 
 #ifdef INTEL_MKL
 REGISTER_OP("_MklIdentity")
@@ -1732,25 +1055,7 @@ REGISTER_OP("IdentityN")
       TF_RETURN_IF_ERROR(c->input("input", &input));
       TF_RETURN_IF_ERROR(c->set_output("output", input));
       return Status::OK();
-    })
-    .Doc(R"Doc(
-Returns a list of tensors with the same shapes and contents as the input
-tensors.
-
-This op can be used to override the gradient for complicated functions. For
-example, suppose y = f(x) and we wish to apply a custom function g for backprop
-such that dx = g(dy). In Python,
-
-```python
-with tf.get_default_graph().gradient_override_map(
-    {'IdentityN': 'OverrideGradientWithG'}):
-  y, _ = identity_n([f(x), x])
-
-@tf.RegisterGradient('OverrideGradientWithG')
-def ApplyG(op, dy, _):
-  return [None, g(dy)]  # Do not backprop to f(x).
-```
-)Doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("RefIdentity")
@@ -1758,10 +1063,7 @@ REGISTER_OP("RefIdentity")
     .Output("output: Ref(T)")
     .Attr("T: type")
     .SetShapeFn(shape_inference::UnchangedShape)
-    .SetAllowsUninitializedInput()
-    .Doc(R"Doc(
-Return the same ref tensor as the input ref tensor.
-)Doc");
+    .SetAllowsUninitializedInput();
 
 // --------------------------------------------------------------------------
 REGISTER_OP("DebugGradientIdentity")
@@ -1769,82 +1071,36 @@ REGISTER_OP("DebugGradientIdentity")
     .Output("output: T")
     .Attr("T: type")
     .SetShapeFn(shape_inference::UnchangedShape)
-    .SetAllowsUninitializedInput()
-    .Doc(R"Doc(
-Identity op for gradient debugging.
+    .SetAllowsUninitializedInput();
 
-This op is hidden from public in Python. It is used by TensorFlow Debugger to
-register gradient tensors for gradient debugging.
-)Doc");
+REGISTER_OP("DebugGradientRefIdentity")
+    .Input("input: Ref(T)")
+    .Output("output: Ref(T)")
+    .Attr("T: type")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .SetAllowsUninitializedInput();
 
 // --------------------------------------------------------------------------
 REGISTER_OP("StopGradient")
     .Input("input: T")
     .Output("output: T")
     .Attr("T: type")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"Doc(
-Stops gradient computation.
-
-When executed in a graph, this op outputs its input tensor as-is.
-
-When building ops to compute gradients, this op prevents the contribution of
-its inputs to be taken into account.  Normally, the gradient generator adds ops
-to a graph to compute the derivatives of a specified 'loss' by recursively
-finding out inputs that contributed to its computation.  If you insert this op
-in the graph it inputs are masked from the gradient generator.  They are not
-taken into account for computing gradients.
-
-This is useful any time you want to compute a value with TensorFlow but need
-to pretend that the value was a constant. Some examples include:
-
-*  The *EM* algorithm where the *M-step* should not involve backpropagation
-   through the output of the *E-step*.
-*  Contrastive divergence training of Boltzmann machines where, when
-   differentiating the energy function, the training must not backpropagate
-   through the graph that generated the samples from the model.
-*  Adversarial training, where no backprop should happen through the adversarial
-   example generation process.
-)Doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("PreventGradient")
     .Input("input: T")
     .Output("output: T")
     .Attr("T: type")
     .Attr("message: string = ''")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"Doc(
-An identity op that triggers an error if a gradient is requested.
-
-When executed in a graph, this op outputs its input tensor as-is.
-
-When building ops to compute gradients, the TensorFlow gradient system
-will return an error when trying to lookup the gradient of this op,
-because no gradient must ever be registered for this function.  This
-op exists to prevent subtle bugs from silently returning unimplemented
-gradients in some corner cases.
-
-input: any tensor.
-output: the same input tensor.
-message: Will be printed in the error when anyone tries to differentiate
-this operation.
-)Doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("CheckNumerics")
     .Input("tensor: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("message: string")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Checks a tensor for NaN and Inf values.
-
-When run, reports an `InvalidArgument` error if `tensor` has any values
-that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is.
-
-message: Prefix of the error message.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("Reshape")
@@ -1853,69 +1109,9 @@ REGISTER_OP("Reshape")
     .Output("output: T")
     .Attr("T: type")
     .Attr("Tshape: {int32, int64} = DT_INT32")
-    .SetShapeFn([](InferenceContext* c) { return SetOutputShapeForReshape(c); })
-    .Doc(R"Doc(
-Reshapes a tensor.
-
-Given `tensor`, this operation returns a tensor that has the same values
-as `tensor` with shape `shape`.
-
-If one component of `shape` is the special value -1, the size of that dimension
-is computed so that the total size remains constant.  In particular, a `shape`
-of `[-1]` flattens into 1-D.  At most one component of `shape` can be -1.
-
-If `shape` is 1-D or higher, then the operation returns a tensor with shape
-`shape` filled with the values of `tensor`. In this case, the number of elements
-implied by `shape` must be the same as the number of elements in `tensor`.
-
-For example:
-
-```
-# tensor 't' is [1, 2, 3, 4, 5, 6, 7, 8, 9]
-# tensor 't' has shape [9]
-reshape(t, [3, 3]) ==> [[1, 2, 3],
-                        [4, 5, 6],
-                        [7, 8, 9]]
-
-# tensor 't' is [[[1, 1], [2, 2]],
-#                [[3, 3], [4, 4]]]
-# tensor 't' has shape [2, 2, 2]
-reshape(t, [2, 4]) ==> [[1, 1, 2, 2],
-                        [3, 3, 4, 4]]
-
-# tensor 't' is [[[1, 1, 1],
-#                 [2, 2, 2]],
-#                [[3, 3, 3],
-#                 [4, 4, 4]],
-#                [[5, 5, 5],
-#                 [6, 6, 6]]]
-# tensor 't' has shape [3, 2, 3]
-# pass '[-1]' to flatten 't'
-reshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]
-
-# -1 can also be used to infer the shape
-
-# -1 is inferred to be 9:
-reshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
-                         [4, 4, 4, 5, 5, 5, 6, 6, 6]]
-# -1 is inferred to be 2:
-reshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
-                         [4, 4, 4, 5, 5, 5, 6, 6, 6]]
-# -1 is inferred to be 3:
-reshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1],
-                              [2, 2, 2],
-                              [3, 3, 3]],
-                             [[4, 4, 4],
-                              [5, 5, 5],
-                              [6, 6, 6]]]
-
-# tensor 't' is [7]
-# shape `[]` reshapes to a scalar
-reshape(t, []) ==> 7
-```
-
-shape: Defines the shape of the output tensor.
-)Doc");
+    .SetShapeFn([](InferenceContext* c) {
+      return SetOutputShapeForReshape(c);
+    });
 
 #ifdef INTEL_MKL
 REGISTER_OP("_MklReshape")
@@ -1942,29 +1138,7 @@ REGISTER_OP("InvertPermutation")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &x));
       c->set_output(0, x);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the inverse permutation of a tensor.
-
-This operation computes the inverse of an index permutation. It takes a 1-D
-integer tensor `x`, which represents the indices of a zero-based array, and
-swaps each value with its index position. In other words, for an output tensor
-`y` and an input tensor `x`, this operation computes the following:
-
-`y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
-
-The values must include 0. There can be no duplicate values or negative values.
-
-For example:
-
-```
-# tensor `x` is [3, 4, 0, 2, 1]
-invert_permutation(x) ==> [2, 4, 3, 0, 1]
-```
-
-x: 1-D.
-y: 1-D.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("Transpose")
@@ -1973,13 +1147,7 @@ REGISTER_OP("Transpose")
     .Output("y: T")
     .Attr("T: type")
     .Attr("Tperm: {int32, int64} = DT_INT32")
-    .SetShapeFn(TransposeShapeFn)
-    .Doc(R"doc(
-Shuffle dimensions of x according to a permutation.
-
-The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-  `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-)doc");
+    .SetShapeFn(TransposeShapeFn);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("ConjugateTranspose")
@@ -1988,14 +1156,7 @@ REGISTER_OP("ConjugateTranspose")
     .Output("y: T")
     .Attr("T: type")
     .Attr("Tperm: {int32, int64} = DT_INT32")
-    .SetShapeFn(TransposeShapeFn)
-    .Doc(R"doc(
-Shuffle dimensions of x according to a permutation and conjugate the result.
-
-The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-  `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-  `y[i,j,k,...,s,t,u] == conj(x[perm[i], perm[j], perm[k],...,perm[s], perm[t], perm[u]])`
-)doc");
+    .SetShapeFn(TransposeShapeFn);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("Unique")
@@ -2008,70 +1169,21 @@ REGISTER_OP("Unique")
       c->set_output(0, c->Vector(InferenceContext::kUnknownDim));
       c->set_output(1, c->input(0));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Finds unique elements in a 1-D tensor.
-
-This operation returns a tensor `y` containing all of the unique elements of `x`
-sorted in the same order that they occur in `x`. This operation also returns a
-tensor `idx` the same size as `x` that contains the index of each value of `x`
-in the unique output `y`. In other words:
-
-`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
-
-For example:
-
-```
-# tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-y, idx = unique(x)
-y ==> [1, 2, 4, 7, 8]
-idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-```
-
-x: 1-D.
-y: 1-D.
-idx: 1-D.
-)doc");
+    });
 
 REGISTER_OP("UniqueV2")
     .Input("x: T")
-    .Input("axis: int64")
+    .Input("axis: Taxis")
     .Output("y: T")
     .Output("idx: out_idx")
     .Attr("T: type")
+    .Attr("Taxis: {int32,int64} = DT_INT64")
     .Attr("out_idx: {int32, int64} = DT_INT32")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->Vector(InferenceContext::kUnknownDim));
       c->set_output(1, c->input(0));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Finds unique elements in a 1-D tensor.
-
-This operation returns a tensor `y` containing all of the unique elements of `x`
-sorted in the same order that they occur in `x`. This operation also returns a
-tensor `idx` the same size as `x` that contains the index of each value of `x`
-in the unique output `y`. In other words:
-
-`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
-
-For example:
-
-```
-# tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-y, idx = unique(x)
-y ==> [1, 2, 4, 7, 8]
-idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-```
-
-
-x: A `Tensor`.
-axis: A `Tensor` of type `int64` (default: 0). The axis of the Tensor to
-  find the unique elements.
-y: A `Tensor`. Unique elements along the `axis` of `Tensor` x.
-idx: A 1-D Tensor. Has the same type as x that contains the index of each
-  value of x in the output y.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("UniqueWithCounts")
@@ -2087,33 +1199,7 @@ REGISTER_OP("UniqueWithCounts")
       c->set_output(1, c->input(0));
       c->set_output(2, uniq);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Finds unique elements in a 1-D tensor.
-
-This operation returns a tensor `y` containing all of the unique elements of `x`
-sorted in the same order that they occur in `x`. This operation also returns a
-tensor `idx` the same size as `x` that contains the index of each value of `x`
-in the unique output `y`. Finally, it returns a third tensor `count` that
-contains the count of each element of `y` in `x`. In other words:
-
-`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
-
-For example:
-
-```
-# tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-y, idx, count = unique_with_counts(x)
-y ==> [1, 2, 4, 7, 8]
-idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-count ==> [2, 1, 3, 1, 2]
-```
-
-x: 1-D.
-y: 1-D.
-idx: 1-D.
-count: 1-D.
-)doc");
+    });
 
 namespace {
 
@@ -2138,20 +1224,7 @@ REGISTER_OP("Shape")
     .Output("output: out_type")
     .Attr("T: type")
     .Attr("out_type: {int32, int64} = DT_INT32")
-    .SetShapeFn(ShapeShapeFn)
-    .Doc(R"doc(
-Returns the shape of a tensor.
-
-This operation returns a 1-D integer tensor representing the shape of `input`.
-
-For example:
-
-```
-# 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-shape(t) ==> [2, 2, 3]
-```
-
-)doc");
+    .SetShapeFn(ShapeShapeFn);
 
 REGISTER_OP("ShapeN")
     .Input("input: N * T")
@@ -2159,12 +1232,7 @@ REGISTER_OP("ShapeN")
     .Attr("N: int")
     .Attr("T: type")
     .Attr("out_type: {int32, int64} = DT_INT32")
-    .SetShapeFn(ShapeShapeFn)
-    .Doc(R"doc(
-Returns shape of tensors.
-
-This operation returns N 1-D integer tensors representing shape of `input[i]s`.
-)doc");
+    .SetShapeFn(ShapeShapeFn);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("ReverseSequence")
@@ -2192,12 +1260,12 @@ REGISTER_OP("ReverseSequence")
       // Validate batch_dim and seq_dim against input.
       const int32 input_rank = c->Rank(input);
       if (batch_dim >= input_rank) {
-        return errors::InvalidArgument("batch_dim must be < input rank: ",
-                                       batch_dim, " vs. ", input_rank);
+        return errors::InvalidArgument(
+            "batch_dim must be < input rank: ", batch_dim, " vs. ", input_rank);
       }
       if (seq_dim >= input_rank) {
-        return errors::InvalidArgument("seq_dim must be < input rank: ",
-                                       seq_dim, " vs. ", input_rank);
+        return errors::InvalidArgument(
+            "seq_dim must be < input rank: ", seq_dim, " vs. ", input_rank);
       }
 
       DimensionHandle batch_dim_dim = c->Dim(input, batch_dim);
@@ -2210,96 +1278,14 @@ REGISTER_OP("ReverseSequence")
           c->ReplaceDim(input, batch_dim, batch_dim_dim, &output_shape));
       c->set_output(0, output_shape);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Reverses variable length slices.
-
-This op first slices `input` along the dimension `batch_dim`, and for each
-slice `i`, reverses the first `seq_lengths[i]` elements along
-the dimension `seq_dim`.
-
-The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
-and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
-
-The output slice `i` along dimension `batch_dim` is then given by input
-slice `i`, with the first `seq_lengths[i]` slices along dimension
-`seq_dim` reversed.
-
-For example:
-
-```
-# Given this:
-batch_dim = 0
-seq_dim = 1
-input.dims = (4, 8, ...)
-seq_lengths = [7, 2, 3, 5]
-
-# then slices of input are reversed on seq_dim, but only up to seq_lengths:
-output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]
-output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]
-output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]
-output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]
-
-# while entries past seq_lens are copied through:
-output[0, 7:, :, ...] = input[0, 7:, :, ...]
-output[1, 2:, :, ...] = input[1, 2:, :, ...]
-output[2, 3:, :, ...] = input[2, 3:, :, ...]
-output[3, 2:, :, ...] = input[3, 2:, :, ...]
-```
-
-In contrast, if:
-
-```
-# Given this:
-batch_dim = 2
-seq_dim = 0
-input.dims = (8, ?, 4, ...)
-seq_lengths = [7, 2, 3, 5]
-
-# then slices of input are reversed on seq_dim, but only up to seq_lengths:
-output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]
-output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]
-output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]
-output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]
-
-# while entries past seq_lens are copied through:
-output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]
-output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]
-output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]
-output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
-```
-
-input: The input to reverse.
-seq_lengths: 1-D with length `input.dims(batch_dim)` and
-  `max(seq_lengths) <= input.dims(seq_dim)`
-seq_dim: The dimension which is partially reversed.
-batch_dim: The dimension along which reversal is performed.
-output: The partially reversed input. It has the same shape as `input`.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("Rank")
     .Input("input: T")
     .Output("output: int32")
     .Attr("T: type")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Returns the rank of a tensor.
-
-This operation returns an integer representing the rank of `input`.
-
-For example:
-
-```
-# 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-# shape of tensor 't' is [2, 2, 3]
-rank(t) ==> 3
-```
-
-**Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
-of a tensor is the number of indices required to uniquely select each element
-of the tensor. Rank is also known as "order", "degree", or "ndims."
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("Size")
@@ -2307,21 +1293,7 @@ REGISTER_OP("Size")
     .Output("output: out_type")
     .Attr("T: type")
     .Attr("out_type: {int32, int64} = DT_INT32")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Returns the size of a tensor.
-
-This operation returns an integer representing the number of elements in
-`input`.
-
-For example:
-
-```
-# 't' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]
-size(t) ==> 12
-```
-
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 namespace {
 
@@ -2438,24 +1410,7 @@ REGISTER_OP("Slice")
       }
 
       return Status::OK();
-    })
-    .Doc(R"doc(
-Return a slice from 'input'.
-
-The output tensor is a tensor with dimensions described by 'size'
-whose values are extracted from 'input' starting at the offsets in
-'begin'.
-
-*Requirements*:
-  0 <= begin[i] <= begin[i] + size[i] <= Di  for i in [0, n)
-
-begin: begin[i] specifies the offset into the 'i'th dimension of
-  'input' to slice from.
-size: size[i] specifies the number of elements of the 'i'th dimension
-  of 'input' to slice. If size[i] is -1, all remaining elements in dimension
-  i are included in the slice (i.e. this is equivalent to setting
-  size[i] = input.dim_size(i) - begin[i]).
-)doc");
+    });
 
 REGISTER_OP("StridedSlice")
     .Input("input: T")
@@ -2520,133 +1475,7 @@ REGISTER_OP("StridedSlice")
       c->set_output(0, out);
 
       return Status::OK();
-    })
-    .Doc(R"doc(
-Return a strided slice from `input`.
-
-Note, most python users will want to use the Python `Tensor.__getitem__`
-or `Variable.__getitem__` rather than this op directly.
-
-The goal of this op is to produce a new tensor with a subset of
-the elements from the `n` dimensional `input` tensor. The subset is chosen using
-a sequence of `m` sparse range specifications encoded into the arguments
-of this function. Note, in some cases
-`m` could be equal to `n`, but this need not be the case. Each
-range specification entry can be one of the following:
-
-- An ellipsis (...). Ellipses are used to imply zero or more
-  dimensions of full-dimension selection and are produced using
-  `ellipsis_mask`. For example, `foo[...]` is the identity slice.
-
-- A new axis. This is used to insert a new shape=1 dimension and is
-  produced using `new_axis_mask`. For example, `foo[:, ...]` where
-  `foo` is shape `(3, 4)` produces a `(1, 3, 4)` tensor.
-
-
-- A range `begin:end:stride`. This is used to specify how much to choose from
-  a given dimension. `stride` can be any integer but 0.  `begin` is an integer
-  which represents the index of the first value to select while `end` represents
-  the index of the last value to select. The number of values selected in each
-  dimension is `end - begin` if `stride > 0` and `begin - end` if `stride < 0`.
-  `begin` and `end` can be negative where `-1` is the last element, `-2` is
-  the second to last. `begin_mask` controls whether to replace the explicitly
-  given `begin` with an implicit effective value of `0` if `stride > 0` and
-  `-1` if `stride < 0`. `end_mask` is analogous but produces the number
-  required to create the largest open interval. For example, given a shape
-  `(3,)` tensor `foo[:]`, the effective `begin` and `end` are `0` and `3`. Do
-  not assume this is equivalent to `foo[0:-1]` which has an effective `begin`
-  and `end` of `0` and `2`. Another example is `foo[-2::-1]` which reverses the
-  first dimension of a tensor while dropping the last two (in the original
-  order elements). For example `foo = [1,2,3,4]; foo[-2::-1]` is `[4,3]`.
-
-- A single index. This is used to keep only elements that have a given
-  index. For example (`foo[2, :]` on a shape `(5,6)` tensor produces a
-  shape `(6,)` tensor. This is encoded in `begin` and `end` and
-  `shrink_axis_mask`.
-
-Each conceptual range specification is encoded in the op's argument. This
-encoding is best understand by considering a non-trivial example. In
-particular,
-`foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as
-
-```
-begin = [1, 2, x, x, 0, x] # x denotes don't care (usually 0)
-end = [2, 4, x, x, -3, x]
-strides = [1, 1, x, x, -1, 1]
-begin_mask = 1<<4 | 1 << 5 = 48
-end_mask = 1<<5 = 32
-ellipsis_mask = 1<<3 = 8
-new_axis_mask = 1<<2 4
-shrink_axis_mask = 1<<0
-```
-
-In this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of
-the slice becomes (2, 1, 5, 5, 2, 5).
-Let us walk step by step through each argument specification.
-
-1.  The first argument in the example slice is turned into `begin = 1` and
-`end = begin + 1 = 2`. To disambiguate from the original spec `2:4` we
-also set the appropriate bit in `shrink_axis_mask`.
-
-2. `2:4` is contributes 2, 4, 1 to begin, end, and stride. All masks have
-zero bits contributed.
-
-3. None is a synonym for `tf.newaxis`. This means insert a dimension of size 1
-dimension in the final shape. Dummy values are contributed to begin,
-end and stride, while the new_axis_mask bit is set.
-
-4. `...` grab the full ranges from as many dimensions as needed to
-fully specify a slice for every dimension of the input shape.
-
-5. `:-3:-1` shows the use of negative indices. A negative index `i` associated
-with a dimension that has shape `s` is converted to a positive index
-`s + i`. So `-1` becomes `s-1` (i.e. the last element). This conversion
-is done internally so begin, end and strides receive x, -3, and -1.
-The appropriate begin_mask bit is set to indicate the start range is the
-full range (ignoring the x).
-
-6. `:` indicates that the entire contents of the corresponding dimension
-is selected. This is equivalent to `::` or `0::1`. begin, end, and strides
-receive 0, 0, and 1, respectively. The appropriate bits in `begin_mask` and
-`end_mask` are also set.
-
-*Requirements*:
-  `0 != strides[i] for i in [0, m)`
-  `ellipsis_mask must be a power of two (only one ellipsis)`
-
-begin: `begin[k]` specifies the offset into the `k`th range specification.
-  The exact dimension this corresponds to will be determined by context.
-  Out-of-bounds values will be silently clamped. If the `k`th bit of
-  `begin_mask` then `begin[k]` is ignored and the full range of the
-  appropriate dimension is used instead. Negative values causes indexing
-  to start from the highest element e.g. If `foo==[1,2,3]` then `foo[-1]==3`.
-end: `end[i]` is like `begin` with the exception that `end_mask` is
-  used to determine full ranges.
-strides: `strides[i]` specifies the increment in the `i`th specification
-  after extracting a given element. Negative indices will reverse
-  the original order. Out or range values are
-  clamped to `[0,dim[i]) if slice[i]>0` or `[-1,dim[i]-1] if slice[i] < 0`
-begin_mask: a bitmask where a bit i being 1 means to ignore the begin
-  value and instead use the largest interval possible. At runtime
-  begin[i] will be replaced with `[0, n-1) if `stride[i] > 0` or
-  `[-1, n-1]` if `stride[i] < 0`
-end_mask: analogous to `begin_mask`
-ellipsis_mask: a bitmask where bit `i` being 1 means the `i`th
-  position is actually an ellipsis. One bit at most can be 1.
-  If `ellipsis_mask == 0`, then an implicit ellipsis mask of `1 << (m+1)`
-  is provided. This means that `foo[3:5] == foo[3:5, ...]`. An ellipsis
-  implicitly creates as many range specifications as necessary to fully
-  specify the sliced range for every dimension. For example for a 4-dimensional
-  tensor `foo` the slice `foo[2, ..., 5:8]` implies `foo[2, :, :, 5:8]`.
-new_axis_mask: a bitmask where bit `i` being 1 means the `i`th
-  specification creates a new shape 1 dimension. For example
-  `foo[:4, tf.newaxis, :2]` would produce a shape `(4, 1, 2)` tensor.
-shrink_axis_mask: a bitmask where bit `i` implies that the `i`th
-  specification should shrink the dimensionality. begin and end
-  must imply a slice of size 1 in the dimension. For example in
-  python one might do `foo[:, 3, :]` which would result in
-  `shrink_axis_mask` being 2.
-)doc");
+    });
 
 REGISTER_OP("StridedSliceGrad")
     .Input("shape: Index")
@@ -2667,19 +1496,7 @@ REGISTER_OP("StridedSliceGrad")
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &out));
       c->set_output(0, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Returns the gradient of `StridedSlice`.
-
-Since `StridedSlice` cuts out pieces of its `input` which is size
-`shape`, its gradient will have the same shape (which is passed here
-as `shape`). The gradient will be zero in any element that the slice
-does not select.
-
-Arguments are the same as StridedSliceGrad with the exception that
-`dy` is the input gradient to be propagated and `shape` is the
-shape of `StridedSlice`'s `input`.
-)doc");
+    });
 
 REGISTER_OP("StridedSliceAssign")
     .Input("ref: Ref(T)")
@@ -2695,18 +1512,7 @@ REGISTER_OP("StridedSliceAssign")
     .Attr("ellipsis_mask: int = 0")
     .Attr("new_axis_mask: int = 0")
     .Attr("shrink_axis_mask: int = 0")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Assign `value` to the sliced l-value reference of `ref`.
-
-The values of `value` are assigned to the positions in the variable
-`ref` that are selected by the slice parameters. The slice parameters
-`begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
-
-NOTE this op currently does not support broadcasting and so `value`'s
-shape must be exactly the shape produced by the slice of `ref`.
-
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 // TODO(aselle): Fix this documentation once StridedSliceAssign Supports
 // broadcasting.
 // --------------------------------------------------------------------------
@@ -2724,18 +1530,7 @@ REGISTER_OP("ResourceStridedSliceAssign")
     .Attr("ellipsis_mask: int = 0")
     .Attr("new_axis_mask: int = 0")
     .Attr("shrink_axis_mask: int = 0")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Assign `value` to the sliced l-value reference of `ref`.
-
-The values of `value` are assigned to the positions in the variable
-`ref` that are selected by the slice parameters. The slice parameters
-`begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
-
-NOTE this op currently does not support broadcasting and so `value`'s
-shape must be exactly the shape produced by the slice of `ref`.
-
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("Tile")
     .Input("input: T")
@@ -2767,19 +1562,7 @@ REGISTER_OP("Tile")
       }
       c->set_output(0, c->MakeShape(dims));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Constructs a tensor by tiling a given tensor.
-
-This operation creates a new tensor by replicating `input` `multiples` times.
-The output tensor's i'th dimension has `input.dims(i) * multiples[i]` elements,
-and the values of `input` are replicated `multiples[i]` times along the 'i'th
-dimension. For example, tiling `[a b c d]` by `[2]` produces
-`[a b c d a b c d]`.
-
-input: 1-D or higher.
-multiples: 1-D. Length must be the same as the number of dimensions in `input`
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("TileGrad")
@@ -2788,14 +1571,7 @@ REGISTER_OP("TileGrad")
     .Output("output: T")
     .Attr("T: type")
     .Deprecated(3, "TileGrad has been replaced with reduce_sum")
-    .SetShapeFn(tensorflow::shape_inference::UnknownShape)
-    .Doc(R"doc(
-Returns the gradient of `Tile`.
-
-Since `Tile` takes an input and repeats the input `multiples` times
-along each dimension, `TileGrad` takes in `multiples` and aggregates
-each repeated tile of `input` into `output`.
-)doc");
+    .SetShapeFn(tensorflow::shape_inference::UnknownShape);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("Where")
@@ -2805,71 +1581,7 @@ REGISTER_OP("Where")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->Matrix(c->UnknownDim(), c->Rank(c->input(0))));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Returns locations of nonzero / true values in a tensor.
-
-This operation returns the coordinates of true elements in `input`. The
-coordinates are returned in a 2-D tensor where the first dimension (rows)
-represents the number of true elements, and the second dimension (columns)
-represents the coordinates of the true elements. Keep in mind, the shape of
-the output tensor can vary depending on how many true values there are in
-`input`. Indices are output in row-major order.
-
-For example:
-
-```
-# 'input' tensor is [[True, False]
-#                    [True, False]]
-# 'input' has two true values, so output has two coordinates.
-# 'input' has rank of 2, so coordinates have two indices.
-where(input) ==> [[0, 0],
-                  [1, 0]]
-
-# `input` tensor is [[[True, False]
-#                     [True, False]]
-#                    [[False, True]
-#                     [False, True]]
-#                    [[False, False]
-#                     [False, True]]]
-# 'input' has 5 true values, so output has 5 coordinates.
-# 'input' has rank of 3, so coordinates have three indices.
-where(input) ==> [[0, 0, 0],
-                  [0, 1, 0],
-                  [1, 0, 1],
-                  [1, 1, 1],
-                  [2, 1, 1]]
-
-# `input` tensor is [[[1.5,  0.0]
-#                     [-0.5, 0.0]]
-#                    [[0.0,  0.25]
-#                     [0.0,  0.75]]
-#                    [[0.0,  0.0]
-#                     [0.0,  0.01]]]
-# 'input' has 5 nonzero values, so output has 5 coordinates.
-# 'input' has rank of 3, so coordinates have three indices.
-where(input) ==> [[0, 0, 0],
-                  [0, 1, 0],
-                  [1, 0, 1],
-                  [1, 1, 1],
-                  [2, 1, 1]]
-
-# `input` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
-#                     [0.0 + 0.5j, 0.0  + 0.0j]]
-#                    [[0.0 + 0.0j, 0.25 + 1.5j]
-#                     [0.0 + 0.0j, 0.75 + 0.0j]]
-#                    [[0.0 + 0.0j, 0.0  + 0.0j]
-#                     [0.0 + 0.0j, 0.01 + 0.0j]]]
-# 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
-# 'input' has rank of 3, so coordinates have three indices.
-where(input) ==> [[0, 0, 0],
-                  [0, 1, 0],
-                  [1, 0, 1],
-                  [1, 1, 1],
-                  [2, 1, 1]]
-```
-
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("BroadcastArgs")
@@ -2896,13 +1608,7 @@ REGISTER_OP("BroadcastArgs")
       // Broadcasted shape is going to be as large as the largest dimension.
       c->set_output(0, c->Vector(std::max(x_dim, y_dim)));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Return the shape of s0 op s1 with broadcast.
-
-Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
-broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("BroadcastGradientArgs")
@@ -2919,12 +1625,7 @@ REGISTER_OP("BroadcastGradientArgs")
       c->set_output(0, c->Vector(InferenceContext::kUnknownDim));
       c->set_output(1, c->Vector(InferenceContext::kUnknownDim));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Return the reduction indices for computing gradients of s0 op s1 with broadcast.
-
-This is typically used by gradient computations for a broadcasting operation.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("Pad")
@@ -2933,34 +1634,7 @@ REGISTER_OP("Pad")
     .Output("output: T")
     .Attr("T: type")
     .Attr("Tpaddings: {int32, int64} = DT_INT32")
-    .SetShapeFn(PadShapeFn)
-    .Doc(R"doc(
-Pads a tensor with zeros.
-
-This operation pads a `input` with zeros according to the `paddings` you
-specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
-rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-how many zeros to add before the contents of `input` in that dimension, and
-`paddings[D, 1]` indicates how many zeros to add after the contents of `input`
-in that dimension.
-
-The padded size of each dimension D of the output is:
-
-`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
-
-For example:
-
-```
-# 't' is [[1, 1], [2, 2]]
-# 'paddings' is [[1, 1], [2, 2]]
-# rank of 't' is 2
-pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-                      [0, 0, 1, 1, 0, 0]
-                      [0, 0, 2, 2, 0, 0]
-                      [0, 0, 0, 0, 0, 0]]
-```
-
-)doc");
+    .SetShapeFn(PadShapeFn);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("PadV2")
@@ -2970,36 +1644,7 @@ REGISTER_OP("PadV2")
     .Output("output: T")
     .Attr("T: type")
     .Attr("Tpaddings: {int32, int64} = DT_INT32")
-    .SetShapeFn(PadShapeFn)
-    .Doc(R"doc(
-Pads a tensor.
-
-This operation pads `input` according to the `paddings` and `constant_values`
-you specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is
-the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-how many padding values to add before the contents of `input` in that dimension,
-and `paddings[D, 1]` indicates how many padding values to add after the contents
-of `input` in that dimension. `constant_values` is a scalar tensor of the same
-type as `input` that indicates the value to use for padding `input`.
-
-The padded size of each dimension D of the output is:
-
-`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
-
-For example:
-
-```
-# 't' is [[1, 1], [2, 2]]
-# 'paddings' is [[1, 1], [2, 2]]
-# 'constant_values' is 0
-# rank of 't' is 2
-pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-                      [0, 0, 1, 1, 0, 0]
-                      [0, 0, 2, 2, 0, 0]
-                      [0, 0, 0, 0, 0, 0]]
-```
-
-)doc");
+    .SetShapeFn(PadShapeFn);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("MirrorPad")
@@ -3009,46 +1654,7 @@ REGISTER_OP("MirrorPad")
     .Attr("T: type")
     .Attr("Tpaddings: {int32, int64} = DT_INT32")
     .Attr(GetMirrorPadModeAttrString())
-    .SetShapeFn(PadShapeFn)
-    .Doc(R"doc(
-Pads a tensor with mirrored values.
-
-This operation pads a `input` with mirrored values according to the `paddings`
-you specify. `paddings` is an integer tensor with shape `[n, 2]`, where n is
-the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-how many values to add before the contents of `input` in that dimension, and
-`paddings[D, 1]` indicates how many values to add after the contents of `input`
-in that dimension. Both `paddings[D, 0]` and `paddings[D, 1]` must be no greater
-than `input.dim_size(D)` (or `input.dim_size(D) - 1`) if `copy_border` is true
-(if false, respectively).
-
-The padded size of each dimension D of the output is:
-
-`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
-
-For example:
-
-```
-# 't' is [[1, 2, 3], [4, 5, 6]].
-# 'paddings' is [[1, 1]], [2, 2]].
-# 'mode' is SYMMETRIC.
-# rank of 't' is 2.
-pad(t, paddings) ==> [[2, 1, 1, 2, 3, 3, 2]
-                      [2, 1, 1, 2, 3, 3, 2]
-                      [5, 4, 4, 5, 6, 6, 5]
-                      [5, 4, 4, 5, 6, 6, 5]]
-```
-
-input: The input tensor to be padded.
-paddings: A two-column matrix specifying the padding sizes. The number of
-  rows must be the same as the rank of `input`.
-mode: Either `REFLECT` or `SYMMETRIC`. In reflect mode the padded regions
-  do not include the borders, while in symmetric mode the padded regions
-  do include the borders. For example, if `input` is `[1, 2, 3]` and `paddings`
-  is `[0, 2]`, then the output is `[1, 2, 3, 2, 1]` in reflect mode, and
-  it is `[1, 2, 3, 3, 2]` in symmetric mode.
-output: The padded tensor.
-)doc");
+    .SetShapeFn(PadShapeFn);
 
 // --------------------------------------------------------------------------
 namespace {
@@ -3110,35 +1716,7 @@ REGISTER_OP("MirrorPadGrad")
       } else {
         return MirrorPadKnown<int64>(c, input, paddings_t, input_rank);
       }
-    })
-    .Doc(R"doc(
-Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor.
-
-This operation folds the padded areas of `input` by `MirrorPad` according to the
-`paddings` you specify. `paddings` must be the same as `paddings` argument
-given to the corresponding `MirrorPad` op.
-
-The folded size of each dimension D of the output is:
-
-`input.dim_size(D) - paddings(D, 0) - paddings(D, 1)`
-
-For example:
-
-```
-# 't' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].
-# 'paddings' is [[0, 1]], [0, 1]].
-# 'mode' is SYMMETRIC.
-# rank of 't' is 2.
-pad(t, paddings) ==> [[ 1,  5]
-                      [11, 28]]
-```
-
-input: The input tensor to be folded.
-paddings: A two-column matrix specifying the padding sizes. The number of
-  rows must be the same as the rank of `input`.
-mode: The mode used in the `MirrorPad` op.
-output: The folded tensor.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("Placeholder")
@@ -3160,19 +1738,7 @@ REGISTER_OP("Placeholder")
       TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &out));
       c->set_output(0, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-A placeholder op for a value that will be fed into the computation.
-
-N.B. This operation will fail with an error if it is executed. It is
-intended as a way to represent a value that will always be fed, and to
-provide attrs that enable the fed value to be checked at runtime.
-
-output: A placeholder tensor that must be replaced using the feed mechanism.
-dtype: The type of elements in the tensor.
-shape: (Optional) The shape of the tensor. If the shape has 0 dimensions, the
-  shape is unconstrained.
-)doc");
+    });
 
 // Placeholder was modified in a backwards compatible way to do what
 // PlaceholderV2 did, so we have deprecated V2 (no one was really
@@ -3182,19 +1748,7 @@ REGISTER_OP("PlaceholderV2")
     .Attr("dtype: type")
     .Attr("shape: shape")
     .SetShapeFn(shape_inference::ExplicitShape)
-    .Deprecated(23, "Placeholder now behaves the same as PlaceholderV2.")
-    .Doc(R"doc(
-A placeholder op for a value that will be fed into the computation.
-
-N.B. This operation will fail with an error if it is executed. It is
-intended as a way to represent a value that will always be fed, and to
-provide attrs that enable the fed value to be checked at runtime.
-
-output: A placeholder tensor that must be replaced using the feed mechanism.
-dtype: The type of elements in the tensor.
-shape: The shape of the tensor. The shape can be any partially-specified
-   shape.  To be unconstrained, pass in a shape with unknown rank.
-)doc");
+    .Deprecated(23, "Placeholder now behaves the same as PlaceholderV2.");
 
 // --------------------------------------------------------------------------
 REGISTER_OP("PlaceholderWithDefault")
@@ -3215,15 +1769,7 @@ REGISTER_OP("PlaceholderWithDefault")
       TF_RETURN_IF_ERROR(c->Merge(input, out, &unused));
       c->set_output(0, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-A placeholder op that passes through `input` when its output is not fed.
-
-input: The default value to produce when `output` is not fed.
-output: A placeholder tensor that defaults to `input` if it is not fed.
-dtype: The type of elements in the tensor.
-shape: The (possibly partial) shape of the tensor.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("ExpandDims")
@@ -3263,57 +1809,17 @@ REGISTER_OP("ExpandDims")
         dim += rank + 1;
       }
 
-      ShapeHandle end;
-      TF_RETURN_IF_ERROR(c->Subshape(input, dim, &end));
-
-      // Build output as start + 1 + end.
-      ShapeHandle output;
-      TF_RETURN_IF_ERROR(c->Subshape(input, 0, dim, &output));
-      TF_RETURN_IF_ERROR(c->Concatenate(output, c->Vector(1), &output));
-      TF_RETURN_IF_ERROR(c->Concatenate(output, end, &output));
-      c->set_output(0, output);
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Inserts a dimension of 1 into a tensor's shape.
-
-Given a tensor `input`, this operation inserts a dimension of 1 at the
-dimension index `dim` of `input`'s shape. The dimension index `dim` starts at
-zero; if you specify a negative number for `dim` it is counted backward from
-the end.
-
-This operation is useful if you want to add a batch dimension to a single
-element. For example, if you have a single image of shape `[height, width,
-channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,
-which will make the shape `[1, height, width, channels]`.
-
-Other examples:
-
-```
-# 't' is a tensor of shape [2]
-shape(expand_dims(t, 0)) ==> [1, 2]
-shape(expand_dims(t, 1)) ==> [2, 1]
-shape(expand_dims(t, -1)) ==> [2, 1]
-
-# 't2' is a tensor of shape [2, 3, 5]
-shape(expand_dims(t2, 0)) ==> [1, 2, 3, 5]
-shape(expand_dims(t2, 2)) ==> [2, 3, 1, 5]
-shape(expand_dims(t2, 3)) ==> [2, 3, 5, 1]
-```
-
-This operation requires that:
-
-`-1-input.dims() <= dim <= input.dims()`
-
-This operation is related to `squeeze()`, which removes dimensions of
-size 1.
+      ShapeHandle end;
+      TF_RETURN_IF_ERROR(c->Subshape(input, dim, &end));
 
-dim: 0-D (scalar). Specifies the dimension index at which to
-  expand the shape of `input`. Must be in the range
-  `[-rank(input) - 1, rank(input)]`.
-output: Contains the same data as `input`, but its shape has an additional
-  dimension of size 1 added.
-)doc");
+      // Build output as start + 1 + end.
+      ShapeHandle output;
+      TF_RETURN_IF_ERROR(c->Subshape(input, 0, dim, &output));
+      TF_RETURN_IF_ERROR(c->Concatenate(output, c->Vector(1), &output));
+      TF_RETURN_IF_ERROR(c->Concatenate(output, end, &output));
+      c->set_output(0, output);
+      return Status::OK();
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("Squeeze")
@@ -3381,36 +1887,7 @@ REGISTER_OP("Squeeze")
 
       c->set_output(0, c->MakeShape(result_shape));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Removes dimensions of size 1 from the shape of a tensor.
-
-Given a tensor `input`, this operation returns a tensor of the same type with
-all dimensions of size 1 removed. If you don't want to remove all size 1
-dimensions, you can remove specific size 1 dimensions by specifying
-`squeeze_dims`.
-
-For example:
-
-```
-# 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-shape(squeeze(t)) ==> [2, 3]
-```
-
-Or, to remove specific size 1 dimensions:
-
-```
-# 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
-```
-
-input: The `input` to squeeze.
-squeeze_dims: If specified, only squeezes the dimensions listed. The dimension
-  index starts at 0. It is an error to squeeze a dimension that is not 1. Must
-  be in the range `[-rank(input), rank(input))`.
-output: Contains the same data as `input`, but has one or more dimensions of
-  size 1 removed.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("ListDiff")
@@ -3429,37 +1906,7 @@ REGISTER_OP("ListDiff")
       c->set_output(0, out);
       c->set_output(1, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the difference between two lists of numbers or strings.
-
-Given a list `x` and a list `y`, this operation returns a list `out` that
-represents all values that are in `x` but not in `y`. The returned list `out`
-is sorted in the same order that the numbers appear in `x` (duplicates are
-preserved). This operation also returns a list `idx` that represents the
-position of each `out` element in `x`. In other words:
-
-`out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`
-
-For example, given this input:
-
-```
-x = [1, 2, 3, 4, 5, 6]
-y = [1, 3, 5]
-```
-
-This operation would return:
-
-```
-out ==> [2, 4, 6]
-idx ==> [1, 3, 5]
-```
-
-x: 1-D. Values to keep.
-y: 1-D. Values to remove.
-out: 1-D. Values present in `x` but not in `y`.
-idx: 1-D. Positions of `x` values preserved in `out`.
-)doc");
+    });
 
 namespace {
 
@@ -3647,133 +2094,7 @@ REGISTER_OP("SpaceToBatchND")
       return SpaceToBatchShapeHelper(c, c->input(0), c->input(1),
                                      c->input_tensor(1), c->input(2),
                                      c->input_tensor(2));
-    })
-    .Doc(R"doc(
-SpaceToBatch for N-D tensors of type T.
-
-This operation divides "spatial" dimensions `[1, ..., M]` of the input into a
-grid of blocks of shape `block_shape`, and interleaves these blocks with the
-"batch" dimension (0) such that in the output, the spatial dimensions
-`[1, ..., M]` correspond to the position within the grid, and the batch
-dimension combines both the position within a spatial block and the original
-batch position.  Prior to division into blocks, the spatial dimensions of the
-input are optionally zero padded according to `paddings`.  See below for a
-precise description.
-
-input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
-  where spatial_shape has `M` dimensions.
-
-block_shape: 1-D with shape `[M]`, all values must be >= 1.
-
-paddings: 2-D with shape `[M, 2]`, all values must be >= 0.
-  `paddings[i] = [pad_start, pad_end]` specifies the padding for input dimension
-  `i + 1`, which corresponds to spatial dimension `i`.  It is required that
-  `block_shape[i]` divides `input_shape[i + 1] + pad_start + pad_end`.
-
-This operation is equivalent to the following steps:
-
-1. Zero-pad the start and end of dimensions `[1, ..., M]` of the
-   input according to `paddings` to produce `padded` of shape `padded_shape`.
-
-2. Reshape `padded` to `reshaped_padded` of shape:
-
-     [batch] +
-     [padded_shape[1] / block_shape[0],
-       block_shape[0],
-      ...,
-      padded_shape[M] / block_shape[M-1],
-      block_shape[M-1]] +
-     remaining_shape
-
-3. Permute dimensions of `reshaped_padded` to produce
-   `permuted_reshaped_padded` of shape:
-
-     block_shape +
-     [batch] +
-     [padded_shape[1] / block_shape[0],
-      ...,
-      padded_shape[M] / block_shape[M-1]] +
-     remaining_shape
-
-4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the batch
-   dimension, producing an output tensor of shape:
-
-     [batch * prod(block_shape)] +
-     [padded_shape[1] / block_shape[0],
-      ...,
-      padded_shape[M] / block_shape[M-1]] +
-     remaining_shape
-
-Some examples:
-
-(1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and
-    `paddings = [[0, 0], [0, 0]]`:
-
-```
-x = [[[[1], [2]], [[3], [4]]]]
-```
-
-The output tensor has shape `[4, 1, 1, 1]` and value:
-
-```
-[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-```
-
-(2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and
-    `paddings = [[0, 0], [0, 0]]`:
-
-```
-x = [[[[1, 2, 3], [4, 5, 6]],
-      [[7, 8, 9], [10, 11, 12]]]]
-```
-
-The output tensor has shape `[4, 1, 1, 3]` and value:
-
-```
-[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-```
-
-(3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and
-    `paddings = [[0, 0], [0, 0]]`:
-
-```
-x = [[[[1],   [2],  [3],  [4]],
-      [[5],   [6],  [7],  [8]],
-      [[9],  [10], [11],  [12]],
-      [[13], [14], [15],  [16]]]]
-```
-
-The output tensor has shape `[4, 2, 2, 1]` and value:
-
-```
-x = [[[[1], [3]], [[9], [11]]],
-     [[[2], [4]], [[10], [12]]],
-     [[[5], [7]], [[13], [15]]],
-     [[[6], [8]], [[14], [16]]]]
-```
-
-(4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and
-    paddings = `[[0, 0], [2, 0]]`:
-
-```
-x = [[[[1],   [2],  [3],  [4]],
-      [[5],   [6],  [7],  [8]]],
-     [[[9],  [10], [11],  [12]],
-      [[13], [14], [15],  [16]]]]
-```
-
-The output tensor has shape `[8, 1, 3, 1]` and value:
-
-```
-x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
-     [[[0], [2], [4]]], [[[0], [10], [12]]],
-     [[[0], [5], [7]]], [[[0], [13], [15]]],
-     [[[0], [6], [8]]], [[[0], [14], [16]]]]
-```
-
-Among others, this operation is useful for reducing atrous convolution into
-regular convolution.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("SpaceToBatch")
@@ -3798,106 +2119,7 @@ REGISTER_OP("SpaceToBatch")
       return SpaceToBatchShapeHelper(c, input_shape, c->MakeShape({2}),
                                      &block_shape, c->input(1),
                                      c->input_tensor(1));
-    })
-    .Doc(R"doc(
-SpaceToBatch for 4-D tensors of type T.
-
-This is a legacy version of the more general SpaceToBatchND.
-
-Zero-pads and then rearranges (permutes) blocks of spatial data into batch.
-More specifically, this op outputs a copy of the input tensor where values from
-the `height` and `width` dimensions are moved to the `batch` dimension. After
-the zero-padding, both `height` and `width` of the input must be divisible by the
-block size.
-
-input: 4-D with shape `[batch, height, width, depth]`.
-
-paddings: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
-  the padding of the input with zeros across the spatial dimensions as follows:
-
-      paddings = [[pad_top, pad_bottom], [pad_left, pad_right]]
-
-  The effective spatial dimensions of the zero-padded input tensor will be:
-
-      height_pad = pad_top + height + pad_bottom
-      width_pad = pad_left + width + pad_right
-
-The attr `block_size` must be greater than one. It indicates the block size.
-
-  * Non-overlapping blocks of size `block_size x block size` in the height and
-    width dimensions are rearranged into the batch dimension at each location.
-  * The batch of the output tensor is `batch * block_size * block_size`.
-  * Both height_pad and width_pad must be divisible by block_size.
-
-The shape of the output will be:
-
-    [batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
-     depth]
-
-Some examples:
-
-(1) For the following input of shape `[1, 2, 2, 1]` and block_size of 2:
-
-```
-x = [[[[1], [2]], [[3], [4]]]]
-```
-
-The output tensor has shape `[4, 1, 1, 1]` and value:
-
-```
-[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-```
-
-(2) For the following input of shape `[1, 2, 2, 3]` and block_size of 2:
-
-```
-x = [[[[1, 2, 3], [4, 5, 6]],
-      [[7, 8, 9], [10, 11, 12]]]]
-```
-
-The output tensor has shape `[4, 1, 1, 3]` and value:
-
-```
-[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-```
-
-(3) For the following input of shape `[1, 4, 4, 1]` and block_size of 2:
-
-```
-x = [[[[1],   [2],  [3],  [4]],
-      [[5],   [6],  [7],  [8]],
-      [[9],  [10], [11],  [12]],
-      [[13], [14], [15],  [16]]]]
-```
-
-The output tensor has shape `[4, 2, 2, 1]` and value:
-
-```
-x = [[[[1], [3]], [[9], [11]]],
-     [[[2], [4]], [[10], [12]]],
-     [[[5], [7]], [[13], [15]]],
-     [[[6], [8]], [[14], [16]]]]
-```
-
-(4) For the following input of shape `[2, 2, 4, 1]` and block_size of 2:
-
-```
-x = [[[[1],   [2],  [3],  [4]],
-      [[5],   [6],  [7],  [8]]],
-     [[[9],  [10], [11],  [12]],
-      [[13], [14], [15],  [16]]]]
-```
-
-The output tensor has shape `[8, 1, 2, 1]` and value:
-
-```
-x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
-     [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
-```
-
-Among others, this operation is useful for reducing atrous convolution into
-regular convolution.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("BatchToSpaceND")
@@ -3912,132 +2134,7 @@ REGISTER_OP("BatchToSpaceND")
       return BatchToSpaceShapeHelper(c, c->input(0), c->input(1),
                                      c->input_tensor(1), c->input(2),
                                      c->input_tensor(2));
-    })
-    .Doc(R"doc(
-BatchToSpace for N-D tensors of type T.
-
-This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
-`block_shape + [batch]`, interleaves these blocks back into the grid defined by
-the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
-the input.  The spatial dimensions of this intermediate result are then
-optionally cropped according to `crops` to produce the output.  This is the
-reverse of SpaceToBatch.  See below for a precise description.
-
-input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
-  where spatial_shape has M dimensions.
-
-block_shape: 1-D with shape `[M]`, all values must be >= 1.
-
-crops: 2-D with shape `[M, 2]`, all values must be >= 0.
-  `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
-  dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
-  required that
-  `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
-
-This operation is equivalent to the following steps:
-
-1. Reshape `input` to `reshaped` of shape:
-     [block_shape[0], ..., block_shape[M-1],
-      batch / prod(block_shape),
-      input_shape[1], ..., input_shape[N-1]]
-
-2. Permute dimensions of `reshaped` to produce `permuted` of shape
-     [batch / prod(block_shape),
-
-      input_shape[1], block_shape[0],
-      ...,
-      input_shape[M], block_shape[M-1],
-
-      input_shape[M+1], ..., input_shape[N-1]]
-
-3. Reshape `permuted` to produce `reshaped_permuted` of shape
-     [batch / prod(block_shape),
-
-      input_shape[1] * block_shape[0],
-      ...,
-      input_shape[M] * block_shape[M-1],
-
-      input_shape[M+1],
-      ...,
-      input_shape[N-1]]
-
-4. Crop the start and end of dimensions `[1, ..., M]` of
-   `reshaped_permuted` according to `crops` to produce the output of shape:
-     [batch / prod(block_shape),
-
-      input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
-      ...,
-      input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
-
-      input_shape[M+1], ..., input_shape[N-1]]
-
-Some examples:
-
-(1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
-    `crops = [[0, 0], [0, 0]]`:
-
-```
-[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-```
-
-The output tensor has shape `[1, 2, 2, 1]` and value:
-
-```
-x = [[[[1], [2]], [[3], [4]]]]
-```
-
-(2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
-    `crops = [[0, 0], [0, 0]]`:
-
-```
-[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-```
-
-The output tensor has shape `[1, 2, 2, 3]` and value:
-
-```
-x = [[[[1, 2, 3], [4, 5, 6]],
-      [[7, 8, 9], [10, 11, 12]]]]
-```
-
-(3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
-    `crops = [[0, 0], [0, 0]]`:
-
-```
-x = [[[[1], [3]], [[9], [11]]],
-     [[[2], [4]], [[10], [12]]],
-     [[[5], [7]], [[13], [15]]],
-     [[[6], [8]], [[14], [16]]]]
-```
-
-The output tensor has shape `[1, 4, 4, 1]` and value:
-
-```
-x = [[[1],   [2],  [3],  [4]],
-     [[5],   [6],  [7],  [8]],
-     [[9],  [10], [11],  [12]],
-     [[13], [14], [15],  [16]]]
-```
-
-(4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
-    `crops = [[0, 0], [2, 0]]`:
-
-```
-x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
-     [[[0], [2], [4]]], [[[0], [10], [12]]],
-     [[[0], [5], [7]]], [[[0], [13], [15]]],
-     [[[0], [6], [8]]], [[[0], [14], [16]]]]
-```
-
-The output tensor has shape `[2, 2, 4, 1]` and value:
-
-```
-x = [[[[1],   [2],  [3],  [4]],
-      [[5],   [6],  [7],  [8]]],
-     [[[9],  [10], [11],  [12]],
-      [[13], [14], [15],  [16]]]]
-```
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("BatchToSpace")
@@ -4062,97 +2159,7 @@ REGISTER_OP("BatchToSpace")
       return BatchToSpaceShapeHelper(c, input_shape, c->MakeShape({2}),
                                      &block_shape, c->input(1),
                                      c->input_tensor(1));
-    })
-    .Doc(R"doc(
-BatchToSpace for 4-D tensors of type T.
-
-This is a legacy version of the more general BatchToSpaceND.
-
-Rearranges (permutes) data from batch into blocks of spatial data, followed by
-cropping. This is the reverse transformation of SpaceToBatch. More specifically,
-this op outputs a copy of the input tensor where values from the `batch`
-dimension are moved in spatial blocks to the `height` and `width` dimensions,
-followed by cropping along the `height` and `width` dimensions.
-
-input: 4-D tensor with shape
- `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
-   depth]`. Note that the batch size of the input tensor must be divisible by
- `block_size * block_size`.
-
-crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
-  how many elements to crop from the intermediate result across the spatial
-  dimensions as follows:
-
-      crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
-
-output: 4-D with shape `[batch, height, width, depth]`, where:
-
-      height = height_pad - crop_top - crop_bottom
-      width = width_pad - crop_left - crop_right
-
-The attr `block_size` must be greater than one. It indicates the block size.
-
-Some examples:
-
-(1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
-
-```
-[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-```
-
-The output tensor has shape `[1, 2, 2, 1]` and value:
-
-```
-x = [[[[1], [2]], [[3], [4]]]]
-```
-
-(2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
-
-```
-[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-```
-
-The output tensor has shape `[1, 2, 2, 3]` and value:
-
-```
-x = [[[[1, 2, 3], [4, 5, 6]],
-      [[7, 8, 9], [10, 11, 12]]]]
-```
-
-(3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
-
-```
-x = [[[[1], [3]], [[9], [11]]],
-     [[[2], [4]], [[10], [12]]],
-     [[[5], [7]], [[13], [15]]],
-     [[[6], [8]], [[14], [16]]]]
-```
-
-The output tensor has shape `[1, 4, 4, 1]` and value:
-
-```
-x = [[[1],   [2],  [3],  [4]],
-     [[5],   [6],  [7],  [8]],
-     [[9],  [10], [11],  [12]],
-     [[13], [14], [15],  [16]]]
-```
-
-(4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
-
-```
-x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
-     [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
-```
-
-The output tensor has shape `[2, 2, 4, 1]` and value:
-
-```
-x = [[[[1], [3]], [[5], [7]]],
-     [[[2], [4]], [[10], [12]]],
-     [[[5], [7]], [[13], [15]]],
-     [[[6], [8]], [[14], [16]]]]
-```
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("SpaceToDepth")
@@ -4206,96 +2213,7 @@ REGISTER_OP("SpaceToDepth")
 
       c->set_output(0, output_shape);
       return Status::OK();
-    })
-    .Doc(R"doc(
-SpaceToDepth for tensors of type T.
-
-Rearranges blocks of spatial data, into depth. More specifically,
-this op outputs a copy of the input tensor where values from the `height`
-and `width` dimensions are moved to the `depth` dimension.
-The attr `block_size` indicates the input block size.
-
-  * Non-overlapping blocks of size `block_size x block size` are rearranged
-    into depth at each location.
-  * The depth of the output tensor is `block_size * block_size * input_depth`.
-  * The Y, X coordinates within each block of the input become the high order
-    component of the output channel index.
-  * The input tensor's height and width must be divisible by block_size.
-
-The `data_format` attr specifies the layout of the input and output tensors
-with the following options:
-  "NHWC": `[ batch, height, width, channels ]`
-  "NCHW": `[ batch, channels, height, width ]`
-  "NCHW_VECT_C":
-      `qint8 [ batch, channels / 4, height, width, channels % 4 ]`
-
-It is useful to consider the operation as transforming a 6-D Tensor.
-e.g. for data_format = NHWC,
-     Each element in the input tensor can be specified via 6 coordinates,
-     ordered by decreasing memory layout significance as:
-     n,oY,bY,oX,bX,iC  (where n=batch index, oX, oY means X or Y coordinates
-                        within the output image, bX, bY means coordinates
-                        within the input block, iC means input channels).
-     The output would be a transpose to the following layout:
-     n,oY,oX,bY,bX,iC
-
-This operation is useful for resizing the activations between convolutions
-(but keeping all data), e.g. instead of pooling. It is also useful for training
-purely convolutional models.
-
-For example, given an input of shape `[1, 2, 2, 1]`, data_format = "NHWC" and
-block_size = 2:
-
-```
-x = [[[[1], [2]],
-      [[3], [4]]]]
-```
-
-This operation will output a tensor of shape `[1, 1, 1, 4]`:
-
-```
-[[[[1, 2, 3, 4]]]]
-```
-
-Here, the input has a batch of 1 and each batch element has shape `[2, 2, 1]`,
-the corresponding output will have a single element (i.e. width and height are
-both 1) and will have a depth of 4 channels (1 * block_size * block_size).
-The output element shape is `[1, 1, 4]`.
-
-For an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g.
-
-```
-x = [[[[1, 2, 3], [4, 5, 6]],
-      [[7, 8, 9], [10, 11, 12]]]]
-```
-
-This operation, for block_size of 2, will return the following tensor of shape
-`[1, 1, 1, 12]`
-
-```
-[[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
-```
-
-Similarly, for the following input of shape `[1 4 4 1]`, and a block size of 2:
-
-```
-x = [[[[1],   [2],  [5],  [6]],
-      [[3],   [4],  [7],  [8]],
-      [[9],  [10], [13],  [14]],
-      [[11], [12], [15],  [16]]]]
-```
-
-the operator will return the following tensor of shape `[1 2 2 4]`:
-
-```
-x = [[[[1, 2, 3, 4],
-       [5, 6, 7, 8]],
-      [[9, 10, 11, 12],
-       [13, 14, 15, 16]]]]
-```
-
-block_size: The size of the spatial block.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("DepthToSpace")
@@ -4347,102 +2265,7 @@ REGISTER_OP("DepthToSpace")
 
       c->set_output(0, output_shape);
       return Status::OK();
-    })
-    .Doc(R"doc(
-DepthToSpace for tensors of type T.
-
-Rearranges data from depth into blocks of spatial data.
-This is the reverse transformation of SpaceToDepth. More specifically,
-this op outputs a copy of the input tensor where values from the `depth`
-dimension are moved in spatial blocks to the `height` and `width` dimensions.
-The attr `block_size` indicates the input block size and how the data is moved.
-
-  * Chunks of data of size `block_size * block_size` from depth are rearranged
-    into non-overlapping blocks of size `block_size x block_size`
-  * The width the output tensor is `input_depth * block_size`, whereas the
-    height is `input_height * block_size`.
-  * The Y, X coordinates within each block of the output image are determined
-    by the high order component of the input channel index.
-  * The depth of the input tensor must be divisible by
-    `block_size * block_size`.
-
-The `data_format` attr specifies the layout of the input and output tensors
-with the following options:
-  "NHWC": `[ batch, height, width, channels ]`
-  "NCHW": `[ batch, channels, height, width ]`
-  "NCHW_VECT_C":
-      `qint8 [ batch, channels / 4, height, width, channels % 4 ]`
-
-It is useful to consider the operation as transforming a 6-D Tensor.
-e.g. for data_format = NHWC,
-     Each element in the input tensor can be specified via 6 coordinates,
-     ordered by decreasing memory layout significance as:
-     n,iY,iX,bY,bX,oC  (where n=batch index, iX, iY means X or Y coordinates
-                        within the input image, bX, bY means coordinates
-                        within the output block, oC means output channels).
-     The output would be the input transposed to the following layout:
-     n,iY,bY,iX,bX,oC
-
-This operation is useful for resizing the activations between convolutions
-(but keeping all data), e.g. instead of pooling. It is also useful for training
-purely convolutional models.
-
-For example, given an input of shape `[1, 1, 1, 4]`, data_format = "NHWC" and
-block_size = 2:
-
-```
-x = [[[[1, 2, 3, 4]]]]
-
-```
-
-This operation will output a tensor of shape `[1, 2, 2, 1]`:
-
-```
-   [[[[1], [2]],
-     [[3], [4]]]]
-```
-
-Here, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`,
-the corresponding output will have 2x2 elements and will have a depth of
-1 channel (1 = `4 / (block_size * block_size)`).
-The output element shape is `[2, 2, 1]`.
-
-For an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.
-
-```
-x = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
-```
-
-This operation, for block size of 2, will return the following tensor of shape
-`[1, 2, 2, 3]`
-
-```
-   [[[[1, 2, 3], [4, 5, 6]],
-     [[7, 8, 9], [10, 11, 12]]]]
-
-```
-
-Similarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:
-
-```
-x =  [[[[1, 2, 3, 4],
-       [5, 6, 7, 8]],
-      [[9, 10, 11, 12],
-       [13, 14, 15, 16]]]]
-```
-
-the operator will return the following tensor of shape `[1 4 4 1]`:
-
-```
-x = [[[ [1],   [2],  [5],  [6]],
-      [ [3],   [4],  [7],  [8]],
-      [ [9],  [10], [13],  [14]],
-      [ [11], [12], [15],  [16]]]]
-
-```
-
-block_size: The size of the spatial block, same as in Space2Depth.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -4529,34 +2352,7 @@ REGISTER_OP("ExtractImagePatches")
           {batch_size_dim, output_rows, output_cols, output_depth_dim});
       c->set_output(0, output_shape);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Extract `patches` from `images` and put them in the "depth" output dimension.
-
-images: 4-D Tensor with shape `[batch, in_rows, in_cols, depth]`.
-patches: 4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows *
-  ksize_cols * depth]` containing image patches with size
-  `ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension. Note
-  `out_rows` and `out_cols` are the dimensions of the output patches.
-ksizes: The size of the sliding window for each dimension of `images`.
-strides: 1-D of length 4. How far the centers of two consecutive patches are in
-  the images. Must be: `[1, stride_rows, stride_cols, 1]`.
-rates: 1-D of length 4. Must be: `[1, rate_rows, rate_cols, 1]`. This is the
-  input stride, specifying how far two consecutive patch samples are in the
-  input. Equivalent to extracting patches with
-  `patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by
-  subsampling them spatially by a factor of `rates`. This is equivalent to
-  `rate` in dilated (a.k.a. Atrous) convolutions.
-padding: The type of padding algorithm to use.
-
-We specify the size-related attributes as:
-
-```python
-      ksizes = [1, ksize_rows, ksize_cols, 1]
-      strides = [1, strides_rows, strides_cols, 1]
-      rates = [1, rates_rows, rates_cols, 1]
-```
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -4565,12 +2361,12 @@ REGISTER_OP("Bitcast")
     .Output("output: type")
     // All supported dtypes are listed here to include qint16 and quint16.
     .Attr(
-        "T: {float, double, int64, int32, uint8, uint16, int8, int16,"
+        "T: {bfloat16, float, double, int64, int32, uint8, uint16, int8, int16,"
         " complex64, complex128, qint8, quint8, qint16, quint16, qint32,"
         " half}")
     .Attr(
-        "type: {float, double, int64, int32, uint8, uint16, int8, int16,"
-        " complex64, complex128, qint8, quint8, qint16, quint16, qint32,"
+        "type: {bfloat16, float, double, int64, int32, uint8, uint16, int8, "
+        "int16, complex64, complex128, qint8, quint8, qint16, quint16, qint32,"
         " half}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
@@ -4620,23 +2416,7 @@ REGISTER_OP("Bitcast")
 
       c->set_output(0, new_shape);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Bitcasts a tensor from one type to another without copying data.
-
-Given a tensor `input`, this operation returns a tensor that has the same buffer
-data as `input` with datatype `type`.
-
-If the input datatype `T` is larger than the output datatype `type` then the
-shape changes from [...] to [..., sizeof(`T`)/sizeof(`type`)].
-
-If `T` is smaller than `type`, the operator requires that the rightmost
-dimension be equal to sizeof(`type`)/sizeof(`T`). The shape then goes from
-[..., sizeof(`type`)/sizeof(`T`)] to [...].
-
-*NOTE*: Bitcast is implemented as a low-level cast, so machines with different
-endian orderings will give different results.
-)doc");
+    });
 
 REGISTER_OP("OneHot")
     .Input("indices: TI")
@@ -4672,106 +2452,7 @@ REGISTER_OP("OneHot")
       TF_RETURN_IF_ERROR(c->Concatenate(front, back, &out));
       c->set_output(0, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Returns a one-hot tensor.
-
-The locations represented by indices in `indices` take value `on_value`,
-while all other locations take value `off_value`.
-
-If the input `indices` is rank `N`, the output will have rank `N+1`,
-The new axis is created at dimension `axis` (default: the new axis is
-appended at the end).
-
-If `indices` is a scalar the output shape will be a vector of length `depth`.
-
-If `indices` is a vector of length `features`, the output shape will be:
-```
-  features x depth if axis == -1
-  depth x features if axis == 0
-```
-
-If `indices` is a matrix (batch) with shape `[batch, features]`,
-the output shape will be:
-```
-  batch x features x depth if axis == -1
-  batch x depth x features if axis == 1
-  depth x batch x features if axis == 0
-```
-
-
-Examples
-=========
-
-Suppose that
-
-```
-  indices = [0, 2, -1, 1]
-  depth = 3
-  on_value = 5.0
-  off_value = 0.0
-  axis = -1
-```
-
-Then output is `[4 x 3]`:
-
-    ```output =
-      [5.0 0.0 0.0]  // one_hot(0)
-      [0.0 0.0 5.0]  // one_hot(2)
-      [0.0 0.0 0.0]  // one_hot(-1)
-      [0.0 5.0 0.0]  // one_hot(1)
-    ```
-
-Suppose that
-
-```
-  indices = [0, 2, -1, 1]
-  depth = 3
-  on_value = 0.0
-  off_value = 3.0
-  axis = 0
-```
-
-Then output is `[3 x 4]`:
-
-    ```output =
-      [0.0 3.0 3.0 3.0]
-      [3.0 3.0 3.0 0.0]
-      [3.0 3.0 3.0 3.0]
-      [3.0 0.0 3.0 3.0]
-    //  ^                one_hot(0)
-    //      ^            one_hot(2)
-    //          ^        one_hot(-1)
-    //              ^    one_hot(1)
-    ```
-Suppose that
-
-```
-  indices = [[0, 2], [1, -1]]
-  depth = 3
-  on_value = 1.0
-  off_value = 0.0
-  axis = -1
-```
-
-Then output is `[2 x 2 x 3]`:
-
-    ```output =
-      [
-        [1.0, 0.0, 0.0]  // one_hot(0)
-        [0.0, 0.0, 1.0]  // one_hot(2)
-      ][
-        [0.0, 1.0, 0.0]  // one_hot(1)
-        [0.0, 0.0, 0.0]  // one_hot(-1)
-      ]```
-
-indices: A tensor of indices.
-depth: A scalar defining the depth of the one hot dimension.
-on_value: A scalar defining the value to fill in output when `indices[j] = i`.
-off_value: A scalar defining the value to fill in output when `indices[j] != i`.
-axis: The axis to fill (default: -1, a new inner-most axis).
-output: The one-hot tensor.
-)doc");
+    });
 
 // EXPERIMENTAL. DO NOT USE OR DEPEND ON THIS YET.
 REGISTER_OP("QuantizeAndDequantize")
@@ -4782,12 +2463,9 @@ REGISTER_OP("QuantizeAndDequantize")
     .Attr("input_min: float = 0")
     .Attr("input_max: float = 0")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
-    .Deprecated(22, "Replaced by QuantizeAndDequantizeV2")
-    .Doc(R"doc(
-Use QuantizeAndDequantizeV2 instead.
-)doc");
+    .Deprecated(22, "Replaced by QuantizeAndDequantizeV2");
 
 // TODO(suharshs): Deprecate QuantizeAndDequantizeV2.
 REGISTER_OP("QuantizeAndDequantizeV2")
@@ -4798,76 +2476,14 @@ REGISTER_OP("QuantizeAndDequantizeV2")
     .Attr("num_bits: int = 8")
     .Attr("range_given: bool = false")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
       c->set_output(0, c->input(0));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Quantizes then dequantizes a tensor.
-
-This op simulates the precision loss from the quantized forward pass by:
-1. Quantizing the tensor to fixed point numbers, which should match the target
-   quantization method when it is used in inference.
-2. Dequantizing it back to floating point numbers for the following ops, most
-   likely matmul.
-
-There are different ways to quantize. This version does not use the full range
-of the output type, choosing to elide the lowest possible value for symmetry
-(e.g., output range is -127 to 127, not -128 to 127 for signed 8 bit
-quantization), so that 0.0 maps to 0.
-
-To perform this op, we first find the range of values in our tensor. The range
-we use is always centered on 0, so we find m such that
-
-1. m = max(abs(input_min), abs(input_max)) if range_given is true,
-2. m = max(abs(min_elem(input)), abs(max_elem(input))) otherwise.
-
-Our input tensor range is then [-m, m].
-
-Next, we choose our fixed-point quantization buckets, [min_fixed, max_fixed].
-If signed_input is true, this is
-
-  [min_fixed, max_fixed ] =
-      [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1].
-
-Otherwise, if signed_input is false, the fixed-point range is
-
-  [min_fixed, max_fixed] = [0, (1 << num_bits) - 1].
-
-From this we compute our scaling factor, s:
-
-  s = (max_fixed - min_fixed) / (2 * m).
-
-Now we can quantize and dequantize the elements of our tensor.  An element e
-is transformed into e':
-
-  e' = (e * s).round_to_nearest() / s.
-
-Note that we have a different number of buckets in the signed vs. unsigned
-cases.  For example, if num_bits == 8, we get 254 buckets in the signed case
-vs. 255 in the unsigned case.
-
-For example, suppose num_bits = 8 and m = 1.  Then
-
-  [min_fixed, max_fixed] = [-127, 127], and
-  s = (127 + 127) / 2 = 127.
-
-Given the vector {-1, -0.5, 0, 0.3}, this is quantized to
-{-127, -63, 0, 38}, and dequantized to {-1, -63.0/127, 0, 38.0/127}.
-
-input: Tensor to quantize and then dequantize.
-signed_input: If the quantization is signed or unsigned.
-num_bits: The bitwidth of the quantization.
-range_given: If the range is given or should be computed from the tensor.
-input_min: If range_given, this is the min of the range, otherwise this input
-           will be ignored.
-input_max: If range_given, this is the max of the range, otherwise this input
-           will be ignored.
-)doc");
+    });
 
 REGISTER_OP("QuantizeAndDequantizeV3")
     .Input("input: T")
@@ -4877,7 +2493,7 @@ REGISTER_OP("QuantizeAndDequantizeV3")
     .Attr("signed_input: bool = true")
     .Attr("range_given: bool = true")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
@@ -4885,13 +2501,7 @@ REGISTER_OP("QuantizeAndDequantizeV3")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
       c->set_output(0, c->input(0));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Quantizes then dequantizes a tensor.
-
-This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
-tensor, so its value can change during training.
-)doc");
+    });
 
 REGISTER_OP("QuantizeV2")
     .Input("input: float")
@@ -4913,110 +2523,7 @@ REGISTER_OP("QuantizeV2")
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       return Status::OK();
-    })
-    .Doc(R"doc(
-Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
-
-[min_range, max_range] are scalar floats that specify the range for
-the 'input' data. The 'mode' attribute controls exactly which calculations are
-used to convert the float values to their quantized equivalents.  The
-'round_mode' attribute controls which rounding tie-breaking algorithm is used
-when rounding float values to their quantized equivalents.
-
-In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-
-```
-out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
-if T == qint8, out[i] -= (range(T) + 1) / 2.0
-```
-here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-
-*MIN_COMBINED Mode Example*
-
-Assume the input is type float and has a possible range of [0.0, 6.0] and the
-output type is quint8 ([0, 255]). The min_range and max_range values should be
-specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
-value of the input by 255/6 and cast to quint8.
-
-If the output type was qint8 ([-128, 127]), the operation will additionally
-subtract each value by 128 prior to casting, so that the range of values aligns
-with the range of qint8.
-
-If the mode is 'MIN_FIRST', then this approach is used:
-
-```
-num_discrete_values = 1 << (# of bits in T)
-range_adjust = num_discrete_values / (num_discrete_values - 1)
-range = (range_max - range_min) * range_adjust
-range_scale = num_discrete_values / range
-quantized = round(input * range_scale) - round(range_min * range_scale) +
-  numeric_limits<T>::min()
-quantized = max(quantized, numeric_limits<T>::min())
-quantized = min(quantized, numeric_limits<T>::max())
-```
-
-The biggest difference between this and MIN_COMBINED is that the minimum range
-is rounded first, before it's subtracted from the rounded value. With
-MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
-and dequantizing will introduce a larger and larger error.
-
-*SCALED mode Example*
-
-`SCALED` mode matches the quantization approach used in
-`QuantizeAndDequantize{V2|V3}`.
-
-If the mode is `SCALED`, we do not use the full range of the output type,
-choosing to elide the lowest possible value for symmetry (e.g., output range is
--127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-0.
-
-We first find the range of values in our tensor. The
-range we use is always centered on 0, so we find m such that
-```c++
-  m = max(abs(input_min), abs(input_max))
-```
-
-Our input tensor range is then `[-m, m]`.
-
-Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-If T is signed, this is
-```
-  num_bits = sizeof(T) * 8
-  [min_fixed, max_fixed] =
-      [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-```
-
-Otherwise, if T is unsigned, the fixed-point range is
-```
-  [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-```
-
-From this we compute our scaling factor, s:
-```c++
-  s = (max_fixed - min_fixed) / (2 * m)
-```
-
-Now we can quantize the elements of our tensor:
-```c++
-result = round(input * s)
-```
-
-One thing to watch out for is that the operator may choose to adjust the
-requested minimum and maximum values slightly during the quantization process,
-so you should always use the output ports as the range for further calculations.
-For example, if the requested minimum and maximum values are close to equal,
-they will be separated by a small epsilon value to prevent ill-formed quantized
-buffers from being created. Otherwise, you can end up with buffers where all the
-quantized values map to the same float value, which causes problems for
-operations that have to perform further calculations on them.
-
-min_range: The minimum scalar value possibly produced for the input.
-max_range: The maximum scalar value possibly produced for the input.
-output: The quantized data produced from the float input.
-output_min: The actual minimum scalar value used for the output.
-output_max: The actual maximum scalar value used for the output.
-
-)doc");
+    });
 
 REGISTER_OP("Dequantize")
     .Input("input: T")
@@ -5031,88 +2538,7 @@ REGISTER_OP("Dequantize")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Dequantize the 'input' tensor into a float Tensor.
-
-[min_range, max_range] are scalar floats that specify the range for
-the 'input' data. The 'mode' attribute controls exactly which calculations are
-used to convert the float values to their quantized equivalents.
-
-In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-
-```
-if T == qint8, in[i] += (range(T) + 1)/ 2.0
-out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
-```
-here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-
-*MIN_COMBINED Mode Example*
-
-If the input comes from a QuantizedRelu6, the output type is
-quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
-0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
-Dequantize on quint8 will take each value, cast to float, and multiply
-by 6 / 255.
-Note that if quantizedtype is qint8, the operation will additionally add
-each value by 128 prior to casting.
-
-If the mode is 'MIN_FIRST', then this approach is used:
-
-```c++
-num_discrete_values = 1 << (# of bits in T)
-range_adjust = num_discrete_values / (num_discrete_values - 1)
-range = (range_max - range_min) * range_adjust
-range_scale = range / num_discrete_values
-const double offset_input = static_cast<double>(input) - lowest_quantized;
-result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
-```
-
-*SCALED mode Example*
-
-`SCALED` mode matches the quantization approach used in
-`QuantizeAndDequantize{V2|V3}`.
-
-If the mode is `SCALED`, we do not use the full range of the output type,
-choosing to elide the lowest possible value for symmetry (e.g., output range is
--127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-0.
-
-We first find the range of values in our tensor. The
-range we use is always centered on 0, so we find m such that
-```c++
-  m = max(abs(input_min), abs(input_max))
-```
-
-Our input tensor range is then `[-m, m]`.
-
-Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-If T is signed, this is
-```
-  num_bits = sizeof(T) * 8
-  [min_fixed, max_fixed] =
-      [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-```
-
-Otherwise, if T is unsigned, the fixed-point range is
-```
-  [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-```
-
-From this we compute our scaling factor, s:
-```c++
-  s = (2 * m) / (max_fixed - min_fixed)
-```
-
-Now we can dequantize the elements of our tensor:
-```c++
-result = input * s
-```
-
-min_range: The minimum scalar value possibly produced for the input.
-max_range: The maximum scalar value possibly produced for the input.
-
-)doc");
+    });
 
 REGISTER_OP("QuantizedConcat")
     .Input("concat_dim: int32")
@@ -5134,22 +2560,7 @@ REGISTER_OP("QuantizedConcat")
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       return Status::OK();
-    })
-    .Doc(R"doc(
-Concatenates quantized tensors along one dimension.
-
-concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
-  range [0, rank(values)).
-values: The `N` Tensors to concatenate. Their ranks and types must match,
-  and their sizes must match in all dimensions except `concat_dim`.
-input_mins: The minimum scalar values for each of the input tensors.
-input_maxes: The maximum scalar values for each of the input tensors.
-output_min: The float value that the minimum quantized output value represents.
-output_max: The float value that the maximum quantized output value represents.
-output: A `Tensor` with the concatenation of values stacked along the
-  `concat_dim` dimension.  This tensor's shape matches that of `values` except
-  in `concat_dim` where it has the sum of the sizes.
-)doc");
+    });
 
 REGISTER_OP("QuantizedReshape")
     .Input("tensor: T")
@@ -5169,17 +2580,7 @@ REGISTER_OP("QuantizedReshape")
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       return Status::OK();
-    })
-    .Doc(R"Doc(
-Reshapes a quantized tensor as per the Reshape op.
-```
-
-shape: Defines the shape of the output tensor.
-input_min: The minimum value of the input.
-input_max: The maximum value of the input.
-output_min: This value is copied from input_min.
-output_max: This value is copied from input_max.
-)Doc");
+    });
 
 REGISTER_OP("QuantizedInstanceNorm")
     .Input("x: T")
@@ -5207,24 +2608,7 @@ REGISTER_OP("QuantizedInstanceNorm")
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       return Status::OK();
-    })
-    .Doc(R"doc(
-Quantized Instance normalization.
-
-x: A 4D input Tensor.
-x_min: The value represented by the lowest quantized input.
-x_max: The value represented by the highest quantized input.
-y: A 4D Tensor.
-y_min: The value represented by the lowest quantized output.
-y_max: The value represented by the highest quantized output.
-output_range_given: If True, `given_y_min` and `given_y_min`
-  and `given_y_max` are used as the output range. Otherwise,
-  the implementation computes the output range.
-given_y_min: Output in `y_min` if `output_range_given` is True.
-given_y_max: Output in `y_max` if `output_range_given` is True.
-variance_epsilon: A small float number to avoid dividing by 0.
-min_separation: Minimum value of `y_max - y_min`
-)doc");
+    });
 
 namespace {
 
@@ -5262,8 +2646,9 @@ Status ScatterNdShape(InferenceContext* c) {
       Status s = c->Merge(prefix_indices, prefix_updates, &unused);
       if (!s.ok()) {
         return errors::InvalidArgument(
-            "The outer ", outer_dims, " dimensions of indices.shape=",
-            c->DebugString(indices_shape), " must match the outer ", outer_dims,
+            "The outer ", outer_dims,
+            " dimensions of indices.shape=", c->DebugString(indices_shape),
+            " must match the outer ", outer_dims,
             " dimensions of updates.shape=", c->DebugString(updates_shape),
             ": ", s.error_message());
       }
@@ -5298,88 +2683,7 @@ REGISTER_OP("ScatterNd")
     .Output("output: T")
     .Attr("T: type")
     .Attr("Tindices: {int32, int64}")
-    .SetShapeFn(ScatterNdShape)
-    .Doc(R"doc(
-Scatter `updates` into a new (initially zero) tensor according to `indices`.
-
-Creates a new tensor by applying sparse `updates` to individual
-values or slices within a zero tensor of the given `shape` according to
-indices.  This operator is the inverse of the @{tf.gather_nd} operator which
-extracts values or slices from a given tensor.
-
-**WARNING**: The order in which updates are applied is nondeterministic, so the
-output will be nondeterministic if `indices` contains duplicates.
-
-`indices` is an integer tensor containing indices into a new tensor of shape
-`shape`.  The last dimension of `indices` can be at most the rank of `shape`:
-
-    indices.shape[-1] <= shape.rank
-
-The last dimension of `indices` corresponds to indices into elements
-(if `indices.shape[-1] = shape.rank`) or slices
-(if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
-`shape`.  `updates` is a tensor with shape
-
-    indices.shape[:-1] + shape[indices.shape[-1]:]
-
-The simplest form of scatter is to insert individual elements in a tensor by
-index. For example, say we want to insert 4 scattered elements in a rank-1
-tensor with 8 elements.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd1.png" alt>
-</div>
-
-In Python, this scatter operation would look like this:
-
-```python
-    indices = tf.constant([[4], [3], [1], [7]])
-    updates = tf.constant([9, 10, 11, 12])
-    shape = tf.constant([8])
-    scatter = tf.scatter_nd(indices, updates, shape)
-    with tf.Session() as sess:
-      print(sess.run(scatter))
-```
-
-The resulting tensor would look like this:
-
-    [0, 11, 0, 10, 9, 0, 0, 12]
-
-We can also, insert entire slices of a higher rank tensor all at once. For
-example, if we wanted to insert two slices in the first dimension of a
-rank-3 tensor with two matrices of new values.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd2.png" alt>
-</div>
-
-In Python, this scatter operation would look like this:
-
-```python
-    indices = tf.constant([[0], [2]])
-    updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
-                            [7, 7, 7, 7], [8, 8, 8, 8]],
-                           [[5, 5, 5, 5], [6, 6, 6, 6],
-                            [7, 7, 7, 7], [8, 8, 8, 8]]])
-    shape = tf.constant([4, 4, 4])
-    scatter = tf.scatter_nd(indices, updates, shape)
-    with tf.Session() as sess:
-      print(sess.run(scatter))
-```
-
-The resulting tensor would look like this:
-
-    [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
-     [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
-     [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
-     [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
-
-indices: Index tensor.
-updates: Updates to scatter into output.
-shape: 1-D. The shape of the resulting tensor.
-output: A new tensor with the given shape and updates applied according
-  to the indices.
-)doc");
+    .SetShapeFn(ScatterNdShape);
 
 REGISTER_OP("ScatterNdNonAliasingAdd")
     .Input("input: T")
@@ -5388,53 +2692,7 @@ REGISTER_OP("ScatterNdNonAliasingAdd")
     .Output("output: T")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32, int64}")
-    .SetShapeFn(shape_inference::ScatterNdUpdateShape)
-    .Doc(R"doc(
-Applies sparse addition to `input` using individual values or slices
-from `updates` according to indices `indices`.  The updates are non-aliasing:
-`input` is only modified in-place if no other operations will use it.
-Otherwise, a copy of `input` is made.  This operation has a gradient with
-respect to both `input` and `updates`.
-
-`input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-
-`indices` must be integer tensor, containing indices into `input`.
-It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-
-The innermost dimension of `indices` (with length `K`) corresponds to
-indices into elements (if `K = P`) or `(P-K)`-dimensional slices
-(if `K < P`) along the `K`th dimension of `input`.
-
-`updates` is `Tensor` of rank `Q-1+P-K` with shape:
-
-```
-[d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].
-```
-
-For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
-elements. In Python, that addition would look like this:
-
-    input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])
-    indices = tf.constant([[4], [3], [1], [7]])
-    updates = tf.constant([9, 10, 11, 12])
-    output = tf.scatter_nd_non_aliasing_add(input, indices, updates)
-    with tf.Session() as sess:
-      print(sess.run(output))
-
-The resulting value `output` would look like this:
-
-    [1, 13, 3, 14, 14, 6, 7, 20]
-
-See @{tf.scatter_nd} for more details about how to make updates to slices.
-
-input: A Tensor.
-indices: A Tensor. Must be one of the following types: `int32`, `int64`.
-  A tensor of indices into `input`.
-updates: A Tensor. Must have the same type as ref. A tensor of updated values
-  to add to `input`.
-output: A `Tensor` with the same shape as `input`, containing values of `input`
-  updated with `updates`.
-)doc");
+    .SetShapeFn(shape_inference::ScatterNdUpdateShape);
 
 REGISTER_OP("FakeQuantWithMinMaxArgs")
     .Attr("min: float = -6.0")
@@ -5443,18 +2701,7 @@ REGISTER_OP("FakeQuantWithMinMaxArgs")
     .Attr("narrow_range: bool = false")
     .Input("inputs: float")
     .Output("outputs: float")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Fake-quantize the 'inputs' tensor, type float to 'outputs' tensor of same type.
-
-Attributes `[min; max]` define the clamping range for the `inputs` data.
-`inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
-when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
-then de-quantized and output as floats in `[min; max]` interval.
-`num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
-
-Quantization is called fake since the output is still in floating point.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("FakeQuantWithMinMaxArgsGradient")
     .Attr("min: float = -6.0")
@@ -5464,15 +2711,7 @@ REGISTER_OP("FakeQuantWithMinMaxArgsGradient")
     .Input("gradients: float")
     .Input("inputs: float")
     .Output("backprops: float")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Compute gradients for a FakeQuantWithMinMaxArgs operation.
-
-gradients: Backpropagated gradients above the FakeQuantWithMinMaxArgs operation.
-inputs: Values passed as inputs to the FakeQuantWithMinMaxArgs operation.
-backprops: Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:
-  `gradients * (inputs >= min && inputs <= max)`.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("FakeQuantWithMinMaxVars")
     .Attr("num_bits: int = 8")
@@ -5487,20 +2726,7 @@ REGISTER_OP("FakeQuantWithMinMaxVars")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
-and `max` to 'outputs' tensor of same shape as `inputs`.
-
-`[min; max]` define the clamping range for the `inputs` data.
-`inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
-when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
-then de-quantized and output as floats in `[min; max]` interval.
-`num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
-
-This operation has a gradient and thus allows for training `min` and `max`
-values.
-)doc");
+    });
 
 REGISTER_OP("FakeQuantWithMinMaxVarsGradient")
     .Attr("num_bits: int = 8")
@@ -5526,22 +2752,7 @@ REGISTER_OP("FakeQuantWithMinMaxVarsGradient")
       c->set_output(1, min_max);
       c->set_output(2, min_max);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Compute gradients for a FakeQuantWithMinMaxVars operation.
-
-gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation.
-inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation.
-min, max: Quantization interval, scalar floats.
-num_bits: The bitwidth of the quantization; between 2 and 8, inclusive.
-narrow_range: Whether to quantize into 2^num_bits - 1 distinct values.
-backprops_wrt_input: Backpropagated gradients w.r.t. inputs:
-  `gradients * (inputs >= min && inputs <= max)`.
-backprop_wrt_min: Backpropagated gradients w.r.t. min parameter:
-  `sum(gradients * (inputs < min))`.
-backprop_wrt_max: Backpropagated gradients w.r.t. max parameter:
-  `sum(gradients * (inputs > max))`.
-)doc");
+    });
 
 REGISTER_OP("FakeQuantWithMinMaxVarsPerChannel")
     .Attr("num_bits: int = 8")
@@ -5563,21 +2774,7 @@ REGISTER_OP("FakeQuantWithMinMaxVarsPerChannel")
 
       c->set_output(0, input);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Fake-quantize the 'inputs' tensor of type float and one of the shapes: `[d]`,
-`[b, d]` `[b, h, w, d]` via per-channel floats `min` and `max` of shape `[d]`
-to 'outputs' tensor of same shape as `inputs`.
-
-`[min; max]` define the clamping range for the `inputs` data.
-`inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
-when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
-then de-quantized and output as floats in `[min; max]` interval.
-`num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
-
-This operation has a gradient and thus allows for training `min` and `max`
-values.
-)doc");
+    });
 
 REGISTER_OP("FakeQuantWithMinMaxVarsPerChannelGradient")
     .Attr("num_bits: int = 8")
@@ -5606,25 +2803,7 @@ REGISTER_OP("FakeQuantWithMinMaxVarsPerChannelGradient")
       c->set_output(1, min_max);
       c->set_output(2, min_max);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Compute gradients for a FakeQuantWithMinMaxVarsPerChannel operation.
-
-gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation,
-  shape one of: `[d]`, `[b, d]`,  `[b, h, w, d]`.
-inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation, shape
-  same as `gradients`.
-min, max: Quantization interval, floats of shape `[d]`.
-num_bits: The bitwidth of the quantization; between 2 and 8, inclusive.
-narrow_range: Whether to quantize into 2^num_bits - 1 distinct values.
-backprops_wrt_input: Backpropagated gradients w.r.t. inputs, shape same as
-  `inputs`:
-    `gradients * (inputs >= min && inputs <= max)`.
-backprop_wrt_min: Backpropagated gradients w.r.t. min parameter, shape `[d]`:
-  `sum_per_d(gradients * (inputs < min))`.
-backprop_wrt_max: Backpropagated gradients w.r.t. max parameter, shape `[d]`:
-  `sum_per_d(gradients * (inputs > max))`.
-)doc");
+    });
 
 #ifdef INTEL_MKL
 REGISTER_OP("_MklConcat")
diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc
index 94eb120175555d8d51b9be1ff98676a9dc4fff07..86d64635f4c1bc1c34407a517267758ce5cf60fc 100644
--- a/tensorflow/core/ops/array_ops_test.cc
+++ b/tensorflow/core/ops/array_ops_test.cc
@@ -142,8 +142,13 @@ TEST(ArrayOpsTest, Const_ShapeFn) {
 
 TEST(ArrayOpsTest, UnchangedShapes_ShapeFn) {
   for (const char* op_name : {
-           "CheckNumerics", "Identity", "RefIdentity", "QuantizeAndDequantize",
-           "StopGradient", "ZerosLike", "OnesLike",
+           "CheckNumerics",
+           "Identity",
+           "RefIdentity",
+           "QuantizeAndDequantize",
+           "StopGradient",
+           "ZerosLike",
+           "OnesLike",
        }) {
     ShapeInferenceTestOp op(op_name);
     INFER_OK(op, "?", "in0");
@@ -158,6 +163,13 @@ TEST(ArrayOpsTest, UnchangedShapes_ShapeFn) {
   INFER_OK(op, "[1,2,?,4,5];?;?", "in0");
 }
 
+TEST(ArrayOpsTest, GuaranteeConst_ShapeFn) {
+  ShapeInferenceTestOp op("GuaranteeConst");
+  INFER_OK(op, "?", "in0");
+  INFER_OK(op, "[]", "in0");
+  INFER_OK(op, "[1,2,?,4,5]", "in0");
+}
+
 TEST(ArrayOpsTest, Identity_ShapeFnHandles) {
   const char* op_name = "Identity";
   ShapeInferenceTestOp op(op_name);
@@ -246,6 +258,7 @@ TEST(ArrayOpsTest, ReverseV2_ShapeFn) {
 
 TEST(ArrayOpsTest, Fill_ShapeFn) {
   ShapeInferenceTestOp op("Fill");
+  AddNodeAttr("index_type", DT_INT32, &op.node_def);
   op.input_tensors.resize(2);
   INFER_OK(op, "?;?", "?");
   INFER_OK(op, "[?];?", "?");
@@ -514,7 +527,7 @@ TEST(ArrayOpsTest, MatrixSetDiag_ShapeFn) {
   INFER_ERROR("Dimensions must be equal, but are 2 and 3", op, "[2,3];[3]");
 
   // Output matches input.
-  INFER_OK(op, "?;?", "?");
+  INFER_OK(op, "?;?", "in0");
   INFER_OK(op, "[1,2,2];[1,2]", "in0");
   INFER_OK(op, "[1,2,3];?", "in0");
   INFER_OK(op, "[1,3,2];?", "in0");
@@ -1612,7 +1625,7 @@ TEST(ArrayOpsTest, UnchangedWithQuantizationScalars_ShapeFn) {
 TEST(ArrayOpsTest, FakeQuantWithMinMaxVarsPerChannel) {
   ShapeInferenceTestOp op("FakeQuantWithMinMaxVarsPerChannel");
 
-  INFER_OK(op, "?;?;?", "?");
+  INFER_OK(op, "?;?;?", "in0");
   INFER_OK(op, "[?];?;?", "in0");
   INFER_OK(op, "[1,?,3];[3];[3]", "in0");
   INFER_OK(op, "[3];[3];[3]", "in0");
@@ -1631,7 +1644,7 @@ TEST(ArrayOpsTest, FakeQuantWithMinMaxVarsPerChannel) {
 TEST(ArrayOpsTest, FakeQuantWithMinMaxVarsPerChannelGradient) {
   ShapeInferenceTestOp op("FakeQuantWithMinMaxVarsPerChannelGradient");
 
-  INFER_OK(op, "?;?;?;?", "?;[?];[?]");
+  INFER_OK(op, "?;?;?;?", "in0;[?];[?]");
   INFER_OK(op, "[3];[3];[3];[3]", "in0;in3;in3");
   INFER_OK(op, "[1,3];[1,3];[3];[3]", "in0;in3;in3");
   INFER_OK(op, "[1,2,3,4];[1,2,3,4];[4];[4]", "in0;in3;in3");
diff --git a/tensorflow/core/ops/audio_ops.cc b/tensorflow/core/ops/audio_ops.cc
index d944e385a8cba2eee8311c36deed689d42150ef8..bcc46761c130565d2462584a4fb06493f5a9841c 100644
--- a/tensorflow/core/ops/audio_ops.cc
+++ b/tensorflow/core/ops/audio_ops.cc
@@ -128,52 +128,13 @@ REGISTER_OP("DecodeWav")
     .Attr("desired_samples: int = -1")
     .Output("audio: float")
     .Output("sample_rate: int32")
-    .SetShapeFn(DecodeWavShapeFn)
-    .Doc(R"doc(
-Decode a 16-bit PCM WAV file to a float tensor.
-
-The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
-
-When desired_channels is set, if the input contains fewer channels than this
-then the last channel will be duplicated to give the requested number, else if
-the input has more channels than requested then the additional channels will be
-ignored.
-
-If desired_samples is set, then the audio will be cropped or padded with zeroes
-to the requested length.
-
-The first output contains a Tensor with the content of the audio samples. The
-lowest dimension will be the number of channels, and the second will be the
-number of samples. For example, a ten-sample-long stereo WAV file should give an
-output shape of [10, 2].
-
-contents: The WAV-encoded audio, usually from a file.
-desired_channels: Number of sample channels wanted.
-desired_samples: Length of audio requested.
-audio: 2-D with shape `[length, channels]`.
-sample_rate: Scalar holding the sample rate found in the WAV header.
-)doc");
+    .SetShapeFn(DecodeWavShapeFn);
 
 REGISTER_OP("EncodeWav")
     .Input("audio: float")
     .Input("sample_rate: int32")
     .Output("contents: string")
-    .SetShapeFn(EncodeWavShapeFn)
-    .Doc(R"doc(
-Encode audio data using the WAV file format.
-
-This operation will generate a string suitable to be saved out to create a .wav
-audio file. It will be encoded in the 16-bit PCM format. It takes in float
-values in the range -1.0f to 1.0f, and any outside that value will be clamped to
-that range.
-
-`audio` is a 2-D float Tensor of shape `[length, channels]`.
-`sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
-
-audio: 2-D with shape `[length, channels]`.
-sample_rate: Scalar containing the sample frequency.
-contents: 0-D. WAV-encoded file contents.
-)doc");
+    .SetShapeFn(EncodeWavShapeFn);
 
 REGISTER_OP("AudioSpectrogram")
     .Input("input: float")
@@ -181,44 +142,7 @@ REGISTER_OP("AudioSpectrogram")
     .Attr("stride: int")
     .Attr("magnitude_squared: bool = false")
     .Output("spectrogram: float")
-    .SetShapeFn(SpectrogramShapeFn)
-    .Doc(R"doc(
-Produces a visualization of audio data over time.
-
-Spectrograms are a standard way of representing audio information as a series of
-slices of frequency information, one slice for each window of time. By joining
-these together into a sequence, they form a distinctive fingerprint of the sound
-over time.
-
-This op expects to receive audio data as an input, stored as floats in the range
--1 to 1, together with a window width in samples, and a stride specifying how
-far to move the window between slices. From this it generates a three
-dimensional output. The lowest dimension has an amplitude value for each
-frequency during that time slice. The next dimension is time, with successive
-frequency slices. The final dimension is for the channels in the input, so a
-stereo audio input would have two here for example.
-
-This means the layout when converted and saved as an image is rotated 90 degrees
-clockwise from a typical spectrogram. Time is descending down the Y axis, and
-the frequency decreases from left to right.
-
-Each value in the result represents the square root of the sum of the real and
-imaginary parts of an FFT on the current window of samples. In this way, the
-lowest dimension represents the power of each frequency in the current window,
-and adjacent windows are concatenated in the next dimension.
-
-To get a more intuitive and visual look at what this operation does, you can run
-tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
-resulting spectrogram as a PNG image.
-
-input: Float representation of audio data.
-window_size: How wide the input window is in samples. For the highest efficiency
-  this should be a power of two, but other values are accepted.
-stride: How widely apart the center of adjacent sample windows should be.
-magnitude_squared: Whether to return the squared magnitude or just the
-  magnitude. Using squared magnitude can avoid extra calculations.
-spectrogram: 3D representation of the audio frequencies as an image.
-)doc");
+    .SetShapeFn(SpectrogramShapeFn);
 
 REGISTER_OP("Mfcc")
     .Input("spectrogram: float")
@@ -228,26 +152,6 @@ REGISTER_OP("Mfcc")
     .Attr("filterbank_channel_count: int = 40")
     .Attr("dct_coefficient_count: int = 13")
     .Output("output: float")
-    .SetShapeFn(MfccShapeFn)
-    .Doc(R"doc(
-Transforms a spectrogram into a form that's useful for speech recognition.
-
-Mel Frequency Cepstral Coefficients are a way of representing audio data that's
-been effective as an input feature for machine learning. They are created by
-taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
-higher frequencies that are less significant to the human ear. They have a long
-history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
-is a good resource to learn more.
-
-spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
-  set to true.
-sample_rate: How many samples per second the source audio used.
-upper_frequency_limit: The highest frequency to use when calculating the
-  ceptstrum.
-lower_frequency_limit: The lowest frequency to use when calculating the
-  ceptstrum.
-filterbank_channel_count: Resolution of the Mel bank used internally.
-dct_coefficient_count: How many output channels to produce per time slice.
-)doc");
+    .SetShapeFn(MfccShapeFn);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/batch_ops.cc b/tensorflow/core/ops/batch_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0a62965eedd3c053dff558108f21e99a77407587
--- /dev/null
+++ b/tensorflow/core/ops/batch_ops.cc
@@ -0,0 +1,85 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("Batch")
+    .Input("in_tensors: T")
+    .Output("batched_tensors: T")
+    .Output("batch_index: int64")
+    .Output("id: int64")
+    .Attr("num_batch_threads: int")
+    .Attr("max_batch_size: int")
+    .Attr("max_enqueued_batches: int = 10")
+    .Attr("batch_timeout_micros: int")
+    .Attr("allowed_batch_sizes: list(int) = []")
+    .Attr("grad_timeout_micros: int")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("batching_queue: string = ''")
+    .Attr("T: list(type)")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      std::vector<shape_inference::ShapeHandle> in_shapes;
+      TF_RETURN_IF_ERROR(c->input("in_tensors", &in_shapes));
+      std::vector<shape_inference::ShapeHandle> out_shapes(in_shapes.size());
+      for (int i = 0; i < in_shapes.size(); ++i) {
+        TF_RETURN_IF_ERROR(
+            c->ReplaceDim(in_shapes[i], 0, c->UnknownDim(), &out_shapes[i]));
+      }
+      TF_RETURN_IF_ERROR(c->set_output("batched_tensors", out_shapes));
+      TF_RETURN_IF_ERROR(c->set_output("id", {c->Scalar()}));
+      TF_RETURN_IF_ERROR(c->set_output(
+          "batch_index",
+          {c->MakeShape({shape_inference::DimensionOrConstant(c->UnknownDim()),
+                         shape_inference::DimensionOrConstant(3)})}));
+      return Status::OK();
+    });
+
+REGISTER_OP("Unbatch")
+    .Input("batched_tensor: T")
+    .Input("batch_index: int64")
+    .Input("id: int64")
+    .Output("unbatched_tensor: T")
+    .Attr("timeout_micros: int")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("T: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle out_shape;
+      TF_RETURN_IF_ERROR(
+          c->ReplaceDim(c->input(0), 0, c->UnknownDim(), &out_shape));
+      c->set_output(0, out_shape);
+      return Status::OK();
+    });
+
+REGISTER_OP("UnbatchGrad")
+    .Input("original_input: T")
+    .Input("batch_index: int64")
+    .Input("grad: T")
+    .Input("id: int64")
+    .Output("batched_grad: T")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("T: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->UnknownShapeOfRank(c->Rank(c->input(2))));
+      return Status::OK();
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/bitwise_ops.cc b/tensorflow/core/ops/bitwise_ops.cc
index 2889953bdbc614bc4e56245e45c08d913cfd5255..39acf5f358b9c1388d56d884e4f27dc4656d9514 100644
--- a/tensorflow/core/ops/bitwise_ops.cc
+++ b/tensorflow/core/ops/bitwise_ops.cc
@@ -24,13 +24,7 @@ REGISTER_OP("Invert")
     .Input("x: T")
     .Output("y: T")
     .Attr("T: {int8, int16, int32, int64, uint8, uint16, uint32, uint64}")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Flips all bits elementwise.
-
-The result will have exactly those bits set, that are not set in `x`. The
-computation is performed on the underlying representation of x.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 #define BINARY_BITWISE()                                                     \
   Input("x: T")                                                              \
@@ -38,70 +32,22 @@ computation is performed on the underlying representation of x.
       .Output("z: T")                                                        \
       .SetIsCommutative()                                                    \
       .Attr("T: {int8, int16, int32, int64, uint8, uint16, uint32, uint64}") \
-      .SetShapeFn(shape_inference::UnchangedShape)
+      .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
 
 REGISTER_OP("PopulationCount")
     .Input("x: T")
     .Output("y: uint8")
     .Attr("T: {int8, int16, int32, int64, uint8, uint16, uint32, uint64}")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Computes element-wise population count (a.k.a. popcount, bitsum, bitcount).
-
-For each entry in `x`, calculates the number of `1` (on) bits in the binary
-representation of that entry.
-
-**NOTE**: It is more efficient to first `tf.bitcast` your tensors into
-`int32` or `int64` and perform the bitcount on the result, than to feed in
-8- or 16-bit inputs and then aggregate the resulting counts.
-)doc");
-
-REGISTER_OP("BitwiseAnd")
-    .BINARY_BITWISE()
-    .Doc(R"doc(
-Elementwise computes the bitwise AND of `x` and `y`.
-
-The result will have those bits set, that are set in both `x` and `y`. The
-computation is performed on the underlying representations of `x` and `y`.
-)doc");
-
-REGISTER_OP("BitwiseOr")
-    .BINARY_BITWISE()
-    .Doc(R"doc(
-Elementwise computes the bitwise OR of `x` and `y`.
-
-The result will have those bits set, that are set in `x`, `y` or both. The
-computation is performed on the underlying representations of `x` and `y`.
-)doc");
-
-REGISTER_OP("BitwiseXor")
-    .BINARY_BITWISE()
-    .Doc(R"doc(
-Elementwise computes the bitwise XOR of `x` and `y`.
-
-The result will have those bits set, that are different in `x` and `y`. The
-computation is performed on the underlying representations of `x` and `y`.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
-REGISTER_OP("LeftShift")
-    .BINARY_BITWISE()
-    .Doc(R"doc(
-Elementwise computes the bitwise left-shift of `x` and `y`.
+REGISTER_OP("BitwiseAnd").BINARY_BITWISE();
 
-If `y` is negative, or greater than or equal to the width of `x` in bits the
-result is implementation defined.
-)doc");
+REGISTER_OP("BitwiseOr").BINARY_BITWISE();
 
-REGISTER_OP("RightShift")
-    .BINARY_BITWISE()
-    .Doc(R"doc(
-Elementwise computes the bitwise right-shift of `x` and `y`.
+REGISTER_OP("BitwiseXor").BINARY_BITWISE();
 
-Performs a logical shift for unsigned integer types, and an arithmetic shift
-for signed integer types.
+REGISTER_OP("LeftShift").BINARY_BITWISE();
 
-If `y` is negative, or greater than or equal to than the width of `x` in bits
-the result is implementation defined.
-)doc");
+REGISTER_OP("RightShift").BINARY_BITWISE();
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/candidate_sampling_ops.cc b/tensorflow/core/ops/candidate_sampling_ops.cc
index 18700be67a667359d7a86d8f81ada383be973a0a..6e4d100b04fba22c170a654c9314e3a7e26fadda 100644
--- a/tensorflow/core/ops/candidate_sampling_ops.cc
+++ b/tensorflow/core/ops/candidate_sampling_ops.cc
@@ -55,42 +55,7 @@ REGISTER_OP("UniformCandidateSampler")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .SetShapeFn(CandidateSamplerShapeFn)
-    .SetIsStateful()
-    .Doc(R"doc(
-Generates labels for candidate sampling with a uniform distribution.
-
-See explanations of candidate sampling and the data formats at
-go/candidate-sampling.
-
-For each batch, this op picks a single set of sampled candidate labels.
-
-The advantages of sampling candidates per-batch are simplicity and the
-possibility of efficient dense matrix multiplication. The disadvantage is that
-the sampled candidates must be chosen independently of the context and of the
-true labels.
-
-true_classes: A batch_size * num_true matrix, in which each row contains the
-  IDs of the num_true target_classes in the corresponding original label.
-sampled_candidates: A vector of length num_sampled, in which each element is
-  the ID of a sampled candidate.
-true_expected_count: A batch_size * num_true matrix, representing
-  the number of times each candidate is expected to occur in a batch
-  of sampled candidates. If unique=true, then this is a probability.
-sampled_expected_count: A vector of length num_sampled, for each sampled
-  candidate representing the number of times the candidate is expected
-  to occur in a batch of sampled candidates.  If unique=true, then this is a
-  probability.
-num_true: Number of true labels per context.
-num_sampled: Number of candidates to randomly sample.
-unique: If unique is true, we sample with rejection, so that all sampled
-  candidates in a batch are unique. This requires some approximation to
-  estimate the post-rejection sampling probabilities.
-range_max: The sampler will sample integers from the interval [0, range_max).
-seed: If either seed or seed2 are set to be non-zero, the random number
-  generator is seeded by the given seed.  Otherwise, it is seeded by a
-  random seed.
-seed2: An second seed to avoid seed collision.
-)doc");
+    .SetIsStateful();
 
 REGISTER_OP("LogUniformCandidateSampler")
     .Input("true_classes: int64")
@@ -104,43 +69,7 @@ REGISTER_OP("LogUniformCandidateSampler")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .SetShapeFn(CandidateSamplerShapeFn)
-    .SetIsStateful()
-    .Doc(R"doc(
-Generates labels for candidate sampling with a log-uniform distribution.
-
-See explanations of candidate sampling and the data formats at
-go/candidate-sampling.
-
-For each batch, this op picks a single set of sampled candidate labels.
-
-The advantages of sampling candidates per-batch are simplicity and the
-possibility of efficient dense matrix multiplication. The disadvantage is that
-the sampled candidates must be chosen independently of the context and of the
-true labels.
-
-
-true_classes: A batch_size * num_true matrix, in which each row contains the
-  IDs of the num_true target_classes in the corresponding original label.
-sampled_candidates: A vector of length num_sampled, in which each element is
-  the ID of a sampled candidate.
-true_expected_count: A batch_size * num_true matrix, representing
-  the number of times each candidate is expected to occur in a batch
-  of sampled candidates. If unique=true, then this is a probability.
-sampled_expected_count: A vector of length num_sampled, for each sampled
-  candidate representing the number of times the candidate is expected
-  to occur in a batch of sampled candidates.  If unique=true, then this is a
-  probability.
-num_true: Number of true labels per context.
-num_sampled: Number of candidates to randomly sample.
-unique: If unique is true, we sample with rejection, so that all sampled
-  candidates in a batch are unique. This requires some approximation to
-  estimate the post-rejection sampling probabilities.
-range_max: The sampler will sample integers from the interval [0, range_max).
-seed: If either seed or seed2 are set to be non-zero, the random number
-  generator is seeded by the given seed.  Otherwise, it is seeded by a
-  random seed.
-seed2: An second seed to avoid seed collision.
-)doc");
+    .SetIsStateful();
 
 REGISTER_OP("LearnedUnigramCandidateSampler")
     .Input("true_classes: int64")
@@ -154,42 +83,7 @@ REGISTER_OP("LearnedUnigramCandidateSampler")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .SetShapeFn(CandidateSamplerShapeFn)
-    .SetIsStateful()
-    .Doc(R"doc(
-Generates labels for candidate sampling with a learned unigram distribution.
-
-See explanations of candidate sampling and the data formats at
-go/candidate-sampling.
-
-For each batch, this op picks a single set of sampled candidate labels.
-
-The advantages of sampling candidates per-batch are simplicity and the
-possibility of efficient dense matrix multiplication. The disadvantage is that
-the sampled candidates must be chosen independently of the context and of the
-true labels.
-
-true_classes: A batch_size * num_true matrix, in which each row contains the
-  IDs of the num_true target_classes in the corresponding original label.
-sampled_candidates: A vector of length num_sampled, in which each element is
-  the ID of a sampled candidate.
-true_expected_count: A batch_size * num_true matrix, representing
-  the number of times each candidate is expected to occur in a batch
-  of sampled candidates. If unique=true, then this is a probability.
-sampled_expected_count: A vector of length num_sampled, for each sampled
-  candidate representing the number of times the candidate is expected
-  to occur in a batch of sampled candidates.  If unique=true, then this is a
-  probability.
-num_true: Number of true labels per context.
-num_sampled: Number of candidates to randomly sample.
-unique: If unique is true, we sample with rejection, so that all sampled
-  candidates in a batch are unique. This requires some approximation to
-  estimate the post-rejection sampling probabilities.
-range_max: The sampler will sample integers from the interval [0, range_max).
-seed: If either seed or seed2 are set to be non-zero, the random number
-  generator is seeded by the given seed.  Otherwise, it is seeded by a
-  random seed.
-seed2: An second seed to avoid seed collision.
-)doc");
+    .SetIsStateful();
 
 REGISTER_OP("ThreadUnsafeUnigramCandidateSampler")
     .Input("true_classes: int64")
@@ -203,42 +97,7 @@ REGISTER_OP("ThreadUnsafeUnigramCandidateSampler")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .SetShapeFn(CandidateSamplerShapeFn)
-    .SetIsStateful()
-    .Doc(R"doc(
-Generates labels for candidate sampling with a learned unigram distribution.
-
-See explanations of candidate sampling and the data formats at
-go/candidate-sampling.
-
-For each batch, this op picks a single set of sampled candidate labels.
-
-The advantages of sampling candidates per-batch are simplicity and the
-possibility of efficient dense matrix multiplication. The disadvantage is that
-the sampled candidates must be chosen independently of the context and of the
-true labels.
-
-true_classes: A batch_size * num_true matrix, in which each row contains the
-  IDs of the num_true target_classes in the corresponding original label.
-sampled_candidates: A vector of length num_sampled, in which each element is
-  the ID of a sampled candidate.
-true_expected_count: A batch_size * num_true matrix, representing
-  the number of times each candidate is expected to occur in a batch
-  of sampled candidates. If unique=true, then this is a probability.
-sampled_expected_count: A vector of length num_sampled, for each sampled
-  candidate representing the number of times the candidate is expected
-  to occur in a batch of sampled candidates.  If unique=true, then this is a
-  probability.
-num_true: Number of true labels per context.
-num_sampled: Number of candidates to randomly sample.
-unique: If unique is true, we sample with rejection, so that all sampled
-  candidates in a batch are unique. This requires some approximation to
-  estimate the post-rejection sampling probabilities.
-range_max: The sampler will sample integers from the interval [0, range_max).
-seed: If either seed or seed2 are set to be non-zero, the random number
-  generator is seeded by the given seed.  Otherwise, it is seeded by a
-  random seed.
-seed2: An second seed to avoid seed collision.
-)doc");
+    .SetIsStateful();
 
 REGISTER_OP("FixedUnigramCandidateSampler")
     .Input("true_classes: int64")
@@ -258,70 +117,7 @@ REGISTER_OP("FixedUnigramCandidateSampler")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .SetShapeFn(CandidateSamplerShapeFn)
-    .SetIsStateful()
-    .Doc(R"doc(
-Generates labels for candidate sampling with a learned unigram distribution.
-
-A unigram sampler could use a fixed unigram distribution read from a
-file or passed in as an in-memory array instead of building up the distribution
-from data on the fly. There is also an option to skew the distribution by
-applying a distortion power to the weights.
-
-The vocabulary file should be in CSV-like format, with the last field
-being the weight associated with the word.
-
-For each batch, this op picks a single set of sampled candidate labels.
-
-The advantages of sampling candidates per-batch are simplicity and the
-possibility of efficient dense matrix multiplication. The disadvantage is that
-the sampled candidates must be chosen independently of the context and of the
-true labels.
-
-true_classes: A batch_size * num_true matrix, in which each row contains the
-  IDs of the num_true target_classes in the corresponding original label.
-sampled_candidates: A vector of length num_sampled, in which each element is
-  the ID of a sampled candidate.
-true_expected_count: A batch_size * num_true matrix, representing
-  the number of times each candidate is expected to occur in a batch
-  of sampled candidates. If unique=true, then this is a probability.
-sampled_expected_count: A vector of length num_sampled, for each sampled
-  candidate representing the number of times the candidate is expected
-  to occur in a batch of sampled candidates.  If unique=true, then this is a
-  probability.
-num_true: Number of true labels per context.
-num_sampled: Number of candidates to randomly sample.
-unique: If unique is true, we sample with rejection, so that all sampled
-  candidates in a batch are unique. This requires some approximation to
-  estimate the post-rejection sampling probabilities.
-range_max: The sampler will sample integers from the interval [0, range_max).
-vocab_file: Each valid line in this file (which should have a CSV-like format)
-  corresponds to a valid word ID. IDs are in sequential order, starting from
-  num_reserved_ids. The last entry in each line is expected to be a value
-  corresponding to the count or relative probability. Exactly one of vocab_file
-  and unigrams needs to be passed to this op.
-distortion: The distortion is used to skew the unigram probability distribution.
-  Each weight is first raised to the distortion's power before adding to the
-  internal unigram distribution. As a result, distortion = 1.0 gives regular
-  unigram sampling (as defined by the vocab file), and distortion = 0.0 gives
-  a uniform distribution.
-num_reserved_ids: Optionally some reserved IDs can be added in the range [0,
-  ..., num_reserved_ids) by the users. One use case is that a special unknown
-  word token is used as ID 0. These IDs will have a sampling probability of 0.
-num_shards: A sampler can be used to sample from a subset of the original range
-  in order to speed up the whole computation through parallelism. This parameter
-  (together with 'shard') indicates the number of partitions that are being
-  used in the overall computation.
-shard: A sampler can be used to sample from a subset of the original range
-  in order to speed up the whole computation through parallelism. This parameter
-  (together with 'num_shards') indicates the particular partition number of a
-  sampler op, when partitioning is being used.
-unigrams: A list of unigram counts or probabilities, one per ID in sequential
-  order. Exactly one of vocab_file and unigrams should be passed to this op.
-seed: If either seed or seed2 are set to be non-zero, the random number
-  generator is seeded by the given seed.  Otherwise, it is seeded by a
-  random seed.
-seed2: An second seed to avoid seed collision.
-)doc");
+    .SetIsStateful();
 
 REGISTER_OP("AllCandidateSampler")
     .Input("true_classes: int64")
@@ -334,41 +130,7 @@ REGISTER_OP("AllCandidateSampler")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .SetShapeFn(CandidateSamplerShapeFn)
-    .SetIsStateful()
-    .Doc(R"doc(
-Generates labels for candidate sampling with a learned unigram distribution.
-
-See explanations of candidate sampling and the data formats at
-go/candidate-sampling.
-
-For each batch, this op picks a single set of sampled candidate labels.
-
-The advantages of sampling candidates per-batch are simplicity and the
-possibility of efficient dense matrix multiplication. The disadvantage is that
-the sampled candidates must be chosen independently of the context and of the
-true labels.
-
-true_classes: A batch_size * num_true matrix, in which each row contains the
-  IDs of the num_true target_classes in the corresponding original label.
-sampled_candidates: A vector of length num_sampled, in which each element is
-  the ID of a sampled candidate.
-true_expected_count: A batch_size * num_true matrix, representing
-  the number of times each candidate is expected to occur in a batch
-  of sampled candidates. If unique=true, then this is a probability.
-sampled_expected_count: A vector of length num_sampled, for each sampled
-  candidate representing the number of times the candidate is expected
-  to occur in a batch of sampled candidates.  If unique=true, then this is a
-  probability.
-num_true: Number of true labels per context.
-num_sampled: Number of candidates to produce.
-unique: If unique is true, we sample with rejection, so that all sampled
-  candidates in a batch are unique. This requires some approximation to
-  estimate the post-rejection sampling probabilities.
-seed: If either seed or seed2 are set to be non-zero, the random number
-  generator is seeded by the given seed.  Otherwise, it is seeded by a
-  random seed.
-seed2: An second seed to avoid seed collision.
-)doc");
+    .SetIsStateful();
 
 REGISTER_OP("ComputeAccidentalHits")
     .Input("true_classes: int64")
@@ -396,27 +158,6 @@ REGISTER_OP("ComputeAccidentalHits")
       c->set_output(1, v);
       c->set_output(2, v);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the ids of the positions in sampled_candidates that match true_labels.
-
-When doing log-odds NCE, the result of this op should be passed through a
-SparseToDense op, then added to the logits of the sampled candidates. This has
-the effect of 'removing' the sampled labels that match the true labels by
-making the classifier sure that they are sampled labels.
-
-true_classes: The true_classes output of UnpackSparseLabels.
-sampled_candidates: The sampled_candidates output of CandidateSampler.
-indices: A vector of indices corresponding to rows of true_candidates.
-ids: A vector of IDs of positions in sampled_candidates that match a true_label
-  for the row with the corresponding index in indices.
-weights: A vector of the same length as indices and ids, in which each element
-  is -FLOAT_MAX.
-num_true: Number of true labels per context.
-seed: If either seed or seed2 are set to be non-zero, the random number
-  generator is seeded by the given seed.  Otherwise, it is seeded by a
-  random seed.
-seed2: An second seed to avoid seed collision.
-)doc");
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/candidate_sampling_ops_test.cc b/tensorflow/core/ops/candidate_sampling_ops_test.cc
index c79b4439148e5795e313c71bbce35c82242cd335..f367371604097b7a500d746a3b8a8a5906082cbb 100644
--- a/tensorflow/core/ops/candidate_sampling_ops_test.cc
+++ b/tensorflow/core/ops/candidate_sampling_ops_test.cc
@@ -23,9 +23,12 @@ namespace tensorflow {
 
 TEST(CandidateSamplerOpsTest, CandidateSampler_ShapeFn) {
   for (const char* op_name : {
-           "AllCandidateSampler", "FixedUnigramCandidateSampler",
-           "LearnedUnigramCandidateSampler", "LogUniformCandidateSampler",
-           "ThreadUnsafeUnigramCandidateSampler", "UniformCandidateSampler",
+           "AllCandidateSampler",
+           "FixedUnigramCandidateSampler",
+           "LearnedUnigramCandidateSampler",
+           "LogUniformCandidateSampler",
+           "ThreadUnsafeUnigramCandidateSampler",
+           "UniformCandidateSampler",
        }) {
     ShapeInferenceTestOp op(op_name);
     TF_ASSERT_OK(NodeDefBuilder("test", op.name)
diff --git a/tensorflow/core/ops/checkpoint_ops.cc b/tensorflow/core/ops/checkpoint_ops.cc
index 08b00c8255c8e44cea9a2e0d4c97378ecc3bb998..5fe82e165313683b732d39e40266df2d31c71231 100644
--- a/tensorflow/core/ops/checkpoint_ops.cc
+++ b/tensorflow/core/ops/checkpoint_ops.cc
@@ -38,49 +38,7 @@ REGISTER_OP("GenerateVocabRemapping")
       c->set_output(0, c->Vector(num_new_vocab));
       c->set_output(1, c->Scalar());
       return Status::OK();
-    })
-    .Doc(R"doc(
-Given a path to new and old vocabulary files, returns a remapping Tensor of
-length `num_new_vocab`, where `remapping[i]` contains the row number in the old
-vocabulary that corresponds to row `i` in the new vocabulary (starting at line
-`new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
-in the new vocabulary is not in the old vocabulary.  The old vocabulary is
-constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
-default value of -1.
-
-`num_vocab_offset` enables
-use in the partitioned variable case, and should generally be set through
-examining partitioning info.  The format of the files should be a text file,
-with each line containing a single entity within the vocabulary.
-
-For example, with `new_vocab_file` a text file containing each of the following
-elements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],
-`num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be
-`[0, -1, 2]`.
-
-The op also returns a count of how many entries in the new vocabulary
-were present in the old vocabulary, which is used to calculate the number of
-values to initialize in a weight matrix remapping
-
-This functionality can be used to remap both row vocabularies (typically,
-features) and column vocabularies (typically, classes) from TensorFlow
-checkpoints.  Note that the partitioning logic relies on contiguous vocabularies
-corresponding to div-partitioned variables.  Moreover, the underlying remapping
-uses an IndexTable (as opposed to an inexact CuckooTable), so client code should
-use the corresponding index_table_from_file() as the FeatureColumn framework
-does (as opposed to tf.feature_to_id(), which uses a CuckooTable).
-
-new_vocab_file: Path to the new vocab file.
-old_vocab_file: Path to the old vocab file.
-new_vocab_offset: How many entries into the new vocab file to start reading.
-num_new_vocab: Number of entries in the new vocab file to remap.
-old_vocab_size: Number of entries in the old vocab file to consider.  If -1,
-  use the entire old vocabulary.
-remapping: A Tensor of length num_new_vocab where the element at index i
-  is equal to the old ID that maps to the new ID i.  This element is -1 for any
-  new ID that is not found in the old vocabulary.
-num_present: Number of new vocab entries found in old vocab.
-)doc");
+    });
 
 REGISTER_OP("LoadAndRemapMatrix")
     .Input("ckpt_path: string")
@@ -109,63 +67,5 @@ REGISTER_OP("LoadAndRemapMatrix")
 
       c->set_output(0, c->Matrix(num_rows, num_cols));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint
-at `ckpt_path` and potentially reorders its rows and columns using the
-specified remappings.
-
-Most users should use one of the wrapper initializers (such as
-`tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
-function directly.
-
-The remappings are 1-D tensors with the following properties:
-
-* `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
-  matrix will be initialized from the row corresponding to index
-  `row_remapping[i]` in the old `Tensor` from the checkpoint.
-* `col_remapping` must have either 0 entries (indicating that no column
-  reordering is needed) or `num_cols` entries. If specified, column `j` of the
-  output matrix will be initialized from the column corresponding to index
-  `col_remapping[j]` in the old `Tensor` from the checkpoint.
-* A value of -1 in either of the remappings signifies a "missing" entry. In that
-  case, values from the `initializing_values` tensor will be used to fill that
-  missing row or column. If `row_remapping` has `r` missing entries and
-  `col_remapping` has `c` missing entries, then the following condition must be
-  true:
-
-`(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
-
-The remapping tensors can be generated using the GenerateVocabRemapping op.
-
-As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
-initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
-the value from row i, column j of the old tensor in the checkpoint, the output
-matrix will look like the following:
-
-[[w(1, 0),  w(1, 2),  0.5],
- [w(0, 0),  w(0, 2), -0.5],
- [0.25,    -0.25,      42]]
-
-ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from
-  which the old matrix `Tensor` will be loaded.
-old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
-row_remapping: An int `Tensor` of row remappings (generally created by
-  `generate_vocab_remapping`).  Even if no row remapping is needed, this must
-  still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted
-  index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`).
-col_remapping: An int `Tensor` of column remappings (generally created by
-  `generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping
-  is to be done (e.g. column ordering is the same).
-initializing_values: A float `Tensor` containing  values to fill in for cells
-  in the output matrix that are not loaded from the checkpoint. Length must be
-  exactly the same as the number of missing / new cells.
-num_rows: Number of rows (length of the 1st dimension) in the output matrix.
-num_cols: Number of columns (length of the 2nd dimension) in the output matrix.
-max_rows_in_memory: The maximum number of rows to load from the checkpoint at
-  once. If less than or equal to 0, the entire matrix will be loaded into
-  memory. Setting this arg trades increased disk reads for lower memory usage.
-output_matrix: Output matrix containing existing values loaded from the
-  checkpoint, and with any missing values filled in from initializing_values.
-)doc");
+    });
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/compat/backwards_compatibility_test.cc b/tensorflow/core/ops/compat/backwards_compatibility_test.cc
index add05d6610ae62158b653d27699f61bc511ee3b6..6e05ae4be4fb967ac8dcc5a03fa548c7cb6c0f9b 100644
--- a/tensorflow/core/ops/compat/backwards_compatibility_test.cc
+++ b/tensorflow/core/ops/compat/backwards_compatibility_test.cc
@@ -25,8 +25,9 @@ namespace tensorflow {
 namespace {
 
 TEST(BackwardsCompatibilityTest, IsCompatible) {
-  OpCompatibilityLib compatibility(
-      "tensorflow/core/ops", strings::StrCat("v", TF_MAJOR_VERSION), nullptr);
+  OpCompatibilityLib compatibility("tensorflow/core/ops",
+                                   strings::StrCat("v", TF_MAJOR_VERSION),
+                                   nullptr);
 
   Env* env = Env::Default();
   int changed_ops = 0;
diff --git a/tensorflow/core/ops/compat/op_compatibility_lib.cc b/tensorflow/core/ops/compat/op_compatibility_lib.cc
index 61243d2bd23b6407b539171d4c39a7792b9fae91..45017c9da5ef28828329989c00ff0409994a7ce5 100644
--- a/tensorflow/core/ops/compat/op_compatibility_lib.cc
+++ b/tensorflow/core/ops/compat/op_compatibility_lib.cc
@@ -146,6 +146,11 @@ Status OpCompatibilityLib::ValidateCompatible(Env* env, int* changed_ops,
               OpDefCompatible(in_op_history.op(i), op_list_.op(cur)));
         }
 
+        // Verify default value of attrs has not been added/removed/modified
+        // as compared to only the last historical version.
+        TF_RETURN_IF_ERROR(OpDefAttrDefaultsUnchanged(in_op_history.op(end - 1),
+                                                      op_list_.op(cur)));
+
         // Check that attrs missing from in_op_history.op(start) don't
         // change their defaults.
         if (start < end - 1) {
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index c7a296d9381b5263617ae9cb014856f234733fd9..fc9e5b02a2253621203a47c5f7d1b7d311c82a97 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -39,6 +39,79 @@ op {
     }
   }
 }
+op {
+  name: "Abs"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "AccumulateNV2"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "sum"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_aggregate: true
+  is_commutative: true
+}
 op {
   name: "AccumulateNV2"
   input_arg {
@@ -77,6 +150,56 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_aggregate: true
+  is_commutative: true
+}
+op {
+  name: "AccumulateNV2"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "sum"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -165,6 +288,88 @@ op {
     }
   }
 }
+op {
+  name: "AccumulatorApplyGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "local_step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "AccumulatorApplyGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "local_step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
 op {
   name: "AccumulatorNumAccumulated"
   input_arg {
@@ -267,6 +472,114 @@ op {
     }
   }
 }
+op {
+  name: "AccumulatorTakeGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "num_required"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "average"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "AccumulatorTakeGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "num_required"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "average"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Acos"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Acos"
   input_arg {
@@ -283,6 +596,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -317,6 +631,65 @@ op {
     }
   }
 }
+op {
+  name: "Acosh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Add"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
 op {
   name: "Add"
   input_arg {
@@ -337,6 +710,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -519,6 +893,98 @@ op {
   is_aggregate: true
   is_commutative: true
 }
+op {
+  name: "AddN"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "sum"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+        type: DT_VARIANT
+      }
+    }
+  }
+  is_aggregate: true
+  is_commutative: true
+}
+op {
+  name: "AddN"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "sum"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_VARIANT
+      }
+    }
+  }
+  is_aggregate: true
+  is_commutative: true
+}
 op {
   name: "AddSparseToTensorsMap"
   input_arg {
@@ -592,6 +1058,42 @@ op {
   is_aggregate: true
   is_commutative: true
 }
+op {
+  name: "AddV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_aggregate: true
+  is_commutative: true
+}
 op {
   name: "AdjustContrast"
   input_arg {
@@ -1023,7 +1525,7 @@ op {
   }
 }
 op {
-  name: "ApplyAdagrad"
+  name: "ApplyAdadelta"
   input_arg {
     name: "var"
     type_attr: "T"
@@ -1034,15 +1536,28 @@ op {
     type_attr: "T"
     is_ref: true
   }
+  input_arg {
+    name: "accum_update"
+    type_attr: "T"
+    is_ref: true
+  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "rho"
     type_attr: "T"
   }
-  output_arg {
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
     name: "out"
     type_attr: "T"
     is_ref: true
@@ -1066,6 +1581,9 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -1078,7 +1596,7 @@ op {
   }
 }
 op {
-  name: "ApplyAdagrad"
+  name: "ApplyAdadelta"
   input_arg {
     name: "var"
     type_attr: "T"
@@ -1089,10 +1607,23 @@ op {
     type_attr: "T"
     is_ref: true
   }
+  input_arg {
+    name: "accum_update"
+    type_attr: "T"
+    is_ref: true
+  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
   input_arg {
     name: "grad"
     type_attr: "T"
@@ -1109,17 +1640,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -1135,42 +1667,25 @@ op {
   }
 }
 op {
-  name: "ApplyAdagradDA"
+  name: "ApplyAdagrad"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "gradient_accumulator"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "gradient_squared_accumulator"
+    name: "accum"
     type_attr: "T"
     is_ref: true
   }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
+    name: "grad"
     type_attr: "T"
   }
-  input_arg {
-    name: "global_step"
-    type: DT_INT64
-  }
   output_arg {
     name: "out"
     type_attr: "T"
@@ -1207,42 +1722,25 @@ op {
   }
 }
 op {
-  name: "ApplyAdagradDA"
+  name: "ApplyAdagrad"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "gradient_accumulator"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "gradient_squared_accumulator"
+    name: "accum"
     type_attr: "T"
     is_ref: true
   }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
+    name: "grad"
     type_attr: "T"
   }
-  input_arg {
-    name: "global_step"
-    type: DT_INT64
-  }
   output_arg {
     name: "out"
     type_attr: "T"
@@ -1281,44 +1779,77 @@ op {
   }
 }
 op {
-  name: "ApplyAdam"
+  name: "ApplyAdagrad"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "m"
+    name: "accum"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "v"
+    name: "lr"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "beta1_power"
+    name: "grad"
     type_attr: "T"
   }
-  input_arg {
-    name: "beta2_power"
+  output_arg {
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
+}
+op {
+  name: "ApplyAdagrad"
   input_arg {
-    name: "beta1"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "beta2"
+    name: "accum"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "epsilon"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
@@ -1337,18 +1868,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -1361,28 +1895,24 @@ op {
   }
 }
 op {
-  name: "ApplyAdam"
+  name: "ApplyAdagradDA"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "m"
+    name: "gradient_accumulator"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "v"
+    name: "gradient_squared_accumulator"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "beta1_power"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "beta2_power"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
@@ -1390,20 +1920,16 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "beta1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "beta2"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "global_step"
+    type: DT_INT64
   }
   output_arg {
     name: "out"
@@ -1439,37 +1965,26 @@ op {
       b: false
     }
   }
-  attr {
-    name: "use_nesterov"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ApplyAdam"
+  name: "ApplyAdagradDA"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "m"
+    name: "gradient_accumulator"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "v"
+    name: "gradient_squared_accumulator"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "beta1_power"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "beta2_power"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
@@ -1477,20 +1992,16 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "beta1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "beta2"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "global_step"
+    type: DT_INT64
   }
   output_arg {
     name: "out"
@@ -1528,46 +2039,44 @@ op {
       b: false
     }
   }
-  attr {
-    name: "use_nesterov"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ApplyAddSign"
+  name: "ApplyAdagradDA"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "m"
+    name: "gradient_accumulator"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "lr"
+    name: "gradient_squared_accumulator"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "alpha"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "sign_decay"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "beta"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "l2"
     type_attr: "T"
   }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
   output_arg {
     name: "out"
     type_attr: "T"
@@ -1594,6 +2103,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -1606,46 +2116,41 @@ op {
   }
 }
 op {
-  name: "ApplyCenteredRMSProp"
+  name: "ApplyAdagradDA"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "mg"
+    name: "gradient_accumulator"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "ms"
+    name: "gradient_squared_accumulator"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "mom"
+    name: "grad"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "momentum"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "global_step"
+    type: DT_INT64
   }
   output_arg {
     name: "out"
@@ -1659,18 +2164,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -1683,37 +2191,40 @@ op {
   }
 }
 op {
-  name: "ApplyCenteredRMSProp"
+  name: "ApplyAdam"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "mg"
+    name: "m"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "ms"
+    name: "v"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "mom"
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
+    name: "beta1"
     type_attr: "T"
   }
   input_arg {
-    name: "momentum"
+    name: "beta2"
     type_attr: "T"
   }
   input_arg {
@@ -1748,8 +2259,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -1762,24 +2271,28 @@ op {
   }
 }
 op {
-  name: "ApplyFtrl"
+  name: "ApplyAdam"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
+    name: "m"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "linear"
+    name: "v"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "grad"
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
     type_attr: "T"
   }
   input_arg {
@@ -1787,15 +2300,19 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "beta1"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "beta2"
     type_attr: "T"
   }
   input_arg {
-    name: "lr_power"
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -1832,26 +2349,37 @@ op {
       b: false
     }
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "ApplyFtrl"
+  name: "ApplyAdam"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
+    name: "m"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "linear"
+    name: "v"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "grad"
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
     type_attr: "T"
   }
   input_arg {
@@ -1859,15 +2387,19 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "beta1"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "beta2"
     type_attr: "T"
   }
   input_arg {
-    name: "lr_power"
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -1906,26 +2438,37 @@ op {
       b: false
     }
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "ApplyFtrlV2"
+  name: "ApplyAdam"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
+    name: "m"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "linear"
+    name: "v"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "grad"
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
     type_attr: "T"
   }
   input_arg {
@@ -1933,19 +2476,19 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "beta1"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "beta2"
     type_attr: "T"
   }
   input_arg {
-    name: "l2_shrinkage"
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
-    name: "lr_power"
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -1972,6 +2515,9 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -1982,26 +2528,37 @@ op {
       b: false
     }
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "ApplyFtrlV2"
+  name: "ApplyAdam"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
+    name: "m"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "linear"
+    name: "v"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "grad"
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
     type_attr: "T"
   }
   input_arg {
@@ -2009,19 +2566,19 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "beta1"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "beta2"
     type_attr: "T"
   }
   input_arg {
-    name: "l2_shrinkage"
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
-    name: "lr_power"
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -2036,17 +2593,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -2060,70 +2618,44 @@ op {
       b: false
     }
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "ApplyGradientDescent"
+  name: "ApplyAddSign"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "alpha"
+    name: "m"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "delta"
+    name: "lr"
     type_attr: "T"
   }
-  output_arg {
-    name: "out"
+  input_arg {
+    name: "alpha"
     type_attr: "T"
-    is_ref: true
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-}
-op {
-  name: "ApplyGradientDescent"
   input_arg {
-    name: "var"
+    name: "sign_decay"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "alpha"
+    name: "beta"
     type_attr: "T"
   }
   input_arg {
-    name: "delta"
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -2164,14 +2696,14 @@ op {
   }
 }
 op {
-  name: "ApplyMomentum"
+  name: "ApplyAddSign"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
+    name: "m"
     type_attr: "T"
     is_ref: true
   }
@@ -2180,11 +2712,19 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "alpha"
     type_attr: "T"
   }
   input_arg {
-    name: "momentum"
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -2211,6 +2751,9 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -2221,23 +2764,16 @@ op {
       b: false
     }
   }
-  attr {
-    name: "use_nesterov"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ApplyMomentum"
+  name: "ApplyAddSign"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
+    name: "m"
     type_attr: "T"
     is_ref: true
   }
@@ -2246,11 +2782,19 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "alpha"
     type_attr: "T"
   }
   input_arg {
-    name: "momentum"
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -2265,17 +2809,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -2289,23 +2834,26 @@ op {
       b: false
     }
   }
-  attr {
-    name: "use_nesterov"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ApplyPowerSign"
+  name: "ApplyCenteredRMSProp"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "m"
+    name: "mg"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
     type_attr: "T"
     is_ref: true
   }
@@ -2314,15 +2862,15 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "logbase"
+    name: "rho"
     type_attr: "T"
   }
   input_arg {
-    name: "sign_decay"
+    name: "momentum"
     type_attr: "T"
   }
   input_arg {
-    name: "beta"
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
@@ -2353,8 +2901,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -2367,14 +2913,24 @@ op {
   }
 }
 op {
-  name: "ApplyProximalAdagrad"
+  name: "ApplyCenteredRMSProp"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
+    name: "mg"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
     type_attr: "T"
     is_ref: true
   }
@@ -2383,11 +2939,15 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "rho"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
@@ -2418,6 +2978,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -2430,14 +2992,24 @@ op {
   }
 }
 op {
-  name: "ApplyProximalAdagrad"
+  name: "ApplyCenteredRMSProp"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
+    name: "mg"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
     type_attr: "T"
     is_ref: true
   }
@@ -2446,11 +3018,15 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "rho"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
@@ -2483,6 +3059,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -2495,26 +3072,45 @@ op {
   }
 }
 op {
-  name: "ApplyProximalGradientDescent"
+  name: "ApplyCenteredRMSProp"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "alpha"
+    name: "mg"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "l1"
+    name: "ms"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "l2"
+    name: "mom"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "delta"
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -2529,18 +3125,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -2553,14 +3152,28 @@ op {
   }
 }
 op {
-  name: "ApplyProximalGradientDescent"
+  name: "ApplyFtrl"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "alpha"
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
@@ -2572,7 +3185,7 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "delta"
+    name: "lr_power"
     type_attr: "T"
   }
   output_arg {
@@ -2599,8 +3212,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -2613,40 +3224,40 @@ op {
   }
 }
 op {
-  name: "ApplyRMSProp"
+  name: "ApplyFtrl"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "ms"
+    name: "accum"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "mom"
+    name: "linear"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "lr"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "momentum"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "lr_power"
     type_attr: "T"
   }
   output_arg {
@@ -2673,6 +3284,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -2685,40 +3298,40 @@ op {
   }
 }
 op {
-  name: "ApplyRMSProp"
+  name: "ApplyFtrl"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "ms"
+    name: "accum"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "mom"
+    name: "linear"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "lr"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "momentum"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "lr_power"
     type_attr: "T"
   }
   output_arg {
@@ -2747,6 +3360,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -2759,18 +3373,46 @@ op {
   }
 }
 op {
-  name: "ApproximateEqual"
+  name: "ApplyFtrl"
   input_arg {
-    name: "x"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "y"
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -2779,43 +3421,77 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "tolerance"
-    type: "float"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      f: 1e-05
+      b: false
     }
   }
-  is_commutative: true
 }
 op {
-  name: "ApproximateEqual"
+  name: "ApplyFtrlV2"
   input_arg {
-    name: "x"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "y"
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -2836,33 +3512,62 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "tolerance"
-    type: "float"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      f: 1e-05
+      b: false
     }
   }
-  is_commutative: true
 }
 op {
-  name: "ArgMax"
+  name: "ApplyFtrlV2"
   input_arg {
-    name: "input"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "dimension"
-    type_attr: "Tidx"
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type: DT_INT64
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -2883,36 +3588,64 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      b: false
     }
   }
 }
 op {
-  name: "ArgMax"
+  name: "ApplyFtrlV2"
   input_arg {
-    name: "input"
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "dimension"
-    type_attr: "Tidx"
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "output_type"
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -2933,49 +3666,118 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_INT32
+      b: false
     }
+  }
+}
+op {
+  name: "ApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
         type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "output_type"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_INT64
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      b: false
     }
   }
 }
 op {
-  name: "ArgMax"
+  name: "ApplyGradientDescent"
   input_arg {
-    name: "input"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "dimension"
-    type_attr: "Tidx"
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "output_type"
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -2996,51 +3798,36 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
       }
     }
   }
   attr {
-    name: "output_type"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_INT64
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      b: false
     }
   }
 }
 op {
-  name: "ArgMin"
+  name: "ApplyGradientDescent"
   input_arg {
-    name: "input"
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "alpha"
     type_attr: "T"
   }
   input_arg {
-    name: "dimension"
-    type_attr: "Tidx"
+    name: "delta"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type: DT_INT64
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -3061,36 +3848,38 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      b: false
     }
   }
 }
 op {
-  name: "ArgMin"
+  name: "ApplyGradientDescent"
   input_arg {
-    name: "input"
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "alpha"
     type_attr: "T"
   }
   input_arg {
-    name: "dimension"
-    type_attr: "Tidx"
+    name: "delta"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "output_type"
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -3111,49 +3900,39 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "output_type"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_INT64
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      b: false
     }
   }
 }
 op {
-  name: "ArgMin"
+  name: "ApplyGradientDescent"
   input_arg {
-    name: "input"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "dimension"
-    type_attr: "Tidx"
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "output_type"
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -3162,17 +3941,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -3180,217 +3960,242 @@ op {
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "output_type"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_INT64
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      b: false
     }
   }
 }
 op {
-  name: "AsString"
+  name: "ApplyMomentum"
   input_arg {
-    name: "input"
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type: DT_STRING
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_BOOL
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
         type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "precision"
-    type: "int"
-    default_value {
-      i: -1
-    }
-  }
-  attr {
-    name: "scientific"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
   attr {
-    name: "shortest"
+    name: "use_nesterov"
     type: "bool"
     default_value {
       b: false
     }
   }
-  attr {
-    name: "width"
-    type: "int"
-    default_value {
-      i: -1
-    }
-  }
-  attr {
-    name: "fill"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
 }
 op {
-  name: "Asin"
+  name: "ApplyMomentum"
   input_arg {
-    name: "x"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
-  output_arg {
-    name: "y"
+  input_arg {
+    name: "accum"
     type_attr: "T"
+    is_ref: true
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-}
-op {
-  name: "Asinh"
   input_arg {
-    name: "x"
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-}
-op {
-  name: "Assert"
-  input_arg {
-    name: "condition"
-    type: DT_BOOL
-  }
-  input_arg {
-    name: "data"
-    type_list_attr: "T"
-  }
   attr {
-    name: "T"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "summarize"
-    type: "int"
+    name: "use_nesterov"
+    type: "bool"
     default_value {
-      i: 3
+      b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "Assign"
+  name: "ApplyMomentum"
   input_arg {
-    name: "ref"
+    name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "value"
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
     type_attr: "T"
   }
   output_arg {
-    name: "output_ref"
+    name: "out"
     type_attr: "T"
     is_ref: true
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
   attr {
-    name: "validate_shape"
+    name: "use_locking"
     type: "bool"
     default_value {
-      b: true
+      b: false
     }
   }
   attr {
-    name: "use_locking"
+    name: "use_nesterov"
     type: "bool"
     default_value {
-      b: true
+      b: false
     }
   }
-  allows_uninitialized_input: true
 }
 op {
-  name: "AssignAdd"
+  name: "ApplyMomentum"
   input_arg {
-    name: "ref"
+    name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "value"
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
     type_attr: "T"
   }
   output_arg {
-    name: "output_ref"
+    name: "out"
     type_attr: "T"
     is_ref: true
   }
@@ -3401,18 +4206,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -3423,20 +4231,48 @@ op {
       b: false
     }
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "AssignAdd"
+  name: "ApplyPowerSign"
   input_arg {
-    name: "ref"
+    name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "value"
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "logbase"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
-    name: "output_ref"
+    name: "out"
     type_attr: "T"
     is_ref: true
   }
@@ -3473,34 +4309,39 @@ op {
   }
 }
 op {
-  name: "AssignAddVariableOp"
+  name: "ApplyPowerSign"
   input_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "var"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "value"
-    type_attr: "dtype"
+    name: "m"
+    type_attr: "T"
+    is_ref: true
   }
-  attr {
-    name: "dtype"
-    type: "type"
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "AssignSub"
   input_arg {
-    name: "ref"
+    name: "logbase"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "value"
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
-    name: "output_ref"
+    name: "out"
     type_attr: "T"
     is_ref: true
   }
@@ -3523,6 +4364,9 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -3535,18 +4379,39 @@ op {
   }
 }
 op {
-  name: "AssignSub"
+  name: "ApplyPowerSign"
   input_arg {
-    name: "ref"
+    name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "value"
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "logbase"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
-    name: "output_ref"
+    name: "out"
     type_attr: "T"
     is_ref: true
   }
@@ -3557,17 +4422,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -3583,132 +4449,127 @@ op {
   }
 }
 op {
-  name: "AssignSubVariableOp"
+  name: "ApplyProximalAdagrad"
   input_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "var"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "value"
-    type_attr: "dtype"
-  }
-  attr {
-    name: "dtype"
-    type: "type"
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
   }
-  is_stateful: true
-}
-op {
-  name: "AssignVariableOp"
   input_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "value"
-    type_attr: "dtype"
+    name: "l1"
+    type_attr: "T"
   }
-  attr {
-    name: "dtype"
-    type: "type"
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "Atan"
   input_arg {
-    name: "x"
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "Atan2"
+  name: "ApplyProximalAdagrad"
   input_arg {
-    name: "y"
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "x"
+    name: "lr"
     type_attr: "T"
   }
-  output_arg {
-    name: "z"
+  input_arg {
+    name: "l1"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-}
-op {
-  name: "Atanh"
   input_arg {
-    name: "x"
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-}
-op {
-  name: "AudioSpectrogram"
-  input_arg {
-    name: "input"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "spectrogram"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "window_size"
-    type: "int"
-  }
-  attr {
-    name: "stride"
-    type: "int"
-  }
   attr {
-    name: "magnitude_squared"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
@@ -3716,108 +4577,37 @@ op {
   }
 }
 op {
-  name: "AudioSummary"
+  name: "ApplyProximalAdagrad"
   input_arg {
-    name: "tag"
-    type: DT_STRING
+    name: "var"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "tensor"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "summary"
-    type: DT_STRING
-  }
-  attr {
-    name: "sample_rate"
-    type: "float"
-  }
-  attr {
-    name: "max_outputs"
-    type: "int"
-    default_value {
-      i: 3
-    }
-    has_minimum: true
-    minimum: 1
-  }
-  deprecation {
-    version: 15
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
   }
-}
-op {
-  name: "AudioSummaryV2"
   input_arg {
-    name: "tag"
-    type: DT_STRING
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "tensor"
-    type: DT_FLOAT
+    name: "l1"
+    type_attr: "T"
   }
   input_arg {
-    name: "sample_rate"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "summary"
-    type: DT_STRING
-  }
-  attr {
-    name: "max_outputs"
-    type: "int"
-    default_value {
-      i: 3
-    }
-    has_minimum: true
-    minimum: 1
+    name: "l2"
+    type_attr: "T"
   }
-}
-op {
-  name: "AvgPool"
   input_arg {
-    name: "value"
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "out"
     type_attr: "T"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
+    is_ref: true
   }
   attr {
     name: "T"
@@ -3825,56 +4615,65 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_HALF
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "AvgPool"
+  name: "ApplyProximalAdagrad"
   input_arg {
-    name: "value"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "accum"
     type_attr: "T"
+    is_ref: true
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -3883,99 +4682,117 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "AvgPool"
+  name: "ApplyProximalGradientDescent"
   input_arg {
-    name: "value"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "alpha"
     type_attr: "T"
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
   }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "AvgPool3D"
+  name: "ApplyProximalGradientDescent"
   input_arg {
-    name: "input"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "alpha"
     type_attr: "T"
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -3984,54 +4801,58 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "AvgPool3D"
+  name: "ApplyProximalGradientDescent"
   input_arg {
-    name: "input"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "alpha"
     type_attr: "T"
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
   }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
-    }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -4040,45 +4861,59 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "AvgPool3DGrad"
+  name: "ApplyProximalGradientDescent"
   input_arg {
-    name: "orig_input_shape"
-    type: DT_INT32
+    name: "var"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "grad"
+    name: "alpha"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "l1"
     type_attr: "T"
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+  input_arg {
+    name: "delta"
+    type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -4087,118 +4922,73 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-}
-op {
-  name: "AvgPool3DGrad"
-  input_arg {
-    name: "orig_input_shape"
-    type: DT_INT32
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
+}
+op {
+  name: "ApplyRMSProp"
   input_arg {
-    name: "grad"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "ms"
     type_attr: "T"
+    is_ref: true
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
-    }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
   }
-}
-op {
-  name: "AvgPoolGrad"
   input_arg {
-    name: "orig_input_shape"
-    type: DT_INT32
+    name: "epsilon"
+    type_attr: "T"
   }
   input_arg {
     name: "grad"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "out"
     type_attr: "T"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
+    is_ref: true
   }
   attr {
     name: "T"
@@ -4206,330 +4996,267 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_HALF
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "AvgPoolGrad"
+  name: "ApplyRMSProp"
   input_arg {
-    name: "orig_input_shape"
-    type: DT_INT32
+    name: "var"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "grad"
+    name: "ms"
     type_attr: "T"
+    is_ref: true
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "mom"
     type_attr: "T"
+    is_ref: true
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_HALF
-      }
-    }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
   }
-}
-op {
-  name: "AvgPoolGrad"
   input_arg {
-    name: "orig_input_shape"
-    type: DT_INT32
+    name: "epsilon"
+    type_attr: "T"
   }
   input_arg {
     name: "grad"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "out"
     type_attr: "T"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-}
-op {
-  name: "Barrier"
-  output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "shapes"
-    type: "list(shape)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: -1
-    }
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "BarrierClose"
+  name: "ApplyRMSProp"
   input_arg {
-    name: "handle"
-    type: DT_STRING
+    name: "var"
+    type_attr: "T"
     is_ref: true
   }
-  attr {
-    name: "cancel_pending_enqueues"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
   }
-}
-op {
-  name: "BarrierIncompleteSize"
   input_arg {
-    name: "handle"
-    type: DT_STRING
+    name: "mom"
+    type_attr: "T"
     is_ref: true
   }
-  output_arg {
-    name: "size"
-    type: DT_INT32
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-}
-op {
-  name: "BarrierInsertMany"
   input_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+    name: "rho"
+    type_attr: "T"
   }
   input_arg {
-    name: "keys"
-    type: DT_STRING
+    name: "momentum"
+    type_attr: "T"
   }
   input_arg {
-    name: "values"
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
   attr {
-    name: "component_index"
-    type: "int"
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
 }
 op {
-  name: "BarrierReadySize"
+  name: "ApplyRMSProp"
   input_arg {
-    name: "handle"
-    type: DT_STRING
+    name: "var"
+    type_attr: "T"
     is_ref: true
   }
-  output_arg {
-    name: "size"
-    type: DT_INT32
-  }
-}
-op {
-  name: "BarrierTakeMany"
   input_arg {
-    name: "handle"
-    type: DT_STRING
+    name: "ms"
+    type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "num_elements"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "indices"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "keys"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "values"
-    type_list_attr: "component_types"
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
   }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "allow_small_batch"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
   }
-  attr {
-    name: "wait_for_incomplete"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
   }
-  attr {
-    name: "timeout_ms"
-    type: "int"
-    default_value {
-      i: -1
-    }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
   }
-}
-op {
-  name: "BatchCholesky"
   input_arg {
-    name: "input"
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  deprecation {
-    version: 13
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
 }
 op {
-  name: "BatchCholeskyGrad"
+  name: "ApproximateEqual"
   input_arg {
-    name: "l"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "z"
+    type: DT_BOOL
   }
   attr {
     name: "T"
@@ -4538,154 +5265,79 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
-  deprecation {
-    version: 13
-  }
-}
-op {
-  name: "BatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "tolerance"
+    type: "float"
+    default_value {
+      f: 1e-05
+    }
   }
-  is_stateful: true
+  is_commutative: true
 }
 op {
-  name: "BatchDataset"
+  name: "ApproximateEqual"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "x"
+    type_attr: "T"
   }
   input_arg {
-    name: "batch_size"
-    type: DT_INT64
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "z"
+    type: DT_BOOL
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "BatchFFT"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
-  }
-  deprecation {
-    version: 15
-  }
-}
-op {
-  name: "BatchFFT2D"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
-  }
-  deprecation {
-    version: 15
-  }
-}
-op {
-  name: "BatchFFT3D"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
-  }
-  deprecation {
-    version: 15
-  }
-}
-op {
-  name: "BatchIFFT"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
-  }
-  deprecation {
-    version: 15
-  }
-}
-op {
-  name: "BatchIFFT2D"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
-  }
-  deprecation {
-    version: 15
-  }
-}
-op {
-  name: "BatchIFFT3D"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
-  }
-  deprecation {
-    version: 15
+    name: "tolerance"
+    type: "float"
+    default_value {
+      f: 1e-05
+    }
   }
+  is_commutative: true
 }
 op {
-  name: "BatchMatMul"
+  name: "ApproximateEqual"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -4695,73 +5347,56 @@ op {
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "z"
+    type: DT_BOOL
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "adj_x"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "adj_y"
-    type: "bool"
+    name: "tolerance"
+    type: "float"
     default_value {
-      b: false
+      f: 1e-05
     }
   }
+  is_commutative: true
 }
 op {
-  name: "BatchMatrixBandPart"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "num_lower"
-    type: DT_INT64
-  }
+  name: "ApproximateEqual"
   input_arg {
-    name: "num_upper"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "band"
+    name: "x"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-  }
-  deprecation {
-    version: 14
-  }
-}
-op {
-  name: "BatchMatrixDeterminant"
   input_arg {
-    name: "input"
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "z"
+    type: DT_BOOL
   }
   attr {
     name: "T"
@@ -4770,22 +5405,46 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  deprecation {
-    version: 13
+  attr {
+    name: "tolerance"
+    type: "float"
+    default_value {
+      f: 1e-05
+    }
   }
+  is_commutative: true
 }
 op {
-  name: "BatchMatrixDeterminant"
+  name: "ArgMax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "dimension"
+    type_attr: "Tidx"
+  }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type: DT_INT64
   }
   attr {
     name: "T"
@@ -4794,245 +5453,308 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
-  deprecation {
-    version: 13
-  }
-}
-op {
-  name: "BatchMatrixDiag"
-  input_arg {
-    name: "diagonal"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "Tidx"
     type: "type"
-  }
-  deprecation {
-    version: 14
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
 }
 op {
-  name: "BatchMatrixDiagPart"
+  name: "ArgMax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "dimension"
+    type_attr: "Tidx"
+  }
   output_arg {
-    name: "diagonal"
-    type_attr: "T"
+    name: "output"
+    type_attr: "output_type"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  deprecation {
-    version: 14
-  }
-}
-op {
-  name: "BatchMatrixInverse"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
   }
   attr {
-    name: "adjoint"
-    type: "bool"
+    name: "Tidx"
+    type: "type"
     default_value {
-      b: false
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "T"
+    name: "output_type"
     type: "type"
+    default_value {
+      type: DT_INT64
+    }
     allowed_values {
       list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
-  deprecation {
-    version: 13
-  }
 }
 op {
-  name: "BatchMatrixSetDiag"
+  name: "ArgMax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "diagonal"
-    type_attr: "T"
+    name: "dimension"
+    type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "output_type"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  deprecation {
-    version: 14
-  }
-}
-op {
-  name: "BatchMatrixSolve"
-  input_arg {
-    name: "matrix"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "rhs"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "adjoint"
-    type: "bool"
+    name: "Tidx"
+    type: "type"
     default_value {
-      b: false
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "T"
+    name: "output_type"
     type: "type"
+    default_value {
+      type: DT_INT64
+    }
     allowed_values {
       list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
-  deprecation {
-    version: 13
-  }
 }
 op {
-  name: "BatchMatrixSolveLs"
-  input_arg {
-    name: "matrix"
-    type_attr: "T"
-  }
+  name: "ArgMax"
   input_arg {
-    name: "rhs"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "l2_regularizer"
-    type: DT_DOUBLE
+    name: "dimension"
+    type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "output_type"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "fast"
-    type: "bool"
+    name: "Tidx"
+    type: "type"
     default_value {
-      b: true
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
-  deprecation {
-    version: 13
+  attr {
+    name: "output_type"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
 }
 op {
-  name: "BatchMatrixTriangularSolve"
+  name: "ArgMax"
   input_arg {
-    name: "matrix"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "rhs"
-    type_attr: "T"
+    name: "dimension"
+    type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "output_type"
   }
   attr {
-    name: "lower"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
   attr {
-    name: "adjoint"
-    type: "bool"
+    name: "Tidx"
+    type: "type"
     default_value {
-      b: false
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "T"
+    name: "output_type"
     type: "type"
+    default_value {
+      type: DT_INT64
+    }
     allowed_values {
       list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
-  deprecation {
-    version: 13
-  }
 }
 op {
-  name: "BatchNormWithGlobalNormalization"
-  input_arg {
-    name: "t"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "m"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "v"
-    type_attr: "T"
-  }
+  name: "ArgMin"
   input_arg {
-    name: "beta"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "gamma"
-    type_attr: "T"
+    name: "dimension"
+    type_attr: "Tidx"
   }
   output_arg {
-    name: "result"
-    type_attr: "T"
+    name: "output"
+    type: DT_INT64
   }
   attr {
     name: "T"
@@ -5057,42 +5779,32 @@ op {
     }
   }
   attr {
-    name: "variance_epsilon"
-    type: "float"
-  }
-  attr {
-    name: "scale_after_normalization"
-    type: "bool"
-  }
-  deprecation {
-    version: 9
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
 }
 op {
-  name: "BatchNormWithGlobalNormalization"
-  input_arg {
-    name: "t"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "m"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "v"
-    type_attr: "T"
-  }
+  name: "ArgMin"
   input_arg {
-    name: "beta"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "gamma"
-    type_attr: "T"
+    name: "dimension"
+    type_attr: "Tidx"
   }
   output_arg {
-    name: "result"
-    type_attr: "T"
+    name: "output"
+    type_attr: "output_type"
   }
   attr {
     name: "T"
@@ -5113,64 +5825,49 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "variance_epsilon"
-    type: "float"
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
   attr {
-    name: "scale_after_normalization"
-    type: "bool"
-  }
-  deprecation {
-    version: 9
+    name: "output_type"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
 }
 op {
-  name: "BatchNormWithGlobalNormalizationGrad"
-  input_arg {
-    name: "t"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "m"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "v"
-    type_attr: "T"
-  }
+  name: "ArgMin"
   input_arg {
-    name: "gamma"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "backprop"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "dx"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "dm"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "dv"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "db"
-    type_attr: "T"
+    name: "dimension"
+    type_attr: "Tidx"
   }
   output_arg {
-    name: "dg"
-    type_attr: "T"
+    name: "output"
+    type_attr: "output_type"
   }
   attr {
     name: "T"
@@ -5191,62 +5888,51 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "variance_epsilon"
-    type: "float"
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
   attr {
-    name: "scale_after_normalization"
-    type: "bool"
-  }
-  deprecation {
-    version: 9
+    name: "output_type"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
 }
 op {
-  name: "BatchNormWithGlobalNormalizationGrad"
-  input_arg {
-    name: "t"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "m"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "v"
-    type_attr: "T"
-  }
+  name: "ArgMin"
   input_arg {
-    name: "gamma"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "backprop"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "dx"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "dm"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "dv"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "db"
-    type_attr: "T"
+    name: "dimension"
+    type_attr: "Tidx"
   }
   output_arg {
-    name: "dg"
-    type_attr: "T"
+    name: "output"
+    type_attr: "output_type"
   }
   attr {
     name: "T"
@@ -5269,231 +5955,225 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "variance_epsilon"
-    type: "float"
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
   attr {
-    name: "scale_after_normalization"
-    type: "bool"
-  }
-  deprecation {
-    version: 9
+    name: "output_type"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
 }
 op {
-  name: "BatchSelfAdjointEig"
+  name: "ArgMin"
   input_arg {
     name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "dimension"
+    type_attr: "Tidx"
+  }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "output_type"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  deprecation {
-    version: 11
-  }
-}
-op {
-  name: "BatchSelfAdjointEigV2"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "e"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "v"
-    type_attr: "T"
-  }
   attr {
-    name: "compute_v"
-    type: "bool"
+    name: "Tidx"
+    type: "type"
     default_value {
-      b: true
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "T"
+    name: "output_type"
     type: "type"
+    default_value {
+      type: DT_INT64
+    }
     allowed_values {
       list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
-  deprecation {
-    version: 13
-  }
 }
 op {
-  name: "BatchSvd"
+  name: "AsString"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "s"
-    type_attr: "T"
+    name: "output"
+    type: DT_STRING
   }
-  output_arg {
-    name: "u"
-    type_attr: "T"
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_BOOL
+        type: DT_INT8
+      }
+    }
   }
-  output_arg {
-    name: "v"
-    type_attr: "T"
+  attr {
+    name: "precision"
+    type: "int"
+    default_value {
+      i: -1
+    }
   }
   attr {
-    name: "compute_uv"
+    name: "scientific"
     type: "bool"
     default_value {
-      b: true
+      b: false
     }
   }
   attr {
-    name: "full_matrices"
+    name: "shortest"
     type: "bool"
     default_value {
       b: false
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "width"
+    type: "int"
+    default_value {
+      i: -1
     }
   }
-  deprecation {
-    version: 13
+  attr {
+    name: "fill"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
 }
 op {
-  name: "BatchToSpace"
+  name: "Asin"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "crops"
-    type_attr: "Tidx"
-  }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "block_size"
-    type: "int"
-    has_minimum: true
-    minimum: 2
-  }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "BatchToSpaceND"
+  name: "Asin"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "block_shape"
-    type_attr: "Tblock_shape"
-  }
-  input_arg {
-    name: "crops"
-    type_attr: "Tcrops"
-  }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "Tblock_shape"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "Tcrops"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Betainc"
-  input_arg {
-    name: "a"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "b"
-    type_attr: "T"
-  }
+  name: "Asinh"
   input_arg {
     name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -5501,24 +6181,23 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "BiasAdd"
-  input_arg {
-    name: "value"
-    type_attr: "T"
-  }
+  name: "Asinh"
   input_arg {
-    name: "bias"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -5526,98 +6205,92 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
+}
+op {
+  name: "Assert"
+  input_arg {
+    name: "condition"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "T"
+  }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "summarize"
+    type: "int"
     default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
+      i: 3
     }
   }
+  is_stateful: true
 }
 op {
-  name: "BiasAdd"
+  name: "Assign"
   input_arg {
-    name: "value"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "bias"
+    name: "value"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "validate_shape"
+    type: "bool"
     default_value {
-      s: "NHWC"
+      b: true
     }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: true
     }
   }
+  allows_uninitialized_input: true
 }
 op {
-  name: "BiasAddGrad"
+  name: "AssignAdd"
   input_arg {
-    name: "out_backprop"
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "value"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -5642,28 +6315,28 @@ op {
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
+      b: false
     }
   }
 }
 op {
-  name: "BiasAddGrad"
+  name: "AssignAdd"
   input_arg {
-    name: "out_backprop"
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "value"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -5690,32 +6363,28 @@ op {
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
+      b: false
     }
   }
 }
 op {
-  name: "BiasAddV1"
+  name: "AssignAdd"
   input_arg {
-    name: "value"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "bias"
+    name: "value"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -5736,23 +6405,35 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "BiasAddV1"
+  name: "AssignAdd"
   input_arg {
-    name: "value"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "bias"
+    name: "value"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -5761,64 +6442,63 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "Bincount"
-  input_arg {
-    name: "arr"
-    type: DT_INT32
-  }
+  name: "AssignAddVariableOp"
   input_arg {
-    name: "size"
-    type: DT_INT32
+    name: "resource"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "weights"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "bins"
-    type_attr: "T"
+    name: "value"
+    type_attr: "dtype"
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
   }
+  is_stateful: true
 }
 op {
-  name: "Bitcast"
+  name: "AssignSub"
   input_arg {
-    name: "input"
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "value"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "type"
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -5843,7 +6523,31 @@ op {
     }
   }
   attr {
-    name: "type"
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "AssignSub"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
     type: "type"
     allowed_values {
       list {
@@ -5861,19 +6565,34 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "Bitcast"
+  name: "AssignSub"
   input_arg {
-    name: "input"
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "value"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "type"
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -5886,148 +6605,117 @@ op {
         type: DT_INT32
         type: DT_UINT8
         type: DT_UINT16
-        type: DT_INT8
         type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "type"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
-        type: DT_HALF
-      }
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
 }
 op {
-  name: "BitwiseAnd"
+  name: "AssignSub"
   input_arg {
-    name: "x"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "y"
+    name: "value"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
         type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  is_commutative: true
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "BitwiseAnd"
+  name: "AssignSubVariableOp"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "resource"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type_attr: "T"
+    name: "value"
+    type_attr: "dtype"
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
   }
-  is_commutative: true
+  is_stateful: true
 }
 op {
-  name: "BitwiseOr"
+  name: "AssignVariableOp"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "resource"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type_attr: "T"
+    name: "value"
+    type_attr: "dtype"
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-      }
-    }
   }
-  is_commutative: true
+  is_stateful: true
 }
 op {
-  name: "BitwiseOr"
+  name: "Atan"
   input_arg {
     name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
   output_arg {
-    name: "z"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -6035,31 +6723,25 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "BitwiseXor"
+  name: "Atan"
   input_arg {
     name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
   output_arg {
-    name: "z"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -6067,25 +6749,26 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "BitwiseXor"
+  name: "Atan2"
   input_arg {
-    name: "x"
+    name: "y"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
@@ -6097,388 +6780,330 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "BroadcastArgs"
+  name: "Atan2"
   input_arg {
-    name: "s0"
+    name: "y"
     type_attr: "T"
   }
   input_arg {
-    name: "s1"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "r0"
+    name: "z"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "BroadcastGradientArgs"
-  input_arg {
-    name: "s0"
-    type_attr: "T"
-  }
+  name: "Atanh"
   input_arg {
-    name: "s1"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "r0"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "r1"
+    name: "y"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Bucketize"
+  name: "Atanh"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type: DT_INT32
+    name: "y"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  attr {
-    name: "boundaries"
-    type: "list(float)"
-  }
 }
 op {
-  name: "BytesProducedStatsDataset"
+  name: "AudioSpectrogram"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "tag"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "CTCBeamSearchDecoder"
-  input_arg {
-    name: "inputs"
+    name: "input"
     type: DT_FLOAT
   }
-  input_arg {
-    name: "sequence_length"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "decoded_indices"
-    type: DT_INT64
-    number_attr: "top_paths"
-  }
-  output_arg {
-    name: "decoded_values"
-    type: DT_INT64
-    number_attr: "top_paths"
-  }
-  output_arg {
-    name: "decoded_shape"
-    type: DT_INT64
-    number_attr: "top_paths"
-  }
   output_arg {
-    name: "log_probability"
+    name: "spectrogram"
     type: DT_FLOAT
   }
   attr {
-    name: "beam_width"
+    name: "window_size"
     type: "int"
-    has_minimum: true
-    minimum: 1
   }
   attr {
-    name: "top_paths"
+    name: "stride"
     type: "int"
-    has_minimum: true
-    minimum: 1
   }
   attr {
-    name: "merge_repeated"
+    name: "magnitude_squared"
     type: "bool"
     default_value {
-      b: true
+      b: false
     }
   }
 }
 op {
-  name: "CTCGreedyDecoder"
+  name: "AudioSummary"
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+    name: "tag"
+    type: DT_STRING
   }
   input_arg {
-    name: "sequence_length"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "decoded_indices"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "decoded_values"
-    type: DT_INT64
+    name: "tensor"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "decoded_shape"
-    type: DT_INT64
+    name: "summary"
+    type: DT_STRING
   }
-  output_arg {
-    name: "log_probability"
-    type: DT_FLOAT
+  attr {
+    name: "sample_rate"
+    type: "float"
   }
   attr {
-    name: "merge_repeated"
-    type: "bool"
+    name: "max_outputs"
+    type: "int"
     default_value {
-      b: false
+      i: 3
     }
+    has_minimum: true
+    minimum: 1
+  }
+  deprecation {
+    version: 15
   }
 }
 op {
-  name: "CTCLoss"
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
+  name: "AudioSummaryV2"
   input_arg {
-    name: "labels_indices"
-    type: DT_INT64
+    name: "tag"
+    type: DT_STRING
   }
   input_arg {
-    name: "labels_values"
-    type: DT_INT32
+    name: "tensor"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "sequence_length"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "loss"
+    name: "sample_rate"
     type: DT_FLOAT
   }
   output_arg {
-    name: "gradient"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "preprocess_collapse_repeated"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "summary"
+    type: DT_STRING
   }
   attr {
-    name: "ctc_merge_repeated"
-    type: "bool"
+    name: "max_outputs"
+    type: "int"
     default_value {
-      b: true
+      i: 3
     }
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "CTCLoss"
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "labels_indices"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "labels_values"
-    type: DT_INT32
-  }
+  name: "AvgPool"
   input_arg {
-    name: "sequence_length"
-    type: DT_INT32
+    name: "value"
+    type_attr: "T"
   }
   output_arg {
-    name: "loss"
-    type: DT_FLOAT
+    name: "output"
+    type_attr: "T"
   }
-  output_arg {
-    name: "gradient"
-    type: DT_FLOAT
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "preprocess_collapse_repeated"
-    type: "bool"
-    default_value {
-      b: false
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "ctc_merge_repeated"
-    type: "bool"
+    name: "data_format"
+    type: "string"
     default_value {
-      b: true
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
   attr {
-    name: "ignore_longer_outputs_than_inputs"
-    type: "bool"
-    default_value {
-      b: false
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+      }
     }
   }
 }
 op {
-  name: "CacheDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
+  name: "AvgPool"
   input_arg {
-    name: "filename"
-    type: DT_STRING
+    name: "value"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
+    name: "ksize"
+    type: "list(int)"
     has_minimum: true
-    minimum: 1
+    minimum: 4
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "strides"
+    type: "list(int)"
     has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "CacheDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "filename"
-    type: DT_STRING
+    minimum: 4
   }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_HALF
+      }
+    }
   }
 }
 op {
-  name: "Cast"
+  name: "AvgPool"
   input_arg {
-    name: "x"
-    type_attr: "SrcT"
+    name: "value"
+    type_attr: "T"
   }
   output_arg {
-    name: "y"
-    type_attr: "DstT"
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "SrcT"
-    type: "type"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "DstT"
-    type: "type"
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
-}
-op {
-  name: "Ceil"
-  input_arg {
-    name: "x"
-    type_attr: "T"
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
   }
   attr {
     name: "T"
@@ -6493,9 +7118,9 @@ op {
   }
 }
 op {
-  name: "CheckNumerics"
+  name: "AvgPool"
   input_arg {
-    name: "tensor"
+    name: "value"
     type_attr: "T"
   }
   output_arg {
@@ -6503,44 +7128,55 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "message"
+    name: "data_format"
     type: "string"
-  }
-}
-op {
-  name: "Cholesky"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "Cholesky"
+  name: "AvgPool3D"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -6550,32 +7186,27 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
-}
-op {
-  name: "CholeskyGrad"
-  input_arg {
-    name: "l"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
     name: "T"
     type: "type"
@@ -6588,402 +7219,233 @@ op {
   }
 }
 op {
-  name: "CompareAndBitpack"
+  name: "AvgPool3D"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "threshold"
-    type_attr: "T"
-  }
   output_arg {
     name: "output"
-    type: DT_UINT8
+    type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_BOOL
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
-}
-op {
-  name: "Complex"
-  input_arg {
-    name: "real"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "imag"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "out"
-    type_attr: "Tout"
-  }
   attr {
-    name: "T"
-    type: "type"
+    name: "data_format"
+    type: "string"
     default_value {
-      type: DT_FLOAT
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
   attr {
-    name: "Tout"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_COMPLEX64
-    }
     allowed_values {
       list {
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "ComplexAbs"
+  name: "AvgPool3D"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
-    type_attr: "Tout"
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_COMPLEX64
-    }
-    allowed_values {
-      list {
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "Tout"
-    type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
-}
-op {
-  name: "ComputeAccidentalHits"
-  input_arg {
-    name: "true_classes"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "sampled_candidates"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "ids"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "weights"
-    type: DT_FLOAT
-  }
   attr {
-    name: "num_true"
-    type: "int"
-  }
-  attr {
-    name: "seed"
-    type: "int"
+    name: "data_format"
+    type: "string"
     default_value {
-      i: 0
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
     }
   }
   attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
 }
 op {
-  name: "Concat"
+  name: "AvgPool3DGrad"
   input_arg {
-    name: "concat_dim"
+    name: "orig_input_shape"
     type: DT_INT32
   }
   input_arg {
-    name: "values"
+    name: "grad"
     type_attr: "T"
-    number_attr: "N"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "N"
-    type: "int"
+    name: "ksize"
+    type: "list(int)"
     has_minimum: true
-    minimum: 2
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-}
-op {
-  name: "ConcatOffset"
-  input_arg {
-    name: "concat_dim"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "shape"
-    type: DT_INT32
-    number_attr: "N"
-  }
-  output_arg {
-    name: "offset"
-    type: DT_INT32
-    number_attr: "N"
+    minimum: 5
   }
   attr {
-    name: "N"
-    type: "int"
+    name: "strides"
+    type: "list(int)"
     has_minimum: true
-    minimum: 2
-  }
-}
-op {
-  name: "ConcatV2"
-  input_arg {
-    name: "values"
-    type_attr: "T"
-    number_attr: "N"
-  }
-  input_arg {
-    name: "axis"
-    type_attr: "Tidx"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+    minimum: 5
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 2
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "ConcatenateDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "another_dataset"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "ConcatenateDataset"
+  name: "AvgPool3DGrad"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "orig_input_shape"
+    type: DT_INT32
   }
   input_arg {
-    name: "another_dataset"
-    type: DT_VARIANT
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
+    name: "ksize"
+    type: "list(int)"
     has_minimum: true
-    minimum: 1
+    minimum: 5
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "strides"
+    type: "list(int)"
     has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "ConditionalAccumulator"
-  output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+    minimum: 5
   }
   attr {
-    name: "dtype"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "shape"
-    type: "shape"
-  }
-  attr {
-    name: "container"
+    name: "data_format"
     type: "string"
     default_value {
-      s: ""
+      s: "NDHWC"
     }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
     }
   }
-  is_stateful: true
-}
-op {
-  name: "ConditionalAccumulator"
-  output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
-  }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "shape"
-    type: "shape"
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "Conj"
+  name: "AvgPool3DGrad"
   input_arg {
-    name: "input"
+    name: "orig_input_shape"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -6991,102 +7453,60 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_COMPLEX64
-    }
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
-}
-op {
-  name: "Conj"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
-    type: "type"
+    name: "data_format"
+    type: "string"
     default_value {
-      type: DT_COMPLEX64
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_VARIANT
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
-}
-op {
-  name: "ConjugateTranspose"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "perm"
-    type_attr: "Tperm"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "Tperm"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "Const"
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
-  }
-  attr {
-    name: "value"
-    type: "tensor"
-  }
-  attr {
-    name: "dtype"
-    type: "type"
-  }
-}
-op {
-  name: "ControlTrigger"
-}
-op {
-  name: "Conv2D"
+  name: "AvgPoolGrad"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "orig_input_shape"
+    type: DT_INT32
   }
   input_arg {
-    name: "filter"
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -7094,25 +7514,16 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-      }
-    }
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
     name: "strides"
     type: "list(int)"
-  }
-  attr {
-    name: "use_cudnn_on_gpu"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    has_minimum: true
+    minimum: 4
   }
   attr {
     name: "padding"
@@ -7137,19 +7548,26 @@ op {
       }
     }
   }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+      }
+    }
+  }
 }
 op {
-  name: "Conv2DBackpropFilter"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
+  name: "AvgPoolGrad"
   input_arg {
-    name: "filter_sizes"
+    name: "orig_input_shape"
     type: DT_INT32
   }
   input_arg {
-    name: "out_backprop"
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -7157,25 +7575,16 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-      }
-    }
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
     name: "strides"
     type: "list(int)"
-  }
-  attr {
-    name: "use_cudnn_on_gpu"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    has_minimum: true
+    minimum: 4
   }
   attr {
     name: "padding"
@@ -7200,19 +7609,26 @@ op {
       }
     }
   }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_HALF
+      }
+    }
+  }
 }
 op {
-  name: "Conv2DBackpropInput"
+  name: "AvgPoolGrad"
   input_arg {
-    name: "input_sizes"
+    name: "orig_input_shape"
     type: DT_INT32
   }
   input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "out_backprop"
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -7220,25 +7636,16 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-      }
-    }
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
     name: "strides"
     type: "list(int)"
-  }
-  attr {
-    name: "use_cudnn_on_gpu"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    has_minimum: true
+    minimum: 4
   }
   attr {
     name: "padding"
@@ -7263,56 +7670,26 @@ op {
       }
     }
   }
-}
-op {
-  name: "Conv3D"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
 }
 op {
-  name: "Conv3D"
+  name: "AvgPoolGrad"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "orig_input_shape"
+    type: DT_INT32
   }
   input_arg {
-    name: "filter"
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -7320,20 +7697,16 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
     name: "strides"
     type: "list(int)"
     has_minimum: true
-    minimum: 5
+    minimum: 4
   }
   attr {
     name: "padding"
@@ -7349,397 +7722,336 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NDHWC"
+      s: "NHWC"
     }
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
-}
-op {
-  name: "Conv3DBackpropFilter"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "out_backprop"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  deprecation {
-    version: 10
-  }
 }
 op {
-  name: "Conv3DBackpropFilterV2"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "out_backprop"
-    type_attr: "T"
-  }
+  name: "Barrier"
   output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
   attr {
-    name: "strides"
-    type: "list(int)"
+    name: "component_types"
+    type: "list(type)"
     has_minimum: true
-    minimum: 5
+    minimum: 1
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
       list {
-        s: "SAME"
-        s: "VALID"
       }
     }
-  }
-}
-op {
-  name: "Conv3DBackpropFilterV2"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "out_backprop"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+    has_minimum: true
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
+    name: "container"
     type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    default_value {
+      s: ""
     }
   }
   attr {
-    name: "data_format"
+    name: "shared_name"
     type: "string"
     default_value {
-      s: "NDHWC"
+      s: ""
     }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
+  }
+  is_stateful: true
+}
+op {
+  name: "BarrierClose"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "cancel_pending_enqueues"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
 }
 op {
-  name: "Conv3DBackpropInput"
+  name: "BarrierIncompleteSize"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT32
   }
+}
+op {
+  name: "BarrierInsertMany"
   input_arg {
-    name: "filter"
-    type_attr: "T"
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "out_backprop"
-    type_attr: "T"
+    name: "keys"
+    type: DT_STRING
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "values"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  deprecation {
-    version: 10
+    name: "component_index"
+    type: "int"
   }
 }
 op {
-  name: "Conv3DBackpropInputV2"
+  name: "BarrierReadySize"
   input_arg {
-    name: "input_sizes"
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "size"
     type: DT_INT32
   }
+}
+op {
+  name: "BarrierTakeMany"
   input_arg {
-    name: "filter"
-    type_attr: "T"
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "out_backprop"
-    type_attr: "T"
+    name: "num_elements"
+    type: DT_INT32
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "keys"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "component_types"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "allow_small_batch"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "wait_for_incomplete"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
     }
   }
 }
 op {
-  name: "Conv3DBackpropInputV2"
+  name: "Batch"
   input_arg {
-    name: "input_sizes"
-    type: DT_INT32
+    name: "in_tensors"
+    type_list_attr: "T"
   }
-  input_arg {
-    name: "filter"
-    type_attr: "T"
+  output_arg {
+    name: "batched_tensors"
+    type_list_attr: "T"
   }
-  input_arg {
-    name: "out_backprop"
-    type_attr: "T"
+  output_arg {
+    name: "batch_index"
+    type: DT_INT64
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "id"
+    type: DT_INT64
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+    name: "num_batch_threads"
+    type: "int"
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "max_batch_size"
+    type: "int"
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+    name: "batch_timeout_micros"
+    type: "int"
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "allowed_batch_sizes"
+    type: "list(int)"
     default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
       }
     }
   }
-}
-op {
-  name: "Copy"
-  input_arg {
-    name: "input"
-    type_attr: "T"
+  attr {
+    name: "grad_timeout_micros"
+    type: "int"
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "tensor_name"
+    name: "batching_queue"
     type: "string"
     default_value {
       s: ""
     }
   }
-  allows_uninitialized_input: true
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
 }
 op {
-  name: "Copy"
+  name: "Batch"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "in_tensors"
+    type_list_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "batched_tensors"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "batch_index"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "id"
+    type: DT_INT64
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "num_batch_threads"
+    type: "int"
   }
   attr {
-    name: "tensor_name"
-    type: "string"
+    name: "max_batch_size"
+    type: "int"
+  }
+  attr {
+    name: "max_enqueued_batches"
+    type: "int"
     default_value {
-      s: ""
+      i: 10
     }
   }
   attr {
-    name: "debug_ops_spec"
-    type: "list(string)"
+    name: "batch_timeout_micros"
+    type: "int"
+  }
+  attr {
+    name: "allowed_batch_sizes"
+    type: "list(int)"
     default_value {
       list {
       }
     }
   }
-  allows_uninitialized_input: true
-}
-op {
-  name: "CopyHost"
-  input_arg {
-    name: "input"
-    type_attr: "T"
+  attr {
+    name: "grad_timeout_micros"
+    type: "int"
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "tensor_name"
+    name: "batching_queue"
     type: "string"
     default_value {
       s: ""
     }
   }
-  allows_uninitialized_input: true
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
 }
 op {
-  name: "CopyHost"
+  name: "BatchCholesky"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -7751,32 +8063,29 @@ op {
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "debug_ops_spec"
-    type: "list(string)"
-    default_value {
+    allowed_values {
       list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
       }
     }
   }
-  allows_uninitialized_input: true
+  deprecation {
+    version: 13
+  }
 }
 op {
-  name: "Cos"
+  name: "BatchCholeskyGrad"
   input_arg {
-    name: "x"
+    name: "l"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -7784,375 +8093,457 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
-}
+  deprecation {
+    version: 13
+  }
+}
 op {
-  name: "Cosh"
+  name: "BatchDataset"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
   }
   output_arg {
-    name: "y"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
+  is_stateful: true
 }
 op {
-  name: "CountUpTo"
+  name: "BatchDataset"
   input_arg {
-    name: "ref"
-    type_attr: "T"
-    is_ref: true
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "limit"
-    type: "int"
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "CropAndResize"
+  name: "BatchFFT"
   input_arg {
-    name: "image"
-    type_attr: "T"
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+  deprecation {
+    version: 15
   }
+}
+op {
+  name: "BatchFFT2D"
   input_arg {
-    name: "boxes"
-    type: DT_FLOAT
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+  deprecation {
+    version: 15
   }
+}
+op {
+  name: "BatchFFT3D"
   input_arg {
-    name: "box_ind"
-    type: DT_INT32
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+  deprecation {
+    version: 15
   }
+}
+op {
+  name: "BatchIFFT"
   input_arg {
-    name: "crop_size"
-    type: DT_INT32
+    name: "input"
+    type: DT_COMPLEX64
   }
   output_arg {
-    name: "crops"
-    type: DT_FLOAT
+    name: "output"
+    type: DT_COMPLEX64
+  }
+  deprecation {
+    version: 15
+  }
+}
+op {
+  name: "BatchIFFT2D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+  deprecation {
+    version: 15
+  }
+}
+op {
+  name: "BatchIFFT3D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+  deprecation {
+    version: 15
+  }
+}
+op {
+  name: "BatchMatMul"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
   attr {
-    name: "method"
-    type: "string"
+    name: "adj_x"
+    type: "bool"
     default_value {
-      s: "bilinear"
-    }
-    allowed_values {
-      list {
-        s: "bilinear"
-      }
+      b: false
     }
   }
   attr {
-    name: "extrapolation_value"
-    type: "float"
+    name: "adj_y"
+    type: "bool"
     default_value {
-      f: 0
+      b: false
     }
   }
 }
 op {
-  name: "CropAndResize"
+  name: "BatchMatMul"
   input_arg {
-    name: "image"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "boxes"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "box_ind"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "crop_size"
-    type: DT_INT32
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "crops"
-    type: DT_FLOAT
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
   attr {
-    name: "method"
-    type: "string"
+    name: "adj_x"
+    type: "bool"
     default_value {
-      s: "bilinear"
-    }
-    allowed_values {
-      list {
-        s: "bilinear"
-      }
+      b: false
     }
   }
   attr {
-    name: "extrapolation_value"
-    type: "float"
+    name: "adj_y"
+    type: "bool"
     default_value {
-      f: 0
+      b: false
     }
   }
 }
 op {
-  name: "CropAndResizeGradBoxes"
+  name: "BatchMatrixBandPart"
   input_arg {
-    name: "grads"
-    type: DT_FLOAT
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "image"
-    type_attr: "T"
+    name: "num_lower"
+    type: DT_INT64
   }
   input_arg {
-    name: "boxes"
-    type: DT_FLOAT
+    name: "num_upper"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "band"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 14
   }
+}
+op {
+  name: "BatchMatrixDeterminant"
   input_arg {
-    name: "box_ind"
-    type: DT_INT32
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type: DT_FLOAT
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  attr {
-    name: "method"
-    type: "string"
-    default_value {
-      s: "bilinear"
-    }
-    allowed_values {
-      list {
-        s: "bilinear"
-      }
-    }
+  deprecation {
+    version: 13
   }
 }
 op {
-  name: "CropAndResizeGradBoxes"
-  input_arg {
-    name: "grads"
-    type: DT_FLOAT
-  }
+  name: "BatchMatrixDeterminant"
   input_arg {
-    name: "image"
+    name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "boxes"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "box_ind"
-    type: DT_INT32
-  }
   output_arg {
     name: "output"
-    type: DT_FLOAT
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  attr {
-    name: "method"
-    type: "string"
-    default_value {
-      s: "bilinear"
-    }
-    allowed_values {
-      list {
-        s: "bilinear"
-      }
-    }
+  deprecation {
+    version: 13
   }
 }
 op {
-  name: "CropAndResizeGradImage"
+  name: "BatchMatrixDiag"
   input_arg {
-    name: "grads"
-    type: DT_FLOAT
+    name: "diagonal"
+    type_attr: "T"
   }
-  input_arg {
-    name: "boxes"
-    type: DT_FLOAT
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
   }
+  deprecation {
+    version: 14
+  }
+}
+op {
+  name: "BatchMatrixDiagPart"
   input_arg {
-    name: "box_ind"
-    type: DT_INT32
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 14
   }
+}
+op {
+  name: "BatchMatrixInverse"
   input_arg {
-    name: "image_size"
-    type: DT_INT32
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_HALF
         type: DT_DOUBLE
+        type: DT_FLOAT
       }
     }
   }
+  deprecation {
+    version: 13
+  }
+}
+op {
+  name: "BatchMatrixSetDiag"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
-    name: "method"
-    type: "string"
-    default_value {
-      s: "bilinear"
-    }
-    allowed_values {
-      list {
-        s: "bilinear"
-      }
-    }
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 14
   }
 }
 op {
-  name: "Cross"
+  name: "BatchMatrixSolve"
   input_arg {
-    name: "a"
+    name: "matrix"
     type_attr: "T"
   }
   input_arg {
-    name: "b"
+    name: "rhs"
     type_attr: "T"
   }
   output_arg {
-    name: "product"
+    name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
+        type: DT_FLOAT
       }
     }
   }
+  deprecation {
+    version: 13
+  }
 }
 op {
-  name: "Cross"
+  name: "BatchMatrixSolveLs"
   input_arg {
-    name: "a"
+    name: "matrix"
     type_attr: "T"
   }
   input_arg {
-    name: "b"
+    name: "rhs"
     type_attr: "T"
   }
+  input_arg {
+    name: "l2_regularizer"
+    type: DT_DOUBLE
+  }
   output_arg {
-    name: "product"
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -8160,44 +8551,45 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_FLOAT
       }
     }
   }
+  attr {
+    name: "fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  deprecation {
+    version: 13
+  }
 }
 op {
-  name: "Cumprod"
+  name: "BatchMatrixTriangularSolve"
   input_arg {
-    name: "x"
+    name: "matrix"
     type_attr: "T"
   }
   input_arg {
-    name: "axis"
-    type_attr: "Tidx"
+    name: "rhs"
+    type_attr: "T"
   }
   output_arg {
-    name: "out"
+    name: "output"
     type_attr: "T"
   }
   attr {
-    name: "exclusive"
+    name: "lower"
     type: "bool"
     default_value {
-      b: false
+      b: true
     }
   }
   attr {
-    name: "reverse"
+    name: "adjoint"
     type: "bool"
     default_value {
       b: false
@@ -8208,64 +8600,40 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
+        type: DT_FLOAT
       }
     }
   }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  deprecation {
+    version: 13
   }
 }
 op {
-  name: "Cumprod"
+  name: "BatchNormWithGlobalNormalization"
   input_arg {
-    name: "x"
+    name: "t"
     type_attr: "T"
   }
   input_arg {
-    name: "axis"
-    type_attr: "Tidx"
+    name: "m"
+    type_attr: "T"
   }
-  output_arg {
-    name: "out"
+  input_arg {
+    name: "v"
     type_attr: "T"
   }
-  attr {
-    name: "exclusive"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
   }
-  attr {
-    name: "reverse"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "gamma"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "result"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -8286,52 +8654,46 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
+  deprecation {
+    version: 9
   }
 }
 op {
-  name: "Cumsum"
+  name: "BatchNormWithGlobalNormalization"
   input_arg {
-    name: "x"
+    name: "t"
     type_attr: "T"
   }
   input_arg {
-    name: "axis"
-    type_attr: "Tidx"
+    name: "m"
+    type_attr: "T"
   }
-  output_arg {
-    name: "out"
+  input_arg {
+    name: "v"
     type_attr: "T"
   }
-  attr {
-    name: "exclusive"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
   }
-  attr {
-    name: "reverse"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "gamma"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "result"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -8352,50 +8714,48 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
+  deprecation {
+    version: 9
   }
 }
 op {
-  name: "Cumsum"
+  name: "BatchNormWithGlobalNormalization"
   input_arg {
-    name: "x"
+    name: "t"
     type_attr: "T"
   }
   input_arg {
-    name: "axis"
-    type_attr: "Tidx"
+    name: "m"
+    type_attr: "T"
   }
-  output_arg {
-    name: "out"
+  input_arg {
+    name: "v"
     type_attr: "T"
   }
-  attr {
-    name: "exclusive"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
   }
-  attr {
-    name: "reverse"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "gamma"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "result"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -8418,1213 +8778,1271 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-}
-op {
-  name: "DatasetToSingleElement"
-  input_arg {
-    name: "dataset"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "components"
-    type_list_attr: "output_types"
+    name: "variance_epsilon"
+    type: "float"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "scale_after_normalization"
+    type: "bool"
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  deprecation {
+    version: 9
   }
 }
 op {
-  name: "DebugGradientIdentity"
+  name: "BatchNormWithGlobalNormalization"
   input_arg {
-    name: "input"
+    name: "t"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "m"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
+  input_arg {
+    name: "v"
+    type_attr: "T"
   }
-  allows_uninitialized_input: true
-}
-op {
-  name: "DebugIdentity"
   input_arg {
-    name: "input"
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "gamma"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "result"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "variance_epsilon"
+    type: "float"
   }
   attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
-      list {
-      }
-    }
+    name: "scale_after_normalization"
+    type: "bool"
+  }
+  deprecation {
+    version: 9
   }
-  allows_uninitialized_input: true
 }
 op {
-  name: "DebugIdentity"
+  name: "BatchNormWithGlobalNormalizationGrad"
   input_arg {
-    name: "input"
+    name: "t"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "m"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-  }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
-      list {
-      }
-    }
+  input_arg {
+    name: "v"
+    type_attr: "T"
   }
-  attr {
-    name: "gated_grpc"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "gamma"
+    type_attr: "T"
   }
-  allows_uninitialized_input: true
-}
-op {
-  name: "DebugIdentity"
   input_arg {
-    name: "input"
+    name: "backprop"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "dx"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
+  output_arg {
+    name: "dm"
+    type_attr: "T"
   }
-  attr {
-    name: "device_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "dv"
+    type_attr: "T"
   }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "db"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dg"
+    type_attr: "T"
   }
   attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
+    name: "T"
+    type: "type"
+    allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "gated_grpc"
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
     type: "bool"
-    default_value {
-      b: false
-    }
   }
-  allows_uninitialized_input: true
+  deprecation {
+    version: 9
+  }
 }
 op {
-  name: "DebugNanCount"
+  name: "BatchNormWithGlobalNormalizationGrad"
   input_arg {
-    name: "input"
+    name: "t"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
-    type: DT_INT64
-  }
-  attr {
-    name: "T"
-    type: "type"
+  input_arg {
+    name: "m"
+    type_attr: "T"
   }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "v"
+    type_attr: "T"
   }
-  attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
-      list {
-      }
-    }
+  input_arg {
+    name: "gamma"
+    type_attr: "T"
   }
-  allows_uninitialized_input: true
-}
-op {
-  name: "DebugNanCount"
   input_arg {
-    name: "input"
+    name: "backprop"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type: DT_INT64
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "dx"
+    type_attr: "T"
   }
-  attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
-      list {
-      }
-    }
+  output_arg {
+    name: "dm"
+    type_attr: "T"
   }
-  attr {
-    name: "gated_grpc"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  output_arg {
+    name: "dv"
+    type_attr: "T"
   }
-  allows_uninitialized_input: true
-}
-op {
-  name: "DebugNanCount"
-  input_arg {
-    name: "input"
+  output_arg {
+    name: "db"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type: DT_INT64
+    name: "dg"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "device_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
+    allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "gated_grpc"
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
     type: "bool"
-    default_value {
-      b: false
-    }
   }
-  allows_uninitialized_input: true
+  deprecation {
+    version: 9
+  }
 }
 op {
-  name: "DebugNumericSummary"
+  name: "BatchNormWithGlobalNormalizationGrad"
   input_arg {
-    name: "input"
+    name: "t"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
-    type: DT_DOUBLE
-  }
-  attr {
-    name: "T"
-    type: "type"
+  input_arg {
+    name: "m"
+    type_attr: "T"
   }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "v"
+    type_attr: "T"
   }
-  attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
-      list {
-      }
-    }
+  input_arg {
+    name: "gamma"
+    type_attr: "T"
   }
-  allows_uninitialized_input: true
-}
-op {
-  name: "DebugNumericSummary"
   input_arg {
-    name: "input"
+    name: "backprop"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type: DT_DOUBLE
+    name: "dx"
+    type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
+  output_arg {
+    name: "dm"
+    type_attr: "T"
   }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "dv"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "db"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dg"
+    type_attr: "T"
   }
   attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
+    name: "T"
+    type: "type"
+    allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "lower_bound"
-    type: "float"
-    default_value {
-      f: -inf
-    }
-  }
-  attr {
-    name: "upper_bound"
+    name: "variance_epsilon"
     type: "float"
-    default_value {
-      f: inf
-    }
   }
   attr {
-    name: "mute_if_healthy"
+    name: "scale_after_normalization"
     type: "bool"
-    default_value {
-      b: false
-    }
   }
-  allows_uninitialized_input: true
+  deprecation {
+    version: 9
+  }
 }
 op {
-  name: "DebugNumericSummary"
+  name: "BatchNormWithGlobalNormalizationGrad"
   input_arg {
-    name: "input"
+    name: "t"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "gamma"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "backprop"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type: DT_DOUBLE
+    name: "dx"
+    type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
+  output_arg {
+    name: "dm"
+    type_attr: "T"
   }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "dv"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "db"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "dg"
+    type_attr: "T"
   }
   attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
+    name: "T"
+    type: "type"
+    allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "lower_bound"
-    type: "float"
-    default_value {
-      f: -inf
-    }
-  }
-  attr {
-    name: "upper_bound"
+    name: "variance_epsilon"
     type: "float"
-    default_value {
-      f: inf
-    }
   }
   attr {
-    name: "mute_if_healthy"
+    name: "scale_after_normalization"
     type: "bool"
-    default_value {
-      b: false
-    }
   }
-  attr {
-    name: "gated_grpc"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  deprecation {
+    version: 9
   }
-  allows_uninitialized_input: true
 }
 op {
-  name: "DebugNumericSummary"
+  name: "BatchSelfAdjointEig"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    type: DT_DOUBLE
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "device_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
+    allowed_values {
       list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
       }
     }
   }
-  attr {
-    name: "lower_bound"
-    type: "float"
-    default_value {
-      f: -inf
-    }
+  deprecation {
+    version: 11
   }
-  attr {
-    name: "upper_bound"
-    type: "float"
-    default_value {
-      f: inf
-    }
+}
+op {
+  name: "BatchSelfAdjointEigV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "e"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    type_attr: "T"
   }
   attr {
-    name: "mute_if_healthy"
+    name: "compute_v"
     type: "bool"
     default_value {
-      b: false
+      b: true
     }
   }
   attr {
-    name: "gated_grpc"
-    type: "bool"
-    default_value {
-      b: false
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
     }
   }
-  allows_uninitialized_input: true
+  deprecation {
+    version: 13
+  }
 }
 op {
-  name: "DecodeAndCropJpeg"
-  input_arg {
-    name: "contents"
-    type: DT_STRING
-  }
+  name: "BatchSvd"
   input_arg {
-    name: "crop_window"
-    type: DT_INT32
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "image"
-    type: DT_UINT8
+    name: "s"
+    type_attr: "T"
   }
-  attr {
-    name: "channels"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  output_arg {
+    name: "u"
+    type_attr: "T"
   }
-  attr {
-    name: "ratio"
-    type: "int"
-    default_value {
-      i: 1
-    }
+  output_arg {
+    name: "v"
+    type_attr: "T"
   }
   attr {
-    name: "fancy_upscaling"
+    name: "compute_uv"
     type: "bool"
     default_value {
       b: true
     }
   }
   attr {
-    name: "try_recover_truncated"
+    name: "full_matrices"
     type: "bool"
     default_value {
       b: false
     }
   }
   attr {
-    name: "acceptable_fraction"
-    type: "float"
-    default_value {
-      f: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
   }
-  attr {
-    name: "dct_method"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  deprecation {
+    version: 13
   }
 }
 op {
-  name: "DecodeBase64"
+  name: "BatchToSpace"
   input_arg {
     name: "input"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "output"
-    type: DT_STRING
+    type_attr: "T"
   }
-}
-op {
-  name: "DecodeBmp"
   input_arg {
-    name: "contents"
-    type: DT_STRING
+    name: "crops"
+    type_attr: "Tidx"
   }
   output_arg {
-    name: "image"
-    type: DT_UINT8
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "channels"
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "block_size"
     type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
     default_value {
-      i: 0
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
 }
 op {
-  name: "DecodeCSV"
+  name: "BatchToSpaceND"
   input_arg {
-    name: "records"
-    type: DT_STRING
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "record_defaults"
-    type_list_attr: "OUT_TYPE"
+    name: "block_shape"
+    type_attr: "Tblock_shape"
+  }
+  input_arg {
+    name: "crops"
+    type_attr: "Tcrops"
   }
   output_arg {
     name: "output"
-    type_list_attr: "OUT_TYPE"
+    type_attr: "T"
   }
   attr {
-    name: "OUT_TYPE"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tblock_shape"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_INT32
         type: DT_INT64
-        type: DT_STRING
       }
     }
   }
   attr {
-    name: "field_delim"
-    type: "string"
+    name: "Tcrops"
+    type: "type"
     default_value {
-      s: ","
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
 }
 op {
-  name: "DecodeCSV"
+  name: "Betainc"
   input_arg {
-    name: "records"
-    type: DT_STRING
+    name: "a"
+    type_attr: "T"
   }
   input_arg {
-    name: "record_defaults"
-    type_list_attr: "OUT_TYPE"
+    name: "b"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_list_attr: "OUT_TYPE"
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "OUT_TYPE"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_STRING
+        type: DT_DOUBLE
       }
     }
   }
-  attr {
-    name: "field_delim"
-    type: "string"
-    default_value {
-      s: ","
-    }
-  }
-  attr {
-    name: "use_quote_delim"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
 }
 op {
-  name: "DecodeCSV"
+  name: "BiasAdd"
   input_arg {
-    name: "records"
-    type: DT_STRING
+    name: "value"
+    type_attr: "T"
   }
   input_arg {
-    name: "record_defaults"
-    type_list_attr: "OUT_TYPE"
+    name: "bias"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type_list_attr: "OUT_TYPE"
+    type_attr: "T"
   }
   attr {
-    name: "OUT_TYPE"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_INT32
+        type: DT_DOUBLE
         type: DT_INT64
-        type: DT_STRING
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "field_delim"
+    name: "data_format"
     type: "string"
     default_value {
-      s: ","
-    }
-  }
-  attr {
-    name: "use_quote_delim"
-    type: "bool"
-    default_value {
-      b: true
+      s: "NHWC"
     }
-  }
-  attr {
-    name: "na_value"
-    type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
 }
 op {
-  name: "DecodeCSV"
+  name: "BiasAdd"
   input_arg {
-    name: "records"
-    type: DT_STRING
+    name: "value"
+    type_attr: "T"
   }
   input_arg {
-    name: "record_defaults"
-    type_list_attr: "OUT_TYPE"
+    name: "bias"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type_list_attr: "OUT_TYPE"
+    type_attr: "T"
   }
   attr {
-    name: "OUT_TYPE"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
-        type: DT_STRING
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "field_delim"
+    name: "data_format"
     type: "string"
     default_value {
-      s: ","
-    }
-  }
-  attr {
-    name: "use_quote_delim"
-    type: "bool"
-    default_value {
-      b: true
+      s: "NHWC"
     }
-  }
-  attr {
-    name: "na_value"
-    type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
 }
 op {
-  name: "DecodeGif"
-  input_arg {
-    name: "contents"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "image"
-    type: DT_UINT8
-  }
-}
-op {
-  name: "DecodeJSONExample"
+  name: "BiasAdd"
   input_arg {
-    name: "json_examples"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "binary_examples"
-    type: DT_STRING
+    name: "value"
+    type_attr: "T"
   }
-}
-op {
-  name: "DecodeJpeg"
   input_arg {
-    name: "contents"
-    type: DT_STRING
+    name: "bias"
+    type_attr: "T"
   }
   output_arg {
-    name: "image"
-    type: DT_UINT8
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "channels"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
     }
   }
   attr {
-    name: "ratio"
-    type: "int"
+    name: "data_format"
+    type: "string"
     default_value {
-      i: 1
+      s: "NHWC"
     }
-  }
-  attr {
-    name: "fancy_upscaling"
-    type: "bool"
-    default_value {
-      b: true
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
-  attr {
-    name: "try_recover_truncated"
-    type: "bool"
-    default_value {
-      b: false
-    }
+}
+op {
+  name: "BiasAdd"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "acceptable_fraction"
-    type: "float"
-    default_value {
-      f: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
     }
   }
   attr {
-    name: "dct_method"
+    name: "data_format"
     type: "string"
     default_value {
-      s: ""
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
 }
 op {
-  name: "DecodePng"
+  name: "BiasAddGrad"
   input_arg {
-    name: "contents"
-    type: DT_STRING
+    name: "out_backprop"
+    type_attr: "T"
   }
   output_arg {
-    name: "image"
-    type_attr: "dtype"
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "channels"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
     }
   }
   attr {
-    name: "dtype"
-    type: "type"
+    name: "data_format"
+    type: "string"
     default_value {
-      type: DT_UINT8
+      s: "NHWC"
     }
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_UINT16
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
 }
 op {
-  name: "DecodeRaw"
+  name: "BiasAddGrad"
   input_arg {
-    name: "bytes"
-    type: DT_STRING
+    name: "out_backprop"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type_attr: "out_type"
+    type_attr: "T"
   }
   attr {
-    name: "out_type"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "little_endian"
-    type: "bool"
+    name: "data_format"
+    type: "string"
     default_value {
-      b: true
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
 }
 op {
-  name: "DecodeRaw"
+  name: "BiasAddGrad"
   input_arg {
-    name: "bytes"
-    type: DT_STRING
+    name: "out_backprop"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type_attr: "out_type"
+    type_attr: "T"
   }
   attr {
-    name: "out_type"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
-        type: DT_UINT16
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "little_endian"
-    type: "bool"
+    name: "data_format"
+    type: "string"
     default_value {
-      b: true
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
 }
 op {
-  name: "DecodeWav"
+  name: "BiasAddGrad"
   input_arg {
-    name: "contents"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "audio"
-    type: DT_FLOAT
+    name: "out_backprop"
+    type_attr: "T"
   }
   output_arg {
-    name: "sample_rate"
-    type: DT_INT32
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "desired_channels"
-    type: "int"
-    default_value {
-      i: -1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
     }
   }
   attr {
-    name: "desired_samples"
-    type: "int"
+    name: "data_format"
+    type: "string"
     default_value {
-      i: -1
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
 }
 op {
-  name: "DeleteSessionTensor"
-  input_arg {
-    name: "handle"
-    type: DT_STRING
-  }
-}
-op {
-  name: "DenseToDenseSetOperation"
+  name: "BiasAddV1"
   input_arg {
-    name: "set1"
+    name: "value"
     type_attr: "T"
   }
   input_arg {
-    name: "set2"
+    name: "bias"
     type_attr: "T"
   }
   output_arg {
-    name: "result_indices"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "result_values"
+    name: "output"
     type_attr: "T"
   }
-  output_arg {
-    name: "result_shape"
-    type: DT_INT64
-  }
-  attr {
-    name: "set_operation"
-    type: "string"
-  }
-  attr {
-    name: "validate_indices"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
         type: DT_UINT16
-        type: DT_STRING
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "DenseToSparseBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
+  name: "BiasAddV1"
   input_arg {
-    name: "batch_size"
-    type: DT_INT64
+    name: "value"
+    type_attr: "T"
   }
   input_arg {
-    name: "row_shape"
-    type: DT_INT64
+    name: "bias"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "DenseToSparseBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
+  name: "BiasAddV1"
   input_arg {
-    name: "batch_size"
-    type: DT_INT64
+    name: "value"
+    type_attr: "T"
   }
   input_arg {
-    name: "row_shape"
-    type: DT_INT64
+    name: "bias"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
 }
 op {
-  name: "DenseToSparseSetOperation"
+  name: "BiasAddV1"
   input_arg {
-    name: "set1"
+    name: "value"
     type_attr: "T"
   }
   input_arg {
-    name: "set2_indices"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "set2_values"
+    name: "bias"
     type_attr: "T"
   }
-  input_arg {
-    name: "set2_shape"
-    type: DT_INT64
-  }
   output_arg {
-    name: "result_indices"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "result_values"
+    name: "output"
     type_attr: "T"
   }
-  output_arg {
-    name: "result_shape"
-    type: DT_INT64
-  }
-  attr {
-    name: "set_operation"
-    type: "string"
-  }
-  attr {
-    name: "validate_indices"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
         type: DT_UINT16
-        type: DT_STRING
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "DepthToSpace"
+  name: "Bincount"
   input_arg {
-    name: "input"
+    name: "arr"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "weights"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "bins"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "block_size"
-    type: "int"
-    has_minimum: true
-    minimum: 2
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
 }
 op {
-  name: "DepthToSpace"
+  name: "Bitcast"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "type"
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
   }
   attr {
-    name: "block_size"
-    type: "int"
-    has_minimum: true
-    minimum: 2
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
+    name: "type"
+    type: "type"
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
-        s: "NCHW_VECT_C"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "DepthwiseConv2dNative"
+  name: "Bitcast"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "type"
   }
   attr {
     name: "T"
@@ -9633,92 +10051,121 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-  }
-  attr {
-    name: "padding"
-    type: "string"
+    name: "type"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "DepthwiseConv2dNative"
+  name: "Bitcast"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "type"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
+    name: "type"
+    type: "type"
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "DepthwiseConv2dNativeBackpropFilter"
+  name: "BitwiseAnd"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "out_backprop"
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
   attr {
@@ -9726,42 +10173,29 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
       }
     }
   }
+  is_commutative: true
 }
 op {
-  name: "DepthwiseConv2dNativeBackpropFilter"
+  name: "BitwiseAnd"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "out_backprop"
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
   attr {
@@ -9769,55 +10203,61 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
+  is_commutative: true
+}
+op {
+  name: "BitwiseOr"
+  input_arg {
+    name: "x"
+    type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
       }
     }
   }
+  is_commutative: true
 }
 op {
-  name: "DepthwiseConv2dNativeBackpropInput"
-  input_arg {
-    name: "input_sizes"
-    type: DT_INT32
-  }
+  name: "BitwiseOr"
   input_arg {
-    name: "filter"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "out_backprop"
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
   attr {
@@ -9825,42 +10265,61 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
+  is_commutative: true
+}
+op {
+  name: "BitwiseXor"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
       }
     }
   }
+  is_commutative: true
 }
 op {
-  name: "DepthwiseConv2dNativeBackpropInput"
-  input_arg {
-    name: "input_sizes"
-    type: DT_INT32
-  }
+  name: "BitwiseXor"
   input_arg {
-    name: "filter"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "out_backprop"
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
   attr {
@@ -9868,256 +10327,387 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
+  is_commutative: true
+}
+op {
+  name: "BroadcastArgs"
+  input_arg {
+    name: "s0"
+    type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  input_arg {
+    name: "s1"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "r0"
+    type_attr: "T"
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "T"
+    type: "type"
     default_value {
-      s: "NHWC"
+      type: DT_INT32
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "Dequantize"
+  name: "BroadcastGradientArgs"
   input_arg {
-    name: "input"
+    name: "s0"
     type_attr: "T"
   }
   input_arg {
-    name: "min_range"
-    type: DT_FLOAT
+    name: "s1"
+    type_attr: "T"
   }
-  input_arg {
-    name: "max_range"
-    type: DT_FLOAT
+  output_arg {
+    name: "r0"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type: DT_FLOAT
+    name: "r1"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
-      }
-    }
-  }
-  attr {
-    name: "mode"
-    type: "string"
     default_value {
-      s: "MIN_COMBINED"
+      type: DT_INT32
     }
     allowed_values {
       list {
-        s: "MIN_COMBINED"
-        s: "MIN_FIRST"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "Dequantize"
+  name: "Bucketize"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "min_range"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "max_range"
-    type: DT_FLOAT
-  }
   output_arg {
     name: "output"
-    type: DT_FLOAT
+    type: DT_INT32
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "mode"
-    type: "string"
-    default_value {
-      s: "MIN_COMBINED"
-    }
-    allowed_values {
-      list {
-        s: "MIN_COMBINED"
-        s: "MIN_FIRST"
-        s: "SCALED"
-      }
-    }
+    name: "boundaries"
+    type: "list(float)"
   }
 }
 op {
-  name: "DeserializeIterator"
+  name: "BytesProducedStatsDataset"
   input_arg {
-    name: "resource_handle"
-    type: DT_RESOURCE
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "serialized"
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
     type: DT_VARIANT
   }
-  is_stateful: true
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
 }
 op {
-  name: "DeserializeManySparse"
+  name: "CTCBeamSearchDecoder"
   input_arg {
-    name: "serialized_sparse"
-    type: DT_STRING
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sequence_length"
+    type: DT_INT32
   }
   output_arg {
-    name: "sparse_indices"
+    name: "decoded_indices"
     type: DT_INT64
+    number_attr: "top_paths"
   }
   output_arg {
-    name: "sparse_values"
-    type_attr: "dtype"
+    name: "decoded_values"
+    type: DT_INT64
+    number_attr: "top_paths"
   }
   output_arg {
-    name: "sparse_shape"
+    name: "decoded_shape"
     type: DT_INT64
+    number_attr: "top_paths"
+  }
+  output_arg {
+    name: "log_probability"
+    type: DT_FLOAT
   }
   attr {
-    name: "dtype"
-    type: "type"
+    name: "beam_width"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "top_paths"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "merge_repeated"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
 }
 op {
-  name: "DeserializeSparse"
+  name: "CTCGreedyDecoder"
   input_arg {
-    name: "serialized_sparse"
-    type: DT_STRING
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sequence_length"
+    type: DT_INT32
   }
   output_arg {
-    name: "sparse_indices"
+    name: "decoded_indices"
     type: DT_INT64
   }
   output_arg {
-    name: "sparse_values"
-    type_attr: "dtype"
+    name: "decoded_values"
+    type: DT_INT64
   }
   output_arg {
-    name: "sparse_shape"
+    name: "decoded_shape"
     type: DT_INT64
   }
+  output_arg {
+    name: "log_probability"
+    type: DT_FLOAT
+  }
   attr {
-    name: "dtype"
-    type: "type"
+    name: "merge_repeated"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
 }
 op {
-  name: "DestroyResourceOp"
+  name: "CTCLoss"
   input_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "labels_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "labels_values"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "sequence_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "loss"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient"
+    type: DT_FLOAT
   }
   attr {
-    name: "ignore_lookup_error"
+    name: "preprocess_collapse_repeated"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "ctc_merge_repeated"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "CTCLoss"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "labels_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "labels_values"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "sequence_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "loss"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "preprocess_collapse_repeated"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "ctc_merge_repeated"
     type: "bool"
     default_value {
       b: true
     }
   }
+  attr {
+    name: "ignore_longer_outputs_than_inputs"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "CacheDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
   is_stateful: true
 }
 op {
-  name: "DestroyTemporaryVariable"
+  name: "CacheDataset"
   input_arg {
-    name: "ref"
-    type_attr: "T"
-    is_ref: true
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
   }
   output_arg {
-    name: "value"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "var_name"
-    type: "string"
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "Diag"
+  name: "Cast"
   input_arg {
-    name: "diagonal"
-    type_attr: "T"
+    name: "x"
+    type_attr: "SrcT"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "y"
+    type_attr: "DstT"
   }
   attr {
-    name: "T"
+    name: "SrcT"
+    type: "type"
+  }
+  attr {
+    name: "DstT"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
   }
 }
 op {
-  name: "DiagPart"
+  name: "Ceil"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "diagonal"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -10125,18 +10715,15 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Digamma"
+  name: "Ceil"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -10151,6 +10738,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -10158,13 +10746,9 @@ op {
   }
 }
 op {
-  name: "Dilation2D"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
+  name: "CheckNumerics"
   input_arg {
-    name: "filter"
+    name: "tensor"
     type_attr: "T"
   }
   output_arg {
@@ -10176,49 +10760,21 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "rates"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
+    name: "message"
     type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
   }
 }
 op {
-  name: "Dilation2D"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
+  name: "CheckNumerics"
   input_arg {
-    name: "filter"
+    name: "tensor"
     type_attr: "T"
   }
   output_arg {
@@ -10230,59 +10786,26 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "rates"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
+    name: "message"
     type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
   }
 }
 op {
-  name: "Dilation2DBackpropFilter"
+  name: "Cholesky"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "out_backprop"
-    type_attr: "T"
-  }
   output_arg {
-    name: "filter_backprop"
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -10290,57 +10813,47 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
+        type: DT_FLOAT
       }
     }
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+}
+op {
+  name: "Cholesky"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "rates"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Dilation2DBackpropFilter"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
+  name: "CholeskyGrad"
   input_arg {
-    name: "filter"
+    name: "l"
     type_attr: "T"
   }
   input_arg {
-    name: "out_backprop"
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
-    name: "filter_backprop"
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -10350,209 +10863,107 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "rates"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
       }
     }
   }
 }
 op {
-  name: "Dilation2DBackpropInput"
+  name: "CompareAndBitpack"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "out_backprop"
+    name: "threshold"
     type_attr: "T"
   }
   output_arg {
-    name: "in_backprop"
-    type_attr: "T"
+    name: "output"
+    type: DT_UINT8
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_BOOL
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "rates"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
       }
     }
   }
 }
 op {
-  name: "Dilation2DBackpropInput"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
+  name: "Complex"
   input_arg {
-    name: "filter"
+    name: "real"
     type_attr: "T"
   }
   input_arg {
-    name: "out_backprop"
+    name: "imag"
     type_attr: "T"
   }
   output_arg {
-    name: "in_backprop"
-    type_attr: "T"
+    name: "out"
+    type_attr: "Tout"
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "rates"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
-    type: "string"
+    name: "Tout"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Div"
+  name: "ComplexAbs"
   input_arg {
     name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
   output_arg {
-    name: "z"
-    type_attr: "T"
+    name: "y"
+    type_attr: "Tout"
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
     }
   }
-}
-op {
-  name: "DrawBoundingBoxes"
-  input_arg {
-    name: "images"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "boxes"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "Tout"
     type: "type"
     default_value {
       type: DT_FLOAT
@@ -10560,58 +10971,72 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_HALF
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "DynamicPartition"
+  name: "ComputeAccidentalHits"
   input_arg {
-    name: "data"
-    type_attr: "T"
+    name: "true_classes"
+    type: DT_INT64
   }
   input_arg {
-    name: "partitions"
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "indices"
     type: DT_INT32
   }
   output_arg {
-    name: "outputs"
-    type_attr: "T"
-    number_attr: "num_partitions"
+    name: "ids"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "weights"
+    type: DT_FLOAT
   }
   attr {
-    name: "num_partitions"
+    name: "num_true"
     type: "int"
-    has_minimum: true
-    minimum: 1
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
 }
 op {
-  name: "DynamicStitch"
+  name: "Concat"
   input_arg {
-    name: "indices"
+    name: "concat_dim"
     type: DT_INT32
-    number_attr: "N"
   }
   input_arg {
-    name: "data"
+    name: "values"
     type_attr: "T"
     number_attr: "N"
   }
   output_arg {
-    name: "merged"
+    name: "output"
     type_attr: "T"
   }
   attr {
     name: "N"
     type: "int"
     has_minimum: true
-    minimum: 1
+    minimum: 2
   }
   attr {
     name: "T"
@@ -10619,302 +11044,387 @@ op {
   }
 }
 op {
-  name: "EditDistance"
+  name: "ConcatOffset"
   input_arg {
-    name: "hypothesis_indices"
-    type: DT_INT64
+    name: "concat_dim"
+    type: DT_INT32
   }
   input_arg {
-    name: "hypothesis_values"
-    type_attr: "T"
+    name: "shape"
+    type: DT_INT32
+    number_attr: "N"
   }
-  input_arg {
-    name: "hypothesis_shape"
-    type: DT_INT64
+  output_arg {
+    name: "offset"
+    type: DT_INT32
+    number_attr: "N"
   }
-  input_arg {
-    name: "truth_indices"
-    type: DT_INT64
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
   }
+}
+op {
+  name: "ConcatV2"
   input_arg {
-    name: "truth_values"
+    name: "values"
     type_attr: "T"
+    number_attr: "N"
   }
   input_arg {
-    name: "truth_shape"
-    type: DT_INT64
+    name: "axis"
+    type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    type: DT_FLOAT
+    type_attr: "T"
   }
   attr {
-    name: "normalize"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
   }
   attr {
     name: "T"
     type: "type"
   }
-}
-op {
-  name: "Elu"
-  input_arg {
-    name: "features"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "activations"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "Tidx"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_HALF
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "Elu"
+  name: "ConcatenateDataset"
   input_arg {
-    name: "features"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "another_dataset"
+    type: DT_VARIANT
   }
   output_arg {
-    name: "activations"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
+  is_stateful: true
 }
 op {
-  name: "EluGrad"
+  name: "ConcatenateDataset"
   input_arg {
-    name: "gradients"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "outputs"
-    type_attr: "T"
+    name: "another_dataset"
+    type: DT_VARIANT
   }
   output_arg {
-    name: "backprops"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "T"
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ConditionalAccumulator"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
       }
     }
   }
-}
-op {
-  name: "EluGrad"
-  input_arg {
-    name: "gradients"
-    type_attr: "T"
+  attr {
+    name: "shape"
+    type: "shape"
   }
-  input_arg {
-    name: "outputs"
-    type_attr: "T"
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
+  is_stateful: true
+}
+op {
+  name: "ConditionalAccumulator"
   output_arg {
-    name: "backprops"
-    type_attr: "T"
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-}
-op {
-  name: "EncodeBase64"
-  input_arg {
-    name: "input"
-    type: DT_STRING
+  attr {
+    name: "shape"
+    type: "shape"
   }
-  output_arg {
-    name: "output"
-    type: DT_STRING
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "pad"
-    type: "bool"
+    name: "shared_name"
+    type: "string"
     default_value {
-      b: false
+      s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "EncodeJpeg"
-  input_arg {
-    name: "image"
-    type: DT_UINT8
-  }
+  name: "ConditionalAccumulator"
   output_arg {
-    name: "contents"
+    name: "handle"
     type: DT_STRING
+    is_ref: true
   }
   attr {
-    name: "format"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "dtype"
+    type: "type"
     allowed_values {
       list {
-        s: ""
-        s: "grayscale"
-        s: "rgb"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "quality"
-    type: "int"
-    default_value {
-      i: 95
-    }
+    name: "shape"
+    type: "shape"
   }
   attr {
-    name: "progressive"
-    type: "bool"
+    name: "container"
+    type: "string"
     default_value {
-      b: false
+      s: ""
     }
   }
   attr {
-    name: "optimize_size"
-    type: "bool"
+    name: "shared_name"
+    type: "string"
     default_value {
-      b: false
+      s: ""
     }
   }
-  attr {
-    name: "chroma_downsampling"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  is_stateful: true
+}
+op {
+  name: "ConditionalAccumulator"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
   attr {
-    name: "density_unit"
-    type: "string"
-    default_value {
-      s: "in"
-    }
+    name: "dtype"
+    type: "type"
     allowed_values {
       list {
-        s: "in"
-        s: "cm"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "x_density"
-    type: "int"
-    default_value {
-      i: 300
-    }
+    name: "shape"
+    type: "shape"
   }
   attr {
-    name: "y_density"
-    type: "int"
+    name: "container"
+    type: "string"
     default_value {
-      i: 300
+      s: ""
     }
   }
   attr {
-    name: "xmp_metadata"
+    name: "shared_name"
     type: "string"
     default_value {
       s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "EncodePng"
+  name: "Conj"
   input_arg {
-    name: "image"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "contents"
-    type: DT_STRING
-  }
-  attr {
-    name: "compression"
-    type: "int"
-    default_value {
-      i: -1
-    }
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     default_value {
-      type: DT_UINT8
+      type: DT_COMPLEX64
     }
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "EncodeWav"
-  input_arg {
-    name: "audio"
-    type: DT_FLOAT
-  }
+  name: "Conj"
   input_arg {
-    name: "sample_rate"
-    type: DT_INT32
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "contents"
-    type: DT_STRING
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_VARIANT
+      }
+    }
   }
 }
 op {
-  name: "Enter"
+  name: "ConjugateTranspose"
   input_arg {
-    name: "data"
+    name: "x"
     type_attr: "T"
   }
+  input_arg {
+    name: "perm"
+    type_attr: "Tperm"
+  }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -10922,37 +11432,50 @@ op {
     type: "type"
   }
   attr {
-    name: "frame_name"
-    type: "string"
-  }
-  attr {
-    name: "is_constant"
-    type: "bool"
+    name: "Tperm"
+    type: "type"
     default_value {
-      b: false
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
+}
+op {
+  name: "Const"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
   attr {
-    name: "parallel_iterations"
-    type: "int"
-    default_value {
-      i: 10
-    }
+    name: "value"
+    type: "tensor"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
   }
 }
 op {
-  name: "Equal"
+  name: "ControlTrigger"
+}
+op {
+  name: "Conv2D"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "filter"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -10961,54 +11484,56 @@ op {
       list {
         type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_QUINT8
-        type: DT_QINT8
-        type: DT_QINT32
-        type: DT_STRING
-        type: DT_BOOL
-        type: DT_COMPLEX128
       }
     }
   }
-  is_commutative: true
-}
-op {
-  name: "Erf"
-  input_arg {
-    name: "x"
-    type_attr: "T"
+  attr {
+    name: "strides"
+    type: "list(int)"
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
 }
 op {
-  name: "Erfc"
+  name: "Conv2D"
   input_arg {
-    name: "x"
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -11017,35 +11542,74 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
-}
-op {
-  name: "Exit"
-  input_arg {
-    name: "data"
-    type_attr: "T"
+  attr {
+    name: "strides"
+    type: "list(int)"
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
   }
 }
 op {
-  name: "Exp"
+  name: "Conv2DBackpropFilter"
   input_arg {
-    name: "x"
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -11055,22 +11619,57 @@ op {
       list {
         type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
 }
 op {
-  name: "ExpandDims"
+  name: "Conv2DBackpropFilter"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "dim"
-    type_attr: "Tdim"
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
@@ -11079,29 +11678,77 @@ op {
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
   }
   attr {
-    name: "Tdim"
-    type: "type"
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
     default_value {
-      type: DT_INT32
+      b: true
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
     }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
       }
     }
   }
 }
 op {
-  name: "Expm1"
+  name: "Conv2DBackpropInput"
   input_arg {
-    name: "x"
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -11111,136 +11758,174 @@ op {
       list {
         type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
-}
-op {
-  name: "ExtractGlimpse"
-  input_arg {
-    name: "input"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "size"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "offsets"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "glimpse"
-    type: DT_FLOAT
+  attr {
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "centered"
+    name: "use_cudnn_on_gpu"
     type: "bool"
     default_value {
       b: true
     }
   }
   attr {
-    name: "normalized"
-    type: "bool"
-    default_value {
-      b: true
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "uniform_noise"
-    type: "bool"
+    name: "data_format"
+    type: "string"
     default_value {
-      b: true
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
 }
 op {
-  name: "ExtractImagePatches"
+  name: "Conv2DBackpropInput"
   input_arg {
-    name: "images"
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
     type_attr: "T"
   }
   output_arg {
-    name: "patches"
+    name: "output"
     type_attr: "T"
   }
   attr {
-    name: "ksizes"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
   }
   attr {
     name: "strides"
     type: "list(int)"
-    has_minimum: true
-    minimum: 4
   }
   attr {
-    name: "rates"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "padding"
+    name: "data_format"
     type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
       }
     }
   }
 }
 op {
-  name: "ExtractImagePatches"
+  name: "Conv3D"
   input_arg {
-    name: "images"
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
     type_attr: "T"
   }
   output_arg {
-    name: "patches"
+    name: "output"
     type_attr: "T"
   }
   attr {
-    name: "ksizes"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
   attr {
     name: "strides"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
-    name: "rates"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Conv3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -11249,18 +11934,15 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
   attr {
     name: "padding"
     type: "string"
@@ -11271,3740 +11953,24082 @@ op {
       }
     }
   }
-}
-op {
-  name: "ExtractJpegShape"
-  input_arg {
-    name: "contents"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "image_shape"
-    type_attr: "output_type"
-  }
   attr {
-    name: "output_type"
-    type: "type"
+    name: "data_format"
+    type: "string"
     default_value {
-      type: DT_INT32
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
 }
 op {
-  name: "FFT"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
-  }
-}
-op {
-  name: "FFT2D"
+  name: "Conv3D"
   input_arg {
     name: "input"
-    type: DT_COMPLEX64
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
+    type_attr: "T"
   }
-}
-op {
-  name: "FFT3D"
   input_arg {
-    name: "input"
-    type: DT_COMPLEX64
+    name: "filter"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type: DT_COMPLEX64
-  }
-}
-op {
-  name: "FIFOQueue"
-  output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    type_attr: "T"
   }
   attr {
-    name: "shapes"
-    type: "list(shape)"
-    default_value {
+    name: "T"
+    type: "type"
+    allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
     has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: -1
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "container"
+    name: "data_format"
     type: "string"
     default_value {
-      s: ""
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "dilations"
+    type: "list(int)"
     default_value {
-      s: ""
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "FIFOQueueV2"
-  output_arg {
-    name: "handle"
-    type: DT_RESOURCE
+  name: "Conv3DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "shapes"
-    type: "list(shape)"
-    default_value {
+    name: "T"
+    type: "type"
+    allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
-    has_minimum: true
   }
   attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: -1
-    }
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "container"
+    name: "padding"
     type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  deprecation {
+    version: 10
   }
-  is_stateful: true
 }
 op {
-  name: "Fact"
-  output_arg {
-    name: "fact"
-    type: DT_STRING
+  name: "Conv3DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-}
-op {
-  name: "FakeQuantWithMinMaxArgs"
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
   }
   output_arg {
-    name: "outputs"
-    type: DT_FLOAT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "min"
-    type: "float"
-    default_value {
-      f: -6
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
   attr {
-    name: "max"
-    type: "float"
-    default_value {
-      f: 6
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
+  deprecation {
+    version: 10
+  }
 }
 op {
-  name: "FakeQuantWithMinMaxArgs"
+  name: "Conv3DBackpropFilterV2"
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
   }
   output_arg {
-    name: "outputs"
-    type: DT_FLOAT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "min"
-    type: "float"
-    default_value {
-      f: -6
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
   attr {
-    name: "max"
-    type: "float"
-    default_value {
-      f: 6
-    }
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
 }
 op {
-  name: "FakeQuantWithMinMaxArgs"
+  name: "Conv3DBackpropFilterV2"
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
   }
   output_arg {
-    name: "outputs"
-    type: DT_FLOAT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "min"
-    type: "float"
-    default_value {
-      f: -6
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
   attr {
-    name: "max"
-    type: "float"
-    default_value {
-      f: 6
-    }
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "narrow_range"
-    type: "bool"
+    name: "data_format"
+    type: "string"
     default_value {
-      b: false
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
     }
   }
 }
 op {
-  name: "FakeQuantWithMinMaxArgsGradient"
+  name: "Conv3DBackpropFilterV2"
   input_arg {
-    name: "gradients"
-    type: DT_FLOAT
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
   }
   output_arg {
-    name: "backprops"
-    type: DT_FLOAT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "min"
-    type: "float"
-    default_value {
-      f: -6
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
   attr {
-    name: "max"
-    type: "float"
-    default_value {
-      f: 6
-    }
-  }
-}
-op {
-  name: "FakeQuantWithMinMaxArgsGradient"
-  input_arg {
-    name: "gradients"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "backprops"
-    type: DT_FLOAT
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "min"
-    type: "float"
-    default_value {
-      f: -6
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "max"
-    type: "float"
+    name: "data_format"
+    type: "string"
     default_value {
-      f: 6
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
     }
   }
   attr {
-    name: "num_bits"
-    type: "int"
+    name: "dilations"
+    type: "list(int)"
     default_value {
-      i: 8
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
     }
   }
 }
 op {
-  name: "FakeQuantWithMinMaxArgsGradient"
+  name: "Conv3DBackpropInput"
   input_arg {
-    name: "gradients"
-    type: DT_FLOAT
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
   }
   output_arg {
-    name: "backprops"
-    type: DT_FLOAT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "min"
-    type: "float"
-    default_value {
-      f: -6
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
   attr {
-    name: "max"
-    type: "float"
-    default_value {
-      f: 6
-    }
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
-  attr {
-    name: "narrow_range"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  deprecation {
+    version: 10
   }
 }
 op {
-  name: "FakeQuantWithMinMaxVars"
+  name: "Conv3DBackpropInput"
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "min"
-    type: DT_FLOAT
+    name: "filter"
+    type_attr: "T"
   }
   input_arg {
-    name: "max"
-    type: DT_FLOAT
+    name: "out_backprop"
+    type_attr: "T"
   }
   output_arg {
-    name: "outputs"
-    type: DT_FLOAT
-  }
-}
-op {
-  name: "FakeQuantWithMinMaxVars"
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "min"
-    type: DT_FLOAT
+    name: "output"
+    type_attr: "T"
   }
-  input_arg {
-    name: "max"
-    type: DT_FLOAT
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
-  output_arg {
-    name: "outputs"
-    type: DT_FLOAT
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
+  deprecation {
+    version: 10
+  }
 }
 op {
-  name: "FakeQuantWithMinMaxVars"
+  name: "Conv3DBackpropInputV2"
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+    name: "input_sizes"
+    type: DT_INT32
   }
   input_arg {
-    name: "min"
-    type: DT_FLOAT
+    name: "filter"
+    type_attr: "T"
   }
   input_arg {
-    name: "max"
-    type: DT_FLOAT
+    name: "out_backprop"
+    type_attr: "T"
   }
   output_arg {
-    name: "outputs"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
-    }
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "narrow_range"
-    type: "bool"
-    default_value {
-      b: false
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
-}
-op {
-  name: "FakeQuantWithMinMaxVarsGradient"
-  input_arg {
-    name: "gradients"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "min"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "max"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "backprops_wrt_input"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "backprop_wrt_min"
-    type: DT_FLOAT
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
-  output_arg {
-    name: "backprop_wrt_max"
-    type: DT_FLOAT
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
 }
 op {
-  name: "FakeQuantWithMinMaxVarsGradient"
-  input_arg {
-    name: "gradients"
-    type: DT_FLOAT
-  }
+  name: "Conv3DBackpropInputV2"
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+    name: "input_sizes"
+    type: DT_INT32
   }
   input_arg {
-    name: "min"
-    type: DT_FLOAT
+    name: "filter"
+    type_attr: "T"
   }
   input_arg {
-    name: "max"
-    type: DT_FLOAT
+    name: "out_backprop"
+    type_attr: "T"
   }
   output_arg {
-    name: "backprops_wrt_input"
-    type: DT_FLOAT
+    name: "output"
+    type_attr: "T"
   }
-  output_arg {
-    name: "backprop_wrt_min"
-    type: DT_FLOAT
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
-  output_arg {
-    name: "backprop_wrt_max"
-    type: DT_FLOAT
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "num_bits"
-    type: "int"
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
     default_value {
-      i: 8
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
     }
   }
 }
 op {
-  name: "FakeQuantWithMinMaxVarsGradient"
-  input_arg {
-    name: "gradients"
-    type: DT_FLOAT
-  }
+  name: "Conv3DBackpropInputV2"
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+    name: "input_sizes"
+    type: DT_INT32
   }
   input_arg {
-    name: "min"
-    type: DT_FLOAT
+    name: "filter"
+    type_attr: "T"
   }
   input_arg {
-    name: "max"
-    type: DT_FLOAT
+    name: "out_backprop"
+    type_attr: "T"
   }
   output_arg {
-    name: "backprops_wrt_input"
-    type: DT_FLOAT
+    name: "output"
+    type_attr: "T"
   }
-  output_arg {
-    name: "backprop_wrt_min"
-    type: DT_FLOAT
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
-  output_arg {
-    name: "backprop_wrt_max"
-    type: DT_FLOAT
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "num_bits"
-    type: "int"
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
     default_value {
-      i: 8
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
     }
   }
   attr {
-    name: "narrow_range"
-    type: "bool"
+    name: "dilations"
+    type: "list(int)"
     default_value {
-      b: false
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
     }
   }
 }
 op {
-  name: "FakeQuantWithMinMaxVarsPerChannel"
+  name: "Copy"
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+    name: "input"
+    type_attr: "T"
   }
-  input_arg {
-    name: "min"
-    type: DT_FLOAT
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
-  input_arg {
-    name: "max"
-    type: DT_FLOAT
+  attr {
+    name: "T"
+    type: "type"
   }
-  output_arg {
-    name: "outputs"
-    type: DT_FLOAT
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
+  allows_uninitialized_input: true
 }
 op {
-  name: "FakeQuantWithMinMaxVarsPerChannel"
+  name: "Copy"
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+    name: "input"
+    type_attr: "T"
   }
-  input_arg {
-    name: "min"
-    type: DT_FLOAT
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
-  input_arg {
-    name: "max"
-    type: DT_FLOAT
+  attr {
+    name: "T"
+    type: "type"
   }
-  output_arg {
-    name: "outputs"
-    type: DT_FLOAT
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "num_bits"
-    type: "int"
+    name: "debug_ops_spec"
+    type: "list(string)"
     default_value {
-      i: 8
+      list {
+      }
     }
   }
+  allows_uninitialized_input: true
 }
 op {
-  name: "FakeQuantWithMinMaxVarsPerChannel"
+  name: "CopyHost"
   input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+    name: "input"
+    type_attr: "T"
   }
-  input_arg {
-    name: "min"
-    type: DT_FLOAT
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
+  allows_uninitialized_input: true
+}
+op {
+  name: "CopyHost"
   input_arg {
-    name: "max"
-    type: DT_FLOAT
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "outputs"
-    type: DT_FLOAT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "num_bits"
-    type: "int"
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
     default_value {
-      i: 8
+      s: ""
     }
   }
   attr {
-    name: "narrow_range"
-    type: "bool"
+    name: "debug_ops_spec"
+    type: "list(string)"
     default_value {
-      b: false
+      list {
+      }
     }
   }
+  allows_uninitialized_input: true
 }
 op {
-  name: "FakeQuantWithMinMaxVarsPerChannelGradient"
-  input_arg {
-    name: "gradients"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "min"
-    type: DT_FLOAT
-  }
+  name: "Cos"
   input_arg {
-    name: "max"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "backprops_wrt_input"
-    type: DT_FLOAT
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "backprop_wrt_min"
-    type: DT_FLOAT
+    name: "y"
+    type_attr: "T"
   }
-  output_arg {
-    name: "backprop_wrt_max"
-    type: DT_FLOAT
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
 }
 op {
-  name: "FakeQuantWithMinMaxVarsPerChannelGradient"
+  name: "Cos"
   input_arg {
-    name: "gradients"
-    type: DT_FLOAT
+    name: "x"
+    type_attr: "T"
   }
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
-  input_arg {
-    name: "min"
-    type: DT_FLOAT
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
+}
+op {
+  name: "Cosh"
   input_arg {
-    name: "max"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "backprops_wrt_input"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "backprop_wrt_min"
-    type: DT_FLOAT
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "backprop_wrt_max"
-    type: DT_FLOAT
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
   }
 }
 op {
-  name: "FakeQuantWithMinMaxVarsPerChannelGradient"
+  name: "Cosh"
   input_arg {
-    name: "gradients"
-    type: DT_FLOAT
+    name: "x"
+    type_attr: "T"
   }
-  input_arg {
-    name: "inputs"
-    type: DT_FLOAT
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
-  input_arg {
-    name: "min"
-    type: DT_FLOAT
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
+}
+op {
+  name: "CountUpTo"
   input_arg {
-    name: "max"
-    type: DT_FLOAT
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
   }
   output_arg {
-    name: "backprops_wrt_input"
-    type: DT_FLOAT
+    name: "output"
+    type_attr: "T"
   }
-  output_arg {
-    name: "backprop_wrt_min"
-    type: DT_FLOAT
+  attr {
+    name: "limit"
+    type: "int"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
+}
+op {
+  name: "CriticalSectionOp"
   output_arg {
-    name: "backprop_wrt_max"
-    type: DT_FLOAT
+    name: "resource"
+    type: DT_RESOURCE
   }
   attr {
-    name: "num_bits"
-    type: "int"
+    name: "container"
+    type: "string"
     default_value {
-      i: 8
+      s: ""
     }
   }
   attr {
-    name: "narrow_range"
-    type: "bool"
+    name: "shared_name"
+    type: "string"
     default_value {
-      b: false
+      s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "FakeQueue"
+  name: "CropAndResize"
   input_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "image"
+    type_attr: "T"
   }
-  output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
   }
-  is_stateful: true
-}
-op {
-  name: "Fill"
   input_arg {
-    name: "dims"
+    name: "box_ind"
     type: DT_INT32
   }
   input_arg {
-    name: "value"
-    type_attr: "T"
+    name: "crop_size"
+    type: DT_INT32
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "crops"
+    type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
-  }
-}
-op {
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "method"
+    type: "string"
+    default_value {
+      s: "bilinear"
+    }
+    allowed_values {
+      list {
+        s: "bilinear"
+      }
+    }
+  }
+  attr {
+    name: "extrapolation_value"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+}
+op {
+  name: "CropAndResize"
+  input_arg {
+    name: "image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "crop_size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "crops"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "method"
+    type: "string"
+    default_value {
+      s: "bilinear"
+    }
+    allowed_values {
+      list {
+        s: "bilinear"
+      }
+    }
+  }
+  attr {
+    name: "extrapolation_value"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+}
+op {
+  name: "CropAndResizeGradBoxes"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "method"
+    type: "string"
+    default_value {
+      s: "bilinear"
+    }
+    allowed_values {
+      list {
+        s: "bilinear"
+      }
+    }
+  }
+}
+op {
+  name: "CropAndResizeGradBoxes"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "method"
+    type: "string"
+    default_value {
+      s: "bilinear"
+    }
+    allowed_values {
+      list {
+        s: "bilinear"
+      }
+    }
+  }
+}
+op {
+  name: "CropAndResizeGradImage"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "box_ind"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "image_size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "method"
+    type: "string"
+    default_value {
+      s: "bilinear"
+    }
+    allowed_values {
+      list {
+        s: "bilinear"
+      }
+    }
+  }
+}
+op {
+  name: "Cross"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "Cross"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Cross"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "Cross"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Cumprod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+  }
+  attr {
+    name: "exclusive"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Cumprod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+  }
+  attr {
+    name: "exclusive"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Cumprod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+  }
+  attr {
+    name: "exclusive"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Cumprod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+  }
+  attr {
+    name: "exclusive"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Cumsum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+  }
+  attr {
+    name: "exclusive"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Cumsum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+  }
+  attr {
+    name: "exclusive"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Cumsum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+  }
+  attr {
+    name: "exclusive"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Cumsum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+  }
+  attr {
+    name: "exclusive"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "DataFormatDimMap"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "src_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "dst_format"
+    type: "string"
+    default_value {
+      s: "NCHW"
+    }
+  }
+}
+op {
+  name: "DataFormatVecPermute"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "src_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "dst_format"
+    type: "string"
+    default_value {
+      s: "NCHW"
+    }
+  }
+}
+op {
+  name: "DatasetToSingleElement"
+  input_arg {
+    name: "dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "DebugGradientIdentity"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugGradientRefIdentity"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugIdentity"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugIdentity"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugIdentity"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "device_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugNanCount"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugNanCount"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugNanCount"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "device_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugNumericSummary"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_DOUBLE
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugNumericSummary"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_DOUBLE
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "lower_bound"
+    type: "float"
+    default_value {
+      f: -inf
+    }
+  }
+  attr {
+    name: "upper_bound"
+    type: "float"
+    default_value {
+      f: inf
+    }
+  }
+  attr {
+    name: "mute_if_healthy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugNumericSummary"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_DOUBLE
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "lower_bound"
+    type: "float"
+    default_value {
+      f: -inf
+    }
+  }
+  attr {
+    name: "upper_bound"
+    type: "float"
+    default_value {
+      f: inf
+    }
+  }
+  attr {
+    name: "mute_if_healthy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugNumericSummary"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_DOUBLE
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "device_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "lower_bound"
+    type: "float"
+    default_value {
+      f: -inf
+    }
+  }
+  attr {
+    name: "upper_bound"
+    type: "float"
+    default_value {
+      f: inf
+    }
+  }
+  attr {
+    name: "mute_if_healthy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DecodeAndCropJpeg"
+  input_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "crop_window"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "image"
+    type: DT_UINT8
+  }
+  attr {
+    name: "channels"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "ratio"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "fancy_upscaling"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "try_recover_truncated"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "acceptable_fraction"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "dct_method"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "DecodeBase64"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+}
+op {
+  name: "DecodeBmp"
+  input_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "image"
+    type: DT_UINT8
+  }
+  attr {
+    name: "channels"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "DecodeCSV"
+  input_arg {
+    name: "records"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "record_defaults"
+    type_list_attr: "OUT_TYPE"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "OUT_TYPE"
+  }
+  attr {
+    name: "OUT_TYPE"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "field_delim"
+    type: "string"
+    default_value {
+      s: ","
+    }
+  }
+}
+op {
+  name: "DecodeCSV"
+  input_arg {
+    name: "records"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "record_defaults"
+    type_list_attr: "OUT_TYPE"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "OUT_TYPE"
+  }
+  attr {
+    name: "OUT_TYPE"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "field_delim"
+    type: "string"
+    default_value {
+      s: ","
+    }
+  }
+  attr {
+    name: "use_quote_delim"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "DecodeCSV"
+  input_arg {
+    name: "records"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "record_defaults"
+    type_list_attr: "OUT_TYPE"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "OUT_TYPE"
+  }
+  attr {
+    name: "OUT_TYPE"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "field_delim"
+    type: "string"
+    default_value {
+      s: ","
+    }
+  }
+  attr {
+    name: "use_quote_delim"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "na_value"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "DecodeCSV"
+  input_arg {
+    name: "records"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "record_defaults"
+    type_list_attr: "OUT_TYPE"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "OUT_TYPE"
+  }
+  attr {
+    name: "OUT_TYPE"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "field_delim"
+    type: "string"
+    default_value {
+      s: ","
+    }
+  }
+  attr {
+    name: "use_quote_delim"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "na_value"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "DecodeCompressed"
+  input_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "compression_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "DecodeGif"
+  input_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "image"
+    type: DT_UINT8
+  }
+}
+op {
+  name: "DecodeJSONExample"
+  input_arg {
+    name: "json_examples"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "binary_examples"
+    type: DT_STRING
+  }
+}
+op {
+  name: "DecodeJpeg"
+  input_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "image"
+    type: DT_UINT8
+  }
+  attr {
+    name: "channels"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "ratio"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "fancy_upscaling"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "try_recover_truncated"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "acceptable_fraction"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "dct_method"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "DecodePng"
+  input_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "image"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "channels"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_UINT8
+    }
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+}
+op {
+  name: "DecodeRaw"
+  input_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "little_endian"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "DecodeRaw"
+  input_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT16
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "little_endian"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "DecodeWav"
+  input_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "audio"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sample_rate"
+    type: DT_INT32
+  }
+  attr {
+    name: "desired_channels"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "desired_samples"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "DeleteSessionTensor"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+}
+op {
+  name: "DeleteSessionTensor"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "DenseToDenseSetOperation"
+  input_arg {
+    name: "set1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "set2"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "result_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "result_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "result_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "set_operation"
+    type: "string"
+  }
+  attr {
+    name: "validate_indices"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_STRING
+      }
+    }
+  }
+}
+op {
+  name: "DenseToSparseBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "row_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "DenseToSparseBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "row_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "DenseToSparseSetOperation"
+  input_arg {
+    name: "set1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "set2_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "set2_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "set2_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "result_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "result_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "result_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "set_operation"
+    type: "string"
+  }
+  attr {
+    name: "validate_indices"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_STRING
+      }
+    }
+  }
+}
+op {
+  name: "DepthToSpace"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "block_size"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+}
+op {
+  name: "DepthToSpace"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "block_size"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNative"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNative"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNative"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNativeBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNativeBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNativeBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNativeBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNativeBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "DepthwiseConv2dNativeBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Dequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    default_value {
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+      }
+    }
+  }
+}
+op {
+  name: "Dequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    default_value {
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+        s: "SCALED"
+      }
+    }
+  }
+}
+op {
+  name: "Dequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    default_value {
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+        s: "SCALED"
+      }
+    }
+  }
+}
+op {
+  name: "DeserializeIterator"
+  input_arg {
+    name: "resource_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "serialized"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "DeserializeManySparse"
+  input_arg {
+    name: "serialized_sparse"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sparse_values"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+}
+op {
+  name: "DeserializeSparse"
+  input_arg {
+    name: "serialized_sparse"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sparse_values"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+}
+op {
+  name: "DeserializeSparse"
+  input_arg {
+    name: "serialized_sparse"
+    type_attr: "Tserialized"
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sparse_values"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "Tserialized"
+    type: "type"
+    default_value {
+      type: DT_STRING
+    }
+    allowed_values {
+      list {
+        type: DT_STRING
+        type: DT_VARIANT
+      }
+    }
+  }
+}
+op {
+  name: "DestroyResourceOp"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "ignore_lookup_error"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "DestroyTemporaryVariable"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "var_name"
+    type: "string"
+  }
+}
+op {
+  name: "Diag"
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Diag"
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "DiagPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "DiagPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Digamma"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Digamma"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "filter_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "filter_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "filter_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2DBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "filter_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2DBackpropInput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "in_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2DBackpropInput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "in_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2DBackpropInput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "in_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Dilation2DBackpropInput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "in_backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Div"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Div"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "DrawBoundingBoxes"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "DynamicPartition"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "partitions"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "outputs"
+    type_attr: "T"
+    number_attr: "num_partitions"
+  }
+  attr {
+    name: "num_partitions"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "DynamicStitch"
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "data"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "merged"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "EagerPyFunc"
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "token"
+    type: "string"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  is_stateful: true
+}
+op {
+  name: "EditDistance"
+  input_arg {
+    name: "hypothesis_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "hypothesis_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "hypothesis_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "truth_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "truth_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "truth_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "normalize"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "Elu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "Elu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Elu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "EluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "outputs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "EluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "outputs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "EluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "outputs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "EmptyTensorList"
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "EncodeBase64"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "pad"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "EncodeJpeg"
+  input_arg {
+    name: "image"
+    type: DT_UINT8
+  }
+  output_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+  attr {
+    name: "format"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    allowed_values {
+      list {
+        s: ""
+        s: "grayscale"
+        s: "rgb"
+      }
+    }
+  }
+  attr {
+    name: "quality"
+    type: "int"
+    default_value {
+      i: 95
+    }
+  }
+  attr {
+    name: "progressive"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "optimize_size"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "chroma_downsampling"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "density_unit"
+    type: "string"
+    default_value {
+      s: "in"
+    }
+    allowed_values {
+      list {
+        s: "in"
+        s: "cm"
+      }
+    }
+  }
+  attr {
+    name: "x_density"
+    type: "int"
+    default_value {
+      i: 300
+    }
+  }
+  attr {
+    name: "y_density"
+    type: "int"
+    default_value {
+      i: 300
+    }
+  }
+  attr {
+    name: "xmp_metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "EncodePng"
+  input_arg {
+    name: "image"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+  attr {
+    name: "compression"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_UINT8
+    }
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+}
+op {
+  name: "EncodeWav"
+  input_arg {
+    name: "audio"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sample_rate"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+}
+op {
+  name: "EnqueueInQueueDataset"
+  input_arg {
+    name: "queue"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "components"
+    type_list_attr: "Tcomponents"
+  }
+  attr {
+    name: "Tcomponents"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "Enter"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "frame_name"
+    type: "string"
+  }
+  attr {
+    name: "is_constant"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "parallel_iterations"
+    type: "int"
+    default_value {
+      i: 10
+    }
+  }
+}
+op {
+  name: "Equal"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Equal"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Erf"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Erf"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Erfc"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Erfc"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "ExecuteInCriticalSection"
+  input_arg {
+    name: "critical_section"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "outputs"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExecuteInCriticalSection"
+  input_arg {
+    name: "critical_section"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "outputs"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  is_stateful: true
+}
+op {
+  name: "Exit"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "Exp"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Exp"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "ExpandDims"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dim"
+    type_attr: "Tdim"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tdim"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Expm1"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Expm1"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "ExtractGlimpse"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "offsets"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "glimpse"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "centered"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "normalized"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "uniform_noise"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "ExtractImagePatches"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "patches"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksizes"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "ExtractImagePatches"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "patches"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksizes"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "ExtractImagePatches"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "patches"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksizes"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "ExtractImagePatches"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "patches"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksizes"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "rates"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "ExtractJpegShape"
+  input_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "image_shape"
+    type_attr: "output_type"
+  }
+  attr {
+    name: "output_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "FFT"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+}
+op {
+  name: "FFT2D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+}
+op {
+  name: "FFT3D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+}
+op {
+  name: "FIFOQueue"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "FIFOQueueV2"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Fact"
+  output_arg {
+    name: "fact"
+    type: DT_STRING
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxArgs"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "min"
+    type: "float"
+    default_value {
+      f: -6
+    }
+  }
+  attr {
+    name: "max"
+    type: "float"
+    default_value {
+      f: 6
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxArgs"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "min"
+    type: "float"
+    default_value {
+      f: -6
+    }
+  }
+  attr {
+    name: "max"
+    type: "float"
+    default_value {
+      f: 6
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxArgs"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "min"
+    type: "float"
+    default_value {
+      f: -6
+    }
+  }
+  attr {
+    name: "max"
+    type: "float"
+    default_value {
+      f: 6
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "narrow_range"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxArgsGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "min"
+    type: "float"
+    default_value {
+      f: -6
+    }
+  }
+  attr {
+    name: "max"
+    type: "float"
+    default_value {
+      f: 6
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxArgsGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "min"
+    type: "float"
+    default_value {
+      f: -6
+    }
+  }
+  attr {
+    name: "max"
+    type: "float"
+    default_value {
+      f: 6
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxArgsGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "min"
+    type: "float"
+    default_value {
+      f: -6
+    }
+  }
+  attr {
+    name: "max"
+    type: "float"
+    default_value {
+      f: 6
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "narrow_range"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVars"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVars"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVars"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "narrow_range"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops_wrt_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_max"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops_wrt_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops_wrt_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "narrow_range"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsPerChannel"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsPerChannel"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsPerChannel"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "narrow_range"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsPerChannelGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops_wrt_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_max"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsPerChannelGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops_wrt_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsPerChannelGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops_wrt_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "narrow_range"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "FakeQueue"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  is_stateful: true
+}
+op {
+  name: "Fill"
+  input_arg {
+    name: "dims"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "Fill"
+  input_arg {
+    name: "dims"
+    type_attr: "index_type"
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "index_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "FilterDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "predicate"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
   name: "FilterDataset"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "predicate"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "FixedLengthRecordDataset"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "header_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "record_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "footer_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "FixedLengthRecordReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "header_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "record_bytes"
+    type: "int"
+  }
+  attr {
+    name: "footer_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "FixedLengthRecordReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "header_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "record_bytes"
+    type: "int"
+  }
+  attr {
+    name: "footer_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "hop_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "FixedLengthRecordReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "header_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "record_bytes"
+    type: "int"
+  }
+  attr {
+    name: "footer_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "hop_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  deprecation {
+    version: 26
+  }
+  is_stateful: true
+}
+op {
+  name: "FixedLengthRecordReaderV2"
+  output_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "header_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "record_bytes"
+    type: "int"
+  }
+  attr {
+    name: "footer_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "FixedLengthRecordReaderV2"
+  output_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "header_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "record_bytes"
+    type: "int"
+  }
+  attr {
+    name: "footer_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "hop_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "FixedLengthRecordReaderV2"
+  output_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "header_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "record_bytes"
+    type: "int"
+  }
+  attr {
+    name: "footer_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "hop_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "encoding"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "FixedUnigramCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "vocab_file"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "distortion"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "num_reserved_ids"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+    default_value {
+      i: 1
+    }
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shard"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "unigrams"
+    type: "list(float)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "FixedUnigramCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "vocab_file"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "distortion"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "num_reserved_ids"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+    default_value {
+      i: 1
+    }
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shard"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "unigrams"
+    type: "list(float)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "FlatMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "FlatMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "Floor"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Floor"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "FloorDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "FloorDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "FloorMod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "FloorMod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "FractionalAvgPool"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "row_pooling_sequence"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "col_pooling_sequence"
+    type: DT_INT64
+  }
+  attr {
+    name: "pooling_ratio"
+    type: "list(float)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "pseudo_random"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "overlapping"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "deterministic"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "FractionalAvgPoolGrad"
+  input_arg {
+    name: "orig_input_tensor_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "row_pooling_sequence"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "col_pooling_sequence"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "overlapping"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "FractionalMaxPool"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "row_pooling_sequence"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "col_pooling_sequence"
+    type: DT_INT64
+  }
+  attr {
+    name: "pooling_ratio"
+    type: "list(float)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "pseudo_random"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "overlapping"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "deterministic"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "FractionalMaxPoolGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "row_pooling_sequence"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "col_pooling_sequence"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "overlapping"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "FusedBatchNorm"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "offset"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "mean"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "variance"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "batch_mean"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "batch_variance"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space_1"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space_2"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "epsilon"
+    type: "float"
+    default_value {
+      f: 0.0001
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "FusedBatchNormGrad"
+  input_arg {
+    name: "y_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reserve_space_1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reserve_space_2"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "x_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "scale_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "offset_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space_3"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space_4"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "epsilon"
+    type: "float"
+    default_value {
+      f: 0.0001
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "FusedBatchNormGradV2"
+  input_arg {
+    name: "y_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "reserve_space_1"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "reserve_space_2"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "x_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "scale_backprop"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "offset_backprop"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_3"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_4"
+    type_attr: "U"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "U"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "epsilon"
+    type: "float"
+    default_value {
+      f: 0.0001
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "FusedBatchNormGradV2"
+  input_arg {
+    name: "y_backprop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "reserve_space_1"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "reserve_space_2"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "x_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "scale_backprop"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "offset_backprop"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_3"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_4"
+    type_attr: "U"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "U"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "epsilon"
+    type: "float"
+    default_value {
+      f: 0.0001
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "FusedBatchNormV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "offset"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "mean"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "variance"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "batch_mean"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "batch_variance"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_1"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_2"
+    type_attr: "U"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "U"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "epsilon"
+    type: "float"
+    default_value {
+      f: 0.0001
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "FusedBatchNormV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "offset"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "mean"
+    type_attr: "U"
+  }
+  input_arg {
+    name: "variance"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "batch_mean"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "batch_variance"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_1"
+    type_attr: "U"
+  }
+  output_arg {
+    name: "reserve_space_2"
+    type_attr: "U"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "U"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "epsilon"
+    type: "float"
+    default_value {
+      f: 0.0001
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "FusedPadConv2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    allowed_values {
+      list {
+        s: "REFLECT"
+        s: "SYMMETRIC"
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "FusedResizeAndPadConv2D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "paddings"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "resize_align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    allowed_values {
+      list {
+        s: "REFLECT"
+        s: "SYMMETRIC"
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Gather"
+  input_arg {
+    name: "params"
+    type_attr: "Tparams"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tparams"
+  }
+  attr {
+    name: "validate_indices"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "Tparams"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "GatherNd"
+  input_arg {
+    name: "params"
+    type_attr: "Tparams"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tparams"
+  }
+  attr {
+    name: "Tparams"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "GatherV2"
+  input_arg {
+    name: "params"
+    type_attr: "Tparams"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Taxis"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tparams"
+  }
+  attr {
+    name: "Tparams"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Taxis"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "GenerateVocabRemapping"
+  input_arg {
+    name: "new_vocab_file"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "old_vocab_file"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "remapping"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "num_present"
+    type: DT_INT32
+  }
+  attr {
+    name: "new_vocab_offset"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_new_vocab"
+    type: "int"
+    has_minimum: true
+  }
+}
+op {
+  name: "GenerateVocabRemapping"
+  input_arg {
+    name: "new_vocab_file"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "old_vocab_file"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "remapping"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "num_present"
+    type: DT_INT32
+  }
+  attr {
+    name: "new_vocab_offset"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_new_vocab"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "old_vocab_size"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+}
+op {
+  name: "GetSessionHandle"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "GetSessionHandle"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 23
+  }
+}
+op {
+  name: "GetSessionHandle"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "GetSessionHandle"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "GetSessionHandleV2"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "GetSessionTensor"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+}
+op {
+  name: "GetSessionTensor"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "Greater"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "Greater"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Greater"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "Greater"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "GreaterEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "GreaterEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "GreaterEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "GreaterEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "GroupByWindowDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "window_size_func_other_arguments"
+    type_list_attr: "Twindow_size_func_other_arguments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "key_func"
+    type: "func"
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "window_size_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Twindow_size_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "GroupByWindowDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "window_size_func_other_arguments"
+    type_list_attr: "Twindow_size_func_other_arguments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "key_func"
+    type: "func"
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "window_size_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Twindow_size_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "GuaranteeConst"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "HSVToRGB"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "HSVToRGB"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "HashTable"
+  output_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "HashTableV2"
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "HistogramFixedWidth"
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "value_range"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "nbins"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "out"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "HistogramSummary"
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "HistogramSummary"
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "HistogramSummary"
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "HistogramSummary"
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "IFFT"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+}
+op {
+  name: "IFFT2D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+}
+op {
+  name: "IFFT3D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+}
+op {
+  name: "IRFFT"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  input_arg {
+    name: "fft_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "IRFFT2D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  input_arg {
+    name: "fft_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "IRFFT3D"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  input_arg {
+    name: "fft_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "Identity"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "IdentityN"
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "IdentityReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "IdentityReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  deprecation {
+    version: 26
+  }
+  is_stateful: true
+}
+op {
+  name: "IdentityReaderV2"
+  output_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Igamma"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Igammac"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Imag"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "ImageSummary"
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "max_images"
+    type: "int"
+    default_value {
+      i: 3
+    }
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "bad_color"
+    type: "tensor"
+    default_value {
+      tensor {
+        dtype: DT_UINT8
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        int_val: 255
+        int_val: 0
+        int_val: 0
+        int_val: 255
+      }
+    }
+  }
+}
+op {
+  name: "ImageSummary"
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "max_images"
+    type: "int"
+    default_value {
+      i: 3
+    }
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "bad_color"
+    type: "tensor"
+    default_value {
+      tensor {
+        dtype: DT_UINT8
+        tensor_shape {
+          dim {
+            size: 4
+          }
+        }
+        int_val: 255
+        int_val: 0
+        int_val: 0
+        int_val: 255
+      }
+    }
+  }
+}
+op {
+  name: "ImmutableConst"
+  output_arg {
+    name: "tensor"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "memory_region_name"
+    type: "string"
+  }
+}
+op {
+  name: "InTopK"
+  input_arg {
+    name: "predictions"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "targets"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "precision"
+    type: DT_BOOL
+  }
+  attr {
+    name: "k"
+    type: "int"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "InTopKV2"
+  input_arg {
+    name: "predictions"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "targets"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "k"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "precision"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "InitializeTable"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tkey"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tval"
+  }
+  attr {
+    name: "Tkey"
+    type: "type"
+  }
+  attr {
+    name: "Tval"
+    type: "type"
+  }
+}
+op {
+  name: "InitializeTableFromTextFile"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  attr {
+    name: "key_index"
+    type: "int"
+    has_minimum: true
+    minimum: -2
+  }
+  attr {
+    name: "value_index"
+    type: "int"
+    has_minimum: true
+    minimum: -2
+  }
+  attr {
+    name: "vocab_size"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "delimiter"
+    type: "string"
+    default_value {
+      s: "\t"
+    }
+  }
+}
+op {
+  name: "InitializeTableFromTextFileV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  attr {
+    name: "key_index"
+    type: "int"
+    has_minimum: true
+    minimum: -2
+  }
+  attr {
+    name: "value_index"
+    type: "int"
+    has_minimum: true
+    minimum: -2
+  }
+  attr {
+    name: "vocab_size"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "delimiter"
+    type: "string"
+    default_value {
+      s: "\t"
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "InitializeTableV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tkey"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tval"
+  }
+  attr {
+    name: "Tkey"
+    type: "type"
+  }
+  attr {
+    name: "Tval"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "InterleaveDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "InterleaveDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "Inv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 17
+  }
+}
+op {
+  name: "Inv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 17
+  }
+}
+op {
+  name: "Inv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Inv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 17
+  }
+}
+op {
+  name: "Inv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "InvGrad"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 17
+  }
+}
+op {
+  name: "InvGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 17
+  }
+}
+op {
+  name: "InvGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 17
+  }
+}
+op {
+  name: "InvGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "InvGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 17
+  }
+}
+op {
+  name: "InvGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Invert"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+}
+op {
+  name: "Invert"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "InvertPermutation"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "IsFinite"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "IsFinite"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "IsInf"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "IsInf"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "IsNan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "IsNan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "IsVariableInitialized"
+  input_arg {
+    name: "ref"
+    type_attr: "dtype"
+    is_ref: true
+  }
+  output_arg {
+    name: "is_initialized"
+    type: DT_BOOL
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "Iterator"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+  }
+  attr {
+    name: "container"
+    type: "string"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "IteratorFromStringHandle"
+  input_arg {
+    name: "string_handle"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "resource_handle"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "IteratorFromStringHandle"
+  input_arg {
+    name: "string_handle"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "resource_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  is_stateful: true
+}
+op {
+  name: "IteratorGetNext"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "IteratorGetNextSync"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "IteratorSetStatsAggregator"
+  input_arg {
+    name: "iterator_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "stats_aggregator_handle"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "IteratorToStringHandle"
+  input_arg {
+    name: "resource_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "string_handle"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "L2Loss"
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "L2Loss"
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "L2Loss"
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "LMDBReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "LRN"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "depth_radius"
+    type: "int"
+    default_value {
+      i: 5
+    }
+  }
+  attr {
+    name: "bias"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "beta"
+    type: "float"
+    default_value {
+      f: 0.5
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "LRN"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "depth_radius"
+    type: "int"
+    default_value {
+      i: 5
+    }
+  }
+  attr {
+    name: "bias"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "beta"
+    type: "float"
+    default_value {
+      f: 0.5
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "LRNGrad"
+  input_arg {
+    name: "input_grads"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_image"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "depth_radius"
+    type: "int"
+    default_value {
+      i: 5
+    }
+  }
+  attr {
+    name: "bias"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "beta"
+    type: "float"
+    default_value {
+      f: 0.5
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "LRNGrad"
+  input_arg {
+    name: "input_grads"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_image"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "depth_radius"
+    type: "int"
+    default_value {
+      i: 5
+    }
+  }
+  attr {
+    name: "bias"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "beta"
+    type: "float"
+    default_value {
+      f: 0.5
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "LatencyStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "LearnedUnigramCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "LearnedUnigramCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "LeftShift"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Less"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "Less"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Less"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "Less"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "LessEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "LessEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "LessEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "LessEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Lgamma"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Lgamma"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "LinSpace"
+  input_arg {
+    name: "start"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "stop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "LinSpace"
+  input_arg {
+    name: "start"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "stop"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ListDiff"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "idx"
+    type_attr: "out_idx"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_idx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "LoadAndRemapMatrix"
+  input_arg {
+    name: "ckpt_path"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "old_tensor_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "row_remapping"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "col_remapping"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "initializing_values"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_matrix"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_rows"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_cols"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "max_rows_in_memory"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Log"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Log"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Log1p"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Log1p"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "LogMatrixDeterminant"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "sign"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "log_abs_determinant"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "LogSoftmax"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "logsoftmax"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "LogSoftmax"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "logsoftmax"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "LogUniformCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "LogUniformCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "LogicalAnd"
+  input_arg {
+    name: "x"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  is_commutative: true
+}
+op {
+  name: "LogicalNot"
+  input_arg {
+    name: "x"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+}
+op {
+  name: "LogicalOr"
+  input_arg {
+    name: "x"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  is_commutative: true
+}
+op {
+  name: "LookupTableExport"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "keys"
+    type_attr: "Tkeys"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "Tvalues"
+  }
+  attr {
+    name: "Tkeys"
+    type: "type"
+  }
+  attr {
+    name: "Tvalues"
+    type: "type"
+  }
+}
+op {
+  name: "LookupTableExportV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "keys"
+    type_attr: "Tkeys"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "Tvalues"
+  }
+  attr {
+    name: "Tkeys"
+    type: "type"
+  }
+  attr {
+    name: "Tvalues"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableFind"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "default_value"
+    type_attr: "Tout"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+}
+op {
+  name: "LookupTableFindV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "default_value"
+    type_attr: "Tout"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableImport"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+}
+op {
+  name: "LookupTableImportV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableInsert"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+}
+op {
+  name: "LookupTableInsertV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableSize"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT64
+  }
+}
+op {
+  name: "LookupTableSizeV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT64
+  }
+  is_stateful: true
+}
+op {
+  name: "LoopCond"
+  input_arg {
+    name: "input"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "output"
+    type: DT_BOOL
+  }
+}
+op {
+  name: "MakeIterator"
+  input_arg {
+    name: "dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "MapAndBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_batches"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "MapClear"
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "MapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "MapIncompleteSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapPeek"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapStage"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_list_attr: "fake_dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "fake_dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapUnstage"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapUnstageNoKey"
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MatMul"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatMul"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatchingFiles"
+  input_arg {
+    name: "pattern"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+}
+op {
+  name: "MatrixBandPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_lower"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_upper"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "band"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "MatrixBandPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_lower"
+    type_attr: "Tindex"
+  }
+  input_arg {
+    name: "num_upper"
+    type_attr: "Tindex"
+  }
+  output_arg {
+    name: "band"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindex"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "MatrixDeterminant"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "MatrixDeterminant"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixDiag"
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "MatrixDiagPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "MatrixExponential"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixInverse"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MatrixInverse"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixLogarithm"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixSetDiag"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "MatrixSolve"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixSolveLs"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_regularizer"
+    type: DT_DOUBLE
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "MatrixSolveLs"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_regularizer"
+    type: DT_DOUBLE
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "MatrixTriangularSolve"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "lower"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MatrixTriangularSolve"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "lower"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Max"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Max"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Max"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Max"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_QINT8
+      }
+    }
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_QINT8
+      }
+    }
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3DGrad"
+  input_arg {
+    name: "orig_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "orig_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3DGrad"
+  input_arg {
+    name: "orig_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "orig_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3DGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "TInput"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "TInput"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "TInput"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3DGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "TInput"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "TInput"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "TInput"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3DGradGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGradV2"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGradV2"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGradV2"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGradV2"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGradWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGradWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGradWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradGradWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradV2"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradV2"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradV2"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradV2"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolGradWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_QINT8
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_QINT8
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "MaxPoolWithArgmax"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Maximum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Maximum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Mean"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Mean"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Mean"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Mean"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Merge"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "value_index"
+    type: DT_INT32
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "MergeSummary"
+  input_arg {
+    name: "inputs"
+    type: DT_STRING
+    number_attr: "N"
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "MergeV2Checkpoints"
+  input_arg {
+    name: "checkpoint_prefixes"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "destination_prefix"
+    type: DT_STRING
+  }
+  attr {
+    name: "delete_old_dirs"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "MergeV2Checkpoints"
+  input_arg {
+    name: "checkpoint_prefixes"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "destination_prefix"
+    type: DT_STRING
+  }
+  attr {
+    name: "delete_old_dirs"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Mfcc"
+  input_arg {
+    name: "spectrogram"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sample_rate"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "upper_frequency_limit"
+    type: "float"
+    default_value {
+      f: 4000
+    }
+  }
+  attr {
+    name: "lower_frequency_limit"
+    type: "float"
+    default_value {
+      f: 20
+    }
+  }
+  attr {
+    name: "filterbank_channel_count"
+    type: "int"
+    default_value {
+      i: 40
+    }
+  }
+  attr {
+    name: "dct_coefficient_count"
+    type: "int"
+    default_value {
+      i: 13
+    }
+  }
+}
+op {
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Minimum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Minimum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "MirrorPad"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    allowed_values {
+      list {
+        s: "REFLECT"
+        s: "SYMMETRIC"
+      }
+    }
+  }
+}
+op {
+  name: "MirrorPadGrad"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    allowed_values {
+      list {
+        s: "REFLECT"
+        s: "SYMMETRIC"
+      }
+    }
+  }
+}
+op {
+  name: "Mod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Mod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Mul"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Mul"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "Multinomial"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_samples"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Multinomial"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_samples"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Multinomial"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_samples"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "output_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Multinomial"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_samples"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "output_dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "output_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableDenseHashTable"
+  input_arg {
+    name: "empty_key"
+    type_attr: "key_dtype"
+  }
+  output_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  attr {
+    name: "initial_num_buckets"
+    type: "int"
+    default_value {
+      i: 131072
+    }
+  }
+  attr {
+    name: "max_load_factor"
+    type: "float"
+    default_value {
+      f: 0.8
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableDenseHashTableV2"
+  input_arg {
+    name: "empty_key"
+    type_attr: "key_dtype"
+  }
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  attr {
+    name: "initial_num_buckets"
+    type: "int"
+    default_value {
+      i: 131072
+    }
+  }
+  attr {
+    name: "max_load_factor"
+    type: "float"
+    default_value {
+      f: 0.8
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableHashTable"
+  output_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableHashTableOfTensors"
+  output_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableHashTableOfTensorsV2"
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableHashTableV2"
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "Neg"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Neg"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "NegTrain"
+  input_arg {
+    name: "w_in"
+    type: DT_FLOAT
+    is_ref: true
+  }
+  input_arg {
+    name: "w_out"
+    type: DT_FLOAT
+    is_ref: true
+  }
+  input_arg {
+    name: "examples"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "labels"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "lr"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "vocab_count"
+    type: "list(int)"
+  }
+  attr {
+    name: "num_negative_samples"
+    type: "int"
+  }
+  deprecation {
+    version: 19
+  }
+  is_stateful: true
+}
+op {
+  name: "NextIteration"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "NoOp"
+}
+op {
+  name: "NonMaxSuppression"
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "scores"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_output_size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
+  }
+  attr {
+    name: "iou_threshold"
+    type: "float"
+    default_value {
+      f: 0.5
+    }
+  }
+}
+op {
+  name: "NonMaxSuppressionV2"
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "scores"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_output_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
+  }
+}
+op {
+  name: "NotEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "NotEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "NthElement"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "n"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "NthElement"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "n"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "NthElement"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "n"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "OneHot"
+  input_arg {
+    name: "indices"
+    type_attr: "TI"
+  }
+  input_arg {
+    name: "depth"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "on_value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "off_value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "axis"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "TI"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "OneShotIterator"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "dataset_factory"
+    type: "func"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OnesLike"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "OnesLike"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_BOOL
+      }
+    }
+  }
+}
+op {
+  name: "OrderedMapClear"
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapIncompleteSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapPeek"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapStage"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_list_attr: "fake_dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "fake_dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapUnstage"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapUnstageNoKey"
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Pack"
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "axis"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "Pad"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "PadV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  input_arg {
+    name: "constant_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "PaddedBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "PaddedBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "PaddingFIFOQueue"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PaddingFIFOQueueV2"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ParallelConcat"
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+}
+op {
+  name: "ParallelDynamicStitch"
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "data"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "merged"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "ParallelInterleaveDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sloppy"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "buffer_output_elements"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "prefetch_input_elements"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ParameterizedTruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "means"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "stdevs"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "minvals"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "maxvals"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ParameterizedTruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "means"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "stdevs"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "minvals"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "maxvals"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ParseExample"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "sparse_keys"
+    type: DT_STRING
+    number_attr: "Nsparse"
+  }
+  input_arg {
+    name: "dense_keys"
+    type: DT_STRING
+    number_attr: "Ndense"
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+    number_attr: "Nsparse"
+  }
+  output_arg {
+    name: "sparse_values"
+    type_list_attr: "sparse_types"
+  }
+  output_arg {
+    name: "sparse_shapes"
+    type: DT_INT64
+    number_attr: "Nsparse"
+  }
+  output_arg {
+    name: "dense_values"
+    type_list_attr: "Tdense"
+  }
+  attr {
+    name: "Nsparse"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "Ndense"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+}
+op {
+  name: "ParseSingleExample"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+    number_attr: "num_sparse"
+  }
+  output_arg {
+    name: "sparse_values"
+    type_list_attr: "sparse_types"
+  }
+  output_arg {
+    name: "sparse_shapes"
+    type: DT_INT64
+    number_attr: "num_sparse"
+  }
+  output_arg {
+    name: "dense_values"
+    type_list_attr: "Tdense"
+  }
+  attr {
+    name: "num_sparse"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+}
+op {
+  name: "ParseSingleSequenceExample"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "feature_list_dense_missing_assumed_empty"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "context_sparse_keys"
+    type: DT_STRING
+    number_attr: "Ncontext_sparse"
+  }
+  input_arg {
+    name: "context_dense_keys"
+    type: DT_STRING
+    number_attr: "Ncontext_dense"
+  }
+  input_arg {
+    name: "feature_list_sparse_keys"
+    type: DT_STRING
+    number_attr: "Nfeature_list_sparse"
+  }
+  input_arg {
+    name: "feature_list_dense_keys"
+    type: DT_STRING
+    number_attr: "Nfeature_list_dense"
+  }
+  input_arg {
+    name: "context_dense_defaults"
+    type_list_attr: "Tcontext_dense"
+  }
+  input_arg {
+    name: "debug_name"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "context_sparse_indices"
+    type: DT_INT64
+    number_attr: "Ncontext_sparse"
+  }
+  output_arg {
+    name: "context_sparse_values"
+    type_list_attr: "context_sparse_types"
+  }
+  output_arg {
+    name: "context_sparse_shapes"
+    type: DT_INT64
+    number_attr: "Ncontext_sparse"
+  }
+  output_arg {
+    name: "context_dense_values"
+    type_list_attr: "Tcontext_dense"
+  }
+  output_arg {
+    name: "feature_list_sparse_indices"
+    type: DT_INT64
+    number_attr: "Nfeature_list_sparse"
+  }
+  output_arg {
+    name: "feature_list_sparse_values"
+    type_list_attr: "feature_list_sparse_types"
+  }
+  output_arg {
+    name: "feature_list_sparse_shapes"
+    type: DT_INT64
+    number_attr: "Nfeature_list_sparse"
+  }
+  output_arg {
+    name: "feature_list_dense_values"
+    type_list_attr: "feature_list_dense_types"
+  }
+  attr {
+    name: "Ncontext_sparse"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Ncontext_dense"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Nfeature_list_sparse"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Nfeature_list_dense"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "context_sparse_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tcontext_dense"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "feature_list_dense_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "context_dense_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "feature_list_sparse_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "feature_list_dense_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+}
+op {
+  name: "ParseTensor"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+  }
+}
+op {
+  name: "Placeholder"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+}
+op {
+  name: "Placeholder"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+}
+op {
+  name: "PlaceholderV2"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+}
+op {
+  name: "PlaceholderV2"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  deprecation {
+    version: 23
+  }
+}
+op {
+  name: "PlaceholderWithDefault"
+  input_arg {
+    name: "input"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+}
+op {
+  name: "Polygamma"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "PopulationCount"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_UINT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+}
+op {
+  name: "PopulationCount"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_UINT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Pow"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Pow"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "PrefetchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "PrefetchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "PrependFromQueueAndPaddedBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "PreventGradient"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "message"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "Print"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "U"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "U"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "message"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "first_n"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "summarize"
+    type: "int"
+    default_value {
+      i: 3
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Print"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "U"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "U"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "message"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "first_n"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "summarize"
+    type: "int"
+    default_value {
+      i: 3
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PriorityQueue"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PriorityQueueV2"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Prod"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Prod"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Prod"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Prod"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "PyFunc"
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "token"
+    type: "string"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  is_stateful: true
+}
+op {
+  name: "PyFuncStateless"
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "token"
+    type: "string"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+}
+op {
+  name: "Qr"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "q"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "r"
+    type_attr: "T"
+  }
+  attr {
+    name: "full_matrices"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 21
   }
+}
+op {
+  name: "QuantizeAndDequantize"
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "predicate"
-    type: "func"
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 22
+  }
+}
+op {
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 22
+  }
+}
+op {
+  name: "QuantizeAndDequantizeV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantizeV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantizeV3"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_bits"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantizeV3"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_bits"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeDownAndShrinkRange"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeDownAndShrinkRange"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeV2"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "mode"
+    type: "string"
+    default_value {
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "FilterDataset"
+  name: "QuantizeV2"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "input"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "output"
+    type_attr: "T"
   }
-  attr {
-    name: "predicate"
-    type: "func"
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "mode"
+    type: "string"
+    default_value {
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+        s: "SCALED"
+      }
+    }
   }
 }
 op {
-  name: "FixedLengthRecordDataset"
-  input_arg {
-    name: "filenames"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "header_bytes"
-    type: DT_INT64
-  }
+  name: "QuantizeV2"
   input_arg {
-    name: "record_bytes"
-    type: DT_INT64
+    name: "input"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "footer_bytes"
-    type: DT_INT64
+    name: "min_range"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "buffer_size"
-    type: DT_INT64
+    name: "max_range"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "output"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "FixedLengthRecordReader"
   output_arg {
-    name: "reader_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  attr {
-    name: "header_bytes"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "output_min"
+    type: DT_FLOAT
   }
-  attr {
-    name: "record_bytes"
-    type: "int"
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "footer_bytes"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "container"
+    name: "mode"
     type: "string"
     default_value {
-      s: ""
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+        s: "SCALED"
+      }
     }
   }
   attr {
-    name: "shared_name"
+    name: "round_mode"
     type: "string"
     default_value {
-      s: ""
+      s: "HALF_AWAY_FROM_ZERO"
+    }
+    allowed_values {
+      list {
+        s: "HALF_AWAY_FROM_ZERO"
+        s: "HALF_TO_EVEN"
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "FixedLengthRecordReader"
-  output_arg {
-    name: "reader_handle"
-    type: DT_STRING
-    is_ref: true
+  name: "QuantizeV2"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
   }
-  attr {
-    name: "header_bytes"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
   }
-  attr {
-    name: "record_bytes"
-    type: "int"
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
   }
-  attr {
-    name: "footer_bytes"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "hop_bytes"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "container"
+    name: "mode"
     type: "string"
     default_value {
-      s: ""
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+        s: "SCALED"
+      }
     }
   }
   attr {
-    name: "shared_name"
+    name: "round_mode"
     type: "string"
     default_value {
-      s: ""
+      s: "HALF_AWAY_FROM_ZERO"
+    }
+    allowed_values {
+      list {
+        s: "HALF_AWAY_FROM_ZERO"
+        s: "HALF_TO_EVEN"
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "FixedLengthRecordReaderV2"
+  name: "QuantizedAdd"
+  input_arg {
+    name: "x"
+    type_attr: "T1"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_y"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_y"
+    type: DT_FLOAT
+  }
   output_arg {
-    name: "reader_handle"
-    type: DT_RESOURCE
+    name: "z"
+    type_attr: "Toutput"
   }
-  attr {
-    name: "header_bytes"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  output_arg {
+    name: "min_z"
+    type: DT_FLOAT
   }
-  attr {
-    name: "record_bytes"
-    type: "int"
+  output_arg {
+    name: "max_z"
+    type: DT_FLOAT
   }
   attr {
-    name: "footer_bytes"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "Toutput"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
-  is_stateful: true
+  is_commutative: true
 }
 op {
-  name: "FixedLengthRecordReaderV2"
+  name: "QuantizedAdd"
+  input_arg {
+    name: "x"
+    type_attr: "T1"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_y"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_y"
+    type: DT_FLOAT
+  }
   output_arg {
-    name: "reader_handle"
-    type: DT_RESOURCE
+    name: "z"
+    type_attr: "Toutput"
   }
-  attr {
-    name: "header_bytes"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  output_arg {
+    name: "min_z"
+    type: DT_FLOAT
   }
-  attr {
-    name: "record_bytes"
-    type: "int"
+  output_arg {
+    name: "max_z"
+    type: DT_FLOAT
   }
   attr {
-    name: "footer_bytes"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "hop_bytes"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "Toutput"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_QINT32
     }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
-  is_stateful: true
+  is_commutative: true
 }
 op {
-  name: "FixedLengthRecordReaderV2"
-  output_arg {
-    name: "reader_handle"
-    type: DT_RESOURCE
+  name: "QuantizedAvgPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "header_bytes"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
   }
-  attr {
-    name: "record_bytes"
-    type: "int"
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
   }
-  attr {
-    name: "footer_bytes"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
-    name: "hop_bytes"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "ksize"
+    type: "list(int)"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "encoding"
+    name: "padding"
     type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "FixedUnigramCandidateSampler"
+  name: "QuantizedAvgPool"
   input_arg {
-    name: "true_classes"
-    type: DT_INT64
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "sampled_candidates"
-    type: DT_INT64
+    name: "output"
+    type_attr: "T"
   }
   output_arg {
-    name: "true_expected_count"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "sampled_expected_count"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
-    name: "num_true"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "num_sampled"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
   }
   attr {
-    name: "unique"
-    type: "bool"
+    name: "ksize"
+    type: "list(int)"
   }
   attr {
-    name: "range_max"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "vocab_file"
+    name: "padding"
     type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "distortion"
-    type: "float"
-    default_value {
-      f: 1
-    }
-  }
-  attr {
-    name: "num_reserved_ids"
-    type: "int"
-    default_value {
-      i: 0
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
-  attr {
-    name: "num_shards"
-    type: "int"
-    default_value {
-      i: 1
-    }
-    has_minimum: true
-    minimum: 1
+}
+op {
+  name: "QuantizedBatchNormWithGlobalNormalization"
+  input_arg {
+    name: "t"
+    type_attr: "Tinput"
   }
-  attr {
-    name: "shard"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "t_min"
+    type: DT_FLOAT
   }
-  attr {
-    name: "unigrams"
-    type: "list(float)"
-    default_value {
-      list {
-      }
-    }
+  input_arg {
+    name: "t_max"
+    type: DT_FLOAT
   }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "m"
+    type_attr: "Tinput"
   }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "m_min"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "FixedUnigramCandidateSampler"
   input_arg {
-    name: "true_classes"
-    type: DT_INT64
+    name: "m_max"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "sampled_candidates"
-    type: DT_INT64
+  input_arg {
+    name: "v"
+    type_attr: "Tinput"
   }
-  output_arg {
-    name: "true_expected_count"
+  input_arg {
+    name: "v_min"
     type: DT_FLOAT
   }
-  output_arg {
-    name: "sampled_expected_count"
+  input_arg {
+    name: "v_max"
     type: DT_FLOAT
   }
-  attr {
-    name: "num_true"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "beta"
+    type_attr: "Tinput"
   }
-  attr {
-    name: "num_sampled"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "beta_min"
+    type: DT_FLOAT
   }
-  attr {
-    name: "unique"
-    type: "bool"
+  input_arg {
+    name: "beta_max"
+    type: DT_FLOAT
   }
-  attr {
-    name: "range_max"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "gamma"
+    type_attr: "Tinput"
   }
-  attr {
-    name: "vocab_file"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "gamma_min"
+    type: DT_FLOAT
   }
-  attr {
-    name: "distortion"
-    type: "float"
-    default_value {
-      f: 1
-    }
+  input_arg {
+    name: "gamma_max"
+    type: DT_FLOAT
   }
-  attr {
-    name: "num_reserved_ids"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  output_arg {
+    name: "result"
+    type_attr: "out_type"
   }
-  attr {
-    name: "num_shards"
-    type: "int"
-    default_value {
-      i: 1
-    }
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "result_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "result_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "shard"
-    type: "int"
-    default_value {
-      i: 0
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
-    has_minimum: true
   }
   attr {
-    name: "unigrams"
-    type: "list(float)"
-    default_value {
+    name: "out_type"
+    type: "type"
+    allowed_values {
       list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
   attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "variance_epsilon"
+    type: "float"
   }
   attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "scale_after_normalization"
+    type: "bool"
   }
-  is_stateful: true
 }
 op {
-  name: "FlatMapDataset"
+  name: "QuantizedBatchNormWithGlobalNormalization"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "t"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "t_min"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
+  input_arg {
+    name: "t_max"
+    type: DT_FLOAT
   }
-  attr {
-    name: "f"
-    type: "func"
+  input_arg {
+    name: "m"
+    type_attr: "Tinput"
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "m_min"
+    type: DT_FLOAT
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "m_max"
+    type: DT_FLOAT
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "v"
+    type_attr: "Tinput"
   }
-  is_stateful: true
-}
-op {
-  name: "FlatMapDataset"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "v_min"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "v_max"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
+  input_arg {
+    name: "beta"
+    type_attr: "Tinput"
   }
-  attr {
-    name: "f"
-    type: "func"
+  input_arg {
+    name: "beta_min"
+    type: DT_FLOAT
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "beta_max"
+    type: DT_FLOAT
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "gamma"
+    type_attr: "Tinput"
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "gamma_min"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "Floor"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "gamma_max"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "y"
-    type_attr: "T"
+    name: "result"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "result_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "result_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
-}
-op {
-  name: "FloorDiv"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "out_type"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
 }
 op {
-  name: "FloorMod"
+  name: "QuantizedBiasAdd"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "input"
+    type_attr: "T1"
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "bias"
+    type_attr: "T2"
   }
-  output_arg {
-    name: "z"
-    type_attr: "T"
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "FractionalAvgPool"
   input_arg {
-    name: "value"
-    type_attr: "T"
+    name: "min_bias"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  input_arg {
+    name: "max_bias"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "row_pooling_sequence"
-    type: DT_INT64
+    name: "output"
+    type_attr: "out_type"
   }
   output_arg {
-    name: "col_pooling_sequence"
-    type: DT_INT64
-  }
-  attr {
-    name: "pooling_ratio"
-    type: "list(float)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "pseudo_random"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "overlapping"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "min_out"
+    type: DT_FLOAT
   }
-  attr {
-    name: "deterministic"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  output_arg {
+    name: "max_out"
+    type: DT_FLOAT
   }
   attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "T"
+    name: "out_type"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
 }
 op {
-  name: "FractionalAvgPoolGrad"
+  name: "QuantizedBiasAdd"
   input_arg {
-    name: "orig_input_tensor_shape"
-    type: DT_INT64
+    name: "input"
+    type_attr: "T1"
   }
   input_arg {
-    name: "out_backprop"
-    type_attr: "T"
+    name: "bias"
+    type_attr: "T2"
   }
   input_arg {
-    name: "row_pooling_sequence"
-    type: DT_INT64
+    name: "min_input"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "col_pooling_sequence"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "overlapping"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "max_input"
+    type: DT_FLOAT
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  input_arg {
+    name: "min_bias"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "FractionalMaxPool"
   input_arg {
-    name: "value"
-    type_attr: "T"
+    name: "max_bias"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
   }
   output_arg {
-    name: "row_pooling_sequence"
-    type: DT_INT64
+    name: "min_out"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "col_pooling_sequence"
-    type: DT_INT64
-  }
-  attr {
-    name: "pooling_ratio"
-    type: "list(float)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "pseudo_random"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "overlapping"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "deterministic"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "max_out"
+    type: DT_FLOAT
   }
   attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "T"
+    name: "out_type"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
 }
 op {
-  name: "FractionalMaxPoolGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
+  name: "QuantizedConcat"
   input_arg {
-    name: "orig_output"
-    type_attr: "T"
+    name: "concat_dim"
+    type: DT_INT32
   }
   input_arg {
-    name: "out_backprop"
+    name: "values"
     type_attr: "T"
+    number_attr: "N"
   }
   input_arg {
-    name: "row_pooling_sequence"
-    type: DT_INT64
+    name: "input_mins"
+    type: DT_FLOAT
+    number_attr: "N"
   }
   input_arg {
-    name: "col_pooling_sequence"
-    type: DT_INT64
+    name: "input_maxes"
+    type: DT_FLOAT
+    number_attr: "N"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
   attr {
-    name: "overlapping"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
   }
 }
 op {
-  name: "FusedBatchNorm"
+  name: "QuantizedConv2D"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "input"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "scale"
-    type_attr: "T"
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "offset"
-    type_attr: "T"
+    name: "min_input"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "mean"
-    type_attr: "T"
+    name: "max_input"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "variance"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+    name: "min_filter"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "batch_mean"
-    type_attr: "T"
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "batch_variance"
-    type_attr: "T"
+    name: "output"
+    type_attr: "out_type"
   }
   output_arg {
-    name: "reserve_space_1"
-    type_attr: "T"
+    name: "min_output"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "reserve_space_2"
-    type_attr: "T"
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
   attr {
-    name: "epsilon"
-    type: "float"
-    default_value {
-      f: 0.0001
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "out_type"
+    type: "type"
     default_value {
-      s: "NHWC"
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "is_training"
-    type: "bool"
-    default_value {
-      b: true
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
 }
 op {
-  name: "FusedBatchNormGrad"
+  name: "QuantizedConv2D"
   input_arg {
-    name: "y_backprop"
-    type_attr: "T"
+    name: "input"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "scale"
-    type_attr: "T"
+    name: "min_input"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "reserve_space_1"
-    type_attr: "T"
+    name: "max_input"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "reserve_space_2"
-    type_attr: "T"
+    name: "min_filter"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "x_backprop"
-    type_attr: "T"
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "scale_backprop"
-    type_attr: "T"
+    name: "output"
+    type_attr: "out_type"
   }
   output_arg {
-    name: "offset_backprop"
-    type_attr: "T"
+    name: "min_output"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "reserve_space_3"
-    type_attr: "T"
+    name: "max_output"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "reserve_space_4"
-    type_attr: "T"
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
   }
   attr {
-    name: "T"
+    name: "Tfilter"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
   attr {
-    name: "epsilon"
-    type: "float"
+    name: "out_type"
+    type: "type"
     default_value {
-      f: 0.0001
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "data_format"
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
     type: "string"
-    default_value {
-      s: "NHWC"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "is_training"
-    type: "bool"
+    name: "dilations"
+    type: "list(int)"
     default_value {
-      b: true
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
     }
   }
 }
 op {
-  name: "FusedBatchNormGradV2"
+  name: "QuantizedConv2D"
   input_arg {
-    name: "y_backprop"
-    type_attr: "T"
+    name: "input"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "scale"
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "reserve_space_1"
-    type_attr: "U"
+    name: "max_input"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "reserve_space_2"
-    type_attr: "U"
-  }
-  output_arg {
-    name: "x_backprop"
-    type_attr: "T"
+    name: "min_filter"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "scale_backprop"
-    type_attr: "U"
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "offset_backprop"
-    type_attr: "U"
+    name: "output"
+    type_attr: "out_type"
   }
   output_arg {
-    name: "reserve_space_3"
-    type_attr: "U"
+    name: "min_output"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "reserve_space_4"
-    type_attr: "U"
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
   attr {
-    name: "U"
+    name: "Tfilter"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
   attr {
-    name: "epsilon"
-    type: "float"
+    name: "out_type"
+    type: "type"
     default_value {
-      f: 0.0001
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "data_format"
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
     type: "string"
-    default_value {
-      s: "NHWC"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "is_training"
-    type: "bool"
+    name: "dilations"
+    type: "list(int)"
     default_value {
-      b: true
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
     }
   }
 }
 op {
-  name: "FusedBatchNormV2"
+  name: "QuantizedInstanceNorm"
   input_arg {
     name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "scale"
-    type_attr: "U"
-  }
-  input_arg {
-    name: "offset"
-    type_attr: "U"
-  }
-  input_arg {
-    name: "mean"
-    type_attr: "U"
+    name: "x_min"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "variance"
-    type_attr: "U"
+    name: "x_max"
+    type: DT_FLOAT
   }
   output_arg {
     name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "batch_mean"
-    type_attr: "U"
-  }
-  output_arg {
-    name: "batch_variance"
-    type_attr: "U"
-  }
-  output_arg {
-    name: "reserve_space_1"
-    type_attr: "U"
+    name: "y_min"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "reserve_space_2"
-    type_attr: "U"
+    name: "y_max"
+    type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
   attr {
-    name: "U"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-      }
+    name: "output_range_given"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
-    name: "epsilon"
+    name: "given_y_min"
     type: "float"
     default_value {
-      f: 0.0001
+      f: 0
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "given_y_max"
+    type: "float"
     default_value {
-      s: "NHWC"
+      f: 0
     }
   }
   attr {
-    name: "is_training"
-    type: "bool"
+    name: "variance_epsilon"
+    type: "float"
     default_value {
-      b: true
+      f: 1e-05
+    }
+  }
+  attr {
+    name: "min_separation"
+    type: "float"
+    default_value {
+      f: 0.001
     }
   }
 }
 op {
-  name: "FusedPadConv2D"
+  name: "QuantizedInstanceNorm"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "paddings"
-    type: DT_INT32
+    name: "x_min"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "filter"
-    type_attr: "T"
+    name: "x_max"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
+  output_arg {
+    name: "y_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "y_max"
+    type: DT_FLOAT
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
   attr {
-    name: "mode"
-    type: "string"
-    allowed_values {
-      list {
-        s: "REFLECT"
-        s: "SYMMETRIC"
-      }
+    name: "output_range_given"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
+    name: "given_y_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    name: "given_y_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+    default_value {
+      f: 1e-05
+    }
+  }
+  attr {
+    name: "min_separation"
+    type: "float"
+    default_value {
+      f: 0.001
     }
   }
 }
 op {
-  name: "FusedResizeAndPadConv2D"
+  name: "QuantizedMatMul"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "a"
+    type_attr: "T1"
   }
   input_arg {
-    name: "size"
-    type: DT_INT32
+    name: "b"
+    type_attr: "T2"
   }
   input_arg {
-    name: "paddings"
-    type: DT_INT32
+    name: "min_a"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "filter"
-    type_attr: "T"
+    name: "max_a"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_b"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_b"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "out"
+    type_attr: "Toutput"
+  }
+  output_arg {
+    name: "min_out"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_out"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "T1"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
   attr {
-    name: "resize_align_corners"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "mode"
-    type: "string"
+    name: "T2"
+    type: "type"
     allowed_values {
       list {
-        s: "REFLECT"
-        s: "SYMMETRIC"
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-  }
-  attr {
-    name: "padding"
-    type: "string"
+    name: "Toutput"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
-}
-op {
-  name: "Gather"
-  input_arg {
-    name: "params"
-    type_attr: "Tparams"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "Tparams"
-  }
   attr {
-    name: "validate_indices"
+    name: "transpose_a"
     type: "bool"
     default_value {
-      b: true
+      b: false
     }
   }
   attr {
-    name: "Tparams"
-    type: "type"
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "Tindices"
+    name: "Tactivation"
     type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
 }
 op {
-  name: "GatherNd"
+  name: "QuantizedMatMul"
   input_arg {
-    name: "params"
-    type_attr: "Tparams"
+    name: "a"
+    type_attr: "T1"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "Tparams"
-  }
-  attr {
-    name: "Tparams"
-    type: "type"
+    name: "b"
+    type_attr: "T2"
   }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  input_arg {
+    name: "min_a"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "GatherV2"
   input_arg {
-    name: "params"
-    type_attr: "Tparams"
+    name: "max_a"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "min_b"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "axis"
-    type_attr: "Taxis"
+    name: "max_b"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output"
-    type_attr: "Tparams"
+    name: "out"
+    type_attr: "Toutput"
   }
-  attr {
-    name: "Tparams"
-    type: "type"
+  output_arg {
+    name: "min_out"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_out"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tindices"
+    name: "T1"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
   attr {
-    name: "Taxis"
+    name: "T2"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
-}
-op {
-  name: "GenerateVocabRemapping"
-  input_arg {
-    name: "new_vocab_file"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "old_vocab_file"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "remapping"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "num_present"
-    type: DT_INT32
-  }
-  attr {
-    name: "new_vocab_offset"
-    type: "int"
-    has_minimum: true
-  }
-  attr {
-    name: "num_new_vocab"
-    type: "int"
-    has_minimum: true
-  }
-}
-op {
-  name: "GenerateVocabRemapping"
-  input_arg {
-    name: "new_vocab_file"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "old_vocab_file"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "remapping"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "num_present"
-    type: DT_INT32
-  }
-  attr {
-    name: "new_vocab_offset"
-    type: "int"
-    has_minimum: true
-  }
   attr {
-    name: "num_new_vocab"
-    type: "int"
-    has_minimum: true
+    name: "Toutput"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
   }
   attr {
-    name: "old_vocab_size"
-    type: "int"
+    name: "transpose_a"
+    type: "bool"
     default_value {
-      i: -1
+      b: false
     }
-    has_minimum: true
-    minimum: -1
-  }
-}
-op {
-  name: "GetSessionHandle"
-  input_arg {
-    name: "value"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_STRING
   }
   attr {
-    name: "T"
-    type: "type"
-  }
-}
-op {
-  name: "GetSessionHandle"
-  input_arg {
-    name: "value"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_STRING
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "T"
+    name: "Tactivation"
     type: "type"
-  }
-  deprecation {
-    version: 23
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
   }
 }
 op {
-  name: "GetSessionHandle"
+  name: "QuantizedMaxPool"
   input_arg {
-    name: "value"
+    name: "input"
     type_attr: "T"
   }
-  output_arg {
-    name: "handle"
-    type: DT_STRING
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-}
-op {
-  name: "GetSessionHandleV2"
   input_arg {
-    name: "value"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "T"
-    type: "type"
+    name: "min_input"
+    type: DT_FLOAT
   }
-  is_stateful: true
-}
-op {
-  name: "GetSessionTensor"
   input_arg {
-    name: "handle"
-    type: DT_STRING
+    name: "max_input"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "value"
-    type_attr: "dtype"
-  }
-  attr {
-    name: "dtype"
-    type: "type"
-  }
-}
-op {
-  name: "Greater"
-  input_arg {
-    name: "x"
+    name: "output"
     type_attr: "T"
   }
-  input_arg {
-    name: "y"
-    type_attr: "T"
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
-}
-op {
-  name: "Greater"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
+  attr {
+    name: "ksize"
+    type: "list(int)"
   }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
+  attr {
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
 }
 op {
-  name: "GreaterEqual"
+  name: "QuantizedMaxPool"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
-}
-op {
-  name: "GreaterEqual"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
+  attr {
+    name: "ksize"
+    type: "list(int)"
   }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
+  attr {
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
 }
 op {
-  name: "GroupByWindowDataset"
+  name: "QuantizedMul"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "x"
+    type_attr: "T1"
   }
   input_arg {
-    name: "key_func_other_arguments"
-    type_list_attr: "Tkey_func_other_arguments"
+    name: "y"
+    type_attr: "T2"
   }
   input_arg {
-    name: "reduce_func_other_arguments"
-    type_list_attr: "Treduce_func_other_arguments"
+    name: "min_x"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "window_size_func_other_arguments"
-    type_list_attr: "Twindow_size_func_other_arguments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "max_x"
+    type: DT_FLOAT
   }
-  attr {
-    name: "key_func"
-    type: "func"
+  input_arg {
+    name: "min_y"
+    type: DT_FLOAT
   }
-  attr {
-    name: "reduce_func"
-    type: "func"
+  input_arg {
+    name: "max_y"
+    type: DT_FLOAT
   }
-  attr {
-    name: "window_size_func"
-    type: "func"
+  output_arg {
+    name: "z"
+    type_attr: "Toutput"
   }
-  attr {
-    name: "Tkey_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
+  output_arg {
+    name: "min_z"
+    type: DT_FLOAT
   }
-  attr {
-    name: "Treduce_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
+  output_arg {
+    name: "max_z"
+    type: DT_FLOAT
   }
   attr {
-    name: "Twindow_size_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "Toutput"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
   }
-  is_stateful: true
+  is_commutative: true
 }
 op {
-  name: "GroupByWindowDataset"
+  name: "QuantizedMul"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "x"
+    type_attr: "T1"
   }
   input_arg {
-    name: "key_func_other_arguments"
-    type_list_attr: "Tkey_func_other_arguments"
+    name: "y"
+    type_attr: "T2"
   }
   input_arg {
-    name: "reduce_func_other_arguments"
-    type_list_attr: "Treduce_func_other_arguments"
+    name: "min_x"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "window_size_func_other_arguments"
-    type_list_attr: "Twindow_size_func_other_arguments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "max_x"
+    type: DT_FLOAT
   }
-  attr {
-    name: "key_func"
-    type: "func"
+  input_arg {
+    name: "min_y"
+    type: DT_FLOAT
   }
-  attr {
-    name: "reduce_func"
-    type: "func"
+  input_arg {
+    name: "max_y"
+    type: DT_FLOAT
   }
-  attr {
-    name: "window_size_func"
-    type: "func"
+  output_arg {
+    name: "z"
+    type_attr: "Toutput"
   }
-  attr {
-    name: "Tkey_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
+  output_arg {
+    name: "min_z"
+    type: DT_FLOAT
   }
-  attr {
-    name: "Treduce_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
+  output_arg {
+    name: "max_z"
+    type: DT_FLOAT
   }
   attr {
-    name: "Twindow_size_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "Toutput"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
   }
+  is_commutative: true
 }
 op {
-  name: "HSVToRGB"
+  name: "QuantizedRelu"
   input_arg {
-    name: "images"
-    type_attr: "T"
+    name: "features"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "min_features"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_features"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "activations"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_activations"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_activations"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
     type: "type"
     default_value {
-      type: DT_FLOAT
+      type: DT_QUINT8
     }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
 }
 op {
-  name: "HashTable"
-  output_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  name: "QuantizedRelu"
+  input_arg {
+    name: "features"
+    type_attr: "Tinput"
   }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "min_features"
+    type: DT_FLOAT
   }
-  attr {
-    name: "use_node_name_sharing"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "max_features"
+    type: DT_FLOAT
   }
-  attr {
-    name: "key_dtype"
-    type: "type"
+  output_arg {
+    name: "activations"
+    type_attr: "out_type"
   }
-  attr {
-    name: "value_dtype"
-    type: "type"
+  output_arg {
+    name: "min_activations"
+    type: DT_FLOAT
   }
-  is_stateful: true
-}
-op {
-  name: "HashTableV2"
   output_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "max_activations"
+    type: DT_FLOAT
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "out_type"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_QUINT8
     }
-  }
-  attr {
-    name: "use_node_name_sharing"
-    type: "bool"
-    default_value {
-      b: false
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
-  attr {
-    name: "key_dtype"
-    type: "type"
-  }
-  attr {
-    name: "value_dtype"
-    type: "type"
-  }
-  is_stateful: true
 }
 op {
-  name: "HistogramFixedWidth"
+  name: "QuantizedRelu6"
   input_arg {
-    name: "values"
-    type_attr: "T"
+    name: "features"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "value_range"
-    type_attr: "T"
+    name: "min_features"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "nbins"
-    type: DT_INT32
+    name: "max_features"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "out"
-    type_attr: "dtype"
+    name: "activations"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_activations"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_activations"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
   attr {
-    name: "dtype"
+    name: "out_type"
     type: "type"
     default_value {
-      type: DT_INT32
+      type: DT_QUINT8
     }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
 }
 op {
-  name: "HistogramSummary"
+  name: "QuantizedRelu6"
   input_arg {
-    name: "tag"
-    type: DT_STRING
+    name: "features"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "values"
-    type_attr: "T"
+    name: "min_features"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_features"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "summary"
-    type: DT_STRING
+    name: "activations"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_activations"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_activations"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
-}
-op {
-  name: "HistogramSummary"
-  input_arg {
-    name: "tag"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "values"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "summary"
-    type: DT_STRING
-  }
   attr {
-    name: "T"
+    name: "out_type"
     type: "type"
     default_value {
-      type: DT_FLOAT
+      type: DT_QUINT8
     }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
 }
 op {
-  name: "IFFT"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
-  }
-}
-op {
-  name: "IFFT2D"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
-  }
-}
-op {
-  name: "IFFT3D"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
-  }
-}
-op {
-  name: "IRFFT"
+  name: "QuantizedReluX"
   input_arg {
-    name: "input"
-    type: DT_COMPLEX64
+    name: "features"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "fft_length"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
+    name: "max_value"
     type: DT_FLOAT
   }
-}
-op {
-  name: "IRFFT2D"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
   input_arg {
-    name: "fft_length"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
+    name: "min_features"
     type: DT_FLOAT
   }
-}
-op {
-  name: "IRFFT3D"
-  input_arg {
-    name: "input"
-    type: DT_COMPLEX64
-  }
   input_arg {
-    name: "fft_length"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
+    name: "max_features"
     type: DT_FLOAT
   }
-}
-op {
-  name: "Identity"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
   output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-}
-op {
-  name: "IdentityN"
-  input_arg {
-    name: "input"
-    type_list_attr: "T"
+    name: "activations"
+    type_attr: "out_type"
   }
   output_arg {
-    name: "output"
-    type_list_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "min_activations"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "IdentityReader"
   output_arg {
-    name: "reader_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "max_activations"
+    type: DT_FLOAT
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "out_type"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "IdentityReaderV2"
-  output_arg {
-    name: "reader_handle"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  name: "QuantizedReluX"
+  input_arg {
+    name: "features"
+    type_attr: "Tinput"
   }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "max_value"
+    type: DT_FLOAT
   }
-  is_stateful: true
-}
-op {
-  name: "Igamma"
   input_arg {
-    name: "a"
-    type_attr: "T"
+    name: "min_features"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "max_features"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "z"
-    type_attr: "T"
+    name: "activations"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_activations"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_activations"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
-}
-op {
-  name: "Igammac"
-  input_arg {
-    name: "a"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "out_type"
     type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
 }
 op {
-  name: "IgnoreErrorsDataset"
+  name: "QuantizedReshape"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "tensor"
+    type_attr: "T"
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "shape"
+    type_attr: "Tshape"
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
   }
-  is_stateful: true
-}
-op {
-  name: "IgnoreErrorsDataset"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "input_max"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "Imag"
-  input_arg {
-    name: "input"
+    name: "output"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "Tout"
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_COMPLEX64
-    }
-    allowed_values {
-      list {
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
   }
   attr {
-    name: "Tout"
+    name: "Tshape"
     type: "type"
     default_value {
-      type: DT_FLOAT
+      type: DT_INT32
     }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "ImageSummary"
+  name: "QuantizedResizeBilinear"
   input_arg {
-    name: "tag"
-    type: DT_STRING
+    name: "images"
+    type_attr: "T"
   }
   input_arg {
-    name: "tensor"
+    name: "size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "resized_images"
     type_attr: "T"
   }
   output_arg {
-    name: "summary"
-    type: DT_STRING
+    name: "out_min"
+    type: DT_FLOAT
   }
-  attr {
-    name: "max_images"
-    type: "int"
-    default_value {
-      i: 3
-    }
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "out_max"
+    type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
-        type: DT_UINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_FLOAT
-        type: DT_HALF
       }
     }
   }
   attr {
-    name: "bad_color"
-    type: "tensor"
+    name: "align_corners"
+    type: "bool"
     default_value {
-      tensor {
-        dtype: DT_UINT8
-        tensor_shape {
-          dim {
-            size: 4
-          }
-        }
-        int_val: 255
-        int_val: 0
-        int_val: 0
-        int_val: 255
-      }
+      b: false
     }
   }
 }
 op {
-  name: "ImageSummary"
-  input_arg {
-    name: "tag"
-    type: DT_STRING
-  }
+  name: "QueueClose"
   input_arg {
-    name: "tensor"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "summary"
+    name: "handle"
     type: DT_STRING
+    is_ref: true
   }
   attr {
-    name: "max_images"
-    type: "int"
+    name: "cancel_pending_enqueues"
+    type: "bool"
     default_value {
-      i: 3
+      b: false
     }
-    has_minimum: true
-    minimum: 1
   }
-  attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_UINT8
-        type: DT_FLOAT
-        type: DT_HALF
-        type: DT_DOUBLE
-      }
-    }
+}
+op {
+  name: "QueueCloseV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
   }
   attr {
-    name: "bad_color"
-    type: "tensor"
+    name: "cancel_pending_enqueues"
+    type: "bool"
     default_value {
-      tensor {
-        dtype: DT_UINT8
-        tensor_shape {
-          dim {
-            size: 4
-          }
-        }
-        int_val: 255
-        int_val: 0
-        int_val: 0
-        int_val: 255
-      }
+      b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "ImmutableConst"
-  output_arg {
-    name: "tensor"
-    type_attr: "dtype"
+  name: "QueueDequeue"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
-  attr {
-    name: "dtype"
-    type: "type"
+  output_arg {
+    name: "components"
+    type_list_attr: "component_types"
   }
   attr {
-    name: "shape"
-    type: "shape"
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "memory_region_name"
-    type: "string"
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
   }
 }
 op {
-  name: "InTopK"
+  name: "QueueDequeueMany"
   input_arg {
-    name: "predictions"
-    type: DT_FLOAT
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "targets"
-    type_attr: "T"
+    name: "n"
+    type: DT_INT32
   }
   output_arg {
-    name: "precision"
-    type: DT_BOOL
+    name: "components"
+    type_list_attr: "component_types"
   }
   attr {
-    name: "k"
-    type: "int"
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "timeout_ms"
+    type: "int"
     default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      i: -1
     }
   }
 }
 op {
-  name: "InTopKV2"
-  input_arg {
-    name: "predictions"
-    type: DT_FLOAT
-  }
+  name: "QueueDequeueManyV2"
   input_arg {
-    name: "targets"
-    type_attr: "T"
+    name: "handle"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "k"
-    type_attr: "T"
+    name: "n"
+    type: DT_INT32
   }
   output_arg {
-    name: "precision"
-    type: DT_BOOL
+    name: "components"
+    type_list_attr: "component_types"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "timeout_ms"
+    type: "int"
     default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      i: -1
     }
   }
+  is_stateful: true
 }
 op {
-  name: "InitializeTable"
+  name: "QueueDequeueUpTo"
   input_arg {
-    name: "table_handle"
+    name: "handle"
     type: DT_STRING
     is_ref: true
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tkey"
+    name: "n"
+    type: DT_INT32
   }
-  input_arg {
-    name: "values"
-    type_attr: "Tval"
+  output_arg {
+    name: "components"
+    type_list_attr: "component_types"
   }
   attr {
-    name: "Tkey"
-    type: "type"
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "Tval"
-    type: "type"
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
   }
 }
 op {
-  name: "InitializeTableFromTextFile"
+  name: "QueueDequeueUpToV2"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "handle"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "filename"
-    type: DT_STRING
+    name: "n"
+    type: DT_INT32
   }
-  attr {
-    name: "key_index"
-    type: "int"
-    has_minimum: true
-    minimum: -2
+  output_arg {
+    name: "components"
+    type_list_attr: "component_types"
   }
   attr {
-    name: "value_index"
-    type: "int"
+    name: "component_types"
+    type: "list(type)"
     has_minimum: true
-    minimum: -2
+    minimum: 1
   }
   attr {
-    name: "vocab_size"
+    name: "timeout_ms"
     type: "int"
     default_value {
       i: -1
     }
-    has_minimum: true
-    minimum: -1
-  }
-  attr {
-    name: "delimiter"
-    type: "string"
-    default_value {
-      s: "\t"
-    }
   }
+  is_stateful: true
 }
 op {
-  name: "InitializeTableFromTextFileV2"
+  name: "QueueDequeueV2"
   input_arg {
-    name: "table_handle"
+    name: "handle"
     type: DT_RESOURCE
   }
-  input_arg {
-    name: "filename"
-    type: DT_STRING
-  }
-  attr {
-    name: "key_index"
-    type: "int"
-    has_minimum: true
-    minimum: -2
+  output_arg {
+    name: "components"
+    type_list_attr: "component_types"
   }
   attr {
-    name: "value_index"
-    type: "int"
+    name: "component_types"
+    type: "list(type)"
     has_minimum: true
-    minimum: -2
+    minimum: 1
   }
   attr {
-    name: "vocab_size"
+    name: "timeout_ms"
     type: "int"
     default_value {
       i: -1
     }
-    has_minimum: true
-    minimum: -1
-  }
-  attr {
-    name: "delimiter"
-    type: "string"
-    default_value {
-      s: "\t"
-    }
   }
   is_stateful: true
 }
 op {
-  name: "InitializeTableV2"
-  input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
-  }
+  name: "QueueEnqueue"
   input_arg {
-    name: "keys"
-    type_attr: "Tkey"
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "values"
-    type_attr: "Tval"
+    name: "components"
+    type_list_attr: "Tcomponents"
   }
   attr {
-    name: "Tkey"
-    type: "type"
+    name: "Tcomponents"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "Tval"
-    type: "type"
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "InterleaveDataset"
+  name: "QueueEnqueueMany"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "components"
+    type_list_attr: "Tcomponents"
   }
-  input_arg {
-    name: "cycle_length"
-    type: DT_INT64
+  attr {
+    name: "Tcomponents"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
-  input_arg {
-    name: "block_length"
-    type: DT_INT64
+  attr {
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
   }
-  output_arg {
+}
+op {
+  name: "QueueEnqueueManyV2"
+  input_arg {
     name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "components"
+    type_list_attr: "Tcomponents"
   }
   attr {
-    name: "output_types"
+    name: "Tcomponents"
     type: "list(type)"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
   }
   is_stateful: true
 }
 op {
-  name: "InterleaveDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  input_arg {
-    name: "cycle_length"
-    type: DT_INT64
-  }
+  name: "QueueEnqueueV2"
   input_arg {
-    name: "block_length"
-    type: DT_INT64
-  }
-  output_arg {
     name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "components"
+    type_list_attr: "Tcomponents"
   }
   attr {
-    name: "output_types"
+    name: "Tcomponents"
     type: "list(type)"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "timeout_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
   }
+  is_stateful: true
 }
 op {
-  name: "Inv"
+  name: "QueueIsClosed"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
   output_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
-  }
-  deprecation {
-    version: 17
+    name: "is_closed"
+    type: DT_BOOL
   }
 }
 op {
-  name: "InvGrad"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
+  name: "QueueIsClosedV2"
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "handle"
+    type: DT_RESOURCE
   }
   output_arg {
-    name: "z"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
-  }
-  deprecation {
-    version: 17
+    name: "is_closed"
+    type: DT_BOOL
   }
+  is_stateful: true
 }
 op {
-  name: "InvGrad"
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
+  name: "QueueSize"
   input_arg {
-    name: "dy"
-    type_attr: "T"
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
   output_arg {
-    name: "z"
-    type_attr: "T"
+    name: "size"
+    type: DT_INT32
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+}
+op {
+  name: "QueueSizeV2"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
   }
-  deprecation {
-    version: 17
+  output_arg {
+    name: "size"
+    type: DT_INT32
   }
+  is_stateful: true
 }
 op {
-  name: "Invert"
+  name: "RFFT"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "input"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  input_arg {
+    name: "fft_length"
+    type: DT_INT32
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-      }
-    }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
   }
 }
 op {
-  name: "Invert"
+  name: "RFFT2D"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "input"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  input_arg {
+    name: "fft_length"
+    type: DT_INT32
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
   }
 }
 op {
-  name: "InvertPermutation"
+  name: "RFFT3D"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "input"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  input_arg {
+    name: "fft_length"
+    type: DT_INT32
   }
-  attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
   }
 }
 op {
-  name: "IsFinite"
+  name: "RGBToHSV"
   input_arg {
-    name: "x"
+    name: "images"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
-    type: DT_BOOL
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -15012,21 +36036,25 @@ op {
   }
 }
 op {
-  name: "IsInf"
+  name: "RGBToHSV"
   input_arg {
-    name: "x"
+    name: "images"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
-    type: DT_BOOL
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -15034,123 +36062,66 @@ op {
   }
 }
 op {
-  name: "IsNan"
+  name: "RandomCrop"
   input_arg {
-    name: "x"
+    name: "image"
     type_attr: "T"
   }
+  input_arg {
+    name: "size"
+    type: DT_INT64
+  }
   output_arg {
-    name: "y"
-    type: DT_BOOL
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-}
-op {
-  name: "IsVariableInitialized"
-  input_arg {
-    name: "ref"
-    type_attr: "dtype"
-    is_ref: true
-  }
-  output_arg {
-    name: "is_initialized"
-    type: DT_BOOL
-  }
-  attr {
-    name: "dtype"
-    type: "type"
-  }
-  allows_uninitialized_input: true
-}
-op {
-  name: "Iterator"
-  output_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-  }
-  attr {
-    name: "container"
-    type: "string"
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "IteratorFromStringHandle"
-  input_arg {
-    name: "string_handle"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "resource_handle"
-    type: DT_RESOURCE
-  }
-  is_stateful: true
-}
-op {
-  name: "IteratorFromStringHandle"
-  input_arg {
-    name: "string_handle"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "resource_handle"
-    type: DT_RESOURCE
-  }
   attr {
-    name: "output_types"
-    type: "list(type)"
+    name: "seed"
+    type: "int"
     default_value {
-      list {
-      }
+      i: 0
     }
-    has_minimum: true
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "seed2"
+    type: "int"
     default_value {
-      list {
-      }
+      i: 0
     }
-    has_minimum: true
+  }
+  deprecation {
+    version: 8
   }
   is_stateful: true
 }
 op {
-  name: "IteratorGetNext"
+  name: "RandomDataset"
   input_arg {
-    name: "iterator"
-    type: DT_RESOURCE
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
   }
   output_arg {
-    name: "components"
-    type_list_attr: "output_types"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
     name: "output_types"
@@ -15167,33 +36138,13 @@ op {
   is_stateful: true
 }
 op {
-  name: "IteratorSetStatsAggregator"
-  input_arg {
-    name: "iterator_handle"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "stats_aggregator_handle"
-    type: DT_RESOURCE
-  }
-  is_stateful: true
-}
-op {
-  name: "IteratorToStringHandle"
+  name: "RandomGamma"
   input_arg {
-    name: "resource_handle"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "string_handle"
-    type: DT_STRING
+    name: "shape"
+    type_attr: "S"
   }
-  is_stateful: true
-}
-op {
-  name: "L2Loss"
   input_arg {
-    name: "t"
+    name: "alpha"
     type_attr: "T"
   }
   output_arg {
@@ -15201,27 +36152,29 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "T"
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "S"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_HALF
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
-}
-op {
-  name: "L2Loss"
-  input_arg {
-    name: "t"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
     name: "T"
     type: "type"
@@ -15233,208 +36186,250 @@ op {
       }
     }
   }
+  is_stateful: true
 }
 op {
-  name: "LMDBReader"
+  name: "RandomPoisson"
+  input_arg {
+    name: "shape"
+    type_attr: "S"
+  }
+  input_arg {
+    name: "rate"
+    type_attr: "dtype"
+  }
   output_arg {
-    name: "reader_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "output"
+    type_attr: "dtype"
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "seed"
+    type: "int"
     default_value {
-      s: ""
+      i: 0
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "seed2"
+    type: "int"
     default_value {
-      s: ""
+      i: 0
+    }
+  }
+  attr {
+    name: "S"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
   is_stateful: true
 }
 op {
-  name: "LRN"
+  name: "RandomPoisson"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "shape"
+    type_attr: "S"
+  }
+  input_arg {
+    name: "rate"
+    type_attr: "dtype"
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "dtype"
   }
   attr {
-    name: "depth_radius"
+    name: "seed"
     type: "int"
     default_value {
-      i: 5
-    }
-  }
-  attr {
-    name: "bias"
-    type: "float"
-    default_value {
-      f: 1
+      i: 0
     }
   }
   attr {
-    name: "alpha"
-    type: "float"
+    name: "seed2"
+    type: "int"
     default_value {
-      f: 1
+      i: 0
     }
   }
   attr {
-    name: "beta"
-    type: "float"
-    default_value {
-      f: 0.5
+    name: "S"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
+  deprecation {
+    version: 25
+  }
+  is_stateful: true
 }
 op {
-  name: "LRNGrad"
-  input_arg {
-    name: "input_grads"
-    type_attr: "T"
-  }
+  name: "RandomPoissonV2"
   input_arg {
-    name: "input_image"
-    type_attr: "T"
+    name: "shape"
+    type_attr: "S"
   }
   input_arg {
-    name: "output_image"
-    type_attr: "T"
+    name: "rate"
+    type_attr: "R"
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "dtype"
   }
   attr {
-    name: "depth_radius"
+    name: "seed"
     type: "int"
     default_value {
-      i: 5
+      i: 0
     }
   }
   attr {
-    name: "bias"
-    type: "float"
+    name: "seed2"
+    type: "int"
     default_value {
-      f: 1
+      i: 0
     }
   }
   attr {
-    name: "alpha"
-    type: "float"
-    default_value {
-      f: 1
+    name: "S"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "beta"
-    type: "float"
+    name: "R"
+    type: "type"
     default_value {
-      f: 0.5
+      type: DT_DOUBLE
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
     default_value {
-      type: DT_FLOAT
+      type: DT_INT64
     }
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  is_stateful: true
 }
 op {
-  name: "LatencyStatsDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
+  name: "RandomShuffle"
   input_arg {
-    name: "tag"
-    type: DT_STRING
+    name: "value"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
   }
+  is_stateful: true
 }
 op {
-  name: "LearnedUnigramCandidateSampler"
-  input_arg {
-    name: "true_classes"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "sampled_candidates"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "true_expected_count"
-    type: DT_FLOAT
-  }
+  name: "RandomShuffleQueue"
   output_arg {
-    name: "sampled_expected_count"
-    type: DT_FLOAT
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
   attr {
-    name: "num_true"
-    type: "int"
+    name: "component_types"
+    type: "list(type)"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "num_sampled"
-    type: "int"
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
     has_minimum: true
-    minimum: 1
   }
   attr {
-    name: "unique"
-    type: "bool"
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
   }
   attr {
-    name: "range_max"
+    name: "min_after_dequeue"
     type: "int"
-    has_minimum: true
-    minimum: 1
+    default_value {
+      i: 0
+    }
   }
   attr {
     name: "seed"
@@ -15450,46 +36445,56 @@ op {
       i: 0
     }
   }
-}
-op {
-  name: "LearnedUnigramCandidateSampler"
-  input_arg {
-    name: "true_classes"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "sampled_candidates"
-    type: DT_INT64
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  output_arg {
-    name: "true_expected_count"
-    type: DT_FLOAT
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
+  is_stateful: true
+}
+op {
+  name: "RandomShuffleQueueV2"
   output_arg {
-    name: "sampled_expected_count"
-    type: DT_FLOAT
+    name: "handle"
+    type: DT_RESOURCE
   }
   attr {
-    name: "num_true"
-    type: "int"
+    name: "component_types"
+    type: "list(type)"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "num_sampled"
-    type: "int"
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
     has_minimum: true
-    minimum: 1
   }
   attr {
-    name: "unique"
-    type: "bool"
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
   }
   attr {
-    name: "range_max"
+    name: "min_after_dequeue"
     type: "int"
-    has_minimum: true
-    minimum: 1
+    default_value {
+      i: 0
+    }
   }
   attr {
     name: "seed"
@@ -15499,234 +36504,161 @@ op {
     }
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
   is_stateful: true
 }
 op {
-  name: "LeftShift"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
+  name: "RandomStandardNormal"
   input_arg {
-    name: "y"
+    name: "shape"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
-    type_attr: "T"
+    name: "output"
+    type_attr: "dtype"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_UINT32
-        type: DT_UINT64
-      }
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
     }
   }
-  is_commutative: true
-}
-op {
-  name: "Less"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
-}
-op {
-  name: "Less"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
+  is_stateful: true
 }
 op {
-  name: "LessEqual"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
+  name: "RandomStandardNormal"
   input_arg {
-    name: "y"
+    name: "shape"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "output"
+    type_attr: "dtype"
   }
   attr {
-    name: "T"
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
-}
-op {
-  name: "LessEqual"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Lgamma"
+  name: "RandomUniform"
   input_arg {
-    name: "x"
+    name: "shape"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
-    type_attr: "T"
+    name: "output"
+    type_attr: "dtype"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
     }
   }
-}
-op {
-  name: "LinSpace"
-  input_arg {
-    name: "start"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "stop"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "num"
-    type_attr: "Tidx"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "Tidx"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
         type: DT_INT32
@@ -15734,35 +36666,47 @@ op {
       }
     }
   }
+  is_stateful: true
 }
 op {
-  name: "ListDiff"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
+  name: "RandomUniform"
   input_arg {
-    name: "y"
+    name: "shape"
     type_attr: "T"
   }
   output_arg {
-    name: "out"
-    type_attr: "T"
+    name: "output"
+    type_attr: "dtype"
   }
-  output_arg {
-    name: "idx"
-    type_attr: "out_idx"
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
   attr {
-    name: "out_idx"
+    name: "dtype"
     type: "type"
-    default_value {
-      type: DT_INT32
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
         type: DT_INT32
@@ -15770,835 +36714,745 @@ op {
       }
     }
   }
+  is_stateful: true
 }
 op {
-  name: "LoadAndRemapMatrix"
-  input_arg {
-    name: "ckpt_path"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "old_tensor_name"
-    type: DT_STRING
-  }
+  name: "RandomUniformInt"
   input_arg {
-    name: "row_remapping"
-    type: DT_INT64
+    name: "shape"
+    type_attr: "T"
   }
   input_arg {
-    name: "col_remapping"
-    type: DT_INT64
+    name: "minval"
+    type_attr: "Tout"
   }
   input_arg {
-    name: "initializing_values"
-    type: DT_FLOAT
+    name: "maxval"
+    type_attr: "Tout"
   }
   output_arg {
-    name: "output_matrix"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "num_rows"
-    type: "int"
-    has_minimum: true
+    name: "output"
+    type_attr: "Tout"
   }
   attr {
-    name: "num_cols"
+    name: "seed"
     type: "int"
-    has_minimum: true
-    minimum: 1
+    default_value {
+      i: 0
+    }
   }
   attr {
-    name: "max_rows_in_memory"
+    name: "seed2"
     type: "int"
     default_value {
-      i: -1
+      i: 0
     }
   }
-  is_stateful: true
-}
-op {
-  name: "Log"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "Tout"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
-}
-op {
-  name: "Log1p"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  is_stateful: true
 }
 op {
-  name: "LogMatrixDeterminant"
+  name: "Range"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "start"
+    type_attr: "Tidx"
   }
-  output_arg {
-    name: "sign"
-    type_attr: "T"
+  input_arg {
+    name: "limit"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "Tidx"
   }
   output_arg {
-    name: "log_abs_determinant"
-    type_attr: "T"
+    name: "output"
+    type_attr: "Tidx"
   }
   attr {
-    name: "T"
+    name: "Tidx"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "LogSoftmax"
+  name: "Range"
   input_arg {
-    name: "logits"
-    type_attr: "T"
+    name: "start"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "limit"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "delta"
+    type_attr: "Tidx"
   }
   output_arg {
-    name: "logsoftmax"
-    type_attr: "T"
+    name: "output"
+    type_attr: "Tidx"
   }
   attr {
-    name: "T"
+    name: "Tidx"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "LogUniformCandidateSampler"
+  name: "RangeDataset"
   input_arg {
-    name: "true_classes"
+    name: "start"
     type: DT_INT64
   }
-  output_arg {
-    name: "sampled_candidates"
+  input_arg {
+    name: "stop"
     type: DT_INT64
   }
-  output_arg {
-    name: "true_expected_count"
-    type: DT_FLOAT
+  input_arg {
+    name: "step"
+    type: DT_INT64
   }
   output_arg {
-    name: "sampled_expected_count"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "num_true"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "num_sampled"
-    type: "int"
+    name: "output_types"
+    type: "list(type)"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "unique"
-    type: "bool"
-  }
-  attr {
-    name: "range_max"
-    type: "int"
+    name: "output_shapes"
+    type: "list(shape)"
     has_minimum: true
     minimum: 1
   }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
+  is_stateful: true
 }
 op {
-  name: "LogUniformCandidateSampler"
+  name: "Rank"
   input_arg {
-    name: "true_classes"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "sampled_candidates"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "true_expected_count"
-    type: DT_FLOAT
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "sampled_expected_count"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "num_true"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "num_sampled"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "unique"
-    type: "bool"
-  }
-  attr {
-    name: "range_max"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "output"
+    type: DT_INT32
   }
   attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "T"
+    type: "type"
   }
-  is_stateful: true
 }
 op {
-  name: "LogicalAnd"
+  name: "ReadFile"
   input_arg {
-    name: "x"
-    type: DT_BOOL
+    name: "filename"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "contents"
+    type: DT_STRING
   }
+}
+op {
+  name: "ReadVariableOp"
   input_arg {
-    name: "y"
-    type: DT_BOOL
+    name: "resource"
+    type: DT_RESOURCE
   }
   output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "value"
+    type_attr: "dtype"
   }
-  is_commutative: true
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  is_stateful: true
 }
 op {
-  name: "LogicalNot"
+  name: "ReaderNumRecordsProduced"
   input_arg {
-    name: "x"
-    type: DT_BOOL
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
   }
   output_arg {
-    name: "y"
-    type: DT_BOOL
+    name: "records_produced"
+    type: DT_INT64
   }
 }
 op {
-  name: "LogicalOr"
-  input_arg {
-    name: "x"
-    type: DT_BOOL
-  }
+  name: "ReaderNumRecordsProducedV2"
   input_arg {
-    name: "y"
-    type: DT_BOOL
+    name: "reader_handle"
+    type: DT_RESOURCE
   }
   output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "records_produced"
+    type: DT_INT64
   }
-  is_commutative: true
+  is_stateful: true
 }
 op {
-  name: "LookupTableExport"
+  name: "ReaderNumWorkUnitsCompleted"
   input_arg {
-    name: "table_handle"
+    name: "reader_handle"
     type: DT_STRING
     is_ref: true
   }
   output_arg {
-    name: "keys"
-    type_attr: "Tkeys"
-  }
-  output_arg {
-    name: "values"
-    type_attr: "Tvalues"
-  }
-  attr {
-    name: "Tkeys"
-    type: "type"
-  }
-  attr {
-    name: "Tvalues"
-    type: "type"
+    name: "units_completed"
+    type: DT_INT64
   }
 }
 op {
-  name: "LookupTableExportV2"
+  name: "ReaderNumWorkUnitsCompletedV2"
   input_arg {
-    name: "table_handle"
+    name: "reader_handle"
     type: DT_RESOURCE
   }
   output_arg {
-    name: "keys"
-    type_attr: "Tkeys"
-  }
-  output_arg {
-    name: "values"
-    type_attr: "Tvalues"
-  }
-  attr {
-    name: "Tkeys"
-    type: "type"
-  }
-  attr {
-    name: "Tvalues"
-    type: "type"
+    name: "units_completed"
+    type: DT_INT64
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableFind"
+  name: "ReaderRead"
   input_arg {
-    name: "table_handle"
+    name: "reader_handle"
     type: DT_STRING
     is_ref: true
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
-  }
-  input_arg {
-    name: "default_value"
-    type_attr: "Tout"
+    name: "queue_handle"
+    type: DT_STRING
+    is_ref: true
   }
   output_arg {
-    name: "values"
-    type_attr: "Tout"
-  }
-  attr {
-    name: "Tin"
-    type: "type"
+    name: "key"
+    type: DT_STRING
   }
-  attr {
-    name: "Tout"
-    type: "type"
+  output_arg {
+    name: "value"
+    type: DT_STRING
   }
 }
 op {
-  name: "LookupTableFindV2"
+  name: "ReaderReadUpTo"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "queue_handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "default_value"
-    type_attr: "Tout"
+    name: "num_records"
+    type: DT_INT64
   }
   output_arg {
-    name: "values"
-    type_attr: "Tout"
-  }
-  attr {
-    name: "Tin"
-    type: "type"
+    name: "keys"
+    type: DT_STRING
   }
-  attr {
-    name: "Tout"
-    type: "type"
+  output_arg {
+    name: "values"
+    type: DT_STRING
   }
-  is_stateful: true
 }
 op {
-  name: "LookupTableImport"
+  name: "ReaderReadUpToV2"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "reader_handle"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "queue_handle"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "values"
-    type_attr: "Tout"
+    name: "num_records"
+    type: DT_INT64
   }
-  attr {
-    name: "Tin"
-    type: "type"
+  output_arg {
+    name: "keys"
+    type: DT_STRING
   }
-  attr {
-    name: "Tout"
-    type: "type"
+  output_arg {
+    name: "values"
+    type: DT_STRING
   }
+  is_stateful: true
 }
 op {
-  name: "LookupTableImportV2"
+  name: "ReaderReadV2"
   input_arg {
-    name: "table_handle"
+    name: "reader_handle"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
-  }
-  input_arg {
-    name: "values"
-    type_attr: "Tout"
+    name: "queue_handle"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "Tin"
-    type: "type"
+  output_arg {
+    name: "key"
+    type: DT_STRING
   }
-  attr {
-    name: "Tout"
-    type: "type"
+  output_arg {
+    name: "value"
+    type: DT_STRING
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableInsert"
+  name: "ReaderReset"
   input_arg {
-    name: "table_handle"
+    name: "reader_handle"
     type: DT_STRING
     is_ref: true
   }
+}
+op {
+  name: "ReaderResetV2"
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "reader_handle"
+    type: DT_RESOURCE
   }
+  is_stateful: true
+}
+op {
+  name: "ReaderRestoreState"
   input_arg {
-    name: "values"
-    type_attr: "Tout"
-  }
-  attr {
-    name: "Tin"
-    type: "type"
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
   }
-  attr {
-    name: "Tout"
-    type: "type"
+  input_arg {
+    name: "state"
+    type: DT_STRING
   }
 }
 op {
-  name: "LookupTableInsertV2"
+  name: "ReaderRestoreStateV2"
   input_arg {
-    name: "table_handle"
+    name: "reader_handle"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
-  }
-  input_arg {
-    name: "values"
-    type_attr: "Tout"
-  }
-  attr {
-    name: "Tin"
-    type: "type"
-  }
-  attr {
-    name: "Tout"
-    type: "type"
+    name: "state"
+    type: DT_STRING
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableSize"
+  name: "ReaderSerializeState"
   input_arg {
-    name: "table_handle"
+    name: "reader_handle"
     type: DT_STRING
     is_ref: true
   }
   output_arg {
-    name: "size"
-    type: DT_INT64
+    name: "state"
+    type: DT_STRING
   }
 }
 op {
-  name: "LookupTableSizeV2"
+  name: "ReaderSerializeStateV2"
   input_arg {
-    name: "table_handle"
+    name: "reader_handle"
     type: DT_RESOURCE
   }
   output_arg {
-    name: "size"
-    type: DT_INT64
+    name: "state"
+    type: DT_STRING
   }
   is_stateful: true
 }
 op {
-  name: "LoopCond"
+  name: "Real"
   input_arg {
     name: "input"
-    type: DT_BOOL
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type: DT_BOOL
+    type_attr: "Tout"
   }
-}
-op {
-  name: "MakeIterator"
-  input_arg {
-    name: "dataset"
-    type: DT_VARIANT
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_COMPLEX64
+    }
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
-  input_arg {
-    name: "iterator"
-    type: DT_RESOURCE
+  attr {
+    name: "Tout"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "MapAndBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
+  name: "RealDiv"
   input_arg {
-    name: "batch_size"
-    type: DT_INT64
+    name: "x"
+    type_attr: "T"
   }
   input_arg {
-    name: "num_parallel_batches"
-    type: DT_INT64
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+}
+op {
+  name: "RealDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
 }
 op {
-  name: "MapClear"
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  name: "Reciprocal"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
-    has_minimum: true
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
+}
+op {
+  name: "Reciprocal"
+  input_arg {
+    name: "x"
+    type_attr: "T"
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MapDataset"
+  name: "ReciprocalGrad"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "x"
+    type_attr: "T"
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+}
+op {
+  name: "ReciprocalGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "MapDataset"
+  name: "ReciprocalGrad"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "y"
+    type_attr: "T"
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "dy"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
 }
 op {
-  name: "MapIncompleteSize"
+  name: "RecordInput"
   output_arg {
-    name: "size"
-    type: DT_INT32
+    name: "records"
+    type: DT_STRING
   }
   attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "file_pattern"
+    type: "string"
   }
   attr {
-    name: "memory_limit"
+    name: "file_random_seed"
     type: "int"
     default_value {
-      i: 0
+      i: 301
     }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "file_shuffle_shift_ratio"
+    type: "float"
     default_value {
-      s: ""
+      f: 0
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "file_buffer_size"
+    type: "int"
     default_value {
-      s: ""
+      i: 10000
     }
   }
-  is_stateful: true
-}
-op {
-  name: "MapPeek"
-  input_arg {
-    name: "key"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
-  }
   attr {
-    name: "capacity"
+    name: "file_parallelism"
     type: "int"
     default_value {
-      i: 0
+      i: 16
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
+    name: "batch_size"
     type: "int"
     default_value {
-      i: 0
+      i: 32
     }
-    has_minimum: true
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  is_stateful: true
+}
+op {
+  name: "RecordInput"
+  output_arg {
+    name: "records"
+    type: DT_STRING
   }
   attr {
-    name: "container"
+    name: "file_pattern"
     type: "string"
+  }
+  attr {
+    name: "file_random_seed"
+    type: "int"
     default_value {
-      s: ""
+      i: 301
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "file_shuffle_shift_ratio"
+    type: "float"
     default_value {
-      s: ""
+      f: 0
     }
   }
-  is_stateful: true
-}
-op {
-  name: "MapSize"
-  output_arg {
-    name: "size"
-    type: DT_INT32
-  }
   attr {
-    name: "capacity"
+    name: "file_buffer_size"
     type: "int"
     default_value {
-      i: 0
+      i: 10000
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
+    name: "file_parallelism"
     type: "int"
     default_value {
-      i: 0
+      i: 16
     }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "batch_size"
+    type: "int"
     default_value {
-      s: ""
+      i: 32
     }
   }
   attr {
-    name: "shared_name"
+    name: "compression_type"
     type: "string"
     default_value {
       s: ""
@@ -16607,250 +37461,240 @@ op {
   is_stateful: true
 }
 op {
-  name: "MapStage"
+  name: "ReduceJoin"
   input_arg {
-    name: "key"
-    type: DT_INT64
+    name: "inputs"
+    type: DT_STRING
   }
   input_arg {
-    name: "indices"
+    name: "reduction_indices"
     type: DT_INT32
   }
-  input_arg {
-    name: "values"
-    type_list_attr: "fake_dtypes"
+  output_arg {
+    name: "output"
+    type: DT_STRING
   }
   attr {
-    name: "capacity"
-    type: "int"
+    name: "keep_dims"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
-    type: "int"
+    name: "separator"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
-    has_minimum: true
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
+}
+op {
+  name: "RefEnter"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
-    name: "fake_dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "container"
+    name: "frame_name"
     type: "string"
+  }
+  attr {
+    name: "is_constant"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "parallel_iterations"
+    type: "int"
     default_value {
-      s: ""
+      i: 10
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MapUnstage"
-  input_arg {
-    name: "key"
-    type: DT_INT64
-  }
+  name: "RefExit"
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "data"
+    type_attr: "T"
+    is_ref: true
   }
   output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "output"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "T"
+    type: "type"
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+}
+op {
+  name: "RefIdentity"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+    is_ref: true
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "T"
+    type: "type"
   }
-  is_stateful: true
+  allows_uninitialized_input: true
 }
 op {
-  name: "MapUnstageNoKey"
+  name: "RefMerge"
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+    is_ref: true
   }
   output_arg {
-    name: "key"
-    type: DT_INT64
+    name: "output"
+    type_attr: "T"
+    is_ref: true
   }
   output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
+    name: "value_index"
+    type: DT_INT32
   }
   attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "memory_limit"
+    name: "N"
     type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
     has_minimum: true
     minimum: 1
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+}
+op {
+  name: "RefNextIteration"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "T"
+    type: "type"
   }
-  is_stateful: true
 }
 op {
-  name: "MatMul"
+  name: "RefSelect"
   input_arg {
-    name: "a"
-    type_attr: "T"
+    name: "index"
+    type: DT_INT32
   }
   input_arg {
-    name: "b"
+    name: "inputs"
     type_attr: "T"
+    number_attr: "N"
+    is_ref: true
   }
   output_arg {
-    name: "product"
+    name: "output"
     type_attr: "T"
-  }
-  attr {
-    name: "transpose_a"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "transpose_b"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "MatchingFiles"
+  name: "RefSwitch"
   input_arg {
-    name: "pattern"
-    type: DT_STRING
+    name: "data"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "pred"
+    type: DT_BOOL
   }
   output_arg {
-    name: "filenames"
-    type: DT_STRING
+    name: "output_false"
+    type_attr: "T"
+    is_ref: true
   }
-}
-op {
-  name: "MatrixBandPart"
-  input_arg {
-    name: "input"
+  output_arg {
+    name: "output_true"
     type_attr: "T"
+    is_ref: true
   }
-  input_arg {
-    name: "num_lower"
-    type: DT_INT64
+  attr {
+    name: "T"
+    type: "type"
   }
+  allows_uninitialized_input: true
+}
+op {
+  name: "Relu"
   input_arg {
-    name: "num_upper"
-    type: DT_INT64
+    name: "features"
+    type_attr: "T"
   }
   output_arg {
-    name: "band"
+    name: "activations"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
   }
 }
 op {
-  name: "MatrixDeterminant"
+  name: "Relu"
   input_arg {
-    name: "input"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "activations"
     type_attr: "T"
   }
   attr {
@@ -16860,18 +37704,27 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MatrixDeterminant"
+  name: "Relu"
   input_arg {
-    name: "input"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "activations"
     type_attr: "T"
   }
   attr {
@@ -16881,50 +37734,87 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "MatrixDiag"
+  name: "Relu"
   input_arg {
-    name: "diagonal"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "activations"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
 }
 op {
-  name: "MatrixDiagPart"
+  name: "Relu6"
   input_arg {
-    name: "input"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "diagonal"
+    name: "activations"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
   }
 }
 op {
-  name: "MatrixExponential"
+  name: "Relu6"
   input_arg {
-    name: "input"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "activations"
     type_attr: "T"
   }
   attr {
@@ -16932,141 +37822,161 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
         type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MatrixInverse"
+  name: "Relu6"
   input_arg {
-    name: "input"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "activations"
     type_attr: "T"
   }
-  attr {
-    name: "adjoint"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "MatrixInverse"
+  name: "Relu6"
   input_arg {
-    name: "input"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "activations"
     type_attr: "T"
   }
-  attr {
-    name: "adjoint"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
         type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MatrixSetDiag"
+  name: "Relu6Grad"
   input_arg {
-    name: "input"
+    name: "gradients"
     type_attr: "T"
   }
   input_arg {
-    name: "diagonal"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "backprops"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
   }
 }
 op {
-  name: "MatrixSolve"
+  name: "Relu6Grad"
   input_arg {
-    name: "matrix"
+    name: "gradients"
     type_attr: "T"
   }
   input_arg {
-    name: "rhs"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "backprops"
     type_attr: "T"
   }
-  attr {
-    name: "adjoint"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
         type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MatrixSolveLs"
+  name: "Relu6Grad"
   input_arg {
-    name: "matrix"
+    name: "gradients"
     type_attr: "T"
   }
   input_arg {
-    name: "rhs"
+    name: "features"
     type_attr: "T"
   }
-  input_arg {
-    name: "l2_regularizer"
-    type: DT_DOUBLE
-  }
   output_arg {
-    name: "output"
+    name: "backprops"
     type_attr: "T"
   }
   attr {
@@ -17074,35 +37984,34 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-  attr {
-    name: "fast"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
 }
 op {
-  name: "MatrixSolveLs"
+  name: "Relu6Grad"
   input_arg {
-    name: "matrix"
+    name: "gradients"
     type_attr: "T"
   }
   input_arg {
-    name: "rhs"
+    name: "features"
     type_attr: "T"
   }
-  input_arg {
-    name: "l2_regularizer"
-    type: DT_DOUBLE
-  }
   output_arg {
-    name: "output"
+    name: "backprops"
     type_attr: "T"
   }
   attr {
@@ -17110,122 +38019,102 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
         type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "fast"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
 }
 op {
-  name: "MatrixTriangularSolve"
+  name: "ReluGrad"
   input_arg {
-    name: "matrix"
+    name: "gradients"
     type_attr: "T"
   }
   input_arg {
-    name: "rhs"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "backprops"
     type_attr: "T"
   }
-  attr {
-    name: "lower"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  attr {
-    name: "adjoint"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "MatrixTriangularSolve"
+  name: "ReluGrad"
   input_arg {
-    name: "matrix"
+    name: "gradients"
     type_attr: "T"
   }
   input_arg {
-    name: "rhs"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "backprops"
     type_attr: "T"
   }
-  attr {
-    name: "lower"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  attr {
-    name: "adjoint"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
         type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "Max"
+  name: "ReluGrad"
   input_arg {
-    name: "input"
+    name: "gradients"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "features"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "backprops"
     type_attr: "T"
   }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   attr {
     name: "T"
     type: "type"
@@ -17233,56 +38122,34 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "Max"
+  name: "ReluGrad"
   input_arg {
-    name: "input"
+    name: "gradients"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "features"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "backprops"
     type_attr: "T"
   }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   attr {
     name: "T"
     type: "type"
@@ -17290,452 +38157,519 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
 }
 op {
-  name: "MaxPool"
+  name: "RemoteCall"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "target"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_list_attr: "Tout"
   }
   attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_HALF
-      }
-    }
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
+    name: "Tout"
+    type: "list(type)"
     has_minimum: true
-    minimum: 4
+    minimum: 1
   }
   attr {
-    name: "strides"
-    type: "list(int)"
+    name: "f"
+    type: "func"
+  }
+}
+op {
+  name: "RemoteFusedGraphExecute"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "Tinputs"
+  }
+  output_arg {
+    name: "outputs"
+    type_list_attr: "Toutputs"
+  }
+  attr {
+    name: "Tinputs"
+    type: "list(type)"
     has_minimum: true
-    minimum: 4
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+    name: "Toutputs"
+    type: "list(type)"
+    has_minimum: true
   }
   attr {
-    name: "data_format"
+    name: "serialized_remote_fused_graph_execute_info"
     type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
   }
 }
 op {
-  name: "MaxPool"
+  name: "RepeatDataset"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "count"
+    type: DT_INT64
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-      }
-    }
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
+    name: "output_shapes"
+    type: "list(shape)"
     has_minimum: true
-    minimum: 4
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "RepeatDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "count"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "strides"
-    type: "list(int)"
+    name: "output_types"
+    type: "list(type)"
     has_minimum: true
-    minimum: 4
+    minimum: 1
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "RequantizationRange"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
+    name: "Tinput"
+    type: "type"
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
 }
 op {
-  name: "MaxPool"
+  name: "RequantizationRange"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
         type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+}
+op {
+  name: "Requantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "Tinput"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
+    name: "out_type"
+    type: "type"
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
-        s: "NCHW_VECT_C"
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
 }
 op {
-  name: "MaxPool3D"
+  name: "Requantize"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_max"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "Tinput"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
   attr {
-    name: "T"
+    name: "out_type"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
 }
 op {
-  name: "MaxPool3D"
+  name: "Reshape"
   input_arg {
-    name: "input"
+    name: "tensor"
     type_attr: "T"
   }
+  input_arg {
+    name: "shape"
+    type_attr: "Tshape"
+  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "Tshape"
+    type: "type"
     default_value {
-      s: "NDHWC"
+      type: DT_INT32
     }
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+}
+op {
+  name: "ResizeArea"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
         type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "MaxPool3DGrad"
-  input_arg {
-    name: "orig_input"
-    type: DT_FLOAT
-  }
+  name: "ResizeArea"
   input_arg {
-    name: "orig_output"
-    type: DT_FLOAT
+    name: "images"
+    type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "size"
+    type: DT_INT32
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+    name: "resized_images"
+    type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
         type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "MaxPool3DGrad"
-  input_arg {
-    name: "orig_input"
-    type: DT_FLOAT
-  }
+  name: "ResizeBicubic"
   input_arg {
-    name: "orig_output"
-    type: DT_FLOAT
+    name: "images"
+    type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "size"
+    type: DT_INT32
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "resized_images"
+    type: DT_FLOAT
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "align_corners"
+    type: "bool"
     default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
+      b: false
     }
   }
+}
+op {
+  name: "ResizeBicubic"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
         type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "MaxPool3DGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "TInput"
-  }
+  name: "ResizeBicubicGrad"
   input_arg {
-    name: "orig_output"
-    type_attr: "TInput"
+    name: "grads"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "grad"
+    name: "original_image"
     type_attr: "T"
   }
   output_arg {
@@ -17743,140 +38677,148 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "align_corners"
+    type: "bool"
     default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
+      b: false
     }
   }
+}
+op {
+  name: "ResizeBilinear"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
         type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "TInput"
-    type: "type"
+    name: "align_corners"
+    type: "bool"
     default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-      }
+      b: false
     }
   }
 }
 op {
-  name: "MaxPool3DGradGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
+  name: "ResizeBilinear"
   input_arg {
-    name: "orig_output"
+    name: "images"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "size"
+    type: DT_INT32
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "resized_images"
+    type: DT_FLOAT
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "align_corners"
+    type: "bool"
     default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
+      b: false
     }
   }
+}
+op {
+  name: "ResizeBilinear"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "MaxPoolGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
+  name: "ResizeBilinearGrad"
   input_arg {
-    name: "orig_output"
-    type_attr: "T"
+    name: "grads"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "grad"
+    name: "original_image"
     type_attr: "T"
   }
   output_arg {
@@ -17884,256 +38826,261 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "align_corners"
+    type: "bool"
     default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
+      b: false
     }
   }
+}
+op {
+  name: "ResizeBilinearGrad"
+  input_arg {
+    name: "grads"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "original_image"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_BFLOAT16
         type: DT_HALF
+        type: DT_DOUBLE
       }
     }
   }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "MaxPoolGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
+  name: "ResizeNearestNeighbor"
   input_arg {
-    name: "orig_output"
+    name: "images"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "size"
+    type: DT_INT32
   }
   output_arg {
-    name: "output"
+    name: "resized_images"
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "align_corners"
+    type: "bool"
     default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
+      b: false
     }
   }
+}
+op {
+  name: "ResizeNearestNeighbor"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "resized_images"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_INT8
         type: DT_UINT8
         type: DT_INT16
-        type: DT_INT8
         type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
         type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
+  attr {
+    name: "align_corners"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "MaxPoolGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
+  name: "ResizeNearestNeighborGrad"
   input_arg {
-    name: "orig_output"
+    name: "grads"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "size"
+    type: DT_INT32
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT32
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "align_corners"
+    type: "bool"
     default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
+      b: false
     }
   }
+}
+op {
+  name: "ResourceApplyAdadelta"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolGradGrad"
+  name: "ResourceApplyAdadelta"
   input_arg {
-    name: "orig_input"
-    type_attr: "T"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "orig_output"
-    type_attr: "T"
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "accum_update"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  input_arg {
+    name: "rho"
+    type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
   }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -18142,69 +39089,61 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolGradGrad"
+  name: "ResourceApplyAdadelta"
   input_arg {
-    name: "orig_input"
-    type_attr: "T"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "orig_output"
-    type_attr: "T"
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "accum_update"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  input_arg {
+    name: "rho"
+    type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
   }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -18213,68 +39152,63 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolGradGradV2"
+  name: "ResourceApplyAdadelta"
   input_arg {
-    name: "orig_input"
-    type_attr: "T"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "orig_output"
-    type_attr: "T"
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "accum_update"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "ksize"
-    type: DT_INT32
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
-  }
   attr {
     name: "T"
     type: "type"
@@ -18283,65 +39217,50 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
         type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolGradGradV2"
+  name: "ResourceApplyAdagrad"
   input_arg {
-    name: "orig_input"
-    type_attr: "T"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "orig_output"
-    type_attr: "T"
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "grad"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
+    name: "grad"
     type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
-  }
   attr {
     name: "T"
     type: "type"
@@ -18349,68 +39268,47 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolGradGradWithArgmax"
+  name: "ResourceApplyAdagrad"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
-  }
-  output_arg {
-    name: "output"
+    name: "lr"
     type_attr: "T"
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "Targmax"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -18419,66 +39317,49 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolGradGradWithArgmax"
+  name: "ResourceApplyAdagrad"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
-  }
-  output_arg {
-    name: "output"
+    name: "lr"
     type_attr: "T"
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "Targmax"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -18487,274 +39368,250 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolGradV2"
+  name: "ResourceApplyAdagrad"
   input_arg {
-    name: "orig_input"
-    type_attr: "T"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "orig_output"
-    type_attr: "T"
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "grad"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
+    name: "grad"
     type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
-  }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
         type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolGradV2"
+  name: "ResourceApplyAdagradDA"
   input_arg {
-    name: "orig_input"
-    type_attr: "T"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "orig_output"
-    type_attr: "T"
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
   }
   input_arg {
     name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "ksize"
-    type: DT_INT32
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
+    name: "l1"
     type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolGradWithArgmax"
+  name: "ResourceApplyAdagradDA"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
   }
   input_arg {
     name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
-  }
-  output_arg {
-    name: "output"
+    name: "lr"
     type_attr: "T"
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
   }
   attr {
-    name: "Targmax"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_HALF
-      }
+      b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolGradWithArgmax"
+  name: "ResourceApplyAdagradDA"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  attr {
-    name: "Targmax"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
   }
   attr {
     name: "T"
@@ -18763,66 +39620,66 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolGradWithArgmax"
+  name: "ResourceApplyAdagradDA"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  attr {
-    name: "Targmax"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
   }
   attr {
     name: "T"
@@ -18832,255 +39689,308 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
         type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolV2"
+  name: "ResourceApplyAdam"
   input_arg {
-    name: "input"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
     type_attr: "T"
   }
   input_arg {
-    name: "ksize"
-    type: DT_INT32
+    name: "beta2_power"
+    type_attr: "T"
   }
   input_arg {
-    name: "strides"
-    type: DT_INT32
+    name: "lr"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
       }
     }
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
+      b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolV2"
+  name: "ResourceApplyAdam"
   input_arg {
-    name: "input"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
     type_attr: "T"
   }
   input_arg {
-    name: "ksize"
-    type: DT_INT32
+    name: "beta2_power"
+    type_attr: "T"
   }
   input_arg {
-    name: "strides"
-    type: DT_INT32
+    name: "lr"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
         type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "use_nesterov"
+    type: "bool"
     default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-        s: "NCHW_VECT_C"
-      }
+      b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolWithArgmax"
+  name: "ResourceApplyAdam"
   input_arg {
-    name: "input"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "beta2_power"
     type_attr: "T"
   }
-  output_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
-    name: "Targmax"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT64
-    }
     allowed_values {
       list {
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "use_nesterov"
+    type: "bool"
     default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_HALF
-      }
+      b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolWithArgmax"
+  name: "ResourceApplyAdam"
   input_arg {
-    name: "input"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "beta2_power"
     type_attr: "T"
   }
-  output_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
   }
-  attr {
-    name: "Targmax"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -19089,65 +39999,81 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "MaxPoolWithArgmax"
+  name: "ResourceApplyAdam"
   input_arg {
-    name: "input"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "beta2_power"
     type_attr: "T"
   }
-  output_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
   }
-  attr {
-    name: "Targmax"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -19157,67 +40083,68 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
         type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "Maximum"
+  name: "ResourceApplyAddSign"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "m"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "z"
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  is_commutative: true
-}
-op {
-  name: "Mean"
   input_arg {
-    name: "input"
+    name: "alpha"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "sign_decay"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "beta"
     type_attr: "T"
   }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -19238,43 +40165,49 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Mean"
+  name: "ResourceApplyAddSign"
   input_arg {
-    name: "input"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "alpha"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "sign_decay"
     type_attr: "T"
   }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -19297,167 +40230,120 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Merge"
+  name: "ResourceApplyAddSign"
   input_arg {
-    name: "inputs"
-    type_attr: "T"
-    number_attr: "N"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "value_index"
-    type: DT_INT32
-  }
-  attr {
-    name: "T"
-    type: "type"
+    name: "var"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
   }
-}
-op {
-  name: "MergeSummary"
   input_arg {
-    name: "inputs"
-    type: DT_STRING
-    number_attr: "N"
+    name: "lr"
+    type_attr: "T"
   }
-  output_arg {
-    name: "summary"
-    type: DT_STRING
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
   }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
   }
-}
-op {
-  name: "MergeV2Checkpoints"
   input_arg {
-    name: "checkpoint_prefixes"
-    type: DT_STRING
+    name: "beta"
+    type_attr: "T"
   }
   input_arg {
-    name: "destination_prefix"
-    type: DT_STRING
+    name: "grad"
+    type_attr: "T"
   }
   attr {
-    name: "delete_old_dirs"
-    type: "bool"
-    default_value {
-      b: true
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
     }
   }
-}
-op {
-  name: "MergeV2Checkpoints"
-  input_arg {
-    name: "checkpoint_prefixes"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "destination_prefix"
-    type: DT_STRING
-  }
   attr {
-    name: "delete_old_dirs"
+    name: "use_locking"
     type: "bool"
     default_value {
-      b: true
+      b: false
     }
   }
   is_stateful: true
 }
 op {
-  name: "Mfcc"
+  name: "ResourceApplyCenteredRMSProp"
   input_arg {
-    name: "spectrogram"
-    type: DT_FLOAT
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "sample_rate"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "upper_frequency_limit"
-    type: "float"
-    default_value {
-      f: 4000
-    }
+    name: "mg"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "lower_frequency_limit"
-    type: "float"
-    default_value {
-      f: 20
-    }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "filterbank_channel_count"
-    type: "int"
-    default_value {
-      i: 40
-    }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "dct_coefficient_count"
-    type: "int"
-    default_value {
-      i: 13
-    }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-}
-op {
-  name: "Min"
   input_arg {
-    name: "input"
+    name: "rho"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "momentum"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "epsilon"
     type_attr: "T"
   }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -19482,39 +40368,51 @@ op {
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Min"
+  name: "ResourceApplyCenteredRMSProp"
   input_arg {
-    name: "input"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mg"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "rho"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "momentum"
     type_attr: "T"
   }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -19541,171 +40439,122 @@ op {
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Minimum"
+  name: "ResourceApplyCenteredRMSProp"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "mg"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "z"
-    type_attr: "T"
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
   }
-  is_commutative: true
-}
-op {
-  name: "MirrorPad"
   input_arg {
-    name: "input"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
-  }
-  output_arg {
-    name: "output"
+    name: "rho"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-  }
-  attr {
-    name: "Tpaddings"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "mode"
-    type: "string"
-    allowed_values {
-      list {
-        s: "REFLECT"
-        s: "SYMMETRIC"
-      }
-    }
-  }
-}
-op {
-  name: "MirrorPadGrad"
   input_arg {
-    name: "input"
+    name: "momentum"
     type_attr: "T"
   }
   input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
+    name: "epsilon"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "Tpaddings"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "mode"
-    type: "string"
-    allowed_values {
-      list {
-        s: "REFLECT"
-        s: "SYMMETRIC"
-      }
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Mod"
+  name: "ResourceApplyCenteredRMSProp"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "mg"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "z"
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
   }
-}
-op {
-  name: "Mul"
   input_arg {
-    name: "x"
+    name: "momentum"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "epsilon"
     type_attr: "T"
   }
-  output_arg {
-    name: "z"
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   attr {
@@ -19713,49 +40562,68 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
         type: DT_UINT8
-        type: DT_INT8
-        type: DT_UINT16
         type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_INT8
         type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
         type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  is_commutative: true
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "Multinomial"
+  name: "ResourceApplyFtrl"
   input_arg {
-    name: "logits"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "num_samples"
-    type: DT_INT32
+    name: "lr"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output"
-    type: DT_INT64
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -19764,45 +40632,63 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   is_stateful: true
 }
 op {
-  name: "Multinomial"
+  name: "ResourceApplyFtrl"
   input_arg {
-    name: "logits"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "num_samples"
-    type: DT_INT32
+    name: "lr"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output"
-    type: DT_INT64
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -19811,321 +40697,204 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  is_stateful: true
-}
-op {
-  name: "MutableDenseHashTable"
-  input_arg {
-    name: "empty_key"
-    type_attr: "key_dtype"
-  }
-  output_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
   attr {
-    name: "use_node_name_sharing"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
-  attr {
-    name: "key_dtype"
-    type: "type"
-  }
-  attr {
-    name: "value_dtype"
-    type: "type"
-  }
-  attr {
-    name: "value_shape"
-    type: "shape"
-    default_value {
-      shape {
-      }
-    }
-  }
-  attr {
-    name: "initial_num_buckets"
-    type: "int"
-    default_value {
-      i: 131072
-    }
-  }
-  attr {
-    name: "max_load_factor"
-    type: "float"
-    default_value {
-      f: 0.8
-    }
-  }
   is_stateful: true
 }
 op {
-  name: "MutableDenseHashTableV2"
+  name: "ResourceApplyFtrl"
   input_arg {
-    name: "empty_key"
-    type_attr: "key_dtype"
-  }
-  output_arg {
-    name: "table_handle"
+    name: "var"
     type: DT_RESOURCE
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "use_node_name_sharing"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "key_dtype"
-    type: "type"
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "value_dtype"
-    type: "type"
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "value_shape"
-    type: "shape"
-    default_value {
-      shape {
-      }
-    }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-  attr {
-    name: "initial_num_buckets"
-    type: "int"
-    default_value {
-      i: 131072
-    }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "max_load_factor"
-    type: "float"
-    default_value {
-      f: 0.8
-    }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "MutableHashTable"
-  output_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
     }
   }
   attr {
-    name: "use_node_name_sharing"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
-  attr {
-    name: "key_dtype"
-    type: "type"
-  }
-  attr {
-    name: "value_dtype"
-    type: "type"
-  }
   is_stateful: true
 }
 op {
-  name: "MutableHashTableOfTensors"
-  output_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  name: "ResourceApplyFtrl"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "use_node_name_sharing"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "key_dtype"
-    type: "type"
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-  attr {
-    name: "value_dtype"
-    type: "type"
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "value_shape"
-    type: "shape"
-    default_value {
-      shape {
-      }
-    }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "MutableHashTableOfTensorsV2"
-  output_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
     }
   }
   attr {
-    name: "use_node_name_sharing"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
-  attr {
-    name: "key_dtype"
-    type: "type"
-  }
-  attr {
-    name: "value_dtype"
-    type: "type"
-  }
-  attr {
-    name: "value_shape"
-    type: "shape"
-    default_value {
-      shape {
-      }
-    }
-  }
   is_stateful: true
 }
 op {
-  name: "MutableHashTableV2"
-  output_arg {
-    name: "table_handle"
+  name: "ResourceApplyFtrlV2"
+  input_arg {
+    name: "var"
     type: DT_RESOURCE
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "use_node_name_sharing"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-  attr {
-    name: "key_dtype"
-    type: "type"
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "value_dtype"
-    type: "type"
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "Neg"
   input_arg {
-    name: "x"
+    name: "l2"
     type_attr: "T"
   }
-  output_arg {
-    name: "y"
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
     type_attr: "T"
   }
   attr {
@@ -20133,180 +40902,212 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "NegTrain"
+  name: "ResourceApplyFtrlV2"
   input_arg {
-    name: "w_in"
-    type: DT_FLOAT
-    is_ref: true
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "w_out"
-    type: DT_FLOAT
-    is_ref: true
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "examples"
-    type: DT_INT32
+    name: "linear"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "labels"
-    type: DT_INT32
+    name: "grad"
+    type_attr: "T"
   }
   input_arg {
     name: "lr"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "vocab_count"
-    type: "list(int)"
+    type_attr: "T"
   }
-  attr {
-    name: "num_negative_samples"
-    type: "int"
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-  deprecation {
-    version: 19
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "NextIteration"
   input_arg {
-    name: "data"
+    name: "l2_shrinkage"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "lr_power"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-}
-op {
-  name: "NoOp"
-}
-op {
-  name: "NonMaxSuppression"
-  input_arg {
-    name: "boxes"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "scores"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "max_output_size"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "selected_indices"
-    type: DT_INT32
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "iou_threshold"
-    type: "float"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      f: 0.5
+      b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "NonMaxSuppressionV2"
+  name: "ResourceApplyFtrlV2"
   input_arg {
-    name: "boxes"
-    type: DT_FLOAT
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "scores"
-    type: DT_FLOAT
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "max_output_size"
-    type: DT_INT32
+    name: "linear"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "iou_threshold"
-    type: DT_FLOAT
+    name: "grad"
+    type_attr: "T"
   }
-  output_arg {
-    name: "selected_indices"
-    type: DT_INT32
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-}
-op {
-  name: "NotEqual"
   input_arg {
-    name: "x"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "l2"
     type_attr: "T"
   }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
-        type: DT_INT8
+        type: DT_UINT16
         type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_QUINT8
+        type: DT_COMPLEX128
         type: DT_QINT8
+        type: DT_QUINT8
         type: DT_QINT32
-        type: DT_STRING
-        type: DT_BOOL
-        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-  is_commutative: true
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "NthElement"
+  name: "ResourceApplyFtrlV2"
   input_arg {
-    name: "input"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "n"
-    type: DT_INT32
+    name: "lr"
+    type_attr: "T"
   }
-  output_arg {
-    name: "values"
+  input_arg {
+    name: "l1"
     type_attr: "T"
   }
-  attr {
-    name: "reverse"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -20316,112 +41117,89 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
         type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "OneHot"
-  input_arg {
-    name: "indices"
-    type_attr: "TI"
-  }
+  name: "ResourceApplyGradientDescent"
   input_arg {
-    name: "depth"
-    type: DT_INT32
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "on_value"
+    name: "alpha"
     type_attr: "T"
   }
   input_arg {
-    name: "off_value"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
+    name: "delta"
     type_attr: "T"
   }
-  attr {
-    name: "axis"
-    type: "int"
-    default_value {
-      i: -1
-    }
-  }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "TI"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
-}
-op {
-  name: "OneShotIterator"
-  output_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "dataset_factory"
-    type: "func"
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   is_stateful: true
 }
 op {
-  name: "OnesLike"
+  name: "ResourceApplyGradientDescent"
   input_arg {
-    name: "x"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "alpha"
     type_attr: "T"
   }
-  output_arg {
-    name: "y"
+  input_arg {
+    name: "delta"
     type_attr: "T"
   }
   attr {
@@ -20431,1484 +41209,1566 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-}
-op {
-  name: "OrderedMapClear"
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   is_stateful: true
 }
 op {
-  name: "OrderedMapIncompleteSize"
-  output_arg {
-    name: "size"
-    type: DT_INT32
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  name: "ResourceApplyGradientDescent"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
+  input_arg {
+    name: "delta"
+    type_attr: "T"
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   is_stateful: true
 }
 op {
-  name: "OrderedMapPeek"
+  name: "ResourceApplyGradientDescent"
   input_arg {
-    name: "key"
-    type: DT_INT64
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "alpha"
+    type_attr: "T"
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "delta"
+    type_attr: "T"
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   is_stateful: true
 }
 op {
-  name: "OrderedMapSize"
-  output_arg {
-    name: "size"
-    type: DT_INT32
+  name: "ResourceApplyMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "use_nesterov"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   is_stateful: true
 }
 op {
-  name: "OrderedMapStage"
+  name: "ResourceApplyMomentum"
   input_arg {
-    name: "key"
-    type: DT_INT64
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "values"
-    type_list_attr: "fake_dtypes"
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
   }
   attr {
-    name: "fake_dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "use_nesterov"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   is_stateful: true
 }
 op {
-  name: "OrderedMapUnstage"
+  name: "ResourceApplyMomentum"
   input_arg {
-    name: "key"
-    type: DT_INT64
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "accum"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "use_nesterov"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   is_stateful: true
 }
 op {
-  name: "OrderedMapUnstageNoKey"
+  name: "ResourceApplyMomentum"
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "var"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "key"
-    type: DT_INT64
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "use_nesterov"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   is_stateful: true
 }
 op {
-  name: "Pack"
+  name: "ResourceApplyPowerSign"
   input_arg {
-    name: "values"
-    type_attr: "T"
-    number_attr: "N"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "var"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "T"
-    type: "type"
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "axis"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "logbase"
+    type_attr: "T"
   }
-}
-op {
-  name: "Pad"
   input_arg {
-    name: "input"
+    name: "sign_decay"
     type_attr: "T"
   }
   input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
+    name: "beta"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "Tpaddings"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "PadV2"
+  name: "ResourceApplyPowerSign"
   input_arg {
-    name: "input"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
+    name: "logbase"
+    type_attr: "T"
   }
   input_arg {
-    name: "constant_values"
+    name: "sign_decay"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "beta"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
-    name: "Tpaddings"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "PaddedBatchDataset"
+  name: "ResourceApplyPowerSign"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "batch_size"
-    type: DT_INT64
+    name: "m"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "padded_shapes"
-    type: DT_INT64
-    number_attr: "N"
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "padding_values"
-    type_list_attr: "Toutput_types"
+    name: "logbase"
+    type_attr: "T"
   }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
   }
-  attr {
-    name: "Toutput_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   is_stateful: true
 }
 op {
-  name: "PaddedBatchDataset"
+  name: "ResourceApplyProximalAdagrad"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "batch_size"
-    type: DT_INT64
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "padded_shapes"
-    type: DT_INT64
-    number_attr: "N"
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "padding_values"
-    type_list_attr: "Toutput_types"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "Toutput_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "l1"
+    type_attr: "T"
   }
-}
-op {
-  name: "PaddingFIFOQueue"
-  output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
-    name: "shapes"
-    type: "list(shape)"
-    default_value {
+    name: "T"
+    type: "type"
+    allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
-    has_minimum: true
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: -1
-    }
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   is_stateful: true
 }
 op {
-  name: "PaddingFIFOQueueV2"
-  output_arg {
-    name: "handle"
+  name: "ResourceApplyProximalAdagrad"
+  input_arg {
+    name: "var"
     type: DT_RESOURCE
   }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "shapes"
-    type: "list(shape)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: -1
-    }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   is_stateful: true
 }
 op {
-  name: "ParallelConcat"
+  name: "ResourceApplyProximalAdagrad"
   input_arg {
-    name: "values"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
-    number_attr: "N"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "l1"
     type_attr: "T"
   }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
   attr {
-    name: "shape"
-    type: "shape"
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
+  is_stateful: true
 }
 op {
-  name: "ParallelDynamicStitch"
+  name: "ResourceApplyProximalAdagrad"
   input_arg {
-    name: "indices"
-    type: DT_INT32
-    number_attr: "N"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "data"
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
-    number_attr: "N"
   }
-  output_arg {
-    name: "merged"
+  input_arg {
+    name: "l1"
     type_attr: "T"
   }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
+  is_stateful: true
 }
 op {
-  name: "ParallelInterleaveDataset"
+  name: "ResourceApplyProximalGradientDescent"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "alpha"
+    type_attr: "T"
   }
   input_arg {
-    name: "cycle_length"
-    type: DT_INT64
+    name: "l1"
+    type_attr: "T"
   }
   input_arg {
-    name: "block_length"
-    type: DT_INT64
+    name: "l2"
+    type_attr: "T"
   }
   input_arg {
-    name: "sloppy"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+    name: "delta"
+    type_attr: "T"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
+  is_stateful: true
 }
 op {
-  name: "ParallelMapDataset"
+  name: "ResourceApplyProximalGradientDescent"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "alpha"
+    type_attr: "T"
   }
   input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "l1"
+    type_attr: "T"
   }
-  attr {
-    name: "f"
-    type: "func"
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "delta"
+    type_attr: "T"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   is_stateful: true
 }
 op {
-  name: "ParallelMapDataset"
+  name: "ResourceApplyProximalGradientDescent"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "alpha"
+    type_attr: "T"
   }
   input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "l1"
+    type_attr: "T"
   }
-  attr {
-    name: "f"
-    type: "func"
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "delta"
+    type_attr: "T"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
+  is_stateful: true
 }
 op {
-  name: "ParameterizedTruncatedNormal"
-  input_arg {
-    name: "shape"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "means"
-    type_attr: "dtype"
-  }
+  name: "ResourceApplyProximalGradientDescent"
   input_arg {
-    name: "stdevs"
-    type_attr: "dtype"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "minvals"
-    type_attr: "dtype"
+    name: "alpha"
+    type_attr: "T"
   }
   input_arg {
-    name: "maxvals"
-    type_attr: "dtype"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
+    name: "l1"
+    type_attr: "T"
   }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "delta"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   is_stateful: true
 }
 op {
-  name: "ParseExample"
+  name: "ResourceApplyRMSProp"
   input_arg {
-    name: "serialized"
-    type: DT_STRING
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "names"
-    type: DT_STRING
+    name: "ms"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "sparse_keys"
-    type: DT_STRING
-    number_attr: "Nsparse"
+    name: "mom"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "dense_keys"
-    type: DT_STRING
-    number_attr: "Ndense"
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "dense_defaults"
-    type_list_attr: "Tdense"
-  }
-  output_arg {
-    name: "sparse_indices"
-    type: DT_INT64
-    number_attr: "Nsparse"
-  }
-  output_arg {
-    name: "sparse_values"
-    type_list_attr: "sparse_types"
-  }
-  output_arg {
-    name: "sparse_shapes"
-    type: DT_INT64
-    number_attr: "Nsparse"
+    name: "rho"
+    type_attr: "T"
   }
-  output_arg {
-    name: "dense_values"
-    type_list_attr: "Tdense"
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
   }
-  attr {
-    name: "Nsparse"
-    type: "int"
-    has_minimum: true
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
   }
-  attr {
-    name: "Ndense"
-    type: "int"
-    has_minimum: true
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
-    name: "sparse_types"
-    type: "list(type)"
-    has_minimum: true
+    name: "T"
+    type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
-        type: DT_STRING
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "Tdense"
-    type: "list(type)"
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
-  attr {
-    name: "dense_shapes"
-    type: "list(shape)"
-    has_minimum: true
-  }
+  is_stateful: true
 }
 op {
-  name: "ParseSingleSequenceExample"
+  name: "ResourceApplyRMSProp"
   input_arg {
-    name: "serialized"
-    type: DT_STRING
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "feature_list_dense_missing_assumed_empty"
-    type: DT_STRING
+    name: "ms"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "context_sparse_keys"
-    type: DT_STRING
-    number_attr: "Ncontext_sparse"
+    name: "mom"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "context_dense_keys"
-    type: DT_STRING
-    number_attr: "Ncontext_dense"
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "feature_list_sparse_keys"
-    type: DT_STRING
-    number_attr: "Nfeature_list_sparse"
+    name: "rho"
+    type_attr: "T"
   }
   input_arg {
-    name: "feature_list_dense_keys"
-    type: DT_STRING
-    number_attr: "Nfeature_list_dense"
+    name: "momentum"
+    type_attr: "T"
   }
   input_arg {
-    name: "context_dense_defaults"
-    type_list_attr: "Tcontext_dense"
+    name: "epsilon"
+    type_attr: "T"
   }
   input_arg {
-    name: "debug_name"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "context_sparse_indices"
-    type: DT_INT64
-    number_attr: "Ncontext_sparse"
-  }
-  output_arg {
-    name: "context_sparse_values"
-    type_list_attr: "context_sparse_types"
-  }
-  output_arg {
-    name: "context_sparse_shapes"
-    type: DT_INT64
-    number_attr: "Ncontext_sparse"
-  }
-  output_arg {
-    name: "context_dense_values"
-    type_list_attr: "Tcontext_dense"
-  }
-  output_arg {
-    name: "feature_list_sparse_indices"
-    type: DT_INT64
-    number_attr: "Nfeature_list_sparse"
-  }
-  output_arg {
-    name: "feature_list_sparse_values"
-    type_list_attr: "feature_list_sparse_types"
-  }
-  output_arg {
-    name: "feature_list_sparse_shapes"
-    type: DT_INT64
-    number_attr: "Nfeature_list_sparse"
-  }
-  output_arg {
-    name: "feature_list_dense_values"
-    type_list_attr: "feature_list_dense_types"
-  }
-  attr {
-    name: "Ncontext_sparse"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "Ncontext_dense"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "Nfeature_list_sparse"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "Nfeature_list_dense"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "grad"
+    type_attr: "T"
   }
   attr {
-    name: "context_sparse_types"
-    type: "list(type)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
+    name: "T"
+    type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
-        type: DT_STRING
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "Tcontext_dense"
-    type: "list(type)"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      list {
-      }
-    }
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
+      b: false
     }
   }
-  attr {
-    name: "feature_list_dense_types"
-    type: "list(type)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyRMSProp"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "context_dense_shapes"
-    type: "list(shape)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
-    name: "feature_list_sparse_types"
-    type: "list(type)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
+    name: "T"
+    type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
-        type: DT_STRING
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "feature_list_dense_shapes"
-    type: "list(shape)"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      list {
-      }
+      b: false
     }
-    has_minimum: true
   }
+  is_stateful: true
 }
 op {
-  name: "ParseTensor"
+  name: "ResourceApplyRMSProp"
   input_arg {
-    name: "serialized"
-    type: DT_STRING
+    name: "var"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "output"
-    type_attr: "out_type"
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "out_type"
-    type: "type"
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
   }
-}
-op {
-  name: "Placeholder"
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "dtype"
-    type: "type"
+  input_arg {
+    name: "rho"
+    type_attr: "T"
   }
-  attr {
-    name: "shape"
-    type: "shape"
-    default_value {
-      shape {
-      }
-    }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
   }
-}
-op {
-  name: "Placeholder"
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
   }
-  attr {
-    name: "dtype"
-    type: "type"
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
-    name: "shape"
-    type: "shape"
-    default_value {
-      shape {
-        unknown_rank: true
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-}
-op {
-  name: "PlaceholderV2"
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
-  }
-  attr {
-    name: "dtype"
-    type: "type"
-  }
-  attr {
-    name: "shape"
-    type: "shape"
-  }
-}
-op {
-  name: "PlaceholderV2"
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
-  }
-  attr {
-    name: "dtype"
-    type: "type"
-  }
   attr {
-    name: "shape"
-    type: "shape"
-  }
-  deprecation {
-    version: 23
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
+  is_stateful: true
 }
 op {
-  name: "PlaceholderWithDefault"
-  input_arg {
-    name: "input"
-    type_attr: "dtype"
+  name: "ResourceCountUpTo"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
   }
   output_arg {
     name: "output"
-    type_attr: "dtype"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
-    type: "type"
+    name: "limit"
+    type: "int"
   }
   attr {
-    name: "shape"
-    type: "shape"
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
+  is_stateful: true
 }
 op {
-  name: "Polygamma"
+  name: "ResourceGather"
   input_arg {
-    name: "a"
-    type_attr: "T"
+    name: "resource"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   output_arg {
-    name: "z"
-    type_attr: "T"
+    name: "output"
+    type_attr: "dtype"
   }
   attr {
-    name: "T"
+    name: "validate_indices"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  is_stateful: true
 }
 op {
-  name: "PopulationCount"
+  name: "ResourceScatterAdd"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "resource"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "y"
-    type: DT_UINT8
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
         type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
-}
-op {
-  name: "PopulationCount"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type: DT_UINT8
-  }
   attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Pow"
+  name: "ResourceScatterAdd"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "resource"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
-  output_arg {
-    name: "z"
-    type_attr: "T"
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-}
-op {
-  name: "PrefetchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "buffer_size"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
   is_stateful: true
 }
 op {
-  name: "PrefetchDataset"
+  name: "ResourceScatterAdd"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "resource"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "buffer_size"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "indices"
+    type_attr: "Tindices"
   }
-}
-op {
-  name: "PreventGradient"
   input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "updates"
+    type_attr: "dtype"
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
   attr {
-    name: "message"
-    type: "string"
-    default_value {
-      s: ""
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Print"
+  name: "ResourceScatterAdd"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "resource"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "data"
-    type_list_attr: "U"
+    name: "indices"
+    type_attr: "Tindices"
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
-  }
-  attr {
-    name: "U"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "message"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "first_n"
-    type: "int"
-    default_value {
-      i: -1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
     }
   }
   attr {
-    name: "summarize"
-    type: "int"
-    default_value {
-      i: 3
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   is_stateful: true
 }
 op {
-  name: "Print"
+  name: "ResourceScatterNdUpdate"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "ref"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "data"
-    type_list_attr: "U"
+    name: "indices"
+    type_attr: "Tindices"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "updates"
     type_attr: "T"
   }
   attr {
@@ -21916,143 +42776,90 @@ op {
     type: "type"
   }
   attr {
-    name: "U"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "message"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "first_n"
-    type: "int"
-    default_value {
-      i: -1
-    }
-  }
-  attr {
-    name: "summarize"
-    type: "int"
-    default_value {
-      i: 3
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "PriorityQueue"
-  output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    default_value {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
       list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
-    has_minimum: true
-  }
-  attr {
-    name: "shapes"
-    type: "list(shape)"
-    has_minimum: true
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: -1
-    }
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      s: ""
+      b: true
     }
   }
   is_stateful: true
 }
 op {
-  name: "PriorityQueueV2"
-  output_arg {
-    name: "handle"
+  name: "ResourceScatterUpdate"
+  input_arg {
+    name: "resource"
     type: DT_RESOURCE
   }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "shapes"
-    type: "list(shape)"
-    has_minimum: true
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: -1
-    }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   is_stateful: true
 }
 op {
-  name: "Prod"
+  name: "ResourceScatterUpdate"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "resource"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
@@ -22070,15 +42877,15 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "Tidx"
+    name: "Tindices"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
         type: DT_INT32
@@ -22086,46 +42893,41 @@ op {
       }
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Prod"
+  name: "ResourceScatterUpdate"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "resource"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -22133,11 +42935,8 @@ op {
     }
   }
   attr {
-    name: "Tidx"
+    name: "Tindices"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
         type: DT_INT32
@@ -22145,192 +42944,146 @@ op {
       }
     }
   }
+  is_stateful: true
 }
 op {
-  name: "PyFunc"
+  name: "ResourceScatterUpdate"
   input_arg {
-    name: "input"
-    type_list_attr: "Tin"
+    name: "resource"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "output"
-    type_list_attr: "Tout"
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
-  attr {
-    name: "token"
-    type: "string"
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
   }
   attr {
-    name: "Tin"
-    type: "list(type)"
-    has_minimum: true
+    name: "dtype"
+    type: "type"
   }
   attr {
-    name: "Tout"
-    type: "list(type)"
-    has_minimum: true
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
   is_stateful: true
 }
 op {
-  name: "PyFuncStateless"
+  name: "ResourceSparseApplyAdadelta"
   input_arg {
-    name: "input"
-    type_list_attr: "Tin"
-  }
-  output_arg {
-    name: "output"
-    type_list_attr: "Tout"
+    name: "var"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "token"
-    type: "string"
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "Tin"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "Tout"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-}
-op {
-  name: "Qr"
   input_arg {
-    name: "input"
+    name: "rho"
     type_attr: "T"
   }
-  output_arg {
-    name: "q"
+  input_arg {
+    name: "epsilon"
     type_attr: "T"
   }
-  output_arg {
-    name: "r"
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
-  attr {
-    name: "full_matrices"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
-}
-op {
-  name: "QuantizeAndDequantize"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
   attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "range_given"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
-  attr {
-    name: "input_min"
-    type: "float"
-    default_value {
-      f: 0
-    }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdadelta"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "input_max"
-    type: "float"
-    default_value {
-      f: 0
-    }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
   }
-}
-op {
-  name: "QuantizeAndDequantize"
   input_arg {
-    name: "input"
+    name: "lr"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "rho"
     type_attr: "T"
   }
-  attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
-    }
-  }
-  attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
   }
-  attr {
-    name: "input_min"
-    type: "float"
-    default_value {
-      f: 0
-    }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-  attr {
-    name: "input_max"
-    type: "float"
-    default_value {
-      f: 0
-    }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
     name: "T"
@@ -22339,110 +43092,75 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  deprecation {
-    version: 21
-  }
-}
-op {
-  name: "QuantizeAndDequantize"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
   attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "range_given"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
-  attr {
-    name: "input_min"
-    type: "float"
-    default_value {
-      f: 0
-    }
-  }
-  attr {
-    name: "input_max"
-    type: "float"
-    default_value {
-      f: 0
-    }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdadelta"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
   }
-  deprecation {
-    version: 22
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
   }
-}
-op {
-  name: "QuantizeAndDequantizeV2"
   input_arg {
-    name: "input"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "input_min"
+    name: "rho"
     type_attr: "T"
   }
   input_arg {
-    name: "input_max"
+    name: "epsilon"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
-  attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
-    }
-  }
-  attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
     name: "T"
@@ -22451,1288 +43169,1353 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-}
-op {
-  name: "QuantizeAndDequantizeV3"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "input_min"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "input_max"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "num_bits"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "range_given"
+    name: "use_locking"
     type: "bool"
     default_value {
-      b: true
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
+      b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "QuantizeDownAndShrinkRange"
+  name: "ResourceSparseApplyAdadelta"
   input_arg {
-    name: "input"
-    type_attr: "Tinput"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "input_min"
-    type: DT_FLOAT
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "input_max"
-    type: DT_FLOAT
+    name: "accum_update"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "output"
-    type_attr: "out_type"
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output_min"
-    type: DT_FLOAT
+  input_arg {
+    name: "rho"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output_max"
-    type: DT_FLOAT
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
-    name: "Tinput"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "out_type"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "QuantizeV2"
+  name: "ResourceSparseApplyAdagrad"
   input_arg {
-    name: "input"
-    type: DT_FLOAT
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "min_range"
-    type: DT_FLOAT
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "max_range"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "output"
+    name: "lr"
     type_attr: "T"
   }
-  output_arg {
-    name: "output_min"
-    type: DT_FLOAT
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output_max"
-    type: DT_FLOAT
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
         type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "mode"
-    type: "string"
-    default_value {
-      s: "MIN_COMBINED"
-    }
+    name: "Tindices"
+    type: "type"
     allowed_values {
       list {
-        s: "MIN_COMBINED"
-        s: "MIN_FIRST"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "QuantizeV2"
+  name: "ResourceSparseApplyAdagrad"
   input_arg {
-    name: "input"
-    type: DT_FLOAT
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "min_range"
-    type: DT_FLOAT
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "max_range"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "output"
+    name: "lr"
     type_attr: "T"
   }
-  output_arg {
-    name: "output_min"
-    type: DT_FLOAT
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output_max"
-    type: DT_FLOAT
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
         type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "mode"
-    type: "string"
-    default_value {
-      s: "MIN_COMBINED"
-    }
+    name: "Tindices"
+    type: "type"
     allowed_values {
       list {
-        s: "MIN_COMBINED"
-        s: "MIN_FIRST"
-        s: "SCALED"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "QuantizeV2"
+  name: "ResourceSparseApplyAdagrad"
   input_arg {
-    name: "input"
-    type: DT_FLOAT
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "min_range"
-    type: DT_FLOAT
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "max_range"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "output"
+    name: "lr"
     type_attr: "T"
   }
-  output_arg {
-    name: "output_min"
-    type: DT_FLOAT
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output_max"
-    type: DT_FLOAT
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
         type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "mode"
-    type: "string"
-    default_value {
-      s: "MIN_COMBINED"
-    }
+    name: "Tindices"
+    type: "type"
     allowed_values {
       list {
-        s: "MIN_COMBINED"
-        s: "MIN_FIRST"
-        s: "SCALED"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "round_mode"
-    type: "string"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      s: "HALF_AWAY_FROM_ZERO"
-    }
-    allowed_values {
-      list {
-        s: "HALF_AWAY_FROM_ZERO"
-        s: "HALF_TO_EVEN"
-      }
+      b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "QuantizedAdd"
-  input_arg {
-    name: "x"
-    type_attr: "T1"
-  }
+  name: "ResourceSparseApplyAdagrad"
   input_arg {
-    name: "y"
-    type_attr: "T2"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "min_x"
-    type: DT_FLOAT
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "max_x"
-    type: DT_FLOAT
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "min_y"
-    type: DT_FLOAT
+    name: "grad"
+    type_attr: "T"
   }
   input_arg {
-    name: "max_y"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "z"
-    type_attr: "Toutput"
-  }
-  output_arg {
-    name: "min_z"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "max_z"
-    type: DT_FLOAT
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
-    name: "T1"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "T2"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "Toutput"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_QINT32
-    }
-    allowed_values {
-      list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
-      }
+      b: false
     }
   }
-  is_commutative: true
+  is_stateful: true
 }
 op {
-  name: "QuantizedAvgPool"
+  name: "ResourceSparseApplyAdagradDA"
   input_arg {
-    name: "input"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "min_input"
-    type: DT_FLOAT
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "max_input"
-    type: DT_FLOAT
+    name: "lr"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "l1"
     type_attr: "T"
   }
-  output_arg {
-    name: "min_output"
-    type: DT_FLOAT
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  output_arg {
-    name: "max_output"
-    type: DT_FLOAT
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
         type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-  }
-  attr {
-    name: "padding"
-    type: "string"
+    name: "Tindices"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "QuantizedBatchNormWithGlobalNormalization"
-  input_arg {
-    name: "t"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "t_min"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "t_max"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "m"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "m_min"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "m_max"
-    type: DT_FLOAT
-  }
+  name: "ResourceSparseApplyAdagradDA"
   input_arg {
-    name: "v"
-    type_attr: "Tinput"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "v_min"
-    type: DT_FLOAT
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "v_max"
-    type: DT_FLOAT
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "beta"
-    type_attr: "Tinput"
+    name: "grad"
+    type_attr: "T"
   }
   input_arg {
-    name: "beta_min"
-    type: DT_FLOAT
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "beta_max"
-    type: DT_FLOAT
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "gamma"
-    type_attr: "Tinput"
+    name: "l1"
+    type_attr: "T"
   }
   input_arg {
-    name: "gamma_min"
-    type: DT_FLOAT
+    name: "l2"
+    type_attr: "T"
   }
   input_arg {
-    name: "gamma_max"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "result"
-    type_attr: "out_type"
-  }
-  output_arg {
-    name: "result_min"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "result_max"
-    type: DT_FLOAT
+    name: "global_step"
+    type: DT_INT64
   }
   attr {
-    name: "Tinput"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
         type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "out_type"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "variance_epsilon"
-    type: "float"
-  }
-  attr {
-    name: "scale_after_normalization"
+    name: "use_locking"
     type: "bool"
+    default_value {
+      b: false
+    }
   }
+  is_stateful: true
 }
 op {
-  name: "QuantizedBiasAdd"
+  name: "ResourceSparseApplyAdagradDA"
   input_arg {
-    name: "input"
-    type_attr: "T1"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "bias"
-    type_attr: "T2"
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "min_input"
-    type: DT_FLOAT
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "max_input"
-    type: DT_FLOAT
+    name: "grad"
+    type_attr: "T"
   }
   input_arg {
-    name: "min_bias"
-    type: DT_FLOAT
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "max_bias"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "output"
-    type_attr: "out_type"
+    name: "lr"
+    type_attr: "T"
   }
-  output_arg {
-    name: "min_out"
-    type: DT_FLOAT
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-  output_arg {
-    name: "max_out"
-    type: DT_FLOAT
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  attr {
-    name: "T1"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
-      }
-    }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
   }
   attr {
-    name: "T2"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
         type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "out_type"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "QuantizedConcat"
+  name: "ResourceSparseApplyAdagradDA"
   input_arg {
-    name: "concat_dim"
-    type: DT_INT32
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "values"
-    type_attr: "T"
-    number_attr: "N"
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "input_mins"
-    type: DT_FLOAT
-    number_attr: "N"
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "input_maxes"
-    type: DT_FLOAT
-    number_attr: "N"
-  }
-  output_arg {
-    name: "output"
+    name: "grad"
     type_attr: "T"
   }
-  output_arg {
-    name: "output_min"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "output_max"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 2
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-}
-op {
-  name: "QuantizedConv2D"
-  input_arg {
-    name: "input"
-    type_attr: "Tinput"
-  }
   input_arg {
-    name: "filter"
-    type_attr: "Tfilter"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "min_input"
-    type: DT_FLOAT
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "max_input"
-    type: DT_FLOAT
+    name: "l1"
+    type_attr: "T"
   }
   input_arg {
-    name: "min_filter"
-    type: DT_FLOAT
+    name: "l2"
+    type_attr: "T"
   }
   input_arg {
-    name: "max_filter"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "output"
-    type_attr: "out_type"
-  }
-  output_arg {
-    name: "min_output"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "max_output"
-    type: DT_FLOAT
+    name: "global_step"
+    type: DT_INT64
   }
   attr {
-    name: "Tinput"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "Tfilter"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "out_type"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_QINT32
-    }
-    allowed_values {
-      list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
-      }
-    }
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+      b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "QuantizedInstanceNorm"
+  name: "ResourceSparseApplyCenteredRMSProp"
   input_arg {
-    name: "x"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mg"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "x_min"
-    type: DT_FLOAT
+    name: "rho"
+    type_attr: "T"
   }
   input_arg {
-    name: "x_max"
-    type: DT_FLOAT
+    name: "momentum"
+    type_attr: "T"
   }
-  output_arg {
-    name: "y"
+  input_arg {
+    name: "epsilon"
     type_attr: "T"
   }
-  output_arg {
-    name: "y_min"
-    type: DT_FLOAT
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-  output_arg {
-    name: "y_max"
-    type: DT_FLOAT
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
         type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "output_range_given"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "given_y_min"
-    type: "float"
-    default_value {
-      f: 0
-    }
-  }
-  attr {
-    name: "given_y_max"
-    type: "float"
-    default_value {
-      f: 0
-    }
-  }
-  attr {
-    name: "variance_epsilon"
-    type: "float"
-    default_value {
-      f: 1e-05
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "min_separation"
-    type: "float"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      f: 0.001
+      b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "QuantizedMatMul"
+  name: "ResourceSparseApplyCenteredRMSProp"
   input_arg {
-    name: "a"
-    type_attr: "T1"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "b"
-    type_attr: "T2"
+    name: "mg"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "min_a"
-    type: DT_FLOAT
+    name: "ms"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "max_a"
-    type: DT_FLOAT
+    name: "mom"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "min_b"
-    type: DT_FLOAT
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "max_b"
-    type: DT_FLOAT
+    name: "rho"
+    type_attr: "T"
   }
-  output_arg {
-    name: "out"
-    type_attr: "Toutput"
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
   }
-  output_arg {
-    name: "min_out"
-    type: DT_FLOAT
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
   }
-  output_arg {
-    name: "max_out"
-    type: DT_FLOAT
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-  attr {
-    name: "T1"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
-      }
-    }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
-    name: "T2"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
         type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "Toutput"
+    name: "Tindices"
     type: "type"
-    default_value {
-      type: DT_QINT32
-    }
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "transpose_a"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "transpose_b"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
-  attr {
-    name: "Tactivation"
-    type: "type"
-    default_value {
-      type: DT_QUINT8
-    }
-    allowed_values {
-      list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
-      }
-    }
-  }
+  is_stateful: true
 }
 op {
-  name: "QuantizedMaxPool"
+  name: "ResourceSparseApplyCenteredRMSProp"
   input_arg {
-    name: "input"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mg"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "min_input"
-    type: DT_FLOAT
+    name: "rho"
+    type_attr: "T"
   }
   input_arg {
-    name: "max_input"
-    type: DT_FLOAT
+    name: "momentum"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "epsilon"
     type_attr: "T"
   }
-  output_arg {
-    name: "min_output"
-    type: DT_FLOAT
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-  output_arg {
-    name: "max_output"
-    type: DT_FLOAT
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
         type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-  }
-  attr {
-    name: "padding"
-    type: "string"
+    name: "Tindices"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "QuantizedMul"
+  name: "ResourceSparseApplyCenteredRMSProp"
   input_arg {
-    name: "x"
-    type_attr: "T1"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "y"
-    type_attr: "T2"
+    name: "mg"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "min_x"
-    type: DT_FLOAT
+    name: "ms"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "max_x"
-    type: DT_FLOAT
+    name: "mom"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "min_y"
-    type: DT_FLOAT
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "max_y"
-    type: DT_FLOAT
+    name: "rho"
+    type_attr: "T"
   }
-  output_arg {
-    name: "z"
-    type_attr: "Toutput"
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
   }
-  output_arg {
-    name: "min_z"
-    type: DT_FLOAT
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
   }
-  output_arg {
-    name: "max_z"
-    type: DT_FLOAT
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
-    name: "T1"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "T2"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "Toutput"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_QINT32
-    }
-    allowed_values {
-      list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
-      }
+      b: false
     }
   }
-  is_commutative: true
+  is_stateful: true
 }
 op {
-  name: "QuantizedRelu"
+  name: "ResourceSparseApplyFtrl"
   input_arg {
-    name: "features"
-    type_attr: "Tinput"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "min_features"
-    type: DT_FLOAT
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "max_features"
-    type: DT_FLOAT
+    name: "linear"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "activations"
-    type_attr: "out_type"
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-  output_arg {
-    name: "min_activations"
-    type: DT_FLOAT
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
-  output_arg {
-    name: "max_activations"
-    type: DT_FLOAT
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
   }
   attr {
-    name: "Tinput"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
         type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "out_type"
+    name: "Tindices"
     type: "type"
-    default_value {
-      type: DT_QUINT8
-    }
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "QuantizedRelu6"
+  name: "ResourceSparseApplyFtrl"
   input_arg {
-    name: "features"
-    type_attr: "Tinput"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "min_features"
-    type: DT_FLOAT
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "max_features"
-    type: DT_FLOAT
+    name: "linear"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "activations"
-    type_attr: "out_type"
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-  output_arg {
-    name: "min_activations"
-    type: DT_FLOAT
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
-  output_arg {
-    name: "max_activations"
-    type: DT_FLOAT
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
   }
   attr {
-    name: "Tinput"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
         type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "out_type"
+    name: "Tindices"
     type: "type"
-    default_value {
-      type: DT_QUINT8
-    }
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "QuantizedReluX"
+  name: "ResourceSparseApplyFtrl"
   input_arg {
-    name: "features"
-    type_attr: "Tinput"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "max_value"
-    type: DT_FLOAT
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "min_features"
-    type: DT_FLOAT
+    name: "linear"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "max_features"
-    type: DT_FLOAT
+    name: "grad"
+    type_attr: "T"
   }
-  output_arg {
-    name: "activations"
-    type_attr: "out_type"
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
-  output_arg {
-    name: "min_activations"
-    type: DT_FLOAT
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  output_arg {
-    name: "max_activations"
-    type: DT_FLOAT
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
   }
   attr {
-    name: "Tinput"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
         type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "out_type"
+    name: "Tindices"
     type: "type"
-    default_value {
-      type: DT_QUINT8
-    }
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "QuantizedReshape"
+  name: "ResourceSparseApplyFtrl"
   input_arg {
-    name: "tensor"
-    type_attr: "T"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "shape"
-    type_attr: "Tshape"
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "input_min"
-    type: DT_FLOAT
+    name: "linear"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "input_max"
-    type: DT_FLOAT
+    name: "grad"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
-  output_arg {
-    name: "output_min"
-    type: DT_FLOAT
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output_max"
-    type: DT_FLOAT
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "Tshape"
+    name: "Tindices"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
         type: DT_INT32
@@ -23740,943 +44523,934 @@ op {
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "QuantizedResizeBilinear"
+  name: "ResourceSparseApplyFtrlV2"
   input_arg {
-    name: "images"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "size"
-    type: DT_INT32
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "min"
-    type: DT_FLOAT
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "max"
-    type: DT_FLOAT
+    name: "l1"
+    type_attr: "T"
   }
-  output_arg {
-    name: "resized_images"
+  input_arg {
+    name: "l2"
     type_attr: "T"
   }
-  output_arg {
-    name: "out_min"
-    type: DT_FLOAT
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
   }
-  output_arg {
-    name: "out_max"
-    type: DT_FLOAT
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_FLOAT
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "align_corners"
-    type: "bool"
-    default_value {
-      b: false
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
-}
-op {
-  name: "QueueClose"
-  input_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
-  }
   attr {
-    name: "cancel_pending_enqueues"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "QueueCloseV2"
+  name: "ResourceSparseApplyFtrlV2"
   input_arg {
-    name: "handle"
+    name: "var"
     type: DT_RESOURCE
   }
-  attr {
-    name: "cancel_pending_enqueues"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
   }
-  is_stateful: true
-}
-op {
-  name: "QueueDequeue"
   input_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+    name: "linear"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "components"
-    type_list_attr: "component_types"
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
-  attr {
-    name: "timeout_ms"
-    type: "int"
-    default_value {
-      i: -1
-    }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-}
-op {
-  name: "QueueDequeueMany"
   input_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+    name: "l1"
+    type_attr: "T"
   }
   input_arg {
-    name: "n"
-    type: DT_INT32
+    name: "l2"
+    type_attr: "T"
   }
-  output_arg {
-    name: "components"
-    type_list_attr: "component_types"
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
   attr {
-    name: "timeout_ms"
-    type: "int"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      i: -1
+      b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "QueueDequeueManyV2"
+  name: "ResourceSparseApplyFtrlV2"
   input_arg {
-    name: "handle"
+    name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "n"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "components"
-    type_list_attr: "component_types"
-  }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "accum"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "timeout_ms"
-    type: "int"
-    default_value {
-      i: -1
-    }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
   }
-  is_stateful: true
-}
-op {
-  name: "QueueDequeueUpTo"
   input_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+    name: "grad"
+    type_attr: "T"
   }
   input_arg {
-    name: "n"
-    type: DT_INT32
+    name: "indices"
+    type_attr: "Tindices"
   }
-  output_arg {
-    name: "components"
-    type_list_attr: "component_types"
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-  attr {
-    name: "timeout_ms"
-    type: "int"
-    default_value {
-      i: -1
-    }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-}
-op {
-  name: "QueueDequeueUpToV2"
   input_arg {
-    name: "handle"
-    type: DT_RESOURCE
+    name: "l2_shrinkage"
+    type_attr: "T"
   }
   input_arg {
-    name: "n"
-    type: DT_INT32
+    name: "lr_power"
+    type_attr: "T"
   }
-  output_arg {
-    name: "components"
-    type_list_attr: "component_types"
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
   attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
   attr {
-    name: "timeout_ms"
-    type: "int"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      i: -1
+      b: false
     }
   }
   is_stateful: true
 }
 op {
-  name: "QueueDequeueV2"
+  name: "ResourceSparseApplyFtrlV2"
   input_arg {
-    name: "handle"
+    name: "var"
     type: DT_RESOURCE
   }
-  output_arg {
-    name: "components"
-    type_list_attr: "component_types"
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "timeout_ms"
-    type: "int"
-    default_value {
-      i: -1
-    }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "QueueEnqueue"
   input_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "components"
-    type_list_attr: "Tcomponents"
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "Tcomponents"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-  attr {
-    name: "timeout_ms"
-    type: "int"
-    default_value {
-      i: -1
-    }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-}
-op {
-  name: "QueueEnqueueMany"
   input_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+    name: "l2_shrinkage"
+    type_attr: "T"
   }
   input_arg {
-    name: "components"
-    type_list_attr: "Tcomponents"
+    name: "lr_power"
+    type_attr: "T"
   }
   attr {
-    name: "Tcomponents"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "timeout_ms"
-    type: "int"
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
     default_value {
-      i: -1
+      b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "QueueEnqueueManyV2"
+  name: "ResourceSparseApplyMomentum"
   input_arg {
-    name: "handle"
+    name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "components"
-    type_list_attr: "Tcomponents"
+    name: "accum"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "Tcomponents"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "timeout_ms"
-    type: "int"
-    default_value {
-      i: -1
-    }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "QueueEnqueueV2"
   input_arg {
-    name: "handle"
-    type: DT_RESOURCE
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "components"
-    type_list_attr: "Tcomponents"
+    name: "momentum"
+    type_attr: "T"
   }
   attr {
-    name: "Tcomponents"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
   }
   attr {
-    name: "timeout_ms"
-    type: "int"
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
     default_value {
-      i: -1
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   is_stateful: true
 }
 op {
-  name: "QueueIsClosed"
+  name: "ResourceSparseApplyMomentum"
   input_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  output_arg {
-    name: "is_closed"
-    type: DT_BOOL
+    name: "var"
+    type: DT_RESOURCE
   }
-}
-op {
-  name: "QueueIsClosedV2"
   input_arg {
-    name: "handle"
+    name: "accum"
     type: DT_RESOURCE
   }
-  output_arg {
-    name: "is_closed"
-    type: DT_BOOL
-  }
-  is_stateful: true
-}
-op {
-  name: "QueueSize"
   input_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  output_arg {
-    name: "size"
-    type: DT_INT32
+    name: "lr"
+    type_attr: "T"
   }
-}
-op {
-  name: "QueueSizeV2"
   input_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "size"
-    type: DT_INT32
+    name: "grad"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "RFFT"
   input_arg {
-    name: "input"
-    type: DT_FLOAT
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "fft_length"
-    type: DT_INT32
+    name: "momentum"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
-}
-op {
-  name: "RFFT2D"
-  input_arg {
-    name: "input"
-    type: DT_FLOAT
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
-  input_arg {
-    name: "fft_length"
-    type: DT_INT32
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
+  is_stateful: true
 }
 op {
-  name: "RFFT3D"
+  name: "ResourceSparseApplyMomentum"
   input_arg {
-    name: "input"
-    type: DT_FLOAT
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "fft_length"
-    type: DT_INT32
+    name: "accum"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-}
-op {
-  name: "RGBToHSV"
   input_arg {
-    name: "images"
+    name: "grad"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "momentum"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-      }
-    }
-  }
-}
-op {
-  name: "RandomCrop"
-  input_arg {
-    name: "image"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "size"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
   attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "seed"
-    type: "int"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "use_nesterov"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
   }
-  deprecation {
-    version: 8
-  }
   is_stateful: true
 }
 op {
-  name: "RandomGamma"
+  name: "ResourceSparseApplyMomentum"
   input_arg {
-    name: "shape"
-    type_attr: "S"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "alpha"
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
   }
   attr {
-    name: "S"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
         type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
-  is_stateful: true
-}
-op {
-  name: "RandomPoisson"
-  input_arg {
-    name: "shape"
-    type_attr: "S"
-  }
-  input_arg {
-    name: "rate"
-    type_attr: "dtype"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
-  }
   attr {
-    name: "seed"
-    type: "int"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "use_nesterov"
+    type: "bool"
     default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "S"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "dtype"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
+      b: false
     }
   }
   is_stateful: true
 }
 op {
-  name: "RandomPoisson"
+  name: "ResourceSparseApplyProximalAdagrad"
   input_arg {
-    name: "shape"
-    type_attr: "S"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "rate"
-    type_attr: "dtype"
+    name: "accum"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
-    name: "S"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "dtype"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
-  deprecation {
-    version: 25
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   is_stateful: true
 }
 op {
-  name: "RandomPoissonV2"
+  name: "ResourceSparseApplyProximalAdagrad"
   input_arg {
-    name: "shape"
-    type_attr: "S"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "rate"
-    type_attr: "R"
+    name: "accum"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
-    name: "S"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "R"
+    name: "Tindices"
     type: "type"
-    default_value {
-      type: DT_DOUBLE
-    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
       }
     }
   }
   attr {
-    name: "dtype"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_INT64
-    }
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-      }
+      b: false
     }
   }
   is_stateful: true
 }
 op {
-  name: "RandomShuffle"
+  name: "ResourceSparseApplyProximalAdagrad"
   input_arg {
-    name: "value"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "l1"
     type_attr: "T"
   }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  is_stateful: true
-}
-op {
-  name: "RandomShuffleQueue"
-  output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "shapes"
-    type: "list(shape)"
-    default_value {
+    allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
-    has_minimum: true
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: -1
-    }
-  }
-  attr {
-    name: "min_after_dequeue"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   is_stateful: true
 }
 op {
-  name: "RandomShuffleQueueV2"
-  output_arg {
-    name: "handle"
+  name: "ResourceSparseApplyProximalAdagrad"
+  input_arg {
+    name: "var"
     type: DT_RESOURCE
   }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "shapes"
-    type: "list(shape)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: -1
-    }
-  }
-  attr {
-    name: "min_after_dequeue"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "RandomStandardNormal"
   input_arg {
-    name: "shape"
+    name: "l1"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
@@ -24685,45 +45459,65 @@ op {
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   is_stateful: true
 }
 op {
-  name: "RandomUniform"
+  name: "ResourceSparseApplyProximalGradientDescent"
   input_arg {
-    name: "shape"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "alpha"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
@@ -24732,52 +45526,67 @@ op {
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   is_stateful: true
 }
 op {
-  name: "RandomUniformInt"
+  name: "ResourceSparseApplyProximalGradientDescent"
   input_arg {
-    name: "shape"
-    type_attr: "T"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "minval"
-    type_attr: "Tout"
+    name: "alpha"
+    type_attr: "T"
   }
   input_arg {
-    name: "maxval"
-    type_attr: "Tout"
+    name: "l1"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output"
-    type_attr: "Tout"
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
-    name: "Tout"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
@@ -24786,877 +45595,1067 @@ op {
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   is_stateful: true
 }
 op {
-  name: "Range"
+  name: "ResourceSparseApplyProximalGradientDescent"
   input_arg {
-    name: "start"
-    type_attr: "Tidx"
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "limit"
-    type_attr: "Tidx"
+    name: "alpha"
+    type_attr: "T"
   }
   input_arg {
-    name: "delta"
-    type_attr: "Tidx"
+    name: "l1"
+    type_attr: "T"
   }
-  output_arg {
-    name: "output"
-    type_attr: "Tidx"
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
-    name: "Tidx"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
         type: DT_INT32
         type: DT_INT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "RangeDataset"
+  name: "ResourceSparseApplyProximalGradientDescent"
   input_arg {
-    name: "start"
-    type: DT_INT64
+    name: "var"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "stop"
-    type: DT_INT64
+    name: "alpha"
+    type_attr: "T"
   }
   input_arg {
-    name: "step"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "l1"
+    type_attr: "T"
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "Rank"
   input_arg {
-    name: "input"
+    name: "grad"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
-    type: DT_INT32
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
     name: "T"
     type: "type"
-  }
-}
-op {
-  name: "ReadFile"
-  input_arg {
-    name: "filename"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "contents"
-    type: DT_STRING
-  }
-}
-op {
-  name: "ReadVariableOp"
-  input_arg {
-    name: "resource"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "value"
-    type_attr: "dtype"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "dtype"
+    name: "Tindices"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
-  is_stateful: true
-}
-op {
-  name: "ReaderNumRecordsProduced"
-  input_arg {
-    name: "reader_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  output_arg {
-    name: "records_produced"
-    type: DT_INT64
-  }
-}
-op {
-  name: "ReaderNumRecordsProducedV2"
-  input_arg {
-    name: "reader_handle"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "records_produced"
-    type: DT_INT64
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   is_stateful: true
 }
 op {
-  name: "ReaderNumWorkUnitsCompleted"
-  input_arg {
-    name: "reader_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  output_arg {
-    name: "units_completed"
-    type: DT_INT64
-  }
-}
-op {
-  name: "ReaderNumWorkUnitsCompletedV2"
+  name: "ResourceSparseApplyRMSProp"
   input_arg {
-    name: "reader_handle"
+    name: "var"
     type: DT_RESOURCE
   }
-  output_arg {
-    name: "units_completed"
-    type: DT_INT64
-  }
-  is_stateful: true
-}
-op {
-  name: "ReaderRead"
-  input_arg {
-    name: "reader_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  input_arg {
-    name: "queue_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  output_arg {
-    name: "key"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "value"
-    type: DT_STRING
-  }
-}
-op {
-  name: "ReaderReadUpTo"
   input_arg {
-    name: "reader_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "ms"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "queue_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "mom"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "num_records"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "keys"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "values"
-    type: DT_STRING
+    name: "lr"
+    type_attr: "T"
   }
-}
-op {
-  name: "ReaderReadUpToV2"
   input_arg {
-    name: "reader_handle"
-    type: DT_RESOURCE
+    name: "rho"
+    type_attr: "T"
   }
   input_arg {
-    name: "queue_handle"
-    type: DT_RESOURCE
+    name: "momentum"
+    type_attr: "T"
   }
   input_arg {
-    name: "num_records"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "keys"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "values"
-    type: DT_STRING
+    name: "epsilon"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "ReaderReadV2"
   input_arg {
-    name: "reader_handle"
-    type: DT_RESOURCE
+    name: "grad"
+    type_attr: "T"
   }
   input_arg {
-    name: "queue_handle"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "key"
-    type: DT_STRING
+    name: "indices"
+    type_attr: "Tindices"
   }
-  output_arg {
-    name: "value"
-    type: DT_STRING
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
   }
-  is_stateful: true
-}
-op {
-  name: "ReaderReset"
-  input_arg {
-    name: "reader_handle"
-    type: DT_STRING
-    is_ref: true
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
-}
-op {
-  name: "ReaderResetV2"
-  input_arg {
-    name: "reader_handle"
-    type: DT_RESOURCE
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   is_stateful: true
 }
 op {
-  name: "ReaderRestoreState"
-  input_arg {
-    name: "reader_handle"
-    type: DT_STRING
-    is_ref: true
-  }
+  name: "ResourceSparseApplyRMSProp"
   input_arg {
-    name: "state"
-    type: DT_STRING
+    name: "var"
+    type: DT_RESOURCE
   }
-}
-op {
-  name: "ReaderRestoreStateV2"
   input_arg {
-    name: "reader_handle"
+    name: "ms"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "state"
-    type: DT_STRING
+    name: "mom"
+    type: DT_RESOURCE
   }
-  is_stateful: true
-}
-op {
-  name: "ReaderSerializeState"
   input_arg {
-    name: "reader_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "lr"
+    type_attr: "T"
   }
-  output_arg {
-    name: "state"
-    type: DT_STRING
+  input_arg {
+    name: "rho"
+    type_attr: "T"
   }
-}
-op {
-  name: "ReaderSerializeStateV2"
   input_arg {
-    name: "reader_handle"
-    type: DT_RESOURCE
+    name: "momentum"
+    type_attr: "T"
   }
-  output_arg {
-    name: "state"
-    type: DT_STRING
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "Real"
   input_arg {
-    name: "input"
+    name: "grad"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
-    type_attr: "Tout"
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_COMPLEX64
-    }
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "Tout"
+    name: "Tindices"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "RealDiv"
+  name: "ResourceSparseApplyRMSProp"
   input_arg {
-    name: "x"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "rho"
     type_attr: "T"
   }
-  output_arg {
-    name: "z"
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
-        type: DT_INT8
         type: DT_UINT16
         type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-}
-op {
-  name: "Reciprocal"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "ReciprocalGrad"
+  name: "ResourceSparseApplyRMSProp"
   input_arg {
-    name: "x"
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "rho"
     type_attr: "T"
   }
-  output_arg {
-    name: "z"
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
     type_attr: "T"
   }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
         type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "ReciprocalGrad"
+  name: "ResourceStridedSliceAssign"
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "ref"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "dy"
-    type_attr: "T"
+    name: "begin"
+    type_attr: "Index"
   }
-  output_arg {
-    name: "z"
+  input_arg {
+    name: "end"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "strides"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "value"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+  }
+  attr {
+    name: "Index"
+    type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
-}
-op {
-  name: "RecordInput"
-  output_arg {
-    name: "records"
-    type: DT_STRING
-  }
   attr {
-    name: "file_pattern"
-    type: "string"
-  }
-  attr {
-    name: "file_random_seed"
+    name: "begin_mask"
     type: "int"
     default_value {
-      i: 301
+      i: 0
     }
   }
   attr {
-    name: "file_shuffle_shift_ratio"
-    type: "float"
+    name: "end_mask"
+    type: "int"
     default_value {
-      f: 0
+      i: 0
     }
   }
   attr {
-    name: "file_buffer_size"
+    name: "ellipsis_mask"
     type: "int"
     default_value {
-      i: 10000
+      i: 0
     }
   }
   attr {
-    name: "file_parallelism"
+    name: "new_axis_mask"
     type: "int"
     default_value {
-      i: 16
+      i: 0
     }
   }
   attr {
-    name: "batch_size"
+    name: "shrink_axis_mask"
     type: "int"
     default_value {
-      i: 32
+      i: 0
     }
   }
   is_stateful: true
 }
 op {
-  name: "ReduceJoin"
+  name: "Restore"
   input_arg {
-    name: "inputs"
+    name: "file_pattern"
     type: DT_STRING
   }
   input_arg {
-    name: "reduction_indices"
-    type: DT_INT32
+    name: "tensor_name"
+    type: DT_STRING
   }
   output_arg {
-    name: "output"
-    type: DT_STRING
+    name: "tensor"
+    type_attr: "dt"
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "dt"
+    type: "type"
   }
   attr {
-    name: "separator"
-    type: "string"
+    name: "preferred_shard"
+    type: "int"
     default_value {
-      s: ""
+      i: -1
     }
   }
 }
 op {
-  name: "RefEnter"
+  name: "Restore"
   input_arg {
-    name: "data"
-    type_attr: "T"
-    is_ref: true
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
-    is_ref: true
+    name: "tensor"
+    type_attr: "dt"
   }
   attr {
-    name: "T"
+    name: "dt"
     type: "type"
   }
   attr {
-    name: "frame_name"
-    type: "string"
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RestoreSlice"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slice"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
   }
   attr {
-    name: "is_constant"
-    type: "bool"
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
     default_value {
-      b: false
+      i: -1
     }
   }
+}
+op {
+  name: "RestoreSlice"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slice"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
   attr {
-    name: "parallel_iterations"
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
     type: "int"
     default_value {
-      i: 10
+      i: -1
     }
   }
+  is_stateful: true
 }
 op {
-  name: "RefExit"
+  name: "RestoreV2"
   input_arg {
-    name: "data"
-    type_attr: "T"
-    is_ref: true
+    name: "prefix"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slices"
+    type: DT_STRING
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
-    is_ref: true
+    name: "tensors"
+    type_list_attr: "dtypes"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "RefIdentity"
+  name: "RestoreV2"
   input_arg {
-    name: "input"
-    type_attr: "T"
-    is_ref: true
+    name: "prefix"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slices"
+    type: DT_STRING
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
-    is_ref: true
+    name: "tensors"
+    type_list_attr: "dtypes"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
-  allows_uninitialized_input: true
+  is_stateful: true
 }
 op {
-  name: "RefMerge"
+  name: "Reverse"
   input_arg {
-    name: "inputs"
+    name: "tensor"
     type_attr: "T"
-    number_attr: "N"
-    is_ref: true
+  }
+  input_arg {
+    name: "dims"
+    type: DT_BOOL
   }
   output_arg {
     name: "output"
     type_attr: "T"
-    is_ref: true
-  }
-  output_arg {
-    name: "value_index"
-    type: DT_INT32
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
 }
 op {
-  name: "RefNextIteration"
+  name: "Reverse"
   input_arg {
-    name: "data"
+    name: "tensor"
     type_attr: "T"
-    is_ref: true
+  }
+  input_arg {
+    name: "dims"
+    type: DT_BOOL
   }
   output_arg {
     name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
   }
 }
 op {
-  name: "RefSelect"
+  name: "Reverse"
   input_arg {
-    name: "index"
-    type: DT_INT32
+    name: "tensor"
+    type_attr: "T"
   }
   input_arg {
-    name: "inputs"
-    type_attr: "T"
-    number_attr: "N"
-    is_ref: true
+    name: "dims"
+    type: DT_BOOL
   }
   output_arg {
     name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
   }
 }
 op {
-  name: "RefSwitch"
+  name: "ReverseSequence"
   input_arg {
-    name: "data"
+    name: "input"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "pred"
-    type: DT_BOOL
+    name: "seq_lengths"
+    type_attr: "Tlen"
   }
   output_arg {
-    name: "output_false"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
-  output_arg {
-    name: "output_true"
-    type_attr: "T"
-    is_ref: true
+  attr {
+    name: "seq_dim"
+    type: "int"
+  }
+  attr {
+    name: "batch_dim"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
   attr {
     name: "T"
     type: "type"
   }
-  allows_uninitialized_input: true
-}
-op {
-  name: "Relu"
-  input_arg {
-    name: "features"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "activations"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "Tlen"
     type: "type"
+    default_value {
+      type: DT_INT64
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "Relu"
+  name: "ReverseV2"
   input_arg {
-    name: "features"
+    name: "tensor"
     type_attr: "T"
   }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
   output_arg {
-    name: "activations"
+    name: "output"
     type_attr: "T"
   }
   attr {
-    name: "T"
+    name: "Tidx"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
         type: DT_UINT8
-        type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Relu6"
+  name: "ReverseV2"
   input_arg {
-    name: "features"
+    name: "tensor"
     type_attr: "T"
   }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
   output_arg {
-    name: "activations"
+    name: "output"
     type_attr: "T"
   }
   attr {
-    name: "T"
+    name: "Tidx"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
         type: DT_UINT8
-        type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
         type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
       }
     }
   }
 }
 op {
-  name: "Relu6"
+  name: "ReverseV2"
   input_arg {
-    name: "features"
+    name: "tensor"
     type_attr: "T"
   }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
   output_arg {
-    name: "activations"
+    name: "output"
     type_attr: "T"
   }
   attr {
-    name: "T"
+    name: "Tidx"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
         type: DT_UINT8
-        type: DT_INT16
         type: DT_INT8
         type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
       }
     }
   }
 }
 op {
-  name: "Relu6Grad"
+  name: "ReverseV2"
   input_arg {
-    name: "gradients"
+    name: "tensor"
     type_attr: "T"
   }
   input_arg {
-    name: "features"
-    type_attr: "T"
+    name: "axis"
+    type_attr: "Tidx"
   }
   output_arg {
-    name: "backprops"
+    name: "output"
     type_attr: "T"
   }
   attr {
-    name: "T"
+    name: "Tidx"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
         type: DT_UINT8
-        type: DT_INT16
         type: DT_INT8
         type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
         type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
       }
     }
   }
 }
 op {
-  name: "Relu6Grad"
+  name: "RightShift"
   input_arg {
-    name: "gradients"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "features"
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "backprops"
+    name: "z"
     type_attr: "T"
   }
   attr {
@@ -25664,33 +46663,27 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
         type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
         type: DT_UINT16
-        type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
+  is_commutative: true
 }
 op {
-  name: "ReluGrad"
-  input_arg {
-    name: "gradients"
-    type_attr: "T"
-  }
+  name: "Rint"
   input_arg {
-    name: "features"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "backprops"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -25700,29 +46693,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "ReluGrad"
-  input_arg {
-    name: "gradients"
-    type_attr: "T"
-  }
+  name: "Rint"
   input_arg {
-    name: "features"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "backprops"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -25730,311 +46712,263 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "RemoteCall"
+  name: "Roll"
   input_arg {
-    name: "target"
-    type: DT_STRING
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "args"
-    type_list_attr: "Tin"
+    name: "shift"
+    type_attr: "Tshift"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Taxis"
   }
   output_arg {
     name: "output"
-    type_list_attr: "Tout"
+    type_attr: "T"
   }
   attr {
-    name: "Tin"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "Tout"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "Tshift"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "Taxis"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
 }
 op {
-  name: "RemoteFusedGraphExecute"
+  name: "Round"
   input_arg {
-    name: "inputs"
-    type_list_attr: "Tinputs"
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "outputs"
-    type_list_attr: "Toutputs"
-  }
-  attr {
-    name: "Tinputs"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Toutputs"
-    type: "list(type)"
-    has_minimum: true
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "serialized_remote_fused_graph_execute_info"
-    type: "string"
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
 }
 op {
-  name: "RepeatDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
+  name: "Round"
   input_arg {
-    name: "count"
-    type: DT_INT64
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "RepeatDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
+  name: "Rsqrt"
   input_arg {
-    name: "count"
-    type: DT_INT64
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
 }
 op {
-  name: "RequantizationRange"
-  input_arg {
-    name: "input"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "input_min"
-    type: DT_FLOAT
-  }
+  name: "Rsqrt"
   input_arg {
-    name: "input_max"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "output_min"
-    type: DT_FLOAT
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "output_max"
-    type: DT_FLOAT
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "Tinput"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Requantize"
-  input_arg {
-    name: "input"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "input_min"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "input_max"
-    type: DT_FLOAT
-  }
+  name: "RsqrtGrad"
   input_arg {
-    name: "requested_output_min"
-    type: DT_FLOAT
+    name: "x"
+    type_attr: "T"
   }
   input_arg {
-    name: "requested_output_max"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "output"
-    type_attr: "out_type"
-  }
-  output_arg {
-    name: "output_min"
-    type: DT_FLOAT
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "output_max"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "Tinput"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
-      }
-    }
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "out_type"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Reshape"
+  name: "RsqrtGrad"
   input_arg {
-    name: "tensor"
+    name: "y"
     type_attr: "T"
   }
   input_arg {
-    name: "shape"
-    type_attr: "Tshape"
+    name: "dy"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "Tshape"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "ResizeArea"
+  name: "RsqrtGrad"
   input_arg {
-    name: "images"
+    name: "y"
     type_attr: "T"
   }
   input_arg {
-    name: "size"
-    type: DT_INT32
+    name: "dy"
+    type_attr: "T"
   }
   output_arg {
-    name: "resized_images"
-    type: DT_FLOAT
+    name: "z"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  attr {
-    name: "align_corners"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ResizeArea"
+  name: "SampleDistortedBoundingBox"
   input_arg {
-    name: "images"
+    name: "image_size"
     type_attr: "T"
   }
   input_arg {
+    name: "bounding_boxes"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "begin"
+    type_attr: "T"
+  }
+  output_arg {
     name: "size"
-    type: DT_INT32
+    type_attr: "T"
   }
   output_arg {
-    name: "resized_images"
+    name: "bboxes"
     type: DT_FLOAT
   }
   attr {
@@ -26042,38 +46976,95 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
         type: DT_UINT8
+        type: DT_INT8
         type: DT_INT16
-        type: DT_UINT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "align_corners"
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "min_object_covered"
+    type: "float"
+    default_value {
+      f: 0.1
+    }
+  }
+  attr {
+    name: "aspect_ratio_range"
+    type: "list(float)"
+    default_value {
+      list {
+        f: 0.75
+        f: 1.33
+      }
+    }
+  }
+  attr {
+    name: "area_range"
+    type: "list(float)"
+    default_value {
+      list {
+        f: 0.05
+        f: 1
+      }
+    }
+  }
+  attr {
+    name: "max_attempts"
+    type: "int"
+    default_value {
+      i: 100
+    }
+  }
+  attr {
+    name: "use_image_if_no_bounding_boxes"
     type: "bool"
     default_value {
       b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "ResizeBicubic"
+  name: "SampleDistortedBoundingBoxV2"
   input_arg {
-    name: "images"
+    name: "image_size"
     type_attr: "T"
   }
   input_arg {
+    name: "bounding_boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_object_covered"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "begin"
+    type_attr: "T"
+  }
+  output_arg {
     name: "size"
-    type: DT_INT32
+    type_attr: "T"
   }
   output_arg {
-    name: "resized_images"
+    name: "bboxes"
     type: DT_FLOAT
   }
   attr {
@@ -26086,72 +47077,249 @@ op {
         type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "align_corners"
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "aspect_ratio_range"
+    type: "list(float)"
+    default_value {
+      list {
+        f: 0.75
+        f: 1.33
+      }
+    }
+  }
+  attr {
+    name: "area_range"
+    type: "list(float)"
+    default_value {
+      list {
+        f: 0.05
+        f: 1
+      }
+    }
+  }
+  attr {
+    name: "max_attempts"
+    type: "int"
+    default_value {
+      i: 100
+    }
+  }
+  attr {
+    name: "use_image_if_no_bounding_boxes"
     type: "bool"
     default_value {
       b: false
     }
   }
+  is_stateful: true
 }
 op {
-  name: "ResizeBicubic"
+  name: "Save"
   input_arg {
-    name: "images"
-    type_attr: "T"
+    name: "filename"
+    type: DT_STRING
   }
   input_arg {
-    name: "size"
-    type: DT_INT32
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "Save"
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "SaveSlices"
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shapes_and_slices"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "SaveSlices"
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shapes_and_slices"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "SaveV2"
+  input_arg {
+    name: "prefix"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slices"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensors"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "SaveV2"
+  input_arg {
+    name: "prefix"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slices"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensors"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ScalarSummary"
+  input_arg {
+    name: "tags"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
   }
   output_arg {
-    name: "resized_images"
-    type: DT_FLOAT
+    name: "summary"
+    type: DT_STRING
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
+        type: DT_INT8
         type: DT_UINT16
-        type: DT_INT32
-        type: DT_INT64
         type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
-  attr {
-    name: "align_corners"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ResizeBicubicGrad"
+  name: "ScalarSummary"
   input_arg {
-    name: "grads"
-    type: DT_FLOAT
+    name: "tags"
+    type: DT_STRING
   }
   input_arg {
-    name: "original_image"
+    name: "values"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "summary"
+    type: DT_STRING
   }
   attr {
     name: "T"
@@ -26160,107 +47328,154 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "align_corners"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ResizeBilinear"
+  name: "ScalarSummary"
   input_arg {
-    name: "images"
-    type_attr: "T"
+    name: "tags"
+    type: DT_STRING
   }
   input_arg {
-    name: "size"
-    type: DT_INT32
+    name: "values"
+    type_attr: "T"
   }
   output_arg {
-    name: "resized_images"
-    type: DT_FLOAT
+    name: "summary"
+    type: DT_STRING
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
         type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-  attr {
-    name: "align_corners"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ResizeBilinear"
+  name: "ScalarSummary"
   input_arg {
-    name: "images"
-    type_attr: "T"
+    name: "tags"
+    type: DT_STRING
   }
   input_arg {
-    name: "size"
-    type: DT_INT32
+    name: "values"
+    type_attr: "T"
   }
   output_arg {
-    name: "resized_images"
-    type: DT_FLOAT
+    name: "summary"
+    type: DT_STRING
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
         type: DT_UINT8
         type: DT_INT16
-        type: DT_UINT16
-        type: DT_INT32
+        type: DT_INT8
         type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
         type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
+}
+op {
+  name: "ScanDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "initial_state"
+    type_list_attr: "Tstate"
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
   attr {
-    name: "align_corners"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Tstate"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "ResizeBilinearGrad"
+  name: "ScatterAdd"
   input_arg {
-    name: "grads"
-    type: DT_FLOAT
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "original_image"
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -26268,13 +47483,34 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_HALF
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "align_corners"
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
@@ -26282,37 +47518,61 @@ op {
   }
 }
 op {
-  name: "ResizeNearestNeighbor"
+  name: "ScatterAdd"
   input_arg {
-    name: "images"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "size"
-    type: DT_INT32
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
   }
   output_arg {
-    name: "resized_images"
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
-        type: DT_INT8
+        type: DT_UINT16
         type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
         type: DT_INT32
         type: DT_INT64
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "align_corners"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
@@ -26320,38 +47580,62 @@ op {
   }
 }
 op {
-  name: "ResizeNearestNeighbor"
+  name: "ScatterAdd"
   input_arg {
-    name: "images"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "size"
-    type: DT_INT32
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
   }
   output_arg {
-    name: "resized_images"
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
-        type: DT_INT16
         type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
         type: DT_INT32
         type: DT_INT64
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "align_corners"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
@@ -26359,35 +47643,62 @@ op {
   }
 }
 op {
-  name: "ResizeNearestNeighborGrad"
+  name: "ScatterAdd"
   input_arg {
-    name: "grads"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "size"
-    type: DT_INT32
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_INT16
         type: DT_INT8
-        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "align_corners"
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
@@ -26395,34 +47706,24 @@ op {
   }
 }
 op {
-  name: "ResourceApplyAdadelta"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum_update"
-    type: DT_RESOURCE
-  }
+  name: "ScatterDiv"
   input_arg {
-    name: "lr"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "rho"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "epsilon"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -26446,6 +47747,16 @@ op {
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   attr {
     name: "use_locking"
     type: "bool"
@@ -26453,37 +47764,26 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyAdadelta"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum_update"
-    type: DT_RESOURCE
-  }
+  name: "ScatterDiv"
   input_arg {
-    name: "lr"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "rho"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "epsilon"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -26509,6 +47809,16 @@ op {
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   attr {
     name: "use_locking"
     type: "bool"
@@ -26516,25 +47826,26 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyAdagrad"
+  name: "ScatterDiv"
   input_arg {
-    name: "var"
-    type: DT_RESOURCE
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "accum"
-    type: DT_RESOURCE
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "lr"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -26555,6 +47866,19 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -26565,25 +47889,26 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyAdagrad"
+  name: "ScatterDiv"
   input_arg {
-    name: "var"
-    type: DT_RESOURCE
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "accum"
-    type: DT_RESOURCE
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "lr"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -26592,23 +47917,34 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   attr {
     name: "use_locking"
     type: "bool"
@@ -26616,41 +47952,26 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyAdagradDA"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "gradient_accumulator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "gradient_squared_accumulator"
-    type: DT_RESOURCE
-  }
+  name: "ScatterMul"
   input_arg {
-    name: "grad"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "l1"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "l2"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
-  }
-  input_arg {
-    name: "global_step"
-    type: DT_INT64
+    is_ref: true
   }
   attr {
     name: "T"
@@ -26674,6 +47995,16 @@ op {
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   attr {
     name: "use_locking"
     type: "bool"
@@ -26681,41 +48012,26 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyAdagradDA"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "gradient_accumulator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "gradient_squared_accumulator"
-    type: DT_RESOURCE
-  }
+  name: "ScatterMul"
   input_arg {
-    name: "grad"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "l1"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "l2"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
-  }
-  input_arg {
-    name: "global_step"
-    type: DT_INT64
+    is_ref: true
   }
   attr {
     name: "T"
@@ -26741,6 +48057,16 @@ op {
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   attr {
     name: "use_locking"
     type: "bool"
@@ -26748,49 +48074,26 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyAdam"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "m"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "v"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "beta1_power"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "beta2_power"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
+  name: "ScatterMul"
   input_arg {
-    name: "beta1"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "beta2"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "epsilon"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -26811,6 +48114,19 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -26821,49 +48137,26 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyAdam"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "m"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "v"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "beta1_power"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "beta2_power"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
+  name: "ScatterMul"
   input_arg {
-    name: "beta1"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "beta2"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "epsilon"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -26872,78 +48165,94 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "use_nesterov"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyAdam"
+  name: "ScatterNd"
   input_arg {
-    name: "var"
-    type: DT_RESOURCE
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "m"
-    type: DT_RESOURCE
+    name: "updates"
+    type_attr: "T"
   }
   input_arg {
-    name: "v"
-    type: DT_RESOURCE
+    name: "shape"
+    type_attr: "Tindices"
   }
-  input_arg {
-    name: "beta1_power"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
-  input_arg {
-    name: "beta2_power"
-    type_attr: "T"
+  attr {
+    name: "T"
+    type: "type"
   }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
+}
+op {
+  name: "ScatterNdAdd"
   input_arg {
-    name: "beta1"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "beta2"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "epsilon"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -26964,56 +48273,46 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "use_nesterov"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyAddSign"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "m"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
+  name: "ScatterNdAdd"
   input_arg {
-    name: "alpha"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "sign_decay"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "beta"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -27039,6 +48338,16 @@ op {
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   attr {
     name: "use_locking"
     type: "bool"
@@ -27046,45 +48355,26 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyCenteredRMSProp"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "mg"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "ms"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "mom"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
+  name: "ScatterNdAdd"
   input_arg {
-    name: "rho"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "momentum"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "epsilon"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -27105,6 +48395,19 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -27115,45 +48418,26 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyCenteredRMSProp"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "mg"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "ms"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "mom"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
+  name: "ScatterNdAdd"
   input_arg {
-    name: "rho"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "momentum"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "epsilon"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -27162,23 +48446,34 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   attr {
     name: "use_locking"
     type: "bool"
@@ -27186,40 +48481,23 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyFtrl"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
+  name: "ScatterNdNonAliasingAdd"
   input_arg {
-    name: "lr"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "l2"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "lr_power"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -27245,46 +48523,32 @@ op {
     }
   }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyFtrl"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
+  name: "ScatterNdNonAliasingAdd"
   input_arg {
-    name: "lr"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "l2"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "lr_power"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -27312,50 +48576,32 @@ op {
     }
   }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyFtrlV2"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
+  name: "ScatterNdNonAliasingAdd"
   input_arg {
-    name: "l1"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "l2_shrinkage"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "lr_power"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -27377,54 +48623,39 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyFtrlV2"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
+  name: "ScatterNdNonAliasingAdd"
   input_arg {
-    name: "l1"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "l2_shrinkage"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "lr_power"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -27434,17 +48665,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -27452,28 +48684,36 @@ op {
     }
   }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyGradientDescent"
+  name: "ScatterNdSub"
   input_arg {
-    name: "var"
-    type: DT_RESOURCE
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "alpha"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "delta"
+    name: "updates"
     type_attr: "T"
   }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
   attr {
     name: "T"
     type: "type"
@@ -27496,6 +48736,16 @@ op {
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   attr {
     name: "use_locking"
     type: "bool"
@@ -27503,21 +48753,26 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyGradientDescent"
+  name: "ScatterNdSub"
   input_arg {
-    name: "var"
-    type: DT_RESOURCE
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "alpha"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "delta"
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -27543,6 +48798,16 @@ op {
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   attr {
     name: "use_locking"
     type: "bool"
@@ -27550,29 +48815,26 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyMomentum"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
+  name: "ScatterNdSub"
   input_arg {
-    name: "accum"
-    type: DT_RESOURCE
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "grad"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "momentum"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -27593,6 +48855,19 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -27603,36 +48878,26 @@ op {
       b: false
     }
   }
-  attr {
-    name: "use_nesterov"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyMomentum"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
+  name: "ScatterNdSub"
   input_arg {
-    name: "accum"
-    type: DT_RESOURCE
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "grad"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "momentum"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -27641,17 +48906,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -27659,50 +48925,84 @@ op {
     }
   }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "use_nesterov"
+    name: "use_locking"
     type: "bool"
     default_value {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyPowerSign"
+  name: "ScatterNdUpdate"
   input_arg {
-    name: "var"
-    type: DT_RESOURCE
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "m"
-    type: DT_RESOURCE
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "lr"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "logbase"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
+}
+op {
+  name: "ScatterSub"
   input_arg {
-    name: "sign_decay"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "beta"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "grad"
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -27723,8 +49023,16 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -27735,33 +49043,26 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyProximalAdagrad"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
+  name: "ScatterSub"
   input_arg {
-    name: "lr"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "l1"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "l2"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -27782,6 +49083,18 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -27792,33 +49105,26 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyProximalAdagrad"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
+  name: "ScatterSub"
   input_arg {
-    name: "lr"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "l1"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "l2"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -27841,6 +49147,17 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -27851,29 +49168,26 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyProximalGradientDescent"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
+  name: "ScatterSub"
   input_arg {
-    name: "alpha"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "l1"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "l2"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "delta"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -27882,18 +49196,31 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -27904,51 +49231,38 @@ op {
       b: false
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyProximalGradientDescent"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
+  name: "ScatterUpdate"
   input_arg {
-    name: "alpha"
+    name: "ref"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "l1"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "l2"
+    name: "updates"
     type_attr: "T"
   }
-  input_arg {
-    name: "delta"
+  output_arg {
+    name: "output_ref"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_INT64
       }
     }
   }
@@ -27956,108 +49270,215 @@ op {
     name: "use_locking"
     type: "bool"
     default_value {
-      b: false
+      b: true
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceApplyRMSProp"
+  name: "SdcaFprint"
   input_arg {
-    name: "var"
-    type: DT_RESOURCE
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
   }
+}
+op {
+  name: "SdcaOptimizer"
   input_arg {
-    name: "ms"
-    type: DT_RESOURCE
+    name: "sparse_example_indices"
+    type: DT_INT64
+    number_attr: "num_sparse_features"
   }
   input_arg {
-    name: "mom"
-    type: DT_RESOURCE
+    name: "sparse_feature_indices"
+    type: DT_INT64
+    number_attr: "num_sparse_features"
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "sparse_feature_values"
+    type: DT_FLOAT
+    number_attr: "num_sparse_features_with_values"
   }
   input_arg {
-    name: "rho"
-    type_attr: "T"
+    name: "dense_features"
+    type: DT_FLOAT
+    number_attr: "num_dense_features"
   }
   input_arg {
-    name: "momentum"
-    type_attr: "T"
+    name: "example_weights"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "epsilon"
-    type_attr: "T"
+    name: "example_labels"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "sparse_indices"
+    type: DT_INT64
+    number_attr: "num_sparse_features"
+  }
+  input_arg {
+    name: "sparse_weights"
+    type: DT_FLOAT
+    number_attr: "num_sparse_features"
+  }
+  input_arg {
+    name: "dense_weights"
+    type: DT_FLOAT
+    number_attr: "num_dense_features"
+  }
+  input_arg {
+    name: "example_state_data"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "out_example_state_data"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "out_delta_sparse_weights"
+    type: DT_FLOAT
+    number_attr: "num_sparse_features"
+  }
+  output_arg {
+    name: "out_delta_dense_weights"
+    type: DT_FLOAT
+    number_attr: "num_dense_features"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "loss_type"
+    type: "string"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
+        s: "logistic_loss"
+        s: "squared_loss"
+        s: "hinge_loss"
+        s: "smooth_hinge_loss"
       }
     }
   }
   attr {
-    name: "use_locking"
+    name: "adaptative"
     type: "bool"
     default_value {
       b: false
     }
   }
-  is_stateful: true
+  attr {
+    name: "num_sparse_features"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_sparse_features_with_values"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_dense_features"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "l1"
+    type: "float"
+  }
+  attr {
+    name: "l2"
+    type: "float"
+  }
+  attr {
+    name: "num_loss_partitions"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_inner_iterations"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
 }
 op {
-  name: "ResourceApplyRMSProp"
+  name: "SdcaShrinkL1"
   input_arg {
-    name: "var"
-    type: DT_RESOURCE
+    name: "weights"
+    type: DT_FLOAT
+    number_attr: "num_features"
+    is_ref: true
   }
-  input_arg {
-    name: "ms"
-    type: DT_RESOURCE
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
   }
-  input_arg {
-    name: "mom"
-    type: DT_RESOURCE
+  attr {
+    name: "l1"
+    type: "float"
   }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
+  attr {
+    name: "l2"
+    type: "float"
   }
+}
+op {
+  name: "SegmentMax"
   input_arg {
-    name: "rho"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
-    name: "momentum"
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SegmentMax"
   input_arg {
-    name: "epsilon"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -28067,17 +49488,12 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -28085,30 +49501,52 @@ op {
     }
   }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceCountUpTo"
+  name: "SegmentMax"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
   input_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "segment_ids"
+    type_attr: "Tindices"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "limit"
-    type: "int"
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
   attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
@@ -28117,32 +49555,40 @@ op {
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceGather"
+  name: "SegmentMax"
   input_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "data"
+    type_attr: "T"
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
   output_arg {
     name: "output"
-    type_attr: "dtype"
-  }
-  attr {
-    name: "validate_indices"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
     name: "Tindices"
@@ -28154,40 +49600,34 @@ op {
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceScatterAdd"
+  name: "SegmentMean"
   input_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "data"
+    type_attr: "T"
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "updates"
-    type_attr: "dtype"
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
       }
     }
@@ -28202,40 +49642,34 @@ op {
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceScatterAdd"
+  name: "SegmentMean"
   input_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "data"
+    type_attr: "T"
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "updates"
-    type_attr: "dtype"
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -28252,43 +49686,38 @@ op {
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceScatterUpdate"
+  name: "SegmentMean"
   input_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "data"
+    type_attr: "T"
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "updates"
-    type_attr: "dtype"
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -28302,42 +49731,21 @@ op {
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdadelta"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum_update"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
+  name: "SegmentMean"
   input_arg {
-    name: "rho"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
-    type_attr: "T"
+    name: "segment_ids"
+    type_attr: "Tindices"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
   attr {
     name: "T"
     type: "type"
@@ -28345,18 +49753,16 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -28370,49 +49776,21 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdadelta"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum_update"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
+  name: "SegmentMin"
   input_arg {
-    name: "rho"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
-    type_attr: "T"
+    name: "segment_ids"
+    type_attr: "Tindices"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
   attr {
     name: "T"
     type: "type"
@@ -28420,20 +49798,13 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -28447,37 +49818,21 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagrad"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
+  name: "SegmentMin"
   input_arg {
-    name: "grad"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
@@ -28485,18 +49840,15 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -28510,37 +49862,21 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagrad"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
+  name: "SegmentMin"
   input_arg {
-    name: "grad"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
@@ -28548,20 +49884,16 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -28575,53 +49907,21 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagradDA"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "gradient_accumulator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "gradient_squared_accumulator"
-    type: DT_RESOURCE
-  }
+  name: "SegmentMin"
   input_arg {
-    name: "grad"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
-  input_arg {
-    name: "global_step"
-    type: DT_INT64
-  }
   attr {
     name: "T"
     type: "type"
@@ -28629,18 +49929,16 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -28654,53 +49952,21 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagradDA"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "gradient_accumulator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "gradient_squared_accumulator"
-    type: DT_RESOURCE
-  }
+  name: "SegmentProd"
   input_arg {
-    name: "grad"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
-  input_arg {
-    name: "global_step"
-    type: DT_INT64
-  }
   attr {
     name: "T"
     type: "type"
@@ -28720,8 +49986,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -28735,57 +49999,21 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyCenteredRMSProp"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "mg"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "ms"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "mom"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "rho"
-    type_attr: "T"
-  }
+  name: "SegmentProd"
   input_arg {
-    name: "momentum"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
-    type_attr: "T"
+    name: "segment_ids"
+    type_attr: "Tindices"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
   attr {
     name: "T"
     type: "type"
@@ -28805,6 +50033,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -28815,60 +50045,24 @@ op {
       list {
         type: DT_INT32
         type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "ResourceSparseApplyCenteredRMSProp"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "mg"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "ms"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "mom"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "rho"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "momentum"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "epsilon"
-    type_attr: "T"
+      }
+    }
   }
+}
+op {
+  name: "SegmentProd"
   input_arg {
-    name: "grad"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
@@ -28890,6 +50084,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -28903,51 +50098,19 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyFtrl"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
+  name: "SegmentProd"
   input_arg {
-    name: "grad"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "lr_power"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -28957,18 +50120,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -28982,51 +50148,19 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyFtrl"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
+  name: "SegmentSum"
   input_arg {
-    name: "grad"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "lr_power"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -29048,8 +50182,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -29063,55 +50195,19 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyFtrlV2"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
+  name: "SegmentSum"
   input_arg {
-    name: "grad"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2_shrinkage"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "lr_power"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -29133,6 +50229,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -29146,55 +50244,19 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyFtrlV2"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
+  name: "SegmentSum"
   input_arg {
-    name: "grad"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2_shrinkage"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "lr_power"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -29218,6 +50280,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -29231,39 +50294,19 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyMomentum"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
+  name: "SegmentSum"
   input_arg {
-    name: "grad"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
+    name: "segment_ids"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "momentum"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
   attr {
@@ -29273,18 +50316,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -29298,46 +50344,177 @@ op {
       }
     }
   }
+}
+op {
+  name: "Select"
+  input_arg {
+    name: "condition"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "e"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "SelfAdjointEig"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+  deprecation {
+    version: 11
+  }
+}
+op {
+  name: "SelfAdjointEigV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "e"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    type_attr: "T"
+  }
   attr {
-    name: "use_locking"
+    name: "compute_v"
     type: "bool"
     default_value {
-      b: false
+      b: true
     }
   }
   attr {
-    name: "use_nesterov"
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "SelfAdjointEigV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "e"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  attr {
+    name: "compute_v"
     type: "bool"
     default_value {
-      b: false
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyMomentum"
+  name: "Selu"
   input_arg {
-    name: "var"
-    type: DT_RESOURCE
+    name: "features"
+    type_attr: "T"
   }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
+}
+op {
+  name: "Selu"
   input_arg {
-    name: "lr"
+    name: "features"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "activations"
     type_attr: "T"
   }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "SeluGrad"
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "gradients"
+    type_attr: "T"
   }
   input_arg {
-    name: "momentum"
+    name: "outputs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
     type_attr: "T"
   }
   attr {
@@ -29345,179 +50522,246 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
+}
+op {
+  name: "SeluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "outputs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
   attr {
-    name: "Tindices"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
+}
+op {
+  name: "SerializeIterator"
+  input_arg {
+    name: "resource_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "serialized"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "SerializeManySparse"
+  input_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sparse_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "serialized_sparse"
+    type: DT_STRING
+  }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "SerializeManySparse"
+  input_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sparse_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "serialized_sparse"
+    type_attr: "out_type"
   }
   attr {
-    name: "use_nesterov"
-    type: "bool"
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
     default_value {
-      b: false
+      type: DT_STRING
+    }
+    allowed_values {
+      list {
+        type: DT_STRING
+        type: DT_VARIANT
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyProximalAdagrad"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
+  name: "SerializeSparse"
   input_arg {
-    name: "accum"
-    type: DT_RESOURCE
+    name: "sparse_indices"
+    type: DT_INT64
   }
   input_arg {
-    name: "lr"
+    name: "sparse_values"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
-    type_attr: "T"
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "serialized_sparse"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
   }
+}
+op {
+  name: "SerializeSparse"
   input_arg {
-    name: "l2"
-    type_attr: "T"
+    name: "sparse_indices"
+    type: DT_INT64
   }
   input_arg {
-    name: "grad"
+    name: "sparse_values"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "serialized_sparse"
+    type_attr: "out_type"
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-      }
-    }
   }
   attr {
-    name: "Tindices"
+    name: "out_type"
     type: "type"
+    default_value {
+      type: DT_STRING
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_STRING
+        type: DT_VARIANT
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyProximalAdagrad"
+  name: "SerializeTensor"
   input_arg {
-    name: "var"
-    type: DT_RESOURCE
+    name: "tensor"
+    type_attr: "T"
   }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
+  output_arg {
+    name: "serialized"
+    type: DT_STRING
   }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
+  attr {
+    name: "T"
+    type: "type"
   }
+}
+op {
+  name: "SetSize"
   input_arg {
-    name: "l1"
-    type_attr: "T"
+    name: "set_indices"
+    type: DT_INT64
   }
   input_arg {
-    name: "l2"
+    name: "set_values"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "set_shape"
+    type: DT_INT64
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "validate_indices"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
+        type: DT_INT8
+        type: DT_INT16
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
         type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_STRING
       }
     }
   }
+}
+op {
+  name: "Shape"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
   attr {
-    name: "Tindices"
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_type"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
         type: DT_INT32
@@ -29525,66 +50769,35 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyProximalGradientDescent"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "alpha"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
+  name: "ShapeN"
   input_arg {
-    name: "l2"
+    name: "input"
     type_attr: "T"
+    number_attr: "N"
   }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+    number_attr: "N"
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-      }
-    }
   }
   attr {
-    name: "Tindices"
+    name: "out_type"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
         type: DT_INT32
@@ -29592,524 +50805,638 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
+}
+op {
+  name: "ShardedFilename"
+  input_arg {
+    name: "basename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shard"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "num_shards"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "filename"
+    type: DT_STRING
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyProximalGradientDescent"
+  name: "ShardedFilespec"
   input_arg {
-    name: "var"
-    type: DT_RESOURCE
+    name: "basename"
+    type: DT_STRING
   }
   input_arg {
-    name: "alpha"
-    type_attr: "T"
+    name: "num_shards"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "filename"
+    type: DT_STRING
   }
+}
+op {
+  name: "ShuffleAndRepeatDataset"
   input_arg {
-    name: "l1"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "l2"
-    type_attr: "T"
+    name: "buffer_size"
+    type: DT_INT64
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "seed"
+    type: DT_INT64
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "seed2"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "count"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ShuffleDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyRMSProp"
+  name: "ShuffleDataset"
   input_arg {
-    name: "var"
-    type: DT_RESOURCE
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "ms"
-    type: DT_RESOURCE
+    name: "buffer_size"
+    type: DT_INT64
   }
   input_arg {
-    name: "mom"
-    type: DT_RESOURCE
+    name: "seed"
+    type: DT_INT64
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "seed2"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
+}
+op {
+  name: "ShuffleDataset"
   input_arg {
-    name: "rho"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "momentum"
-    type_attr: "T"
+    name: "buffer_size"
+    type: DT_INT64
   }
   input_arg {
-    name: "epsilon"
-    type_attr: "T"
+    name: "seed"
+    type: DT_INT64
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "seed2"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "reshuffle_each_iteration"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
+}
+op {
+  name: "Sigmoid"
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
+}
+op {
+  name: "Sigmoid"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
   attr {
-    name: "Tindices"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+}
+op {
+  name: "SigmoidGrad"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyRMSProp"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "ms"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "mom"
-    type: DT_RESOURCE
-  }
+  name: "SigmoidGrad"
   input_arg {
-    name: "lr"
+    name: "y"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
+    name: "dy"
     type_attr: "T"
   }
-  input_arg {
-    name: "momentum"
+  output_arg {
+    name: "z"
     type_attr: "T"
   }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "SigmoidGrad"
   input_arg {
-    name: "epsilon"
+    name: "y"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "dy"
     type_attr: "T"
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+  output_arg {
+    name: "z"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
+}
+op {
+  name: "Sign"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
   attr {
-    name: "Tindices"
+    name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "ResourceStridedSliceAssign"
-  input_arg {
-    name: "ref"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "begin"
-    type_attr: "Index"
-  }
-  input_arg {
-    name: "end"
-    type_attr: "Index"
-  }
+  name: "Sign"
   input_arg {
-    name: "strides"
-    type_attr: "Index"
+    name: "x"
+    type_attr: "T"
   }
-  input_arg {
-    name: "value"
+  output_arg {
+    name: "y"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "Index"
-    type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  attr {
-    name: "begin_mask"
-    type: "int"
-    default_value {
-      i: 0
-    }
+}
+op {
+  name: "Sin"
+  input_arg {
+    name: "x"
+    type_attr: "T"
   }
-  attr {
-    name: "end_mask"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "ellipsis_mask"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
   }
-  attr {
-    name: "new_axis_mask"
-    type: "int"
-    default_value {
-      i: 0
-    }
+}
+op {
+  name: "Sin"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "shrink_axis_mask"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "Restore"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
-  }
+  name: "Sinh"
   input_arg {
-    name: "tensor_name"
-    type: DT_STRING
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "dt"
+    name: "T"
     type: "type"
-  }
-  attr {
-    name: "preferred_shard"
-    type: "int"
-    default_value {
-      i: -1
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
   }
 }
 op {
-  name: "Restore"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
-  }
+  name: "Sinh"
   input_arg {
-    name: "tensor_name"
-    type: DT_STRING
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "dt"
+    name: "T"
     type: "type"
-  }
-  attr {
-    name: "preferred_shard"
-    type: "int"
-    default_value {
-      i: -1
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "RestoreSlice"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "tensor_name"
-    type: DT_STRING
-  }
+  name: "Size"
   input_arg {
-    name: "shape_and_slice"
-    type: DT_STRING
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "output"
+    type_attr: "out_type"
   }
   attr {
-    name: "dt"
+    name: "T"
     type: "type"
   }
   attr {
-    name: "preferred_shard"
-    type: "int"
+    name: "out_type"
+    type: "type"
     default_value {
-      i: -1
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
 }
 op {
-  name: "RestoreSlice"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
-  }
+  name: "SkipDataset"
   input_arg {
-    name: "tensor_name"
-    type: DT_STRING
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "shape_and_slice"
-    type: DT_STRING
+    name: "count"
+    type: DT_INT64
   }
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "dt"
-    type: "type"
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "preferred_shard"
-    type: "int"
-    default_value {
-      i: -1
-    }
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
   is_stateful: true
 }
 op {
-  name: "RestoreV2"
-  input_arg {
-    name: "prefix"
-    type: DT_STRING
-  }
+  name: "SkipDataset"
   input_arg {
-    name: "tensor_names"
-    type: DT_STRING
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "shape_and_slices"
-    type: DT_STRING
+    name: "count"
+    type: DT_INT64
   }
   output_arg {
-    name: "tensors"
-    type_list_attr: "dtypes"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "dtypes"
+    name: "output_types"
     type: "list(type)"
     has_minimum: true
     minimum: 1
   }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
 }
 op {
-  name: "RestoreV2"
-  input_arg {
-    name: "prefix"
+  name: "Skipgram"
+  output_arg {
+    name: "vocab_word"
     type: DT_STRING
   }
-  input_arg {
-    name: "tensor_names"
-    type: DT_STRING
+  output_arg {
+    name: "vocab_freq"
+    type: DT_INT32
   }
-  input_arg {
-    name: "shape_and_slices"
-    type: DT_STRING
+  output_arg {
+    name: "words_per_epoch"
+    type: DT_INT64
   }
   output_arg {
-    name: "tensors"
-    type_list_attr: "dtypes"
+    name: "current_epoch"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "total_words_processed"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "examples"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "labels"
+    type: DT_INT32
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "filename"
+    type: "string"
   }
-  is_stateful: true
-}
-op {
-  name: "Reverse"
-  input_arg {
-    name: "tensor"
-    type_attr: "T"
+  attr {
+    name: "batch_size"
+    type: "int"
   }
-  input_arg {
-    name: "dims"
-    type: DT_BOOL
+  attr {
+    name: "window_size"
+    type: "int"
+    default_value {
+      i: 5
+    }
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "min_count"
+    type: "int"
+    default_value {
+      i: 5
+    }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_BOOL
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "subsample"
+    type: "float"
+    default_value {
+      f: 0.001
     }
   }
+  deprecation {
+    version: 19
+  }
+  is_stateful: true
 }
 op {
-  name: "Reverse"
+  name: "Slice"
   input_arg {
-    name: "tensor"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "dims"
-    type: DT_BOOL
+    name: "begin"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "size"
+    type_attr: "Index"
   }
   output_arg {
     name: "output"
@@ -30118,33 +51445,24 @@ op {
   attr {
     name: "T"
     type: "type"
+  }
+  attr {
+    name: "Index"
+    type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
         type: DT_INT32
         type: DT_INT64
-        type: DT_BOOL
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_STRING
       }
     }
   }
 }
 op {
-  name: "Reverse"
+  name: "Snapshot"
   input_arg {
-    name: "tensor"
+    name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "dims"
-    type: DT_BOOL
-  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -30152,222 +51470,122 @@ op {
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_BOOL
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_STRING
-      }
-    }
   }
 }
 op {
-  name: "ReverseSequence"
+  name: "Softmax"
   input_arg {
-    name: "input"
+    name: "logits"
     type_attr: "T"
   }
-  input_arg {
-    name: "seq_lengths"
-    type_attr: "Tlen"
-  }
   output_arg {
-    name: "output"
+    name: "softmax"
     type_attr: "T"
   }
-  attr {
-    name: "seq_dim"
-    type: "int"
-  }
-  attr {
-    name: "batch_dim"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "Tlen"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "ReverseV2"
+  name: "Softmax"
   input_arg {
-    name: "tensor"
+    name: "logits"
     type_attr: "T"
   }
-  input_arg {
-    name: "axis"
-    type_attr: "Tidx"
-  }
   output_arg {
-    name: "output"
+    name: "softmax"
     type_attr: "T"
   }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_BOOL
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "ReverseV2"
+  name: "SoftmaxCrossEntropyWithLogits"
   input_arg {
-    name: "tensor"
+    name: "features"
     type_attr: "T"
   }
   input_arg {
-    name: "axis"
-    type_attr: "Tidx"
+    name: "labels"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "loss"
     type_attr: "T"
   }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  output_arg {
+    name: "backprop"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_BOOL
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_STRING
       }
     }
   }
 }
 op {
-  name: "ReverseV2"
+  name: "SoftmaxCrossEntropyWithLogits"
   input_arg {
-    name: "tensor"
+    name: "features"
     type_attr: "T"
   }
   input_arg {
-    name: "axis"
-    type_attr: "Tidx"
+    name: "labels"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "loss"
     type_attr: "T"
   }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  output_arg {
+    name: "backprop"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_BOOL
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_STRING
       }
     }
   }
 }
 op {
-  name: "RightShift"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
+  name: "Softplus"
   input_arg {
-    name: "y"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "activations"
     type_attr: "T"
   }
   attr {
@@ -30375,27 +51593,27 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
         type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
         type: DT_UINT16
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_HALF
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "Rint"
+  name: "Softplus"
   input_arg {
-    name: "x"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "activations"
     type_attr: "T"
   }
   attr {
@@ -30405,18 +51623,27 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "Round"
+  name: "Softplus"
   input_arg {
-    name: "x"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "activations"
     type_attr: "T"
   }
   attr {
@@ -30424,25 +51651,30 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "Rsqrt"
+  name: "Softplus"
   input_arg {
-    name: "x"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "activations"
     type_attr: "T"
   }
   attr {
@@ -30450,27 +51682,34 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "RsqrtGrad"
+  name: "SoftplusGrad"
   input_arg {
-    name: "x"
+    name: "gradients"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "backprops"
     type_attr: "T"
   }
   attr {
@@ -30478,27 +51717,31 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "RsqrtGrad"
+  name: "SoftplusGrad"
   input_arg {
-    name: "y"
+    name: "gradients"
     type_attr: "T"
   }
   input_arg {
-    name: "dy"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "backprops"
     type_attr: "T"
   }
   attr {
@@ -30506,354 +51749,290 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "SampleDistortedBoundingBox"
+  name: "SoftplusGrad"
   input_arg {
-    name: "image_size"
+    name: "gradients"
     type_attr: "T"
   }
   input_arg {
-    name: "bounding_boxes"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "begin"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "size"
+    name: "backprops"
     type_attr: "T"
   }
-  output_arg {
-    name: "bboxes"
-    type: DT_FLOAT
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "min_object_covered"
-    type: "float"
-    default_value {
-      f: 0.1
-    }
-  }
-  attr {
-    name: "aspect_ratio_range"
-    type: "list(float)"
-    default_value {
-      list {
-        f: 0.75
-        f: 1.33
-      }
-    }
-  }
-  attr {
-    name: "area_range"
-    type: "list(float)"
-    default_value {
-      list {
-        f: 0.05
-        f: 1
-      }
-    }
-  }
-  attr {
-    name: "max_attempts"
-    type: "int"
-    default_value {
-      i: 100
-    }
-  }
-  attr {
-    name: "use_image_if_no_bounding_boxes"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "SampleDistortedBoundingBoxV2"
+  name: "SoftplusGrad"
   input_arg {
-    name: "image_size"
+    name: "gradients"
     type_attr: "T"
   }
   input_arg {
-    name: "bounding_boxes"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "min_object_covered"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "begin"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "size"
+    name: "backprops"
     type_attr: "T"
   }
-  output_arg {
-    name: "bboxes"
-    type: DT_FLOAT
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
         type: DT_UINT8
-        type: DT_INT8
         type: DT_INT16
-        type: DT_INT32
+        type: DT_INT8
         type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "aspect_ratio_range"
-    type: "list(float)"
-    default_value {
-      list {
-        f: 0.75
-        f: 1.33
-      }
-    }
-  }
-  attr {
-    name: "area_range"
-    type: "list(float)"
-    default_value {
-      list {
-        f: 0.05
-        f: 1
-      }
-    }
-  }
-  attr {
-    name: "max_attempts"
-    type: "int"
-    default_value {
-      i: 100
-    }
-  }
-  attr {
-    name: "use_image_if_no_bounding_boxes"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "Save"
-  input_arg {
-    name: "filename"
-    type: DT_STRING
-  }
+  name: "Softsign"
   input_arg {
-    name: "tensor_names"
-    type: DT_STRING
+    name: "features"
+    type_attr: "T"
   }
-  input_arg {
-    name: "data"
-    type_list_attr: "T"
+  output_arg {
+    name: "activations"
+    type_attr: "T"
   }
   attr {
     name: "T"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
   }
 }
 op {
-  name: "Save"
-  input_arg {
-    name: "filename"
-    type: DT_STRING
-  }
+  name: "Softsign"
   input_arg {
-    name: "tensor_names"
-    type: DT_STRING
+    name: "features"
+    type_attr: "T"
   }
-  input_arg {
-    name: "data"
-    type_list_attr: "T"
+  output_arg {
+    name: "activations"
+    type_attr: "T"
   }
   attr {
     name: "T"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "SaveSlices"
-  input_arg {
-    name: "filename"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "tensor_names"
-    type: DT_STRING
-  }
+  name: "Softsign"
   input_arg {
-    name: "shapes_and_slices"
-    type: DT_STRING
+    name: "features"
+    type_attr: "T"
   }
-  input_arg {
-    name: "data"
-    type_list_attr: "T"
+  output_arg {
+    name: "activations"
+    type_attr: "T"
   }
   attr {
     name: "T"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
 }
 op {
-  name: "SaveSlices"
-  input_arg {
-    name: "filename"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "tensor_names"
-    type: DT_STRING
-  }
+  name: "Softsign"
   input_arg {
-    name: "shapes_and_slices"
-    type: DT_STRING
+    name: "features"
+    type_attr: "T"
   }
-  input_arg {
-    name: "data"
-    type_list_attr: "T"
+  output_arg {
+    name: "activations"
+    type_attr: "T"
   }
   attr {
     name: "T"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "SaveV2"
-  input_arg {
-    name: "prefix"
-    type: DT_STRING
-  }
+  name: "SoftsignGrad"
   input_arg {
-    name: "tensor_names"
-    type: DT_STRING
+    name: "gradients"
+    type_attr: "T"
   }
   input_arg {
-    name: "shape_and_slices"
-    type: DT_STRING
+    name: "features"
+    type_attr: "T"
   }
-  input_arg {
-    name: "tensors"
-    type_list_attr: "dtypes"
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
   }
 }
 op {
-  name: "SaveV2"
-  input_arg {
-    name: "prefix"
-    type: DT_STRING
-  }
+  name: "SoftsignGrad"
   input_arg {
-    name: "tensor_names"
-    type: DT_STRING
+    name: "gradients"
+    type_attr: "T"
   }
   input_arg {
-    name: "shape_and_slices"
-    type: DT_STRING
+    name: "features"
+    type_attr: "T"
   }
-  input_arg {
-    name: "tensors"
-    type_list_attr: "dtypes"
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "ScalarSummary"
+  name: "SoftsignGrad"
   input_arg {
-    name: "tags"
-    type: DT_STRING
+    name: "gradients"
+    type_attr: "T"
   }
   input_arg {
-    name: "values"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "summary"
-    type: DT_STRING
+    name: "backprops"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -30869,23 +52048,26 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "ScalarSummary"
+  name: "SoftsignGrad"
   input_arg {
-    name: "tags"
-    type: DT_STRING
+    name: "gradients"
+    type_attr: "T"
   }
   input_arg {
-    name: "values"
+    name: "features"
     type_attr: "T"
   }
   output_arg {
-    name: "summary"
-    type: DT_STRING
+    name: "backprops"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -30895,10 +52077,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -30908,74 +52091,174 @@ op {
   }
 }
 op {
-  name: "ScanDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
+  name: "SpaceToBatch"
   input_arg {
-    name: "initial_state"
-    type_list_attr: "Tstate"
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "paddings"
+    type_attr: "Tpaddings"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "Tstate"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
   attr {
-    name: "Targuments"
-    type: "list(type)"
+    name: "block_size"
+    type: "int"
     has_minimum: true
+    minimum: 2
+  }
+}
+op {
+  name: "SpaceToBatchND"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "block_shape"
+    type_attr: "Tblock_shape"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "Tblock_shape"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
 }
 op {
-  name: "ScatterAdd"
+  name: "SpaceToDepth"
   input_arg {
-    name: "ref"
+    name: "input"
     type_attr: "T"
-    is_ref: true
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "block_size"
+    type: "int"
+    has_minimum: true
+    minimum: 2
   }
+}
+op {
+  name: "SpaceToDepth"
   input_arg {
-    name: "updates"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "output_ref"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
+  }
+  attr {
+    name: "block_size"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
+      }
+    }
+  }
+}
+op {
+  name: "SparseAccumulatorApplyGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "local_step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient_values"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "gradient_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
+    type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
@@ -30996,45 +52279,87 @@ op {
     }
   }
   attr {
-    name: "Tindices"
+    name: "has_known_shape"
+    type: "bool"
+  }
+}
+op {
+  name: "SparseAccumulatorApplyGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "local_step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient_values"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "gradient_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "use_locking"
+    name: "has_known_shape"
     type: "bool"
-    default_value {
-      b: false
-    }
   }
 }
 op {
-  name: "ScatterAdd"
+  name: "SparseAccumulatorApplyGradient"
   input_arg {
-    name: "ref"
-    type_attr: "T"
+    name: "handle"
+    type: DT_STRING
     is_ref: true
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "local_step"
+    type: DT_INT64
   }
   input_arg {
-    name: "updates"
-    type_attr: "T"
+    name: "gradient_indices"
+    type: DT_INT64
   }
-  output_arg {
-    name: "output_ref"
-    type_attr: "T"
-    is_ref: true
+  input_arg {
+    name: "gradient_values"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "gradient_shape"
+    type: DT_INT64
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
@@ -31054,49 +52379,93 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "Tindices"
+    name: "has_known_shape"
+    type: "bool"
+  }
+}
+op {
+  name: "SparseAccumulatorApplyGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "local_step"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "gradient_values"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "gradient_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
         type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "use_locking"
+    name: "has_known_shape"
     type: "bool"
-    default_value {
-      b: false
-    }
   }
 }
 op {
-  name: "ScatterDiv"
+  name: "SparseAccumulatorTakeGradient"
   input_arg {
-    name: "ref"
-    type_attr: "T"
+    name: "handle"
+    type: DT_STRING
     is_ref: true
   }
   input_arg {
+    name: "num_required"
+    type: DT_INT32
+  }
+  output_arg {
     name: "indices"
-    type_attr: "Tindices"
+    type: DT_INT64
   }
-  input_arg {
-    name: "updates"
-    type_attr: "T"
+  output_arg {
+    name: "values"
+    type_attr: "dtype"
   }
   output_arg {
-    name: "output_ref"
-    type_attr: "T"
-    is_ref: true
+    name: "shape"
+    type: DT_INT64
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
@@ -31117,46 +52486,80 @@ op {
       }
     }
   }
+}
+op {
+  name: "SparseAccumulatorTakeGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "num_required"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "shape"
+    type: DT_INT64
+  }
   attr {
-    name: "Tindices"
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ScatterDiv"
+  name: "SparseAccumulatorTakeGradient"
   input_arg {
-    name: "ref"
-    type_attr: "T"
+    name: "handle"
+    type: DT_STRING
     is_ref: true
   }
   input_arg {
+    name: "num_required"
+    type: DT_INT32
+  }
+  output_arg {
     name: "indices"
-    type_attr: "Tindices"
+    type: DT_INT64
   }
-  input_arg {
-    name: "updates"
-    type_attr: "T"
+  output_arg {
+    name: "values"
+    type_attr: "dtype"
   }
   output_arg {
-    name: "output_ref"
-    type_attr: "T"
-    is_ref: true
+    name: "shape"
+    type: DT_INT64
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
@@ -31176,46 +52579,101 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
+}
+op {
+  name: "SparseAccumulatorTakeGradient"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "num_required"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "shape"
+    type: DT_INT64
+  }
   attr {
-    name: "Tindices"
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
         type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ScatterMul"
+  name: "SparseAdd"
   input_arg {
-    name: "ref"
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "a_shape"
+    type: DT_INT64
   }
   input_arg {
-    name: "updates"
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
     type_attr: "T"
   }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "thresh"
+    type_attr: "Treal"
+  }
   output_arg {
-    name: "output_ref"
+    name: "sum_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sum_values"
     type_attr: "T"
-    is_ref: true
+  }
+  output_arg {
+    name: "sum_shape"
+    type: DT_INT64
   }
   attr {
     name: "T"
@@ -31240,42 +52698,64 @@ op {
     }
   }
   attr {
-    name: "Tindices"
+    name: "Treal"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ScatterMul"
+  name: "SparseAdd"
   input_arg {
-    name: "ref"
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "a_shape"
+    type: DT_INT64
   }
   input_arg {
-    name: "updates"
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
     type_attr: "T"
   }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "thresh"
+    type_attr: "Treal"
+  }
   output_arg {
-    name: "output_ref"
+    name: "sum_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sum_values"
     type_attr: "T"
-    is_ref: true
+  }
+  output_arg {
+    name: "sum_shape"
+    type: DT_INT64
   }
   attr {
     name: "T"
@@ -31302,75 +52782,66 @@ op {
     }
   }
   attr {
-    name: "Tindices"
+    name: "Treal"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ScatterNd"
+  name: "SparseAdd"
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "a_indices"
+    type: DT_INT64
   }
   input_arg {
-    name: "updates"
+    name: "a_values"
     type_attr: "T"
   }
   input_arg {
-    name: "shape"
-    type_attr: "Tindices"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
+    name: "a_shape"
+    type: DT_INT64
   }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
   }
-}
-op {
-  name: "ScatterNdAdd"
   input_arg {
-    name: "ref"
+    name: "b_values"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "b_shape"
+    type: DT_INT64
   }
   input_arg {
-    name: "updates"
-    type_attr: "T"
+    name: "thresh"
+    type_attr: "Treal"
   }
   output_arg {
-    name: "output_ref"
+    name: "sum_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sum_values"
     type_attr: "T"
-    is_ref: true
+  }
+  output_arg {
+    name: "sum_shape"
+    type: DT_INT64
   }
   attr {
     name: "T"
@@ -31391,46 +52862,74 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "Tindices"
+    name: "Treal"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ScatterNdAdd"
+  name: "SparseAdd"
   input_arg {
-    name: "ref"
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "a_shape"
+    type: DT_INT64
   }
   input_arg {
-    name: "updates"
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
     type_attr: "T"
   }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "thresh"
+    type_attr: "Treal"
+  }
   output_arg {
-    name: "output_ref"
+    name: "sum_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sum_values"
     type_attr: "T"
-    is_ref: true
+  }
+  output_arg {
+    name: "sum_shape"
+    type: DT_INT64
   }
   attr {
     name: "T"
@@ -31439,17 +52938,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -31457,39 +52957,50 @@ op {
     }
   }
   attr {
-    name: "Tindices"
+    name: "Treal"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
         type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ScatterNdNonAliasingAdd"
+  name: "SparseAddGrad"
   input_arg {
-    name: "input"
+    name: "backprop_val_grad"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "a_indices"
+    type: DT_INT64
   }
   input_arg {
-    name: "updates"
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sum_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "a_val_grad"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "b_val_grad"
     type_attr: "T"
   }
   attr {
@@ -31514,33 +53025,31 @@ op {
       }
     }
   }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
 }
 op {
-  name: "ScatterNdNonAliasingAdd"
+  name: "SparseAddGrad"
   input_arg {
-    name: "input"
+    name: "backprop_val_grad"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "a_indices"
+    type: DT_INT64
   }
   input_arg {
-    name: "updates"
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sum_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "a_val_grad"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "b_val_grad"
     type_attr: "T"
   }
   attr {
@@ -31567,36 +53076,32 @@ op {
       }
     }
   }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
 }
 op {
-  name: "ScatterNdSub"
+  name: "SparseAddGrad"
   input_arg {
-    name: "ref"
+    name: "backprop_val_grad"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "a_indices"
+    type: DT_INT64
   }
   input_arg {
-    name: "updates"
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sum_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "a_val_grad"
     type_attr: "T"
   }
   output_arg {
-    name: "output_ref"
+    name: "b_val_grad"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
@@ -31617,46 +53122,38 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ScatterNdSub"
+  name: "SparseAddGrad"
   input_arg {
-    name: "ref"
+    name: "backprop_val_grad"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "a_indices"
+    type: DT_INT64
   }
   input_arg {
-    name: "updates"
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sum_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "a_val_grad"
     type_attr: "T"
   }
   output_arg {
-    name: "output_ref"
+    name: "b_val_grad"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
@@ -31665,100 +53162,64 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "ScatterNdUpdate"
+  name: "SparseApplyAdadelta"
   input_arg {
-    name: "ref"
+    name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  input_arg {
-    name: "updates"
+    name: "accum"
     type_attr: "T"
+    is_ref: true
   }
-  output_arg {
-    name: "output_ref"
+  input_arg {
+    name: "accum_update"
     type_attr: "T"
     is_ref: true
   }
-  attr {
-    name: "T"
-    type: "type"
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
   }
-}
-op {
-  name: "ScatterSub"
   input_arg {
-    name: "ref"
+    name: "grad"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
     name: "indices"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "updates"
-    type_attr: "T"
-  }
   output_arg {
-    name: "output_ref"
+    name: "out"
     type_attr: "T"
     is_ref: true
   }
@@ -31803,22 +53264,44 @@ op {
   }
 }
 op {
-  name: "ScatterSub"
+  name: "SparseApplyAdadelta"
   input_arg {
-    name: "ref"
+    name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "updates"
+    name: "accum_update"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   output_arg {
-    name: "output_ref"
+    name: "out"
     type_attr: "T"
     is_ref: true
   }
@@ -31865,211 +53348,46 @@ op {
   }
 }
 op {
-  name: "ScatterUpdate"
+  name: "SparseApplyAdadelta"
   input_arg {
-    name: "ref"
+    name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  input_arg {
-    name: "updates"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output_ref"
+    name: "accum"
     type_attr: "T"
     is_ref: true
   }
-  attr {
-    name: "T"
-    type: "type"
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-}
-op {
-  name: "SdcaFprint"
-  input_arg {
-    name: "input"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "output"
-    type: DT_INT64
-  }
-}
-op {
-  name: "SdcaOptimizer"
-  input_arg {
-    name: "sparse_example_indices"
-    type: DT_INT64
-    number_attr: "num_sparse_features"
-  }
-  input_arg {
-    name: "sparse_feature_indices"
-    type: DT_INT64
-    number_attr: "num_sparse_features"
-  }
-  input_arg {
-    name: "sparse_feature_values"
-    type: DT_FLOAT
-    number_attr: "num_sparse_features_with_values"
-  }
-  input_arg {
-    name: "dense_features"
-    type: DT_FLOAT
-    number_attr: "num_dense_features"
-  }
-  input_arg {
-    name: "example_weights"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "example_labels"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "sparse_indices"
-    type: DT_INT64
-    number_attr: "num_sparse_features"
-  }
   input_arg {
-    name: "sparse_weights"
-    type: DT_FLOAT
-    number_attr: "num_sparse_features"
+    name: "accum_update"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "dense_weights"
-    type: DT_FLOAT
-    number_attr: "num_dense_features"
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "example_state_data"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "out_example_state_data"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "out_delta_sparse_weights"
-    type: DT_FLOAT
-    number_attr: "num_sparse_features"
-  }
-  output_arg {
-    name: "out_delta_dense_weights"
-    type: DT_FLOAT
-    number_attr: "num_dense_features"
-  }
-  attr {
-    name: "loss_type"
-    type: "string"
-    allowed_values {
-      list {
-        s: "logistic_loss"
-        s: "squared_loss"
-        s: "hinge_loss"
-        s: "smooth_hinge_loss"
-      }
-    }
-  }
-  attr {
-    name: "adaptative"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "num_sparse_features"
-    type: "int"
-    has_minimum: true
-  }
-  attr {
-    name: "num_sparse_features_with_values"
-    type: "int"
-    has_minimum: true
-  }
-  attr {
-    name: "num_dense_features"
-    type: "int"
-    has_minimum: true
-  }
-  attr {
-    name: "l1"
-    type: "float"
-  }
-  attr {
-    name: "l2"
-    type: "float"
-  }
-  attr {
-    name: "num_loss_partitions"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "num_inner_iterations"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "rho"
+    type_attr: "T"
   }
-}
-op {
-  name: "SdcaShrinkL1"
   input_arg {
-    name: "weights"
-    type: DT_FLOAT
-    number_attr: "num_features"
-    is_ref: true
-  }
-  attr {
-    name: "num_features"
-    type: "int"
-    has_minimum: true
-  }
-  attr {
-    name: "l1"
-    type: "float"
-  }
-  attr {
-    name: "l2"
-    type: "float"
+    name: "epsilon"
+    type_attr: "T"
   }
-}
-op {
-  name: "SegmentMax"
   input_arg {
-    name: "data"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "segment_ids"
+    name: "indices"
     type_attr: "Tindices"
   }
   output_arg {
-    name: "output"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -32078,13 +53396,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -32098,106 +53424,55 @@ op {
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SegmentMax"
+  name: "SparseApplyAdadelta"
   input_arg {
-    name: "data"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "segment_ids"
-    type_attr: "Tindices"
-  }
-  output_arg {
-    name: "output"
+    name: "accum"
     type_attr: "T"
+    is_ref: true
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-}
-op {
-  name: "SegmentMean"
   input_arg {
-    name: "data"
+    name: "accum_update"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "segment_ids"
-    type_attr: "Tindices"
-  }
-  output_arg {
-    name: "output"
+    name: "lr"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-      }
-    }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
   }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
   }
-}
-op {
-  name: "SegmentMean"
   input_arg {
-    name: "data"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "segment_ids"
+    name: "indices"
     type_attr: "Tindices"
   }
   output_arg {
-    name: "output"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -32207,11 +53482,17 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
         type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -32228,62 +53509,42 @@ op {
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SegmentMin"
+  name: "SparseApplyAdagrad"
   input_arg {
-    name: "data"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "segment_ids"
-    type_attr: "Tindices"
-  }
-  output_arg {
-    name: "output"
+    name: "accum"
     type_attr: "T"
+    is_ref: true
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-}
-op {
-  name: "SegmentMin"
   input_arg {
-    name: "data"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "segment_ids"
+    name: "indices"
     type_attr: "Tindices"
   }
   output_arg {
-    name: "output"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -32292,15 +53553,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -32314,20 +53578,42 @@ op {
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SegmentProd"
+  name: "SparseApplyAdagrad"
   input_arg {
-    name: "data"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "segment_ids"
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
     type_attr: "Tindices"
   }
   output_arg {
-    name: "output"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -32348,6 +53634,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -32361,20 +53649,42 @@ op {
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SegmentProd"
+  name: "SparseApplyAdagrad"
   input_arg {
-    name: "data"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "segment_ids"
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
     type_attr: "Tindices"
   }
   output_arg {
-    name: "output"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -32397,6 +53707,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -32410,20 +53721,42 @@ op {
       }
     }
   }
-}
-op {
-  name: "SegmentSum"
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseApplyAdagrad"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
   input_arg {
-    name: "data"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "segment_ids"
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
     type_attr: "Tindices"
   }
   output_arg {
-    name: "output"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -32432,18 +53765,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -32457,20 +53793,59 @@ op {
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SegmentSum"
+  name: "SparseApplyAdagradDA"
   input_arg {
-    name: "data"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "segment_ids"
+    name: "gradient_accumulator"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
     type_attr: "Tindices"
   }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
   output_arg {
-    name: "output"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -32491,8 +53866,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -32506,302 +53879,176 @@ op {
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "Select"
-  input_arg {
-    name: "condition"
-    type: DT_BOOL
-  }
+  name: "SparseApplyAdagradDA"
   input_arg {
-    name: "t"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "e"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
+    name: "gradient_accumulator"
     type_attr: "T"
+    is_ref: true
   }
-  attr {
-    name: "T"
-    type: "type"
-  }
-}
-op {
-  name: "SelfAdjointEig"
   input_arg {
-    name: "input"
+    name: "gradient_squared_accumulator"
     type_attr: "T"
+    is_ref: true
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-      }
-    }
-  }
-  deprecation {
-    version: 11
-  }
-}
-op {
-  name: "SelfAdjointEigV2"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
-  output_arg {
-    name: "e"
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
-  output_arg {
-    name: "v"
+  input_arg {
+    name: "l1"
     type_attr: "T"
   }
-  attr {
-    name: "compute_v"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-      }
-    }
-  }
-}
-op {
-  name: "SelfAdjointEigV2"
   input_arg {
-    name: "input"
+    name: "l2"
     type_attr: "T"
   }
-  output_arg {
-    name: "e"
-    type_attr: "T"
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
   }
   output_arg {
-    name: "v"
+    name: "out"
     type_attr: "T"
-  }
-  attr {
-    name: "compute_v"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-}
-op {
-  name: "Selu"
-  input_arg {
-    name: "features"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "activations"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
-}
-op {
-  name: "SeluGrad"
-  input_arg {
-    name: "gradients"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "outputs"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "backprops"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
 }
 op {
-  name: "SerializeIterator"
-  input_arg {
-    name: "resource_handle"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "serialized"
-    type: DT_VARIANT
-  }
-  is_stateful: true
-}
-op {
-  name: "SerializeManySparse"
-  input_arg {
-    name: "sparse_indices"
-    type: DT_INT64
-  }
+  name: "SparseApplyAdagradDA"
   input_arg {
-    name: "sparse_values"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "sparse_shape"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "serialized_sparse"
-    type: DT_STRING
-  }
-  attr {
-    name: "T"
-    type: "type"
+    name: "gradient_accumulator"
+    type_attr: "T"
+    is_ref: true
   }
-}
-op {
-  name: "SerializeSparse"
   input_arg {
-    name: "sparse_indices"
-    type: DT_INT64
+    name: "gradient_squared_accumulator"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "sparse_values"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "sparse_shape"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "serialized_sparse"
-    type: DT_STRING
-  }
-  attr {
-    name: "T"
-    type: "type"
+    name: "indices"
+    type_attr: "Tindices"
   }
-}
-op {
-  name: "SerializeTensor"
   input_arg {
-    name: "tensor"
+    name: "lr"
     type_attr: "T"
   }
-  output_arg {
-    name: "serialized"
-    type: DT_STRING
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-}
-op {
-  name: "SetSize"
   input_arg {
-    name: "set_indices"
-    type: DT_INT64
+    name: "l1"
+    type_attr: "T"
   }
   input_arg {
-    name: "set_values"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "set_shape"
+    name: "global_step"
     type: DT_INT64
   }
   output_arg {
-    name: "size"
-    type: DT_INT32
-  }
-  attr {
-    name: "validate_indices"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
         type: DT_UINT16
-        type: DT_STRING
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-}
-op {
-  name: "Shape"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "out_type"
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
   attr {
-    name: "out_type"
+    name: "Tindices"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
         type: DT_INT32
@@ -32809,364 +54056,460 @@ op {
       }
     }
   }
-}
-op {
-  name: "ShapeN"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-    number_attr: "N"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "out_type"
-    number_attr: "N"
-  }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
   attr {
-    name: "out_type"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      b: false
     }
   }
 }
 op {
-  name: "ShardedFilename"
+  name: "SparseApplyAdagradDA"
   input_arg {
-    name: "basename"
-    type: DT_STRING
+    name: "var"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "shard"
-    type: DT_INT32
+    name: "gradient_accumulator"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "num_shards"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "filename"
-    type: DT_STRING
+    name: "gradient_squared_accumulator"
+    type_attr: "T"
+    is_ref: true
   }
-}
-op {
-  name: "ShardedFilespec"
   input_arg {
-    name: "basename"
-    type: DT_STRING
+    name: "grad"
+    type_attr: "T"
   }
   input_arg {
-    name: "num_shards"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "filename"
-    type: DT_STRING
+    name: "indices"
+    type_attr: "Tindices"
   }
-}
-op {
-  name: "ShuffleDataset"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "buffer_size"
-    type: DT_INT64
+    name: "l1"
+    type_attr: "T"
   }
   input_arg {
-    name: "seed"
-    type: DT_INT64
+    name: "l2"
+    type_attr: "T"
   }
   input_arg {
-    name: "seed2"
+    name: "global_step"
     type: DT_INT64
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "ShuffleDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "buffer_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "seed"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "seed2"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
 }
 op {
-  name: "ShuffleDataset"
+  name: "SparseApplyCenteredRMSProp"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "var"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "buffer_size"
-    type: DT_INT64
+    name: "mg"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "seed"
-    type: DT_INT64
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "seed2"
-    type: DT_INT64
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
   }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "reshuffle_each_iteration"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
   }
-}
-op {
-  name: "Sigmoid"
   input_arg {
-    name: "x"
+    name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   output_arg {
-    name: "y"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SigmoidGrad"
+  name: "SparseApplyCenteredRMSProp"
   input_arg {
-    name: "x"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "y"
+    name: "mg"
     type_attr: "T"
+    is_ref: true
   }
-  output_arg {
-    name: "z"
+  input_arg {
+    name: "ms"
     type_attr: "T"
+    is_ref: true
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
   }
-}
-op {
-  name: "SigmoidGrad"
   input_arg {
-    name: "y"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "dy"
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   output_arg {
-    name: "z"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-}
-op {
-  name: "Sign"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "Sin"
+  name: "SparseApplyCenteredRMSProp"
   input_arg {
-    name: "x"
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mg"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   output_arg {
-    name: "y"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "Sinh"
+  name: "SparseApplyCenteredRMSProp"
   input_arg {
-    name: "x"
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mg"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "ms"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "mom"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   output_arg {
-    name: "y"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
         type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-}
-op {
-  name: "Size"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "out_type"
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
   attr {
-    name: "out_type"
+    name: "Tindices"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
         type: DT_INT32
@@ -33174,150 +54517,172 @@ op {
       }
     }
   }
-}
-op {
-  name: "SkipDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "count"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "SkipDataset"
+  name: "SparseApplyFtrl"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "var"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "count"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
   }
-}
-op {
-  name: "Skipgram"
-  output_arg {
-    name: "vocab_word"
-    type: DT_STRING
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-  output_arg {
-    name: "vocab_freq"
-    type: DT_INT32
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
-  output_arg {
-    name: "words_per_epoch"
-    type: DT_INT64
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  output_arg {
-    name: "current_epoch"
-    type: DT_INT32
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-  output_arg {
-    name: "total_words_processed"
-    type: DT_INT64
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-  output_arg {
-    name: "examples"
-    type: DT_INT32
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
   }
   output_arg {
-    name: "labels"
-    type: DT_INT32
-  }
-  attr {
-    name: "filename"
-    type: "string"
-  }
-  attr {
-    name: "batch_size"
-    type: "int"
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
-    name: "window_size"
-    type: "int"
-    default_value {
-      i: 5
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
     }
   }
   attr {
-    name: "min_count"
-    type: "int"
-    default_value {
-      i: 5
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "subsample"
-    type: "float"
+    name: "use_locking"
+    type: "bool"
     default_value {
-      f: 0.001
+      b: false
     }
   }
-  deprecation {
-    version: 19
-  }
-  is_stateful: true
 }
 op {
-  name: "Slice"
+  name: "SparseApplyFtrl"
   input_arg {
-    name: "input"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "begin"
-    type_attr: "Index"
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "size"
-    type_attr: "Index"
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "Index"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
@@ -33326,68 +54691,59 @@ op {
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "Softmax"
+  name: "SparseApplyFtrl"
   input_arg {
-    name: "logits"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
-  output_arg {
-    name: "softmax"
+  input_arg {
+    name: "accum"
     type_attr: "T"
+    is_ref: true
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-}
-op {
-  name: "SoftmaxCrossEntropyWithLogits"
   input_arg {
-    name: "features"
+    name: "linear"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "labels"
+    name: "grad"
     type_attr: "T"
   }
-  output_arg {
-    name: "loss"
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
-  output_arg {
-    name: "backprop"
+  input_arg {
+    name: "l1"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
   }
-}
-op {
-  name: "Softplus"
   input_arg {
-    name: "features"
+    name: "lr_power"
     type_attr: "T"
   }
   output_arg {
-    name: "activations"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -33396,26 +54752,87 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "Softplus"
+  name: "SparseApplyFtrl"
   input_arg {
-    name: "features"
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
     type_attr: "T"
   }
   output_arg {
-    name: "activations"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -33425,31 +54842,90 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
         type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SoftplusGrad"
+  name: "SparseApplyFtrlV2"
   input_arg {
-    name: "gradients"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "features"
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
     type_attr: "T"
   }
   output_arg {
-    name: "backprops"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -33458,30 +54934,88 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SoftplusGrad"
+  name: "SparseApplyFtrlV2"
   input_arg {
-    name: "gradients"
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "features"
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
     type_attr: "T"
   }
   output_arg {
-    name: "backprops"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -33490,56 +55024,90 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-}
-op {
-  name: "Softsign"
-  input_arg {
-    name: "features"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "activations"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "Softsign"
+  name: "SparseApplyFtrlV2"
   input_arg {
-    name: "features"
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "linear"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
     type_attr: "T"
   }
   output_arg {
-    name: "activations"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -33548,64 +55116,91 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SoftsignGrad"
+  name: "SparseApplyFtrlV2"
   input_arg {
-    name: "gradients"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "features"
+    name: "accum"
     type_attr: "T"
+    is_ref: true
   }
-  output_arg {
-    name: "backprops"
+  input_arg {
+    name: "linear"
     type_attr: "T"
+    is_ref: true
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-      }
-    }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
   }
-}
-op {
-  name: "SoftsignGrad"
   input_arg {
-    name: "gradients"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "features"
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
     type_attr: "T"
   }
   output_arg {
-    name: "backprops"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -33615,42 +55210,26 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
         type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-}
-op {
-  name: "SpaceToBatch"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
   attr {
-    name: "Tpaddings"
+    name: "Tindices"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
         type: DT_INT32
@@ -33659,53 +55238,71 @@ op {
     }
   }
   attr {
-    name: "block_size"
-    type: "int"
-    has_minimum: true
-    minimum: 2
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
 }
 op {
-  name: "SpaceToBatchND"
+  name: "SparseApplyMomentum"
   input_arg {
-    name: "input"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "block_shape"
-    type_attr: "Tblock_shape"
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "Tblock_shape"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "Tpaddings"
+    name: "Tindices"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
         type: DT_INT32
@@ -33713,88 +55310,56 @@ op {
       }
     }
   }
-}
-op {
-  name: "SpaceToDepth"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
-    type: "type"
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "block_size"
-    type: "int"
-    has_minimum: true
-    minimum: 2
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
 }
 op {
-  name: "SpaceToDepth"
+  name: "SparseApplyMomentum"
   input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
-  attr {
-    name: "T"
-    type: "type"
-  }
-  attr {
-    name: "block_size"
-    type: "int"
-    has_minimum: true
-    minimum: 2
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-        s: "NCHW_VECT_C"
-      }
-    }
-  }
-}
-op {
-  name: "SparseAccumulatorApplyGradient"
   input_arg {
-    name: "handle"
-    type: DT_STRING
+    name: "accum"
+    type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "local_step"
-    type: DT_INT64
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "gradient_indices"
-    type: DT_INT64
+    name: "grad"
+    type_attr: "T"
   }
   input_arg {
-    name: "gradient_values"
-    type_attr: "dtype"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "gradient_shape"
-    type: DT_INT64
+    name: "momentum"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
@@ -33812,39 +55377,71 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "has_known_shape"
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
     type: "bool"
+    default_value {
+      b: false
+    }
   }
 }
 op {
-  name: "SparseAccumulatorApplyGradient"
+  name: "SparseApplyMomentum"
   input_arg {
-    name: "handle"
-    type: DT_STRING
+    name: "var"
+    type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "local_step"
-    type: DT_INT64
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "gradient_indices"
-    type: DT_INT64
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "gradient_values"
-    type_attr: "dtype"
+    name: "grad"
+    type_attr: "T"
   }
   input_arg {
-    name: "gradient_shape"
-    type: DT_INT64
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
@@ -33864,149 +55461,154 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "has_known_shape"
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
     type: "bool"
+    default_value {
+      b: false
+    }
   }
 }
 op {
-  name: "SparseAccumulatorTakeGradient"
+  name: "SparseApplyMomentum"
   input_arg {
-    name: "handle"
-    type: DT_STRING
+    name: "var"
+    type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "num_required"
-    type: DT_INT32
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
   }
-  output_arg {
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
     name: "indices"
-    type: DT_INT64
+    type_attr: "Tindices"
   }
-  output_arg {
-    name: "values"
-    type_attr: "dtype"
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
   }
   output_arg {
-    name: "shape"
-    type: DT_INT64
+    name: "out"
+    type_attr: "T"
+    is_ref: true
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-}
-op {
-  name: "SparseAccumulatorTakeGradient"
-  input_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  input_arg {
-    name: "num_required"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "indices"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "values"
-    type_attr: "dtype"
-  }
-  output_arg {
-    name: "shape"
-    type: DT_INT64
-  }
   attr {
-    name: "dtype"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SparseAdd"
+  name: "SparseApplyProximalAdagrad"
   input_arg {
-    name: "a_indices"
-    type: DT_INT64
+    name: "var"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "a_values"
+    name: "accum"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "a_shape"
-    type: DT_INT64
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "b_indices"
-    type: DT_INT64
+    name: "l1"
+    type_attr: "T"
   }
   input_arg {
-    name: "b_values"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "b_shape"
-    type: DT_INT64
+    name: "grad"
+    type_attr: "T"
   }
   input_arg {
-    name: "thresh"
-    type_attr: "Treal"
-  }
-  output_arg {
-    name: "sum_indices"
-    type: DT_INT64
+    name: "indices"
+    type_attr: "Tindices"
   }
   output_arg {
-    name: "sum_values"
+    name: "out"
     type_attr: "T"
-  }
-  output_arg {
-    name: "sum_shape"
-    type: DT_INT64
+    is_ref: true
   }
   attr {
     name: "T"
@@ -34031,64 +55633,59 @@ op {
     }
   }
   attr {
-    name: "Treal"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SparseAdd"
+  name: "SparseApplyProximalAdagrad"
   input_arg {
-    name: "a_indices"
-    type: DT_INT64
+    name: "var"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "a_values"
+    name: "accum"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "a_shape"
-    type: DT_INT64
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "b_indices"
-    type: DT_INT64
+    name: "l1"
+    type_attr: "T"
   }
   input_arg {
-    name: "b_values"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "b_shape"
-    type: DT_INT64
+    name: "grad"
+    type_attr: "T"
   }
   input_arg {
-    name: "thresh"
-    type_attr: "Treal"
-  }
-  output_arg {
-    name: "sum_indices"
-    type: DT_INT64
+    name: "indices"
+    type_attr: "Tindices"
   }
   output_arg {
-    name: "sum_values"
+    name: "out"
     type_attr: "T"
-  }
-  output_arg {
-    name: "sum_shape"
-    type: DT_INT64
+    is_ref: true
   }
   attr {
     name: "T"
@@ -34115,50 +55712,59 @@ op {
     }
   }
   attr {
-    name: "Treal"
+    name: "Tindices"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SparseAddGrad"
+  name: "SparseApplyProximalAdagrad"
   input_arg {
-    name: "backprop_val_grad"
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "a_indices"
-    type: DT_INT64
+    name: "l2"
+    type_attr: "T"
   }
   input_arg {
-    name: "b_indices"
-    type: DT_INT64
+    name: "grad"
+    type_attr: "T"
   }
   input_arg {
-    name: "sum_indices"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "a_val_grad"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   output_arg {
-    name: "b_val_grad"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -34179,35 +55785,66 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SparseAddGrad"
+  name: "SparseApplyProximalAdagrad"
   input_arg {
-    name: "backprop_val_grad"
+    name: "var"
     type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "a_indices"
-    type: DT_INT64
+    name: "accum"
+    type_attr: "T"
+    is_ref: true
   }
   input_arg {
-    name: "b_indices"
-    type: DT_INT64
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "sum_indices"
-    type: DT_INT64
+    name: "l1"
+    type_attr: "T"
   }
-  output_arg {
-    name: "a_val_grad"
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
   output_arg {
-    name: "b_val_grad"
+    name: "out"
     type_attr: "T"
+    is_ref: true
   }
   attr {
     name: "T"
@@ -34216,51 +55853,59 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "SparseApplyAdadelta"
+  name: "SparseApplyProximalGradientDescent"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "accum_update"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "lr"
+    name: "alpha"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
@@ -34317,32 +55962,22 @@ op {
   }
 }
 op {
-  name: "SparseApplyAdadelta"
+  name: "SparseApplyProximalGradientDescent"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "accum_update"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "lr"
+    name: "alpha"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
@@ -34401,19 +56036,22 @@ op {
   }
 }
 op {
-  name: "SparseApplyAdagrad"
+  name: "SparseApplyProximalGradientDescent"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
+    name: "alpha"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "lr"
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
@@ -34448,6 +56086,9 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -34470,19 +56111,22 @@ op {
   }
 }
 op {
-  name: "SparseApplyAdagrad"
+  name: "SparseApplyProximalGradientDescent"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "accum"
+    name: "alpha"
     type_attr: "T"
-    is_ref: true
   }
   input_arg {
-    name: "lr"
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
@@ -34505,17 +56149,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -34541,45 +56186,45 @@ op {
   }
 }
 op {
-  name: "SparseApplyAdagradDA"
+  name: "SparseApplyRMSProp"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "gradient_accumulator"
+    name: "ms"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "gradient_squared_accumulator"
+    name: "mom"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "grad"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "rho"
+    type_attr: "T"
   }
   input_arg {
-    name: "lr"
+    name: "momentum"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "global_step"
-    type: DT_INT64
+    name: "indices"
+    type_attr: "Tindices"
   }
   output_arg {
     name: "out"
@@ -34627,45 +56272,45 @@ op {
   }
 }
 op {
-  name: "SparseApplyAdagradDA"
+  name: "SparseApplyRMSProp"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "gradient_accumulator"
+    name: "ms"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "gradient_squared_accumulator"
+    name: "mom"
     type_attr: "T"
     is_ref: true
   }
   input_arg {
-    name: "grad"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "rho"
+    type_attr: "T"
   }
   input_arg {
-    name: "lr"
+    name: "momentum"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "global_step"
-    type: DT_INT64
+    name: "indices"
+    type_attr: "Tindices"
   }
   output_arg {
     name: "out"
@@ -34715,17 +56360,12 @@ op {
   }
 }
 op {
-  name: "SparseApplyCenteredRMSProp"
+  name: "SparseApplyRMSProp"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
-  input_arg {
-    name: "mg"
-    type_attr: "T"
-    is_ref: true
-  }
   input_arg {
     name: "ms"
     type_attr: "T"
@@ -34784,6 +56424,9 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -34806,17 +56449,12 @@ op {
   }
 }
 op {
-  name: "SparseApplyCenteredRMSProp"
+  name: "SparseApplyRMSProp"
   input_arg {
     name: "var"
     type_attr: "T"
     is_ref: true
   }
-  input_arg {
-    name: "mg"
-    type_attr: "T"
-    is_ref: true
-  }
   input_arg {
     name: "ms"
     type_attr: "T"
@@ -34863,17 +56501,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -34899,53 +56538,58 @@ op {
   }
 }
 op {
-  name: "SparseApplyFtrl"
-  input_arg {
-    name: "var"
-    type_attr: "T"
-    is_ref: true
-  }
+  name: "SparseConcat"
   input_arg {
-    name: "accum"
-    type_attr: "T"
-    is_ref: true
+    name: "indices"
+    type: DT_INT64
+    number_attr: "N"
   }
   input_arg {
-    name: "linear"
+    name: "values"
     type_attr: "T"
-    is_ref: true
+    number_attr: "N"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "shapes"
+    type: DT_INT64
+    number_attr: "N"
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
   }
-  input_arg {
-    name: "lr"
+  output_arg {
+    name: "output_values"
     type_attr: "T"
   }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
   }
-  input_arg {
-    name: "l2"
-    type_attr: "T"
+  attr {
+    name: "concat_dim"
+    type: "int"
   }
-  input_arg {
-    name: "lr_power"
-    type_attr: "T"
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "T"
+    type: "type"
   }
+}
+op {
+  name: "SparseConditionalAccumulator"
   output_arg {
-    name: "out"
-    type_attr: "T"
+    name: "handle"
+    type: DT_STRING
     is_ref: true
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
@@ -34967,160 +56611,294 @@ op {
     }
   }
   attr {
-    name: "Tindices"
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "SparseConditionalAccumulator"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "dtype"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "use_locking"
-    type: "bool"
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "container"
+    type: "string"
     default_value {
-      b: false
+      s: ""
     }
   }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "SparseApplyFtrl"
-  input_arg {
-    name: "var"
-    type_attr: "T"
+  name: "SparseConditionalAccumulator"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
     is_ref: true
   }
-  input_arg {
-    name: "accum"
-    type_attr: "T"
-    is_ref: true
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
-  input_arg {
-    name: "linear"
-    type_attr: "T"
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "SparseConditionalAccumulator"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
     is_ref: true
   }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
+  is_stateful: true
+}
+op {
+  name: "SparseCross"
   input_arg {
     name: "indices"
-    type_attr: "Tindices"
+    type: DT_INT64
+    number_attr: "N"
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "values"
+    type_list_attr: "sparse_types"
   }
   input_arg {
-    name: "l1"
-    type_attr: "T"
+    name: "shapes"
+    type: DT_INT64
+    number_attr: "N"
   }
   input_arg {
-    name: "l2"
-    type_attr: "T"
+    name: "dense_inputs"
+    type_list_attr: "dense_types"
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "hashed_output"
+    type: "bool"
+  }
+  attr {
+    name: "num_buckets"
+    type: "int"
+    has_minimum: true
   }
-  input_arg {
-    name: "lr_power"
-    type_attr: "T"
+  attr {
+    name: "hash_key"
+    type: "int"
   }
-  output_arg {
-    name: "out"
-    type_attr: "T"
-    is_ref: true
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "dense_types"
+    type: "list(type)"
+    has_minimum: true
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_STRING
       }
     }
   }
   attr {
-    name: "Tindices"
+    name: "out_type"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
         type: DT_INT64
+        type: DT_STRING
       }
     }
   }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
+    name: "internal_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
     }
   }
 }
 op {
-  name: "SparseApplyFtrlV2"
-  input_arg {
-    name: "var"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "accum"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "linear"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
+  name: "SparseDenseCwiseAdd"
   input_arg {
-    name: "l1"
-    type_attr: "T"
+    name: "sp_indices"
+    type: DT_INT64
   }
   input_arg {
-    name: "l2"
+    name: "sp_values"
     type_attr: "T"
   }
   input_arg {
-    name: "l2_shrinkage"
-    type_attr: "T"
+    name: "sp_shape"
+    type: DT_INT64
   }
   input_arg {
-    name: "lr_power"
+    name: "dense"
     type_attr: "T"
   }
   output_arg {
-    name: "out"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
@@ -35144,73 +56922,28 @@ op {
       }
     }
   }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "SparseApplyFtrlV2"
-  input_arg {
-    name: "var"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "accum"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "linear"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
+  name: "SparseDenseCwiseAdd"
   input_arg {
-    name: "l1"
-    type_attr: "T"
+    name: "sp_indices"
+    type: DT_INT64
   }
   input_arg {
-    name: "l2"
+    name: "sp_values"
     type_attr: "T"
   }
   input_arg {
-    name: "l2_shrinkage"
-    type_attr: "T"
+    name: "sp_shape"
+    type: DT_INT64
   }
   input_arg {
-    name: "lr_power"
+    name: "dense"
     type_attr: "T"
   }
   output_arg {
-    name: "out"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
@@ -35236,56 +56969,28 @@ op {
       }
     }
   }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "SparseApplyMomentum"
-  input_arg {
-    name: "var"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "accum"
-    type_attr: "T"
-    is_ref: true
-  }
+  name: "SparseDenseCwiseAdd"
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "sp_indices"
+    type: DT_INT64
   }
   input_arg {
-    name: "grad"
+    name: "sp_values"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "sp_shape"
+    type: DT_INT64
   }
   input_arg {
-    name: "momentum"
+    name: "dense"
     type_attr: "T"
   }
   output_arg {
-    name: "out"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
@@ -35306,66 +57011,34 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "use_nesterov"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "SparseApplyMomentum"
-  input_arg {
-    name: "var"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "accum"
-    type_attr: "T"
-    is_ref: true
-  }
+  name: "SparseDenseCwiseAdd"
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "sp_indices"
+    type: DT_INT64
   }
   input_arg {
-    name: "grad"
+    name: "sp_values"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "sp_shape"
+    type: DT_INT64
   }
   input_arg {
-    name: "momentum"
+    name: "dense"
     type_attr: "T"
   }
   output_arg {
-    name: "out"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
@@ -35374,84 +57047,46 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "use_nesterov"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "SparseApplyProximalAdagrad"
-  input_arg {
-    name: "var"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "accum"
-    type_attr: "T"
-    is_ref: true
-  }
+  name: "SparseDenseCwiseDiv"
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "sp_indices"
+    type: DT_INT64
   }
   input_arg {
-    name: "l1"
+    name: "sp_values"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
-    type_attr: "T"
+    name: "sp_shape"
+    type: DT_INT64
   }
   input_arg {
-    name: "grad"
+    name: "dense"
     type_attr: "T"
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
   output_arg {
-    name: "out"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
@@ -35475,60 +57110,28 @@ op {
       }
     }
   }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "SparseApplyProximalAdagrad"
-  input_arg {
-    name: "var"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "accum"
-    type_attr: "T"
-    is_ref: true
-  }
+  name: "SparseDenseCwiseDiv"
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "sp_indices"
+    type: DT_INT64
   }
   input_arg {
-    name: "l1"
+    name: "sp_values"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
-    type_attr: "T"
+    name: "sp_shape"
+    type: DT_INT64
   }
   input_arg {
-    name: "grad"
+    name: "dense"
     type_attr: "T"
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
   output_arg {
-    name: "out"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
@@ -35554,55 +57157,28 @@ op {
       }
     }
   }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "SparseApplyProximalGradientDescent"
-  input_arg {
-    name: "var"
-    type_attr: "T"
-    is_ref: true
-  }
+  name: "SparseDenseCwiseDiv"
   input_arg {
-    name: "alpha"
-    type_attr: "T"
+    name: "sp_indices"
+    type: DT_INT64
   }
   input_arg {
-    name: "l1"
+    name: "sp_values"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
-    type_attr: "T"
+    name: "sp_shape"
+    type: DT_INT64
   }
   input_arg {
-    name: "grad"
+    name: "dense"
     type_attr: "T"
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
   output_arg {
-    name: "out"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
@@ -35623,58 +57199,34 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "SparseApplyProximalGradientDescent"
-  input_arg {
-    name: "var"
-    type_attr: "T"
-    is_ref: true
-  }
+  name: "SparseDenseCwiseDiv"
   input_arg {
-    name: "alpha"
-    type_attr: "T"
+    name: "sp_indices"
+    type: DT_INT64
   }
   input_arg {
-    name: "l1"
+    name: "sp_values"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
-    type_attr: "T"
+    name: "sp_shape"
+    type: DT_INT64
   }
   input_arg {
-    name: "grad"
+    name: "dense"
     type_attr: "T"
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
   output_arg {
-    name: "out"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
@@ -35683,86 +57235,46 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-}
-op {
-  name: "SparseApplyRMSProp"
-  input_arg {
-    name: "var"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "ms"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "mom"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
+}
+op {
+  name: "SparseDenseCwiseMul"
   input_arg {
-    name: "rho"
-    type_attr: "T"
+    name: "sp_indices"
+    type: DT_INT64
   }
   input_arg {
-    name: "momentum"
+    name: "sp_values"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
-    type_attr: "T"
+    name: "sp_shape"
+    type: DT_INT64
   }
   input_arg {
-    name: "grad"
+    name: "dense"
     type_attr: "T"
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
   output_arg {
-    name: "out"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
@@ -35786,69 +57298,28 @@ op {
       }
     }
   }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "SparseApplyRMSProp"
-  input_arg {
-    name: "var"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "ms"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "mom"
-    type_attr: "T"
-    is_ref: true
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
+  name: "SparseDenseCwiseMul"
   input_arg {
-    name: "rho"
-    type_attr: "T"
+    name: "sp_indices"
+    type: DT_INT64
   }
   input_arg {
-    name: "momentum"
+    name: "sp_values"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
-    type_attr: "T"
+    name: "sp_shape"
+    type: DT_INT64
   }
   input_arg {
-    name: "grad"
+    name: "dense"
     type_attr: "T"
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
   output_arg {
-    name: "out"
+    name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
@@ -35874,78 +57345,32 @@ op {
       }
     }
   }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
-  name: "SparseConcat"
+  name: "SparseDenseCwiseMul"
   input_arg {
-    name: "indices"
+    name: "sp_indices"
     type: DT_INT64
-    number_attr: "N"
   }
   input_arg {
-    name: "values"
+    name: "sp_values"
     type_attr: "T"
-    number_attr: "N"
   }
   input_arg {
-    name: "shapes"
-    type: DT_INT64
-    number_attr: "N"
-  }
-  output_arg {
-    name: "output_indices"
+    name: "sp_shape"
     type: DT_INT64
   }
-  output_arg {
-    name: "output_values"
+  input_arg {
+    name: "dense"
     type_attr: "T"
   }
   output_arg {
-    name: "output_shape"
-    type: DT_INT64
-  }
-  attr {
-    name: "concat_dim"
-    type: "int"
-  }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 2
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-}
-op {
-  name: "SparseConditionalAccumulator"
-  output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  attr {
-    name: "dtype"
-    type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
@@ -35962,99 +57387,78 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
-  attr {
-    name: "shape"
-    type: "shape"
+}
+op {
+  name: "SparseDenseCwiseMul"
+  input_arg {
+    name: "sp_indices"
+    type: DT_INT64
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "sp_values"
+    type_attr: "T"
   }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "sp_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "SparseConditionalAccumulator"
   output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "shape"
-    type: "shape"
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  is_stateful: true
 }
 op {
-  name: "SparseCross"
+  name: "SparseFillEmptyRows"
   input_arg {
     name: "indices"
     type: DT_INT64
-    number_attr: "N"
   }
   input_arg {
     name: "values"
-    type_list_attr: "sparse_types"
+    type_attr: "T"
   }
   input_arg {
-    name: "shapes"
+    name: "dense_shape"
     type: DT_INT64
-    number_attr: "N"
   }
   input_arg {
-    name: "dense_inputs"
-    type_list_attr: "dense_types"
+    name: "default_value"
+    type_attr: "T"
   }
   output_arg {
     name: "output_indices"
@@ -36062,95 +57466,142 @@ op {
   }
   output_arg {
     name: "output_values"
-    type_attr: "out_type"
+    type_attr: "T"
   }
   output_arg {
-    name: "output_shape"
+    name: "empty_row_indicator"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "reverse_index_map"
     type: DT_INT64
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "SparseFillEmptyRowsGrad"
+  input_arg {
+    name: "reverse_index_map"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "grad_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "d_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "d_default_value"
+    type_attr: "T"
   }
   attr {
-    name: "hashed_output"
-    type: "bool"
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "SparseMatMul"
+  input_arg {
+    name: "a"
+    type_attr: "Ta"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "Tb"
+  }
+  output_arg {
+    name: "product"
+    type: DT_FLOAT
   }
   attr {
-    name: "num_buckets"
-    type: "int"
-    has_minimum: true
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "hash_key"
-    type: "int"
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "sparse_types"
-    type: "list(type)"
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_INT64
-        type: DT_STRING
-      }
+    name: "a_is_sparse"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
-    name: "dense_types"
-    type: "list(type)"
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_INT64
-        type: DT_STRING
-      }
+    name: "b_is_sparse"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
-    name: "out_type"
+    name: "Ta"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
-        type: DT_INT64
-        type: DT_STRING
+        type: DT_FLOAT
+        type: DT_BFLOAT16
       }
     }
   }
   attr {
-    name: "internal_type"
+    name: "Tb"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
-        type: DT_INT64
-        type: DT_STRING
+        type: DT_FLOAT
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "SparseDenseCwiseAdd"
+  name: "SparseReduceMax"
   input_arg {
-    name: "sp_indices"
+    name: "input_indices"
     type: DT_INT64
   }
   input_arg {
-    name: "sp_values"
+    name: "input_values"
     type_attr: "T"
   }
   input_arg {
-    name: "sp_shape"
+    name: "input_shape"
     type: DT_INT64
   }
   input_arg {
-    name: "dense"
-    type_attr: "T"
+    name: "reduction_axes"
+    type: DT_INT32
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
@@ -36158,44 +57609,46 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "SparseDenseCwiseAdd"
+  name: "SparseReduceMax"
   input_arg {
-    name: "sp_indices"
+    name: "input_indices"
     type: DT_INT64
   }
   input_arg {
-    name: "sp_values"
+    name: "input_values"
     type_attr: "T"
   }
   input_arg {
-    name: "sp_shape"
+    name: "input_shape"
     type: DT_INT64
   }
   input_arg {
-    name: "dense"
-    type_attr: "T"
+    name: "reduction_axes"
+    type: DT_INT32
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
@@ -36203,17 +57656,12 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -36222,27 +57670,34 @@ op {
   }
 }
 op {
-  name: "SparseDenseCwiseDiv"
+  name: "SparseReduceMax"
   input_arg {
-    name: "sp_indices"
+    name: "input_indices"
     type: DT_INT64
   }
   input_arg {
-    name: "sp_values"
+    name: "input_values"
     type_attr: "T"
   }
   input_arg {
-    name: "sp_shape"
+    name: "input_shape"
     type: DT_INT64
   }
   input_arg {
-    name: "dense"
-    type_attr: "T"
+    name: "reduction_axes"
+    type: DT_INT32
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
@@ -36250,44 +57705,49 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "SparseDenseCwiseDiv"
+  name: "SparseReduceMax"
   input_arg {
-    name: "sp_indices"
+    name: "input_indices"
     type: DT_INT64
   }
   input_arg {
-    name: "sp_values"
+    name: "input_values"
     type_attr: "T"
   }
   input_arg {
-    name: "sp_shape"
+    name: "input_shape"
     type: DT_INT64
   }
   input_arg {
-    name: "dense"
-    type_attr: "T"
+    name: "reduction_axes"
+    type: DT_INT32
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
@@ -36295,17 +57755,13 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -36314,27 +57770,42 @@ op {
   }
 }
 op {
-  name: "SparseDenseCwiseMul"
+  name: "SparseReduceMaxSparse"
   input_arg {
-    name: "sp_indices"
+    name: "input_indices"
     type: DT_INT64
   }
   input_arg {
-    name: "sp_values"
+    name: "input_values"
     type_attr: "T"
   }
   input_arg {
-    name: "sp_shape"
+    name: "input_shape"
     type: DT_INT64
   }
   input_arg {
-    name: "dense"
-    type_attr: "T"
+    name: "reduction_axes"
+    type: DT_INT32
   }
   output_arg {
-    name: "output"
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
     type_attr: "T"
   }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
@@ -36342,44 +57813,54 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "SparseDenseCwiseMul"
+  name: "SparseReduceMaxSparse"
   input_arg {
-    name: "sp_indices"
+    name: "input_indices"
     type: DT_INT64
   }
   input_arg {
-    name: "sp_values"
+    name: "input_values"
     type_attr: "T"
   }
   input_arg {
-    name: "sp_shape"
+    name: "input_shape"
     type: DT_INT64
   }
   input_arg {
-    name: "dense"
-    type_attr: "T"
+    name: "reduction_axes"
+    type: DT_INT32
   }
   output_arg {
-    name: "output"
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
     type_attr: "T"
   }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
@@ -36387,17 +57868,12 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -36406,22 +57882,22 @@ op {
   }
 }
 op {
-  name: "SparseFillEmptyRows"
+  name: "SparseReduceMaxSparse"
   input_arg {
-    name: "indices"
+    name: "input_indices"
     type: DT_INT64
   }
   input_arg {
-    name: "values"
+    name: "input_values"
     type_attr: "T"
   }
   input_arg {
-    name: "dense_shape"
+    name: "input_shape"
     type: DT_INT64
   }
   input_arg {
-    name: "default_value"
-    type_attr: "T"
+    name: "reduction_axes"
+    type: DT_INT32
   }
   output_arg {
     name: "output_indices"
@@ -36432,112 +57908,97 @@ op {
     type_attr: "T"
   }
   output_arg {
-    name: "empty_row_indicator"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "reverse_index_map"
+    name: "output_shape"
     type: DT_INT64
   }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
 }
 op {
-  name: "SparseFillEmptyRowsGrad"
+  name: "SparseReduceMaxSparse"
   input_arg {
-    name: "reverse_index_map"
+    name: "input_indices"
     type: DT_INT64
   }
   input_arg {
-    name: "grad_values"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "d_values"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "d_default_value"
+    name: "input_values"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-  }
-}
-op {
-  name: "SparseMatMul"
   input_arg {
-    name: "a"
-    type_attr: "Ta"
+    name: "input_shape"
+    type: DT_INT64
   }
   input_arg {
-    name: "b"
-    type_attr: "Tb"
+    name: "reduction_axes"
+    type: DT_INT32
   }
   output_arg {
-    name: "product"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "transpose_a"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "output_indices"
+    type: DT_INT64
   }
-  attr {
-    name: "transpose_b"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
   }
-  attr {
-    name: "a_is_sparse"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
   }
   attr {
-    name: "b_is_sparse"
+    name: "keep_dims"
     type: "bool"
     default_value {
       b: false
     }
   }
   attr {
-    name: "Ta"
-    type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_BFLOAT16
-      }
-    }
-  }
-  attr {
-    name: "Tb"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
         type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "SparseReduceMax"
+  name: "SparseReduceSum"
   input_arg {
     name: "input_indices"
     type: DT_INT64
@@ -36572,19 +58033,24 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "SparseReduceMax"
+  name: "SparseReduceSum"
   input_arg {
     name: "input_indices"
     type: DT_INT64
@@ -36619,12 +58085,17 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -36633,7 +58104,7 @@ op {
   }
 }
 op {
-  name: "SparseReduceMaxSparse"
+  name: "SparseReduceSum"
   input_arg {
     name: "input_indices"
     type: DT_INT64
@@ -36651,17 +58122,9 @@ op {
     type: DT_INT32
   }
   output_arg {
-    name: "output_indices"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "output_values"
+    name: "output"
     type_attr: "T"
   }
-  output_arg {
-    name: "output_shape"
-    type: DT_INT64
-  }
   attr {
     name: "keep_dims"
     type: "bool"
@@ -36676,19 +58139,27 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "SparseReduceMaxSparse"
+  name: "SparseReduceSum"
   input_arg {
     name: "input_indices"
     type: DT_INT64
@@ -36706,17 +58177,9 @@ op {
     type: DT_INT32
   }
   output_arg {
-    name: "output_indices"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "output_values"
+    name: "output"
     type_attr: "T"
   }
-  output_arg {
-    name: "output_shape"
-    type: DT_INT64
-  }
   attr {
     name: "keep_dims"
     type: "bool"
@@ -36732,11 +58195,17 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
         type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -36745,7 +58214,7 @@ op {
   }
 }
 op {
-  name: "SparseReduceSum"
+  name: "SparseReduceSumSparse"
   input_arg {
     name: "input_indices"
     type: DT_INT64
@@ -36763,9 +58232,17 @@ op {
     type: DT_INT32
   }
   output_arg {
-    name: "output"
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
     type_attr: "T"
   }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
   attr {
     name: "keep_dims"
     type: "bool"
@@ -36797,7 +58274,7 @@ op {
   }
 }
 op {
-  name: "SparseReduceSum"
+  name: "SparseReduceSumSparse"
   input_arg {
     name: "input_indices"
     type: DT_INT64
@@ -36815,9 +58292,17 @@ op {
     type: DT_INT32
   }
   output_arg {
-    name: "output"
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
     type_attr: "T"
   }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
   attr {
     name: "keep_dims"
     type: "bool"
@@ -36906,6 +58391,9 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -36954,17 +58442,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -37017,13 +58506,307 @@ op {
     name: "output_indices"
     type: DT_INT64
   }
-  output_arg {
-    name: "output_shape"
-    type: DT_INT64
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+}
+op {
+  name: "SparseSegmentMean"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSegmentMeanGrad"
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "output_dim0"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSegmentMeanWithNumSegments"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSegmentSqrtN"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSegmentSqrtNGrad"
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "output_dim0"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSegmentSqrtNWithNumSegments"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
 }
 op {
-  name: "SparseSegmentMean"
+  name: "SparseSegmentSum"
   input_arg {
     name: "data"
     type_attr: "T"
@@ -37047,6 +58830,13 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
@@ -37065,9 +58855,9 @@ op {
   }
 }
 op {
-  name: "SparseSegmentMeanGrad"
+  name: "SparseSegmentSum"
   input_arg {
-    name: "grad"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
@@ -37078,10 +58868,6 @@ op {
     name: "segment_ids"
     type: DT_INT32
   }
-  input_arg {
-    name: "output_dim0"
-    type: DT_INT32
-  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -37093,6 +58879,15 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -37111,7 +58906,7 @@ op {
   }
 }
 op {
-  name: "SparseSegmentSqrtN"
+  name: "SparseSegmentSum"
   input_arg {
     name: "data"
     type_attr: "T"
@@ -37135,6 +58930,16 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -37153,9 +58958,9 @@ op {
   }
 }
 op {
-  name: "SparseSegmentSqrtNGrad"
+  name: "SparseSegmentSum"
   input_arg {
-    name: "grad"
+    name: "data"
     type_attr: "T"
   }
   input_arg {
@@ -37166,10 +58971,6 @@ op {
     name: "segment_ids"
     type: DT_INT32
   }
-  input_arg {
-    name: "output_dim0"
-    type: DT_INT32
-  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -37181,6 +58982,16 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -37199,7 +59010,7 @@ op {
   }
 }
 op {
-  name: "SparseSegmentSum"
+  name: "SparseSegmentSumWithNumSegments"
   input_arg {
     name: "data"
     type_attr: "T"
@@ -37212,6 +59023,10 @@ op {
     name: "segment_ids"
     type: DT_INT32
   }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -37230,6 +59045,9 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -37246,9 +59064,22 @@ op {
       }
     }
   }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
-  name: "SparseSegmentSum"
+  name: "SparseSegmentSumWithNumSegments"
   input_arg {
     name: "data"
     type_attr: "T"
@@ -37261,6 +59092,10 @@ op {
     name: "segment_ids"
     type: DT_INT32
   }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -37273,10 +59108,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -37297,6 +59133,19 @@ op {
       }
     }
   }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
   name: "SparseSlice"
@@ -37409,6 +59258,102 @@ op {
     }
   }
 }
+op {
+  name: "SparseSoftmaxCrossEntropyWithLogits"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "labels"
+    type_attr: "Tlabels"
+  }
+  output_arg {
+    name: "loss"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprop"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tlabels"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSparseMaximum"
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
 op {
   name: "SparseSparseMaximum"
   input_arg {
@@ -37457,6 +59402,8 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -37511,6 +59458,119 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SparseSparseMaximum"
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSparseMinimum"
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
@@ -37568,6 +59628,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -37627,6 +59689,67 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "SparseSparseMinimum"
+  input_arg {
+    name: "a_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -37787,6 +59910,122 @@ op {
     }
   }
 }
+op {
+  name: "SparseTensorDenseAdd"
+  input_arg {
+    name: "a_indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseTensorDenseAdd"
+  input_arg {
+    name: "a_indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "SparseTensorDenseMatMul"
   input_arg {
@@ -38140,6 +60379,31 @@ op {
     }
   }
 }
+op {
+  name: "Sqrt"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "SqrtGrad"
   input_arg {
@@ -38196,6 +60460,61 @@ op {
     }
   }
 }
+op {
+  name: "SqrtGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Square"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Square"
   input_arg {
@@ -38206,6 +60525,37 @@ op {
     name: "y"
     type_attr: "T"
   }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "SquaredDifference"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
@@ -38221,6 +60571,7 @@ op {
       }
     }
   }
+  is_commutative: true
 }
 op {
   name: "SquaredDifference"
@@ -38242,6 +60593,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -38664,6 +61016,61 @@ op {
     }
   }
 }
+op {
+  name: "StatelessRandomNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "StatelessRandomUniform"
   input_arg {
@@ -38706,6 +61113,61 @@ op {
     }
   }
 }
+op {
+  name: "StatelessRandomUniform"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "StatelessTruncatedNormal"
   input_arg {
@@ -38748,6 +61210,61 @@ op {
     }
   }
 }
+op {
+  name: "StatelessTruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "StatsAggregatorHandle"
   output_arg {
@@ -39181,27 +61698,57 @@ op {
   }
 }
 op {
-  name: "StringToNumber"
+  name: "StringToNumber"
+  input_arg {
+    name: "string_tensor"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Sub"
   input_arg {
-    name: "string_tensor"
-    type: DT_STRING
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "out_type"
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "out_type"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
@@ -39228,6 +61775,10 @@ op {
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
         type: DT_COMPLEX64
@@ -39256,6 +61807,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -39415,6 +61967,126 @@ op {
     }
   }
 }
+op {
+  name: "Sum"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Sum"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "Svd"
   input_arg {
@@ -39560,6 +62232,39 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TFRecordReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "compression_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  deprecation {
+    version: 26
+  }
+  is_stateful: true
+}
 op {
   name: "TFRecordReaderV2"
   output_arg {
@@ -39708,6 +62413,33 @@ op {
     }
   }
 }
+op {
+  name: "Tan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Tanh"
   input_arg {
@@ -39732,6 +62464,31 @@ op {
     }
   }
 }
+op {
+  name: "Tanh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "TanhGrad"
   input_arg {
@@ -39788,6 +62545,35 @@ op {
     }
   }
 }
+op {
+  name: "TanhGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "TemporaryVariable"
   output_arg {
@@ -39880,6 +62666,16 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "TensorArrayCloseV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  deprecation {
+    version: 26
+  }
+}
 op {
   name: "TensorArrayCloseV3"
   input_arg {
@@ -40057,6 +62853,41 @@ op {
     }
   }
 }
+op {
+  name: "TensorArrayGatherV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "element_shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  deprecation {
+    version: 26
+  }
+}
 op {
   name: "TensorArrayGatherV3"
   input_arg {
@@ -40134,6 +62965,29 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorArrayGradV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "grad_handle"
+    type: DT_STRING
+  }
+  attr {
+    name: "source"
+    type: "string"
+  }
+  deprecation {
+    version: 26
+  }
+  is_stateful: true
+}
 op {
   name: "TensorArrayGradV3"
   input_arg {
@@ -40240,6 +63094,32 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorArrayReadV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "value"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  deprecation {
+    version: 26
+  }
+}
 op {
   name: "TensorArrayReadV3"
   input_arg {
@@ -40322,6 +63202,36 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorArrayScatterV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 26
+  }
+}
 op {
   name: "TensorArrayScatterV3"
   input_arg {
@@ -40384,6 +63294,24 @@ op {
     type: DT_INT32
   }
 }
+op {
+  name: "TensorArraySizeV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  deprecation {
+    version: 26
+  }
+}
 op {
   name: "TensorArraySizeV3"
   input_arg {
@@ -40458,6 +63386,36 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorArraySplitV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 26
+  }
+}
 op {
   name: "TensorArraySplitV3"
   input_arg {
@@ -40559,6 +63517,55 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorArrayV2"
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "element_shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  attr {
+    name: "dynamic_size"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "clear_after_read"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "tensor_array_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  deprecation {
+    version: 26
+  }
+  is_stateful: true
+}
 op {
   name: "TensorArrayV3"
   input_arg {
@@ -40664,117 +63671,339 @@ op {
       s: ""
     }
   }
-  is_stateful: true
+  is_stateful: true
+}
+op {
+  name: "TensorArrayWrite"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 16
+  }
+}
+op {
+  name: "TensorArrayWriteV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "TensorArrayWriteV2"
+  input_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  deprecation {
+    version: 26
+  }
+}
+op {
+  name: "TensorArrayWriteV3"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorDataset"
+  input_arg {
+    name: "components"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorListElementShape"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorListFromTensor"
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorListGetItem"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "item"
+    type_attr: "element_dtype"
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
 }
 op {
-  name: "TensorArrayWrite"
+  name: "TensorListLength"
   input_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+    name: "input_handle"
+    type: DT_VARIANT
   }
-  input_arg {
-    name: "index"
+  output_arg {
+    name: "length"
     type: DT_INT32
   }
+}
+op {
+  name: "TensorListPopBack"
   input_arg {
-    name: "value"
-    type_attr: "T"
+    name: "input_handle"
+    type: DT_VARIANT
   }
-  input_arg {
-    name: "flow_in"
-    type: DT_FLOAT
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
   }
   output_arg {
-    name: "flow_out"
-    type: DT_FLOAT
+    name: "tensor"
+    type_attr: "element_dtype"
   }
   attr {
-    name: "T"
+    name: "element_dtype"
     type: "type"
   }
-  deprecation {
-    version: 16
-  }
 }
 op {
-  name: "TensorArrayWriteV2"
+  name: "TensorListPushBack"
   input_arg {
-    name: "handle"
-    type: DT_STRING
+    name: "input_handle"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "index"
-    type: DT_INT32
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
   }
+}
+op {
+  name: "TensorListReserve"
   input_arg {
-    name: "value"
-    type_attr: "T"
+    name: "element_shape"
+    type_attr: "shape_type"
   }
   input_arg {
-    name: "flow_in"
-    type: DT_FLOAT
+    name: "num_elements"
+    type: DT_INT32
   }
   output_arg {
-    name: "flow_out"
-    type: DT_FLOAT
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "T"
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
 }
 op {
-  name: "TensorArrayWriteV3"
+  name: "TensorListSetItem"
   input_arg {
-    name: "handle"
-    type: DT_RESOURCE
+    name: "input_handle"
+    type: DT_VARIANT
   }
   input_arg {
     name: "index"
     type: DT_INT32
   }
   input_arg {
-    name: "value"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "flow_in"
-    type: DT_FLOAT
+    name: "item"
+    type_attr: "element_dtype"
   }
   output_arg {
-    name: "flow_out"
-    type: DT_FLOAT
+    name: "output_handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "T"
+    name: "element_dtype"
     type: "type"
   }
-  is_stateful: true
 }
 op {
-  name: "TensorDataset"
+  name: "TensorListStack"
   input_arg {
-    name: "components"
-    type_list_attr: "Toutput_types"
+    name: "input_handle"
+    type: DT_VARIANT
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "tensor"
+    type_attr: "element_dtype"
   }
   attr {
-    name: "Toutput_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "element_dtype"
+    type: "type"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "num_elements"
+    type: "int"
+    default_value {
+      i: -1
+    }
   }
-  is_stateful: true
 }
 op {
   name: "TensorSliceDataset"
@@ -40910,6 +64139,39 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TextLineReader"
+  output_arg {
+    name: "reader_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "skip_header_lines"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  deprecation {
+    version: 26
+  }
+  is_stateful: true
+}
 op {
   name: "TextLineReaderV2"
   output_arg {
@@ -41200,6 +64462,149 @@ op {
     version: 7
   }
 }
+op {
+  name: "TopK"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  attr {
+    name: "k"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "sorted"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  deprecation {
+    version: 7
+  }
+}
+op {
+  name: "TopK"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  attr {
+    name: "k"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "sorted"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  deprecation {
+    version: 7
+  }
+}
+op {
+  name: "TopKV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "k"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  attr {
+    name: "sorted"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+}
 op {
   name: "TopKV2"
   input_arg {
@@ -41239,6 +64644,8 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -41284,6 +64691,53 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "TopKV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "k"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  attr {
+    name: "sorted"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -41354,6 +64808,68 @@ op {
     }
   }
 }
+op {
+  name: "TruncateDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "TruncateMod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "TruncateMod"
   input_arg {
@@ -41375,58 +64891,189 @@ op {
       list {
         type: DT_INT32
         type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "TruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Unbatch"
+  input_arg {
+    name: "batched_tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "batch_index"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "id"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "unbatched_tensor"
+    type_attr: "T"
+  }
+  attr {
+    name: "timeout_micros"
+    type: "int"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
 }
 op {
-  name: "TruncatedNormal"
+  name: "UnbatchGrad"
   input_arg {
-    name: "shape"
+    name: "original_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "batch_index"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "id"
+    type: DT_INT64
+  }
   output_arg {
-    name: "output"
-    type_attr: "dtype"
+    name: "batched_grad"
+    type_attr: "T"
   }
   attr {
-    name: "seed"
-    type: "int"
+    name: "container"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "shared_name"
+    type: "string"
     default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "dtype"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
+      s: ""
     }
   }
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
   }
-  is_stateful: true
 }
 op {
   name: "UniformCandidateSampler"
@@ -41571,6 +65218,114 @@ op {
     }
   }
 }
+op {
+  name: "UniqueDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "UniqueV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "idx"
+    type_attr: "out_idx"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_idx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UniqueV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Taxis"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "idx"
+    type_attr: "out_idx"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Taxis"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "out_idx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "UniqueWithCounts"
   input_arg {
@@ -41635,6 +65390,34 @@ op {
     }
   }
 }
+op {
+  name: "UnravelIndex"
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "dims"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tidx"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "UnsortedSegmentMax"
   input_arg {
@@ -41729,6 +65512,130 @@ op {
     }
   }
 }
+op {
+  name: "UnsortedSegmentMax"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UnsortedSegmentMax"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "UnsortedSegmentSum"
   input_arg {
@@ -41833,6 +65740,140 @@ op {
     }
   }
 }
+op {
+  name: "UnsortedSegmentSum"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "UnsortedSegmentSum"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "Unstage"
   output_arg {
@@ -42084,6 +66125,86 @@ op {
     }
   }
 }
+op {
+  name: "Where"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "index"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_BOOL
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+        type: DT_BOOL
+      }
+    }
+  }
+}
+op {
+  name: "Where"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "index"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_BOOL
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BOOL
+      }
+    }
+  }
+}
 op {
   name: "WholeFileReader"
   output_arg {
diff --git a/tensorflow/core/ops/control_flow_ops.cc b/tensorflow/core/ops/control_flow_ops.cc
index 61089658d71db9e5db95660c4addf3aec3849338..81e9fcfa959dc906f34a2a1bf6cc77aefe4aaeaf 100644
--- a/tensorflow/core/ops/control_flow_ops.cc
+++ b/tensorflow/core/ops/control_flow_ops.cc
@@ -47,20 +47,7 @@ REGISTER_OP("Switch")
     .Output("output_false: T")
     .Output("output_true: T")
     .Attr("T: type")
-    .SetShapeFn(SwitchShape)
-    .Doc(R"doc(
-Forwards `data` to the output port determined by `pred`.
-
-If `pred` is true, the `data` input is forwarded to `output_true`. Otherwise,
-the data goes to `output_false`.
-
-See also `RefSwitch` and `Merge`.
-
-data: The tensor to be forwarded to the appropriate output.
-pred: A scalar that specifies which output port will receive data.
-output_false: If `pred` is false, data will be forwarded to this output.
-output_true: If `pred` is true, data will be forwarded to this output.
-)doc");
+    .SetShapeFn(SwitchShape);
 
 REGISTER_OP("RefSwitch")
     .Input("data: Ref(T)")
@@ -69,20 +56,7 @@ REGISTER_OP("RefSwitch")
     .Output("output_true: Ref(T)")
     .Attr("T: type")
     .SetAllowsUninitializedInput()
-    .SetShapeFn(SwitchShape)
-    .Doc(R"doc(
-Forwards the ref tensor `data` to the output port determined by `pred`.
-
-If `pred` is true, the `data` input is forwarded to `output_true`. Otherwise,
-the data goes to `output_false`.
-
-See also `Switch` and `Merge`.
-
-data: The ref tensor to be forwarded to the appropriate output.
-pred: A scalar that specifies which output port will receive data.
-output_false: If `pred` is false, data will be forwarded to this output.
-output_true: If `pred` is true, data will be forwarded to this output.
-)doc");
+    .SetShapeFn(SwitchShape);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("RefSelect")
@@ -110,14 +84,7 @@ REGISTER_OP("RefSelect")
       }
       c->set_output(0, first_input);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Forwards the `index`th element of `inputs` to `output`.
-
-index: A scalar that determines the input that gets selected.
-inputs: A list of ref tensors, one of which will be forwarded to `output`.
-output: The forwarded tensor.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 namespace {
@@ -153,20 +120,7 @@ REGISTER_OP("Merge")
     .Output("value_index: int32")
     .Attr("T: type")
     .Attr("N: int >= 1")
-    .SetShapeFn(MergeShape)
-    .Doc(R"doc(
-Forwards the value of an available tensor from `inputs` to `output`.
-
-`Merge` waits for at least one of the tensors in `inputs` to become available.
-It is usually combined with `Switch` to implement branching.
-
-`Merge` forwards the first tensor to become available to `output`, and sets
-`value_index` to its index in `inputs`.
-
-inputs: The input tensors, exactly one of which will become available.
-output: Will be set to the available input tensor.
-value_index: The index of the chosen input tensor in `inputs`.
-)doc");
+    .SetShapeFn(MergeShape);
 
 REGISTER_OP("RefMerge")
     .Input("inputs: Ref(N * T)")
@@ -174,20 +128,7 @@ REGISTER_OP("RefMerge")
     .Output("value_index: int32")
     .Attr("T: type")
     .Attr("N: int >= 1")
-    .SetShapeFn(MergeShape)
-    .Doc(R"doc(
-Forwards the value of an available tensor from `inputs` to `output`.
-
-`Merge` waits for at least one of the tensors in `inputs` to become available.
-It is usually combined with `Switch` to implement branching.
-
-`Merge` forwards the first tensor for become available to `output`, and sets
-`value_index` to its index in `inputs`.
-
-inputs: The input tensors, exactly one of which will become available.
-output: Will be set to the available input tensor.
-value_index: The index of the chosen input tensor in `inputs`.
-)doc");
+    .SetShapeFn(MergeShape);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("Enter")
@@ -214,22 +155,7 @@ REGISTER_OP("Enter")
       }
 
       return Status::OK();
-    })
-    .Doc(R"doc(
-Creates or finds a child frame, and makes `data` available to the child frame.
-
-This op is used together with `Exit` to create loops in the graph.
-The unique `frame_name` is used by the `Executor` to identify frames. If
-`is_constant` is true, `output` is a constant in the child frame; otherwise
-it may be changed in the child frame. At most `parallel_iterations` iterations
-are run in parallel in the child frame.
-
-data: The tensor to be made available to the child frame.
-frame_name: The name of the child frame.
-is_constant: If true, the output is constant within the child frame.
-parallel_iterations: The number of iterations allowed to run in parallel.
-output: The same tensor as `data`.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("RefEnter")
@@ -239,75 +165,33 @@ REGISTER_OP("RefEnter")
     .Attr("frame_name: string")
     .Attr("is_constant: bool = false")
     .Attr("parallel_iterations: int = 10")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Creates or finds a child frame, and makes `data` available to the child frame.
-
-The unique `frame_name` is used by the `Executor` to identify frames. If
-`is_constant` is true, `output` is a constant in the child frame; otherwise
-it may be changed in the child frame. At most `parallel_iterations` iterations
-are run in parallel in the child frame.
-
-data: The tensor to be made available to the child frame.
-frame_name: The name of the child frame.
-is_constant: If true, the output is constant within the child frame.
-parallel_iterations: The number of iterations allowed to run in parallel.
-output: The same tensor as `data`.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("Exit")
     .Input("data: T")
     .Output("output: T")
     .Attr("T: type")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Exits the current frame to its parent frame.
-
-Exit makes its input `data` available to the parent frame.
-
-data: The tensor to be made available to the parent frame.
-output: The same tensor as `data`.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("RefExit")
     .Input("data: Ref(T)")
     .Output("output: Ref(T)")
     .Attr("T: type")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Exits the current frame to its parent frame.
-
-Exit makes its input `data` available to the parent frame.
-
-data: The tensor to be made available to the parent frame.
-output: The same tensor as `data`.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("NextIteration")
     .Input("data: T")
     .Output("output: T")
     .Attr("T: type")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Makes its input available to the next iteration.
-
-data: The tensor to be made available to the next iteration.
-output: The same tensor as `data`.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("RefNextIteration")
     .Input("data: Ref(T)")
     .Output("output: Ref(T)")
     .Attr("T: type")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Makes its input available to the next iteration.
-
-data: The tensor to be made available to the next iteration.
-output: The same tensor as `data`.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("LoopCond")
@@ -315,40 +199,15 @@ REGISTER_OP("LoopCond")
     .Output("output: bool")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRank(c, 0);
-    })
-    .Doc(R"doc(
-Forwards the input to the output.
-
-This operator represents the loop termination condition used by the
-"pivot" switches of a loop.
-
-input: A boolean scalar, representing the branch predicate of the Switch op.
-output: The same tensor as `input`.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
-REGISTER_OP("ControlTrigger")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"docstring(
-Does nothing. Serves as a control trigger for scheduling.
-
-Only useful as a placeholder for control edges.
-)docstring");
+REGISTER_OP("ControlTrigger").SetShapeFn(shape_inference::NoOutputs);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("Abort")
     .Attr("error_msg: string = ''")
     .Attr("exit_without_error: bool = false")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Raise a exception to abort the process when called.
-
-If exit_without_error is true, the process will exit normally,
-otherwise it will exit with a SIGABORT signal.
-
-Returns nothing but an exception.
-
-error_msg: A string which is the message associated with the exception.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/ctc_ops.cc b/tensorflow/core/ops/ctc_ops.cc
index 1a69106d80ba9962f40a2353637b456294ae22d6..f2322c730bc8b18d2229b0ea4cc5ded0f8021ef5 100644
--- a/tensorflow/core/ops/ctc_ops.cc
+++ b/tensorflow/core/ops/ctc_ops.cc
@@ -59,30 +59,7 @@ REGISTER_OP("CTCLoss")
       c->set_output(0, c->Vector(batch_size));
       c->set_output(1, inputs);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Calculates the CTC Loss (log probability) for each batch entry.  Also calculates
-the gradient.  This class performs the softmax operation for you, so inputs
-should be e.g. linear projections of outputs by an LSTM.
-
-inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-labels_indices: The indices of a `SparseTensor<int32, 2>`.
-  `labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for
-  `(batch b, time t)`.
-labels_values: The values (labels) associated with the given batch and time.
-sequence_length: A vector containing sequence lengths (batch).
-preprocess_collapse_repeated: Scalar, if true then repeated labels are
-  collapsed prior to the CTC calculation.
-ctc_merge_repeated: Scalar.  If set to false, *during* CTC calculation
-  repeated non-blank labels will not be merged and are interpreted as
-  individual labels.  This is a simplified version of CTC.
-ignore_longer_outputs_than_inputs: Scalar. If set to true, during CTC
-  calculation, items that have longer output sequences than input sequences
-  are skipped: they don't contribute to the loss term and have zero-gradient.
-loss: A vector (batch) containing log-probabilities.
-gradient: The gradient of `loss`.  3-D, shape:
-  `(max_time x batch_size x num_classes)`.
-)doc");
+    });
 
 REGISTER_OP("CTCGreedyDecoder")
     .Input("inputs: float")
@@ -110,32 +87,7 @@ REGISTER_OP("CTCGreedyDecoder")
       c->set_output(2, c->Vector(2));
       c->set_output(3, c->Matrix(batch_size, 1));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Performs greedy decoding on the logits given in inputs.
-
-A note about the attribute merge_repeated: if enabled, when
-consecutive logits' maximum indices are the same, only the first of
-these is emitted.  Labeling the blank '*', the sequence "A B B * B B"
-becomes "A B B" if merge_repeated = True and "A B B B B" if
-merge_repeated = False.
-
-Regardless of the value of merge_repeated, if the maximum index of a given
-time and batch corresponds to the blank, index `(num_classes - 1)`, no new
-element is emitted.
-
-inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-sequence_length: A vector containing sequence lengths, size `(batch_size)`.
-merge_repeated: If True, merge repeated classes in output.
-decoded_indices: Indices matrix, size `(total_decoded_outputs x 2)`,
-  of a `SparseTensor<int64, 2>`.  The rows store: [batch, time].
-decoded_values: Values vector, size: `(total_decoded_outputs)`,
-  of a `SparseTensor<int64, 2>`.  The vector stores the decoded classes.
-decoded_shape: Shape vector, size `(2)`, of the decoded SparseTensor.
-  Values are: `[batch_size, max_decoded_length]`.
-log_probability: Matrix, size `(batch_size x 1)`, containing sequence
-  log-probabilities.
-)doc");
+    });
 
 REGISTER_OP("CTCBeamSearchDecoder")
     .Input("inputs: float")
@@ -176,32 +128,6 @@ REGISTER_OP("CTCBeamSearchDecoder")
       }
       c->set_output(out_idx++, c->Matrix(batch_size, top_paths));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Performs beam search decoding on the logits given in input.
-
-A note about the attribute merge_repeated: For the beam search decoder,
-this means that if consecutive entries in a beam are the same, only
-the first of these is emitted.  That is, when the top path is "A B B B B",
-"A B" is returned if merge_repeated = True but "A B B B B" is
-returned if merge_repeated = False.
-
-inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-sequence_length: A vector containing sequence lengths, size `(batch)`.
-beam_width: A scalar >= 0 (beam search beam width).
-top_paths: A scalar >= 0, <= beam_width (controls output size).
-merge_repeated: If true, merge repeated classes in output.
-decoded_indices: A list (length: top_paths) of indices matrices.  Matrix j,
-  size `(total_decoded_outputs[j] x 2)`, has indices of a
-  `SparseTensor<int64, 2>`.  The rows store: [batch, time].
-decoded_values: A list (length: top_paths) of values vectors.  Vector j,
-  size `(length total_decoded_outputs[j])`, has the values of a
-  `SparseTensor<int64, 2>`.  The vector stores the decoded classes for beam j.
-decoded_shape: A list (length: top_paths) of shape vector.  Vector j,
-  size `(2)`, stores the shape of the decoded `SparseTensor[j]`.
-  Its values are: `[batch_size, max_decoded_length[j]]`.
-log_probability: A matrix, shaped: `(batch_size x top_paths)`.  The
-  sequence log-probabilities.
-)doc");
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/data_flow_ops.cc b/tensorflow/core/ops/data_flow_ops.cc
index ac2dc601f1f6b48905f1269b8726ac30ba5dda67..4f946fb3ca7608816180351b7753d01f13d469f2 100644
--- a/tensorflow/core/ops/data_flow_ops.cc
+++ b/tensorflow/core/ops/data_flow_ops.cc
@@ -84,51 +84,7 @@ REGISTER_OP("DynamicPartition")
       }
 
       return Status::OK();
-    })
-    .Doc(R"doc(
-Partitions `data` into `num_partitions` tensors using indices from `partitions`.
-
-For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
-becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
-are placed in `outputs[i]` in lexicographic order of `js`, and the first
-dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
-In detail,
-
-```python
-    outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
-
-    outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
-```
-
-`data.shape` must start with `partitions.shape`.
-
-For example:
-
-```python
-    # Scalar partitions.
-    partitions = 1
-    num_partitions = 2
-    data = [10, 20]
-    outputs[0] = []  # Empty with shape [0, 2]
-    outputs[1] = [[10, 20]]
-
-    # Vector partitions.
-    partitions = [0, 0, 1, 1, 0]
-    num_partitions = 2
-    data = [10, 20, 30, 40, 50]
-    outputs[0] = [10, 20, 50]
-    outputs[1] = [30, 40]
-```
-
-See `dynamic_stitch` for an example on how to merge partitions back.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
-</div>
-
-partitions: Any shape.  Indices in the range `[0, num_partitions)`.
-num_partitions: The number of partitions to output.
-)doc");
+    });
 
 namespace {
 
@@ -189,73 +145,7 @@ REGISTER_OP("DynamicStitch")
     .Output("merged: T")
     .Attr("N : int >= 1")
     .Attr("T : type")
-    .SetShapeFn(DynamicStitchShapeFunction)
-    .Doc(R"doc(
-Interleave the values from the `data` tensors into a single tensor.
-
-Builds a merged tensor such that
-
-```python
-    merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
-```
-
-For example, if each `indices[m]` is scalar or vector, we have
-
-```python
-    # Scalar indices:
-    merged[indices[m], ...] = data[m][...]
-
-    # Vector indices:
-    merged[indices[m][i], ...] = data[m][i, ...]
-```
-
-Each `data[i].shape` must start with the corresponding `indices[i].shape`,
-and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
-must have `data[i].shape = indices[i].shape + constant`.  In terms of this
-`constant`, the output shape is
-
-    merged.shape = [max(indices)] + constant
-
-Values are merged in order, so if an index appears in both `indices[m][i]` and
-`indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the
-merged result. If you do not need this guarantee, ParallelDynamicStitch might
-perform better on some devices.
-
-For example:
-
-```python
-    indices[0] = 6
-    indices[1] = [4, 1]
-    indices[2] = [[5, 2], [0, 3]]
-    data[0] = [61, 62]
-    data[1] = [[41, 42], [11, 12]]
-    data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
-    merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
-              [51, 52], [61, 62]]
-```
-
-This method can be used to merge partitions created by `dynamic_partition`
-as illustrated on the following example:
-
-```python
-    # Apply function (increments x_i) on elements for which a certain condition
-    # apply (x_i != -1 in this example).
-    x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
-    condition_mask=tf.not_equal(x,tf.constant(-1.))
-    partitioned_data = tf.dynamic_partition(
-        x, tf.cast(condition_mask, tf.int32) , 2)
-    partitioned_data[1] = partitioned_data[1] + 1.0
-    condition_indices = tf.dynamic_partition(
-        tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
-    x = tf.dynamic_stitch(condition_indices, partitioned_data)
-    # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
-    # unchanged.
-```
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
-</div>
-)doc");
+    .SetShapeFn(DynamicStitchShapeFunction);
 
 REGISTER_OP("ParallelDynamicStitch")
     .Input("indices: N * int32")
@@ -263,72 +153,7 @@ REGISTER_OP("ParallelDynamicStitch")
     .Output("merged: T")
     .Attr("N : int >= 1")
     .Attr("T : type")
-    .SetShapeFn(DynamicStitchShapeFunction)
-    .Doc(R"doc(
-Interleave the values from the `data` tensors into a single tensor.
-
-Builds a merged tensor such that
-
-```python
-    merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
-```
-
-For example, if each `indices[m]` is scalar or vector, we have
-
-```python
-    # Scalar indices:
-    merged[indices[m], ...] = data[m][...]
-
-    # Vector indices:
-    merged[indices[m][i], ...] = data[m][i, ...]
-```
-
-Each `data[i].shape` must start with the corresponding `indices[i].shape`,
-and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
-must have `data[i].shape = indices[i].shape + constant`.  In terms of this
-`constant`, the output shape is
-
-    merged.shape = [max(indices)] + constant
-
-Values may be merged in parallel, so if an index appears in both `indices[m][i]`
-and `indices[n][j]`, the result may be invalid. This differs from the normal
-DynamicStitch operator that defines the behavior in that case.
-
-For example:
-
-```python
-    indices[0] = 6
-    indices[1] = [4, 1]
-    indices[2] = [[5, 2], [0, 3]]
-    data[0] = [61, 62]
-    data[1] = [[41, 42], [11, 12]]
-    data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
-    merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
-              [51, 52], [61, 62]]
-```
-
-This method can be used to merge partitions created by `dynamic_partition`
-as illustrated on the following example:
-
-```python
-    # Apply function (increments x_i) on elements for which a certain condition
-    # apply (x_i != -1 in this example).
-    x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
-    condition_mask=tf.not_equal(x,tf.constant(-1.))
-    partitioned_data = tf.dynamic_partition(
-        x, tf.cast(condition_mask, tf.int32) , 2)
-    partitioned_data[1] = partitioned_data[1] + 1.0
-    condition_indices = tf.dynamic_partition(
-        tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
-    x = tf.dynamic_stitch(condition_indices, partitioned_data)
-    # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
-    # unchanged.
-```
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
-</div>
-)doc");
+    .SetShapeFn(DynamicStitchShapeFunction);
 
 // --------------------------------------------------------------------------
 
@@ -346,29 +171,10 @@ Status TwoElementVectorInputsAndScalarOutputs(InferenceContext* c) {
   return Status::OK();
 }
 
-Status ScalarAndTwoElementVectorInputsAndScalarOutputs(InferenceContext* c) {
-  ShapeHandle handle;
-  DimensionHandle unused_handle;
-  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
-  for (int i = 1; i < c->num_inputs(); ++i) {
-    TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &handle));
-    TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_handle));
-  }
-  for (int i = 0; i < c->num_outputs(); ++i) {
-    c->set_output(i, c->Scalar());
-  }
-  return Status::OK();
-}
-
 Status TwoElementOutput(InferenceContext* c) {
   c->set_output(0, c->Vector(2));
   return Status::OK();
 }
-
-Status ScalarOutput(InferenceContext* c) {
-  c->set_output(0, c->Scalar());
-  return Status::OK();
-}
 }  // namespace
 
 REGISTER_OP("RandomShuffleQueue")
@@ -382,29 +188,7 @@ REGISTER_OP("RandomShuffleQueue")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
-    .SetShapeFn(TwoElementOutput)
-    .Doc(R"doc(
-A queue that randomizes the order of elements.
-
-handle: The handle to the queue.
-component_types: The type of each component in a value.
-shapes: The shape of each component in a value. The length of this attr must
-  be either 0 or the same as the length of component_types. If the length of
-  this attr is 0, the shapes of queue elements are not constrained, and
-  only one element may be dequeued at a time.
-capacity: The upper bound on the number of elements in this queue.
-  Negative numbers mean no limit.
-min_after_dequeue: Dequeue will block unless there would be this
-  many elements after the dequeue or the queue is closed. This
-  ensures a minimum level of mixing of elements.
-seed: If either seed or seed2 is set to be non-zero, the random number
-  generator is seeded by the given seed.  Otherwise, a random seed is used.
-seed2: A second seed to avoid seed collision.
-container: If non-empty, this queue is placed in the given container.
-        Otherwise, a default container is used.
-shared_name: If non-empty, this queue will be shared under the given name
-  across multiple sessions.
-)doc");
+    .SetShapeFn(TwoElementOutput);
 
 REGISTER_OP("RandomShuffleQueueV2")
     .Output("handle: resource")
@@ -417,29 +201,7 @@ REGISTER_OP("RandomShuffleQueueV2")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-A queue that randomizes the order of elements.
-
-handle: The handle to the queue.
-component_types: The type of each component in a value.
-shapes: The shape of each component in a value. The length of this attr must
-  be either 0 or the same as the length of component_types. If the length of
-  this attr is 0, the shapes of queue elements are not constrained, and
-  only one element may be dequeued at a time.
-capacity: The upper bound on the number of elements in this queue.
-  Negative numbers mean no limit.
-min_after_dequeue: Dequeue will block unless there would be this
-  many elements after the dequeue or the queue is closed. This
-  ensures a minimum level of mixing of elements.
-seed: If either seed or seed2 is set to be non-zero, the random number
-  generator is seeded by the given seed.  Otherwise, a random seed is used.
-seed2: A second seed to avoid seed collision.
-container: If non-empty, this queue is placed in the given container.
-        Otherwise, a default container is used.
-shared_name: If non-empty, this queue will be shared under the given name
-  across multiple sessions.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("FIFOQueue")
     .Output("handle: Ref(string)")
@@ -449,23 +211,7 @@ REGISTER_OP("FIFOQueue")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
-    .SetShapeFn(TwoElementOutput)
-    .Doc(R"doc(
-A queue that produces elements in first-in first-out order.
-
-handle: The handle to the queue.
-component_types: The type of each component in a value.
-shapes: The shape of each component in a value. The length of this attr must
-  be either 0 or the same as the length of component_types. If the length of
-  this attr is 0, the shapes of queue elements are not constrained, and
-  only one element may be dequeued at a time.
-capacity: The upper bound on the number of elements in this queue.
-  Negative numbers mean no limit.
-container: If non-empty, this queue is placed in the given container.
-        Otherwise, a default container is used.
-shared_name: If non-empty, this queue will be shared under the given name
-  across multiple sessions.
-)doc");
+    .SetShapeFn(TwoElementOutput);
 
 REGISTER_OP("FIFOQueueV2")
     .Output("handle: resource")
@@ -475,23 +221,7 @@ REGISTER_OP("FIFOQueueV2")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-A queue that produces elements in first-in first-out order.
-
-handle: The handle to the queue.
-component_types: The type of each component in a value.
-shapes: The shape of each component in a value. The length of this attr must
-  be either 0 or the same as the length of component_types. If the length of
-  this attr is 0, the shapes of queue elements are not constrained, and
-  only one element may be dequeued at a time.
-capacity: The upper bound on the number of elements in this queue.
-  Negative numbers mean no limit.
-container: If non-empty, this queue is placed in the given container.
-        Otherwise, a default container is used.
-shared_name: If non-empty, this queue will be shared under the given name
-  across multiple sessions.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("PaddingFIFOQueue")
     .Output("handle: Ref(string)")
@@ -501,31 +231,7 @@ REGISTER_OP("PaddingFIFOQueue")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
-    .SetShapeFn(TwoElementOutput)
-    .Doc(R"doc(
-A queue that produces elements in first-in first-out order.
-
-Variable-size shapes are allowed by setting the corresponding shape dimensions
-to 0 in the shape attr.  In this case DequeueMany will pad up to the maximum
-size of any given element in the minibatch.  See below for details.
-
-handle: The handle to the queue.
-component_types: The type of each component in a value.
-shapes: The shape of each component in a value. The length of this attr must
-  be either 0 or the same as the length of component_types.
-  Shapes of fixed rank but variable size are allowed by setting
-  any shape dimension to -1.  In this case, the inputs' shape may vary along
-  the given dimension, and DequeueMany will pad the given dimension with
-  zeros up to the maximum shape of all elements in the given batch.
-  If the length of this attr is 0, different queue elements may have
-  different ranks and shapes, but only one element may be dequeued at a time.
-capacity: The upper bound on the number of elements in this queue.
-  Negative numbers mean no limit.
-container: If non-empty, this queue is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this queue will be shared under the given name
-  across multiple sessions.
-)doc");
+    .SetShapeFn(TwoElementOutput);
 
 REGISTER_OP("PaddingFIFOQueueV2")
     .Output("handle: resource")
@@ -535,31 +241,7 @@ REGISTER_OP("PaddingFIFOQueueV2")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-A queue that produces elements in first-in first-out order.
-
-Variable-size shapes are allowed by setting the corresponding shape dimensions
-to 0 in the shape attr.  In this case DequeueMany will pad up to the maximum
-size of any given element in the minibatch.  See below for details.
-
-handle: The handle to the queue.
-component_types: The type of each component in a value.
-shapes: The shape of each component in a value. The length of this attr must
-  be either 0 or the same as the length of component_types.
-  Shapes of fixed rank but variable size are allowed by setting
-  any shape dimension to -1.  In this case, the inputs' shape may vary along
-  the given dimension, and DequeueMany will pad the given dimension with
-  zeros up to the maximum shape of all elements in the given batch.
-  If the length of this attr is 0, different queue elements may have
-  different ranks and shapes, but only one element may be dequeued at a time.
-capacity: The upper bound on the number of elements in this queue.
-  Negative numbers mean no limit.
-container: If non-empty, this queue is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this queue will be shared under the given name
-  across multiple sessions.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("PriorityQueue")
     .Output("handle: Ref(string)")
@@ -569,29 +251,7 @@ REGISTER_OP("PriorityQueue")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
-    .SetShapeFn(TwoElementOutput)
-    .Doc(R"doc(
-A queue that produces elements sorted by the first component value.
-
-Note that the PriorityQueue requires the first component of any element
-to be a scalar int64, in addition to the other elements declared by
-component_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue
-and DequeueMany) on a PriorityQueue will all require (resp. output) one extra
-entry in their input (resp. output) lists.
-
-handle: The handle to the queue.
-component_types: The type of each component in a value.
-shapes: The shape of each component in a value. The length of this attr must
-  be either 0 or the same as the length of component_types. If the length of
-  this attr is 0, the shapes of queue elements are not constrained, and
-  only one element may be dequeued at a time.
-capacity: The upper bound on the number of elements in this queue.
-  Negative numbers mean no limit.
-container: If non-empty, this queue is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this queue will be shared under the given name
-  across multiple sessions.
-)doc");
+    .SetShapeFn(TwoElementOutput);
 
 REGISTER_OP("PriorityQueueV2")
     .Output("handle: resource")
@@ -601,158 +261,48 @@ REGISTER_OP("PriorityQueueV2")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-A queue that produces elements sorted by the first component value.
-
-Note that the PriorityQueue requires the first component of any element
-to be a scalar int64, in addition to the other elements declared by
-component_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue
-and DequeueMany) on a PriorityQueue will all require (resp. output) one extra
-entry in their input (resp. output) lists.
-
-handle: The handle to the queue.
-component_types: The type of each component in a value.
-shapes: The shape of each component in a value. The length of this attr must
-  be either 0 or the same as the length of component_types. If the length of
-  this attr is 0, the shapes of queue elements are not constrained, and
-  only one element may be dequeued at a time.
-capacity: The upper bound on the number of elements in this queue.
-  Negative numbers mean no limit.
-container: If non-empty, this queue is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this queue will be shared under the given name
-  across multiple sessions.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("FakeQueue")
     .Input("resource: resource")
     .Output("handle: Ref(string)")
     .SetIsStateful()
-    .SetShapeFn(TwoElementOutput)
-    .Doc("Deprecated. Do not use.");
+    .SetShapeFn(TwoElementOutput);
 
 REGISTER_OP("QueueEnqueue")
     .Input("handle: Ref(string)")
     .Input("components: Tcomponents")
     .Attr("Tcomponents: list(type) >= 1")
     .Attr("timeout_ms: int = -1")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Enqueues a tuple of one or more tensors in the given queue.
-
-The components input has k elements, which correspond to the components of
-tuples stored in the given queue.
-
-N.B. If the queue is full, this operation will block until the given
-element has been enqueued (or 'timeout_ms' elapses, if specified).
-
-handle: The handle to a queue.
-components: One or more tensors from which the enqueued tensors should be taken.
-timeout_ms: If the queue is full, this operation will block for up to
-  timeout_ms milliseconds.
-  Note: This option is not supported yet.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("QueueEnqueueV2")
     .Input("handle: resource")
     .Input("components: Tcomponents")
     .Attr("Tcomponents: list(type) >= 1")
     .Attr("timeout_ms: int = -1")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Enqueues a tuple of one or more tensors in the given queue.
-
-The components input has k elements, which correspond to the components of
-tuples stored in the given queue.
-
-N.B. If the queue is full, this operation will block until the given
-element has been enqueued (or 'timeout_ms' elapses, if specified).
-
-handle: The handle to a queue.
-components: One or more tensors from which the enqueued tensors should be taken.
-timeout_ms: If the queue is full, this operation will block for up to
-  timeout_ms milliseconds.
-  Note: This option is not supported yet.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("QueueEnqueueMany")
     .Input("handle: Ref(string)")
     .Input("components: Tcomponents")
     .Attr("Tcomponents: list(type) >= 1")
     .Attr("timeout_ms: int = -1")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Enqueues zero or more tuples of one or more tensors in the given queue.
-
-This operation slices each component tensor along the 0th dimension to
-make multiple queue elements. All of the tuple components must have the
-same size in the 0th dimension.
-
-The components input has k elements, which correspond to the components of
-tuples stored in the given queue.
-
-N.B. If the queue is full, this operation will block until the given
-elements have been enqueued (or 'timeout_ms' elapses, if specified).
-
-handle: The handle to a queue.
-components: One or more tensors from which the enqueued tensors should
-  be taken.
-timeout_ms: If the queue is too full, this operation will block for up
-  to timeout_ms milliseconds.
-  Note: This option is not supported yet.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("QueueEnqueueManyV2")
     .Input("handle: resource")
     .Input("components: Tcomponents")
     .Attr("Tcomponents: list(type) >= 1")
     .Attr("timeout_ms: int = -1")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Enqueues zero or more tuples of one or more tensors in the given queue.
-
-This operation slices each component tensor along the 0th dimension to
-make multiple queue elements. All of the tuple components must have the
-same size in the 0th dimension.
-
-The components input has k elements, which correspond to the components of
-tuples stored in the given queue.
-
-N.B. If the queue is full, this operation will block until the given
-elements have been enqueued (or 'timeout_ms' elapses, if specified).
-
-handle: The handle to a queue.
-components: One or more tensors from which the enqueued tensors should
-  be taken.
-timeout_ms: If the queue is too full, this operation will block for up
-  to timeout_ms milliseconds.
-  Note: This option is not supported yet.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("QueueDequeue")
     .Input("handle: Ref(string)")
     .Output("components: component_types")
     .Attr("component_types: list(type) >= 1")
     .Attr("timeout_ms: int = -1")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Dequeues a tuple of one or more tensors from the given queue.
-
-This operation has k outputs, where k is the number of components
-in the tuples stored in the given queue, and output i is the ith
-component of the dequeued tuple.
-
-N.B. If the queue is empty, this operation will block until an element
-has been dequeued (or 'timeout_ms' elapses, if specified).
-
-handle: The handle to a queue.
-components: One or more tensors that were dequeued as a tuple.
-component_types: The type of each component in a tuple.
-timeout_ms: If the queue is empty, this operation will block for up to
-  timeout_ms milliseconds.
-  Note: This option is not supported yet.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("QueueDequeueV2")
     .Input("handle: resource")
@@ -769,24 +319,7 @@ REGISTER_OP("QueueDequeueV2")
       } else {
         return shape_inference::UnknownShape(c);
       }
-    })
-    .Doc(R"doc(
-Dequeues a tuple of one or more tensors from the given queue.
-
-This operation has k outputs, where k is the number of components
-in the tuples stored in the given queue, and output i is the ith
-component of the dequeued tuple.
-
-N.B. If the queue is empty, this operation will block until an element
-has been dequeued (or 'timeout_ms' elapses, if specified).
-
-handle: The handle to a queue.
-components: One or more tensors that were dequeued as a tuple.
-component_types: The type of each component in a tuple.
-timeout_ms: If the queue is empty, this operation will block for up to
-  timeout_ms milliseconds.
-  Note: This option is not supported yet.
-)doc");
+    });
 
 REGISTER_OP("QueueDequeueMany")
     .Input("handle: Ref(string)")
@@ -794,32 +327,7 @@ REGISTER_OP("QueueDequeueMany")
     .Output("components: component_types")
     .Attr("component_types: list(type) >= 1")
     .Attr("timeout_ms: int = -1")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Dequeues `n` tuples of one or more tensors from the given queue.
-
-If the queue is closed and there are fewer than `n` elements, then an
-OutOfRange error is returned.
-
-This operation concatenates queue-element component tensors along the
-0th dimension to make a single component tensor.  All of the components
-in the dequeued tuple will have size `n` in the 0th dimension.
-
-This operation has `k` outputs, where `k` is the number of components in
-the tuples stored in the given queue, and output `i` is the ith
-component of the dequeued tuple.
-
-N.B. If the queue is empty, this operation will block until `n` elements
-have been dequeued (or 'timeout_ms' elapses, if specified).
-
-handle: The handle to a queue.
-n: The number of tuples to dequeue.
-components: One or more tensors that were dequeued as a tuple.
-component_types: The type of each component in a tuple.
-timeout_ms: If the queue has fewer than n elements, this operation
-  will block for up to timeout_ms milliseconds.
-  Note: This option is not supported yet.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("QueueDequeueManyV2")
     .Input("handle: resource")
@@ -839,32 +347,7 @@ REGISTER_OP("QueueDequeueManyV2")
         n_shape = c->Vector(n);
       }
       return DequeueManyV2Shape(c, n_shape);
-    })
-    .Doc(R"doc(
-Dequeues `n` tuples of one or more tensors from the given queue.
-
-If the queue is closed and there are fewer than `n` elements, then an
-OutOfRange error is returned.
-
-This operation concatenates queue-element component tensors along the
-0th dimension to make a single component tensor.  All of the components
-in the dequeued tuple will have size `n` in the 0th dimension.
-
-This operation has `k` outputs, where `k` is the number of components in
-the tuples stored in the given queue, and output `i` is the ith
-component of the dequeued tuple.
-
-N.B. If the queue is empty, this operation will block until `n` elements
-have been dequeued (or 'timeout_ms' elapses, if specified).
-
-handle: The handle to a queue.
-n: The number of tuples to dequeue.
-components: One or more tensors that were dequeued as a tuple.
-component_types: The type of each component in a tuple.
-timeout_ms: If the queue has fewer than n elements, this operation
-  will block for up to timeout_ms milliseconds.
-  Note: This option is not supported yet.
-)doc");
+    });
 
 REGISTER_OP("QueueDequeueUpTo")
     .Input("handle: Ref(string)")
@@ -872,36 +355,7 @@ REGISTER_OP("QueueDequeueUpTo")
     .Output("components: component_types")
     .Attr("component_types: list(type) >= 1")
     .Attr("timeout_ms: int = -1")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Dequeues `n` tuples of one or more tensors from the given queue.
-
-This operation is not supported by all queues.  If a queue does not support
-DequeueUpTo, then an Unimplemented error is returned.
-
-If the queue is closed and there are more than 0 but less than `n`
-elements remaining, then instead of returning an OutOfRange error like
-QueueDequeueMany, less than `n` elements are returned immediately.  If
-the queue is closed and there are 0 elements left in the queue, then
-an OutOfRange error is returned just like in QueueDequeueMany.
-Otherwise the behavior is identical to QueueDequeueMany:
-
-This operation concatenates queue-element component tensors along the
-0th dimension to make a single component tensor.  All of the components
-in the dequeued tuple will have size `n` in the 0th dimension.
-
-This operation has k outputs, where `k` is the number of components in
-the tuples stored in the given queue, and output `i` is the ith
-component of the dequeued tuple.
-
-handle: The handle to a queue.
-n: The number of tuples to dequeue.
-components: One or more tensors that were dequeued as a tuple.
-component_types: The type of each component in a tuple.
-timeout_ms: If the queue has fewer than n elements, this operation
-  will block for up to timeout_ms milliseconds.
-  Note: This option is not supported yet.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("QueueDequeueUpToV2")
     .Input("handle: resource")
@@ -911,133 +365,44 @@ REGISTER_OP("QueueDequeueUpToV2")
     .Attr("timeout_ms: int = -1")
     .SetShapeFn([](InferenceContext* c) {
       return DequeueManyV2Shape(c, c->Vector(InferenceContext::kUnknownDim));
-    })
-    .Doc(R"doc(
-Dequeues `n` tuples of one or more tensors from the given queue.
-
-This operation is not supported by all queues.  If a queue does not support
-DequeueUpTo, then an Unimplemented error is returned.
-
-If the queue is closed and there are more than 0 but less than `n`
-elements remaining, then instead of returning an OutOfRange error like
-QueueDequeueMany, less than `n` elements are returned immediately.  If
-the queue is closed and there are 0 elements left in the queue, then
-an OutOfRange error is returned just like in QueueDequeueMany.
-Otherwise the behavior is identical to QueueDequeueMany:
-
-This operation concatenates queue-element component tensors along the
-0th dimension to make a single component tensor.  All of the components
-in the dequeued tuple will have size n in the 0th dimension.
-
-This operation has `k` outputs, where `k` is the number of components in
-the tuples stored in the given queue, and output `i` is the ith
-component of the dequeued tuple.
-
-handle: The handle to a queue.
-n: The number of tuples to dequeue.
-components: One or more tensors that were dequeued as a tuple.
-component_types: The type of each component in a tuple.
-timeout_ms: If the queue has fewer than n elements, this operation
-  will block for up to timeout_ms milliseconds.
-  Note: This option is not supported yet.
-)doc");
+    });
 
 REGISTER_OP("QueueClose")
     .Input("handle: Ref(string)")
     .SetShapeFn(TwoElementVectorInputsAndScalarOutputs)
-    .Attr("cancel_pending_enqueues: bool = false")
-    .Doc(R"doc(
-Closes the given queue.
-
-This operation signals that no more elements will be enqueued in the
-given queue. Subsequent Enqueue(Many) operations will fail.
-Subsequent Dequeue(Many) operations will continue to succeed if
-sufficient elements remain in the queue. Subsequent Dequeue(Many)
-operations that would block will fail immediately.
-
-handle: The handle to a queue.
-cancel_pending_enqueues: If true, all pending enqueue requests that are
-  blocked on the given queue will be canceled.
-)doc");
+    .Attr("cancel_pending_enqueues: bool = false");
 
 REGISTER_OP("QueueCloseV2")
     .Input("handle: resource")
     .SetShapeFn(shape_inference::NoOutputs)
-    .Attr("cancel_pending_enqueues: bool = false")
-    .Doc(R"doc(
-Closes the given queue.
-
-This operation signals that no more elements will be enqueued in the
-given queue. Subsequent Enqueue(Many) operations will fail.
-Subsequent Dequeue(Many) operations will continue to succeed if
-sufficient elements remain in the queue. Subsequent Dequeue(Many)
-operations that would block will fail immediately.
-
-handle: The handle to a queue.
-cancel_pending_enqueues: If true, all pending enqueue requests that are
-  blocked on the given queue will be canceled.
-)doc");
+    .Attr("cancel_pending_enqueues: bool = false");
 
 REGISTER_OP("QueueIsClosed")
     .Input("handle: Ref(string)")
     .Output("is_closed: bool")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Returns true if queue is closed.
-
-This operation returns true if the queue is closed and false if the queue
-is open.
-
-handle: The handle to a queue.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("QueueIsClosedV2")
     .Input("handle: resource")
     .Output("is_closed: bool")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Returns true if queue is closed.
-
-This operation returns true if the queue is closed and false if the queue
-is open.
-
-handle: The handle to a queue.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("QueueSize")
     .Input("handle: Ref(string)")
     .Output("size: int32")
-    .SetShapeFn(TwoElementVectorInputsAndScalarOutputs)
-    .Doc(R"doc(
-Computes the number of elements in the given queue.
-
-handle: The handle to a queue.
-size: The number of elements in the given queue.
-)doc");
+    .SetShapeFn(TwoElementVectorInputsAndScalarOutputs);
 
 REGISTER_OP("QueueSizeV2")
     .Input("handle: resource")
     .Output("size: int32")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Computes the number of elements in the given queue.
-
-handle: The handle to a queue.
-size: The number of elements in the given queue.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 // --------------------------------------------------------------------------
 
 REGISTER_OP("AccumulatorNumAccumulated")
     .Input("handle: Ref(string)")
     .Output("num_accumulated: int32")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Returns the number of gradients aggregated in the given accumulators.
-
-handle: The handle to an accumulator.
-num_accumulated: The number of gradients aggregated in the given accumulator.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("AccumulatorSetGlobalStep")
     .Input("handle: Ref(string)")
@@ -1046,16 +411,7 @@ REGISTER_OP("AccumulatorSetGlobalStep")
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Updates the accumulator with a new value for global_step.
-
-Logs warning if the accumulator's value is already higher than
-new_global_step.
-
-handle: The handle to an accumulator.
-new_global_step: The new global_step value to set.
-)doc");
+    });
 
 REGISTER_OP("ConditionalAccumulator")
     .Output("handle: Ref(string)")
@@ -1067,25 +423,7 @@ REGISTER_OP("ConditionalAccumulator")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->Vector(2));
       return Status::OK();
-    })
-    .Doc(R"doc(
-A conditional accumulator for aggregating gradients.
-
-The accumulator accepts gradients marked with local_step greater or
-equal to the most recent global_step known to the accumulator. The
-average can be extracted from the accumulator, provided sufficient
-gradients have been accumulated. Extracting the average automatically
-resets the aggregate to 0, and increments the global_step recorded by
-the accumulator.
-
-handle: The handle to the accumulator.
-dtype: The type of the value being accumulated.
-shape: The shape of the values, can be [], in which case shape is unknown.
-container: If non-empty, this accumulator is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this accumulator will be shared under the
-  given name across multiple sessions.
-)doc");
+    });
 
 REGISTER_OP("AccumulatorApplyGradient")
     .Input("handle: Ref(string)")
@@ -1096,18 +434,7 @@ REGISTER_OP("AccumulatorApplyGradient")
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Applies a gradient to a given accumulator.
-
-Does not add if local_step is lesser than the accumulator's global_step.
-
-handle: The handle to a accumulator.
-local_step: The local_step value at which the gradient was computed.
-gradient: A tensor of the gradient to be accumulated.
-dtype: The data type of accumulated gradients. Needs to correspond to the type
-  of the accumulator.
-)doc");
+    });
 
 REGISTER_OP("AccumulatorTakeGradient")
     .Input("handle: Ref(string)")
@@ -1121,22 +448,7 @@ REGISTER_OP("AccumulatorTakeGradient")
       // shape information.
       return shape_inference::UnknownShape(c);
     })
-    .Attr("dtype: numbertype")
-    .Doc(R"doc(
-Extracts the average gradient in the given ConditionalAccumulator.
-
-The op blocks until sufficient (i.e., more than num_required)
-gradients have been accumulated.  If the accumulator has already
-aggregated more than num_required gradients, it returns the average of
-the accumulated gradients.  Also automatically increments the recorded
-global_step in the accumulator by 1, and resets the aggregate to 0.
-
-handle: The handle to an accumulator.
-num_required: Number of gradients required before we return an aggregate.
-average: The average of the accumulated gradients.
-dtype: The data type of accumulated gradients. Needs to correspond to the type
-  of the accumulator.
-)doc");
+    .Attr("dtype: numbertype");
 
 REGISTER_OP("SparseConditionalAccumulator")
     .Output("handle: Ref(string)")
@@ -1148,25 +460,7 @@ REGISTER_OP("SparseConditionalAccumulator")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->Vector(2));
       return Status::OK();
-    })
-    .Doc(R"doc(
-A conditional accumulator for aggregating sparse gradients.
-
-The accumulator accepts gradients marked with local_step greater or
-equal to the most recent global_step known to the accumulator. The
-average can be extracted from the accumulator, provided sufficient
-gradients have been accumulated. Extracting the average automatically
-resets the aggregate to 0, and increments the global_step recorded by
-the accumulator.
-
-handle: The handle to the accumulator.
-dtype: The type of the value being accumulated.
-shape: The shape of the values.
-container: If non-empty, this accumulator is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this accumulator will be shared under the given name
-  across multiple sessions.
-)doc");
+    });
 
 REGISTER_OP("SparseAccumulatorApplyGradient")
     .Input("handle: Ref(string)")
@@ -1180,26 +474,7 @@ REGISTER_OP("SparseAccumulatorApplyGradient")
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Applies a sparse gradient to a given accumulator.
-
-Does not add if local_step is smaller than the accumulator's
-global_step.
-
-handle: The handle to a accumulator.
-local_step: The local_step value at which the sparse gradient was computed.
-gradient_indices: Indices of the sparse gradient to be accumulated. Must be a
-  vector.
-gradient_values: Values are the non-zero slices of the gradient, and must have
-  the same first dimension as indices, i.e., the nnz represented by indices and
-  values must be consistent.
-gradient_shape: Shape of the sparse gradient to be accumulated.
-dtype: The data type of accumulated gradients. Needs to correspond to the type
-  of the accumulator.
-has_known_shape: Boolean indicating whether gradient_shape is unknown, in which
-  case the input is ignored during validation.
-)doc");
+    });
 
 REGISTER_OP("SparseAccumulatorTakeGradient")
     .Input("handle: Ref(string)")
@@ -1215,25 +490,7 @@ REGISTER_OP("SparseAccumulatorTakeGradient")
       // by 'handle', but which is not available here, so we lose
       // shape information.
       return shape_inference::UnknownShape(c);
-    })
-    .Doc(R"doc(
-Extracts the average sparse gradient in a SparseConditionalAccumulator.
-
-The op will blocks until sufficient (i.e., more than num_required)
-gradients have been accumulated. If the accumulator has already
-aggregated more than num_required gradients, it will return its
-average of the accumulated gradients.  Also automatically increments
-the recorded global_step in the accumulator by 1, and resets the
-aggregate to 0.
-
-handle: The handle to a SparseConditionalAccumulator.
-num_required: Number of gradients required before we return an aggregate.
-indices: Indices of the average of the accumulated sparse gradients.
-values: Values of the average of the accumulated sparse gradients.
-shape: Shape of the average of the accumulated sparse gradients.
-dtype: The data type of accumulated gradients. Needs to correspond to the type
-  of the accumulator.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -1243,17 +500,7 @@ REGISTER_OP("StackV2")
     .Attr("elem_type: type")
     .Attr("stack_name: string = ''")
     .SetIsStateful()
-    .SetShapeFn(TwoElementOutput)
-    .Doc(R"doc(
-A stack that produces elements in first-in last-out order.
-
-max_size: The maximum size of the stack if non-negative. If negative, the stack
-  size is unlimited.
-handle: The handle to the stack.
-elem_type: The type of the elements on the stack.
-stack_name: Overrides the name used for the temporary stack resource. Default
-value is the name of the 'Stack' op (which is guaranteed unique).
-)doc");
+    .SetShapeFn(TwoElementOutput);
 
 REGISTER_OP("StackPushV2")
     .Input("handle: resource")
@@ -1264,37 +511,17 @@ REGISTER_OP("StackPushV2")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       c->set_output(0, c->input(1));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Push an element onto the stack.
-
-handle: The handle to a stack.
-elem: The tensor to be pushed onto the stack.
-output: The same tensor as the input 'elem'.
-swap_memory: Swap `elem` to CPU. Default to false.
-)doc");
+    });
 
 REGISTER_OP("StackPopV2")
     .Input("handle: resource")
     .Output("elem: elem_type")
     .Attr("elem_type: type")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Pop the element at the top of the stack.
-
-handle: The handle to a stack.
-elem: The tensor that is popped from the top of the stack.
-elem_type: The type of the elem that is popped.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("StackCloseV2")
     .Input("handle: resource")
-    .SetShapeFn(TwoElementVectorInputsAndScalarOutputs)
-    .Doc(R"doc(
-Delete the stack from its resource container.
-
-handle: The handle to a stack.
-)doc");
+    .SetShapeFn(TwoElementVectorInputsAndScalarOutputs);
 
 // Deprecated ref-typed variants of stack.
 
@@ -1303,10 +530,7 @@ REGISTER_OP("Stack")
     .Attr("elem_type: type")
     .Attr("stack_name: string = ''")
     .SetIsStateful()
-    .SetShapeFn(TwoElementOutput)
-    .Doc(R"doc(
-Deprecated, use StackV2.
-)doc");
+    .SetShapeFn(TwoElementOutput);
 
 REGISTER_OP("StackPush")
     .Input("handle: Ref(string)")
@@ -1317,26 +541,17 @@ REGISTER_OP("StackPush")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       c->set_output(0, c->input(1));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Deprecated, use StackPushV2.
-)doc");
+    });
 
 REGISTER_OP("StackPop")
     .Input("handle: Ref(string)")
     .Output("elem: elem_type")
     .Attr("elem_type: type")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Deprecated, use StackPopV2.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("StackClose")
     .Input("handle: Ref(string)")
-    .SetShapeFn(TwoElementVectorInputsAndScalarOutputs)
-    .Doc(R"doc(
-Deprecated, use StackCloseV2.
-)doc");
+    .SetShapeFn(TwoElementVectorInputsAndScalarOutputs);
 
 // --------------------------------------------------------------------------
 
@@ -1356,35 +571,21 @@ REGISTER_OP("TensorArrayV3")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
       c->set_output(0, c->Vector(2));
       c->set_output(1, c->Scalar());
+      bool identical_shapes;
+      TF_RETURN_IF_ERROR(
+          c->GetAttr("identical_element_shapes", &identical_shapes));
+      DataType t;
+      TF_RETURN_IF_ERROR(c->GetAttr("dtype", &t));
+      PartialTensorShape p;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_shape", &p));
+      ShapeHandle s;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(p, &s));
+      if (c->FullyDefined(s) || identical_shapes) {
+        c->set_output_handle_shapes_and_types(
+            0, std::vector<shape_inference::ShapeAndType>{{s, t}});
+      }
       return Status::OK();
-    })
-    .Doc(R"doc(
-An array of Tensors of given size.
-
-Write data via Write and read via Read or Pack.
-
-handle: The handle to the TensorArray.
-flow: A scalar used to control gradient flow.
-size: The size of the array.
-dtype: The type of the elements on the tensor_array.
-element_shape: The expected shape of an element, if known. Used to
-  validate the shapes of TensorArray elements. If this shape is not
-  fully specified, gathering zero-size TensorArrays is an error.
-dynamic_size: A boolean that determines whether writes to the TensorArray
-  are allowed to grow the size.  By default, this is not allowed.
-clear_after_read: If true (default), Tensors in the TensorArray are cleared
-  after being read.  This disables multiple read semantics but allows early
-  release of memory.
-identical_element_shapes: If true (default is false), then all
-  elements in the TensorArray will be expected to have have identical shapes.
-  This allows certain behaviors, like dynamically checking for
-  consistent shapes on write, and being able to fill in properly
-  shaped zero tensors on stack -- even if the element_shape attribute
-  is not fully defined.
-tensor_array_name: Overrides the name used for the temporary tensor_array
-  resource. Default value is the name of the 'TensorArray' op (which
-  is guaranteed unique).
-)doc");
+    });
 
 REGISTER_OP("TensorArrayGradV3")
     .Input("handle: resource")
@@ -1400,53 +601,12 @@ REGISTER_OP("TensorArrayGradV3")
       TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
       c->set_output(0, c->Vector(2));
       c->set_output(1, c->Scalar());
+      if (c->input_handle_shapes_and_types(0)) {
+        c->set_output_handle_shapes_and_types(
+            0, *c->input_handle_shapes_and_types(0));
+      }
       return Status::OK();
-    })
-    .Doc(R"doc(
-Creates a TensorArray for storing the gradients of values in the given handle.
-
-If the given TensorArray gradient already exists, returns a reference to it.
-
-Locks the size of the original TensorArray by disabling its dynamic size flag.
-
-**A note about the input flow_in:**
-
-The handle flow_in forces the execution of the gradient lookup to occur
-only after certain other operations have occurred.  For example, when
-the forward TensorArray is dynamically sized, writes to this TensorArray
-may resize the object.  The gradient TensorArray is statically sized based
-on the size of the forward TensorArray when this operation executes.
-Furthermore, the size of the forward TensorArray is frozen by this call.
-As a result, the flow is used to ensure that the call to generate the gradient
-TensorArray only happens after all writes are executed.
-
-In the case of dynamically sized TensorArrays, gradient computation should
-only be performed on read operations that have themselves been chained via
-flow to occur only after all writes have executed. That way the final size
-of the forward TensorArray is known when this operation is called.
-
-**A note about the source attribute:**
-
-TensorArray gradient calls use an accumulator TensorArray object.  If
-multiple gradients are calculated and run in the same session, the multiple
-gradient nodes may accidentally flow through the same accumulator TensorArray.
-This double counts and generally breaks the TensorArray gradient flow.
-
-The solution is to identify which gradient call this particular
-TensorArray gradient is being called in.  This is performed by identifying
-a unique string (e.g. "gradients", "gradients_1", ...) from the input
-gradient Tensor's name.  This string is used as a suffix when creating
-the TensorArray gradient object here (the attribute `source`).
-
-The attribute `source` is added as a suffix to the forward TensorArray's
-name when performing the creation / lookup, so that each separate gradient
-calculation gets its own TensorArray accumulator.
-
-handle: The handle to the forward TensorArray.
-flow_in: A float scalar that enforces proper chaining of operations.
-source: The gradient source string, used to decide which gradient TensorArray
-  to return.
-)doc");
+    });
 
 REGISTER_OP("TensorArrayWriteV3")
     .Input("handle: resource")
@@ -1464,17 +624,17 @@ REGISTER_OP("TensorArrayWriteV3")
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
-      return shape_inference::ScalarShape(c);
-    })
-    .Doc(R"doc(
-Push an element onto the tensor_array.
 
-handle: The handle to a TensorArray.
-index: The position to write to inside the TensorArray.
-value: The tensor to write to the TensorArray.
-flow_in: A float scalar that enforces proper chaining of operations.
-flow_out: A float scalar that enforces proper chaining of operations.
-)doc");
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr && !handle_data->empty()) {
+        shape_inference::ShapeAndType shape_and_type = (*handle_data)[0];
+        ShapeHandle value_shape = c->input(2);
+        TF_RETURN_IF_ERROR(
+            c->Merge(shape_and_type.shape, value_shape, &unused));
+      }
+
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("TensorArrayReadV3")
     .Input("handle: resource")
@@ -1490,16 +650,15 @@ REGISTER_OP("TensorArrayReadV3")
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
-      return shape_inference::UnknownShape(c);
-    })
-    .Doc(R"doc(
-Read an element from the TensorArray into output `value`.
-
-handle: The handle to a TensorArray.
-dtype: The type of the elem that is returned.
-flow_in: A float scalar that enforces proper chaining of operations.
-value: The tensor that is read from the TensorArray.
-)doc");
+      auto shapes = c->input_handle_shapes_and_types(0);
+      if (shapes != nullptr && !shapes->empty()) {
+        ShapeHandle tensor_shape = shapes->at(0).shape;
+        c->set_output(0, tensor_shape);
+        return Status::OK();
+      } else {
+        return shape_inference::UnknownShape(c);
+      }
+    });
 
 REGISTER_OP("TensorArrayGatherV3")
     .Input("handle: resource")
@@ -1516,22 +675,7 @@ REGISTER_OP("TensorArrayGatherV3")
       TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
       return shape_inference::UnknownShape(c);
-    })
-    .Doc(R"doc(
-Gather specific elements from the TensorArray into output `value`.
-
-All elements selected by `indices` must have the same shape.
-
-handle: The handle to a TensorArray.
-indices: The locations in the TensorArray from which to read tensor elements.
-dtype: The type of the elem that is returned.
-element_shape: The expected shape of an element, if known. Used to
-  validate the shapes of TensorArray elements. If this shape is not
-  fully specified, gathering zero-size TensorArrays is an error.
-flow_in: A float scalar that enforces proper chaining of operations.
-value: All of the elements in the TensorArray, concatenated along a new
-  axis (the new dimension 0).
-)doc");
+    });
 
 REGISTER_OP("TensorArrayScatterV3")
     .Input("handle: resource")
@@ -1548,18 +692,7 @@ REGISTER_OP("TensorArrayScatterV3")
       TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), 0), 2, &unused_dim));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
       return shape_inference::ScalarShape(c);
-    })
-    .Doc(R"doc(
-Scatter the data from the input value into specific TensorArray elements.
-
-`indices` must be a vector, its length must match the first dim of `value`.
-
-handle: The handle to a TensorArray.
-indices: The locations at which to write the tensor elements.
-value: The concatenated tensor to write to the TensorArray.
-flow_in: A float scalar that enforces proper chaining of operations.
-flow_out: A float scalar that enforces proper chaining of operations.
-)doc");
+    });
 
 REGISTER_OP("TensorArrayConcatV3")
     .Input("handle: resource")
@@ -1578,35 +711,7 @@ REGISTER_OP("TensorArrayConcatV3")
       c->set_output(0, c->UnknownShape());
       c->set_output(1, c->Vector(c->UnknownDim()));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Concat the elements from the TensorArray into value `value`.
-
-Takes `T` elements of shapes
-
-  ```
-  (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...)
-  ```
-
-and concatenates them into a Tensor of shape:
-
-  ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```
-
-All elements must have the same shape (excepting the first dimension).
-
-handle: The handle to a TensorArray.
-dtype: The type of the elem that is returned.
-flow_in: A float scalar that enforces proper chaining of operations.
-element_shape_except0: The expected shape of an element, if known,
-  excluding the first dimension. Used to validate the shapes of
-  TensorArray elements. If this shape is not fully specified, concatenating
-  zero-size TensorArrays is an error.
-value: All of the elements in the TensorArray, concatenated along the first
-  axis.
-lengths: A vector of the row sizes of the original T elements in the
-  value output.  In the example above, this would be the values:
-  `(n1, n2, ..., n(T-1))`.
-)doc");
+    });
 
 REGISTER_OP("TensorArraySplitV3")
     .Input("handle: resource")
@@ -1624,35 +729,7 @@ REGISTER_OP("TensorArraySplitV3")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
       return shape_inference::ScalarShape(c);
-    })
-    .Doc(R"doc(
-Split the data from the input value into TensorArray elements.
-
-Assuming that `lengths` takes on values
-
-  ```(n0, n1, ..., n(T-1))```
-
-and that `value` has shape
-
-  ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```,
-
-this splits values into a TensorArray with T tensors.
-
-TensorArray index t will be the subtensor of values with starting position
-
-  ```(n0 + n1 + ... + n(t-1), 0, 0, ...)```
-
-and having size
-
-  ```nt x d0 x d1 x ...```
-
-handle: The handle to a TensorArray.
-value: The concatenated tensor to write to the TensorArray.
-lengths: The vector of lengths, how to split the rows of value into the
-  TensorArray.
-flow_in: A float scalar that enforces proper chaining of operations.
-flow_out: A float scalar that enforces proper chaining of operations.
-)doc");
+    });
 
 REGISTER_OP("TensorArraySizeV3")
     .Input("handle: resource")
@@ -1664,14 +741,7 @@ REGISTER_OP("TensorArraySizeV3")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
       TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
       return shape_inference::ScalarShape(c);
-    })
-    .Doc(R"doc(
-Get the current size of the TensorArray.
-
-handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
-flow_in: A float scalar that enforces proper chaining of operations.
-size: The current size of the TensorArray.
-)doc");
+    });
 
 REGISTER_OP("TensorArrayCloseV3")
     .Input("handle: resource")
@@ -1681,15 +751,7 @@ REGISTER_OP("TensorArrayCloseV3")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
       TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Delete the TensorArray from its resource container.
-
-This enables the user to close and release the resource in the middle
-of a step/run.
-
-handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -1706,7 +768,6 @@ REGISTER_OP("TensorArray")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape)
     .Deprecated(16, "Use TensorArrayV3");
-// TODO(cwhipkey): mark this deprecated in favor of V3.
 REGISTER_OP("TensorArrayV2")
     .Input("size: int32")
     .Attr("dtype: type")
@@ -1722,7 +783,7 @@ REGISTER_OP("TensorArrayV2")
       c->set_output(0, c->Vector(2));
       return Status::OK();
     })
-    .Doc("Deprecated. Use TensorArrayV3");
+    .Deprecated(26, "Use TensorArrayV3");
 REGISTER_OP("TensorArrayGrad")
     .Input("handle: string")
     .Input("flow_in: float")
@@ -1731,7 +792,6 @@ REGISTER_OP("TensorArrayGrad")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape)
     .Deprecated(16, "Use TensorArrayGradV3");
-// TODO(cwhipkey): mark this deprecated in favor of V3.
 REGISTER_OP("TensorArrayGradV2")
     .Input("handle: string")
     .Input("flow_in: float")
@@ -1746,7 +806,7 @@ REGISTER_OP("TensorArrayGradV2")
       c->set_output(0, c->Vector(2));
       return Status::OK();
     })
-    .Doc("Deprecated. Use TensorArrayGradV3");
+    .Deprecated(26, "Use TensorArrayGradV3");
 REGISTER_OP("TensorArrayWrite")
     .Input("handle: Ref(string)")
     .Input("index: int32")
@@ -1756,7 +816,6 @@ REGISTER_OP("TensorArrayWrite")
     .Attr("T: type")
     .SetShapeFn(shape_inference::UnknownShape)
     .Deprecated(16, "Use TensorArrayWriteV3");
-// TODO(cwhipkey): mark this deprecated in favor of V3.
 REGISTER_OP("TensorArrayWriteV2")
     .Input("handle: string")
     .Input("index: int32")
@@ -1775,7 +834,7 @@ REGISTER_OP("TensorArrayWriteV2")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
       return shape_inference::ScalarShape(c);
     })
-    .Doc("Deprecated. Use TensorArrayGradV3");
+    .Deprecated(26, "Use TensorArrayWriteV3");
 REGISTER_OP("TensorArrayRead")
     .Input("handle: Ref(string)")
     .Input("index: int32")
@@ -1784,7 +843,6 @@ REGISTER_OP("TensorArrayRead")
     .Attr("dtype: type")
     .SetShapeFn(shape_inference::UnknownShape)
     .Deprecated(16, "Use TensorArrayReadV3");
-// TODO(cwhipkey): mark this deprecated in favor of V3.
 REGISTER_OP("TensorArrayReadV2")
     .Input("handle: string")
     .Input("index: int32")
@@ -1801,7 +859,7 @@ REGISTER_OP("TensorArrayReadV2")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
       return shape_inference::UnknownShape(c);
     })
-    .Doc("Deprecated. Use TensorArrayReadV3");
+    .Deprecated(26, "Use TensorArrayReadV3");
 REGISTER_OP("TensorArrayPack")
     .Input("handle: Ref(string)")
     .Input("flow_in: float")
@@ -1827,7 +885,6 @@ REGISTER_OP("TensorArrayGather")
     .Attr("element_shape: shape = { unknown_rank: true }")
     .SetShapeFn(shape_inference::UnknownShape)
     .Deprecated(16, "Use TensorArrayGatherV3");
-// TODO(cwhipkey): mark this deprecated in favor of V3.
 REGISTER_OP("TensorArrayGatherV2")
     .Input("handle: string")
     .Input("indices: int32")
@@ -1844,7 +901,7 @@ REGISTER_OP("TensorArrayGatherV2")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
       return shape_inference::UnknownShape(c);
     })
-    .Doc("Deprecated. Use TensorArrayGatherV3");
+    .Deprecated(26, "Use TensorArrayGatherV3");
 REGISTER_OP("TensorArrayScatter")
     .Input("handle: Ref(string)")
     .Input("indices: int32")
@@ -1854,7 +911,6 @@ REGISTER_OP("TensorArrayScatter")
     .Attr("T: type")
     .SetShapeFn(shape_inference::UnknownShape)
     .Deprecated(19, "Use TensorArrayGradV3");
-// TODO(cwhipkey): mark this deprecated in favor of V3.
 REGISTER_OP("TensorArrayScatterV2")
     .Input("handle: string")
     .Input("indices: int32")
@@ -1871,7 +927,7 @@ REGISTER_OP("TensorArrayScatterV2")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
       return shape_inference::ScalarShape(c);
     })
-    .Doc("Deprecated. Use TensorArrayScatterV3");
+    .Deprecated(26, "Use TensorArrayScatterV3");
 REGISTER_OP("TensorArrayConcat")
     .Input("handle: Ref(string)")
     .Input("flow_in: float")
@@ -1898,8 +954,7 @@ REGISTER_OP("TensorArrayConcatV2")
       c->set_output(0, c->UnknownShape());
       c->set_output(1, c->Vector(c->UnknownDim()));
       return Status::OK();
-    })
-    .Doc("Deprecated. Use TensorArrayConcatV3");
+    });
 REGISTER_OP("TensorArraySplit")
     .Input("handle: Ref(string)")
     .Input("value: T")
@@ -1909,7 +964,6 @@ REGISTER_OP("TensorArraySplit")
     .Attr("T: type")
     .SetShapeFn(shape_inference::UnknownShape)
     .Deprecated(16, "Use TensorArraySplitV3");
-// TODO(cwhipkey): mark this deprecated in favor of V3.
 REGISTER_OP("TensorArraySplitV2")
     .Input("handle: string")
     .Input("value: T")
@@ -1927,14 +981,13 @@ REGISTER_OP("TensorArraySplitV2")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
       return shape_inference::ScalarShape(c);
     })
-    .Doc("Deprecated. Use TensorArraySplitV3");
+    .Deprecated(26, "Use TensorArraySplitV3");
 REGISTER_OP("TensorArraySize")
     .Input("handle: Ref(string)")
     .Input("flow_in: float")
     .Output("size: int32")
     .SetShapeFn(shape_inference::UnknownShape)
     .Deprecated(16, "Use TensorArraySizeV3");
-// TODO(cwhipkey): mark this deprecated in favor of V3.
 REGISTER_OP("TensorArraySizeV2")
     .Input("handle: string")
     .Input("flow_in: float")
@@ -1946,12 +999,11 @@ REGISTER_OP("TensorArraySizeV2")
       TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
       return shape_inference::ScalarShape(c);
     })
-    .Doc("Deprecated. Use TensorArraySizeV3");
+    .Deprecated(26, "Use TensorArraySizeV3");
 REGISTER_OP("TensorArrayClose")
     .Input("handle: Ref(string)")
     .SetShapeFn([](InferenceContext* c) { return Status::OK(); })
     .Deprecated(16, "Use TensorArrayCloseV3");
-// TODO(cwhipkey): mark this deprecated in favor of V3.
 REGISTER_OP("TensorArrayCloseV2")
     .Input("handle: string")
     .SetShapeFn([](InferenceContext* c) {
@@ -1961,7 +1013,7 @@ REGISTER_OP("TensorArrayCloseV2")
       TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
       return Status::OK();
     })
-    .Doc("Deprecated. Use TensorArrayCloseV3");
+    .Deprecated(26, "Use TensorArrayCloseV3");
 
 // --------------------------------------------------------------------------
 
@@ -1973,31 +1025,7 @@ REGISTER_OP("Barrier")
     .Attr("capacity: int = -1")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
-    .SetShapeFn(TwoElementOutput)
-    .Doc(R"doc(
-Defines a barrier that persists across different graph executions.
-
-A barrier represents a key-value map, where each key is a string, and
-each value is a tuple of tensors.
-
-At runtime, the barrier contains 'complete' and 'incomplete'
-elements. A complete element has defined tensors for all components of
-its value tuple, and may be accessed using BarrierTakeMany. An
-incomplete element has some undefined components in its value tuple,
-and may be updated using BarrierInsertMany.
-
-handle: The handle to the barrier.
-component_types: The type of each component in a value.
-shapes: The shape of each component in a value. Each shape must be 1 in the
-  first dimension. The length of this attr must be the same as the length of
-  component_types.
-capacity: The capacity of the barrier.  The default capacity is MAX_INT32,
-  which is the largest capacity of the underlying queue.
-container: If non-empty, this barrier is placed in the given container.
-        Otherwise, a default container is used.
-shared_name: If non-empty, this barrier will be shared under the given name
-  across multiple sessions.
-)doc");
+    .SetShapeFn(TwoElementOutput);
 
 REGISTER_OP("BarrierInsertMany")
     .Input("handle: Ref(string)")
@@ -2016,21 +1044,7 @@ REGISTER_OP("BarrierInsertMany")
       TF_RETURN_IF_ERROR(c->WithRankAtLeast(values, 1, &values));
       TF_RETURN_IF_ERROR(c->Merge(keys, c->Vector(c->Dim(values, 0)), &handle));
       return Status::OK();
-    })
-    .Doc(R"doc(
-For each key, assigns the respective value to the specified component.
-
-If a key is not found in the barrier, this operation will create a new
-incomplete element. If a key is found in the barrier, and the element
-already has a value at component_index, this operation will fail with
-INVALID_ARGUMENT, and leave the barrier in an undefined state.
-
-handle: The handle to a barrier.
-component_index: The component of the barrier elements that is being assigned.
-keys: A one-dimensional tensor of keys, with length n.
-values: An any-dimensional tensor of values, which are associated with the
-  respective keys. The 0th dimension must have length n.
-)doc");
+    });
 
 REGISTER_OP("BarrierTakeMany")
     .Input("handle: Ref(string)")
@@ -2042,78 +1056,22 @@ REGISTER_OP("BarrierTakeMany")
     .Attr("allow_small_batch: bool = false")
     .Attr("wait_for_incomplete: bool = false")
     .Attr("timeout_ms: int = -1")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Takes the given number of completed elements from a barrier.
-
-This operation concatenates completed-element component tensors along
-the 0th dimension to make a single component tensor.
-
-Elements come out of the barrier when they are complete, and in the order
-in which they were placed into the barrier.  The indices output provides
-information about the batch in which each element was originally inserted
-into the barrier.
-
-handle: The handle to a barrier.
-num_elements: A single-element tensor containing the number of elements to
-  take.
-indices: A one-dimensional tensor of indices, with length num_elems.
-  These indices refer to the batch in which the values were placed into the
-  barrier (starting with MIN_LONG and increasing with each BarrierInsertMany).
-keys: A one-dimensional tensor of keys, with length num_elements.
-values: One any-dimensional tensor per component in a barrier element. All
-  values have length num_elements in the 0th dimension.
-component_types: The type of each component in a value.
-allow_small_batch: Allow to return less than num_elements items if barrier is
-  already closed.
-timeout_ms: If the queue is empty, this operation will block for up to
-  timeout_ms milliseconds.
-  Note: This option is not supported yet.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("BarrierClose")
     .Input("handle: Ref(string)")
     .SetShapeFn(TwoElementVectorInputsAndScalarOutputs)
-    .Attr("cancel_pending_enqueues: bool = false")
-    .Doc(R"doc(
-Closes the given barrier.
-
-This operation signals that no more new elements will be inserted in the
-given barrier. Subsequent InsertMany that try to introduce a new key will fail.
-Subsequent InsertMany operations that just add missing components to already
-existing elements will continue to succeed. Subsequent TakeMany operations will
-continue to succeed if sufficient completed elements remain in the barrier.
-Subsequent TakeMany operations that would block will fail immediately.
-
-handle: The handle to a barrier.
-cancel_pending_enqueues: If true, all pending enqueue requests that are
-  blocked on the barrier's queue will be canceled. InsertMany will fail, even
-  if no new key is introduced.
-)doc");
+    .Attr("cancel_pending_enqueues: bool = false");
 
 REGISTER_OP("BarrierReadySize")
     .Input("handle: Ref(string)")
     .Output("size: int32")
-    .SetShapeFn(TwoElementVectorInputsAndScalarOutputs)
-    .Doc(R"doc(
-Computes the number of complete elements in the given barrier.
-
-handle: The handle to a barrier.
-size: The number of complete elements (i.e. those with all of their value
-  components set) in the barrier.
-)doc");
+    .SetShapeFn(TwoElementVectorInputsAndScalarOutputs);
 
 REGISTER_OP("BarrierIncompleteSize")
     .Input("handle: Ref(string)")
     .Output("size: int32")
-    .SetShapeFn(TwoElementVectorInputsAndScalarOutputs)
-    .Doc(R"doc(
-Computes the number of incomplete elements in the given barrier.
-
-handle: The handle to a barrier.
-size: The number of incomplete elements (i.e. those with some of their value
-  components not set) in the barrier.
-)doc");
+    .SetShapeFn(TwoElementVectorInputsAndScalarOutputs);
 
 // --------------------------------------------------------------------------
 
@@ -2121,57 +1079,35 @@ REGISTER_OP("GetSessionHandle")
     .Input("value: T")
     .Output("handle: string")
     .Attr("T: type")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Store the input tensor in the state of the current session.
-
-value: The tensor to be stored.
-handle: The handle for the tensor stored in the session state, represented
-  as a string.
-)doc");
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("GetSessionHandleV2")
     .Input("value: T")
     .Output("handle: resource")
     .Attr("T: type")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Store the input tensor in the state of the current session.
-
-value: The tensor to be stored.
-handle: The handle for the tensor stored in the session state, represented
-  as a ResourceHandle object.
-)doc");
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("GetSessionTensor")
     .Input("handle: string")
     .Output("value: dtype")
     .Attr("dtype: type")
+    .SetIsStateful()
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
       return shape_inference::UnknownShape(c);
-    })
-    .Doc(R"doc(
-Get the value of the tensor specified by its handle.
-
-handle: The handle for a tensor stored in the session state.
-value: The tensor for the given handle.
-dtype: The type of the output value.
-)doc");
+    });
 
 REGISTER_OP("DeleteSessionTensor")
     .Input("handle: string")
+    .SetIsStateful()
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Delete the tensor specified by its handle in the session.
-
-handle: The handle for a tensor stored in the session state.
-)doc");
+    });
 
 REGISTER_OP("Stage")
     .Input("values: dtypes")
@@ -2181,23 +1117,7 @@ REGISTER_OP("Stage")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetShapeFn(shape_inference::UnknownShape)
-    .SetIsStateful()
-    .Doc(R"doc(
-Stage values similar to a lightweight Enqueue.
-
-The basic functionality of this Op is similar to a queue with many
-fewer capabilities and options.  This Op is optimized for performance.
-
-values: a list of tensors
-dtypes A list of data types that inserted values should adhere to.
-capacity: Maximum number of elements in the Staging Area. If > 0, inserts
-  on the container will block when the capacity is reached.
-memory_limit: The maximum number of bytes allowed for Tensors in the Staging Area.
-  If > 0, inserts will block until sufficient space is available.
-container: If non-empty, this queue is placed in the given container. Otherwise,
-  a default container is used.
-shared_name: It is necessary to match this name to the matching Unstage Op.
-)doc");
+    .SetIsStateful();
 
 REGISTER_OP("Unstage")
     .Output("values: dtypes")
@@ -2207,13 +1127,7 @@ REGISTER_OP("Unstage")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetShapeFn(shape_inference::UnknownShape)
-    .SetIsStateful()
-    .Doc(R"doc(
-Op is similar to a lightweight Dequeue.
-
-The basic functionality is similar to dequeue with many fewer
-capabilities and options.  This Op is optimized for performance.
-)doc");
+    .SetIsStateful();
 
 REGISTER_OP("StagePeek")
     .Input("index: int32")
@@ -2224,13 +1138,7 @@ REGISTER_OP("StagePeek")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetShapeFn(shape_inference::UnknownShape)
-    .SetIsStateful()
-    .Doc(R"doc(
-Op peeks at the values at the specified index.  If the
-underlying container does not contain sufficient elements
-this op will block until it does.   This Op is optimized for
-performance.
-    )doc");
+    .SetIsStateful();
 
 REGISTER_OP("StageSize")
     .Output("size: int32")
@@ -2240,10 +1148,7 @@ REGISTER_OP("StageSize")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetShapeFn(shape_inference::ScalarShape)
-    .SetIsStateful()
-    .Doc(R"doc(
-Op returns the number of elements in the underlying container.
-    )doc");
+    .SetIsStateful();
 
 REGISTER_OP("StageClear")
     .Attr("capacity: int >= 0 = 0")
@@ -2252,10 +1157,7 @@ REGISTER_OP("StageClear")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetShapeFn(shape_inference::UnknownShape)
-    .SetIsStateful()
-    .Doc(R"doc(
-Op removes all elements in the underlying container.
-    )doc");
+    .SetIsStateful();
 
 // UnorderedMap
 REGISTER_OP("MapStage")
@@ -2269,19 +1171,7 @@ REGISTER_OP("MapStage")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetShapeFn(tensorflow::shape_inference::NoOutputs)
-    .SetIsStateful()
-    .Doc(R"doc(
-Stage (key, values) in the underlying container which behaves like a hashtable.
-
-key: int64
-values: a list of tensors
-dtypes A list of data types that inserted values should adhere to.
-capacity: Maximum number of elements in the Staging Area. If > 0, inserts
-  on the container will block when the capacity is reached.
-container: If non-empty, this queue is placed in the given container. Otherwise,
-  a default container is used.
-shared_name: It is necessary to match this name to the matching Unstage Op.
-)doc");
+    .SetIsStateful();
 
 REGISTER_OP("MapPeek")
     .Input("key: int64")
@@ -2293,12 +1183,7 @@ REGISTER_OP("MapPeek")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetShapeFn(tensorflow::shape_inference::UnknownShape)
-    .SetIsStateful()
-    .Doc(R"doc(
-Op peeks at the values at the specified key.  If the
-underlying container does not contain this key
-this op will block until it does.
-    )doc");
+    .SetIsStateful();
 
 REGISTER_OP("MapUnstage")
     .Input("key: int64")
@@ -2310,12 +1195,7 @@ REGISTER_OP("MapUnstage")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetShapeFn(tensorflow::shape_inference::UnknownShape)
-    .SetIsStateful()
-    .Doc(R"doc(
-Op removes and returns the values associated with the key
-from the underlying container.   If the underlying container
-does not contain this key, the op will block until it does.
-    )doc");
+    .SetIsStateful();
 
 REGISTER_OP("MapUnstageNoKey")
     .Input("indices: int32")
@@ -2327,12 +1207,7 @@ REGISTER_OP("MapUnstageNoKey")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetShapeFn(tensorflow::shape_inference::UnknownShape)
-    .SetIsStateful()
-    .Doc(R"doc(
-Op removes and returns a random (key, value)
-from the underlying container.   If the underlying container
-does not contain elements, the op will block until it does.
-      )doc");
+    .SetIsStateful();
 
 REGISTER_OP("MapSize")
     .Output("size: int32")
@@ -2342,10 +1217,7 @@ REGISTER_OP("MapSize")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetShapeFn(tensorflow::shape_inference::ScalarShape)
-    .SetIsStateful()
-    .Doc(R"doc(
-Op returns the number of elements in the underlying container.
-    )doc");
+    .SetIsStateful();
 
 REGISTER_OP("MapIncompleteSize")
     .Output("size: int32")
@@ -2355,10 +1227,7 @@ REGISTER_OP("MapIncompleteSize")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetShapeFn(tensorflow::shape_inference::ScalarShape)
-    .SetIsStateful()
-    .Doc(R"doc(
-Op returns the number of incomplete elements in the underlying container.
-    )doc");
+    .SetIsStateful();
 
 REGISTER_OP("MapClear")
     .Attr("capacity: int >= 0 = 0")
@@ -2367,10 +1236,7 @@ REGISTER_OP("MapClear")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetShapeFn(tensorflow::shape_inference::NoOutputs)
-    .SetIsStateful()
-    .Doc(R"doc(
-Op removes all elements in the underlying container.
-    )doc");
+    .SetIsStateful();
 
 // OrderedMap
 REGISTER_OP("OrderedMapStage")
@@ -2384,20 +1250,7 @@ REGISTER_OP("OrderedMapStage")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetShapeFn(tensorflow::shape_inference::NoOutputs)
-    .SetIsStateful()
-    .Doc(R"doc(
-Stage (key, values) in the underlying container which behaves like a ordered
-associative container.   Elements are ordered by key.
-
-key: int64
-values: a list of tensors
-dtypes A list of data types that inserted values should adhere to.
-capacity: Maximum number of elements in the Staging Area. If > 0, inserts
-  on the container will block when the capacity is reached.
-container: If non-empty, this queue is placed in the given container. Otherwise,
-  a default container is used.
-shared_name: It is necessary to match this name to the matching Unstage Op.
-)doc");
+    .SetIsStateful();
 
 REGISTER_OP("OrderedMapPeek")
     .Input("key: int64")
@@ -2409,13 +1262,7 @@ REGISTER_OP("OrderedMapPeek")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetShapeFn(tensorflow::shape_inference::UnknownShape)
-    .SetIsStateful()
-    .Doc(R"doc(
-Op peeks at the values at the specified key.  If the
-underlying container does not contain this key
-this op will block until it does.   This Op is optimized for
-performance.
-    )doc");
+    .SetIsStateful();
 
 REGISTER_OP("OrderedMapUnstage")
     .Input("key: int64")
@@ -2427,12 +1274,7 @@ REGISTER_OP("OrderedMapUnstage")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetShapeFn(tensorflow::shape_inference::UnknownShape)
-    .SetIsStateful()
-    .Doc(R"doc(
-Op removes and returns the values associated with the key
-from the underlying container.   If the underlying container
-does not contain this key, the op will block until it does.
-    )doc");
+    .SetIsStateful();
 
 REGISTER_OP("OrderedMapUnstageNoKey")
     .Input("indices: int32")
@@ -2444,12 +1286,7 @@ REGISTER_OP("OrderedMapUnstageNoKey")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetShapeFn(tensorflow::shape_inference::UnknownShape)
-    .SetIsStateful()
-    .Doc(R"doc(
-Op removes and returns the (key, value) element with the smallest
-key from the underlying container.   If the underlying container
-does not contain elements, the op will block until it does.
-      )doc");
+    .SetIsStateful();
 
 REGISTER_OP("OrderedMapSize")
     .Output("size: int32")
@@ -2459,10 +1296,7 @@ REGISTER_OP("OrderedMapSize")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetShapeFn(tensorflow::shape_inference::ScalarShape)
-    .SetIsStateful()
-    .Doc(R"doc(
-Op returns the number of elements in the underlying container.
-    )doc");
+    .SetIsStateful();
 
 REGISTER_OP("OrderedMapIncompleteSize")
     .Output("size: int32")
@@ -2472,10 +1306,7 @@ REGISTER_OP("OrderedMapIncompleteSize")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetShapeFn(tensorflow::shape_inference::ScalarShape)
-    .SetIsStateful()
-    .Doc(R"doc(
-Op returns the number of incomplete elements in the underlying container.
-    )doc");
+    .SetIsStateful();
 
 REGISTER_OP("OrderedMapClear")
     .Attr("capacity: int >= 0 = 0")
@@ -2484,10 +1315,7 @@ REGISTER_OP("OrderedMapClear")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetShapeFn(tensorflow::shape_inference::NoOutputs)
-    .SetIsStateful()
-    .Doc(R"doc(
-Op removes all elements in the underlying container.
-    )doc");
+    .SetIsStateful();
 
 REGISTER_OP("RecordInput")
     .Output("records: string")
@@ -2497,19 +1325,8 @@ REGISTER_OP("RecordInput")
     .Attr("file_buffer_size: int = 10000")
     .Attr("file_parallelism: int = 16")
     .Attr("batch_size: int = 32")
+    .Attr("compression_type: string = ''")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Emits randomized records.
-
-records: A tensor of shape [batch_size].
-file_pattern: Glob pattern for the data files.
-file_random_seed: Random seeds used to produce randomized records.
-file_shuffle_shift_ratio: Shifts the list of files after the list is randomly
-    shuffled.
-file_buffer_size: The randomization shuffling buffer.
-file_parallelism: How many sstables are opened and concurrently iterated over.
-batch_size: The batch size.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 6bf226e7a535f6c36b4b5b7e660641f22157829c..9e98f56c745a2b0b16531e2785e43ba8464d42b8 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -39,13 +39,10 @@ REGISTER_OP("TensorDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape)  // TODO(mrry): Validate that
-                                               // `components` have shapes
-                                               // compatible with
-                                               // `output_shapes`.
-    .Doc(R"doc(
-Creates a dataset that emits `components` as a tuple of tensors once.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);  // TODO(mrry): Validate that
+                                                // `components` have shapes
+                                                // compatible with
+                                                // `output_shapes`.
 
 REGISTER_OP("TensorSliceDataset")
     .Input("components: Toutput_types")
@@ -54,13 +51,10 @@ REGISTER_OP("TensorSliceDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape)  // TODO(mrry): Validate that the
-                                               // dim-0 slices of `components`
-                                               // have shapes compatible with
-                                               // `output_shapes`.
-    .Doc(R"doc(
-Creates a dataset that emits each dim-0 slice of `components` once.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);  // TODO(mrry): Validate that the
+                                                // dim-0 slices of `components`
+                                                // have shapes compatible with
+                                                // `output_shapes`.
 
 REGISTER_OP("SparseTensorSliceDataset")
     .Input("indices: int64")
@@ -70,10 +64,7 @@ REGISTER_OP("SparseTensorSliceDataset")
     .Attr("Tvalues: type")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that splits a SparseTensor into elements row-wise.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ZipDataset")
     .Input("input_datasets: N * variant")
@@ -81,10 +72,7 @@ REGISTER_OP("ZipDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("N: int >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that zips together `input_datasets`.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ConcatenateDataset")
     .Input("input_dataset: variant")
@@ -92,10 +80,7 @@ REGISTER_OP("ConcatenateDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that concatenates `input_dataset` with `another_dataset`.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("RepeatDataset")
     .Input("input_dataset: variant")
@@ -103,14 +88,8 @@ REGISTER_OP("RepeatDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)  // TODO(mrry): Validate the shape
-                                               // of `count`.
-    .Doc(R"doc(
-Creates a dataset that emits the outputs of `input_dataset` `count` times.
-
-count: A scalar representing the number of times that `input_dataset` should
-  be repeated. A value of `-1` indicates that it should be repeated infinitely.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);  // TODO(mrry): Validate the
+                                                // shape of `count`.
 
 REGISTER_OP("TakeDataset")
     .Input("input_dataset: variant")
@@ -118,14 +97,7 @@ REGISTER_OP("TakeDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that contains `count` elements from the `input_dataset`.
-
-count: A scalar representing the number of elements from the `input_dataset`
-  that should be taken. A value of `-1` indicates that all of `input_dataset`
-  is taken.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("SkipDataset")
     .Input("input_dataset: variant")
@@ -133,23 +105,7 @@ REGISTER_OP("SkipDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that skips `count` elements from the `input_dataset`.
-
-count: A scalar representing the number of elements from the `input_dataset`
-  that should be skipped.  If count is -1, skips everything.
-)doc");
-
-REGISTER_OP("IgnoreErrorsDataset")
-    .Input("input_dataset: variant")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that contains the elements of `input_dataset` ignoring errors.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("BytesProducedStatsDataset")
     .Input("input_dataset: variant")
@@ -157,10 +113,7 @@ REGISTER_OP("BytesProducedStatsDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Records the bytes size of each element of `input_dataset` in a StatsAggregator.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("LatencyStatsDataset")
     .Input("input_dataset: variant")
@@ -168,10 +121,7 @@ REGISTER_OP("LatencyStatsDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Records the latency of producing `input_dataset` elements in a StatsAggregator.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("MapDataset")
     .Input("input_dataset: variant")
@@ -181,10 +131,7 @@ REGISTER_OP("MapDataset")
     .Attr("Targuments: list(type) >= 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that applies `f` to the outputs of `input_dataset`.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ParallelMapDataset")
     .Input("input_dataset: variant")
@@ -195,16 +142,7 @@ REGISTER_OP("ParallelMapDataset")
     .Attr("Targuments: list(type) >= 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that applies `f` to the outputs of `input_dataset`.
-
-Unlike a "MapDataset", which applies `f` sequentially, this dataset invokes up
-to `num_parallel_calls` copies of `f` in parallel.
-
-num_parallel_calls: The number of concurrent invocations of `f` that process
-  elements from `input_dataset` in parallel.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("MapAndBatchDataset")
     .Input("input_dataset: variant")
@@ -216,21 +154,7 @@ REGISTER_OP("MapAndBatchDataset")
     .Attr("Targuments: list(type) >= 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that applies `f` to the outputs of `input_dataset` and then
-batches `batch_size` of them.
-
-Unlike a "MapDataset", which applies `f` sequentially, this dataset invokes up
-to `batch_size * num_parallel_batches` copies of `f` in parallel.
-
-batch_size: A scalar representing the number of elements to accumulate in a
-  batch. It determines the number of concurrent invocations of `f` that process
-  elements from `input_dataset` in parallel.
-num_parallel_batches: A scalar representing the number of batches to create in
-  parallel. Processing multiple batches in parallel benefits workloads prone to
-  stragglers.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("PrefetchDataset")
     .Input("input_dataset: variant")
@@ -238,13 +162,7 @@ REGISTER_OP("PrefetchDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that asynchronously prefetches elements from `input_dataset`.
-
-buffer_size: The maximum number of elements to buffer in an iterator over
-  this dataset.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ScanDataset")
     .Input("input_dataset: variant")
@@ -256,10 +174,7 @@ REGISTER_OP("ScanDataset")
     .Attr("Targuments: list(type) >= 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset successively reduces `f` over the elements of `input_dataset`.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("FlatMapDataset")
     .Input("input_dataset: variant")
@@ -269,18 +184,7 @@ REGISTER_OP("FlatMapDataset")
     .Attr("Targuments: list(type) >= 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that applies `f` to the outputs of `input_dataset`.
-
-Unlike MapDataset, the `f` in FlatMapDataset is expected to return a
-Dataset variant, and FlatMapDataset will flatten successive results
-into a single Dataset.
-
-f: A function mapping elements of `input_dataset`, concatenated with
-  `other_arguments`, to a Dataset variant that contains elements matching
-  `output_types` and `output_shapes`.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("InterleaveDataset")
     .Input("input_dataset: variant")
@@ -292,20 +196,7 @@ REGISTER_OP("InterleaveDataset")
     .Attr("Targuments: list(type) >= 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that applies `f` to the outputs of `input_dataset`.
-
-Unlike MapDataset, the `f` in InterleaveDataset is expected to return
-a Dataset variant, and InterleaveDataset will flatten successive
-results into a single Dataset. Unlike FlatMapDataset,
-InterleaveDataset will interleave sequences of up to `block_length`
-consecutive elements from `cycle_length` input elements.
-
-f: A function mapping elements of `input_dataset`, concatenated with
-  `other_arguments`, to a Dataset variant that contains elements matching
-  `output_types` and `output_shapes`.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ParallelInterleaveDataset")
     .Input("input_dataset: variant")
@@ -313,27 +204,14 @@ REGISTER_OP("ParallelInterleaveDataset")
     .Input("cycle_length: int64")
     .Input("block_length: int64")
     .Input("sloppy: bool")
+    .Input("buffer_output_elements: int64")
+    .Input("prefetch_input_elements: int64")
     .Output("handle: variant")
     .Attr("f: func")
     .Attr("Targuments: list(type) >= 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that applies `f` to the outputs of `input_dataset`.
-
-The resulting dataset is similar to the `InterleaveDataset`, with the exception
-that if retrieving the next value from a dataset would cause the requester to
-block, it will skip that input dataset. This dataset is especially useful
-when loading data from a variable-latency datastores (e.g. HDFS, GCS), as it
-allows the training step to proceed so long as some data is available.
-
-!! WARNING !! This dataset is not deterministic!
-
-f: A function mapping elements of `input_dataset`, concatenated with
-   `other_arguments`, to a Dataset variant that contains elements matching
-   `output_types` and `output_shapes`.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("GroupByWindowDataset")
     .Input("input_dataset: variant")
@@ -350,15 +228,7 @@ REGISTER_OP("GroupByWindowDataset")
     .Attr("Twindow_size_func_other_arguments: list(type) >= 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that computes a windowed group-by on `input_dataset`.
-
-// TODO(mrry): Support non-int64 keys.
-
-key_func: A function mapping an element of `input_dataset`, concatenated
-  with `key_func_other_arguments` to a scalar value of type DT_INT64.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("FilterDataset")
     .Input("input_dataset: variant")
@@ -368,20 +238,7 @@ REGISTER_OP("FilterDataset")
     .Attr("Targuments: list(type) >= 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset containing elements of `input_dataset` matching `predicate`.
-
-The `predicate` function must return a scalar boolean and accept the
-following arguments:
-
-* One tensor for each component of an element of `input_dataset`.
-* One tensor for each value in `other_arguments`.
-
-predicate: A function returning a scalar boolean.
-other_arguments: A list of tensors, typically values that were captured when
-  building a closure for `predicate`.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("BatchDataset")
     .Input("input_dataset: variant")
@@ -389,13 +246,7 @@ REGISTER_OP("BatchDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that batches `batch_size` elements from `input_dataset`.
-
-batch_size: A scalar representing the number of elements to accumulate in a
-  batch.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("PaddedBatchDataset")
     .Input("input_dataset: variant")
@@ -406,50 +257,26 @@ REGISTER_OP("PaddedBatchDataset")
     .Attr("Toutput_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("N: int >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)  // TODO(mrry): Validate that
-                                               // `padded_shapes` are all
-                                               // vectors, the lengths of
-                                               // `output_types` and
-                                               // `output_shapes` are `N`,
-                                               // the `output_shapes` are (as
-                                               // far as possible to tell
-                                               // statically) compatible with
-                                               // `padded_shapes`, and
-                                               // that `padding_values` are
-                                               // all scalars.
-    .Doc(R"doc(
-Creates a dataset that batches and pads `batch_size` elements from the input.
-
-batch_size: A scalar representing the number of elements to accumulate in a
-  batch.
-padded_shapes: A list of int64 tensors representing the desired padded shapes
-  of the corresponding output components. These shapes may be partially
-  specified, using `-1` to indicate that a particular dimension should be
-  padded to the maximum size of all batch elements.
-padding_values: A list of scalars containing the padding value to use for
-  each of the outputs.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);  // TODO(mrry): Validate that
+                                                // `padded_shapes` are all
+                                                // vectors, the lengths of
+                                                // `output_types` and
+                                                // `output_shapes` are `N`,
+                                                // the `output_shapes` are (as
+                                                // far as possible to tell
+                                                // statically) compatible with
+                                                // `padded_shapes`, and
+                                                // that `padding_values` are
+                                                // all scalars.
 
 REGISTER_OP("DenseToSparseBatchDataset")
     .Input("input_dataset: variant")
     .Input("batch_size: int64")
     .Input("row_shape: int64")
     .Output("handle: variant")
-    // NOTE(mrry): the 0th and 2nd elements will be DT_INT64.
     .Attr("output_types: list(type) >= 1")
-    // NOTE(mrry): the 1st and 2nd elements will be vectors.
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that yields a SparseTensor for each element of the input.
-
-input_dataset: A handle to an input dataset. Must have a single component.
-batch_size: A scalar representing the number of elements to accumulate in a
-  batch.
-row_shape: A vector representing the dense shape of each row in the produced
-  SparseTensor. The shape may be partially specified, using `-1` to indicate
-  that a particular dimension should use the maximum size of all batch elements.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("RangeDataset")
     .Input("start: int64")
@@ -460,14 +287,17 @@ REGISTER_OP("RangeDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset with a range of values. Corresponds to python's xrange.
+    .SetShapeFn(shape_inference::ScalarShape);
 
-start: corresponds to start in python's xrange().
-stop: corresponds to stop in python's xrange().
-step: corresponds to step in python's xrange().
-)doc");
+REGISTER_OP("RandomDataset")
+    .Input("seed: int64")
+    .Input("seed2: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+                      // stateful to inhibit constant folding.
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ShuffleDataset")
     .Input("input_dataset: variant")
@@ -478,23 +308,18 @@ REGISTER_OP("ShuffleDataset")
     .Attr("reshuffle_each_iteration: bool = true")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that shuffles elements from `input_dataset` pseudorandomly.
-
-buffer_size: The number of output elements to buffer in an iterator over
-  this dataset. Compare with the `min_after_dequeue` attr when creating a
-  `RandomShuffleQueue`.
-reshuffle_each_iteration: If true, each iterator over this dataset will be given
-  a different pseudorandomly generated seed, based on a sequence seeded by the
-  `seed` and `seed2` inputs. If false, each iterator will be given the same
-  seed, and repeated iteration over this dataset will yield the exact same
-  sequence of results.
-seed: A scalar seed for the random number generator. If either seed or
-  seed2 is set to be non-zero, the random number generator is seeded
-  by the given seed.  Otherwise, a random seed is used.
-seed2: A second scalar seed to avoid seed collision.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ShuffleAndRepeatDataset")
+    .Input("input_dataset: variant")
+    .Input("buffer_size: int64")
+    .Input("seed: int64")
+    .Input("seed2: int64")
+    .Input("count: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("CacheDataset")
     .Input("input_dataset: variant")
@@ -502,18 +327,14 @@ REGISTER_OP("CacheDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that caches elements from `input_dataset`.
-
-A CacheDataset will iterate over the input_dataset, and store tensors. If the
-cache already exists, the cache will be used. If the cache is inappropriate
-(e.g. cannot be opened, contains tensors of the wrong shape / size), an error
-will the returned when used.
+    .SetShapeFn(shape_inference::ScalarShape);
 
-filename: A path on the filesystem where we should cache the dataset. Note: this
-  will be a directory.
-)doc");
+REGISTER_OP("UniqueDataset")
+    .Input("input_dataset: variant")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("TextLineDataset")
     .Input("filenames: string")
@@ -522,19 +343,10 @@ REGISTER_OP("TextLineDataset")
     .Output("handle: variant")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape)  // TODO(mrry): validate
-                                               // that `filenames` is
-                                               // a scalar or a
-                                               // vector.
-    .Doc(R"doc(
-Creates a dataset that emits the lines of one or more text files.
-
-filenames: A scalar or a vector containing the name(s) of the file(s) to be
-  read.
-compression_type: A scalar containing either (i) the empty string (no
-  compression), (ii) "ZLIB", or (iii) "GZIP".
-buffer_size: A scalar containing the number of bytes to buffer.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);  // TODO(mrry): validate
+                                                // that `filenames` is
+                                                // a scalar or a
+                                                // vector.
 
 REGISTER_OP("SqlDataset")
     .Input("driver_name: string")
@@ -545,14 +357,7 @@ REGISTER_OP("SqlDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that executes a SQL query and emits rows of the result set.
-
-driver_name: The database type. Currently, the only supported type is 'sqlite'.
-data_source_name: A connection string to connect to the database.
-query: A SQL query to execute.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("FixedLengthRecordDataset")
     .Input("filenames: string")
@@ -563,19 +368,7 @@ REGISTER_OP("FixedLengthRecordDataset")
     .Output("handle: variant")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that emits the records from one or more binary files.
-
-filenames: A scalar or a vector containing the name(s) of the file(s) to be
-  read.
-header_bytes: A scalar representing the number of bytes to skip at the
-  beginning of a file.
-record_bytes: A scalar representing the number of bytes in each record.
-footer_bytes: A scalar representing the number of bytes to skip at the end
-  of a file.
-buffer_size: A scalar representing the number of bytes to buffer. Must be > 0.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("TFRecordDataset")
     .Input("filenames: string")
@@ -584,17 +377,7 @@ REGISTER_OP("TFRecordDataset")
     .Output("handle: variant")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Creates a dataset that emits the records from one or more TFRecord files.
-
-filenames: A scalar or vector containing the name(s) of the file(s) to be
-  read.
-compression_type: A scalar containing either (i) the empty string (no
-  compression), (ii) "ZLIB", or (iii) "GZIP".
-buffer_size: A scalar representing the number of bytes to buffer. A value of
-  0 means no buffering will be performed.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("Iterator")
     .Output("handle: resource")
@@ -602,24 +385,12 @@ REGISTER_OP("Iterator")
     .Attr("container: string")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-A container for an iterator resource.
-
-handle: A handle to the iterator that can be passed to a "MakeIterator"
-  or "IteratorGetNext" op.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("MakeIterator")
     .Input("dataset: variant")
     .Input("iterator: resource")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Makes a new iterator from the given `dataset` and stores it in `iterator`.
-
-This operation may be executed multiple times. Each execution will reset the
-iterator in `iterator` to the first element of `dataset`.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("OneShotIterator")
     .Output("handle: resource")
@@ -629,166 +400,113 @@ REGISTER_OP("OneShotIterator")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Makes a "one-shot" iterator that can be iterated only once.
-
-A one-shot iterator bundles the logic for defining the dataset and
-the state of the iterator in a single op, which allows simple input
-pipelines to be defined without an additional initialization
-("MakeIterator") step.
-
-One-shot iterators have the following limitations:
-
-* They do not support parameterization: all logic for creating the underlying
-  dataset must be bundled in the `dataset_factory` function.
-* They are not resettable. Once a one-shot iterator reaches the end of its
-  underlying dataset, subsequent "IteratorGetNext" operations on that
-  iterator will always produce an `OutOfRange` error.
-
-For greater flexibility, use "Iterator" and "MakeIterator" to define
-an iterator using an arbitrary subgraph, which may capture tensors
-(including fed values) as parameters, and which may be reset multiple
-times by rerunning "MakeIterator".
-
-handle: A handle to the iterator that can be passed to an "IteratorGetNext"
-  op.
-dataset_factory: A function of type `() -> DT_VARIANT`, where the returned
-  DT_VARIANT is a dataset.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
+
+namespace {
+
+Status IteratorGetNextShapeFn(shape_inference::InferenceContext* c) {
+  shape_inference::ShapeHandle unused;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+  std::vector<PartialTensorShape> output_shapes;
+  TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
+  if (output_shapes.size() != c->num_outputs()) {
+    return errors::InvalidArgument(
+        "`output_shapes` must be the same length as `output_types` (",
+        output_shapes.size(), " vs. ", c->num_outputs());
+  }
+  for (size_t i = 0; i < output_shapes.size(); ++i) {
+    shape_inference::ShapeHandle output_shape_handle;
+    TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(
+        output_shapes[i], &output_shape_handle));
+    c->set_output(static_cast<int>(i), output_shape_handle);
+  }
+  return Status::OK();
+}
+
+}  // namespace
 
 REGISTER_OP("IteratorGetNext")
     .Input("iterator: resource")
     .Output("components: output_types")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle unused;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
-      std::vector<PartialTensorShape> output_shapes;
-      TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
-      if (output_shapes.size() != c->num_outputs()) {
-        return errors::InvalidArgument(
-            "`output_shapes` must be the same length as `output_types` (",
-            output_shapes.size(), " vs. ", c->num_outputs());
-      }
-      for (size_t i = 0; i < output_shapes.size(); ++i) {
-        shape_inference::ShapeHandle output_shape_handle;
-        TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(
-            output_shapes[i], &output_shape_handle));
-        c->set_output(static_cast<int>(i), output_shape_handle);
-      }
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Gets the next output from the given iterator.
-)doc");
+    .SetShapeFn(IteratorGetNextShapeFn);
+
+REGISTER_OP("IteratorGetNextSync")
+    .Input("iterator: resource")
+    .Output("components: output_types")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(IteratorGetNextShapeFn);
 
 REGISTER_OP("DatasetToSingleElement")
     .Input("dataset: variant")
     .Output("components: output_types")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle unused;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
-      std::vector<PartialTensorShape> output_shapes;
-      TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
-      if (output_shapes.size() != c->num_outputs()) {
-        return errors::InvalidArgument(
-            "`output_shapes` must be the same length as `output_types` (",
-            output_shapes.size(), " vs. ", c->num_outputs());
-      }
-      for (size_t i = 0; i < output_shapes.size(); ++i) {
-        shape_inference::ShapeHandle output_shape_handle;
-        TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(
-            output_shapes[i], &output_shape_handle));
-        c->set_output(static_cast<int>(i), output_shape_handle);
-      }
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Outputs the single element from the given dataset.
-
-dataset: A handle to a dataset that contains a single element.
-components: The components of the single element of `input`.
-)doc");
+    .SetShapeFn(IteratorGetNextShapeFn);
 
 REGISTER_OP("IteratorToStringHandle")
     .Input("resource_handle: resource")
     .Output("string_handle: string")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Converts the given `resource_handle` representing an iterator to a string.
-
-resource_handle: A handle to an iterator resource.
-string_handle: A string representation of the given handle.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("IteratorFromStringHandle")
     .Input("string_handle: string")
     .Output("resource_handle: resource")
     .Attr("output_types: list(type) >= 0 = []")
     .Attr("output_shapes: list(shape) >= 0 = []")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Converts the given string representing a handle to an iterator to a resource.
-
-string_handle: A string representation of the given handle.
-resource_handle: A handle to an iterator resource.
-output_types: If specified, defines the type of each tuple component in an
-  element produced by the resulting iterator.
-output_shapes: If specified, defines the shape of each tuple component in an
-  element produced by the resulting iterator.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("SerializeIterator")
     .Input("resource_handle: resource")
     .Output("serialized: variant")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Converts the given `resource_handle` representing an iterator to a variant tensor.
-
-resource_handle: A handle to an iterator resource.
-serialized: A variant tensor storing the state of the iterator contained in the
-  resource.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("DeserializeIterator")
     .Input("resource_handle: resource")
     .Input("serialized: variant")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Converts the given variant tensor to an iterator and stores it in the given resource.
-
-resource_handle: A handle to an iterator resource.
-serialized: A variant tensor storing the state of the iterator contained in the
-  resource.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("StatsAggregatorHandle")
     .Output("handle: resource")
     .SetShapeFn(shape_inference::ScalarShape)
     .Attr("container: string = ''")
-    .Attr("shared_name: string = ''")
-    .Doc(R"doc(
-Creates a statistics manager resource.
-)doc");
+    .Attr("shared_name: string = ''");
 
 REGISTER_OP("IteratorSetStatsAggregator")
     .Input("iterator_handle: resource")
     .Input("stats_aggregator_handle: resource")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Associates the given iterator with the given statistics aggregator.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("StatsAggregatorSummary")
     .Input("iterator: resource")
     .Output("summary: string")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Produces a summary of any statistics recorded by the given statistics manager.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("PrependFromQueueAndPaddedBatchDataset")
+    .Input("input_dataset: variant")
+    .Input("batch_size: int64")
+    .Input("padded_shapes: N * int64")
+    .Input("padding_values: Toutput_types")
+    .Output("handle: variant")
+    .Attr("Toutput_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("N: int >= 1")
+    // TODO(ebrevdo): Validate that `padded_shapes` are all vectors, the lengths
+    // of `Toutput_types` and `output_shapes` are `N`, that the
+    // length of `output_types` is `N`, the `output_shapes` are
+    // (as far as possible to tell statically) compatible with `padded_shapes`,
+    // and that `padding_values` are all scalars.
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("EnqueueInQueueDataset")
+    .Input("queue: variant")
+    .Input("components: Tcomponents")
+    .Attr("Tcomponents: list(type) >= 1")
+    .SetIsStateful()  // To avoid CSE on multiple calls to Enqueue.
+    // TODO(ebrevdo): SetShapeFn to test input dtypes and shapes by
+    // reading from queue handle (is that even possible?).
+    .SetShapeFn(shape_inference::NoOutputs);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/functional_grad.cc b/tensorflow/core/ops/functional_grad.cc
index 6df3536795ce7772faef72d63e0cb276719d7b44..eeccb72da65d7cef1073f54bf7f639436f69e930 100644
--- a/tensorflow/core/ops/functional_grad.cc
+++ b/tensorflow/core/ops/functional_grad.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/framework/function.h"
 #include <vector>
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/ops/functional_ops.cc b/tensorflow/core/ops/functional_ops.cc
index 5fd21ec88faef160d99122f1b8bc6e2f877d8694..9e18d20db65075e471862def3924811b260f8a08 100644
--- a/tensorflow/core/ops/functional_ops.cc
+++ b/tensorflow/core/ops/functional_ops.cc
@@ -38,33 +38,7 @@ REGISTER_OP("SymbolicGradient")
         c->set_output(i, c->input(i));
       }
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the gradient function for function f via backpropagation.
-
-input: a list of input tensors of size N + M;
-output: a list of output tensors of size N;
-Tin: the type list for the input list.
-Tout: the type list for the input list.
-f: The function we want to compute the gradient for.
-
-The function 'f' must be a numerical function which takes N inputs and
-produces M outputs. Its gradient function 'g', which is computed by
-this SymbolicGradient op is a function taking N + M inputs and
-produces N outputs.
-
-I.e. if we have
-   (y1, y2, ..., y_M) = f(x1, x2, ..., x_N),
-then, g is
-   (dL/dx1, dL/dx2, ..., dL/dx_N) = g(x1, x2, ..., x_N,
-                                     dL/dy1, dL/dy2, ..., dL/dy_M),
-
-where L is a scalar-value function of (x1, x2, ..., xN) (e.g., the
-loss function). dL/dx_i is the partial derivative of L with respect
-to x_i.
-
-(Needs some math expert to say the comment above better.)
-)doc");
+    });
 
 REGISTER_OP("RemoteCall")
     .Input("target: string")
@@ -73,15 +47,64 @@ REGISTER_OP("RemoteCall")
     .Attr("Tin: list(type)")
     .Attr("Tout: list(type)")
     .Attr("f: func")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("_If")
+    .Input("cond: Tcond")
+    .Input("input: Tin")
+    .Output("output: Tout")
+    .Attr("Tcond: type")
+    .Attr("Tin: list(type)")
+    .Attr("Tout: list(type)")
+    .Attr("then_branch: func")
+    .Attr("else_branch: func")
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
-Runs function `f` on a remote device indicated by `target`.
+output = cond ? then_branch(input) : else_branch(input)
 
-target: A fully specified device name where we want to run the function.
-args: A list of arguments for the function.
-output: A list of return values.
-Tin: The type list for the arguments.
-Tout: The type list for the return values.
-f: The function to run remotely.
+cond: A Tensor. If the tensor is a scalar of non-boolean type, the
+    scalar is converted to a boolean according to the
+    following rule: if the scalar is a numerical value, non-zero means
+    True and zero means False; if the scalar is a string, non-empty
+    means True and empty means False. If the tensor is not a scalar,
+    being empty means False and being non-empty means True.
+input: A list of input tensors.
+then_branch: A function that takes 'inputs' and returns a list of
+    tensors, whose types are the same as what else_branch returns.
+else_branch: A function that takes 'inputs' and returns a list of
+    tensors.  whose types are the same as what then_branch returns.
 )doc");
+
+// TODO(b/37549631) setting the While Op to always be stateful is too
+// conservative.
+REGISTER_OP("_While")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: list(type) >= 0")
+    .Attr("cond: func")
+    .Attr("body: func")
+    .SetIsStateful()
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      for (int i = 0; i < c->num_outputs(); ++i) {
+        c->set_output(i, c->input(i));
+      }
+      return Status::OK();
+    })
+    .Doc(R"doc(
+output = input; While (Cond(output)) { output = Body(output) }
+
+input: A list of input tensors whose types are T.
+output: A list of output tensors whose types are T.
+cond: A function takes 'input' and returns a tensor.  If the tensor is
+    a scalar of non-boolean, the scalar is converted to a boolean
+    according to the following rule: if the scalar is a numerical
+    value, non-zero means True and zero means False; if the scalar is
+    a string, non-empty means True and empty means False. If the
+    tensor is not a scalar, non-emptiness means True and False
+    otherwise.
+body: A function that takes a list of tensors and returns another
+      list of tensors. Both lists have the same types as specified
+      by T.
+)doc");
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index 13fbd2fa515c5a7e0ec06cdc4c585f4dc691a928..c3b08e067a2c35432e45f98ef9d57af629b90e02 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -25,42 +25,6 @@ using shape_inference::ShapeHandle;
 
 namespace {
 
-const char kDecodeJpegCommonDocStr[] = R"doc(
-The attr `channels` indicates the desired number of color channels for the
-decoded image.
-
-Accepted values are:
-
-*   0: Use the number of channels in the JPEG-encoded image.
-*   1: output a grayscale image.
-*   3: output an RGB image.
-
-If needed, the JPEG-encoded image is transformed to match the requested number
-of color channels.
-
-The attr `ratio` allows downscaling the image by an integer factor during
-decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-downscaling the image later.
-
-)doc";
-
-const char kDecodeJpegCommonParamsDocStr[] = R"doc(
-channels: Number of color channels for the decoded image.
-ratio: Downscaling ratio.
-fancy_upscaling: If true use a slower but nicer upscaling of the
-  chroma planes (yuv420/422 only).
-try_recover_truncated:  If true try to recover an image from truncated input.
-acceptable_fraction: The minimum required fraction of lines before a truncated
-  input is accepted.
-dct_method: string specifying a hint about the algorithm used for
-  decompression.  Defaults to "" which maps to a system-specific
-  default.  Currently valid values are ["INTEGER_FAST",
-  "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-  jpeg library changes to a version that does not have that specific
-  option.)
-image: 3-D with shape `[height, width, channels]`..
-)doc";
-
 // Sets output[0] to shape [batch_dim,height,width,channel_dim], where
 // height and width come from the size_tensor.
 Status SetOutputToSizedImage(InferenceContext* c, DimensionHandle batch_dim,
@@ -153,26 +117,7 @@ REGISTER_OP("ResizeArea")
     .Output("resized_images: float")
     .Attr("T: {int8, uint8, int16, uint16, int32, int64, half, float, double}")
     .Attr("align_corners: bool = false")
-    .SetShapeFn(ResizeShapeFn)
-    .Doc(R"doc(
-Resize `images` to `size` using area interpolation.
-
-Input images can be of different types but output images are always float.
-
-Each output pixel is computed by first transforming the pixel's footprint into
-the input tensor and then averaging the pixels that intersect the footprint. An
-input pixel's contribution to the average is weighted by the fraction of its
-area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
-
-images: 4-D with shape `[batch, height, width, channels]`.
-size:= A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-  new size for the images.
-align_corners: If true, rescale input by (new_height - 1) / (height - 1), which
-  exactly aligns the 4 corners of images and resized images. If false, rescale
-  by new_height / height. Treat similarly the width dimension.
-resized_images: 4-D with shape
-  `[batch, new_height, new_width, channels]`.
-)doc");
+    .SetShapeFn(ResizeShapeFn);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("ResizeBicubic")
@@ -181,21 +126,7 @@ REGISTER_OP("ResizeBicubic")
     .Output("resized_images: float")
     .Attr("T: {int8, uint8, int16, uint16, int32, int64, half, float, double}")
     .Attr("align_corners: bool = false")
-    .SetShapeFn(ResizeShapeFn)
-    .Doc(R"doc(
-Resize `images` to `size` using bicubic interpolation.
-
-Input images can be of different types but output images are always float.
-
-images: 4-D with shape `[batch, height, width, channels]`.
-size:= A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-  new size for the images.
-align_corners: If true, rescale input by (new_height - 1) / (height - 1), which
-  exactly aligns the 4 corners of images and resized images. If false, rescale
-  by new_height / height. Treat similarly the width dimension.
-resized_images: 4-D with shape
-  `[batch, new_height, new_width, channels]`.
-)doc");
+    .SetShapeFn(ResizeShapeFn);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("ResizeBicubicGrad")
@@ -207,43 +138,18 @@ REGISTER_OP("ResizeBicubicGrad")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->input(1));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the gradient of bicubic interpolation.
-
-grads: 4-D with shape `[batch, height, width, channels]`.
-original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-  The image tensor that was resized.
-align_corners: If true, rescale grads by (orig_height - 1) / (height - 1), which
-  exactly aligns the 4 corners of grads and original_image. If false, rescale by
-  orig_height / height. Treat similarly the width dimension.
-output: 4-D with shape `[batch, orig_height, orig_width, channels]`.
-  Gradients with respect to the input image. Input image must have been
-  float or double.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("ResizeBilinear")
     .Input("images: T")
     .Input("size: int32")
     .Output("resized_images: float")
-    .Attr("T: {int8, uint8, int16, uint16, int32, int64, half, float, double}")
+    .Attr(
+        "T: {int8, uint8, int16, uint16, int32, int64, bfloat16, half, "
+        "float, double}")
     .Attr("align_corners: bool = false")
-    .SetShapeFn(ResizeShapeFn)
-    .Doc(R"doc(
-Resize `images` to `size` using bilinear interpolation.
-
-Input images can be of different types but output images are always float.
-
-images: 4-D with shape `[batch, height, width, channels]`.
-size:= A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-  new size for the images.
-align_corners: If true, rescale input by (new_height - 1) / (height - 1), which
-  exactly aligns the 4 corners of images and resized images. If false, rescale
-  by new_height / height. Treat similarly the width dimension.
-resized_images: 4-D with shape
-  `[batch, new_height, new_width, channels]`.
-)doc");
+    .SetShapeFn(ResizeShapeFn);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("QuantizedResizeBilinear")
@@ -265,46 +171,19 @@ REGISTER_OP("QuantizedResizeBilinear")
       c->set_output(1, c->MakeShape({}));
       c->set_output(2, c->MakeShape({}));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Resize quantized `images` to `size` using quantized bilinear interpolation.
-
-Input images and output images must be quantized types.
-
-images: 4-D with shape `[batch, height, width, channels]`.
-size:= A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-  new size for the images.
-align_corners: If true, rescale input by (new_height - 1) / (height - 1), which
-  exactly aligns the 4 corners of images and resized images. If false, rescale
-  by new_height / height. Treat similarly the width dimension.
-resized_images: 4-D with shape
-  `[batch, new_height, new_width, channels]`.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("ResizeBilinearGrad")
     .Input("grads: float")
     .Input("original_image: T")
     .Output("output: T")
-    .Attr("T: {float, half, double}")
+    .Attr("T: {float, bfloat16, half, double}")
     .Attr("align_corners: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->input(1));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the gradient of bilinear interpolation.
-
-grads: 4-D with shape `[batch, height, width, channels]`.
-original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-  The image tensor that was resized.
-align_corners: If true, rescale grads by (orig_height - 1) / (height - 1), which
-  exactly aligns the 4 corners of grads and original_image. If false, rescale by
-  orig_height / height. Treat similarly the width dimension.
-output: 4-D with shape `[batch, orig_height, orig_width, channels]`.
-  Gradients with respect to the input image. Input image must have been
-  float or double.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("ResizeNearestNeighbor")
@@ -313,19 +192,7 @@ REGISTER_OP("ResizeNearestNeighbor")
     .Output("resized_images: T")
     .Attr("T: {int8, uint8, int16, uint16, int32, int64, half, float, double}")
     .Attr("align_corners: bool = false")
-    .SetShapeFn(ResizeShapeFn)
-    .Doc(R"doc(
-Resize `images` to `size` using nearest neighbor interpolation.
-
-images: 4-D with shape `[batch, height, width, channels]`.
-size:= A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-  new size for the images.
-align_corners: If true, rescale input by (new_height - 1) / (height - 1), which
-  exactly aligns the 4 corners of images and resized images. If false, rescale
-  by new_height / height. Treat similarly the width dimension.
-resized_images: 4-D with shape
-  `[batch, new_height, new_width, channels]`.
-)doc");
+    .SetShapeFn(ResizeShapeFn);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("ResizeNearestNeighborGrad")
@@ -354,19 +221,7 @@ REGISTER_OP("ResizeNearestNeighborGrad")
       }
       c->set_output(0, input);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the gradient of nearest neighbor interpolation.
-
-grads: 4-D with shape `[batch, height, width, channels]`.
-size:= A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
-  original input size.
-align_corners: If true, rescale grads by (orig_height - 1) / (height - 1), which
-  exactly aligns the 4 corners of grads and original_image. If false, rescale by
-  orig_height / height. Treat similarly the width dimension.
-output: 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
-  with respect to the input image.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("RandomCrop")
@@ -399,25 +254,7 @@ REGISTER_OP("RandomCrop")
       }
       c->set_output(0, c->MakeShape({h, w, channels}));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Randomly crop `image`.
-
-`size` is a 1-D int64 tensor with 2 elements representing the crop height and
-width.  The values must be non negative.
-
-This Op picks a random location in `image` and crops a `height` by `width`
-rectangle from that location.  The random location is picked so the cropped
-area will fit inside the original image.
-
-image: 3-D of shape `[height, width, channels]`.
-size: 1-D of length 2 containing: `crop_height`, `crop_width`..
-seed: If either seed or seed2 are set to be non-zero, the random number
-  generator is seeded by the given seed.  Otherwise, it is seeded by a
-  random seed.
-seed2: An second seed to avoid seed collision.
-output: 3-D of shape `[crop_height, crop_width, channels].`
-)doc");
+    });
 // TODO(shlens): Support variable rank in RandomCrop.
 
 // --------------------------------------------------------------------------
@@ -430,17 +267,7 @@ REGISTER_OP("DecodeJpeg")
     .Attr("acceptable_fraction: float = 1.0")
     .Attr("dct_method: string = ''")
     .Output("image: uint8")
-    .SetShapeFn(DecodeImageShapeFn)
-    .Doc(strings::StrCat(R"doc(
-Decode a JPEG-encoded image to a uint8 tensor.
-)doc",
-                         kDecodeJpegCommonDocStr, R"doc(
-This op also supports decoding PNGs and non-animated GIFs since the interface is
-the same, though it is cleaner to use `tf.image.decode_image`.
-
-contents: 0-D.  The JPEG-encoded image.
-)doc",
-                         kDecodeJpegCommonParamsDocStr));
+    .SetShapeFn(DecodeImageShapeFn);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("DecodeAndCropJpeg")
@@ -482,18 +309,7 @@ REGISTER_OP("DecodeAndCropJpeg")
       }
       c->set_output(0, c->MakeShape({h, w, channels_dim}));
       return Status::OK();
-    })
-    .Doc(strings::StrCat(R"doc(
-Decode and Crop a JPEG-encoded image to a uint8 tensor.
-)doc",
-                         kDecodeJpegCommonDocStr, R"doc(
-It is equivalent to a combination of decode and crop, but much faster by only
-decoding partial jpeg image.
-
-contents: 0-D.  The JPEG-encoded image.
-crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
-)doc",
-                         kDecodeJpegCommonParamsDocStr));
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("EncodeJpeg")
@@ -508,40 +324,7 @@ REGISTER_OP("EncodeJpeg")
     .Attr("y_density: int = 300")
     .Attr("xmp_metadata: string = ''")
     .Output("contents: string")
-    .SetShapeFn(EncodeImageShapeFn)
-    .Doc(R"doc(
-JPEG-encode an image.
-
-`image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
-
-The attr `format` can be used to override the color format of the encoded
-output.  Values can be:
-
-*   `''`: Use a default format based on the number of channels in the image.
-*   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
-    of `image` must be 1.
-*   `rgb`: Output an RGB JPEG image. The `channels` dimension
-    of `image` must be 3.
-
-If `format` is not specified or is the empty string, a default format is picked
-in function of the number of channels in `image`:
-
-*   1: Output a grayscale image.
-*   3: Output an RGB image.
-
-image: 3-D with shape `[height, width, channels]`.
-format: Per pixel image format.
-quality: Quality of the compression from 0 to 100 (higher is better and slower).
-progressive: If True, create a JPEG that loads progressively (coarse to fine).
-optimize_size: If True, spend CPU/RAM to reduce size with no quality change.
-chroma_downsampling: See http://en.wikipedia.org/wiki/Chroma_subsampling.
-density_unit: Unit used to specify `x_density` and `y_density`:
-   pixels per inch (`'in'`) or centimeter (`'cm'`).
-x_density: Horizontal pixels per density unit.
-y_density: Vertical pixels per density unit.
-xmp_metadata: If not empty, embed this XMP metadata in the image header.
-contents: 0-D. JPEG-encoded image.
-)doc");
+    .SetShapeFn(EncodeImageShapeFn);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("ExtractJpegShape")
@@ -553,17 +336,7 @@ REGISTER_OP("ExtractJpegShape")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
       c->set_output(0, c->Vector(3));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Extract the shape information of a JPEG-encoded image.
-
-This op only parses the image header, so it is much faster than DecodeJpeg.
-
-contents: 0-D. The JPEG-encoded image.
-image_shape: 1-D. The image shape with format [height, width, channels].
-output_type: (Optional) The output type of the operation (int32 or int64).
-    Defaults to int32.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("AdjustContrast")
@@ -576,10 +349,7 @@ REGISTER_OP("AdjustContrast")
     .Deprecated(2, "Use AdjustContrastv2 instead")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 3);
-    })
-    .Doc(R"Doc(
-Deprecated. Disallowed in GraphDef version >= 2.
-)Doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("AdjustContrastv2")
@@ -588,24 +358,7 @@ REGISTER_OP("AdjustContrastv2")
     .Output("output: float")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 3);
-    })
-    .Doc(R"Doc(
-Adjust the contrast of one or more images.
-
-`images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
-interpreted as `[height, width, channels]`.  The other dimensions only
-represent a collection of images, such as `[batch, height, width, channels].`
-
-Contrast is adjusted independently for each channel of each image.
-
-For each channel, the Op first computes the mean of the image pixels in the
-channel and then adjusts each component of each pixel to
-`(x - mean) * contrast_factor + mean`.
-
-images: Images to adjust.  At least 3-D.
-contrast_factor: A float multiplier for adjusting contrast.
-output: The contrast-adjusted image or images.
-)Doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("AdjustHue")
@@ -614,21 +367,7 @@ REGISTER_OP("AdjustHue")
     .Output("output: float")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 3);
-    })
-    .Doc(R"Doc(
-Adjust the hue of one or more images.
-
-`images` is a tensor of at least 3 dimensions.  The last dimension is
-interpretted as channels, and must be three.
-
-The input image is considered in the RGB colorspace. Conceptually, the RGB
-colors are first mapped into HSV. A delta is then applied all the hue values,
-and then remapped back to RGB colorspace.
-
-images: Images to adjust.  At least 3-D.
-delta: A float delta to add to the hue.
-output: The hue-adjusted image or images.
-)Doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("AdjustSaturation")
@@ -637,21 +376,7 @@ REGISTER_OP("AdjustSaturation")
     .Output("output: float")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 3);
-    })
-    .Doc(R"Doc(
-Adjust the saturation of one or more images.
-
-`images` is a tensor of at least 3 dimensions.  The last dimension is
-interpretted as channels, and must be three.
-
-The input image is considered in the RGB colorspace. Conceptually, the RGB
-colors are first mapped into HSV. A scale is then applied all the saturation
-values, and then remapped back to RGB colorspace.
-
-images: Images to adjust.  At least 3-D.
-scale: A float scale to add to the saturation.
-output: The hue-adjusted image or images.
-)Doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("DecodePng")
@@ -659,30 +384,7 @@ REGISTER_OP("DecodePng")
     .Attr("channels: int = 0")
     .Attr("dtype: {uint8, uint16} = DT_UINT8")
     .Output("image: dtype")
-    .SetShapeFn(DecodeImageShapeFn)
-    .Doc(R"doc(
-Decode a PNG-encoded image to a uint8 or uint16 tensor.
-
-The attr `channels` indicates the desired number of color channels for the
-decoded image.
-
-Accepted values are:
-
-*   0: Use the number of channels in the PNG-encoded image.
-*   1: output a grayscale image.
-*   3: output an RGB image.
-*   4: output an RGBA image.
-
-If needed, the PNG-encoded image is transformed to match the requested number
-of color channels.
-
-This op also supports decoding JPEGs and non-animated GIFs since the interface
-is the same, though it is cleaner to use `tf.image.decode_image`.
-
-contents: 0-D.  The PNG-encoded image.
-channels: Number of color channels for the decoded image.
-image: 3-D with shape `[height, width, channels]`.
-)doc");
+    .SetShapeFn(DecodeImageShapeFn);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("EncodePng")
@@ -690,48 +392,14 @@ REGISTER_OP("EncodePng")
     .Attr("T: {uint8, uint16} = DT_UINT8")
     .Input("image: T")
     .Output("contents: string")
-    .SetShapeFn(EncodeImageShapeFn)
-    .Doc(R"doc(
-PNG-encode an image.
-
-`image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
-where `channels` is:
-
-*   1: for grayscale.
-*   2: for grayscale + alpha.
-*   3: for RGB.
-*   4: for RGBA.
-
-The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
-default or a value from 0 to 9.  9 is the highest compression level, generating
-the smallest output, but is slower.
-
-image: 3-D with shape `[height, width, channels]`.
-compression: Compression level.
-contents: 0-D. PNG-encoded image.
-)doc");
+    .SetShapeFn(EncodeImageShapeFn);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("DecodeBmp")
     .Input("contents: string")
     .Output("image: uint8")
     .Attr("channels: int = 0")
-    .SetShapeFn(DecodeImageShapeFn)
-    .Doc(R"doc(
-Decode the first frame of a BMP-encoded image to a uint8 tensor.
-
-The attr `channels` indicates the desired number of color channels for the
-decoded image.
-
-Accepted values are:
-
-*   0: Use the number of channels in the BMP-encoded image.
-*   3: output an RGB image.
-*   4: output an RGBA image.
-
-contents: 0-D.  The BMP-encoded image.
-image: 3-D with shape `[height, width, channels]`. RGB order
-)doc");
+    .SetShapeFn(DecodeImageShapeFn);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("DecodeGif")
@@ -744,61 +412,21 @@ REGISTER_OP("DecodeGif")
                                      InferenceContext::kUnknownDim,
                                      InferenceContext::kUnknownDim, 3}));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Decode the first frame of a GIF-encoded image to a uint8 tensor.
-
-GIF with frame or transparency compression are not supported
-convert animated GIF from compressed to uncompressed by:
-
-    convert $src.gif -coalesce $dst.gif
-
-This op also supports decoding JPEGs and PNGs, though it is cleaner to use
-`tf.image.decode_image`.
-
-contents: 0-D.  The GIF-encoded image.
-image: 4-D with shape `[num_frames, height, width, 3]`. RGB order
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("RGBToHSV")
     .Input("images: T")
     .Output("output: T")
-    .Attr("T: {float, double} = DT_FLOAT")
-    .SetShapeFn(ColorspaceShapeFn)
-    .Doc(R"doc(
-Converts one or more images from RGB to HSV.
-
-Outputs a tensor of the same shape as the `images` tensor, containing the HSV
-value of the pixels. The output is only well defined if the value in `images`
-are in `[0,1]`.
-
-`output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
-`output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
-corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
-
-images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
-output: `images` converted to HSV.
-)doc");
+    .Attr("T: {half, bfloat16, float, double} = DT_FLOAT")
+    .SetShapeFn(ColorspaceShapeFn);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("HSVToRGB")
     .Input("images: T")
     .Output("output: T")
-    .Attr("T: {float, double} = DT_FLOAT")
-    .SetShapeFn(ColorspaceShapeFn)
-    .Doc(R"doc(
-Convert one or more images from HSV to RGB.
-
-Outputs a tensor of the same shape as the `images` tensor, containing the RGB
-value of the pixels. The output is only well defined if the value in `images`
-are in `[0,1]`.
-
-See `rgb_to_hsv` for a description of the HSV encoding.
-
-images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
-output: `images` converted to RGB.
-)doc");
+    .Attr("T: {half, bfloat16, float, double} = DT_FLOAT")
+    .SetShapeFn(ColorspaceShapeFn);
 
 // --------------------------------------------------------------------------
 REGISTER_OP("DrawBoundingBoxes")
@@ -808,28 +436,7 @@ REGISTER_OP("DrawBoundingBoxes")
     .Attr("T: {float, half} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 3);
-    })
-    .Doc(R"doc(
-Draw bounding boxes on a batch of images.
-
-Outputs a copy of `images` but draws on top of the pixels zero or more bounding
-boxes specified by the locations in `boxes`. The coordinates of the each
-bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
-bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-height of the underlying image.
-
-For example, if an image is 100 x 200 pixels (height x width) and the bounding
-box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
-the bounding box will be `(40, 10)` to `(100, 50)` (in (x,y) coordinates).
-
-Parts of the bounding box may fall outside the image.
-
-images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
-boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
-  boxes.
-output: 4-D with the same shape as `images`. The batch of input images with
-  bounding boxes drawn on the images.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("SampleDistortedBoundingBox")
@@ -848,81 +455,22 @@ REGISTER_OP("SampleDistortedBoundingBox")
     .Attr("use_image_if_no_bounding_boxes: bool = false")
     .SetIsStateful()
     .SetShapeFn([](InferenceContext* c) {
+      // Get inputs and validate ranks.
+      ShapeHandle image_size;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &image_size));
+      ShapeHandle bounding_boxes;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 3, &bounding_boxes));
+      // image_size: 1-D with [height, width, channels]
+      // bounding_boxes: 3-D with shape [batch, N, 4]
+      DimensionHandle unused;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(image_size, 0), 3, &unused));
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(bounding_boxes, 2), 4, &unused));
+
       c->set_output(0, c->Vector(3));
       c->set_output(1, c->Vector(3));
       c->set_output(2, c->MakeShape({1, 1, 4}));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Generate a single randomly distorted bounding box for an image.
-
-Bounding box annotations are often supplied in addition to ground-truth labels
-in image recognition or object localization tasks. A common technique for
-training such a system is to randomly distort an image while preserving
-its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-localization of an object, i.e. bounding box, given an `image_size`,
-`bounding_boxes` and a series of constraints.
-
-The output of this Op is a single bounding box that may be used to crop the
-original image. The output is returned as 3 tensors: `begin`, `size` and
-`bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-what the bounding box looks like.
-
-Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-height of the underlying image.
-
-For example,
-
-```python
-    # Generate a single distorted bounding box.
-    begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-        tf.shape(image),
-        bounding_boxes=bounding_boxes)
-
-    # Draw the bounding box in an image summary.
-    image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-                                                  bbox_for_draw)
-    tf.image_summary('images_with_box', image_with_box)
-
-    # Employ the bounding box to distort the image.
-    distorted_image = tf.slice(image, begin, size)
-```
-
-Note that if no bounding box information is available, setting
-`use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-false and no bounding boxes are supplied, an error is raised.
-
-image_size: 1-D, containing `[height, width, channels]`.
-bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-  associated with the image.
-begin: 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-  `tf.slice`.
-size: 1-D, containing `[target_height, target_width, -1]`. Provide as input to
-  `tf.slice`.
-bboxes: 3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-  Provide as input to `tf.image.draw_bounding_boxes`.
-seed: If either `seed` or `seed2` are set to non-zero, the random number
-  generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-  seed.
-seed2: A second seed to avoid seed collision.
-min_object_covered: The cropped area of the image must contain at least this
-  fraction of any bounding box supplied. The value of this parameter should be
-  non-negative. In the case of 0, the cropped area does not need to overlap
-  any of the bounding boxes supplied.
-aspect_ratio_range: The cropped area of the image must have an aspect ratio =
-  width / height within this range.
-area_range: The cropped area of the image must contain a fraction of the
-  supplied image within in this range.
-max_attempts: Number of attempts at generating a cropped region of the image
-  of the specified constraints. After `max_attempts` failures, return the entire
-  image.
-use_image_if_no_bounding_boxes: Controls behavior if no bounding boxes supplied.
-  If true, assume an implicit bounding box covering the whole input. If false,
-  raise an error.
-)doc");
+    });
 
 REGISTER_OP("SampleDistortedBoundingBoxV2")
     .Input("image_size: T")
@@ -940,81 +488,24 @@ REGISTER_OP("SampleDistortedBoundingBoxV2")
     .Attr("use_image_if_no_bounding_boxes: bool = false")
     .SetIsStateful()
     .SetShapeFn([](InferenceContext* c) {
+      // Get inputs and validate ranks.
+      ShapeHandle image_size;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &image_size));
+      ShapeHandle bounding_boxes;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 3, &bounding_boxes));
+      ShapeHandle min_object_covered;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &min_object_covered));
+      // image_size: 1-D with [height, width, channels]
+      // bounding_boxes: 3-D with shape [batch, N, 4]
+      DimensionHandle unused;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(image_size, 0), 3, &unused));
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(bounding_boxes, 2), 4, &unused));
+
       c->set_output(0, c->Vector(3));
       c->set_output(1, c->Vector(3));
       c->set_output(2, c->MakeShape({1, 1, 4}));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Generate a single randomly distorted bounding box for an image.
-
-Bounding box annotations are often supplied in addition to ground-truth labels
-in image recognition or object localization tasks. A common technique for
-training such a system is to randomly distort an image while preserving
-its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-localization of an object, i.e. bounding box, given an `image_size`,
-`bounding_boxes` and a series of constraints.
-
-The output of this Op is a single bounding box that may be used to crop the
-original image. The output is returned as 3 tensors: `begin`, `size` and
-`bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-what the bounding box looks like.
-
-Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-height of the underlying image.
-
-For example,
-
-```python
-    # Generate a single distorted bounding box.
-    begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-        tf.shape(image),
-        bounding_boxes=bounding_boxes)
-
-    # Draw the bounding box in an image summary.
-    image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-                                                  bbox_for_draw)
-    tf.image_summary('images_with_box', image_with_box)
-
-    # Employ the bounding box to distort the image.
-    distorted_image = tf.slice(image, begin, size)
-```
-
-Note that if no bounding box information is available, setting
-`use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-false and no bounding boxes are supplied, an error is raised.
-
-image_size: 1-D, containing `[height, width, channels]`.
-bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-  associated with the image.
-min_object_covered: The cropped area of the image must contain at least this
-  fraction of any bounding box supplied. The value of this parameter should be
-  non-negative. In the case of 0, the cropped area does not need to overlap
-  any of the bounding boxes supplied.
-begin: 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-  `tf.slice`.
-size: 1-D, containing `[target_height, target_width, -1]`. Provide as input to
-  `tf.slice`.
-bboxes: 3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-  Provide as input to `tf.image.draw_bounding_boxes`.
-seed: If either `seed` or `seed2` are set to non-zero, the random number
-  generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-  seed.
-seed2: A second seed to avoid seed collision.
-aspect_ratio_range: The cropped area of the image must have an aspect ratio =
-  width / height within this range.
-area_range: The cropped area of the image must contain a fraction of the
-  supplied image within in this range.
-max_attempts: Number of attempts at generating a cropped region of the image
-  of the specified constraints. After `max_attempts` failures, return the entire
-  image.
-use_image_if_no_bounding_boxes: Controls behavior if no bounding boxes supplied.
-  If true, assume an implicit bounding box covering the whole input. If false,
-  raise an error.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -1046,48 +537,7 @@ REGISTER_OP("ExtractGlimpse")
 
       return SetOutputToSizedImage(c, batch_dim, 1 /* size_input_idx */,
                                    c->Dim(input, 3));
-    })
-    .Doc(R"doc(
-Extracts a glimpse from the input tensor.
-
-Returns a set of windows called glimpses extracted at location
-`offsets` from the input tensor. If the windows only partially
-overlaps the inputs, the non overlapping areas will be filled with
-random noise.
-
-The result is a 4-D tensor of shape `[batch_size, glimpse_height,
-glimpse_width, channels]`. The channels and batch dimensions are the
-same as that of the input tensor. The height and width of the output
-windows are specified in the `size` parameter.
-
-The argument `normalized` and `centered` controls how the windows are built:
-
-* If the coordinates are normalized but not centered, 0.0 and 1.0
-  correspond to the minimum and maximum of each height and width
-  dimension.
-* If the coordinates are both normalized and centered, they range from
-  -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
-  left corner, the lower right corner is located at (1.0, 1.0) and the
-  center is at (0, 0).
-* If the coordinates are not normalized they are interpreted as
-  numbers of pixels.
-
-input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
-size: A 1-D tensor of 2 elements containing the size of the glimpses
-  to extract.  The glimpse height must be specified first, following
-  by the glimpse width.
-offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
-  the y, x locations of the center of each window.
-glimpse: A tensor representing the glimpses `[batch_size,
-  glimpse_height, glimpse_width, channels]`.
-centered: indicates if the offset coordinates are centered relative to
-  the image, in which case the (0, 0) offset is relative to the center
-  of the input images. If false, the (0,0) offset corresponds to the
-  upper left corner of the input images.
-normalized: indicates if the offset coordinates are normalized.
-uniform_noise: indicates if the noise should be generated using a
-  uniform distribution or a Gaussian distribution.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -1120,44 +570,7 @@ REGISTER_OP("CropAndResize")
 
       return SetOutputToSizedImage(c, num_boxes_dim, 3 /* size_input_idx */,
                                    c->Dim(input, 3));
-    })
-    .Doc(R"doc(
-Extracts crops from the input image tensor and bilinearly resizes them (possibly
-with aspect ratio change) to a common output size specified by `crop_size`. This
-is more general than the `crop_to_bounding_box` op which extracts a fixed size
-slice from the input image and does not allow resizing or aspect ratio change.
-
-Returns a tensor with `crops` from the input `image` at positions defined at the
-bounding box locations in `boxes`. The cropped boxes are all resized (with
-bilinear interpolation) to a fixed `size = [crop_height, crop_width]`. The
-result is a 4-D tensor `[num_boxes, crop_height, crop_width, depth]`. The
-resizing is corner aligned. In particular, if `boxes = [[0, 0, 1, 1]]`, the
-method will give identical results to using `tf.image.resize_bilinear()`
-with `align_corners=True`.
-
-image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-  Both `image_height` and `image_width` need to be positive.
-boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-  specifies the coordinates of a box in the `box_ind[i]` image and is specified
-  in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-  `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-  `[0, 1]` interval of normalized image height is mapped to
-  `[0, image_height - 1]` in image height coordinates. We do allow `y1` > `y2`, in
-  which case the sampled crop is an up-down flipped version of the original
-  image. The width dimension is treated similarly. Normalized coordinates
-  outside the `[0, 1]` range are allowed, in which case we use
-  `extrapolation_value` to extrapolate the input image values.
-box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-  The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
-  cropped image patches are resized to this size. The aspect ratio of the image
-  content is not preserved. Both `crop_height` and `crop_width` need to be
-  positive.
-crops: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-method: A string specifying the interpolation method. Only 'bilinear' is
-  supported for now.
-extrapolation_value: Value used for extrapolation, when applicable.
-)doc");
+    });
 
 REGISTER_OP("CropAndResizeGradImage")
     .Input("grads: float")
@@ -1173,30 +586,7 @@ REGISTER_OP("CropAndResizeGradImage")
       TF_RETURN_IF_ERROR(c->WithRank(out, 4, &out));
       c->set_output(0, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the gradient of the crop_and_resize op wrt the input image tensor.
-
-grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-  specifies the coordinates of a box in the `box_ind[i]` image and is specified
-  in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-  `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-  `[0, 1]` interval of normalized image height is mapped to
-  `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-  which case the sampled crop is an up-down flipped version of the original
-  image. The width dimension is treated similarly. Normalized coordinates
-  outside the `[0, 1]` range are allowed, in which case we use
-  `extrapolation_value` to extrapolate the input image values.
-box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-  The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
-  containing the original image size. Both `image_height` and `image_width` need
-  to be positive.
-output: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-method: A string specifying the interpolation method. Only 'bilinear' is
-  supported for now.
-)doc");
+    });
 
 REGISTER_OP("CropAndResizeGradBoxes")
     .Input("grads: float")
@@ -1209,29 +599,7 @@ REGISTER_OP("CropAndResizeGradBoxes")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->input(2));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the gradient of the crop_and_resize op wrt the input boxes tensor.
-
-grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-  Both `image_height` and `image_width` need to be positive.
-boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-  specifies the coordinates of a box in the `box_ind[i]` image and is specified
-  in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-  `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-  `[0, 1]` interval of normalized image height is mapped to
-  `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-  which case the sampled crop is an up-down flipped version of the original
-  image. The width dimension is treated similarly. Normalized coordinates
-  outside the `[0, 1]` range are allowed, in which case we use
-  `extrapolation_value` to extrapolate the input image values.
-box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-  The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-output: A 2-D tensor of shape `[num_boxes, 4]`.
-method: A string specifying the interpolation method. Only 'bilinear' is
-  supported for now.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -1242,37 +610,24 @@ REGISTER_OP("NonMaxSuppression")
     .Output("selected_indices: int32")
     .Attr("iou_threshold: float = 0.5")
     .SetShapeFn([](InferenceContext* c) {
+      // Get inputs and validate ranks.
+      ShapeHandle boxes;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &boxes));
+      ShapeHandle scores;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &scores));
+      ShapeHandle max_output_size;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size));
+      // The boxes is a 2-D float Tensor of shape [num_boxes, 4].
+      DimensionHandle unused;
+      // The boxes[0] and scores[0] are both num_boxes.
+      TF_RETURN_IF_ERROR(
+          c->Merge(c->Dim(boxes, 0), c->Dim(scores, 0), &unused));
+      // The boxes[1] is 4.
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused));
+
       c->set_output(0, c->Vector(c->UnknownDim()));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Greedily selects a subset of bounding boxes in descending order of score,
-pruning away boxes that have high intersection-over-union (IOU) overlap
-with previously selected boxes.  Bounding boxes are supplied as
-[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-diagonal pair of box corners and the coordinates can be provided as normalized
-(i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-is agnostic to where the origin is in the coordinate system.  Note that this
-algorithm is invariant to orthogonal transformations and translations
-of the coordinate system; thus translating or reflections of the coordinate
-system result in the same boxes being selected by the algorithm.
-The output of this operation is a set of integers indexing into the input
-collection of bounding boxes representing the selected boxes.  The bounding
-box coordinates corresponding to the selected indices can then be obtained
-using the `tf.gather operation`.  For example:
-  selected_indices = tf.image.non_max_suppression(
-      boxes, scores, max_output_size, iou_threshold)
-  selected_boxes = tf.gather(boxes, selected_indices)
-boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-  score corresponding to each box (each row of boxes).
-max_output_size: A scalar integer tensor representing the maximum number of
-  boxes to be selected by non max suppression.
-iou_threshold: A float representing the threshold for deciding whether boxes
-  overlap too much with respect to IOU.
-selected_indices: A 1-D integer tensor of shape `[M]` representing the selected
-  indices from the boxes tensor, where `M <= max_output_size`.
-)doc");
+    });
 
 REGISTER_OP("NonMaxSuppressionV2")
     .Input("boxes: float")
@@ -1281,39 +636,25 @@ REGISTER_OP("NonMaxSuppressionV2")
     .Input("iou_threshold: float")
     .Output("selected_indices: int32")
     .SetShapeFn([](InferenceContext* c) {
+      // Get inputs and validate ranks.
+      ShapeHandle boxes;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &boxes));
+      ShapeHandle scores;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &scores));
+      ShapeHandle max_output_size;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size));
+      ShapeHandle iou_threshold;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &iou_threshold));
+      // The boxes is a 2-D float Tensor of shape [num_boxes, 4].
+      DimensionHandle unused;
+      // The boxes[0] and scores[0] are both num_boxes.
+      TF_RETURN_IF_ERROR(
+          c->Merge(c->Dim(boxes, 0), c->Dim(scores, 0), &unused));
+      // The boxes[1] is 4.
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 1), 4, &unused));
+
       c->set_output(0, c->Vector(c->UnknownDim()));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Greedily selects a subset of bounding boxes in descending order of score,
-pruning away boxes that have high intersection-over-union (IOU) overlap
-with previously selected boxes.  Bounding boxes are supplied as
-[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-diagonal pair of box corners and the coordinates can be provided as normalized
-(i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-is agnostic to where the origin is in the coordinate system.  Note that this
-algorithm is invariant to orthogonal transformations and translations
-of the coordinate system; thus translating or reflections of the coordinate
-system result in the same boxes being selected by the algorithm.
-
-The output of this operation is a set of integers indexing into the input
-collection of bounding boxes representing the selected boxes.  The bounding
-box coordinates corresponding to the selected indices can then be obtained
-using the `tf.gather operation`.  For example:
-
-  selected_indices = tf.image.non_max_suppression_v2(
-      boxes, scores, max_output_size, iou_threshold)
-  selected_boxes = tf.gather(boxes, selected_indices)
-
-boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-  score corresponding to each box (each row of boxes).
-max_output_size: A scalar integer tensor representing the maximum number of
-  boxes to be selected by non max suppression.
-iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-  boxes overlap too much with respect to IOU.
-selected_indices: A 1-D integer tensor of shape `[M]` representing the selected
-  indices from the boxes tensor, where `M <= max_output_size`.
-)doc");
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/io_ops.cc b/tensorflow/core/ops/io_ops.cc
index 082d18c1d5fef539a144cea5285cdb90d661f62f..7db4d0c4b667ebb11aa95142b88e615248926562 100644
--- a/tensorflow/core/ops/io_ops.cc
+++ b/tensorflow/core/ops/io_ops.cc
@@ -81,21 +81,7 @@ REGISTER_OP("SaveV2")
       // TODO(mrry): Attempt to parse the shapes_and_slices values and use
       // them to constrain the shape of the remaining inputs.
       return Status::OK();
-    })
-    .Doc(R"doc(
-Saves tensors in V2 checkpoint format.
-
-By default, saves the named tensors in full.  If the caller wishes to save
-specific slices of full tensors, "shape_and_slices" should be non-empty strings
-and correspondingly well-formed.
-
-prefix: Must have a single element. The prefix of the V2 checkpoint to which we
-  write the tensors.
-tensor_names: shape {N}. The names of the tensors to be saved.
-shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
-  Empty strings indicate that they are non-partitioned tensors.
-tensors: `N` tensors to save.
-)doc");
+    });
 
 REGISTER_OP("RestoreV2")
     .Input("prefix: string")
@@ -141,33 +127,7 @@ REGISTER_OP("RestoreV2")
       } else {
         return UnknownShape(c);
       }
-    })
-    .Doc(R"doc(
-Restores tensors from a V2 checkpoint.
-
-For backward compatibility with the V1 format, this Op currently allows
-restoring from a V1 checkpoint as well:
-  - This Op first attempts to find the V2 index file pointed to by "prefix", and
-    if found proceed to read it as a V2 checkpoint;
-  - Otherwise the V1 read path is invoked.
-Relying on this behavior is not recommended, as the ability to fall back to read
-V1 might be deprecated and eventually removed.
-
-By default, restores the named tensors in full.  If the caller wishes to restore
-specific slices of stored tensors, "shape_and_slices" should be non-empty
-strings and correspondingly well-formed.
-
-Callers must ensure all the named tensors are indeed stored in the checkpoint.
-
-prefix: Must have a single element.  The prefix of a V2 checkpoint.
-tensor_names: shape {N}.  The names of the tensors to be restored.
-shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
-  Empty strings indicate that they are non-partitioned tensors.
-dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
-  those stored in the checkpoint.
-tensors: shape {N}.  The restored tensors, whose shapes are read from the
-  checkpoint directly.
-)doc");
+    });
 
 REGISTER_OP("MergeV2Checkpoints")
     .Input("checkpoint_prefixes: string")
@@ -179,23 +139,7 @@ REGISTER_OP("MergeV2Checkpoints")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
       return Status::OK();
-    })
-    .Doc(R"doc(
-V2 format specific: merges the metadata files of sharded checkpoints.  The
-result is one logical checkpoint, with one physical metadata file and renamed
-data files.
-
-Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
-
-If delete_old_dirs is true, attempts to delete recursively the dirname of each
-path in the input checkpoint_prefixes.  This is useful when those paths are non
-user-facing temporary locations.
-
-checkpoint_prefixes: prefixes of V2 checkpoints to merge.
-destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
-  as one of the checkpoint_prefixes.
-delete_old_dirs: see above.
-)doc");
+    });
 
 REGISTER_OP("Save")
     .Input("filename: string")
@@ -217,20 +161,7 @@ REGISTER_OP("Save")
           c->WithValue(c->Dim(s, 0), c->num_inputs() - 2, &unused_dim));
 
       return Status::OK();
-    })
-    .Doc(R"doc(
-Saves the input tensors to disk.
-
-The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
-is written to `filename` with name `tensor_names[i]`.
-
-See also `SaveSlices`.
-
-filename: Must have a single element. The name of the file to which we write
-  the tensor.
-tensor_names: Shape `[N]`. The names of the tensors to be saved.
-data: `N` tensors to save.
-)doc");
+    });
 
 REGISTER_OP("SaveSlices")
     .Input("filename: string")
@@ -256,39 +187,7 @@ REGISTER_OP("SaveSlices")
       // TODO(mrry): Attempt to parse the shapes_and_slices values and use
       // them to constrain the shape of the remaining inputs.
       return Status::OK();
-    })
-    .Doc(R"doc(
-Saves input tensors slices to disk.
-
-This is like `Save` except that tensors can be listed in the saved file as being
-a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
-larger tensor and the slice that this tensor covers. `shapes_and_slices` must
-have as many elements as `tensor_names`.
-
-Elements of the `shapes_and_slices` input must either be:
-
-*  The empty string, in which case the corresponding tensor is
-   saved normally.
-*  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
-   `dimI` are the dimensions of the larger tensor and `slice-spec`
-   specifies what part is covered by the tensor to save.
-
-`slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
-where each `sliceI` is either:
-
-*  The string `-` meaning that the slice covers all indices of this dimension
-*  `start,length` where `start` and `length` are integers.  In that
-   case the slice covers `length` indices starting at `start`.
-
-See also `Save`.
-
-filename: Must have a single element. The name of the file to which we write the
-  tensor.
-tensor_names: Shape `[N]`. The names of the tensors to be saved.
-shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
-  saving the tensors.
-data: `N` tensors to save.
-)doc");
+    });
 
 REGISTER_OP("Restore")
     .Input("file_pattern: string")
@@ -303,36 +202,7 @@ REGISTER_OP("Restore")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
       c->set_output(0, c->UnknownShape());
       return Status::OK();
-    })
-    .Doc(R"doc(
-Restores a tensor from checkpoint files.
-
-Reads a tensor stored in one or several files. If there are several files (for
-instance because a tensor was saved as slices), `file_pattern` may contain
-wildcard symbols (`*` and `?`) in the filename portion only, not in the
-directory portion.
-
-If a `file_pattern` matches several files, `preferred_shard` can be used to hint
-in which file the requested tensor is likely to be found. This op will first
-open the file at index `preferred_shard` in the list of matching files and try
-to restore tensors from that file.  Only if some tensors or tensor slices are
-not found in that first file, then the Op opens all the files. Setting
-`preferred_shard` to match the value passed as the `shard` input
-of a matching `Save` Op may speed up Restore.  This attribute only affects
-performance, not correctness.  The default value -1 means files are processed in
-order.
-
-See also `RestoreSlice`.
-
-file_pattern: Must have a single element. The pattern of the files from
-  which we read the tensor.
-tensor_name: Must have a single element. The name of the tensor to be
-  restored.
-tensor: The restored tensor.
-dt: The type of the tensor to be restored.
-preferred_shard: Index of file to open first if multiple files match
-  `file_pattern`.
-)doc");
+    });
 
 REGISTER_OP("RestoreSlice")
     .Input("file_pattern: string")
@@ -371,48 +241,20 @@ REGISTER_OP("RestoreSlice")
         c->set_output(0, c->UnknownShape());
       }
       return Status::OK();
-    })
-    .Doc(R"doc(
-Restores a tensor from checkpoint files.
-
-This is like `Restore` except that restored tensor can be listed as filling
-only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
-larger tensor and the slice that the restored tensor covers.
-
-The `shape_and_slice` input has the same format as the
-elements of the `shapes_and_slices` input of the `SaveSlices` op.
-
-file_pattern: Must have a single element. The pattern of the files from
-  which we read the tensor.
-tensor_name: Must have a single element. The name of the tensor to be
-  restored.
-shape_and_slice: Scalar. The shapes and slice specifications to use when
-  restoring a tensors.
-tensor: The restored tensor.
-dt: The type of the tensor to be restored.
-preferred_shard: Index of file to open first if multiple files match
-  `file_pattern`. See the documentation for `Restore`.
-)doc");
+    });
 
 REGISTER_OP("ShardedFilename")
     .Input("basename: string")
     .Input("shard: int32")
     .Input("num_shards: int32")
     .Output("filename: string")
-    .SetShapeFn(ScalarInputsAndOutputs)
-    .Doc(R"doc(
-Generate a sharded filename. The filename is printf formatted as
-   %s-%05d-of-%05d, basename, shard, num_shards.
-)doc");
+    .SetShapeFn(ScalarInputsAndOutputs);
 
 REGISTER_OP("ShardedFilespec")
     .Input("basename: string")
     .Input("num_shards: int32")
     .Output("filename: string")
-    .SetShapeFn(ScalarInputsAndOutputs)
-    .Doc(R"doc(
-Generate a glob pattern matching all sharded file names.
-)doc");
+    .SetShapeFn(ScalarInputsAndOutputs);
 
 // Reader source ops ----------------------------------------------------------
 
@@ -421,40 +263,15 @@ REGISTER_OP("WholeFileReader")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
-    .SetShapeFn(TwoElementOutput)
-    .Doc(R"doc(
-A Reader that outputs the entire contents of a file as a value.
-
-To use, enqueue filenames in a Queue.  The output of ReaderRead will
-be a filename (key) and the contents of that file (value).
-
-reader_handle: The handle to reference the Reader.
-container: If non-empty, this reader is placed in the given container.
-        Otherwise, a default container is used.
-shared_name: If non-empty, this reader is named in the given bucket
-             with this shared_name. Otherwise, the node name is used instead.
-)doc");
+    .SetShapeFn(TwoElementOutput);
 
 REGISTER_OP("WholeFileReaderV2")
     .Output("reader_handle: resource")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-A Reader that outputs the entire contents of a file as a value.
-
-To use, enqueue filenames in a Queue.  The output of ReaderRead will
-be a filename (key) and the contents of that file (value).
+    .SetShapeFn(shape_inference::ScalarShape);
 
-reader_handle: The handle to reference the Reader.
-container: If non-empty, this reader is placed in the given container.
-        Otherwise, a default container is used.
-shared_name: If non-empty, this reader is named in the given bucket
-             with this shared_name. Otherwise, the node name is used instead.
-)doc");
-
-// TODO(cwhipkey): mark this deprecated in favor of V2.
 REGISTER_OP("TextLineReader")
     .Output("reader_handle: Ref(string)")
     .Attr("skip_header_lines: int = 0")
@@ -462,16 +279,7 @@ REGISTER_OP("TextLineReader")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
     .SetShapeFn(TwoElementOutput)
-    .Doc(R"doc(
-A Reader that outputs the lines of a file delimited by '\n'.
-
-reader_handle: The handle to reference the Reader.
-skip_header_lines: Number of lines to skip from the beginning of every file.
-container: If non-empty, this reader is placed in the given container.
-        Otherwise, a default container is used.
-shared_name: If non-empty, this reader is named in the given bucket
-             with this shared_name. Otherwise, the node name is used instead.
-)doc");
+    .Deprecated(26, "Use TextLineReaderV2");
 
 REGISTER_OP("TextLineReaderV2")
     .Output("reader_handle: resource")
@@ -479,19 +287,8 @@ REGISTER_OP("TextLineReaderV2")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-A Reader that outputs the lines of a file delimited by '\n'.
-
-reader_handle: The handle to reference the Reader.
-skip_header_lines: Number of lines to skip from the beginning of every file.
-container: If non-empty, this reader is placed in the given container.
-        Otherwise, a default container is used.
-shared_name: If non-empty, this reader is named in the given bucket
-             with this shared_name. Otherwise, the node name is used instead.
-)doc");
-
-// TODO(cwhipkey): mark this deprecated in favor of V2.
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("FixedLengthRecordReader")
     .Output("reader_handle: Ref(string)")
     .Attr("header_bytes: int = 0")
@@ -502,20 +299,7 @@ REGISTER_OP("FixedLengthRecordReader")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
     .SetShapeFn(TwoElementOutput)
-    .Doc(R"doc(
-A Reader that outputs fixed-length records from a file.
-
-reader_handle: The handle to reference the Reader.
-header_bytes: Number of bytes in the header, defaults to 0.
-record_bytes: Number of bytes in the record.
-footer_bytes: Number of bytes in the footer, defaults to 0.
-hop_bytes: Number of bytes to hop before each read. Default of 0 means using
-        record_bytes.
-container: If non-empty, this reader is placed in the given container.
-        Otherwise, a default container is used.
-shared_name: If non-empty, this reader is named in the given bucket
-             with this shared_name. Otherwise, the node name is used instead.
-)doc");
+    .Deprecated(26, "Use FixedLengthRecordReaderV2");
 
 REGISTER_OP("FixedLengthRecordReaderV2")
     .Output("reader_handle: resource")
@@ -527,25 +311,8 @@ REGISTER_OP("FixedLengthRecordReaderV2")
     .Attr("shared_name: string = ''")
     .Attr("encoding: string = ''")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-A Reader that outputs fixed-length records from a file.
-
-reader_handle: The handle to reference the Reader.
-header_bytes: Number of bytes in the header, defaults to 0.
-record_bytes: Number of bytes in the record.
-footer_bytes: Number of bytes in the footer, defaults to 0.
-hop_bytes: Number of bytes to hop before each read. Default of 0 means using
-        record_bytes.
-container: If non-empty, this reader is placed in the given container.
-        Otherwise, a default container is used.
-shared_name: If non-empty, this reader is named in the given bucket
-             with this shared_name. Otherwise, the node name is used instead.
-encoding: The type of encoding for the file. Currently ZLIB and GZIP
-        are supported. Defaults to none.
-)doc");
-
-// TODO(cwhipkey): mark this deprecated in favor of V2.
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("TFRecordReader")
     .Output("reader_handle: Ref(string)")
     .Attr("container: string = ''")
@@ -553,15 +320,7 @@ REGISTER_OP("TFRecordReader")
     .Attr("compression_type: string = ''")
     .SetIsStateful()
     .SetShapeFn(TwoElementOutput)
-    .Doc(R"doc(
-A Reader that outputs the records from a TensorFlow Records file.
-
-reader_handle: The handle to reference the Reader.
-container: If non-empty, this reader is placed in the given container.
-        Otherwise, a default container is used.
-shared_name: If non-empty, this reader is named in the given bucket
-             with this shared_name. Otherwise, the node name is used instead.
-)doc");
+    .Deprecated(26, "Use TFRecordReaderV2");
 
 REGISTER_OP("TFRecordReaderV2")
     .Output("reader_handle: resource")
@@ -569,70 +328,29 @@ REGISTER_OP("TFRecordReaderV2")
     .Attr("shared_name: string = ''")
     .Attr("compression_type: string = ''")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-A Reader that outputs the records from a TensorFlow Records file.
-
-reader_handle: The handle to reference the Reader.
-container: If non-empty, this reader is placed in the given container.
-        Otherwise, a default container is used.
-shared_name: If non-empty, this reader is named in the given bucket
-             with this shared_name. Otherwise, the node name is used instead.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("LMDBReader")
     .Output("reader_handle: Ref(string)")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
-    .SetShapeFn(TwoElementOutput)
-    .Doc(R"doc(
-A Reader that outputs the records from a LMDB file.
-reader_handle: The handle to reference the Reader.
-container: If non-empty, this reader is placed in the given container.
-        Otherwise, a default container is used.
-shared_name: If non-empty, this reader is named in the given bucket
-             with this shared_name. Otherwise, the node name is used instead.
-)doc");
-
-// TODO(cwhipkey): mark this deprecated in favor of V2.
+    .SetShapeFn(TwoElementOutput);
+
 REGISTER_OP("IdentityReader")
     .Output("reader_handle: Ref(string)")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
     .SetShapeFn(TwoElementOutput)
-    .Doc(R"doc(
-A Reader that outputs the queued work as both the key and value.
-
-To use, enqueue strings in a Queue.  ReaderRead will take the front
-work string and output (work, work).
-
-reader_handle: The handle to reference the Reader.
-container: If non-empty, this reader is placed in the given container.
-        Otherwise, a default container is used.
-shared_name: If non-empty, this reader is named in the given bucket
-             with this shared_name. Otherwise, the node name is used instead.
-)doc");
+    .Deprecated(26, "Use IdentityReaderV2");
 
 REGISTER_OP("IdentityReaderV2")
     .Output("reader_handle: resource")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-A Reader that outputs the queued work as both the key and value.
-
-To use, enqueue strings in a Queue.  ReaderRead will take the front
-work string and output (work, work).
-
-reader_handle: The handle to reference the Reader.
-container: If non-empty, this reader is placed in the given container.
-        Otherwise, a default container is used.
-shared_name: If non-empty, this reader is named in the given bucket
-             with this shared_name. Otherwise, the node name is used instead.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 // Ops that operate on Readers ------------------------------------------------
 
@@ -641,38 +359,14 @@ REGISTER_OP("ReaderRead")
     .Input("queue_handle: Ref(string)")
     .Output("key: string")
     .Output("value: string")
-    .SetShapeFn(TwoElementVectorAndScalarOutputs)
-    .Doc(R"doc(
-Returns the next record (key, value pair) produced by a Reader.
-
-Will dequeue from the input queue if necessary (e.g. when the
-Reader needs to start reading from a new file since it has finished
-with the previous file).
-
-reader_handle: Handle to a Reader.
-queue_handle: Handle to a Queue, with string work items.
-key: A scalar.
-value: A scalar.
-)doc");
+    .SetShapeFn(TwoElementVectorAndScalarOutputs);
 
 REGISTER_OP("ReaderReadV2")
     .Input("reader_handle: resource")
     .Input("queue_handle: resource")
     .Output("key: string")
     .Output("value: string")
-    .SetShapeFn(ScalarInputsAndOutputs)
-    .Doc(R"doc(
-Returns the next record (key, value pair) produced by a Reader.
-
-Will dequeue from the input queue if necessary (e.g. when the
-Reader needs to start reading from a new file since it has finished
-with the previous file).
-
-reader_handle: Handle to a Reader.
-queue_handle: Handle to a Queue, with string work items.
-key: A scalar.
-value: A scalar.
-)doc");
+    .SetShapeFn(ScalarInputsAndOutputs);
 
 REGISTER_OP("ReaderReadUpTo")
     .Input("reader_handle: Ref(string)")
@@ -689,21 +383,7 @@ REGISTER_OP("ReaderReadUpTo")
       c->set_output(0, out);
       c->set_output(1, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Returns up to `num_records` (key, value) pairs produced by a Reader.
-
-Will dequeue from the input queue if necessary (e.g. when the
-Reader needs to start reading from a new file since it has finished
-with the previous file).
-It may return less than `num_records` even before the last batch.
-
-reader_handle: Handle to a `Reader`.
-queue_handle: Handle to a `Queue`, with string work items.
-num_records: number of records to read from `Reader`.
-keys: A 1-D tensor.
-values: A 1-D tensor.
-)doc");
+    });
 
 REGISTER_OP("ReaderReadUpToV2")
     .Input("reader_handle: resource")
@@ -720,93 +400,37 @@ REGISTER_OP("ReaderReadUpToV2")
       c->set_output(0, out);
       c->set_output(1, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Returns up to `num_records` (key, value) pairs produced by a Reader.
-
-Will dequeue from the input queue if necessary (e.g. when the
-Reader needs to start reading from a new file since it has finished
-with the previous file).
-It may return less than `num_records` even before the last batch.
-
-reader_handle: Handle to a `Reader`.
-queue_handle: Handle to a `Queue`, with string work items.
-num_records: number of records to read from `Reader`.
-keys: A 1-D tensor.
-values: A 1-D tensor.
-)doc");
+    });
 
 REGISTER_OP("ReaderNumRecordsProduced")
     .Input("reader_handle: Ref(string)")
     .Output("records_produced: int64")
-    .SetShapeFn(TwoElementVectorAndScalarOutputs)
-    .Doc(R"doc(
-Returns the number of records this Reader has produced.
-
-This is the same as the number of ReaderRead executions that have
-succeeded.
-
-reader_handle: Handle to a Reader.
-)doc");
+    .SetShapeFn(TwoElementVectorAndScalarOutputs);
 
 REGISTER_OP("ReaderNumRecordsProducedV2")
     .Input("reader_handle: resource")
     .Output("records_produced: int64")
-    .SetShapeFn(ScalarInputsAndOutputs)
-    .Doc(R"doc(
-Returns the number of records this Reader has produced.
-
-This is the same as the number of ReaderRead executions that have
-succeeded.
-
-reader_handle: Handle to a Reader.
-)doc");
+    .SetShapeFn(ScalarInputsAndOutputs);
 
 REGISTER_OP("ReaderNumWorkUnitsCompleted")
     .Input("reader_handle: Ref(string)")
     .Output("units_completed: int64")
-    .SetShapeFn(TwoElementVectorAndScalarOutputs)
-    .Doc(R"doc(
-Returns the number of work units this Reader has finished processing.
-
-reader_handle: Handle to a Reader.
-)doc");
+    .SetShapeFn(TwoElementVectorAndScalarOutputs);
 
 REGISTER_OP("ReaderNumWorkUnitsCompletedV2")
     .Input("reader_handle: resource")
     .Output("units_completed: int64")
-    .SetShapeFn(ScalarInputsAndOutputs)
-    .Doc(R"doc(
-Returns the number of work units this Reader has finished processing.
-
-reader_handle: Handle to a Reader.
-)doc");
+    .SetShapeFn(ScalarInputsAndOutputs);
 
 REGISTER_OP("ReaderSerializeState")
     .Input("reader_handle: Ref(string)")
     .Output("state: string")
-    .SetShapeFn(TwoElementVectorAndScalarOutputs)
-    .Doc(R"doc(
-Produce a string tensor that encodes the state of a Reader.
-
-Not all Readers support being serialized, so this can produce an
-Unimplemented error.
-
-reader_handle: Handle to a Reader.
-)doc");
+    .SetShapeFn(TwoElementVectorAndScalarOutputs);
 
 REGISTER_OP("ReaderSerializeStateV2")
     .Input("reader_handle: resource")
     .Output("state: string")
-    .SetShapeFn(ScalarInputsAndOutputs)
-    .Doc(R"doc(
-Produce a string tensor that encodes the state of a Reader.
-
-Not all Readers support being serialized, so this can produce an
-Unimplemented error.
-
-reader_handle: Handle to a Reader.
-)doc");
+    .SetShapeFn(ScalarInputsAndOutputs);
 
 REGISTER_OP("ReaderRestoreState")
     .Input("reader_handle: Ref(string)")
@@ -820,17 +444,7 @@ REGISTER_OP("ReaderRestoreState")
 
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Restore a reader to a previously saved state.
-
-Not all Readers support being restored, so this can produce an
-Unimplemented error.
-
-reader_handle: Handle to a Reader.
-state: Result of a ReaderSerializeState of a Reader with type
-  matching reader_handle.
-)doc");
+    });
 
 REGISTER_OP("ReaderRestoreStateV2")
     .Input("reader_handle: resource")
@@ -840,45 +454,22 @@ REGISTER_OP("ReaderRestoreStateV2")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Restore a reader to a previously saved state.
-
-Not all Readers support being restored, so this can produce an
-Unimplemented error.
-
-reader_handle: Handle to a Reader.
-state: Result of a ReaderSerializeState of a Reader with type
-  matching reader_handle.
-)doc");
+    });
 
 REGISTER_OP("ReaderReset")
     .Input("reader_handle: Ref(string)")
-    .SetShapeFn(TwoElementVectorAndScalarOutputs)
-    .Doc(R"doc(
-Restore a Reader to its initial clean state.
-
-reader_handle: Handle to a Reader.
-)doc");
+    .SetShapeFn(TwoElementVectorAndScalarOutputs);
 
 REGISTER_OP("ReaderResetV2")
     .Input("reader_handle: resource")
-    .SetShapeFn(ScalarInputsAndOutputs)
-    .Doc(R"doc(
-Restore a Reader to its initial clean state.
-
-reader_handle: Handle to a Reader.
-)doc");
+    .SetShapeFn(ScalarInputsAndOutputs);
 
 // Other input Ops ----------------------------------------------------------
 
 REGISTER_OP("ReadFile")
     .Input("filename: string")
     .Output("contents: string")
-    .SetShapeFn(ScalarInputsAndOutputs)
-    .Doc(R"doc(
-Reads and outputs the entire contents of the input filename.
-)doc");
+    .SetShapeFn(ScalarInputsAndOutputs);
 
 REGISTER_OP("WriteFile")
     .Input("filename: string")
@@ -888,14 +479,7 @@ REGISTER_OP("WriteFile")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Writes contents to the file at input filename. Creates file and recursively
-creates directory if not existing.
-
-filename: scalar. The name of the file to which we write the contents.
-contents: scalar. The content to be written to the output file.
-)doc");
+    });
 
 REGISTER_OP("MatchingFiles")
     .Input("pattern: string")
@@ -905,15 +489,6 @@ REGISTER_OP("MatchingFiles")
       TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused));
       c->set_output(0, c->Vector(InferenceContext::kUnknownDim));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Returns the set of files matching one or more glob patterns.
-
-Note that this routine only supports wildcard characters in the
-basename portion of the pattern, not in the directory portion.
-
-pattern: Shell wildcard pattern(s). Scalar or vector of type string.
-filenames: A vector of matching filenames.
-)doc");
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/linalg_ops.cc b/tensorflow/core/ops/linalg_ops.cc
index 53e2360d2321a21c658f5abb87bfbc78e2564f26..f37f79ddbf9614e9fcd128e8d23f71c0f354add2 100644
--- a/tensorflow/core/ops/linalg_ops.cc
+++ b/tensorflow/core/ops/linalg_ops.cc
@@ -202,17 +202,7 @@ REGISTER_OP("MatrixDeterminant")
       TF_RETURN_IF_ERROR(c->Subshape(input, 0, -2, &out));
       c->set_output(0, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the determinant of one or more square matrices.
-
-The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-form square matrices. The output is a tensor containing the determinants
-for all input submatrices `[..., :, :]`.
-
-input: Shape is `[..., M, M]`.
-output: Shape is `[...]`.
-)doc");
+    });
 
 REGISTER_OP("LogMatrixDeterminant")
     .Input("input: T")
@@ -235,126 +225,39 @@ REGISTER_OP("LogMatrixDeterminant")
       TF_RETURN_IF_ERROR(c->Subshape(input, 0, -2, &out));
       c->set_output(1, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the sign and the log of the absolute value of the determinant of
-one or more square matrices.
-
-The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
-form square matrices. The outputs are two tensors containing the signs and
-absolute values of the log determinants for all N input submatrices
-`[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
-The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
-is the LU decomposition of the input and P is the corresponding
-permutation matrix.
-
-input: Shape is `[N, M, M]`.
-sign: The signs of the log determinants of the inputs. Shape is `[N]`.
-log_abs_determinant: The logs of the absolute values of the determinants
-of the N input matrices.  Shape is `[N]`.
-)doc");
+    });
 
 REGISTER_OP("MatrixInverse")
     .Input("input: T")
     .Output("output: T")
     .Attr("adjoint: bool = False")
     .Attr("T: {double, float, complex64, complex128}")
-    .SetShapeFn(BatchUnchangedSquareShapeFn)
-    .Doc(R"doc(
-Computes the inverse of one or more square invertible matrices or their
-adjoints (conjugate transposes).
-
-The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-form square matrices. The output is a tensor of the same shape as the input
-containing the inverse for all input submatrices `[..., :, :]`.
-
-The op uses LU decomposition with partial pivoting to compute the inverses.
-
-If a matrix is not invertible there is no guarantee what the op does. It
-may detect the condition and raise an exception or it may simply return a
-garbage result.
-
-input: Shape is `[..., M, M]`.
-output: Shape is `[..., M, M]`.
-
-@compatibility(numpy)
-Equivalent to np.linalg.inv
-@end_compatibility
-)doc");
+    .SetShapeFn(BatchUnchangedSquareShapeFn);
 
 REGISTER_OP("MatrixExponential")
     .Input("input: T")
     .Output("output: T")
     .Attr("T: {double, float, complex64, complex128}")
-    .SetShapeFn(BatchUnchangedSquareShapeFn)
-    .Doc(R"doc(
-Computes the matrix exponential of one or more square matrices:
-
-exp(A) = \sum_{n=0}^\infty A^n/n!
-
-The exponential is computed using a combination of the scaling and squaring
-method and the Pade approximation. Details can be founds in:
-Nicholas J. Higham, "The scaling and squaring method for the matrix exponential
-revisited," SIAM J. Matrix Anal. Applic., 26:1179-1193, 2005.
-
-The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-form square matrices. The output is a tensor of the same shape as the input
-containing the exponential for all input submatrices `[..., :, :]`.
-
-input: Shape is `[..., M, M]`.
-output: Shape is `[..., M, M]`.
+    .SetShapeFn(BatchUnchangedSquareShapeFn);
 
-@compatibility(scipy)
-Equivalent to scipy.linalg.expm
-@end_compatibility
-)doc");
+REGISTER_OP("MatrixLogarithm")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: {complex64, complex128}")
+    .SetShapeFn(BatchUnchangedSquareShapeFn);
 
 REGISTER_OP("Cholesky")
     .Input("input: T")
     .Output("output: T")
     .Attr("T: {double, float, complex64, complex128}")
-    .SetShapeFn(BatchUnchangedSquareShapeFn)
-    .Doc(R"doc(
-Computes the Cholesky decomposition of one or more square matrices.
-
-The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-form square matrices.
-
-The input has to be symmetric and positive definite. Only the lower-triangular
-part of the input will be used for this operation. The upper-triangular part
-will not be read.
-
-The output is a tensor of the same shape as the input
-containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
-
-**Note**: The gradient computation on GPU is faster for large matrices but
-not for large batch dimensions when the submatrices are small. In this
-case it might be faster to use the CPU.
-
-input: Shape is `[..., M, M]`.
-output: Shape is `[..., M, M]`.
-)doc");
+    .SetShapeFn(BatchUnchangedSquareShapeFn);
 
 REGISTER_OP("CholeskyGrad")
     .Input("l: T")
     .Input("grad: T")
     .Output("output: T")
     .Attr("T: {float, double}")
-    .SetShapeFn(BatchUnchangedSquareShapeFn)
-    .Doc(R"doc(
-Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
-
-For an explanation see "Differentiation of the Cholesky algorithm" by
-Iain Murray http://arxiv.org/abs/1602.07527.
-
-l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
-  Algorithm depends only on lower triangular part of the innermost matrices of
-  this tensor.
-grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
-  Algorithm depends only on lower triangular part of the innermost matrices of
-  this tensor.
-output: Symmetrized version of df/dA . Shape is `[..., M, M]`
-)doc");
+    .SetShapeFn(BatchUnchangedSquareShapeFn);
 
 REGISTER_OP("SelfAdjointEig")
     .Input("input: T")
@@ -374,20 +277,7 @@ REGISTER_OP("SelfAdjointEig")
       TF_RETURN_IF_ERROR(c->Concatenate(s, c->Matrix(d_plus_1, d), &s));
       c->set_output(0, s);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
-
-The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-form square matrices, with the same constraints as the single matrix
-SelfAdjointEig.
-
-The result is a [..., M+1, M] matrix with [..., 0,:] containing the
-eigenvalues, and subsequent [...,1:, :] containing the eigenvectors.
-
-input: Shape is `[..., M, M]`.
-output: Shape is `[..., M+1, M]`.
-)doc");
+    });
 
 REGISTER_OP("SelfAdjointEigV2")
     .Input("input: T")
@@ -395,27 +285,7 @@ REGISTER_OP("SelfAdjointEigV2")
     .Output("v: T")
     .Attr("compute_v: bool = True")
     .Attr("T: {double, float, complex64, complex128}")
-    .SetShapeFn(SelfAdjointEigV2ShapeFn)
-    .Doc(R"doc(
-Computes the eigen decomposition of one or more square self-adjoint matrices.
-
-Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
-`input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`.
-
-```python
-# a is a tensor.
-# e is a tensor of eigenvalues.
-# v is a tensor of eigenvectors.
-e, v = self_adjoint_eig(a)
-e = self_adjoint_eig(a, compute_v=False)
-```
-
-input: `Tensor` input of shape `[N, N]`.
-compute_v: If `True` then eigenvectors will be computed and returned in `v`.
-  Otherwise, only the eigenvalues will be computed.
-e: Eigenvalues. Shape is `[N]`.
-v: Eigenvectors. Shape is `[N, N]`.
-)doc");
+    .SetShapeFn(SelfAdjointEigV2ShapeFn);
 
 REGISTER_OP("MatrixSolve")
     .Input("matrix: T")
@@ -425,23 +295,7 @@ REGISTER_OP("MatrixSolve")
     .Attr("T: {double, float, complex64, complex128}")
     .SetShapeFn([](InferenceContext* c) {
       return MatrixSolveShapeFn(c, true /* square (*/);
-    })
-    .Doc(R"doc(
-Solves systems of linear equations.
-
-`Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
-a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
-satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-If `adjoint` is `True` then each output matrix satisfies
-`adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
-
-matrix: Shape is `[..., M, M]`.
-rhs: Shape is `[..., M, K]`.
-output: Shape is `[..., M, K]`.
-adjoint: Boolean indicating whether to solve with `matrix` or its (block-wise)
-         adjoint.
-)doc");
+    });
 
 REGISTER_OP("MatrixTriangularSolve")
     .Input("matrix: T")
@@ -452,37 +306,7 @@ REGISTER_OP("MatrixTriangularSolve")
     .Attr("T: {double, float, complex64, complex128}")
     .SetShapeFn([](InferenceContext* c) {
       return MatrixSolveShapeFn(c, true /* square (*/);
-    })
-    .Doc(R"doc(
-Solves systems of linear equations with upper or lower triangular matrices by
-backsubstitution.
-
-`matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
-square matrices. If `lower` is `True` then the strictly upper triangular part
-of each inner-most matrix is assumed to be zero and not accessed.
-If `lower` is False then the strictly lower triangular part of each inner-most
-matrix is assumed to be zero and not accessed.
-`rhs` is a tensor of shape `[..., M, K]`.
-
-The output is a tensor of shape `[..., M, K]`. If `adjoint` is
-`True` then the innermost matrices in `output` satisfy matrix equations
-`matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-If `adjoint` is `False` then the strictly then the  innermost matrices in
-`output` satisfy matrix equations
-`adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
-
-matrix: Shape is `[..., M, M]`.
-rhs: Shape is `[..., M, K]`.
-output: Shape is `[..., M, K]`.
-lower: Boolean indicating whether the innermost matrices in `matrix` are
-       lower or upper triangular.
-adjoint: Boolean indicating whether to solve with `matrix` or its (block-wise)
-         adjoint.
-
-@compatibility(numpy)
-Equivalent to np.linalg.triangular_solve
-@end_compatibility
-)doc");
+    });
 
 REGISTER_OP("MatrixSolveLs")
     .Input("matrix: T")
@@ -495,54 +319,7 @@ REGISTER_OP("MatrixSolveLs")
       ShapeHandle l2_regularizer;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &l2_regularizer));
       return MatrixSolveShapeFn(c, false /* square */);
-    })
-    .Doc(R"doc(
-Solves one or more linear least-squares problems.
-
-`matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
-type as `matrix` and shape `[..., M, K]`.
-The output is a tensor shape `[..., N, K]` where each output matrix solves
-each of the equations
-`matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
-in the least squares sense.
-
-We use the following notation for (complex) matrix and right-hand sides
-in the batch:
-
-`matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
-`rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
-`output`=\\(X  \in \mathbb{C}^{n \times k}\\),
-`l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
-
-If `fast` is `True`, then the solution is computed by solving the normal
-equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
-\\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
-problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
-\lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
-\\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
-minimum-norm solution to the under-determined linear system, i.e.
-\\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
-subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
-when \\(A\\) is numerically full rank and has a condition number
-\\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
-sufficiently large.
-
-If `fast` is `False` an algorithm based on the numerically robust complete
-orthogonal decomposition is used. This computes the minimum-norm
-least-squares solution, even when \\(A\\) is rank deficient. This path is
-typically 6-7 times slower than the fast path. If `fast` is `False` then
-`l2_regularizer` is ignored.
-
-matrix: Shape is `[..., M, N]`.
-rhs: Shape is `[..., M, K]`.
-output: Shape is `[..., N, K]`.
-l2_regularizer: Scalar tensor.
-
-@compatibility(numpy)
-Equivalent to np.linalg.lstsq
-@end_compatibility
-)doc");
+    });
 
 REGISTER_OP("Qr")
     .Input("input: T")
@@ -550,31 +327,7 @@ REGISTER_OP("Qr")
     .Output("r: T")
     .Attr("full_matrices: bool = False")
     .Attr("T: {double, float, complex64, complex128}")
-    .SetShapeFn(QrShapeFn)
-    .Doc(R"doc(
-Computes the QR decompositions of one or more matrices.
-
-Computes the QR decomposition of each inner matrix in `tensor` such that
-`tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
-
-```python
-# a is a tensor.
-# q is a tensor of orthonormal matrices.
-# r is a tensor of upper triangular matrices.
-q, r = qr(a)
-q_full, r_full = qr(a, full_matrices=True)
-```
-
-input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-  form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
-q: Orthonormal basis for range of `a`. If `full_matrices` is `False` then
-  shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
-  `[..., M, M]`.
-r: Triangular factor. If `full_matrices` is `False` then shape is
-  `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
-full_matrices: If true, compute full-sized `q` and `r`. If false
-  (the default), compute only the leading `P` columns of `q`.
-)doc");
+    .SetShapeFn(QrShapeFn);
 
 REGISTER_OP("Svd")
     .Input("input: T")
@@ -584,38 +337,7 @@ REGISTER_OP("Svd")
     .Attr("compute_uv: bool = True")
     .Attr("full_matrices: bool = False")
     .Attr("T: {double, float, complex64, complex128}")
-    .SetShapeFn(SvdShapeFn)
-    .Doc(R"doc(
-Computes the singular value decompositions of one or more matrices.
-
-Computes the SVD of each inner matrix in `input` such that
-`input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
-
-```python
-# a is a tensor containing a batch of matrices.
-# s is a tensor of singular values for each matrix.
-# u is the tensor containing of left singular vectors for each matrix.
-# v is the tensor containing of right singular vectors for each matrix.
-s, u, v = svd(a)
-s, _, _ = svd(a, compute_uv=False)
-```
-
-input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-  form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
-s: Singular values. Shape is `[..., P]`.
-u: Left singular vectors. If `full_matrices` is `False` then shape is
-  `[..., M, P]`; if `full_matrices` is `True` then shape is
-  `[..., M, M]`. Undefined if `compute_uv` is `False`.
-v: Left singular vectors. If `full_matrices` is `False` then shape is
-  `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
-  Undefined if `compute_uv` is false.
-compute_uv: If true, left and right singular vectors will be
-  computed and returned in `u` and `v`, respectively.
-  If false, `u` and `v` are not set and should never referenced.
-full_matrices: If true, compute full-sized `u` and `v`. If false
-  (the default), compute only the leading `P` singular vectors.
-  Ignored if `compute_uv` is `False`.
-)doc");
+    .SetShapeFn(SvdShapeFn);
 
 // Deprecated op registrations:
 
diff --git a/tensorflow/core/ops/list_ops.cc b/tensorflow/core/ops/list_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3487c955cbb2b06bdb33000da549c0fc6e7f86e8
--- /dev/null
+++ b/tensorflow/core/ops/list_ops.cc
@@ -0,0 +1,257 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+namespace {
+
+REGISTER_OP("EmptyTensorList")
+    .Input("element_shape: shape_type")
+    .Output("handle: variant")
+    .Attr("element_dtype: type")
+    .Attr("shape_type: {int32, int64}")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      DataType t;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      shape_inference::ShapeHandle s;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
+      c->set_output_handle_shapes_and_types(
+          0, std::vector<shape_inference::ShapeAndType>{{s, t}});
+      return Status::OK();
+    });
+
+REGISTER_OP("TensorListPushBack")
+    .Input("input_handle: variant")
+    .Input("tensor: element_dtype")
+    .Output("output_handle: variant")
+    .Attr("element_dtype: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      DataType t;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      shape_inference::ShapeHandle s = c->UnknownShape();
+
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr && handle_data->size() != 1) {
+        return errors::InvalidArgument(
+            "Trying to push to list with wrong variant data.");
+      }
+      if (handle_data != nullptr) {
+        const shape_inference::ShapeAndType& list_shape_type =
+            (*handle_data)[0];
+        if (list_shape_type.dtype != t) {
+          return errors::InvalidArgument(
+              "Trying to push to list with wrong element dtype. List has type ",
+              DataTypeString(list_shape_type.dtype),
+              " but trying to push element with type ", DataTypeString(t));
+        }
+        shape_inference::ShapeHandle ignored;
+        TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &ignored));
+        s = list_shape_type.shape;
+      }
+      c->set_output_handle_shapes_and_types(
+          0, std::vector<shape_inference::ShapeAndType>{{s, t}});
+      return Status::OK();
+    });
+
+REGISTER_OP("TensorListLength")
+    .Input("input_handle: variant")
+    .Output("length: int32")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("TensorListPopBack")
+    .Input("input_handle: variant")
+    .Output("output_handle: variant")
+    .Output("tensor: element_dtype")
+    .Attr("element_dtype: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      DataType t;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      shape_inference::ShapeHandle s = c->UnknownShape();
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr && handle_data->size() != 1) {
+        return errors::InvalidArgument(
+            "Trying to read from list with invalid variant data.");
+      }
+      if (handle_data != nullptr) {
+        const shape_inference::ShapeAndType& list_shape_type =
+            (*handle_data)[0];
+        if (list_shape_type.dtype != t) {
+          return errors::InvalidArgument(
+              "Trying to read from list with wrong element dtype. List has "
+              "type ",
+              DataTypeString(list_shape_type.dtype),
+              " but trying to push element with type ", DataTypeString(t));
+        }
+        shape_inference::ShapeHandle ignored;
+        TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &ignored));
+        c->set_output_handle_shapes_and_types(0, *handle_data);
+        s = list_shape_type.shape;
+      }
+      c->set_output(1, s);
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("TensorListStack")
+    .Input("input_handle: variant")
+    .Output("tensor: element_dtype")
+    .Attr("element_dtype: type")
+    .Attr("num_elements: int = -1")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      DataType t;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      shape_inference::ShapeHandle s = c->UnknownShape();
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr && handle_data->size() != 1) {
+        return errors::InvalidArgument(
+            "Trying to read from list with wrong variant data.");
+      }
+      if (handle_data != nullptr) {
+        const shape_inference::ShapeAndType& list_shape_type =
+            (*handle_data)[0];
+        if (list_shape_type.dtype != t) {
+          return errors::InvalidArgument(
+              "Trying to read from list with wrong element dtype. List has "
+              "type ",
+              DataTypeString(list_shape_type.dtype), " but expectec type ",
+              DataTypeString(t));
+        }
+        shape_inference::ShapeHandle ignored;
+        TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &ignored));
+        if (!c->FullyDefined(s) || !c->FullyDefined(list_shape_type.shape)) {
+          return errors::InvalidArgument(
+              "Can only gather from a list with fully defined shapes.");
+        }
+        s = list_shape_type.shape;
+      }
+      int expected_num_elements = -1;
+      TF_RETURN_IF_ERROR(c->GetAttr("num_elements", &expected_num_elements));
+      shape_inference::ShapeHandle num_elements;
+      if (expected_num_elements == -1) {
+        num_elements = c->MakeShape({c->UnknownDim()});
+      } else {
+        num_elements = c->MakeShape({expected_num_elements});
+      }
+      shape_inference::ShapeHandle result;
+      TF_RETURN_IF_ERROR(c->Concatenate(num_elements, s, &result));
+      c->set_output(0, result);
+      return Status::OK();
+    });
+
+REGISTER_OP("TensorListFromTensor")
+    .Input("tensor: element_dtype")
+    .Input("element_shape: shape_type")
+    .Output("output_handle: variant")
+    .Attr("element_dtype: type")
+    .Attr("shape_type: {int32, int64}")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      DataType t;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      shape_inference::ShapeHandle s = c->input(0);
+      shape_inference::ShapeHandle o;
+      TF_RETURN_IF_ERROR(c->Subshape(s, 1, &o));
+      shape_inference::ShapeHandle element_shape;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &element_shape));
+      TF_RETURN_IF_ERROR(c->Merge(o, element_shape, &o));
+      c->set_output_handle_shapes_and_types(
+          0, std::vector<shape_inference::ShapeAndType>{{element_shape, t}});
+      return Status::OK();
+    });
+
+REGISTER_OP("TensorListElementShape")
+    .Input("input_handle: variant")
+    .Output("element_shape: shape_type")
+    .Attr("shape_type: {int32, int64}")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data == nullptr) {
+        c->set_output(0, c->Vector(c->UnknownDim()));
+        return Status::OK();
+      }
+      c->set_output(0, c->Vector(c->Rank((*handle_data)[0].shape)));
+      return Status::OK();
+    });
+
+REGISTER_OP("TensorListReserve")
+    .Input("element_shape: shape_type")
+    .Input("num_elements: int32")
+    .Output("handle: variant")
+    .Attr("element_dtype: type")
+    .Attr("shape_type: {int32, int64}")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle s;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
+      DataType t;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      c->set_output_handle_shapes_and_types(
+          0, std::vector<shape_inference::ShapeAndType>{{s, t}});
+      return Status::OK();
+    });
+
+REGISTER_OP("TensorListGetItem")
+    .Input("input_handle: variant")
+    .Input("index: int32")
+    .Output("item: element_dtype")
+    .Attr("element_dtype: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      DataType t;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      shape_inference::ShapeHandle element_shape = c->UnknownShape();
+      if (handle_data != nullptr) {
+        const shape_inference::ShapeAndType& list_shape_type =
+            (*handle_data)[0];
+        element_shape = list_shape_type.shape;
+        if (list_shape_type.dtype != t) {
+          return errors::InvalidArgument("Expected list with element dtype ",
+                                         DataTypeString(t),
+                                         " but got list with element dtype ",
+                                         DataTypeString(list_shape_type.dtype));
+        }
+      }
+      c->set_output(0, element_shape);
+      return Status::OK();
+    });
+
+REGISTER_OP("TensorListSetItem")
+    .Input("input_handle: variant")
+    .Input("index: int32")
+    .Input("item: element_dtype")
+    .Output("output_handle: variant")
+    .Attr("element_dtype: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      DataType t;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      c->set_output(0, c->Scalar());
+      if (handle_data == nullptr) {
+        c->set_output_handle_shapes_and_types(0, {{c->UnknownShape(), t}});
+        return Status::OK();
+      }
+      const shape_inference::ShapeAndType& list_shape_type = (*handle_data)[0];
+      shape_inference::ShapeHandle s = c->input(2);
+      TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &s));
+      c->set_output_handle_shapes_and_types(0, *handle_data);
+      return Status::OK();
+    });
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/logging_ops.cc b/tensorflow/core/ops/logging_ops.cc
index e6995821df700ef6d6a736645e4d18c961b089a8..d263dc25b29d5c867a10ef20ea1b39fa9b9662f1 100644
--- a/tensorflow/core/ops/logging_ops.cc
+++ b/tensorflow/core/ops/logging_ops.cc
@@ -25,17 +25,7 @@ REGISTER_OP("Assert")
     .SetIsStateful()
     .Attr("T: list(type)")
     .Attr("summarize: int = 3")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Asserts that the given condition is true.
-
-If `condition` evaluates to false, print the list of tensors in `data`.
-`summarize` determines how many entries of the tensors to print.
-
-condition: The condition to evaluate.
-data: The tensors to print out when condition is false.
-summarize: Print this many entries of each tensor.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("Print")
     .Input("input: T")
@@ -47,19 +37,7 @@ REGISTER_OP("Print")
     .Attr("message: string = ''")
     .Attr("first_n: int = -1")
     .Attr("summarize: int = 3")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Prints a list of tensors.
-
-Passes `input` through to `output` and prints `data` when evaluating.
-
-input: The tensor passed to `output`
-data: A list of tensors to print out when op is evaluated.
-output:= The unmodified `input` tensor
-message: A string, prefix of the error message.
-first_n: Only log `first_n` number of times. -1 disables logging.
-summarize: Only print this many entries of each tensor.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 // ----------------------------------------------------------------------------
 // Operators that deal with SummaryProtos (encoded as DT_STRING tensors) as
@@ -73,15 +51,7 @@ REGISTER_OP("TensorSummaryV2")
     .Input("serialized_summary_metadata: string")
     .Output("summary: string")
     .Attr("T: type")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
-
-tag: A string attached to this summary. Used for organization in TensorBoard.
-tensor: A tensor to serialize.
-serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin
-  data.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("TensorSummary")
     .Input("tensor: T")
@@ -90,56 +60,21 @@ REGISTER_OP("TensorSummary")
     .Attr("description: string = ''")
     .Attr("labels: list(string) = []")
     .Attr("display_name: string = ''")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Outputs a `Summary` protocol buffer with a tensor.
-
-This op is being phased out in favor of TensorSummaryV2, which lets callers pass
-a tag as well as a serialized SummaryMetadata proto string that contains
-plugin-specific data. We will keep this op to maintain backwards compatibility.
-
-tensor: A tensor to serialize.
-description: A json-encoded SummaryDescription proto.
-labels: An unused list of strings.
-display_name: An unused string.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ScalarSummary")
     .Input("tags: string")
     .Input("values: T")
     .Output("summary: string")
     .Attr("T: realnumbertype")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Outputs a `Summary` protocol buffer with scalar values.
-
-The input `tags` and `values` must have the same shape.  The generated summary
-has a summary value for each tag-value pair in `tags` and `values`.
-
-tags: Tags for the summary.
-values: Same shape as `tags.  Values for the summary.
-summary: Scalar.  Serialized `Summary` protocol buffer.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("HistogramSummary")
     .Input("tag: string")
     .Input("values: T")
     .Output("summary: string")
     .Attr("T: realnumbertype = DT_FLOAT")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Outputs a `Summary` protocol buffer with a histogram.
-
-The generated
-[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-has one summary value containing a histogram for `values`.
-
-This op reports an `InvalidArgument` error if any value is not finite.
-
-tag: Scalar.  Tag to use for the `Summary.Value`.
-values: Any shape. Values to use to build the histogram.
-summary: Scalar. Serialized `Summary` protocol buffer.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ImageSummary")
     .Input("tag: string")
@@ -151,51 +86,7 @@ REGISTER_OP("ImageSummary")
         "bad_color: tensor = { dtype: DT_UINT8 "
         "tensor_shape: { dim { size: 4 } } "
         "int_val: 255 int_val: 0 int_val: 0 int_val: 255 }")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Outputs a `Summary` protocol buffer with images.
-
-The summary has up to `max_images` summary values containing images. The
-images are built from `tensor` which must be 4-D with shape `[batch_size,
-height, width, channels]` and where `channels` can be:
-
-*  1: `tensor` is interpreted as Grayscale.
-*  3: `tensor` is interpreted as RGB.
-*  4: `tensor` is interpreted as RGBA.
-
-The images have the same number of channels as the input tensor. For float
-input, the values are normalized one image at a time to fit in the range
-`[0, 255]`.  `uint8` values are unchanged.  The op uses two different
-normalization algorithms:
-
-*  If the input values are all positive, they are rescaled so the largest one
-   is 255.
-
-*  If any input value is negative, the values are shifted so input value 0.0
-   is at 127.  They are then rescaled so that either the smallest value is 0,
-   or the largest one is 255.
-
-The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-build the `tag` of the summary values:
-
-*  If `max_images` is 1, the summary value tag is '*tag*/image'.
-*  If `max_images` is greater than 1, the summary value tags are
-   generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
-
-The `bad_color` argument is the color to use in the generated images for
-non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
-Each element must be in the range `[0, 255]` (It represents the value of a
-pixel in the output image).  Non-finite values in the input tensor are
-replaced by this tensor in the output image.  The default value is the color
-red.
-
-tag: Scalar. Used to build the `tag` attribute of the summary values.
-tensor: 4-D of shape `[batch_size, height, width, channels]` where
-  `channels` is 1, 3, or 4.
-max_images: Max number of batch elements to generate images for.
-bad_color: Color to use for pixels with non-finite values.
-summary: Scalar. Serialized `Summary` protocol buffer.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("AudioSummaryV2")
     .Input("tag: string")
@@ -203,28 +94,7 @@ REGISTER_OP("AudioSummaryV2")
     .Input("sample_rate: float")
     .Output("summary: string")
     .Attr("max_outputs: int >= 1 = 3")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Outputs a `Summary` protocol buffer with audio.
-
-The summary has up to `max_outputs` summary values containing audio. The
-audio is built from `tensor` which must be 3-D with shape `[batch_size,
-frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-
-The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-build the `tag` of the summary values:
-
-*  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-*  If `max_outputs` is greater than 1, the summary value tags are
-   generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
-
-tag: Scalar. Used to build the `tag` attribute of the summary values.
-tensor: 2-D of shape `[batch_size, frames]`.
-sample_rate: The sample rate of the signal in hertz.
-max_outputs: Max number of batch elements to generate audio for.
-summary: Scalar. Serialized `Summary` protocol buffer.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("AudioSummary")
     .Input("tag: string")
@@ -233,48 +103,12 @@ REGISTER_OP("AudioSummary")
     .Attr("sample_rate: float")
     .Attr("max_outputs: int >= 1 = 3")
     .SetShapeFn(shape_inference::ScalarShape)
-    .Deprecated(15, "Use AudioSummaryV2.")
-    .Doc(R"doc(
-Outputs a `Summary` protocol buffer with audio.
-
-The summary has up to `max_outputs` summary values containing audio. The
-audio is built from `tensor` which must be 3-D with shape `[batch_size,
-frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-
-The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-build the `tag` of the summary values:
-
-*  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-*  If `max_outputs` is greater than 1, the summary value tags are
-   generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
-
-tag: Scalar. Used to build the `tag` attribute of the summary values.
-tensor: 2-D of shape `[batch_size, frames]`.
-sample_rate: The sample rate of the signal in hertz.
-max_outputs: Max number of batch elements to generate audio for.
-summary: Scalar. Serialized `Summary` protocol buffer.
-)doc");
+    .Deprecated(15, "Use AudioSummaryV2.");
 
 REGISTER_OP("MergeSummary")
     .Input("inputs: N * string")
     .Output("summary: string")
     .Attr("N : int >= 1")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Merges summaries.
-
-This op creates a
-[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-protocol buffer that contains the union of all the values in the input
-summaries.
-
-When the Op is run, it reports an `InvalidArgument` error if multiple values
-in the summaries to merge use the same tag.
-
-inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
-  buffers.
-summary: Scalar. Serialized `Summary` protocol buffer.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/lookup_ops.cc b/tensorflow/core/ops/lookup_ops.cc
index dac02dad8bb861fee0e16e0acb0c8e17688e05fb..444aa8b9544c62d81f288f21e4eaaac23d8691cb 100644
--- a/tensorflow/core/ops/lookup_ops.cc
+++ b/tensorflow/core/ops/lookup_ops.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/dataset_stateful_op_whitelist.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def_builder.h"
 #include "tensorflow/core/framework/shape_inference.h"
@@ -83,21 +84,7 @@ REGISTER_OP("LookupTableFind")
       TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &unused));
       c->set_output(0, c->UnknownShape());
       return Status::OK();
-    })
-    .Doc(R"doc(
-Looks up keys in a table, outputs the corresponding values.
-
-The tensor `keys` must of the same type as the keys of the table.
-The output `values` is of the type of the table values.
-
-The scalar `default_value` is the value output for keys not present in the
-table. It must also be of the same type as the table values.
-
-table_handle: Handle to the table.
-keys:  Any shape.  Keys to look up.
-values: Same shape as `keys`.  Values found in the table, or `default_values`
-   for missing keys.
-)doc");
+    });
 
 REGISTER_OP("LookupTableFindV2")
     .Input("table_handle: resource")
@@ -115,21 +102,9 @@ REGISTER_OP("LookupTableFindV2")
       TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &unused));
       c->set_output(0, c->UnknownShape());
       return Status::OK();
-    })
-    .Doc(R"doc(
-Looks up keys in a table, outputs the corresponding values.
-
-The tensor `keys` must of the same type as the keys of the table.
-The output `values` is of the type of the table values.
-
-The scalar `default_value` is the value output for keys not present in the
-table. It must also be of the same type as the table values.
-
-table_handle: Handle to the table.
-keys:  Any shape.  Keys to look up.
-values: Same shape as `keys`.  Values found in the table, or `default_values`
-   for missing keys.
-)doc");
+    });
+WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS("LookupTableFindV2");
+// TODO(b/72710477): Update this.
 
 REGISTER_OP("LookupTableInsert")
     .Input("table_handle: Ref(string)")
@@ -145,17 +120,7 @@ REGISTER_OP("LookupTableInsert")
 
       // TODO(ebrevdo): Validate keys and values shape.
       return Status::OK();
-    })
-    .Doc(R"doc(
-Updates the table to associates keys with values.
-
-The tensor `keys` must be of the same type as the keys of the table.
-The tensor `values` must be of the type of the table values.
-
-table_handle: Handle to the table.
-keys:  Any shape.  Keys to look up.
-values: Values to associate with keys.
-)doc");
+    });
 
 REGISTER_OP("LookupTableInsertV2")
     .Input("table_handle: resource")
@@ -169,39 +134,17 @@ REGISTER_OP("LookupTableInsertV2")
 
       // TODO: Validate keys and values shape.
       return Status::OK();
-    })
-    .Doc(R"doc(
-Updates the table to associates keys with values.
-
-The tensor `keys` must be of the same type as the keys of the table.
-The tensor `values` must be of the type of the table values.
-
-table_handle: Handle to the table.
-keys:  Any shape.  Keys to look up.
-values: Values to associate with keys.
-)doc");
+    });
 
 REGISTER_OP("LookupTableSize")
     .Input("table_handle: Ref(string)")
     .Output("size: int64")
-    .SetShapeFn(TwoElementVectorInputsAndScalarOutputs)
-    .Doc(R"doc(
-Computes the number of elements in the given table.
-
-table_handle: Handle to the table.
-size: Scalar that contains number of elements in the table.
-)doc");
+    .SetShapeFn(TwoElementVectorInputsAndScalarOutputs);
 
 REGISTER_OP("LookupTableSizeV2")
     .Input("table_handle: resource")
     .Output("size: int64")
-    .SetShapeFn(ScalarAndTwoElementVectorInputsAndScalarOutputs)
-    .Doc(R"doc(
-Computes the number of elements in the given table.
-
-table_handle: Handle to the table.
-size: Scalar that contains number of elements in the table.
-)doc");
+    .SetShapeFn(ScalarAndTwoElementVectorInputsAndScalarOutputs);
 
 REGISTER_OP("LookupTableExport")
     .Input("table_handle: Ref(string)")
@@ -221,14 +164,7 @@ REGISTER_OP("LookupTableExport")
       c->set_output(0, keys);
       c->set_output(1, values);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Outputs all keys and values in the table.
-
-table_handle: Handle to the table.
-keys: Vector of all keys present in the table.
-values: Tensor of all values in the table. Indexed in parallel with `keys`.
-)doc");
+    });
 
 REGISTER_OP("LookupTableExportV2")
     .Input("table_handle: resource")
@@ -246,14 +182,7 @@ REGISTER_OP("LookupTableExportV2")
       c->set_output(0, keys);
       c->set_output(1, values);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Outputs all keys and values in the table.
-
-table_handle: Handle to the table.
-keys: Vector of all keys present in the table.
-values: Tensor of all values in the table. Indexed in parallel with `keys`.
-)doc");
+    });
 
 REGISTER_OP("LookupTableImport")
     .Input("table_handle: Ref(string)")
@@ -269,17 +198,7 @@ REGISTER_OP("LookupTableImport")
 
       // TODO(ebrevdo): Validate keys and values shape.
       return Status::OK();
-    })
-    .Doc(R"doc(
-Replaces the contents of the table with the specified keys and values.
-
-The tensor `keys` must be of the same type as the keys of the table.
-The tensor `values` must be of the type of the table values.
-
-table_handle: Handle to the table.
-keys:  Any shape.  Keys to look up.
-values: Values to associate with keys.
-)doc");
+    });
 
 REGISTER_OP("LookupTableImportV2")
     .Input("table_handle: resource")
@@ -293,17 +212,7 @@ REGISTER_OP("LookupTableImportV2")
 
       // TODO: Validate keys and values shape.
       return Status::OK();
-    })
-    .Doc(R"doc(
-Replaces the contents of the table with the specified keys and values.
-
-The tensor `keys` must be of the same type as the keys of the table.
-The tensor `values` must be of the type of the table values.
-
-table_handle: Handle to the table.
-keys:  Any shape.  Keys to look up.
-values: Values to associate with keys.
-)doc");
+    });
 
 REGISTER_OP("HashTable")
     .Output("table_handle: Ref(string)")
@@ -313,24 +222,7 @@ REGISTER_OP("HashTable")
     .Attr("key_dtype: type")
     .Attr("value_dtype: type")
     .SetIsStateful()
-    .SetShapeFn(TwoElementOutput)
-    .Doc(R"doc(
-Creates a non-initialized hash table.
-
-This op creates a hash table, specifying the type of its keys and values.
-Before using the table you will have to initialize it.  After initialization the
-table will be immutable.
-
-table_handle: Handle to a table.
-container: If non-empty, this table is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this table is shared under the given name across
-  multiple sessions.
-use_node_name_sharing: If true and shared_name is empty, the table is shared
-  using the node name.
-key_dtype: Type of the table keys.
-value_dtype: Type of the table values.
-)doc");
+    .SetShapeFn(TwoElementOutput);
 
 REGISTER_OP("HashTableV2")
     .Output("table_handle: resource")
@@ -340,24 +232,7 @@ REGISTER_OP("HashTableV2")
     .Attr("key_dtype: type")
     .Attr("value_dtype: type")
     .SetIsStateful()
-    .SetShapeFn(ScalarOutput)
-    .Doc(R"doc(
-Creates a non-initialized hash table.
-
-This op creates a hash table, specifying the type of its keys and values.
-Before using the table you will have to initialize it.  After initialization the
-table will be immutable.
-
-table_handle: Handle to a table.
-container: If non-empty, this table is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this table is shared under the given name across
-  multiple sessions.
-use_node_name_sharing: If true and shared_name is empty, the table is shared
-  using the node name.
-key_dtype: Type of the table keys.
-value_dtype: Type of the table values.
-)doc");
+    .SetShapeFn(ScalarOutput);
 
 REGISTER_OP("MutableHashTable")
     .Output("table_handle: Ref(string)")
@@ -367,24 +242,7 @@ REGISTER_OP("MutableHashTable")
     .Attr("key_dtype: type")
     .Attr("value_dtype: type")
     .SetIsStateful()
-    .SetShapeFn(TwoElementOutput)
-    .Doc(R"doc(
-Creates an empty hash table.
-
-This op creates a mutable hash table, specifying the type of its keys and
-values. Each value must be a scalar. Data can be inserted into the table using
-the insert operations. It does not support the initialization operation.
-
-table_handle: Handle to a table.
-container: If non-empty, this table is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this table is shared under the given name across
-  multiple sessions.
-use_node_name_sharing: If true and shared_name is empty, the table is shared
-  using the node name.
-key_dtype: Type of the table keys.
-value_dtype: Type of the table values.
-)doc");
+    .SetShapeFn(TwoElementOutput);
 
 REGISTER_OP("MutableHashTableV2")
     .Output("table_handle: resource")
@@ -394,24 +252,7 @@ REGISTER_OP("MutableHashTableV2")
     .Attr("key_dtype: type")
     .Attr("value_dtype: type")
     .SetIsStateful()
-    .SetShapeFn(ScalarOutput)
-    .Doc(R"doc(
-Creates an empty hash table.
-
-This op creates a mutable hash table, specifying the type of its keys and
-values. Each value must be a scalar. Data can be inserted into the table using
-the insert operations. It does not support the initialization operation.
-
-table_handle: Handle to a table.
-container: If non-empty, this table is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this table is shared under the given name across
-  multiple sessions.
-use_node_name_sharing: If true and shared_name is empty, the table is shared
-  using the node name.
-key_dtype: Type of the table keys.
-value_dtype: Type of the table values.
-)doc");
+    .SetShapeFn(ScalarOutput);
 
 REGISTER_OP("MutableHashTableOfTensors")
     .Output("table_handle: Ref(string)")
@@ -422,22 +263,7 @@ REGISTER_OP("MutableHashTableOfTensors")
     .Attr("value_dtype: type")
     .Attr("value_shape: shape = {}")
     .SetIsStateful()
-    .SetShapeFn(TwoElementOutput)
-    .Doc(R"doc(
-Creates an empty hash table.
-
-This op creates a mutable hash table, specifying the type of its keys and
-values. Each value must be a vector. Data can be inserted into the table using
-the insert operations. It does not support the initialization operation.
-
-table_handle: Handle to a table.
-container: If non-empty, this table is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this table is shared under the given name across
-  multiple sessions.
-key_dtype: Type of the table keys.
-value_dtype: Type of the table values.
-)doc");
+    .SetShapeFn(TwoElementOutput);
 
 REGISTER_OP("MutableHashTableOfTensorsV2")
     .Output("table_handle: resource")
@@ -448,22 +274,7 @@ REGISTER_OP("MutableHashTableOfTensorsV2")
     .Attr("value_dtype: type")
     .Attr("value_shape: shape = {}")
     .SetIsStateful()
-    .SetShapeFn(ScalarOutput)
-    .Doc(R"doc(
-Creates an empty hash table.
-
-This op creates a mutable hash table, specifying the type of its keys and
-values. Each value must be a vector. Data can be inserted into the table using
-the insert operations. It does not support the initialization operation.
-
-table_handle: Handle to a table.
-container: If non-empty, this table is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this table is shared under the given name across
-  multiple sessions.
-key_dtype: Type of the table keys.
-value_dtype: Type of the table values.
-)doc");
+    .SetShapeFn(ScalarOutput);
 
 REGISTER_OP("MutableDenseHashTable")
     .Input("empty_key: key_dtype")
@@ -477,32 +288,7 @@ REGISTER_OP("MutableDenseHashTable")
     .Attr("initial_num_buckets: int = 131072")  // 2^17
     .Attr("max_load_factor: float = 0.8")
     .SetIsStateful()
-    .SetShapeFn(TwoElementOutput)
-    .Doc(R"doc(
-Creates an empty hash table that uses tensors as the backing store.
-
-It uses "open addressing" with quadratic reprobing to resolve
-collisions.
-
-This op creates a mutable hash table, specifying the type of its keys and
-values. Each value must be a scalar. Data can be inserted into the table using
-the insert operations. It does not support the initialization operation.
-
-empty_key: The key used to represent empty key buckets internally. Must not
-  be used in insert or lookup operations.
-table_handle: Handle to a table.
-container: If non-empty, this table is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this table is shared under the given name across
-  multiple sessions.
-key_dtype: Type of the table keys.
-value_dtype: Type of the table values.
-value_shape: The shape of each value.
-initial_num_buckets: The initial number of hash table buckets. Must be a power
-  to 2.
-max_load_factor: The maximum ratio between number of entries and number of
-  buckets before growing the table. Must be between 0 and 1.
-)doc");
+    .SetShapeFn(TwoElementOutput);
 
 REGISTER_OP("MutableDenseHashTableV2")
     .Input("empty_key: key_dtype")
@@ -516,32 +302,7 @@ REGISTER_OP("MutableDenseHashTableV2")
     .Attr("initial_num_buckets: int = 131072")  // 2^17
     .Attr("max_load_factor: float = 0.8")
     .SetIsStateful()
-    .SetShapeFn(ScalarOutput)
-    .Doc(R"doc(
-Creates an empty hash table that uses tensors as the backing store.
-
-It uses "open addressing" with quadratic reprobing to resolve
-collisions.
-
-This op creates a mutable hash table, specifying the type of its keys and
-values. Each value must be a scalar. Data can be inserted into the table using
-the insert operations. It does not support the initialization operation.
-
-empty_key: The key used to represent empty key buckets internally. Must not
-  be used in insert or lookup operations.
-table_handle: Handle to a table.
-container: If non-empty, this table is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this table is shared under the given name across
-  multiple sessions.
-key_dtype: Type of the table keys.
-value_dtype: Type of the table values.
-value_shape: The shape of each value.
-initial_num_buckets: The initial number of hash table buckets. Must be a power
-  to 2.
-max_load_factor: The maximum ratio between number of entries and number of
-  buckets before growing the table. Must be between 0 and 1.
-)doc");
+    .SetShapeFn(ScalarOutput);
 
 REGISTER_OP("InitializeTable")
     .Input("table_handle: Ref(string)")
@@ -559,14 +320,7 @@ REGISTER_OP("InitializeTable")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &keys));
       TF_RETURN_IF_ERROR(c->Merge(keys, c->input(2), &keys));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Table initializer that takes two tensors for keys and values respectively.
-
-table_handle: Handle to a table which will be initialized.
-keys: Keys of type Tkey.
-values: Values of type Tval.
-)doc");
+    });
 
 REGISTER_OP("InitializeTableV2")
     .Input("table_handle: resource")
@@ -582,14 +336,7 @@ REGISTER_OP("InitializeTableV2")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &keys));
       TF_RETURN_IF_ERROR(c->Merge(keys, c->input(2), &keys));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Table initializer that takes two tensors for keys and values respectively.
-
-table_handle: Handle to a table which will be initialized.
-keys: Keys of type Tkey.
-values: Values of type Tval.
-)doc");
+    });
 
 REGISTER_OP("InitializeTableFromTextFile")
     .Input("table_handle: Ref(string)")
@@ -606,29 +353,7 @@ REGISTER_OP("InitializeTableFromTextFile")
 
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &handle));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Initializes a table from a text file.
-
-It inserts one key-value pair into the table for each line of the file.
-The key and value is extracted from the whole line content, elements from the
-split line based on `delimiter` or the line number (starting from zero).
-Where to extract the key and value from a line is specified by `key_index` and
-`value_index`.
-
-- A value of -1 means use the line number(starting from zero), expects `int64`.
-- A value of -2 means use the whole line content, expects `string`.
-- A value >= 0 means use the index (starting at zero) of the split line based
-  on `delimiter`.
-
-table_handle: Handle to a table which will be initialized.
-filename: Filename of a vocabulary text file.
-key_index: Column index in a line to get the table `key` values from.
-value_index: Column index that represents information of a line to get the table
-  `value` values from.
-vocab_size: Number of elements of the file, use -1 if unknown.
-delimiter: Delimiter to separate fields in a line.
-)doc");
+    });
 
 REGISTER_OP("InitializeTableFromTextFileV2")
     .Input("table_handle: resource")
@@ -643,28 +368,6 @@ REGISTER_OP("InitializeTableFromTextFileV2")
 
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &handle));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Initializes a table from a text file.
-
-It inserts one key-value pair into the table for each line of the file.
-The key and value is extracted from the whole line content, elements from the
-split line based on `delimiter` or the line number (starting from zero).
-Where to extract the key and value from a line is specified by `key_index` and
-`value_index`.
-
-- A value of -1 means use the line number(starting from zero), expects `int64`.
-- A value of -2 means use the whole line content, expects `string`.
-- A value >= 0 means use the index (starting at zero) of the split line based
-  on `delimiter`.
-
-table_handle: Handle to a table which will be initialized.
-filename: Filename of a vocabulary text file.
-key_index: Column index in a line to get the table `key` values from.
-value_index: Column index that represents information of a line to get the table
-  `value` values from.
-vocab_size: Number of elements of the file, use -1 if unknown.
-delimiter: Delimiter to separate fields in a line.
-)doc");
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/manip_ops.cc b/tensorflow/core/ops/manip_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..95b4774fe6e230800e71d237c2cd027acf6e054b
--- /dev/null
+++ b/tensorflow/core/ops/manip_ops.cc
@@ -0,0 +1,33 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+// --------------------------------------------------------------------------
+REGISTER_OP("Roll")
+    .Input("input: T")
+    .Input("shift: Tshift")
+    .Input("axis: Taxis")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("Tshift: {int32,int64}")
+    .Attr("Taxis: {int32,int64}")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index df75caca37a616f75263e35a0d5e725f36e1307b..8f33d51d5a20fc207102e4bf79e7605d9817eb9f 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -40,12 +40,7 @@ REGISTER_OP("AddN")
       }
       c->set_output(0, cur);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Add all input tensors element wise.
-
-inputs: Must all be the same size and shape.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -62,22 +57,7 @@ REGISTER_OP("AccumulateNV2")
     .Attr("shape: shape")
     .SetIsCommutative()
     .SetIsAggregate()
-    .SetShapeFn(shape_inference::ExplicitShape)
-    .Doc(R"doc(
-Returns the element-wise sum of a list of tensors.
-
-`tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
-wait for all of its inputs to be ready before beginning to sum. This can
-save memory if inputs are ready at different times, since minimum temporary
-storage is proportional to the output size rather than the inputs size.
-
-Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
-
-Returns a `Tensor` of same shape and type as the elements of `inputs`.
-
-inputs: A list of `Tensor` objects, each with same shape and type.
-shape: Shape of elements of `inputs`.
-)doc");
+    .SetShapeFn(shape_inference::ExplicitShape);
 
 // --------------------------------------------------------------------------
 
@@ -85,7 +65,7 @@ REGISTER_OP("BatchMatMul")
     .Input("x: T")
     .Input("y: T")
     .Output("output: T")
-    .Attr("T: {half, float, double, int32, complex64, complex128}")
+    .Attr("T: {half, bfloat16, float, double, int32, complex64, complex128}")
     .Attr("adj_x: bool = false")
     .Attr("adj_y: bool = false")
     .SetShapeFn([](InferenceContext* c) {
@@ -120,35 +100,7 @@ REGISTER_OP("BatchMatMul")
           batch_dims, c->Matrix(output_rows, output_cols), &out));
       c->set_output(0, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Multiplies slices of two tensors in batches.
-
-Multiplies all slices of `Tensor` `x` and `y` (each slice can be
-viewed as an element of a batch), and arranges the individual results
-in a single output tensor of the same batch size. Each of the
-individual slices can optionally be adjointed (to adjoint a matrix
-means to transpose and conjugate it) before multiplication by setting
-the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
-
-The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
-and `[..., r_y, c_y]`.
-
-The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
-
-    r_o = c_x if adj_x else r_x
-    c_o = r_y if adj_y else c_y
-
-It is computed as:
-
-    output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
-
-x: 2-D or higher with shape `[..., r_x, c_x]`.
-y: 2-D or higher with shape `[..., r_y, c_y]`.
-output: 3-D or higher with shape `[..., r_o, c_o]`
-adj_x: If `True`, adjoint the slices of `x`. Defaults to `False`.
-adj_y: If `True`, adjoint the slices of `y`. Defaults to `False`.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 // Casting Ops
@@ -162,10 +114,7 @@ REGISTER_OP("Cast")
     .Output("y: DstT")
     .Attr("SrcT: type")
     .Attr("DstT: type")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Cast x of type SrcT to y of DstT.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("_HostCast")
     .Input("x: SrcT")
@@ -184,295 +133,111 @@ _HostCast requires its input and produces its output in host memory.
 REGISTER_OP("Abs")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, float, double, int32, int64}")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Computes the absolute value of a tensor.
-
-Given a tensor `x`, this operation returns a tensor containing the absolute
-value of each element in `x`. For example, if x is an input element and y is
-an output element, this operation computes \\(y = |x|\\).
-)doc");
+    .Attr("T: {half, bfloat16, float, double, int32, int64}")
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("ComplexAbs")
     .Input("x: T")
     .Output("y: Tout")
     .Attr("T: {complex64, complex128} = DT_COMPLEX64")
     .Attr("Tout: {float, double} = DT_FLOAT")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Computes the complex absolute value of a tensor.
-
-Given a tensor `x` of complex numbers, this operation returns a tensor of type
-`float` or `double` that is the absolute value of each element in `x`. All
-elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
-value is computed as \\( \sqrt{a^2 + b^2}\\).
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 // Declares cwise unary operations signature: 't -> 't
-#define UNARY()                                                              \
-  Input("x: T")                                                              \
-      .Output("y: T")                                                        \
-      .Attr("T: {half, float, double, int32, int64, complex64, complex128}") \
+#define UNARY()                                                          \
+  Input("x: T")                                                          \
+      .Output("y: T")                                                    \
+      .Attr(                                                             \
+          "T: {half, bfloat16, float, double, int32, int64, complex64, " \
+          "complex128}")                                                 \
       .SetShapeFn(shape_inference::UnchangedShape)
 
-#define UNARY_REAL()                    \
-  Input("x: T")                         \
-      .Output("y: T")                   \
-      .Attr("T: {half, float, double}") \
+#define UNARY_REAL()                              \
+  Input("x: T")                                   \
+      .Output("y: T")                             \
+      .Attr("T: {half, bfloat16, float, double}") \
       .SetShapeFn(shape_inference::UnchangedShape)
 
-#define UNARY_COMPLEX()                                        \
-  Input("x: T")                                                \
-      .Output("y: T")                                          \
-      .Attr("T: {half, float, double, complex64, complex128}") \
+#define UNARY_COMPLEX()                                                  \
+  Input("x: T")                                                          \
+      .Output("y: T")                                                    \
+      .Attr("T: {half, bfloat16, float, double, complex64, complex128}") \
       .SetShapeFn(shape_inference::UnchangedShape)
 
-#define UNARY_GRADIENT_COMPLEX()                               \
-  Input("y: T")                                                \
-      .Input("dy: T")                                          \
-      .Output("z: T")                                          \
-      .Attr("T: {half, float, double, complex64, complex128}") \
+#define UNARY_GRADIENT_COMPLEX()                                         \
+  Input("y: T")                                                          \
+      .Input("dy: T")                                                    \
+      .Output("z: T")                                                    \
+      .Attr("T: {half, bfloat16, float, double, complex64, complex128}") \
       .SetShapeFn(shape_inference::UnchangedShape)
 
-REGISTER_OP("Neg")
-    .UNARY()
-    .Doc(R"doc(
-Computes numerical negative value element-wise.
-I.e., \\(y = -x\\).
-)doc");
+REGISTER_OP("Neg").UNARY();
 
-REGISTER_OP("Inv")
-    .UNARY()
-    .Doc(R"doc(
-Computes the reciprocal of x element-wise.
-I.e., \\(y = 1 / x\\).
-)doc")
-    .Deprecated(17, "Use Reciprocal");
+REGISTER_OP("Inv").UNARY();
 
-REGISTER_OP("InvGrad")
-    .UNARY_GRADIENT_COMPLEX()
-    .Doc(R"doc(
-Computes the gradient for the inverse of `x` wrt its input.
+REGISTER_OP("InvGrad").UNARY_GRADIENT_COMPLEX();
 
-Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-is the corresponding input gradient.
-)doc")
-    .Deprecated(17, "Use ReciprocalGrad");
-
-REGISTER_OP("Reciprocal")
-    .UNARY()
-    .Doc(R"doc(
-Computes the reciprocal of x element-wise.
-I.e., \\(y = 1 / x\\).
-)doc");
-
-REGISTER_OP("ReciprocalGrad")
-    .UNARY_GRADIENT_COMPLEX()
-    .Doc(R"doc(
-Computes the gradient for the inverse of `x` wrt its input.
-
-Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-is the corresponding input gradient.
-)doc");
-
-REGISTER_OP("Square")
-    .UNARY()
-    .Doc(R"doc(
-Computes square of x element-wise.
-I.e., \\(y = x * x = x^2\\).
-)doc");
-
-REGISTER_OP("Sqrt")
-    .UNARY_COMPLEX()
-    .Doc(R"doc(
-Computes square root of x element-wise.
-I.e., \\(y = \sqrt{x} = x^{1/2}\\).
-)doc");
+REGISTER_OP("Reciprocal").UNARY();
 
-REGISTER_OP("SqrtGrad")
-    .UNARY_GRADIENT_COMPLEX()
-    .Doc(R"doc(
-Computes the gradient for the sqrt of `x` wrt its input.
+REGISTER_OP("ReciprocalGrad").UNARY_GRADIENT_COMPLEX();
 
-Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
-is the corresponding input gradient.
-)doc");
+REGISTER_OP("Square").UNARY();
 
-REGISTER_OP("Rsqrt")
-    .UNARY_COMPLEX()
-    .Doc(R"doc(
-Computes reciprocal of square root of x element-wise.
-I.e., \\(y = 1 / \sqrt{x}\\).
-)doc");
+REGISTER_OP("Sqrt").UNARY_COMPLEX();
 
-REGISTER_OP("Round")
-    .UNARY()
-    .Doc(R"doc(
-Rounds the values of a tensor to the nearest integer, element-wise.
+REGISTER_OP("SqrtGrad").UNARY_GRADIENT_COMPLEX();
 
-Rounds half to even.  Also known as bankers rounding. If you want to round
-according to the current system rounding mode use std::cint.
-)doc");
+REGISTER_OP("Rsqrt").UNARY_COMPLEX();
 
-REGISTER_OP("RsqrtGrad")
-    .UNARY_GRADIENT_COMPLEX()
-    .Doc(R"doc(
-Computes the gradient for the rsqrt of `x` wrt its input.
+REGISTER_OP("Round").UNARY();
 
-Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
-is the corresponding input gradient.
-)doc");
+REGISTER_OP("RsqrtGrad").UNARY_GRADIENT_COMPLEX();
 
-REGISTER_OP("Exp")
-    .UNARY_COMPLEX()
-    .Doc(R"doc(
-Computes exponential of x element-wise.  \\(y = e^x\\).
-)doc");
+REGISTER_OP("Exp").UNARY_COMPLEX();
 
-REGISTER_OP("Expm1")
-    .UNARY_COMPLEX()
-    .Doc(R"doc(
-Computes exponential of x - 1 element-wise.
-I.e., \\(y = (\exp x) - 1\\).
-)doc");
+REGISTER_OP("Expm1").UNARY_COMPLEX();
 
-REGISTER_OP("Log")
-    .UNARY_COMPLEX()
-    .Doc(R"doc(
-Computes natural logarithm of x element-wise.
-I.e., \\(y = \log_e x\\).
-)doc");
+REGISTER_OP("Log").UNARY_COMPLEX();
 
-REGISTER_OP("Log1p")
-    .UNARY_COMPLEX()
-    .Doc(R"doc(
-Computes natural logarithm of (1 + x) element-wise.
-I.e., \\(y = \log_e (1 + x)\\).
-)doc");
+REGISTER_OP("Log1p").UNARY_COMPLEX();
 
-REGISTER_OP("Sinh")
-    .UNARY_COMPLEX()
-    .Doc(R"doc(
-Computes hyperbolic sine of x element-wise.
-)doc");
+REGISTER_OP("Sinh").UNARY_COMPLEX();
 
-REGISTER_OP("Cosh")
-    .UNARY_COMPLEX()
-    .Doc(R"doc(
-Computes hyperbolic cosine of x element-wise.
-)doc");
+REGISTER_OP("Cosh").UNARY_COMPLEX();
 
-REGISTER_OP("Tanh")
-    .UNARY_COMPLEX()
-    .Doc(R"doc(
-Computes hyperbolic tangent of `x` element-wise.
-)doc");
+REGISTER_OP("Tanh").UNARY_COMPLEX();
 
-REGISTER_OP("Asinh")
-    .UNARY_COMPLEX()
-    .Doc(R"doc(
-Computes inverse hyperbolic sine of x element-wise.
-)doc");
+REGISTER_OP("Asinh").UNARY_COMPLEX();
 
-REGISTER_OP("Acosh")
-    .UNARY_COMPLEX()
-    .Doc(R"doc(
-Computes inverse hyperbolic cosine of x element-wise.
-)doc");
+REGISTER_OP("Acosh").UNARY_COMPLEX();
 
-REGISTER_OP("Atanh")
-    .UNARY_COMPLEX()
-    .Doc(R"doc(
-Computes inverse hyperbolic tangent of x element-wise.
-)doc");
+REGISTER_OP("Atanh").UNARY_COMPLEX();
 
-REGISTER_OP("TanhGrad")
-    .UNARY_GRADIENT_COMPLEX()
-    .Doc(R"doc(
-Computes the gradient for the tanh of `x` wrt its input.
+REGISTER_OP("TanhGrad").UNARY_GRADIENT_COMPLEX();
 
-Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
-is the corresponding input gradient.
-)doc");
+REGISTER_OP("Lgamma").UNARY_REAL();
 
-REGISTER_OP("Lgamma")
-    .UNARY_REAL()
-    .Doc(R"doc(
-Computes the log of the absolute value of `Gamma(x)` element-wise.
-)doc");
+REGISTER_OP("Digamma").UNARY_REAL();
 
-REGISTER_OP("Digamma")
-    .UNARY_REAL()
-    .Doc(R"doc(
-Computes Psi, the derivative of Lgamma (the log of the absolute value of
-`Gamma(x)`), element-wise.
-)doc");
+REGISTER_OP("Erf").UNARY_REAL();
 
-REGISTER_OP("Erf")
-    .UNARY_REAL()
-    .Doc(R"doc(
-Computes the Gauss error function of `x` element-wise.
-)doc");
+REGISTER_OP("Erfc").UNARY_REAL();
 
-REGISTER_OP("Erfc")
-    .UNARY_REAL()
-    .Doc(R"doc(
-Computes the complementary error function of `x` element-wise.
-)doc");
+REGISTER_OP("Sigmoid").UNARY_COMPLEX();
 
-REGISTER_OP("Sigmoid")
-    .UNARY_COMPLEX()
-    .Doc(R"doc(
-Computes sigmoid of `x` element-wise.
+REGISTER_OP("SigmoidGrad").UNARY_GRADIENT_COMPLEX();
 
-Specifically, `y = 1 / (1 + exp(-x))`.
-)doc");
+REGISTER_OP("Sin").UNARY_COMPLEX();
 
-REGISTER_OP("SigmoidGrad")
-    .UNARY_GRADIENT_COMPLEX()
-    .Doc(R"doc(
-Computes the gradient of the sigmoid of `x` wrt its input.
+REGISTER_OP("Cos").UNARY_COMPLEX();
 
-Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
-`dy` is the corresponding input gradient.
-)doc");
+REGISTER_OP("Tan").UNARY();
 
-REGISTER_OP("Sin")
-    .UNARY_COMPLEX()
-    .Doc(R"doc(
-Computes sin of x element-wise.
-)doc");
+REGISTER_OP("Asin").UNARY();
 
-REGISTER_OP("Cos")
-    .UNARY_COMPLEX()
-    .Doc(R"doc(
-Computes cos of x element-wise.
-)doc");
+REGISTER_OP("Acos").UNARY();
 
-REGISTER_OP("Tan")
-    .UNARY()
-    .Doc(R"doc(
-Computes tan of x element-wise.
-)doc");
-
-REGISTER_OP("Asin")
-    .UNARY()
-    .Doc(R"doc(
-Computes asin of x element-wise.
-)doc");
-
-REGISTER_OP("Acos")
-    .UNARY()
-    .Doc(R"doc(
-Computes acos of x element-wise.
-)doc");
-
-REGISTER_OP("Atan")
-    .UNARY()
-    .Doc(R"doc(
-Computes atan of x element-wise.
-)doc");
+REGISTER_OP("Atan").UNARY();
 
 #undef UNARY
 #undef UNARY_REAL
@@ -481,117 +246,67 @@ Computes atan of x element-wise.
 REGISTER_OP("IsNan")
     .Input("x: T")
     .Output("y: bool")
-    .Attr("T: {half, float, double}")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Returns which elements of x are NaN.
-
-@compatibility(numpy)
-Equivalent to np.isnan
-@end_compatibility
-)doc");
+    .Attr("T: {half, bfloat16, float, double}")
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("IsInf")
     .Input("x: T")
     .Output("y: bool")
-    .Attr("T: {half, float, double}")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Returns which elements of x are Inf.
-
-@compatibility(numpy)
-Equivalent to np.isinf
-@end_compatibility
-)doc");
+    .Attr("T: {half, bfloat16, float, double}")
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("IsFinite")
     .Input("x: T")
     .Output("y: bool")
-    .Attr("T: {half, float, double}")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Returns which elements of x are finite.
-
-@compatibility(numpy)
-Equivalent to np.isfinite
-@end_compatibility
-)doc");
+    .Attr("T: {half, bfloat16, float, double}")
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Sign")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, float, double, int32, int64, complex64, complex128}")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Returns an element-wise indication of the sign of a number.
-
-`y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
-
-For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
-)doc");
+    .Attr(
+        "T: {half, bfloat16, float, double, int32, int64, complex64, "
+        "complex128}")
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Floor")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, float, double}")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Returns element-wise largest integer not greater than x.
-)doc");
+    .Attr("T: {half, bfloat16, float, double}")
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Ceil")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, float, double}")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Returns element-wise smallest integer in not less than x.
-)doc");
+    .Attr("T: {half, bfloat16, float, double}")
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Rint")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {float, double}")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Returns element-wise integer closest to x.
-
-If the result is midway between two representable values,
-the even representable is chosen.
-For example:
-
-```
-rint(-1.5) ==> -2.0
-rint(0.5000001) ==> 1.0
-rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
-```
-)doc");
+    .Attr("T: {bfloat16, float, double}")
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 // Declares cwise binary operations signature: 't, 't -> 't.
 
-#define BINARY_MORE()                                                       \
-  Input("x: T").Input("y: T").Output("z: T").Attr(                          \
-      "T: {half, float, double, uint8, int8, uint16, int16, int32, int64, " \
-      "complex64, complex128}")
+#define BINARY_MORE()                                                          \
+  Input("x: T").Input("y: T").Output("z: T").Attr(                             \
+      "T: {half, bfloat16, float, double, uint8, int8, uint16, int16, int32, " \
+      "int64, complex64, complex128}")
 
-#define BINARY_FEWER()                             \
-  Input("x: T").Input("y: T").Output("z: T").Attr( \
-      "T: {half, float, double, int32, int64, complex64, complex128}")
+#define BINARY_FEWER()                                               \
+  Input("x: T").Input("y: T").Output("z: T").Attr(                   \
+      "T: {half, bfloat16, float, double, int32, int64, complex64, " \
+      "complex128}")
 
 REGISTER_OP("Add")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
     .Attr(
-        "T: {half, float, double, uint8, int8, int16, int32, int64, complex64, "
-        "complex128, string}")
-    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
-    .Doc(R"doc(
-Returns x + y element-wise.
-
-*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
+        "T: {half, bfloat16, float, double, uint8, int8, int16, int32, int64, "
+        "complex64, complex128, string}")
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 // TODO(rmlarsen): Add a Python wrapper that swiches non-string instances to
 // use AddV2 (b/68646025).
@@ -600,17 +315,11 @@ REGISTER_OP("AddV2")
     .Input("y: T")
     .Output("z: T")
     .Attr(
-        "T: {half, float, double, uint8, int8, int16, int32, int64, complex64, "
-        "complex128}")
+        "T: {half, bfloat16, float, double, uint8, int8, int16, int32, int64, "
+        "complex64, complex128}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .SetIsAggregate()
-    .SetIsCommutative()
-    .Doc(R"doc(
-Returns x + y element-wise.
-
-*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
+    .SetIsCommutative();
 
 REGISTER_OP("_MklAdd")
     .Input("x: T")
@@ -630,15 +339,8 @@ Returns x + y element-wise.
 [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
 )doc");
 
-REGISTER_OP("Sub")
-    .BINARY_MORE()
-    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
-    .Doc(R"doc(
-Returns x - y element-wise.
-
-*NOTE*: `Sub` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
+REGISTER_OP("Sub").BINARY_MORE().SetShapeFn(
+    shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("_MklSub")
     .BINARY_FEWER()
@@ -653,16 +355,8 @@ Returns x - y element-wise.
 [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
 )doc");
 
-REGISTER_OP("Mul")
-    .BINARY_MORE()
-    .SetIsCommutative()
-    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
-    .Doc(R"doc(
-Returns x * y element-wise.
-
-*NOTE*: `Mul` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
+REGISTER_OP("Mul").BINARY_MORE().SetIsCommutative().SetShapeFn(
+    shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("_MklMul")
     .BINARY_MORE()
@@ -678,63 +372,24 @@ Returns x * y element-wise.
 [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
 )doc");
 
-REGISTER_OP("Div")
-    .BINARY_MORE()
-    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
-    .Doc(R"doc(
-Returns x / y element-wise.
-
-*NOTE*: `Div` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
+REGISTER_OP("Div").BINARY_MORE().SetShapeFn(
+    shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("FloorDiv")
     .BINARY_MORE()
-    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
-    .Doc(R"doc(
-Returns x // y element-wise.
-
-*NOTE*: `FloorDiv` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("TruncateDiv")
     .BINARY_MORE()
-    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
-    .Doc(R"doc(
-Returns x / y element-wise for integer types.
-
-Truncation designates that negative numbers will round fractional quantities
-toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different
-than Python semantics. See `FloorDiv` for a division function that matches
-Python Semantics.
-
-*NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
-
-REGISTER_OP("RealDiv")
-    .BINARY_MORE()
-    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
-    .Doc(R"doc(
-Returns x / y element-wise for real types.
-
-If `x` and `y` are reals, this will return the floating-point division.
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
-*NOTE*: `Div` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
+REGISTER_OP("RealDiv").BINARY_MORE().SetShapeFn(
+    shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("SquaredDifference")
     .BINARY_FEWER()
     .SetIsCommutative()
-    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
-    .Doc(R"doc(
-Returns (x - y)(x - y) element-wise.
-
-*NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("_MklSquaredDifference")
     .BINARY_FEWER()
@@ -757,15 +412,9 @@ REGISTER_OP("Maximum")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {half, float, double, int32, int64}")
+    .Attr("T: {half, bfloat16, float, double, int32, int64}")
     .SetIsCommutative()
-    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
-    .Doc(R"doc(
-Returns the max of x and y (i.e. x > y ? x : y) element-wise.
-
-*NOTE*: `Maximum` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("_MklMaximum")
     .Input("x: T")
@@ -788,174 +437,74 @@ REGISTER_OP("Minimum")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {half, float, double, int32, int64}")
+    .Attr("T: {half, bfloat16, float, double, int32, int64}")
     .SetIsCommutative()
-    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
-    .Doc(R"doc(
-Returns the min of x and y (i.e. x < y ? x : y) element-wise.
-
-*NOTE*: `Minimum` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("Mod")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {int32, int64, float, double}")
-    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
-    .Doc(R"doc(
-Returns element-wise remainder of division. This emulates C semantics in that
-the result here is consistent with a truncating divide. E.g.
-`tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.
-
-*NOTE*: `Mod` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
+    .Attr("T: {int32, int64, bfloat16, float, double}")
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("FloorMod")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {int32, int64, float, double}")
-    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
-    .Doc(R"doc(
-Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
-true, this follows Python semantics in that the result here is consistent
-with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
-
-*NOTE*: `FloorMod` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
+    .Attr("T: {int32, int64, bfloat16, float, double}")
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("TruncateMod")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {int32, int64, float, double}")
-    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
-    .Doc(R"doc(
-Returns element-wise remainder of division. This emulates C semantics in that
-the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
-y + truncate_mod(x, y) = x`.
-
-*NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
+    .Attr("T: {int32, int64, bfloat16, float, double}")
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("Pow")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {half, float, double, int32, int64, complex64, complex128}")
-    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
-    .Doc(R"doc(
-Computes the power of one value to another.
-
-Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
-corresponding elements in `x` and `y`. For example:
-
-```
-# tensor 'x' is [[2, 2]], [3, 3]]
-# tensor 'y' is [[8, 16], [2, 3]]
-tf.pow(x, y) ==> [[256, 65536], [9, 27]]
-```
-)doc");
+    .Attr(
+        "T: {half, bfloat16, float, double, int32, int64, complex64, "
+        "complex128}")
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("Igammac")
     .Input("a: T")
     .Input("x: T")
     .Output("z: T")
     .Attr("T: {float, double}")
-    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
-    .Doc(R"doc(
-Compute the upper regularized incomplete Gamma function `Q(a, x)`.
-
-The upper regularized incomplete Gamma function is defined as:
-
-\\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
-
-where
-
-\\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
-
-is the upper incomplete Gama function.
-
-Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
-Gamma function.
-)doc");
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("Igamma")
     .Input("a: T")
     .Input("x: T")
     .Output("z: T")
     .Attr("T: {float, double}")
-    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
-    .Doc(R"doc(
-Compute the lower regularized incomplete Gamma function `Q(a, x)`.
-
-The lower regularized incomplete Gamma function is defined as:
-
-
-\\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
-
-where
-
-\\(gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\\)
-
-is the lower incomplete Gamma function.
-
-Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
-Gamma function.
-)doc");
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("Zeta")
     .Input("x: T")
     .Input("q: T")
     .Output("z: T")
     .Attr("T: {float, double}")
-    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
-    .Doc(R"doc(
-Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
-
-The Hurwitz zeta function is defined as:
-
-
-\\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
-
-)doc");
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("Polygamma")
     .Input("a: T")
     .Input("x: T")
     .Output("z: T")
     .Attr("T: {float, double}")
-    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
-    .Doc(R"doc(
-Compute the polygamma function \\(\psi^{(n)}(x)\\).
-
-The polygamma function is defined as:
-
-
-\\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
-
-where \\(\psi(x)\\) is the digamma function.
-)doc");
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("Atan2")
     .Input("y: T")
     .Input("x: T")
     .Output("z: T")
-    .Attr("T: {float, double}")
-    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
-    .Doc(R"doc(
-Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
-This is the angle \( \theta \in [-\pi, \pi] \) such that
-\[ x = r \cos(\theta) \]
-and
-\[ y = r \sin(\theta) \]
-where \(r = \sqrt(x^2 + y^2) \).
-)doc");
+    .Attr("T: {bfloat16, float, double}")
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("Betainc")
     .Input("a: T")
@@ -994,24 +543,7 @@ REGISTER_OP("Betainc")
 
       c->set_output(0, output);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
-
-The regularized incomplete beta integral is defined as:
-
-
-\\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
-
-where
-
-
-\\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
-
-
-is the incomplete beta function and \\(B(a, b)\\) is the *complete*
-beta function.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -1024,74 +556,32 @@ beta function.
       .Attr("T: realnumbertype") \
       .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
 
-REGISTER_OP("Less")
-    .COMPARISON()
-    .Doc(R"doc(
-Returns the truth value of (x < y) element-wise.
+REGISTER_OP("Less").COMPARISON();
 
-*NOTE*: `Less` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
+REGISTER_OP("LessEqual").COMPARISON();
 
-REGISTER_OP("LessEqual")
-    .COMPARISON()
-    .Doc(R"doc(
-Returns the truth value of (x <= y) element-wise.
+REGISTER_OP("Greater").COMPARISON();
 
-*NOTE*: `LessEqual` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
-
-REGISTER_OP("Greater")
-    .COMPARISON()
-    .Doc(R"doc(
-Returns the truth value of (x > y) element-wise.
-
-*NOTE*: `Greater` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
-
-REGISTER_OP("GreaterEqual")
-    .COMPARISON()
-    .Doc(R"doc(
-Returns the truth value of (x >= y) element-wise.
-
-*NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
+REGISTER_OP("GreaterEqual").COMPARISON();
 
 #undef COMPARISON
 
 // --------------------------------------------------------------------------
 
-#define EQUALITY_COMPARISON()                                           \
-  Input("x: T")                                                         \
-      .Input("y: T")                                                    \
-      .Output("z: bool")                                                \
-      .SetIsCommutative()                                               \
-      .Attr(                                                            \
-          "T: {half, float, double, uint8, int8, int16, int32, int64, " \
-          "complex64, "                                                 \
-          "quint8, qint8, qint32, string, bool, complex128}")           \
+#define EQUALITY_COMPARISON()                                              \
+  Input("x: T")                                                            \
+      .Input("y: T")                                                       \
+      .Output("z: bool")                                                   \
+      .SetIsCommutative()                                                  \
+      .Attr(                                                               \
+          "T: {half, bfloat16, float, double, uint8, int8, int16, int32, " \
+          "int64, complex64, quint8, qint8, qint32, string, bool, "        \
+          "complex128}")                                                   \
       .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
 
-REGISTER_OP("Equal")
-    .EQUALITY_COMPARISON()
-    .Doc(R"doc(
-Returns the truth value of (x == y) element-wise.
+REGISTER_OP("Equal").EQUALITY_COMPARISON();
 
-*NOTE*: `Equal` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
-
-REGISTER_OP("NotEqual")
-    .EQUALITY_COMPARISON()
-    .Doc(R"doc(
-Returns the truth value of (x != y) element-wise.
-
-*NOTE*: `NotEqual` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
+REGISTER_OP("NotEqual").EQUALITY_COMPARISON();
 
 #undef EQUALITY_COMPARISON
 
@@ -1102,20 +592,14 @@ REGISTER_OP("ApproximateEqual")
     .SetIsCommutative()
     .Attr("T: numbertype")
     .Attr("tolerance: float = 0.00001")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Returns the truth value of abs(x-y) < tolerance element-wise.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 // --------------------------------------------------------------------------
 
 REGISTER_OP("LogicalNot")
     .Input("x: bool")
     .Output("y: bool")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Returns the truth value of NOT x element-wise.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 #define BINARY_LOGICAL()  \
   Input("x: bool")        \
@@ -1124,23 +608,9 @@ Returns the truth value of NOT x element-wise.
       .SetIsCommutative() \
       .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
 
-REGISTER_OP("LogicalAnd")
-    .BINARY_LOGICAL()
-    .Doc(R"doc(
-Returns the truth value of x AND y element-wise.
-
-*NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
+REGISTER_OP("LogicalAnd").BINARY_LOGICAL();
 
-REGISTER_OP("LogicalOr")
-    .BINARY_LOGICAL()
-    .Doc(R"doc(
-Returns the truth value of x OR y element-wise.
-
-*NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
+REGISTER_OP("LogicalOr").BINARY_LOGICAL();
 
 #undef BINARY_LOGICAL
 
@@ -1233,55 +703,7 @@ REGISTER_OP("Select")
       c->set_output(0, data);
 
       return Status::OK();
-    })
-    .Doc(R"doc(
-Selects elements from `t` or `e`, depending on `condition`.
-
-The `t`, and `e` tensors must all have the same shape, and the
-output will also have that shape.
-
-The `condition` tensor must be a scalar if `t` and `e` are scalars.
-If `t` and `e` are vectors or higher rank, then `condition` must be either a
-scalar, a vector with size matching the first dimension of `t`, or must have
-the same shape as `t`.
-
-The `condition` tensor acts as a mask that chooses, based on the value at each
-element, whether the corresponding element / row in the output should be
-taken from `t` (if true) or `e` (if false).
-
-If `condition` is a vector and `t` and `e` are higher rank matrices, then
-it chooses which row (outer dimension) to copy from `t` and `e`.
-If `condition` has the same shape as `t` and `e`, then it chooses which
-element to copy from `t` and `e`.
-
-For example:
-
-```python
-# 'condition' tensor is [[True,  False]
-#                        [False, True]]
-# 't' is [[1, 2],
-#         [3, 4]]
-# 'e' is [[5, 6],
-#         [7, 8]]
-select(condition, t, e)  # => [[1, 6], [7, 4]]
-
-
-# 'condition' tensor is [True, False]
-# 't' is [[1, 2],
-#         [3, 4]]
-# 'e' is [[5, 6],
-#         [7, 8]]
-select(condition, t, e) ==> [[1, 2],
-                             [7, 8]]
-
-```
-
-t:= A `Tensor` which may have the same shape as `condition`.
-    If `condition` is rank 1, `t` may have higher rank,
-    but its first dimension must match the size of `condition`.
-e:= A `Tensor` with the same type and shape as `t`.
-output:= A `Tensor` with the same type and shape as `t` and `e`.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -1291,22 +713,8 @@ REGISTER_OP("MatMul")
     .Output("product: T")
     .Attr("transpose_a: bool = false")
     .Attr("transpose_b: bool = false")
-    .Attr("T: {half, float, double, int32, complex64, complex128}")
-    .SetShapeFn(shape_inference::MatMulShape)
-    .Doc(R"doc(
-Multiply the matrix "a" by the matrix "b".
-
-The inputs must be two-dimensional matrices and the inner dimension of
-"a" (after being transposed if transpose_a is true) must match the
-outer dimension of "b" (after being transposed if transposed_b is
-true).
-
-*Note*: The default kernel implementation for MatMul on GPUs uses
-cublas.
-
-transpose_a: If true, "a" is transposed before multiplication.
-transpose_b: If true, "b" is transposed before multiplication.
-)doc");
+    .Attr("T: {half, bfloat16, float, double, int32, complex64, complex128}")
+    .SetShapeFn(shape_inference::MatMulShape);
 
 REGISTER_OP("SparseMatMul")
     .Input("a: Ta")
@@ -1318,18 +726,7 @@ REGISTER_OP("SparseMatMul")
     .Attr("b_is_sparse: bool = false")
     .Attr("Ta: {float, bfloat16} = DT_FLOAT")
     .Attr("Tb: {float, bfloat16} = DT_FLOAT")
-    .SetShapeFn(shape_inference::MatMulShape)
-    .Doc(R"doc(
-Multiply matrix "a" by matrix "b".
-
-The inputs must be two-dimensional matrices and the inner dimension of "a" must
-match the outer dimension of "b". This op is optimized for the case where at
-least one of "a" or "b" is sparse. The breakeven for using this versus a dense
-matrix multiply on one platform was 30% zero values in the sparse matrix.
-
-The gradient computation of this operation will only take advantage of sparsity
-in the input gradient when that gradient comes from a Relu.
-)doc");
+    .SetShapeFn(shape_inference::MatMulShape);
 
 // --------------------------------------------------------------------------
 
@@ -1342,21 +739,7 @@ REGISTER_OP("Sum")
     .Attr("keep_dims: bool = false")
     .Attr("T: numbertype")
     .Attr("Tidx: {int32, int64} = DT_INT32")
-    .SetShapeFn(shape_inference::ReductionShape)
-    .Doc(R"doc(
-Computes the sum of elements across dimensions of a tensor.
-
-Reduces `input` along the dimensions given in `reduction_indices`. Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-retained with length 1.
-
-input: The tensor to reduce.
-reduction_indices: The dimensions to reduce. Must be in the range
-  `[-rank(input), rank(input))`.
-keep_dims: If true, retain reduced dimensions with length 1.
-output: The reduced tensor.
-)doc");
+    .SetShapeFn(shape_inference::ReductionShape);
 
 REGISTER_OP("Mean")
     .Input("input: T")
@@ -1365,21 +748,7 @@ REGISTER_OP("Mean")
     .Attr("keep_dims: bool = false")
     .Attr("T: numbertype")
     .Attr("Tidx: {int32, int64} = DT_INT32")
-    .SetShapeFn(shape_inference::ReductionShape)
-    .Doc(R"doc(
-Computes the mean of elements across dimensions of a tensor.
-
-Reduces `input` along the dimensions given in `reduction_indices`. Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-retained with length 1.
-
-input: The tensor to reduce.
-reduction_indices: The dimensions to reduce. Must be in the range
-  `[-rank(input), rank(input))`.
-keep_dims: If true, retain reduced dimensions with length 1.
-output: The reduced tensor.
-)doc");
+    .SetShapeFn(shape_inference::ReductionShape);
 
 REGISTER_OP("Prod")
     .Input("input: T")
@@ -1388,21 +757,7 @@ REGISTER_OP("Prod")
     .Attr("keep_dims: bool = false")
     .Attr("T: numbertype")
     .Attr("Tidx: {int32, int64} = DT_INT32")
-    .SetShapeFn(shape_inference::ReductionShape)
-    .Doc(R"doc(
-Computes the product of elements across dimensions of a tensor.
-
-Reduces `input` along the dimensions given in `reduction_indices`. Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-retained with length 1.
-
-input: The tensor to reduce.
-reduction_indices: The dimensions to reduce. Must be in the range
-  `[-rank(input), rank(input))`.
-keep_dims: If true, retain reduced dimensions with length 1.
-output: The reduced tensor.
-)doc");
+    .SetShapeFn(shape_inference::ReductionShape);
 
 REGISTER_OP("Min")
     .Input("input: T")
@@ -1411,21 +766,7 @@ REGISTER_OP("Min")
     .Attr("keep_dims: bool = false")
     .Attr("T: numbertype")
     .Attr("Tidx: {int32, int64} = DT_INT32")
-    .SetShapeFn(shape_inference::ReductionShape)
-    .Doc(R"doc(
-Computes the minimum of elements across dimensions of a tensor.
-
-Reduces `input` along the dimensions given in `reduction_indices`. Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-retained with length 1.
-
-input: The tensor to reduce.
-reduction_indices: The dimensions to reduce. Must be in the range
-  `[-rank(input), rank(input))`.
-keep_dims: If true, retain reduced dimensions with length 1.
-output: The reduced tensor.
-)doc");
+    .SetShapeFn(shape_inference::ReductionShape);
 
 REGISTER_OP("Max")
     .Input("input: T")
@@ -1434,21 +775,7 @@ REGISTER_OP("Max")
     .Attr("keep_dims: bool = false")
     .Attr("T: numbertype")
     .Attr("Tidx: {int32, int64} = DT_INT32")
-    .SetShapeFn(shape_inference::ReductionShape)
-    .Doc(R"doc(
-Computes the maximum of elements across dimensions of a tensor.
-
-Reduces `input` along the dimensions given in `reduction_indices`. Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-retained with length 1.
-
-input: The tensor to reduce.
-reduction_indices: The dimensions to reduce. Must be in the range
-  `[-rank(input), rank(input))`.
-keep_dims: If true, retain reduced dimensions with length 1.
-output: The reduced tensor.
-)doc");
+    .SetShapeFn(shape_inference::ReductionShape);
 
 namespace {
 
@@ -1516,16 +843,7 @@ REGISTER_OP("ArgMax")
     .Attr("T: numbertype")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .Attr("output_type: {int32, int64} = DT_INT64")
-    .SetShapeFn(ArgOpShape)
-    .Doc(R"doc(
-Returns the index with the largest value across dimensions of a tensor.
-
-Note that in case of ties the identity of the return value is not guaranteed.
-
-dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-  Describes which dimension of the input Tensor to reduce across. For vectors,
-  use dimension = 0.
-)doc");
+    .SetShapeFn(ArgOpShape);
 
 REGISTER_OP("ArgMin")
     .Input("input: T")
@@ -1534,16 +852,7 @@ REGISTER_OP("ArgMin")
     .Attr("T: numbertype")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .Attr("output_type: {int32, int64} = DT_INT64")
-    .SetShapeFn(ArgOpShape)
-    .Doc(R"doc(
-Returns the index with the smallest value across dimensions of a tensor.
-
-Note that in case of ties the identity of the return value is not guaranteed.
-
-dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-  Describes which dimension of the input Tensor to reduce across. For vectors,
-  use dimension = 0.
-)doc");
+    .SetShapeFn(ArgOpShape);
 
 namespace {
 
@@ -1605,22 +914,61 @@ Status SparseSegmentReductionGradShapeFn(InferenceContext* c) {
   TF_RETURN_IF_ERROR(c->Subshape(data_shape, 1, &subshape));
 
   const Tensor* dim0 = c->input_tensor(3);
-  ShapeHandle dim0_shape;
+  ShapeHandle dim0_shape;
+  if (dim0 == nullptr) {
+    // We don't have the value at inference time, so the output
+    // shape is unknown.
+    dim0_shape = c->Vector(InferenceContext::kUnknownDim);
+  } else {
+    auto dim0_value = dim0->scalar<int32>()();
+    if (dim0_value < 0) {
+      return errors::InvalidArgument(
+          "Cannot specify a negative value for output_dim0");
+    }
+    dim0_shape = c->Vector(dim0_value);
+  }
+
+  ShapeHandle out;
+  TF_RETURN_IF_ERROR(c->Concatenate(dim0_shape, subshape, &out));
+  c->set_output(0, out);
+  return Status::OK();
+}
+
+Status SparseSegmentReductionWithNumSegmentsShapeFn(InferenceContext* c) {
+  ShapeHandle data_shape;
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &data_shape));
+
+  ShapeHandle indices_shape;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &indices_shape));
+
+  ShapeHandle segment_ids_shape;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &segment_ids_shape));
+
+  ShapeHandle num_segments_shape;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &num_segments_shape));
+
+  // indices and segment_ids should merge cleanly.
+  ShapeHandle unused;
+  TF_RETURN_IF_ERROR(c->Merge(indices_shape, segment_ids_shape, &unused));
+
+  ShapeHandle subshape;
+  TF_RETURN_IF_ERROR(c->Subshape(data_shape, 1, &subshape));
+
+  ShapeHandle out;
+  const Tensor* dim0 = c->input_tensor(3);
   if (dim0 == nullptr) {
     // We don't have the value at inference time, so the output
     // shape is unknown.
-    dim0_shape = c->Vector(InferenceContext::kUnknownDim);
+    TF_RETURN_IF_ERROR(c->Concatenate(c->Vector(InferenceContext::kUnknownDim),
+                                      subshape, &out));
   } else {
     auto dim0_value = dim0->scalar<int32>()();
     if (dim0_value < 0) {
       return errors::InvalidArgument(
-          "Cannot specify a negative value for output_dim0");
+          "Cannot specify a negative value for num_segments");
     }
-    dim0_shape = c->Vector(dim0_value);
+    TF_RETURN_IF_ERROR(c->Concatenate(c->Vector(dim0_value), subshape, &out));
   }
-
-  ShapeHandle out;
-  TF_RETURN_IF_ERROR(c->Concatenate(dim0_shape, subshape, &out));
   c->set_output(0, out);
   return Status::OK();
 }
@@ -1663,29 +1011,7 @@ REGISTER_OP("SegmentSum")
     .Output("output: T")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32,int64}")
-    .SetShapeFn(SegmentReductionShapeFn)
-    .Doc(R"doc(
-Computes the sum along segments of a tensor.
-
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
-
-Computes a tensor such that
-\\(output_i = \sum_j data_j\\) where sum is over `j` such
-that `segment_ids[j] == i`.
-
-If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
-</div>
-
-segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-first dimension.  Values should be sorted and can be repeated.
-
-output: Has same shape as data, except for dimension 0 which
-  has size `k`, the number of segments.
-)doc");
+    .SetShapeFn(SegmentReductionShapeFn);
 
 REGISTER_OP("SegmentMean")
     .Input("data: T")
@@ -1693,30 +1019,7 @@ REGISTER_OP("SegmentMean")
     .Output("output: T")
     .Attr("T: realnumbertype")
     .Attr("Tindices: {int32,int64}")
-    .SetShapeFn(SegmentReductionShapeFn)
-    .Doc(R"doc(
-Computes the mean along segments of a tensor.
-
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
-
-Computes a tensor such that
-\\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
-over `j` such that `segment_ids[j] == i` and `N` is the total number of
-values summed.
-
-If the mean is empty for a given segment ID `i`, `output[i] = 0`.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
-</div>
-
-segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-first dimension.  Values should be sorted and can be repeated.
-
-output: Has same shape as data, except for dimension 0 which
-  has size `k`, the number of segments.
-)doc");
+    .SetShapeFn(SegmentReductionShapeFn);
 
 REGISTER_OP("SegmentProd")
     .Input("data: T")
@@ -1724,29 +1027,7 @@ REGISTER_OP("SegmentProd")
     .Output("output: T")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32,int64}")
-    .SetShapeFn(SegmentReductionShapeFn)
-    .Doc(R"doc(
-Computes the product along segments of a tensor.
-
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
-
-Computes a tensor such that
-\\(output_i = \prod_j data_j\\) where the product is over `j` such
-that `segment_ids[j] == i`.
-
-If the product is empty for a given segment ID `i`, `output[i] = 1`.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
-</div>
-
-segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-first dimension.  Values should be sorted and can be repeated.
-
-output: Has same shape as data, except for dimension 0 which
-  has size `k`, the number of segments.
-)doc");
+    .SetShapeFn(SegmentReductionShapeFn);
 
 REGISTER_OP("SegmentMin")
     .Input("data: T")
@@ -1754,29 +1035,7 @@ REGISTER_OP("SegmentMin")
     .Output("output: T")
     .Attr("T: realnumbertype")
     .Attr("Tindices: {int32,int64}")
-    .SetShapeFn(SegmentReductionShapeFn)
-    .Doc(R"doc(
-Computes the minimum along segments of a tensor.
-
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
-
-Computes a tensor such that
-\\(output_i = \min_j(data_j)\\) where `min` is over `j` such
-that `segment_ids[j] == i`.
-
-If the min is empty for a given segment ID `i`, `output[i] = 0`.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
-</div>
-
-segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-first dimension.  Values should be sorted and can be repeated.
-
-output: Has same shape as data, except for dimension 0 which
-  has size `k`, the number of segments.
-)doc");
+    .SetShapeFn(SegmentReductionShapeFn);
 
 REGISTER_OP("SegmentMax")
     .Input("data: T")
@@ -1784,103 +1043,48 @@ REGISTER_OP("SegmentMax")
     .Output("output: T")
     .Attr("T: realnumbertype")
     .Attr("Tindices: {int32,int64}")
-    .SetShapeFn(SegmentReductionShapeFn)
-    .Doc(R"doc(
-Computes the maximum along segments of a tensor.
-
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
-
-Computes a tensor such that
-\\(output_i = \max_j(data_j)\\) where `max` is over `j` such
-that `segment_ids[j] == i`.
-
-If the max is empty for a given segment ID `i`, `output[i] = 0`.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
-</div>
-
-segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-first dimension.  Values should be sorted and can be repeated.
-
-output: Has same shape as data, except for dimension 0 which
-  has size `k`, the number of segments.
-)doc");
+    .SetShapeFn(SegmentReductionShapeFn);
 
 REGISTER_OP("UnsortedSegmentSum")
     .Input("data: T")
     .Input("segment_ids: Tindices")
-    .Input("num_segments: int32")
+    .Input("num_segments: Tnumsegments")
     .Output("output: T")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32,int64}")
-    .SetShapeFn(UnsortedSegmentReductionShapeFn)
-    .Doc(R"doc(
-Computes the sum along segments of a tensor.
-
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
-
-Computes a tensor such that
-`(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such
-that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
-need not be sorted and need not cover all values in the full
-range of valid values.
-
-If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-If the given segment ID `i` is negative, the value is dropped and will not be
-added to the sum of the segment.
-
-`num_segments` should equal the number of distinct segment IDs.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
-</div>
-
-segment_ids: A tensor whose shape is a prefix of `data.shape`.
-
-output: Has same shape as data, except for the first `segment_ids.rank`
-  dimensions, which are replaced with a single dimension which has size
-  `num_segments`.
-
-)doc");
+    .Attr("Tnumsegments: {int32,int64} = DT_INT32")
+    .SetShapeFn(UnsortedSegmentReductionShapeFn);
 
 REGISTER_OP("UnsortedSegmentMax")
     .Input("data: T")
     .Input("segment_ids: Tindices")
-    .Input("num_segments: int32")
+    .Input("num_segments: Tnumsegments")
     .Output("output: T")
     .Attr("T: realnumbertype")
     .Attr("Tindices: {int32,int64}")
-    .SetShapeFn(UnsortedSegmentReductionShapeFn)
-    .Doc(R"doc(
-Computes the Max along segments of a tensor.
-
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
-
-This operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-Instead of computing the sum over segments, it computes the maximum
-such that:
-
-\\(output_i = \max_j data_j\\) where max is over `j` such
-that `segment_ids[j] == i`.
-
-If the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,
- `output[i] = numeric_limits<T>::min()`.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
-</div>
+    .Attr("Tnumsegments: {int32,int64} = DT_INT32")
+    .SetShapeFn(UnsortedSegmentReductionShapeFn);
 
-segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-first dimension.
+REGISTER_OP("UnsortedSegmentMin")
+    .Input("data: T")
+    .Input("segment_ids: Tindices")
+    .Input("num_segments: Tnumsegments")
+    .Output("output: T")
+    .Attr("T: realnumbertype")
+    .Attr("Tindices: {int32,int64}")
+    .Attr("Tnumsegments: {int32,int64} = DT_INT32")
+    .SetShapeFn(UnsortedSegmentReductionShapeFn);
 
-output: Has same shape as data, except for dimension 0 which
-has size `num_segments`.
+REGISTER_OP("UnsortedSegmentProd")
+    .Input("data: T")
+    .Input("segment_ids: Tindices")
+    .Input("num_segments: Tnumsegments")
+    .Output("output: T")
+    .Attr("T: realnumbertype")
+    .Attr("Tindices: {int32,int64}")
+    .Attr("Tnumsegments: {int32,int64} = DT_INT32")
+    .SetShapeFn(UnsortedSegmentReductionShapeFn);
 
-)doc");
 REGISTER_OP("SparseSegmentSum")
     .Input("data: T")
     .Input("indices: Tidx")
@@ -1888,46 +1092,18 @@ REGISTER_OP("SparseSegmentSum")
     .Output("output: T")
     .Attr("T: realnumbertype")
     .Attr("Tidx: {int32, int64} = DT_INT32")
-    .SetShapeFn(SparseSegmentReductionShapeFn)
-    .Doc(R"doc(
-Computes the sum along sparse segments of a tensor.
-
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
-
-Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
-dimension, selecting a subset of dimension 0, specified by `indices`.
-
-For example:
-
-```python
-c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+    .SetShapeFn(SparseSegmentReductionShapeFn);
 
-# Select two rows, one segment.
-tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
-# => [[0 0 0 0]]
-
-# Select two rows, two segment.
-tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
-# => [[ 1  2  3  4]
-#     [-1 -2 -3 -4]]
-
-# Select all rows, two segments.
-tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
-# => [[0 0 0 0]
-#     [5 6 7 8]]
-
-# Which is equivalent to:
-tf.segment_sum(c, tf.constant([0, 0, 1]))
-```
-
-indices: A 1-D tensor. Has same rank as `segment_ids`.
-
-segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-
-output: Has same shape as data, except for dimension 0 which
-  has size `k`, the number of segments.
-)doc");
+REGISTER_OP("SparseSegmentSumWithNumSegments")
+    .Input("data: T")
+    .Input("indices: Tidx")
+    .Input("segment_ids: int32")
+    .Input("num_segments: Tnumsegments")
+    .Output("output: T")
+    .Attr("T: realnumbertype")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .Attr("Tnumsegments: {int32,int64} = DT_INT32")
+    .SetShapeFn(SparseSegmentReductionWithNumSegmentsShapeFn);
 
 REGISTER_OP("SparseSegmentMean")
     .Input("data: T")
@@ -1936,24 +1112,18 @@ REGISTER_OP("SparseSegmentMean")
     .Output("output: T")
     .Attr("T: {float, double}")
     .Attr("Tidx: {int32, int64} = DT_INT32")
-    .SetShapeFn(SparseSegmentReductionShapeFn)
-    .Doc(R"doc(
-Computes the mean along sparse segments of a tensor.
-
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
-
-Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
-dimension, selecting a subset of dimension 0, specified by `indices`.
+    .SetShapeFn(SparseSegmentReductionShapeFn);
 
-indices: A 1-D tensor. Has same rank as `segment_ids`.
-
-segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-
-output: Has same shape as data, except for dimension 0 which
-  has size `k`, the number of segments.
-
-)doc");
+REGISTER_OP("SparseSegmentMeanWithNumSegments")
+    .Input("data: T")
+    .Input("indices: Tidx")
+    .Input("segment_ids: int32")
+    .Input("num_segments: Tnumsegments")
+    .Output("output: T")
+    .Attr("T: {float, double}")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .Attr("Tnumsegments: {int32,int64} = DT_INT32")
+    .SetShapeFn(SparseSegmentReductionWithNumSegmentsShapeFn);
 
 REGISTER_OP("SparseSegmentMeanGrad")
     .Input("grad: T")
@@ -1963,18 +1133,7 @@ REGISTER_OP("SparseSegmentMeanGrad")
     .Output("output: T")
     .Attr("T: {float, double}")
     .Attr("Tidx: {int32, int64} = DT_INT32")
-    .SetShapeFn(SparseSegmentReductionGradShapeFn)
-    .Doc(R"doc(
-Computes gradients for SparseSegmentMean.
-
-Returns tensor "output" with same shape as grad, except for dimension 0 whose
-value is output_dim0.
-
-grad: gradient propagated to the SparseSegmentMean op.
-indices: indices passed to the corresponding SparseSegmentMean op.
-segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
-output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
-)doc");
+    .SetShapeFn(SparseSegmentReductionGradShapeFn);
 
 REGISTER_OP("SparseSegmentSqrtN")
     .Input("data: T")
@@ -1983,23 +1142,18 @@ REGISTER_OP("SparseSegmentSqrtN")
     .Output("output: T")
     .Attr("T: {float, double}")
     .Attr("Tidx: {int32, int64} = DT_INT32")
-    .SetShapeFn(SparseSegmentReductionShapeFn)
-    .Doc(R"doc(
-Computes the sum along sparse segments of a tensor divided by the sqrt of N.
-
-N is the size of the segment being reduced.
+    .SetShapeFn(SparseSegmentReductionShapeFn);
 
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
-
-indices: A 1-D tensor. Has same rank as `segment_ids`.
-
-segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-
-output: Has same shape as data, except for dimension 0 which
-  has size `k`, the number of segments.
-
-)doc");
+REGISTER_OP("SparseSegmentSqrtNWithNumSegments")
+    .Input("data: T")
+    .Input("indices: Tidx")
+    .Input("segment_ids: int32")
+    .Input("num_segments: Tnumsegments")
+    .Output("output: T")
+    .Attr("T: {float, double}")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .Attr("Tnumsegments: {int32,int64} = DT_INT32")
+    .SetShapeFn(SparseSegmentReductionWithNumSegmentsShapeFn);
 
 REGISTER_OP("SparseSegmentSqrtNGrad")
     .Input("grad: T")
@@ -2009,18 +1163,7 @@ REGISTER_OP("SparseSegmentSqrtNGrad")
     .Output("output: T")
     .Attr("T: {float, double}")
     .Attr("Tidx: {int32, int64} = DT_INT32")
-    .SetShapeFn(SparseSegmentReductionGradShapeFn)
-    .Doc(R"doc(
-Computes gradients for SparseSegmentSqrtN.
-
-Returns tensor "output" with same shape as grad, except for dimension 0 whose
-value is output_dim0.
-
-grad: gradient propagated to the SparseSegmentSqrtN op.
-indices: indices passed to the corresponding SparseSegmentSqrtN op.
-segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
-output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
-)doc");
+    .SetShapeFn(SparseSegmentReductionGradShapeFn);
 
 REGISTER_OP("All")
     .Input("input: bool")
@@ -2028,21 +1171,7 @@ REGISTER_OP("All")
     .Output("output: bool")
     .Attr("keep_dims: bool = false")
     .Attr("Tidx: {int32, int64} = DT_INT32")
-    .SetShapeFn(shape_inference::ReductionShape)
-    .Doc(R"doc(
-Computes the "logical and" of elements across dimensions of a tensor.
-
-Reduces `input` along the dimensions given in `reduction_indices`. Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-retained with length 1.
-
-input: The tensor to reduce.
-reduction_indices: The dimensions to reduce. Must be in the range
-  `[-rank(input), rank(input))`.
-keep_dims: If true, retain reduced dimensions with length 1.
-output: The reduced tensor.
-)doc");
+    .SetShapeFn(shape_inference::ReductionShape);
 
 REGISTER_OP("Any")
     .Input("input: bool")
@@ -2050,21 +1179,7 @@ REGISTER_OP("Any")
     .Attr("keep_dims: bool = false")
     .Output("output: bool")
     .Attr("Tidx: {int32, int64} = DT_INT32")
-    .SetShapeFn(shape_inference::ReductionShape)
-    .Doc(R"doc(
-Computes the "logical or" of elements across dimensions of a tensor.
-
-Reduces `input` along the dimensions given in `reduction_indices`. Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-retained with length 1.
-
-input: The tensor to reduce.
-reduction_indices: The dimensions to reduce. Must be in the range
-  `[-rank(input), rank(input))`.
-keep_dims: If true, retain reduced dimensions with length 1.
-output: The reduced tensor.
-)doc");
+    .SetShapeFn(shape_inference::ReductionShape);
 
 // --------------------------------------------------------------------------
 
@@ -2077,12 +1192,12 @@ Status RangeSize(const Tensor* start_t, const Tensor* limit_t,
   T limit = limit_t->scalar<T>()();
   T delta = delta_t->scalar<T>()();
   if (start > limit && delta > 0) {
-    return errors::InvalidArgument("Requires start <= limit when delta > 0: ",
-                                   start, "/", limit);
+    return errors::InvalidArgument(
+        "Requires start <= limit when delta > 0: ", start, "/", limit);
   }
   if (start < limit && delta < 0) {
-    return errors::InvalidArgument("Requires start >= limit when delta < 0: ",
-                                   start, "/", limit);
+    return errors::InvalidArgument(
+        "Requires start >= limit when delta < 0: ", start, "/", limit);
   }
   if (delta == 0) {
     return errors::InvalidArgument("Requires delta != 0");
@@ -2103,7 +1218,7 @@ REGISTER_OP("Range")
     .Input("limit: Tidx")
     .Input("delta: Tidx")
     .Output("output: Tidx")
-    .Attr("Tidx: {float, double, int32, int64} = DT_INT32")
+    .Attr("Tidx: {bfloat16, float, double, int32, int64} = DT_INT32")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_WITH_CONTEXT_IF_ERROR(c->WithRank(c->input(0), 0, &unused),
@@ -2131,34 +1246,14 @@ REGISTER_OP("Range")
         return RangeSize<double>(start_t, limit_t, delta_t, c);
       }
       return Status::OK();
-    })
-    .Doc(R"doc(
-Creates a sequence of numbers.
-
-This operation creates a sequence of numbers that begins at `start` and
-extends by increments of `delta` up to but not including `limit`.
-
-For example:
-
-```
-# 'start' is 3
-# 'limit' is 18
-# 'delta' is 3
-tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
-```
-
-start: 0-D (scalar). First entry in the sequence.
-limit: 0-D (scalar). Upper limit of sequence, exclusive.
-delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
-output: 1-D.
-)doc");
+    });
 
 REGISTER_OP("LinSpace")
     .Input("start: T")
     .Input("stop: T")
     .Input("num: Tidx")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
@@ -2183,25 +1278,7 @@ REGISTER_OP("LinSpace")
       if (num <= 0) return errors::InvalidArgument("Requires num > 0: ", num);
       c->set_output(0, c->Vector(num));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Generates values in an interval.
-
-A sequence of `num` evenly-spaced values are generated beginning at `start`.
-If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
-so that the last one is exactly `stop`.
-
-For example:
-
-```
-tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
-```
-
-start: First entry in the range.
-stop: Last entry in the range.
-num: Number of values to generate.
-output: 1-D. The generated values.
-)doc");
+    });
 
 REGISTER_OP("Complex")
     .Input("real: T")
@@ -2209,120 +1286,34 @@ REGISTER_OP("Complex")
     .Output("out: Tout")
     .Attr("T: {float, double} = DT_FLOAT")
     .Attr("Tout: {complex64, complex128} = DT_COMPLEX64")
-    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
-    .Doc(R"doc(
-Converts two real numbers to a complex number.
-
-Given a tensor `real` representing the real part of a complex number, and a
-tensor `imag` representing the imaginary part of a complex number, this
-operation returns complex numbers elementwise of the form \\(a + bj\\), where
-*a* represents the `real` part and *b* represents the `imag` part.
-
-The input tensors `real` and `imag` must have the same shape.
-
-For example:
-
-```
-# tensor 'real' is [2.25, 3.25]
-# tensor `imag` is [4.75, 5.75]
-tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
-```
-)doc");
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("Real")
     .Input("input: T")
     .Output("output: Tout")
     .Attr("T: {complex64, complex128} = DT_COMPLEX64")
     .Attr("Tout: {float, double} = DT_FLOAT")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Returns the real part of a complex number.
-
-Given a tensor `input` of complex numbers, this operation returns a tensor of
-type `float` that is the real part of each element in `input`. All elements in
-`input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
- part returned by this operation and *b* is the imaginary part.
-
-For example:
-
-```
-# tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-tf.real(input) ==> [-2.25, 3.25]
-```
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Imag")
     .Input("input: T")
     .Output("output: Tout")
     .Attr("T: {complex64, complex128} = DT_COMPLEX64")
     .Attr("Tout: {float, double} = DT_FLOAT")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Returns the imaginary part of a complex number.
-
-Given a tensor `input` of complex numbers, this operation returns a tensor of
-type `float` that is the imaginary part of each element in `input`. All
-elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
-is the real part and *b* is the imaginary part returned by this operation.
-
-For example:
-
-```
-# tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-tf.imag(input) ==> [4.75, 5.75]
-```
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Angle")
     .Input("input: T")
     .Output("output: Tout")
     .Attr("T: {complex64, complex128} = DT_COMPLEX64")
     .Attr("Tout: {float, double} = DT_FLOAT")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Returns the argument of a complex number.
-
-Given a tensor `input` of complex numbers, this operation returns a tensor of
-type `float` that is the argument of each element in `input`. All elements in
-`input` must be complex numbers of the form \\(a + bj\\), where *a*
-is the real part and *b* is the imaginary part.
-
-The argument returned by this operation is of the form \\(atan2(b, a)\\).
-
-For example:
-
-```
-# tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-tf.angle(input) ==> [2.0132, 1.056]
-```
-
-@compatibility(numpy)
-Equivalent to np.angle.
-@end_compatibility
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Conj")
     .Input("input: T")
     .Output("output: T")
     .Attr("T: {complex64, complex128, variant} = DT_COMPLEX64")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Returns the complex conjugate of a complex number.
-
-Given a tensor `input` of complex numbers, this operation returns a tensor of
-complex numbers that are the complex conjugate of each element in `input`. The
-complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
-real part and *b* is the imaginary part.
-
-The complex conjugate returned by this operation is of the form \\(a - bj\\).
-
-For example:
-
-```
-# tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
-```
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 // --------------------------------------------------------------------------
 
@@ -2349,18 +1340,7 @@ REGISTER_OP("Cross")
       }
       c->set_output(0, a_shape);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Compute the pairwise cross product.
-
-`a` and `b` must be the same shape; they can either be simple 3-element vectors,
-or any shape where the innermost dimension is 3. In the latter case, each pair
-of corresponding 3-element vectors is cross-multiplied independently.
-
-a: A tensor containing 3-element vectors.
-b: Another tensor, of same type and shape as `a`.
-product: Pairwise cross product of the vectors in `a` and `b`.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -2381,33 +1361,7 @@ REGISTER_OP("HistogramFixedWidth")
         c->set_output(0, c->UnknownShapeOfRank(1));
       }
       return Status::OK();
-    })
-    .Doc(R"doc(
-Return histogram of values.
-
-Given the tensor `values`, this operation returns a rank 1 histogram counting
-the number of entries in `values` that fall into every bin.  The bins are
-equal width and determined by the arguments `value_range` and `nbins`.
-
-```python
-# Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
-nbins = 5
-value_range = [0.0, 5.0]
-new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
-
-with tf.get_default_session() as sess:
-  hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
-  variables.global_variables_initializer().run()
-  sess.run(hist) => [2, 1, 1, 0, 2]
-```
-
-values:  Numeric `Tensor`.
-value_range:  Shape [2] `Tensor` of same `dtype` as `values`.
-  values <= value_range[0] will be mapped to hist[0],
-  values >= value_range[1] will be mapped to hist[-1].
-nbins:  Scalar `int32 Tensor`.  Number of histogram bins.
-out: A 1-D `Tensor` holding histogram of values.
-)doc");
+    });
 
 REGISTER_OP("Bincount")
     .Input("arr: int32")
@@ -2418,27 +1372,7 @@ REGISTER_OP("Bincount")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->UnknownShapeOfRank(1));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Counts the number of occurrences of each value in an integer array.
-
-Outputs a vector with length `size` and the same dtype as `weights`. If
-`weights` are empty, then index `i` stores the number of times the value `i` is
-counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
-the value in `weights` at each index where the corresponding value in `arr` is
-`i`.
-
-Values in `arr` outside of the range [0, size) are ignored.
-
-arr: int32 `Tensor`.
-size: non-negative int32 scalar `Tensor`.
-weights: is an int32, int64, float32, or float64 `Tensor` with the same
-    shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
-    equal to 1.
-
-bins: 1D `Tensor` with length equal to `size`. The counts or summed weights for
-    each value in the range [0, size).
-)doc");
+    });
 
 REGISTER_OP("Cumsum")
     .Input("x: T")
@@ -2448,47 +1382,7 @@ REGISTER_OP("Cumsum")
     .Output("out: T")
     .Attr("T: numbertype")
     .Attr("Tidx: {int32, int64} = DT_INT32")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Compute the cumulative sum of the tensor `x` along `axis`.
-
-By default, this op performs an inclusive cumsum, which means that the first
-element of the input is identical to the first element of the output:
-
-```python
-tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
-```
-
-By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
-performed instead:
-
-```python
-tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
-```
-
-By setting the `reverse` kwarg to `True`, the cumsum is performed in the
-opposite direction:
-
-```python
-tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
-```
-
-This is more efficient than using separate `tf.reverse` ops.
-
-The `reverse` and `exclusive` kwargs can also be combined:
-
-```python
-tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
-```
-
-x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-  `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-  `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-  `[-rank(x), rank(x))`.
-exclusive: If `True`, perform exclusive cumsum.
-reverse: A `bool` (default: False).
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Cumprod")
     .Input("x: T")
@@ -2498,47 +1392,7 @@ REGISTER_OP("Cumprod")
     .Output("out: T")
     .Attr("T: numbertype")
     .Attr("Tidx: {int32, int64} = DT_INT32")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Compute the cumulative product of the tensor `x` along `axis`.
-
-By default, this op performs an inclusive cumprod, which means that the first
-element of the input is identical to the first element of the output:
-
-```python
-tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
-```
-
-By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
-performed instead:
-
-```python
-tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
-```
-
-By setting the `reverse` kwarg to `True`, the cumprod is performed in the
-opposite direction:
-
-```python
-tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
-```
-
-This is more efficient than using separate `tf.reverse` ops.
-
-The `reverse` and `exclusive` kwargs can also be combined:
-
-```python
-tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
-```
-
-x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-  `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-  `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-  `[-rank(x), rank(x))`.
-exclusive: If `True`, perform exclusive cumprod.
-reverse: A `bool` (default: False).
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("QuantizedMatMul")
     .Input("a: T1")
@@ -2567,29 +1421,7 @@ REGISTER_OP("QuantizedMatMul")
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       return Status::OK();
-    })
-    .Doc(R"doc(
-Perform a quantized matrix multiplication of  `a` by the matrix `b`.
-
-The inputs must be two-dimensional matrices and the inner dimension of
-`a` (after being transposed if `transpose_a` is non-zero) must match the
-outer dimension of `b` (after being transposed if `transposed_b` is
-non-zero).
-
-a: Must be a two-dimensional tensor.
-b: Must be a two-dimensional tensor.
-transpose_a: If true, `a` is transposed before multiplication.
-transpose_b: If true, `b` is transposed before multiplication.
-min_a: The float value that the lowest quantized `a` value represents.
-max_a: The float value that the highest quantized `a` value represents.
-min_b: The float value that the lowest quantized `b` value represents.
-max_b: The float value that the highest quantized `b` value represents.
-min_out: The float value that the lowest quantized output value represents.
-max_out: The float value that the highest quantized output value represents.
-Tactivation: The type of output produced by activation function
-    following this operation.
-
-)doc");
+    });
 
 REGISTER_OP("QuantizedMul")
     .Input("x: T1")
@@ -2610,20 +1442,7 @@ REGISTER_OP("QuantizedMul")
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       return Status::OK();
-    })
-    .Doc(R"doc(
-Returns x * y element-wise, working on quantized buffers.
-
-min_x: The float value that the lowest quantized `x` value represents.
-max_x: The float value that the highest quantized `x` value represents.
-min_y: The float value that the lowest quantized `y` value represents.
-max_y: The float value that the highest quantized `y` value represents.
-min_z: The float value that the lowest quantized output value represents.
-max_z: The float value that the highest quantized output value represents.
-
-*NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
-broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
+    });
 
 REGISTER_OP("QuantizedAdd")
     .Input("x: T1")
@@ -2644,20 +1463,7 @@ REGISTER_OP("QuantizedAdd")
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       return Status::OK();
-    })
-    .Doc(R"doc(
-Returns x + y element-wise, working on quantized buffers.
-
-min_x: The float value that the lowest quantized `x` value represents.
-max_x: The float value that the highest quantized `x` value represents.
-min_y: The float value that the lowest quantized `y` value represents.
-max_y: The float value that the highest quantized `y` value represents.
-min_z: The float value that the lowest quantized output value represents.
-max_z: The float value that the highest quantized output value represents.
-
-*NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about
-broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-)doc");
+    });
 
 REGISTER_OP("QuantizeDownAndShrinkRange")
     .Input("input: Tinput")
@@ -2676,40 +1482,7 @@ REGISTER_OP("QuantizeDownAndShrinkRange")
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       return Status::OK();
-    })
-    .Doc(R"doc(
-Convert the quantized 'input' tensor into a lower-precision 'output', using the
-actual distribution of the values to maximize the usage of the lower bit depth
-and adjusting the output min and max ranges accordingly.
-
-[input_min, input_max] are scalar floats that specify the range for the float
-interpretation of the 'input' data. For example, if input_min is -1.0f and
-input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
-
-This operator tries to squeeze as much precision as possible into an output with
-a lower bit depth by calculating the actual min and max values found in the
-data. For example, maybe that quint16 input has no values lower than 16,384 and
-none higher than 49,152. That means only half the range is actually needed, all
-the float interpretations are between -0.5f and 0.5f, so if we want to compress
-the data into a quint8 output, we can use that range rather than the theoretical
--1.0f to 1.0f that is suggested by the input min and max.
-
-In practice, this is most useful for taking output from operations like
-QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
-may have large potential output ranges, but in practice have a distribution of
-input values that only uses a small fraction of the possible range. By feeding
-that output into this operator, we can reduce it from 32 bits down to 8 with
-minimal loss of accuracy.
-
-input_min: The float value that the minimum quantized input value represents.
-input_max: The float value that the maximum quantized input value represents.
-Tinput: The type of the input.
-output_min: The float value that the minimum quantized output value represents.
-output_max: The float value that the maximum quantized output value represents.
-out_type: The type of the output. Should be a lower bit depth than Tinput.
-
-)doc");
+    });
 
 REGISTER_OP("Requantize")
     .Input("input: Tinput")
@@ -2732,26 +1505,7 @@ REGISTER_OP("Requantize")
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       return Status::OK();
-    })
-    .Doc(R"doc(
-Convert the quantized 'input' tensor into a lower-precision 'output', using the
-output range specified with 'requested_output_min' and 'requested_output_max'.
-
-[input_min, input_max] are scalar floats that specify the range for the float
-interpretation of the 'input' data. For example, if input_min is -1.0f and
-input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
-
-input_min: The float value that the minimum quantized input value represents.
-input_max: The float value that the maximum quantized input value represents.
-Tinput: The type of the input.
-requested_output_min: The float value that the minimum quantized output value represents.
-requested_output_max: The float value that the maximum quantized output value represents.
-output_min: The requested_output_min value is copied into this output.
-output_max: The requested_output_max value is copied into this output.
-out_type: The type of the output. Should be a lower bit depth than Tinput.
-
-)doc");
+    });
 
 REGISTER_OP("CompareAndBitpack")
     .Input("input: T")
@@ -2777,39 +1531,7 @@ REGISTER_OP("CompareAndBitpack")
       c->set_output(0, output);
 
       return Status::OK();
-    })
-    .Doc(R"doc(
-Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
-
-Each comparison returns a boolean `true` (if `input_value > threshold`)
-or and `false` otherwise.
-
-This operation is useful for Locality-Sensitive-Hashing (LSH) and other
-algorithms that use hashing approximations of cosine and `L2` distances;
-codes can be generated from an input via:
-
-```python
-codebook_size = 50
-codebook_bits = codebook_size * 32
-codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
-                           dtype=x.dtype,
-                           initializer=tf.orthogonal_initializer())
-codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
-codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
-# now codes has shape x.shape[:-1] + [codebook_size]
-```
-
-**NOTE**: Currently, the innermost dimension of the tensor must be divisible
-by 8.
-
-Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
-a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
-
-input: Values to compare against `threshold` and bitpack.
-threshold: Threshold to compare against.
-T: The type of the input and threshold.
-output: The bitpacked comparisons.
-)doc");
+    });
 
 REGISTER_OP("RequantizationRange")
     .Input("input: Tinput")
@@ -2825,20 +1547,7 @@ REGISTER_OP("RequantizationRange")
       c->set_output(0, c->Scalar());
       c->set_output(1, c->Scalar());
       return Status::OK();
-    })
-    .Doc(R"doc(
-Given a quantized tensor described by (input, input_min, input_max), outputs a
-range that covers the actual values present in that tensor.  This op is
-typically used to produce the requested_output_min and requested_output_max for
-Requantize.
-
-input_min: The float value that the minimum quantized input value represents.
-input_max: The float value that the maximum quantized input value represents.
-Tinput: The type of the input.
-output_min: The computed min output.
-output_max: the computed max output.
-
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -2847,29 +1556,7 @@ REGISTER_OP("Bucketize")
     .Output("output: int32")
     .Attr("T: {int32, int64, float, double}")
     .Attr("boundaries: list(float)")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Bucketizes 'input' based on 'boundaries'.
-
-For example, if the inputs are
-    boundaries = [0, 10, 100]
-    input = [[-5, 10000]
-             [150,   10]
-             [5,    100]]
-
-then the output will be
-    output = [[0, 3]
-              [3, 2]
-              [1, 3]]
-
-input: Any shape of Tensor contains with int or float type.
-boundaries: A sorted list of floats gives the boundary of the buckets.
-output: Same shape with 'input', each value of input replaced with bucket index.
-
-@compatibility(numpy)
-Equivalent to np.digitize.
-@end_compatibility
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 #ifdef INTEL_MKL
 REGISTER_OP("_MklAddN")
diff --git a/tensorflow/core/ops/math_ops_test.cc b/tensorflow/core/ops/math_ops_test.cc
index 3dfa776d26f53c5f341332b3a2bdf5fd95067049..ca3772e6f89805b70f05f1c9fd5e36ee99f2d510 100644
--- a/tensorflow/core/ops/math_ops_test.cc
+++ b/tensorflow/core/ops/math_ops_test.cc
@@ -522,7 +522,7 @@ TEST(MathOpsTest, Cross_ShapeFn) {
   INFER_ERROR("Dimension 0 in both shapes must be equal, but", op, "[3];[5]");
   INFER_ERROR("Dimension must be 3 but", op, "[3,5];[3,5]");
 
-  INFER_OK(op, "?;?", "?");
+  INFER_OK(op, "?;?", "in0");
   INFER_OK(op, "[?];[?]", "in0");
   INFER_OK(op, "[1,?,3];[?,?,?]", "in0");
 }
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 654e890b5739e8a4e6f817bb43f697200566e654..67481fd202b3c3b35033b72e4c1c5fd294d98696 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -73,25 +73,8 @@ REGISTER_OP("AvgPool")
     .Attr("strides: list(int) >= 4")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
-    .Attr("T: {half, float, double}")
-    .SetShapeFn(shape_inference::AvgPoolShape)
-    .Doc(R"doc(
-Performs average pooling on the input.
-
-Each entry in `output` is the mean of the corresponding size `ksize`
-window in `value`.
-
-value: 4-D with shape `[batch, height, width, channels]`.
-ksize: The size of the sliding window for each dimension of `value`.
-strides: The stride of the sliding window for each dimension of `value`.
-padding: The type of padding algorithm to use.
-data_format: Specify the data format of the input and output data. With the
-    default format "NHWC", the data is stored in the order of:
-        [batch, in_height, in_width, in_channels].
-    Alternatively, the format could be "NCHW", the data storage order of:
-        [batch, in_channels, in_height, in_width].
-output: The average pooled output tensor.
-)doc");
+    .Attr("T: {half, bfloat16, float, double}")
+    .SetShapeFn(shape_inference::AvgPoolShape);
 
 REGISTER_OP("AvgPoolGrad")
     .Input("orig_input_shape: int32")
@@ -101,30 +84,14 @@ REGISTER_OP("AvgPoolGrad")
     .Attr("strides: list(int) >= 4")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
       TF_RETURN_IF_ERROR(c->WithRank(s, 4, &s));
       c->set_output(0, s);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes gradients of the average pooling function.
-
-orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
-grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
-  the output of `avg_pool`.
-ksize: The size of the sliding window for each dimension of the input.
-strides: The stride of the sliding window for each dimension of the input.
-padding: The type of padding algorithm to use.
-data_format: Specify the data format of the input and output data. With the
-    default format "NHWC", the data is stored in the order of:
-        [batch, in_height, in_width, in_channels].
-    Alternatively, the format could be "NCHW", the data storage order of:
-        [batch, in_channels, in_height, in_width].
-output: 4-D.  Gradients w.r.t. the input of `avg_pool`.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -154,28 +121,7 @@ REGISTER_OP("BatchNormWithGlobalNormalization")
       TF_RETURN_IF_ERROR(c->ReplaceDim(input, 3, last_dim, &out));
       c->set_output(0, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Batch normalization.
-
-This op is deprecated. Prefer `tf.nn.batch_normalization`.
-
-t: A 4D input Tensor.
-m: A 1D mean Tensor with size matching the last dimension of t.
-  This is the first output from tf.nn.moments,
-  or a saved moving average thereof.
-v: A 1D variance Tensor with size matching the last dimension of t.
-  This is the second output from tf.nn.moments,
-  or a saved moving average thereof.
-beta: A 1D beta Tensor with size matching the last dimension of t.
-  An offset to be added to the normalized tensor.
-gamma: A 1D gamma Tensor with size matching the last dimension of t.
-  If "scale_after_normalization" is true, this tensor will be multiplied
-  with the normalized tensor.
-variance_epsilon: A small float number to avoid dividing by 0.
-scale_after_normalization: A bool indicating whether the resulted tensor
-  needs to be multiplied with gamma.
-)doc");
+    });
 
 REGISTER_OP("BatchNormWithGlobalNormalizationGrad")
     .Input("t: T")
@@ -215,33 +161,7 @@ REGISTER_OP("BatchNormWithGlobalNormalizationGrad")
       c->set_output(3, vector_shape);
       c->set_output(4, vector_shape);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Gradients for batch normalization.
-
-This op is deprecated. See `tf.nn.batch_normalization`.
-
-t: A 4D input Tensor.
-m: A 1D mean Tensor with size matching the last dimension of t.
-  This is the first output from tf.nn.moments,
-  or a saved moving average thereof.
-v: A 1D variance Tensor with size matching the last dimension of t.
-  This is the second output from tf.nn.moments,
-  or a saved moving average thereof.
-gamma: A 1D gamma Tensor with size matching the last dimension of t.
-  If "scale_after_normalization" is true, this Tensor will be multiplied
-  with the normalized Tensor.
-backprop: 4D backprop Tensor.
-variance_epsilon: A small float number to avoid dividing by 0.
-scale_after_normalization: A bool indicating whether the resulted tensor
-  needs to be multiplied with gamma.
-
-dx: 4D backprop tensor for input.
-dm: 1D backprop tensor for mean.
-dv: 1D backprop tensor for variance.
-db: 1D backprop tensor for beta.
-dg: 1D backprop tensor for gamma.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -260,34 +180,7 @@ REGISTER_OP("FusedBatchNorm")
     .Attr("epsilon: float = 0.0001")
     .Attr("data_format: string = 'NHWC'")
     .Attr("is_training: bool = true")
-    .SetShapeFn(shape_inference::FusedBatchNormShape)
-    .Doc(R"doc(
-Batch normalization.
-Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-The size of 1D Tensors matches the dimension C of the 4D Tensors.
-
-x: A 4D Tensor for input data.
-scale: A 1D Tensor for scaling factor, to scale the normalized x.
-offset: A 1D Tensor for offset, to shift to the normalized x.
-mean: A 1D Tensor for population mean. Used for inference only;
-      must be empty for training.
-variance: A 1D Tensor for population variance. Used for inference only;
-          must be empty for training.
-y: A 4D Tensor for output data.
-batch_mean: A 1D Tensor for the computed batch mean, to be used by TensorFlow
-            to compute the running mean.
-batch_variance: A 1D Tensor for the computed batch variance, to be used by
-                TensorFlow to compute the running variance.
-reserve_space_1: A 1D Tensor for the computed batch mean, to be reused
-                 in the gradient computation.
-reserve_space_2: A 1D Tensor for the computed batch variance (inverted variance
-                 in the cuDNN case), to be reused in the gradient computation.
-T: The data type for the elements of input and output Tensors.
-epsilon: A small float number added to the variance of x.
-data_format: The data format for x and y. Either "NHWC" (default) or "NCHW".
-is_training: A bool value to indicate the operation is for training (default)
-             or inference.
-)doc");
+    .SetShapeFn(shape_inference::FusedBatchNormShape);
 
 REGISTER_OP("FusedBatchNormV2")
     .Input("x: T")
@@ -300,40 +193,12 @@ REGISTER_OP("FusedBatchNormV2")
     .Output("batch_variance: U")
     .Output("reserve_space_1: U")
     .Output("reserve_space_2: U")
-    .Attr("T: {half, float}")
+    .Attr("T: {half, bfloat16, float}")
     .Attr("U: {float}")
     .Attr("epsilon: float = 0.0001")
     .Attr("data_format: string = 'NHWC'")
     .Attr("is_training: bool = true")
-    .SetShapeFn(shape_inference::FusedBatchNormShape)
-    .Doc(R"doc(
-Batch normalization.
-Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-The size of 1D Tensors matches the dimension C of the 4D Tensors.
-
-x: A 4D Tensor for input data.
-scale: A 1D Tensor for scaling factor, to scale the normalized x.
-offset: A 1D Tensor for offset, to shift to the normalized x.
-mean: A 1D Tensor for population mean. Used for inference only;
-      must be empty for training.
-variance: A 1D Tensor for population variance. Used for inference only;
-          must be empty for training.
-y: A 4D Tensor for output data.
-batch_mean: A 1D Tensor for the computed batch mean, to be used by TensorFlow
-            to compute the running mean.
-batch_variance: A 1D Tensor for the computed batch variance, to be used by
-                TensorFlow to compute the running variance.
-reserve_space_1: A 1D Tensor for the computed batch mean, to be reused
-                 in the gradient computation.
-reserve_space_2: A 1D Tensor for the computed batch variance (inverted variance
-                 in the cuDNN case), to be reused in the gradient computation.
-T: The data type for the elements of input and output Tensors.
-U: The data type for the scale, offset, mean, and variance.
-epsilon: A small float number added to the variance of x.
-data_format: The data format for x and y. Either "NHWC" (default) or "NCHW".
-is_training: A bool value to indicate the operation is for training (default)
-             or inference.
-)doc");
+    .SetShapeFn(shape_inference::FusedBatchNormShape);
 
 REGISTER_OP("FusedBatchNormGrad")
     .Input("y_backprop: T")
@@ -350,37 +215,7 @@ REGISTER_OP("FusedBatchNormGrad")
     .Attr("epsilon: float = 0.0001")
     .Attr("data_format: string = 'NHWC'")
     .Attr("is_training: bool = true")
-    .SetShapeFn(shape_inference::FusedBatchNormGradShape)
-    .Doc(R"doc(
-Gradient for batch normalization.
-Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-The size of 1D Tensors matches the dimension C of the 4D Tensors.
-
-y_backprop: A 4D Tensor for the gradient with respect to y.
-x: A 4D Tensor for input data.
-scale: A 1D Tensor for scaling factor, to scale the normalized x.
-reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
-                 mean to be reused in gradient computation. When is_training is
-                 False, a 1D Tensor for the population mean to be reused in both
-                 1st and 2nd order gradient computation.
-reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
-                 variance (inverted variance in the cuDNN case) to be reused in
-                 gradient computation. When is_training is False, a 1D Tensor
-                 for the population variance to be reused in both 1st and 2nd
-                 order gradient computation.
-x_backprop: A 4D Tensor for the gradient with respect to x.
-scale_backprop: A 1D Tensor for the gradient with respect to scale.
-offset_backprop: A 1D Tensor for the gradient with respect to offset.
-reserve_space_3: Unused placeholder to match the mean input in FusedBatchNorm.
-reserve_space_4: Unused placeholder to match the variance input
-                 in FusedBatchNorm.
-T: The data type for the elements of input and output Tensors.
-epsilon: A small float number added to the variance of x.
-data_format: The data format for y_backprop, x, x_backprop.
-             Either "NHWC" (default) or "NCHW".
-is_training: A bool value to indicate the operation is for training (default)
-             or inference.
-)doc");
+    .SetShapeFn(shape_inference::FusedBatchNormGradShape);
 
 REGISTER_OP("FusedBatchNormGradV2")
     .Input("y_backprop: T")
@@ -393,43 +228,12 @@ REGISTER_OP("FusedBatchNormGradV2")
     .Output("offset_backprop: U")
     .Output("reserve_space_3: U")
     .Output("reserve_space_4: U")
-    .Attr("T: {half, float}")
+    .Attr("T: {half, bfloat16, float}")
     .Attr("U: {float}")
     .Attr("epsilon: float = 0.0001")
     .Attr("data_format: string = 'NHWC'")
     .Attr("is_training: bool = true")
-    .SetShapeFn(shape_inference::FusedBatchNormGradShape)
-    .Doc(R"doc(
-Gradient for batch normalization.
-Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-The size of 1D Tensors matches the dimension C of the 4D Tensors.
-
-y_backprop: A 4D Tensor for the gradient with respect to y.
-x: A 4D Tensor for input data.
-scale: A 1D Tensor for scaling factor, to scale the normalized x.
-reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
-                 mean to be reused in gradient computation. When is_training is
-                 False, a 1D Tensor for the population mean to be reused in both
-                 1st and 2nd order gradient computation.
-reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
-                 variance (inverted variance in the cuDNN case) to be reused in
-                 gradient computation. When is_training is False, a 1D Tensor
-                 for the population variance to be reused in both 1st and 2nd
-                 order gradient computation.
-x_backprop: A 4D Tensor for the gradient with respect to x.
-scale_backprop: A 1D Tensor for the gradient with respect to scale.
-offset_backprop: A 1D Tensor for the gradient with respect to offset.
-reserve_space_3: Unused placeholder to match the mean input in FusedBatchNorm.
-reserve_space_4: Unused placeholder to match the variance input
-                 in FusedBatchNorm.
-T: The data type for the elements of input and output Tensors.
-U: The data type for the scale, offset, mean, and variance.
-epsilon: A small float number added to the variance of x.
-data_format: The data format for y_backprop, x, x_backprop.
-             Either "NHWC" (default) or "NCHW".
-is_training: A bool value to indicate the operation is for training (default)
-             or inference.
-)doc");
+    .SetShapeFn(shape_inference::FusedBatchNormGradShape);
 
 // --------------------------------------------------------------------------
 
@@ -439,24 +243,7 @@ REGISTER_OP("BiasAdd")
     .Input("bias: T")
     .Attr(GetConvnetDataFormatAttrString())
     .Output("output: T")
-    .SetShapeFn(shape_inference::BiasAddShape)
-    .Doc(R"doc(
-Adds `bias` to `value`.
-
-This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-Broadcasting is supported, so `value` may have any number of dimensions.
-
-value: Any number of dimensions.
-bias: 1-D with size the last dimension of `value`.
-data_format: Specify the data format of the input and output data. With the
-    default format "NHWC", the bias tensor will be added to the last dimension
-    of the value tensor.
-    Alternatively, the format could be "NCHW", the data storage order of:
-        [batch, in_channels, in_height, in_width].
-    The tensor will be added to "in_channels", the third-to-the-last
-        dimension.
-output: Broadcasted sum of `value` and `bias`.
-)doc");
+    .SetShapeFn(shape_inference::BiasAddShape);
 // --------------------------------------------------------------------------
 
 REGISTER_OP("BiasAddGrad")
@@ -464,24 +251,7 @@ REGISTER_OP("BiasAddGrad")
     .Input("out_backprop: T")
     .Attr(GetConvnetDataFormatAttrString())
     .Output("output: T")
-    .SetShapeFn(shape_inference::BiasAddGradShape)
-    .Doc(R"doc(
-The backward operation for "BiasAdd" on the "bias" tensor.
-
-It accumulates all the values from out_backprop into the feature dimension.
-For NHWC data format, the feature dimension is the last. For NCHW data format,
-the feature dimension is the third-to-last.
-
-out_backprop: Any number of dimensions.
-output: 1-D with size the feature dimension of `out_backprop`.
-data_format: Specify the data format of the input and output data. With the
-    default format "NHWC", the bias tensor will be added to the last dimension
-    of the value tensor.
-    Alternatively, the format could be "NCHW", the data storage order of:
-        [batch, in_channels, in_height, in_width].
-    The tensor will be added to "in_channels", the third-to-the-last
-        dimension.
-)doc");
+    .SetShapeFn(shape_inference::BiasAddGradShape);
 // --------------------------------------------------------------------------
 
 REGISTER_OP("BiasAddV1")
@@ -489,111 +259,39 @@ REGISTER_OP("BiasAddV1")
     .Input("value: T")
     .Input("bias: T")
     .Output("output: T")
-    .SetShapeFn(shape_inference::BiasAddShape)
-    .Doc(R"doc(
-Adds `bias` to `value`.
-
-This is a deprecated version of BiasAdd and will be soon removed.
-
-This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-Broadcasting is supported, so `value` may have any number of dimensions.
-
-value: Any number of dimensions.
-bias: 1-D with size the last dimension of `value`.
-output: Broadcasted sum of `value` and `bias`.
-)doc");
+    .SetShapeFn(shape_inference::BiasAddShape);
 // --------------------------------------------------------------------------
 
 REGISTER_OP("Conv2D")
     .Input("input: T")
     .Input("filter: T")
     .Output("output: T")
-    .Attr("T: {half, float}")
+    .Attr("T: {half, bfloat16, float}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
-    .SetShapeFn(shape_inference::Conv2DShape)
-    .Doc(R"doc(
-Computes a 2-D convolution given 4-D `input` and `filter` tensors.
-
-Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-and a filter / kernel tensor of shape
-`[filter_height, filter_width, in_channels, out_channels]`, this op
-performs the following:
-
-1. Flattens the filter to a 2-D matrix with shape
-   `[filter_height * filter_width * in_channels, output_channels]`.
-2. Extracts image patches from the input tensor to form a *virtual*
-   tensor of shape `[batch, out_height, out_width,
-   filter_height * filter_width * in_channels]`.
-3. For each patch, right-multiplies the filter matrix and the image patch
-   vector.
-
-In detail, with the default NHWC format,
-
-    output[b, i, j, k] =
-        sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
-                        filter[di, dj, q, k]
-
-Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
-
-input: A 4-D tensor. The dimension order is interpreted according to the value
-    of `data_format`, see below for details.
-filter: A 4-D tensor of shape
-    `[filter_height, filter_width, in_channels, out_channels]`
-output: A 4-D tensor. The dimension order is determined by the value of
-    `data_format`, see below for details.
-strides: 1-D tensor of length 4.  The stride of the sliding window for each
-  dimension of `input`. The dimension order is determined by the value of
-    `data_format`, see below for details.
-padding: The type of padding algorithm to use.
-data_format: Specify the data format of the input and output data. With the
-    default format "NHWC", the data is stored in the order of:
-        [batch, height, width, channels].
-    Alternatively, the format could be "NCHW", the data storage order of:
-        [batch, channels, height, width].
-)doc");
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn(shape_inference::Conv2DShape);
 
 REGISTER_OP("Conv2DBackpropInput")
     .Input("input_sizes: int32")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {half, float}")
+    .Attr("T: {half, bfloat16, float}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
       TF_RETURN_IF_ERROR(c->WithRank(s, 4, &s));
       c->set_output(0, s);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the gradients of convolution with respect to the input.
-
-input_sizes: An integer vector representing the shape of `input`,
-  where `input` is a 4-D `[batch, height, width, channels]` tensor.
-filter: 4-D with shape
-  `[filter_height, filter_width, in_channels, out_channels]`.
-out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-  Gradients w.r.t. the output of the convolution.
-strides: The stride of the sliding window for each dimension of the input
-  of the convolution. Must be in the same order as the dimension specified with
-  format.
-padding: The type of padding algorithm to use.
-output: 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
-  w.r.t. the input of the convolution.
-data_format: Specify the data format of the input and output data. With the
-    default format "NHWC", the data is stored in the order of:
-        [batch, in_height, in_width, in_channels].
-    Alternatively, the format could be "NCHW", the data storage order of:
-        [batch, in_channels, in_height, in_width].
-)doc");
+    });
 
 // TODO(jeff): Instead of 'use_cudnn_for_gpu', maybe we should have a
 // more general string attribute ('kernel_impl'?) that can be used to
@@ -603,40 +301,19 @@ REGISTER_OP("Conv2DBackpropFilter")
     .Input("filter_sizes: int32")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {half, float}")
+    .Attr("T: {half, bfloat16, float}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &s));
       TF_RETURN_IF_ERROR(c->WithRank(s, 4, &s));
       c->set_output(0, s);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the gradients of convolution with respect to the filter.
-
-input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-filter_sizes: An integer vector representing the tensor shape of `filter`,
-  where `filter` is a 4-D
-  `[filter_height, filter_width, in_channels, out_channels]` tensor.
-out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-  Gradients w.r.t. the output of the convolution.
-strides: The stride of the sliding window for each dimension of the input
-  of the convolution. Must be in the same order as the dimension specified with
-  format.
-padding: The type of padding algorithm to use.
-output: 4-D with shape
-  `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-  the `filter` input of the convolution.
-data_format: Specify the data format of the input and output data. With the
-    default format "NHWC", the data is stored in the order of:
-        [batch, in_height, in_width, in_channels].
-    Alternatively, the format could be "NCHW", the data storage order of:
-        [batch, in_channels, in_height, in_width].
-)doc");
+    });
 
 namespace {
 
@@ -733,6 +410,22 @@ Status CommonFusedConvCalculations(InferenceContext* c, bool has_resize) {
 
 }  // namespace
 
+REGISTER_OP("DataFormatDimMap")
+    .Input("x: T")
+    .Output("y: T")
+    .Attr("T: {int32, int64} = DT_INT32")
+    .Attr("src_format: string = 'NHWC'")
+    .Attr("dst_format: string = 'NCHW'")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
+REGISTER_OP("DataFormatVecPermute")
+    .Input("x: T")
+    .Output("y: T")
+    .Attr("T: {int32, int64} = DT_INT32")
+    .Attr("src_format: string = 'NHWC'")
+    .Attr("dst_format: string = 'NCHW'")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
 REGISTER_OP("FusedResizeAndPadConv2D")
     .Input("input: T")
     .Input("size: int32")
@@ -746,35 +439,7 @@ REGISTER_OP("FusedResizeAndPadConv2D")
     .Attr(GetPaddingAttrString())
     .SetShapeFn([](InferenceContext* c) {
       return CommonFusedConvCalculations(c, true /* has_resize */);
-    })
-    .Doc(R"doc(
-Performs a resize and padding as a preprocess during a convolution.
-
-It's often possible to do spatial transformations more efficiently as part of
-the packing stage of a convolution, so this op allows for an optimized
-implementation where these stages are fused together. This prevents the need to
-write out the intermediate results as whole tensors, reducing memory pressure,
-and we can get some latency gains by merging the transformation calculations.
-The data_format attribute for Conv2D isn't supported by this op, and defaults to
-'NHWC' order.
-Internally this op uses a single per-graph scratch buffer, which means that it
-will block if multiple versions are being run in parallel. This is because this
-operator is primarily an optimization to minimize memory usage.
-
-input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-  new size for the images.
-paddings: A two-column matrix specifying the padding sizes. The number of
-  rows must be the same as the rank of `input`.
-filter: 4-D with shape
-  `[filter_height, filter_width, in_channels, out_channels]`.
-resize_align_corners: If true, rescale input by (new_height - 1) / (height - 1),
-  which exactly aligns the 4 corners of images and resized images. If false, rescale
-  by new_height / height. Treat similarly the width dimension.
-strides: 1-D of length 4.  The stride of the sliding window for each dimension
-   of `input`. Must be in the same order as the dimension specified with format.
-padding: The type of padding algorithm to use.
- )doc");
+    });
 
 REGISTER_OP("FusedPadConv2D")
     .Input("input: T")
@@ -787,31 +452,7 @@ REGISTER_OP("FusedPadConv2D")
     .Attr(GetPaddingAttrString())
     .SetShapeFn([](InferenceContext* c) {
       return CommonFusedConvCalculations(c, false /* has_resize */);
-    })
-    .Doc(R"doc(
-Performs a padding as a preprocess during a convolution.
-
-Similar to FusedResizeAndPadConv2d, this op allows for an optimized
-implementation where the spatial padding transformation stage is fused with the
-im2col lookup, but in this case without the bilinear filtering required for
-resizing. Fusing the padding prevents the need to write out the intermediate
-results as whole tensors, reducing memory pressure, and we can get some latency
-gains by merging the transformation calculations.
-The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
-order is used instead.
-Internally this op uses a single per-graph scratch buffer, which means that it
-will block if multiple versions are being run in parallel. This is because this
-operator is primarily an optimization to minimize memory usage.
-
-input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-paddings: A two-column matrix specifying the padding sizes. The number of
-  rows must be the same as the rank of `input`.
-filter: 4-D with shape
-  `[filter_height, filter_width, in_channels, out_channels]`.
-strides: 1-D of length 4.  The stride of the sliding window for each dimension
-   of `input`. Must be in the same order as the dimension specified with format.
-padding: The type of padding algorithm to use.
- )doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -819,158 +460,60 @@ REGISTER_OP("DepthwiseConv2dNative")
     .Input("input: T")
     .Input("filter: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
-    .SetShapeFn(shape_inference::DepthwiseConv2DNativeShape)
-    .Doc(R"doc(
-Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors.
-
-Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-and a filter / kernel tensor of shape
-`[filter_height, filter_width, in_channels, channel_multiplier]`, containing
-`in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
-a different filter to each input channel (expanding from 1 channel to
-`channel_multiplier` channels for each), then concatenates the results
-together. Thus, the output has `in_channels * channel_multiplier` channels.
-
-```
-for k in 0..in_channels-1
-  for q in 0..channel_multiplier-1
-    output[b, i, j, k * channel_multiplier + q] =
-      sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
-                        filter[di, dj, k, q]
-```
-
-Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
-
-strides: 1-D of length 4.  The stride of the sliding window for each dimension
-  of `input`.
-padding: The type of padding algorithm to use.
-data_format: Specify the data format of the input and output data. With the
-    default format "NHWC", the data is stored in the order of:
-        [batch, height, width, channels].
-    Alternatively, the format could be "NCHW", the data storage order of:
-        [batch, channels, height, width].
-)doc");
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn(shape_inference::DepthwiseConv2DNativeShape);
 
 REGISTER_OP("DepthwiseConv2dNativeBackpropInput")
     .Input("input_sizes: int32")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
       TF_RETURN_IF_ERROR(c->WithRank(s, 4, &s));
       c->set_output(0, s);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the gradients of depthwise convolution with respect to the input.
-
-input_sizes: An integer vector representing the shape of `input`, based
-  on `data_format`.  For example, if `data_format` is 'NHWC' then
-   `input` is a 4-D `[batch, height, width, channels]` tensor.
-filter: 4-D with shape
-  `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
-out_backprop: 4-D with shape  based on `data_format`.
-  For example, if `data_format` is 'NHWC' then
-  out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-  Gradients w.r.t. the output of the convolution.
-strides: The stride of the sliding window for each dimension of the input
-  of the convolution.
-padding: The type of padding algorithm to use.
-data_format: Specify the data format of the input and output data. With the
-    default format "NHWC", the data is stored in the order of:
-        [batch, height, width, channels].
-    Alternatively, the format could be "NCHW", the data storage order of:
-        [batch, channels, height, width].
-output: 4-D with shape according to `data_format`.  For example, if
-  `data_format` is 'NHWC', output shape is `[batch, in_height,
-  in_width, in_channels]`.  Gradient w.r.t. the input of the
-  convolution.
-)doc");
+    });
 
 REGISTER_OP("DepthwiseConv2dNativeBackpropFilter")
     .Input("input: T")
     .Input("filter_sizes: int32")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &s));
       TF_RETURN_IF_ERROR(c->WithRank(s, 4, &s));
       c->set_output(0, s);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the gradients of depthwise convolution with respect to the filter.
-
-input: 4-D with shape based on `data_format`.  For example, if
-  `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
-  in_width, in_channels]` tensor.
-filter_sizes: An integer vector representing the tensor shape of `filter`,
-  where `filter` is a 4-D
-  `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
-out_backprop: 4-D with shape  based on `data_format`.
-  For example, if `data_format` is 'NHWC' then
-  out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-  Gradients w.r.t. the output of the convolution.
-strides: The stride of the sliding window for each dimension of the input
-  of the convolution.
-padding: The type of padding algorithm to use.
-data_format: Specify the data format of the input and output data. With the
-    default format "NHWC", the data is stored in the order of:
-        [batch, height, width, channels].
-    Alternatively, the format could be "NCHW", the data storage order of:
-        [batch, channels, height, width].
-output: 4-D with shape
-  `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-  the `filter` input of the convolution.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("Conv3D")
     .Input("input: T")
     .Input("filter: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .SetShapeFn(shape_inference::Conv3DShape)
-    .Doc(R"doc(
-Computes a 3-D convolution given 5-D `input` and `filter` tensors.
-
-In signal processing, cross-correlation is a measure of similarity of
-two waveforms as a function of a time-lag applied to one of them. This
-is also known as a sliding dot product or sliding inner-product.
-
-Our Conv3D implements a form of cross-correlation.
-
-input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
-filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
-  out_channels]`. `in_channels` must match between `input` and `filter`.
-strides: 1-D tensor of length 5. The stride of the sliding window for each
-  dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-padding: The type of padding algorithm to use.
-data_format: The data format of the input and output data. With the
-    default format "NDHWC", the data is stored in the order of:
-        [batch, in_depth, in_height, in_width, in_channels].
-    Alternatively, the format could be "NCDHW", the data storage order is:
-        [batch, in_channels, in_depth, in_height, in_width].
-)doc");
+    .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
+    .SetShapeFn(shape_inference::Conv3DShape);
 
 REGISTER_OP("Conv3DBackpropInput")
     .Input("input: T")
@@ -983,20 +526,7 @@ REGISTER_OP("Conv3DBackpropInput")
     .Deprecated(10, "Use Conv3DBackpropInputV2")
     .SetShapeFn([](InferenceContext* c) {
       return UnchangedShapeWithRank(c, 5);
-    })
-    .Doc(R"doc(
-Computes the gradients of 3-D convolution with respect to the input.
-
-input: Shape `[batch, depth, rows, cols, in_channels]`.
-filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-  `in_channels` must match between `input` and `filter`.
-out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-  out_channels]`.
-strides: 1-D tensor of length 5. The stride of the sliding window for each
-  dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-padding: The type of padding algorithm to use.
-
-)doc");
+    });
 
 REGISTER_OP("Conv3DBackpropFilter")
     .Input("input: T")
@@ -1012,94 +542,43 @@ REGISTER_OP("Conv3DBackpropFilter")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 5, &out));
       c->set_output(0, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the gradients of 3-D convolution with respect to the filter.
-
-input: Shape `[batch, depth, rows, cols, in_channels]`.
-filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-  `in_channels` must match between `input` and `filter`.
-out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-  out_channels]`.
-strides: 1-D tensor of length 5. The stride of the sliding window for each
-  dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-padding: The type of padding algorithm to use.
-
-)doc");
+    });
 
 REGISTER_OP("Conv3DBackpropInputV2")
     .Input("input_sizes: int32")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
       TF_RETURN_IF_ERROR(c->WithRank(s, 5, &s));
       c->set_output(0, s);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the gradients of 3-D convolution with respect to the input.
-
-input_sizes: An integer vector representing the tensor shape of `input`,
-   where `input` is a 5-D
-   `[batch, depth, rows, cols, in_channels]` tensor.
-filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-  `in_channels` must match between `input` and `filter`.
-out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-  out_channels]`.
-strides: 1-D tensor of length 5. The stride of the sliding window for each
-  dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-padding: The type of padding algorithm to use.
-data_format: The data format of the input and output data. With the
-    default format "NDHWC", the data is stored in the order of:
-        [batch, in_depth, in_height, in_width, in_channels].
-    Alternatively, the format could be "NCDHW", the data storage order is:
-        [batch, in_channels, in_depth, in_height, in_width].
-
-)doc");
+    });
 
 REGISTER_OP("Conv3DBackpropFilterV2")
     .Input("input: T")
     .Input("filter_sizes: int32")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &s));
       TF_RETURN_IF_ERROR(c->WithRank(s, 5, &s));
       c->set_output(0, s);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the gradients of 3-D convolution with respect to the filter.
-
-input: Shape `[batch, depth, rows, cols, in_channels]`.
-filter_sizes: An integer vector representing the tensor shape of `filter`,
-  where `filter` is a 5-D
-  `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
-  tensor.
-out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-  out_channels]`.
-strides: 1-D tensor of length 5. The stride of the sliding window for each
-  dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-padding: The type of padding algorithm to use.
-data_format: The data format of the input and output data. With the
-    default format "NDHWC", the data is stored in the order of:
-        [batch, in_depth, in_height, in_width, in_channels].
-    Alternatively, the format could be "NCDHW", the data storage order is:
-        [batch, in_channels, in_depth, in_height, in_width].
-
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -1110,24 +589,8 @@ REGISTER_OP("AvgPool3D")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {float, double}")
-    .SetShapeFn(shape_inference::Pool3DShape)
-    .Doc(R"doc(
-Performs 3D average pooling on the input.
-
-ksize: 1-D tensor of length 5. The size of the window for each dimension of
-  the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-strides: 1-D tensor of length 5. The stride of the sliding window for each
-  dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-padding: The type of padding algorithm to use.
-input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-output: The average pooled output tensor.
-data_format: The data format of the input and output data. With the
-    default format "NDHWC", the data is stored in the order of:
-        [batch, in_depth, in_height, in_width, in_channels].
-    Alternatively, the format could be "NCDHW", the data storage order is:
-        [batch, in_channels, in_depth, in_height, in_width].
-)doc");
+    .Attr("T: {bfloat16, float, double}")
+    .SetShapeFn(shape_inference::Pool3DShape);
 
 REGISTER_OP("AvgPool3DGrad")
     .Input("orig_input_shape: int32")
@@ -1137,31 +600,14 @@ REGISTER_OP("AvgPool3DGrad")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
       TF_RETURN_IF_ERROR(c->WithRank(s, 5, &s));
       c->set_output(0, s);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes gradients of average pooling function.
-
-ksize: 1-D tensor of length 5. The size of the window for each dimension of
-  the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-strides: 1-D tensor of length 5. The stride of the sliding window for each
-  dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-padding: The type of padding algorithm to use.
-orig_input_shape: The original input dimensions.
-grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-output: The backprop for input.
-data_format: The data format of the input and output data. With the
-    default format "NDHWC", the data is stored in the order of:
-        [batch, in_depth, in_height, in_width, in_channels].
-    Alternatively, the format could be "NCDHW", the data storage order is:
-        [batch, in_channels, in_depth, in_height, in_width].
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -1172,24 +618,8 @@ REGISTER_OP("MaxPool3D")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {float}")
-    .SetShapeFn(shape_inference::Pool3DShape)
-    .Doc(R"doc(
-Performs 3D max pooling on the input.
-
-ksize: 1-D tensor of length 5. The size of the window for each dimension of
-  the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-strides: 1-D tensor of length 5. The stride of the sliding window for each
-  dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-padding: The type of padding algorithm to use.
-input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-output: The max pooled output tensor.
-data_format: The data format of the input and output data. With the
-    default format "NDHWC", the data is stored in the order of:
-        [batch, in_depth, in_height, in_width, in_channels].
-    Alternatively, the format could be "NCDHW", the data storage order is:
-        [batch, in_channels, in_depth, in_height, in_width].
-)doc");
+    .Attr("T: {bfloat16, float}")
+    .SetShapeFn(shape_inference::Pool3DShape);
 
 REGISTER_OP("MaxPool3DGrad")
     .Input("orig_input: TInput")
@@ -1200,28 +630,11 @@ REGISTER_OP("MaxPool3DGrad")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {float} = DT_FLOAT")
-    .Attr("TInput: {float} = DT_FLOAT")
+    .Attr("T: {bfloat16, float} = DT_FLOAT")
+    .Attr("TInput: {bfloat16, float} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       return UnchangedShapeWithRank(c, 5);
-    })
-    .Doc(R"doc(
-Computes gradients of max pooling function.
-
-ksize: 1-D tensor of length 5. The size of the window for each dimension of
-  the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-strides: 1-D tensor of length 5. The stride of the sliding window for each
-  dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-padding: The type of padding algorithm to use.
-orig_input: The original input tensor.
-orig_output: The original output tensor.
-grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-data_format: The data format of the input and output data. With the
-    default format "NDHWC", the data is stored in the order of:
-        [batch, in_depth, in_height, in_width, in_channels].
-    Alternatively, the format could be "NCDHW", the data storage order is:
-        [batch, in_channels, in_depth, in_height, in_width].
-)doc");
+    });
 
 REGISTER_OP("MaxPool3DGradGrad")
     .Input("orig_input: T")
@@ -1241,43 +654,15 @@ REGISTER_OP("MaxPool3DGradGrad")
       // Validate 'orig_output' is same shape as 'output'
       TF_RETURN_IF_ERROR(c->Merge(c->input(1), c->output(0), &unused));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes second-order gradients of the maxpooling function.
-
-ksize: 1-D tensor of length 5. The size of the window for each dimension of
-  the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-strides: 1-D tensor of length 5. The stride of the sliding window for each
-  dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-padding: The type of padding algorithm to use.
-orig_input: The original input tensor.
-orig_output: The original output tensor.
-grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-output: Gradients of gradients w.r.t. the input to `max_pool`.
-data_format: The data format of the input and output data. With the
-    default format "NDHWC", the data is stored in the order of:
-        [batch, in_depth, in_height, in_width, in_channels].
-    Alternatively, the format could be "NCDHW", the data storage order is:
-        [batch, in_channels, in_depth, in_height, in_width].
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
 REGISTER_OP("L2Loss")
     .Input("t: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-L2 Loss.
-
-Computes half the L2 norm of a tensor without the `sqrt`:
-
-    output = sum(t ** 2) / 2
-
-t: Typically 2-D, but may have any dimensions.
-output: 0-D.
-)doc");
+    .Attr("T: {half, bfloat16, float, double}")
+    .SetShapeFn(shape_inference::ScalarShape);
 
 // --------------------------------------------------------------------------
 
@@ -1288,31 +673,10 @@ REGISTER_OP("LRN")
     .Attr("bias: float = 1.0")
     .Attr("alpha: float = 1.0")
     .Attr("beta: float = 0.5")
-    .Attr("T: {float, half} = DT_FLOAT")
+    .Attr("T: {half, bfloat16, float} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       return UnchangedShapeWithRank(c, 4);
-    })
-    .Doc(R"doc(
-Local Response Normalization.
-
-The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
-dimension), and each vector is normalized independently.  Within a given vector,
-each component is divided by the weighted, squared sum of inputs within
-`depth_radius`.  In detail,
-
-    sqr_sum[a, b, c, d] =
-        sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
-    output = input / (bias + alpha * sqr_sum) ** beta
-
-For details, see [Krizhevsky et al., ImageNet classification with deep
-convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
-
-input: 4-D.
-depth_radius: 0-D.  Half-width of the 1-D normalization window.
-bias: An offset (usually positive to avoid dividing by 0).
-alpha: A scale factor, usually positive.
-beta: An exponent.
-)doc");
+    });
 
 REGISTER_OP("LRNGrad")
     .Input("input_grads: T")
@@ -1323,7 +687,7 @@ REGISTER_OP("LRNGrad")
     .Attr("bias: float = 1.0")
     .Attr("alpha: float = 1.0")
     .Attr("beta: float = 0.5")
-    .Attr("T: {float, half} = DT_FLOAT")
+    .Attr("T: {half, bfloat16, float} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &s));  // input_grads
@@ -1331,53 +695,26 @@ REGISTER_OP("LRNGrad")
       TF_RETURN_IF_ERROR(c->Merge(s, c->input(2), &s));     // output_image
       c->set_output(0, s);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Gradients for Local Response Normalization.
-
-input_grads: 4-D with shape `[batch, height, width, channels]`.
-input_image: 4-D with shape `[batch, height, width, channels]`.
-output_image: 4-D with shape `[batch, height, width, channels]`.
-depth_radius: A depth radius.
-bias: An offset (usually > 0 to avoid dividing by 0).
-alpha: A scale factor, usually positive.
-beta: An exponent.
-output: The gradients for LRN.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
 REGISTER_OP("MaxPool")
     .Attr(
-        "T: {float, double, int32, int64, uint8, int16, int8, uint16, "
-        "half, qint8} = DT_FLOAT")
+        "T: {half, bfloat16, float, double, int32, int64, uint8, int16, int8, "
+        "uint16, qint8} = DT_FLOAT")
     .Attr("ksize: list(int) >= 4")
     .Attr("strides: list(int) >= 4")
     .Attr(GetPaddingAttrString())
     .Attr("data_format: {'NHWC', 'NCHW', 'NCHW_VECT_C'} = 'NHWC'")
     .Input("input: T")
     .Output("output: T")
-    .SetShapeFn(shape_inference::MaxPoolShape)
-    .Doc(R"doc(
-Performs max pooling on the input.
-
-ksize: The size of the window for each dimension of the input tensor.
-strides: The stride of the sliding window for each dimension of the
-  input tensor.
-padding: The type of padding algorithm to use.
-data_format: Specify the data format of the input and output data. With the
-    default format "NHWC", the data is stored in the order of:
-        [batch, in_height, in_width, in_channels].
-    Alternatively, the format could be "NCHW", the data storage order of:
-        [batch, in_channels, in_height, in_width].
-input: 4-D input to pool over.
-output: The max pooled output tensor.
-)doc");
+    .SetShapeFn(shape_inference::MaxPoolShape);
 
 REGISTER_OP("MaxPoolV2")
     .Attr(
-        "T: {float, double, int32, int64, uint8, int16, int8, uint16, "
-        "half, qint8} = DT_FLOAT")
+        "T: {half, bfloat16, float, double, int32, int64, uint8, int16, int8, "
+        "uint16, qint8} = DT_FLOAT")
     .Attr(GetPaddingAttrString())
     .Attr("data_format: {'NHWC', 'NCHW', 'NCHW_VECT_C'} = 'NHWC'")
     .Input("input: T")
@@ -1387,22 +724,7 @@ REGISTER_OP("MaxPoolV2")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::MaxPoolV2Shape(c, 3));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Performs max pooling on the input.
-
-ksize: The size of the window for each dimension of the input tensor.
-strides: The stride of the sliding window for each dimension of the
-  input tensor.
-padding: The type of padding algorithm to use.
-data_format: Specify the data format of the input and output data. With the
-    default format "NHWC", the data is stored in the order of:
-        [batch, in_height, in_width, in_channels].
-    Alternatively, the format could be "NCHW", the data storage order of:
-        [batch, in_channels, in_height, in_width].
-input: 4-D input to pool over.
-output: The max pooled output tensor.
-)doc");
+    });
 
 REGISTER_OP("MaxPoolGrad")
     .Attr("ksize: list(int) >= 4")
@@ -1416,24 +738,7 @@ REGISTER_OP("MaxPoolGrad")
     .Attr("T: realnumbertype = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       return UnchangedShapeWithRank(c, 4);
-    })
-    .Doc(R"doc(
-Computes gradients of the maxpooling function.
-
-ksize: The size of the window for each dimension of the input tensor.
-strides: The stride of the sliding window for each dimension of the
-  input tensor.
-padding: The type of padding algorithm to use.
-data_format: Specify the data format of the input and output data. With the
-    default format "NHWC", the data is stored in the order of:
-        [batch, in_height, in_width, in_channels].
-    Alternatively, the format could be "NCHW", the data storage order of:
-        [batch, in_channels, in_height, in_width].
-orig_input: The original input tensor.
-orig_output: The original output tensor.
-grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-output: Gradients w.r.t. the input to `max_pool`.
-)doc");
+    });
 
 REGISTER_OP("MaxPoolGradV2")
     .Attr(GetPaddingAttrString())
@@ -1447,24 +752,7 @@ REGISTER_OP("MaxPoolGradV2")
     .Attr("T: realnumbertype = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       return UnchangedShapeWithRank(c, 4);
-    })
-    .Doc(R"doc(
-Computes gradients of the maxpooling function.
-
-ksize: The size of the window for each dimension of the input tensor.
-strides: The stride of the sliding window for each dimension of the
-  input tensor.
-padding: The type of padding algorithm to use.
-data_format: Specify the data format of the input and output data. With the
-    default format "NHWC", the data is stored in the order of:
-        [batch, in_height, in_width, in_channels].
-    Alternatively, the format could be "NCHW", the data storage order of:
-        [batch, in_channels, in_height, in_width].
-orig_input: The original input tensor.
-orig_output: The original output tensor.
-grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-output: Gradients w.r.t. the input to `max_pool`.
-)doc");
+    });
 
 REGISTER_OP("MaxPoolGradGrad")
     .Attr("ksize: list(int) >= 4")
@@ -1484,24 +772,7 @@ REGISTER_OP("MaxPoolGradGrad")
       // Validate 'orig_output' is same shape as 'output'
       TF_RETURN_IF_ERROR(c->Merge(c->input(1), c->output(0), &unused));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes second-order gradients of the maxpooling function.
-
-ksize: The size of the window for each dimension of the input tensor.
-strides: The stride of the sliding window for each dimension of the
-  input tensor.
-padding: The type of padding algorithm to use.
-data_format: Specify the data format of the input and output data. With the
-    default format "NHWC", the data is stored in the order of:
-        [batch, in_height, in_width, in_channels].
-    Alternatively, the format could be "NCHW", the data storage order of:
-        [batch, in_channels, in_height, in_width].
-orig_input: The original input tensor.
-orig_output: The original output tensor.
-grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
-output: Gradients of gradients w.r.t. the input to `max_pool`.
-)doc");
+    });
 
 REGISTER_OP("MaxPoolGradGradV2")
     .Attr(GetPaddingAttrString())
@@ -1521,24 +792,7 @@ REGISTER_OP("MaxPoolGradGradV2")
       // Validate 'orig_output' is same shape as 'output'
       TF_RETURN_IF_ERROR(c->Merge(c->input(1), c->output(0), &unused));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes second-order gradients of the maxpooling function.
-
-ksize: The size of the window for each dimension of the input tensor.
-strides: The stride of the sliding window for each dimension of the
-  input tensor.
-padding: The type of padding algorithm to use.
-data_format: Specify the data format of the input and output data. With the
-    default format "NHWC", the data is stored in the order of:
-        [batch, in_height, in_width, in_channels].
-    Alternatively, the format could be "NCHW", the data storage order of:
-        [batch, in_channels, in_height, in_width].
-orig_input: The original input tensor.
-orig_output: The original output tensor.
-grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
-output: Gradients of gradients w.r.t. the input to `max_pool`.
-)doc");
+    });
 
 REGISTER_OP("MaxPoolWithArgmax")
     .Attr("ksize: list(int) >= 4")
@@ -1553,27 +807,7 @@ REGISTER_OP("MaxPoolWithArgmax")
       TF_RETURN_IF_ERROR(shape_inference::MaxPoolShape(c));
       c->set_output(1, c->output(0));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Performs max pooling on the input and outputs both max values and indices.
-
-The indices in `argmax` are flattened, so that a maximum value at position
-`[b, y, x, c]` becomes flattened index
-`((b * height + y) * width + x) * channels + c`.
-
-The indices returned are always in `[0, height) x [0, width)` before flattening,
-even if padding is involved and the mathematically correct answer is outside
-(either negative or too large).  This is a bug, but fixing it is difficult to do
-in a safe backwards compatible way, especially due to flattening.
-
-ksize: The size of the window for each dimension of the input tensor.
-strides: The stride of the sliding window for each dimension of the
-  input tensor.
-padding: The type of padding algorithm to use.
-input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
-output: The max pooled output tensor.
-argmax: 4-D.  The flattened indices of the max values chosen for each output.
-)doc");
+    });
 
 REGISTER_OP("MaxPoolGradWithArgmax")
     .Attr("ksize: list(int) >= 4")
@@ -1587,20 +821,7 @@ REGISTER_OP("MaxPoolGradWithArgmax")
     .Attr("T: realnumbertype")
     .SetShapeFn([](InferenceContext* c) {
       return UnchangedShapeWithRank(c, 4);
-    })
-    .Doc(R"doc(
-Computes gradients of the maxpooling function.
-
-ksize: The size of the window for each dimension of the input tensor.
-strides: The stride of the sliding window for each dimension of the
-  input tensor.
-padding: The type of padding algorithm to use.
-input: The original input.
-grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
-  output of `max_pool`.
-argmax: The indices of the maximum values chosen for each output of `max_pool`.
-output: Gradients w.r.t. the input of `max_pool`.
-)doc");
+    });
 
 REGISTER_OP("MaxPoolGradGradWithArgmax")
     .Attr("ksize: list(int) >= 4")
@@ -1620,20 +841,7 @@ REGISTER_OP("MaxPoolGradGradWithArgmax")
       // Validate 'argmax' is same shape as 'output'
       TF_RETURN_IF_ERROR(c->Merge(c->input(2), c->output(0), &unused));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes second-order gradients of the maxpooling function.
-
-ksize: The size of the window for each dimension of the input tensor.
-strides: The stride of the sliding window for each dimension of the
-  input tensor.
-padding: The type of padding algorithm to use.
-input: The original input.
-grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
-  input of `max_pool`.
-argmax: The indices of the maximum values chosen for each output of `max_pool`.
-output: Gradients of gradients w.r.t. the input of `max_pool`.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -1717,43 +925,7 @@ REGISTER_OP("Dilation2D")
           {batch_size_dim, output_rows, output_cols, output_depth_dim});
       c->set_output(0, output_shape);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
-
-The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
-`filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
-input channel is processed independently of the others with its own structuring
-function. The `output` tensor has shape
-`[batch, out_height, out_width, depth]`. The spatial dimensions of the output
-tensor depend on the `padding` algorithm. We currently only support the default
-"NHWC" `data_format`.
-
-In detail, the grayscale morphological 2-D dilation is the max-sum correlation
-(for consistency with `conv2d`, we use unmirrored filters):
-
-    output[b, y, x, c] =
-       max_{dy, dx} input[b,
-                          strides[1] * y + rates[1] * dy,
-                          strides[2] * x + rates[2] * dx,
-                          c] +
-                    filter[dy, dx, c]
-
-Max-pooling is a special case when the filter has size equal to the pooling
-kernel size and contains all zeros.
-
-Note on duality: The dilation of `input` by the `filter` is equal to the
-negation of the erosion of `-input` by the reflected `filter`.
-
-input: 4-D with shape `[batch, in_height, in_width, depth]`.
-filter: 3-D with shape `[filter_height, filter_width, depth]`.
-strides: The stride of the sliding window for each dimension of the input
- tensor. Must be: `[1, stride_height, stride_width, 1]`.
-rates: The input stride for atrous morphological dilation. Must be:
- `[1, rate_height, rate_width, 1]`.
-padding: The type of padding algorithm to use.
-output: 4-D with shape `[batch, out_height, out_width, depth]`.
-)doc");
+    });
 
 REGISTER_OP("Dilation2DBackpropInput")
     .Input("input: T")
@@ -1764,20 +936,7 @@ REGISTER_OP("Dilation2DBackpropInput")
     .Attr("strides: list(int) >= 4")
     .Attr("rates: list(int) >= 4")
     .Attr(GetPaddingAttrString())
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Computes the gradient of morphological 2-D dilation with respect to the input.
-
-input: 4-D with shape `[batch, in_height, in_width, depth]`.
-filter: 3-D with shape `[filter_height, filter_width, depth]`.
-out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-in_backprop: 4-D with shape `[batch, in_height, in_width, depth]`.
-strides: 1-D of length 4. The stride of the sliding window for each dimension of
-  the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-rates: 1-D of length 4. The input stride for atrous morphological dilation.
-  Must be: `[1, rate_height, rate_width, 1]`.
-padding: The type of padding algorithm to use.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Dilation2DBackpropFilter")
     .Input("input: T")
@@ -1791,20 +950,7 @@ REGISTER_OP("Dilation2DBackpropFilter")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->input(1));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the gradient of morphological 2-D dilation with respect to the filter.
-
-input: 4-D with shape `[batch, in_height, in_width, depth]`.
-filter: 3-D with shape `[filter_height, filter_width, depth]`.
-out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-filter_backprop: 3-D with shape `[filter_height, filter_width, depth]`.
-strides: 1-D of length 4. The stride of the sliding window for each dimension of
-  the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-rates: 1-D of length 4. The input stride for atrous morphological dilation.
-  Must be: `[1, rate_height, rate_width, 1]`.
-padding: The type of padding algorithm to use.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -1812,190 +958,99 @@ REGISTER_OP("Relu")
     .Input("features: T")
     .Output("activations: T")
     .Attr("T: realnumbertype")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Computes rectified linear: `max(features, 0)`.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("ReluGrad")
     .Input("gradients: T")
     .Input("features: T")
     .Output("backprops: T")
     .Attr("T: realnumbertype")
-    .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
-    .Doc(R"doc(
-Computes rectified linear gradients for a Relu operation.
-
-gradients: The backpropagated gradients to the corresponding Relu operation.
-features: The features passed as input to the corresponding Relu operation, OR
-  the outputs of that operation (both work equivalently).
-backprops: `gradients * (features > 0)`.
-)doc");
+    .SetShapeFn(shape_inference::MergeBothInputsShapeFn);
 
 REGISTER_OP("Relu6")
     .Input("features: T")
     .Output("activations: T")
     .Attr("T: realnumbertype")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Computes rectified linear 6: `min(max(features, 0), 6)`.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Relu6Grad")
     .Input("gradients: T")
     .Input("features: T")
     .Output("backprops: T")
     .Attr("T: realnumbertype")
-    .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
-    .Doc(R"doc(
-Computes rectified linear 6 gradients for a Relu6 operation.
-
-gradients: The backpropagated gradients to the corresponding Relu6 operation.
-features: The features passed as input to the corresponding Relu6 operation, or
-  its output; using either one produces the same result.
-backprops: The gradients:
-  `gradients * (features > 0) * (features < 6)`.
-)doc");
+    .SetShapeFn(shape_inference::MergeBothInputsShapeFn);
 
 REGISTER_OP("Elu")
     .Input("features: T")
     .Output("activations: T")
-    .Attr("T: {half, float, double}")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
-
-See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
-](http://arxiv.org/abs/1511.07289)
-)doc");
+    .Attr("T: {half, bfloat16, float, double}")
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("EluGrad")
     .Input("gradients: T")
     .Input("outputs: T")
     .Output("backprops: T")
-    .Attr("T: {half, float, double}")
-    .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
-    .Doc(R"doc(
-Computes gradients for the exponential linear (Elu) operation.
-
-gradients: The backpropagated gradients to the corresponding Elu operation.
-outputs: The outputs of the corresponding Elu operation.
-backprops: The gradients: `gradients * (outputs + 1)` if outputs < 0,
-`gradients` otherwise.
-)doc");
+    .Attr("T: {half, bfloat16, float, double}")
+    .SetShapeFn(shape_inference::MergeBothInputsShapeFn);
 
 REGISTER_OP("Selu")
     .Input("features: T")
     .Output("activations: T")
-    .Attr("T: {half, float, double}")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
-if < 0, `scale * features` otherwise.
-
-See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
-)doc");
+    .Attr("T: {half, bfloat16, float, double}")
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("SeluGrad")
     .Input("gradients: T")
     .Input("outputs: T")
     .Output("backprops: T")
-    .Attr("T: {half, float, double}")
-    .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
-    .Doc(R"doc(
-Computes gradients for the scaled exponential linear (Selu) operation.
-
-gradients: The backpropagated gradients to the corresponding Selu operation.
-outputs: The outputs of the corresponding Selu operation.
-backprops: The gradients: `gradients * (outputs + scale * alpha)`
-if outputs < 0, `scale * gradients` otherwise.
-)doc");
+    .Attr("T: {half, bfloat16, float, double}")
+    .SetShapeFn(shape_inference::MergeBothInputsShapeFn);
 
 REGISTER_OP("Softplus")
     .Input("features: T")
     .Output("activations: T")
     .Attr("T: realnumbertype")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Computes softplus: `log(exp(features) + 1)`.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("SoftplusGrad")
     .Input("gradients: T")
     .Input("features: T")
     .Output("backprops: T")
     .Attr("T: realnumbertype")
-    .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
-    .Doc(R"doc(
-Computes softplus gradients for a softplus operation.
-
-gradients: The backpropagated gradients to the corresponding softplus operation.
-features: The features passed as input to the corresponding softplus operation.
-backprops: The gradients: `gradients / (1 + exp(-features))`.
-)doc");
+    .SetShapeFn(shape_inference::MergeBothInputsShapeFn);
 
 REGISTER_OP("Softsign")
     .Input("features: T")
     .Output("activations: T")
     .Attr("T: realnumbertype")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Computes softsign: `features / (abs(features) + 1)`.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("SoftsignGrad")
     .Input("gradients: T")
     .Input("features: T")
     .Output("backprops: T")
     .Attr("T: realnumbertype")
-    .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
-    .Doc(R"doc(
-Computes softsign gradients for a softsign operation.
-
-gradients: The backpropagated gradients to the corresponding softsign operation.
-features: The features passed as input to the corresponding softsign operation.
-backprops: The gradients: `gradients / (1 + abs(features)) ** 2`.
-)doc");
+    .SetShapeFn(shape_inference::MergeBothInputsShapeFn);
 
 // --------------------------------------------------------------------------
 
 REGISTER_OP("Softmax")
     .Input("logits: T")
     .Output("softmax: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 1);
-    })
-    .Doc(R"doc(
-Computes softmax activations.
-
-For each batch `i` and class `j` we have
-
-    softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))
-
-logits: 2-D with shape `[batch_size, num_classes]`.
-softmax: Same shape as `logits`.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
 REGISTER_OP("LogSoftmax")
     .Input("logits: T")
     .Output("logsoftmax: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 1);
-    })
-    .Doc(R"doc(
-Computes log softmax activations.
-
-For each batch `i` and class `j` we have
-
-    logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
-
-logits: 2-D with shape `[batch_size, num_classes]`.
-logsoftmax: Same shape as `logits`.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -2004,7 +1059,7 @@ REGISTER_OP("SoftmaxCrossEntropyWithLogits")
     .Input("labels: T")
     .Output("loss: T")
     .Output("backprop: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input));
@@ -2014,26 +1069,14 @@ REGISTER_OP("SoftmaxCrossEntropyWithLogits")
       c->set_output(0, c->Vector(batch_size));
       c->set_output(1, input);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes softmax cross entropy cost and gradients to backpropagate.
-
-Inputs are the logits, not probabilities.
-
-features: batch_size x num_classes matrix
-labels: batch_size x num_classes matrix
-  The caller must ensure that each batch of labels represents a valid
-  probability distribution.
-loss: Per example loss (batch_size vector).
-backprop: backpropagated gradients (batch_size x num_classes matrix).
-)doc");
+    });
 
 REGISTER_OP("SparseSoftmaxCrossEntropyWithLogits")
     .Input("features: T")
     .Input("labels: Tlabels")
     .Output("loss: T")
     .Output("backprop: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("Tlabels: {int32, int64} = DT_INT64")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle features;
@@ -2049,23 +1092,7 @@ REGISTER_OP("SparseSoftmaxCrossEntropyWithLogits")
       c->set_output(0, c->Vector(batch_size));
       c->set_output(1, features);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes softmax cross entropy cost and gradients to backpropagate.
-
-Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
-a matrix of label probabilities, but rather a single label per row
-of features.  This label is considered to have probability 1.0 for the
-given row.
-
-Inputs are the logits, not probabilities.
-
-features: batch_size x num_classes matrix
-labels: batch_size vector with values in [0, num_classes).
-  This is the label for the given minibatch entry.
-loss: Per example loss (batch_size vector).
-backprop: backpropagated gradients (batch_size x num_classes matrix).
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -2085,31 +1112,7 @@ REGISTER_OP("InTopK")
           c->Merge(c->Dim(predictions, 0), c->Dim(targets, 0), &batch_size));
       c->set_output(0, c->Vector(batch_size));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Says whether the targets are in the top `K` predictions.
-
-This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-prediction for the target class is among the top `k` predictions among
-all predictions for example `i`. Note that the behavior of `InTopK` differs
-from the `TopK` op in its handling of ties; if multiple classes have the
-same prediction value and straddle the top-`k` boundary, all of those
-classes are considered to be in the top `k`.
-
-More formally, let
-
-  \\(predictions_i\\) be the predictions for all classes for example `i`,
-  \\(targets_i\\) be the target class for example `i`,
-  \\(out_i\\) be the output for example `i`,
-
-$$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
-
-predictions: A `batch_size` x `classes` tensor.
-targets: A `batch_size` vector of class ids.
-k: Number of top elements to look at for computing precision.
-precision: Computed Precision at `k` as a `bool Tensor`.
-
-)doc");
+    });
 
 // This is the same as `InTopK`, but takes `k` as in input rather than an attr.
 REGISTER_OP("InTopKV2")
@@ -2128,31 +1131,7 @@ REGISTER_OP("InTopKV2")
           c->Merge(c->Dim(predictions, 0), c->Dim(targets, 0), &batch_size));
       c->set_output(0, c->Vector(batch_size));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Says whether the targets are in the top `K` predictions.
-
-This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-prediction for the target class is among the top `k` predictions among
-all predictions for example `i`. Note that the behavior of `InTopK` differs
-from the `TopK` op in its handling of ties; if multiple classes have the
-same prediction value and straddle the top-`k` boundary, all of those
-classes are considered to be in the top `k`.
-
-More formally, let
-
-  \\(predictions_i\\) be the predictions for all classes for example `i`,
-  \\(targets_i\\) be the target class for example `i`,
-  \\(out_i\\) be the output for example `i`,
-
-$$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
-
-predictions: A `batch_size` x `classes` tensor.
-targets: A `batch_size` vector of class ids.
-k: Number of top elements to look at for computing precision.
-precision: Computed precision at `k` as a `bool Tensor`.
-
-)doc");
+    });
 
 namespace {
 
@@ -2176,9 +1155,9 @@ Status TopKShapeFn(InferenceContext* c) {
   DimensionHandle last_dim = c->Dim(input, -1);
   if (c->ValueKnown(last_dim) && c->ValueKnown(k_dim) &&
       c->Value(last_dim) < c->Value(k_dim)) {
-    return errors::InvalidArgument("input must have last dimension >= k = ",
-                                   c->Value(k_dim), " but is ",
-                                   c->Value(last_dim));
+    return errors::InvalidArgument(
+        "input must have last dimension >= k = ", c->Value(k_dim), " but is ",
+        c->Value(last_dim));
   }
 
   // Replace last_dim with k_dim.
@@ -2200,31 +1179,7 @@ REGISTER_OP("TopK")
     .Attr("sorted: bool = true")
     .Attr("T: realnumbertype")
     .Deprecated(7, "Use TopKV2 instead")
-    .SetShapeFn(TopKShapeFn)
-    .Doc(R"doc(
-Finds values and indices of the `k` largest elements for the last dimension.
-
-If the input is a vector (rank-1), finds the `k` largest entries in the vector
-and outputs their values and indices as vectors.  Thus `values[j]` is the
-`j`-th largest entry in `input`, and its index is `indices[j]`.
-
-For matrices (resp. higher rank input), computes the top `k` entries in each
-row (resp. vector along the last dimension).  Thus,
-
-    values.shape = indices.shape = input.shape[:-1] + [k]
-
-If two elements are equal, the lower-index element appears first.
-
-If `k` varies dynamically, use `TopKV2` below.
-
-input: 1-D or higher with last dimension at least `k`.
-k: Number of top elements to look for along the last dimension (along each
-  row for matrices).
-sorted: If true the resulting `k` elements will be sorted by the values in
-  descending order.
-values: The `k` largest elements along each last dimensional slice.
-indices: The indices of `values` within the last dimension of `input`.
-)doc");
+    .SetShapeFn(TopKShapeFn);
 
 // This is the same as `TopK`, but takes `k` as in input rather than an attr.
 REGISTER_OP("TopKV2")
@@ -2234,29 +1189,7 @@ REGISTER_OP("TopKV2")
     .Output("indices: int32")
     .Attr("sorted: bool = true")
     .Attr("T: realnumbertype")
-    .SetShapeFn(TopKShapeFn)
-    .Doc(R"doc(
-Finds values and indices of the `k` largest elements for the last dimension.
-
-If the input is a vector (rank-1), finds the `k` largest entries in the vector
-and outputs their values and indices as vectors.  Thus `values[j]` is the
-`j`-th largest entry in `input`, and its index is `indices[j]`.
-
-For matrices (resp. higher rank input), computes the top `k` entries in each
-row (resp. vector along the last dimension).  Thus,
-
-    values.shape = indices.shape = input.shape[:-1] + [k]
-
-If two elements are equal, the lower-index element appears first.
-
-input: 1-D or higher with last dimension at least `k`.
-k: 0-D.  Number of top elements to look for along the last dimension (along each
-  row for matrices).
-sorted: If true the resulting `k` elements will be sorted by the values in
-  descending order.
-values: The `k` largest elements along each last dimensional slice.
-indices: The indices of `values` within the last dimension of `input`.
-)doc");
+    .SetShapeFn(TopKShapeFn);
 
 // --------------------------------------------------------------------------
 
@@ -2278,9 +1211,9 @@ REGISTER_OP("NthElement")
       DimensionHandle last_dim = c->Dim(input, -1);
       if (c->ValueKnown(last_dim) && c->ValueKnown(n_dim) &&
           c->Value(last_dim) <= c->Value(n_dim)) {
-        return errors::InvalidArgument("Input must have last dimension > n = ",
-                                       c->Value(n_dim), " but is ",
-                                       c->Value(last_dim));
+        return errors::InvalidArgument(
+            "Input must have last dimension > n = ", c->Value(n_dim),
+            " but is ", c->Value(last_dim));
       }
 
       // Reduce last_dim for output tensor
@@ -2288,25 +1221,7 @@ REGISTER_OP("NthElement")
       TF_RETURN_IF_ERROR(c->Subshape(input, 0, -1, &s));
       c->set_output(0, s);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Finds values of the `n`-th order statistic for the last dimension.
-
-If the input is a vector (rank-1), finds the entries which is the nth-smallest
-value in the vector and outputs their values as scalar tensor.
-
-For matrices (resp. higher rank input), computes the entries which is the
-nth-smallest value in each row (resp. vector along the last dimension). Thus,
-
-    values.shape = input.shape[:-1]
-
-input: 1-D or higher with last dimension at least `n+1`.
-n: 0-D. Position of sorted vector to select along the last dimension (along
-  each row for matrices). Valid range of n is `[0, input.shape[:-1])`
-reverse: When set to True, find the nth-largest value in the vector and vice
-  versa.
-values: The `n`-th order statistic along each last dimensional slice.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -2322,70 +1237,7 @@ REGISTER_OP("FractionalMaxPool")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .Attr("T: {float, double, int32, int64}")
-    .SetShapeFn(FractionalPoolShapeFn)
-    .Doc(R"doc(
-Performs fractional max pooling on the input.
-
-Fractional max pooling is slightly different than regular max pooling.  In
-regular max pooling, you downsize an input set by taking the maximum value of
-smaller N x N subsections of the set (often 2x2), and try to reduce the set by
-a factor of N, where N is an integer.  Fractional max pooling, as you might
-expect from the word "fractional", means that the overall reduction ratio N
-does not have to be an integer.
-
-The sizes of the pooling regions are generated randomly but are fairly uniform.
-For example, let's look at the height dimension, and the constraints on the
-list of rows that will be pool boundaries.
-
-First we define the following:
-
-1.  input_row_length : the number of rows from the input set
-2.  output_row_length : which will be smaller than the input
-3.  alpha = input_row_length / output_row_length : our reduction ratio
-4.  K = floor(alpha)
-5.  row_pooling_sequence : this is the result list of pool boundary rows
-
-Then, row_pooling_sequence should satisfy:
-
-1.  a[0] = 0 : the first value of the sequence is 0
-2.  a[end] = input_row_length : the last value of the sequence is the size
-3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
-4.  length(row_pooling_sequence) = output_row_length+1
-
-For more details on fractional max pooling, see this paper:
-[Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
-
-value: 4-D with shape `[batch, height, width, channels]`.
-pooling_ratio: Pooling ratio for each dimension of `value`, currently only
-  supports row and col dimension and should be >= 1.0. For example, a valid
-  pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
-  must be 1.0 because we don't allow pooling on batch and channels
-  dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
-  respectively.
-pseudo_random: When set to True, generates the pooling sequence in a
-  pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-  Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
-  difference between pseudorandom and random.
-overlapping: When set to True, it means when pooling, the values at the boundary
-  of adjacent pooling cells are used by both cells. For example:
-
-  `index  0  1  2  3  4`
-
-  `value  20 5  16 3  7`
-
-  If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-  The result would be [20, 16] for fractional max pooling.
-deterministic: When set to True, a fixed pooling region will be used when
-  iterating over a FractionalMaxPool node in the computation graph. Mainly used
-  in unit test to make FractionalMaxPool deterministic.
-seed: If either seed or seed2 are set to be non-zero, the random number
-  generator is seeded by the given seed.  Otherwise, it is seeded by a
-  random seed.
-seed2: An second seed to avoid seed collision.
-output: output tensor after fractional max pooling.
-row_pooling_sequence: row pooling sequence, needed to calculate gradient.
-col_pooling_sequence: column pooling sequence, needed to calculate gradient.
-)doc");
+    .SetShapeFn(FractionalPoolShapeFn);
 
 REGISTER_OP("FractionalMaxPoolGrad")
     .Input("orig_input: T")
@@ -2398,29 +1250,7 @@ REGISTER_OP("FractionalMaxPoolGrad")
     .Attr("T: {float, double, int32, int64}")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRank(c, 4);
-    })
-    .Doc(R"doc(
-Computes gradient of the FractionalMaxPool function.
-
-orig_input: Original input for `fractional_max_pool`
-orig_output: Original output for `fractional_max_pool`
-out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-  w.r.t. the output of `fractional_max_pool`.
-row_pooling_sequence: row pooling sequence, form pooling region with
-  col_pooling_sequence.
-col_pooling_sequence: column pooling sequence, form pooling region with
-  row_pooling sequence.
-overlapping: When set to True, it means when pooling, the values at the boundary
-  of adjacent pooling cells are used by both cells. For example:
-
-  `index  0  1  2  3  4`
-
-  `value  20 5  16 3  7`
-
-  If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-  The result would be [20, 16] for fractional max pooling.
-output: 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
-)doc");
+    });
 
 // --------------------------------------------------------------------------
 
@@ -2436,46 +1266,7 @@ REGISTER_OP("FractionalAvgPool")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .Attr("T: {float, double, int32, int64}")
-    .SetShapeFn(FractionalPoolShapeFn)
-    .Doc(R"doc(
-Performs fractional average pooling on the input.
-
-Fractional average pooling is similar to Fractional max pooling in the pooling
-region generation step. The only difference is that after pooling regions are
-generated, a mean operation is performed instead of a max operation in each
-pooling region.
-
-value: 4-D with shape `[batch, height, width, channels]`.
-pooling_ratio: Pooling ratio for each dimension of `value`, currently only
-  supports row and col dimension and should be >= 1.0. For example, a valid
-  pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
-  must be 1.0 because we don't allow pooling on batch and channels
-  dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
-  respectively.
-pseudo_random: When set to True, generates the pooling sequence in a
-  pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-  Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
-  difference between pseudorandom and random.
-overlapping: When set to True, it means when pooling, the values at the boundary
-  of adjacent pooling cells are used by both cells. For example:
-
-  `index  0  1  2  3  4`
-
-  `value  20 5  16 3  7`
-
-  If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-  The result would be [41/3, 26/3] for fractional avg pooling.
-deterministic: When set to True, a fixed pooling region will be used when
-  iterating over a FractionalAvgPool node in the computation graph. Mainly used
-  in unit test to make FractionalAvgPool deterministic.
-seed: If either seed or seed2 are set to be non-zero, the random number
-  generator is seeded by the given seed.  Otherwise, it is seeded by a
-  random seed.
-seed2: An second seed to avoid seed collision.
-output: output tensor after fractional avg pooling.
-row_pooling_sequence: row pooling sequence, needed to calculate gradient.
-col_pooling_sequence: column pooling sequence, needed to calculate gradient.
-)doc");
+    .SetShapeFn(FractionalPoolShapeFn);
 
 REGISTER_OP("FractionalAvgPoolGrad")
     .Input("orig_input_tensor_shape: int64")
@@ -2494,34 +1285,7 @@ REGISTER_OP("FractionalAvgPoolGrad")
         c->set_output(0, c->UnknownShapeOfRank(4));
       }
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes gradient of the FractionalAvgPool function.
-
-Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
-FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
-out_backprop to those indices that form the same pooling cell. Therefore, we
-just need to know the shape of original input tensor, instead of the whole
-tensor.
-
-orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
-out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-  w.r.t. the output of `fractional_avg_pool`.
-row_pooling_sequence: row pooling sequence, form pooling region with
-  col_pooling_sequence.
-col_pooling_sequence: column pooling sequence, form pooling region with
-  row_pooling sequence.
-overlapping: When set to True, it means when pooling, the values at the boundary
-  of adjacent pooling cells are used by both cells. For example:
-
-  `index  0  1  2  3  4`
-
-  `value  20 5  16 3  7`
-
-  If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-  The result would be [41/3, 26/3] for fractional avg pooling.
-output: 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
-)doc");
+    });
 
 REGISTER_OP("QuantizedAvgPool")
     .Input("input: T")
@@ -2542,22 +1306,7 @@ REGISTER_OP("QuantizedAvgPool")
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       return Status::OK();
-    })
-    .Doc(R"doc(
-Produces the average pool of the input tensor for quantized types.
-
-input: 4-D with shape `[batch, height, width, channels]`.
-ksize: The size of the window for each dimension of the input tensor.
-  The length must be 4 to match the number of dimensions of the input.
-strides: The stride of the sliding window for each dimension of the input
-  tensor.  The length must be 4 to match the number of dimensions of the input.
-padding: The type of padding algorithm to use.
-min_input: The float value that the lowest quantized input value represents.
-max_input: The float value that the highest quantized input value represents.
-min_output: The float value that the lowest quantized output value represents.
-max_output: The float value that the highest quantized output value represents.
-
-)doc");
+    });
 
 REGISTER_OP("QuantizedBiasAdd")
     .Input("input: T1")
@@ -2582,21 +1331,7 @@ REGISTER_OP("QuantizedBiasAdd")
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       return Status::OK();
-    })
-    .Doc(R"doc(
-Adds Tensor 'bias' to Tensor 'input' for Quantized types.
-
-Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
-
-bias: A 1D bias Tensor with size matching the last dimension of 'input'.
-min_input: The float value that the lowest quantized input value represents.
-max_input: The float value that the highest quantized input value represents.
-min_bias: The float value that the lowest quantized bias value represents.
-max_bias: The float value that the highest quantized bias value represents.
-min_out: The float value that the lowest quantized output value represents.
-max_out: The float value that the highest quantized output value represents.
-
-)doc");
+    });
 
 REGISTER_OP("QuantizedConv2D")
     .Input("input: Tinput")
@@ -2613,6 +1348,7 @@ REGISTER_OP("QuantizedConv2D")
     .Attr("out_type: quantizedtype = DT_QINT32")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -2623,26 +1359,7 @@ REGISTER_OP("QuantizedConv2D")
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes a 2D convolution given quantized 4D input and filter tensors.
-The inputs are quantized tensors where the lowest value represents the real
-number of the associated minimum, and the highest represents the maximum.
-This means that you can only interpret the quantized output in the same way, by
-taking the returned minimum and maximum values into account.
-
-filter: filter's input_depth dimension must match input's depth dimensions.
-strides: The stride of the sliding window for each dimension of the input
-  tensor.
-padding: The type of padding algorithm to use.
-min_input: The float value that the lowest quantized input value represents.
-max_input: The float value that the highest quantized input value represents.
-min_filter: The float value that the lowest quantized filter value represents.
-max_filter: The float value that the highest quantized filter value represents.
-min_output: The float value that the lowest quantized output value represents.
-max_output: The float value that the highest quantized output value represents.
-
-)doc");
+    });
 
 REGISTER_OP("QuantizedMaxPool")
     .Input("input: T")
@@ -2663,22 +1380,7 @@ REGISTER_OP("QuantizedMaxPool")
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       return Status::OK();
-    })
-    .Doc(R"doc(
-Produces the max pool of the input tensor for quantized types.
-
-input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
-ksize: The size of the window for each dimension of the input tensor.
-  The length must be 4 to match the number of dimensions of the input.
-strides: The stride of the sliding window for each dimension of the input
-  tensor. The length must be 4 to match the number of dimensions of the input.
-padding: The type of padding algorithm to use.
-min_input: The float value that the lowest quantized input value represents.
-max_input: The float value that the highest quantized input value represents.
-min_output: The float value that the lowest quantized output value represents.
-max_output: The float value that the highest quantized output value represents.
-
-)doc");
+    });
 
 REGISTER_OP("QuantizedRelu")
     .Input("features: Tinput")
@@ -2697,17 +1399,7 @@ REGISTER_OP("QuantizedRelu")
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes Quantized Rectified Linear: `max(features, 0)`
-
-activations: Has the same output shape as "features".
-min_features: The float value that the lowest quantized value represents.
-max_features: The float value that the highest quantized value represents.
-min_activations: The float value that the lowest quantized value represents.
-max_activations: The float value that the highest quantized value represents.
-
-)doc");
+    });
 
 REGISTER_OP("QuantizedRelu6")
     .Input("features: Tinput")
@@ -2726,17 +1418,7 @@ REGISTER_OP("QuantizedRelu6")
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
-
-activations: Has the same output shape as "features".
-min_features: The float value that the lowest quantized value represents.
-max_features: The float value that the highest quantized value represents.
-min_activations: The float value that the lowest quantized value represents.
-max_activations: The float value that the highest quantized value represents.
-
-)doc");
+    });
 
 REGISTER_OP("QuantizedReluX")
     .Input("features: Tinput")
@@ -2756,17 +1438,7 @@ REGISTER_OP("QuantizedReluX")
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
-
-activations: Has the same output shape as "features".
-min_features: The float value that the lowest quantized value represents.
-max_features: The float value that the highest quantized value represents.
-min_activations: The float value that the lowest quantized value represents.
-max_activations: The float value that the highest quantized value represents.
-
-)doc");
+    });
 
 REGISTER_OP("QuantizedBatchNormWithGlobalNormalization")
     .Input("t: Tinput")
@@ -2809,39 +1481,7 @@ REGISTER_OP("QuantizedBatchNormWithGlobalNormalization")
       c->set_output(2, c->Scalar());
 
       return Status::OK();
-    })
-    .Doc(R"doc(
-Quantized Batch normalization.
-
-This op is deprecated and will be removed in the future. Prefer
-`tf.nn.batch_normalization`.
-
-t: A 4D input Tensor.
-t_min: The value represented by the lowest quantized input.
-t_max: The value represented by the highest quantized input.
-m: A 1D mean Tensor with size matching the last dimension of t.
-  This is the first output from tf.nn.moments,
-  or a saved moving average thereof.
-m_min: The value represented by the lowest quantized mean.
-m_max: The value represented by the highest quantized mean.
-v: A 1D variance Tensor with size matching the last dimension of t.
-  This is the second output from tf.nn.moments,
-  or a saved moving average thereof.
-v_min: The value represented by the lowest quantized variance.
-v_max: The value represented by the highest quantized variance.
-beta: A 1D beta Tensor with size matching the last dimension of t.
-  An offset to be added to the normalized tensor.
-beta_min: The value represented by the lowest quantized offset.
-beta_max: The value represented by the highest quantized offset.
-gamma: A 1D gamma Tensor with size matching the last dimension of t.
-  If "scale_after_normalization" is true, this tensor will be multiplied
-  with the normalized tensor.
-gamma_min: The value represented by the lowest quantized gamma.
-gamma_max: The value represented by the highest quantized gamma.
-variance_epsilon: A small float number to avoid dividing by 0.
-scale_after_normalization: A bool indicating whether the resulted tensor
-  needs to be multiplied with gamma.
-)doc");
+    });
 
 #ifdef INTEL_MKL
 REGISTER_OP("_MklConv2D")
@@ -2866,6 +1506,25 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+REGISTER_OP("__MklDummyConv2DWithBias")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("bias: T")
+    .Output("output: T")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int)")
+    .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Doc(R"doc(
+Dummy node that enables fusing Conv2D and BiasAdd operator for MKL. This node
+does not perform anything. It is just created as an intermediate output of
+merging Conv2D and BiasAdd.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
 REGISTER_OP("_MklConv2DWithBias")
     .Input("input: T")
     .Input("filter: T")
@@ -2919,6 +1578,88 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+REGISTER_OP("__MklDummyConv2DBackpropFilterWithBias")
+    .Input("input: T")
+    .Input("filter_sizes: int32")
+    .Input("out_backprop: T")
+    .Output("output: T")
+    .Output("bias_grad: T")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int)")
+    .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle input_shape;
+      // Fetch the data_format attribute, which may not exist.
+      string data_format;
+      Status s = c->GetAttr("data_format", &data_format);
+
+      if (s.ok() && data_format == "NCHW") {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
+        c->set_output(1, c->Vector(c->Dim(input_shape, -3)));
+      } else {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
+        c->set_output(1, c->Vector(c->Dim(input_shape, -1)));
+      }
+      ShapeHandle sh;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &sh));
+      TF_RETURN_IF_ERROR(c->WithRank(sh, 4, &sh));
+      c->set_output(0, sh);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Dummy node that enables fusing Conv2DBackpropFilter and BiasAddGrad operator
+for MKL. This node does not perform anything. It is just created as an
+intermediate output of merging Conv2DBackpropFilter and BiasAddGrad.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklConv2DBackpropFilterWithBias")
+    .Input("input: T")
+    .Input("filter_sizes: int32")
+    .Input("out_backprop: T")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter_size: uint8")
+    .Input("mkl_out_backprop: uint8")
+    .Output("output: T")
+    .Output("bias_grad: T")
+    .Output("mkl_output: uint8")
+    .Output("mkl_bias_grad: uint8")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int)")
+    .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle input_shape;
+      // Fetch the data_format attribute, which may not exist.
+      string data_format;
+      Status s = c->GetAttr("data_format", &data_format);
+
+      if (s.ok() && data_format == "NCHW") {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
+        c->set_output(1, c->Vector(c->Dim(input_shape, -3)));
+      } else {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
+        c->set_output(1, c->Vector(c->Dim(input_shape, -1)));
+      }
+      ShapeHandle sh;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &sh));
+      TF_RETURN_IF_ERROR(c->WithRank(sh, 4, &sh));
+      c->set_output(0, sh);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+MKL version of Conv2DBackpropFilterWithBias. Uses MKL DNN APIs to compute the
+gradients of convolution with respect to the filter.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
 REGISTER_OP("_MklConv2DWithBiasBackpropBias")
     .Input("out_backprop: T")
     .Input("mkl_out_backprop: uint8")
@@ -2995,6 +1736,78 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+REGISTER_OP("_MklElu")
+    .Input("features: T")
+    .Input("mkl_features: uint8")
+    .Output("activations: T")
+    .Output("mkl_activations: uint8")
+    .Attr("T: realnumbertype")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+MKL version of Elu operator. Uses MKL DNN APIs to implement Elu operator.
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklEluGrad")
+    .Input("gradients: T")
+    .Input("features: T")
+    .Input("mkl_gradients: uint8")
+    .Input("mkl_features: uint8")
+    .Output("backprops: T")
+    .Output("mkl_backprops: uint8")
+    .Attr("T: realnumbertype")
+    .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
+    .Doc(R"doc(
+MKL version of EluGrad operator. Uses MKL DNN APIs to compute Elu
+gradients for Elu operation.
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklSoftmax")
+    .Input("logits: T")
+    .Input("mkl_logits: uint8")
+    .Output("softmax: T")
+    .Output("mkl_softmax: uint8")
+    .Attr("T: {half, float, double}")
+    .SetShapeFn([](InferenceContext* c) {
+      return shape_inference::UnchangedShapeWithRankAtLeast(c, 1);
+    })
+    .Doc(R"doc(
+MKL version of ReluGrad operator. Uses MKL DNN APIs to compute rectified
+linear gradients for Relu operation.
+)doc");
+
+REGISTER_OP("_MklTanh")
+    .Input("features: T")
+    .Input("mkl_features: uint8")
+    .Output("activations: T")
+    .Output("mkl_activations: uint8")
+    .Attr("T: realnumbertype")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+MKL version of Tanh operator. Uses MKL DNN APIs to implement Tanh operator.
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklTanhGrad")
+    .Input("gradients: T")
+    .Input("features: T")
+    .Input("mkl_gradients: uint8")
+    .Input("mkl_features: uint8")
+    .Output("backprops: T")
+    .Output("mkl_backprops: uint8")
+    .Attr("T: realnumbertype")
+    .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
+    .Doc(R"doc(
+MKL version of TanhGrad operator. Uses MKL DNN APIs to compute tanh
+gradients for Tanh operation.
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
 REGISTER_OP("_MklMaxPool")
     .Attr("T: {float, half} = DT_FLOAT")
     .Attr("ksize: list(int) >= 4")
@@ -3005,7 +1818,11 @@ REGISTER_OP("_MklMaxPool")
     .Input("input: T")
     .Input("mkl_input: uint8")
     .Output("output: T")
+#ifdef INTEL_MKL_ML
     .Output("workspace: T")
+#else
+    .Output("workspace: uint8")
+#endif
     .Output("mkl_output: uint8")
     .Output("mkl_workspace: uint8")
     .SetShapeFn(shape_inference::MaxPoolShape)
@@ -3027,7 +1844,11 @@ REGISTER_OP("_MklMaxPoolGrad")
     .Input("orig_input: T")
     .Input("orig_output: T")
     .Input("grad: T")
+#ifdef INTEL_MKL_ML
     .Input("workspace: T")
+#else
+    .Input("workspace: uint8")
+#endif
     .Input("mkl_orig_input: uint8")
     .Input("mkl_orig_output: uint8")
     .Input("mkl_grad: uint8")
@@ -3095,7 +1916,11 @@ REGISTER_OP("_MklLRN")
     .Input("input: T")
     .Input("mkl_input: uint8")
     .Output("output: T")
+#ifdef INTEL_MKL_ML
     .Output("workspace: T")
+#else
+    .Output("workspace: uint8")
+#endif
     .Output("mkl_output: uint8")
     .Output("mkl_workspace: uint8")
     .Attr("depth_radius: int = 5")
@@ -3119,7 +1944,11 @@ REGISTER_OP("_MklLRNGrad")
     .Input("input_grads: T")
     .Input("input_image: T")
     .Input("output_image: T")
+#ifdef INTEL_MKL_ML
     .Input("workspace: T")
+#else
+    .Input("workspace: uint8")
+#endif
     .Input("mkl_input_grads: uint8")
     .Input("mkl_input_image: uint8")
     .Input("mkl_output_image: uint8")
diff --git a/tensorflow/core/ops/no_op.cc b/tensorflow/core/ops/no_op.cc
index e62353bb7f9e0c8b7ec753027a9da274bb6497e7..560e9e8daec0d6556e274f9fa4b12762847093e3 100644
--- a/tensorflow/core/ops/no_op.cc
+++ b/tensorflow/core/ops/no_op.cc
@@ -18,8 +18,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-REGISTER_OP("NoOp")
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc("Does nothing. Only useful as a placeholder for control edges.");
+REGISTER_OP("NoOp").SetShapeFn(shape_inference::NoOutputs);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 9c41957ae6aa4ae1a893f09b6e5282a123831e38..45ff08f38b134f963460d15f949411a7f1619d0c 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -6,7 +6,6 @@ op {
     default_value {
       s: ""
     }
-    description: "A string which is the message associated with the exception."
   }
   attr {
     name: "exit_without_error"
@@ -15,8 +14,6 @@ op {
       b: false
     }
   }
-  summary: "Raise a exception to abort the process when called."
-  description: "If exit_without_error is true, the process will exit normally,\notherwise it will exit with a SIGABORT signal.\n\nReturns nothing but an exception."
 }
 op {
   name: "Abs"
@@ -34,6 +31,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -41,14 +39,11 @@ op {
       }
     }
   }
-  summary: "Computes the absolute value of a tensor."
-  description: "Given a tensor `x`, this operation returns a tensor containing the absolute\nvalue of each element in `x`. For example, if x is an input element and y is\nan output element, this operation computes \\\\(y = |x|\\\\)."
 }
 op {
   name: "AccumulateNV2"
   input_arg {
     name: "inputs"
-    description: "A list of `Tensor` objects, each with same shape and type."
     type_attr: "T"
     number_attr: "N"
   }
@@ -69,17 +64,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -89,10 +85,7 @@ op {
   attr {
     name: "shape"
     type: "shape"
-    description: "Shape of elements of `inputs`."
   }
-  summary: "Returns the element-wise sum of a list of tensors."
-  description: "`tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not\nwait for all of its inputs to be ready before beginning to sum. This can\nsave memory if inputs are ready at different times, since minimum temporary\nstorage is proportional to the output size rather than the inputs size.\n\nUnlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.\n\nReturns a `Tensor` of same shape and type as the elements of `inputs`."
   is_aggregate: true
   is_commutative: true
 }
@@ -100,124 +93,107 @@ op {
   name: "AccumulatorApplyGradient"
   input_arg {
     name: "handle"
-    description: "The handle to a accumulator."
     type: DT_STRING
     is_ref: true
   }
   input_arg {
     name: "local_step"
-    description: "The local_step value at which the gradient was computed."
     type: DT_INT64
   }
   input_arg {
     name: "gradient"
-    description: "A tensor of the gradient to be accumulated."
     type_attr: "dtype"
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "The data type of accumulated gradients. Needs to correspond to the type\nof the accumulator."
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  summary: "Applies a gradient to a given accumulator."
-  description: "Does not add if local_step is lesser than the accumulator\'s global_step."
 }
 op {
   name: "AccumulatorNumAccumulated"
   input_arg {
     name: "handle"
-    description: "The handle to an accumulator."
     type: DT_STRING
     is_ref: true
   }
   output_arg {
     name: "num_accumulated"
-    description: "The number of gradients aggregated in the given accumulator."
     type: DT_INT32
   }
-  summary: "Returns the number of gradients aggregated in the given accumulators."
 }
 op {
   name: "AccumulatorSetGlobalStep"
   input_arg {
     name: "handle"
-    description: "The handle to an accumulator."
     type: DT_STRING
     is_ref: true
   }
   input_arg {
     name: "new_global_step"
-    description: "The new global_step value to set."
     type: DT_INT64
   }
-  summary: "Updates the accumulator with a new value for global_step."
-  description: "Logs warning if the accumulator\'s value is already higher than\nnew_global_step."
 }
 op {
   name: "AccumulatorTakeGradient"
   input_arg {
     name: "handle"
-    description: "The handle to an accumulator."
     type: DT_STRING
     is_ref: true
   }
   input_arg {
     name: "num_required"
-    description: "Number of gradients required before we return an aggregate."
     type: DT_INT32
   }
   output_arg {
     name: "average"
-    description: "The average of the accumulated gradients."
     type_attr: "dtype"
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "The data type of accumulated gradients. Needs to correspond to the type\nof the accumulator."
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  summary: "Extracts the average gradient in the given ConditionalAccumulator."
-  description: "The op blocks until sufficient (i.e., more than num_required)\ngradients have been accumulated.  If the accumulator has already\naggregated more than num_required gradients, it returns the average of\nthe accumulated gradients.  Also automatically increments the recorded\nglobal_step in the accumulator by 1, and resets the aggregate to 0."
 }
 op {
   name: "Acos"
@@ -235,6 +211,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -244,7 +221,6 @@ op {
       }
     }
   }
-  summary: "Computes acos of x element-wise."
 }
 op {
   name: "Acosh"
@@ -262,6 +238,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -269,7 +246,6 @@ op {
       }
     }
   }
-  summary: "Computes inverse hyperbolic cosine of x element-wise."
 }
 op {
   name: "Add"
@@ -291,6 +267,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -304,29 +281,23 @@ op {
       }
     }
   }
-  summary: "Returns x + y element-wise."
-  description: "*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "AddManySparseToTensorsMap"
   input_arg {
     name: "sparse_indices"
-    description: "2-D.  The `indices` of the minibatch `SparseTensor`.\n`sparse_indices[:, 0]` must be ordered values in `[0, N)`."
     type: DT_INT64
   }
   input_arg {
     name: "sparse_values"
-    description: "1-D.  The `values` of the minibatch `SparseTensor`."
     type_attr: "T"
   }
   input_arg {
     name: "sparse_shape"
-    description: "1-D.  The `shape` of the minibatch `SparseTensor`.\nThe minibatch size `N == sparse_shape[0]`."
     type: DT_INT64
   }
   output_arg {
     name: "sparse_handles"
-    description: "1-D.  The handles of the `SparseTensor` now stored in the\n`SparseTensorsMap`.  Shape: `[N]`."
     type: DT_INT64
   }
   attr {
@@ -339,7 +310,6 @@ op {
     default_value {
       s: ""
     }
-    description: "The container name for the `SparseTensorsMap` created by this op."
   }
   attr {
     name: "shared_name"
@@ -347,17 +317,13 @@ op {
     default_value {
       s: ""
     }
-    description: "The shared name for the `SparseTensorsMap` created by this op.\nIf blank, the new Operation\'s unique name is used."
   }
-  summary: "Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles."
-  description: "A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,\n`sparse_values`, and `sparse_shape`, where\n\n```sparse_indices.shape[1] == sparse_shape.shape[0] == R```\n\nAn `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`\nhaving a first `sparse_indices` column taking values between `[0, N)`, where\nthe minibatch size `N == sparse_shape[0]`.\n\nThe input `SparseTensor` must have rank `R` greater than 1, and the first\ndimension is treated as the minibatch dimension.  Elements of the `SparseTensor`\nmust be sorted in increasing order of this first dimension.  The stored\n`SparseTensor` objects pointed to by each row of the output `sparse_handles`\nwill have rank `R-1`.\n\nThe `SparseTensor` values can then be read out as part of a minibatch by passing\nthe given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure\nthe correct `SparseTensorsMap` is accessed, ensure that the same\n`container` and `shared_name` are passed to that Op.  If no `shared_name`\nis provided here, instead use the *name* of the Operation created by calling\n`AddManySparseToTensorsMap` as the `shared_name` passed to\n`TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated."
   is_stateful: true
 }
 op {
   name: "AddN"
   input_arg {
     name: "inputs"
-    description: "Must all be the same size and shape."
     type_attr: "T"
     number_attr: "N"
   }
@@ -378,17 +344,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -396,7 +363,6 @@ op {
       }
     }
   }
-  summary: "Add all input tensors element wise."
   is_aggregate: true
   is_commutative: true
 }
@@ -404,22 +370,18 @@ op {
   name: "AddSparseToTensorsMap"
   input_arg {
     name: "sparse_indices"
-    description: "2-D.  The `indices` of the `SparseTensor`."
     type: DT_INT64
   }
   input_arg {
     name: "sparse_values"
-    description: "1-D.  The `values` of the `SparseTensor`."
     type_attr: "T"
   }
   input_arg {
     name: "sparse_shape"
-    description: "1-D.  The `shape` of the `SparseTensor`."
     type: DT_INT64
   }
   output_arg {
     name: "sparse_handle"
-    description: "0-D.  The handle of the `SparseTensor` now stored in the\n`SparseTensorsMap`."
     type: DT_INT64
   }
   attr {
@@ -432,7 +394,6 @@ op {
     default_value {
       s: ""
     }
-    description: "The container name for the `SparseTensorsMap` created by this op."
   }
   attr {
     name: "shared_name"
@@ -440,10 +401,7 @@ op {
     default_value {
       s: ""
     }
-    description: "The shared name for the `SparseTensorsMap` created by this op.\nIf blank, the new Operation\'s unique name is used."
   }
-  summary: "Add a `SparseTensor` to a `SparseTensorsMap` return its handle."
-  description: "A `SparseTensor` is represented by three tensors: `sparse_indices`,\n`sparse_values`, and `sparse_shape`.\n\nThis operator takes the given `SparseTensor` and adds it to a container\nobject (a `SparseTensorsMap`).  A unique key within this container is generated\nin the form of an `int64`, and this is the value that is returned.\n\nThe `SparseTensor` can then be read out as part of a minibatch by passing\nthe key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure\nthe correct `SparseTensorsMap` is accessed, ensure that the same\n`container` and `shared_name` are passed to that Op.  If no `shared_name`\nis provided here, instead use the *name* of the Operation created by calling\n`AddSparseToTensorsMap` as the `shared_name` passed to\n`TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated."
   is_stateful: true
 }
 op {
@@ -466,6 +424,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -478,8 +437,6 @@ op {
       }
     }
   }
-  summary: "Returns x + y element-wise."
-  description: "*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
   is_aggregate: true
   is_commutative: true
 }
@@ -520,7 +477,6 @@ op {
       }
     }
   }
-  summary: "Deprecated. Disallowed in GraphDef version >= 2."
   deprecation {
     version: 2
     explanation: "Use AdjustContrastv2 instead"
@@ -530,77 +486,59 @@ op {
   name: "AdjustContrastv2"
   input_arg {
     name: "images"
-    description: "Images to adjust.  At least 3-D."
     type: DT_FLOAT
   }
   input_arg {
     name: "contrast_factor"
-    description: "A float multiplier for adjusting contrast."
     type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    description: "The contrast-adjusted image or images."
     type: DT_FLOAT
   }
-  summary: "Adjust the contrast of one or more images."
-  description: "`images` is a tensor of at least 3 dimensions.  The last 3 dimensions are\ninterpreted as `[height, width, channels]`.  The other dimensions only\nrepresent a collection of images, such as `[batch, height, width, channels].`\n\nContrast is adjusted independently for each channel of each image.\n\nFor each channel, the Op first computes the mean of the image pixels in the\nchannel and then adjusts each component of each pixel to\n`(x - mean) * contrast_factor + mean`."
 }
 op {
   name: "AdjustHue"
   input_arg {
     name: "images"
-    description: "Images to adjust.  At least 3-D."
     type: DT_FLOAT
   }
   input_arg {
     name: "delta"
-    description: "A float delta to add to the hue."
     type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    description: "The hue-adjusted image or images."
     type: DT_FLOAT
   }
-  summary: "Adjust the hue of one or more images."
-  description: "`images` is a tensor of at least 3 dimensions.  The last dimension is\ninterpretted as channels, and must be three.\n\nThe input image is considered in the RGB colorspace. Conceptually, the RGB\ncolors are first mapped into HSV. A delta is then applied all the hue values,\nand then remapped back to RGB colorspace."
 }
 op {
   name: "AdjustSaturation"
   input_arg {
     name: "images"
-    description: "Images to adjust.  At least 3-D."
     type: DT_FLOAT
   }
   input_arg {
     name: "scale"
-    description: "A float scale to add to the saturation."
     type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    description: "The hue-adjusted image or images."
     type: DT_FLOAT
   }
-  summary: "Adjust the saturation of one or more images."
-  description: "`images` is a tensor of at least 3 dimensions.  The last dimension is\ninterpretted as channels, and must be three.\n\nThe input image is considered in the RGB colorspace. Conceptually, the RGB\ncolors are first mapped into HSV. A scale is then applied all the saturation\nvalues, and then remapped back to RGB colorspace."
 }
 op {
   name: "All"
   input_arg {
     name: "input"
-    description: "The tensor to reduce."
     type: DT_BOOL
   }
   input_arg {
     name: "reduction_indices"
-    description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`."
     type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    description: "The reduced tensor."
     type: DT_BOOL
   }
   attr {
@@ -609,7 +547,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true, retain reduced dimensions with length 1."
   }
   attr {
     name: "Tidx"
@@ -624,49 +561,40 @@ op {
       }
     }
   }
-  summary: "Computes the \"logical and\" of elements across dimensions of a tensor."
-  description: "Reduces `input` along the dimensions given in `reduction_indices`. Unless\n`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in\n`reduction_indices`. If `keep_dims` is true, the reduced dimensions are\nretained with length 1."
 }
 op {
   name: "AllCandidateSampler"
   input_arg {
     name: "true_classes"
-    description: "A batch_size * num_true matrix, in which each row contains the\nIDs of the num_true target_classes in the corresponding original label."
     type: DT_INT64
   }
   output_arg {
     name: "sampled_candidates"
-    description: "A vector of length num_sampled, in which each element is\nthe ID of a sampled candidate."
     type: DT_INT64
   }
   output_arg {
     name: "true_expected_count"
-    description: "A batch_size * num_true matrix, representing\nthe number of times each candidate is expected to occur in a batch\nof sampled candidates. If unique=true, then this is a probability."
     type: DT_FLOAT
   }
   output_arg {
     name: "sampled_expected_count"
-    description: "A vector of length num_sampled, for each sampled\ncandidate representing the number of times the candidate is expected\nto occur in a batch of sampled candidates.  If unique=true, then this is a\nprobability."
     type: DT_FLOAT
   }
   attr {
     name: "num_true"
     type: "int"
-    description: "Number of true labels per context."
     has_minimum: true
     minimum: 1
   }
   attr {
     name: "num_sampled"
     type: "int"
-    description: "Number of candidates to produce."
     has_minimum: true
     minimum: 1
   }
   attr {
     name: "unique"
     type: "bool"
-    description: "If unique is true, we sample with rejection, so that all sampled\ncandidates in a batch are unique. This requires some approximation to\nestimate the post-rejection sampling probabilities."
   }
   attr {
     name: "seed"
@@ -674,7 +602,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either seed or seed2 are set to be non-zero, the random number\ngenerator is seeded by the given seed.  Otherwise, it is seeded by a\nrandom seed."
   }
   attr {
     name: "seed2"
@@ -682,10 +609,7 @@ op {
     default_value {
       i: 0
     }
-    description: "An second seed to avoid seed collision."
   }
-  summary: "Generates labels for candidate sampling with a learned unigram distribution."
-  description: "See explanations of candidate sampling and the data formats at\ngo/candidate-sampling.\n\nFor each batch, this op picks a single set of sampled candidate labels.\n\nThe advantages of sampling candidates per-batch are simplicity and the\npossibility of efficient dense matrix multiplication. The disadvantage is that\nthe sampled candidates must be chosen independently of the context and of the\ntrue labels."
   is_stateful: true
 }
 op {
@@ -724,24 +648,19 @@ op {
       }
     }
   }
-  summary: "Returns the argument of a complex number."
-  description: "Given a tensor `input` of complex numbers, this operation returns a tensor of\ntype `float` that is the argument of each element in `input`. All elements in\n`input` must be complex numbers of the form \\\\(a + bj\\\\), where *a*\nis the real part and *b* is the imaginary part.\n\nThe argument returned by this operation is of the form \\\\(atan2(b, a)\\\\).\n\nFor example:\n\n```\n# tensor \'input\' is [-2.25 + 4.75j, 3.25 + 5.75j]\ntf.angle(input) ==> [2.0132, 1.056]\n```\n\n@compatibility(numpy)\nEquivalent to np.angle.\n@end_compatibility"
 }
 op {
   name: "Any"
   input_arg {
     name: "input"
-    description: "The tensor to reduce."
     type: DT_BOOL
   }
   input_arg {
     name: "reduction_indices"
-    description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`."
     type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    description: "The reduced tensor."
     type: DT_BOOL
   }
   attr {
@@ -750,7 +669,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true, retain reduced dimensions with length 1."
   }
   attr {
     name: "Tidx"
@@ -765,52 +683,42 @@ op {
       }
     }
   }
-  summary: "Computes the \"logical or\" of elements across dimensions of a tensor."
-  description: "Reduces `input` along the dimensions given in `reduction_indices`. Unless\n`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in\n`reduction_indices`. If `keep_dims` is true, the reduced dimensions are\nretained with length 1."
 }
 op {
   name: "ApplyAdadelta"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "accum_update"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "rho"
-    description: "Decay factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "epsilon"
-    description: "Constant factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -821,17 +729,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -844,38 +753,30 @@ op {
     default_value {
       b: false
     }
-    description: "If True, updating of the var, accum and update_accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Update \'*var\' according to the adadelta scheme."
-  description: "accum = rho() * accum + (1 - rho()) * grad.square();\nupdate = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;\nupdate_accum = rho() * update_accum + (1 - rho()) * update.square();\nvar -= update;"
 }
 op {
   name: "ApplyAdagrad"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -886,17 +787,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -909,59 +811,47 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update \'*var\' according to the adagrad scheme."
-  description: "accum += grad * grad\nvar -= lr * grad * (1 / sqrt(accum))"
 }
 op {
   name: "ApplyAdagradDA"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "gradient_accumulator"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "gradient_squared_accumulator"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l1"
-    description: "L1 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l2"
-    description: "L2 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "global_step"
-    description: "Training step number. Must be a scalar."
     type: DT_INT64
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -972,17 +862,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -995,68 +886,55 @@ op {
     default_value {
       b: false
     }
-    description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Update \'*var\' according to the proximal adagrad scheme."
 }
 op {
   name: "ApplyAdam"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "m"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "v"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "beta1_power"
-    description: "Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "beta2_power"
-    description: "Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "beta1"
-    description: "Momentum factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "beta2"
-    description: "Momentum factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "epsilon"
-    description: "Ridge term. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -1067,17 +945,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -1090,7 +969,6 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var, m, and v tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
   attr {
     name: "use_nesterov"
@@ -1098,53 +976,42 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, uses the nesterov update."
   }
-  summary: "Update \'*var\' according to the Adam algorithm."
-  description: "lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)\nm_t <- beta1 * m_{t-1} + (1 - beta1) * g_t\nv_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t\nvariable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)"
 }
 op {
   name: "ApplyAddSign"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "m"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "alpha"
-    description: "Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "sign_decay"
-    description: "Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "beta"
-    description: "Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -1155,17 +1022,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -1178,45 +1046,36 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var and m tensors is\nprotected by a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update \'*var\' according to the AddSign update."
-  description: "m_t <- beta1 * m_{t-1} + (1 - beta1) * g\nupdate <- (alpha + sign_decay * sign(g) *sign(m)) * g\nvariable <- variable - lr_t * update"
 }
 op {
   name: "ApplyCenteredRMSProp"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "mg"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "ms"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "mom"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "rho"
-    description: "Decay rate. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
@@ -1225,17 +1084,14 @@ op {
   }
   input_arg {
     name: "epsilon"
-    description: "Ridge term. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -1246,17 +1102,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -1269,59 +1126,47 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var, mg, ms, and mom tensors is\nprotected by a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update \'*var\' according to the centered RMSProp algorithm."
-  description: "The centered RMSProp algorithm uses an estimate of the centered second moment\n(i.e., the variance) for normalization, as opposed to regular RMSProp, which\nuses the (uncentered) second moment. This often helps with training, but is\nslightly more expensive in terms of computation and memory.\n\nNote that in dense implementation of this algorithm, mg, ms, and mom will\nupdate even if the grad is zero, but in this sparse implementation, mg, ms,\nand mom will not update in iterations during which the grad is zero.\n\nmean_square = decay * mean_square + (1-decay) * gradient ** 2\nmean_grad = decay * mean_grad + (1-decay) * gradient\n\nDelta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)\n\nmg <- rho * mg_{t-1} + (1-rho) * grad\nms <- rho * ms_{t-1} + (1-rho) * grad * grad\nmom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)\nvar <- var - mom"
 }
 op {
   name: "ApplyFtrl"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "linear"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l1"
-    description: "L1 regulariation. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l2"
-    description: "L2 regulariation. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "lr_power"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -1332,17 +1177,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -1355,49 +1201,39 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update \'*var\' according to the Ftrl-proximal scheme."
-  description: "accum_new = accum + grad * grad\nlinear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new"
 }
 op {
   name: "ApplyFtrlV2"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "linear"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l1"
-    description: "L1 regulariation. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l2"
-    description: "L2 shrinkage regulariation. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
@@ -1406,12 +1242,10 @@ op {
   }
   input_arg {
     name: "lr_power"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -1422,17 +1256,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -1445,32 +1280,25 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update \'*var\' according to the Ftrl-proximal scheme."
-  description: "grad_with_shrinkage = grad + 2 * l2_shrinkage * var\naccum_new = accum + grad_with_shrinkage * grad_with_shrinkage\nlinear += grad_with_shrinkage +\n    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new"
 }
 op {
   name: "ApplyGradientDescent"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "alpha"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "delta"
-    description: "The change."
     type_attr: "T"
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -1481,17 +1309,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -1504,42 +1333,34 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Update \'*var\' by subtracting \'alpha\' * \'delta\' from it."
 }
 op {
   name: "ApplyMomentum"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "momentum"
-    description: "Momentum. Must be a scalar."
     type_attr: "T"
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -1550,17 +1371,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -1573,7 +1395,6 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
   attr {
     name: "use_nesterov"
@@ -1581,53 +1402,42 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, the tensor passed to compute grad will be\nvar - lr * momentum * accum, so in the end, the var you get is actually\nvar - lr * momentum * accum."
   }
-  summary: "Update \'*var\' according to the momentum scheme. Set use_nesterov = True if you"
-  description: "want to use Nesterov momentum.\n\naccum = accum * momentum + grad\nvar -= lr * accum"
 }
 op {
   name: "ApplyPowerSign"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "m"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "logbase"
-    description: "Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "sign_decay"
-    description: "Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "beta"
-    description: "Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -1638,17 +1448,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -1661,48 +1472,38 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var and m tensors is\nprotected by a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update \'*var\' according to the AddSign update."
-  description: "m_t <- beta1 * m_{t-1} + (1 - beta1) * g\nupdate <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g\nvariable <- variable - lr_t * update"
 }
 op {
   name: "ApplyProximalAdagrad"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l1"
-    description: "L1 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l2"
-    description: "L2 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -1713,17 +1514,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -1736,42 +1538,33 @@ op {
     default_value {
       b: false
     }
-    description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Update \'*var\' and \'*accum\' according to FOBOS with Adagrad learning rate."
-  description: "accum += grad * grad\nprox_v = var - lr * grad * (1 / sqrt(accum))\nvar = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}"
 }
 op {
   name: "ApplyProximalGradientDescent"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "alpha"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l1"
-    description: "L1 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l2"
-    description: "L2 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "delta"
-    description: "The change."
     type_attr: "T"
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -1782,17 +1575,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -1805,39 +1599,31 @@ op {
     default_value {
       b: false
     }
-    description: "If True, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Update \'*var\' as FOBOS algorithm with fixed learning rate."
-  description: "prox_v = var - alpha * delta\nvar = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}"
 }
 op {
   name: "ApplyRMSProp"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "ms"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "mom"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "rho"
-    description: "Decay rate. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
@@ -1846,17 +1632,14 @@ op {
   }
   input_arg {
     name: "epsilon"
-    description: "Ridge term. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -1867,17 +1650,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -1890,10 +1674,7 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var, ms, and mom tensors is protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update \'*var\' according to the RMSProp algorithm."
-  description: "Note that in dense implementation of this algorithm, ms and mom will\nupdate even if the grad is zero, but in this sparse implementation, ms\nand mom will not update in iterations during which the grad is zero.\n\nmean_square = decay * mean_square + (1-decay) * gradient ** 2\nDelta = learning_rate * gradient / sqrt(mean_square + epsilon)\n\nms <- rho * ms_{t-1} + (1-rho) * grad * grad\nmom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)\nvar <- var - mom"
 }
 op {
   name: "ApproximateEqual"
@@ -1916,17 +1697,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -1940,7 +1722,6 @@ op {
       f: 1e-05
     }
   }
-  summary: "Returns the truth value of abs(x-y) < tolerance element-wise."
   is_commutative: true
 }
 op {
@@ -1951,7 +1732,6 @@ op {
   }
   input_arg {
     name: "dimension"
-    description: "int32 or int64, must be in the range `[-rank(input), rank(input))`.\nDescribes which dimension of the input Tensor to reduce across. For vectors,\nuse dimension = 0."
     type_attr: "Tidx"
   }
   output_arg {
@@ -1965,17 +1745,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -2008,8 +1789,6 @@ op {
       }
     }
   }
-  summary: "Returns the index with the largest value across dimensions of a tensor."
-  description: "Note that in case of ties the identity of the return value is not guaranteed."
 }
 op {
   name: "ArgMin"
@@ -2019,7 +1798,6 @@ op {
   }
   input_arg {
     name: "dimension"
-    description: "int32 or int64, must be in the range `[-rank(input), rank(input))`.\nDescribes which dimension of the input Tensor to reduce across. For vectors,\nuse dimension = 0."
     type_attr: "Tidx"
   }
   output_arg {
@@ -2033,17 +1811,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -2076,8 +1855,6 @@ op {
       }
     }
   }
-  summary: "Returns the index with the smallest value across dimensions of a tensor."
-  description: "Note that in case of ties the identity of the return value is not guaranteed."
 }
 op {
   name: "AsString"
@@ -2110,7 +1887,6 @@ op {
     default_value {
       i: -1
     }
-    description: "The post-decimal precision to use for floating point numbers.\nOnly used if precision > -1."
   }
   attr {
     name: "scientific"
@@ -2118,7 +1894,6 @@ op {
     default_value {
       b: false
     }
-    description: "Use scientific notation for floating point numbers."
   }
   attr {
     name: "shortest"
@@ -2126,7 +1901,6 @@ op {
     default_value {
       b: false
     }
-    description: "Use shortest representation (either scientific or standard) for\nfloating point numbers."
   }
   attr {
     name: "width"
@@ -2134,7 +1908,6 @@ op {
     default_value {
       i: -1
     }
-    description: "Pad pre-decimal numbers to this width.\nApplies to both floating point and integer numbers.\nOnly used if width > -1."
   }
   attr {
     name: "fill"
@@ -2142,10 +1915,7 @@ op {
     default_value {
       s: ""
     }
-    description: "The value to pad if width > -1.  If empty, pads with spaces.\nAnother typical value is \'0\'.  String cannot be longer than 1 character."
   }
-  summary: "Converts each entry in the given tensor to strings.  Supports many numeric"
-  description: "types and boolean."
 }
 op {
   name: "Asin"
@@ -2163,6 +1933,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -2172,7 +1943,6 @@ op {
       }
     }
   }
-  summary: "Computes asin of x element-wise."
 }
 op {
   name: "Asinh"
@@ -2190,6 +1960,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -2197,18 +1968,15 @@ op {
       }
     }
   }
-  summary: "Computes inverse hyperbolic sine of x element-wise."
 }
 op {
   name: "Assert"
   input_arg {
     name: "condition"
-    description: "The condition to evaluate."
     type: DT_BOOL
   }
   input_arg {
     name: "data"
-    description: "The tensors to print out when condition is false."
     type_list_attr: "T"
   }
   attr {
@@ -2223,28 +1991,22 @@ op {
     default_value {
       i: 3
     }
-    description: "Print this many entries of each tensor."
   }
-  summary: "Asserts that the given condition is true."
-  description: "If `condition` evaluates to false, print the list of tensors in `data`.\n`summarize` determines how many entries of the tensors to print."
   is_stateful: true
 }
 op {
   name: "Assign"
   input_arg {
     name: "ref"
-    description: "Should be from a `Variable` node. May be uninitialized."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "value"
-    description: "The value to be assigned to the variable."
     type_attr: "T"
   }
   output_arg {
     name: "output_ref"
-    description: "= Same as \"ref\".  Returned as a convenience for operations that want\nto use the new value after the variable has been reset."
     type_attr: "T"
     is_ref: true
   }
@@ -2258,7 +2020,6 @@ op {
     default_value {
       b: true
     }
-    description: "If true, the operation will validate that the shape\nof \'value\' matches the shape of the Tensor being assigned to.  If false,\n\'ref\' will take on the shape of \'value\'."
   }
   attr {
     name: "use_locking"
@@ -2266,28 +2027,22 @@ op {
     default_value {
       b: true
     }
-    description: "If True, the assignment will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Update \'ref\' by assigning \'value\' to it."
-  description: "This operation outputs \"ref\" after the assignment is done.\nThis makes it easier to chain operations that need to use the reset value."
   allows_uninitialized_input: true
 }
 op {
   name: "AssignAdd"
   input_arg {
     name: "ref"
-    description: "Should be from a `Variable` node."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "value"
-    description: "The value to be added to the variable."
     type_attr: "T"
   }
   output_arg {
     name: "output_ref"
-    description: "= Same as \"ref\".  Returned as a convenience for operations that want\nto use the new value after the variable has been updated."
     type_attr: "T"
     is_ref: true
   }
@@ -2298,17 +2053,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -2321,48 +2077,37 @@ op {
     default_value {
       b: false
     }
-    description: "If True, the addition will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Update \'ref\' by adding \'value\' to it."
-  description: "This operation outputs \"ref\" after the update is done.\nThis makes it easier to chain operations that need to use the reset value."
 }
 op {
   name: "AssignAddVariableOp"
   input_arg {
     name: "resource"
-    description: "handle to the resource in which to store the variable."
     type: DT_RESOURCE
   }
   input_arg {
     name: "value"
-    description: "the value by which the variable will be incremented."
     type_attr: "dtype"
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "the dtype of the value."
   }
-  summary: "Adds a value to the current value of a variable."
-  description: "Any ReadVariableOp which depends directly or indirectly on this assign is\nguaranteed to see the incremented value or a subsequent newer one.\n\nOutputs the incremented value, which can be used to totally order the\nincrements to this variable."
   is_stateful: true
 }
 op {
   name: "AssignSub"
   input_arg {
     name: "ref"
-    description: "Should be from a `Variable` node."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "value"
-    description: "The value to be subtracted to the variable."
     type_attr: "T"
   }
   output_arg {
     name: "output_ref"
-    description: "= Same as \"ref\".  Returned as a convenience for operations that want\nto use the new value after the variable has been updated."
     type_attr: "T"
     is_ref: true
   }
@@ -2373,17 +2118,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -2396,51 +2142,38 @@ op {
     default_value {
       b: false
     }
-    description: "If True, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Update \'ref\' by subtracting \'value\' from it."
-  description: "This operation outputs \"ref\" after the update is done.\nThis makes it easier to chain operations that need to use the reset value."
 }
 op {
   name: "AssignSubVariableOp"
   input_arg {
     name: "resource"
-    description: "handle to the resource in which to store the variable."
     type: DT_RESOURCE
   }
   input_arg {
     name: "value"
-    description: "the value by which the variable will be incremented."
     type_attr: "dtype"
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "the dtype of the value."
   }
-  summary: "Subtracts a value from the current value of a variable."
-  description: "Any ReadVariableOp which depends directly or indirectly on this assign is\nguaranteed to see the incremented value or a subsequent newer one.\n\nOutputs the incremented value, which can be used to totally order the\nincrements to this variable."
   is_stateful: true
 }
 op {
   name: "AssignVariableOp"
   input_arg {
     name: "resource"
-    description: "handle to the resource in which to store the variable."
     type: DT_RESOURCE
   }
   input_arg {
     name: "value"
-    description: "the value to set the new tensor to use."
     type_attr: "dtype"
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "the dtype of the value."
   }
-  summary: "Assigns a new value to a variable."
-  description: "Any ReadVariableOp with a control dependency on this op is guaranteed to return\nthis value or a subsequent newer value of the variable."
   is_stateful: true
 }
 op {
@@ -2459,6 +2192,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -2468,7 +2202,6 @@ op {
       }
     }
   }
-  summary: "Computes atan of x element-wise."
 }
 op {
   name: "Atan2"
@@ -2489,13 +2222,12 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Computes arctangent of `y/x` element-wise, respecting signs of the arguments."
-  description: "This is the angle \\( \\theta \\in [-\\pi, \\pi] \\) such that\n\\[ x = r \\cos(\\theta) \\]\nand\n\\[ y = r \\sin(\\theta) \\]\nwhere \\(r = \\sqrt(x^2 + y^2) \\)."
 }
 op {
   name: "Atanh"
@@ -2513,6 +2245,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -2520,29 +2253,24 @@ op {
       }
     }
   }
-  summary: "Computes inverse hyperbolic tangent of x element-wise."
 }
 op {
   name: "AudioSpectrogram"
   input_arg {
     name: "input"
-    description: "Float representation of audio data."
     type: DT_FLOAT
   }
   output_arg {
     name: "spectrogram"
-    description: "3D representation of the audio frequencies as an image."
     type: DT_FLOAT
   }
   attr {
     name: "window_size"
     type: "int"
-    description: "How wide the input window is in samples. For the highest efficiency\nthis should be a power of two, but other values are accepted."
   }
   attr {
     name: "stride"
     type: "int"
-    description: "How widely apart the center of adjacent sample windows should be."
   }
   attr {
     name: "magnitude_squared"
@@ -2550,32 +2278,25 @@ op {
     default_value {
       b: false
     }
-    description: "Whether to return the squared magnitude or just the\nmagnitude. Using squared magnitude can avoid extra calculations."
   }
-  summary: "Produces a visualization of audio data over time."
-  description: "Spectrograms are a standard way of representing audio information as a series of\nslices of frequency information, one slice for each window of time. By joining\nthese together into a sequence, they form a distinctive fingerprint of the sound\nover time.\n\nThis op expects to receive audio data as an input, stored as floats in the range\n-1 to 1, together with a window width in samples, and a stride specifying how\nfar to move the window between slices. From this it generates a three\ndimensional output. The lowest dimension has an amplitude value for each\nfrequency during that time slice. The next dimension is time, with successive\nfrequency slices. The final dimension is for the channels in the input, so a\nstereo audio input would have two here for example.\n\nThis means the layout when converted and saved as an image is rotated 90 degrees\nclockwise from a typical spectrogram. Time is descending down the Y axis, and\nthe frequency decreases from left to right.\n\nEach value in the result represents the square root of the sum of the real and\nimaginary parts of an FFT on the current window of samples. In this way, the\nlowest dimension represents the power of each frequency in the current window,\nand adjacent windows are concatenated in the next dimension.\n\nTo get a more intuitive and visual look at what this operation does, you can run\ntensorflow/examples/wav_to_spectrogram to read in an audio file and save out the\nresulting spectrogram as a PNG image."
 }
 op {
   name: "AudioSummary"
   input_arg {
     name: "tag"
-    description: "Scalar. Used to build the `tag` attribute of the summary values."
     type: DT_STRING
   }
   input_arg {
     name: "tensor"
-    description: "2-D of shape `[batch_size, frames]`."
     type: DT_FLOAT
   }
   output_arg {
     name: "summary"
-    description: "Scalar. Serialized `Summary` protocol buffer."
     type: DT_STRING
   }
   attr {
     name: "sample_rate"
     type: "float"
-    description: "The sample rate of the signal in hertz."
   }
   attr {
     name: "max_outputs"
@@ -2583,12 +2304,9 @@ op {
     default_value {
       i: 3
     }
-    description: "Max number of batch elements to generate audio for."
     has_minimum: true
     minimum: 1
   }
-  summary: "Outputs a `Summary` protocol buffer with audio."
-  description: "The summary has up to `max_outputs` summary values containing audio. The\naudio is built from `tensor` which must be 3-D with shape `[batch_size,\nframes, channels]` or 2-D with shape `[batch_size, frames]`. The values are\nassumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.\n\nThe `tag` argument is a scalar `Tensor` of type `string`.  It is used to\nbuild the `tag` of the summary values:\n\n*  If `max_outputs` is 1, the summary value tag is \'*tag*/audio\'.\n*  If `max_outputs` is greater than 1, the summary value tags are\n   generated sequentially as \'*tag*/audio/0\', \'*tag*/audio/1\', etc."
   deprecation {
     version: 15
     explanation: "Use AudioSummaryV2."
@@ -2598,22 +2316,18 @@ op {
   name: "AudioSummaryV2"
   input_arg {
     name: "tag"
-    description: "Scalar. Used to build the `tag` attribute of the summary values."
     type: DT_STRING
   }
   input_arg {
     name: "tensor"
-    description: "2-D of shape `[batch_size, frames]`."
     type: DT_FLOAT
   }
   input_arg {
     name: "sample_rate"
-    description: "The sample rate of the signal in hertz."
     type: DT_FLOAT
   }
   output_arg {
     name: "summary"
-    description: "Scalar. Serialized `Summary` protocol buffer."
     type: DT_STRING
   }
   attr {
@@ -2622,43 +2336,35 @@ op {
     default_value {
       i: 3
     }
-    description: "Max number of batch elements to generate audio for."
     has_minimum: true
     minimum: 1
   }
-  summary: "Outputs a `Summary` protocol buffer with audio."
-  description: "The summary has up to `max_outputs` summary values containing audio. The\naudio is built from `tensor` which must be 3-D with shape `[batch_size,\nframes, channels]` or 2-D with shape `[batch_size, frames]`. The values are\nassumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.\n\nThe `tag` argument is a scalar `Tensor` of type `string`.  It is used to\nbuild the `tag` of the summary values:\n\n*  If `max_outputs` is 1, the summary value tag is \'*tag*/audio\'.\n*  If `max_outputs` is greater than 1, the summary value tags are\n   generated sequentially as \'*tag*/audio/0\', \'*tag*/audio/1\', etc."
 }
 op {
   name: "AvgPool"
   input_arg {
     name: "value"
-    description: "4-D with shape `[batch, height, width, channels]`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "The average pooled output tensor."
     type_attr: "T"
   }
   attr {
     name: "ksize"
     type: "list(int)"
-    description: "The size of the sliding window for each dimension of `value`."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "strides"
     type: "list(int)"
-    description: "The stride of the sliding window for each dimension of `value`."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -2672,7 +2378,6 @@ op {
     default_value {
       s: "NHWC"
     }
-    description: "Specify the data format of the input and output data. With the\ndefault format \"NHWC\", the data is stored in the order of:\n    [batch, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCHW\", the data storage order of:\n    [batch, in_channels, in_height, in_width]."
     allowed_values {
       list {
         s: "NHWC"
@@ -2686,44 +2391,38 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Performs average pooling on the input."
-  description: "Each entry in `output` is the mean of the corresponding size `ksize`\nwindow in `value`."
 }
 op {
   name: "AvgPool3D"
   input_arg {
     name: "input"
-    description: "Shape `[batch, depth, rows, cols, channels]` tensor to pool over."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "The average pooled output tensor."
     type_attr: "T"
   }
   attr {
     name: "ksize"
     type: "list(int)"
-    description: "1-D tensor of length 5. The size of the window for each dimension of\nthe input tensor. Must have `ksize[0] = ksize[4] = 1`."
     has_minimum: true
     minimum: 5
   }
   attr {
     name: "strides"
     type: "list(int)"
-    description: "1-D tensor of length 5. The stride of the sliding window for each\ndimension of `input`. Must have `strides[0] = strides[4] = 1`."
     has_minimum: true
     minimum: 5
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -2737,7 +2436,6 @@ op {
     default_value {
       s: "NDHWC"
     }
-    description: "The data format of the input and output data. With the\ndefault format \"NDHWC\", the data is stored in the order of:\n    [batch, in_depth, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCDHW\", the data storage order is:\n    [batch, in_channels, in_depth, in_height, in_width]."
     allowed_values {
       list {
         s: "NDHWC"
@@ -2750,48 +2448,42 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Performs 3D average pooling on the input."
 }
 op {
   name: "AvgPool3DGrad"
   input_arg {
     name: "orig_input_shape"
-    description: "The original input dimensions."
     type: DT_INT32
   }
   input_arg {
     name: "grad"
-    description: "Output backprop of shape `[batch, depth, rows, cols, channels]`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "The backprop for input."
     type_attr: "T"
   }
   attr {
     name: "ksize"
     type: "list(int)"
-    description: "1-D tensor of length 5. The size of the window for each dimension of\nthe input tensor. Must have `ksize[0] = ksize[4] = 1`."
     has_minimum: true
     minimum: 5
   }
   attr {
     name: "strides"
     type: "list(int)"
-    description: "1-D tensor of length 5. The stride of the sliding window for each\ndimension of `input`. Must have `strides[0] = strides[4] = 1`."
     has_minimum: true
     minimum: 5
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -2805,7 +2497,6 @@ op {
     default_value {
       s: "NDHWC"
     }
-    description: "The data format of the input and output data. With the\ndefault format \"NDHWC\", the data is stored in the order of:\n    [batch, in_depth, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCDHW\", the data storage order is:\n    [batch, in_channels, in_depth, in_height, in_width]."
     allowed_values {
       list {
         s: "NDHWC"
@@ -2818,48 +2509,42 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Computes gradients of average pooling function."
 }
 op {
   name: "AvgPoolGrad"
   input_arg {
     name: "orig_input_shape"
-    description: "1-D.  Shape of the original input to `avg_pool`."
     type: DT_INT32
   }
   input_arg {
     name: "grad"
-    description: "4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.\nthe output of `avg_pool`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "4-D.  Gradients w.r.t. the input of `avg_pool`."
     type_attr: "T"
   }
   attr {
     name: "ksize"
     type: "list(int)"
-    description: "The size of the sliding window for each dimension of the input."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "strides"
     type: "list(int)"
-    description: "The stride of the sliding window for each dimension of the input."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -2873,7 +2558,6 @@ op {
     default_value {
       s: "NHWC"
     }
-    description: "Specify the data format of the input and output data. With the\ndefault format \"NHWC\", the data is stored in the order of:\n    [batch, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCHW\", the data storage order of:\n    [batch, in_channels, in_height, in_width]."
     allowed_values {
       list {
         s: "NHWC"
@@ -2887,25 +2571,23 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Computes gradients of the average pooling function."
 }
 op {
   name: "Barrier"
   output_arg {
     name: "handle"
-    description: "The handle to the barrier."
     type: DT_STRING
     is_ref: true
   }
   attr {
     name: "component_types"
     type: "list(type)"
-    description: "The type of each component in a value."
     has_minimum: true
     minimum: 1
   }
@@ -2916,7 +2598,6 @@ op {
       list {
       }
     }
-    description: "The shape of each component in a value. Each shape must be 1 in the\nfirst dimension. The length of this attr must be the same as the length of\ncomponent_types."
     has_minimum: true
   }
   attr {
@@ -2925,7 +2606,6 @@ op {
     default_value {
       i: -1
     }
-    description: "The capacity of the barrier.  The default capacity is MAX_INT32,\nwhich is the largest capacity of the underlying queue."
   }
   attr {
     name: "container"
@@ -2933,7 +2613,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this barrier is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -2941,17 +2620,13 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this barrier will be shared under the given name\nacross multiple sessions."
   }
-  summary: "Defines a barrier that persists across different graph executions."
-  description: "A barrier represents a key-value map, where each key is a string, and\neach value is a tuple of tensors.\n\nAt runtime, the barrier contains \'complete\' and \'incomplete\'\nelements. A complete element has defined tensors for all components of\nits value tuple, and may be accessed using BarrierTakeMany. An\nincomplete element has some undefined components in its value tuple,\nand may be updated using BarrierInsertMany."
   is_stateful: true
 }
 op {
   name: "BarrierClose"
   input_arg {
     name: "handle"
-    description: "The handle to a barrier."
     type: DT_STRING
     is_ref: true
   }
@@ -2961,42 +2636,33 @@ op {
     default_value {
       b: false
     }
-    description: "If true, all pending enqueue requests that are\nblocked on the barrier\'s queue will be canceled. InsertMany will fail, even\nif no new key is introduced."
   }
-  summary: "Closes the given barrier."
-  description: "This operation signals that no more new elements will be inserted in the\ngiven barrier. Subsequent InsertMany that try to introduce a new key will fail.\nSubsequent InsertMany operations that just add missing components to already\nexisting elements will continue to succeed. Subsequent TakeMany operations will\ncontinue to succeed if sufficient completed elements remain in the barrier.\nSubsequent TakeMany operations that would block will fail immediately."
 }
 op {
   name: "BarrierIncompleteSize"
   input_arg {
     name: "handle"
-    description: "The handle to a barrier."
     type: DT_STRING
     is_ref: true
   }
   output_arg {
     name: "size"
-    description: "The number of incomplete elements (i.e. those with some of their value\ncomponents not set) in the barrier."
     type: DT_INT32
   }
-  summary: "Computes the number of incomplete elements in the given barrier."
 }
 op {
   name: "BarrierInsertMany"
   input_arg {
     name: "handle"
-    description: "The handle to a barrier."
     type: DT_STRING
     is_ref: true
   }
   input_arg {
     name: "keys"
-    description: "A one-dimensional tensor of keys, with length n."
     type: DT_STRING
   }
   input_arg {
     name: "values"
-    description: "An any-dimensional tensor of values, which are associated with the\nrespective keys. The 0th dimension must have length n."
     type_attr: "T"
   }
   attr {
@@ -3006,58 +2672,46 @@ op {
   attr {
     name: "component_index"
     type: "int"
-    description: "The component of the barrier elements that is being assigned."
   }
-  summary: "For each key, assigns the respective value to the specified component."
-  description: "If a key is not found in the barrier, this operation will create a new\nincomplete element. If a key is found in the barrier, and the element\nalready has a value at component_index, this operation will fail with\nINVALID_ARGUMENT, and leave the barrier in an undefined state."
 }
 op {
   name: "BarrierReadySize"
   input_arg {
     name: "handle"
-    description: "The handle to a barrier."
     type: DT_STRING
     is_ref: true
   }
   output_arg {
     name: "size"
-    description: "The number of complete elements (i.e. those with all of their value\ncomponents set) in the barrier."
     type: DT_INT32
   }
-  summary: "Computes the number of complete elements in the given barrier."
 }
 op {
   name: "BarrierTakeMany"
   input_arg {
     name: "handle"
-    description: "The handle to a barrier."
     type: DT_STRING
     is_ref: true
   }
   input_arg {
     name: "num_elements"
-    description: "A single-element tensor containing the number of elements to\ntake."
     type: DT_INT32
   }
   output_arg {
     name: "indices"
-    description: "A one-dimensional tensor of indices, with length num_elems.\nThese indices refer to the batch in which the values were placed into the\nbarrier (starting with MIN_LONG and increasing with each BarrierInsertMany)."
     type: DT_INT64
   }
   output_arg {
     name: "keys"
-    description: "A one-dimensional tensor of keys, with length num_elements."
     type: DT_STRING
   }
   output_arg {
     name: "values"
-    description: "One any-dimensional tensor per component in a barrier element. All\nvalues have length num_elements in the 0th dimension."
     type_list_attr: "component_types"
   }
   attr {
     name: "component_types"
     type: "list(type)"
-    description: "The type of each component in a value."
     has_minimum: true
     minimum: 1
   }
@@ -3067,7 +2721,6 @@ op {
     default_value {
       b: false
     }
-    description: "Allow to return less than num_elements items if barrier is\nalready closed."
   }
   attr {
     name: "wait_for_incomplete"
@@ -3082,10 +2735,84 @@ op {
     default_value {
       i: -1
     }
-    description: "If the queue is empty, this operation will block for up to\ntimeout_ms milliseconds.\nNote: This option is not supported yet."
   }
-  summary: "Takes the given number of completed elements from a barrier."
-  description: "This operation concatenates completed-element component tensors along\nthe 0th dimension to make a single component tensor.\n\nElements come out of the barrier when they are complete, and in the order\nin which they were placed into the barrier.  The indices output provides\ninformation about the batch in which each element was originally inserted\ninto the barrier."
+}
+op {
+  name: "Batch"
+  input_arg {
+    name: "in_tensors"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "batched_tensors"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "batch_index"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "id"
+    type: DT_INT64
+  }
+  attr {
+    name: "num_batch_threads"
+    type: "int"
+  }
+  attr {
+    name: "max_batch_size"
+    type: "int"
+  }
+  attr {
+    name: "max_enqueued_batches"
+    type: "int"
+    default_value {
+      i: 10
+    }
+  }
+  attr {
+    name: "batch_timeout_micros"
+    type: "int"
+  }
+  attr {
+    name: "allowed_batch_sizes"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "grad_timeout_micros"
+    type: "int"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "batching_queue"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
 }
 op {
   name: "BatchCholesky"
@@ -3149,7 +2876,6 @@ op {
   }
   input_arg {
     name: "batch_size"
-    description: "A scalar representing the number of elements to accumulate in a\nbatch."
     type: DT_INT64
   }
   output_arg {
@@ -3168,7 +2894,6 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset that batches `batch_size` elements from `input_dataset`."
 }
 op {
   name: "BatchFFT"
@@ -3264,17 +2989,14 @@ op {
   name: "BatchMatMul"
   input_arg {
     name: "x"
-    description: "2-D or higher with shape `[..., r_x, c_x]`."
     type_attr: "T"
   }
   input_arg {
     name: "y"
-    description: "2-D or higher with shape `[..., r_y, c_y]`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "3-D or higher with shape `[..., r_o, c_o]`"
     type_attr: "T"
   }
   attr {
@@ -3283,6 +3005,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -3297,7 +3020,6 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, adjoint the slices of `x`. Defaults to `False`."
   }
   attr {
     name: "adj_y"
@@ -3305,10 +3027,7 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, adjoint the slices of `y`. Defaults to `False`."
   }
-  summary: "Multiplies slices of two tensors in batches."
-  description: "Multiplies all slices of `Tensor` `x` and `y` (each slice can be\nviewed as an element of a batch), and arranges the individual results\nin a single output tensor of the same batch size. Each of the\nindividual slices can optionally be adjointed (to adjoint a matrix\nmeans to transpose and conjugate it) before multiplication by setting\nthe `adj_x` or `adj_y` flag to `True`, which are by default `False`.\n\nThe input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`\nand `[..., r_y, c_y]`.\n\nThe output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:\n\n    r_o = c_x if adj_x else r_x\n    c_o = r_y if adj_y else c_y\n\nIt is computed as:\n\n    output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])"
 }
 op {
   name: "BatchMatrixBandPart"
@@ -3580,27 +3299,22 @@ op {
   name: "BatchNormWithGlobalNormalization"
   input_arg {
     name: "t"
-    description: "A 4D input Tensor."
     type_attr: "T"
   }
   input_arg {
     name: "m"
-    description: "A 1D mean Tensor with size matching the last dimension of t.\nThis is the first output from tf.nn.moments,\nor a saved moving average thereof."
     type_attr: "T"
   }
   input_arg {
     name: "v"
-    description: "A 1D variance Tensor with size matching the last dimension of t.\nThis is the second output from tf.nn.moments,\nor a saved moving average thereof."
     type_attr: "T"
   }
   input_arg {
     name: "beta"
-    description: "A 1D beta Tensor with size matching the last dimension of t.\nAn offset to be added to the normalized tensor."
     type_attr: "T"
   }
   input_arg {
     name: "gamma"
-    description: "A 1D gamma Tensor with size matching the last dimension of t.\nIf \"scale_after_normalization\" is true, this tensor will be multiplied\nwith the normalized tensor."
     type_attr: "T"
   }
   output_arg {
@@ -3614,17 +3328,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -3634,15 +3349,11 @@ op {
   attr {
     name: "variance_epsilon"
     type: "float"
-    description: "A small float number to avoid dividing by 0."
   }
   attr {
     name: "scale_after_normalization"
     type: "bool"
-    description: "A bool indicating whether the resulted tensor\nneeds to be multiplied with gamma."
   }
-  summary: "Batch normalization."
-  description: "This op is deprecated. Prefer `tf.nn.batch_normalization`."
   deprecation {
     version: 9
     explanation: "Use tf.nn.batch_normalization()"
@@ -3652,52 +3363,42 @@ op {
   name: "BatchNormWithGlobalNormalizationGrad"
   input_arg {
     name: "t"
-    description: "A 4D input Tensor."
     type_attr: "T"
   }
   input_arg {
     name: "m"
-    description: "A 1D mean Tensor with size matching the last dimension of t.\nThis is the first output from tf.nn.moments,\nor a saved moving average thereof."
     type_attr: "T"
   }
   input_arg {
     name: "v"
-    description: "A 1D variance Tensor with size matching the last dimension of t.\nThis is the second output from tf.nn.moments,\nor a saved moving average thereof."
     type_attr: "T"
   }
   input_arg {
     name: "gamma"
-    description: "A 1D gamma Tensor with size matching the last dimension of t.\nIf \"scale_after_normalization\" is true, this Tensor will be multiplied\nwith the normalized Tensor."
     type_attr: "T"
   }
   input_arg {
     name: "backprop"
-    description: "4D backprop Tensor."
     type_attr: "T"
   }
   output_arg {
     name: "dx"
-    description: "4D backprop tensor for input."
     type_attr: "T"
   }
   output_arg {
     name: "dm"
-    description: "1D backprop tensor for mean."
     type_attr: "T"
   }
   output_arg {
     name: "dv"
-    description: "1D backprop tensor for variance."
     type_attr: "T"
   }
   output_arg {
     name: "db"
-    description: "1D backprop tensor for beta."
     type_attr: "T"
   }
   output_arg {
     name: "dg"
-    description: "1D backprop tensor for gamma."
     type_attr: "T"
   }
   attr {
@@ -3707,17 +3408,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -3727,15 +3429,11 @@ op {
   attr {
     name: "variance_epsilon"
     type: "float"
-    description: "A small float number to avoid dividing by 0."
   }
   attr {
     name: "scale_after_normalization"
     type: "bool"
-    description: "A bool indicating whether the resulted tensor\nneeds to be multiplied with gamma."
   }
-  summary: "Gradients for batch normalization."
-  description: "This op is deprecated. See `tf.nn.batch_normalization`."
   deprecation {
     version: 9
     explanation: "Use tf.nn.batch_normalization()"
@@ -3855,17 +3553,14 @@ op {
   name: "BatchToSpace"
   input_arg {
     name: "input"
-    description: "4-D tensor with shape\n`[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,\n  depth]`. Note that the batch size of the input tensor must be divisible by\n`block_size * block_size`."
     type_attr: "T"
   }
   input_arg {
     name: "crops"
-    description: "2-D tensor of non-negative integers with shape `[2, 2]`. It specifies\nhow many elements to crop from the intermediate result across the spatial\ndimensions as follows:\n\n    crops = [[crop_top, crop_bottom], [crop_left, crop_right]]"
     type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    description: "4-D with shape `[batch, height, width, depth]`, where:\n\n      height = height_pad - crop_top - crop_bottom\n      width = width_pad - crop_left - crop_right\n\nThe attr `block_size` must be greater than one. It indicates the block size.\n\nSome examples:\n\n(1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:\n\n```\n[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]\n```\n\nThe output tensor has shape `[1, 2, 2, 1]` and value:\n\n```\nx = [[[[1], [2]], [[3], [4]]]]\n```\n\n(2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:\n\n```\n[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]\n```\n\nThe output tensor has shape `[1, 2, 2, 3]` and value:\n\n```\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\n(3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:\n\n```\nx = [[[[1], [3]], [[9], [11]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```\n\nThe output tensor has shape `[1, 4, 4, 1]` and value:\n\n```\nx = [[[1],   [2],  [3],  [4]],\n     [[5],   [6],  [7],  [8]],\n     [[9],  [10], [11],  [12]],\n     [[13], [14], [15],  [16]]]\n```\n\n(4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:\n\n```\nx = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],\n     [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]\n```\n\nThe output tensor has shape `[2, 2, 4, 1]` and value:\n\n```\nx = [[[[1], [3]], [[5], [7]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```"
     type_attr: "T"
   }
   attr {
@@ -3891,24 +3586,19 @@ op {
       }
     }
   }
-  summary: "BatchToSpace for 4-D tensors of type T."
-  description: "This is a legacy version of the more general BatchToSpaceND.\n\nRearranges (permutes) data from batch into blocks of spatial data, followed by\ncropping. This is the reverse transformation of SpaceToBatch. More specifically,\nthis op outputs a copy of the input tensor where values from the `batch`\ndimension are moved in spatial blocks to the `height` and `width` dimensions,\nfollowed by cropping along the `height` and `width` dimensions."
 }
 op {
   name: "BatchToSpaceND"
   input_arg {
     name: "input"
-    description: "N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,\nwhere spatial_shape has M dimensions."
     type_attr: "T"
   }
   input_arg {
     name: "block_shape"
-    description: "1-D with shape `[M]`, all values must be >= 1."
     type_attr: "Tblock_shape"
   }
   input_arg {
     name: "crops"
-    description: "2-D with shape `[M, 2]`, all values must be >= 0.\n  `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input\n  dimension `i + 1`, which corresponds to spatial dimension `i`.  It is\n  required that\n  `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.\n\nThis operation is equivalent to the following steps:\n\n1. Reshape `input` to `reshaped` of shape:\n     [block_shape[0], ..., block_shape[M-1],\n      batch / prod(block_shape),\n      input_shape[1], ..., input_shape[N-1]]\n\n2. Permute dimensions of `reshaped` to produce `permuted` of shape\n     [batch / prod(block_shape),\n\n      input_shape[1], block_shape[0],\n      ...,\n      input_shape[M], block_shape[M-1],\n\n      input_shape[M+1], ..., input_shape[N-1]]\n\n3. Reshape `permuted` to produce `reshaped_permuted` of shape\n     [batch / prod(block_shape),\n\n      input_shape[1] * block_shape[0],\n      ...,\n      input_shape[M] * block_shape[M-1],\n\n      input_shape[M+1],\n      ...,\n      input_shape[N-1]]\n\n4. Crop the start and end of dimensions `[1, ..., M]` of\n   `reshaped_permuted` according to `crops` to produce the output of shape:\n     [batch / prod(block_shape),\n\n      input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],\n      ...,\n      input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],\n\n      input_shape[M+1], ..., input_shape[N-1]]\n\nSome examples:\n\n(1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [0, 0]]`:\n\n```\n[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]\n```\n\nThe output tensor has shape `[1, 2, 2, 1]` and value:\n\n```\nx = [[[[1], [2]], [[3], [4]]]]\n```\n\n(2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [0, 0]]`:\n\n```\n[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]\n```\n\nThe output tensor has shape `[1, 2, 2, 3]` and value:\n\n```\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\n(3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [0, 0]]`:\n\n```\nx = [[[[1], [3]], [[9], [11]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```\n\nThe output tensor has shape `[1, 4, 4, 1]` and value:\n\n```\nx = [[[1],   [2],  [3],  [4]],\n     [[5],   [6],  [7],  [8]],\n     [[9],  [10], [11],  [12]],\n     [[13], [14], [15],  [16]]]\n```\n\n(4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [2, 0]]`:\n\n```\nx = [[[[0], [1], [3]]], [[[0], [9], [11]]],\n     [[[0], [2], [4]]], [[[0], [10], [12]]],\n     [[[0], [5], [7]]], [[[0], [13], [15]]],\n     [[[0], [6], [8]]], [[[0], [14], [16]]]]\n```\n\nThe output tensor has shape `[2, 2, 4, 1]` and value:\n\n```\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]]],\n     [[[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```"
     type_attr: "Tcrops"
   }
   output_arg {
@@ -3945,8 +3635,6 @@ op {
       }
     }
   }
-  summary: "BatchToSpace for N-D tensors of type T."
-  description: "This operation reshapes the \"batch\" dimension 0 into `M + 1` dimensions of shape\n`block_shape + [batch]`, interleaves these blocks back into the grid defined by\nthe spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as\nthe input.  The spatial dimensions of this intermediate result are then\noptionally cropped according to `crops` to produce the output.  This is the\nreverse of SpaceToBatch.  See below for a precise description."
 }
 op {
   name: "Betainc"
@@ -3976,24 +3664,19 @@ op {
       }
     }
   }
-  summary: "Compute the regularized incomplete beta integral \\\\(I_x(a, b)\\\\)."
-  description: "The regularized incomplete beta integral is defined as:\n\n\n\\\\(I_x(a, b) = \\frac{B(x; a, b)}{B(a, b)}\\\\)\n\nwhere\n\n\n\\\\(B(x; a, b) = \\int_0^x t^{a-1} (1 - t)^{b-1} dt\\\\)\n\n\nis the incomplete beta function and \\\\(B(a, b)\\\\) is the *complete*\nbeta function."
 }
 op {
   name: "BiasAdd"
   input_arg {
     name: "value"
-    description: "Any number of dimensions."
     type_attr: "T"
   }
   input_arg {
     name: "bias"
-    description: "1-D with size the last dimension of `value`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "Broadcasted sum of `value` and `bias`."
     type_attr: "T"
   }
   attr {
@@ -4003,17 +3686,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -4026,7 +3710,6 @@ op {
     default_value {
       s: "NHWC"
     }
-    description: "Specify the data format of the input and output data. With the\ndefault format \"NHWC\", the bias tensor will be added to the last dimension\nof the value tensor.\nAlternatively, the format could be \"NCHW\", the data storage order of:\n    [batch, in_channels, in_height, in_width].\nThe tensor will be added to \"in_channels\", the third-to-the-last\n    dimension."
     allowed_values {
       list {
         s: "NHWC"
@@ -4034,19 +3717,15 @@ op {
       }
     }
   }
-  summary: "Adds `bias` to `value`."
-  description: "This is a special case of `tf.add` where `bias` is restricted to be 1-D.\nBroadcasting is supported, so `value` may have any number of dimensions."
 }
 op {
   name: "BiasAddGrad"
   input_arg {
     name: "out_backprop"
-    description: "Any number of dimensions."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "1-D with size the feature dimension of `out_backprop`."
     type_attr: "T"
   }
   attr {
@@ -4056,17 +3735,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -4079,7 +3759,6 @@ op {
     default_value {
       s: "NHWC"
     }
-    description: "Specify the data format of the input and output data. With the\ndefault format \"NHWC\", the bias tensor will be added to the last dimension\nof the value tensor.\nAlternatively, the format could be \"NCHW\", the data storage order of:\n    [batch, in_channels, in_height, in_width].\nThe tensor will be added to \"in_channels\", the third-to-the-last\n    dimension."
     allowed_values {
       list {
         s: "NHWC"
@@ -4087,24 +3766,19 @@ op {
       }
     }
   }
-  summary: "The backward operation for \"BiasAdd\" on the \"bias\" tensor."
-  description: "It accumulates all the values from out_backprop into the feature dimension.\nFor NHWC data format, the feature dimension is the last. For NCHW data format,\nthe feature dimension is the third-to-last."
 }
 op {
   name: "BiasAddV1"
   input_arg {
     name: "value"
-    description: "Any number of dimensions."
     type_attr: "T"
   }
   input_arg {
     name: "bias"
-    description: "1-D with size the last dimension of `value`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "Broadcasted sum of `value` and `bias`."
     type_attr: "T"
   }
   attr {
@@ -4114,46 +3788,41 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  summary: "Adds `bias` to `value`."
-  description: "This is a deprecated version of BiasAdd and will be soon removed.\n\nThis is a special case of `tf.add` where `bias` is restricted to be 1-D.\nBroadcasting is supported, so `value` may have any number of dimensions."
 }
 op {
   name: "Bincount"
   input_arg {
     name: "arr"
-    description: "int32 `Tensor`."
     type: DT_INT32
   }
   input_arg {
     name: "size"
-    description: "non-negative int32 scalar `Tensor`."
     type: DT_INT32
   }
   input_arg {
     name: "weights"
-    description: "is an int32, int64, float32, or float64 `Tensor` with the same\nshape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights\nequal to 1."
     type_attr: "T"
   }
   output_arg {
     name: "bins"
-    description: "1D `Tensor` with length equal to `size`. The counts or summed weights for\neach value in the range [0, size)."
     type_attr: "T"
   }
   attr {
@@ -4168,8 +3837,6 @@ op {
       }
     }
   }
-  summary: "Counts the number of occurrences of each value in an integer array."
-  description: "Outputs a vector with length `size` and the same dtype as `weights`. If\n`weights` are empty, then index `i` stores the number of times the value `i` is\ncounted in `arr`. If `weights` are non-empty, then index `i` stores the sum of\nthe value in `weights` at each index where the corresponding value in `arr` is\n`i`.\n\nValues in `arr` outside of the range [0, size) are ignored."
 }
 op {
   name: "Bitcast"
@@ -4186,6 +3853,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT64
@@ -4210,6 +3878,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT64
@@ -4229,8 +3898,6 @@ op {
       }
     }
   }
-  summary: "Bitcasts a tensor from one type to another without copying data."
-  description: "Given a tensor `input`, this operation returns a tensor that has the same buffer\ndata as `input` with datatype `type`.\n\nIf the input datatype `T` is larger than the output datatype `type` then the\nshape changes from [...] to [..., sizeof(`T`)/sizeof(`type`)].\n\nIf `T` is smaller than `type`, the operator requires that the rightmost\ndimension be equal to sizeof(`type`)/sizeof(`T`). The shape then goes from\n[..., sizeof(`type`)/sizeof(`T`)] to [...].\n\n*NOTE*: Bitcast is implemented as a low-level cast, so machines with different\nendian orderings will give different results."
 }
 op {
   name: "BitwiseAnd"
@@ -4262,8 +3929,6 @@ op {
       }
     }
   }
-  summary: "Elementwise computes the bitwise AND of `x` and `y`."
-  description: "The result will have those bits set, that are set in both `x` and `y`. The\ncomputation is performed on the underlying representations of `x` and `y`."
   is_commutative: true
 }
 op {
@@ -4296,8 +3961,6 @@ op {
       }
     }
   }
-  summary: "Elementwise computes the bitwise OR of `x` and `y`."
-  description: "The result will have those bits set, that are set in `x`, `y` or both. The\ncomputation is performed on the underlying representations of `x` and `y`."
   is_commutative: true
 }
 op {
@@ -4330,8 +3993,6 @@ op {
       }
     }
   }
-  summary: "Elementwise computes the bitwise XOR of `x` and `y`."
-  description: "The result will have those bits set, that are different in `x` and `y`. The\ncomputation is performed on the underlying representations of `x` and `y`."
   is_commutative: true
 }
 op {
@@ -4361,8 +4022,6 @@ op {
       }
     }
   }
-  summary: "Return the shape of s0 op s1 with broadcast."
-  description: "Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the\nbroadcasted shape. `s0`, `s1` and `r0` are all integer vectors."
 }
 op {
   name: "BroadcastGradientArgs"
@@ -4395,19 +4054,15 @@ op {
       }
     }
   }
-  summary: "Return the reduction indices for computing gradients of s0 op s1 with broadcast."
-  description: "This is typically used by gradient computations for a broadcasting operation."
 }
 op {
   name: "Bucketize"
   input_arg {
     name: "input"
-    description: "Any shape of Tensor contains with int or float type."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "Same shape with \'input\', each value of input replaced with bucket index.\n\n@compatibility(numpy)\nEquivalent to np.digitize.\n@end_compatibility"
     type: DT_INT32
   }
   attr {
@@ -4425,10 +4080,7 @@ op {
   attr {
     name: "boundaries"
     type: "list(float)"
-    description: "A sorted list of floats gives the boundary of the buckets."
   }
-  summary: "Bucketizes \'input\' based on \'boundaries\'."
-  description: "For example, if the inputs are\n    boundaries = [0, 10, 100]\n    input = [[-5, 10000]\n             [150,   10]\n             [5,    100]]\n\nthen the output will be\n    output = [[0, 3]\n              [3, 2]\n              [1, 3]]"
 }
 op {
   name: "BytesProducedStatsDataset"
@@ -4456,54 +4108,45 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Records the bytes size of each element of `input_dataset` in a StatsAggregator."
 }
 op {
   name: "CTCBeamSearchDecoder"
   input_arg {
     name: "inputs"
-    description: "3-D, shape: `(max_time x batch_size x num_classes)`, the logits."
     type: DT_FLOAT
   }
   input_arg {
     name: "sequence_length"
-    description: "A vector containing sequence lengths, size `(batch)`."
     type: DT_INT32
   }
   output_arg {
     name: "decoded_indices"
-    description: "A list (length: top_paths) of indices matrices.  Matrix j,\nsize `(total_decoded_outputs[j] x 2)`, has indices of a\n`SparseTensor<int64, 2>`.  The rows store: [batch, time]."
     type: DT_INT64
     number_attr: "top_paths"
   }
   output_arg {
     name: "decoded_values"
-    description: "A list (length: top_paths) of values vectors.  Vector j,\nsize `(length total_decoded_outputs[j])`, has the values of a\n`SparseTensor<int64, 2>`.  The vector stores the decoded classes for beam j."
     type: DT_INT64
     number_attr: "top_paths"
   }
   output_arg {
     name: "decoded_shape"
-    description: "A list (length: top_paths) of shape vector.  Vector j,\nsize `(2)`, stores the shape of the decoded `SparseTensor[j]`.\nIts values are: `[batch_size, max_decoded_length[j]]`."
     type: DT_INT64
     number_attr: "top_paths"
   }
   output_arg {
     name: "log_probability"
-    description: "A matrix, shaped: `(batch_size x top_paths)`.  The\nsequence log-probabilities."
     type: DT_FLOAT
   }
   attr {
     name: "beam_width"
     type: "int"
-    description: "A scalar >= 0 (beam search beam width)."
     has_minimum: true
     minimum: 1
   }
   attr {
     name: "top_paths"
     type: "int"
-    description: "A scalar >= 0, <= beam_width (controls output size)."
     has_minimum: true
     minimum: 1
   }
@@ -4513,41 +4156,32 @@ op {
     default_value {
       b: true
     }
-    description: "If true, merge repeated classes in output."
   }
-  summary: "Performs beam search decoding on the logits given in input."
-  description: "A note about the attribute merge_repeated: For the beam search decoder,\nthis means that if consecutive entries in a beam are the same, only\nthe first of these is emitted.  That is, when the top path is \"A B B B B\",\n\"A B\" is returned if merge_repeated = True but \"A B B B B\" is\nreturned if merge_repeated = False."
 }
 op {
   name: "CTCGreedyDecoder"
   input_arg {
     name: "inputs"
-    description: "3-D, shape: `(max_time x batch_size x num_classes)`, the logits."
     type: DT_FLOAT
   }
   input_arg {
     name: "sequence_length"
-    description: "A vector containing sequence lengths, size `(batch_size)`."
     type: DT_INT32
   }
   output_arg {
     name: "decoded_indices"
-    description: "Indices matrix, size `(total_decoded_outputs x 2)`,\nof a `SparseTensor<int64, 2>`.  The rows store: [batch, time]."
     type: DT_INT64
   }
   output_arg {
     name: "decoded_values"
-    description: "Values vector, size: `(total_decoded_outputs)`,\nof a `SparseTensor<int64, 2>`.  The vector stores the decoded classes."
     type: DT_INT64
   }
   output_arg {
     name: "decoded_shape"
-    description: "Shape vector, size `(2)`, of the decoded SparseTensor.\nValues are: `[batch_size, max_decoded_length]`."
     type: DT_INT64
   }
   output_arg {
     name: "log_probability"
-    description: "Matrix, size `(batch_size x 1)`, containing sequence\nlog-probabilities."
     type: DT_FLOAT
   }
   attr {
@@ -4556,41 +4190,32 @@ op {
     default_value {
       b: false
     }
-    description: "If True, merge repeated classes in output."
   }
-  summary: "Performs greedy decoding on the logits given in inputs."
-  description: "A note about the attribute merge_repeated: if enabled, when\nconsecutive logits\' maximum indices are the same, only the first of\nthese is emitted.  Labeling the blank \'*\', the sequence \"A B B * B B\"\nbecomes \"A B B\" if merge_repeated = True and \"A B B B B\" if\nmerge_repeated = False.\n\nRegardless of the value of merge_repeated, if the maximum index of a given\ntime and batch corresponds to the blank, index `(num_classes - 1)`, no new\nelement is emitted."
 }
 op {
   name: "CTCLoss"
   input_arg {
     name: "inputs"
-    description: "3-D, shape: `(max_time x batch_size x num_classes)`, the logits."
     type: DT_FLOAT
   }
   input_arg {
     name: "labels_indices"
-    description: "The indices of a `SparseTensor<int32, 2>`.\n`labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for\n`(batch b, time t)`."
     type: DT_INT64
   }
   input_arg {
     name: "labels_values"
-    description: "The values (labels) associated with the given batch and time."
     type: DT_INT32
   }
   input_arg {
     name: "sequence_length"
-    description: "A vector containing sequence lengths (batch)."
     type: DT_INT32
   }
   output_arg {
     name: "loss"
-    description: "A vector (batch) containing log-probabilities."
     type: DT_FLOAT
   }
   output_arg {
     name: "gradient"
-    description: "The gradient of `loss`.  3-D, shape:\n`(max_time x batch_size x num_classes)`."
     type: DT_FLOAT
   }
   attr {
@@ -4599,7 +4224,6 @@ op {
     default_value {
       b: false
     }
-    description: "Scalar, if true then repeated labels are\ncollapsed prior to the CTC calculation."
   }
   attr {
     name: "ctc_merge_repeated"
@@ -4607,7 +4231,6 @@ op {
     default_value {
       b: true
     }
-    description: "Scalar.  If set to false, *during* CTC calculation\nrepeated non-blank labels will not be merged and are interpreted as\nindividual labels.  This is a simplified version of CTC."
   }
   attr {
     name: "ignore_longer_outputs_than_inputs"
@@ -4615,10 +4238,7 @@ op {
     default_value {
       b: false
     }
-    description: "Scalar. If set to true, during CTC\ncalculation, items that have longer output sequences than input sequences\nare skipped: they don\'t contribute to the loss term and have zero-gradient."
   }
-  summary: "Calculates the CTC Loss (log probability) for each batch entry.  Also calculates"
-  description: "the gradient.  This class performs the softmax operation for you, so inputs\nshould be e.g. linear projections of outputs by an LSTM."
 }
 op {
   name: "CacheDataset"
@@ -4628,7 +4248,6 @@ op {
   }
   input_arg {
     name: "filename"
-    description: "A path on the filesystem where we should cache the dataset. Note: this\nwill be a directory."
     type: DT_STRING
   }
   output_arg {
@@ -4647,8 +4266,6 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset that caches elements from `input_dataset`."
-  description: "A CacheDataset will iterate over the input_dataset, and store tensors. If the\ncache already exists, the cache will be used. If the cache is inappropriate\n(e.g. cannot be opened, contains tensors of the wrong shape / size), an error\nwill the returned when used."
 }
 op {
   name: "Cast"
@@ -4668,7 +4285,6 @@ op {
     name: "DstT"
     type: "type"
   }
-  summary: "Cast x of type SrcT to y of DstT."
 }
 op {
   name: "Ceil"
@@ -4686,12 +4302,12 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Returns element-wise smallest integer in not less than x."
 }
 op {
   name: "CheckNumerics"
@@ -4709,6 +4325,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -4717,21 +4334,16 @@ op {
   attr {
     name: "message"
     type: "string"
-    description: "Prefix of the error message."
   }
-  summary: "Checks a tensor for NaN and Inf values."
-  description: "When run, reports an `InvalidArgument` error if `tensor` has any values\nthat are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is."
 }
 op {
   name: "Cholesky"
   input_arg {
     name: "input"
-    description: "Shape is `[..., M, M]`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "Shape is `[..., M, M]`."
     type_attr: "T"
   }
   attr {
@@ -4746,24 +4358,19 @@ op {
       }
     }
   }
-  summary: "Computes the Cholesky decomposition of one or more square matrices."
-  description: "The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices.\n\nThe input has to be symmetric and positive definite. Only the lower-triangular\npart of the input will be used for this operation. The upper-triangular part\nwill not be read.\n\nThe output is a tensor of the same shape as the input\ncontaining the Cholesky decompositions for all input submatrices `[..., :, :]`.\n\n**Note**: The gradient computation on GPU is faster for large matrices but\nnot for large batch dimensions when the submatrices are small. In this\ncase it might be faster to use the CPU."
 }
 op {
   name: "CholeskyGrad"
   input_arg {
     name: "l"
-    description: "Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.\nAlgorithm depends only on lower triangular part of the innermost matrices of\nthis tensor."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "df/dl where f is some scalar function. Shape is `[..., M, M]`.\nAlgorithm depends only on lower triangular part of the innermost matrices of\nthis tensor."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "Symmetrized version of df/dA . Shape is `[..., M, M]`"
     type_attr: "T"
   }
   attr {
@@ -4776,30 +4383,24 @@ op {
       }
     }
   }
-  summary: "Computes the reverse mode backpropagated gradient of the Cholesky algorithm."
-  description: "For an explanation see \"Differentiation of the Cholesky algorithm\" by\nIain Murray http://arxiv.org/abs/1602.07527."
 }
 op {
   name: "CompareAndBitpack"
   input_arg {
     name: "input"
-    description: "Values to compare against `threshold` and bitpack."
     type_attr: "T"
   }
   input_arg {
     name: "threshold"
-    description: "Threshold to compare against."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "The bitpacked comparisons."
     type: DT_UINT8
   }
   attr {
     name: "T"
     type: "type"
-    description: "The type of the input and threshold."
     allowed_values {
       list {
         type: DT_BOOL
@@ -4813,8 +4414,6 @@ op {
       }
     }
   }
-  summary: "Compare values of `input` to `threshold` and pack resulting bits into a `uint8`."
-  description: "Each comparison returns a boolean `true` (if `input_value > threshold`)\nor and `false` otherwise.\n\nThis operation is useful for Locality-Sensitive-Hashing (LSH) and other\nalgorithms that use hashing approximations of cosine and `L2` distances;\ncodes can be generated from an input via:\n\n```python\ncodebook_size = 50\ncodebook_bits = codebook_size * 32\ncodebook = tf.get_variable(\'codebook\', [x.shape[-1].value, codebook_bits],\n                           dtype=x.dtype,\n                           initializer=tf.orthogonal_initializer())\ncodes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)\ncodes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32\n# now codes has shape x.shape[:-1] + [codebook_size]\n```\n\n**NOTE**: Currently, the innermost dimension of the tensor must be divisible\nby 8.\n\nGiven an `input` shaped `[s0, s1, ..., s_n]`, the output is\na `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`."
 }
 op {
   name: "Complex"
@@ -4856,8 +4455,6 @@ op {
       }
     }
   }
-  summary: "Converts two real numbers to a complex number."
-  description: "Given a tensor `real` representing the real part of a complex number, and a\ntensor `imag` representing the imaginary part of a complex number, this\noperation returns complex numbers elementwise of the form \\\\(a + bj\\\\), where\n*a* represents the `real` part and *b* represents the `imag` part.\n\nThe input tensors `real` and `imag` must have the same shape.\n\nFor example:\n\n```\n# tensor \'real\' is [2.25, 3.25]\n# tensor `imag` is [4.75, 5.75]\ntf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]\n```"
 }
 op {
   name: "ComplexAbs"
@@ -4895,40 +4492,32 @@ op {
       }
     }
   }
-  summary: "Computes the complex absolute value of a tensor."
-  description: "Given a tensor `x` of complex numbers, this operation returns a tensor of type\n`float` or `double` that is the absolute value of each element in `x`. All\nelements in `x` must be complex numbers of the form \\\\(a + bj\\\\). The absolute\nvalue is computed as \\\\( \\sqrt{a^2 + b^2}\\\\)."
 }
 op {
   name: "ComputeAccidentalHits"
   input_arg {
     name: "true_classes"
-    description: "The true_classes output of UnpackSparseLabels."
     type: DT_INT64
   }
   input_arg {
     name: "sampled_candidates"
-    description: "The sampled_candidates output of CandidateSampler."
     type: DT_INT64
   }
   output_arg {
     name: "indices"
-    description: "A vector of indices corresponding to rows of true_candidates."
     type: DT_INT32
   }
   output_arg {
     name: "ids"
-    description: "A vector of IDs of positions in sampled_candidates that match a true_label\nfor the row with the corresponding index in indices."
     type: DT_INT64
   }
   output_arg {
     name: "weights"
-    description: "A vector of the same length as indices and ids, in which each element\nis -FLOAT_MAX."
     type: DT_FLOAT
   }
   attr {
     name: "num_true"
     type: "int"
-    description: "Number of true labels per context."
   }
   attr {
     name: "seed"
@@ -4936,7 +4525,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either seed or seed2 are set to be non-zero, the random number\ngenerator is seeded by the given seed.  Otherwise, it is seeded by a\nrandom seed."
   }
   attr {
     name: "seed2"
@@ -4944,27 +4532,21 @@ op {
     default_value {
       i: 0
     }
-    description: "An second seed to avoid seed collision."
   }
-  summary: "Computes the ids of the positions in sampled_candidates that match true_labels."
-  description: "When doing log-odds NCE, the result of this op should be passed through a\nSparseToDense op, then added to the logits of the sampled candidates. This has\nthe effect of \'removing\' the sampled labels that match the true labels by\nmaking the classifier sure that they are sampled labels."
 }
 op {
   name: "Concat"
   input_arg {
     name: "concat_dim"
-    description: "0-D.  The dimension along which to concatenate.  Must be in the\nrange [0, rank(values))."
     type: DT_INT32
   }
   input_arg {
     name: "values"
-    description: "The `N` Tensors to concatenate. Their ranks and types must match,\nand their sizes must match in all dimensions except `concat_dim`."
     type_attr: "T"
     number_attr: "N"
   }
   output_arg {
     name: "output"
-    description: "A `Tensor` with the concatenation of values stacked along the\n`concat_dim` dimension.  This tensor\'s shape matches that of `values` except\nin `concat_dim` where it has the sum of the sizes."
     type_attr: "T"
   }
   attr {
@@ -4977,24 +4559,20 @@ op {
     name: "T"
     type: "type"
   }
-  summary: "Concatenates tensors along one dimension."
 }
 op {
   name: "ConcatOffset"
   input_arg {
     name: "concat_dim"
-    description: "The dimension along which to concatenate."
     type: DT_INT32
   }
   input_arg {
     name: "shape"
-    description: "The `N` int32 vectors representing shape of tensors being concatenated."
     type: DT_INT32
     number_attr: "N"
   }
   output_arg {
     name: "offset"
-    description: "The `N` int32 vectors representing the starting offset\nof input tensors within the concatenated output."
     type: DT_INT32
     number_attr: "N"
   }
@@ -5004,25 +4582,20 @@ op {
     has_minimum: true
     minimum: 2
   }
-  summary: "Computes offsets of concat inputs within its output."
-  description: "For example:\n\n```\n# \'x\' is [2, 2, 7]\n# \'y\' is [2, 3, 7]\n# \'z\' is [2, 5, 7]\nconcat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]\n```\n\nThis is typically used by gradient computations for a concat operation."
 }
 op {
   name: "ConcatV2"
   input_arg {
     name: "values"
-    description: "List of `N` Tensors to concatenate. Their ranks and types must match,\nand their sizes must match in all dimensions except `concat_dim`."
     type_attr: "T"
     number_attr: "N"
   }
   input_arg {
     name: "axis"
-    description: "0-D.  The dimension along which to concatenate.  Must be in the\nrange [-rank(values), rank(values))."
     type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    description: "A `Tensor` with the concatenation of values stacked along the\n`concat_dim` dimension.  This tensor\'s shape matches that of `values` except\nin `concat_dim` where it has the sum of the sizes."
     type_attr: "T"
   }
   attr {
@@ -5048,7 +4621,6 @@ op {
       }
     }
   }
-  summary: "Concatenates tensors along one dimension."
 }
 op {
   name: "ConcatenateDataset"
@@ -5076,35 +4648,33 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset that concatenates `input_dataset` with `another_dataset`."
 }
 op {
   name: "ConditionalAccumulator"
   output_arg {
     name: "handle"
-    description: "The handle to the accumulator."
     type: DT_STRING
     is_ref: true
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "The type of the value being accumulated."
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -5114,7 +4684,6 @@ op {
   attr {
     name: "shape"
     type: "shape"
-    description: "The shape of the values, can be [], in which case shape is unknown."
   }
   attr {
     name: "container"
@@ -5122,7 +4691,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this accumulator is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -5130,10 +4698,7 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this accumulator will be shared under the\ngiven name across multiple sessions."
   }
-  summary: "A conditional accumulator for aggregating gradients."
-  description: "The accumulator accepts gradients marked with local_step greater or\nequal to the most recent global_step known to the accumulator. The\naverage can be extracted from the accumulator, provided sufficient\ngradients have been accumulated. Extracting the average automatically\nresets the aggregate to 0, and increments the global_step recorded by\nthe accumulator."
   is_stateful: true
 }
 op {
@@ -5160,8 +4725,6 @@ op {
       }
     }
   }
-  summary: "Returns the complex conjugate of a complex number."
-  description: "Given a tensor `input` of complex numbers, this operation returns a tensor of\ncomplex numbers that are the complex conjugate of each element in `input`. The\ncomplex numbers in `input` must be of the form \\\\(a + bj\\\\), where *a* is the\nreal part and *b* is the imaginary part.\n\nThe complex conjugate returned by this operation is of the form \\\\(a - bj\\\\).\n\nFor example:\n\n```\n# tensor \'input\' is [-2.25 + 4.75j, 3.25 + 5.75j]\ntf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]\n```"
 }
 op {
   name: "ConjugateTranspose"
@@ -5194,8 +4757,6 @@ op {
       }
     }
   }
-  summary: "Shuffle dimensions of x according to a permutation and conjugate the result."
-  description: "The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:\n  `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`\n  `y[i,j,k,...,s,t,u] == conj(x[perm[i], perm[j], perm[k],...,perm[s], perm[t], perm[u]])`"
 }
 op {
   name: "Const"
@@ -5206,34 +4767,27 @@ op {
   attr {
     name: "value"
     type: "tensor"
-    description: "Attr `value` is the tensor to return."
   }
   attr {
     name: "dtype"
     type: "type"
   }
-  summary: "Returns a constant tensor."
 }
 op {
   name: "ControlTrigger"
-  summary: "Does nothing. Serves as a control trigger for scheduling."
-  description: "Only useful as a placeholder for control edges."
 }
 op {
   name: "Conv2D"
   input_arg {
     name: "input"
-    description: "A 4-D tensor. The dimension order is interpreted according to the value\nof `data_format`, see below for details."
     type_attr: "T"
   }
   input_arg {
     name: "filter"
-    description: "A 4-D tensor of shape\n`[filter_height, filter_width, in_channels, out_channels]`"
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "A 4-D tensor. The dimension order is determined by the value of\n`data_format`, see below for details."
     type_attr: "T"
   }
   attr {
@@ -5242,6 +4796,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -5249,7 +4804,6 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    description: "1-D tensor of length 4.  The stride of the sliding window for each\ndimension of `input`. The dimension order is determined by the value of\n  `data_format`, see below for details."
   }
   attr {
     name: "use_cudnn_on_gpu"
@@ -5261,7 +4815,6 @@ op {
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -5275,7 +4828,6 @@ op {
     default_value {
       s: "NHWC"
     }
-    description: "Specify the data format of the input and output data. With the\ndefault format \"NHWC\", the data is stored in the order of:\n    [batch, height, width, channels].\nAlternatively, the format could be \"NCHW\", the data storage order of:\n    [batch, channels, height, width]."
     allowed_values {
       list {
         s: "NHWC"
@@ -5283,29 +4835,35 @@ op {
       }
     }
   }
-  summary: "Computes a 2-D convolution given 4-D `input` and `filter` tensors."
-  description: "Given an input tensor of shape `[batch, in_height, in_width, in_channels]`\nand a filter / kernel tensor of shape\n`[filter_height, filter_width, in_channels, out_channels]`, this op\nperforms the following:\n\n1. Flattens the filter to a 2-D matrix with shape\n   `[filter_height * filter_width * in_channels, output_channels]`.\n2. Extracts image patches from the input tensor to form a *virtual*\n   tensor of shape `[batch, out_height, out_width,\n   filter_height * filter_width * in_channels]`.\n3. For each patch, right-multiplies the filter matrix and the image patch\n   vector.\n\nIn detail, with the default NHWC format,\n\n    output[b, i, j, k] =\n        sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *\n                        filter[di, dj, q, k]\n\nMust have `strides[0] = strides[3] = 1`.  For the most common case of the same\nhorizontal and vertices strides, `strides = [1, stride, stride, 1]`."
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
   name: "Conv2DBackpropFilter"
   input_arg {
     name: "input"
-    description: "4-D with shape `[batch, in_height, in_width, in_channels]`."
     type_attr: "T"
   }
   input_arg {
     name: "filter_sizes"
-    description: "An integer vector representing the tensor shape of `filter`,\nwhere `filter` is a 4-D\n`[filter_height, filter_width, in_channels, out_channels]` tensor."
     type: DT_INT32
   }
   input_arg {
     name: "out_backprop"
-    description: "4-D with shape `[batch, out_height, out_width, out_channels]`.\nGradients w.r.t. the output of the convolution."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "4-D with shape\n`[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.\nthe `filter` input of the convolution."
     type_attr: "T"
   }
   attr {
@@ -5314,6 +4872,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -5321,7 +4880,6 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    description: "The stride of the sliding window for each dimension of the input\nof the convolution. Must be in the same order as the dimension specified with\nformat."
   }
   attr {
     name: "use_cudnn_on_gpu"
@@ -5333,7 +4891,6 @@ op {
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -5347,7 +4904,6 @@ op {
     default_value {
       s: "NHWC"
     }
-    description: "Specify the data format of the input and output data. With the\ndefault format \"NHWC\", the data is stored in the order of:\n    [batch, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCHW\", the data storage order of:\n    [batch, in_channels, in_height, in_width]."
     allowed_values {
       list {
         s: "NHWC"
@@ -5355,28 +4911,35 @@ op {
       }
     }
   }
-  summary: "Computes the gradients of convolution with respect to the filter."
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
   name: "Conv2DBackpropInput"
   input_arg {
     name: "input_sizes"
-    description: "An integer vector representing the shape of `input`,\nwhere `input` is a 4-D `[batch, height, width, channels]` tensor."
     type: DT_INT32
   }
   input_arg {
     name: "filter"
-    description: "4-D with shape\n`[filter_height, filter_width, in_channels, out_channels]`."
     type_attr: "T"
   }
   input_arg {
     name: "out_backprop"
-    description: "4-D with shape `[batch, out_height, out_width, out_channels]`.\nGradients w.r.t. the output of the convolution."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient\nw.r.t. the input of the convolution."
     type_attr: "T"
   }
   attr {
@@ -5385,6 +4948,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -5392,7 +4956,6 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    description: "The stride of the sliding window for each dimension of the input\nof the convolution. Must be in the same order as the dimension specified with\nformat."
   }
   attr {
     name: "use_cudnn_on_gpu"
@@ -5404,7 +4967,6 @@ op {
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -5418,7 +4980,6 @@ op {
     default_value {
       s: "NHWC"
     }
-    description: "Specify the data format of the input and output data. With the\ndefault format \"NHWC\", the data is stored in the order of:\n    [batch, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCHW\", the data storage order of:\n    [batch, in_channels, in_height, in_width]."
     allowed_values {
       list {
         s: "NHWC"
@@ -5426,18 +4987,27 @@ op {
       }
     }
   }
-  summary: "Computes the gradients of convolution with respect to the input."
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
   name: "Conv3D"
   input_arg {
     name: "input"
-    description: "Shape `[batch, in_depth, in_height, in_width, in_channels]`."
     type_attr: "T"
   }
   input_arg {
     name: "filter"
-    description: "Shape `[filter_depth, filter_height, filter_width, in_channels,\nout_channels]`. `in_channels` must match between `input` and `filter`."
     type_attr: "T"
   }
   output_arg {
@@ -5450,6 +5020,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -5458,14 +5029,12 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    description: "1-D tensor of length 5. The stride of the sliding window for each\ndimension of `input`. Must have `strides[0] = strides[4] = 1`."
     has_minimum: true
     minimum: 5
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -5479,7 +5048,6 @@ op {
     default_value {
       s: "NDHWC"
     }
-    description: "The data format of the input and output data. With the\ndefault format \"NDHWC\", the data is stored in the order of:\n    [batch, in_depth, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCDHW\", the data storage order is:\n    [batch, in_channels, in_depth, in_height, in_width]."
     allowed_values {
       list {
         s: "NDHWC"
@@ -5487,24 +5055,32 @@ op {
       }
     }
   }
-  summary: "Computes a 3-D convolution given 5-D `input` and `filter` tensors."
-  description: "In signal processing, cross-correlation is a measure of similarity of\ntwo waveforms as a function of a time-lag applied to one of them. This\nis also known as a sliding dot product or sliding inner-product.\n\nOur Conv3D implements a form of cross-correlation."
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
   name: "Conv3DBackpropFilter"
   input_arg {
     name: "input"
-    description: "Shape `[batch, depth, rows, cols, in_channels]`."
     type_attr: "T"
   }
   input_arg {
     name: "filter"
-    description: "Shape `[depth, rows, cols, in_channels, out_channels]`.\n`in_channels` must match between `input` and `filter`."
     type_attr: "T"
   }
   input_arg {
     name: "out_backprop"
-    description: "Backprop signal of shape `[batch, out_depth, out_rows, out_cols,\nout_channels]`."
     type_attr: "T"
   }
   output_arg {
@@ -5525,14 +5101,12 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    description: "1-D tensor of length 5. The stride of the sliding window for each\ndimension of `input`. Must have `strides[0] = strides[4] = 1`."
     has_minimum: true
     minimum: 5
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -5540,7 +5114,6 @@ op {
       }
     }
   }
-  summary: "Computes the gradients of 3-D convolution with respect to the filter."
   deprecation {
     version: 10
     explanation: "Use Conv3DBackpropFilterV2"
@@ -5550,17 +5123,14 @@ op {
   name: "Conv3DBackpropFilterV2"
   input_arg {
     name: "input"
-    description: "Shape `[batch, depth, rows, cols, in_channels]`."
     type_attr: "T"
   }
   input_arg {
     name: "filter_sizes"
-    description: "An integer vector representing the tensor shape of `filter`,\nwhere `filter` is a 5-D\n`[filter_depth, filter_height, filter_width, in_channels, out_channels]`\ntensor."
     type: DT_INT32
   }
   input_arg {
     name: "out_backprop"
-    description: "Backprop signal of shape `[batch, out_depth, out_rows, out_cols,\nout_channels]`."
     type_attr: "T"
   }
   output_arg {
@@ -5573,6 +5143,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -5581,14 +5152,12 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    description: "1-D tensor of length 5. The stride of the sliding window for each\ndimension of `input`. Must have `strides[0] = strides[4] = 1`."
     has_minimum: true
     minimum: 5
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -5602,7 +5171,6 @@ op {
     default_value {
       s: "NDHWC"
     }
-    description: "The data format of the input and output data. With the\ndefault format \"NDHWC\", the data is stored in the order of:\n    [batch, in_depth, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCDHW\", the data storage order is:\n    [batch, in_channels, in_depth, in_height, in_width]."
     allowed_values {
       list {
         s: "NDHWC"
@@ -5610,23 +5178,32 @@ op {
       }
     }
   }
-  summary: "Computes the gradients of 3-D convolution with respect to the filter."
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
   name: "Conv3DBackpropInput"
   input_arg {
     name: "input"
-    description: "Shape `[batch, depth, rows, cols, in_channels]`."
     type_attr: "T"
   }
   input_arg {
     name: "filter"
-    description: "Shape `[depth, rows, cols, in_channels, out_channels]`.\n`in_channels` must match between `input` and `filter`."
     type_attr: "T"
   }
   input_arg {
     name: "out_backprop"
-    description: "Backprop signal of shape `[batch, out_depth, out_rows, out_cols,\nout_channels]`."
     type_attr: "T"
   }
   output_arg {
@@ -5647,14 +5224,12 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    description: "1-D tensor of length 5. The stride of the sliding window for each\ndimension of `input`. Must have `strides[0] = strides[4] = 1`."
     has_minimum: true
     minimum: 5
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -5662,7 +5237,6 @@ op {
       }
     }
   }
-  summary: "Computes the gradients of 3-D convolution with respect to the input."
   deprecation {
     version: 10
     explanation: "Use Conv3DBackpropInputV2"
@@ -5672,17 +5246,14 @@ op {
   name: "Conv3DBackpropInputV2"
   input_arg {
     name: "input_sizes"
-    description: "An integer vector representing the tensor shape of `input`,\nwhere `input` is a 5-D\n`[batch, depth, rows, cols, in_channels]` tensor."
     type: DT_INT32
   }
   input_arg {
     name: "filter"
-    description: "Shape `[depth, rows, cols, in_channels, out_channels]`.\n`in_channels` must match between `input` and `filter`."
     type_attr: "T"
   }
   input_arg {
     name: "out_backprop"
-    description: "Backprop signal of shape `[batch, out_depth, out_rows, out_cols,\nout_channels]`."
     type_attr: "T"
   }
   output_arg {
@@ -5695,6 +5266,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -5703,14 +5275,12 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    description: "1-D tensor of length 5. The stride of the sliding window for each\ndimension of `input`. Must have `strides[0] = strides[4] = 1`."
     has_minimum: true
     minimum: 5
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -5724,7 +5294,6 @@ op {
     default_value {
       s: "NDHWC"
     }
-    description: "The data format of the input and output data. With the\ndefault format \"NDHWC\", the data is stored in the order of:\n    [batch, in_depth, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCDHW\", the data storage order is:\n    [batch, in_channels, in_depth, in_height, in_width]."
     allowed_values {
       list {
         s: "NDHWC"
@@ -5732,7 +5301,19 @@ op {
       }
     }
   }
-  summary: "Computes the gradients of 3-D convolution with respect to the input."
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
   name: "Copy"
@@ -5824,6 +5405,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -5831,7 +5413,6 @@ op {
       }
     }
   }
-  summary: "Computes cos of x element-wise."
 }
 op {
   name: "Cosh"
@@ -5849,6 +5430,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -5856,25 +5438,21 @@ op {
       }
     }
   }
-  summary: "Computes hyperbolic cosine of x element-wise."
 }
 op {
   name: "CountUpTo"
   input_arg {
     name: "ref"
-    description: "Should be from a scalar `Variable` node."
     type_attr: "T"
     is_ref: true
   }
   output_arg {
     name: "output"
-    description: "A copy of the input before increment. If nothing else modifies the\ninput, the values produced will all be distinct."
     type_attr: "T"
   }
   attr {
     name: "limit"
     type: "int"
-    description: "If incrementing ref would bring it above limit, instead generates an\n\'OutOfRange\' error."
   }
   attr {
     name: "T"
@@ -5886,33 +5464,49 @@ op {
       }
     }
   }
-  summary: "Increments \'ref\' until it reaches \'limit\'."
+}
+op {
+  name: "CriticalSectionOp"
+  output_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
 }
 op {
   name: "CropAndResize"
   input_arg {
     name: "image"
-    description: "A 4-D tensor of shape `[batch, image_height, image_width, depth]`.\nBoth `image_height` and `image_width` need to be positive."
     type_attr: "T"
   }
   input_arg {
     name: "boxes"
-    description: "A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor\nspecifies the coordinates of a box in the `box_ind[i]` image and is specified\nin normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of\n`y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the\n`[0, 1]` interval of normalized image height is mapped to\n`[0, image_height - 1]` in image height coordinates. We do allow `y1` > `y2`, in\nwhich case the sampled crop is an up-down flipped version of the original\nimage. The width dimension is treated similarly. Normalized coordinates\noutside the `[0, 1]` range are allowed, in which case we use\n`extrapolation_value` to extrapolate the input image values."
     type: DT_FLOAT
   }
   input_arg {
     name: "box_ind"
-    description: "A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.\nThe value of `box_ind[i]` specifies the image that the `i`-th box refers to."
     type: DT_INT32
   }
   input_arg {
     name: "crop_size"
-    description: "A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All\ncropped image patches are resized to this size. The aspect ratio of the image\ncontent is not preserved. Both `crop_height` and `crop_width` need to be\npositive."
     type: DT_INT32
   }
   output_arg {
     name: "crops"
-    description: "A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`."
     type: DT_FLOAT
   }
   attr {
@@ -5938,7 +5532,6 @@ op {
     default_value {
       s: "bilinear"
     }
-    description: "A string specifying the interpolation method. Only \'bilinear\' is\nsupported for now."
     allowed_values {
       list {
         s: "bilinear"
@@ -5951,36 +5544,28 @@ op {
     default_value {
       f: 0
     }
-    description: "Value used for extrapolation, when applicable."
   }
-  summary: "Extracts crops from the input image tensor and bilinearly resizes them (possibly"
-  description: "with aspect ratio change) to a common output size specified by `crop_size`. This\nis more general than the `crop_to_bounding_box` op which extracts a fixed size\nslice from the input image and does not allow resizing or aspect ratio change.\n\nReturns a tensor with `crops` from the input `image` at positions defined at the\nbounding box locations in `boxes`. The cropped boxes are all resized (with\nbilinear interpolation) to a fixed `size = [crop_height, crop_width]`. The\nresult is a 4-D tensor `[num_boxes, crop_height, crop_width, depth]`. The\nresizing is corner aligned. In particular, if `boxes = [[0, 0, 1, 1]]`, the\nmethod will give identical results to using `tf.image.resize_bilinear()`\nwith `align_corners=True`."
 }
 op {
   name: "CropAndResizeGradBoxes"
   input_arg {
     name: "grads"
-    description: "A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`."
     type: DT_FLOAT
   }
   input_arg {
     name: "image"
-    description: "A 4-D tensor of shape `[batch, image_height, image_width, depth]`.\nBoth `image_height` and `image_width` need to be positive."
     type_attr: "T"
   }
   input_arg {
     name: "boxes"
-    description: "A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor\nspecifies the coordinates of a box in the `box_ind[i]` image and is specified\nin normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of\n`y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the\n`[0, 1]` interval of normalized image height is mapped to\n`[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in\nwhich case the sampled crop is an up-down flipped version of the original\nimage. The width dimension is treated similarly. Normalized coordinates\noutside the `[0, 1]` range are allowed, in which case we use\n`extrapolation_value` to extrapolate the input image values."
     type: DT_FLOAT
   }
   input_arg {
     name: "box_ind"
-    description: "A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.\nThe value of `box_ind[i]` specifies the image that the `i`-th box refers to."
     type: DT_INT32
   }
   output_arg {
     name: "output"
-    description: "A 2-D tensor of shape `[num_boxes, 4]`."
     type: DT_FLOAT
   }
   attr {
@@ -6006,40 +5591,33 @@ op {
     default_value {
       s: "bilinear"
     }
-    description: "A string specifying the interpolation method. Only \'bilinear\' is\nsupported for now."
     allowed_values {
       list {
         s: "bilinear"
       }
     }
   }
-  summary: "Computes the gradient of the crop_and_resize op wrt the input boxes tensor."
 }
 op {
   name: "CropAndResizeGradImage"
   input_arg {
     name: "grads"
-    description: "A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`."
     type: DT_FLOAT
   }
   input_arg {
     name: "boxes"
-    description: "A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor\nspecifies the coordinates of a box in the `box_ind[i]` image and is specified\nin normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of\n`y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the\n`[0, 1]` interval of normalized image height is mapped to\n`[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in\nwhich case the sampled crop is an up-down flipped version of the original\nimage. The width dimension is treated similarly. Normalized coordinates\noutside the `[0, 1]` range are allowed, in which case we use\n`extrapolation_value` to extrapolate the input image values."
     type: DT_FLOAT
   }
   input_arg {
     name: "box_ind"
-    description: "A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.\nThe value of `box_ind[i]` specifies the image that the `i`-th box refers to."
     type: DT_INT32
   }
   input_arg {
     name: "image_size"
-    description: "A 1-D tensor with value `[batch, image_height, image_width, depth]`\ncontaining the original image size. Both `image_height` and `image_width` need\nto be positive."
     type: DT_INT32
   }
   output_arg {
     name: "output"
-    description: "A 4-D tensor of shape `[batch, image_height, image_width, depth]`."
     type_attr: "T"
   }
   attr {
@@ -6059,30 +5637,25 @@ op {
     default_value {
       s: "bilinear"
     }
-    description: "A string specifying the interpolation method. Only \'bilinear\' is\nsupported for now."
     allowed_values {
       list {
         s: "bilinear"
       }
     }
   }
-  summary: "Computes the gradient of the crop_and_resize op wrt the input image tensor."
 }
 op {
   name: "Cross"
   input_arg {
     name: "a"
-    description: "A tensor containing 3-element vectors."
     type_attr: "T"
   }
   input_arg {
     name: "b"
-    description: "Another tensor, of same type and shape as `a`."
     type_attr: "T"
   }
   output_arg {
     name: "product"
-    description: "Pairwise cross product of the vectors in `a` and `b`."
     type_attr: "T"
   }
   attr {
@@ -6093,10 +5666,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -6104,19 +5678,15 @@ op {
       }
     }
   }
-  summary: "Compute the pairwise cross product."
-  description: "`a` and `b` must be the same shape; they can either be simple 3-element vectors,\nor any shape where the innermost dimension is 3. In the latter case, each pair\nof corresponding 3-element vectors is cross-multiplied independently."
 }
 op {
   name: "Cumprod"
   input_arg {
     name: "x"
-    description: "A `Tensor`. Must be one of the following types: `float32`, `float64`,\n`int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,\n`complex128`, `qint8`, `quint8`, `qint32`, `half`."
     type_attr: "T"
   }
   input_arg {
     name: "axis"
-    description: "A `Tensor` of type `int32` (default: 0). Must be in the range\n`[-rank(x), rank(x))`."
     type_attr: "Tidx"
   }
   output_arg {
@@ -6129,7 +5699,6 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, perform exclusive cumprod."
   }
   attr {
     name: "reverse"
@@ -6137,7 +5706,6 @@ op {
     default_value {
       b: false
     }
-    description: "A `bool` (default: False)."
   }
   attr {
     name: "T"
@@ -6146,17 +5714,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -6176,19 +5745,15 @@ op {
       }
     }
   }
-  summary: "Compute the cumulative product of the tensor `x` along `axis`."
-  description: "By default, this op performs an inclusive cumprod, which means that the first\nelement of the input is identical to the first element of the output:\n\n```python\ntf.cumprod([a, b, c])  # => [a, a * b, a * b * c]\n```\n\nBy setting the `exclusive` kwarg to `True`, an exclusive cumprod is\nperformed instead:\n\n```python\ntf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]\n```\n\nBy setting the `reverse` kwarg to `True`, the cumprod is performed in the\nopposite direction:\n\n```python\ntf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]\n```\n\nThis is more efficient than using separate `tf.reverse` ops.\n\nThe `reverse` and `exclusive` kwargs can also be combined:\n\n```python\ntf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]\n```"
 }
 op {
   name: "Cumsum"
   input_arg {
     name: "x"
-    description: "A `Tensor`. Must be one of the following types: `float32`, `float64`,\n`int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,\n`complex128`, `qint8`, `quint8`, `qint32`, `half`."
     type_attr: "T"
   }
   input_arg {
     name: "axis"
-    description: "A `Tensor` of type `int32` (default: 0). Must be in the range\n`[-rank(x), rank(x))`."
     type_attr: "Tidx"
   }
   output_arg {
@@ -6201,7 +5766,6 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, perform exclusive cumsum."
   }
   attr {
     name: "reverse"
@@ -6209,7 +5773,6 @@ op {
     default_value {
       b: false
     }
-    description: "A `bool` (default: False)."
   }
   attr {
     name: "T"
@@ -6218,17 +5781,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -6248,19 +5812,91 @@ op {
       }
     }
   }
-  summary: "Compute the cumulative sum of the tensor `x` along `axis`."
-  description: "By default, this op performs an inclusive cumsum, which means that the first\nelement of the input is identical to the first element of the output:\n\n```python\ntf.cumsum([a, b, c])  # => [a, a + b, a + b + c]\n```\n\nBy setting the `exclusive` kwarg to `True`, an exclusive cumsum is\nperformed instead:\n\n```python\ntf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]\n```\n\nBy setting the `reverse` kwarg to `True`, the cumsum is performed in the\nopposite direction:\n\n```python\ntf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]\n```\n\nThis is more efficient than using separate `tf.reverse` ops.\n\nThe `reverse` and `exclusive` kwargs can also be combined:\n\n```python\ntf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]\n```"
+}
+op {
+  name: "DataFormatDimMap"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "src_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "dst_format"
+    type: "string"
+    default_value {
+      s: "NCHW"
+    }
+  }
+}
+op {
+  name: "DataFormatVecPermute"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "src_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+  }
+  attr {
+    name: "dst_format"
+    type: "string"
+    default_value {
+      s: "NCHW"
+    }
+  }
 }
 op {
   name: "DatasetToSingleElement"
   input_arg {
     name: "dataset"
-    description: "A handle to a dataset that contains a single element."
     type: DT_VARIANT
   }
   output_arg {
     name: "components"
-    description: "The components of the single element of `input`."
     type_list_attr: "output_types"
   }
   attr {
@@ -6275,7 +5911,6 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Outputs the single element from the given dataset."
 }
 op {
   name: "DebugGradientIdentity"
@@ -6291,8 +5926,24 @@ op {
     name: "T"
     type: "type"
   }
-  summary: "Identity op for gradient debugging."
-  description: "This op is hidden from public in Python. It is used by TensorFlow Debugger to\nregister gradient tensors for gradient debugging."
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugGradientRefIdentity"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+    is_ref: true
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
   allows_uninitialized_input: true
 }
 op {
@@ -6479,17 +6130,14 @@ op {
   name: "DecodeAndCropJpeg"
   input_arg {
     name: "contents"
-    description: "0-D.  The JPEG-encoded image."
     type: DT_STRING
   }
   input_arg {
     name: "crop_window"
-    description: "1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width]."
     type: DT_INT32
   }
   output_arg {
     name: "image"
-    description: "3-D with shape `[height, width, channels]`.."
     type: DT_UINT8
   }
   attr {
@@ -6498,7 +6146,6 @@ op {
     default_value {
       i: 0
     }
-    description: "Number of color channels for the decoded image."
   }
   attr {
     name: "ratio"
@@ -6506,7 +6153,6 @@ op {
     default_value {
       i: 1
     }
-    description: "Downscaling ratio."
   }
   attr {
     name: "fancy_upscaling"
@@ -6514,7 +6160,6 @@ op {
     default_value {
       b: true
     }
-    description: "If true use a slower but nicer upscaling of the\nchroma planes (yuv420/422 only)."
   }
   attr {
     name: "try_recover_truncated"
@@ -6522,7 +6167,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true try to recover an image from truncated input."
   }
   attr {
     name: "acceptable_fraction"
@@ -6530,7 +6174,6 @@ op {
     default_value {
       f: 1
     }
-    description: "The minimum required fraction of lines before a truncated\ninput is accepted."
   }
   attr {
     name: "dct_method"
@@ -6538,36 +6181,27 @@ op {
     default_value {
       s: ""
     }
-    description: "string specifying a hint about the algorithm used for\ndecompression.  Defaults to \"\" which maps to a system-specific\ndefault.  Currently valid values are [\"INTEGER_FAST\",\n\"INTEGER_ACCURATE\"].  The hint may be ignored (e.g., the internal\njpeg library changes to a version that does not have that specific\noption.)"
   }
-  summary: "Decode and Crop a JPEG-encoded image to a uint8 tensor."
-  description: "The attr `channels` indicates the desired number of color channels for the\ndecoded image.\n\nAccepted values are:\n\n*   0: Use the number of channels in the JPEG-encoded image.\n*   1: output a grayscale image.\n*   3: output an RGB image.\n\nIf needed, the JPEG-encoded image is transformed to match the requested number\nof color channels.\n\nThe attr `ratio` allows downscaling the image by an integer factor during\ndecoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than\ndownscaling the image later.\n\n\nIt is equivalent to a combination of decode and crop, but much faster by only\ndecoding partial jpeg image."
 }
 op {
   name: "DecodeBase64"
   input_arg {
     name: "input"
-    description: "Base64 strings to decode."
     type: DT_STRING
   }
   output_arg {
     name: "output"
-    description: "Decoded strings."
     type: DT_STRING
   }
-  summary: "Decode web-safe base64-encoded strings."
-  description: "Input may or may not have padding at the end. See EncodeBase64 for padding.\nWeb-safe means that input must use - and _ instead of + and /."
 }
 op {
   name: "DecodeBmp"
   input_arg {
     name: "contents"
-    description: "0-D.  The BMP-encoded image."
     type: DT_STRING
   }
   output_arg {
     name: "image"
-    description: "3-D with shape `[height, width, channels]`. RGB order"
     type: DT_UINT8
   }
   attr {
@@ -6577,24 +6211,19 @@ op {
       i: 0
     }
   }
-  summary: "Decode the first frame of a BMP-encoded image to a uint8 tensor."
-  description: "The attr `channels` indicates the desired number of color channels for the\ndecoded image.\n\nAccepted values are:\n\n*   0: Use the number of channels in the BMP-encoded image.\n*   3: output an RGB image.\n*   4: output an RGBA image."
 }
 op {
   name: "DecodeCSV"
   input_arg {
     name: "records"
-    description: "Each string is a record/row in the csv and all records should have\nthe same format."
     type: DT_STRING
   }
   input_arg {
     name: "record_defaults"
-    description: "One tensor per column of the input record, with either a\nscalar default value for that column or empty if the column is required."
     type_list_attr: "OUT_TYPE"
   }
   output_arg {
     name: "output"
-    description: "Each tensor will have the same shape as records."
     type_list_attr: "OUT_TYPE"
   }
   attr {
@@ -6618,7 +6247,6 @@ op {
     default_value {
       s: ","
     }
-    description: "char delimiter to separate fields in a record."
   }
   attr {
     name: "use_quote_delim"
@@ -6626,7 +6254,6 @@ op {
     default_value {
       b: true
     }
-    description: "If false, treats double quotation marks as regular\ncharacters inside of the string fields (ignoring RFC 4180, Section 2,\nBullet 5)."
   }
   attr {
     name: "na_value"
@@ -6634,51 +6261,56 @@ op {
     default_value {
       s: ""
     }
-    description: "Additional string to recognize as NA/NaN."
   }
-  summary: "Convert CSV records to tensors. Each column maps to one tensor."
-  description: "RFC 4180 format is expected for the CSV records.\n(https://tools.ietf.org/html/rfc4180)\nNote that we allow leading and trailing spaces with int or float field."
+}
+op {
+  name: "DecodeCompressed"
+  input_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "compression_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
 }
 op {
   name: "DecodeGif"
   input_arg {
     name: "contents"
-    description: "0-D.  The GIF-encoded image."
     type: DT_STRING
   }
   output_arg {
     name: "image"
-    description: "4-D with shape `[num_frames, height, width, 3]`. RGB order"
     type: DT_UINT8
   }
-  summary: "Decode the first frame of a GIF-encoded image to a uint8 tensor."
-  description: "GIF with frame or transparency compression are not supported\nconvert animated GIF from compressed to uncompressed by:\n\n    convert $src.gif -coalesce $dst.gif\n\nThis op also supports decoding JPEGs and PNGs, though it is cleaner to use\n`tf.image.decode_image`."
 }
 op {
   name: "DecodeJSONExample"
   input_arg {
     name: "json_examples"
-    description: "Each string is a JSON object serialized according to the JSON\nmapping of the Example proto."
     type: DT_STRING
   }
   output_arg {
     name: "binary_examples"
-    description: "Each string is a binary Example protocol buffer corresponding\nto the respective element of `json_examples`."
     type: DT_STRING
   }
-  summary: "Convert JSON-encoded Example records to binary protocol buffer strings."
-  description: "This op translates a tensor containing Example records, encoded using\nthe [standard JSON\nmapping](https://developers.google.com/protocol-buffers/docs/proto3#json),\ninto a tensor containing the same records encoded as binary protocol\nbuffers. The resulting tensor can then be fed to any of the other\nExample-parsing ops."
 }
 op {
   name: "DecodeJpeg"
   input_arg {
     name: "contents"
-    description: "0-D.  The JPEG-encoded image."
     type: DT_STRING
   }
   output_arg {
     name: "image"
-    description: "3-D with shape `[height, width, channels]`.."
     type: DT_UINT8
   }
   attr {
@@ -6687,7 +6319,6 @@ op {
     default_value {
       i: 0
     }
-    description: "Number of color channels for the decoded image."
   }
   attr {
     name: "ratio"
@@ -6695,7 +6326,6 @@ op {
     default_value {
       i: 1
     }
-    description: "Downscaling ratio."
   }
   attr {
     name: "fancy_upscaling"
@@ -6703,7 +6333,6 @@ op {
     default_value {
       b: true
     }
-    description: "If true use a slower but nicer upscaling of the\nchroma planes (yuv420/422 only)."
   }
   attr {
     name: "try_recover_truncated"
@@ -6711,7 +6340,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true try to recover an image from truncated input."
   }
   attr {
     name: "acceptable_fraction"
@@ -6719,7 +6347,6 @@ op {
     default_value {
       f: 1
     }
-    description: "The minimum required fraction of lines before a truncated\ninput is accepted."
   }
   attr {
     name: "dct_method"
@@ -6727,21 +6354,16 @@ op {
     default_value {
       s: ""
     }
-    description: "string specifying a hint about the algorithm used for\ndecompression.  Defaults to \"\" which maps to a system-specific\ndefault.  Currently valid values are [\"INTEGER_FAST\",\n\"INTEGER_ACCURATE\"].  The hint may be ignored (e.g., the internal\njpeg library changes to a version that does not have that specific\noption.)"
   }
-  summary: "Decode a JPEG-encoded image to a uint8 tensor."
-  description: "The attr `channels` indicates the desired number of color channels for the\ndecoded image.\n\nAccepted values are:\n\n*   0: Use the number of channels in the JPEG-encoded image.\n*   1: output a grayscale image.\n*   3: output an RGB image.\n\nIf needed, the JPEG-encoded image is transformed to match the requested number\nof color channels.\n\nThe attr `ratio` allows downscaling the image by an integer factor during\ndecoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than\ndownscaling the image later.\n\n\nThis op also supports decoding PNGs and non-animated GIFs since the interface is\nthe same, though it is cleaner to use `tf.image.decode_image`."
 }
 op {
   name: "DecodePng"
   input_arg {
     name: "contents"
-    description: "0-D.  The PNG-encoded image."
     type: DT_STRING
   }
   output_arg {
     name: "image"
-    description: "3-D with shape `[height, width, channels]`."
     type_attr: "dtype"
   }
   attr {
@@ -6750,7 +6372,6 @@ op {
     default_value {
       i: 0
     }
-    description: "Number of color channels for the decoded image."
   }
   attr {
     name: "dtype"
@@ -6765,19 +6386,15 @@ op {
       }
     }
   }
-  summary: "Decode a PNG-encoded image to a uint8 or uint16 tensor."
-  description: "The attr `channels` indicates the desired number of color channels for the\ndecoded image.\n\nAccepted values are:\n\n*   0: Use the number of channels in the PNG-encoded image.\n*   1: output a grayscale image.\n*   3: output an RGB image.\n*   4: output an RGBA image.\n\nIf needed, the PNG-encoded image is transformed to match the requested number\nof color channels.\n\nThis op also supports decoding JPEGs and non-animated GIFs since the interface\nis the same, though it is cleaner to use `tf.image.decode_image`."
 }
 op {
   name: "DecodeRaw"
   input_arg {
     name: "bytes"
-    description: "All the elements must have the same length."
     type: DT_STRING
   }
   output_arg {
     name: "output"
-    description: "A Tensor with one more dimension than the input `bytes`.  The\nadded dimension will have size equal to the length of the elements\nof `bytes` divided by the number of bytes to represent `out_type`."
     type_attr: "out_type"
   }
   attr {
@@ -6803,25 +6420,20 @@ op {
     default_value {
       b: true
     }
-    description: "Whether the input `bytes` are in little-endian order.\nIgnored for `out_type` values that are stored in a single byte like\n`uint8`."
   }
-  summary: "Reinterpret the bytes of a string as a vector of numbers."
 }
 op {
   name: "DecodeWav"
   input_arg {
     name: "contents"
-    description: "The WAV-encoded audio, usually from a file."
     type: DT_STRING
   }
   output_arg {
     name: "audio"
-    description: "2-D with shape `[length, channels]`."
     type: DT_FLOAT
   }
   output_arg {
     name: "sample_rate"
-    description: "Scalar holding the sample rate found in the WAV header."
     type: DT_INT32
   }
   attr {
@@ -6830,7 +6442,6 @@ op {
     default_value {
       i: -1
     }
-    description: "Number of sample channels wanted."
   }
   attr {
     name: "desired_samples"
@@ -6838,45 +6449,36 @@ op {
     default_value {
       i: -1
     }
-    description: "Length of audio requested."
   }
-  summary: "Decode a 16-bit PCM WAV file to a float tensor."
-  description: "The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.\n\nWhen desired_channels is set, if the input contains fewer channels than this\nthen the last channel will be duplicated to give the requested number, else if\nthe input has more channels than requested then the additional channels will be\nignored.\n\nIf desired_samples is set, then the audio will be cropped or padded with zeroes\nto the requested length.\n\nThe first output contains a Tensor with the content of the audio samples. The\nlowest dimension will be the number of channels, and the second will be the\nnumber of samples. For example, a ten-sample-long stereo WAV file should give an\noutput shape of [10, 2]."
 }
 op {
   name: "DeleteSessionTensor"
   input_arg {
     name: "handle"
-    description: "The handle for a tensor stored in the session state."
     type: DT_STRING
   }
-  summary: "Delete the tensor specified by its handle in the session."
+  is_stateful: true
 }
 op {
   name: "DenseToDenseSetOperation"
   input_arg {
     name: "set1"
-    description: "`Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.\nDimension `n` contains values in a set, duplicates are allowed but ignored."
     type_attr: "T"
   }
   input_arg {
     name: "set2"
-    description: "`Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.\nDimension `n` contains values in a set, duplicates are allowed but ignored."
     type_attr: "T"
   }
   output_arg {
     name: "result_indices"
-    description: "2D indices of a `SparseTensor`."
     type: DT_INT64
   }
   output_arg {
     name: "result_values"
-    description: "1D values of a `SparseTensor`."
     type_attr: "T"
   }
   output_arg {
     name: "result_shape"
-    description: "1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is\nthe same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`\nis the max result set size across all `0...n-1` dimensions."
     type: DT_INT64
   }
   attr {
@@ -6905,24 +6507,19 @@ op {
       }
     }
   }
-  summary: "Applies set operation along last dimension of 2 `Tensor` inputs."
-  description: "See SetOperationOp::SetOperationFromContext for values of `set_operation`.\n\nOutput `result` is a `SparseTensor` represented by `result_indices`,\n`result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this\nhas rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`\ndimension contains the result of `set_operation` applied to the corresponding\n`[0...n-1]` dimension of `set`."
 }
 op {
   name: "DenseToSparseBatchDataset"
   input_arg {
     name: "input_dataset"
-    description: "A handle to an input dataset. Must have a single component."
     type: DT_VARIANT
   }
   input_arg {
     name: "batch_size"
-    description: "A scalar representing the number of elements to accumulate in a\nbatch."
     type: DT_INT64
   }
   input_arg {
     name: "row_shape"
-    description: "A vector representing the dense shape of each row in the produced\nSparseTensor. The shape may be partially specified, using `-1` to indicate\nthat a particular dimension should use the maximum size of all batch elements."
     type: DT_INT64
   }
   output_arg {
@@ -6941,43 +6538,35 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset that yields a SparseTensor for each element of the input."
 }
 op {
   name: "DenseToSparseSetOperation"
   input_arg {
     name: "set1"
-    description: "`Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.\nDimension `n` contains values in a set, duplicates are allowed but ignored."
     type_attr: "T"
   }
   input_arg {
     name: "set2_indices"
-    description: "2D `Tensor`, indices of a `SparseTensor`. Must be in row-major\norder."
     type: DT_INT64
   }
   input_arg {
     name: "set2_values"
-    description: "1D `Tensor`, values of a `SparseTensor`. Must be in row-major\norder."
     type_attr: "T"
   }
   input_arg {
     name: "set2_shape"
-    description: "1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must\nbe the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the\nmax set size across `n-1` dimensions."
     type: DT_INT64
   }
   output_arg {
     name: "result_indices"
-    description: "2D indices of a `SparseTensor`."
     type: DT_INT64
   }
   output_arg {
     name: "result_values"
-    description: "1D values of a `SparseTensor`."
     type_attr: "T"
   }
   output_arg {
     name: "result_shape"
-    description: "1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is\nthe same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`\nis the max result set size across all `0...n-1` dimensions."
     type: DT_INT64
   }
   attr {
@@ -7006,8 +6595,6 @@ op {
       }
     }
   }
-  summary: "Applies set operation along last dimension of `Tensor` and `SparseTensor`."
-  description: "See SetOperationOp::SetOperationFromContext for values of `set_operation`.\n\nInput `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,\nand `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same\nas `set1`. Dimension `n` contains values in a set, duplicates are allowed but\nignored.\n\nIf `validate_indices` is `True`, this op validates the order and range of `set2`\nindices.\n\nOutput `result` is a `SparseTensor` represented by `result_indices`,\n`result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this\nhas rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`\ndimension contains the result of `set_operation` applied to the corresponding\n`[0...n-1]` dimension of `set`."
 }
 op {
   name: "DepthToSpace"
@@ -7026,7 +6613,6 @@ op {
   attr {
     name: "block_size"
     type: "int"
-    description: "The size of the spatial block, same as in Space2Depth."
     has_minimum: true
     minimum: 2
   }
@@ -7044,8 +6630,6 @@ op {
       }
     }
   }
-  summary: "DepthToSpace for tensors of type T."
-  description: "Rearranges data from depth into blocks of spatial data.\nThis is the reverse transformation of SpaceToDepth. More specifically,\nthis op outputs a copy of the input tensor where values from the `depth`\ndimension are moved in spatial blocks to the `height` and `width` dimensions.\nThe attr `block_size` indicates the input block size and how the data is moved.\n\n  * Chunks of data of size `block_size * block_size` from depth are rearranged\n    into non-overlapping blocks of size `block_size x block_size`\n  * The width the output tensor is `input_depth * block_size`, whereas the\n    height is `input_height * block_size`.\n  * The Y, X coordinates within each block of the output image are determined\n    by the high order component of the input channel index.\n  * The depth of the input tensor must be divisible by\n    `block_size * block_size`.\n\nThe `data_format` attr specifies the layout of the input and output tensors\nwith the following options:\n  \"NHWC\": `[ batch, height, width, channels ]`\n  \"NCHW\": `[ batch, channels, height, width ]`\n  \"NCHW_VECT_C\":\n      `qint8 [ batch, channels / 4, height, width, channels % 4 ]`\n\nIt is useful to consider the operation as transforming a 6-D Tensor.\ne.g. for data_format = NHWC,\n     Each element in the input tensor can be specified via 6 coordinates,\n     ordered by decreasing memory layout significance as:\n     n,iY,iX,bY,bX,oC  (where n=batch index, iX, iY means X or Y coordinates\n                        within the input image, bX, bY means coordinates\n                        within the output block, oC means output channels).\n     The output would be the input transposed to the following layout:\n     n,iY,bY,iX,bX,oC\n\nThis operation is useful for resizing the activations between convolutions\n(but keeping all data), e.g. instead of pooling. It is also useful for training\npurely convolutional models.\n\nFor example, given an input of shape `[1, 1, 1, 4]`, data_format = \"NHWC\" and\nblock_size = 2:\n\n```\nx = [[[[1, 2, 3, 4]]]]\n\n```\n\nThis operation will output a tensor of shape `[1, 2, 2, 1]`:\n\n```\n   [[[[1], [2]],\n     [[3], [4]]]]\n```\n\nHere, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`,\nthe corresponding output will have 2x2 elements and will have a depth of\n1 channel (1 = `4 / (block_size * block_size)`).\nThe output element shape is `[2, 2, 1]`.\n\nFor an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.\n\n```\nx = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]\n```\n\nThis operation, for block size of 2, will return the following tensor of shape\n`[1, 2, 2, 3]`\n\n```\n   [[[[1, 2, 3], [4, 5, 6]],\n     [[7, 8, 9], [10, 11, 12]]]]\n\n```\n\nSimilarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:\n\n```\nx =  [[[[1, 2, 3, 4],\n       [5, 6, 7, 8]],\n      [[9, 10, 11, 12],\n       [13, 14, 15, 16]]]]\n```\n\nthe operator will return the following tensor of shape `[1 4 4 1]`:\n\n```\nx = [[[ [1],   [2],  [5],  [6]],\n      [ [3],   [4],  [7],  [8]],\n      [ [9],  [10], [13],  [14]],\n      [ [11], [12], [15],  [16]]]]\n\n```"
 }
 op {
   name: "DepthwiseConv2dNative"
@@ -7066,6 +6650,8 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -7074,12 +6660,10 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    description: "1-D of length 4.  The stride of the sliding window for each dimension\nof `input`."
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -7093,7 +6677,6 @@ op {
     default_value {
       s: "NHWC"
     }
-    description: "Specify the data format of the input and output data. With the\ndefault format \"NHWC\", the data is stored in the order of:\n    [batch, height, width, channels].\nAlternatively, the format could be \"NCHW\", the data storage order of:\n    [batch, channels, height, width]."
     allowed_values {
       list {
         s: "NHWC"
@@ -7101,29 +6684,35 @@ op {
       }
     }
   }
-  summary: "Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors."
-  description: "Given an input tensor of shape `[batch, in_height, in_width, in_channels]`\nand a filter / kernel tensor of shape\n`[filter_height, filter_width, in_channels, channel_multiplier]`, containing\n`in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies\na different filter to each input channel (expanding from 1 channel to\n`channel_multiplier` channels for each), then concatenates the results\ntogether. Thus, the output has `in_channels * channel_multiplier` channels.\n\n```\nfor k in 0..in_channels-1\n  for q in 0..channel_multiplier-1\n    output[b, i, j, k * channel_multiplier + q] =\n      sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *\n                        filter[di, dj, k, q]\n```\n\nMust have `strides[0] = strides[3] = 1`.  For the most common case of the same\nhorizontal and vertices strides, `strides = [1, stride, stride, 1]`."
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
   name: "DepthwiseConv2dNativeBackpropFilter"
   input_arg {
     name: "input"
-    description: "4-D with shape based on `data_format`.  For example, if\n`data_format` is \'NHWC\' then `input` is a 4-D `[batch, in_height,\nin_width, in_channels]` tensor."
     type_attr: "T"
   }
   input_arg {
     name: "filter_sizes"
-    description: "An integer vector representing the tensor shape of `filter`,\nwhere `filter` is a 4-D\n`[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor."
     type: DT_INT32
   }
   input_arg {
     name: "out_backprop"
-    description: "4-D with shape  based on `data_format`.\nFor example, if `data_format` is \'NHWC\' then\nout_backprop shape is `[batch, out_height, out_width, out_channels]`.\nGradients w.r.t. the output of the convolution."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "4-D with shape\n`[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.\nthe `filter` input of the convolution."
     type_attr: "T"
   }
   attr {
@@ -7131,6 +6720,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -7139,12 +6729,10 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    description: "The stride of the sliding window for each dimension of the input\nof the convolution."
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -7158,7 +6746,6 @@ op {
     default_value {
       s: "NHWC"
     }
-    description: "Specify the data format of the input and output data. With the\ndefault format \"NHWC\", the data is stored in the order of:\n    [batch, height, width, channels].\nAlternatively, the format could be \"NCHW\", the data storage order of:\n    [batch, channels, height, width]."
     allowed_values {
       list {
         s: "NHWC"
@@ -7166,28 +6753,35 @@ op {
       }
     }
   }
-  summary: "Computes the gradients of depthwise convolution with respect to the filter."
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
   name: "DepthwiseConv2dNativeBackpropInput"
   input_arg {
     name: "input_sizes"
-    description: "An integer vector representing the shape of `input`, based\non `data_format`.  For example, if `data_format` is \'NHWC\' then\n `input` is a 4-D `[batch, height, width, channels]` tensor."
     type: DT_INT32
   }
   input_arg {
     name: "filter"
-    description: "4-D with shape\n`[filter_height, filter_width, in_channels, depthwise_multiplier]`."
     type_attr: "T"
   }
   input_arg {
     name: "out_backprop"
-    description: "4-D with shape  based on `data_format`.\nFor example, if `data_format` is \'NHWC\' then\nout_backprop shape is `[batch, out_height, out_width, out_channels]`.\nGradients w.r.t. the output of the convolution."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "4-D with shape according to `data_format`.  For example, if\n`data_format` is \'NHWC\', output shape is `[batch, in_height,\nin_width, in_channels]`.  Gradient w.r.t. the input of the\nconvolution."
     type_attr: "T"
   }
   attr {
@@ -7195,6 +6789,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -7203,12 +6798,10 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    description: "The stride of the sliding window for each dimension of the input\nof the convolution."
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -7222,7 +6815,6 @@ op {
     default_value {
       s: "NHWC"
     }
-    description: "Specify the data format of the input and output data. With the\ndefault format \"NHWC\", the data is stored in the order of:\n    [batch, height, width, channels].\nAlternatively, the format could be \"NCHW\", the data storage order of:\n    [batch, channels, height, width]."
     allowed_values {
       list {
         s: "NHWC"
@@ -7230,7 +6822,18 @@ op {
       }
     }
   }
-  summary: "Computes the gradients of depthwise convolution with respect to the input."
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
   name: "Dequantize"
@@ -7240,12 +6843,10 @@ op {
   }
   input_arg {
     name: "min_range"
-    description: "The minimum scalar value possibly produced for the input."
     type: DT_FLOAT
   }
   input_arg {
     name: "max_range"
-    description: "The maximum scalar value possibly produced for the input."
     type: DT_FLOAT
   }
   output_arg {
@@ -7259,9 +6860,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -7279,29 +6880,23 @@ op {
       }
     }
   }
-  summary: "Dequantize the \'input\' tensor into a float Tensor."
-  description: "[min_range, max_range] are scalar floats that specify the range for\nthe \'input\' data. The \'mode\' attribute controls exactly which calculations are\nused to convert the float values to their quantized equivalents.\n\nIn \'MIN_COMBINED\' mode, each value of the tensor will undergo the following:\n\n```\nif T == qint8, in[i] += (range(T) + 1)/ 2.0\nout[i] = min_range + (in[i]* (max_range - min_range) / range(T))\n```\nhere `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`\n\n*MIN_COMBINED Mode Example*\n\nIf the input comes from a QuantizedRelu6, the output type is\nquint8 (range of 0-255) but the possible range of QuantizedRelu6 is\n0-6.  The min_range and max_range values are therefore 0.0 and 6.0.\nDequantize on quint8 will take each value, cast to float, and multiply\nby 6 / 255.\nNote that if quantizedtype is qint8, the operation will additionally add\neach value by 128 prior to casting.\n\nIf the mode is \'MIN_FIRST\', then this approach is used:\n\n```c++\nnum_discrete_values = 1 << (# of bits in T)\nrange_adjust = num_discrete_values / (num_discrete_values - 1)\nrange = (range_max - range_min) * range_adjust\nrange_scale = range / num_discrete_values\nconst double offset_input = static_cast<double>(input) - lowest_quantized;\nresult = range_min + ((input - numeric_limits<T>::min()) * range_scale)\n```\n\n*SCALED mode Example*\n\n`SCALED` mode matches the quantization approach used in\n`QuantizeAndDequantize{V2|V3}`.\n\nIf the mode is `SCALED`, we do not use the full range of the output type,\nchoosing to elide the lowest possible value for symmetry (e.g., output range is\n-127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to\n0.\n\nWe first find the range of values in our tensor. The\nrange we use is always centered on 0, so we find m such that\n```c++\n  m = max(abs(input_min), abs(input_max))\n```\n\nOur input tensor range is then `[-m, m]`.\n\nNext, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.\nIf T is signed, this is\n```\n  num_bits = sizeof(T) * 8\n  [min_fixed, max_fixed] =\n      [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]\n```\n\nOtherwise, if T is unsigned, the fixed-point range is\n```\n  [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]\n```\n\nFrom this we compute our scaling factor, s:\n```c++\n  s = (2 * m) / (max_fixed - min_fixed)\n```\n\nNow we can dequantize the elements of our tensor:\n```c++\nresult = input * s\n```"
 }
 op {
   name: "DeserializeIterator"
   input_arg {
     name: "resource_handle"
-    description: "A handle to an iterator resource."
     type: DT_RESOURCE
   }
   input_arg {
     name: "serialized"
-    description: "A variant tensor storing the state of the iterator contained in the\nresource."
     type: DT_VARIANT
   }
-  summary: "Converts the given variant tensor to an iterator and stores it in the given resource."
   is_stateful: true
 }
 op {
   name: "DeserializeManySparse"
   input_arg {
     name: "serialized_sparse"
-    description: "2-D, The `N` serialized `SparseTensor` objects.\nMust have 3 columns."
     type: DT_STRING
   }
   output_arg {
@@ -7319,17 +6914,13 @@ op {
   attr {
     name: "dtype"
     type: "type"
-    description: "The `dtype` of the serialized `SparseTensor` objects."
   }
-  summary: "Deserialize and concatenate `SparseTensors` from a serialized minibatch."
-  description: "The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where\n`N` is the minibatch size and the rows correspond to packed outputs of\n`SerializeSparse`.  The ranks of the original `SparseTensor` objects\nmust all match.  When the final `SparseTensor` is created, it has rank one\nhigher than the ranks of the incoming `SparseTensor` objects\n(they have been concatenated along a new row dimension).\n\nThe output `SparseTensor` object\'s shape values for all dimensions but the\nfirst are the max across the input `SparseTensor` objects\' shape values\nfor the corresponding dimensions.  Its first shape value is `N`, the minibatch\nsize.\n\nThe input `SparseTensor` objects\' indices are assumed ordered in\nstandard lexicographic order.  If this is not the case, after this\nstep run `SparseReorder` to restore index ordering.\n\nFor example, if the serialized input is a `[2 x 3]` matrix representing two\noriginal `SparseTensor` objects:\n\n    index = [ 0]\n            [10]\n            [20]\n    values = [1, 2, 3]\n    shape = [50]\n\nand\n\n    index = [ 2]\n            [10]\n    values = [4, 5]\n    shape = [30]\n\nthen the final deserialized `SparseTensor` will be:\n\n    index = [0  0]\n            [0 10]\n            [0 20]\n            [1  2]\n            [1 10]\n    values = [1, 2, 3, 4, 5]\n    shape = [2 50]"
 }
 op {
   name: "DeserializeSparse"
   input_arg {
     name: "serialized_sparse"
-    description: "The serialized `SparseTensor` objects. The last dimension\nmust have 3 columns."
-    type: DT_STRING
+    type_attr: "Tserialized"
   }
   output_arg {
     name: "sparse_indices"
@@ -7346,16 +6937,26 @@ op {
   attr {
     name: "dtype"
     type: "type"
-    description: "The `dtype` of the serialized `SparseTensor` objects."
   }
-  summary: "Deserialize `SparseTensor` objects."
-}
-op {
-  name: "DestroyResourceOp"
-  input_arg {
-    name: "resource"
-    description: "handle to the resource to delete."
-    type: DT_RESOURCE
+  attr {
+    name: "Tserialized"
+    type: "type"
+    default_value {
+      type: DT_STRING
+    }
+    allowed_values {
+      list {
+        type: DT_STRING
+        type: DT_VARIANT
+      }
+    }
+  }
+}
+op {
+  name: "DestroyResourceOp"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
   }
   attr {
     name: "ignore_lookup_error"
@@ -7363,17 +6964,13 @@ op {
     default_value {
       b: true
     }
-    description: "whether to ignore the error when the resource\ndoesn\'t exist."
   }
-  summary: "Deletes the resource specified by the handle."
-  description: "All subsequent operations using the resource will result in a NotFound\nerror status."
   is_stateful: true
 }
 op {
   name: "DestroyTemporaryVariable"
   input_arg {
     name: "ref"
-    description: "A reference to the temporary variable tensor."
     type_attr: "T"
     is_ref: true
   }
@@ -7388,16 +6985,12 @@ op {
   attr {
     name: "var_name"
     type: "string"
-    description: "Name of the temporary variable, usually the name of the matching\n\'TemporaryVariable\' op."
   }
-  summary: "Destroys the temporary variable and returns its final value."
-  description: "Sets output to the value of the Tensor pointed to by \'ref\', then destroys\nthe temporary variable called \'var_name\'.\nAll other uses of \'ref\' *must* have executed before this op.\nThis is typically achieved by chaining the ref through each assign op, or by\nusing control dependencies.\n\nOutputs the final value of the tensor pointed to by \'ref\'."
 }
 op {
   name: "Diag"
   input_arg {
     name: "diagonal"
-    description: "Rank k tensor where k is at most 1."
     type_attr: "T"
   }
   output_arg {
@@ -7409,6 +7002,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -7418,19 +7012,15 @@ op {
       }
     }
   }
-  summary: "Returns a diagonal tensor with a given diagonal values."
-  description: "Given a `diagonal`, this operation returns a tensor with the `diagonal` and\neverything else padded with zeros. The diagonal is computed as follows:\n\nAssume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of\nrank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:\n\n`output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.\n\nFor example:\n\n```\n# \'diagonal\' is [1, 2, 3, 4]\ntf.diag(diagonal) ==> [[1, 0, 0, 0]\n                       [0, 2, 0, 0]\n                       [0, 0, 3, 0]\n                       [0, 0, 0, 4]]\n```"
 }
 op {
   name: "DiagPart"
   input_arg {
     name: "input"
-    description: "Rank k tensor where k is even and not zero."
     type_attr: "T"
   }
   output_arg {
     name: "diagonal"
-    description: "The extracted diagonal."
     type_attr: "T"
   }
   attr {
@@ -7438,6 +7028,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -7447,8 +7038,6 @@ op {
       }
     }
   }
-  summary: "Returns the diagonal part of the tensor."
-  description: "This operation returns a tensor with the `diagonal` part\nof the `input`. The `diagonal` part is computed as follows:\n\nAssume `input` has dimensions `[D1,..., Dk, D1,..., Dk]`, then the output is a\ntensor of rank `k` with dimensions `[D1,..., Dk]` where:\n\n`diagonal[i1,..., ik] = input[i1, ..., ik, i1,..., ik]`.\n\nFor example:\n\n```\n# \'input\' is [[1, 0, 0, 0]\n              [0, 2, 0, 0]\n              [0, 0, 3, 0]\n              [0, 0, 0, 4]]\n\ntf.diag_part(input) ==> [1, 2, 3, 4]\n```"
 }
 op {
   name: "Digamma"
@@ -7466,29 +7055,25 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Computes Psi, the derivative of Lgamma (the log of the absolute value of"
-  description: "`Gamma(x)`), element-wise."
 }
 op {
   name: "Dilation2D"
   input_arg {
     name: "input"
-    description: "4-D with shape `[batch, in_height, in_width, depth]`."
     type_attr: "T"
   }
   input_arg {
     name: "filter"
-    description: "3-D with shape `[filter_height, filter_width, depth]`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "4-D with shape `[batch, out_height, out_width, depth]`."
     type_attr: "T"
   }
   attr {
@@ -7499,10 +7084,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -7513,21 +7099,18 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    description: "The stride of the sliding window for each dimension of the input\ntensor. Must be: `[1, stride_height, stride_width, 1]`."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "rates"
     type: "list(int)"
-    description: "The input stride for atrous morphological dilation. Must be:\n`[1, rate_height, rate_width, 1]`."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -7535,29 +7118,23 @@ op {
       }
     }
   }
-  summary: "Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors."
-  description: "The `input` tensor has shape `[batch, in_height, in_width, depth]` and the\n`filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each\ninput channel is processed independently of the others with its own structuring\nfunction. The `output` tensor has shape\n`[batch, out_height, out_width, depth]`. The spatial dimensions of the output\ntensor depend on the `padding` algorithm. We currently only support the default\n\"NHWC\" `data_format`.\n\nIn detail, the grayscale morphological 2-D dilation is the max-sum correlation\n(for consistency with `conv2d`, we use unmirrored filters):\n\n    output[b, y, x, c] =\n       max_{dy, dx} input[b,\n                          strides[1] * y + rates[1] * dy,\n                          strides[2] * x + rates[2] * dx,\n                          c] +\n                    filter[dy, dx, c]\n\nMax-pooling is a special case when the filter has size equal to the pooling\nkernel size and contains all zeros.\n\nNote on duality: The dilation of `input` by the `filter` is equal to the\nnegation of the erosion of `-input` by the reflected `filter`."
 }
 op {
   name: "Dilation2DBackpropFilter"
   input_arg {
     name: "input"
-    description: "4-D with shape `[batch, in_height, in_width, depth]`."
     type_attr: "T"
   }
   input_arg {
     name: "filter"
-    description: "3-D with shape `[filter_height, filter_width, depth]`."
     type_attr: "T"
   }
   input_arg {
     name: "out_backprop"
-    description: "4-D with shape `[batch, out_height, out_width, depth]`."
     type_attr: "T"
   }
   output_arg {
     name: "filter_backprop"
-    description: "3-D with shape `[filter_height, filter_width, depth]`."
     type_attr: "T"
   }
   attr {
@@ -7568,10 +7145,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -7582,21 +7160,18 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    description: "1-D of length 4. The stride of the sliding window for each dimension of\nthe input tensor. Must be: `[1, stride_height, stride_width, 1]`."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "rates"
     type: "list(int)"
-    description: "1-D of length 4. The input stride for atrous morphological dilation.\nMust be: `[1, rate_height, rate_width, 1]`."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -7604,28 +7179,23 @@ op {
       }
     }
   }
-  summary: "Computes the gradient of morphological 2-D dilation with respect to the filter."
 }
 op {
   name: "Dilation2DBackpropInput"
   input_arg {
     name: "input"
-    description: "4-D with shape `[batch, in_height, in_width, depth]`."
     type_attr: "T"
   }
   input_arg {
     name: "filter"
-    description: "3-D with shape `[filter_height, filter_width, depth]`."
     type_attr: "T"
   }
   input_arg {
     name: "out_backprop"
-    description: "4-D with shape `[batch, out_height, out_width, depth]`."
     type_attr: "T"
   }
   output_arg {
     name: "in_backprop"
-    description: "4-D with shape `[batch, in_height, in_width, depth]`."
     type_attr: "T"
   }
   attr {
@@ -7636,10 +7206,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -7650,21 +7221,18 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    description: "1-D of length 4. The stride of the sliding window for each dimension of\nthe input tensor. Must be: `[1, stride_height, stride_width, 1]`."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "rates"
     type: "list(int)"
-    description: "1-D of length 4. The input stride for atrous morphological dilation.\nMust be: `[1, rate_height, rate_width, 1]`."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -7672,7 +7240,6 @@ op {
       }
     }
   }
-  summary: "Computes the gradient of morphological 2-D dilation with respect to the input."
 }
 op {
   name: "Div"
@@ -7694,6 +7261,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -7707,24 +7275,19 @@ op {
       }
     }
   }
-  summary: "Returns x / y element-wise."
-  description: "*NOTE*: `Div` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "DrawBoundingBoxes"
   input_arg {
     name: "images"
-    description: "4-D with shape `[batch, height, width, depth]`. A batch of images."
     type_attr: "T"
   }
   input_arg {
     name: "boxes"
-    description: "3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding\nboxes."
     type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    description: "4-D with the same shape as `images`. The batch of input images with\nbounding boxes drawn on the images."
     type_attr: "T"
   }
   attr {
@@ -7740,8 +7303,6 @@ op {
       }
     }
   }
-  summary: "Draw bounding boxes on a batch of images."
-  description: "Outputs a copy of `images` but draws on top of the pixels zero or more bounding\nboxes specified by the locations in `boxes`. The coordinates of the each\nbounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The\nbounding box coordinates are floats in `[0.0, 1.0]` relative to the width and\nheight of the underlying image.\n\nFor example, if an image is 100 x 200 pixels (height x width) and the bounding\nbox is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of\nthe bounding box will be `(40, 10)` to `(100, 50)` (in (x,y) coordinates).\n\nParts of the bounding box may fall outside the image."
 }
 op {
   name: "DynamicPartition"
@@ -7751,7 +7312,6 @@ op {
   }
   input_arg {
     name: "partitions"
-    description: "Any shape.  Indices in the range `[0, num_partitions)`."
     type: DT_INT32
   }
   output_arg {
@@ -7762,7 +7322,6 @@ op {
   attr {
     name: "num_partitions"
     type: "int"
-    description: "The number of partitions to output."
     has_minimum: true
     minimum: 1
   }
@@ -7770,8 +7329,6 @@ op {
     name: "T"
     type: "type"
   }
-  summary: "Partitions `data` into `num_partitions` tensors using indices from `partitions`."
-  description: "For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`\nbecomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`\nare placed in `outputs[i]` in lexicographic order of `js`, and the first\ndimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.\nIn detail,\n\n```python\n    outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]\n\n    outputs[i] = pack([data[js, ...] for js if partitions[js] == i])\n```\n\n`data.shape` must start with `partitions.shape`.\n\nFor example:\n\n```python\n    # Scalar partitions.\n    partitions = 1\n    num_partitions = 2\n    data = [10, 20]\n    outputs[0] = []  # Empty with shape [0, 2]\n    outputs[1] = [[10, 20]]\n\n    # Vector partitions.\n    partitions = [0, 0, 1, 1, 0]\n    num_partitions = 2\n    data = [10, 20, 30, 40, 50]\n    outputs[0] = [10, 20, 50]\n    outputs[1] = [30, 40]\n```\n\nSee `dynamic_stitch` for an example on how to merge partitions back.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/DynamicPartition.png\" alt>\n</div>"
 }
 op {
   name: "DynamicStitch"
@@ -7799,44 +7356,61 @@ op {
     name: "T"
     type: "type"
   }
-  summary: "Interleave the values from the `data` tensors into a single tensor."
-  description: "Builds a merged tensor such that\n\n```python\n    merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]\n```\n\nFor example, if each `indices[m]` is scalar or vector, we have\n\n```python\n    # Scalar indices:\n    merged[indices[m], ...] = data[m][...]\n\n    # Vector indices:\n    merged[indices[m][i], ...] = data[m][i, ...]\n```\n\nEach `data[i].shape` must start with the corresponding `indices[i].shape`,\nand the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we\nmust have `data[i].shape = indices[i].shape + constant`.  In terms of this\n`constant`, the output shape is\n\n    merged.shape = [max(indices)] + constant\n\nValues are merged in order, so if an index appears in both `indices[m][i]` and\n`indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the\nmerged result. If you do not need this guarantee, ParallelDynamicStitch might\nperform better on some devices.\n\nFor example:\n\n```python\n    indices[0] = 6\n    indices[1] = [4, 1]\n    indices[2] = [[5, 2], [0, 3]]\n    data[0] = [61, 62]\n    data[1] = [[41, 42], [11, 12]]\n    data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]\n    merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],\n              [51, 52], [61, 62]]\n```\n\nThis method can be used to merge partitions created by `dynamic_partition`\nas illustrated on the following example:\n\n```python\n    # Apply function (increments x_i) on elements for which a certain condition\n    # apply (x_i != -1 in this example).\n    x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])\n    condition_mask=tf.not_equal(x,tf.constant(-1.))\n    partitioned_data = tf.dynamic_partition(\n        x, tf.cast(condition_mask, tf.int32) , 2)\n    partitioned_data[1] = partitioned_data[1] + 1.0\n    condition_indices = tf.dynamic_partition(\n        tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)\n    x = tf.dynamic_stitch(condition_indices, partitioned_data)\n    # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain\n    # unchanged.\n```\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/DynamicStitch.png\" alt>\n</div>"
+}
+op {
+  name: "EagerPyFunc"
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "token"
+    type: "string"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  is_stateful: true
 }
 op {
   name: "EditDistance"
   input_arg {
     name: "hypothesis_indices"
-    description: "The indices of the hypothesis list SparseTensor.\nThis is an N x R int64 matrix."
     type: DT_INT64
   }
   input_arg {
     name: "hypothesis_values"
-    description: "The values of the hypothesis list SparseTensor.\nThis is an N-length vector."
     type_attr: "T"
   }
   input_arg {
     name: "hypothesis_shape"
-    description: "The shape of the hypothesis list SparseTensor.\nThis is an R-length vector."
     type: DT_INT64
   }
   input_arg {
     name: "truth_indices"
-    description: "The indices of the truth list SparseTensor.\nThis is an M x R int64 matrix."
     type: DT_INT64
   }
   input_arg {
     name: "truth_values"
-    description: "The values of the truth list SparseTensor.\nThis is an M-length vector."
     type_attr: "T"
   }
   input_arg {
     name: "truth_shape"
-    description: "truth indices, vector."
     type: DT_INT64
   }
   output_arg {
     name: "output"
-    description: "A dense float tensor with rank R - 1.\n\nFor the example input:\n\n    // hypothesis represents a 2x1 matrix with variable-length values:\n    //   (0,0) = [\"a\"]\n    //   (1,0) = [\"b\"]\n    hypothesis_indices = [[0, 0, 0],\n                          [1, 0, 0]]\n    hypothesis_values = [\"a\", \"b\"]\n    hypothesis_shape = [2, 1, 1]\n\n    // truth represents a 2x2 matrix with variable-length values:\n    //   (0,0) = []\n    //   (0,1) = [\"a\"]\n    //   (1,0) = [\"b\", \"c\"]\n    //   (1,1) = [\"a\"]\n    truth_indices = [[0, 1, 0],\n                     [1, 0, 0],\n                     [1, 0, 1],\n                     [1, 1, 0]]\n    truth_values = [\"a\", \"b\", \"c\", \"a\"]\n    truth_shape = [2, 2, 2]\n    normalize = true\n\nThe output will be:\n\n    // output is a 2x2 matrix with edit distances normalized by truth lengths.\n    output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis\n              [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis"
     type: DT_FLOAT
   }
   attr {
@@ -7845,14 +7419,11 @@ op {
     default_value {
       b: true
     }
-    description: "boolean (if true, edit distances are normalized by length of truth).\n\nThe output is:"
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Computes the (possibly normalized) Levenshtein Edit Distance."
-  description: "The inputs are variable-length sequences provided by SparseTensors\n  (hypothesis_indices, hypothesis_values, hypothesis_shape)\nand\n  (truth_indices, truth_values, truth_shape).\n\nThe inputs are:"
 }
 op {
   name: "Elu"
@@ -7870,29 +7441,25 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise."
-  description: "See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)\n](http://arxiv.org/abs/1511.07289)"
 }
 op {
   name: "EluGrad"
   input_arg {
     name: "gradients"
-    description: "The backpropagated gradients to the corresponding Elu operation."
     type_attr: "T"
   }
   input_arg {
     name: "outputs"
-    description: "The outputs of the corresponding Elu operation."
     type_attr: "T"
   }
   output_arg {
     name: "backprops"
-    description: "The gradients: `gradients * (outputs + 1)` if outputs < 0,\n`gradients` otherwise."
     type_attr: "T"
   }
   attr {
@@ -7901,23 +7468,46 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Computes gradients for the exponential linear (Elu) operation."
+}
+op {
+  name: "EmptyTensorList"
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
   name: "EncodeBase64"
   input_arg {
     name: "input"
-    description: "Strings to be encoded."
     type: DT_STRING
   }
   output_arg {
     name: "output"
-    description: "Input strings encoded in base64."
     type: DT_STRING
   }
   attr {
@@ -7926,21 +7516,16 @@ op {
     default_value {
       b: false
     }
-    description: "Bool whether padding is applied at the ends."
   }
-  summary: "Encode strings into web-safe base64 format."
-  description: "Refer to the following article for more information on base64 format:\nen.wikipedia.org/wiki/Base64. Base64 strings may have padding with \'=\' at the\nend so that the encoded has length multiple of 4. See Padding section of the\nlink above.\n\nWeb-safe means that the encoder uses - and _ instead of + and /."
 }
 op {
   name: "EncodeJpeg"
   input_arg {
     name: "image"
-    description: "3-D with shape `[height, width, channels]`."
     type: DT_UINT8
   }
   output_arg {
     name: "contents"
-    description: "0-D. JPEG-encoded image."
     type: DT_STRING
   }
   attr {
@@ -7949,7 +7534,6 @@ op {
     default_value {
       s: ""
     }
-    description: "Per pixel image format."
     allowed_values {
       list {
         s: ""
@@ -7964,7 +7548,6 @@ op {
     default_value {
       i: 95
     }
-    description: "Quality of the compression from 0 to 100 (higher is better and slower)."
   }
   attr {
     name: "progressive"
@@ -7972,7 +7555,6 @@ op {
     default_value {
       b: false
     }
-    description: "If True, create a JPEG that loads progressively (coarse to fine)."
   }
   attr {
     name: "optimize_size"
@@ -7980,7 +7562,6 @@ op {
     default_value {
       b: false
     }
-    description: "If True, spend CPU/RAM to reduce size with no quality change."
   }
   attr {
     name: "chroma_downsampling"
@@ -7988,7 +7569,6 @@ op {
     default_value {
       b: true
     }
-    description: "See http://en.wikipedia.org/wiki/Chroma_subsampling."
   }
   attr {
     name: "density_unit"
@@ -7996,7 +7576,6 @@ op {
     default_value {
       s: "in"
     }
-    description: "Unit used to specify `x_density` and `y_density`:\npixels per inch (`\'in\'`) or centimeter (`\'cm\'`)."
     allowed_values {
       list {
         s: "in"
@@ -8010,7 +7589,6 @@ op {
     default_value {
       i: 300
     }
-    description: "Horizontal pixels per density unit."
   }
   attr {
     name: "y_density"
@@ -8018,7 +7596,6 @@ op {
     default_value {
       i: 300
     }
-    description: "Vertical pixels per density unit."
   }
   attr {
     name: "xmp_metadata"
@@ -8026,21 +7603,16 @@ op {
     default_value {
       s: ""
     }
-    description: "If not empty, embed this XMP metadata in the image header."
   }
-  summary: "JPEG-encode an image."
-  description: "`image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.\n\nThe attr `format` can be used to override the color format of the encoded\noutput.  Values can be:\n\n*   `\'\'`: Use a default format based on the number of channels in the image.\n*   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension\n    of `image` must be 1.\n*   `rgb`: Output an RGB JPEG image. The `channels` dimension\n    of `image` must be 3.\n\nIf `format` is not specified or is the empty string, a default format is picked\nin function of the number of channels in `image`:\n\n*   1: Output a grayscale image.\n*   3: Output an RGB image."
 }
 op {
   name: "EncodePng"
   input_arg {
     name: "image"
-    description: "3-D with shape `[height, width, channels]`."
     type_attr: "T"
   }
   output_arg {
     name: "contents"
-    description: "0-D. PNG-encoded image."
     type: DT_STRING
   }
   attr {
@@ -8049,7 +7621,6 @@ op {
     default_value {
       i: -1
     }
-    description: "Compression level."
   }
   attr {
     name: "T"
@@ -8064,39 +7635,48 @@ op {
       }
     }
   }
-  summary: "PNG-encode an image."
-  description: "`image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`\nwhere `channels` is:\n\n*   1: for grayscale.\n*   2: for grayscale + alpha.\n*   3: for RGB.\n*   4: for RGBA.\n\nThe ZLIB compression level, `compression`, can be -1 for the PNG-encoder\ndefault or a value from 0 to 9.  9 is the highest compression level, generating\nthe smallest output, but is slower."
 }
 op {
   name: "EncodeWav"
   input_arg {
     name: "audio"
-    description: "2-D with shape `[length, channels]`."
     type: DT_FLOAT
   }
   input_arg {
     name: "sample_rate"
-    description: "Scalar containing the sample frequency."
     type: DT_INT32
   }
   output_arg {
     name: "contents"
-    description: "0-D. WAV-encoded file contents."
     type: DT_STRING
   }
-  summary: "Encode audio data using the WAV file format."
-  description: "This operation will generate a string suitable to be saved out to create a .wav\naudio file. It will be encoded in the 16-bit PCM format. It takes in float\nvalues in the range -1.0f to 1.0f, and any outside that value will be clamped to\nthat range.\n\n`audio` is a 2-D float Tensor of shape `[length, channels]`.\n`sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100)."
+}
+op {
+  name: "EnqueueInQueueDataset"
+  input_arg {
+    name: "queue"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "components"
+    type_list_attr: "Tcomponents"
+  }
+  attr {
+    name: "Tcomponents"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
 }
 op {
   name: "Enter"
   input_arg {
     name: "data"
-    description: "The tensor to be made available to the child frame."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "The same tensor as `data`."
     type_attr: "T"
   }
   attr {
@@ -8106,7 +7686,6 @@ op {
   attr {
     name: "frame_name"
     type: "string"
-    description: "The name of the child frame."
   }
   attr {
     name: "is_constant"
@@ -8114,7 +7693,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true, the output is constant within the child frame."
   }
   attr {
     name: "parallel_iterations"
@@ -8122,10 +7700,7 @@ op {
     default_value {
       i: 10
     }
-    description: "The number of iterations allowed to run in parallel."
   }
-  summary: "Creates or finds a child frame, and makes `data` available to the child frame."
-  description: "This op is used together with `Exit` to create loops in the graph.\nThe unique `frame_name` is used by the `Executor` to identify frames. If\n`is_constant` is true, `output` is a constant in the child frame; otherwise\nit may be changed in the child frame. At most `parallel_iterations` iterations\nare run in parallel in the child frame."
 }
 op {
   name: "Equal"
@@ -8147,6 +7722,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -8164,8 +7740,6 @@ op {
       }
     }
   }
-  summary: "Returns the truth value of (x == y) element-wise."
-  description: "*NOTE*: `Equal` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
   is_commutative: true
 }
 op {
@@ -8184,12 +7758,12 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Computes the Gauss error function of `x` element-wise."
 }
 op {
   name: "Erfc"
@@ -8207,31 +7781,62 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Computes the complementary error function of `x` element-wise."
+}
+op {
+  name: "ExecuteInCriticalSection"
+  input_arg {
+    name: "critical_section"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "outputs"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  is_stateful: true
 }
 op {
   name: "Exit"
   input_arg {
     name: "data"
-    description: "The tensor to be made available to the parent frame."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "The same tensor as `data`."
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Exits the current frame to its parent frame."
-  description: "Exit makes its input `data` available to the parent frame."
 }
 op {
   name: "Exp"
@@ -8249,6 +7854,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -8256,7 +7862,6 @@ op {
       }
     }
   }
-  summary: "Computes exponential of x element-wise.  \\\\(y = e^x\\\\)."
 }
 op {
   name: "ExpandDims"
@@ -8266,12 +7871,10 @@ op {
   }
   input_arg {
     name: "dim"
-    description: "0-D (scalar). Specifies the dimension index at which to\nexpand the shape of `input`. Must be in the range\n`[-rank(input) - 1, rank(input)]`."
     type_attr: "Tdim"
   }
   output_arg {
     name: "output"
-    description: "Contains the same data as `input`, but its shape has an additional\ndimension of size 1 added."
     type_attr: "T"
   }
   attr {
@@ -8291,8 +7894,6 @@ op {
       }
     }
   }
-  summary: "Inserts a dimension of 1 into a tensor\'s shape."
-  description: "Given a tensor `input`, this operation inserts a dimension of 1 at the\ndimension index `dim` of `input`\'s shape. The dimension index `dim` starts at\nzero; if you specify a negative number for `dim` it is counted backward from\nthe end.\n\nThis operation is useful if you want to add a batch dimension to a single\nelement. For example, if you have a single image of shape `[height, width,\nchannels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,\nwhich will make the shape `[1, height, width, channels]`.\n\nOther examples:\n\n```\n# \'t\' is a tensor of shape [2]\nshape(expand_dims(t, 0)) ==> [1, 2]\nshape(expand_dims(t, 1)) ==> [2, 1]\nshape(expand_dims(t, -1)) ==> [2, 1]\n\n# \'t2\' is a tensor of shape [2, 3, 5]\nshape(expand_dims(t2, 0)) ==> [1, 2, 3, 5]\nshape(expand_dims(t2, 2)) ==> [2, 3, 1, 5]\nshape(expand_dims(t2, 3)) ==> [2, 3, 5, 1]\n```\n\nThis operation requires that:\n\n`-1-input.dims() <= dim <= input.dims()`\n\nThis operation is related to `squeeze()`, which removes dimensions of\nsize 1."
 }
 op {
   name: "Expm1"
@@ -8310,6 +7911,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -8317,29 +7919,23 @@ op {
       }
     }
   }
-  summary: "Computes exponential of x - 1 element-wise."
-  description: "I.e., \\\\(y = (\\exp x) - 1\\\\)."
 }
 op {
   name: "ExtractGlimpse"
   input_arg {
     name: "input"
-    description: "A 4-D float tensor of shape `[batch_size, height, width, channels]`."
     type: DT_FLOAT
   }
   input_arg {
     name: "size"
-    description: "A 1-D tensor of 2 elements containing the size of the glimpses\nto extract.  The glimpse height must be specified first, following\nby the glimpse width."
     type: DT_INT32
   }
   input_arg {
     name: "offsets"
-    description: "A 2-D integer tensor of shape `[batch_size, 2]` containing\nthe y, x locations of the center of each window."
     type: DT_FLOAT
   }
   output_arg {
     name: "glimpse"
-    description: "A tensor representing the glimpses `[batch_size,\nglimpse_height, glimpse_width, channels]`."
     type: DT_FLOAT
   }
   attr {
@@ -8348,7 +7944,6 @@ op {
     default_value {
       b: true
     }
-    description: "indicates if the offset coordinates are centered relative to\nthe image, in which case the (0, 0) offset is relative to the center\nof the input images. If false, the (0,0) offset corresponds to the\nupper left corner of the input images."
   }
   attr {
     name: "normalized"
@@ -8356,7 +7951,6 @@ op {
     default_value {
       b: true
     }
-    description: "indicates if the offset coordinates are normalized."
   }
   attr {
     name: "uniform_noise"
@@ -8364,41 +7958,33 @@ op {
     default_value {
       b: true
     }
-    description: "indicates if the noise should be generated using a\nuniform distribution or a Gaussian distribution."
   }
-  summary: "Extracts a glimpse from the input tensor."
-  description: "Returns a set of windows called glimpses extracted at location\n`offsets` from the input tensor. If the windows only partially\noverlaps the inputs, the non overlapping areas will be filled with\nrandom noise.\n\nThe result is a 4-D tensor of shape `[batch_size, glimpse_height,\nglimpse_width, channels]`. The channels and batch dimensions are the\nsame as that of the input tensor. The height and width of the output\nwindows are specified in the `size` parameter.\n\nThe argument `normalized` and `centered` controls how the windows are built:\n\n* If the coordinates are normalized but not centered, 0.0 and 1.0\n  correspond to the minimum and maximum of each height and width\n  dimension.\n* If the coordinates are both normalized and centered, they range from\n  -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper\n  left corner, the lower right corner is located at (1.0, 1.0) and the\n  center is at (0, 0).\n* If the coordinates are not normalized they are interpreted as\n  numbers of pixels."
 }
 op {
   name: "ExtractImagePatches"
   input_arg {
     name: "images"
-    description: "4-D Tensor with shape `[batch, in_rows, in_cols, depth]`."
     type_attr: "T"
   }
   output_arg {
     name: "patches"
-    description: "4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows *\nksize_cols * depth]` containing image patches with size\n`ksize_rows x ksize_cols x depth` vectorized in the \"depth\" dimension. Note\n`out_rows` and `out_cols` are the dimensions of the output patches."
     type_attr: "T"
   }
   attr {
     name: "ksizes"
     type: "list(int)"
-    description: "The size of the sliding window for each dimension of `images`."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "strides"
     type: "list(int)"
-    description: "1-D of length 4. How far the centers of two consecutive patches are in\nthe images. Must be: `[1, stride_rows, stride_cols, 1]`."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "rates"
     type: "list(int)"
-    description: "1-D of length 4. Must be: `[1, rate_rows, rate_cols, 1]`. This is the\ninput stride, specifying how far two consecutive patch samples are in the\ninput. Equivalent to extracting patches with\n`patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by\nsubsampling them spatially by a factor of `rates`. This is equivalent to\n`rate` in dilated (a.k.a. Atrous) convolutions."
     has_minimum: true
     minimum: 4
   }
@@ -8410,10 +7996,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -8424,7 +8011,6 @@ op {
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use.\n\nWe specify the size-related attributes as:\n\n```python\n      ksizes = [1, ksize_rows, ksize_cols, 1]\n      strides = [1, strides_rows, strides_cols, 1]\n      rates = [1, rates_rows, rates_cols, 1]\n```"
     allowed_values {
       list {
         s: "SAME"
@@ -8432,18 +8018,15 @@ op {
       }
     }
   }
-  summary: "Extract `patches` from `images` and put them in the \"depth\" output dimension."
 }
 op {
   name: "ExtractJpegShape"
   input_arg {
     name: "contents"
-    description: "0-D. The JPEG-encoded image."
     type: DT_STRING
   }
   output_arg {
     name: "image_shape"
-    description: "1-D. The image shape with format [height, width, channels]."
     type_attr: "output_type"
   }
   attr {
@@ -8452,7 +8035,6 @@ op {
     default_value {
       type: DT_INT32
     }
-    description: "(Optional) The output type of the operation (int32 or int64).\nDefaults to int32."
     allowed_values {
       list {
         type: DT_INT32
@@ -8460,66 +8042,50 @@ op {
       }
     }
   }
-  summary: "Extract the shape information of a JPEG-encoded image."
-  description: "This op only parses the image header, so it is much faster than DecodeJpeg."
 }
 op {
   name: "FFT"
   input_arg {
     name: "input"
-    description: "A complex64 tensor."
     type: DT_COMPLEX64
   }
   output_arg {
     name: "output"
-    description: "A complex64 tensor of the same shape as `input`. The inner-most\n  dimension of `input` is replaced with its 1D Fourier transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.fft\n@end_compatibility"
     type: DT_COMPLEX64
   }
-  summary: "Fast Fourier transform."
-  description: "Computes the 1-dimensional discrete Fourier transform over the inner-most\ndimension of `input`."
 }
 op {
   name: "FFT2D"
   input_arg {
     name: "input"
-    description: "A complex64 tensor."
     type: DT_COMPLEX64
   }
   output_arg {
     name: "output"
-    description: "A complex64 tensor of the same shape as `input`. The inner-most 2\n  dimensions of `input` are replaced with their 2D Fourier transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.fft2\n@end_compatibility"
     type: DT_COMPLEX64
   }
-  summary: "2D fast Fourier transform."
-  description: "Computes the 2-dimensional discrete Fourier transform over the inner-most\n2 dimensions of `input`."
 }
 op {
   name: "FFT3D"
   input_arg {
     name: "input"
-    description: "A complex64 tensor."
     type: DT_COMPLEX64
   }
   output_arg {
     name: "output"
-    description: "A complex64 tensor of the same shape as `input`. The inner-most 3\n  dimensions of `input` are replaced with their 3D Fourier transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.fftn with 3 dimensions.\n@end_compatibility"
     type: DT_COMPLEX64
   }
-  summary: "3D fast Fourier transform."
-  description: "Computes the 3-dimensional discrete Fourier transform over the inner-most 3\ndimensions of `input`."
 }
 op {
   name: "FIFOQueue"
   output_arg {
     name: "handle"
-    description: "The handle to the queue."
     type: DT_STRING
     is_ref: true
   }
   attr {
     name: "component_types"
     type: "list(type)"
-    description: "The type of each component in a value."
     has_minimum: true
     minimum: 1
   }
@@ -8530,7 +8096,6 @@ op {
       list {
       }
     }
-    description: "The shape of each component in a value. The length of this attr must\nbe either 0 or the same as the length of component_types. If the length of\nthis attr is 0, the shapes of queue elements are not constrained, and\nonly one element may be dequeued at a time."
     has_minimum: true
   }
   attr {
@@ -8539,7 +8104,6 @@ op {
     default_value {
       i: -1
     }
-    description: "The upper bound on the number of elements in this queue.\nNegative numbers mean no limit."
   }
   attr {
     name: "container"
@@ -8547,7 +8111,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this queue is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -8555,22 +8118,18 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this queue will be shared under the given name\nacross multiple sessions."
   }
-  summary: "A queue that produces elements in first-in first-out order."
   is_stateful: true
 }
 op {
   name: "FIFOQueueV2"
   output_arg {
     name: "handle"
-    description: "The handle to the queue."
     type: DT_RESOURCE
   }
   attr {
     name: "component_types"
     type: "list(type)"
-    description: "The type of each component in a value."
     has_minimum: true
     minimum: 1
   }
@@ -8581,7 +8140,6 @@ op {
       list {
       }
     }
-    description: "The shape of each component in a value. The length of this attr must\nbe either 0 or the same as the length of component_types. If the length of\nthis attr is 0, the shapes of queue elements are not constrained, and\nonly one element may be dequeued at a time."
     has_minimum: true
   }
   attr {
@@ -8590,7 +8148,6 @@ op {
     default_value {
       i: -1
     }
-    description: "The upper bound on the number of elements in this queue.\nNegative numbers mean no limit."
   }
   attr {
     name: "container"
@@ -8598,7 +8155,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this queue is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -8606,9 +8162,7 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this queue will be shared under the given name\nacross multiple sessions."
   }
-  summary: "A queue that produces elements in first-in first-out order."
   is_stateful: true
 }
 op {
@@ -8617,7 +8171,6 @@ op {
     name: "fact"
     type: DT_STRING
   }
-  summary: "Output a fact about factorials."
 }
 op {
   name: "FakeQuantWithMinMaxArgs"
@@ -8657,24 +8210,19 @@ op {
       b: false
     }
   }
-  summary: "Fake-quantize the \'inputs\' tensor, type float to \'outputs\' tensor of same type."
-  description: "Attributes `[min; max]` define the clamping range for the `inputs` data.\n`inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`\nwhen `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and\nthen de-quantized and output as floats in `[min; max]` interval.\n`num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.\n\nQuantization is called fake since the output is still in floating point."
 }
 op {
   name: "FakeQuantWithMinMaxArgsGradient"
   input_arg {
     name: "gradients"
-    description: "Backpropagated gradients above the FakeQuantWithMinMaxArgs operation."
     type: DT_FLOAT
   }
   input_arg {
     name: "inputs"
-    description: "Values passed as inputs to the FakeQuantWithMinMaxArgs operation."
     type: DT_FLOAT
   }
   output_arg {
     name: "backprops"
-    description: "Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:\n`gradients * (inputs >= min && inputs <= max)`."
     type: DT_FLOAT
   }
   attr {
@@ -8705,7 +8253,6 @@ op {
       b: false
     }
   }
-  summary: "Compute gradients for a FakeQuantWithMinMaxArgs operation."
 }
 op {
   name: "FakeQuantWithMinMaxVars"
@@ -8739,19 +8286,15 @@ op {
       b: false
     }
   }
-  summary: "Fake-quantize the \'inputs\' tensor of type float via global float scalars `min`"
-  description: "and `max` to \'outputs\' tensor of same shape as `inputs`.\n\n`[min; max]` define the clamping range for the `inputs` data.\n`inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`\nwhen `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and\nthen de-quantized and output as floats in `[min; max]` interval.\n`num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.\n\nThis operation has a gradient and thus allows for training `min` and `max`\nvalues."
 }
 op {
   name: "FakeQuantWithMinMaxVarsGradient"
   input_arg {
     name: "gradients"
-    description: "Backpropagated gradients above the FakeQuantWithMinMaxVars operation."
     type: DT_FLOAT
   }
   input_arg {
     name: "inputs"
-    description: "Values passed as inputs to the FakeQuantWithMinMaxVars operation.\nmin, max: Quantization interval, scalar floats."
     type: DT_FLOAT
   }
   input_arg {
@@ -8764,17 +8307,14 @@ op {
   }
   output_arg {
     name: "backprops_wrt_input"
-    description: "Backpropagated gradients w.r.t. inputs:\n`gradients * (inputs >= min && inputs <= max)`."
     type: DT_FLOAT
   }
   output_arg {
     name: "backprop_wrt_min"
-    description: "Backpropagated gradients w.r.t. min parameter:\n`sum(gradients * (inputs < min))`."
     type: DT_FLOAT
   }
   output_arg {
     name: "backprop_wrt_max"
-    description: "Backpropagated gradients w.r.t. max parameter:\n`sum(gradients * (inputs > max))`."
     type: DT_FLOAT
   }
   attr {
@@ -8783,7 +8323,6 @@ op {
     default_value {
       i: 8
     }
-    description: "The bitwidth of the quantization; between 2 and 8, inclusive."
   }
   attr {
     name: "narrow_range"
@@ -8791,9 +8330,7 @@ op {
     default_value {
       b: false
     }
-    description: "Whether to quantize into 2^num_bits - 1 distinct values."
   }
-  summary: "Compute gradients for a FakeQuantWithMinMaxVars operation."
 }
 op {
   name: "FakeQuantWithMinMaxVarsPerChannel"
@@ -8827,19 +8364,15 @@ op {
       b: false
     }
   }
-  summary: "Fake-quantize the \'inputs\' tensor of type float and one of the shapes: `[d]`,"
-  description: "`[b, d]` `[b, h, w, d]` via per-channel floats `min` and `max` of shape `[d]`\nto \'outputs\' tensor of same shape as `inputs`.\n\n`[min; max]` define the clamping range for the `inputs` data.\n`inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`\nwhen `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and\nthen de-quantized and output as floats in `[min; max]` interval.\n`num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.\n\nThis operation has a gradient and thus allows for training `min` and `max`\nvalues."
 }
 op {
   name: "FakeQuantWithMinMaxVarsPerChannelGradient"
   input_arg {
     name: "gradients"
-    description: "Backpropagated gradients above the FakeQuantWithMinMaxVars operation,\nshape one of: `[d]`, `[b, d]`,  `[b, h, w, d]`."
     type: DT_FLOAT
   }
   input_arg {
     name: "inputs"
-    description: "Values passed as inputs to the FakeQuantWithMinMaxVars operation, shape\n  same as `gradients`.\nmin, max: Quantization interval, floats of shape `[d]`."
     type: DT_FLOAT
   }
   input_arg {
@@ -8852,17 +8385,14 @@ op {
   }
   output_arg {
     name: "backprops_wrt_input"
-    description: "Backpropagated gradients w.r.t. inputs, shape same as\n`inputs`:\n  `gradients * (inputs >= min && inputs <= max)`."
     type: DT_FLOAT
   }
   output_arg {
     name: "backprop_wrt_min"
-    description: "Backpropagated gradients w.r.t. min parameter, shape `[d]`:\n`sum_per_d(gradients * (inputs < min))`."
     type: DT_FLOAT
   }
   output_arg {
     name: "backprop_wrt_max"
-    description: "Backpropagated gradients w.r.t. max parameter, shape `[d]`:\n`sum_per_d(gradients * (inputs > max))`."
     type: DT_FLOAT
   }
   attr {
@@ -8871,7 +8401,6 @@ op {
     default_value {
       i: 8
     }
-    description: "The bitwidth of the quantization; between 2 and 8, inclusive."
   }
   attr {
     name: "narrow_range"
@@ -8879,9 +8408,7 @@ op {
     default_value {
       b: false
     }
-    description: "Whether to quantize into 2^num_bits - 1 distinct values."
   }
-  summary: "Compute gradients for a FakeQuantWithMinMaxVarsPerChannel operation."
 }
 op {
   name: "FakeQueue"
@@ -8894,19 +8421,16 @@ op {
     type: DT_STRING
     is_ref: true
   }
-  summary: "Deprecated. Do not use."
   is_stateful: true
 }
 op {
   name: "Fill"
   input_arg {
     name: "dims"
-    description: "1-D. Represents the shape of the output tensor."
-    type: DT_INT32
+    type_attr: "index_type"
   }
   input_arg {
     name: "value"
-    description: "0-D (scalar). Value to fill the returned tensor.\n\n@compatibility(numpy)\nEquivalent to np.full\n@end_compatibility"
     type_attr: "T"
   }
   output_arg {
@@ -8917,8 +8441,19 @@ op {
     name: "T"
     type: "type"
   }
-  summary: "Creates a tensor filled with a scalar value."
-  description: "This operation creates a tensor of shape `dims` and fills it with `value`.\n\nFor example:\n\n```\n# Output tensor has shape [2, 3].\nfill([2, 3], 9) ==> [[9, 9, 9]\n                     [9, 9, 9]]\n```"
+  attr {
+    name: "index_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
   name: "FilterDataset"
@@ -8928,7 +8463,6 @@ op {
   }
   input_arg {
     name: "other_arguments"
-    description: "A list of tensors, typically values that were captured when\nbuilding a closure for `predicate`."
     type_list_attr: "Targuments"
   }
   output_arg {
@@ -8938,7 +8472,6 @@ op {
   attr {
     name: "predicate"
     type: "func"
-    description: "A function returning a scalar boolean."
   }
   attr {
     name: "Targuments"
@@ -8957,48 +8490,39 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset containing elements of `input_dataset` matching `predicate`."
-  description: "The `predicate` function must return a scalar boolean and accept the\nfollowing arguments:\n\n* One tensor for each component of an element of `input_dataset`.\n* One tensor for each value in `other_arguments`."
 }
 op {
   name: "FixedLengthRecordDataset"
   input_arg {
     name: "filenames"
-    description: "A scalar or a vector containing the name(s) of the file(s) to be\nread."
     type: DT_STRING
   }
   input_arg {
     name: "header_bytes"
-    description: "A scalar representing the number of bytes to skip at the\nbeginning of a file."
     type: DT_INT64
   }
   input_arg {
     name: "record_bytes"
-    description: "A scalar representing the number of bytes in each record."
     type: DT_INT64
   }
   input_arg {
     name: "footer_bytes"
-    description: "A scalar representing the number of bytes to skip at the end\nof a file."
     type: DT_INT64
   }
   input_arg {
     name: "buffer_size"
-    description: "A scalar representing the number of bytes to buffer. Must be > 0."
     type: DT_INT64
   }
   output_arg {
     name: "handle"
     type: DT_VARIANT
   }
-  summary: "Creates a dataset that emits the records from one or more binary files."
   is_stateful: true
 }
 op {
   name: "FixedLengthRecordReader"
   output_arg {
     name: "reader_handle"
-    description: "The handle to reference the Reader."
     type: DT_STRING
     is_ref: true
   }
@@ -9008,12 +8532,10 @@ op {
     default_value {
       i: 0
     }
-    description: "Number of bytes in the header, defaults to 0."
   }
   attr {
     name: "record_bytes"
     type: "int"
-    description: "Number of bytes in the record."
   }
   attr {
     name: "footer_bytes"
@@ -9021,7 +8543,6 @@ op {
     default_value {
       i: 0
     }
-    description: "Number of bytes in the footer, defaults to 0."
   }
   attr {
     name: "hop_bytes"
@@ -9029,7 +8550,6 @@ op {
     default_value {
       i: 0
     }
-    description: "Number of bytes to hop before each read. Default of 0 means using\nrecord_bytes."
   }
   attr {
     name: "container"
@@ -9037,7 +8557,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this reader is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -9045,16 +8564,17 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this reader is named in the given bucket\nwith this shared_name. Otherwise, the node name is used instead."
   }
-  summary: "A Reader that outputs fixed-length records from a file."
+  deprecation {
+    version: 26
+    explanation: "Use FixedLengthRecordReaderV2"
+  }
   is_stateful: true
 }
 op {
   name: "FixedLengthRecordReaderV2"
   output_arg {
     name: "reader_handle"
-    description: "The handle to reference the Reader."
     type: DT_RESOURCE
   }
   attr {
@@ -9063,12 +8583,10 @@ op {
     default_value {
       i: 0
     }
-    description: "Number of bytes in the header, defaults to 0."
   }
   attr {
     name: "record_bytes"
     type: "int"
-    description: "Number of bytes in the record."
   }
   attr {
     name: "footer_bytes"
@@ -9076,7 +8594,6 @@ op {
     default_value {
       i: 0
     }
-    description: "Number of bytes in the footer, defaults to 0."
   }
   attr {
     name: "hop_bytes"
@@ -9084,7 +8601,6 @@ op {
     default_value {
       i: 0
     }
-    description: "Number of bytes to hop before each read. Default of 0 means using\nrecord_bytes."
   }
   attr {
     name: "container"
@@ -9092,7 +8608,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this reader is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -9100,7 +8615,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this reader is named in the given bucket\nwith this shared_name. Otherwise, the node name is used instead."
   }
   attr {
     name: "encoding"
@@ -9108,56 +8622,46 @@ op {
     default_value {
       s: ""
     }
-    description: "The type of encoding for the file. Currently ZLIB and GZIP\nare supported. Defaults to none."
   }
-  summary: "A Reader that outputs fixed-length records from a file."
   is_stateful: true
 }
 op {
   name: "FixedUnigramCandidateSampler"
   input_arg {
     name: "true_classes"
-    description: "A batch_size * num_true matrix, in which each row contains the\nIDs of the num_true target_classes in the corresponding original label."
     type: DT_INT64
   }
   output_arg {
     name: "sampled_candidates"
-    description: "A vector of length num_sampled, in which each element is\nthe ID of a sampled candidate."
     type: DT_INT64
   }
   output_arg {
     name: "true_expected_count"
-    description: "A batch_size * num_true matrix, representing\nthe number of times each candidate is expected to occur in a batch\nof sampled candidates. If unique=true, then this is a probability."
     type: DT_FLOAT
   }
   output_arg {
     name: "sampled_expected_count"
-    description: "A vector of length num_sampled, for each sampled\ncandidate representing the number of times the candidate is expected\nto occur in a batch of sampled candidates.  If unique=true, then this is a\nprobability."
     type: DT_FLOAT
   }
   attr {
     name: "num_true"
     type: "int"
-    description: "Number of true labels per context."
     has_minimum: true
     minimum: 1
   }
   attr {
     name: "num_sampled"
     type: "int"
-    description: "Number of candidates to randomly sample."
     has_minimum: true
     minimum: 1
   }
   attr {
     name: "unique"
     type: "bool"
-    description: "If unique is true, we sample with rejection, so that all sampled\ncandidates in a batch are unique. This requires some approximation to\nestimate the post-rejection sampling probabilities."
   }
   attr {
     name: "range_max"
     type: "int"
-    description: "The sampler will sample integers from the interval [0, range_max)."
     has_minimum: true
     minimum: 1
   }
@@ -9167,7 +8671,6 @@ op {
     default_value {
       s: ""
     }
-    description: "Each valid line in this file (which should have a CSV-like format)\ncorresponds to a valid word ID. IDs are in sequential order, starting from\nnum_reserved_ids. The last entry in each line is expected to be a value\ncorresponding to the count or relative probability. Exactly one of vocab_file\nand unigrams needs to be passed to this op."
   }
   attr {
     name: "distortion"
@@ -9175,7 +8678,6 @@ op {
     default_value {
       f: 1
     }
-    description: "The distortion is used to skew the unigram probability distribution.\nEach weight is first raised to the distortion\'s power before adding to the\ninternal unigram distribution. As a result, distortion = 1.0 gives regular\nunigram sampling (as defined by the vocab file), and distortion = 0.0 gives\na uniform distribution."
   }
   attr {
     name: "num_reserved_ids"
@@ -9183,7 +8685,6 @@ op {
     default_value {
       i: 0
     }
-    description: "Optionally some reserved IDs can be added in the range [0,\n..., num_reserved_ids) by the users. One use case is that a special unknown\nword token is used as ID 0. These IDs will have a sampling probability of 0."
   }
   attr {
     name: "num_shards"
@@ -9191,7 +8692,6 @@ op {
     default_value {
       i: 1
     }
-    description: "A sampler can be used to sample from a subset of the original range\nin order to speed up the whole computation through parallelism. This parameter\n(together with \'shard\') indicates the number of partitions that are being\nused in the overall computation."
     has_minimum: true
     minimum: 1
   }
@@ -9201,7 +8701,6 @@ op {
     default_value {
       i: 0
     }
-    description: "A sampler can be used to sample from a subset of the original range\nin order to speed up the whole computation through parallelism. This parameter\n(together with \'num_shards\') indicates the particular partition number of a\nsampler op, when partitioning is being used."
     has_minimum: true
   }
   attr {
@@ -9211,7 +8710,6 @@ op {
       list {
       }
     }
-    description: "A list of unigram counts or probabilities, one per ID in sequential\norder. Exactly one of vocab_file and unigrams should be passed to this op."
   }
   attr {
     name: "seed"
@@ -9219,7 +8717,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either seed or seed2 are set to be non-zero, the random number\ngenerator is seeded by the given seed.  Otherwise, it is seeded by a\nrandom seed."
   }
   attr {
     name: "seed2"
@@ -9227,10 +8724,7 @@ op {
     default_value {
       i: 0
     }
-    description: "An second seed to avoid seed collision."
   }
-  summary: "Generates labels for candidate sampling with a learned unigram distribution."
-  description: "A unigram sampler could use a fixed unigram distribution read from a\nfile or passed in as an in-memory array instead of building up the distribution\nfrom data on the fly. There is also an option to skew the distribution by\napplying a distortion power to the weights.\n\nThe vocabulary file should be in CSV-like format, with the last field\nbeing the weight associated with the word.\n\nFor each batch, this op picks a single set of sampled candidate labels.\n\nThe advantages of sampling candidates per-batch are simplicity and the\npossibility of efficient dense matrix multiplication. The disadvantage is that\nthe sampled candidates must be chosen independently of the context and of the\ntrue labels."
   is_stateful: true
 }
 op {
@@ -9250,7 +8744,6 @@ op {
   attr {
     name: "f"
     type: "func"
-    description: "A function mapping elements of `input_dataset`, concatenated with\n`other_arguments`, to a Dataset variant that contains elements matching\n`output_types` and `output_shapes`."
   }
   attr {
     name: "Targuments"
@@ -9269,8 +8762,6 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
-  description: "Unlike MapDataset, the `f` in FlatMapDataset is expected to return a\nDataset variant, and FlatMapDataset will flatten successive results\ninto a single Dataset."
 }
 op {
   name: "Floor"
@@ -9288,12 +8779,12 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Returns element-wise largest integer not greater than x."
 }
 op {
   name: "FloorDiv"
@@ -9315,6 +8806,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -9328,8 +8820,6 @@ op {
       }
     }
   }
-  summary: "Returns x // y element-wise."
-  description: "*NOTE*: `FloorDiv` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "FloorMod"
@@ -9352,40 +8842,34 @@ op {
       list {
         type: DT_INT32
         type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Returns element-wise remainder of division. When `x < 0` xor `y < 0` is"
-  description: "true, this follows Python semantics in that the result here is consistent\nwith a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.\n\n*NOTE*: `FloorMod` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "FractionalAvgPool"
   input_arg {
     name: "value"
-    description: "4-D with shape `[batch, height, width, channels]`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "output tensor after fractional avg pooling."
     type_attr: "T"
   }
   output_arg {
     name: "row_pooling_sequence"
-    description: "row pooling sequence, needed to calculate gradient."
     type: DT_INT64
   }
   output_arg {
     name: "col_pooling_sequence"
-    description: "column pooling sequence, needed to calculate gradient."
     type: DT_INT64
   }
   attr {
     name: "pooling_ratio"
     type: "list(float)"
-    description: "Pooling ratio for each dimension of `value`, currently only\nsupports row and col dimension and should be >= 1.0. For example, a valid\npooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements\nmust be 1.0 because we don\'t allow pooling on batch and channels\ndimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions\nrespectively."
     has_minimum: true
     minimum: 4
   }
@@ -9395,7 +8879,6 @@ op {
     default_value {
       b: false
     }
-    description: "When set to True, generates the pooling sequence in a\npseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin\nGraham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for\ndifference between pseudorandom and random."
   }
   attr {
     name: "overlapping"
@@ -9403,7 +8886,6 @@ op {
     default_value {
       b: false
     }
-    description: "When set to True, it means when pooling, the values at the boundary\nof adjacent pooling cells are used by both cells. For example:\n\n`index  0  1  2  3  4`\n\n`value  20 5  16 3  7`\n\nIf the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.\nThe result would be [41/3, 26/3] for fractional avg pooling."
   }
   attr {
     name: "deterministic"
@@ -9411,7 +8893,6 @@ op {
     default_value {
       b: false
     }
-    description: "When set to True, a fixed pooling region will be used when\niterating over a FractionalAvgPool node in the computation graph. Mainly used\nin unit test to make FractionalAvgPool deterministic."
   }
   attr {
     name: "seed"
@@ -9419,7 +8900,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either seed or seed2 are set to be non-zero, the random number\ngenerator is seeded by the given seed.  Otherwise, it is seeded by a\nrandom seed."
   }
   attr {
     name: "seed2"
@@ -9427,7 +8907,6 @@ op {
     default_value {
       i: 0
     }
-    description: "An second seed to avoid seed collision."
   }
   attr {
     name: "T"
@@ -9441,34 +8920,27 @@ op {
       }
     }
   }
-  summary: "Performs fractional average pooling on the input."
-  description: "Fractional average pooling is similar to Fractional max pooling in the pooling\nregion generation step. The only difference is that after pooling regions are\ngenerated, a mean operation is performed instead of a max operation in each\npooling region."
 }
 op {
   name: "FractionalAvgPoolGrad"
   input_arg {
     name: "orig_input_tensor_shape"
-    description: "Original input tensor shape for `fractional_avg_pool`"
     type: DT_INT64
   }
   input_arg {
     name: "out_backprop"
-    description: "4-D with shape `[batch, height, width, channels]`.  Gradients\nw.r.t. the output of `fractional_avg_pool`."
     type_attr: "T"
   }
   input_arg {
     name: "row_pooling_sequence"
-    description: "row pooling sequence, form pooling region with\ncol_pooling_sequence."
     type: DT_INT64
   }
   input_arg {
     name: "col_pooling_sequence"
-    description: "column pooling sequence, form pooling region with\nrow_pooling sequence."
     type: DT_INT64
   }
   output_arg {
     name: "output"
-    description: "4-D.  Gradients w.r.t. the input of `fractional_avg_pool`."
     type_attr: "T"
   }
   attr {
@@ -9477,7 +8949,6 @@ op {
     default_value {
       b: false
     }
-    description: "When set to True, it means when pooling, the values at the boundary\nof adjacent pooling cells are used by both cells. For example:\n\n`index  0  1  2  3  4`\n\n`value  20 5  16 3  7`\n\nIf the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.\nThe result would be [41/3, 26/3] for fractional avg pooling."
   }
   attr {
     name: "T"
@@ -9491,35 +8962,28 @@ op {
       }
     }
   }
-  summary: "Computes gradient of the FractionalAvgPool function."
-  description: "Unlike FractionalMaxPoolGrad, we don\'t need to find arg_max for\nFractionalAvgPoolGrad, we just need to evenly back-propagate each element of\nout_backprop to those indices that form the same pooling cell. Therefore, we\njust need to know the shape of original input tensor, instead of the whole\ntensor."
 }
 op {
   name: "FractionalMaxPool"
   input_arg {
     name: "value"
-    description: "4-D with shape `[batch, height, width, channels]`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "output tensor after fractional max pooling."
     type_attr: "T"
   }
   output_arg {
     name: "row_pooling_sequence"
-    description: "row pooling sequence, needed to calculate gradient."
     type: DT_INT64
   }
   output_arg {
     name: "col_pooling_sequence"
-    description: "column pooling sequence, needed to calculate gradient."
     type: DT_INT64
   }
   attr {
     name: "pooling_ratio"
     type: "list(float)"
-    description: "Pooling ratio for each dimension of `value`, currently only\nsupports row and col dimension and should be >= 1.0. For example, a valid\npooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements\nmust be 1.0 because we don\'t allow pooling on batch and channels\ndimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions\nrespectively."
     has_minimum: true
     minimum: 4
   }
@@ -9529,7 +8993,6 @@ op {
     default_value {
       b: false
     }
-    description: "When set to True, generates the pooling sequence in a\npseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin\nGraham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for\ndifference between pseudorandom and random."
   }
   attr {
     name: "overlapping"
@@ -9537,7 +9000,6 @@ op {
     default_value {
       b: false
     }
-    description: "When set to True, it means when pooling, the values at the boundary\nof adjacent pooling cells are used by both cells. For example:\n\n`index  0  1  2  3  4`\n\n`value  20 5  16 3  7`\n\nIf the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.\nThe result would be [20, 16] for fractional max pooling."
   }
   attr {
     name: "deterministic"
@@ -9545,7 +9007,6 @@ op {
     default_value {
       b: false
     }
-    description: "When set to True, a fixed pooling region will be used when\niterating over a FractionalMaxPool node in the computation graph. Mainly used\nin unit test to make FractionalMaxPool deterministic."
   }
   attr {
     name: "seed"
@@ -9553,7 +9014,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either seed or seed2 are set to be non-zero, the random number\ngenerator is seeded by the given seed.  Otherwise, it is seeded by a\nrandom seed."
   }
   attr {
     name: "seed2"
@@ -9561,7 +9021,6 @@ op {
     default_value {
       i: 0
     }
-    description: "An second seed to avoid seed collision."
   }
   attr {
     name: "T"
@@ -9575,39 +9034,31 @@ op {
       }
     }
   }
-  summary: "Performs fractional max pooling on the input."
-  description: "Fractional max pooling is slightly different than regular max pooling.  In\nregular max pooling, you downsize an input set by taking the maximum value of\nsmaller N x N subsections of the set (often 2x2), and try to reduce the set by\na factor of N, where N is an integer.  Fractional max pooling, as you might\nexpect from the word \"fractional\", means that the overall reduction ratio N\ndoes not have to be an integer.\n\nThe sizes of the pooling regions are generated randomly but are fairly uniform.\nFor example, let\'s look at the height dimension, and the constraints on the\nlist of rows that will be pool boundaries.\n\nFirst we define the following:\n\n1.  input_row_length : the number of rows from the input set\n2.  output_row_length : which will be smaller than the input\n3.  alpha = input_row_length / output_row_length : our reduction ratio\n4.  K = floor(alpha)\n5.  row_pooling_sequence : this is the result list of pool boundary rows\n\nThen, row_pooling_sequence should satisfy:\n\n1.  a[0] = 0 : the first value of the sequence is 0\n2.  a[end] = input_row_length : the last value of the sequence is the size\n3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size\n4.  length(row_pooling_sequence) = output_row_length+1\n\nFor more details on fractional max pooling, see this paper:\n[Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)"
 }
 op {
   name: "FractionalMaxPoolGrad"
   input_arg {
     name: "orig_input"
-    description: "Original input for `fractional_max_pool`"
     type_attr: "T"
   }
   input_arg {
     name: "orig_output"
-    description: "Original output for `fractional_max_pool`"
     type_attr: "T"
   }
   input_arg {
     name: "out_backprop"
-    description: "4-D with shape `[batch, height, width, channels]`.  Gradients\nw.r.t. the output of `fractional_max_pool`."
     type_attr: "T"
   }
   input_arg {
     name: "row_pooling_sequence"
-    description: "row pooling sequence, form pooling region with\ncol_pooling_sequence."
     type: DT_INT64
   }
   input_arg {
     name: "col_pooling_sequence"
-    description: "column pooling sequence, form pooling region with\nrow_pooling sequence."
     type: DT_INT64
   }
   output_arg {
     name: "output"
-    description: "4-D.  Gradients w.r.t. the input of `fractional_max_pool`."
     type_attr: "T"
   }
   attr {
@@ -9616,7 +9067,6 @@ op {
     default_value {
       b: false
     }
-    description: "When set to True, it means when pooling, the values at the boundary\nof adjacent pooling cells are used by both cells. For example:\n\n`index  0  1  2  3  4`\n\n`value  20 5  16 3  7`\n\nIf the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.\nThe result would be [20, 16] for fractional max pooling."
   }
   attr {
     name: "T"
@@ -9630,64 +9080,52 @@ op {
       }
     }
   }
-  summary: "Computes gradient of the FractionalMaxPool function."
 }
 op {
   name: "FusedBatchNorm"
   input_arg {
     name: "x"
-    description: "A 4D Tensor for input data."
     type_attr: "T"
   }
   input_arg {
     name: "scale"
-    description: "A 1D Tensor for scaling factor, to scale the normalized x."
     type_attr: "T"
   }
   input_arg {
     name: "offset"
-    description: "A 1D Tensor for offset, to shift to the normalized x."
     type_attr: "T"
   }
   input_arg {
     name: "mean"
-    description: "A 1D Tensor for population mean. Used for inference only;\nmust be empty for training."
     type_attr: "T"
   }
   input_arg {
     name: "variance"
-    description: "A 1D Tensor for population variance. Used for inference only;\nmust be empty for training."
     type_attr: "T"
   }
   output_arg {
     name: "y"
-    description: "A 4D Tensor for output data."
     type_attr: "T"
   }
   output_arg {
     name: "batch_mean"
-    description: "A 1D Tensor for the computed batch mean, to be used by TensorFlow\nto compute the running mean."
     type_attr: "T"
   }
   output_arg {
     name: "batch_variance"
-    description: "A 1D Tensor for the computed batch variance, to be used by\nTensorFlow to compute the running variance."
     type_attr: "T"
   }
   output_arg {
     name: "reserve_space_1"
-    description: "A 1D Tensor for the computed batch mean, to be reused\nin the gradient computation."
     type_attr: "T"
   }
   output_arg {
     name: "reserve_space_2"
-    description: "A 1D Tensor for the computed batch variance (inverted variance\nin the cuDNN case), to be reused in the gradient computation."
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    description: "The data type for the elements of input and output Tensors."
     allowed_values {
       list {
         type: DT_FLOAT
@@ -9700,7 +9138,6 @@ op {
     default_value {
       f: 0.0001
     }
-    description: "A small float number added to the variance of x."
   }
   attr {
     name: "data_format"
@@ -9708,7 +9145,6 @@ op {
     default_value {
       s: "NHWC"
     }
-    description: "The data format for x and y. Either \"NHWC\" (default) or \"NCHW\"."
   }
   attr {
     name: "is_training"
@@ -9716,67 +9152,53 @@ op {
     default_value {
       b: true
     }
-    description: "A bool value to indicate the operation is for training (default)\nor inference."
   }
-  summary: "Batch normalization."
-  description: "Note that the size of 4D Tensors are defined by either \"NHWC\" or \"NCHW\".\nThe size of 1D Tensors matches the dimension C of the 4D Tensors."
 }
 op {
   name: "FusedBatchNormGrad"
   input_arg {
     name: "y_backprop"
-    description: "A 4D Tensor for the gradient with respect to y."
     type_attr: "T"
   }
   input_arg {
     name: "x"
-    description: "A 4D Tensor for input data."
     type_attr: "T"
   }
   input_arg {
     name: "scale"
-    description: "A 1D Tensor for scaling factor, to scale the normalized x."
     type_attr: "T"
   }
   input_arg {
     name: "reserve_space_1"
-    description: "When is_training is True, a 1D Tensor for the computed batch\nmean to be reused in gradient computation. When is_training is\nFalse, a 1D Tensor for the population mean to be reused in both\n1st and 2nd order gradient computation."
     type_attr: "T"
   }
   input_arg {
     name: "reserve_space_2"
-    description: "When is_training is True, a 1D Tensor for the computed batch\nvariance (inverted variance in the cuDNN case) to be reused in\ngradient computation. When is_training is False, a 1D Tensor\nfor the population variance to be reused in both 1st and 2nd\norder gradient computation."
     type_attr: "T"
   }
   output_arg {
     name: "x_backprop"
-    description: "A 4D Tensor for the gradient with respect to x."
     type_attr: "T"
   }
   output_arg {
     name: "scale_backprop"
-    description: "A 1D Tensor for the gradient with respect to scale."
     type_attr: "T"
   }
   output_arg {
     name: "offset_backprop"
-    description: "A 1D Tensor for the gradient with respect to offset."
     type_attr: "T"
   }
   output_arg {
     name: "reserve_space_3"
-    description: "Unused placeholder to match the mean input in FusedBatchNorm."
     type_attr: "T"
   }
   output_arg {
     name: "reserve_space_4"
-    description: "Unused placeholder to match the variance input\nin FusedBatchNorm."
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    description: "The data type for the elements of input and output Tensors."
     allowed_values {
       list {
         type: DT_FLOAT
@@ -9789,7 +9211,6 @@ op {
     default_value {
       f: 0.0001
     }
-    description: "A small float number added to the variance of x."
   }
   attr {
     name: "data_format"
@@ -9797,7 +9218,6 @@ op {
     default_value {
       s: "NHWC"
     }
-    description: "The data format for y_backprop, x, x_backprop.\nEither \"NHWC\" (default) or \"NCHW\"."
   }
   attr {
     name: "is_training"
@@ -9805,70 +9225,57 @@ op {
     default_value {
       b: true
     }
-    description: "A bool value to indicate the operation is for training (default)\nor inference."
   }
-  summary: "Gradient for batch normalization."
-  description: "Note that the size of 4D Tensors are defined by either \"NHWC\" or \"NCHW\".\nThe size of 1D Tensors matches the dimension C of the 4D Tensors."
 }
 op {
   name: "FusedBatchNormGradV2"
   input_arg {
     name: "y_backprop"
-    description: "A 4D Tensor for the gradient with respect to y."
     type_attr: "T"
   }
   input_arg {
     name: "x"
-    description: "A 4D Tensor for input data."
     type_attr: "T"
   }
   input_arg {
     name: "scale"
-    description: "A 1D Tensor for scaling factor, to scale the normalized x."
     type: DT_FLOAT
   }
   input_arg {
     name: "reserve_space_1"
-    description: "When is_training is True, a 1D Tensor for the computed batch\nmean to be reused in gradient computation. When is_training is\nFalse, a 1D Tensor for the population mean to be reused in both\n1st and 2nd order gradient computation."
     type_attr: "U"
   }
   input_arg {
     name: "reserve_space_2"
-    description: "When is_training is True, a 1D Tensor for the computed batch\nvariance (inverted variance in the cuDNN case) to be reused in\ngradient computation. When is_training is False, a 1D Tensor\nfor the population variance to be reused in both 1st and 2nd\norder gradient computation."
     type_attr: "U"
   }
   output_arg {
     name: "x_backprop"
-    description: "A 4D Tensor for the gradient with respect to x."
     type_attr: "T"
   }
   output_arg {
     name: "scale_backprop"
-    description: "A 1D Tensor for the gradient with respect to scale."
     type_attr: "U"
   }
   output_arg {
     name: "offset_backprop"
-    description: "A 1D Tensor for the gradient with respect to offset."
     type_attr: "U"
   }
   output_arg {
     name: "reserve_space_3"
-    description: "Unused placeholder to match the mean input in FusedBatchNorm."
     type_attr: "U"
   }
   output_arg {
     name: "reserve_space_4"
-    description: "Unused placeholder to match the variance input\nin FusedBatchNorm."
     type_attr: "U"
   }
   attr {
     name: "T"
     type: "type"
-    description: "The data type for the elements of input and output Tensors."
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -9876,7 +9283,6 @@ op {
   attr {
     name: "U"
     type: "type"
-    description: "The data type for the scale, offset, mean, and variance."
     allowed_values {
       list {
         type: DT_FLOAT
@@ -9889,7 +9295,6 @@ op {
     default_value {
       f: 0.0001
     }
-    description: "A small float number added to the variance of x."
   }
   attr {
     name: "data_format"
@@ -9897,7 +9302,6 @@ op {
     default_value {
       s: "NHWC"
     }
-    description: "The data format for y_backprop, x, x_backprop.\nEither \"NHWC\" (default) or \"NCHW\"."
   }
   attr {
     name: "is_training"
@@ -9905,70 +9309,57 @@ op {
     default_value {
       b: true
     }
-    description: "A bool value to indicate the operation is for training (default)\nor inference."
   }
-  summary: "Gradient for batch normalization."
-  description: "Note that the size of 4D Tensors are defined by either \"NHWC\" or \"NCHW\".\nThe size of 1D Tensors matches the dimension C of the 4D Tensors."
 }
 op {
   name: "FusedBatchNormV2"
   input_arg {
     name: "x"
-    description: "A 4D Tensor for input data."
     type_attr: "T"
   }
   input_arg {
     name: "scale"
-    description: "A 1D Tensor for scaling factor, to scale the normalized x."
     type_attr: "U"
   }
   input_arg {
     name: "offset"
-    description: "A 1D Tensor for offset, to shift to the normalized x."
     type_attr: "U"
   }
   input_arg {
     name: "mean"
-    description: "A 1D Tensor for population mean. Used for inference only;\nmust be empty for training."
     type_attr: "U"
   }
   input_arg {
     name: "variance"
-    description: "A 1D Tensor for population variance. Used for inference only;\nmust be empty for training."
     type_attr: "U"
   }
   output_arg {
     name: "y"
-    description: "A 4D Tensor for output data."
     type_attr: "T"
   }
   output_arg {
     name: "batch_mean"
-    description: "A 1D Tensor for the computed batch mean, to be used by TensorFlow\nto compute the running mean."
     type_attr: "U"
   }
   output_arg {
     name: "batch_variance"
-    description: "A 1D Tensor for the computed batch variance, to be used by\nTensorFlow to compute the running variance."
     type_attr: "U"
   }
   output_arg {
     name: "reserve_space_1"
-    description: "A 1D Tensor for the computed batch mean, to be reused\nin the gradient computation."
     type_attr: "U"
   }
   output_arg {
     name: "reserve_space_2"
-    description: "A 1D Tensor for the computed batch variance (inverted variance\nin the cuDNN case), to be reused in the gradient computation."
     type_attr: "U"
   }
   attr {
     name: "T"
     type: "type"
-    description: "The data type for the elements of input and output Tensors."
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -9976,7 +9367,6 @@ op {
   attr {
     name: "U"
     type: "type"
-    description: "The data type for the scale, offset, mean, and variance."
     allowed_values {
       list {
         type: DT_FLOAT
@@ -9989,7 +9379,6 @@ op {
     default_value {
       f: 0.0001
     }
-    description: "A small float number added to the variance of x."
   }
   attr {
     name: "data_format"
@@ -9997,7 +9386,6 @@ op {
     default_value {
       s: "NHWC"
     }
-    description: "The data format for x and y. Either \"NHWC\" (default) or \"NCHW\"."
   }
   attr {
     name: "is_training"
@@ -10005,26 +9393,20 @@ op {
     default_value {
       b: true
     }
-    description: "A bool value to indicate the operation is for training (default)\nor inference."
   }
-  summary: "Batch normalization."
-  description: "Note that the size of 4D Tensors are defined by either \"NHWC\" or \"NCHW\".\nThe size of 1D Tensors matches the dimension C of the 4D Tensors."
 }
 op {
   name: "FusedPadConv2D"
   input_arg {
     name: "input"
-    description: "4-D with shape `[batch, in_height, in_width, in_channels]`."
     type_attr: "T"
   }
   input_arg {
     name: "paddings"
-    description: "A two-column matrix specifying the padding sizes. The number of\nrows must be the same as the rank of `input`."
     type: DT_INT32
   }
   input_arg {
     name: "filter"
-    description: "4-D with shape\n`[filter_height, filter_width, in_channels, out_channels]`."
     type_attr: "T"
   }
   output_arg {
@@ -10053,12 +9435,10 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    description: "1-D of length 4.  The stride of the sliding window for each dimension\nof `input`. Must be in the same order as the dimension specified with format."
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -10066,29 +9446,23 @@ op {
       }
     }
   }
-  summary: "Performs a padding as a preprocess during a convolution."
-  description: "Similar to FusedResizeAndPadConv2d, this op allows for an optimized\nimplementation where the spatial padding transformation stage is fused with the\nim2col lookup, but in this case without the bilinear filtering required for\nresizing. Fusing the padding prevents the need to write out the intermediate\nresults as whole tensors, reducing memory pressure, and we can get some latency\ngains by merging the transformation calculations.\nThe data_format attribute for Conv2D isn\'t supported by this op, and \'NHWC\'\norder is used instead.\nInternally this op uses a single per-graph scratch buffer, which means that it\nwill block if multiple versions are being run in parallel. This is because this\noperator is primarily an optimization to minimize memory usage."
 }
 op {
   name: "FusedResizeAndPadConv2D"
   input_arg {
     name: "input"
-    description: "4-D with shape `[batch, in_height, in_width, in_channels]`."
     type_attr: "T"
   }
   input_arg {
     name: "size"
-    description: "A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The\nnew size for the images."
     type: DT_INT32
   }
   input_arg {
     name: "paddings"
-    description: "A two-column matrix specifying the padding sizes. The number of\nrows must be the same as the rank of `input`."
     type: DT_INT32
   }
   input_arg {
     name: "filter"
-    description: "4-D with shape\n`[filter_height, filter_width, in_channels, out_channels]`."
     type_attr: "T"
   }
   output_arg {
@@ -10110,7 +9484,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true, rescale input by (new_height - 1) / (height - 1),\nwhich exactly aligns the 4 corners of images and resized images. If false, rescale\nby new_height / height. Treat similarly the width dimension."
   }
   attr {
     name: "mode"
@@ -10125,12 +9498,10 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    description: "1-D of length 4.  The stride of the sliding window for each dimension\nof `input`. Must be in the same order as the dimension specified with format."
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -10138,8 +9509,6 @@ op {
       }
     }
   }
-  summary: "Performs a resize and padding as a preprocess during a convolution."
-  description: "It\'s often possible to do spatial transformations more efficiently as part of\nthe packing stage of a convolution, so this op allows for an optimized\nimplementation where these stages are fused together. This prevents the need to\nwrite out the intermediate results as whole tensors, reducing memory pressure,\nand we can get some latency gains by merging the transformation calculations.\nThe data_format attribute for Conv2D isn\'t supported by this op, and defaults to\n\'NHWC\' order.\nInternally this op uses a single per-graph scratch buffer, which means that it\nwill block if multiple versions are being run in parallel. This is because this\noperator is primarily an optimization to minimize memory usage."
 }
 op {
   name: "Gather"
@@ -10176,24 +9545,19 @@ op {
       }
     }
   }
-  summary: "Gather slices from `params` according to `indices`."
-  description: "`indices` must be an integer tensor of any dimension (usually 0-D or 1-D).\nProduces an output tensor with shape `indices.shape + params.shape[1:]` where:\n\n```python\n    # Scalar indices\n    output[:, ..., :] = params[indices, :, ... :]\n\n    # Vector indices\n    output[i, :, ..., :] = params[indices[i], :, ... :]\n\n    # Higher rank indices\n    output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]\n```\n\nIf `indices` is a permutation and `len(indices) == params.shape[0]` then\nthis operation will permute `params` accordingly.\n\n`validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in\n`indices` are always validated to be within range. If assigned to GPU,\nout-of-bound indices result in safe but unspecified behavior, which may include\nraising an error.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/Gather.png\" alt>\n</div>"
 }
 op {
   name: "GatherNd"
   input_arg {
     name: "params"
-    description: "The tensor from which to gather values."
     type_attr: "Tparams"
   }
   input_arg {
     name: "indices"
-    description: "Index tensor."
     type_attr: "Tindices"
   }
   output_arg {
     name: "output"
-    description: "Values from `params` gathered from indices given by `indices`, with\nshape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`."
     type_attr: "Tparams"
   }
   attr {
@@ -10210,29 +9574,23 @@ op {
       }
     }
   }
-  summary: "Gather slices from `params` into a Tensor with shape specified by `indices`."
-  description: "`indices` is an K-dimensional integer tensor, best thought of as a\n(K-1)-dimensional tensor of indices into `params`, where each element defines a\nslice of `params`:\n\n    output[i_0, ..., i_{K-2}] = params[indices[i0, ..., i_{K-2}]]\n\nWhereas in @{tf.gather} `indices` defines slices into the first\ndimension of `params`, in `tf.gather_nd`, `indices` defines slices into the\nfirst `N` dimensions of `params`, where `N = indices.shape[-1]`.\n\nThe last dimension of `indices` can be at most the rank of\n`params`:\n\n    indices.shape[-1] <= params.rank\n\nThe last dimension of `indices` corresponds to elements\n(if `indices.shape[-1] == params.rank`) or slices\n(if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`\nof `params`.  The output tensor has shape\n\n    indices.shape[:-1] + params.shape[indices.shape[-1]:]\n\nSome examples below.\n\nSimple indexing into a matrix:\n\n```python\n    indices = [[0, 0], [1, 1]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [\'a\', \'d\']\n```\n\nSlice indexing into a matrix:\n\n```python\n    indices = [[1], [0]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [[\'c\', \'d\'], [\'a\', \'b\']]\n```\n\nIndexing into a 3-tensor:\n\n```python\n    indices = [[1]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n\n\n    indices = [[0, 1], [1, 0]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[\'c0\', \'d0\'], [\'a1\', \'b1\']]\n\n\n    indices = [[0, 0, 1], [1, 0, 1]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [\'b0\', \'b1\']\n```\n\nBatched indexing into a matrix:\n\n```python\n    indices = [[[0, 0]], [[0, 1]]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [[\'a\'], [\'b\']]\n```\n\nBatched slice indexing into a matrix:\n\n```python\n    indices = [[[1]], [[0]]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [[[\'c\', \'d\']], [[\'a\', \'b\']]]\n```\n\nBatched indexing into a 3-tensor:\n\n```python\n    indices = [[[1]], [[0]]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[[[\'a1\', \'b1\'], [\'c1\', \'d1\']]],\n              [[[\'a0\', \'b0\'], [\'c0\', \'d0\']]]]\n\n    indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[[\'c0\', \'d0\'], [\'a1\', \'b1\']],\n              [[\'a0\', \'b0\'], [\'c1\', \'d1\']]]\n\n\n    indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[\'b0\', \'b1\'], [\'d0\', \'c1\']]\n```"
 }
 op {
   name: "GatherV2"
   input_arg {
     name: "params"
-    description: "The tensor from which to gather values. Must be at least rank\n`axis + 1`."
     type_attr: "Tparams"
   }
   input_arg {
     name: "indices"
-    description: "Index tensor. Must be in range `[0, params.shape[axis])`."
     type_attr: "Tindices"
   }
   input_arg {
     name: "axis"
-    description: "The axis in `params` to gather `indices` from. Defaults to the first\ndimension. Supports negative indexes."
     type_attr: "Taxis"
   }
   output_arg {
     name: "output"
-    description: "Values from `params` gathered from indices given by `indices`, with\nshape `params.shape[:axis] + indices.shape + params.shape[axis + 1:]`."
     type_attr: "Tparams"
   }
   attr {
@@ -10259,41 +9617,33 @@ op {
       }
     }
   }
-  summary: "Gather slices from `params` axis `axis` according to `indices`."
-  description: "`indices` must be an integer tensor of any dimension (usually 0-D or 1-D).\nProduces an output tensor with shape `params.shape[:axis] + indices.shape +\nparams.shape[axis + 1:]` where:\n\n```python\n    # Scalar indices (output is rank(params) - 1).\n    output[a_0, ..., a_n, b_0, ..., b_n] =\n      params[a_0, ..., a_n, indices, b_0, ..., b_n]\n\n    # Vector indices (output is rank(params)).\n    output[a_0, ..., a_n, i, b_0, ..., b_n] =\n      params[a_0, ..., a_n, indices[i], b_0, ..., b_n]\n\n    # Higher rank indices (output is rank(params) + rank(indices) - 1).\n    output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =\n      params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]\n```\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/Gather.png\" alt>\n</div>"
 }
 op {
   name: "GenerateVocabRemapping"
   input_arg {
     name: "new_vocab_file"
-    description: "Path to the new vocab file."
     type: DT_STRING
   }
   input_arg {
     name: "old_vocab_file"
-    description: "Path to the old vocab file."
     type: DT_STRING
   }
   output_arg {
     name: "remapping"
-    description: "A Tensor of length num_new_vocab where the element at index i\nis equal to the old ID that maps to the new ID i.  This element is -1 for any\nnew ID that is not found in the old vocabulary."
     type: DT_INT64
   }
   output_arg {
     name: "num_present"
-    description: "Number of new vocab entries found in old vocab."
     type: DT_INT32
   }
   attr {
     name: "new_vocab_offset"
     type: "int"
-    description: "How many entries into the new vocab file to start reading."
     has_minimum: true
   }
   attr {
     name: "num_new_vocab"
     type: "int"
-    description: "Number of entries in the new vocab file to remap."
     has_minimum: true
   }
   attr {
@@ -10302,68 +9652,57 @@ op {
     default_value {
       i: -1
     }
-    description: "Number of entries in the old vocab file to consider.  If -1,\nuse the entire old vocabulary."
     has_minimum: true
     minimum: -1
   }
-  summary: "Given a path to new and old vocabulary files, returns a remapping Tensor of"
-  description: "length `num_new_vocab`, where `remapping[i]` contains the row number in the old\nvocabulary that corresponds to row `i` in the new vocabulary (starting at line\n`new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`\nin the new vocabulary is not in the old vocabulary.  The old vocabulary is\nconstrained to the first `old_vocab_size` entries if `old_vocab_size` is not the\ndefault value of -1.\n\n`num_vocab_offset` enables\nuse in the partitioned variable case, and should generally be set through\nexamining partitioning info.  The format of the files should be a text file,\nwith each line containing a single entity within the vocabulary.\n\nFor example, with `new_vocab_file` a text file containing each of the following\nelements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],\n`num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be\n`[0, -1, 2]`.\n\nThe op also returns a count of how many entries in the new vocabulary\nwere present in the old vocabulary, which is used to calculate the number of\nvalues to initialize in a weight matrix remapping\n\nThis functionality can be used to remap both row vocabularies (typically,\nfeatures) and column vocabularies (typically, classes) from TensorFlow\ncheckpoints.  Note that the partitioning logic relies on contiguous vocabularies\ncorresponding to div-partitioned variables.  Moreover, the underlying remapping\nuses an IndexTable (as opposed to an inexact CuckooTable), so client code should\nuse the corresponding index_table_from_file() as the FeatureColumn framework\ndoes (as opposed to tf.feature_to_id(), which uses a CuckooTable)."
 }
 op {
   name: "GetSessionHandle"
   input_arg {
     name: "value"
-    description: "The tensor to be stored."
     type_attr: "T"
   }
   output_arg {
     name: "handle"
-    description: "The handle for the tensor stored in the session state, represented\nas a string."
     type: DT_STRING
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Store the input tensor in the state of the current session."
+  is_stateful: true
 }
 op {
   name: "GetSessionHandleV2"
   input_arg {
     name: "value"
-    description: "The tensor to be stored."
     type_attr: "T"
   }
   output_arg {
     name: "handle"
-    description: "The handle for the tensor stored in the session state, represented\nas a ResourceHandle object."
     type: DT_RESOURCE
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Store the input tensor in the state of the current session."
   is_stateful: true
 }
 op {
   name: "GetSessionTensor"
   input_arg {
     name: "handle"
-    description: "The handle for a tensor stored in the session state."
     type: DT_STRING
   }
   output_arg {
     name: "value"
-    description: "The tensor for the given handle."
     type_attr: "dtype"
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "The type of the output value."
   }
-  summary: "Get the value of the tensor specified by its handle."
+  is_stateful: true
 }
 op {
   name: "Greater"
@@ -10387,10 +9726,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -10398,8 +9738,6 @@ op {
       }
     }
   }
-  summary: "Returns the truth value of (x > y) element-wise."
-  description: "*NOTE*: `Greater` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "GreaterEqual"
@@ -10423,10 +9761,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -10434,8 +9773,6 @@ op {
       }
     }
   }
-  summary: "Returns the truth value of (x >= y) element-wise."
-  description: "*NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "GroupByWindowDataset"
@@ -10462,7 +9799,6 @@ op {
   attr {
     name: "key_func"
     type: "func"
-    description: "A function mapping an element of `input_dataset`, concatenated\nwith `key_func_other_arguments` to a scalar value of type DT_INT64."
   }
   attr {
     name: "reduce_func"
@@ -10499,19 +9835,31 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset that computes a windowed group-by on `input_dataset`."
-  description: "// TODO(mrry): Support non-int64 keys."
+}
+op {
+  name: "GuaranteeConst"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
 }
 op {
   name: "HSVToRGB"
   input_arg {
     name: "images"
-    description: "1-D or higher rank. HSV data to convert. Last dimension must be size 3."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "`images` converted to RGB."
     type_attr: "T"
   }
   attr {
@@ -10522,19 +9870,18 @@ op {
     }
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Convert one or more images from HSV to RGB."
-  description: "Outputs a tensor of the same shape as the `images` tensor, containing the RGB\nvalue of the pixels. The output is only well defined if the value in `images`\nare in `[0,1]`.\n\nSee `rgb_to_hsv` for a description of the HSV encoding."
 }
 op {
   name: "HashTable"
   output_arg {
     name: "table_handle"
-    description: "Handle to a table."
     type: DT_STRING
     is_ref: true
   }
@@ -10544,7 +9891,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this table is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -10552,7 +9898,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this table is shared under the given name across\nmultiple sessions."
   }
   attr {
     name: "use_node_name_sharing"
@@ -10560,27 +9905,21 @@ op {
     default_value {
       b: false
     }
-    description: "If true and shared_name is empty, the table is shared\nusing the node name."
   }
   attr {
     name: "key_dtype"
     type: "type"
-    description: "Type of the table keys."
   }
   attr {
     name: "value_dtype"
     type: "type"
-    description: "Type of the table values."
   }
-  summary: "Creates a non-initialized hash table."
-  description: "This op creates a hash table, specifying the type of its keys and values.\nBefore using the table you will have to initialize it.  After initialization the\ntable will be immutable."
   is_stateful: true
 }
 op {
   name: "HashTableV2"
   output_arg {
     name: "table_handle"
-    description: "Handle to a table."
     type: DT_RESOURCE
   }
   attr {
@@ -10589,7 +9928,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this table is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -10597,7 +9935,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this table is shared under the given name across\nmultiple sessions."
   }
   attr {
     name: "use_node_name_sharing"
@@ -10605,42 +9942,33 @@ op {
     default_value {
       b: false
     }
-    description: "If true and shared_name is empty, the table is shared\nusing the node name."
   }
   attr {
     name: "key_dtype"
     type: "type"
-    description: "Type of the table keys."
   }
   attr {
     name: "value_dtype"
     type: "type"
-    description: "Type of the table values."
   }
-  summary: "Creates a non-initialized hash table."
-  description: "This op creates a hash table, specifying the type of its keys and values.\nBefore using the table you will have to initialize it.  After initialization the\ntable will be immutable."
   is_stateful: true
 }
 op {
   name: "HistogramFixedWidth"
   input_arg {
     name: "values"
-    description: "Numeric `Tensor`."
     type_attr: "T"
   }
   input_arg {
     name: "value_range"
-    description: "Shape [2] `Tensor` of same `dtype` as `values`.\nvalues <= value_range[0] will be mapped to hist[0],\nvalues >= value_range[1] will be mapped to hist[-1]."
     type_attr: "T"
   }
   input_arg {
     name: "nbins"
-    description: "Scalar `int32 Tensor`.  Number of histogram bins."
     type: DT_INT32
   }
   output_arg {
     name: "out"
-    description: "A 1-D `Tensor` holding histogram of values."
     type_attr: "dtype"
   }
   attr {
@@ -10668,24 +9996,19 @@ op {
       }
     }
   }
-  summary: "Return histogram of values."
-  description: "Given the tensor `values`, this operation returns a rank 1 histogram counting\nthe number of entries in `values` that fall into every bin.  The bins are\nequal width and determined by the arguments `value_range` and `nbins`.\n\n```python\n# Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)\nnbins = 5\nvalue_range = [0.0, 5.0]\nnew_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]\n\nwith tf.get_default_session() as sess:\n  hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)\n  variables.global_variables_initializer().run()\n  sess.run(hist) => [2, 1, 1, 0, 2]\n```"
 }
 op {
   name: "HistogramSummary"
   input_arg {
     name: "tag"
-    description: "Scalar.  Tag to use for the `Summary.Value`."
     type: DT_STRING
   }
   input_arg {
     name: "values"
-    description: "Any shape. Values to use to build the histogram."
     type_attr: "T"
   }
   output_arg {
     name: "summary"
-    description: "Scalar. Serialized `Summary` protocol buffer."
     type: DT_STRING
   }
   attr {
@@ -10699,10 +10022,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -10710,113 +10034,84 @@ op {
       }
     }
   }
-  summary: "Outputs a `Summary` protocol buffer with a histogram."
-  description: "The generated\n[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)\nhas one summary value containing a histogram for `values`.\n\nThis op reports an `InvalidArgument` error if any value is not finite."
 }
 op {
   name: "IFFT"
   input_arg {
     name: "input"
-    description: "A complex64 tensor."
     type: DT_COMPLEX64
   }
   output_arg {
     name: "output"
-    description: "A complex64 tensor of the same shape as `input`. The inner-most\n  dimension of `input` is replaced with its inverse 1D Fourier transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.ifft\n@end_compatibility"
     type: DT_COMPLEX64
   }
-  summary: "Inverse fast Fourier transform."
-  description: "Computes the inverse 1-dimensional discrete Fourier transform over the\ninner-most dimension of `input`."
 }
 op {
   name: "IFFT2D"
   input_arg {
     name: "input"
-    description: "A complex64 tensor."
     type: DT_COMPLEX64
   }
   output_arg {
     name: "output"
-    description: "A complex64 tensor of the same shape as `input`. The inner-most 2\n  dimensions of `input` are replaced with their inverse 2D Fourier transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.ifft2\n@end_compatibility"
     type: DT_COMPLEX64
   }
-  summary: "Inverse 2D fast Fourier transform."
-  description: "Computes the inverse 2-dimensional discrete Fourier transform over the\ninner-most 2 dimensions of `input`."
 }
 op {
   name: "IFFT3D"
   input_arg {
     name: "input"
-    description: "A complex64 tensor."
     type: DT_COMPLEX64
   }
   output_arg {
     name: "output"
-    description: "A complex64 tensor of the same shape as `input`. The inner-most 3\n  dimensions of `input` are replaced with their inverse 3D Fourier transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.ifftn with 3 dimensions.\n@end_compatibility"
     type: DT_COMPLEX64
   }
-  summary: "Inverse 3D fast Fourier transform."
-  description: "Computes the inverse 3-dimensional discrete Fourier transform over the\ninner-most 3 dimensions of `input`."
 }
 op {
   name: "IRFFT"
   input_arg {
     name: "input"
-    description: "A complex64 tensor."
     type: DT_COMPLEX64
   }
   input_arg {
     name: "fft_length"
-    description: "An int32 tensor of shape [1]. The FFT length."
     type: DT_INT32
   }
   output_arg {
     name: "output"
-    description: "A float32 tensor of the same rank as `input`. The inner-most\n  dimension of `input` is replaced with the `fft_length` samples of its inverse\n  1D Fourier transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.irfft\n@end_compatibility"
     type: DT_FLOAT
   }
-  summary: "Inverse real-valued fast Fourier transform."
-  description: "Computes the inverse 1-dimensional discrete Fourier transform of a real-valued\nsignal over the inner-most dimension of `input`.\n\nThe inner-most dimension of `input` is assumed to be the result of `RFFT`: the\n`fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If\n`fft_length` is not provided, it is computed from the size of the inner-most\ndimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to\ncompute `input` is odd, it should be provided since it cannot be inferred\nproperly.\n\nAlong the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller\nthan the corresponding dimension of `input`, the dimension is cropped. If it is\nlarger, the dimension is padded with zeros."
 }
 op {
   name: "IRFFT2D"
   input_arg {
     name: "input"
-    description: "A complex64 tensor."
     type: DT_COMPLEX64
   }
   input_arg {
     name: "fft_length"
-    description: "An int32 tensor of shape [2]. The FFT length for each dimension."
     type: DT_INT32
   }
   output_arg {
     name: "output"
-    description: "A float32 tensor of the same rank as `input`. The inner-most 2\n  dimensions of `input` are replaced with the `fft_length` samples of their\n  inverse 2D Fourier transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.irfft2\n@end_compatibility"
     type: DT_FLOAT
   }
-  summary: "Inverse 2D real-valued fast Fourier transform."
-  description: "Computes the inverse 2-dimensional discrete Fourier transform of a real-valued\nsignal over the inner-most 2 dimensions of `input`.\n\nThe inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:\nThe inner-most dimension contains the `fft_length / 2 + 1` unique components of\nthe DFT of a real-valued signal. If `fft_length` is not provided, it is computed\nfrom the size of the inner-most 2 dimensions of `input`. If the FFT length used\nto compute `input` is odd, it should be provided since it cannot be inferred\nproperly.\n\nAlong each axis `IRFFT2D` is computed on, if `fft_length` (or\n`fft_length / 2 + 1` for the inner-most dimension) is smaller than the\ncorresponding dimension of `input`, the dimension is cropped. If it is larger,\nthe dimension is padded with zeros."
 }
 op {
   name: "IRFFT3D"
   input_arg {
     name: "input"
-    description: "A complex64 tensor."
     type: DT_COMPLEX64
   }
   input_arg {
     name: "fft_length"
-    description: "An int32 tensor of shape [3]. The FFT length for each dimension."
     type: DT_INT32
   }
   output_arg {
     name: "output"
-    description: "A float32 tensor of the same rank as `input`. The inner-most 3\n  dimensions of `input` are replaced with the `fft_length` samples of their\n  inverse 3D real Fourier transform.\n\n@compatibility(numpy)\nEquivalent to np.irfftn with 3 dimensions.\n@end_compatibility"
     type: DT_FLOAT
   }
-  summary: "Inverse 3D real-valued fast Fourier transform."
-  description: "Computes the inverse 3-dimensional discrete Fourier transform of a real-valued\nsignal over the inner-most 3 dimensions of `input`.\n\nThe inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:\nThe inner-most dimension contains the `fft_length / 2 + 1` unique components of\nthe DFT of a real-valued signal. If `fft_length` is not provided, it is computed\nfrom the size of the inner-most 3 dimensions of `input`. If the FFT length used\nto compute `input` is odd, it should be provided since it cannot be inferred\nproperly.\n\nAlong each axis `IRFFT3D` is computed on, if `fft_length` (or\n`fft_length / 2 + 1` for the inner-most dimension) is smaller than the\ncorresponding dimension of `input`, the dimension is cropped. If it is larger,\nthe dimension is padded with zeros."
 }
 op {
   name: "Identity"
@@ -10832,7 +10127,6 @@ op {
     name: "T"
     type: "type"
   }
-  summary: "Return a tensor with the same shape and contents as the input tensor or value."
 }
 op {
   name: "IdentityN"
@@ -10850,14 +10144,11 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Returns a list of tensors with the same shapes and contents as the input"
-  description: "tensors.\n\nThis op can be used to override the gradient for complicated functions. For\nexample, suppose y = f(x) and we wish to apply a custom function g for backprop\nsuch that dx = g(dy). In Python,\n\n```python\nwith tf.get_default_graph().gradient_override_map(\n    {\'IdentityN\': \'OverrideGradientWithG\'}):\n  y, _ = identity_n([f(x), x])\n\n@tf.RegisterGradient(\'OverrideGradientWithG\')\ndef ApplyG(op, dy, _):\n  return [None, g(dy)]  # Do not backprop to f(x).\n```"
 }
 op {
   name: "IdentityReader"
   output_arg {
     name: "reader_handle"
-    description: "The handle to reference the Reader."
     type: DT_STRING
     is_ref: true
   }
@@ -10867,7 +10158,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this reader is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -10875,17 +10165,17 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this reader is named in the given bucket\nwith this shared_name. Otherwise, the node name is used instead."
   }
-  summary: "A Reader that outputs the queued work as both the key and value."
-  description: "To use, enqueue strings in a Queue.  ReaderRead will take the front\nwork string and output (work, work)."
+  deprecation {
+    version: 26
+    explanation: "Use IdentityReaderV2"
+  }
   is_stateful: true
 }
 op {
   name: "IdentityReaderV2"
   output_arg {
     name: "reader_handle"
-    description: "The handle to reference the Reader."
     type: DT_RESOURCE
   }
   attr {
@@ -10894,7 +10184,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this reader is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -10902,10 +10191,7 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this reader is named in the given bucket\nwith this shared_name. Otherwise, the node name is used instead."
   }
-  summary: "A Reader that outputs the queued work as both the key and value."
-  description: "To use, enqueue strings in a Queue.  ReaderRead will take the front\nwork string and output (work, work)."
   is_stateful: true
 }
 op {
@@ -10932,8 +10218,6 @@ op {
       }
     }
   }
-  summary: "Compute the lower regularized incomplete Gamma function `Q(a, x)`."
-  description: "The lower regularized incomplete Gamma function is defined as:\n\n\n\\\\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\\\)\n\nwhere\n\n\\\\(gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\\\\)\n\nis the lower incomplete Gamma function.\n\nNote, above `Q(a, x)` (`Igammac`) is the upper regularized complete\nGamma function."
 }
 op {
   name: "Igammac"
@@ -10959,32 +10243,6 @@ op {
       }
     }
   }
-  summary: "Compute the upper regularized incomplete Gamma function `Q(a, x)`."
-  description: "The upper regularized incomplete Gamma function is defined as:\n\n\\\\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\\\)\n\nwhere\n\n\\\\(Gamma(a, x) = int_{x}^{\\infty} t^{a-1} exp(-t) dt\\\\)\n\nis the upper incomplete Gama function.\n\nNote, above `P(a, x)` (`Igamma`) is the lower regularized complete\nGamma function."
-}
-op {
-  name: "IgnoreErrorsDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  summary: "Creates a dataset that contains the elements of `input_dataset` ignoring errors."
 }
 op {
   name: "Imag"
@@ -11022,24 +10280,19 @@ op {
       }
     }
   }
-  summary: "Returns the imaginary part of a complex number."
-  description: "Given a tensor `input` of complex numbers, this operation returns a tensor of\ntype `float` that is the imaginary part of each element in `input`. All\nelements in `input` must be complex numbers of the form \\\\(a + bj\\\\), where *a*\nis the real part and *b* is the imaginary part returned by this operation.\n\nFor example:\n\n```\n# tensor \'input\' is [-2.25 + 4.75j, 3.25 + 5.75j]\ntf.imag(input) ==> [4.75, 5.75]\n```"
 }
 op {
   name: "ImageSummary"
   input_arg {
     name: "tag"
-    description: "Scalar. Used to build the `tag` attribute of the summary values."
     type: DT_STRING
   }
   input_arg {
     name: "tensor"
-    description: "4-D of shape `[batch_size, height, width, channels]` where\n`channels` is 1, 3, or 4."
     type_attr: "T"
   }
   output_arg {
     name: "summary"
-    description: "Scalar. Serialized `Summary` protocol buffer."
     type: DT_STRING
   }
   attr {
@@ -11048,7 +10301,6 @@ op {
     default_value {
       i: 3
     }
-    description: "Max number of batch elements to generate images for."
     has_minimum: true
     minimum: 1
   }
@@ -11084,10 +10336,7 @@ op {
         int_val: 255
       }
     }
-    description: "Color to use for pixels with non-finite values."
   }
-  summary: "Outputs a `Summary` protocol buffer with images."
-  description: "The summary has up to `max_images` summary values containing images. The\nimages are built from `tensor` which must be 4-D with shape `[batch_size,\nheight, width, channels]` and where `channels` can be:\n\n*  1: `tensor` is interpreted as Grayscale.\n*  3: `tensor` is interpreted as RGB.\n*  4: `tensor` is interpreted as RGBA.\n\nThe images have the same number of channels as the input tensor. For float\ninput, the values are normalized one image at a time to fit in the range\n`[0, 255]`.  `uint8` values are unchanged.  The op uses two different\nnormalization algorithms:\n\n*  If the input values are all positive, they are rescaled so the largest one\n   is 255.\n\n*  If any input value is negative, the values are shifted so input value 0.0\n   is at 127.  They are then rescaled so that either the smallest value is 0,\n   or the largest one is 255.\n\nThe `tag` argument is a scalar `Tensor` of type `string`.  It is used to\nbuild the `tag` of the summary values:\n\n*  If `max_images` is 1, the summary value tag is \'*tag*/image\'.\n*  If `max_images` is greater than 1, the summary value tags are\n   generated sequentially as \'*tag*/image/0\', \'*tag*/image/1\', etc.\n\nThe `bad_color` argument is the color to use in the generated images for\nnon-finite input values.  It is a `unit8` 1-D tensor of length `channels`.\nEach element must be in the range `[0, 255]` (It represents the value of a\npixel in the output image).  Non-finite values in the input tensor are\nreplaced by this tensor in the output image.  The default value is the color\nred."
 }
 op {
   name: "ImmutableConst"
@@ -11098,42 +10347,33 @@ op {
   attr {
     name: "dtype"
     type: "type"
-    description: "Type of the returned tensor."
   }
   attr {
     name: "shape"
     type: "shape"
-    description: "Shape of the returned tensor."
   }
   attr {
     name: "memory_region_name"
     type: "string"
-    description: "Name of readonly memory region used by the tensor, see\nNewReadOnlyMemoryRegionFromFile in tensorflow::Env."
   }
-  summary: "Returns immutable tensor from memory region."
-  description: "The current implementation memmaps the tensor from a file."
 }
 op {
   name: "InTopK"
   input_arg {
     name: "predictions"
-    description: "A `batch_size` x `classes` tensor."
     type: DT_FLOAT
   }
   input_arg {
     name: "targets"
-    description: "A `batch_size` vector of class ids."
     type_attr: "T"
   }
   output_arg {
     name: "precision"
-    description: "Computed Precision at `k` as a `bool Tensor`."
     type: DT_BOOL
   }
   attr {
     name: "k"
     type: "int"
-    description: "Number of top elements to look at for computing precision."
   }
   attr {
     name: "T"
@@ -11148,29 +10388,23 @@ op {
       }
     }
   }
-  summary: "Says whether the targets are in the top `K` predictions."
-  description: "This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the\nprediction for the target class is among the top `k` predictions among\nall predictions for example `i`. Note that the behavior of `InTopK` differs\nfrom the `TopK` op in its handling of ties; if multiple classes have the\nsame prediction value and straddle the top-`k` boundary, all of those\nclasses are considered to be in the top `k`.\n\nMore formally, let\n\n  \\\\(predictions_i\\\\) be the predictions for all classes for example `i`,\n  \\\\(targets_i\\\\) be the target class for example `i`,\n  \\\\(out_i\\\\) be the output for example `i`,\n\n$$out_i = predictions_{i, targets_i} \\in TopKIncludingTies(predictions_i)$$"
 }
 op {
   name: "InTopKV2"
   input_arg {
     name: "predictions"
-    description: "A `batch_size` x `classes` tensor."
     type: DT_FLOAT
   }
   input_arg {
     name: "targets"
-    description: "A `batch_size` vector of class ids."
     type_attr: "T"
   }
   input_arg {
     name: "k"
-    description: "Number of top elements to look at for computing precision."
     type_attr: "T"
   }
   output_arg {
     name: "precision"
-    description: "Computed precision at `k` as a `bool Tensor`."
     type: DT_BOOL
   }
   attr {
@@ -11186,25 +10420,20 @@ op {
       }
     }
   }
-  summary: "Says whether the targets are in the top `K` predictions."
-  description: "This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the\nprediction for the target class is among the top `k` predictions among\nall predictions for example `i`. Note that the behavior of `InTopK` differs\nfrom the `TopK` op in its handling of ties; if multiple classes have the\nsame prediction value and straddle the top-`k` boundary, all of those\nclasses are considered to be in the top `k`.\n\nMore formally, let\n\n  \\\\(predictions_i\\\\) be the predictions for all classes for example `i`,\n  \\\\(targets_i\\\\) be the target class for example `i`,\n  \\\\(out_i\\\\) be the output for example `i`,\n\n$$out_i = predictions_{i, targets_i} \\in TopKIncludingTies(predictions_i)$$"
 }
 op {
   name: "InitializeTable"
   input_arg {
     name: "table_handle"
-    description: "Handle to a table which will be initialized."
     type: DT_STRING
     is_ref: true
   }
   input_arg {
     name: "keys"
-    description: "Keys of type Tkey."
     type_attr: "Tkey"
   }
   input_arg {
     name: "values"
-    description: "Values of type Tval."
     type_attr: "Tval"
   }
   attr {
@@ -11215,32 +10444,27 @@ op {
     name: "Tval"
     type: "type"
   }
-  summary: "Table initializer that takes two tensors for keys and values respectively."
 }
 op {
   name: "InitializeTableFromTextFile"
   input_arg {
     name: "table_handle"
-    description: "Handle to a table which will be initialized."
     type: DT_STRING
     is_ref: true
   }
   input_arg {
     name: "filename"
-    description: "Filename of a vocabulary text file."
     type: DT_STRING
   }
   attr {
     name: "key_index"
     type: "int"
-    description: "Column index in a line to get the table `key` values from."
     has_minimum: true
     minimum: -2
   }
   attr {
     name: "value_index"
     type: "int"
-    description: "Column index that represents information of a line to get the table\n`value` values from."
     has_minimum: true
     minimum: -2
   }
@@ -11250,7 +10474,6 @@ op {
     default_value {
       i: -1
     }
-    description: "Number of elements of the file, use -1 if unknown."
     has_minimum: true
     minimum: -1
   }
@@ -11260,34 +10483,27 @@ op {
     default_value {
       s: "\t"
     }
-    description: "Delimiter to separate fields in a line."
   }
-  summary: "Initializes a table from a text file."
-  description: "It inserts one key-value pair into the table for each line of the file.\nThe key and value is extracted from the whole line content, elements from the\nsplit line based on `delimiter` or the line number (starting from zero).\nWhere to extract the key and value from a line is specified by `key_index` and\n`value_index`.\n\n- A value of -1 means use the line number(starting from zero), expects `int64`.\n- A value of -2 means use the whole line content, expects `string`.\n- A value >= 0 means use the index (starting at zero) of the split line based\n  on `delimiter`."
 }
 op {
   name: "InitializeTableFromTextFileV2"
   input_arg {
     name: "table_handle"
-    description: "Handle to a table which will be initialized."
     type: DT_RESOURCE
   }
   input_arg {
     name: "filename"
-    description: "Filename of a vocabulary text file."
     type: DT_STRING
   }
   attr {
     name: "key_index"
     type: "int"
-    description: "Column index in a line to get the table `key` values from."
     has_minimum: true
     minimum: -2
   }
   attr {
     name: "value_index"
     type: "int"
-    description: "Column index that represents information of a line to get the table\n`value` values from."
     has_minimum: true
     minimum: -2
   }
@@ -11297,7 +10513,6 @@ op {
     default_value {
       i: -1
     }
-    description: "Number of elements of the file, use -1 if unknown."
     has_minimum: true
     minimum: -1
   }
@@ -11307,27 +10522,21 @@ op {
     default_value {
       s: "\t"
     }
-    description: "Delimiter to separate fields in a line."
   }
-  summary: "Initializes a table from a text file."
-  description: "It inserts one key-value pair into the table for each line of the file.\nThe key and value is extracted from the whole line content, elements from the\nsplit line based on `delimiter` or the line number (starting from zero).\nWhere to extract the key and value from a line is specified by `key_index` and\n`value_index`.\n\n- A value of -1 means use the line number(starting from zero), expects `int64`.\n- A value of -2 means use the whole line content, expects `string`.\n- A value >= 0 means use the index (starting at zero) of the split line based\n  on `delimiter`."
   is_stateful: true
 }
 op {
   name: "InitializeTableV2"
   input_arg {
     name: "table_handle"
-    description: "Handle to a table which will be initialized."
     type: DT_RESOURCE
   }
   input_arg {
     name: "keys"
-    description: "Keys of type Tkey."
     type_attr: "Tkey"
   }
   input_arg {
     name: "values"
-    description: "Values of type Tval."
     type_attr: "Tval"
   }
   attr {
@@ -11338,7 +10547,6 @@ op {
     name: "Tval"
     type: "type"
   }
-  summary: "Table initializer that takes two tensors for keys and values respectively."
   is_stateful: true
 }
 op {
@@ -11366,7 +10574,6 @@ op {
   attr {
     name: "f"
     type: "func"
-    description: "A function mapping elements of `input_dataset`, concatenated with\n`other_arguments`, to a Dataset variant that contains elements matching\n`output_types` and `output_shapes`."
   }
   attr {
     name: "Targuments"
@@ -11385,8 +10592,6 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
-  description: "Unlike MapDataset, the `f` in InterleaveDataset is expected to return\na Dataset variant, and InterleaveDataset will flatten successive\nresults into a single Dataset. Unlike FlatMapDataset,\nInterleaveDataset will interleave sequences of up to `block_length`\nconsecutive elements from `cycle_length` input elements."
 }
 op {
   name: "Inv"
@@ -11404,6 +10609,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -11413,12 +10619,6 @@ op {
       }
     }
   }
-  summary: "Computes the reciprocal of x element-wise."
-  description: "I.e., \\\\(y = 1 / x\\\\)."
-  deprecation {
-    version: 17
-    explanation: "Use Reciprocal"
-  }
 }
 op {
   name: "InvGrad"
@@ -11440,6 +10640,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -11447,12 +10648,6 @@ op {
       }
     }
   }
-  summary: "Computes the gradient for the inverse of `x` wrt its input."
-  description: "Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`\nis the corresponding input gradient."
-  deprecation {
-    version: 17
-    explanation: "Use ReciprocalGrad"
-  }
 }
 op {
   name: "Invert"
@@ -11480,19 +10675,15 @@ op {
       }
     }
   }
-  summary: "Flips all bits elementwise."
-  description: "The result will have exactly those bits set, that are not set in `x`. The\ncomputation is performed on the underlying representation of x."
 }
 op {
   name: "InvertPermutation"
   input_arg {
     name: "x"
-    description: "1-D."
     type_attr: "T"
   }
   output_arg {
     name: "y"
-    description: "1-D."
     type_attr: "T"
   }
   attr {
@@ -11508,8 +10699,6 @@ op {
       }
     }
   }
-  summary: "Computes the inverse permutation of a tensor."
-  description: "This operation computes the inverse of an index permutation. It takes a 1-D\ninteger tensor `x`, which represents the indices of a zero-based array, and\nswaps each value with its index position. In other words, for an output tensor\n`y` and an input tensor `x`, this operation computes the following:\n\n`y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`\n\nThe values must include 0. There can be no duplicate values or negative values.\n\nFor example:\n\n```\n# tensor `x` is [3, 4, 0, 2, 1]\ninvert_permutation(x) ==> [2, 4, 3, 0, 1]\n```"
 }
 op {
   name: "IsFinite"
@@ -11527,13 +10716,12 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Returns which elements of x are finite."
-  description: "@compatibility(numpy)\nEquivalent to np.isfinite\n@end_compatibility"
 }
 op {
   name: "IsInf"
@@ -11551,13 +10739,12 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Returns which elements of x are Inf."
-  description: "@compatibility(numpy)\nEquivalent to np.isinf\n@end_compatibility"
 }
 op {
   name: "IsNan"
@@ -11575,19 +10762,17 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Returns which elements of x are NaN."
-  description: "@compatibility(numpy)\nEquivalent to np.isnan\n@end_compatibility"
 }
 op {
   name: "IsVariableInitialized"
   input_arg {
     name: "ref"
-    description: "Should be from a `Variable` node. May be uninitialized."
     type_attr: "dtype"
     is_ref: true
   }
@@ -11598,17 +10783,13 @@ op {
   attr {
     name: "dtype"
     type: "type"
-    description: "The type of elements in the variable tensor."
   }
-  summary: "Checks whether a tensor has been initialized."
-  description: "Outputs boolean scalar indicating whether the tensor has been initialized."
   allows_uninitialized_input: true
 }
 op {
   name: "Iterator"
   output_arg {
     name: "handle"
-    description: "A handle to the iterator that can be passed to a \"MakeIterator\"\nor \"IteratorGetNext\" op."
     type: DT_RESOURCE
   }
   attr {
@@ -11631,19 +10812,16 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "A container for an iterator resource."
   is_stateful: true
 }
 op {
   name: "IteratorFromStringHandle"
   input_arg {
     name: "string_handle"
-    description: "A string representation of the given handle."
     type: DT_STRING
   }
   output_arg {
     name: "resource_handle"
-    description: "A handle to an iterator resource."
     type: DT_RESOURCE
   }
   attr {
@@ -11653,7 +10831,6 @@ op {
       list {
       }
     }
-    description: "If specified, defines the type of each tuple component in an\nelement produced by the resulting iterator."
     has_minimum: true
   }
   attr {
@@ -11663,10 +10840,8 @@ op {
       list {
       }
     }
-    description: "If specified, defines the shape of each tuple component in an\nelement produced by the resulting iterator."
     has_minimum: true
   }
-  summary: "Converts the given string representing a handle to an iterator to a resource."
   is_stateful: true
 }
 op {
@@ -11691,7 +10866,30 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Gets the next output from the given iterator."
+  is_stateful: true
+}
+op {
+  name: "IteratorGetNextSync"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
   is_stateful: true
 }
 op {
@@ -11704,34 +10902,28 @@ op {
     name: "stats_aggregator_handle"
     type: DT_RESOURCE
   }
-  summary: "Associates the given iterator with the given statistics aggregator."
   is_stateful: true
 }
 op {
   name: "IteratorToStringHandle"
   input_arg {
     name: "resource_handle"
-    description: "A handle to an iterator resource."
     type: DT_RESOURCE
   }
   output_arg {
     name: "string_handle"
-    description: "A string representation of the given handle."
     type: DT_STRING
   }
-  summary: "Converts the given `resource_handle` representing an iterator to a string."
   is_stateful: true
 }
 op {
   name: "L2Loss"
   input_arg {
     name: "t"
-    description: "Typically 2-D, but may have any dimensions."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "0-D."
     type_attr: "T"
   }
   attr {
@@ -11740,19 +10932,17 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "L2 Loss."
-  description: "Computes half the L2 norm of a tensor without the `sqrt`:\n\n    output = sum(t ** 2) / 2"
 }
 op {
   name: "LMDBReader"
   output_arg {
     name: "reader_handle"
-    description: "The handle to reference the Reader."
     type: DT_STRING
     is_ref: true
   }
@@ -11762,7 +10952,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this reader is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -11770,16 +10959,13 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this reader is named in the given bucket\nwith this shared_name. Otherwise, the node name is used instead."
   }
-  summary: "A Reader that outputs the records from a LMDB file."
   is_stateful: true
 }
 op {
   name: "LRN"
   input_arg {
     name: "input"
-    description: "4-D."
     type_attr: "T"
   }
   output_arg {
@@ -11792,7 +10978,6 @@ op {
     default_value {
       i: 5
     }
-    description: "0-D.  Half-width of the 1-D normalization window."
   }
   attr {
     name: "bias"
@@ -11800,7 +10985,6 @@ op {
     default_value {
       f: 1
     }
-    description: "An offset (usually positive to avoid dividing by 0)."
   }
   attr {
     name: "alpha"
@@ -11808,7 +10992,6 @@ op {
     default_value {
       f: 1
     }
-    description: "A scale factor, usually positive."
   }
   attr {
     name: "beta"
@@ -11816,7 +10999,6 @@ op {
     default_value {
       f: 0.5
     }
-    description: "An exponent."
   }
   attr {
     name: "T"
@@ -11826,34 +11008,29 @@ op {
     }
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
       }
     }
   }
-  summary: "Local Response Normalization."
-  description: "The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last\ndimension), and each vector is normalized independently.  Within a given vector,\neach component is divided by the weighted, squared sum of inputs within\n`depth_radius`.  In detail,\n\n    sqr_sum[a, b, c, d] =\n        sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)\n    output = input / (bias + alpha * sqr_sum) ** beta\n\nFor details, see [Krizhevsky et al., ImageNet classification with deep\nconvolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks)."
 }
 op {
   name: "LRNGrad"
   input_arg {
     name: "input_grads"
-    description: "4-D with shape `[batch, height, width, channels]`."
     type_attr: "T"
   }
   input_arg {
     name: "input_image"
-    description: "4-D with shape `[batch, height, width, channels]`."
     type_attr: "T"
   }
   input_arg {
     name: "output_image"
-    description: "4-D with shape `[batch, height, width, channels]`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "The gradients for LRN."
     type_attr: "T"
   }
   attr {
@@ -11862,7 +11039,6 @@ op {
     default_value {
       i: 5
     }
-    description: "A depth radius."
   }
   attr {
     name: "bias"
@@ -11870,7 +11046,6 @@ op {
     default_value {
       f: 1
     }
-    description: "An offset (usually > 0 to avoid dividing by 0)."
   }
   attr {
     name: "alpha"
@@ -11878,7 +11053,6 @@ op {
     default_value {
       f: 1
     }
-    description: "A scale factor, usually positive."
   }
   attr {
     name: "beta"
@@ -11886,7 +11060,6 @@ op {
     default_value {
       f: 0.5
     }
-    description: "An exponent."
   }
   attr {
     name: "T"
@@ -11896,12 +11069,12 @@ op {
     }
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
       }
     }
   }
-  summary: "Gradients for Local Response Normalization."
 }
 op {
   name: "LatencyStatsDataset"
@@ -11929,53 +11102,44 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Records the latency of producing `input_dataset` elements in a StatsAggregator."
 }
 op {
   name: "LearnedUnigramCandidateSampler"
   input_arg {
     name: "true_classes"
-    description: "A batch_size * num_true matrix, in which each row contains the\nIDs of the num_true target_classes in the corresponding original label."
     type: DT_INT64
   }
   output_arg {
     name: "sampled_candidates"
-    description: "A vector of length num_sampled, in which each element is\nthe ID of a sampled candidate."
     type: DT_INT64
   }
   output_arg {
     name: "true_expected_count"
-    description: "A batch_size * num_true matrix, representing\nthe number of times each candidate is expected to occur in a batch\nof sampled candidates. If unique=true, then this is a probability."
     type: DT_FLOAT
   }
   output_arg {
     name: "sampled_expected_count"
-    description: "A vector of length num_sampled, for each sampled\ncandidate representing the number of times the candidate is expected\nto occur in a batch of sampled candidates.  If unique=true, then this is a\nprobability."
     type: DT_FLOAT
   }
   attr {
     name: "num_true"
     type: "int"
-    description: "Number of true labels per context."
     has_minimum: true
     minimum: 1
   }
   attr {
     name: "num_sampled"
     type: "int"
-    description: "Number of candidates to randomly sample."
     has_minimum: true
     minimum: 1
   }
   attr {
     name: "unique"
     type: "bool"
-    description: "If unique is true, we sample with rejection, so that all sampled\ncandidates in a batch are unique. This requires some approximation to\nestimate the post-rejection sampling probabilities."
   }
   attr {
     name: "range_max"
     type: "int"
-    description: "The sampler will sample integers from the interval [0, range_max)."
     has_minimum: true
     minimum: 1
   }
@@ -11985,7 +11149,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either seed or seed2 are set to be non-zero, the random number\ngenerator is seeded by the given seed.  Otherwise, it is seeded by a\nrandom seed."
   }
   attr {
     name: "seed2"
@@ -11993,10 +11156,7 @@ op {
     default_value {
       i: 0
     }
-    description: "An second seed to avoid seed collision."
   }
-  summary: "Generates labels for candidate sampling with a learned unigram distribution."
-  description: "See explanations of candidate sampling and the data formats at\ngo/candidate-sampling.\n\nFor each batch, this op picks a single set of sampled candidate labels.\n\nThe advantages of sampling candidates per-batch are simplicity and the\npossibility of efficient dense matrix multiplication. The disadvantage is that\nthe sampled candidates must be chosen independently of the context and of the\ntrue labels."
   is_stateful: true
 }
 op {
@@ -12029,8 +11189,6 @@ op {
       }
     }
   }
-  summary: "Elementwise computes the bitwise left-shift of `x` and `y`."
-  description: "If `y` is negative, or greater than or equal to the width of `x` in bits the\nresult is implementation defined."
   is_commutative: true
 }
 op {
@@ -12055,10 +11213,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -12066,8 +11225,6 @@ op {
       }
     }
   }
-  summary: "Returns the truth value of (x < y) element-wise."
-  description: "*NOTE*: `Less` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "LessEqual"
@@ -12091,10 +11248,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -12102,8 +11260,6 @@ op {
       }
     }
   }
-  summary: "Returns the truth value of (x <= y) element-wise."
-  description: "*NOTE*: `LessEqual` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "Lgamma"
@@ -12121,33 +11277,29 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Computes the log of the absolute value of `Gamma(x)` element-wise."
 }
 op {
   name: "LinSpace"
   input_arg {
     name: "start"
-    description: "First entry in the range."
     type_attr: "T"
   }
   input_arg {
     name: "stop"
-    description: "Last entry in the range."
     type_attr: "T"
   }
   input_arg {
     name: "num"
-    description: "Number of values to generate."
     type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    description: "1-D. The generated values."
     type_attr: "T"
   }
   attr {
@@ -12155,6 +11307,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -12173,29 +11326,23 @@ op {
       }
     }
   }
-  summary: "Generates values in an interval."
-  description: "A sequence of `num` evenly-spaced values are generated beginning at `start`.\nIf `num > 1`, the values in the sequence increase by `stop - start / num - 1`,\nso that the last one is exactly `stop`.\n\nFor example:\n\n```\ntf.linspace(10.0, 12.0, 3, name=\"linspace\") => [ 10.0  11.0  12.0]\n```"
 }
 op {
   name: "ListDiff"
   input_arg {
     name: "x"
-    description: "1-D. Values to keep."
     type_attr: "T"
   }
   input_arg {
     name: "y"
-    description: "1-D. Values to remove."
     type_attr: "T"
   }
   output_arg {
     name: "out"
-    description: "1-D. Values present in `x` but not in `y`."
     type_attr: "T"
   }
   output_arg {
     name: "idx"
-    description: "1-D. Positions of `x` values preserved in `out`."
     type_attr: "out_idx"
   }
   attr {
@@ -12215,51 +11362,41 @@ op {
       }
     }
   }
-  summary: "Computes the difference between two lists of numbers or strings."
-  description: "Given a list `x` and a list `y`, this operation returns a list `out` that\nrepresents all values that are in `x` but not in `y`. The returned list `out`\nis sorted in the same order that the numbers appear in `x` (duplicates are\npreserved). This operation also returns a list `idx` that represents the\nposition of each `out` element in `x`. In other words:\n\n`out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`\n\nFor example, given this input:\n\n```\nx = [1, 2, 3, 4, 5, 6]\ny = [1, 3, 5]\n```\n\nThis operation would return:\n\n```\nout ==> [2, 4, 6]\nidx ==> [1, 3, 5]\n```"
 }
 op {
   name: "LoadAndRemapMatrix"
   input_arg {
     name: "ckpt_path"
-    description: "Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from\nwhich the old matrix `Tensor` will be loaded."
     type: DT_STRING
   }
   input_arg {
     name: "old_tensor_name"
-    description: "Name of the 2-D `Tensor` to load from checkpoint."
     type: DT_STRING
   }
   input_arg {
     name: "row_remapping"
-    description: "An int `Tensor` of row remappings (generally created by\n`generate_vocab_remapping`).  Even if no row remapping is needed, this must\nstill be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted\nindex-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`)."
     type: DT_INT64
   }
   input_arg {
     name: "col_remapping"
-    description: "An int `Tensor` of column remappings (generally created by\n`generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping\nis to be done (e.g. column ordering is the same)."
     type: DT_INT64
   }
   input_arg {
     name: "initializing_values"
-    description: "A float `Tensor` containing  values to fill in for cells\nin the output matrix that are not loaded from the checkpoint. Length must be\nexactly the same as the number of missing / new cells."
     type: DT_FLOAT
   }
   output_arg {
     name: "output_matrix"
-    description: "Output matrix containing existing values loaded from the\ncheckpoint, and with any missing values filled in from initializing_values."
     type: DT_FLOAT
   }
   attr {
     name: "num_rows"
     type: "int"
-    description: "Number of rows (length of the 1st dimension) in the output matrix."
     has_minimum: true
   }
   attr {
     name: "num_cols"
     type: "int"
-    description: "Number of columns (length of the 2nd dimension) in the output matrix."
     has_minimum: true
     minimum: 1
   }
@@ -12269,10 +11406,7 @@ op {
     default_value {
       i: -1
     }
-    description: "The maximum number of rows to load from the checkpoint at\nonce. If less than or equal to 0, the entire matrix will be loaded into\nmemory. Setting this arg trades increased disk reads for lower memory usage."
   }
-  summary: "Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint"
-  description: "at `ckpt_path` and potentially reorders its rows and columns using the\nspecified remappings.\n\nMost users should use one of the wrapper initializers (such as\n`tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this\nfunction directly.\n\nThe remappings are 1-D tensors with the following properties:\n\n* `row_remapping` must have exactly `num_rows` entries. Row `i` of the output\n  matrix will be initialized from the row corresponding to index\n  `row_remapping[i]` in the old `Tensor` from the checkpoint.\n* `col_remapping` must have either 0 entries (indicating that no column\n  reordering is needed) or `num_cols` entries. If specified, column `j` of the\n  output matrix will be initialized from the column corresponding to index\n  `col_remapping[j]` in the old `Tensor` from the checkpoint.\n* A value of -1 in either of the remappings signifies a \"missing\" entry. In that\n  case, values from the `initializing_values` tensor will be used to fill that\n  missing row or column. If `row_remapping` has `r` missing entries and\n  `col_remapping` has `c` missing entries, then the following condition must be\n  true:\n\n`(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`\n\nThe remapping tensors can be generated using the GenerateVocabRemapping op.\n\nAs an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],\ninitializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing\nthe value from row i, column j of the old tensor in the checkpoint, the output\nmatrix will look like the following:\n\n[[w(1, 0),  w(1, 2),  0.5],\n [w(0, 0),  w(0, 2), -0.5],\n [0.25,    -0.25,      42]]"
   is_stateful: true
 }
 op {
@@ -12291,6 +11425,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -12298,8 +11433,6 @@ op {
       }
     }
   }
-  summary: "Computes natural logarithm of x element-wise."
-  description: "I.e., \\\\(y = \\log_e x\\\\)."
 }
 op {
   name: "Log1p"
@@ -12317,6 +11450,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -12324,24 +11458,19 @@ op {
       }
     }
   }
-  summary: "Computes natural logarithm of (1 + x) element-wise."
-  description: "I.e., \\\\(y = \\log_e (1 + x)\\\\)."
 }
 op {
   name: "LogMatrixDeterminant"
   input_arg {
     name: "input"
-    description: "Shape is `[N, M, M]`."
     type_attr: "T"
   }
   output_arg {
     name: "sign"
-    description: "The signs of the log determinants of the inputs. Shape is `[N]`."
     type_attr: "T"
   }
   output_arg {
     name: "log_abs_determinant"
-    description: "The logs of the absolute values of the determinants\nof the N input matrices.  Shape is `[N]`."
     type_attr: "T"
   }
   attr {
@@ -12356,19 +11485,15 @@ op {
       }
     }
   }
-  summary: "Computes the sign and the log of the absolute value of the determinant of"
-  description: "one or more square matrices.\n\nThe input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions\nform square matrices. The outputs are two tensors containing the signs and\nabsolute values of the log determinants for all N input submatrices\n`[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).\nThe log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU\nis the LU decomposition of the input and P is the corresponding\npermutation matrix."
 }
 op {
   name: "LogSoftmax"
   input_arg {
     name: "logits"
-    description: "2-D with shape `[batch_size, num_classes]`."
     type_attr: "T"
   }
   output_arg {
     name: "logsoftmax"
-    description: "Same shape as `logits`."
     type_attr: "T"
   }
   attr {
@@ -12377,59 +11502,50 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Computes log softmax activations."
-  description: "For each batch `i` and class `j` we have\n\n    logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))"
 }
 op {
   name: "LogUniformCandidateSampler"
   input_arg {
     name: "true_classes"
-    description: "A batch_size * num_true matrix, in which each row contains the\nIDs of the num_true target_classes in the corresponding original label."
     type: DT_INT64
   }
   output_arg {
     name: "sampled_candidates"
-    description: "A vector of length num_sampled, in which each element is\nthe ID of a sampled candidate."
     type: DT_INT64
   }
   output_arg {
     name: "true_expected_count"
-    description: "A batch_size * num_true matrix, representing\nthe number of times each candidate is expected to occur in a batch\nof sampled candidates. If unique=true, then this is a probability."
     type: DT_FLOAT
   }
   output_arg {
     name: "sampled_expected_count"
-    description: "A vector of length num_sampled, for each sampled\ncandidate representing the number of times the candidate is expected\nto occur in a batch of sampled candidates.  If unique=true, then this is a\nprobability."
     type: DT_FLOAT
   }
   attr {
     name: "num_true"
     type: "int"
-    description: "Number of true labels per context."
     has_minimum: true
     minimum: 1
   }
   attr {
     name: "num_sampled"
     type: "int"
-    description: "Number of candidates to randomly sample."
     has_minimum: true
     minimum: 1
   }
   attr {
     name: "unique"
     type: "bool"
-    description: "If unique is true, we sample with rejection, so that all sampled\ncandidates in a batch are unique. This requires some approximation to\nestimate the post-rejection sampling probabilities."
   }
   attr {
     name: "range_max"
     type: "int"
-    description: "The sampler will sample integers from the interval [0, range_max)."
     has_minimum: true
     minimum: 1
   }
@@ -12439,7 +11555,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either seed or seed2 are set to be non-zero, the random number\ngenerator is seeded by the given seed.  Otherwise, it is seeded by a\nrandom seed."
   }
   attr {
     name: "seed2"
@@ -12447,10 +11562,7 @@ op {
     default_value {
       i: 0
     }
-    description: "An second seed to avoid seed collision."
   }
-  summary: "Generates labels for candidate sampling with a log-uniform distribution."
-  description: "See explanations of candidate sampling and the data formats at\ngo/candidate-sampling.\n\nFor each batch, this op picks a single set of sampled candidate labels.\n\nThe advantages of sampling candidates per-batch are simplicity and the\npossibility of efficient dense matrix multiplication. The disadvantage is that\nthe sampled candidates must be chosen independently of the context and of the\ntrue labels."
   is_stateful: true
 }
 op {
@@ -12467,8 +11579,6 @@ op {
     name: "z"
     type: DT_BOOL
   }
-  summary: "Returns the truth value of x AND y element-wise."
-  description: "*NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
   is_commutative: true
 }
 op {
@@ -12481,7 +11591,6 @@ op {
     name: "y"
     type: DT_BOOL
   }
-  summary: "Returns the truth value of NOT x element-wise."
 }
 op {
   name: "LogicalOr"
@@ -12497,26 +11606,21 @@ op {
     name: "z"
     type: DT_BOOL
   }
-  summary: "Returns the truth value of x OR y element-wise."
-  description: "*NOTE*: `LogicalOr` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
   is_commutative: true
 }
 op {
   name: "LookupTableExport"
   input_arg {
     name: "table_handle"
-    description: "Handle to the table."
     type: DT_STRING
     is_ref: true
   }
   output_arg {
     name: "keys"
-    description: "Vector of all keys present in the table."
     type_attr: "Tkeys"
   }
   output_arg {
     name: "values"
-    description: "Tensor of all values in the table. Indexed in parallel with `keys`."
     type_attr: "Tvalues"
   }
   attr {
@@ -12527,23 +11631,19 @@ op {
     name: "Tvalues"
     type: "type"
   }
-  summary: "Outputs all keys and values in the table."
 }
 op {
   name: "LookupTableExportV2"
   input_arg {
     name: "table_handle"
-    description: "Handle to the table."
     type: DT_RESOURCE
   }
   output_arg {
     name: "keys"
-    description: "Vector of all keys present in the table."
     type_attr: "Tkeys"
   }
   output_arg {
     name: "values"
-    description: "Tensor of all values in the table. Indexed in parallel with `keys`."
     type_attr: "Tvalues"
   }
   attr {
@@ -12554,20 +11654,17 @@ op {
     name: "Tvalues"
     type: "type"
   }
-  summary: "Outputs all keys and values in the table."
   is_stateful: true
 }
 op {
   name: "LookupTableFind"
   input_arg {
     name: "table_handle"
-    description: "Handle to the table."
     type: DT_STRING
     is_ref: true
   }
   input_arg {
     name: "keys"
-    description: "Any shape.  Keys to look up."
     type_attr: "Tin"
   }
   input_arg {
@@ -12576,7 +11673,6 @@ op {
   }
   output_arg {
     name: "values"
-    description: "Same shape as `keys`.  Values found in the table, or `default_values`\nfor missing keys."
     type_attr: "Tout"
   }
   attr {
@@ -12587,19 +11683,15 @@ op {
     name: "Tout"
     type: "type"
   }
-  summary: "Looks up keys in a table, outputs the corresponding values."
-  description: "The tensor `keys` must of the same type as the keys of the table.\nThe output `values` is of the type of the table values.\n\nThe scalar `default_value` is the value output for keys not present in the\ntable. It must also be of the same type as the table values."
 }
 op {
   name: "LookupTableFindV2"
   input_arg {
     name: "table_handle"
-    description: "Handle to the table."
     type: DT_RESOURCE
   }
   input_arg {
     name: "keys"
-    description: "Any shape.  Keys to look up."
     type_attr: "Tin"
   }
   input_arg {
@@ -12608,7 +11700,6 @@ op {
   }
   output_arg {
     name: "values"
-    description: "Same shape as `keys`.  Values found in the table, or `default_values`\nfor missing keys."
     type_attr: "Tout"
   }
   attr {
@@ -12619,26 +11710,21 @@ op {
     name: "Tout"
     type: "type"
   }
-  summary: "Looks up keys in a table, outputs the corresponding values."
-  description: "The tensor `keys` must of the same type as the keys of the table.\nThe output `values` is of the type of the table values.\n\nThe scalar `default_value` is the value output for keys not present in the\ntable. It must also be of the same type as the table values."
   is_stateful: true
 }
 op {
   name: "LookupTableImport"
   input_arg {
     name: "table_handle"
-    description: "Handle to the table."
     type: DT_STRING
     is_ref: true
   }
   input_arg {
     name: "keys"
-    description: "Any shape.  Keys to look up."
     type_attr: "Tin"
   }
   input_arg {
     name: "values"
-    description: "Values to associate with keys."
     type_attr: "Tout"
   }
   attr {
@@ -12649,24 +11735,19 @@ op {
     name: "Tout"
     type: "type"
   }
-  summary: "Replaces the contents of the table with the specified keys and values."
-  description: "The tensor `keys` must be of the same type as the keys of the table.\nThe tensor `values` must be of the type of the table values."
 }
 op {
   name: "LookupTableImportV2"
   input_arg {
     name: "table_handle"
-    description: "Handle to the table."
     type: DT_RESOURCE
   }
   input_arg {
     name: "keys"
-    description: "Any shape.  Keys to look up."
     type_attr: "Tin"
   }
   input_arg {
     name: "values"
-    description: "Values to associate with keys."
     type_attr: "Tout"
   }
   attr {
@@ -12677,26 +11758,21 @@ op {
     name: "Tout"
     type: "type"
   }
-  summary: "Replaces the contents of the table with the specified keys and values."
-  description: "The tensor `keys` must be of the same type as the keys of the table.\nThe tensor `values` must be of the type of the table values."
   is_stateful: true
 }
 op {
   name: "LookupTableInsert"
   input_arg {
     name: "table_handle"
-    description: "Handle to the table."
     type: DT_STRING
     is_ref: true
   }
   input_arg {
     name: "keys"
-    description: "Any shape.  Keys to look up."
     type_attr: "Tin"
   }
   input_arg {
     name: "values"
-    description: "Values to associate with keys."
     type_attr: "Tout"
   }
   attr {
@@ -12707,24 +11783,19 @@ op {
     name: "Tout"
     type: "type"
   }
-  summary: "Updates the table to associates keys with values."
-  description: "The tensor `keys` must be of the same type as the keys of the table.\nThe tensor `values` must be of the type of the table values."
 }
 op {
   name: "LookupTableInsertV2"
   input_arg {
     name: "table_handle"
-    description: "Handle to the table."
     type: DT_RESOURCE
   }
   input_arg {
     name: "keys"
-    description: "Any shape.  Keys to look up."
     type_attr: "Tin"
   }
   input_arg {
     name: "values"
-    description: "Values to associate with keys."
     type_attr: "Tout"
   }
   attr {
@@ -12735,54 +11806,42 @@ op {
     name: "Tout"
     type: "type"
   }
-  summary: "Updates the table to associates keys with values."
-  description: "The tensor `keys` must be of the same type as the keys of the table.\nThe tensor `values` must be of the type of the table values."
   is_stateful: true
 }
 op {
   name: "LookupTableSize"
   input_arg {
     name: "table_handle"
-    description: "Handle to the table."
     type: DT_STRING
     is_ref: true
   }
   output_arg {
     name: "size"
-    description: "Scalar that contains number of elements in the table."
     type: DT_INT64
   }
-  summary: "Computes the number of elements in the given table."
 }
 op {
   name: "LookupTableSizeV2"
   input_arg {
     name: "table_handle"
-    description: "Handle to the table."
     type: DT_RESOURCE
   }
   output_arg {
     name: "size"
-    description: "Scalar that contains number of elements in the table."
     type: DT_INT64
   }
-  summary: "Computes the number of elements in the given table."
   is_stateful: true
 }
 op {
   name: "LoopCond"
   input_arg {
     name: "input"
-    description: "A boolean scalar, representing the branch predicate of the Switch op."
     type: DT_BOOL
   }
   output_arg {
     name: "output"
-    description: "The same tensor as `input`."
     type: DT_BOOL
   }
-  summary: "Forwards the input to the output."
-  description: "This operator represents the loop termination condition used by the\n\"pivot\" switches of a loop."
 }
 op {
   name: "MakeIterator"
@@ -12794,8 +11853,6 @@ op {
     name: "iterator"
     type: DT_RESOURCE
   }
-  summary: "Makes a new iterator from the given `dataset` and stores it in `iterator`."
-  description: "This operation may be executed multiple times. Each execution will reset the\niterator in `iterator` to the first element of `dataset`."
   is_stateful: true
 }
 op {
@@ -12810,12 +11867,10 @@ op {
   }
   input_arg {
     name: "batch_size"
-    description: "A scalar representing the number of elements to accumulate in a\nbatch. It determines the number of concurrent invocations of `f` that process\nelements from `input_dataset` in parallel."
     type: DT_INT64
   }
   input_arg {
     name: "num_parallel_batches"
-    description: "A scalar representing the number of batches to create in\nparallel. Processing multiple batches in parallel benefits workloads prone to\nstragglers."
     type: DT_INT64
   }
   output_arg {
@@ -12843,8 +11898,6 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset` and then"
-  description: "batches `batch_size` of them.\n\nUnlike a \"MapDataset\", which applies `f` sequentially, this dataset invokes up\nto `batch_size * num_parallel_batches` copies of `f` in parallel."
 }
 op {
   name: "MapClear"
@@ -12882,7 +11935,6 @@ op {
       s: ""
     }
   }
-  summary: "Op removes all elements in the underlying container."
   is_stateful: true
 }
 op {
@@ -12920,7 +11972,6 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
 }
 op {
   name: "MapIncompleteSize"
@@ -12962,7 +12013,6 @@ op {
       s: ""
     }
   }
-  summary: "Op returns the number of incomplete elements in the underlying container."
   is_stateful: true
 }
 op {
@@ -13015,8 +12065,6 @@ op {
       s: ""
     }
   }
-  summary: "Op peeks at the values at the specified key.  If the"
-  description: "underlying container does not contain this key\nthis op will block until it does."
   is_stateful: true
 }
 op {
@@ -13059,14 +12107,12 @@ op {
       s: ""
     }
   }
-  summary: "Op returns the number of elements in the underlying container."
   is_stateful: true
 }
 op {
   name: "MapStage"
   input_arg {
     name: "key"
-    description: "int64"
     type: DT_INT64
   }
   input_arg {
@@ -13075,7 +12121,6 @@ op {
   }
   input_arg {
     name: "values"
-    description: "a list of tensors\ndtypes A list of data types that inserted values should adhere to."
     type_list_attr: "fake_dtypes"
   }
   attr {
@@ -13084,7 +12129,6 @@ op {
     default_value {
       i: 0
     }
-    description: "Maximum number of elements in the Staging Area. If > 0, inserts\non the container will block when the capacity is reached."
     has_minimum: true
   }
   attr {
@@ -13111,7 +12155,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this queue is placed in the given container. Otherwise,\na default container is used."
   }
   attr {
     name: "shared_name"
@@ -13119,9 +12162,7 @@ op {
     default_value {
       s: ""
     }
-    description: "It is necessary to match this name to the matching Unstage Op."
   }
-  summary: "Stage (key, values) in the underlying container which behaves like a hashtable."
   is_stateful: true
 }
 op {
@@ -13174,8 +12215,6 @@ op {
       s: ""
     }
   }
-  summary: "Op removes and returns the values associated with the key"
-  description: "from the underlying container.   If the underlying container\ndoes not contain this key, the op will block until it does."
   is_stateful: true
 }
 op {
@@ -13228,8 +12267,6 @@ op {
       s: ""
     }
   }
-  summary: "Op removes and returns a random (key, value)"
-  description: "from the underlying container.   If the underlying container\ndoes not contain elements, the op will block until it does."
   is_stateful: true
 }
 op {
@@ -13252,7 +12289,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true, \"a\" is transposed before multiplication."
   }
   attr {
     name: "transpose_b"
@@ -13260,7 +12296,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true, \"b\" is transposed before multiplication."
   }
   attr {
     name: "T"
@@ -13268,6 +12303,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -13276,63 +12312,62 @@ op {
       }
     }
   }
-  summary: "Multiply the matrix \"a\" by the matrix \"b\"."
-  description: "The inputs must be two-dimensional matrices and the inner dimension of\n\"a\" (after being transposed if transpose_a is true) must match the\nouter dimension of \"b\" (after being transposed if transposed_b is\ntrue).\n\n*Note*: The default kernel implementation for MatMul on GPUs uses\ncublas."
 }
 op {
   name: "MatchingFiles"
   input_arg {
     name: "pattern"
-    description: "Shell wildcard pattern(s). Scalar or vector of type string."
     type: DT_STRING
   }
   output_arg {
     name: "filenames"
-    description: "A vector of matching filenames."
     type: DT_STRING
   }
-  summary: "Returns the set of files matching one or more glob patterns."
-  description: "Note that this routine only supports wildcard characters in the\nbasename portion of the pattern, not in the directory portion."
 }
 op {
   name: "MatrixBandPart"
   input_arg {
     name: "input"
-    description: "Rank `k` tensor."
     type_attr: "T"
   }
   input_arg {
     name: "num_lower"
-    description: "0-D tensor. Number of subdiagonals to keep. If negative, keep entire\nlower triangle."
-    type: DT_INT64
+    type_attr: "Tindex"
   }
   input_arg {
     name: "num_upper"
-    description: "0-D tensor. Number of superdiagonals to keep. If negative, keep\nentire upper triangle."
-    type: DT_INT64
+    type_attr: "Tindex"
   }
   output_arg {
     name: "band"
-    description: "Rank `k` tensor of the same shape as input. The extracted banded tensor."
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Copy a tensor setting everything outside a central band in each innermost matrix"
-  description: "to zero.\n\nThe `band` part is computed as follows:\nAssume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a\ntensor with the same shape where\n\n`band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.\n\nThe indicator function\n\n`in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&\n                 (num_upper < 0 || (n-m) <= num_upper)`.\n\nFor example:\n\n```\n# if \'input\' is [[ 0,  1,  2, 3]\n                 [-1,  0,  1, 2]\n                 [-2, -1,  0, 1]\n                 [-3, -2, -1, 0]],\n\ntf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]\n                                       [-1,  0,  1, 2]\n                                       [ 0, -1,  0, 1]\n                                       [ 0,  0, -1, 0]],\n\ntf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]\n                                      [-1,  0,  1, 0]\n                                      [-2, -1,  0, 1]\n                                      [ 0, -2, -1, 0]]\n```\n\nUseful special cases:\n\n```\n tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.\n tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.\n tf.matrix_band_part(input, 0, 0) ==> Diagonal.\n```"
+  attr {
+    name: "Tindex"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
   name: "MatrixDeterminant"
   input_arg {
     name: "input"
-    description: "Shape is `[..., M, M]`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "Shape is `[...]`."
     type_attr: "T"
   }
   attr {
@@ -13347,57 +12382,45 @@ op {
       }
     }
   }
-  summary: "Computes the determinant of one or more square matrices."
-  description: "The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices. The output is a tensor containing the determinants\nfor all input submatrices `[..., :, :]`."
 }
 op {
   name: "MatrixDiag"
   input_arg {
     name: "diagonal"
-    description: "Rank `k`, where `k >= 1`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`."
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Returns a batched diagonal tensor with a given batched diagonal values."
-  description: "Given a `diagonal`, this operation returns a tensor with the `diagonal` and\neverything else padded with zeros. The diagonal is computed as follows:\n\nAssume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a\ntensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:\n\n`output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.\n\nFor example:\n\n```\n# \'diagonal\' is [[1, 2, 3, 4], [5, 6, 7, 8]]\n\nand diagonal.shape = (2, 4)\n\ntf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0]\n                                     [0, 2, 0, 0]\n                                     [0, 0, 3, 0]\n                                     [0, 0, 0, 4]],\n                                    [[5, 0, 0, 0]\n                                     [0, 6, 0, 0]\n                                     [0, 0, 7, 0]\n                                     [0, 0, 0, 8]]]\n\nwhich has shape (2, 4, 4)\n```"
 }
 op {
   name: "MatrixDiagPart"
   input_arg {
     name: "input"
-    description: "Rank `k` tensor where `k >= 2`."
     type_attr: "T"
   }
   output_arg {
     name: "diagonal"
-    description: "The extracted diagonal(s) having shape\n`diagonal.shape = input.shape[:-2] + [min(input.shape[-2:])]`."
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Returns the batched diagonal part of a batched tensor."
-  description: "This operation returns a tensor with the `diagonal` part\nof the batched `input`. The `diagonal` part is computed as follows:\n\nAssume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a\ntensor of rank `k - 1` with dimensions `[I, J, K, ..., min(M, N)]` where:\n\n`diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`.\n\nThe input must be at least a matrix.\n\nFor example:\n\n```\n# \'input\' is [[[1, 0, 0, 0]\n               [0, 2, 0, 0]\n               [0, 0, 3, 0]\n               [0, 0, 0, 4]],\n              [[5, 0, 0, 0]\n               [0, 6, 0, 0]\n               [0, 0, 7, 0]\n               [0, 0, 0, 8]]]\n\nand input.shape = (2, 4, 4)\n\ntf.matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]]\n\nwhich has shape (2, 4)\n```"
 }
 op {
   name: "MatrixExponential"
   input_arg {
     name: "input"
-    description: "Shape is `[..., M, M]`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "Shape is `[..., M, M]`.\n\n@compatibility(scipy)\nEquivalent to scipy.linalg.expm\n@end_compatibility"
     type_attr: "T"
   }
   attr {
@@ -13412,19 +12435,15 @@ op {
       }
     }
   }
-  summary: "Computes the matrix exponential of one or more square matrices:"
-  description: "exp(A) = \\sum_{n=0}^\\infty A^n/n!\n\nThe exponential is computed using a combination of the scaling and squaring\nmethod and the Pade approximation. Details can be founds in:\nNicholas J. Higham, \"The scaling and squaring method for the matrix exponential\nrevisited,\" SIAM J. Matrix Anal. Applic., 26:1179-1193, 2005.\n\nThe input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices. The output is a tensor of the same shape as the input\ncontaining the exponential for all input submatrices `[..., :, :]`."
 }
 op {
   name: "MatrixInverse"
   input_arg {
     name: "input"
-    description: "Shape is `[..., M, M]`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "Shape is `[..., M, M]`.\n\n@compatibility(numpy)\nEquivalent to np.linalg.inv\n@end_compatibility"
     type_attr: "T"
   }
   attr {
@@ -13446,48 +12465,59 @@ op {
       }
     }
   }
-  summary: "Computes the inverse of one or more square invertible matrices or their"
-  description: "adjoints (conjugate transposes).\n\nThe input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices. The output is a tensor of the same shape as the input\ncontaining the inverse for all input submatrices `[..., :, :]`.\n\nThe op uses LU decomposition with partial pivoting to compute the inverses.\n\nIf a matrix is not invertible there is no guarantee what the op does. It\nmay detect the condition and raise an exception or it may simply return a\ngarbage result."
+}
+op {
+  name: "MatrixLogarithm"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
 }
 op {
   name: "MatrixSetDiag"
   input_arg {
     name: "input"
-    description: "Rank `k+1`, where `k >= 1`."
     type_attr: "T"
   }
   input_arg {
     name: "diagonal"
-    description: "Rank `k`, where `k >= 1`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "Rank `k+1`, with `output.shape = input.shape`."
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Returns a batched matrix tensor with new batched diagonal values."
-  description: "Given `input` and `diagonal`, this operation returns a tensor with the\nsame shape and values as `input`, except for the main diagonal of the\ninnermost matrices.  These will be overwritten by the values in `diagonal`.\n\nThe output is computed as follows:\n\nAssume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has\n`k` dimensions `[I, J, K, ..., min(M, N)]`.  Then the output is a\ntensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where:\n\n  * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`.\n  * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`."
 }
 op {
   name: "MatrixSolve"
   input_arg {
     name: "matrix"
-    description: "Shape is `[..., M, M]`."
     type_attr: "T"
   }
   input_arg {
     name: "rhs"
-    description: "Shape is `[..., M, K]`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "Shape is `[..., M, K]`."
     type_attr: "T"
   }
   attr {
@@ -13496,7 +12526,6 @@ op {
     default_value {
       b: false
     }
-    description: "Boolean indicating whether to solve with `matrix` or its (block-wise)\nadjoint."
   }
   attr {
     name: "T"
@@ -13510,29 +12539,23 @@ op {
       }
     }
   }
-  summary: "Solves systems of linear equations."
-  description: "`Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is\na tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix\nsatisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.\nIf `adjoint` is `True` then each output matrix satisfies\n`adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`."
 }
 op {
   name: "MatrixSolveLs"
   input_arg {
     name: "matrix"
-    description: "Shape is `[..., M, N]`."
     type_attr: "T"
   }
   input_arg {
     name: "rhs"
-    description: "Shape is `[..., M, K]`."
     type_attr: "T"
   }
   input_arg {
     name: "l2_regularizer"
-    description: "Scalar tensor.\n\n@compatibility(numpy)\nEquivalent to np.linalg.lstsq\n@end_compatibility"
     type: DT_DOUBLE
   }
   output_arg {
     name: "output"
-    description: "Shape is `[..., N, K]`."
     type_attr: "T"
   }
   attr {
@@ -13554,24 +12577,19 @@ op {
       b: true
     }
   }
-  summary: "Solves one or more linear least-squares problems."
-  description: "`matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions\nform real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same\ntype as `matrix` and shape `[..., M, K]`.\nThe output is a tensor shape `[..., N, K]` where each output matrix solves\neach of the equations\n`matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`\nin the least squares sense.\n\nWe use the following notation for (complex) matrix and right-hand sides\nin the batch:\n\n`matrix`=\\\\(A \\in \\mathbb{C}^{m \\times n}\\\\),\n`rhs`=\\\\(B  \\in \\mathbb{C}^{m \\times k}\\\\),\n`output`=\\\\(X  \\in \\mathbb{C}^{n \\times k}\\\\),\n`l2_regularizer`=\\\\(\\lambda \\in \\mathbb{R}\\\\).\n\nIf `fast` is `True`, then the solution is computed by solving the normal\nequations using Cholesky decomposition. Specifically, if \\\\(m \\ge n\\\\) then\n\\\\(X = (A^H A + \\lambda I)^{-1} A^H B\\\\), which solves the least-squares\nproblem \\\\(X = \\mathrm{argmin}_{Z \\in \\Re^{n \\times k} } ||A Z - B||_F^2 +\n\\lambda ||Z||_F^2\\\\). If \\\\(m \\lt n\\\\) then `output` is computed as\n\\\\(X = A^H (A A^H + \\lambda I)^{-1} B\\\\), which (for \\\\(\\lambda = 0\\\\)) is the\nminimum-norm solution to the under-determined linear system, i.e.\n\\\\(X = \\mathrm{argmin}_{Z \\in \\mathbb{C}^{n \\times k} } ||Z||_F^2 \\\\),\nsubject to \\\\(A Z = B\\\\). Notice that the fast path is only numerically stable\nwhen \\\\(A\\\\) is numerically full rank and has a condition number\n\\\\(\\mathrm{cond}(A) \\lt \\frac{1}{\\sqrt{\\epsilon_{mach} } }\\\\) or\\\\(\\lambda\\\\) is\nsufficiently large.\n\nIf `fast` is `False` an algorithm based on the numerically robust complete\northogonal decomposition is used. This computes the minimum-norm\nleast-squares solution, even when \\\\(A\\\\) is rank deficient. This path is\ntypically 6-7 times slower than the fast path. If `fast` is `False` then\n`l2_regularizer` is ignored."
 }
 op {
   name: "MatrixTriangularSolve"
   input_arg {
     name: "matrix"
-    description: "Shape is `[..., M, M]`."
     type_attr: "T"
   }
   input_arg {
     name: "rhs"
-    description: "Shape is `[..., M, K]`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "Shape is `[..., M, K]`."
     type_attr: "T"
   }
   attr {
@@ -13580,7 +12598,6 @@ op {
     default_value {
       b: true
     }
-    description: "Boolean indicating whether the innermost matrices in `matrix` are\nlower or upper triangular."
   }
   attr {
     name: "adjoint"
@@ -13588,7 +12605,6 @@ op {
     default_value {
       b: false
     }
-    description: "Boolean indicating whether to solve with `matrix` or its (block-wise)\n         adjoint.\n\n@compatibility(numpy)\nEquivalent to np.linalg.triangular_solve\n@end_compatibility"
   }
   attr {
     name: "T"
@@ -13602,24 +12618,19 @@ op {
       }
     }
   }
-  summary: "Solves systems of linear equations with upper or lower triangular matrices by"
-  description: "backsubstitution.\n\n`matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form\nsquare matrices. If `lower` is `True` then the strictly upper triangular part\nof each inner-most matrix is assumed to be zero and not accessed.\nIf `lower` is False then the strictly lower triangular part of each inner-most\nmatrix is assumed to be zero and not accessed.\n`rhs` is a tensor of shape `[..., M, K]`.\n\nThe output is a tensor of shape `[..., M, K]`. If `adjoint` is\n`True` then the innermost matrices in `output` satisfy matrix equations\n`matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.\nIf `adjoint` is `False` then the strictly then the  innermost matrices in\n`output` satisfy matrix equations\n`adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`."
 }
 op {
   name: "Max"
   input_arg {
     name: "input"
-    description: "The tensor to reduce."
     type_attr: "T"
   }
   input_arg {
     name: "reduction_indices"
-    description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`."
     type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    description: "The reduced tensor."
     type_attr: "T"
   }
   attr {
@@ -13628,7 +12639,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true, retain reduced dimensions with length 1."
   }
   attr {
     name: "T"
@@ -13637,17 +12647,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -13667,19 +12678,15 @@ op {
       }
     }
   }
-  summary: "Computes the maximum of elements across dimensions of a tensor."
-  description: "Reduces `input` along the dimensions given in `reduction_indices`. Unless\n`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in\n`reduction_indices`. If `keep_dims` is true, the reduced dimensions are\nretained with length 1."
 }
 op {
   name: "MaxPool"
   input_arg {
     name: "input"
-    description: "4-D input to pool over."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "The max pooled output tensor."
     type_attr: "T"
   }
   attr {
@@ -13690,6 +12697,8 @@ op {
     }
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -13698,7 +12707,6 @@ op {
         type: DT_INT16
         type: DT_INT8
         type: DT_UINT16
-        type: DT_HALF
         type: DT_QINT8
       }
     }
@@ -13706,21 +12714,18 @@ op {
   attr {
     name: "ksize"
     type: "list(int)"
-    description: "The size of the window for each dimension of the input tensor."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "strides"
     type: "list(int)"
-    description: "The stride of the sliding window for each dimension of the\ninput tensor."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -13734,7 +12739,6 @@ op {
     default_value {
       s: "NHWC"
     }
-    description: "Specify the data format of the input and output data. With the\ndefault format \"NHWC\", the data is stored in the order of:\n    [batch, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCHW\", the data storage order of:\n    [batch, in_channels, in_height, in_width]."
     allowed_values {
       list {
         s: "NHWC"
@@ -13743,38 +12747,32 @@ op {
       }
     }
   }
-  summary: "Performs max pooling on the input."
 }
 op {
   name: "MaxPool3D"
   input_arg {
     name: "input"
-    description: "Shape `[batch, depth, rows, cols, channels]` tensor to pool over."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "The max pooled output tensor."
     type_attr: "T"
   }
   attr {
     name: "ksize"
     type: "list(int)"
-    description: "1-D tensor of length 5. The size of the window for each dimension of\nthe input tensor. Must have `ksize[0] = ksize[4] = 1`."
     has_minimum: true
     minimum: 5
   }
   attr {
     name: "strides"
     type: "list(int)"
-    description: "1-D tensor of length 5. The stride of the sliding window for each\ndimension of `input`. Must have `strides[0] = strides[4] = 1`."
     has_minimum: true
     minimum: 5
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -13788,7 +12786,6 @@ op {
     default_value {
       s: "NDHWC"
     }
-    description: "The data format of the input and output data. With the\ndefault format \"NDHWC\", the data is stored in the order of:\n    [batch, in_depth, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCDHW\", the data storage order is:\n    [batch, in_channels, in_depth, in_height, in_width]."
     allowed_values {
       list {
         s: "NDHWC"
@@ -13801,27 +12798,24 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
   }
-  summary: "Performs 3D max pooling on the input."
 }
 op {
   name: "MaxPool3DGrad"
   input_arg {
     name: "orig_input"
-    description: "The original input tensor."
     type_attr: "TInput"
   }
   input_arg {
     name: "orig_output"
-    description: "The original output tensor."
     type_attr: "TInput"
   }
   input_arg {
     name: "grad"
-    description: "Output backprop of shape `[batch, depth, rows, cols, channels]`."
     type_attr: "T"
   }
   output_arg {
@@ -13831,21 +12825,18 @@ op {
   attr {
     name: "ksize"
     type: "list(int)"
-    description: "1-D tensor of length 5. The size of the window for each dimension of\nthe input tensor. Must have `ksize[0] = ksize[4] = 1`."
     has_minimum: true
     minimum: 5
   }
   attr {
     name: "strides"
     type: "list(int)"
-    description: "1-D tensor of length 5. The stride of the sliding window for each\ndimension of `input`. Must have `strides[0] = strides[4] = 1`."
     has_minimum: true
     minimum: 5
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -13859,7 +12850,6 @@ op {
     default_value {
       s: "NDHWC"
     }
-    description: "The data format of the input and output data. With the\ndefault format \"NDHWC\", the data is stored in the order of:\n    [batch, in_depth, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCDHW\", the data storage order is:\n    [batch, in_channels, in_depth, in_height, in_width]."
     allowed_values {
       list {
         s: "NDHWC"
@@ -13875,6 +12865,7 @@ op {
     }
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -13887,52 +12878,45 @@ op {
     }
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
   }
-  summary: "Computes gradients of max pooling function."
 }
 op {
   name: "MaxPool3DGradGrad"
   input_arg {
     name: "orig_input"
-    description: "The original input tensor."
     type_attr: "T"
   }
   input_arg {
     name: "orig_output"
-    description: "The original output tensor."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "Output backprop of shape `[batch, depth, rows, cols, channels]`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "Gradients of gradients w.r.t. the input to `max_pool`."
     type_attr: "T"
   }
   attr {
     name: "ksize"
     type: "list(int)"
-    description: "1-D tensor of length 5. The size of the window for each dimension of\nthe input tensor. Must have `ksize[0] = ksize[4] = 1`."
     has_minimum: true
     minimum: 5
   }
   attr {
     name: "strides"
     type: "list(int)"
-    description: "1-D tensor of length 5. The stride of the sliding window for each\ndimension of `input`. Must have `strides[0] = strides[4] = 1`."
     has_minimum: true
     minimum: 5
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -13946,7 +12930,6 @@ op {
     default_value {
       s: "NDHWC"
     }
-    description: "The data format of the input and output data. With the\ndefault format \"NDHWC\", the data is stored in the order of:\n    [batch, in_depth, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCDHW\", the data storage order is:\n    [batch, in_channels, in_depth, in_height, in_width]."
     allowed_values {
       list {
         s: "NDHWC"
@@ -13963,48 +12946,40 @@ op {
       }
     }
   }
-  summary: "Computes second-order gradients of the maxpooling function."
 }
 op {
   name: "MaxPoolGrad"
   input_arg {
     name: "orig_input"
-    description: "The original input tensor."
     type_attr: "T"
   }
   input_arg {
     name: "orig_output"
-    description: "The original output tensor."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "4-D.  Gradients w.r.t. the output of `max_pool`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "Gradients w.r.t. the input to `max_pool`."
     type_attr: "T"
   }
   attr {
     name: "ksize"
     type: "list(int)"
-    description: "The size of the window for each dimension of the input tensor."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "strides"
     type: "list(int)"
-    description: "The stride of the sliding window for each dimension of the\ninput tensor."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -14018,7 +12993,6 @@ op {
     default_value {
       s: "NHWC"
     }
-    description: "Specify the data format of the input and output data. With the\ndefault format \"NHWC\", the data is stored in the order of:\n    [batch, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCHW\", the data storage order of:\n    [batch, in_channels, in_height, in_width]."
     allowed_values {
       list {
         s: "NHWC"
@@ -14037,10 +13011,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -14048,48 +13023,40 @@ op {
       }
     }
   }
-  summary: "Computes gradients of the maxpooling function."
 }
 op {
   name: "MaxPoolGradGrad"
   input_arg {
     name: "orig_input"
-    description: "The original input tensor."
     type_attr: "T"
   }
   input_arg {
     name: "orig_output"
-    description: "The original output tensor."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "4-D.  Gradients of gradients w.r.t. the input of `max_pool`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "Gradients of gradients w.r.t. the input to `max_pool`."
     type_attr: "T"
   }
   attr {
     name: "ksize"
     type: "list(int)"
-    description: "The size of the window for each dimension of the input tensor."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "strides"
     type: "list(int)"
-    description: "The stride of the sliding window for each dimension of the\ninput tensor."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -14103,7 +13070,6 @@ op {
     default_value {
       s: "NHWC"
     }
-    description: "Specify the data format of the input and output data. With the\ndefault format \"NHWC\", the data is stored in the order of:\n    [batch, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCHW\", the data storage order of:\n    [batch, in_channels, in_height, in_width]."
     allowed_values {
       list {
         s: "NHWC"
@@ -14119,10 +13085,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -14130,44 +13097,36 @@ op {
       }
     }
   }
-  summary: "Computes second-order gradients of the maxpooling function."
 }
 op {
   name: "MaxPoolGradGradV2"
   input_arg {
     name: "orig_input"
-    description: "The original input tensor."
     type_attr: "T"
   }
   input_arg {
     name: "orig_output"
-    description: "The original output tensor."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "4-D.  Gradients of gradients w.r.t. the input of `max_pool`."
     type_attr: "T"
   }
   input_arg {
     name: "ksize"
-    description: "The size of the window for each dimension of the input tensor."
     type: DT_INT32
   }
   input_arg {
     name: "strides"
-    description: "The stride of the sliding window for each dimension of the\ninput tensor."
     type: DT_INT32
   }
   output_arg {
     name: "output"
-    description: "Gradients of gradients w.r.t. the input to `max_pool`."
     type_attr: "T"
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -14181,7 +13140,6 @@ op {
     default_value {
       s: "NHWC"
     }
-    description: "Specify the data format of the input and output data. With the\ndefault format \"NHWC\", the data is stored in the order of:\n    [batch, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCHW\", the data storage order of:\n    [batch, in_channels, in_height, in_width]."
     allowed_values {
       list {
         s: "NHWC"
@@ -14197,10 +13155,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -14208,48 +13167,40 @@ op {
       }
     }
   }
-  summary: "Computes second-order gradients of the maxpooling function."
 }
 op {
   name: "MaxPoolGradGradWithArgmax"
   input_arg {
     name: "input"
-    description: "The original input."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the\ninput of `max_pool`."
     type_attr: "T"
   }
   input_arg {
     name: "argmax"
-    description: "The indices of the maximum values chosen for each output of `max_pool`."
     type_attr: "Targmax"
   }
   output_arg {
     name: "output"
-    description: "Gradients of gradients w.r.t. the input of `max_pool`."
     type_attr: "T"
   }
   attr {
     name: "ksize"
     type: "list(int)"
-    description: "The size of the window for each dimension of the input tensor."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "strides"
     type: "list(int)"
-    description: "The stride of the sliding window for each dimension of the\ninput tensor."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -14275,10 +13226,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -14286,44 +13238,36 @@ op {
       }
     }
   }
-  summary: "Computes second-order gradients of the maxpooling function."
 }
 op {
   name: "MaxPoolGradV2"
   input_arg {
     name: "orig_input"
-    description: "The original input tensor."
     type_attr: "T"
   }
   input_arg {
     name: "orig_output"
-    description: "The original output tensor."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "4-D.  Gradients w.r.t. the output of `max_pool`."
     type_attr: "T"
   }
   input_arg {
     name: "ksize"
-    description: "The size of the window for each dimension of the input tensor."
     type: DT_INT32
   }
   input_arg {
     name: "strides"
-    description: "The stride of the sliding window for each dimension of the\ninput tensor."
     type: DT_INT32
   }
   output_arg {
     name: "output"
-    description: "Gradients w.r.t. the input to `max_pool`."
     type_attr: "T"
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -14337,7 +13281,6 @@ op {
     default_value {
       s: "NHWC"
     }
-    description: "Specify the data format of the input and output data. With the\ndefault format \"NHWC\", the data is stored in the order of:\n    [batch, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCHW\", the data storage order of:\n    [batch, in_channels, in_height, in_width]."
     allowed_values {
       list {
         s: "NHWC"
@@ -14356,10 +13299,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -14367,48 +13311,40 @@ op {
       }
     }
   }
-  summary: "Computes gradients of the maxpooling function."
 }
 op {
   name: "MaxPoolGradWithArgmax"
   input_arg {
     name: "input"
-    description: "The original input."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the\noutput of `max_pool`."
     type_attr: "T"
   }
   input_arg {
     name: "argmax"
-    description: "The indices of the maximum values chosen for each output of `max_pool`."
     type_attr: "Targmax"
   }
   output_arg {
     name: "output"
-    description: "Gradients w.r.t. the input of `max_pool`."
     type_attr: "T"
   }
   attr {
     name: "ksize"
     type: "list(int)"
-    description: "The size of the window for each dimension of the input tensor."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "strides"
     type: "list(int)"
-    description: "The stride of the sliding window for each dimension of the\ninput tensor."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -14434,10 +13370,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -14445,28 +13382,23 @@ op {
       }
     }
   }
-  summary: "Computes gradients of the maxpooling function."
 }
 op {
   name: "MaxPoolV2"
   input_arg {
     name: "input"
-    description: "4-D input to pool over."
     type_attr: "T"
   }
   input_arg {
     name: "ksize"
-    description: "The size of the window for each dimension of the input tensor."
     type: DT_INT32
   }
   input_arg {
     name: "strides"
-    description: "The stride of the sliding window for each dimension of the\ninput tensor."
     type: DT_INT32
   }
   output_arg {
     name: "output"
-    description: "The max pooled output tensor."
     type_attr: "T"
   }
   attr {
@@ -14477,6 +13409,8 @@ op {
     }
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -14485,7 +13419,6 @@ op {
         type: DT_INT16
         type: DT_INT8
         type: DT_UINT16
-        type: DT_HALF
         type: DT_QINT8
       }
     }
@@ -14493,7 +13426,6 @@ op {
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -14507,7 +13439,6 @@ op {
     default_value {
       s: "NHWC"
     }
-    description: "Specify the data format of the input and output data. With the\ndefault format \"NHWC\", the data is stored in the order of:\n    [batch, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCHW\", the data storage order of:\n    [batch, in_channels, in_height, in_width]."
     allowed_values {
       list {
         s: "NHWC"
@@ -14516,36 +13447,30 @@ op {
       }
     }
   }
-  summary: "Performs max pooling on the input."
 }
 op {
   name: "MaxPoolWithArgmax"
   input_arg {
     name: "input"
-    description: "4-D with shape `[batch, height, width, channels]`.  Input to pool over."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "The max pooled output tensor."
     type_attr: "T"
   }
   output_arg {
     name: "argmax"
-    description: "4-D.  The flattened indices of the max values chosen for each output."
     type_attr: "Targmax"
   }
   attr {
     name: "ksize"
     type: "list(int)"
-    description: "The size of the window for each dimension of the input tensor."
     has_minimum: true
     minimum: 4
   }
   attr {
     name: "strides"
     type: "list(int)"
-    description: "The stride of the sliding window for each dimension of the\ninput tensor."
     has_minimum: true
     minimum: 4
   }
@@ -14565,7 +13490,6 @@ op {
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -14581,10 +13505,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -14592,8 +13517,6 @@ op {
       }
     }
   }
-  summary: "Performs max pooling on the input and outputs both max values and indices."
-  description: "The indices in `argmax` are flattened, so that a maximum value at position\n`[b, y, x, c]` becomes flattened index\n`((b * height + y) * width + x) * channels + c`.\n\nThe indices returned are always in `[0, height) x [0, width)` before flattening,\neven if padding is involved and the mathematically correct answer is outside\n(either negative or too large).  This is a bug, but fixing it is difficult to do\nin a safe backwards compatible way, especially due to flattening."
 }
 op {
   name: "Maximum"
@@ -14615,6 +13538,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -14622,25 +13546,20 @@ op {
       }
     }
   }
-  summary: "Returns the max of x and y (i.e. x > y ? x : y) element-wise."
-  description: "*NOTE*: `Maximum` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
   is_commutative: true
 }
 op {
   name: "Mean"
   input_arg {
     name: "input"
-    description: "The tensor to reduce."
     type_attr: "T"
   }
   input_arg {
     name: "reduction_indices"
-    description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`."
     type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    description: "The reduced tensor."
     type_attr: "T"
   }
   attr {
@@ -14649,7 +13568,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true, retain reduced dimensions with length 1."
   }
   attr {
     name: "T"
@@ -14658,17 +13576,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -14688,25 +13607,20 @@ op {
       }
     }
   }
-  summary: "Computes the mean of elements across dimensions of a tensor."
-  description: "Reduces `input` along the dimensions given in `reduction_indices`. Unless\n`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in\n`reduction_indices`. If `keep_dims` is true, the reduced dimensions are\nretained with length 1."
 }
 op {
   name: "Merge"
   input_arg {
     name: "inputs"
-    description: "The input tensors, exactly one of which will become available."
     type_attr: "T"
     number_attr: "N"
   }
   output_arg {
     name: "output"
-    description: "Will be set to the available input tensor."
     type_attr: "T"
   }
   output_arg {
     name: "value_index"
-    description: "The index of the chosen input tensor in `inputs`."
     type: DT_INT32
   }
   attr {
@@ -14719,20 +13633,16 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Forwards the value of an available tensor from `inputs` to `output`."
-  description: "`Merge` waits for at least one of the tensors in `inputs` to become available.\nIt is usually combined with `Switch` to implement branching.\n\n`Merge` forwards the first tensor to become available to `output`, and sets\n`value_index` to its index in `inputs`."
 }
 op {
   name: "MergeSummary"
   input_arg {
     name: "inputs"
-    description: "Can be of any shape.  Each must contain serialized `Summary` protocol\nbuffers."
     type: DT_STRING
     number_attr: "N"
   }
   output_arg {
     name: "summary"
-    description: "Scalar. Serialized `Summary` protocol buffer."
     type: DT_STRING
   }
   attr {
@@ -14741,19 +13651,15 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Merges summaries."
-  description: "This op creates a\n[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)\nprotocol buffer that contains the union of all the values in the input\nsummaries.\n\nWhen the Op is run, it reports an `InvalidArgument` error if multiple values\nin the summaries to merge use the same tag."
 }
 op {
   name: "MergeV2Checkpoints"
   input_arg {
     name: "checkpoint_prefixes"
-    description: "prefixes of V2 checkpoints to merge."
     type: DT_STRING
   }
   input_arg {
     name: "destination_prefix"
-    description: "scalar.  The desired final prefix.  Allowed to be the same\nas one of the checkpoint_prefixes."
     type: DT_STRING
   }
   attr {
@@ -14762,22 +13668,17 @@ op {
     default_value {
       b: true
     }
-    description: "see above."
   }
-  summary: "V2 format specific: merges the metadata files of sharded checkpoints.  The"
-  description: "result is one logical checkpoint, with one physical metadata file and renamed\ndata files.\n\nIntended for \"grouping\" multiple checkpoints in a sharded checkpoint setup.\n\nIf delete_old_dirs is true, attempts to delete recursively the dirname of each\npath in the input checkpoint_prefixes.  This is useful when those paths are non\nuser-facing temporary locations."
   is_stateful: true
 }
 op {
   name: "Mfcc"
   input_arg {
     name: "spectrogram"
-    description: "Typically produced by the Spectrogram op, with magnitude_squared\nset to true."
     type: DT_FLOAT
   }
   input_arg {
     name: "sample_rate"
-    description: "How many samples per second the source audio used."
     type: DT_INT32
   }
   output_arg {
@@ -14790,7 +13691,6 @@ op {
     default_value {
       f: 4000
     }
-    description: "The highest frequency to use when calculating the\nceptstrum."
   }
   attr {
     name: "lower_frequency_limit"
@@ -14798,7 +13698,6 @@ op {
     default_value {
       f: 20
     }
-    description: "The lowest frequency to use when calculating the\nceptstrum."
   }
   attr {
     name: "filterbank_channel_count"
@@ -14806,7 +13705,6 @@ op {
     default_value {
       i: 40
     }
-    description: "Resolution of the Mel bank used internally."
   }
   attr {
     name: "dct_coefficient_count"
@@ -14814,26 +13712,20 @@ op {
     default_value {
       i: 13
     }
-    description: "How many output channels to produce per time slice."
   }
-  summary: "Transforms a spectrogram into a form that\'s useful for speech recognition."
-  description: "Mel Frequency Cepstral Coefficients are a way of representing audio data that\'s\nbeen effective as an input feature for machine learning. They are created by\ntaking the spectrum of a spectrogram (a \'cepstrum\'), and discarding some of the\nhigher frequencies that are less significant to the human ear. They have a long\nhistory in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum\nis a good resource to learn more."
 }
 op {
   name: "Min"
   input_arg {
     name: "input"
-    description: "The tensor to reduce."
     type_attr: "T"
   }
   input_arg {
     name: "reduction_indices"
-    description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`."
     type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    description: "The reduced tensor."
     type_attr: "T"
   }
   attr {
@@ -14842,7 +13734,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true, retain reduced dimensions with length 1."
   }
   attr {
     name: "T"
@@ -14851,17 +13742,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -14881,8 +13773,6 @@ op {
       }
     }
   }
-  summary: "Computes the minimum of elements across dimensions of a tensor."
-  description: "Reduces `input` along the dimensions given in `reduction_indices`. Unless\n`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in\n`reduction_indices`. If `keep_dims` is true, the reduced dimensions are\nretained with length 1."
 }
 op {
   name: "Minimum"
@@ -14904,6 +13794,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -14911,25 +13802,20 @@ op {
       }
     }
   }
-  summary: "Returns the min of x and y (i.e. x < y ? x : y) element-wise."
-  description: "*NOTE*: `Minimum` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
   is_commutative: true
 }
 op {
   name: "MirrorPad"
   input_arg {
     name: "input"
-    description: "The input tensor to be padded."
     type_attr: "T"
   }
   input_arg {
     name: "paddings"
-    description: "A two-column matrix specifying the padding sizes. The number of\nrows must be the same as the rank of `input`."
     type_attr: "Tpaddings"
   }
   output_arg {
     name: "output"
-    description: "The padded tensor."
     type_attr: "T"
   }
   attr {
@@ -14952,7 +13838,6 @@ op {
   attr {
     name: "mode"
     type: "string"
-    description: "Either `REFLECT` or `SYMMETRIC`. In reflect mode the padded regions\ndo not include the borders, while in symmetric mode the padded regions\ndo include the borders. For example, if `input` is `[1, 2, 3]` and `paddings`\nis `[0, 2]`, then the output is `[1, 2, 3, 2, 1]` in reflect mode, and\nit is `[1, 2, 3, 3, 2]` in symmetric mode."
     allowed_values {
       list {
         s: "REFLECT"
@@ -14960,24 +13845,19 @@ op {
       }
     }
   }
-  summary: "Pads a tensor with mirrored values."
-  description: "This operation pads a `input` with mirrored values according to the `paddings`\nyou specify. `paddings` is an integer tensor with shape `[n, 2]`, where n is\nthe rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates\nhow many values to add before the contents of `input` in that dimension, and\n`paddings[D, 1]` indicates how many values to add after the contents of `input`\nin that dimension. Both `paddings[D, 0]` and `paddings[D, 1]` must be no greater\nthan `input.dim_size(D)` (or `input.dim_size(D) - 1`) if `copy_border` is true\n(if false, respectively).\n\nThe padded size of each dimension D of the output is:\n\n`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`\n\nFor example:\n\n```\n# \'t\' is [[1, 2, 3], [4, 5, 6]].\n# \'paddings\' is [[1, 1]], [2, 2]].\n# \'mode\' is SYMMETRIC.\n# rank of \'t\' is 2.\npad(t, paddings) ==> [[2, 1, 1, 2, 3, 3, 2]\n                      [2, 1, 1, 2, 3, 3, 2]\n                      [5, 4, 4, 5, 6, 6, 5]\n                      [5, 4, 4, 5, 6, 6, 5]]\n```"
 }
 op {
   name: "MirrorPadGrad"
   input_arg {
     name: "input"
-    description: "The input tensor to be folded."
     type_attr: "T"
   }
   input_arg {
     name: "paddings"
-    description: "A two-column matrix specifying the padding sizes. The number of\nrows must be the same as the rank of `input`."
     type_attr: "Tpaddings"
   }
   output_arg {
     name: "output"
-    description: "The folded tensor."
     type_attr: "T"
   }
   attr {
@@ -15000,7 +13880,6 @@ op {
   attr {
     name: "mode"
     type: "string"
-    description: "The mode used in the `MirrorPad` op."
     allowed_values {
       list {
         s: "REFLECT"
@@ -15008,8 +13887,6 @@ op {
       }
     }
   }
-  summary: "Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor."
-  description: "This operation folds the padded areas of `input` by `MirrorPad` according to the\n`paddings` you specify. `paddings` must be the same as `paddings` argument\ngiven to the corresponding `MirrorPad` op.\n\nThe folded size of each dimension D of the output is:\n\n`input.dim_size(D) - paddings(D, 0) - paddings(D, 1)`\n\nFor example:\n\n```\n# \'t\' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].\n# \'paddings\' is [[0, 1]], [0, 1]].\n# \'mode\' is SYMMETRIC.\n# rank of \'t\' is 2.\npad(t, paddings) ==> [[ 1,  5]\n                      [11, 28]]\n```"
 }
 op {
   name: "Mod"
@@ -15032,13 +13909,12 @@ op {
       list {
         type: DT_INT32
         type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Returns element-wise remainder of division. This emulates C semantics in that"
-  description: "the result here is consistent with a truncating divide. E.g.\n`tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.\n\n*NOTE*: `Mod` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "Mul"
@@ -15060,6 +13936,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -15073,26 +13950,21 @@ op {
       }
     }
   }
-  summary: "Returns x * y element-wise."
-  description: "*NOTE*: `Mul` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
   is_commutative: true
 }
 op {
   name: "Multinomial"
   input_arg {
     name: "logits"
-    description: "2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`\nrepresents the unnormalized log probabilities for all classes."
     type_attr: "T"
   }
   input_arg {
     name: "num_samples"
-    description: "0-D.  Number of independent samples to draw for each row slice."
     type: DT_INT32
   }
   output_arg {
     name: "output"
-    description: "2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`\ncontains the drawn class labels with range `[0, num_classes)`."
-    type: DT_INT64
+    type_attr: "output_dtype"
   }
   attr {
     name: "seed"
@@ -15100,7 +13972,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either seed or seed2 is set to be non-zero, the internal random number\ngenerator is seeded by the given seed.  Otherwise, a random seed is used."
   }
   attr {
     name: "seed2"
@@ -15108,7 +13979,6 @@ op {
     default_value {
       i: 0
     }
-    description: "A second seed to avoid seed collision."
   }
   attr {
     name: "T"
@@ -15118,10 +13988,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -15129,19 +14000,29 @@ op {
       }
     }
   }
-  summary: "Draws samples from a multinomial distribution."
+  attr {
+    name: "output_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   is_stateful: true
 }
 op {
   name: "MutableDenseHashTable"
   input_arg {
     name: "empty_key"
-    description: "The key used to represent empty key buckets internally. Must not\nbe used in insert or lookup operations."
     type_attr: "key_dtype"
   }
   output_arg {
     name: "table_handle"
-    description: "Handle to a table."
     type: DT_STRING
     is_ref: true
   }
@@ -15151,7 +14032,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this table is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -15159,7 +14039,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this table is shared under the given name across\nmultiple sessions."
   }
   attr {
     name: "use_node_name_sharing"
@@ -15171,12 +14050,10 @@ op {
   attr {
     name: "key_dtype"
     type: "type"
-    description: "Type of the table keys."
   }
   attr {
     name: "value_dtype"
     type: "type"
-    description: "Type of the table values."
   }
   attr {
     name: "value_shape"
@@ -15185,7 +14062,6 @@ op {
       shape {
       }
     }
-    description: "The shape of each value."
   }
   attr {
     name: "initial_num_buckets"
@@ -15193,7 +14069,6 @@ op {
     default_value {
       i: 131072
     }
-    description: "The initial number of hash table buckets. Must be a power\nto 2."
   }
   attr {
     name: "max_load_factor"
@@ -15201,22 +14076,17 @@ op {
     default_value {
       f: 0.8
     }
-    description: "The maximum ratio between number of entries and number of\nbuckets before growing the table. Must be between 0 and 1."
   }
-  summary: "Creates an empty hash table that uses tensors as the backing store."
-  description: "It uses \"open addressing\" with quadratic reprobing to resolve\ncollisions.\n\nThis op creates a mutable hash table, specifying the type of its keys and\nvalues. Each value must be a scalar. Data can be inserted into the table using\nthe insert operations. It does not support the initialization operation."
   is_stateful: true
 }
 op {
   name: "MutableDenseHashTableV2"
   input_arg {
     name: "empty_key"
-    description: "The key used to represent empty key buckets internally. Must not\nbe used in insert or lookup operations."
     type_attr: "key_dtype"
   }
   output_arg {
     name: "table_handle"
-    description: "Handle to a table."
     type: DT_RESOURCE
   }
   attr {
@@ -15225,7 +14095,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this table is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -15233,7 +14102,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this table is shared under the given name across\nmultiple sessions."
   }
   attr {
     name: "use_node_name_sharing"
@@ -15245,12 +14113,10 @@ op {
   attr {
     name: "key_dtype"
     type: "type"
-    description: "Type of the table keys."
   }
   attr {
     name: "value_dtype"
     type: "type"
-    description: "Type of the table values."
   }
   attr {
     name: "value_shape"
@@ -15259,7 +14125,6 @@ op {
       shape {
       }
     }
-    description: "The shape of each value."
   }
   attr {
     name: "initial_num_buckets"
@@ -15267,7 +14132,6 @@ op {
     default_value {
       i: 131072
     }
-    description: "The initial number of hash table buckets. Must be a power\nto 2."
   }
   attr {
     name: "max_load_factor"
@@ -15275,17 +14139,13 @@ op {
     default_value {
       f: 0.8
     }
-    description: "The maximum ratio between number of entries and number of\nbuckets before growing the table. Must be between 0 and 1."
   }
-  summary: "Creates an empty hash table that uses tensors as the backing store."
-  description: "It uses \"open addressing\" with quadratic reprobing to resolve\ncollisions.\n\nThis op creates a mutable hash table, specifying the type of its keys and\nvalues. Each value must be a scalar. Data can be inserted into the table using\nthe insert operations. It does not support the initialization operation."
   is_stateful: true
 }
 op {
   name: "MutableHashTable"
   output_arg {
     name: "table_handle"
-    description: "Handle to a table."
     type: DT_STRING
     is_ref: true
   }
@@ -15295,7 +14155,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this table is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -15303,7 +14162,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this table is shared under the given name across\nmultiple sessions."
   }
   attr {
     name: "use_node_name_sharing"
@@ -15311,27 +14169,21 @@ op {
     default_value {
       b: false
     }
-    description: "If true and shared_name is empty, the table is shared\nusing the node name."
   }
   attr {
     name: "key_dtype"
     type: "type"
-    description: "Type of the table keys."
   }
   attr {
     name: "value_dtype"
     type: "type"
-    description: "Type of the table values."
   }
-  summary: "Creates an empty hash table."
-  description: "This op creates a mutable hash table, specifying the type of its keys and\nvalues. Each value must be a scalar. Data can be inserted into the table using\nthe insert operations. It does not support the initialization operation."
   is_stateful: true
 }
 op {
   name: "MutableHashTableOfTensors"
   output_arg {
     name: "table_handle"
-    description: "Handle to a table."
     type: DT_STRING
     is_ref: true
   }
@@ -15341,7 +14193,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this table is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -15349,7 +14200,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this table is shared under the given name across\nmultiple sessions."
   }
   attr {
     name: "use_node_name_sharing"
@@ -15361,12 +14211,10 @@ op {
   attr {
     name: "key_dtype"
     type: "type"
-    description: "Type of the table keys."
   }
   attr {
     name: "value_dtype"
     type: "type"
-    description: "Type of the table values."
   }
   attr {
     name: "value_shape"
@@ -15376,15 +14224,12 @@ op {
       }
     }
   }
-  summary: "Creates an empty hash table."
-  description: "This op creates a mutable hash table, specifying the type of its keys and\nvalues. Each value must be a vector. Data can be inserted into the table using\nthe insert operations. It does not support the initialization operation."
   is_stateful: true
 }
 op {
   name: "MutableHashTableOfTensorsV2"
   output_arg {
     name: "table_handle"
-    description: "Handle to a table."
     type: DT_RESOURCE
   }
   attr {
@@ -15393,7 +14238,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this table is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -15401,7 +14245,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this table is shared under the given name across\nmultiple sessions."
   }
   attr {
     name: "use_node_name_sharing"
@@ -15413,12 +14256,10 @@ op {
   attr {
     name: "key_dtype"
     type: "type"
-    description: "Type of the table keys."
   }
   attr {
     name: "value_dtype"
     type: "type"
-    description: "Type of the table values."
   }
   attr {
     name: "value_shape"
@@ -15428,15 +14269,12 @@ op {
       }
     }
   }
-  summary: "Creates an empty hash table."
-  description: "This op creates a mutable hash table, specifying the type of its keys and\nvalues. Each value must be a vector. Data can be inserted into the table using\nthe insert operations. It does not support the initialization operation."
   is_stateful: true
 }
 op {
   name: "MutableHashTableV2"
   output_arg {
     name: "table_handle"
-    description: "Handle to a table."
     type: DT_RESOURCE
   }
   attr {
@@ -15445,7 +14283,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this table is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -15453,7 +14290,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this table is shared under the given name across\nmultiple sessions."
   }
   attr {
     name: "use_node_name_sharing"
@@ -15461,20 +14297,15 @@ op {
     default_value {
       b: false
     }
-    description: "If true and shared_name is empty, the table is shared\nusing the node name."
   }
   attr {
     name: "key_dtype"
     type: "type"
-    description: "Type of the table keys."
   }
   attr {
     name: "value_dtype"
     type: "type"
-    description: "Type of the table values."
   }
-  summary: "Creates an empty hash table."
-  description: "This op creates a mutable hash table, specifying the type of its keys and\nvalues. Each value must be a scalar. Data can be inserted into the table using\nthe insert operations. It does not support the initialization operation."
   is_stateful: true
 }
 op {
@@ -15493,6 +14324,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -15502,31 +14334,25 @@ op {
       }
     }
   }
-  summary: "Computes numerical negative value element-wise."
-  description: "I.e., \\\\(y = -x\\\\)."
 }
 op {
   name: "NegTrain"
   input_arg {
     name: "w_in"
-    description: "input word embedding."
     type: DT_FLOAT
     is_ref: true
   }
   input_arg {
     name: "w_out"
-    description: "output word embedding."
     type: DT_FLOAT
     is_ref: true
   }
   input_arg {
     name: "examples"
-    description: "A vector of word ids."
     type: DT_INT32
   }
   input_arg {
     name: "labels"
-    description: "A vector of word ids."
     type: DT_INT32
   }
   input_arg {
@@ -15536,14 +14362,11 @@ op {
   attr {
     name: "vocab_count"
     type: "list(int)"
-    description: "Count of words in the vocabulary."
   }
   attr {
     name: "num_negative_samples"
     type: "int"
-    description: "Number of negative samples per example."
   }
-  summary: "Training via negative sampling."
   deprecation {
     version: 19
     explanation: "Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result"
@@ -15554,44 +14377,36 @@ op {
   name: "NextIteration"
   input_arg {
     name: "data"
-    description: "The tensor to be made available to the next iteration."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "The same tensor as `data`."
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Makes its input available to the next iteration."
 }
 op {
   name: "NoOp"
-  summary: "Does nothing. Only useful as a placeholder for control edges."
 }
 op {
   name: "NonMaxSuppression"
   input_arg {
     name: "boxes"
-    description: "A 2-D float tensor of shape `[num_boxes, 4]`."
     type: DT_FLOAT
   }
   input_arg {
     name: "scores"
-    description: "A 1-D float tensor of shape `[num_boxes]` representing a single\nscore corresponding to each box (each row of boxes)."
     type: DT_FLOAT
   }
   input_arg {
     name: "max_output_size"
-    description: "A scalar integer tensor representing the maximum number of\nboxes to be selected by non max suppression."
     type: DT_INT32
   }
   output_arg {
     name: "selected_indices"
-    description: "A 1-D integer tensor of shape `[M]` representing the selected\nindices from the boxes tensor, where `M <= max_output_size`."
     type: DT_INT32
   }
   attr {
@@ -15600,40 +14415,30 @@ op {
     default_value {
       f: 0.5
     }
-    description: "A float representing the threshold for deciding whether boxes\noverlap too much with respect to IOU."
   }
-  summary: "Greedily selects a subset of bounding boxes in descending order of score,"
-  description: "pruning away boxes that have high intersection-over-union (IOU) overlap\nwith previously selected boxes.  Bounding boxes are supplied as\n[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any\ndiagonal pair of box corners and the coordinates can be provided as normalized\n(i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm\nis agnostic to where the origin is in the coordinate system.  Note that this\nalgorithm is invariant to orthogonal transformations and translations\nof the coordinate system; thus translating or reflections of the coordinate\nsystem result in the same boxes being selected by the algorithm.\nThe output of this operation is a set of integers indexing into the input\ncollection of bounding boxes representing the selected boxes.  The bounding\nbox coordinates corresponding to the selected indices can then be obtained\nusing the `tf.gather operation`.  For example:\n  selected_indices = tf.image.non_max_suppression(\n      boxes, scores, max_output_size, iou_threshold)\n  selected_boxes = tf.gather(boxes, selected_indices)"
 }
 op {
   name: "NonMaxSuppressionV2"
   input_arg {
     name: "boxes"
-    description: "A 2-D float tensor of shape `[num_boxes, 4]`."
     type: DT_FLOAT
   }
   input_arg {
     name: "scores"
-    description: "A 1-D float tensor of shape `[num_boxes]` representing a single\nscore corresponding to each box (each row of boxes)."
     type: DT_FLOAT
   }
   input_arg {
     name: "max_output_size"
-    description: "A scalar integer tensor representing the maximum number of\nboxes to be selected by non max suppression."
     type: DT_INT32
   }
   input_arg {
     name: "iou_threshold"
-    description: "A 0-D float tensor representing the threshold for deciding whether\nboxes overlap too much with respect to IOU."
     type: DT_FLOAT
   }
   output_arg {
     name: "selected_indices"
-    description: "A 1-D integer tensor of shape `[M]` representing the selected\nindices from the boxes tensor, where `M <= max_output_size`."
     type: DT_INT32
   }
-  summary: "Greedily selects a subset of bounding boxes in descending order of score,"
-  description: "pruning away boxes that have high intersection-over-union (IOU) overlap\nwith previously selected boxes.  Bounding boxes are supplied as\n[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any\ndiagonal pair of box corners and the coordinates can be provided as normalized\n(i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm\nis agnostic to where the origin is in the coordinate system.  Note that this\nalgorithm is invariant to orthogonal transformations and translations\nof the coordinate system; thus translating or reflections of the coordinate\nsystem result in the same boxes being selected by the algorithm.\n\nThe output of this operation is a set of integers indexing into the input\ncollection of bounding boxes representing the selected boxes.  The bounding\nbox coordinates corresponding to the selected indices can then be obtained\nusing the `tf.gather operation`.  For example:\n\n  selected_indices = tf.image.non_max_suppression_v2(\n      boxes, scores, max_output_size, iou_threshold)\n  selected_boxes = tf.gather(boxes, selected_indices)"
 }
 op {
   name: "NotEqual"
@@ -15655,6 +14460,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -15672,25 +14478,20 @@ op {
       }
     }
   }
-  summary: "Returns the truth value of (x != y) element-wise."
-  description: "*NOTE*: `NotEqual` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
   is_commutative: true
 }
 op {
   name: "NthElement"
   input_arg {
     name: "input"
-    description: "1-D or higher with last dimension at least `n+1`."
     type_attr: "T"
   }
   input_arg {
     name: "n"
-    description: "0-D. Position of sorted vector to select along the last dimension (along\neach row for matrices). Valid range of n is `[0, input.shape[:-1])`"
     type: DT_INT32
   }
   output_arg {
     name: "values"
-    description: "The `n`-th order statistic along each last dimensional slice."
     type_attr: "T"
   }
   attr {
@@ -15699,7 +14500,6 @@ op {
     default_value {
       b: false
     }
-    description: "When set to True, find the nth-largest value in the vector and vice\nversa."
   }
   attr {
     name: "T"
@@ -15709,10 +14509,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -15720,34 +14521,27 @@ op {
       }
     }
   }
-  summary: "Finds values of the `n`-th order statistic for the last dimension."
-  description: "If the input is a vector (rank-1), finds the entries which is the nth-smallest\nvalue in the vector and outputs their values as scalar tensor.\n\nFor matrices (resp. higher rank input), computes the entries which is the\nnth-smallest value in each row (resp. vector along the last dimension). Thus,\n\n    values.shape = input.shape[:-1]"
 }
 op {
   name: "OneHot"
   input_arg {
     name: "indices"
-    description: "A tensor of indices."
     type_attr: "TI"
   }
   input_arg {
     name: "depth"
-    description: "A scalar defining the depth of the one hot dimension."
     type: DT_INT32
   }
   input_arg {
     name: "on_value"
-    description: "A scalar defining the value to fill in output when `indices[j] = i`."
     type_attr: "T"
   }
   input_arg {
     name: "off_value"
-    description: "A scalar defining the value to fill in output when `indices[j] != i`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "The one-hot tensor."
     type_attr: "T"
   }
   attr {
@@ -15756,7 +14550,6 @@ op {
     default_value {
       i: -1
     }
-    description: "The axis to fill (default: -1, a new inner-most axis)."
   }
   attr {
     name: "T"
@@ -15776,20 +14569,16 @@ op {
       }
     }
   }
-  summary: "Returns a one-hot tensor."
-  description: "The locations represented by indices in `indices` take value `on_value`,\nwhile all other locations take value `off_value`.\n\nIf the input `indices` is rank `N`, the output will have rank `N+1`,\nThe new axis is created at dimension `axis` (default: the new axis is\nappended at the end).\n\nIf `indices` is a scalar the output shape will be a vector of length `depth`.\n\nIf `indices` is a vector of length `features`, the output shape will be:\n```\n  features x depth if axis == -1\n  depth x features if axis == 0\n```\n\nIf `indices` is a matrix (batch) with shape `[batch, features]`,\nthe output shape will be:\n```\n  batch x features x depth if axis == -1\n  batch x depth x features if axis == 1\n  depth x batch x features if axis == 0\n```\n\n\nExamples\n=========\n\nSuppose that\n\n```\n  indices = [0, 2, -1, 1]\n  depth = 3\n  on_value = 5.0\n  off_value = 0.0\n  axis = -1\n```\n\nThen output is `[4 x 3]`:\n\n    ```output =\n      [5.0 0.0 0.0]  // one_hot(0)\n      [0.0 0.0 5.0]  // one_hot(2)\n      [0.0 0.0 0.0]  // one_hot(-1)\n      [0.0 5.0 0.0]  // one_hot(1)\n    ```\n\nSuppose that\n\n```\n  indices = [0, 2, -1, 1]\n  depth = 3\n  on_value = 0.0\n  off_value = 3.0\n  axis = 0\n```\n\nThen output is `[3 x 4]`:\n\n    ```output =\n      [0.0 3.0 3.0 3.0]\n      [3.0 3.0 3.0 0.0]\n      [3.0 3.0 3.0 3.0]\n      [3.0 0.0 3.0 3.0]\n    //  ^                one_hot(0)\n    //      ^            one_hot(2)\n    //          ^        one_hot(-1)\n    //              ^    one_hot(1)\n    ```\nSuppose that\n\n```\n  indices = [[0, 2], [1, -1]]\n  depth = 3\n  on_value = 1.0\n  off_value = 0.0\n  axis = -1\n```\n\nThen output is `[2 x 2 x 3]`:\n\n    ```output =\n      [\n        [1.0, 0.0, 0.0]  // one_hot(0)\n        [0.0, 0.0, 1.0]  // one_hot(2)\n      ][\n        [0.0, 1.0, 0.0]  // one_hot(1)\n        [0.0, 0.0, 0.0]  // one_hot(-1)\n      ]```"
 }
 op {
   name: "OneShotIterator"
   output_arg {
     name: "handle"
-    description: "A handle to the iterator that can be passed to an \"IteratorGetNext\"\nop."
     type: DT_RESOURCE
   }
   attr {
     name: "dataset_factory"
     type: "func"
-    description: "A function of type `() -> DT_VARIANT`, where the returned\nDT_VARIANT is a dataset."
   }
   attr {
     name: "output_types"
@@ -15817,20 +14606,16 @@ op {
       s: ""
     }
   }
-  summary: "Makes a \"one-shot\" iterator that can be iterated only once."
-  description: "A one-shot iterator bundles the logic for defining the dataset and\nthe state of the iterator in a single op, which allows simple input\npipelines to be defined without an additional initialization\n(\"MakeIterator\") step.\n\nOne-shot iterators have the following limitations:\n\n* They do not support parameterization: all logic for creating the underlying\n  dataset must be bundled in the `dataset_factory` function.\n* They are not resettable. Once a one-shot iterator reaches the end of its\n  underlying dataset, subsequent \"IteratorGetNext\" operations on that\n  iterator will always produce an `OutOfRange` error.\n\nFor greater flexibility, use \"Iterator\" and \"MakeIterator\" to define\nan iterator using an arbitrary subgraph, which may capture tensors\n(including fed values) as parameters, and which may be reset multiple\ntimes by rerunning \"MakeIterator\"."
   is_stateful: true
 }
 op {
   name: "OnesLike"
   input_arg {
     name: "x"
-    description: "a tensor of type T."
     type_attr: "T"
   }
   output_arg {
     name: "y"
-    description: "a tensor of the same shape and type as x but filled with ones."
     type_attr: "T"
   }
   attr {
@@ -15838,16 +14623,21 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
         type: DT_INT32
         type: DT_INT64
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_BOOL
       }
     }
   }
-  summary: "Returns a tensor of ones with the same shape and type as x."
 }
 op {
   name: "OrderedMapClear"
@@ -15885,7 +14675,6 @@ op {
       s: ""
     }
   }
-  summary: "Op removes all elements in the underlying container."
   is_stateful: true
 }
 op {
@@ -15928,7 +14717,6 @@ op {
       s: ""
     }
   }
-  summary: "Op returns the number of incomplete elements in the underlying container."
   is_stateful: true
 }
 op {
@@ -15981,8 +14769,6 @@ op {
       s: ""
     }
   }
-  summary: "Op peeks at the values at the specified key.  If the"
-  description: "underlying container does not contain this key\nthis op will block until it does.   This Op is optimized for\nperformance."
   is_stateful: true
 }
 op {
@@ -16025,14 +14811,12 @@ op {
       s: ""
     }
   }
-  summary: "Op returns the number of elements in the underlying container."
   is_stateful: true
 }
 op {
   name: "OrderedMapStage"
   input_arg {
     name: "key"
-    description: "int64"
     type: DT_INT64
   }
   input_arg {
@@ -16041,7 +14825,6 @@ op {
   }
   input_arg {
     name: "values"
-    description: "a list of tensors\ndtypes A list of data types that inserted values should adhere to."
     type_list_attr: "fake_dtypes"
   }
   attr {
@@ -16050,7 +14833,6 @@ op {
     default_value {
       i: 0
     }
-    description: "Maximum number of elements in the Staging Area. If > 0, inserts\non the container will block when the capacity is reached."
     has_minimum: true
   }
   attr {
@@ -16077,7 +14859,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this queue is placed in the given container. Otherwise,\na default container is used."
   }
   attr {
     name: "shared_name"
@@ -16085,10 +14866,7 @@ op {
     default_value {
       s: ""
     }
-    description: "It is necessary to match this name to the matching Unstage Op."
   }
-  summary: "Stage (key, values) in the underlying container which behaves like a ordered"
-  description: "associative container.   Elements are ordered by key."
   is_stateful: true
 }
 op {
@@ -16141,8 +14919,6 @@ op {
       s: ""
     }
   }
-  summary: "Op removes and returns the values associated with the key"
-  description: "from the underlying container.   If the underlying container\ndoes not contain this key, the op will block until it does."
   is_stateful: true
 }
 op {
@@ -16195,21 +14971,17 @@ op {
       s: ""
     }
   }
-  summary: "Op removes and returns the (key, value) element with the smallest"
-  description: "key from the underlying container.   If the underlying container\ndoes not contain elements, the op will block until it does."
   is_stateful: true
 }
 op {
   name: "Pack"
   input_arg {
     name: "values"
-    description: "Must be of same shape and type."
     type_attr: "T"
     number_attr: "N"
   }
   output_arg {
     name: "output"
-    description: "The packed tensor."
     type_attr: "T"
   }
   attr {
@@ -16228,10 +15000,7 @@ op {
     default_value {
       i: 0
     }
-    description: "Dimension along which to pack.  Negative values wrap around, so the\nvalid range is `[-(R+1), R+1)`."
   }
-  summary: "Packs a list of `N` rank-`R` tensors into one rank-`(R+1)` tensor."
-  description: "Packs the `N` tensors in `values` into a tensor with rank one higher than each\ntensor in `values`, by packing them along the `axis` dimension.\nGiven a list of tensors of shape `(A, B, C)`;\n\nif `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`.\nif `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`.\nEtc.\n\nFor example:\n\n```\n# \'x\' is [1, 4]\n# \'y\' is [2, 5]\n# \'z\' is [3, 6]\npack([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.\npack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]]\n```\n\nThis is the opposite of `unpack`."
 }
 op {
   name: "Pad"
@@ -16264,8 +15033,6 @@ op {
       }
     }
   }
-  summary: "Pads a tensor with zeros."
-  description: "This operation pads a `input` with zeros according to the `paddings` you\nspecify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the\nrank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates\nhow many zeros to add before the contents of `input` in that dimension, and\n`paddings[D, 1]` indicates how many zeros to add after the contents of `input`\nin that dimension.\n\nThe padded size of each dimension D of the output is:\n\n`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`\n\nFor example:\n\n```\n# \'t\' is [[1, 1], [2, 2]]\n# \'paddings\' is [[1, 1], [2, 2]]\n# rank of \'t\' is 2\npad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]\n                      [0, 0, 1, 1, 0, 0]\n                      [0, 0, 2, 2, 0, 0]\n                      [0, 0, 0, 0, 0, 0]]\n```"
 }
 op {
   name: "PadV2"
@@ -16302,8 +15069,6 @@ op {
       }
     }
   }
-  summary: "Pads a tensor."
-  description: "This operation pads `input` according to the `paddings` and `constant_values`\nyou specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is\nthe rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates\nhow many padding values to add before the contents of `input` in that dimension,\nand `paddings[D, 1]` indicates how many padding values to add after the contents\nof `input` in that dimension. `constant_values` is a scalar tensor of the same\ntype as `input` that indicates the value to use for padding `input`.\n\nThe padded size of each dimension D of the output is:\n\n`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`\n\nFor example:\n\n```\n# \'t\' is [[1, 1], [2, 2]]\n# \'paddings\' is [[1, 1], [2, 2]]\n# \'constant_values\' is 0\n# rank of \'t\' is 2\npad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]\n                      [0, 0, 1, 1, 0, 0]\n                      [0, 0, 2, 2, 0, 0]\n                      [0, 0, 0, 0, 0, 0]]\n```"
 }
 op {
   name: "PaddedBatchDataset"
@@ -16313,18 +15078,15 @@ op {
   }
   input_arg {
     name: "batch_size"
-    description: "A scalar representing the number of elements to accumulate in a\nbatch."
     type: DT_INT64
   }
   input_arg {
     name: "padded_shapes"
-    description: "A list of int64 tensors representing the desired padded shapes\nof the corresponding output components. These shapes may be partially\nspecified, using `-1` to indicate that a particular dimension should be\npadded to the maximum size of all batch elements."
     type: DT_INT64
     number_attr: "N"
   }
   input_arg {
     name: "padding_values"
-    description: "A list of scalars containing the padding value to use for\neach of the outputs."
     type_list_attr: "Toutput_types"
   }
   output_arg {
@@ -16349,20 +15111,17 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset that batches and pads `batch_size` elements from the input."
 }
 op {
   name: "PaddingFIFOQueue"
   output_arg {
     name: "handle"
-    description: "The handle to the queue."
     type: DT_STRING
     is_ref: true
   }
   attr {
     name: "component_types"
     type: "list(type)"
-    description: "The type of each component in a value."
     has_minimum: true
     minimum: 1
   }
@@ -16373,7 +15132,6 @@ op {
       list {
       }
     }
-    description: "The shape of each component in a value. The length of this attr must\nbe either 0 or the same as the length of component_types.\nShapes of fixed rank but variable size are allowed by setting\nany shape dimension to -1.  In this case, the inputs\' shape may vary along\nthe given dimension, and DequeueMany will pad the given dimension with\nzeros up to the maximum shape of all elements in the given batch.\nIf the length of this attr is 0, different queue elements may have\ndifferent ranks and shapes, but only one element may be dequeued at a time."
     has_minimum: true
   }
   attr {
@@ -16382,7 +15140,6 @@ op {
     default_value {
       i: -1
     }
-    description: "The upper bound on the number of elements in this queue.\nNegative numbers mean no limit."
   }
   attr {
     name: "container"
@@ -16390,7 +15147,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this queue is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -16398,23 +15154,18 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this queue will be shared under the given name\nacross multiple sessions."
   }
-  summary: "A queue that produces elements in first-in first-out order."
-  description: "Variable-size shapes are allowed by setting the corresponding shape dimensions\nto 0 in the shape attr.  In this case DequeueMany will pad up to the maximum\nsize of any given element in the minibatch.  See below for details."
   is_stateful: true
 }
 op {
   name: "PaddingFIFOQueueV2"
   output_arg {
     name: "handle"
-    description: "The handle to the queue."
     type: DT_RESOURCE
   }
   attr {
     name: "component_types"
     type: "list(type)"
-    description: "The type of each component in a value."
     has_minimum: true
     minimum: 1
   }
@@ -16425,7 +15176,6 @@ op {
       list {
       }
     }
-    description: "The shape of each component in a value. The length of this attr must\nbe either 0 or the same as the length of component_types.\nShapes of fixed rank but variable size are allowed by setting\nany shape dimension to -1.  In this case, the inputs\' shape may vary along\nthe given dimension, and DequeueMany will pad the given dimension with\nzeros up to the maximum shape of all elements in the given batch.\nIf the length of this attr is 0, different queue elements may have\ndifferent ranks and shapes, but only one element may be dequeued at a time."
     has_minimum: true
   }
   attr {
@@ -16434,7 +15184,6 @@ op {
     default_value {
       i: -1
     }
-    description: "The upper bound on the number of elements in this queue.\nNegative numbers mean no limit."
   }
   attr {
     name: "container"
@@ -16442,7 +15191,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this queue is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -16450,23 +15198,18 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this queue will be shared under the given name\nacross multiple sessions."
   }
-  summary: "A queue that produces elements in first-in first-out order."
-  description: "Variable-size shapes are allowed by setting the corresponding shape dimensions\nto 0 in the shape attr.  In this case DequeueMany will pad up to the maximum\nsize of any given element in the minibatch.  See below for details."
   is_stateful: true
 }
 op {
   name: "ParallelConcat"
   input_arg {
     name: "values"
-    description: "Tensors to be concatenated. All must have size 1 in the first dimension\nand same shape."
     type_attr: "T"
     number_attr: "N"
   }
   output_arg {
     name: "output"
-    description: "The concatenated tensor."
     type_attr: "T"
   }
   attr {
@@ -16482,10 +15225,7 @@ op {
   attr {
     name: "shape"
     type: "shape"
-    description: "the final shape of the result; should be equal to the shapes of any input\nbut with the number of input values in the first dimension."
   }
-  summary: "Concatenates a list of `N` tensors along the first dimension."
-  description: "The input tensors are all required to have size 1 in the first dimension.\n\nFor example:\n\n```\n# \'x\' is [[1, 4]]\n# \'y\' is [[2, 5]]\n# \'z\' is [[3, 6]]\nparallel_concat([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.\n```\n\nThe difference between concat and parallel_concat is that concat requires all\nof the inputs be computed before the operation will begin but doesn\'t require\nthat the input shapes be known during graph construction.  Parallel concat\nwill copy pieces of the input into the output as they become available, in\nsome situations this can provide a performance benefit."
 }
 op {
   name: "ParallelDynamicStitch"
@@ -16513,8 +15253,6 @@ op {
     name: "T"
     type: "type"
   }
-  summary: "Interleave the values from the `data` tensors into a single tensor."
-  description: "Builds a merged tensor such that\n\n```python\n    merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]\n```\n\nFor example, if each `indices[m]` is scalar or vector, we have\n\n```python\n    # Scalar indices:\n    merged[indices[m], ...] = data[m][...]\n\n    # Vector indices:\n    merged[indices[m][i], ...] = data[m][i, ...]\n```\n\nEach `data[i].shape` must start with the corresponding `indices[i].shape`,\nand the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we\nmust have `data[i].shape = indices[i].shape + constant`.  In terms of this\n`constant`, the output shape is\n\n    merged.shape = [max(indices)] + constant\n\nValues may be merged in parallel, so if an index appears in both `indices[m][i]`\nand `indices[n][j]`, the result may be invalid. This differs from the normal\nDynamicStitch operator that defines the behavior in that case.\n\nFor example:\n\n```python\n    indices[0] = 6\n    indices[1] = [4, 1]\n    indices[2] = [[5, 2], [0, 3]]\n    data[0] = [61, 62]\n    data[1] = [[41, 42], [11, 12]]\n    data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]\n    merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],\n              [51, 52], [61, 62]]\n```\n\nThis method can be used to merge partitions created by `dynamic_partition`\nas illustrated on the following example:\n\n```python\n    # Apply function (increments x_i) on elements for which a certain condition\n    # apply (x_i != -1 in this example).\n    x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])\n    condition_mask=tf.not_equal(x,tf.constant(-1.))\n    partitioned_data = tf.dynamic_partition(\n        x, tf.cast(condition_mask, tf.int32) , 2)\n    partitioned_data[1] = partitioned_data[1] + 1.0\n    condition_indices = tf.dynamic_partition(\n        tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)\n    x = tf.dynamic_stitch(condition_indices, partitioned_data)\n    # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain\n    # unchanged.\n```\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/DynamicStitch.png\" alt>\n</div>"
 }
 op {
   name: "ParallelInterleaveDataset"
@@ -16538,6 +15276,14 @@ op {
     name: "sloppy"
     type: DT_BOOL
   }
+  input_arg {
+    name: "buffer_output_elements"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "prefetch_input_elements"
+    type: DT_INT64
+  }
   output_arg {
     name: "handle"
     type: DT_VARIANT
@@ -16545,7 +15291,6 @@ op {
   attr {
     name: "f"
     type: "func"
-    description: "A function mapping elements of `input_dataset`, concatenated with\n`other_arguments`, to a Dataset variant that contains elements matching\n`output_types` and `output_shapes`."
   }
   attr {
     name: "Targuments"
@@ -16564,8 +15309,6 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
-  description: "The resulting dataset is similar to the `InterleaveDataset`, with the exception\nthat if retrieving the next value from a dataset would cause the requester to\nblock, it will skip that input dataset. This dataset is especially useful\nwhen loading data from a variable-latency datastores (e.g. HDFS, GCS), as it\nallows the training step to proceed so long as some data is available.\n\n!! WARNING !! This dataset is not deterministic!"
 }
 op {
   name: "ParallelMapDataset"
@@ -16579,7 +15322,6 @@ op {
   }
   input_arg {
     name: "num_parallel_calls"
-    description: "The number of concurrent invocations of `f` that process\nelements from `input_dataset` in parallel."
     type: DT_INT32
   }
   output_arg {
@@ -16607,39 +15349,31 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
-  description: "Unlike a \"MapDataset\", which applies `f` sequentially, this dataset invokes up\nto `num_parallel_calls` copies of `f` in parallel."
 }
 op {
   name: "ParameterizedTruncatedNormal"
   input_arg {
     name: "shape"
-    description: "The shape of the output tensor. Batches are indexed by the 0th dimension."
     type_attr: "T"
   }
   input_arg {
     name: "means"
-    description: "The mean parameter of each batch."
     type_attr: "dtype"
   }
   input_arg {
     name: "stdevs"
-    description: "The standard deviation parameter of each batch. Must be greater than 0."
     type_attr: "dtype"
   }
   input_arg {
     name: "minvals"
-    description: "The minimum cutoff. May be -infinity."
     type_attr: "dtype"
   }
   input_arg {
     name: "maxvals"
-    description: "The maximum cutoff. May be +infinity, and must be more than the minval\nfor each batch."
     type_attr: "dtype"
   }
   output_arg {
     name: "output"
-    description: "A matrix of shape num_batches x samples_per_batch, filled with random\ntruncated normal values using the parameters for each row."
     type_attr: "dtype"
   }
   attr {
@@ -16648,7 +15382,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either `seed` or `seed2` are set to be non-zero, the random number\ngenerator is seeded by the given seed.  Otherwise, it is seeded by a\nrandom seed."
   }
   attr {
     name: "seed2"
@@ -16656,15 +15389,14 @@ op {
     default_value {
       i: 0
     }
-    description: "A second seed to avoid seed collision."
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "The type of the output."
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -16680,37 +15412,30 @@ op {
       }
     }
   }
-  summary: "Outputs random values from a normal distribution. The parameters may each be a"
-  description: "scalar which applies to the entire output, or a vector of length shape[0] which\nstores the parameters for each batch."
   is_stateful: true
 }
 op {
   name: "ParseExample"
   input_arg {
     name: "serialized"
-    description: "A vector containing a batch of binary serialized Example protos."
     type: DT_STRING
   }
   input_arg {
     name: "names"
-    description: "A vector containing the names of the serialized protos.\nMay contain, for example, table key (descriptive) names for the\ncorresponding serialized protos.  These are purely useful for debugging\npurposes, and the presence of values here has no effect on the output.\nMay also be an empty vector if no names are available.\nIf non-empty, this vector must be the same length as \"serialized\"."
     type: DT_STRING
   }
   input_arg {
     name: "sparse_keys"
-    description: "A list of Nsparse string Tensors (scalars).\nThe keys expected in the Examples\' features associated with sparse values."
     type: DT_STRING
     number_attr: "Nsparse"
   }
   input_arg {
     name: "dense_keys"
-    description: "A list of Ndense string Tensors (scalars).\nThe keys expected in the Examples\' features associated with dense values."
     type: DT_STRING
     number_attr: "Ndense"
   }
   input_arg {
     name: "dense_defaults"
-    description: "A list of Ndense Tensors (some may be empty).\ndense_defaults[j] provides default values\nwhen the example\'s feature_map lacks dense_key[j].  If an empty Tensor is\nprovided for dense_defaults[j], then the Feature dense_keys[j] is required.\nThe input type is inferred from dense_defaults[j], even when it\'s empty.\nIf dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,\nthen the shape of dense_defaults[j] must match that of dense_shapes[j].\nIf dense_shapes[j] has an undefined major dimension (variable strides dense\nfeature), dense_defaults[j] must contain a single element:\nthe padding element."
     type_list_attr: "Tdense"
   }
   output_arg {
@@ -16744,7 +15469,6 @@ op {
   attr {
     name: "sparse_types"
     type: "list(type)"
-    description: "A list of Nsparse types; the data types of data in each Feature\ngiven in sparse_keys.\nCurrently the ParseExample supports DT_FLOAT (FloatList),\nDT_INT64 (Int64List), and DT_STRING (BytesList)."
     has_minimum: true
     allowed_values {
       list {
@@ -16769,55 +15493,118 @@ op {
   attr {
     name: "dense_shapes"
     type: "list(shape)"
-    description: "A list of Ndense shapes; the shapes of data in each Feature\ngiven in dense_keys.\nThe number of elements in the Feature corresponding to dense_key[j]\nmust always equal dense_shapes[j].NumEntries().\nIf dense_shapes[j] == (D0, D1, ..., DN) then the shape of output\nTensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):\nThe dense outputs are just the inputs row-stacked by batch.\nThis works for dense_shapes[j] = (-1, D1, ..., DN).  In this case\nthe shape of the output Tensor dense_values[j] will be\n(|serialized|, M, D1, .., DN), where M is the maximum number of blocks\nof elements of length D1 * .... * DN, across all minibatch entries\nin the input.  Any minibatch entry with less than M blocks of elements of\nlength D1 * ... * DN will be padded with the corresponding default_value\nscalar element along the second dimension."
     has_minimum: true
   }
-  summary: "Transforms a vector of brain.Example protos (as strings) into typed tensors."
+}
+op {
+  name: "ParseSingleExample"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+    number_attr: "num_sparse"
+  }
+  output_arg {
+    name: "sparse_values"
+    type_list_attr: "sparse_types"
+  }
+  output_arg {
+    name: "sparse_shapes"
+    type: DT_INT64
+    number_attr: "num_sparse"
+  }
+  output_arg {
+    name: "dense_values"
+    type_list_attr: "Tdense"
+  }
+  attr {
+    name: "num_sparse"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
 }
 op {
   name: "ParseSingleSequenceExample"
   input_arg {
     name: "serialized"
-    description: "A scalar containing a binary serialized SequenceExample proto."
     type: DT_STRING
   }
   input_arg {
     name: "feature_list_dense_missing_assumed_empty"
-    description: "A vector listing the\nFeatureList keys which may be missing from the SequenceExample.  If the\nassociated FeatureList is missing, it is treated as empty.  By default,\nany FeatureList not listed in this vector must exist in the SequenceExample."
     type: DT_STRING
   }
   input_arg {
     name: "context_sparse_keys"
-    description: "A list of Ncontext_sparse string Tensors (scalars).\nThe keys expected in the Examples\' features associated with context_sparse\nvalues."
     type: DT_STRING
     number_attr: "Ncontext_sparse"
   }
   input_arg {
     name: "context_dense_keys"
-    description: "A list of Ncontext_dense string Tensors (scalars).\nThe keys expected in the SequenceExamples\' context features associated with\ndense values."
     type: DT_STRING
     number_attr: "Ncontext_dense"
   }
   input_arg {
     name: "feature_list_sparse_keys"
-    description: "A list of Nfeature_list_sparse string Tensors\n(scalars).  The keys expected in the FeatureLists associated with sparse\nvalues."
     type: DT_STRING
     number_attr: "Nfeature_list_sparse"
   }
   input_arg {
     name: "feature_list_dense_keys"
-    description: "A list of Nfeature_list_dense string Tensors (scalars).\nThe keys expected in the SequenceExamples\' feature_lists associated\nwith lists of dense values."
     type: DT_STRING
     number_attr: "Nfeature_list_dense"
   }
   input_arg {
     name: "context_dense_defaults"
-    description: "A list of Ncontext_dense Tensors (some may be empty).\ncontext_dense_defaults[j] provides default values\nwhen the SequenceExample\'s context map lacks context_dense_key[j].\nIf an empty Tensor is provided for context_dense_defaults[j],\nthen the Feature context_dense_keys[j] is required.\nThe input type is inferred from context_dense_defaults[j], even when it\'s\nempty.  If context_dense_defaults[j] is not empty, its shape must match\ncontext_dense_shapes[j]."
     type_list_attr: "Tcontext_dense"
   }
   input_arg {
     name: "debug_name"
-    description: "A scalar containing the name of the serialized proto.\nMay contain, for example, table key (descriptive) name for the\ncorresponding serialized proto.  This is purely useful for debugging\npurposes, and the presence of values here has no effect on the output.\nMay also be an empty scalar if no name is available."
     type: DT_STRING
   }
   output_arg {
@@ -16895,7 +15682,6 @@ op {
       list {
       }
     }
-    description: "A list of Ncontext_sparse types; the data types of data in\neach context Feature given in context_sparse_keys.\nCurrently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),\nDT_INT64 (Int64List), and DT_STRING (BytesList)."
     has_minimum: true
     allowed_values {
       list {
@@ -16944,7 +15730,6 @@ op {
       list {
       }
     }
-    description: "A list of Ncontext_dense shapes; the shapes of data in\neach context Feature given in context_dense_keys.\nThe number of elements in the Feature corresponding to context_dense_key[j]\nmust always equal context_dense_shapes[j].NumEntries().\nThe shape of context_dense_values[j] will match context_dense_shapes[j]."
     has_minimum: true
   }
   attr {
@@ -16954,7 +15739,6 @@ op {
       list {
       }
     }
-    description: "A list of Nfeature_list_sparse types; the data types\nof data in each FeatureList given in feature_list_sparse_keys.\nCurrently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),\nDT_INT64 (Int64List), and DT_STRING (BytesList)."
     has_minimum: true
     allowed_values {
       list {
@@ -16971,41 +15755,33 @@ op {
       list {
       }
     }
-    description: "A list of Nfeature_list_dense shapes; the shapes of\ndata in each FeatureList given in feature_list_dense_keys.\nThe shape of each Feature in the FeatureList corresponding to\nfeature_list_dense_key[j] must always equal\nfeature_list_dense_shapes[j].NumEntries()."
     has_minimum: true
   }
-  summary: "Transforms a scalar brain.SequenceExample proto (as strings) into typed tensors."
 }
 op {
   name: "ParseTensor"
   input_arg {
     name: "serialized"
-    description: "A scalar string containing a serialized TensorProto proto."
     type: DT_STRING
   }
   output_arg {
     name: "output"
-    description: "A Tensor of type `out_type`."
     type_attr: "out_type"
   }
   attr {
     name: "out_type"
     type: "type"
-    description: "The type of the serialized tensor.  The provided type must match the\ntype of the serialized tensor and no implicit conversion will take place."
   }
-  summary: "Transforms a serialized tensorflow.TensorProto proto into a Tensor."
 }
 op {
   name: "Placeholder"
   output_arg {
     name: "output"
-    description: "A placeholder tensor that must be replaced using the feed mechanism."
     type_attr: "dtype"
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "The type of elements in the tensor."
   }
   attr {
     name: "shape"
@@ -17015,30 +15791,22 @@ op {
         unknown_rank: true
       }
     }
-    description: "(Optional) The shape of the tensor. If the shape has 0 dimensions, the\nshape is unconstrained."
   }
-  summary: "A placeholder op for a value that will be fed into the computation."
-  description: "N.B. This operation will fail with an error if it is executed. It is\nintended as a way to represent a value that will always be fed, and to\nprovide attrs that enable the fed value to be checked at runtime."
 }
 op {
   name: "PlaceholderV2"
   output_arg {
     name: "output"
-    description: "A placeholder tensor that must be replaced using the feed mechanism."
     type_attr: "dtype"
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "The type of elements in the tensor."
   }
   attr {
     name: "shape"
     type: "shape"
-    description: "The shape of the tensor. The shape can be any partially-specified\nshape.  To be unconstrained, pass in a shape with unknown rank."
   }
-  summary: "A placeholder op for a value that will be fed into the computation."
-  description: "N.B. This operation will fail with an error if it is executed. It is\nintended as a way to represent a value that will always be fed, and to\nprovide attrs that enable the fed value to be checked at runtime."
   deprecation {
     version: 23
     explanation: "Placeholder now behaves the same as PlaceholderV2."
@@ -17048,25 +15816,20 @@ op {
   name: "PlaceholderWithDefault"
   input_arg {
     name: "input"
-    description: "The default value to produce when `output` is not fed."
     type_attr: "dtype"
   }
   output_arg {
     name: "output"
-    description: "A placeholder tensor that defaults to `input` if it is not fed."
     type_attr: "dtype"
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "The type of elements in the tensor."
   }
   attr {
     name: "shape"
     type: "shape"
-    description: "The (possibly partial) shape of the tensor."
   }
-  summary: "A placeholder op that passes through `input` when its output is not fed."
 }
 op {
   name: "Polygamma"
@@ -17092,8 +15855,6 @@ op {
       }
     }
   }
-  summary: "Compute the polygamma function \\\\(\\psi^{(n)}(x)\\\\)."
-  description: "The polygamma function is defined as:\n\n\n\\\\(\\psi^{(n)}(x) = \\frac{d^n}{dx^n} \\psi(x)\\\\)\n\nwhere \\\\(\\psi(x)\\\\) is the digamma function."
 }
 op {
   name: "PopulationCount"
@@ -17121,8 +15882,6 @@ op {
       }
     }
   }
-  summary: "Computes element-wise population count (a.k.a. popcount, bitsum, bitcount)."
-  description: "For each entry in `x`, calculates the number of `1` (on) bits in the binary\nrepresentation of that entry.\n\n**NOTE**: It is more efficient to first `tf.bitcast` your tensors into\n`int32` or `int64` and perform the bitcount on the result, than to feed in\n8- or 16-bit inputs and then aggregate the resulting counts."
 }
 op {
   name: "Pow"
@@ -17144,6 +15903,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -17153,8 +15913,6 @@ op {
       }
     }
   }
-  summary: "Computes the power of one value to another."
-  description: "Given a tensor `x` and a tensor `y`, this operation computes \\\\(x^y\\\\) for\ncorresponding elements in `x` and `y`. For example:\n\n```\n# tensor \'x\' is [[2, 2]], [3, 3]]\n# tensor \'y\' is [[8, 16], [2, 3]]\ntf.pow(x, y) ==> [[256, 65536], [9, 27]]\n```"
 }
 op {
   name: "PrefetchDataset"
@@ -17164,7 +15922,6 @@ op {
   }
   input_arg {
     name: "buffer_size"
-    description: "The maximum number of elements to buffer in an iterator over\nthis dataset."
     type: DT_INT64
   }
   output_arg {
@@ -17183,18 +15940,57 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset that asynchronously prefetches elements from `input_dataset`."
+}
+op {
+  name: "PrependFromQueueAndPaddedBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
 }
 op {
   name: "PreventGradient"
   input_arg {
     name: "input"
-    description: "any tensor."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "the same input tensor."
     type_attr: "T"
   }
   attr {
@@ -17207,26 +16003,20 @@ op {
     default_value {
       s: ""
     }
-    description: "Will be printed in the error when anyone tries to differentiate\nthis operation."
   }
-  summary: "An identity op that triggers an error if a gradient is requested."
-  description: "When executed in a graph, this op outputs its input tensor as-is.\n\nWhen building ops to compute gradients, the TensorFlow gradient system\nwill return an error when trying to lookup the gradient of this op,\nbecause no gradient must ever be registered for this function.  This\nop exists to prevent subtle bugs from silently returning unimplemented\ngradients in some corner cases."
 }
 op {
   name: "Print"
   input_arg {
     name: "input"
-    description: "The tensor passed to `output`"
     type_attr: "T"
   }
   input_arg {
     name: "data"
-    description: "A list of tensors to print out when op is evaluated."
     type_list_attr: "U"
   }
   output_arg {
     name: "output"
-    description: "= The unmodified `input` tensor"
     type_attr: "T"
   }
   attr {
@@ -17244,7 +16034,6 @@ op {
     default_value {
       s: ""
     }
-    description: "A string, prefix of the error message."
   }
   attr {
     name: "first_n"
@@ -17252,7 +16041,6 @@ op {
     default_value {
       i: -1
     }
-    description: "Only log `first_n` number of times. -1 disables logging."
   }
   attr {
     name: "summarize"
@@ -17260,17 +16048,13 @@ op {
     default_value {
       i: 3
     }
-    description: "Only print this many entries of each tensor."
   }
-  summary: "Prints a list of tensors."
-  description: "Passes `input` through to `output` and prints `data` when evaluating."
   is_stateful: true
 }
 op {
   name: "PriorityQueue"
   output_arg {
     name: "handle"
-    description: "The handle to the queue."
     type: DT_STRING
     is_ref: true
   }
@@ -17281,13 +16065,11 @@ op {
       list {
       }
     }
-    description: "The type of each component in a value."
     has_minimum: true
   }
   attr {
     name: "shapes"
     type: "list(shape)"
-    description: "The shape of each component in a value. The length of this attr must\nbe either 0 or the same as the length of component_types. If the length of\nthis attr is 0, the shapes of queue elements are not constrained, and\nonly one element may be dequeued at a time."
     has_minimum: true
   }
   attr {
@@ -17296,7 +16078,6 @@ op {
     default_value {
       i: -1
     }
-    description: "The upper bound on the number of elements in this queue.\nNegative numbers mean no limit."
   }
   attr {
     name: "container"
@@ -17304,7 +16085,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this queue is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -17312,17 +16092,13 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this queue will be shared under the given name\nacross multiple sessions."
   }
-  summary: "A queue that produces elements sorted by the first component value."
-  description: "Note that the PriorityQueue requires the first component of any element\nto be a scalar int64, in addition to the other elements declared by\ncomponent_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue\nand DequeueMany) on a PriorityQueue will all require (resp. output) one extra\nentry in their input (resp. output) lists."
   is_stateful: true
 }
 op {
   name: "PriorityQueueV2"
   output_arg {
     name: "handle"
-    description: "The handle to the queue."
     type: DT_RESOURCE
   }
   attr {
@@ -17332,13 +16108,11 @@ op {
       list {
       }
     }
-    description: "The type of each component in a value."
     has_minimum: true
   }
   attr {
     name: "shapes"
     type: "list(shape)"
-    description: "The shape of each component in a value. The length of this attr must\nbe either 0 or the same as the length of component_types. If the length of\nthis attr is 0, the shapes of queue elements are not constrained, and\nonly one element may be dequeued at a time."
     has_minimum: true
   }
   attr {
@@ -17347,7 +16121,6 @@ op {
     default_value {
       i: -1
     }
-    description: "The upper bound on the number of elements in this queue.\nNegative numbers mean no limit."
   }
   attr {
     name: "container"
@@ -17355,7 +16128,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this queue is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -17363,27 +16135,21 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this queue will be shared under the given name\nacross multiple sessions."
   }
-  summary: "A queue that produces elements sorted by the first component value."
-  description: "Note that the PriorityQueue requires the first component of any element\nto be a scalar int64, in addition to the other elements declared by\ncomponent_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue\nand DequeueMany) on a PriorityQueue will all require (resp. output) one extra\nentry in their input (resp. output) lists."
   is_stateful: true
 }
 op {
   name: "Prod"
   input_arg {
     name: "input"
-    description: "The tensor to reduce."
     type_attr: "T"
   }
   input_arg {
     name: "reduction_indices"
-    description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`."
     type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    description: "The reduced tensor."
     type_attr: "T"
   }
   attr {
@@ -17392,7 +16158,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true, retain reduced dimensions with length 1."
   }
   attr {
     name: "T"
@@ -17401,17 +16166,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -17431,40 +16197,31 @@ op {
       }
     }
   }
-  summary: "Computes the product of elements across dimensions of a tensor."
-  description: "Reduces `input` along the dimensions given in `reduction_indices`. Unless\n`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in\n`reduction_indices`. If `keep_dims` is true, the reduced dimensions are\nretained with length 1."
 }
 op {
   name: "PyFunc"
   input_arg {
     name: "input"
-    description: "List of Tensors that will provide input to the Op."
     type_list_attr: "Tin"
   }
   output_arg {
     name: "output"
-    description: "The outputs from the Op."
     type_list_attr: "Tout"
   }
   attr {
     name: "token"
     type: "string"
-    description: "A token representing a registered python function in this address space."
   }
   attr {
     name: "Tin"
     type: "list(type)"
-    description: "Data types of the inputs to the op."
     has_minimum: true
   }
   attr {
     name: "Tout"
     type: "list(type)"
-    description: "Data types of the outputs from the op.\nThe length of the list specifies the number of outputs."
     has_minimum: true
   }
-  summary: "Invokes a python function to compute func(input)->output."
-  description: "This operation is considered stateful. For a stateless version, see\nPyFuncStateless."
   is_stateful: true
 }
 op {
@@ -17491,23 +16248,19 @@ op {
     type: "list(type)"
     has_minimum: true
   }
-  summary: "A stateless version of PyFunc."
 }
 op {
   name: "Qr"
   input_arg {
     name: "input"
-    description: "A tensor of shape `[..., M, N]` whose inner-most 2 dimensions\nform matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`."
     type_attr: "T"
   }
   output_arg {
     name: "q"
-    description: "Orthonormal basis for range of `a`. If `full_matrices` is `False` then\nshape is `[..., M, P]`; if `full_matrices` is `True` then shape is\n`[..., M, M]`."
     type_attr: "T"
   }
   output_arg {
     name: "r"
-    description: "Triangular factor. If `full_matrices` is `False` then shape is\n`[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`."
     type_attr: "T"
   }
   attr {
@@ -17516,7 +16269,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true, compute full-sized `q` and `r`. If false\n(the default), compute only the leading `P` columns of `q`."
   }
   attr {
     name: "T"
@@ -17530,8 +16282,6 @@ op {
       }
     }
   }
-  summary: "Computes the QR decompositions of one or more matrices."
-  description: "Computes the QR decomposition of each inner matrix in `tensor` such that\n`tensor[..., :, :] = q[..., :, :] * r[..., :,:])`\n\n```python\n# a is a tensor.\n# q is a tensor of orthonormal matrices.\n# r is a tensor of upper triangular matrices.\nq, r = qr(a)\nq_full, r_full = qr(a, full_matrices=True)\n```"
 }
 op {
   name: "QuantizeAndDequantize"
@@ -17583,12 +16333,12 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Use QuantizeAndDequantizeV2 instead."
   deprecation {
     version: 22
     explanation: "Replaced by QuantizeAndDequantizeV2"
@@ -17598,17 +16348,14 @@ op {
   name: "QuantizeAndDequantizeV2"
   input_arg {
     name: "input"
-    description: "Tensor to quantize and then dequantize."
     type_attr: "T"
   }
   input_arg {
     name: "input_min"
-    description: "If range_given, this is the min of the range, otherwise this input\nwill be ignored."
     type_attr: "T"
   }
   input_arg {
     name: "input_max"
-    description: "If range_given, this is the max of the range, otherwise this input\nwill be ignored."
     type_attr: "T"
   }
   output_arg {
@@ -17621,7 +16368,6 @@ op {
     default_value {
       b: true
     }
-    description: "If the quantization is signed or unsigned."
   }
   attr {
     name: "num_bits"
@@ -17629,7 +16375,6 @@ op {
     default_value {
       i: 8
     }
-    description: "The bitwidth of the quantization."
   }
   attr {
     name: "range_given"
@@ -17637,20 +16382,18 @@ op {
     default_value {
       b: false
     }
-    description: "If the range is given or should be computed from the tensor."
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Quantizes then dequantizes a tensor."
-  description: "This op simulates the precision loss from the quantized forward pass by:\n1. Quantizing the tensor to fixed point numbers, which should match the target\n   quantization method when it is used in inference.\n2. Dequantizing it back to floating point numbers for the following ops, most\n   likely matmul.\n\nThere are different ways to quantize. This version does not use the full range\nof the output type, choosing to elide the lowest possible value for symmetry\n(e.g., output range is -127 to 127, not -128 to 127 for signed 8 bit\nquantization), so that 0.0 maps to 0.\n\nTo perform this op, we first find the range of values in our tensor. The range\nwe use is always centered on 0, so we find m such that\n\n1. m = max(abs(input_min), abs(input_max)) if range_given is true,\n2. m = max(abs(min_elem(input)), abs(max_elem(input))) otherwise.\n\nOur input tensor range is then [-m, m].\n\nNext, we choose our fixed-point quantization buckets, [min_fixed, max_fixed].\nIf signed_input is true, this is\n\n  [min_fixed, max_fixed ] =\n      [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1].\n\nOtherwise, if signed_input is false, the fixed-point range is\n\n  [min_fixed, max_fixed] = [0, (1 << num_bits) - 1].\n\nFrom this we compute our scaling factor, s:\n\n  s = (max_fixed - min_fixed) / (2 * m).\n\nNow we can quantize and dequantize the elements of our tensor.  An element e\nis transformed into e\':\n\n  e\' = (e * s).round_to_nearest() / s.\n\nNote that we have a different number of buckets in the signed vs. unsigned\ncases.  For example, if num_bits == 8, we get 254 buckets in the signed case\nvs. 255 in the unsigned case.\n\nFor example, suppose num_bits = 8 and m = 1.  Then\n\n  [min_fixed, max_fixed] = [-127, 127], and\n  s = (127 + 127) / 2 = 127.\n\nGiven the vector {-1, -0.5, 0, 0.3}, this is quantized to\n{-127, -63, 0, 38}, and dequantized to {-1, -63.0/127, 0, 38.0/127}."
 }
 op {
   name: "QuantizeAndDequantizeV3"
@@ -17693,13 +16436,12 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Quantizes then dequantizes a tensor."
-  description: "This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a\ntensor, so its value can change during training."
 }
 op {
   name: "QuantizeDownAndShrinkRange"
@@ -17709,12 +16451,10 @@ op {
   }
   input_arg {
     name: "input_min"
-    description: "The float value that the minimum quantized input value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "input_max"
-    description: "The float value that the maximum quantized input value represents."
     type: DT_FLOAT
   }
   output_arg {
@@ -17723,44 +16463,38 @@ op {
   }
   output_arg {
     name: "output_min"
-    description: "The float value that the minimum quantized output value represents."
     type: DT_FLOAT
   }
   output_arg {
     name: "output_max"
-    description: "The float value that the maximum quantized output value represents."
     type: DT_FLOAT
   }
   attr {
     name: "Tinput"
     type: "type"
-    description: "The type of the input."
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
   attr {
     name: "out_type"
     type: "type"
-    description: "The type of the output. Should be a lower bit depth than Tinput."
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
-  summary: "Convert the quantized \'input\' tensor into a lower-precision \'output\', using the"
-  description: "actual distribution of the values to maximize the usage of the lower bit depth\nand adjusting the output min and max ranges accordingly.\n\n[input_min, input_max] are scalar floats that specify the range for the float\ninterpretation of the \'input\' data. For example, if input_min is -1.0f and\ninput_max is 1.0f, and we are dealing with quint16 quantized data, then a 0\nvalue in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.\n\nThis operator tries to squeeze as much precision as possible into an output with\na lower bit depth by calculating the actual min and max values found in the\ndata. For example, maybe that quint16 input has no values lower than 16,384 and\nnone higher than 49,152. That means only half the range is actually needed, all\nthe float interpretations are between -0.5f and 0.5f, so if we want to compress\nthe data into a quint8 output, we can use that range rather than the theoretical\n-1.0f to 1.0f that is suggested by the input min and max.\n\nIn practice, this is most useful for taking output from operations like\nQuantizedMatMul that can produce higher bit-depth outputs than their inputs and\nmay have large potential output ranges, but in practice have a distribution of\ninput values that only uses a small fraction of the possible range. By feeding\nthat output into this operator, we can reduce it from 32 bits down to 8 with\nminimal loss of accuracy."
 }
 op {
   name: "QuantizeV2"
@@ -17770,27 +16504,22 @@ op {
   }
   input_arg {
     name: "min_range"
-    description: "The minimum scalar value possibly produced for the input."
     type: DT_FLOAT
   }
   input_arg {
     name: "max_range"
-    description: "The maximum scalar value possibly produced for the input."
     type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    description: "The quantized data produced from the float input."
     type_attr: "T"
   }
   output_arg {
     name: "output_min"
-    description: "The actual minimum scalar value used for the output."
     type: DT_FLOAT
   }
   output_arg {
     name: "output_max"
-    description: "The actual maximum scalar value used for the output."
     type: DT_FLOAT
   }
   attr {
@@ -17800,9 +16529,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -17833,8 +16562,6 @@ op {
       }
     }
   }
-  summary: "Quantize the \'input\' tensor of type float to \'output\' tensor of type \'T\'."
-  description: "[min_range, max_range] are scalar floats that specify the range for\nthe \'input\' data. The \'mode\' attribute controls exactly which calculations are\nused to convert the float values to their quantized equivalents.  The\n\'round_mode\' attribute controls which rounding tie-breaking algorithm is used\nwhen rounding float values to their quantized equivalents.\n\nIn \'MIN_COMBINED\' mode, each value of the tensor will undergo the following:\n\n```\nout[i] = (in[i] - min_range) * range(T) / (max_range - min_range)\nif T == qint8, out[i] -= (range(T) + 1) / 2.0\n```\nhere `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`\n\n*MIN_COMBINED Mode Example*\n\nAssume the input is type float and has a possible range of [0.0, 6.0] and the\noutput type is quint8 ([0, 255]). The min_range and max_range values should be\nspecified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each\nvalue of the input by 255/6 and cast to quint8.\n\nIf the output type was qint8 ([-128, 127]), the operation will additionally\nsubtract each value by 128 prior to casting, so that the range of values aligns\nwith the range of qint8.\n\nIf the mode is \'MIN_FIRST\', then this approach is used:\n\n```\nnum_discrete_values = 1 << (# of bits in T)\nrange_adjust = num_discrete_values / (num_discrete_values - 1)\nrange = (range_max - range_min) * range_adjust\nrange_scale = num_discrete_values / range\nquantized = round(input * range_scale) - round(range_min * range_scale) +\n  numeric_limits<T>::min()\nquantized = max(quantized, numeric_limits<T>::min())\nquantized = min(quantized, numeric_limits<T>::max())\n```\n\nThe biggest difference between this and MIN_COMBINED is that the minimum range\nis rounded first, before it\'s subtracted from the rounded value. With\nMIN_COMBINED, a small bias is introduced where repeated iterations of quantizing\nand dequantizing will introduce a larger and larger error.\n\n*SCALED mode Example*\n\n`SCALED` mode matches the quantization approach used in\n`QuantizeAndDequantize{V2|V3}`.\n\nIf the mode is `SCALED`, we do not use the full range of the output type,\nchoosing to elide the lowest possible value for symmetry (e.g., output range is\n-127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to\n0.\n\nWe first find the range of values in our tensor. The\nrange we use is always centered on 0, so we find m such that\n```c++\n  m = max(abs(input_min), abs(input_max))\n```\n\nOur input tensor range is then `[-m, m]`.\n\nNext, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.\nIf T is signed, this is\n```\n  num_bits = sizeof(T) * 8\n  [min_fixed, max_fixed] =\n      [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]\n```\n\nOtherwise, if T is unsigned, the fixed-point range is\n```\n  [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]\n```\n\nFrom this we compute our scaling factor, s:\n```c++\n  s = (max_fixed - min_fixed) / (2 * m)\n```\n\nNow we can quantize the elements of our tensor:\n```c++\nresult = round(input * s)\n```\n\nOne thing to watch out for is that the operator may choose to adjust the\nrequested minimum and maximum values slightly during the quantization process,\nso you should always use the output ports as the range for further calculations.\nFor example, if the requested minimum and maximum values are close to equal,\nthey will be separated by a small epsilon value to prevent ill-formed quantized\nbuffers from being created. Otherwise, you can end up with buffers where all the\nquantized values map to the same float value, which causes problems for\noperations that have to perform further calculations on them."
 }
 op {
   name: "QuantizedAdd"
@@ -17848,22 +16575,18 @@ op {
   }
   input_arg {
     name: "min_x"
-    description: "The float value that the lowest quantized `x` value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "max_x"
-    description: "The float value that the highest quantized `x` value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "min_y"
-    description: "The float value that the lowest quantized `y` value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "max_y"
-    description: "The float value that the highest quantized `y` value represents."
     type: DT_FLOAT
   }
   output_arg {
@@ -17872,12 +16595,10 @@ op {
   }
   output_arg {
     name: "min_z"
-    description: "The float value that the lowest quantized output value represents."
     type: DT_FLOAT
   }
   output_arg {
     name: "max_z"
-    description: "The float value that the highest quantized output value represents.\n\n*NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about\nbroadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
     type: DT_FLOAT
   }
   attr {
@@ -17887,9 +16608,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -17900,9 +16621,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -17916,30 +16637,26 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
-  summary: "Returns x + y element-wise, working on quantized buffers."
   is_commutative: true
 }
 op {
   name: "QuantizedAvgPool"
   input_arg {
     name: "input"
-    description: "4-D with shape `[batch, height, width, channels]`."
     type_attr: "T"
   }
   input_arg {
     name: "min_input"
-    description: "The float value that the lowest quantized input value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "max_input"
-    description: "The float value that the highest quantized input value represents."
     type: DT_FLOAT
   }
   output_arg {
@@ -17948,12 +16665,10 @@ op {
   }
   output_arg {
     name: "min_output"
-    description: "The float value that the lowest quantized output value represents."
     type: DT_FLOAT
   }
   output_arg {
     name: "max_output"
-    description: "The float value that the highest quantized output value represents."
     type: DT_FLOAT
   }
   attr {
@@ -17963,26 +16678,23 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
   attr {
     name: "ksize"
     type: "list(int)"
-    description: "The size of the window for each dimension of the input tensor.\nThe length must be 4 to match the number of dimensions of the input."
   }
   attr {
     name: "strides"
     type: "list(int)"
-    description: "The stride of the sliding window for each dimension of the input\ntensor.  The length must be 4 to match the number of dimensions of the input."
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -17990,83 +16702,67 @@ op {
       }
     }
   }
-  summary: "Produces the average pool of the input tensor for quantized types."
 }
 op {
   name: "QuantizedBatchNormWithGlobalNormalization"
   input_arg {
     name: "t"
-    description: "A 4D input Tensor."
     type_attr: "Tinput"
   }
   input_arg {
     name: "t_min"
-    description: "The value represented by the lowest quantized input."
     type: DT_FLOAT
   }
   input_arg {
     name: "t_max"
-    description: "The value represented by the highest quantized input."
     type: DT_FLOAT
   }
   input_arg {
     name: "m"
-    description: "A 1D mean Tensor with size matching the last dimension of t.\nThis is the first output from tf.nn.moments,\nor a saved moving average thereof."
     type_attr: "Tinput"
   }
   input_arg {
     name: "m_min"
-    description: "The value represented by the lowest quantized mean."
     type: DT_FLOAT
   }
   input_arg {
     name: "m_max"
-    description: "The value represented by the highest quantized mean."
     type: DT_FLOAT
   }
   input_arg {
     name: "v"
-    description: "A 1D variance Tensor with size matching the last dimension of t.\nThis is the second output from tf.nn.moments,\nor a saved moving average thereof."
     type_attr: "Tinput"
   }
   input_arg {
     name: "v_min"
-    description: "The value represented by the lowest quantized variance."
     type: DT_FLOAT
   }
   input_arg {
     name: "v_max"
-    description: "The value represented by the highest quantized variance."
     type: DT_FLOAT
   }
   input_arg {
     name: "beta"
-    description: "A 1D beta Tensor with size matching the last dimension of t.\nAn offset to be added to the normalized tensor."
     type_attr: "Tinput"
   }
   input_arg {
     name: "beta_min"
-    description: "The value represented by the lowest quantized offset."
     type: DT_FLOAT
   }
   input_arg {
     name: "beta_max"
-    description: "The value represented by the highest quantized offset."
     type: DT_FLOAT
   }
   input_arg {
     name: "gamma"
-    description: "A 1D gamma Tensor with size matching the last dimension of t.\nIf \"scale_after_normalization\" is true, this tensor will be multiplied\nwith the normalized tensor."
     type_attr: "Tinput"
   }
   input_arg {
     name: "gamma_min"
-    description: "The value represented by the lowest quantized gamma."
     type: DT_FLOAT
   }
   input_arg {
     name: "gamma_max"
-    description: "The value represented by the highest quantized gamma."
     type: DT_FLOAT
   }
   output_arg {
@@ -18088,9 +16784,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -18101,24 +16797,20 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
   attr {
     name: "variance_epsilon"
     type: "float"
-    description: "A small float number to avoid dividing by 0."
   }
   attr {
     name: "scale_after_normalization"
     type: "bool"
-    description: "A bool indicating whether the resulted tensor\nneeds to be multiplied with gamma."
   }
-  summary: "Quantized Batch normalization."
-  description: "This op is deprecated and will be removed in the future. Prefer\n`tf.nn.batch_normalization`."
 }
 op {
   name: "QuantizedBiasAdd"
@@ -18128,27 +16820,22 @@ op {
   }
   input_arg {
     name: "bias"
-    description: "A 1D bias Tensor with size matching the last dimension of \'input\'."
     type_attr: "T2"
   }
   input_arg {
     name: "min_input"
-    description: "The float value that the lowest quantized input value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "max_input"
-    description: "The float value that the highest quantized input value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "min_bias"
-    description: "The float value that the lowest quantized bias value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "max_bias"
-    description: "The float value that the highest quantized bias value represents."
     type: DT_FLOAT
   }
   output_arg {
@@ -18157,12 +16844,10 @@ op {
   }
   output_arg {
     name: "min_out"
-    description: "The float value that the lowest quantized output value represents."
     type: DT_FLOAT
   }
   output_arg {
     name: "max_out"
-    description: "The float value that the highest quantized output value represents."
     type: DT_FLOAT
   }
   attr {
@@ -18172,9 +16857,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -18185,9 +16870,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -18198,53 +16883,44 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
-  summary: "Adds Tensor \'bias\' to Tensor \'input\' for Quantized types."
-  description: "Broadcasts the values of bias on dimensions 0..N-2 of \'input\'."
 }
 op {
   name: "QuantizedConcat"
   input_arg {
     name: "concat_dim"
-    description: "0-D.  The dimension along which to concatenate.  Must be in the\nrange [0, rank(values))."
     type: DT_INT32
   }
   input_arg {
     name: "values"
-    description: "The `N` Tensors to concatenate. Their ranks and types must match,\nand their sizes must match in all dimensions except `concat_dim`."
     type_attr: "T"
     number_attr: "N"
   }
   input_arg {
     name: "input_mins"
-    description: "The minimum scalar values for each of the input tensors."
     type: DT_FLOAT
     number_attr: "N"
   }
   input_arg {
     name: "input_maxes"
-    description: "The maximum scalar values for each of the input tensors."
     type: DT_FLOAT
     number_attr: "N"
   }
   output_arg {
     name: "output"
-    description: "A `Tensor` with the concatenation of values stacked along the\n`concat_dim` dimension.  This tensor\'s shape matches that of `values` except\nin `concat_dim` where it has the sum of the sizes."
     type_attr: "T"
   }
   output_arg {
     name: "output_min"
-    description: "The float value that the minimum quantized output value represents."
     type: DT_FLOAT
   }
   output_arg {
     name: "output_max"
-    description: "The float value that the maximum quantized output value represents."
     type: DT_FLOAT
   }
   attr {
@@ -18257,7 +16933,6 @@ op {
     name: "T"
     type: "type"
   }
-  summary: "Concatenates quantized tensors along one dimension."
 }
 op {
   name: "QuantizedConv2D"
@@ -18267,27 +16942,22 @@ op {
   }
   input_arg {
     name: "filter"
-    description: "filter\'s input_depth dimension must match input\'s depth dimensions."
     type_attr: "Tfilter"
   }
   input_arg {
     name: "min_input"
-    description: "The float value that the lowest quantized input value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "max_input"
-    description: "The float value that the highest quantized input value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "min_filter"
-    description: "The float value that the lowest quantized filter value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "max_filter"
-    description: "The float value that the highest quantized filter value represents."
     type: DT_FLOAT
   }
   output_arg {
@@ -18296,12 +16966,10 @@ op {
   }
   output_arg {
     name: "min_output"
-    description: "The float value that the lowest quantized output value represents."
     type: DT_FLOAT
   }
   output_arg {
     name: "max_output"
-    description: "The float value that the highest quantized output value represents."
     type: DT_FLOAT
   }
   attr {
@@ -18311,9 +16979,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -18324,9 +16992,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -18340,21 +17008,19 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
   attr {
     name: "strides"
     type: "list(int)"
-    description: "The stride of the sliding window for each dimension of the input\ntensor."
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -18362,39 +17028,43 @@ op {
       }
     }
   }
-  summary: "Computes a 2D convolution given quantized 4D input and filter tensors."
-  description: "The inputs are quantized tensors where the lowest value represents the real\nnumber of the associated minimum, and the highest represents the maximum.\nThis means that you can only interpret the quantized output in the same way, by\ntaking the returned minimum and maximum values into account."
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
   name: "QuantizedInstanceNorm"
   input_arg {
     name: "x"
-    description: "A 4D input Tensor."
     type_attr: "T"
   }
   input_arg {
     name: "x_min"
-    description: "The value represented by the lowest quantized input."
     type: DT_FLOAT
   }
   input_arg {
     name: "x_max"
-    description: "The value represented by the highest quantized input."
     type: DT_FLOAT
   }
   output_arg {
     name: "y"
-    description: "A 4D Tensor."
     type_attr: "T"
   }
   output_arg {
     name: "y_min"
-    description: "The value represented by the lowest quantized output."
     type: DT_FLOAT
   }
   output_arg {
     name: "y_max"
-    description: "The value represented by the highest quantized output."
     type: DT_FLOAT
   }
   attr {
@@ -18404,9 +17074,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -18416,7 +17086,6 @@ op {
     default_value {
       b: false
     }
-    description: "If True, `given_y_min` and `given_y_min`\nand `given_y_max` are used as the output range. Otherwise,\nthe implementation computes the output range."
   }
   attr {
     name: "given_y_min"
@@ -18424,7 +17093,6 @@ op {
     default_value {
       f: 0
     }
-    description: "Output in `y_min` if `output_range_given` is True."
   }
   attr {
     name: "given_y_max"
@@ -18432,7 +17100,6 @@ op {
     default_value {
       f: 0
     }
-    description: "Output in `y_max` if `output_range_given` is True."
   }
   attr {
     name: "variance_epsilon"
@@ -18440,7 +17107,6 @@ op {
     default_value {
       f: 1e-05
     }
-    description: "A small float number to avoid dividing by 0."
   }
   attr {
     name: "min_separation"
@@ -18448,40 +17114,32 @@ op {
     default_value {
       f: 0.001
     }
-    description: "Minimum value of `y_max - y_min`"
   }
-  summary: "Quantized Instance normalization."
 }
 op {
   name: "QuantizedMatMul"
   input_arg {
     name: "a"
-    description: "Must be a two-dimensional tensor."
     type_attr: "T1"
   }
   input_arg {
     name: "b"
-    description: "Must be a two-dimensional tensor."
     type_attr: "T2"
   }
   input_arg {
     name: "min_a"
-    description: "The float value that the lowest quantized `a` value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "max_a"
-    description: "The float value that the highest quantized `a` value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "min_b"
-    description: "The float value that the lowest quantized `b` value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "max_b"
-    description: "The float value that the highest quantized `b` value represents."
     type: DT_FLOAT
   }
   output_arg {
@@ -18490,12 +17148,10 @@ op {
   }
   output_arg {
     name: "min_out"
-    description: "The float value that the lowest quantized output value represents."
     type: DT_FLOAT
   }
   output_arg {
     name: "max_out"
-    description: "The float value that the highest quantized output value represents."
     type: DT_FLOAT
   }
   attr {
@@ -18505,9 +17161,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -18518,9 +17174,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -18534,9 +17190,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -18546,7 +17202,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true, `a` is transposed before multiplication."
   }
   attr {
     name: "transpose_b"
@@ -18554,7 +17209,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true, `b` is transposed before multiplication."
   }
   attr {
     name: "Tactivation"
@@ -18562,35 +17216,29 @@ op {
     default_value {
       type: DT_QUINT8
     }
-    description: "The type of output produced by activation function\nfollowing this operation."
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
-  summary: "Perform a quantized matrix multiplication of  `a` by the matrix `b`."
-  description: "The inputs must be two-dimensional matrices and the inner dimension of\n`a` (after being transposed if `transpose_a` is non-zero) must match the\nouter dimension of `b` (after being transposed if `transposed_b` is\nnon-zero)."
 }
 op {
   name: "QuantizedMaxPool"
   input_arg {
     name: "input"
-    description: "The 4D (batch x rows x cols x depth) Tensor to MaxReduce over."
     type_attr: "T"
   }
   input_arg {
     name: "min_input"
-    description: "The float value that the lowest quantized input value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "max_input"
-    description: "The float value that the highest quantized input value represents."
     type: DT_FLOAT
   }
   output_arg {
@@ -18599,12 +17247,10 @@ op {
   }
   output_arg {
     name: "min_output"
-    description: "The float value that the lowest quantized output value represents."
     type: DT_FLOAT
   }
   output_arg {
     name: "max_output"
-    description: "The float value that the highest quantized output value represents."
     type: DT_FLOAT
   }
   attr {
@@ -18614,26 +17260,23 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
   attr {
     name: "ksize"
     type: "list(int)"
-    description: "The size of the window for each dimension of the input tensor.\nThe length must be 4 to match the number of dimensions of the input."
   }
   attr {
     name: "strides"
     type: "list(int)"
-    description: "The stride of the sliding window for each dimension of the input\ntensor. The length must be 4 to match the number of dimensions of the input."
   }
   attr {
     name: "padding"
     type: "string"
-    description: "The type of padding algorithm to use."
     allowed_values {
       list {
         s: "SAME"
@@ -18641,7 +17284,6 @@ op {
       }
     }
   }
-  summary: "Produces the max pool of the input tensor for quantized types."
 }
 op {
   name: "QuantizedMul"
@@ -18655,22 +17297,18 @@ op {
   }
   input_arg {
     name: "min_x"
-    description: "The float value that the lowest quantized `x` value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "max_x"
-    description: "The float value that the highest quantized `x` value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "min_y"
-    description: "The float value that the lowest quantized `y` value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "max_y"
-    description: "The float value that the highest quantized `y` value represents."
     type: DT_FLOAT
   }
   output_arg {
@@ -18679,12 +17317,10 @@ op {
   }
   output_arg {
     name: "min_z"
-    description: "The float value that the lowest quantized output value represents."
     type: DT_FLOAT
   }
   output_arg {
     name: "max_z"
-    description: "The float value that the highest quantized output value represents.\n\n*NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about\nbroadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
     type: DT_FLOAT
   }
   attr {
@@ -18694,9 +17330,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -18707,9 +17343,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -18723,13 +17359,12 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
-  summary: "Returns x * y element-wise, working on quantized buffers."
   is_commutative: true
 }
 op {
@@ -18740,27 +17375,22 @@ op {
   }
   input_arg {
     name: "min_features"
-    description: "The float value that the lowest quantized value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "max_features"
-    description: "The float value that the highest quantized value represents."
     type: DT_FLOAT
   }
   output_arg {
     name: "activations"
-    description: "Has the same output shape as \"features\"."
     type_attr: "out_type"
   }
   output_arg {
     name: "min_activations"
-    description: "The float value that the lowest quantized value represents."
     type: DT_FLOAT
   }
   output_arg {
     name: "max_activations"
-    description: "The float value that the highest quantized value represents."
     type: DT_FLOAT
   }
   attr {
@@ -18770,9 +17400,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -18786,13 +17416,12 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
-  summary: "Computes Quantized Rectified Linear: `max(features, 0)`"
 }
 op {
   name: "QuantizedRelu6"
@@ -18802,27 +17431,22 @@ op {
   }
   input_arg {
     name: "min_features"
-    description: "The float value that the lowest quantized value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "max_features"
-    description: "The float value that the highest quantized value represents."
     type: DT_FLOAT
   }
   output_arg {
     name: "activations"
-    description: "Has the same output shape as \"features\"."
     type_attr: "out_type"
   }
   output_arg {
     name: "min_activations"
-    description: "The float value that the lowest quantized value represents."
     type: DT_FLOAT
   }
   output_arg {
     name: "max_activations"
-    description: "The float value that the highest quantized value represents."
     type: DT_FLOAT
   }
   attr {
@@ -18832,9 +17456,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -18848,13 +17472,12 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
-  summary: "Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`"
 }
 op {
   name: "QuantizedReluX"
@@ -18868,27 +17491,22 @@ op {
   }
   input_arg {
     name: "min_features"
-    description: "The float value that the lowest quantized value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "max_features"
-    description: "The float value that the highest quantized value represents."
     type: DT_FLOAT
   }
   output_arg {
     name: "activations"
-    description: "Has the same output shape as \"features\"."
     type_attr: "out_type"
   }
   output_arg {
     name: "min_activations"
-    description: "The float value that the lowest quantized value represents."
     type: DT_FLOAT
   }
   output_arg {
     name: "max_activations"
-    description: "The float value that the highest quantized value represents."
     type: DT_FLOAT
   }
   attr {
@@ -18898,9 +17516,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -18914,13 +17532,12 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
-  summary: "Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`"
 }
 op {
   name: "QuantizedReshape"
@@ -18930,17 +17547,14 @@ op {
   }
   input_arg {
     name: "shape"
-    description: "Defines the shape of the output tensor."
     type_attr: "Tshape"
   }
   input_arg {
     name: "input_min"
-    description: "The minimum value of the input."
     type: DT_FLOAT
   }
   input_arg {
     name: "input_max"
-    description: "The maximum value of the input."
     type: DT_FLOAT
   }
   output_arg {
@@ -18949,12 +17563,10 @@ op {
   }
   output_arg {
     name: "output_min"
-    description: "This value is copied from input_min."
     type: DT_FLOAT
   }
   output_arg {
     name: "output_max"
-    description: "This value is copied from input_max."
     type: DT_FLOAT
   }
   attr {
@@ -18974,19 +17586,15 @@ op {
       }
     }
   }
-  summary: "Reshapes a quantized tensor as per the Reshape op."
-  description: "```"
 }
 op {
   name: "QuantizedResizeBilinear"
   input_arg {
     name: "images"
-    description: "4-D with shape `[batch, height, width, channels]`."
     type_attr: "T"
   }
   input_arg {
     name: "size"
-    description: "= A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The\nnew size for the images."
     type: DT_INT32
   }
   input_arg {
@@ -18999,7 +17607,6 @@ op {
   }
   output_arg {
     name: "resized_images"
-    description: "4-D with shape\n`[batch, new_height, new_width, channels]`."
     type_attr: "T"
   }
   output_arg {
@@ -19027,16 +17634,12 @@ op {
     default_value {
       b: false
     }
-    description: "If true, rescale input by (new_height - 1) / (height - 1), which\nexactly aligns the 4 corners of images and resized images. If false, rescale\nby new_height / height. Treat similarly the width dimension."
   }
-  summary: "Resize quantized `images` to `size` using quantized bilinear interpolation."
-  description: "Input images and output images must be quantized types."
 }
 op {
   name: "QueueClose"
   input_arg {
     name: "handle"
-    description: "The handle to a queue."
     type: DT_STRING
     is_ref: true
   }
@@ -19046,16 +17649,12 @@ op {
     default_value {
       b: false
     }
-    description: "If true, all pending enqueue requests that are\nblocked on the given queue will be canceled."
   }
-  summary: "Closes the given queue."
-  description: "This operation signals that no more elements will be enqueued in the\ngiven queue. Subsequent Enqueue(Many) operations will fail.\nSubsequent Dequeue(Many) operations will continue to succeed if\nsufficient elements remain in the queue. Subsequent Dequeue(Many)\noperations that would block will fail immediately."
 }
 op {
   name: "QueueCloseV2"
   input_arg {
     name: "handle"
-    description: "The handle to a queue."
     type: DT_RESOURCE
   }
   attr {
@@ -19064,29 +17663,23 @@ op {
     default_value {
       b: false
     }
-    description: "If true, all pending enqueue requests that are\nblocked on the given queue will be canceled."
   }
-  summary: "Closes the given queue."
-  description: "This operation signals that no more elements will be enqueued in the\ngiven queue. Subsequent Enqueue(Many) operations will fail.\nSubsequent Dequeue(Many) operations will continue to succeed if\nsufficient elements remain in the queue. Subsequent Dequeue(Many)\noperations that would block will fail immediately."
   is_stateful: true
 }
 op {
   name: "QueueDequeue"
   input_arg {
     name: "handle"
-    description: "The handle to a queue."
     type: DT_STRING
     is_ref: true
   }
   output_arg {
     name: "components"
-    description: "One or more tensors that were dequeued as a tuple."
     type_list_attr: "component_types"
   }
   attr {
     name: "component_types"
     type: "list(type)"
-    description: "The type of each component in a tuple."
     has_minimum: true
     minimum: 1
   }
@@ -19096,33 +17689,26 @@ op {
     default_value {
       i: -1
     }
-    description: "If the queue is empty, this operation will block for up to\ntimeout_ms milliseconds.\nNote: This option is not supported yet."
   }
-  summary: "Dequeues a tuple of one or more tensors from the given queue."
-  description: "This operation has k outputs, where k is the number of components\nin the tuples stored in the given queue, and output i is the ith\ncomponent of the dequeued tuple.\n\nN.B. If the queue is empty, this operation will block until an element\nhas been dequeued (or \'timeout_ms\' elapses, if specified)."
 }
 op {
   name: "QueueDequeueMany"
   input_arg {
     name: "handle"
-    description: "The handle to a queue."
     type: DT_STRING
     is_ref: true
   }
   input_arg {
     name: "n"
-    description: "The number of tuples to dequeue."
     type: DT_INT32
   }
   output_arg {
     name: "components"
-    description: "One or more tensors that were dequeued as a tuple."
     type_list_attr: "component_types"
   }
   attr {
     name: "component_types"
     type: "list(type)"
-    description: "The type of each component in a tuple."
     has_minimum: true
     minimum: 1
   }
@@ -19132,32 +17718,25 @@ op {
     default_value {
       i: -1
     }
-    description: "If the queue has fewer than n elements, this operation\nwill block for up to timeout_ms milliseconds.\nNote: This option is not supported yet."
   }
-  summary: "Dequeues `n` tuples of one or more tensors from the given queue."
-  description: "If the queue is closed and there are fewer than `n` elements, then an\nOutOfRange error is returned.\n\nThis operation concatenates queue-element component tensors along the\n0th dimension to make a single component tensor.  All of the components\nin the dequeued tuple will have size `n` in the 0th dimension.\n\nThis operation has `k` outputs, where `k` is the number of components in\nthe tuples stored in the given queue, and output `i` is the ith\ncomponent of the dequeued tuple.\n\nN.B. If the queue is empty, this operation will block until `n` elements\nhave been dequeued (or \'timeout_ms\' elapses, if specified)."
 }
 op {
   name: "QueueDequeueManyV2"
   input_arg {
     name: "handle"
-    description: "The handle to a queue."
     type: DT_RESOURCE
   }
   input_arg {
     name: "n"
-    description: "The number of tuples to dequeue."
     type: DT_INT32
   }
   output_arg {
     name: "components"
-    description: "One or more tensors that were dequeued as a tuple."
     type_list_attr: "component_types"
   }
   attr {
     name: "component_types"
     type: "list(type)"
-    description: "The type of each component in a tuple."
     has_minimum: true
     minimum: 1
   }
@@ -19167,34 +17746,27 @@ op {
     default_value {
       i: -1
     }
-    description: "If the queue has fewer than n elements, this operation\nwill block for up to timeout_ms milliseconds.\nNote: This option is not supported yet."
   }
-  summary: "Dequeues `n` tuples of one or more tensors from the given queue."
-  description: "If the queue is closed and there are fewer than `n` elements, then an\nOutOfRange error is returned.\n\nThis operation concatenates queue-element component tensors along the\n0th dimension to make a single component tensor.  All of the components\nin the dequeued tuple will have size `n` in the 0th dimension.\n\nThis operation has `k` outputs, where `k` is the number of components in\nthe tuples stored in the given queue, and output `i` is the ith\ncomponent of the dequeued tuple.\n\nN.B. If the queue is empty, this operation will block until `n` elements\nhave been dequeued (or \'timeout_ms\' elapses, if specified)."
   is_stateful: true
 }
 op {
   name: "QueueDequeueUpTo"
   input_arg {
     name: "handle"
-    description: "The handle to a queue."
     type: DT_STRING
     is_ref: true
   }
   input_arg {
     name: "n"
-    description: "The number of tuples to dequeue."
     type: DT_INT32
   }
   output_arg {
     name: "components"
-    description: "One or more tensors that were dequeued as a tuple."
     type_list_attr: "component_types"
   }
   attr {
     name: "component_types"
     type: "list(type)"
-    description: "The type of each component in a tuple."
     has_minimum: true
     minimum: 1
   }
@@ -19204,32 +17776,25 @@ op {
     default_value {
       i: -1
     }
-    description: "If the queue has fewer than n elements, this operation\nwill block for up to timeout_ms milliseconds.\nNote: This option is not supported yet."
   }
-  summary: "Dequeues `n` tuples of one or more tensors from the given queue."
-  description: "This operation is not supported by all queues.  If a queue does not support\nDequeueUpTo, then an Unimplemented error is returned.\n\nIf the queue is closed and there are more than 0 but less than `n`\nelements remaining, then instead of returning an OutOfRange error like\nQueueDequeueMany, less than `n` elements are returned immediately.  If\nthe queue is closed and there are 0 elements left in the queue, then\nan OutOfRange error is returned just like in QueueDequeueMany.\nOtherwise the behavior is identical to QueueDequeueMany:\n\nThis operation concatenates queue-element component tensors along the\n0th dimension to make a single component tensor.  All of the components\nin the dequeued tuple will have size `n` in the 0th dimension.\n\nThis operation has k outputs, where `k` is the number of components in\nthe tuples stored in the given queue, and output `i` is the ith\ncomponent of the dequeued tuple."
 }
 op {
   name: "QueueDequeueUpToV2"
   input_arg {
     name: "handle"
-    description: "The handle to a queue."
     type: DT_RESOURCE
   }
   input_arg {
     name: "n"
-    description: "The number of tuples to dequeue."
     type: DT_INT32
   }
   output_arg {
     name: "components"
-    description: "One or more tensors that were dequeued as a tuple."
     type_list_attr: "component_types"
   }
   attr {
     name: "component_types"
     type: "list(type)"
-    description: "The type of each component in a tuple."
     has_minimum: true
     minimum: 1
   }
@@ -19239,28 +17804,22 @@ op {
     default_value {
       i: -1
     }
-    description: "If the queue has fewer than n elements, this operation\nwill block for up to timeout_ms milliseconds.\nNote: This option is not supported yet."
   }
-  summary: "Dequeues `n` tuples of one or more tensors from the given queue."
-  description: "This operation is not supported by all queues.  If a queue does not support\nDequeueUpTo, then an Unimplemented error is returned.\n\nIf the queue is closed and there are more than 0 but less than `n`\nelements remaining, then instead of returning an OutOfRange error like\nQueueDequeueMany, less than `n` elements are returned immediately.  If\nthe queue is closed and there are 0 elements left in the queue, then\nan OutOfRange error is returned just like in QueueDequeueMany.\nOtherwise the behavior is identical to QueueDequeueMany:\n\nThis operation concatenates queue-element component tensors along the\n0th dimension to make a single component tensor.  All of the components\nin the dequeued tuple will have size n in the 0th dimension.\n\nThis operation has `k` outputs, where `k` is the number of components in\nthe tuples stored in the given queue, and output `i` is the ith\ncomponent of the dequeued tuple."
   is_stateful: true
 }
 op {
   name: "QueueDequeueV2"
   input_arg {
     name: "handle"
-    description: "The handle to a queue."
     type: DT_RESOURCE
   }
   output_arg {
     name: "components"
-    description: "One or more tensors that were dequeued as a tuple."
     type_list_attr: "component_types"
   }
   attr {
     name: "component_types"
     type: "list(type)"
-    description: "The type of each component in a tuple."
     has_minimum: true
     minimum: 1
   }
@@ -19270,23 +17829,18 @@ op {
     default_value {
       i: -1
     }
-    description: "If the queue is empty, this operation will block for up to\ntimeout_ms milliseconds.\nNote: This option is not supported yet."
   }
-  summary: "Dequeues a tuple of one or more tensors from the given queue."
-  description: "This operation has k outputs, where k is the number of components\nin the tuples stored in the given queue, and output i is the ith\ncomponent of the dequeued tuple.\n\nN.B. If the queue is empty, this operation will block until an element\nhas been dequeued (or \'timeout_ms\' elapses, if specified)."
   is_stateful: true
 }
 op {
   name: "QueueEnqueue"
   input_arg {
     name: "handle"
-    description: "The handle to a queue."
     type: DT_STRING
     is_ref: true
   }
   input_arg {
     name: "components"
-    description: "One or more tensors from which the enqueued tensors should be taken."
     type_list_attr: "Tcomponents"
   }
   attr {
@@ -19301,22 +17855,17 @@ op {
     default_value {
       i: -1
     }
-    description: "If the queue is full, this operation will block for up to\ntimeout_ms milliseconds.\nNote: This option is not supported yet."
   }
-  summary: "Enqueues a tuple of one or more tensors in the given queue."
-  description: "The components input has k elements, which correspond to the components of\ntuples stored in the given queue.\n\nN.B. If the queue is full, this operation will block until the given\nelement has been enqueued (or \'timeout_ms\' elapses, if specified)."
 }
 op {
   name: "QueueEnqueueMany"
   input_arg {
     name: "handle"
-    description: "The handle to a queue."
     type: DT_STRING
     is_ref: true
   }
   input_arg {
     name: "components"
-    description: "One or more tensors from which the enqueued tensors should\nbe taken."
     type_list_attr: "Tcomponents"
   }
   attr {
@@ -19331,21 +17880,16 @@ op {
     default_value {
       i: -1
     }
-    description: "If the queue is too full, this operation will block for up\nto timeout_ms milliseconds.\nNote: This option is not supported yet."
   }
-  summary: "Enqueues zero or more tuples of one or more tensors in the given queue."
-  description: "This operation slices each component tensor along the 0th dimension to\nmake multiple queue elements. All of the tuple components must have the\nsame size in the 0th dimension.\n\nThe components input has k elements, which correspond to the components of\ntuples stored in the given queue.\n\nN.B. If the queue is full, this operation will block until the given\nelements have been enqueued (or \'timeout_ms\' elapses, if specified)."
 }
 op {
   name: "QueueEnqueueManyV2"
   input_arg {
     name: "handle"
-    description: "The handle to a queue."
     type: DT_RESOURCE
   }
   input_arg {
     name: "components"
-    description: "One or more tensors from which the enqueued tensors should\nbe taken."
     type_list_attr: "Tcomponents"
   }
   attr {
@@ -19360,22 +17904,17 @@ op {
     default_value {
       i: -1
     }
-    description: "If the queue is too full, this operation will block for up\nto timeout_ms milliseconds.\nNote: This option is not supported yet."
   }
-  summary: "Enqueues zero or more tuples of one or more tensors in the given queue."
-  description: "This operation slices each component tensor along the 0th dimension to\nmake multiple queue elements. All of the tuple components must have the\nsame size in the 0th dimension.\n\nThe components input has k elements, which correspond to the components of\ntuples stored in the given queue.\n\nN.B. If the queue is full, this operation will block until the given\nelements have been enqueued (or \'timeout_ms\' elapses, if specified)."
   is_stateful: true
 }
 op {
   name: "QueueEnqueueV2"
   input_arg {
     name: "handle"
-    description: "The handle to a queue."
     type: DT_RESOURCE
   }
   input_arg {
     name: "components"
-    description: "One or more tensors from which the enqueued tensors should be taken."
     type_list_attr: "Tcomponents"
   }
   attr {
@@ -19390,17 +17929,13 @@ op {
     default_value {
       i: -1
     }
-    description: "If the queue is full, this operation will block for up to\ntimeout_ms milliseconds.\nNote: This option is not supported yet."
   }
-  summary: "Enqueues a tuple of one or more tensors in the given queue."
-  description: "The components input has k elements, which correspond to the components of\ntuples stored in the given queue.\n\nN.B. If the queue is full, this operation will block until the given\nelement has been enqueued (or \'timeout_ms\' elapses, if specified)."
   is_stateful: true
 }
 op {
   name: "QueueIsClosed"
   input_arg {
     name: "handle"
-    description: "The handle to a queue."
     type: DT_STRING
     is_ref: true
   }
@@ -19408,124 +17943,96 @@ op {
     name: "is_closed"
     type: DT_BOOL
   }
-  summary: "Returns true if queue is closed."
-  description: "This operation returns true if the queue is closed and false if the queue\nis open."
 }
 op {
   name: "QueueIsClosedV2"
   input_arg {
     name: "handle"
-    description: "The handle to a queue."
     type: DT_RESOURCE
   }
   output_arg {
     name: "is_closed"
     type: DT_BOOL
   }
-  summary: "Returns true if queue is closed."
-  description: "This operation returns true if the queue is closed and false if the queue\nis open."
   is_stateful: true
 }
 op {
   name: "QueueSize"
   input_arg {
     name: "handle"
-    description: "The handle to a queue."
     type: DT_STRING
     is_ref: true
   }
   output_arg {
     name: "size"
-    description: "The number of elements in the given queue."
     type: DT_INT32
   }
-  summary: "Computes the number of elements in the given queue."
 }
 op {
   name: "QueueSizeV2"
   input_arg {
     name: "handle"
-    description: "The handle to a queue."
     type: DT_RESOURCE
   }
   output_arg {
     name: "size"
-    description: "The number of elements in the given queue."
     type: DT_INT32
   }
-  summary: "Computes the number of elements in the given queue."
   is_stateful: true
 }
 op {
   name: "RFFT"
   input_arg {
     name: "input"
-    description: "A float32 tensor."
     type: DT_FLOAT
   }
   input_arg {
     name: "fft_length"
-    description: "An int32 tensor of shape [1]. The FFT length."
     type: DT_INT32
   }
   output_arg {
     name: "output"
-    description: "A complex64 tensor of the same rank as `input`. The inner-most\n  dimension of `input` is replaced with the `fft_length / 2 + 1` unique\n  frequency components of its 1D Fourier transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.rfft\n@end_compatibility"
     type: DT_COMPLEX64
   }
-  summary: "Real-valued fast Fourier transform."
-  description: "Computes the 1-dimensional discrete Fourier transform of a real-valued signal\nover the inner-most dimension of `input`.\n\nSince the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the\n`fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,\nfollowed by the `fft_length / 2` positive-frequency terms.\n\nAlong the axis `RFFT` is computed on, if `fft_length` is smaller than the\ncorresponding dimension of `input`, the dimension is cropped. If it is larger,\nthe dimension is padded with zeros."
 }
 op {
   name: "RFFT2D"
   input_arg {
     name: "input"
-    description: "A float32 tensor."
     type: DT_FLOAT
   }
   input_arg {
     name: "fft_length"
-    description: "An int32 tensor of shape [2]. The FFT length for each dimension."
     type: DT_INT32
   }
   output_arg {
     name: "output"
-    description: "A complex64 tensor of the same rank as `input`. The inner-most 2\n  dimensions of `input` are replaced with their 2D Fourier transform. The\n  inner-most dimension contains `fft_length / 2 + 1` unique frequency\n  components.\n\n@compatibility(numpy)\nEquivalent to np.fft.rfft2\n@end_compatibility"
     type: DT_COMPLEX64
   }
-  summary: "2D real-valued fast Fourier transform."
-  description: "Computes the 2-dimensional discrete Fourier transform of a real-valued signal\nover the inner-most 2 dimensions of `input`.\n\nSince the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the\n`fft_length / 2 + 1` unique components of the FFT for the inner-most dimension\nof `output`: the zero-frequency term, followed by the `fft_length / 2`\npositive-frequency terms.\n\nAlong each axis `RFFT2D` is computed on, if `fft_length` is smaller than the\ncorresponding dimension of `input`, the dimension is cropped. If it is larger,\nthe dimension is padded with zeros."
 }
 op {
   name: "RFFT3D"
   input_arg {
     name: "input"
-    description: "A float32 tensor."
     type: DT_FLOAT
   }
   input_arg {
     name: "fft_length"
-    description: "An int32 tensor of shape [3]. The FFT length for each dimension."
     type: DT_INT32
   }
   output_arg {
     name: "output"
-    description: "A complex64 tensor of the same rank as `input`. The inner-most 3\n  dimensions of `input` are replaced with the their 3D Fourier transform. The\n  inner-most dimension contains `fft_length / 2 + 1` unique frequency\n  components.\n\n@compatibility(numpy)\nEquivalent to np.fft.rfftn with 3 dimensions.\n@end_compatibility"
     type: DT_COMPLEX64
   }
-  summary: "3D real-valued fast Fourier transform."
-  description: "Computes the 3-dimensional discrete Fourier transform of a real-valued signal\nover the inner-most 3 dimensions of `input`.\n\nSince the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the\n`fft_length / 2 + 1` unique components of the FFT for the inner-most dimension\nof `output`: the zero-frequency term, followed by the `fft_length / 2`\npositive-frequency terms.\n\nAlong each axis `RFFT3D` is computed on, if `fft_length` is smaller than the\ncorresponding dimension of `input`, the dimension is cropped. If it is larger,\nthe dimension is padded with zeros."
 }
 op {
   name: "RGBToHSV"
   input_arg {
     name: "images"
-    description: "1-D or higher rank. RGB data to convert. Last dimension must be size 3."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "`images` converted to HSV."
     type_attr: "T"
   }
   attr {
@@ -19536,29 +18043,26 @@ op {
     }
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Converts one or more images from RGB to HSV."
-  description: "Outputs a tensor of the same shape as the `images` tensor, containing the HSV\nvalue of the pixels. The output is only well defined if the value in `images`\nare in `[0,1]`.\n\n`output[..., 0]` contains hue, `output[..., 1]` contains saturation, and\n`output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0\ncorresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue."
 }
 op {
   name: "RandomCrop"
   input_arg {
     name: "image"
-    description: "3-D of shape `[height, width, channels]`."
     type_attr: "T"
   }
   input_arg {
     name: "size"
-    description: "1-D of length 2 containing: `crop_height`, `crop_width`.."
     type: DT_INT64
   }
   output_arg {
     name: "output"
-    description: "3-D of shape `[crop_height, crop_width, channels].`"
     type_attr: "T"
   }
   attr {
@@ -19582,7 +18086,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either seed or seed2 are set to be non-zero, the random number\ngenerator is seeded by the given seed.  Otherwise, it is seeded by a\nrandom seed."
   }
   attr {
     name: "seed2"
@@ -19590,31 +18093,53 @@ op {
     default_value {
       i: 0
     }
-    description: "An second seed to avoid seed collision."
   }
-  summary: "Randomly crop `image`."
-  description: "`size` is a 1-D int64 tensor with 2 elements representing the crop height and\nwidth.  The values must be non negative.\n\nThis Op picks a random location in `image` and crops a `height` by `width`\nrectangle from that location.  The random location is picked so the cropped\narea will fit inside the original image."
   deprecation {
     version: 8
     explanation: "Random crop is now pure Python"
   }
   is_stateful: true
 }
+op {
+  name: "RandomDataset"
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "RandomGamma"
   input_arg {
     name: "shape"
-    description: "1-D integer tensor. Shape of independent samples to draw from each\ndistribution described by the shape parameters given in alpha."
     type_attr: "S"
   }
   input_arg {
     name: "alpha"
-    description: "A tensor in which each scalar is a \"shape\" parameter describing the\nassociated gamma distribution."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "A tensor with shape `shape + shape(alpha)`. Each slice\n`[:, ..., :, i0, i1, ...iN]` contains the samples drawn for\n`alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha."
     type_attr: "T"
   }
   attr {
@@ -19623,7 +18148,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either `seed` or `seed2` are set to be non-zero, the random number\ngenerator is seeded by the given seed.  Otherwise, it is seeded by a\nrandom seed."
   }
   attr {
     name: "seed2"
@@ -19631,7 +18155,6 @@ op {
     default_value {
       i: 0
     }
-    description: "A second seed to avoid seed collision."
   }
   attr {
     name: "S"
@@ -19654,8 +18177,6 @@ op {
       }
     }
   }
-  summary: "Outputs random values from the Gamma distribution(s) described by alpha."
-  description: "This op uses the algorithm by Marsaglia et al. to acquire samples via\ntransformation-rejection from pairs of uniform and normal random variables.\nSee http://dl.acm.org/citation.cfm?id=358414"
   is_stateful: true
 }
 op {
@@ -19707,7 +18228,6 @@ op {
       }
     }
   }
-  summary: "Use RandomPoissonV2 instead."
   deprecation {
     version: 25
     explanation: "Replaced by RandomPoissonV2"
@@ -19718,17 +18238,14 @@ op {
   name: "RandomPoissonV2"
   input_arg {
     name: "shape"
-    description: "1-D integer tensor. Shape of independent samples to draw from each\ndistribution described by the shape parameters given in rate."
     type_attr: "S"
   }
   input_arg {
     name: "rate"
-    description: "A tensor in which each scalar is a \"rate\" parameter describing the\nassociated poisson distribution."
     type_attr: "R"
   }
   output_arg {
     name: "output"
-    description: "A tensor with shape `shape + shape(rate)`. Each slice\n`[:, ..., :, i0, i1, ...iN]` contains the samples drawn for\n`rate[i0, i1, ...iN]`."
     type_attr: "dtype"
   }
   attr {
@@ -19737,7 +18254,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either `seed` or `seed2` are set to be non-zero, the random number\ngenerator is seeded by the given seed.  Otherwise, it is seeded by a\nrandom seed."
   }
   attr {
     name: "seed2"
@@ -19745,7 +18261,6 @@ op {
     default_value {
       i: 0
     }
-    description: "A second seed to avoid seed collision."
   }
   attr {
     name: "S"
@@ -19789,20 +18304,16 @@ op {
       }
     }
   }
-  summary: "Outputs random values from the Poisson distribution(s) described by rate."
-  description: "This op uses two algorithms, depending on rate. If rate >= 10, then\nthe algorithm by Hormann is used to acquire samples via\ntransformation-rejection.\nSee http://www.sciencedirect.com/science/article/pii/0167668793909974.\n\nOtherwise, Knuth\'s algorithm is used to acquire samples via multiplying uniform\nrandom variables.\nSee Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer\nProgramming, Volume 2. Addison Wesley"
   is_stateful: true
 }
 op {
   name: "RandomShuffle"
   input_arg {
     name: "value"
-    description: "The tensor to be shuffled."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "A tensor of same shape and type as `value`, shuffled along its first\ndimension."
     type_attr: "T"
   }
   attr {
@@ -19811,7 +18322,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either `seed` or `seed2` are set to be non-zero, the random number\ngenerator is seeded by the given seed.  Otherwise, it is seeded by a\nrandom seed."
   }
   attr {
     name: "seed2"
@@ -19819,28 +18329,23 @@ op {
     default_value {
       i: 0
     }
-    description: "A second seed to avoid seed collision."
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Randomly shuffles a tensor along its first dimension."
-  description: "  The tensor is shuffled along dimension 0, such that each `value[j]` is mapped\n  to one and only one `output[i]`. For example, a mapping that might occur for a\n  3x2 tensor is:\n\n```\n[[1, 2],       [[5, 6],\n [3, 4],  ==>   [1, 2],\n [5, 6]]        [3, 4]]\n```"
   is_stateful: true
 }
 op {
   name: "RandomShuffleQueue"
   output_arg {
     name: "handle"
-    description: "The handle to the queue."
     type: DT_STRING
     is_ref: true
   }
   attr {
     name: "component_types"
     type: "list(type)"
-    description: "The type of each component in a value."
     has_minimum: true
     minimum: 1
   }
@@ -19851,7 +18356,6 @@ op {
       list {
       }
     }
-    description: "The shape of each component in a value. The length of this attr must\nbe either 0 or the same as the length of component_types. If the length of\nthis attr is 0, the shapes of queue elements are not constrained, and\nonly one element may be dequeued at a time."
     has_minimum: true
   }
   attr {
@@ -19860,7 +18364,6 @@ op {
     default_value {
       i: -1
     }
-    description: "The upper bound on the number of elements in this queue.\nNegative numbers mean no limit."
   }
   attr {
     name: "min_after_dequeue"
@@ -19868,7 +18371,6 @@ op {
     default_value {
       i: 0
     }
-    description: "Dequeue will block unless there would be this\nmany elements after the dequeue or the queue is closed. This\nensures a minimum level of mixing of elements."
   }
   attr {
     name: "seed"
@@ -19876,7 +18378,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either seed or seed2 is set to be non-zero, the random number\ngenerator is seeded by the given seed.  Otherwise, a random seed is used."
   }
   attr {
     name: "seed2"
@@ -19884,7 +18385,6 @@ op {
     default_value {
       i: 0
     }
-    description: "A second seed to avoid seed collision."
   }
   attr {
     name: "container"
@@ -19892,7 +18392,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this queue is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -19900,22 +18399,18 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this queue will be shared under the given name\nacross multiple sessions."
   }
-  summary: "A queue that randomizes the order of elements."
   is_stateful: true
 }
 op {
   name: "RandomShuffleQueueV2"
   output_arg {
     name: "handle"
-    description: "The handle to the queue."
     type: DT_RESOURCE
   }
   attr {
     name: "component_types"
     type: "list(type)"
-    description: "The type of each component in a value."
     has_minimum: true
     minimum: 1
   }
@@ -19926,7 +18421,6 @@ op {
       list {
       }
     }
-    description: "The shape of each component in a value. The length of this attr must\nbe either 0 or the same as the length of component_types. If the length of\nthis attr is 0, the shapes of queue elements are not constrained, and\nonly one element may be dequeued at a time."
     has_minimum: true
   }
   attr {
@@ -19935,7 +18429,6 @@ op {
     default_value {
       i: -1
     }
-    description: "The upper bound on the number of elements in this queue.\nNegative numbers mean no limit."
   }
   attr {
     name: "min_after_dequeue"
@@ -19943,7 +18436,6 @@ op {
     default_value {
       i: 0
     }
-    description: "Dequeue will block unless there would be this\nmany elements after the dequeue or the queue is closed. This\nensures a minimum level of mixing of elements."
   }
   attr {
     name: "seed"
@@ -19951,7 +18443,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either seed or seed2 is set to be non-zero, the random number\ngenerator is seeded by the given seed.  Otherwise, a random seed is used."
   }
   attr {
     name: "seed2"
@@ -19959,7 +18450,6 @@ op {
     default_value {
       i: 0
     }
-    description: "A second seed to avoid seed collision."
   }
   attr {
     name: "container"
@@ -19967,7 +18457,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this queue is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -19975,21 +18464,17 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this queue will be shared under the given name\nacross multiple sessions."
   }
-  summary: "A queue that randomizes the order of elements."
   is_stateful: true
 }
 op {
   name: "RandomStandardNormal"
   input_arg {
     name: "shape"
-    description: "The shape of the output tensor."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "A tensor of the specified shape filled with random normal values."
     type_attr: "dtype"
   }
   attr {
@@ -19998,7 +18483,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either `seed` or `seed2` are set to be non-zero, the random number\ngenerator is seeded by the given seed.  Otherwise, it is seeded by a\nrandom seed."
   }
   attr {
     name: "seed2"
@@ -20006,15 +18490,14 @@ op {
     default_value {
       i: 0
     }
-    description: "A second seed to avoid seed collision."
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "The type of the output."
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -20030,20 +18513,16 @@ op {
       }
     }
   }
-  summary: "Outputs random values from a normal distribution."
-  description: "The generated values will have mean 0 and standard deviation 1."
   is_stateful: true
 }
 op {
   name: "RandomUniform"
   input_arg {
     name: "shape"
-    description: "The shape of the output tensor."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "A tensor of the specified shape filled with uniform random values."
     type_attr: "dtype"
   }
   attr {
@@ -20052,7 +18531,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either `seed` or `seed2` are set to be non-zero, the random number\ngenerator is seeded by the given seed.  Otherwise, it is seeded by a\nrandom seed."
   }
   attr {
     name: "seed2"
@@ -20060,15 +18538,14 @@ op {
     default_value {
       i: 0
     }
-    description: "A second seed to avoid seed collision."
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "The type of the output."
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -20084,30 +18561,24 @@ op {
       }
     }
   }
-  summary: "Outputs random values from a uniform distribution."
-  description: "The generated values follow a uniform distribution in the range `[0, 1)`. The\nlower bound 0 is included in the range, while the upper bound 1 is excluded."
   is_stateful: true
 }
 op {
   name: "RandomUniformInt"
   input_arg {
     name: "shape"
-    description: "The shape of the output tensor."
     type_attr: "T"
   }
   input_arg {
     name: "minval"
-    description: "0-D.  Inclusive lower bound on the generated integers."
     type_attr: "Tout"
   }
   input_arg {
     name: "maxval"
-    description: "0-D.  Exclusive upper bound on the generated integers."
     type_attr: "Tout"
   }
   output_arg {
     name: "output"
-    description: "A tensor of the specified shape filled with uniform random integers."
     type_attr: "Tout"
   }
   attr {
@@ -20116,7 +18587,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either `seed` or `seed2` are set to be non-zero, the random number\ngenerator is seeded by the given seed.  Otherwise, it is seeded by a\nrandom seed."
   }
   attr {
     name: "seed2"
@@ -20124,7 +18594,6 @@ op {
     default_value {
       i: 0
     }
-    description: "A second seed to avoid seed collision."
   }
   attr {
     name: "Tout"
@@ -20146,30 +18615,24 @@ op {
       }
     }
   }
-  summary: "Outputs random integers from a uniform distribution."
-  description: "The generated values are uniform integers in the range `[minval, maxval)`.\nThe lower bound `minval` is included in the range, while the upper bound\n`maxval` is excluded.\n\nThe random integers are slightly biased unless `maxval - minval` is an exact\npower of two.  The bias is small for values of `maxval - minval` significantly\nsmaller than the range of the output (either `2^32` or `2^64`)."
   is_stateful: true
 }
 op {
   name: "Range"
   input_arg {
     name: "start"
-    description: "0-D (scalar). First entry in the sequence."
     type_attr: "Tidx"
   }
   input_arg {
     name: "limit"
-    description: "0-D (scalar). Upper limit of sequence, exclusive."
     type_attr: "Tidx"
   }
   input_arg {
     name: "delta"
-    description: "0-D (scalar). Optional. Default is 1. Number that increments `start`."
     type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    description: "1-D."
     type_attr: "Tidx"
   }
   attr {
@@ -20180,6 +18643,7 @@ op {
     }
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -20187,24 +18651,19 @@ op {
       }
     }
   }
-  summary: "Creates a sequence of numbers."
-  description: "This operation creates a sequence of numbers that begins at `start` and\nextends by increments of `delta` up to but not including `limit`.\n\nFor example:\n\n```\n# \'start\' is 3\n# \'limit\' is 18\n# \'delta\' is 3\ntf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]\n```"
 }
 op {
   name: "RangeDataset"
   input_arg {
     name: "start"
-    description: "corresponds to start in python\'s xrange()."
     type: DT_INT64
   }
   input_arg {
     name: "stop"
-    description: "corresponds to stop in python\'s xrange()."
     type: DT_INT64
   }
   input_arg {
     name: "step"
-    description: "corresponds to step in python\'s xrange()."
     type: DT_INT64
   }
   output_arg {
@@ -20223,7 +18682,6 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset with a range of values. Corresponds to python\'s xrange."
   is_stateful: true
 }
 op {
@@ -20240,8 +18698,6 @@ op {
     name: "T"
     type: "type"
   }
-  summary: "Returns the rank of a tensor."
-  description: "This operation returns an integer representing the rank of `input`.\n\nFor example:\n\n```\n# \'t\' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]\n# shape of tensor \'t\' is [2, 2, 3]\nrank(t) ==> 3\n```\n\n**Note**: The rank of a tensor is not the same as the rank of a matrix. The rank\nof a tensor is the number of indices required to uniquely select each element\nof the tensor. Rank is also known as \"order\", \"degree\", or \"ndims.\""
 }
 op {
   name: "ReadFile"
@@ -20253,13 +18709,11 @@ op {
     name: "contents"
     type: DT_STRING
   }
-  summary: "Reads and outputs the entire contents of the input filename."
 }
 op {
   name: "ReadVariableOp"
   input_arg {
     name: "resource"
-    description: "handle to the resource in which to store the variable."
     type: DT_RESOURCE
   }
   output_arg {
@@ -20269,17 +18723,13 @@ op {
   attr {
     name: "dtype"
     type: "type"
-    description: "the dtype of the value."
   }
-  summary: "Reads the value of a variable."
-  description: "The tensor returned by this operation is immutable.\n\nThe value returned by this operation is guaranteed to be influenced by all the\nwrites on which this operation depends directly or indirectly, and to not be\ninfluenced by any of the writes which depend directly or indirectly on this\noperation."
   is_stateful: true
 }
 op {
   name: "ReaderNumRecordsProduced"
   input_arg {
     name: "reader_handle"
-    description: "Handle to a Reader."
     type: DT_STRING
     is_ref: true
   }
@@ -20287,29 +18737,23 @@ op {
     name: "records_produced"
     type: DT_INT64
   }
-  summary: "Returns the number of records this Reader has produced."
-  description: "This is the same as the number of ReaderRead executions that have\nsucceeded."
 }
 op {
   name: "ReaderNumRecordsProducedV2"
   input_arg {
     name: "reader_handle"
-    description: "Handle to a Reader."
     type: DT_RESOURCE
   }
   output_arg {
     name: "records_produced"
     type: DT_INT64
   }
-  summary: "Returns the number of records this Reader has produced."
-  description: "This is the same as the number of ReaderRead executions that have\nsucceeded."
   is_stateful: true
 }
 op {
   name: "ReaderNumWorkUnitsCompleted"
   input_arg {
     name: "reader_handle"
-    description: "Handle to a Reader."
     type: DT_STRING
     is_ref: true
   }
@@ -20317,195 +18761,153 @@ op {
     name: "units_completed"
     type: DT_INT64
   }
-  summary: "Returns the number of work units this Reader has finished processing."
 }
 op {
   name: "ReaderNumWorkUnitsCompletedV2"
   input_arg {
     name: "reader_handle"
-    description: "Handle to a Reader."
     type: DT_RESOURCE
   }
   output_arg {
     name: "units_completed"
     type: DT_INT64
   }
-  summary: "Returns the number of work units this Reader has finished processing."
   is_stateful: true
 }
 op {
   name: "ReaderRead"
   input_arg {
     name: "reader_handle"
-    description: "Handle to a Reader."
     type: DT_STRING
     is_ref: true
   }
   input_arg {
     name: "queue_handle"
-    description: "Handle to a Queue, with string work items."
     type: DT_STRING
     is_ref: true
   }
   output_arg {
     name: "key"
-    description: "A scalar."
     type: DT_STRING
   }
   output_arg {
     name: "value"
-    description: "A scalar."
     type: DT_STRING
   }
-  summary: "Returns the next record (key, value pair) produced by a Reader."
-  description: "Will dequeue from the input queue if necessary (e.g. when the\nReader needs to start reading from a new file since it has finished\nwith the previous file)."
 }
 op {
   name: "ReaderReadUpTo"
   input_arg {
     name: "reader_handle"
-    description: "Handle to a `Reader`."
     type: DT_STRING
     is_ref: true
   }
   input_arg {
     name: "queue_handle"
-    description: "Handle to a `Queue`, with string work items."
     type: DT_STRING
     is_ref: true
   }
   input_arg {
     name: "num_records"
-    description: "number of records to read from `Reader`."
     type: DT_INT64
   }
   output_arg {
     name: "keys"
-    description: "A 1-D tensor."
     type: DT_STRING
   }
   output_arg {
     name: "values"
-    description: "A 1-D tensor."
     type: DT_STRING
   }
-  summary: "Returns up to `num_records` (key, value) pairs produced by a Reader."
-  description: "Will dequeue from the input queue if necessary (e.g. when the\nReader needs to start reading from a new file since it has finished\nwith the previous file).\nIt may return less than `num_records` even before the last batch."
 }
 op {
   name: "ReaderReadUpToV2"
   input_arg {
     name: "reader_handle"
-    description: "Handle to a `Reader`."
     type: DT_RESOURCE
   }
   input_arg {
     name: "queue_handle"
-    description: "Handle to a `Queue`, with string work items."
     type: DT_RESOURCE
   }
   input_arg {
     name: "num_records"
-    description: "number of records to read from `Reader`."
     type: DT_INT64
   }
   output_arg {
     name: "keys"
-    description: "A 1-D tensor."
     type: DT_STRING
   }
   output_arg {
     name: "values"
-    description: "A 1-D tensor."
     type: DT_STRING
   }
-  summary: "Returns up to `num_records` (key, value) pairs produced by a Reader."
-  description: "Will dequeue from the input queue if necessary (e.g. when the\nReader needs to start reading from a new file since it has finished\nwith the previous file).\nIt may return less than `num_records` even before the last batch."
   is_stateful: true
 }
 op {
   name: "ReaderReadV2"
   input_arg {
     name: "reader_handle"
-    description: "Handle to a Reader."
     type: DT_RESOURCE
   }
   input_arg {
     name: "queue_handle"
-    description: "Handle to a Queue, with string work items."
     type: DT_RESOURCE
   }
   output_arg {
     name: "key"
-    description: "A scalar."
     type: DT_STRING
   }
   output_arg {
     name: "value"
-    description: "A scalar."
     type: DT_STRING
   }
-  summary: "Returns the next record (key, value pair) produced by a Reader."
-  description: "Will dequeue from the input queue if necessary (e.g. when the\nReader needs to start reading from a new file since it has finished\nwith the previous file)."
   is_stateful: true
 }
 op {
   name: "ReaderReset"
   input_arg {
     name: "reader_handle"
-    description: "Handle to a Reader."
     type: DT_STRING
     is_ref: true
   }
-  summary: "Restore a Reader to its initial clean state."
 }
 op {
   name: "ReaderResetV2"
   input_arg {
     name: "reader_handle"
-    description: "Handle to a Reader."
     type: DT_RESOURCE
   }
-  summary: "Restore a Reader to its initial clean state."
   is_stateful: true
 }
 op {
   name: "ReaderRestoreState"
   input_arg {
     name: "reader_handle"
-    description: "Handle to a Reader."
     type: DT_STRING
     is_ref: true
   }
   input_arg {
     name: "state"
-    description: "Result of a ReaderSerializeState of a Reader with type\nmatching reader_handle."
     type: DT_STRING
   }
-  summary: "Restore a reader to a previously saved state."
-  description: "Not all Readers support being restored, so this can produce an\nUnimplemented error."
 }
 op {
   name: "ReaderRestoreStateV2"
   input_arg {
     name: "reader_handle"
-    description: "Handle to a Reader."
     type: DT_RESOURCE
   }
   input_arg {
     name: "state"
-    description: "Result of a ReaderSerializeState of a Reader with type\nmatching reader_handle."
     type: DT_STRING
   }
-  summary: "Restore a reader to a previously saved state."
-  description: "Not all Readers support being restored, so this can produce an\nUnimplemented error."
   is_stateful: true
 }
 op {
   name: "ReaderSerializeState"
   input_arg {
     name: "reader_handle"
-    description: "Handle to a Reader."
     type: DT_STRING
     is_ref: true
   }
@@ -20513,22 +18915,17 @@ op {
     name: "state"
     type: DT_STRING
   }
-  summary: "Produce a string tensor that encodes the state of a Reader."
-  description: "Not all Readers support being serialized, so this can produce an\nUnimplemented error."
 }
 op {
   name: "ReaderSerializeStateV2"
   input_arg {
     name: "reader_handle"
-    description: "Handle to a Reader."
     type: DT_RESOURCE
   }
   output_arg {
     name: "state"
     type: DT_STRING
   }
-  summary: "Produce a string tensor that encodes the state of a Reader."
-  description: "Not all Readers support being serialized, so this can produce an\nUnimplemented error."
   is_stateful: true
 }
 op {
@@ -20567,8 +18964,6 @@ op {
       }
     }
   }
-  summary: "Returns the real part of a complex number."
-  description: "Given a tensor `input` of complex numbers, this operation returns a tensor of\ntype `float` that is the real part of each element in `input`. All elements in\n`input` must be complex numbers of the form \\\\(a + bj\\\\), where *a* is the real\n part returned by this operation and *b* is the imaginary part.\n\nFor example:\n\n```\n# tensor \'input\' is [-2.25 + 4.75j, 3.25 + 5.75j]\ntf.real(input) ==> [-2.25, 3.25]\n```"
 }
 op {
   name: "RealDiv"
@@ -20590,6 +18985,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -20603,8 +18999,6 @@ op {
       }
     }
   }
-  summary: "Returns x / y element-wise for real types."
-  description: "If `x` and `y` are reals, this will return the floating-point division.\n\n*NOTE*: `Div` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "Reciprocal"
@@ -20622,6 +19016,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -20631,8 +19026,6 @@ op {
       }
     }
   }
-  summary: "Computes the reciprocal of x element-wise."
-  description: "I.e., \\\\(y = 1 / x\\\\)."
 }
 op {
   name: "ReciprocalGrad"
@@ -20654,6 +19047,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -20661,20 +19055,16 @@ op {
       }
     }
   }
-  summary: "Computes the gradient for the inverse of `x` wrt its input."
-  description: "Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`\nis the corresponding input gradient."
 }
 op {
   name: "RecordInput"
   output_arg {
     name: "records"
-    description: "A tensor of shape [batch_size]."
     type: DT_STRING
   }
   attr {
     name: "file_pattern"
     type: "string"
-    description: "Glob pattern for the data files."
   }
   attr {
     name: "file_random_seed"
@@ -20682,7 +19072,6 @@ op {
     default_value {
       i: 301
     }
-    description: "Random seeds used to produce randomized records."
   }
   attr {
     name: "file_shuffle_shift_ratio"
@@ -20690,7 +19079,6 @@ op {
     default_value {
       f: 0
     }
-    description: "Shifts the list of files after the list is randomly\nshuffled."
   }
   attr {
     name: "file_buffer_size"
@@ -20698,7 +19086,6 @@ op {
     default_value {
       i: 10000
     }
-    description: "The randomization shuffling buffer."
   }
   attr {
     name: "file_parallelism"
@@ -20706,7 +19093,6 @@ op {
     default_value {
       i: 16
     }
-    description: "How many sstables are opened and concurrently iterated over."
   }
   attr {
     name: "batch_size"
@@ -20714,26 +19100,28 @@ op {
     default_value {
       i: 32
     }
-    description: "The batch size."
   }
-  summary: "Emits randomized records."
+  attr {
+    name: "compression_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
   is_stateful: true
 }
 op {
   name: "ReduceJoin"
   input_arg {
     name: "inputs"
-    description: "The input to be joined.  All reduced indices must have non-zero size."
     type: DT_STRING
   }
   input_arg {
     name: "reduction_indices"
-    description: "The dimensions to reduce over.  Dimensions are reduced in the\norder specified.  Omitting `reduction_indices` is equivalent to passing\n`[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported."
     type: DT_INT32
   }
   output_arg {
     name: "output"
-    description: "Has shape equal to that of the input with reduced dimensions removed or\nset to `1` depending on `keep_dims`."
     type: DT_STRING
   }
   attr {
@@ -20742,7 +19130,6 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, retain reduced dimensions with length `1`."
   }
   attr {
     name: "separator"
@@ -20750,22 +19137,17 @@ op {
     default_value {
       s: ""
     }
-    description: "The separator to use when joining."
   }
-  summary: "Joins a string Tensor across the given dimensions."
-  description: "Computes the string join across dimensions in the given string Tensor of shape\n`[d_0, d_1, ..., d_n-1]`.  Returns a new Tensor created by joining the input\nstrings with the given separator (default: empty string).  Negative indices are\ncounted backwards from the end, with `-1` being equivalent to `n - 1`.\n\nFor example:\n\n```python\n# tensor `a` is [[\"a\", \"b\"], [\"c\", \"d\"]]\ntf.reduce_join(a, 0) ==> [\"ac\", \"bd\"]\ntf.reduce_join(a, 1) ==> [\"ab\", \"cd\"]\ntf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> [\"ac\", \"bd\"]\ntf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> [\"ab\", \"cd\"]\ntf.reduce_join(a, 0, keep_dims=True) ==> [[\"ac\", \"bd\"]]\ntf.reduce_join(a, 1, keep_dims=True) ==> [[\"ab\"], [\"cd\"]]\ntf.reduce_join(a, 0, separator=\".\") ==> [\"a.c\", \"b.d\"]\ntf.reduce_join(a, [0, 1]) ==> [\"acbd\"]\ntf.reduce_join(a, [1, 0]) ==> [\"abcd\"]\ntf.reduce_join(a, []) ==> [\"abcd\"]\n```"
 }
 op {
   name: "RefEnter"
   input_arg {
     name: "data"
-    description: "The tensor to be made available to the child frame."
     type_attr: "T"
     is_ref: true
   }
   output_arg {
     name: "output"
-    description: "The same tensor as `data`."
     type_attr: "T"
     is_ref: true
   }
@@ -20776,7 +19158,6 @@ op {
   attr {
     name: "frame_name"
     type: "string"
-    description: "The name of the child frame."
   }
   attr {
     name: "is_constant"
@@ -20784,7 +19165,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true, the output is constant within the child frame."
   }
   attr {
     name: "parallel_iterations"
@@ -20792,22 +19172,17 @@ op {
     default_value {
       i: 10
     }
-    description: "The number of iterations allowed to run in parallel."
   }
-  summary: "Creates or finds a child frame, and makes `data` available to the child frame."
-  description: "The unique `frame_name` is used by the `Executor` to identify frames. If\n`is_constant` is true, `output` is a constant in the child frame; otherwise\nit may be changed in the child frame. At most `parallel_iterations` iterations\nare run in parallel in the child frame."
 }
 op {
   name: "RefExit"
   input_arg {
     name: "data"
-    description: "The tensor to be made available to the parent frame."
     type_attr: "T"
     is_ref: true
   }
   output_arg {
     name: "output"
-    description: "The same tensor as `data`."
     type_attr: "T"
     is_ref: true
   }
@@ -20815,8 +19190,6 @@ op {
     name: "T"
     type: "type"
   }
-  summary: "Exits the current frame to its parent frame."
-  description: "Exit makes its input `data` available to the parent frame."
 }
 op {
   name: "RefIdentity"
@@ -20834,27 +19207,23 @@ op {
     name: "T"
     type: "type"
   }
-  summary: "Return the same ref tensor as the input ref tensor."
   allows_uninitialized_input: true
 }
 op {
   name: "RefMerge"
   input_arg {
     name: "inputs"
-    description: "The input tensors, exactly one of which will become available."
     type_attr: "T"
     number_attr: "N"
     is_ref: true
   }
   output_arg {
     name: "output"
-    description: "Will be set to the available input tensor."
     type_attr: "T"
     is_ref: true
   }
   output_arg {
     name: "value_index"
-    description: "The index of the chosen input tensor in `inputs`."
     type: DT_INT32
   }
   attr {
@@ -20867,20 +19236,16 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Forwards the value of an available tensor from `inputs` to `output`."
-  description: "`Merge` waits for at least one of the tensors in `inputs` to become available.\nIt is usually combined with `Switch` to implement branching.\n\n`Merge` forwards the first tensor for become available to `output`, and sets\n`value_index` to its index in `inputs`."
 }
 op {
   name: "RefNextIteration"
   input_arg {
     name: "data"
-    description: "The tensor to be made available to the next iteration."
     type_attr: "T"
     is_ref: true
   }
   output_arg {
     name: "output"
-    description: "The same tensor as `data`."
     type_attr: "T"
     is_ref: true
   }
@@ -20888,25 +19253,21 @@ op {
     name: "T"
     type: "type"
   }
-  summary: "Makes its input available to the next iteration."
 }
 op {
   name: "RefSelect"
   input_arg {
     name: "index"
-    description: "A scalar that determines the input that gets selected."
     type: DT_INT32
   }
   input_arg {
     name: "inputs"
-    description: "A list of ref tensors, one of which will be forwarded to `output`."
     type_attr: "T"
     number_attr: "N"
     is_ref: true
   }
   output_arg {
     name: "output"
-    description: "The forwarded tensor."
     type_attr: "T"
     is_ref: true
   }
@@ -20920,30 +19281,25 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Forwards the `index`th element of `inputs` to `output`."
 }
 op {
   name: "RefSwitch"
   input_arg {
     name: "data"
-    description: "The ref tensor to be forwarded to the appropriate output."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "pred"
-    description: "A scalar that specifies which output port will receive data."
     type: DT_BOOL
   }
   output_arg {
     name: "output_false"
-    description: "If `pred` is false, data will be forwarded to this output."
     type_attr: "T"
     is_ref: true
   }
   output_arg {
     name: "output_true"
-    description: "If `pred` is true, data will be forwarded to this output."
     type_attr: "T"
     is_ref: true
   }
@@ -20951,8 +19307,6 @@ op {
     name: "T"
     type: "type"
   }
-  summary: "Forwards the ref tensor `data` to the output port determined by `pred`."
-  description: "If `pred` is true, the `data` input is forwarded to `output_true`. Otherwise,\nthe data goes to `output_false`.\n\nSee also `Switch` and `Merge`."
   allows_uninitialized_input: true
 }
 op {
@@ -20973,10 +19327,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -20984,7 +19339,6 @@ op {
       }
     }
   }
-  summary: "Computes rectified linear: `max(features, 0)`."
 }
 op {
   name: "Relu6"
@@ -21004,10 +19358,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -21015,23 +19370,19 @@ op {
       }
     }
   }
-  summary: "Computes rectified linear 6: `min(max(features, 0), 6)`."
 }
 op {
   name: "Relu6Grad"
   input_arg {
     name: "gradients"
-    description: "The backpropagated gradients to the corresponding Relu6 operation."
     type_attr: "T"
   }
   input_arg {
     name: "features"
-    description: "The features passed as input to the corresponding Relu6 operation, or\nits output; using either one produces the same result."
     type_attr: "T"
   }
   output_arg {
     name: "backprops"
-    description: "The gradients:\n`gradients * (features > 0) * (features < 6)`."
     type_attr: "T"
   }
   attr {
@@ -21042,10 +19393,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -21053,23 +19405,19 @@ op {
       }
     }
   }
-  summary: "Computes rectified linear 6 gradients for a Relu6 operation."
 }
 op {
   name: "ReluGrad"
   input_arg {
     name: "gradients"
-    description: "The backpropagated gradients to the corresponding Relu operation."
     type_attr: "T"
   }
   input_arg {
     name: "features"
-    description: "The features passed as input to the corresponding Relu operation, OR\nthe outputs of that operation (both work equivalently)."
     type_attr: "T"
   }
   output_arg {
     name: "backprops"
-    description: "`gradients * (features > 0)`."
     type_attr: "T"
   }
   attr {
@@ -21080,10 +19428,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -21091,56 +19440,46 @@ op {
       }
     }
   }
-  summary: "Computes rectified linear gradients for a Relu operation."
 }
 op {
   name: "RemoteCall"
   input_arg {
     name: "target"
-    description: "A fully specified device name where we want to run the function."
     type: DT_STRING
   }
   input_arg {
     name: "args"
-    description: "A list of arguments for the function."
     type_list_attr: "Tin"
   }
   output_arg {
     name: "output"
-    description: "A list of return values."
     type_list_attr: "Tout"
   }
   attr {
     name: "Tin"
     type: "list(type)"
-    description: "The type list for the arguments."
     has_minimum: true
     minimum: 1
   }
   attr {
     name: "Tout"
     type: "list(type)"
-    description: "The type list for the return values."
     has_minimum: true
     minimum: 1
   }
   attr {
     name: "f"
     type: "func"
-    description: "The function to run remotely."
   }
-  summary: "Runs function `f` on a remote device indicated by `target`."
 }
 op {
   name: "RemoteFusedGraphExecute"
   input_arg {
     name: "inputs"
-    description: "Arbitrary number of tensors with arbitrary data types"
     type_list_attr: "Tinputs"
   }
   output_arg {
     name: "outputs"
-    description: "Arbitrary number of tensors with arbitrary data types"
     type_list_attr: "Toutputs"
   }
   attr {
@@ -21156,10 +19495,7 @@ op {
   attr {
     name: "serialized_remote_fused_graph_execute_info"
     type: "string"
-    description: "Serialized protocol buffer\nof RemoteFusedGraphExecuteInfo which contains graph specifications."
   }
-  summary: "Execute a sub graph on a remote processor."
-  description: "The graph specifications(such as graph itself, input tensors and output names)\nare stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo\nas serialized_remote_fused_graph_execute_info.\nThe specifications will be passed to a dedicated registered\nremote fused graph executor.  The executor will send the graph specifications\nto a remote processor and execute that graph.  The execution results\nwill be passed to consumer nodes as outputs of this node."
 }
 op {
   name: "RepeatDataset"
@@ -21169,7 +19505,6 @@ op {
   }
   input_arg {
     name: "count"
-    description: "A scalar representing the number of times that `input_dataset` should\nbe repeated. A value of `-1` indicates that it should be repeated infinitely."
     type: DT_INT64
   }
   output_arg {
@@ -21188,7 +19523,6 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset that emits the outputs of `input_dataset` `count` times."
 }
 op {
   name: "RequantizationRange"
@@ -21198,40 +19532,33 @@ op {
   }
   input_arg {
     name: "input_min"
-    description: "The float value that the minimum quantized input value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "input_max"
-    description: "The float value that the maximum quantized input value represents."
     type: DT_FLOAT
   }
   output_arg {
     name: "output_min"
-    description: "The computed min output."
     type: DT_FLOAT
   }
   output_arg {
     name: "output_max"
-    description: "the computed max output."
     type: DT_FLOAT
   }
   attr {
     name: "Tinput"
     type: "type"
-    description: "The type of the input."
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
-  summary: "Given a quantized tensor described by (input, input_min, input_max), outputs a"
-  description: "range that covers the actual values present in that tensor.  This op is\ntypically used to produce the requested_output_min and requested_output_max for\nRequantize."
 }
 op {
   name: "Requantize"
@@ -21241,22 +19568,18 @@ op {
   }
   input_arg {
     name: "input_min"
-    description: "The float value that the minimum quantized input value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "input_max"
-    description: "The float value that the maximum quantized input value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "requested_output_min"
-    description: "The float value that the minimum quantized output value represents."
     type: DT_FLOAT
   }
   input_arg {
     name: "requested_output_max"
-    description: "The float value that the maximum quantized output value represents."
     type: DT_FLOAT
   }
   output_arg {
@@ -21265,44 +19588,38 @@ op {
   }
   output_arg {
     name: "output_min"
-    description: "The requested_output_min value is copied into this output."
     type: DT_FLOAT
   }
   output_arg {
     name: "output_max"
-    description: "The requested_output_max value is copied into this output."
     type: DT_FLOAT
   }
   attr {
     name: "Tinput"
     type: "type"
-    description: "The type of the input."
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
   attr {
     name: "out_type"
     type: "type"
-    description: "The type of the output. Should be a lower bit depth than Tinput."
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
-  summary: "Convert the quantized \'input\' tensor into a lower-precision \'output\', using the"
-  description: "output range specified with \'requested_output_min\' and \'requested_output_max\'.\n\n[input_min, input_max] are scalar floats that specify the range for the float\ninterpretation of the \'input\' data. For example, if input_min is -1.0f and\ninput_max is 1.0f, and we are dealing with quint16 quantized data, then a 0\nvalue in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f."
 }
 op {
   name: "Reshape"
@@ -21312,7 +19629,6 @@ op {
   }
   input_arg {
     name: "shape"
-    description: "Defines the shape of the output tensor."
     type_attr: "Tshape"
   }
   output_arg {
@@ -21336,24 +19652,19 @@ op {
       }
     }
   }
-  summary: "Reshapes a tensor."
-  description: "Given `tensor`, this operation returns a tensor that has the same values\nas `tensor` with shape `shape`.\n\nIf one component of `shape` is the special value -1, the size of that dimension\nis computed so that the total size remains constant.  In particular, a `shape`\nof `[-1]` flattens into 1-D.  At most one component of `shape` can be -1.\n\nIf `shape` is 1-D or higher, then the operation returns a tensor with shape\n`shape` filled with the values of `tensor`. In this case, the number of elements\nimplied by `shape` must be the same as the number of elements in `tensor`.\n\nFor example:\n\n```\n# tensor \'t\' is [1, 2, 3, 4, 5, 6, 7, 8, 9]\n# tensor \'t\' has shape [9]\nreshape(t, [3, 3]) ==> [[1, 2, 3],\n                        [4, 5, 6],\n                        [7, 8, 9]]\n\n# tensor \'t\' is [[[1, 1], [2, 2]],\n#                [[3, 3], [4, 4]]]\n# tensor \'t\' has shape [2, 2, 2]\nreshape(t, [2, 4]) ==> [[1, 1, 2, 2],\n                        [3, 3, 4, 4]]\n\n# tensor \'t\' is [[[1, 1, 1],\n#                 [2, 2, 2]],\n#                [[3, 3, 3],\n#                 [4, 4, 4]],\n#                [[5, 5, 5],\n#                 [6, 6, 6]]]\n# tensor \'t\' has shape [3, 2, 3]\n# pass \'[-1]\' to flatten \'t\'\nreshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]\n\n# -1 can also be used to infer the shape\n\n# -1 is inferred to be 9:\nreshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],\n                         [4, 4, 4, 5, 5, 5, 6, 6, 6]]\n# -1 is inferred to be 2:\nreshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],\n                         [4, 4, 4, 5, 5, 5, 6, 6, 6]]\n# -1 is inferred to be 3:\nreshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1],\n                              [2, 2, 2],\n                              [3, 3, 3]],\n                             [[4, 4, 4],\n                              [5, 5, 5],\n                              [6, 6, 6]]]\n\n# tensor \'t\' is [7]\n# shape `[]` reshapes to a scalar\nreshape(t, []) ==> 7\n```"
 }
 op {
   name: "ResizeArea"
   input_arg {
     name: "images"
-    description: "4-D with shape `[batch, height, width, channels]`."
     type_attr: "T"
   }
   input_arg {
     name: "size"
-    description: "= A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The\nnew size for the images."
     type: DT_INT32
   }
   output_arg {
     name: "resized_images"
-    description: "4-D with shape\n`[batch, new_height, new_width, channels]`."
     type: DT_FLOAT
   }
   attr {
@@ -21379,26 +19690,20 @@ op {
     default_value {
       b: false
     }
-    description: "If true, rescale input by (new_height - 1) / (height - 1), which\nexactly aligns the 4 corners of images and resized images. If false, rescale\nby new_height / height. Treat similarly the width dimension."
   }
-  summary: "Resize `images` to `size` using area interpolation."
-  description: "Input images can be of different types but output images are always float.\n\nEach output pixel is computed by first transforming the pixel\'s footprint into\nthe input tensor and then averaging the pixels that intersect the footprint. An\ninput pixel\'s contribution to the average is weighted by the fraction of its\narea that intersects the footprint.  This is the same as OpenCV\'s INTER_AREA."
 }
 op {
   name: "ResizeBicubic"
   input_arg {
     name: "images"
-    description: "4-D with shape `[batch, height, width, channels]`."
     type_attr: "T"
   }
   input_arg {
     name: "size"
-    description: "= A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The\nnew size for the images."
     type: DT_INT32
   }
   output_arg {
     name: "resized_images"
-    description: "4-D with shape\n`[batch, new_height, new_width, channels]`."
     type: DT_FLOAT
   }
   attr {
@@ -21424,26 +19729,20 @@ op {
     default_value {
       b: false
     }
-    description: "If true, rescale input by (new_height - 1) / (height - 1), which\nexactly aligns the 4 corners of images and resized images. If false, rescale\nby new_height / height. Treat similarly the width dimension."
   }
-  summary: "Resize `images` to `size` using bicubic interpolation."
-  description: "Input images can be of different types but output images are always float."
 }
 op {
   name: "ResizeBicubicGrad"
   input_arg {
     name: "grads"
-    description: "4-D with shape `[batch, height, width, channels]`."
     type: DT_FLOAT
   }
   input_arg {
     name: "original_image"
-    description: "4-D with shape `[batch, orig_height, orig_width, channels]`,\nThe image tensor that was resized."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "4-D with shape `[batch, orig_height, orig_width, channels]`.\nGradients with respect to the input image. Input image must have been\nfloat or double."
     type_attr: "T"
   }
   attr {
@@ -21462,25 +19761,20 @@ op {
     default_value {
       b: false
     }
-    description: "If true, rescale grads by (orig_height - 1) / (height - 1), which\nexactly aligns the 4 corners of grads and original_image. If false, rescale by\norig_height / height. Treat similarly the width dimension."
   }
-  summary: "Computes the gradient of bicubic interpolation."
 }
 op {
   name: "ResizeBilinear"
   input_arg {
     name: "images"
-    description: "4-D with shape `[batch, height, width, channels]`."
     type_attr: "T"
   }
   input_arg {
     name: "size"
-    description: "= A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The\nnew size for the images."
     type: DT_INT32
   }
   output_arg {
     name: "resized_images"
-    description: "4-D with shape\n`[batch, new_height, new_width, channels]`."
     type: DT_FLOAT
   }
   attr {
@@ -21494,6 +19788,7 @@ op {
         type: DT_UINT16
         type: DT_INT32
         type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -21506,26 +19801,20 @@ op {
     default_value {
       b: false
     }
-    description: "If true, rescale input by (new_height - 1) / (height - 1), which\nexactly aligns the 4 corners of images and resized images. If false, rescale\nby new_height / height. Treat similarly the width dimension."
   }
-  summary: "Resize `images` to `size` using bilinear interpolation."
-  description: "Input images can be of different types but output images are always float."
 }
 op {
   name: "ResizeBilinearGrad"
   input_arg {
     name: "grads"
-    description: "4-D with shape `[batch, height, width, channels]`."
     type: DT_FLOAT
   }
   input_arg {
     name: "original_image"
-    description: "4-D with shape `[batch, orig_height, orig_width, channels]`,\nThe image tensor that was resized."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "4-D with shape `[batch, orig_height, orig_width, channels]`.\nGradients with respect to the input image. Input image must have been\nfloat or double."
     type_attr: "T"
   }
   attr {
@@ -21534,6 +19823,7 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_BFLOAT16
         type: DT_HALF
         type: DT_DOUBLE
       }
@@ -21545,25 +19835,20 @@ op {
     default_value {
       b: false
     }
-    description: "If true, rescale grads by (orig_height - 1) / (height - 1), which\nexactly aligns the 4 corners of grads and original_image. If false, rescale by\norig_height / height. Treat similarly the width dimension."
   }
-  summary: "Computes the gradient of bilinear interpolation."
 }
 op {
   name: "ResizeNearestNeighbor"
   input_arg {
     name: "images"
-    description: "4-D with shape `[batch, height, width, channels]`."
     type_attr: "T"
   }
   input_arg {
     name: "size"
-    description: "= A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The\nnew size for the images."
     type: DT_INT32
   }
   output_arg {
     name: "resized_images"
-    description: "4-D with shape\n`[batch, new_height, new_width, channels]`."
     type_attr: "T"
   }
   attr {
@@ -21589,25 +19874,20 @@ op {
     default_value {
       b: false
     }
-    description: "If true, rescale input by (new_height - 1) / (height - 1), which\nexactly aligns the 4 corners of images and resized images. If false, rescale\nby new_height / height. Treat similarly the width dimension."
   }
-  summary: "Resize `images` to `size` using nearest neighbor interpolation."
 }
 op {
   name: "ResizeNearestNeighborGrad"
   input_arg {
     name: "grads"
-    description: "4-D with shape `[batch, height, width, channels]`."
     type_attr: "T"
   }
   input_arg {
     name: "size"
-    description: "= A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The\noriginal input size."
     type: DT_INT32
   }
   output_arg {
     name: "output"
-    description: "4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients\nwith respect to the input image."
     type_attr: "T"
   }
   attr {
@@ -21630,45 +19910,36 @@ op {
     default_value {
       b: false
     }
-    description: "If true, rescale grads by (orig_height - 1) / (height - 1), which\nexactly aligns the 4 corners of grads and original_image. If false, rescale by\norig_height / height. Treat similarly the width dimension."
   }
-  summary: "Computes the gradient of nearest neighbor interpolation."
 }
 op {
   name: "ResourceApplyAdadelta"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "accum_update"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "rho"
-    description: "Decay factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "epsilon"
-    description: "Constant factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   attr {
@@ -21678,17 +19949,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -21701,32 +19973,25 @@ op {
     default_value {
       b: false
     }
-    description: "If True, updating of the var, accum and update_accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Update \'*var\' according to the adadelta scheme."
-  description: "accum = rho() * accum + (1 - rho()) * grad.square();\nupdate = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;\nupdate_accum = rho() * update_accum + (1 - rho()) * update.square();\nvar -= update;"
   is_stateful: true
 }
 op {
   name: "ResourceApplyAdagrad"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   attr {
@@ -21736,17 +20001,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -21759,52 +20025,41 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update \'*var\' according to the adagrad scheme."
-  description: "accum += grad * grad\nvar -= lr * grad * (1 / sqrt(accum))"
   is_stateful: true
 }
 op {
   name: "ResourceApplyAdagradDA"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "gradient_accumulator"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "gradient_squared_accumulator"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l1"
-    description: "L1 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l2"
-    description: "L2 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "global_step"
-    description: "Training step number. Must be a scalar."
     type: DT_INT64
   }
   attr {
@@ -21814,17 +20069,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -21837,61 +20093,49 @@ op {
     default_value {
       b: false
     }
-    description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Update \'*var\' according to the proximal adagrad scheme."
   is_stateful: true
 }
 op {
   name: "ResourceApplyAdam"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "m"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "v"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "beta1_power"
-    description: "Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "beta2_power"
-    description: "Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "beta1"
-    description: "Momentum factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "beta2"
-    description: "Momentum factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "epsilon"
-    description: "Ridge term. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   attr {
@@ -21901,17 +20145,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -21924,7 +20169,6 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var, m, and v tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
   attr {
     name: "use_nesterov"
@@ -21932,47 +20176,37 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, uses the nesterov update."
   }
-  summary: "Update \'*var\' according to the Adam algorithm."
-  description: "lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)\nm_t <- beta1 * m_{t-1} + (1 - beta1) * g_t\nv_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t\nvariable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)"
   is_stateful: true
 }
 op {
   name: "ResourceApplyAddSign"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "m"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "alpha"
-    description: "Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "sign_decay"
-    description: "Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "beta"
-    description: "Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   attr {
@@ -21982,17 +20216,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -22005,42 +20240,33 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var and m tensors is\nprotected by a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update \'*var\' according to the AddSign update."
-  description: "m_t <- beta1 * m_{t-1} + (1 - beta1) * g\nupdate <- (alpha + sign_decay * sign(g) *sign(m)) * g\nvariable <- variable - lr_t * update"
   is_stateful: true
 }
 op {
   name: "ResourceApplyCenteredRMSProp"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "mg"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "ms"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "mom"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "rho"
-    description: "Decay rate. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
@@ -22049,12 +20275,10 @@ op {
   }
   input_arg {
     name: "epsilon"
-    description: "Ridge term. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   attr {
@@ -22064,17 +20288,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -22087,52 +20312,41 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var, mg, ms, and mom tensors is\nprotected by a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update \'*var\' according to the centered RMSProp algorithm."
-  description: "The centered RMSProp algorithm uses an estimate of the centered second moment\n(i.e., the variance) for normalization, as opposed to regular RMSProp, which\nuses the (uncentered) second moment. This often helps with training, but is\nslightly more expensive in terms of computation and memory.\n\nNote that in dense implementation of this algorithm, mg, ms, and mom will\nupdate even if the grad is zero, but in this sparse implementation, mg, ms,\nand mom will not update in iterations during which the grad is zero.\n\nmean_square = decay * mean_square + (1-decay) * gradient ** 2\nmean_grad = decay * mean_grad + (1-decay) * gradient\n\nDelta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)\n\nmg <- rho * mg_{t-1} + (1-rho) * grad\nms <- rho * ms_{t-1} + (1-rho) * grad * grad\nmom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)\nvar <- var - mom"
   is_stateful: true
 }
 op {
   name: "ResourceApplyFtrl"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "linear"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l1"
-    description: "L1 regulariation. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l2"
-    description: "L2 regulariation. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "lr_power"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   attr {
@@ -22142,17 +20356,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -22165,47 +20380,37 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update \'*var\' according to the Ftrl-proximal scheme."
-  description: "accum_new = accum + grad * grad\nlinear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new"
   is_stateful: true
 }
 op {
   name: "ResourceApplyFtrlV2"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "linear"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l1"
-    description: "L1 regulariation. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l2"
-    description: "L2 shrinkage regulariation. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
@@ -22214,7 +20419,6 @@ op {
   }
   input_arg {
     name: "lr_power"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   attr {
@@ -22224,17 +20428,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -22247,27 +20452,21 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update \'*var\' according to the Ftrl-proximal scheme."
-  description: "grad_with_shrinkage = grad + 2 * l2_shrinkage * var\naccum_new = accum + grad_with_shrinkage * grad_with_shrinkage\nlinear += grad_with_shrinkage +\n    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new"
   is_stateful: true
 }
 op {
   name: "ResourceApplyGradientDescent"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "alpha"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "delta"
-    description: "The change."
     type_attr: "T"
   }
   attr {
@@ -22277,17 +20476,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -22300,36 +20500,29 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Update \'*var\' by subtracting \'alpha\' * \'delta\' from it."
   is_stateful: true
 }
 op {
   name: "ResourceApplyMomentum"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "momentum"
-    description: "Momentum. Must be a scalar."
     type_attr: "T"
   }
   attr {
@@ -22339,17 +20532,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -22362,7 +20556,6 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
   attr {
     name: "use_nesterov"
@@ -22370,47 +20563,37 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, the tensor passed to compute grad will be\nvar - lr * momentum * accum, so in the end, the var you get is actually\nvar - lr * momentum * accum."
   }
-  summary: "Update \'*var\' according to the momentum scheme. Set use_nesterov = True if you"
-  description: "want to use Nesterov momentum.\n\naccum = accum * momentum + grad\nvar -= lr * accum"
   is_stateful: true
 }
 op {
   name: "ResourceApplyPowerSign"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "m"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "logbase"
-    description: "Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "sign_decay"
-    description: "Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "beta"
-    description: "Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   attr {
@@ -22420,17 +20603,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -22443,42 +20627,33 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var and m tensors is\nprotected by a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update \'*var\' according to the AddSign update."
-  description: "m_t <- beta1 * m_{t-1} + (1 - beta1) * g\nupdate <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g\nvariable <- variable - lr_t * update"
   is_stateful: true
 }
 op {
   name: "ResourceApplyProximalAdagrad"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l1"
-    description: "L1 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l2"
-    description: "L2 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   attr {
@@ -22488,17 +20663,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -22511,37 +20687,29 @@ op {
     default_value {
       b: false
     }
-    description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Update \'*var\' and \'*accum\' according to FOBOS with Adagrad learning rate."
-  description: "accum += grad * grad\nprox_v = var - lr * grad * (1 / sqrt(accum))\nvar = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}"
   is_stateful: true
 }
 op {
   name: "ResourceApplyProximalGradientDescent"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "alpha"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l1"
-    description: "L1 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l2"
-    description: "L2 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "delta"
-    description: "The change."
     type_attr: "T"
   }
   attr {
@@ -22551,17 +20719,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -22574,37 +20743,29 @@ op {
     default_value {
       b: false
     }
-    description: "If True, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Update \'*var\' as FOBOS algorithm with fixed learning rate."
-  description: "prox_v = var - alpha * delta\nvar = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}"
   is_stateful: true
 }
 op {
   name: "ResourceApplyRMSProp"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "ms"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "mom"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "rho"
-    description: "Decay rate. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
@@ -22613,12 +20774,10 @@ op {
   }
   input_arg {
     name: "epsilon"
-    description: "Ridge term. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   attr {
@@ -22628,17 +20787,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -22651,28 +20811,22 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var, ms, and mom tensors is protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update \'*var\' according to the RMSProp algorithm."
-  description: "Note that in dense implementation of this algorithm, ms and mom will\nupdate even if the grad is zero, but in this sparse implementation, ms\nand mom will not update in iterations during which the grad is zero.\n\nmean_square = decay * mean_square + (1-decay) * gradient ** 2\nDelta = learning_rate * gradient / sqrt(mean_square + epsilon)\n\nms <- rho * ms_{t-1} + (1-rho) * grad * grad\nmom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)\nvar <- var - mom"
   is_stateful: true
 }
 op {
   name: "ResourceCountUpTo"
   input_arg {
     name: "resource"
-    description: "Should be from a scalar `Variable` node."
     type: DT_RESOURCE
   }
   output_arg {
     name: "output"
-    description: "A copy of the input before increment. If nothing else modifies the\ninput, the values produced will all be distinct."
     type_attr: "T"
   }
   attr {
     name: "limit"
     type: "int"
-    description: "If incrementing ref would bring it above limit, instead generates an\n\'OutOfRange\' error."
   }
   attr {
     name: "T"
@@ -22684,7 +20838,6 @@ op {
       }
     }
   }
-  summary: "Increments variable pointed to by \'resource\' until it reaches \'limit\'."
   is_stateful: true
 }
 op {
@@ -22722,25 +20875,20 @@ op {
       }
     }
   }
-  summary: "Gather slices from the variable pointed to by `resource` according to `indices`."
-  description: "`indices` must be an integer tensor of any dimension (usually 0-D or 1-D).\nProduces an output tensor with shape `indices.shape + params.shape[1:]` where:\n\n```python\n    # Scalar indices\n    output[:, ..., :] = params[indices, :, ... :]\n\n    # Vector indices\n    output[i, :, ..., :] = params[indices[i], :, ... :]\n\n    # Higher rank indices\n    output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]\n```"
   is_stateful: true
 }
 op {
   name: "ResourceScatterAdd"
   input_arg {
     name: "resource"
-    description: "Should be from a `Variable` node."
     type: DT_RESOURCE
   }
   input_arg {
     name: "indices"
-    description: "A tensor of indices into the first dimension of `ref`."
     type_attr: "Tindices"
   }
   input_arg {
     name: "updates"
-    description: "A tensor of updated values to add to `ref`."
     type_attr: "dtype"
   }
   attr {
@@ -22750,17 +20898,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -22777,50 +20926,25 @@ op {
       }
     }
   }
-  summary: "Adds sparse updates to the variable referenced by `resource`."
-  description: "This operation computes\n\n    # Scalar indices\n    ref[indices, ...] += updates[...]\n\n    # Vector indices (for each i)\n    ref[indices[i], ...] += updates[i, ...]\n\n    # High rank indices (for each i, ..., j)\n    ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]\n\nDuplicate entries are handled correctly: if multiple `indices` reference\nthe same location, their contributions add.\n\nRequires `updates.shape = indices.shape + ref.shape[1:]`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\'https://www.tensorflow.org/images/ScatterAdd.png\' alt>\n</div>"
   is_stateful: true
 }
 op {
-  name: "ResourceScatterUpdate"
+  name: "ResourceScatterNdUpdate"
   input_arg {
-    name: "resource"
-    description: "Should be from a `Variable` node."
+    name: "ref"
     type: DT_RESOURCE
   }
   input_arg {
     name: "indices"
-    description: "A tensor of indices into the first dimension of `ref`."
     type_attr: "Tindices"
   }
   input_arg {
     name: "updates"
-    description: "A tensor of updated values to add to `ref`."
-    type_attr: "dtype"
+    type_attr: "T"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
   }
   attr {
     name: "Tindices"
@@ -22832,49 +20956,77 @@ op {
       }
     }
   }
-  summary: "Assigns sparse updates to the variable referenced by `resource`."
-  description: "This operation computes\n\n    # Scalar indices\n    ref[indices, ...] = updates[...]\n\n    # Vector indices (for each i)\n    ref[indices[i], ...] = updates[i, ...]\n\n    # High rank indices (for each i, ..., j)\n    ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]"
-  is_stateful: true
-}
-op {
-  name: "ResourceSparseApplyAdadelta"
-  input_arg {
-    name: "var"
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterUpdate"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdadelta"
+  input_arg {
+    name: "var"
     type: DT_RESOURCE
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "accum_update"
-    description: ": Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "lr"
-    description: "Learning rate. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "rho"
-    description: "Decay factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "epsilon"
-    description: "Constant factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "A vector of indices into the first dimension of var and accum."
     type_attr: "Tindices"
   }
   attr {
@@ -22884,17 +21036,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -22917,36 +21070,29 @@ op {
     default_value {
       b: false
     }
-    description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "var: Should be from a Variable()."
   is_stateful: true
 }
 op {
   name: "ResourceSparseApplyAdagrad"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "lr"
-    description: "Learning rate. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "A vector of indices into the first dimension of var and accum."
     type_attr: "Tindices"
   }
   attr {
@@ -22956,17 +21102,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -22989,57 +21136,45 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update relevant entries in \'*var\' and \'*accum\' according to the adagrad scheme."
-  description: "That is for rows we have grad for, we update var and accum as follows:\naccum += grad * grad\nvar -= lr * grad * (1 / sqrt(accum))"
   is_stateful: true
 }
 op {
   name: "ResourceSparseApplyAdagradDA"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "gradient_accumulator"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "gradient_squared_accumulator"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "A vector of indices into the first dimension of var and accum."
     type_attr: "Tindices"
   }
   input_arg {
     name: "lr"
-    description: "Learning rate. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l1"
-    description: "L1 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l2"
-    description: "L2 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "global_step"
-    description: "Training step number. Must be a scalar."
     type: DT_INT64
   }
   attr {
@@ -23049,17 +21184,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -23082,41 +21218,33 @@ op {
     default_value {
       b: false
     }
-    description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Update entries in \'*var\' and \'*accum\' according to the proximal adagrad scheme."
   is_stateful: true
 }
 op {
   name: "ResourceSparseApplyCenteredRMSProp"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "mg"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "ms"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "mom"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "rho"
-    description: "Decay rate. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
@@ -23125,17 +21253,14 @@ op {
   }
   input_arg {
     name: "epsilon"
-    description: "Ridge term. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "A vector of indices into the first dimension of var, ms and mom."
     type_attr: "Tindices"
   }
   attr {
@@ -23145,17 +21270,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -23178,57 +21304,45 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var, mg, ms, and mom tensors is\nprotected by a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update \'*var\' according to the centered RMSProp algorithm."
-  description: "The centered RMSProp algorithm uses an estimate of the centered second moment\n(i.e., the variance) for normalization, as opposed to regular RMSProp, which\nuses the (uncentered) second moment. This often helps with training, but is\nslightly more expensive in terms of computation and memory.\n\nNote that in dense implementation of this algorithm, mg, ms, and mom will\nupdate even if the grad is zero, but in this sparse implementation, mg, ms,\nand mom will not update in iterations during which the grad is zero.\n\nmean_square = decay * mean_square + (1-decay) * gradient ** 2\nmean_grad = decay * mean_grad + (1-decay) * gradient\nDelta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)\n\nms <- rho * ms_{t-1} + (1-rho) * grad * grad\nmom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)\nvar <- var - mom"
   is_stateful: true
 }
 op {
   name: "ResourceSparseApplyFtrl"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "linear"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "A vector of indices into the first dimension of var and accum."
     type_attr: "Tindices"
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l1"
-    description: "L1 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l2"
-    description: "L2 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "lr_power"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   attr {
@@ -23238,17 +21352,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -23271,52 +21386,41 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update relevant entries in \'*var\' according to the Ftrl-proximal scheme."
-  description: "That is for rows we have grad for, we update var, accum and linear as follows:\naccum_new = accum + grad * grad\nlinear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new"
   is_stateful: true
 }
 op {
   name: "ResourceSparseApplyFtrlV2"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "linear"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "A vector of indices into the first dimension of var and accum."
     type_attr: "Tindices"
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l1"
-    description: "L1 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l2"
-    description: "L2 shrinkage regulariation. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
@@ -23325,7 +21429,6 @@ op {
   }
   input_arg {
     name: "lr_power"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   attr {
@@ -23335,17 +21438,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -23368,42 +21472,33 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update relevant entries in \'*var\' according to the Ftrl-proximal scheme."
-  description: "That is for rows we have grad for, we update var, accum and linear as follows:\ngrad_with_shrinkage = grad + 2 * l2_shrinkage * var\naccum_new = accum + grad_with_shrinkage * grad_with_shrinkage\nlinear += grad_with_shrinkage +\n    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new"
   is_stateful: true
 }
 op {
   name: "ResourceSparseApplyMomentum"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "lr"
-    description: "Learning rate. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "A vector of indices into the first dimension of var and accum."
     type_attr: "Tindices"
   }
   input_arg {
     name: "momentum"
-    description: "Momentum. Must be a scalar."
     type_attr: "T"
   }
   attr {
@@ -23413,17 +21508,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -23446,7 +21542,6 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
   attr {
     name: "use_nesterov"
@@ -23454,47 +21549,37 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, the tensor passed to compute grad will be\nvar - lr * momentum * accum, so in the end, the var you get is actually\nvar - lr * momentum * accum."
   }
-  summary: "Update relevant entries in \'*var\' and \'*accum\' according to the momentum scheme."
-  description: "Set use_nesterov = True if you want to use Nesterov momentum.\n\nThat is for rows we have grad for, we update var and accum as follows:\n\naccum = accum * momentum + grad\nvar -= lr * accum"
   is_stateful: true
 }
 op {
   name: "ResourceSparseApplyProximalAdagrad"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "lr"
-    description: "Learning rate. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l1"
-    description: "L1 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l2"
-    description: "L2 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "A vector of indices into the first dimension of var and accum."
     type_attr: "Tindices"
   }
   attr {
@@ -23504,17 +21589,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -23537,42 +21623,33 @@ op {
     default_value {
       b: false
     }
-    description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Sparse update entries in \'*var\' and \'*accum\' according to FOBOS algorithm."
-  description: "That is for rows we have grad for, we update var and accum as follows:\naccum += grad * grad\nprox_v = var\nprox_v -= lr * grad * (1 / sqrt(accum))\nvar = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}"
   is_stateful: true
 }
 op {
   name: "ResourceSparseApplyProximalGradientDescent"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "alpha"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l1"
-    description: "L1 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l2"
-    description: "L2 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "A vector of indices into the first dimension of var and accum."
     type_attr: "Tindices"
   }
   attr {
@@ -23582,17 +21659,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -23615,37 +21693,29 @@ op {
     default_value {
       b: false
     }
-    description: "If True, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Sparse update \'*var\' as FOBOS algorithm with fixed learning rate."
-  description: "That is for rows we have grad for, we update var as follows:\nprox_v = var - alpha * grad\nvar = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}"
   is_stateful: true
 }
 op {
   name: "ResourceSparseApplyRMSProp"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "ms"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "mom"
-    description: "Should be from a Variable()."
     type: DT_RESOURCE
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "rho"
-    description: "Decay rate. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
@@ -23654,17 +21724,14 @@ op {
   }
   input_arg {
     name: "epsilon"
-    description: "Ridge term. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "A vector of indices into the first dimension of var, ms and mom."
     type_attr: "Tindices"
   }
   attr {
@@ -23674,17 +21741,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -23707,10 +21775,7 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var, ms, and mom tensors is protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update \'*var\' according to the RMSProp algorithm."
-  description: "Note that in dense implementation of this algorithm, ms and mom will\nupdate even if the grad is zero, but in this sparse implementation, ms\nand mom will not update in iterations during which the grad is zero.\n\nmean_square = decay * mean_square + (1-decay) * gradient ** 2\nDelta = learning_rate * gradient / sqrt(mean_square + epsilon)\n\nms <- rho * ms_{t-1} + (1-rho) * grad * grad\nmom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)\nvar <- var - mom"
   is_stateful: true
 }
 op {
@@ -23784,31 +21849,25 @@ op {
       i: 0
     }
   }
-  summary: "Assign `value` to the sliced l-value reference of `ref`."
-  description: "The values of `value` are assigned to the positions in the variable\n`ref` that are selected by the slice parameters. The slice parameters\n`begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.\n\nNOTE this op currently does not support broadcasting and so `value`\'s\nshape must be exactly the shape produced by the slice of `ref`."
   is_stateful: true
 }
 op {
   name: "Restore"
   input_arg {
     name: "file_pattern"
-    description: "Must have a single element. The pattern of the files from\nwhich we read the tensor."
     type: DT_STRING
   }
   input_arg {
     name: "tensor_name"
-    description: "Must have a single element. The name of the tensor to be\nrestored."
     type: DT_STRING
   }
   output_arg {
     name: "tensor"
-    description: "The restored tensor."
     type_attr: "dt"
   }
   attr {
     name: "dt"
     type: "type"
-    description: "The type of the tensor to be restored."
   }
   attr {
     name: "preferred_shard"
@@ -23816,38 +21875,30 @@ op {
     default_value {
       i: -1
     }
-    description: "Index of file to open first if multiple files match\n`file_pattern`."
   }
-  summary: "Restores a tensor from checkpoint files."
-  description: "Reads a tensor stored in one or several files. If there are several files (for\ninstance because a tensor was saved as slices), `file_pattern` may contain\nwildcard symbols (`*` and `?`) in the filename portion only, not in the\ndirectory portion.\n\nIf a `file_pattern` matches several files, `preferred_shard` can be used to hint\nin which file the requested tensor is likely to be found. This op will first\nopen the file at index `preferred_shard` in the list of matching files and try\nto restore tensors from that file.  Only if some tensors or tensor slices are\nnot found in that first file, then the Op opens all the files. Setting\n`preferred_shard` to match the value passed as the `shard` input\nof a matching `Save` Op may speed up Restore.  This attribute only affects\nperformance, not correctness.  The default value -1 means files are processed in\norder.\n\nSee also `RestoreSlice`."
   is_stateful: true
 }
 op {
   name: "RestoreSlice"
   input_arg {
     name: "file_pattern"
-    description: "Must have a single element. The pattern of the files from\nwhich we read the tensor."
     type: DT_STRING
   }
   input_arg {
     name: "tensor_name"
-    description: "Must have a single element. The name of the tensor to be\nrestored."
     type: DT_STRING
   }
   input_arg {
     name: "shape_and_slice"
-    description: "Scalar. The shapes and slice specifications to use when\nrestoring a tensors."
     type: DT_STRING
   }
   output_arg {
     name: "tensor"
-    description: "The restored tensor."
     type_attr: "dt"
   }
   attr {
     name: "dt"
     type: "type"
-    description: "The type of the tensor to be restored."
   }
   attr {
     name: "preferred_shard"
@@ -23855,60 +21906,47 @@ op {
     default_value {
       i: -1
     }
-    description: "Index of file to open first if multiple files match\n`file_pattern`. See the documentation for `Restore`."
   }
-  summary: "Restores a tensor from checkpoint files."
-  description: "This is like `Restore` except that restored tensor can be listed as filling\nonly a slice of a larger tensor.  `shape_and_slice` specifies the shape of the\nlarger tensor and the slice that the restored tensor covers.\n\nThe `shape_and_slice` input has the same format as the\nelements of the `shapes_and_slices` input of the `SaveSlices` op."
   is_stateful: true
 }
 op {
   name: "RestoreV2"
   input_arg {
     name: "prefix"
-    description: "Must have a single element.  The prefix of a V2 checkpoint."
     type: DT_STRING
   }
   input_arg {
     name: "tensor_names"
-    description: "shape {N}.  The names of the tensors to be restored."
     type: DT_STRING
   }
   input_arg {
     name: "shape_and_slices"
-    description: "shape {N}.  The slice specs of the tensors to be restored.\nEmpty strings indicate that they are non-partitioned tensors."
     type: DT_STRING
   }
   output_arg {
     name: "tensors"
-    description: "shape {N}.  The restored tensors, whose shapes are read from the\ncheckpoint directly."
     type_list_attr: "dtypes"
   }
   attr {
     name: "dtypes"
     type: "list(type)"
-    description: "shape {N}.  The list of expected dtype for the tensors.  Must match\nthose stored in the checkpoint."
     has_minimum: true
     minimum: 1
   }
-  summary: "Restores tensors from a V2 checkpoint."
-  description: "For backward compatibility with the V1 format, this Op currently allows\nrestoring from a V1 checkpoint as well:\n  - This Op first attempts to find the V2 index file pointed to by \"prefix\", and\n    if found proceed to read it as a V2 checkpoint;\n  - Otherwise the V1 read path is invoked.\nRelying on this behavior is not recommended, as the ability to fall back to read\nV1 might be deprecated and eventually removed.\n\nBy default, restores the named tensors in full.  If the caller wishes to restore\nspecific slices of stored tensors, \"shape_and_slices\" should be non-empty\nstrings and correspondingly well-formed.\n\nCallers must ensure all the named tensors are indeed stored in the checkpoint."
   is_stateful: true
 }
 op {
   name: "Reverse"
   input_arg {
     name: "tensor"
-    description: "Up to 8-D."
     type_attr: "T"
   }
   input_arg {
     name: "dims"
-    description: "1-D. The dimensions to reverse."
     type: DT_BOOL
   }
   output_arg {
     name: "output"
-    description: "The same shape as `tensor`."
     type_attr: "T"
   }
   attr {
@@ -23932,30 +21970,24 @@ op {
       }
     }
   }
-  summary: "Reverses specific dimensions of a tensor."
-  description: "Given a `tensor`, and a `bool` tensor `dims` representing the dimensions\nof `tensor`, this operation reverses each dimension i of `tensor` where\n`dims[i]` is `True`.\n\n`tensor` can have up to 8 dimensions. The number of dimensions\nof `tensor` must equal the number of elements in `dims`. In other words:\n\n`rank(tensor) = size(dims)`\n\nFor example:\n\n```\n# tensor \'t\' is [[[[ 0,  1,  2,  3],\n#                  [ 4,  5,  6,  7],\n#                  [ 8,  9, 10, 11]],\n#                 [[12, 13, 14, 15],\n#                  [16, 17, 18, 19],\n#                  [20, 21, 22, 23]]]]\n# tensor \'t\' shape is [1, 2, 3, 4]\n\n# \'dims\' is [False, False, False, True]\nreverse(t, dims) ==> [[[[ 3,  2,  1,  0],\n                        [ 7,  6,  5,  4],\n                        [ 11, 10, 9, 8]],\n                       [[15, 14, 13, 12],\n                        [19, 18, 17, 16],\n                        [23, 22, 21, 20]]]]\n\n# \'dims\' is [False, True, False, False]\nreverse(t, dims) ==> [[[[12, 13, 14, 15],\n                        [16, 17, 18, 19],\n                        [20, 21, 22, 23]\n                       [[ 0,  1,  2,  3],\n                        [ 4,  5,  6,  7],\n                        [ 8,  9, 10, 11]]]]\n\n# \'dims\' is [False, False, True, False]\nreverse(t, dims) ==> [[[[8, 9, 10, 11],\n                        [4, 5, 6, 7],\n                        [0, 1, 2, 3]]\n                       [[20, 21, 22, 23],\n                        [16, 17, 18, 19],\n                        [12, 13, 14, 15]]]]\n```"
 }
 op {
   name: "ReverseSequence"
   input_arg {
     name: "input"
-    description: "The input to reverse."
     type_attr: "T"
   }
   input_arg {
     name: "seq_lengths"
-    description: "1-D with length `input.dims(batch_dim)` and\n`max(seq_lengths) <= input.dims(seq_dim)`"
     type_attr: "Tlen"
   }
   output_arg {
     name: "output"
-    description: "The partially reversed input. It has the same shape as `input`."
     type_attr: "T"
   }
   attr {
     name: "seq_dim"
     type: "int"
-    description: "The dimension which is partially reversed."
   }
   attr {
     name: "batch_dim"
@@ -23963,7 +21995,6 @@ op {
     default_value {
       i: 0
     }
-    description: "The dimension along which reversal is performed."
   }
   attr {
     name: "T"
@@ -23982,24 +22013,19 @@ op {
       }
     }
   }
-  summary: "Reverses variable length slices."
-  description: "This op first slices `input` along the dimension `batch_dim`, and for each\nslice `i`, reverses the first `seq_lengths[i]` elements along\nthe dimension `seq_dim`.\n\nThe elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,\nand `seq_lengths` must be a vector of length `input.dims[batch_dim]`.\n\nThe output slice `i` along dimension `batch_dim` is then given by input\nslice `i`, with the first `seq_lengths[i]` slices along dimension\n`seq_dim` reversed.\n\nFor example:\n\n```\n# Given this:\nbatch_dim = 0\nseq_dim = 1\ninput.dims = (4, 8, ...)\nseq_lengths = [7, 2, 3, 5]\n\n# then slices of input are reversed on seq_dim, but only up to seq_lengths:\noutput[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]\noutput[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]\noutput[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]\noutput[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]\n\n# while entries past seq_lens are copied through:\noutput[0, 7:, :, ...] = input[0, 7:, :, ...]\noutput[1, 2:, :, ...] = input[1, 2:, :, ...]\noutput[2, 3:, :, ...] = input[2, 3:, :, ...]\noutput[3, 2:, :, ...] = input[3, 2:, :, ...]\n```\n\nIn contrast, if:\n\n```\n# Given this:\nbatch_dim = 2\nseq_dim = 0\ninput.dims = (8, ?, 4, ...)\nseq_lengths = [7, 2, 3, 5]\n\n# then slices of input are reversed on seq_dim, but only up to seq_lengths:\noutput[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]\noutput[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]\noutput[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]\noutput[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]\n\n# while entries past seq_lens are copied through:\noutput[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]\noutput[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]\noutput[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]\noutput[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]\n```"
 }
 op {
   name: "ReverseV2"
   input_arg {
     name: "tensor"
-    description: "Up to 8-D."
     type_attr: "T"
   }
   input_arg {
     name: "axis"
-    description: "1-D. The indices of the dimensions to reverse. Must be in the range\n`[-rank(tensor), rank(tensor))`."
     type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    description: "The same shape as `tensor`."
     type_attr: "T"
   }
   attr {
@@ -24028,6 +22054,7 @@ op {
         type: DT_INT64
         type: DT_BOOL
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -24036,8 +22063,6 @@ op {
       }
     }
   }
-  summary: "Reverses specific dimensions of a tensor."
-  description: "NOTE `tf.reverse` has now changed behavior in preparation for 1.0.\n`tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0.\n\nGiven a `tensor`, and a `int32` tensor `axis` representing the set of\ndimensions of `tensor` to reverse. This operation reverses each dimension\n`i` for which there exists `j` s.t. `axis[j] == i`.\n\n`tensor` can have up to 8 dimensions. The number of dimensions specified\nin `axis` may be 0 or more entries. If an index is specified more than\nonce, a InvalidArgument error is raised.\n\nFor example:\n\n```\n# tensor \'t\' is [[[[ 0,  1,  2,  3],\n#                  [ 4,  5,  6,  7],\n#                  [ 8,  9, 10, 11]],\n#                 [[12, 13, 14, 15],\n#                  [16, 17, 18, 19],\n#                  [20, 21, 22, 23]]]]\n# tensor \'t\' shape is [1, 2, 3, 4]\n\n# \'dims\' is [3] or \'dims\' is [-1]\nreverse(t, dims) ==> [[[[ 3,  2,  1,  0],\n                        [ 7,  6,  5,  4],\n                        [ 11, 10, 9, 8]],\n                       [[15, 14, 13, 12],\n                        [19, 18, 17, 16],\n                        [23, 22, 21, 20]]]]\n\n# \'dims\' is \'[1]\' (or \'dims\' is \'[-3]\')\nreverse(t, dims) ==> [[[[12, 13, 14, 15],\n                        [16, 17, 18, 19],\n                        [20, 21, 22, 23]\n                       [[ 0,  1,  2,  3],\n                        [ 4,  5,  6,  7],\n                        [ 8,  9, 10, 11]]]]\n\n# \'dims\' is \'[2]\' (or \'dims\' is \'[-2]\')\nreverse(t, dims) ==> [[[[8, 9, 10, 11],\n                        [4, 5, 6, 7],\n                        [0, 1, 2, 3]]\n                       [[20, 21, 22, 23],\n                        [16, 17, 18, 19],\n                        [12, 13, 14, 15]]]]\n```"
 }
 op {
   name: "RightShift"
@@ -24069,8 +22094,6 @@ op {
       }
     }
   }
-  summary: "Elementwise computes the bitwise right-shift of `x` and `y`."
-  description: "Performs a logical shift for unsigned integer types, and an arithmetic shift\nfor signed integer types.\n\nIf `y` is negative, or greater than or equal to than the width of `x` in bits\nthe result is implementation defined."
   is_commutative: true
 }
 op {
@@ -24088,13 +22111,55 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Returns element-wise integer closest to x."
-  description: "If the result is midway between two representable values,\nthe even representable is chosen.\nFor example:\n\n```\nrint(-1.5) ==> -2.0\nrint(0.5000001) ==> 1.0\nrint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]\n```"
+}
+op {
+  name: "Roll"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "shift"
+    type_attr: "Tshift"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Taxis"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tshift"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Taxis"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
   name: "Round"
@@ -24112,6 +22177,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -24121,8 +22187,6 @@ op {
       }
     }
   }
-  summary: "Rounds the values of a tensor to the nearest integer, element-wise."
-  description: "Rounds half to even.  Also known as bankers rounding. If you want to round\naccording to the current system rounding mode use std::cint."
 }
 op {
   name: "Rsqrt"
@@ -24140,6 +22204,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -24147,8 +22212,6 @@ op {
       }
     }
   }
-  summary: "Computes reciprocal of square root of x element-wise."
-  description: "I.e., \\\\(y = 1 / \\sqrt{x}\\\\)."
 }
 op {
   name: "RsqrtGrad"
@@ -24170,6 +22233,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -24177,34 +22241,27 @@ op {
       }
     }
   }
-  summary: "Computes the gradient for the rsqrt of `x` wrt its input."
-  description: "Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`\nis the corresponding input gradient."
 }
 op {
   name: "SampleDistortedBoundingBox"
   input_arg {
     name: "image_size"
-    description: "1-D, containing `[height, width, channels]`."
     type_attr: "T"
   }
   input_arg {
     name: "bounding_boxes"
-    description: "3-D with shape `[batch, N, 4]` describing the N bounding boxes\nassociated with the image."
     type: DT_FLOAT
   }
   output_arg {
     name: "begin"
-    description: "1-D, containing `[offset_height, offset_width, 0]`. Provide as input to\n`tf.slice`."
     type_attr: "T"
   }
   output_arg {
     name: "size"
-    description: "1-D, containing `[target_height, target_width, -1]`. Provide as input to\n`tf.slice`."
     type_attr: "T"
   }
   output_arg {
     name: "bboxes"
-    description: "3-D with shape `[1, 1, 4]` containing the distorted bounding box.\nProvide as input to `tf.image.draw_bounding_boxes`."
     type: DT_FLOAT
   }
   attr {
@@ -24226,7 +22283,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either `seed` or `seed2` are set to non-zero, the random number\ngenerator is seeded by the given `seed`.  Otherwise, it is seeded by a random\nseed."
   }
   attr {
     name: "seed2"
@@ -24234,7 +22290,6 @@ op {
     default_value {
       i: 0
     }
-    description: "A second seed to avoid seed collision."
   }
   attr {
     name: "min_object_covered"
@@ -24242,7 +22297,6 @@ op {
     default_value {
       f: 0.1
     }
-    description: "The cropped area of the image must contain at least this\nfraction of any bounding box supplied. The value of this parameter should be\nnon-negative. In the case of 0, the cropped area does not need to overlap\nany of the bounding boxes supplied."
   }
   attr {
     name: "aspect_ratio_range"
@@ -24253,7 +22307,6 @@ op {
         f: 1.33
       }
     }
-    description: "The cropped area of the image must have an aspect ratio =\nwidth / height within this range."
   }
   attr {
     name: "area_range"
@@ -24264,7 +22317,6 @@ op {
         f: 1
       }
     }
-    description: "The cropped area of the image must contain a fraction of the\nsupplied image within in this range."
   }
   attr {
     name: "max_attempts"
@@ -24272,7 +22324,6 @@ op {
     default_value {
       i: 100
     }
-    description: "Number of attempts at generating a cropped region of the image\nof the specified constraints. After `max_attempts` failures, return the entire\nimage."
   }
   attr {
     name: "use_image_if_no_bounding_boxes"
@@ -24280,42 +22331,33 @@ op {
     default_value {
       b: false
     }
-    description: "Controls behavior if no bounding boxes supplied.\nIf true, assume an implicit bounding box covering the whole input. If false,\nraise an error."
   }
-  summary: "Generate a single randomly distorted bounding box for an image."
-  description: "Bounding box annotations are often supplied in addition to ground-truth labels\nin image recognition or object localization tasks. A common technique for\ntraining such a system is to randomly distort an image while preserving\nits content, i.e. *data augmentation*. This Op outputs a randomly distorted\nlocalization of an object, i.e. bounding box, given an `image_size`,\n`bounding_boxes` and a series of constraints.\n\nThe output of this Op is a single bounding box that may be used to crop the\noriginal image. The output is returned as 3 tensors: `begin`, `size` and\n`bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the\nimage. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize\nwhat the bounding box looks like.\n\nBounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The\nbounding box coordinates are floats in `[0.0, 1.0]` relative to the width and\nheight of the underlying image.\n\nFor example,\n\n```python\n    # Generate a single distorted bounding box.\n    begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(\n        tf.shape(image),\n        bounding_boxes=bounding_boxes)\n\n    # Draw the bounding box in an image summary.\n    image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),\n                                                  bbox_for_draw)\n    tf.image_summary(\'images_with_box\', image_with_box)\n\n    # Employ the bounding box to distort the image.\n    distorted_image = tf.slice(image, begin, size)\n```\n\nNote that if no bounding box information is available, setting\n`use_image_if_no_bounding_boxes = true` will assume there is a single implicit\nbounding box covering the whole image. If `use_image_if_no_bounding_boxes` is\nfalse and no bounding boxes are supplied, an error is raised."
   is_stateful: true
 }
 op {
   name: "SampleDistortedBoundingBoxV2"
   input_arg {
     name: "image_size"
-    description: "1-D, containing `[height, width, channels]`."
     type_attr: "T"
   }
   input_arg {
     name: "bounding_boxes"
-    description: "3-D with shape `[batch, N, 4]` describing the N bounding boxes\nassociated with the image."
     type: DT_FLOAT
   }
   input_arg {
     name: "min_object_covered"
-    description: "The cropped area of the image must contain at least this\nfraction of any bounding box supplied. The value of this parameter should be\nnon-negative. In the case of 0, the cropped area does not need to overlap\nany of the bounding boxes supplied."
     type: DT_FLOAT
   }
   output_arg {
     name: "begin"
-    description: "1-D, containing `[offset_height, offset_width, 0]`. Provide as input to\n`tf.slice`."
     type_attr: "T"
   }
   output_arg {
     name: "size"
-    description: "1-D, containing `[target_height, target_width, -1]`. Provide as input to\n`tf.slice`."
     type_attr: "T"
   }
   output_arg {
     name: "bboxes"
-    description: "3-D with shape `[1, 1, 4]` containing the distorted bounding box.\nProvide as input to `tf.image.draw_bounding_boxes`."
     type: DT_FLOAT
   }
   attr {
@@ -24337,7 +22379,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either `seed` or `seed2` are set to non-zero, the random number\ngenerator is seeded by the given `seed`.  Otherwise, it is seeded by a random\nseed."
   }
   attr {
     name: "seed2"
@@ -24345,7 +22386,6 @@ op {
     default_value {
       i: 0
     }
-    description: "A second seed to avoid seed collision."
   }
   attr {
     name: "aspect_ratio_range"
@@ -24356,7 +22396,6 @@ op {
         f: 1.33
       }
     }
-    description: "The cropped area of the image must have an aspect ratio =\nwidth / height within this range."
   }
   attr {
     name: "area_range"
@@ -24367,7 +22406,6 @@ op {
         f: 1
       }
     }
-    description: "The cropped area of the image must contain a fraction of the\nsupplied image within in this range."
   }
   attr {
     name: "max_attempts"
@@ -24375,7 +22413,6 @@ op {
     default_value {
       i: 100
     }
-    description: "Number of attempts at generating a cropped region of the image\nof the specified constraints. After `max_attempts` failures, return the entire\nimage."
   }
   attr {
     name: "use_image_if_no_bounding_boxes"
@@ -24383,27 +22420,21 @@ op {
     default_value {
       b: false
     }
-    description: "Controls behavior if no bounding boxes supplied.\nIf true, assume an implicit bounding box covering the whole input. If false,\nraise an error."
   }
-  summary: "Generate a single randomly distorted bounding box for an image."
-  description: "Bounding box annotations are often supplied in addition to ground-truth labels\nin image recognition or object localization tasks. A common technique for\ntraining such a system is to randomly distort an image while preserving\nits content, i.e. *data augmentation*. This Op outputs a randomly distorted\nlocalization of an object, i.e. bounding box, given an `image_size`,\n`bounding_boxes` and a series of constraints.\n\nThe output of this Op is a single bounding box that may be used to crop the\noriginal image. The output is returned as 3 tensors: `begin`, `size` and\n`bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the\nimage. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize\nwhat the bounding box looks like.\n\nBounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The\nbounding box coordinates are floats in `[0.0, 1.0]` relative to the width and\nheight of the underlying image.\n\nFor example,\n\n```python\n    # Generate a single distorted bounding box.\n    begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(\n        tf.shape(image),\n        bounding_boxes=bounding_boxes)\n\n    # Draw the bounding box in an image summary.\n    image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),\n                                                  bbox_for_draw)\n    tf.image_summary(\'images_with_box\', image_with_box)\n\n    # Employ the bounding box to distort the image.\n    distorted_image = tf.slice(image, begin, size)\n```\n\nNote that if no bounding box information is available, setting\n`use_image_if_no_bounding_boxes = true` will assume there is a single implicit\nbounding box covering the whole image. If `use_image_if_no_bounding_boxes` is\nfalse and no bounding boxes are supplied, an error is raised."
   is_stateful: true
 }
 op {
   name: "Save"
   input_arg {
     name: "filename"
-    description: "Must have a single element. The name of the file to which we write\nthe tensor."
     type: DT_STRING
   }
   input_arg {
     name: "tensor_names"
-    description: "Shape `[N]`. The names of the tensors to be saved."
     type: DT_STRING
   }
   input_arg {
     name: "data"
-    description: "`N` tensors to save."
     type_list_attr: "T"
   }
   attr {
@@ -24412,30 +22443,24 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Saves the input tensors to disk."
-  description: "The size of `tensor_names` must match the number of tensors in `data`. `data[i]`\nis written to `filename` with name `tensor_names[i]`.\n\nSee also `SaveSlices`."
   is_stateful: true
 }
 op {
   name: "SaveSlices"
   input_arg {
     name: "filename"
-    description: "Must have a single element. The name of the file to which we write the\ntensor."
     type: DT_STRING
   }
   input_arg {
     name: "tensor_names"
-    description: "Shape `[N]`. The names of the tensors to be saved."
     type: DT_STRING
   }
   input_arg {
     name: "shapes_and_slices"
-    description: "Shape `[N]`.  The shapes and slice specifications to use when\nsaving the tensors."
     type: DT_STRING
   }
   input_arg {
     name: "data"
-    description: "`N` tensors to save."
     type_list_attr: "T"
   }
   attr {
@@ -24444,30 +22469,24 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Saves input tensors slices to disk."
-  description: "This is like `Save` except that tensors can be listed in the saved file as being\na slice of a larger tensor.  `shapes_and_slices` specifies the shape of the\nlarger tensor and the slice that this tensor covers. `shapes_and_slices` must\nhave as many elements as `tensor_names`.\n\nElements of the `shapes_and_slices` input must either be:\n\n*  The empty string, in which case the corresponding tensor is\n   saved normally.\n*  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the\n   `dimI` are the dimensions of the larger tensor and `slice-spec`\n   specifies what part is covered by the tensor to save.\n\n`slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`\nwhere each `sliceI` is either:\n\n*  The string `-` meaning that the slice covers all indices of this dimension\n*  `start,length` where `start` and `length` are integers.  In that\n   case the slice covers `length` indices starting at `start`.\n\nSee also `Save`."
   is_stateful: true
 }
 op {
   name: "SaveV2"
   input_arg {
     name: "prefix"
-    description: "Must have a single element. The prefix of the V2 checkpoint to which we\nwrite the tensors."
     type: DT_STRING
   }
   input_arg {
     name: "tensor_names"
-    description: "shape {N}. The names of the tensors to be saved."
     type: DT_STRING
   }
   input_arg {
     name: "shape_and_slices"
-    description: "shape {N}.  The slice specs of the tensors to be saved.\nEmpty strings indicate that they are non-partitioned tensors."
     type: DT_STRING
   }
   input_arg {
     name: "tensors"
-    description: "`N` tensors to save."
     type_list_attr: "dtypes"
   }
   attr {
@@ -24476,25 +22495,20 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Saves tensors in V2 checkpoint format."
-  description: "By default, saves the named tensors in full.  If the caller wishes to save\nspecific slices of full tensors, \"shape_and_slices\" should be non-empty strings\nand correspondingly well-formed."
   is_stateful: true
 }
 op {
   name: "ScalarSummary"
   input_arg {
     name: "tags"
-    description: "Tags for the summary."
     type: DT_STRING
   }
   input_arg {
     name: "values"
-    description: "Same shape as `tags.  Values for the summary."
     type_attr: "T"
   }
   output_arg {
     name: "summary"
-    description: "Scalar.  Serialized `Summary` protocol buffer."
     type: DT_STRING
   }
   attr {
@@ -24505,10 +22519,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -24516,8 +22531,6 @@ op {
       }
     }
   }
-  summary: "Outputs a `Summary` protocol buffer with scalar values."
-  description: "The input `tags` and `values` must have the same shape.  The generated summary\nhas a summary value for each tag-value pair in `tags` and `values`."
 }
 op {
   name: "ScanDataset"
@@ -24564,29 +22577,24 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset successively reduces `f` over the elements of `input_dataset`."
 }
 op {
   name: "ScatterAdd"
   input_arg {
     name: "ref"
-    description: "Should be from a `Variable` node."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "indices"
-    description: "A tensor of indices into the first dimension of `ref`."
     type_attr: "Tindices"
   }
   input_arg {
     name: "updates"
-    description: "A tensor of updated values to add to `ref`."
     type_attr: "T"
   }
   output_arg {
     name: "output_ref"
-    description: "= Same as `ref`.  Returned as a convenience for operations that want\nto use the updated values after the update is done."
     type_attr: "T"
     is_ref: true
   }
@@ -24597,17 +22605,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -24630,32 +22639,25 @@ op {
     default_value {
       b: false
     }
-    description: "If True, the addition will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Adds sparse updates to a variable reference."
-  description: "This operation computes\n\n    # Scalar indices\n    ref[indices, ...] += updates[...]\n\n    # Vector indices (for each i)\n    ref[indices[i], ...] += updates[i, ...]\n\n    # High rank indices (for each i, ..., j)\n    ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]\n\nThis operation outputs `ref` after the update is done.\nThis makes it easier to chain operations that need to use the reset value.\n\nDuplicate entries are handled correctly: if multiple `indices` reference\nthe same location, their contributions add.\n\nRequires `updates.shape = indices.shape + ref.shape[1:]`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/ScatterAdd.png\" alt>\n</div>"
 }
 op {
   name: "ScatterDiv"
   input_arg {
     name: "ref"
-    description: "Should be from a `Variable` node."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "indices"
-    description: "A tensor of indices into the first dimension of `ref`."
     type_attr: "Tindices"
   }
   input_arg {
     name: "updates"
-    description: "A tensor of values that `ref` is divided by."
     type_attr: "T"
   }
   output_arg {
     name: "output_ref"
-    description: "= Same as `ref`.  Returned as a convenience for operations that want\nto use the updated values after the update is done."
     type_attr: "T"
     is_ref: true
   }
@@ -24666,17 +22668,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -24699,32 +22702,25 @@ op {
     default_value {
       b: false
     }
-    description: "If True, the operation will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Divides a variable reference by sparse updates."
-  description: "This operation computes\n\n```python\n    # Scalar indices\n    ref[indices, ...] /= updates[...]\n\n    # Vector indices (for each i)\n    ref[indices[i], ...] /= updates[i, ...]\n\n    # High rank indices (for each i, ..., j)\n    ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...]\n```\n\nThis operation outputs `ref` after the update is done.\nThis makes it easier to chain operations that need to use the reset value.\n\nDuplicate entries are handled correctly: if multiple `indices` reference\nthe same location, their contributions divide.\n\nRequires `updates.shape = indices.shape + ref.shape[1:]`."
 }
 op {
   name: "ScatterMul"
   input_arg {
     name: "ref"
-    description: "Should be from a `Variable` node."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "indices"
-    description: "A tensor of indices into the first dimension of `ref`."
     type_attr: "Tindices"
   }
   input_arg {
     name: "updates"
-    description: "A tensor of updated values to multiply to `ref`."
     type_attr: "T"
   }
   output_arg {
     name: "output_ref"
-    description: "= Same as `ref`.  Returned as a convenience for operations that want\nto use the updated values after the update is done."
     type_attr: "T"
     is_ref: true
   }
@@ -24735,17 +22731,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -24768,31 +22765,24 @@ op {
     default_value {
       b: false
     }
-    description: "If True, the operation will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Multiplies sparse updates into a variable reference."
-  description: "This operation computes\n\n```python\n    # Scalar indices\n    ref[indices, ...] *= updates[...]\n\n    # Vector indices (for each i)\n    ref[indices[i], ...] *= updates[i, ...]\n\n    # High rank indices (for each i, ..., j)\n    ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]\n```\n\nThis operation outputs `ref` after the update is done.\nThis makes it easier to chain operations that need to use the reset value.\n\nDuplicate entries are handled correctly: if multiple `indices` reference\nthe same location, their contributions multiply.\n\nRequires `updates.shape = indices.shape + ref.shape[1:]`."
 }
 op {
   name: "ScatterNd"
   input_arg {
     name: "indices"
-    description: "Index tensor."
     type_attr: "Tindices"
   }
   input_arg {
     name: "updates"
-    description: "Updates to scatter into output."
     type_attr: "T"
   }
   input_arg {
     name: "shape"
-    description: "1-D. The shape of the resulting tensor."
     type_attr: "Tindices"
   }
   output_arg {
     name: "output"
-    description: "A new tensor with the given shape and updates applied according\nto the indices."
     type_attr: "T"
   }
   attr {
@@ -24809,30 +22799,24 @@ op {
       }
     }
   }
-  summary: "Scatter `updates` into a new (initially zero) tensor according to `indices`."
-  description: "Creates a new tensor by applying sparse `updates` to individual\nvalues or slices within a zero tensor of the given `shape` according to\nindices.  This operator is the inverse of the @{tf.gather_nd} operator which\nextracts values or slices from a given tensor.\n\n**WARNING**: The order in which updates are applied is nondeterministic, so the\noutput will be nondeterministic if `indices` contains duplicates.\n\n`indices` is an integer tensor containing indices into a new tensor of shape\n`shape`.  The last dimension of `indices` can be at most the rank of `shape`:\n\n    indices.shape[-1] <= shape.rank\n\nThe last dimension of `indices` corresponds to indices into elements\n(if `indices.shape[-1] = shape.rank`) or slices\n(if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of\n`shape`.  `updates` is a tensor with shape\n\n    indices.shape[:-1] + shape[indices.shape[-1]:]\n\nThe simplest form of scatter is to insert individual elements in a tensor by\nindex. For example, say we want to insert 4 scattered elements in a rank-1\ntensor with 8 elements.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/ScatterNd1.png\" alt>\n</div>\n\nIn Python, this scatter operation would look like this:\n\n```python\n    indices = tf.constant([[4], [3], [1], [7]])\n    updates = tf.constant([9, 10, 11, 12])\n    shape = tf.constant([8])\n    scatter = tf.scatter_nd(indices, updates, shape)\n    with tf.Session() as sess:\n      print(sess.run(scatter))\n```\n\nThe resulting tensor would look like this:\n\n    [0, 11, 0, 10, 9, 0, 0, 12]\n\nWe can also, insert entire slices of a higher rank tensor all at once. For\nexample, if we wanted to insert two slices in the first dimension of a\nrank-3 tensor with two matrices of new values.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/ScatterNd2.png\" alt>\n</div>\n\nIn Python, this scatter operation would look like this:\n\n```python\n    indices = tf.constant([[0], [2]])\n    updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],\n                            [7, 7, 7, 7], [8, 8, 8, 8]],\n                           [[5, 5, 5, 5], [6, 6, 6, 6],\n                            [7, 7, 7, 7], [8, 8, 8, 8]]])\n    shape = tf.constant([4, 4, 4])\n    scatter = tf.scatter_nd(indices, updates, shape)\n    with tf.Session() as sess:\n      print(sess.run(scatter))\n```\n\nThe resulting tensor would look like this:\n\n    [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],\n     [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],\n     [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],\n     [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]"
 }
 op {
   name: "ScatterNdAdd"
   input_arg {
     name: "ref"
-    description: "A mutable Tensor. Should be from a Variable node."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "indices"
-    description: "A Tensor. Must be one of the following types: int32, int64.\nA tensor of indices into ref."
     type_attr: "Tindices"
   }
   input_arg {
     name: "updates"
-    description: "A Tensor. Must have the same type as ref. A tensor of updated values\nto add to ref."
     type_attr: "T"
   }
   output_arg {
     name: "output_ref"
-    description: "Same as ref. Returned as a convenience for operations that want\nto use the updated values after the update is done."
     type_attr: "T"
     is_ref: true
   }
@@ -24843,17 +22827,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -24876,31 +22861,24 @@ op {
     default_value {
       b: false
     }
-    description: "An optional bool. Defaults to True. If True, the assignment will\nbe protected by a lock; otherwise the behavior is undefined,\nbut may exhibit less contention."
   }
-  summary: "Applies sparse addition between `updates` and individual values or slices"
-  description: "within a given variable according to `indices`.\n\n`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.\n\n`indices` must be integer tensor, containing indices into `ref`.\nIt must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.\n\nThe innermost dimension of `indices` (with length `K`) corresponds to\nindices into elements (if `K = P`) or slices (if `K < P`) along the `K`th\ndimension of `ref`.\n\n`updates` is `Tensor` of rank `Q-1+P-K` with shape:\n\n```\n[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].\n```\n\nFor example, say we want to add 4 scattered elements to a rank-1 tensor to 8\nelements. In Python, that addition would look like this:\n\n    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])\n    indices = tf.constant([[4], [3], [1], [7]])\n    updates = tf.constant([9, 10, 11, 12])\n    add = tf.scatter_nd_add(ref, indices, updates)\n    with tf.Session() as sess:\n      print sess.run(add)\n\nThe resulting update to ref would look like this:\n\n    [1, 13, 3, 14, 14, 6, 7, 20]\n\nSee @{tf.scatter_nd} for more details about how to make updates to\nslices."
 }
 op {
   name: "ScatterNdNonAliasingAdd"
   input_arg {
     name: "input"
-    description: "A Tensor."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "A Tensor. Must be one of the following types: `int32`, `int64`.\nA tensor of indices into `input`."
     type_attr: "Tindices"
   }
   input_arg {
     name: "updates"
-    description: "A Tensor. Must have the same type as ref. A tensor of updated values\nto add to `input`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "A `Tensor` with the same shape as `input`, containing values of `input`\nupdated with `updates`."
     type_attr: "T"
   }
   attr {
@@ -24910,17 +22888,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -24937,30 +22916,24 @@ op {
       }
     }
   }
-  summary: "Applies sparse addition to `input` using individual values or slices"
-  description: "from `updates` according to indices `indices`.  The updates are non-aliasing:\n`input` is only modified in-place if no other operations will use it.\nOtherwise, a copy of `input` is made.  This operation has a gradient with\nrespect to both `input` and `updates`.\n\n`input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.\n\n`indices` must be integer tensor, containing indices into `input`.\nIt must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.\n\nThe innermost dimension of `indices` (with length `K`) corresponds to\nindices into elements (if `K = P`) or `(P-K)`-dimensional slices\n(if `K < P`) along the `K`th dimension of `input`.\n\n`updates` is `Tensor` of rank `Q-1+P-K` with shape:\n\n```\n[d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].\n```\n\nFor example, say we want to add 4 scattered elements to a rank-1 tensor to 8\nelements. In Python, that addition would look like this:\n\n    input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])\n    indices = tf.constant([[4], [3], [1], [7]])\n    updates = tf.constant([9, 10, 11, 12])\n    output = tf.scatter_nd_non_aliasing_add(input, indices, updates)\n    with tf.Session() as sess:\n      print(sess.run(output))\n\nThe resulting value `output` would look like this:\n\n    [1, 13, 3, 14, 14, 6, 7, 20]\n\nSee @{tf.scatter_nd} for more details about how to make updates to slices."
 }
 op {
   name: "ScatterNdSub"
   input_arg {
     name: "ref"
-    description: "A mutable Tensor. Should be from a Variable node."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "indices"
-    description: "A Tensor. Must be one of the following types: int32, int64.\nA tensor of indices into ref."
     type_attr: "Tindices"
   }
   input_arg {
     name: "updates"
-    description: "A Tensor. Must have the same type as ref. A tensor of updated values\nto subtract from ref."
     type_attr: "T"
   }
   output_arg {
     name: "output_ref"
-    description: "Same as ref. Returned as a convenience for operations that want\nto use the updated values after the update is done."
     type_attr: "T"
     is_ref: true
   }
@@ -24971,17 +22944,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -25004,32 +22978,25 @@ op {
     default_value {
       b: false
     }
-    description: "An optional bool. Defaults to True. If True, the assignment will\nbe protected by a lock; otherwise the behavior is undefined,\nbut may exhibit less contention."
   }
-  summary: "Applies sparse subtraction between `updates` and individual values or slices"
-  description: "within a given variable according to `indices`.\n\n`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.\n\n`indices` must be integer tensor, containing indices into `ref`.\nIt must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.\n\nThe innermost dimension of `indices` (with length `K`) corresponds to\nindices into elements (if `K = P`) or slices (if `K < P`) along the `K`th\ndimension of `ref`.\n\n`updates` is `Tensor` of rank `Q-1+P-K` with shape:\n\n```\n[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].\n```\n\nFor example, say we want to subtract 4 scattered elements from a rank-1 tensor\nwith 8 elements. In Python, that subtraction would look like this:\n\n    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])\n    indices = tf.constant([[4], [3], [1], [7]])\n    updates = tf.constant([9, 10, 11, 12])\n    sub = tf.scatter_nd_sub(ref, indices, updates)\n    with tf.Session() as sess:\n      print sess.run(sub)\n\nThe resulting update to ref would look like this:\n\n    [1, -9, 3, -6, -4, 6, 7, -4]\n\nSee @{tf.scatter_nd} for more details about how to make updates to\nslices."
 }
 op {
   name: "ScatterNdUpdate"
   input_arg {
     name: "ref"
-    description: "A mutable Tensor. Should be from a Variable node."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "indices"
-    description: "A Tensor. Must be one of the following types: int32, int64.\nA tensor of indices into ref."
     type_attr: "Tindices"
   }
   input_arg {
     name: "updates"
-    description: "A Tensor. Must have the same type as ref. A tensor of updated\nvalues to add to ref."
     type_attr: "T"
   }
   output_arg {
     name: "output_ref"
-    description: "Same as ref. Returned as a convenience for operations that want to\nuse the updated values after the update is done."
     type_attr: "T"
     is_ref: true
   }
@@ -25053,32 +23020,25 @@ op {
     default_value {
       b: true
     }
-    description: "An optional bool. Defaults to True. If True, the assignment will\nbe protected by a lock; otherwise the behavior is undefined,\nbut may exhibit less contention."
   }
-  summary: "Applies sparse `updates` to individual values or slices within a given"
-  description: "variable according to `indices`.\n\n`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.\n\n`indices` must be integer tensor, containing indices into `ref`.\nIt must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.\n\nThe innermost dimension of `indices` (with length `K`) corresponds to\nindices into elements (if `K = P`) or slices (if `K < P`) along the `K`th\ndimension of `ref`.\n\n`updates` is `Tensor` of rank `Q-1+P-K` with shape:\n\n```\n[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].\n```\n\nFor example, say we want to update 4 scattered elements to a rank-1 tensor to\n8 elements. In Python, that update would look like this:\n\n```python\n    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])\n    indices = tf.constant([[4], [3], [1] ,[7]])\n    updates = tf.constant([9, 10, 11, 12])\n    update = tf.scatter_nd_update(ref, indices, updates)\n    with tf.Session() as sess:\n      print sess.run(update)\n```\n\nThe resulting update to ref would look like this:\n\n    [1, 11, 3, 10, 9, 6, 7, 12]\n\nSee @{tf.scatter_nd} for more details about how to make updates to\nslices."
 }
 op {
   name: "ScatterSub"
   input_arg {
     name: "ref"
-    description: "Should be from a `Variable` node."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "indices"
-    description: "A tensor of indices into the first dimension of `ref`."
     type_attr: "Tindices"
   }
   input_arg {
     name: "updates"
-    description: "A tensor of updated values to subtract from `ref`."
     type_attr: "T"
   }
   output_arg {
     name: "output_ref"
-    description: "= Same as `ref`.  Returned as a convenience for operations that want\nto use the updated values after the update is done."
     type_attr: "T"
     is_ref: true
   }
@@ -25089,17 +23049,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -25122,32 +23083,25 @@ op {
     default_value {
       b: false
     }
-    description: "If True, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Subtracts sparse updates to a variable reference."
-  description: "```python\n    # Scalar indices\n    ref[indices, ...] -= updates[...]\n\n    # Vector indices (for each i)\n    ref[indices[i], ...] -= updates[i, ...]\n\n    # High rank indices (for each i, ..., j)\n    ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]\n```\n\nThis operation outputs `ref` after the update is done.\nThis makes it easier to chain operations that need to use the reset value.\n\nDuplicate entries are handled correctly: if multiple `indices` reference\nthe same location, their (negated) contributions add.\n\nRequires `updates.shape = indices.shape + ref.shape[1:]`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/ScatterSub.png\" alt>\n</div>"
 }
 op {
   name: "ScatterUpdate"
   input_arg {
     name: "ref"
-    description: "Should be from a `Variable` node."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "indices"
-    description: "A tensor of indices into the first dimension of `ref`."
     type_attr: "Tindices"
   }
   input_arg {
     name: "updates"
-    description: "A tensor of updated values to store in `ref`."
     type_attr: "T"
   }
   output_arg {
     name: "output_ref"
-    description: "= Same as `ref`.  Returned as a convenience for operations that want\nto use the updated values after the update is done."
     type_attr: "T"
     is_ref: true
   }
@@ -25171,105 +23125,85 @@ op {
     default_value {
       b: true
     }
-    description: "If True, the assignment will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Applies sparse updates to a variable reference."
-  description: "This operation computes\n\n```python\n    # Scalar indices\n    ref[indices, ...] = updates[...]\n\n    # Vector indices (for each i)\n    ref[indices[i], ...] = updates[i, ...]\n\n    # High rank indices (for each i, ..., j)\n    ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]\n```\n\nThis operation outputs `ref` after the update is done.\nThis makes it easier to chain operations that need to use the reset value.\n\nIf values in `ref` is to be updated more than once, because there are\nduplicate entries in `indices`, the order at which the updates happen\nfor each value is undefined.\n\nRequires `updates.shape = indices.shape + ref.shape[1:]`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/ScatterUpdate.png\" alt>\n</div>"
 }
 op {
   name: "SdcaFprint"
   input_arg {
     name: "input"
-    description: "vector of strings to compute fingerprints on."
     type: DT_STRING
   }
   output_arg {
     name: "output"
-    description: "a (N,2) shaped matrix where N is the number of elements in the input\nvector. Each row contains the low and high parts of the fingerprint."
     type: DT_INT64
   }
-  summary: "Computes fingerprints of the input strings."
 }
 op {
   name: "SdcaOptimizer"
   input_arg {
     name: "sparse_example_indices"
-    description: "a list of vectors which contain example indices."
     type: DT_INT64
     number_attr: "num_sparse_features"
   }
   input_arg {
     name: "sparse_feature_indices"
-    description: "a list of vectors which contain feature indices."
     type: DT_INT64
     number_attr: "num_sparse_features"
   }
   input_arg {
     name: "sparse_feature_values"
-    description: "a list of vectors which contains feature value\nassociated with each feature group."
     type: DT_FLOAT
     number_attr: "num_sparse_features_with_values"
   }
   input_arg {
     name: "dense_features"
-    description: "a list of matrices which contains the dense feature values."
     type: DT_FLOAT
     number_attr: "num_dense_features"
   }
   input_arg {
     name: "example_weights"
-    description: "a vector which contains the weight associated with each\nexample."
     type: DT_FLOAT
   }
   input_arg {
     name: "example_labels"
-    description: "a vector which contains the label/target associated with each\nexample."
     type: DT_FLOAT
   }
   input_arg {
     name: "sparse_indices"
-    description: "a list of vectors where each value is the indices which has\ncorresponding weights in sparse_weights. This field maybe omitted for the\ndense approach."
     type: DT_INT64
     number_attr: "num_sparse_features"
   }
   input_arg {
     name: "sparse_weights"
-    description: "a list of vectors where each value is the weight associated with\na sparse feature group."
     type: DT_FLOAT
     number_attr: "num_sparse_features"
   }
   input_arg {
     name: "dense_weights"
-    description: "a list of vectors where the values are the weights associated\nwith a dense feature group."
     type: DT_FLOAT
     number_attr: "num_dense_features"
   }
   input_arg {
     name: "example_state_data"
-    description: "a list of vectors containing the example state data."
     type: DT_FLOAT
   }
   output_arg {
     name: "out_example_state_data"
-    description: "a list of vectors containing the updated example state\ndata."
     type: DT_FLOAT
   }
   output_arg {
     name: "out_delta_sparse_weights"
-    description: "a list of vectors where each value is the delta\nweights associated with a sparse feature group."
     type: DT_FLOAT
     number_attr: "num_sparse_features"
   }
   output_arg {
     name: "out_delta_dense_weights"
-    description: "a list of vectors where the values are the delta\nweights associated with a dense feature group."
     type: DT_FLOAT
     number_attr: "num_dense_features"
   }
   attr {
     name: "loss_type"
     type: "string"
-    description: "Type of the primal loss. Currently SdcaSolver supports logistic,\nsquared and hinge losses."
     allowed_values {
       list {
         s: "logistic_loss"
@@ -25285,58 +23219,47 @@ op {
     default_value {
       b: false
     }
-    description: "Whether to use Adapative SDCA for the inner loop."
   }
   attr {
     name: "num_sparse_features"
     type: "int"
-    description: "Number of sparse feature groups to train on."
     has_minimum: true
   }
   attr {
     name: "num_sparse_features_with_values"
     type: "int"
-    description: "Number of sparse feature groups with values\nassociated with it, otherwise implicitly treats values as 1.0."
     has_minimum: true
   }
   attr {
     name: "num_dense_features"
     type: "int"
-    description: "Number of dense feature groups to train on."
     has_minimum: true
   }
   attr {
     name: "l1"
     type: "float"
-    description: "Symmetric l1 regularization strength."
   }
   attr {
     name: "l2"
     type: "float"
-    description: "Symmetric l2 regularization strength."
   }
   attr {
     name: "num_loss_partitions"
     type: "int"
-    description: "Number of partitions of the global loss function."
     has_minimum: true
     minimum: 1
   }
   attr {
     name: "num_inner_iterations"
     type: "int"
-    description: "Number of iterations per mini-batch."
     has_minimum: true
     minimum: 1
   }
-  summary: "Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for"
-  description: "linear models with L1 + L2 regularization. As global optimization objective is\nstrongly-convex, the optimizer optimizes the dual objective at each step. The\noptimizer applies each update one example at a time. Examples are sampled\nuniformly, and the optimizer is learning rate free and enjoys linear convergence\nrate.\n\n[Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>\nShai Shalev-Shwartz, Tong Zhang. 2012\n\n$$Loss Objective = \\sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$\n\n[Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>\nChenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,\nPeter Richtarik, Martin Takac. 2015\n\n[Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>\nDominik Csiba, Zheng Qu, Peter Richtarik. 2015"
 }
 op {
   name: "SdcaShrinkL1"
   input_arg {
     name: "weights"
-    description: "a list of vectors where each value is the weight associated with a\nfeature group."
     type: DT_FLOAT
     number_attr: "num_features"
     is_ref: true
@@ -25344,20 +23267,16 @@ op {
   attr {
     name: "num_features"
     type: "int"
-    description: "Number of feature groups to apply shrinking step."
     has_minimum: true
   }
   attr {
     name: "l1"
     type: "float"
-    description: "Symmetric l1 regularization strength."
   }
   attr {
     name: "l2"
     type: "float"
-    description: "Symmetric l2 regularization strength. Should be a positive float."
   }
-  summary: "Applies L1 regularization shrink step on the parameters."
 }
 op {
   name: "SegmentMax"
@@ -25367,12 +23286,10 @@ op {
   }
   input_arg {
     name: "segment_ids"
-    description: "A 1-D tensor whose rank is equal to the rank of `data`\'s\nfirst dimension.  Values should be sorted and can be repeated."
     type_attr: "Tindices"
   }
   output_arg {
     name: "output"
-    description: "Has same shape as data, except for dimension 0 which\nhas size `k`, the number of segments."
     type_attr: "T"
   }
   attr {
@@ -25383,10 +23300,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -25404,8 +23322,6 @@ op {
       }
     }
   }
-  summary: "Computes the maximum along segments of a tensor."
-  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nComputes a tensor such that\n\\\\(output_i = \\max_j(data_j)\\\\) where `max` is over `j` such\nthat `segment_ids[j] == i`.\n\nIf the max is empty for a given segment ID `i`, `output[i] = 0`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/SegmentMax.png\" alt>\n</div>"
 }
 op {
   name: "SegmentMean"
@@ -25415,12 +23331,10 @@ op {
   }
   input_arg {
     name: "segment_ids"
-    description: "A 1-D tensor whose rank is equal to the rank of `data`\'s\nfirst dimension.  Values should be sorted and can be repeated."
     type_attr: "Tindices"
   }
   output_arg {
     name: "output"
-    description: "Has same shape as data, except for dimension 0 which\nhas size `k`, the number of segments."
     type_attr: "T"
   }
   attr {
@@ -25431,10 +23345,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -25452,8 +23367,6 @@ op {
       }
     }
   }
-  summary: "Computes the mean along segments of a tensor."
-  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nComputes a tensor such that\n\\\\(output_i = \\frac{\\sum_j data_j}{N}\\\\) where `mean` is\nover `j` such that `segment_ids[j] == i` and `N` is the total number of\nvalues summed.\n\nIf the mean is empty for a given segment ID `i`, `output[i] = 0`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/SegmentMean.png\" alt>\n</div>"
 }
 op {
   name: "SegmentMin"
@@ -25463,12 +23376,10 @@ op {
   }
   input_arg {
     name: "segment_ids"
-    description: "A 1-D tensor whose rank is equal to the rank of `data`\'s\nfirst dimension.  Values should be sorted and can be repeated."
     type_attr: "Tindices"
   }
   output_arg {
     name: "output"
-    description: "Has same shape as data, except for dimension 0 which\nhas size `k`, the number of segments."
     type_attr: "T"
   }
   attr {
@@ -25479,10 +23390,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -25500,8 +23412,6 @@ op {
       }
     }
   }
-  summary: "Computes the minimum along segments of a tensor."
-  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nComputes a tensor such that\n\\\\(output_i = \\min_j(data_j)\\\\) where `min` is over `j` such\nthat `segment_ids[j] == i`.\n\nIf the min is empty for a given segment ID `i`, `output[i] = 0`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/SegmentMin.png\" alt>\n</div>"
 }
 op {
   name: "SegmentProd"
@@ -25511,12 +23421,10 @@ op {
   }
   input_arg {
     name: "segment_ids"
-    description: "A 1-D tensor whose rank is equal to the rank of `data`\'s\nfirst dimension.  Values should be sorted and can be repeated."
     type_attr: "Tindices"
   }
   output_arg {
     name: "output"
-    description: "Has same shape as data, except for dimension 0 which\nhas size `k`, the number of segments."
     type_attr: "T"
   }
   attr {
@@ -25526,17 +23434,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -25553,8 +23462,6 @@ op {
       }
     }
   }
-  summary: "Computes the product along segments of a tensor."
-  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nComputes a tensor such that\n\\\\(output_i = \\prod_j data_j\\\\) where the product is over `j` such\nthat `segment_ids[j] == i`.\n\nIf the product is empty for a given segment ID `i`, `output[i] = 1`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/SegmentProd.png\" alt>\n</div>"
 }
 op {
   name: "SegmentSum"
@@ -25564,12 +23471,10 @@ op {
   }
   input_arg {
     name: "segment_ids"
-    description: "A 1-D tensor whose rank is equal to the rank of `data`\'s\nfirst dimension.  Values should be sorted and can be repeated."
     type_attr: "Tindices"
   }
   output_arg {
     name: "output"
-    description: "Has same shape as data, except for dimension 0 which\nhas size `k`, the number of segments."
     type_attr: "T"
   }
   attr {
@@ -25579,17 +23484,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -25606,8 +23512,6 @@ op {
       }
     }
   }
-  summary: "Computes the sum along segments of a tensor."
-  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nComputes a tensor such that\n\\\\(output_i = \\sum_j data_j\\\\) where sum is over `j` such\nthat `segment_ids[j] == i`.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/SegmentSum.png\" alt>\n</div>"
 }
 op {
   name: "Select"
@@ -25617,36 +23521,29 @@ op {
   }
   input_arg {
     name: "t"
-    description: "= A `Tensor` which may have the same shape as `condition`.\nIf `condition` is rank 1, `t` may have higher rank,\nbut its first dimension must match the size of `condition`."
     type_attr: "T"
   }
   input_arg {
     name: "e"
-    description: "= A `Tensor` with the same type and shape as `t`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "= A `Tensor` with the same type and shape as `t` and `e`."
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Selects elements from `t` or `e`, depending on `condition`."
-  description: "The `t`, and `e` tensors must all have the same shape, and the\noutput will also have that shape.\n\nThe `condition` tensor must be a scalar if `t` and `e` are scalars.\nIf `t` and `e` are vectors or higher rank, then `condition` must be either a\nscalar, a vector with size matching the first dimension of `t`, or must have\nthe same shape as `t`.\n\nThe `condition` tensor acts as a mask that chooses, based on the value at each\nelement, whether the corresponding element / row in the output should be\ntaken from `t` (if true) or `e` (if false).\n\nIf `condition` is a vector and `t` and `e` are higher rank matrices, then\nit chooses which row (outer dimension) to copy from `t` and `e`.\nIf `condition` has the same shape as `t` and `e`, then it chooses which\nelement to copy from `t` and `e`.\n\nFor example:\n\n```python\n# \'condition\' tensor is [[True,  False]\n#                        [False, True]]\n# \'t\' is [[1, 2],\n#         [3, 4]]\n# \'e\' is [[5, 6],\n#         [7, 8]]\nselect(condition, t, e)  # => [[1, 6], [7, 4]]\n\n\n# \'condition\' tensor is [True, False]\n# \'t\' is [[1, 2],\n#         [3, 4]]\n# \'e\' is [[5, 6],\n#         [7, 8]]\nselect(condition, t, e) ==> [[1, 2],\n                             [7, 8]]\n\n```"
 }
 op {
   name: "SelfAdjointEig"
   input_arg {
     name: "input"
-    description: "Shape is `[..., M, M]`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "Shape is `[..., M+1, M]`."
     type_attr: "T"
   }
   attr {
@@ -25659,8 +23556,6 @@ op {
       }
     }
   }
-  summary: "Computes the Eigen Decomposition of a batch of square self-adjoint matrices."
-  description: "The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices, with the same constraints as the single matrix\nSelfAdjointEig.\n\nThe result is a [..., M+1, M] matrix with [..., 0,:] containing the\neigenvalues, and subsequent [...,1:, :] containing the eigenvectors."
   deprecation {
     version: 11
     explanation: "Use SelfAdjointEigV2 instead."
@@ -25670,17 +23565,14 @@ op {
   name: "SelfAdjointEigV2"
   input_arg {
     name: "input"
-    description: "`Tensor` input of shape `[N, N]`."
     type_attr: "T"
   }
   output_arg {
     name: "e"
-    description: "Eigenvalues. Shape is `[N]`."
     type_attr: "T"
   }
   output_arg {
     name: "v"
-    description: "Eigenvectors. Shape is `[N, N]`."
     type_attr: "T"
   }
   attr {
@@ -25689,7 +23581,6 @@ op {
     default_value {
       b: true
     }
-    description: "If `True` then eigenvectors will be computed and returned in `v`.\nOtherwise, only the eigenvalues will be computed."
   }
   attr {
     name: "T"
@@ -25703,8 +23594,6 @@ op {
       }
     }
   }
-  summary: "Computes the eigen decomposition of one or more square self-adjoint matrices."
-  description: "Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in\n`input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`.\n\n```python\n# a is a tensor.\n# e is a tensor of eigenvalues.\n# v is a tensor of eigenvectors.\ne, v = self_adjoint_eig(a)\ne = self_adjoint_eig(a, compute_v=False)\n```"
 }
 op {
   name: "Selu"
@@ -25722,29 +23611,25 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`"
-  description: "if < 0, `scale * features` otherwise.\n\nSee [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)"
 }
 op {
   name: "SeluGrad"
   input_arg {
     name: "gradients"
-    description: "The backpropagated gradients to the corresponding Selu operation."
     type_attr: "T"
   }
   input_arg {
     name: "outputs"
-    description: "The outputs of the corresponding Selu operation."
     type_attr: "T"
   }
   output_arg {
     name: "backprops"
-    description: "The gradients: `gradients * (outputs + scale * alpha)`\nif outputs < 0, `scale * gradients` otherwise."
     type_attr: "T"
   }
   attr {
@@ -25753,122 +23638,128 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Computes gradients for the scaled exponential linear (Selu) operation."
 }
 op {
   name: "SerializeIterator"
   input_arg {
     name: "resource_handle"
-    description: "A handle to an iterator resource."
     type: DT_RESOURCE
   }
   output_arg {
     name: "serialized"
-    description: "A variant tensor storing the state of the iterator contained in the\nresource."
     type: DT_VARIANT
   }
-  summary: "Converts the given `resource_handle` representing an iterator to a variant tensor."
   is_stateful: true
 }
 op {
   name: "SerializeManySparse"
   input_arg {
     name: "sparse_indices"
-    description: "2-D.  The `indices` of the minibatch `SparseTensor`."
     type: DT_INT64
   }
   input_arg {
     name: "sparse_values"
-    description: "1-D.  The `values` of the minibatch `SparseTensor`."
     type_attr: "T"
   }
   input_arg {
     name: "sparse_shape"
-    description: "1-D.  The `shape` of the minibatch `SparseTensor`."
     type: DT_INT64
   }
   output_arg {
     name: "serialized_sparse"
-    type: DT_STRING
+    type_attr: "out_type"
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` string `Tensor`."
-  description: "The `SparseTensor` must have rank `R` greater than 1, and the first dimension\nis treated as the minibatch dimension.  Elements of the `SparseTensor`\nmust be sorted in increasing order of this first dimension.  The serialized\n`SparseTensor` objects going into each row of `serialized_sparse` will have\nrank `R-1`.\n\nThe minibatch size `N` is extracted from `sparse_shape[0]`."
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_STRING
+    }
+    allowed_values {
+      list {
+        type: DT_STRING
+        type: DT_VARIANT
+      }
+    }
+  }
 }
 op {
   name: "SerializeSparse"
   input_arg {
     name: "sparse_indices"
-    description: "2-D.  The `indices` of the `SparseTensor`."
     type: DT_INT64
   }
   input_arg {
     name: "sparse_values"
-    description: "1-D.  The `values` of the `SparseTensor`."
     type_attr: "T"
   }
   input_arg {
     name: "sparse_shape"
-    description: "1-D.  The `shape` of the `SparseTensor`."
     type: DT_INT64
   }
   output_arg {
     name: "serialized_sparse"
-    type: DT_STRING
+    type_attr: "out_type"
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Serialize a `SparseTensor` into a string 3-vector (1-D `Tensor`) object."
-}
-op {
-  name: "SerializeTensor"
-  input_arg {
-    name: "tensor"
-    description: "A Tensor of type `T`."
-    type_attr: "T"
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_STRING
+    }
+    allowed_values {
+      list {
+        type: DT_STRING
+        type: DT_VARIANT
+      }
+    }
+  }
+}
+op {
+  name: "SerializeTensor"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
   }
   output_arg {
     name: "serialized"
-    description: "A serialized TensorProto proto of the input tensor."
     type: DT_STRING
   }
   attr {
     name: "T"
     type: "type"
-    description: "The type of the input tensor."
   }
-  summary: "Transforms a Tensor into a serialized TensorProto proto."
 }
 op {
   name: "SetSize"
   input_arg {
     name: "set_indices"
-    description: "2D `Tensor`, indices of a `SparseTensor`."
     type: DT_INT64
   }
   input_arg {
     name: "set_values"
-    description: "1D `Tensor`, values of a `SparseTensor`."
     type_attr: "T"
   }
   input_arg {
     name: "set_shape"
-    description: "1D `Tensor`, shape of a `SparseTensor`."
     type: DT_INT64
   }
   output_arg {
     name: "size"
-    description: "For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st\n`n-1` dimensions as `set`. Each value is the number of unique elements in\nthe corresponding `[0...n-1]` dimension of `set`."
     type: DT_INT32
   }
   attr {
@@ -25893,8 +23784,6 @@ op {
       }
     }
   }
-  summary: "Number of unique elements along last dimension of input `set`."
-  description: "Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,\nand `set_shape`. The last dimension contains values in a set, duplicates are\nallowed but ignored.\n\nIf `validate_indices` is `True`, this op validates the order and range of `set`\nindices."
 }
 op {
   name: "Shape"
@@ -25923,8 +23812,6 @@ op {
       }
     }
   }
-  summary: "Returns the shape of a tensor."
-  description: "This operation returns a 1-D integer tensor representing the shape of `input`.\n\nFor example:\n\n```\n# \'t\' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]\nshape(t) ==> [2, 2, 3]\n```"
 }
 op {
   name: "ShapeN"
@@ -25961,8 +23848,6 @@ op {
       }
     }
   }
-  summary: "Returns shape of tensors."
-  description: "This operation returns N 1-D integer tensors representing shape of `input[i]s`."
 }
 op {
   name: "ShardedFilename"
@@ -25982,8 +23867,6 @@ op {
     name: "filename"
     type: DT_STRING
   }
-  summary: "Generate a sharded filename. The filename is printf formatted as"
-  description: "   %s-%05d-of-%05d, basename, shard, num_shards."
 }
 op {
   name: "ShardedFilespec"
@@ -25999,7 +23882,45 @@ op {
     name: "filename"
     type: DT_STRING
   }
-  summary: "Generate a glob pattern matching all sharded file names."
+}
+op {
+  name: "ShuffleAndRepeatDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "count"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
 }
 op {
   name: "ShuffleDataset"
@@ -26009,17 +23930,14 @@ op {
   }
   input_arg {
     name: "buffer_size"
-    description: "The number of output elements to buffer in an iterator over\nthis dataset. Compare with the `min_after_dequeue` attr when creating a\n`RandomShuffleQueue`."
     type: DT_INT64
   }
   input_arg {
     name: "seed"
-    description: "A scalar seed for the random number generator. If either seed or\nseed2 is set to be non-zero, the random number generator is seeded\nby the given seed.  Otherwise, a random seed is used."
     type: DT_INT64
   }
   input_arg {
     name: "seed2"
-    description: "A second scalar seed to avoid seed collision."
     type: DT_INT64
   }
   output_arg {
@@ -26032,7 +23950,6 @@ op {
     default_value {
       b: true
     }
-    description: "If true, each iterator over this dataset will be given\na different pseudorandomly generated seed, based on a sequence seeded by the\n`seed` and `seed2` inputs. If false, each iterator will be given the same\nseed, and repeated iteration over this dataset will yield the exact same\nsequence of results."
   }
   attr {
     name: "output_types"
@@ -26046,7 +23963,6 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset that shuffles elements from `input_dataset` pseudorandomly."
 }
 op {
   name: "Sigmoid"
@@ -26064,6 +23980,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -26071,8 +23988,6 @@ op {
       }
     }
   }
-  summary: "Computes sigmoid of `x` element-wise."
-  description: "Specifically, `y = 1 / (1 + exp(-x))`."
 }
 op {
   name: "SigmoidGrad"
@@ -26094,6 +24009,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -26101,8 +24017,6 @@ op {
       }
     }
   }
-  summary: "Computes the gradient of the sigmoid of `x` wrt its input."
-  description: "Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and\n`dy` is the corresponding input gradient."
 }
 op {
   name: "Sign"
@@ -26120,6 +24034,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -26129,8 +24044,6 @@ op {
       }
     }
   }
-  summary: "Returns an element-wise indication of the sign of a number."
-  description: "`y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.\n\nFor complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`."
 }
 op {
   name: "Sin"
@@ -26148,6 +24061,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -26155,7 +24069,6 @@ op {
       }
     }
   }
-  summary: "Computes sin of x element-wise."
 }
 op {
   name: "Sinh"
@@ -26173,6 +24086,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -26180,7 +24094,6 @@ op {
       }
     }
   }
-  summary: "Computes hyperbolic sine of x element-wise."
 }
 op {
   name: "Size"
@@ -26209,8 +24122,6 @@ op {
       }
     }
   }
-  summary: "Returns the size of a tensor."
-  description: "This operation returns an integer representing the number of elements in\n`input`.\n\nFor example:\n\n```\n# \'t\' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]\nsize(t) ==> 12\n```"
 }
 op {
   name: "SkipDataset"
@@ -26220,7 +24131,6 @@ op {
   }
   input_arg {
     name: "count"
-    description: "A scalar representing the number of elements from the `input_dataset`\nthat should be skipped.  If count is -1, skips everything."
     type: DT_INT64
   }
   output_arg {
@@ -26239,54 +24149,44 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset that skips `count` elements from the `input_dataset`."
 }
 op {
   name: "Skipgram"
   output_arg {
     name: "vocab_word"
-    description: "A vector of words in the corpus."
     type: DT_STRING
   }
   output_arg {
     name: "vocab_freq"
-    description: "Frequencies of words. Sorted in the non-ascending order."
     type: DT_INT32
   }
   output_arg {
     name: "words_per_epoch"
-    description: "Number of words per epoch in the data file."
     type: DT_INT64
   }
   output_arg {
     name: "current_epoch"
-    description: "The current epoch number."
     type: DT_INT32
   }
   output_arg {
     name: "total_words_processed"
-    description: "The total number of words processed so far."
     type: DT_INT64
   }
   output_arg {
     name: "examples"
-    description: "A vector of word ids."
     type: DT_INT32
   }
   output_arg {
     name: "labels"
-    description: "A vector of word ids."
     type: DT_INT32
   }
   attr {
     name: "filename"
     type: "string"
-    description: "The corpus\'s text file name."
   }
   attr {
     name: "batch_size"
     type: "int"
-    description: "The size of produced batch."
   }
   attr {
     name: "window_size"
@@ -26294,7 +24194,6 @@ op {
     default_value {
       i: 5
     }
-    description: "The number of words to predict to the left and right of the target."
   }
   attr {
     name: "min_count"
@@ -26302,7 +24201,6 @@ op {
     default_value {
       i: 5
     }
-    description: "The minimum number of word occurrences for it to be included in the\nvocabulary."
   }
   attr {
     name: "subsample"
@@ -26310,9 +24208,7 @@ op {
     default_value {
       f: 0.001
     }
-    description: "Threshold for word occurrence. Words that appear with higher\nfrequency will be randomly down-sampled. Set to 0 to disable."
   }
-  summary: "Parses a text file and creates a batch of examples."
   deprecation {
     version: 19
     explanation: "Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result"
@@ -26327,12 +24223,10 @@ op {
   }
   input_arg {
     name: "begin"
-    description: "begin[i] specifies the offset into the \'i\'th dimension of\n\'input\' to slice from."
     type_attr: "Index"
   }
   input_arg {
     name: "size"
-    description: "size[i] specifies the number of elements of the \'i\'th dimension\nof \'input\' to slice. If size[i] is -1, all remaining elements in dimension\ni are included in the slice (i.e. this is equivalent to setting\nsize[i] = input.dim_size(i) - begin[i])."
     type_attr: "Index"
   }
   output_arg {
@@ -26353,19 +24247,30 @@ op {
       }
     }
   }
-  summary: "Return a slice from \'input\'."
-  description: "The output tensor is a tensor with dimensions described by \'size\'\nwhose values are extracted from \'input\' starting at the offsets in\n\'begin\'.\n\n*Requirements*:\n  0 <= begin[i] <= begin[i] + size[i] <= Di  for i in [0, n)"
+}
+op {
+  name: "Snapshot"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
 }
 op {
   name: "Softmax"
   input_arg {
     name: "logits"
-    description: "2-D with shape `[batch_size, num_classes]`."
     type_attr: "T"
   }
   output_arg {
     name: "softmax"
-    description: "Same shape as `logits`."
     type_attr: "T"
   }
   attr {
@@ -26374,34 +24279,29 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Computes softmax activations."
-  description: "For each batch `i` and class `j` we have\n\n    softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))"
 }
 op {
   name: "SoftmaxCrossEntropyWithLogits"
   input_arg {
     name: "features"
-    description: "batch_size x num_classes matrix"
     type_attr: "T"
   }
   input_arg {
     name: "labels"
-    description: "batch_size x num_classes matrix\nThe caller must ensure that each batch of labels represents a valid\nprobability distribution."
     type_attr: "T"
   }
   output_arg {
     name: "loss"
-    description: "Per example loss (batch_size vector)."
     type_attr: "T"
   }
   output_arg {
     name: "backprop"
-    description: "backpropagated gradients (batch_size x num_classes matrix)."
     type_attr: "T"
   }
   attr {
@@ -26410,13 +24310,12 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Computes softmax cross entropy cost and gradients to backpropagate."
-  description: "Inputs are the logits, not probabilities."
 }
 op {
   name: "Softplus"
@@ -26436,10 +24335,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -26447,23 +24347,19 @@ op {
       }
     }
   }
-  summary: "Computes softplus: `log(exp(features) + 1)`."
 }
 op {
   name: "SoftplusGrad"
   input_arg {
     name: "gradients"
-    description: "The backpropagated gradients to the corresponding softplus operation."
     type_attr: "T"
   }
   input_arg {
     name: "features"
-    description: "The features passed as input to the corresponding softplus operation."
     type_attr: "T"
   }
   output_arg {
     name: "backprops"
-    description: "The gradients: `gradients / (1 + exp(-features))`."
     type_attr: "T"
   }
   attr {
@@ -26474,10 +24370,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -26485,7 +24382,6 @@ op {
       }
     }
   }
-  summary: "Computes softplus gradients for a softplus operation."
 }
 op {
   name: "Softsign"
@@ -26505,10 +24401,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -26516,23 +24413,19 @@ op {
       }
     }
   }
-  summary: "Computes softsign: `features / (abs(features) + 1)`."
 }
 op {
   name: "SoftsignGrad"
   input_arg {
     name: "gradients"
-    description: "The backpropagated gradients to the corresponding softsign operation."
     type_attr: "T"
   }
   input_arg {
     name: "features"
-    description: "The features passed as input to the corresponding softsign operation."
     type_attr: "T"
   }
   output_arg {
     name: "backprops"
-    description: "The gradients: `gradients / (1 + abs(features)) ** 2`."
     type_attr: "T"
   }
   attr {
@@ -26543,10 +24436,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -26554,18 +24448,15 @@ op {
       }
     }
   }
-  summary: "Computes softsign gradients for a softsign operation."
 }
 op {
   name: "SpaceToBatch"
   input_arg {
     name: "input"
-    description: "4-D with shape `[batch, height, width, depth]`."
     type_attr: "T"
   }
   input_arg {
     name: "paddings"
-    description: "2-D tensor of non-negative integers with shape `[2, 2]`. It specifies\n  the padding of the input with zeros across the spatial dimensions as follows:\n\n      paddings = [[pad_top, pad_bottom], [pad_left, pad_right]]\n\n  The effective spatial dimensions of the zero-padded input tensor will be:\n\n      height_pad = pad_top + height + pad_bottom\n      width_pad = pad_left + width + pad_right\n\nThe attr `block_size` must be greater than one. It indicates the block size.\n\n  * Non-overlapping blocks of size `block_size x block size` in the height and\n    width dimensions are rearranged into the batch dimension at each location.\n  * The batch of the output tensor is `batch * block_size * block_size`.\n  * Both height_pad and width_pad must be divisible by block_size.\n\nThe shape of the output will be:\n\n    [batch*block_size*block_size, height_pad/block_size, width_pad/block_size,\n     depth]\n\nSome examples:\n\n(1) For the following input of shape `[1, 2, 2, 1]` and block_size of 2:\n\n```\nx = [[[[1], [2]], [[3], [4]]]]\n```\n\nThe output tensor has shape `[4, 1, 1, 1]` and value:\n\n```\n[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]\n```\n\n(2) For the following input of shape `[1, 2, 2, 3]` and block_size of 2:\n\n```\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\nThe output tensor has shape `[4, 1, 1, 3]` and value:\n\n```\n[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]\n```\n\n(3) For the following input of shape `[1, 4, 4, 1]` and block_size of 2:\n\n```\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]],\n      [[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```\n\nThe output tensor has shape `[4, 2, 2, 1]` and value:\n\n```\nx = [[[[1], [3]], [[9], [11]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```\n\n(4) For the following input of shape `[2, 2, 4, 1]` and block_size of 2:\n\n```\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]]],\n     [[[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```\n\nThe output tensor has shape `[8, 1, 2, 1]` and value:\n\n```\nx = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],\n     [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]\n```\n\nAmong others, this operation is useful for reducing atrous convolution into\nregular convolution."
     type_attr: "Tpaddings"
   }
   output_arg {
@@ -26595,24 +24486,19 @@ op {
     has_minimum: true
     minimum: 2
   }
-  summary: "SpaceToBatch for 4-D tensors of type T."
-  description: "This is a legacy version of the more general SpaceToBatchND.\n\nZero-pads and then rearranges (permutes) blocks of spatial data into batch.\nMore specifically, this op outputs a copy of the input tensor where values from\nthe `height` and `width` dimensions are moved to the `batch` dimension. After\nthe zero-padding, both `height` and `width` of the input must be divisible by the\nblock size."
 }
 op {
   name: "SpaceToBatchND"
   input_arg {
     name: "input"
-    description: "N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,\nwhere spatial_shape has `M` dimensions."
     type_attr: "T"
   }
   input_arg {
     name: "block_shape"
-    description: "1-D with shape `[M]`, all values must be >= 1."
     type_attr: "Tblock_shape"
   }
   input_arg {
     name: "paddings"
-    description: "2-D with shape `[M, 2]`, all values must be >= 0.\n  `paddings[i] = [pad_start, pad_end]` specifies the padding for input dimension\n  `i + 1`, which corresponds to spatial dimension `i`.  It is required that\n  `block_shape[i]` divides `input_shape[i + 1] + pad_start + pad_end`.\n\nThis operation is equivalent to the following steps:\n\n1. Zero-pad the start and end of dimensions `[1, ..., M]` of the\n   input according to `paddings` to produce `padded` of shape `padded_shape`.\n\n2. Reshape `padded` to `reshaped_padded` of shape:\n\n     [batch] +\n     [padded_shape[1] / block_shape[0],\n       block_shape[0],\n      ...,\n      padded_shape[M] / block_shape[M-1],\n      block_shape[M-1]] +\n     remaining_shape\n\n3. Permute dimensions of `reshaped_padded` to produce\n   `permuted_reshaped_padded` of shape:\n\n     block_shape +\n     [batch] +\n     [padded_shape[1] / block_shape[0],\n      ...,\n      padded_shape[M] / block_shape[M-1]] +\n     remaining_shape\n\n4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the batch\n   dimension, producing an output tensor of shape:\n\n     [batch * prod(block_shape)] +\n     [padded_shape[1] / block_shape[0],\n      ...,\n      padded_shape[M] / block_shape[M-1]] +\n     remaining_shape\n\nSome examples:\n\n(1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and\n    `paddings = [[0, 0], [0, 0]]`:\n\n```\nx = [[[[1], [2]], [[3], [4]]]]\n```\n\nThe output tensor has shape `[4, 1, 1, 1]` and value:\n\n```\n[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]\n```\n\n(2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and\n    `paddings = [[0, 0], [0, 0]]`:\n\n```\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\nThe output tensor has shape `[4, 1, 1, 3]` and value:\n\n```\n[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]\n```\n\n(3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and\n    `paddings = [[0, 0], [0, 0]]`:\n\n```\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]],\n      [[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```\n\nThe output tensor has shape `[4, 2, 2, 1]` and value:\n\n```\nx = [[[[1], [3]], [[9], [11]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```\n\n(4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and\n    paddings = `[[0, 0], [2, 0]]`:\n\n```\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]]],\n     [[[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```\n\nThe output tensor has shape `[8, 1, 3, 1]` and value:\n\n```\nx = [[[[0], [1], [3]]], [[[0], [9], [11]]],\n     [[[0], [2], [4]]], [[[0], [10], [12]]],\n     [[[0], [5], [7]]], [[[0], [13], [15]]],\n     [[[0], [6], [8]]], [[[0], [14], [16]]]]\n```\n\nAmong others, this operation is useful for reducing atrous convolution into\nregular convolution."
     type_attr: "Tpaddings"
   }
   output_arg {
@@ -26649,8 +24535,6 @@ op {
       }
     }
   }
-  summary: "SpaceToBatch for N-D tensors of type T."
-  description: "This operation divides \"spatial\" dimensions `[1, ..., M]` of the input into a\ngrid of blocks of shape `block_shape`, and interleaves these blocks with the\n\"batch\" dimension (0) such that in the output, the spatial dimensions\n`[1, ..., M]` correspond to the position within the grid, and the batch\ndimension combines both the position within a spatial block and the original\nbatch position.  Prior to division into blocks, the spatial dimensions of the\ninput are optionally zero padded according to `paddings`.  See below for a\nprecise description."
 }
 op {
   name: "SpaceToDepth"
@@ -26669,7 +24553,6 @@ op {
   attr {
     name: "block_size"
     type: "int"
-    description: "The size of the spatial block."
     has_minimum: true
     minimum: 2
   }
@@ -26687,56 +24570,49 @@ op {
       }
     }
   }
-  summary: "SpaceToDepth for tensors of type T."
-  description: "Rearranges blocks of spatial data, into depth. More specifically,\nthis op outputs a copy of the input tensor where values from the `height`\nand `width` dimensions are moved to the `depth` dimension.\nThe attr `block_size` indicates the input block size.\n\n  * Non-overlapping blocks of size `block_size x block size` are rearranged\n    into depth at each location.\n  * The depth of the output tensor is `block_size * block_size * input_depth`.\n  * The Y, X coordinates within each block of the input become the high order\n    component of the output channel index.\n  * The input tensor\'s height and width must be divisible by block_size.\n\nThe `data_format` attr specifies the layout of the input and output tensors\nwith the following options:\n  \"NHWC\": `[ batch, height, width, channels ]`\n  \"NCHW\": `[ batch, channels, height, width ]`\n  \"NCHW_VECT_C\":\n      `qint8 [ batch, channels / 4, height, width, channels % 4 ]`\n\nIt is useful to consider the operation as transforming a 6-D Tensor.\ne.g. for data_format = NHWC,\n     Each element in the input tensor can be specified via 6 coordinates,\n     ordered by decreasing memory layout significance as:\n     n,oY,bY,oX,bX,iC  (where n=batch index, oX, oY means X or Y coordinates\n                        within the output image, bX, bY means coordinates\n                        within the input block, iC means input channels).\n     The output would be a transpose to the following layout:\n     n,oY,oX,bY,bX,iC\n\nThis operation is useful for resizing the activations between convolutions\n(but keeping all data), e.g. instead of pooling. It is also useful for training\npurely convolutional models.\n\nFor example, given an input of shape `[1, 2, 2, 1]`, data_format = \"NHWC\" and\nblock_size = 2:\n\n```\nx = [[[[1], [2]],\n      [[3], [4]]]]\n```\n\nThis operation will output a tensor of shape `[1, 1, 1, 4]`:\n\n```\n[[[[1, 2, 3, 4]]]]\n```\n\nHere, the input has a batch of 1 and each batch element has shape `[2, 2, 1]`,\nthe corresponding output will have a single element (i.e. width and height are\nboth 1) and will have a depth of 4 channels (1 * block_size * block_size).\nThe output element shape is `[1, 1, 4]`.\n\nFor an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g.\n\n```\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\nThis operation, for block_size of 2, will return the following tensor of shape\n`[1, 1, 1, 12]`\n\n```\n[[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]\n```\n\nSimilarly, for the following input of shape `[1 4 4 1]`, and a block size of 2:\n\n```\nx = [[[[1],   [2],  [5],  [6]],\n      [[3],   [4],  [7],  [8]],\n      [[9],  [10], [13],  [14]],\n      [[11], [12], [15],  [16]]]]\n```\n\nthe operator will return the following tensor of shape `[1 2 2 4]`:\n\n```\nx = [[[[1, 2, 3, 4],\n       [5, 6, 7, 8]],\n      [[9, 10, 11, 12],\n       [13, 14, 15, 16]]]]\n```"
 }
 op {
   name: "SparseAccumulatorApplyGradient"
   input_arg {
     name: "handle"
-    description: "The handle to a accumulator."
     type: DT_STRING
     is_ref: true
   }
   input_arg {
     name: "local_step"
-    description: "The local_step value at which the sparse gradient was computed."
     type: DT_INT64
   }
   input_arg {
     name: "gradient_indices"
-    description: "Indices of the sparse gradient to be accumulated. Must be a\nvector."
     type: DT_INT64
   }
   input_arg {
     name: "gradient_values"
-    description: "Values are the non-zero slices of the gradient, and must have\nthe same first dimension as indices, i.e., the nnz represented by indices and\nvalues must be consistent."
     type_attr: "dtype"
   }
   input_arg {
     name: "gradient_shape"
-    description: "Shape of the sparse gradient to be accumulated."
     type: DT_INT64
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "The data type of accumulated gradients. Needs to correspond to the type\nof the accumulator."
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -26746,102 +24622,85 @@ op {
   attr {
     name: "has_known_shape"
     type: "bool"
-    description: "Boolean indicating whether gradient_shape is unknown, in which\ncase the input is ignored during validation."
   }
-  summary: "Applies a sparse gradient to a given accumulator."
-  description: "Does not add if local_step is smaller than the accumulator\'s\nglobal_step."
 }
 op {
   name: "SparseAccumulatorTakeGradient"
   input_arg {
     name: "handle"
-    description: "The handle to a SparseConditionalAccumulator."
     type: DT_STRING
     is_ref: true
   }
   input_arg {
     name: "num_required"
-    description: "Number of gradients required before we return an aggregate."
     type: DT_INT32
   }
   output_arg {
     name: "indices"
-    description: "Indices of the average of the accumulated sparse gradients."
     type: DT_INT64
   }
   output_arg {
     name: "values"
-    description: "Values of the average of the accumulated sparse gradients."
     type_attr: "dtype"
   }
   output_arg {
     name: "shape"
-    description: "Shape of the average of the accumulated sparse gradients."
     type: DT_INT64
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "The data type of accumulated gradients. Needs to correspond to the type\nof the accumulator."
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  summary: "Extracts the average sparse gradient in a SparseConditionalAccumulator."
-  description: "The op will blocks until sufficient (i.e., more than num_required)\ngradients have been accumulated. If the accumulator has already\naggregated more than num_required gradients, it will return its\naverage of the accumulated gradients.  Also automatically increments\nthe recorded global_step in the accumulator by 1, and resets the\naggregate to 0."
 }
 op {
   name: "SparseAdd"
   input_arg {
     name: "a_indices"
-    description: "2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix."
     type: DT_INT64
   }
   input_arg {
     name: "a_values"
-    description: "1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector."
     type_attr: "T"
   }
   input_arg {
     name: "a_shape"
-    description: "1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector."
     type: DT_INT64
   }
   input_arg {
     name: "b_indices"
-    description: "2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix."
     type: DT_INT64
   }
   input_arg {
     name: "b_values"
-    description: "1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector."
     type_attr: "T"
   }
   input_arg {
     name: "b_shape"
-    description: "1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector."
     type: DT_INT64
   }
   input_arg {
     name: "thresh"
-    description: "0-D.  The magnitude threshold that determines if an output value/index\npair takes space."
     type_attr: "Treal"
   }
   output_arg {
@@ -26863,17 +24722,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -26888,10 +24748,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -26899,39 +24760,31 @@ op {
       }
     }
   }
-  summary: "Adds two `SparseTensor` objects to produce another `SparseTensor`."
-  description: "The input `SparseTensor` objects\' indices are assumed ordered in standard\nlexicographic order.  If this is not the case, before this step run\n`SparseReorder` to restore index ordering.\n\nBy default, if two values sum to zero at some index, the output `SparseTensor`\nwould still include that particular location in its index, storing a zero in the\ncorresponding value slot.  To override this, callers can specify `thresh`,\nindicating that if the sum has a magnitude strictly smaller than `thresh`, its\ncorresponding value and index would then not be included.  In particular,\n`thresh == 0` (default) means everything is kept and actual thresholding happens\nonly for a positive value.\n\nIn the following shapes, `nnz` is the count after taking `thresh` into account."
 }
 op {
   name: "SparseAddGrad"
   input_arg {
     name: "backprop_val_grad"
-    description: "1-D with shape `[nnz(sum)]`.  The gradient with respect to\nthe non-empty values of the sum."
     type_attr: "T"
   }
   input_arg {
     name: "a_indices"
-    description: "2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`."
     type: DT_INT64
   }
   input_arg {
     name: "b_indices"
-    description: "2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`."
     type: DT_INT64
   }
   input_arg {
     name: "sum_indices"
-    description: "2-D.  The `indices` of the sum `SparseTensor`, size\n`[nnz(sum), ndims]`."
     type: DT_INT64
   }
   output_arg {
     name: "a_val_grad"
-    description: "1-D with shape `[nnz(A)]`. The gradient with respect to the\nnon-empty values of A."
     type_attr: "T"
   }
   output_arg {
     name: "b_val_grad"
-    description: "1-D with shape `[nnz(B)]`. The gradient with respect to the\nnon-empty values of B."
     type_attr: "T"
   }
   attr {
@@ -26941,25 +24794,24 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  summary: "The gradient operator for the SparseAdd op."
-  description: "The SparseAdd op calculates A + B, where A, B, and the sum are all represented\nas `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.\nnon-empty values of the sum, and outputs the gradients w.r.t. the non-empty\nvalues of A and B."
 }
 op {
   name: "SparseApplyAdadelta"
@@ -26970,44 +24822,36 @@ op {
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "accum_update"
-    description: ": Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "lr"
-    description: "Learning rate. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "rho"
-    description: "Decay factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "epsilon"
-    description: "Constant factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "A vector of indices into the first dimension of var and accum."
     type_attr: "Tindices"
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -27018,17 +24862,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -27051,42 +24896,34 @@ op {
     default_value {
       b: false
     }
-    description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "var: Should be from a Variable()."
 }
 op {
   name: "SparseApplyAdagrad"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "lr"
-    description: "Learning rate. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "A vector of indices into the first dimension of var and accum."
     type_attr: "Tindices"
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -27097,17 +24934,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -27130,64 +24968,51 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update relevant entries in \'*var\' and \'*accum\' according to the adagrad scheme."
-  description: "That is for rows we have grad for, we update var and accum as follows:\naccum += grad * grad\nvar -= lr * grad * (1 / sqrt(accum))"
 }
 op {
   name: "SparseApplyAdagradDA"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "gradient_accumulator"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "gradient_squared_accumulator"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "A vector of indices into the first dimension of var and accum."
     type_attr: "Tindices"
   }
   input_arg {
     name: "lr"
-    description: "Learning rate. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l1"
-    description: "L1 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l2"
-    description: "L2 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "global_step"
-    description: "Training step number. Must be a scalar."
     type: DT_INT64
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -27198,17 +25023,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -27231,44 +25057,36 @@ op {
     default_value {
       b: false
     }
-    description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Update entries in \'*var\' and \'*accum\' according to the proximal adagrad scheme."
 }
 op {
   name: "SparseApplyCenteredRMSProp"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "mg"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "ms"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "mom"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "rho"
-    description: "Decay rate. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
@@ -27277,22 +25095,18 @@ op {
   }
   input_arg {
     name: "epsilon"
-    description: "Ridge term. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "A vector of indices into the first dimension of var, ms and mom."
     type_attr: "Tindices"
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -27303,17 +25117,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -27336,64 +25151,51 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var, mg, ms, and mom tensors is\nprotected by a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update \'*var\' according to the centered RMSProp algorithm."
-  description: "The centered RMSProp algorithm uses an estimate of the centered second moment\n(i.e., the variance) for normalization, as opposed to regular RMSProp, which\nuses the (uncentered) second moment. This often helps with training, but is\nslightly more expensive in terms of computation and memory.\n\nNote that in dense implementation of this algorithm, mg, ms, and mom will\nupdate even if the grad is zero, but in this sparse implementation, mg, ms,\nand mom will not update in iterations during which the grad is zero.\n\nmean_square = decay * mean_square + (1-decay) * gradient ** 2\nmean_grad = decay * mean_grad + (1-decay) * gradient\nDelta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)\n\nms <- rho * ms_{t-1} + (1-rho) * grad * grad\nmom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)\nvar <- var - mom"
 }
 op {
   name: "SparseApplyFtrl"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "linear"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "A vector of indices into the first dimension of var and accum."
     type_attr: "Tindices"
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l1"
-    description: "L1 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l2"
-    description: "L2 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "lr_power"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -27404,17 +25206,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -27437,54 +25240,43 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update relevant entries in \'*var\' according to the Ftrl-proximal scheme."
-  description: "That is for rows we have grad for, we update var, accum and linear as follows:\naccum_new = accum + grad * grad\nlinear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new"
 }
 op {
   name: "SparseApplyFtrlV2"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "linear"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "A vector of indices into the first dimension of var and accum."
     type_attr: "Tindices"
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l1"
-    description: "L1 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l2"
-    description: "L2 shrinkage regulariation. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
@@ -27493,12 +25285,10 @@ op {
   }
   input_arg {
     name: "lr_power"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -27509,17 +25299,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -27542,48 +25333,38 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update relevant entries in \'*var\' according to the Ftrl-proximal scheme."
-  description: "That is for rows we have grad for, we update var, accum and linear as follows:\ngrad_with_shrinkage = grad + 2 * l2_shrinkage * var\naccum_new = accum + grad_with_shrinkage * grad_with_shrinkage\nlinear += grad_with_shrinkage +\n    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var\nquadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2\nvar = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0\naccum = accum_new"
 }
 op {
   name: "SparseApplyMomentum"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "lr"
-    description: "Learning rate. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "A vector of indices into the first dimension of var and accum."
     type_attr: "Tindices"
   }
   input_arg {
     name: "momentum"
-    description: "Momentum. Must be a scalar."
     type_attr: "T"
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -27594,17 +25375,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -27627,7 +25409,6 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var and accum tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
   attr {
     name: "use_nesterov"
@@ -27635,53 +25416,42 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, the tensor passed to compute grad will be\nvar - lr * momentum * accum, so in the end, the var you get is actually\nvar - lr * momentum * accum."
   }
-  summary: "Update relevant entries in \'*var\' and \'*accum\' according to the momentum scheme."
-  description: "Set use_nesterov = True if you want to use Nesterov momentum.\n\nThat is for rows we have grad for, we update var and accum as follows:\n\naccum = accum * momentum + grad\nvar -= lr * accum"
 }
 op {
   name: "SparseApplyProximalAdagrad"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "accum"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "lr"
-    description: "Learning rate. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l1"
-    description: "L1 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l2"
-    description: "L2 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "A vector of indices into the first dimension of var and accum."
     type_attr: "Tindices"
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -27692,17 +25462,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -27725,47 +25496,37 @@ op {
     default_value {
       b: false
     }
-    description: "If True, updating of the var and accum tensors will be protected by\na lock; otherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Sparse update entries in \'*var\' and \'*accum\' according to FOBOS algorithm."
-  description: "That is for rows we have grad for, we update var and accum as follows:\naccum += grad * grad\nprox_v = var\nprox_v -= lr * grad * (1 / sqrt(accum))\nvar = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}"
 }
 op {
   name: "SparseApplyProximalGradientDescent"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "alpha"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l1"
-    description: "L1 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "l2"
-    description: "L2 regularization. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "A vector of indices into the first dimension of var and accum."
     type_attr: "Tindices"
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -27776,17 +25537,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -27809,39 +25571,31 @@ op {
     default_value {
       b: false
     }
-    description: "If True, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
   }
-  summary: "Sparse update \'*var\' as FOBOS algorithm with fixed learning rate."
-  description: "That is for rows we have grad for, we update var as follows:\nprox_v = var - alpha * grad\nvar = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}"
 }
 op {
   name: "SparseApplyRMSProp"
   input_arg {
     name: "var"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "ms"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "mom"
-    description: "Should be from a Variable()."
     type_attr: "T"
     is_ref: true
   }
   input_arg {
     name: "lr"
-    description: "Scaling factor. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "rho"
-    description: "Decay rate. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
@@ -27850,22 +25604,18 @@ op {
   }
   input_arg {
     name: "epsilon"
-    description: "Ridge term. Must be a scalar."
     type_attr: "T"
   }
   input_arg {
     name: "grad"
-    description: "The gradient."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "A vector of indices into the first dimension of var, ms and mom."
     type_attr: "Tindices"
   }
   output_arg {
     name: "out"
-    description: "Same as \"var\"."
     type_attr: "T"
     is_ref: true
   }
@@ -27876,17 +25626,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -27909,50 +25660,40 @@ op {
     default_value {
       b: false
     }
-    description: "If `True`, updating of the var, ms, and mom tensors is protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
-  summary: "Update \'*var\' according to the RMSProp algorithm."
-  description: "Note that in dense implementation of this algorithm, ms and mom will\nupdate even if the grad is zero, but in this sparse implementation, ms\nand mom will not update in iterations during which the grad is zero.\n\nmean_square = decay * mean_square + (1-decay) * gradient ** 2\nDelta = learning_rate * gradient / sqrt(mean_square + epsilon)\n\nms <- rho * ms_{t-1} + (1-rho) * grad * grad\nmom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)\nvar <- var - mom"
 }
 op {
   name: "SparseConcat"
   input_arg {
     name: "indices"
-    description: "2-D.  Indices of each input `SparseTensor`."
     type: DT_INT64
     number_attr: "N"
   }
   input_arg {
     name: "values"
-    description: "1-D.  Non-empty values of each `SparseTensor`."
     type_attr: "T"
     number_attr: "N"
   }
   input_arg {
     name: "shapes"
-    description: "1-D.  Shapes of each `SparseTensor`."
     type: DT_INT64
     number_attr: "N"
   }
   output_arg {
     name: "output_indices"
-    description: "2-D.  Indices of the concatenated `SparseTensor`."
     type: DT_INT64
   }
   output_arg {
     name: "output_values"
-    description: "1-D.  Non-empty values of the concatenated `SparseTensor`."
     type_attr: "T"
   }
   output_arg {
     name: "output_shape"
-    description: "1-D.  Shape of the concatenated `SparseTensor`."
     type: DT_INT64
   }
   attr {
     name: "concat_dim"
     type: "int"
-    description: "Dimension to concatenate along. Must be in range [-rank, rank),\nwhere rank is the number of dimensions in each input `SparseTensor`."
   }
   attr {
     name: "N"
@@ -27964,36 +25705,33 @@ op {
     name: "T"
     type: "type"
   }
-  summary: "Concatenates a list of `SparseTensor` along the specified dimension."
-  description: "Concatenation is with respect to the dense versions of these sparse tensors.\nIt is assumed that each input is a `SparseTensor` whose elements are ordered\nalong increasing dimension number.\n\nAll inputs\' shapes must match, except for the concat dimension.  The\n`indices`, `values`, and `shapes` lists must have the same length.\n\nThe output shape is identical to the inputs\', except along the concat\ndimension, where it is the sum of the inputs\' sizes along that dimension.\n\nThe output elements will be resorted to preserve the sort order along\nincreasing dimension number.\n\nThis op runs in `O(M log M)` time, where `M` is the total number of non-empty\nvalues across all inputs. This is due to the need for an internal sort in\norder to concatenate efficiently across an arbitrary dimension.\n\nFor example, if `concat_dim = 1` and the inputs are\n\n    sp_inputs[0]: shape = [2, 3]\n    [0, 2]: \"a\"\n    [1, 0]: \"b\"\n    [1, 1]: \"c\"\n\n    sp_inputs[1]: shape = [2, 4]\n    [0, 1]: \"d\"\n    [0, 2]: \"e\"\n\nthen the output will be\n\n    shape = [2, 7]\n    [0, 2]: \"a\"\n    [0, 4]: \"d\"\n    [0, 5]: \"e\"\n    [1, 0]: \"b\"\n    [1, 1]: \"c\"\n\nGraphically this is equivalent to doing\n\n    [    a] concat [  d e  ] = [    a   d e  ]\n    [b c  ]        [       ]   [b c          ]"
 }
 op {
   name: "SparseConditionalAccumulator"
   output_arg {
     name: "handle"
-    description: "The handle to the accumulator."
     type: DT_STRING
     is_ref: true
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "The type of the value being accumulated."
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -28003,7 +25741,6 @@ op {
   attr {
     name: "shape"
     type: "shape"
-    description: "The shape of the values."
   }
   attr {
     name: "container"
@@ -28011,7 +25748,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this accumulator is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -28019,49 +25755,39 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this accumulator will be shared under the given name\nacross multiple sessions."
   }
-  summary: "A conditional accumulator for aggregating sparse gradients."
-  description: "The accumulator accepts gradients marked with local_step greater or\nequal to the most recent global_step known to the accumulator. The\naverage can be extracted from the accumulator, provided sufficient\ngradients have been accumulated. Extracting the average automatically\nresets the aggregate to 0, and increments the global_step recorded by\nthe accumulator."
   is_stateful: true
 }
 op {
   name: "SparseCross"
   input_arg {
     name: "indices"
-    description: "2-D.  Indices of each input `SparseTensor`."
     type: DT_INT64
     number_attr: "N"
   }
   input_arg {
     name: "values"
-    description: "1-D.   values of each `SparseTensor`."
     type_list_attr: "sparse_types"
   }
   input_arg {
     name: "shapes"
-    description: "1-D.   Shapes of each `SparseTensor`."
     type: DT_INT64
     number_attr: "N"
   }
   input_arg {
     name: "dense_inputs"
-    description: "2-D.    Columns represented by dense `Tensor`."
     type_list_attr: "dense_types"
   }
   output_arg {
     name: "output_indices"
-    description: "2-D.  Indices of the concatenated `SparseTensor`."
     type: DT_INT64
   }
   output_arg {
     name: "output_values"
-    description: "1-D.  Non-empty values of the concatenated or hashed\n`SparseTensor`."
     type_attr: "out_type"
   }
   output_arg {
     name: "output_shape"
-    description: "1-D.  Shape of the concatenated `SparseTensor`."
     type: DT_INT64
   }
   attr {
@@ -28072,18 +25798,15 @@ op {
   attr {
     name: "hashed_output"
     type: "bool"
-    description: "If true, returns the hash of the cross instead of the string.\nThis will allow us avoiding string manipulations."
   }
   attr {
     name: "num_buckets"
     type: "int"
-    description: "It is used if hashed_output is true.\noutput = hashed_value%num_buckets if num_buckets > 0 else hashed_value."
     has_minimum: true
   }
   attr {
     name: "hash_key"
     type: "int"
-    description: "Specify the hash_key that will be used by the `FingerprintCat64`\nfunction to combine the crosses fingerprints."
   }
   attr {
     name: "sparse_types"
@@ -28127,34 +25850,27 @@ op {
       }
     }
   }
-  summary: "Generates sparse cross from a list of sparse and dense tensors."
-  description: "The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each\nrepresenting features of one feature column. It outputs a 2D `SparseTensor` with\nthe batchwise crosses of these features.\n\nFor example, if the inputs are\n\n    inputs[0]: SparseTensor with shape = [2, 2]\n    [0, 0]: \"a\"\n    [1, 0]: \"b\"\n    [1, 1]: \"c\"\n\n    inputs[1]: SparseTensor with shape = [2, 1]\n    [0, 0]: \"d\"\n    [1, 0]: \"e\"\n\n    inputs[2]: Tensor [[\"f\"], [\"g\"]]\n\nthen the output will be\n\n    shape = [2, 2]\n    [0, 0]: \"a_X_d_X_f\"\n    [1, 0]: \"b_X_e_X_g\"\n    [1, 1]: \"c_X_e_X_g\"\n\nif hashed_output=true then the output will be\n\n    shape = [2, 2]\n    [0, 0]: FingerprintCat64(\n                Fingerprint64(\"f\"), FingerprintCat64(\n                    Fingerprint64(\"d\"), Fingerprint64(\"a\")))\n    [1, 0]: FingerprintCat64(\n                Fingerprint64(\"g\"), FingerprintCat64(\n                    Fingerprint64(\"e\"), Fingerprint64(\"b\")))\n    [1, 1]: FingerprintCat64(\n                Fingerprint64(\"g\"), FingerprintCat64(\n                    Fingerprint64(\"e\"), Fingerprint64(\"c\")))"
 }
 op {
   name: "SparseDenseCwiseAdd"
   input_arg {
     name: "sp_indices"
-    description: "2-D.  `N x R` matrix with the indices of non-empty values in a\nSparseTensor, possibly not in canonical ordering."
     type: DT_INT64
   }
   input_arg {
     name: "sp_values"
-    description: "1-D.  `N` non-empty values corresponding to `sp_indices`."
     type_attr: "T"
   }
   input_arg {
     name: "sp_shape"
-    description: "1-D.  Shape of the input SparseTensor."
     type: DT_INT64
   }
   input_arg {
     name: "dense"
-    description: "`R`-D.  The dense Tensor operand."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "1-D.  The `N` values that are operated on."
     type_attr: "T"
   }
   attr {
@@ -28164,51 +25880,45 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  summary: "Adds up a SparseTensor and a dense Tensor, using these special rules:"
-  description: "(1) Broadcasts the dense side to have the same shape as the sparse side, if\n    eligible;\n(2) Then, only the dense values pointed to by the indices of the SparseTensor\n    participate in the cwise addition.\n\nBy these rules, the result is a logical SparseTensor with exactly the same\nindices and shape, but possibly with different non-zero values.  The output of\nthis Op is the resultant non-zero values."
 }
 op {
   name: "SparseDenseCwiseDiv"
   input_arg {
     name: "sp_indices"
-    description: "2-D.  `N x R` matrix with the indices of non-empty values in a\nSparseTensor, possibly not in canonical ordering."
     type: DT_INT64
   }
   input_arg {
     name: "sp_values"
-    description: "1-D.  `N` non-empty values corresponding to `sp_indices`."
     type_attr: "T"
   }
   input_arg {
     name: "sp_shape"
-    description: "1-D.  Shape of the input SparseTensor."
     type: DT_INT64
   }
   input_arg {
     name: "dense"
-    description: "`R`-D.  The dense Tensor operand."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "1-D.  The `N` values that are operated on."
     type_attr: "T"
   }
   attr {
@@ -28218,51 +25928,45 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  summary: "Component-wise divides a SparseTensor by a dense Tensor."
-  description: "*Limitation*: this Op only broadcasts the dense side to the sparse side, but not\nthe other direction."
 }
 op {
   name: "SparseDenseCwiseMul"
   input_arg {
     name: "sp_indices"
-    description: "2-D.  `N x R` matrix with the indices of non-empty values in a\nSparseTensor, possibly not in canonical ordering."
     type: DT_INT64
   }
   input_arg {
     name: "sp_values"
-    description: "1-D.  `N` non-empty values corresponding to `sp_indices`."
     type_attr: "T"
   }
   input_arg {
     name: "sp_shape"
-    description: "1-D.  Shape of the input SparseTensor."
     type: DT_INT64
   }
   input_arg {
     name: "dense"
-    description: "`R`-D.  The dense Tensor operand."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "1-D.  The `N` values that are operated on."
     type_attr: "T"
   }
   attr {
@@ -28272,46 +25976,41 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  summary: "Component-wise multiplies a SparseTensor by a dense Tensor."
-  description: "The output locations corresponding to the implicitly zero elements in the sparse\ntensor will be zero (i.e., will not take up storage space), regardless of the\ncontents of the dense tensor (even if it\'s +/-INF and that INF*0 == NaN).\n\n*Limitation*: this Op only broadcasts the dense side to the sparse side, but not\nthe other direction."
 }
 op {
   name: "SparseFillEmptyRows"
   input_arg {
     name: "indices"
-    description: "2-D. the indices of the sparse tensor."
     type: DT_INT64
   }
   input_arg {
     name: "values"
-    description: "1-D. the values of the sparse tensor."
     type_attr: "T"
   }
   input_arg {
     name: "dense_shape"
-    description: "1-D. the shape of the sparse tensor."
     type: DT_INT64
   }
   input_arg {
     name: "default_value"
-    description: "0-D. default value to insert into location `[row, 0, ..., 0]`\n  for rows missing from the input sparse tensor.\noutput indices: 2-D. the indices of the filled sparse tensor."
     type_attr: "T"
   }
   output_arg {
@@ -28320,54 +26019,43 @@ op {
   }
   output_arg {
     name: "output_values"
-    description: "1-D. the values of the filled sparse tensor."
     type_attr: "T"
   }
   output_arg {
     name: "empty_row_indicator"
-    description: "1-D. whether the dense row was missing in the\ninput sparse tensor."
     type: DT_BOOL
   }
   output_arg {
     name: "reverse_index_map"
-    description: "1-D. a map from the input indices to the output indices."
     type: DT_INT64
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Fills empty rows in the input 2-D `SparseTensor` with a default value."
-  description: "The input `SparseTensor` is represented via the tuple of inputs\n(`indices`, `values`, `dense_shape`).  The output `SparseTensor` has the\nsame `dense_shape` but with indices `output_indices` and values\n`output_values`.\n\nThis op inserts a single entry for every row that doesn\'t have any values.\nThe index is created as `[row, 0, ..., 0]` and the inserted value\nis `default_value`.\n\nFor example, suppose `sp_input` has shape `[5, 6]` and non-empty values:\n\n    [0, 1]: a\n    [0, 3]: b\n    [2, 0]: c\n    [3, 1]: d\n\nRows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values:\n\n    [0, 1]: a\n    [0, 3]: b\n    [1, 0]: default_value\n    [2, 0]: c\n    [3, 1]: d\n    [4, 0]: default_value\n\nThe output `SparseTensor` will be in row-major order and will have the\nsame shape as the input.\n\nThis op also returns an indicator vector shaped `[dense_shape[0]]` such that\n\n    empty_row_indicator[i] = True iff row i was an empty row.\n\nAnd a reverse index map vector shaped `[indices.shape[0]]` that is used during\nbackpropagation,\n\n    reverse_index_map[j] = out_j s.t. indices[j, :] == output_indices[out_j, :]"
 }
 op {
   name: "SparseFillEmptyRowsGrad"
   input_arg {
     name: "reverse_index_map"
-    description: "1-D.  The reverse index map from SparseFillEmptyRows."
     type: DT_INT64
   }
   input_arg {
     name: "grad_values"
-    description: "1-D.  The gradients from backprop."
     type_attr: "T"
   }
   output_arg {
     name: "d_values"
-    description: "1-D.  The backprop into values."
     type_attr: "T"
   }
   output_arg {
     name: "d_default_value"
-    description: "0-D.  The backprop into default_value."
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "The gradient of SparseFillEmptyRows."
-  description: "Takes vectors reverse_index_map, shaped `[N]`, and grad_values,\nshaped `[N_full]`, where `N_full >= N` and copies data into either\n`d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and\n`d_default_value` is a scalar.\n\n  d_values[j] = grad_values[reverse_index_map[j]]\n  d_default_value = sum_{k : 0 .. N_full - 1} (\n     grad_values[k] * 1{k not in reverse_index_map})"
 }
 op {
   name: "SparseMatMul"
@@ -28437,34 +26125,27 @@ op {
       }
     }
   }
-  summary: "Multiply matrix \"a\" by matrix \"b\"."
-  description: "The inputs must be two-dimensional matrices and the inner dimension of \"a\" must\nmatch the outer dimension of \"b\". This op is optimized for the case where at\nleast one of \"a\" or \"b\" is sparse. The breakeven for using this versus a dense\nmatrix multiply on one platform was 30% zero values in the sparse matrix.\n\nThe gradient computation of this operation will only take advantage of sparsity\nin the input gradient when that gradient comes from a Relu."
 }
 op {
   name: "SparseReduceMax"
   input_arg {
     name: "input_indices"
-    description: "2-D.  `N x R` matrix with the indices of non-empty values in a\nSparseTensor, possibly not in canonical ordering."
     type: DT_INT64
   }
   input_arg {
     name: "input_values"
-    description: "1-D.  `N` non-empty values corresponding to `input_indices`."
     type_attr: "T"
   }
   input_arg {
     name: "input_shape"
-    description: "1-D.  Shape of the input SparseTensor."
     type: DT_INT64
   }
   input_arg {
     name: "reduction_axes"
-    description: "1-D.  Length-`K` vector containing the reduction axes."
     type: DT_INT32
   }
   output_arg {
     name: "output"
-    description: "`R-K`-D.  The reduced Tensor."
     type_attr: "T"
   }
   attr {
@@ -28473,7 +26154,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true, retain reduced dimensions with length 1."
   }
   attr {
     name: "T"
@@ -28483,10 +26163,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -28494,29 +26175,23 @@ op {
       }
     }
   }
-  summary: "Computes the max of elements across dimensions of a SparseTensor."
-  description: "This Op takes a SparseTensor and is the sparse counterpart to\n`tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`\ninstead of a sparse one.\n\nReduces `sp_input` along the dimensions given in `reduction_axes`.  Unless\n`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in\n`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained\nwith length 1.\n\nIf `reduction_axes` has no entries, all dimensions are reduced, and a tensor\nwith a single element is returned.  Additionally, the axes can be negative,\nwhich are interpreted according to the indexing rules in Python."
 }
 op {
   name: "SparseReduceMaxSparse"
   input_arg {
     name: "input_indices"
-    description: "2-D.  `N x R` matrix with the indices of non-empty values in a\nSparseTensor, possibly not in canonical ordering."
     type: DT_INT64
   }
   input_arg {
     name: "input_values"
-    description: "1-D.  `N` non-empty values corresponding to `input_indices`."
     type_attr: "T"
   }
   input_arg {
     name: "input_shape"
-    description: "1-D.  Shape of the input SparseTensor."
     type: DT_INT64
   }
   input_arg {
     name: "reduction_axes"
-    description: "1-D.  Length-`K` vector containing the reduction axes."
     type: DT_INT32
   }
   output_arg {
@@ -28537,7 +26212,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true, retain reduced dimensions with length 1."
   }
   attr {
     name: "T"
@@ -28547,10 +26221,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -28558,34 +26233,27 @@ op {
       }
     }
   }
-  summary: "Computes the max of elements across dimensions of a SparseTensor."
-  description: "This Op takes a SparseTensor and is the sparse counterpart to\n`tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a\nSparseTensor.\n\nReduces `sp_input` along the dimensions given in `reduction_axes`.  Unless\n`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in\n`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained\nwith length 1.\n\nIf `reduction_axes` has no entries, all dimensions are reduced, and a tensor\nwith a single element is returned.  Additionally, the axes can be negative,\nwhich are interpreted according to the indexing rules in Python."
 }
 op {
   name: "SparseReduceSum"
   input_arg {
     name: "input_indices"
-    description: "2-D.  `N x R` matrix with the indices of non-empty values in a\nSparseTensor, possibly not in canonical ordering."
     type: DT_INT64
   }
   input_arg {
     name: "input_values"
-    description: "1-D.  `N` non-empty values corresponding to `input_indices`."
     type_attr: "T"
   }
   input_arg {
     name: "input_shape"
-    description: "1-D.  Shape of the input SparseTensor."
     type: DT_INT64
   }
   input_arg {
     name: "reduction_axes"
-    description: "1-D.  Length-`K` vector containing the reduction axes."
     type: DT_INT32
   }
   output_arg {
     name: "output"
-    description: "`R-K`-D.  The reduced Tensor."
     type_attr: "T"
   }
   attr {
@@ -28594,7 +26262,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true, retain reduced dimensions with length 1."
   }
   attr {
     name: "T"
@@ -28603,46 +26270,41 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  summary: "Computes the sum of elements across dimensions of a SparseTensor."
-  description: "This Op takes a SparseTensor and is the sparse counterpart to\n`tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`\ninstead of a sparse one.\n\nReduces `sp_input` along the dimensions given in `reduction_axes`.  Unless\n`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in\n`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained\nwith length 1.\n\nIf `reduction_axes` has no entries, all dimensions are reduced, and a tensor\nwith a single element is returned.  Additionally, the axes can be negative,\nwhich are interpreted according to the indexing rules in Python."
 }
 op {
   name: "SparseReduceSumSparse"
   input_arg {
     name: "input_indices"
-    description: "2-D.  `N x R` matrix with the indices of non-empty values in a\nSparseTensor, possibly not in canonical ordering."
     type: DT_INT64
   }
   input_arg {
     name: "input_values"
-    description: "1-D.  `N` non-empty values corresponding to `input_indices`."
     type_attr: "T"
   }
   input_arg {
     name: "input_shape"
-    description: "1-D.  Shape of the input SparseTensor."
     type: DT_INT64
   }
   input_arg {
     name: "reduction_axes"
-    description: "1-D.  Length-`K` vector containing the reduction axes."
     type: DT_INT32
   }
   output_arg {
@@ -28663,7 +26325,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true, retain reduced dimensions with length 1."
   }
   attr {
     name: "T"
@@ -28672,89 +26333,74 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  summary: "Computes the sum of elements across dimensions of a SparseTensor."
-  description: "This Op takes a SparseTensor and is the sparse counterpart to\n`tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a\nSparseTensor.\n\nReduces `sp_input` along the dimensions given in `reduction_axes`.  Unless\n`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in\n`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained\nwith length 1.\n\nIf `reduction_axes` has no entries, all dimensions are reduced, and a tensor\nwith a single element is returned.  Additionally, the axes can be negative,\nwhich are interpreted according to the indexing rules in Python."
 }
 op {
   name: "SparseReorder"
   input_arg {
     name: "input_indices"
-    description: "2-D.  `N x R` matrix with the indices of non-empty values in a\nSparseTensor, possibly not in canonical ordering."
     type: DT_INT64
   }
   input_arg {
     name: "input_values"
-    description: "1-D.  `N` non-empty values corresponding to `input_indices`."
     type_attr: "T"
   }
   input_arg {
     name: "input_shape"
-    description: "1-D.  Shape of the input SparseTensor."
     type: DT_INT64
   }
   output_arg {
     name: "output_indices"
-    description: "2-D.  `N x R` matrix with the same indices as input_indices, but\nin canonical row-major ordering."
     type: DT_INT64
   }
   output_arg {
     name: "output_values"
-    description: "1-D.  `N` non-empty values corresponding to `output_indices`."
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Reorders a SparseTensor into the canonical, row-major ordering."
-  description: "Note that by convention, all sparse ops preserve the canonical ordering along\nincreasing dimension number. The only time ordering can be violated is during\nmanual manipulation of the indices and values vectors to add entries.\n\nReordering does not affect the shape of the SparseTensor.\n\nIf the tensor has rank `R` and `N` non-empty values, `input_indices` has\nshape `[N, R]`, input_values has length `N`, and input_shape has length `R`."
 }
 op {
   name: "SparseReshape"
   input_arg {
     name: "input_indices"
-    description: "2-D.  `N x R_in` matrix with the indices of non-empty values in a\nSparseTensor."
     type: DT_INT64
   }
   input_arg {
     name: "input_shape"
-    description: "1-D.  `R_in` vector with the input SparseTensor\'s dense shape."
     type: DT_INT64
   }
   input_arg {
     name: "new_shape"
-    description: "1-D.  `R_out` vector with the requested new dense shape."
     type: DT_INT64
   }
   output_arg {
     name: "output_indices"
-    description: "2-D.  `N x R_out` matrix with the updated indices of non-empty\nvalues in the output SparseTensor."
     type: DT_INT64
   }
   output_arg {
     name: "output_shape"
-    description: "1-D.  `R_out` vector with the full dense shape of the output\nSparseTensor.  This is the same as `new_shape` but with any -1 dimensions\nfilled in."
     type: DT_INT64
   }
-  summary: "Reshapes a SparseTensor to represent values in a new dense shape."
-  description: "This operation has the same semantics as reshape on the represented dense\ntensor.  The `input_indices` are recomputed based on the requested `new_shape`.\n\nIf one component of `new_shape` is the special value -1, the size of that\ndimension is computed so that the total dense size remains constant.  At\nmost one component of `new_shape` can be -1.  The number of dense elements\nimplied by `new_shape` must be the same as the number of dense elements\noriginally implied by `input_shape`.\n\nReshaping does not affect the order of values in the SparseTensor.\n\nIf the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`\nhas length `R_out`, then `input_indices` has shape `[N, R_in]`,\n`input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and\n`output_shape` has length `R_out`."
 }
 op {
   name: "SparseSegmentMean"
@@ -28764,17 +26410,14 @@ op {
   }
   input_arg {
     name: "indices"
-    description: "A 1-D tensor. Has same rank as `segment_ids`."
     type_attr: "Tidx"
   }
   input_arg {
     name: "segment_ids"
-    description: "A 1-D tensor. Values should be sorted and can be repeated."
     type: DT_INT32
   }
   output_arg {
     name: "output"
-    description: "Has same shape as data, except for dimension 0 which\nhas size `k`, the number of segments."
     type_attr: "T"
   }
   attr {
@@ -28800,29 +26443,23 @@ op {
       }
     }
   }
-  summary: "Computes the mean along sparse segments of a tensor."
-  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nLike `SegmentMean`, but `segment_ids` can have rank less than `data`\'s first\ndimension, selecting a subset of dimension 0, specified by `indices`."
 }
 op {
   name: "SparseSegmentMeanGrad"
   input_arg {
     name: "grad"
-    description: "gradient propagated to the SparseSegmentMean op."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "indices passed to the corresponding SparseSegmentMean op."
     type_attr: "Tidx"
   }
   input_arg {
     name: "segment_ids"
-    description: "segment_ids passed to the corresponding SparseSegmentMean op."
     type: DT_INT32
   }
   input_arg {
     name: "output_dim0"
-    description: "dimension 0 of \"data\" passed to SparseSegmentMean op."
     type: DT_INT32
   }
   output_arg {
@@ -28852,28 +26489,27 @@ op {
       }
     }
   }
-  summary: "Computes gradients for SparseSegmentMean."
-  description: "Returns tensor \"output\" with same shape as grad, except for dimension 0 whose\nvalue is output_dim0."
 }
 op {
-  name: "SparseSegmentSqrtN"
+  name: "SparseSegmentMeanWithNumSegments"
   input_arg {
     name: "data"
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "A 1-D tensor. Has same rank as `segment_ids`."
     type_attr: "Tidx"
   }
   input_arg {
     name: "segment_ids"
-    description: "A 1-D tensor. Values should be sorted and can be repeated."
     type: DT_INT32
   }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
   output_arg {
     name: "output"
-    description: "Has same shape as data, except for dimension 0 which\nhas size `k`, the number of segments."
     type_attr: "T"
   }
   attr {
@@ -28899,29 +26535,78 @@ op {
       }
     }
   }
-  summary: "Computes the sum along sparse segments of a tensor divided by the sqrt of N."
-  description: "N is the size of the segment being reduced.\n\nRead @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments."
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
-  name: "SparseSegmentSqrtNGrad"
+  name: "SparseSegmentSqrtN"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSegmentSqrtNGrad"
   input_arg {
     name: "grad"
-    description: "gradient propagated to the SparseSegmentSqrtN op."
     type_attr: "T"
   }
   input_arg {
     name: "indices"
-    description: "indices passed to the corresponding SparseSegmentSqrtN op."
     type_attr: "Tidx"
   }
   input_arg {
     name: "segment_ids"
-    description: "segment_ids passed to the corresponding SparseSegmentSqrtN op."
     type: DT_INT32
   }
   input_arg {
     name: "output_dim0"
-    description: "dimension 0 of \"data\" passed to SparseSegmentSqrtN op."
     type: DT_INT32
   }
   output_arg {
@@ -28951,8 +26636,65 @@ op {
       }
     }
   }
-  summary: "Computes gradients for SparseSegmentSqrtN."
-  description: "Returns tensor \"output\" with same shape as grad, except for dimension 0 whose\nvalue is output_dim0."
+}
+op {
+  name: "SparseSegmentSqrtNWithNumSegments"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
   name: "SparseSegmentSum"
@@ -28962,17 +26704,14 @@ op {
   }
   input_arg {
     name: "indices"
-    description: "A 1-D tensor. Has same rank as `segment_ids`."
     type_attr: "Tidx"
   }
   input_arg {
     name: "segment_ids"
-    description: "A 1-D tensor. Values should be sorted and can be repeated."
     type: DT_INT32
   }
   output_arg {
     name: "output"
-    description: "Has same shape as data, except for dimension 0 which\nhas size `k`, the number of segments."
     type_attr: "T"
   }
   attr {
@@ -28982,11 +26721,68 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
         type: DT_INT32
         type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SparseSegmentSumWithNumSegments"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "segment_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "num_segments"
+    type_attr: "Tnumsegments"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -29007,34 +26803,40 @@ op {
       }
     }
   }
-  summary: "Computes the sum along sparse segments of a tensor."
-  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nLike `SegmentSum`, but `segment_ids` can have rank less than `data`\'s first\ndimension, selecting a subset of dimension 0, specified by `indices`.\n\nFor example:\n\n```python\nc = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])\n\n# Select two rows, one segment.\ntf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))\n# => [[0 0 0 0]]\n\n# Select two rows, two segment.\ntf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))\n# => [[ 1  2  3  4]\n#     [-1 -2 -3 -4]]\n\n# Select all rows, two segments.\ntf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))\n# => [[0 0 0 0]\n#     [5 6 7 8]]\n\n# Which is equivalent to:\ntf.segment_sum(c, tf.constant([0, 0, 1]))\n```"
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
   name: "SparseSlice"
   input_arg {
     name: "indices"
-    description: "2-D tensor represents the indices of the sparse tensor."
     type: DT_INT64
   }
   input_arg {
     name: "values"
-    description: "1-D tensor represents the values of the sparse tensor."
     type_attr: "T"
   }
   input_arg {
     name: "shape"
-    description: "1-D. tensor represents the shape of the sparse tensor."
     type: DT_INT64
   }
   input_arg {
     name: "start"
-    description: "1-D. tensor represents the start of the slice."
     type: DT_INT64
   }
   input_arg {
     name: "size"
-    description: "1-D. tensor represents the size of the slice.\noutput indices: A list of 1-D tensors represents the indices of the output\nsparse tensors."
     type: DT_INT64
   }
   output_arg {
@@ -29043,41 +26845,33 @@ op {
   }
   output_arg {
     name: "output_values"
-    description: "A list of 1-D tensors represents the values of the output sparse\ntensors."
     type_attr: "T"
   }
   output_arg {
     name: "output_shape"
-    description: "A list of 1-D tensors represents the shape of the output sparse\ntensors."
     type: DT_INT64
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Slice a `SparseTensor` based on the `start` and `size`."
-  description: "For example, if the input is\n\n    input_tensor = shape = [2, 7]\n    [    a   d e  ]\n    [b c          ]\n\nGraphically the output tensors are:\n\n    sparse_slice([0, 0], [2, 4]) = shape = [2, 4]\n    [    a  ]\n    [b c    ]\n\n    sparse_slice([0, 4], [2, 3]) = shape = [2, 3]\n    [ d e  ]\n    [      ]"
 }
 op {
   name: "SparseSoftmax"
   input_arg {
     name: "sp_indices"
-    description: "2-D.  `NNZ x R` matrix with the indices of non-empty values in a\nSparseTensor, in canonical ordering."
     type: DT_INT64
   }
   input_arg {
     name: "sp_values"
-    description: "1-D.  `NNZ` non-empty values corresponding to `sp_indices`."
     type_attr: "T"
   }
   input_arg {
     name: "sp_shape"
-    description: "1-D.  Shape of the input SparseTensor."
     type: DT_INT64
   }
   output_arg {
     name: "output"
-    description: "1-D.  The `NNZ` values for the result `SparseTensor`."
     type_attr: "T"
   }
   attr {
@@ -29090,29 +26884,23 @@ op {
       }
     }
   }
-  summary: "Applies softmax to a batched N-D `SparseTensor`."
-  description: "The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`\n(where `N >= 2`), and with indices sorted in the canonical lexicographic order.\n\nThis op is equivalent to applying the normal `tf.nn.softmax()` to each innermost\nlogical submatrix with shape `[B, C]`, but with the catch that *the implicitly\nzero elements do not participate*.  Specifically, the algorithm is equivalent\nto the following:\n\n  (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix\n      with shape `[B, C]`, along the size-C dimension;\n  (2) Masks out the original implicitly-zero locations;\n  (3) Renormalizes the remaining elements.\n\nHence, the `SparseTensor` result has exactly the same non-zero indices and\nshape."
 }
 op {
   name: "SparseSoftmaxCrossEntropyWithLogits"
   input_arg {
     name: "features"
-    description: "batch_size x num_classes matrix"
     type_attr: "T"
   }
   input_arg {
     name: "labels"
-    description: "batch_size vector with values in [0, num_classes).\nThis is the label for the given minibatch entry."
     type_attr: "Tlabels"
   }
   output_arg {
     name: "loss"
-    description: "Per example loss (batch_size vector)."
     type_attr: "T"
   }
   output_arg {
     name: "backprop"
-    description: "backpropagated gradients (batch_size x num_classes matrix)."
     type_attr: "T"
   }
   attr {
@@ -29121,6 +26909,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -29139,49 +26928,39 @@ op {
       }
     }
   }
-  summary: "Computes softmax cross entropy cost and gradients to backpropagate."
-  description: "Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept\na matrix of label probabilities, but rather a single label per row\nof features.  This label is considered to have probability 1.0 for the\ngiven row.\n\nInputs are the logits, not probabilities."
 }
 op {
   name: "SparseSparseMaximum"
   input_arg {
     name: "a_indices"
-    description: "2-D.  `N x R` matrix with the indices of non-empty values in a\nSparseTensor, in the canonical lexicographic ordering."
     type: DT_INT64
   }
   input_arg {
     name: "a_values"
-    description: "1-D.  `N` non-empty values corresponding to `a_indices`."
     type_attr: "T"
   }
   input_arg {
     name: "a_shape"
-    description: "1-D.  Shape of the input SparseTensor."
     type: DT_INT64
   }
   input_arg {
     name: "b_indices"
-    description: "counterpart to `a_indices` for the other operand."
     type: DT_INT64
   }
   input_arg {
     name: "b_values"
-    description: "counterpart to `a_values` for the other operand; must be of the same dtype."
     type_attr: "T"
   }
   input_arg {
     name: "b_shape"
-    description: "counterpart to `a_shape` for the other operand; the two shapes must be equal."
     type: DT_INT64
   }
   output_arg {
     name: "output_indices"
-    description: "2-D.  The indices of the output SparseTensor."
     type: DT_INT64
   }
   output_arg {
     name: "output_values"
-    description: "1-D.  The values of the output SparseTensor."
     type_attr: "T"
   }
   attr {
@@ -29192,10 +26971,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -29203,49 +26983,39 @@ op {
       }
     }
   }
-  summary: "Returns the element-wise max of two SparseTensors."
-  description: "Assumes the two SparseTensors have the same shape, i.e., no broadcasting."
 }
 op {
   name: "SparseSparseMinimum"
   input_arg {
     name: "a_indices"
-    description: "2-D.  `N x R` matrix with the indices of non-empty values in a\nSparseTensor, in the canonical lexicographic ordering."
     type: DT_INT64
   }
   input_arg {
     name: "a_values"
-    description: "1-D.  `N` non-empty values corresponding to `a_indices`."
     type_attr: "T"
   }
   input_arg {
     name: "a_shape"
-    description: "1-D.  Shape of the input SparseTensor."
     type: DT_INT64
   }
   input_arg {
     name: "b_indices"
-    description: "counterpart to `a_indices` for the other operand."
     type: DT_INT64
   }
   input_arg {
     name: "b_values"
-    description: "counterpart to `a_values` for the other operand; must be of the same dtype."
     type_attr: "T"
   }
   input_arg {
     name: "b_shape"
-    description: "counterpart to `a_shape` for the other operand; the two shapes must be equal."
     type: DT_INT64
   }
   output_arg {
     name: "output_indices"
-    description: "2-D.  The indices of the output SparseTensor."
     type: DT_INT64
   }
   output_arg {
     name: "output_values"
-    description: "1-D.  The values of the output SparseTensor."
     type_attr: "T"
   }
   attr {
@@ -29255,46 +27025,41 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  summary: "Returns the element-wise min of two SparseTensors."
-  description: "Assumes the two SparseTensors have the same shape, i.e., no broadcasting."
 }
 op {
   name: "SparseSplit"
   input_arg {
     name: "split_dim"
-    description: "0-D.  The dimension along which to split.  Must be in the range\n`[0, rank(shape))`."
     type: DT_INT64
   }
   input_arg {
     name: "indices"
-    description: "2-D tensor represents the indices of the sparse tensor."
     type: DT_INT64
   }
   input_arg {
     name: "values"
-    description: "1-D tensor represents the values of the sparse tensor."
     type_attr: "T"
   }
   input_arg {
     name: "shape"
-    description: "1-D. tensor represents the shape of the sparse tensor.\noutput indices: A list of 1-D tensors represents the indices of the output\nsparse tensors."
     type: DT_INT64
   }
   output_arg {
@@ -29304,20 +27069,17 @@ op {
   }
   output_arg {
     name: "output_values"
-    description: "A list of 1-D tensors represents the values of the output sparse\ntensors."
     type_attr: "T"
     number_attr: "num_split"
   }
   output_arg {
     name: "output_shape"
-    description: "A list of 1-D tensors represents the shape of the output sparse\ntensors."
     type: DT_INT64
     number_attr: "num_split"
   }
   attr {
     name: "num_split"
     type: "int"
-    description: "The number of ways to split."
     has_minimum: true
     minimum: 1
   }
@@ -29325,29 +27087,23 @@ op {
     name: "T"
     type: "type"
   }
-  summary: "Split a `SparseTensor` into `num_split` tensors along one dimension."
-  description: "If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices\n`[0 : shape[split_dim] % num_split]` gets one extra dimension.\nFor example, if `split_dim = 1` and `num_split = 2` and the input is\n\n    input_tensor = shape = [2, 7]\n    [    a   d e  ]\n    [b c          ]\n\nGraphically the output tensors are:\n\n    output_tensor[0] = shape = [2, 4]\n    [    a  ]\n    [b c    ]\n\n    output_tensor[1] = shape = [2, 3]\n    [ d e  ]\n    [      ]"
 }
 op {
   name: "SparseTensorDenseAdd"
   input_arg {
     name: "a_indices"
-    description: "2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`."
     type_attr: "Tindices"
   }
   input_arg {
     name: "a_values"
-    description: "1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`."
     type_attr: "T"
   }
   input_arg {
     name: "a_shape"
-    description: "1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`."
     type_attr: "Tindices"
   }
   input_arg {
     name: "b"
-    description: "`ndims`-D Tensor.  With shape `a_shape`."
     type_attr: "T"
   }
   output_arg {
@@ -29361,17 +27117,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -29388,29 +27145,23 @@ op {
       }
     }
   }
-  summary: "Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`."
-  description: "This Op does not require `a_indices` be sorted in standard lexicographic order."
 }
 op {
   name: "SparseTensorDenseMatMul"
   input_arg {
     name: "a_indices"
-    description: "2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix."
     type_attr: "Tindices"
   }
   input_arg {
     name: "a_values"
-    description: "1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector."
     type_attr: "T"
   }
   input_arg {
     name: "a_shape"
-    description: "1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector."
     type: DT_INT64
   }
   input_arg {
     name: "b"
-    description: "2-D.  A dense Matrix."
     type_attr: "T"
   }
   output_arg {
@@ -29440,7 +27191,6 @@ op {
     default_value {
       b: false
     }
-    description: "Use the adjoint of A in the matrix multiply.  If A is complex, this\nis transpose(conj(A)).  Otherwise it\'s transpose(A)."
   }
   attr {
     name: "adjoint_b"
@@ -29448,10 +27198,7 @@ op {
     default_value {
       b: false
     }
-    description: "Use the adjoint of B in the matrix multiply.  If B is complex, this\nis transpose(conj(B)).  Otherwise it\'s transpose(B)."
   }
-  summary: "Multiply SparseTensor (of rank 2) \"A\" by dense matrix \"B\"."
-  description: "No validity checking is performed on the indices of A.  However, the following\ninput format is recommended for optimal behavior:\n\nif adjoint_a == false:\n  A should be sorted in lexicographically increasing order.  Use SparseReorder\n  if you\'re not sure.\nif adjoint_a == true:\n  A should be sorted in order of increasing dimension 1 (i.e., \"column major\"\n  order instead of \"row major\" order)."
 }
 op {
   name: "SparseTensorSliceDataset"
@@ -29475,34 +27222,28 @@ op {
     name: "Tvalues"
     type: "type"
   }
-  summary: "Creates a dataset that splits a SparseTensor into elements row-wise."
   is_stateful: true
 }
 op {
   name: "SparseToDense"
   input_arg {
     name: "sparse_indices"
-    description: "0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete\nindex where `sparse_values[i]` will be placed."
     type_attr: "Tindices"
   }
   input_arg {
     name: "output_shape"
-    description: "1-D.  Shape of the dense output tensor."
     type_attr: "Tindices"
   }
   input_arg {
     name: "sparse_values"
-    description: "1-D.  Values corresponding to each row of `sparse_indices`,\nor a scalar value to be used for all sparse indices."
     type_attr: "T"
   }
   input_arg {
     name: "default_value"
-    description: "Scalar value to set for indices not specified in\n`sparse_indices`."
     type_attr: "T"
   }
   output_arg {
     name: "dense"
-    description: "Dense output tensor of shape `output_shape`."
     type_attr: "T"
   }
   attr {
@@ -29511,7 +27252,6 @@ op {
     default_value {
       b: true
     }
-    description: "If true, indices are checked to make sure they are sorted in\nlexicographic order and that there are no repeats."
   }
   attr {
     name: "T"
@@ -29527,54 +27267,43 @@ op {
       }
     }
   }
-  summary: "Converts a sparse representation into a dense tensor."
-  description: "Builds an array `dense` with shape `output_shape` such that\n\n```\n# If sparse_indices is scalar\ndense[i] = (i == sparse_indices ? sparse_values : default_value)\n\n# If sparse_indices is a vector, then for each i\ndense[sparse_indices[i]] = sparse_values[i]\n\n# If sparse_indices is an n by d matrix, then for each i in [0, n)\ndense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]\n```\n\nAll other values in `dense` are set to `default_value`.  If `sparse_values` is a\nscalar, all sparse indices are set to this single value.\n\nIndices should be sorted in lexicographic order, and indices must not\ncontain any repeats. If `validate_indices` is true, these properties\nare checked during execution."
 }
 op {
   name: "SparseToSparseSetOperation"
   input_arg {
     name: "set1_indices"
-    description: "2D `Tensor`, indices of a `SparseTensor`. Must be in row-major\norder."
     type: DT_INT64
   }
   input_arg {
     name: "set1_values"
-    description: "1D `Tensor`, values of a `SparseTensor`. Must be in row-major\norder."
     type_attr: "T"
   }
   input_arg {
     name: "set1_shape"
-    description: "1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must\nbe the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the\nmax set size across `0...n-1` dimensions."
     type: DT_INT64
   }
   input_arg {
     name: "set2_indices"
-    description: "2D `Tensor`, indices of a `SparseTensor`. Must be in row-major\norder."
     type: DT_INT64
   }
   input_arg {
     name: "set2_values"
-    description: "1D `Tensor`, values of a `SparseTensor`. Must be in row-major\norder."
     type_attr: "T"
   }
   input_arg {
     name: "set2_shape"
-    description: "1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must\nbe the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the\nmax set size across `0...n-1` dimensions."
     type: DT_INT64
   }
   output_arg {
     name: "result_indices"
-    description: "2D indices of a `SparseTensor`."
     type: DT_INT64
   }
   output_arg {
     name: "result_values"
-    description: "1D values of a `SparseTensor`."
     type_attr: "T"
   }
   output_arg {
     name: "result_shape"
-    description: "1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is\nthe same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`\nis the max result set size across all `0...n-1` dimensions."
     type: DT_INT64
   }
   attr {
@@ -29603,31 +27332,25 @@ op {
       }
     }
   }
-  summary: "Applies set operation along last dimension of 2 `SparseTensor` inputs."
-  description: "See SetOperationOp::SetOperationFromContext for values of `set_operation`.\n\nIf `validate_indices` is `True`, `SparseToSparseSetOperation` validates the\norder and range of `set1` and `set2` indices.\n\nInput `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,\nand `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same\nas `set2`. Dimension `n` contains values in a set, duplicates are allowed but\nignored.\n\nInput `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,\nand `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same\nas `set1`. Dimension `n` contains values in a set, duplicates are allowed but\nignored.\n\nIf `validate_indices` is `True`, this op validates the order and range of `set1`\nand `set2` indices.\n\nOutput `result` is a `SparseTensor` represented by `result_indices`,\n`result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this\nhas rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`\ndimension contains the result of `set_operation` applied to the corresponding\n`[0...n-1]` dimension of `set`."
 }
 op {
   name: "Split"
   input_arg {
     name: "split_dim"
-    description: "0-D.  The dimension along which to split.  Must be in the range\n`[-rank(value), rank(value))`."
     type: DT_INT32
   }
   input_arg {
     name: "value"
-    description: "The tensor to split."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "They are identically shaped tensors, whose shape matches that of `value`\nexcept along `split_dim`, where their sizes are\n`values.shape[split_dim] / num_split`."
     type_attr: "T"
     number_attr: "num_split"
   }
   attr {
     name: "num_split"
     type: "int"
-    description: "The number of ways to split.  Must evenly divide\n`value.shape[split_dim]`."
     has_minimum: true
     minimum: 1
   }
@@ -29635,28 +27358,23 @@ op {
     name: "T"
     type: "type"
   }
-  summary: "Splits a tensor into `num_split` tensors along one dimension."
 }
 op {
   name: "SplitV"
   input_arg {
     name: "value"
-    description: "The tensor to split."
     type_attr: "T"
   }
   input_arg {
     name: "size_splits"
-    description: "list containing the sizes of each output tensor along the split\ndimension. Must sum to the dimension of value along split_dim.\nCan contain one -1 indicating that dimension is to be inferred."
     type_attr: "Tlen"
   }
   input_arg {
     name: "split_dim"
-    description: "0-D.  The dimension along which to split.  Must be in the range\n`[-rank(value), rank(value))`."
     type: DT_INT32
   }
   output_arg {
     name: "output"
-    description: "Tensors whose shape matches that of `value`\nexcept along `split_dim`, where their sizes are\n`size_splits[i]`."
     type_attr: "T"
     number_attr: "num_split"
   }
@@ -29683,23 +27401,19 @@ op {
       }
     }
   }
-  summary: "Splits a tensor into `num_split` tensors along one dimension."
 }
 op {
   name: "SqlDataset"
   input_arg {
     name: "driver_name"
-    description: "The database type. Currently, the only supported type is \'sqlite\'."
     type: DT_STRING
   }
   input_arg {
     name: "data_source_name"
-    description: "A connection string to connect to the database."
     type: DT_STRING
   }
   input_arg {
     name: "query"
-    description: "A SQL query to execute."
     type: DT_STRING
   }
   output_arg {
@@ -29718,7 +27432,6 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset that executes a SQL query and emits rows of the result set."
   is_stateful: true
 }
 op {
@@ -29737,6 +27450,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -29744,8 +27458,6 @@ op {
       }
     }
   }
-  summary: "Computes square root of x element-wise."
-  description: "I.e., \\\\(y = \\sqrt{x} = x^{1/2}\\\\)."
 }
 op {
   name: "SqrtGrad"
@@ -29767,6 +27479,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -29774,8 +27487,6 @@ op {
       }
     }
   }
-  summary: "Computes the gradient for the sqrt of `x` wrt its input."
-  description: "Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`\nis the corresponding input gradient."
 }
 op {
   name: "Square"
@@ -29793,6 +27504,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -29802,8 +27514,6 @@ op {
       }
     }
   }
-  summary: "Computes square of x element-wise."
-  description: "I.e., \\\\(y = x * x = x^2\\\\)."
 }
 op {
   name: "SquaredDifference"
@@ -29825,6 +27535,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -29834,20 +27545,16 @@ op {
       }
     }
   }
-  summary: "Returns (x - y)(x - y) element-wise."
-  description: "*NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
   is_commutative: true
 }
 op {
   name: "Squeeze"
   input_arg {
     name: "input"
-    description: "The `input` to squeeze."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "Contains the same data as `input`, but has one or more dimensions of\nsize 1 removed."
     type_attr: "T"
   }
   attr {
@@ -29861,11 +27568,8 @@ op {
       list {
       }
     }
-    description: "If specified, only squeezes the dimensions listed. The dimension\nindex starts at 0. It is an error to squeeze a dimension that is not 1. Must\nbe in the range `[-rank(input), rank(input))`."
     has_minimum: true
   }
-  summary: "Removes dimensions of size 1 from the shape of a tensor."
-  description: "Given a tensor `input`, this operation returns a tensor of the same type with\nall dimensions of size 1 removed. If you don\'t want to remove all size 1\ndimensions, you can remove specific size 1 dimensions by specifying\n`squeeze_dims`.\n\nFor example:\n\n```\n# \'t\' is a tensor of shape [1, 2, 1, 3, 1, 1]\nshape(squeeze(t)) ==> [2, 3]\n```\n\nOr, to remove specific size 1 dimensions:\n\n```\n# \'t\' is a tensor of shape [1, 2, 1, 3, 1, 1]\nshape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]\n```"
 }
 op {
   name: "Stack"
@@ -29885,7 +27589,6 @@ op {
       s: ""
     }
   }
-  summary: "Deprecated, use StackV2."
   is_stateful: true
 }
 op {
@@ -29895,16 +27598,13 @@ op {
     type: DT_STRING
     is_ref: true
   }
-  summary: "Deprecated, use StackCloseV2."
 }
 op {
   name: "StackCloseV2"
   input_arg {
     name: "handle"
-    description: "The handle to a stack."
     type: DT_RESOURCE
   }
-  summary: "Delete the stack from its resource container."
   is_stateful: true
 }
 op {
@@ -29922,26 +27622,21 @@ op {
     name: "elem_type"
     type: "type"
   }
-  summary: "Deprecated, use StackPopV2."
 }
 op {
   name: "StackPopV2"
   input_arg {
     name: "handle"
-    description: "The handle to a stack."
     type: DT_RESOURCE
   }
   output_arg {
     name: "elem"
-    description: "The tensor that is popped from the top of the stack."
     type_attr: "elem_type"
   }
   attr {
     name: "elem_type"
     type: "type"
-    description: "The type of the elem that is popped."
   }
-  summary: "Pop the element at the top of the stack."
   is_stateful: true
 }
 op {
@@ -29970,23 +27665,19 @@ op {
       b: false
     }
   }
-  summary: "Deprecated, use StackPushV2."
 }
 op {
   name: "StackPushV2"
   input_arg {
     name: "handle"
-    description: "The handle to a stack."
     type: DT_RESOURCE
   }
   input_arg {
     name: "elem"
-    description: "The tensor to be pushed onto the stack."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "The same tensor as the input \'elem\'."
     type_attr: "T"
   }
   attr {
@@ -29999,27 +27690,22 @@ op {
     default_value {
       b: false
     }
-    description: "Swap `elem` to CPU. Default to false."
   }
-  summary: "Push an element onto the stack."
   is_stateful: true
 }
 op {
   name: "StackV2"
   input_arg {
     name: "max_size"
-    description: "The maximum size of the stack if non-negative. If negative, the stack\nsize is unlimited."
     type: DT_INT32
   }
   output_arg {
     name: "handle"
-    description: "The handle to the stack."
     type: DT_RESOURCE
   }
   attr {
     name: "elem_type"
     type: "type"
-    description: "The type of the elements on the stack."
   }
   attr {
     name: "stack_name"
@@ -30027,16 +27713,13 @@ op {
     default_value {
       s: ""
     }
-    description: "Overrides the name used for the temporary stack resource. Default\nvalue is the name of the \'Stack\' op (which is guaranteed unique)."
   }
-  summary: "A stack that produces elements in first-in last-out order."
   is_stateful: true
 }
 op {
   name: "Stage"
   input_arg {
     name: "values"
-    description: "a list of tensors\ndtypes A list of data types that inserted values should adhere to."
     type_list_attr: "dtypes"
   }
   attr {
@@ -30045,7 +27728,6 @@ op {
     default_value {
       i: 0
     }
-    description: "Maximum number of elements in the Staging Area. If > 0, inserts\non the container will block when the capacity is reached."
     has_minimum: true
   }
   attr {
@@ -30054,7 +27736,6 @@ op {
     default_value {
       i: 0
     }
-    description: "The maximum number of bytes allowed for Tensors in the Staging Area.\nIf > 0, inserts will block until sufficient space is available."
     has_minimum: true
   }
   attr {
@@ -30069,7 +27750,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this queue is placed in the given container. Otherwise,\na default container is used."
   }
   attr {
     name: "shared_name"
@@ -30077,10 +27757,7 @@ op {
     default_value {
       s: ""
     }
-    description: "It is necessary to match this name to the matching Unstage Op."
   }
-  summary: "Stage values similar to a lightweight Enqueue."
-  description: "The basic functionality of this Op is similar to a queue with many\nfewer capabilities and options.  This Op is optimized for performance."
   is_stateful: true
 }
 op {
@@ -30119,7 +27796,6 @@ op {
       s: ""
     }
   }
-  summary: "Op removes all elements in the underlying container."
   is_stateful: true
 }
 op {
@@ -30168,8 +27844,6 @@ op {
       s: ""
     }
   }
-  summary: "Op peeks at the values at the specified index.  If the"
-  description: "underlying container does not contain sufficient elements\nthis op will block until it does.   This Op is optimized for\nperformance."
   is_stateful: true
 }
 op {
@@ -30212,24 +27886,20 @@ op {
       s: ""
     }
   }
-  summary: "Op returns the number of elements in the underlying container."
   is_stateful: true
 }
 op {
   name: "StatelessRandomNormal"
   input_arg {
     name: "shape"
-    description: "The shape of the output tensor."
     type_attr: "T"
   }
   input_arg {
     name: "seed"
-    description: "2 seeds (shape [2])."
-    type: DT_INT64
+    type_attr: "Tseed"
   }
   output_arg {
     name: "output"
-    description: "Random values with specified shape."
     type_attr: "dtype"
   }
   attr {
@@ -30238,7 +27908,6 @@ op {
     default_value {
       type: DT_FLOAT
     }
-    description: "The type of the output."
     allowed_values {
       list {
         type: DT_HALF
@@ -30260,24 +27929,32 @@ op {
       }
     }
   }
-  summary: "Outputs deterministic pseudorandom values from a normal distribution."
-  description: "The generated values will have mean 0 and standard deviation 1.\n\nThe outputs are a deterministic function of `shape` and `seed`."
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
   name: "StatelessRandomUniform"
   input_arg {
     name: "shape"
-    description: "The shape of the output tensor."
     type_attr: "T"
   }
   input_arg {
     name: "seed"
-    description: "2 seeds (shape [2])."
-    type: DT_INT64
+    type_attr: "Tseed"
   }
   output_arg {
     name: "output"
-    description: "Random values with specified shape."
     type_attr: "dtype"
   }
   attr {
@@ -30286,7 +27963,6 @@ op {
     default_value {
       type: DT_FLOAT
     }
-    description: "The type of the output."
     allowed_values {
       list {
         type: DT_HALF
@@ -30308,24 +27984,32 @@ op {
       }
     }
   }
-  summary: "Outputs deterministic pseudorandom random values from a uniform distribution."
-  description: "The generated values follow a uniform distribution in the range `[0, 1)`. The\nlower bound 0 is included in the range, while the upper bound 1 is excluded.\n\nThe outputs are a deterministic function of `shape` and `seed`."
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
   name: "StatelessTruncatedNormal"
   input_arg {
     name: "shape"
-    description: "The shape of the output tensor."
     type_attr: "T"
   }
   input_arg {
     name: "seed"
-    description: "2 seeds (shape [2])."
-    type: DT_INT64
+    type_attr: "Tseed"
   }
   output_arg {
     name: "output"
-    description: "Random values with specified shape."
     type_attr: "dtype"
   }
   attr {
@@ -30334,7 +28018,6 @@ op {
     default_value {
       type: DT_FLOAT
     }
-    description: "The type of the output."
     allowed_values {
       list {
         type: DT_HALF
@@ -30356,8 +28039,19 @@ op {
       }
     }
   }
-  summary: "Outputs deterministic pseudorandom values from a truncated normal distribution."
-  description: "The generated values follow a normal distribution with mean 0 and standard\ndeviation 1, except that values whose magnitude is more than 2 standard\ndeviations from the mean are dropped and re-picked.\n\nThe outputs are a deterministic function of `shape` and `seed`."
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
   name: "StatsAggregatorHandle"
@@ -30379,7 +28073,6 @@ op {
       s: ""
     }
   }
-  summary: "Creates a statistics manager resource."
   is_stateful: true
 }
 op {
@@ -30392,7 +28085,6 @@ op {
     name: "summary"
     type: DT_STRING
   }
-  summary: "Produces a summary of any statistics recorded by the given statistics manager."
   is_stateful: true
 }
 op {
@@ -30409,8 +28101,6 @@ op {
     name: "T"
     type: "type"
   }
-  summary: "Stops gradient computation."
-  description: "When executed in a graph, this op outputs its input tensor as-is.\n\nWhen building ops to compute gradients, this op prevents the contribution of\nits inputs to be taken into account.  Normally, the gradient generator adds ops\nto a graph to compute the derivatives of a specified \'loss\' by recursively\nfinding out inputs that contributed to its computation.  If you insert this op\nin the graph it inputs are masked from the gradient generator.  They are not\ntaken into account for computing gradients.\n\nThis is useful any time you want to compute a value with TensorFlow but need\nto pretend that the value was a constant. Some examples include:\n\n*  The *EM* algorithm where the *M-step* should not involve backpropagation\n   through the output of the *E-step*.\n*  Contrastive divergence training of Boltzmann machines where, when\n   differentiating the energy function, the training must not backpropagate\n   through the graph that generated the samples from the model.\n*  Adversarial training, where no backprop should happen through the adversarial\n   example generation process."
 }
 op {
   name: "StridedSlice"
@@ -30420,17 +28110,14 @@ op {
   }
   input_arg {
     name: "begin"
-    description: "`begin[k]` specifies the offset into the `k`th range specification.\nThe exact dimension this corresponds to will be determined by context.\nOut-of-bounds values will be silently clamped. If the `k`th bit of\n`begin_mask` then `begin[k]` is ignored and the full range of the\nappropriate dimension is used instead. Negative values causes indexing\nto start from the highest element e.g. If `foo==[1,2,3]` then `foo[-1]==3`."
     type_attr: "Index"
   }
   input_arg {
     name: "end"
-    description: "`end[i]` is like `begin` with the exception that `end_mask` is\nused to determine full ranges."
     type_attr: "Index"
   }
   input_arg {
     name: "strides"
-    description: "`strides[i]` specifies the increment in the `i`th specification\nafter extracting a given element. Negative indices will reverse\nthe original order. Out or range values are\nclamped to `[0,dim[i]) if slice[i]>0` or `[-1,dim[i]-1] if slice[i] < 0`"
     type_attr: "Index"
   }
   output_arg {
@@ -30457,7 +28144,6 @@ op {
     default_value {
       i: 0
     }
-    description: "a bitmask where a bit i being 1 means to ignore the begin\nvalue and instead use the largest interval possible. At runtime\nbegin[i] will be replaced with `[0, n-1) if `stride[i] > 0` or\n`[-1, n-1]` if `stride[i] < 0`"
   }
   attr {
     name: "end_mask"
@@ -30465,7 +28151,6 @@ op {
     default_value {
       i: 0
     }
-    description: "analogous to `begin_mask`"
   }
   attr {
     name: "ellipsis_mask"
@@ -30473,7 +28158,6 @@ op {
     default_value {
       i: 0
     }
-    description: "a bitmask where bit `i` being 1 means the `i`th\nposition is actually an ellipsis. One bit at most can be 1.\nIf `ellipsis_mask == 0`, then an implicit ellipsis mask of `1 << (m+1)`\nis provided. This means that `foo[3:5] == foo[3:5, ...]`. An ellipsis\nimplicitly creates as many range specifications as necessary to fully\nspecify the sliced range for every dimension. For example for a 4-dimensional\ntensor `foo` the slice `foo[2, ..., 5:8]` implies `foo[2, :, :, 5:8]`."
   }
   attr {
     name: "new_axis_mask"
@@ -30481,7 +28165,6 @@ op {
     default_value {
       i: 0
     }
-    description: "a bitmask where bit `i` being 1 means the `i`th\nspecification creates a new shape 1 dimension. For example\n`foo[:4, tf.newaxis, :2]` would produce a shape `(4, 1, 2)` tensor."
   }
   attr {
     name: "shrink_axis_mask"
@@ -30489,10 +28172,7 @@ op {
     default_value {
       i: 0
     }
-    description: "a bitmask where bit `i` implies that the `i`th\nspecification should shrink the dimensionality. begin and end\nmust imply a slice of size 1 in the dimension. For example in\npython one might do `foo[:, 3, :]` which would result in\n`shrink_axis_mask` being 2."
   }
-  summary: "Return a strided slice from `input`."
-  description: "Note, most python users will want to use the Python `Tensor.__getitem__`\nor `Variable.__getitem__` rather than this op directly.\n\nThe goal of this op is to produce a new tensor with a subset of\nthe elements from the `n` dimensional `input` tensor. The subset is chosen using\na sequence of `m` sparse range specifications encoded into the arguments\nof this function. Note, in some cases\n`m` could be equal to `n`, but this need not be the case. Each\nrange specification entry can be one of the following:\n\n- An ellipsis (...). Ellipses are used to imply zero or more\n  dimensions of full-dimension selection and are produced using\n  `ellipsis_mask`. For example, `foo[...]` is the identity slice.\n\n- A new axis. This is used to insert a new shape=1 dimension and is\n  produced using `new_axis_mask`. For example, `foo[:, ...]` where\n  `foo` is shape `(3, 4)` produces a `(1, 3, 4)` tensor.\n\n\n- A range `begin:end:stride`. This is used to specify how much to choose from\n  a given dimension. `stride` can be any integer but 0.  `begin` is an integer\n  which represents the index of the first value to select while `end` represents\n  the index of the last value to select. The number of values selected in each\n  dimension is `end - begin` if `stride > 0` and `begin - end` if `stride < 0`.\n  `begin` and `end` can be negative where `-1` is the last element, `-2` is\n  the second to last. `begin_mask` controls whether to replace the explicitly\n  given `begin` with an implicit effective value of `0` if `stride > 0` and\n  `-1` if `stride < 0`. `end_mask` is analogous but produces the number\n  required to create the largest open interval. For example, given a shape\n  `(3,)` tensor `foo[:]`, the effective `begin` and `end` are `0` and `3`. Do\n  not assume this is equivalent to `foo[0:-1]` which has an effective `begin`\n  and `end` of `0` and `2`. Another example is `foo[-2::-1]` which reverses the\n  first dimension of a tensor while dropping the last two (in the original\n  order elements). For example `foo = [1,2,3,4]; foo[-2::-1]` is `[4,3]`.\n\n- A single index. This is used to keep only elements that have a given\n  index. For example (`foo[2, :]` on a shape `(5,6)` tensor produces a\n  shape `(6,)` tensor. This is encoded in `begin` and `end` and\n  `shrink_axis_mask`.\n\nEach conceptual range specification is encoded in the op\'s argument. This\nencoding is best understand by considering a non-trivial example. In\nparticular,\n`foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as\n\n```\nbegin = [1, 2, x, x, 0, x] # x denotes don\'t care (usually 0)\nend = [2, 4, x, x, -3, x]\nstrides = [1, 1, x, x, -1, 1]\nbegin_mask = 1<<4 | 1 << 5 = 48\nend_mask = 1<<5 = 32\nellipsis_mask = 1<<3 = 8\nnew_axis_mask = 1<<2 4\nshrink_axis_mask = 1<<0\n```\n\nIn this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of\nthe slice becomes (2, 1, 5, 5, 2, 5).\nLet us walk step by step through each argument specification.\n\n1.  The first argument in the example slice is turned into `begin = 1` and\n`end = begin + 1 = 2`. To disambiguate from the original spec `2:4` we\nalso set the appropriate bit in `shrink_axis_mask`.\n\n2. `2:4` is contributes 2, 4, 1 to begin, end, and stride. All masks have\nzero bits contributed.\n\n3. None is a synonym for `tf.newaxis`. This means insert a dimension of size 1\ndimension in the final shape. Dummy values are contributed to begin,\nend and stride, while the new_axis_mask bit is set.\n\n4. `...` grab the full ranges from as many dimensions as needed to\nfully specify a slice for every dimension of the input shape.\n\n5. `:-3:-1` shows the use of negative indices. A negative index `i` associated\nwith a dimension that has shape `s` is converted to a positive index\n`s + i`. So `-1` becomes `s-1` (i.e. the last element). This conversion\nis done internally so begin, end and strides receive x, -3, and -1.\nThe appropriate begin_mask bit is set to indicate the start range is the\nfull range (ignoring the x).\n\n6. `:` indicates that the entire contents of the corresponding dimension\nis selected. This is equivalent to `::` or `0::1`. begin, end, and strides\nreceive 0, 0, and 1, respectively. The appropriate bits in `begin_mask` and\n`end_mask` are also set.\n\n*Requirements*:\n  `0 != strides[i] for i in [0, m)`\n  `ellipsis_mask must be a power of two (only one ellipsis)`"
 }
 op {
   name: "StridedSliceAssign"
@@ -30571,8 +28251,6 @@ op {
       i: 0
     }
   }
-  summary: "Assign `value` to the sliced l-value reference of `ref`."
-  description: "The values of `value` are assigned to the positions in the variable\n`ref` that are selected by the slice parameters. The slice parameters\n`begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.\n\nNOTE this op currently does not support broadcasting and so `value`\'s\nshape must be exactly the shape produced by the slice of `ref`."
 }
 op {
   name: "StridedSliceGrad"
@@ -30649,14 +28327,11 @@ op {
       i: 0
     }
   }
-  summary: "Returns the gradient of `StridedSlice`."
-  description: "Since `StridedSlice` cuts out pieces of its `input` which is size\n`shape`, its gradient will have the same shape (which is passed here\nas `shape`). The gradient will be zero in any element that the slice\ndoes not select.\n\nArguments are the same as StridedSliceGrad with the exception that\n`dy` is the input gradient to be propagated and `shape` is the\nshape of `StridedSlice`\'s `input`."
 }
 op {
   name: "StringJoin"
   input_arg {
     name: "inputs"
-    description: "A list of string tensors.  The tensors must all have the same shape,\nor be scalars.  Scalars may be mixed in; these will be broadcast to the shape\nof non-scalar inputs."
     type: DT_STRING
     number_attr: "N"
   }
@@ -30676,36 +28351,28 @@ op {
     default_value {
       s: ""
     }
-    description: "string, an optional join separator."
   }
-  summary: "Joins the strings in the given list of string tensors into one tensor;"
-  description: "with the given separator (default is an empty separator)."
 }
 op {
   name: "StringSplit"
   input_arg {
     name: "input"
-    description: "1-D. Strings to split."
     type: DT_STRING
   }
   input_arg {
     name: "delimiter"
-    description: "0-D. Delimiter characters (bytes), or empty string."
     type: DT_STRING
   }
   output_arg {
     name: "indices"
-    description: "A dense matrix of int64 representing the indices of the sparse tensor."
     type: DT_INT64
   }
   output_arg {
     name: "values"
-    description: "A vector of strings corresponding to the splited values."
     type: DT_STRING
   }
   output_arg {
     name: "shape"
-    description: "a length-2 vector of int64 representing the shape of the sparse\ntensor, where the first value is N and the second value is the maximum number\nof tokens in a single input entry."
     type: DT_INT64
   }
   attr {
@@ -30714,10 +28381,7 @@ op {
     default_value {
       b: true
     }
-    description: "A `bool`. If `True`, skip the empty strings from the result."
   }
-  summary: "Split elements of `input` based on `delimiter` into a `SparseTensor`."
-  description: "Let N be the size of source (typically N will be the batch size). Split each\nelement of `input` based on `delimiter` and return a `SparseTensor`\ncontaining the splitted tokens. Empty tokens are ignored.\n\n`delimiter` can be empty, or a string of split characters. If `delimiter` is an\n empty string, each element of `input` is split into individual single-byte\n character strings, including splitting of UTF-8 multibyte sequences. Otherwise\n every character of `delimiter` is a potential split point.\n\nFor example:\n  N = 2, input[0] is \'hello world\' and input[1] is \'a b c\', then the output\n  will be\n\n  indices = [0, 0;\n             0, 1;\n             1, 0;\n             1, 1;\n             1, 2]\n  shape = [2, 3]\n  values = [\'hello\', \'world\', \'a\', \'b\', \'c\']"
 }
 op {
   name: "StringToHashBucket"
@@ -30727,67 +28391,52 @@ op {
   }
   output_arg {
     name: "output"
-    description: "A Tensor of the same shape as the input `string_tensor`."
     type: DT_INT64
   }
   attr {
     name: "num_buckets"
     type: "int"
-    description: "The number of buckets."
     has_minimum: true
     minimum: 1
   }
-  summary: "Converts each string in the input Tensor to its hash mod by a number of buckets."
-  description: "The hash function is deterministic on the content of the string within the\nprocess.\n\nNote that the hash function may change from time to time.\nThis functionality will be deprecated and it\'s recommended to use\n`tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`."
 }
 op {
   name: "StringToHashBucketFast"
   input_arg {
     name: "input"
-    description: "The strings to assign a hash bucket."
     type: DT_STRING
   }
   output_arg {
     name: "output"
-    description: "A Tensor of the same shape as the input `string_tensor`."
     type: DT_INT64
   }
   attr {
     name: "num_buckets"
     type: "int"
-    description: "The number of buckets."
     has_minimum: true
     minimum: 1
   }
-  summary: "Converts each string in the input Tensor to its hash mod by a number of buckets."
-  description: "The hash function is deterministic on the content of the string within the\nprocess and will never change. However, it is not suitable for cryptography.\nThis function may be used when CPU time is scarce and inputs are trusted or\nunimportant. There is a risk of adversaries constructing inputs that all hash\nto the same bucket. To prevent this problem, use a strong hash function with\n`tf.string_to_hash_bucket_strong`."
 }
 op {
   name: "StringToHashBucketStrong"
   input_arg {
     name: "input"
-    description: "The strings to assign a hash bucket."
     type: DT_STRING
   }
   output_arg {
     name: "output"
-    description: "A Tensor of the same shape as the input `string_tensor`."
     type: DT_INT64
   }
   attr {
     name: "num_buckets"
     type: "int"
-    description: "The number of buckets."
     has_minimum: true
     minimum: 1
   }
   attr {
     name: "key"
     type: "list(int)"
-    description: "The key for the keyed hash function passed as a list of two uint64\nelements."
   }
-  summary: "Converts each string in the input Tensor to its hash mod by a number of buckets."
-  description: "The hash function is deterministic on the content of the string within the\nprocess. The hash function is a keyed hash function, where attribute `key`\ndefines the key of the hash function. `key` is an array of 2 elements.\n\nA strong hash is important when inputs may be malicious, e.g. URLs with\nadditional components. Adversaries could try to make their inputs hash to the\nsame bucket for a denial-of-service attack or to skew the results. A strong\nhash prevents this by making it difficult, if not infeasible, to compute inputs\nthat hash to the same bucket. This comes at a cost of roughly 4x higher compute\ntime than `tf.string_to_hash_bucket_fast`."
 }
 op {
   name: "StringToNumber"
@@ -30797,7 +28446,6 @@ op {
   }
   output_arg {
     name: "output"
-    description: "A Tensor of the same shape as the input `string_tensor`."
     type_attr: "out_type"
   }
   attr {
@@ -30806,7 +28454,6 @@ op {
     default_value {
       type: DT_FLOAT
     }
-    description: "The numeric type to interpret each string in `string_tensor` as."
     allowed_values {
       list {
         type: DT_FLOAT
@@ -30816,8 +28463,6 @@ op {
       }
     }
   }
-  summary: "Converts each string in the input Tensor to the specified numeric type."
-  description: "(Note that int32 overflow results in an error while float overflow\nresults in a rounded value.)"
 }
 op {
   name: "Sub"
@@ -30839,6 +28484,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -30852,29 +28498,23 @@ op {
       }
     }
   }
-  summary: "Returns x - y element-wise."
-  description: "*NOTE*: `Sub` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "Substr"
   input_arg {
     name: "input"
-    description: "Tensor of strings"
     type: DT_STRING
   }
   input_arg {
     name: "pos"
-    description: "Scalar defining the position of first character in each substring"
     type_attr: "T"
   }
   input_arg {
     name: "len"
-    description: "Scalar defining the number of characters to include in each substring"
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "Tensor of substrings"
     type: DT_STRING
   }
   attr {
@@ -30887,24 +28527,19 @@ op {
       }
     }
   }
-  summary: "Return substrings from `Tensor` of strings."
-  description: "For each string in the input `Tensor`, creates a substring starting at index\n`pos` with a total length of `len`.\n\nIf `len` defines a substring that would extend beyond the length of the input\nstring, then as many characters as possible are used.\n\nIf `pos` is negative or specifies a character index larger than any of the input\nstrings, then an `InvalidArgumentError` is thrown.\n\n`pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on\nOp creation.\n\n*NOTE*: `Substr` supports broadcasting up to two dimensions. More about\nbroadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)\n\n---\n\nExamples\n\nUsing scalar `pos` and `len`:\n\n```python\ninput = [b\'Hello\', b\'World\']\nposition = 1\nlength = 3\n\noutput = [b\'ell\', b\'orl\']\n```\n\nUsing `pos` and `len` with same shape as `input`:\n\n```python\ninput = [[b\'ten\', b\'eleven\', b\'twelve\'],\n         [b\'thirteen\', b\'fourteen\', b\'fifteen\'],\n         [b\'sixteen\', b\'seventeen\', b\'eighteen\']]\nposition = [[1, 2, 3],\n            [1, 2, 3],\n            [1, 2, 3]]\nlength =   [[2, 3, 4],\n            [4, 3, 2],\n            [5, 5, 5]]\n\noutput = [[b\'en\', b\'eve\', b\'lve\'],\n          [b\'hirt\', b\'urt\', b\'te\'],\n          [b\'ixtee\', b\'vente\', b\'hteen\']]\n```\n\nBroadcasting `pos` and `len` onto `input`:\n\n```\ninput = [[b\'ten\', b\'eleven\', b\'twelve\'],\n         [b\'thirteen\', b\'fourteen\', b\'fifteen\'],\n         [b\'sixteen\', b\'seventeen\', b\'eighteen\'],\n         [b\'nineteen\', b\'twenty\', b\'twentyone\']]\nposition = [1, 2, 3]\nlength =   [1, 2, 3]\n\noutput = [[b\'e\', b\'ev\', b\'lve\'],\n          [b\'h\', b\'ur\', b\'tee\'],\n          [b\'i\', b\'ve\', b\'hte\'],\n          [b\'i\', b\'en\', b\'nty\']]\n```\n\nBroadcasting `input` onto `pos` and `len`:\n\n```\ninput = b\'thirteen\'\nposition = [1, 5, 7]\nlength =   [3, 2, 1]\n\noutput = [b\'hir\', b\'ee\', b\'n\']\n```"
 }
 op {
   name: "Sum"
   input_arg {
     name: "input"
-    description: "The tensor to reduce."
     type_attr: "T"
   }
   input_arg {
     name: "reduction_indices"
-    description: "The dimensions to reduce. Must be in the range\n`[-rank(input), rank(input))`."
     type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    description: "The reduced tensor."
     type_attr: "T"
   }
   attr {
@@ -30913,7 +28548,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true, retain reduced dimensions with length 1."
   }
   attr {
     name: "T"
@@ -30922,17 +28556,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -30952,29 +28587,23 @@ op {
       }
     }
   }
-  summary: "Computes the sum of elements across dimensions of a tensor."
-  description: "Reduces `input` along the dimensions given in `reduction_indices`. Unless\n`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in\n`reduction_indices`. If `keep_dims` is true, the reduced dimensions are\nretained with length 1."
 }
 op {
   name: "Svd"
   input_arg {
     name: "input"
-    description: "A tensor of shape `[..., M, N]` whose inner-most 2 dimensions\nform matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`."
     type_attr: "T"
   }
   output_arg {
     name: "s"
-    description: "Singular values. Shape is `[..., P]`."
     type_attr: "T"
   }
   output_arg {
     name: "u"
-    description: "Left singular vectors. If `full_matrices` is `False` then shape is\n`[..., M, P]`; if `full_matrices` is `True` then shape is\n`[..., M, M]`. Undefined if `compute_uv` is `False`."
     type_attr: "T"
   }
   output_arg {
     name: "v"
-    description: "Left singular vectors. If `full_matrices` is `False` then shape is\n`[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.\nUndefined if `compute_uv` is false."
     type_attr: "T"
   }
   attr {
@@ -30983,7 +28612,6 @@ op {
     default_value {
       b: true
     }
-    description: "If true, left and right singular vectors will be\ncomputed and returned in `u` and `v`, respectively.\nIf false, `u` and `v` are not set and should never referenced."
   }
   attr {
     name: "full_matrices"
@@ -30991,7 +28619,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true, compute full-sized `u` and `v`. If false\n(the default), compute only the leading `P` singular vectors.\nIgnored if `compute_uv` is `False`."
   }
   attr {
     name: "T"
@@ -31005,100 +28632,81 @@ op {
       }
     }
   }
-  summary: "Computes the singular value decompositions of one or more matrices."
-  description: "Computes the SVD of each inner matrix in `input` such that\n`input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`\n\n```python\n# a is a tensor containing a batch of matrices.\n# s is a tensor of singular values for each matrix.\n# u is the tensor containing of left singular vectors for each matrix.\n# v is the tensor containing of right singular vectors for each matrix.\ns, u, v = svd(a)\ns, _, _ = svd(a, compute_uv=False)\n```"
 }
 op {
   name: "Switch"
   input_arg {
     name: "data"
-    description: "The tensor to be forwarded to the appropriate output."
     type_attr: "T"
   }
   input_arg {
     name: "pred"
-    description: "A scalar that specifies which output port will receive data."
     type: DT_BOOL
   }
   output_arg {
     name: "output_false"
-    description: "If `pred` is false, data will be forwarded to this output."
     type_attr: "T"
   }
   output_arg {
     name: "output_true"
-    description: "If `pred` is true, data will be forwarded to this output."
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Forwards `data` to the output port determined by `pred`."
-  description: "If `pred` is true, the `data` input is forwarded to `output_true`. Otherwise,\nthe data goes to `output_false`.\n\nSee also `RefSwitch` and `Merge`."
 }
 op {
   name: "SymbolicGradient"
   input_arg {
     name: "input"
-    description: "a list of input tensors of size N + M;"
     type_list_attr: "Tin"
   }
   output_arg {
     name: "output"
-    description: "a list of output tensors of size N;"
     type_list_attr: "Tout"
   }
   attr {
     name: "Tin"
     type: "list(type)"
-    description: "the type list for the input list."
     has_minimum: true
     minimum: 1
   }
   attr {
     name: "Tout"
     type: "list(type)"
-    description: "the type list for the input list."
     has_minimum: true
     minimum: 1
   }
   attr {
     name: "f"
     type: "func"
-    description: "The function we want to compute the gradient for.\n\nThe function \'f\' must be a numerical function which takes N inputs and\nproduces M outputs. Its gradient function \'g\', which is computed by\nthis SymbolicGradient op is a function taking N + M inputs and\nproduces N outputs.\n\nI.e. if we have\n   (y1, y2, ..., y_M) = f(x1, x2, ..., x_N),\nthen, g is\n   (dL/dx1, dL/dx2, ..., dL/dx_N) = g(x1, x2, ..., x_N,\n                                     dL/dy1, dL/dy2, ..., dL/dy_M),\n\nwhere L is a scalar-value function of (x1, x2, ..., xN) (e.g., the\nloss function). dL/dx_i is the partial derivative of L with respect\nto x_i.\n\n(Needs some math expert to say the comment above better.)"
   }
-  summary: "Computes the gradient function for function f via backpropagation."
 }
 op {
   name: "TFRecordDataset"
   input_arg {
     name: "filenames"
-    description: "A scalar or vector containing the name(s) of the file(s) to be\nread."
     type: DT_STRING
   }
   input_arg {
     name: "compression_type"
-    description: "A scalar containing either (i) the empty string (no\ncompression), (ii) \"ZLIB\", or (iii) \"GZIP\"."
     type: DT_STRING
   }
   input_arg {
     name: "buffer_size"
-    description: "A scalar representing the number of bytes to buffer. A value of\n0 means no buffering will be performed."
     type: DT_INT64
   }
   output_arg {
     name: "handle"
     type: DT_VARIANT
   }
-  summary: "Creates a dataset that emits the records from one or more TFRecord files."
   is_stateful: true
 }
 op {
   name: "TFRecordReader"
   output_arg {
     name: "reader_handle"
-    description: "The handle to reference the Reader."
     type: DT_STRING
     is_ref: true
   }
@@ -31108,7 +28716,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this reader is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -31116,7 +28723,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this reader is named in the given bucket\nwith this shared_name. Otherwise, the node name is used instead."
   }
   attr {
     name: "compression_type"
@@ -31125,14 +28731,16 @@ op {
       s: ""
     }
   }
-  summary: "A Reader that outputs the records from a TensorFlow Records file."
+  deprecation {
+    version: 26
+    explanation: "Use TFRecordReaderV2"
+  }
   is_stateful: true
 }
 op {
   name: "TFRecordReaderV2"
   output_arg {
     name: "reader_handle"
-    description: "The handle to reference the Reader."
     type: DT_RESOURCE
   }
   attr {
@@ -31141,7 +28749,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this reader is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -31149,7 +28756,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this reader is named in the given bucket\nwith this shared_name. Otherwise, the node name is used instead."
   }
   attr {
     name: "compression_type"
@@ -31158,7 +28764,6 @@ op {
       s: ""
     }
   }
-  summary: "A Reader that outputs the records from a TensorFlow Records file."
   is_stateful: true
 }
 op {
@@ -31169,7 +28774,6 @@ op {
   }
   input_arg {
     name: "count"
-    description: "A scalar representing the number of elements from the `input_dataset`\nthat should be taken. A value of `-1` indicates that all of `input_dataset`\nis taken."
     type: DT_INT64
   }
   output_arg {
@@ -31188,34 +28792,28 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset that contains `count` elements from the `input_dataset`."
 }
 op {
   name: "TakeManySparseFromTensorsMap"
   input_arg {
     name: "sparse_handles"
-    description: "1-D, The `N` serialized `SparseTensor` objects.\nShape: `[N]`."
     type: DT_INT64
   }
   output_arg {
     name: "sparse_indices"
-    description: "2-D.  The `indices` of the minibatch `SparseTensor`."
     type: DT_INT64
   }
   output_arg {
     name: "sparse_values"
-    description: "1-D.  The `values` of the minibatch `SparseTensor`."
     type_attr: "dtype"
   }
   output_arg {
     name: "sparse_shape"
-    description: "1-D.  The `shape` of the minibatch `SparseTensor`."
     type: DT_INT64
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "The `dtype` of the `SparseTensor` objects stored in the\n`SparseTensorsMap`."
   }
   attr {
     name: "container"
@@ -31223,7 +28821,6 @@ op {
     default_value {
       s: ""
     }
-    description: "The container name for the `SparseTensorsMap` read by this op."
   }
   attr {
     name: "shared_name"
@@ -31231,10 +28828,7 @@ op {
     default_value {
       s: ""
     }
-    description: "The shared name for the `SparseTensorsMap` read by this op.\nIt should not be blank; rather the `shared_name` or unique Operation name\nof the Op that created the original `SparseTensorsMap` should be used."
   }
-  summary: "Read `SparseTensors` from a `SparseTensorsMap` and concatenate them."
-  description: "The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where\n`N` is the minibatch size and the rows correspond to the output handles of\n`AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the\noriginal `SparseTensor` objects that went into the given input ops must all\nmatch.  When the final `SparseTensor` is created, it has rank one\nhigher than the ranks of the incoming `SparseTensor` objects\n(they have been concatenated along a new row dimension on the left).\n\nThe output `SparseTensor` object\'s shape values for all dimensions but the\nfirst are the max across the input `SparseTensor` objects\' shape values\nfor the corresponding dimensions.  Its first shape value is `N`, the minibatch\nsize.\n\nThe input `SparseTensor` objects\' indices are assumed ordered in\nstandard lexicographic order.  If this is not the case, after this\nstep run `SparseReorder` to restore index ordering.\n\nFor example, if the handles represent an input, which is a `[2, 3]` matrix\nrepresenting two original `SparseTensor` objects:\n\n```\n    index = [ 0]\n            [10]\n            [20]\n    values = [1, 2, 3]\n    shape = [50]\n```\n\nand\n\n```\n    index = [ 2]\n            [10]\n    values = [4, 5]\n    shape = [30]\n```\n\nthen the final `SparseTensor` will be:\n\n```\n    index = [0  0]\n            [0 10]\n            [0 20]\n            [1  2]\n            [1 10]\n    values = [1, 2, 3, 4, 5]\n    shape = [2 50]\n```"
   is_stateful: true
 }
 op {
@@ -31253,6 +28847,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -31262,7 +28857,6 @@ op {
       }
     }
   }
-  summary: "Computes tan of x element-wise."
 }
 op {
   name: "Tanh"
@@ -31280,6 +28874,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -31287,7 +28882,6 @@ op {
       }
     }
   }
-  summary: "Computes hyperbolic tangent of `x` element-wise."
 }
 op {
   name: "TanhGrad"
@@ -31309,6 +28903,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -31316,26 +28911,21 @@ op {
       }
     }
   }
-  summary: "Computes the gradient for the tanh of `x` wrt its input."
-  description: "Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`\nis the corresponding input gradient."
 }
 op {
   name: "TemporaryVariable"
   output_arg {
     name: "ref"
-    description: "A reference to the variable tensor."
     type_attr: "dtype"
     is_ref: true
   }
   attr {
     name: "shape"
     type: "shape"
-    description: "The shape of the variable tensor."
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "The type of elements in the variable tensor."
   }
   attr {
     name: "var_name"
@@ -31343,10 +28933,7 @@ op {
     default_value {
       s: ""
     }
-    description: "Overrides the name used for the temporary variable resource. Default\nvalue is the name of the \'TemporaryVariable\' op (which is guaranteed unique)."
   }
-  summary: "Returns a tensor that may be mutated, but only persists within a single step."
-  description: "This is an experimental op for internal use only and it is possible to use this\nop in unsafe ways.  DO NOT USE unless you fully understand the risks.\n\nIt is the caller\'s responsibility to ensure that \'ref\' is eventually passed to a\nmatching \'DestroyTemporaryVariable\' op after all other uses have completed.\n\nOutputs a ref to the tensor state so it may be read or modified.\n\n  E.g.\n      var = state_ops._temporary_variable([1, 2], types.float_)\n      var_name = var.op.name\n      var = state_ops.assign(var, [[4.0, 5.0]])\n      var = state_ops.assign_add(var, [[6.0, 7.0]])\n      final = state_ops._destroy_temporary_variable(var, var_name=var_name)"
   is_stateful: true
 }
 op {
@@ -31418,17 +29005,17 @@ op {
     name: "handle"
     type: DT_STRING
   }
-  summary: "Deprecated. Use TensorArrayCloseV3"
+  deprecation {
+    version: 26
+    explanation: "Use TensorArrayCloseV3"
+  }
 }
 op {
   name: "TensorArrayCloseV3"
   input_arg {
     name: "handle"
-    description: "The handle to a TensorArray (output of TensorArray or TensorArrayGrad)."
     type: DT_RESOURCE
   }
-  summary: "Delete the TensorArray from its resource container."
-  description: "This enables the user to close and release the resource in the middle\nof a step/run."
   is_stateful: true
 }
 op {
@@ -31499,34 +29086,28 @@ op {
       }
     }
   }
-  summary: "Deprecated. Use TensorArrayConcatV3"
 }
 op {
   name: "TensorArrayConcatV3"
   input_arg {
     name: "handle"
-    description: "The handle to a TensorArray."
     type: DT_RESOURCE
   }
   input_arg {
     name: "flow_in"
-    description: "A float scalar that enforces proper chaining of operations."
     type: DT_FLOAT
   }
   output_arg {
     name: "value"
-    description: "All of the elements in the TensorArray, concatenated along the first\naxis."
     type_attr: "dtype"
   }
   output_arg {
     name: "lengths"
-    description: "A vector of the row sizes of the original T elements in the\nvalue output.  In the example above, this would be the values:\n`(n1, n2, ..., n(T-1))`."
     type: DT_INT64
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "The type of the elem that is returned."
   }
   attr {
     name: "element_shape_except0"
@@ -31536,10 +29117,7 @@ op {
         unknown_rank: true
       }
     }
-    description: "The expected shape of an element, if known,\nexcluding the first dimension. Used to validate the shapes of\nTensorArray elements. If this shape is not fully specified, concatenating\nzero-size TensorArrays is an error."
   }
-  summary: "Concat the elements from the TensorArray into value `value`."
-  description: "Takes `T` elements of shapes\n\n  ```\n  (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...)\n  ```\n\nand concatenates them into a Tensor of shape:\n\n  ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```\n\nAll elements must have the same shape (excepting the first dimension)."
   is_stateful: true
 }
 op {
@@ -31610,34 +29188,32 @@ op {
       }
     }
   }
-  summary: "Deprecated. Use TensorArrayGatherV3"
+  deprecation {
+    version: 26
+    explanation: "Use TensorArrayGatherV3"
+  }
 }
 op {
   name: "TensorArrayGatherV3"
   input_arg {
     name: "handle"
-    description: "The handle to a TensorArray."
     type: DT_RESOURCE
   }
   input_arg {
     name: "indices"
-    description: "The locations in the TensorArray from which to read tensor elements."
     type: DT_INT32
   }
   input_arg {
     name: "flow_in"
-    description: "A float scalar that enforces proper chaining of operations."
     type: DT_FLOAT
   }
   output_arg {
     name: "value"
-    description: "All of the elements in the TensorArray, concatenated along a new\naxis (the new dimension 0)."
     type_attr: "dtype"
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "The type of the elem that is returned."
   }
   attr {
     name: "element_shape"
@@ -31647,10 +29223,7 @@ op {
         unknown_rank: true
       }
     }
-    description: "The expected shape of an element, if known. Used to\nvalidate the shapes of TensorArray elements. If this shape is not\nfully specified, gathering zero-size TensorArrays is an error."
   }
-  summary: "Gather specific elements from the TensorArray into output `value`."
-  description: "All elements selected by `indices` must have the same shape."
   is_stateful: true
 }
 op {
@@ -31696,19 +29269,20 @@ op {
     name: "source"
     type: "string"
   }
-  summary: "Deprecated. Use TensorArrayGradV3"
+  deprecation {
+    version: 26
+    explanation: "Use TensorArrayGradV3"
+  }
   is_stateful: true
 }
 op {
   name: "TensorArrayGradV3"
   input_arg {
     name: "handle"
-    description: "The handle to the forward TensorArray."
     type: DT_RESOURCE
   }
   input_arg {
     name: "flow_in"
-    description: "A float scalar that enforces proper chaining of operations."
     type: DT_FLOAT
   }
   output_arg {
@@ -31722,10 +29296,7 @@ op {
   attr {
     name: "source"
     type: "string"
-    description: "The gradient source string, used to decide which gradient TensorArray\nto return."
   }
-  summary: "Creates a TensorArray for storing the gradients of values in the given handle."
-  description: "If the given TensorArray gradient already exists, returns a reference to it.\n\nLocks the size of the original TensorArray by disabling its dynamic size flag.\n\n**A note about the input flow_in:**\n\nThe handle flow_in forces the execution of the gradient lookup to occur\nonly after certain other operations have occurred.  For example, when\nthe forward TensorArray is dynamically sized, writes to this TensorArray\nmay resize the object.  The gradient TensorArray is statically sized based\non the size of the forward TensorArray when this operation executes.\nFurthermore, the size of the forward TensorArray is frozen by this call.\nAs a result, the flow is used to ensure that the call to generate the gradient\nTensorArray only happens after all writes are executed.\n\nIn the case of dynamically sized TensorArrays, gradient computation should\nonly be performed on read operations that have themselves been chained via\nflow to occur only after all writes have executed. That way the final size\nof the forward TensorArray is known when this operation is called.\n\n**A note about the source attribute:**\n\nTensorArray gradient calls use an accumulator TensorArray object.  If\nmultiple gradients are calculated and run in the same session, the multiple\ngradient nodes may accidentally flow through the same accumulator TensorArray.\nThis double counts and generally breaks the TensorArray gradient flow.\n\nThe solution is to identify which gradient call this particular\nTensorArray gradient is being called in.  This is performed by identifying\na unique string (e.g. \"gradients\", \"gradients_1\", ...) from the input\ngradient Tensor\'s name.  This string is used as a suffix when creating\nthe TensorArray gradient object here (the attribute `source`).\n\nThe attribute `source` is added as a suffix to the forward TensorArray\'s\nname when performing the creation / lookup, so that each separate gradient\ncalculation gets its own TensorArray accumulator."
   is_stateful: true
 }
 op {
@@ -31811,13 +29382,15 @@ op {
     name: "dtype"
     type: "type"
   }
-  summary: "Deprecated. Use TensorArrayReadV3"
+  deprecation {
+    version: 26
+    explanation: "Use TensorArrayReadV3"
+  }
 }
 op {
   name: "TensorArrayReadV3"
   input_arg {
     name: "handle"
-    description: "The handle to a TensorArray."
     type: DT_RESOURCE
   }
   input_arg {
@@ -31826,20 +29399,16 @@ op {
   }
   input_arg {
     name: "flow_in"
-    description: "A float scalar that enforces proper chaining of operations."
     type: DT_FLOAT
   }
   output_arg {
     name: "value"
-    description: "The tensor that is read from the TensorArray."
     type_attr: "dtype"
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "The type of the elem that is returned."
   }
-  summary: "Read an element from the TensorArray into output `value`."
   is_stateful: true
 }
 op {
@@ -31900,41 +29469,37 @@ op {
     name: "T"
     type: "type"
   }
-  summary: "Deprecated. Use TensorArrayScatterV3"
+  deprecation {
+    version: 26
+    explanation: "Use TensorArrayScatterV3"
+  }
 }
 op {
   name: "TensorArrayScatterV3"
   input_arg {
     name: "handle"
-    description: "The handle to a TensorArray."
     type: DT_RESOURCE
   }
   input_arg {
     name: "indices"
-    description: "The locations at which to write the tensor elements."
     type: DT_INT32
   }
   input_arg {
     name: "value"
-    description: "The concatenated tensor to write to the TensorArray."
     type_attr: "T"
   }
   input_arg {
     name: "flow_in"
-    description: "A float scalar that enforces proper chaining of operations."
     type: DT_FLOAT
   }
   output_arg {
     name: "flow_out"
-    description: "A float scalar that enforces proper chaining of operations."
     type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Scatter the data from the input value into specific TensorArray elements."
-  description: "`indices` must be a vector, its length must match the first dim of `value`."
   is_stateful: true
 }
 op {
@@ -31971,26 +29536,25 @@ op {
     name: "size"
     type: DT_INT32
   }
-  summary: "Deprecated. Use TensorArraySizeV3"
+  deprecation {
+    version: 26
+    explanation: "Use TensorArraySizeV3"
+  }
 }
 op {
   name: "TensorArraySizeV3"
   input_arg {
     name: "handle"
-    description: "The handle to a TensorArray (output of TensorArray or TensorArrayGrad)."
     type: DT_RESOURCE
   }
   input_arg {
     name: "flow_in"
-    description: "A float scalar that enforces proper chaining of operations."
     type: DT_FLOAT
   }
   output_arg {
     name: "size"
-    description: "The current size of the TensorArray."
     type: DT_INT32
   }
-  summary: "Get the current size of the TensorArray."
   is_stateful: true
 }
 op {
@@ -32051,41 +29615,37 @@ op {
     name: "T"
     type: "type"
   }
-  summary: "Deprecated. Use TensorArraySplitV3"
+  deprecation {
+    version: 26
+    explanation: "Use TensorArraySplitV3"
+  }
 }
 op {
   name: "TensorArraySplitV3"
   input_arg {
     name: "handle"
-    description: "The handle to a TensorArray."
     type: DT_RESOURCE
   }
   input_arg {
     name: "value"
-    description: "The concatenated tensor to write to the TensorArray."
     type_attr: "T"
   }
   input_arg {
     name: "lengths"
-    description: "The vector of lengths, how to split the rows of value into the\nTensorArray."
     type: DT_INT64
   }
   input_arg {
     name: "flow_in"
-    description: "A float scalar that enforces proper chaining of operations."
     type: DT_FLOAT
   }
   output_arg {
     name: "flow_out"
-    description: "A float scalar that enforces proper chaining of operations."
     type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Split the data from the input value into TensorArray elements."
-  description: "Assuming that `lengths` takes on values\n\n  ```(n0, n1, ..., n(T-1))```\n\nand that `value` has shape\n\n  ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```,\n\nthis splits values into a TensorArray with T tensors.\n\nTensorArray index t will be the subtensor of values with starting position\n\n  ```(n0 + n1 + ... + n(t-1), 0, 0, ...)```\n\nand having size\n\n  ```nt x d0 x d1 x ...```"
   is_stateful: true
 }
 op {
@@ -32160,30 +29720,29 @@ op {
       s: ""
     }
   }
-  summary: "Deprecated. Use TensorArrayV3"
+  deprecation {
+    version: 26
+    explanation: "Use TensorArrayV3"
+  }
   is_stateful: true
 }
 op {
   name: "TensorArrayV3"
   input_arg {
     name: "size"
-    description: "The size of the array."
     type: DT_INT32
   }
   output_arg {
     name: "handle"
-    description: "The handle to the TensorArray."
     type: DT_RESOURCE
   }
   output_arg {
     name: "flow"
-    description: "A scalar used to control gradient flow."
     type: DT_FLOAT
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "The type of the elements on the tensor_array."
   }
   attr {
     name: "element_shape"
@@ -32193,7 +29752,6 @@ op {
         unknown_rank: true
       }
     }
-    description: "The expected shape of an element, if known. Used to\nvalidate the shapes of TensorArray elements. If this shape is not\nfully specified, gathering zero-size TensorArrays is an error."
   }
   attr {
     name: "dynamic_size"
@@ -32201,7 +29759,6 @@ op {
     default_value {
       b: false
     }
-    description: "A boolean that determines whether writes to the TensorArray\nare allowed to grow the size.  By default, this is not allowed."
   }
   attr {
     name: "clear_after_read"
@@ -32209,7 +29766,6 @@ op {
     default_value {
       b: true
     }
-    description: "If true (default), Tensors in the TensorArray are cleared\nafter being read.  This disables multiple read semantics but allows early\nrelease of memory."
   }
   attr {
     name: "identical_element_shapes"
@@ -32217,7 +29773,6 @@ op {
     default_value {
       b: false
     }
-    description: "If true (default is false), then all\nelements in the TensorArray will be expected to have have identical shapes.\nThis allows certain behaviors, like dynamically checking for\nconsistent shapes on write, and being able to fill in properly\nshaped zero tensors on stack -- even if the element_shape attribute\nis not fully defined."
   }
   attr {
     name: "tensor_array_name"
@@ -32225,10 +29780,7 @@ op {
     default_value {
       s: ""
     }
-    description: "Overrides the name used for the temporary tensor_array\nresource. Default value is the name of the \'TensorArray\' op (which\nis guaranteed unique)."
   }
-  summary: "An array of Tensors of given size."
-  description: "Write data via Write and read via Read or Pack."
   is_stateful: true
 }
 op {
@@ -32289,40 +29841,37 @@ op {
     name: "T"
     type: "type"
   }
-  summary: "Deprecated. Use TensorArrayGradV3"
+  deprecation {
+    version: 26
+    explanation: "Use TensorArrayWriteV3"
+  }
 }
 op {
   name: "TensorArrayWriteV3"
   input_arg {
     name: "handle"
-    description: "The handle to a TensorArray."
     type: DT_RESOURCE
   }
   input_arg {
     name: "index"
-    description: "The position to write to inside the TensorArray."
     type: DT_INT32
   }
   input_arg {
     name: "value"
-    description: "The tensor to write to the TensorArray."
     type_attr: "T"
   }
   input_arg {
     name: "flow_in"
-    description: "A float scalar that enforces proper chaining of operations."
     type: DT_FLOAT
   }
   output_arg {
     name: "flow_out"
-    description: "A float scalar that enforces proper chaining of operations."
     type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Push an element onto the tensor_array."
   is_stateful: true
 }
 op {
@@ -32347,9 +29896,200 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset that emits `components` as a tuple of tensors once."
   is_stateful: true
 }
+op {
+  name: "TensorListElementShape"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorListFromTensor"
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorListGetItem"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "item"
+    type_attr: "element_dtype"
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
+op {
+  name: "TensorListLength"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "length"
+    type: DT_INT32
+  }
+}
+op {
+  name: "TensorListPopBack"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
+op {
+  name: "TensorListPushBack"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
+op {
+  name: "TensorListReserve"
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "num_elements"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorListSetItem"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "item"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
+op {
+  name: "TensorListStack"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "num_elements"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
 op {
   name: "TensorSliceDataset"
   input_arg {
@@ -32372,14 +30112,12 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset that emits each dim-0 slice of `components` once."
   is_stateful: true
 }
 op {
   name: "TensorSummary"
   input_arg {
     name: "tensor"
-    description: "A tensor to serialize."
     type_attr: "T"
   }
   output_arg {
@@ -32396,7 +30134,6 @@ op {
     default_value {
       s: ""
     }
-    description: "A json-encoded SummaryDescription proto."
   }
   attr {
     name: "labels"
@@ -32405,7 +30142,6 @@ op {
       list {
       }
     }
-    description: "An unused list of strings."
   }
   attr {
     name: "display_name"
@@ -32413,26 +30149,20 @@ op {
     default_value {
       s: ""
     }
-    description: "An unused string."
   }
-  summary: "Outputs a `Summary` protocol buffer with a tensor."
-  description: "This op is being phased out in favor of TensorSummaryV2, which lets callers pass\na tag as well as a serialized SummaryMetadata proto string that contains\nplugin-specific data. We will keep this op to maintain backwards compatibility."
 }
 op {
   name: "TensorSummaryV2"
   input_arg {
     name: "tag"
-    description: "A string attached to this summary. Used for organization in TensorBoard."
     type: DT_STRING
   }
   input_arg {
     name: "tensor"
-    description: "A tensor to serialize."
     type_attr: "T"
   }
   input_arg {
     name: "serialized_summary_metadata"
-    description: "A serialized SummaryMetadata proto. Contains plugin\ndata."
     type: DT_STRING
   }
   output_arg {
@@ -32443,37 +30173,31 @@ op {
     name: "T"
     type: "type"
   }
-  summary: "Outputs a `Summary` protocol buffer with a tensor and per-plugin data."
 }
 op {
   name: "TextLineDataset"
   input_arg {
     name: "filenames"
-    description: "A scalar or a vector containing the name(s) of the file(s) to be\nread."
     type: DT_STRING
   }
   input_arg {
     name: "compression_type"
-    description: "A scalar containing either (i) the empty string (no\ncompression), (ii) \"ZLIB\", or (iii) \"GZIP\"."
     type: DT_STRING
   }
   input_arg {
     name: "buffer_size"
-    description: "A scalar containing the number of bytes to buffer."
     type: DT_INT64
   }
   output_arg {
     name: "handle"
     type: DT_VARIANT
   }
-  summary: "Creates a dataset that emits the lines of one or more text files."
   is_stateful: true
 }
 op {
   name: "TextLineReader"
   output_arg {
     name: "reader_handle"
-    description: "The handle to reference the Reader."
     type: DT_STRING
     is_ref: true
   }
@@ -32483,7 +30207,6 @@ op {
     default_value {
       i: 0
     }
-    description: "Number of lines to skip from the beginning of every file."
   }
   attr {
     name: "container"
@@ -32491,7 +30214,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this reader is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -32499,16 +30221,17 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this reader is named in the given bucket\nwith this shared_name. Otherwise, the node name is used instead."
   }
-  summary: "A Reader that outputs the lines of a file delimited by \'\\n\'."
+  deprecation {
+    version: 26
+    explanation: "Use TextLineReaderV2"
+  }
   is_stateful: true
 }
 op {
   name: "TextLineReaderV2"
   output_arg {
     name: "reader_handle"
-    description: "The handle to reference the Reader."
     type: DT_RESOURCE
   }
   attr {
@@ -32517,7 +30240,6 @@ op {
     default_value {
       i: 0
     }
-    description: "Number of lines to skip from the beginning of every file."
   }
   attr {
     name: "container"
@@ -32525,7 +30247,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this reader is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -32533,56 +30254,46 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this reader is named in the given bucket\nwith this shared_name. Otherwise, the node name is used instead."
   }
-  summary: "A Reader that outputs the lines of a file delimited by \'\\n\'."
   is_stateful: true
 }
 op {
   name: "ThreadUnsafeUnigramCandidateSampler"
   input_arg {
     name: "true_classes"
-    description: "A batch_size * num_true matrix, in which each row contains the\nIDs of the num_true target_classes in the corresponding original label."
     type: DT_INT64
   }
   output_arg {
     name: "sampled_candidates"
-    description: "A vector of length num_sampled, in which each element is\nthe ID of a sampled candidate."
     type: DT_INT64
   }
   output_arg {
     name: "true_expected_count"
-    description: "A batch_size * num_true matrix, representing\nthe number of times each candidate is expected to occur in a batch\nof sampled candidates. If unique=true, then this is a probability."
     type: DT_FLOAT
   }
   output_arg {
     name: "sampled_expected_count"
-    description: "A vector of length num_sampled, for each sampled\ncandidate representing the number of times the candidate is expected\nto occur in a batch of sampled candidates.  If unique=true, then this is a\nprobability."
     type: DT_FLOAT
   }
   attr {
     name: "num_true"
     type: "int"
-    description: "Number of true labels per context."
     has_minimum: true
     minimum: 1
   }
   attr {
     name: "num_sampled"
     type: "int"
-    description: "Number of candidates to randomly sample."
     has_minimum: true
     minimum: 1
   }
   attr {
     name: "unique"
     type: "bool"
-    description: "If unique is true, we sample with rejection, so that all sampled\ncandidates in a batch are unique. This requires some approximation to\nestimate the post-rejection sampling probabilities."
   }
   attr {
     name: "range_max"
     type: "int"
-    description: "The sampler will sample integers from the interval [0, range_max)."
     has_minimum: true
     minimum: 1
   }
@@ -32592,7 +30303,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either seed or seed2 are set to be non-zero, the random number\ngenerator is seeded by the given seed.  Otherwise, it is seeded by a\nrandom seed."
   }
   attr {
     name: "seed2"
@@ -32600,22 +30310,17 @@ op {
     default_value {
       i: 0
     }
-    description: "An second seed to avoid seed collision."
   }
-  summary: "Generates labels for candidate sampling with a learned unigram distribution."
-  description: "See explanations of candidate sampling and the data formats at\ngo/candidate-sampling.\n\nFor each batch, this op picks a single set of sampled candidate labels.\n\nThe advantages of sampling candidates per-batch are simplicity and the\npossibility of efficient dense matrix multiplication. The disadvantage is that\nthe sampled candidates must be chosen independently of the context and of the\ntrue labels."
   is_stateful: true
 }
 op {
   name: "Tile"
   input_arg {
     name: "input"
-    description: "1-D or higher."
     type_attr: "T"
   }
   input_arg {
     name: "multiples"
-    description: "1-D. Length must be the same as the number of dimensions in `input`"
     type_attr: "Tmultiples"
   }
   output_arg {
@@ -32639,8 +30344,6 @@ op {
       }
     }
   }
-  summary: "Constructs a tensor by tiling a given tensor."
-  description: "This operation creates a new tensor by replicating `input` `multiples` times.\nThe output tensor\'s i\'th dimension has `input.dims(i) * multiples[i]` elements,\nand the values of `input` are replicated `multiples[i]` times along the \'i\'th\ndimension. For example, tiling `[a b c d]` by `[2]` produces\n`[a b c d a b c d]`."
 }
 op {
   name: "TileGrad"
@@ -32660,8 +30363,6 @@ op {
     name: "T"
     type: "type"
   }
-  summary: "Returns the gradient of `Tile`."
-  description: "Since `Tile` takes an input and repeats the input `multiples` times\nalong each dimension, `TileGrad` takes in `multiples` and aggregates\neach repeated tile of `input` into `output`."
   deprecation {
     version: 3
     explanation: "TileGrad has been replaced with reduce_sum"
@@ -32671,23 +30372,19 @@ op {
   name: "TopK"
   input_arg {
     name: "input"
-    description: "1-D or higher with last dimension at least `k`."
     type_attr: "T"
   }
   output_arg {
     name: "values"
-    description: "The `k` largest elements along each last dimensional slice."
     type_attr: "T"
   }
   output_arg {
     name: "indices"
-    description: "The indices of `values` within the last dimension of `input`."
     type: DT_INT32
   }
   attr {
     name: "k"
     type: "int"
-    description: "Number of top elements to look for along the last dimension (along each\nrow for matrices)."
     has_minimum: true
   }
   attr {
@@ -32696,7 +30393,6 @@ op {
     default_value {
       b: true
     }
-    description: "If true the resulting `k` elements will be sorted by the values in\ndescending order."
   }
   attr {
     name: "T"
@@ -32706,10 +30402,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -32717,8 +30414,6 @@ op {
       }
     }
   }
-  summary: "Finds values and indices of the `k` largest elements for the last dimension."
-  description: "If the input is a vector (rank-1), finds the `k` largest entries in the vector\nand outputs their values and indices as vectors.  Thus `values[j]` is the\n`j`-th largest entry in `input`, and its index is `indices[j]`.\n\nFor matrices (resp. higher rank input), computes the top `k` entries in each\nrow (resp. vector along the last dimension).  Thus,\n\n    values.shape = indices.shape = input.shape[:-1] + [k]\n\nIf two elements are equal, the lower-index element appears first.\n\nIf `k` varies dynamically, use `TopKV2` below."
   deprecation {
     version: 7
     explanation: "Use TopKV2 instead"
@@ -32728,22 +30423,18 @@ op {
   name: "TopKV2"
   input_arg {
     name: "input"
-    description: "1-D or higher with last dimension at least `k`."
     type_attr: "T"
   }
   input_arg {
     name: "k"
-    description: "0-D.  Number of top elements to look for along the last dimension (along each\nrow for matrices)."
     type: DT_INT32
   }
   output_arg {
     name: "values"
-    description: "The `k` largest elements along each last dimensional slice."
     type_attr: "T"
   }
   output_arg {
     name: "indices"
-    description: "The indices of `values` within the last dimension of `input`."
     type: DT_INT32
   }
   attr {
@@ -32752,7 +30443,6 @@ op {
     default_value {
       b: true
     }
-    description: "If true the resulting `k` elements will be sorted by the values in\ndescending order."
   }
   attr {
     name: "T"
@@ -32762,10 +30452,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -32773,8 +30464,6 @@ op {
       }
     }
   }
-  summary: "Finds values and indices of the `k` largest elements for the last dimension."
-  description: "If the input is a vector (rank-1), finds the `k` largest entries in the vector\nand outputs their values and indices as vectors.  Thus `values[j]` is the\n`j`-th largest entry in `input`, and its index is `indices[j]`.\n\nFor matrices (resp. higher rank input), computes the top `k` entries in each\nrow (resp. vector along the last dimension).  Thus,\n\n    values.shape = indices.shape = input.shape[:-1] + [k]\n\nIf two elements are equal, the lower-index element appears first."
 }
 op {
   name: "Transpose"
@@ -32807,8 +30496,6 @@ op {
       }
     }
   }
-  summary: "Shuffle dimensions of x according to a permutation."
-  description: "The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:\n  `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`"
 }
 op {
   name: "TruncateDiv"
@@ -32830,6 +30517,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -32843,8 +30531,6 @@ op {
       }
     }
   }
-  summary: "Returns x / y element-wise for integer types."
-  description: "Truncation designates that negative numbers will round fractional quantities\ntoward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different\nthan Python semantics. See `FloorDiv` for a division function that matches\nPython Semantics.\n\n*NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "TruncateMod"
@@ -32867,24 +30553,21 @@ op {
       list {
         type: DT_INT32
         type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  summary: "Returns element-wise remainder of division. This emulates C semantics in that"
-  description: "the result here is consistent with a truncating divide. E.g. `truncate(x / y) *\ny + truncate_mod(x, y) = x`.\n\n*NOTE*: `TruncateMod` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "TruncatedNormal"
   input_arg {
     name: "shape"
-    description: "The shape of the output tensor."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "A tensor of the specified shape filled with random truncated normal\nvalues."
     type_attr: "dtype"
   }
   attr {
@@ -32893,7 +30576,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either `seed` or `seed2` are set to be non-zero, the random number\ngenerator is seeded by the given seed.  Otherwise, it is seeded by a\nrandom seed."
   }
   attr {
     name: "seed2"
@@ -32901,15 +30583,14 @@ op {
     default_value {
       i: 0
     }
-    description: "A second seed to avoid seed collision."
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "The type of the output."
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -32925,55 +30606,127 @@ op {
       }
     }
   }
-  summary: "Outputs random values from a truncated normal distribution."
-  description: "The generated values follow a normal distribution with mean 0 and standard\ndeviation 1, except that values whose magnitude is more than 2 standard\ndeviations from the mean are dropped and re-picked."
   is_stateful: true
 }
+op {
+  name: "Unbatch"
+  input_arg {
+    name: "batched_tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "batch_index"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "id"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "unbatched_tensor"
+    type_attr: "T"
+  }
+  attr {
+    name: "timeout_micros"
+    type: "int"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "UnbatchGrad"
+  input_arg {
+    name: "original_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "batch_index"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "id"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "batched_grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
 op {
   name: "UniformCandidateSampler"
   input_arg {
     name: "true_classes"
-    description: "A batch_size * num_true matrix, in which each row contains the\nIDs of the num_true target_classes in the corresponding original label."
     type: DT_INT64
   }
   output_arg {
     name: "sampled_candidates"
-    description: "A vector of length num_sampled, in which each element is\nthe ID of a sampled candidate."
     type: DT_INT64
   }
   output_arg {
     name: "true_expected_count"
-    description: "A batch_size * num_true matrix, representing\nthe number of times each candidate is expected to occur in a batch\nof sampled candidates. If unique=true, then this is a probability."
     type: DT_FLOAT
   }
   output_arg {
     name: "sampled_expected_count"
-    description: "A vector of length num_sampled, for each sampled\ncandidate representing the number of times the candidate is expected\nto occur in a batch of sampled candidates.  If unique=true, then this is a\nprobability."
     type: DT_FLOAT
   }
   attr {
     name: "num_true"
     type: "int"
-    description: "Number of true labels per context."
     has_minimum: true
     minimum: 1
   }
   attr {
     name: "num_sampled"
     type: "int"
-    description: "Number of candidates to randomly sample."
     has_minimum: true
     minimum: 1
   }
   attr {
     name: "unique"
     type: "bool"
-    description: "If unique is true, we sample with rejection, so that all sampled\ncandidates in a batch are unique. This requires some approximation to\nestimate the post-rejection sampling probabilities."
   }
   attr {
     name: "range_max"
     type: "int"
-    description: "The sampler will sample integers from the interval [0, range_max)."
     has_minimum: true
     minimum: 1
   }
@@ -32983,7 +30736,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either seed or seed2 are set to be non-zero, the random number\ngenerator is seeded by the given seed.  Otherwise, it is seeded by a\nrandom seed."
   }
   attr {
     name: "seed2"
@@ -32991,27 +30743,21 @@ op {
     default_value {
       i: 0
     }
-    description: "An second seed to avoid seed collision."
   }
-  summary: "Generates labels for candidate sampling with a uniform distribution."
-  description: "See explanations of candidate sampling and the data formats at\ngo/candidate-sampling.\n\nFor each batch, this op picks a single set of sampled candidate labels.\n\nThe advantages of sampling candidates per-batch are simplicity and the\npossibility of efficient dense matrix multiplication. The disadvantage is that\nthe sampled candidates must be chosen independently of the context and of the\ntrue labels."
   is_stateful: true
 }
 op {
   name: "Unique"
   input_arg {
     name: "x"
-    description: "1-D."
     type_attr: "T"
   }
   output_arg {
     name: "y"
-    description: "1-D."
     type_attr: "T"
   }
   output_arg {
     name: "idx"
-    description: "1-D."
     type_attr: "out_idx"
   }
   attr {
@@ -33031,29 +30777,95 @@ op {
       }
     }
   }
-  summary: "Finds unique elements in a 1-D tensor."
-  description: "This operation returns a tensor `y` containing all of the unique elements of `x`\nsorted in the same order that they occur in `x`. This operation also returns a\ntensor `idx` the same size as `x` that contains the index of each value of `x`\nin the unique output `y`. In other words:\n\n`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`\n\nFor example:\n\n```\n# tensor \'x\' is [1, 1, 2, 4, 4, 4, 7, 8, 8]\ny, idx = unique(x)\ny ==> [1, 2, 4, 7, 8]\nidx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]\n```"
+}
+op {
+  name: "UniqueDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "UniqueV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Taxis"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "idx"
+    type_attr: "out_idx"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Taxis"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "out_idx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
   name: "UniqueWithCounts"
   input_arg {
     name: "x"
-    description: "1-D."
     type_attr: "T"
   }
   output_arg {
     name: "y"
-    description: "1-D."
     type_attr: "T"
   }
   output_arg {
     name: "idx"
-    description: "1-D."
     type_attr: "out_idx"
   }
   output_arg {
     name: "count"
-    description: "1-D."
     type_attr: "out_idx"
   }
   attr {
@@ -33073,19 +30885,15 @@ op {
       }
     }
   }
-  summary: "Finds unique elements in a 1-D tensor."
-  description: "This operation returns a tensor `y` containing all of the unique elements of `x`\nsorted in the same order that they occur in `x`. This operation also returns a\ntensor `idx` the same size as `x` that contains the index of each value of `x`\nin the unique output `y`. Finally, it returns a third tensor `count` that\ncontains the count of each element of `y` in `x`. In other words:\n\n`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`\n\nFor example:\n\n```\n# tensor \'x\' is [1, 1, 2, 4, 4, 4, 7, 8, 8]\ny, idx, count = unique_with_counts(x)\ny ==> [1, 2, 4, 7, 8]\nidx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]\ncount ==> [2, 1, 3, 1, 2]\n```"
 }
 op {
   name: "Unpack"
   input_arg {
     name: "value"
-    description: "1-D or higher, with `axis` dimension size equal to `num`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "The list of tensors unpacked from `value`."
     type_attr: "T"
     number_attr: "num"
   }
@@ -33104,10 +30912,35 @@ op {
     default_value {
       i: 0
     }
-    description: "Dimension along which to unpack.  Negative values wrap around, so the\nvalid range is `[-R, R)`."
   }
-  summary: "Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors."
-  description: "Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.\nFor example, given a tensor of shape `(A, B, C, D)`;\n\nIf `axis == 0` then the i\'th tensor in `output` is the slice `value[i, :, :, :]`\n  and each tensor in `output` will have shape `(B, C, D)`. (Note that the\n  dimension unpacked along is gone, unlike `split`).\n\nIf `axis == 1` then the i\'th tensor in `output` is the slice `value[:, i, :, :]`\n  and each tensor in `output` will have shape `(A, C, D)`.\nEtc.\n\nThis is the opposite of `pack`."
+}
+op {
+  name: "UnravelIndex"
+  input_arg {
+    name: "indices"
+    type_attr: "Tidx"
+  }
+  input_arg {
+    name: "dims"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "Tidx"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
   name: "UnsortedSegmentMax"
@@ -33117,16 +30950,14 @@ op {
   }
   input_arg {
     name: "segment_ids"
-    description: "A 1-D tensor whose rank is equal to the rank of `data`\'s\nfirst dimension."
     type_attr: "Tindices"
   }
   input_arg {
     name: "num_segments"
-    type: DT_INT32
+    type_attr: "Tnumsegments"
   }
   output_arg {
     name: "output"
-    description: "Has same shape as data, except for dimension 0 which\nhas size `num_segments`."
     type_attr: "T"
   }
   attr {
@@ -33137,10 +30968,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -33158,8 +30990,19 @@ op {
       }
     }
   }
-  summary: "Computes the Max along segments of a tensor."
-  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nThis operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).\nInstead of computing the sum over segments, it computes the maximum\nsuch that:\n\n\\\\(output_i = \\max_j data_j\\\\) where max is over `j` such\nthat `segment_ids[j] == i`.\n\nIf the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,\n `output[i] = numeric_limits<T>::min()`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/UnsortedSegmentMax.png\" alt>\n</div>"
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
   name: "UnsortedSegmentSum"
@@ -33169,16 +31012,14 @@ op {
   }
   input_arg {
     name: "segment_ids"
-    description: "A tensor whose shape is a prefix of `data.shape`."
     type_attr: "Tindices"
   }
   input_arg {
     name: "num_segments"
-    type: DT_INT32
+    type_attr: "Tnumsegments"
   }
   output_arg {
     name: "output"
-    description: "Has same shape as data, except for the first `segment_ids.rank`\ndimensions, which are replaced with a single dimension which has size\n`num_segments`."
     type_attr: "T"
   }
   attr {
@@ -33188,17 +31029,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -33215,8 +31057,19 @@ op {
       }
     }
   }
-  summary: "Computes the sum along segments of a tensor."
-  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nComputes a tensor such that\n`(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such\nthat `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\nrange of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/UnsortedSegmentSum.png\" alt>\n</div>"
+  attr {
+    name: "Tnumsegments"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
   name: "Unstage"
@@ -33260,8 +31113,6 @@ op {
       s: ""
     }
   }
-  summary: "Op is similar to a lightweight Dequeue."
-  description: "The basic functionality is similar to dequeue with many fewer\ncapabilities and options.  This Op is optimized for performance."
   is_stateful: true
 }
 op {
@@ -33276,7 +31127,6 @@ op {
     default_value {
       s: ""
     }
-    description: "the container this variable is placed in."
   }
   attr {
     name: "shared_name"
@@ -33284,34 +31134,27 @@ op {
     default_value {
       s: ""
     }
-    description: "the name by which this variable is referred to."
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "the type of this variable. Must agree with the dtypes\nof all ops using this variable."
   }
   attr {
     name: "shape"
     type: "shape"
-    description: "The (possibly partially specified) shape of this variable."
   }
-  summary: "Creates a handle to a Variable resource."
   is_stateful: true
 }
 op {
   name: "VarIsInitializedOp"
   input_arg {
     name: "resource"
-    description: "the input resource handle."
     type: DT_RESOURCE
   }
   output_arg {
     name: "is_initialized"
-    description: "a scalar boolean which is true if the variable has been\ninitialized."
     type: DT_BOOL
   }
-  summary: "Checks whether a resource handle-based variable has been initialized."
   is_stateful: true
 }
 op {
@@ -33343,7 +31186,6 @@ op {
       s: ""
     }
   }
-  summary: "Use VariableV2 instead."
   is_stateful: true
 }
 op {
@@ -33369,27 +31211,22 @@ op {
       }
     }
   }
-  summary: "Returns the shape of the variable pointed to by `resource`."
-  description: "This operation returns a 1-D integer tensor representing the shape of `input`.\n\nFor example:\n\n```\n# \'t\' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]\nshape(t) ==> [2, 2, 3]\n```"
   is_stateful: true
 }
 op {
   name: "VariableV2"
   output_arg {
     name: "ref"
-    description: "A reference to the variable tensor."
     type_attr: "dtype"
     is_ref: true
   }
   attr {
     name: "shape"
     type: "shape"
-    description: "The shape of the variable tensor."
   }
   attr {
     name: "dtype"
     type: "type"
-    description: "The type of elements in the variable tensor."
   }
   attr {
     name: "container"
@@ -33397,7 +31234,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this variable is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -33405,10 +31241,7 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this variable is named in the given bucket\nwith this shared_name. Otherwise, the node name is used instead."
   }
-  summary: "Holds state in the form of a tensor that persists across steps."
-  description: "Outputs a ref to the tensor state so it may be read or modified.\nTODO(zhifengc/mrry): Adds a pointer to a more detail document\nabout sharing states in tensorflow."
   is_stateful: true
 }
 op {
@@ -33431,17 +31264,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -33449,14 +31283,11 @@ op {
       }
     }
   }
-  summary: "Returns locations of nonzero / true values in a tensor."
-  description: "This operation returns the coordinates of true elements in `input`. The\ncoordinates are returned in a 2-D tensor where the first dimension (rows)\nrepresents the number of true elements, and the second dimension (columns)\nrepresents the coordinates of the true elements. Keep in mind, the shape of\nthe output tensor can vary depending on how many true values there are in\n`input`. Indices are output in row-major order.\n\nFor example:\n\n```\n# \'input\' tensor is [[True, False]\n#                    [True, False]]\n# \'input\' has two true values, so output has two coordinates.\n# \'input\' has rank of 2, so coordinates have two indices.\nwhere(input) ==> [[0, 0],\n                  [1, 0]]\n\n# `input` tensor is [[[True, False]\n#                     [True, False]]\n#                    [[False, True]\n#                     [False, True]]\n#                    [[False, False]\n#                     [False, True]]]\n# \'input\' has 5 true values, so output has 5 coordinates.\n# \'input\' has rank of 3, so coordinates have three indices.\nwhere(input) ==> [[0, 0, 0],\n                  [0, 1, 0],\n                  [1, 0, 1],\n                  [1, 1, 1],\n                  [2, 1, 1]]\n\n# `input` tensor is [[[1.5,  0.0]\n#                     [-0.5, 0.0]]\n#                    [[0.0,  0.25]\n#                     [0.0,  0.75]]\n#                    [[0.0,  0.0]\n#                     [0.0,  0.01]]]\n# \'input\' has 5 nonzero values, so output has 5 coordinates.\n# \'input\' has rank of 3, so coordinates have three indices.\nwhere(input) ==> [[0, 0, 0],\n                  [0, 1, 0],\n                  [1, 0, 1],\n                  [1, 1, 1],\n                  [2, 1, 1]]\n\n# `input` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]\n#                     [0.0 + 0.5j, 0.0  + 0.0j]]\n#                    [[0.0 + 0.0j, 0.25 + 1.5j]\n#                     [0.0 + 0.0j, 0.75 + 0.0j]]\n#                    [[0.0 + 0.0j, 0.0  + 0.0j]\n#                     [0.0 + 0.0j, 0.01 + 0.0j]]]\n# \'input\' has 5 nonzero magnitude values, so output has 5 coordinates.\n# \'input\' has rank of 3, so coordinates have three indices.\nwhere(input) ==> [[0, 0, 0],\n                  [0, 1, 0],\n                  [1, 0, 1],\n                  [1, 1, 1],\n                  [2, 1, 1]]\n```"
 }
 op {
   name: "WholeFileReader"
   output_arg {
     name: "reader_handle"
-    description: "The handle to reference the Reader."
     type: DT_STRING
     is_ref: true
   }
@@ -33466,7 +31297,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this reader is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -33474,17 +31304,13 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this reader is named in the given bucket\nwith this shared_name. Otherwise, the node name is used instead."
   }
-  summary: "A Reader that outputs the entire contents of a file as a value."
-  description: "To use, enqueue filenames in a Queue.  The output of ReaderRead will\nbe a filename (key) and the contents of that file (value)."
   is_stateful: true
 }
 op {
   name: "WholeFileReaderV2"
   output_arg {
     name: "reader_handle"
-    description: "The handle to reference the Reader."
     type: DT_RESOURCE
   }
   attr {
@@ -33493,7 +31319,6 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this reader is placed in the given container.\nOtherwise, a default container is used."
   }
   attr {
     name: "shared_name"
@@ -33501,44 +31326,34 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this reader is named in the given bucket\nwith this shared_name. Otherwise, the node name is used instead."
   }
-  summary: "A Reader that outputs the entire contents of a file as a value."
-  description: "To use, enqueue filenames in a Queue.  The output of ReaderRead will\nbe a filename (key) and the contents of that file (value)."
   is_stateful: true
 }
 op {
   name: "WriteFile"
   input_arg {
     name: "filename"
-    description: "scalar. The name of the file to which we write the contents."
     type: DT_STRING
   }
   input_arg {
     name: "contents"
-    description: "scalar. The content to be written to the output file."
     type: DT_STRING
   }
-  summary: "Writes contents to the file at input filename. Creates file and recursively"
-  description: "creates directory if not existing."
 }
 op {
   name: "ZerosLike"
   input_arg {
     name: "x"
-    description: "a tensor of type T."
     type_attr: "T"
   }
   output_arg {
     name: "y"
-    description: "a tensor of the same shape and type as x but filled with zeros."
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
   }
-  summary: "Returns a tensor of zeros with the same shape and type as x."
 }
 op {
   name: "Zeta"
@@ -33564,8 +31379,6 @@ op {
       }
     }
   }
-  summary: "Compute the Hurwitz zeta function \\\\(\\zeta(x, q)\\\\)."
-  description: "The Hurwitz zeta function is defined as:\n\n\n\\\\(\\zeta(x, q) = \\sum_{n=0}^{\\infty} (q + n)^{-x}\\\\)"
 }
 op {
   name: "ZipDataset"
@@ -33596,5 +31409,4 @@ op {
     has_minimum: true
     minimum: 1
   }
-  summary: "Creates a dataset that zips together `input_datasets`."
 }
diff --git a/tensorflow/core/ops/parsing_ops.cc b/tensorflow/core/ops/parsing_ops.cc
index 40ec792ef82ff5e0bdf6d0c4e35bf18f5560c5a7..ddd2aa92748f244c2d132f00780a0d6424f1e595 100644
--- a/tensorflow/core/ops/parsing_ops.cc
+++ b/tensorflow/core/ops/parsing_ops.cc
@@ -35,18 +35,13 @@ REGISTER_OP("DecodeRaw")
           c->input(0), c->Vector(InferenceContext::kUnknownDim), &out));
       c->set_output(0, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Reinterpret the bytes of a string as a vector of numbers.
+    });
 
-bytes: All the elements must have the same length.
-little_endian: Whether the input `bytes` are in little-endian order.
-  Ignored for `out_type` values that are stored in a single byte like
-  `uint8`.
-output: A Tensor with one more dimension than the input `bytes`.  The
-  added dimension will have size equal to the length of the elements
-  of `bytes` divided by the number of bytes to represent `out_type`.
-)doc");
+REGISTER_OP("DecodeCompressed")
+    .Input("bytes: string")
+    .Output("output: string")
+    .Attr("compression_type: string = ''")
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("ParseExample")
     .Input("serialized: string")
@@ -64,7 +59,7 @@ REGISTER_OP("ParseExample")
     .Attr("Tdense: list({float,int64,string}) >= 0")
     .Attr("dense_shapes: list(shape) >= 0")
     .SetShapeFn([](InferenceContext* c) {
-      ParseSingleExampleAttrs attrs;
+      ParseExampleAttrs attrs;
       TF_RETURN_IF_ERROR(attrs.Init(c));
 
       ShapeHandle input;
@@ -93,50 +88,49 @@ REGISTER_OP("ParseExample")
         c->set_output(output_idx++, dense);
       }
       return Status::OK();
-    })
-    .Doc(R"doc(
-Transforms a vector of brain.Example protos (as strings) into typed tensors.
+    });
 
-serialized: A vector containing a batch of binary serialized Example protos.
-names: A vector containing the names of the serialized protos.
-  May contain, for example, table key (descriptive) names for the
-  corresponding serialized protos.  These are purely useful for debugging
-  purposes, and the presence of values here has no effect on the output.
-  May also be an empty vector if no names are available.
-  If non-empty, this vector must be the same length as "serialized".
-dense_keys: A list of Ndense string Tensors (scalars).
-  The keys expected in the Examples' features associated with dense values.
-dense_defaults: A list of Ndense Tensors (some may be empty).
-  dense_defaults[j] provides default values
-  when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-  provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-  The input type is inferred from dense_defaults[j], even when it's empty.
-  If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-  then the shape of dense_defaults[j] must match that of dense_shapes[j].
-  If dense_shapes[j] has an undefined major dimension (variable strides dense
-  feature), dense_defaults[j] must contain a single element:
-  the padding element.
-dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
-  given in dense_keys.
-  The number of elements in the Feature corresponding to dense_key[j]
-  must always equal dense_shapes[j].NumEntries().
-  If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
-  Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
-  The dense outputs are just the inputs row-stacked by batch.
-  This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
-  the shape of the output Tensor dense_values[j] will be
-  (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
-  of elements of length D1 * .... * DN, across all minibatch entries
-  in the input.  Any minibatch entry with less than M blocks of elements of
-  length D1 * ... * DN will be padded with the corresponding default_value
-  scalar element along the second dimension.
-sparse_keys: A list of Nsparse string Tensors (scalars).
-  The keys expected in the Examples' features associated with sparse values.
-sparse_types: A list of Nsparse types; the data types of data in each Feature
-  given in sparse_keys.
-  Currently the ParseExample supports DT_FLOAT (FloatList),
-  DT_INT64 (Int64List), and DT_STRING (BytesList).
-)doc");
+REGISTER_OP("ParseSingleExample")
+    .Input("serialized: string")
+    .Input("dense_defaults: Tdense")
+    .Output("sparse_indices: num_sparse * int64")
+    .Output("sparse_values: sparse_types")
+    .Output("sparse_shapes: num_sparse * int64")
+    .Output("dense_values: Tdense")
+    .Attr("num_sparse: int >= 0")
+    .Attr("sparse_keys: list(string) >= 0")
+    .Attr("dense_keys: list(string) >= 0")
+    .Attr("sparse_types: list({float,int64,string}) >= 0")
+    .Attr("Tdense: list({float,int64,string}) >= 0")
+    .Attr("dense_shapes: list(shape) >= 0")
+    .SetShapeFn([](InferenceContext* c) {
+      ParseSingleExampleAttrs attrs;
+      TF_RETURN_IF_ERROR(attrs.Init(c));
+
+      ShapeHandle input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &input));
+
+      // Output sparse_indices, sparse_values, and sparse_shapes.
+      int output_idx = 0;
+      for (int i = 0; i < attrs.sparse_keys.size(); ++i) {
+        c->set_output(output_idx++, c->Matrix(c->UnknownDim(), 1));
+      }
+      for (int i = 0; i < attrs.sparse_keys.size(); ++i) {
+        c->set_output(output_idx++, c->Vector(c->UnknownDim()));
+      }
+      for (int i = 0; i < attrs.sparse_keys.size(); ++i) {
+        c->set_output(output_idx++, c->Vector(1));
+      }
+
+      // Output dense_shapes.
+      for (int i = 0; i < attrs.dense_keys.size(); ++i) {
+        ShapeHandle dense;
+        TF_RETURN_IF_ERROR(
+            c->MakeShapeFromPartialTensorShape(attrs.dense_shapes[i], &dense));
+        c->set_output(output_idx++, dense);
+      }
+      return Status::OK();
+    });
 
 REGISTER_OP("ParseSingleSequenceExample")
     .Input("serialized: string")
@@ -224,106 +218,24 @@ REGISTER_OP("ParseSingleSequenceExample")
         c->set_output(output_idx++, s);
       }
       return Status::OK();
-    })
-    .Doc(R"doc(
-Transforms a scalar brain.SequenceExample proto (as strings) into typed tensors.
-
-serialized: A scalar containing a binary serialized SequenceExample proto.
-feature_list_dense_missing_assumed_empty: A vector listing the
-  FeatureList keys which may be missing from the SequenceExample.  If the
-  associated FeatureList is missing, it is treated as empty.  By default,
-  any FeatureList not listed in this vector must exist in the SequenceExample.
-context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
-  The keys expected in the SequenceExamples' context features associated with
-  dense values.
-feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
-  The keys expected in the SequenceExamples' feature_lists associated
-  with lists of dense values.
-context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
-  context_dense_defaults[j] provides default values
-  when the SequenceExample's context map lacks context_dense_key[j].
-  If an empty Tensor is provided for context_dense_defaults[j],
-  then the Feature context_dense_keys[j] is required.
-  The input type is inferred from context_dense_defaults[j], even when it's
-  empty.  If context_dense_defaults[j] is not empty, its shape must match
-  context_dense_shapes[j].
-debug_name: A scalar containing the name of the serialized proto.
-  May contain, for example, table key (descriptive) name for the
-  corresponding serialized proto.  This is purely useful for debugging
-  purposes, and the presence of values here has no effect on the output.
-  May also be an empty scalar if no name is available.
-context_dense_shapes: A list of Ncontext_dense shapes; the shapes of data in
-  each context Feature given in context_dense_keys.
-  The number of elements in the Feature corresponding to context_dense_key[j]
-  must always equal context_dense_shapes[j].NumEntries().
-  The shape of context_dense_values[j] will match context_dense_shapes[j].
-feature_list_dense_shapes: A list of Nfeature_list_dense shapes; the shapes of
-  data in each FeatureList given in feature_list_dense_keys.
-  The shape of each Feature in the FeatureList corresponding to
-  feature_list_dense_key[j] must always equal
-  feature_list_dense_shapes[j].NumEntries().
-context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
-  The keys expected in the Examples' features associated with context_sparse
-  values.
-context_sparse_types: A list of Ncontext_sparse types; the data types of data in
-  each context Feature given in context_sparse_keys.
-  Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-  DT_INT64 (Int64List), and DT_STRING (BytesList).
-feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
-  (scalars).  The keys expected in the FeatureLists associated with sparse
-  values.
-feature_list_sparse_types: A list of Nfeature_list_sparse types; the data types
-  of data in each FeatureList given in feature_list_sparse_keys.
-  Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-  DT_INT64 (Int64List), and DT_STRING (BytesList).
-)doc");
+    });
 
 REGISTER_OP("ParseTensor")
     .Input("serialized: string")
     .Output("output: out_type")
     .Attr("out_type: type")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Transforms a serialized tensorflow.TensorProto proto into a Tensor.
-
-serialized: A scalar string containing a serialized TensorProto proto.
-out_type: The type of the serialized tensor.  The provided type must match the
-  type of the serialized tensor and no implicit conversion will take place.
-output: A Tensor of type `out_type`.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("SerializeTensor")
     .Input("tensor: T")
     .Output("serialized: string")
     .Attr("T: type")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Transforms a Tensor into a serialized TensorProto proto.
-
-tensor: A Tensor of type `T`.
-T: The type of the input tensor.
-serialized: A serialized TensorProto proto of the input tensor.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("DecodeJSONExample")
     .Input("json_examples: string")
     .Output("binary_examples: string")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Convert JSON-encoded Example records to binary protocol buffer strings.
-
-This op translates a tensor containing Example records, encoded using
-the [standard JSON
-mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
-into a tensor containing the same records encoded as binary protocol
-buffers. The resulting tensor can then be fed to any of the other
-Example-parsing ops.
-
-json_examples: Each string is a JSON object serialized according to the JSON
-  mapping of the Example proto.
-binary_examples: Each string is a binary Example protocol buffer corresponding
-  to the respective element of `json_examples`.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("DecodeCSV")
     .Input("records: string")
@@ -347,39 +259,12 @@ REGISTER_OP("DecodeCSV")
       // Propagate shape of the records input.
       for (int i = 0; i < c->num_outputs(); ++i) c->set_output(i, c->input(0));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Convert CSV records to tensors. Each column maps to one tensor.
-
-RFC 4180 format is expected for the CSV records.
-(https://tools.ietf.org/html/rfc4180)
-Note that we allow leading and trailing spaces with int or float field.
-
-records: Each string is a record/row in the csv and all records should have
-  the same format.
-record_defaults: One tensor per column of the input record, with either a
-  scalar default value for that column or empty if the column is required.
-field_delim: char delimiter to separate fields in a record.
-use_quote_delim: If false, treats double quotation marks as regular
-  characters inside of the string fields (ignoring RFC 4180, Section 2,
-  Bullet 5).
-na_value: Additional string to recognize as NA/NaN.
-output: Each tensor will have the same shape as records.
-)doc");
+    });
 
 REGISTER_OP("StringToNumber")
     .Input("string_tensor: string")
     .Output("output: out_type")
     .Attr("out_type: {float, double, int32, int64} = DT_FLOAT")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Converts each string in the input Tensor to the specified numeric type.
-
-(Note that int32 overflow results in an error while float overflow
-results in a rounded value.)
-
-out_type: The numeric type to interpret each string in `string_tensor` as.
-output: A Tensor of the same shape as the input `string_tensor`.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/parsing_ops_test.cc b/tensorflow/core/ops/parsing_ops_test.cc
index c6e521e33e98017ee7cfd96c88ee82d3d338967f..9121d7ae924fc161ca07017d0057fbf876a9ed12 100644
--- a/tensorflow/core/ops/parsing_ops_test.cc
+++ b/tensorflow/core/ops/parsing_ops_test.cc
@@ -119,7 +119,7 @@ TEST(ParsingOpsTest, ParseExample_ShapeFn) {
            ("[?,2];[?,2];[?];[?];[2];[2];"         // sparse outputs
             "[d0_0,1];[d0_0,1,2];[d0_0,1,2,3]"));  // dense outputs
 
-  // Confirm an error from ParseSingleExampleAttrs.Init().
+  // Confirm an error from ParseExampleAttrs.Init().
   set_outputs(2, 3, true /* add_extra_shape */);
   INFER_ERROR("len(dense_keys) != len(dense_shapes)", op,
               "?;?;?;?;?;?;?;?;?;?");
diff --git a/tensorflow/core/ops/random_ops.cc b/tensorflow/core/ops/random_ops.cc
index 2429171fa93093362510601c5167d63a62caec54..f6c668f5c98efff07a49be15b1187f1858800110 100644
--- a/tensorflow/core/ops/random_ops.cc
+++ b/tensorflow/core/ops/random_ops.cc
@@ -29,24 +29,9 @@ REGISTER_OP("RandomUniform")
     .Output("output: dtype")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
-    .Attr("dtype: {half,float,double}")
+    .Attr("dtype: {half,bfloat16,float,double}")
     .Attr("T: {int32, int64}")
-    .SetShapeFn(shape_inference::RandomShape)
-    .Doc(R"doc(
-Outputs random values from a uniform distribution.
-
-The generated values follow a uniform distribution in the range `[0, 1)`. The
-lower bound 0 is included in the range, while the upper bound 1 is excluded.
-
-shape: The shape of the output tensor.
-dtype: The type of the output.
-seed: If either `seed` or `seed2` are set to be non-zero, the random number
-  generator is seeded by the given seed.  Otherwise, it is seeded by a
-  random seed.
-seed2: A second seed to avoid seed collision.
-
-output: A tensor of the specified shape filled with uniform random values.
-)doc");
+    .SetShapeFn(shape_inference::RandomShape);
 
 REGISTER_OP("RandomUniformInt")
     .Input("shape: T")
@@ -58,28 +43,7 @@ REGISTER_OP("RandomUniformInt")
     .Attr("seed2: int = 0")
     .Attr("Tout: {int32, int64}")
     .Attr("T: {int32, int64}")
-    .SetShapeFn(shape_inference::RandomShape)
-    .Doc(R"doc(
-Outputs random integers from a uniform distribution.
-
-The generated values are uniform integers in the range `[minval, maxval)`.
-The lower bound `minval` is included in the range, while the upper bound
-`maxval` is excluded.
-
-The random integers are slightly biased unless `maxval - minval` is an exact
-power of two.  The bias is small for values of `maxval - minval` significantly
-smaller than the range of the output (either `2^32` or `2^64`).
-
-shape: The shape of the output tensor.
-minval: 0-D.  Inclusive lower bound on the generated integers.
-maxval: 0-D.  Exclusive upper bound on the generated integers.
-seed: If either `seed` or `seed2` are set to be non-zero, the random number
-  generator is seeded by the given seed.  Otherwise, it is seeded by a
-  random seed.
-seed2: A second seed to avoid seed collision.
-
-output: A tensor of the specified shape filled with uniform random integers.
-)doc");
+    .SetShapeFn(shape_inference::RandomShape);
 
 REGISTER_OP("RandomStandardNormal")
     .Input("shape: T")
@@ -87,23 +51,9 @@ REGISTER_OP("RandomStandardNormal")
     .Output("output: dtype")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
-    .Attr("dtype: {half,float,double}")
+    .Attr("dtype: {half,bfloat16,float,double}")
     .Attr("T: {int32, int64}")
-    .SetShapeFn(shape_inference::RandomShape)
-    .Doc(R"doc(
-Outputs random values from a normal distribution.
-
-The generated values will have mean 0 and standard deviation 1.
-
-shape: The shape of the output tensor.
-dtype: The type of the output.
-seed: If either `seed` or `seed2` are set to be non-zero, the random number
-  generator is seeded by the given seed.  Otherwise, it is seeded by a
-  random seed.
-seed2: A second seed to avoid seed collision.
-
-output: A tensor of the specified shape filled with random normal values.
-)doc");
+    .SetShapeFn(shape_inference::RandomShape);
 
 REGISTER_OP("ParameterizedTruncatedNormal")
     .Input("shape: T")
@@ -115,29 +65,9 @@ REGISTER_OP("ParameterizedTruncatedNormal")
     .Output("output: dtype")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
-    .Attr("dtype: {half,float,double}")
+    .Attr("dtype: {half,bfloat16,float,double}")
     .Attr("T: {int32, int64}")
-    .SetShapeFn(shape_inference::RandomShape)
-    .Doc(R"doc(
-Outputs random values from a normal distribution. The parameters may each be a
-scalar which applies to the entire output, or a vector of length shape[0] which
-stores the parameters for each batch.
-
-shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
-means: The mean parameter of each batch.
-stdevs: The standard deviation parameter of each batch. Must be greater than 0.
-minvals: The minimum cutoff. May be -infinity.
-maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
-  for each batch.
-dtype: The type of the output.
-seed: If either `seed` or `seed2` are set to be non-zero, the random number
-  generator is seeded by the given seed.  Otherwise, it is seeded by a
-  random seed.
-seed2: A second seed to avoid seed collision.
-
-output: A matrix of shape num_batches x samples_per_batch, filled with random
-  truncated normal values using the parameters for each row.
-)doc");
+    .SetShapeFn(shape_inference::RandomShape);
 
 REGISTER_OP("TruncatedNormal")
     .Input("shape: T")
@@ -145,26 +75,9 @@ REGISTER_OP("TruncatedNormal")
     .Output("output: dtype")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
-    .Attr("dtype: {half,float,double}")
+    .Attr("dtype: {half,bfloat16,float,double}")
     .Attr("T: {int32, int64}")
-    .SetShapeFn(shape_inference::RandomShape)
-    .Doc(R"doc(
-Outputs random values from a truncated normal distribution.
-
-The generated values follow a normal distribution with mean 0 and standard
-deviation 1, except that values whose magnitude is more than 2 standard
-deviations from the mean are dropped and re-picked.
-
-shape: The shape of the output tensor.
-dtype: The type of the output.
-seed: If either `seed` or `seed2` are set to be non-zero, the random number
-  generator is seeded by the given seed.  Otherwise, it is seeded by a
-  random seed.
-seed2: A second seed to avoid seed collision.
-
-output: A tensor of the specified shape filled with random truncated normal
-  values.
-)doc");
+    .SetShapeFn(shape_inference::RandomShape);
 
 REGISTER_OP("RandomShuffle")
     .Input("value: T")
@@ -173,38 +86,17 @@ REGISTER_OP("RandomShuffle")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .Attr("T: type")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Randomly shuffles a tensor along its first dimension.
-
-  The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
-  to one and only one `output[i]`. For example, a mapping that might occur for a
-  3x2 tensor is:
-
-```
-[[1, 2],       [[5, 6],
- [3, 4],  ==>   [1, 2],
- [5, 6]]        [3, 4]]
-```
-
-value: The tensor to be shuffled.
-seed: If either `seed` or `seed2` are set to be non-zero, the random number
-  generator is seeded by the given seed.  Otherwise, it is seeded by a
-  random seed.
-seed2: A second seed to avoid seed collision.
-
-output: A tensor of same shape and type as `value`, shuffled along its first
-  dimension.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Multinomial")
     .SetIsStateful()
     .Input("logits: T")
     .Input("num_samples: int32")
-    .Output("output: int64")
+    .Output("output: output_dtype")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .Attr("T: realnumbertype")
+    .Attr("output_dtype: {int32, int64} = DT_INT64")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle logits_shape;
       ShapeHandle unused;
@@ -214,19 +106,7 @@ REGISTER_OP("Multinomial")
       TF_RETURN_IF_ERROR(c->MakeDimForScalarInput(1, &num_samples));
       c->set_output(0, c->Matrix(c->Dim(logits_shape, 0), num_samples));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Draws samples from a multinomial distribution.
-
-logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-  represents the unnormalized log probabilities for all classes.
-num_samples: 0-D.  Number of independent samples to draw for each row slice.
-seed: If either seed or seed2 is set to be non-zero, the internal random number
-  generator is seeded by the given seed.  Otherwise, a random seed is used.
-seed2: A second seed to avoid seed collision.
-output: 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-  contains the drawn class labels with range `[0, num_classes)`.
-)doc");
+    });
 
 REGISTER_OP("RandomGamma")
     .SetIsStateful()
@@ -243,27 +123,7 @@ REGISTER_OP("RandomGamma")
       TF_RETURN_IF_ERROR(c->Concatenate(out, c->input(1), &out));
       c->set_output(0, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Outputs random values from the Gamma distribution(s) described by alpha.
-
-This op uses the algorithm by Marsaglia et al. to acquire samples via
-transformation-rejection from pairs of uniform and normal random variables.
-See http://dl.acm.org/citation.cfm?id=358414
-
-shape: 1-D integer tensor. Shape of independent samples to draw from each
-  distribution described by the shape parameters given in alpha.
-alpha: A tensor in which each scalar is a "shape" parameter describing the
-  associated gamma distribution.
-seed: If either `seed` or `seed2` are set to be non-zero, the random number
-  generator is seeded by the given seed.  Otherwise, it is seeded by a
-  random seed.
-seed2: A second seed to avoid seed collision.
-
-output: A tensor with shape `shape + shape(alpha)`. Each slice
-  `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-  `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
-)doc");
+    });
 
 REGISTER_OP("RandomPoisson")
     .SetIsStateful()
@@ -281,10 +141,7 @@ REGISTER_OP("RandomPoisson")
       c->set_output(0, out);
       return Status::OK();
     })
-    .Deprecated(25, "Replaced by RandomPoissonV2")
-    .Doc(R"doc(
-Use RandomPoissonV2 instead.
-)doc");
+    .Deprecated(25, "Replaced by RandomPoissonV2");
 
 REGISTER_OP("RandomPoissonV2")
     .SetIsStateful()
@@ -302,32 +159,6 @@ REGISTER_OP("RandomPoissonV2")
       TF_RETURN_IF_ERROR(c->Concatenate(out, c->input(1), &out));
       c->set_output(0, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Outputs random values from the Poisson distribution(s) described by rate.
-
-This op uses two algorithms, depending on rate. If rate >= 10, then
-the algorithm by Hormann is used to acquire samples via
-transformation-rejection.
-See http://www.sciencedirect.com/science/article/pii/0167668793909974.
-
-Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
-random variables.
-See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
-Programming, Volume 2. Addison Wesley
-
-shape: 1-D integer tensor. Shape of independent samples to draw from each
-  distribution described by the shape parameters given in rate.
-rate: A tensor in which each scalar is a "rate" parameter describing the
-  associated poisson distribution.
-seed: If either `seed` or `seed2` are set to be non-zero, the random number
-  generator is seeded by the given seed.  Otherwise, it is seeded by a
-  random seed.
-seed2: A second seed to avoid seed collision.
-
-output: A tensor with shape `shape + shape(rate)`. Each slice
-  `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-  `rate[i0, i1, ...iN]`.
-)doc");
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/remote_fused_graph_ops.cc b/tensorflow/core/ops/remote_fused_graph_ops.cc
index 85370e648c4d43e9595ac16402eb99aa851382d1..d90466673349fb2e75ba2cc81b181520d2bd52b2 100644
--- a/tensorflow/core/ops/remote_fused_graph_ops.cc
+++ b/tensorflow/core/ops/remote_fused_graph_ops.cc
@@ -36,23 +36,6 @@ REGISTER_OP("RemoteFusedGraphExecute")
     .Attr("Tinputs: list(type) >= 0")
     .Attr("Toutputs: list(type) >= 0")
     .Attr("serialized_remote_fused_graph_execute_info: string")
-    .SetShapeFn(RemoteFusedGraphExecuteShapeFn)
-    .Doc(R"doc(
-Execute a sub graph on a remote processor.
-
-The graph specifications(such as graph itself, input tensors and output names)
-are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
-as serialized_remote_fused_graph_execute_info.
-The specifications will be passed to a dedicated registered
-remote fused graph executor.  The executor will send the graph specifications
-to a remote processor and execute that graph.  The execution results
-will be passed to consumer nodes as outputs of this node.
-
-inputs: Arbitrary number of tensors with arbitrary data types
-outputs: Arbitrary number of tensors with arbitrary data types
-serialized_remote_fused_graph_execute_info: Serialized protocol buffer
-of RemoteFusedGraphExecuteInfo which contains graph specifications.
-
-)doc");
+    .SetShapeFn(RemoteFusedGraphExecuteShapeFn);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/resource_variable_ops.cc b/tensorflow/core/ops/resource_variable_ops.cc
index cdfbec85cf1194d02c81cb4a3d66563dc85dfa57..8dae7e1ff5f872c33dd56509c0349180cec78593 100644
--- a/tensorflow/core/ops/resource_variable_ops.cc
+++ b/tensorflow/core/ops/resource_variable_ops.cc
@@ -76,51 +76,19 @@ REGISTER_OP("VarHandleOp")
                                             std::vector<ShapeAndType>{{s, t}});
 
       return Status::OK();
-    })
-    .Doc(R"(
-Creates a handle to a Variable resource.
-
-container: the container this variable is placed in.
-shared_name: the name by which this variable is referred to.
-dtype: the type of this variable. Must agree with the dtypes
-  of all ops using this variable.
-shape: The (possibly partially specified) shape of this variable.
-)");
+    });
 
 REGISTER_OP("ReadVariableOp")
     .Input("resource: resource")
     .Output("value: dtype")
     .Attr("dtype: type")
-    .SetShapeFn(ReadVariableShapeFn)
-    .Doc(R"(
-Reads the value of a variable.
-
-The tensor returned by this operation is immutable.
-
-The value returned by this operation is guaranteed to be influenced by all the
-writes on which this operation depends directly or indirectly, and to not be
-influenced by any of the writes which depend directly or indirectly on this
-operation.
-
-resource: handle to the resource in which to store the variable.
-dtype: the dtype of the value.
-)");
+    .SetShapeFn(ReadVariableShapeFn);
 
 REGISTER_OP("DestroyResourceOp")
     .Input("resource: resource")
     .Attr("ignore_lookup_error: bool = true")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"(
-Deletes the resource specified by the handle.
-
-All subsequent operations using the resource will result in a NotFound
-error status.
-
-resource: handle to the resource to delete.
-ignore_lookup_error: whether to ignore the error when the resource
-  doesn't exist.
-)");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 Status CreateAssignShapeFn(InferenceContext* c) {
   ShapeAndType handle_shape_and_type;
@@ -137,74 +105,34 @@ REGISTER_OP("AssignVariableOp")
     .Input("resource: resource")
     .Input("value: dtype")
     .Attr("dtype: type")
-    .SetShapeFn(CreateAssignShapeFn)
-    .Doc(R"(
-Assigns a new value to a variable.
-
-Any ReadVariableOp with a control dependency on this op is guaranteed to return
-this value or a subsequent newer value of the variable.
-
-resource: handle to the resource in which to store the variable.
-value: the value to set the new tensor to use.
-dtype: the dtype of the value.
-)");
+    .SetShapeFn(CreateAssignShapeFn);
 
 REGISTER_OP("AssignAddVariableOp")
     .Input("resource: resource")
     .Input("value: dtype")
     .Attr("dtype: type")
-    .SetShapeFn(CreateAssignShapeFn)
-    .Doc(R"(
-Adds a value to the current value of a variable.
-
-Any ReadVariableOp which depends directly or indirectly on this assign is
-guaranteed to see the incremented value or a subsequent newer one.
-
-Outputs the incremented value, which can be used to totally order the
-increments to this variable.
-
-resource: handle to the resource in which to store the variable.
-value: the value by which the variable will be incremented.
-dtype: the dtype of the value.
-)");
+    .SetShapeFn(CreateAssignShapeFn);
 
 REGISTER_OP("AssignSubVariableOp")
     .Input("resource: resource")
     .Input("value: dtype")
     .Attr("dtype: type")
-    .SetShapeFn(CreateAssignShapeFn)
-    .Doc(R"(
-Subtracts a value from the current value of a variable.
-
-Any ReadVariableOp which depends directly or indirectly on this assign is
-guaranteed to see the incremented value or a subsequent newer one.
-
-Outputs the incremented value, which can be used to totally order the
-increments to this variable.
-
-resource: handle to the resource in which to store the variable.
-value: the value by which the variable will be incremented.
-dtype: the dtype of the value.
-)");
+    .SetShapeFn(CreateAssignShapeFn);
 
 REGISTER_OP("VarIsInitializedOp")
     .Input("resource: resource")
     .Output("is_initialized: bool")
-    .SetShapeFn(tensorflow::shape_inference::ScalarShape)
-    .Doc(R"doc(
-Checks whether a resource handle-based variable has been initialized.
-
-resource: the input resource handle.
-is_initialized: a scalar boolean which is true if the variable has been
-initialized.
-)doc");
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape);
 
 Status VariableShapeShapeFn(InferenceContext* c) {
   auto* handle_data = c->input_handle_shapes_and_types(0);
   if (handle_data == nullptr || handle_data->empty()) {
     return errors::InvalidArgument("Handle doesn't have shape information.");
   }
-  c->set_output(0, (*handle_data)[0].shape);
+  ShapeHandle var_shape = (*handle_data)[0].shape;
+  int64 rank = c->RankKnown(var_shape) ? c->Rank(var_shape)
+                                       : InferenceContext::kUnknownDim;
+  c->set_output(0, c->Vector(rank));
   return Status::OK();
 }
 
@@ -212,20 +140,7 @@ REGISTER_OP("VariableShape")
     .Input("input: resource")
     .Output("output: out_type")
     .Attr("out_type: {int32, int64} = DT_INT32")
-    .SetShapeFn(VariableShapeShapeFn)
-    .Doc(R"doc(
-Returns the shape of the variable pointed to by `resource`.
-
-This operation returns a 1-D integer tensor representing the shape of `input`.
-
-For example:
-
-```
-# 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-shape(t) ==> [2, 2, 3]
-```
-
-)doc");
+    .SetShapeFn(VariableShapeShapeFn);
 
 REGISTER_OP("ResourceGather")
     .Input("resource: resource")
@@ -250,25 +165,7 @@ REGISTER_OP("ResourceGather")
       TF_RETURN_IF_ERROR(c->Concatenate(indices_shape, params_subshape, &out));
       c->set_output(0, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Gather slices from the variable pointed to by `resource` according to `indices`.
-
-`indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
-
-```python
-    # Scalar indices
-    output[:, ..., :] = params[indices, :, ... :]
-
-    # Vector indices
-    output[i, :, ..., :] = params[indices[i], :, ... :]
-
-    # Higher rank indices
-    output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-```
-
-)doc");
+    });
 
 REGISTER_OP("ResourceScatterAdd")
     .Input("resource: resource")
@@ -290,40 +187,13 @@ REGISTER_OP("ResourceScatterAdd")
       TF_RETURN_IF_ERROR(c->Concatenate(indices_shape, var_subshape, &concat));
       TF_RETURN_IF_ERROR(c->Merge(c->input(2), concat, &unused_updates_shape));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Adds sparse updates to the variable referenced by `resource`.
-
-This operation computes
-
-    # Scalar indices
-    ref[indices, ...] += updates[...]
-
-    # Vector indices (for each i)
-    ref[indices[i], ...] += updates[i, ...]
-
-    # High rank indices (for each i, ..., j)
-    ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
-
-Duplicate entries are handled correctly: if multiple `indices` reference
-the same location, their contributions add.
-
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-</div>
-
-resource: Should be from a `Variable` node.
-indices: A tensor of indices into the first dimension of `ref`.
-updates: A tensor of updated values to add to `ref`.
-)doc");
+    });
 
 REGISTER_OP("ResourceScatterUpdate")
     .Input("resource: resource")
     .Input("indices: Tindices")
     .Input("updates: dtype")
-    .Attr("dtype: numbertype")
+    .Attr("dtype: type")
     .Attr("Tindices: {int32, int64}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeAndType handle_shape_and_type;
@@ -339,24 +209,36 @@ REGISTER_OP("ResourceScatterUpdate")
       TF_RETURN_IF_ERROR(c->Concatenate(indices_shape, var_subshape, &concat));
       TF_RETURN_IF_ERROR(c->Merge(c->input(2), concat, &unused_updates_shape));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Assigns sparse updates to the variable referenced by `resource`.
-
-This operation computes
-
-    # Scalar indices
-    ref[indices, ...] = updates[...]
+    });
 
-    # Vector indices (for each i)
-    ref[indices[i], ...] = updates[i, ...]
-
-    # High rank indices (for each i, ..., j)
-    ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
-
-resource: Should be from a `Variable` node.
-indices: A tensor of indices into the first dimension of `ref`.
-updates: A tensor of updated values to add to `ref`.
-)doc");
+REGISTER_OP("CriticalSectionOp")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Output("resource: resource")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("ExecuteInCriticalSection")
+    .Input("critical_section: resource")
+    .Input("arguments: Targuments")
+    .Output("outputs: output_types")
+    .Attr("f: func")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 0")
+    .Attr("output_shapes: list(shape) >= 0")
+    .SetShapeFn([](InferenceContext* c) {
+      std::vector<PartialTensorShape> output_shapes;
+      TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
+      for (int i = 0; i < output_shapes.size(); ++i) {
+        ShapeHandle s;
+        TF_RETURN_IF_ERROR(
+            c->MakeShapeFromPartialTensorShape(output_shapes[i], &s));
+        c->set_output(i, s);
+      }
+      return Status::OK();
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/script_ops.cc b/tensorflow/core/ops/script_ops.cc
index 8197327b562c5296e4bcbe43ce9ca81696dedf8b..d8716f0389a3bbb9fce88860fd136df04b702475 100644
--- a/tensorflow/core/ops/script_ops.cc
+++ b/tensorflow/core/ops/script_ops.cc
@@ -25,20 +25,7 @@ REGISTER_OP("PyFunc")
     .Attr("Tin: list(type) >= 0")
     .Attr("Tout: list(type) >=0")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Invokes a python function to compute func(input)->output.
-
-This operation is considered stateful. For a stateless version, see
-PyFuncStateless.
-
-token: A token representing a registered python function in this address space.
-input: List of Tensors that will provide input to the Op.
-output: The outputs from the Op.
-Tin: Data types of the inputs to the op.
-Tout: Data types of the outputs from the op.
-      The length of the list specifies the number of outputs.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("PyFuncStateless")
     .Input("input: Tin")
@@ -46,9 +33,15 @@ REGISTER_OP("PyFuncStateless")
     .Attr("token: string")
     .Attr("Tin: list(type) >= 0")
     .Attr("Tout: list(type) >= 0")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-A stateless version of PyFunc.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("EagerPyFunc")
+    .Input("input: Tin")
+    .Output("output: Tout")
+    .Attr("token: string")
+    .Attr("Tin: list(type) >= 0")
+    .Attr("Tout: list(type) >=0")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::UnknownShape);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/sdca_ops.cc b/tensorflow/core/ops/sdca_ops.cc
index dea75a1af83456f730a6c98cc40fd26d02ca2fda..4025070adb2b193edacdaf728f240961bf9d2530 100644
--- a/tensorflow/core/ops/sdca_ops.cc
+++ b/tensorflow/core/ops/sdca_ops.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-using shape_inference::ShapeHandle;
 using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
 
 // --------------------------------------------------------------------------
 static Status ApplySdcaOptimizerShapeFn(InferenceContext* c) {
@@ -63,78 +63,14 @@ REGISTER_OP("SdcaOptimizer")
     .Output("out_example_state_data: float")
     .Output("out_delta_sparse_weights: num_sparse_features * float")
     .Output("out_delta_dense_weights: num_dense_features * float")
-    .SetShapeFn(ApplySdcaOptimizerShapeFn)
-    .Doc(R"doc(
-Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
-linear models with L1 + L2 regularization. As global optimization objective is
-strongly-convex, the optimizer optimizes the dual objective at each step. The
-optimizer applies each update one example at a time. Examples are sampled
-uniformly, and the optimizer is learning rate free and enjoys linear convergence
-rate.
-
-[Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
-Shai Shalev-Shwartz, Tong Zhang. 2012
-
-$$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
-
-[Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
-Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
-Peter Richtarik, Martin Takac. 2015
-
-[Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
-Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
-
-loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
-  squared and hinge losses.
-adaptative: Whether to use Adapative SDCA for the inner loop.
-num_sparse_features: Number of sparse feature groups to train on.
-num_sparse_features_with_values: Number of sparse feature groups with values
-  associated with it, otherwise implicitly treats values as 1.0.
-num_dense_features: Number of dense feature groups to train on.
-l1: Symmetric l1 regularization strength.
-l2: Symmetric l2 regularization strength.
-num_loss_partitions: Number of partitions of the global loss function.
-num_inner_iterations: Number of iterations per mini-batch.
-sparse_example_indices: a list of vectors which contain example indices.
-sparse_feature_indices: a list of vectors which contain feature indices.
-sparse_feature_values: a list of vectors which contains feature value
-  associated with each feature group.
-dense_features: a list of matrices which contains the dense feature values.
-example_weights: a vector which contains the weight associated with each
-  example.
-example_labels: a vector which contains the label/target associated with each
-  example.
-sparse_indices: a list of vectors where each value is the indices which has
-  corresponding weights in sparse_weights. This field maybe omitted for the
-  dense approach.
-sparse_weights: a list of vectors where each value is the weight associated with
-  a sparse feature group.
-dense_weights: a list of vectors where the values are the weights associated
- with a dense feature group.
-example_state_data: a list of vectors containing the example state data.
-out_example_state_data: a list of vectors containing the updated example state
-  data.
-out_delta_sparse_weights: a list of vectors where each value is the delta
-  weights associated with a sparse feature group.
-out_delta_dense_weights: a list of vectors where the values are the delta
-  weights associated with a dense feature group.
-)doc");
+    .SetShapeFn(ApplySdcaOptimizerShapeFn);
 
 REGISTER_OP("SdcaShrinkL1")
     .Attr("num_features: int >= 0")
     .Attr("l1: float")
     .Attr("l2: float")
     .Input("weights: Ref(num_features * float)")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Applies L1 regularization shrink step on the parameters.
-
-num_features: Number of feature groups to apply shrinking step.
-l1: Symmetric l1 regularization strength.
-l2: Symmetric l2 regularization strength. Should be a positive float.
-weights: a list of vectors where each value is the weight associated with a
-  feature group.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("SdcaFprint")
     .Input("input: string")
@@ -146,13 +82,6 @@ REGISTER_OP("SdcaFprint")
       TF_RETURN_IF_ERROR(c->Concatenate(handle, c->Vector(2), &output_shape));
       c->set_output(0, output_shape);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Computes fingerprints of the input strings.
-
-input: vector of strings to compute fingerprints on.
-output: a (N,2) shaped matrix where N is the number of elements in the input
-  vector. Each row contains the low and high parts of the fingerprint.
-)doc");
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/set_ops.cc b/tensorflow/core/ops/set_ops.cc
index 85d1335dcf9b362a856f058758ebe7b130302357..5eb1c4d87d4443532b1ce2ecbe3baad304d98f4e 100644
--- a/tensorflow/core/ops/set_ops.cc
+++ b/tensorflow/core/ops/set_ops.cc
@@ -30,24 +30,7 @@ REGISTER_OP("SetSize")
     .Attr("validate_indices: bool = true")
     .Attr("T: {int8, int16, int32, int64, uint8, uint16, string}")
     .Output("size: int32")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Number of unique elements along last dimension of input `set`.
-
-Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
-and `set_shape`. The last dimension contains values in a set, duplicates are
-allowed but ignored.
-
-If `validate_indices` is `True`, this op validates the order and range of `set`
-indices.
-
-set_indices: 2D `Tensor`, indices of a `SparseTensor`.
-set_values: 1D `Tensor`, values of a `SparseTensor`.
-set_shape: 1D `Tensor`, shape of a `SparseTensor`.
-size: For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
-    `n-1` dimensions as `set`. Each value is the number of unique elements in
-    the corresponding `[0...n-1]` dimension of `set`.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("DenseToDenseSetOperation")
     .Input("set1: T")
@@ -103,28 +86,7 @@ REGISTER_OP("DenseToDenseSetOperation")
       c->set_output(1, c->Vector(c->UnknownDim()));
       c->set_output(2, c->Vector(output_rank));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Applies set operation along last dimension of 2 `Tensor` inputs.
-
-See SetOperationOp::SetOperationFromContext for values of `set_operation`.
-
-Output `result` is a `SparseTensor` represented by `result_indices`,
-`result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-dimension contains the result of `set_operation` applied to the corresponding
-`[0...n-1]` dimension of `set`.
-
-set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-    Dimension `n` contains values in a set, duplicates are allowed but ignored.
-set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
-    Dimension `n` contains values in a set, duplicates are allowed but ignored.
-result_indices: 2D indices of a `SparseTensor`.
-result_values: 1D values of a `SparseTensor`.
-result_shape: 1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-    the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-    is the max result set size across all `0...n-1` dimensions.
-)doc");
+    });
 
 REGISTER_OP("DenseToSparseSetOperation")
     .Input("set1: T")
@@ -168,41 +130,7 @@ REGISTER_OP("DenseToSparseSetOperation")
       c->set_output(1, c->Vector(c->UnknownDim()));
       c->set_output(2, c->Vector(output_rank_dim));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Applies set operation along last dimension of `Tensor` and `SparseTensor`.
-
-See SetOperationOp::SetOperationFromContext for values of `set_operation`.
-
-Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-ignored.
-
-If `validate_indices` is `True`, this op validates the order and range of `set2`
-indices.
-
-Output `result` is a `SparseTensor` represented by `result_indices`,
-`result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-dimension contains the result of `set_operation` applied to the corresponding
-`[0...n-1]` dimension of `set`.
-
-set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-    Dimension `n` contains values in a set, duplicates are allowed but ignored.
-set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-    order.
-set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-    order.
-set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-    be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
-    max set size across `n-1` dimensions.
-result_indices: 2D indices of a `SparseTensor`.
-result_values: 1D values of a `SparseTensor`.
-result_shape: 1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-    the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-    is the max result set size across all `0...n-1` dimensions.
-)doc");
+    });
 
 REGISTER_OP("SparseToSparseSetOperation")
     .Input("set1_indices: int64")
@@ -258,53 +186,6 @@ REGISTER_OP("SparseToSparseSetOperation")
       c->set_output(1, c->Vector(c->UnknownDim()));
       c->set_output(2, c->Vector(output_rank_dim));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Applies set operation along last dimension of 2 `SparseTensor` inputs.
-
-See SetOperationOp::SetOperationFromContext for values of `set_operation`.
-
-If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
-order and range of `set1` and `set2` indices.
-
-Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
-and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
-as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
-ignored.
-
-Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-ignored.
-
-If `validate_indices` is `True`, this op validates the order and range of `set1`
-and `set2` indices.
-
-Output `result` is a `SparseTensor` represented by `result_indices`,
-`result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-dimension contains the result of `set_operation` applied to the corresponding
-`[0...n-1]` dimension of `set`.
-
-set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-    order.
-set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-    order.
-set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
-    be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
-    max set size across `0...n-1` dimensions.
-set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-    order.
-set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-    order.
-set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-    be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
-    max set size across `0...n-1` dimensions.
-result_indices: 2D indices of a `SparseTensor`.
-result_values: 1D values of a `SparseTensor`.
-result_shape: 1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-    the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-    is the max result set size across all `0...n-1` dimensions.
-)doc");
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/sparse_ops.cc b/tensorflow/core/ops/sparse_ops.cc
index 8414519f0b34c7e8adcef25371257c6b2e7538c1..acc8c782efe7371a42adf8fe587168fd978732a6 100644
--- a/tensorflow/core/ops/sparse_ops.cc
+++ b/tensorflow/core/ops/sparse_ops.cc
@@ -57,26 +57,7 @@ REGISTER_OP("SparseAddGrad")
       c->set_output(0, c->Vector(c->Dim(a_indices, 0)));
       c->set_output(1, c->Vector(c->Dim(b_indices, 0)));
       return Status::OK();
-    })
-    .Doc(R"doc(
-The gradient operator for the SparseAdd op.
-
-The SparseAdd op calculates A + B, where A, B, and the sum are all represented
-as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
-non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
-values of A and B.
-
-backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
-  the non-empty values of the sum.
-a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
-b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
-sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
-  `[nnz(sum), ndims]`.
-a_val_grad: 1-D with shape `[nnz(A)]`. The gradient with respect to the
-  non-empty values of A.
-b_val_grad: 1-D with shape `[nnz(B)]`. The gradient with respect to the
-  non-empty values of B.
-)doc");
+    });
 
 REGISTER_OP("SparseAdd")
     .Input("a_indices: int64")
@@ -99,33 +80,7 @@ REGISTER_OP("SparseAdd")
       c->set_output(1, c->Vector(InferenceContext::kUnknownDim));
       c->set_output(2, a_shape);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Adds two `SparseTensor` objects to produce another `SparseTensor`.
-
-The input `SparseTensor` objects' indices are assumed ordered in standard
-lexicographic order.  If this is not the case, before this step run
-`SparseReorder` to restore index ordering.
-
-By default, if two values sum to zero at some index, the output `SparseTensor`
-would still include that particular location in its index, storing a zero in the
-corresponding value slot.  To override this, callers can specify `thresh`,
-indicating that if the sum has a magnitude strictly smaller than `thresh`, its
-corresponding value and index would then not be included.  In particular,
-`thresh == 0` (default) means everything is kept and actual thresholding happens
-only for a positive value.
-
-In the following shapes, `nnz` is the count after taking `thresh` into account.
-
-a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
-a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
-a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
-b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
-b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
-b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
-thresh: 0-D.  The magnitude threshold that determines if an output value/index
-pair takes space.
-)doc");
+    });
 
 REGISTER_OP("SparseTensorDenseMatMul")
     .Input("a_indices: Tindices")
@@ -161,36 +116,15 @@ REGISTER_OP("SparseTensorDenseMatMul")
       TF_RETURN_IF_ERROR(c->Merge(inner_left, inner_right, &unused_dim));
       c->set_output(0, c->Matrix(output_left, output_right));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
-
-No validity checking is performed on the indices of A.  However, the following
-input format is recommended for optimal behavior:
-
-if adjoint_a == false:
-  A should be sorted in lexicographically increasing order.  Use SparseReorder
-  if you're not sure.
-if adjoint_a == true:
-  A should be sorted in order of increasing dimension 1 (i.e., "column major"
-  order instead of "row major" order).
-
-a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
-a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
-a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
-b: 2-D.  A dense Matrix.
-adjoint_a: Use the adjoint of A in the matrix multiply.  If A is complex, this
-  is transpose(conj(A)).  Otherwise it's transpose(A).
-adjoint_b: Use the adjoint of B in the matrix multiply.  If B is complex, this
-  is transpose(conj(B)).  Otherwise it's transpose(B).
-)doc");
+    });
 
 REGISTER_OP("SerializeSparse")
     .Input("sparse_indices: int64")
     .Input("sparse_values: T")
     .Input("sparse_shape: int64")
     .Attr("T: type")
-    .Output("serialized_sparse: string")
+    .Output("serialized_sparse: out_type")
+    .Attr("out_type: {string, variant} = DT_STRING")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &unused));
@@ -198,21 +132,15 @@ REGISTER_OP("SerializeSparse")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
       c->set_output(0, c->Vector(3));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Serialize a `SparseTensor` into a string 3-vector (1-D `Tensor`) object.
-
-sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-sparse_values: 1-D.  The `values` of the `SparseTensor`.
-sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
-)doc");
+    });
 
 REGISTER_OP("SerializeManySparse")
     .Input("sparse_indices: int64")
     .Input("sparse_values: T")
     .Input("sparse_shape: int64")
     .Attr("T: type")
-    .Output("serialized_sparse: string")
+    .Output("serialized_sparse: out_type")
+    .Attr("out_type: {string, variant} = DT_STRING")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &unused));
@@ -220,29 +148,15 @@ REGISTER_OP("SerializeManySparse")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
       c->set_output(0, c->Matrix(InferenceContext::kUnknownDim, 3));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` string `Tensor`.
-
-The `SparseTensor` must have rank `R` greater than 1, and the first dimension
-is treated as the minibatch dimension.  Elements of the `SparseTensor`
-must be sorted in increasing order of this first dimension.  The serialized
-`SparseTensor` objects going into each row of `serialized_sparse` will have
-rank `R-1`.
-
-The minibatch size `N` is extracted from `sparse_shape[0]`.
-
-sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-)doc");
+    });
 
 REGISTER_OP("DeserializeSparse")
-    .Input("serialized_sparse: string")
-    .Attr("dtype: type")
+    .Input("serialized_sparse: Tserialized")
     .Output("sparse_indices: int64")
     .Output("sparse_values: dtype")
     .Output("sparse_shape: int64")
+    .Attr("dtype: type")
+    .Attr("Tserialized: {string, variant} = DT_STRING")
     .SetShapeFn([](InferenceContext* c) {
       // serialized sparse is [?, ..., ?, 3] vector.
       DimensionHandle unused;
@@ -252,21 +166,14 @@ REGISTER_OP("DeserializeSparse")
       c->set_output(1, c->Vector(InferenceContext::kUnknownDim));
       c->set_output(2, c->Vector(InferenceContext::kUnknownDim));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Deserialize `SparseTensor` objects.
-
-serialized_sparse: The serialized `SparseTensor` objects. The last dimension
-  must have 3 columns.
-dtype: The `dtype` of the serialized `SparseTensor` objects.
-)doc");
+    });
 
 REGISTER_OP("DeserializeManySparse")
     .Input("serialized_sparse: string")
-    .Attr("dtype: type")
     .Output("sparse_indices: int64")
     .Output("sparse_values: dtype")
     .Output("sparse_shape: int64")
+    .Attr("dtype: type")
     .SetShapeFn([](InferenceContext* c) {
       // serialized sparse is [?,3] matrix.
       ShapeHandle serialized_sparse;
@@ -280,56 +187,7 @@ REGISTER_OP("DeserializeManySparse")
       c->set_output(1, c->Vector(InferenceContext::kUnknownDim));
       c->set_output(2, c->Vector(InferenceContext::kUnknownDim));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Deserialize and concatenate `SparseTensors` from a serialized minibatch.
-
-The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
-`N` is the minibatch size and the rows correspond to packed outputs of
-`SerializeSparse`.  The ranks of the original `SparseTensor` objects
-must all match.  When the final `SparseTensor` is created, it has rank one
-higher than the ranks of the incoming `SparseTensor` objects
-(they have been concatenated along a new row dimension).
-
-The output `SparseTensor` object's shape values for all dimensions but the
-first are the max across the input `SparseTensor` objects' shape values
-for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-size.
-
-The input `SparseTensor` objects' indices are assumed ordered in
-standard lexicographic order.  If this is not the case, after this
-step run `SparseReorder` to restore index ordering.
-
-For example, if the serialized input is a `[2 x 3]` matrix representing two
-original `SparseTensor` objects:
-
-    index = [ 0]
-            [10]
-            [20]
-    values = [1, 2, 3]
-    shape = [50]
-
-and
-
-    index = [ 2]
-            [10]
-    values = [4, 5]
-    shape = [30]
-
-then the final deserialized `SparseTensor` will be:
-
-    index = [0  0]
-            [0 10]
-            [0 20]
-            [1  2]
-            [1 10]
-    values = [1, 2, 3, 4, 5]
-    shape = [2 50]
-
-serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
-  Must have 3 columns.
-dtype: The `dtype` of the serialized `SparseTensor` objects.
-)doc");
+    });
 
 REGISTER_OP("SparseToDense")
     .Input("sparse_indices: Tindices")
@@ -345,41 +203,7 @@ REGISTER_OP("SparseToDense")
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &out));
       c->set_output(0, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Converts a sparse representation into a dense tensor.
-
-Builds an array `dense` with shape `output_shape` such that
-
-```
-# If sparse_indices is scalar
-dense[i] = (i == sparse_indices ? sparse_values : default_value)
-
-# If sparse_indices is a vector, then for each i
-dense[sparse_indices[i]] = sparse_values[i]
-
-# If sparse_indices is an n by d matrix, then for each i in [0, n)
-dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
-```
-
-All other values in `dense` are set to `default_value`.  If `sparse_values` is a
-scalar, all sparse indices are set to this single value.
-
-Indices should be sorted in lexicographic order, and indices must not
-contain any repeats. If `validate_indices` is true, these properties
-are checked during execution.
-
-sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
-  index where `sparse_values[i]` will be placed.
-output_shape: 1-D.  Shape of the dense output tensor.
-sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
-  or a scalar value to be used for all sparse indices.
-default_value: Scalar value to set for indices not specified in
-  `sparse_indices`.
-validate_indices: If true, indices are checked to make sure they are sorted in
-  lexicographic order and that there are no repeats.
-dense: Dense output tensor of shape `output_shape`.
-)doc");
+    });
 
 REGISTER_OP("SparseConcat")
     .Input("indices: N * int64")
@@ -424,61 +248,7 @@ REGISTER_OP("SparseConcat")
       c->set_output(1, c->Vector(output_row_count));
       c->set_output(2, output_shape);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Concatenates a list of `SparseTensor` along the specified dimension.
-
-Concatenation is with respect to the dense versions of these sparse tensors.
-It is assumed that each input is a `SparseTensor` whose elements are ordered
-along increasing dimension number.
-
-All inputs' shapes must match, except for the concat dimension.  The
-`indices`, `values`, and `shapes` lists must have the same length.
-
-The output shape is identical to the inputs', except along the concat
-dimension, where it is the sum of the inputs' sizes along that dimension.
-
-The output elements will be resorted to preserve the sort order along
-increasing dimension number.
-
-This op runs in `O(M log M)` time, where `M` is the total number of non-empty
-values across all inputs. This is due to the need for an internal sort in
-order to concatenate efficiently across an arbitrary dimension.
-
-For example, if `concat_dim = 1` and the inputs are
-
-    sp_inputs[0]: shape = [2, 3]
-    [0, 2]: "a"
-    [1, 0]: "b"
-    [1, 1]: "c"
-
-    sp_inputs[1]: shape = [2, 4]
-    [0, 1]: "d"
-    [0, 2]: "e"
-
-then the output will be
-
-    shape = [2, 7]
-    [0, 2]: "a"
-    [0, 4]: "d"
-    [0, 5]: "e"
-    [1, 0]: "b"
-    [1, 1]: "c"
-
-Graphically this is equivalent to doing
-
-    [    a] concat [  d e  ] = [    a   d e  ]
-    [b c  ]        [       ]   [b c          ]
-
-indices: 2-D.  Indices of each input `SparseTensor`.
-values: 1-D.  Non-empty values of each `SparseTensor`.
-shapes: 1-D.  Shapes of each `SparseTensor`.
-output_indices: 2-D.  Indices of the concatenated `SparseTensor`.
-output_values: 1-D.  Non-empty values of the concatenated `SparseTensor`.
-output_shape: 1-D.  Shape of the concatenated `SparseTensor`.
-concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
-    where rank is the number of dimensions in each input `SparseTensor`.
-)doc");
+    });
 
 REGISTER_OP("SparseCross")
     .Input("indices: N * int64")
@@ -501,62 +271,7 @@ REGISTER_OP("SparseCross")
       c->set_output(1, c->Vector(c->UnknownDim()));
       c->set_output(2, c->Vector(2));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Generates sparse cross from a list of sparse and dense tensors.
-
-The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
-representing features of one feature column. It outputs a 2D `SparseTensor` with
-the batchwise crosses of these features.
-
-For example, if the inputs are
-
-    inputs[0]: SparseTensor with shape = [2, 2]
-    [0, 0]: "a"
-    [1, 0]: "b"
-    [1, 1]: "c"
-
-    inputs[1]: SparseTensor with shape = [2, 1]
-    [0, 0]: "d"
-    [1, 0]: "e"
-
-    inputs[2]: Tensor [["f"], ["g"]]
-
-then the output will be
-
-    shape = [2, 2]
-    [0, 0]: "a_X_d_X_f"
-    [1, 0]: "b_X_e_X_g"
-    [1, 1]: "c_X_e_X_g"
-
-if hashed_output=true then the output will be
-
-    shape = [2, 2]
-    [0, 0]: FingerprintCat64(
-                Fingerprint64("f"), FingerprintCat64(
-                    Fingerprint64("d"), Fingerprint64("a")))
-    [1, 0]: FingerprintCat64(
-                Fingerprint64("g"), FingerprintCat64(
-                    Fingerprint64("e"), Fingerprint64("b")))
-    [1, 1]: FingerprintCat64(
-                Fingerprint64("g"), FingerprintCat64(
-                    Fingerprint64("e"), Fingerprint64("c")))
-
-indices: 2-D.  Indices of each input `SparseTensor`.
-values: 1-D.   values of each `SparseTensor`.
-shapes: 1-D.   Shapes of each `SparseTensor`.
-dense_inputs: 2-D.    Columns represented by dense `Tensor`.
-hashed_output: If true, returns the hash of the cross instead of the string.
-  This will allow us avoiding string manipulations.
-num_buckets: It is used if hashed_output is true.
-  output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
-hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
-  function to combine the crosses fingerprints.
-output_indices: 2-D.  Indices of the concatenated `SparseTensor`.
-output_values: 1-D.  Non-empty values of the concatenated or hashed
-  `SparseTensor`.
-output_shape: 1-D.  Shape of the concatenated `SparseTensor`.
-)doc");
+    });
 
 REGISTER_OP("SparseSplit")
     .Input("split_dim: int64")
@@ -585,41 +300,7 @@ REGISTER_OP("SparseSplit")
       for (int i = 0; i < num_splits; ++i)
         c->set_output(out_idx++, output_shape);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Split a `SparseTensor` into `num_split` tensors along one dimension.
-
-If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
-`[0 : shape[split_dim] % num_split]` gets one extra dimension.
-For example, if `split_dim = 1` and `num_split = 2` and the input is
-
-    input_tensor = shape = [2, 7]
-    [    a   d e  ]
-    [b c          ]
-
-Graphically the output tensors are:
-
-    output_tensor[0] = shape = [2, 4]
-    [    a  ]
-    [b c    ]
-
-    output_tensor[1] = shape = [2, 3]
-    [ d e  ]
-    [      ]
-
-split_dim: 0-D.  The dimension along which to split.  Must be in the range
-  `[0, rank(shape))`.
-num_split: The number of ways to split.
-indices: 2-D tensor represents the indices of the sparse tensor.
-values: 1-D tensor represents the values of the sparse tensor.
-shape: 1-D. tensor represents the shape of the sparse tensor.
-output indices: A list of 1-D tensors represents the indices of the output
-sparse tensors.
-output_values: A list of 1-D tensors represents the values of the output sparse
-  tensors.
-output_shape: A list of 1-D tensors represents the shape of the output sparse
-  tensors.
-)doc");
+    });
 
 REGISTER_OP("SparseSlice")
     .Input("indices: int64")
@@ -642,38 +323,7 @@ REGISTER_OP("SparseSlice")
       c->set_output(1, output_values);
       c->set_output(2, output_shape);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Slice a `SparseTensor` based on the `start` and `size`.
-
-For example, if the input is
-
-    input_tensor = shape = [2, 7]
-    [    a   d e  ]
-    [b c          ]
-
-Graphically the output tensors are:
-
-    sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
-    [    a  ]
-    [b c    ]
-
-    sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
-    [ d e  ]
-    [      ]
-
-indices: 2-D tensor represents the indices of the sparse tensor.
-values: 1-D tensor represents the values of the sparse tensor.
-shape: 1-D. tensor represents the shape of the sparse tensor.
-start: 1-D. tensor represents the start of the slice.
-size: 1-D. tensor represents the size of the slice.
-output indices: A list of 1-D tensors represents the indices of the output
-sparse tensors.
-output_values: A list of 1-D tensors represents the values of the output sparse
-  tensors.
-output_shape: A list of 1-D tensors represents the shape of the output sparse
-  tensors.
-)doc");
+    });
 
 REGISTER_OP("SparseReorder")
     .Input("input_indices: int64")
@@ -694,27 +344,7 @@ REGISTER_OP("SparseReorder")
       c->set_output(0, indices);
       c->set_output(1, values);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Reorders a SparseTensor into the canonical, row-major ordering.
-
-Note that by convention, all sparse ops preserve the canonical ordering along
-increasing dimension number. The only time ordering can be violated is during
-manual manipulation of the indices and values vectors to add entries.
-
-Reordering does not affect the shape of the SparseTensor.
-
-If the tensor has rank `R` and `N` non-empty values, `input_indices` has
-shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
-
-input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-  SparseTensor, possibly not in canonical ordering.
-input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-input_shape: 1-D.  Shape of the input SparseTensor.
-output_indices: 2-D.  `N x R` matrix with the same indices as input_indices, but
-  in canonical row-major ordering.
-output_values: 1-D.  `N` non-empty values corresponding to `output_indices`.
-)doc");
+    });
 
 REGISTER_OP("SparseReshape")
     .Input("input_indices: int64")
@@ -734,36 +364,7 @@ REGISTER_OP("SparseReshape")
       c->set_output(0, c->Matrix(c->Dim(indices, 0), c->Dim(new_shape, 0)));
       c->set_output(1, new_shape);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Reshapes a SparseTensor to represent values in a new dense shape.
-
-This operation has the same semantics as reshape on the represented dense
-tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
-
-If one component of `new_shape` is the special value -1, the size of that
-dimension is computed so that the total dense size remains constant.  At
-most one component of `new_shape` can be -1.  The number of dense elements
-implied by `new_shape` must be the same as the number of dense elements
-originally implied by `input_shape`.
-
-Reshaping does not affect the order of values in the SparseTensor.
-
-If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
-has length `R_out`, then `input_indices` has shape `[N, R_in]`,
-`input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
-`output_shape` has length `R_out`.
-
-input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
-  SparseTensor.
-input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
-new_shape: 1-D.  `R_out` vector with the requested new dense shape.
-output_indices: 2-D.  `N x R_out` matrix with the updated indices of non-empty
-  values in the output SparseTensor.
-output_shape: 1-D.  `R_out` vector with the full dense shape of the output
-  SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
-  filled in.
-)doc");
+    });
 
 REGISTER_OP("SparseTensorDenseAdd")
     .Input("a_indices: Tindices")
@@ -776,17 +377,7 @@ REGISTER_OP("SparseTensorDenseAdd")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->input(3));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
-
-This Op does not require `a_indices` be sorted in standard lexicographic order.
-
-a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
-a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
-a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
-b: `ndims`-D Tensor.  With shape `a_shape`.
-)doc");
+    });
 
 REGISTER_OP("SparseReduceMax")
     .Input("input_indices: int64")
@@ -796,31 +387,7 @@ REGISTER_OP("SparseReduceMax")
     .Attr("keep_dims: bool = False")
     .Output("output: T")
     .Attr("T: realnumbertype")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Computes the max of elements across dimensions of a SparseTensor.
-
-This Op takes a SparseTensor and is the sparse counterpart to
-`tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
-instead of a sparse one.
-
-Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-with length 1.
-
-If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-with a single element is returned.  Additionally, the axes can be negative,
-which are interpreted according to the indexing rules in Python.
-
-input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-  SparseTensor, possibly not in canonical ordering.
-input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-input_shape: 1-D.  Shape of the input SparseTensor.
-reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-keep_dims: If true, retain reduced dimensions with length 1.
-output: `R-K`-D.  The reduced Tensor.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("SparseReduceMaxSparse")
     .Input("input_indices: int64")
@@ -832,30 +399,7 @@ REGISTER_OP("SparseReduceMaxSparse")
     .Output("output_values: T")
     .Output("output_shape: int64")
     .Attr("T: realnumbertype")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Computes the max of elements across dimensions of a SparseTensor.
-
-This Op takes a SparseTensor and is the sparse counterpart to
-`tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
-SparseTensor.
-
-Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-with length 1.
-
-If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-with a single element is returned.  Additionally, the axes can be negative,
-which are interpreted according to the indexing rules in Python.
-
-input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-  SparseTensor, possibly not in canonical ordering.
-input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-input_shape: 1-D.  Shape of the input SparseTensor.
-reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-keep_dims: If true, retain reduced dimensions with length 1.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("SparseReduceSum")
     .Input("input_indices: int64")
@@ -865,31 +409,7 @@ REGISTER_OP("SparseReduceSum")
     .Attr("keep_dims: bool = False")
     .Output("output: T")
     .Attr("T: numbertype")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Computes the sum of elements across dimensions of a SparseTensor.
-
-This Op takes a SparseTensor and is the sparse counterpart to
-`tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
-instead of a sparse one.
-
-Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-with length 1.
-
-If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-with a single element is returned.  Additionally, the axes can be negative,
-which are interpreted according to the indexing rules in Python.
-
-input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-  SparseTensor, possibly not in canonical ordering.
-input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-input_shape: 1-D.  Shape of the input SparseTensor.
-reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-keep_dims: If true, retain reduced dimensions with length 1.
-output: `R-K`-D.  The reduced Tensor.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("SparseReduceSumSparse")
     .Input("input_indices: int64")
@@ -901,30 +421,7 @@ REGISTER_OP("SparseReduceSumSparse")
     .Output("output_values: T")
     .Output("output_shape: int64")
     .Attr("T: numbertype")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Computes the sum of elements across dimensions of a SparseTensor.
-
-This Op takes a SparseTensor and is the sparse counterpart to
-`tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
-SparseTensor.
-
-Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-with length 1.
-
-If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-with a single element is returned.  Additionally, the axes can be negative,
-which are interpreted according to the indexing rules in Python.
-
-input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-  SparseTensor, possibly not in canonical ordering.
-input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-input_shape: 1-D.  Shape of the input SparseTensor.
-reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-keep_dims: If true, retain reduced dimensions with length 1.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 #define SPARSE_DENSE_CWISE_SIGNATURE()                           \
   Input("sp_indices: int64")                                     \
@@ -940,63 +437,11 @@ keep_dims: If true, retain reduced dimensions with length 1.
         return Status::OK();                                     \
       })
 
-REGISTER_OP("SparseDenseCwiseMul")
-    .SPARSE_DENSE_CWISE_SIGNATURE()
-    .Doc(R"doc(
-Component-wise multiplies a SparseTensor by a dense Tensor.
-
-The output locations corresponding to the implicitly zero elements in the sparse
-tensor will be zero (i.e., will not take up storage space), regardless of the
-contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
-
-*Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-the other direction.
-
-sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-  SparseTensor, possibly not in canonical ordering.
-sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-sp_shape: 1-D.  Shape of the input SparseTensor.
-dense: `R`-D.  The dense Tensor operand.
-output: 1-D.  The `N` values that are operated on.
-)doc");
-
-REGISTER_OP("SparseDenseCwiseDiv")
-    .SPARSE_DENSE_CWISE_SIGNATURE()
-    .Doc(R"doc(
-Component-wise divides a SparseTensor by a dense Tensor.
-
-*Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-the other direction.
-
-sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-  SparseTensor, possibly not in canonical ordering.
-sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-sp_shape: 1-D.  Shape of the input SparseTensor.
-dense: `R`-D.  The dense Tensor operand.
-output: 1-D.  The `N` values that are operated on.
-)doc");
-
-REGISTER_OP("SparseDenseCwiseAdd")
-    .SPARSE_DENSE_CWISE_SIGNATURE()
-    .Doc(R"doc(
-Adds up a SparseTensor and a dense Tensor, using these special rules:
-
-(1) Broadcasts the dense side to have the same shape as the sparse side, if
-    eligible;
-(2) Then, only the dense values pointed to by the indices of the SparseTensor
-    participate in the cwise addition.
+REGISTER_OP("SparseDenseCwiseMul").SPARSE_DENSE_CWISE_SIGNATURE();
 
-By these rules, the result is a logical SparseTensor with exactly the same
-indices and shape, but possibly with different non-zero values.  The output of
-this Op is the resultant non-zero values.
+REGISTER_OP("SparseDenseCwiseDiv").SPARSE_DENSE_CWISE_SIGNATURE();
 
-sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-  SparseTensor, possibly not in canonical ordering.
-sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-sp_shape: 1-D.  Shape of the input SparseTensor.
-dense: `R`-D.  The dense Tensor operand.
-output: 1-D.  The `N` values that are operated on.
-)doc");
+REGISTER_OP("SparseDenseCwiseAdd").SPARSE_DENSE_CWISE_SIGNATURE();
 
 #undef SPARSE_DENSE_CWISE_SIGNATURE
 
@@ -1014,32 +459,7 @@ REGISTER_OP("SparseSoftmax")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
       c->set_output(0, values);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Applies softmax to a batched N-D `SparseTensor`.
-
-The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
-(where `N >= 2`), and with indices sorted in the canonical lexicographic order.
-
-This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
-logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
-zero elements do not participate*.  Specifically, the algorithm is equivalent
-to the following:
-
-  (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
-      with shape `[B, C]`, along the size-C dimension;
-  (2) Masks out the original implicitly-zero locations;
-  (3) Renormalizes the remaining elements.
-
-Hence, the `SparseTensor` result has exactly the same non-zero indices and
-shape.
-
-sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
-  SparseTensor, in canonical ordering.
-sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
-sp_shape: 1-D.  Shape of the input SparseTensor.
-output: 1-D.  The `NNZ` values for the result `SparseTensor`.
-)doc");
+    });
 
 REGISTER_OP("SparseSparseMaximum")
     .Input("a_indices: int64")
@@ -1051,23 +471,7 @@ REGISTER_OP("SparseSparseMaximum")
     .Output("output_indices: int64")
     .Output("output_values: T")
     .Attr("T: realnumbertype")
-    .SetShapeFn(SparseSparseMinOrMaxShapeFn)
-    .Doc(R"doc(
-Returns the element-wise max of two SparseTensors.
-
-Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
-
-a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-  SparseTensor, in the canonical lexicographic ordering.
-a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-a_shape: 1-D.  Shape of the input SparseTensor.
-b_indices: counterpart to `a_indices` for the other operand.
-b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
-
-output_indices: 2-D.  The indices of the output SparseTensor.
-output_values: 1-D.  The values of the output SparseTensor.
-)doc");
+    .SetShapeFn(SparseSparseMinOrMaxShapeFn);
 
 REGISTER_OP("SparseSparseMinimum")
     .Input("a_indices: int64")
@@ -1079,23 +483,7 @@ REGISTER_OP("SparseSparseMinimum")
     .Output("output_indices: int64")
     .Output("output_values: T")
     .Attr("T: numbertype")
-    .SetShapeFn(SparseSparseMinOrMaxShapeFn)
-    .Doc(R"doc(
-Returns the element-wise min of two SparseTensors.
-
-Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
-
-a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-  SparseTensor, in the canonical lexicographic ordering.
-a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-a_shape: 1-D.  Shape of the input SparseTensor.
-b_indices: counterpart to `a_indices` for the other operand.
-b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
-
-output_indices: 2-D.  The indices of the output SparseTensor.
-output_values: 1-D.  The values of the output SparseTensor.
-)doc");
+    .SetShapeFn(SparseSparseMinOrMaxShapeFn);
 
 REGISTER_OP("AddSparseToTensorsMap")
     .Input("sparse_indices: int64")
@@ -1113,34 +501,7 @@ REGISTER_OP("AddSparseToTensorsMap")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
       c->set_output(0, c->Scalar());
       return Status::OK();
-    })
-    .Doc(R"doc(
-Add a `SparseTensor` to a `SparseTensorsMap` return its handle.
-
-A `SparseTensor` is represented by three tensors: `sparse_indices`,
-`sparse_values`, and `sparse_shape`.
-
-This operator takes the given `SparseTensor` and adds it to a container
-object (a `SparseTensorsMap`).  A unique key within this container is generated
-in the form of an `int64`, and this is the value that is returned.
-
-The `SparseTensor` can then be read out as part of a minibatch by passing
-the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
-the correct `SparseTensorsMap` is accessed, ensure that the same
-`container` and `shared_name` are passed to that Op.  If no `shared_name`
-is provided here, instead use the *name* of the Operation created by calling
-`AddSparseToTensorsMap` as the `shared_name` passed to
-`TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
-
-sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-sparse_values: 1-D.  The `values` of the `SparseTensor`.
-sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
-sparse_handle: 0-D.  The handle of the `SparseTensor` now stored in the
-  `SparseTensorsMap`.
-container: The container name for the `SparseTensorsMap` created by this op.
-shared_name: The shared name for the `SparseTensorsMap` created by this op.
-  If blank, the new Operation's unique name is used.
-)doc");
+    });
 
 REGISTER_OP("AddManySparseToTensorsMap")
     .Input("sparse_indices: int64")
@@ -1158,44 +519,7 @@ REGISTER_OP("AddManySparseToTensorsMap")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
       c->set_output(0, c->Vector(InferenceContext::kUnknownDim));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
-
-A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
-`sparse_values`, and `sparse_shape`, where
-
-```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
-
-An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
-having a first `sparse_indices` column taking values between `[0, N)`, where
-the minibatch size `N == sparse_shape[0]`.
-
-The input `SparseTensor` must have rank `R` greater than 1, and the first
-dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
-must be sorted in increasing order of this first dimension.  The stored
-`SparseTensor` objects pointed to by each row of the output `sparse_handles`
-will have rank `R-1`.
-
-The `SparseTensor` values can then be read out as part of a minibatch by passing
-the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
-the correct `SparseTensorsMap` is accessed, ensure that the same
-`container` and `shared_name` are passed to that Op.  If no `shared_name`
-is provided here, instead use the *name* of the Operation created by calling
-`AddManySparseToTensorsMap` as the `shared_name` passed to
-`TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
-
-sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-  `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
-sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-  The minibatch size `N == sparse_shape[0]`.
-sparse_handles: 1-D.  The handles of the `SparseTensor` now stored in the
-  `SparseTensorsMap`.  Shape: `[N]`.
-container: The container name for the `SparseTensorsMap` created by this op.
-shared_name: The shared name for the `SparseTensorsMap` created by this op.
-  If blank, the new Operation's unique name is used.
-)doc");
+    });
 
 REGISTER_OP("TakeManySparseFromTensorsMap")
     .Input("sparse_handles: int64")
@@ -1216,71 +540,7 @@ REGISTER_OP("TakeManySparseFromTensorsMap")
       c->set_output(1, c->Vector(InferenceContext::kUnknownDim));
       c->set_output(2, c->Vector(InferenceContext::kUnknownDim));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
-
-The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
-`N` is the minibatch size and the rows correspond to the output handles of
-`AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
-original `SparseTensor` objects that went into the given input ops must all
-match.  When the final `SparseTensor` is created, it has rank one
-higher than the ranks of the incoming `SparseTensor` objects
-(they have been concatenated along a new row dimension on the left).
-
-The output `SparseTensor` object's shape values for all dimensions but the
-first are the max across the input `SparseTensor` objects' shape values
-for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-size.
-
-The input `SparseTensor` objects' indices are assumed ordered in
-standard lexicographic order.  If this is not the case, after this
-step run `SparseReorder` to restore index ordering.
-
-For example, if the handles represent an input, which is a `[2, 3]` matrix
-representing two original `SparseTensor` objects:
-
-```
-    index = [ 0]
-            [10]
-            [20]
-    values = [1, 2, 3]
-    shape = [50]
-```
-
-and
-
-```
-    index = [ 2]
-            [10]
-    values = [4, 5]
-    shape = [30]
-```
-
-then the final `SparseTensor` will be:
-
-```
-    index = [0  0]
-            [0 10]
-            [0 20]
-            [1  2]
-            [1 10]
-    values = [1, 2, 3, 4, 5]
-    shape = [2 50]
-```
-
-sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
-  Shape: `[N]`.
-sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-dtype: The `dtype` of the `SparseTensor` objects stored in the
-  `SparseTensorsMap`.
-container: The container name for the `SparseTensorsMap` read by this op.
-shared_name: The shared name for the `SparseTensorsMap` read by this op.
-  It should not be blank; rather the `shared_name` or unique Operation name
-  of the Op that created the original `SparseTensorsMap` should be used.
-)doc");
+    });
 
 REGISTER_OP("SparseFillEmptyRows")
     .Input("indices: int64")
@@ -1319,59 +579,7 @@ REGISTER_OP("SparseFillEmptyRows")
       c->set_output(2, empty_row_indicator);
       c->set_output(3, reverse_index_map);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Fills empty rows in the input 2-D `SparseTensor` with a default value.
-
-The input `SparseTensor` is represented via the tuple of inputs
-(`indices`, `values`, `dense_shape`).  The output `SparseTensor` has the
-same `dense_shape` but with indices `output_indices` and values
-`output_values`.
-
-This op inserts a single entry for every row that doesn't have any values.
-The index is created as `[row, 0, ..., 0]` and the inserted value
-is `default_value`.
-
-For example, suppose `sp_input` has shape `[5, 6]` and non-empty values:
-
-    [0, 1]: a
-    [0, 3]: b
-    [2, 0]: c
-    [3, 1]: d
-
-Rows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values:
-
-    [0, 1]: a
-    [0, 3]: b
-    [1, 0]: default_value
-    [2, 0]: c
-    [3, 1]: d
-    [4, 0]: default_value
-
-The output `SparseTensor` will be in row-major order and will have the
-same shape as the input.
-
-This op also returns an indicator vector shaped `[dense_shape[0]]` such that
-
-    empty_row_indicator[i] = True iff row i was an empty row.
-
-And a reverse index map vector shaped `[indices.shape[0]]` that is used during
-backpropagation,
-
-    reverse_index_map[j] = out_j s.t. indices[j, :] == output_indices[out_j, :]
-
-
-indices: 2-D. the indices of the sparse tensor.
-values: 1-D. the values of the sparse tensor.
-dense_shape: 1-D. the shape of the sparse tensor.
-default_value: 0-D. default value to insert into location `[row, 0, ..., 0]`
-  for rows missing from the input sparse tensor.
-output indices: 2-D. the indices of the filled sparse tensor.
-output_values: 1-D. the values of the filled sparse tensor.
-empty_row_indicator: 1-D. whether the dense row was missing in the
-  input sparse tensor.
-reverse_index_map: 1-D. a map from the input indices to the output indices.
-)doc");
+    });
 
 REGISTER_OP("SparseFillEmptyRowsGrad")
     .Input("reverse_index_map: int64")
@@ -1387,23 +595,6 @@ REGISTER_OP("SparseFillEmptyRowsGrad")
       c->set_output(0, reverse_index_map);
       c->set_output(1, c->Scalar());
       return Status::OK();
-    })
-    .Doc(R"doc(
-The gradient of SparseFillEmptyRows.
-
-Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
-shaped `[N_full]`, where `N_full >= N` and copies data into either
-`d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
-`d_default_value` is a scalar.
-
-  d_values[j] = grad_values[reverse_index_map[j]]
-  d_default_value = sum_{k : 0 .. N_full - 1} (
-     grad_values[k] * 1{k not in reverse_index_map})
-
-reverse_index_map: 1-D.  The reverse index map from SparseFillEmptyRows.
-grad_values: 1-D.  The gradients from backprop.
-d_values: 1-D.  The backprop into values.
-d_default_value: 0-D.  The backprop into default_value.
-)doc");
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/spectral_ops.cc b/tensorflow/core/ops/spectral_ops.cc
index 592aaa25c3e76186a2dfcf720f62a7a97e10fbf1..508cea3495a9e811d4d12bf022b0ddfdcb33d718 100644
--- a/tensorflow/core/ops/spectral_ops.cc
+++ b/tensorflow/core/ops/spectral_ops.cc
@@ -29,126 +29,42 @@ REGISTER_OP("FFT")
     .Output("output: complex64")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 1);
-    })
-    .Doc(R"doc(
-Fast Fourier transform.
-
-Computes the 1-dimensional discrete Fourier transform over the inner-most
-dimension of `input`.
-
-input: A complex64 tensor.
-output: A complex64 tensor of the same shape as `input`. The inner-most
-  dimension of `input` is replaced with its 1D Fourier transform.
-
-@compatibility(numpy)
-Equivalent to np.fft.fft
-@end_compatibility
-)doc");
+    });
 
 REGISTER_OP("IFFT")
     .Input("input: complex64")
     .Output("output: complex64")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 1);
-    })
-    .Doc(R"doc(
-Inverse fast Fourier transform.
-
-Computes the inverse 1-dimensional discrete Fourier transform over the
-inner-most dimension of `input`.
-
-input: A complex64 tensor.
-output: A complex64 tensor of the same shape as `input`. The inner-most
-  dimension of `input` is replaced with its inverse 1D Fourier transform.
-
-@compatibility(numpy)
-Equivalent to np.fft.ifft
-@end_compatibility
-)doc");
+    });
 
 REGISTER_OP("FFT2D")
     .Input("input: complex64")
     .Output("output: complex64")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 2);
-    })
-    .Doc(R"doc(
-2D fast Fourier transform.
-
-Computes the 2-dimensional discrete Fourier transform over the inner-most
-2 dimensions of `input`.
-
-input: A complex64 tensor.
-output: A complex64 tensor of the same shape as `input`. The inner-most 2
-  dimensions of `input` are replaced with their 2D Fourier transform.
-
-@compatibility(numpy)
-Equivalent to np.fft.fft2
-@end_compatibility
-)doc");
+    });
 
 REGISTER_OP("IFFT2D")
     .Input("input: complex64")
     .Output("output: complex64")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 2);
-    })
-    .Doc(R"doc(
-Inverse 2D fast Fourier transform.
-
-Computes the inverse 2-dimensional discrete Fourier transform over the
-inner-most 2 dimensions of `input`.
-
-input: A complex64 tensor.
-output: A complex64 tensor of the same shape as `input`. The inner-most 2
-  dimensions of `input` are replaced with their inverse 2D Fourier transform.
-
-@compatibility(numpy)
-Equivalent to np.fft.ifft2
-@end_compatibility
-)doc");
+    });
 
 REGISTER_OP("FFT3D")
     .Input("input: complex64")
     .Output("output: complex64")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 3);
-    })
-    .Doc(R"doc(
-3D fast Fourier transform.
-
-Computes the 3-dimensional discrete Fourier transform over the inner-most 3
-dimensions of `input`.
-
-input: A complex64 tensor.
-output: A complex64 tensor of the same shape as `input`. The inner-most 3
-  dimensions of `input` are replaced with their 3D Fourier transform.
-
-@compatibility(numpy)
-Equivalent to np.fft.fftn with 3 dimensions.
-@end_compatibility
-)doc");
+    });
 
 REGISTER_OP("IFFT3D")
     .Input("input: complex64")
     .Output("output: complex64")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 3);
-    })
-    .Doc(R"doc(
-Inverse 3D fast Fourier transform.
-
-Computes the inverse 3-dimensional discrete Fourier transform over the
-inner-most 3 dimensions of `input`.
-
-input: A complex64 tensor.
-output: A complex64 tensor of the same shape as `input`. The inner-most 3
-  dimensions of `input` are replaced with their inverse 3D Fourier transform.
-
-@compatibility(numpy)
-Equivalent to np.fft.ifftn with 3 dimensions.
-@end_compatibility
-)doc");
+    });
 
 Status RFFTShape(InferenceContext* c, const bool forward, const int rank) {
   ShapeHandle out;
@@ -190,196 +106,37 @@ REGISTER_OP("RFFT")
     .Input("input: float")
     .Input("fft_length: int32")
     .Output("output: complex64")
-    .SetShapeFn([](InferenceContext* c) { return RFFTShape(c, true, 1); })
-    .Doc(R"doc(
-Real-valued fast Fourier transform.
-
-Computes the 1-dimensional discrete Fourier transform of a real-valued signal
-over the inner-most dimension of `input`.
-
-Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
-`fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
-followed by the `fft_length / 2` positive-frequency terms.
-
-Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
-corresponding dimension of `input`, the dimension is cropped. If it is larger,
-the dimension is padded with zeros.
-
-input: A float32 tensor.
-fft_length: An int32 tensor of shape [1]. The FFT length.
-output: A complex64 tensor of the same rank as `input`. The inner-most
-  dimension of `input` is replaced with the `fft_length / 2 + 1` unique
-  frequency components of its 1D Fourier transform.
-
-@compatibility(numpy)
-Equivalent to np.fft.rfft
-@end_compatibility
-)doc");
+    .SetShapeFn([](InferenceContext* c) { return RFFTShape(c, true, 1); });
 
 REGISTER_OP("IRFFT")
     .Input("input: complex64")
     .Input("fft_length: int32")
     .Output("output: float")
-    .SetShapeFn([](InferenceContext* c) { return RFFTShape(c, false, 1); })
-    .Doc(R"doc(
-Inverse real-valued fast Fourier transform.
-
-Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
-signal over the inner-most dimension of `input`.
-
-The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
-`fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
-`fft_length` is not provided, it is computed from the size of the inner-most
-dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
-compute `input` is odd, it should be provided since it cannot be inferred
-properly.
-
-Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
-than the corresponding dimension of `input`, the dimension is cropped. If it is
-larger, the dimension is padded with zeros.
-
-input: A complex64 tensor.
-fft_length: An int32 tensor of shape [1]. The FFT length.
-output: A float32 tensor of the same rank as `input`. The inner-most
-  dimension of `input` is replaced with the `fft_length` samples of its inverse
-  1D Fourier transform.
-
-@compatibility(numpy)
-Equivalent to np.fft.irfft
-@end_compatibility
-)doc");
+    .SetShapeFn([](InferenceContext* c) { return RFFTShape(c, false, 1); });
 
 REGISTER_OP("RFFT2D")
     .Input("input: float")
     .Input("fft_length: int32")
     .Output("output: complex64")
-    .SetShapeFn([](InferenceContext* c) { return RFFTShape(c, true, 2); })
-    .Doc(R"doc(
-2D real-valued fast Fourier transform.
-
-Computes the 2-dimensional discrete Fourier transform of a real-valued signal
-over the inner-most 2 dimensions of `input`.
-
-Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
-`fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-of `output`: the zero-frequency term, followed by the `fft_length / 2`
-positive-frequency terms.
-
-Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
-corresponding dimension of `input`, the dimension is cropped. If it is larger,
-the dimension is padded with zeros.
-
-input: A float32 tensor.
-fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
-output: A complex64 tensor of the same rank as `input`. The inner-most 2
-  dimensions of `input` are replaced with their 2D Fourier transform. The
-  inner-most dimension contains `fft_length / 2 + 1` unique frequency
-  components.
-
-@compatibility(numpy)
-Equivalent to np.fft.rfft2
-@end_compatibility
-)doc");
+    .SetShapeFn([](InferenceContext* c) { return RFFTShape(c, true, 2); });
 
 REGISTER_OP("IRFFT2D")
     .Input("input: complex64")
     .Input("fft_length: int32")
     .Output("output: float")
-    .SetShapeFn([](InferenceContext* c) { return RFFTShape(c, false, 2); })
-    .Doc(R"doc(
-Inverse 2D real-valued fast Fourier transform.
-
-Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
-signal over the inner-most 2 dimensions of `input`.
-
-The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
-The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-from the size of the inner-most 2 dimensions of `input`. If the FFT length used
-to compute `input` is odd, it should be provided since it cannot be inferred
-properly.
-
-Along each axis `IRFFT2D` is computed on, if `fft_length` (or
-`fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-corresponding dimension of `input`, the dimension is cropped. If it is larger,
-the dimension is padded with zeros.
-
-input: A complex64 tensor.
-fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
-output: A float32 tensor of the same rank as `input`. The inner-most 2
-  dimensions of `input` are replaced with the `fft_length` samples of their
-  inverse 2D Fourier transform.
-
-@compatibility(numpy)
-Equivalent to np.fft.irfft2
-@end_compatibility
-)doc");
+    .SetShapeFn([](InferenceContext* c) { return RFFTShape(c, false, 2); });
 
 REGISTER_OP("RFFT3D")
     .Input("input: float")
     .Input("fft_length: int32")
     .Output("output: complex64")
-    .SetShapeFn([](InferenceContext* c) { return RFFTShape(c, true, 3); })
-    .Doc(R"doc(
-3D real-valued fast Fourier transform.
-
-Computes the 3-dimensional discrete Fourier transform of a real-valued signal
-over the inner-most 3 dimensions of `input`.
-
-Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
-`fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-of `output`: the zero-frequency term, followed by the `fft_length / 2`
-positive-frequency terms.
-
-Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
-corresponding dimension of `input`, the dimension is cropped. If it is larger,
-the dimension is padded with zeros.
-
-input: A float32 tensor.
-fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
-output: A complex64 tensor of the same rank as `input`. The inner-most 3
-  dimensions of `input` are replaced with the their 3D Fourier transform. The
-  inner-most dimension contains `fft_length / 2 + 1` unique frequency
-  components.
-
-@compatibility(numpy)
-Equivalent to np.fft.rfftn with 3 dimensions.
-@end_compatibility
-)doc");
+    .SetShapeFn([](InferenceContext* c) { return RFFTShape(c, true, 3); });
 
 REGISTER_OP("IRFFT3D")
     .Input("input: complex64")
     .Input("fft_length: int32")
     .Output("output: float")
-    .SetShapeFn([](InferenceContext* c) { return RFFTShape(c, false, 3); })
-    .Doc(R"doc(
-Inverse 3D real-valued fast Fourier transform.
-
-Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
-signal over the inner-most 3 dimensions of `input`.
-
-The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
-The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-from the size of the inner-most 3 dimensions of `input`. If the FFT length used
-to compute `input` is odd, it should be provided since it cannot be inferred
-properly.
-
-Along each axis `IRFFT3D` is computed on, if `fft_length` (or
-`fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-corresponding dimension of `input`, the dimension is cropped. If it is larger,
-the dimension is padded with zeros.
-
-input: A complex64 tensor.
-fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
-output: A float32 tensor of the same rank as `input`. The inner-most 3
-  dimensions of `input` are replaced with the `fft_length` samples of their
-  inverse 3D real Fourier transform.
-
-@compatibility(numpy)
-Equivalent to np.irfftn with 3 dimensions.
-@end_compatibility
-)doc");
+    .SetShapeFn([](InferenceContext* c) { return RFFTShape(c, false, 3); });
 
 // Deprecated ops:
 REGISTER_OP("BatchFFT")
diff --git a/tensorflow/core/ops/spectral_ops_test.cc b/tensorflow/core/ops/spectral_ops_test.cc
index 0f8a3e6ef1366b2de08ee352bc54d1bf874a6bed..b1c5e95fc5ce25496d18202182cc418496349bb6 100644
--- a/tensorflow/core/ops/spectral_ops_test.cc
+++ b/tensorflow/core/ops/spectral_ops_test.cc
@@ -22,7 +22,7 @@ namespace tensorflow {
 TEST(MathOpsTest, FFT_ShapeFn) {
   for (const auto* op_name : {"FFT", "IFFT"}) {
     ShapeInferenceTestOp op(op_name);
-    INFER_OK(op, "?", "?");
+    INFER_OK(op, "?", "in0");
     INFER_ERROR("Shape must be at least rank 1 but is rank 0", op, "[]");
     INFER_OK(op, "[?]", "in0");
     INFER_OK(op, "[1]", "in0");
@@ -31,7 +31,7 @@ TEST(MathOpsTest, FFT_ShapeFn) {
 
   for (const auto* op_name : {"FFT2D", "IFFT2D"}) {
     ShapeInferenceTestOp op(op_name);
-    INFER_OK(op, "?", "?");
+    INFER_OK(op, "?", "in0");
     INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[1]");
     INFER_OK(op, "[?,1]", "in0");
     INFER_OK(op, "[1,2]", "in0");
@@ -40,7 +40,7 @@ TEST(MathOpsTest, FFT_ShapeFn) {
 
   for (const auto* op_name : {"FFT3D", "IFFT3D"}) {
     ShapeInferenceTestOp op(op_name);
-    INFER_OK(op, "?", "?");
+    INFER_OK(op, "?", "in0");
     INFER_ERROR("Shape must be at least rank 3 but is rank 2", op, "[1,2]");
     INFER_OK(op, "[?,1,?]", "in0");
     INFER_OK(op, "[1,2,3]", "in0");
diff --git a/tensorflow/core/ops/state_ops.cc b/tensorflow/core/ops/state_ops.cc
index da5f091e9f1988721b1947ad812851e0322efa9e..7a524b60c0aa711f36158b73b93fa91606266592 100644
--- a/tensorflow/core/ops/state_ops.cc
+++ b/tensorflow/core/ops/state_ops.cc
@@ -28,22 +28,7 @@ REGISTER_OP("VariableV2")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::ExplicitShape)
-    .Doc(R"doc(
-Holds state in the form of a tensor that persists across steps.
-
-Outputs a ref to the tensor state so it may be read or modified.
-TODO(zhifengc/mrry): Adds a pointer to a more detail document
-about sharing states in tensorflow.
-
-ref: A reference to the variable tensor.
-shape: The shape of the variable tensor.
-dtype: The type of elements in the variable tensor.
-container: If non-empty, this variable is placed in the given container.
-        Otherwise, a default container is used.
-shared_name: If non-empty, this variable is named in the given bucket
-             with this shared_name. Otherwise, the node name is used instead.
-)doc");
+    .SetShapeFn(shape_inference::ExplicitShape);
 
 REGISTER_OP("Variable")
     .Output("ref: Ref(dtype)")
@@ -67,23 +52,14 @@ REGISTER_OP("Variable")
       TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &out));
       c->set_output(0, out);
       return Status::OK();
-    })
-    .Doc("Use VariableV2 instead.");
+    });
 
 REGISTER_OP("IsVariableInitialized")
     .Input("ref: Ref(dtype)")
     .Output("is_initialized: bool")
     .Attr("dtype: type")
     .SetAllowsUninitializedInput()
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Checks whether a tensor has been initialized.
-
-Outputs boolean scalar indicating whether the tensor has been initialized.
-
-ref: Should be from a `Variable` node. May be uninitialized.
-dtype: The type of elements in the variable tensor.
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("TemporaryVariable")
     .Output("ref: Ref(dtype)")
@@ -91,53 +67,14 @@ REGISTER_OP("TemporaryVariable")
     .Attr("dtype: type")
     .Attr("var_name: string = ''")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::ExplicitShape)
-    .Doc(R"doc(
-Returns a tensor that may be mutated, but only persists within a single step.
-
-This is an experimental op for internal use only and it is possible to use this
-op in unsafe ways.  DO NOT USE unless you fully understand the risks.
-
-It is the caller's responsibility to ensure that 'ref' is eventually passed to a
-matching 'DestroyTemporaryVariable' op after all other uses have completed.
-
-Outputs a ref to the tensor state so it may be read or modified.
-
-  E.g.
-      var = state_ops._temporary_variable([1, 2], types.float_)
-      var_name = var.op.name
-      var = state_ops.assign(var, [[4.0, 5.0]])
-      var = state_ops.assign_add(var, [[6.0, 7.0]])
-      final = state_ops._destroy_temporary_variable(var, var_name=var_name)
-
-ref: A reference to the variable tensor.
-shape: The shape of the variable tensor.
-dtype: The type of elements in the variable tensor.
-var_name: Overrides the name used for the temporary variable resource. Default
-value is the name of the 'TemporaryVariable' op (which is guaranteed unique).
-)doc");
+    .SetShapeFn(shape_inference::ExplicitShape);
 
 REGISTER_OP("DestroyTemporaryVariable")
     .Input("ref: Ref(T)")
     .Output("value: T")
     .Attr("T: type")
     .Attr("var_name: string")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Destroys the temporary variable and returns its final value.
-
-Sets output to the value of the Tensor pointed to by 'ref', then destroys
-the temporary variable called 'var_name'.
-All other uses of 'ref' *must* have executed before this op.
-This is typically achieved by chaining the ref through each assign op, or by
-using control dependencies.
-
-Outputs the final value of the tensor pointed to by 'ref'.
-
-ref: A reference to the temporary variable tensor.
-var_name: Name of the temporary variable, usually the name of the matching
-'TemporaryVariable' op.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Assign")
     .Input("ref: Ref(T)")
@@ -156,23 +93,7 @@ REGISTER_OP("Assign")
 
       c->set_output(0, c->input(1));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Update 'ref' by assigning 'value' to it.
-
-This operation outputs "ref" after the assignment is done.
-This makes it easier to chain operations that need to use the reset value.
-
-ref: Should be from a `Variable` node. May be uninitialized.
-value: The value to be assigned to the variable.
-validate_shape: If true, the operation will validate that the shape
-  of 'value' matches the shape of the Tensor being assigned to.  If false,
-  'ref' will take on the shape of 'value'.
-use_locking: If True, the assignment will be protected by a lock;
-  otherwise the behavior is undefined, but may exhibit less contention.
-output_ref:= Same as "ref".  Returned as a convenience for operations that want
-  to use the new value after the variable has been reset.
-)doc");
+    });
 
 REGISTER_OP("AssignAdd")
     .Input("ref: Ref(T)")
@@ -180,20 +101,7 @@ REGISTER_OP("AssignAdd")
     .Output("output_ref: Ref(T)")
     .Attr("T: numbertype")
     .Attr("use_locking: bool = false")
-    .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
-    .Doc(R"doc(
-Update 'ref' by adding 'value' to it.
-
-This operation outputs "ref" after the update is done.
-This makes it easier to chain operations that need to use the reset value.
-
-ref: Should be from a `Variable` node.
-value: The value to be added to the variable.
-use_locking: If True, the addition will be protected by a lock;
-  otherwise the behavior is undefined, but may exhibit less contention.
-output_ref:= Same as "ref".  Returned as a convenience for operations that want
-  to use the new value after the variable has been updated.
-)doc");
+    .SetShapeFn(shape_inference::MergeBothInputsShapeFn);
 
 REGISTER_OP("AssignSub")
     .Input("ref: Ref(T)")
@@ -201,20 +109,7 @@ REGISTER_OP("AssignSub")
     .Output("output_ref: Ref(T)")
     .Attr("T: numbertype")
     .Attr("use_locking: bool = false")
-    .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
-    .Doc(R"doc(
-Update 'ref' by subtracting 'value' from it.
-
-This operation outputs "ref" after the update is done.
-This makes it easier to chain operations that need to use the reset value.
-
-ref: Should be from a `Variable` node.
-value: The value to be subtracted to the variable.
-use_locking: If True, the subtraction will be protected by a lock;
-  otherwise the behavior is undefined, but may exhibit less contention.
-output_ref:= Same as "ref".  Returned as a convenience for operations that want
-  to use the new value after the variable has been updated.
-)doc");
+    .SetShapeFn(shape_inference::MergeBothInputsShapeFn);
 
 namespace {
 
@@ -243,44 +138,7 @@ REGISTER_OP("ScatterUpdate")
     .Attr("T: type")
     .Attr("Tindices: {int32, int64}")
     .Attr("use_locking: bool = true")
-    .SetShapeFn(ScatterUpdateShape)
-    .Doc(R"doc(
-Applies sparse updates to a variable reference.
-
-This operation computes
-
-```python
-    # Scalar indices
-    ref[indices, ...] = updates[...]
-
-    # Vector indices (for each i)
-    ref[indices[i], ...] = updates[i, ...]
-
-    # High rank indices (for each i, ..., j)
-    ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
-```
-
-This operation outputs `ref` after the update is done.
-This makes it easier to chain operations that need to use the reset value.
-
-If values in `ref` is to be updated more than once, because there are
-duplicate entries in `indices`, the order at which the updates happen
-for each value is undefined.
-
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/ScatterUpdate.png" alt>
-</div>
-
-ref: Should be from a `Variable` node.
-indices: A tensor of indices into the first dimension of `ref`.
-updates: A tensor of updated values to store in `ref`.
-output_ref:= Same as `ref`.  Returned as a convenience for operations that want
-  to use the updated values after the update is done.
-use_locking: If True, the assignment will be protected by a lock;
-  otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    .SetShapeFn(ScatterUpdateShape);
 
 REGISTER_OP("ScatterAdd")
     .Input("ref: Ref(T)")
@@ -290,41 +148,7 @@ REGISTER_OP("ScatterAdd")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32, int64}")
     .Attr("use_locking: bool = false")
-    .SetShapeFn(ScatterUpdateShape)
-    .Doc(R"doc(
-Adds sparse updates to a variable reference.
-
-This operation computes
-
-    # Scalar indices
-    ref[indices, ...] += updates[...]
-
-    # Vector indices (for each i)
-    ref[indices[i], ...] += updates[i, ...]
-
-    # High rank indices (for each i, ..., j)
-    ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
-
-This operation outputs `ref` after the update is done.
-This makes it easier to chain operations that need to use the reset value.
-
-Duplicate entries are handled correctly: if multiple `indices` reference
-the same location, their contributions add.
-
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/ScatterAdd.png" alt>
-</div>
-
-ref: Should be from a `Variable` node.
-indices: A tensor of indices into the first dimension of `ref`.
-updates: A tensor of updated values to add to `ref`.
-output_ref:= Same as `ref`.  Returned as a convenience for operations that want
-  to use the updated values after the update is done.
-use_locking: If True, the addition will be protected by a lock;
-  otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    .SetShapeFn(ScatterUpdateShape);
 
 REGISTER_OP("ScatterSub")
     .Input("ref: Ref(T)")
@@ -334,41 +158,7 @@ REGISTER_OP("ScatterSub")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32, int64}")
     .Attr("use_locking: bool = false")
-    .SetShapeFn(ScatterUpdateShape)
-    .Doc(R"doc(
-Subtracts sparse updates to a variable reference.
-
-```python
-    # Scalar indices
-    ref[indices, ...] -= updates[...]
-
-    # Vector indices (for each i)
-    ref[indices[i], ...] -= updates[i, ...]
-
-    # High rank indices (for each i, ..., j)
-    ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
-```
-
-This operation outputs `ref` after the update is done.
-This makes it easier to chain operations that need to use the reset value.
-
-Duplicate entries are handled correctly: if multiple `indices` reference
-the same location, their (negated) contributions add.
-
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/ScatterSub.png" alt>
-</div>
-
-ref: Should be from a `Variable` node.
-indices: A tensor of indices into the first dimension of `ref`.
-updates: A tensor of updated values to subtract from `ref`.
-output_ref:= Same as `ref`.  Returned as a convenience for operations that want
-  to use the updated values after the update is done.
-use_locking: If True, the subtraction will be protected by a lock;
-  otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    .SetShapeFn(ScatterUpdateShape);
 
 REGISTER_OP("ScatterMul")
     .Input("ref: Ref(T)")
@@ -378,39 +168,7 @@ REGISTER_OP("ScatterMul")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32, int64}")
     .Attr("use_locking: bool = false")
-    .SetShapeFn(ScatterUpdateShape)
-    .Doc(R"doc(
-Multiplies sparse updates into a variable reference.
-
-This operation computes
-
-```python
-    # Scalar indices
-    ref[indices, ...] *= updates[...]
-
-    # Vector indices (for each i)
-    ref[indices[i], ...] *= updates[i, ...]
-
-    # High rank indices (for each i, ..., j)
-    ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
-```
-
-This operation outputs `ref` after the update is done.
-This makes it easier to chain operations that need to use the reset value.
-
-Duplicate entries are handled correctly: if multiple `indices` reference
-the same location, their contributions multiply.
-
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
-
-ref: Should be from a `Variable` node.
-indices: A tensor of indices into the first dimension of `ref`.
-updates: A tensor of updated values to multiply to `ref`.
-output_ref:= Same as `ref`.  Returned as a convenience for operations that want
-  to use the updated values after the update is done.
-use_locking: If True, the operation will be protected by a lock;
-  otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    .SetShapeFn(ScatterUpdateShape);
 
 REGISTER_OP("ScatterDiv")
     .Input("ref: Ref(T)")
@@ -420,39 +178,7 @@ REGISTER_OP("ScatterDiv")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32, int64}")
     .Attr("use_locking: bool = false")
-    .SetShapeFn(ScatterUpdateShape)
-    .Doc(R"doc(
-Divides a variable reference by sparse updates.
-
-This operation computes
-
-```python
-    # Scalar indices
-    ref[indices, ...] /= updates[...]
-
-    # Vector indices (for each i)
-    ref[indices[i], ...] /= updates[i, ...]
-
-    # High rank indices (for each i, ..., j)
-    ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...]
-```
-
-This operation outputs `ref` after the update is done.
-This makes it easier to chain operations that need to use the reset value.
-
-Duplicate entries are handled correctly: if multiple `indices` reference
-the same location, their contributions divide.
-
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
-
-ref: Should be from a `Variable` node.
-indices: A tensor of indices into the first dimension of `ref`.
-updates: A tensor of values that `ref` is divided by.
-output_ref:= Same as `ref`.  Returned as a convenience for operations that want
-  to use the updated values after the update is done.
-use_locking: If True, the operation will be protected by a lock;
-  otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    .SetShapeFn(ScatterUpdateShape);
 
 REGISTER_OP("ScatterNdUpdate")
     .Input("ref: Ref(T)")
@@ -462,56 +188,16 @@ REGISTER_OP("ScatterNdUpdate")
     .Attr("T: type")
     .Attr("Tindices: {int32, int64}")
     .Attr("use_locking: bool = true")
-    .SetShapeFn(shape_inference::ScatterNdUpdateShape)
-    .Doc(R"doc(
-Applies sparse `updates` to individual values or slices within a given
-variable according to `indices`.
-
-`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-
-`indices` must be integer tensor, containing indices into `ref`.
-It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-
-The innermost dimension of `indices` (with length `K`) corresponds to
-indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-dimension of `ref`.
+    .SetShapeFn(shape_inference::ScatterNdUpdateShape);
 
-`updates` is `Tensor` of rank `Q-1+P-K` with shape:
-
-```
-[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
-```
-
-For example, say we want to update 4 scattered elements to a rank-1 tensor to
-8 elements. In Python, that update would look like this:
-
-```python
-    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-    indices = tf.constant([[4], [3], [1] ,[7]])
-    updates = tf.constant([9, 10, 11, 12])
-    update = tf.scatter_nd_update(ref, indices, updates)
-    with tf.Session() as sess:
-      print sess.run(update)
-```
-
-The resulting update to ref would look like this:
-
-    [1, 11, 3, 10, 9, 6, 7, 12]
-
-See @{tf.scatter_nd} for more details about how to make updates to
-slices.
-
-ref: A mutable Tensor. Should be from a Variable node.
-indices: A Tensor. Must be one of the following types: int32, int64.
-  A tensor of indices into ref.
-updates: A Tensor. Must have the same type as ref. A tensor of updated
-  values to add to ref.
-use_locking: An optional bool. Defaults to True. If True, the assignment will
-  be protected by a lock; otherwise the behavior is undefined,
-  but may exhibit less contention.
-output_ref: Same as ref. Returned as a convenience for operations that want to
-  use the updated values after the update is done.
-)doc");
+REGISTER_OP("ResourceScatterNdUpdate")
+    .Input("ref: resource")
+    .Input("indices: Tindices")
+    .Input("updates: T")
+    .Attr("T: type")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("use_locking: bool = true")
+    .SetShapeFn(shape_inference::ScatterNdUpdateShape);
 
 REGISTER_OP("ScatterNdAdd")
     .Input("ref: Ref(T)")
@@ -521,54 +207,7 @@ REGISTER_OP("ScatterNdAdd")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32, int64}")
     .Attr("use_locking: bool = false")
-    .SetShapeFn(shape_inference::ScatterNdUpdateShape)
-    .Doc(R"doc(
-Applies sparse addition between `updates` and individual values or slices
-within a given variable according to `indices`.
-
-`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-
-`indices` must be integer tensor, containing indices into `ref`.
-It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-
-The innermost dimension of `indices` (with length `K`) corresponds to
-indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-dimension of `ref`.
-
-`updates` is `Tensor` of rank `Q-1+P-K` with shape:
-
-```
-[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
-```
-
-For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
-elements. In Python, that addition would look like this:
-
-    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-    indices = tf.constant([[4], [3], [1], [7]])
-    updates = tf.constant([9, 10, 11, 12])
-    add = tf.scatter_nd_add(ref, indices, updates)
-    with tf.Session() as sess:
-      print sess.run(add)
-
-The resulting update to ref would look like this:
-
-    [1, 13, 3, 14, 14, 6, 7, 20]
-
-See @{tf.scatter_nd} for more details about how to make updates to
-slices.
-
-ref: A mutable Tensor. Should be from a Variable node.
-indices: A Tensor. Must be one of the following types: int32, int64.
-  A tensor of indices into ref.
-updates: A Tensor. Must have the same type as ref. A tensor of updated values
-  to add to ref.
-use_locking: An optional bool. Defaults to True. If True, the assignment will
-  be protected by a lock; otherwise the behavior is undefined,
-  but may exhibit less contention.
-output_ref: Same as ref. Returned as a convenience for operations that want
-  to use the updated values after the update is done.
-)doc");
+    .SetShapeFn(shape_inference::ScatterNdUpdateShape);
 
 REGISTER_OP("ScatterNdSub")
     .Input("ref: Ref(T)")
@@ -578,54 +217,7 @@ REGISTER_OP("ScatterNdSub")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32, int64}")
     .Attr("use_locking: bool = false")
-    .SetShapeFn(shape_inference::ScatterNdUpdateShape)
-    .Doc(R"doc(
-Applies sparse subtraction between `updates` and individual values or slices
-within a given variable according to `indices`.
-
-`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-
-`indices` must be integer tensor, containing indices into `ref`.
-It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-
-The innermost dimension of `indices` (with length `K`) corresponds to
-indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-dimension of `ref`.
-
-`updates` is `Tensor` of rank `Q-1+P-K` with shape:
-
-```
-[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
-```
-
-For example, say we want to subtract 4 scattered elements from a rank-1 tensor
-with 8 elements. In Python, that subtraction would look like this:
-
-    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-    indices = tf.constant([[4], [3], [1], [7]])
-    updates = tf.constant([9, 10, 11, 12])
-    sub = tf.scatter_nd_sub(ref, indices, updates)
-    with tf.Session() as sess:
-      print sess.run(sub)
-
-The resulting update to ref would look like this:
-
-    [1, -9, 3, -6, -4, 6, 7, -4]
-
-See @{tf.scatter_nd} for more details about how to make updates to
-slices.
-
-ref: A mutable Tensor. Should be from a Variable node.
-indices: A Tensor. Must be one of the following types: int32, int64.
-  A tensor of indices into ref.
-updates: A Tensor. Must have the same type as ref. A tensor of updated values
-  to subtract from ref.
-use_locking: An optional bool. Defaults to True. If True, the assignment will
-  be protected by a lock; otherwise the behavior is undefined,
-  but may exhibit less contention.
-output_ref: Same as ref. Returned as a convenience for operations that want
-  to use the updated values after the update is done.
-)doc");
+    .SetShapeFn(shape_inference::ScatterNdUpdateShape);
 
 REGISTER_OP("CountUpTo")
     .Input("ref: Ref(T)")
@@ -637,16 +229,7 @@ REGISTER_OP("CountUpTo")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &output));
       c->set_output(0, output);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Increments 'ref' until it reaches 'limit'.
-
-ref: Should be from a scalar `Variable` node.
-limit: If incrementing ref would bring it above limit, instead generates an
-  'OutOfRange' error.
-output: A copy of the input before increment. If nothing else modifies the
-  input, the values produced will all be distinct.
-)doc");
+    });
 
 REGISTER_OP("ResourceCountUpTo")
     .Input("resource: resource")
@@ -670,15 +253,6 @@ REGISTER_OP("ResourceCountUpTo")
       TF_RETURN_IF_ERROR(c->WithRank(shape_and_type.shape, 0, &output));
       c->set_output(0, output);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Increments variable pointed to by 'resource' until it reaches 'limit'.
-
-resource: Should be from a scalar `Variable` node.
-limit: If incrementing ref would bring it above limit, instead generates an
-  'OutOfRange' error.
-output: A copy of the input before increment. If nothing else modifies the
-  input, the values produced will all be distinct.
-)doc");
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/stateless_random_ops.cc b/tensorflow/core/ops/stateless_random_ops.cc
index 3e1f8781fcd7718e3443b0b4bee5ea5d33980524..553850610a3c51986664fee52e04809626de22c1 100644
--- a/tensorflow/core/ops/stateless_random_ops.cc
+++ b/tensorflow/core/ops/stateless_random_ops.cc
@@ -46,52 +46,13 @@ static Status StatelessShape(shape_inference::InferenceContext* context) {
       .SetShapeFn(StatelessShape)
 
 // This op is exposed through contrib/stateless only.  The interface may change.
-REGISTER_STATELESS_OP("StatelessRandomUniform")
-    .Doc(R"doc(
-Outputs deterministic pseudorandom random values from a uniform distribution.
-
-The generated values follow a uniform distribution in the range `[0, 1)`. The
-lower bound 0 is included in the range, while the upper bound 1 is excluded.
-
-The outputs are a deterministic function of `shape` and `seed`.
-
-shape: The shape of the output tensor.
-dtype: The type of the output.
-seed: 2 seeds (shape [2]).
-output: Random values with specified shape.
-)doc");
+REGISTER_STATELESS_OP("StatelessRandomUniform");
 
 // This op is exposed through contrib/stateless only.  The interface may change.
-REGISTER_STATELESS_OP("StatelessRandomNormal")
-    .Doc(R"doc(
-Outputs deterministic pseudorandom values from a normal distribution.
-
-The generated values will have mean 0 and standard deviation 1.
-
-The outputs are a deterministic function of `shape` and `seed`.
-
-shape: The shape of the output tensor.
-dtype: The type of the output.
-seed: 2 seeds (shape [2]).
-output: Random values with specified shape.
-)doc");
+REGISTER_STATELESS_OP("StatelessRandomNormal");
 
 // This op is exposed through contrib/stateless only.  The interface may change.
-REGISTER_STATELESS_OP("StatelessTruncatedNormal")
-    .Doc(R"doc(
-Outputs deterministic pseudorandom values from a truncated normal distribution.
-
-The generated values follow a normal distribution with mean 0 and standard
-deviation 1, except that values whose magnitude is more than 2 standard
-deviations from the mean are dropped and re-picked.
-
-The outputs are a deterministic function of `shape` and `seed`.
-
-shape: The shape of the output tensor.
-dtype: The type of the output.
-seed: 2 seeds (shape [2]).
-output: Random values with specified shape.
-)doc");
+REGISTER_STATELESS_OP("StatelessTruncatedNormal");
 
 #undef REGISTER_STATELESS_OP
 
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index aebd14c7e55c6bd794e85061de275407a82f71c5..e4c5bcfb540660a609aca013b795d566e69f54a8 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -27,67 +27,20 @@ REGISTER_OP("StringToHashBucketFast")
     .Input("input: string")
     .Output("output: int64")
     .Attr("num_buckets: int >= 1")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Converts each string in the input Tensor to its hash mod by a number of buckets.
-
-The hash function is deterministic on the content of the string within the
-process and will never change. However, it is not suitable for cryptography.
-This function may be used when CPU time is scarce and inputs are trusted or
-unimportant. There is a risk of adversaries constructing inputs that all hash
-to the same bucket. To prevent this problem, use a strong hash function with
-`tf.string_to_hash_bucket_strong`.
-
-input: The strings to assign a hash bucket.
-num_buckets: The number of buckets.
-output: A Tensor of the same shape as the input `string_tensor`.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("StringToHashBucketStrong")
     .Input("input: string")
     .Output("output: int64")
     .Attr("num_buckets: int >= 1")
     .Attr("key: list(int)")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Converts each string in the input Tensor to its hash mod by a number of buckets.
-
-The hash function is deterministic on the content of the string within the
-process. The hash function is a keyed hash function, where attribute `key`
-defines the key of the hash function. `key` is an array of 2 elements.
-
-A strong hash is important when inputs may be malicious, e.g. URLs with
-additional components. Adversaries could try to make their inputs hash to the
-same bucket for a denial-of-service attack or to skew the results. A strong
-hash prevents this by making it difficult, if not infeasible, to compute inputs
-that hash to the same bucket. This comes at a cost of roughly 4x higher compute
-time than `tf.string_to_hash_bucket_fast`.
-
-input: The strings to assign a hash bucket.
-num_buckets: The number of buckets.
-key: The key for the keyed hash function passed as a list of two uint64
-  elements.
-output: A Tensor of the same shape as the input `string_tensor`.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("StringToHashBucket")
     .Input("string_tensor: string")
     .Output("output: int64")
     .Attr("num_buckets: int >= 1")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Converts each string in the input Tensor to its hash mod by a number of buckets.
-
-The hash function is deterministic on the content of the string within the
-process.
-
-Note that the hash function may change from time to time.
-This functionality will be deprecated and it's recommended to use
-`tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
-
-num_buckets: The number of buckets.
-output: A Tensor of the same shape as the input `string_tensor`.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("ReduceJoin")
     .Input("inputs: string")
@@ -95,41 +48,7 @@ REGISTER_OP("ReduceJoin")
     .Attr("keep_dims: bool = false")
     .Attr("separator: string = ''")
     .Output("output: string")
-    .SetShapeFn(shape_inference::ReductionShape)
-    .Doc(R"doc(
-Joins a string Tensor across the given dimensions.
-
-Computes the string join across dimensions in the given string Tensor of shape
-`[d_0, d_1, ..., d_n-1]`.  Returns a new Tensor created by joining the input
-strings with the given separator (default: empty string).  Negative indices are
-counted backwards from the end, with `-1` being equivalent to `n - 1`.
-
-For example:
-
-```python
-# tensor `a` is [["a", "b"], ["c", "d"]]
-tf.reduce_join(a, 0) ==> ["ac", "bd"]
-tf.reduce_join(a, 1) ==> ["ab", "cd"]
-tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
-tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
-tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
-tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
-tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
-tf.reduce_join(a, [0, 1]) ==> ["acbd"]
-tf.reduce_join(a, [1, 0]) ==> ["abcd"]
-tf.reduce_join(a, []) ==> ["abcd"]
-```
-
-inputs: The input to be joined.  All reduced indices must have non-zero size.
-reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
-  order specified.  Omitting `reduction_indices` is equivalent to passing
-  `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
-keep_dims: If `True`, retain reduced dimensions with length `1`.
-separator: The separator to use when joining.
-
-output: Has shape equal to that of the input with reduced dimensions removed or
-  set to `1` depending on `keep_dims`.
-)doc");
+    .SetShapeFn(shape_inference::ReductionShape);
 
 REGISTER_OP("AsString")
     .Input("input: T")
@@ -140,22 +59,7 @@ REGISTER_OP("AsString")
     .Attr("shortest: bool = false")
     .Attr("width: int = -1")
     .Attr("fill: string = ''")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Converts each entry in the given tensor to strings.  Supports many numeric
-types and boolean.
-
-precision: The post-decimal precision to use for floating point numbers.
-  Only used if precision > -1.
-scientific: Use scientific notation for floating point numbers.
-shortest: Use shortest representation (either scientific or standard) for
-  floating point numbers.
-width: Pad pre-decimal numbers to this width.
-  Applies to both floating point and integer numbers.
-  Only used if width > -1.
-fill: The value to pad if width > -1.  If empty, pads with spaces.
-  Another typical value is '0'.  String cannot be longer than 1 character.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("StringJoin")
     .Input("inputs: N * string")
@@ -185,16 +89,7 @@ REGISTER_OP("StringJoin")
       }
       c->set_output(0, out);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Joins the strings in the given list of string tensors into one tensor;
-with the given separator (default is an empty separator).
-
-inputs: A list of string tensors.  The tensors must all have the same shape,
-  or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
-  of non-scalar inputs.
-separator: string, an optional join separator.
-)doc");
+    });
 
 REGISTER_OP("StringSplit")
     .Input("input: string")
@@ -212,74 +107,18 @@ REGISTER_OP("StringSplit")
       c->set_output(1, c->Vector(InferenceContext::kUnknownDim));
       c->set_output(2, c->Vector(2));
       return Status::OK();
-    })
-    .Doc(R"doc(
-Split elements of `input` based on `delimiter` into a `SparseTensor`.
-
-Let N be the size of source (typically N will be the batch size). Split each
-element of `input` based on `delimiter` and return a `SparseTensor`
-containing the splitted tokens. Empty tokens are ignored.
-
-`delimiter` can be empty, or a string of split characters. If `delimiter` is an
- empty string, each element of `input` is split into individual single-byte
- character strings, including splitting of UTF-8 multibyte sequences. Otherwise
- every character of `delimiter` is a potential split point.
-
-For example:
-  N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
-  will be
-
-  indices = [0, 0;
-             0, 1;
-             1, 0;
-             1, 1;
-             1, 2]
-  shape = [2, 3]
-  values = ['hello', 'world', 'a', 'b', 'c']
-
-input: 1-D. Strings to split.
-delimiter: 0-D. Delimiter characters (bytes), or empty string.
-skip_empty: A `bool`. If `True`, skip the empty strings from the result.
-indices: A dense matrix of int64 representing the indices of the sparse tensor.
-values: A vector of strings corresponding to the splited values.
-shape: a length-2 vector of int64 representing the shape of the sparse
-  tensor, where the first value is N and the second value is the maximum number
-  of tokens in a single input entry.
-)doc");
+    });
 
 REGISTER_OP("EncodeBase64")
     .Input("input: string")
     .Output("output: string")
     .Attr("pad: bool = false")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Encode strings into web-safe base64 format.
-
-Refer to the following article for more information on base64 format:
-en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
-end so that the encoded has length multiple of 4. See Padding section of the
-link above.
-
-Web-safe means that the encoder uses - and _ instead of + and /.
-
-input: Strings to be encoded.
-output: Input strings encoded in base64.
-pad: Bool whether padding is applied at the ends.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("DecodeBase64")
     .Input("input: string")
     .Output("output: string")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Decode web-safe base64-encoded strings.
-
-Input may or may not have padding at the end. See EncodeBase64 for padding.
-Web-safe means that input must use - and _ instead of + and /.
-
-input: Base64 strings to decode.
-output: Decoded strings.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Substr")
     .Input("input: string")
@@ -298,96 +137,14 @@ REGISTER_OP("Substr")
         DimensionHandle pos_dim = c->Dim(pos_shape, i);
         DimensionHandle len_dim = c->Dim(len_shape, i);
         if (c->Value(pos_dim) != c->Value(len_dim)) {
-          return errors::InvalidArgument("pos and len shapes must match: ",
-                                         c->DebugString(pos_shape), " vs. ",
-                                         c->DebugString(len_shape));
+          return errors::InvalidArgument(
+              "pos and len shapes must match: ", c->DebugString(pos_shape),
+              " vs. ", c->DebugString(len_shape));
         }
       }
       // c->input(0) is the ShapeHandle to input strings
       // BroadcastBinaryOpShapeFn infers shape from c->input(0) and c->input(1).
       return shape_inference::BroadcastBinaryOpShapeFn(c);
-    })
-    .Doc(R"doc(
-Return substrings from `Tensor` of strings.
-
-For each string in the input `Tensor`, creates a substring starting at index
-`pos` with a total length of `len`.
-
-If `len` defines a substring that would extend beyond the length of the input
-string, then as many characters as possible are used.
-
-If `pos` is negative or specifies a character index larger than any of the input
-strings, then an `InvalidArgumentError` is thrown.
-
-`pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
-Op creation.
-
-*NOTE*: `Substr` supports broadcasting up to two dimensions. More about
-broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-
----
-
-Examples
-
-Using scalar `pos` and `len`:
-
-```python
-input = [b'Hello', b'World']
-position = 1
-length = 3
-
-output = [b'ell', b'orl']
-```
-
-Using `pos` and `len` with same shape as `input`:
-
-```python
-input = [[b'ten', b'eleven', b'twelve'],
-         [b'thirteen', b'fourteen', b'fifteen'],
-         [b'sixteen', b'seventeen', b'eighteen']]
-position = [[1, 2, 3],
-            [1, 2, 3],
-            [1, 2, 3]]
-length =   [[2, 3, 4],
-            [4, 3, 2],
-            [5, 5, 5]]
-
-output = [[b'en', b'eve', b'lve'],
-          [b'hirt', b'urt', b'te'],
-          [b'ixtee', b'vente', b'hteen']]
-```
-
-Broadcasting `pos` and `len` onto `input`:
-
-```
-input = [[b'ten', b'eleven', b'twelve'],
-         [b'thirteen', b'fourteen', b'fifteen'],
-         [b'sixteen', b'seventeen', b'eighteen'],
-         [b'nineteen', b'twenty', b'twentyone']]
-position = [1, 2, 3]
-length =   [1, 2, 3]
-
-output = [[b'e', b'ev', b'lve'],
-          [b'h', b'ur', b'tee'],
-          [b'i', b've', b'hte'],
-          [b'i', b'en', b'nty']]
-```
-
-Broadcasting `input` onto `pos` and `len`:
-
-```
-input = b'thirteen'
-position = [1, 5, 7]
-length =   [3, 2, 1]
-
-output = [b'hir', b'ee', b'n']
-```
-
-input: Tensor of strings
-pos: Scalar defining the position of first character in each substring
-len: Scalar defining the number of characters to include in each substring
-output: Tensor of substrings
-)doc");
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc
index 405318caf20183ce267e84cd2554ed8c77a5b409..6ce9595fb60b78525bde19515077f7245a219d39 100644
--- a/tensorflow/core/ops/training_ops.cc
+++ b/tensorflow/core/ops/training_ops.cc
@@ -22,48 +22,6 @@ using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
-const char kAddSignCommonDocStr[] = R"doc(
-Update '*var' according to the AddSign update.
-
-m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-update <- (alpha + sign_decay * sign(g) *sign(m)) * g
-variable <- variable - lr_t * update
-
-var: Should be from a Variable().
-m: Should be from a Variable().
-lr: Scaling factor. Must be a scalar.
-sign_decay: Must be a scalar.
-alpha: Must be a scalar.
-beta: Must be a scalar.
-grad: The gradient.
-)doc";
-
-const char kPowerSignCommonDocStr[] = R"doc(
-Update '*var' according to the AddSign update.
-
-m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
-variable <- variable - lr_t * update
-
-var: Should be from a Variable().
-m: Should be from a Variable().
-lr: Scaling factor. Must be a scalar.
-logbase: Must be a scalar.
-sign_decay: Must be a scalar.
-beta: Must be a scalar.
-grad: The gradient.
-)doc";
-
-const char kOutDocStr[] = R"doc(
-out: Same as "var".
-)doc";
-
-const char kLockDocStr[] = R"doc(
-use_locking: If `True`, updating of the var and m tensors is
-  protected by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc";
-
 static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) {
   auto* handle_data = c->input_handle_shapes_and_types(input);
   if (handle_data != nullptr && !handle_data->empty() &&
@@ -116,17 +74,7 @@ REGISTER_OP("ApplyGradientDescent")
     .Output("out: Ref(T)")
     .Attr("T: numbertype")
     .Attr("use_locking: bool = false")
-    .SetShapeFn(ApplyGradientDescentShapeFn)
-    .Doc(R"doc(
-Update '*var' by subtracting 'alpha' * 'delta' from it.
-
-var: Should be from a Variable().
-alpha: Scaling factor. Must be a scalar.
-delta: The change.
-out: Same as "var".
-use_locking: If `True`, the subtraction will be protected by a lock;
-  otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    .SetShapeFn(ApplyGradientDescentShapeFn);
 
 REGISTER_OP("ResourceApplyGradientDescent")
     .Input("var: resource")
@@ -134,16 +82,7 @@ REGISTER_OP("ResourceApplyGradientDescent")
     .Input("delta: T")
     .Attr("T: numbertype")
     .Attr("use_locking: bool = false")
-    .SetShapeFn(ApplyGradientDescentShapeFn)
-    .Doc(R"doc(
-Update '*var' by subtracting 'alpha' * 'delta' from it.
-
-var: Should be from a Variable().
-alpha: Scaling factor. Must be a scalar.
-delta: The change.
-use_locking: If `True`, the subtraction will be protected by a lock;
-  otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    .SetShapeFn(ApplyGradientDescentShapeFn);
 
 static Status ApplyProximalGradientDescentShapeFn(InferenceContext* c,
                                                   bool sparse) {
@@ -171,21 +110,7 @@ REGISTER_OP("ApplyProximalGradientDescent")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyProximalGradientDescentShapeFn(c, false /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' as FOBOS algorithm with fixed learning rate.
-prox_v = var - alpha * delta
-var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
-
-var: Should be from a Variable().
-alpha: Scaling factor. Must be a scalar.
-l1: L1 regularization. Must be a scalar.
-l2: L2 regularization. Must be a scalar.
-delta: The change.
-out: Same as "var".
-use_locking: If True, the subtraction will be protected by a lock;
-  otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    });
 
 REGISTER_OP("SparseApplyProximalGradientDescent")
     .Input("var: Ref(T)")
@@ -200,24 +125,7 @@ REGISTER_OP("SparseApplyProximalGradientDescent")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyProximalGradientDescentShapeFn(c, true /* sparse */);
-    })
-    .Doc(R"doc(
-Sparse update '*var' as FOBOS algorithm with fixed learning rate.
-
-That is for rows we have grad for, we update var as follows:
-prox_v = var - alpha * grad
-var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
-
-var: Should be from a Variable().
-alpha: Scaling factor. Must be a scalar.
-l1: L1 regularization. Must be a scalar.
-l2: L2 regularization. Must be a scalar.
-grad: The gradient.
-indices: A vector of indices into the first dimension of var and accum.
-out: Same as "var".
-use_locking: If True, the subtraction will be protected by a lock;
-  otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    });
 
 REGISTER_OP("ResourceApplyProximalGradientDescent")
     .Input("var: resource")
@@ -229,20 +137,7 @@ REGISTER_OP("ResourceApplyProximalGradientDescent")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyProximalGradientDescentShapeFn(c, false /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' as FOBOS algorithm with fixed learning rate.
-prox_v = var - alpha * delta
-var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
-
-var: Should be from a Variable().
-alpha: Scaling factor. Must be a scalar.
-l1: L1 regularization. Must be a scalar.
-l2: L2 regularization. Must be a scalar.
-delta: The change.
-use_locking: If True, the subtraction will be protected by a lock;
-  otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    });
 
 REGISTER_OP("ResourceSparseApplyProximalGradientDescent")
     .Input("var: resource")
@@ -256,23 +151,7 @@ REGISTER_OP("ResourceSparseApplyProximalGradientDescent")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyProximalGradientDescentShapeFn(c, true /* sparse */);
-    })
-    .Doc(R"doc(
-Sparse update '*var' as FOBOS algorithm with fixed learning rate.
-
-That is for rows we have grad for, we update var as follows:
-prox_v = var - alpha * grad
-var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
-
-var: Should be from a Variable().
-alpha: Scaling factor. Must be a scalar.
-l1: L1 regularization. Must be a scalar.
-l2: L2 regularization. Must be a scalar.
-grad: The gradient.
-indices: A vector of indices into the first dimension of var and accum.
-use_locking: If True, the subtraction will be protected by a lock;
-  otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    });
 
 static Status ApplyAdadeltaShapeFn(InferenceContext* c, bool sparse) {
   ShapeHandle unused;
@@ -304,26 +183,7 @@ REGISTER_OP("ApplyAdadelta")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdadeltaShapeFn(c, false /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' according to the adadelta scheme.
-
-accum = rho() * accum + (1 - rho()) * grad.square();
-update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
-update_accum = rho() * update_accum + (1 - rho()) * update.square();
-var -= update;
-
-var: Should be from a Variable().
-accum: Should be from a Variable().
-accum_update: Should be from a Variable().
-lr: Scaling factor. Must be a scalar.
-rho: Decay factor. Must be a scalar.
-epsilon: Constant factor. Must be a scalar.
-grad: The gradient.
-out: Same as "var".
-use_locking: If True, updating of the var, accum and update_accum tensors will be protected by
-a lock; otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    });
 
 REGISTER_OP("SparseApplyAdadelta")
     .Input("var: Ref(T)")
@@ -340,20 +200,7 @@ REGISTER_OP("SparseApplyAdadelta")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdadeltaShapeFn(c, true /* sparse */);
-    })
-    .Doc(R"doc(
-var: Should be from a Variable().
-accum: Should be from a Variable().
-accum_update:: Should be from a Variable().
-lr: Learning rate. Must be a scalar.
-rho: Decay factor. Must be a scalar.
-epsilon: Constant factor. Must be a scalar.
-grad: The gradient.
-indices: A vector of indices into the first dimension of var and accum.
-out: Same as "var".
-use_locking: If True, updating of the var and accum tensors will be protected by
-a lock; otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    });
 
 REGISTER_OP("ResourceApplyAdadelta")
     .Input("var: resource")
@@ -367,25 +214,7 @@ REGISTER_OP("ResourceApplyAdadelta")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdadeltaShapeFn(c, false /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' according to the adadelta scheme.
-
-accum = rho() * accum + (1 - rho()) * grad.square();
-update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
-update_accum = rho() * update_accum + (1 - rho()) * update.square();
-var -= update;
-
-var: Should be from a Variable().
-accum: Should be from a Variable().
-accum_update: Should be from a Variable().
-lr: Scaling factor. Must be a scalar.
-rho: Decay factor. Must be a scalar.
-epsilon: Constant factor. Must be a scalar.
-grad: The gradient.
-use_locking: If True, updating of the var, accum and update_accum tensors will be protected by
-a lock; otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    });
 
 REGISTER_OP("ResourceSparseApplyAdadelta")
     .Input("var: resource")
@@ -401,19 +230,7 @@ REGISTER_OP("ResourceSparseApplyAdadelta")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdadeltaShapeFn(c, true /* sparse */);
-    })
-    .Doc(R"doc(
-var: Should be from a Variable().
-accum: Should be from a Variable().
-accum_update:: Should be from a Variable().
-lr: Learning rate. Must be a scalar.
-rho: Decay factor. Must be a scalar.
-epsilon: Constant factor. Must be a scalar.
-grad: The gradient.
-indices: A vector of indices into the first dimension of var and accum.
-use_locking: If True, updating of the var and accum tensors will be protected by
-a lock; otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    });
 
 static Status ApplyAdagradShapeFn(InferenceContext* c, bool sparse) {
   ShapeHandle unused;
@@ -438,22 +255,7 @@ REGISTER_OP("ApplyAdagrad")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdagradShapeFn(c, false /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' according to the adagrad scheme.
-
-accum += grad * grad
-var -= lr * grad * (1 / sqrt(accum))
-
-var: Should be from a Variable().
-accum: Should be from a Variable().
-lr: Scaling factor. Must be a scalar.
-grad: The gradient.
-out: Same as "var".
-use_locking: If `True`, updating of the var and accum tensors will be protected
-  by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc");
+    });
 
 REGISTER_OP("ResourceApplyAdagrad")
     .Input("var: resource")
@@ -464,21 +266,7 @@ REGISTER_OP("ResourceApplyAdagrad")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdagradShapeFn(c, false /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' according to the adagrad scheme.
-
-accum += grad * grad
-var -= lr * grad * (1 / sqrt(accum))
-
-var: Should be from a Variable().
-accum: Should be from a Variable().
-lr: Scaling factor. Must be a scalar.
-grad: The gradient.
-use_locking: If `True`, updating of the var and accum tensors will be protected
-  by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc");
+    });
 
 static Status ApplyProximalAdagradShapeFn(InferenceContext* c, bool sparse) {
   ShapeHandle unused;
@@ -507,23 +295,7 @@ REGISTER_OP("ApplyProximalAdagrad")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyProximalAdagradShapeFn(c, false /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
-accum += grad * grad
-prox_v = var - lr * grad * (1 / sqrt(accum))
-var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
-
-var: Should be from a Variable().
-accum: Should be from a Variable().
-grad: The gradient.
-lr: Scaling factor. Must be a scalar.
-l1: L1 regularization. Must be a scalar.
-l2: L2 regularization. Must be a scalar.
-out: Same as "var".
-use_locking: If True, updating of the var and accum tensors will be protected by
-a lock; otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    });
 
 REGISTER_OP("ResourceApplyProximalAdagrad")
     .Input("var: resource")
@@ -536,22 +308,7 @@ REGISTER_OP("ResourceApplyProximalAdagrad")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyProximalAdagradShapeFn(c, false /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
-accum += grad * grad
-prox_v = var - lr * grad * (1 / sqrt(accum))
-var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
-
-var: Should be from a Variable().
-accum: Should be from a Variable().
-grad: The gradient.
-lr: Scaling factor. Must be a scalar.
-l1: L1 regularization. Must be a scalar.
-l2: L2 regularization. Must be a scalar.
-use_locking: If True, updating of the var and accum tensors will be protected by
-a lock; otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    });
 
 REGISTER_OP("SparseApplyAdagrad")
     .Input("var: Ref(T)")
@@ -565,24 +322,7 @@ REGISTER_OP("SparseApplyAdagrad")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdagradShapeFn(c, true /* sparse */);
-    })
-    .Doc(R"doc(
-Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
-
-That is for rows we have grad for, we update var and accum as follows:
-accum += grad * grad
-var -= lr * grad * (1 / sqrt(accum))
-
-var: Should be from a Variable().
-accum: Should be from a Variable().
-lr: Learning rate. Must be a scalar.
-grad: The gradient.
-indices: A vector of indices into the first dimension of var and accum.
-out: Same as "var".
-use_locking: If `True`, updating of the var and accum tensors will be protected
-  by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc");
+    });
 
 REGISTER_OP("ResourceSparseApplyAdagrad")
     .Input("var: resource")
@@ -595,23 +335,7 @@ REGISTER_OP("ResourceSparseApplyAdagrad")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdagradShapeFn(c, true /* sparse */);
-    })
-    .Doc(R"doc(
-Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
-
-That is for rows we have grad for, we update var and accum as follows:
-accum += grad * grad
-var -= lr * grad * (1 / sqrt(accum))
-
-var: Should be from a Variable().
-accum: Should be from a Variable().
-lr: Learning rate. Must be a scalar.
-grad: The gradient.
-indices: A vector of indices into the first dimension of var and accum.
-use_locking: If `True`, updating of the var and accum tensors will be protected
-  by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc");
+    });
 
 static Status ApplyAdagradDAShapeFn(InferenceContext* c, bool sparse) {
   ShapeHandle unused;
@@ -647,22 +371,7 @@ REGISTER_OP("ApplyAdagradDA")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdagradDAShapeFn(c, false /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' according to the proximal adagrad scheme.
-
-var: Should be from a Variable().
-gradient_accumulator: Should be from a Variable().
-gradient_squared_accumulator: Should be from a Variable().
-grad: The gradient.
-lr: Scaling factor. Must be a scalar.
-l1: L1 regularization. Must be a scalar.
-l2: L2 regularization. Must be a scalar.
-global_step: Training step number. Must be a scalar.
-out: Same as "var".
-use_locking: If True, updating of the var and accum tensors will be protected by
-a lock; otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    });
 
 REGISTER_OP("SparseApplyAdagradDA")
     .Input("var: Ref(T)")
@@ -680,23 +389,7 @@ REGISTER_OP("SparseApplyAdagradDA")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdagradDAShapeFn(c, true /* sparse */);
-    })
-    .Doc(R"doc(
-Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
-
-var: Should be from a Variable().
-gradient_accumulator: Should be from a Variable().
-gradient_squared_accumulator: Should be from a Variable().
-grad: The gradient.
-indices: A vector of indices into the first dimension of var and accum.
-lr: Learning rate. Must be a scalar.
-l1: L1 regularization. Must be a scalar.
-l2: L2 regularization. Must be a scalar.
-global_step: Training step number. Must be a scalar.
-out: Same as "var".
-use_locking: If True, updating of the var and accum tensors will be protected by
-a lock; otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    });
 
 REGISTER_OP("SparseApplyProximalAdagrad")
     .Input("var: Ref(T)")
@@ -712,27 +405,7 @@ REGISTER_OP("SparseApplyProximalAdagrad")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyProximalAdagradShapeFn(c, true /* sparse */);
-    })
-    .Doc(R"doc(
-Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
-
-That is for rows we have grad for, we update var and accum as follows:
-accum += grad * grad
-prox_v = var
-prox_v -= lr * grad * (1 / sqrt(accum))
-var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
-
-var: Should be from a Variable().
-accum: Should be from a Variable().
-lr: Learning rate. Must be a scalar.
-l1: L1 regularization. Must be a scalar.
-l2: L2 regularization. Must be a scalar.
-grad: The gradient.
-indices: A vector of indices into the first dimension of var and accum.
-out: Same as "var".
-use_locking: If True, updating of the var and accum tensors will be protected by
-a lock; otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    });
 
 REGISTER_OP("ResourceApplyAdagradDA")
     .Input("var: resource")
@@ -747,21 +420,7 @@ REGISTER_OP("ResourceApplyAdagradDA")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdagradDAShapeFn(c, false /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' according to the proximal adagrad scheme.
-
-var: Should be from a Variable().
-gradient_accumulator: Should be from a Variable().
-gradient_squared_accumulator: Should be from a Variable().
-grad: The gradient.
-lr: Scaling factor. Must be a scalar.
-l1: L1 regularization. Must be a scalar.
-l2: L2 regularization. Must be a scalar.
-global_step: Training step number. Must be a scalar.
-use_locking: If True, updating of the var and accum tensors will be protected by
-a lock; otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    });
 
 REGISTER_OP("ResourceSparseApplyAdagradDA")
     .Input("var: resource")
@@ -778,22 +437,7 @@ REGISTER_OP("ResourceSparseApplyAdagradDA")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdagradDAShapeFn(c, true /* sparse */);
-    })
-    .Doc(R"doc(
-Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
-
-var: Should be from a Variable().
-gradient_accumulator: Should be from a Variable().
-gradient_squared_accumulator: Should be from a Variable().
-grad: The gradient.
-indices: A vector of indices into the first dimension of var and accum.
-lr: Learning rate. Must be a scalar.
-l1: L1 regularization. Must be a scalar.
-l2: L2 regularization. Must be a scalar.
-global_step: Training step number. Must be a scalar.
-use_locking: If True, updating of the var and accum tensors will be protected by
-a lock; otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    });
 
 REGISTER_OP("ResourceSparseApplyProximalAdagrad")
     .Input("var: resource")
@@ -808,26 +452,7 @@ REGISTER_OP("ResourceSparseApplyProximalAdagrad")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyProximalAdagradShapeFn(c, true /* sparse */);
-    })
-    .Doc(R"doc(
-Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
-
-That is for rows we have grad for, we update var and accum as follows:
-accum += grad * grad
-prox_v = var
-prox_v -= lr * grad * (1 / sqrt(accum))
-var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
-
-var: Should be from a Variable().
-accum: Should be from a Variable().
-lr: Learning rate. Must be a scalar.
-l1: L1 regularization. Must be a scalar.
-l2: L2 regularization. Must be a scalar.
-grad: The gradient.
-indices: A vector of indices into the first dimension of var and accum.
-use_locking: If True, updating of the var and accum tensors will be protected by
-a lock; otherwise the behavior is undefined, but may exhibit less contention.
-)doc");
+    });
 
 static Status ApplyFtrlShapeFn(InferenceContext* c, bool sparse) {
   ShapeHandle unused;
@@ -861,29 +486,7 @@ REGISTER_OP("ApplyFtrl")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyFtrlShapeFn(c, false /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' according to the Ftrl-proximal scheme.
-
-accum_new = accum + grad * grad
-linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-accum = accum_new
-
-var: Should be from a Variable().
-accum: Should be from a Variable().
-linear: Should be from a Variable().
-grad: The gradient.
-lr: Scaling factor. Must be a scalar.
-l1: L1 regulariation. Must be a scalar.
-l2: L2 regulariation. Must be a scalar.
-lr_power: Scaling factor. Must be a scalar.
-out: Same as "var".
-use_locking: If `True`, updating of the var and accum tensors will be protected
-  by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc");
+    });
 
 REGISTER_OP("SparseApplyFtrl")
     .Input("var: Ref(T)")
@@ -901,31 +504,7 @@ REGISTER_OP("SparseApplyFtrl")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyFtrlShapeFn(c, true /* sparse */);
-    })
-    .Doc(R"doc(
-Update relevant entries in '*var' according to the Ftrl-proximal scheme.
-
-That is for rows we have grad for, we update var, accum and linear as follows:
-accum_new = accum + grad * grad
-linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-accum = accum_new
-
-var: Should be from a Variable().
-accum: Should be from a Variable().
-linear: Should be from a Variable().
-grad: The gradient.
-indices: A vector of indices into the first dimension of var and accum.
-lr: Scaling factor. Must be a scalar.
-l1: L1 regularization. Must be a scalar.
-l2: L2 regularization. Must be a scalar.
-lr_power: Scaling factor. Must be a scalar.
-out: Same as "var".
-use_locking: If `True`, updating of the var and accum tensors will be protected
-  by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc");
+    });
 
 REGISTER_OP("ResourceApplyFtrl")
     .Input("var: resource")
@@ -940,28 +519,7 @@ REGISTER_OP("ResourceApplyFtrl")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyFtrlShapeFn(c, false /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' according to the Ftrl-proximal scheme.
-
-accum_new = accum + grad * grad
-linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-accum = accum_new
-
-var: Should be from a Variable().
-accum: Should be from a Variable().
-linear: Should be from a Variable().
-grad: The gradient.
-lr: Scaling factor. Must be a scalar.
-l1: L1 regulariation. Must be a scalar.
-l2: L2 regulariation. Must be a scalar.
-lr_power: Scaling factor. Must be a scalar.
-use_locking: If `True`, updating of the var and accum tensors will be protected
-  by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc");
+    });
 
 REGISTER_OP("ResourceSparseApplyFtrl")
     .Input("var: resource")
@@ -978,30 +536,7 @@ REGISTER_OP("ResourceSparseApplyFtrl")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyFtrlShapeFn(c, true /* sparse */);
-    })
-    .Doc(R"doc(
-Update relevant entries in '*var' according to the Ftrl-proximal scheme.
-
-That is for rows we have grad for, we update var, accum and linear as follows:
-accum_new = accum + grad * grad
-linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-accum = accum_new
-
-var: Should be from a Variable().
-accum: Should be from a Variable().
-linear: Should be from a Variable().
-grad: The gradient.
-indices: A vector of indices into the first dimension of var and accum.
-lr: Scaling factor. Must be a scalar.
-l1: L1 regularization. Must be a scalar.
-l2: L2 regularization. Must be a scalar.
-lr_power: Scaling factor. Must be a scalar.
-use_locking: If `True`, updating of the var and accum tensors will be protected
-  by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc");
+    });
 
 REGISTER_OP("ApplyFtrlV2")
     .Input("var: Ref(T)")
@@ -1018,32 +553,7 @@ REGISTER_OP("ApplyFtrlV2")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyFtrlShapeFn(c, false /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' according to the Ftrl-proximal scheme.
-
-grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-linear += grad_with_shrinkage +
-    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-accum = accum_new
-
-var: Should be from a Variable().
-accum: Should be from a Variable().
-linear: Should be from a Variable().
-grad: The gradient.
-lr: Scaling factor. Must be a scalar.
-l1: L1 regulariation. Must be a scalar.
-l2: online L2 regulariation. Must be a scalar.
-l2: L2 shrinkage regulariation. Must be a scalar.
-lr_power: Scaling factor. Must be a scalar.
-out: Same as "var".
-use_locking: If `True`, updating of the var and accum tensors will be protected
-  by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc");
+    });
 
 REGISTER_OP("SparseApplyFtrlV2")
     .Input("var: Ref(T)")
@@ -1062,34 +572,7 @@ REGISTER_OP("SparseApplyFtrlV2")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyFtrlShapeFn(c, true /* sparse */);
-    })
-    .Doc(R"doc(
-Update relevant entries in '*var' according to the Ftrl-proximal scheme.
-
-That is for rows we have grad for, we update var, accum and linear as follows:
-grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-linear += grad_with_shrinkage +
-    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-accum = accum_new
-
-var: Should be from a Variable().
-accum: Should be from a Variable().
-linear: Should be from a Variable().
-grad: The gradient.
-indices: A vector of indices into the first dimension of var and accum.
-lr: Scaling factor. Must be a scalar.
-l1: L1 regularization. Must be a scalar.
-l2: onine L2 regularization. Must be a scalar.
-l2: L2 shrinkage regulariation. Must be a scalar.
-lr_power: Scaling factor. Must be a scalar.
-out: Same as "var".
-use_locking: If `True`, updating of the var and accum tensors will be protected
-  by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc");
+    });
 
 REGISTER_OP("ResourceApplyFtrlV2")
     .Input("var: resource")
@@ -1105,31 +588,7 @@ REGISTER_OP("ResourceApplyFtrlV2")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyFtrlShapeFn(c, false /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' according to the Ftrl-proximal scheme.
-
-grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-linear += grad_with_shrinkage +
-    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-accum = accum_new
-
-var: Should be from a Variable().
-accum: Should be from a Variable().
-linear: Should be from a Variable().
-grad: The gradient.
-lr: Scaling factor. Must be a scalar.
-l1: L1 regulariation. Must be a scalar.
-l2: onine L2 regularization. Must be a scalar.
-l2: L2 shrinkage regulariation. Must be a scalar.
-lr_power: Scaling factor. Must be a scalar.
-use_locking: If `True`, updating of the var and accum tensors will be protected
-  by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc");
+    });
 
 REGISTER_OP("ResourceSparseApplyFtrlV2")
     .Input("var: resource")
@@ -1147,33 +606,7 @@ REGISTER_OP("ResourceSparseApplyFtrlV2")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyFtrlShapeFn(c, true /* sparse */);
-    })
-    .Doc(R"doc(
-Update relevant entries in '*var' according to the Ftrl-proximal scheme.
-
-That is for rows we have grad for, we update var, accum and linear as follows:
-grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-linear += grad_with_shrinkage +
-    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-accum = accum_new
-
-var: Should be from a Variable().
-accum: Should be from a Variable().
-linear: Should be from a Variable().
-grad: The gradient.
-indices: A vector of indices into the first dimension of var and accum.
-lr: Scaling factor. Must be a scalar.
-l1: L1 regularization. Must be a scalar.
-l2: onine L2 regularization. Must be a scalar.
-l2: L2 shrinkage regulariation. Must be a scalar.
-lr_power: Scaling factor. Must be a scalar.
-use_locking: If `True`, updating of the var and accum tensors will be protected
-  by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc");
+    });
 
 static Status ApplyMomentumShapeFn(InferenceContext* c, bool sparse) {
   ShapeHandle unused;
@@ -1202,27 +635,7 @@ REGISTER_OP("ApplyMomentum")
     .Attr("use_nesterov: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyMomentumShapeFn(c, false /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' according to the momentum scheme. Set use_nesterov = True if you
-want to use Nesterov momentum.
-
-accum = accum * momentum + grad
-var -= lr * accum
-
-var: Should be from a Variable().
-accum: Should be from a Variable().
-lr: Scaling factor. Must be a scalar.
-grad: The gradient.
-momentum: Momentum. Must be a scalar.
-out: Same as "var".
-use_locking: If `True`, updating of the var and accum tensors will be protected
-  by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-use_nesterov: If `True`, the tensor passed to compute grad will be
-var - lr * momentum * accum, so in the end, the var you get is actually
-var - lr * momentum * accum.
-)doc");
+    });
 
 REGISTER_OP("SparseApplyMomentum")
     .Input("var: Ref(T)")
@@ -1238,30 +651,7 @@ REGISTER_OP("SparseApplyMomentum")
     .Attr("use_nesterov: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyMomentumShapeFn(c, true /* sparse */);
-    })
-    .Doc(R"doc(
-Update relevant entries in '*var' and '*accum' according to the momentum scheme.
-Set use_nesterov = True if you want to use Nesterov momentum.
-
-That is for rows we have grad for, we update var and accum as follows:
-
-accum = accum * momentum + grad
-var -= lr * accum
-
-var: Should be from a Variable().
-accum: Should be from a Variable().
-lr: Learning rate. Must be a scalar.
-grad: The gradient.
-indices: A vector of indices into the first dimension of var and accum.
-momentum: Momentum. Must be a scalar.
-out: Same as "var".
-use_locking: If `True`, updating of the var and accum tensors will be protected
-  by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-use_nesterov: If `True`, the tensor passed to compute grad will be
-var - lr * momentum * accum, so in the end, the var you get is actually
-var - lr * momentum * accum.
-)doc");
+    });
 
 REGISTER_OP("ResourceApplyMomentum")
     .Input("var: resource")
@@ -1274,26 +664,7 @@ REGISTER_OP("ResourceApplyMomentum")
     .Attr("use_nesterov: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyMomentumShapeFn(c, false /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' according to the momentum scheme. Set use_nesterov = True if you
-want to use Nesterov momentum.
-
-accum = accum * momentum + grad
-var -= lr * accum
-
-var: Should be from a Variable().
-accum: Should be from a Variable().
-lr: Scaling factor. Must be a scalar.
-grad: The gradient.
-momentum: Momentum. Must be a scalar.
-use_locking: If `True`, updating of the var and accum tensors will be protected
-  by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-use_nesterov: If `True`, the tensor passed to compute grad will be
-var - lr * momentum * accum, so in the end, the var you get is actually
-var - lr * momentum * accum.
-)doc");
+    });
 
 REGISTER_OP("ResourceSparseApplyMomentum")
     .Input("var: resource")
@@ -1308,29 +679,7 @@ REGISTER_OP("ResourceSparseApplyMomentum")
     .Attr("use_nesterov: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyMomentumShapeFn(c, true /* sparse */);
-    })
-    .Doc(R"doc(
-Update relevant entries in '*var' and '*accum' according to the momentum scheme.
-Set use_nesterov = True if you want to use Nesterov momentum.
-
-That is for rows we have grad for, we update var and accum as follows:
-
-accum = accum * momentum + grad
-var -= lr * accum
-
-var: Should be from a Variable().
-accum: Should be from a Variable().
-lr: Learning rate. Must be a scalar.
-grad: The gradient.
-indices: A vector of indices into the first dimension of var and accum.
-momentum: Momentum. Must be a scalar.
-use_locking: If `True`, updating of the var and accum tensors will be protected
-  by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-use_nesterov: If `True`, the tensor passed to compute grad will be
-var - lr * momentum * accum, so in the end, the var you get is actually
-var - lr * momentum * accum.
-)doc");
+    });
 
 static Status ApplyAdamShapeFn(InferenceContext* c, bool sparse) {
   ShapeHandle unused;
@@ -1368,31 +717,7 @@ REGISTER_OP("ApplyAdam")
     .Attr("use_nesterov: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdamShapeFn(c, false /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' according to the Adam algorithm.
-
-lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
-m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
-v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
-variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
-
-var: Should be from a Variable().
-m: Should be from a Variable().
-v: Should be from a Variable().
-beta1_power: Must be a scalar.
-beta2_power: Must be a scalar.
-lr: Scaling factor. Must be a scalar.
-beta1: Momentum factor. Must be a scalar.
-beta2: Momentum factor. Must be a scalar.
-epsilon: Ridge term. Must be a scalar.
-grad: The gradient.
-out: Same as "var".
-use_locking: If `True`, updating of the var, m, and v tensors will be protected
-  by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-use_nesterov: If `True`, uses the nesterov update.
-)doc");
+    });
 
 REGISTER_OP("ResourceApplyAdam")
     .Input("var: resource")
@@ -1410,30 +735,7 @@ REGISTER_OP("ResourceApplyAdam")
     .Attr("use_nesterov: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdamShapeFn(c, false /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' according to the Adam algorithm.
-
-lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
-m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
-v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
-variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
-
-var: Should be from a Variable().
-m: Should be from a Variable().
-v: Should be from a Variable().
-beta1_power: Must be a scalar.
-beta2_power: Must be a scalar.
-lr: Scaling factor. Must be a scalar.
-beta1: Momentum factor. Must be a scalar.
-beta2: Momentum factor. Must be a scalar.
-epsilon: Ridge term. Must be a scalar.
-grad: The gradient.
-use_locking: If `True`, updating of the var, m, and v tensors will be protected
-  by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-use_nesterov: If `True`, uses the nesterov update.
-)doc");
+    });
 
 static Status ApplyRMSPropShapeFn(InferenceContext* c, bool sparse) {
   ShapeHandle unused;
@@ -1484,32 +786,7 @@ REGISTER_OP("ApplyRMSProp")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyRMSPropShapeFn(c, false /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' according to the RMSProp algorithm.
-Note that in dense implementation of this algorithm, ms and mom will
-update even if the grad is zero, but in this sparse implementation, ms
-and mom will not update in iterations during which the grad is zero.
-
-mean_square = decay * mean_square + (1-decay) * gradient ** 2
-Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
-
-ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-var <- var - mom
-
-var: Should be from a Variable().
-ms: Should be from a Variable().
-mom: Should be from a Variable().
-lr: Scaling factor. Must be a scalar.
-epsilon: Ridge term. Must be a scalar.
-rho: Decay rate. Must be a scalar.
-grad: The gradient.
-out: Same as "var".
-use_locking: If `True`, updating of the var, ms, and mom tensors is protected
-  by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc");
+    });
 
 REGISTER_OP("ApplyCenteredRMSProp")
     .Input("var: Ref(T)")
@@ -1526,41 +803,7 @@ REGISTER_OP("ApplyCenteredRMSProp")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyCenteredRMSPropShapeFn(c, false /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' according to the centered RMSProp algorithm.
-The centered RMSProp algorithm uses an estimate of the centered second moment
-(i.e., the variance) for normalization, as opposed to regular RMSProp, which
-uses the (uncentered) second moment. This often helps with training, but is
-slightly more expensive in terms of computation and memory.
-
-Note that in dense implementation of this algorithm, mg, ms, and mom will
-update even if the grad is zero, but in this sparse implementation, mg, ms,
-and mom will not update in iterations during which the grad is zero.
-
-mean_square = decay * mean_square + (1-decay) * gradient ** 2
-mean_grad = decay * mean_grad + (1-decay) * gradient
-
-Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
-
-mg <- rho * mg_{t-1} + (1-rho) * grad
-ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
-var <- var - mom
-
-var: Should be from a Variable().
-mg: Should be from a Variable().
-ms: Should be from a Variable().
-mom: Should be from a Variable().
-lr: Scaling factor. Must be a scalar.
-epsilon: Ridge term. Must be a scalar.
-rho: Decay rate. Must be a scalar.
-grad: The gradient.
-out: Same as "var".
-use_locking: If `True`, updating of the var, mg, ms, and mom tensors is
-  protected by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc");
+    });
 
 REGISTER_OP("SparseApplyRMSProp")
     .Input("var: Ref(T)")
@@ -1578,33 +821,7 @@ REGISTER_OP("SparseApplyRMSProp")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyRMSPropShapeFn(c, true /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' according to the RMSProp algorithm.
-Note that in dense implementation of this algorithm, ms and mom will
-update even if the grad is zero, but in this sparse implementation, ms
-and mom will not update in iterations during which the grad is zero.
-
-mean_square = decay * mean_square + (1-decay) * gradient ** 2
-Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
-
-ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-var <- var - mom
-
-var: Should be from a Variable().
-ms: Should be from a Variable().
-mom: Should be from a Variable().
-lr: Scaling factor. Must be a scalar.
-epsilon: Ridge term. Must be a scalar.
-rho: Decay rate. Must be a scalar.
-grad: The gradient.
-indices: A vector of indices into the first dimension of var, ms and mom.
-out: Same as "var".
-use_locking: If `True`, updating of the var, ms, and mom tensors is protected
-  by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc");
+    });
 
 REGISTER_OP("SparseApplyCenteredRMSProp")
     .Input("var: Ref(T)")
@@ -1623,40 +840,7 @@ REGISTER_OP("SparseApplyCenteredRMSProp")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyCenteredRMSPropShapeFn(c, true /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' according to the centered RMSProp algorithm.
-The centered RMSProp algorithm uses an estimate of the centered second moment
-(i.e., the variance) for normalization, as opposed to regular RMSProp, which
-uses the (uncentered) second moment. This often helps with training, but is
-slightly more expensive in terms of computation and memory.
-
-Note that in dense implementation of this algorithm, mg, ms, and mom will
-update even if the grad is zero, but in this sparse implementation, mg, ms,
-and mom will not update in iterations during which the grad is zero.
-
-mean_square = decay * mean_square + (1-decay) * gradient ** 2
-mean_grad = decay * mean_grad + (1-decay) * gradient
-Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
-
-ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-var <- var - mom
-
-var: Should be from a Variable().
-mg: Should be from a Variable().
-ms: Should be from a Variable().
-mom: Should be from a Variable().
-lr: Scaling factor. Must be a scalar.
-epsilon: Ridge term. Must be a scalar.
-rho: Decay rate. Must be a scalar.
-grad: The gradient.
-indices: A vector of indices into the first dimension of var, ms and mom.
-out: Same as "var".
-use_locking: If `True`, updating of the var, mg, ms, and mom tensors is
-  protected by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc");
+    });
 
 REGISTER_OP("ResourceApplyRMSProp")
     .Input("var: resource")
@@ -1671,31 +855,7 @@ REGISTER_OP("ResourceApplyRMSProp")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyRMSPropShapeFn(c, false /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' according to the RMSProp algorithm.
-Note that in dense implementation of this algorithm, ms and mom will
-update even if the grad is zero, but in this sparse implementation, ms
-and mom will not update in iterations during which the grad is zero.
-
-mean_square = decay * mean_square + (1-decay) * gradient ** 2
-Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
-
-ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-var <- var - mom
-
-var: Should be from a Variable().
-ms: Should be from a Variable().
-mom: Should be from a Variable().
-lr: Scaling factor. Must be a scalar.
-epsilon: Ridge term. Must be a scalar.
-rho: Decay rate. Must be a scalar.
-grad: The gradient.
-use_locking: If `True`, updating of the var, ms, and mom tensors is protected
-  by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc");
+    });
 
 REGISTER_OP("ResourceApplyCenteredRMSProp")
     .Input("var: resource")
@@ -1711,40 +871,7 @@ REGISTER_OP("ResourceApplyCenteredRMSProp")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyCenteredRMSPropShapeFn(c, false /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' according to the centered RMSProp algorithm.
-The centered RMSProp algorithm uses an estimate of the centered second moment
-(i.e., the variance) for normalization, as opposed to regular RMSProp, which
-uses the (uncentered) second moment. This often helps with training, but is
-slightly more expensive in terms of computation and memory.
-
-Note that in dense implementation of this algorithm, mg, ms, and mom will
-update even if the grad is zero, but in this sparse implementation, mg, ms,
-and mom will not update in iterations during which the grad is zero.
-
-mean_square = decay * mean_square + (1-decay) * gradient ** 2
-mean_grad = decay * mean_grad + (1-decay) * gradient
-
-Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
-
-mg <- rho * mg_{t-1} + (1-rho) * grad
-ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
-var <- var - mom
-
-var: Should be from a Variable().
-mg: Should be from a Variable().
-ms: Should be from a Variable().
-mom: Should be from a Variable().
-lr: Scaling factor. Must be a scalar.
-epsilon: Ridge term. Must be a scalar.
-rho: Decay rate. Must be a scalar.
-grad: The gradient.
-use_locking: If `True`, updating of the var, mg, ms, and mom tensors is
-  protected by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc");
+    });
 
 REGISTER_OP("ResourceSparseApplyRMSProp")
     .Input("var: resource")
@@ -1761,32 +888,7 @@ REGISTER_OP("ResourceSparseApplyRMSProp")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyRMSPropShapeFn(c, true /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' according to the RMSProp algorithm.
-Note that in dense implementation of this algorithm, ms and mom will
-update even if the grad is zero, but in this sparse implementation, ms
-and mom will not update in iterations during which the grad is zero.
-
-mean_square = decay * mean_square + (1-decay) * gradient ** 2
-Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
-
-ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-var <- var - mom
-
-var: Should be from a Variable().
-ms: Should be from a Variable().
-mom: Should be from a Variable().
-lr: Scaling factor. Must be a scalar.
-epsilon: Ridge term. Must be a scalar.
-rho: Decay rate. Must be a scalar.
-grad: The gradient.
-indices: A vector of indices into the first dimension of var, ms and mom.
-use_locking: If `True`, updating of the var, ms, and mom tensors is protected
-  by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc");
+    });
 
 REGISTER_OP("ResourceSparseApplyCenteredRMSProp")
     .Input("var: resource")
@@ -1804,39 +906,7 @@ REGISTER_OP("ResourceSparseApplyCenteredRMSProp")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyCenteredRMSPropShapeFn(c, true /* sparse */);
-    })
-    .Doc(R"doc(
-Update '*var' according to the centered RMSProp algorithm.
-The centered RMSProp algorithm uses an estimate of the centered second moment
-(i.e., the variance) for normalization, as opposed to regular RMSProp, which
-uses the (uncentered) second moment. This often helps with training, but is
-slightly more expensive in terms of computation and memory.
-
-Note that in dense implementation of this algorithm, mg, ms, and mom will
-update even if the grad is zero, but in this sparse implementation, mg, ms,
-and mom will not update in iterations during which the grad is zero.
-
-mean_square = decay * mean_square + (1-decay) * gradient ** 2
-mean_grad = decay * mean_grad + (1-decay) * gradient
-Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
-
-ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-var <- var - mom
-
-var: Should be from a Variable().
-mg: Should be from a Variable().
-ms: Should be from a Variable().
-mom: Should be from a Variable().
-lr: Scaling factor. Must be a scalar.
-epsilon: Ridge term. Must be a scalar.
-rho: Decay rate. Must be a scalar.
-grad: The gradient.
-indices: A vector of indices into the first dimension of var, ms and mom.
-use_locking: If `True`, updating of the var, mg, ms, and mom tensors is
-  protected by a lock; otherwise the behavior is undefined, but may exhibit less
-  contention.
-)doc");
+    });
 
 static Status ApplyAddSignShapeFn(InferenceContext* c, bool sparse) {
   ShapeHandle unused;
@@ -1867,8 +937,7 @@ REGISTER_OP("ApplyAddSign")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAddSignShapeFn(c, /*sparse=*/false);
-    })
-    .Doc(strings::StrCat(kAddSignCommonDocStr, kOutDocStr, kLockDocStr));
+    });
 
 REGISTER_OP("ResourceApplyAddSign")
     .Input("var: resource")
@@ -1882,8 +951,7 @@ REGISTER_OP("ResourceApplyAddSign")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAddSignShapeFn(c, /*sparse=*/false);
-    })
-    .Doc(strings::StrCat(kAddSignCommonDocStr, kLockDocStr));
+    });
 
 static Status ApplyPowerSignShapeFn(InferenceContext* c, bool sparse) {
   ShapeHandle unused;
@@ -1914,8 +982,7 @@ REGISTER_OP("ApplyPowerSign")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyPowerSignShapeFn(c, /*sparse=*/false);
-    })
-    .Doc(strings::StrCat(kPowerSignCommonDocStr, kOutDocStr, kLockDocStr));
+    });
 
 REGISTER_OP("ResourceApplyPowerSign")
     .Input("var: resource")
@@ -1929,8 +996,6 @@ REGISTER_OP("ResourceApplyPowerSign")
     .Attr("use_locking: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyPowerSignShapeFn(c, /*sparse=*/false);
-    })
-    .Doc(strings::StrCat(kPowerSignCommonDocStr, kLockDocStr));
-
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/training_ops_test.cc b/tensorflow/core/ops/training_ops_test.cc
index de4e3cd9e70014ea9b29d4d473d94c0abb52eabc..0f309c1f4e956c98b6f20fa3b6c810116a2b339c 100644
--- a/tensorflow/core/ops/training_ops_test.cc
+++ b/tensorflow/core/ops/training_ops_test.cc
@@ -24,7 +24,7 @@ static void TestGradAndIndicesErrorHandling(const ShapeInferenceTestOp& op,
                                             string shape_spec_middle,
                                             const string& shape_spec_end = "") {
   auto shape_spec = [&shape_spec_middle, shape_spec_end](
-      const char* var_spec, const char* grad_indices_spec) {
+                        const char* var_spec, const char* grad_indices_spec) {
     return strings::StrCat(var_spec, ";", shape_spec_middle, ";",
                            grad_indices_spec, shape_spec_end);
   };
diff --git a/tensorflow/core/ops/word2vec_ops.cc b/tensorflow/core/ops/word2vec_ops.cc
index b6acc2213c3a2ca1669d6f055c6acefc78de79c3..ed685dcf0ae9a3c61a1db491751f7de4e981300d 100644
--- a/tensorflow/core/ops/word2vec_ops.cc
+++ b/tensorflow/core/ops/word2vec_ops.cc
@@ -33,25 +33,7 @@ REGISTER_OP("Skipgram")
     .Attr("batch_size: int")
     .Attr("window_size: int = 5")
     .Attr("min_count: int = 5")
-    .Attr("subsample: float = 1e-3")
-    .Doc(R"doc(
-Parses a text file and creates a batch of examples.
-
-vocab_word: A vector of words in the corpus.
-vocab_freq: Frequencies of words. Sorted in the non-ascending order.
-words_per_epoch: Number of words per epoch in the data file.
-current_epoch: The current epoch number.
-total_words_processed: The total number of words processed so far.
-examples: A vector of word ids.
-labels: A vector of word ids.
-filename: The corpus's text file name.
-batch_size: The size of produced batch.
-window_size: The number of words to predict to the left and right of the target.
-min_count: The minimum number of word occurrences for it to be included in the
-    vocabulary.
-subsample: Threshold for word occurrence. Words that appear with higher
-    frequency will be randomly down-sampled. Set to 0 to disable.
-)doc");
+    .Attr("subsample: float = 1e-3");
 
 REGISTER_OP("NegTrain")
     .Deprecated(19,
@@ -64,16 +46,6 @@ REGISTER_OP("NegTrain")
     .Input("lr: float")
     .SetIsStateful()
     .Attr("vocab_count: list(int)")
-    .Attr("num_negative_samples: int")
-    .Doc(R"doc(
-Training via negative sampling.
-
-w_in: input word embedding.
-w_out: output word embedding.
-examples: A vector of word ids.
-labels: A vector of word ids.
-vocab_count: Count of words in the vocabulary.
-num_negative_samples: Number of negative samples per example.
-)doc");
+    .Attr("num_negative_samples: int");
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD
index 624145da75194fac7f859d4df0f6f51fe7ac5eff..9ba25dea4fb278cbfaf4080e21beef8a3e9de769 100644
--- a/tensorflow/core/platform/cloud/BUILD
+++ b/tensorflow/core/platform/cloud/BUILD
@@ -10,6 +10,8 @@ licenses(["notice"])  # Apache 2.0
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
+    "tf_copts",
+    "if_windows",
 )
 
 filegroup(
@@ -29,6 +31,7 @@ filegroup(
 cc_library(
     name = "expiring_lru_cache",
     hdrs = ["expiring_lru_cache.h"],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = ["//tensorflow/core:lib"],
 )
@@ -37,6 +40,7 @@ cc_library(
     name = "file_block_cache",
     srcs = ["file_block_cache.cc"],
     hdrs = ["file_block_cache.h"],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = ["//tensorflow/core:lib"],
 )
@@ -45,6 +49,7 @@ cc_library(
     name = "gcs_dns_cache",
     srcs = ["gcs_dns_cache.cc"],
     hdrs = ["gcs_dns_cache.h"],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         ":http_request",
@@ -52,10 +57,22 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "gcs_throttle",
+    srcs = ["gcs_throttle.cc"],
+    hdrs = ["gcs_throttle.h"],
+    copts = tf_copts(),
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "gcs_file_system",
     srcs = ["gcs_file_system.cc"],
     hdrs = ["gcs_file_system.h"],
+    copts = tf_copts(),
     linkstatic = 1,  # Needed since alwayslink is broken in bazel b/27630669
     visibility = ["//visibility:public"],
     deps = [
@@ -63,6 +80,7 @@ cc_library(
         ":expiring_lru_cache",
         ":file_block_cache",
         ":gcs_dns_cache",
+        ":gcs_throttle",
         ":google_auth_provider",
         ":http_request",
         ":retrying_file_system",
@@ -78,6 +96,7 @@ cc_library(
 cc_library(
     name = "http_request",
     hdrs = ["http_request.h"],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         "//tensorflow/core:framework_headers_lib",
@@ -89,12 +108,13 @@ cc_library(
     name = "curl_http_request",
     srcs = ["curl_http_request.cc"],
     hdrs = ["curl_http_request.h"],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         ":http_request",
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:lib_internal",
-        "@curl//:curl",
+        "@curl",
     ],
 )
 
@@ -104,13 +124,14 @@ cc_library(
     hdrs = [
         "http_request_fake.h",
     ],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         ":curl_http_request",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:test",
-        "@curl//:curl",
+        "@curl",
     ],
 )
 
@@ -121,6 +142,7 @@ cc_library(
         "auth_provider.h",
         "google_auth_provider.h",
     ],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         ":curl_http_request",
@@ -136,6 +158,7 @@ cc_library(
     name = "now_seconds_env",
     testonly = 1,
     hdrs = ["now_seconds_env.h"],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         "//tensorflow/core:lib",
@@ -151,6 +174,7 @@ cc_library(
     hdrs = [
         "oauth_client.h",
     ],
+    copts = tf_copts(),
     deps = [
         ":curl_http_request",
         ":http_request",
@@ -169,6 +193,7 @@ cc_library(
     hdrs = [
         "retrying_utils.h",
     ],
+    copts = tf_copts(),
     deps = [
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:lib_internal",
@@ -183,6 +208,7 @@ cc_library(
     hdrs = [
         "retrying_file_system.h",
     ],
+    copts = tf_copts(),
     deps = [
         ":retrying_utils",
         "//tensorflow/core:framework_headers_lib",
@@ -198,6 +224,7 @@ cc_library(
     hdrs = [
         "time_util.h",
     ],
+    copts = tf_copts(),
     deps = [
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:lib_internal",
@@ -247,6 +274,7 @@ tf_cc_test(
     name = "gcs_dns_cache_test",
     size = "small",
     srcs = ["gcs_dns_cache_test.cc"],
+    linkopts = if_windows(["-DEFAULTLIB:ws2_32.lib"]),
     deps = [
         ":gcs_dns_cache",
         "//tensorflow/core:lib",
@@ -255,6 +283,19 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "gcs_throttle_test",
+    size = "small",
+    srcs = ["gcs_throttle_test.cc"],
+    linkopts = if_windows(["-DEFAULTLIB:ws2_32.lib"]),
+    deps = [
+        ":gcs_throttle",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 tf_cc_test(
     name = "curl_http_request_test",
     size = "small",
diff --git a/tensorflow/core/platform/cloud/curl_http_request.cc b/tensorflow/core/platform/cloud/curl_http_request.cc
index d01734ba3a649afa73a5fc8ad59a01a7cc6c3088..88a5d1e96dc2fcb7d12e2c0891d2f04d64bac594 100644
--- a/tensorflow/core/platform/cloud/curl_http_request.cc
+++ b/tensorflow/core/platform/cloud/curl_http_request.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <algorithm>
+
 #include "tensorflow/core/platform/cloud/curl_http_request.h"
 
 #include "tensorflow/core/lib/core/errors.h"
@@ -29,16 +31,6 @@ namespace {
 // Set to 1 to enable verbose debug output from curl.
 constexpr uint64 kVerboseOutput = 0;
 
-// Timeout for the whole request. Set only to prevent hanging indefinitely.
-constexpr uint32 kRequestTimeoutSeconds = 3600;  // 1 hour
-
-// Timeout for the connection phase.
-constexpr uint32 kConnectTimeoutSeconds = 120;  // 2 minutes
-
-// The maximum period of request inactivity, after which the request
-// is terminated.
-constexpr uint64 kInactivityTimeoutSeconds = 60;  // 1 minute
-
 // Proxy to the real libcurl implementation.
 class LibCurlProxy : public LibCurl {
  public:
@@ -117,6 +109,10 @@ class LibCurlProxy : public LibCurl {
   }
 
   void curl_free(void* p) override { ::curl_free(p); }
+
+  const char* curl_easy_strerror(CURLcode errornum) override {
+    return ::curl_easy_strerror(errornum);
+  }
 };
 }  // namespace
 
@@ -125,31 +121,9 @@ CurlHttpRequest::CurlHttpRequest() : CurlHttpRequest(LibCurlProxy::Load()) {}
 CurlHttpRequest::CurlHttpRequest(LibCurl* libcurl, Env* env)
     : libcurl_(libcurl), env_(env) {
   default_response_buffer_.reserve(CURL_MAX_WRITE_SIZE);
-}
-
-CurlHttpRequest::~CurlHttpRequest() {
-  if (curl_headers_) {
-    libcurl_->curl_slist_free_all(curl_headers_);
-  }
-  if (resolve_list_) {
-    libcurl_->curl_slist_free_all(resolve_list_);
-  }
-  if (put_body_) {
-    fclose(put_body_);
-  }
-  if (curl_) {
-    libcurl_->curl_easy_cleanup(curl_);
-  }
-}
 
-Status CurlHttpRequest::Init() {
-  if (is_initialized_) {
-    return errors::FailedPrecondition("Already initialized.");
-  }
   curl_ = libcurl_->curl_easy_init();
-  if (!curl_) {
-    return errors::Internal("Couldn't initialize a curl session.");
-  }
+  CHECK(curl_ != nullptr) << "Couldn't initialize a curl session.";
 
   // NOTE: CURL_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt is configured by
   //       default in //third_party:curl.BUILD and can be customized via an
@@ -161,9 +135,6 @@ Status CurlHttpRequest::Init() {
       strings::StrCat("TensorFlow/", TF_VERSION_STRING).c_str());
   // Do not use signals for timeouts - does not work in multi-threaded programs.
   libcurl_->curl_easy_setopt(curl_, CURLOPT_NOSIGNAL, 1L);
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_TIMEOUT, kRequestTimeoutSeconds);
-  libcurl_->curl_easy_setopt(curl_, CURLOPT_CONNECTTIMEOUT,
-                             kConnectTimeoutSeconds);
   libcurl_->curl_easy_setopt(curl_, CURLOPT_HTTP_VERSION,
                              CURL_HTTP_VERSION_2_0);
 
@@ -175,13 +146,22 @@ Status CurlHttpRequest::Init() {
 
   // If response buffer is not set, libcurl will print results to stdout,
   // so we always set it.
-  is_initialized_ = true;
-  auto s = SetResultBuffer(&default_response_buffer_);
-  if (!s.ok()) {
-    is_initialized_ = false;
-    return s;
+  SetResultBuffer(&default_response_buffer_);
+}
+
+CurlHttpRequest::~CurlHttpRequest() {
+  if (curl_headers_) {
+    libcurl_->curl_slist_free_all(curl_headers_);
+  }
+  if (resolve_list_) {
+    libcurl_->curl_slist_free_all(resolve_list_);
+  }
+  if (put_body_) {
+    fclose(put_body_);
+  }
+  if (curl_) {
+    libcurl_->curl_easy_cleanup(curl_);
   }
-  return Status::OK();
 }
 
 string CurlHttpRequest::EscapeString(const string& str) {
@@ -191,64 +171,52 @@ string CurlHttpRequest::EscapeString(const string& str) {
   return out_str;
 }
 
-Status CurlHttpRequest::SetUri(const string& uri) {
-  TF_RETURN_IF_ERROR(CheckInitialized());
-  TF_RETURN_IF_ERROR(CheckNotSent());
+void CurlHttpRequest::SetUri(const string& uri) {
+  CheckNotSent();
   is_uri_set_ = true;
+  uri_ = uri;
   libcurl_->curl_easy_setopt(curl_, CURLOPT_URL, uri.c_str());
-  return Status::OK();
 }
 
-Status CurlHttpRequest::SetRange(uint64 start, uint64 end) {
-  TF_RETURN_IF_ERROR(CheckInitialized());
-  TF_RETURN_IF_ERROR(CheckNotSent());
+void CurlHttpRequest::SetRange(uint64 start, uint64 end) {
+  CheckNotSent();
   libcurl_->curl_easy_setopt(curl_, CURLOPT_RANGE,
                              strings::StrCat(start, "-", end).c_str());
-  return Status::OK();
 }
 
-Status CurlHttpRequest::AddHeader(const string& name, const string& value) {
-  TF_RETURN_IF_ERROR(CheckInitialized());
-  TF_RETURN_IF_ERROR(CheckNotSent());
+void CurlHttpRequest::AddHeader(const string& name, const string& value) {
+  CheckNotSent();
   curl_headers_ = libcurl_->curl_slist_append(
       curl_headers_, strings::StrCat(name, ": ", value).c_str());
-  return Status::OK();
 }
 
-Status CurlHttpRequest::AddResolveOverride(const string& hostname, int64 port,
-                                           const string& ip_addr) {
-  TF_RETURN_IF_ERROR(CheckInitialized());
-  TF_RETURN_IF_ERROR(CheckNotSent());
+void CurlHttpRequest::AddResolveOverride(const string& hostname, int64 port,
+                                         const string& ip_addr) {
+  CheckNotSent();
   // Resolve values are hostname:port:IP.add.ress
   resolve_list_ = libcurl_->curl_slist_append(
       resolve_list_,
       strings::StrCat(hostname, ":", port, ":", ip_addr).c_str());
-  return Status::OK();
 }
 
-Status CurlHttpRequest::AddAuthBearerHeader(const string& auth_token) {
-  TF_RETURN_IF_ERROR(CheckInitialized());
-  TF_RETURN_IF_ERROR(CheckNotSent());
+void CurlHttpRequest::AddAuthBearerHeader(const string& auth_token) {
+  CheckNotSent();
   if (!auth_token.empty()) {
-    return AddHeader("Authorization", strings::StrCat("Bearer ", auth_token));
+    AddHeader("Authorization", strings::StrCat("Bearer ", auth_token));
   }
-  return Status::OK();
 }
 
-Status CurlHttpRequest::SetDeleteRequest() {
-  TF_RETURN_IF_ERROR(CheckInitialized());
-  TF_RETURN_IF_ERROR(CheckNotSent());
-  TF_RETURN_IF_ERROR(CheckMethodNotSet());
+void CurlHttpRequest::SetDeleteRequest() {
+  CheckNotSent();
+  CheckMethodNotSet();
   is_method_set_ = true;
   libcurl_->curl_easy_setopt(curl_, CURLOPT_CUSTOMREQUEST, "DELETE");
-  return Status::OK();
 }
 
 Status CurlHttpRequest::SetPutFromFile(const string& body_filepath,
                                        size_t offset) {
-  TF_RETURN_IF_ERROR(CheckInitialized());
-  TF_RETURN_IF_ERROR(CheckNotSent());
-  TF_RETURN_IF_ERROR(CheckMethodNotSet());
+  CheckNotSent();
+  CheckMethodNotSet();
   is_method_set_ = true;
   if (put_body_) {
     fclose(put_body_);
@@ -272,10 +240,9 @@ Status CurlHttpRequest::SetPutFromFile(const string& body_filepath,
   return Status::OK();
 }
 
-Status CurlHttpRequest::SetPutEmptyBody() {
-  TF_RETURN_IF_ERROR(CheckInitialized());
-  TF_RETURN_IF_ERROR(CheckNotSent());
-  TF_RETURN_IF_ERROR(CheckMethodNotSet());
+void CurlHttpRequest::SetPutEmptyBody() {
+  CheckNotSent();
+  CheckMethodNotSet();
   is_method_set_ = true;
   libcurl_->curl_easy_setopt(curl_, CURLOPT_PUT, 1);
   curl_headers_ =
@@ -284,13 +251,11 @@ Status CurlHttpRequest::SetPutEmptyBody() {
                              reinterpret_cast<void*>(this));
   libcurl_->curl_easy_setopt(curl_, CURLOPT_READFUNCTION,
                              &CurlHttpRequest::ReadCallback);
-  return Status::OK();
 }
 
-Status CurlHttpRequest::SetPostFromBuffer(const char* buffer, size_t size) {
-  TF_RETURN_IF_ERROR(CheckInitialized());
-  TF_RETURN_IF_ERROR(CheckNotSent());
-  TF_RETURN_IF_ERROR(CheckMethodNotSet());
+void CurlHttpRequest::SetPostFromBuffer(const char* buffer, size_t size) {
+  CheckNotSent();
+  CheckMethodNotSet();
   is_method_set_ = true;
   curl_headers_ = libcurl_->curl_slist_append(
       curl_headers_, strings::StrCat("Content-Length: ", size).c_str());
@@ -300,13 +265,11 @@ Status CurlHttpRequest::SetPostFromBuffer(const char* buffer, size_t size) {
   libcurl_->curl_easy_setopt(curl_, CURLOPT_READFUNCTION,
                              &CurlHttpRequest::ReadCallback);
   post_body_buffer_ = StringPiece(buffer, size);
-  return Status::OK();
 }
 
-Status CurlHttpRequest::SetPostEmptyBody() {
-  TF_RETURN_IF_ERROR(CheckInitialized());
-  TF_RETURN_IF_ERROR(CheckNotSent());
-  TF_RETURN_IF_ERROR(CheckMethodNotSet());
+void CurlHttpRequest::SetPostEmptyBody() {
+  CheckNotSent();
+  CheckMethodNotSet();
   is_method_set_ = true;
   libcurl_->curl_easy_setopt(curl_, CURLOPT_POST, 1);
   curl_headers_ =
@@ -315,15 +278,11 @@ Status CurlHttpRequest::SetPostEmptyBody() {
                              reinterpret_cast<void*>(this));
   libcurl_->curl_easy_setopt(curl_, CURLOPT_READFUNCTION,
                              &CurlHttpRequest::ReadCallback);
-  return Status::OK();
 }
 
-Status CurlHttpRequest::SetResultBuffer(std::vector<char>* out_buffer) {
-  TF_RETURN_IF_ERROR(CheckInitialized());
-  TF_RETURN_IF_ERROR(CheckNotSent());
-  if (!out_buffer) {
-    return errors::InvalidArgument("out_buffer cannot be null");
-  }
+void CurlHttpRequest::SetResultBuffer(std::vector<char>* out_buffer) {
+  CheckNotSent();
+  CHECK(out_buffer != nullptr);
 
   out_buffer->clear();
   response_buffer_ = out_buffer;
@@ -332,7 +291,67 @@ Status CurlHttpRequest::SetResultBuffer(std::vector<char>* out_buffer) {
                              reinterpret_cast<void*>(this));
   libcurl_->curl_easy_setopt(curl_, CURLOPT_WRITEFUNCTION,
                              &CurlHttpRequest::WriteCallback);
-  return Status::OK();
+}
+
+void CurlHttpRequest::SetResultBufferDirect(char* buffer, size_t size) {
+  CHECK(buffer != nullptr);
+  CheckNotSent();
+
+  direct_response_ = DirectResponseState{buffer, size, 0};
+
+  libcurl_->curl_easy_setopt(curl_, CURLOPT_WRITEDATA,
+                             reinterpret_cast<void*>(this));
+  libcurl_->curl_easy_setopt(curl_, CURLOPT_WRITEFUNCTION,
+                             &CurlHttpRequest::WriteCallbackDirect);
+}
+
+bool CurlHttpRequest::IsDirectResponse() const {
+  return direct_response_.buffer_ != nullptr;
+}
+
+size_t CurlHttpRequest::WriteCallbackDirect(const void* ptr, size_t size,
+                                            size_t nmemb, void* userdata) {
+  CHECK(ptr != nullptr);
+  auto that = reinterpret_cast<CurlHttpRequest*>(userdata);
+  DirectResponseState* state = &that->direct_response_;
+  CHECK(state->buffer_ != nullptr);
+  CHECK(state->bytes_transferred_ <= state->buffer_size_);
+
+  size_t curl_bytes_received = size * nmemb;
+  size_t user_buffer_bytes_available =
+      state->buffer_size_ - state->bytes_transferred_;
+
+  // The HTTP server may send a response body that is longer than what we
+  // expected. We must not use CHECK() for this situation, because that would
+  // imply a code bug (in this client code) where none exists; the violation of
+  // expectations would have been caused by the server, not the client. So we
+  // report a log warning, if an HTTP server is misbehaving.
+  if (curl_bytes_received > user_buffer_bytes_available) {
+    LOG(WARNING) << "The HTTP response body that we received is longer than we "
+                    "requested or expected. "
+                 << "Total bytes requested: " << state->buffer_size_
+                 << " Bytes received (so far) in HTTP response body: "
+                 << (state->bytes_transferred_ + curl_bytes_received);
+  }
+
+  size_t bytes_to_copy =
+      std::min<size_t>(curl_bytes_received, user_buffer_bytes_available);
+  memcpy(&state->buffer_[state->bytes_transferred_], ptr, bytes_to_copy);
+  state->bytes_transferred_ += bytes_to_copy;
+  return bytes_to_copy;
+}
+
+size_t CurlHttpRequest::GetResultBufferDirectBytesTransferred() {
+  CHECK(direct_response_.buffer_ != nullptr);
+  return direct_response_.bytes_transferred_;
+}
+
+void CurlHttpRequest::SetTimeouts(uint32 connection, uint32 inactivity,
+                                  uint32 total) {
+  CheckNotSent();
+  connect_timeout_secs_ = connection;
+  inactivity_timeout_secs_ = inactivity;
+  request_timeout_secs_ = total;
 }
 
 size_t CurlHttpRequest::WriteCallback(const void* ptr, size_t size,
@@ -381,12 +400,11 @@ size_t CurlHttpRequest::HeaderCallback(const void* ptr, size_t size,
 }
 
 Status CurlHttpRequest::Send() {
-  TF_RETURN_IF_ERROR(CheckInitialized());
-  TF_RETURN_IF_ERROR(CheckNotSent());
+  CheckNotSent();
+  CHECK(is_uri_set_) << "URI has not been set.";
+
   is_sent_ = true;
-  if (!is_uri_set_) {
-    return errors::FailedPrecondition("URI has not been set.");
-  }
+
   if (curl_headers_) {
     libcurl_->curl_easy_setopt(curl_, CURLOPT_HTTPHEADER, curl_headers_);
   }
@@ -398,6 +416,10 @@ Status CurlHttpRequest::Send() {
   libcurl_->curl_easy_setopt(curl_, CURLOPT_HEADERFUNCTION,
                              &CurlHttpRequest::HeaderCallback);
 
+  libcurl_->curl_easy_setopt(curl_, CURLOPT_TIMEOUT, request_timeout_secs_);
+  libcurl_->curl_easy_setopt(curl_, CURLOPT_CONNECTTIMEOUT,
+                             connect_timeout_secs_);
+
   char error_buffer[CURL_ERROR_SIZE] = {0};
   libcurl_->curl_easy_setopt(curl_, CURLOPT_ERRORBUFFER, error_buffer);
 
@@ -413,6 +435,8 @@ Status CurlHttpRequest::Send() {
       ", error code ", curl_result, ", error message '", error_buffer, "')");
 
   Status result;
+  StringPiece response = GetResponse();
+  string extended_error_message;
   switch (response_code_) {
     // The group of response codes indicating that the request achieved
     // the expected goal.
@@ -445,7 +469,15 @@ Status CurlHttpRequest::Send() {
     // PERMISSION_DENIED indicates an authentication or an authorization issue.
     case 401:  // Unauthorized
     case 403:  // Forbidden
-      result = errors::PermissionDenied(error_message);
+      if (!response.empty()) {
+        extended_error_message = strings::StrCat(
+            error_message, ", response ",
+            response.substr(
+                0, std::min(response.size(), response_to_error_limit_)));
+        result = errors::PermissionDenied(extended_error_message);
+      } else {
+        result = errors::PermissionDenied(error_message);
+      }
       break;
 
     // NOT_FOUND indicates that the requested resource does not exist.
@@ -484,25 +516,23 @@ Status CurlHttpRequest::Send() {
   return result;
 }
 
-Status CurlHttpRequest::CheckInitialized() const {
-  if (!is_initialized_) {
-    return errors::FailedPrecondition("The object has not been initialized.");
-  }
-  return Status::OK();
+void CurlHttpRequest::CheckMethodNotSet() const {
+  CHECK(!is_method_set_) << "HTTP method has been already set.";
 }
 
-Status CurlHttpRequest::CheckMethodNotSet() const {
-  if (is_method_set_) {
-    return errors::FailedPrecondition("HTTP method has been already set.");
-  }
-  return Status::OK();
+void CurlHttpRequest::CheckNotSent() const {
+  CHECK(!is_sent_) << "The request has already been sent.";
 }
 
-Status CurlHttpRequest::CheckNotSent() const {
-  if (is_sent_) {
-    return errors::FailedPrecondition("The request has already been sent.");
+StringPiece CurlHttpRequest::GetResponse() const {
+  StringPiece response;
+  if (IsDirectResponse()) {
+    response = StringPiece(direct_response_.buffer_,
+                           direct_response_.bytes_transferred_);
+  } else {
+    response = StringPiece(response_buffer_->data(), response_buffer_->size());
   }
-  return Status::OK();
+  return response;
 }
 
 string CurlHttpRequest::GetResponseHeader(const string& name) const {
@@ -528,12 +558,37 @@ int CurlHttpRequest::ProgressCallback(void* this_object, curl_off_t dltotal,
     return 0;
   }
 
-  if (now - that->last_progress_timestamp_ > kInactivityTimeoutSeconds) {
+  if (now - that->last_progress_timestamp_ > that->inactivity_timeout_secs_) {
+    double lookup_time = -1;
+    const auto lookup_time_status = that->libcurl_->curl_easy_getinfo(
+        that->curl_, CURLINFO_NAMELOOKUP_TIME, &lookup_time);
+
+    double connect_time = -1;
+    const auto connect_time_status = that->libcurl_->curl_easy_getinfo(
+        that->curl_, CURLINFO_CONNECT_TIME, &connect_time);
+
+    double pretransfer_time = -1;
+    const auto pretransfer_time_status = that->libcurl_->curl_easy_getinfo(
+        that->curl_, CURLINFO_PRETRANSFER_TIME, &pretransfer_time);
+
+    double starttransfer_time = -1;
+    const auto starttransfer_time_status = that->libcurl_->curl_easy_getinfo(
+        that->curl_, CURLINFO_PRETRANSFER_TIME, &starttransfer_time);
+
     LOG(ERROR) << "The transmission  of request " << this_object
-               << " has been stuck at " << current_progress << " of "
-               << dltotal + ultotal << " bytes for "
-               << now - that->last_progress_timestamp_
-               << " seconds and will be aborted.";
+               << " (URI: " << that->uri_ << ") has been stuck at "
+               << current_progress << " of " << dltotal + ultotal
+               << " bytes for " << now - that->last_progress_timestamp_
+               << " seconds and will be aborted. CURL timing information: "
+               << "lookup time: " << lookup_time << " ("
+               << that->libcurl_->curl_easy_strerror(lookup_time_status)
+               << "), connect time: " << connect_time << " ("
+               << that->libcurl_->curl_easy_strerror(connect_time_status)
+               << "), pre-transfer time: " << pretransfer_time << " ("
+               << that->libcurl_->curl_easy_strerror(pretransfer_time_status)
+               << "), start-transfer time: " << starttransfer_time << " ("
+               << that->libcurl_->curl_easy_strerror(starttransfer_time_status)
+               << ")";
     return 1;  // Will abort the request.
   }
 
diff --git a/tensorflow/core/platform/cloud/curl_http_request.h b/tensorflow/core/platform/cloud/curl_http_request.h
index 2396593d6de015d7e002cc59a5ca12a092ab6e86..cfa26f2b795a6cc33aba308597c77088362f1e1b 100644
--- a/tensorflow/core/platform/cloud/curl_http_request.h
+++ b/tensorflow/core/platform/cloud/curl_http_request.h
@@ -57,28 +57,26 @@ class CurlHttpRequest : public HttpRequest {
   CurlHttpRequest(LibCurl* libcurl, Env* env);
   ~CurlHttpRequest() override;
 
-  Status Init() override;
-
   /// Sets the request URI.
-  Status SetUri(const string& uri) override;
+  void SetUri(const string& uri) override;
 
   /// \brief Sets the Range header.
   ///
   /// Used for random seeks, for example "0-999" returns the first 1000 bytes
   /// (note that the right border is included).
-  Status SetRange(uint64 start, uint64 end) override;
+  void SetRange(uint64 start, uint64 end) override;
 
   /// Sets a request header.
-  Status AddHeader(const string& name, const string& value) override;
+  void AddHeader(const string& name, const string& value) override;
 
-  Status AddResolveOverride(const string& hostname, int64 port,
-                            const string& ip_addr) override;
+  void AddResolveOverride(const string& hostname, int64 port,
+                          const string& ip_addr) override;
 
   /// Sets the 'Authorization' header to the value of 'Bearer ' + auth_token.
-  Status AddAuthBearerHeader(const string& auth_token) override;
+  void AddAuthBearerHeader(const string& auth_token) override;
 
   /// Makes the request a DELETE request.
-  Status SetDeleteRequest() override;
+  void SetDeleteRequest() override;
 
   /// \brief Makes the request a PUT request.
   ///
@@ -87,21 +85,44 @@ class CurlHttpRequest : public HttpRequest {
   Status SetPutFromFile(const string& body_filepath, size_t offset) override;
 
   /// Makes the request a PUT request with an empty body.
-  Status SetPutEmptyBody() override;
+  void SetPutEmptyBody() override;
 
   /// \brief Makes the request a POST request.
   ///
   /// The request body will be taken from the specified buffer.
-  Status SetPostFromBuffer(const char* buffer, size_t size) override;
+  void SetPostFromBuffer(const char* buffer, size_t size) override;
 
   /// Makes the request a POST request with an empty body.
-  Status SetPostEmptyBody() override;
+  void SetPostEmptyBody() override;
 
   /// \brief Specifies the buffer for receiving the response body.
   ///
   /// Size of out_buffer after an access will be exactly the number of bytes
   /// read. Existing content of the vector will be cleared.
-  Status SetResultBuffer(std::vector<char>* out_buffer) override;
+  void SetResultBuffer(std::vector<char>* out_buffer) override;
+
+  /// \brief Specifies the buffer for receiving the response body, when the
+  /// caller knows the maximum size of the response body.
+  ///
+  /// This method allows the caller to receive the response body without an
+  /// additional intermediate buffer allocation and copy.  This method should
+  /// be called before calling Send(). After Send() has succeeded, the caller
+  /// should use the GetResultBufferDirectBytesTransferred() method in order
+  /// to learn how many bytes were transferred.
+  ///
+  /// Using this method is mutually exclusive with using SetResultBuffer().
+  void SetResultBufferDirect(char* buffer, size_t size) override;
+
+  /// \brief Distinguish response type (direct vs. implicit).
+  bool IsDirectResponse() const;
+
+  /// \brief Returns the number of bytes (of the response body) that were
+  /// transferred, when using the SetResultBufferDirect() method. The returned
+  /// value will always be less than or equal to the 'size' parameter that
+  /// was passed to SetResultBufferDirect(). If the actual HTTP response body
+  /// was greater than 'size' bytes, then this transfer method will only copy
+  /// the first 'size' bytes, and the rest will be ignored.
+  size_t GetResultBufferDirectBytesTransferred() override;
 
   /// \brief Returns the response headers of a completed request.
   ///
@@ -120,10 +141,16 @@ class CurlHttpRequest : public HttpRequest {
   // Url encodes str and returns a new string.
   string EscapeString(const string& str) override;
 
+  void SetTimeouts(uint32 connection, uint32 inactivity, uint32 total) override;
+
  private:
   /// A write callback in the form which can be accepted by libcurl.
   static size_t WriteCallback(const void* ptr, size_t size, size_t nmemb,
                               void* userdata);
+
+  /// Processes response body content received when using SetResultBufferDirect.
+  static size_t WriteCallbackDirect(const void* ptr, size_t size, size_t nmemb,
+                                    void* userdata);
   /// A read callback in the form which can be accepted by libcurl.
   static size_t ReadCallback(void* ptr, size_t size, size_t nmemb,
                              FILE* userdata);
@@ -134,9 +161,9 @@ class CurlHttpRequest : public HttpRequest {
   static int ProgressCallback(void* this_object, curl_off_t dltotal,
                               curl_off_t dlnow, curl_off_t ultotal,
                               curl_off_t ulnow);
-  Status CheckInitialized() const;
-  Status CheckMethodNotSet() const;
-  Status CheckNotSent() const;
+  void CheckMethodNotSet() const;
+  void CheckNotSent() const;
+  StringPiece GetResponse() const;
 
   LibCurl* libcurl_;
   Env* env_;
@@ -147,6 +174,14 @@ class CurlHttpRequest : public HttpRequest {
   size_t post_body_read_ = 0;
 
   std::vector<char>* response_buffer_ = nullptr;
+
+  struct DirectResponseState {
+    char* buffer_;
+    size_t buffer_size_;
+    size_t bytes_transferred_;
+  };
+  DirectResponseState direct_response_ = {};
+
   CURL* curl_ = nullptr;
   curl_slist* curl_headers_ = nullptr;
   curl_slist* resolve_list_ = nullptr;
@@ -162,12 +197,26 @@ class CurlHttpRequest : public HttpRequest {
   // The last progress in terms of bytes transmitted.
   curl_off_t last_progress_bytes_ = 0;
 
+  // The maximum period of request inactivity.
+  uint32 inactivity_timeout_secs_ = 60;  // 1 minute
+
+  // Timeout for the connection phase.
+  uint32 connect_timeout_secs_ = 120;  // 2 minutes
+
+  // Tiemout for the whole request. Set only to prevent hanging indefinitely.
+  uint32 request_timeout_secs_ = 3600;  // 1 hour
+
   // Members to enforce the usage flow.
-  bool is_initialized_ = false;
   bool is_uri_set_ = false;
   bool is_method_set_ = false;
   bool is_sent_ = false;
 
+  // Store the URI to help disambiguate requests when errors occur.
+  string uri_;
+
+  // Limit the size of a http response that is copied into an error message.
+  const size_t response_to_error_limit_ = 500;
+
   TF_DISALLOW_COPY_AND_ASSIGN(CurlHttpRequest);
 };
 
@@ -205,6 +254,8 @@ class LibCurl {
   virtual void curl_slist_free_all(curl_slist* list) = 0;
   virtual char* curl_easy_escape(CURL* curl, const char* str, int length) = 0;
   virtual void curl_free(void* p) = 0;
+
+  virtual const char* curl_easy_strerror(CURLcode errornum) = 0;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/curl_http_request_test.cc b/tensorflow/core/platform/cloud/curl_http_request_test.cc
index 6c0f0818527fdc2610d2f54a965db23a636a98c7..86d26a028733c303b85390b0be8fb8808c6e082a 100644
--- a/tensorflow/core/platform/cloud/curl_http_request_test.cc
+++ b/tensorflow/core/platform/cloud/curl_http_request_test.cc
@@ -219,6 +219,10 @@ class FakeLibCurl : public LibCurl {
   }
   void curl_free(void* p) override { port::Free(p); }
 
+  const char* curl_easy_strerror(CURLcode errornum) override {
+    return "<unimplemented>";
+  }
+
   // Variables defining the behavior of this fake.
   string response_content_;
   uint64 response_code_;
@@ -259,17 +263,15 @@ class FakeLibCurl : public LibCurl {
 TEST(CurlHttpRequestTest, GetRequest) {
   FakeLibCurl libcurl("get response", 200);
   CurlHttpRequest http_request(&libcurl);
-  TF_EXPECT_OK(http_request.Init());
 
   std::vector<char> scratch;
   scratch.insert(scratch.begin(), kTestContent.begin(), kTestContent.end());
-  StringPiece result;
   scratch.reserve(100);
 
-  TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
-  TF_EXPECT_OK(http_request.AddAuthBearerHeader("fake-bearer"));
-  TF_EXPECT_OK(http_request.SetRange(100, 199));
-  TF_EXPECT_OK(http_request.SetResultBuffer(&scratch));
+  http_request.SetUri("http://www.testuri.com");
+  http_request.AddAuthBearerHeader("fake-bearer");
+  http_request.SetRange(100, 199);
+  http_request.SetResultBuffer(&scratch);
   TF_EXPECT_OK(http_request.Send());
 
   EXPECT_EQ("get response", string(scratch.begin(), scratch.end()));
@@ -285,18 +287,48 @@ TEST(CurlHttpRequestTest, GetRequest) {
   EXPECT_EQ(200, http_request.GetResponseCode());
 }
 
+TEST(CurlHttpRequestTest, GetRequest_Direct) {
+  FakeLibCurl libcurl("get response", 200);
+  CurlHttpRequest http_request(&libcurl);
+
+  std::vector<char> scratch(100, 0);
+
+  http_request.SetUri("http://www.testuri.com");
+  http_request.AddAuthBearerHeader("fake-bearer");
+  http_request.SetRange(100, 199);
+  http_request.SetResultBufferDirect(scratch.data(), scratch.capacity());
+  TF_EXPECT_OK(http_request.Send());
+
+  string expected_response = "get response";
+  size_t response_bytes_transferred =
+      http_request.GetResultBufferDirectBytesTransferred();
+  EXPECT_EQ(response_bytes_transferred, expected_response.size());
+  EXPECT_EQ(
+      "get response",
+      string(scratch.begin(), scratch.begin() + response_bytes_transferred));
+
+  // Check interactions with libcurl.
+  EXPECT_TRUE(libcurl.is_initialized_);
+  EXPECT_EQ("http://www.testuri.com", libcurl.url_);
+  EXPECT_EQ("100-199", libcurl.range_);
+  EXPECT_EQ("", libcurl.custom_request_);
+  EXPECT_EQ(1, libcurl.headers_->size());
+  EXPECT_EQ("Authorization: Bearer fake-bearer", (*libcurl.headers_)[0]);
+  EXPECT_FALSE(libcurl.is_post_);
+  EXPECT_EQ(200, http_request.GetResponseCode());
+}
+
 TEST(CurlHttpRequestTest, GetRequest_Empty) {
   FakeLibCurl libcurl("", 200);
   CurlHttpRequest http_request(&libcurl);
-  TF_EXPECT_OK(http_request.Init());
 
   std::vector<char> scratch;
   scratch.resize(0);
 
-  TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
-  TF_EXPECT_OK(http_request.AddAuthBearerHeader("fake-bearer"));
-  TF_EXPECT_OK(http_request.SetRange(100, 199));
-  TF_EXPECT_OK(http_request.SetResultBuffer(&scratch));
+  http_request.SetUri("http://www.testuri.com");
+  http_request.AddAuthBearerHeader("fake-bearer");
+  http_request.SetRange(100, 199);
+  http_request.SetResultBuffer(&scratch);
   TF_EXPECT_OK(http_request.Send());
 
   EXPECT_TRUE(scratch.empty());
@@ -316,15 +348,14 @@ TEST(CurlHttpRequestTest, GetRequest_RangeOutOfBound) {
   FakeLibCurl libcurl("get response", 416);
   libcurl.curl_easy_perform_result_ = CURLE_WRITE_ERROR;
   CurlHttpRequest http_request(&libcurl);
-  TF_EXPECT_OK(http_request.Init());
 
   std::vector<char> scratch;
   scratch.insert(scratch.end(), kTestContent.begin(), kTestContent.end());
 
-  TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
-  TF_EXPECT_OK(http_request.AddAuthBearerHeader("fake-bearer"));
-  TF_EXPECT_OK(http_request.SetRange(100, 199));
-  TF_EXPECT_OK(http_request.SetResultBuffer(&scratch));
+  http_request.SetUri("http://www.testuri.com");
+  http_request.AddAuthBearerHeader("fake-bearer");
+  http_request.SetRange(100, 199);
+  http_request.SetResultBuffer(&scratch);
   TF_EXPECT_OK(http_request.Send());
 
   EXPECT_TRUE(scratch.empty());
@@ -335,15 +366,14 @@ TEST(CurlHttpRequestTest, GetRequest_503) {
   FakeLibCurl libcurl("get response", 503);
   libcurl.curl_easy_perform_result_ = CURLE_WRITE_ERROR;
   CurlHttpRequest http_request(&libcurl);
-  TF_EXPECT_OK(http_request.Init());
 
   std::vector<char> scratch;
   scratch.insert(scratch.end(), kTestContent.begin(), kTestContent.end());
 
-  TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
-  TF_EXPECT_OK(http_request.AddAuthBearerHeader("fake-bearer"));
-  TF_EXPECT_OK(http_request.SetRange(100, 199));
-  TF_EXPECT_OK(http_request.SetResultBuffer(&scratch));
+  http_request.SetUri("http://www.testuri.com");
+  http_request.AddAuthBearerHeader("fake-bearer");
+  http_request.SetRange(100, 199);
+  http_request.SetResultBuffer(&scratch);
   const auto& status = http_request.Send();
   EXPECT_EQ(error::UNAVAILABLE, status.code());
   EXPECT_EQ(
@@ -358,12 +388,11 @@ TEST(CurlHttpRequestTest, GetRequest_HttpCode0) {
   libcurl.curl_easy_perform_result_ = CURLE_OPERATION_TIMEDOUT;
   libcurl.curl_easy_perform_error_message_ = "Operation timed out";
   CurlHttpRequest http_request(&libcurl);
-  TF_EXPECT_OK(http_request.Init());
 
   std::vector<char> scratch;
   scratch.insert(scratch.end(), kTestContent.begin(), kTestContent.end());
 
-  TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
+  http_request.SetUri("http://www.testuri.com");
   const auto& status = http_request.Send();
   EXPECT_EQ(error::UNAVAILABLE, status.code());
   EXPECT_EQ(
@@ -378,9 +407,8 @@ TEST(CurlHttpRequestTest, ResponseHeaders) {
       "get response", 200,
       {"Location: abcd", "Content-Type: text", "unparsable header"});
   CurlHttpRequest http_request(&libcurl);
-  TF_EXPECT_OK(http_request.Init());
 
-  TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
+  http_request.SetUri("http://www.testuri.com");
   TF_EXPECT_OK(http_request.Send());
 
   EXPECT_EQ("abcd", http_request.GetResponseHeader("Location"));
@@ -391,15 +419,14 @@ TEST(CurlHttpRequestTest, ResponseHeaders) {
 TEST(CurlHttpRequestTest, PutRequest_WithBody_FromFile) {
   FakeLibCurl libcurl("", 200);
   CurlHttpRequest http_request(&libcurl);
-  TF_EXPECT_OK(http_request.Init());
 
   auto content_filename = io::JoinPath(testing::TmpDir(), "content");
   std::ofstream content(content_filename, std::ofstream::binary);
   content << "post body content";
   content.close();
 
-  TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
-  TF_EXPECT_OK(http_request.AddAuthBearerHeader("fake-bearer"));
+  http_request.SetUri("http://www.testuri.com");
+  http_request.AddAuthBearerHeader("fake-bearer");
   TF_EXPECT_OK(http_request.SetPutFromFile(content_filename, 0));
   TF_EXPECT_OK(http_request.Send());
 
@@ -419,15 +446,14 @@ TEST(CurlHttpRequestTest, PutRequest_WithBody_FromFile) {
 TEST(CurlHttpRequestTest, PutRequest_WithBody_FromFile_NonZeroOffset) {
   FakeLibCurl libcurl("", 200);
   CurlHttpRequest http_request(&libcurl);
-  TF_EXPECT_OK(http_request.Init());
 
   auto content_filename = io::JoinPath(testing::TmpDir(), "content");
   std::ofstream content(content_filename, std::ofstream::binary);
   content << "post body content";
   content.close();
 
-  TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
-  TF_EXPECT_OK(http_request.AddAuthBearerHeader("fake-bearer"));
+  http_request.SetUri("http://www.testuri.com");
+  http_request.AddAuthBearerHeader("fake-bearer");
   TF_EXPECT_OK(http_request.SetPutFromFile(content_filename, 7));
   TF_EXPECT_OK(http_request.Send());
 
@@ -440,11 +466,10 @@ TEST(CurlHttpRequestTest, PutRequest_WithBody_FromFile_NonZeroOffset) {
 TEST(CurlHttpRequestTest, PutRequest_WithoutBody) {
   FakeLibCurl libcurl("", 200);
   CurlHttpRequest http_request(&libcurl);
-  TF_EXPECT_OK(http_request.Init());
 
-  TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
-  TF_EXPECT_OK(http_request.AddAuthBearerHeader("fake-bearer"));
-  TF_EXPECT_OK(http_request.SetPutEmptyBody());
+  http_request.SetUri("http://www.testuri.com");
+  http_request.AddAuthBearerHeader("fake-bearer");
+  http_request.SetPutEmptyBody();
   TF_EXPECT_OK(http_request.Send());
 
   // Check interactions with libcurl.
@@ -461,13 +486,12 @@ TEST(CurlHttpRequestTest, PutRequest_WithoutBody) {
 TEST(CurlHttpRequestTest, PostRequest_WithBody_FromMemory) {
   FakeLibCurl libcurl("", 200);
   CurlHttpRequest http_request(&libcurl);
-  TF_EXPECT_OK(http_request.Init());
 
   string content = "post body content";
 
-  TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
-  TF_EXPECT_OK(http_request.AddAuthBearerHeader("fake-bearer"));
-  TF_EXPECT_OK(http_request.SetPostFromBuffer(content.c_str(), content.size()));
+  http_request.SetUri("http://www.testuri.com");
+  http_request.AddAuthBearerHeader("fake-bearer");
+  http_request.SetPostFromBuffer(content.c_str(), content.size());
   TF_EXPECT_OK(http_request.Send());
 
   // Check interactions with libcurl.
@@ -484,11 +508,9 @@ TEST(CurlHttpRequestTest, PostRequest_WithBody_FromMemory) {
 TEST(CurlHttpRequestTest, PostRequest_WithoutBody) {
   FakeLibCurl libcurl("", 200);
   CurlHttpRequest http_request(&libcurl);
-  TF_EXPECT_OK(http_request.Init());
-
-  TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
-  TF_EXPECT_OK(http_request.AddAuthBearerHeader("fake-bearer"));
-  TF_EXPECT_OK(http_request.SetPostEmptyBody());
+  http_request.SetUri("http://www.testuri.com");
+  http_request.AddAuthBearerHeader("fake-bearer");
+  http_request.SetPostEmptyBody();
   TF_EXPECT_OK(http_request.Send());
 
   // Check interactions with libcurl.
@@ -505,11 +527,9 @@ TEST(CurlHttpRequestTest, PostRequest_WithoutBody) {
 TEST(CurlHttpRequestTest, DeleteRequest) {
   FakeLibCurl libcurl("", 200);
   CurlHttpRequest http_request(&libcurl);
-  TF_EXPECT_OK(http_request.Init());
-
-  TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
-  TF_EXPECT_OK(http_request.AddAuthBearerHeader("fake-bearer"));
-  TF_EXPECT_OK(http_request.SetDeleteRequest());
+  http_request.SetUri("http://www.testuri.com");
+  http_request.AddAuthBearerHeader("fake-bearer");
+  http_request.SetDeleteRequest();
   TF_EXPECT_OK(http_request.Send());
 
   // Check interactions with libcurl.
@@ -524,65 +544,37 @@ TEST(CurlHttpRequestTest, DeleteRequest) {
 TEST(CurlHttpRequestTest, WrongSequenceOfCalls_NoUri) {
   FakeLibCurl libcurl("", 200);
   CurlHttpRequest http_request(&libcurl);
-  TF_EXPECT_OK(http_request.Init());
-
-  auto s = http_request.Send();
-  ASSERT_TRUE(errors::IsFailedPrecondition(s));
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("URI has not been set"));
+  ASSERT_DEATH((void)http_request.Send(), "URI has not been set");
 }
 
 TEST(CurlHttpRequestTest, WrongSequenceOfCalls_TwoSends) {
   FakeLibCurl libcurl("", 200);
   CurlHttpRequest http_request(&libcurl);
-  TF_EXPECT_OK(http_request.Init());
-
-  TF_EXPECT_OK(http_request.SetUri("http://www.google.com"));
+  http_request.SetUri("http://www.google.com");
   TF_EXPECT_OK(http_request.Send());
-  auto s = http_request.Send();
-  ASSERT_TRUE(errors::IsFailedPrecondition(s));
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("The request has already been sent"));
+  ASSERT_DEATH((void)http_request.Send(), "The request has already been sent");
 }
 
 TEST(CurlHttpRequestTest, WrongSequenceOfCalls_ReusingAfterSend) {
   FakeLibCurl libcurl("", 200);
   CurlHttpRequest http_request(&libcurl);
-  TF_EXPECT_OK(http_request.Init());
-
-  TF_EXPECT_OK(http_request.SetUri("http://www.google.com"));
+  http_request.SetUri("http://www.google.com");
   TF_EXPECT_OK(http_request.Send());
-  auto s = http_request.SetUri("http://mail.google.com");
-  ASSERT_TRUE(errors::IsFailedPrecondition(s));
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("The request has already been sent"));
+  ASSERT_DEATH(http_request.SetUri("http://mail.google.com"),
+               "The request has already been sent");
 }
 
 TEST(CurlHttpRequestTest, WrongSequenceOfCalls_SettingMethodTwice) {
   FakeLibCurl libcurl("", 200);
   CurlHttpRequest http_request(&libcurl);
-  TF_EXPECT_OK(http_request.Init());
-
-  TF_EXPECT_OK(http_request.SetDeleteRequest());
-  auto s = http_request.SetPostEmptyBody();
-  ASSERT_TRUE(errors::IsFailedPrecondition(s));
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("HTTP method has been already set"));
-}
-
-TEST(CurlHttpRequestTest, WrongSequenceOfCalls_NotInitialized) {
-  FakeLibCurl libcurl("", 200);
-  CurlHttpRequest http_request(&libcurl);
-
-  auto s = http_request.SetPostEmptyBody();
-  ASSERT_TRUE(errors::IsFailedPrecondition(s));
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("The object has not been initialized"));
+  http_request.SetDeleteRequest();
+  ASSERT_DEATH(http_request.SetPostEmptyBody(),
+               "HTTP method has been already set");
 }
 
 TEST(CurlHttpRequestTest, EscapeString) {
   FakeLibCurl libcurl("get response", 200);
   CurlHttpRequest http_request(&libcurl);
-  TF_EXPECT_OK(http_request.Init());
   const string test_string = "a/b/c";
   EXPECT_EQ("a%2Fb%2Fc", http_request.EscapeString(test_string));
 }
@@ -590,17 +582,15 @@ TEST(CurlHttpRequestTest, EscapeString) {
 TEST(CurlHttpRequestTest, ErrorReturnsNoResponse) {
   FakeLibCurl libcurl("get response", 500);
   CurlHttpRequest http_request(&libcurl);
-  TF_EXPECT_OK(http_request.Init());
 
   std::vector<char> scratch;
   scratch.insert(scratch.begin(), kTestContent.begin(), kTestContent.end());
-  StringPiece result;
   scratch.reserve(100);
 
-  TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
-  TF_EXPECT_OK(http_request.AddAuthBearerHeader("fake-bearer"));
-  TF_EXPECT_OK(http_request.SetRange(100, 199));
-  TF_EXPECT_OK(http_request.SetResultBuffer(&scratch));
+  http_request.SetUri("http://www.testuri.com");
+  http_request.AddAuthBearerHeader("fake-bearer");
+  http_request.SetRange(100, 199);
+  http_request.SetResultBuffer(&scratch);
   EXPECT_EQ(error::UNAVAILABLE, http_request.Send().code());
 
   EXPECT_EQ("", string(scratch.begin(), scratch.end()));
@@ -618,8 +608,7 @@ TEST(CurlHttpRequestTest, ProgressIsOk) {
       },
       &env);
   CurlHttpRequest http_request(&libcurl, &env);
-  TF_EXPECT_OK(http_request.Init());
-  TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
+  http_request.SetUri("http://www.testuri.com");
   TF_EXPECT_OK(http_request.Send());
 }
 
@@ -635,8 +624,7 @@ TEST(CurlHttpRequestTest, ProgressIsStuck) {
       },
       &env);
   CurlHttpRequest http_request(&libcurl, &env);
-  TF_EXPECT_OK(http_request.Init());
-  TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
+  http_request.SetUri("http://www.testuri.com");
   auto status = http_request.Send();
   EXPECT_EQ(error::UNAVAILABLE, status.code());
   EXPECT_EQ(
diff --git a/tensorflow/core/platform/cloud/expiring_lru_cache.h b/tensorflow/core/platform/cloud/expiring_lru_cache.h
index 3fc23a4306eb96e85099bd63c9c83c6663fe7e3c..c738497ddd533b5b9a8339e51a21ac204acf68b5 100644
--- a/tensorflow/core/platform/cloud/expiring_lru_cache.h
+++ b/tensorflow/core/platform/cloud/expiring_lru_cache.h
@@ -88,6 +88,13 @@ class ExpiringLRUCache {
     return s;
   }
 
+  /// Clear the cache.
+  void Clear() {
+    mutex_lock lock(mu_);
+    cache_.clear();
+    lru_list_.clear();
+  }
+
   /// Accessors for cache parameters.
   uint64 max_age() const { return max_age_; }
   size_t max_entries() const { return max_entries_; }
diff --git a/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc b/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc
index 8f8d5744a4576991c0056bfefeb30c4bc58549e0..3bc6db38429155ca61732b44da3815422b480c92 100644
--- a/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc
+++ b/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc
@@ -152,5 +152,27 @@ TEST(ExpiringLRUCacheTest, LookupOrCompute) {
   EXPECT_EQ(num_compute_calls, 6);
 }
 
+TEST(ExpiringLRUCacheTest, Clear) {
+  ExpiringLRUCache<int> cache(1, 4);
+  cache.Insert("a", 1);
+  cache.Insert("b", 2);
+  cache.Insert("c", 3);
+  cache.Insert("d", 4);
+  int value = 0;
+  EXPECT_TRUE(cache.Lookup("a", &value));
+  EXPECT_EQ(value, 1);
+  EXPECT_TRUE(cache.Lookup("b", &value));
+  EXPECT_EQ(value, 2);
+  EXPECT_TRUE(cache.Lookup("c", &value));
+  EXPECT_EQ(value, 3);
+  EXPECT_TRUE(cache.Lookup("d", &value));
+  EXPECT_EQ(value, 4);
+  cache.Clear();
+  EXPECT_FALSE(cache.Lookup("a", &value));
+  EXPECT_FALSE(cache.Lookup("b", &value));
+  EXPECT_FALSE(cache.Lookup("c", &value));
+  EXPECT_FALSE(cache.Lookup("d", &value));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/file_block_cache.cc b/tensorflow/core/platform/cloud/file_block_cache.cc
index a472ae52fcde114aa5660ee2f6fc2e9323b2ad68..6add1142a15fb69044828bd82a6d6e838959de08 100644
--- a/tensorflow/core/platform/cloud/file_block_cache.cc
+++ b/tensorflow/core/platform/cloud/file_block_cache.cc
@@ -123,10 +123,15 @@ Status FileBlockCache::MaybeFetch(const Key& key,
       case FetchState::CREATED:
         block->state = FetchState::FETCHING;
         block->mu.unlock();  // Release the lock while making the API call.
-        status.Update(
-            block_fetcher_(key.first, key.second, block_size_, &block->data));
+        block->data.clear();
+        block->data.resize(block_size_, 0);
+        size_t bytes_transferred;
+        status.Update(block_fetcher_(key.first, key.second, block_size_,
+                                     block->data.data(), &bytes_transferred));
         block->mu.lock();  // Reacquire the lock immediately afterwards
         if (status.ok()) {
+          block->data.resize(bytes_transferred, 0);
+          block->data.shrink_to_fit();
           downloaded_block = true;
           block->state = FetchState::FINISHED;
         } else {
@@ -150,15 +155,15 @@ Status FileBlockCache::MaybeFetch(const Key& key,
 }
 
 Status FileBlockCache::Read(const string& filename, size_t offset, size_t n,
-                            std::vector<char>* out) {
-  out->clear();
+                            char* buffer, size_t* bytes_transferred) {
+  *bytes_transferred = 0;
   if (n == 0) {
     return Status::OK();
   }
   if (block_size_ == 0 || max_bytes_ == 0) {
     // The cache is effectively disabled, so we pass the read through to the
     // fetcher without breaking it up into blocks.
-    return block_fetcher_(filename, offset, n, out);
+    return block_fetcher_(filename, offset, n, buffer, bytes_transferred);
   }
   // Calculate the block-aligned start and end of the read.
   size_t start = block_size_ * (offset / block_size_);
@@ -166,6 +171,7 @@ Status FileBlockCache::Read(const string& filename, size_t offset, size_t n,
   if (finish < offset + n) {
     finish += block_size_;
   }
+  size_t total_bytes_transferred = 0;
   // Now iterate through the blocks, reading them one at a time.
   for (size_t pos = start; pos < finish; pos += block_size_) {
     Key key = std::make_pair(filename, pos);
@@ -181,7 +187,10 @@ Status FileBlockCache::Read(const string& filename, size_t offset, size_t n,
       // The requested offset is at or beyond the end of the file. This can
       // happen if `offset` is not block-aligned, and the read returns the last
       // block in the file, which does not extend all the way out to `offset`.
-      return errors::OutOfRange("EOF at offset ", offset);
+      *bytes_transferred = total_bytes_transferred;
+      return errors::OutOfRange("EOF at offset ", offset, " in file ", filename,
+                                " at position ", pos, "with data size ",
+                                data.size());
     }
     auto begin = data.begin();
     if (offset > pos) {
@@ -194,13 +203,16 @@ Status FileBlockCache::Read(const string& filename, size_t offset, size_t n,
       end -= (pos + data.size()) - (offset + n);
     }
     if (begin < end) {
-      out->insert(out->end(), begin, end);
+      size_t bytes_to_copy = end - begin;
+      memcpy(&buffer[total_bytes_transferred], &*begin, bytes_to_copy);
+      total_bytes_transferred += bytes_to_copy;
     }
     if (data.size() < block_size_) {
       // The block was a partial block and thus signals EOF at its upper bound.
       break;
     }
   }
+  *bytes_transferred = total_bytes_transferred;
   return Status::OK();
 }
 
@@ -226,6 +238,14 @@ void FileBlockCache::Prune() {
   }
 }
 
+void FileBlockCache::Flush() {
+  mutex_lock lock(mu_);
+  block_map_.clear();
+  lru_list_.clear();
+  lra_list_.clear();
+  cache_size_ = 0;
+}
+
 void FileBlockCache::RemoveFile(const string& filename) {
   mutex_lock lock(mu_);
   RemoveFile_Locked(filename);
diff --git a/tensorflow/core/platform/cloud/file_block_cache.h b/tensorflow/core/platform/cloud/file_block_cache.h
index 36dbf9db83238fa05e3b010c2a73cb823623f54b..5c180e2332042af3ae938c2685ac416952b00187 100644
--- a/tensorflow/core/platform/cloud/file_block_cache.h
+++ b/tensorflow/core/platform/cloud/file_block_cache.h
@@ -43,8 +43,9 @@ class FileBlockCache {
   /// cache is constructed. The returned Status should be OK as long as the
   /// read from the remote filesystem succeeded (similar to the semantics of the
   /// read(2) system call).
-  typedef std::function<Status(const string&, size_t, size_t,
-                               std::vector<char>*)>
+  typedef std::function<Status(const string& filename, size_t offset,
+                               size_t buffer_size, char* buffer,
+                               size_t* bytes_transferred)>
       BlockFetcher;
 
   FileBlockCache(size_t block_size, size_t max_bytes, uint64 max_staleness,
@@ -83,12 +84,15 @@ class FileBlockCache {
   ///    placed in `out`.
   /// 4) OK otherwise (i.e. the read succeeded, and at least one byte was placed
   ///    in `out`).
-  Status Read(const string& filename, size_t offset, size_t n,
-              std::vector<char>* out);
+  Status Read(const string& filename, size_t offset, size_t n, char* buffer,
+              size_t* bytes_transferred);
 
   /// Remove all cached blocks for `filename`.
   void RemoveFile(const string& filename) LOCKS_EXCLUDED(mu_);
 
+  /// Remove all cached data.
+  void Flush() LOCKS_EXCLUDED(mu_);
+
   /// Accessors for cache parameters.
   size_t block_size() const { return block_size_; }
   size_t max_bytes() const { return max_bytes_; }
diff --git a/tensorflow/core/platform/cloud/file_block_cache_test.cc b/tensorflow/core/platform/cloud/file_block_cache_test.cc
index 081b32af64636105925240da70bf050cdec2c4b9..596fdbf19eb03a70c5659d392db368b3cdb791fe 100644
--- a/tensorflow/core/platform/cloud/file_block_cache_test.cc
+++ b/tensorflow/core/platform/cloud/file_block_cache_test.cc
@@ -25,6 +25,18 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+Status ReadCache(FileBlockCache* cache, const string& filename, size_t offset,
+                 size_t n, std::vector<char>* out) {
+  out->clear();
+  out->resize(n, 0);
+  size_t bytes_transferred = 0;
+  Status status =
+      cache->Read(filename, offset, n, out->data(), &bytes_transferred);
+  EXPECT_LE(bytes_transferred, n);
+  out->resize(bytes_transferred, n);
+  return status;
+}
+
 TEST(FileBlockCacheTest, PassThrough) {
   const string want_filename = "foo/bar";
   const size_t want_offset = 42;
@@ -32,12 +44,13 @@ TEST(FileBlockCacheTest, PassThrough) {
   int calls = 0;
   auto fetcher = [&calls, want_filename, want_offset, want_n](
                      const string& got_filename, size_t got_offset,
-                     size_t got_n, std::vector<char>* out) {
+                     size_t got_n, char* buffer, size_t* bytes_transferred) {
     EXPECT_EQ(got_filename, want_filename);
     EXPECT_EQ(got_offset, want_offset);
     EXPECT_EQ(got_n, want_n);
     calls++;
-    out->resize(got_n, 'x');
+    memset(buffer, 'x', got_n);
+    *bytes_transferred = got_n;
     return Status::OK();
   };
   // If block_size, max_bytes, or both are zero, the cache is a pass-through.
@@ -45,11 +58,11 @@ TEST(FileBlockCacheTest, PassThrough) {
   FileBlockCache cache2(0, 1, 0, fetcher);
   FileBlockCache cache3(0, 0, 0, fetcher);
   std::vector<char> out;
-  TF_EXPECT_OK(cache1.Read(want_filename, want_offset, want_n, &out));
+  TF_EXPECT_OK(ReadCache(&cache1, want_filename, want_offset, want_n, &out));
   EXPECT_EQ(calls, 1);
-  TF_EXPECT_OK(cache2.Read(want_filename, want_offset, want_n, &out));
+  TF_EXPECT_OK(ReadCache(&cache2, want_filename, want_offset, want_n, &out));
   EXPECT_EQ(calls, 2);
-  TF_EXPECT_OK(cache3.Read(want_filename, want_offset, want_n, &out));
+  TF_EXPECT_OK(ReadCache(&cache3, want_filename, want_offset, want_n, &out));
   EXPECT_EQ(calls, 3);
 }
 
@@ -63,13 +76,13 @@ TEST(FileBlockCacheTest, BlockAlignment) {
   }
   // The fetcher just fetches slices of the buffer.
   auto fetcher = [&buf](const string& filename, size_t offset, size_t n,
-                        std::vector<char>* out) {
+                        char* buffer, size_t* bytes_transferred) {
     if (offset < buf.size()) {
-      if (offset + n > buf.size()) {
-        out->insert(out->end(), buf.begin() + offset, buf.end());
-      } else {
-        out->insert(out->end(), buf.begin() + offset, buf.begin() + offset + n);
-      }
+      size_t bytes_to_copy = std::min<size_t>(buf.size() - offset, n);
+      memcpy(buffer, buf.data() + offset, bytes_to_copy);
+      *bytes_transferred = bytes_to_copy;
+    } else {
+      *bytes_transferred = 0;
     }
     return Status::OK();
   };
@@ -80,7 +93,7 @@ TEST(FileBlockCacheTest, BlockAlignment) {
     for (size_t offset = 0; offset < 10; offset++) {
       for (size_t n = block_size - 2; n <= block_size + 2; n++) {
         std::vector<char> got;
-        TF_EXPECT_OK(cache.Read("", offset, n, &got));
+        TF_EXPECT_OK(ReadCache(&cache, "", offset, n, &got));
         // Verify the size of the read.
         if (offset + n <= size) {
           // Expect a full read.
@@ -108,24 +121,27 @@ TEST(FileBlockCacheTest, CacheHits) {
   const size_t block_size = 16;
   std::set<size_t> calls;
   auto fetcher = [&calls, block_size](const string& filename, size_t offset,
-                                      size_t n, std::vector<char>* out) {
+                                      size_t n, char* buffer,
+                                      size_t* bytes_transferred) {
     EXPECT_EQ(n, block_size);
     EXPECT_EQ(offset % block_size, 0);
     EXPECT_EQ(calls.find(offset), calls.end()) << "at offset " << offset;
     calls.insert(offset);
-    out->resize(n, 'x');
+    memset(buffer, 'x', n);
+    *bytes_transferred = n;
     return Status::OK();
   };
   const uint32 block_count = 256;
   FileBlockCache cache(block_size, block_count * block_size, 0, fetcher);
   std::vector<char> out;
+  out.resize(block_count, 0);
   // The cache has space for `block_count` blocks. The loop with i = 0 should
   // fill the cache, and the loop with i = 1 should be all cache hits. The
   // fetcher checks that it is called once and only once for each offset (to
   // fetch the corresponding block).
   for (int i = 0; i < 2; i++) {
     for (int j = 0; j < block_count; j++) {
-      TF_EXPECT_OK(cache.Read("", block_size * j, block_size, &out));
+      TF_EXPECT_OK(ReadCache(&cache, "", block_size * j, block_size, &out));
     }
   }
 }
@@ -138,36 +154,39 @@ TEST(FileBlockCacheTest, OutOfRange) {
   bool second_block = false;
   auto fetcher = [block_size, file_size, &first_block, &second_block](
                      const string& filename, size_t offset, size_t n,
-                     std::vector<char>* out) {
+                     char* buffer, size_t* bytes_transferred) {
     EXPECT_EQ(n, block_size);
     EXPECT_EQ(offset % block_size, 0);
+    size_t bytes_to_copy = 0;
     if (offset == 0) {
       // The first block (16 bytes) of the file.
-      out->resize(n, 'x');
+      memset(buffer, 'x', n);
+      bytes_to_copy = n;
       first_block = true;
     } else if (offset == block_size) {
       // The second block (8 bytes) of the file.
-      out->resize(file_size - block_size, 'x');
+      bytes_to_copy = file_size - block_size;
+      memset(buffer, 'x', bytes_to_copy);
       second_block = true;
     }
+    *bytes_transferred = bytes_to_copy;
     return Status::OK();
   };
   FileBlockCache cache(block_size, block_size, 0, fetcher);
   std::vector<char> out;
   // Reading the first 16 bytes should be fine.
-  TF_EXPECT_OK(cache.Read("", 0, block_size, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "", 0, block_size, &out));
   EXPECT_TRUE(first_block);
   EXPECT_EQ(out.size(), block_size);
   // Reading at offset file_size + 4 will read the second block (since the read
   // at file_size + 4 = 28 will be aligned to an offset of 16) but will return
   // OutOfRange because the offset is past the end of the 24-byte file.
-  Status status = cache.Read("", file_size + 4, 4, &out);
+  Status status = ReadCache(&cache, "", file_size + 4, 4, &out);
   EXPECT_EQ(status.code(), error::OUT_OF_RANGE);
   EXPECT_TRUE(second_block);
-  EXPECT_EQ(out.size(), 0);
   // Reading the second full block will return 8 bytes, from a cache hit.
   second_block = false;
-  TF_EXPECT_OK(cache.Read("", block_size, block_size, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "", block_size, block_size, &out));
   EXPECT_FALSE(second_block);
   EXPECT_EQ(out.size(), file_size - block_size);
 }
@@ -178,20 +197,22 @@ TEST(FileBlockCacheTest, Inconsistent) {
   const size_t block_size = 16;
   // This fetcher returns OK but only fills in one byte for any offset.
   auto fetcher = [block_size](const string& filename, size_t offset, size_t n,
-                              std::vector<char>* out) {
+                              char* buffer, size_t* bytes_transferred) {
     EXPECT_EQ(n, block_size);
     EXPECT_EQ(offset % block_size, 0);
-    out->resize(1, 'x');
+    EXPECT_GE(n, 1);
+    memset(buffer, 'x', 1);
+    *bytes_transferred = 1;
     return Status::OK();
   };
   FileBlockCache cache(block_size, 2 * block_size, 0, fetcher);
   std::vector<char> out;
   // Read the second block; this should yield an OK status and a single byte.
-  TF_EXPECT_OK(cache.Read("", block_size, block_size, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "", block_size, block_size, &out));
   EXPECT_EQ(out.size(), 1);
   // Now read the first block; this should yield an INTERNAL error because we
   // had already cached a partial block at a later position.
-  Status status = cache.Read("", 0, block_size, &out);
+  Status status = ReadCache(&cache, "", 0, block_size, &out);
   EXPECT_EQ(status.code(), error::INTERNAL);
 }
 
@@ -199,14 +220,16 @@ TEST(FileBlockCacheTest, LRU) {
   const size_t block_size = 16;
   std::list<size_t> calls;
   auto fetcher = [&calls, block_size](const string& filename, size_t offset,
-                                      size_t n, std::vector<char>* out) {
+                                      size_t n, char* buffer,
+                                      size_t* bytes_transferred) {
     EXPECT_EQ(n, block_size);
     EXPECT_FALSE(calls.empty()) << "at offset = " << offset;
     if (!calls.empty()) {
       EXPECT_EQ(offset, calls.front());
       calls.pop_front();
     }
-    out->resize(n, 'x');
+    memset(buffer, 'x', n);
+    *bytes_transferred = n;
     return Status::OK();
   };
   const uint32 block_count = 2;
@@ -216,38 +239,39 @@ TEST(FileBlockCacheTest, LRU) {
   // fetcher calls that the cache makes.
   calls.push_back(0);
   // Cache miss - drains an element from `calls`.
-  TF_EXPECT_OK(cache.Read("", 0, 1, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "", 0, 1, &out));
   // Cache hit - does not drain an element from `calls`.
-  TF_EXPECT_OK(cache.Read("", 0, 1, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "", 0, 1, &out));
   calls.push_back(block_size);
   // Cache miss followed by cache hit.
-  TF_EXPECT_OK(cache.Read("", block_size, 1, &out));
-  TF_EXPECT_OK(cache.Read("", block_size, 1, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "", block_size, 1, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "", block_size, 1, &out));
   calls.push_back(2 * block_size);
   // Cache miss followed by cache hit.  Causes eviction of LRU element.
-  TF_EXPECT_OK(cache.Read("", 2 * block_size, 1, &out));
-  TF_EXPECT_OK(cache.Read("", 2 * block_size, 1, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "", 2 * block_size, 1, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "", 2 * block_size, 1, &out));
   // LRU element was at offset 0.  Cache miss.
   calls.push_back(0);
-  TF_EXPECT_OK(cache.Read("", 0, 1, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "", 0, 1, &out));
   // Element at 2 * block_size is still in cache, and this read should update
   // its position in the LRU list so it doesn't get evicted by the next read.
-  TF_EXPECT_OK(cache.Read("", 2 * block_size, 1, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "", 2 * block_size, 1, &out));
   // Element at block_size was evicted.  Reading this element will also cause
   // the LRU element (at 0) to be evicted.
   calls.push_back(block_size);
-  TF_EXPECT_OK(cache.Read("", block_size, 1, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "", block_size, 1, &out));
   // Element at 0 was evicted again.
   calls.push_back(0);
-  TF_EXPECT_OK(cache.Read("", 0, 1, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "", 0, 1, &out));
 }
 
 TEST(FileBlockCacheTest, MaxStaleness) {
   int calls = 0;
   auto fetcher = [&calls](const string& filename, size_t offset, size_t n,
-                          std::vector<char>* out) {
+                          char* buffer, size_t* bytes_transferred) {
     calls++;
-    out->resize(n, 'x');
+    memset(buffer, 'x', n);
+    *bytes_transferred = n;
     return Status::OK();
   };
   std::vector<char> out;
@@ -256,14 +280,14 @@ TEST(FileBlockCacheTest, MaxStaleness) {
   // expected.
   FileBlockCache cache1(8, 16, 2 /* max staleness */, fetcher, env.get());
   // Execute the first read to load the block.
-  TF_EXPECT_OK(cache1.Read("", 0, 1, &out));
+  TF_EXPECT_OK(ReadCache(&cache1, "", 0, 1, &out));
   EXPECT_EQ(calls, 1);
   // Now advance the clock one second at a time and redo the read. The call
   // count should advance every 3 seconds (i.e. every time the staleness is
   // greater than 2).
   for (int i = 1; i <= 10; i++) {
     env->SetNowSeconds(i + 1);
-    TF_EXPECT_OK(cache1.Read("", 0, 1, &out));
+    TF_EXPECT_OK(ReadCache(&cache1, "", 0, 1, &out));
     EXPECT_EQ(calls, 1 + i / 3);
   }
   // Now create a cache with max staleness of 0, and verify that it also works
@@ -272,27 +296,27 @@ TEST(FileBlockCacheTest, MaxStaleness) {
   env->SetNowSeconds(0);
   FileBlockCache cache2(8, 16, 0 /* max staleness */, fetcher, env.get());
   // Execute the first read to load the block.
-  TF_EXPECT_OK(cache2.Read("", 0, 1, &out));
+  TF_EXPECT_OK(ReadCache(&cache2, "", 0, 1, &out));
   EXPECT_EQ(calls, 1);
   // Advance the clock by a huge amount and verify that the cached block is
   // used to satisfy the read.
   env->SetNowSeconds(365 * 24 * 60 * 60);  // ~1 year, just for fun.
-  TF_EXPECT_OK(cache2.Read("", 0, 1, &out));
+  TF_EXPECT_OK(ReadCache(&cache2, "", 0, 1, &out));
   EXPECT_EQ(calls, 1);
 }
 
 TEST(FileBlockCacheTest, RemoveFile) {
   int calls = 0;
   auto fetcher = [&calls](const string& filename, size_t offset, size_t n,
-                          std::vector<char>* out) {
+                          char* buffer, size_t* bytes_transferred) {
     calls++;
     char c = (filename == "a") ? 'a' : (filename == "b") ? 'b' : 'x';
     if (offset > 0) {
       // The first block is lower case and all subsequent blocks are upper case.
       c = toupper(c);
     }
-    out->clear();
-    out->resize(n, c);
+    memset(buffer, c, n);
+    *bytes_transferred = n;
     return Status::OK();
   };
   // This cache has space for 4 blocks; we'll read from two files.
@@ -304,41 +328,41 @@ TEST(FileBlockCacheTest, RemoveFile) {
   std::vector<char> A(n, 'A');
   std::vector<char> B(n, 'B');
   // Fill the cache.
-  TF_EXPECT_OK(cache.Read("a", 0, n, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "a", 0, n, &out));
   EXPECT_EQ(out, a);
   EXPECT_EQ(calls, 1);
-  TF_EXPECT_OK(cache.Read("a", 8, n, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "a", 8, n, &out));
   EXPECT_EQ(out, A);
   EXPECT_EQ(calls, 2);
-  TF_EXPECT_OK(cache.Read("b", 0, n, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "b", 0, n, &out));
   EXPECT_EQ(out, b);
   EXPECT_EQ(calls, 3);
-  TF_EXPECT_OK(cache.Read("b", 8, n, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "b", 8, n, &out));
   EXPECT_EQ(out, B);
   EXPECT_EQ(calls, 4);
   // All four blocks should be in the cache now.
-  TF_EXPECT_OK(cache.Read("a", 0, n, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "a", 0, n, &out));
   EXPECT_EQ(out, a);
-  TF_EXPECT_OK(cache.Read("a", 8, n, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "a", 8, n, &out));
   EXPECT_EQ(out, A);
-  TF_EXPECT_OK(cache.Read("b", 0, n, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "b", 0, n, &out));
   EXPECT_EQ(out, b);
-  TF_EXPECT_OK(cache.Read("b", 8, n, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "b", 8, n, &out));
   EXPECT_EQ(out, B);
   EXPECT_EQ(calls, 4);
   // Remove the blocks from "a".
   cache.RemoveFile("a");
   // Both blocks from "b" should still be there.
-  TF_EXPECT_OK(cache.Read("b", 0, n, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "b", 0, n, &out));
   EXPECT_EQ(out, b);
-  TF_EXPECT_OK(cache.Read("b", 8, n, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "b", 8, n, &out));
   EXPECT_EQ(out, B);
   EXPECT_EQ(calls, 4);
   // The blocks from "a" should not be there.
-  TF_EXPECT_OK(cache.Read("a", 0, n, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "a", 0, n, &out));
   EXPECT_EQ(out, a);
   EXPECT_EQ(calls, 5);
-  TF_EXPECT_OK(cache.Read("a", 8, n, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "a", 8, n, &out));
   EXPECT_EQ(out, A);
   EXPECT_EQ(calls, 6);
 }
@@ -346,10 +370,10 @@ TEST(FileBlockCacheTest, RemoveFile) {
 TEST(FileBlockCacheTest, Prune) {
   int calls = 0;
   auto fetcher = [&calls](const string& filename, size_t offset, size_t n,
-                          std::vector<char>* out) {
+                          char* buffer, size_t* bytes_transferred) {
     calls++;
-    out->clear();
-    out->resize(n, 'x');
+    memset(buffer, 'x', n);
+    *bytes_transferred = n;
     return Status::OK();
   };
   std::vector<char> out;
@@ -360,20 +384,20 @@ TEST(FileBlockCacheTest, Prune) {
   FileBlockCache cache(8, 32, 1 /* max staleness */, fetcher, env.get());
   // Read three blocks into the cache, and advance the timestamp by one second
   // with each read. Start with a block of "a" at the current timestamp `now`.
-  TF_EXPECT_OK(cache.Read("a", 0, 1, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "a", 0, 1, &out));
   // Now load a block of a different file "b" at timestamp `now` + 1
   env->SetNowSeconds(now + 1);
-  TF_EXPECT_OK(cache.Read("b", 0, 1, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "b", 0, 1, &out));
   // Now load a different block of file "a" at timestamp `now` + 1. When the
   // first block of "a" expires, this block should also be removed because it
   // also belongs to file "a".
-  TF_EXPECT_OK(cache.Read("a", 8, 1, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "a", 8, 1, &out));
   // Ensure that all blocks are in the cache (i.e. reads are cache hits).
   EXPECT_EQ(cache.CacheSize(), 24);
   EXPECT_EQ(calls, 3);
-  TF_EXPECT_OK(cache.Read("a", 0, 1, &out));
-  TF_EXPECT_OK(cache.Read("b", 0, 1, &out));
-  TF_EXPECT_OK(cache.Read("a", 8, 1, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "a", 0, 1, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "b", 0, 1, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "a", 8, 1, &out));
   EXPECT_EQ(calls, 3);
   // Advance the fake timestamp so that "a" becomes stale via its first block.
   env->SetNowSeconds(now + 2);
@@ -389,7 +413,7 @@ TEST(FileBlockCacheTest, Prune) {
   // There should be one block left in the cache, and it should be the first
   // block of "b".
   EXPECT_EQ(cache.CacheSize(), 8);
-  TF_EXPECT_OK(cache.Read("b", 0, 1, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "b", 0, 1, &out));
   EXPECT_EQ(calls, 3);
   // Advance the fake time to `now` + 3, at which point "b" becomes stale.
   env->SetNowSeconds(now + 3);
@@ -409,14 +433,14 @@ TEST(FileBlockCacheTest, ParallelReads) {
   const int callers = 4;
   BlockingCounter counter(callers);
   auto fetcher = [&counter](const string& filename, size_t offset, size_t n,
-                            std::vector<char>* out) {
+                            char* buffer, size_t* bytes_transferred) {
     counter.DecrementCount();
     if (!counter.WaitFor(std::chrono::seconds(10))) {
       // This avoids having the test time out, which is harder to debug.
       return errors::FailedPrecondition("desired concurrency not reached");
     }
-    out->clear();
-    out->resize(n, 'x');
+    memset(buffer, 'x', n);
+    *bytes_transferred = n;
     return Status::OK();
   };
   const int block_size = 8;
@@ -426,7 +450,8 @@ TEST(FileBlockCacheTest, ParallelReads) {
     threads.emplace_back(
         Env::Default()->StartThread({}, "caller", [&cache, i, block_size]() {
           std::vector<char> out;
-          TF_EXPECT_OK(cache.Read("a", i * block_size, block_size, &out));
+          TF_EXPECT_OK(
+              ReadCache(&cache, "a", i * block_size, block_size, &out));
           std::vector<char> x(block_size, 'x');
           EXPECT_EQ(out, x);
         }));
@@ -443,11 +468,12 @@ TEST(FileBlockCacheTest, CoalesceConcurrentReads) {
   Notification notification;
   auto fetcher = [&num_requests, &notification, block_size](
                      const string& filename, size_t offset, size_t n,
-                     std::vector<char>* out) {
+                     char* buffer, size_t* bytes_transferred) {
     EXPECT_EQ(n, block_size);
     EXPECT_EQ(offset, 0);
     num_requests++;
-    out->resize(n, 'x');
+    memset(buffer, 'x', n);
+    *bytes_transferred = n;
     notification.Notify();
     // Wait for other thread to issue read.
     Env::Default()->SleepForMicroseconds(100000);  // 0.1 secs
@@ -456,19 +482,38 @@ TEST(FileBlockCacheTest, CoalesceConcurrentReads) {
   FileBlockCache cache(block_size, block_size, 0, fetcher);
   // Fork off thread for parallel read.
   std::unique_ptr<Thread> concurrent(
-      Env::Default()->StartThread({}, "concurrent", [&cache] {
+      Env::Default()->StartThread({}, "concurrent", [&cache, block_size] {
         std::vector<char> out;
-        TF_EXPECT_OK(cache.Read("", 0, block_size / 2, &out));
+        TF_EXPECT_OK(ReadCache(&cache, "", 0, block_size / 2, &out));
         EXPECT_EQ(out.size(), block_size / 2);
       }));
   EXPECT_TRUE(WaitForNotificationWithTimeout(&notification, 10000))
       << "Timeout waiting for concurrent thread to start.";
   std::vector<char> out;
-  TF_EXPECT_OK(cache.Read("", block_size / 2, block_size / 2, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "", block_size / 2, block_size / 2, &out));
   EXPECT_EQ(out.size(), block_size / 2);
 
   EXPECT_EQ(1, num_requests);
 }
 
+TEST(FileBlockCacheTest, Flush) {
+  int calls = 0;
+  auto fetcher = [&calls](const string& filename, size_t offset, size_t n,
+                          char* buffer, size_t* bytes_transferred) {
+    calls++;
+    memset(buffer, 'x', n);
+    *bytes_transferred = n;
+    return Status::OK();
+  };
+  FileBlockCache cache(16, 32, 0, fetcher);
+  std::vector<char> out;
+  TF_EXPECT_OK(ReadCache(&cache, "", 0, 16, &out));
+  TF_EXPECT_OK(ReadCache(&cache, "", 0, 16, &out));
+  EXPECT_EQ(calls, 1);
+  cache.Flush();
+  TF_EXPECT_OK(ReadCache(&cache, "", 0, 16, &out));
+  EXPECT_EQ(calls, 2);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/gcs_dns_cache.cc b/tensorflow/core/platform/cloud/gcs_dns_cache.cc
index 63f2da065db9c85eaac0f6ae1f64a079440a9eaf..4d9aff4d24f06c7bd1269ad590c9687092a5b132 100644
--- a/tensorflow/core/platform/cloud/gcs_dns_cache.cc
+++ b/tensorflow/core/platform/cloud/gcs_dns_cache.cc
@@ -14,60 +14,86 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/platform/cloud/gcs_dns_cache.h"
-
+#ifndef _WIN32
 #include <arpa/inet.h>
 #include <netdb.h>
+#else
+#include <Windows.h>
+#include <winsock2.h>
+#include <ws2tcpip.h>
+#endif
 #include <sys/types.h>
 
 namespace tensorflow {
 
 namespace {
 
-constexpr char kStorageHost[] = "storage.googleapis.com";
-constexpr char kWwwHost[] = "www.googleapis.com";
+const std::vector<string>& kCachedDomainNames =
+    *new std::vector<string>{"www.googleapis.com", "storage.googleapis.com"};
+
+inline void print_getaddrinfo_error(const string& name, int error_code) {
+#ifndef _WIN32
+  if (error_code == EAI_SYSTEM) {
+    LOG(ERROR) << "Error resolving " << name
+               << " (EAI_SYSTEM): " << strerror(errno);
+  } else {
+    LOG(ERROR) << "Error resolving " << name << ": "
+               << gai_strerror(error_code);
+  }
+#else
+  // TODO:WSAGetLastError is better than gai_strerror
+  LOG(ERROR) << "Error resolving " << name << ": " << gai_strerror(error_code);
+#endif
+}
 
+// Selects one item at random from a vector of items, using a uniform
+// distribution.
+template <typename T>
+const T& SelectRandomItemUniform(std::default_random_engine* random,
+                                 const std::vector<T>& items) {
+  CHECK_GT(items.size(), 0);
+  std::uniform_int_distribution<size_t> distribution(0u, items.size() - 1u);
+  size_t choice_index = distribution(*random);
+  return items[choice_index];
+}
 }  // namespace
 
 GcsDnsCache::GcsDnsCache(Env* env, int64 refresh_rate_secs)
     : env_(env), refresh_rate_secs_(refresh_rate_secs) {}
 
-Status GcsDnsCache::AnnotateRequest(HttpRequest* request) {
+void GcsDnsCache::AnnotateRequest(HttpRequest* request) {
   // TODO(saeta): Blacklist failing IP addresses.
   mutex_lock l(mu_);
   if (!started_) {
+    VLOG(1) << "Starting GCS DNS cache.";
     DCHECK(!worker_) << "Worker thread already exists!";
     // Perform DNS resolutions to warm the cache.
-    std::vector<string> www_addresses = ResolveName(kWwwHost);
-    std::vector<string> storage_addresses = ResolveName(kStorageHost);
-    www_addresses.swap(www_addresses_);
-    storage_addresses.swap(storage_addresses_);
+    addresses_ = ResolveNames(kCachedDomainNames);
 
     // Note: we opt to use a thread instead of a delayed closure.
     worker_.reset(env_->StartThread(
         {}, "gcs_dns_worker", std::bind(&GcsDnsCache::WorkerThread, this)));
     started_ = true;
   }
-  if (!storage_addresses_.empty()) {
-    std::uniform_int_distribution<> storage_dist(0,
-                                                 storage_addresses_.size() - 1);
-    size_t index = storage_dist(random_);
-    TF_RETURN_IF_ERROR(request->AddResolveOverride(kStorageHost, 443,
-                                                   storage_addresses_[index]));
-  } else {
-    LOG(WARNING) << "No IP addresses available for " << kStorageHost;
-  }
-  if (!www_addresses_.empty()) {
-    std::uniform_int_distribution<> www_dist(0, www_addresses_.size() - 1);
-    size_t index = www_dist(random_);
-    TF_RETURN_IF_ERROR(
-        request->AddResolveOverride(kWwwHost, 443, www_addresses_[index]));
-  } else {
-    LOG(WARNING) << "No IP addresses available for " << kWwwHost;
+
+  CHECK_EQ(kCachedDomainNames.size(), addresses_.size());
+  for (size_t i = 0; i < kCachedDomainNames.size(); ++i) {
+    const string& name = kCachedDomainNames[i];
+    const std::vector<string>& addresses = addresses_[i];
+    if (!addresses.empty()) {
+      const string& chosen_address =
+          SelectRandomItemUniform(&random_, addresses);
+      request->AddResolveOverride(name, 443, chosen_address);
+      VLOG(1) << "Annotated DNS mapping: " << name << " --> " << chosen_address;
+    } else {
+      LOG(WARNING) << "No IP addresses available for " << name;
+    }
   }
-  return Status::OK();
 }
 
 /* static */ std::vector<string> GcsDnsCache::ResolveName(const string& name) {
+  VLOG(1) << "Resolving DNS name: " << name;
+
   addrinfo hints;
   memset(&hints, 0, sizeof(hints));
   hints.ai_family = AF_INET;  // Only use IPv4 for now.
@@ -77,7 +103,7 @@ Status GcsDnsCache::AnnotateRequest(HttpRequest* request) {
 
   std::vector<string> output;
   if (return_code == 0) {
-    for (addrinfo* i = result; i != nullptr; i = i->ai_next) {
+    for (const addrinfo* i = result; i != nullptr; i = i->ai_next) {
       if (i->ai_family != AF_INET || i->ai_addr->sa_family != AF_INET) {
         LOG(WARNING) << "Non-IPv4 address returned. ai_family: " << i->ai_family
                      << ". sa_family: " << i->ai_addr->sa_family << ".";
@@ -93,16 +119,11 @@ Status GcsDnsCache::AnnotateRequest(HttpRequest* request) {
                    << ": " << strerror(errno);
       } else {
         output.emplace_back(buf);
+        VLOG(1) << "... address: " << buf;
       }
     }
   } else {
-    if (return_code == EAI_SYSTEM) {
-      LOG(ERROR) << "Error resolving " << name
-                 << " (EAI_SYSTEM): " << strerror(errno);
-    } else {
-      LOG(ERROR) << "Error resolving " << name << ": "
-                 << gai_strerror(return_code);
-    }
+    print_getaddrinfo_error(name, return_code);
   }
   if (result != nullptr) {
     freeaddrinfo(result);
@@ -110,6 +131,25 @@ Status GcsDnsCache::AnnotateRequest(HttpRequest* request) {
   return output;
 }
 
+// Performs DNS resolution for a set of DNS names. The return vector contains
+// one element for each element in 'names', and each element is itself a
+// vector of IP addresses (in textual form).
+//
+// If DNS resolution fails for any name, then that slot in the return vector
+// will still be present, but will be an empty vector.
+//
+// Ensures: names.size() == return_value.size()
+
+std::vector<std::vector<string>> GcsDnsCache::ResolveNames(
+    const std::vector<string>& names) {
+  std::vector<std::vector<string>> all_addresses;
+  all_addresses.reserve(names.size());
+  for (const string& name : names) {
+    all_addresses.push_back(ResolveName(name));
+  }
+  return all_addresses;
+}
+
 void GcsDnsCache::WorkerThread() {
   while (true) {
     {
@@ -119,15 +159,14 @@ void GcsDnsCache::WorkerThread() {
       cond_var_.wait_for(l, std::chrono::seconds(refresh_rate_secs_));
       if (cancelled_) return;
     }
+
     // Resolve DNS values
-    std::vector<string> www_addresses = ResolveName(kWwwHost);
-    std::vector<string> storage_addresses = ResolveName(kStorageHost);
+    auto new_addresses = ResolveNames(kCachedDomainNames);
 
     {
       mutex_lock l(mu_);
       // Update instance variables.
-      www_addresses.swap(www_addresses_);
-      storage_addresses.swap(storage_addresses_);
+      addresses_.swap(new_addresses);
     }
   }
 }
diff --git a/tensorflow/core/platform/cloud/gcs_dns_cache.h b/tensorflow/core/platform/cloud/gcs_dns_cache.h
index 7a4d3847a5ac82b1ced742a20ca18ba84bf6fa7c..40f16f10443a6729477310db44b789d71a0ffd48 100644
--- a/tensorflow/core/platform/cloud/gcs_dns_cache.h
+++ b/tensorflow/core/platform/cloud/gcs_dns_cache.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_PLATNFORM_CLOUD_DNS_CACHE_H_
-#define THIRD_PARTY_TENSORFLOW_PLATNFORM_CLOUD_DNS_CACHE_H_
+#ifndef TENSORFLOW_PLATNFORM_CLOUD_DNS_CACHE_H_
+#define TENSORFLOW_PLATNFORM_CLOUD_DNS_CACHE_H_
 
 #include <random>
 
@@ -48,10 +48,12 @@ class GcsDnsCache {
   }
 
   // Annotate the given HttpRequest with resolve overrides from the cache.
-  Status AnnotateRequest(HttpRequest* request);
+  void AnnotateRequest(HttpRequest* request);
 
  private:
   static std::vector<string> ResolveName(const string& name);
+  static std::vector<std::vector<string>> ResolveNames(
+      const std::vector<string>& names);
   void WorkerThread();
 
   // Define a friend class for testing.
@@ -63,12 +65,13 @@ class GcsDnsCache {
   std::default_random_engine random_ GUARDED_BY(mu_);
   bool started_ GUARDED_BY(mu_) = false;
   bool cancelled_ GUARDED_BY(mu_) = false;
-  std::vector<string> www_addresses_ GUARDED_BY(mu_);
-  std::vector<string> storage_addresses_ GUARDED_BY(mu_);
   std::unique_ptr<Thread> worker_ GUARDED_BY(mu_);  // After mutable vars.
   const int64 refresh_rate_secs_;
+
+  // Entries in this vector correspond to entries in kCachedDomainNames.
+  std::vector<std::vector<string>> addresses_ GUARDED_BY(mu_);
 };
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_PLATNFORM_CLOUD_DNS_CACHE_H_
+#endif  // TENSORFLOW_PLATNFORM_CLOUD_DNS_CACHE_H_
diff --git a/tensorflow/core/platform/cloud/gcs_dns_cache_test.cc b/tensorflow/core/platform/cloud/gcs_dns_cache_test.cc
index 8d1a108f30dd0461a1cd08dd217badbdf24fc400..8be452ff44d03bf3a8a66b99b0e65f98da537d5f 100644
--- a/tensorflow/core/platform/cloud/gcs_dns_cache_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_dns_cache_test.cc
@@ -21,14 +21,11 @@ namespace tensorflow {
 
 class TestHttpRequest : public HttpRequest {
  public:
-  Status Init() override { return Status::OK(); }
-  Status SetUri(const string& uri) override { return Status::OK(); }
-  Status SetRange(uint64 start, uint64 end) override { return Status::OK(); }
-  Status AddHeader(const string& name, const string& value) override {
-    return Status::OK();
-  }
-  Status AddResolveOverride(const string& hostname, int64 port,
-                            const string& ip_addr) override {
+  void SetUri(const string& uri) override {}
+  void SetRange(uint64 start, uint64 end) override {}
+  void AddHeader(const string& name, const string& value) override {}
+  void AddResolveOverride(const string& hostname, int64 port,
+                          const string& ip_addr) override {
     EXPECT_EQ(port, 443) << "Unexpected port set for hostname: " << hostname;
     auto itr = resolve_overrides_.find(hostname);
     EXPECT_EQ(itr, resolve_overrides_.end())
@@ -36,34 +33,30 @@ class TestHttpRequest : public HttpRequest {
 
     resolve_overrides_.insert(
         std::map<string, string>::value_type(hostname, ip_addr));
-    return Status::OK();
   }
 
-  Status AddAuthBearerHeader(const string& auth_token) override {
-    return Status::OK();
-  }
+  void AddAuthBearerHeader(const string& auth_token) override {}
 
-  Status SetDeleteRequest() override { return Status::OK(); }
+  void SetDeleteRequest() override {}
 
   Status SetPutFromFile(const string& body_filepath, size_t offset) override {
     return Status::OK();
   }
-  Status SetPutEmptyBody() override { return Status::OK(); }
-
-  Status SetPostFromBuffer(const char* buffer, size_t size) override {
-    return Status::OK();
-  }
-  Status SetPostEmptyBody() override { return Status::OK(); }
-
-  Status SetResultBuffer(std::vector<char>* out_buffer) override {
-    return Status::OK();
-  }
+  void SetPutEmptyBody() override {}
+  void SetPostFromBuffer(const char* buffer, size_t size) override {}
+  void SetPostEmptyBody() override {}
+  void SetResultBuffer(std::vector<char>* out_buffer) override {}
+  void SetResultBufferDirect(char* buffer, size_t size) override {}
+  size_t GetResultBufferDirectBytesTransferred() override { return 0; }
 
   string GetResponseHeader(const string& name) const override { return ""; }
   uint64 GetResponseCode() const override { return 0; }
   Status Send() override { return Status::OK(); }
   string EscapeString(const string& str) override { return ""; }
 
+  void SetTimeouts(uint32 connection, uint32 inactivity,
+                   uint32 total) override {}
+
   std::map<string, string> resolve_overrides_;
 };
 
@@ -83,13 +76,11 @@ class GcsDnsCacheTest : public ::testing::Test {
     {
       mutex_lock l(d.mu_);
       d.started_ = true;  // Avoid creating a thread.
-      d.www_addresses_ = {"192.168.1.1"};
-      d.storage_addresses_ = {"172.134.1.1"};
+      d.addresses_ = {{"192.168.1.1"}, {"172.134.1.1"}};
     }
 
     TestHttpRequest req;
-    Status s = d.AnnotateRequest(&req);
-    EXPECT_TRUE(s.ok()) << s;
+    d.AnnotateRequest(&req);
     EXPECT_EQ("192.168.1.1", req.resolve_overrides_["www.googleapis.com"]);
     EXPECT_EQ("172.134.1.1", req.resolve_overrides_["storage.googleapis.com"]);
   }
@@ -99,8 +90,7 @@ class GcsDnsCacheTest : public ::testing::Test {
     // a timely manner.
     GcsDnsCache d;
     TestHttpRequest req;
-    Status s = d.AnnotateRequest(&req);
-    EXPECT_TRUE(s.ok()) << s;
+    d.AnnotateRequest(&req);
   }
 };
 
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 54d38fe962bf90e55d8b138e2e734b994b642395..01ca0d76bab2720513775ef33ff8670bd148c241 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -22,6 +22,9 @@ limitations under the License.
 #include <cstring>
 #include <fstream>
 #include <vector>
+#ifdef _WIN32
+#include <io.h>  // for _mktemp
+#endif
 #include "include/json/json.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
@@ -40,8 +43,13 @@ limitations under the License.
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 
-namespace tensorflow {
+#ifdef _WIN32
+#ifdef DeleteFile
+#undef DeleteFile
+#endif
+#endif
 
+namespace tensorflow {
 namespace {
 
 constexpr char kGcsUriBase[] = "https://www.googleapis.com/storage/v1/";
@@ -50,9 +58,6 @@ constexpr char kGcsUploadUriBase[] =
 constexpr char kStorageHost[] = "storage.googleapis.com";
 constexpr size_t kReadAppendableFileBufferSize = 1024 * 1024;  // In bytes.
 constexpr int kGetChildrenDefaultPageSize = 1000;
-// Initial delay before retrying a GCS upload.
-// Subsequent delays can be larger due to exponential back-off.
-constexpr uint64 kUploadRetryDelayMicros = 1000000L;
 // The HTTP response code "308 Resume Incomplete".
 constexpr uint64 HTTP_CODE_RESUME_INCOMPLETE = 308;
 // The environment variable that overrides the size of the readahead buffer.
@@ -94,17 +99,52 @@ const FileStatistics DIRECTORY_STAT(0, 0, true);
 // variable to a positive integer describing the frequency used to refresh the
 // userspace DNS cache.
 constexpr char kResolveCacheSecs[] = "GCS_RESOLVE_REFRESH_SECS";
-
+// The environment variable to configure the http request's connection timeout.
+constexpr char kRequestConnectionTimeout[] =
+    "GCS_REQUEST_CONNECTION_TIMEOUT_SECS";
+// The environment varaible to configure the http request's idle timeout.
+constexpr char kRequestIdleTimeout[] = "GCS_REQUEST_IDLE_TIMEOUT_SECS";
+// The environment variable to configure the overall request timeout for
+// metadata requests.
+constexpr char kMetadataRequestTimeout[] = "GCS_METADATA_REQUEST_TIMEOUT_SECS";
+// The environment variable to configure the overall request timeout for
+// block reads requests.
+constexpr char kReadRequestTimeout[] = "GCS_READ_REQUEST_TIMEOUT_SECS";
+// The environment variable to configure the overall request timeout for
+// upload requests.
+constexpr char kWriteRequestTimeout[] = "GCS_WRITE_REQUEST_TIMEOUT_SECS";
+// The environment variable to configure an additional header to send with
+// all requests to GCS (format HEADERNAME:HEADERCONTENT)
+constexpr char kAdditionalRequestHeader[] = "GCS_ADDITIONAL_REQUEST_HEADER";
+// The environment variable to configure the throttle (format: <int64>)
+constexpr char kThrottleRate[] = "GCS_THROTTLE_TOKEN_RATE";
+// The environment variable to configure the token bucket size (format: <int64>)
+constexpr char kThrottleBucket[] = "GCS_THROTTLE_BUCKET_SIZE";
+// The environment variable that controls the number of tokens per request.
+// (format: <int64>)
+constexpr char kTokensPerRequest[] = "GCS_TOKENS_PER_REQUEST";
+// The environment variable to configure the initial tokens (format: <int64>)
+constexpr char kInitialTokens[] = "GCS_INITIAL_TOKENS";
+
+// TODO: DO NOT use a hardcoded path
 Status GetTmpFilename(string* filename) {
   if (!filename) {
     return errors::Internal("'filename' cannot be nullptr.");
   }
+#ifndef _WIN32
   char buffer[] = "/tmp/gcs_filesystem_XXXXXX";
   int fd = mkstemp(buffer);
   if (fd < 0) {
     return errors::Internal("Failed to create a temporary file.");
   }
   close(fd);
+#else
+  char buffer[] = "/tmp/gcs_filesystem_XXXXXX";
+  char* ret = _mktemp(buffer);
+  if (ret == nullptr) {
+    return errors::Internal("Failed to create a temporary file.");
+  }
+#endif
   *filename = buffer;
   return Status::OK();
 }
@@ -180,17 +220,21 @@ std::set<string> AddAllSubpaths(const std::vector<string>& paths) {
 
 Status ParseJson(StringPiece json, Json::Value* result) {
   Json::Reader reader;
-  if (!reader.parse(json.ToString(), *result)) {
+  if (!reader.parse(json.data(), json.data() + json.size(), *result)) {
     return errors::Internal("Couldn't parse JSON response from GCS.");
   }
   return Status::OK();
 }
 
+Status ParseJson(const std::vector<char>& json, Json::Value* result) {
+  return ParseJson(StringPiece{json.data(), json.size()}, result);
+}
+
 /// Reads a JSON value with the given name from a parent JSON value.
-Status GetValue(const Json::Value& parent, const string& name,
+Status GetValue(const Json::Value& parent, const char* name,
                 Json::Value* result) {
   *result = parent.get(name, Json::Value::null);
-  if (*result == Json::Value::null) {
+  if (result->isNull()) {
     return errors::Internal("The field '", name,
                             "' was expected in the JSON response.");
   }
@@ -198,7 +242,7 @@ Status GetValue(const Json::Value& parent, const string& name,
 }
 
 /// Reads a string JSON value with the given name from a parent JSON value.
-Status GetStringValue(const Json::Value& parent, const string& name,
+Status GetStringValue(const Json::Value& parent, const char* name,
                       string* result) {
   Json::Value result_value;
   TF_RETURN_IF_ERROR(GetValue(parent, name, &result_value));
@@ -212,7 +256,7 @@ Status GetStringValue(const Json::Value& parent, const string& name,
 }
 
 /// Reads a long JSON value with the given name from a parent JSON value.
-Status GetInt64Value(const Json::Value& parent, const string& name,
+Status GetInt64Value(const Json::Value& parent, const char* name,
                      int64* result) {
   Json::Value result_value;
   TF_RETURN_IF_ERROR(GetValue(parent, name, &result_value));
@@ -221,7 +265,7 @@ Status GetInt64Value(const Json::Value& parent, const string& name,
     return Status::OK();
   }
   if (result_value.isString() &&
-      strings::safe_strto64(result_value.asString().c_str(), result)) {
+      strings::safe_strto64(result_value.asCString(), result)) {
     return Status::OK();
   }
   return errors::Internal(
@@ -230,8 +274,7 @@ Status GetInt64Value(const Json::Value& parent, const string& name,
 }
 
 /// Reads a boolean JSON value with the given name from a parent JSON value.
-Status GetBoolValue(const Json::Value& parent, const string& name,
-                    bool* result) {
+Status GetBoolValue(const Json::Value& parent, const char* name, bool* result) {
   Json::Value result_value;
   TF_RETURN_IF_ERROR(GetValue(parent, name, &result_value));
   if (!result_value.isBool()) {
@@ -253,11 +296,11 @@ class GcsRandomAccessFile : public RandomAccessFile {
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
     *result = StringPiece();
-    std::vector<char> out;
-    TF_RETURN_IF_ERROR(file_block_cache_->Read(filename_, offset, n, &out));
-    std::memcpy(scratch, out.data(), std::min(out.size(), n));
-    *result = StringPiece(scratch, std::min(out.size(), n));
-    if (result->size() < n) {
+    size_t bytes_transferred;
+    TF_RETURN_IF_ERROR(file_block_cache_->Read(filename_, offset, n, scratch,
+                                               &bytes_transferred));
+    *result = StringPiece(scratch, bytes_transferred);
+    if (bytes_transferred < n) {
       // This is not an error per se. The RandomAccessFile interface expects
       // that Read returns OutOfRange if fewer bytes were read than requested.
       return errors::OutOfRange("EOF reached, ", result->size(),
@@ -281,17 +324,18 @@ class GcsRandomAccessFile : public RandomAccessFile {
 class GcsWritableFile : public WritableFile {
  public:
   GcsWritableFile(const string& bucket, const string& object,
-                  AuthProvider* auth_provider,
-                  HttpRequest::Factory* http_request_factory,
+                  GcsFileSystem* filesystem,
+                  GcsFileSystem::TimeoutConfig* timeouts,
                   std::function<void()> file_cache_erase,
                   int64 initial_retry_delay_usec)
       : bucket_(bucket),
         object_(object),
-        auth_provider_(auth_provider),
-        http_request_factory_(http_request_factory),
+        filesystem_(filesystem),
+        timeouts_(timeouts),
         file_cache_erase_(std::move(file_cache_erase)),
         sync_needed_(true),
         initial_retry_delay_usec_(initial_retry_delay_usec) {
+    // TODO: to make it safer, outfile_ should be constructed from an FD
     if (GetTmpFilename(&tmp_content_filename_).ok()) {
       outfile_.open(tmp_content_filename_,
                     std::ofstream::binary | std::ofstream::app);
@@ -304,15 +348,14 @@ class GcsWritableFile : public WritableFile {
   /// with the content to be appended. The class takes onwnership of the
   /// specified tmp file and deletes it on close.
   GcsWritableFile(const string& bucket, const string& object,
-                  AuthProvider* auth_provider,
-                  const string& tmp_content_filename,
-                  HttpRequest::Factory* http_request_factory,
+                  GcsFileSystem* filesystem, const string& tmp_content_filename,
+                  GcsFileSystem::TimeoutConfig* timeouts,
                   std::function<void()> file_cache_erase,
                   int64 initial_retry_delay_usec)
       : bucket_(bucket),
         object_(object),
-        auth_provider_(auth_provider),
-        http_request_factory_(http_request_factory),
+        filesystem_(filesystem),
+        timeouts_(timeouts),
         file_cache_erase_(std::move(file_cache_erase)),
         sync_needed_(true),
         initial_retry_delay_usec_(initial_retry_delay_usec) {
@@ -416,7 +459,7 @@ class GcsWritableFile : public WritableFile {
       return errors::Internal("'size' cannot be nullptr");
     }
     const auto tellp = outfile_.tellp();
-    if (tellp == -1) {
+    if (tellp == static_cast<std::streampos>(-1)) {
       return errors::Internal(
           "Could not get the size of the internal temporary file.");
     }
@@ -432,20 +475,18 @@ class GcsWritableFile : public WritableFile {
     uint64 file_size;
     TF_RETURN_IF_ERROR(GetCurrentFileSize(&file_size));
 
-    string auth_token;
-    TF_RETURN_IF_ERROR(AuthProvider::GetToken(auth_provider_, &auth_token));
-
     std::vector<char> output_buffer;
-    std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
-    TF_RETURN_IF_ERROR(request->Init());
-    TF_RETURN_IF_ERROR(request->SetUri(strings::StrCat(
+    std::unique_ptr<HttpRequest> request;
+    TF_RETURN_IF_ERROR(filesystem_->CreateHttpRequest(&request));
+
+    request->SetUri(strings::StrCat(
         kGcsUploadUriBase, "b/", bucket_,
-        "/o?uploadType=resumable&name=", request->EscapeString(object_))));
-    TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
-    TF_RETURN_IF_ERROR(request->AddHeader("X-Upload-Content-Length",
-                                          std::to_string(file_size)));
-    TF_RETURN_IF_ERROR(request->SetPostEmptyBody());
-    TF_RETURN_IF_ERROR(request->SetResultBuffer(&output_buffer));
+        "/o?uploadType=resumable&name=", request->EscapeString(object_)));
+    request->AddHeader("X-Upload-Content-Length", std::to_string(file_size));
+    request->SetPostEmptyBody();
+    request->SetResultBuffer(&output_buffer);
+    request->SetTimeouts(timeouts_->connect, timeouts_->idle,
+                         timeouts_->metadata);
     TF_RETURN_WITH_CONTEXT_IF_ERROR(
         request->Send(), " when initiating an upload to ", GetGcsPath());
     *session_uri = request->GetResponseHeader("Location");
@@ -470,16 +511,13 @@ class GcsWritableFile : public WritableFile {
     uint64 file_size;
     TF_RETURN_IF_ERROR(GetCurrentFileSize(&file_size));
 
-    string auth_token;
-    TF_RETURN_IF_ERROR(AuthProvider::GetToken(auth_provider_, &auth_token));
-
-    std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
-    TF_RETURN_IF_ERROR(request->Init());
-    TF_RETURN_IF_ERROR(request->SetUri(session_uri));
-    TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
-    TF_RETURN_IF_ERROR(request->AddHeader(
-        "Content-Range", strings::StrCat("bytes */", file_size)));
-    TF_RETURN_IF_ERROR(request->SetPutEmptyBody());
+    std::unique_ptr<HttpRequest> request;
+    TF_RETURN_IF_ERROR(filesystem_->CreateHttpRequest(&request));
+    request->SetUri(session_uri);
+    request->SetTimeouts(timeouts_->connect, timeouts_->idle,
+                         timeouts_->metadata);
+    request->AddHeader("Content-Range", strings::StrCat("bytes */", file_size));
+    request->SetPutEmptyBody();
     const Status& status = request->Send();
     if (status.ok()) {
       *completed = true;
@@ -519,18 +557,16 @@ class GcsWritableFile : public WritableFile {
     uint64 file_size;
     TF_RETURN_IF_ERROR(GetCurrentFileSize(&file_size));
 
-    string auth_token;
-    TF_RETURN_IF_ERROR(AuthProvider::GetToken(auth_provider_, &auth_token));
-
-    std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
-    TF_RETURN_IF_ERROR(request->Init());
-    TF_RETURN_IF_ERROR(request->SetUri(session_uri));
-    TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
+    std::unique_ptr<HttpRequest> request;
+    TF_RETURN_IF_ERROR(filesystem_->CreateHttpRequest(&request));
+    request->SetUri(session_uri);
     if (file_size > 0) {
-      TF_RETURN_IF_ERROR(request->AddHeader(
-          "Content-Range", strings::StrCat("bytes ", start_offset, "-",
-                                           file_size - 1, "/", file_size)));
+      request->AddHeader("Content-Range",
+                         strings::StrCat("bytes ", start_offset, "-",
+                                         file_size - 1, "/", file_size));
     }
+    request->SetTimeouts(timeouts_->connect, timeouts_->idle, timeouts_->write);
+
     TF_RETURN_IF_ERROR(
         request->SetPutFromFile(tmp_content_filename_, start_offset));
     TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when uploading ",
@@ -546,10 +582,10 @@ class GcsWritableFile : public WritableFile {
 
   string bucket_;
   string object_;
-  AuthProvider* auth_provider_;
+  GcsFileSystem* const filesystem_;  // Not owned.
   string tmp_content_filename_;
   std::ofstream outfile_;
-  HttpRequest::Factory* http_request_factory_;
+  GcsFileSystem::TimeoutConfig* timeouts_;
   std::function<void()> file_cache_erase_;
   bool sync_needed_;  // whether there is buffered data that needs to be synced
   int64 initial_retry_delay_usec_;
@@ -579,6 +615,11 @@ bool GetEnvVar(const char* varname, bool (*convert)(StringPiece, T*),
   return convert(env_value, value);
 }
 
+bool StringPieceIdentity(StringPiece str, StringPiece* value) {
+  *value = str;
+  return true;
+}
+
 }  // namespace
 
 GcsFileSystem::GcsFileSystem()
@@ -634,6 +675,80 @@ GcsFileSystem::GcsFileSystem()
   if (GetEnvVar(kResolveCacheSecs, strings::safe_strto64,
                 &resolve_frequency_secs)) {
     dns_cache_.reset(new GcsDnsCache(resolve_frequency_secs));
+    VLOG(1) << "GCS DNS cache is enabled.  " << kResolveCacheSecs << " = "
+            << resolve_frequency_secs;
+  } else {
+    VLOG(1) << "GCS DNS cache is disabled, because " << kResolveCacheSecs
+            << " = 0 (or is not set)";
+  }
+
+  // Get the additional header
+  StringPiece add_header_contents;
+  if (GetEnvVar(kAdditionalRequestHeader, StringPieceIdentity,
+                &add_header_contents)) {
+    size_t split = add_header_contents.find(':', 0);
+
+    if (split != StringPiece::npos) {
+      StringPiece header_name = add_header_contents.substr(0, split);
+      StringPiece header_value = add_header_contents.substr(split + 1);
+
+      if (!header_name.empty() && !header_value.empty()) {
+        additional_header_.reset(new std::pair<const string, const string>(
+            header_name.ToString(), header_value.ToString()));
+
+        VLOG(1) << "GCS additional header ENABLED. "
+                << "Name: " << additional_header_->first << ", "
+                << "Value: " << additional_header_->second;
+      } else {
+        LOG(ERROR) << "GCS additional header DISABLED. Invalid contents: "
+                   << add_header_contents;
+      }
+    } else {
+      LOG(ERROR) << "GCS additional header DISABLED. Invalid contents: "
+                 << add_header_contents;
+    }
+  } else {
+    VLOG(1) << "GCS additional header DISABLED. No environment variable set.";
+  }
+
+  // Apply the overrides for request timeouts
+  uint32 timeout_value;
+  if (GetEnvVar(kRequestConnectionTimeout, strings::safe_strtou32,
+                &timeout_value)) {
+    timeouts_.connect = timeout_value;
+  }
+  if (GetEnvVar(kRequestIdleTimeout, strings::safe_strtou32, &timeout_value)) {
+    timeouts_.idle = timeout_value;
+  }
+  if (GetEnvVar(kMetadataRequestTimeout, strings::safe_strtou32,
+                &timeout_value)) {
+    timeouts_.metadata = timeout_value;
+  }
+  if (GetEnvVar(kReadRequestTimeout, strings::safe_strtou32, &timeout_value)) {
+    timeouts_.read = timeout_value;
+  }
+  if (GetEnvVar(kWriteRequestTimeout, strings::safe_strtou32, &timeout_value)) {
+    timeouts_.write = timeout_value;
+  }
+
+  int64 token_value;
+  if (GetEnvVar(kThrottleRate, strings::safe_strto64, &token_value)) {
+    GcsThrottleConfig config;
+    config.enabled = true;
+    config.token_rate = token_value;
+
+    if (GetEnvVar(kThrottleBucket, strings::safe_strto64, &token_value)) {
+      config.bucket_size = token_value;
+    }
+
+    if (GetEnvVar(kTokensPerRequest, strings::safe_strto64, &token_value)) {
+      config.tokens_per_request = token_value;
+    }
+
+    if (GetEnvVar(kInitialTokens, strings::safe_strto64, &token_value)) {
+      config.initial_tokens = token_value;
+    }
+    throttle_.SetConfig(config);
   }
 }
 
@@ -643,7 +758,9 @@ GcsFileSystem::GcsFileSystem(
     size_t block_size, size_t max_bytes, uint64 max_staleness,
     uint64 stat_cache_max_age, size_t stat_cache_max_entries,
     uint64 matching_paths_cache_max_age,
-    size_t matching_paths_cache_max_entries, int64 initial_retry_delay_usec)
+    size_t matching_paths_cache_max_entries, int64 initial_retry_delay_usec,
+    TimeoutConfig timeouts,
+    std::pair<const string, const string>* additional_header)
     : auth_provider_(std::move(auth_provider)),
       http_request_factory_(std::move(http_request_factory)),
       file_block_cache_(
@@ -651,7 +768,9 @@ GcsFileSystem::GcsFileSystem(
       stat_cache_(new StatCache(stat_cache_max_age, stat_cache_max_entries)),
       matching_paths_cache_(new MatchingPathsCache(
           matching_paths_cache_max_age, matching_paths_cache_max_entries)),
-      initial_retry_delay_usec_(initial_retry_delay_usec) {}
+      timeouts_(timeouts),
+      initial_retry_delay_usec_(initial_retry_delay_usec),
+      additional_header_(additional_header) {}
 
 Status GcsFileSystem::NewRandomAccessFile(
     const string& fname, std::unique_ptr<RandomAccessFile>* result) {
@@ -667,45 +786,53 @@ std::unique_ptr<FileBlockCache> GcsFileSystem::MakeFileBlockCache(
   std::unique_ptr<FileBlockCache> file_block_cache(
       new FileBlockCache(block_size, max_bytes, max_staleness,
                          [this](const string& filename, size_t offset, size_t n,
-                                std::vector<char>* out) {
-                           return LoadBufferFromGCS(filename, offset, n, out);
+                                char* buffer, size_t* bytes_transferred) {
+                           return LoadBufferFromGCS(filename, offset, n, buffer,
+                                                    bytes_transferred);
                          }));
   return file_block_cache;
 }
 
 // A helper function to actually read the data from GCS.
 Status GcsFileSystem::LoadBufferFromGCS(const string& filename, size_t offset,
-                                        size_t n, std::vector<char>* out) {
+                                        size_t n, char* buffer,
+                                        size_t* bytes_transferred) {
+  *bytes_transferred = 0;
+
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseGcsPath(filename, false, &bucket, &object));
-  string auth_token;
-  TF_RETURN_IF_ERROR(AuthProvider::GetToken(auth_provider_.get(), &auth_token));
 
-  std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
-  TF_RETURN_IF_ERROR(request->Init());
-  TF_RETURN_IF_ERROR(
-      request->SetUri(strings::StrCat("https://", kStorageHost, "/", bucket,
-                                      "/", request->EscapeString(object))));
-  TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
-  TF_RETURN_IF_ERROR(request->SetRange(offset, offset + n - 1));
-  TF_RETURN_IF_ERROR(request->SetResultBuffer(out));
+  std::unique_ptr<HttpRequest> request;
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(CreateHttpRequest(&request),
+                                  "when reading gs://", bucket, "/", object);
 
-  if (dns_cache_) {
-    TF_RETURN_IF_ERROR(dns_cache_->AnnotateRequest(request.get()));
-  }
+  request->SetUri(strings::StrCat("https://", kStorageHost, "/", bucket, "/",
+                                  request->EscapeString(object)));
+  request->SetRange(offset, offset + n - 1);
+  request->SetResultBufferDirect(buffer, n);
+  request->SetTimeouts(timeouts_.connect, timeouts_.idle, timeouts_.read);
 
   TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when reading gs://",
                                   bucket, "/", object);
 
-  if (out->size() < block_size()) {
+  size_t bytes_read = request->GetResultBufferDirectBytesTransferred();
+  *bytes_transferred = bytes_read;
+  VLOG(1) << "Successful read of gs://" << bucket << "/" << object << " @ "
+          << offset << " of size: " << bytes_read;
+
+  throttle_.RecordResponse(bytes_read);
+
+  if (bytes_read < block_size()) {
     // Check stat cache to see if we encountered an interrupted read.
     FileStatistics stat;
     if (stat_cache_->Lookup(filename, &stat)) {
-      if (offset + out->size() < stat.length) {
+      if (offset + bytes_read < stat.length) {
         return errors::Internal(strings::Printf(
             "File contents are inconsistent for file: %s @ %lu.",
             filename.c_str(), offset));
       }
+      VLOG(2) << "Successful integrity check for: gs://" << bucket << "/"
+              << object << " @ " << offset;
     }
   }
 
@@ -717,7 +844,7 @@ Status GcsFileSystem::NewWritableFile(const string& fname,
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseGcsPath(fname, false, &bucket, &object));
   result->reset(new GcsWritableFile(
-      bucket, object, auth_provider_.get(), http_request_factory_.get(),
+      bucket, object, this, &timeouts_,
       [this, fname]() { file_block_cache_->RemoveFile(fname); },
       initial_retry_delay_usec_));
   return Status::OK();
@@ -758,8 +885,7 @@ Status GcsFileSystem::NewAppendableFile(const string& fname,
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseGcsPath(fname, false, &bucket, &object));
   result->reset(new GcsWritableFile(
-      bucket, object, auth_provider_.get(), old_content_filename,
-      http_request_factory_.get(),
+      bucket, object, this, old_content_filename, &timeouts_,
       [this, fname]() { file_block_cache_->RemoveFile(fname); },
       initial_retry_delay_usec_));
   return Status::OK();
@@ -833,44 +959,43 @@ Status GcsFileSystem::StatForObject(const string& fname, const string& bucket,
         "'object' must be a non-empty string. (File: %s)", fname.c_str()));
   }
 
-  StatCache::ComputeFunc compute_func =
-      [this, &bucket, &object](const string& fname, FileStatistics* stat) {
-        string auth_token;
-        TF_RETURN_IF_ERROR(
-            AuthProvider::GetToken(auth_provider_.get(), &auth_token));
+  StatCache::ComputeFunc compute_func = [this, &bucket, &object](
+                                            const string& fname,
+                                            FileStatistics* stat) {
+    std::vector<char> output_buffer;
+    std::unique_ptr<HttpRequest> request;
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(CreateHttpRequest(&request),
+                                    " when reading metadata of gs://", bucket,
+                                    "/", object);
 
-        std::vector<char> output_buffer;
-        std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
-        TF_RETURN_IF_ERROR(request->Init());
-        TF_RETURN_IF_ERROR(request->SetUri(strings::StrCat(
-            kGcsUriBase, "b/", bucket, "/o/", request->EscapeString(object),
-            "?fields=size%2Cupdated")));
-        TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
-        TF_RETURN_IF_ERROR(request->SetResultBuffer(&output_buffer));
+    request->SetUri(strings::StrCat(kGcsUriBase, "b/", bucket, "/o/",
+                                    request->EscapeString(object),
+                                    "?fields=size%2Cupdated"));
+    request->SetResultBuffer(&output_buffer);
+    request->SetTimeouts(timeouts_.connect, timeouts_.idle, timeouts_.metadata);
 
-        if (dns_cache_) {
-          TF_RETURN_IF_ERROR(dns_cache_->AnnotateRequest(request.get()));
-        }
-        TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(),
-                                        " when reading metadata of gs://",
-                                        bucket, "/", object);
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(),
+                                    " when reading metadata of gs://", bucket,
+                                    "/", object);
 
-        StringPiece response_piece =
-            StringPiece(output_buffer.data(), output_buffer.size());
-        Json::Value root;
-        TF_RETURN_IF_ERROR(ParseJson(response_piece, &root));
+    Json::Value root;
+    TF_RETURN_IF_ERROR(ParseJson(output_buffer, &root));
 
-        // Parse file size.
-        TF_RETURN_IF_ERROR(GetInt64Value(root, "size", &(stat->length)));
+    // Parse file size.
+    TF_RETURN_IF_ERROR(GetInt64Value(root, "size", &stat->length));
 
-        // Parse file modification time.
-        string updated;
-        TF_RETURN_IF_ERROR(GetStringValue(root, "updated", &updated));
-        TF_RETURN_IF_ERROR(ParseRfc3339Time(updated, &(stat->mtime_nsec)));
+    // Parse file modification time.
+    string updated;
+    TF_RETURN_IF_ERROR(GetStringValue(root, "updated", &updated));
+    TF_RETURN_IF_ERROR(ParseRfc3339Time(updated, &(stat->mtime_nsec)));
 
-        stat->is_directory = false;
-        return Status::OK();
-      };
+    VLOG(1) << "Stat of: gs://" << bucket << "/" << object << " -- "
+            << " length: " << stat->length
+            << "; mtime_nsec: " << stat->mtime_nsec << "; updated: " << updated;
+
+    stat->is_directory = false;
+    return Status::OK();
+  };
 
   TF_RETURN_IF_ERROR(stat_cache_->LookupOrCompute(fname, stat, compute_func));
   if (stat->is_directory) {
@@ -884,14 +1009,11 @@ Status GcsFileSystem::BucketExists(const string& bucket, bool* result) {
   if (!result) {
     return errors::Internal("'result' cannot be nullptr.");
   }
-  string auth_token;
-  TF_RETURN_IF_ERROR(AuthProvider::GetToken(auth_provider_.get(), &auth_token));
 
-  std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
-  TF_RETURN_IF_ERROR(request->Init());
-  TF_RETURN_IF_ERROR(
-      request->SetUri(strings::StrCat(kGcsUriBase, "b/", bucket)));
-  TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
+  std::unique_ptr<HttpRequest> request;
+  TF_RETURN_IF_ERROR(CreateHttpRequest(&request));
+  request->SetUri(strings::StrCat(kGcsUriBase, "b/", bucket));
+  request->SetTimeouts(timeouts_.connect, timeouts_.idle, timeouts_.metadata);
   const Status status = request->Send();
   switch (status.code()) {
     case errors::Code::OK:
@@ -991,13 +1113,9 @@ Status GcsFileSystem::GetChildrenBounded(const string& dirname,
   string nextPageToken;
   uint64 retrieved_results = 0;
   while (true) {  // A loop over multiple result pages.
-    string auth_token;
-    TF_RETURN_IF_ERROR(
-        AuthProvider::GetToken(auth_provider_.get(), &auth_token));
-
     std::vector<char> output_buffer;
-    std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
-    TF_RETURN_IF_ERROR(request->Init());
+    std::unique_ptr<HttpRequest> request;
+    TF_RETURN_IF_ERROR(CreateHttpRequest(&request));
     auto uri = strings::StrCat(kGcsUriBase, "b/", bucket, "/o");
     if (recursive) {
       uri = strings::StrCat(uri, "?fields=items%2Fname%2CnextPageToken");
@@ -1020,21 +1138,15 @@ Status GcsFileSystem::GetChildrenBounded(const string& dirname,
       uri =
           strings::StrCat(uri, "&maxResults=", max_results - retrieved_results);
     }
-    TF_RETURN_IF_ERROR(request->SetUri(uri));
-    TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
-    TF_RETURN_IF_ERROR(request->SetResultBuffer(&output_buffer));
-
-    if (dns_cache_) {
-      TF_RETURN_IF_ERROR(dns_cache_->AnnotateRequest(request.get()));
-    }
+    request->SetUri(uri);
+    request->SetResultBuffer(&output_buffer);
+    request->SetTimeouts(timeouts_.connect, timeouts_.idle, timeouts_.metadata);
 
     TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when reading ", dirname);
     Json::Value root;
-    StringPiece response_piece =
-        StringPiece(output_buffer.data(), output_buffer.size());
-    TF_RETURN_IF_ERROR(ParseJson(response_piece, &root));
+    TF_RETURN_IF_ERROR(ParseJson(output_buffer, &root));
     const auto items = root.get("items", Json::Value::null);
-    if (items != Json::Value::null) {
+    if (!items.isNull()) {
       if (!items.isArray()) {
         return errors::Internal(
             "Expected an array 'items' in the GCS response.");
@@ -1065,7 +1177,7 @@ Status GcsFileSystem::GetChildrenBounded(const string& dirname,
       }
     }
     const auto prefixes = root.get("prefixes", Json::Value::null);
-    if (prefixes != Json::Value::null) {
+    if (!prefixes.isNull()) {
       // Subfolders are returned for the non-recursive mode.
       if (!prefixes.isArray()) {
         return errors::Internal(
@@ -1073,7 +1185,7 @@ Status GcsFileSystem::GetChildrenBounded(const string& dirname,
       }
       for (size_t i = 0; i < prefixes.size(); i++) {
         const auto prefix = prefixes.get(i, Json::Value::null);
-        if (prefix == Json::Value::null || !prefix.isString()) {
+        if (prefix.isNull() || !prefix.isString()) {
           return errors::Internal(
               "'prefixes' was expected to be an array of strings in the GCS "
               "response.");
@@ -1092,7 +1204,7 @@ Status GcsFileSystem::GetChildrenBounded(const string& dirname,
       }
     }
     const auto token = root.get("nextPageToken", Json::Value::null);
-    if (token == Json::Value::null) {
+    if (token.isNull()) {
       return Status::OK();
     }
     if (!token.isString()) {
@@ -1139,15 +1251,13 @@ Status GcsFileSystem::DeleteFile(const string& fname) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseGcsPath(fname, false, &bucket, &object));
 
-  string auth_token;
-  TF_RETURN_IF_ERROR(AuthProvider::GetToken(auth_provider_.get(), &auth_token));
+  std::unique_ptr<HttpRequest> request;
+  TF_RETURN_IF_ERROR(CreateHttpRequest(&request));
+  request->SetUri(strings::StrCat(kGcsUriBase, "b/", bucket, "/o/",
+                                  request->EscapeString(object)));
+  request->SetTimeouts(timeouts_.connect, timeouts_.idle, timeouts_.metadata);
+  request->SetDeleteRequest();
 
-  std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
-  TF_RETURN_IF_ERROR(request->Init());
-  TF_RETURN_IF_ERROR(request->SetUri(strings::StrCat(
-      kGcsUriBase, "b/", bucket, "/o/", request->EscapeString(object))));
-  TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
-  TF_RETURN_IF_ERROR(request->SetDeleteRequest());
   TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when deleting ", fname);
   file_block_cache_->RemoveFile(fname);
   return Status::OK();
@@ -1230,28 +1340,23 @@ Status GcsFileSystem::RenameObject(const string& src, const string& target) {
   TF_RETURN_IF_ERROR(
       ParseGcsPath(target, false, &target_bucket, &target_object));
 
-  string auth_token;
-  TF_RETURN_IF_ERROR(AuthProvider::GetToken(auth_provider_.get(), &auth_token));
-
-  std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
-  TF_RETURN_IF_ERROR(request->Init());
-  TF_RETURN_IF_ERROR(request->SetUri(strings::StrCat(
-      kGcsUriBase, "b/", src_bucket, "/o/", request->EscapeString(src_object),
-      "/rewriteTo/b/", target_bucket, "/o/",
-      request->EscapeString(target_object))));
-  TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
-  TF_RETURN_IF_ERROR(request->SetPostEmptyBody());
+  std::unique_ptr<HttpRequest> request;
+  TF_RETURN_IF_ERROR(CreateHttpRequest(&request));
+  request->SetUri(strings::StrCat(kGcsUriBase, "b/", src_bucket, "/o/",
+                                  request->EscapeString(src_object),
+                                  "/rewriteTo/b/", target_bucket, "/o/",
+                                  request->EscapeString(target_object)));
+  request->SetPostEmptyBody();
+  request->SetTimeouts(timeouts_.connect, timeouts_.idle, timeouts_.metadata);
   std::vector<char> output_buffer;
-  TF_RETURN_IF_ERROR(request->SetResultBuffer(&output_buffer));
+  request->SetResultBuffer(&output_buffer);
   TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when renaming ", src,
                                   " to ", target);
   // Flush the target from the block cache.  The source will be flushed in the
   // DeleteFile call below.
   file_block_cache_->RemoveFile(target);
   Json::Value root;
-  StringPiece response_piece =
-      StringPiece(output_buffer.data(), output_buffer.size());
-  TF_RETURN_IF_ERROR(ParseJson(response_piece, &root));
+  TF_RETURN_IF_ERROR(ParseJson(output_buffer, &root));
   bool done;
   TF_RETURN_IF_ERROR(GetBoolValue(root, "done", &done));
   if (!done) {
@@ -1340,6 +1445,42 @@ Status GcsFileSystem::DeleteRecursively(const string& dirname,
   return Status::OK();
 }
 
+// Flushes all caches for filesystem metadata and file contents. Useful for
+// reclaiming memory once filesystem operations are done (e.g. model is loaded),
+// or for resetting the filesystem to a consistent state.
+void GcsFileSystem::FlushCaches() {
+  file_block_cache_->Flush();
+  stat_cache_->Clear();
+  matching_paths_cache_->Clear();
+}
+
+// Creates an HttpRequest and sets several parameters that are common to all
+// requests.  All code (in GcsFileSystem) that creates an HttpRequest should
+// go through this method, rather than directly using http_request_factory_.
+Status GcsFileSystem::CreateHttpRequest(std::unique_ptr<HttpRequest>* request) {
+  std::unique_ptr<HttpRequest> new_request{http_request_factory_->Create()};
+  if (dns_cache_) {
+    dns_cache_->AnnotateRequest(new_request.get());
+  }
+
+  string auth_token;
+  TF_RETURN_IF_ERROR(AuthProvider::GetToken(auth_provider_.get(), &auth_token));
+
+  new_request->AddAuthBearerHeader(auth_token);
+
+  if (additional_header_) {
+    new_request->AddHeader(additional_header_->first,
+                           additional_header_->second);
+  }
+
+  if (!throttle_.AdmitRequest()) {
+    return errors::Unavailable("Request throttled");
+  }
+
+  *request = std::move(new_request);
+  return Status::OK();
+}
+
 REGISTER_FILE_SYSTEM("gs", RetryingGcsFileSystem);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.h b/tensorflow/core/platform/cloud/gcs_file_system.h
index 4b4853c838abb2d2cc1a6cf68877a0dedcbcc15c..e8edde8a445aad4c0310394d89480dc6ae445dfa 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.h
+++ b/tensorflow/core/platform/cloud/gcs_file_system.h
@@ -17,12 +17,15 @@ limitations under the License.
 #define TENSORFLOW_CORE_PLATFORM_GCS_FILE_SYSTEM_H_
 
 #include <string>
+#include <utility>
 #include <vector>
+
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/cloud/auth_provider.h"
 #include "tensorflow/core/platform/cloud/expiring_lru_cache.h"
 #include "tensorflow/core/platform/cloud/file_block_cache.h"
 #include "tensorflow/core/platform/cloud/gcs_dns_cache.h"
+#include "tensorflow/core/platform/cloud/gcs_throttle.h"
 #include "tensorflow/core/platform/cloud/http_request.h"
 #include "tensorflow/core/platform/cloud/retrying_file_system.h"
 #include "tensorflow/core/platform/file_system.h"
@@ -35,6 +38,8 @@ namespace tensorflow {
 /// which adds retry logic to GCS operations.
 class GcsFileSystem : public FileSystem {
  public:
+  struct TimeoutConfig;
+
   GcsFileSystem();
   GcsFileSystem(std::unique_ptr<AuthProvider> auth_provider,
                 std::unique_ptr<HttpRequest::Factory> http_request_factory,
@@ -42,7 +47,8 @@ class GcsFileSystem : public FileSystem {
                 uint64 stat_cache_max_age, size_t stat_cache_max_entries,
                 uint64 matching_paths_cache_max_age,
                 size_t matching_paths_cache_max_entries,
-                int64 initial_retry_delay_usec);
+                int64 initial_retry_delay_usec, TimeoutConfig timeouts,
+                std::pair<const string, const string>* additional_header);
 
   Status NewRandomAccessFile(
       const string& filename,
@@ -82,11 +88,20 @@ class GcsFileSystem : public FileSystem {
   Status DeleteRecursively(const string& dirname, int64* undeleted_files,
                            int64* undeleted_dirs) override;
 
+  void FlushCaches() override;
+
   /// These accessors are mainly for testing purposes, to verify that the
   /// environment variables that control these parameters are handled correctly.
   size_t block_size() const { return file_block_cache_->block_size(); }
   size_t max_bytes() const { return file_block_cache_->max_bytes(); }
   uint64 max_staleness() const { return file_block_cache_->max_staleness(); }
+  TimeoutConfig timeouts() const { return timeouts_; }
+  string additional_header_name() const {
+    return additional_header_ ? additional_header_->first : "";
+  }
+  string additional_header_value() const {
+    return additional_header_ ? additional_header_->second : "";
+  }
 
   uint64 stat_cache_max_age() const { return stat_cache_->max_age(); }
   size_t stat_cache_max_entries() const { return stat_cache_->max_entries(); }
@@ -98,6 +113,43 @@ class GcsFileSystem : public FileSystem {
     return matching_paths_cache_->max_entries();
   }
 
+  /// Structure containing the information for timeouts related to accessing the
+  /// GCS APIs.
+  ///
+  /// All values are in seconds.
+  struct TimeoutConfig {
+    // The request connection timeout. If a connection cannot be established
+    // within `connect` seconds, abort the request.
+    uint32 connect = 120;  // 2 minutes
+
+    // The request idle timeout. If a request has seen no activity in `idle`
+    // seconds, abort the request.
+    uint32 idle = 60;  // 1 minute
+
+    // The maximum total time a metadata request can take. If a request has not
+    // completed within `metadata` seconds, the request is aborted.
+    uint32 metadata = 3600;  // 1 hour
+
+    // The maximum total time a block read request can take. If a request has
+    // not completed within `read` seconds, the request is aborted.
+    uint32 read = 3600;  // 1 hour
+
+    // The maximum total time an upload request can take. If a request has not
+    // completed within `write` seconds, the request is aborted.
+    uint32 write = 3600;  // 1 hour
+
+    TimeoutConfig() {}
+    TimeoutConfig(uint32 connect, uint32 idle, uint32 metadata, uint32 read,
+                  uint32 write)
+        : connect(connect),
+          idle(idle),
+          metadata(metadata),
+          read(read),
+          write(write) {}
+  };
+
+  Status CreateHttpRequest(std::unique_ptr<HttpRequest>* request);
+
  private:
   /// \brief Checks if the bucket exists. Returns OK if the check succeeded.
   ///
@@ -137,12 +189,13 @@ class GcsFileSystem : public FileSystem {
 
   /// Loads file contents from GCS for a given filename, offset, and length.
   Status LoadBufferFromGCS(const string& filename, size_t offset, size_t n,
-                           std::vector<char>* out);
+                           char* buffer, size_t* bytes_transferred);
 
   std::unique_ptr<AuthProvider> auth_provider_;
   std::unique_ptr<HttpRequest::Factory> http_request_factory_;
   std::unique_ptr<FileBlockCache> file_block_cache_;
   std::unique_ptr<GcsDnsCache> dns_cache_;
+  GcsThrottle throttle_;
 
   using StatCache = ExpiringLRUCache<FileStatistics>;
   std::unique_ptr<StatCache> stat_cache_;
@@ -150,9 +203,14 @@ class GcsFileSystem : public FileSystem {
   using MatchingPathsCache = ExpiringLRUCache<std::vector<string>>;
   std::unique_ptr<MatchingPathsCache> matching_paths_cache_;
 
+  TimeoutConfig timeouts_;
+
   /// The initial delay for exponential backoffs when retrying failed calls.
   const int64 initial_retry_delay_usec_ = 1000000L;
 
+  // Additional header material to be transmitted with all GCS requests
+  std::unique_ptr<std::pair<const string, const string>> additional_header_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(GcsFileSystem);
 };
 
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index 7614ec4d7f01369eff1b21141818c673154b7542..d452074ce312f98abe6b058ea56d2e0ce4cf047a 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -22,6 +22,8 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+static GcsFileSystem::TimeoutConfig kTestTimeoutConfig(5, 1, 10, 20, 30);
+
 class FakeAuthProvider : public AuthProvider {
  public:
   Status GetToken(string* token) override {
@@ -35,12 +37,14 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoBlockCache) {
       {new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 0-5\n",
+           "Range: 0-5\n"
+           "Timeouts: 5 1 20\n",
            "012345"),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 6-11\n",
+           "Range: 6-11\n"
+           "Timeouts: 5 1 20\n",
            "6789")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -49,7 +53,8 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoBlockCache) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   std::unique_ptr<RandomAccessFile> file;
   TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
@@ -73,12 +78,14 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoBlockCache_differentN) {
       {new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 0-2\n",
+           "Range: 0-2\n"
+           "Timeouts: 5 1 20\n",
            "012"),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 3-12\n",
+           "Range: 3-12\n"
+           "Timeouts: 5 1 20\n",
            "3456789")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -87,7 +94,8 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoBlockCache_differentN) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   std::unique_ptr<RandomAccessFile> file;
   TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
@@ -116,17 +124,20 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache) {
       {new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 0-8\n",
+           "Range: 0-8\n"
+           "Timeouts: 5 1 20\n",
            "012345678"),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 9-17\n",
+           "Range: 9-17\n"
+           "Timeouts: 5 1 20\n",
            "9abcde"),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 18-26\n",
+           "Range: 18-26\n"
+           "Timeouts: 5 1 20\n",
            "")});
   GcsFileSystem fs(
       std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -135,7 +146,8 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache) {
       9 /* block size */, 18 /* max bytes */, 0 /* max staleness */,
       0 /* stat cache max age */, 0 /* stat cache max entries */,
       0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */);
+      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      kTestTimeoutConfig, nullptr /* gcs additional header */);
 
   char scratch[100];
   StringPiece result;
@@ -185,17 +197,62 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache) {
   EXPECT_EQ("0123", result);
 }
 
+TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache_Flush) {
+  // Our underlying file in this test is a 15 byte file with contents
+  // "0123456789abcde".
+  std::vector<HttpRequest*> requests(
+      {new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-8\n"
+           "Timeouts: 5 1 20\n",
+           "012345678"),
+       new FakeHttpRequest(
+           "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
+           "Auth Token: fake_token\n"
+           "Range: 0-8\n"
+           "Timeouts: 5 1 20\n",
+           "012345678")});
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      9 /* block size */, 18 /* max bytes */, 0 /* max staleness */,
+      0 /* stat cache max age */, 0 /* stat cache max entries */,
+      0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      kTestTimeoutConfig, nullptr /* gcs additional header */);
+
+  char scratch[100];
+  StringPiece result;
+  std::unique_ptr<RandomAccessFile> file;
+  TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
+  // Read the first chunk. The cache will be populated with the first block of
+  // 9 bytes.
+  scratch[5] = 'x';
+  TF_EXPECT_OK(file->Read(0, 4, &result, scratch));
+  EXPECT_EQ("0123", result);
+  EXPECT_EQ(scratch[5], 'x');  // Make sure we only copied 4 bytes.
+  // Flush caches and read the second chunk. This will be a cache miss, and
+  // the same block will be fetched again.
+  fs.FlushCaches();
+  TF_EXPECT_OK(file->Read(4, 4, &result, scratch));
+  EXPECT_EQ("4567", result);
+}
+
 TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache_MaxStaleness) {
   // Our underlying file in this test is a 16 byte file with contents
   // "0123456789abcdef".
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest("Uri: https://storage.googleapis.com/bucket/object\n"
                            "Auth Token: fake_token\n"
-                           "Range: 0-7\n",
+                           "Range: 0-7\n"
+                           "Timeouts: 5 1 20\n",
                            "01234567"),
        new FakeHttpRequest("Uri: https://storage.googleapis.com/bucket/object\n"
                            "Auth Token: fake_token\n"
-                           "Range: 8-15\n",
+                           "Range: 8-15\n"
+                           "Timeouts: 5 1 20\n",
                            "89abcdef")});
   GcsFileSystem fs(
       std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -204,7 +261,8 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache_MaxStaleness) {
       8 /* block size */, 16 /* max bytes */, 3600 /* max staleness */,
       0 /* stat cache max age */, 0 /* stat cache max entries */,
       0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */);
+      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      kTestTimeoutConfig, nullptr /* gcs additional header */);
   char scratch[100];
   StringPiece result;
   // There should only be two HTTP requests issued to GCS even though we iterate
@@ -245,7 +303,8 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoObjectName) {
       0 /* read ahead bytes */, 0 /* max bytes */, 0 /* max staleness */,
       0 /* stat cache max age */, 0 /* stat cache max entries */,
       0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */);
+      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      kTestTimeoutConfig, nullptr /* gcs additional header */);
 
   std::unique_ptr<RandomAccessFile> file;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -257,24 +316,28 @@ TEST(GcsFileSystemTest, NewWritableFile) {
       {new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fwriteable\n"
            "Auth Token: fake_token\n"
-           "Range: 0-7\n",
+           "Range: 0-7\n"
+           "Timeouts: 5 1 20\n",
            "01234567"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/upload/storage/v1/b/bucket/o?"
            "uploadType=resumable&name=path%2Fwriteable\n"
            "Auth Token: fake_token\n"
            "Header X-Upload-Content-Length: 17\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "", {{"Location", "https://custom/upload/location"}}),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
                            "Header Content-Range: bytes 0-16/17\n"
+                           "Timeouts: 5 1 30\n"
                            "Put body: content1,content2\n",
                            ""),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fwriteable\n"
            "Auth Token: fake_token\n"
-           "Range: 0-7\n",
+           "Range: 0-7\n"
+           "Timeouts: 5 1 20\n",
            "01234567")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -283,7 +346,8 @@ TEST(GcsFileSystemTest, NewWritableFile) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   // Read from the file first, to fill the block cache.
   std::unique_ptr<RandomAccessFile> rfile;
@@ -315,15 +379,18 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceeds) {
            "uploadType=resumable&name=path%2Fwriteable.txt\n"
            "Auth Token: fake_token\n"
            "Header X-Upload-Content-Length: 17\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "", {{"Location", "https://custom/upload/location"}}),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
                            "Header Content-Range: bytes 0-16/17\n"
+                           "Timeouts: 5 1 30\n"
                            "Put body: content1,content2\n",
                            "", errors::Unavailable("503"), 503),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Header Content-Range: bytes */17\n"
                            "Put: yes\n",
                            "", errors::FailedPrecondition("308"), nullptr,
@@ -331,10 +398,12 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceeds) {
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
                            "Header Content-Range: bytes 11-16/17\n"
+                           "Timeouts: 5 1 30\n"
                            "Put body: ntent2\n",
                            "", errors::Unavailable("503"), 503),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Header Content-Range: bytes */17\n"
                            "Put: yes\n",
                            "", errors::FailedPrecondition("308"), nullptr,
@@ -342,6 +411,7 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceeds) {
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
                            "Header Content-Range: bytes 13-16/17\n"
+                           "Timeouts: 5 1 30\n"
                            "Put body: ent2\n",
                            "")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -351,7 +421,8 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceeds) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   std::unique_ptr<WritableFile> file;
   TF_EXPECT_OK(fs.NewWritableFile("gs://bucket/path/writeable.txt", &file));
@@ -369,29 +440,34 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceedsOnGetStatus) {
       {new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fwriteable\n"
            "Auth Token: fake_token\n"
-           "Range: 0-7\n",
+           "Range: 0-7\n"
+           "Timeouts: 5 1 20\n",
            "01234567"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/upload/storage/v1/b/bucket/o?"
            "uploadType=resumable&name=path%2Fwriteable\n"
            "Auth Token: fake_token\n"
            "Header X-Upload-Content-Length: 17\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "", {{"Location", "https://custom/upload/location"}}),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
                            "Header Content-Range: bytes 0-16/17\n"
+                           "Timeouts: 5 1 30\n"
                            "Put body: content1,content2\n",
                            "", errors::Unavailable("503"), 503),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Header Content-Range: bytes */17\n"
                            "Put: yes\n",
                            "", Status::OK(), nullptr, {}, 201),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fwriteable\n"
            "Auth Token: fake_token\n"
-           "Range: 0-7\n",
+           "Range: 0-7\n"
+           "Timeouts: 5 1 20\n",
            "01234567")});
   GcsFileSystem fs(
       std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -400,7 +476,8 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadSucceedsOnGetStatus) {
       8 /* block size */, 8 /* max bytes */, 3600 /* max staleness */,
       0 /* stat cache max age */, 0 /* stat cache max entries */,
       0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */);
+      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      kTestTimeoutConfig, nullptr /* gcs additional header */);
   // Pull the file's first block into the cache. This will trigger the first
   // HTTP request to GCS.
   std::unique_ptr<RandomAccessFile> rfile;
@@ -434,17 +511,20 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadAllAttemptsFail) {
            "uploadType=resumable&name=path%2Fwriteable.txt\n"
            "Auth Token: fake_token\n"
            "Header X-Upload-Content-Length: 17\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "", {{"Location", "https://custom/upload/location"}}),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
                            "Header Content-Range: bytes 0-16/17\n"
+                           "Timeouts: 5 1 30\n"
                            "Put body: content1,content2\n",
                            "", errors::Unavailable("503"), 503)});
   for (int i = 0; i < 10; i++) {
     requests.emplace_back(new FakeHttpRequest(
         "Uri: https://custom/upload/location\n"
         "Auth Token: fake_token\n"
+        "Timeouts: 5 1 10\n"
         "Header Content-Range: bytes */17\n"
         "Put: yes\n",
         "", errors::FailedPrecondition("important HTTP error 308"), nullptr,
@@ -453,6 +533,7 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadAllAttemptsFail) {
         "Uri: https://custom/upload/location\n"
         "Auth Token: fake_token\n"
         "Header Content-Range: bytes 11-16/17\n"
+        "Timeouts: 5 1 30\n"
         "Put body: ntent2\n",
         "", errors::Unavailable("important HTTP error 503"), 503));
   }
@@ -463,12 +544,14 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadAllAttemptsFail) {
       "uploadType=resumable&name=path%2Fwriteable.txt\n"
       "Auth Token: fake_token\n"
       "Header X-Upload-Content-Length: 17\n"
-      "Post: yes\n",
+      "Post: yes\n"
+      "Timeouts: 5 1 10\n",
       "", {{"Location", "https://custom/upload/location"}}));
   requests.emplace_back(
       new FakeHttpRequest("Uri: https://custom/upload/location\n"
                           "Auth Token: fake_token\n"
                           "Header Content-Range: bytes 0-16/17\n"
+                          "Timeouts: 5 1 30\n"
                           "Put body: content1,content2\n",
                           ""));
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -478,7 +561,8 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadAllAttemptsFail) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   2 /* initial retry delay */);
+                   2 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   std::unique_ptr<WritableFile> file;
   TF_EXPECT_OK(fs.NewWritableFile("gs://bucket/path/writeable.txt", &file));
@@ -500,11 +584,13 @@ TEST(GcsFileSystemTest, NewWritableFile_UploadReturns410) {
            "uploadType=resumable&name=path%2Fwriteable.txt\n"
            "Auth Token: fake_token\n"
            "Header X-Upload-Content-Length: 17\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "", {{"Location", "https://custom/upload/location"}}),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
                            "Header Content-Range: bytes 0-16/17\n"
+                           "Timeouts: 5 1 30\n"
                            "Put body: content1,content2\n",
                            "", errors::NotFound("important HTTP error 410"),
                            410),
@@ -515,11 +601,13 @@ TEST(GcsFileSystemTest, NewWritableFile_UploadReturns410) {
            "uploadType=resumable&name=path%2Fwriteable.txt\n"
            "Auth Token: fake_token\n"
            "Header X-Upload-Content-Length: 17\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "", {{"Location", "https://custom/upload/location"}}),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
                            "Header Content-Range: bytes 0-16/17\n"
+                           "Timeouts: 5 1 30\n"
                            "Put body: content1,content2\n",
                            "")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -529,7 +617,8 @@ TEST(GcsFileSystemTest, NewWritableFile_UploadReturns410) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   std::unique_ptr<WritableFile> file;
   TF_EXPECT_OK(fs.NewWritableFile("gs://bucket/path/writeable.txt", &file));
@@ -558,7 +647,8 @@ TEST(GcsFileSystemTest, NewWritableFile_NoObjectName) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   std::unique_ptr<WritableFile> file;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -570,24 +660,28 @@ TEST(GcsFileSystemTest, NewAppendableFile) {
       {new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fappendable\n"
            "Auth Token: fake_token\n"
-           "Range: 0-31\n",
+           "Range: 0-31\n"
+           "Timeouts: 5 1 20\n",
            "content1,"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/upload/storage/v1/b/bucket/o?"
            "uploadType=resumable&name=path%2Fappendable\n"
            "Auth Token: fake_token\n"
            "Header X-Upload-Content-Length: 17\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "", {{"Location", "https://custom/upload/location"}}),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
                            "Header Content-Range: bytes 0-16/17\n"
+                           "Timeouts: 5 1 30\n"
                            "Put body: content1,content2\n",
                            ""),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fappendable\n"
            "Auth Token: fake_token\n"
-           "Range: 0-31\n",
+           "Range: 0-31\n"
+           "Timeouts: 5 1 20\n",
            "01234567")});
   GcsFileSystem fs(
       std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -596,7 +690,8 @@ TEST(GcsFileSystemTest, NewAppendableFile) {
       32 /* block size */, 32 /* max bytes */, 0 /* max staleness */,
       0 /* stat cache max age */, 0 /* stat cache max entries */,
       0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */);
+      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      kTestTimeoutConfig, nullptr /* gcs additional header */);
 
   // Create an appendable file. This should read the file from GCS, and pull its
   // contents into the block cache.
@@ -629,7 +724,8 @@ TEST(GcsFileSystemTest, NewAppendableFile_NoObjectName) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   std::unique_ptr<WritableFile> file;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -642,7 +738,8 @@ TEST(GcsFileSystemTest, NewReadOnlyMemoryRegionFromFile) {
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Frandom_access.txt?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            strings::StrCat("{\"size\": \"", content.size(),
                            "\", \"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        new FakeHttpRequest(
@@ -650,7 +747,7 @@ TEST(GcsFileSystemTest, NewReadOnlyMemoryRegionFromFile) {
                            "path%2Frandom_access.txt\n"
                            "Auth Token: fake_token\n"
                            "Range: 0-",
-                           content.size() - 1, "\n"),
+                           content.size() - 1, "\n", "Timeouts: 5 1 20\n"),
            content)});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -659,7 +756,8 @@ TEST(GcsFileSystemTest, NewReadOnlyMemoryRegionFromFile) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   std::unique_ptr<ReadOnlyMemoryRegion> region;
   TF_EXPECT_OK(fs.NewReadOnlyMemoryRegionFromFile(
@@ -678,7 +776,8 @@ TEST(GcsFileSystemTest, NewReadOnlyMemoryRegionFromFile_NoObjectName) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   std::unique_ptr<ReadOnlyMemoryRegion> region;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -689,7 +788,8 @@ TEST(GcsFileSystemTest, FileExists_YesAsObject) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
       "path%2Ffile1.txt?fields=size%2Cupdated\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       strings::StrCat("{\"size\": \"1010\","
                       "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -699,7 +799,8 @@ TEST(GcsFileSystemTest, FileExists_YesAsObject) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.FileExists("gs://bucket/path/file1.txt"));
 }
@@ -709,13 +810,15 @@ TEST(GcsFileSystemTest, FileExists_YesAsFolder) {
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Fsubfolder?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2Fsubfolder%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/subfolder/\" }]}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -725,7 +828,8 @@ TEST(GcsFileSystemTest, FileExists_YesAsFolder) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.FileExists("gs://bucket/path/subfolder"));
 }
@@ -734,11 +838,13 @@ TEST(GcsFileSystemTest, FileExists_YesAsBucket) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"size\": \"100\"}"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"size\": \"100\"}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -747,7 +853,8 @@ TEST(GcsFileSystemTest, FileExists_YesAsBucket) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.FileExists("gs://bucket1"));
   TF_EXPECT_OK(fs.FileExists("gs://bucket1/"));
@@ -758,13 +865,15 @@ TEST(GcsFileSystemTest, FileExists_NotAsObjectOrFolder) {
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Ffile1.txt?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2Ffile1.txt%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": []}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -773,7 +882,8 @@ TEST(GcsFileSystemTest, FileExists_NotAsObjectOrFolder) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   EXPECT_EQ(errors::Code::NOT_FOUND,
             fs.FileExists("gs://bucket/path/file1.txt").code());
@@ -783,11 +893,13 @@ TEST(GcsFileSystemTest, FileExists_NotAsBucket) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket2\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket2\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404)});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -796,7 +908,8 @@ TEST(GcsFileSystemTest, FileExists_NotAsBucket) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
             fs.FileExists("gs://bucket2/").code());
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -808,19 +921,22 @@ TEST(GcsFileSystemTest, FileExists_StatCache) {
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Ffile1.txt?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            strings::StrCat("{\"size\": \"1010\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Fsubfolder?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2Fsubfolder%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/subfolder/\" }]}")});
   GcsFileSystem fs(
@@ -830,7 +946,8 @@ TEST(GcsFileSystemTest, FileExists_StatCache) {
       0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
       3600 /* stat cache max age */, 0 /* stat cache max entries */,
       0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */);
+      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      kTestTimeoutConfig, nullptr /* gcs additional header */);
 
   // The stat cache will ensure that repeated lookups don't trigger additional
   // HTTP requests.
@@ -845,7 +962,8 @@ TEST(GcsFileSystemTest, GetChildren_NoItems) {
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2Cprefixes%2CnextPageToken&delimiter=%2F&prefix="
       "path%2F\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{\"prefixes\": [\"path/subpath/\"]}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -854,7 +972,8 @@ TEST(GcsFileSystemTest, GetChildren_NoItems) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path/", &children));
@@ -867,7 +986,8 @@ TEST(GcsFileSystemTest, GetChildren_ThreeFiles) {
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2Cprefixes%2CnextPageToken&delimiter=%2F&prefix="
       "path%2F\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{\"items\": [ "
       "  { \"name\": \"path/file1.txt\" },"
       "  { \"name\": \"path/file3.txt\" }],"
@@ -879,7 +999,8 @@ TEST(GcsFileSystemTest, GetChildren_ThreeFiles) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path/", &children));
@@ -893,7 +1014,8 @@ TEST(GcsFileSystemTest, GetChildren_SelfDirectoryMarker) {
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2Cprefixes%2CnextPageToken&delimiter=%2F&prefix="
       "path%2F\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{\"items\": [ "
       "  { \"name\": \"path/\" },"
       "  { \"name\": \"path/file3.txt\" }],"
@@ -905,7 +1027,8 @@ TEST(GcsFileSystemTest, GetChildren_SelfDirectoryMarker) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path/", &children));
@@ -918,7 +1041,8 @@ TEST(GcsFileSystemTest, GetChildren_ThreeFiles_NoSlash) {
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2Cprefixes%2CnextPageToken&delimiter=%2F&prefix="
       "path%2F\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{\"items\": [ "
       "  { \"name\": \"path/file1.txt\" },"
       "  { \"name\": \"path/file3.txt\" }],"
@@ -930,7 +1054,8 @@ TEST(GcsFileSystemTest, GetChildren_ThreeFiles_NoSlash) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path", &children));
@@ -943,7 +1068,8 @@ TEST(GcsFileSystemTest, GetChildren_Root) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket-a-b-c/o?"
       "fields=items%2Fname%2Cprefixes%2CnextPageToken&delimiter=%2F\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -952,7 +1078,8 @@ TEST(GcsFileSystemTest, GetChildren_Root) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket-a-b-c", &children));
@@ -965,7 +1092,8 @@ TEST(GcsFileSystemTest, GetChildren_Empty) {
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2Cprefixes%2CnextPageToken&delimiter=%2F&prefix="
       "path%2F\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -974,7 +1102,8 @@ TEST(GcsFileSystemTest, GetChildren_Empty) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path/", &children));
@@ -988,7 +1117,8 @@ TEST(GcsFileSystemTest, GetChildren_Pagination) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2Cprefixes%2CnextPageToken&delimiter=%2F&"
            "prefix=path%2F\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"nextPageToken\": \"ABCD==\", "
            "\"items\": [ "
            "  { \"name\": \"path/file1.txt\" },"
@@ -999,7 +1129,8 @@ TEST(GcsFileSystemTest, GetChildren_Pagination) {
            "fields=items%2Fname%2Cprefixes%2CnextPageToken&delimiter=%2F&"
            "prefix=path%2F"
            "&pageToken=ABCD==\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/file4.txt\" },"
            "  { \"name\": \"path/file5.txt\" }]}")});
@@ -1011,7 +1142,8 @@ TEST(GcsFileSystemTest, GetChildren_Pagination) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> children;
   TF_EXPECT_OK(fs.GetChildren("gs://bucket/path", &children));
@@ -1025,7 +1157,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_NoWildcard) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2CnextPageToken&prefix=path%2Fsubpath%2F\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{\"items\": [ "
       "  { \"name\": \"path/subpath/file2.txt\" }]}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1035,7 +1168,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_NoWildcard) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> result;
   TF_EXPECT_OK(
@@ -1048,7 +1182,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_BucketAndWildcard) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2CnextPageToken\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{\"items\": [ "
       "  { \"name\": \"path/file1.txt\" },"
       "  { \"name\": \"path/subpath/file2.txt\" },"
@@ -1060,7 +1195,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_BucketAndWildcard) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/*/*", &result));
@@ -1074,7 +1210,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_FolderAndWildcard_Matches) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2CnextPageToken&prefix=path%2F\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{\"items\": [ "
       "  { \"name\": \"path/file1.txt\" },"
       "  { \"name\": \"path/subpath/file2.txt\" },"
@@ -1086,7 +1223,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_FolderAndWildcard_Matches) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/path/*/file2.txt", &result));
@@ -1098,7 +1236,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_SelfDirectoryMarker) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2CnextPageToken&prefix=path%2F\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{\"items\": [ "
       "  { \"name\": \"path/\" },"
       "  { \"name\": \"path/file3.txt\" }]}")});
@@ -1109,7 +1248,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_SelfDirectoryMarker) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/path/*", &result));
@@ -1120,7 +1260,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_FolderAndWildcard_NoMatches) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2CnextPageToken&prefix=path%2F\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{\"items\": [ "
       "  { \"name\": \"path/file1.txt\" },"
       "  { \"name\": \"path/subpath/file2.txt\" },"
@@ -1132,7 +1273,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_FolderAndWildcard_NoMatches) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> result;
   TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/path/*/file3.txt", &result));
@@ -1148,7 +1290,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_OnlyWildcard) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   std::vector<string> result;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -1160,13 +1303,15 @@ TEST(GcsFileSystemTest, GetMatchingPaths_Cache) {
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2Fsubpath%2F\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/subpath/file2.txt\" }]}"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/file1.txt\" },"
            "  { \"name\": \"path/subpath/file2.txt\" },"
@@ -1178,7 +1323,8 @@ TEST(GcsFileSystemTest, GetMatchingPaths_Cache) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    3600 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   // Repeated calls to fs.GetMatchingPaths on these patterns should not lead to
   // any additional HTTP requests to GCS.
@@ -1196,22 +1342,70 @@ TEST(GcsFileSystemTest, GetMatchingPaths_Cache) {
   }
 }
 
+TEST(GcsFileSystemTest, GetMatchingPaths_Cache_Flush) {
+  std::vector<HttpRequest*> requests(
+      {new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
+           "fields=items%2Fname%2CnextPageToken&prefix=path%2Fsubpath%2F\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           "{\"items\": [ "
+           "  { \"name\": \"path/subpath/file2.txt\" }]}"),
+       new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
+           "fields=items%2Fname%2CnextPageToken&prefix=path%2Fsubpath%2F\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           "{\"items\": [ "
+           "  { \"name\": \"path/subpath/file2.txt\" }]}")});
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   3600 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */,
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
+
+  // This loop should trigger the first HTTP request to GCS.
+  for (int i = 0; i < 10; i++) {
+    std::vector<string> result;
+    TF_EXPECT_OK(
+        fs.GetMatchingPaths("gs://bucket/path/subpath/file2.txt", &result));
+    EXPECT_EQ(std::vector<string>({"gs://bucket/path/subpath/file2.txt"}),
+              result);
+  }
+  // After flushing caches, there should be another (identical) request to GCS.
+  fs.FlushCaches();
+  for (int i = 0; i < 10; i++) {
+    std::vector<string> result;
+    TF_EXPECT_OK(
+        fs.GetMatchingPaths("gs://bucket/path/subpath/file2.txt", &result));
+    EXPECT_EQ(std::vector<string>({"gs://bucket/path/subpath/file2.txt"}),
+              result);
+  }
+}
+
 TEST(GcsFileSystemTest, DeleteFile) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Ffile1.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 0-15\n",
+           "Range: 0-15\n"
+           "Timeouts: 5 1 20\n",
            "01234567"),
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2Ffile1.txt\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            ""),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Ffile1.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 0-15\n",
+           "Range: 0-15\n"
+           "Timeouts: 5 1 20\n",
            "76543210")});
   GcsFileSystem fs(
       std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1220,7 +1414,8 @@ TEST(GcsFileSystemTest, DeleteFile) {
       16 /* block size */, 16 /* max bytes */, 0 /* max staleness */,
       0 /* stat cache max age */, 0 /* stat cache max entries */,
       0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */);
+      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
+      kTestTimeoutConfig, nullptr /* gcs additional header */);
 
   // Do an initial read of the file to load its contents into the block cache.
   char scratch[100];
@@ -1246,7 +1441,8 @@ TEST(GcsFileSystemTest, DeleteFile_NoObjectName) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
             fs.DeleteFile("gs://bucket/").code());
@@ -1256,7 +1452,8 @@ TEST(GcsFileSystemTest, DeleteDir_Empty) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2CnextPageToken&prefix=path%2F&maxResults=2\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1265,7 +1462,8 @@ TEST(GcsFileSystemTest, DeleteDir_Empty) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.DeleteDir("gs://bucket/path/"));
 }
@@ -1275,12 +1473,14 @@ TEST(GcsFileSystemTest, DeleteDir_OnlyDirMarkerLeft) {
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2F&maxResults=2\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/\" }]}"),
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2F\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            "")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1290,7 +1490,8 @@ TEST(GcsFileSystemTest, DeleteDir_OnlyDirMarkerLeft) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.DeleteDir("gs://bucket/path/"));
 }
@@ -1298,7 +1499,8 @@ TEST(GcsFileSystemTest, DeleteDir_OnlyDirMarkerLeft) {
 TEST(GcsFileSystemTest, DeleteDir_BucketOnly) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?fields=items%2F"
-      "name%2CnextPageToken&maxResults=2\nAuth Token: fake_token\n",
+      "name%2CnextPageToken&maxResults=2\nAuth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1307,7 +1509,8 @@ TEST(GcsFileSystemTest, DeleteDir_BucketOnly) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.DeleteDir("gs://bucket"));
 }
@@ -1316,7 +1519,8 @@ TEST(GcsFileSystemTest, DeleteDir_NonEmpty) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
       "fields=items%2Fname%2CnextPageToken&prefix=path%2F&maxResults=2\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{\"items\": [ "
       "  { \"name\": \"path/file1.txt\" }]}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1326,7 +1530,8 @@ TEST(GcsFileSystemTest, DeleteDir_NonEmpty) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   EXPECT_EQ(error::Code::FAILED_PRECONDITION,
             fs.DeleteDir("gs://bucket/path/").code());
@@ -1336,7 +1541,8 @@ TEST(GcsFileSystemTest, GetFileSize) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
       "file.txt?fields=size%2Cupdated\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       strings::StrCat("{\"size\": \"1010\","
                       "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1346,7 +1552,8 @@ TEST(GcsFileSystemTest, GetFileSize) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   uint64 size;
   TF_EXPECT_OK(fs.GetFileSize("gs://bucket/file.txt", &size));
@@ -1362,7 +1569,8 @@ TEST(GcsFileSystemTest, GetFileSize_NoObjectName) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   uint64 size;
   EXPECT_EQ(errors::Code::INVALID_ARGUMENT,
@@ -1376,14 +1584,16 @@ TEST(GcsFileSystemTest, RenameFile_Folder) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path1%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path1/subfolder/file1.txt\" }]}"),
        // Requesting the full list of files in the folder.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path1%2F\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path1/\" },"  // A directory marker.
            "  { \"name\": \"path1/subfolder/file1.txt\" },"
@@ -1393,13 +1603,15 @@ TEST(GcsFileSystemTest, RenameFile_Folder) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path1%2F/rewriteTo/b/bucket/o/path2%2F\n"
            "Auth Token: fake_token\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "{\"done\": true}"),
        // Deleting the original directory marker.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path1%2F\n"
            "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n"
            "Delete: yes\n",
            ""),
        // Copying the first file.
@@ -1408,13 +1620,15 @@ TEST(GcsFileSystemTest, RenameFile_Folder) {
            "path1%2Fsubfolder%2Ffile1.txt/rewriteTo/b/bucket/o/"
            "path2%2Fsubfolder%2Ffile1.txt\n"
            "Auth Token: fake_token\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "{\"done\": true}"),
        // Deleting the first original file.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path1%2Fsubfolder%2Ffile1.txt\n"
            "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n"
            "Delete: yes\n",
            ""),
        // Copying the second file.
@@ -1422,13 +1636,15 @@ TEST(GcsFileSystemTest, RenameFile_Folder) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path1%2Ffile2.txt/rewriteTo/b/bucket/o/path2%2Ffile2.txt\n"
            "Auth Token: fake_token\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "{\"done\": true}"),
        // Deleting the second original file.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path1%2Ffile2.txt\n"
            "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n"
            "Delete: yes\n",
            "")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1438,7 +1654,8 @@ TEST(GcsFileSystemTest, RenameFile_Folder) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.RenameFile("gs://bucket/path1", "gs://bucket/path2/"));
 }
@@ -1448,25 +1665,29 @@ TEST(GcsFileSystemTest, RenameFile_Object) {
       {new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fsrc.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 0-15\n",
+           "Range: 0-15\n"
+           "Timeouts: 5 1 20\n",
            "01234567"),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fdst.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 0-15\n",
+           "Range: 0-15\n"
+           "Timeouts: 5 1 20\n",
            "76543210"),
        // IsDirectory is checking whether there are children objects.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2Fsrc.txt%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{}"),
        // IsDirectory is checking if the path exists as an object.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Fsrc.txt?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            strings::StrCat("{\"size\": \"1010\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        // Copying to the new location.
@@ -1474,24 +1695,28 @@ TEST(GcsFileSystemTest, RenameFile_Object) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Fsrc.txt/rewriteTo/b/bucket/o/path%2Fdst.txt\n"
            "Auth Token: fake_token\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "{\"done\": true}"),
        // Deleting the original file.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Fsrc.txt\n"
            "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n"
            "Delete: yes\n",
            ""),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fsrc.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 0-15\n",
+           "Range: 0-15\n"
+           "Timeouts: 5 1 20\n",
            "89abcdef"),
        new FakeHttpRequest(
            "Uri: https://storage.googleapis.com/bucket/path%2Fdst.txt\n"
            "Auth Token: fake_token\n"
-           "Range: 0-15\n",
+           "Range: 0-15\n"
+           "Timeouts: 5 1 20\n",
            "fedcba98")});
   GcsFileSystem fs(
       std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1500,7 +1725,8 @@ TEST(GcsFileSystemTest, RenameFile_Object) {
       16 /* block size */, 64 /* max bytes */, 0 /* max staleness */,
       0 /* stat cache max age */, 0 /* stat cache max entries */,
       0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */);
+      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
+      kTestTimeoutConfig, nullptr /* gcs additional header */);
   // Do an initial read of the source and destination files to load their
   // contents into the block cache.
   char scratch[100];
@@ -1531,13 +1757,15 @@ TEST(GcsFileSystemTest, RenameFile_Object_DeletionRetried) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2Fsrc.txt%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{}"),
        // IsDirectory is checking if the path exists as an object.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Fsrc.txt?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            strings::StrCat("{\"size\": \"1010\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        // Copying to the new location.
@@ -1545,13 +1773,15 @@ TEST(GcsFileSystemTest, RenameFile_Object_DeletionRetried) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Fsrc.txt/rewriteTo/b/bucket/o/path%2Fdst.txt\n"
            "Auth Token: fake_token\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "{\"done\": true}"),
        // Deleting the original file - the deletion returns a failure.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Fsrc.txt\n"
            "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n"
            "Delete: yes\n",
            "", errors::Unavailable("503"), 503),
        // Deleting the original file again - the deletion returns NOT_FOUND.
@@ -1559,6 +1789,7 @@ TEST(GcsFileSystemTest, RenameFile_Object_DeletionRetried) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Fsrc.txt\n"
            "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n"
            "Delete: yes\n",
            "", errors::NotFound("404"), 404)});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1568,7 +1799,8 @@ TEST(GcsFileSystemTest, RenameFile_Object_DeletionRetried) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(
       fs.RenameFile("gs://bucket/path/src.txt", "gs://bucket/path/dst.txt"));
@@ -1582,13 +1814,15 @@ TEST(GcsFileSystemTest, RenameFile_Object_Incomplete) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2Fsrc.txt%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{}"),
        // IsDirectory is checking if the path exists as an object.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Fsrc.txt?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            strings::StrCat("{\"size\": \"1010\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        // Copying to the new location.
@@ -1596,7 +1830,8 @@ TEST(GcsFileSystemTest, RenameFile_Object_Incomplete) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Fsrc.txt/rewriteTo/b/bucket/o/path%2Fdst.txt\n"
            "Auth Token: fake_token\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "{\"done\": false}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1605,7 +1840,8 @@ TEST(GcsFileSystemTest, RenameFile_Object_Incomplete) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   EXPECT_EQ(
       errors::Code::UNIMPLEMENTED,
@@ -1617,7 +1853,8 @@ TEST(GcsFileSystemTest, Stat_Object) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
       "file.txt?fields=size%2Cupdated\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       strings::StrCat("{\"size\": \"1010\","
                       "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1627,7 +1864,8 @@ TEST(GcsFileSystemTest, Stat_Object) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   FileStatistics stat;
   TF_EXPECT_OK(fs.Stat("gs://bucket/file.txt", &stat));
@@ -1641,13 +1879,15 @@ TEST(GcsFileSystemTest, Stat_Folder) {
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "subfolder?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=subfolder%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"subfolder/\" }]}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1657,7 +1897,8 @@ TEST(GcsFileSystemTest, Stat_Folder) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   FileStatistics stat;
   TF_EXPECT_OK(fs.Stat("gs://bucket/subfolder", &stat));
@@ -1671,13 +1912,15 @@ TEST(GcsFileSystemTest, Stat_ObjectOrFolderNotFound) {
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1686,7 +1929,8 @@ TEST(GcsFileSystemTest, Stat_ObjectOrFolderNotFound) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   FileStatistics stat;
   EXPECT_EQ(error::Code::NOT_FOUND, fs.Stat("gs://bucket/path", &stat).code());
@@ -1695,7 +1939,8 @@ TEST(GcsFileSystemTest, Stat_ObjectOrFolderNotFound) {
 TEST(GcsFileSystemTest, Stat_Bucket) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "{}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1704,7 +1949,8 @@ TEST(GcsFileSystemTest, Stat_Bucket) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   FileStatistics stat;
   TF_EXPECT_OK(fs.Stat("gs://bucket/", &stat));
@@ -1716,7 +1962,8 @@ TEST(GcsFileSystemTest, Stat_Bucket) {
 TEST(GcsFileSystemTest, Stat_BucketNotFound) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "", errors::NotFound("404"), 404)});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1725,7 +1972,8 @@ TEST(GcsFileSystemTest, Stat_BucketNotFound) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   FileStatistics stat;
   EXPECT_EQ(error::Code::NOT_FOUND, fs.Stat("gs://bucket/", &stat).code());
@@ -1736,19 +1984,22 @@ TEST(GcsFileSystemTest, Stat_Cache) {
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "file.txt?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            strings::StrCat("{\"size\": \"1010\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "subfolder?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=subfolder%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"subfolder/\" }]}")});
   GcsFileSystem fs(
@@ -1758,7 +2009,8 @@ TEST(GcsFileSystemTest, Stat_Cache) {
       0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
       3600 /* stat cache max age */, 0 /* stat cache max entries */,
       0 /* matching paths cache max age */,
-      0 /* matching paths cache max entries */, 0 /* initial retry delay */);
+      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
+      kTestTimeoutConfig, nullptr /* gcs additional header */);
 
   // Repeated calls to fs.Stat on these paths should not lead to any additional
   // HTTP requests to GCS.
@@ -1775,18 +2027,64 @@ TEST(GcsFileSystemTest, Stat_Cache) {
   }
 }
 
+TEST(GcsFileSystemTest, Stat_Cache_Flush) {
+  std::vector<HttpRequest*> requests(
+      {new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "file.txt?fields=size%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"1010\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}")),
+       new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "file.txt?fields=size%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           strings::StrCat("{\"size\": \"1010\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+      3600 /* stat cache max age */, 0 /* stat cache max entries */,
+      0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, 0 /* initial retry delay*/,
+      kTestTimeoutConfig, nullptr /* gcs additional header */);
+  // There should be a single HTTP request to GCS for fs.Stat in this loop.
+  for (int i = 0; i < 10; i++) {
+    FileStatistics stat;
+    TF_EXPECT_OK(fs.Stat("gs://bucket/file.txt", &stat));
+    EXPECT_EQ(1010, stat.length);
+    EXPECT_NEAR(1461971724896, stat.mtime_nsec / 1000 / 1000, 1);
+    EXPECT_FALSE(stat.is_directory);
+  }
+  // After flushing caches, there should be a second request to GCS for fs.Stat.
+  fs.FlushCaches();
+  for (int i = 0; i < 10; i++) {
+    FileStatistics stat;
+    TF_EXPECT_OK(fs.Stat("gs://bucket/file.txt", &stat));
+    EXPECT_EQ(1010, stat.length);
+    EXPECT_NEAR(1461971724896, stat.mtime_nsec / 1000 / 1000, 1);
+    EXPECT_FALSE(stat.is_directory);
+  }
+}
+
 TEST(GcsFileSystemTest, IsDirectory_NotFound) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=file.txt%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{}"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "file.txt?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404)});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1795,7 +2093,8 @@ TEST(GcsFileSystemTest, IsDirectory_NotFound) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   EXPECT_EQ(error::Code::NOT_FOUND,
             fs.IsDirectory("gs://bucket/file.txt").code());
@@ -1807,12 +2106,14 @@ TEST(GcsFileSystemTest, IsDirectory_NotDirectoryButObject) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=file.txt%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{}"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "file.txt?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            strings::StrCat("{\"size\": \"1010\","
                            "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1822,7 +2123,8 @@ TEST(GcsFileSystemTest, IsDirectory_NotDirectoryButObject) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   EXPECT_EQ(error::Code::FAILED_PRECONDITION,
             fs.IsDirectory("gs://bucket/file.txt").code());
@@ -1834,13 +2136,15 @@ TEST(GcsFileSystemTest, IsDirectory_Yes) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=subfolder%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [{\"name\": \"subfolder/\"}]}"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=subfolder%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [{\"name\": \"subfolder/\"}]}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1849,7 +2153,8 @@ TEST(GcsFileSystemTest, IsDirectory_Yes) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.IsDirectory("gs://bucket/subfolder"));
   TF_EXPECT_OK(fs.IsDirectory("gs://bucket/subfolder/"));
@@ -1859,11 +2164,13 @@ TEST(GcsFileSystemTest, IsDirectory_Bucket) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{}"),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{}")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1872,7 +2179,8 @@ TEST(GcsFileSystemTest, IsDirectory_Bucket) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.IsDirectory("gs://bucket"));
   TF_EXPECT_OK(fs.IsDirectory("gs://bucket/"));
@@ -1881,7 +2189,8 @@ TEST(GcsFileSystemTest, IsDirectory_Bucket) {
 TEST(GcsFileSystemTest, IsDirectory_BucketNotFound) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket\n"
-      "Auth Token: fake_token\n",
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
       "", errors::NotFound("404"), 404)});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1890,7 +2199,8 @@ TEST(GcsFileSystemTest, IsDirectory_BucketNotFound) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   EXPECT_EQ(error::Code::NOT_FOUND, fs.IsDirectory("gs://bucket/").code());
 }
@@ -1902,10 +2212,12 @@ TEST(GcsFileSystemTest, CreateDir_Folder) {
            "uploadType=resumable&name=subpath%2F\n"
            "Auth Token: fake_token\n"
            "Header X-Upload-Content-Length: 0\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "", {{"Location", "https://custom/upload/location"}}),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 30\n"
                            "Put body: \n",
                            ""),
        new FakeHttpRequest(
@@ -1913,10 +2225,12 @@ TEST(GcsFileSystemTest, CreateDir_Folder) {
            "uploadType=resumable&name=subpath%2F\n"
            "Auth Token: fake_token\n"
            "Header X-Upload-Content-Length: 0\n"
-           "Post: yes\n",
+           "Post: yes\n"
+           "Timeouts: 5 1 10\n",
            "", {{"Location", "https://custom/upload/location"}}),
        new FakeHttpRequest("Uri: https://custom/upload/location\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 30\n"
                            "Put body: \n",
                            "")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -1926,7 +2240,8 @@ TEST(GcsFileSystemTest, CreateDir_Folder) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.CreateDir("gs://bucket/subpath"));
   TF_EXPECT_OK(fs.CreateDir("gs://bucket/subpath/"));
@@ -1936,11 +2251,13 @@ TEST(GcsFileSystemTest, CreateDir_Bucket) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            ""),
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -1949,7 +2266,8 @@ TEST(GcsFileSystemTest, CreateDir_Bucket) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.CreateDir("gs://bucket/"));
   TF_EXPECT_OK(fs.CreateDir("gs://bucket"));
@@ -1962,14 +2280,16 @@ TEST(GcsFileSystemTest, DeleteRecursively_Ok) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/file1.txt\" }]}"),
        // GetChildren recursively.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2F\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/\" },"  // The current directory's marker.
            "  { \"name\": \"path/file1.txt\" },"
@@ -1979,30 +2299,35 @@ TEST(GcsFileSystemTest, DeleteRecursively_Ok) {
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2F\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            ""),
        // Delete the object - fails and will be retried.
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2Ffile1.txt\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            "", errors::Unavailable("500"), 500),
        // Delete the object again.
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2Ffile1.txt\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            ""),
        // Delete the object.
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2Fsubpath%2Ffile2.txt\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            ""),
        // Delete the object.
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2Ffile3.txt\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            "")});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -2012,7 +2337,8 @@ TEST(GcsFileSystemTest, DeleteRecursively_Ok) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   int64 undeleted_files, undeleted_dirs;
   TF_EXPECT_OK(fs.DeleteRecursively("gs://bucket/path", &undeleted_files,
@@ -2028,14 +2354,16 @@ TEST(GcsFileSystemTest, DeleteRecursively_DeletionErrors) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/file1.txt\" }]}"),
        // Calling GetChildren recursively.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2F\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{\"items\": [ "
            "  { \"name\": \"path/file1.txt\" },"
            "  { \"name\": \"path/subpath/\" },"
@@ -2045,12 +2373,14 @@ TEST(GcsFileSystemTest, DeleteRecursively_DeletionErrors) {
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2Ffile1.txt\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            ""),
        // Deleting the directory marker gs://bucket/path/ - fails with 404.
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2Fsubpath%2F\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            "", errors::NotFound("404"), 404),
        // Checking if gs://bucket/path/subpath/ is a folder - it is.
@@ -2058,19 +2388,22 @@ TEST(GcsFileSystemTest, DeleteRecursively_DeletionErrors) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2Fsubpath%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            strings::StrCat("{\"items\": [ "
                            "    { \"name\": \"path/subpath/\" }]}")),
        // Deleting the object gs://bucket/path/subpath/file2.txt
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2Fsubpath%2Ffile2.txt\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            ""),
        // Deleting the object s://bucket/path/file3.txt - fails with 404.
        new FakeHttpRequest("Uri: https://www.googleapis.com/storage/v1/b"
                            "/bucket/o/path%2Ffile3.txt\n"
                            "Auth Token: fake_token\n"
+                           "Timeouts: 5 1 10\n"
                            "Delete: yes\n",
                            "", errors::NotFound("404"), 404),
        // Checking if gs://bucket/path/file3.txt/ is a folder - it's not.
@@ -2078,13 +2411,15 @@ TEST(GcsFileSystemTest, DeleteRecursively_DeletionErrors) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2Ffile3.txt%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{}"),
        // Checking if gs://bucket/path/file3.txt is an object - fails with 404.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path%2Ffile3.txt?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404)});
 
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
@@ -2094,7 +2429,8 @@ TEST(GcsFileSystemTest, DeleteRecursively_DeletionErrors) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   int64 undeleted_files, undeleted_dirs;
   TF_EXPECT_OK(fs.DeleteRecursively("gs://bucket/path", &undeleted_files,
@@ -2110,13 +2446,15 @@ TEST(GcsFileSystemTest, DeleteRecursively_NotAFolder) {
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
            "fields=items%2Fname%2CnextPageToken&prefix=path%2F"
            "&maxResults=1\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "{}"),
        // IsDirectory is checking if the path exists as an object.
        new FakeHttpRequest(
            "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
            "path?fields=size%2Cupdated\n"
-           "Auth Token: fake_token\n",
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
            "", errors::NotFound("404"), 404)});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
@@ -2125,7 +2463,8 @@ TEST(GcsFileSystemTest, DeleteRecursively_NotAFolder) {
                    0 /* stat cache max age */, 0 /* stat cache max entries */,
                    0 /* matching paths cache max age */,
                    0 /* matching paths cache max entries */,
-                   0 /* initial retry delay */);
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
 
   int64 undeleted_files, undeleted_dirs;
   EXPECT_EQ(error::Code::NOT_FOUND,
@@ -2136,12 +2475,75 @@ TEST(GcsFileSystemTest, DeleteRecursively_NotAFolder) {
   EXPECT_EQ(1, undeleted_dirs);
 }
 
+TEST(GcsFileSystemTest, AdditionalRequestHeaderTest) {
+  GcsFileSystem fs1;
+  EXPECT_EQ("", fs1.additional_header_name());
+  EXPECT_EQ("", fs1.additional_header_value());
+
+  setenv("GCS_ADDITIONAL_REQUEST_HEADER",
+         "X-Add-Header:My Additional Header Value", 1);
+  GcsFileSystem fs2;
+  EXPECT_EQ("X-Add-Header", fs2.additional_header_name());
+  EXPECT_EQ("My Additional Header Value", fs2.additional_header_value());
+
+  setenv("GCS_ADDITIONAL_REQUEST_HEADER", "Someinvalidheadervalue", 1);
+  GcsFileSystem fs3;
+  EXPECT_EQ("", fs3.additional_header_name());
+  EXPECT_EQ("", fs3.additional_header_value());
+
+  setenv("GCS_ADDITIONAL_REQUEST_HEADER", ":thisisinvalid", 1);
+  GcsFileSystem fs4;
+  EXPECT_EQ("", fs4.additional_header_name());
+  EXPECT_EQ("", fs4.additional_header_value());
+
+  setenv("GCS_ADDITIONAL_REQUEST_HEADER", "soisthis:", 1);
+  GcsFileSystem fs5;
+  EXPECT_EQ("", fs5.additional_header_name());
+  EXPECT_EQ("", fs5.additional_header_value());
+
+  setenv("GCS_ADDITIONAL_REQUEST_HEADER", "a:b", 1);
+  GcsFileSystem fs6;
+  EXPECT_EQ("a", fs6.additional_header_name());
+  EXPECT_EQ("b", fs6.additional_header_value());
+
+  auto* add_header = new std::pair<const string, const string>(
+      "mynewheader", "newheadercontents");
+
+  std::vector<HttpRequest*> requests(
+      {// IsDirectory is checking whether there are children objects.
+       new FakeHttpRequest("Uri: https://www.googleapis.com/fake\n"
+                           "Auth Token: fake_token\n"
+                           "Header mynewheader: newheadercontents\n"
+                           "Header Hello: world\n",
+                           "{}")});
+  GcsFileSystem fs7(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+      0 /* stat cache max age */, 0 /* stat cache max entries */,
+      0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      kTestTimeoutConfig, add_header /* gcs additional header */);
+
+  std::unique_ptr<HttpRequest> request;
+  TF_EXPECT_OK(fs7.CreateHttpRequest(&request));
+  request->SetUri("https://www.googleapis.com/fake");
+  request->AddHeader("Hello", "world");
+  TF_EXPECT_OK(request->Send());
+}
+
 TEST(GcsFileSystemTest, OverrideCacheParameters) {
   // Verify defaults are propagated correctly.
   GcsFileSystem fs1;
   EXPECT_EQ(128 * 1024 * 1024, fs1.block_size());
   EXPECT_EQ(2 * fs1.block_size(), fs1.max_bytes());
   EXPECT_EQ(0, fs1.max_staleness());
+  EXPECT_EQ(120, fs1.timeouts().connect);
+  EXPECT_EQ(60, fs1.timeouts().idle);
+  EXPECT_EQ(3600, fs1.timeouts().metadata);
+  EXPECT_EQ(3600, fs1.timeouts().read);
+  EXPECT_EQ(3600, fs1.timeouts().write);
 
   // Verify legacy readahead buffer override sets block size.
   setenv("GCS_READAHEAD_BUFFER_SIZE_BYTES", "123456789", 1);
@@ -2167,6 +2569,43 @@ TEST(GcsFileSystemTest, OverrideCacheParameters) {
   EXPECT_EQ(32, fs4.stat_cache_max_entries());
   EXPECT_EQ(30, fs4.matching_paths_cache_max_age());
   EXPECT_EQ(64, fs4.matching_paths_cache_max_entries());
+
+  // Verify timeout overrides.
+  setenv("GCS_REQUEST_CONNECTION_TIMEOUT_SECS", "10", 1);
+  setenv("GCS_REQUEST_IDLE_TIMEOUT_SECS", "5", 1);
+  setenv("GCS_METADATA_REQUEST_TIMEOUT_SECS", "20", 1);
+  setenv("GCS_READ_REQUEST_TIMEOUT_SECS", "30", 1);
+  setenv("GCS_WRITE_REQUEST_TIMEOUT_SECS", "40", 1);
+  GcsFileSystem fs5;
+  EXPECT_EQ(10, fs5.timeouts().connect);
+  EXPECT_EQ(5, fs5.timeouts().idle);
+  EXPECT_EQ(20, fs5.timeouts().metadata);
+  EXPECT_EQ(30, fs5.timeouts().read);
+  EXPECT_EQ(40, fs5.timeouts().write);
+}
+
+TEST(GcsFileSystemTest, CreateHttpRequest) {
+  std::vector<HttpRequest*> requests(
+      {// IsDirectory is checking whether there are children objects.
+       new FakeHttpRequest("Uri: https://www.googleapis.com/fake\n"
+                           "Auth Token: fake_token\n"
+                           "Header Hello: world\n",
+                           "{}")});
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */,
+                   0 /* initial retry delay */, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
+
+  std::unique_ptr<HttpRequest> request;
+  TF_EXPECT_OK(fs.CreateHttpRequest(&request));
+  request->SetUri("https://www.googleapis.com/fake");
+  request->AddHeader("Hello", "world");
+  TF_EXPECT_OK(request->Send());
 }
 
 }  // namespace
diff --git a/tensorflow/core/platform/cloud/gcs_throttle.cc b/tensorflow/core/platform/cloud/gcs_throttle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eb5f8958a37f45aeac1a836ca037f91931bb34a6
--- /dev/null
+++ b/tensorflow/core/platform/cloud/gcs_throttle.cc
@@ -0,0 +1,62 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/cloud/gcs_throttle.h"
+
+#include <algorithm>
+
+namespace tensorflow {
+
+GcsThrottle::GcsThrottle(EnvTime* env_time)
+    : last_updated_secs_(env_time->NowSeconds()),
+      available_tokens_(0),
+      env_time_(env_time) {}
+
+bool GcsThrottle::AdmitRequest() {
+  mutex_lock l(mu_);
+  if (!config_.enabled) return true;
+  UpdateState();
+  if (available_tokens_ < config_.tokens_per_request) {
+    return false;
+  }
+  available_tokens_ -= config_.tokens_per_request;
+  return true;
+}
+
+void GcsThrottle::RecordResponse(size_t num_bytes) {
+  mutex_lock l(mu_);
+  if (!config_.enabled) return;
+  UpdateState();
+  available_tokens_ -= request_bytes_to_tokens(num_bytes);
+}
+
+void GcsThrottle::SetConfig(GcsThrottleConfig config) {
+  mutex_lock l(mu_);
+  config_ = config;
+  available_tokens_ = config.initial_tokens;
+  last_updated_secs_ = env_time_->NowSeconds();
+}
+
+void GcsThrottle::UpdateState() {
+  // TODO(b/72643279): Switch to a monotonic clock.
+  int64 now = env_time_->NowSeconds();
+  uint64 delta_secs =
+      std::max(0LL, now - static_cast<int64>(last_updated_secs_));
+  available_tokens_ += delta_secs * config_.token_rate;
+  available_tokens_ = std::min(available_tokens_, config_.bucket_size);
+  last_updated_secs_ = now;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/gcs_throttle.h b/tensorflow/core/platform/cloud/gcs_throttle.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a89daef084e921f1ad8bd856cefcc62d0d7aa1c
--- /dev/null
+++ b/tensorflow/core/platform/cloud/gcs_throttle.h
@@ -0,0 +1,156 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_THROTTLE_H_
+#define TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_THROTTLE_H_
+
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+
+/**
+ * GcsThrottleConfig is used to configure the GcsThrottle.
+ */
+struct GcsThrottleConfig {
+  /**
+   * enabled is true if GcsThrottle should throttle requests, false otherwise.
+   */
+  bool enabled = false;
+
+  /**
+   * token_rate is the number of tokens accrued every second that can be used
+   * for making requests to the GCS service.
+   */
+  int64 token_rate = 100000;  // Approximately 800 MBits/second bandwidth-only.
+
+  /**
+   * bucket_size is the maximum number of available tokens the GcsThrottle can
+   * accrue.
+   */
+  int64 bucket_size = 10000000;  // 10 million tokens total
+
+  /**
+   * tokens_per_request determines the number of tokens consumed for every
+   * request.
+   *
+   * Note: tokens are also consumed in proportion to the response size.
+   */
+  int64 tokens_per_request = 100;
+
+  /**
+   * initial_tokens determines how many tokens should be available immediately
+   * after the GcsThrottle is constructed.
+   */
+  int64 initial_tokens = 0;
+};
+
+/**
+ * GcsThrottle is used to ensure fair use of the available GCS capacity.
+ *
+ * GcsThrottle operates around a concept of tokens. Tokens are consumed when
+ * making requests to the GCS service. Tokens are consumed both based on the
+ * number of requests made, as well as the bandwidth consumed (response sizes).
+ *
+ * GcsThrottle is thread safe and can be used from multiple threads.
+ */
+class GcsThrottle {
+ public:
+  /**
+   * Constructs a GcsThrottle.
+   */
+  explicit GcsThrottle(EnvTime* env_time = EnvTime::Default());
+
+  /**
+   * AdmitRequest updates the GcsThrottle to record a request will be made.
+   *
+   * AdmitRequest should be called before any request is made. AdmitRequest
+   * returns false if the request should be denied. If AdmitRequest
+   * returns false, no tokens are consumed. If true is returned, the configured
+   * number of tokens are consumed.
+   */
+  bool AdmitRequest();
+
+  /**
+   * RecordResponse updates the GcsThrottle to record a request has been made.
+   *
+   * RecordResponse should be called after the response has been received.
+   * RecordResponse will update the internal state based on the number of bytes
+   * in the response.
+   *
+   * Note: we split up the request and the response in this fashion in order to
+   * avoid penalizing consumers who are using large readahead buffers at higher
+   * layers of the I/O stack.
+   */
+  void RecordResponse(size_t num_bytes);
+
+  /**
+   * SetConfig sets the configuration for GcsThrottle and re-initializes state.
+   *
+   * After calling this, the token pool will be config.initial_tokens.
+   */
+  void SetConfig(GcsThrottleConfig config);
+
+  /**
+   * available_tokens gives a snapshot of how many tokens are available.
+   *
+   * The returned value should not be used to make admission decisions. The
+   * purpose of this function is to make available to monitoring or other
+   * instrumentation the number of available tokens in the pool.
+   */
+  inline int64 available_tokens() {
+    mutex_lock l(mu_);
+    if (!config_.enabled) return 0;
+    UpdateState();
+    return available_tokens_;
+  }
+
+ private:
+  /**
+   * UpdateState updates the available_tokens_ and last_updated_secs_ variables.
+   *
+   * UpdateState should be called in order to mark the passage of time, and
+   * therefore add tokens to the availble_tokens_ pool.
+   */
+  void UpdateState() EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  inline uint64 request_bytes_to_tokens(size_t num_bytes) {
+    return num_bytes >> 10;
+  }
+
+  mutex mu_;
+
+  /**
+   * last_updated_secs_ records the number of seconds since the Unix epoch that
+   * the internal state of the GcsThrottle was updated. This is important when
+   * determining the number of tokens to add to the available_tokens_ pool.
+   */
+  uint64 last_updated_secs_ GUARDED_BY(mu_) = 0;
+
+  /**
+   * available_tokens_ records how many tokens are available to be consumed.
+   *
+   * Note: it is possible for available_tokens_ to become negative. If a
+   * response comes back that consumes more than the available tokens, the count
+   * will go negative, and block future requests until we have available tokens.
+   */
+  int64 available_tokens_ GUARDED_BY(mu_) = 0;
+
+  EnvTime* const env_time_;
+  GcsThrottleConfig config_ GUARDED_BY(mu_);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_CLOUD_GCS_THROTTLE_H_
diff --git a/tensorflow/core/platform/cloud/gcs_throttle_test.cc b/tensorflow/core/platform/cloud/gcs_throttle_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..694756022e37263a07f8215bf7496c9ca130fd58
--- /dev/null
+++ b/tensorflow/core/platform/cloud/gcs_throttle_test.cc
@@ -0,0 +1,101 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/cloud/gcs_throttle.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+namespace {
+
+class TestTime : public EnvTime {
+ public:
+  uint64 NowMicros() override { return now_; }
+
+  void SetTime(uint64 now_micros) { now_ = now_micros; }
+
+  void AdvanceSeconds(int64 secs) { now_ += secs * 1000000L; }
+
+ private:
+  uint64 now_ = 1234567890000000ULL;
+};
+
+class GcsThrottleTest : public ::testing::Test {
+ protected:
+  GcsThrottleTest() : throttle_(&time_) {
+    config_.enabled = true;
+    throttle_.SetConfig(config_);
+  }
+
+  GcsThrottleConfig config_;
+  TestTime time_;
+  GcsThrottle throttle_;
+};
+
+TEST_F(GcsThrottleTest, ReplenishTokens) {
+  EXPECT_EQ(0, throttle_.available_tokens());
+  time_.AdvanceSeconds(1);
+  EXPECT_EQ(100000, throttle_.available_tokens());
+  time_.AdvanceSeconds(2);
+  EXPECT_EQ(300000, throttle_.available_tokens());
+}
+
+TEST_F(GcsThrottleTest, RejectRequest) {
+  EXPECT_EQ(0, throttle_.available_tokens());
+  time_.AdvanceSeconds(1);
+  EXPECT_TRUE(throttle_.AdmitRequest());
+  EXPECT_EQ(99900, throttle_.available_tokens());
+  for (int i = 1; i < 1000; i++) {
+    EXPECT_TRUE(throttle_.AdmitRequest());
+  }
+  EXPECT_FALSE(throttle_.AdmitRequest());
+}
+
+TEST_F(GcsThrottleTest, MarkResponses) {
+  time_.AdvanceSeconds(1);
+  EXPECT_TRUE(throttle_.AdmitRequest());
+  throttle_.RecordResponse(128000000);  // 128 MB response
+  EXPECT_EQ(-25100, throttle_.available_tokens());
+  EXPECT_FALSE(throttle_.AdmitRequest());
+  time_.AdvanceSeconds(1);
+  EXPECT_TRUE(throttle_.AdmitRequest())
+      << "Available tokens: " << throttle_.available_tokens();
+}
+
+TEST_F(GcsThrottleTest, Skippingtime_) {
+  EXPECT_EQ(0, throttle_.available_tokens());
+  time_.AdvanceSeconds(90);
+  EXPECT_EQ(9000000, throttle_.available_tokens());
+}
+
+TEST_F(GcsThrottleTest, BucketLimit) {
+  time_.AdvanceSeconds(120);
+  EXPECT_EQ(10000000, throttle_.available_tokens());
+}
+
+TEST_F(GcsThrottleTest, ReverseTime) {
+  time_.AdvanceSeconds(1);
+  EXPECT_EQ(100000, throttle_.available_tokens());
+  time_.AdvanceSeconds(-3600);
+  EXPECT_EQ(100000, throttle_.available_tokens());
+  time_.AdvanceSeconds(1);
+  EXPECT_EQ(200000, throttle_.available_tokens());
+}
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/google_auth_provider.cc b/tensorflow/core/platform/cloud/google_auth_provider.cc
index f6fd8373cd593da3afdb159640b9cd29fcb795b5..7e39b63e3e8e19b3ed9e05e5c49422b42774567c 100644
--- a/tensorflow/core/platform/cloud/google_auth_provider.cc
+++ b/tensorflow/core/platform/cloud/google_auth_provider.cc
@@ -14,9 +14,12 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/platform/cloud/google_auth_provider.h"
+#ifndef _WIN32
 #include <pwd.h>
-#include <sys/types.h>
 #include <unistd.h>
+#else
+#include <sys/types.h>
+#endif
 #include <fstream>
 #include "include/json/json.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -208,10 +211,9 @@ Status GoogleAuthProvider::GetTokenFromGce() {
     std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
     std::vector<char> response_buffer;
     const uint64 request_timestamp_sec = env_->NowSeconds();
-    TF_RETURN_IF_ERROR(request->Init());
-    TF_RETURN_IF_ERROR(request->SetUri(kGceTokenUrl));
-    TF_RETURN_IF_ERROR(request->AddHeader("Metadata-Flavor", "Google"));
-    TF_RETURN_IF_ERROR(request->SetResultBuffer(&response_buffer));
+    request->SetUri(kGceTokenUrl);
+    request->AddHeader("Metadata-Flavor", "Google");
+    request->SetResultBuffer(&response_buffer);
     TF_RETURN_IF_ERROR(request->Send());
     StringPiece response =
         StringPiece(&response_buffer[0], response_buffer.size());
diff --git a/tensorflow/core/platform/cloud/http_request.h b/tensorflow/core/platform/cloud/http_request.h
index 02d9e9054ad3b22f3cd15cf7b24d917184db264b..df8a5b86a0b9b3354514be69cb03dd6472e51e86 100644
--- a/tensorflow/core/platform/cloud/http_request.h
+++ b/tensorflow/core/platform/cloud/http_request.h
@@ -50,33 +50,31 @@ class HttpRequest {
   HttpRequest() {}
   virtual ~HttpRequest() {}
 
-  virtual Status Init() = 0;
-
   /// Sets the request URI.
-  virtual Status SetUri(const string& uri) = 0;
+  virtual void SetUri(const string& uri) = 0;
 
   /// \brief Sets the Range header.
   ///
   /// Used for random seeks, for example "0-999" returns the first 1000 bytes
   /// (note that the right border is included).
-  virtual Status SetRange(uint64 start, uint64 end) = 0;
+  virtual void SetRange(uint64 start, uint64 end) = 0;
 
   /// Sets a request header.
-  virtual Status AddHeader(const string& name, const string& value) = 0;
+  virtual void AddHeader(const string& name, const string& value) = 0;
 
   /// Sets a DNS resolve mapping (to skip DNS resolution).
   ///
   /// Note: because GCS is available over HTTPS, we cannot replace the hostname
   /// in the URI with an IP address, as that will cause the certificate check
   /// to fail.
-  virtual Status AddResolveOverride(const string& hostname, int64 port,
-                                    const string& ip_addr) = 0;
+  virtual void AddResolveOverride(const string& hostname, int64 port,
+                                  const string& ip_addr) = 0;
 
   /// Sets the 'Authorization' header to the value of 'Bearer ' + auth_token.
-  virtual Status AddAuthBearerHeader(const string& auth_token) = 0;
+  virtual void AddAuthBearerHeader(const string& auth_token) = 0;
 
   /// Makes the request a DELETE request.
-  virtual Status SetDeleteRequest() = 0;
+  virtual void SetDeleteRequest() = 0;
 
   /// \brief Makes the request a PUT request.
   ///
@@ -85,21 +83,35 @@ class HttpRequest {
   virtual Status SetPutFromFile(const string& body_filepath, size_t offset) = 0;
 
   /// Makes the request a PUT request with an empty body.
-  virtual Status SetPutEmptyBody() = 0;
+  virtual void SetPutEmptyBody() = 0;
 
   /// \brief Makes the request a POST request.
   ///
   /// The request body will be taken from the specified buffer.
-  virtual Status SetPostFromBuffer(const char* buffer, size_t size) = 0;
+  virtual void SetPostFromBuffer(const char* buffer, size_t size) = 0;
 
   /// Makes the request a POST request with an empty body.
-  virtual Status SetPostEmptyBody() = 0;
+  virtual void SetPostEmptyBody() = 0;
 
   /// \brief Specifies the buffer for receiving the response body.
   ///
   /// Size of out_buffer after an access will be exactly the number of bytes
   /// read. Existing content of the vector will be cleared.
-  virtual Status SetResultBuffer(std::vector<char>* out_buffer) = 0;
+  virtual void SetResultBuffer(std::vector<char>* out_buffer) = 0;
+
+  /// \brief Specifies the buffer for receiving the response body.
+  ///
+  /// This method should be used when a caller knows the upper bound of the
+  /// size of the response data.  The caller provides a pre-allocated buffer
+  /// and its size. After the Send() method is called, the
+  /// GetResultBufferDirectBytesTransferred() method may be used to learn to the
+  /// number of bytes that were transferred using this method.
+  virtual void SetResultBufferDirect(char* buffer, size_t size) = 0;
+
+  /// \brief Returns the number of bytes transferred, when using
+  /// SetResultBufferDirect(). This method may only be used when using
+  /// SetResultBufferDirect().
+  virtual size_t GetResultBufferDirectBytesTransferred() = 0;
 
   /// \brief Returns the response headers of a completed request.
   ///
@@ -118,6 +130,16 @@ class HttpRequest {
   // Url encodes str and returns a new string.
   virtual string EscapeString(const string& str) = 0;
 
+  /// \brief Set timeouts for this request.
+  ///
+  /// The connection parameter controls how long we should wait for the
+  /// connection to be established. The inactivity parameter controls how long
+  /// we should wait between additional responses from the server. Finally the
+  /// total parameter controls the maximum total connection time to prevent
+  /// hanging indefinitely.
+  virtual void SetTimeouts(uint32 connection, uint32 inactivity,
+                           uint32 total) = 0;
+
   TF_DISALLOW_COPY_AND_ASSIGN(HttpRequest);
 };
 
diff --git a/tensorflow/core/platform/cloud/http_request_fake.h b/tensorflow/core/platform/cloud/http_request_fake.h
index bfe04f6363b6cde227f73333f2351b550be1dde1..7711eaceb290fb21c54c9656c473d912ebbd84cf 100644
--- a/tensorflow/core/platform/cloud/http_request_fake.h
+++ b/tensorflow/core/platform/cloud/http_request_fake.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_PLATFORM_HTTP_REQUEST_FAKE_H_
 #define TENSORFLOW_CORE_PLATFORM_HTTP_REQUEST_FAKE_H_
 
+#include <algorithm>
 #include <fstream>
 #include <string>
 #include <vector>
@@ -74,27 +75,19 @@ class FakeHttpRequest : public CurlHttpRequest {
         response_headers_(response_headers),
         response_code_(response_code) {}
 
-  Status Init() override { return Status::OK(); }
-  Status SetUri(const string& uri) override {
-    actual_request_ += "Uri: " + uri + "\n";
-    return Status::OK();
+  void SetUri(const string& uri) override {
+    actual_uri_ += "Uri: " + uri + "\n";
   }
-  Status SetRange(uint64 start, uint64 end) override {
+  void SetRange(uint64 start, uint64 end) override {
     actual_request_ += strings::StrCat("Range: ", start, "-", end, "\n");
-    return Status::OK();
   }
-  Status AddHeader(const string& name, const string& value) override {
+  void AddHeader(const string& name, const string& value) override {
     actual_request_ += "Header " + name + ": " + value + "\n";
-    return Status::OK();
   }
-  Status AddAuthBearerHeader(const string& auth_token) override {
+  void AddAuthBearerHeader(const string& auth_token) override {
     actual_request_ += "Auth Token: " + auth_token + "\n";
-    return Status::OK();
-  }
-  Status SetDeleteRequest() override {
-    actual_request_ += "Delete: yes\n";
-    return Status::OK();
   }
+  void SetDeleteRequest() override { actual_request_ += "Delete: yes\n"; }
   Status SetPutFromFile(const string& body_filepath, size_t offset) override {
     std::ifstream stream(body_filepath);
     const string& content = string(std::istreambuf_iterator<char>(stream),
@@ -103,37 +96,44 @@ class FakeHttpRequest : public CurlHttpRequest {
     actual_request_ += "Put body: " + content + "\n";
     return Status::OK();
   }
-  Status SetPostFromBuffer(const char* buffer, size_t size) override {
+  void SetPostFromBuffer(const char* buffer, size_t size) override {
     if (captured_post_body_) {
       *captured_post_body_ = string(buffer, size);
     } else {
       actual_request_ +=
           strings::StrCat("Post body: ", StringPiece(buffer, size), "\n");
     }
-    return Status::OK();
-  }
-  Status SetPutEmptyBody() override {
-    actual_request_ += "Put: yes\n";
-    return Status::OK();
   }
-  Status SetPostEmptyBody() override {
+  void SetPutEmptyBody() override { actual_request_ += "Put: yes\n"; }
+  void SetPostEmptyBody() override {
     if (captured_post_body_) {
       *captured_post_body_ = "<empty>";
     } else {
       actual_request_ += "Post: yes\n";
     }
-    return Status::OK();
   }
-  Status SetResultBuffer(std::vector<char>* buffer) override {
+  void SetResultBuffer(std::vector<char>* buffer) override {
     buffer->clear();
     buffer_ = buffer;
-    return Status::OK();
+  }
+  void SetResultBufferDirect(char* buffer, size_t size) override {
+    direct_result_buffer_ = buffer;
+    direct_result_buffer_size_ = size;
+  }
+  size_t GetResultBufferDirectBytesTransferred() override {
+    return direct_result_bytes_transferred_;
   }
   Status Send() override {
-    EXPECT_EQ(expected_request_, actual_request_) << "Unexpected HTTP request.";
+    EXPECT_EQ(expected_request_, actual_request())
+        << "Unexpected HTTP request.";
     if (buffer_) {
-      buffer_->insert(buffer_->begin(), response_.c_str(),
-                      response_.c_str() + response_.size());
+      buffer_->insert(buffer_->begin(), response_.data(),
+                      response_.data() + response_.size());
+    } else if (direct_result_buffer_ != nullptr) {
+      size_t bytes_to_copy =
+          std::min<size_t>(direct_result_buffer_size_, response_.size());
+      memcpy(direct_result_buffer_, response_.data(), bytes_to_copy);
+      direct_result_bytes_transferred_ += bytes_to_copy;
     }
     return response_status_;
   }
@@ -160,9 +160,26 @@ class FakeHttpRequest : public CurlHttpRequest {
 
   virtual uint64 GetResponseCode() const override { return response_code_; }
 
+  void SetTimeouts(uint32 connection, uint32 inactivity,
+                   uint32 total) override {
+    actual_request_ += strings::StrCat("Timeouts: ", connection, " ",
+                                       inactivity, " ", total, "\n");
+  }
+
  private:
+  string actual_request() const {
+    string s;
+    s.append(actual_uri_);
+    s.append(actual_request_);
+    return s;
+  }
+
   std::vector<char>* buffer_ = nullptr;
+  char* direct_result_buffer_ = nullptr;
+  size_t direct_result_buffer_size_ = 0;
+  size_t direct_result_bytes_transferred_ = 0;
   string expected_request_;
+  string actual_uri_;
   string actual_request_;
   string response_;
   Status response_status_;
diff --git a/tensorflow/core/platform/cloud/oauth_client.cc b/tensorflow/core/platform/cloud/oauth_client.cc
index c700b97dc95f85400f9a8c214ea1ccc2b1a3e436..06849f9093099b23c8e60350fe5cd9d8282a2836 100644
--- a/tensorflow/core/platform/cloud/oauth_client.cc
+++ b/tensorflow/core/platform/cloud/oauth_client.cc
@@ -14,9 +14,13 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/platform/cloud/oauth_client.h"
+#ifndef _WIN32
 #include <pwd.h>
 #include <sys/types.h>
 #include <unistd.h>
+#else
+#include <sys/types.h>
+#endif
 #include <fstream>
 #include <openssl/bio.h>
 #include <openssl/evp.h>
@@ -212,11 +216,9 @@ Status OAuthClient::GetTokenFromServiceAccountJson(
   // Send the request to the Google OAuth 2.0 server to get the token.
   std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
   std::vector<char> response_buffer;
-  TF_RETURN_IF_ERROR(request->Init());
-  TF_RETURN_IF_ERROR(request->SetUri(oauth_server_uri.ToString()));
-  TF_RETURN_IF_ERROR(
-      request->SetPostFromBuffer(request_body.c_str(), request_body.size()));
-  TF_RETURN_IF_ERROR(request->SetResultBuffer(&response_buffer));
+  request->SetUri(oauth_server_uri.ToString());
+  request->SetPostFromBuffer(request_body.c_str(), request_body.size());
+  request->SetResultBuffer(&response_buffer);
   TF_RETURN_IF_ERROR(request->Send());
 
   StringPiece response =
@@ -246,11 +248,9 @@ Status OAuthClient::GetTokenFromRefreshTokenJson(
 
   std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
   std::vector<char> response_buffer;
-  TF_RETURN_IF_ERROR(request->Init());
-  TF_RETURN_IF_ERROR(request->SetUri(oauth_server_uri.ToString()));
-  TF_RETURN_IF_ERROR(
-      request->SetPostFromBuffer(request_body.c_str(), request_body.size()));
-  TF_RETURN_IF_ERROR(request->SetResultBuffer(&response_buffer));
+  request->SetUri(oauth_server_uri.ToString());
+  request->SetPostFromBuffer(request_body.c_str(), request_body.size());
+  request->SetResultBuffer(&response_buffer);
   TF_RETURN_IF_ERROR(request->Send());
 
   StringPiece response =
diff --git a/tensorflow/core/platform/cloud/oauth_client.h b/tensorflow/core/platform/cloud/oauth_client.h
index 1614c7b315f67f5976a2d18a6d281afe7459f4f1..519d69acf982c7d004d2c15cd47cf8743669f8fe 100644
--- a/tensorflow/core/platform/cloud/oauth_client.h
+++ b/tensorflow/core/platform/cloud/oauth_client.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CLOUD_OAUTH_CLIENT_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CLOUD_OAUTH_CLIENT_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_OAUTH_CLIENT_H_
+#define TENSORFLOW_CORE_PLATFORM_CLOUD_OAUTH_CLIENT_H_
 
 #include <memory>
 #include "include/json/json.h"
@@ -59,4 +59,4 @@ class OAuthClient {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CLOUD_OAUTH_CLIENT_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_CLOUD_OAUTH_CLIENT_H_
diff --git a/tensorflow/core/platform/cloud/oauth_client_test.cc b/tensorflow/core/platform/cloud/oauth_client_test.cc
index 236259dbc16ffc806779bd100e1ec6ace2b7bb39..ad569758cc6ec11555a81a3bc7fbefbc580d6529 100644
--- a/tensorflow/core/platform/cloud/oauth_client_test.cc
+++ b/tensorflow/core/platform/cloud/oauth_client_test.cc
@@ -160,12 +160,12 @@ TEST(OAuthClientTest, GetTokenFromServiceAccountJson) {
   ASSERT_EQ(1, EVP_DigestVerifyInit(md_ctx, nullptr, md, nullptr, key));
   ASSERT_EQ(1, EVP_DigestVerifyUpdate(md_ctx, header_dot_claim.c_str(),
                                       header_dot_claim.size()));
-  ASSERT_EQ(
-      1,
-      EVP_DigestVerifyFinal(
-          md_ctx, const_cast<unsigned char*>(
-                      reinterpret_cast<const unsigned char*>(signature.data())),
-          signature.size()));
+  ASSERT_EQ(1,
+            EVP_DigestVerifyFinal(
+                md_ctx,
+                const_cast<unsigned char*>(
+                    reinterpret_cast<const unsigned char*>(signature.data())),
+                signature.size()));
   EVP_MD_CTX_cleanup(md_ctx);
 
   // Free all the crypto-related resources.
diff --git a/tensorflow/core/platform/cloud/retrying_file_system.cc b/tensorflow/core/platform/cloud/retrying_file_system.cc
index c3b6831361305f69e8a9882dbff90ce139ca13c0..be9ebe67b18e7be76e95149258cb1fcce6047d85 100644
--- a/tensorflow/core/platform/cloud/retrying_file_system.cc
+++ b/tensorflow/core/platform/cloud/retrying_file_system.cc
@@ -25,7 +25,6 @@ namespace tensorflow {
 
 namespace {
 
-
 class RetryingRandomAccessFile : public RandomAccessFile {
  public:
   RetryingRandomAccessFile(std::unique_ptr<RandomAccessFile> base_file,
@@ -203,4 +202,6 @@ Status RetryingFileSystem::DeleteRecursively(const string& dirname,
       initial_delay_microseconds_);
 }
 
+void RetryingFileSystem::FlushCaches() { base_file_system_->FlushCaches(); }
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/retrying_file_system.h b/tensorflow/core/platform/cloud/retrying_file_system.h
index d9d8ea6b004c3cf1d0d77ff65fa415e746310afd..a262a5fd940f9b269721790c80caaef38d79d690 100644
--- a/tensorflow/core/platform/cloud/retrying_file_system.h
+++ b/tensorflow/core/platform/cloud/retrying_file_system.h
@@ -69,6 +69,8 @@ class RetryingFileSystem : public FileSystem {
   Status DeleteRecursively(const string& dirname, int64* undeleted_files,
                            int64* undeleted_dirs) override;
 
+  void FlushCaches() override;
+
  private:
   std::unique_ptr<FileSystem> base_file_system_;
   const int64 initial_delay_microseconds_;
diff --git a/tensorflow/core/platform/cloud/retrying_file_system_test.cc b/tensorflow/core/platform/cloud/retrying_file_system_test.cc
index 232dcb3e71aa7c5b05b45e37332fe58970fc3fe8..d3f763bb3c845436e8458135a0a754d8cb002957 100644
--- a/tensorflow/core/platform/cloud/retrying_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/retrying_file_system_test.cc
@@ -84,7 +84,8 @@ class MockWritableFile : public WritableFile {
 
 class MockFileSystem : public FileSystem {
  public:
-  explicit MockFileSystem(const ExpectedCalls& calls) : calls_(calls) {}
+  explicit MockFileSystem(const ExpectedCalls& calls, bool* flushed = nullptr)
+      : calls_(calls), flushed_(flushed) {}
 
   Status NewRandomAccessFile(
       const string& fname, std::unique_ptr<RandomAccessFile>* result) override {
@@ -156,11 +157,18 @@ class MockFileSystem : public FileSystem {
     return calls_.ConsumeNextCall("DeleteRecursively");
   }
 
+  void FlushCaches() override {
+    if (flushed_) {
+      *flushed_ = true;
+    }
+  }
+
   std::unique_ptr<WritableFile> writable_file_to_return;
   std::unique_ptr<RandomAccessFile> random_access_file_to_return;
 
  private:
   MockCallSequence calls_;
+  bool* flushed_ = nullptr;
 };
 
 TEST(RetryingFileSystemTest, NewRandomAccessFile_ImmediateSuccess) {
@@ -702,5 +710,14 @@ TEST(RetryingFileSystemTest, DeleteRecursively_AllRetriesFailed) {
       << status;
 }
 
+TEST(RetryingFileSystemTest, FlushCaches) {
+  ExpectedCalls none;
+  bool flushed = false;
+  std::unique_ptr<MockFileSystem> base_fs(new MockFileSystem(none, &flushed));
+  RetryingFileSystem fs(std::move(base_fs), 0);
+  fs.FlushCaches();
+  EXPECT_TRUE(flushed);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/retrying_utils.h b/tensorflow/core/platform/cloud/retrying_utils.h
index 99ab216e97fc9fcdf02e0776dd252b808a43df7a..546b8d1c4a4842f44f6c490eb05cc3cac29aa023 100644
--- a/tensorflow/core/platform/cloud/retrying_utils.h
+++ b/tensorflow/core/platform/cloud/retrying_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CLOUD_RETRYING_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CLOUD_RETRYING_UTILS_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_RETRYING_UTILS_H_
+#define TENSORFLOW_CORE_PLATFORM_CLOUD_RETRYING_UTILS_H_
 
 #include <functional>
 #include "tensorflow/core/lib/core/status.h"
@@ -47,4 +47,4 @@ class RetryingUtils {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CLOUD_RETRYING_UTILS_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_CLOUD_RETRYING_UTILS_H_
diff --git a/tensorflow/core/platform/cloud/time_util.cc b/tensorflow/core/platform/cloud/time_util.cc
index 2f8643f3c7f39c53566d481c078d8f71b44bbedd..0587a65c299778b95ccdec86e03c9f5dca8ec878 100644
--- a/tensorflow/core/platform/cloud/time_util.cc
+++ b/tensorflow/core/platform/cloud/time_util.cc
@@ -18,6 +18,9 @@ limitations under the License.
 #include <cmath>
 #include <cstdio>
 #include <ctime>
+#ifdef _WIN32
+#define timegm _mkgmtime
+#endif
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/platform/cloud/time_util.h b/tensorflow/core/platform/cloud/time_util.h
index b1bb7f111970b51dcd2dcba47a3c20f8388bca42..d6d4bc499fe2430e8f5c97ca23c9db7345de11b4 100644
--- a/tensorflow/core/platform/cloud/time_util.h
+++ b/tensorflow/core/platform/cloud/time_util.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CLOUD_TIME_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CLOUD_TIME_UTIL_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_CLOUD_TIME_UTIL_H_
+#define TENSORFLOW_CORE_PLATFORM_CLOUD_TIME_UTIL_H_
 
 #include "tensorflow/core/lib/core/status.h"
 
@@ -26,4 +26,4 @@ Status ParseRfc3339Time(const string& time, int64* mtime_nsec);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CLOUD_TIME_UTIL_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_CLOUD_TIME_UTIL_H_
diff --git a/tensorflow/core/platform/cpu_feature_guard.cc b/tensorflow/core/platform/cpu_feature_guard.cc
index b0d7b3a67ae9f92d8e321978a3b899c243c22d1d..b5706581580ea00865b45cf50a4d92d22c647e53 100644
--- a/tensorflow/core/platform/cpu_feature_guard.cc
+++ b/tensorflow/core/platform/cpu_feature_guard.cc
@@ -97,14 +97,17 @@ std::once_flag g_cpu_feature_guard_warn_once_flag;
 void InfoAboutUnusedCPUFeatures() {
   std::call_once(g_cpu_feature_guard_warn_once_flag, [] {
     string missing_instructions;
-#ifdef PLATFORM_WINDOWS
+#if defined(_MSC_VER) && !defined(__clang__)
+
 #ifndef __AVX__
     CheckIfFeatureUnused(CPUFeature::AVX, "AVX", missing_instructions);
 #endif  // __AVX__
 #ifndef __AVX2__
     CheckIfFeatureUnused(CPUFeature::AVX2, "AVX2", missing_instructions);
 #endif  // __AVX2__
-#else   // ifdef platform windows
+
+#else  // if defined(_MSC_VER) && !defined(__clang__)
+
 #ifndef __SSE__
     CheckIfFeatureUnused(CPUFeature::SSE, "SSE", missing_instructions);
 #endif  // __SSE__
@@ -132,7 +135,7 @@ void InfoAboutUnusedCPUFeatures() {
 #ifndef __FMA__
     CheckIfFeatureUnused(CPUFeature::FMA, "FMA", missing_instructions);
 #endif  // __FMA__
-#endif  // else of ifdef platform windows
+#endif  // else of if defined(_MSC_VER) && !defined(__clang__)
     if (!missing_instructions.empty()) {
       LOG(INFO) << "Your CPU supports instructions that this TensorFlow "
                 << "binary was not compiled to use:" << missing_instructions;
diff --git a/tensorflow/core/platform/cuda_libdevice_path.h b/tensorflow/core/platform/cuda_libdevice_path.h
index 601d0db6d47c7f09300970a454b618653f8f9596..6ef565ecd3c6460791b49a25fd4277e9393cfdd0 100644
--- a/tensorflow/core/platform/cuda_libdevice_path.h
+++ b/tensorflow/core/platform/cuda_libdevice_path.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CUDA_LIBDEVICE_PATH_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CUDA_LIBDEVICE_PATH_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_CUDA_LIBDEVICE_PATH_H_
+#define TENSORFLOW_CORE_PLATFORM_CUDA_LIBDEVICE_PATH_H_
 
 #include "tensorflow/core/platform/types.h"
 
@@ -29,4 +29,4 @@ string LibdeviceRoot();
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CUDA_LIBDEVICE_PATH_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_CUDA_LIBDEVICE_PATH_H_
diff --git a/tensorflow/core/platform/cuda_libdevice_path_test.cc b/tensorflow/core/platform/cuda_libdevice_path_test.cc
index 639f6804ea236b86f458263091f371c1374e50ae..2d34239a9958d722a1cb84213657ca8229ebaf2c 100644
--- a/tensorflow/core/platform/cuda_libdevice_path_test.cc
+++ b/tensorflow/core/platform/cuda_libdevice_path_test.cc
@@ -27,8 +27,7 @@ TEST(CudaLibdevicePathTest, LibdevicePath) {
   VLOG(2) << "Libdevice root = " << LibdeviceRoot();
   std::vector<string> libdevice_files;
   TF_EXPECT_OK(Env::Default()->GetMatchingPaths(
-      io::JoinPath(LibdeviceRoot(), "libdevice.*.bc"),
-      &libdevice_files));
+      io::JoinPath(LibdeviceRoot(), "libdevice.*.bc"), &libdevice_files));
   EXPECT_LT(0, libdevice_files.size());
 }
 #endif
diff --git a/tensorflow/core/platform/cupti_wrapper.h b/tensorflow/core/platform/cupti_wrapper.h
index c909dcd35bae4eef8cf165aa00349b079245db85..9a17ab60c0d2ebcd4401707a23a76f381aeb5994 100644
--- a/tensorflow/core/platform/cupti_wrapper.h
+++ b/tensorflow/core/platform/cupti_wrapper.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CUPTI_WRAPPER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CUPTI_WRAPPER_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_CUPTI_WRAPPER_H_
+#define TENSORFLOW_CORE_PLATFORM_CUPTI_WRAPPER_H_
 
 #include "tensorflow/core/platform/platform.h"
 
@@ -24,4 +24,4 @@ limitations under the License.
 #include "tensorflow/core/platform/default/gpu/cupti_wrapper.h"
 #endif
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_CUPTI_WRAPPER_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_CUPTI_WRAPPER_H_
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 0f8cf8f122355651b8793366677de6b7fc9584aa..2102c5cca383b553c56fb3704596e3d1335c55c2 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -3,6 +3,7 @@
 load("@protobuf_archive//:protobuf.bzl", "proto_gen")
 load("@protobuf_archive//:protobuf.bzl", "py_proto_library")
 load("//tensorflow:tensorflow.bzl", "if_not_mobile")
+load("//tensorflow:tensorflow.bzl", "if_windows")
 load("//tensorflow:tensorflow.bzl", "if_not_windows")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
@@ -66,16 +67,14 @@ def pyx_library(
       pxd_srcs.append(src)
 
   # Invoke cython to produce the shared object libraries.
-  cpp_outs = [src.split(".")[0] + ".cpp" for src in pyx_srcs]
-  native.genrule(
-      name = name + "_cython_translation",
-      srcs = pyx_srcs,
-      outs = cpp_outs,
-      cmd = ("PYTHONHASHSEED=0 $(location @cython//:cython_binary) --cplus $(SRCS)"
-             # Rename outputs to expected location.
-             + """ && python -c 'import shutil, sys; n = len(sys.argv); [shutil.copyfile(src.split(".")[0] + ".cpp", dst) for src, dst in zip(sys.argv[1:], sys.argv[1+n//2:])]' $(SRCS) $(OUTS)"""),
-      tools = ["@cython//:cython_binary"] + pxd_srcs,
-  )
+  for filename in pyx_srcs:
+    native.genrule(
+        name = filename + "_cython_translation",
+        srcs = [filename],
+        outs = [filename.split(".")[0] + ".cpp"],
+        cmd = "PYTHONHASHSEED=0 $(location @cython//:cython_binary) --cplus $(SRCS) --output-file $(OUTS)",
+        tools = ["@cython//:cython_binary"] + pxd_srcs,
+    )
 
   shared_objects = []
   for src in pyx_srcs:
@@ -358,7 +357,9 @@ def tf_additional_proto_hdrs():
       "platform/default/integral_types.h",
       "platform/default/logging.h",
       "platform/default/protobuf.h"
-  ]
+  ] + if_windows([
+      "platform/windows/integral_types.h",
+  ])
 
 def tf_additional_proto_srcs():
   return [
@@ -377,6 +378,14 @@ def tf_protos_all():
       extra_deps=tf_protos_all_impl(),
       otherwise=["//tensorflow/core:protos_all_cc"])
 
+def tf_protos_grappler_impl():
+  return ["//tensorflow/core/grappler/costs:op_performance_data_cc_impl"]
+
+def tf_protos_grappler():
+  return if_static(
+      extra_deps=tf_protos_grappler_impl(),
+      otherwise=["//tensorflow/core/grappler/costs:op_performance_data_cc"])
+
 def tf_env_time_hdrs():
   return [
       "platform/env_time.h",
@@ -458,7 +467,6 @@ def tf_additional_lib_deps():
 
 def tf_additional_core_deps():
   return select({
-      "//tensorflow:with_gcp_support_windows_override": [],
       "//tensorflow:with_gcp_support_android_override": [],
       "//tensorflow:with_gcp_support_ios_override": [],
       "//tensorflow:with_gcp_support": [
@@ -510,6 +518,7 @@ def tf_additional_cloud_kernel_deps():
 def tf_lib_proto_parsing_deps():
   return [
       ":protos_all_cc",
+      "//third_party/eigen3",
       "//tensorflow/core/platform/default/build_config:proto_parsing",
   ]
 
@@ -531,6 +540,9 @@ def tf_additional_gdr_lib_defines():
       "//conditions:default": [],
   })
 
+def tf_py_clif_cc(name, visibility=None, **kwargs):
+  pass
+
 def tf_pyclif_proto_library(name, proto_lib, proto_srcfile="", visibility=None,
                             **kwargs):
   pass
diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD
index f2fadb45589a8b44d29db045ca4585b578c5301d..2cd607edbe554cd18d21626e258176e8570282ed 100644
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@@ -122,7 +122,7 @@ cc_library(
         "//tensorflow/core:protos_cc",
         "@com_googlesource_code_re2//:re2",
         "@farmhash_archive//:farmhash",
-        "@fft2d//:fft2d",
+        "@fft2d",
         "@highwayhash//:sip_hash",
         "@png_archive//:png",
     ],
@@ -140,7 +140,7 @@ cc_library(
     name = "jpeg",
     copts = tf_copts(),
     deps = [
-        "@jpeg//:jpeg",
+        "@jpeg",
     ],
 )
 
diff --git a/tensorflow/core/platform/default/build_config_root.bzl b/tensorflow/core/platform/default/build_config_root.bzl
index 6e98f12114ec6bf715ca8ddcc02dbe8ff8aa8812..09029a4b256beceeb69c735c15bb1587cb1e06ac 100644
--- a/tensorflow/core/platform/default/build_config_root.bzl
+++ b/tensorflow/core/platform/default/build_config_root.bzl
@@ -19,6 +19,9 @@ def tf_additional_plugin_deps():
 def tf_additional_xla_deps_py():
   return []
 
+def tf_additional_grpc_deps_py():
+  return []
+
 def tf_additional_license_deps():
   return select({
       str(Label("//tensorflow:with_xla_support")): ["@llvm//:LICENSE.TXT"],
diff --git a/tensorflow/core/platform/default/device_tracer.cc b/tensorflow/core/platform/default/device_tracer.cc
index f4b0f16393d70521386ad49fbf010591e5afb08c..8e60a7f0910ff9cf77a33f9d72d680ec42847777 100644
--- a/tensorflow/core/platform/default/device_tracer.cc
+++ b/tensorflow/core/platform/default/device_tracer.cc
@@ -579,8 +579,10 @@ Status DeviceTracerImpl::Collect(StepStatsCollector *collector) {
   // TODO(pbar) Handle device IDs and prefix properly.
   const string prefix = "";
   const int id = 0;
-  const string stream_device = strings::StrCat(prefix, "/device:GPU:", id, "/stream:");
-  const string memcpy_device = strings::StrCat(prefix, "/device:GPU:", id, "/memcpy");
+  const string stream_device =
+      strings::StrCat(prefix, "/device:GPU:", id, "/stream:");
+  const string memcpy_device =
+      strings::StrCat(prefix, "/device:GPU:", id, "/memcpy");
 
   mutex_lock l2(trace_mu_);
   for (const auto &rec : kernel_records_) {
diff --git a/tensorflow/core/platform/default/gpu/cupti_wrapper.h b/tensorflow/core/platform/default/gpu/cupti_wrapper.h
index 38e01cefad8aac372a1d5e65f984ac62623336de..acd889e47496f8bc1cc9f89c3848848d47c4e91f 100644
--- a/tensorflow/core/platform/default/gpu/cupti_wrapper.h
+++ b/tensorflow/core/platform/default/gpu/cupti_wrapper.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_DEFAULT_CUPTI_WRAPPER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_DEFAULT_CUPTI_WRAPPER_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_CUPTI_WRAPPER_H_
+#define TENSORFLOW_CORE_PLATFORM_DEFAULT_CUPTI_WRAPPER_H_
 
 #if GOOGLE_CUDA
 
@@ -76,4 +76,4 @@ class CuptiWrapper {
 
 #endif  // GOOGLE_CUDA
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_DEFAULT_CUPTI_WRAPPER_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_DEFAULT_CUPTI_WRAPPER_H_
diff --git a/tensorflow/core/platform/default/logging.cc b/tensorflow/core/platform/default/logging.cc
index ebdd4b624aa423983cdeb2d31c0bf27ff30c89e2..2b874da1981bed396330ca3c526d82779046bdf2 100644
--- a/tensorflow/core/platform/default/logging.cc
+++ b/tensorflow/core/platform/default/logging.cc
@@ -83,15 +83,14 @@ void LogMessage::GenerateLogMessage() {
   const size_t time_buffer_size = 30;
   char time_buffer[time_buffer_size];
   strftime(time_buffer, time_buffer_size, "%Y-%m-%d %H:%M:%S",
-	   localtime(&now_seconds));
+           localtime(&now_seconds));
 
   // TODO(jeff,sanjay): Replace this with something that logs through the env.
   fprintf(stderr, "%s.%06d: %c %s:%d] %s\n", time_buffer, micros_remainder,
-	  "IWEF"[severity_], fname_, line_, str().c_str());
+          "IWEF"[severity_], fname_, line_, str().c_str());
 }
 #endif
 
-
 namespace {
 
 // Parse log level (int64) from environment variable (char*)
@@ -114,6 +113,8 @@ int64 LogLevelStrToInt(const char* tf_env_var_val) {
   return level;
 }
 
+}  // namespace
+
 int64 MinLogLevelFromEnv() {
   const char* tf_env_var_val = getenv("TF_CPP_MIN_LOG_LEVEL");
   return LogLevelStrToInt(tf_env_var_val);
@@ -124,8 +125,6 @@ int64 MinVLogLevelFromEnv() {
   return LogLevelStrToInt(tf_env_var_val);
 }
 
-}  // namespace
-
 LogMessage::~LogMessage() {
   // Read the min log level once during the first call to logging.
   static int64 min_log_level = MinLogLevelFromEnv();
diff --git a/tensorflow/core/platform/default/logging.h b/tensorflow/core/platform/default/logging.h
index d5f7350cdd805eb71edab0fde72db8383c32addb..f0efa31d5576393e9d9bba6e39a454b2a33cddc3 100644
--- a/tensorflow/core/platform/default/logging.h
+++ b/tensorflow/core/platform/default/logging.h
@@ -19,8 +19,8 @@ limitations under the License.
 // IWYU pragma: private, include "third_party/tensorflow/core/platform/logging.h"
 // IWYU pragma: friend third_party/tensorflow/core/platform/logging.h
 
-#include <sstream>
 #include <limits>
+#include <sstream>
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -205,16 +205,18 @@ string* MakeCheckOpString(const T1& v1, const T2& v2, const char* exprtext) {
   inline string* name##Impl(int v1, int v2, const char* exprtext) {       \
     return name##Impl<int, int>(v1, v2, exprtext);                        \
   }                                                                       \
-  inline string* name##Impl(const size_t v1, const int v2, const char* exprtext) {       \
+  inline string* name##Impl(const size_t v1, const int v2,                \
+                            const char* exprtext) {                       \
     if (TF_PREDICT_FALSE(v2 < 0)) {                                       \
-       return ::tensorflow::internal::MakeCheckOpString(v1, v2, exprtext);\
+      return ::tensorflow::internal::MakeCheckOpString(v1, v2, exprtext); \
     }                                                                     \
     const size_t uval = (size_t)((unsigned)v1);                           \
     return name##Impl<size_t, size_t>(uval, v2, exprtext);                \
   }                                                                       \
-  inline string* name##Impl(const int v1, const size_t v2, const char* exprtext) {       \
-    if (TF_PREDICT_FALSE(v2 >= std::numeric_limits<int>::max())) {      \
-       return ::tensorflow::internal::MakeCheckOpString(v1, v2, exprtext);\
+  inline string* name##Impl(const int v1, const size_t v2,                \
+                            const char* exprtext) {                       \
+    if (TF_PREDICT_FALSE(v2 >= std::numeric_limits<int>::max())) {        \
+      return ::tensorflow::internal::MakeCheckOpString(v1, v2, exprtext); \
     }                                                                     \
     const size_t uval = (size_t)((unsigned)v2);                           \
     return name##Impl<size_t, size_t>(v1, uval, exprtext);                \
@@ -225,12 +227,12 @@ string* MakeCheckOpString(const T1& v1, const T2& v2, const char* exprtext) {
 // This happens if, for example, those are used as token names in a
 // yacc grammar.
 TF_DEFINE_CHECK_OP_IMPL(Check_EQ,
-                        == )  // Compilation error with CHECK_EQ(NULL, x)?
-TF_DEFINE_CHECK_OP_IMPL(Check_NE, != )  // Use CHECK(x == NULL) instead.
-TF_DEFINE_CHECK_OP_IMPL(Check_LE, <= )
-TF_DEFINE_CHECK_OP_IMPL(Check_LT, < )
-TF_DEFINE_CHECK_OP_IMPL(Check_GE, >= )
-TF_DEFINE_CHECK_OP_IMPL(Check_GT, > )
+                        ==)  // Compilation error with CHECK_EQ(NULL, x)?
+TF_DEFINE_CHECK_OP_IMPL(Check_NE, !=)  // Use CHECK(x == NULL) instead.
+TF_DEFINE_CHECK_OP_IMPL(Check_LE, <=)
+TF_DEFINE_CHECK_OP_IMPL(Check_LT, <)
+TF_DEFINE_CHECK_OP_IMPL(Check_GE, >=)
+TF_DEFINE_CHECK_OP_IMPL(Check_GT, >)
 #undef TF_DEFINE_CHECK_OP_IMPL
 
 // In optimized mode, use CheckOpString to hint to compiler that
@@ -305,6 +307,10 @@ T&& CheckNotNull(const char* file, int line, const char* exprtext, T&& t) {
   return std::forward<T>(t);
 }
 
+int64 MinLogLevelFromEnv();
+
+int64 MinVLogLevelFromEnv();
+
 }  // namespace internal
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/platform/default/mutex.h b/tensorflow/core/platform/default/mutex.h
index c3e44c42d942326af210e1038da20bf655d14a10..044c754e80bd0dee04c73e969c325a2aa4a89c31 100644
--- a/tensorflow/core/platform/default/mutex.h
+++ b/tensorflow/core/platform/default/mutex.h
@@ -31,6 +31,8 @@ namespace tensorflow {
 
 enum LinkerInitialized { LINKER_INITIALIZED };
 
+class condition_variable;
+
 // Mimic std::mutex + C++17's shared_mutex, adding a LinkerInitialized
 // constructor interface.  This type is as fast as mutex, but is also a shared
 // lock.
diff --git a/tensorflow/core/platform/default/stacktrace.h b/tensorflow/core/platform/default/stacktrace.h
index 5f3073262ab9d86b3ee922195f1b5bf28d47414e..c8e297fa8d8c1ee48b060e6e2c7ee89eb0d23b39 100644
--- a/tensorflow/core/platform/default/stacktrace.h
+++ b/tensorflow/core/platform/default/stacktrace.h
@@ -17,12 +17,63 @@ limitations under the License.
 #define TENSORFLOW_CORE_PLATFORM_DEFAULT_STACKTRACE_H_
 
 #include "tensorflow/core/platform/platform.h"
+#if !defined(IS_MOBILE_PLATFORM) && defined(PLATFORM_POSIX) && \
+    (defined(__clang__) || defined(__GNUC__))
+#define TF_GENERATE_BACKTRACE
+#endif
+
+#if defined(TF_GENERATE_BACKTRACE)
+#include <dlfcn.h>
+#include <execinfo.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#endif  // defined(TF_GENERATE_BACKTRACE)
+
+#include <sstream>
+#include <string>
+#include "tensorflow/core/platform/abi.h"
 
 namespace tensorflow {
 
-inline string CurrentStackTrace() { return "No stack trace available"; }
-
-inline void DebugWriteToString(const char* data, void* arg) {}
+// Function to create a pretty stacktrace.
+inline std::string CurrentStackTrace() {
+#if defined(TF_GENERATE_BACKTRACE)
+  std::stringstream ss("");
+  ss << "*** Begin stack trace ***" << std::endl;
+
+  // Get the mangled stack trace.
+  int buffer_size = 128;
+  void* trace[128];
+  buffer_size = backtrace(trace, buffer_size);
+
+  for (int i = 0; i < buffer_size; ++i) {
+    const char* symbol = "";
+    Dl_info info;
+    if (dladdr(trace[i], &info)) {
+      if (info.dli_sname != nullptr) {
+        symbol = info.dli_sname;
+      }
+    }
+
+    std::string demangled = tensorflow::port::MaybeAbiDemangle(symbol);
+    if (demangled.length()) {
+      ss << "\t" << demangled << std::endl;
+    } else {
+      ss << "\t" << symbol << std::endl;
+    }
+  }
+
+  ss << "*** End stack trace ***" << std::endl;
+  return ss.str();
+#else
+  return std::string();
+#endif  // defined(TF_GENERATE_BACKTRACE)
+}
+
+inline void DebugWriteToString(const char* data, void* arg) {
+  reinterpret_cast<std::string*>(arg)->append(data);
+}
 
 // A dummy class that does nothing.  Someday, add real support.
 class SavedStackTrace {
diff --git a/tensorflow/core/platform/demangle.h b/tensorflow/core/platform/demangle.h
index c2def217a12dd201245bc8e3e6629f2456198f2e..ce33be2e6899e9770e8cdd7831f16cdb4856d6af 100644
--- a/tensorflow/core/platform/demangle.h
+++ b/tensorflow/core/platform/demangle.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_DEMANGLE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_DEMANGLE_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_DEMANGLE_H_
+#define TENSORFLOW_CORE_PLATFORM_DEMANGLE_H_
 
 #include "tensorflow/core/platform/types.h"
 
@@ -28,4 +28,4 @@ string Demangle(const char* mangled);
 }  // namespace port
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_DEMANGLE_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_DEMANGLE_H_
diff --git a/tensorflow/core/platform/denormal.cc b/tensorflow/core/platform/denormal.cc
index f13b0af2a79bec4538c64cbc475681f6eb0ce127..e00dbdb4ae5ef682369b345353e236a6084460ef 100644
--- a/tensorflow/core/platform/denormal.cc
+++ b/tensorflow/core/platform/denormal.cc
@@ -41,8 +41,8 @@ namespace tensorflow {
 namespace port {
 
 ScopedFlushDenormal::ScopedFlushDenormal() {
-// For now, we flush denormals only on SSE 3.  Other architectures such as ARM
-// can be added as needed.
+  // For now, we flush denormals only on SSE 3.  Other architectures such as ARM
+  // can be added as needed.
 
 #ifdef DENORM_USE_INTRINSICS
   if (TestCPUFeature(SSE3)) {
diff --git a/tensorflow/core/platform/device_tracer_test.cc b/tensorflow/core/platform/device_tracer_test.cc
index c0c08dabacbcb9fdbbfd9bdbe16bcfaea7328507..89f14e905afa4e2c10055f59721fe4cabf082781 100644
--- a/tensorflow/core/platform/device_tracer_test.cc
+++ b/tensorflow/core/platform/device_tracer_test.cc
@@ -77,7 +77,8 @@ class DeviceTracerTest : public ::testing::Test {
 
     Node* y_neg = test::graph::Unary(&graph, "Neg", i);
     y_neg_ = y_neg->name();
-    y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0");
+    y_neg->set_assigned_device_name(
+        "/job:localhost/replica:0/task:0/device:GPU:0");
 
     test::graph::ToGraphDef(&graph, &def_);
   }
diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc
index 12ef55ec26e3355f08235cce557b9c7ae0618f04..12509c250eab9047b869694e930bf523a975a4f8 100644
--- a/tensorflow/core/platform/env.cc
+++ b/tensorflow/core/platform/env.cc
@@ -20,6 +20,10 @@ limitations under the License.
 #if defined(__APPLE__)
 #include <mach-o/dyld.h>
 #endif
+#if defined(__FreeBSD__)
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#endif
 #if defined(PLATFORM_WINDOWS)
 #include <windows.h>
 #include "tensorflow/core/platform/windows/windows_file_system.h"
@@ -40,6 +44,9 @@ limitations under the License.
 
 namespace tensorflow {
 
+// 128KB copy buffer
+constexpr size_t kCopyFileBufferSize = 128 * 1024;
+
 class FileSystemRegistryImpl : public FileSystemRegistry {
  public:
   Status Register(const string& scheme, Factory factory) override;
@@ -88,8 +95,12 @@ Status Env::GetFileSystemForFile(const string& fname, FileSystem** result) {
   io::ParseURI(fname, &scheme, &host, &path);
   FileSystem* file_system = file_system_registry_->Lookup(scheme.ToString());
   if (!file_system) {
-    return errors::Unimplemented("File system scheme ", scheme,
-                                 " not implemented");
+    if (scheme.empty()) {
+      scheme = "[local]";
+    }
+
+    return errors::Unimplemented("File system scheme '", scheme,
+                                 "' not implemented (file: '", fname, "')");
   }
   *result = file_system;
   return Status::OK();
@@ -104,6 +115,18 @@ Status Env::RegisterFileSystem(const string& scheme,
   return file_system_registry_->Register(scheme, std::move(factory));
 }
 
+Status Env::FlushFileSystemCaches() {
+  std::vector<string> schemes;
+  TF_RETURN_IF_ERROR(GetRegisteredFileSystemSchemes(&schemes));
+  for (const string& scheme : schemes) {
+    FileSystem* fs = nullptr;
+    TF_RETURN_IF_ERROR(
+        GetFileSystemForFile(io::CreateURI(scheme, "", ""), &fs));
+    fs->FlushCaches();
+  }
+  return Status::OK();
+}
+
 Status Env::NewRandomAccessFile(const string& fname,
                                 std::unique_ptr<RandomAccessFile>* result) {
   FileSystem* fs;
@@ -157,8 +180,8 @@ bool Env::FilesExist(const std::vector<string>& files,
     if (!file_system) {
       fs_result = false;
       if (fs_status) {
-        Status s = errors::Unimplemented("File system scheme ", itr.first,
-                                         " not implemented");
+        Status s = errors::Unimplemented("File system scheme '", itr.first,
+                                         "' not implemented");
         local_status.resize(itr.second.size(), s);
       }
     } else {
@@ -258,6 +281,17 @@ Status Env::RenameFile(const string& src, const string& target) {
   return src_fs->RenameFile(src, target);
 }
 
+Status Env::CopyFile(const string& src, const string& target) {
+  FileSystem* src_fs;
+  FileSystem* target_fs;
+  TF_RETURN_IF_ERROR(GetFileSystemForFile(src, &src_fs));
+  TF_RETURN_IF_ERROR(GetFileSystemForFile(target, &target_fs));
+  if (src_fs == target_fs) {
+    return src_fs->CopyFile(src, target);
+  }
+  return FileSystemCopyFile(src_fs, src, target_fs, target);
+}
+
 string Env::GetExecutablePath() {
   char exe_path[PATH_MAX] = {0};
 #ifdef __APPLE__
@@ -266,6 +300,14 @@ string Env::GetExecutablePath() {
   char unresolved_path[buffer_size];
   _NSGetExecutablePath(unresolved_path, &buffer_size);
   CHECK(realpath(unresolved_path, exe_path));
+#elif defined(__FreeBSD__)
+  int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1};
+  size_t exe_path_size = PATH_MAX;
+
+  if (sysctl(mib, 4, exe_path, &exe_path_size, NULL, 0) != 0) {
+    // Resolution of path failed
+    return "";
+  }
 #elif defined(PLATFORM_WINDOWS)
   HMODULE hModule = GetModuleHandleW(NULL);
   WCHAR wc_file_path[MAX_PATH] = {0};
@@ -288,30 +330,47 @@ bool Env::LocalTempFilename(string* filename) {
   // Try each directory, as they might be full, have inappropriate
   // permissions or have different problems at times.
   for (const string& dir : dirs) {
+    *filename = io::JoinPath(dir, "tempfile-");
+    if (CreateUniqueFileName(filename, "")) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool Env::CreateUniqueFileName(string* prefix, const string& suffix) {
 #ifdef __APPLE__
-    uint64_t tid64;
-    pthread_threadid_np(nullptr, &tid64);
-    int32 tid = static_cast<int32>(tid64);
-    int32 pid = static_cast<int32>(getpid());
+  uint64_t tid64;
+  pthread_threadid_np(nullptr, &tid64);
+  int32 tid = static_cast<int32>(tid64);
+  int32 pid = static_cast<int32>(getpid());
+#elif defined(__FreeBSD__)
+  // Has to be casted to long first, else this error appears:
+  // static_cast from 'pthread_t' (aka 'pthread *') to 'int32' (aka 'int')
+  // is not allowed
+  int32 tid = static_cast<int32>(static_cast<int64>(pthread_self()));
+  int32 pid = static_cast<int32>(getpid());
 #elif defined(PLATFORM_WINDOWS)
-    int32 tid = static_cast<int32>(GetCurrentThreadId());
-    int32 pid = static_cast<int32>(GetCurrentProcessId());
+  int32 tid = static_cast<int32>(GetCurrentThreadId());
+  int32 pid = static_cast<int32>(GetCurrentProcessId());
 #else
-    int32 tid = static_cast<int32>(pthread_self());
-    int32 pid = static_cast<int32>(getpid());
+  int32 tid = static_cast<int32>(pthread_self());
+  int32 pid = static_cast<int32>(getpid());
 #endif
-    uint64 now_microsec = NowMicros();
+  uint64 now_microsec = NowMicros();
 
-    *filename = io::JoinPath(
-        dir, strings::Printf("tempfile-%s-%x-%d-%llx", port::Hostname().c_str(),
-                             tid, pid, now_microsec));
-    if (FileExists(*filename).ok()) {
-      filename->clear();
-    } else {
-      return true;
-    }
+  *prefix += strings::Printf("%s-%x-%d-%llx", port::Hostname().c_str(), tid,
+                             pid, now_microsec);
+
+  if (!suffix.empty()) {
+    *prefix += suffix;
+  }
+  if (FileExists(*prefix).ok()) {
+    prefix->clear();
+    return false;
+  } else {
+    return true;
   }
-  return false;
 }
 
 Thread::~Thread() {}
@@ -361,6 +420,29 @@ Status WriteStringToFile(Env* env, const string& fname,
   return s;
 }
 
+Status FileSystemCopyFile(FileSystem* src_fs, const string& src,
+                          FileSystem* target_fs, const string& target) {
+  std::unique_ptr<RandomAccessFile> src_file;
+  TF_RETURN_IF_ERROR(src_fs->NewRandomAccessFile(src, &src_file));
+
+  std::unique_ptr<WritableFile> target_file;
+  TF_RETURN_IF_ERROR(target_fs->NewWritableFile(target, &target_file));
+
+  uint64 offset = 0;
+  std::unique_ptr<char[]> scratch(new char[kCopyFileBufferSize]);
+  Status s = Status::OK();
+  while (s.ok()) {
+    StringPiece result;
+    s = src_file->Read(offset, kCopyFileBufferSize, &result, scratch.get());
+    if (!(s.ok() || s.code() == error::OUT_OF_RANGE)) {
+      return s;
+    }
+    TF_RETURN_IF_ERROR(target_file->Append(result));
+    offset += result.size();
+  }
+  return target_file->Close();
+}
+
 // A ZeroCopyInputStream on a RandomAccessFile.
 namespace {
 class FileStream : public ::tensorflow::protobuf::io::ZeroCopyInputStream {
diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h
index da8c3e2d7e8a50c9d441cd371078fa86aae13179..4ce4e0b4e024d50ae2bd081ec7b8b155060d2a4a 100644
--- a/tensorflow/core/platform/env.h
+++ b/tensorflow/core/platform/env.h
@@ -68,10 +68,13 @@ class Env {
   /// \brief Returns the file system schemes registered for this Env.
   virtual Status GetRegisteredFileSystemSchemes(std::vector<string>* schemes);
 
-  // \brief Register a file system for a scheme.
+  /// \brief Register a file system for a scheme.
   virtual Status RegisterFileSystem(const string& scheme,
                                     FileSystemRegistry::Factory factory);
 
+  /// \brief Flush filesystem caches for all registered filesystems.
+  Status FlushFileSystemCaches();
+
   /// \brief Creates a brand new random access read-only file with the
   /// specified name.
 
@@ -211,6 +214,9 @@ class Env {
   /// replaced.
   Status RenameFile(const string& src, const string& target);
 
+  /// \brief Copy the src to target.
+  Status CopyFile(const string& src, const string& target);
+
   /// \brief Returns the absolute path of the current executable. It resolves
   /// symlinks if there is any.
   string GetExecutablePath();
@@ -218,6 +224,10 @@ class Env {
   /// Creates a local unique temporary file name. Returns true if success.
   bool LocalTempFilename(string* filename);
 
+  /// Creates a local unique file name that starts with |prefix| and ends with
+  /// |suffix|. Returns true if success.
+  bool CreateUniqueFileName(string* prefix, const string& suffix);
+
   // TODO(jeff,sanjay): Add back thread/thread-pool support if needed.
   // TODO(jeff,sanjay): if needed, tighten spec so relative to epoch, or
   // provide a routine to get the absolute time.
@@ -279,7 +289,7 @@ class Env {
   // "version" should be the version of the library or NULL
   // returns the name that LoadLibrary() can use
   virtual string FormatLibraryFileName(const string& name,
-      const string& version) = 0;
+                                       const string& version) = 0;
 
  private:
   // Returns a possible list of local temporary directories.
@@ -346,6 +356,7 @@ class EnvWrapper : public Env {
                                const string& version) override {
     return target_->FormatLibraryFileName(name, version);
   }
+
  private:
   Env* target_;
 };
@@ -373,6 +384,11 @@ struct ThreadOptions {
   size_t guard_size = 0;  // 0: use system default value
 };
 
+/// A utility routine: copy contents of `src` in file system `src_fs`
+/// to `target` in file system `target_fs`.
+Status FileSystemCopyFile(FileSystem* src_fs, const string& src,
+                          FileSystem* target_fs, const string& target);
+
 /// A utility routine: reads contents of named file into `*data`
 Status ReadFileToString(Env* env, const string& fname, string* data);
 
diff --git a/tensorflow/core/platform/env_test.cc b/tensorflow/core/platform/env_test.cc
index c9b362f18235f8ddec0994bc1110aaec950eef72..47ddf0ccb93e827d410e87050d6802747fb84fbf 100644
--- a/tensorflow/core/platform/env_test.cc
+++ b/tensorflow/core/platform/env_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -280,6 +281,15 @@ class TmpDirFileSystem : public NullFileSystem {
     StringPiece scheme, host, path;
     io::ParseURI(dir, &scheme, &host, &path);
     if (path.empty()) return errors::NotFound(dir, " not found");
+    // The special "flushed" file exists only if the filesystem's caches have
+    // been flushed.
+    if (path == "/flushed") {
+      if (flushed_) {
+        return Status::OK();
+      } else {
+        return errors::NotFound("FlushCaches() not called yet");
+      }
+    }
     return Env::Default()->FileExists(io::JoinPath(BaseDir(), path));
   }
 
@@ -294,10 +304,23 @@ class TmpDirFileSystem : public NullFileSystem {
     }
     return Env::Default()->CreateDir(io::JoinPath(BaseDir(), path));
   }
+
+  void FlushCaches() override { flushed_ = true; }
+
+ private:
+  bool flushed_ = false;
 };
 
 REGISTER_FILE_SYSTEM("tmpdirfs", TmpDirFileSystem);
 
+TEST_F(DefaultEnvTest, FlushFileSystemCaches) {
+  Env* env = Env::Default();
+  const string flushed = "tmpdirfs://testhost/flushed";
+  EXPECT_EQ(error::Code::NOT_FOUND, env->FileExists(flushed).code());
+  TF_EXPECT_OK(env->FlushFileSystemCaches());
+  TF_EXPECT_OK(env->FileExists(flushed));
+}
+
 TEST_F(DefaultEnvTest, RecursivelyCreateDirWithUri) {
   Env* env = Env::Default();
   const string create_path = "tmpdirfs://testhost/a/b/c/d";
@@ -340,4 +363,18 @@ TEST_F(DefaultEnvTest, LocalTempFilename) {
   EXPECT_FALSE(env->FileExists(filename).ok());
 }
 
+TEST_F(DefaultEnvTest, CreateUniqueFileName) {
+  Env* env = Env::Default();
+
+  string prefix = "tempfile-prefix-";
+  string suffix = ".tmp";
+  string filename = prefix;
+
+  EXPECT_TRUE(env->CreateUniqueFileName(&filename, suffix));
+
+  StringPiece str(filename);
+  EXPECT_TRUE(str.starts_with(prefix));
+  EXPECT_TRUE(str.ends_with(suffix));
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/file_statistics.h b/tensorflow/core/platform/file_statistics.h
index 7629db6ef9e216d652b819a5bc19af1ab6a38058..9e3489b1adb8c7af1651c1b30539c5083a201979 100644
--- a/tensorflow/core/platform/file_statistics.h
+++ b/tensorflow/core/platform/file_statistics.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_FILE_STATISTICS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_FILE_STATISTICS_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_FILE_STATISTICS_H_
+#define TENSORFLOW_CORE_PLATFORM_FILE_STATISTICS_H_
 
 #include "tensorflow/core/platform/types.h"
 
@@ -36,4 +36,4 @@ struct FileStatistics {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_FILE_STATISTICS_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_FILE_STATISTICS_H_
diff --git a/tensorflow/core/platform/file_system.cc b/tensorflow/core/platform/file_system.cc
index 938f5af487ab05182cea30996f19c1e40ab1b535..271d73f5f1a7bd3e1301520aed09cbafd89c8ebc 100644
--- a/tensorflow/core/platform/file_system.cc
+++ b/tensorflow/core/platform/file_system.cc
@@ -73,6 +73,8 @@ Status FileSystem::IsDirectory(const string& name) {
   return Status(tensorflow::error::FAILED_PRECONDITION, "Not a directory");
 }
 
+void FileSystem::FlushCaches() {}
+
 RandomAccessFile::~RandomAccessFile() {}
 
 WritableFile::~WritableFile() {}
@@ -129,18 +131,19 @@ Status FileSystem::GetMatchingPaths(const string& pattern,
     if (children.empty()) continue;
     // This IsDirectory call can be expensive for some FS. Parallelizing it.
     children_dir_status.resize(children.size());
-    ForEach(0, children.size(), [this, &current_dir, &children, &fixed_prefix,
-                                 &children_dir_status](int i) {
-      const string child_path = io::JoinPath(current_dir, children[i]);
-      // In case the child_path doesn't start with the fixed_prefix then
-      // we don't need to explore this path.
-      if (!StringPiece(child_path).starts_with(fixed_prefix)) {
-        children_dir_status[i] =
-            Status(tensorflow::error::CANCELLED, "Operation not needed");
-      } else {
-        children_dir_status[i] = IsDirectory(child_path);
-      }
-    });
+    ForEach(0, children.size(),
+            [this, &current_dir, &children, &fixed_prefix,
+             &children_dir_status](int i) {
+              const string child_path = io::JoinPath(current_dir, children[i]);
+              // In case the child_path doesn't start with the fixed_prefix then
+              // we don't need to explore this path.
+              if (!StringPiece(child_path).starts_with(fixed_prefix)) {
+                children_dir_status[i] = Status(tensorflow::error::CANCELLED,
+                                                "Operation not needed");
+              } else {
+                children_dir_status[i] = IsDirectory(child_path);
+              }
+            });
     for (int i = 0; i < children.size(); ++i) {
       const string child_path = io::JoinPath(current_dir, children[i]);
       // If the IsDirectory call was cancelled we bail.
@@ -262,4 +265,8 @@ Status FileSystem::RecursivelyCreateDir(const string& dirname) {
   return Status::OK();
 }
 
+Status FileSystem::CopyFile(const string& src, const string& target) {
+  return FileSystemCopyFile(this, src, this, target);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/file_system.h b/tensorflow/core/platform/file_system.h
index 903df96b58a7304f04e618613c7fa9561fe798a2..3085b6958fd921ae124b885107e807f0a02e1d9d 100644
--- a/tensorflow/core/platform/file_system.h
+++ b/tensorflow/core/platform/file_system.h
@@ -189,6 +189,9 @@ class FileSystem {
   /// \brief Overwrites the target if it exists.
   virtual Status RenameFile(const string& src, const string& target) = 0;
 
+  /// \brief Copy the src to target.
+  virtual Status CopyFile(const string& src, const string& target);
+
   /// \brief Translate an URI to a filename for the FileSystem implementation.
   ///
   /// The implementation in this class cleans up the path, removing
@@ -206,6 +209,9 @@ class FileSystem {
   ///  * UNIMPLEMENTED - The file factory doesn't support directories.
   virtual Status IsDirectory(const string& fname);
 
+  /// \brief Flushes any cached filesystem objects from memory.
+  virtual void FlushCaches();
+
   FileSystem() {}
 
   virtual ~FileSystem();
diff --git a/tensorflow/core/platform/gif.h b/tensorflow/core/platform/gif.h
index 9c72d34ff518abcabf773af607589fe8114beebf..ab095a35c93517c6527b55bd922dbeb46d695ca4 100644
--- a/tensorflow/core/platform/gif.h
+++ b/tensorflow/core/platform/gif.h
@@ -20,7 +20,8 @@ limitations under the License.
 
 #if defined(PLATFORM_GOOGLE)
 #include "tensorflow/core/platform/google/build_config/gif.h"
-#elif defined(PLATFORM_POSIX)|| defined(PLATFORM_WINDOWS) ||defined(PLATFORM_POSIX_ANDROID)
+#elif defined(PLATFORM_POSIX) || defined(PLATFORM_WINDOWS) || \
+    defined(PLATFORM_POSIX_ANDROID)
 #include <gif_lib.h>
 #else
 #error Define the appropriate PLATFORM_<foo> macro for this platform
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
index 0baeac09841073ad6013a4700646e82d5d97182f..74863293a32451e8881c93de468539b913169aaa 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
@@ -164,8 +164,9 @@ Status HadoopFileSystem::Connect(StringPiece fname, hdfsFS* fs) {
   } else {
     hdfs_->hdfsBuilderSetNameNode(builder, nn.c_str());
   }
-  // KERB_TICKET_CACHE_PATH will be deleted in the future, Because KRB5CCNAME is the build in
-  // environment variable of Kerberos, so KERB_TICKET_CACHE_PATH and related code are unnecessary.
+  // KERB_TICKET_CACHE_PATH will be deleted in the future, Because KRB5CCNAME is
+  // the build in environment variable of Kerberos, so KERB_TICKET_CACHE_PATH
+  // and related code are unnecessary.
   char* ticket_cache_path = getenv("KERB_TICKET_CACHE_PATH");
   if (ticket_cache_path != nullptr) {
     hdfs_->hdfsBuilderSetKerbTicketCachePath(builder, ticket_cache_path);
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.h b/tensorflow/core/platform/hadoop/hadoop_file_system.h
index 447e83158ab6c54a505190bd451fdbdcb678a7f1..5f2b222622cf01033af117f92d49458eeae00e6f 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.h
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_HADOOP_HADOOP_FILE_SYSTEM_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_HADOOP_HADOOP_FILE_SYSTEM_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_HADOOP_HADOOP_FILE_SYSTEM_H_
+#define TENSORFLOW_CORE_PLATFORM_HADOOP_HADOOP_FILE_SYSTEM_H_
 
 #include "tensorflow/core/platform/env.h"
 
@@ -70,4 +70,4 @@ class HadoopFileSystem : public FileSystem {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_HADOOP_HADOOP_FILE_SYSTEM_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_HADOOP_HADOOP_FILE_SYSTEM_H_
diff --git a/tensorflow/core/platform/jpeg.h b/tensorflow/core/platform/jpeg.h
index edbcbd960a7d61970119bfb385f075e1d3ffb96f..1b5e633f0aad09850afa82bee59d45c7943bbd8a 100644
--- a/tensorflow/core/platform/jpeg.h
+++ b/tensorflow/core/platform/jpeg.h
@@ -20,7 +20,8 @@ limitations under the License.
 
 #if defined(PLATFORM_GOOGLE)
 #include "tensorflow/core/platform/google/build_config/jpeg.h"
-#elif defined(PLATFORM_POSIX)|| defined(PLATFORM_WINDOWS) ||defined(PLATFORM_POSIX_ANDROID)
+#elif defined(PLATFORM_POSIX) || defined(PLATFORM_WINDOWS) || \
+    defined(PLATFORM_POSIX_ANDROID)
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
diff --git a/tensorflow/core/platform/macros.h b/tensorflow/core/platform/macros.h
index 47523c7d2b09275be3747e684df1b656534ed6ea..6119edfd5a63d1aa4e81bb91d95736ed2835c478 100644
--- a/tensorflow/core/platform/macros.h
+++ b/tensorflow/core/platform/macros.h
@@ -93,7 +93,8 @@ limitations under the License.
   ((sizeof(a) / sizeof(*(a))) / \
    static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
 
-#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L
+#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L || \
+    (defined(_MSC_VER) && _MSC_VER >= 1900)
 // Define this to 1 if the code is compiled in C++11 mode; leave it
 // undefined otherwise.  Do NOT define it to 0 -- that causes
 // '#ifdef LANG_CXX11' to behave differently from '#if LANG_CXX11'.
diff --git a/tensorflow/core/platform/mem.h b/tensorflow/core/platform/mem.h
index dc389a8741501d27394ac559c95eaa73c2014afd..7bb9fc264fbf6ee3f20e9b2687c9ba52b6171ec4 100644
--- a/tensorflow/core/platform/mem.h
+++ b/tensorflow/core/platform/mem.h
@@ -59,6 +59,9 @@ void MallocExtension_ReleaseToSystem(std::size_t num_bytes);
 // routine, this routine returns 0.
 std::size_t MallocExtension_GetAllocatedSize(const void* p);
 
+// Returns the amount of RAM available in kB, or INT64_MAX if unknown.
+int64 AvailableRam();
+
 }  // namespace port
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/platform/png.h b/tensorflow/core/platform/png.h
index 5b0203c343e6b1764a9cc8a7908919422d826bcb..dad18d72195953e78c6a169a19b9182ae6571485 100644
--- a/tensorflow/core/platform/png.h
+++ b/tensorflow/core/platform/png.h
@@ -20,7 +20,8 @@ limitations under the License.
 
 #if defined(PLATFORM_GOOGLE)
 #include "tensorflow/core/platform/google/build_config/png.h"
-#elif defined(PLATFORM_POSIX)|| defined(PLATFORM_WINDOWS) ||defined(PLATFORM_POSIX_ANDROID)
+#elif defined(PLATFORM_POSIX) || defined(PLATFORM_WINDOWS) || \
+    defined(PLATFORM_POSIX_ANDROID)
 #include <png.h>
 #else
 #error Define the appropriate PLATFORM_<foo> macro for this platform
diff --git a/tensorflow/core/platform/posix/env.cc b/tensorflow/core/platform/posix/env.cc
index ba3c4e709078adf8c60cf49ab06c7194cf887cc1..8097624e09f81364071895ad114f26f93f4aab14 100644
--- a/tensorflow/core/platform/posix/env.cc
+++ b/tensorflow/core/platform/posix/env.cc
@@ -136,15 +136,19 @@ void Env::GetLocalTempDirectories(std::vector<string>* list) {
   // Directories, in order of preference. If we find a dir that
   // exists, we stop adding other less-preferred dirs
   const char* candidates[] = {
-      // Non-null only during unittest/regtest
-      getenv("TEST_TMPDIR"),
+    // Non-null only during unittest/regtest
+    getenv("TEST_TMPDIR"),
 
-      // Explicitly-supplied temp dirs
-      getenv("TMPDIR"),
-      getenv("TMP"),
+    // Explicitly-supplied temp dirs
+    getenv("TMPDIR"),
+    getenv("TMP"),
 
-      // If all else fails
-      "/tmp",
+#if defined(__ANDROID__)
+    "/data/local/tmp",
+#endif
+
+    // If all else fails
+    "/tmp",
   };
 
   for (const char* d : candidates) {
diff --git a/tensorflow/core/platform/posix/error.cc b/tensorflow/core/platform/posix/error.cc
index cda6d7d8f9d6ad3e7f2c8fa56cc99a8dbe07fa00..2bb9443fb3c45e0cd4bb31a48539355747684b5f 100644
--- a/tensorflow/core/platform/posix/error.cc
+++ b/tensorflow/core/platform/posix/error.cc
@@ -73,19 +73,19 @@ error::Code ErrnoToCode(int err_number) {
     case ECHILD:      // No child processes
     case EISCONN:     // Socket is connected
 #if !defined(_WIN32) && !defined(__HAIKU__)
-    case ENOTBLK:     // Block device required
+    case ENOTBLK:  // Block device required
 #endif
-    case ENOTCONN:    // The socket is not connected
-    case EPIPE:       // Broken pipe
+    case ENOTCONN:  // The socket is not connected
+    case EPIPE:     // Broken pipe
 #if !defined(_WIN32)
-    case ESHUTDOWN:   // Cannot send after transport endpoint shutdown
+    case ESHUTDOWN:  // Cannot send after transport endpoint shutdown
 #endif
-    case ETXTBSY:     // Text file busy
+    case ETXTBSY:  // Text file busy
       code = error::FAILED_PRECONDITION;
       break;
-    case ENOSPC:   // No space left on device
+    case ENOSPC:  // No space left on device
 #if !defined(_WIN32)
-    case EDQUOT:   // Disk quota exceeded
+    case EDQUOT:  // Disk quota exceeded
 #endif
     case EMFILE:   // Too many open files
     case EMLINK:   // Too many links
@@ -95,7 +95,7 @@ error::Code ErrnoToCode(int err_number) {
     case ENOMEM:   // Not enough space
     case ENOSR:    // No STREAM resources
 #if !defined(_WIN32) && !defined(__HAIKU__)
-    case EUSERS:   // Too many users
+    case EUSERS:  // Too many users
 #endif
       code = error::RESOURCE_EXHAUSTED;
       break;
@@ -104,17 +104,17 @@ error::Code ErrnoToCode(int err_number) {
     case ERANGE:     // Result too large
       code = error::OUT_OF_RANGE;
       break;
-    case ENOSYS:           // Function not implemented
-    case ENOTSUP:          // Operation not supported
-    case EAFNOSUPPORT:     // Address family not supported
+    case ENOSYS:        // Function not implemented
+    case ENOTSUP:       // Operation not supported
+    case EAFNOSUPPORT:  // Address family not supported
 #if !defined(_WIN32)
-    case EPFNOSUPPORT:     // Protocol family not supported
+    case EPFNOSUPPORT:  // Protocol family not supported
 #endif
     case EPROTONOSUPPORT:  // Protocol not supported
 #if !defined(_WIN32) && !defined(__HAIKU__)
     case ESOCKTNOSUPPORT:  // Socket type not supported
 #endif
-    case EXDEV:            // Improper link
+    case EXDEV:  // Improper link
       code = error::UNIMPLEMENTED;
       break;
     case EAGAIN:        // Resource temporarily unavailable
@@ -123,7 +123,7 @@ error::Code ErrnoToCode(int err_number) {
     case ECONNRESET:    // Connection reset
     case EINTR:         // Interrupted function call
 #if !defined(_WIN32)
-    case EHOSTDOWN:     // Host is down
+    case EHOSTDOWN:  // Host is down
 #endif
     case EHOSTUNREACH:  // Host is unreachable
     case ENETDOWN:      // Network is down
@@ -139,7 +139,7 @@ error::Code ErrnoToCode(int err_number) {
       break;
     case EDEADLK:  // Resource deadlock avoided
 #if !defined(_WIN32)
-    case ESTALE:   // Stale file handle
+    case ESTALE:  // Stale file handle
 #endif
       code = error::ABORTED;
       break;
@@ -158,7 +158,7 @@ error::Code ErrnoToCode(int err_number) {
     case ENOMSG:       // No message of the desired type
     case EPROTO:       // Protocol error
 #if !defined(_WIN32) && !defined(__HAIKU__)
-    case EREMOTE:      // Object is remote
+    case EREMOTE:  // Object is remote
 #endif
       code = error::UNKNOWN;
       break;
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc
index 614ee00b0133976e9fe49caf7c75a01194e10237..494acde803a778fb839a7444e4d5ac2fd094eb09 100644
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -29,6 +29,7 @@ limitations under the License.
 
 #if defined(__linux__) && !defined(__ANDROID__)
 #include <sched.h>
+#include <sys/sysinfo.h>
 #endif
 #include <stdio.h>
 #include <stdlib.h>
@@ -171,5 +172,16 @@ double NominalCPUFrequency() {
 #endif
 }
 
+int64 AvailableRam() {
+#if defined(__linux__) && !defined(__ANDROID__)
+  struct sysinfo info;
+  int err = sysinfo(&info);
+  if (err == 0) {
+    return info.freeram / 1024;
+  }
+#endif
+  return INT64_MAX;
+}
+
 }  // namespace port
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/posix/posix_file_system.cc b/tensorflow/core/platform/posix/posix_file_system.cc
index fb7a5a9995985fd09936472d4b6f8a45254f7312..9a8021565cbcc2a172a23439d2a7139108c0df39 100644
--- a/tensorflow/core/platform/posix/posix_file_system.cc
+++ b/tensorflow/core/platform/posix/posix_file_system.cc
@@ -18,6 +18,9 @@ limitations under the License.
 #include <fcntl.h>
 #include <stdio.h>
 #include <sys/mman.h>
+#if !defined(__APPLE__)
+#include <sys/sendfile.h>
+#endif
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/types.h>
@@ -34,6 +37,9 @@ limitations under the License.
 
 namespace tensorflow {
 
+// 128KB of copy buffer
+constexpr size_t kPosixCopyFileBufferSize = 128 * 1024;
+
 // pread() based random-access
 class PosixRandomAccessFile : public RandomAccessFile {
  private:
@@ -276,4 +282,70 @@ Status PosixFileSystem::RenameFile(const string& src, const string& target) {
   return result;
 }
 
+Status PosixFileSystem::CopyFile(const string& src, const string& target) {
+  string translated_src = TranslateName(src);
+  struct stat sbuf;
+  if (stat(translated_src.c_str(), &sbuf) != 0) {
+    return IOError(src, errno);
+  }
+  int src_fd = open(translated_src.c_str(), O_RDONLY);
+  if (src_fd < 0) {
+    return IOError(src, errno);
+  }
+  string translated_target = TranslateName(target);
+  // O_WRONLY | O_CREAT:
+  //   Open file for write and if file does not exist, create the file.
+  // S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH:
+  //   Create the file with permission of 0644
+  int target_fd = open(translated_target.c_str(), O_WRONLY | O_CREAT,
+                       S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+  if (target_fd < 0) {
+    close(src_fd);
+    return IOError(target, errno);
+  }
+  int rc = 0;
+  off_t offset = 0;
+  std::unique_ptr<char[]> buffer(new char[kPosixCopyFileBufferSize]);
+  while (offset < sbuf.st_size) {
+    // Use uint64 for safe compare SSIZE_MAX
+    uint64 chunk = sbuf.st_size - offset;
+    if (chunk > SSIZE_MAX) {
+      chunk = SSIZE_MAX;
+    }
+#if defined(__linux__) && !defined(__ANDROID__)
+    rc = sendfile(target_fd, src_fd, &offset, static_cast<size_t>(chunk));
+#else
+    if (chunk > kPosixCopyFileBufferSize) {
+      chunk = kPosixCopyFileBufferSize;
+    }
+    rc = read(src_fd, buffer.get(), static_cast<size_t>(chunk));
+    if (rc <= 0) {
+      break;
+    }
+    rc = write(target_fd, buffer.get(), static_cast<size_t>(chunk));
+    offset += chunk;
+#endif
+    if (rc <= 0) {
+      break;
+    }
+  }
+
+  Status result = Status::OK();
+  if (rc < 0) {
+    result = IOError(target, errno);
+  }
+
+  // Keep the error code
+  rc = close(target_fd);
+  if (rc < 0 && result == Status::OK()) {
+    result = IOError(target, errno);
+  }
+  rc = close(src_fd);
+  if (rc < 0 && result == Status::OK()) {
+    result = IOError(target, errno);
+  }
+
+  return result;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/posix/posix_file_system.h b/tensorflow/core/platform/posix/posix_file_system.h
index fe050fd5a0ee87efb339c9ee2b9e447fce803615..98ffa43b8acf8a10a4ace1bf11cc7d6f5e8a95a7 100644
--- a/tensorflow/core/platform/posix/posix_file_system.h
+++ b/tensorflow/core/platform/posix/posix_file_system.h
@@ -56,6 +56,8 @@ class PosixFileSystem : public FileSystem {
   Status GetFileSize(const string& fname, uint64* size) override;
 
   Status RenameFile(const string& src, const string& target) override;
+
+  Status CopyFile(const string& src, const string& target) override;
 };
 
 Status IOError(const string& context, int err_number);
diff --git a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc
index fb1955edde2abfd3fe5267e1319ea128138ee092..12dc9c58b38d01f6efc5644193fbf38b0e70c8d1 100644
--- a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc
+++ b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc
@@ -118,9 +118,10 @@ int64 AndroidArmV7ACpuUtilsHelper::ReadCpuFrequencyFile(
   const int retval = fscanf(fp, "%lld", &freq_in_khz);
   if (retval < 0) {
     LOG(WARNING) << "Failed to \"" << file_path << "\"";
+    fclose(fp);
     return INVALID_CPU_FREQUENCY;
   }
-  pclose(fp);
+  fclose(fp);
   return freq_in_khz * 1000;  // The file contains cpu frequency in khz
 }
 
diff --git a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h
index 8604b01c53ef69040a919dadda73df897e98b0e1..ce2069b004473a684a1882068d3479ed049c58d6 100644
--- a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h
+++ b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h
@@ -58,8 +58,8 @@ class AndroidArmV7ACpuUtilsHelper : public ICpuUtilsHelper {
   TF_DISALLOW_COPY_AND_ASSIGN(AndroidArmV7ACpuUtilsHelper);
 };
 
-}  // profile_utils
-}  // tensorflow
+}  // namespace profile_utils
+}  // namespace tensorflow
 
 #endif  // defined(__ANDROID__) && (__ANDROID_API__ >= 21) &&
         // (defined(__ARM_ARCH_7A__) || defined(__aarch64__))
diff --git a/tensorflow/core/platform/profile_utils/cpu_utils.cc b/tensorflow/core/platform/profile_utils/cpu_utils.cc
index d3362690d7e08c8e88e8168b62c8134b6af5a319..02de7d1362bbfca645d07ee72165283351944b9b 100644
--- a/tensorflow/core/platform/profile_utils/cpu_utils.cc
+++ b/tensorflow/core/platform/profile_utils/cpu_utils.cc
@@ -28,15 +28,17 @@ namespace profile_utils {
 
 static ICpuUtilsHelper* cpu_utils_helper_instance_ = nullptr;
 
-#if (defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || (defined(__s390x__))
-   /* static */ uint64 CpuUtils::GetCycleCounterFrequency() {
-     static const uint64 cpu_frequency = GetCycleCounterFrequencyImpl();
-     return cpu_frequency;
+#if (defined(__powerpc__) ||                                             \
+     defined(__ppc__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \
+    (defined(__s390x__))
+/* static */ uint64 CpuUtils::GetCycleCounterFrequency() {
+  static const uint64 cpu_frequency = GetCycleCounterFrequencyImpl();
+  return cpu_frequency;
 }
 #else
-   /* static */ int64 CpuUtils::GetCycleCounterFrequency() {
-     static const int64 cpu_frequency = GetCycleCounterFrequencyImpl();
-     return cpu_frequency;
+/* static */ int64 CpuUtils::GetCycleCounterFrequency() {
+  static const int64 cpu_frequency = GetCycleCounterFrequencyImpl();
+  return cpu_frequency;
 }
 #endif
 
diff --git a/tensorflow/core/platform/profile_utils/cpu_utils.h b/tensorflow/core/platform/profile_utils/cpu_utils.h
index 5d215b4804dbee8cb785c99b09ec725101bacb4e..7b580c8bf606cdd9acf998fa21cb1d946e5e6ada 100644
--- a/tensorflow/core/platform/profile_utils/cpu_utils.h
+++ b/tensorflow/core/platform/profile_utils/cpu_utils.h
@@ -42,7 +42,7 @@ namespace profile_utils {
 class CpuUtils {
  public:
   // Constant for invalid frequency.
-  // This value is returned when the furequency is not obtained somehow.
+  // This value is returned when the frequency is not obtained somehow.
   static constexpr int64 INVALID_FREQUENCY = -1;
   static constexpr uint64 DUMMY_CYCLE_CLOCK = 1;
 
@@ -94,16 +94,18 @@ class CpuUtils {
 #endif
   }
 
-  // Return cycle counter frequency.
-  // As this method caches the cpu frequency internally,
-  // the first call will incur overhead, but not subsequent calls.
-  #if (defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || (defined(__s390x__))
-     static uint64 GetCycleCounterFrequency();
-  #else
-     static int64 GetCycleCounterFrequency();
-  #endif
+// Return cycle counter frequency.
+// As this method caches the cpu frequency internally,
+// the first call will incur overhead, but not subsequent calls.
+#if (defined(__powerpc__) ||                                             \
+     defined(__ppc__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \
+    (defined(__s390x__))
+  static uint64 GetCycleCounterFrequency();
+#else
+  static int64 GetCycleCounterFrequency();
+#endif
 
-  // Return micro secound per each clock
+  // Return micro second per each clock
   // As this method caches the cpu frequency internally,
   // the first call will incur overhead, but not subsequent calls.
   static double GetMicroSecPerClock();
diff --git a/tensorflow/core/platform/profile_utils/cpu_utils_test.cc b/tensorflow/core/platform/profile_utils/cpu_utils_test.cc
index 5b11b684dd9833bf742faaeaa3e79d2b49a78c6d..eb8161fbfd5ddfc796edd66a9119ad70c3c1de8e 100644
--- a/tensorflow/core/platform/profile_utils/cpu_utils_test.cc
+++ b/tensorflow/core/platform/profile_utils/cpu_utils_test.cc
@@ -53,15 +53,17 @@ TEST_F(CpuUtilsTest, CheckGetCurrentClockCycle) {
 }
 
 TEST_F(CpuUtilsTest, CheckCycleCounterFrequency) {
-  #if (defined(__powerpc__) || defined(__ppc__) && ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || (defined(__s390x__))
-     const uint64 cpu_frequency = CpuUtils::GetCycleCounterFrequency();
-     CHECK_GT(cpu_frequency, 0);
-     CHECK_NE(cpu_frequency, unsigned(CpuUtils::INVALID_FREQUENCY));
-  #else
-     const int64 cpu_frequency = CpuUtils::GetCycleCounterFrequency();
-     CHECK_GT(cpu_frequency, 0);
-     CHECK_NE(cpu_frequency, CpuUtils::INVALID_FREQUENCY);
-  #endif
+#if (defined(__powerpc__) ||                                             \
+     defined(__ppc__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \
+    (defined(__s390x__))
+  const uint64 cpu_frequency = CpuUtils::GetCycleCounterFrequency();
+  CHECK_GT(cpu_frequency, 0);
+  CHECK_NE(cpu_frequency, unsigned(CpuUtils::INVALID_FREQUENCY));
+#else
+  const int64 cpu_frequency = CpuUtils::GetCycleCounterFrequency();
+  CHECK_GT(cpu_frequency, 0);
+  CHECK_NE(cpu_frequency, CpuUtils::INVALID_FREQUENCY);
+#endif
   if (DBG) {
     LOG(INFO) << "Cpu frequency = " << cpu_frequency;
   }
diff --git a/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h b/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h
index 51c54d50d1dadcf78e8263ce44b07c998b68c05c..11b739c0096b5b5fd498bb5c753a54c8b1628208 100644
--- a/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h
+++ b/tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h
@@ -47,7 +47,7 @@ class ICpuUtilsHelper {
   TF_DISALLOW_COPY_AND_ASSIGN(ICpuUtilsHelper);
 };
 
-}  // profile_utils
-}  // tensorflow
+}  // namespace profile_utils
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_PLATFORM_PROFILEUTILS_I_CPU_UTILS_HELPER_H__
diff --git a/tensorflow/core/platform/protobuf_internal.h b/tensorflow/core/platform/protobuf_internal.h
index 7d6e8f57a62e08a7897bdccdeb7033363b282bd4..2f151a5aee6af067e4536bb569b4c0799c831b98 100644
--- a/tensorflow/core/platform/protobuf_internal.h
+++ b/tensorflow/core/platform/protobuf_internal.h
@@ -45,8 +45,8 @@ Status ParseAny(const google::protobuf::Any& any, T* message,
 #ifdef TENSORFLOW_LITE_PROTOS
   if (any.type_url() != strings::StrCat("type.googleapis.com/", type_name)) {
     return errors::FailedPrecondition(
-        "Expected Any type_url for: ", type_name, ". Got: ",
-        string(any.type_url().data(), any.type_url().size()), ".");
+        "Expected Any type_url for: ", type_name,
+        ". Got: ", string(any.type_url().data(), any.type_url().size()), ".");
   }
   if (!message->ParseFromString(any.value())) {
     return errors::FailedPrecondition("Failed to unpack: ",
diff --git a/tensorflow/core/platform/s3/BUILD b/tensorflow/core/platform/s3/BUILD
index b7bc1a11d6583787e2c0fb07d004dc2badc5bcca..3a0ad2e9bd09211aa452f8b39b621343a113785d 100644
--- a/tensorflow/core/platform/s3/BUILD
+++ b/tensorflow/core/platform/s3/BUILD
@@ -28,6 +28,8 @@ filegroup(
 tf_cc_binary(
     name = "s3_file_system.so",
     srcs = [
+        "aws_logging.cc",
+        "aws_logging.h",
         "s3_crypto.cc",
         "s3_crypto.h",
         "s3_file_system.cc",
@@ -43,8 +45,8 @@ tf_cc_binary(
     linkshared = 1,
     deps = [
         "//tensorflow/core:framework_headers_lib",
-        "@aws//:aws",
-        "@curl//:curl",
+        "@aws",
+        "@curl",
         "@protobuf_archive//:protobuf_headers",
     ],
 )
@@ -60,12 +62,28 @@ cc_library(
     deps = [
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "@aws//:aws",
+        "@aws",
         "@boringssl//:crypto",
     ],
     alwayslink = 1,
 )
 
+cc_library(
+    name = "aws_logging",
+    srcs = [
+        "aws_logging.cc",
+    ],
+    hdrs = [
+        "aws_logging.h",
+    ],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "@aws",
+    ],
+    alwayslink = 1,
+)
+
 cc_library(
     name = "s3_file_system",
     srcs = [
@@ -75,10 +93,11 @@ cc_library(
         "s3_file_system.h",
     ],
     deps = [
+        ":aws_logging",
         ":s3_crypto",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "@aws//:aws",
+        "@aws",
     ],
     alwayslink = 1,
 )
@@ -98,6 +117,6 @@ tf_cc_test(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
-        "@aws//:aws",
+        "@aws",
     ],
 )
diff --git a/tensorflow/core/platform/s3/aws_logging.cc b/tensorflow/core/platform/s3/aws_logging.cc
new file mode 100644
index 0000000000000000000000000000000000000000..44317f1a3e41831b903bd0044d53d1eba80168df
--- /dev/null
+++ b/tensorflow/core/platform/s3/aws_logging.cc
@@ -0,0 +1,122 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/platform/s3/aws_logging.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
+
+#include <aws/core/Aws.h>
+#include <aws/core/utils/logging/AWSLogging.h>
+#include <aws/core/utils/logging/LogSystemInterface.h>
+
+#include <cstdarg>
+
+namespace tensorflow {
+
+AWSLogSystem::AWSLogSystem(Aws::Utils::Logging::LogLevel log_level)
+    : log_level_(log_level) {}
+
+void AWSLogSystem::Log(Aws::Utils::Logging::LogLevel log_level, const char* tag,
+                       const char* format, ...) {
+  std::va_list args;
+  va_start(args, format);
+
+  const string s = strings::Printf(format, args);
+
+  va_end(args);
+
+  LogMessage(log_level, s);
+}
+
+void AWSLogSystem::LogStream(Aws::Utils::Logging::LogLevel log_level,
+                             const char* tag,
+                             const Aws::OStringStream& message_stream) {
+  LogMessage(log_level, message_stream.rdbuf()->str().c_str());
+}
+
+void AWSLogSystem::LogMessage(Aws::Utils::Logging::LogLevel log_level,
+                              const std::string& message) {
+  if (message == "Initializing Curl library") return;
+  switch (log_level) {
+    case Aws::Utils::Logging::LogLevel::Info:
+      LOG(INFO) << message;
+      break;
+    case Aws::Utils::Logging::LogLevel::Warn:
+      LOG(WARNING) << message;
+      break;
+    case Aws::Utils::Logging::LogLevel::Error:
+      LOG(ERROR) << message;
+      break;
+    case Aws::Utils::Logging::LogLevel::Fatal:
+      LOG(FATAL) << message;
+      break;
+    default:
+      LOG(ERROR) << message;
+      break;
+  }
+}
+
+namespace {
+static const char* kAWSLoggingTag = "AWSLogging";
+
+Aws::Utils::Logging::LogLevel ParseLogLevelFromEnv() {
+  Aws::Utils::Logging::LogLevel log_level = Aws::Utils::Logging::LogLevel::Info;
+
+  const int64_t level = tensorflow::internal::MinLogLevelFromEnv();
+
+  switch (level) {
+    case INFO:
+      log_level = Aws::Utils::Logging::LogLevel::Info;
+      break;
+    case WARNING:
+      log_level = Aws::Utils::Logging::LogLevel::Warn;
+      break;
+    case ERROR:
+      log_level = Aws::Utils::Logging::LogLevel::Error;
+      break;
+    case FATAL:
+      log_level = Aws::Utils::Logging::LogLevel::Fatal;
+      break;
+    default:
+      log_level = Aws::Utils::Logging::LogLevel::Info;
+      break;
+  }
+
+  return log_level;
+}
+}  // namespace
+
+static bool initialized = false;
+static mutex s3_logging_mutex(LINKER_INITIALIZED);
+void AWSLogSystem::InitializeAWSLogging() {
+  std::lock_guard<mutex> s3_logging_lock(s3_logging_mutex);
+  if (!initialized) {
+    Aws::Utils::Logging::InitializeAWSLogging(
+        Aws::MakeShared<AWSLogSystem>(kAWSLoggingTag, ParseLogLevelFromEnv()));
+    initialized = true;
+    return;
+  }
+}
+
+void AWSLogSystem::ShutdownAWSLogging() {
+  std::lock_guard<mutex> s3_logging_lock(s3_logging_mutex);
+  if (initialized) {
+    Aws::Utils::Logging::ShutdownAWSLogging();
+    initialized = false;
+    return;
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/s3/aws_logging.h b/tensorflow/core/platform/s3/aws_logging.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0da8f3c83524df682e65878f39a2f500aa64a6b
--- /dev/null
+++ b/tensorflow/core/platform/s3/aws_logging.h
@@ -0,0 +1,68 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_S3_S3_LOGGING_H_
+#define TENSORFLOW_CONTRIB_S3_S3_LOGGING_H_
+
+#include <atomic>
+#include <string>
+
+#include <aws/core/utils/logging/LogLevel.h>
+#include <aws/core/utils/logging/LogSystemInterface.h>
+#include "tensorflow/core/platform/default/logging.h"
+
+namespace tensorflow {
+
+class AWSLogSystem : public Aws::Utils::Logging::LogSystemInterface {
+ public:
+  static void InitializeAWSLogging();
+  static void ShutdownAWSLogging();
+
+  explicit AWSLogSystem(Aws::Utils::Logging::LogLevel log_level);
+  virtual ~AWSLogSystem() = default;
+
+  // Gets the currently configured log level.
+  virtual Aws::Utils::Logging::LogLevel GetLogLevel(void) const override {
+    return log_level_;
+  }
+
+  // Set a new log level. This has the immediate effect of changing the log.
+  void SetLogLevel(Aws::Utils::Logging::LogLevel log_level) {
+    log_level_.store(log_level);
+  }
+
+  // Does a printf style output to ProcessFormattedStatement. Don't use this,
+  // it's unsafe. See LogStream.
+  // Since non-static C++ methods have an implicit this argument,
+  // TF_PRINTF_ATTRIBUTE should be counted from two (vs. one).
+  virtual void Log(Aws::Utils::Logging::LogLevel log_level, const char* tag,
+                   const char* format, ...) override TF_PRINTF_ATTRIBUTE(4, 5);
+
+  // Writes the stream to ProcessFormattedStatement.
+  virtual void LogStream(Aws::Utils::Logging::LogLevel log_level,
+                         const char* tag,
+                         const Aws::OStringStream& messageStream) override;
+
+ private:
+  void LogMessage(Aws::Utils::Logging::LogLevel log_level,
+                  const string& message);
+  std::atomic<Aws::Utils::Logging::LogLevel> log_level_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(AWSLogSystem);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_S3_S3_LOGGING_H_
diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc
index 234f3c3aed7f036892227dd2ba96a3e1393517b4..301fcb9dbf653d29f6ac5321332c8764adaad681 100644
--- a/tensorflow/core/platform/s3/s3_file_system.cc
+++ b/tensorflow/core/platform/s3/s3_file_system.cc
@@ -12,13 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/platform/s3/s3_file_system.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/s3/s3_file_system.h"
+#include "tensorflow/core/platform/s3/aws_logging.h"
 #include "tensorflow/core/platform/s3/s3_crypto.h"
 
 #include <aws/core/Aws.h>
+#include <aws/core/config/AWSProfileConfigLoader.h>
 #include <aws/core/utils/FileSystemUtils.h>
+#include <aws/core/utils/StringUtils.h>
+#include <aws/core/utils/logging/AWSLogging.h>
+#include <aws/core/utils/logging/LogSystemInterface.h>
+#include <aws/core/utils/StringUtils.h>
 #include <aws/s3/S3Client.h>
 #include <aws/s3/S3Errors.h>
 #include <aws/s3/model/CopyObjectRequest.h>
@@ -33,6 +40,7 @@ limitations under the License.
 
 namespace tensorflow {
 
+namespace {
 static const char* kS3FileSystemAllocationTag = "S3FileSystemAllocation";
 static const size_t kS3ReadAppendableFileBufferSize = 1024 * 1024;
 static const int kS3GetChildrenMaxKeys = 100;
@@ -49,9 +57,39 @@ Aws::Client::ClientConfiguration& GetDefaultClientConfig() {
     if (endpoint) {
       cfg.endpointOverride = Aws::String(endpoint);
     }
-    const char* region = getenv("S3_REGION");
+    const char* region = getenv("AWS_REGION");
+    if (!region) {
+      // TODO (yongtang): `S3_REGION` should be deprecated after 2.0.
+      region = getenv("S3_REGION");
+    }
     if (region) {
       cfg.region = Aws::String(region);
+    } else {
+      // Load config file (e.g., ~/.aws/config) only if AWS_SDK_LOAD_CONFIG
+      // is set with a truthy value.
+      const char* load_config_env = getenv("AWS_SDK_LOAD_CONFIG");
+      string load_config =
+          load_config_env ? str_util::Lowercase(load_config_env) : "";
+      if (load_config == "true" || load_config == "1") {
+        Aws::String config_file;
+        // If AWS_CONFIG_FILE is set then use it, otherwise use ~/.aws/config.
+        const char* config_file_env = getenv("AWS_CONFIG_FILE");
+        if (config_file_env) {
+          config_file = config_file_env;
+        } else {
+          const char* home_env = getenv("HOME");
+          if (home_env) {
+            config_file = home_env;
+            config_file += "/.aws/config";
+          }
+        }
+        Aws::Config::AWSConfigFileProfileConfigLoader loader(config_file);
+        loader.Load();
+        auto profiles = loader.GetProfiles();
+        if (!profiles["default"].GetRegion().empty()) {
+          cfg.region = profiles["default"].GetRegion();
+        }
+      }
     }
     const char* use_https = getenv("S3_USE_HTTPS");
     if (use_https) {
@@ -69,6 +107,22 @@ Aws::Client::ClientConfiguration& GetDefaultClientConfig() {
         cfg.verifySSL = true;
       }
     }
+    const char* connect_timeout = getenv("S3_CONNECT_TIMEOUT_MSEC");
+    if (connect_timeout) {
+      int64 timeout;
+
+      if (strings::safe_strto64(connect_timeout, &timeout)) {
+        cfg.connectTimeoutMs = timeout;
+      }
+    }
+    const char* request_timeout = getenv("S3_REQUEST_TIMEOUT_MSEC");
+    if (request_timeout) {
+      int64 timeout;
+
+      if (strings::safe_strto64(request_timeout, &timeout)) {
+        cfg.requestTimeoutMs = timeout;
+      }
+    }
 
     init = true;
   }
@@ -76,6 +130,15 @@ Aws::Client::ClientConfiguration& GetDefaultClientConfig() {
   return cfg;
 };
 
+void ShutdownClient(Aws::S3::S3Client* s3_client) {
+  if (s3_client != nullptr) {
+    delete s3_client;
+    Aws::SDKOptions options;
+    Aws::ShutdownAPI(options);
+    AWSLogSystem::ShutdownAWSLogging();
+  }
+}
+
 Status ParseS3Path(const string& fname, bool empty_object_ok, string* bucket,
                    string* object) {
   if (!bucket || !object) {
@@ -103,12 +166,12 @@ Status ParseS3Path(const string& fname, bool empty_object_ok, string* bucket,
 
 class S3RandomAccessFile : public RandomAccessFile {
  public:
-  S3RandomAccessFile(const string& bucket, const string& object)
-      : bucket_(bucket), object_(object) {}
+  S3RandomAccessFile(const string& bucket, const string& object,
+                     std::shared_ptr<Aws::S3::S3Client> s3_client)
+      : bucket_(bucket), object_(object), s3_client_(s3_client) {}
 
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
-    Aws::S3::S3Client s3Client(GetDefaultClientConfig());
     Aws::S3::Model::GetObjectRequest getObjectRequest;
     getObjectRequest.WithBucket(bucket_.c_str()).WithKey(object_.c_str());
     string bytes = strings::StrCat("bytes=", offset, "-", offset + n - 1);
@@ -116,7 +179,7 @@ class S3RandomAccessFile : public RandomAccessFile {
     getObjectRequest.SetResponseStreamFactory([]() {
       return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag);
     });
-    auto getObjectOutcome = s3Client.GetObject(getObjectRequest);
+    auto getObjectOutcome = this->s3_client_->GetObject(getObjectRequest);
     if (!getObjectOutcome.IsSuccess()) {
       n = 0;
       *result = StringPiece(scratch, n);
@@ -134,13 +197,16 @@ class S3RandomAccessFile : public RandomAccessFile {
  private:
   string bucket_;
   string object_;
+  std::shared_ptr<Aws::S3::S3Client> s3_client_;
 };
 
 class S3WritableFile : public WritableFile {
  public:
-  S3WritableFile(const string& bucket, const string& object)
+  S3WritableFile(const string& bucket, const string& object,
+                 std::shared_ptr<Aws::S3::S3Client> s3_client)
       : bucket_(bucket),
         object_(object),
+        s3_client_(s3_client),
         sync_needed_(true),
         outfile_(Aws::MakeShared<Aws::Utils::TempFile>(
             kS3FileSystemAllocationTag, "/tmp/s3_filesystem_XXXXXX",
@@ -179,17 +245,13 @@ class S3WritableFile : public WritableFile {
     if (!sync_needed_) {
       return Status::OK();
     }
-    Aws::Client::ClientConfiguration clientConfig = GetDefaultClientConfig();
-    clientConfig.connectTimeoutMs = 300000;
-    clientConfig.requestTimeoutMs = 600000;
-    Aws::S3::S3Client s3Client(clientConfig);
     Aws::S3::Model::PutObjectRequest putObjectRequest;
     putObjectRequest.WithBucket(bucket_.c_str()).WithKey(object_.c_str());
     long offset = outfile_->tellp();
     outfile_->seekg(0);
     putObjectRequest.SetBody(outfile_);
     putObjectRequest.SetContentLength(offset);
-    auto putObjectOutcome = s3Client.PutObject(putObjectRequest);
+    auto putObjectOutcome = this->s3_client_->PutObject(putObjectRequest);
     outfile_->clear();
     outfile_->seekp(offset);
     if (!putObjectOutcome.IsSuccess()) {
@@ -204,6 +266,7 @@ class S3WritableFile : public WritableFile {
  private:
   string bucket_;
   string object_;
+  std::shared_ptr<Aws::S3::S3Client> s3_client_;
   bool sync_needed_;
   std::shared_ptr<Aws::Utils::TempFile> outfile_;
 };
@@ -220,27 +283,48 @@ class S3ReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
   uint64 length_;
 };
 
-S3FileSystem::S3FileSystem() {
-  Aws::SDKOptions options;
-  options.cryptoOptions.sha256Factory_create_fn = []() {
-    return Aws::MakeShared<S3SHA256Factory>(S3CryptoAllocationTag);
-  };
-  options.cryptoOptions.sha256HMACFactory_create_fn = []() {
-    return Aws::MakeShared<S3SHA256HmacFactory>(S3CryptoAllocationTag);
-  };
-  Aws::InitAPI(options);
-}
+}  // namespace
+
+S3FileSystem::S3FileSystem()
+    : s3_client_(nullptr, ShutdownClient), client_lock_() {}
+
+S3FileSystem::~S3FileSystem() {}
+
+// Initializes s3_client_, if needed, and returns it.
+std::shared_ptr<Aws::S3::S3Client> S3FileSystem::GetS3Client() {
+  std::lock_guard<mutex> lock(this->client_lock_);
+
+  if (this->s3_client_.get() == nullptr) {
+    AWSLogSystem::InitializeAWSLogging();
+
+    Aws::SDKOptions options;
+    options.cryptoOptions.sha256Factory_create_fn = []() {
+      return Aws::MakeShared<S3SHA256Factory>(S3CryptoAllocationTag);
+    };
+    options.cryptoOptions.sha256HMACFactory_create_fn = []() {
+      return Aws::MakeShared<S3SHA256HmacFactory>(S3CryptoAllocationTag);
+    };
+    Aws::InitAPI(options);
+
+    // The creation of S3Client disables virtual addressing:
+    //   S3Client(clientConfiguration, signPayloads, useVirtualAdressing = true)
+    // The purpose is to address the issue encountered when there is an `.`
+    // in the bucket name. Due to TLS hostname validation or DNS rules,
+    // the bucket may not be resolved. Disabling of virtual addressing
+    // should address the issue. See GitHub issue 16397 for details.
+    this->s3_client_ = std::shared_ptr<Aws::S3::S3Client>(new Aws::S3::S3Client(
+        GetDefaultClientConfig(),
+        Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, false));
+  }
 
-S3FileSystem::~S3FileSystem() {
-  Aws::SDKOptions options;
-  Aws::ShutdownAPI(options);
+  return this->s3_client_;
 }
 
 Status S3FileSystem::NewRandomAccessFile(
     const string& fname, std::unique_ptr<RandomAccessFile>* result) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
-  result->reset(new S3RandomAccessFile(bucket, object));
+  result->reset(new S3RandomAccessFile(bucket, object, this->GetS3Client()));
   return Status::OK();
 }
 
@@ -248,7 +332,7 @@ Status S3FileSystem::NewWritableFile(const string& fname,
                                      std::unique_ptr<WritableFile>* result) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
-  result->reset(new S3WritableFile(bucket, object));
+  result->reset(new S3WritableFile(bucket, object, this->GetS3Client()));
   return Status::OK();
 }
 
@@ -263,7 +347,7 @@ Status S3FileSystem::NewAppendableFile(const string& fname,
 
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
-  result->reset(new S3WritableFile(bucket, object));
+  result->reset(new S3WritableFile(bucket, object, this->GetS3Client()));
 
   while (true) {
     status = reader->Read(offset, kS3ReadAppendableFileBufferSize, &read_chunk,
@@ -314,7 +398,6 @@ Status S3FileSystem::GetChildren(const string& dir,
     prefix.push_back('/');
   }
 
-  Aws::S3::S3Client s3Client(GetDefaultClientConfig());
   Aws::S3::Model::ListObjectsRequest listObjectsRequest;
   listObjectsRequest.WithBucket(bucket.c_str())
       .WithPrefix(prefix.c_str())
@@ -325,7 +408,8 @@ Status S3FileSystem::GetChildren(const string& dir,
 
   Aws::S3::Model::ListObjectsResult listObjectsResult;
   do {
-    auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest);
+    auto listObjectsOutcome =
+        this->GetS3Client()->ListObjects(listObjectsRequest);
     if (!listObjectsOutcome.IsSuccess()) {
       string error = strings::StrCat(
           listObjectsOutcome.GetError().GetExceptionName().c_str(), ": ",
@@ -359,11 +443,10 @@ Status S3FileSystem::Stat(const string& fname, FileStatistics* stats) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseS3Path(fname, true, &bucket, &object));
 
-  Aws::S3::S3Client s3Client(GetDefaultClientConfig());
   if (object.empty()) {
     Aws::S3::Model::HeadBucketRequest headBucketRequest;
     headBucketRequest.WithBucket(bucket.c_str());
-    auto headBucketOutcome = s3Client.HeadBucket(headBucketRequest);
+    auto headBucketOutcome = this->GetS3Client()->HeadBucket(headBucketRequest);
     if (!headBucketOutcome.IsSuccess()) {
       string error = strings::StrCat(
           headBucketOutcome.GetError().GetExceptionName().c_str(), ": ",
@@ -381,7 +464,7 @@ Status S3FileSystem::Stat(const string& fname, FileStatistics* stats) {
   headObjectRequest.WithBucket(bucket.c_str()).WithKey(object.c_str());
   headObjectRequest.SetResponseStreamFactory(
       []() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); });
-  auto headObjectOutcome = s3Client.HeadObject(headObjectRequest);
+  auto headObjectOutcome = this->GetS3Client()->HeadObject(headObjectRequest);
   if (headObjectOutcome.IsSuccess()) {
     stats->length = headObjectOutcome.GetResult().GetContentLength();
     stats->is_directory = 0;
@@ -399,7 +482,8 @@ Status S3FileSystem::Stat(const string& fname, FileStatistics* stats) {
       .WithMaxKeys(1);
   listObjectsRequest.SetResponseStreamFactory(
       []() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); });
-  auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest);
+  auto listObjectsOutcome =
+      this->GetS3Client()->ListObjects(listObjectsRequest);
   if (listObjectsOutcome.IsSuccess()) {
     if (listObjectsOutcome.GetResult().GetContents().size() > 0) {
       stats->length = 0;
@@ -417,11 +501,11 @@ Status S3FileSystem::DeleteFile(const string& fname) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
 
-  Aws::S3::S3Client s3Client(GetDefaultClientConfig());
   Aws::S3::Model::DeleteObjectRequest deleteObjectRequest;
   deleteObjectRequest.WithBucket(bucket.c_str()).WithKey(object.c_str());
 
-  auto deleteObjectOutcome = s3Client.DeleteObject(deleteObjectRequest);
+  auto deleteObjectOutcome =
+      this->GetS3Client()->DeleteObject(deleteObjectRequest);
   if (!deleteObjectOutcome.IsSuccess()) {
     string error = strings::StrCat(
         deleteObjectOutcome.GetError().GetExceptionName().c_str(), ": ",
@@ -436,10 +520,9 @@ Status S3FileSystem::CreateDir(const string& dirname) {
   TF_RETURN_IF_ERROR(ParseS3Path(dirname, true, &bucket, &object));
 
   if (object.empty()) {
-    Aws::S3::S3Client s3Client(GetDefaultClientConfig());
     Aws::S3::Model::HeadBucketRequest headBucketRequest;
     headBucketRequest.WithBucket(bucket.c_str());
-    auto headBucketOutcome = s3Client.HeadBucket(headBucketRequest);
+    auto headBucketOutcome = this->GetS3Client()->HeadBucket(headBucketRequest);
     if (!headBucketOutcome.IsSuccess()) {
       return errors::NotFound("The bucket ", bucket, " was not found.");
     }
@@ -459,7 +542,6 @@ Status S3FileSystem::DeleteDir(const string& dirname) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseS3Path(dirname, false, &bucket, &object));
 
-  Aws::S3::S3Client s3Client(GetDefaultClientConfig());
   string prefix = object;
   if (prefix.back() != '/') {
     prefix.push_back('/');
@@ -470,7 +552,8 @@ Status S3FileSystem::DeleteDir(const string& dirname) {
       .WithMaxKeys(2);
   listObjectsRequest.SetResponseStreamFactory(
       []() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); });
-  auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest);
+  auto listObjectsOutcome =
+      this->GetS3Client()->ListObjects(listObjectsRequest);
   if (listObjectsOutcome.IsSuccess()) {
     auto contents = listObjectsOutcome.GetResult().GetContents();
     if (contents.size() > 1 ||
@@ -510,8 +593,6 @@ Status S3FileSystem::RenameFile(const string& src, const string& target) {
     }
   }
 
-  Aws::S3::S3Client s3Client(GetDefaultClientConfig());
-
   Aws::S3::Model::CopyObjectRequest copyObjectRequest;
   Aws::S3::Model::DeleteObjectRequest deleteObjectRequest;
 
@@ -524,7 +605,8 @@ Status S3FileSystem::RenameFile(const string& src, const string& target) {
 
   Aws::S3::Model::ListObjectsResult listObjectsResult;
   do {
-    auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest);
+    auto listObjectsOutcome =
+        this->GetS3Client()->ListObjects(listObjectsRequest);
     if (!listObjectsOutcome.IsSuccess()) {
       string error = strings::StrCat(
           listObjectsOutcome.GetError().GetExceptionName().c_str(), ": ",
@@ -537,13 +619,15 @@ Status S3FileSystem::RenameFile(const string& src, const string& target) {
       Aws::String src_key = object.GetKey();
       Aws::String target_key = src_key;
       target_key.replace(0, src_object.length(), target_object.c_str());
-      Aws::String source = Aws::String(src_bucket.c_str()) + "/" + src_key;
+      Aws::String source = Aws::String(src_bucket.c_str()) + "/" +
+                           Aws::Utils::StringUtils::URLEncode(src_key.c_str());
 
       copyObjectRequest.SetBucket(target_bucket.c_str());
       copyObjectRequest.SetKey(target_key);
       copyObjectRequest.SetCopySource(source);
 
-      auto copyObjectOutcome = s3Client.CopyObject(copyObjectRequest);
+      auto copyObjectOutcome =
+          this->GetS3Client()->CopyObject(copyObjectRequest);
       if (!copyObjectOutcome.IsSuccess()) {
         string error = strings::StrCat(
             copyObjectOutcome.GetError().GetExceptionName().c_str(), ": ",
@@ -554,7 +638,8 @@ Status S3FileSystem::RenameFile(const string& src, const string& target) {
       deleteObjectRequest.SetBucket(src_bucket.c_str());
       deleteObjectRequest.SetKey(src_key.c_str());
 
-      auto deleteObjectOutcome = s3Client.DeleteObject(deleteObjectRequest);
+      auto deleteObjectOutcome =
+          this->GetS3Client()->DeleteObject(deleteObjectRequest);
       if (!deleteObjectOutcome.IsSuccess()) {
         string error = strings::StrCat(
             deleteObjectOutcome.GetError().GetExceptionName().c_str(), ": ",
diff --git a/tensorflow/core/platform/s3/s3_file_system.h b/tensorflow/core/platform/s3/s3_file_system.h
index 31ba3cecc5d0283f83091bfb687445a6ce87a344..31264be621d93c1efb68f7b0b49e28cb65b05de1 100644
--- a/tensorflow/core/platform/s3/s3_file_system.h
+++ b/tensorflow/core/platform/s3/s3_file_system.h
@@ -16,7 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_S3_S3_FILE_SYSTEM_H_
 #define TENSORFLOW_CONTRIB_S3_S3_FILE_SYSTEM_H_
 
+#include <aws/s3/S3Client.h>
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
 
 namespace tensorflow {
 
@@ -53,6 +55,26 @@ class S3FileSystem : public FileSystem {
   Status GetFileSize(const string& fname, uint64* size) override;
 
   Status RenameFile(const string& src, const string& target) override;
+
+ private:
+  // Returns the member S3 client, initializing as-needed.
+  // When the client tries to access the object in S3, e.g.,
+  //   s3://bucket-name/path/to/object
+  // the behavior could be controlled by various environmental
+  // variables.
+  // By default S3 access regional endpoint, with region
+  // controlled by `AWS_REGION`. The endpoint could be overridden
+  // explicitly with `S3_ENDPOINT`. S3 uses HTTPS by default.
+  // If S3_USE_HTTPS=0 is specified, HTTP is used. Also,
+  // S3_VERIFY_SSL=0 could disable SSL verification in case
+  // HTTPS is used.
+  // This S3 Client does not support Virtual Hosted–Style Method
+  // for a bucket.
+  std::shared_ptr<Aws::S3::S3Client> GetS3Client();
+
+  std::shared_ptr<Aws::S3::S3Client> s3_client_;
+  // Lock held when checking for s3_client_ initialization.
+  mutex client_lock_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/s3/s3_file_system_test.cc b/tensorflow/core/platform/s3/s3_file_system_test.cc
index 0b42f5fcec0041a01a571b1e38dedaa7ef191c22..d4411d98657811c0bf6858c5ac48c7991e8bed5a 100644
--- a/tensorflow/core/platform/s3/s3_file_system_test.cc
+++ b/tensorflow/core/platform/s3/s3_file_system_test.cc
@@ -130,6 +130,8 @@ TEST_F(S3FileSystemTest, NewReadOnlyMemoryRegionFromFile) {
 
 TEST_F(S3FileSystemTest, FileExists) {
   const string fname = TmpDir("FileExists");
+  // Ensure the file doesn't yet exist.
+  TF_ASSERT_OK(s3fs.DeleteFile(fname));
   EXPECT_EQ(error::Code::NOT_FOUND, s3fs.FileExists(fname).code());
   TF_ASSERT_OK(WriteString(fname, "test"));
   TF_EXPECT_OK(s3fs.FileExists(fname));
diff --git a/tensorflow/core/platform/setround.cc b/tensorflow/core/platform/setround.cc
index 0c66da09bb9aa1c892063be11c66aedaf75d7eb6..592626bfa17e691d1b10ddce5c7f0f31ed825861 100644
--- a/tensorflow/core/platform/setround.cc
+++ b/tensorflow/core/platform/setround.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/core/platform/setround.h"
 
-
 namespace tensorflow {
 namespace port {
 
diff --git a/tensorflow/core/platform/stacktrace_handler.cc b/tensorflow/core/platform/stacktrace_handler.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ff31c97be0a76b425503120c326a79f5a62d3377
--- /dev/null
+++ b/tensorflow/core/platform/stacktrace_handler.cc
@@ -0,0 +1,135 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/platform.h"
+
+#if !defined(PLATFORM_GOOGLE) && !defined(IS_MOBILE_PLATFORM) && \
+    defined(PLATFORM_POSIX) && (defined(__clang__) || defined(__GNUC__))
+#define TF_GENERATE_STACKTRACE
+#endif
+
+#if defined(TF_GENERATE_STACKTRACE)
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <string>
+
+#include "tensorflow/core/platform/abi.h"
+#include "tensorflow/core/platform/stacktrace.h"
+
+#endif  // defined(TF_GENERATE_STACKTRACE)
+
+namespace tensorflow {
+namespace testing {
+
+#if defined(TF_GENERATE_STACKTRACE)
+// This function will print stacktrace to STDERR.
+// It avoids using malloc, so it makes sure to dump the stack even when the heap
+// is corrupted. However, it can dump mangled symbols.
+inline void SafePrintStackTrace() {
+  static const char begin_msg[] = "*** BEGIN MANGLED STACK TRACE ***\n";
+  (void)write(STDERR_FILENO, begin_msg, strlen(begin_msg));
+
+  int buffer_size = 128;
+  void *trace[128];
+  // Run backtrace to get the size of the stacktrace
+  buffer_size = backtrace(trace, buffer_size);
+
+  // Print a mangled stacktrace to STDERR as safely as possible.
+  backtrace_symbols_fd(trace, buffer_size, STDERR_FILENO);
+
+  static const char end_msg[] = "*** END MANGLED STACK TRACE ***\n\n";
+  (void)write(STDERR_FILENO, end_msg, strlen(end_msg));
+}
+
+static void StacktraceHandler(int sig, siginfo_t *si, void *v) {
+  // Make sure our handler does not deadlock. And this should be the last thing
+  // our program does. Therefore, set a timer to kill the program in 60
+  // seconds.
+  struct itimerval timer;
+  timer.it_value.tv_sec = 60;
+  timer.it_value.tv_usec = 0;
+  timer.it_interval.tv_sec = 0;
+  timer.it_interval.tv_usec = 0;
+  setitimer(ITIMER_REAL, &timer, 0);
+
+  struct sigaction sa_timeout;
+  memset(&sa_timeout, 0, sizeof(sa_timeout));
+  sa_timeout.sa_handler = SIG_DFL;
+  sigaction(SIGALRM, &sa_timeout, 0);
+
+  char buf[128];
+
+  snprintf(buf, sizeof(buf), "*** Received signal %d ***\n", sig);
+  (void)write(STDERR_FILENO, buf, strlen(buf));
+
+  // Print "a" stack trace, as safely as possible.
+  SafePrintStackTrace();
+
+  // Up until this line, we made sure not to allocate memory, to be able to dump
+  // a stack trace even in the event of heap corruption. After this line, we
+  // will try to print more human readable things to the terminal.
+  // But these have a higher probability to fail.
+  std::string stacktrace = CurrentStackTrace();
+  (void)write(STDERR_FILENO, stacktrace.c_str(), stacktrace.length());
+
+  // Abort the program.
+  struct sigaction sa;
+  sigemptyset(&sa.sa_mask);
+  sa.sa_flags = 0;
+  sa.sa_handler = SIG_DFL;
+  sigaction(SIGABRT, &sa, NULL);
+  abort();
+}
+
+void InstallStacktraceHandler() {
+  int handled_signals[] = {SIGSEGV, SIGABRT, SIGBUS, SIGILL, SIGFPE};
+
+  for (int i = 0; i < sizeof(handled_signals) / sizeof(int); i++) {
+    int sig = handled_signals[i];
+    struct sigaction sa;
+    struct sigaction osa;
+
+    sigemptyset(&sa.sa_mask);
+    sa.sa_flags = SA_SIGINFO | SA_RESETHAND;
+    sa.sa_sigaction = &StacktraceHandler;
+    if (sigaction(sig, &sa, &osa) != 0) {
+      char buf[128];
+      snprintf(buf, sizeof(buf),
+               "Warning, can't install backtrace signal handler for signal %d, "
+               "errno:%d \n",
+               sig, errno);
+      (void)write(STDERR_FILENO, buf, strlen(buf));
+    } else if (osa.sa_handler != SIG_DFL) {
+      char buf[128];
+      snprintf(buf, sizeof(buf),
+               "Warning, backtrace signal handler for signal %d overwrote "
+               "previous handler.\n",
+               sig);
+      (void)write(STDERR_FILENO, buf, strlen(buf));
+    }
+  }
+}
+
+#else
+void InstallStacktraceHandler() {}
+#endif  // defined(TF_GENERATE_STACKTRACE)
+
+}  // namespace testing
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/stacktrace_handler.h b/tensorflow/core/platform/stacktrace_handler.h
new file mode 100644
index 0000000000000000000000000000000000000000..a52970fdaaa6693d537ac42b3d237ce3eb6a7755
--- /dev/null
+++ b/tensorflow/core/platform/stacktrace_handler.h
@@ -0,0 +1,28 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_BACKTRACE_H_
+#define TENSORFLOW_CORE_PLATFORM_BACKTRACE_H_
+
+namespace tensorflow {
+namespace testing {
+
+// Installs signal handlers to print out stack trace.
+void InstallStacktraceHandler();
+
+}  // namespace testing
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_BACKTRACE_H_
diff --git a/tensorflow/core/platform/stacktrace_handler_test.cc b/tensorflow/core/platform/stacktrace_handler_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..958c7de232ed4d11a72d6a245c83afb8f62574cd
--- /dev/null
+++ b/tensorflow/core/platform/stacktrace_handler_test.cc
@@ -0,0 +1,82 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Testing proper operation of the stacktrace handler.
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <string>
+
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+#define READ_BUFFER_SIZE 1024
+
+TEST(StacktraceHandlerTest, GeneratesStacktrace) {
+  // Create a pipe to write/read the child stdout.
+  int test_pipe[2];
+  EXPECT_EQ(pipe(test_pipe), 0);
+
+  // Fork the process.
+  int test_pid = fork();
+
+  if (test_pid == 0) {
+    // Child process.
+    // Close the read end of the pipe, redirect stdout and sleep.
+    close(test_pipe[0]);
+    dup2(test_pipe[1], STDOUT_FILENO);
+    dup2(test_pipe[1], STDERR_FILENO);
+    sleep(10);
+  } else {
+    // Parent process.
+    // Close the write end of the pipe, wait a little and send SIGABRT to the
+    // child process. Then watch the pipe.
+    close(test_pipe[1]);
+    sleep(1);
+
+    // Send the signal.
+    kill(test_pid, SIGABRT);
+
+    // Read from the pipe.
+    char buffer[READ_BUFFER_SIZE];
+    std::string child_output = "";
+    while (true) {
+      int read_length = read(test_pipe[0], buffer, READ_BUFFER_SIZE);
+      if (read_length > 0) {
+        child_output += std::string(buffer, read_length);
+      } else {
+        break;
+      }
+    }
+    close(test_pipe[0]);
+
+    // Just make sure we can detect one of the calls in testing stack.
+    string test_stack_frame = "testing::internal::UnitTestImpl::RunAllTests()";
+
+    // Print the stack trace detected for information.
+    LOG(INFO) << "Output from the child process:";
+    LOG(INFO) << child_output;
+
+    EXPECT_NE(child_output.find(test_stack_frame), std::string::npos);
+  }
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/test_benchmark.h b/tensorflow/core/platform/test_benchmark.h
index a6636225ccbbc8154e290cd7f1aa6cafe3d2027a..327237dba933230cb313dd06091d2ff2ca3cc4b2 100644
--- a/tensorflow/core/platform/test_benchmark.h
+++ b/tensorflow/core/platform/test_benchmark.h
@@ -60,7 +60,7 @@ class Benchmark {
  private:
   string name_;
   int num_args_;
-  std::vector<std::pair<int, int>> args_;
+  std::vector<std::pair<int, int> > args_;
   void (*fn0_)(int) = nullptr;
   void (*fn1_)(int, int) = nullptr;
   void (*fn2_)(int, int, int) = nullptr;
diff --git a/tensorflow/core/platform/test_main.cc b/tensorflow/core/platform/test_main.cc
index 96c88afcc4b48be97682e06f2b728fd35b79e0da..677114f5f22b4fe70c6f006e536a2da5f17977d6 100644
--- a/tensorflow/core/platform/test_main.cc
+++ b/tensorflow/core/platform/test_main.cc
@@ -27,12 +27,14 @@ limitations under the License.
 #include <iostream>
 
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/stacktrace_handler.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
 GTEST_API_ int main(int argc, char** argv) {
   std::cout << "Running main() from test_main.cc\n";
 
+  tensorflow::testing::InstallStacktraceHandler();
   testing::InitGoogleTest(&argc, argv);
   for (int i = 1; i < argc; i++) {
     if (tensorflow::StringPiece(argv[i]).starts_with("--benchmarks=")) {
diff --git a/tensorflow/core/platform/types.h b/tensorflow/core/platform/types.h
index 93b82ecb7a7e668d5ea3d428861776388ddef9bb..e2dd5b003f291b6ce88ebabe2d66114762bd2c57 100644
--- a/tensorflow/core/platform/types.h
+++ b/tensorflow/core/platform/types.h
@@ -22,13 +22,21 @@ limitations under the License.
 // Include appropriate platform-dependent implementations
 #if defined(PLATFORM_GOOGLE) || defined(GOOGLE_INTEGRAL_TYPES)
 #include "tensorflow/core/platform/google/integral_types.h"
+#elif defined(PLATFORM_WINDOWS)
+#include "tensorflow/core/platform/windows/integral_types.h"
 #elif defined(PLATFORM_POSIX) || defined(PLATFORM_POSIX_ANDROID) || \
-    defined(PLATFORM_GOOGLE_ANDROID) || defined(PLATFORM_WINDOWS)
+    defined(PLATFORM_GOOGLE_ANDROID)
 #include "tensorflow/core/platform/default/integral_types.h"
 #else
 #error Define the appropriate PLATFORM_<foo> macro for this platform
 #endif
 
+#if defined(PLATFORM_WINDOWS)
+#include "tensorflow/core/platform/windows/cpu_info.h"
+#endif
+
+#include "tensorflow/core/lib/bfloat16/bfloat16.h"
+
 namespace tensorflow {
 
 // Define tensorflow::string to refer to appropriate platform specific type.
diff --git a/tensorflow/core/platform/windows/cpu_info.h b/tensorflow/core/platform/windows/cpu_info.h
index d6e78dbc8f9f25070d94141e46d35dcb8d727ef7..f20939d3c0ff02be30f19be170644fab44b6f45e 100644
--- a/tensorflow/core/platform/windows/cpu_info.h
+++ b/tensorflow/core/platform/windows/cpu_info.h
@@ -22,8 +22,10 @@ limitations under the License.
 // Byte order defines provided by gcc. MSVC doesn't define those so
 // we define them here.
 // We assume that all windows platform out there are little endian.
+#if defined(_MSC_VER) && !defined(__clang__)
 #define __ORDER_LITTLE_ENDIAN__ 0x4d2
 #define __ORDER_BIG_ENDIAN__ 0x10e1
 #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+#endif
 
 #endif  // TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_
diff --git a/tensorflow/core/platform/windows/env.cc b/tensorflow/core/platform/windows/env.cc
index 788a4bf4b1af74393099d1b590a1e589d9a07f25..41b264417071cadb5f70806b458ee2b46ebb2feb 100644
--- a/tensorflow/core/platform/windows/env.cc
+++ b/tensorflow/core/platform/windows/env.cc
@@ -24,9 +24,9 @@ limitations under the License.
 #undef LoadLibrary
 #undef ERROR
 
+#include <string>
 #include <thread>
 #include <vector>
-#include <string>
 
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/platform/load_library.h"
@@ -53,8 +53,7 @@ class StdThread : public Thread {
 
 class WindowsEnv : public Env {
  public:
-  WindowsEnv()
-      : GetSystemTimePreciseAsFileTime_(NULL) {
+  WindowsEnv() : GetSystemTimePreciseAsFileTime_(NULL) {
     // GetSystemTimePreciseAsFileTime function is only available in the latest
     // versions of Windows. For that reason, we try to look it up in
     // kernel32.dll at runtime and use an alternative option if the function
@@ -72,8 +71,8 @@ class WindowsEnv : public Env {
   }
 
   bool MatchPath(const string& path, const string& pattern) override {
-      std::wstring ws_path(WindowsFileSystem::Utf8ToWideChar(path));
-      std::wstring ws_pattern(WindowsFileSystem::Utf8ToWideChar(pattern));
+    std::wstring ws_path(WindowsFileSystem::Utf8ToWideChar(path));
+    std::wstring ws_pattern(WindowsFileSystem::Utf8ToWideChar(pattern));
     return PathMatchSpecW(ws_path.c_str(), ws_pattern.c_str()) == TRUE;
   }
 
@@ -122,14 +121,14 @@ class WindowsEnv : public Env {
     SetThreadpoolTimer(timer, &FileDueTime, 0, 0);
   }
 
-  Status LoadLibrary(const char *library_filename, void** handle) override {
+  Status LoadLibrary(const char* library_filename, void** handle) override {
     std::string file_name = library_filename;
     std::replace(file_name.begin(), file_name.end(), '/', '\\');
 
     std::wstring ws_file_name(WindowsFileSystem::Utf8ToWideChar(file_name));
 
     HMODULE hModule = LoadLibraryExW(ws_file_name.c_str(), NULL,
-      LOAD_WITH_ALTERED_SEARCH_PATH);
+                                     LOAD_WITH_ALTERED_SEARCH_PATH);
     if (!hModule) {
       return errors::NotFound(file_name + " not found");
     }
@@ -138,31 +137,30 @@ class WindowsEnv : public Env {
   }
 
   Status GetSymbolFromLibrary(void* handle, const char* symbol_name,
-    void** symbol) override {
+                              void** symbol) override {
     FARPROC found_symbol;
 
     found_symbol = GetProcAddress((HMODULE)handle, symbol_name);
     if (found_symbol == NULL) {
       return errors::NotFound(std::string(symbol_name) + " not found");
     }
-    *symbol = (void **)found_symbol;
+    *symbol = (void**)found_symbol;
     return Status::OK();
   }
 
-  string FormatLibraryFileName(const string& name, const string& version)
-    override {
+  string FormatLibraryFileName(const string& name,
+                               const string& version) override {
     string filename;
     if (version.size() == 0) {
       filename = name + ".dll";
-    }
-    else {
+    } else {
       filename = name + version + ".dll";
     }
     return filename;
   }
 
  private:
-  typedef VOID(WINAPI * FnGetSystemTimePreciseAsFileTime)(LPFILETIME);
+  typedef VOID(WINAPI* FnGetSystemTimePreciseAsFileTime)(LPFILETIME);
   FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_;
 };
 
diff --git a/tensorflow/core/platform/windows/error.cc b/tensorflow/core/platform/windows/error.cc
index 39e941a3834f7f7cd03e7791d43d56f190dc1fd6..291fc5003fb6bbc07274cdea72d73e92a453f363 100644
--- a/tensorflow/core/platform/windows/error.cc
+++ b/tensorflow/core/platform/windows/error.cc
@@ -21,7 +21,7 @@ namespace internal {
 std::string GetWindowsErrorMessage(DWORD err) {
   LPSTR buffer = NULL;
   DWORD flags = FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
-      FORMAT_MESSAGE_IGNORE_INSERTS;
+                FORMAT_MESSAGE_IGNORE_INSERTS;
   FormatMessageA(flags, NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
                  reinterpret_cast<LPSTR>(&buffer), 0, NULL);
   std::string message = buffer;
diff --git a/tensorflow/core/platform/windows/error.h b/tensorflow/core/platform/windows/error.h
index 026e0d5aa946f7c851dacc05a3306631e06886aa..ba643a0fa8f92f58fbd88ac00fba3f663bb7e0f2 100644
--- a/tensorflow/core/platform/windows/error.h
+++ b/tensorflow/core/platform/windows/error.h
@@ -24,9 +24,7 @@ namespace tensorflow {
 namespace internal {
 
 std::string GetWindowsErrorMessage(DWORD err);
-
-}
 }
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_PLATFORM_WINDOWS_ERROR_H_
-
diff --git a/tensorflow/core/platform/windows/integral_types.h b/tensorflow/core/platform/windows/integral_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..46338a536dbc3541763e62954fee74b2a5a0700b
--- /dev/null
+++ b/tensorflow/core/platform/windows/integral_types.h
@@ -0,0 +1,25 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_PLATFORM_WINDOWS_INTEGRAL_TYPES_H_
+#define TENSORFLOW_PLATFORM_WINDOWS_INTEGRAL_TYPES_H_
+
+#include "tensorflow/core/platform/default/integral_types.h"
+
+#include <cstddef>
+
+typedef std::ptrdiff_t ssize_t;
+
+#endif  // TENSORFLOW_PLATFORM_WINDOWS_INTEGRAL_TYPES_H_
diff --git a/tensorflow/core/platform/windows/net.cc b/tensorflow/core/platform/windows/net.cc
index 46eb072d42592028859122a4cad3d9478a96476e..2ab558ab95cafd15b10f7b887c846b32ab7e4c47 100644
--- a/tensorflow/core/platform/windows/net.cc
+++ b/tensorflow/core/platform/windows/net.cc
@@ -26,7 +26,7 @@ limitations under the License.
 
 #undef ERROR
 
-#pragma comment(lib,"Ws2_32.lib")
+#pragma comment(lib, "Ws2_32.lib")
 
 namespace tensorflow {
 namespace internal {
@@ -44,8 +44,8 @@ bool IsPortAvailable(int* port, bool is_tcp) {
   CHECK_GE(*port, 0);
   CHECK_LE(*port, 65535);
   if (sock == INVALID_SOCKET) {
-    LOG(ERROR) << "socket() failed: " <<
-        GetWindowsErrorMessage(WSAGetLastError());
+    LOG(ERROR) << "socket() failed: "
+               << GetWindowsErrorMessage(WSAGetLastError());
     return false;
   }
 
@@ -54,8 +54,8 @@ bool IsPortAvailable(int* port, bool is_tcp) {
   int result = setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
                           reinterpret_cast<const char*>(&one), sizeof(one));
   if (result == SOCKET_ERROR) {
-    LOG(ERROR) << "setsockopt() failed: " <<
-        GetWindowsErrorMessage(WSAGetLastError());
+    LOG(ERROR) << "setsockopt() failed: "
+               << GetWindowsErrorMessage(WSAGetLastError());
     closesocket(sock);
     return false;
   }
@@ -66,8 +66,8 @@ bool IsPortAvailable(int* port, bool is_tcp) {
   addr.sin_port = htons((uint16_t)*port);
   result = bind(sock, (struct sockaddr*)&addr, sizeof(addr));
   if (result == SOCKET_ERROR) {
-    LOG(WARNING) << "bind(port=" << *port << ") failed: " <<
-        GetWindowsErrorMessage(WSAGetLastError());
+    LOG(WARNING) << "bind(port=" << *port
+                 << ") failed: " << GetWindowsErrorMessage(WSAGetLastError());
     closesocket(sock);
     return false;
   }
@@ -75,8 +75,8 @@ bool IsPortAvailable(int* port, bool is_tcp) {
   // Get the bound port number.
   result = getsockname(sock, (struct sockaddr*)&addr, &addr_len);
   if (result == SOCKET_ERROR) {
-    LOG(WARNING) << "getsockname() failed: " <<
-        GetWindowsErrorMessage(WSAGetLastError());
+    LOG(WARNING) << "getsockname() failed: "
+                 << GetWindowsErrorMessage(WSAGetLastError());
     closesocket(sock);
     return false;
   }
diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc
index e327d53949caf7e2d30e6deba0be2848f010afc2..582b232054b850a2ef5ab8f47c089eb35a7bb3cf 100644
--- a/tensorflow/core/platform/windows/port.cc
+++ b/tensorflow/core/platform/windows/port.cc
@@ -149,8 +149,20 @@ bool Snappy_Uncompress(const char* input, size_t length, char* output) {
 string Demangle(const char* mangled) { return mangled; }
 
 double NominalCPUFrequency() {
-  // TODO(yuefengz): implement it for this platform.
+#ifdef TENSORFLOW_USE_ABSL
+  return absl::base_internal::NominalCPUFrequency();
+#else
   return 1.0;
+#endif
+}
+
+int64 AvailableRam() {
+  MEMORYSTATUSEX statex;
+  statex.dwLength = sizeof(statex);
+  if (GlobalMemoryStatusEx(&statex)) {
+    return statex.ullAvailPhys / 1024;
+  }
+  return INT64_MAX;
 }
 
 }  // namespace port
diff --git a/tensorflow/core/platform/windows/subprocess.h b/tensorflow/core/platform/windows/subprocess.h
index b65313363ed79ab327414179a9923ba2d436dd0b..66ec44885d52195b807f4957aec6d590324b2975 100644
--- a/tensorflow/core/platform/windows/subprocess.h
+++ b/tensorflow/core/platform/windows/subprocess.h
@@ -19,8 +19,7 @@ limitations under the License.
 namespace tensorflow {
 
 // SubProcess is not yet implemented for Windows.
-class SubProcess {
-};
+class SubProcess {};
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/platform/windows/test.cc b/tensorflow/core/platform/windows/test.cc
index 0ffd02ff14849d77761e85c30388dc49a53c84db..584acad91b24fc6be9b93f71b7d44b0fba3cb2e8 100644
--- a/tensorflow/core/platform/windows/test.cc
+++ b/tensorflow/core/platform/windows/test.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/net.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/net.h"
 
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/platform/windows/windows_file_system.cc b/tensorflow/core/platform/windows/windows_file_system.cc
index 604348fe03a01d44195ba8a8ff427ae3ef3a4137..b6b3722caae4dc0cdc0ddff91be479ab91a744b2 100644
--- a/tensorflow/core/platform/windows/windows_file_system.cc
+++ b/tensorflow/core/platform/windows/windows_file_system.cc
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <Shlwapi.h>
 #include <Windows.h>
 #include <direct.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <io.h>
-#include <Shlwapi.h>
 #undef StrCat
 #include <stdio.h>
 #include <sys/stat.h>
@@ -75,16 +75,16 @@ SSIZE_T pread(HANDLE hfile, char* src, size_t num_bytes, uint64_t offset) {
   if (TRUE == read_result) {
     result = bytes_read;
   } else if ((FALSE == read_result) &&
-      ((last_error = GetLastError()) != ERROR_IO_PENDING)) {
+             ((last_error = GetLastError()) != ERROR_IO_PENDING)) {
     result = (last_error == ERROR_HANDLE_EOF) ? 0 : -1;
   } else {
-    if (ERROR_IO_PENDING == last_error) { // Otherwise bytes_read already has the result.
-      BOOL overlapped_result = ::GetOverlappedResult(hfile, &overlapped,
-                                                     &bytes_read, TRUE);
+    if (ERROR_IO_PENDING ==
+        last_error) {  // Otherwise bytes_read already has the result.
+      BOOL overlapped_result =
+          ::GetOverlappedResult(hfile, &overlapped, &bytes_read, TRUE);
       if (FALSE == overlapped_result) {
         result = (::GetLastError() == ERROR_HANDLE_EOF) ? 0 : -1;
-      }
-      else {
+      } else {
         result = bytes_read;
       }
     }
@@ -151,11 +151,11 @@ class WindowsWritableFile : public WritableFile {
   Status Append(const StringPiece& data) override {
     DWORD bytes_written = 0;
     DWORD data_size = static_cast<DWORD>(data.size());
-    BOOL write_result = ::WriteFile(hfile_, data.data(), data_size,
-                                    &bytes_written, NULL);
+    BOOL write_result =
+        ::WriteFile(hfile_, data.data(), data_size, &bytes_written, NULL);
     if (FALSE == write_result) {
-      return IOErrorFromWindowsError(
-          "Failed to WriteFile: " + filename_, ::GetLastError());
+      return IOErrorFromWindowsError("Failed to WriteFile: " + filename_,
+                                     ::GetLastError());
     }
 
     assert(size_t(bytes_written) == data.size());
@@ -171,8 +171,8 @@ class WindowsWritableFile : public WritableFile {
     }
 
     if (FALSE == ::CloseHandle(hfile_)) {
-      return IOErrorFromWindowsError(
-          "CloseHandle failed for: " + filename_, ::GetLastError());
+      return IOErrorFromWindowsError("CloseHandle failed for: " + filename_,
+                                     ::GetLastError());
     }
 
     hfile_ = INVALID_HANDLE_VALUE;
@@ -187,9 +187,7 @@ class WindowsWritableFile : public WritableFile {
     return Status::OK();
   }
 
-  Status Sync() override {
-    return Flush();
-  }
+  Status Sync() override { return Flush(); }
 };
 
 class WinReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
@@ -204,7 +202,10 @@ class WinReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
  public:
   WinReadOnlyMemoryRegion(const std::string& filename, HANDLE hfile,
                           HANDLE hmap, const void* address, uint64 length)
-      : filename_(filename), hfile_(hfile), hmap_(hmap), address_(address),
+      : filename_(filename),
+        hfile_(hfile),
+        hmap_(hmap),
+        address_(address),
         length_(length) {}
 
   ~WinReadOnlyMemoryRegion() {
@@ -238,9 +239,9 @@ Status WindowsFileSystem::NewRandomAccessFile(
   // almost all tests would work with a possible exception of fault_injection.
   DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
 
-  HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_READ,
-                               share_mode, NULL, OPEN_EXISTING, file_flags,
-                               NULL);
+  HANDLE hfile =
+      ::CreateFileW(ws_translated_fname.c_str(), GENERIC_READ, share_mode, NULL,
+                    OPEN_EXISTING, file_flags, NULL);
 
   if (INVALID_HANDLE_VALUE == hfile) {
     string context = "NewRandomAccessFile failed to Create/Open: " + fname;
@@ -258,9 +259,9 @@ Status WindowsFileSystem::NewWritableFile(
   result->reset();
 
   DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
-  HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_WRITE,
-                               share_mode, NULL, CREATE_ALWAYS,
-                               FILE_ATTRIBUTE_NORMAL, NULL);
+  HANDLE hfile =
+      ::CreateFileW(ws_translated_fname.c_str(), GENERIC_WRITE, share_mode,
+                    NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
 
   if (INVALID_HANDLE_VALUE == hfile) {
     string context = "Failed to create a NewWriteableFile: " + fname;
@@ -278,9 +279,9 @@ Status WindowsFileSystem::NewAppendableFile(
   result->reset();
 
   DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
-  HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_WRITE,
-                               share_mode, NULL, OPEN_ALWAYS,
-                               FILE_ATTRIBUTE_NORMAL, NULL);
+  HANDLE hfile =
+      ::CreateFileW(ws_translated_fname.c_str(), GENERIC_WRITE, share_mode,
+                    NULL, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
 
   if (INVALID_HANDLE_VALUE == hfile) {
     string context = "Failed to create a NewAppendableFile: " + fname;
@@ -316,9 +317,9 @@ Status WindowsFileSystem::NewReadOnlyMemoryRegionFromFile(
   file_flags |= FILE_FLAG_OVERLAPPED;
 
   DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE;
-  HANDLE hfile = ::CreateFileW(ws_translated_fname.c_str(), GENERIC_READ,
-                               share_mode, NULL, OPEN_EXISTING, file_flags,
-                               NULL);
+  HANDLE hfile =
+      ::CreateFileW(ws_translated_fname.c_str(), GENERIC_READ, share_mode, NULL,
+                    OPEN_EXISTING, file_flags, NULL);
 
   if (INVALID_HANDLE_VALUE == hfile) {
     return IOErrorFromWindowsError(
@@ -345,28 +346,32 @@ Status WindowsFileSystem::NewReadOnlyMemoryRegionFromFile(
                                        NULL);  // Mapping name
 
     if (!hmap) {
-      string context = "Failed to create file mapping for "
-                       "NewReadOnlyMemoryRegionFromFile: " + fname;
+      string context =
+          "Failed to create file mapping for "
+          "NewReadOnlyMemoryRegionFromFile: " +
+          fname;
       return IOErrorFromWindowsError(context, ::GetLastError());
     }
 
     UniqueCloseHandlePtr map_guard(hmap, CloseHandleFunc);
 
-    const void* mapped_region = ::MapViewOfFileEx(
-        hmap, FILE_MAP_READ,
-        0,  // High DWORD of access start
-        0,  // Low DWORD
-        file_size,
-        NULL);  // Let the OS choose the mapping
+    const void* mapped_region =
+        ::MapViewOfFileEx(hmap, FILE_MAP_READ,
+                          0,  // High DWORD of access start
+                          0,  // Low DWORD
+                          file_size,
+                          NULL);  // Let the OS choose the mapping
 
     if (!mapped_region) {
-      string context = "Failed to MapViewOfFile for "
-                       "NewReadOnlyMemoryRegionFromFile: " + fname;
+      string context =
+          "Failed to MapViewOfFile for "
+          "NewReadOnlyMemoryRegionFromFile: " +
+          fname;
       return IOErrorFromWindowsError(context, ::GetLastError());
     }
 
-    result->reset(new WinReadOnlyMemoryRegion(fname, hfile, hmap,
-                                              mapped_region, file_size));
+    result->reset(new WinReadOnlyMemoryRegion(fname, hfile, hmap, mapped_region,
+                                              file_size));
 
     map_guard.release();
     file_guard.release();
@@ -404,8 +409,8 @@ Status WindowsFileSystem::GetChildren(const string& dir,
   }
 
   do {
-	string file_name = WideCharToUtf8(find_data.cFileName);
-	const StringPiece basename = file_name;
+    string file_name = WideCharToUtf8(find_data.cFileName);
+    const StringPiece basename = file_name;
     if (basename != "." && basename != "..") {
       result->push_back(file_name);
     }
@@ -457,8 +462,7 @@ Status WindowsFileSystem::GetFileSize(const string& fname, uint64* size) {
     file_size.HighPart = attrs.nFileSizeHigh;
     file_size.LowPart = attrs.nFileSizeLow;
     *size = file_size.QuadPart;
-  }
-  else {
+  } else {
     string context = "Can not get size for: " + fname;
     result = IOErrorFromWindowsError(context, ::GetLastError());
   }
@@ -472,7 +476,7 @@ Status WindowsFileSystem::RenameFile(const string& src, const string& target) {
   std::wstring ws_translated_src = Utf8ToWideChar(TranslateName(src));
   std::wstring ws_translated_target = Utf8ToWideChar(TranslateName(target));
   if (!::MoveFileExW(ws_translated_src.c_str(), ws_translated_target.c_str(),
-      MOVEFILE_REPLACE_EXISTING)) {
+                     MOVEFILE_REPLACE_EXISTING)) {
     string context(strings::StrCat("Failed to rename: ", src, " to: ", target));
     result = IOErrorFromWindowsError(context, ::GetLastError());
   }
diff --git a/tensorflow/core/platform/windows/windows_file_system.h b/tensorflow/core/platform/windows/windows_file_system.h
index 8dcc1530370f0615ec45785a1f3d10ce828d11a3..ba0302f0fd8b56dabaf9271a725bebdac4716102 100644
--- a/tensorflow/core/platform/windows/windows_file_system.h
+++ b/tensorflow/core/platform/windows/windows_file_system.h
@@ -63,33 +63,35 @@ class WindowsFileSystem : public FileSystem {
 
   Status RenameFile(const string& src, const string& target) override;
 
-  string TranslateName(const string& name) const override {
-    return name;
-  }
+  string TranslateName(const string& name) const override { return name; }
 
   static std::wstring Utf8ToWideChar(const string& utf8str) {
-      int size_required = MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(), (int)utf8str.size(), NULL, 0);
-      std::wstring ws_translated_str(size_required, 0);
-      MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(), (int)utf8str.size(), &ws_translated_str[0], size_required);
-      return ws_translated_str;
+    int size_required = MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(),
+                                            (int)utf8str.size(), NULL, 0);
+    std::wstring ws_translated_str(size_required, 0);
+    MultiByteToWideChar(CP_UTF8, 0, utf8str.c_str(), (int)utf8str.size(),
+                        &ws_translated_str[0], size_required);
+    return ws_translated_str;
   }
 
-  static string WideCharToUtf8(const std::wstring &wstr) {
-      if (wstr.empty()) return std::string();
-      int size_required = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), (int)wstr.size(), NULL, 0, NULL, NULL);
-      string utf8_translated_str(size_required, 0);
-      WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), (int)wstr.size(), &utf8_translated_str[0], size_required, NULL, NULL);
-      return utf8_translated_str;
+  static string WideCharToUtf8(const std::wstring& wstr) {
+    if (wstr.empty()) return std::string();
+    int size_required = WideCharToMultiByte(
+        CP_UTF8, 0, wstr.c_str(), (int)wstr.size(), NULL, 0, NULL, NULL);
+    string utf8_translated_str(size_required, 0);
+    WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), (int)wstr.size(),
+                        &utf8_translated_str[0], size_required, NULL, NULL);
+    return utf8_translated_str;
   }
 };
 
 class LocalWinFileSystem : public WindowsFileSystem {
-public:
-    string TranslateName(const string& name) const override {
-      StringPiece scheme, host, path;
-      io::ParseURI(name, &scheme, &host, &path);
-      return path.ToString();
-    }
+ public:
+  string TranslateName(const string& name) const override {
+    StringPiece scheme, host, path;
+    io::ParseURI(name, &scheme, &host, &path);
+    return path.ToString();
+  }
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/BUILD b/tensorflow/core/profiler/BUILD
index 9c2e7a61deb93b3ecdd06ef1b15457e8d49470fc..35d99930186381edbb80aa6485856e288f1dd568 100644
--- a/tensorflow/core/profiler/BUILD
+++ b/tensorflow/core/profiler/BUILD
@@ -34,11 +34,22 @@ tf_cc_binary(
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/profiler/internal:tfprof_options",
+        "//tensorflow/core/profiler:tfprof_options",
         "//tensorflow/core/profiler/internal:tfprof_stats",
         "//tensorflow/core/profiler/internal:tfprof_utils",
         "//tensorflow/core/profiler/internal/advisor:tfprof_advisor",
-        "@linenoise//:linenoise",
+        "@linenoise",
+    ],
+)
+
+cc_library(
+    name = "tfprof_options",
+    srcs = ["tfprof_options.cc"],
+    hdrs = ["tfprof_options.h"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/profiler:protos_all_cc",
     ],
 )
 
diff --git a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md
index 9e628b10651423a7ce05392e675453c87f8b6c8c..57d76eb4cb9382790c80a0d55ee94b64e7b9dcdc 100644
--- a/tensorflow/core/profiler/README.md
+++ b/tensorflow/core/profiler/README.md
@@ -240,8 +240,9 @@ Open a Chrome browser, enter URL chrome://tracing and load the timeline file.
 # can also generate memory profile using `-select bytes`
 tfprof> code -select accelerator_micros -max_depth 100000 -output pprof:outfile=<filename>  -trim_name_regexes .*apply_op.*
 
-# Use pprof to visualize the generated file.
-pprof -png --nodecount=100 --sample_index=1 <filename>
+# Use google-pprof, from the google-perftools package to visualize the generated file.
+# On Ubuntu you can install it with `apt-get install it google-perftools`.
+google-pprof --pdf --nodecount=100 <filename>
 ```
 
 ![PprofGraph](g3doc/pprof.jpg)
@@ -256,7 +257,7 @@ bug fix. `OpLogProto` is a good plus if it is used.
 
 #### Teams
 
-* Xin Pan (xpan@google.com, github: panyx0718)
+* Xin Pan
 * Chris Antaki
 * Yao Zhang
 * Jon Shlens
diff --git a/tensorflow/core/profiler/g3doc/advise.md b/tensorflow/core/profiler/g3doc/advise.md
index d0de8317f6950a89567b6d3c5705c42fcc8f4653..379c3f1ef69a79564669178016f916312eca7d4c 100644
--- a/tensorflow/core/profiler/g3doc/advise.md
+++ b/tensorflow/core/profiler/g3doc/advise.md
@@ -1,6 +1,6 @@
 ## Auto Detect and Advise
 
-tfprof analyzes profiles and generates advises for common issues.
+tfprof analyzes profiles and generates advice for common issues.
 
 ### Run Advise.
 
diff --git a/tensorflow/core/profiler/g3doc/command_line.md b/tensorflow/core/profiler/g3doc/command_line.md
index d41ac7290db66854faa2178e95f9ce00d8c825b6..bbaf55e613f6f30af5c27e6bdf61156859415c29 100644
--- a/tensorflow/core/profiler/g3doc/command_line.md
+++ b/tensorflow/core/profiler/g3doc/command_line.md
@@ -21,7 +21,8 @@ See QuickStart on generating the file.
 
 <b>THE OLD WAY BELOW IS DEPRECATED:</b>
 
-<b>--graph_path:</b> GraphDef proto file (required). Used to build in-memory
+<b>--graph_path:</b> GraphDef proto file (optional in eager execution).
+Used to build in-memory
 data structure of the model. For example, graph.pbtxt written by tf.Supervisor
 can be passed to --graph_path. You can also easily get GraphDef using
 tf.get_default_graph().as_graph_def(add_shapes=True) or other API.
@@ -72,6 +73,15 @@ bazel-bin/tensorflow/core/profiler/profiler help
 ```shell
 # The following commands will start tfprof interactive mode.
 #
+# Recommended:
+#
+# The file contains the binary string of ProfileProto.
+# It contains all needed information in one file.
+bazel-bin/tensorflow/core/profiler/profiler \
+    --profile_path=profile_xxx
+#
+# Alternatively, user can pass separate files.
+#
 # --graph_path contains the model architecutre and tensor shapes.
 # --run_meta_path contains the memory and time information.
 # --op_log_path contains float operation and code traces.
@@ -80,6 +90,11 @@ bazel-bin/tensorflow/core/profiler/profiler help
 # Only includes model architecture, parameters and shapes.
 bazel-bin/tensorflow/core/profiler/profiler \
     --graph_path=graph.pbtxt
+
+# For profiling eager execution, user can only specify run_meta_path
+# and profile execution info of each operation.
+bazel-bin/tensorflow/core/profiler/profiler \
+    --run_meta_path=run_meta
 #
 # Additionally profile ops memory and timing.
 bazel-bin/tensorflow/core/profiler/profiler \
diff --git a/tensorflow/core/profiler/g3doc/options.md b/tensorflow/core/profiler/g3doc/options.md
index 4c73e372e3bd9f24c83bdc0d3b8d98b5f8b03f11..7f2cd3f698c860f16cd7b027b5ff7c8e24338cf0 100644
--- a/tensorflow/core/profiler/g3doc/options.md
+++ b/tensorflow/core/profiler/g3doc/options.md
@@ -60,11 +60,14 @@ Currently, profiler only tracks the allocation of memory. As a result, the
 accumulated memory request is uaually larger than the peak memory of the overall
 model.
 
-bytes: The memory allocations requested by the operation.
-peak_bytes: The peak requested memory (not de-allocated) by the operation.
-residual_bytes: The memory requested by the operation and not de-allocated
+It's recommended to generate timeline to see the allocator memory usage over
+time.
+
+`bytes`: The memory allocations requested by the operation.
+`peak_bytes`: The peak requested memory (not de-allocated) by the operation.
+`residual_bytes`: The memory requested by the operation and not de-allocated
                 when Compute finishes.
-output_bytes: The memory output by the operation. It's not necessarily requested
+`output_bytes`: The memory output by the operation. It's not necessarily requested
               by the current operation. For example, it can be a tensor
               forwarded from input to output, with in-place mutation.
 
@@ -109,8 +112,8 @@ accelerator_micros and cpu_micros. Note: cpu and accelerator can run in parallel
 
 `-account_displayed_op_only`: If True, only account the statistics of ops eventually displayed. If False, account all op statistics matching -account_type_regexes recursively.
 
-
-Notes: See <b>overview</b> sesion on how does above options play with each other to decide the output and counting.
+Notes: See <b>overview</b> session on how does above options play with each
+other to decide the output and counting.
 
 `-select`: Comma-separated list of attributes to show. Supported attributes:
 [bytes|peak_bytes|residual_bytes|output_bytes|micros|accelerator_micros|cpu_micros|params|float_ops|occurrence|tensor_value|device|op_types|input_shapes].
diff --git a/tensorflow/core/profiler/internal/BUILD b/tensorflow/core/profiler/internal/BUILD
index edf6b32cfa4b9cd4831ac447e8384ada17d7fd8a..05a798bff80a0775e5170bf8f428d9e88d8060b3 100644
--- a/tensorflow/core/profiler/internal/BUILD
+++ b/tensorflow/core/profiler/internal/BUILD
@@ -16,7 +16,6 @@ cc_library(
         ":tfprof_graph",
         ":tfprof_node",
         ":tfprof_op",
-        ":tfprof_options",
         ":tfprof_scope",
         ":tfprof_show",
         ":tfprof_timeline",
@@ -26,6 +25,7 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:regexp_internal",
         "//tensorflow/core/profiler:protos_all_cc",
+        "//tensorflow/core/profiler:tfprof_options",
     ],
 )
 
@@ -47,12 +47,12 @@ cc_library(
     srcs = ["tfprof_node.cc"],
     hdrs = ["tfprof_node.h"],
     deps = [
-        ":tfprof_options",
         ":tfprof_utils",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:regexp_internal",
         "//tensorflow/core/profiler:protos_all_cc",
+        "//tensorflow/core/profiler:tfprof_options",
     ],
 )
 
@@ -63,7 +63,6 @@ cc_library(
     deps = [
         ":tfprof_constants",
         ":tfprof_node",
-        ":tfprof_options",
         ":tfprof_show",
         ":tfprof_tensor",
         ":tfprof_utils",
@@ -74,6 +73,7 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:regexp_internal",
         "//tensorflow/core/profiler:protos_all_cc",
+        "//tensorflow/core/profiler:tfprof_options",
     ],
 )
 
@@ -84,7 +84,6 @@ cc_library(
     deps = [
         ":tfprof_constants",
         ":tfprof_node",
-        ":tfprof_options",
         ":tfprof_show_multi",
         ":tfprof_tensor",
         ":tfprof_utils",
@@ -94,6 +93,7 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:regexp_internal",
         "//tensorflow/core/profiler:protos_all_cc",
+        "//tensorflow/core/profiler:tfprof_options",
     ],
 )
 
@@ -104,7 +104,6 @@ cc_library(
     deps = [
         ":tfprof_constants",
         ":tfprof_node",
-        ":tfprof_options",
         ":tfprof_show_multi",
         ":tfprof_timeline",
         ":tfprof_utils",
@@ -116,6 +115,7 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:regexp_internal",
         "//tensorflow/core/profiler:protos_all_cc",
+        "//tensorflow/core/profiler:tfprof_options",
     ],
 )
 
@@ -126,7 +126,6 @@ cc_library(
     deps = [
         ":tfprof_constants",
         ":tfprof_node",
-        ":tfprof_options",
         ":tfprof_show",
         ":tfprof_tensor",
         ":tfprof_utils",
@@ -135,6 +134,7 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:regexp_internal",
         "//tensorflow/core/profiler:protos_all_cc",
+        "//tensorflow/core/profiler:tfprof_options",
     ],
 )
 
@@ -145,11 +145,11 @@ cc_library(
     deps = [
         ":tfprof_constants",
         ":tfprof_node",
-        ":tfprof_options",
         ":tfprof_utils",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/profiler:protos_all_cc",
+        "//tensorflow/core/profiler:tfprof_options",
     ],
 )
 
@@ -161,7 +161,6 @@ cc_library(
         ":tfprof_constants",
         ":tfprof_node",
         ":tfprof_node_show",
-        ":tfprof_options",
         ":tfprof_tensor",
         ":tfprof_timeline",
         ":tfprof_utils",
@@ -170,6 +169,7 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:regexp_internal",
         "//tensorflow/core/profiler:protos_all_cc",
+        "//tensorflow/core/profiler:tfprof_options",
     ],
 )
 
@@ -181,7 +181,6 @@ cc_library(
         ":tfprof_constants",
         ":tfprof_node",
         ":tfprof_node_show",
-        ":tfprof_options",
         ":tfprof_scope",
         ":tfprof_show",
         ":tfprof_tensor",
@@ -192,6 +191,7 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:regexp_internal",
         "//tensorflow/core/profiler:protos_all_cc",
+        "//tensorflow/core/profiler:tfprof_options",
     ],
 )
 
@@ -209,7 +209,6 @@ tf_cc_test(
     ],
     deps = [
         ":tfprof_constants",
-        ":tfprof_options",
         ":tfprof_stats",
         ":tfprof_tf_testlib",
         ":tfprof_utils",
@@ -218,6 +217,7 @@ tf_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core/profiler:protos_all_cc",
+        "//tensorflow/core/profiler:tfprof_options",
     ],
 )
 
@@ -231,7 +231,6 @@ tf_cc_test(
     ],
     deps = [
         ":tfprof_constants",
-        ":tfprof_options",
         ":tfprof_stats",
         ":tfprof_tf_testlib",
         ":tfprof_utils",
@@ -241,6 +240,7 @@ tf_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core/profiler:protos_all_cc",
+        "//tensorflow/core/profiler:tfprof_options",
     ],
 )
 
@@ -250,21 +250,10 @@ cc_library(
     hdrs = ["tfprof_utils.h"],
     copts = if_not_windows(["-Wno-sign-compare"]),
     deps = [
-        ":tfprof_options",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:regexp_internal",
-    ],
-)
-
-cc_library(
-    name = "tfprof_options",
-    srcs = ["tfprof_options.cc"],
-    hdrs = ["tfprof_options.h"],
-    deps = [
-        "//tensorflow/core:framework_headers_lib",
-        "//tensorflow/core:lib",
-        "//tensorflow/core/profiler:protos_all_cc",
+        "//tensorflow/core/profiler:tfprof_options",
     ],
 )
 
@@ -279,13 +268,13 @@ cc_library(
     srcs = ["print_model_analysis.cc"],
     hdrs = ["print_model_analysis.h"],
     deps = [
-        ":tfprof_options",
         ":tfprof_stats",
         "//tensorflow/c:checkpoint_reader",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/profiler:protos_all_cc",
+        "//tensorflow/core/profiler:tfprof_options",
         "//tensorflow/core/profiler/internal/advisor:tfprof_advisor",
     ],
     alwayslink = 1,
@@ -305,7 +294,6 @@ tf_cc_test(
     ],
     deps = [
         ":tfprof_constants",
-        ":tfprof_options",
         ":tfprof_stats",
         ":tfprof_tf_testlib",
         ":tfprof_utils",
@@ -314,6 +302,7 @@ tf_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core/profiler:protos_all_cc",
+        "//tensorflow/core/profiler:tfprof_options",
     ],
 )
 
@@ -340,7 +329,6 @@ tf_cc_test(
         "testdata/graph.pbtxt",
     ],
     deps = [
-        ":tfprof_options",
         ":tfprof_stats",
         ":tfprof_tf_testlib",
         ":tfprof_utils",
@@ -349,6 +337,7 @@ tf_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core/profiler:protos_all_cc",
+        "//tensorflow/core/profiler:tfprof_options",
     ],
 )
 
diff --git a/tensorflow/core/profiler/internal/advisor/accelerator_utilization_checker.h b/tensorflow/core/profiler/internal/advisor/accelerator_utilization_checker.h
index c6544fe0b02df1b317db2ce4ab73130f9f155e56..25766668d88925b0d494e5e80284188cc42fb5cd 100644
--- a/tensorflow/core/profiler/internal/advisor/accelerator_utilization_checker.h
+++ b/tensorflow/core/profiler/internal/advisor/accelerator_utilization_checker.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 // This checker checks the accelerator's utilization.
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_ACCELERATOR_UTILIZATION_CHECKER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_ACCELERATOR_UTILIZATION_CHECKER_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_ACCELERATOR_UTILIZATION_CHECKER_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_ACCELERATOR_UTILIZATION_CHECKER_H_
 
 #include "tensorflow/core/profiler/internal/advisor/checker.h"
 
@@ -106,4 +106,4 @@ class AcceleratorUtilizationChecker : public Checker {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_ACCELERATOR_UTILIZATION_CHECKER_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_ACCELERATOR_UTILIZATION_CHECKER_H_
diff --git a/tensorflow/core/profiler/internal/advisor/checker.h b/tensorflow/core/profiler/internal/advisor/checker.h
index 4b5ebcf9e83742c8aa3cff072f490c6ca0243061..5d7da39e6b27b01a3438c25c26b70e5e3b65c7ff 100644
--- a/tensorflow/core/profiler/internal/advisor/checker.h
+++ b/tensorflow/core/profiler/internal/advisor/checker.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_CHECKER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_CHECKER_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_CHECKER_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_CHECKER_H_
 
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/profiler/internal/tfprof_stats.h"
@@ -49,4 +49,4 @@ class Checker {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_CHECKER_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_CHECKER_H_
diff --git a/tensorflow/core/profiler/internal/advisor/expensive_operation_checker.h b/tensorflow/core/profiler/internal/advisor/expensive_operation_checker.h
index 145782c7bddc3c98f9bdcab179cc303f25755bd5..f5ac5c9c5a428354f57767e812e8292da21f014d 100644
--- a/tensorflow/core/profiler/internal/advisor/expensive_operation_checker.h
+++ b/tensorflow/core/profiler/internal/advisor/expensive_operation_checker.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 // This checker checks the most expensive operations.
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_EXPENSIVE_OPERATION_CHECKER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_EXPENSIVE_OPERATION_CHECKER_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_EXPENSIVE_OPERATION_CHECKER_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_EXPENSIVE_OPERATION_CHECKER_H_
 
 #include "tensorflow/core/profiler/internal/advisor/checker.h"
 
@@ -137,4 +137,4 @@ class ExpensiveOperationChecker : public Checker {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_EXPENSIVE_OP_CHECKER_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_EXPENSIVE_OP_CHECKER_H_
diff --git a/tensorflow/core/profiler/internal/advisor/internal_checker_runner.h b/tensorflow/core/profiler/internal/advisor/internal_checker_runner.h
index ec52741b19e6769ec9d571666c063524857dd199..6fc16cf903704ec6ce6fd18ebc0ba67962483795 100644
--- a/tensorflow/core/profiler/internal/advisor/internal_checker_runner.h
+++ b/tensorflow/core/profiler/internal/advisor/internal_checker_runner.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_INTERNAL_CHECKER_RUNNER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_INTERNAL_CHECKER_RUNNER_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_INTERNAL_CHECKER_RUNNER_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_INTERNAL_CHECKER_RUNNER_H_
 
 #include "tensorflow/core/profiler/internal/tfprof_utils.h"
 #include "tensorflow/core/profiler/tfprof_options.pb.h"
@@ -31,4 +31,4 @@ AdviceProto RunInternalCheckers(const AdvisorOptionsProto& options,
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_INTERNAL_CHECKER_RUNNER_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_INTERNAL_CHECKER_RUNNER_H_
diff --git a/tensorflow/core/profiler/internal/advisor/operation_checker.h b/tensorflow/core/profiler/internal/advisor/operation_checker.h
index f0bd72fa409a87aa512c8a7f50f33d57ec21e3a7..6c1d5cd6704f2aeaa0eeed25a7cf1ecdbb73919c 100644
--- a/tensorflow/core/profiler/internal/advisor/operation_checker.h
+++ b/tensorflow/core/profiler/internal/advisor/operation_checker.h
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 // This checker checks common wrong configurations of operations.
 //
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_OPERATION_CHECKER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_OPERATION_CHECKER_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_OPERATION_CHECKER_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_OPERATION_CHECKER_H_
 
 #include "tensorflow/core/profiler/internal/advisor/checker.h"
 
@@ -74,4 +74,4 @@ class OperationChecker : public Checker {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_OPERATION_CHECKER_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_OPERATION_CHECKER_H_
diff --git a/tensorflow/core/profiler/internal/advisor/tfprof_advisor.h b/tensorflow/core/profiler/internal/advisor/tfprof_advisor.h
index 42bd6d54381d50a0670ac23a6ae686bcf0b13c81..270662bd4aca9bb0d17957ef43abd4eda2fa8e4d 100644
--- a/tensorflow/core/profiler/internal/advisor/tfprof_advisor.h
+++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVICE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVICE_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVICE_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVICE_H_
 
 #include "tensorflow/core/profiler/internal/advisor/accelerator_utilization_checker.h"
 #include "tensorflow/core/profiler/internal/advisor/checker.h"
@@ -78,4 +78,4 @@ class Advisor {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVICE_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_ADVISOR_TFPROF_ADVICE_H_
diff --git a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
index d05143aff9b8cc0b9a0e9af9445ba79345e4bf62..e968b9c97e28eeae22954102d5f0e07e09d75f7f 100644
--- a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
+++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
@@ -53,10 +53,13 @@ class TFProfAdvisorTest : public ::testing::Test {
     NodeExecStats node_stat;
     node_stat.set_all_start_micros(start_miros);
     node_stat.set_op_end_rel_micros(end_rel_micros);
-    node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0", node_stat);
-    node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0:stream:all",
+    node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0",
                       node_stat);
-    node->AddStepStat(step, "/job:localhost/replica:0/task:0/device:GPU:0:stream:0",
+    node->AddStepStat(step,
+                      "/job:localhost/replica:0/task:0/device:GPU:0:stream:all",
+                      node_stat);
+    node->AddStepStat(step,
+                      "/job:localhost/replica:0/task:0/device:GPU:0:stream:0",
                       node_stat);
     return node;
   }
diff --git a/tensorflow/core/profiler/internal/print_model_analysis.cc b/tensorflow/core/profiler/internal/print_model_analysis.cc
index 7a0d590262fe623f701e21c979e53f2abc103305..5a31c7d789e70530586efc1fdfed158d5d19cabb 100644
--- a/tensorflow/core/profiler/internal/print_model_analysis.cc
+++ b/tensorflow/core/profiler/internal/print_model_analysis.cc
@@ -22,13 +22,13 @@ limitations under the License.
 #include "tensorflow/c/checkpoint_reader.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/profiler/internal/advisor/tfprof_advisor.h"
-#include "tensorflow/core/profiler/internal/tfprof_options.h"
 #include "tensorflow/core/profiler/internal/tfprof_stats.h"
 #include "tensorflow/core/profiler/tfprof_log.pb.h"
+#include "tensorflow/core/profiler/tfprof_options.h"
 #include "tensorflow/core/profiler/tfprof_options.pb.h"
 #include "tensorflow/core/profiler/tfprof_output.pb.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -84,12 +84,13 @@ string RunProfile(const string& command, const string& options,
 }  // namespace
 
 bool NewProfiler(const string* graph, const string* op_log) {
-  CHECK(graph) << "graph mustn't be null";
   std::unique_ptr<GraphDef> graph_ptr(new GraphDef());
-  if (!graph_ptr->ParseFromString(*graph)) {
-    if (!protobuf::TextFormat::ParseFromString(*graph, graph_ptr.get())) {
-      fprintf(stderr, "Failed to parse graph\n");
-      return false;
+  if (graph && !graph->empty()) {
+    if (!graph_ptr->ParseFromString(*graph)) {
+      if (!protobuf::TextFormat::ParseFromString(*graph, graph_ptr.get())) {
+        fprintf(stderr, "Failed to parse graph\n");
+        return false;
+      }
     }
   }
 
@@ -123,14 +124,15 @@ double AddStep(int64 step, const string* graph, const string* run_meta,
                const string* op_log) {
   CHECK(tf_stat);
 
-  CHECK(graph && !graph->empty());
-  std::unique_ptr<GraphDef> graph_ptr(new GraphDef());
-  if (!graph_ptr->ParseFromString(*graph)) {
-    if (!protobuf::TextFormat::ParseFromString(*graph, graph_ptr.get())) {
-      fprintf(stderr, "Failed to parse graph\n");
+  if (graph && !graph->empty()) {
+    std::unique_ptr<GraphDef> graph_ptr(new GraphDef());
+    if (!graph_ptr->ParseFromString(*graph)) {
+      if (!protobuf::TextFormat::ParseFromString(*graph, graph_ptr.get())) {
+        fprintf(stderr, "Failed to parse graph\n");
+      }
     }
+    tf_stat->AddGraph(std::move(graph_ptr));
   }
-  tf_stat->AddGraph(std::move(graph_ptr));
 
   CHECK(run_meta && !run_meta->empty());
   // TODO(xpan): Better error handling.
@@ -154,6 +156,13 @@ string Profile(const string* command, const string* options) {
   return RunProfile(*command, *options, tf_stat);
 }
 
+string SerializeToString() {
+  CHECK(tf_stat);
+  string content;
+  tf_stat->SerializeToString(&content);
+  return content;
+}
+
 void WriteProfile(const string* filename) {
   CHECK(tf_stat);
   CHECK(filename) << "empty file name when asking to write profile.";
@@ -163,11 +172,12 @@ void WriteProfile(const string* filename) {
 string PrintModelAnalysis(const string* graph, const string* run_meta,
                           const string* op_log, const string* command,
                           const string* options) {
-  CHECK(graph) << "graph mustn't be null";
   CHECK(command) << "command mustn't be null";
   CHECK(options) << "options mustn't be null";
   std::unique_ptr<GraphDef> graph_ptr(new GraphDef());
-  graph_ptr->ParseFromString(*graph);
+  if (graph && !graph->empty()) {
+    graph_ptr->ParseFromString(*graph);
+  }
 
   std::unique_ptr<RunMetadata> run_meta_ptr;
   if (run_meta && !run_meta->empty()) {
diff --git a/tensorflow/core/profiler/internal/print_model_analysis.h b/tensorflow/core/profiler/internal/print_model_analysis.h
index 31ff5b07b060b43fab6c0b458f6f43c4dcc0576b..29666ab9364253ea5131cf1739a960182e91cee5 100644
--- a/tensorflow/core/profiler/internal/print_model_analysis.h
+++ b/tensorflow/core/profiler/internal/print_model_analysis.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_PRINT_MODEL_ANALYSIS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_PRINT_MODEL_ANALYSIS_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_PRINT_MODEL_ANALYSIS_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_PRINT_MODEL_ANALYSIS_H_
 
 #include <string>
 
@@ -44,6 +44,9 @@ void WriteProfile(const string* filename);
 // Load the profile to profiler from a proto buffer file.
 void ProfilerFromFile(const string* filename);
 
+// Returns a binary string that represents the serialized ProfileProto.
+string SerializeToString();
+
 string Profile(const string* command, const string* options);
 
 // Single-step Profiler.
@@ -60,4 +63,4 @@ string PrintModelAnalysis(const string* graph, const string* run_meta,
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_PRINT_MODEL_ANALYSIS_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_PRINT_MODEL_ANALYSIS_H_
diff --git a/tensorflow/core/profiler/internal/testdata/run_meta b/tensorflow/core/profiler/internal/testdata/run_meta
index ae76acb743fc517239206228369b175c00c1c248..eaea62b06c8f1b7a968948614fee208a7b81c9b2 100644
Binary files a/tensorflow/core/profiler/internal/testdata/run_meta and b/tensorflow/core/profiler/internal/testdata/run_meta differ
diff --git a/tensorflow/core/profiler/internal/tfprof_code.h b/tensorflow/core/profiler/internal/tfprof_code.h
index a118752fce59006f1992ef78380920f52024f9a2..38395f967c102f44fb49c49ced676dd5b6c609de 100644
--- a/tensorflow/core/profiler/internal/tfprof_code.h
+++ b/tensorflow/core/profiler/internal/tfprof_code.h
@@ -16,8 +16,8 @@ limitations under the License.
 // Build a tree structure based on the TensorFlow model's python code stacks.
 // Stats are aggregated from descendants to ancestors.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CODE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CODE_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CODE_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CODE_H_
 
 #include <map>
 #include <memory>
@@ -28,12 +28,12 @@ limitations under the License.
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/profiler/internal/tfprof_node.h"
-#include "tensorflow/core/profiler/internal/tfprof_options.h"
 #include "tensorflow/core/profiler/internal/tfprof_show_multi.h"
 #include "tensorflow/core/profiler/internal/tfprof_timeline.h"
 #include "tensorflow/core/profiler/internal/tfprof_utils.h"
 #include "tensorflow/core/profiler/profile.pb.h"
 #include "tensorflow/core/profiler/tfprof_log.pb.h"
+#include "tensorflow/core/profiler/tfprof_options.h"
 #include "tensorflow/core/profiler/tfprof_output.pb.h"
 
 namespace tensorflow {
@@ -94,4 +94,4 @@ class TFCode : public TFMultiShow {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CODE_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CODE_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_constants.h b/tensorflow/core/profiler/internal/tfprof_constants.h
index 6a4eaaa890c51a1c2a730cfbb96d6d45316789c6..d4a47931a2700794b2d2e3cb932bc5d19dc2d90c 100644
--- a/tensorflow/core/profiler/internal/tfprof_constants.h
+++ b/tensorflow/core/profiler/internal/tfprof_constants.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CONSTANTS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CONSTANTS_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CONSTANTS_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CONSTANTS_H_
 
 namespace tensorflow {
 namespace tfprof {
@@ -34,4 +34,4 @@ static const char* const kCkptVarType = "_checkpoint_variables";
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CONSTANTS_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_CONSTANTS_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_graph.h b/tensorflow/core/profiler/internal/tfprof_graph.h
index 8dac4aee77a456f9bb43d1fea255d8d4655c255b..356a459a65ece4e4395db1da82c99739a6982318 100644
--- a/tensorflow/core/profiler/internal/tfprof_graph.h
+++ b/tensorflow/core/profiler/internal/tfprof_graph.h
@@ -16,8 +16,8 @@ limitations under the License.
 // Build a graph structure based on op inputs/outputs. The graph is a directed
 // acyclic graph pointing *from outputs to inputs*.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_GRAPH_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_GRAPH_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_GRAPH_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_GRAPH_H_
 
 #include <deque>
 #include <map>
@@ -30,9 +30,9 @@ limitations under the License.
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/profiler/internal/tfprof_node.h"
-#include "tensorflow/core/profiler/internal/tfprof_options.h"
 #include "tensorflow/core/profiler/internal/tfprof_show.h"
 #include "tensorflow/core/profiler/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/tfprof_options.h"
 #include "tensorflow/core/profiler/tfprof_output.pb.h"
 
 namespace tensorflow {
@@ -86,4 +86,4 @@ class TFGraph : public TFShow {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_GRAPH_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_GRAPH_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_node.cc b/tensorflow/core/profiler/internal/tfprof_node.cc
index 671b65d708f57713d984331de73ddf305675b792..86cb20de7bbb4f36bfaa431bc2b81a00dace84df 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.cc
+++ b/tensorflow/core/profiler/internal/tfprof_node.cc
@@ -80,10 +80,15 @@ void ExecStep::AddTimeStats(const string& dev, const NodeExecStats& step_stat) {
 
 void ExecStep::AddMemoryStats(const string& dev,
                               const NodeExecStats& step_stat) {
-  if (exec_.memory_intialized()) {
+  ExecMemory exec_mem;
+  if (step_stat.all_start_micros() > 0) {
+    exec_mem.set_memory_micros(step_stat.all_start_micros() +
+                               step_stat.op_end_rel_micros());
+  } else {
+    fprintf(stderr, "%s has no start time, skipping\n",
+            step_stat.node_name().c_str());
     return;
   }
-  exec_.set_memory_intialized(true);
 
   int accelerator_allocator_cnt = 0;
   for (const auto& mem : step_stat.memory()) {
@@ -93,14 +98,12 @@ void ExecStep::AddMemoryStats(const string& dev,
       continue;
     }
     ++accelerator_allocator_cnt;
-    exec_.set_allocator_bytes_in_use(
-        std::max(static_cast<int64>(exec_.allocator_bytes_in_use()),
+    exec_mem.set_allocator_bytes_in_use(
+        std::max(static_cast<int64>(exec_mem.allocator_bytes_in_use()),
                  static_cast<int64>(mem.allocator_bytes_in_use())));
-    Allocation allocation;
     for (const auto& alloc : mem.allocation_records()) {
-      allocation.add_allocation_records()->MergeFrom(alloc);
+      allocations_.push_back(alloc);
     }
-    allocations_.push_back(allocation);
   }
   if (accelerator_allocator_cnt > 1) {
     fprintf(stderr, "found %d gpu allocator for 1 node\n",
@@ -121,24 +124,50 @@ void ExecStep::AddMemoryStats(const string& dev,
       uint64 output_ptr =
           output.tensor_description().allocation_description().ptr();
       total_output_bytes += output_bytes;
-      output_memory_[output.slot()] = std::make_pair(output_bytes, output_ptr);
+
+      auto& mem = (*exec_mem.mutable_output_memory())[output.slot()];
+      mem.set_ptr(output_ptr);
+      mem.set_bytes(output_bytes);
     }
   }
-  exec_.set_output_bytes(total_output_bytes);
+  exec_mem.set_output_bytes(total_output_bytes);
 
   if (step_stat.has_memory_stats()) {
-    exec_.set_host_temp_bytes(exec_.host_temp_bytes() +
-                              step_stat.memory_stats().host_temp_memory_size());
-    exec_.set_host_persistent_bytes(
-        exec_.host_persistent_bytes() +
-        step_stat.memory_stats().host_persistent_memory_size());
-    exec_.set_accelerator_temp_bytes(
-        exec_.accelerator_temp_bytes() +
-        step_stat.memory_stats().device_temp_memory_size());
-    exec_.set_accelerator_persistent_bytes(
-        exec_.accelerator_persistent_bytes() +
-        step_stat.memory_stats().device_persistent_memory_size());
+    if (IsPlacedOnCPU(dev)) {
+      // Currently we assume ops placed on gpu only allocate memory on gpu.
+      exec_mem.set_host_temp_bytes(exec_mem.host_temp_bytes() +
+                                   step_stat.memory_stats().temp_memory_size());
+      exec_mem.set_host_persistent_bytes(
+          exec_mem.host_persistent_bytes() +
+          step_stat.memory_stats().persistent_memory_size());
+    } else {
+      exec_mem.set_accelerator_temp_bytes(
+          exec_mem.accelerator_temp_bytes() +
+          step_stat.memory_stats().temp_memory_size());
+      exec_mem.set_accelerator_persistent_bytes(
+          exec_mem.accelerator_persistent_bytes() +
+          step_stat.memory_stats().persistent_memory_size());
+    }
   }
+
+  // TODO(xpan): Make this more accurate:
+  // High level: Memory tracking is suspicous and requires large scale
+  // clean up.
+  // Investigte the memory usage difference between CPU/GPU with OpViewTest.
+  //
+  // 1. OpKernelConstruction::allocate_xxx is not traced. Below, we only
+  //    discuss OpKernelContext-related allocations.
+  // 2. allocate_output calls allocate_tensor, which is properly tracked in
+  //    'NodeExecStats.memory'.
+  // 3. allocate_temp is only tracked through record_xxx_temp. It appears
+  //    in 'NodeExecStats.memory_stats'.
+  // 4. allocate_persistent calls allocate_tensor, which is properly tracked
+  //    in 'NodeExecStats.memory'. However, there is no way to count it as
+  //    persistent now.
+  // 5. record_xxx_persistent is called when allocate_persistent
+  //    is not used and hence tracks some complementary bytes. It appears in
+  //    'NodeExecStats.memory_stats'. It's suspicious. But we should
+  //    use it now since it covers constant op.
   int64 residual_bytes = 0;
   int64 requested_bytes = 0;
   int64 peak_bytes = 0;
@@ -147,9 +176,20 @@ void ExecStep::AddMemoryStats(const string& dev,
     requested_bytes += mem.total_bytes();
     peak_bytes += mem.peak_bytes();
   }
-  exec_.set_requested_bytes(requested_bytes);
-  exec_.set_residual_bytes(residual_bytes);
-  exec_.set_peak_bytes(peak_bytes);
+  residual_bytes += exec_mem.host_persistent_bytes() +
+                    exec_mem.accelerator_persistent_bytes();
+  requested_bytes += exec_mem.host_persistent_bytes() +
+                     exec_mem.accelerator_persistent_bytes() +
+                     exec_mem.host_temp_bytes() +
+                     exec_mem.accelerator_temp_bytes();
+  peak_bytes += exec_mem.host_persistent_bytes() +
+                exec_mem.accelerator_persistent_bytes() +
+                exec_mem.host_temp_bytes() + exec_mem.accelerator_temp_bytes();
+
+  exec_mem.set_requested_bytes(requested_bytes);
+  exec_mem.set_residual_bytes(residual_bytes);
+  exec_mem.set_peak_bytes(peak_bytes);
+  memory_execs_.emplace_back(exec_mem);
 }
 
 void TFGraphNode::AddStepStat(int64 step, const string& device,
@@ -251,5 +291,8 @@ bool IsPlacedOnAccelerator(const string& device) {
   return device.find("gpu") != device.npos ||
          device.find("sycl") != device.npos;
 }
+bool IsPlacedOnCPU(const string& device) {
+  return device.find("cpu") != device.npos;
+}
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/internal/tfprof_node.h b/tensorflow/core/profiler/internal/tfprof_node.h
index e2d0563a0747d7bec74ce3aeb9d5995f47cff915..0a97b1cb0f2568656fbc45883a688d0ecc5c95d8 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.h
+++ b/tensorflow/core/profiler/internal/tfprof_node.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_
 
 #include <map>
 #include <set>
@@ -31,8 +31,8 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/regexp.h"
-#include "tensorflow/core/profiler/internal/tfprof_options.h"
 #include "tensorflow/core/profiler/tfprof_log.pb.h"
+#include "tensorflow/core/profiler/tfprof_options.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -109,7 +109,6 @@ class ExecStep {
       const {
     return cpu_execs_;
   }
-
   int64 all_start_micros() const { return exec_.all_start_micros(); }
   int64 latest_end_micros() const { return exec_.latest_end_micros(); }
   int64 lastest_schedule_end_micros() const {
@@ -121,27 +120,73 @@ class ExecStep {
     }
     return ret;
   }
-
-  int64 requested_bytes() const { return exec_.requested_bytes(); }
-  int64 peak_bytes() const { return exec_.peak_bytes(); }
-  int64 residual_bytes() const { return exec_.residual_bytes(); }
-  int64 output_bytes() const { return exec_.output_bytes(); }
+  int64 requested_bytes() const {
+    int64 requested_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      requested_bytes += exec.requested_bytes();
+    }
+    return requested_bytes;
+  }
+  int64 peak_bytes() const {
+    int64 peak_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      peak_bytes += exec.peak_bytes();
+    }
+    return peak_bytes;
+  }
+  int64 residual_bytes() const {
+    int64 residual_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      residual_bytes += exec.residual_bytes();
+    }
+    return residual_bytes;
+  }
+  int64 output_bytes() const {
+    int64 output_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      output_bytes += exec.output_bytes();
+    }
+    return output_bytes;
+  }
   int64 accelerator_temp_bytes() const {
-    return exec_.accelerator_temp_bytes();
+    int64 accelerator_temp_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      accelerator_temp_bytes += exec.accelerator_temp_bytes();
+    }
+    return accelerator_temp_bytes;
+  }
+  int64 host_temp_bytes() const {
+    int64 host_temp_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      host_temp_bytes += exec.host_temp_bytes();
+    }
+    return host_temp_bytes;
   }
-  int64 host_temp_bytes() const { return exec_.host_temp_bytes(); }
   int64 accelerator_persistent_bytes() const {
-    return exec_.accelerator_persistent_bytes();
+    int64 accelerator_persistent_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      accelerator_persistent_bytes += exec.accelerator_persistent_bytes();
+    }
+    return accelerator_persistent_bytes;
   }
-  int64 host_persistent_bytes() const { return exec_.host_persistent_bytes(); }
-  const std::map<int32, std::pair<int64, uint64>>& output_memory() const {
-    return output_memory_;
+  int64 host_persistent_bytes() const {
+    int64 host_persistent_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      host_persistent_bytes += exec.host_persistent_bytes();
+    }
+    return host_persistent_bytes;
   }
-  int64 allocator_bytes_in_use() const {
-    return exec_.allocator_bytes_in_use();
+  std::map<int64, int64> allocator_bytes_in_use() const {
+    std::map<int64, int64> bytes_in_use;
+    for (const ExecMemory& exec : memory_execs_) {
+      bytes_in_use[exec.memory_micros()] = exec.allocator_bytes_in_use();
+    }
+    return bytes_in_use;
   }
 
-  const std::vector<Allocation>& allocations() const { return allocations_; }
+  const std::vector<AllocationRecord>& allocations() const {
+    return allocations_;
+  }
 
   const ExecProfile& ToProto() {
     exec_.mutable_accelerator_execs()->clear();
@@ -169,19 +214,15 @@ class ExecStep {
     for (const string& d : devices_) {
       exec_.add_devices(d);
     }
-
-    exec_.mutable_output_memory()->clear();
-    for (const auto& mem : output_memory_) {
-      auto& mem_pb = (*exec_.mutable_output_memory())[mem.first];
-      mem_pb.set_bytes(mem.second.first);
-      mem_pb.set_ptr(mem.second.second);
-    }
-
     exec_.mutable_allocations()->Clear();
     for (const auto& r : allocations_) {
       exec_.add_allocations()->MergeFrom(r);
     }
 
+    exec_.mutable_memory_execs()->Clear();
+    for (const auto& m : memory_execs_) {
+      exec_.add_memory_execs()->MergeFrom(m);
+    }
     return exec_;
   }
 
@@ -197,6 +238,7 @@ class ExecStep {
     op_execs_.clear();
 
     allocations_.clear();
+    memory_execs_.clear();
 
     for (const auto& exec_time : exec_.accelerator_execs()) {
       auto& exec = accelerator_execs_[exec_time.first];
@@ -214,15 +256,12 @@ class ExecStep {
         op_exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
       }
     }
-    for (const auto& output_mem : exec_.output_memory()) {
-      auto& mem = output_memory_[output_mem.first];
-      mem.first = output_mem.second.bytes();
-      mem.second = output_mem.second.ptr();
-    }
-
     for (const auto& r : exec_.allocations()) {
       allocations_.push_back(r);
     }
+    for (const auto& m : exec_.memory_execs()) {
+      memory_execs_.push_back(m);
+    }
   }
 
  private:
@@ -237,14 +276,15 @@ class ExecStep {
   std::map<string, std::vector<std::pair<int64, int64>>> cpu_execs_;
   // combines accelerator_execs_ and cpu_execs_.
   std::map<string, std::vector<std::pair<int64, int64>>> op_execs_;
+  // Each ExecMemory corresponds to one scheduling of the op. Normally,
+  // there are multiple schedulings in while_loop.
+  std::vector<ExecMemory> memory_execs_;
   // All devices the op is associated with (e.g. gpu:0 (scheduling),
   // gpu:0:stream:xx (kernel exec), cpu:0 host)
   std::set<string> devices_;
-  // output_idx -> {output_bytes, memory_ptr}
-  std::map<int32, std::pair<int64, uint64>> output_memory_;
 
   // The history of accelerator allocations and deallocations of this step.
-  std::vector<Allocation> allocations_;
+  std::vector<AllocationRecord> allocations_;
 };
 
 #define GRAPH_NODE_BYTES(type)             \
@@ -593,34 +633,20 @@ class TFGraphNode {
   int64 accelerator_persistent_bytes() const {
     int64 persistent_bytes = 0;
     for (const auto& exec : execs_) {
-      persistent_bytes += exec.second.accelerator_persistent_bytes();
+      persistent_bytes = std::max(persistent_bytes,
+                                  exec.second.accelerator_persistent_bytes());
     }
     return persistent_bytes;
   }
-  int64 host_persistent_bytes(int64 step) const {
+  const std::map<int64, int64> allocator_bytes_in_use(int64 step) const {
     auto exec = execs_.find(step);
     if (exec == execs_.end()) {
-      return 0;
-    }
-    return exec->second.host_persistent_bytes();
-  }
-  const std::map<int32, std::pair<int64, uint64>>& output_memory(
-      int64 step) const {
-    auto exec = execs_.find(step);
-    if (exec == execs_.end()) {
-      return empty_output_memory_;
-    }
-    return exec->second.output_memory();
-  }
-  int64 allocator_bytes_in_use(int64 step) const {
-    auto exec = execs_.find(step);
-    if (exec == execs_.end()) {
-      return 0;
+      return empty_bytes_in_use_;
     }
     return exec->second.allocator_bytes_in_use();
   }
 
-  const std::vector<Allocation>& allocations(int64 step) const {
+  const std::vector<AllocationRecord>& allocations(int64 step) const {
     auto exec = execs_.find(step);
     if (exec == execs_.end()) {
       return empty_allocations_;
@@ -725,9 +751,9 @@ class TFGraphNode {
   std::map<int64, ExecStep> execs_;
 
   // Placeholder for empty cases.
-  std::map<int32, std::pair<int64, uint64>> empty_output_memory_;
+  std::map<int64, int64> empty_bytes_in_use_;
   std::map<string, std::vector<std::pair<int64, int64>>> empty_execs_;
-  std::vector<Allocation> empty_allocations_;
+  std::vector<AllocationRecord> empty_allocations_;
 };
 
 class TFMultiGraphNode {
@@ -880,6 +906,7 @@ class TFMultiGraphNode {
   std::map<string, const TFGraphNode*> nodes_;
 };
 
+bool IsPlacedOnCPU(const string& device);
 bool IsPlacedOnAccelerator(const string& device);
 bool CountAsAcceleratorTime(const string& device);
 bool CountAsCPUTime(const string& device);
@@ -888,4 +915,4 @@ bool IsCanonicalDevice(const string& device);
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_node_show.h b/tensorflow/core/profiler/internal/tfprof_node_show.h
index 3788bf3e80dd891d0ff6d71fd029b347c89f999a..517da67d74c5663ecea4fb914ef0940590400489 100644
--- a/tensorflow/core/profiler/internal/tfprof_node_show.h
+++ b/tensorflow/core/profiler/internal/tfprof_node_show.h
@@ -21,8 +21,8 @@ limitations under the License.
 // ScopeNode and GraphNode each maps to one TFGraphNode.
 // CodeNode and OpNode each maps to one TFMultiGraphNode.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_SHOW_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_SHOW_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_SHOW_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_SHOW_H_
 
 #include <algorithm>
 #include <string>
@@ -32,8 +32,8 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/profiler/internal/tfprof_constants.h"
 #include "tensorflow/core/profiler/internal/tfprof_node.h"
-#include "tensorflow/core/profiler/internal/tfprof_options.h"
 #include "tensorflow/core/profiler/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/tfprof_options.h"
 #include "tensorflow/core/profiler/tfprof_output.pb.h"
 
 namespace tensorflow {
@@ -156,4 +156,4 @@ class OpNode : public ShowMultiNode {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_SHOW_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_NODE_SHOW_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_op.cc b/tensorflow/core/profiler/internal/tfprof_op.cc
index 5a8429d4893effc8bbfa0bf69e18b4a182e9a5df..3dce1d85db35436d162e73bf0946b320b899d5eb 100644
--- a/tensorflow/core/profiler/internal/tfprof_op.cc
+++ b/tensorflow/core/profiler/internal/tfprof_op.cc
@@ -113,8 +113,9 @@ const ShowMultiNode* TFOp::ShowInternal(const Options& opts,
     root_->formatted_str = FormatNode(root_.get(), root_.get(), opts);
   }
   if (timeline) {
-    fprintf(stderr, "op view doesn't support timeline yet. "
-                    "Consider graph/scope/code view.\n");
+    fprintf(stderr,
+            "op view doesn't support timeline yet. "
+            "Consider graph/scope/code view.\n");
     return root_.get();
   }
   if (cnodes_map_.empty()) {
@@ -265,9 +266,9 @@ string TFOp::FormatNode(OpNode* node, OpNode* root, const Options& opts) const {
     double pct = 0.0;
     if (node->proto().total_parameters() > 0) {
       accu_pct = 100.0 * node->proto().total_parameters() /
-          root->proto().total_parameters();
-      pct = 100.0 * node->proto().parameters() /
-          root->proto().total_parameters();
+                 root->proto().total_parameters();
+      pct =
+          100.0 * node->proto().parameters() / root->proto().total_parameters();
     }
     attrs.push_back(strings::Printf(
         "%30s",
@@ -282,9 +283,8 @@ string TFOp::FormatNode(OpNode* node, OpNode* root, const Options& opts) const {
     double pct = 0.0;
     if (node->proto().total_float_ops() > 0) {
       accu_pct = 100.0 * node->proto().total_float_ops() /
-          root->proto().total_float_ops();
-      pct = 100.0 * node->proto().float_ops() /
-          root->proto().total_float_ops();
+                 root->proto().total_float_ops();
+      pct = 100.0 * node->proto().float_ops() / root->proto().total_float_ops();
     }
 
     attrs.push_back(strings::Printf(
diff --git a/tensorflow/core/profiler/internal/tfprof_op.h b/tensorflow/core/profiler/internal/tfprof_op.h
index 55a346c7e8d64ab139ab565ded39a745621d361a..aa22182d36cac8d7e1f9fb3143beadfdfe0efce6 100644
--- a/tensorflow/core/profiler/internal/tfprof_op.h
+++ b/tensorflow/core/profiler/internal/tfprof_op.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Build a flat structure of ops.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OP_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OP_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OP_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OP_H_
 
 #include <deque>
 #include <map>
@@ -29,9 +29,9 @@ limitations under the License.
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/profiler/internal/tfprof_node.h"
-#include "tensorflow/core/profiler/internal/tfprof_options.h"
 #include "tensorflow/core/profiler/internal/tfprof_show_multi.h"
 #include "tensorflow/core/profiler/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/tfprof_options.h"
 #include "tensorflow/core/profiler/tfprof_output.pb.h"
 
 namespace tensorflow {
@@ -41,8 +41,7 @@ namespace tfprof {
 // to input ops.
 class TFOp : public TFMultiShow {
  public:
-  explicit TFOp()
-      : TFMultiShow() {}
+  explicit TFOp() : TFMultiShow() {}
   ~TFOp() override {}
 
   void AddNode(TFGraphNode* node) override;
@@ -51,7 +50,7 @@ class TFOp : public TFMultiShow {
 
  private:
   const ShowMultiNode* ShowInternal(const Options& opts,
-                                   Timeline* timeline) override;
+                                    Timeline* timeline) override;
 
   int64 SearchRoot(const std::vector<OpNode*> nodes,
                    const std::vector<string>& regexes);
@@ -76,4 +75,4 @@ class TFOp : public TFMultiShow {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OP_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OP_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_scope.h b/tensorflow/core/profiler/internal/tfprof_scope.h
index 710991dde6bcda4b10c69124991aa5ba32026f16..235dfde803fa45146484870dfc46ebda367dc29c 100644
--- a/tensorflow/core/profiler/internal/tfprof_scope.h
+++ b/tensorflow/core/profiler/internal/tfprof_scope.h
@@ -17,8 +17,8 @@ limitations under the License.
 // For example, 'name1/name2' is a child of 'name1'.
 // Stats are aggregated from descendants to ancestors.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SCOPE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SCOPE_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SCOPE_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SCOPE_H_
 
 #include <map>
 #include <memory>
@@ -29,9 +29,9 @@ limitations under the License.
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/profiler/internal/tfprof_node.h"
-#include "tensorflow/core/profiler/internal/tfprof_options.h"
 #include "tensorflow/core/profiler/internal/tfprof_show.h"
 #include "tensorflow/core/profiler/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/tfprof_options.h"
 #include "tensorflow/core/profiler/tfprof_output.pb.h"
 
 namespace tensorflow {
@@ -74,4 +74,4 @@ class TFScope : public TFShow {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SCOPE_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SCOPE_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_show.cc b/tensorflow/core/profiler/internal/tfprof_show.cc
index cf28876089d21f0f8118fbbe0cd51a616e97cbc8..f09cd1dad99de1075d045afc5d413dc33080c70c 100644
--- a/tensorflow/core/profiler/internal/tfprof_show.cc
+++ b/tensorflow/core/profiler/internal/tfprof_show.cc
@@ -25,19 +25,19 @@ limitations under the License.
 namespace tensorflow {
 namespace tfprof {
 
-const GraphNodeProto& TFShow::Show(const Options& opts) {
+const GraphNodeProto& TFShow::Show(const string& prefix, const Options& opts) {
   if (opts.output_type == kOutput[0]) {
     Timeline timeline(opts.step, opts.output_options.at(kTimelineOpts[0]));
     return ShowInternal(opts, &timeline)->proto();
   } else {
     const ShowNode* ret = ShowInternal(opts, nullptr);
     if (opts.output_type == kOutput[1]) {
-      printf("%s", ret->formatted_str.c_str());
+      printf("%s", (prefix + ret->formatted_str).c_str());
       fflush(stdout);
     } else if (opts.output_type == kOutput[2]) {
       Status s = WriteStringToFile(Env::Default(),
                                    opts.output_options.at(kFileOpts[0]),
-                                   ret->formatted_str);
+                                   prefix + ret->formatted_str);
       if (!s.ok()) {
         fprintf(stderr, "%s\n", s.ToString().c_str());
       }
diff --git a/tensorflow/core/profiler/internal/tfprof_show.h b/tensorflow/core/profiler/internal/tfprof_show.h
index 08c231bad7f216e0e00322c095017b3f0356f64a..81b021549a49625cd5ba4a6ba8130f12cc7cf5f7 100644
--- a/tensorflow/core/profiler/internal/tfprof_show.h
+++ b/tensorflow/core/profiler/internal/tfprof_show.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Parent class and utilities for tfprof_graph and tfprof_scope.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_H_
 
 #include <algorithm>
 #include <string>
@@ -29,10 +29,10 @@ limitations under the License.
 #include "tensorflow/core/profiler/internal/tfprof_constants.h"
 #include "tensorflow/core/profiler/internal/tfprof_node.h"
 #include "tensorflow/core/profiler/internal/tfprof_node_show.h"
-#include "tensorflow/core/profiler/internal/tfprof_options.h"
 #include "tensorflow/core/profiler/internal/tfprof_tensor.h"
 #include "tensorflow/core/profiler/internal/tfprof_timeline.h"
 #include "tensorflow/core/profiler/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/tfprof_options.h"
 #include "tensorflow/core/profiler/tfprof_output.pb.h"
 
 namespace tensorflow {
@@ -44,7 +44,8 @@ class TFShow {
   virtual ~TFShow() {}
   virtual void AddNode(TFGraphNode* node) = 0;
   virtual void Build() = 0;
-  const GraphNodeProto& Show(const Options& opts);
+  virtual const GraphNodeProto& Show(const string& prefix,
+                                     const Options& opts) final;
 
  protected:
   virtual const ShowNode* ShowInternal(const Options& opts,
@@ -77,40 +78,43 @@ class TFShow {
       return nodes;
     }
     std::vector<T*> sorted_nodes = nodes;
-    std::sort(sorted_nodes.begin(), sorted_nodes.end(), [&opts](const T* n1,
-                                                                const T* n2) {
-      if (n1->name() == kTFProfRoot) return true;
-      if (n2->name() == kTFProfRoot) return false;
-      bool name_cmp = n1->name() < n2->name();
-      if (opts.order_by == kOrderBy[0]) {
-        return name_cmp;
-      } else if (opts.order_by == kOrderBy[1]) {
-        return n1->proto().total_requested_bytes() >
-               n2->proto().total_requested_bytes();
-      } else if (opts.order_by == kOrderBy[2]) {
-        return n1->proto().total_peak_bytes() > n2->proto().total_peak_bytes();
-      } else if (opts.order_by == kOrderBy[3]) {
-        return n1->proto().total_residual_bytes() >
-               n2->proto().total_residual_bytes();
-      } else if (opts.order_by == kOrderBy[4]) {
-        return n1->proto().total_output_bytes() >
-               n2->proto().total_output_bytes();
-      } else if (opts.order_by == kOrderBy[5]) {
-        return n1->proto().total_exec_micros() >
-               n2->proto().total_exec_micros();
-      } else if (opts.order_by == kOrderBy[6]) {
-        return n1->proto().total_accelerator_exec_micros() >
-               n2->proto().total_accelerator_exec_micros();
-      } else if (opts.order_by == kOrderBy[7]) {
-        return n1->proto().total_cpu_exec_micros() >
-               n2->proto().total_cpu_exec_micros();
-      } else if (opts.order_by == kOrderBy[8]) {
-        return n1->proto().total_parameters() > n2->proto().total_parameters();
-      } else if (opts.order_by == kOrderBy[9]) {
-        return n1->proto().total_float_ops() > n2->proto().total_float_ops();
-      }
-      return name_cmp;
-    });
+    std::sort(sorted_nodes.begin(), sorted_nodes.end(),
+              [&opts](const T* n1, const T* n2) {
+                if (n1->name() == kTFProfRoot) return true;
+                if (n2->name() == kTFProfRoot) return false;
+                bool name_cmp = n1->name() < n2->name();
+                if (opts.order_by == kOrderBy[0]) {
+                  return name_cmp;
+                } else if (opts.order_by == kOrderBy[1]) {
+                  return n1->proto().total_requested_bytes() >
+                         n2->proto().total_requested_bytes();
+                } else if (opts.order_by == kOrderBy[2]) {
+                  return n1->proto().total_peak_bytes() >
+                         n2->proto().total_peak_bytes();
+                } else if (opts.order_by == kOrderBy[3]) {
+                  return n1->proto().total_residual_bytes() >
+                         n2->proto().total_residual_bytes();
+                } else if (opts.order_by == kOrderBy[4]) {
+                  return n1->proto().total_output_bytes() >
+                         n2->proto().total_output_bytes();
+                } else if (opts.order_by == kOrderBy[5]) {
+                  return n1->proto().total_exec_micros() >
+                         n2->proto().total_exec_micros();
+                } else if (opts.order_by == kOrderBy[6]) {
+                  return n1->proto().total_accelerator_exec_micros() >
+                         n2->proto().total_accelerator_exec_micros();
+                } else if (opts.order_by == kOrderBy[7]) {
+                  return n1->proto().total_cpu_exec_micros() >
+                         n2->proto().total_cpu_exec_micros();
+                } else if (opts.order_by == kOrderBy[8]) {
+                  return n1->proto().total_parameters() >
+                         n2->proto().total_parameters();
+                } else if (opts.order_by == kOrderBy[9]) {
+                  return n1->proto().total_float_ops() >
+                         n2->proto().total_float_ops();
+                }
+                return name_cmp;
+              });
     return sorted_nodes;
   }
 
@@ -150,4 +154,4 @@ string FormatAcceleratorExecTime(const T* node, const Options& opts) {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_show_multi.cc b/tensorflow/core/profiler/internal/tfprof_show_multi.cc
index eb826a7137618ba964f6b58f225e0921ea7f5c33..7c65d48d4a148399a1f9b4b6ec1a9058166d9cf5 100644
--- a/tensorflow/core/profiler/internal/tfprof_show_multi.cc
+++ b/tensorflow/core/profiler/internal/tfprof_show_multi.cc
@@ -27,19 +27,20 @@ limitations under the License.
 namespace tensorflow {
 namespace tfprof {
 
-const MultiGraphNodeProto& TFMultiShow::Show(const Options& opts) {
+const MultiGraphNodeProto& TFMultiShow::Show(const string& prefix,
+                                             const Options& opts) {
   if (opts.output_type == kOutput[0]) {
     Timeline timeline(opts.step, opts.output_options.at(kTimelineOpts[0]));
     return ShowInternal(opts, &timeline)->proto();
   } else {
     const ShowMultiNode* ret = ShowInternal(opts, nullptr);
     if (opts.output_type == kOutput[1]) {
-      printf("%s", ret->formatted_str.c_str());
+      printf("%s", (prefix + ret->formatted_str).c_str());
       fflush(stdout);
     } else if (opts.output_type == kOutput[2]) {
       Status s = WriteStringToFile(Env::Default(),
                                    opts.output_options.at(kFileOpts[0]),
-                                   ret->formatted_str);
+                                   prefix + ret->formatted_str);
       if (!s.ok()) {
         fprintf(stderr, "%s\n", s.ToString().c_str());
       }
diff --git a/tensorflow/core/profiler/internal/tfprof_show_multi.h b/tensorflow/core/profiler/internal/tfprof_show_multi.h
index a632c669336b02106c0c2883c22157b05040f189..711d35f9753cf85f7f318a9ac3de40d6d2bf786e 100644
--- a/tensorflow/core/profiler/internal/tfprof_show_multi.h
+++ b/tensorflow/core/profiler/internal/tfprof_show_multi.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Parent class and utilities for tfprof_code.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_MULTI_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_MULTI_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_MULTI_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_MULTI_H_
 
 #include <algorithm>
 #include <string>
@@ -29,11 +29,11 @@ limitations under the License.
 #include "tensorflow/core/profiler/internal/tfprof_constants.h"
 #include "tensorflow/core/profiler/internal/tfprof_node.h"
 #include "tensorflow/core/profiler/internal/tfprof_node_show.h"
-#include "tensorflow/core/profiler/internal/tfprof_options.h"
 #include "tensorflow/core/profiler/internal/tfprof_show.h"
 #include "tensorflow/core/profiler/internal/tfprof_tensor.h"
 #include "tensorflow/core/profiler/internal/tfprof_timeline.h"
 #include "tensorflow/core/profiler/internal/tfprof_utils.h"
+#include "tensorflow/core/profiler/tfprof_options.h"
 #include "tensorflow/core/profiler/tfprof_output.pb.h"
 
 namespace tensorflow {
@@ -45,11 +45,12 @@ class TFMultiShow {
   virtual ~TFMultiShow() {}
   virtual void AddNode(TFGraphNode* node) = 0;
   virtual void Build() = 0;
-  const MultiGraphNodeProto& Show(const Options& opts);
+  virtual const MultiGraphNodeProto& Show(const string& prefix,
+                                          const Options& opts) final;
 
  protected:
   virtual const ShowMultiNode* ShowInternal(const Options& opts,
-                                           Timeline* timeline) = 0;
+                                            Timeline* timeline) = 0;
 
   bool LookUpCheckPoint(const string& name,
                         std::unique_ptr<TFProfTensor>* tensor);
@@ -126,4 +127,4 @@ class TFMultiShow {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_MULTI_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_SHOW_MULTI_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_show_test.cc b/tensorflow/core/profiler/internal/tfprof_show_test.cc
index 1f19f8c322a15a726ce354ecf991ea902788d97b..625f64cae5e0040d93ac0bf1c5b5d0788af74ba1 100644
--- a/tensorflow/core/profiler/internal/tfprof_show_test.cc
+++ b/tensorflow/core/profiler/internal/tfprof_show_test.cc
@@ -23,14 +23,21 @@ limitations under the License.
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/profiler/internal/tfprof_constants.h"
-#include "tensorflow/core/profiler/internal/tfprof_options.h"
 #include "tensorflow/core/profiler/internal/tfprof_utils.h"
 #include "tensorflow/core/profiler/tfprof_log.pb.h"
+#include "tensorflow/core/profiler/tfprof_options.h"
 #include "tensorflow/core/profiler/tfprof_output.pb.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
+
+string CheckAndRemoveDoc(const string& doc) {
+  auto pos = doc.find("Profile:");
+  CHECK(pos != doc.npos);
+  return doc.substr(pos + 9);
+}
+
 class TFProfShowTest : public ::testing::Test {
  protected:
   TFProfShowTest() {
@@ -105,13 +112,14 @@ TEST_F(TFProfShowTest, DumpScopeMode) {
       "node name | # parameters | # float_ops | requested bytes | peak bytes | "
       "residual bytes | output bytes | total execution time | accelerator "
       "execution time | cpu execution time\n_TFProfRoot (--/451 params, --/0 "
-      "flops, --/0B, --/0B, --/0B, --/2.56KB, --/13us, --/0us, --/13us)\n  DW "
-      "(3x3x3x6, 162/162 params, 0/0 flops, 0B/0B, 0B/0B, 0B/0B, "
-      "1.28KB/1.28KB, 2us/2us, 0us/0us, 2us/2us)\n  DW2 (2x2x6x12, 288/288 "
-      "params, 0/0 flops, 0B/0B, 0B/0B, 0B/0B, 1.28KB/1.28KB, 11us/11us, "
-      "0us/0us, 11us/11us)\n  ScalarW (1, 1/1 params, 0/0 flops, 0B/0B, 0B/0B, "
-      "0B/0B, 0B/0B, 0us/0us, 0us/0us, 0us/0us)\n",
-      dump_str);
+      "flops, --/2.56KB, --/2.56KB, --/2.56KB, --/2.56KB, --/13us, --/0us, "
+      "--/13us)\n  DW (3x3x3x6, 162/162 params, 0/0 flops, 1.28KB/1.28KB, "
+      "1.28KB/1.28KB, 1.28KB/1.28KB, 1.28KB/1.28KB, 2us/2us, 0us/0us, "
+      "2us/2us)\n  DW2 (2x2x6x12, 288/288 params, 0/0 flops, 1.28KB/1.28KB, "
+      "1.28KB/1.28KB, 1.28KB/1.28KB, 1.28KB/1.28KB, 11us/11us, 0us/0us, "
+      "11us/11us)\n  ScalarW (1, 1/1 params, 0/0 flops, 0B/0B, 0B/0B, 0B/0B, "
+      "0B/0B, 0us/0us, 0us/0us, 0us/0us)\n",
+      CheckAndRemoveDoc(dump_str));
 
   EXPECT_EQ(dump_str, TestToFromProto("scope", opts));
 }
@@ -158,7 +166,7 @@ TEST_F(TFProfShowTest, DumpAcceleratorAndCPUMicros) {
       "0us/0us)\n        ScalarW/Initializer/random_normal/stddev (0us/0us, "
       "0us/0us)\n    ScalarW/read (0us/0us, 0us/0us)\n  init (0us/0us, "
       "0us/0us)\n",
-      dump_str);
+      CheckAndRemoveDoc(dump_str));
 
   EXPECT_EQ(dump_str, TestToFromProto("scope", opts));
 }
@@ -178,22 +186,22 @@ TEST_F(TFProfShowTest, DumpOpMode) {
   EXPECT_EQ(
       "nodename|requestedbytes|totalexecutiontime|acceleratorexecutiontime|"
       "cpuexecutiontime|#parameters|#float_ops|opoccurrence(run|defined)|"
-      "inputshapes\nVariableV20B(0.00%,0.00%),13us(100.00%,0.26%),0us(100.00%,"
-      "0.00%),13us(100.00%,0.29%),451params(100.00%,100.00%),0float_ops(100.00%"
-      ",0.00%),2|3\n\ninput_type:\t(run*2|defined*3)\texec_time:13us\n\nAdd0B("
-      "0.00%,0.00%),0us(99.74%,0.00%),0us(100.00%,0.00%),0us(99.71%,0.00%),"
-      "0params(0.00%,0.00%),0float_ops(100.00%,0.00%),0|3\n\ninput_type:0:1,"
-      "\t1:1\t(run*0|defined*1)\texec_time:0us\ninput_type:0:2x2x6x12,\t1:1\t("
-      "run*0|defined*1)\texec_time:0us\ninput_type:0:3x3x3x6,\t1:1\t(run*0|"
-      "defined*1)\texec_time:0us\n\nAssign0B(0.00%,0.00%),0us(99.74%,0.00%),"
-      "0us(100.00%,0.00%),0us(99.71%,0.00%),0params(0.00%,0.00%),0float_ops("
-      "100.00%,0.00%),0|3\n\ninput_type:0:1,\t1:1\t(run*0|defined*1)\texec_"
+      "inputshapes\nVariableV22.56KB(100.00%,8.40%),13us(100.00%,0.26%),0us("
+      "100.00%,0.00%),13us(100.00%,0.29%),451params(100.00%,100.00%),0float_"
+      "ops(100.00%,0.00%),2|3\n\ninput_type:\t(run*2|defined*3)\texec_time:"
+      "13us\n\nAdd0B(0.00%,0.00%),0us(99.74%,0.00%),0us(100.00%,0.00%),0us(99."
+      "71%,0.00%),0params(0.00%,0.00%),0float_ops(100.00%,0.00%),0|3\n\ninput_"
+      "type:0:1,\t1:1\t(run*0|defined*1)\texec_time:0us\ninput_type:0:2x2x6x12,"
+      "\t1:1\t(run*0|defined*1)\texec_time:0us\ninput_type:0:3x3x3x6,\t1:1\t("
+      "run*0|defined*1)\texec_time:0us\n\nAssign0B(0.00%,0.00%),0us(99.74%,0."
+      "00%),0us(100.00%,0.00%),0us(99.71%,0.00%),0params(0.00%,0.00%),0float_"
+      "ops(100.00%,0.00%),0|3\n\ninput_type:0:1,\t1:1\t(run*0|defined*1)\texec_"
       "time:0us\ninput_type:0:2x2x6x12,\t1:2x2x6x12\t(run*0|defined*1)\texec_"
       "time:0us\ninput_type:0:3x3x3x6,\t1:3x3x3x6\t(run*0|defined*1)\texec_"
       "time:0us\n\nConst0B(0.00%,0.00%),2us(99.74%,0.04%),0us(100.00%,0.00%),"
       "2us(99.71%,0.04%),0params(0.00%,0.00%),0float_ops(100.00%,0.00%),1|"
-      "10\n\ninput_type:\t(run*1|defined*10)\texec_time:2us\n\nConv2D14.59KB("
-      "100.00%,100.00%),4.89ms(99.70%,98.87%),404us(100.00%,100.00%),4.49ms(99."
+      "10\n\ninput_type:\t(run*1|defined*10)\texec_time:2us\n\nConv2D27.90KB("
+      "91.60%,91.60%),4.89ms(99.70%,98.87%),404us(100.00%,100.00%),4.49ms(99."
       "67%,98.77%),0params(0.00%,0.00%),10.44kfloat_ops(100.00%,100.00%),2|"
       "2\n\ninput_type:0:2x3x3x6,\t1:2x2x6x12\t(run*1|defined*1)\texec_time:"
       "597us\ninput_type:0:2x6x6x3,\t1:3x3x3x6\t(run*1|defined*1)\texec_time:4."
@@ -202,7 +210,7 @@ TEST_F(TFProfShowTest, DumpOpMode) {
       "type:0:1\t(run*0|defined*1)\texec_time:0us\ninput_type:0:2x2x6x12\t(run*"
       "0|defined*1)\texec_time:0us\ninput_type:0:3x3x3x6\t(run*0|defined*1)"
       "\texec_time:0us\n\n",
-      StringReplace(dump_str, " ", ""));
+      StringReplace(CheckAndRemoveDoc(dump_str), " ", ""));
 
   EXPECT_EQ(dump_str, TestToFromProto("op", opts, true));
 }
diff --git a/tensorflow/core/profiler/internal/tfprof_stats.cc b/tensorflow/core/profiler/internal/tfprof_stats.cc
index 7943c075e0243e652cb19125dae95b04dc709f97..5b91309c800fe877ddd45413be4b32125cf7980d 100644
--- a/tensorflow/core/profiler/internal/tfprof_stats.cc
+++ b/tensorflow/core/profiler/internal/tfprof_stats.cc
@@ -26,6 +26,9 @@ limitations under the License.
 namespace tensorflow {
 namespace tfprof {
 namespace {
+
+const char* const kProfilePrefix = "Profile:\n";
+
 bool CreateRunMetadataNode(const string& name, NodeDef* def) {
   // TODO(xpan): Better solution than blacklisting this 2 nodes. They
   // actually cost some resources, maybe include them. Some nodes, such
@@ -48,6 +51,7 @@ TFStats::TFStats(std::unique_ptr<GraphDef> graph,
                  std::unique_ptr<OpLogProto> op_log,
                  std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader)
     : has_code_traces_(false),
+      miss_accelerator_stream_(false),
       ckpt_reader_(std::move(ckpt_reader)) {
   CHECK(graph) << "Must at least have GraphDef";
 
@@ -70,7 +74,9 @@ TFStats::TFStats(std::unique_ptr<GraphDef> graph,
 
 TFStats::TFStats(const string& filename,
                  std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader)
-    : has_code_traces_(false), ckpt_reader_(std::move(ckpt_reader)) {
+    : has_code_traces_(false),
+      miss_accelerator_stream_(false),
+      ckpt_reader_(std::move(ckpt_reader)) {
   string str;
   Status s = ReadFileToString(Env::Default(), filename, &str);
   if (!s.ok()) {
@@ -141,18 +147,21 @@ const GraphNodeProto& TFStats::ShowGraphNode(const string& cmd,
   if (!Validate(opts)) {
     return empty_graph_node_;
   }
+  string prefix = MaybeReportMissingTrace();
+  prefix += QueryDoc(cmd, opts) + kProfilePrefix;
+
   if (cmd == kCmds[0]) {
-    return scope_view_->Show(opts);
+    return scope_view_->Show(prefix, opts);
   } else if (cmd == kCmds[1]) {
     if (opts.step < 0 && opts.output_type == kOutput[0]) {
       for (int64 step : steps_) {
         Options nopts = opts;
         nopts.step = step;
-        graph_view_->Show(nopts);
+        graph_view_->Show(prefix, nopts);
       }
       return empty_graph_node_;
     }
-    return graph_view_->Show(opts);
+    return graph_view_->Show(prefix, opts);
   } else {
     fprintf(stderr, "Unknown command: %s\n", cmd.c_str());
     return empty_graph_node_;
@@ -164,14 +173,17 @@ const MultiGraphNodeProto& TFStats::ShowMultiGraphNode(
   if (!Validate(opts)) {
     return empty_multi_graph_node_;
   }
+  string prefix = MaybeReportMissingTrace();
+  prefix += QueryDoc(cmd, opts) + kProfilePrefix;
+
   if (cmd == kCmds[2]) {
     if (!has_code_traces()) {
       fprintf(stderr, "No code trace information\n");
       return empty_multi_graph_node_;
     }
-    return code_view_->Show(opts);
+    return code_view_->Show(prefix, opts);
   } else if (cmd == kCmds[3]) {
-    return op_view_->Show(opts);
+    return op_view_->Show(prefix, opts);
   } else {
     fprintf(stderr, "Unknown command: %s\n", cmd.c_str());
     return empty_multi_graph_node_;
@@ -258,7 +270,17 @@ void TFStats::AddRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta) {
   }
   steps_.insert(step);
 
+  bool has_gpu_scheduling = false;
+  bool has_gpu_stream = false;
+
   for (const auto& dev_stat : run_meta->step_stats().dev_stats()) {
+    string dev = str_util::Lowercase(dev_stat.device());
+    if (IsPlacedOnAccelerator(dev)) {
+      has_gpu_scheduling = true;
+      if (CountAsAcceleratorTime(dev)) {
+        has_gpu_stream = true;
+      }
+    }
     for (const NodeExecStats& node_stat : dev_stat.node_stats()) {
       string name = node_stat.node_name();
       // Sometimes the node_name is suffixed with unnecessary information.
@@ -280,9 +302,26 @@ void TFStats::AddRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta) {
       }
     }
   }
+
+  if (has_gpu_scheduling && !has_gpu_stream) {
+    miss_accelerator_stream_ = true;
+  }
 }
 
-void TFStats::WriteProfile(const string& filename) {
+string TFStats::MaybeReportMissingTrace() const {
+  string report = "";
+  if (miss_accelerator_stream_) {
+    report +=
+        "\n\nFound accelerator operation but misses accelerator "
+        "stream stats!\n\n"
+        "It's likely a gpu tracing issue rather than tf-profiler issue.\n"
+        "If you found your operation missing accelerator time, "
+        "consider filing a bug to xprof-dev@!\n\n";
+  }
+  return report;
+}
+
+void TFStats::SerializeToString(string* content) {
   ProfileProto profile;
   for (const auto& entry : id_to_string_) {
     (*profile.mutable_id_to_string())[entry.first] = entry.second;
@@ -296,11 +335,17 @@ void TFStats::WriteProfile(const string& filename) {
   }
 
   profile.set_has_trace(has_code_traces_);
+  profile.set_miss_accelerator_stream(miss_accelerator_stream_);
   for (int64 s : steps_) {
     profile.add_steps(s);
   }
-  Status s =
-      WriteStringToFile(Env::Default(), filename, profile.SerializeAsString());
+  *content = profile.SerializeAsString();
+}
+
+void TFStats::WriteProfile(const string& filename) {
+  string content;
+  SerializeToString(&content);
+  Status s = WriteStringToFile(Env::Default(), filename, content);
   if (!s.ok()) {
     fprintf(stderr, "%s\n", s.ToString().c_str());
   }
diff --git a/tensorflow/core/profiler/internal/tfprof_stats.h b/tensorflow/core/profiler/internal/tfprof_stats.h
index d46d9235560c673323d243a40f21bbd06aa9416d..db148c936c9746c773213a9a49803103814906d3 100644
--- a/tensorflow/core/profiler/internal/tfprof_stats.h
+++ b/tensorflow/core/profiler/internal/tfprof_stats.h
@@ -20,8 +20,8 @@ limitations under the License.
 // 3. Accept command and options to selectively aggregate stats for analysis
 //    and print out the results.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_STATS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_STATS_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_STATS_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_STATS_H_
 
 #include <map>
 #include <memory>
@@ -34,17 +34,17 @@ limitations under the License.
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
-#include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/profiler/internal/tfprof_code.h"
 #include "tensorflow/core/profiler/internal/tfprof_graph.h"
 #include "tensorflow/core/profiler/internal/tfprof_node.h"
 #include "tensorflow/core/profiler/internal/tfprof_op.h"
-#include "tensorflow/core/profiler/internal/tfprof_options.h"
 #include "tensorflow/core/profiler/internal/tfprof_scope.h"
 #include "tensorflow/core/profiler/internal/tfprof_show.h"
 #include "tensorflow/core/profiler/internal/tfprof_utils.h"
 #include "tensorflow/core/profiler/tfprof_log.pb.h"
+#include "tensorflow/core/profiler/tfprof_options.h"
 #include "tensorflow/core/profiler/tfprof_output.pb.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -83,7 +83,7 @@ class TFStats {
   const MultiGraphNodeProto& ShowMultiGraphNode(const string& cmd,
                                                 const Options& opts) const;
 
-  // A a (partial) graph to existing graph.
+  // Add a (partial) graph to existing graph.
   void AddGraph(std::unique_ptr<GraphDef> graph);
 
   // Add a step of run time meta data.
@@ -92,6 +92,7 @@ class TFStats {
   // and code traces.
   void AddOpLogProto(std::unique_ptr<OpLogProto> op_log);
 
+  void SerializeToString(string* content);
   void WriteProfile(const string& filename);
 
   // For test purpose only.
@@ -99,9 +100,11 @@ class TFStats {
 
  private:
   bool Validate(const Options& opts) const;
+  string MaybeReportMissingTrace() const;
 
   std::set<int64> steps_;
   bool has_code_traces_;
+  bool miss_accelerator_stream_;
   std::unique_ptr<TFScope> scope_view_;
   std::unique_ptr<TFGraph> graph_view_;
   std::unique_ptr<TFCode> code_view_;
@@ -115,11 +118,11 @@ class TFStats {
   MultiGraphNodeProto empty_multi_graph_node_;
 
   std::map<int64, string> id_to_string_;
-  // Graph nodes covered by RunMetdata, that is traced with run time stats.
+  // Graph nodes covered by RunMetadata, that is traced with run time stats.
   std::set<int64> covered_nodes_;
 };
 
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_STATS_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_STATS_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_stats_test.cc b/tensorflow/core/profiler/internal/tfprof_stats_test.cc
index 2f2101d76bfd4c0741fff0eb9762444cd8b6fd92..564278c9963836f1e8486cbbdc0901b782ae2f61 100644
--- a/tensorflow/core/profiler/internal/tfprof_stats_test.cc
+++ b/tensorflow/core/profiler/internal/tfprof_stats_test.cc
@@ -24,9 +24,9 @@ limitations under the License.
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/profiler/internal/tfprof_constants.h"
-#include "tensorflow/core/profiler/internal/tfprof_options.h"
 #include "tensorflow/core/profiler/internal/tfprof_utils.h"
 #include "tensorflow/core/profiler/tfprof_log.pb.h"
+#include "tensorflow/core/profiler/tfprof_options.h"
 #include "tensorflow/core/profiler/tfprof_output.pb.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
@@ -89,21 +89,27 @@ TEST_F(TFProfStatsTest, CustomOpType) {
 
   GraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
-      "name: \"_TFProfRoot\"\ntotal_exec_micros: 13\ntotal_parameters: "
-      "451\nchildren {\n  name: \"DW\"\n  exec_micros: 2\n  parameters: 162\n  "
-      "total_exec_micros: 2\n  total_parameters: 162\n  devices: "
+      "name: \"_TFProfRoot\"\ntotal_exec_micros: 13\ntotal_requested_bytes: "
+      "2560\ntotal_parameters: 451\nchildren {\n  name: \"DW\"\n  exec_micros: "
+      "2\n  requested_bytes: 1280\n  parameters: 162\n  total_exec_micros: 2\n "
+      " total_requested_bytes: 1280\n  total_parameters: 162\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  cpu_exec_micros: 2\n  "
       "total_cpu_exec_micros: 2\n  run_count: 1\n  total_run_count: 1\n  "
-      "total_definition_count: 1\n  output_bytes: 1280\n  total_output_bytes: "
-      "1280\n}\nchildren {\n  name: \"DW2\"\n  exec_micros: 11\n  parameters: "
-      "288\n  total_exec_micros: 11\n  total_parameters: 288\n  devices: "
+      "total_definition_count: 1\n  peak_bytes: 1280\n  residual_bytes: 1280\n "
+      " output_bytes: 1280\n  total_peak_bytes: 1280\n  total_residual_bytes: "
+      "1280\n  total_output_bytes: 1280\n}\nchildren {\n  name: \"DW2\"\n  "
+      "exec_micros: 11\n  requested_bytes: 1280\n  parameters: 288\n  "
+      "total_exec_micros: 11\n  total_requested_bytes: 1280\n  "
+      "total_parameters: 288\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  cpu_exec_micros: 11\n  "
       "total_cpu_exec_micros: 11\n  run_count: 1\n  total_run_count: 1\n  "
-      "total_definition_count: 1\n  output_bytes: 1280\n  total_output_bytes: "
-      "1280\n}\nchildren {\n  name: \"ScalarW\"\n  parameters: 1\n  "
-      "total_parameters: 1\n  total_definition_count: "
+      "total_definition_count: 1\n  peak_bytes: 1280\n  residual_bytes: 1280\n "
+      " output_bytes: 1280\n  total_peak_bytes: 1280\n  total_residual_bytes: "
+      "1280\n  total_output_bytes: 1280\n}\nchildren {\n  name: \"ScalarW\"\n  "
+      "parameters: 1\n  total_parameters: 1\n  total_definition_count: "
       "1\n}\ntotal_cpu_exec_micros: 13\ntotal_run_count: "
-      "2\ntotal_definition_count: 3\ntotal_output_bytes: 2560\n",
+      "2\ntotal_definition_count: 3\ntotal_peak_bytes: "
+      "2560\ntotal_residual_bytes: 2560\ntotal_output_bytes: 2560\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
 
@@ -119,21 +125,27 @@ TEST_F(TFProfStatsTest, CheckPointOpType) {
 
   GraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
-      "name: \"_TFProfRoot\"\ntotal_exec_micros: 13\ntotal_parameters: "
-      "451\nchildren {\n  name: \"DW\"\n  exec_micros: 2\n  parameters: 162\n  "
-      "total_exec_micros: 2\n  total_parameters: 162\n  devices: "
+      "name: \"_TFProfRoot\"\ntotal_exec_micros: 13\ntotal_requested_bytes: "
+      "2560\ntotal_parameters: 451\nchildren {\n  name: \"DW\"\n  exec_micros: "
+      "2\n  requested_bytes: 1280\n  parameters: 162\n  total_exec_micros: 2\n "
+      " total_requested_bytes: 1280\n  total_parameters: 162\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  cpu_exec_micros: 2\n  "
       "total_cpu_exec_micros: 2\n  run_count: 1\n  total_run_count: 1\n  "
-      "total_definition_count: 1\n  output_bytes: 1280\n  total_output_bytes: "
-      "1280\n}\nchildren {\n  name: \"DW2\"\n  exec_micros: 11\n  parameters: "
-      "288\n  total_exec_micros: 11\n  total_parameters: 288\n  devices: "
+      "total_definition_count: 1\n  peak_bytes: 1280\n  residual_bytes: 1280\n "
+      " output_bytes: 1280\n  total_peak_bytes: 1280\n  total_residual_bytes: "
+      "1280\n  total_output_bytes: 1280\n}\nchildren {\n  name: \"DW2\"\n  "
+      "exec_micros: 11\n  requested_bytes: 1280\n  parameters: 288\n  "
+      "total_exec_micros: 11\n  total_requested_bytes: 1280\n  "
+      "total_parameters: 288\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  cpu_exec_micros: 11\n  "
       "total_cpu_exec_micros: 11\n  run_count: 1\n  total_run_count: 1\n  "
-      "total_definition_count: 1\n  output_bytes: 1280\n  total_output_bytes: "
-      "1280\n}\nchildren {\n  name: \"ScalarW\"\n  parameters: 1\n  "
-      "total_parameters: 1\n  total_definition_count: "
+      "total_definition_count: 1\n  peak_bytes: 1280\n  residual_bytes: 1280\n "
+      " output_bytes: 1280\n  total_peak_bytes: 1280\n  total_residual_bytes: "
+      "1280\n  total_output_bytes: 1280\n}\nchildren {\n  name: \"ScalarW\"\n  "
+      "parameters: 1\n  total_parameters: 1\n  total_definition_count: "
       "1\n}\ntotal_cpu_exec_micros: 13\ntotal_run_count: "
-      "2\ntotal_definition_count: 3\ntotal_output_bytes: 2560\n",
+      "2\ntotal_definition_count: 3\ntotal_peak_bytes: "
+      "2560\ntotal_residual_bytes: 2560\ntotal_output_bytes: 2560\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
 
@@ -150,7 +162,7 @@ TEST_F(TFProfStatsTest, TestGraph) {
   GraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\ntotal_exec_micros: 4945\ntotal_requested_bytes: "
-      "14592\ntotal_parameters: 451\nchildren {\n  name: "
+      "30464\ntotal_parameters: 451\nchildren {\n  name: "
       "\"DW/Initializer/random_normal/mul\"\n  children {\n    name: "
       "\"DW/Initializer/random_normal/RandomStandardNormal\"\n    children {\n "
       "     name: \"DW/Initializer/random_normal/shape\"\n      "
@@ -166,7 +178,7 @@ TEST_F(TFProfStatsTest, TestGraph) {
       "4\n}\ntotal_float_ops: 10440\ntotal_accelerator_exec_micros: "
       "404\ntotal_cpu_exec_micros: 4541\ntotal_run_count: "
       "6\ntotal_definition_count: 32\ntotal_peak_bytes: "
-      "9984\ntotal_residual_bytes: 1280\ntotal_output_bytes: 4864\n",
+      "25856\ntotal_residual_bytes: 3840\ntotal_output_bytes: 4864\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
 
@@ -181,9 +193,9 @@ TEST_F(TFProfStatsTest, TestFloatOps) {
   GraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\ntotal_exec_micros: 4945\ntotal_requested_bytes: "
-      "14592\ntotal_parameters: 451\nchildren {\n  name: \"Conv2D\"\n  "
-      "exec_micros: 4292\n  requested_bytes: 9472\n  total_exec_micros: 4292\n "
-      " total_requested_bytes: 9472\n  devices: "
+      "30464\ntotal_parameters: 451\nchildren {\n  name: \"Conv2D\"\n  "
+      "exec_micros: 4292\n  requested_bytes: 18176\n  total_exec_micros: "
+      "4292\n  total_requested_bytes: 18176\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  float_ops: 5832\n  "
       "total_float_ops: 5832\n  input_shapes {\n    key: 0\n    value {\n      "
       "dim {\n        size: 2\n      }\n      dim {\n        size: 6\n      "
@@ -194,11 +206,11 @@ TEST_F(TFProfStatsTest, TestFloatOps) {
       "6\n      }\n    }\n  }\n  accelerator_exec_micros: 226\n  "
       "cpu_exec_micros: 4066\n  total_accelerator_exec_micros: 226\n  "
       "total_cpu_exec_micros: 4066\n  run_count: 1\n  total_run_count: 1\n  "
-      "total_definition_count: 1\n  peak_bytes: 5888\n  residual_bytes: 768\n  "
-      "output_bytes: 768\n  total_peak_bytes: 5888\n  total_residual_bytes: "
+      "total_definition_count: 1\n  peak_bytes: 14592\n  residual_bytes: 768\n "
+      " output_bytes: 768\n  total_peak_bytes: 14592\n  total_residual_bytes: "
       "768\n  total_output_bytes: 768\n}\nchildren {\n  name: \"Conv2D_1\"\n  "
-      "exec_micros: 597\n  requested_bytes: 5120\n  total_exec_micros: 597\n  "
-      "total_requested_bytes: 5120\n  devices: "
+      "exec_micros: 597\n  requested_bytes: 9728\n  total_exec_micros: 597\n  "
+      "total_requested_bytes: 9728\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  float_ops: 4608\n  "
       "total_float_ops: 4608\n  input_shapes {\n    key: 0\n    value {\n      "
       "dim {\n        size: 2\n      }\n      dim {\n        size: 3\n      "
@@ -209,12 +221,12 @@ TEST_F(TFProfStatsTest, TestFloatOps) {
       "12\n      }\n    }\n  }\n  accelerator_exec_micros: 178\n  "
       "cpu_exec_micros: 419\n  total_accelerator_exec_micros: 178\n  "
       "total_cpu_exec_micros: 419\n  run_count: 1\n  total_run_count: 1\n  "
-      "total_definition_count: 1\n  peak_bytes: 4096\n  residual_bytes: 512\n  "
-      "output_bytes: 512\n  total_peak_bytes: 4096\n  total_residual_bytes: "
+      "total_definition_count: 1\n  peak_bytes: 8704\n  residual_bytes: 512\n  "
+      "output_bytes: 512\n  total_peak_bytes: 8704\n  total_residual_bytes: "
       "512\n  total_output_bytes: 512\n}\ntotal_float_ops: "
       "10440\ntotal_accelerator_exec_micros: 404\ntotal_cpu_exec_micros: "
       "4541\ntotal_run_count: 6\ntotal_definition_count: 35\ntotal_peak_bytes: "
-      "9984\ntotal_residual_bytes: 1280\ntotal_output_bytes: 4864\n",
+      "25856\ntotal_residual_bytes: 3840\ntotal_output_bytes: 4864\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
 
@@ -231,9 +243,9 @@ TEST_F(TFProfStatsTest, TestAccountShownNameOnly) {
   GraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\ntotal_exec_micros: 597\ntotal_requested_bytes: "
-      "5120\nchildren {\n  name: \"Conv2D_1\"\n  exec_micros: 597\n  "
-      "requested_bytes: 5120\n  total_exec_micros: 597\n  "
-      "total_requested_bytes: 5120\n  devices: "
+      "9728\nchildren {\n  name: \"Conv2D_1\"\n  exec_micros: 597\n  "
+      "requested_bytes: 9728\n  total_exec_micros: 597\n  "
+      "total_requested_bytes: 9728\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  float_ops: 4608\n  "
       "total_float_ops: 4608\n  input_shapes {\n    key: 0\n    value {\n      "
       "dim {\n        size: 2\n      }\n      dim {\n        size: 3\n      "
@@ -244,12 +256,12 @@ TEST_F(TFProfStatsTest, TestAccountShownNameOnly) {
       "12\n      }\n    }\n  }\n  accelerator_exec_micros: 178\n  "
       "cpu_exec_micros: 419\n  total_accelerator_exec_micros: 178\n  "
       "total_cpu_exec_micros: 419\n  run_count: 1\n  total_run_count: 1\n  "
-      "total_definition_count: 1\n  peak_bytes: 4096\n  residual_bytes: 512\n  "
-      "output_bytes: 512\n  total_peak_bytes: 4096\n  total_residual_bytes: "
+      "total_definition_count: 1\n  peak_bytes: 8704\n  residual_bytes: 512\n  "
+      "output_bytes: 512\n  total_peak_bytes: 8704\n  total_residual_bytes: "
       "512\n  total_output_bytes: 512\n}\ntotal_float_ops: "
       "4608\ntotal_accelerator_exec_micros: 178\ntotal_cpu_exec_micros: "
       "419\ntotal_run_count: 1\ntotal_definition_count: 2\ntotal_peak_bytes: "
-      "4096\ntotal_residual_bytes: 512\ntotal_output_bytes: 512\n",
+      "8704\ntotal_residual_bytes: 512\ntotal_output_bytes: 512\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
 
@@ -265,8 +277,9 @@ TEST_F(TFProfStatsTest, TestShowTensorValue) {
   GraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\ntotal_exec_micros: 4945\ntotal_requested_bytes: "
-      "14592\ntotal_parameters: 451\nchildren {\n  name: \"DW\"\n  "
-      "exec_micros: 2\n  parameters: 162\n  total_exec_micros: 2\n  "
+      "30464\ntotal_parameters: 451\nchildren {\n  name: \"DW\"\n  "
+      "exec_micros: 2\n  requested_bytes: 1280\n  parameters: 162\n  "
+      "total_exec_micros: 2\n  total_requested_bytes: 1280\n  "
       "total_parameters: 162\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  tensor_value {\n    dtype: "
       "DT_FLOAT\n    value_double: -0.000534315\n    value_double: "
@@ -351,11 +364,13 @@ TEST_F(TFProfStatsTest, TestShowTensorValue) {
       "value_double: 0.000374641\n    value_double: -0.00149603\n    "
       "value_double: -0.000317367\n    value_double: -0.000417829\n  }\n  "
       "cpu_exec_micros: 2\n  total_cpu_exec_micros: 2\n  run_count: 1\n  "
-      "total_run_count: 1\n  total_definition_count: 10\n  output_bytes: "
-      "1280\n  total_output_bytes: 1280\n}\ntotal_float_ops: "
-      "10440\ntotal_accelerator_exec_micros: 404\ntotal_cpu_exec_micros: "
-      "4541\ntotal_run_count: 6\ntotal_definition_count: 35\ntotal_peak_bytes: "
-      "9984\ntotal_residual_bytes: 1280\ntotal_output_bytes: 4864\n",
+      "total_run_count: 1\n  total_definition_count: 10\n  peak_bytes: 1280\n  "
+      "residual_bytes: 1280\n  output_bytes: 1280\n  total_peak_bytes: 1280\n  "
+      "total_residual_bytes: 1280\n  total_output_bytes: "
+      "1280\n}\ntotal_float_ops: 10440\ntotal_accelerator_exec_micros: "
+      "404\ntotal_cpu_exec_micros: 4541\ntotal_run_count: "
+      "6\ntotal_definition_count: 35\ntotal_peak_bytes: "
+      "25856\ntotal_residual_bytes: 3840\ntotal_output_bytes: 4864\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
 }
diff --git a/tensorflow/core/profiler/internal/tfprof_tensor.h b/tensorflow/core/profiler/internal/tfprof_tensor.h
index 9f72e081c91957f6534334e56bf85a6b1d36a1ba..7a0885772001a1c4b587cff54739264bb5542925 100644
--- a/tensorflow/core/profiler/internal/tfprof_tensor.h
+++ b/tensorflow/core/profiler/internal/tfprof_tensor.h
@@ -19,8 +19,8 @@ limitations under the License.
 //    is not supported by TensorFlow CheckPointReader library, though it is
 //    supported in current code.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TENSOR_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TENSOR_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TENSOR_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TENSOR_H_
 
 #include <typeinfo>
 
@@ -173,4 +173,4 @@ class TFProfTensor {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TENSOR_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TENSOR_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_tensor_test.cc b/tensorflow/core/profiler/internal/tfprof_tensor_test.cc
index c68888e88fcedc174a9d28bb43408a9a95d50a6b..7fa79d23d853229b32ebd93ddb0640d9c75b323d 100644
--- a/tensorflow/core/profiler/internal/tfprof_tensor_test.cc
+++ b/tensorflow/core/profiler/internal/tfprof_tensor_test.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
-#include "tensorflow/core/profiler/internal/tfprof_options.h"
 #include "tensorflow/core/profiler/internal/tfprof_stats.h"
 #include "tensorflow/core/profiler/internal/tfprof_utils.h"
 #include "tensorflow/core/profiler/tfprof_log.pb.h"
+#include "tensorflow/core/profiler/tfprof_options.h"
 #include "tensorflow/core/profiler/tfprof_output.pb.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
diff --git a/tensorflow/core/profiler/internal/tfprof_timeline.cc b/tensorflow/core/profiler/internal/tfprof_timeline.cc
index bdb000747db72900d748c22140ca38e571db6691..b0dd8ce5e0f046325a309060b19467b7c1494568 100644
--- a/tensorflow/core/profiler/internal/tfprof_timeline.cc
+++ b/tensorflow/core/profiler/internal/tfprof_timeline.cc
@@ -153,10 +153,8 @@ void MemoryTracker::TrackNode(int64 step, const GraphNode* node) {
 
   std::map<int64, int64> allocs;
   for (const auto& alloc : node->node->allocations(step)) {
-    for (const auto& r : alloc.allocation_records()) {
-      allocs[r.alloc_micros()] += r.alloc_bytes();
-      dev.tracked_allocations[r.alloc_micros()] += r.alloc_bytes();
-    }
+    allocs[alloc.alloc_micros()] += alloc.alloc_bytes();
+    dev.tracked_allocations[alloc.alloc_micros()] += alloc.alloc_bytes();
   }
   dev.tracked_allocations[0] += node->node->accelerator_persistent_bytes();
   allocs[0] += node->node->accelerator_persistent_bytes();
@@ -167,9 +165,9 @@ void MemoryTracker::TrackNode(int64 step, const GraphNode* node) {
     last += it->second;
     aggregate_allocs[it->first] = last;
   }
-  int64 end_micros = node->node->lastest_schedule_end_micros(step);
-  if (end_micros > 0 && node->node->allocator_bytes_in_use(step) > 0) {
-    dev.allocations[end_micros] = node->node->allocator_bytes_in_use(step);
+  for (const auto& bytes_in_use : node->node->allocator_bytes_in_use(step)) {
+    if (bytes_in_use.first <= 0) continue;
+    dev.allocations[bytes_in_use.first] = bytes_in_use.second;
   }
 }
 
@@ -265,6 +263,10 @@ void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) {
     }
   }
   for (const auto& dev : mem_tracker_.devices()) {
+    if (IsPlacedOnCPU(dev.first)) {
+      // TODO(xpan): Maybe also support CPU allocator memory tracking.
+      continue;
+    }
     int64 pid = AllocatePID();
     chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first), pid);
     int64 pid2 = AllocatePID();
diff --git a/tensorflow/core/profiler/internal/tfprof_timeline.h b/tensorflow/core/profiler/internal/tfprof_timeline.h
index b8174cdecbd764ff784049e75d0a62c038c05978..baf3fb2bedb13e13b21940485ec439c19a97dd02 100644
--- a/tensorflow/core/profiler/internal/tfprof_timeline.h
+++ b/tensorflow/core/profiler/internal/tfprof_timeline.h
@@ -13,15 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TIMELINE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TIMELINE_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TIMELINE_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TIMELINE_H_
 
 #include "include/json/json.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/profiler/internal/tfprof_node_show.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -178,7 +178,6 @@ class Timeline {
   int64 step_;
   const string outfile_;
   int64 next_pid_ = 0;
-  int64 allocator_pid_ = -1;
   MemoryTracker mem_tracker_;
   ChromeTraceFormatter chrome_formatter_;
   std::map<string, int64> device_pids_;
@@ -191,4 +190,4 @@ class Timeline {
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TIMELINE_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_TIMELINE_H_
diff --git a/tensorflow/core/profiler/internal/tfprof_timeline_test.cc b/tensorflow/core/profiler/internal/tfprof_timeline_test.cc
index 91eac0cf7617eba54f6938fb893192d2a8fe2eaf..e8bd326aa256acf0cc5c2c87abdc8e9662300603 100644
--- a/tensorflow/core/profiler/internal/tfprof_timeline_test.cc
+++ b/tensorflow/core/profiler/internal/tfprof_timeline_test.cc
@@ -23,12 +23,12 @@ limitations under the License.
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
-#include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/profiler/internal/tfprof_constants.h"
-#include "tensorflow/core/profiler/internal/tfprof_options.h"
 #include "tensorflow/core/profiler/internal/tfprof_utils.h"
 #include "tensorflow/core/profiler/tfprof_log.pb.h"
+#include "tensorflow/core/profiler/tfprof_options.h"
 #include "tensorflow/core/profiler/tfprof_output.pb.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -71,7 +71,7 @@ TEST_F(TFProfTimelineTest, GraphView) {
 
   string dump_str;
   TF_CHECK_OK(ReadFileToString(Env::Default(), dump_file + "_0", &dump_str));
-  EXPECT_EQ(7932146665024565912ull, Hash64(dump_str));
+  EXPECT_EQ(16556121177519539380ull, Hash64(dump_str));
 }
 
 TEST_F(TFProfTimelineTest, ScopeView) {
diff --git a/tensorflow/core/profiler/internal/tfprof_utils.cc b/tensorflow/core/profiler/internal/tfprof_utils.cc
index 1ce59ad7552179a6bec387763960d7311958f594..7712ebd926f1df2d65b7f7d732b55846654ed218 100644
--- a/tensorflow/core/profiler/internal/tfprof_utils.cc
+++ b/tensorflow/core/profiler/internal/tfprof_utils.cc
@@ -297,21 +297,134 @@ void PrintHelp() {
       "See https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler/"
       "g3doc/command_line.md for command line tool tutorial.\n");
   printf(
-      "profiler --profile_path=<ProfileProto binary file> # required\nor:\n"
-      "profiler --graph_path=<GraphDef proto file>  # required\n"
-      "         --run_meta_patn=<RunMetadata proto file>  # optional\n"
-      "         --run_log_path=<OpLogProto proto file>  # optional\n\n");
+      "profiler --profile_path=<ProfileProto binary file> # required\n"
+      "\nOr:\n\n"
+      "profiler --graph_path=<GraphDef proto file>  "
+      "# Contains model graph info (no needed for eager execution)\n"
+      "         --run_meta_path=<RunMetadata proto file>  "
+      "# Contains runtime info. Optional.\n"
+      "         --run_log_path=<OpLogProto proto file>  "
+      "# Contains extra source code, flops, custom type info. Optional\n\n");
   printf(
-      "\nCommands:\n"
+      "\nTo skip interactive mode, append one of the following commands:\n"
       "  scope: Organize profiles based on name scopes.\n"
       "  graph: Organize profiles based on graph node input/output.\n"
       "  op: Organize profiles based on operation type.\n"
       "  code: Organize profiles based on python codes (need op_log_path).\n"
-      "  advise: Auto-profile and advise.\n"
+      "  advise: Auto-profile and advise. (experimental)\n"
       "  set: Set options that will be default for follow up commands.\n"
       "  help: Show helps.\n");
   fflush(stdout);
 }
 
+static const char* const kTotalMicrosHelp =
+    "total execution time: Sum of accelerator execution time and cpu execution "
+    "time.";
+static const char* const kAccMicrosHelp =
+    "accelerator execution time: Time spent executing on the accelerator. "
+    "This is normally measured by the actual hardware library.";
+static const char* const kCPUHelp =
+    "cpu execution time: The time from the start to the end of the operation. "
+    "It's the sum of actual cpu run time plus the time that it spends waiting "
+    "if part of computation is launched asynchronously.";
+static const char* const kBytes =
+    "requested bytes: The memory requested by the operation, accumulatively.";
+static const char* const kPeakBytes =
+    "peak bytes: The peak amount of memory that the operation is holding at "
+    "some point.";
+static const char* const kResidualBytes =
+    "residual bytes: The memory not de-allocated after the operation finishes.";
+static const char* const kOutputBytes =
+    "output bytes: The memory that is output from the operation (not "
+    "necessarilty allocated by the operation)";
+static const char* const kOccurrence =
+    "occurrence: The number of times it occurs";
+static const char* const kInputShapes =
+    "input shape: The shape of input tensors";
+static const char* const kDevice = "device: which device is placed on.";
+static const char* const kFloatOps =
+    "flops: Number of float operations. Note: Please read the implementation "
+    "for the math behind it.";
+static const char* const kParams =
+    "param: Number of parameters (in the Variable).";
+static const char* const kTensorValue = "tensor_value: Not supported now.";
+static const char* const kOpTypes =
+    "op_types: The attributes of the operation, includes the Kernel name "
+    "device placed on and user-defined strings.";
+
+static const char* const kScope =
+    "scope: The nodes in the model graph are organized by their names, which "
+    "is hierarchical like filesystem.";
+static const char* const kCode =
+    "code: When python trace is available, the nodes are python lines and "
+    "their are organized by the python call stack.";
+static const char* const kOp =
+    "op: The nodes are operation kernel type, such as MatMul, Conv2D. Graph "
+    "nodes belonging to the same type are aggregated together.";
+static const char* const kAdvise =
+    "advise: Automatically profile and discover issues. (Experimental)";
+static const char* const kSet =
+    "set: Set a value for an option for future use.";
+static const char* const kHelp = "help: Print helping messages.";
+
+string QueryDoc(const string& cmd, const Options& opts) {
+  string cmd_help = "";
+  if (cmd == kCmds[0]) {
+    cmd_help = kScope;
+  } else if (cmd == kCmds[1]) {
+    cmd_help = kScope;
+  } else if (cmd == kCmds[2]) {
+    cmd_help = kCode;
+  } else if (cmd == kCmds[3]) {
+    cmd_help = kOp;
+  } else if (cmd == kCmds[4]) {
+    cmd_help = kAdvise;
+  } else if (cmd == kCmds[5]) {
+    cmd_help = kSet;
+  } else if (cmd == kCmds[6]) {
+    cmd_help = kHelp;
+  } else {
+    cmd_help = "Unknown command: " + cmd;
+  }
+
+  std::vector<string> helps;
+  for (const string& s : opts.select) {
+    if (s == kShown[0]) {
+      helps.push_back(kBytes);
+    } else if (s == kShown[1]) {
+      helps.push_back(strings::StrCat(kTotalMicrosHelp, "\n", kCPUHelp, "\n",
+                                      kAccMicrosHelp));
+    } else if (s == kShown[2]) {
+      helps.push_back(kParams);
+    } else if (s == kShown[3]) {
+      helps.push_back(kFloatOps);
+    } else if (s == kShown[4]) {
+      helps.push_back(kTensorValue);
+    } else if (s == kShown[5]) {
+      helps.push_back(kDevice);
+    } else if (s == kShown[6]) {
+      helps.push_back(kOpTypes);
+    } else if (s == kShown[7]) {
+      helps.push_back(kOccurrence);
+    } else if (s == kShown[8]) {
+      helps.push_back(kInputShapes);
+    } else if (s == kShown[9]) {
+      helps.push_back(kAccMicrosHelp);
+    } else if (s == kShown[10]) {
+      helps.push_back(kCPUHelp);
+    } else if (s == kShown[11]) {
+      helps.push_back(kPeakBytes);
+    } else if (s == kShown[12]) {
+      helps.push_back(kResidualBytes);
+    } else if (s == kShown[13]) {
+      helps.push_back(kOutputBytes);
+    } else {
+      helps.push_back("Unknown select: " + s);
+    }
+  }
+  return strings::StrCat("\nDoc:\n", cmd_help, "\n",
+                         str_util::Join(helps, "\n"), "\n\n");
+}
+
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/internal/tfprof_utils.h b/tensorflow/core/profiler/internal/tfprof_utils.h
index 3407517ce01bbccd5fd82b03f9251fef5015c461..d4f80afce0c3145bed18ab677f7537a41dea778c 100644
--- a/tensorflow/core/profiler/internal/tfprof_utils.h
+++ b/tensorflow/core/profiler/internal/tfprof_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_UTILS_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_UTILS_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_UTILS_H_
 
 #include <string>
 #include <vector>
@@ -22,8 +22,8 @@ limitations under the License.
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/profiler/tfprof_options.h"
 #include "tensorflow/core/protobuf/config.pb.h"
-#include "tensorflow/core/profiler/internal/tfprof_options.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -66,7 +66,10 @@ Status ReadProtoFile(Env* env, const string& fname, T* proto,
 
 void PrintHelp();
 
+// Generate helper message based on the command and options.
+string QueryDoc(const string& cmd, const Options& opts);
+
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_UTILS_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_UTILS_H_
diff --git a/tensorflow/core/profiler/profiler.cc b/tensorflow/core/profiler/profiler.cc
index b280242df18272b63c7b6a683e70db6c2e315c4d..808e3c853bec0efb9523ee413f3d5272a833358d 100644
--- a/tensorflow/core/profiler/profiler.cc
+++ b/tensorflow/core/profiler/profiler.cc
@@ -31,13 +31,13 @@ limitations under the License.
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/protobuf.h"
-#include "tensorflow/core/protobuf/config.pb.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/core/profiler/internal/advisor/tfprof_advisor.h"
-#include "tensorflow/core/profiler/internal/tfprof_options.h"
 #include "tensorflow/core/profiler/internal/tfprof_stats.h"
 #include "tensorflow/core/profiler/internal/tfprof_utils.h"
 #include "tensorflow/core/profiler/tfprof_log.pb.h"
+#include "tensorflow/core/profiler/tfprof_options.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/util/command_line_flags.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -140,10 +140,12 @@ int Run(int argc, char** argv) {
   }
   port::InitMain(argv[0], &argc, &argv);
 
-  if (!FLAGS_profile_path.empty() && !FLAGS_graph_path.empty()) {
+  if (!FLAGS_profile_path.empty() &&
+      (!FLAGS_graph_path.empty() || !FLAGS_run_meta_path.empty())) {
     fprintf(stderr,
-            "both --graph_path and --profile_path are set. "
-            "Ignore graph_path\n");
+            "--profile_path is set, do not set --graph_path or "
+            "--run_meta_path\n");
+    return 1;
   }
 
   std::vector<string> account_type_regexes =
@@ -165,7 +167,8 @@ int Run(int argc, char** argv) {
   CHECK(s.ok()) << s.ToString();
 
   string cmd = "";
-  if (argc == 1 && FLAGS_graph_path.empty() && FLAGS_profile_path.empty()) {
+  if (argc == 1 && FLAGS_graph_path.empty() && FLAGS_profile_path.empty() &&
+      FLAGS_run_meta_path.empty()) {
     PrintHelp();
     return 0;
   } else if (argc > 1) {
@@ -202,8 +205,14 @@ int Run(int argc, char** argv) {
         "Try to use a single --profile_path instead of "
         "graph_path,op_log_path,run_meta_path\n");
     std::unique_ptr<GraphDef> graph(new GraphDef());
-    TF_CHECK_OK(
-        ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false));
+    if (!FLAGS_graph_path.empty()) {
+      s = ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false);
+      if (!s.ok()) {
+        fprintf(stderr, "Failed to read graph_path: %s\n",
+                s.ToString().c_str());
+        return 1;
+      }
+    }
 
     std::unique_ptr<OpLogProto> op_log(new OpLogProto());
     if (!FLAGS_op_log_path.empty()) {
diff --git a/tensorflow/core/profiler/tfprof_log.proto b/tensorflow/core/profiler/tfprof_log.proto
index f92301133a3102a2e4233326dd811169e1ecd105..90b9e293ec7851ef58be195db2b76175bf5bd74a 100644
--- a/tensorflow/core/profiler/tfprof_log.proto
+++ b/tensorflow/core/profiler/tfprof_log.proto
@@ -54,6 +54,9 @@ message ProfileProto {
   map<int64, ProfileNode> nodes = 1;
   // Whether or not has code traces.
   bool has_trace = 2;
+  // Whether or not the TF device tracer fails to return accelerator
+  // information (which could lead to 0 accelerator execution time).
+  bool miss_accelerator_stream = 5;
   // Traced steps.
   repeated int64 steps = 3;
 
@@ -90,10 +93,6 @@ message ProfileNode {
   map<int64, ExecProfile> execs = 12;
 }
 
-message Allocation {
-  repeated AllocationRecord allocation_records = 1;
-}
-
 message ExecProfile {
   // Can be larger than 1 if run multiple times in loop.
   int64 run_count = 1;
@@ -110,34 +109,42 @@ message ExecProfile {
   // For cpu, vector size can be larger than 1 if in tf.while_loop.
   map<string, ExecTime> cpu_execs = 5;
 
-  map<int32, Memory> output_memory = 17;
+  // Each entry to memory information of a scheduling of the node.
+  // Normally, there will be multiple entries in while_loop.
+  repeated ExecMemory memory_execs = 7;
+  // The allocation and deallocation times and sizes throughout execution.
+  repeated AllocationRecord allocations = 11;
+  // The devices related to this execution.
+  repeated string devices = 6;
+}
 
-  repeated Allocation allocations = 18;
+message ExecTime {
+  repeated Tuple times = 1;
+}
 
-  repeated string devices = 6;
+message ExecMemory {
+  // This is the timestamp when the memory information was tracked.
+  int64 memory_micros = 1;
+  // NOTE: Please don't depend on the following 4 fields yet. Due to
+  // TensorFlow internal tracing issues, the numbers can be quite wrong.
+  // TODO(xpan): Fix the TensorFlow internal tracing.
+  int64 host_temp_bytes = 2;
+  int64 host_persistent_bytes = 3;
+  int64 accelerator_temp_bytes = 4;
+  int64 accelerator_persistent_bytes = 5;
 
   // Total bytes requested by the op.
-  int64 requested_bytes = 7;
+  int64 requested_bytes = 6;
   // Total bytes requested by the op and released before op end.
-  int64 peak_bytes = 8;
+  int64 peak_bytes = 7;
   // Total bytes requested by the op and not released after op end.
-  int64 residual_bytes = 9;
+  int64 residual_bytes = 8;
   // Total bytes output by the op (not necessarily requested by the op).
-  int64 output_bytes = 10;
-  // Total temporary bytes allocated and released by the op.
-  int64 host_temp_bytes = 11;
-  // Total persistent bytes (e.g. variable) allocated by the op.
-  int64 host_persistent_bytes = 12;
-  int64 accelerator_temp_bytes = 13;
-  int64 accelerator_persistent_bytes = 14;
+  int64 output_bytes = 9;
   // The total number of bytes currently allocated by the allocator if >0.
-  int64 allocator_bytes_in_use = 15;
-
-  bool memory_intialized = 16;
-}
-
-message ExecTime {
-  repeated Tuple times = 1;
+  int64 allocator_bytes_in_use = 10;
+  // The memory of each output of the operation.
+  map<int32, Memory> output_memory = 11;
 }
 
 message Tuple {
diff --git a/tensorflow/core/profiler/internal/tfprof_options.cc b/tensorflow/core/profiler/tfprof_options.cc
similarity index 99%
rename from tensorflow/core/profiler/internal/tfprof_options.cc
rename to tensorflow/core/profiler/tfprof_options.cc
index 663427254182ba57bfba75efa5be82464e5c44f8..9e5ef0a0a31600e12e76cb8f5f3e5a1c6f62a3d5 100644
--- a/tensorflow/core/profiler/internal/tfprof_options.cc
+++ b/tensorflow/core/profiler/tfprof_options.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/profiler/internal/tfprof_options.h"
+#include "tensorflow/core/profiler/tfprof_options.h"
 
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/str_util.h"
diff --git a/tensorflow/core/profiler/internal/tfprof_options.h b/tensorflow/core/profiler/tfprof_options.h
similarity index 96%
rename from tensorflow/core/profiler/internal/tfprof_options.h
rename to tensorflow/core/profiler/tfprof_options.h
index 463f5b3c3a69b3105141faea0c669a83c181bd93..d61deb72ac45517587739722457299acffa18a4c 100644
--- a/tensorflow/core/profiler/internal/tfprof_options.h
+++ b/tensorflow/core/profiler/tfprof_options.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OPTIONS_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OPTIONS_H_
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OPTIONS_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OPTIONS_H_
 
 #include <set>
 #include <string>
@@ -183,4 +183,4 @@ tensorflow::Status ParseOutput(const string& output_opt, string* output_type,
 }  // namespace tfprof
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OPTIONS_H_
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TFPROF_OPTIONS_H_
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 1916316245063bd6e8903573a961295f3b79bcf6..3606c5f127ce1f533d018e645b0a48c20e79cd8d 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -42,18 +42,24 @@ message GPUOptions {
   // A comma-separated list of GPU ids that determines the 'visible'
   // to 'virtual' mapping of GPU devices.  For example, if TensorFlow
   // can see 8 GPU devices in the process, and one wanted to map
-  // visible GPU devices 5 and 3 as "/device:GPU:0", and "/device:GPU:1", then one
-  // would specify this field as "5,3".  This field is similar in
+  // visible GPU devices 5 and 3 as "/device:GPU:0", and "/device:GPU:1",
+  // then one would specify this field as "5,3".  This field is similar in
   // spirit to the CUDA_VISIBLE_DEVICES environment variable, except
   // it applies to the visible GPU devices in the process.
   //
-  // NOTE: The GPU driver provides the process with the visible GPUs
-  // in an order which is not guaranteed to have any correlation to
-  // the *physical* GPU id in the machine.  This field is used for
-  // remapping "visible" to "virtual", which means this operates only
-  // after the process starts.  Users are required to use vendor
-  // specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the
-  // physical to visible device mapping prior to invoking TensorFlow.
+  // NOTE:
+  // 1. The GPU driver provides the process with the visible GPUs
+  //    in an order which is not guaranteed to have any correlation to
+  //    the *physical* GPU id in the machine.  This field is used for
+  //    remapping "visible" to "virtual", which means this operates only
+  //    after the process starts.  Users are required to use vendor
+  //    specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the
+  //    physical to visible device mapping prior to invoking TensorFlow.
+  // 2. In the code, the ids in this list are also called "CUDA GPU id"s,
+  //    and the 'virtual' ids of GPU devices (i.e. the ids in the device
+  //    name "/device:GPU:<id>") are also called "TF GPU id"s. Please
+  //    refer to third_party/tensorflow/core/common_runtime/gpu/gpu_id.h
+  //    for more information.
   string visible_device_list = 5;
 
   // In the event polling loop sleep this many microseconds between
@@ -77,6 +83,52 @@ message GPUOptions {
   // memory is unpageable, having too much pinned memory might negatively impact
   // the overall host system performance.
   bool force_gpu_compatible = 8;
+
+  // Everything inside Experimental is subject to change and is not subject
+  // to API stability guarantees in
+  // https://www.tensorflow.org/programmers_guide/version_compat.
+  message Experimental {
+    // Configuration for breaking down a visible GPU into multiple "virtual"
+    // devices.
+    message VirtualDevices {
+      // Per "virtual" device memory limit, in MB. The number of elements in
+      // the list is the number of virtual devices to create on the
+      // corresponding visible GPU (see "virtual_devices" below).
+      // If empty, it will create single virtual device taking all available
+      // memory from the device.
+      //
+      // For the concept of "visible" and "virtual" GPU, see the comments for
+      // "visible_device_list" above for more information.
+      repeated float memory_limit_mb = 1;
+    }
+
+    // The multi virtual device settings. If empty (not set), it will create
+    // single virtual device on each visible GPU, according to the settings
+    // in "visible_device_list" above. Otherwise, the number of elements in the
+    // list must be the same as the number of visible GPUs (after
+    // "visible_device_list" filtering if it is set), and the string represented
+    // device names (e.g. /device:GPU:<id>) will refer to the virtual
+    // devices and have the <id> field assigned sequentially starting from 0,
+    // according to the order they appear in this list and the "memory_limit"
+    // list inside each element. For example,
+    //   visible_device_list = "1,0"
+    //   virtual_devices { memory_limit: 1GB memory_limit: 2GB }
+    //   virtual_devices {}
+    // will create three virtual devices as:
+    //   /device:GPU:0 -> visible GPU 1 with 1GB memory
+    //   /device:GPU:1 -> visible GPU 1 with 2GB memory
+    //   /device:GPU:2 -> visible GPU 0 with all available memory
+    //
+    // NOTE:
+    // 1. It's invalid to set both this and "per_process_gpu_memory_fraction"
+    //    at the same time.
+    // 2. Currently this setting is per-process, not per-session. Using
+    //    different settings in different sessions within same process will
+    //    result in undefined behavior.
+    repeated VirtualDevices virtual_devices = 1;
+  }
+
+  Experimental experimental = 9;
 };
 
 // Options passed to the graph optimizer
@@ -335,7 +387,7 @@ message RunOptions {
   // EXPERIMENTAL.  Options used to initialize DebuggerState, if enabled.
   DebugOptions debug_options = 6;
 
-  // When enabled, causes tensor alllocation information to be included in
+  // When enabled, causes tensor allocation information to be included in
   // the error message when the Run() call fails because the allocator ran
   // out of memory (OOM).
   //
diff --git a/tensorflow/core/protobuf/control_flow.proto b/tensorflow/core/protobuf/control_flow.proto
index 48f503225447c26f8959ba379656361292052b44..2c9476a08ad946e7f019475055397fcd6cfbbc5a 100644
--- a/tensorflow/core/protobuf/control_flow.proto
+++ b/tensorflow/core/protobuf/control_flow.proto
@@ -66,4 +66,9 @@ message WhileContextDef {
 
   // Values and external values in control flow context.
   ValuesDef values_def = 9;
+
+  // Optional name of the maximum_iterations tensor.
+  string maximum_iterations_name = 11;
+
+  // Next available id: 12.
 }
diff --git a/tensorflow/core/protobuf/critical_section.proto b/tensorflow/core/protobuf/critical_section.proto
new file mode 100644
index 0000000000000000000000000000000000000000..0b3f531e6d9f59f05dfc0b7b36beda334f9f5101
--- /dev/null
+++ b/tensorflow/core/protobuf/critical_section.proto
@@ -0,0 +1,22 @@
+syntax = "proto3";
+
+package tensorflow;
+option cc_enable_arenas = true;
+option java_outer_classname = "CriticalSectionProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+
+// Protocol buffer representing a CriticalSection.
+message CriticalSectionDef {
+  // Name of the critical section handle.
+  string critical_section_name = 1;
+}
+
+// Protocol buffer representing a CriticalSection execution.
+message CriticalSectionExecutionDef {
+  // Name of the critical section handle.
+  string execute_in_critical_section_name = 1;
+  // Whether this operation requires exclusive access to its resources,
+  // (i.e., no other CriticalSections may request the same resources).
+  bool exclusive_resource_access = 2;
+}
diff --git a/tensorflow/core/protobuf/debug.proto b/tensorflow/core/protobuf/debug.proto
index 136c627e25f33cb9b4ff2de7725406c0f800a5b1..56983f3b7d464f88cebe608ac15882f04f27b003 100644
--- a/tensorflow/core/protobuf/debug.proto
+++ b/tensorflow/core/protobuf/debug.proto
@@ -60,3 +60,25 @@ message DebugOptions {
   // step count.
   int64 global_step = 10;
 }
+
+message DebuggedSourceFile {
+  // The host name on which a source code file is located.
+  string host = 1;
+
+  // Path to the source code file.
+  string file_path = 2;
+
+  // The timestamp at which the source code file is last modified.
+  int64 last_modified = 3;
+
+  // Byte size of the file.
+  int64 bytes = 4;
+
+  // Line-by-line content of the source code file.
+  repeated string lines = 5;
+}
+
+message DebuggedSourceFiles {
+  // A collection of source code files.
+  repeated DebuggedSourceFile source_files = 1;
+}
diff --git a/tensorflow/core/protobuf/master.proto b/tensorflow/core/protobuf/master.proto
index 6b25a86ba46b9285100f7d91ebade711f0425874..0437cb1b83e12d83bf3b8713e2940a6d45173fb5 100644
--- a/tensorflow/core/protobuf/master.proto
+++ b/tensorflow/core/protobuf/master.proto
@@ -23,6 +23,7 @@ option java_package = "org.tensorflow.distruntime";
 
 import "tensorflow/core/framework/device_attributes.proto";
 import "tensorflow/core/framework/graph.proto";
+import "tensorflow/core/lib/core/error_codes.proto";
 import "tensorflow/core/protobuf/config.proto";
 import "tensorflow/core/protobuf/named_tensor.proto";
 
@@ -129,6 +130,13 @@ message RunStepRequest {
   // Partial run handle (optional). If specified, this will be a partial run
   // execution, run up to the specified fetches.
   string partial_run_handle = 6;
+
+  // If true then some errors, e.g., execution errors that have long
+  // error messages, may return an OK RunStepResponse with the actual
+  // error saved in the status_code/status_error_message fields of the
+  // response body. This is a workaround since the RPC subsystem may
+  // truncate long metadata messages.
+  bool store_errors_in_response_body = 7;
 }
 
 message RunStepResponse {
@@ -138,6 +146,13 @@ message RunStepResponse {
 
   // Returned metadata if requested in the options.
   RunMetadata metadata = 2;
+
+  // If store_errors_in_response_body is true in the request, then
+  // optionally the server may return an OK status for the RPC and
+  // fill the true status into the fields below, to allow for messages
+  // that are too long to fit in metadata.
+  error.Code status_code = 3;
+  string status_error_message = 4;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/tensorflow/core/protobuf/meta_graph.proto b/tensorflow/core/protobuf/meta_graph.proto
index 47ec2aa1efeb11135b95b3b2c4342b77f0a9866b..fd86c0da12b26cf5ed8a7846d159dd6feb4ddc4e 100644
--- a/tensorflow/core/protobuf/meta_graph.proto
+++ b/tensorflow/core/protobuf/meta_graph.proto
@@ -61,6 +61,10 @@ message MetaGraphDef {
     // graph. This will be populated by the framework, which will overwrite any
     // user supplied value.
     string tensorflow_git_version = 6;
+
+    // A flag to denote whether default-valued attrs have been stripped from
+    // the nodes in this graph_def.
+    bool stripped_default_attrs = 7;
   }
   MetaInfoDef meta_info_def = 1;
 
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index 3b5d1563a2695c4b33d596f0493e38ff044b3c38..0e9e202bc9a2d2368772c7fede9eb877d9d99023 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -29,19 +29,21 @@ message RewriterConfig {
     AGGRESSIVE = 3;
   }
 
-  // Optimize tensor layouts
+  // Optimize tensor layouts (default is ON)
   Toggle layout_optimizer = 1;
   // Fold constants (default is ON)
   Toggle constant_folding = 3;
   // Arithmetic optimizations (default is ON)
   Toggle arithmetic_optimization = 7;
-  // Control dependency optimizations (default is OFF).
+  // Control dependency optimizations (default is ON).
   Toggle dependency_optimization = 8;
+  // Loop optimizations (default is OFF).
+  Toggle loop_optimization = 9;
   // If true, don't remove unnecessary ops from the graph
   bool disable_model_pruning = 2;
 
   enum MemOptType {
-    // The default setting (currently disabled)
+    // The default setting (SCHEDULING_HEURISTICS only)
     DEFAULT_MEM_OPT = 0;
     // Disabled in the meta-optimizer.
     NO_MEM_OPT = 1;
@@ -53,6 +55,7 @@ message RewriterConfig {
     // selected automatically.
     SWAPPING_HEURISTICS = 4;
     RECOMPUTATION_HEURISTICS = 5;
+    SCHEDULING_HEURISTICS = 6;
     // Use any combination of swapping and recomputation heuristics.
     HEURISTICS = 3;
   }
diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto
index 385e2dd163b8c668357ea9fabd1dee7d9a675729..3e7289bd919015dcb6712ee89ccee3605dc6d907 100644
--- a/tensorflow/core/protobuf/worker.proto
+++ b/tensorflow/core/protobuf/worker.proto
@@ -27,6 +27,7 @@ import "tensorflow/core/framework/step_stats.proto";
 import "tensorflow/core/framework/device_attributes.proto";
 import "tensorflow/core/framework/graph.proto";
 import "tensorflow/core/framework/tensor.proto";
+import "tensorflow/core/lib/core/error_codes.proto";
 import "tensorflow/core/protobuf/config.proto";
 import "tensorflow/core/protobuf/debug.proto";
 import "tensorflow/core/protobuf/named_tensor.proto";
@@ -226,7 +227,14 @@ message RunGraphRequest {
   // True if this is the last partial run request in a sequence of requests.
   bool is_last_partial_run = 7;
 
-  // Next: 9
+  // If true then some errors, e.g., execution errors that have long
+  // error messages, may return an OK RunGraphResponse with the actual
+  // error saved in the status_code/status_error_message fields of the
+  // response body. This is a workaround since the RPC subsystem may
+  // truncate long metadata messages.
+  bool store_errors_in_response_body = 9;
+
+  // Next: 10
 }
 
 message RunGraphResponse {
@@ -240,6 +248,13 @@ message RunGraphResponse {
   StepStats step_stats = 2;
   CostGraphDef cost_graph = 3;
   repeated GraphDef partition_graph = 4;
+
+  // If store_errors_in_response_body is true in the request, then
+  // optionally the server may return an OK status for the RPC and
+  // fill the true status into the fields below, to allow for messages
+  // that are too long to fit in metadata.
+  error.Code status_code = 5;
+  string status_error_message = 6;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -277,7 +292,10 @@ message RecvTensorRequest {
   // into a RunGraph call on the same WorkerService.
   int64 step_id = 1;
 
-  // A key that identifies the tensor to be received.
+  // A key identifying the channel to receive tensors from. A RecvTensor request
+  // retrieves one tensor from the channel, but multiple tensors can be sent and
+  // received over the same channel with multiple RecvTensor requests. See
+  // rendezvous.h for details.
   string rendezvous_key = 2;
 
   // If true, use an out-of-band DMA mechanism to transfer the
@@ -292,6 +310,16 @@ message RecvTensorRequest {
 
   // Optional information needed by the RPC subsystem.
   google.protobuf.Any transport_options = 6;
+
+  // Unique identifier for this request. Every RecvTensorRequest must have a
+  // unique request_id, and retried RecvTensorRequests must have the same
+  // request_id. If request_id is zero, retry detection is disabled.
+  //
+  // Retried RecvTensorRequests are problematic because a RecvTensor with no
+  // corresponding sender will wait forever, and the tensor may have been
+  // delivered to a previous retry. Workers use request_ids to reject retried
+  // RecvTensor requests instead of waiting forever.
+  int64 request_id = 7;
 }
 
 message RecvTensorResponse {
diff --git a/tensorflow/core/public/session.h b/tensorflow/core/public/session.h
index bca384e59fe9412a77398a81f0c8abbfd512e51a..75ad50f6f2d59a8f4b8282d8e7b395e2323d62e1 100644
--- a/tensorflow/core/public/session.h
+++ b/tensorflow/core/public/session.h
@@ -186,7 +186,7 @@ class Session {
   /// the `SessionOptions::target` field).
   virtual Status Close() = 0;
 
-  // NOTE(ashankar): As of July 2017, this method was added to faciliate some
+  // NOTE(ashankar): As of July 2017, this method was added to facilitate some
   // experimentation. Reconsider/re-evaluate after September 2017.
   //
   // Sets `*output` to the `DeviceMgr` that owns accessible devices in the
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index ec077c42837e517f94955956ed75430b7a3d0a30..50bfa9126789033c617e22f25dbb76273fccfc60 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -19,12 +19,12 @@ limitations under the License.
 // TensorFlow uses semantic versioning, see http://semver.org/.
 
 #define TF_MAJOR_VERSION 1
-#define TF_MINOR_VERSION 4
+#define TF_MINOR_VERSION 6
 #define TF_PATCH_VERSION 0
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX ""
+#define TF_VERSION_SUFFIX "-rc0"
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
@@ -91,10 +91,15 @@ limitations under the License.
 // 24. Deprecate lookup ops (v1) ops in favor of v2 (30may2017)
 // 25. Deprecate stack (v1) ops in favor of v2 (2017/6/15).
 // 25. Deprecate RandomPoisson (v1) ops in favor of v2 (2017/10/25).
+// 26. Add a bool 'stripped_default_attrs' to MetaInfoDef indicating
+//     whether default-valued attrs have been stripped from the nodes in the
+//     GraphDef. (7dec2017)
+// 27. Deprecate TensorArray ops v2 in favor of v3 and deprecated io_ops
+//     deprecated in favor of V2 ops. (2018/01/23)
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 24
+#define TF_GRAPH_DEF_VERSION 26
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //
@@ -119,5 +124,7 @@ extern const char* tf_compiler_version();
 extern const char* tf_git_version();
 // Value of the _GLIBCXX_USE_CXX11_ABI flag, or 0 if it's not set.
 extern const int tf_cxx11_abi_flag();
+// Returns 1 if build is monolithic, or 0 otherwise.
+extern const int tf_monolithic_build();
 
 #endif  // TENSORFLOW_CORE_PUBLIC_VERSION_H_
diff --git a/tensorflow/core/user_ops/fact.cc b/tensorflow/core/user_ops/fact.cc
index c512275506436d54829b355dbbd9711115d364b3..3a4fc8115a7f91badfeda369a599b3dba3057c63 100644
--- a/tensorflow/core/user_ops/fact.cc
+++ b/tensorflow/core/user_ops/fact.cc
@@ -18,27 +18,23 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
-using namespace tensorflow;
+REGISTER_OP("Fact").Output("fact: string");
 
-REGISTER_OP("Fact")
-    .Output("fact: string")
-    .Doc(R"doc(
-Output a fact about factorials.
-)doc");
-
-class FactOp : public OpKernel {
+class FactOp : public tensorflow::OpKernel {
  public:
-  explicit FactOp(OpKernelConstruction* context) : OpKernel(context) {}
+  explicit FactOp(tensorflow::OpKernelConstruction* context)
+      : OpKernel(context) {}
 
-  void Compute(OpKernelContext* context) override {
+  void Compute(tensorflow::OpKernelContext* context) override {
     // Output a scalar string.
-    Tensor* output_tensor = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, TensorShape(), &output_tensor));
+    tensorflow::Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                0, tensorflow::TensorShape(), &output_tensor));
+    using tensorflow::string;
     auto output = output_tensor->template scalar<string>();
 
     output() = "0! == 1";
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("Fact").Device(DEVICE_CPU), FactOp);
+REGISTER_KERNEL_BUILDER(Name("Fact").Device(tensorflow::DEVICE_CPU), FactOp);
diff --git a/tensorflow/core/util/bcast.cc b/tensorflow/core/util/bcast.cc
index 1eab7e3d024c181f260500686b9127dd76dbe206..3a5f1f83af8d2d2324f3139568aa69f204cf1248 100644
--- a/tensorflow/core/util/bcast.cc
+++ b/tensorflow/core/util/bcast.cc
@@ -69,9 +69,9 @@ BCast::BCast(const Vec& sx, const Vec& sy, const bool fewer_dims_optimization) {
       State curr = UNKNOWN;
       const int64 x_i = x[i];  // i-th dimension of x.
       const int64 y_i = y[i];  // i-th dimension of y.
-      int64 o_i;   // i-th dimension of the output.
-      int64 bx_i;  // i-th broadcast for x.
-      int64 by_i;  // i-th broadcast for y.
+      int64 o_i;               // i-th dimension of the output.
+      int64 bx_i;              // i-th broadcast for x.
+      int64 by_i;              // i-th broadcast for y.
       // Invariant:
       //   o_i = x_i * bx_i = y_i * by_i
       if (x_i == y_i) {
diff --git a/tensorflow/core/util/command_line_flags.h b/tensorflow/core/util/command_line_flags.h
index 121c7063c9ebf6d447d0077f612386e316e05624..928ae8a4e9405f30ec994110e9032c6c19dd1b7f 100644
--- a/tensorflow/core/util/command_line_flags.h
+++ b/tensorflow/core/util/command_line_flags.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_UTIL_COMMAND_LINE_FLAGS_H
-#define THIRD_PARTY_TENSORFLOW_CORE_UTIL_COMMAND_LINE_FLAGS_H
+#ifndef TENSORFLOW_CORE_UTIL_COMMAND_LINE_FLAGS_H
+#define TENSORFLOW_CORE_UTIL_COMMAND_LINE_FLAGS_H
 
 #include <functional>
 #include <string>
@@ -134,4 +134,4 @@ class Flags {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_UTIL_COMMAND_LINE_FLAGS_H
+#endif  // TENSORFLOW_CORE_UTIL_COMMAND_LINE_FLAGS_H
diff --git a/tensorflow/core/util/ctc/ctc_beam_entry.h b/tensorflow/core/util/ctc/ctc_beam_entry.h
index d30ab3f4dadb28a0c63632357bc7d631e2bdc81f..53087821d7b4bc0f98e77be9274cbdb4c675c10f 100644
--- a/tensorflow/core/util/ctc/ctc_beam_entry.h
+++ b/tensorflow/core/util/ctc/ctc_beam_entry.h
@@ -52,26 +52,25 @@ struct BeamProbability {
   float label;
 };
 
+template <class CTCBeamState>
+class BeamRoot;
+
 template <class CTCBeamState = EmptyBeamState>
 struct BeamEntry {
-  // Default constructor does not create a vector of children.
-  BeamEntry() : parent(nullptr), label(-1) {}
-  // Constructor giving parent, label, and number of children does
-  // create a vector of children.  The object pointed to by p
-  // cannot be copied and should not be moved, otherwise parent will
-  // become invalid.
-  BeamEntry(BeamEntry* p, int l) : parent(p), label(l) {}
+  // BeamRoot<CTCBeamState>::AddEntry() serves as the factory method.
+  friend BeamEntry<CTCBeamState>* BeamRoot<CTCBeamState>::AddEntry(
+      BeamEntry<CTCBeamState>* p, int l);
   inline bool Active() const { return newp.total != kLogZero; }
   // Return the child at the given index, or construct a new one in-place if
   // none was found.
   BeamEntry& GetChild(int ind) {
     auto entry = children.emplace(ind, nullptr);
     auto& child_entry = entry.first->second;
-    // If this is a new child, populate the uniqe_ptr.
+    // If this is a new child, populate the BeamEntry<CTCBeamState>*.
     if (entry.second) {
-      child_entry.reset(new BeamEntry(this, ind));
+      child_entry = beam_root->AddEntry(this, ind);
     }
-    return *(child_entry.get());
+    return *child_entry;
   }
   std::vector<int> LabelSeq(bool merge_repeated) const {
     std::vector<int> labels;
@@ -90,15 +89,45 @@ struct BeamEntry {
 
   BeamEntry<CTCBeamState>* parent;
   int label;
-  gtl::FlatMap<int, std::unique_ptr<BeamEntry<CTCBeamState>>> children;
+  // All instances of child BeamEntry are owned by *beam_root.
+  gtl::FlatMap<int, BeamEntry<CTCBeamState>*> children;
   BeamProbability oldp;
   BeamProbability newp;
   CTCBeamState state;
 
  private:
+  // Constructor giving parent, label, and the beam_root.
+  // The object pointed to by p cannot be copied and should not be moved,
+  // otherwise parent will become invalid.
+  // This private constructor is only called through the factory method
+  // BeamRoot<CTCBeamState>::AddEntry().
+  BeamEntry(BeamEntry* p, int l, BeamRoot<CTCBeamState>* beam_root)
+      : parent(p), label(l), beam_root(beam_root) {}
+  BeamRoot<CTCBeamState>* beam_root;
   TF_DISALLOW_COPY_AND_ASSIGN(BeamEntry);
 };
 
+// This class owns all instances of BeamEntry.  This is used to avoid recursive
+// destructor call during destruction.
+template <class CTCBeamState = EmptyBeamState>
+class BeamRoot {
+ public:
+  BeamRoot(BeamEntry<CTCBeamState>* p, int l) { root_entry_ = AddEntry(p, l); }
+  BeamRoot(const BeamRoot&) = delete;
+  BeamRoot& operator=(const BeamRoot&) = delete;
+
+  BeamEntry<CTCBeamState>* AddEntry(BeamEntry<CTCBeamState>* p, int l) {
+    auto* new_entry = new BeamEntry<CTCBeamState>(p, l, this);
+    beam_entries_.emplace_back(new_entry);
+    return new_entry;
+  }
+  BeamEntry<CTCBeamState>* RootEntry() const { return root_entry_; }
+
+ private:
+  BeamEntry<CTCBeamState>* root_entry_ = nullptr;
+  std::vector<std::unique_ptr<BeamEntry<CTCBeamState>>> beam_entries_;
+};
+
 // BeamComparer is the default beam comparer provided in CTCBeamSearch.
 template <class CTCBeamState = EmptyBeamState>
 class BeamComparer {
diff --git a/tensorflow/core/util/ctc/ctc_beam_search.h b/tensorflow/core/util/ctc/ctc_beam_search.h
index 372f25a1434036ef6022841665f6f942af046dc1..709c65fc9659e5b76ffa42f6e3a2030e8cdc9676 100644
--- a/tensorflow/core/util/ctc/ctc_beam_search.h
+++ b/tensorflow/core/util/ctc/ctc_beam_search.h
@@ -16,11 +16,15 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_UTIL_CTC_CTC_BEAM_SEARCH_H_
 #define TENSORFLOW_CORE_UTIL_CTC_CTC_BEAM_SEARCH_H_
 
+#include <algorithm>
 #include <cmath>
+#include <limits>
 #include <memory>
+#include <vector>
 
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/top_n.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
@@ -69,6 +73,7 @@ class CTCBeamSearchDecoder : public CTCDecoder {
   //   P(l=abc? @ t=3) = P(a @ 0)*P(b @ 1)*P(c @ 2)*P(? @ 3)
   // but we calculate it recursively for speed purposes.
   typedef ctc_beam_search::BeamEntry<CTCBeamState> BeamEntry;
+  typedef ctc_beam_search::BeamRoot<CTCBeamState> BeamRoot;
   typedef ctc_beam_search::BeamProbability BeamProbability;
 
  public:
@@ -142,7 +147,7 @@ class CTCBeamSearchDecoder : public CTCDecoder {
   float label_selection_margin_ = -1;  // -1 means unlimited.
 
   gtl::TopN<BeamEntry*, CTCBeamComparer> leaves_;
-  std::unique_ptr<BeamEntry> beam_root_;
+  std::unique_ptr<BeamRoot> beam_root_;
   BaseBeamScorer<CTCBeamState>* beam_scorer_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(CTCBeamSearchDecoder);
@@ -367,15 +372,15 @@ void CTCBeamSearchDecoder<CTCBeamState, CTCBeamComparer>::Reset() {
 
   // This beam root, and all of its children, will be in memory until
   // the next reset.
-  beam_root_.reset(new BeamEntry(nullptr, -1));
-  beam_root_->newp.total = 0.0;  // ln(1)
-  beam_root_->newp.blank = 0.0;  // ln(1)
+  beam_root_.reset(new BeamRoot(nullptr, -1));
+  beam_root_->RootEntry()->newp.total = 0.0;  // ln(1)
+  beam_root_->RootEntry()->newp.blank = 0.0;  // ln(1)
 
   // Add the root as the initial leaf.
-  leaves_.push(beam_root_.get());
+  leaves_.push(beam_root_->RootEntry());
 
   // Call initialize state on the root object.
-  beam_scorer_->InitializeState(&beam_root_->state);
+  beam_scorer_->InitializeState(&beam_root_->RootEntry()->state);
 }
 
 template <typename CTCBeamState, typename CTCBeamComparer>
diff --git a/tensorflow/core/util/ctc/ctc_decoder.h b/tensorflow/core/util/ctc/ctc_decoder.h
index 5b28aeb70ad4bd91800dda824f0bdffd5fcbea7c..b8bab69053fa65d4a29eb08ba10154c1b68a184d 100644
--- a/tensorflow/core/util/ctc/ctc_decoder.h
+++ b/tensorflow/core/util/ctc/ctc_decoder.h
@@ -16,6 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_UTIL_CTC_CTC_DECODER_H_
 #define TENSORFLOW_CORE_UTIL_CTC_CTC_DECODER_H_
 
+#include <memory>
+#include <vector>
+
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/util/ctc/ctc_loss_calculator.h b/tensorflow/core/util/ctc/ctc_loss_calculator.h
index be00895b0d3517fe06a852685f79f32e5a0b5167..dd1163310bf406b66bdd450ac6bf840272f7c592 100644
--- a/tensorflow/core/util/ctc/ctc_loss_calculator.h
+++ b/tensorflow/core/util/ctc/ctc_loss_calculator.h
@@ -130,13 +130,13 @@ Status CTCLossCalculator::CalculateLoss(
   for (int t = 1; t < num_time_steps; ++t) {
     if (inputs[t].rows() != batch_size) {
       return errors::InvalidArgument("Expected batch size at t: ", t,
-                                     " to be: ", batch_size, " but got: ",
-                                     inputs[t].rows());
+                                     " to be: ", batch_size,
+                                     " but got: ", inputs[t].rows());
     }
     if (inputs[t].cols() != num_classes) {
       return errors::InvalidArgument("Expected class count at t: ", t,
-                                     " to be: ", num_classes, " but got: ",
-                                     inputs[t].cols());
+                                     " to be: ", num_classes,
+                                     " but got: ", inputs[t].cols());
     }
   }
 
@@ -282,8 +282,8 @@ Status CTCLossCalculator::PopulateLPrimes(
     LabelSequences* l_primes) const {
   // labels is a Label array of size batch_size
   if (labels.size() != batch_size) {
-    return errors::InvalidArgument("labels.size() != batch_size: ",
-                                   labels.size(), " vs. ", batch_size);
+    return errors::InvalidArgument(
+        "labels.size() != batch_size: ", labels.size(), " vs. ", batch_size);
   }
 
   *max_u_prime = 0;  // keep track of longest l' modified label sequence.
@@ -325,12 +325,13 @@ Status CTCLossCalculator::PopulateLPrimes(
     for (int l_i : l) {
       if (l_i < 0) {
         return errors::InvalidArgument(
-            "All labels must be nonnegative integers, batch: ", b, " labels: ",
-            str_util::Join(l, ","));
+            "All labels must be nonnegative integers, batch: ", b,
+            " labels: ", str_util::Join(l, ","));
       } else if (l_i >= num_classes) {
         return errors::InvalidArgument(
-            "No label may be greater than num_classes. ", "num_classes: ",
-            num_classes, ", batch: ", b, " labels: ", str_util::Join(l, ","));
+            "No label may be greater than num_classes. ",
+            "num_classes: ", num_classes, ", batch: ", b,
+            " labels: ", str_util::Join(l, ","));
       }
     }
     if (!ignore_longer_outputs_than_inputs) {
diff --git a/tensorflow/core/util/cuda_device_functions.h b/tensorflow/core/util/cuda_device_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..f2d4e470c82d9a1480ac1bf7726a7a7a9ae08715
--- /dev/null
+++ b/tensorflow/core/util/cuda_device_functions.h
@@ -0,0 +1,635 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_CUDA_DEVICE_FUNCTIONS_H_
+#define TENSORFLOW_CORE_UTIL_CUDA_DEVICE_FUNCTIONS_H_
+
+/**
+ * Wrappers and helpers for CUDA device code.
+ *
+ * Wraps the warp-cooperative intrinsics introduced in CUDA 9 to provide
+ * backwards compatibility, see go/volta-porting for details.
+ * Provides atomic operations on types that aren't natively supported.
+ */
+
+#if GOOGLE_CUDA
+
+#include <algorithm>
+#include <complex>
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "cuda/include/cuda.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+namespace detail {
+
+// Helper for range-based for loop using 'delta' increments.
+// Usage: see CudaGridRange?() functions below.
+template <typename T>
+class CudaGridRange {
+  struct Iterator {
+    __device__ Iterator(T index, T delta) : index_(index), delta_(delta) {}
+    __device__ T operator*() const { return index_; }
+    __device__ Iterator& operator++() {
+      index_ += delta_;
+      return *this;
+    }
+    __device__ bool operator!=(const Iterator& other) const {
+      bool greater = index_ > other.index_;
+      bool less = index_ < other.index_;
+      // Anything past an end iterator (delta_ == 0) is equal.
+      // In range-based for loops, this optimizes to 'return less'.
+      if (!other.delta_) {
+        return less;
+      }
+      if (!delta_) {
+        return greater;
+      }
+      return less || greater;
+    }
+
+   private:
+    T index_;
+    const T delta_;
+  };
+
+ public:
+  __device__ CudaGridRange(T begin, T delta, T end)
+      : begin_(begin), delta_(delta), end_(end) {}
+
+  __device__ Iterator begin() const { return Iterator{begin_, delta_}; }
+  __device__ Iterator end() const { return Iterator{end_, 0}; }
+
+ private:
+  T begin_;
+  T delta_;
+  T end_;
+};
+
+}  // namespace detail
+
+// Helper to visit indices in the range 0 <= i < count, using the x-coordinate
+// of the global thread index. That is, each index i is visited by all threads
+// with the same x-coordinate.
+// Usage: for(int i : CudaGridRangeX(count)) { visit(i); }
+template <typename T>
+__device__ detail::CudaGridRange<T> CudaGridRangeX(T count) {
+  return detail::CudaGridRange<T>(blockIdx.x * blockDim.x + threadIdx.x,
+                                  gridDim.x * blockDim.x, count);
+}
+
+// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
+// Usage: for(int i : CudaGridRangeY(count)) { visit(i); }
+template <typename T>
+__device__ detail::CudaGridRange<T> CudaGridRangeY(T count) {
+  return detail::CudaGridRange<T>(blockIdx.y * blockDim.y + threadIdx.y,
+                                  gridDim.y * blockDim.y, count);
+}
+
+// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
+// Usage: for(int i : CudaGridRangeZ(count)) { visit(i); }
+template <typename T>
+__device__ detail::CudaGridRange<T> CudaGridRangeZ(T count) {
+  return detail::CudaGridRange<T>(blockIdx.z * blockDim.z + threadIdx.z,
+                                  gridDim.z * blockDim.z, count);
+}
+
+// Mask for all 32 threads in a warp.
+const unsigned kCudaWarpAll = 0xffffffff;
+
+// Returns the warp lane ID of the calling thread
+__device__ inline unsigned CudaLaneId() {
+  unsigned int lane_id;
+  asm("mov.u32 %0, %%laneid;" : "=r"(lane_id));
+  return lane_id;
+}
+
+namespace detail {
+// Returns true if mask is a valid parameter for __shfl*sync to return a well
+// defined value, assuming the calling lane will read from src_lane as part of
+// the shuffle operation.
+//
+// Specifically, returns true iff mask has the calling lane bit and the src_lane
+// bit set, and the src_lane calls this function with the same mask value
+// (required for the two threads to wait for each other).
+//
+// On Volta, for some invalid masks, this function hangs or returns false
+// positives, because the implementation shuffles with the same mask that
+// we are validating. Run on Pascal if you suspect that the mask is incorrect.
+__device__ inline bool CudaValidateShuffleSyncMask(unsigned mask,
+                                                   unsigned src_lane) {
+  unsigned src_dst_mask = 1u << CudaLaneId() | 1u << src_lane;
+#if CUDA_VERSION >= 9000
+  unsigned src_lane_mask = __shfl_sync(mask, mask, src_lane);
+#else
+  unsigned src_lane_mask = __shfl(mask, src_lane);
+#endif
+  return (src_dst_mask & ~mask) == 0 && src_lane_mask == mask;
+}
+
+// Returns the actual source lane for shuffle.
+__device__ inline unsigned CudaShuffleGetSrcLane(int src_lane, int width) {
+  int lane_id = CudaLaneId();
+  int lane_base = lane_id & ~width + 1;
+  int lane_offset = src_lane & width - 1;
+  return lane_base + lane_offset;
+}
+
+// Returns the source lane for shuffle up.
+__device__ inline unsigned CudaShuffleUpGetSrcLane(unsigned delta, int width) {
+  unsigned lane_id = CudaLaneId();
+  if ((lane_id & width - 1) < delta) {
+    return lane_id;
+  }
+  return lane_id - delta;
+}
+
+// Returns the source lane for shuffle down.
+__device__ inline unsigned CudaShuffleDownGetSrcLane(unsigned delta,
+                                                     int width) {
+  unsigned lane_id = CudaLaneId();
+  if ((lane_id & width - 1) + delta >= width) {
+    return lane_id;
+  }
+  return lane_id + delta;
+}
+
+// Returns the source lane for shuffle xor.
+__device__ inline unsigned CudaShuffleXorGetSrcLane(int lane_mask, int width) {
+  int lane_id = CudaLaneId();
+  int src_lane = lane_id ^ lane_mask;
+  if (src_lane > (lane_id | width - 1)) {
+    return lane_id;
+  }
+  return src_lane;
+}
+}  // namespace detail
+
+// For all *_sync wrappers below, it is illegal to synchronize threads from
+// different program locations, because that is not supported before sm_70.
+// In other words, all threads in 'mask' must call the functions in convergence.
+// Code that requires sm_70 (and CUDA 9) may use the intrinsic directly.
+//
+// It is also illegal to shuffle with a mask that produces an undefined result
+// for any of the threads. Specifically, all source threads of the shuffle
+// must have their corresponding bit in 'mask' set.
+
+// Wrapper for __syncwarp. No-op for CUDA 8 and earlier.
+__device__ inline void CudaSyncWarp(unsigned mask = kCudaWarpAll) {
+  assert(mask & 1u << CudaLaneId());
+#if CUDA_VERSION >= 9000
+  __syncwarp(mask);
+#endif
+}
+
+// Wrapper for __ballot_sync. All threads in 'mask' must call this function in
+// convergence, see comment above for details.
+__device__ inline unsigned CudaBallotSync(unsigned mask, int pred) {
+  assert(mask & 1u << CudaLaneId());
+#if CUDA_VERSION >= 9000
+  return __ballot_sync(mask, pred);
+#else
+  return __ballot(pred) & mask;  // Apply mask to match __ballot_sync's spec.
+#endif
+}
+
+// Wrapper for __any_sync. All threads in 'mask' must call this function in
+// convergence, see comment above for details.
+__device__ inline int CudaAnySync(unsigned mask, int pred) {
+  assert(mask & 1u << CudaLaneId());
+#if CUDA_VERSION >= 9000
+  return __any_sync(mask, pred);
+#else
+  return __any(pred);
+#endif
+}
+
+// Wrapper for __all_sync. All threads in 'mask' must call this function in
+// convergence, see comment above for details.
+__device__ inline int CudaAllSync(unsigned mask, int pred) {
+  assert(mask & 1u << CudaLaneId());
+#if CUDA_VERSION >= 9000
+  return __all_sync(mask, pred);
+#else
+  return __all(pred);
+#endif
+}
+
+// Wrapper for __shfl_sync. All threads in 'mask' must call this function in
+// convergence, see comment above for details.
+template <typename T>
+__device__ T CudaShuffleSync(unsigned mask, T value, int src_lane,
+                             int width = warpSize) {
+  assert(!(width & width - 1));
+  assert(detail::CudaValidateShuffleSyncMask(
+      mask, detail::CudaShuffleGetSrcLane(src_lane, width)));
+#if CUDA_VERSION >= 9000
+  return __shfl_sync(mask, value, src_lane, width);
+#else
+  return __shfl(value, src_lane, width);
+#endif
+}
+
+// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
+// instead of float for lo and hi (which is incorrect with ftz, for example).
+// See b/69446944.
+__device__ inline double CudaShuffleSync(unsigned mask, double value,
+                                         int src_lane, int width = warpSize) {
+  unsigned lo, hi;
+  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
+  hi = CudaShuffleSync(mask, hi, src_lane, width);
+  lo = CudaShuffleSync(mask, lo, src_lane, width);
+  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
+  return value;
+}
+
+// Wrapper for __shfl_up_sync. All threads in 'mask' must call this function in
+// convergence, see comment above for details.
+template <typename T>
+__device__ inline T CudaShuffleUpSync(unsigned mask, T value, unsigned delta,
+                                      int width = warpSize) {
+  assert(!(width & width - 1));
+  assert(detail::CudaValidateShuffleSyncMask(
+      mask, detail::CudaShuffleUpGetSrcLane(delta, width)));
+#if CUDA_VERSION >= 9000
+  return __shfl_up_sync(mask, value, delta, width);
+#else
+  return __shfl_up(value, delta, width);
+#endif
+}
+
+// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
+// instead of float for lo and hi (which is incorrect with ftz, for example).
+// See b/69446944.
+__device__ inline double CudaShuffleUpSync(unsigned mask, double value,
+                                           unsigned delta,
+                                           int width = warpSize) {
+  unsigned lo, hi;
+  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
+  hi = CudaShuffleUpSync(mask, hi, delta, width);
+  lo = CudaShuffleUpSync(mask, lo, delta, width);
+  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
+  return value;
+}
+
+// Wrapper for __shfl_down_sync. All threads in 'mask' must call this function
+// in convergence, see comment above for details.
+template <typename T>
+__device__ inline T CudaShuffleDownSync(unsigned mask, T value, unsigned delta,
+                                        int width = warpSize) {
+  assert(!(width & width - 1));
+  assert(detail::CudaValidateShuffleSyncMask(
+      mask, detail::CudaShuffleDownGetSrcLane(delta, width)));
+#if CUDA_VERSION >= 9000
+  return __shfl_down_sync(mask, value, delta, width);
+#else
+  return __shfl_down(value, delta, width);
+#endif
+}
+
+// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
+// instead of float for lo and hi (which is incorrect with ftz, for example).
+// See b/69446944.
+__device__ inline double CudaShuffleDownSync(unsigned mask, double value,
+                                             unsigned delta,
+                                             int width = warpSize) {
+  unsigned lo, hi;
+  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
+  hi = CudaShuffleDownSync(mask, hi, delta, width);
+  lo = CudaShuffleDownSync(mask, lo, delta, width);
+  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
+  return value;
+}
+
+// Wrapper for __shfl_xor_sync. All threads in 'mask' must call this function in
+// convergence, see comment above for details.
+template <typename T>
+__device__ T CudaShuffleXorSync(unsigned mask, T value, int lane_mask,
+                                int width = warpSize) {
+  assert(!(width & width - 1));
+  assert(detail::CudaValidateShuffleSyncMask(
+      mask, detail::CudaShuffleXorGetSrcLane(lane_mask, width)));
+#if CUDA_VERSION >= 9000
+  return __shfl_xor_sync(mask, value, lane_mask, width);
+#else
+  return __shfl_xor(value, lane_mask, width);
+#endif
+}
+
+// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
+// instead of float for lo and hi (which is incorrect with ftz, for example).
+// See b/69446944.
+__device__ inline double CudaShuffleXorSync(unsigned mask, double value,
+                                            int lane_mask,
+                                            int width = warpSize) {
+  unsigned lo, hi;
+  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
+  hi = CudaShuffleXorSync(mask, hi, lane_mask, width);
+  lo = CudaShuffleXorSync(mask, lo, lane_mask, width);
+  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
+  return value;
+}
+
+// Wrapper for __ldg.
+template <typename T>
+__host__ __device__ T CudaLdg(const T* address) {
+#if __CUDA_ARCH__ >= 350
+  return __ldg(address);
+#else
+  return *address;
+#endif
+}
+
+__host__ __device__ inline bool CudaLdg(const bool* address) {
+  return CudaLdg(reinterpret_cast<const char*>(address)) != 0;
+}
+
+__host__ __device__ inline std::complex<float> CudaLdg(
+    const std::complex<float>* address) {
+#if __CUDA_ARCH__ >= 350
+  float2 mem = __ldg(reinterpret_cast<const float2*>(address));
+  return std::complex<float>(mem.x, mem.y);
+#else
+  return *address;
+#endif
+}
+
+__host__ __device__ inline std::complex<double> CudaLdg(
+    const std::complex<double>* address) {
+#if __CUDA_ARCH__ >= 350
+  double2 mem = __ldg(reinterpret_cast<const double2*>(address));
+  return std::complex<double>(mem.x, mem.y);
+#else
+  return *address;
+#endif
+}
+
+// Zeroes count elements starting at ptr using all threads of a 1-D grid.
+// Note: this function does not synchronize, and therefore the memory range is
+// not guaranteed to be zero until the next kernel launch.
+template <typename T>
+__global__ void SetZero(const int count, T* ptr) {
+  // Check that the grid is one dimensional and index doesn't overflow.
+  assert(blockDim.y == 1 && blockDim.z == 1);
+  assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
+  for (int i : CudaGridRangeX(count)) {
+    ptr[i] = T(0);
+  }
+}
+
+// Helper to set all tensor entries to a specific value.
+template <typename T>
+__global__ void SetToValue(const int count, T* ptr, T value) {
+  // Check that the grid is one dimensional and index doesn't overflow.
+  assert(blockDim.y == 1 && blockDim.z == 1);
+  assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
+  for (int i : CudaGridRangeX(count)) {
+    ptr[i] = value;
+  }
+}
+
+namespace detail {
+// Helper function for atomic accumulation implemented as CAS.
+template <typename T, typename F>
+__device__ T CudaAtomicCasHelper(T* ptr, F accumulate) {
+  T old = *ptr;
+  T assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(ptr, assumed, accumulate(assumed));
+  } while (assumed != old);
+  return old;
+}
+
+// Overload for floating point (using integer comparison to handle NaN
+// correctly).
+template <typename F>
+__device__ float CudaAtomicCasHelper(float* ptr, F accumulate) {
+  return __float_as_int(
+      CudaAtomicCasHelper(reinterpret_cast<int32*>(ptr), [accumulate](int32 a) {
+        return __float_as_int(accumulate(__int_as_float(a)));
+      }));
+}
+template <typename F>
+__device__ double CudaAtomicCasHelper(double* ptr, F accumulate) {
+  return __longlong_as_double(CudaAtomicCasHelper(
+      reinterpret_cast<tensorflow::uint64*>(ptr),
+      [accumulate](tensorflow::uint64 a) {
+        return __double_as_longlong(accumulate(__longlong_as_double(a)));
+      }));
+}
+
+// Overload of above function for half. Note that we don't have
+// atomicCAS() for anything less than 32 bits, so we need to include the
+// other 16 bits in the operation.
+//
+// This version is going to be very slow
+// under high concurrency, since most threads will be spinning on failing
+// their compare-and-swap tests. (The fact that we get false sharing on the
+// neighboring fp16 makes this even worse.) If you are doing a large reduction,
+// you are much better off with doing the intermediate steps in fp32 and then
+// switching to fp16 as late as you can in the calculations.
+//
+// Note: Assumes little endian.
+template <typename F>
+__device__ Eigen::half CudaAtomicCasHelper(Eigen::half* ptr, F accumulate) {
+#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__)
+  static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Not little endian");
+#endif
+  namespace half_impl = Eigen::half_impl;
+  intptr_t intptr = reinterpret_cast<intptr_t>(ptr);
+  assert(!(intptr & 0x1));  // should be 2-aligned.
+  if (intptr & 0x2) {
+    // The half is in the second part of the uint32 (upper 16 bits).
+    uint32* address = reinterpret_cast<uint32*>(intptr - 2);
+    uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
+      unsigned short high = static_cast<unsigned short>(arg >> 16);
+      Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(high));
+      return (static_cast<uint32>(acc.x) << 16) | (arg & 0xffff);
+    });
+    return half_impl::raw_uint16_to_half(static_cast<uint16>(result >> 16));
+  } else {
+    // The half is in the first part of the uint32 (lower 16 bits).
+    uint32* address = reinterpret_cast<uint32*>(intptr);
+    uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
+      unsigned short low = static_cast<unsigned short>(arg & 0xffff);
+      Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(low));
+      return (arg & 0xffff0000) | static_cast<uint32>(acc.x);
+    });
+    return half_impl::raw_uint16_to_half(static_cast<uint16>(result & 0xffff));
+  }
+}
+
+template <typename From, typename To>
+using ToTypeIfConvertible =
+    typename std::enable_if<std::is_convertible<From, To>::value, To>::type;
+
+}  // namespace detail
+
+// CUDA provides atomic ops, but not for all types.  We provide wrappers
+// for some ops and provide implementation for all reasonable types.
+
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicAdd(T* ptr, U value) {
+  return atomicAdd(ptr, value);
+}
+
+__device__ inline Eigen::half CudaAtomicAdd(Eigen::half* ptr,
+                                            Eigen::half value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](Eigen::half a) { return a + value; });
+}
+
+
+#if __CUDA_ARCH__ < 600
+__device__ inline double CudaAtomicAdd(double* ptr, double value) {
+  return detail::CudaAtomicCasHelper(ptr,
+                                     [value](double a) { return a + value; });
+}
+#elif __clang__
+// Clang cannot compile __nvvm_atom_add_gen_d builtin yet, use inline PTX.
+// see https://reviews.llvm.org/D39638
+__device__ inline double CudaAtomicAdd(double* ptr, double value) {
+  double result;
+  asm volatile("atom.add.f64 %0, [%1], %2;"
+               : "=d"(result)
+               : "l"(ptr), "d"(value)
+               : "memory");
+  return result;
+}
+#endif
+// CudaAtomicAdd
+// Specializations of CudaAtomicAdd for complex types, which CudaAtomicAdd does
+// not support. We treat a std::complex<T>* as a T* (the C++ standard section
+// 26.4.4 allows this explicitly) and atomic add the real and imaginary
+// components individually. The operation as a whole is not atomic, but we can
+// safely treat the components independently for the purpose of accumulating.
+__device__ inline std::complex<float> CudaAtomicAdd(std::complex<float>* ptr,
+                                                    std::complex<float> value) {
+  auto ptr_scalar = reinterpret_cast<float*>(ptr);
+  return std::complex<float>(CudaAtomicAdd(ptr_scalar, value.real()),
+                             CudaAtomicAdd(ptr_scalar + 1, value.imag()));
+}
+
+__device__ inline std::complex<double> CudaAtomicAdd(
+    std::complex<double>* ptr, std::complex<double> value) {
+  auto ptr_scalar = reinterpret_cast<double*>(ptr);
+  return std::complex<double>(CudaAtomicAdd(ptr_scalar, value.real()),
+                              CudaAtomicAdd(ptr_scalar + 1, value.imag()));
+}
+
+// CudaAtomicSub
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicSub(T* ptr, U value) {
+  return atomicSub(ptr, value);
+}
+
+// Specializations of substraction which add the negative value.
+__device__ inline float CudaAtomicSub(float* ptr, float value) {
+  return CudaAtomicAdd(ptr, -value);
+}
+
+__device__ inline double CudaAtomicSub(double* ptr, double value) {
+  return CudaAtomicAdd(ptr, -value);
+}
+
+__device__ inline tensorflow::uint64 CudaAtomicSub(tensorflow::uint64* ptr,
+                                                   tensorflow::uint64 value) {
+  return CudaAtomicAdd(ptr, -value);
+}
+
+__device__ inline Eigen::half CudaAtomicSub(Eigen::half* ptr,
+                                            Eigen::half value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](Eigen::half a) { return a - value; });
+}
+
+// CudaAtomicMax
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMax(T* ptr, U value) {
+  return atomicMax(ptr, value);
+}
+
+__device__ inline float CudaAtomicMax(float* ptr, float value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](float a) { return max(a, value); });
+}
+
+__device__ inline double CudaAtomicMax(double* ptr, double value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](double a) { return max(a, value); });
+}
+
+__device__ inline Eigen::half CudaAtomicMax(Eigen::half* ptr,
+                                            Eigen::half value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](Eigen::half a) { return max(a, value); });
+}
+
+#if __CUDA_ARCH__ < 320
+__device__ inline tensorflow::uint64 CudaAtomicMax(tensorflow::uint64* ptr,
+                                                   tensorflow::uint64 value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](tensorflow::uint64 a) { return max(a, value); });
+}
+#endif
+
+// CudaAtomicMin
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMin(T* ptr, U value) {
+  return atomicMin(ptr, value);
+}
+
+__device__ inline float CudaAtomicMin(float* ptr, float value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](float a) { return min(a, value); });
+}
+
+__device__ inline double CudaAtomicMin(double* ptr, double value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](double a) { return min(a, value); });
+}
+
+__device__ inline Eigen::half CudaAtomicMin(Eigen::half* ptr,
+                                            Eigen::half value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](Eigen::half a) { return min(a, value); });
+}
+
+#if __CUDA_ARCH__ < 320
+__device__ inline tensorflow::uint64 CudaAtomicMin(tensorflow::uint64* ptr,
+                                                   tensorflow::uint64 value) {
+  return detail::CudaAtomicCasHelper(
+      ptr, [value](tensorflow::uint64 a) { return min(a, value); });
+}
+#endif
+
+// CudaAtomicMul
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMul(T* ptr, U value) {
+  return detail::CudaAtomicCasHelper(ptr, [value](T a) { return a * value; });
+}
+
+// CudaAtomicDiv
+template <typename T, typename U>
+__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicDiv(T* ptr, U value) {
+  return detail::CudaAtomicCasHelper(ptr, [value](T a) { return a / value; });
+}
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
+#endif  // TENSORFLOW_CORE_UTIL_CUDA_KERNEL_HELPER_H_
diff --git a/tensorflow/core/util/cuda_kernel_helper.h b/tensorflow/core/util/cuda_kernel_helper.h
index cf11f419a4effd868fa9c933240acb9a05bfa355..3c59524cb6f85911544b8f2d7d3339e19af7f5b4 100644
--- a/tensorflow/core/util/cuda_kernel_helper.h
+++ b/tensorflow/core/util/cuda_kernel_helper.h
@@ -18,299 +18,79 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 
-#include <algorithm>
+#include "tensorflow/core/util/cuda_device_functions.h"
+#include "tensorflow/core/util/cuda_launch_config.h"
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "cuda/include/cuda.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/stream_executor.h"
-#include "tensorflow/core/platform/types.h"
+// Deprecated, use 'for(int i : CudaGridRangeX(n))' instead.
+#define CUDA_1D_KERNEL_LOOP(i, n) \
+  for (int i : ::tensorflow::CudaGridRangeX<int>(n))
+// Deprecated, use 'for(int i : CudaGridRange?(n))' instead.
+#define CUDA_AXIS_KERNEL_LOOP(i, n, axis) \
+  for (int i : ::tensorflow::CudaGridRange##axis<int>(n))
 
-// Mask for all 32 threads in a warp.
-#define CUDA_WARP_ALL 0xFFFFFFFF
-
-#if defined(CUDA_VERSION) && CUDA_VERSION < 9000
-// CUDA 9.0 introduces a new, light-weight barrier synchronization primitive
-// that operates at the warp-scope. This is required to ensure visibility of
-// reads/writes among threads that can make indepenent progress on Volta.
-// For previous CUDA versions these synchronizations not necessary, and we
-// define an empty function as a convenience for backward compatibility.
-__device__ inline void __syncwarp(unsigned mask = CUDA_WARP_ALL) {}
-
-// CUDA 9.0 deprecates the warp-intrinsic functions (shfl, ballot, etc.) in
-// favor of synchronizing versions. These ensure that all warp lanes specified
-// in mask execute the intrinsic in convergence. Here we provide legacy mappings
-// to the less-verbose routines provided in previous versions of CUDA.
-#define __ballot_sync(mask, predicate) __ballot(predicate)
-#define __shfl_sync(mask, val, srcLane, width) __shfl(val, srcLane, width)
-#define __shfl_down_sync(mask, val, delta, width) __shfl_down(val, delta, width)
-#define __shfl_up_sync(mask, val, delta, width) __shfl_up(val, delta, width)
-#define __shfl_xor_sync(mask, val, laneMask, width) \
-  __shfl_xor(val, laneMask, width)
-#endif
-
-// Usage of GetCudaLaunchConfig, GetCuda2DLaunchConfig, and
-// GetCuda3DLaunchConfig:
-//
-// There are two versions of GetCudaLaunchConfig and GetCuda2DLaunchConfig, one
-// version uses heuristics without any knowledge of the device kernel, the other
-// version uses cudaOccupancyMaxPotentialBlockSize to determine the theoretical
-// launch parameters that maximize occupancy. Currently, only the maximum
-// occupancy version of GetCuda3DLaunchConfig is available.
-//
-// For large number of work elements, the convention is that each kernel would
-// iterate through its assigned range. The return value of GetCudaLaunchConfig
-// is struct CudaLaunchConfig, which contains all the information needed for the
-// kernel launch, including: virtual number of threads, the number of threads
-// per block and number of threads per block used inside <<< >>> of a kernel
-// launch. GetCuda2DLaunchConfig and GetCuda3DLaunchConfig does the same thing
-// as CudaLaunchConfig. The only difference is the dimension. The macros
-// CUDA_1D_KERNEL_LOOP and CUDA_AXIS_KERNEL_LOOP might be used to do inner loop.
-//
-/* Sample code:
-
-__global__ void MyKernel1D(CudaLaunchConfig config, other_args...) {
-  CUDA_1D_KERNEL_LOOP(x, config.virtual_thread_count) {
-    do_your_job_here;
-  }
+namespace tensorflow {
+__host__ __device__ inline tensorflow::bfloat16 CudaLdg(
+    const tensorflow::bfloat16* address) {
+  tensorflow::bfloat16 return_value;
+  return_value.value = CudaLdg(reinterpret_cast<const uint16_t*>(address));
+  return return_value;
 }
 
-__global__ void MyKernel2D(Cuda2DLaunchConfig config, other_args...) {
-  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
-    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
-      do_your_job_here;
-    }
-  }
+template <typename T>
+__host__ __device__ inline T ldg(const T* ptr) {
+  return CudaLdg(ptr);
 }
 
-__global__ void MyKernel3D(Cuda3DLaunchConfig config, other_args...) {
-  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
-    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
-      CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count, z) {
-        do_your_job_here;
-      }
-    }
-  }
+template <typename T>
+__host__ __device__ inline const T& tf_min(const T& x, const T& y) {
+  return x < y ? x : y;
 }
 
-void MyDriverFunc(const GPUDevice &d) {
-  // use heuristics
-  CudaLaunchConfig cfg1 = GetCudaLaunchConfig(10240, d);
-  MyKernel1D <<<config.block_count,
-                config.thread_per_block, 0, d.stream()>>> (cfg1, other_args...);
-  Cuda2DLaunchConfig cfg2 = GetCuda2DLaunchConfig(10240, 10240, d);
-  MyKernel2D <<<config.block_count,
-                config.thread_per_block, 0, d.stream()>>> (cfg2, other_args...);
-  Cuda3DLaunchConfig cfg3 = GetCuda3DLaunchConfig(4096, 4096, 100, d);
-  MyKernel3D <<<config.block_count,
-                config.thread_per_block, 0, d.stream()>>> (cfg3, other_args...);
-
-  // maximize occupancy
-  CudaLaunchConfig cfg4 = GetCudaLaunchConfig(10240, d, MyKernel1D, 0, 0 );
-  MyKernel1D <<<config.block_count,
-                config.thread_per_block, 0, d.stream()>>> (cfg4, other_args...);
-  Cuda2DLaunchConfig cfg5 = GetCuda2DLaunchConfig(10240, 10240, d,
-                                                  MyKernel1D, 0, 0);
-  MyKernel2D <<<config.block_count,
-                config.thread_per_block, 0, d.stream()>>> (cfg5, other_args...);
-  Cuda3DLaunchConfig cfg6 = GetCuda3DLaunchConfig(4096, 4096, 100, d,
-                                                  MyKernel1D, 0, 0);
-  MyKernel3D <<<config.block_count,
-                config.thread_per_block, 0, d.stream()>>> (cfg6, other_args...);
+template <typename T>
+__host__ __device__ inline const T& tf_max(const T& x, const T& y) {
+  return x < y ? y : x;
 }
 
-// See the test for this for more example:
-//
-https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
-
-*/
-
-#define CUDA_1D_KERNEL_LOOP(i, n)                            \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
-       i += blockDim.x * gridDim.x)
-
-#define CUDA_AXIS_KERNEL_LOOP(i, n, axis)                                  \
-  for (int i = blockIdx.axis * blockDim.axis + threadIdx.axis; i < n.axis; \
-       i += blockDim.axis * gridDim.axis)
-
-#define DIV_UP(a, b) (((a) + (b)-1) / (b))
-
-namespace tensorflow {
-
-typedef Eigen::GpuDevice GPUDevice;
-
-struct CudaLaunchConfig {
-  // Logical number of thread that works on the elements. If each logical
-  // thread works on exactly a single element, this is the same as the working
-  // element count.
-  int virtual_thread_count = -1;
-  // Number of threads per block.
-  int thread_per_block = -1;
-  // Number of blocks for Cuda kernel launch.
-  int block_count = -1;
-};
-
-// Calculate the Cuda launch config we should use for a kernel launch.
-// This is assuming the kernel is quite simple and will largely be
-// memory-limited.
-// REQUIRES: work_element_count > 0.
-inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
-                                            const GPUDevice& d) {
-  CHECK_GT(work_element_count, 0);
-  CudaLaunchConfig config;
-  const int virtual_thread_count = work_element_count;
-  const int physical_thread_count = std::min(
-      d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor(),
-      virtual_thread_count);
-  const int thread_per_block = std::min(1024, d.maxCudaThreadsPerBlock());
-  const int block_count =
-      std::min(DIV_UP(physical_thread_count, thread_per_block),
-               d.getNumCudaMultiProcessors());
-
-  config.virtual_thread_count = virtual_thread_count;
-  config.thread_per_block = thread_per_block;
-  config.block_count = block_count;
-  return config;
+// Overloads of the above functions for float and double.
+__host__ __device__ inline float tf_min(float x, float y) {
+  return fminf(x, y);
 }
-
-// Calculate the Cuda launch config we should use for a kernel launch. This
-// variant takes the resource limits of func into account to maximize occupancy.
-// REQUIRES: work_element_count > 0.
-template <typename DeviceFunc>
-inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
-                                            const GPUDevice& d, DeviceFunc func,
-                                            size_t dynamic_shared_memory_size,
-                                            int block_size_limit) {
-  CHECK_GT(work_element_count, 0);
-  CudaLaunchConfig config;
-  int block_count = 0;
-  int thread_per_block = 0;
-
-  cudaError_t err = cudaOccupancyMaxPotentialBlockSize(
-      &block_count, &thread_per_block, func, dynamic_shared_memory_size,
-      block_size_limit);
-  CHECK_EQ(err, cudaSuccess);
-
-  block_count =
-      std::min(block_count, DIV_UP(work_element_count, thread_per_block));
-
-  config.virtual_thread_count = work_element_count;
-  config.thread_per_block = thread_per_block;
-  config.block_count = block_count;
-  return config;
+__host__ __device__ inline double tf_min(double x, double y) {
+  return fmin(x, y);
 }
-
-struct Cuda2DLaunchConfig {
-  dim3 virtual_thread_count = dim3(0, 0, 0);
-  dim3 thread_per_block = dim3(0, 0, 0);
-  dim3 block_count = dim3(0, 0, 0);
-};
-
-inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim,
-                                                const GPUDevice& d) {
-  Cuda2DLaunchConfig config;
-
-  if (xdim <= 0 || ydim <= 0) {
-    return config;
-  }
-
-  const int kThreadsPerBlock = 256;
-  int block_cols = std::min(xdim, kThreadsPerBlock);
-  // ok to round down here and just do more loops in the kernel
-  int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
-
-  const int physical_thread_count =
-      d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor();
-
-  const int max_blocks = std::max(physical_thread_count / kThreadsPerBlock, 1);
-
-  config.virtual_thread_count = dim3(xdim, ydim, 1);
-  config.thread_per_block = dim3(block_cols, block_rows, 1);
-
-  int grid_x = std::min(DIV_UP(xdim, block_cols), max_blocks);
-
-  config.block_count = dim3(
-      grid_x, std::min(max_blocks / grid_x, std::max(ydim / block_rows, 1)), 1);
-  return config;
+__host__ __device__ inline float tf_max(float x, float y) {
+  return fmaxf(x, y);
+}
+__host__ __device__ inline double tf_max(double x, double y) {
+  return fmax(x, y);
 }
 
-// Calculate the Cuda 2D and 3D launch config we should use for a kernel launch.
-// This variant takes the resource limits of func into account to maximize
-// occupancy.
-using Cuda3DLaunchConfig = Cuda2DLaunchConfig;
-
-template <typename DeviceFunc>
-inline Cuda3DLaunchConfig GetCuda3DLaunchConfig(
-    int xdim, int ydim, int zdim, const GPUDevice& d, DeviceFunc func,
-    size_t dynamic_shared_memory_size, int block_size_limit) {
-  Cuda3DLaunchConfig config;
-
-  if (xdim <= 0 || ydim <= 0 || zdim <= 0) {
-    return config;
-  }
-
-  int dev;
-  cudaGetDevice(&dev);
-  cudaDeviceProp deviceProp;
-  cudaGetDeviceProperties(&deviceProp, dev);
-  int xthreadlimit = deviceProp.maxThreadsDim[0];
-  int ythreadlimit = deviceProp.maxThreadsDim[1];
-  int zthreadlimit = deviceProp.maxThreadsDim[2];
-  int xgridlimit = deviceProp.maxGridSize[0];
-  int ygridlimit = deviceProp.maxGridSize[1];
-  int zgridlimit = deviceProp.maxGridSize[2];
-
-  int block_count = 0;
-  int thread_per_block = 0;
-  cudaError_t err = cudaOccupancyMaxPotentialBlockSize(
-      &block_count, &thread_per_block, func, dynamic_shared_memory_size,
-      block_size_limit);
-  CHECK_EQ(err, cudaSuccess);
-
-#define MIN3(a, b, c) std::min((a), std::min((b), (c)))
-  int threadsx = MIN3(xdim, thread_per_block, xthreadlimit);
-  int threadsy =
-      MIN3(ydim, std::max(thread_per_block / threadsx, 1), ythreadlimit);
-  int threadsz =
-      MIN3(zdim, std::max(thread_per_block / (threadsx * threadsy), 1),
-           zthreadlimit);
-
-  int blocksx = MIN3(block_count, DIV_UP(xdim, threadsx), xgridlimit);
-  int blocksy =
-      MIN3(DIV_UP(block_count, blocksx), DIV_UP(ydim, threadsy), ygridlimit);
-  int blocksz = MIN3(DIV_UP(block_count, (blocksx * blocksy)),
-                     DIV_UP(zdim, threadsz), zgridlimit);
-#undef MIN3
+__device__ inline Eigen::half CudaShuffleSync(unsigned mask, Eigen::half value,
+                                              int src_lane,
+                                              int width = warpSize) {
+  return Eigen::half(
+      CudaShuffleSync(mask, static_cast<uint16>(value), src_lane, width));
+}
 
-  config.virtual_thread_count = dim3(xdim, ydim, zdim);
-  config.thread_per_block = dim3(threadsx, threadsy, threadsz);
-  config.block_count = dim3(blocksx, blocksy, blocksz);
-  return config;
+__device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleUpSync(
+    unsigned mask, Eigen::half value, int delta, int width = warpSize) {
+  return Eigen::half(
+      CudaShuffleUpSync(mask, static_cast<uint16>(value), delta, width));
 }
 
-template <typename DeviceFunc>
-inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(
-    int xdim, int ydim, const GPUDevice& d, DeviceFunc func,
-    size_t dynamic_shared_memory_size, int block_size_limit) {
-  return GetCuda3DLaunchConfig(xdim, ydim, 1, d, func,
-                               dynamic_shared_memory_size, block_size_limit);
+__device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleDownSync(
+    unsigned mask, Eigen::half value, int delta, int width = warpSize) {
+  return Eigen::half(
+      CudaShuffleDownSync(mask, static_cast<uint16>(value), delta, width));
 }
 
-// Returns a raw reference to the current cuda stream.  Required by a
-// number of kernel calls (for which StreamInterface* does not work), i.e.
-// CUB and certain cublas primitives.
-inline const cudaStream_t& GetCudaStream(OpKernelContext* context) {
-  const cudaStream_t* ptr = CHECK_NOTNULL(
-      reinterpret_cast<const cudaStream_t*>(context->op_device_context()
-                                                ->stream()
-                                                ->implementation()
-                                                ->CudaStreamMemberHack()));
-  return *ptr;
+__device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleXorSync(
+    unsigned mask, Eigen::half value, int lane_mask, int width = warpSize) {
+  return Eigen::half(
+      CudaShuffleXorSync(mask, static_cast<uint16>(value), lane_mask, width));
 }
 
 namespace cuda_helper {
-
 template <typename IntType>
 __device__ IntType upper_bound(IntType* first, IntType count, IntType val) {
   IntType* orig = first;
@@ -330,481 +110,8 @@ __device__ IntType upper_bound(IntType* first, IntType count, IntType val) {
 
   return first - orig;
 }
-
 }  // namespace cuda_helper
-
-template <typename T>
-__device__ __host__ inline T ldg(const T* address) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  return __ldg(address);
-#else
-  return *address;
-#endif
-}
-
-template <>
-__device__ __host__ inline std::complex<float> ldg(
-    const std::complex<float>* address) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  float2 mem = __ldg(reinterpret_cast<const float2*>(address));
-  return std::complex<float>(mem.x, mem.y);
-#else
-  return *address;
-#endif
-}
-
-template <>
-__device__ __host__ inline std::complex<double> ldg(
-    const std::complex<double>* address) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  double2 mem = __ldg(reinterpret_cast<const double2*>(address));
-  return std::complex<double>(mem.x, mem.y);
-#else
-  return *address;
-#endif
-}
-
-template <>
-__device__ __host__ inline Eigen::half ldg(const Eigen::half* address) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  return Eigen::half_impl::raw_uint16_to_half(
-      __ldg(reinterpret_cast<const uint16_t*>(address)));
-#else
-  return *address;
-#endif
-}
-
-template <>
-__device__ __host__ inline bool ldg(const bool* address) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-  return *reinterpret_cast<const bool*>(
-      __ldg(reinterpret_cast<const char*>(address)));
-#else
-  return *address;
-#endif
-}
-
-// CUDA provides atomic ops, but not for all types.  We provide wrappers
-// for some ops and provide implementation for all reasonable types.
-#define CUDA_ATOMIC_WRAPPER(op, T) \
-  __device__ __forceinline__ T CudaAtomic##op(T* address, T val)
-
-#define USE_CUDA_ATOMIC(op, T) \
-  CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); }
-
-// For atomicAdd.
-USE_CUDA_ATOMIC(Add, int32);
-USE_CUDA_ATOMIC(Add, uint32);
-USE_CUDA_ATOMIC(Add, uint64);
-USE_CUDA_ATOMIC(Add, float);
-
-// For atomicMax.
-USE_CUDA_ATOMIC(Max, int32);
-USE_CUDA_ATOMIC(Max, uint32);
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
-USE_CUDA_ATOMIC(Max, uint64);
-#else
-// The uint64 overload of atomicMax() is only available for __CUDA_ARCH__ >=
-// 350.  If not satisfied, we provide a custom implementation using atomicCAS().
-CUDA_ATOMIC_WRAPPER(Max, uint64) {
-  uint64* address_as_ull = reinterpret_cast<uint64*>(address);
-  uint64 old = *address_as_ull, assumed;
-
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull, assumed, max(val, assumed));
-  } while (assumed != old);
-
-  return old;
-}
-#endif
-
-// Custom implementation of atomicAdd for double.
-// This implementation is copied from CUDA manual.
-CUDA_ATOMIC_WRAPPER(Add, double) {
-  uint64* address_as_ull = reinterpret_cast<uint64*>(address);
-  uint64 old = *address_as_ull, assumed;
-
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull, assumed,
-                    __double_as_longlong(val + __longlong_as_double(assumed)));
-
-    // Note: uses integer comparison to avoid hang in case of NaN
-  } while (assumed != old);
-
-  return __longlong_as_double(old);
-}
-
-// Custom implementation of atomicAdd for std::complex<float>.
-// This implementation performs to atomic additions on the components.
-CUDA_ATOMIC_WRAPPER(Add, std::complex<float>) {
-#if defined(__CUDA_ARCH__)
-#if __CUDA_ARCH__ >= 350
-  float2* addr_as_float2 = reinterpret_cast<float2*>(address);
-  float2* val_as_float2 = reinterpret_cast<float2*>(&val);
-  CudaAtomicAdd(&(addr_as_float2->x), val_as_float2->x);
-  CudaAtomicAdd(&(addr_as_float2->y), val_as_float2->y);
-#else
-  static_assert(sizeof(std::complex<float>) == 2 * sizeof(float),
-                "Unable to compile CudaAtomicAdd for complex64 because "
-                "sizeof(complex64) != 2*sizeof(float32)");
-  float* addr_as_float = reinterpret_cast<float*>(address);
-  float* val_as_float = reinterpret_cast<float*>(&val);
-  CudaAtomicAdd(addr_as_float, *val_as_float);
-  CudaAtomicAdd(addr_as_float + 1, *(val_as_float + 1));
-#endif
-#endif
-  return *address;
-}
-
-// Custom implementation of atomicAdd for std::complex<double>.
-// This implementation performs to atomic additions on the components
-// using the double atomic wrapper above.
-CUDA_ATOMIC_WRAPPER(Add, complex128) {
-#if defined(__CUDA_ARCH__)
-#if __CUDA_ARCH__ >= 350
-  double2* addr_as_double2 = reinterpret_cast<double2*>(address);
-  double2* val_as_double2 = reinterpret_cast<double2*>(&val);
-  CudaAtomicAdd(&(addr_as_double2->x), val_as_double2->x);
-  CudaAtomicAdd(&(addr_as_double2->y), val_as_double2->y);
-#else
-  static_assert(sizeof(std::complex<double>) == 2 * sizeof(double),
-                "Unable to compile CudaAtomicAdd for complex128 because "
-                "sizeof(complex128) != 2*sizeof(float64)");
-  double* addr_as_double = reinterpret_cast<double*>(address);
-  double* val_as_double = reinterpret_cast<double*>(&val);
-  CudaAtomicAdd(addr_as_double, *val_as_double);
-  CudaAtomicAdd(addr_as_double + 1, *(val_as_double + 1));
-#endif
-#endif
-  return *address;
-}
-
-// Helper functions for CudaAtomicAdd(half*, half), below.
-//
-// Note that if __CUDA_ARCH__ >= 530, we could probably use __hadd2()
-// for a more efficient implementation, assuming that adding -0.0
-// will never harm the neighboring value. In this version, we take special
-// care to guarantee the bits of the untouched value are unchanged.
-inline __device__ uint32 add_to_low_half(uint32 val, float x) {
-  Eigen::half low_half;
-  low_half.x = static_cast<uint16>(val & 0xffffu);
-  low_half = static_cast<Eigen::half>(static_cast<float>(low_half) + x);
-  return (val & 0xffff0000u) | low_half.x;
-}
-
-inline __device__ uint32 add_to_high_half(uint32 val, float x) {
-  Eigen::half high_half;
-  high_half.x = static_cast<uint16>(val >> 16);
-  high_half = static_cast<Eigen::half>(static_cast<float>(high_half) + x);
-  return (val & 0xffffu) | (high_half.x << 16);
-}
-
-// Custom implementation of atomicAdd for half. Note that we don't have
-// atomicCAS() for anything less than 32 bits, so we need to include the
-// other 16 bits in the operation.
-//
-// Unlike the other atomic adds, this version is going to be very slow
-// under high concurrency, since most threads will be spinning on failing
-// their compare-and-swap tests. (The fact that we get false sharing on the
-// neighboring fp16 makes this even worse.) If you are doing a large reduction,
-// you are much better off with doing the intermediate steps in fp32 and then
-// switching to fp16 as late as you can in the calculations.
-//
-// Note: Assumes little endian.
-CUDA_ATOMIC_WRAPPER(Add, Eigen::half) {
-  float val_as_float(val);
-  intptr_t address_int = reinterpret_cast<intptr_t>(address);
-  if ((address_int & 0x2) == 0) {
-    // The half is in the first part of the uint32 (lower 16 bits).
-    uint32* address_as_uint32 = reinterpret_cast<uint32*>(address);
-    assert(((intptr_t)address_as_uint32 & 0x3) == 0);
-    uint32 old = *address_as_uint32, assumed;
-
-    do {
-      assumed = old;
-      old = atomicCAS(address_as_uint32, assumed,
-                      add_to_low_half(assumed, val_as_float));
-
-      // Note: uses integer comparison to avoid hang in case of NaN
-    } while (assumed != old);
-
-    Eigen::half ret;
-    ret.x = old & 0xffffu;
-    return ret;
-  } else {
-    // The half is in the second part of the uint32 (upper 16 bits).
-    uint32* address_as_uint32 = reinterpret_cast<uint32*>(address_int - 2);
-    assert(((intptr_t)address_as_uint32 & 0x3) == 0);
-    uint32 old = *address_as_uint32, assumed;
-
-    do {
-      assumed = old;
-      old = atomicCAS(address_as_uint32, assumed,
-                      add_to_high_half(assumed, val_as_float));
-
-      // Note: uses integer comparison to avoid hang in case of NaN
-    } while (assumed != old);
-
-    Eigen::half ret;
-    ret.x = old >> 16;
-    return ret;
-  }
-}
-
-template <typename T>
-__global__ void SetZero(const int nthreads, T* bottom_diff) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) { *(bottom_diff + index) = T(0); }
-}
-
-// For atomicSub.
-
-// Custom implementation for sub by just negating the value.
-#define WRAPPED_ATOMIC_SUB(T) \
-  CUDA_ATOMIC_WRAPPER(Sub, T) { return CudaAtomicAdd(address, -val); }
-
-WRAPPED_ATOMIC_SUB(uint64);
-WRAPPED_ATOMIC_SUB(int32);
-WRAPPED_ATOMIC_SUB(uint32);
-WRAPPED_ATOMIC_SUB(Eigen::half);
-WRAPPED_ATOMIC_SUB(float);
-WRAPPED_ATOMIC_SUB(double);
-
-CUDA_ATOMIC_WRAPPER(Sub, complex64) {
-  const std::complex<float> Tneg(-val.real(), -val.imag());
-  return CudaAtomicAdd(address, Tneg);
-}
-
-CUDA_ATOMIC_WRAPPER(Sub, complex128) {
-  const std::complex<double> Tneg(-val.real(), -val.imag());
-  return CudaAtomicAdd(address, Tneg);
-}
-
-#undef WRAPPED_ATOMIC_SUB
-
-// For atomicMul.
-CUDA_ATOMIC_WRAPPER(Mul, int32) {
-  int32 old = *address, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address, assumed, val * assumed);
-  } while (assumed != old);
-  return old;
-}
-
-CUDA_ATOMIC_WRAPPER(Mul, uint32) {
-  uint32 old = *address, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address, assumed, val * assumed);
-  } while (assumed != old);
-  return old;
-}
-
-CUDA_ATOMIC_WRAPPER(Mul, uint64) {
-  uint64 old = *address, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address, assumed, val * assumed);
-  } while (assumed != old);
-  return old;
-}
-
-CUDA_ATOMIC_WRAPPER(Mul, float) {
-  int32* address_as_int = reinterpret_cast<int32*>(address);
-  int32 old = *address_as_int, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_int, assumed,
-                    __float_as_int(val * __int_as_float(assumed)));
-  } while (assumed != old);
-  return __int_as_float(old);
-}
-
-CUDA_ATOMIC_WRAPPER(Mul, double) {
-  uint64* address_as_ull = reinterpret_cast<uint64*>(address);
-  uint64 old = *address_as_ull, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull, assumed,
-                    __double_as_longlong(val * __longlong_as_double(assumed)));
-  } while (assumed != old);
-  return __longlong_as_double(old);
-}
-
-// For atomicDiv.
-CUDA_ATOMIC_WRAPPER(Div, int32) {
-  int32 old = *address, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address, assumed, assumed / val);
-  } while (assumed != old);
-  return old;
-}
-
-CUDA_ATOMIC_WRAPPER(Div, uint32) {
-  uint32 old = *address, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address, assumed, assumed / val);
-  } while (assumed != old);
-  return old;
-}
-
-CUDA_ATOMIC_WRAPPER(Div, uint64) {
-  uint64 old = *address, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address, assumed, assumed / val);
-  } while (assumed != old);
-  return old;
-}
-
-CUDA_ATOMIC_WRAPPER(Div, float) {
-  int32* address_as_int = reinterpret_cast<int32*>(address);
-  int32 old = *address_as_int, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_int, assumed,
-                    __float_as_int(__int_as_float(assumed) / val));
-  } while (assumed != old);
-  return __int_as_float(old);
-}
-
-CUDA_ATOMIC_WRAPPER(Div, double) {
-  uint64* address_as_ull = reinterpret_cast<uint64*>(address);
-  uint64 old = *address_as_ull, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull, assumed,
-                    __double_as_longlong(__longlong_as_double(assumed) / val));
-  } while (assumed != old);
-  return __longlong_as_double(old);
-}
-
-#undef USE_CUDA_ATOMIC
-#undef CUDA_ATOMIC_WRAPPER
-
-template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tf_min(const T& x, const T& y) {
-  return x > y ? y : x;
-}
-
-template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tf_max(const T& x, const T& y) {
-  return x < y ? y : x;
-}
-
-__device__ EIGEN_ALWAYS_INLINE unsigned CudaBallot(unsigned mask,
-                                                   int predicate) {
-  return __ballot_sync(mask, predicate);
-}
-
-template <typename T>
-__device__ EIGEN_ALWAYS_INLINE T CudaShuffle(unsigned mask, T value,
-                                             int srcLane,
-                                             int width = warpSize) {
-  return __shfl_sync(mask, value, srcLane, width);
-}
-
-// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
-// instead of float for lo and hi (which is incorrect with ftz, for example).
-// A bug has been filed with NVIDIA and will be fixed in the next CUDA release.
-// TODO(csigg): remove when the bug is fixed in the next CUDA release.
-__device__ EIGEN_ALWAYS_INLINE double CudaShuffle(unsigned mask, double value,
-                                                  int srcLane,
-                                                  int width = warpSize) {
-  unsigned lo, hi;
-  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
-  hi = __shfl_sync(mask, hi, srcLane, width);
-  lo = __shfl_sync(mask, lo, srcLane, width);
-  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
-  return value;
-}
-
-template <typename T>
-__device__ EIGEN_ALWAYS_INLINE T CudaShuffleUp(unsigned mask, T value,
-                                               int delta,
-                                               int width = warpSize) {
-  return __shfl_up_sync(mask, value, delta, width);
-}
-
-// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
-// instead of float for lo and hi (which is incorrect with ftz, for example).
-// A bug has been filed with NVIDIA and will be fixed in the next CUDA release.
-// TODO(csigg): remove when the bug is fixed in the next CUDA release.
-__device__ EIGEN_ALWAYS_INLINE double CudaShuffleUp(unsigned mask, double value,
-                                                    int delta,
-                                                    int width = warpSize) {
-  unsigned lo, hi;
-  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
-  hi = __shfl_up_sync(mask, hi, delta, width);
-  lo = __shfl_up_sync(mask, lo, delta, width);
-  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
-  return value;
-}
-
-template <typename T>
-__device__ EIGEN_ALWAYS_INLINE T CudaShuffleDown(unsigned mask, T value,
-                                                 int delta,
-                                                 int width = warpSize) {
-  return __shfl_down_sync(mask, value, delta, width);
-}
-
-__device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleDown(
-    unsigned mask, Eigen::half value, int delta, int width = warpSize) {
-  return Eigen::half(
-      __shfl_down_sync(mask, static_cast<uint16>(value), delta, width));
-}
-
-// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
-// instead of float for lo and hi (which is incorrect with ftz, for example).
-// A bug has been filed with NVIDIA and will be fixed in the next CUDA release.
-// TODO(csigg): remove when the bug is fixed in the next CUDA release.
-__device__ EIGEN_ALWAYS_INLINE double CudaShuffleDown(unsigned mask,
-                                                      double value, int delta,
-                                                      int width = warpSize) {
-  unsigned lo, hi;
-  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
-  hi = __shfl_down_sync(mask, hi, delta, width);
-  lo = __shfl_down_sync(mask, lo, delta, width);
-  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
-  return value;
-}
-
-template <typename T>
-__device__ EIGEN_ALWAYS_INLINE T CudaShuffleXor(unsigned mask, T value,
-                                                int laneMask,
-                                                int width = warpSize) {
-  return __shfl_xor_sync(mask, value, laneMask, width);
-}
-
-__device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleXor(
-    unsigned mask, Eigen::half value, int laneMask, int width = warpSize) {
-  return Eigen::half(
-      __shfl_xor_sync(mask, static_cast<uint16>(value), laneMask, width));
-}
-
-// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
-// instead of float for lo and hi (which is incorrect with ftz, for example).
-// A bug has been filed with NVIDIA and will be fixed in the next CUDA release.
-// TODO(csigg): remove when the bug is fixed in the next CUDA release.
-__device__ EIGEN_ALWAYS_INLINE double CudaShuffleXor(unsigned mask,
-                                                     double value, int laneMask,
-                                                     int width = warpSize) {
-  unsigned lo, hi;
-  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
-  hi = __shfl_xor_sync(mask, hi, laneMask, width);
-  lo = __shfl_xor_sync(mask, lo, laneMask, width);
-  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
-  return value;
-}
-
 }  // namespace tensorflow
 
-#undef DIV_UP
-
 #endif  // GOOGLE_CUDA
-
 #endif  // TENSORFLOW_CORE_UTIL_CUDA_KERNEL_HELPER_H_
diff --git a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
index 6991554effd9088c04bfcb71f274b82408507463..732ed33ede17bc90d3301d3f1eee6302a96028d7 100644
--- a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
+++ b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
@@ -52,11 +52,11 @@ __global__ void Count1D(CudaLaunchConfig config, int bufsize, int* outbuf) {
   }
 }
 __global__ void Count2D(Cuda2DLaunchConfig config, int bufsize, int* outbuf) {
-  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
+  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
     if (x < 0) {  // x might overflow when testing extreme case
       break;
     }
-    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
+    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {
       if (y < 0) {  // y might overflow when testing extreme case
         break;
       }
@@ -66,15 +66,15 @@ __global__ void Count2D(Cuda2DLaunchConfig config, int bufsize, int* outbuf) {
   }
 }
 __global__ void Count3D(Cuda3DLaunchConfig config, int bufsize, int* outbuf) {
-  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
+  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {
     if (x < 0) {  // x might overflow when testing extreme case
       break;
     }
-    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
+    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {
       if (y < 0) {  // y might overflow when testing extreme case
         break;
       }
-      CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count, z) {
+      CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count.z, Z) {
         if (z < 0) {  // z might overflow when testing extreme case
           break;
         }
@@ -87,6 +87,44 @@ __global__ void Count3D(Cuda3DLaunchConfig config, int bufsize, int* outbuf) {
   }
 }
 
+__global__ void CudaShuffleGetSrcLaneTest(unsigned* failure_count) {
+  unsigned lane_id = CudaLaneId();
+  for (int width = warpSize; width > 1; width /= 2) {
+    auto check_result = [&](const char* op_name, int param, unsigned actual,
+                            unsigned expected) {
+      if (actual != expected) {
+        printf("Cuda%sGetSrcLane(%d, %d) for lane %d returned %d, not %d\n",
+               op_name, param, width, lane_id, actual, expected);
+        CudaAtomicAdd(failure_count, 1);
+      }
+    };
+    for (int src_lane = -warpSize; src_lane <= warpSize; ++src_lane) {
+      unsigned actual_lane = detail::CudaShuffleGetSrcLane(src_lane, width);
+      unsigned expect_lane =
+          CudaShuffleSync(kCudaWarpAll, lane_id, src_lane, width);
+      check_result("Shuffle", src_lane, actual_lane, expect_lane);
+    }
+    for (unsigned delta = 0; delta <= warpSize; ++delta) {
+      unsigned actual_lane = detail::CudaShuffleUpGetSrcLane(delta, width);
+      unsigned expect_lane =
+          CudaShuffleUpSync(kCudaWarpAll, lane_id, delta, width);
+      check_result("ShuffleUp", delta, actual_lane, expect_lane);
+    }
+    for (unsigned delta = 0; delta <= warpSize; ++delta) {
+      unsigned actual_lane = detail::CudaShuffleDownGetSrcLane(delta, width);
+      unsigned expect_lane =
+          CudaShuffleDownSync(kCudaWarpAll, lane_id, delta, width);
+      check_result("ShuffleDown", delta, actual_lane, expect_lane);
+    }
+    for (int lane_lane = warpSize; lane_lane > 0; lane_lane /= 2) {
+      unsigned actual_lane = detail::CudaShuffleXorGetSrcLane(lane_lane, width);
+      unsigned expect_lane =
+          CudaShuffleXorSync(kCudaWarpAll, lane_id, lane_lane, width);
+      check_result("ShuffleXor", lane_lane, actual_lane, expect_lane);
+    }
+  }
+}
+
 }  // namespace
 
 class CudaLaunchConfigTest : public ::testing::Test {
@@ -94,7 +132,7 @@ class CudaLaunchConfigTest : public ::testing::Test {
   const int bufsize = 1024;
   int* outbuf = nullptr;
   Eigen::CudaStreamDevice stream;
-  GPUDevice d = GPUDevice(&stream);
+  Eigen::GpuDevice d = Eigen::GpuDevice(&stream);
 
   virtual void SetUp() {
     cudaError_t err = cudaMallocManaged(&outbuf, sizeof(int) * bufsize);
@@ -111,27 +149,27 @@ class CudaLaunchConfigTest : public ::testing::Test {
 TEST_F(CudaLaunchConfigTest, GetCudaLaunchConfig) {
   CudaLaunchConfig cfg;
 
-  // test valid inputs
-  #define TEST_LAUNCH_PARAMETER(work_element_count)                             \
-    cfg = GetCudaLaunchConfig(bufsize, d);                                      \
-    SetOutbufZero<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>     \
-                                                                (cfg, outbuf);  \
-    CUDA_ASSERT_SUCCESS                                                         \
-    cfg = GetCudaLaunchConfig(work_element_count, d);                           \
-    Count1D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>> (         \
-        cfg, bufsize, outbuf);                                                  \
-    CUDA_EXPECT_SUCCESS                                                         \
-    EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0));\
-                                                                                \
-    cfg = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                 \
-    SetOutbufZero<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>     \
-                                                                (cfg, outbuf);  \
-    CUDA_ASSERT_SUCCESS                                                         \
-    cfg = GetCudaLaunchConfig(work_element_count, d, Count1D, 0, 0);            \
-    Count1D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>> (         \
-        cfg, bufsize, outbuf);                                                  \
-    CUDA_EXPECT_SUCCESS                                                         \
-    EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0))
+// test valid inputs
+#define TEST_LAUNCH_PARAMETER(work_element_count)                              \
+  cfg = GetCudaLaunchConfig(bufsize, d);                                       \
+  SetOutbufZero<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(     \
+      cfg, outbuf);                                                            \
+  CUDA_ASSERT_SUCCESS                                                          \
+  cfg = GetCudaLaunchConfig(work_element_count, d);                            \
+  Count1D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(           \
+      cfg, bufsize, outbuf);                                                   \
+  CUDA_EXPECT_SUCCESS                                                          \
+  EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0)); \
+                                                                               \
+  cfg = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                  \
+  SetOutbufZero<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(     \
+      cfg, outbuf);                                                            \
+  CUDA_ASSERT_SUCCESS                                                          \
+  cfg = GetCudaLaunchConfig(work_element_count, d, Count1D, 0, 0);             \
+  Count1D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(           \
+      cfg, bufsize, outbuf);                                                   \
+  CUDA_EXPECT_SUCCESS                                                          \
+  EXPECT_EQ(work_element_count, std::accumulate(outbuf, outbuf + bufsize, 0))
 
   TEST_LAUNCH_PARAMETER(128);
   TEST_LAUNCH_PARAMETER(129);
@@ -143,7 +181,7 @@ TEST_F(CudaLaunchConfigTest, GetCudaLaunchConfig) {
   TEST_LAUNCH_PARAMETER(8192);
   TEST_LAUNCH_PARAMETER(123456);
   TEST_LAUNCH_PARAMETER(1 << 30);
-  #undef TEST_LAUNCH_PARAMETER
+#undef TEST_LAUNCH_PARAMETER
 }
 
 bool operator==(const Cuda2DLaunchConfig& a, const Cuda2DLaunchConfig& b) {
@@ -162,27 +200,27 @@ TEST_F(CudaLaunchConfigTest, GetCuda2DLaunchConfig) {
   Cuda2DLaunchConfig cfg;
   CudaLaunchConfig cfg1d;
 
-  // test valid inputs
-  #define TEST_LAUNCH_PARAMETER(dimx, dimy)                                     \
-    cfg1d = GetCudaLaunchConfig(bufsize, d);                                    \
-    SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>> \
-                                                                (cfg1d, outbuf);\
-    CUDA_ASSERT_SUCCESS                                                         \
-    cfg = GetCuda2DLaunchConfig(dimx, dimy, d);                                 \
-    Count2D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>> (         \
-        cfg, bufsize, outbuf);                                                  \
-    CUDA_EXPECT_SUCCESS                                                         \
-    EXPECT_EQ(dimx * dimy, std::accumulate(outbuf, outbuf + bufsize, 0));       \
-                                                                                \
-    cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);               \
-    SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>> \
-                                                                (cfg1d, outbuf);\
-    CUDA_ASSERT_SUCCESS                                                         \
-    cfg = GetCuda2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0);                  \
-    Count2D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>> (         \
-        cfg, bufsize, outbuf);                                                  \
-    CUDA_EXPECT_SUCCESS                                                         \
-    EXPECT_EQ(dimx * dimy, std::accumulate(outbuf, outbuf + bufsize, 0))
+// test valid inputs
+#define TEST_LAUNCH_PARAMETER(dimx, dimy)                                      \
+  cfg1d = GetCudaLaunchConfig(bufsize, d);                                     \
+  SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>>( \
+      cfg1d, outbuf);                                                          \
+  CUDA_ASSERT_SUCCESS                                                          \
+  cfg = GetCuda2DLaunchConfig(dimx, dimy, d);                                  \
+  Count2D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(           \
+      cfg, bufsize, outbuf);                                                   \
+  CUDA_EXPECT_SUCCESS                                                          \
+  EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0));         \
+                                                                               \
+  cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                \
+  SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>>( \
+      cfg1d, outbuf);                                                          \
+  CUDA_ASSERT_SUCCESS                                                          \
+  cfg = GetCuda2DLaunchConfig(dimx, dimy, d, Count2D, 0, 0);                   \
+  Count2D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(           \
+      cfg, bufsize, outbuf);                                                   \
+  CUDA_EXPECT_SUCCESS                                                          \
+  EXPECT_EQ(dimx* dimy, std::accumulate(outbuf, outbuf + bufsize, 0))
 
   TEST_LAUNCH_PARAMETER(128, 128);
   TEST_LAUNCH_PARAMETER(129, 64);
@@ -195,24 +233,24 @@ TEST_F(CudaLaunchConfigTest, GetCuda2DLaunchConfig) {
   TEST_LAUNCH_PARAMETER(123456, 12);
   TEST_LAUNCH_PARAMETER(1, 1 << 30);
   TEST_LAUNCH_PARAMETER(1 << 30, 1);
-  #undef TEST_LAUNCH_PARAMETER
+#undef TEST_LAUNCH_PARAMETER
 }
 
 TEST_F(CudaLaunchConfigTest, GetCuda3DLaunchConfig) {
   Cuda3DLaunchConfig cfg;
   CudaLaunchConfig cfg1d;
 
-  // test valid inputs
-  #define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz)                               \
-    cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);               \
-    SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>> \
-                                                                (cfg1d, outbuf);\
-    CUDA_ASSERT_SUCCESS                                                         \
-    cfg = GetCuda3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0);            \
-    Count3D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>> (         \
-        cfg, bufsize, outbuf);                                                  \
-    CUDA_EXPECT_SUCCESS                                                         \
-    EXPECT_EQ(dimx * dimy * dimz, std::accumulate(outbuf, outbuf + bufsize, 0))
+// test valid inputs
+#define TEST_LAUNCH_PARAMETER(dimx, dimy, dimz)                                \
+  cfg1d = GetCudaLaunchConfig(bufsize, d, SetOutbufZero, 0, 0);                \
+  SetOutbufZero<<<cfg1d.block_count, cfg1d.thread_per_block, 0, d.stream()>>>( \
+      cfg1d, outbuf);                                                          \
+  CUDA_ASSERT_SUCCESS                                                          \
+  cfg = GetCuda3DLaunchConfig(dimx, dimy, dimz, d, Count3D, 0, 0);             \
+  Count3D<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(           \
+      cfg, bufsize, outbuf);                                                   \
+  CUDA_EXPECT_SUCCESS                                                          \
+  EXPECT_EQ(dimx* dimy* dimz, std::accumulate(outbuf, outbuf + bufsize, 0))
 
   TEST_LAUNCH_PARAMETER(128, 128, 128);
   TEST_LAUNCH_PARAMETER(129, 64, 1024);
@@ -226,7 +264,17 @@ TEST_F(CudaLaunchConfigTest, GetCuda3DLaunchConfig) {
   TEST_LAUNCH_PARAMETER(1, 1, 1 << 30);
   TEST_LAUNCH_PARAMETER(1, 1 << 30, 1);
   TEST_LAUNCH_PARAMETER(1 << 30, 1, 1);
-  #undef TEST_LAUNCH_PARAMETER
+#undef TEST_LAUNCH_PARAMETER
+}
+
+TEST(CudaDeviceFunctionsTest, ShuffleGetSrcLane) {
+  unsigned* failure_count;
+  ASSERT_EQ(cudaMallocManaged(&failure_count, sizeof(unsigned)), cudaSuccess);
+  *failure_count = 0;
+  CudaShuffleGetSrcLaneTest<<<1, 32>>>(failure_count);
+  ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
+  ASSERT_EQ(*failure_count, 0);
+  cudaFree(failure_count);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/cuda_launch_config.h b/tensorflow/core/util/cuda_launch_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ea33ee6cf2195cc0192c59d694672f0d4c69a56
--- /dev/null
+++ b/tensorflow/core/util/cuda_launch_config.h
@@ -0,0 +1,284 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_CUDA_LAUNCH_CONFIG_H_
+#define TENSORFLOW_CORE_UTIL_CUDA_LAUNCH_CONFIG_H_
+
+#if GOOGLE_CUDA
+
+#include <algorithm>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "cuda/include/cuda.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/platform/types.h"
+
+// Usage of GetCudaLaunchConfig, GetCuda2DLaunchConfig, and
+// GetCuda3DLaunchConfig:
+//
+// There are two versions of GetCudaLaunchConfig and GetCuda2DLaunchConfig, one
+// version uses heuristics without any knowledge of the device kernel, the other
+// version uses cudaOccupancyMaxPotentialBlockSize to determine the theoretical
+// launch parameters that maximize occupancy. Currently, only the maximum
+// occupancy version of GetCuda3DLaunchConfig is available.
+//
+// For large number of work elements, the convention is that each kernel would
+// iterate through its assigned range. The return value of GetCudaLaunchConfig
+// is struct CudaLaunchConfig, which contains all the information needed for the
+// kernel launch, including: virtual number of threads, the number of threads
+// per block and number of threads per block used inside <<< >>> of a kernel
+// launch. GetCuda2DLaunchConfig and GetCuda3DLaunchConfig does the same thing
+// as CudaLaunchConfig. The only difference is the dimension. The macros
+// CUDA_1D_KERNEL_LOOP and CUDA_AXIS_KERNEL_LOOP might be used to do inner loop.
+//
+/* Sample code:
+
+__global__ void MyKernel1D(CudaLaunchConfig config, other_args...) {
+  CUDA_1D_KERNEL_LOOP(x, config.virtual_thread_count) {
+    do_your_job_here;
+  }
+}
+
+__global__ void MyKernel2D(Cuda2DLaunchConfig config, other_args...) {
+  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
+    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
+      do_your_job_here;
+    }
+  }
+}
+
+__global__ void MyKernel3D(Cuda3DLaunchConfig config, other_args...) {
+  CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count, x) {
+    CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count, y) {
+      CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count, z) {
+        do_your_job_here;
+      }
+    }
+  }
+}
+
+void MyDriverFunc(const Eigen::GpuDevice &d) {
+  // use heuristics
+  CudaLaunchConfig cfg1 = GetCudaLaunchConfig(10240, d);
+  MyKernel1D <<<config.block_count,
+                config.thread_per_block, 0, d.stream()>>> (cfg1, other_args...);
+  Cuda2DLaunchConfig cfg2 = GetCuda2DLaunchConfig(10240, 10240, d);
+  MyKernel2D <<<config.block_count,
+                config.thread_per_block, 0, d.stream()>>> (cfg2, other_args...);
+  Cuda3DLaunchConfig cfg3 = GetCuda3DLaunchConfig(4096, 4096, 100, d);
+  MyKernel3D <<<config.block_count,
+                config.thread_per_block, 0, d.stream()>>> (cfg3, other_args...);
+
+  // maximize occupancy
+  CudaLaunchConfig cfg4 = GetCudaLaunchConfig(10240, d, MyKernel1D, 0, 0 );
+  MyKernel1D <<<config.block_count,
+                config.thread_per_block, 0, d.stream()>>> (cfg4, other_args...);
+  Cuda2DLaunchConfig cfg5 = GetCuda2DLaunchConfig(10240, 10240, d,
+                                                  MyKernel1D, 0, 0);
+  MyKernel2D <<<config.block_count,
+                config.thread_per_block, 0, d.stream()>>> (cfg5, other_args...);
+  Cuda3DLaunchConfig cfg6 = GetCuda3DLaunchConfig(4096, 4096, 100, d,
+                                                  MyKernel1D, 0, 0);
+  MyKernel3D <<<config.block_count,
+                config.thread_per_block, 0, d.stream()>>> (cfg6, other_args...);
+}
+
+// See the test for this for more example:
+//
+https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
+
+*/
+
+namespace tensorflow {
+
+inline int DivUp(int a, int b) { return (a + b - 1) / b; }
+
+struct CudaLaunchConfig {
+  // Logical number of thread that works on the elements. If each logical
+  // thread works on exactly a single element, this is the same as the working
+  // element count.
+  int virtual_thread_count = -1;
+  // Number of threads per block.
+  int thread_per_block = -1;
+  // Number of blocks for Cuda kernel launch.
+  int block_count = -1;
+};
+
+// Calculate the Cuda launch config we should use for a kernel launch.
+// This is assuming the kernel is quite simple and will largely be
+// memory-limited.
+// REQUIRES: work_element_count > 0.
+inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
+                                            const Eigen::GpuDevice& d) {
+  CHECK_GT(work_element_count, 0);
+  CudaLaunchConfig config;
+  const int virtual_thread_count = work_element_count;
+  const int physical_thread_count = std::min(
+      d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor(),
+      virtual_thread_count);
+  const int thread_per_block = std::min(1024, d.maxCudaThreadsPerBlock());
+  const int block_count =
+      std::min(DivUp(physical_thread_count, thread_per_block),
+               d.getNumCudaMultiProcessors());
+
+  config.virtual_thread_count = virtual_thread_count;
+  config.thread_per_block = thread_per_block;
+  config.block_count = block_count;
+  return config;
+}
+
+// Calculate the Cuda launch config we should use for a kernel launch. This
+// variant takes the resource limits of func into account to maximize occupancy.
+// REQUIRES: work_element_count > 0.
+template <typename DeviceFunc>
+inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
+                                            const Eigen::GpuDevice& d,
+                                            DeviceFunc func,
+                                            size_t dynamic_shared_memory_size,
+                                            int block_size_limit) {
+  CHECK_GT(work_element_count, 0);
+  CudaLaunchConfig config;
+  int block_count = 0;
+  int thread_per_block = 0;
+
+  cudaError_t err = cudaOccupancyMaxPotentialBlockSize(
+      &block_count, &thread_per_block, func, dynamic_shared_memory_size,
+      block_size_limit);
+  CHECK_EQ(err, cudaSuccess);
+
+  block_count =
+      std::min(block_count, DivUp(work_element_count, thread_per_block));
+
+  config.virtual_thread_count = work_element_count;
+  config.thread_per_block = thread_per_block;
+  config.block_count = block_count;
+  return config;
+}
+
+struct Cuda2DLaunchConfig {
+  dim3 virtual_thread_count = dim3(0, 0, 0);
+  dim3 thread_per_block = dim3(0, 0, 0);
+  dim3 block_count = dim3(0, 0, 0);
+};
+
+inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim,
+                                                const Eigen::GpuDevice& d) {
+  Cuda2DLaunchConfig config;
+
+  if (xdim <= 0 || ydim <= 0) {
+    return config;
+  }
+
+  const int kThreadsPerBlock = 256;
+  int block_cols = std::min(xdim, kThreadsPerBlock);
+  // ok to round down here and just do more loops in the kernel
+  int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
+
+  const int physical_thread_count =
+      d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor();
+
+  const int max_blocks = std::max(physical_thread_count / kThreadsPerBlock, 1);
+
+  config.virtual_thread_count = dim3(xdim, ydim, 1);
+  config.thread_per_block = dim3(block_cols, block_rows, 1);
+
+  int grid_x = std::min(DivUp(xdim, block_cols), max_blocks);
+
+  config.block_count = dim3(
+      grid_x, std::min(max_blocks / grid_x, std::max(ydim / block_rows, 1)), 1);
+  return config;
+}
+
+// Calculate the Cuda 2D and 3D launch config we should use for a kernel launch.
+// This variant takes the resource limits of func into account to maximize
+// occupancy.
+using Cuda3DLaunchConfig = Cuda2DLaunchConfig;
+
+template <typename DeviceFunc>
+inline Cuda3DLaunchConfig GetCuda3DLaunchConfig(
+    int xdim, int ydim, int zdim, const Eigen::GpuDevice& d, DeviceFunc func,
+    size_t dynamic_shared_memory_size, int block_size_limit) {
+  Cuda3DLaunchConfig config;
+
+  if (xdim <= 0 || ydim <= 0 || zdim <= 0) {
+    return config;
+  }
+
+  int dev;
+  cudaGetDevice(&dev);
+  cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp, dev);
+  int xthreadlimit = deviceProp.maxThreadsDim[0];
+  int ythreadlimit = deviceProp.maxThreadsDim[1];
+  int zthreadlimit = deviceProp.maxThreadsDim[2];
+  int xgridlimit = deviceProp.maxGridSize[0];
+  int ygridlimit = deviceProp.maxGridSize[1];
+  int zgridlimit = deviceProp.maxGridSize[2];
+
+  int block_count = 0;
+  int thread_per_block = 0;
+  cudaError_t err = cudaOccupancyMaxPotentialBlockSize(
+      &block_count, &thread_per_block, func, dynamic_shared_memory_size,
+      block_size_limit);
+  CHECK_EQ(err, cudaSuccess);
+
+  auto min3 = [](int a, int b, int c) { return std::min(a, std::min(b, c)); };
+
+  int threadsx = min3(xdim, thread_per_block, xthreadlimit);
+  int threadsy =
+      min3(ydim, std::max(thread_per_block / threadsx, 1), ythreadlimit);
+  int threadsz =
+      min3(zdim, std::max(thread_per_block / (threadsx * threadsy), 1),
+           zthreadlimit);
+
+  int blocksx = min3(block_count, DivUp(xdim, threadsx), xgridlimit);
+  int blocksy =
+      min3(DivUp(block_count, blocksx), DivUp(ydim, threadsy), ygridlimit);
+  int blocksz = min3(DivUp(block_count, (blocksx * blocksy)),
+                     DivUp(zdim, threadsz), zgridlimit);
+
+  config.virtual_thread_count = dim3(xdim, ydim, zdim);
+  config.thread_per_block = dim3(threadsx, threadsy, threadsz);
+  config.block_count = dim3(blocksx, blocksy, blocksz);
+  return config;
+}
+
+template <typename DeviceFunc>
+inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(
+    int xdim, int ydim, const Eigen::GpuDevice& d, DeviceFunc func,
+    size_t dynamic_shared_memory_size, int block_size_limit) {
+  return GetCuda3DLaunchConfig(xdim, ydim, 1, d, func,
+                               dynamic_shared_memory_size, block_size_limit);
+}
+
+// Returns a raw reference to the current cuda stream.  Required by a
+// number of kernel calls (for which StreamInterface* does not work), i.e.
+// CUB and certain cublas primitives.
+inline const cudaStream_t& GetCudaStream(OpKernelContext* context) {
+  const cudaStream_t* ptr = CHECK_NOTNULL(
+      reinterpret_cast<const cudaStream_t*>(context->op_device_context()
+                                                ->stream()
+                                                ->implementation()
+                                                ->CudaStreamMemberHack()));
+  return *ptr;
+}
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_UTIL_CUDA_KERNEL_HELPER_H_
diff --git a/tensorflow/core/util/equal_graph_def.cc b/tensorflow/core/util/equal_graph_def.cc
index a3b7db98cc00eff703bbce95cb3fae7e83be35b5..f1ec497a6772c84d599a76169515ef417c11f430 100644
--- a/tensorflow/core/util/equal_graph_def.cc
+++ b/tensorflow/core/util/equal_graph_def.cc
@@ -148,7 +148,10 @@ bool EqualNodeDef(const NodeDef& actual, const NodeDef& expected, string* diff,
       first_control_input = i;
       break;
     }
-    if (actual.input(i) != expected.input(i)) {
+    // Special case for inputs: "tensor" is equivalent to "tensor:0"
+    if (actual.input(i) != expected.input(i) &&
+        actual.input(i) != strings::StrCat(expected.input(i), ":0") &&
+        strings::StrCat(actual.input(i), ":0") != expected.input(i)) {
       if (diff != nullptr) {
         *diff = strings::StrCat("Node named '", actual.name(), "' has input ",
                                 i, " '", actual.input(i),
diff --git a/tensorflow/core/util/event.proto b/tensorflow/core/util/event.proto
index 5c3799c13228142fcd8b81e3db85332f6e618d4f..65d2c5a09c5c98a70e834e182d5751350506a1a1 100644
--- a/tensorflow/core/util/event.proto
+++ b/tensorflow/core/util/event.proto
@@ -80,3 +80,8 @@ message TaggedRunMetadata {
   // deserialization.
   bytes run_metadata = 2;
 }
+
+// For communicating live events back to a coordinator
+message SessionStatus {
+  repeated Event event = 1;
+}
diff --git a/tensorflow/core/util/events_writer.cc b/tensorflow/core/util/events_writer.cc
index 23b00e23dd0e7054aaf0e4e442c60f1372ce2d5b..49507616ed8c6461f8d59d8899d93abb4ba58cd2 100644
--- a/tensorflow/core/util/events_writer.cc
+++ b/tensorflow/core/util/events_writer.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <stddef.h>  // for NULL
 
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -35,10 +36,21 @@ EventsWriter::EventsWriter(const string& file_prefix)
       file_prefix_(file_prefix),
       num_outstanding_events_(0) {}
 
-bool EventsWriter::InitIfNeeded() {
+EventsWriter::~EventsWriter() {
+  Close().IgnoreError();  // Autoclose in destructor.
+}
+
+Status EventsWriter::Init() { return InitWithSuffix(""); }
+
+Status EventsWriter::InitWithSuffix(const string& suffix) {
+  file_suffix_ = suffix;
+  return InitIfNeeded();
+}
+
+Status EventsWriter::InitIfNeeded() {
   if (recordio_writer_ != nullptr) {
     CHECK(!filename_.empty());
-    if (FileHasDisappeared()) {
+    if (!FileStillExists().ok()) {
       // Warn user of data loss and let .reset() below do basic cleanup.
       if (num_outstanding_events_ > 0) {
         LOG(WARNING) << "Re-initialization, attempting to open a new file, "
@@ -46,7 +58,7 @@ bool EventsWriter::InitIfNeeded() {
       }
     } else {
       // No-op: File is present and writer is initialized.
-      return true;
+      return Status::OK();
     }
   }
 
@@ -57,15 +69,12 @@ bool EventsWriter::InitIfNeeded() {
                       static_cast<int64>(time_in_seconds),
                       port::Hostname().c_str(), file_suffix_.c_str());
 
-  Status s = env_->NewWritableFile(filename_, &recordio_file_);
-  if (!s.ok()) {
-    LOG(ERROR) << "Could not open events file: " << filename_ << ": " << s;
-    return false;
-  }
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(
+      env_->NewWritableFile(filename_, &recordio_file_),
+      "Creating writable file ", filename_);
   recordio_writer_.reset(new io::RecordWriter(recordio_file_.get()));
   if (recordio_writer_ == nullptr) {
-    LOG(ERROR) << "Could not create record writer";
-    return false;
+    return errors::Unknown("Could not create record writer");
   }
   num_outstanding_events_ = 0;
   VLOG(1) << "Successfully opened events file: " << filename_;
@@ -77,21 +86,21 @@ bool EventsWriter::InitIfNeeded() {
     event.set_wall_time(time_in_seconds);
     event.set_file_version(strings::StrCat(kVersionPrefix, kCurrentVersion));
     WriteEvent(event);
-    Flush();
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(Flush(), "Flushing first event.");
   }
-  return true;
+  return Status::OK();
 }
 
 string EventsWriter::FileName() {
   if (filename_.empty()) {
-    InitIfNeeded();
+    InitIfNeeded().IgnoreError();
   }
   return filename_;
 }
 
 void EventsWriter::WriteSerializedEvent(StringPiece event_str) {
   if (recordio_writer_ == nullptr) {
-    if (!InitIfNeeded()) {
+    if (!InitIfNeeded().ok()) {
       LOG(ERROR) << "Write failed because file could not be opened.";
       return;
     }
@@ -108,60 +117,51 @@ void EventsWriter::WriteEvent(const Event& event) {
   WriteSerializedEvent(record);
 }
 
-bool EventsWriter::Flush() {
-  if (num_outstanding_events_ == 0) return true;
+Status EventsWriter::Flush() {
+  if (num_outstanding_events_ == 0) return Status::OK();
   CHECK(recordio_file_ != nullptr) << "Unexpected NULL file";
 
-  if (!recordio_writer_->Flush().ok()) {
-    LOG(ERROR) << "Failed to flush " << num_outstanding_events_ << " events to "
-               << filename_;
-    return false;
-  }
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(recordio_writer_->Flush(), "Failed to flush ",
+                                  num_outstanding_events_, " to ", filename_);
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(recordio_file_->Sync(), "Failed to sync ",
+                                  num_outstanding_events_, " to ", filename_);
 
-  // The FileHasDisappeared() condition is necessary because
-  // recordio_writer_->Sync() can return true even if the underlying
+  // The FileStillExists() condition is necessary because
+  // recordio_writer_->Sync() can return OK even if the underlying
   // file has been deleted.  EventWriter.FileDeletionBeforeWriting
   // demonstrates this and will fail if the FileHasDisappeared()
   // condition is removed.
   // Also, we deliberately attempt to Sync() before checking for a
   // disappearing file, in case for some file system File::Exists() is
   // false after File::Open() but before File::Sync().
-  if (!recordio_file_->Flush().ok() || !recordio_file_->Sync().ok() ||
-      FileHasDisappeared()) {
-    LOG(ERROR) << "Failed to flush " << num_outstanding_events_ << " events to "
-               << filename_;
-    return false;
-  }
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(FileStillExists(), "Failed to flush ",
+                                  num_outstanding_events_, " to ", filename_);
   VLOG(1) << "Wrote " << num_outstanding_events_ << " events to disk.";
   num_outstanding_events_ = 0;
-  return true;
+  return Status::OK();
 }
 
-bool EventsWriter::Close() {
-  bool return_value = Flush();
+Status EventsWriter::Close() {
+  Status status = Flush();
   if (recordio_file_ != nullptr) {
-    Status s = recordio_file_->Close();
-    if (!s.ok()) {
-      LOG(ERROR) << "Error when closing previous event file: " << filename_
-                 << ": " << s;
-      return_value = false;
+    Status close_status = recordio_file_->Close();
+    if (!close_status.ok()) {
+      status = close_status;
     }
     recordio_writer_.reset(nullptr);
     recordio_file_.reset(nullptr);
   }
   num_outstanding_events_ = 0;
-  return return_value;
+  return status;
 }
 
-bool EventsWriter::FileHasDisappeared() {
+Status EventsWriter::FileStillExists() {
   if (env_->FileExists(filename_).ok()) {
-    return false;
-  } else {
-    // This can happen even with non-null recordio_writer_ if some other
-    // process has removed the file.
-    LOG(ERROR) << "The events file " << filename_ << " has disappeared.";
-    return true;
+    return Status::OK();
   }
+  // This can happen even with non-null recordio_writer_ if some other
+  // process has removed the file.
+  return errors::Unknown("The events file ", filename_, " has disappeared.");
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/events_writer.h b/tensorflow/core/util/events_writer.h
index a1a8cf790d4e2735d705cc2050c14970e5bfab4a..5dbaf97af4ad145cb09009b44d6f93d1c270d17d 100644
--- a/tensorflow/core/util/events_writer.h
+++ b/tensorflow/core/util/events_writer.h
@@ -18,6 +18,8 @@ limitations under the License.
 
 #include <memory>
 #include <string>
+
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/io/record_writer.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/macros.h"
@@ -43,7 +45,7 @@ class EventsWriter {
   // Note that it is not recommended to simultaneously have two
   // EventWriters writing to the same file_prefix.
   explicit EventsWriter(const string& file_prefix);
-  ~EventsWriter() { Close(); }  // Autoclose in destructor.
+  ~EventsWriter();
 
   // Sets the event file filename and opens file for writing.  If not called by
   // user, will be invoked automatically by a call to FileName() or Write*().
@@ -51,11 +53,8 @@ class EventsWriter {
   // and is open this is a no-op.  If on the other hand the file was opened,
   // but has since disappeared (e.g. deleted by another process), this will open
   // a new file with a new timestamp in its filename.
-  bool Init() { return InitWithSuffix(""); }
-  bool InitWithSuffix(const string& suffix) {
-    file_suffix_ = suffix;
-    return InitIfNeeded();
-  }
+  Status Init();
+  Status InitWithSuffix(const string& suffix);
 
   // Returns the filename for the current events file:
   // filename_ = [file_prefix_].out.events.[timestamp].[hostname][suffix]
@@ -77,12 +76,12 @@ class EventsWriter {
   // be written too.
   //   Close() calls Flush() and then closes the current events file.
   // Returns true only if both the flush and the closure were successful.
-  bool Flush();
-  bool Close();
+  Status Flush();
+  Status Close();
 
  private:
-  bool FileHasDisappeared();  // True if event_file_path_ does not exist.
-  bool InitIfNeeded();
+  Status FileStillExists();  // OK if event_file_path_ exists.
+  Status InitIfNeeded();
 
   Env* env_;
   const string file_prefix_;
diff --git a/tensorflow/core/util/events_writer_test.cc b/tensorflow/core/util/events_writer_test.cc
index a6286ea701f09b94fe18cb373a42b5a83aab893a..a75b26abc631eb782ba527f9d15ac25ce9f72b2b 100644
--- a/tensorflow/core/util/events_writer_test.cc
+++ b/tensorflow/core/util/events_writer_test.cc
@@ -112,7 +112,7 @@ TEST(EventWriter, WriteFlush) {
   string file_prefix = GetDirName("/writeflush_test");
   EventsWriter writer(file_prefix);
   WriteFile(&writer);
-  EXPECT_TRUE(writer.Flush());
+  TF_EXPECT_OK(writer.Flush());
   string filename = writer.FileName();
   VerifyFile(filename);
 }
@@ -121,7 +121,7 @@ TEST(EventWriter, WriteClose) {
   string file_prefix = GetDirName("/writeclose_test");
   EventsWriter writer(file_prefix);
   WriteFile(&writer);
-  EXPECT_TRUE(writer.Close());
+  TF_EXPECT_OK(writer.Close());
   string filename = writer.FileName();
   VerifyFile(filename);
 }
@@ -143,7 +143,7 @@ TEST(EventWriter, FailFlush) {
   TF_EXPECT_OK(env()->FileExists(filename));
   TF_ASSERT_OK(env()->DeleteFile(filename));
   EXPECT_EQ(errors::Code::NOT_FOUND, env()->FileExists(filename).code());
-  EXPECT_FALSE(writer.Flush());
+  EXPECT_FALSE(writer.Flush().ok());
   EXPECT_EQ(errors::Code::NOT_FOUND, env()->FileExists(filename).code());
 }
 
@@ -155,18 +155,18 @@ TEST(EventWriter, FailClose) {
   TF_EXPECT_OK(env()->FileExists(filename));
   TF_ASSERT_OK(env()->DeleteFile(filename));
   EXPECT_EQ(errors::Code::NOT_FOUND, env()->FileExists(filename).code());
-  EXPECT_FALSE(writer.Close());
+  EXPECT_FALSE(writer.Close().ok());
   EXPECT_EQ(errors::Code::NOT_FOUND, env()->FileExists(filename).code());
 }
 
 TEST(EventWriter, InitWriteClose) {
   string file_prefix = GetDirName("/initwriteclose_test");
   EventsWriter writer(file_prefix);
-  EXPECT_TRUE(writer.Init());
+  TF_EXPECT_OK(writer.Init());
   string filename0 = writer.FileName();
   TF_EXPECT_OK(env()->FileExists(filename0));
   WriteFile(&writer);
-  EXPECT_TRUE(writer.Close());
+  TF_EXPECT_OK(writer.Close());
   string filename1 = writer.FileName();
   EXPECT_EQ(filename0, filename1);
   VerifyFile(filename1);
@@ -178,7 +178,7 @@ TEST(EventWriter, NameWriteClose) {
   string filename = writer.FileName();
   TF_EXPECT_OK(env()->FileExists(filename));
   WriteFile(&writer);
-  EXPECT_TRUE(writer.Close());
+  TF_EXPECT_OK(writer.Close());
   VerifyFile(filename);
 }
 
@@ -186,7 +186,7 @@ TEST(EventWriter, NameClose) {
   string file_prefix = GetDirName("/nameclose_test");
   EventsWriter writer(file_prefix);
   string filename = writer.FileName();
-  EXPECT_TRUE(writer.Close());
+  TF_EXPECT_OK(writer.Close());
   TF_EXPECT_OK(env()->FileExists(filename));
   TF_ASSERT_OK(env()->DeleteFile(filename));
 }
@@ -199,9 +199,9 @@ TEST(EventWriter, FileDeletionBeforeWriting) {
   env()->SleepForMicroseconds(
       2000000);  // To make sure timestamp part of filename will differ.
   TF_ASSERT_OK(env()->DeleteFile(filename0));
-  EXPECT_TRUE(writer.Init());  // Init should reopen file.
+  TF_EXPECT_OK(writer.Init());  // Init should reopen file.
   WriteFile(&writer);
-  EXPECT_TRUE(writer.Flush());
+  TF_EXPECT_OK(writer.Flush());
   string filename1 = writer.FileName();
   EXPECT_NE(filename0, filename1);
   VerifyFile(filename1);
diff --git a/tensorflow/core/util/example_proto_fast_parsing.cc b/tensorflow/core/util/example_proto_fast_parsing.cc
index b9cf97195be2ed9ddf526842b3f2c3b59f4cb5b6..7946fa1782ab3ebb225adfc2a139f5a755ddbe8b 100644
--- a/tensorflow/core/util/example_proto_fast_parsing.cc
+++ b/tensorflow/core/util/example_proto_fast_parsing.cc
@@ -94,9 +94,29 @@ class Feature {
     return Status::OK();
   }
 
+  bool GetNumElementsInBytesList(int* num_elements) {
+    protobuf::io::CodedInputStream stream(
+        reinterpret_cast<const uint8*>(serialized_.data()), serialized_.size());
+    EnableAliasing(&stream);
+    uint32 length = 0;
+    if (!stream.ReadVarint32(&length)) return false;
+    auto limit = stream.PushLimit(length);
+    *num_elements = 0;
+    while (!stream.ExpectAtEnd()) {
+      if (!stream.ExpectTag(kDelimitedTag(1))) return false;
+      uint32 bytes_length = 0;
+      if (!stream.ReadVarint32(&bytes_length)) return false;
+      if (!stream.Skip(bytes_length)) return false;
+      ++*num_elements;
+    }
+    stream.PopLimit(limit);
+    return true;
+  }
+
   template <typename Result>
   bool ParseBytesList(Result* bytes_list) {
     DCHECK(bytes_list != nullptr);
+
     protobuf::io::CodedInputStream stream(
         reinterpret_cast<const uint8*>(serialized_.data()), serialized_.size());
 
@@ -447,6 +467,28 @@ class LimitedArraySlice {
   T* end_;
 };
 
+void LogDenseFeatureDataLoss(StringPiece feature_name) {
+  LOG(WARNING) << "Data loss! Feature '" << feature_name
+               << "' is present in multiple concatenated "
+                  "tf.Examples. Ignoring all but last one.";
+  static auto* duplicated_dense_feature = monitoring::Counter<0>::New(
+      "/tensorflow/core/util/example_proto_fast_parsing/"
+      "duplicated_dense_feature",
+      "Dense feature appears twice in a tf.Example");
+  duplicated_dense_feature->GetCell()->IncrementBy(1);
+}
+
+void LogSparseFeatureDataLoss(StringPiece feature_name) {
+  LOG(WARNING) << "Data loss! Feature '" << feature_name
+               << "' is present in multiple concatenated "
+                  "tf.Examples. Ignoring all but last one.";
+  static auto* duplicated_sparse_feature = monitoring::Counter<0>::New(
+      "/tensorflow/core/util/example_proto_fast_parsing/"
+      "duplicated_sparse_feature",
+      "Sparse feature appears twice in a tf.Example");
+  duplicated_sparse_feature->GetCell()->IncrementBy(1);
+}
+
 Status FastParseSerializedExample(
     const string& serialized_example, const string& example_name,
     const size_t example_index, const Config& config,
@@ -510,14 +552,7 @@ Status FastParseSerializedExample(
       // If feature was already visited, skip.
       // Compare comment at the beginning of the loop.
       if (dense_feature_last_example[d] == example_index) {
-        LOG(WARNING) << "Data loss! Feature '" << feature_name
-                     << "' in present in multiple concatenated "
-                        "tf.Examples. Ignoring all but last one.";
-        static auto* duplicated_dense_feature = monitoring::Counter<0>::New(
-            "/tensorflow/core/util/example_proto_fast_parsing/"
-            "duplicated_dense_feature",
-            "Dense feature appears twice in a tf.Example");
-        duplicated_dense_feature->GetCell()->IncrementBy(1);
+        LogDenseFeatureDataLoss(feature_name);
         continue;
       }
       dense_feature_last_example[d] = example_index;
@@ -639,14 +674,7 @@ Status FastParseSerializedExample(
       // If feature was already visited, skip.
       // Compare comment at the beginning of the loop.
       if (sparse_feature_last_example[d] == example_index) {
-        LOG(WARNING) << "Data loss! Feature '" << feature_name
-                     << "' in present in multiple concatenated "
-                        "tf.Examples. Ignoring all but last one.";
-        static auto* duplicated_sparse_feature = monitoring::Counter<0>::New(
-            "/tensorflow/core/util/example_proto_fast_parsing/"
-            "duplicated_sparse_feature",
-            "sparse feature appears twice in a tf.Example");
-        duplicated_sparse_feature->GetCell()->IncrementBy(1);
+        LogSparseFeatureDataLoss(feature_name);
         continue;
       }
       sparse_feature_last_example[d] = example_index;
@@ -1099,5 +1127,333 @@ Status FastParseExample(const Config& config,
   return Status::OK();
 }
 
+Status FastParseSingleExample(const Config& config, const string& serialized,
+                              Result* result) {
+  DCHECK(result != nullptr);
+  // Check config so we can safely CHECK(false) in switches on config.*.dtype
+  for (auto& c : config.sparse) {
+    TF_RETURN_IF_ERROR(CheckConfigDataType(c.dtype));
+  }
+  for (auto& c : config.dense) {
+    TF_RETURN_IF_ERROR(CheckConfigDataType(c.dtype));
+  }
+
+  // TODO(mrry): Cache the construction of this map at Op construction time.
+  size_t config_size = config.dense.size() + config.sparse.size();
+  SeededHasher hasher;
+  // Build config index.
+  PresizedCuckooMap<std::pair<size_t, Type>> config_index(config_size);
+  bool ok = true;
+  for (size_t i = 0; i < 1000; ++i) {
+    for (size_t d = 0; d < config.dense.size(); ++d) {
+      ok &= config_index.InsertUnique(hasher(config.dense[d].feature_name),
+                                      {d, Type::Dense});
+    }
+    for (size_t d = 0; d < config.sparse.size(); ++d) {
+      ok &= config_index.InsertUnique(hasher(config.sparse[d].feature_name),
+                                      {d, Type::Sparse});
+    }
+    if (ok) break;
+    LOG(WARNING) << "Collision found. This should happen only if you have "
+                    "around 2^32 entries in your config.";
+    hasher.seed++;
+    config_index.Clear(config_size);
+  }
+  if (!ok) {
+    return errors::Internal(
+        "Could not avoid collision. This should not happen.");
+  }
+
+  // Allocate dense output tensors.
+  for (size_t d = 0; d < config.dense.size(); ++d) {
+    if (!config.dense[d].variable_length) {
+      TensorShape values_shape;
+      if (!config.dense[d].shape.AsTensorShape(&values_shape)) {
+        return errors::Internal(
+            "Fixed-length shape was not a statically defined shape.");
+      }
+      result->dense_values.emplace_back(config.dense[d].dtype, values_shape);
+    } else {
+      // Variable-length tensor will be allocated later.
+      result->dense_values.emplace_back();
+    }
+  }
+
+  // Allocate sparse output tensors.
+  for (size_t d = 0; d < config.sparse.size(); ++d) {
+    // The dense_shape is always a vector of length 1.
+    result->sparse_shapes.emplace_back(DT_INT64, TensorShape({1}));
+    // Variable-length tensors will be allocated later.
+    result->sparse_indices.emplace_back();
+    result->sparse_values.emplace_back();
+  }
+
+  parsed::Example parsed_example;
+  if (!ParseExample(serialized, &parsed_example)) {
+    return errors::InvalidArgument("Could not parse example input, value: '",
+                                   serialized, "'");
+  }
+  std::vector<bool> sparse_feature_already_seen(config.sparse.size(), false);
+  std::vector<bool> dense_feature_already_seen(config.dense.size(), false);
+
+  // Handle features present in the example.
+  const size_t parsed_example_size = parsed_example.size();
+  for (size_t i = 0; i < parsed_example_size; ++i) {
+    // This is a logic that standard protobuf parsing is implementing.
+    // I.e. last entry in the map overwrites all the previous ones.
+    parsed::FeatureMapEntry& name_and_feature =
+        parsed_example[parsed_example_size - i - 1];
+
+    const StringPiece feature_name = name_and_feature.first;
+    parsed::Feature& feature = name_and_feature.second;
+
+    std::pair<size_t, Type> d_and_type;
+    uint64 h = hasher(feature_name);
+    if (!config_index.Find(h, &d_and_type)) continue;
+
+    size_t d = d_and_type.first;
+    bool is_dense = d_and_type.second == Type::Dense;
+
+    {
+      // Testing for PresizedCuckooMap collision.
+      // TODO(lew): Use dense_hash_map and avoid this and hasher creation.
+      const string& config_feature_name = is_dense
+                                              ? config.dense[d].feature_name
+                                              : config.sparse[d].feature_name;
+      if (feature_name != config_feature_name) continue;
+    }
+
+    auto example_error = [feature_name](StringPiece suffix) {
+      return errors::InvalidArgument("Key: ", feature_name, ".  ", suffix);
+    };
+
+    auto parse_error = [feature_name] {
+      return errors::InvalidArgument("Key: ", feature_name,
+                                     ".  Can't parse serialized Example.");
+    };
+
+    DataType example_dtype;
+    TF_RETURN_IF_ERROR(feature.ParseDataType(&example_dtype));
+    if (example_dtype == DT_INVALID) continue;
+
+    if (is_dense && !config.dense[d].variable_length) {
+      // If feature was already visited, skip.
+      // Compare comment at the beginning of the loop.
+      if (dense_feature_already_seen[d]) {
+        LogDenseFeatureDataLoss(feature_name);
+        continue;
+      }
+      dense_feature_already_seen[d] = true;
+
+      if (example_dtype != config.dense[d].dtype) {
+        return example_error(strings::StrCat(
+            "Data types don't match. Data type: ",
+            DataTypeString(example_dtype),
+            " but expected type: ", DataTypeString(config.dense[d].dtype)));
+      }
+
+      Tensor* out = &result->dense_values[d];
+      const std::size_t num_elements = config.dense[d].elements_per_stride;
+
+      switch (example_dtype) {
+        case DT_INT64: {
+          auto out_p = out->flat<int64>().data();
+          LimitedArraySlice<int64> slice(out_p, num_elements);
+          if (!feature.ParseInt64List(&slice)) return parse_error();
+          if (slice.EndDistance() != 0) {
+            return parse_error();
+          }
+          break;
+        }
+        case DT_FLOAT: {
+          auto out_p = out->flat<float>().data();
+          LimitedArraySlice<float> slice(out_p, num_elements);
+          if (!feature.ParseFloatList(&slice)) return parse_error();
+          if (slice.EndDistance() != 0) {
+            return parse_error();
+          }
+          break;
+        }
+        case DT_STRING: {
+          auto out_p = out->flat<string>().data();
+          LimitedArraySlice<string> slice(out_p, num_elements);
+          if (!feature.ParseBytesList(&slice)) return parse_error();
+          if (slice.EndDistance() != 0) {
+            return parse_error();
+          }
+          break;
+        }
+        default:
+          LOG(FATAL) << "Should not happen.";
+      }
+
+    } else {  // if variable length
+      SparseBuffer out_temp;
+      const size_t num_elements_divisor =
+          is_dense ? config.dense[d].elements_per_stride : 1;
+      size_t num_elements;
+
+      if (is_dense) {
+        // If feature was already visited, skip.
+        // Compare comment at the beginning of the loop.
+        if (dense_feature_already_seen[d]) {
+          LogDenseFeatureDataLoss(feature_name);
+          continue;
+        }
+        dense_feature_already_seen[d] = true;
+        if (example_dtype != config.dense[d].dtype) {
+          return example_error(strings::StrCat(
+              "Data types don't match. Data type: ",
+              DataTypeString(example_dtype),
+              " but expected type: ", DataTypeString(config.dense[d].dtype)));
+        }
+      } else {
+        // If feature was already visited, skip.
+        // Compare comment at the beginning of the loop.
+        if (sparse_feature_already_seen[d]) {
+          LogSparseFeatureDataLoss(feature_name);
+          continue;
+        }
+        sparse_feature_already_seen[d] = true;
+
+        // Handle sparse features.
+        if (example_dtype != DT_INVALID &&
+            example_dtype != config.sparse[d].dtype) {
+          return example_error(strings::StrCat(
+              "Data types don't match. ",
+              "Expected type: ", DataTypeString(config.sparse[d].dtype),
+              ", Actual type: ", DataTypeString(example_dtype)));
+        }
+      }
+
+      switch (example_dtype) {
+        case DT_INT64: {
+          // TODO(mrry): Use the fact that the `int64_list` is packed to read
+          // out the length and pre-allocate the output tensor.
+          if (!feature.ParseInt64List(&out_temp.int64_list))
+            return parse_error();
+          num_elements = out_temp.int64_list.size();
+          break;
+        }
+        case DT_FLOAT: {
+          // TODO(mrry): Use the fact that the `float_list` is packed to read
+          // out the length and pre-allocate the output tensor.
+          if (!feature.ParseFloatList(&out_temp.float_list))
+            return parse_error();
+          num_elements = out_temp.float_list.size();
+          break;
+        }
+        case DT_STRING: {
+          int actual_num_elements = 0;
+          if (!feature.GetNumElementsInBytesList(&actual_num_elements)) {
+            return parse_error();
+          }
+          out_temp.bytes_list.reserve(actual_num_elements);
+          if (!feature.ParseBytesList(&out_temp.bytes_list))
+            return parse_error();
+          num_elements = out_temp.bytes_list.size();
+          break;
+        }
+        default:
+          LOG(FATAL) << "Should not happen. " << DataTypeString(example_dtype);
+      }
+
+      if (num_elements % num_elements_divisor != 0) {
+        return parse_error();
+      }
+
+      Tensor* out;
+      if (is_dense) {
+        TensorShape values_shape;
+        values_shape.AddDim(num_elements / num_elements_divisor);
+        for (int i = 1; i < config.dense[d].shape.dims(); ++i) {
+          values_shape.AddDim(config.dense[d].shape.dim_size(i));
+        }
+
+        out = &result->dense_values[d];
+        *out = Tensor(config.dense[d].dtype, values_shape);
+
+      } else {
+        Tensor* out_indices = &result->sparse_indices[d];
+        Tensor* out_dense_shape = &result->sparse_shapes[d];
+        out = &result->sparse_values[d];
+
+        // TODO(mrry): Investigate the possibility of not materializing
+        // the indices (and perhaps dense_shape) until they are needed.
+        *out_indices = Tensor(
+            DT_INT64, TensorShape({static_cast<int64>(num_elements), 1}));
+        auto indices_flat = out_indices->flat<int64>();
+        for (size_t i = 0; i < num_elements; ++i) {
+          indices_flat(i) = static_cast<int64>(i);
+        }
+
+        *out_dense_shape = Tensor(DT_INT64, TensorShape({1}));
+        auto shapes_shape_t = out_dense_shape->vec<int64>();
+        shapes_shape_t(0) = num_elements;
+
+        *out = Tensor(config.sparse[d].dtype,
+                      TensorShape({static_cast<int64>(num_elements)}));
+      }
+
+      switch (example_dtype) {
+        case DT_INT64: {
+          CopyOrMoveBlock(out_temp.int64_list.begin(),
+                          out_temp.int64_list.end(), out->flat<int64>().data());
+          break;
+        }
+        case DT_FLOAT: {
+          CopyOrMoveBlock(out_temp.float_list.begin(),
+                          out_temp.float_list.end(), out->flat<float>().data());
+          break;
+        }
+        case DT_STRING: {
+          CopyOrMoveBlock(out_temp.bytes_list.begin(),
+                          out_temp.bytes_list.end(),
+                          out->flat<string>().data());
+          break;
+        }
+        default:
+          LOG(FATAL) << "Should not happen.";
+      }
+    }
+  }
+
+  // Handle missing dense features.
+  for (size_t d = 0; d < config.dense.size(); ++d) {
+    if (!dense_feature_already_seen[d]) {
+      if (!config.dense[d].variable_length) {
+        // Handle missing fixed-length dense feature.
+        if (config.dense[d].default_value.NumElements() == 0) {
+          return errors::InvalidArgument(
+              "Feature: ", config.dense[d].feature_name,
+              " (data type: ", DataTypeString(config.dense[d].dtype), ")",
+              " is required but could not be found.");
+        }
+        result->dense_values[d] = config.dense[d].default_value;
+      } else {
+        // Handle missing varlen dense feature.
+        TensorShape empty_shape;
+        empty_shape.AddDim(0);
+        for (int i = 1; i < config.dense[d].shape.dims(); ++i) {
+          empty_shape.AddDim(config.dense[d].shape.dim_size(i));
+        }
+        result->dense_values[d] = Tensor(config.dense[d].dtype, empty_shape);
+      }
+    }
+  }
+
+  // Handle missing sparse features.
+  for (size_t d = 0; d < config.sparse.size(); ++d) {
+    if (!sparse_feature_already_seen[d]) {
+      result->sparse_indices[d] = Tensor(DT_INT64, TensorShape({0, 1}));
+      result->sparse_values[d] =
+          Tensor(config.sparse[d].dtype, TensorShape({0}));
+      result->sparse_shapes[d].vec<int64>()(0) = 0;
+    }
+  }
+
+  return Status::OK();
+}
+
 }  // namespace example
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/example_proto_fast_parsing.h b/tensorflow/core/util/example_proto_fast_parsing.h
index 20536cee163ba926a16f78e5014c5abd2958f5f2..1b08f0226735d0efe6ab9e8a17453311aa032ab0 100644
--- a/tensorflow/core/util/example_proto_fast_parsing.h
+++ b/tensorflow/core/util/example_proto_fast_parsing.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_
+#ifndef TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_
+#define TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_
 
 #include <string>
 #include <unordered_map>
@@ -79,6 +79,12 @@ Status FastParseExample(const FastParseExampleConfig& config,
                         gtl::ArraySlice<string> example_names,
                         thread::ThreadPool* thread_pool, Result* result);
 
+// TODO(mrry): Move the hash table construction into the config object.
+typedef FastParseExampleConfig FastParseSingleExampleConfig;
+
+Status FastParseSingleExample(const FastParseSingleExampleConfig& config,
+                              const string& serialized, Result* result);
+
 // This function parses serialized Example and populates given example.
 // It uses the same specialized parser as FastParseExample which is efficient.
 // But then constructs Example which is relatively slow.
@@ -88,4 +94,4 @@ bool TestFastParse(const string& serialized, Example* example);
 }  // namespace example
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_
+#endif  // TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_
diff --git a/tensorflow/core/util/example_proto_fast_parsing_test.cc b/tensorflow/core/util/example_proto_fast_parsing_test.cc
index 9b6a8e12511448b72e17a0b20a4418c4a5cd2c7a..13e41c17f7c7df5ad581bd3f6a39051641139258 100644
--- a/tensorflow/core/util/example_proto_fast_parsing_test.cc
+++ b/tensorflow/core/util/example_proto_fast_parsing_test.cc
@@ -57,6 +57,7 @@ void TestCorrectness(const string& serialized) {
   Example example;
   Example fast_example;
   EXPECT_TRUE(example.ParseFromString(serialized));
+  example.DiscardUnknownFields();
   EXPECT_TRUE(TestFastParse(serialized, &fast_example));
   EXPECT_EQ(example.DebugString(), fast_example.DebugString());
   if (example.DebugString() != fast_example.DebugString()) {
diff --git a/tensorflow/core/util/example_proto_helper.cc b/tensorflow/core/util/example_proto_helper.cc
index 4b5bf6311233a57914d624a5b77707d02c5bec37..e156a3bc8f0f01acc543e9b385bd9782870be52a 100644
--- a/tensorflow/core/util/example_proto_helper.cc
+++ b/tensorflow/core/util/example_proto_helper.cc
@@ -247,8 +247,9 @@ Status SingleExampleProtoToTensors(
       bool types_match;
       TF_RETURN_IF_ERROR(CheckTypesMatch(f, dtype, &types_match));
       if (!types_match) {
-        return errors::InvalidArgument("Name: ", example_name, ", Feature: ",
-                                       key, ".  Data types don't match. ",
+        return errors::InvalidArgument("Name: ", example_name,
+                                       ", Feature: ", key,
+                                       ".  Data types don't match. ",
                                        "Expected type: ", DataTypeString(dtype),
                                        "  Feature is: ", ProtoDebugString(f));
       }
@@ -278,8 +279,9 @@ Status SingleExampleProtoToTensors(
       bool types_match;
       TF_RETURN_IF_ERROR(CheckTypesMatch(f, dtype, &types_match));
       if (!types_match) {
-        return errors::InvalidArgument("Name: ", example_name, ", Feature: ",
-                                       key, ".  Data types don't match. ",
+        return errors::InvalidArgument("Name: ", example_name,
+                                       ", Feature: ", key,
+                                       ".  Data types don't match. ",
                                        "Expected type: ", DataTypeString(dtype),
                                        "  Feature is: ", ProtoDebugString(f));
       }
@@ -400,7 +402,7 @@ Status BatchExampleProtoToTensors(
   return Status::OK();
 }
 
-Status ParseSingleExampleAttrs::FinishInit() {
+Status ParseExampleAttrs::FinishInit() {
   if (static_cast<size_t>(num_sparse) != sparse_types.size()) {
     return errors::InvalidArgument("len(sparse_keys) != len(sparse_types)");
   }
@@ -422,6 +424,25 @@ Status ParseSingleExampleAttrs::FinishInit() {
   return Status::OK();
 }
 
+Status ParseSingleExampleAttrs::FinishInit() {
+  if (sparse_keys.size() != sparse_types.size()) {
+    return errors::InvalidArgument("len(sparse_keys) != len(sparse_types)");
+  }
+  if (dense_keys.size() != dense_types.size()) {
+    return errors::InvalidArgument("len(dense_keys) != len(dense_types)");
+  }
+  if (dense_keys.size() != dense_shapes.size()) {
+    return errors::InvalidArgument("len(dense_keys) != len(dense_shapes)");
+  }
+  for (const DataType& type : dense_types) {
+    TF_RETURN_IF_ERROR(CheckValidType(type));
+  }
+  for (const DataType& type : sparse_types) {
+    TF_RETURN_IF_ERROR(CheckValidType(type));
+  }
+  return Status::OK();
+}
+
 Status ParseSingleSequenceExampleAttrs::FinishInit() {
   if (static_cast<size_t>(num_context_sparse) != context_sparse_types.size()) {
     return errors::InvalidArgument(
diff --git a/tensorflow/core/util/example_proto_helper.h b/tensorflow/core/util/example_proto_helper.h
index 7414d61e8bd850863c8e59c1262121e11559fcff..e51170496217d01084ebbc671524ca7829847a41 100644
--- a/tensorflow/core/util/example_proto_helper.h
+++ b/tensorflow/core/util/example_proto_helper.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_HELPER_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_HELPER_H_
+#ifndef TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_HELPER_H_
+#define TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_HELPER_H_
 
 #include <string>
 #include <vector>
@@ -148,9 +148,9 @@ Tensor FeatureSparseCopy(const std::size_t batch, const string& key,
 int64 CopyIntoSparseTensor(const Tensor& in, const int batch,
                            const int64 offset, Tensor* indices, Tensor* values);
 
-// Parses the attributes passed to ParseSingleExample.
+// Parses the attributes passed to ParseExample.
 // REQUIRES: Init must be called after construction.
-class ParseSingleExampleAttrs {
+class ParseExampleAttrs {
  public:
   template <typename ContextType>
   Status Init(ContextType* ctx) {
@@ -205,6 +205,72 @@ class ParseSingleExampleAttrs {
   Status FinishInit();  // for context-independent parts of Init.
 };
 
+// Parses the attributes passed to ParseSingleExample.
+// REQUIRES: Init must be called after construction.
+class ParseSingleExampleAttrs {
+ public:
+  template <typename ContextType>
+  Status Init(ContextType* ctx) {
+    TF_RETURN_IF_ERROR(ctx->GetAttr("sparse_keys", &sparse_keys));
+    TF_RETURN_IF_ERROR(ctx->GetAttr("sparse_types", &sparse_types));
+    TF_RETURN_IF_ERROR(ctx->GetAttr("dense_keys", &dense_keys));
+    TF_RETURN_IF_ERROR(ctx->GetAttr("Tdense", &dense_types));
+    TF_RETURN_IF_ERROR(ctx->GetAttr("dense_shapes", &dense_shapes));
+
+    int num_sparse;
+    TF_RETURN_IF_ERROR(ctx->GetAttr("num_sparse", &num_sparse));
+    if (num_sparse != sparse_keys.size() || num_sparse != sparse_types.size()) {
+      return errors::InvalidArgument(
+          "num_sparse (", num_sparse, ") must match the size of sparse_keys (",
+          sparse_keys.size(), ") and sparse_types (", sparse_types.size(), ")");
+    }
+
+    // Temporary check until we start allowing a variable length outer
+    // dimension.
+    for (int i = 0; i < dense_shapes.size(); ++i) {
+      bool shape_ok = true;
+      if (dense_shapes[i].dims() == -1) {
+        shape_ok = false;
+      } else {
+        for (int d = 1; d < dense_shapes[i].dims(); ++d) {
+          if (dense_shapes[i].dim_size(d) == -1) {
+            shape_ok = false;
+          }
+        }
+      }
+      if (!shape_ok) {
+        return errors::InvalidArgument(
+            "dense_shapes[", i,
+            "] has unknown rank or unknown inner dimensions: ",
+            dense_shapes[i].DebugString());
+      }
+      TensorShape dense_shape;
+      if (dense_shapes[i].dims() > 0 && dense_shapes[i].dim_size(0) == -1) {
+        variable_length.push_back(true);
+        for (int d = 1; d < dense_shapes[i].dims(); ++d) {
+          dense_shape.AddDim(dense_shapes[i].dim_size(d));
+        }
+      } else {
+        variable_length.push_back(false);
+        dense_shapes[i].AsTensorShape(&dense_shape);
+      }
+      elements_per_stride.push_back(dense_shape.num_elements());
+    }
+    return FinishInit();
+  }
+
+  std::vector<string> sparse_keys;
+  std::vector<DataType> sparse_types;
+  std::vector<string> dense_keys;
+  std::vector<DataType> dense_types;
+  std::vector<PartialTensorShape> dense_shapes;
+  std::vector<bool> variable_length;
+  std::vector<std::size_t> elements_per_stride;
+
+ private:
+  Status FinishInit();  // for context-independent parts of Init.
+};
+
 // Parses the attributes passed to ParseSingleSequenceExample.
 // REQUIRES: Init must be called after construction.
 class ParseSingleSequenceExampleAttrs {
@@ -248,4 +314,4 @@ class ParseSingleSequenceExampleAttrs {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_HELPER_H_
+#endif  // TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_HELPER_H_
diff --git a/tensorflow/core/util/matmul_autotune.h b/tensorflow/core/util/matmul_autotune.h
index 53666238836b89db3198adce9620fcbd7c59a12c..5846cae2fc73f822633dd0fa1667ee2f55d487bc 100644
--- a/tensorflow/core/util/matmul_autotune.h
+++ b/tensorflow/core/util/matmul_autotune.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // The utility to check matmul autotune related flags.
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_UTIL_MATMUL_AUTOTUNE_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_UTIL_MATMUL_AUTOTUNE_H_
+#ifndef TENSORFLOW_CORE_UTIL_MATMUL_AUTOTUNE_H_
+#define TENSORFLOW_CORE_UTIL_MATMUL_AUTOTUNE_H_
 
 namespace tensorflow {
 
@@ -25,4 +25,4 @@ bool MatmulDoFP32ComputationFP16Input();
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_UTIL_MATMUL_AUTOTUNE_H_
+#endif  // TENSORFLOW_CORE_UTIL_MATMUL_AUTOTUNE_H_
diff --git a/tensorflow/core/util/memmapped_file_system_test.cc b/tensorflow/core/util/memmapped_file_system_test.cc
index 616eb5dac32188688ac01cf49ff583dc1623d5ad..504d2d353f8f76f77e4efd3e4a6a6edcaa200711 100644
--- a/tensorflow/core/util/memmapped_file_system_test.cc
+++ b/tensorflow/core/util/memmapped_file_system_test.cc
@@ -144,8 +144,8 @@ TEST(MemmappedFileSystemTest, ProxyToDefault) {
   TF_ASSERT_OK(memmapped_env.NewAppendableFile(filename, &writable_file_temp));
   // Making sure to clean up after the test finishes.
   const auto adh = [&memmapped_env, &filename](WritableFile* f) {
-      delete f;
-      TF_CHECK_OK(memmapped_env.DeleteFile(filename));
+    delete f;
+    TF_CHECK_OK(memmapped_env.DeleteFile(filename));
   };
   std::unique_ptr<WritableFile, decltype(adh)> writable_file(
       writable_file_temp.release(), adh);
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 148c7851bd448a0af754644f09a4d1e0511efe44..db4c5c35e365ca4eed48e07cbae3ad83bcb28622 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -35,7 +35,7 @@ limitations under the License.
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
 using mkldnn::engine;
@@ -210,31 +210,32 @@ class MklShape {
     CHECK_EQ(dnnDelete_F32(convert), E_SUCCESS);
   }
 
-// The following methods are used for serializing and de-serializing the
-// contents of the mklshape object.
-// The data is serialized in this order
-// isMklTensor_
-// dimension_
-// sizes_
-// strides_
-// mklLayout_
-// tfLayout_
-// tf_to_mkl_dim_map_
+  // The following methods are used for serializing and de-serializing the
+  // contents of the mklshape object.
+  // The data is serialized in this order
+  // isMklTensor_
+  // dimension_
+  // sizes_
+  // strides_
+  // mklLayout_
+  // tfLayout_
+  // tf_to_mkl_dim_map_
 
 #define SIZE_OF_MKL_DNN_BUF \
   (dnnLayoutSerializationBufferSize_F32())  // Size of buffer needed to
                                             // serialize dnn_layout pointer
 
-// Size of buffer to hold the serialized object, the size is computed as follows
-// sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes_) + sizeof(strides_)
-// + sizeof(mklLayout_ buffer) + sizeof(tfLayout_ buffer)
-// + sizeof(tf_to_mkl_dim_map_)
+  // Size of buffer to hold the serialized object, the size is computed as
+  // follows sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes_) +
+  // sizeof(strides_)
+  // + sizeof(mklLayout_ buffer) + sizeof(tfLayout_ buffer)
+  // + sizeof(tf_to_mkl_dim_map_)
 
 #define SIZE_OF_MKL_SERIAL_DATA(dims) \
   (2 * sizeof(size_t) + 3 * dims * sizeof(size_t) + 2 * SIZE_OF_MKL_DNN_BUF)
 
-// First we need to define some macro for offsets into the serial buffer where
-// different elements of Mklshape is written/read from
+  // First we need to define some macro for offsets into the serial buffer where
+  // different elements of Mklshape is written/read from
 
 #define IS_MKL_TENSOR_OFFSET 0
 // Location from start of buffer where isMklTensor_ is serialized
@@ -324,10 +325,14 @@ class MklShape {
       nullptr;  // TF dimension corresponding to this MKL dimension
 };
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 // Forward decl
 TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format);
+memory::dims CalculateTFStrides(const memory::dims& dims_tf_order);
+memory::desc CreateBlockedMemDescHelper(const memory::dims& dim,
+                                        const memory::dims& strides,
+                                        memory::data_type dtype);
 
 class MklDnnShape {
  private:
@@ -364,6 +369,52 @@ class MklDnnShape {
   ~MklDnnShape() {}
   TF_DISALLOW_COPY_AND_ASSIGN(MklDnnShape);  // Cannot copy
 
+  /// Helper function to compare memory::desc objects for MklDnn.
+  /// May be this should go into MklDnn directly.
+  inline bool CompareMklDnnLayouts(const memory::desc& md1,
+                                   const memory::desc& md2) const {
+    mkldnn_memory_desc_t mdd1 = md1.data;
+    mkldnn_memory_desc_t mdd2 = md2.data;
+    const char* d1 = reinterpret_cast<const char*>(&mdd1);
+    const char* d2 = reinterpret_cast<const char*>(&mdd2);
+
+    size_t md_size = sizeof(mdd1);
+    for (size_t i = 0; i < md_size; i++) {
+      if (*d1++ != *d2++) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /// Equality function for MklDnnShape objects
+  /// @return true if both are equal; false otherwise.
+  inline bool operator==(const MklDnnShape& input_shape) const {
+    if (this->IsMklTensor() != input_shape.IsMklTensor()) {
+      return false;
+    }
+
+    // If input tensors are in Mkl layout, then we check for dimensions and
+    // sizes.
+    if (this->IsMklTensor()) {
+      return this->GetTfShape() == input_shape.GetTfShape() &&
+             CompareMklDnnLayouts(this->GetMklLayout(),
+                                  input_shape.GetMklLayout());
+    }
+
+    return true;
+  }
+
+  /// Equality operator for MklDnnShape and TFShape.
+  /// Returns: true if TF shapes for both are the same, false otherwise
+  inline bool operator==(const TensorShape& input_shape) const {
+    if (!this->IsMklTensor()) {
+      return false;
+    }
+
+    return this->GetTfShape() == input_shape;
+  }
+
   inline const bool IsMklTensor() const { return data_.is_mkl_tensor_; }
   inline void SetMklTensor(bool is_mkl_tensor) {
     data_.is_mkl_tensor_ = is_mkl_tensor;
@@ -405,7 +456,7 @@ class MklDnnShape {
   inline memory::dims GetSizesAsMklDnnDims() const {
     memory::dims retVal;
     if (data_.is_mkl_tensor_) {
-      int dimensions = sizeof(data_.sizes_) / sizeof(data_.sizes_[0]);
+      size_t dimensions = sizeof(data_.sizes_) / sizeof(data_.sizes_[0]);
       for (size_t i = 0; i < dimensions; i++) {
         if (data_.sizes_[i] != INVALID_DIM_SIZE)
           retVal.push_back(data_.sizes_[i]);
@@ -423,12 +474,21 @@ class MklDnnShape {
 
   /// Return TensorShape that describes the Tensorflow shape of the tensor
   /// represented by this MklShape.
-  inline TensorShape GetTfShape() {
+  inline TensorShape GetTfShape() const {
     CHECK_EQ(data_.is_mkl_tensor_, true);
 
     std::vector<int32> shape(data_.dimension_, -1);
-    for (size_t idx = 0; idx < data_.dimension_; ++idx) {
-      shape[idx] = data_.sizes_[TfDimIdx(idx)];
+    if (data_.tf_data_format_ != memory::format::blocked) {
+      for (size_t idx = 0; idx < data_.dimension_; ++idx) {
+        shape[idx] = data_.sizes_[TfDimIdx(idx)];
+      }
+    } else {
+      // If Tensorflow shape is in Blocked format, then we don't have dimension
+      // map for it. So we just create Tensorflow shape from sizes in the
+      // specified order.
+      for (size_t idx = 0; idx < data_.dimension_; ++idx) {
+        shape[idx] = data_.sizes_[idx];
+      }
     }
 
     TensorShape ts;
@@ -444,6 +504,12 @@ class MklDnnShape {
     CHECK_NOTNULL(pd);
     data_.mkl_md_ = pd->desc().data;
   }
+
+  inline void SetMklLayout(memory::desc* md) {
+    CHECK_NOTNULL(md);
+    data_.mkl_md_ = md->data;
+  }
+
   inline const memory::desc GetMklLayout() const {
     return memory::desc(data_.mkl_md_);
   }
@@ -452,7 +518,8 @@ class MklDnnShape {
     return data_.tf_data_format_;
   }
   /// We don't create primitive_descriptor for TensorFlow layout now.
-  /// We use lazy evaluation and create it only when needed.
+  /// We use lazy evaluation and create it only when needed. Input format can
+  /// also be Blocked format.
   inline void SetTfLayout(size_t dims, const memory::dims& sizes,
                           memory::format format) {
     CHECK_EQ(dims, sizes.size());
@@ -461,15 +528,26 @@ class MklDnnShape {
       data_.sizes_[ii] = sizes[ii];
     }
     data_.tf_data_format_ = format;
-    SetTfDimOrder(dims, format);
+    if (format != memory::format::blocked) {
+      SetTfDimOrder(dims, format);
+    }
   }
+
   inline const memory::desc GetTfLayout() const {
     memory::dims dims;
     for (size_t ii = 0; ii < data_.dimension_; ii++) {
       dims.push_back(data_.sizes_[ii]);
     }
-    return memory::desc(dims, data_.T_, data_.tf_data_format_);
+
+    // Create Blocked memory desc if input TF format was set like that.
+    if (data_.tf_data_format_ == memory::format::blocked) {
+      auto strides = CalculateTFStrides(dims);
+      return CreateBlockedMemDescHelper(dims, strides, data_.T_);
+    } else {
+      return memory::desc(dims, data_.T_, data_.tf_data_format_);
+    }
   }
+
   inline const memory::desc GetCurLayout() const {
     return IsMklTensor() ? GetMklLayout() : GetTfLayout();
   }
@@ -579,8 +657,13 @@ class MklDnnShape {
 #endif
 
 // List of MklShape objects. Used in Concat/Split layers.
+
 typedef std::vector<MklShape> MklShapeList;
 
+#ifndef INTEL_MKL_ML
+typedef std::vector<MklDnnShape> MklDnnShapeList;
+#endif
+
 // Check if all tensors specified by MklShapes are MKL tensors.
 inline bool AreAllMklTensors(const MklShapeList& shapes) {
   for (auto& s : shapes) {
@@ -591,6 +674,7 @@ inline bool AreAllMklTensors(const MklShapeList& shapes) {
   return true;
 }
 
+#ifdef INTEL_MKL_ML
 template <typename T>
 inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
                              const MklShape& mkl_shape) {
@@ -615,32 +699,15 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
 
   return output_tensor;
 }
-
-#ifdef INTEL_MKL_DNN
+#else
 template <typename T>
 inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
                              const MklDnnShape& mkl_shape) {
   Tensor output_tensor;
   TensorShape output_shape;
 
-#if 0
-  // TODO(nhasabni): need to implement
-  for (size_t j = 0; j < mkl_shape.GetDimension(); j++) {
-    // Outermost to innermost dimension
-    output_shape.AddDim(mkl_shape.GetSizes()[mkl_shape.tf_dim_idx(j)]);
-  }
-
-  // Allocate output tensor.
-  context->allocate_temp(DataTypeToEnum<T>::v(), output_shape, &output_tensor);
-
-  dnnLayout_t output_layout = static_cast<dnnLayout_t>(mkl_shape.GetTfLayout());
-  void* input_buffer = const_cast<T*>(mkl_tensor.flat<T>().data());
-  void* output_buffer = const_cast<T*>(output_tensor.flat<T>().data());
-
-  if (mkl_tensor.NumElements() != 0) {
-    mkl_shape.GetConvertedFlatData(output_layout, input_buffer, output_buffer);
-  }
-#endif
+  TF_CHECK_OK(
+      Status(error::Code::UNIMPLEMENTED, "Unimplemented conversion function"));
 
   return output_tensor;
 }
@@ -658,7 +725,7 @@ inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) {
           sizeof(uint8));
 }
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 inline void GetMklShape(OpKernelContext* ctext, int n, MklDnnShape* mklshape) {
   mklshape->DeSerializeMklDnnShape(
       ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs()))
@@ -682,6 +749,8 @@ inline void GetMklInputList(OpKernelContext* ctext, StringPiece name,
   ctext->input_list(name, input_tensors);
 }
 
+#ifdef INTEL_MKL_ML
+
 inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
                             MklShapeList* mkl_shapes) {
   OpInputList input_mkl_tensors;
@@ -694,7 +763,23 @@ inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
   }
 }
 
-#ifdef INTEL_MKL_DNN
+#else
+
+inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
+                            MklDnnShapeList* mkl_shapes) {
+  OpInputList input_mkl_tensors;
+  GetMklInputList(ctext, strings::StrCat("mkl_", name), &input_mkl_tensors);
+
+  for (int i = 0; i < input_mkl_tensors.size(); i++) {
+    (*mkl_shapes)[i].DeSerializeMklDnnShape(
+        input_mkl_tensors[i].flat<uint8>().data(),
+        input_mkl_tensors[i].flat<uint8>().size() * sizeof(uint8));
+  }
+}
+
+#endif
+
+#ifndef INTEL_MKL_ML
 /// Get shape of input tensor pointed by 'input_idx' in TensorShape format.
 /// If the input tensor is in MKL layout, then obtains TensorShape from
 /// MklShape.
@@ -729,7 +814,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
       second_tensor->flat<uint8>().size() * sizeof(uint8));
 }
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 // Allocate the second output tensor that will contain
 // the MKL shape serialized
 inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
@@ -766,7 +851,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
       second_tensor->flat<uint8>().size() * sizeof(uint8));
 }
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 // Allocate the output tensor, create a second output tensor that will contain
 // the MKL shape serialized
 inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
@@ -790,7 +875,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
 
 // Allocates a temp tensor and returns the data buffer for temporary storage.
 // Currently
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 template <typename T>
 inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out,
                            const memory::primitive_desc& pd, void** buf_out) {
@@ -888,8 +973,8 @@ inline int64 GetMklTensorDim(const MklShape& mkl_shape, char dimension) {
   return mkl_shape.dim_size(index);
 }
 
-inline void CopyMklTensorInToOut(OpKernelContext* context,
-                                 int idx_in, int idx_out) {
+inline void CopyMklTensorInToOut(OpKernelContext* context, int idx_in,
+                                 int idx_out) {
   int num_inputs = context->num_inputs();
   int num_outputs = context->num_outputs();
   int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
@@ -909,8 +994,9 @@ inline void CopyMklTensorInToOut(OpKernelContext* context,
   context->set_output(idx_meta_out, meta_output);
 }
 
-inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
-                                         int idx_in, int idx_out,
+#ifdef INTEL_MKL_ML
+inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, int idx_in,
+                                         int idx_out,
                                          const TensorShape& shape) {
   int num_inputs = context->num_inputs();
   int num_outputs = context->num_outputs();
@@ -926,9 +1012,30 @@ inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
   CHECK(output.CopyFrom(data, shape));
   context->set_output(idx_data_out, output);
 }
+#else
+inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, int idx_in,
+                                         int idx_out,
+                                         const TensorShape& shape) {
+  int num_inputs = context->num_inputs();
+  int num_outputs = context->num_outputs();
+  int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
+  int idx_data_out = GetTensorDataIndex(idx_out, num_outputs);
 
-inline void ForwardTfTensorInToOut(OpKernelContext* context,
-                                  int idx_in, int idx_out) {
+  const Tensor& data = context->input(idx_data_in);
+  MklDnnShape mkl_shape_output;
+  mkl_shape_output.SetMklTensor(false);
+  AllocateOutputSetMklShape(context, idx_out, mkl_shape_output);
+  Tensor output(data.dtype());
+  // TODO(intel_tf): alternatively, call forward_input_to_output_with_shape(...)
+  CHECK(output.CopyFrom(data, shape));
+  context->set_output(idx_data_out, output);
+}
+#endif
+
+#ifdef INTEL_MKL_ML
+
+inline void ForwardTfTensorInToOut(OpKernelContext* context, int idx_in,
+                                   int idx_out) {
   int num_inputs = context->num_inputs();
   int num_outputs = context->num_outputs();
   int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
@@ -944,8 +1051,29 @@ inline void ForwardTfTensorInToOut(OpKernelContext* context,
   }
 }
 
-inline void ForwardMklTensorInToOut(OpKernelContext* context,
-                                   int idx_in, int idx_out) {
+#else
+
+inline void ForwardTfTensorInToOut(OpKernelContext* context, int idx_in,
+                                   int idx_out) {
+  int num_inputs = context->num_inputs();
+  int num_outputs = context->num_outputs();
+  int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
+  int idx_data_out = GetTensorDataIndex(idx_out, num_outputs);
+
+  MklDnnShape dnn_shape_output;
+  dnn_shape_output.SetMklTensor(false);
+  AllocateOutputSetMklShape(context, idx_out, dnn_shape_output);
+  if (IsRefType(context->input_dtype(idx_data_in))) {
+    context->forward_ref_input_to_ref_output(idx_data_in, idx_data_out);
+  } else {
+    context->set_output(idx_data_out, context->input(idx_data_in));
+  }
+}
+
+#endif
+
+inline void ForwardMklTensorInToOut(OpKernelContext* context, int idx_in,
+                                    int idx_out) {
   int num_inputs = context->num_inputs();
   int num_outputs = context->num_outputs();
   int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
@@ -962,6 +1090,25 @@ inline void ForwardMklTensorInToOut(OpKernelContext* context,
   }
 }
 
+#ifndef INTEL_MKL_ML
+inline void ForwardMklTensorInToOutWithMklShape(OpKernelContext* context,
+                                                int idx_in, int idx_out,
+                                                const MklDnnShape& mkl_shape) {
+  int num_inputs = context->num_inputs();
+  int num_outputs = context->num_outputs();
+  int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
+  int idx_data_out = GetTensorDataIndex(idx_out, num_outputs);
+
+  AllocateOutputSetMklShape(context, idx_out, mkl_shape);
+
+  if (IsRefType(context->input_dtype(idx_data_in))) {
+    context->forward_ref_input_to_ref_output(idx_data_in, idx_data_out);
+  } else {
+    context->set_output(idx_data_out, context->input(idx_data_in));
+  }
+}
+#endif
+
 // Forward the MKL shape ONLY (used in elementwise and other ops where
 // we call the eigen implementation and MKL shape is not used)
 inline void ForwardMklMetaDataInToOut(OpKernelContext* context,
@@ -985,6 +1132,10 @@ inline void SetDummyMklShapeOutput(OpKernelContext* context,
   AllocateOutputSetMklShape(context, idx_data_out, mkl_shape_output);
 }
 
+#ifdef INTEL_MKL_ML
+// We don't need these functions in MKLDNN. We have defined equality operator
+// on MklDnnShape class directly.
+
 // Checks if the TF shape for both MKL tensors is the same or not
 // Returns: true if both TF shapes are the same, false otherwise
 inline bool MklCompareShapes(const MklShape* input_shape_0,
@@ -1051,6 +1202,7 @@ inline bool MklCompareShapes(const TensorShape* input_shape_0,
 
   return true;
 }
+#endif
 
 // These functions do not compile with MKL-DNN since mkl.h is missing.
 // We may need to remove them later.
@@ -1064,11 +1216,11 @@ inline void MklNHWCToNCHW(const Tensor& input, Tensor** output) {
   int64 H = input.dim_size(1);
   int64 W = input.dim_size(2);
   int64 C = input.dim_size(3);
-  int64 stride_n = H*W*C;
-# pragma omp parallel for num_threads(16)
+  int64 stride_n = H * W * C;
+#pragma omp parallel for num_threads(16)
   for (int64 n = 0; n < N; ++n) {
-    mkl_somatcopy('R', 'T', H*W, C, 1, buf_in + n*stride_n, C,
-        buf_out + n*stride_n, H*W);
+    mkl_somatcopy('R', 'T', H * W, C, 1, buf_in + n * stride_n, C,
+                  buf_out + n * stride_n, H * W);
   }
 }
 
@@ -1080,17 +1232,17 @@ inline void MklNCHWToNHWC(const Tensor& input, Tensor** output) {
   int64 H = (*output)->dim_size(1);
   int64 W = (*output)->dim_size(2);
   int64 C = (*output)->dim_size(3);
-  int64 stride_n = H*W*C;
-# pragma omp parallel for num_threads(16)
+  int64 stride_n = H * W * C;
+#pragma omp parallel for num_threads(16)
   for (int64 n = 0; n < N; ++n) {
-    mkl_somatcopy('R', 'T', C, H*W, 1, buf_in + n*stride_n, H*W,
-        buf_out + n*stride_n, C);
+    mkl_somatcopy('R', 'T', C, H * W, 1, buf_in + n * stride_n, H * W,
+                  buf_out + n * stride_n, C);
   }
 }
 
 // -------------------------------------------------------------------
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 /// Return MKL-DNN data type (memory::data_type) for input type T
 ///
@@ -1132,6 +1284,10 @@ inline TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format) {
   else if (format == memory::format::nchw)
     return FORMAT_NCHW;
   TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format"));
+
+  // Return to prevent compiler warnings, otherwise TF_CHECK_OK will ensure
+  // that we don't come here.
+  return FORMAT_NHWC;
 }
 
 /// Map TensorShape object into memory::dims required by MKL-DNN
@@ -1175,6 +1331,23 @@ inline memory::dims TFShapeToMklDnnDimsInNCHW(const TensorShape& shape,
   return memory::dims({n, c, h, w});
 }
 
+/// Overloaded version of function above. Input parameters are
+/// self-explanatory.
+inline memory::dims MklDnnDimsInNCHW(const memory::dims& in_dims,
+                                     TensorFormat format) {
+  // Check validity of format.
+  CHECK_NE(TFDataFormatToMklDnnDataFormat(format),
+           memory::format::format_undef);
+
+  int n = in_dims[GetTensorDimIndex(format, 'N')];
+  int c = in_dims[GetTensorDimIndex(format, 'C')];
+  int h = in_dims[GetTensorDimIndex(format, 'H')];
+  int w = in_dims[GetTensorDimIndex(format, 'W')];
+
+  // MKL-DNN requires dimensions in NCHW format.
+  return memory::dims({n, c, h, w});
+}
+
 /// Map MklDnn memory::dims object into TensorShape object.
 ///
 /// This function will simply map input shape in MKL-DNN memory::dims format
@@ -1217,6 +1390,42 @@ inline padding_kind TFPaddingToMklDnnPadding(Padding pad) {
   return padding_kind::zero;
 }
 
+/// Helper function to create memory descriptor in Blocked format
+///
+/// @input: Tensor dimensions
+/// @input: strides corresponding to dimensions. One can use utility
+///         function such as CalculateTFStrides to compute strides
+///         for given dimensions.
+/// @return: memory::desc object corresponding to blocked memory format
+///          for given dimensions and strides.
+inline memory::desc CreateBlockedMemDescHelper(const memory::dims& dim,
+                                               const memory::dims& strides,
+                                               memory::data_type dtype) {
+  CHECK_EQ(dim.size(), strides.size());
+
+  // We have to construct memory descriptor in a C style. This is not at all
+  // ideal but MKLDNN does not offer any API to construct descriptor in
+  // blocked format except a copy constructor that accepts
+  // mkldnn_memory_desc_t.
+  mkldnn_memory_desc_t md;
+  md.primitive_kind = mkldnn_memory;
+  md.ndims = dim.size();
+  md.format = mkldnn_blocked;
+  md.data_type = memory::convert_to_c(dtype);
+
+  for (size_t i = 0; i < dim.size(); i++) {
+    md.layout_desc.blocking.block_dims[i] = 1;
+    md.layout_desc.blocking.strides[1][i] = 1;
+    md.layout_desc.blocking.strides[0][i] = strides[i];
+    md.layout_desc.blocking.padding_dims[i] = dim[i];
+    md.layout_desc.blocking.offset_padding_to_data[i] = 0;
+    md.dims[i] = dim[i];
+  }
+  md.layout_desc.blocking.offset_padding = 0;
+
+  return memory::desc(md);
+}
+
 /*
  * Class to represent all the resources corresponding to a tensor in TensorFlow
  * that are required to execute an operation (such as Convolution).
@@ -1286,29 +1495,7 @@ class MklDnnData {
   ///          for given dimensions and strides.
   static inline memory::desc CreateBlockedMemDesc(const memory::dims& dim,
                                                   const memory::dims& strides) {
-    CHECK_EQ(dim.size(), strides.size());
-
-    // We have to construct memory descriptor in a C style. This is not at all
-    // ideal but MKLDNN does not offer any API to construct descriptor in
-    // blocked format except a copy constructor that accepts
-    // mkldnn_memory_desc_t.
-    mkldnn_memory_desc_t md;
-    md.primitive_kind = mkldnn_memory;
-    md.ndims = dim.size();
-    md.format = mkldnn_blocked;
-    md.data_type = memory::convert_to_c(MklDnnType<T>());
-
-    for (size_t i = 0; i < dim.size(); i++) {
-      md.layout_desc.blocking.block_dims[i] = 1;
-      md.layout_desc.blocking.strides[1][i] = 1;
-      md.layout_desc.blocking.strides[0][i] = strides[i];
-      md.layout_desc.blocking.padding_dims[i] = dim[i];
-      md.layout_desc.blocking.offset_padding_to_data[i] = 0;
-      md.dims[i] = dim[i];
-    }
-    md.layout_desc.blocking.offset_padding = 0;
-
-    return memory::desc(md);
+    return CreateBlockedMemDescHelper(dim, strides, MklDnnType<T>());
   }
 
   /// A version of SetUsrMem call that allows user to create memory in blocked
@@ -1438,6 +1625,18 @@ class MklDnnData {
     return op_pd != user_memory_->get_primitive_desc();
   }
 
+  /// Predicate that checks if we need to reorder user's memory into memory
+  /// based on the provided format.
+  ///
+  /// @input: target_format - memory format of the given input of an
+  ///               operation
+  /// @return: true in case reorder of input is needed; false, otherwise.
+  inline bool IsReorderNeeded(const memory::format& target_format) const {
+    CHECK_NOTNULL(user_memory_);
+    return target_format !=
+           user_memory_->get_primitive_desc().desc().data.format;
+  }
+
   /// Function to create a reorder from memory pointed by from to memory pointed
   /// by to. Returns created primitive.
   inline primitive CreateReorder(const memory* from, const memory* to) const {
@@ -1554,7 +1753,7 @@ class MklDnnData {
   }
 };
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/util/mkl_util_test.cc b/tensorflow/core/util/mkl_util_test.cc
index 8b73eadb40046518179fcaaa5c244aa7f3d52ebe..cd1d0713ad58b594005847f48943a228743e530d 100644
--- a/tensorflow/core/util/mkl_util_test.cc
+++ b/tensorflow/core/util/mkl_util_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-#ifdef INTEL_MKL_DNN
+#ifndef INTEL_MKL_ML
 
 TEST(MklUtilTest, MklDnnTfShape) {
   auto cpu_engine = engine(engine::cpu, 0);
@@ -84,7 +84,7 @@ TEST(MklUtilTest, MklDnnBlockedFormatTest) {
   EXPECT_EQ(b_md2.data.format, mkldnn_blocked);
 }
 
-#endif  // INTEL_MKL_DNN
+#endif  // INTEL_MKL_ML
 }  // namespace
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/util/presized_cuckoo_map.h b/tensorflow/core/util/presized_cuckoo_map.h
index e7dab830f0ec9e3401d621f04358d3ee62cb0b63..f88ad2faaff344832d65b04357c3d8c2665ebad5 100644
--- a/tensorflow/core/util/presized_cuckoo_map.h
+++ b/tensorflow/core/util/presized_cuckoo_map.h
@@ -67,7 +67,7 @@ inline uint64 multiply_high_u64(uint64 x, uint64 y) {
   return prod_hi + (prod_mid1 >> 32) + (prod_mid2 >> 32) + carry;
 #endif
 }
-}
+}  // namespace presized_cuckoo_map
 
 template <class value>
 class PresizedCuckooMap {
diff --git a/tensorflow/core/util/reporter_test.cc b/tensorflow/core/util/reporter_test.cc
index 1cb07718feee820c334d8f5183cafb2de0cb009b..575c27d4ef72ec33c4b9352de59fc806b12d6385 100644
--- a/tensorflow/core/util/reporter_test.cc
+++ b/tensorflow/core/util/reporter_test.cc
@@ -29,8 +29,8 @@ namespace {
 
 // Tests of all the error paths in log_reader.cc follow:
 static void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(StringPiece(s).contains(expected)) << s << " does not contain "
-                                                 << expected;
+  EXPECT_TRUE(StringPiece(s).contains(expected))
+      << s << " does not contain " << expected;
 }
 
 TEST(TestReporter, NoLogging) {
diff --git a/tensorflow/core/util/session_message.cc b/tensorflow/core/util/session_message.cc
new file mode 100644
index 0000000000000000000000000000000000000000..28a6517a1a3c584b896c0b51f9937bc786283b16
--- /dev/null
+++ b/tensorflow/core/util/session_message.cc
@@ -0,0 +1,71 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/session_message.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/util/event.pb.h"
+
+static const int kMaxLogEvents = 1000;
+
+namespace tensorflow {
+
+SessionLogger::SessionLogger() : status_(new SessionStatus) {}
+
+SessionLogger::~SessionLogger() {}
+
+string SessionLogger::DebugString() { return "SessionLogger"; }
+
+void SessionLogger::Log(StringPiece message) {
+  mutex_lock lock(mu_);
+
+  Event* event = status_->add_event();
+  event->set_wall_time(Env::Default()->NowMicros());
+  event->set_step(0);
+  LogMessage* log = event->mutable_log_message();
+  log->set_message(message.ToString());
+  log->set_level(LogMessage::INFO);
+
+  // Clip log events by 10% if we overflow
+  if (status_->event_size() > kMaxLogEvents) {
+    auto events = status_->mutable_event();
+    events->DeleteSubrange(0, kMaxLogEvents / 10);
+  }
+}
+
+SessionLogger* GetSessionLogger(ResourceMgr* rm) {
+  SessionLogger* logger;
+
+  std::function<Status(SessionLogger**)> status_creator =
+      [](SessionLogger** result) {
+        *result = new SessionLogger();
+        return Status::OK();
+      };
+
+  if (!rm->LookupOrCreate<SessionLogger>("session", "status", &logger,
+                                         status_creator)
+           .ok()) {
+    return nullptr;
+  }
+
+  return logger;
+}
+
+void LogSessionMessage(ResourceMgr* rm, StringPiece message) {
+  return GetSessionLogger(rm)->Log(message);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/session_message.h b/tensorflow/core/util/session_message.h
new file mode 100644
index 0000000000000000000000000000000000000000..c0f3d78b46a50386403c453fcc92d56a456206de
--- /dev/null
+++ b/tensorflow/core/util/session_message.h
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_SESSION_MESSAGE_H_
+#define TENSORFLOW_CORE_UTIL_SESSION_MESSAGE_H_
+
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+class ResourceMgr;
+class SessionStatus;
+
+class SessionLogger : public ResourceBase {
+ public:
+  SessionLogger();
+  ~SessionLogger();
+
+  void Log(StringPiece message);
+  string DebugString() override;
+
+  const SessionStatus& status() { return *status_; }
+
+ private:
+  std::unique_ptr<SessionStatus> status_;
+  mutex mu_;
+};
+
+// Return a SessionLogger instance for the current session.  If the logger
+// will be used across multiple computations, you must explicitly acquire
+// and release references using Ref()/Unref().
+//
+// Returns nullptr if a logger cannot be created.
+SessionLogger* GetSessionLogger(ResourceMgr* rm);
+
+// Attach `message` to the logger for the current session.
+void LogSessionMessage(ResourceMgr* rm, StringPiece message);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_SESSION_MESSAGE_H
diff --git a/tensorflow/core/util/sparse/sparse_tensor.h b/tensorflow/core/util/sparse/sparse_tensor.h
index 0ea74c38b1916f777eaaf7b0907b614e680ea6e7..258ee418c145bae161c7603d4249875fb687c94a 100644
--- a/tensorflow/core/util/sparse/sparse_tensor.h
+++ b/tensorflow/core/util/sparse/sparse_tensor.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <numeric>
 #include <vector>
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
@@ -31,7 +32,6 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/sparse/dim_comparator.h"
 #include "tensorflow/core/util/sparse/group_iterator.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 namespace sparse {
@@ -59,8 +59,8 @@ class SparseTensor {
         shape_(shape.begin(), shape.end()),
         order_(order.begin(), order.end()),
         dims_(GetDimsFromIx(ix)) {
-    CHECK_EQ(ix.dtype(), DT_INT64) << "indices must be type int64 but got: "
-                                   << ix.dtype();
+    CHECK_EQ(ix.dtype(), DT_INT64)
+        << "indices must be type int64 but got: " << ix.dtype();
     CHECK(TensorShapeUtils::IsVector(vals.shape()))
         << "vals must be a vec, but got: " << vals.shape().DebugString();
     CHECK_EQ(ix.shape().dim_size(0), vals.shape().dim_size(0))
@@ -69,6 +69,21 @@ class SparseTensor {
     CHECK_EQ(shape.size(), dims_) << "Shape rank must be SparseTensor rank.";
   }
 
+  SparseTensor(const SparseTensor& other)
+      : SparseTensor(other.ix_, other.vals_, other.shape_, other.order_) {}
+
+  SparseTensor(SparseTensor&& other)
+      : SparseTensor(std::move(other.ix_), std::move(other.vals_),
+                     std::move(other.shape_), std::move(other.order_)) {}
+
+  SparseTensor& operator=(const SparseTensor& other) {
+    ix_ = other.ix_;
+    vals_ = other.vals_;
+    shape_ = other.shape_;
+    order_ = other.order_;
+    return *this;
+  }
+
   std::size_t num_entries() const { return ix_.dim_size(0); }
 
   int dims() const { return shape_.size(); }
@@ -601,7 +616,7 @@ SparseTensor SparseTensor::Slice(const SparseTensor& input_tensor,
   int index = 0;
   for (int i = 0; i < input_tensor.indices().dim_size(0) && index < count;
        i++) {
-    // The logic here is similiar as the above except that the above
+    // The logic here is similar as the above except that the above
     // only count the number of indices while here we actually generate
     // the output.
     bool hit = true;
diff --git a/tensorflow/core/util/sparse/sparse_tensor_test.cc b/tensorflow/core/util/sparse/sparse_tensor_test.cc
index efdd97fd3d6ffa5c1f66f2a0950d7bd44ba01eb1..85de0320857e307ea54594c2eff611b9e413945b 100644
--- a/tensorflow/core/util/sparse/sparse_tensor_test.cc
+++ b/tensorflow/core/util/sparse/sparse_tensor_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -25,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 namespace sparse {
diff --git a/tensorflow/core/util/stream_executor_util.h b/tensorflow/core/util/stream_executor_util.h
index 6a5ddec04c9d6c2f723e0caa7343103f09c63183..f7767ace716782e53a2023bea7acc7b2f3c6604c 100644
--- a/tensorflow/core/util/stream_executor_util.h
+++ b/tensorflow/core/util/stream_executor_util.h
@@ -41,9 +41,10 @@ class StreamExecutorUtil {
   // This assumes that the error codes between the two implementations
   // match.
   static Status ConvertStatus(const perftools::gputools::port::Status& s) {
-    return s.ok() ? Status::OK() : Status(static_cast<tensorflow::error::Code>(
-                                              static_cast<int>(s.code())),
-                                          s.error_message());
+    return s.ok() ? Status::OK()
+                  : Status(static_cast<tensorflow::error::Code>(
+                               static_cast<int>(s.code())),
+                           s.error_message());
   }
 };
 
diff --git a/tensorflow/core/util/strided_slice_op.cc b/tensorflow/core/util/strided_slice_op.cc
index cfe9275a09189b0d72e57a79cd860de9ab5d82b8..aca60b942d15841438329c922a8aaaded7b08430 100644
--- a/tensorflow/core/util/strided_slice_op.cc
+++ b/tensorflow/core/util/strided_slice_op.cc
@@ -218,8 +218,8 @@ Status ValidateStridedSliceOp(
 
   // Step 2: Make a sparse spec into a full index spec
   //
-  // The sparse spec does not corresopnds to the number of dimensions
-  // Make a dense spec that corresponds to thte number of dimensions
+  // The sparse spec does not correspond to the number of dimensions
+  // Make a dense spec that corresponds to the number of dimensions
   //
   // For example suppose foo[...,3:] on foo.shape=(2,2,3) then
   // we need to produce the missing begin_mask for the first two
diff --git a/tensorflow/core/util/strided_slice_op.h b/tensorflow/core/util/strided_slice_op.h
index abca98f27b534ea3c4fc2bb7832a38ea6f47df0c..25ecccd28550e943e4a7ab9bc1529426ea8454d2 100644
--- a/tensorflow/core/util/strided_slice_op.h
+++ b/tensorflow/core/util/strided_slice_op.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_UTIL_STRIDED_SLICE_OP_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_UTIL_STRIDED_SLICE_OP_H_
+#ifndef TENSORFLOW_CORE_UTIL_STRIDED_SLICE_OP_H_
+#define TENSORFLOW_CORE_UTIL_STRIDED_SLICE_OP_H_
 
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -62,4 +62,4 @@ Status ValidateStridedSliceOp(
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_UTIL_STRIDED_SLICE_OP_H_
+#endif  // TENSORFLOW_CORE_UTIL_STRIDED_SLICE_OP_H_
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
index d0e54b7e4774e8cd2b2295df4f3fa4c724acbfac..0426fee0e2679718a80cfb46bcb78a668c6b6e83 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
@@ -51,6 +51,9 @@ const int kTensorBundleMinProducer = 0;
 const int kTensorBundleMinConsumer = 0;
 const int kTensorBundleVersion = 1;
 
+// Size of our input buffer for streaming reads
+static const int kBufferSize = 1024 * 1024;
+
 // Key to the special BundleHeaderProto entry.  Do not change this, as clients
 // can make the assumption that the header is always the first entry in the
 // bundle.
@@ -141,7 +144,11 @@ Status ReadVariantTensor(io::InputBuffer* buffered_file, Tensor* ret,
         buffered_file->ReadNBytes(string_length, &buffer[0], &bytes_read));
     *actual_crc32c = crc32c::Extend(*actual_crc32c, buffer.data(), bytes_read);
     VariantTensorDataProto proto;
-    proto.ParseFromString(buffer);
+    if (!proto.ParseFromString(buffer)) {
+      return errors::DataLoss("Unable to parse VariantTensorDataProto from ",
+                              "buffer of size ", string_length, ". ",
+                              "Bundle entry offset: ", offset, " size: ", size);
+    }
     Variant v = proto;
     if (!DecodeUnaryVariant(&v)) {
       return errors::Internal("Could not decode variant with type_name: \"",
@@ -279,7 +286,7 @@ Status WriteVariantTensor(const Tensor& val, FileOutputBuffer* out,
     TF_RETURN_IF_ERROR(out->Append(len));
     *crc32c = crc32c::Extend(*crc32c, reinterpret_cast<const char*>(&elem_size),
                              sizeof(uint64));
-    *bytes_written += sizeof(uint64);
+    *bytes_written += len.size();
 
     // Write the serialized variant.
     TF_RETURN_IF_ERROR(out->Append(elem));
@@ -299,43 +306,6 @@ Status WriteVariantTensor(const Tensor& val, FileOutputBuffer* out,
   return Status::OK();
 }
 
-// Reads file[offset:offset+size) into destination[0:size).  Each Read() copies
-// at most "buffer_size" bytes.
-//
-// REQUIRES: "file" contains at least "offset + size" bytes.
-// REQUIRES: "destination" contains at least "size" bytes.
-// On error, "destination" may contain garbage.
-Status ReadInputByChunk(const RandomAccessFile* file, size_t offset,
-                        size_t size, size_t buffer_size, char* destination) {
-  if (size == 0) return Status::OK();
-  CHECK_GT(size, 0);
-  CHECK_GT(buffer_size, 0);
-  size_t bytes_read = 0;
-  StringPiece result;
-
-  while (bytes_read < size) {
-    const size_t desired_bytes = std::min(buffer_size, size - bytes_read);
-    Status status = file->Read(offset + bytes_read, desired_bytes, &result,
-                               destination + bytes_read);
-
-    if (!status.ok()) {
-      return status;
-    } else if (result.size() != desired_bytes) {
-      return errors::DataLoss("Requested ", desired_bytes, " bytes but read ",
-                              result.size(), " bytes.");
-    } else if (result.data() == destination + bytes_read) {
-      // Data is already in the correct location.
-    } else {
-      // memmove is guaranteed to handle overlaps safely (although the src and
-      // dst buffers should not overlap for this function).
-      memmove(destination + bytes_read, result.data(), result.size());
-    }
-    bytes_read += result.size();
-  }
-  CHECK_EQ(bytes_read, size);
-  return Status::OK();
-}
-
 // Returns whether "slice_spec" is a full slice, with respect to the full shape.
 //
 // This can happen say, when "slice_spec" is
@@ -379,10 +349,27 @@ table::Options TableBuilderOptions() {
   return o;
 }
 
+// Writes zeros to output buffer to align the next write to the requested
+// alignment. "size" is the current size of the buffer and is updated to the
+// new size.
+Status PadAlignment(FileOutputBuffer* out, int alignment, int64* size) {
+  int bytes_over = *size % alignment;
+  if (bytes_over == 0) {
+    return Status::OK();
+  }
+  int bytes_to_write = alignment - bytes_over;
+  Status status = out->Append(string(bytes_to_write, '\0'));
+  if (status.ok()) {
+    *size += bytes_to_write;
+  }
+  return status;
+}
+
 }  // namespace
 
-BundleWriter::BundleWriter(Env* env, StringPiece prefix)
+BundleWriter::BundleWriter(Env* env, StringPiece prefix, const Options& options)
     : env_(env),
+      options_(options),
       prefix_(prefix.ToString()),
       tmp_metadata_path_(strings::StrCat(MetaFilename(prefix_), ".tempstate",
                                          random::New64())),
@@ -436,6 +423,7 @@ Status BundleWriter::Add(StringPiece key, const Tensor& val) {
     entry->set_size(data_bytes_written);
     entry->set_crc32c(crc32c::Mask(crc32c));
     size_ += data_bytes_written;
+    status_ = PadAlignment(out_.get(), options_.data_alignment, &size_);
   }
   return status_;
 }
@@ -705,13 +693,6 @@ Status MergeBundles(Env* env, gtl::ArraySlice<string> prefixes,
   return status;
 }
 
-// TODO(b/64763924): Remove after Jan 1st 2018.
-bool GetLenientNames() {
-  const char* lenient_names_str = std::getenv("TF_SAVER_LENIENT_NAMES");
-  return lenient_names_str != nullptr &&
-         std::strcmp(lenient_names_str, "") != 0;
-}
-
 // Interface for reading a tensor bundle.
 
 BundleReader::BundleReader(Env* env, StringPiece prefix)
@@ -757,7 +738,6 @@ BundleReader::BundleReader(Env* env, StringPiece prefix)
   }
   status_ = CheckVersions(header.version(), kTensorBundleVersion,
                           kTensorBundleMinProducer, "Checkpoint", "checkpoint");
-  lenient_names_ = GetLenientNames();
 }
 
 BundleReader::~BundleReader() {
@@ -780,23 +760,6 @@ Status BundleReader::GetBundleEntryProto(StringPiece key,
   TF_CHECK_OK(status_);
   Seek(key);
   if (!iter_->Valid() || iter_->key() != key) {
-    if (lenient_names_ && !key.ends_with(":0")) {
-      // TODO(b/64763924): Remove after Jan 1st 2018.
-      // Try appending ":0" to the key.
-      const string key_with_colon_zero = key.ToString() + ":0";
-      Status status = GetBundleEntryProto(key_with_colon_zero, entry);
-      if (status.ok()) {
-        LOG(WARNING) << "Key " << key << " was not found; using key "
-                     << key_with_colon_zero << " instead. This lenient naming "
-                     << "behavior will be removed on Jan 1st 2018, so please "
-                     << "update your checkpoint file.";
-        return status;
-      } else if (status.code() != error::NOT_FOUND) {
-        return status;
-      }
-      LOG(INFO) << "Looked for both " << key << " and " << key_with_colon_zero
-                << " in checkpoint.";
-    }
     return errors::NotFound("Key ", key, " not found in checkpoint");
   }
 
@@ -847,8 +810,7 @@ Status BundleReader::GetValue(const BundleEntryProto& entry, Tensor* val) {
     std::unique_ptr<RandomAccessFile> file = nullptr;
     TF_RETURN_IF_ERROR(env_->NewRandomAccessFile(
         DataFilename(prefix_, entry.shard_id(), num_shards_), &file));
-    buffered_file =
-        new io::InputBuffer(file.release(), 256 << 10 /* 256KB buffer */);
+    buffered_file = new io::InputBuffer(file.release(), kBufferSize);
     // The InputBuffer and RandomAccessFile objects are both released in dtor.
     data_[entry.shard_id()] = buffered_file;
   }
@@ -856,14 +818,21 @@ Status BundleReader::GetValue(const BundleEntryProto& entry, Tensor* val) {
 
   TF_RETURN_IF_ERROR(buffered_file->Seek(entry.offset()));
   uint32 actual_crc32c = 0;
+
   if (DataTypeCanUseMemcpy(entry.dtype())) {
-    // Important: ReadInputByChunk() bounds the readahead as min(buffer, actual
-    // bytes needed).  This is critical when reading small tensors, so we don't
-    // rely on io::InputBuffer's blind buffering here.
     char* backing_buffer = const_cast<char*>((ret->tensor_data().data()));
-    TF_RETURN_IF_ERROR(ReadInputByChunk(buffered_file->file(), entry.offset(),
-                                        entry.size(), 8 << 20 /* 8MB buffer */,
-                                        backing_buffer));
+    size_t unused_bytes_read;
+    if (entry.size() > kBufferSize) {
+      StringPiece sp;
+      TF_RETURN_IF_ERROR(buffered_file->file()->Read(
+          entry.offset(), entry.size(), &sp, backing_buffer));
+      if (sp.data() != backing_buffer) {
+        memmove(backing_buffer, sp.data(), entry.size());
+      }
+    } else {
+      TF_RETURN_IF_ERROR(buffered_file->ReadNBytes(entry.size(), backing_buffer,
+                                                   &unused_bytes_read));
+    }
     actual_crc32c = crc32c::Value(backing_buffer, entry.size());
   } else if (entry.dtype() == DT_VARIANT) {
     // Relies on io::InputBuffer's buffering, because we issue many neighboring
@@ -944,8 +913,8 @@ Status BundleReader::LookupSlice(StringPiece full_tensor_key,
 Status BundleReader::GetSliceValue(StringPiece full_tensor_key,
                                    const BundleEntryProto& full_tensor_entry,
                                    const TensorSlice& slice_spec, Tensor* val) {
-  using checkpoint::TensorSliceSet;
   using checkpoint::RegisterTensorSlice;
+  using checkpoint::TensorSliceSet;
   DCHECK_GE(full_tensor_entry.slices_size(), 0);
 
   const TensorShape full_shape(TensorShape(full_tensor_entry.shape()));
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.h b/tensorflow/core/util/tensor_bundle/tensor_bundle.h
index 129646cb6935dfa16eecd7c5bd880544c8545366..d30ce3f0cf1df2f622994a47164fa91dbfea3e5c 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle.h
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.h
@@ -107,7 +107,14 @@ extern const char* const kHeaderEntryKey;
 // All threads accessing the same BundleWriter must synchronize.
 class BundleWriter {
  public:
-  BundleWriter(Env* env, StringPiece prefix);
+  struct Options {
+    Options() {}
+    // Alignment, in bytes, for tensor data.
+    // Must be >= 1. The default size of 1 densely packs tensors.
+    int data_alignment{1};
+  };
+  BundleWriter(Env* env, StringPiece prefix,
+               const Options& options = Options());
 
   // Adds the tensor "val" under key "key".
   // Across calls "key" must be unique but can be added in any order.
@@ -140,6 +147,7 @@ class BundleWriter {
 
  private:
   Env* const env_;  // Not owned.
+  const Options options_;
   const string prefix_;
   const string tmp_metadata_path_;
   const string tmp_data_path_;
@@ -292,10 +300,7 @@ class BundleReader {
   // the header entry in the metadata table.
   int num_shards_;
 
-  // If set to true, try reading key + ":0" whenever key is not found in the
-  // bundle. This is a temporary measure that will be removed on Jan 1st 2018.
-  // TODO(b/64763924): Remove after Jan 1st 2018.
-  bool lenient_names_;
+  friend class TensorBundleAlignmentTest;  // For testing data alignment.
 
   TF_DISALLOW_COPY_AND_ASSIGN(BundleReader);
 };
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
index 341aae36f4165767d56f28bcf733146f473c897b..08f1aa7125bc47421e0db24a9db6f6e2b2f1e365 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/lib/io/table_builder.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 
 namespace tensorflow {
 
@@ -770,4 +771,91 @@ TEST(TensorBundleTest, VersionTest) {
   }
 }
 
+class TensorBundleAlignmentTest : public ::testing::Test {
+ protected:
+  template <typename T>
+  void ExpectAlignment(BundleReader* reader, const string& key, int alignment) {
+    BundleEntryProto full_tensor_entry;
+    TF_ASSERT_OK(reader->GetBundleEntryProto(key, &full_tensor_entry));
+    EXPECT_EQ(0, full_tensor_entry.offset() % alignment);
+  }
+};
+
+TEST_F(TensorBundleAlignmentTest, AlignmentTest) {
+  {
+    BundleWriter::Options opts;
+    opts.data_alignment = 42;
+    BundleWriter writer(Env::Default(), Prefix("foo"), opts);
+    TF_EXPECT_OK(writer.Add("foo_003", Constant_2x3<float>(3)));
+    TF_EXPECT_OK(writer.Add("foo_000", Constant_2x3<float>(0)));
+    TF_EXPECT_OK(writer.Add("foo_002", Constant_2x3<float>(2)));
+    TF_EXPECT_OK(writer.Add("foo_001", Constant_2x3<float>(1)));
+    TF_ASSERT_OK(writer.Finish());
+  }
+  {
+    BundleReader reader(Env::Default(), Prefix("foo"));
+    TF_ASSERT_OK(reader.status());
+    EXPECT_EQ(
+        AllTensorKeys(&reader),
+        std::vector<string>({"foo_000", "foo_001", "foo_002", "foo_003"}));
+    Expect<float>(&reader, "foo_000", Constant_2x3<float>(0));
+    Expect<float>(&reader, "foo_001", Constant_2x3<float>(1));
+    Expect<float>(&reader, "foo_002", Constant_2x3<float>(2));
+    Expect<float>(&reader, "foo_003", Constant_2x3<float>(3));
+  }
+  {
+    BundleReader reader(Env::Default(), Prefix("foo"));
+    TF_ASSERT_OK(reader.status());
+    ExpectNext<float>(&reader, Constant_2x3<float>(0));
+    ExpectNext<float>(&reader, Constant_2x3<float>(1));
+    ExpectNext<float>(&reader, Constant_2x3<float>(2));
+    ExpectNext<float>(&reader, Constant_2x3<float>(3));
+    EXPECT_TRUE(reader.Valid());
+    reader.Next();
+    EXPECT_FALSE(reader.Valid());
+  }
+  {
+    BundleReader reader(Env::Default(), Prefix("foo"));
+    TF_ASSERT_OK(reader.status());
+    ExpectAlignment<float>(&reader, "foo_000", 42);
+    ExpectAlignment<float>(&reader, "foo_001", 42);
+    ExpectAlignment<float>(&reader, "foo_002", 42);
+    ExpectAlignment<float>(&reader, "foo_003", 42);
+  }
+}
+
+static void BM_BundleAlignmentByteOff(int iters, int alignment,
+                                      int tensor_size) {
+  testing::StopTiming();
+  {
+    BundleWriter::Options opts;
+    opts.data_alignment = alignment;
+    BundleWriter writer(Env::Default(), Prefix("foo"), opts);
+    TF_CHECK_OK(writer.Add("small", Constant(true, TensorShape({1}))));
+    TF_CHECK_OK(writer.Add("big", Constant(32.1, TensorShape({tensor_size}))));
+    TF_CHECK_OK(writer.Finish());
+  }
+  BundleReader reader(Env::Default(), Prefix("foo"));
+  TF_CHECK_OK(reader.status());
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    Tensor t;
+    TF_CHECK_OK(reader.Lookup("big", &t));
+  }
+  testing::StopTiming();
+}
+
+#define BM_BundleAlignment(ALIGN, SIZE)                        \
+  static void BM_BundleAlignment_##ALIGN##_##SIZE(int iters) { \
+    BM_BundleAlignmentByteOff(iters, ALIGN, SIZE);             \
+  }                                                            \
+  BENCHMARK(BM_BundleAlignment_##ALIGN##_##SIZE)
+
+BM_BundleAlignment(1, 512);
+BM_BundleAlignment(1, 4096);
+BM_BundleAlignment(1, 1048576);
+BM_BundleAlignment(4096, 512);
+BM_BundleAlignment(4096, 4096);
+BM_BundleAlignment(4096, 1048576);
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/tensor_slice_reader.h b/tensorflow/core/util/tensor_slice_reader.h
index 4bb2b246158cb2c3387467d0cd89408a6dee9608..263f56c7fcb2fa822de2e0adb5e346feddc71cc2 100644
--- a/tensorflow/core/util/tensor_slice_reader.h
+++ b/tensorflow/core/util/tensor_slice_reader.h
@@ -15,7 +15,6 @@ limitations under the License.
 
 // The utility to read checkpoints for google brain tensor ops and v3
 // checkpoints for dist_belief.
-//
 
 #ifndef TENSORFLOW_UTIL_TENSOR_SLICE_READER_H_
 #define TENSORFLOW_UTIL_TENSOR_SLICE_READER_H_
diff --git a/tensorflow/core/util/tensor_slice_reader_cache.cc b/tensorflow/core/util/tensor_slice_reader_cache.cc
index 0f009d7de57a3cf1471c1ba694d3a771bc00635c..424f8098a9c1e3cec3851be06d04d49bed93e9af 100644
--- a/tensorflow/core/util/tensor_slice_reader_cache.cc
+++ b/tensorflow/core/util/tensor_slice_reader_cache.cc
@@ -55,7 +55,7 @@ const TensorSliceReader* TensorSliceReaderCache::GetReader(
     TensorSliceReader::OpenTableFunction open_function, int preferred_shard) {
   mutex_lock l(mu_);
 
-#if defined(__GXX_RTTI) ||  defined(_CPPRTTI)
+#if defined(__GXX_RTTI) || defined(_CPPRTTI)
   // Get the function pointer from the open_function value.
   TensorSliceReaderCache::OpenFuncType* func_ptr =
       open_function.target<TensorSliceReaderCache::OpenFuncType>();
diff --git a/tensorflow/core/util/tensor_slice_reader_cache.h b/tensorflow/core/util/tensor_slice_reader_cache.h
index bdd36a2791db690824032f25e339354d23f59441..63a8d0b068d21c8e178f3dd344b15db6484a8453 100644
--- a/tensorflow/core/util/tensor_slice_reader_cache.h
+++ b/tensorflow/core/util/tensor_slice_reader_cache.h
@@ -15,7 +15,6 @@ limitations under the License.
 
 // The utility to read checkpoints for google brain tensor ops and v3
 // checkpoints for dist_belief.
-//
 
 #ifndef TENSORFLOW_UTIL_TENSOR_SLICE_READER_CACHE_H_
 #define TENSORFLOW_UTIL_TENSOR_SLICE_READER_CACHE_H_
diff --git a/tensorflow/core/util/tensor_slice_set.cc b/tensorflow/core/util/tensor_slice_set.cc
index 4217df90ca147ccc17cadf6c46c6e4ef4524f12b..7c1d325c0a54e7ba5261f645a2962970fa2d3630 100644
--- a/tensorflow/core/util/tensor_slice_set.cc
+++ b/tensorflow/core/util/tensor_slice_set.cc
@@ -188,9 +188,9 @@ Status RegisterTensorSlice(
     }
     if (type != tss->type()) {
       return errors::Internal("Incompatible tensor types detected for tensor ",
-                              name, ": existing = ",
-                              DataTypeString(tss->type()), ", new = ",
-                              DataTypeString(type));
+                              name,
+                              ": existing = ", DataTypeString(tss->type()),
+                              ", new = ", DataTypeString(type));
     }
   }
   // Register the tensor slices without the actual data.
diff --git a/tensorflow/core/util/tensor_slice_util.h b/tensorflow/core/util/tensor_slice_util.h
index c7edae66b267d4cbd88d497c745b4d81802ab3a9..8f5a6f1d93591e94ec759d343ec26146c67552c0 100644
--- a/tensorflow/core/util/tensor_slice_util.h
+++ b/tensorflow/core/util/tensor_slice_util.h
@@ -139,9 +139,9 @@ static bool CopyDataFromTensorSliceToTensorSlice(const TensorShape& shape,
                                                  const TensorSlice& slice_d,
                                                  const SrcT* ptr_s,
                                                  DstT* ptr_d) {
-  CHECK_LE(shape.dims(), kTensorSliceMaxRank) << "Only tensors of size up to "
-                                              << kTensorSliceMaxRank
-                                              << " are supported";
+  CHECK_LE(shape.dims(), kTensorSliceMaxRank)
+      << "Only tensors of size up to " << kTensorSliceMaxRank
+      << " are supported";
   // We need to compute the intersection of the two slices.
   TensorSlice inter;
   if (!slice_s.Intersect(slice_d, &inter)) {
diff --git a/tensorflow/core/util/tensor_slice_writer.h b/tensorflow/core/util/tensor_slice_writer.h
index 95d6384afecd28025cc5e14c6f525caeafe1f0a5..2888c66d10fa3c2ab0eaf755a23da3eb3fcd6b09 100644
--- a/tensorflow/core/util/tensor_slice_writer.h
+++ b/tensorflow/core/util/tensor_slice_writer.h
@@ -15,7 +15,6 @@ limitations under the License.
 
 // The utility to write checkpoints for google brain tensor ops and v3
 // checkpoints for dist_belief.
-//
 
 #ifndef TENSORFLOW_UTIL_TENSOR_SLICE_WRITER_H_
 #define TENSORFLOW_UTIL_TENSOR_SLICE_WRITER_H_
@@ -102,8 +101,8 @@ Status TensorSliceWriter::Add(const string& name, const TensorShape& shape,
   // The tensor and the slice have to be compatible
   if (shape.dims() != slice.dims()) {
     return errors::Internal("Incompatible tensor shape and slice: ", "shape = ",
-                            shape.DebugString(), ", slice = ",
-                            slice.DebugString());
+                            shape.DebugString(),
+                            ", slice = ", slice.DebugString());
   }
   DataType dt = DataTypeToEnum<T>::value;
   // We need to add an entry for "name" if there isn't an entry already.
@@ -115,9 +114,9 @@ Status TensorSliceWriter::Add(const string& name, const TensorShape& shape,
     CHECK_EQ(name, ssm.name()) << ProtoShortDebugString(ssm);
     TensorShape ssm_shape(ssm.shape());
     if (!shape.IsSameSize(ssm_shape)) {
-      return errors::Internal("Mismatching shapes: existing tensor = ",
-                              ssm_shape.DebugString(), ", trying to add name ",
-                              name, ", shape = ", shape.DebugString());
+      return errors::Internal(
+          "Mismatching shapes: existing tensor = ", ssm_shape.DebugString(),
+          ", trying to add name ", name, ", shape = ", shape.DebugString());
     }
     if (dt != ssm.type()) {
       return errors::Internal(
diff --git a/tensorflow/docs_src/about/bib.md b/tensorflow/docs_src/about/bib.md
index c9f0c532c62791a9fcf854f11fd2f330955ee7d6..5593a3d95c435df38174fde5db37f4dd3437acd4 100644
--- a/tensorflow/docs_src/about/bib.md
+++ b/tensorflow/docs_src/about/bib.md
@@ -60,7 +60,7 @@ author={
     Lukasz~Kaiser and
     Manjunath~Kudlur and
     Josh~Levenberg and
-    Dan~Man\'{e} and
+    Dandelion~Man\'{e} and
     Rajat~Monga and
     Sherry~Moore and
     Derek~Murray and
diff --git a/tensorflow/docs_src/about/roadmap.md b/tensorflow/docs_src/about/roadmap.md
index 3ee825ed400de93553bf69fee065fcf8ef13be4d..ce9e619b10f2476256664608f657221ff9275475 100644
--- a/tensorflow/docs_src/about/roadmap.md
+++ b/tensorflow/docs_src/about/roadmap.md
@@ -1,37 +1,86 @@
 # Roadmap
-**Last updated: January 23, 2017**
+**Last updated: Feb 12, 2018**
 
-TensorFlow is a fast moving project. In order for the community to better
-understand what the near future will bring, this document shares what we are
-working on internally. Many of these features were requested by the community,
-and we welcome
-[contributions](https://github.com/tensorflow/tensorflow/labels/stat%3Acontributions%20welcome).
+TensorFlow is a rapidly moving, community supported project. This document is intended 
+to provide guidance about priorities and focus areas of the core set of TensorFlow 
+developers and about functionality that can be expected in the upcoming releases of 
+TensorFlow. Many of these areas are driven by  community use cases, and we welcome 
+further 
+[contributions](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md) 
+to TensorFlow.
 
-The features on this list are targeted for the next few months. At this point,
-we do not have timelines for these features.
+The features below do not have concrete release dates. However, the majority can be 
+expected in the next one to two releases. 
 
-### Improve non-Python language support
+### APIs
+#### High Level APIs:
+* Easy multi-GPU utilization with Estimators
+* Easy-to-use high-level pre-made estimators for Gradient Boosted Trees, Time Series, and other models
 
-* Support for adding gradient computation for graphs constructed in other
-  languages (C++, Java, Go etc.)
+#### Eager Execution:
+* Efficient utilization of multiple GPUs
+* Distributed training (multi-machine)
+* Performance improvements
+* Simpler export to a GraphDef/SavedModel 
 
-### Making TensorFlow easier to use
-* High-level APIs
-* Well-maintained models showing best practices
+#### Keras API:
+* Better integration with tf.data (ability to call `model.fit` with data tensors)
+* Full support for Eager execution (both Eager support for the regular Keras API, and ability 
+to create Keras models Eager- style via Model subclassing)
+* Better distribution/multi-GPU support and TPU support (including a smoother model-to-estimator workflow)
 
-### Performance
-* Speed and memory benchmarks
-* Distributed full model benchmarks
-* Performance and memory usage improvements
+#### Official Models:
+* A set of 
+[reference models](https://github.com/tensorflow/models/tree/master/official) 
+across image recognition, speech, object detection, and 
+  translation that demonstrate best practices and serve as a starting point for 
+  high-performance model development.
+
+#### Contrib:
+* Deprecation notices added to parts of tf.contrib where preferred implementations exist outside of tf.contrib.
+* As much as possible, large projects inside tf.contrib moved to separate repositories.
+* The tf.contrib module will eventually be discontinued in its current form, experimental development will in future happen in other repositories.
 
-### Core Features
-* Automatic op placement ([#2126](https://github.com/tensorflow/tensorflow/issues/2126))
-* Support for graph-level functions
+
+#### Probabilistic Reasoning and Statistical Analysis:
+* Rich set of tools for probabilistic and statistical analysis in tf.distributions 
+  and tf.probability. These include new samplers, layers, optimizers, losses, and structured models
+* Statistical tools for hypothesis testing, convergence diagnostics, and sample statistics
+* Edward 2.0: High-level API for probabilistic programming
 
 ### Platforms
-* OpenCL support ([#22](https://github.com/tensorflow/tensorflow/issues/22))
+#### TFLite:
+* Increased coverage of supported ops in TFLite
+* Easier conversion of a trained TF graph for use on TFLite
+* Support for GPU acceleration in TFLite (iOS and Andorid)
+* Support for hardware accelerators via Android NeuralNets API 
+* Improved CPU performance by quantization and other network optimizations (eg. pruning, distillation)
+* Increased support for devices beyond Android and iOS (eg. RPi, Cortex-M)
+
+### Performance
+#### Distributed TensorFlow:
+* Multi-GPU support optimized for a variety of GPU topologies
+* Improved mechanisms for distributing computations on several machines
+
+#### Optimizations:
+* Mixed precision training support with initial example model and guide
+* Native TensorRT support
+* Int8 support for SkyLake via MKL
+* Dynamic loading of SIMD-optimized kernels
+
+### Documentation and Usability:
+* Updated documentation, tutorials and Getting Started guides
+* Process to enable external contributions to tutorials, documentation, and blogs showcasing best practice use-cases of TensorFlow and high-impact applications
+
+### Community and Partner Engagement
+#### Special Interest Groups: 
+* Mobilizing the community to work together in focused domains
+* [tf-distribute](https://groups.google.com/a/tensorflow.org/forum/#!forum/tf-distribute)
+: build and packaging of TF
+* More to be identified and launched
 
-### Community
-* More educational resources
-* Better integration of TensorFlow into the opensource big data ecosystem (e.g.
-[#2655](https://github.com/tensorflow/tensorflow/issues/2655))
+#### Community:
+* Incorporate public feedback on significant design decisions via a Request-for-Comment (RFC) process
+* Formalize process for external contributions to land in TensorFlow and associated projects 
+* Grow global TF communities and user groups
+* Collaborate with partners to co-develop and publish research papers
diff --git a/tensorflow/docs_src/api_guides/cc/guide.md b/tensorflow/docs_src/api_guides/cc/guide.md
index 81fb1e1fda277e8035ada5a410b966fe2de35a09..4e51ada58a3f85e4b21f1c1aec036116d37a72cf 100644
--- a/tensorflow/docs_src/api_guides/cc/guide.md
+++ b/tensorflow/docs_src/api_guides/cc/guide.md
@@ -1,6 +1,6 @@
 # C++ API
 
-Note: By default [tensorflow.org](http://tensorflow.org) shows docs for the
+Note: By default [tensorflow.org](https://www.tensorflow.org) shows docs for the
 most recent stable version. The instructions in this doc require building from
 source. You will probably want to build from the `master` version of tensorflow.
 You should, as a result, be sure you are following the
diff --git a/tensorflow/docs_src/api_guides/python/client.md b/tensorflow/docs_src/api_guides/python/client.md
index 97c19863600a4b67c7af966d3fd2ef8def36fa20..eef23696db27e187124d2c0921c055c2da6f5613 100644
--- a/tensorflow/docs_src/api_guides/python/client.md
+++ b/tensorflow/docs_src/api_guides/python/client.md
@@ -3,8 +3,8 @@
 
 This library contains classes for launching graphs and executing operations.
 
-The @{$get_started/get_started} guide has
-examples of how a graph is launched in a @{tf.Session}.
+@{$programmers_guide/low_level_intro$This guide} has examples of how a graph
+is launched in a @{tf.Session}.
 
 ## Session management
 
diff --git a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.entropy.md b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.entropy.md
deleted file mode 100644
index fc5d5d70d7ebf42c16294c84c2cc3f8381dae236..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.entropy.md
+++ /dev/null
@@ -1 +0,0 @@
-# BayesFlow Entropy (contrib)
diff --git a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_graph.md b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_graph.md
deleted file mode 100644
index d855787ae695f115368ab76671182f3a6e490411..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_graph.md
+++ /dev/null
@@ -1 +0,0 @@
-# BayesFlow Stochastic Graph (contrib)
diff --git a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_tensor.md b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_tensor.md
deleted file mode 100644
index 1cc1ac5d7e670a243f1dcda6ef8c59b6c6d8de2d..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_tensor.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# BayesFlow Stochastic Tensors (contrib)
-[TOC]
-
diff --git a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.variational_inference.md b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.variational_inference.md
deleted file mode 100644
index 8f08c09c8fbbc9b5b6ab8612f140f4b7ca7d8b73..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.variational_inference.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# BayesFlow Variational Inference (contrib)
-[TOC]
-
-Variational inference.
diff --git a/tensorflow/docs_src/api_guides/python/contrib.copy_graph.md b/tensorflow/docs_src/api_guides/python/contrib.copy_graph.md
deleted file mode 100644
index f61f4c764d289814439bb8c5d33bdfb46d208866..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/api_guides/python/contrib.copy_graph.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# Copying Graph Elements (contrib)
-[TOC]
-
-Functions for copying elements from one graph to another.
diff --git a/tensorflow/docs_src/api_guides/python/contrib.distributions.md b/tensorflow/docs_src/api_guides/python/contrib.distributions.md
index 7a3d509b75198461430195aa70a336f94b7f8cfa..533d7dac1373f61ca92dba288a7d29e07e0f37d3 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.distributions.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.distributions.md
@@ -17,7 +17,6 @@ initialized with parameters that define the distributions.
 
 *   @{tf.contrib.distributions.Binomial}
 *   @{tf.contrib.distributions.Bernoulli}
-*   @{tf.contrib.distributions.BernoulliWithSigmoidProbs}
 *   @{tf.contrib.distributions.Beta}
 *   @{tf.contrib.distributions.Categorical}
 *   @{tf.contrib.distributions.Chi2}
diff --git a/tensorflow/docs_src/api_guides/python/contrib.opt.md b/tensorflow/docs_src/api_guides/python/contrib.opt.md
deleted file mode 100644
index 944a80a5ccb0201b5b5a0cf3b57ca31dfc7ce01a..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/api_guides/python/contrib.opt.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# Optimization (contrib)
-[TOC]
-
-opt: A module containing optimization routines.
diff --git a/tensorflow/docs_src/api_guides/python/contrib.signal.md b/tensorflow/docs_src/api_guides/python/contrib.signal.md
index 85ef3ad1341380607f457e9112e39930c569357d..0f7690f80a5bcb4a776df21cf0768f1540f01baf 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.signal.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.signal.md
@@ -28,14 +28,14 @@ The `axis` parameter to @{tf.contrib.signal.frame} allows you to frame tensors
 with inner structure (e.g. a spectrogram):
 
 ```python
-# `magnitude_spectrograms` is a [batch_size, ?, 127] tensor of spectrograms. We
+# `magnitude_spectrograms` is a [batch_size, ?, 129] tensor of spectrograms. We
 # would like to produce overlapping fixed-size spectrogram patches; for example,
 # for use in a situation where a fixed size input is needed.
 magnitude_spectrograms = tf.abs(tf.contrib.signal.stft(
     signals, frame_length=256, frame_step=64, fft_length=256))
 
-# `spectrogram_patches` is a [batch_size, ?, 64, 127] tensor containing a
-# variable number of [64, 127] spectrogram patches per batch item.
+# `spectrogram_patches` is a [batch_size, ?, 64, 129] tensor containing a
+# variable number of [64, 129] spectrogram patches per batch item.
 spectrogram_patches = tf.contrib.signal.frame(
     magnitude_spectrograms, frame_length=64, frame_step=16, axis=1)
 ```
diff --git a/tensorflow/docs_src/api_guides/python/histogram_ops.md b/tensorflow/docs_src/api_guides/python/histogram_ops.md
deleted file mode 100644
index dbd4555429b2a09bdf32e2e421b2d55fac0c0fd0..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/api_guides/python/histogram_ops.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# Histograms
-[TOC]
-
-## Histograms
-
-*   @{tf.histogram_fixed_width}
diff --git a/tensorflow/docs_src/api_guides/python/image.md b/tensorflow/docs_src/api_guides/python/image.md
index a2c8c3c3c92e2acf177da104304746fb34281de7..051e4547ee6900ded85ae18fb80b51db1eacb009 100644
--- a/tensorflow/docs_src/api_guides/python/image.md
+++ b/tensorflow/docs_src/api_guides/python/image.md
@@ -19,6 +19,7 @@ Note: The PNG encode and decode Ops support RGBA, but the conversions Ops
 presently only support RGB, HSV, and GrayScale. Presently, the alpha channel has
 to be stripped from the image and re-attached using slicing ops.
 
+*   @{tf.image.decode_bmp}
 *   @{tf.image.decode_gif}
 *   @{tf.image.decode_jpeg}
 *   @{tf.image.encode_jpeg}
diff --git a/tensorflow/docs_src/api_guides/python/input_dataset.md b/tensorflow/docs_src/api_guides/python/input_dataset.md
index 94c89c37d520fd1c1ec65fedc813a7b348120913..a6e2fc48e0020ff130f034f747d9ca48b4830c2e 100644
--- a/tensorflow/docs_src/api_guides/python/input_dataset.md
+++ b/tensorflow/docs_src/api_guides/python/input_dataset.md
@@ -18,7 +18,6 @@ Classes that create a dataset from input files.
 Static methods in `Dataset` that create new datasets.
 
 *   @{tf.data.Dataset.from_generator}
-*   @{tf.data.Dataset.from_sparse_tensor_slices}
 *   @{tf.data.Dataset.from_tensor_slices}
 *   @{tf.data.Dataset.from_tensors}
 *   @{tf.data.Dataset.list_files}
@@ -59,8 +58,12 @@ Custom transformation functions can be applied to a `Dataset` using @{tf.data.Da
 *   @{tf.contrib.data.enumerate_dataset}
 *   @{tf.contrib.data.group_by_window}
 *   @{tf.contrib.data.ignore_errors}
+*   @{tf.contrib.data.map_and_batch}
+*   @{tf.contrib.data.padded_batch_and_drop_remainder}
+*   @{tf.contrib.data.parallel_interleave}
 *   @{tf.contrib.data.rejection_resample}
-*   @{tf.contrib.data.sloppy_interleave}
+*   @{tf.contrib.data.scan}
+*   @{tf.contrib.data.shuffle_and_repeat}
 *   @{tf.contrib.data.unbatch}
 
 ## Iterating over datasets
@@ -77,5 +80,7 @@ The `Iterator` class also contains static methods that create a @{tf.data.Iterat
 
 ## Extra functions from `tf.contrib.data`
 
+*   @{tf.contrib.data.get_single_element}
+*   @{tf.contrib.data.make_saveable_from_iterator}
 *   @{tf.contrib.data.read_batch_features}
 
diff --git a/tensorflow/docs_src/api_guides/python/meta_graph.md b/tensorflow/docs_src/api_guides/python/meta_graph.md
index fa4cee87007cfd77663e74956fcfe0f15c55c52c..0eff9000931666dce742358a290f25bb2b5a7b16 100644
--- a/tensorflow/docs_src/api_guides/python/meta_graph.md
+++ b/tensorflow/docs_src/api_guides/python/meta_graph.md
@@ -221,15 +221,9 @@ Here are some of the typical usage models:
     # Addes loss and train.
     labels = tf.constant(0, tf.int32, shape=[100], name="labels")
     batch_size = tf.size(labels)
-    labels = tf.expand_dims(labels, 1)
-    indices = tf.expand_dims(tf.range(0, batch_size), 1)
-    concated = tf.concat([indices, labels], 1)
-    onehot_labels = tf.sparse_to_dense(
-        concated, tf.stack([batch_size, 10]), 1.0, 0.0)
     logits = tf.get_collection("logits")[0]
-    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
-        labels=onehot_labels, logits=logits, name="xentropy")
-    loss = tf.reduce_mean(cross_entropy, name="xentropy_mean")
+    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
+                                                  logits=logits)
 
     tf.summary.scalar('loss', loss)
     # Creates the gradient descent optimizer with the given learning rate.
diff --git a/tensorflow/docs_src/api_guides/python/nn.md b/tensorflow/docs_src/api_guides/python/nn.md
index eb3b251099d320244bc212698c45647038df44ae..8e6fd1cff93332b84f552c18f627ba05dc67103e 100644
--- a/tensorflow/docs_src/api_guides/python/nn.md
+++ b/tensorflow/docs_src/api_guides/python/nn.md
@@ -226,6 +226,8 @@ TensorFlow provides several operations that help you perform classification.
 *   @{tf.nn.softmax}
 *   @{tf.nn.log_softmax}
 *   @{tf.nn.softmax_cross_entropy_with_logits}
+*   @{tf.nn.softmax_cross_entropy_with_logits_v2} - identical to the base
+    version, except it allows gradient propagation into the labels.
 *   @{tf.nn.sparse_softmax_cross_entropy_with_logits}
 *   @{tf.nn.weighted_cross_entropy_with_logits}
 
diff --git a/tensorflow/docs_src/api_guides/python/python_io.md b/tensorflow/docs_src/api_guides/python/python_io.md
index a5444408fe8f276028b6cedd5044947051043d31..06282e49d5247ee1ad22eb5bce872ae2c08514e2 100644
--- a/tensorflow/docs_src/api_guides/python/python_io.md
+++ b/tensorflow/docs_src/api_guides/python/python_io.md
@@ -14,16 +14,16 @@ suitable if fast sharding or other non-sequential access is desired.
 
 ## TFRecords Format Details
 
-A TFRecords file contains a sequence of strings with CRC hashes.  Each record
-has the format
+A TFRecords file contains a sequence of strings with CRC32C (32-bit CRC using
+the Castagnoli polynomial) hashes.  Each record has the format
 
     uint64 length
     uint32 masked_crc32_of_length
     byte   data[length]
     uint32 masked_crc32_of_data
 
-and the records are concatenated together to produce the file.  The CRC32s
-are [described here](https://en.wikipedia.org/wiki/Cyclic_redundancy_check),
-and the mask of a CRC is
+and the records are concatenated together to produce the file. CRCs are
+[described here](https://en.wikipedia.org/wiki/Cyclic_redundancy_check), and
+the mask of a CRC is
 
     masked_crc = ((crc >> 15) | (crc << 17)) + 0xa282ead8ul
diff --git a/tensorflow/docs_src/api_guides/python/reading_data.md b/tensorflow/docs_src/api_guides/python/reading_data.md
index b3ebaa0f0a3645256d4e92632a10a53e4eb243cb..b3ca9583704eb30e097bb4d7c438ea8c3662df40 100644
--- a/tensorflow/docs_src/api_guides/python/reading_data.md
+++ b/tensorflow/docs_src/api_guides/python/reading_data.md
@@ -1,11 +1,11 @@
 # Reading data
 
 Note: The preferred way to feed data into a tensorflow program is using the
-@{$datasets$Datasets API}.
+@{$datasets$`tf.data` API}.
 
 There are four methods of getting data into a TensorFlow program:
 
-*   `Dataset` API: Easily construct a complex input pipeline. (preferred method)
+*   `tf.data` API: Easily construct a complex input pipeline. (preferred method)
 *   Feeding: Python code provides the data when running each step.
 *   `QueueRunner`: a queue-based input pipeline reads the data from files
     at the beginning of a TensorFlow graph.
@@ -14,26 +14,27 @@ There are four methods of getting data into a TensorFlow program:
 
 [TOC]
 
-## Dataset API
+## `tf.data` API
 
 See the @{$datasets$programmer's guide} for an in-depth explanation of
-@{tf.data.Dataset}. The `Dataset` API allows you to extract and preprocess data
-from different input/file formats, and apply transformations such as batch,
-shuffle, and map to the dataset. This is an improved version of the old input
-methods, feeding and `QueueRunner`.
+@{tf.data.Dataset}. The `tf.data` API enables you to extract and preprocess data
+from different input/file formats, and apply transformations such as batching,
+shuffling, and mapping functions over the dataset. This is an improved version
+of the old input methods---feeding and `QueueRunner`---which are described
+below for historical purposes.
 
 ## Feeding
 
+Warning: "Feeding" is the least efficient way to feed data into a TensorFlow
+program and should only be used for small experiments and debugging.
+
 TensorFlow's feed mechanism lets you inject data into any Tensor in a
-computation graph. A python computation can thus feed data directly into the
+computation graph. A Python computation can thus feed data directly into the
 graph.
 
 Supply feed data through the `feed_dict` argument to a run() or eval() call
 that initiates computation.
 
-Warning: "Feeding" is the least efficient way to feed data into a tensorflow
-program and should only be used for small experiments and debugging.
-
 ```python
 with tf.Session():
   input = tf.placeholder(tf.float32)
@@ -50,11 +51,14 @@ it is executed without a feed, so you won't forget to feed it.
 
 An example using `placeholder` and feeding to train on MNIST data can be found
 in
-[`tensorflow/examples/tutorials/mnist/fully_connected_feed.py`](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/fully_connected_feed.py),
-and is described in the @{$mechanics$MNIST tutorial}.
+[`tensorflow/examples/tutorials/mnist/fully_connected_feed.py`](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/fully_connected_feed.py).
 
 ## `QueueRunner`
 
+Warning: This section discusses implementing input pipelines using the
+queue-based APIs which can be cleanly replaced by the @{$datasets$`tf.data`
+API}.
+
 A typical queue-based pipeline for reading records from files has the following stages:
 
 1.  The list of filenames
@@ -66,9 +70,6 @@ A typical queue-based pipeline for reading records from files has the following
 7.  *Optional* preprocessing
 8.  Example queue
 
-Warning: This section discusses implementing input pipelines using the
-queue-based APIs which can be cleanly replaced by the @{$datasets$Datasets API}.
-
 ### Filenames, shuffling, and epoch limits
 
 For the list of filenames, use either a constant string Tensor (like
@@ -173,14 +174,25 @@ For example,
 [`tensorflow/examples/how_tos/reading_data/convert_to_records.py`](https://www.tensorflow.org/code/tensorflow/examples/how_tos/reading_data/convert_to_records.py)
 converts MNIST data to this format.
 
-To read a file of TFRecords, use
-@{tf.TFRecordReader} with
-the @{tf.parse_single_example}
-decoder. The `parse_single_example` op decodes the example protocol buffers into
-tensors. An MNIST example using the data produced by `convert_to_records` can be
-found in
-[`tensorflow/examples/how_tos/reading_data/fully_connected_reader.py`](https://www.tensorflow.org/code/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py),
-which you can compare with the `fully_connected_feed` version.
+The recommended way to read a TFRecord file is with a @{tf.data.TFRecordDataset}, [as in this example](https://www.tensorflow.org/code/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py):
+
+``` python
+    dataset = tf.data.TFRecordDataset(filename)
+    dataset = dataset.repeat(num_epochs)
+
+    # map takes a python function and applies it to every sample
+    dataset = dataset.map(decode)
+```
+
+To acomplish the same task with a queue based input pipeline requires the following code 
+(using the same `decode` function from the above example): 
+
+``` python
+  filename_queue = tf.train.string_input_producer([filename], num_epochs=num_epochs)
+  reader = tf.TFRecordReader()
+  _, serialized_example = reader.read(filename_queue)
+  image,label = decode(serialized_example)
+```
 
 ### Preprocessing
 
@@ -499,7 +511,7 @@ You can have the train and eval in the same graph in the same process, and share
 their trained variables or layers. See @{$variables$the shared variables tutorial}.
 
 To support the single-graph approach
-@{$programmers_guide/datasets$Datasets} also supplies
+@{$programmers_guide/datasets$`tf.data`} also supplies
 @{$programmers_guide/datasets#creating_an_iterator$advanced iterator types} that
 that allow the user to change the input pipeline without rebuilding the graph or
 session.
diff --git a/tensorflow/docs_src/get_started/linear_regression.md b/tensorflow/docs_src/api_guides/python/regression_examples.md
similarity index 97%
rename from tensorflow/docs_src/get_started/linear_regression.md
rename to tensorflow/docs_src/api_guides/python/regression_examples.md
index 45cb9d829cfbc1b1efb735cc1ea27e33159db724..7de2be05521d9293e33664cdbbd7bf16b9ad7c52 100644
--- a/tensorflow/docs_src/get_started/linear_regression.md
+++ b/tensorflow/docs_src/api_guides/python/regression_examples.md
@@ -38,7 +38,7 @@ The preceding examples rely on the following data set utility:
   <tr> <th>Utility</th> <th>Description</th></tr>
 
   <tr>
-    <td><a href="../../examples/get_started/regression/imports85.py">imports85.py</a></td>
+    <td><a href="https://www.tensorflow.org/code/tensorflow/examples/get_started/regression/imports85.py">imports85.py</a></td>
     <td>This program provides utility functions that load the
         <tt>imports85</tt> data set into formats that other TensorFlow
         programs (for example, <tt>linear_regression.py</tt> and
@@ -229,4 +229,4 @@ passed through to the `model_fn` when the `model_fn` is called.
 The `model_fn` returns an
 @{tf.estimator.EstimatorSpec$`EstimatorSpec`} which is a simple structure
 indicating to the `Estimator` which operations should be run to accomplish
-varions tasks.
+various tasks.
diff --git a/tensorflow/docs_src/api_guides/python/script_ops.md b/tensorflow/docs_src/api_guides/python/script_ops.md
deleted file mode 100644
index ab49a570c135fefdcb3f4c7d4e4d35df38092b98..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/api_guides/python/script_ops.md
+++ /dev/null
@@ -1,13 +0,0 @@
-# Wraps python functions
-
-Note: Functions taking `Tensor` arguments can also take anything accepted by
-@{tf.convert_to_tensor}.
-
-[TOC]
-
-## Script Language Operators
-
-TensorFlow provides allows you to wrap python/numpy functions as
-TensorFlow operators.
-
-*   @{tf.py_func}
diff --git a/tensorflow/docs_src/api_guides/python/train.md b/tensorflow/docs_src/api_guides/python/train.md
index 943394f4ae05b9b5b379a58e2fc86341fbbfb6c4..80fe9784de64c3b3f1843cad07bb02507f682eaf 100644
--- a/tensorflow/docs_src/api_guides/python/train.md
+++ b/tensorflow/docs_src/api_guides/python/train.md
@@ -24,6 +24,8 @@ of the subclasses.
 *   @{tf.train.ProximalAdagradOptimizer}
 *   @{tf.train.RMSPropOptimizer}
 
+See @{tf.contrib.opt} for more optimizers.
+
 ## Gradient Computation
 
 TensorFlow provides functions to compute the derivatives for a given
@@ -57,6 +59,9 @@ gradients.
 *   @{tf.train.natural_exp_decay}
 *   @{tf.train.piecewise_constant}
 *   @{tf.train.polynomial_decay}
+*   @{tf.train.cosine_decay}
+*   @{tf.train.linear_cosine_decay}
+*   @{tf.train.noisy_linear_cosine_decay}
 
 ## Moving Averages
 
diff --git a/tensorflow/docs_src/community/benchmarks.md b/tensorflow/docs_src/community/benchmarks.md
index 3bdbabf4bbc7a9ebb1992619cb3c51a95429a0b1..67856ce8698aec0cecf6718d8d4580c67a9eb321 100644
--- a/tensorflow/docs_src/community/benchmarks.md
+++ b/tensorflow/docs_src/community/benchmarks.md
@@ -1,4 +1,4 @@
-# Benchmarks
+# Defining and Running Benchmarks
 
 This guide contains instructions for defining and running a TensorFlow benchmark. These benchmarks store output in [TestResults](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/test_log.proto) format. If these benchmarks are added to TensorFlow github repo, then we will run them daily with our continuous build and display a graph on our dashboard: https://benchmarks-dot-tensorflow-testing.appspot.com/.
 
@@ -52,6 +52,19 @@ Key points to note in the example above:
 * Benchmark method calls `report_benchmark` to report the metric value.
 
 
+## Running with Python
+
+Use the `--benchmarks` flag to run the benchmark with python. A [BenchmarkEntries](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/util/test_log.proto) proto will be printed.
+
+```
+python sample_benchmark.py --benchmarks=SampleBenchmark
+```
+
+Setting the flag as `--benchmarks=.` or `--benchmarks=all` would work as well.
+
+(Please ensure that Tensorflow is installed to successfully import the package in the line `import tensorflow as tf`. For installation instructions, see [Installing TensorFlow](https://www.tensorflow.org/install/). This step is not necessary when running with bazel.)
+
+
 ## Adding a `bazel` Target
 
 We have a special target called `tf_py_logged_benchmark` for benchmarks defined under TensorFlow github repo. `tf_py_logged_benchmark` should wrap around a regular `py_test` target. Running a `tf_py_logged_benchmark` would print a [TestResults](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/util/test_log.proto) proto. Defining a `tf_py_logged_benchmark` also lets us run it with TensorFlow continuous build.
@@ -84,7 +97,7 @@ load("//tensorflow/tools/test:performance.bzl", "tf_py_logged_benchmark")
 
 tf_py_logged_benchmark(
     name = "sample_logged_benchmark",
-    target = "//tensorflow/tools/test:sample_benchmark",
+    target = "//tensorflow/examples/benchmark:sample_benchmark",
 )
 ```
 
diff --git a/tensorflow/docs_src/community/style_guide.md b/tensorflow/docs_src/community/style_guide.md
index a4c4e2674ee78b2248323a0275a737d6417c5f99..c9268790a71fad9328f60f6a889c19c32117497e 100644
--- a/tensorflow/docs_src/community/style_guide.md
+++ b/tensorflow/docs_src/community/style_guide.md
@@ -59,14 +59,14 @@ filegroup(
             "**/OWNERS",
         ],
     ),
-    visibility = ["//third_party/tensorflow:__subpackages__"],
+    visibility = ["//tensorflow:__subpackages__"],
 )
 ```
 
 * When adding new BUILD file, add this line to `tensorflow/BUILD` file into `all_opensource_files` target.
 
 ```
-"//third_party/tensorflow/<directory>:all_files",
+"//tensorflow/<directory>:all_files",
 ```
 
 * For all Python BUILD targets (libraries and tests) add next line:
diff --git a/tensorflow/docs_src/community/welcome.md b/tensorflow/docs_src/community/welcome.md
index a3abf2550757e825ae2d023018def919de1bcd8f..9f6fe91b1490ef4ffe43acc877ecb83cc9121118 100644
--- a/tensorflow/docs_src/community/welcome.md
+++ b/tensorflow/docs_src/community/welcome.md
@@ -12,7 +12,6 @@ The source code for TensorFlow is on
 Before contributing to TensorFlow source code, please review the
 [Contribution guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md).
 
-
 ### Projects developed by the TensorFlow community
 
 The TensorFlow community has created many great projects around TensorFlow, including:
@@ -65,5 +64,6 @@ please read the following list carefully:
     [TensorFlow issues tracker](https://github.com/tensorflow/tensorflow/issues)
     on GitHub.  For example, use the issue tracker to request a
     new operation in TensorFlow.
-
+  * To report vulnerabilities, please follow our
+    [vulnerability disclosure guidelines](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/SECURITY.md).
 
diff --git a/tensorflow/docs_src/deploy/distributed.md b/tensorflow/docs_src/deploy/distributed.md
index f3e2fac49f21e0777bc2c9f46c8a5d5c12f9bed4..d7ed6b1debdf256a800aed7304152acf5972bf72 100644
--- a/tensorflow/docs_src/deploy/distributed.md
+++ b/tensorflow/docs_src/deploy/distributed.md
@@ -2,8 +2,8 @@
 
 This document shows how to create a cluster of TensorFlow servers, and how to
 distribute a computation graph across that cluster. We assume that you are
-familiar with the @{$get_started/get_started$basic concepts} of
-writing TensorFlow programs.
+familiar with the @{$programmers_guide/low_level_intro$basic concepts} of
+writing low level TensorFlow programs.
 
 ## Hello distributed TensorFlow!
 
diff --git a/tensorflow/docs_src/deploy/index.md b/tensorflow/docs_src/deploy/index.md
index 5831960b4f6e383a6babb0823893a5d9ec5017f0..07b1bc9257ff7b132d22ac186a2f462e9c784867 100644
--- a/tensorflow/docs_src/deploy/index.md
+++ b/tensorflow/docs_src/deploy/index.md
@@ -7,6 +7,8 @@ the following documents:
     a cluster of TensorFlow servers.
   * @{$hadoop$How to run TensorFlow on Hadoop}, which has a highly
     self-explanatory title.
+  * @{$s3$How to run TensorFlow with the S3 filesystem}, which explains how
+    to run TensorFlow with the S3 file system.
   * The entire document set for [TensorFlow serving](/serving), an open-source,
     flexible, high-performance serving system for machine-learned models
     designed for production environments. TensorFlow Serving provides
diff --git a/tensorflow/docs_src/deploy/leftnav_files b/tensorflow/docs_src/deploy/leftnav_files
index f8f8d578e602cac8dd814326e318ebe0e85ec700..c682e7add16c741279aedb40c1b12f4ca8f0286a 100644
--- a/tensorflow/docs_src/deploy/leftnav_files
+++ b/tensorflow/docs_src/deploy/leftnav_files
@@ -1,3 +1,4 @@
 index.md
 distributed.md
 hadoop.md
+s3.md
diff --git a/tensorflow/docs_src/deploy/s3.md b/tensorflow/docs_src/deploy/s3.md
new file mode 100644
index 0000000000000000000000000000000000000000..38f84286347622d1de0646cdc621d5fb1447e588
--- /dev/null
+++ b/tensorflow/docs_src/deploy/s3.md
@@ -0,0 +1,40 @@
+# How to run TensorFlow on S3
+
+This document describes how to run TensorFlow on S3 file system.
+
+## S3
+
+We assume that you are familiar with @{$reading_data$reading data}.
+
+To use S3 with TensorFlow, change the file paths you use to read and write
+data to an S3 path. For example:
+
+```python
+filenames = ["s3://bucketname/path/to/file1.tfrecord",
+             "s3://bucketname/path/to/file2.tfrecord"]
+dataset = tf.data.TFRecordDataset(filenames)
+```
+
+When reading or writing data on S3 with your TensorFlow program, the behavior
+could be controlled by various environmental variables:
+
+*   **AWS_REGION**: By default, regional endpoint is used for S3, with region
+    controlled by `AWS_REGION`. If `AWS_REGION` is not specified, then
+    `us-east-1` is used.
+*   **S3_ENDPOINT**: The endpoint could be overridden explicitly with
+    `S3_ENDPOINT` specified.
+*   **S3_USE_HTTPS**: HTTPS is used to access S3 by default, unless
+    `S3_USE_HTTPS=0`.
+*   **S3_VERIFY_SSL**: If HTTPS is used, SSL verification could be disabled
+    with `S3_VERIFY_SSL=0`.
+
+To read or write objects in a bucket that is no publicly accessible,
+AWS credentials must be provided through one of the following methods:
+
+*   Set credentials in the AWS credentials profile file on the local system,
+    located at: `~/.aws/credentials` on Linux, macOS, or Unix, or
+    `C:\Users\USERNAME\.aws\credentials` on Windows.
+*   Set the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment
+    variables.
+*   If TensorFlow is deployed on an EC2 instance, specify an IAM role and then
+    give the EC2 instance access to that role.
diff --git a/tensorflow/docs_src/extend/add_filesys.md b/tensorflow/docs_src/extend/add_filesys.md
index 44ba198998c7103d9a45e3ce6e6b9235a0b8bfa0..06f11de4eb0ea7878b01cd37d994c5a40ec400be 100644
--- a/tensorflow/docs_src/extend/add_filesys.md
+++ b/tensorflow/docs_src/extend/add_filesys.md
@@ -35,6 +35,7 @@ Note that TensorFlow already includes many filesystem implementations, such as:
 
 *   HDFS - the Hadoop File System
 *   GCS - Google Cloud Storage filesystem
+*   S3 - Amazon Simple Storage Service filesystem
 *   A "memory-mapped-file" filesystem
 
 The rest of this guide describes how to implement a custom filesystem.
@@ -80,6 +81,8 @@ filesystem implementations call their existing libraries. Examples include:
     plugin](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/hadoop/hadoop_file_system.h)
 *   [GCS
     plugin](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/cloud/gcs_file_system.h)
+*   [S3
+    plugin](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/platform/s3/s3_file_system.h)
 
 #### The File interfaces
 
diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index c52279b212f46215125a20815f97b07b012a5513..15075e1df8e703415b4acb8e53f76dc9a4a41b50 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -1,6 +1,6 @@
 # Adding a New Op
 
-Note: By default [tensorflow.org](http://tensorflow.org) shows docs for the
+Note: By default [www.tensorflow.org](https://www.tensorflow.org) shows docs for the
 most recent stable version. The instructions in this doc require building from
 source. You will probably want to build from the `master` version of tensorflow.
 You should, as a result, be sure you are following the
diff --git a/tensorflow/docs_src/extend/architecture.md b/tensorflow/docs_src/extend/architecture.md
index 21816502acec7abfca670cac1bceda3e29144b53..c0fc714a4405d6189d187f1552ab96ea2d37dd24 100644
--- a/tensorflow/docs_src/extend/architecture.md
+++ b/tensorflow/docs_src/extend/architecture.md
@@ -7,7 +7,7 @@ learning models and system-level optimizations.
 This document describes the system architecture that makes possible this
 combination of scale and flexibility. It assumes that you have basic familiarity
 with TensorFlow programming concepts such as the computation graph, operations,
-and sessions. See @{$get_started/get_started$Getting Started}
+and sessions. See @{$programmers_guide/low_level_intro$this document}
 for an introduction to these topics. Some familiarity
 with @{$distributed$distributed TensorFlow}
 will also be helpful.
diff --git a/tensorflow/docs_src/extend/estimators.md b/tensorflow/docs_src/extend/estimators.md
deleted file mode 100644
index 7e6507c5840fe621aeb91842c9a83554e568db99..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/extend/estimators.md
+++ /dev/null
@@ -1,698 +0,0 @@
-# Creating Estimators in tf.estimator
-
-The tf.estimator framework makes it easy to construct and train machine
-learning models via its high-level Estimator API. `Estimator`
-offers classes you can instantiate to quickly configure common model types such
-as regressors and classifiers:
-
-*   @{tf.estimator.LinearClassifier}:
-    Constructs a linear classification model.
-*   @{tf.estimator.LinearRegressor}:
-    Constructs a linear regression model.
-*   @{tf.estimator.DNNClassifier}:
-    Construct a neural network classification model.
-*   @{tf.estimator.DNNRegressor}:
-    Construct a neural network regression model.
-*   @{tf.estimator.DNNLinearCombinedClassifier}:
-    Construct a neural network and linear combined classification model.
-*   @{tf.estimator.DNNLinearCombinedRegressor}:
-    Construct a neural network and linear combined regression model.
-
-But what if none of `tf.estimator`'s predefined model types meets your needs?
-Perhaps you need more granular control over model configuration, such as
-the ability to customize the loss function used for optimization, or specify
-different activation functions for each neural network layer. Or maybe you're
-implementing a ranking or recommendation system, and neither a classifier nor a
-regressor is appropriate for generating predictions.
-
-This tutorial covers how to create your own `Estimator` using the building
-blocks provided in `tf.estimator`, which will predict the ages of
-[abalones](https://en.wikipedia.org/wiki/Abalone) based on their physical
-measurements. You'll learn how to do the following:
-
-*   Instantiate an `Estimator`
-*   Construct a custom model function
-*   Configure a neural network using `tf.feature_column` and `tf.layers`
-*   Choose an appropriate loss function from `tf.losses`
-*   Define a training op for your model
-*   Generate and return predictions
-
-## Prerequisites
-
-This tutorial assumes you already know tf.estimator API basics, such as
-feature columns, input functions, and `train()`/`evaluate()`/`predict()`
-operations. If you've never used tf.estimator before, or need a refresher,
-you should first review the following tutorials:
-
-*   @{$get_started/estimator$tf.estimator Quickstart}: Quick introduction to
-    training a neural network using tf.estimator.
-*   @{$wide$TensorFlow Linear Model Tutorial}: Introduction to
-    feature columns, and an overview on building a linear classifier in
-    tf.estimator.
-*   @{$input_fn$Building Input Functions with tf.estimator}: Overview of how
-    to construct an input_fn to preprocess and feed data into your models.
-
-## An Abalone Age Predictor {#abalone-predictor}
-
-It's possible to estimate the age of an
-[abalone](https://en.wikipedia.org/wiki/Abalone) (sea snail) by the number of
-rings on its shell. However, because this task requires cutting, staining, and
-viewing the shell under a microscope, it's desirable to find other measurements
-that can predict age.
-
-The [Abalone Data Set](https://archive.ics.uci.edu/ml/datasets/Abalone) contains
-the following
-[feature data](https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.names)
-for abalone:
-
-| Feature        | Description                                               |
-| -------------- | --------------------------------------------------------- |
-| Length         | Length of abalone (in longest direction; in mm)           |
-| Diameter       | Diameter of abalone (measurement perpendicular to length; in mm)|
-| Height         | Height of abalone (with its meat inside shell; in mm)     |
-| Whole Weight   | Weight of entire abalone (in grams)                       |
-| Shucked Weight | Weight of abalone meat only (in grams)                    |
-| Viscera Weight | Gut weight of abalone (in grams), after bleeding          |
-| Shell Weight   | Weight of dried abalone shell (in grams)                  |
-
-The label to predict is number of rings, as a proxy for abalone age.
-
-![Abalone shell](https://www.tensorflow.org/images/abalone_shell.jpg)
-**[“Abalone shell”](https://www.flickr.com/photos/thenickster/16641048623/) (by [Nicki Dugan
-Pogue](https://www.flickr.com/photos/thenickster/), CC BY-SA 2.0)**
-
-## Setup
-
-This tutorial uses three data sets.
-[`abalone_train.csv`](http://download.tensorflow.org/data/abalone_train.csv)
-contains labeled training data comprising 3,320 examples.
-[`abalone_test.csv`](http://download.tensorflow.org/data/abalone_test.csv)
-contains labeled test data for 850 examples.
-[`abalone_predict`](http://download.tensorflow.org/data/abalone_predict.csv)
-contains 7 examples on which to make predictions.
-
-The following sections walk through writing the `Estimator` code step by step;
-the [full, final code is available
-here](https://www.tensorflow.org/code/tensorflow/examples/tutorials/estimators/abalone.py).
-
-## Loading Abalone CSV Data into TensorFlow Datasets
-
-To feed the abalone dataset into the model, you'll need to download and load the
-CSVs into TensorFlow `Dataset`s. First, add some standard Python and TensorFlow
-imports, and set up FLAGS:
-
-```python
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import sys
-import tempfile
-
-# Import urllib
-from six.moves import urllib
-
-import numpy as np
-import tensorflow as tf
-
-FLAGS = None
-```
-
-Enable logging:
-
-```python
-tf.logging.set_verbosity(tf.logging.INFO)
-```
-
-Then define a function to load the CSVs (either from files specified in
-command-line options, or downloaded from
-[tensorflow.org](https://www.tensorflow.org/)):
-
-```python
-def maybe_download(train_data, test_data, predict_data):
-  """Maybe downloads training data and returns train and test file names."""
-  if train_data:
-    train_file_name = train_data
-  else:
-    train_file = tempfile.NamedTemporaryFile(delete=False)
-    urllib.request.urlretrieve(
-        "http://download.tensorflow.org/data/abalone_train.csv",
-        train_file.name)
-    train_file_name = train_file.name
-    train_file.close()
-    print("Training data is downloaded to %s" % train_file_name)
-
-  if test_data:
-    test_file_name = test_data
-  else:
-    test_file = tempfile.NamedTemporaryFile(delete=False)
-    urllib.request.urlretrieve(
-        "http://download.tensorflow.org/data/abalone_test.csv", test_file.name)
-    test_file_name = test_file.name
-    test_file.close()
-    print("Test data is downloaded to %s" % test_file_name)
-
-  if predict_data:
-    predict_file_name = predict_data
-  else:
-    predict_file = tempfile.NamedTemporaryFile(delete=False)
-    urllib.request.urlretrieve(
-        "http://download.tensorflow.org/data/abalone_predict.csv",
-        predict_file.name)
-    predict_file_name = predict_file.name
-    predict_file.close()
-    print("Prediction data is downloaded to %s" % predict_file_name)
-
-  return train_file_name, test_file_name, predict_file_name
-```
-
-Finally, create `main()` and load the abalone CSVs into `Datasets`, defining
-flags to allow users to optionally specify CSV files for training, test, and
-prediction datasets via the command line (by default, files will be downloaded
-from [tensorflow.org](https://www.tensorflow.org/)):
-
-```python
-def main(unused_argv):
-  # Load datasets
-  abalone_train, abalone_test, abalone_predict = maybe_download(
-    FLAGS.train_data, FLAGS.test_data, FLAGS.predict_data)
-
-  # Training examples
-  training_set = tf.contrib.learn.datasets.base.load_csv_without_header(
-      filename=abalone_train, target_dtype=np.int, features_dtype=np.float64)
-
-  # Test examples
-  test_set = tf.contrib.learn.datasets.base.load_csv_without_header(
-      filename=abalone_test, target_dtype=np.int, features_dtype=np.float64)
-
-  # Set of 7 examples for which to predict abalone ages
-  prediction_set = tf.contrib.learn.datasets.base.load_csv_without_header(
-      filename=abalone_predict, target_dtype=np.int, features_dtype=np.float64)
-
-if __name__ == "__main__":
-  parser = argparse.ArgumentParser()
-  parser.register("type", "bool", lambda v: v.lower() == "true")
-  parser.add_argument(
-      "--train_data", type=str, default="", help="Path to the training data.")
-  parser.add_argument(
-      "--test_data", type=str, default="", help="Path to the test data.")
-  parser.add_argument(
-      "--predict_data",
-      type=str,
-      default="",
-      help="Path to the prediction data.")
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
-```
-
-## Instantiating an Estimator
-
-When defining a model using one of tf.estimator's provided classes, such as
-`DNNClassifier`, you supply all the configuration parameters right in the
-constructor, e.g.:
-
-```python
-my_nn = tf.estimator.DNNClassifier(feature_columns=[age, height, weight],
-                                   hidden_units=[10, 10, 10],
-                                   activation_fn=tf.nn.relu,
-                                   dropout=0.2,
-                                   n_classes=3,
-                                   optimizer="Adam")
-```
-
-You don't need to write any further code to instruct TensorFlow how to train the
-model, calculate loss, or return predictions; that logic is already baked into
-the `DNNClassifier`.
-
-By contrast, when you're creating your own estimator from scratch, the
-constructor accepts just two high-level parameters for model configuration,
-`model_fn` and `params`:
-
-```python
-nn = tf.estimator.Estimator(model_fn=model_fn, params=model_params)
-```
-
-*   `model_fn`: A function object that contains all the aforementioned logic to
-    support training, evaluation, and prediction. You are responsible for
-    implementing that functionality. The next section, [Constructing the
-    `model_fn`](#constructing-modelfn) covers creating a model function in
-    detail.
-
-*   `params`: An optional dict of hyperparameters (e.g., learning rate, dropout)
-    that will be passed into the `model_fn`.
-
-Note: Just like `tf.estimator`'s predefined regressors and classifiers, the
-`Estimator` initializer also accepts the general configuration arguments
-`model_dir` and `config`.
-
-For the abalone age predictor, the model will accept one hyperparameter:
-learning rate. Define `LEARNING_RATE` as a constant at the beginning of your
-code (highlighted in bold below), right after the logging configuration:
-
-<pre class="prettyprint"><code class="lang-python">tf.logging.set_verbosity(tf.logging.INFO)
-
-<strong># Learning rate for the model
-LEARNING_RATE = 0.001</strong></code></pre>
-
-Note: Here, `LEARNING_RATE` is set to `0.001`, but you can tune this value as
-needed to achieve the best results during model training.
-
-Then, add the following code to `main()`, which creates the dict `model_params`
-containing the learning rate and instantiates the `Estimator`:
-
-```python
-# Set model params
-model_params = {"learning_rate": LEARNING_RATE}
-
-# Instantiate Estimator
-nn = tf.estimator.Estimator(model_fn=model_fn, params=model_params)
-```
-
-## Constructing the `model_fn` {#constructing-modelfn}
-
-The basic skeleton for an `Estimator` API model function looks like this:
-
-```python
-def model_fn(features, labels, mode, params):
-   # Logic to do the following:
-   # 1. Configure the model via TensorFlow operations
-   # 2. Define the loss function for training/evaluation
-   # 3. Define the training operation/optimizer
-   # 4. Generate predictions
-   # 5. Return predictions/loss/train_op/eval_metric_ops in EstimatorSpec object
-   return EstimatorSpec(mode, predictions, loss, train_op, eval_metric_ops)
-```
-
-The `model_fn` must accept three arguments:
-
-*   `features`: A dict containing the features passed to the model via
-    `input_fn`.
-*   `labels`: A `Tensor` containing the labels passed to the model via
-    `input_fn`. Will be empty for `predict()` calls, as these are the values the
-    model will infer.
-*   `mode`: One of the following @{tf.estimator.ModeKeys} string values
-    indicating the context in which the model_fn was invoked:
-    *   `tf.estimator.ModeKeys.TRAIN` The `model_fn` was invoked in training
-        mode, namely via a `train()` call.
-    *   `tf.estimator.ModeKeys.EVAL`. The `model_fn` was invoked in
-        evaluation mode, namely via an `evaluate()` call.
-    *   `tf.estimator.ModeKeys.PREDICT`. The `model_fn` was invoked in
-        predict mode, namely via a `predict()` call.
-
-`model_fn` may also accept a `params` argument containing a dict of
-hyperparameters used for training (as shown in the skeleton above).
-
-The body of the function performs the following tasks (described in detail in the
-sections that follow):
-
-*   Configuring the model—here, for the abalone predictor, this will be a neural
-    network.
-*   Defining the loss function used to calculate how closely the model's
-    predictions match the target values.
-*   Defining the training operation that specifies the `optimizer` algorithm to
-    minimize the loss values calculated by the loss function.
-
-The `model_fn` must return a @{tf.estimator.EstimatorSpec}
-object, which contains the following values:
-
-*   `mode` (required). The mode in which the model was run. Typically, you will
-    return the `mode` argument of the `model_fn` here.
-
-*   `predictions` (required in `PREDICT` mode). A dict that maps key names of
-    your choice to `Tensor`s containing the predictions from the model, e.g.:
-
-    ```python
-    predictions = {"results": tensor_of_predictions}
-    ```
-
-    In `PREDICT` mode, the dict that you return in `EstimatorSpec` will then be
-    returned by `predict()`, so you can construct it in the format in which
-    you'd like to consume it.
-
-
-*   `loss` (required in `EVAL` and `TRAIN` mode). A `Tensor` containing a scalar
-    loss value: the output of the model's loss function (discussed in more depth
-    later in [Defining loss for the model](#defining-loss)) calculated over all
-    the input examples. This is used in `TRAIN` mode for error handling and
-    logging, and is automatically included as a metric in `EVAL` mode.
-
-*   `train_op` (required only in `TRAIN` mode). An Op that runs one step of
-    training.
-
-*   `eval_metric_ops` (optional). A dict of name/value pairs specifying the
-    metrics that will be calculated when the model runs in `EVAL` mode. The name
-    is a label of your choice for the metric, and the value is the result of
-    your metric calculation. The @{tf.metrics}
-    module provides predefined functions for a variety of common metrics. The
-    following `eval_metric_ops` contains an `"accuracy"` metric calculated using
-    `tf.metrics.accuracy`:
-
-    ```python
-    eval_metric_ops = {
-        "accuracy": tf.metrics.accuracy(labels, predictions)
-    }
-    ```
-
-    If you do not specify `eval_metric_ops`, only `loss` will be calculated
-    during evaluation.
-
-### Configuring a neural network with `tf.feature_column` and `tf.layers`
-
-Constructing a [neural
-network](https://en.wikipedia.org/wiki/Artificial_neural_network) entails
-creating and connecting the input layer, the hidden layers, and the output
-layer.
-
-The input layer is a series of nodes (one for each feature in the model) that
-will accept the feature data that is passed to the `model_fn` in the `features`
-argument. If `features` contains an n-dimensional `Tensor` with all your feature
-data, then it can serve as the input layer.
-If `features` contains a dict of @{$linear#feature-columns-and-transformations$feature columns} passed to
-the model via an input function, you can convert it to an input-layer `Tensor`
-with the @{tf.feature_column.input_layer} function.
-
-```python
-input_layer = tf.feature_column.input_layer(
-    features=features, feature_columns=[age, height, weight])
-```
-
-As shown above, `input_layer()` takes two required arguments:
-
-*   `features`. A mapping from string keys to the `Tensors` containing the
-    corresponding feature data. This is exactly what is passed to the `model_fn`
-    in the `features` argument.
-*   `feature_columns`. A list of all the `FeatureColumns` in the model—`age`,
-    `height`, and `weight` in the above example.
-
-The input layer of the neural network then must be connected to one or more
-hidden layers via an [activation
-function](https://en.wikipedia.org/wiki/Activation_function) that performs a
-nonlinear transformation on the data from the previous layer. The last hidden
-layer is then connected to the output layer, the final layer in the model.
-`tf.layers` provides the `tf.layers.dense` function for constructing fully
-connected layers. The activation is controlled by the `activation` argument.
-Some options to pass to the `activation` argument are:
-
-*   `tf.nn.relu`. The following code creates a layer of `units` nodes fully
-    connected to the previous layer `input_layer` with a
-    [ReLU activation function](https://en.wikipedia.org/wiki/Rectifier_\(neural_networks\))
-    (@{tf.nn.relu}):
-
-    ```python
-    hidden_layer = tf.layers.dense(
-        inputs=input_layer, units=10, activation=tf.nn.relu)
-    ```
-
-*   `tf.nn.relu6`. The following code creates a layer of `units` nodes fully
-    connected to the previous layer `hidden_layer` with a ReLU 6 activation
-    function (@{tf.nn.relu6}):
-
-    ```python
-    second_hidden_layer = tf.layers.dense(
-        inputs=hidden_layer, units=20, activation=tf.nn.relu)
-    ```
-
-*   `None`. The following code creates a layer of `units` nodes fully connected
-    to the previous layer `second_hidden_layer` with *no* activation function,
-    just a linear transformation:
-
-    ```python
-    output_layer = tf.layers.dense(
-        inputs=second_hidden_layer, units=3, activation=None)
-    ```
-
-Other activation functions are possible, e.g.:
-
-```python
-output_layer = tf.layers.dense(inputs=second_hidden_layer,
-                               units=10,
-                               activation_fn=tf.sigmoid)
-```
-
-The above code creates the neural network layer `output_layer`, which is fully
-connected to `second_hidden_layer` with a sigmoid activation function
-(@{tf.sigmoid}). For a list of predefined
-activation functions available in TensorFlow, see the @{$python/nn#activation_functions$API docs}.
-
-Putting it all together, the following code constructs a full neural network for
-the abalone predictor, and captures its predictions:
-
-```python
-def model_fn(features, labels, mode, params):
-  """Model function for Estimator."""
-
-  # Connect the first hidden layer to input layer
-  # (features["x"]) with relu activation
-  first_hidden_layer = tf.layers.dense(features["x"], 10, activation=tf.nn.relu)
-
-  # Connect the second hidden layer to first hidden layer with relu
-  second_hidden_layer = tf.layers.dense(
-      first_hidden_layer, 10, activation=tf.nn.relu)
-
-  # Connect the output layer to second hidden layer (no activation fn)
-  output_layer = tf.layers.dense(second_hidden_layer, 1)
-
-  # Reshape output layer to 1-dim Tensor to return predictions
-  predictions = tf.reshape(output_layer, [-1])
-  predictions_dict = {"ages": predictions}
-  ...
-```
-
-Here, because you'll be passing the abalone `Datasets` using `numpy_input_fn`
-as shown below, `features` is a dict `{"x": data_tensor}`, so
-`features["x"]` is the input layer. The network contains two hidden
-layers, each with 10 nodes and a ReLU activation function. The output layer
-contains no activation function, and is
-@{tf.reshape} to a one-dimensional
-tensor to capture the model's predictions, which are stored in
-`predictions_dict`.
-
-### Defining loss for the model {#defining-loss}
-
-The `EstimatorSpec` returned by the `model_fn` must contain `loss`: a `Tensor`
-representing the loss value, which quantifies how well the model's predictions
-reflect the label values during training and evaluation runs. The @{tf.losses}
-module provides convenience functions for calculating loss using a variety of
-metrics, including:
-
-*   `absolute_difference(labels, predictions)`. Calculates loss using the
-    [absolute-difference
-    formula](https://en.wikipedia.org/wiki/Deviation_\(statistics\)#Unsigned_or_absolute_deviation)
-    (also known as L<sub>1</sub> loss).
-
-*   `log_loss(labels, predictions)`. Calculates loss using the [logistic loss
-    forumula](https://en.wikipedia.org/wiki/Loss_functions_for_classification#Logistic_loss)
-    (typically used in logistic regression).
-
-*   `mean_squared_error(labels, predictions)`. Calculates loss using the [mean
-    squared error](https://en.wikipedia.org/wiki/Mean_squared_error) (MSE; also
-    known as L<sub>2</sub> loss).
-
-The following example adds a definition for `loss` to the abalone `model_fn`
-using `mean_squared_error()` (in bold):
-
-<pre class="prettyprint"><code class="lang-python">def model_fn(features, labels, mode, params):
-  """Model function for Estimator."""
-
-  # Connect the first hidden layer to input layer
-  # (features["x"]) with relu activation
-  first_hidden_layer = tf.layers.dense(features["x"], 10, activation=tf.nn.relu)
-
-  # Connect the second hidden layer to first hidden layer with relu
-  second_hidden_layer = tf.layers.dense(
-      first_hidden_layer, 10, activation=tf.nn.relu)
-
-  # Connect the output layer to second hidden layer (no activation fn)
-  output_layer = tf.layers.dense(second_hidden_layer, 1)
-
-  # Reshape output layer to 1-dim Tensor to return predictions
-  predictions = tf.reshape(output_layer, [-1])
-  predictions_dict = {"ages": predictions}
-
-
-  <strong># Calculate loss using mean squared error
-  loss = tf.losses.mean_squared_error(labels, predictions)</strong>
-  ...</code></pre>
-
-See the @{$python/contrib.losses$API guide} for a
-full list of loss functions and more details on supported arguments and usage.
-
-Supplementary metrics for evaluation can be added to an `eval_metric_ops` dict.
-The following code defines an `rmse` metric, which calculates the root mean
-squared error for the model predictions. Note that the `labels` tensor is cast
-to a `float64` type to match the data type of the `predictions` tensor, which
-will contain real values:
-
-```python
-eval_metric_ops = {
-    "rmse": tf.metrics.root_mean_squared_error(
-        tf.cast(labels, tf.float64), predictions)
-}
-```
-
-### Defining the training op for the model
-
-The training op defines the optimization algorithm TensorFlow will use when
-fitting the model to the training data. Typically when training, the goal is to
-minimize loss. A simple way to create the training op is to instantiate a
-`tf.train.Optimizer` subclass and call the `minimize` method.
-
-The following code defines a training op for the abalone `model_fn` using the
-loss value calculated in [Defining Loss for the Model](#defining-loss), the
-learning rate passed to the function in `params`, and the gradient descent
-optimizer. For `global_step`, the convenience function
-@{tf.train.get_global_step} takes care of generating an integer variable:
-
-```python
-optimizer = tf.train.GradientDescentOptimizer(
-    learning_rate=params["learning_rate"])
-train_op = optimizer.minimize(
-    loss=loss, global_step=tf.train.get_global_step())
-```
-
-For a full list of optimizers, and other details, see the
-@{$python/train#optimizers$API guide}.
-
-### The complete abalone `model_fn`
-
-Here's the final, complete `model_fn` for the abalone age predictor. The
-following code configures the neural network; defines loss and the training op;
-and returns a `EstimatorSpec` object containing `mode`, `predictions_dict`, `loss`,
-and `train_op`:
-
-```python
-def model_fn(features, labels, mode, params):
-  """Model function for Estimator."""
-
-  # Connect the first hidden layer to input layer
-  # (features["x"]) with relu activation
-  first_hidden_layer = tf.layers.dense(features["x"], 10, activation=tf.nn.relu)
-
-  # Connect the second hidden layer to first hidden layer with relu
-  second_hidden_layer = tf.layers.dense(
-      first_hidden_layer, 10, activation=tf.nn.relu)
-
-  # Connect the output layer to second hidden layer (no activation fn)
-  output_layer = tf.layers.dense(second_hidden_layer, 1)
-
-  # Reshape output layer to 1-dim Tensor to return predictions
-  predictions = tf.reshape(output_layer, [-1])
-
-  # Provide an estimator spec for `ModeKeys.PREDICT`.
-  if mode == tf.estimator.ModeKeys.PREDICT:
-    return tf.estimator.EstimatorSpec(
-        mode=mode,
-        predictions={"ages": predictions})
-
-  # Calculate loss using mean squared error
-  loss = tf.losses.mean_squared_error(labels, predictions)
-
-  # Calculate root mean squared error as additional eval metric
-  eval_metric_ops = {
-      "rmse": tf.metrics.root_mean_squared_error(
-          tf.cast(labels, tf.float64), predictions)
-  }
-
-  optimizer = tf.train.GradientDescentOptimizer(
-      learning_rate=params["learning_rate"])
-  train_op = optimizer.minimize(
-      loss=loss, global_step=tf.train.get_global_step())
-
-  # Provide an estimator spec for `ModeKeys.EVAL` and `ModeKeys.TRAIN` modes.
-  return tf.estimator.EstimatorSpec(
-      mode=mode,
-      loss=loss,
-      train_op=train_op,
-      eval_metric_ops=eval_metric_ops)
-```
-
-## Running the Abalone Model
-
-You've instantiated an `Estimator` for the abalone predictor and defined its
-behavior in `model_fn`; all that's left to do is train, evaluate, and make
-predictions.
-
-Add the following code to the end of `main()` to fit the neural network to the
-training data and evaluate accuracy:
-
-```python
-train_input_fn = tf.estimator.inputs.numpy_input_fn(
-    x={"x": np.array(training_set.data)},
-    y=np.array(training_set.target),
-    num_epochs=None,
-    shuffle=True)
-
-# Train
-nn.train(input_fn=train_input_fn, steps=5000)
-
-# Score accuracy
-test_input_fn = tf.estimator.inputs.numpy_input_fn(
-    x={"x": np.array(test_set.data)},
-    y=np.array(test_set.target),
-    num_epochs=1,
-    shuffle=False)
-
-ev = nn.evaluate(input_fn=test_input_fn)
-print("Loss: %s" % ev["loss"])
-print("Root Mean Squared Error: %s" % ev["rmse"])
-```
-
-Note: The above code uses input functions to feed feature (`x`) and label (`y`)
-`Tensor`s into the model for both training (`train_input_fn`) and evaluation
-(`test_input_fn`). To learn more about input functions, see the tutorial
-@{$input_fn$Building Input Functions with tf.estimator}.
-
-Then run the code. You should see output like the following:
-
-```none
-...
-INFO:tensorflow:loss = 4.86658, step = 4701
-INFO:tensorflow:loss = 4.86191, step = 4801
-INFO:tensorflow:loss = 4.85788, step = 4901
-...
-INFO:tensorflow:Saving evaluation summary for 5000 step: loss = 5.581
-Loss: 5.581
-```
-
-The loss score reported is the mean squared error returned from the `model_fn`
-when run on the `ABALONE_TEST` data set.
-
-To predict ages for the `ABALONE_PREDICT` data set, add the following to
-`main()`:
-
-```python
-# Print out predictions
-predict_input_fn = tf.estimator.inputs.numpy_input_fn(
-    x={"x": prediction_set.data},
-    num_epochs=1,
-    shuffle=False)
-predictions = nn.predict(input_fn=predict_input_fn)
-for i, p in enumerate(predictions):
-  print("Prediction %s: %s" % (i + 1, p["ages"]))
-```
-
-Here, the `predict()` function returns results in `predictions` as an iterable.
-The `for` loop enumerates and prints out the results. Rerun the code, and you
-should see output similar to the following:
-
-```python
-...
-Prediction 1: 4.92229
-Prediction 2: 10.3225
-Prediction 3: 7.384
-Prediction 4: 10.6264
-Prediction 5: 11.0862
-Prediction 6: 9.39239
-Prediction 7: 11.1289
-```
-
-## Additional Resources
-
-Congrats! You've successfully built a tf.estimator `Estimator` from scratch.
-For additional reference materials on building `Estimator`s, see the following
-sections of the API guides:
-
-*   @{$python/contrib.layers$Layers}
-*   @{$python/contrib.losses$Losses}
-*   @{$python/contrib.layers#optimization$Optimization}
diff --git a/tensorflow/docs_src/extend/index.md b/tensorflow/docs_src/extend/index.md
index 00b168c6be96a158c3be69fbcefbf941c0fbbe4d..bdff60b39ec6fe939273a529ec4e46407cface8a 100644
--- a/tensorflow/docs_src/extend/index.md
+++ b/tensorflow/docs_src/extend/index.md
@@ -14,9 +14,6 @@ TensorFlow:
     add support for your own shared or distributed filesystem.
   * @{$new_data_formats$Custom Data Readers}, which details how to add support
     for your own file and record formats.
-  * @{$extend/estimators$Creating Estimators in tf.contrib.learn}, which explains how
-    to write your own custom Estimator.  For example, you could build your
-    own Estimator to implement some variation on standard linear regression.
 
 Python is currently the only language supported by TensorFlow's API stability
 promises.  However, TensorFlow also provides functionality in C++, Java, and Go,
diff --git a/tensorflow/docs_src/extend/leftnav_files b/tensorflow/docs_src/extend/leftnav_files
index 8dbb54f6f63e26e3af725fe55a4d7b2b5ba3cd5d..12315b711b6d1c74bd3b5a5195f6c5c995d2d63f 100644
--- a/tensorflow/docs_src/extend/leftnav_files
+++ b/tensorflow/docs_src/extend/leftnav_files
@@ -3,6 +3,5 @@ architecture.md
 adding_an_op.md
 add_filesys.md
 new_data_formats.md
-estimators.md
 language_bindings.md
 tool_developers/index.md
diff --git a/tensorflow/docs_src/get_started/checkpoints.md b/tensorflow/docs_src/get_started/checkpoints.md
new file mode 100644
index 0000000000000000000000000000000000000000..dfa2110e691167f54e6ea8b7a832f0a88d0ec41a
--- /dev/null
+++ b/tensorflow/docs_src/get_started/checkpoints.md
@@ -0,0 +1,238 @@
+# Checkpoints
+
+This document examines how to save and restore TensorFlow models built with
+Estimators. TensorFlow provides two model formats:
+
+*   checkpoints, which is a format dependent on the code that created
+    the model.
+*   SavedModel, which is a format independent of the code that created
+    the model.
+
+This document focuses on checkpoints. For details on SavedModel, see the
+@{$saved_model$Saving and Restoring} chapter of the
+*TensorFlow Programmer's Guide*.
+
+
+## Sample code
+
+This document relies on the same
+[Iris classification example](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py) detailed in @{$premade_estimators$Getting Started with TensorFlow}.
+To download and access the example, invoke the following two commands:
+
+```shell
+git clone https://github.com/tensorflow/models/
+cd models/samples/core/get_started
+```
+
+Most of the code snippets in this document are minor variations
+on `premade_estimator.py`.
+
+
+## Saving partially-trained models
+
+Estimators automatically write the following to disk:
+
+*   **checkpoints**, which are versions of the model created during training.
+*   **event files**, which contain information that
+    [TensorBoard](https://developers.google.com/machine-learning/glossary/#TensorBoard)
+    uses to create visualizations.
+
+To specify the top-level directory in which the Estimator stores its
+information, assign a value to the optional `model_dir` argument of any
+Estimator's constructor.  For example, the following code sets the `model_dir`
+argument to the `models/iris` directory:
+
+```python
+classifier = tf.estimator.DNNClassifier(
+    feature_columns=my_feature_columns,
+    hidden_units=[10, 10],
+    n_classes=3,
+    model_dir='models/iris')
+```
+
+Suppose you call the Estimator's `train` method. For example:
+
+
+```python
+classifier.train(
+        input_fn=lambda:train_input_fn(train_x, train_y, batch_size=100),
+                steps=200)
+```
+
+As suggested by the following diagrams, the first call to `train`
+adds checkpoints and other files to the `model_dir` directory:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/first_train_calls.png">
+</div>
+<div style="text-align: center">
+The first call to train().
+</div>
+
+
+To see the objects in the created `model_dir` directory on a
+UNIX-based system, just call `ls` as follows:
+
+```none
+$ ls -1 models/iris
+checkpoint
+events.out.tfevents.timestamp.hostname
+graph.pbtxt
+model.ckpt-1.data-00000-of-00001
+model.ckpt-1.index
+model.ckpt-1.meta
+model.ckpt-200.data-00000-of-00001
+model.ckpt-200.index
+model.ckpt-200.meta
+```
+
+The preceding `ls` command shows that the Estimator created checkpoints
+at steps 1 (the start of training) and 200 (the end of training).
+
+
+### Default checkpoint directory
+
+If you don't specify `model_dir` in an Estimator's constructor, the Estimator
+writes checkpoint files to a temporary directory chosen by Python's
+[tempfile.mkdtemp](https://docs.python.org/3/library/tempfile.html#tempfile.mkdtemp)
+function. For example, the following Estimator constructor does *not* specify
+the `model_dir` argument:
+
+```python
+classifier = tf.estimator.DNNClassifier(
+    feature_columns=my_feature_columns,
+    hidden_units=[10, 10],
+    n_classes=3)
+
+print(classifier.model_dir)
+```
+
+The `tempfile.mkdtemp` function picks a secure, temporary directory
+appropriate for your operating system. For example, a typical temporary
+directory on macOS might be something like the following:
+
+```None
+/var/folders/0s/5q9kfzfj3gx2knj0vj8p68yc00dhcr/T/tmpYm1Rwa
+```
+
+### Checkpointing Frequency
+
+By default, the Estimator saves
+[checkpoints](https://developers.google.com/machine-learning/glossary/#checkpoint)
+in the `model_dir` according to the following schedule:
+
+*   Writes a checkpoint every 10 minutes (600 seconds).
+*   Writes a checkpoint when the `train` method starts (first iteration)
+    and completes (final iteration).
+*   Retains only the 5 most recent checkpoints in the directory.
+
+You may alter the default schedule by taking the following steps:
+
+1.  Create a @{tf.estimator.RunConfig$`RunConfig`} object that defines the
+    desired schedule.
+2.  When instantiating the Estimator, pass that `RunConfig` object to the
+    Estimator's `config` argument.
+
+For example, the following code changes the checkpointing schedule to every
+20 minutes and retains the 10 most recent checkpoints:
+
+```python
+my_checkpointing_config = tf.estimator.RunConfig(
+    save_checkpoints_secs = 20*60,  # Save checkpoints every 20 minutes.
+    keep_checkpoint_max = 10,       # Retain the 10 most recent checkpoints.
+)
+
+classifier = tf.estimator.DNNClassifier(
+    feature_columns=my_feature_columns,
+    hidden_units=[10, 10],
+    n_classes=3,
+    model_dir='models/iris',
+    config=my_checkpointing_config)
+```
+
+## Restoring your model
+
+The first time you call an Estimator's `train` method, TensorFlow saves a
+checkpoint to the `model_dir`. Each subsequent call to the Estimator's
+`train`, `eval`, or `predict` method causes the following:
+
+1.  The Estimator builds the model's
+    [graph](https://developers.google.com/machine-learning/glossary/#graph)
+    by running the `model_fn()`.  (For details on the `model_fn()`, see
+    @{$custom_estimators$Creating Custom Estimators.})
+2.  The Estimator initializes the weights of the new model from the data
+    stored in the most recent checkpoint.
+
+In other words, as the following illustration suggests, once checkpoints
+exist, TensorFlow rebuilds the model each time you call `train()`,
+`evaluate()`, or `predict()`.
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/subsequent_calls.png">
+</div>
+<div style="text-align: center">
+Subsequent calls to train(), evaluate(), or predict()
+</div>
+
+
+### Avoiding a bad restoration
+
+Restoring a model's state from a checkpoint only works if the model
+and checkpoint are compatible.  For example, suppose you trained a
+`DNNClassifier` Estimator containing two hidden layers,
+each having 10 nodes:
+
+```python
+classifier = tf.estimator.DNNClassifier(
+    feature_columns=feature_columns,
+    hidden_units=[10, 10],
+    n_classes=3,
+    model_dir='models/iris')
+
+classifier.train(
+    input_fn=lambda:train_input_fn(train_x, train_y, batch_size=100),
+        steps=200)
+```
+
+After training (and, therefore, after creating checkpoints in `models/iris`),
+imagine that you changed the number of neurons in each hidden layer from 10 to
+20 and then attempted to retrain the model:
+
+``` python
+classifier2 = tf.estimator.DNNClassifier(
+    feature_columns=my_feature_columns,
+    hidden_units=[20, 20],  # Change the number of neurons in the model.
+    n_classes=3,
+    model_dir='models/iris')
+
+classifier.train(
+    input_fn=lambda:train_input_fn(train_x, train_y, batch_size=100),
+        steps=200)
+```
+
+Since the state in the checkpoint is incompatible with the model described
+in `classifier2`, retraining fails with the following error:
+
+```None
+...
+InvalidArgumentError (see above for traceback): tensor_name =
+dnn/hiddenlayer_1/bias/t_0/Adagrad; shape in shape_and_slice spec [10]
+does not match the shape stored in checkpoint: [20]
+```
+
+To run experiments in which you train and compare slightly different
+versions of a model, save a copy of the code that created each
+`model-dir`, possibly by creating a separate git branch for each version.
+This separation will keep your checkpoints recoverable.
+
+## Summary
+
+Checkpoints provide an easy automatic mechanism for saving and restoring
+models created by Estimators.
+
+See the @{$saved_model$Saving and Restoring}
+chapter of the *TensorFlow Programmer's Guide* for details on:
+
+*   Saving and restoring models using low-level TensorFlow APIs.
+*   Exporting and importing models in the SavedModel format, which is a
+    language-neutral, recoverable, serialization format.
diff --git a/tensorflow/docs_src/get_started/custom_estimators.md b/tensorflow/docs_src/get_started/custom_estimators.md
new file mode 100644
index 0000000000000000000000000000000000000000..42a246678a054d637fea5a82a03ecb84ff412bd9
--- /dev/null
+++ b/tensorflow/docs_src/get_started/custom_estimators.md
@@ -0,0 +1,602 @@
+
+# Creating Custom Estimators
+
+This document introduces custom Estimators. In particular, this document
+demonstrates how to create a custom @{tf.estimator.Estimator$Estimator} that
+mimics the behavior of the pre-made Estimator
+@{tf.estimator.DNNClassifier$`DNNClassifier`} in solving the Iris problem. See
+the @{$get_started/premade_estimators$Pre-Made Estimators chapter} for details
+on the Iris problem.
+
+To download and access the example code invoke the following two commands:
+
+```shell
+git clone https://github.com/tensorflow/models/
+cd models/samples/core/get_started
+```
+
+In this document we will be looking at
+[`custom_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/custom_estimator.py).
+You can run it with the following command:
+
+```bsh
+python custom_estimator.py
+```
+
+If you are feeling impatient, feel free to compare and contrast
+[`custom_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/custom_estimator.py)
+with
+[`premade_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py).
+(which is in the same directory).
+
+
+
+## Pre-made vs. custom
+
+As the following figure shows, pre-made Estimators are subclasses of the
+@{tf.estimator.Estimator} base class, while custom Estimators are an instance
+of tf.estimator.Estimator:
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="display:block; margin: 0 auto"
+  alt="Premade estimators are sub-classes of `Estimator`. Custom Estimators are usually (direct) instances of `Estimator`"
+  src="../images/custom_estimators/estimator_types.png">
+</div>
+<div style="text-align: center">
+Pre-made and custom Estimators are all Estimators.
+</div>
+
+Pre-made Estimators are fully baked. Sometimes though, you need more control
+over an Estimator's behavior.  That's where custom Estimators come in. You can
+create a custom Estimator to do just about anything. If you want hidden layers
+connected in some unusual fashion, write a custom Estimator. If you want to
+calculate a unique
+[metric](https://developers.google.com/machine-learning/glossary/#metric)
+for your model, write a custom Estimator.  Basically, if you want an Estimator
+optimized for your specific problem, write a custom Estimator.
+
+A model function (or `model_fn`) implements the ML algorithm. The
+only difference between working with pre-made Estimators and custom Estimators
+is:
+
+* With pre-made Estimators, someone already wrote the model function for you.
+* With custom Estimators, you must write the model function.
+
+Your model function could implement a wide range of algorithms, defining all
+sorts of hidden layers and metrics.  Like input functions, all model functions
+must accept a standard group of input parameters and return a standard group of
+output values. Just as input functions can leverage the Dataset API, model
+functions can leverage the Layers API and the Metrics API.
+
+Let's see how to solve the Iris problem with a custom Estimator. A quick
+reminder--here's the organization of the Iris model that we're trying to mimic:
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="display:block; margin: 0 auto"
+  alt="A diagram of the network architecture: Inputs, 2 hidden layers, and outputs"
+  src="../images/custom_estimators/full_network.png">
+</div>
+<div style="text-align: center">
+Our implementation of Iris contains four features, two hidden layers,
+and a logits output layer.
+</div>
+
+## Write an Input function
+
+Our custom Estimator implementation uses the same input function as our
+@{$get_started/premade_estimators$pre-made Estimator implementation}, from
+[`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py).
+Namely:
+
+```python
+def train_input_fn(features, labels, batch_size):
+    """An input function for training"""
+    # Convert the inputs to a Dataset.
+    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
+
+    # Shuffle, repeat, and batch the examples.
+    dataset = dataset.shuffle(1000).repeat().batch(batch_size)
+
+    # Return the read end of the pipeline.
+    return dataset.make_one_shot_iterator().get_next()
+```
+
+This input function builds an input pipeline that yields batches of
+`(features, labels)` pairs, where `features` is a dictionary features.
+
+## Create feature columns
+
+As detailed in the @{$get_started/premade_estimators$Premade Estimators} and
+@{$get_started/feature_columns$Feature Columns} chapters, you must define
+your model's feature columns to specify how the model should use each feature.
+Whether working with pre-made Estimators or custom Estimators, you define
+feature columns in the same fashion.
+
+The following code creates a simple `numeric_column` for each input feature,
+indicating that the value of the input feature should be used directly as an
+input to the model:
+
+```python
+# Feature columns describe how to use the input.
+my_feature_columns = []
+for key in train_x.keys():
+    my_feature_columns.append(tf.feature_column.numeric_column(key=key))
+```
+
+## Write a model function
+
+The model function we'll use has the following call signature:
+
+```python
+def my_model_fn(
+   features, # This is batch_features from input_fn
+   labels,   # This is batch_labels from input_fn
+   mode,     # An instance of tf.estimator.ModeKeys
+   params):  # Additional configuration
+```
+
+The first two arguments are the batches of features and labels returned from
+the input function; that is, `features` and `labels` are the handles to the
+data your model will use. The `mode` argument indicates whether the caller is
+requesting training, predicting, or evaluation.
+
+The caller may pass `params` to an Estimator's constructor. Any `params` passed
+to the constructor are in turn passed on to the `model_fn`. In
+[`custom_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/custom_estimator.py)
+the following lines create the estimator and set the params to configure the
+model. This configuration step is similar to how we configured the @{tf.estimator.DNNClassifier} in
+@{$get_started/premade_estimators}.
+
+```python
+classifier = tf.estimator.Estimator(
+    model_fn=my_model,
+    params={
+        'feature_columns': my_feature_columns,
+        # Two hidden layers of 10 nodes each.
+        'hidden_units': [10, 10],
+        # The model must choose between 3 classes.
+        'n_classes': 3,
+    })
+```
+
+To implement a typical model function, you must do the following:
+
+* [Define the model](#define_the_model).
+* Specify additional calculations for each of
+  the [three different modes](#modes):
+  * [Predict](#predict)
+  * [Evaluate](#evaluate)
+  * [Train](#train)
+
+## Define the model
+
+The basic deep neural network model must define the following three sections:
+
+* An [input layer](https://developers.google.com/machine-learning/glossary/#input_layer)
+* One or more [hidden layers](https://developers.google.com/machine-learning/glossary/#hidden_layer)
+* An [output layer](https://developers.google.com/machine-learning/glossary/#output_layer)
+
+### Define the input layer
+
+The first line of the `model_fn` calls @{tf.feature_column.input_layer} to
+convert the feature dictionary and `feature_columns` into input for your model,
+as follows:
+
+```python
+    # Use `input_layer` to apply the feature columns.
+    net = tf.feature_column.input_layer(features, params['feature_columns'])
+```
+
+The preceding line applies the transformations defined by your feature columns,
+creating the model's input layer.
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="display:block; margin: 0 auto"
+  alt="A diagram of the input layer, in this case a 1:1 mapping from raw-inputs to features."
+  src="../images/custom_estimators/input_layer.png">
+</div>
+
+
+### Hidden Layers
+
+If you are creating a deep neural network, you must define one or more hidden
+layers. The Layers API provides a rich set of functions to define all types of
+hidden layers, including convolutional, pooling, and dropout layers. For Iris,
+we're simply going to call @{tf.layers.dense} to create hidden layers, with
+dimensions defined by `params['hidden_layers']`. In a `dense` layer each node
+is connected to every node in the preceding layer.  Here's the relevant code:
+
+``` python
+    # Build the hidden layers, sized according to the 'hidden_units' param.
+    for units in params['hidden_units']:
+        net = tf.layers.dense(net, units=units, activation=tf.nn.relu)
+```
+
+* The `units` parameter defines the number of output neurons in a given layer.
+* The `activation` parameter defines the [activation function](https://developers.google.com/machine-learning/glossary/#a) —
+  [Relu](https://developers.google.com/machine-learning/glossary/#ReLU) in this
+  case.
+
+The variable `net` here signifies the current top layer of the network. During
+the first iteration, `net` signifies the input layer. On each loop iteration
+`tf.layers.dense` creates a new layer, which takes the previous layer's output
+as its input, using the variable `net`.
+
+After creating two hidden layers, our network looks as follows. For
+simplicity, the figure does not show all the units in each layer.
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="display:block; margin: 0 auto"
+  alt="The input layer with two hidden layers added."
+  src="../images/custom_estimators/add_hidden_layer.png">
+</div>
+
+Note that @{tf.layers.dense} provides many additional capabilities, including
+the ability to set a multitude of regularization parameters. For the sake of
+simplicity, though, we're going to simply accept the default values of the
+other parameters.
+
+### Output Layer
+
+We'll define the output layer by calling @{tf.layers.dense} yet again, this
+time without an activation function:
+
+```python
+    # Compute logits (1 per class).
+    logits = tf.layers.dense(net, params['n_classes'], activation=None)
+```
+
+Here, `net` signifies the final hidden layer. Therefore, the full set of layers
+is now connected as follows:
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="display:block; margin: 0 auto"
+  alt="A logit output layer connected to the top hidden layer"
+  src="../images/custom_estimators/add_logits.png">
+</div>
+<div style="text-align: center">
+The final hidden layer feeds into the output layer.
+</div>
+
+When defining an output layer, the `units` parameter specifies the number of
+outputs. So, by setting `units` to `params['n_classes']`, the model produces
+one output value per class. Each element of the output vector will contain the
+score, or "logit", calculated for the associated class of Iris: Setosa,
+Versicolor, or Virginica, respectively.
+
+Later on, these logits will be transformed into probabilities by the
+@{tf.nn.softmax} function.
+
+## Implement training, evaluation, and prediction {#modes}
+
+The final step in creating a model function is to write branching code that
+implements prediction, evaluation, and training.
+
+The model function gets invoked whenever someone calls the Estimator's `train`,
+`evaluate`, or `predict` methods. Recall that the signature for the model
+function looks like this:
+
+``` python
+def my_model_fn(
+   features, # This is batch_features from input_fn
+   labels,   # This is batch_labels from input_fn
+   mode,     # An instance of tf.estimator.ModeKeys, see below
+   params):  # Additional configuration
+```
+
+Focus on that third argument, mode. As the following table shows, when someone
+calls `train`, `evaluate`, or `predict`, the Estimator framework invokes your model
+function with the mode parameter set as follows:
+
+| Estimator method                 |    Estimator Mode |
+|:---------------------------------|:------------------|
+|@{tf.estimator.Estimator.train$`train()`} |@{tf.estimator.ModeKeys.TRAIN$`ModeKeys.TRAIN`} |
+|@{tf.estimator.Estimator.evaluate$`evaluate()`}  |@{tf.estimator.ModeKeys.EVAL$`ModeKeys.EVAL`}      |
+|@{tf.estimator.Estimator.predict$`predict()`}|@{tf.estimator.ModeKeys.PREDICT$`ModeKeys.PREDICT`} |
+
+For example, suppose you instantiate a custom Estimator to generate an object
+named `classifier`. Then, you make the following call:
+
+``` python
+classifier = tf.estimator.Estimator(...)
+classifier.train(input_fn=lambda: my_input_fn(FILE_TRAIN, True, 500))
+```
+The Estimator framework then calls your model function with mode set to
+`ModeKeys.TRAIN`.
+
+Your model function must provide code to handle all three of the mode values.
+For each mode value, your code must return an instance of
+`tf.estimator.EstimatorSpec`, which contains the information the caller
+requires. Let's examine each mode.
+
+### Predict
+
+When the Estimator's `predict` method is called, the `model_fn` receives
+`mode = ModeKeys.PREDICT`. In this case, the model function must return a
+`tf.estimator.EstimatorSpec` containing the prediction.
+
+The model must have been trained prior to making a prediction. The trained model
+is stored on disk in the `model_dir` directory established when you
+instantiated the Estimator.
+
+The code to generate the prediction for this model looks as follows:
+
+```python
+# Compute predictions.
+predicted_classes = tf.argmax(logits, 1)
+if mode == tf.estimator.ModeKeys.PREDICT:
+    predictions = {
+        'class_ids': predicted_classes[:, tf.newaxis],
+        'probabilities': tf.nn.softmax(logits),
+        'logits': logits,
+    }
+    return tf.estimator.EstimatorSpec(mode, predictions=predictions)
+```
+The prediction dictionary contains everything that your model returns when run
+in prediction mode.
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="display:block; margin: 0 auto"
+  alt="Additional outputs added to the output layer."
+  src="../images/custom_estimators/add_predictions.png">
+</div>
+
+The `predictions` holds the following three key/value pairs:
+
+*   `class_ids` holds the class id (0, 1, or 2) representing the model's
+    prediction of the most likely species for this example.
+*   `probabilities` holds the three probabilities (in this example, 0.02, 0.95,
+    and 0.03)
+*   `logit` holds the raw logit values (in this example, -1.3, 2.6, and -0.9)
+
+We return that dictionary to the caller via the `predictions` parameter of the
+@{tf.estimator.EstimatorSpec}. The Estimator's
+@{tf.estimator.Estimator.predict$`predict`} method will yield these
+dictionaries.
+
+### Calculate the loss
+
+For both [training](#train) and [evaluation](#evaluate) we need to calculate the
+model's loss. This is the
+[objective](https://developers.google.com/machine-learning/glossary/#objective)
+that will be optimized.
+
+We can calculate the loss by calling @{tf.losses.sparse_softmax_cross_entropy}.
+The value returned by this function will be lowest, approximately 0,
+probability of the correct class (at index `label`) is near 1.0. The loss value
+returned is progressively larger as the probability of the correct class
+decreases.
+
+This function returns the average over the whole batch.
+
+```python
+# Compute loss.
+loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
+```
+
+### Evaluate
+
+When the Estimator's `evaluate` method is called, the `model_fn` receives
+`mode = ModeKeys.EVAL`. In this case, the model function must return a
+`tf.estimator.EstimatorSpec` containing the model's loss and optionally one
+or more metrics.
+
+Although returning metrics is optional, most custom Estimators do return at
+least one metric. TensorFlow provides a Metrics module @{tf.metrics} to
+calculate common metrics.  For brevity's sake, we'll only return accuracy. The
+@{tf.metrics.accuracy} function compares our predictions against the
+true values, that is, against the labels provided by the input function. The
+@{tf.metrics.accuracy} function requires the labels and predictions to have the
+same shape. Here's the call to @{tf.metrics.accuracy}:
+
+``` python
+# Compute evaluation metrics.
+accuracy = tf.metrics.accuracy(labels=labels,
+                               predictions=predicted_classes,
+                               name='acc_op')
+```
+
+The @{tf.estimator.EstimatorSpec$`EstimatorSpec`} returned for evaluation
+typically contains the following information:
+
+* `loss`, which is the model's loss
+* `eval_metric_ops`, which is an optional dictionary of metrics.
+
+So, we'll create a dictionary containing our sole metric. If we had calculated
+other metrics, we would have added them as additional key/value pairs to that
+same dictionary.  Then, we'll pass that dictionary in the `eval_metric_ops`
+argument of `tf.estimator.EstimatorSpec`. Here's the code:
+
+```python
+metrics = {'accuracy': accuracy}
+tf.summary.scalar('accuracy', accuracy[1])
+
+if mode == tf.estimator.ModeKeys.EVAL:
+    return tf.estimator.EstimatorSpec(
+        mode, loss=loss, eval_metric_ops=metrics)
+```
+
+The @{tf.summary.scalar} will make accuracy available to TensorBoard
+in both `TRAIN` and `EVAL` modes. (More on this later).
+
+### Train
+
+When the Estimator's `train` method is called, the `model_fn` is called
+with `mode = ModeKeys.TRAIN`. In this case, the model function must return an
+`EstimatorSpec` that contains the loss and a training operation.
+
+Building the training operation will require an optimizer. We will use
+@{tf.train.AdagradOptimizer} because we're mimicking the `DNNClassifier`, which
+also uses `Adagrad` by default. The `tf.train` package provides many other
+optimizers—feel free to experiment with them.
+
+Here is the code that builds the optimizer:
+
+``` python
+optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
+```
+
+Next, we build the training operation using the optimizer's
+@{tf.train.Optimizer.minimize$`minimize`} method on the loss we calculated
+earlier.
+
+The `minimize` method also takes a `global_step` parameter. TensorFlow uses this
+parameter to count the number of training steps that have been processed
+(to know when to end a training run). Furthermore, the `global_step` is
+essential for TensorBoard graphs to work correctly. Simply call
+@{tf.train.get_global_step} and pass the result to the `global_step`
+argument of `minimize`.
+
+Here's the code to train the model:
+
+``` python
+train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
+```
+
+The @{tf.estimator.EstimatorSpec$`EstimatorSpec`} returned for training
+must have the following fields set:
+
+* `loss`, which contains the value of the loss function.
+* `train_op`, which executes a training step.
+
+Here's our code to call `EstimatorSpec`:
+
+```python
+return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
+```
+
+The model function is now complete.
+
+## The custom Estimator
+
+Instantiate the custom Estimator through the Estimator base class as follows:
+
+```python
+    # Build 2 hidden layer DNN with 10, 10 units respectively.
+    classifier = tf.estimator.Estimator(
+        model_fn=my_model,
+        params={
+            'feature_columns': my_feature_columns,
+            # Two hidden layers of 10 nodes each.
+            'hidden_units': [10, 10],
+            # The model must choose between 3 classes.
+            'n_classes': 3,
+        })
+```
+Here the `params` dictionary serves the same purpose as the key-word
+arguments of `DNNClassifier`; that is, the `params` dictionary lets you
+configure your Estimator without modifying the code in the `model_fn`.
+
+The rest of the code to train, evaluate, and generate predictions using our
+Estimator is the same as in the
+@{$get_started/premade_estimators$Premade Estimators} chapter. For
+example, the following line will train the model:
+
+```python
+# Train the Model.
+classifier.train(
+    input_fn=lambda:iris_data.train_input_fn(train_x, train_y, args.batch_size),
+    steps=args.train_steps)
+```
+
+## TensorBoard
+
+You can view training results for your custom Estimator in TensorBoard. To see
+this reporting, start TensorBoard from your command line as follows:
+
+```bsh
+# Replace PATH with the actual path passed as model_dir
+tensorboard --logdir=PATH
+```
+
+Then, open TensorBoard by browsing to: [http://localhost:6006](http://localhost:6006)
+
+All the pre-made Estimators automatically log a lot of information to
+TensorBoard. With custom Estimators, however, TensorBoard only provides one
+default log (a graph of the loss) plus the information you explicitly tell
+TensorBoard to log. For the custom Estimator you just created, TensorBoard
+generates the following:
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+
+<img style="display:block; margin: 0 auto"
+  alt="Accuracy, 'scalar' graph from tensorboard"
+  src="../images/custom_estimators/accuracy.png">
+
+<img style="display:block; margin: 0 auto"
+  alt="loss 'scalar' graph from tensorboard"
+  src="../images/custom_estimators/loss.png">
+
+<img style="display:block; margin: 0 auto"
+  alt="steps/second 'scalar' graph from tensorboard"
+  src="../images/custom_estimators/steps_per_second.png">
+</div>
+
+<div style="text-align: center">
+TensorBoard displays three graphs.
+</div>
+
+
+In brief, here's what the three graphs tell you:
+
+* global_step/sec: A performance indicator showing how many batches (gradient
+  updates) we processed per second as the model trains.
+
+* loss: The loss reported.
+
+* accuracy: The accuracy is recorded by the following two lines:
+
+  * `eval_metric_ops={'my_accuracy': accuracy})`, during evaluation.
+  * `tf.summary.scalar('accuracy', accuracy[1])`, during training.
+
+These tensorboard graphs are one of the main reasons it's important to pass a
+`global_step` to your optimizer's `minimize` method. The model can't record
+the x-coordinate for these graphs without it.
+
+Note the following in the `my_accuracy` and `loss` graphs:
+
+* The orange line represents training.
+* The blue dot represents evaluation.
+
+During training, summaries (the orange line) are recorded periodically as
+batches are processed, which is why it becomes a graph spanning x-axis range.
+
+By contrast, evaluation produces only a single point on the graph for each call
+to `evaluate`. This point contains the average over the entire evaluation call.
+This has no width on the graph as it is evaluated entirely from the model state
+at a particular training step (from a single checkpoint).
+
+As suggested in the following figure, you may see and also selectively
+disable/enable the reporting using the controls on the left side.
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="display:block; margin: 0 auto"
+  alt="Check-boxes allowing the user to select which runs are shown."
+  src="../images/custom_estimators/select_run.jpg">
+</div>
+<div style="text-align: center">
+Enable or disable reporting.
+</div>
+
+
+## Summary
+
+Although pre-made Estimators can be an effective way to quickly create new
+models, you will often need the additional flexibility that custom Estimators
+provide. Fortunately, pre-made and custom Estimators follow the same
+programming model. The only practical difference is that you must write a model
+function for custom Estimators; everything else is the same.
+
+For more details, be sure to check out:
+
+* The
+  [official TensorFlow implementation of MNIST](https://github.com/tensorflow/models/tree/master/official/mnist),
+  which uses a custom estimator.
+* The TensorFlow
+  [official models repository](https://github.com/tensorflow/models/tree/master/official),
+  which contains more curated examples using custom estimators.
+* This [TensorBoard video](https://youtu.be/eBbEDRsCmv4), which introduces
+  TensorBoard.
+* The @{$low_level_intro$Low Level Introduction}, which demonstrates
+  how to experiment directly with TensorFlow's low level APIs, making debugging
+  easier.
diff --git a/tensorflow/docs_src/get_started/datasets_quickstart.md b/tensorflow/docs_src/get_started/datasets_quickstart.md
new file mode 100644
index 0000000000000000000000000000000000000000..a8a2ab6e56130c7805d48477301c63d88f87489c
--- /dev/null
+++ b/tensorflow/docs_src/get_started/datasets_quickstart.md
@@ -0,0 +1,402 @@
+# Datasets Quick Start
+
+The @{tf.data} module contains a collection of classes that allows you to
+easily load data, manipulate it, and pipe it into your model. This document
+introduces the API by walking through two simple examples:
+
+* Reading in-memory data from numpy arrays.
+* Reading lines from a csv file.
+
+<!-- TODO(markdaoust): Add links to an example reading from multiple-files
+(image_retraining), and a from_generator example. -->
+
+## Basic input
+
+Taking slices from an array is the simplest way to get started with `tf.data`.
+
+The @{$get_started/premade_estimators$Premade Estimators} chapter describes
+the following `train_input_fn`, from
+[`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py),
+to pipe the data into the Estimator:
+
+``` python
+def train_input_fn(features, labels, batch_size):
+    """An input function for training"""
+    # Convert the inputs to a Dataset.
+    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
+
+    # Shuffle, repeat, and batch the examples.
+    dataset = dataset.shuffle(1000).repeat().batch(batch_size)
+
+    # Build the Iterator, and return the read end of the pipeline.
+    return dataset.make_one_shot_iterator().get_next()
+```
+
+Let's look at this more closely.
+
+### Arguments
+
+This function expects three arguments. Arguments expecting an "array" can
+accept nearly anything that can be converted to an array with `numpy.array`.
+One exception is
+[`tuple`](https://docs.python.org/3/tutorial/datastructures.html#tuples-and-sequences)
+which has special meaning for `Datasets`.
+
+* `features`: A `{'feature_name':array}` dictionary (or
+  [`DataFrame`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html))
+  containing the raw input features.
+* `labels` : An array containing the
+  [label](https://developers.google.com/machine-learning/glossary/#label)
+  for each example.
+* `batch_size` : An integer indicating the desired batch size.
+
+In [`premade_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py)
+we retrieved the Iris data using the `iris_data.load_data()` function.
+You can run it, and unpack the results as follows:
+
+``` python
+import iris_data
+
+# Fetch the data
+train, test = iris_data.load_data()
+features, labels = train
+```
+
+Then we passed this data to the input function, with a line similar to this:
+
+``` python
+batch_size=100
+iris_data.train_input_fn(features, labels, batch_size)
+```
+
+Let's walk through the `train_input_fn()`.
+
+### Slices
+
+In the simplest cases, @{tf.data.Dataset.from_tensor_slices} function takes an
+array and returns a @{tf.data.Dataset} representing slices of the array. For
+example, an array containing the @{$tutorials/layers$mnist training data}
+has a shape of `(60000, 28, 28)`. Passing this to `from_tensor_slices` returns
+a `Dataset` object containing 60000 slices, each one a 28x28 image.
+
+The code that returns this `Dataset` is as follows:
+
+``` python
+train, test = tf.keras.datasets.mnist.load_data()
+mnist_x, mnist_y = train
+
+mnist_ds = tf.data.Dataset.from_tensor_slices(mnist_x)
+print(mnist_ds)
+```
+
+This will print the following line, showing the @{$programmers_guide/tensors#shapes$shapes} and @{$programmers_guide/tensors#data_types$types} of the items in
+the dataset. Note that the dataset does not know how many items it contains.
+
+``` None
+<TensorSliceDataset shapes: (28,28), types: tf.uint8>
+```
+
+The dataset above represents a collection of simple arrays, but datasets are
+much more powerful than this. Datasets transparently handle any nested
+combination of dictionaries or tuples. For example, ensuring that `features`
+is a standard dictionary, you can then convert the dictionary of arrays to
+a `Dataset` of dictionaries as follows:
+
+``` python
+dataset = tf.data.Dataset.from_tensor_slices(dict(features))
+print(dataset)
+```
+``` None
+<TensorSliceDataset
+
+  shapes: {
+    SepalLength: (), PetalWidth: (),
+    PetalLength: (), SepalWidth: ()},
+
+  types: {
+      SepalLength: tf.float64, PetalWidth: tf.float64,
+      PetalLength: tf.float64, SepalWidth: tf.float64}
+>
+```
+
+Here we see that when a `Dataset` contains structured elements, the `shapes`
+and `types` of the `Dataset` take on the same structure. This dataset contains
+dictionaries of @{$programmers_guide/tensors#rank$scalars}, all of type
+`tf.float64`.
+
+The first line of `train_input_fn` uses the same functionality, but adds
+another level of structure. It creates a dataset containing
+`(features, labels)` pairs.
+
+The following code shows that the label is a scalar with type `int64`:
+
+``` python
+# Convert the inputs to a Dataset.
+dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
+print(dataset)
+```
+```
+<TensorSliceDataset
+    shapes: (
+        {
+          SepalLength: (), PetalWidth: (),
+          PetalLength: (), SepalWidth: ()},
+        ()),
+
+    types: (
+        {
+          SepalLength: tf.float64, PetalWidth: tf.float64,
+          PetalLength: tf.float64, SepalWidth: tf.float64},
+        tf.int64)>
+```
+
+### Manipulation
+
+Currently the `Dataset` would iterate over the data once, in a fixed order, and
+only produce a single element at a time. It needs further processing before it
+can be used for training. Fortunately, the `tf.data.Dataset` class provides
+methods to better prepare the data for training. The next line of the input
+function takes advantage of several of these methods:
+
+``` python
+# Shuffle, repeat, and batch the examples.
+dataset = dataset.shuffle(1000).repeat().batch(batch_size)
+```
+
+The @{tf.data.Dataset.shuffle$`shuffle`} method uses a fixed-size buffer to
+shuffle the items as they pass through. Setting a `buffer_size` greater than
+the number of examples in the `Dataset` ensures that the data is completely
+shuffled. The Iris data set only contains 150 examples.
+
+The @{tf.data.Dataset.repeat$`repeat`} method has the `Dataset` restart when
+it reaches the end. To limit the number of epochs, set the `count` argument.
+
+The @{tf.data.Dataset.repeat$`batch`} method collects a number of examples and
+stacks them, to create batches. This adds a dimension to their shape. The new
+dimension is added as the first dimension. The following code uses
+the `batch` method on the MNIST `Dataset`, from earlier. This results in a
+`Dataset` containing 3D arrays representing stacks of `(28,28)` images:
+
+``` python
+print(mnist_ds.batch(100))
+```
+
+``` none
+<BatchDataset
+  shapes: (?, 28, 28),
+  types: tf.uint8>
+```
+Note that the dataset has an unknown batch size because the last batch will
+have fewer elements.
+
+In `train_input_fn`, after batching the `Dataset` contains 1D vectors of
+elements where each scalar was previously:
+
+```python
+print(dataset)
+```
+```
+<TensorSliceDataset
+    shapes: (
+        {
+          SepalLength: (?,), PetalWidth: (?,),
+          PetalLength: (?,), SepalWidth: (?,)},
+        (?,)),
+
+    types: (
+        {
+          SepalLength: tf.float64, PetalWidth: tf.float64,
+          PetalLength: tf.float64, SepalWidth: tf.float64},
+        tf.int64)>
+```
+
+
+### Return
+
+<!-- TODO(markdaoust) This line can be simplified to "return dataset" -->
+
+The `train`, `evaluate`, and `predict` methods of every Estimator require
+input functions to return a `(features, label)` pair containing
+@{$programmers_guide/tensors$tensorflow tensors}. The `train_input_fn` uses
+the following line to convert the Dataset into the expected format:
+
+```python
+# Build the Iterator, and return the read end of the pipeline.
+features_result, labels_result = dataset.make_one_shot_iterator().get_next()
+```
+
+The result is a structure of @{$programmers_guide/tensors$TensorFlow tensors},
+matching the layout of the items in the `Dataset`.
+For an introduction to what these objects are and how to work with them,
+see @{$programmers_guide/low_level_intro}.
+
+``` python
+print((features_result, labels_result))
+```
+
+```None
+({
+    'SepalLength': <tf.Tensor 'IteratorGetNext:2' shape=(?,) dtype=float64>,
+    'PetalWidth': <tf.Tensor 'IteratorGetNext:1' shape=(?,) dtype=float64>,
+    'PetalLength': <tf.Tensor 'IteratorGetNext:0' shape=(?,) dtype=float64>,
+    'SepalWidth': <tf.Tensor 'IteratorGetNext:3' shape=(?,) dtype=float64>},
+Tensor("IteratorGetNext_1:4", shape=(?,), dtype=int64))
+```
+
+## Reading a CSV File
+
+The most common real-world use case for the `Dataset` class is to stream data
+from files on disk. The @{tf.data} module includes a variety of
+file readers. Let's see how parsing the Iris dataset from the csv file looks
+using a `Dataset`.
+
+The following call to the `iris_data.maybe_download` function downloads the
+data if necessary, and returns the pathnames of the resulting files:
+
+``` python
+import iris_data
+train_path, test_path = iris_data.maybe_download()
+```
+
+The [`iris_data.csv_input_fn`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py)
+function contains an alternative implementation that parses the csv files using
+a `Dataset`.
+
+Let's look at how to build an Estimator-compatible input function that reads
+from the local files.
+
+### Build the `Dataset`
+
+We start by building a @{tf.data.TextLineDataset$`TextLineDataset`} object to
+read the file one line at a time. Then, we call the
+@{tf.data.Dataset.skip$`skip`} method to skip over the first line of the file, which contains a header, not an example:
+
+``` python
+ds = tf.data.TextLineDataset(train_path).skip(1)
+```
+
+### Build a csv line parser
+
+Ultimately we will need to parse each of the lines in the dataset, to
+produce the necessary `(features, label)` pairs.
+
+We will start by building a function to parse a single line.
+
+The following `iris_data.parse_line` function accomplishes this task using the
+@{tf.decode_csv} function, and some simple python code:
+
+We must parse each of the lines in the dataset in order to generate the
+necessary `(features, label)` pairs. The following `_parse_line` function
+calls @{tf.decode_csv} to parse a single line into its features
+and the label. Since Estimators require that features be represented as a
+dictionary, we rely on Python's built-in `dict` and `zip` functions to build
+that dictionary.  The feature names are the keys of that dictionary.
+We then call the dictionary's `pop` method to remove the label field from
+the features dictionary:
+
+``` python
+# Metadata describing the text columns
+COLUMNS = ['SepalLength', 'SepalWidth',
+           'PetalLength', 'PetalWidth',
+           'label']
+FIELD_DEFAULTS = [[0.0], [0.0], [0.0], [0.0], [0]]
+def _parse_line(line):
+    # Decode the line into its fields
+    fields = tf.decode_csv(line, FIELD_DEFAULTS)
+
+    # Pack the result into a dictionary
+    features = dict(zip(COLUMNS,fields))
+
+    # Separate the label from the features
+    label = features.pop('label')
+
+    return features, label
+```
+
+### Parse the lines
+
+Datasets have many methods for manipulating the data while it is being piped
+to a model. The most heavily-used method is @{tf.data.Dataset.map$`map`}, which
+applies a transformation to each element of the `Dataset`.
+
+The `map` method takes a `map_func` argument that describes how each item in the
+`Dataset` should be transformed.
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/datasets/map.png">
+</div>
+<div style="text-align: center">
+The @{tf.data.Dataset.map$`map`} method applies the `map_func` to
+transform each item in the <code>Dataset</code>.
+</div>
+
+So to parse the lines as they are streamed out of the csv file, we pass our
+`_parse_line` function to the `map` method:
+
+``` python
+ds = ds.map(_parse_line)
+print(ds)
+```
+``` None
+<MapDataset
+shapes: (
+    {SepalLength: (), PetalWidth: (), ...},
+    ()),
+types: (
+    {SepalLength: tf.float32, PetalWidth: tf.float32, ...},
+    tf.int32)>
+```
+
+Now instead of simple scalar strings, the dataset contains `(features, label)`
+pairs.
+
+the remainder of the `iris_data.csv_input_fn` function is identical
+to `iris_data.train_input_fn` which was covered in the in the
+[Basic input](#basic_input) section.
+
+### Try it out
+
+This function can be used as a replacement for
+`iris_data.train_input_fn`. It can be used to feed an estimator as follows:
+
+``` python
+train_path, test_path = iris_data.maybe_download()
+
+# All the inputs are numeric
+feature_columns = [
+    tf.feature_column.numeric_column(name)
+    for name in iris_data.CSV_COLUMN_NAMES[:-1]]
+
+# Build the estimator
+est = tf.estimator.LinearClassifier(feature_columns,
+                                    n_classes=3)
+# Train the estimator
+batch_size = 100
+est.train(
+    steps=1000,
+    input_fn=lambda : iris_data.csv_input_fn(train_path, batch_size))
+```
+
+Estimators expect an `input_fn` to take no arguments. To work around this
+restriction, we use `lambda` to capture the arguments and provide the expected
+interface.
+
+## Summary
+
+The `tf.data` module provides a collection of classes and functions for easily
+reading data from a variety of sources. Furthermore, `tf.data` has simple
+powerful methods for applying a wide variety of standard and custom
+transformations.
+
+Now you have the basic idea of how to efficiently load data into an
+Estimator. Consider the following documents next:
+
+
+* @{$get_started/custom_estimators}, which demonstrates how to build your own
+  custom `Estimator` model.
+* The @{$low_level_intro#datasets$Low Level Introduction}, which demonstrates
+  how to experiment directly with `tf.data.Datasets` using TensorFlow's low
+  level APIs.
+* @{$programmers_guide/datasets} which goes into great detail about additional
+  functionality of `Datasets`.
+
diff --git a/tensorflow/docs_src/get_started/estimator.md b/tensorflow/docs_src/get_started/estimator.md
deleted file mode 100644
index 790de6679b0bdbe5f91fd03e3ebfedc278b5b3c8..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/get_started/estimator.md
+++ /dev/null
@@ -1,410 +0,0 @@
-# tf.estimator Quickstart
-
-TensorFlow’s high-level machine learning API (tf.estimator) makes it easy to
-configure, train, and evaluate a variety of machine learning models. In this
-tutorial, you’ll use tf.estimator to construct a
-[neural network](https://en.wikipedia.org/wiki/Artificial_neural_network)
-classifier and train it on the
-[Iris data set](https://en.wikipedia.org/wiki/Iris_flower_data_set) to
-predict flower species based on sepal/petal geometry. You'll write code to
-perform the following five steps:
-
-1.  Load CSVs containing Iris training/test data into a TensorFlow `Dataset`
-2.  Construct a @{tf.estimator.DNNClassifier$neural network classifier}
-3.  Train the model using the training data
-4.  Evaluate the accuracy of the model
-5.  Classify new samples
-
-NOTE: Remember to @{$install$install TensorFlow on your machine}
-before getting started with this tutorial.
-
-## Complete Neural Network Source Code
-
-Here is the full code for the neural network classifier:
-
-```python
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-from six.moves.urllib.request import urlopen
-
-import numpy as np
-import tensorflow as tf
-
-# Data sets
-IRIS_TRAINING = "iris_training.csv"
-IRIS_TRAINING_URL = "http://download.tensorflow.org/data/iris_training.csv"
-
-IRIS_TEST = "iris_test.csv"
-IRIS_TEST_URL = "http://download.tensorflow.org/data/iris_test.csv"
-
-
-def main():
-  # If the training and test sets aren't stored locally, download them.
-  if not os.path.exists(IRIS_TRAINING):
-    raw = urlopen(IRIS_TRAINING_URL).read()
-    with open(IRIS_TRAINING, "wb") as f:
-      f.write(raw)
-
-  if not os.path.exists(IRIS_TEST):
-    raw = urlopen(IRIS_TEST_URL).read()
-    with open(IRIS_TEST, "wb") as f:
-      f.write(raw)
-
-  # Load datasets.
-  training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
-      filename=IRIS_TRAINING,
-      target_dtype=np.int,
-      features_dtype=np.float32)
-  test_set = tf.contrib.learn.datasets.base.load_csv_with_header(
-      filename=IRIS_TEST,
-      target_dtype=np.int,
-      features_dtype=np.float32)
-
-  # Specify that all features have real-value data
-  feature_columns = [tf.feature_column.numeric_column("x", shape=[4])]
-
-  # Build 3 layer DNN with 10, 20, 10 units respectively.
-  classifier = tf.estimator.DNNClassifier(feature_columns=feature_columns,
-                                          hidden_units=[10, 20, 10],
-                                          n_classes=3,
-                                          model_dir="/tmp/iris_model")
-  # Define the training inputs
-  train_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={"x": np.array(training_set.data)},
-      y=np.array(training_set.target),
-      num_epochs=None,
-      shuffle=True)
-
-  # Train model.
-  classifier.train(input_fn=train_input_fn, steps=2000)
-
-  # Define the test inputs
-  test_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={"x": np.array(test_set.data)},
-      y=np.array(test_set.target),
-      num_epochs=1,
-      shuffle=False)
-
-  # Evaluate accuracy.
-  accuracy_score = classifier.evaluate(input_fn=test_input_fn)["accuracy"]
-
-  print("\nTest Accuracy: {0:f}\n".format(accuracy_score))
-
-  # Classify two new flower samples.
-  new_samples = np.array(
-      [[6.4, 3.2, 4.5, 1.5],
-       [5.8, 3.1, 5.0, 1.7]], dtype=np.float32)
-  predict_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={"x": new_samples},
-      num_epochs=1,
-      shuffle=False)
-
-  predictions = list(classifier.predict(input_fn=predict_input_fn))
-  predicted_classes = [p["classes"] for p in predictions]
-
-  print(
-      "New Samples, Class Predictions:    {}\n"
-      .format(predicted_classes))
-
-if __name__ == "__main__":
-    main()
-```
-
-The following sections walk through the code in detail.
-
-## Load the Iris CSV data to TensorFlow
-
-The [Iris data set](https://en.wikipedia.org/wiki/Iris_flower_data_set) contains
-150 rows of data, comprising 50 samples from each of three related Iris species:
-*Iris setosa*, *Iris virginica*, and *Iris versicolor*.
-
-![Petal geometry compared for three iris species: Iris setosa, Iris virginica, and Iris versicolor](https://www.tensorflow.org/images/iris_three_species.jpg) **From left to right,
-[*Iris setosa*](https://commons.wikimedia.org/w/index.php?curid=170298) (by
-[Radomil](https://commons.wikimedia.org/wiki/User:Radomil), CC BY-SA 3.0),
-[*Iris versicolor*](https://commons.wikimedia.org/w/index.php?curid=248095) (by
-[Dlanglois](https://commons.wikimedia.org/wiki/User:Dlanglois), CC BY-SA 3.0),
-and [*Iris virginica*](https://www.flickr.com/photos/33397993@N05/3352169862)
-(by [Frank Mayfield](https://www.flickr.com/photos/33397993@N05), CC BY-SA
-2.0).**
-
-Each row contains the following data for each flower sample:
-[sepal](https://en.wikipedia.org/wiki/Sepal) length, sepal width,
-[petal](https://en.wikipedia.org/wiki/Petal) length, petal width, and flower
-species. Flower species are represented as integers, with 0 denoting *Iris
-setosa*, 1 denoting *Iris versicolor*, and 2 denoting *Iris virginica*.
-
-Sepal Length | Sepal Width | Petal Length | Petal Width | Species
-:----------- | :---------- | :----------- | :---------- | :-------
-5.1          | 3.5         | 1.4          | 0.2         | 0
-4.9          | 3.0         | 1.4          | 0.2         | 0
-4.7          | 3.2         | 1.3          | 0.2         | 0
-&hellip;     | &hellip;    | &hellip;     | &hellip;    | &hellip;
-7.0          | 3.2         | 4.7          | 1.4         | 1
-6.4          | 3.2         | 4.5          | 1.5         | 1
-6.9          | 3.1         | 4.9          | 1.5         | 1
-&hellip;     | &hellip;    | &hellip;     | &hellip;    | &hellip;
-6.5          | 3.0         | 5.2          | 2.0         | 2
-6.2          | 3.4         | 5.4          | 2.3         | 2
-5.9          | 3.0         | 5.1          | 1.8         | 2
-
-For this tutorial, the Iris data has been randomized and split into two separate
-CSVs:
-
-*   A training set of 120 samples
-    ([iris_training.csv](http://download.tensorflow.org/data/iris_training.csv))
-*   A test set of 30 samples
-    ([iris_test.csv](http://download.tensorflow.org/data/iris_test.csv)).
-
-To get started, first import all the necessary modules, and define where to
-download and store the dataset:
-
-```python
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-from six.moves.urllib.request import urlopen
-
-import tensorflow as tf
-import numpy as np
-
-IRIS_TRAINING = "iris_training.csv"
-IRIS_TRAINING_URL = "http://download.tensorflow.org/data/iris_training.csv"
-
-IRIS_TEST = "iris_test.csv"
-IRIS_TEST_URL = "http://download.tensorflow.org/data/iris_test.csv"
-```
-
-Then, if the training and test sets aren't already stored locally, download
-them.
-
-```python
-if not os.path.exists(IRIS_TRAINING):
-  raw = urlopen(IRIS_TRAINING_URL).read()
-  with open(IRIS_TRAINING,'wb') as f:
-    f.write(raw)
-
-if not os.path.exists(IRIS_TEST):
-  raw = urlopen(IRIS_TEST_URL).read()
-  with open(IRIS_TEST,'wb') as f:
-    f.write(raw)
-```
-
-Next, load the training and test sets into `Dataset`s using the
-[`load_csv_with_header()`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/datasets/base.py)
-method in `learn.datasets.base`. The `load_csv_with_header()` method takes three
-required arguments:
-
-*   `filename`, which takes the filepath to the CSV file
-*   `target_dtype`, which takes the
-    [`numpy` datatype](http://docs.scipy.org/doc/numpy/user/basics.types.html)
-    of the dataset's target value.
-*   `features_dtype`, which takes the
-    [`numpy` datatype](http://docs.scipy.org/doc/numpy/user/basics.types.html)
-    of the dataset's feature values.
-
-
-Here, the target (the value you're training the model to predict) is flower
-species, which is an integer from 0&ndash;2, so the appropriate `numpy` datatype
-is `np.int`:
-
-```python
-# Load datasets.
-training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
-    filename=IRIS_TRAINING,
-    target_dtype=np.int,
-    features_dtype=np.float32)
-test_set = tf.contrib.learn.datasets.base.load_csv_with_header(
-    filename=IRIS_TEST,
-    target_dtype=np.int,
-    features_dtype=np.float32)
-```
-
-`Dataset`s in tf.contrib.learn are
-[named tuples](https://docs.python.org/2/library/collections.html#collections.namedtuple);
-you can access feature data and target values via the `data` and `target`
-fields. Here, `training_set.data` and `training_set.target` contain the feature
-data and target values for the training set, respectively, and `test_set.data`
-and `test_set.target` contain feature data and target values for the test set.
-
-Later on, in
-["Fit the DNNClassifier to the Iris Training Data,"](#fit-dnnclassifier)
-you'll use `training_set.data` and
-`training_set.target` to train your model, and in
-["Evaluate Model Accuracy,"](#evaluate-accuracy) you'll use `test_set.data` and
-`test_set.target`. But first, you'll construct your model in the next section.
-
-## Construct a Deep Neural Network Classifier
-
-tf.estimator offers a variety of predefined models, called `Estimator`s, which
-you can use "out of the box" to run training and evaluation operations on your
-data.
-Here, you'll configure a Deep Neural Network Classifier model to fit the Iris
-data. Using tf.estimator, you can instantiate your
-@{tf.estimator.DNNClassifier} with just a couple lines of code:
-
-```python
-# Specify that all features have real-value data
-feature_columns = [tf.feature_column.numeric_column("x", shape=[4])]
-
-# Build 3 layer DNN with 10, 20, 10 units respectively.
-classifier = tf.estimator.DNNClassifier(feature_columns=feature_columns,
-                                        hidden_units=[10, 20, 10],
-                                        n_classes=3,
-                                        model_dir="/tmp/iris_model")
-```
-
-The code above first defines the model's feature columns, which specify the data
-type for the features in the data set. All the feature data is continuous, so
-`tf.feature_column.numeric_column` is the appropriate function to use to
-construct the feature columns. There are four features in the data set (sepal
-width, sepal height, petal width, and petal height), so accordingly `shape`
-must be set to `[4]` to hold all the data.
-
-Then, the code creates a `DNNClassifier` model using the following arguments:
-
-*   `feature_columns=feature_columns`. The set of feature columns defined above.
-*   `hidden_units=[10, 20, 10]`. Three
-    [hidden layers](http://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw),
-    containing 10, 20, and 10 neurons, respectively.
-*   `n_classes=3`. Three target classes, representing the three Iris species.
-*   `model_dir=/tmp/iris_model`. The directory in which TensorFlow will save
-    checkpoint data and TensorBoard summaries during model training.
-
-## Describe the training input pipeline {#train-input}
-
-The `tf.estimator` API uses input functions, which create the TensorFlow
-operations that generate data for the model.
-We can use `tf.estimator.inputs.numpy_input_fn` to produce the input pipeline:
-
-```python
-# Define the training inputs
-train_input_fn = tf.estimator.inputs.numpy_input_fn(
-    x={"x": np.array(training_set.data)},
-    y=np.array(training_set.target),
-    num_epochs=None,
-    shuffle=True)
-```
-
-## Fit the DNNClassifier to the Iris Training Data {#fit-dnnclassifier}
-
-Now that you've configured your DNN `classifier` model, you can fit it to the
-Iris training data using the @{tf.estimator.Estimator.train$`train`} method.
-Pass `train_input_fn` as the `input_fn`, and the number of steps to train
-(here, 2000):
-
-```python
-# Train model.
-classifier.train(input_fn=train_input_fn, steps=2000)
-```
-
-The state of the model is preserved in the `classifier`, which means you can
-train iteratively if you like. For example, the above is equivalent to the
-following:
-
-```python
-classifier.train(input_fn=train_input_fn, steps=1000)
-classifier.train(input_fn=train_input_fn, steps=1000)
-```
-
-However, if you're looking to track the model while it trains, you'll likely
-want to instead use a TensorFlow @{tf.train.SessionRunHook$`SessionRunHook`}
-to perform logging operations.
-
-## Evaluate Model Accuracy {#evaluate-accuracy}
-
-You've trained your `DNNClassifier` model on the Iris training data; now, you
-can check its accuracy on the Iris test data using the
-@{tf.estimator.Estimator.evaluate$`evaluate`} method. Like `train`,
-`evaluate` takes an input function that builds its input pipeline. `evaluate`
-returns a `dict`s with the evaluation results. The following code passes the
-Iris test data&mdash;`test_set.data` and `test_set.target`&mdash;to `evaluate`
-and prints the `accuracy` from the results:
-
-```python
-# Define the test inputs
-test_input_fn = tf.estimator.inputs.numpy_input_fn(
-    x={"x": np.array(test_set.data)},
-    y=np.array(test_set.target),
-    num_epochs=1,
-    shuffle=False)
-
-# Evaluate accuracy.
-accuracy_score = classifier.evaluate(input_fn=test_input_fn)["accuracy"]
-
-print("\nTest Accuracy: {0:f}\n".format(accuracy_score))
-```
-
-Note: The `num_epochs=1` argument to `numpy_input_fn` is important here.
-`test_input_fn` will iterate over the data once, and then raise
-`OutOfRangeError`. This error signals the classifier to stop evaluating, so it
-will evaluate over the input once.
-
-When you run the full script, it will print something close to:
-
-```
-Test Accuracy: 0.966667
-```
-
-Your accuracy result may vary a bit, but should be higher than 90%. Not bad for
-a relatively small data set!
-
-## Classify New Samples
-
-Use the estimator's `predict()` method to classify new samples. For example, say
-you have these two new flower samples:
-
-Sepal Length | Sepal Width | Petal Length | Petal Width
-:----------- | :---------- | :----------- | :----------
-6.4          | 3.2         | 4.5          | 1.5
-5.8          | 3.1         | 5.0          | 1.7
-
-You can predict their species using the `predict()` method. `predict` returns a
-generator of dicts, which can easily be converted to a list. The following code
-retrieves and prints the class predictions:
-
-```python
-# Classify two new flower samples.
-new_samples = np.array(
-    [[6.4, 3.2, 4.5, 1.5],
-     [5.8, 3.1, 5.0, 1.7]], dtype=np.float32)
-predict_input_fn = tf.estimator.inputs.numpy_input_fn(
-    x={"x": new_samples},
-    num_epochs=1,
-    shuffle=False)
-
-predictions = list(classifier.predict(input_fn=predict_input_fn))
-predicted_classes = [p["classes"] for p in predictions]
-
-print(
-    "New Samples, Class Predictions:    {}\n"
-    .format(predicted_classes))
-```
-
-Your results should look as follows:
-
-```
-New Samples, Class Predictions:    [1 2]
-```
-
-The model thus predicts that the first sample is *Iris versicolor*, and the
-second sample is *Iris virginica*.
-
-## Additional Resources
-
-*   To learn more about using tf.estimator to create linear models, see
-    @{$linear$Large-scale Linear Models with TensorFlow}.
-
-*   To build your own Estimator using tf.estimator APIs, check out
-    @{$extend/estimators$Creating Estimators}.
-
-*   To experiment with neural network modeling and visualization in the browser,
-    check out [Deep Playground](http://playground.tensorflow.org/).
-
-*   For more advanced tutorials on neural networks, see
-    @{$deep_cnn$Convolutional Neural Networks} and @{$recurrent$Recurrent Neural
-    Networks}.
diff --git a/tensorflow/docs_src/get_started/feature_columns.md b/tensorflow/docs_src/get_started/feature_columns.md
new file mode 100644
index 0000000000000000000000000000000000000000..ad3e1fe3e3a4e3f5278e76bcaa0fc8eee2faf374
--- /dev/null
+++ b/tensorflow/docs_src/get_started/feature_columns.md
@@ -0,0 +1,572 @@
+# Feature Columns
+
+This document details feature columns. Think of **feature columns** as the
+intermediaries between raw data and Estimators. Feature columns are very rich,
+enabling you to transform a diverse range of raw data into formats that
+Estimators can use, allowing easy experimentation.
+
+In @{$get_started/premade_estimators$Premade Estimators}, we used the premade
+Estimator, @{tf.estimator.DNNClassifier$`DNNClassifier`} to train a model to
+predict different types of Iris flowers from four input features. That example
+created only numerical feature columns (of type
+@{tf.feature_column.numeric_column}). Although numerical feature columns model
+the lengths of petals and sepals effectively, real world data sets contain all
+kinds of features, many of which are non-numerical.
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/feature_cloud.jpg">
+</div>
+<div style="text-align: center">
+Some real-world features (such as, longitude) are numerical, but many are not.
+</div>
+
+## Input to a Deep Neural Network
+
+What kind of data can a deep neural network operate on? The answer
+is, of course, numbers (for example, `tf.float32`). After all, every neuron in
+a neural network performs multiplication and addition operations on weights and
+input data. Real-life input data, however, often contains non-numerical
+(categorical) data. For example, consider a `product_class` feature that can
+contain the following three non-numerical values:
+
+* `kitchenware`
+* `electronics`
+* `sports`
+
+ML models generally represent categorical values as simple vectors in which a
+1 represents the presence of a value and a 0 represents the absence of a value.
+For example, when `product_class` is set to `sports`, an ML model would usually
+represent `product_class` as  `[0, 0, 1]`, meaning:
+
+* `0`: `kitchenware` is absent
+* `0`: `electronics` is absent
+* `1`: `sports` is present
+
+So, although raw data can be numerical or categorical, an ML model represents
+all features as numbers.
+
+## Feature Columns
+
+As the following figure suggests, you specify the input to a model through the
+`feature_columns` argument of an Estimator (`DNNClassifier` for Iris).
+Feature Columns bridge input data (as returned by `input_fn`) with your model.
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/inputs_to_model_bridge.jpg">
+</div>
+<div style="text-align: center">
+Feature columns bridge raw data with the data your model needs.
+</div>
+
+To create feature columns, call functions from the
+@{tf.feature_column} module. This document explains nine of the functions in
+that module. As the following figure shows, all nine functions return either a
+Categorical-Column or a Dense-Column object, except `bucketized_column`, which
+inherits from both classes:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/some_constructors.jpg">
+</div>
+<div style="text-align: center">
+Feature column methods fall into two main categories and one hybrid category.
+</div>
+
+Let's look at these functions in more detail.
+
+### Numeric column
+
+The Iris classifier calls the @{tf.feature_column.numeric_column} function for
+all input features:
+
+  * `SepalLength`
+  * `SepalWidth`
+  * `PetalLength`
+  * `PetalWidth`
+
+Although `tf.numeric_column` provides optional arguments, calling
+`tf.numeric_column` without any arguments, as follows, is a fine way to specify
+a numerical value with the default data type (`tf.float32`) as input to your
+model:
+
+```python
+# Defaults to a tf.float32 scalar.
+numeric_feature_column = tf.feature_column.numeric_column(key="SepalLength")
+```
+
+To specify a non-default numerical data type, use the `dtype` argument. For
+example:
+
+``` python
+# Represent a tf.float64 scalar.
+numeric_feature_column = tf.feature_column.numeric_column(key="SepalLength",
+                                                          dtype=tf.float64)
+```
+
+By default, a numeric column creates a single value (scalar). Use the shape
+argument to specify another shape. For example:
+
+<!--TODO(markdaoust) link to full example-->
+```python
+# Represent a 10-element vector in which each cell contains a tf.float32.
+vector_feature_column = tf.feature_column.numeric_column(key="Bowling",
+                                                         shape=10)
+
+# Represent a 10x5 matrix in which each cell contains a tf.float32.
+matrix_feature_column = tf.feature_column.numeric_column(key="MyMatrix",
+                                                         shape=[10,5])
+```
+### Bucketized column
+
+Often, you don't want to feed a number directly into the model, but instead
+split its value into different categories based on numerical ranges.  To do so,
+create a @{tf.feature_column.bucketized_column$bucketized column}. For
+example, consider raw data that represents the year a house was built. Instead
+of representing that year as a scalar numeric column, we could split the year
+into the following four buckets:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/bucketized_column.jpg">
+</div>
+<div style="text-align: center">
+Dividing year data into four buckets.
+</div>
+
+The model will represent the buckets as follows:
+
+|Date Range |Represented as... |
+|:----------|:-----------------|
+|< 1960               | [1, 0, 0, 0] |
+|>= 1960 but < 1980   | [0, 1, 0, 0] |
+|>= 1980 but < 2000   | [0, 0, 1, 0] |
+|> 2000               | [0, 0, 0, 1] |
+
+Why would you want to split a number—a perfectly valid input to your
+model—into a categorical value? Well, notice that the categorization splits a
+single input number into a four-element vector. Therefore, the model now can
+learn _four individual weights_ rather than just one; four weights creates a
+richer model than one weight. More importantly, bucketizing enables the model
+to clearly distinguish between different year categories since only one of the
+elements is set (1) and the other three elements are cleared (0). When we just
+use a single number (a year) as input, the model can only learn a linear
+relationship. So, bucketing provides the model with additional flexibility that
+the model can use to learn.
+
+The following code demonstrates how to create a bucketized feature:
+
+<!--TODO(markdaoust) link to full example - housing price grid?-->
+```python
+# First, convert the raw input to a numeric column.
+numeric_feature_column = tf.feature_column.numeric_column("Year")
+
+# Then, bucketize the numeric column on the years 1960, 1980, and 2000.
+bucketized_feature_column = tf.feature_column.bucketized_column(
+    source_column = numeric_feature_column,
+    boundaries = [1960, 1980, 2000])
+```
+Note that specifying a _three_-element boundaries vector creates a
+_four_-element bucketized vector.
+
+
+### Categorical identity column
+
+**Categorical identity columns** can be seen as a special case of bucketized
+columns. In traditional bucketized columns, each bucket represents a range of
+values (for example, from 1960 to 1979). In a categorical identity column, each
+bucket represents a single, unique integer. For example, let's say you want to
+represent the integer range `[0, 4)`.  That is, you want to represent the
+integers 0, 1, 2, or 3. In this case, the categorical identity mapping looks
+like this:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/categorical_column_with_identity.jpg">
+</div>
+<div style="text-align: center">
+A categorical identity column mapping. Note that this is a one-hot
+encoding, not a binary numerical encoding.
+</div>
+
+As with bucketized columns, a model can learn a separate weight for each class
+in a categorical identity column. For example, instead of using a string to
+represent the `product_class`, let's represent each class with a unique integer
+value. That is:
+
+* `0="kitchenware"`
+* `1="electronics"`
+* `2="sport"`
+
+Call @{tf.feature_column.categorical_column_with_identity} to implement a
+categorical identity column. For example:
+
+``` python
+# Create categorical output for an integer feature named "my_feature_b",
+# The values of my_feature_b must be >= 0 and < num_buckets
+identity_feature_column = tf.feature_column.categorical_column_with_identity(
+    key='my_feature_b',
+    num_buckets=4) # Values [0, 4)
+
+# In order for the preceding call to work, the input_fn() must return
+# a dictionary containing 'my_feature_b' as a key. Furthermore, the values
+# assigned to 'my_feature_b' must belong to the set [0, 4).
+def input_fn():
+    ...
+    return ({ 'my_feature_a':[7, 9, 5, 2], 'my_feature_b':[3, 1, 2, 2] },
+            [Label_values])
+```
+
+### Categorical vocabulary column
+
+We cannot input strings directly to a model. Instead, we must first map strings
+to numeric or categorical values. Categorical vocabulary columns provide a good
+way to represent strings as a one-hot vector. For example:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/categorical_column_with_vocabulary.jpg">
+</div>
+<div style="text-align: center">
+Mapping string values to vocabulary columns.
+</div>
+
+As you can see, categorical vocabulary columns are kind of an enum version of
+categorical identity columns. TensorFlow provides two different functions to
+create categorical vocabulary columns:
+
+* @{tf.feature_column.categorical_column_with_vocabulary_list}
+* @{tf.feature_column.categorical_column_with_vocabulary_file}
+
+`categorical_column_with_vocabulary_list` maps each string to an integer based
+on an explicit vocabulary list. For example:
+
+```python
+# Given input "feature_name_from_input_fn" which is a string,
+# create a categorical feature by mapping the input to one of
+# the elements in the vocabulary list.
+vocabulary_feature_column =
+    tf.feature_column.categorical_column_with_vocabulary_list(
+        key="a feature returned by input_fn()",
+        vocabulary_list=["kitchenware", "electronics", "sports"])
+```
+
+The preceding function is pretty straightforward, but it has a significant
+drawback. Namely, there's way too much typing when the vocabulary list is long.
+For these cases, call
+`tf.feature_column.categorical_column_with_vocabulary_file` instead, which lets
+you place the vocabulary words in a separate file. For example:
+
+```python
+
+# Given input "feature_name_from_input_fn" which is a string,
+# create a categorical feature to our model by mapping the input to one of
+# the elements in the vocabulary file
+vocabulary_feature_column =
+    tf.feature_column.categorical_column_with_vocabulary_file(
+        key="a feature returned by input_fn()",
+        vocabulary_file="product_class.txt",
+        vocabulary_size=3)
+```
+
+`product_class.txt` should contain one line for each vocabulary element. In our
+case:
+
+```None
+kitchenware
+electronics
+sports
+```
+
+### Hashed Column
+
+So far, we've worked with a naively small number of categories. For example,
+our product_class example has only 3 categories. Often though, the number of
+categories can be so big that it's not possible to have individual categories
+for each vocabulary word or integer because that would consume too much memory.
+For these cases, we can instead turn the question around and ask, "How many
+categories am I willing to have for my input?"  In fact, the
+@{tf.feature_column.categorical_column_with_hash_bucket} function enables you
+to specify the number of categories. For this type of feature column the model
+calculates a hash value of the input, then puts it into one of
+the `hash_bucket_size` categories using the modulo operator, as in the following
+pseudocode:
+
+```python
+# pseudocode
+feature_id = hash(raw_feature) % hash_buckets_size
+```
+
+The code to create the `feature_column` might look something like this:
+
+``` python
+hashed_feature_column =
+    tf.feature_column.categorical_column_with_hash_bucket(
+        key = "some_feature",
+        hash_buckets_size = 100) # The number of categories
+```
+At this point, you might rightfully think: "This is crazy!" After all, we are
+forcing the different input values to a smaller set of categories. This means
+that two probably unrelated inputs will be mapped to the same
+category, and consequently mean the same thing to the neural network. The
+following figure illustrates this dilemma, showing that kitchenware and sports
+both get assigned to category (hash bucket) 12:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/hashed_column.jpg">
+</div>
+<div style="text-align: center">
+Representing data with hash buckets.
+</div>
+
+As with many counterintuitive phenomena in machine learning, it turns out that
+hashing often works well in practice. That's because hash categories provide
+the model with some separation. The model can use additional features to further
+separate kitchenware from sports.
+
+### Crossed column
+
+Combining features into a single feature, better known as
+[feature crosses](https://developers.google.com/machine-learning/glossary/#feature_cross),
+enables the model to learn separate weights for each combination of
+features.
+
+More concretely, suppose we want our model to calculate real estate prices in
+Atlanta, GA. Real-estate prices within this city vary greatly depending on
+location. Representing latitude and longitude as separate features isn't very
+useful in identifying real-estate location dependencies; however, crossing
+latitude and longitude into a single feature can pinpoint locations. Suppose we
+represent Atlanta as a grid of 100x100 rectangular sections, identifying each
+of the 10,000 sections by a feature cross of latitude and longitude. This
+feature cross enables the model to train on pricing conditions related to each
+individual section, which is a much stronger signal than latitude and longitude
+alone.
+
+The following figure shows our plan, with the latitude & longitude values for
+the corners of the city in red text:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/Atlanta.jpg">
+</div>
+<div style="text-align: center">
+Map of Atlanta. Imagine this map divided into 10,000 sections of
+equal size.
+</div>
+
+For the solution, we used a combination of the `bucketized_column` we looked at
+earlier, with the @{tf.feature_column.crossed_column} function.
+
+<!--TODO(markdaoust) link to full example-->
+
+``` python
+def make_dataset(latitude, longitude, labels):
+    assert latitude.shape == longitude.shape == labels.shape
+
+    features = {'latitude': latitude.flatten(),
+                'longitude': longitude.flatten()}
+    labels=labels.flatten()
+
+    return tf.data.Dataset.from_tensor_slices((features, labels))
+
+
+# Bucketize the latitude and longitude usig the `edges`
+latitude_bucket_fc = tf.feature_column.bucketized_column(
+    tf.feature_column.numeric_column('latitude'),
+    list(atlanta.latitude.edges))
+
+longitude_bucket_fc = tf.feature_column.bucketized_column(
+    tf.feature_column.numeric_column('longitude'),
+    list(atlanta.longitude.edges))
+
+# Cross the bucketized columns, using 5000 hash bins.
+crossed_lat_lon_fc = tf.feature_column.crossed_column(
+    [latitude_bucket_fc, longitude_bucket_fc], 5000)
+
+fc = [
+    latitude_bucket_fc,
+    longitude_bucket_fc,
+    crossed_lat_lon_fc]
+
+# Build and train the Estimator.
+est = tf.estimator.LinearRegressor(fc, ...)
+```
+
+You may create a feature cross from either of the following:
+
+* Feature names; that is, names from the `dict` returned from `input_fn`.
+* Any categorical column, except `categorical_column_with_hash_bucket`
+  (since `crossed_column` hashes the input).
+
+When the feature columns `latitude_bucket_fc` and `longitude_bucket_fc` are
+crossed, TensorFlow will create `(latitude_fc, longitude_fc)` pairs for each
+example. This would produce a full grid of possibilities as follows:
+
+``` None
+ (0,0),  (0,1)...  (0,99)
+ (1,0),  (1,1)...  (1,99)
+   ...     ...       ...
+(99,0), (99,1)...(99, 99)
+```
+
+Except that a full grid would only be tractable for inputs with limited
+vocabularies. Instead of building this, potentially huge, table of inputs,
+the `crossed_column` only builds the number requested by the `hash_bucket_size`
+argument. The feature column assigns an example to a index by running a hash
+function on the tuple of inputs, followed by a modulo operation with
+`hash_bucket_size`.
+
+As discussed earlier, performing the
+hash and modulo function limits the number of categories, but can cause category
+collisions; that is, multiple (latitude, longitude) feature crosses will end
+up in the same hash bucket. In practice though, performing feature crosses
+still adds significant value to the learning capability of your models.
+
+Somewhat counterintuitively, when creating feature crosses, you typically still
+should include the original (uncrossed) features in your model (as in the
+preceding code snippet). The independent latitude and longitude features help the
+model distinguish between examples where a hash collision has occurred in the
+crossed feature.
+
+## Indicator and embedding columns
+
+Indicator columns and embedding columns never work on features directly, but
+instead take categorical columns as input.
+
+When using an indicator column, we're telling TensorFlow to do exactly what
+we've seen in our categorical product_class example. That is, an
+**indicator column** treats each category as an element in a one-hot vector,
+where the matching category has value 1 and the rest have 0s:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/categorical_column_with_identity.jpg">
+</div>
+<div style="text-align: center">
+Representing data in indicator columns.
+</div>
+
+Here's how you create an indicator column by calling
+@{tf.feature_column.indicator_column}:
+
+``` python
+categorical_column = ... # Create any type of categorical column.
+
+# Represent the categorical column as an indicator column.
+indicator_column = tf.feature_column.indicator_column(categorical_column)
+```
+
+Now, suppose instead of having just three possible classes, we have a million.
+Or maybe a billion. For a number of reasons, as the number of categories grow
+large, it becomes infeasible to train a neural network using indicator columns.
+
+We can use an embedding column to overcome this limitation. Instead of
+representing the data as a one-hot vector of many dimensions, an
+**embedding column** represents that data as a lower-dimensional, ordinary
+vector in which each cell can contain any number, not just 0 or 1. By
+permitting a richer palette of numbers for every cell, an embedding column
+contains far fewer cells than an indicator column.
+
+Let's look at an example comparing indicator and embedding columns. Suppose our
+input examples consist of different words from a limited palette of only 81
+words. Further suppose that the data set provides the following input
+words in 4 separate examples:
+
+* `"dog"`
+* `"spoon"`
+* `"scissors"`
+* `"guitar"`
+
+In that case, the following figure illustrates the processing path for
+embedding columns or indicator columns.
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/embedding_vs_indicator.jpg">
+</div>
+<div style="text-align: center">
+An embedding column stores categorical data in a lower-dimensional
+vector than an indicator column. (We just placed random numbers into the
+embedding vectors; training determines the actual numbers.)
+</div>
+
+When an example is processed, one of the `categorical_column_with...` functions
+maps the example string to a numerical categorical value. For example, a
+function maps "spoon" to `[32]`. (The 32 comes from our imagination—the actual
+values depend on the mapping function.) You may then represent these numerical
+categorical values in either of the following two ways:
+
+* As an indicator column. A function converts each numeric categorical value
+  into an 81-element vector (because our palette consists of 81 words), placing
+  a 1 in the index of the categorical value (0, 32, 79, 80) and a 0 in all the
+  other positions.
+
+* As an embedding column. A function uses the numerical categorical values
+  `(0, 32, 79, 80)` as indices to a lookup table. Each slot in that lookup table
+  contains a 3-element vector.
+
+How do the values in the embeddings vectors magically get assigned? Actually,
+the assignments happen during training. That is, the model learns the best way
+to map your input numeric categorical values to the embeddings vector value in
+order to solve your problem. Embedding columns increase your model's
+capabilities, since an embeddings vector learns new relationships between
+categories from the training data.
+
+Why is the embedding vector size 3 in our example? Well, the following "formula"
+provides a general rule of thumb about the number of embedding dimensions:
+
+```python
+embedding_dimensions =  number_of_categories**0.25
+```
+
+That is, the embedding vector dimension should be the 4th root of the number of
+categories. Since our vocabulary size in this example is 81, the recommended
+number of dimensions is 3:
+
+``` python
+3 =  81**0.25
+```
+Note that this is just a general guideline; you can set the number of embedding
+dimensions as you please.
+
+Call @{tf.feature_column.embedding_column} to create an `embedding_column` as
+suggested by the following snippet:
+
+``` python
+categorical_column = ... # Create any categorical column
+
+# Represent the categorical column as an embedding column.
+# This means creating a one-hot vector with one element for each category.
+embedding_column = tf.feature_column.embedding_column(
+    categorical_column=categorical_column,
+    dimension=dimension_of_embedding_vector)
+```
+
+@{$programmers_guide/embedding$Embeddings} is a significant topic within machine
+learning. This information was just to get you started using them as feature
+columns.
+
+## Passing feature columns to Estimators
+
+As the following list indicates, not all Estimators permit all types of
+`feature_columns` argument(s):
+
+* @{tf.estimator.LinearClassifier$`LinearClassifier`} and
+  @{tf.estimator.LinearRegressor$`LinearRegressor`}: Accept all types of
+  feature column.
+* @{tf.estimator.DNNClassifier$`DNNClassifier`} and
+  @{tf.estimator.DNNRegressor$`DNNRegressor`}: Only accept dense columns. Other
+  column types must be wrapped in either an `indicator_column` or
+  `embedding_column`.
+* @{tf.estimator.DNNLinearCombinedClassifier$`DNNLinearCombinedClassifier`} and
+  @{tf.estimator.DNNLinearCombinedRegressor$`DNNLinearCombinedRegressor`}:
+    * The `linear_feature_columns` argument accepts any feature column type.
+    * The `dnn_feature_columns` argument only accepts dense columns.
+
+## Other Sources
+
+For more examples on feature columns, view the following:
+
+* The @{$low_level_intro#feature_columns$Low Level Introduction} demonstrates how
+  experiment directly with `feature_columns` using TensorFlow's low level APIs.
+* The @{$wide$wide} and @{$wide_and_deep$Wide & Deep} Tutorials solve a
+  binary classification problem using `feature_columns` on a variety of input
+  data types.
+
+To learn more about embeddings, see the following:
+
+* [Deep Learning, NLP, and representations](http://colah.github.io/posts/2014-07-NLP-RNNs-Representations/)
+  (Chris Olah's blog)
+* The TensorFlow [Embedding Projector](http://projector.tensorflow.org)
diff --git a/tensorflow/docs_src/get_started/get_started.md b/tensorflow/docs_src/get_started/get_started.md
deleted file mode 100644
index 231108215ac73bc9ab87a896b3441a7da5f2b507..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/get_started/get_started.md
+++ /dev/null
@@ -1,480 +0,0 @@
-# Getting Started With TensorFlow
-
-This guide gets you started programming in TensorFlow. Before using this guide,
-@{$install$install TensorFlow}. To get the most out of
-this guide, you should know the following:
-
-*   How to program in Python.
-*   At least a little bit about arrays.
-*   Ideally, something about machine learning. However, if you know little or
-    nothing about machine learning, then this is still the first guide you
-    should read.
-
-TensorFlow provides multiple APIs. The lowest level API--TensorFlow Core--
-provides you with complete programming control. We recommend TensorFlow Core for
-machine learning researchers and others who require fine levels of control over
-their models. The higher level APIs are built on top of TensorFlow Core. These
-higher level APIs are typically easier to learn and use than TensorFlow Core. In
-addition, the higher level APIs make repetitive tasks easier and more consistent
-between different users. A high-level API like tf.estimator helps you manage
-data sets, estimators, training and inference.
-
-This guide begins with a tutorial on TensorFlow Core. Later, we
-demonstrate how to implement the same model in tf.estimator. Knowing
-TensorFlow Core principles will give you a great mental model of how things are
-working internally when you use the more compact higher level API.
-
-# Tensors
-
-The central unit of data in TensorFlow is the **tensor**. A tensor consists of a
-set of primitive values shaped into an array of any number of dimensions. A
-tensor's **rank** is its number of dimensions. Here are some examples of
-tensors:
-
-```python
-3 # a rank 0 tensor; a scalar with shape []
-[1., 2., 3.] # a rank 1 tensor; a vector with shape [3]
-[[1., 2., 3.], [4., 5., 6.]] # a rank 2 tensor; a matrix with shape [2, 3]
-[[[1., 2., 3.]], [[7., 8., 9.]]] # a rank 3 tensor with shape [2, 1, 3]
-```
-
-## TensorFlow Core tutorial
-
-### Importing TensorFlow
-
-The canonical import statement for TensorFlow programs is as follows:
-
-```python
-import tensorflow as tf
-```
-This gives Python access to all of TensorFlow's classes, methods, and symbols.
-Most of the documentation assumes you have already done this.
-
-### The Computational Graph
-
-You might think of TensorFlow Core programs as consisting of two discrete
-sections:
-
-1.  Building the computational graph.
-2.  Running the computational graph.
-
-A **computational graph** is a series of TensorFlow operations arranged into a
-graph of nodes.
-Let's build a simple computational graph. Each node takes zero
-or more tensors as inputs and produces a tensor as an output. One type of node
-is a constant. Like all TensorFlow constants, it takes no inputs, and it outputs
-a value it stores internally. We can create two floating point Tensors `node1`
-and `node2` as follows:
-
-```python
-node1 = tf.constant(3.0, dtype=tf.float32)
-node2 = tf.constant(4.0) # also tf.float32 implicitly
-print(node1, node2)
-```
-
-The final print statement produces
-
-```
-Tensor("Const:0", shape=(), dtype=float32) Tensor("Const_1:0", shape=(), dtype=float32)
-```
-
-Notice that printing the nodes does not output the values `3.0` and `4.0` as you
-might expect. Instead, they are nodes that, when evaluated, would produce 3.0
-and 4.0, respectively. To actually evaluate the nodes, we must run the
-computational graph within a **session**. A session encapsulates the control and
-state of the TensorFlow runtime.
-
-The following code creates a `Session` object and then invokes its `run` method
-to run enough of the computational graph to evaluate `node1` and `node2`. By
-running the computational graph in a session as follows:
-
-```python
-sess = tf.Session()
-print(sess.run([node1, node2]))
-```
-
-we see the expected values of 3.0 and 4.0:
-
-```
-[3.0, 4.0]
-```
-
-We can build more complicated computations by combining `Tensor` nodes with
-operations (Operations are also nodes). For example, we can add our two
-constant nodes and produce a new graph as follows:
-
-```python
-from __future__ import print_function
-node3 = tf.add(node1, node2)
-print("node3:", node3)
-print("sess.run(node3):", sess.run(node3))
-```
-
-The last two print statements produce
-
-```
-node3: Tensor("Add:0", shape=(), dtype=float32)
-sess.run(node3): 7.0
-```
-
-TensorFlow provides a utility called TensorBoard that can display a picture of
-the computational graph. Here is a screenshot showing how TensorBoard
-visualizes the graph:
-
-![TensorBoard screenshot](https://www.tensorflow.org/images/getting_started_add.png)
-
-As it stands, this graph is not especially interesting because it always
-produces a constant result. A graph can be parameterized to accept external
-inputs, known as **placeholders**. A **placeholder** is a promise to provide a
-value later.
-
-```python
-a = tf.placeholder(tf.float32)
-b = tf.placeholder(tf.float32)
-adder_node = a + b  # + provides a shortcut for tf.add(a, b)
-```
-
-The preceding three lines are a bit like a function or a lambda in which we
-define two input parameters (a and b) and then an operation on them. We can
-evaluate this graph with multiple inputs by using the feed_dict argument to
-the [run method](https://www.tensorflow.org/api_docs/python/tf/Session#run)
-to feed concrete values to the placeholders:
-
-```python
-print(sess.run(adder_node, {a: 3, b: 4.5}))
-print(sess.run(adder_node, {a: [1, 3], b: [2, 4]}))
-```
-resulting in the output
-
-```
-7.5
-[ 3.  7.]
-```
-
-In TensorBoard, the graph looks like this:
-
-![TensorBoard screenshot](https://www.tensorflow.org/images/getting_started_adder.png)
-
-We can make the computational graph more complex by adding another operation.
-For example,
-
-```python
-add_and_triple = adder_node * 3.
-print(sess.run(add_and_triple, {a: 3, b: 4.5}))
-```
-produces the output
-```
-22.5
-```
-
-The preceding computational graph would look as follows in TensorBoard:
-
-![TensorBoard screenshot](https://www.tensorflow.org/images/getting_started_triple.png)
-
-In machine learning we will typically want a model that can take arbitrary
-inputs, such as the one above.  To make the model trainable, we need to be able
-to modify the graph to get new outputs with the same input.  **Variables** allow
-us to add trainable parameters to a graph.  They are constructed with a type and
-initial value:
-
-
-```python
-W = tf.Variable([.3], dtype=tf.float32)
-b = tf.Variable([-.3], dtype=tf.float32)
-x = tf.placeholder(tf.float32)
-linear_model = W*x + b
-```
-
-Constants are initialized when you call `tf.constant`, and their value can never
-change. By contrast, variables are not initialized when you call `tf.Variable`.
-To initialize all the variables in a TensorFlow program, you must explicitly
-call a special operation as follows:
-
-```python
-init = tf.global_variables_initializer()
-sess.run(init)
-```
-It is important to realize `init` is a handle to the TensorFlow sub-graph that
-initializes all the global variables. Until we call `sess.run`, the variables
-are uninitialized.
-
-
-Since `x` is a placeholder, we can evaluate `linear_model` for several values of
-`x` simultaneously as follows:
-
-```python
-print(sess.run(linear_model, {x: [1, 2, 3, 4]}))
-```
-to produce the output
-```
-[ 0.          0.30000001  0.60000002  0.90000004]
-```
-
-We've created a model, but we don't know how good it is yet. To evaluate the
-model on training data, we need a `y` placeholder to provide the desired values,
-and we need to write a loss function.
-
-A loss function measures how far apart the
-current model is from the provided data. We'll use a standard loss model for
-linear regression, which sums the squares of the deltas between the current
-model and the provided data. `linear_model - y` creates a vector where each
-element is the corresponding example's error delta. We call `tf.square` to
-square that error. Then, we sum all the squared errors to create a single scalar
-that abstracts the error of all examples using `tf.reduce_sum`:
-
-```python
-y = tf.placeholder(tf.float32)
-squared_deltas = tf.square(linear_model - y)
-loss = tf.reduce_sum(squared_deltas)
-print(sess.run(loss, {x: [1, 2, 3, 4], y: [0, -1, -2, -3]}))
-```
-producing the loss value
-```
-23.66
-```
-
-We could improve this manually by reassigning the values of `W` and `b` to the
-perfect values of -1 and 1. A variable is initialized to the value provided to
-`tf.Variable` but can be changed using operations like `tf.assign`. For example,
-`W=-1` and `b=1` are the optimal parameters for our model. We can change `W` and
-`b` accordingly:
-
-```python
-fixW = tf.assign(W, [-1.])
-fixb = tf.assign(b, [1.])
-sess.run([fixW, fixb])
-print(sess.run(loss, {x: [1, 2, 3, 4], y: [0, -1, -2, -3]}))
-```
-The final print shows the loss now is zero.
-```
-0.0
-```
-
-We guessed the "perfect" values of `W` and `b`, but the whole point of machine
-learning is to find the correct model parameters automatically.  We will show
-how to accomplish this in the next section.
-
-## tf.train API
-
-A complete discussion of machine learning is out of the scope of this tutorial.
-However, TensorFlow provides **optimizers** that slowly change each variable in
-order to minimize the loss function. The simplest optimizer is **gradient
-descent**. It modifies each variable according to the magnitude of the
-derivative of loss with respect to that variable. In general, computing symbolic
-derivatives manually is tedious and error-prone. Consequently, TensorFlow can
-automatically produce derivatives given only a description of the model using
-the function `tf.gradients`. For simplicity, optimizers typically do this
-for you. For example,
-
-```python
-optimizer = tf.train.GradientDescentOptimizer(0.01)
-train = optimizer.minimize(loss)
-```
-
-```python
-sess.run(init) # reset variables to incorrect defaults.
-for i in range(1000):
-  sess.run(train, {x: [1, 2, 3, 4], y: [0, -1, -2, -3]})
-
-print(sess.run([W, b]))
-```
-results in the final model parameters:
-```
-[array([-0.9999969], dtype=float32), array([ 0.99999082], dtype=float32)]
-```
-
-Now we have done actual machine learning!  Although this simple linear
-regression model does not require much TensorFlow core code, more complicated
-models and methods to feed data into your models necessitate more code. Thus,
-TensorFlow provides higher level abstractions for common patterns, structures,
-and functionality. We will learn how to use some of these abstractions in the
-next section.
-
-### Complete program
-
-The completed trainable linear regression model is shown here:
-
-```python
-import tensorflow as tf
-
-# Model parameters
-W = tf.Variable([.3], dtype=tf.float32)
-b = tf.Variable([-.3], dtype=tf.float32)
-# Model input and output
-x = tf.placeholder(tf.float32)
-linear_model = W*x + b
-y = tf.placeholder(tf.float32)
-
-# loss
-loss = tf.reduce_sum(tf.square(linear_model - y)) # sum of the squares
-# optimizer
-optimizer = tf.train.GradientDescentOptimizer(0.01)
-train = optimizer.minimize(loss)
-
-# training data
-x_train = [1, 2, 3, 4]
-y_train = [0, -1, -2, -3]
-# training loop
-init = tf.global_variables_initializer()
-sess = tf.Session()
-sess.run(init) # initialize variables with incorrect defaults.
-for i in range(1000):
-  sess.run(train, {x: x_train, y: y_train})
-
-# evaluate training accuracy
-curr_W, curr_b, curr_loss = sess.run([W, b, loss], {x: x_train, y: y_train})
-print("W: %s b: %s loss: %s"%(curr_W, curr_b, curr_loss))
-```
-When run, it produces
-```
-W: [-0.9999969] b: [ 0.99999082] loss: 5.69997e-11
-```
-
-Notice that the loss is a very small number (very close to zero). If you run
-this program, your loss may not be exactly the same as the aforementioned loss
-because the model is initialized with pseudorandom values.
-
-This more complicated program can still be visualized in TensorBoard
-![TensorBoard final model visualization](https://www.tensorflow.org/images/getting_started_final.png)
-
-## `tf.estimator`
-
-`tf.estimator` is a high-level TensorFlow library that simplifies the
-mechanics of machine learning, including the following:
-
-*   running training loops
-*   running evaluation loops
-*   managing data sets
-
-tf.estimator defines many common models.
-
-### Basic usage
-
-Notice how much simpler the linear regression program becomes with
-`tf.estimator`:
-
-```python
-# NumPy is often used to load, manipulate and preprocess data.
-import numpy as np
-import tensorflow as tf
-
-# Declare list of features. We only have one numeric feature. There are many
-# other types of columns that are more complicated and useful.
-feature_columns = [tf.feature_column.numeric_column("x", shape=[1])]
-
-# An estimator is the front end to invoke training (fitting) and evaluation
-# (inference). There are many predefined types like linear regression,
-# linear classification, and many neural network classifiers and regressors.
-# The following code provides an estimator that does linear regression.
-estimator = tf.estimator.LinearRegressor(feature_columns=feature_columns)
-
-# TensorFlow provides many helper methods to read and set up data sets.
-# Here we use two data sets: one for training and one for evaluation
-# We have to tell the function how many batches
-# of data (num_epochs) we want and how big each batch should be.
-x_train = np.array([1., 2., 3., 4.])
-y_train = np.array([0., -1., -2., -3.])
-x_eval = np.array([2., 5., 8., 1.])
-y_eval = np.array([-1.01, -4.1, -7, 0.])
-input_fn = tf.estimator.inputs.numpy_input_fn(
-    {"x": x_train}, y_train, batch_size=4, num_epochs=None, shuffle=True)
-train_input_fn = tf.estimator.inputs.numpy_input_fn(
-    {"x": x_train}, y_train, batch_size=4, num_epochs=1000, shuffle=False)
-eval_input_fn = tf.estimator.inputs.numpy_input_fn(
-    {"x": x_eval}, y_eval, batch_size=4, num_epochs=1000, shuffle=False)
-
-# We can invoke 1000 training steps by invoking the method and passing the
-# training data set.
-estimator.train(input_fn=input_fn, steps=1000)
-
-# Here we evaluate how well our model did.
-train_metrics = estimator.evaluate(input_fn=train_input_fn)
-eval_metrics = estimator.evaluate(input_fn=eval_input_fn)
-print("train metrics: %r"% train_metrics)
-print("eval metrics: %r"% eval_metrics)
-```
-When run, it produces something like
-```
-train metrics: {'average_loss': 1.4833182e-08, 'global_step': 1000, 'loss': 5.9332727e-08}
-eval metrics: {'average_loss': 0.0025353201, 'global_step': 1000, 'loss': 0.01014128}
-```
-Notice how our eval data has a higher loss, but it is still close to zero.
-That means we are learning properly.
-
-### A custom model
-
-`tf.estimator` does not lock you into its predefined models. Suppose we
-wanted to create a custom model that is not built into TensorFlow. We can still
-retain the high level abstraction of data set, feeding, training, etc. of
-`tf.estimator`. For illustration, we will show how to implement our own
-equivalent model to `LinearRegressor` using our knowledge of the lower level
-TensorFlow API.
-
-To define a custom model that works with `tf.estimator`, we need to use
-`tf.estimator.Estimator`. `tf.estimator.LinearRegressor` is actually
-a sub-class of `tf.estimator.Estimator`. Instead of sub-classing
-`Estimator`, we simply provide `Estimator` a function `model_fn` that tells
-`tf.estimator` how it can evaluate predictions, training steps, and
-loss. The code is as follows:
-
-```python
-import numpy as np
-import tensorflow as tf
-
-# Declare list of features, we only have one real-valued feature
-def model_fn(features, labels, mode):
-  # Build a linear model and predict values
-  W = tf.get_variable("W", [1], dtype=tf.float64)
-  b = tf.get_variable("b", [1], dtype=tf.float64)
-  y = W*features['x'] + b
-  # Loss sub-graph
-  loss = tf.reduce_sum(tf.square(y - labels))
-  # Training sub-graph
-  global_step = tf.train.get_global_step()
-  optimizer = tf.train.GradientDescentOptimizer(0.01)
-  train = tf.group(optimizer.minimize(loss),
-                   tf.assign_add(global_step, 1))
-  # EstimatorSpec connects subgraphs we built to the
-  # appropriate functionality.
-  return tf.estimator.EstimatorSpec(
-      mode=mode,
-      predictions=y,
-      loss=loss,
-      train_op=train)
-
-estimator = tf.estimator.Estimator(model_fn=model_fn)
-# define our data sets
-x_train = np.array([1., 2., 3., 4.])
-y_train = np.array([0., -1., -2., -3.])
-x_eval = np.array([2., 5., 8., 1.])
-y_eval = np.array([-1.01, -4.1, -7., 0.])
-input_fn = tf.estimator.inputs.numpy_input_fn(
-    {"x": x_train}, y_train, batch_size=4, num_epochs=None, shuffle=True)
-train_input_fn = tf.estimator.inputs.numpy_input_fn(
-    {"x": x_train}, y_train, batch_size=4, num_epochs=1000, shuffle=False)
-eval_input_fn = tf.estimator.inputs.numpy_input_fn(
-    {"x": x_eval}, y_eval, batch_size=4, num_epochs=1, shuffle=False)
-
-# train
-estimator.train(input_fn=input_fn, steps=1000)
-# Here we evaluate how well our model did.
-train_metrics = estimator.evaluate(input_fn=train_input_fn)
-eval_metrics = estimator.evaluate(input_fn=eval_input_fn)
-print("train metrics: %r"% train_metrics)
-print("eval metrics: %r"% eval_metrics)
-```
-When run, it produces
-```
-train metrics: {'loss': 1.227995e-11, 'global_step': 1000}
-eval metrics: {'loss': 0.01010036, 'global_step': 1000}
-```
-
-Notice how the contents of the custom `model_fn()` function are very similar
-to our manual model training loop from the lower level API.
-
-## Next steps
-
-Now you have a working knowledge of the basics of TensorFlow. We have several
-more tutorials that you can look at to learn more. If you are a beginner in
-machine learning see @{$beginners$MNIST for beginners},
-otherwise see @{$pros$Deep MNIST for experts}.
diff --git a/tensorflow/docs_src/get_started/get_started_for_beginners.md b/tensorflow/docs_src/get_started/get_started_for_beginners.md
new file mode 100644
index 0000000000000000000000000000000000000000..367c187e35ac5182b89e9a11cf8aec05e5250d57
--- /dev/null
+++ b/tensorflow/docs_src/get_started/get_started_for_beginners.md
@@ -0,0 +1,738 @@
+# Getting Started for ML Beginners
+
+This document explains how to use machine learning to classify (categorize)
+Iris flowers by species.  This document dives deeply into the TensorFlow
+code to do exactly that, explaining ML fundamentals along the way.
+
+If the following list describes you, then you are in the right place:
+
+*   You know little to nothing about machine learning.
+*   You want to learn how to write TensorFlow programs.
+*   You can code (at least a little) in Python.
+
+If you are already familiar with basic machine learning concepts
+but are new to TensorFlow, read
+@{$premade_estimators$Getting Started with TensorFlow: for ML Experts}.
+
+## The Iris classification problem
+
+Imagine you are a botanist seeking an automated way to classify each
+Iris flower you find.  Machine learning provides many ways to classify flowers.
+For instance, a sophisticated machine learning program could classify flowers
+based on photographs.  Our ambitions are more modest--we're going to classify
+Iris flowers based solely on the length and width of their
+[sepals](https://en.wikipedia.org/wiki/Sepal) and
+[petals](https://en.wikipedia.org/wiki/Petal).
+
+The Iris genus entails about 300 species, but our program will classify only
+the following three:
+
+*   Iris setosa
+*   Iris virginica
+*   Iris versicolor
+
+<div style="margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%"
+  alt="Petal geometry compared for three iris species: Iris setosa, Iris virginica, and Iris versicolor"
+  src="../images/iris_three_species.jpg">
+</div>
+
+**From left to right,
+[*Iris setosa*](https://commons.wikimedia.org/w/index.php?curid=170298) (by
+[Radomil](https://commons.wikimedia.org/wiki/User:Radomil), CC BY-SA 3.0),
+[*Iris versicolor*](https://commons.wikimedia.org/w/index.php?curid=248095) (by
+[Dlanglois](https://commons.wikimedia.org/wiki/User:Dlanglois), CC BY-SA 3.0),
+and [*Iris virginica*](https://www.flickr.com/photos/33397993@N05/3352169862)
+(by [Frank Mayfield](https://www.flickr.com/photos/33397993@N05), CC BY-SA
+2.0).**
+<p>&nbsp;</p>
+
+Fortunately, someone has already created [a data set of 120 Iris
+flowers](https://en.wikipedia.org/wiki/Iris_flower_data_set)
+with the sepal and petal measurements.  This data set has become
+one of the canonical introductions to machine learning classification problems.
+(The [MNIST database](https://en.wikipedia.org/wiki/MNIST_database),
+which contains handwritten digits, is another popular classification
+problem.) The first 5 entries of the Iris data set
+look as follows:
+
+| Sepal length | sepal width | petal length | petal width | species
+| ---          | ---         | ---          | ---         | ---
+|6.4           | 2.8         | 5.6          | 2.2         | 2
+|5.0           | 2.3         | 3.3          | 1.0         | 1
+|4.9           | 2.5         | 4.5          | 1.7         | 2
+|4.9           | 3.1         | 1.5          | 0.1         | 0
+|5.7           | 3.8         | 1.7          | 0.3         | 0
+
+Let's introduce some terms:
+
+*   The last column (species) is called the
+    [**label**](https://developers.google.com/machine-learning/glossary/#label);
+    the first four columns are called
+    [**features**](https://developers.google.com/machine-learning/glossary/#feature).
+    Features are characteristics of an example, while the label is
+    the thing we're trying to predict.
+
+*   An [**example**](https://developers.google.com/machine-learning/glossary/#example)
+    consists of the set of features and the label for one sample
+    flower. The preceding table shows 5 examples from a data set of
+    120 examples.
+
+Each label is naturally a string (for example, "setosa"), but machine learning
+typically relies on numeric values. Therefore, someone mapped each string to
+a number.  Here's the representation scheme:
+
+* 0 represents setosa
+* 1 represents versicolor
+* 2 represents virginica
+
+
+## Models and training
+
+A **model** is the relationship between features
+and the label.  For the Iris problem, the model defines the relationship
+between the sepal and petal measurements and the Iris species.
+Some simple models can be described with a few lines of algebra;
+more complex machine learning models
+contain such a large number of interlacing mathematical functions and
+parameters that they become hard to summarize mathematically.
+
+Could you determine the relationship between the four features and the
+Iris species *without* using machine learning?  That is, could you use
+traditional programming techniques (for example, a lot of conditional
+statements) to create a model?  Maybe. You could play with the data set
+long enough to determine the right relationships of petal and sepal
+measurements to particular species.  However, a good machine learning
+approach *determines the model for you*.  That is, if you feed enough
+representative examples into the right machine learning model type, the program
+will determine the relationship between sepals, petals, and species.
+
+**Training** is the stage of machine learning in which the model is
+gradually optimized (learned).  The Iris problem is an example
+of [**supervised machine
+learning**](https://developers.google.com/machine-learning/glossary/#supervised_machine_learning)
+in which a model is trained from examples that contain labels.  (In
+[**unsupervised machine
+learning**](https://developers.google.com/machine-learning/glossary/#unsupervised_machine_learning),
+the examples don't contain labels. Instead, the model typically finds
+patterns among the features.)
+
+
+
+
+## Get the sample program
+
+Prior to playing with the sample code in this document, do the following:
+
+1.  @{$install$Install TensorFlow}.
+2.  If you installed TensorFlow with virtualenv or Anaconda, activate your
+    TensorFlow environment.
+3.  Install or upgrade pandas by issuing the following command:
+
+     `pip install pandas`
+
+
+Take the following steps to get the sample program:
+
+1. Clone the TensorFlow Models repository from github by entering the following
+   command:
+
+       `git clone https://github.com/tensorflow/models`
+
+2. Change directory within that branch to the location containing the examples
+   used in this document:
+
+       `cd models/samples/core/get_started/`
+
+In that `get_started` directory, you'll find a program
+named `premade_estimator.py`.
+
+
+## Run the sample program
+
+You run TensorFlow programs as you would run any Python program. Therefore,
+issue the following command from a command line to
+run `premade_estimators.py`:
+
+``` bash
+python premade_estimator.py
+```
+
+Running the program should output a whole bunch of information ending with
+three prediction lines like the following:
+
+```None
+...
+Prediction is "Setosa" (99.6%), expected "Setosa"
+
+Prediction is "Versicolor" (99.8%), expected "Versicolor"
+
+Prediction is "Virginica" (97.9%), expected "Virginica"
+```
+
+If the program generates errors instead of predictions, ask yourself the
+following questions:
+
+* Did you install TensorFlow properly?
+* Are you using the correct version of TensorFlow?  The `premade_estimators.py`
+  program requires at least TensorFlow v1.4.
+* If you installed TensorFlow with virtualenv or Anaconda, did you activate
+  the environment?
+
+
+
+## The TensorFlow programming stack
+
+As the following illustration shows, TensorFlow
+provides a programming stack consisting of multiple API layers:
+
+<div style="margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/tensorflow_programming_environment.png">
+</div>
+
+**The TensorFlow Programming Environment.**
+<p>&nbsp;</p>
+
+As you start writing TensorFlow programs, we strongly recommend focusing on
+the following two high-level APIs:
+
+*   Estimators
+*   Datasets
+
+Although we'll grab an occasional convenience function from other APIs,
+this document focuses on the preceding two APIs.
+
+
+## The program itself
+
+Thanks for your patience; let's dig into the code.
+The general outline of `premade_estimator.py`--and many other TensorFlow
+programs--is as follows:
+
+*   Import and parse the data sets.
+*   Create feature columns to describe the data.
+*   Select the type of model
+*   Train the model.
+*   Evaluate the model's effectiveness.
+*   Let the trained model make predictions.
+
+The following subsections detail each part.
+
+
+### Import and parse the data sets
+
+The Iris program requires the data from the following two .csv files:
+
+*   `http://download.tensorflow.org/data/iris_training.csv`, which contains
+    the training set.
+*   `http://download.tensorflow.org/data/iris_test.csv`, which contains the
+    the test set.
+
+The **training set** contains the examples that we'll use to train the model;
+the **test set** contains the examples that we'll use to evaluate the trained
+model's effectiveness.
+
+The training set and test set started out as a
+single data set.  Then, someone split the examples, with the majority going into
+the training set and the remainder going into the test set.  Adding
+examples to the training set usually builds a better model; however, adding
+more examples to the test set enables us to better gauge the model's
+effectiveness. Regardless of the split, the examples in the test set
+must be separate from the examples in the training set.  Otherwise, you can't
+accurately determine the model's effectiveness.
+
+The `premade_estimators.py` program relies on the `load_data` function
+in the adjacent [`iris_data.py`](
+https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py)
+file to read in and parse the training set and test set.
+Here is a heavily commented version of the function:
+
+```python
+TRAIN_URL = "http://download.tensorflow.org/data/iris_training.csv"
+TEST_URL = "http://download.tensorflow.org/data/iris_test.csv"
+
+CSV_COLUMN_NAMES = ['SepalLength', 'SepalWidth',
+                    'PetalLength', 'PetalWidth', 'Species']
+
+...
+
+def load_data(label_name='Species'):
+    """Parses the csv file in TRAIN_URL and TEST_URL."""
+
+    # Create a local copy of the training set.
+    train_path = tf.keras.utils.get_file(fname=TRAIN_URL.split('/')[-1],
+                                         origin=TRAIN_URL)
+    # train_path now holds the pathname: ~/.keras/datasets/iris_training.csv
+
+    # Parse the local CSV file.
+    train = pd.read_csv(filepath_or_buffer=train_path,
+                        names=CSV_COLUMN_NAMES,  # list of column names
+                        header=0  # ignore the first row of the CSV file.
+                       )
+    # train now holds a pandas DataFrame, which is data structure
+    # analogous to a table.
+
+    # 1. Assign the DataFrame's labels (the right-most column) to train_label.
+    # 2. Delete (pop) the labels from the DataFrame.
+    # 3. Assign the remainder of the DataFrame to train_features
+    train_features, train_label = train, train.pop(label_name)
+
+    # Apply the preceding logic to the test set.
+    test_path = tf.keras.utils.get_file(TEST_URL.split('/')[-1], TEST_URL)
+    test = pd.read_csv(test_path, names=CSV_COLUMN_NAMES, header=0)
+    test_features, test_label = test, test.pop(label_name)
+
+    # Return four DataFrames.
+    return (train_features, train_label), (test_features, test_label)
+```
+
+Keras is an open-sourced machine learning library; `tf.keras` is a TensorFlow
+implementation of Keras.  The `premade_estimator.py` program only accesses
+one `tf.keras` function; namely, the `tf.keras.utils.get_file` convenience
+function, which copies a remote CSV file to a local file system.
+
+The call to `load_data` returns two `(feature,label)` pairs, for the training
+and test sets respectively:
+
+```python
+    # Call load_data() to parse the CSV file.
+    (train_feature, train_label), (test_feature, test_label) = load_data()
+```
+
+Pandas is an open-source Python library leveraged by several
+TensorFlow functions.  A pandas
+[**DataFrame**](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html)
+is a table with named columns headers and numbered rows.
+The features returned by `load_data` are packed in `DataFrames`.
+For example, the `test_feature` DataFrame looks as follows:
+
+```none
+    SepalLength  SepalWidth  PetalLength  PetalWidth
+0           5.9         3.0          4.2         1.5
+1           6.9         3.1          5.4         2.1
+2           5.1         3.3          1.7         0.5
+...
+27          6.7         3.1          4.7         1.5
+28          6.7         3.3          5.7         2.5
+29          6.4         2.9          4.3         1.3
+```
+
+
+### Describe the data
+
+A **feature column** is a data structure that tells your model
+how to interpret the data in each feature.  In the Iris problem,
+we want the model to interpret the data in each
+feature as its literal floating-point value; that is, we want the
+model to interpret an input value like 5.4 as, well, 5.4.  However,
+in other machine learning problems, it is often desirable to interpret
+data less literally.  Using feature columns to
+interpret data is such a rich topic that we devote an entire
+@{$feature_columns$document} to it.
+
+From a code perspective, you build a list of `feature_column` objects by calling
+functions from the @{tf.feature_column} module. Each object describes an input
+to the model. To tell the model to interpret data as a floating-point value,
+call @{tf.feature_column.numeric_column}.  In `premade_estimator.py`, all
+four features should be interpreted as literal floating-point values, so
+the code to create a feature column looks as follows:
+
+```python
+# Create feature columns for all features.
+my_feature_columns = []
+for key in train_x.keys():
+    my_feature_columns.append(tf.feature_column.numeric_column(key=key))
+```
+
+Here is a less elegant, but possibly clearer, alternative way to
+encode the preceding block:
+
+```python
+my_feature_columns = [
+    tf.feature_column.numeric_column(key='SepalLength'),
+    tf.feature_column.numeric_column(key='SepalWidth'),
+    tf.feature_column.numeric_column(key='PetalLength'),
+    tf.feature_column.numeric_column(key='PetalWidth')
+]
+```
+
+
+### Select the type of model
+
+We need to select the kind of model that will be trained.
+Lots of model types exist; picking the ideal type takes experience.
+We've selected a neural network to solve the Iris problem.  [**Neural
+networks**](https://developers.google.com/machine-learning/glossary/#neural_network)
+can find complex relationships between features and the label.
+A neural network is a highly-structured graph, organized into one or more
+[**hidden layers**](https://developers.google.com/machine-learning/glossary/#hidden_layer).
+Each hidden layer consists of one or more
+[**neurons**](https://developers.google.com/machine-learning/glossary/#neuron).
+There are several categories of neural networks.
+We'll be using a [**fully connected neural
+network**](https://developers.google.com/machine-learning/glossary/#fully_connected_layer),
+which means that the neurons in one layer take inputs from *every* neuron in
+the previous layer.  For example, the following figure illustrates a 
+fully connected neural network consisting of three hidden layers:
+
+*   The first hidden layer contains four neurons.
+*   The second hidden layer contains three neurons.
+*   The third hidden layer contains two neurons.
+
+<div style="margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/simple_dnn.svg">
+</div>
+
+**A neural network with three hidden layers.**
+<p>&nbsp;</p>
+
+To specify a model type, instantiate an
+[**Estimator**](https://developers.google.com/machine-learning/glossary/#Estimators)
+class.  TensorFlow provides two categories of Estimators:
+
+*   [**pre-made
+    Estimators**](https://developers.google.com/machine-learning/glossary/#pre-made_Estimator),
+    which someone else has already written for you.
+*   [**custom
+    Estimators**](https://developers.google.com/machine-learning/glossary/#custom_estimator),
+    which you must code yourself, at least partially.
+
+To implement a neural network, the `premade_estimators.py` program uses
+a pre-made Estimator named @{tf.estimator.DNNClassifier}.  This Estimator
+builds a neural network that classifies examples.  The following call
+instantiates `DNNClassifier`:
+
+```python
+    classifier = tf.estimator.DNNClassifier(
+        feature_columns=my_feature_columns,
+        hidden_units=[10, 10],
+        n_classes=3)
+```
+
+Use the `hidden_units` parameter to define the number of neurons
+in each hidden layer of the neural network.  Assign this parameter
+a list. For example:
+
+```python
+        hidden_units=[10, 10],
+```
+
+The length of the list assigned to `hidden_units` identifies the number of
+hidden layers (2, in this case).
+Each value in the list represents the number of neurons in a particular
+hidden layer (10 in the first hidden layer and 10 in the second hidden layer).
+To change the number of hidden layers or neurons, simply assign a different
+list to the `hidden_units` parameter.
+
+The ideal number of hidden layers and neurons depends on the problem
+and the data set. Like many aspects of machine learning,
+picking the ideal shape of the neural network requires some mixture
+of knowledge and experimentation.
+As a rule of thumb, increasing the number of hidden layers and neurons
+*typically* creates a more powerful model, which requires more data to
+train effectively.
+
+The `n_classes` parameter specifies the number of possible values that the
+neural network can predict.  Since the Iris problem classifies 3 Iris species,
+we set `n_classes` to 3.
+
+The constructor for `tf.Estimator.DNNClassifier` takes an optional argument
+named `optimizer`, which our sample code chose not to specify.  The
+[**optimizer**](https://developers.google.com/machine-learning/glossary/#optimizer)
+controls how the model will train.  As you develop more expertise in machine
+learning, optimizers and
+[**learning
+rate**](https://developers.google.com/machine-learning/glossary/#learning_rate)
+will become very important.
+
+
+
+### Train the model
+
+Instantiating a `tf.Estimator.DNNClassifier` creates a framework for learning 
+the model. Basically, we've wired a network but haven't yet let data flow 
+through it. To train the neural network, call the Estimator object's `train` 
+method. For example:
+
+```python
+    classifier.train(
+        input_fn=lambda:train_input_fn(train_feature, train_label, args.batch_size),
+        steps=args.train_steps)
+```
+
+The `steps` argument tells `train` to stop training after the specified
+number of iterations.  Increasing `steps` increases the amount of time
+the model will train.  Counter-intuitively, training a model longer
+does not guarantee a better model.  The default value of `args.train_steps`
+is 1000.  The number of steps to train is a
+[**hyperparameter**](https://developers.google.com/machine-learning/glossary/#hyperparameter)
+you can tune. Choosing the right number of steps usually
+requires both experience and experimentation.
+
+The `input_fn` parameter identifies the function that supplies the
+training data.  The call to the `train` method indicates that the
+`train_input_fn` function will supply the training data.  Here's that
+method's signature:
+
+```python
+def train_input_fn(features, labels, batch_size):
+```
+
+We're passing the following arguments to `train_input_fn`:
+
+* `train_feature` is a Python dictionary in which:
+    * Each key is the name of a feature.
+    * Each value is an array containing the values for each example in the
+      training set.
+* `train_label` is an array containing the values of the label for every
+  example in the training set.
+* `args.batch_size` is an integer defining the [**batch
+  size**](https://developers.google.com/machine-learning/glossary/#batch_size).
+
+The `train_input_fn` function relies on the **Dataset API**. This is a
+high-level TensorFlow API for reading data and transforming it into a form
+that the `train` method requires.  The following call converts the
+input features and labels into a `tf.data.Dataset` object, which is the base
+class of the Dataset API:
+
+```python
+    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
+```
+
+The `tf.dataset` class provides many useful functions for preparing examples
+for training. The following line calls three of those functions:
+
+```python
+    dataset = dataset.shuffle(buffer_size=1000).repeat(count=None).batch(batch_size)
+```
+
+Training works best if the training examples are in
+random order.  To randomize the examples, call
+`tf.data.Dataset.shuffle`.  Setting the `buffer_size` to a value
+larger than the number of examples (120) ensures that the data will
+be well shuffled.
+
+During training, the `train` method typically processes the
+examples multiple times.  Calling the
+`tf.data.Dataset.repeat` method without any arguments ensures
+that the `train` method has an infinite supply of (now shuffled)
+training set examples.
+
+The `train` method processes a
+[**batch**](https://developers.google.com/machine-learning/glossary/#batch)
+of examples at a time.
+The `tf.data.Dataset.batch` method creates a batch by
+concatenating multiple examples.
+This program sets the default [**batch
+size**](https://developers.google.com/machine-learning/glossary/#batch_size)
+to 100, meaning that the `batch` method will concatenate groups of
+100 examples.  The ideal batch size depends on the problem.  As a rule
+of thumb, smaller batch sizes usually enable the `train` method to train
+the model faster at the expense (sometimes) of accuracy.
+
+The following `return` statement passes a batch of examples back to
+the caller (the `train` method).
+
+```python
+   return dataset.make_one_shot_iterator().get_next()
+```
+
+
+### Evaluate the model
+
+**Evaluating** means determining how effectively the model makes
+predictions.  To determine the Iris classification model's effectiveness,
+pass some sepal and petal measurements to the model and ask the model
+to predict what Iris species they represent. Then compare the model's
+prediction against the actual label.  For example, a model that picked
+the correct species on half the input examples would have an
+[accuracy](https://developers.google.com/machine-learning/glossary/#accuracy)
+of 0.5.  The following suggests a more effective model:
+
+
+<table>
+  <tr>
+    <th style="background-color:darkblue" colspan="5">
+       Test Set</th>
+  </tr>
+  <tr>
+    <th colspan="4">Features</th>
+    <th colspan="1">Label</th>
+    <th colspan="1">Prediction</th>
+  </tr>
+  <tr> <td>5.9</td> <td>3.0</td> <td>4.3</td> <td>1.5</td> <td>1</td> 
+          <td style="background-color:green">1</td></tr>
+  <tr> <td>6.9</td> <td>3.1</td> <td>5.4</td> <td>2.1</td> <td>2</td> 
+          <td style="background-color:green">2</td></tr>
+  <tr> <td>5.1</td> <td>3.3</td> <td>1.7</td> <td>0.5</td> <td>0</td> 
+          <td style="background-color:green">0</td></tr>
+  <tr> <td>6.0</td> <td>3.4</td> <td>4.5</td> <td>1.6</td> <td>1</td> 
+          <td style="background-color:red">2</td></tr>
+  <tr> <td>5.5</td> <td>2.5</td> <td>4.0</td> <td>1.3</td> <td>1</td> 
+          <td style="background-color:green">1</td></tr>
+</table>
+
+**A model that is 80% accurate.**
+<p>&nbsp;</p>
+
+To evaluate a model's effectiveness, each Estimator provides an `evaluate`
+method.  The `premade_estimator.py` program calls `evaluate` as follows:
+
+```python
+# Evaluate the model.
+eval_result = classifier.evaluate(
+    input_fn=lambda:eval_input_fn(test_x, test_y, args.batch_size))
+
+print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))
+```
+
+The call to `classifier.evaluate` is similar to the call to `classifier.train`.
+The biggest difference is that `classifier.evaluate` must get its examples
+from the test set rather than the training set.  In other words, to
+fairly assess a model's effectiveness, the examples used to
+*evaluate* a model must be different from the examples used to *train*
+the model.  The `eval_input_fn` function serves a batch of examples from
+the test set.  Here's the `eval_input_fn` method:
+
+```python
+def eval_input_fn(features, labels=None, batch_size=None):
+    """An input function for evaluation or prediction"""
+    if labels is None:
+        # No labels, use only features.
+        inputs = features
+    else:
+        inputs = (features, labels)
+
+    # Convert inputs to a tf.dataset object.
+    dataset = tf.data.Dataset.from_tensor_slices(inputs)
+
+    # Batch the examples
+    assert batch_size is not None, "batch_size must not be None"
+    dataset = dataset.batch(batch_size)
+
+    # Return the read end of the pipeline.
+    return dataset.make_one_shot_iterator().get_next()
+```
+
+In brief, `eval_input_fn` does the following when called by
+`classifier.evaluate`:
+
+1.  Converts the features and labels from the test set to a `tf.dataset`
+    object.
+2.  Creates a batch of test set examples.  (There's no need to shuffle
+    or repeat the test set examples.)
+3.  Returns that batch of test set examples to `classifier.evaluate`.
+
+Running this code yields the following output (or something close to it):
+
+```none
+Test set accuracy: 0.967
+```
+
+An accuracy of 0.967 implies that our trained model correctly classified 29
+out of the 30 Iris species in the test set.
+
+
+### Predicting
+
+We've now trained a model and "proven" that it is good--but not
+perfect--at classifying Iris species.  Now let's use the trained
+model to make some predictions on [**unlabeled
+examples**](https://developers.google.com/machine-learning/glossary/#unlabeled_example);
+that is, on examples that contain features but not a label.
+
+In real-life, the unlabeled examples could come from lots of different
+sources including apps, CSV files, and data feeds.  For now, we're simply
+going to manually provide the following three unlabeled examples:
+
+```python
+    predict_x = {
+        'SepalLength': [5.1, 5.9, 6.9],
+        'SepalWidth': [3.3, 3.0, 3.1],
+        'PetalLength': [1.7, 4.2, 5.4],
+        'PetalWidth': [0.5, 1.5, 2.1],
+    }
+```
+
+Every Estimator provides a `predict` method, which `premade_estimator.py`
+calls as follows:
+
+```python
+predictions = classifier.predict(
+    input_fn=lambda:eval_input_fn(predict_x,
+                                  labels=None,
+                                  batch_size=args.batch_size))
+```
+
+As with the `evaluate` method, our `predict` method also gathers examples
+from the `eval_input_fn` method.
+
+When doing predictions, we're *not* passing labels to `eval_input_fn`.
+Therefore, `eval_input_fn` does the following:
+
+1.  Converts the features from the 3-element manual set we just created.
+2.  Creates a batch of 3 examples from that manual set.
+3.  Returns that batch of examples to `classifier.predict`.
+
+The `predict` method returns a python iterable, yielding a dictionary of
+prediction results for each example.  This dictionary contains several keys.
+The `probabilities` key holds a list of three floating-point values,
+each representing the probability that the input example is a particular
+Iris species.  For example, consider the following `probabilities` list:
+
+```none
+'probabilities': array([  1.19127117e-08,   3.97069454e-02,   9.60292995e-01])
+```
+
+The preceding list indicates:
+
+*   A negligible chance of the Iris being Setosa.
+*   A 3.97% chance of the Iris being Versicolor.
+*   A 96.0% chance of the Iris being Virginica.
+
+The `class_ids` key holds a one-element array that identifies the most
+probable species.  For example:
+
+```none
+'class_ids': array([2])
+```
+
+The number `2` corresponds to Virginica.  The following code iterates
+through the returned `predictions` to report on each prediction:
+
+``` python
+for pred_dict, expec in zip(predictions, expected):
+    template = ('\nPrediction is "{}" ({:.1f}%), expected "{}"')
+
+    class_id = pred_dict['class_ids'][0]
+    probability = pred_dict['probabilities'][class_id]
+    print(template.format(iris_data.SPECIES[class_id], 100 * probability, expec))
+```
+
+Running the program yields the following output:
+
+
+``` None
+...
+Prediction is "Setosa" (99.6%), expected "Setosa"
+
+Prediction is "Versicolor" (99.8%), expected "Versicolor"
+
+Prediction is "Virginica" (97.9%), expected "Virginica"
+```
+
+
+## Summary
+
+<!--TODO(barryr): When MLCC is released, add pointers to relevant sections.-->
+This document provides a short introduction to machine learning.
+
+Because `premade_estimators.py` relies on high-level APIs, much of the
+mathematical complexity in machine learning is hidden.
+If you intend to become more proficient in machine learning, we recommend
+ultimately learning more about [**gradient
+descent**](https://developers.google.com/machine-learning/glossary/#gradient_descent),
+batching, and neural networks.
+
+We recommend reading the @{$feature_columns$Feature Columns} document next,
+which explains how to represent different kinds of data in machine learning.
diff --git a/tensorflow/docs_src/get_started/index.md b/tensorflow/docs_src/get_started/index.md
index 003fac1a287688e1d1d343b1dcc834500fd20856..b7bd1286e3ce9026df49718d94cf53cf784a3be8 100644
--- a/tensorflow/docs_src/get_started/index.md
+++ b/tensorflow/docs_src/get_started/index.md
@@ -1,36 +1,35 @@
 # Getting Started
 
-For a brief overview of TensorFlow programming fundamentals, see the following
-guide:
-
-  * @{$get_started/get_started$Getting Started with TensorFlow}
-
-MNIST has become the canonical dataset for trying out a new machine learning
-toolkit.  We offer three guides that each demonstrate a different approach
-to training an MNIST model on TensorFlow:
-
-  * @{$mnist/beginners$MNIST for ML Beginners}, which introduces MNIST through
-    the high-level API.
-  * @{$mnist/pros$Deep MNIST for Experts}, which is more-in depth than
-    "MNIST for ML Beginners," and assumes some familiarity with machine
-    learning concepts.
-  * @{$mnist/mechanics$TensorFlow Mechanics 101}, which introduces MNIST through
-    the low-level API.
-
-For developers new to TensorFlow, the high-level API is a good place to start.
-To learn about the high-level API, read the following guides:
-
-  * @{$get_started/estimator$tf.estimator Quickstart}, which introduces this
-    API.
-  * @{$get_started/input_fn$Building Input Functions},
-    which takes you into a somewhat more sophisticated use of this API.
-
-TensorBoard is a utility to visualize different aspects of machine learning.
-The following guides explain how to use TensorBoard:
-
-  * @{$get_started/summaries_and_tensorboard$TensorBoard: Visualizing Learning},
-    which gets you started.
-  * @{$get_started/graph_viz$TensorBoard: Graph Visualization}, which explains
-    how to visualize the computational graph.  Graph visualization is typically
-    more useful for programmers using the low-level API.
-
+TensorFlow is a tool for machine learning. While it contains a wide range of
+functionality, TensorFlow is mainly designed for deep neural network models.
+
+TensorFlow provides many APIs. This section focuses on the high-level APIs.
+If you are new to TensorFlow, begin by reading one of the following documents:
+
+  * @{$get_started/get_started_for_beginners}, which is aimed at readers
+    new to machine learning.
+  * @{$get_started/premade_estimators}, which is aimed at readers who have
+    experience in machine learning.
+
+Then, read the following documents, which demonstrate the key features
+in the high-level APIs:
+
+  * @{$get_started/checkpoints}, which explains how to save training progress
+    and resume where you left off.
+  * @{$get_started/feature_columns}, which shows how an
+    Estimator can handle a variety of input data types without changes to the
+    model.
+  * @{$get_started/datasets_quickstart}, which introduces TensorFlow's
+    input pipelines.
+  * @{$get_started/custom_estimators}, which demonstrates how
+    to build and train models you design yourself.
+
+For more advanced users:
+
+  * The @{$low_level_intro$Low Level Introduction} demonstrates how to use
+    TensorFlow outside of the Estimator framework, for debugging and
+    experimentation.
+  * The @{$programmers_guide$Programmer's Guide} details major
+    TensorFlow components.
+  * The @{$tutorials$Tutorials} provide walkthroughs of a variety of
+    TensorFlow models.
diff --git a/tensorflow/docs_src/get_started/input_fn.md b/tensorflow/docs_src/get_started/input_fn.md
deleted file mode 100644
index f0dcdc47ff1fd70bc8fce670a51d0cef8234e4ba..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/get_started/input_fn.md
+++ /dev/null
@@ -1,438 +0,0 @@
-# Building Input Functions with tf.estimator
-
-This tutorial introduces you to creating input functions in tf.estimator.
-You'll get an overview of how to construct an `input_fn` to preprocess and feed
-data into your models. Then, you'll implement an `input_fn` that feeds training,
-evaluation, and prediction data into a neural network regressor for predicting
-median house values.
-
-## Custom Input Pipelines with input_fn
-
-The `input_fn` is used to pass feature and target data to the `train`,
-`evaluate`, and `predict` methods of the `Estimator`.
-The user can do feature engineering or pre-processing inside the `input_fn`.
-Here's an example taken from the @{$get_started/estimator$tf.estimator Quickstart tutorial}:
-
-```python
-import numpy as np
-
-training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
-    filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float32)
-
-train_input_fn = tf.estimator.inputs.numpy_input_fn(
-    x={"x": np.array(training_set.data)},
-    y=np.array(training_set.target),
-    num_epochs=None,
-    shuffle=True)
-
-classifier.train(input_fn=train_input_fn, steps=2000)
-```
-
-### Anatomy of an input_fn
-
-The following code illustrates the basic skeleton for an input function:
-
-```python
-def my_input_fn():
-
-    # Preprocess your data here...
-
-    # ...then return 1) a mapping of feature columns to Tensors with
-    # the corresponding feature data, and 2) a Tensor containing labels
-    return feature_cols, labels
-```
-
-The body of the input function contains the specific logic for preprocessing
-your input data, such as scrubbing out bad examples or
-[feature scaling](https://en.wikipedia.org/wiki/Feature_scaling).
-
-Input functions must return the following two values containing the final
-feature and label data to be fed into your model (as shown in the above code
-skeleton):
-
-<dl>
-  <dt><code>feature_cols</code></dt>
-  <dd>A dict containing key/value pairs that map feature column
-names to <code>Tensor</code>s (or <code>SparseTensor</code>s) containing the corresponding feature
-data.</dd>
-  <dt><code>labels</code></dt>
-  <dd>A <code>Tensor</code> containing your label (target) values: the values your model aims to predict.</dd>
-</dl>
-
-### Converting Feature Data to Tensors
-
-If your feature/label data is a python array or stored in
-[_pandas_](http://pandas.pydata.org/) dataframes or
-[numpy](http://www.numpy.org/) arrays, you can use the following methods to
-construct `input_fn`:
-
-```python
-import numpy as np
-# numpy input_fn.
-my_input_fn = tf.estimator.inputs.numpy_input_fn(
-    x={"x": np.array(x_data)},
-    y=np.array(y_data),
-    ...)
-```
-
-```python
-import pandas as pd
-# pandas input_fn.
-my_input_fn = tf.estimator.inputs.pandas_input_fn(
-    x=pd.DataFrame({"x": x_data}),
-    y=pd.Series(y_data),
-    ...)
-```
-
-For [sparse, categorical data](https://en.wikipedia.org/wiki/Sparse_matrix)
-(data where the majority of values are 0), you'll instead want to populate a
-`SparseTensor`, which is instantiated with three arguments:
-
-<dl>
-  <dt><code>dense_shape</code></dt>
-  <dd>The shape of the tensor. Takes a list indicating the number of elements in each dimension. For example, <code>dense_shape=[3,6]</code> specifies a two-dimensional 3x6 tensor, <code>dense_shape=[2,3,4]</code> specifies a three-dimensional 2x3x4 tensor, and <code>dense_shape=[9]</code> specifies a one-dimensional tensor with 9 elements.</dd>
-  <dt><code>indices</code></dt>
-  <dd>The indices of the elements in your tensor that contain nonzero values. Takes a list of terms, where each term is itself a list containing the index of a nonzero element. (Elements are zero-indexed—i.e., [0,0] is the index value for the element in the first column of the first row in a two-dimensional tensor.) For example, <code>indices=[[1,3], [2,4]]</code> specifies that the elements with indexes of [1,3] and [2,4] have nonzero values.</dd>
-  <dt><code>values</code></dt>
-  <dd>A one-dimensional tensor of values. Term <code>i</code> in <code>values</code> corresponds to term <code>i</code> in <code>indices</code> and specifies its value. For example, given <code>indices=[[1,3], [2,4]]</code>, the parameter <code>values=[18, 3.6]</code> specifies that element [1,3] of the tensor has a value of 18, and element [2,4] of the tensor has a value of 3.6.</dd>
-</dl>
-
-The following code defines a two-dimensional `SparseTensor` with 3 rows and 5
-columns. The element with index [0,1] has a value of 6, and the element with
-index [2,4] has a value of 0.5 (all other values are 0):
-
-```python
-sparse_tensor = tf.SparseTensor(indices=[[0,1], [2,4]],
-                                values=[6, 0.5],
-                                dense_shape=[3, 5])
-```
-
-This corresponds to the following dense tensor:
-
-```none
-[[0, 6, 0, 0, 0]
- [0, 0, 0, 0, 0]
- [0, 0, 0, 0, 0.5]]
-```
-
-For more on `SparseTensor`, see @{tf.SparseTensor}.
-
-### Passing input_fn Data to Your Model
-
-To feed data to your model for training, you simply pass the input function
-you've created to your `train` operation as the value of the `input_fn`
-parameter, e.g.:
-
-```python
-classifier.train(input_fn=my_input_fn, steps=2000)
-```
-
-Note that the `input_fn` parameter must receive a function object (i.e.,
-`input_fn=my_input_fn`), not the return value of a function call
-(`input_fn=my_input_fn()`). This means that if you try to pass parameters to the
-`input_fn` in your `train` call, as in the following code, it will result in a
-`TypeError`:
-
-```python
-classifier.train(input_fn=my_input_fn(training_set), steps=2000)
-```
-
-However, if you'd like to be able to parameterize your input function, there are
-other methods for doing so. You can employ a wrapper function that takes no
-arguments as your `input_fn` and use it to invoke your input function
-with the desired parameters. For example:
-
-```python
-def my_input_fn(data_set):
-  ...
-
-def my_input_fn_training_set():
-  return my_input_fn(training_set)
-
-classifier.train(input_fn=my_input_fn_training_set, steps=2000)
-```
-
-Alternatively, you can use Python's [`functools.partial`](https://docs.python.org/2/library/functools.html#functools.partial)
-function to construct a new function object with all parameter values fixed:
-
-```python
-classifier.train(
-    input_fn=functools.partial(my_input_fn, data_set=training_set),
-    steps=2000)
-```
-
-A third option is to wrap your `input_fn` invocation in a
-[`lambda`](https://docs.python.org/3/tutorial/controlflow.html#lambda-expressions)
-and pass it to the `input_fn` parameter:
-
-```python
-classifier.train(input_fn=lambda: my_input_fn(training_set), steps=2000)
-```
-
-One big advantage of designing your input pipeline as shown above—to accept a
-parameter for data set—is that you can pass the same `input_fn` to `evaluate`
-and `predict` operations by just changing the data set argument, e.g.:
-
-```python
-classifier.evaluate(input_fn=lambda: my_input_fn(test_set), steps=2000)
-```
-
-This approach enhances code maintainability: no need to define multiple
-`input_fn` (e.g. `input_fn_train`, `input_fn_test`, `input_fn_predict`) for each
-type of operation.
-
-Finally, you can use the methods in `tf.estimator.inputs` to create `input_fn`
-from numpy or pandas data sets. The additional benefit is that you can use
-more arguments, such as `num_epochs` and `shuffle` to control how the `input_fn`
-iterates over the data:
-
-```python
-import pandas as pd
-
-def get_input_fn_from_pandas(data_set, num_epochs=None, shuffle=True):
-  return tf.estimator.inputs.pandas_input_fn(
-      x=pd.DataFrame(...),
-      y=pd.Series(...),
-      num_epochs=num_epochs,
-      shuffle=shuffle)
-```
-
-```python
-import numpy as np
-
-def get_input_fn_from_numpy(data_set, num_epochs=None, shuffle=True):
-  return tf.estimator.inputs.numpy_input_fn(
-      x={...},
-      y=np.array(...),
-      num_epochs=num_epochs,
-      shuffle=shuffle)
-```
-
-### A Neural Network Model for Boston House Values
-
-In the remainder of this tutorial, you'll write an input function for
-preprocessing a subset of Boston housing data pulled from the UCI Housing Data
-Set and use it to feed data to
-a neural network regressor for predicting median house values.
-
-The [Boston CSV data sets](#setup) you'll use to train your neural network
-contain the following
-[feature data](https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.names)
-for Boston suburbs:
-
-Feature | Description
-------- | ---------------------------------------------------------------
-CRIM    | Crime rate per capita
-ZN      | Fraction of residential land zoned to permit 25,000+ sq ft lots
-INDUS   | Fraction of land that is non-retail business
-NOX     | Concentration of nitric oxides in parts per 10 million
-RM      | Average Rooms per dwelling
-AGE     | Fraction of owner-occupied residences built before 1940
-DIS     | Distance to Boston-area employment centers
-TAX     | Property tax rate per $10,000
-PTRATIO | Student-teacher ratio
-
-And the label your model will predict is MEDV, the median value of
-owner-occupied residences in thousands of dollars.
-
-## Setup {#setup}
-
-Download the following data sets:
-[boston_train.csv](http://download.tensorflow.org/data/boston_train.csv),
-[boston_test.csv](http://download.tensorflow.org/data/boston_test.csv), and
-[boston_predict.csv](http://download.tensorflow.org/data/boston_predict.csv).
-
-The following sections provide a step-by-step walkthrough of how to create an
-input function, feed these data sets into a neural network regressor, train and
-evaluate the model, and make house value predictions. The full, final code is [available
-here](https://www.tensorflow.org/code/tensorflow/examples/tutorials/input_fn/boston.py).
-
-### Importing the Housing Data
-
-To start, set up your imports (including `pandas` and `tensorflow`) and set logging verbosity to
-`INFO` for more detailed log output:
-
-```python
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import itertools
-
-import pandas as pd
-import tensorflow as tf
-
-tf.logging.set_verbosity(tf.logging.INFO)
-```
-
-Define the column names for the data set in `COLUMNS`. To distinguish features
-from the label, also define `FEATURES` and `LABEL`. Then read the three CSVs
-([train](http://download.tensorflow.org/data/boston_train.csv),
-[test](http://download.tensorflow.org/data/boston_test.csv), and
-[predict](http://download.tensorflow.org/data/boston_predict.csv)) into _pandas_
-`DataFrame`s:
-
-```python
-COLUMNS = ["crim", "zn", "indus", "nox", "rm", "age",
-           "dis", "tax", "ptratio", "medv"]
-FEATURES = ["crim", "zn", "indus", "nox", "rm",
-            "age", "dis", "tax", "ptratio"]
-LABEL = "medv"
-
-training_set = pd.read_csv("boston_train.csv", skipinitialspace=True,
-                           skiprows=1, names=COLUMNS)
-test_set = pd.read_csv("boston_test.csv", skipinitialspace=True,
-                       skiprows=1, names=COLUMNS)
-prediction_set = pd.read_csv("boston_predict.csv", skipinitialspace=True,
-                             skiprows=1, names=COLUMNS)
-```
-
-### Defining FeatureColumns and Creating the Regressor
-
-Next, create a list of `FeatureColumn`s for the input data, which formally
-specify the set of features to use for training. Because all features in the
-housing data set contain continuous values, you can create their
-`FeatureColumn`s using the `tf.contrib.layers.real_valued_column()` function:
-
-```python
-feature_cols = [tf.feature_column.numeric_column(k) for k in FEATURES]
-```
-
-NOTE: For a more in-depth overview of feature columns, see
-@{$linear#feature-columns-and-transformations$this introduction},
-and for an example that illustrates how to define `FeatureColumns` for
-categorical data, see the @{$wide$Linear Model Tutorial}.
-
-Now, instantiate a `DNNRegressor` for the neural network regression model.
-You'll need to provide two arguments here: `hidden_units`, a hyperparameter
-specifying the number of nodes in each hidden layer (here, two hidden layers
-with 10 nodes each), and `feature_columns`, containing the list of
-`FeatureColumns` you just defined:
-
-```python
-regressor = tf.estimator.DNNRegressor(feature_columns=feature_cols,
-                                      hidden_units=[10, 10],
-                                      model_dir="/tmp/boston_model")
-```
-
-### Building the input_fn
-
-To pass input data into the `regressor`, write a factory method that accepts a
-_pandas_ `Dataframe` and returns an `input_fn`:
-
-```python
-def get_input_fn(data_set, num_epochs=None, shuffle=True):
-  return tf.estimator.inputs.pandas_input_fn(
-      x=pd.DataFrame({k: data_set[k].values for k in FEATURES}),
-      y = pd.Series(data_set[LABEL].values),
-      num_epochs=num_epochs,
-      shuffle=shuffle)
-```
-
-Note that the input data is passed into `input_fn` in the `data_set` argument,
-which means the function can process any of the `DataFrame`s you've imported:
-`training_set`, `test_set`, and `prediction_set`.
-
-Two additional arguments are provided:
-* `num_epochs`: controls the number of
-  epochs to iterate over data. For training, set this to `None`, so the
-  `input_fn` keeps returning data until the required number of train steps is
-  reached. For evaluate and predict, set this to 1, so the `input_fn` will
-  iterate over the data once and then raise `OutOfRangeError`. That error will
-  signal the `Estimator` to stop evaluate or predict.
-* `shuffle`: Whether to shuffle the data. For evaluate and predict, set this to
-  `False`, so the `input_fn` iterates over the data sequentially. For train,
-  set this to `True`.
-
-### Training the Regressor
-
-To train the neural network regressor, run `train` with the `training_set`
-passed to the `input_fn` as follows:
-
-```python
-regressor.train(input_fn=get_input_fn(training_set), steps=5000)
-```
-
-You should see log output similar to the following, which reports training loss
-for every 100 steps:
-
-```none
-INFO:tensorflow:Step 1: loss = 483.179
-INFO:tensorflow:Step 101: loss = 81.2072
-INFO:tensorflow:Step 201: loss = 72.4354
-...
-INFO:tensorflow:Step 1801: loss = 33.4454
-INFO:tensorflow:Step 1901: loss = 32.3397
-INFO:tensorflow:Step 2001: loss = 32.0053
-INFO:tensorflow:Step 4801: loss = 27.2791
-INFO:tensorflow:Step 4901: loss = 27.2251
-INFO:tensorflow:Saving checkpoints for 5000 into /tmp/boston_model/model.ckpt.
-INFO:tensorflow:Loss for final step: 27.1674.
-```
-
-### Evaluating the Model
-
-Next, see how the trained model performs against the test data set. Run
-`evaluate`, and this time pass the `test_set` to the `input_fn`:
-
-```python
-ev = regressor.evaluate(
-    input_fn=get_input_fn(test_set, num_epochs=1, shuffle=False))
-```
-
-Retrieve the loss from the `ev` results and print it to output:
-
-```python
-loss_score = ev["loss"]
-print("Loss: {0:f}".format(loss_score))
-```
-
-You should see results similar to the following:
-
-```none
-INFO:tensorflow:Eval steps [0,1) for training step 5000.
-INFO:tensorflow:Saving evaluation summary for 5000 step: loss = 11.9221
-Loss: 11.922098
-```
-
-### Making Predictions
-
-Finally, you can use the model to predict median house values for the
-`prediction_set`, which contains feature data but no labels for six examples:
-
-```python
-y = regressor.predict(
-    input_fn=get_input_fn(prediction_set, num_epochs=1, shuffle=False))
-# .predict() returns an iterator of dicts; convert to a list and print
-# predictions
-predictions = list(p["predictions"] for p in itertools.islice(y, 6))
-print("Predictions: {}".format(str(predictions)))
-```
-
-Your results should contain six house-value predictions in thousands of dollars,
-e.g:
-
-```none
-Predictions: [ 33.30348587  17.04452896  22.56370163  34.74345398  14.55953979
-  19.58005714]
-```
-
-## Additional Resources
-
-This tutorial focused on creating an `input_fn` for a neural network regressor.
-To learn more about using `input_fn`s for other types of models, check out the
-following resources:
-
-*   @{$linear$Large-scale Linear Models with TensorFlow}: This
-    introduction to linear models in TensorFlow provides a high-level overview
-    of feature columns and techniques for transforming input data.
-
-*   @{$wide$TensorFlow Linear Model Tutorial}: This tutorial covers
-    creating `FeatureColumn`s and an `input_fn` for a linear classification
-    model that predicts income range based on census data.
-
-*   @{$wide_and_deep$TensorFlow Wide & Deep Learning Tutorial}: Building on
-    the @{$wide$Linear Model Tutorial}, this tutorial covers
-    `FeatureColumn` and `input_fn` creation for a "wide and deep" model that
-    combines a linear model and a neural network using
-    `DNNLinearCombinedClassifier`.
diff --git a/tensorflow/docs_src/get_started/leftnav_files b/tensorflow/docs_src/get_started/leftnav_files
index bb67eaddda369c0271c4fdb17a686016ffa80a2e..437791d6a32db3e43415e381a034424ae8225f6f 100644
--- a/tensorflow/docs_src/get_started/leftnav_files
+++ b/tensorflow/docs_src/get_started/leftnav_files
@@ -1,10 +1,11 @@
 index.md
-get_started.md
-mnist/beginners.md
-mnist/pros.md
-mnist/mechanics.md
-estimator.md
-input_fn.md
-summaries_and_tensorboard.md
-graph_viz.md
-tensorboard_histograms.md
+
+### Getting Started
+get_started_for_beginners.md
+premade_estimators.md
+
+### Details
+checkpoints.md
+feature_columns.md
+datasets_quickstart.md
+custom_estimators.md
diff --git a/tensorflow/docs_src/get_started/mnist/beginners.md b/tensorflow/docs_src/get_started/mnist/beginners.md
deleted file mode 100644
index 38c467ddc32c9ca21432cc7fe74a594446804293..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/get_started/mnist/beginners.md
+++ /dev/null
@@ -1,455 +0,0 @@
-# MNIST For ML Beginners
-
-*This tutorial is intended for readers who are new to both machine learning and
-TensorFlow. If you already know what MNIST is, and what softmax (multinomial
-logistic) regression is, you might prefer this
-@{$pros$faster paced tutorial}.  Be sure to
-@{$install$install TensorFlow} before starting either
-tutorial.*
-
-When one learns how to program, there's a tradition that the first thing you do
-is print "Hello World." Just like programming has Hello World, machine learning
-has MNIST.
-
-MNIST is a simple computer vision dataset. It consists of images of handwritten
-digits like these:
-
-<div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/MNIST.png">
-</div>
-
-It also includes labels for each image, telling us which digit it is. For
-example, the labels for the above images are 5, 0, 4, and 1.
-
-In this tutorial, we're going to train a model to look at images and predict
-what digits they are. Our goal isn't to train a really elaborate model that
-achieves state-of-the-art performance -- although we'll give you code to do that
-later! -- but rather to dip a toe into using TensorFlow. As such, we're going
-to start with a very simple model, called a Softmax Regression.
-
-The actual code for this tutorial is very short, and all the interesting
-stuff happens in just three lines. However, it is very
-important to understand the ideas behind it: both how TensorFlow works and the
-core machine learning concepts. Because of this, we are going to very carefully
-work through the code.
-
-## About this tutorial
-
-This tutorial is an explanation, line by line, of what is happening in the
-[mnist_softmax.py](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/mnist_softmax.py) code.
-
-You can use this tutorial in a few different ways, including:
-
-- Copy and paste each code snippet, line by line, into a Python environment as
-  you read through the explanations of each line.
-
-- Run the entire `mnist_softmax.py` Python file either before or after reading
-  through the explanations, and use this tutorial to understand the lines of
-  code that aren't clear to you.
-
-What we will accomplish in this tutorial:
-
-- Learn about the MNIST data and softmax regressions
-
-- Create a function that is a model for recognizing digits, based on looking at
-  every pixel in the image
-
-- Use TensorFlow to train the model to recognize digits by having it "look" at
-  thousands of examples (and run our first TensorFlow session to do so)
-
-- Check the model's accuracy with our test data
-
-## The MNIST Data
-
-The MNIST data is hosted on
-[Yann LeCun's website](http://yann.lecun.com/exdb/mnist/).  If you are copying and
-pasting in the code from this tutorial, start here with these two lines of code
-which will download and read in the data automatically:
-
-```python
-from tensorflow.examples.tutorials.mnist import input_data
-mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
-```
-
-The MNIST data is split into three parts: 55,000 data points of training
-data (`mnist.train`), 10,000 points of test data (`mnist.test`), and 5,000
-points of validation data (`mnist.validation`). This split is very important:
-it's essential in machine learning that we have separate data which we don't
-learn from so that we can make sure that what we've learned actually
-generalizes!
-
-As mentioned earlier, every MNIST data point has two parts: an image of a
-handwritten digit and a corresponding label. We'll call the images "x"
-and the labels "y". Both the training set and test set contain images and their
-corresponding labels; for example the training images are `mnist.train.images`
-and the training labels are `mnist.train.labels`.
-
-Each image is 28 pixels by 28 pixels. We can interpret this as a big array of
-numbers:
-
-<div style="width:50%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/MNIST-Matrix.png">
-</div>
-
-We can flatten this array into a vector of 28x28 = 784 numbers. It doesn't
-matter how we flatten the array, as long as we're consistent between images.
-From this perspective, the MNIST images are just a bunch of points in a
-784-dimensional vector space, with a
-[very rich structure](https://colah.github.io/posts/2014-10-Visualizing-MNIST/)
-(warning: computationally intensive visualizations).
-
-Flattening the data throws away information about the 2D structure of the image.
-Isn't that bad? Well, the best computer vision methods do exploit this
-structure, and we will in later tutorials. But the simple method we will be
-using here, a softmax regression (defined below), won't.
-
-The result is that `mnist.train.images` is a tensor (an n-dimensional array)
-with a shape of `[55000, 784]`. The first dimension is an index into the list
-of images and the second dimension is the index for each pixel in each image.
-Each entry in the tensor is a pixel intensity between 0 and 1, for a particular
-pixel in a particular image.
-
-<div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/mnist-train-xs.png">
-</div>
-
-Each image in MNIST has a corresponding label, a number between 0 and 9
-representing the digit drawn in the image.
-
-For the purposes of this tutorial, we're going to want our labels as "one-hot
-vectors". A one-hot vector is a vector which is 0 in most dimensions, and 1 in a
-single dimension. In this case, the \\(n\\)th digit will be represented as a
-vector which is 1 in the \\(n\\)th dimension. For example, 3 would be
-\\([0,0,0,1,0,0,0,0,0,0]\\).  Consequently, `mnist.train.labels` is a
-`[55000, 10]` array of floats.
-
-<div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/mnist-train-ys.png">
-</div>
-
-We're now ready to actually make our model!
-
-## Softmax Regressions
-
-We know that every image in MNIST is of a handwritten digit between zero and
-nine.  So there are only ten possible things that a given image can be. We want
-to be able to look at an image and give the probabilities for it being each
-digit. For example, our model might look at a picture of a nine and be 80% sure
-it's a nine, but give a 5% chance to it being an eight (because of the top loop)
-and a bit of probability to all the others because it isn't 100% sure.
-
-This is a classic case where a softmax regression is a natural, simple model.
-If you want to assign probabilities to an object being one of several different
-things, softmax is the thing to do, because softmax gives us a list of values
-between 0 and 1 that add up to 1. Even later on, when we train more sophisticated
-models, the final step will be a layer of softmax.
-
-A softmax regression has two steps: first we add up the evidence of our input
-being in certain classes, and then we convert that evidence into probabilities.
-
-To tally up the evidence that a given image is in a particular class, we do a
-weighted sum of the pixel intensities. The weight is negative if that pixel
-having a high intensity is evidence against the image being in that class, and
-positive if it is evidence in favor.
-
-The following diagram shows the weights one model learned for each of these
-classes. Red represents negative weights, while blue represents positive
-weights.
-
-<div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/softmax-weights.png">
-</div>
-
-We also add some extra evidence called a bias. Basically, we want to be able
-to say that some things are more likely independent of the input. The result is
-that the evidence for a class \\(i\\) given an input \\(x\\) is:
-
-$$\text{evidence}_i = \sum_j W_{i,~ j} x_j + b_i$$
-
-where \\(W_i\\) is the weights and \\(b_i\\) is the bias for class \\(i\\),
-and \\(j\\) is an index for summing over the pixels in our input image \\(x\\).
-We then convert the evidence tallies into our predicted probabilities
-\\(y\\) using the "softmax" function:
-
-$$y = \text{softmax}(\text{evidence})$$
-
-Here softmax is serving as an "activation" or "link" function, shaping
-the output of our linear function into the form we want -- in this case, a
-probability distribution over 10 cases.
-You can think of it as converting tallies
-of evidence into probabilities of our input being in each class.
-It's defined as:
-
-$$\text{softmax}(evidence) = \text{normalize}(\exp(evidence))$$
-
-If you expand that equation out, you get:
-
-$$\text{softmax}(evidence)_i = \frac{\exp(evidence_i)}{\sum_j \exp(evidence_j)}$$
-
-But it's often more helpful to think of softmax the first way: exponentiating
-its inputs and then normalizing them.  The exponentiation means that one more
-unit of evidence increases the weight given to any hypothesis multiplicatively.
-And conversely, having one less unit of evidence means that a hypothesis gets a
-fraction of its earlier weight. No hypothesis ever has zero or negative
-weight. Softmax then normalizes these weights, so that they add up to one,
-forming a valid probability distribution. (To get more intuition about the
-softmax function, check out the
-[section](http://neuralnetworksanddeeplearning.com/chap3.html#softmax) on it in
-Michael Nielsen's book, complete with an interactive visualization.)
-
-You can picture our softmax regression as looking something like the following,
-although with a lot more \\(x\\)s. For each output, we compute a weighted sum of
-the \\(x\\)s, add a bias, and then apply softmax.
-
-<div style="width:55%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/softmax-regression-scalargraph.png">
-</div>
-
-If we write that out as equations, we get:
-
-<div style="width:52%; margin-left:25%; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/softmax-regression-scalarequation.png"
-   alt="[y1, y2, y3] = softmax(W11*x1 + W12*x2 + W13*x3 + b1,  W21*x1 + W22*x2 + W23*x3 + b2,  W31*x1 + W32*x2 + W33*x3 + b3)">
-</div>
-
-We can "vectorize" this procedure, turning it into a matrix multiplication
-and vector addition. This is helpful for computational efficiency. (It's also
-a useful way to think.)
-
-<div style="width:50%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/softmax-regression-vectorequation.png"
- alt="[y1, y2, y3] = softmax([[W11, W12, W13], [W21, W22, W23], [W31, W32, W33]]*[x1, x2, x3] + [b1, b2, b3])">
-</div>
-
-More compactly, we can just write:
-
-$$y = \text{softmax}(Wx + b)$$
-
-Now let's turn that into something that TensorFlow can use.
-
-## Implementing the Regression
-
-
-To do efficient numerical computing in Python, we typically use libraries like
-[NumPy](http://www.numpy.org) that do expensive operations such as matrix
-multiplication outside Python, using highly efficient code implemented in
-another language.  Unfortunately, there can still be a lot of overhead from
-switching back to Python every operation. This overhead is especially bad if you
-want to run computations on GPUs or in a distributed manner, where there can be
-a high cost to transferring data.
-
-TensorFlow also does its heavy lifting outside Python, but it takes things a
-step further to avoid this overhead.  Instead of running a single expensive
-operation independently from Python, TensorFlow lets us describe a graph of
-interacting operations that run entirely outside Python. (Approaches like this
-can be seen in a few machine learning libraries.)
-
-To use TensorFlow, first we need to import it.
-
-```python
-import tensorflow as tf
-```
-
-We describe these interacting operations by manipulating symbolic variables.
-Let's create one:
-
-```python
-x = tf.placeholder(tf.float32, [None, 784])
-```
-
-`x` isn't a specific value. It's a `placeholder`, a value that we'll input when
-we ask TensorFlow to run a computation. We want to be able to input any number
-of MNIST images, each flattened into a 784-dimensional vector. We represent
-this as a 2-D tensor of floating-point numbers, with a shape `[None, 784]`.
-(Here `None` means that a dimension can be of any length.)
-
-We also need the weights and biases for our model. We could imagine treating
-these like additional inputs, but TensorFlow has an even better way to handle
-it: `Variable`.  A `Variable` is a modifiable tensor that lives in TensorFlow's
-graph of interacting operations. It can be used and even modified by the
-computation. For machine learning applications, one generally has the model
-parameters be `Variable`s.
-
-```python
-W = tf.Variable(tf.zeros([784, 10]))
-b = tf.Variable(tf.zeros([10]))
-```
-
-We create these `Variable`s by giving `tf.Variable` the initial value of the
-`Variable`: in this case, we initialize both `W` and `b` as tensors full of
-zeros. Since we are going to learn `W` and `b`, it doesn't matter very much
-what they initially are.
-
-Notice that `W` has a shape of [784, 10] because we want to multiply the
-784-dimensional image vectors by it to produce 10-dimensional vectors of
-evidence for the difference classes. `b` has a shape of [10] so we can add it
-to the output.
-
-We can now implement our model. It only takes one line to define it!
-
-```python
-y = tf.nn.softmax(tf.matmul(x, W) + b)
-```
-
-First, we multiply `x` by `W` with the expression `tf.matmul(x, W)`. This is
-flipped from when we multiplied them in our equation, where we had \\(Wx\\), as
-a small trick to deal with `x` being a 2D tensor with multiple inputs. We then
-add `b`, and finally apply `tf.nn.softmax`.
-
-That's it. It only took us one line to define our model, after a couple short
-lines of setup. That isn't because TensorFlow is designed to make a softmax
-regression particularly easy: it's just a very flexible way to describe many
-kinds of numerical computations, from machine learning models to physics
-simulations. And once defined, our model can be run on different devices:
-your computer's CPU, GPUs, and even phones!
-
-
-## Training
-
-In order to train our model, we need to define what it means for the model to be
-good. Well, actually, in machine learning we typically define what it means for
-a model to be bad. We call this the cost, or the loss, and it represents how far
-off our model is from our desired outcome. We try to minimize that error, and
-the smaller the error margin, the better our model is.
-
-One very common, very nice function to determine the loss of a model is called
-"cross-entropy." Cross-entropy arises from thinking about information
-compressing codes in information theory but it winds up being an important idea
-in lots of areas, from gambling to machine learning. It's defined as:
-
-$$H_{y'}(y) = -\sum_i y'_i \log(y_i)$$
-
-Where \\(y\\) is our predicted probability distribution, and \\(y'\\) is the true
-distribution (the one-hot vector with the digit labels).  In some rough sense, the
-cross-entropy is measuring how inefficient our predictions are for describing
-the truth. Going into more detail about cross-entropy is beyond the scope of
-this tutorial, but it's well worth
-[understanding](https://colah.github.io/posts/2015-09-Visual-Information).
-
-To implement cross-entropy we need to first add a new placeholder to input the
-correct answers:
-
-```python
-y_ = tf.placeholder(tf.float32, [None, 10])
-```
-
-Then we can implement the cross-entropy function, \\(-\sum y'\log(y)\\):
-
-```python
-cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
-```
-
-First, `tf.log` computes the logarithm of each element of `y`. Next, we multiply
-each element of `y_` with the corresponding element of `tf.log(y)`. Then
-`tf.reduce_sum` adds the elements in the second dimension of y, due to the
-`reduction_indices=[1]` parameter. Finally, `tf.reduce_mean` computes the mean
-over all the examples in the batch.
-
-Note that in the source code, we don't use this formulation, because it is
-numerically unstable.  Instead, we apply
-`tf.nn.softmax_cross_entropy_with_logits` on the unnormalized logits (e.g., we
-call `softmax_cross_entropy_with_logits` on `tf.matmul(x, W) + b`), because this
-more numerically stable function internally computes the softmax activation.  In
-your code, consider using `tf.nn.softmax_cross_entropy_with_logits`
-instead.
-
-Now that we know what we want our model to do, it's very easy to have TensorFlow
-train it to do so.  Because TensorFlow knows the entire graph of your
-computations, it can automatically use the
-[backpropagation algorithm](https://colah.github.io/posts/2015-08-Backprop) to
-efficiently determine how your variables affect the loss you ask it to
-minimize. Then it can apply your choice of optimization algorithm to modify the
-variables and reduce the loss.
-
-```python
-train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
-```
-
-In this case, we ask TensorFlow to minimize `cross_entropy` using the
-[gradient descent algorithm](https://en.wikipedia.org/wiki/Gradient_descent)
-with a learning rate of 0.5. Gradient descent is a simple procedure, where
-TensorFlow simply shifts each variable a little bit in the direction that
-reduces the cost. But TensorFlow also provides
-@{$python/train#Optimizers$many other optimization algorithms}:
-using one is as simple as tweaking one line.
-
-What TensorFlow actually does here, behind the scenes, is to add new operations
-to your graph which implement backpropagation and gradient descent. Then it
-gives you back a single operation which, when run, does a step of gradient
-descent training, slightly tweaking your variables to reduce the loss.
-
-
-We can now launch the model in an `InteractiveSession`:
-
-```python
-sess = tf.InteractiveSession()
-```
-
-We first have to create an operation to initialize the variables we created:
-
-```python
-tf.global_variables_initializer().run()
-```
-
-
-Let's train -- we'll run the training step 1000 times!
-
-```python
-for _ in range(1000):
-  batch_xs, batch_ys = mnist.train.next_batch(100)
-  sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
-```
-
-Each step of the loop, we get a "batch" of one hundred random data points from
-our training set. We run `train_step` feeding in the batches data to replace
-the `placeholder`s.
-
-Using small batches of random data is called stochastic training -- in this
-case, stochastic gradient descent. Ideally, we'd like to use all our data for
-every step of training because that would give us a better sense of what we
-should be doing, but that's expensive. So, instead, we use a different subset
-every time. Doing this is cheap and has much of the same benefit.
-
-
-
-## Evaluating Our Model
-
-How well does our model do?
-
-Well, first let's figure out where we predicted the correct label. `tf.argmax`
-is an extremely useful function which gives you the index of the highest entry
-in a tensor along some axis. For example, `tf.argmax(y,1)` is the label our
-model thinks is most likely for each input, while `tf.argmax(y_,1)` is the
-correct label. We can use `tf.equal` to check if our prediction matches the
-truth.
-
-```python
-correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
-```
-
-That gives us a list of booleans. To determine what fraction are correct, we
-cast to floating point numbers and then take the mean. For example,
-`[True, False, True, True]` would become `[1,0,1,1]` which would become `0.75`.
-
-```python
-accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
-```
-
-Finally, we ask for our accuracy on our test data.
-
-```python
-print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
-```
-
-This should be about 92%.
-
-Is that good? Well, not really. In fact, it's pretty bad. This is because we're
-using a very simple model. With some small changes, we can get to 97%. The best
-models can get to over 99.7% accuracy! (For more information, have a look at
-this
-[list of results](https://rodrigob.github.io/are_we_there_yet/build/classification_datasets_results).)
-
-What matters is that we learned from this model. Still, if you're feeling a bit
-down about these results, check out
-@{$pros$the next tutorial} where we do a lot
-better, and learn how to build more sophisticated models using TensorFlow!
diff --git a/tensorflow/docs_src/get_started/mnist/mechanics.md b/tensorflow/docs_src/get_started/mnist/mechanics.md
deleted file mode 100644
index 27fae45b5b0b4126132556cfac312fbb3c4f515a..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/get_started/mnist/mechanics.md
+++ /dev/null
@@ -1,489 +0,0 @@
-# TensorFlow Mechanics 101
-
-Code: [tensorflow/examples/tutorials/mnist/](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/)
-
-The goal of this tutorial is to show how to use TensorFlow to train and
-evaluate a simple feed-forward neural network for handwritten digit
-classification using the (classic) MNIST data set.  The intended audience for
-this tutorial is experienced machine learning users interested in using
-TensorFlow.
-
-These tutorials are not intended for teaching Machine Learning in general.
-
-Please ensure you have followed the instructions to
-@{$install$install TensorFlow}.
-
-## Tutorial Files
-
-This tutorial references the following files:
-
-File | Purpose
---- | ---
-[`mnist.py`](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/mnist.py) | The code to build a fully-connected MNIST model.
-[`fully_connected_feed.py`](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/fully_connected_feed.py) | The main code to train the built MNIST model against the downloaded dataset using a feed dictionary.
-
-Simply run the `fully_connected_feed.py` file directly to start training:
-
-```bash
-python fully_connected_feed.py
-```
-
-## Prepare the Data
-
-MNIST is a classic problem in machine learning. The problem is to look at
-greyscale 28x28 pixel images of handwritten digits and determine which digit
-the image represents, for all the digits from zero to nine.
-
-![MNIST Digits](https://www.tensorflow.org/images/mnist_digits.png "MNIST Digits")
-
-For more information, refer to [Yann LeCun's MNIST page](http://yann.lecun.com/exdb/mnist/)
-or [Chris Olah's visualizations of MNIST](http://colah.github.io/posts/2014-10-Visualizing-MNIST/).
-
-### Download
-
-At the top of the `run_training()` method, the `input_data.read_data_sets()`
-function will ensure that the correct data has been downloaded to your local
-training folder and then unpack that data to return a dictionary of `DataSet`
-instances.
-
-```python
-data_sets = input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data)
-```
-
-**NOTE**: The `fake_data` flag is used for unit-testing purposes and may be
-safely ignored by the reader.
-
-Dataset | Purpose
---- | ---
-`data_sets.train` | 55000 images and labels, for primary training.
-`data_sets.validation` | 5000 images and labels, for iterative validation of training accuracy.
-`data_sets.test` | 10000 images and labels, for final testing of trained accuracy.
-
-### Inputs and Placeholders
-
-The `placeholder_inputs()` function creates two @{tf.placeholder}
-ops that define the shape of the inputs, including the `batch_size`, to the
-rest of the graph and into which the actual training examples will be fed.
-
-```python
-images_placeholder = tf.placeholder(tf.float32, shape=(batch_size,
-                                                       mnist.IMAGE_PIXELS))
-labels_placeholder = tf.placeholder(tf.int32, shape=(batch_size))
-```
-
-Further down, in the training loop, the full image and label datasets are
-sliced to fit the `batch_size` for each step, matched with these placeholder
-ops, and then passed into the `sess.run()` function using the `feed_dict`
-parameter.
-
-## Build the Graph
-
-After creating placeholders for the data, the graph is built from the
-`mnist.py` file according to a 3-stage pattern: `inference()`, `loss()`, and
-`training()`.
-
-1.  `inference()` - Builds the graph as far as required for running
-the network forward to make predictions.
-1.  `loss()` - Adds to the inference graph the ops required to generate
-loss.
-1.  `training()` - Adds to the loss graph the ops required to compute
-and apply gradients.
-
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="https://www.tensorflow.org/images/mnist_subgraph.png">
-</div>
-
-### Inference
-
-The `inference()` function builds the graph as far as needed to
-return the tensor that would contain the output predictions.
-
-It takes the images placeholder as input and builds on top
-of it a pair of fully connected layers with [ReLU](https://en.wikipedia.org/wiki/Rectifier_(neural_networks)) activation followed by a ten
-node linear layer specifying the output logits.
-
-Each layer is created beneath a unique @{tf.name_scope}
-that acts as a prefix to the items created within that scope.
-
-```python
-with tf.name_scope('hidden1'):
-```
-
-Within the defined scope, the weights and biases to be used by each of these
-layers are generated into @{tf.Variable}
-instances, with their desired shapes:
-
-```python
-weights = tf.Variable(
-    tf.truncated_normal([IMAGE_PIXELS, hidden1_units],
-                        stddev=1.0 / math.sqrt(float(IMAGE_PIXELS))),
-    name='weights')
-biases = tf.Variable(tf.zeros([hidden1_units]),
-                     name='biases')
-```
-
-When, for instance, these are created under the `hidden1` scope, the unique
-name given to the weights variable would be "`hidden1/weights`".
-
-Each variable is given initializer ops as part of their construction.
-
-In this most common case, the weights are initialized with the
-@{tf.truncated_normal}
-and given their shape of a 2-D tensor with
-the first dim representing the number of units in the layer from which the
-weights connect and the second dim representing the number of
-units in the layer to which the weights connect.  For the first layer, named
-`hidden1`, the dimensions are `[IMAGE_PIXELS, hidden1_units]` because the
-weights are connecting the image inputs to the hidden1 layer.  The
-`tf.truncated_normal` initializer generates a random distribution with a given
-mean and standard deviation.
-
-Then the biases are initialized with @{tf.zeros}
-to ensure they start with all zero values, and their shape is simply the number
-of units in the layer to which they connect.
-
-The graph's three primary ops -- two @{tf.nn.relu}
-ops wrapping @{tf.matmul}
-for the hidden layers and one extra `tf.matmul` for the logits -- are then
-created, each in turn, with separate `tf.Variable` instances connected to each
-of the input placeholders or the output tensors of the previous layer.
-
-```python
-hidden1 = tf.nn.relu(tf.matmul(images, weights) + biases)
-```
-
-```python
-hidden2 = tf.nn.relu(tf.matmul(hidden1, weights) + biases)
-```
-
-```python
-logits = tf.matmul(hidden2, weights) + biases
-```
-
-Finally, the `logits` tensor that will contain the output is returned.
-
-### Loss
-
-The `loss()` function further builds the graph by adding the required loss
-ops.
-
-First, the values from the `labels_placeholder` are converted to 64-bit integers. Then, a @{tf.nn.sparse_softmax_cross_entropy_with_logits} op is added to automatically produce 1-hot labels from the `labels_placeholder` and compare the output logits from the `inference()` function with those 1-hot labels.
-
-```python
-labels = tf.to_int64(labels)
-cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
-    labels=labels, logits=logits, name='xentropy')
-```
-
-It then uses @{tf.reduce_mean}
-to average the cross entropy values across the batch dimension (the first
-dimension) as the total loss.
-
-```python
-loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
-```
-
-And the tensor that will then contain the loss value is returned.
-
-> Note: Cross-entropy is an idea from information theory that allows us
-> to describe how bad it is to believe the predictions of the neural network,
-> given what is actually true. For more information, read the blog post Visual
-> Information Theory (http://colah.github.io/posts/2015-09-Visual-Information/)
-
-### Training
-
-The `training()` function adds the operations needed to minimize the loss via
-[Gradient Descent](https://en.wikipedia.org/wiki/Gradient_descent).
-
-Firstly, it takes the loss tensor from the `loss()` function and hands it to a
-@{tf.summary.scalar},
-an op for generating summary values into the events file when used with a
-@{tf.summary.FileWriter} (see below).  In this case, it will emit the snapshot value of
-the loss every time the summaries are written out.
-
-```python
-tf.summary.scalar('loss', loss)
-```
-
-Next, we instantiate a @{tf.train.GradientDescentOptimizer}
-responsible for applying gradients with the requested learning rate.
-
-```python
-optimizer = tf.train.GradientDescentOptimizer(learning_rate)
-```
-
-We then generate a single variable to contain a counter for the global
-training step and the @{tf.train.Optimizer.minimize}
-op is used to both update the trainable weights in the system and increment the
-global step.  This op is, by convention, known as the `train_op` and is what must
-be run by a TensorFlow session in order to induce one full step of training
-(see below).
-
-```python
-global_step = tf.Variable(0, name='global_step', trainable=False)
-train_op = optimizer.minimize(loss, global_step=global_step)
-```
-
-## Train the Model
-
-Once the graph is built, it can be iteratively trained and evaluated in a loop
-controlled by the user code in `fully_connected_feed.py`.
-
-### The Graph
-
-At the top of the `run_training()` function is a python `with` command that
-indicates all of the built ops are to be associated with the default
-global @{tf.Graph}
-instance.
-
-```python
-with tf.Graph().as_default():
-```
-
-A `tf.Graph` is a collection of ops that may be executed together as a group.
-Most TensorFlow uses will only need to rely on the single default graph.
-
-More complicated uses with multiple graphs are possible, but beyond the scope of
-this simple tutorial.
-
-### The Session
-
-Once all of the build preparation has been completed and all of the necessary
-ops generated, a @{tf.Session}
-is created for running the graph.
-
-```python
-sess = tf.Session()
-```
-
-Alternately, a `Session` may be generated into a `with` block for scoping:
-
-```python
-with tf.Session() as sess:
-```
-
-The empty parameter to session indicates that this code will attach to
-(or create if not yet created) the default local session.
-
-Immediately after creating the session, all of the `tf.Variable`
-instances are initialized by calling @{tf.Session.run}
-on their initialization op.
-
-```python
-init = tf.global_variables_initializer()
-sess.run(init)
-```
-
-The @{tf.Session.run}
-method will run the complete subset of the graph that
-corresponds to the op(s) passed as parameters.  In this first call, the `init`
-op is a @{tf.group}
-that contains only the initializers for the variables.  None of the rest of the
-graph is run here; that happens in the training loop below.
-
-### Train Loop
-
-After initializing the variables with the session, training may begin.
-
-The user code controls the training per step, and the simplest loop that
-can do useful training is:
-
-```python
-for step in xrange(FLAGS.max_steps):
-    sess.run(train_op)
-```
-
-However, this tutorial is slightly more complicated in that it must also slice
-up the input data for each step to match the previously generated placeholders.
-
-#### Feed the Graph
-
-For each step, the code will generate a feed dictionary that will contain the
-set of examples on which to train for the step, keyed by the placeholder
-ops they represent.
-
-In the `fill_feed_dict()` function, the given `DataSet` is queried for its next
-`batch_size` set of images and labels, and tensors matching the placeholders are
-filled containing the next images and labels.
-
-```python
-images_feed, labels_feed = data_set.next_batch(FLAGS.batch_size,
-                                               FLAGS.fake_data)
-```
-
-A python dictionary object is then generated with the placeholders as keys and
-the representative feed tensors as values.
-
-```python
-feed_dict = {
-    images_placeholder: images_feed,
-    labels_placeholder: labels_feed,
-}
-```
-
-This is passed into the `sess.run()` function's `feed_dict` parameter to provide
-the input examples for this step of training.
-
-#### Check the Status
-
-The code specifies two values to fetch in its run call: `[train_op, loss]`.
-
-```python
-for step in xrange(FLAGS.max_steps):
-    feed_dict = fill_feed_dict(data_sets.train,
-                               images_placeholder,
-                               labels_placeholder)
-    _, loss_value = sess.run([train_op, loss],
-                             feed_dict=feed_dict)
-```
-
-Because there are two values to fetch, `sess.run()` returns a tuple with two
-items.  Each `Tensor` in the list of values to fetch corresponds to a numpy
-array in the returned tuple, filled with the value of that tensor during this
-step of training. Since `train_op` is an `Operation` with no output value, the
-corresponding element in the returned tuple is `None` and, thus,
-discarded. However, the value of the `loss` tensor may become NaN if the model
-diverges during training, so we capture this value for logging.
-
-Assuming that the training runs fine without NaNs, the training loop also
-prints a simple status text every 100 steps to let the user know the state of
-training.
-
-```python
-if step % 100 == 0:
-    print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
-```
-
-#### Visualize the Status
-
-In order to emit the events files used by @{$summaries_and_tensorboard$TensorBoard},
-all of the summaries (in this case, only one) are collected into a single Tensor
-during the graph building phase.
-
-```python
-summary = tf.summary.merge_all()
-```
-
-And then after the session is created, a @{tf.summary.FileWriter}
-may be instantiated to write the events files, which
-contain both the graph itself and the values of the summaries.
-
-```python
-summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
-```
-
-Lastly, the events file will be updated with new summary values every time the
-`summary` is evaluated and the output passed to the writer's `add_summary()`
-function.
-
-```python
-summary_str = sess.run(summary, feed_dict=feed_dict)
-summary_writer.add_summary(summary_str, step)
-```
-
-When the events files are written, TensorBoard may be run against the training
-folder to display the values from the summaries.
-
-![MNIST TensorBoard](https://www.tensorflow.org/images/mnist_tensorboard.png "MNIST TensorBoard")
-
-**NOTE**: For more info about how to build and run Tensorboard, please see the accompanying tutorial @{$summaries_and_tensorboard$Tensorboard: Visualizing Learning}.
-
-#### Save a Checkpoint
-
-In order to emit a checkpoint file that may be used to later restore a model
-for further training or evaluation, we instantiate a
-@{tf.train.Saver}.
-
-```python
-saver = tf.train.Saver()
-```
-
-In the training loop, the @{tf.train.Saver.save}
-method will periodically be called to write a checkpoint file to the training
-directory with the current values of all the trainable variables.
-
-```python
-saver.save(sess, FLAGS.train_dir, global_step=step)
-```
-
-At some later point in the future, training might be resumed by using the
-@{tf.train.Saver.restore}
-method to reload the model parameters.
-
-```python
-saver.restore(sess, FLAGS.train_dir)
-```
-
-## Evaluate the Model
-
-Every thousand steps, the code will attempt to evaluate the model against both
-the training and test datasets.  The `do_eval()` function is called thrice, for
-the training, validation, and test datasets.
-
-```python
-print('Training Data Eval:')
-do_eval(sess,
-        eval_correct,
-        images_placeholder,
-        labels_placeholder,
-        data_sets.train)
-print('Validation Data Eval:')
-do_eval(sess,
-        eval_correct,
-        images_placeholder,
-        labels_placeholder,
-        data_sets.validation)
-print('Test Data Eval:')
-do_eval(sess,
-        eval_correct,
-        images_placeholder,
-        labels_placeholder,
-        data_sets.test)
-```
-
-> Note that more complicated usage would usually sequester the `data_sets.test`
-> to only be checked after significant amounts of hyperparameter tuning.  For
-> the sake of a simple little MNIST problem, however, we evaluate against all of
-> the data.
-
-### Build the Eval Graph
-
-Before entering the training loop, the Eval op should have been built
-by calling the `evaluation()` function from `mnist.py` with the same
-logits/labels parameters as the `loss()` function.
-
-```python
-eval_correct = mnist.evaluation(logits, labels_placeholder)
-```
-
-The `evaluation()` function simply generates a @{tf.nn.in_top_k}
-op that can automatically score each model output as correct if the true label
-can be found in the K most-likely predictions.  In this case, we set the value
-of K to 1 to only consider a prediction correct if it is for the true label.
-
-```python
-eval_correct = tf.nn.in_top_k(logits, labels, 1)
-```
-
-### Eval Output
-
-One can then create a loop for filling a `feed_dict` and calling `sess.run()`
-against the `eval_correct` op to evaluate the model on the given dataset.
-
-```python
-for step in xrange(steps_per_epoch):
-    feed_dict = fill_feed_dict(data_set,
-                               images_placeholder,
-                               labels_placeholder)
-    true_count += sess.run(eval_correct, feed_dict=feed_dict)
-```
-
-The `true_count` variable simply accumulates all of the predictions that the
-`in_top_k` op has determined to be correct.  From there, the precision may be
-calculated from simply dividing by the total number of examples.
-
-```python
-precision = true_count / num_examples
-print('  Num examples: %d  Num correct: %d  Precision @ 1: %0.04f' %
-      (num_examples, true_count, precision))
-```
diff --git a/tensorflow/docs_src/get_started/mnist/pros.md b/tensorflow/docs_src/get_started/mnist/pros.md
deleted file mode 100644
index 4933dd28cd37e695a10ab28832f26a613589d01a..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/get_started/mnist/pros.md
+++ /dev/null
@@ -1,435 +0,0 @@
-# Deep MNIST for Experts
-
-TensorFlow is a powerful library for doing large-scale numerical computation.
-One of the tasks at which it excels is implementing and training deep neural
-networks.  In this tutorial we will learn the basic building blocks of a
-TensorFlow model while constructing a deep convolutional MNIST classifier.
-
-*This introduction assumes familiarity with neural networks and the MNIST
-dataset. If you don't have
-a background with them, check out the
-@{$beginners$introduction for beginners}. Be sure to
-@{$install$install TensorFlow} before starting.*
-
-
-## About this tutorial
-
-The first part of this tutorial explains what is happening in the
-[mnist_softmax.py](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/mnist_softmax.py)
-code, which is a basic implementation of a Tensorflow model.  The second part
-shows some ways to improve the accuracy.
-
-You can copy and paste each code snippet from this tutorial into a Python
-environment to follow along, or you can download the fully implemented deep net
-from [mnist_deep.py](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/mnist_deep.py)
-.
-
-What we will accomplish in this tutorial:
-
-- Create a softmax regression function that is a model for recognizing MNIST
-  digits, based on looking at every pixel in the image
-
-- Use Tensorflow to train the model to recognize digits by having it "look" at
-  thousands of examples (and run our first Tensorflow session to do so)
-
-- Check the model's accuracy with our test data
-
-- Build, train, and test a multilayer convolutional neural network to improve
-  the results
-
-## Setup
-
-Before we create our model, we will first load the MNIST dataset, and start a
-TensorFlow session.
-
-### Load MNIST Data
-
-If you are copying and pasting in the code from this tutorial, start here with
-these two lines of code which will download and read in the data automatically:
-
-```python
-from tensorflow.examples.tutorials.mnist import input_data
-mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
-```
-
-Here `mnist` is a lightweight class which stores the training, validation, and
-testing sets as NumPy arrays.  It also provides a function for iterating through
-data minibatches, which we will use below.
-
-### Start TensorFlow InteractiveSession
-
-TensorFlow relies on a highly efficient C++ backend to do its computation. The
-connection to this backend is called a session.  The common usage for TensorFlow
-programs is to first create a graph and then launch it in a session.
-
-Here we instead use the convenient `InteractiveSession` class, which makes
-TensorFlow more flexible about how you structure your code.  It allows you to
-interleave operations which build a
-@{$get_started/get_started#the_computational_graph$computation graph}
-with ones that run the graph.  This is particularly convenient when working in
-interactive contexts like IPython.  If you are not using an
-`InteractiveSession`, then you should build the entire computation graph before
-starting a session and
-@{$get_started/get_started#the_computational_graph$launching the graph}.
-
-```python
-import tensorflow as tf
-sess = tf.InteractiveSession()
-```
-
-#### Computation Graph
-
-To do efficient numerical computing in Python, we typically use libraries like
-[NumPy](http://www.numpy.org/) that do expensive operations such as matrix
-multiplication outside Python, using highly efficient code implemented in
-another language.  Unfortunately, there can still be a lot of overhead from
-switching back to Python every operation. This overhead is especially bad if you
-want to run computations on GPUs or in a distributed manner, where there can be
-a high cost to transferring data.
-
-TensorFlow also does its heavy lifting outside Python, but it takes things a
-step further to avoid this overhead.  Instead of running a single expensive
-operation independently from Python, TensorFlow lets us describe a graph of
-interacting operations that run entirely outside Python.  This approach is
-similar to that used in Theano or Torch.
-
-The role of the Python code is therefore to build this external computation
-graph, and to dictate which parts of the computation graph should be run. See
-the @{$get_started/get_started#the_computational_graph$Computation Graph}
-section of @{$get_started/get_started} for more detail.
-
-## Build a Softmax Regression Model
-
-In this section we will build a softmax regression model with a single linear
-layer. In the next section, we will extend this to the case of softmax
-regression with a multilayer convolutional network.
-
-### Placeholders
-
-We start building the computation graph by creating nodes for the
-input images and target output classes.
-
-```python
-x = tf.placeholder(tf.float32, shape=[None, 784])
-y_ = tf.placeholder(tf.float32, shape=[None, 10])
-```
-
-Here `x` and `y_` aren't specific values. Rather, they are each a `placeholder`
--- a value that we'll input when we ask TensorFlow to run a computation.
-
-The input images `x` will consist of a 2d tensor of floating point numbers.
-Here we assign it a `shape` of `[None, 784]`, where `784` is the dimensionality
-of a single flattened 28 by 28 pixel MNIST image, and `None` indicates that the
-first dimension, corresponding to the batch size, can be of any size.  The
-target output classes `y_` will also consist of a 2d tensor, where each row is a
-one-hot 10-dimensional vector indicating which digit class (zero through nine)
-the corresponding MNIST image belongs to.
-
-The `shape` argument to `placeholder` is optional, but it allows TensorFlow
-to automatically catch bugs stemming from inconsistent tensor shapes.
-
-### Variables
-
-We now define the weights `W` and biases `b` for our model. We could imagine
-treating these like additional inputs, but TensorFlow has an even better way to
-handle them: `Variable`.  A `Variable` is a value that lives in TensorFlow's
-computation graph.  It can be used and even modified by the computation. In
-machine learning applications, one generally has the model parameters be
-`Variable`s.
-
-```python
-W = tf.Variable(tf.zeros([784,10]))
-b = tf.Variable(tf.zeros([10]))
-```
-
-We pass the initial value for each parameter in the call to `tf.Variable`.  In
-this case, we initialize both `W` and `b` as tensors full of zeros. `W` is a
-784x10 matrix (because we have 784 input features and 10 outputs) and `b` is a
-10-dimensional vector (because we have 10 classes).
-
-Before `Variable`s can be used within a session, they must be initialized using
-that session.  This step takes the initial values (in this case tensors full of
-zeros) that have already been specified, and assigns them to each
-`Variable`. This can be done for all `Variables` at once:
-
-```python
-sess.run(tf.global_variables_initializer())
-```
-
-### Predicted Class and Loss Function
-
-We can now implement our regression model. It only takes one line!  We multiply
-the vectorized input images `x` by the weight matrix `W`, add the bias `b`.
-
-```python
-y = tf.matmul(x,W) + b
-```
-
-We can specify a loss function just as easily. Loss indicates how bad the
-model's prediction was on a single example; we try to minimize that while
-training across all the examples. Here, our loss function is the cross-entropy
-between the target and the softmax activation function applied to the model's
-prediction.  As in the beginners tutorial, we use the stable formulation:
-
-```python
-cross_entropy = tf.reduce_mean(
-    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
-```
-
-Note that `tf.nn.softmax_cross_entropy_with_logits` internally applies the
-softmax on the model's unnormalized model prediction and sums across all
-classes, and `tf.reduce_mean` takes the average over these sums.
-
-## Train the Model
-
-Now that we have defined our model and training loss function, it is
-straightforward to train using TensorFlow.  Because TensorFlow knows the entire
-computation graph, it can use automatic differentiation to find the gradients of
-the loss with respect to each of the variables.  TensorFlow has a variety of
-@{$python/train#optimizers$built-in optimization algorithms}.
-For this example, we will use steepest gradient descent, with a step length of
-0.5, to descend the cross entropy.
-
-```python
-train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
-```
-
-What TensorFlow actually did in that single line was to add new operations to
-the computation graph. These operations included ones to compute gradients,
-compute parameter update steps, and apply update steps to the parameters.
-
-The returned operation `train_step`, when run, will apply the gradient descent
-updates to the parameters. Training the model can therefore be accomplished by
-repeatedly running `train_step`.
-
-```python
-for _ in range(1000):
-  batch = mnist.train.next_batch(100)
-  train_step.run(feed_dict={x: batch[0], y_: batch[1]})
-```
-
-We load 100 training examples in each training iteration. We then run the
-`train_step` operation, using `feed_dict` to replace the `placeholder` tensors
-`x` and `y_` with the training examples.  Note that you can replace any tensor
-in your computation graph using `feed_dict` -- it's not restricted to just
-`placeholder`s.
-
-### Evaluate the Model
-
-How well did our model do?
-
-First we'll figure out where we predicted the correct label. `tf.argmax` is an
-extremely useful function which gives you the index of the highest entry in a
-tensor along some axis. For example, `tf.argmax(y,1)` is the label our model
-thinks is most likely for each input, while `tf.argmax(y_,1)` is the true
-label. We can use `tf.equal` to check if our prediction matches the truth.
-
-```python
-correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
-```
-
-That gives us a list of booleans. To determine what fraction are correct, we
-cast to floating point numbers and then take the mean. For example,
-`[True, False, True, True]` would become `[1,0,1,1]` which would become `0.75`.
-
-```python
-accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
-```
-
-Finally, we can evaluate our accuracy on the test data. This should be about
-92% correct.
-
-```python
-print(accuracy.eval(feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
-```
-
-## Build a Multilayer Convolutional Network
-
-Getting 92% accuracy on MNIST is bad. It's almost embarrassingly bad. In this
-section, we'll fix that, jumping from a very simple model to something
-moderately sophisticated: a small convolutional neural network. This will get us
-to around 99.2% accuracy -- not state of the art, but respectable.
-
-Here is a diagram, created with TensorBoard, of the model we will build:
-
-<div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img src="https://www.tensorflow.org/images/mnist_deep.png">
-</div>
-
-### Weight Initialization
-
-To create this model, we're going to need to create a lot of weights and biases.
-One should generally initialize weights with a small amount of noise for
-symmetry breaking, and to prevent 0 gradients. Since we're using
-[ReLU](https://en.wikipedia.org/wiki/Rectifier_(neural_networks)) neurons, it is
-also good practice to initialize them with a slightly positive initial bias to
-avoid "dead neurons". Instead of doing this repeatedly while we build the model,
-let's create two handy functions to do it for us.
-
-```python
-def weight_variable(shape):
-  initial = tf.truncated_normal(shape, stddev=0.1)
-  return tf.Variable(initial)
-
-def bias_variable(shape):
-  initial = tf.constant(0.1, shape=shape)
-  return tf.Variable(initial)
-```
-
-### Convolution and Pooling
-
-TensorFlow also gives us a lot of flexibility in convolution and pooling
-operations. How do we handle the boundaries? What is our stride size?
-In this example, we're always going to choose the vanilla version.
-Our convolutions uses a stride of one and are zero padded so that the
-output is the same size as the input. Our pooling is plain old max pooling
-over 2x2 blocks. To keep our code cleaner, let's also abstract those operations
-into functions.
-
-```python
-def conv2d(x, W):
-  return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
-
-def max_pool_2x2(x):
-  return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
-                        strides=[1, 2, 2, 1], padding='SAME')
-```
-
-### First Convolutional Layer
-
-We can now implement our first layer. It will consist of convolution, followed
-by max pooling. The convolution will compute 32 features for each 5x5 patch.
-Its weight tensor will have a shape of `[5, 5, 1, 32]`. The first two
-dimensions are the patch size, the next is the number of input channels, and
-the last is the number of output channels. We will also have a bias vector with
-a component for each output channel.
-
-```python
-W_conv1 = weight_variable([5, 5, 1, 32])
-b_conv1 = bias_variable([32])
-```
-
-To apply the layer, we first reshape `x` to a 4d tensor, with the second and
-third dimensions corresponding to image width and height, and the final
-dimension corresponding to the number of color channels.
-
-```python
-x_image = tf.reshape(x, [-1, 28, 28, 1])
-```
-
-We then convolve `x_image` with the weight tensor, add the
-bias, apply the ReLU function, and finally max pool. The `max_pool_2x2` method will
-reduce the image size to 14x14.
-
-```python
-h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
-h_pool1 = max_pool_2x2(h_conv1)
-```
-
-### Second Convolutional Layer
-
-In order to build a deep network, we stack several layers of this type. The
-second layer will have 64 features for each 5x5 patch.
-
-```python
-W_conv2 = weight_variable([5, 5, 32, 64])
-b_conv2 = bias_variable([64])
-
-h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
-h_pool2 = max_pool_2x2(h_conv2)
-```
-
-### Densely Connected Layer
-
-Now that the image size has been reduced to 7x7, we add a fully-connected layer
-with 1024 neurons to allow processing on the entire image. We reshape the tensor
-from the pooling layer into a batch of vectors,
-multiply by a weight matrix, add a bias, and apply a ReLU.
-
-```python
-W_fc1 = weight_variable([7 * 7 * 64, 1024])
-b_fc1 = bias_variable([1024])
-
-h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
-h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
-```
-
-#### Dropout
-
-To reduce overfitting, we will apply [dropout](
-https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf) before the readout layer.
-We create a `placeholder` for the probability that a neuron's output is kept
-during dropout. This allows us to turn dropout on during training, and turn it
-off during testing.
-TensorFlow's `tf.nn.dropout` op automatically handles scaling neuron outputs in
-addition to masking them, so dropout just works without any additional
-scaling.<sup id="a1">[1](#f1)</sup>
-
-```python
-keep_prob = tf.placeholder(tf.float32)
-h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
-```
-
-### Readout Layer
-
-Finally, we add a layer, just like for the one layer softmax regression
-above.
-
-```python
-W_fc2 = weight_variable([1024, 10])
-b_fc2 = bias_variable([10])
-
-y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
-```
-
-### Train and Evaluate the Model
-
-How well does this model do? To train and evaluate it we will use code that is
-nearly identical to that for the simple one layer SoftMax network above.
-
-The differences are that:
-
-- We will replace the steepest gradient descent optimizer with the more
-  sophisticated ADAM optimizer.
-
-- We will include the additional parameter `keep_prob` in `feed_dict` to control
-  the dropout rate.
-
-- We will add logging to every 100th iteration in the training process.
-
-We will also use tf.Session rather than tf.InteractiveSession. This better
-separates the process of creating the graph (model specification) and the
-process of evaluating the graph (model fitting). It generally makes for cleaner
-code. The tf.Session is created within a [`with` block](https://docs.python.org/3/whatsnew/2.6.html#pep-343-the-with-statement)
-so that it is automatically destroyed once the block is exited.
-
-Feel free to run this code. Be aware that it does 20,000 training iterations
-and may take a while (possibly up to half an hour), depending on your processor.
-
-```python
-cross_entropy = tf.reduce_mean(
-    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
-train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
-correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
-accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
-
-with tf.Session() as sess:
-  sess.run(tf.global_variables_initializer())
-  for i in range(20000):
-    batch = mnist.train.next_batch(50)
-    if i % 100 == 0:
-      train_accuracy = accuracy.eval(feed_dict={
-          x: batch[0], y_: batch[1], keep_prob: 1.0})
-      print('step %d, training accuracy %g' % (i, train_accuracy))
-    train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
-
-  print('test accuracy %g' % accuracy.eval(feed_dict={
-      x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))
-```
-
-The final test set accuracy after running this code should be approximately 99.2%.
-
-We have learned how to quickly and easily build, train, and evaluate a
-fairly sophisticated deep learning model using TensorFlow.
-
-<b id="f1">1</b>: For this small convolutional network, performance is actually nearly identical with and without dropout. Dropout is often very effective at reducing overfitting, but it is most useful when training very large neural networks. [↩](#a1)
diff --git a/tensorflow/docs_src/get_started/premade_estimators.md b/tensorflow/docs_src/get_started/premade_estimators.md
index ff839fd040167dc16087311666ff25da2088c519..6bffd2e065548a42eb726df34542ecc7480ad38d 100644
--- a/tensorflow/docs_src/get_started/premade_estimators.md
+++ b/tensorflow/docs_src/get_started/premade_estimators.md
@@ -2,33 +2,39 @@
 # Getting Started with TensorFlow
 
 This document introduces the TensorFlow programming environment and shows you
-how to write the Iris classification problem in TensorFlow.
+how to solve the Iris classification problem in TensorFlow.
 
-Prior to reading this document, do the following:
+## Prerequisites
 
-* [Install TensorFlow](install/index.md).
+Prior to using the sample code in this document, you'll need to do the
+following:
+
+* @{$install$Install TensorFlow}.
 * If you installed TensorFlow with virtualenv or Anaconda, activate your
   TensorFlow environment.
-* To keep the data import simple, our Iris example uses Pandas. You can
-  install Pandas with:
+* Install or upgrade pandas by issuing the following command:
 
-      `pip install pandas`
+        pip install pandas
 
 ## Getting the sample code
 
-Take the following steps to get the sample code for this program:
+Take the following steps to get the sample code we'll be going through:
 
-1. Clone the TensorFlow Models repository from github by entering the following
+1. Clone the TensorFlow Models repository from GitHub by entering the following
    command:
 
-       `git clone https://github.com/tensorflow/models`
+        git clone https://github.com/tensorflow/models
 
 1. Change directory within that branch to the location containing the examples
    used in this document:
 
-       `cd models/samples/core/get_started/`
+        cd models/samples/core/get_started/
 
-The program described in this document is called `premade_estimator.py`.
+The program described in this document is
+[`premade_estimator.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py).
+This program uses
+[`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py)
+to fetch its training data.
 
 ### Running the program
 
@@ -38,15 +44,15 @@ You run TensorFlow programs as you would run any Python program. For example:
 python premade_estimator.py
 ```
 
-The program should output training logs and some predictions against a test
-set. For example, the first line in the following output shows that the model
-thinks there is a 99.6% chance that the first example in the test set is a
-Sentosa. Since the test set `expected "Setosa"`, this appears to be a good
-prediction.
+The program should output training logs followed by some predictions against
+the test set. For example, the first line in the following output shows that
+the model thinks there is a 99.6% chance that the first example in the test
+set is a Setosa. Since the test set expected Setosa, this appears to be
+a good prediction.
 
 ``` None
 ...
-Prediction is "Sentosa" (99.6%), expected "Setosa"
+Prediction is "Setosa" (99.6%), expected "Setosa"
 
 Prediction is "Versicolor" (99.8%), expected "Versicolor"
 
@@ -57,9 +63,9 @@ If the program generates errors instead of answers, ask yourself the following
 questions:
 
 * Did you install TensorFlow properly?
-* Are you using the correct version of tensorflow?
+* Are you using the correct version of TensorFlow?
 * Did you activate the environment you installed TensorFlow in? (This is
-  only relevant in certain installation environments.)
+  only relevant in certain installation mechanisms.)
 
 ## The programming stack
 
@@ -67,21 +73,18 @@ Before getting into the details of the program itself, let's investigate the
 programming environment. As the following illustration shows, TensorFlow
 provides a programming stack consisting of multiple API layers:
 
-<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="../images/tensorflow_programming_environment.png">
 </div>
-<div style="text-align: center">
-The TensorFlow Programming Environment
-</div>
 
 We strongly recommend writing TensorFlow programs with the following APIs:
 
-* Estimators, which represent a complete model. The Estimator API provides
-  methods to train the model, to judge the model's accuracy, and to generate
-  predictions.
-* Datasets, which build a data input pipeline. The Dataset API has methods to
-  load and manipulate data, and feed it into your model. The Datasets API meshes
-  well with the Estimators API.
+* @{$programmers_guide/estimators$Estimators}, which represent a complete model.
+  The Estimator API provides methods to train the model, to judge the model's
+  accuracy, and to generate predictions.
+* @{$get_started/datasets_quickstart$Datasets}, which build a data input
+  pipeline. The Dataset API has methods to load and manipulate data, and feed
+  it into your model. The Dataset API meshes well with the Estimators API.
 
 ## Classifying irises: an overview
 
@@ -95,6 +98,7 @@ classifies Iris flowers into three different species based on the size of their
   alt="Petal geometry compared for three iris species: Iris setosa, Iris virginica, and Iris versicolor"
   src="../images/iris_three_species.jpg">
 </div>
+
 **From left to right,
 [*Iris setosa*](https://commons.wikimedia.org/w/index.php?curid=170298) (by
 [Radomil](https://commons.wikimedia.org/wiki/User:Radomil), CC BY-SA 3.0),
@@ -106,15 +110,17 @@ and [*Iris virginica*](https://www.flickr.com/photos/33397993@N05/3352169862)
 
 ### The data set
 
-The Iris data set contains four features and one label.  The four features
-identify the following botanical characteristics of individual Iris flowers:
+The Iris data set contains four features and one
+[label](https://developers.google.com/machine-learning/glossary/#label).
+The four features identify the following botanical characteristics of
+individual Iris flowers:
 
 * sepal length
 * sepal width
 * petal length
 * petal width
 
-Our model will represent these features as float32 numerical data.
+Our model will represent these features as `float32` numerical data.
 
 The label identifies the Iris species, which must be one of the following:
 
@@ -128,7 +134,7 @@ The following table shows three examples in the data set:
 
 |sepal length | sepal width | petal length | petal width| species (label) |
 |------------:|------------:|-------------:|-----------:|:---------------:|
-|         5.1 |         3.3 |          1.7 |        0.5 |   0 (Sentosa)   |
+|         5.1 |         3.3 |          1.7 |        0.5 |   0 (Setosa)   |
 |         5.0 |         2.3 |          3.3 |        1.0 |   1 (versicolor)|
 |         6.4 |         2.8 |          5.6 |        2.2 |   2 (virginica) |
 
@@ -143,14 +149,10 @@ topology:
 The following figure illustrates the features, hidden layers, and predictions
 (not all of the nodes in the hidden layers are shown):
 
-
 <div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%"
   alt="A diagram of the network architecture: Inputs, 2 hidden layers, and outputs"
-  src="../images/iris_model.png">
-</div>
-<div style="text-align: center">
-The Model.
+  src="../images/custom_estimators/full_network.png">
 </div>
 
 ### Inference
@@ -169,12 +171,12 @@ example is an Iris Versicolor.
 
 ## Overview of programming with Estimators
 
-An Estimator is TensorFlow's high level representation of a complete model. It
+An Estimator is TensorFlow's high-level representation of a complete model. It
 handles the details of initialization, logging, saving and restoring, and many
 other features so you can concentrate on your model. For more details see
 @{$programmers_guide/estimators}.
 
-An "Estimator" is any class derived from @{tf.estimator.Estimator}. TensorFlow
+An Estimator is any class derived from @{tf.estimator.Estimator}. TensorFlow
 provides a collection of
 [pre-made Estimators](https://developers.google.com/machine-learning/glossary/#pre-made_Estimator)
 (for example, `LinearRegressor`) to implement common ML algorithms. Beyond
@@ -194,22 +196,24 @@ following tasks:
 * Call one or more methods on the Estimator object, passing the appropriate
   input function as the source of the data.
 
-Let's see how those tasks are implemented in Iris.
+Let's see how those tasks are implemented for Iris classification.
 
 ## Create input functions
 
 You must create input functions to supply data for training,
 evaluating, and prediction.
 
-An **input function** is a function that returns the following two-element
-tuple:
+An **input function** is a function that returns a @{tf.data.Dataset} object
+which outputs the following two-element tuple:
 
-* "features" - A Python dictionary in which:
+* [`features`](https://developers.google.com/machine-learning/glossary/#feature) - A Python dictionary in which:
     * Each key is the name of a feature.
     * Each value is an array containing all of that feature's values.
-* "label" - An array containing the values of the label for every example.
+* `label` - An array containing the values of the
+  [label](https://developers.google.com/machine-learning/glossary/#label) for
+  every example.
 
-Just to demonstrate the format of the input function here's a simple
+Just to demonstrate the format of the input function, here's a simple
 implementation:
 
 ```python
@@ -222,10 +226,10 @@ def input_evaluation_set():
     return features, labels
 ```
 
-Your input function may generate the "features" dictionary and "label" list any
+Your input function may generate the `features` dictionary and `label` list any
 way you like. However, we recommend using TensorFlow's Dataset API, which can
-deftly parse all sorts of data. At a high-level, the Datasets API consists of
-the following classes:
+parse all sorts of data. At a high level, the Dataset API consists of the
+following classes:
 
 <div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%"
@@ -233,51 +237,50 @@ the following classes:
   src="../images/dataset_classes.png">
 </div>
 
+Where the individual members are:
 
-Where:
-
-* Dataset: Base class containing methods to create and transform datasets. Also
-  allows you to initialize a dataset from data in memory, or from a Python
-  generator.
-* TextLineDataset: Reads lines from text files.
-* TFRecordDataset: Reads records from TFRecord files.
-* FixedLengthRecordDataset: Reads fixed size records from binary files.
-* Iterator: Provides a way to access one data set element at a time.
+* `Dataset` - Base class containing methods to create and transform
+  datasets. Also allows you to initialize a dataset from data in memory, or from
+  a Python generator.
+* `TextLineDataset` - Reads lines from text files.
+* `TFRecordDataset` - Reads records from TFRecord files.
+* `FixedLengthRecordDataset` - Reads fixed size records from binary files.
+* `Iterator` - Provides a way to access one data set element at a time.
 
 The Dataset API can handle a lot of common cases for you. For example,
 using the Dataset API, you can easily read in records from a large collection
 of files in parallel and join them into a single stream.
 
-To keep things simple in this example we are going to load the data with pandas, and build our input pipeline from this in-memory data.
+To keep things simple in this example we are going to load the data with
+[pandas](https://pandas.pydata.org/), and build our input pipeline from this
+in-memory data.
 
-Here is the input function used for training in this program:
+Here is the input function used for training in this program, which is available
+in [`iris_data.py`](https://github.com/tensorflow/models/blob/master/samples/core/get_started/iris_data.py):
 
 ``` python
 def train_input_fn(features, labels, batch_size):
     """An input function for training"""
     # Convert the inputs to a Dataset.
-    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
+    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
 
     # Shuffle, repeat, and batch the examples.
-    dataset = dataset.shuffle(1000).repeat().batch(batch_size)
-
-    # Build the Iterator, and return the read end of the pipeline.
-    return dataset.make_one_shot_iterator().get_next()
+    return dataset.shuffle(1000).repeat().batch(batch_size)
 ```
 
-## Define the Feature Columns
+## Define the feature columns
 
-A [**Feature Column**](https://developers.google.com/machine-learning/glossary/#feature_columns)
-is an object describing how the model should use raw input features from the
+A [**feature column**](https://developers.google.com/machine-learning/glossary/#feature_columns)
+is an object describing how the model should use raw input data from the
 features dictionary. When you build an Estimator model, you pass it a list of
 feature columns that describes each of the features you want the model to use.
-
-These objects are created by functions in the @{tf.feature_column} module. `tf.feature_column` methods provide many different ways to represent data.
+The @{tf.feature_column} module provides many options for representing data
+to the model.
 
 For Iris, the 4 raw features are numeric values, so we'll build a list of
-feature columns, to tell the Estimator model to represent each of the four
+feature columns to tell the Estimator model to represent each of the four
 features as 32-bit floating-point values. Therefore, the code to create the
-Feature Column is simply:
+feature column is:
 
 ```python
 # Feature columns describe how to use the input.
@@ -286,29 +289,29 @@ for key in train_x.keys():
     my_feature_columns.append(tf.feature_column.numeric_column(key=key))
 ```
 
-Feature Columns can be far more sophisticated than those we're showing here.
-<!--TODO(markdaoust) add link to feature_columns doc when it exists.-->
+Feature columns can be far more sophisticated than those we're showing here.  We
+detail feature columns @{$get_started/feature_columns$later on} in our Getting
+Started guide.
 
 Now that we have the description of how we want the model to represent the raw
 features, we can build the estimator.
 
 
-## Instantiate an Estimator
+## Instantiate an estimator
 
-The Iris problem is a classic classifier problem. Fortunately, TensorFlow
+The Iris problem is a classic classification problem. Fortunately, TensorFlow
 provides several pre-made classifier Estimators, including:
 
-* @{tf.estimator.DNNClassifier}—for deep models that perform multi-class
+* @{tf.estimator.DNNClassifier} for deep models that perform multi-class
   classification.
-* @{tf.estimator.DNNLinearCombinedClassifier}—for wide-n-deep models.
-* @{tf.estimator.LinearClassifier}—for linear models that feed results into
-  binary classifiers.
+* @{tf.estimator.DNNLinearCombinedClassifier} for wide & deep models.
+* @{tf.estimator.LinearClassifier} for classifiers based on linear models.
 
 For the Iris problem, `tf.estimator.DNNClassifier` seems like the best choice.
 Here's how we instantiated this Estimator:
 
 ```python
-# Build 2 hidden layer DNN with 10, 10 units respectively.
+# Build a DNN with 2 hidden layers and 10 nodes in each hidden layer.
 classifier = tf.estimator.DNNClassifier(
     feature_columns=my_feature_columns,
     # Two hidden layers of 10 nodes each.
@@ -332,14 +335,15 @@ Train the model by calling the Estimator's `train` method as follows:
 ```python
 # Train the Model.
 classifier.train(
-    input_fn=lambda:train_input_fn(train_x, train_y, args.batch_size),
+    input_fn=lambda:iris_data.train_input_fn(train_x, train_y, args.batch_size),
     steps=args.train_steps)
 ```
 
-Here we wrap up our `input_fn` call in a [`lambda`](https://docs.python.org/3/tutorial/controlflow.html)
-to allow the Estimator to call it, at the correct time, with no arguments.
-The `steps` argument tells the method to stop training after a number of
-training steps.
+Here we wrap up our `input_fn` call in a
+[`lambda`](https://docs.python.org/3/tutorial/controlflow.html)
+to capture the arguments while providing an input function that takes no
+arguments, as expected by the Estimator. The `steps` argument tells the method
+to stop training after a number of training steps.
 
 ### Evaluate the trained model
 
@@ -350,14 +354,14 @@ model on the test data:
 ```python
 # Evaluate the model.
 eval_result = classifier.evaluate(
-    input_fn=lambda:eval_input_fn(test_x, test_y, args.batch_size))
+    input_fn=lambda:iris_data.eval_input_fn(test_x, test_y, args.batch_size))
 
 print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))
 ```
 
-Note how unlike our call to the `train` method, we did not pass the `steps`
-argument to evaluate. Our `eval_input_fn` doesn't use the `repeat` method on
-the dataset, so evaluation just runs to the end of the data.
+Unlike our call to the `train` method, we did not pass the `steps`
+argument to evaluate. Our `eval_input_fn` only yields a single
+[epoch](https://developers.google.com/machine-learning/glossary/#epoch) of data.
 
 Running this code yields the following output (or something similar):
 
@@ -369,7 +373,7 @@ Test set accuracy: 0.967
 
 We now have a trained model that produces good evaluation results.
 We can now use the trained model to predict the species of an Iris flower
-based on some unlabeled measurments. As with training and evaluation, we make
+based on some unlabeled measurements. As with training and evaluation, we make
 predictions using a single function call:
 
 ```python
@@ -383,7 +387,8 @@ predict_x = {
 }
 
 predictions = classifier.predict(
-    input_fn=lambda:eval_input_fn(predict_x, batch_size=args.batch_size))
+    input_fn=lambda:iris_data.eval_input_fn(predict_x,
+                                            batch_size=args.batch_size))
 ```
 
 The `predict` method returns a Python iterable, yielding a dictionary of
@@ -397,29 +402,35 @@ for pred_dict, expec in zip(predictions, expected):
 
     class_id = pred_dict['class_ids'][0]
     probability = pred_dict['probabilities'][class_id]
-    print(template.format(SPECIES[class_id], 100 * probability, expec))
+
+    print(template.format(iris_data.SPECIES[class_id],
+                          100 * probability, expec))
 ```
 
 Running the preceding code yields the following output:
 
 ``` None
 ...
-Prediction is "Sentosa" (99.6%), expected "Setosa"
+Prediction is "Setosa" (99.6%), expected "Setosa"
 
 Prediction is "Versicolor" (99.8%), expected "Versicolor"
 
 Prediction is "Virginica" (97.9%), expected "Virginica"
 ```
 
-## Next
 
-Now that you've gotten started writing TensorFlow programs.
+## Summary
+
+Pre-made Estimators are an effective way to quickly create standard models.
+
+Now that you've gotten started writing TensorFlow programs, consider the
+following material:
 
-* For more on Datasets, see the
-  @{$programmers_guide/datasets$Programmer's guide} and
-  @{tf.data$reference documentation}.
-* For more on Estimators, see the
-  @{$programmers_guide/estimators$Programmer's guide} and
-  @{tf.estimator$reference documentation}.
-<!--TODO(markdaoust) add links to next get_started section when it exists.-->
+* @{$get_started/checkpoints$Checkpoints} to learn how to save and restore
+  models.
+* @{$get_started/datasets_quickstart$Datasets} to learn more about importing
+  data into your
+  model.
+* @{$get_started/custom_estimators$Creating Custom Estimators} to learn how to
+  write your own Estimator, customized for a particular problem.
 
diff --git a/tensorflow/docs_src/install/index.md b/tensorflow/docs_src/install/index.md
index c4fc882ddd43eed8fd1c8562f6ac89a7dd68535d..3c8488643f071c147dfbc4e0b4b4760b0a817718 100644
--- a/tensorflow/docs_src/install/index.md
+++ b/tensorflow/docs_src/install/index.md
@@ -4,7 +4,7 @@ We've built and tested TensorFlow on the following 64-bit laptop/desktop
 operating systems:
 
   * MacOS X 10.11 (El Capitan) or later.
-  * Ubuntu 14.04 or later
+  * Ubuntu 16.04 or later
   * Windows 7 or later.
 
 Although you might be able to install TensorFlow on other laptop or desktop
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index df622c6ac57907122e4d236e3623d947dc35ac58..a783205b4a2d24182de6496e0173635990120185 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.4.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.6.0-rc0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index 8b3da49a0d4bca1b2bc2293520e0b946a7727c88..5249e04615b506186a12807bb71ec4079db8156c 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.4.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.6.0-rc0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index 6eb81582491899c9c278c41fb39ae21d7fc3f4a9..0c6c773e62483b2272cf3b80da0932b4b800bb71 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -17,7 +17,7 @@ instructions might also work on other variants, we have only tested
 (and we only support) these instructions on machines meeting the
 following requirements:
 
-  * Ubuntu 14.04 or higher; 64-bit, x86
+  * Ubuntu 16.04 or higher; 64-bit, x86
   * macOS X 10.11 (El Capitan) or higher
   * Windows 7 or higher; 64-bit, x86
 
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.4.0</version>
+  <version>1.6.0-rc0</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.4.0</version>
+                 <version>1.6.0-rc0</version>
                </dependency>
              </dependencies>
          </project>
@@ -113,6 +113,29 @@ Maven projects. If not, check
 [Stack Overflow](http://stackoverflow.com/questions/tagged/tensorflow)
 for possible solutions.  You can skip reading the rest of this document.
 
+### GPU support
+
+If your Linux system has an NVIDIA® GPU and your TensorFlow Java program
+requires GPU acceleration, then add the following to the project's `pom.xml`
+instead:
+
+```xml
+<dependency>
+  <groupId>org.tensorflow</groupId>
+  <artifactId>libtensorflow</artifactId>
+  <version>1.6.0-rc0</version>
+</dependency>
+<dependency>
+  <groupId>org.tensorflow</groupId>
+  <artifactId>libtensorflow_jni_gpu</artifactId>
+  <version>1.6.0-rc0</version>
+</dependency>
+```
+
+GPU acceleration is available via Maven only for Linux and only if your system
+meets the
+@{$install_linux#determine_which_tensorflow_to_install$requirements for GPU}.
+
 ## Using TensorFlow with JDK
 
 This section describes how to use TensorFlow using the `java` and `javac`
@@ -124,7 +147,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.4.0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0-rc0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -143,7 +166,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.4.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.6.0-rc0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -151,10 +174,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.4.0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.6.0-rc0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.4.0.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.6.0-rc0.zip).
   3. Extract this .zip file.
 
 
@@ -202,7 +225,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.4.0.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.6.0-rc0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -216,11 +239,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.4.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.6.0-rc0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.4.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.6.0-rc0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 28b04bab9561a050aee2acb4bb8b472a86c12b95..105b225177315db07b1117c3ece4b77dd2b60cb2 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -6,7 +6,7 @@ tested (and we only support) these instructions on machines meeting the
 following requirements:
 
   * 64-bit desktops or laptops
-  * Ubuntu 14.04 or higher
+  * Ubuntu 16.04 or higher
 
 
 ## Determine which TensorFlow to install
@@ -31,13 +31,13 @@ If you are installing TensorFlow with GPU support using one of the
 mechanisms described in this guide, then the following NVIDIA software
 must be installed on your system:
 
-  * CUDA® Toolkit 8.0. For details, see
+  * CUDA® Toolkit 9.0. For details, see
     [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/#axzz4VZnqTJ2A).
     Ensure that you append the relevant Cuda pathnames to the
     `LD_LIBRARY_PATH` environment variable as described in the
     NVIDIA documentation.
-  * The NVIDIA drivers associated with CUDA Toolkit 8.0.
-  * cuDNN v6.0. For details, see
+  * The NVIDIA drivers associated with CUDA Toolkit 9.0.
+  * cuDNN v7.0. For details, see
     [NVIDIA's documentation](https://developer.nvidia.com/cudnn).
     Ensure that you create the `CUDA_HOME` environment variable as
     described in the NVIDIA documentation.
@@ -188,7 +188,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      Virtualenv environment:
 
      <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -293,7 +293,7 @@ take the following steps:
 
      <pre>
      $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp34-cp34m-linux_x86_64.whl</b>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -480,7 +480,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -531,7 +531,7 @@ TensorFlow programs:
 
 <pre>Hello, TensorFlow!</pre>
 
-If you are new to TensorFlow, see @{$get_started/get_started$Getting Started with TensorFlow}.
+If you are new to TensorFlow, see @{$get_started/premade_estimators$Getting Started with TensorFlow}.
 
 If the system outputs an error message instead of a greeting, see [Common
 installation problems](#common_installation_problems).
@@ -648,14 +648,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -667,14 +667,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -686,14 +686,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -705,57 +705,16 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.6.0rc0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.6.0rc0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 Note that GPU support requires the NVIDIA hardware and software described in
 [NVIDIA requirements to run TensorFlow with GPU support](#NVIDIARequirements).
-
-<a name="Protobuf31"></a>
-## Protobuf pip package 3.1
-
-You can skip this section unless you are seeing problems related
-to the protobuf pip package.
-
-**NOTE:** If your TensorFlow programs are running slowly, you might
-have a problem related to the protobuf pip package.
-
-The TensorFlow pip package depends on protobuf pip package version 3.1. The
-protobuf pip package downloaded from PyPI (when invoking
-<tt>pip install protobuf</tt>) is a Python-only library containing
-Python implementations of proto serialization/deserialization that can run
-**10x-50x slower** than the C++ implementation. Protobuf also supports a
-binary extension for the Python package that contains fast
-C++ based proto parsing.  This extension is not available in the
-standard Python-only pip package.  We have created a custom binary
-pip package for protobuf that contains the binary extension. To install
-the custom binary protobuf pip package, invoke one of the following commands:
-
-  * for Python 2.7:
-
-  <pre>
-  $ <b>pip install --upgrade \
-  https://storage.googleapis.com/tensorflow/linux/cpu/protobuf-3.1.0-cp27-none-linux_x86_64.whl</b></pre>
-
-  * for Python 3.5:
-
-  <pre>
-  $ <b>pip3 install --upgrade \
-  https://storage.googleapis.com/tensorflow/linux/cpu/protobuf-3.1.0-cp35-none-linux_x86_64.whl</b></pre>
-
-Installing this protobuf package will overwrite the existing protobuf package.
-Note that the binary pip package already has support for protobufs
-larger than 64MB, which should fix errors such as these:
-
-<pre>[libprotobuf ERROR google/protobuf/src/google/protobuf/io/coded_stream.cc:207]
-A protocol message was rejected because it was too big (more than 67108864 bytes).
-To increase the limit (or to disable these warnings), see
-CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.</pre>
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index 79b383817b4865dab20232b453d522c2613f9e9d..a6ea548cfbdb3070c19b5c19ebc903ca76a4656a 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -79,22 +79,23 @@ Take the following steps to install TensorFlow with Virtualenv:
   4. Activate the Virtualenv environment by issuing one of the
      following commands:
 
-     <pre>$ <b>source ~/tensorflow/bin/activate</b>      # If using bash, sh, ksh, or zsh
-    $ <b>source ~/tensorflow/bin/activate.csh</b>  # If using csh or tcsh </pre>
+     <pre>$ <b>cd <i>targetDirectory</i></b>
+    $ <b>source ./bin/activate</b>      # If using bash, sh, ksh, or zsh
+    $ <b>source ./bin/activate.csh</b>  # If using csh or tcsh </pre>
 
      The preceding `source` command should change your prompt to the following:
 
-     <pre> (tensorflow)$ </pre>
+     <pre> (<i>targetDirectory</i>)$ </pre>
 
   5. Ensure pip ≥8.1 is installed:
 
-     <pre> (tensorflow)$ <b>easy_install -U pip</b></pre>
+     <pre> (<i>targetDirectory</i>)$ <b>easy_install -U pip</b></pre>
 
   6. Issue one of the following commands to install TensorFlow and all the
      packages that TensorFlow requires into the active Virtualenv environment:
 
-     <pre> (tensorflow)$ <b>pip install --upgrade tensorflow</b>      # for Python 2.7
-     (tensorflow)$ <b>pip3 install --upgrade tensorflow</b>     # for Python 3.n
+     <pre> (<i>targetDirectory</i>)$ <b>pip install --upgrade tensorflow</b>      # for Python 2.7
+     (<i>targetDirectory</i>)$ <b>pip3 install --upgrade tensorflow</b>     # for Python 3.n
 
   7. Optional. If Step 6 failed (typically because you invoked a pip version
      lower than 8.1), install TensorFlow in the active
@@ -114,7 +115,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -128,16 +129,18 @@ to confirm that the installation worked properly.
 
 Note that you must activate the Virtualenv environment each time you
 use TensorFlow in a new shell.  If the Virtualenv environment is not
-currently active (that is, the prompt is not `(tensorflow)`, invoke
+currently active (that is, the prompt is not `(<i>targetDirectory</i>)`, invoke
 one of the following commands:
 
-<pre>$ <b>source ~/tensorflow/bin/activate</b>      # bash, sh, ksh, or zsh
-$ <b>source ~/tensorflow/bin/activate.csh</b>  # csh or tcsh </pre>
+<pre>$ <b>cd <i>targetDirectory</i></b>
+$ <b>source ./bin/activate</b>      # If using bash, sh, ksh, or zsh
+$ <b>source ./bin/activate.csh</b>  # If using csh or tcsh </pre>
+
 
 Your prompt will transform to the following to indicate that your
 tensorflow environment is active:
 
-<pre> (tensorflow)$ </pre>
+<pre> (<i>targetDirectory</i>)$ </pre>
 
 When the Virtualenv environment is active, you may run
 TensorFlow programs from this shell.
@@ -145,7 +148,7 @@ TensorFlow programs from this shell.
 When you are done using TensorFlow, you may deactivate the
 environment by issuing the following command:
 
-<pre> (tensorflow)$ <b>deactivate</b> </pre>
+<pre> (<i>targetDirectory</i>)$ <b>deactivate</b> </pre>
 
 The prompt will revert back to your default prompt (as defined by `PS1`).
 
@@ -235,7 +238,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py2-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -331,20 +334,20 @@ Take the following steps to install TensorFlow in an Anaconda environment:
   3. Activate the conda environment by issuing the following command:
 
      <pre>$ <b>source activate tensorflow</b>
-     (tensorflow)$  # Your prompt should change</pre>
+     (<i>targetDirectory</i>)$  # Your prompt should change</pre>
 
   4. Issue a command of the following format to install
      TensorFlow inside your conda environment:
 
-     <pre>(tensorflow)<b>$ pip install --ignore-installed --upgrade</b> <i>TF_PYTHON_URL</i></pre>
+     <pre>(<i>targetDirectory</i>)<b>$ pip install --ignore-installed --upgrade</b> <i>TF_PYTHON_URL</i></pre>
 
      where <i>TF_PYTHON_URL</i> is the
      [URL of the TensorFlow Python package](#the_url_of_the_tensorflow_python_package).
      For example, the following command installs the CPU-only version of
      TensorFlow for Python 2.7:
 
-     <pre> (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py2-none-any.whl</b></pre>
+     <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -395,7 +398,7 @@ writing TensorFlow programs:
 <pre>Hello, TensorFlow!</pre>
 
 If you are new to TensorFlow, see
-@{$get_started/get_started$Getting Started with TensorFlow}.
+@{$get_started/premade_estimators$Getting Started with TensorFlow}.
 
 If the system outputs an error message instead of a greeting, see
 [Common installation problems](#common_installation_problems).
@@ -517,7 +520,7 @@ This section documents the relevant values for Mac OS installations.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py2-none-any.whl
 </pre>
 
 
@@ -525,46 +528,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py2-none-any.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.6.0rc0-py3-none-any.whl
 </pre>
-
-
-
-<a name="Protobuf31"></a>
-## Protobuf pip package 3.1
-
-You can skip this section unless you are seeing problems related
-to the protobuf pip package.
-
-**NOTE:** If your TensorFlow programs are running slowly, you might
-have a problem related to the protobuf pip package.
-
-The TensorFlow pip package depends on protobuf pip package version 3.1. The
-protobuf pip package downloaded from PyPI (when invoking
-<tt>pip install protobuf</tt>) is a Python-only library containing
-Python implementations of proto serialization/deserialization that can run
-**10x-50x slower** than the C++ implementation. Protobuf also supports a
-binary extension for the Python package that contains fast
-C++ based proto parsing.  This extension is not available in the
-standard Python-only pip package.  We have created a custom binary
-pip package for protobuf that contains the binary extension. To install
-the custom binary protobuf pip package, invoke one of the following commands:
-
-  * for Python 2.7:
-
-    <pre>$ <b>pip install --upgrade \
-    https://storage.googleapis.com/tensorflow/mac/cpu/protobuf-3.1.0-cp27-none-macosx_10_11_x86_64.whl</b></pre>
-
-  * for Python 3.n:
-
-    <pre>$ <b>pip3 install --upgrade \
-    https://storage.googleapis.com/tensorflow/mac/cpu/protobuf-3.1.0-cp35-none-macosx_10_11_x86_64.whl</b></pre>
-
-Installing this protobuf package will overwrite the existing protobuf package.
-Note that the binary pip package already has support for protobufs
-larger than 64MB, which should fix errors such as these:
-
-<pre>[libprotobuf ERROR google/protobuf/src/google/protobuf/io/coded_stream.cc:207]
-A protocol message was rejected because it was too big (more than 67108864 bytes).
-To increase the limit (or to disable these warnings), see
-CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.</pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index c01aa907a37cca3e1ef976ddd64ab2d50a6f5d33..7853ec11f59632537ed1f9ebd3bc8f999dd088c7 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -25,8 +25,10 @@ like to try to build TensorFlow on Windows anyway, use either of the
 following:
 
 *   [Bazel on Windows](https://bazel.build/versions/master/docs/windows.html)
-*   [TensorFlow CMake build](https://github.com/tensorflow/tensorflow/tree/r0.12/tensorflow/contrib/cmake)
+*   [TensorFlow CMake build](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/cmake)
 
+Note: Starting from 1.6 release, our prebuilt binaries will use AVX
+instructions. Older CPUs may not be able to execute these binaries.
 
 ## Determine which TensorFlow to install
 
@@ -131,7 +133,7 @@ The following NVIDIA <i>hardware</i> must be installed on your system:
 
 The following NVIDIA <i>software</i> must be installed on your system:
 
-  * NVIDIA's Cuda Toolkit (>= 7.0). We recommend version 8.0.
+  * NVIDIA's Cuda Toolkit (>= 7.0). We recommend version 9.0.
     For details, see
     [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/#axzz4VZnqTJ2A).
     Ensure that you append the relevant Cuda pathnames to the
@@ -180,7 +182,7 @@ If bazel is not installed on your system, install it now by following
 
 ### Install python dependencies
 
-To install TensorFlow, you must install the following packages:
+To build TensorFlow, you must install the following packages:
 
   * six
   * numpy, which is a numerical processing package that TensorFlow requires.
@@ -196,7 +198,11 @@ After installing pip, invoke the following commands:
 
 <pre> $ <b>sudo pip install six numpy wheel</b> </pre>
 
-
+Note: These are just the minimum requirements to _build_ tensorflow. Installing
+the pip package will download additional packages required to _run_ it. If you
+plan on executing tasks directly with `bazel` , without the pip installation,
+you may need to install additional python packages. For example, you should
+`pip install mock enum34` before running TensorFlow's tests with bazel.
 
 ### Optional: install TensorFlow for GPU prerequisites
 
@@ -215,7 +221,7 @@ problem, do either of the following:
   * Download Xcode 7.2 and select it as your default by issuing the following
     command:
 
-    <pre> $ <b>sudo xcode-select -s /Application/Xcode-7.2/Xcode.app</b></pre>
+    <pre> $ <b>sudo xcode-select -s /Applications/Xcode-7.2/Xcode.app</b></pre>
 
 **NOTE:** Your system must fulfill the NVIDIA software requirements described
 in one of the following documents:
@@ -266,8 +272,6 @@ Found possible Python library paths:
 Please input the desired Python library path to use.  Default is [/usr/lib/python2.7/dist-packages]
 
 Using python library path: /usr/local/lib/python2.7/dist-packages
-Do you wish to build TensorFlow with MKL support? [y/N]
-No MKL support will be enabled for TensorFlow
 Please specify optimization flags to use during compilation when bazel option "--config=opt" is specified [Default is -march=native]:
 Do you wish to use jemalloc as the malloc implementation? [Y/n]
 jemalloc enabled
@@ -285,11 +289,11 @@ Do you wish to build TensorFlow with CUDA support? [y/N] <b>Y</b>
 CUDA support will be enabled for TensorFlow
 Do you want to use clang as CUDA compiler? [y/N]
 nvcc will be used as CUDA compiler
-Please specify the Cuda SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 8.0]: <b>8.0</b>
-Please specify the location where CUDA 8.0 toolkit is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
+Please specify the Cuda SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 9.0]: <b>9.0</b>
+Please specify the location where CUDA 9.0 toolkit is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
 Please specify which gcc should be used by nvcc as the host compiler. [Default is /usr/bin/gcc]:
-Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 6.0]: <b>6</b>
-Please specify the location where cuDNN 6 library is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
+Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 7.0]: <b>7</b>
+Please specify the location where cuDNN 7 library is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
 Please specify a list of comma-separated Cuda compute capabilities you want to build with.
 You can find the compute capability of your device at: https://developer.nvidia.com/cuda-gpus.
 Please note that each additional compute capability significantly increases your build time and binary size.
@@ -355,10 +359,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.4.0 on Linux:
+for TensorFlow 1.6.0rc0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.4.0-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.6.0rc0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
@@ -389,7 +393,7 @@ TensorFlow programs:
 
 <pre>Hello, TensorFlow!</pre>
 
-If you are new to TensorFlow, see @{$get_started/get_started$Getting Started with
+If you are new to TensorFlow, see @{$get_started/premade_estimators$Getting Started with
 TensorFlow}.
 
 If the system outputs an error message instead of a greeting, see [Common
@@ -441,15 +445,28 @@ Stack Overflow and specify the `tensorflow` tag.
   <td>Invoking `python` or `ipython` generates the following error:
   <pre>ImportError: cannot import name pywrap_tensorflow</pre></td>
 </tr>
+
+<tr>
+  <td><a href="https://stackoverflow.com/questions/45276830">45276830</a></td>
+  <td><pre>external/local_config_cc/BUILD:50:5: in apple_cc_toolchain rule
+  @local_config_cc//:cc-compiler-darwin_x86_64: Xcode version must be specified
+  to use an Apple CROSSTOOL.</pre>
+  </td>
+</tr>
+
 </table>
 
 ## Tested source configurations
 **Linux**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.6.0rc0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
+<tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.5.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.4.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.5.4</td><td>6</td><td>8</td></tr>
- <tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.3.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>6</td><td>8</td></tr>
 <tr><td>tensorflow-1.2.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.2.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>5.1</td><td>8</td></tr>
@@ -462,8 +479,10 @@ Stack Overflow and specify the `tensorflow` tag.
 **Mac**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
- <tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.2.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.1.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.2</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.1.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.2</td><td>5.1</td><td>8</td></tr>
@@ -474,6 +493,10 @@ Stack Overflow and specify the `tensorflow` tag.
 **Windows**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.6.0rc0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.6.0rc0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
+<tr><td>tensorflow-1.5.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.5.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.4.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>6</td><td>8</td></tr>
 <tr><td>tensorflow-1.3.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
diff --git a/tensorflow/docs_src/install/install_windows.md b/tensorflow/docs_src/install/install_windows.md
index 8d0eb7966fdf17be1c259627a64803f0a392943a..657d37f6bcb953a2faa7cc93bdbb716a57788db8 100644
--- a/tensorflow/docs_src/install/install_windows.md
+++ b/tensorflow/docs_src/install/install_windows.md
@@ -30,13 +30,13 @@ If you are installing TensorFlow with GPU support using one of the mechanisms
 described in this guide, then the following NVIDIA software must be
 installed on your system:
 
-  * CUDA® Toolkit 8.0. For details, see
+  * CUDA® Toolkit 9.0. For details, see
     [NVIDIA's
     documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/)
     Ensure that you append the relevant Cuda pathnames to the `%PATH%`
     environment variable as described in the NVIDIA documentation.
-  * The NVIDIA drivers associated with CUDA Toolkit 8.0.
-  * cuDNN v6.0. For details, see
+  * The NVIDIA drivers associated with CUDA Toolkit 9.0.
+  * cuDNN v7.0. For details, see
     [NVIDIA's documentation](https://developer.nvidia.com/cudnn).
     Note that cuDNN is typically installed in a different location from the
     other CUDA DLLs. Ensure that you add the directory where you installed
@@ -153,7 +153,7 @@ TensorFlow programs:
 
 <pre>Hello, TensorFlow!</pre>
 
-If you are new to TensorFlow, see @{$get_started/get_started$Getting Started with
+If you are new to TensorFlow, see @{$get_started/premade_estimators$Getting Started with
 TensorFlow}.
 
 If the system outputs an error message instead of a greeting, see [Common
diff --git a/tensorflow/docs_src/install/leftnav_files b/tensorflow/docs_src/install/leftnav_files
index bc30d37bd08863d52e6ada370ac98e49b0aca54d..e523e06f67aad508238ee0965f34ebe16c77bf90 100644
--- a/tensorflow/docs_src/install/leftnav_files
+++ b/tensorflow/docs_src/install/leftnav_files
@@ -1,10 +1,16 @@
-install_linux.md
-install_mac.md
-install_windows.md
-install_sources.md
+index.md
+
+### Python
+install_linux.md: Ubuntu
+install_mac.md: MacOS
+install_windows.md: Windows
+install_sources.md: From source
 >>>
 migration.md
->>>
-install_java.md
-install_go.md
-install_c.md
+
+### Other Languages
+install_java.md: Java
+install_go.md: Go
+install_c.md: C
+
+
diff --git a/tensorflow/docs_src/mobile/leftnav_files b/tensorflow/docs_src/mobile/leftnav_files
index 4d2c3b62341717d90d6e4afabd105d7fd7a7866d..ac50f528ba468d8a830c059539d3399f413f39c8 100644
--- a/tensorflow/docs_src/mobile/leftnav_files
+++ b/tensorflow/docs_src/mobile/leftnav_files
@@ -1,6 +1,7 @@
 index.md
 ### TensorFlow Lite
 tflite/index.md
+tflite/demo_android.md
 >>>
 ### TensorFlow Mobile
 mobile_intro.md
diff --git a/tensorflow/docs_src/mobile/tflite/demo_android.md b/tensorflow/docs_src/mobile/tflite/demo_android.md
new file mode 100644
index 0000000000000000000000000000000000000000..79b567897cb8a38ed2e27e73aa7e8fee95f718b8
--- /dev/null
+++ b/tensorflow/docs_src/mobile/tflite/demo_android.md
@@ -0,0 +1,39 @@
+# TensorFlow Lite Demo for Android
+
+The TensorFlow Lite demo is a camera app that continuously classifies whatever
+it sees from your device's back camera, using a quantized MobileNet model.
+
+You'll need an Android device running Android 5.0 or higher to run the demo.
+
+To get you started working with TensorFlow Lite on Android, we'll walk you
+through building and deploying our TensorFlow demo app in Android Studio.
+
+It's also possible to build the demo app with Bazel, but we only recommend
+this for advanced users who are very familiar with the Bazel build
+environment. For more information on that, see our page [on Github](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite#building-tensorflow-lite-and-the-demo-app-from-source).
+
+## Build and deploy with Android Studio
+
+1. Clone the TensorFlow repository from GitHub if you haven't already:
+
+        git clone https://github.com/tensorflow/tensorflow
+
+2. Install the latest version of Android Studio from [here](https://developer.android.com/studio/index.html).
+
+3. From the **Welcome to Android Studio** screen, use the **Import Project
+   (Gradle, Eclipse ADT, etc)** option to import the
+   `tensorflow/contrib/lite/java/demo` directory as an existing Android Studio
+   Project.
+
+    Android Studio may prompt you to install Gradle upgrades and other tool
+    versions; you should accept these upgrades.
+
+4. Download the TensorFlow Lite MobileNet model from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip).
+
+    Unzip this and copy the `mobilenet_quant_v1_224.tflite` file to the assets
+    directory: `tensorflow/contrib/lite/java/demo/app/src/main/assets/`
+
+5. Build and run the app in Android Studio.
+
+You'll have to grant permissions for the app to use the device's camera. Point
+the camera at various objects and enjoy seeing how the model classifies things!
diff --git a/tensorflow/docs_src/mobile/tflite/index.md b/tensorflow/docs_src/mobile/tflite/index.md
index 49d93669a2808159a87538ab1191def5ed9ab9d4..beb24794fc98724e2423e02a71028f79be45cf75 100644
--- a/tensorflow/docs_src/mobile/tflite/index.md
+++ b/tensorflow/docs_src/mobile/tflite/index.md
@@ -95,7 +95,7 @@ following:
 
     All of the following models are guaranteed to work out of the box:
 
-    - Inception V3, a popular model for detecting the the dominant objects
+    - Inception V3, a popular model for detecting the dominant objects
       present in an image.
 
     - [MobileNets](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md),
@@ -155,7 +155,7 @@ retraining for both floating point and quantized inference.
 
 The following diagram shows the architectural design of TensorFlow Lite:
 
-<img src = "/images/tflite-architecture.jpg">
+![tensorflow lite architecture](https://www.tensorflow.org/images/tflite-architecture.jpg)
 
 Starting with a trained TensorFlow model on disk, you'll convert that model to
 the TensorFlow Lite file format (`.tflite`) using the TensorFlow Lite
diff --git a/tensorflow/docs_src/performance/datasets_performance.md b/tensorflow/docs_src/performance/datasets_performance.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f95e17c3598c23645fad07441c267266e5ef34e
--- /dev/null
+++ b/tensorflow/docs_src/performance/datasets_performance.md
@@ -0,0 +1,331 @@
+# Input Pipeline Performance Guide
+
+GPUs and TPUs can radically reduce the time required to execute a single
+training step. Achieving peak performance requires an efficient input pipeline
+that delivers data for the next step before the current step has finished. The
+`tf.data` API helps to build flexible and efficient input pipelines. This
+document explains the `tf.data` API's features and best practices for building
+high performance TensorFlow input pipelines across a variety of models and
+accelerators.
+
+This guide does the following:
+
+*   Illustrates that TensorFlow input pipelines are essentially an
+    [ETL](https://en.wikipedia.org/wiki/Extract,_transform,_load) process.
+*   Describes common performance optimizations in the context of the `tf.data`
+    API.
+*   Discusses the performance implications of the order in which you apply
+    transformations.
+*   Summarizes the best practices for designing performant TensorFlow input
+    pipelines.
+
+
+## Input Pipeline Structure
+
+A typical TensorFlow training input pipeline can be framed as an ETL process:
+
+1.  **Extract**: Read data from persistent storage -- either local (e.g. HDD or
+    SSD) or remote (e.g. [GCS](https://cloud.google.com/storage/) or
+    [HDFS](https://en.wikipedia.org/wiki/Apache_Hadoop#Hadoop_distributed_file_system)).
+2.  **Transform**: Use CPU cores to parse and perform preprocessing operations
+    on the data such as image decompression, data augmentation transformations
+    (such as random crop, flips, and color distortions), shuffling, and batching.
+3.  **Load**: Load the transformed data onto the accelerator device(s) (for
+    example, GPU(s) or TPU(s)) that execute the machine learning model.
+
+This pattern effectively utilizes the CPU, while reserving the accelerator for
+the heavy lifting of training your model. In addition, viewing input pipelines
+as an ETL process provides structure that facilitates the application of
+performance optimizations.
+
+When using the @{tf.estimator.Estimator} API, the first two phases (Extract and
+Transform) are captured in the `input_fn` passed to
+@{tf.estimator.Estimator.train}. In code, this might look like the following
+(naive, sequential) implementation:
+
+```
+def parse_fn(example):
+  "Parse TFExample records and perform simple data augmentation."
+  example_fmt = {
+    "image": tf.FixedLengthFeature((), tf.string, ""),
+    "label": tf.FixedLengthFeature((), tf.int64, -1)
+  }
+  parsed = tf.parse_single_example(example, example_fmt)
+  image = tf.image.decode_image(parsed["image"])
+  image = _augment_helper(image)  # augments image using slice, reshape, resize_bilinear
+  return image, parsed["label"]
+
+def input_fn():
+  files = tf.data.Dataset.list_files("/path/to/dataset/train-*.tfrecord")
+  dataset = files.interleave(tf.data.TFRecordDataset)
+  dataset = dataset.shuffle(buffer_size=FLAGS.shuffle_buffer_size)
+  dataset = dataset.map(map_func=parse_fn)
+  dataset = dataset.batch(batch_size=FLAGS.batch_size)
+  return dataset
+```
+
+The next section builds on this input pipeline, adding performance
+optimizations.
+
+## Optimizing Performance
+
+As new computing devices (such as GPUs and TPUs) make it possible to train
+neural networks at an increasingly fast rate, the CPU processing is prone to
+becoming the bottleneck. The `tf.data` API provides users with building blocks
+to design input pipelines that effectively utilize the CPU, optimizing each step
+of the ETL process.
+
+### Pipelining
+
+To perform a training step, you must first extract and transform the training
+data and then feed it to a model running on an accelerator. However, in a naive
+synchronous implementation, while the CPU is preparing the data, the accelerator
+is sitting idle. Conversely, while the accelerator is training the model, the
+CPU is sitting idle. The training step time is thus the sum of both CPU
+pre-processing time and the accelerator training time.
+
+**Pipelining** overlaps the preprocessing and model execution of a training
+step. While the accelerator is performing training step `N`, the CPU is
+preparing the data for step `N+1`. Doing so reduces the step time to the maximum
+(as opposed to the sum) of the training and the time it takes to extract and
+transform the data.
+
+Without pipelining, the CPU and the GPU/TPU sit idle much of the time:
+
+![without pipelining](https://www.tensorflow.org/images/datasets_without_pipelining.png)
+
+With pipelining, idle time diminishes significantly:
+
+![with pipelining](https://www.tensorflow.org/images/datasets_with_pipelining.png)
+
+The `tf.data` API provides a software pipelining mechanism through the
+@{tf.data.Dataset.prefetch} transformation, which can be used to decouple the
+time data is produced from the time it is consumed. In particular, the
+transformation uses a background thread and an internal buffer to prefetch
+elements from the input dataset ahead of the time they are requested. Thus, to
+achieve the pipelining effect illustrated above, you can add `prefetch(1)` as
+the final transformation to your dataset pipeline (or `prefetch(n)` if a single
+training step consumes n elements).
+
+To apply this change to our running example, change:
+
+```
+dataset = dataset.batch(batch_size=FLAGS.batch_size)
+return dataset
+```
+
+to:
+
+
+```
+dataset = dataset.batch(batch_size=FLAGS.batch_size)
+dataset = dataset.prefetch(buffer_size=FLAGS.prefetch_buffer_size)
+return dataset
+```
+
+Note that the prefetch transformation will yield benefits any time there is an
+opportunity to overlap the work of a "producer" with the work of a "consumer."
+The preceding recommendation is simply the most common application.
+
+### Parallelize Data Transformation
+
+When preparing a batch, input elements may need to be pre-processed. To this
+end, the `tf.data` API offers the @{tf.data.Dataset.map} transformation, which
+applies a user-defined function (for example, `parse_fn` from the running
+example) to each element of the input dataset. Because input elements are
+independent of one another, the pre-processing can be parallelized across
+multiple CPU cores. To make this possible, the `map` transformation provides the
+`num_parallel_calls` argument to specify the level of parallelism. For example,
+the following diagram illustrates the effect of setting `num_parallel_calls=2`
+to the `map` transformation:
+
+![parallel map](https://www.tensorflow.org/images/datasets_parallel_map.png)
+
+Choosing the best value for the `num_parallel_calls` argument depends on your
+hardware, characteristics of your training data (such as its size and shape),
+the cost of your map function, and what other processing is happening on the
+CPU at the same time; a simple heuristic is to use the number of available CPU
+cores. For instance, if the machine executing the example above had 4 cores, it
+would have been more efficient to set `num_parallel_calls=4`. On the other hand,
+setting `num_parallel_calls` to a value much greater than the number of
+available CPUs can lead to inefficient scheduling, resulting in a slowdown.
+
+To apply this change to our running example, change:
+
+```
+dataset = dataset.map(map_func=parse_fn)
+```
+
+to:
+
+```
+dataset = dataset.map(map_func=parse_fn, num_parallel_calls=FLAGS.num_parallel_calls)
+```
+
+Furthermore, if your batch size is in the hundreds or thousands, your pipeline
+will likely additionally benefit from parallelizing the batch creation. To this
+end, the `tf.data` API provides the @{tf.contrib.data.map_and_batch}
+transformation, which effectively "fuses" the map and batch transformations.
+
+To apply this change to our running example, change:
+
+```
+dataset = dataset.map(map_func=parse_fn, num_parallel_calls=FLAGS.num_parallel_calls)
+dataset = dataset.batch(batch_size=FLAGS.batch_size)
+```
+
+to:
+
+```
+dataset = dataset.apply(tf.contrib.data.map_and_batch(
+    map_func=parse_fn, batch_size=FLAGS.batch_size))
+```
+
+### Parallelize Data Extraction
+
+In a real-world setting, the input data may be stored remotely (for example,
+GCS or HDFS), either because the input data would not fit locally or because the
+training is distributed and it would not make sense to replicate the input data
+on every machine. A dataset pipeline that works well when reading data locally
+might become bottlenecked on I/O when reading data remotely because of the
+following differences between local and remote storage:
+
+
+*   **Time-to-first-byte:** Reading the first byte of a file from remote storage
+    can take orders of magnitude longer than from local storage.
+*   **Read throughput:** While remote storage typically offers large aggregate
+    bandwidth, reading a single file might only be able to utilize a small
+    fraction of this bandwidth.
+
+In addition, once the raw bytes are read into memory, it may also be necessary
+to deserialize or decrypt the data
+(e.g. [protobuf](https://developers.google.com/protocol-buffers/)), which adds
+additional overhead. This overhead is present irrespective of whether the data
+is stored locally or remotely, but can be worse in the remote case if data is
+not prefetched effectively.
+
+To mitigate the impact of the various data extraction overheads, the `tf.data`
+API offers the @{tf.contrib.data.parallel_interleave} transformation. Use this
+transformation to parallelize the execution of and interleave the contents of
+other datasets (such as data file readers). The
+number of datasets to overlap can be specified by the `cycle_length` argument.
+
+The following diagram illustrates the effect of supplying `cycle_length=2` to
+the `parallel_interleave` transformation:
+
+![parallel io](https://www.tensorflow.org/images/datasets_parallel_io.png)
+
+To apply this change to our running example, change:
+
+```
+dataset = files.interleave(tf.data.TFRecordDataset)
+```
+
+to:
+
+```
+dataset = files.apply(tf.contrib.data.parallel_interleave(
+    tf.data.TFRecordDataset, cycle_length=FLAGS.num_parallel_readers))
+```
+
+
+The throughput of remote storage systems can vary over time due to load or
+network events. To account for this variance, the `parallel_interleave`
+transformation can optionally use prefetching. (See
+@{tf.contrib.data.parallel_interleave} for details).
+
+By default, the `parallel_interleave` transformation provides a deterministic
+ordering of elements to aid reproducibility. As an alternative to prefetching
+(which may be ineffective in some cases), the `parallel_interleave`
+transformation also provides an option that can boost performance at the expense
+of ordering guarantees. In particular, if the `sloppy` argument is set to true,
+the transformation may depart from its otherwise deterministic ordering, by
+temporarily skipping over files whose elements are not available when the next
+element is requested.
+
+## Performance Considerations
+
+The `tf.data` API is designed around composable transformations to provide its
+users with flexibility. Although many of these transformations are commutative,
+the ordering of certain transformations has performance implications.
+
+### Map and Batch
+
+Invoking the user-defined function passed into the `map` transformation has
+overhead related to scheduling and executing the user-defined function.
+Normally, this overhead is small compared to the amount of computation performed
+by the function. However, if `map` does little work, this overhead can dominate
+the total cost. In such cases, we recommend vectorizing the user-defined
+function (that is, have it operate over a batch of inputs at once) and apply the
+`batch` transformation _before_ the `map` transformation.
+
+### Map and Cache
+
+The @{tf.data.Dataset.cache} transformation can cache a dataset, either in
+memory or on local storage. If the user-defined function passed into the `map`
+transformation is expensive, apply the cache transformation after the map
+transformation as long as the resulting dataset can still fit into memory or
+local storage. If the user-defined function increases the space required to
+store the dataset beyond the cache capacity, consider pre-processing your data
+before your training job to reduce resource usage.
+
+### Map and Interleave / Prefetch / Shuffle
+
+A number of transformations, including `interleave`, `prefetch`, and `shuffle`,
+maintain an internal buffer of elements. If the user-defined function passed
+into the `map` transformation changes the size of the elements, then the
+ordering of the map transformation and the transformations that buffer elements
+affects the memory usage. In general, we recommend choosing the order that
+results in lower memory footprint, unless different ordering is desirable for
+performance (for example, to enable fusing of the map and batch transformations).
+
+### Repeat and Shuffle
+
+The @{tf.data.Dataset.repeat} transformation repeats the input data a finite (or
+infinite) number of times; each repetition of the data is typically referred to
+as an _epoch_. The @{tf.data.Dataset.shuffle} transformation randomizes the
+order of the dataset's examples.
+
+If the `repeat` transformation is applied before the `shuffle` transformation,
+then the epoch boundaries are blurred. That is, certain elements can be repeated
+before other elements appear even once. On the other hand, if the `shuffle`
+transformation is applied before the repeat transformation, then performance
+might slow down at the beginning of each epoch related to initialization of the
+internal state of the `shuffle` transformation. In other words, the former
+(`repeat` before `shuffle`) provides better performance, while the latter
+(`shuffle` before `repeat`) provides stronger ordering guarantees.
+
+When possible, we recommend using the fused
+@{tf.contrib.data.shuffle_and_repeat} transformation, which combines the best of
+both worlds (good performance and strong ordering guarantees). Otherwise, we
+recommend shuffling before repeating.
+
+## Summary of Best Practices
+
+Here is a summary of the best practices for designing input pipelines:
+
+*   Use the `prefetch` transformation to overlap the work of a producer and
+    consumer. In particular, we recommend adding prefetch(n) (where n is the
+    number of elements / batches consumed by a training step) to the end of your
+    input pipeline to overlap the transformations performed on the CPU with the
+    training done on the accelerator.
+*   Parallelize the `map` transformation by setting the `num_parallel_calls`
+    argument. We recommend using the number of available CPU cores for its value.
+*   If you are combining pre-processed elements into a batch using the `batch`
+    transformation, we recommend using the fused `map_and_batch` transformation;
+    especially if you are using large batch sizes.
+*   If you are working with data stored remotely and / or requiring
+    deserialization, we recommend using the `parallel_interleave`
+    transformation to overlap the reading (and deserialization) of data from
+    different files.
+*   Vectorize cheap user-defined functions passed in to the `map` transformation
+    to amortize the overhead associated with scheduling and executing the
+    function.
+*   If your data can fit into memory, use the `cache` transformation to cache it
+    in memory during the first epoch, so that subsequent epochs can avoid the
+    overhead associated with reading, parsing, and transforming it.
+*   If your pre-processing increases the size of your data, we recommend
+    applying the `interleave`, `prefetch`, and `shuffle` first (if possible) to
+    reduce memory usage.
+*   We recommend applying the `shuffle` transformation _before_ the `repeat`
+    transformation, ideally using the fused `shuffle_and_repeat` transformation.
diff --git a/tensorflow/docs_src/performance/leftnav_files b/tensorflow/docs_src/performance/leftnav_files
index d22847322084d584a4ddc713486109ede838fee8..316f023f43dcfe781c7819d1681335267ddd5f76 100644
--- a/tensorflow/docs_src/performance/leftnav_files
+++ b/tensorflow/docs_src/performance/leftnav_files
@@ -1,8 +1,9 @@
 performance_guide.md
+datasets_performance.md
 performance_models.md
 benchmarks.md
-quantization.md
->>>
+
+### XLA
 xla/index.md
 xla/broadcasting.md
 xla/developing_new_backend.md
@@ -10,3 +11,6 @@ xla/jit.md
 xla/operation_semantics.md
 xla/shapes.md
 xla/tfcompile.md
+
+### Quantization
+quantization.md
diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md
index 17f71a6d7705c75e7322932cc652ec6728c8c626..cd47fc2803bc1429d28bd0ae4c2ad68e632a6f03 100644
--- a/tensorflow/docs_src/performance/performance_guide.md
+++ b/tensorflow/docs_src/performance/performance_guide.md
@@ -18,6 +18,7 @@ following sections:
 *   [Input pipeline optimizations](#input-pipeline-optimization)
 *   [Data formats](#data-formats)
 *   [Common fused Ops](#common-fused-ops)
+*   [RNN Performance](#rnn-performance)
 *   [Building and installing from source](#building-and-installing-from-source)
 
 ### Input pipeline optimization
@@ -65,22 +66,25 @@ with tf.device('/cpu:0'):
 If using `tf.estimator.Estimator` the input function is automatically placed on
 the CPU.
 
-#### Using the Dataset API
+#### Using the tf.data API
 
-The @{$datasets$Dataset API} is replacing `queue_runner` as the recommended API
-for building input pipelines. The API was added to contrib as part of TensorFlow
-1.2 and will move to core in the near future. This
+The @{$datasets$tf.data API} is replacing `queue_runner` as the recommended API
+for building input pipelines. This
 [ResNet example](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10_estimator/cifar10_main.py)
 ([arXiv:1512.03385](https://arxiv.org/abs/1512.03385))
-training CIFAR-10 illustrates the use of the Dataset API along with
-`tf.estimator.Estimator`. The Dataset API utilizes C++ multi-threading and has a
-much lower overhead than the Python-based `queue_runner` that is limited by
-Python's multi-threading performance.
+training CIFAR-10 illustrates the use of the `tf.data` API along with
+`tf.estimator.Estimator`.
+
+The `tf.data` API utilizes C++ multi-threading and has a much lower overhead
+than the Python-based `queue_runner` that is limited by Python's multi-threading
+performance. A detailed performance guide for the `tf.data` API can be found
+[here](#datasets_performance).
 
 While feeding data using a `feed_dict` offers a high level of flexibility, in
-most instances using `feed_dict` does not scale optimally. However, in instances
-where only a single GPU is being used the difference can be negligible. Using
-the Dataset API is still strongly recommended. Try to avoid the following:
+general `feed_dict` does not provide a scalable solution. If only a single GPU
+is used, the difference between the `tf.data` API and `feed_dict` performance
+may be negligible. Our recommendation is to avoid using `feed_dict` for all but
+trivial examples. In particular, avoid using `feed_dict` with large inputs:
 
 ```python
 # feed_dict often results in suboptimal performance when using large inputs.
@@ -197,6 +201,53 @@ since before TensorFlow 1.0.
 bn = tf.contrib.layers.batch_norm(input_layer, fused=True, data_format='NCHW')
 ```
 
+### RNN Performance
+
+There are many ways to specify an RNN computation in TensorFlow and they have
+trade-offs with respect to model flexibility and performance. The
+@{tf.nn.rnn_cell.BasicLSTMCell} should be considered a reference implementation
+and used only as a last resort when no other options will work.
+
+When using one of the cells, rather than the fully fused RNN layers, you have a
+choice of whether to use @{tf.nn.static_rnn} or @{tf.nn.dynamic_rnn}.  There
+shouldn't generally be a performance difference at runtime, but large unroll
+amounts can increase the graph size of the @{tf.nn.static_rnn} and cause long
+compile times.  An additional advantage of @{tf.nn.dynamic_rnn} is that it can
+optionally swap memory from the GPU to the CPU to enable training of very long
+sequences.  Depending on the model and hardware configuration, this can come at
+a performance cost.  It is also possible to run multiple iterations of
+@{tf.nn.dynamic_rnn} and the underlying @{tf.while_loop} construct in parallel,
+although this is rarely useful with RNN models as they are inherently
+sequential.
+
+On NVIDIA GPUs, the use of @{tf.contrib.cudnn_rnn} should always be preferred
+unless you want layer normalization, which it doesn't support.  It is often at
+least an order of magnitude faster than @{tf.contrib.rnn.BasicLSTMCell} and
+@{tf.contrib.rnn.LSTMBlockCell} and uses 3-4x less memory than
+@{tf.contrib.rnn.BasicLSTMCell}.
+
+If you need to run one step of the RNN at a time, as might be the case in
+reinforcement learning with a recurrent policy, then you should use the
+@{tf.contrib.rnn.LSTMBlockCell} with your own environment interaction loop
+inside a @{tf.while_loop} construct. Running one step of the RNN at a time and
+returning to Python is possible, but it will be slower.
+
+On CPUs, mobile devices, and if @{tf.contrib.cudnn_rnn} is not available on
+your GPU, the fastest and most memory efficient option is
+@{tf.contrib.rnn.LSTMBlockFusedCell}.
+
+For all of the less common cell types like @{tf.contrib.rnn.NASCell},
+@{tf.contrib.rnn.PhasedLSTMCell}, @{tf.contrib.rnn.UGRNNCell},
+@{tf.contrib.rnn.GLSTMCell}, @{tf.contrib.rnn.Conv1DLSTMCell},
+@{tf.contrib.rnn.Conv2DLSTMCell}, @{tf.contrib.rnn.LayerNormBasicLSTMCell},
+etc., one should be aware that they are implemented in the graph like
+@{tf.contrib.rnn.BasicLSTMCell} and as such will suffer from the same poor
+performance and high memory usage.  One should consider whether or not those
+trade-offs are worth it before using these cells. For example, while layer
+normalization can speed up convergence, because cuDNN is 20x faster the fastest
+wall clock time to convergence is usually obtained without it.
+
+
 ### Building and installing from source
 
 The default TensorFlow binaries target the broadest range of hardware to make
@@ -447,7 +498,7 @@ For TensorFlow source versions after 1.3.0:
 ```bash
 ./configure
 # Pick the desired options
-bazel build --config=mkl -c opt //tensorflow/tools/pip_package:build_pip_package
+bazel build --config=mkl --config=opt //tensorflow/tools/pip_package:build_pip_package
 
 ```
 
diff --git a/tensorflow/docs_src/performance/xla/broadcasting.md b/tensorflow/docs_src/performance/xla/broadcasting.md
index 8dbf0d0446f41b26489912734bc11704e61efeab..ca3bddf758cf64e7c580f9babfe559ae23708705 100644
--- a/tensorflow/docs_src/performance/xla/broadcasting.md
+++ b/tensorflow/docs_src/performance/xla/broadcasting.md
@@ -33,11 +33,11 @@ In Numpy, this is called [broadcasting]
 
 ## Principles
 
-XLA is a low-level infrastructure with a XLA language this is as strict and
-explicit as possible, avoiding implicit and "magical" features that may make
-some computations slightly easier to define, at the cost of more assumptions
-baked into user code that will be difficult to change in the long term. If
-necessary, implicit and magical features can be added in client-level wrappers.
+The XLA language is as strict and explicit as possible, avoiding implicit and
+"magical" features. Such features may make some computations slightly easier to
+define, at the cost of more assumptions baked into user code that will be
+difficult to change in the long term. If necessary, implicit and magical
+features can be added in client-level wrappers.
 
 In regards to broadcasting, explicit broadcasting specifications on operations
 between arrays of different ranks is required. This is different from Numpy,
diff --git a/tensorflow/docs_src/performance/xla/developing_new_backend.md b/tensorflow/docs_src/performance/xla/developing_new_backend.md
index 28010ff1b785813e15c56d4bb5c26b0bcedce3d9..74ea15bb2bac2014257f0b1719820f7ee313b66b 100644
--- a/tensorflow/docs_src/performance/xla/developing_new_backend.md
+++ b/tensorflow/docs_src/performance/xla/developing_new_backend.md
@@ -62,11 +62,11 @@ If it is not possible to utilize LLVM, then the best option is to implement a
 new backend for XLA for the desired hardware. This option requires the most
 effort. The classes that need to be implemented are as follows:
 
-*   [StreamExecutor](https://www.tensorflow.org/code/tensorflow/stream_executor/stream_executor.h):
+*   [`StreamExecutor`](https://www.tensorflow.org/code/tensorflow/stream_executor/stream_executor.h):
     For many devices not all methods of `StreamExecutor` are needed. See
     existing `StreamExecutor` implementations for details.
-*   [xla::Compiler](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/compiler.h):
-    This class encapsulates the compilation of a HLO computation into an
+*   [`xla::Compiler`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/compiler.h):
+    This class encapsulates the compilation of an HLO computation into an
     `xla::Executable`.
 *   [`xla::Executable`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/executable.h):
     This class is used to launch a compiled computation on the platform.
diff --git a/tensorflow/docs_src/performance/xla/index.md b/tensorflow/docs_src/performance/xla/index.md
index 19045b45d92a2ca42c3943bc0662ca42bd0c2c24..a8847830740302a0de6f57cb3b7a0d6c7e096d32 100644
--- a/tensorflow/docs_src/performance/xla/index.md
+++ b/tensorflow/docs_src/performance/xla/index.md
@@ -65,18 +65,19 @@ The following diagram shows the compilation process in XLA:
   <img src="https://www.tensorflow.org/images/how-does-xla-work.png">
 </div>
 
-XLA comes with several optimizations and analyzes that are target-independent,
-such as [CSE](https://en.wikipedia.org/wiki/Common_subexpression_elimination),
+XLA comes with several optimizations and analysis passes that are
+target-independent, such as
+[CSE](https://en.wikipedia.org/wiki/Common_subexpression_elimination),
 target-independent operation fusion, and buffer analysis for allocating runtime
 memory for the computation.
 
 After the target-independent step, XLA sends the HLO computation to a backend.
-The backend can perform further HLO-level analyzes and optimizations, this time
-with target specific information and needs in mind. For example, the XLA GPU
-backend may perform operation fusion beneficial specifically for the GPU
-programming model and determine how to partition the computation into streams.
-At this stage, backends may also pattern-match certain operations or
-combinations thereof to optimized library calls.
+The backend can perform further HLO-level optimizations, this time with target
+specific information and needs in mind. For example, the XLA GPU backend may
+perform operation fusion beneficial specifically for the GPU programming model
+and determine how to partition the computation into streams. At this stage,
+backends may also pattern-match certain operations or combinations thereof to
+optimized library calls.
 
 The next step is target-specific code generation. The CPU and GPU backends
 included with XLA use [LLVM](http://llvm.org) for low-level IR, optimization,
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index 217f542caa64a5fafb61536a3b9591cae42b517b..5431572db83a84c034c56656928bdc927e708dc9 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -13,6 +13,154 @@ arbitrary-dimensional array. For convenience, special cases have more specific
 and familiar names; for example a *vector* is a 1-dimensional array and a
 *matrix* is a 2-dimensional array.
 
+## BatchNormGrad
+
+See also
+[`ComputationBuilder::BatchNormGrad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h)
+and [the original batch normalization paper](https://arxiv.org/abs/1502.03167)
+for a detailed description of the algorithm.
+
+Calculates gradients of batch norm.
+
+<b> `BatchNormGrad(operand, scale, mean, variance, grad_output, epsilon, feature_index)` </b>
+
+| Arguments       | Type                    | Semantics                        |
+| --------------  | ----------------------- | -------------------------------- |
+| `operand`       | `ComputationDataHandle` | n dimensional array to be        |
+:                 :                         : normalized (x)                   :
+| `scale`         | `ComputationDataHandle` | 1 dimensional array              |
+:                 :                         : (\\(\gamma\\))                   :
+| `mean`          | `ComputationDataHandle` | 1 dimensional array (\\(\mu\\))  |
+| `variance`      | `ComputationDataHandle` | 1 dimensional array              |
+:                 :                         : (\\(\sigma^2\\))                 :
+| `grad_output`   | `ComputationDataHandle` | Gradients passed to              |
+:                 :                         : `BatchNormTraining`              :
+:                 :                         : (\\( \nabla y\\))                :
+| `epsilon`       | `float`                 | Epsilon value (\\(\epsilon\\))   |
+| `feature_index` | `int64`                 | Index to feature dimension in    |
+:                 :                         : `operand`                        :
+
+For each feature in the feature dimension (`feature_index` is the index for the
+feature dimension in `operand`), the operation calculates the gradients with
+respect to `operand`, `offset` and `scale` across all the other dimensions. The
+`feature_index` must be a valid index for the feature dimension in `operand`.
+
+The three gradients are defined by the following formulas:
+
+\\( \nabla x = \nabla y * \gamma * \sqrt{\sigma^2+\epsilon} \\)
+
+\\( \nabla \gamma = sum(\nabla y * (x - \mu) * \sqrt{\sigma^2 + \epsilon}) \\)
+
+\\( \nabla \beta = sum(\nabla y) \\)
+
+The inputs `mean` and `variance` represents moments value
+across batch and spatial dimensions.
+
+The output type is a tuple of three handles:
+
+|Outputs       | Type                    | Semantics                           |
+|------------- | ----------------------- | ------------------------------------|
+|`grad_operand`| `ComputationDataHandle` | gradient with respect to input      |
+:              :                         : `operand`                           :
+|`grad_scale`  | `ComputationDataHandle` | gradient with respect to input      |
+:              :                         : `scale`                             :
+|`grad_offset` | `ComputationDataHandle` | gradient with respect to input      |
+:              :                         : `offset`                            :
+
+
+## BatchNormInference
+
+See also
+[`ComputationBuilder::BatchNormInference`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h) and
+[the original batch normalization paper](https://arxiv.org/abs/1502.03167)
+for a detailed description of the algorithm.
+
+Normalizes an array across batch and spatial dimensions.
+
+<b> `BatchNormInference(operand, scale, offset, mean, variance, epsilon, feature_index)` </b>
+
+| Arguments       | Type                    | Semantics                       |
+| --------------  | ----------------------- | ------------------------------- |
+| `operand`       | `ComputationDataHandle` | n dimensional array to be       |
+:                 :                         : normalized                      :
+| `scale`         | `ComputationDataHandle` | 1 dimensional array             |
+| `offset`        | `ComputationDataHandle` | 1 dimensional array             |
+| `mean`          | `ComputationDataHandle` | 1 dimensional array             |
+| `variance`      | `ComputationDataHandle` | 1 dimensional array             |
+| `epsilon`       | `float`                 | Epsilon value                   |
+| `feature_index` | `int64`                 | Index to feature dimension in   |
+:                 :                         : `operand`                       :
+
+For each feature in the feature dimension (`feature_index` is the index for the
+feature dimension in `operand`), the operation calculates the mean and variance
+across all the other dimensions and uses the mean and variance to normalize each
+element in `operand`. The `feature_index` must be a valid index for the feature
+dimension in `operand`.
+
+`BatchNormInference`  is equivalent to calling `BatchNormTraining` without
+computing `mean` and `variance` for each batch. It uses the input `mean` and
+`variance` instead as estimated values. The purpose of this op is to reduce
+latency in inference, hence the name `BatchNormInference`.
+
+The output is an n-dimensional, normalized array with the same shape as input
+`operand`.
+
+## BatchNormTraining
+
+See also
+[`ComputationBuilder::BatchNormTraining`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h) and
+[`the original batch normalization paper`](https://arxiv.org/abs/1502.03167)
+for a detailed description of the algorithm.
+
+Normalizes an array across batch and spatial dimensions.
+
+<b> `BatchNormTraining(operand, scale, offset, epsilon, feature_index)` </b>
+
+| Arguments       | Type                    | Semantics                        |
+| --------------- | ----------------------- | -------------------------------- |
+| `operand`       | `ComputationDataHandle` | n dimensional array to be        |
+:                 :                         : normalized                       :
+| `scale`         | `ComputationDataHandle` | 1 dimensional array              |
+:                 :                         : (\\(\gamma\\))                   :
+| `offset`        | `ComputationDataHandle` | 1 dimensional array              |
+:                 :                         : (\\(\beta\\ )                    :
+| `epsilon`       | `float`                 | Epsilon value (\\(\epsilon\\))   |
+| `feature_index` | `int64`                 | Index to feature dimension       |
+:                 :                         : in `operand`                     :
+
+For each feature in the feature dimension (`feature_index` is the index for the
+feature dimension in `operand`), the operation calculates the mean and variance
+across all the other dimensions and uses the mean and variance to normalize each
+element in `operand`. The `feature_index` must be a valid index for the feature
+dimension in `operand`.
+
+The algorithm goes as follows for each batch in `operand` \\(x\\) that
+contains `m` elements with `w` and `h` as the size of spatial dimensions (
+assuming `operand` is an 4 dimensional array):
+
+- Calculates batch mean \\(\mu_l\\) for each feature `l` in feature dimension:
+\\(\mu_l=\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h x_{ijkl}\\)
+
+- Calculates batch variance \\(\sigma^2_l\\):
+\\(\sigma^2_l=\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h (x_{ijkl} - \mu_l)^2\\)
+
+- Normalizes, scales and shifts:
+\\(y_{ijkl}=\frac{\gamma_l(x_{ijkl}-\mu_l)}{\sqrt[2]{\sigma^2_l+\epsilon}}+\beta_l\\)
+
+The epsilon value, usually a small number, is added to avoid divide-by-zero errors.
+
+The output type is a tuple of three `ComputationDataHandle`s:
+
+| Outputs      | Type                    | Semantics                            |
+| ------------ | ----------------------- | -------------------------------------|
+| `output`     | `ComputationDataHandle` | n dimensional array with the same    |
+:              :                         : shape as input `operand` (y)         :
+| `batch_mean` | `ComputationDataHandle` | 1 dimensional array (\\(\mu\\))      |
+| `batch_var`  | `ComputationDataHandle` | 1 dimensional array (\\(\sigma^2\\)) |
+
+The `batch_mean` and `batch_var` are moments calculated across the batch and
+spatial dimensions using the formulas above.
+
 ## BitcastConvertType
 
 See also
@@ -104,7 +252,7 @@ Clamps an operand to within the range between a minimum and maximum value.
 Given an operand and minimum and maximum values, returns the operand if it is in
 the range between the minimum and maximum, else returns the minimum value if the
 operand is below this range or the maximum value if the operand is above this
-range.  That is, `clamp(a, x, b) =  max(min(a, x), b)`.
+range.  That is, `clamp(a, x, b) =  min(max(a, x), b)`.
 
 All three arrays must be the same shape. Alternately, as a restricted form of
 [broadcasting](broadcasting.md), `min` and/or `max` can be a scalar of type `T`.
@@ -239,39 +387,34 @@ Diagram:
   <img style="width:100%" src="https://www.tensorflow.org/images/ops_concatenate.png">
 </div>
 
-## ConvertElementType
-
-See also
-[`ComputationBuilder::ConvertElementType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
-
-Similar to an element-wise `static_cast` in C++, performs an element-wise
-conversion operation from a data shape to a target shape. The dimensions must
-match, and the conversion is an element-wise one; e.g. `s32` elements become
-`f32` elements via an `s32`-to-`f32` conversion routine.
+## Conditional
 
-<b> `ConvertElementType(operand, new_element_type)` </b>
+See also [`ComputationBuilder::Conditional`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
 
-Arguments          | Type                    | Semantics
------------------- | ----------------------- | ---------------------------
-`operand`          | `ComputationDataHandle` | array of type T with dims D
-`new_element_type` | `PrimitiveType`         | type U
+<b> `Conditional(pred, true_operand, true_computation, false_operand,
+    false_computation)` </b>
 
-The dimensions of the operand and the target shape must match. The source and
-destination element types must not be tuples.
+| Arguments           | Type                    | Semantics                   |
+| ------------------- | ----------------------- | --------------------------- |
+| `pred`              | `ComputationDataHandle` | Scalar of type `PRED`       |
+| `true_operand`      | `ComputationDataHandle` | Argument of type `T_0`      |
+| `true_computation`  | `Computation`           | Computation of type `T_0 -> |
+:                     :                         : S`                          :
+| `false_operand`     | `ComputationDataHandle` | Argument of type `T_1`      |
+| `false_computation` | `Computation`           | Computation of type `T_1 -> |
+:                     :                         : S`                          :
 
-A conversion such as `T=s32` to `U=f32` will perform a normalizing int-to-float
-conversion routine such as round-to-nearest-even.
+Executes `true_computation` if `pred` is `true`, `false_computation` if `pred`
+is `false`, and returns the result.
 
-> Note: The precise float-to-int and visa-versa conversions are currently
-> unspecified, but may become additional arguments to the convert operation in
-> the future.  Not all possible conversions have been implemented for all
->targets.
+The `true_computation` must take in a single argument of type `T_0` and will be
+invoked with `true_operand` which must be of the same type. The
+`false_computation` must take in a single argument of type `T_1` and will be
+invoked with `false_operand` which must be of the same type. The type of the
+returned value of `true_computation` and `false_computation` must be the same.
 
-```
-let a: s32[3] = {0, 1, 2};
-let b: f32[3] = convert(a, f32);
-then b == f32[3]{0.0, 1.0, 2.0}
-```
+Note that only one of `true_computation` and `false_computation` will be
+executed depending on the value of `pred`.
 
 ## Conv (convolution)
 
@@ -395,6 +538,40 @@ for (b, oz, oy, ox) {  // output coordinates
 }
 ```
 
+## ConvertElementType
+
+See also
+[`ComputationBuilder::ConvertElementType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+
+Similar to an element-wise `static_cast` in C++, performs an element-wise
+conversion operation from a data shape to a target shape. The dimensions must
+match, and the conversion is an element-wise one; e.g. `s32` elements become
+`f32` elements via an `s32`-to-`f32` conversion routine.
+
+<b> `ConvertElementType(operand, new_element_type)` </b>
+
+Arguments          | Type                    | Semantics
+------------------ | ----------------------- | ---------------------------
+`operand`          | `ComputationDataHandle` | array of type T with dims D
+`new_element_type` | `PrimitiveType`         | type U
+
+The dimensions of the operand and the target shape must match. The source and
+destination element types must not be tuples.
+
+A conversion such as `T=s32` to `U=f32` will perform a normalizing int-to-float
+conversion routine such as round-to-nearest-even.
+
+> Note: The precise float-to-int and visa-versa conversions are currently
+> unspecified, but may become additional arguments to the convert operation in
+> the future.  Not all possible conversions have been implemented for all
+>targets.
+
+```
+let a: s32[3] = {0, 1, 2};
+let b: f32[3] = convert(a, f32);
+then b == f32[3]{0.0, 1.0, 2.0}
+```
+
 ## CrossReplicaSum
 
 See also
@@ -409,9 +586,9 @@ Computes a sum across replicas.
 | `operand`    | `ComputationDataHandle` | Array to sum across replicas.      |
 
 The output shape is the same as the input shape. For example, if there are two
-replicas and the operand has the value `(1.0, 2.5)` and `(3.0, 5.1)`
+replicas and the operand has the value `(1.0, 2.5)` and `(3.0, 5.25)`
 respectively on the two replicas, then the output value from this op will be
-`(4.0, 7.6)` on both replicas.
+`(4.0, 7.75)` on both replicas.
 
 Computing the result of CrossReplicaSum requires having one input from each
 replica, so if one replica executes a CrossReplicaSum node more times than
@@ -511,282 +688,344 @@ contracted dimensions of `lhs` and `rhs` must be of the same size. In practice,
 it can be used to perform dot products between vectors, vector/matrix
 multiplications or matrix/matrix multiplications.
 
-## Element-wise binary arithmetic operations
+## DotGeneral
 
 See also
-[`ComputationBuilder::Add`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`ComputationBuilder::DotGeneral`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
 
-A set of element-wise binary arithmetic operations is supported.
+<b> `DotGeneral(lhs, rhs, dimension_numbers)` </b>
 
-<b> `Op(lhs, rhs)` </b>
+| Arguments | Type                    | Semantics
+| --------- | ----------------------- | ---------------
+| `lhs`     | `ComputationDataHandle` | array of type T
+| `rhs`     | `ComputationDataHandle` | array of type T
+| `dimension_numbers` | `DotDimensionNumbers` | array of type T
 
-Where `Op` is one of `Add` (addition), `Sub` (subtraction), `Mul`
-(multiplication), `Div` (division), `Rem` (remainder), `Max` (maximum), `Min`
-(minimum), `LogicalAnd` (logical AND), or `LogicalOr` (logical OR).
+As Dot, but allows contracting and batch dimension numbers to be specified for
+both the 'lhs' and 'rhs'.
 
-Arguments | Type                    | Semantics
---------- | ----------------------- | ----------------------------------------
-`lhs`     | `ComputationDataHandle` | left-hand-side operand: array of type T
-`rhs`     | `ComputationDataHandle` | right-hand-side operand: array of type T
+| DotDimensionNumbers Fields | Type                    | Semantics
+| --------- | ----------------------- | ---------------
+| 'lhs_contracting_dimensions' | repeated int64 | 'lhs' contracting dimension numbers |
+| 'rhs_contracting_dimensions' | repeated int64 | 'rhs' contracting dimension numbers |
+| 'lhs_batch_dimensions' | repeated int64 | 'lhs' batch dimension numbers |
+| 'rhs_batch_dimensions' | repeated int64 | 'rhs' batch dimension numbers |
 
-The arguments' shapes have to be either similar or compatible. See the
-@{$broadcasting$broadcasting} documentation about what it means for shapes to
-be compatible. The result of an operation has a shape which is the result of
-broadcasting the two input arrays. In this variant, operations between arrays of
-different ranks are *not* supported, unless one of the operands is a scalar.
+DotGeneral performs the sum of products over contracting dimensions specified
+in 'dimension_numbers'.
 
-When `Op` is `Rem`, the sign of the result is taken from the dividend, and the
-absolute value of the result is always less than the divisor's absolute value.
+Associated contracting dimension numbers from the 'lhs' and 'rhs' do not need
+to be the same, but must be listed in the same order in both
+'lhs/rhs_contracting_dimensions' arrays and have the same dimension sizes.
+There must be exactly one contracting dimension on both 'lhs' and 'rhs'.
 
-An alternative variant with different-rank broadcasting support exists for these
-operations:
+Example with contracting dimension numbers:
 
-<b> `Op(lhs, rhs, broadcast_dimensions)` </b>
+```
+lhs = { {1.0, 2.0, 3.0},
+        {4.0, 5.0, 6.0} }
 
-Where `Op` is the same as above. This variant of the operation should be used
-for arithmetic operations between arrays of different ranks (such as adding a
-matrix to a vector).
+rhs = { {1.0, 1.0, 1.0},
+        {2.0, 2.0, 2.0} }
 
-The additional `broadcast_dimensions` operand is a slice of integers used to
-expand the rank of the lower-rank operand up to the rank of the higher-rank
-operand. `broadcast_dimensions` maps the dimensions of the lower-rank shape to
-the dimensions of the higher-rank shape. The unmapped dimensions of the expanded
-shape are filled with dimensions of size one. Degenerate-dimension broadcasting
-then broadcasts the shapes along these degenerate dimension to equalize the
-shapes of both operands. The semantics are described in detail on the
-@{$broadcasting$broadcasting page}.
+DotDimensionNumbers dnums;
+dnums.add_lhs_contracting_dimensions(1);
+dnums.add_rhs_contracting_dimensions(1);
 
-## Element-wise comparison operations
+DotGeneral(lhs, rhs, dnums) -> { {6.0, 12.0},
+                                 {15.0, 30.0} }
+```
 
-See also
-[`ComputationBuilder::Eq`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+Associated batch dimension numbers from the 'lhs' and 'rhs' must have the same
+dimension number, must be listed in the same order in both arrays, must
+have the same dimension sizes, and must be ordered before contracting and
+non-contracting/non-batch dimension numbers.
 
-A set of standard element-wise binary comparison operations is supported. Note
-that standard IEEE 754 floating-point comparison semantics apply when comparing
-floating-point types.
+Example with batch dimension numbers (batch size 2, 2x2 matrices):
 
-<b> `Op(lhs, rhs)` </b>
+```
+lhs = { { {1.0, 2.0},
+          {3.0, 4.0} },
+        { {5.0, 6.0},
+          {7.0, 8.0} } }
+
+rhs = { { {1.0, 0.0},
+          {0.0, 1.0} },
+        { {1.0, 0.0},
+          {0.0, 1.0} } }
+
+DotDimensionNumbers dnums;
+dnums.add_lhs_contracting_dimensions(2);
+dnums.add_rhs_contracting_dimensions(1);
+dnums.add_lhs_batch_dimensions(0);
+dnums.add_rhs_batch_dimensions(0);
+
+DotGeneral(lhs, rhs, dnums) -> { { {1.0, 2.0},
+                                   {3.0, 4.0} },
+                                 { {5.0, 6.0},
+                                   {7.0, 8.0} } }
+```
 
-Where `Op` is one of `Eq` (equal-to), `Ne` (not equal-to), `Ge`
-(greater-or-equal-than), `Gt` (greater-than), `Le` (less-or-equal-than), `Lt`
-(less-than).
+| Input                               | Output            | Semantics        |
+| ----------------------------------- | ----------------- | ---------------- |
+| [b0, m, k] `dot` [b0, k, n]         | [b0, m, n]        |  batch matmul    |
+| [b0, b1, m, k] `dot` [b0, b1, k, n] | [b0, b1, m, n]    |  batch matmul    |
 
-Arguments | Type                    | Semantics
---------- | ----------------------- | ----------------------------------------
-`lhs`     | `ComputationDataHandle` | left-hand-side operand: array of type T
-`rhs`     | `ComputationDataHandle` | right-hand-side operand: array of type T
+It follows that the resulting dimension number starts with the batch dimension,
+then the 'lhs' non-contracting/non-batch dimension, and finally the 'rhs'
+non-contracting/non-batch dimension.
 
-The arguments' shapes have to be either similar or compatible. See the
-@{$broadcasting$broadcasting} documentation about what it means for shapes to
-be compatible. The result of an operation has a shape which is the result of
-broadcasting the two input arrays with the element type `PRED`. In this variant,
-operations between arrays of different ranks are *not* supported, unless one of
-the operands is a scalar.
+## DynamicSlice
 
-An alternative variant with different-rank broadcasting support exists for these
-operations:
+See also
+[`ComputationBuilder::DynamicSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
 
-<b> `Op(lhs, rhs, broadcast_dimensions)` </b>
+DynamicSlice extracts a sub-array from the input array at dynamic
+`start_indices`. The size of the slice in each dimension is passed in
+`size_indices`, which specify the end point of exclusive slice intervals in each
+dimension: [start, start + size). The shape of `start_indices` must be rank ==
+1, with dimension size equal to the rank of `operand`.
+Note: handling of out-of-bounds slice indices (generated by incorrect runtime
+calculation of 'start_indices') is currently implementation-defined. Currently,
+slice indices are computed modulo input dimension sizes to prevent out-of-bound
+array accesses, but this behavior may change in future implementations.
 
-Where `Op` is the same as above. This variant of the operation should be used
-for comparison operations between arrays of different ranks (such as adding a
-matrix to a vector).
+<b> `DynamicSlice(operand, start_indices, size_indices)` </b>
 
-The additional `broadcast_dimensions` operand is a slice of integers specifying
-the dimensions to use for broadcasting the operands. The semantics are described
-in detail on the @{$broadcasting$broadcasting page}.
+| Arguments       | Type                    | Semantics                        |
+| --------------- | ----------------------- | -------------------------------- |
+| `operand`       | `ComputationDataHandle` | N dimensional array of type T    |
+| `start_indices` | `ComputationDataHandle` | Rank 1 array of N integers       |
+:                 :                         : containing the starting indices  :
+:                 :                         : of the slice for each dimension. :
+:                 :                         : Value must be greater than or    :
+:                 :                         : equal to zero.                   :
+| `size_indices`  | `ArraySlice<int64>`     | List of N integers containing    |
+:                 :                         : the slice size for each          :
+:                 :                         : dimension. Each value must be    :
+:                 :                         : strictly greater than zero, and  :
+:                 :                         : start + size must be less than   :
+:                 :                         : or equal to the size of the      :
+:                 :                         : dimension to avoid wrapping      :
+:                 :                         : modulo dimension size.           :
 
-## Element-wise unary functions
+1-dimensional example:
 
-ComputationBuilder supports these element-wise unary functions:
+```
+let a = {0.0, 1.0, 2.0, 3.0, 4.0}
+let s = {2}
 
-<b>`Abs(operand)`</b> Element-wise abs `x -> |x|`.
+DynamicSlice(a, s, {2}) produces:
+  {2.0, 3.0}
+```
 
-<b>`Ceil(operand)`</b> Element-wise ceil `x -> ⌈x⌉`.
+2-dimensional example:
 
-<b>`Cos(operand)`</b> Element-wise cosine `x -> cos(x)`.
+```
+let b =
+ { {0.0,  1.0,  2.0},
+   {3.0,  4.0,  5.0},
+   {6.0,  7.0,  8.0},
+   {9.0, 10.0, 11.0} }
+let s = {2, 1}
 
-<b>`Exp(operand)`</b> Element-wise natural exponential `x -> e^x`.
+DynamicSlice(b, s, {2, 2}) produces:
+  { { 7.0,  8.0},
+    {10.0, 11.0} }
+```
+## DynamicUpdateSlice
 
-<b>`Floor(operand)`</b> Element-wise floor `x -> ⌊x⌋`.
+See also
+[`ComputationBuilder::DynamicUpdateSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
 
-<b>`IsFinite(operand)`</b> Tests whether each element of `operand` is finite,
-i.e., is not positive or negative infinity, and is not `NaN`. Returns an array
-of `PRED` values with the same shape as the input, where each element is `true`
-if and only if the corresponding input element is finite.
+DynamicUpdateSlice generates a result which is the value of the input array
+`operand`, with a slice `update` overwritten at `start_indices`.
+The shape of `update` determines the shape of the sub-array of the result which
+is updated.
+The shape of `start_indices` must be rank == 1, with dimension size equal to
+the rank of `operand`.
+Note: handling of out-of-bounds slice indices (generated by incorrect runtime
+calculation of 'start_indices') is currently implementation-defined. Currently,
+slice indices are computed modulo update dimension sizes to prevent out-of-bound
+array accesses, but this behavior may change in future implementations.
 
-<b>`Log(operand)`</b> Element-wise natural logarithm `x -> ln(x)`.
+<b> `DynamicUpdateSlice(operand, update, start_indices)` </b>
 
-<b>`LogicalNot(operand)`</b> Element-wise logical not `x -> !(x)`.
+| Arguments       | Type                    | Semantics                        |
+| --------------- | ----------------------- | -------------------------------- |
+| `operand`       | `ComputationDataHandle` | N dimensional array of type T    |
+| `update`        | `ComputationDataHandle` | N dimensional array of type T    |
+:                 :                         : containing the slice update.     :
+:                 :                         : Each dimension of update shape    :
+:                 :                         : must be strictly greater than    :
+:                 :                         : zero, and start + update must be :
+:                 :                         : less than operand size for each  :
+:                 :                         : dimension to avoid generating    :
+:                 :                         : out-of-bounds update indices.    :
+| `start_indices` | `ComputationDataHandle` | Rank 1 array of N integers       |
+:                 :                         : containing the starting indices  :
+:                 :                         : of the slice for each dimension. :
+:                 :                         : Value must be greater than or    :
+:                 :                         : equal to zero.                   :
 
-<b>`Neg(operand)`</b> Element-wise negation `x -> -x`.
+1-dimensional example:
 
-<b>`Sign(operand)`</b> Element-wise sign operation `x -> sgn(x)` where
+```
+let a = {0.0, 1.0, 2.0, 3.0, 4.0}
+let u = {5.0, 6.0}
+let s = {2}
 
-$$\text{sgn}(x) = \begin{cases} -1 & x < 0\\ 0 & x = 0\\ 1 & x > 0 \end{cases}$$
+DynamicUpdateSlice(a, u, s) produces:
+  {0.0, 1.0, 5.0, 6.0, 4.0}
+```
 
-using the comparison operator of the element type of `operand`.
+2-dimensional example:
 
-<b>`Tanh(operand)`</b> Element-wise hyperbolic tangent `x -> tanh(x)`.
+```
+let b =
+ { {0.0,  1.0,  2.0},
+   {3.0,  4.0,  5.0},
+   {6.0,  7.0,  8.0},
+   {9.0, 10.0, 11.0} }
+let u =
+ { {12.0,  13.0},
+   {14.0,  15.0},
+   {16.0,  17.0} }
 
+let s = {1, 1}
 
-Arguments | Type                    | Semantics
---------- | ----------------------- | ---------------------------
-`operand` | `ComputationDataHandle` | The operand to the function
+DynamicUpdateSlice(b, u, s) produces:
+ { {0.0,  1.0,  2.0},
+   {3.0, 12.0, 13.0},
+   {6.0, 14.0, 15.0},
+   {9.0, 16.0, 17.0} }
+```
 
-The function is applied to each element in the `operand` array, resulting in an
-array with the same shape. It is allowed for `operand` to be a scalar (rank 0).
+## Element-wise binary arithmetic operations
 
+See also
+[`ComputationBuilder::Add`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
 
-## BatchNormTraining
+A set of element-wise binary arithmetic operations is supported.
 
-See also
-[`ComputationBuilder::BatchNormTraining`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h) and
-[`the original batch normalization paper`](https://arxiv.org/abs/1502.03167)
-for a detailed description of the algorithm.
+<b> `Op(lhs, rhs)` </b>
 
-<b> Warning: Not implemented on GPU backend yet. </b>
+Where `Op` is one of `Add` (addition), `Sub` (subtraction), `Mul`
+(multiplication), `Div` (division), `Rem` (remainder), `Max` (maximum), `Min`
+(minimum), `LogicalAnd` (logical AND), or `LogicalOr` (logical OR).
 
-Normalizes an array across batch and spatial dimensions.
+Arguments | Type                    | Semantics
+--------- | ----------------------- | ----------------------------------------
+`lhs`     | `ComputationDataHandle` | left-hand-side operand: array of type T
+`rhs`     | `ComputationDataHandle` | right-hand-side operand: array of type T
 
-<b> `BatchNormTraining(operand, scale, offset, epsilon, feature_index)` </b>
+The arguments' shapes have to be either similar or compatible. See the
+@{$broadcasting$broadcasting} documentation about what it means for shapes to
+be compatible. The result of an operation has a shape which is the result of
+broadcasting the two input arrays. In this variant, operations between arrays of
+different ranks are *not* supported, unless one of the operands is a scalar.
 
-| Arguments       | Type                    | Semantics                        |
-| --------------- | ----------------------- | -------------------------------- |
-| `operand`       | `ComputationDataHandle` | n dimensional array to be        |
-:                 :                         : normalized                       :
-| `scale`         | `ComputationDataHandle` | 1 dimensional array              |
-:                 :                         : (\\(\gamma\\))                   :
-| `offset`        | `ComputationDataHandle` | 1 dimensional array              |
-:                 :                         : (\\(\beta\\ )                    :
-| `epsilon`       | `float`                 | Epsilon value (\\(\epsilon\\))   |
-| `feature_index` | `int64`                 | Index to feature dimension       |
-:                 :                         : in `operand`                     :
+When `Op` is `Rem`, the sign of the result is taken from the dividend, and the
+absolute value of the result is always less than the divisor's absolute value.
 
+An alternative variant with different-rank broadcasting support exists for these
+operations:
 
-For each feature in the feature dimension (`feature_index` is the index for the
-feature dimension in `operand`), the operation calculates the mean and variance
-across all the other dimensions and use the mean and variance to normalize each
-element in `operand`. The `feature_index` must be a valid index for the feature
-dimension in `operand`.
+<b> `Op(lhs, rhs, broadcast_dimensions)` </b>
 
-The algorithm goes as follows for each batch in `operand` \\(x\\) that
-contains `m` elements with `w` and `h` as the size of spatial dimensions (
-assuming `operand` is an 4 dimensional array):
+Where `Op` is the same as above. This variant of the operation should be used
+for arithmetic operations between arrays of different ranks (such as adding a
+matrix to a vector).
 
-- Calculates batch mean \\(\mu_l\\) for each feature `l` in feature dimension:
-\\(\mu_l=\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h x_{ijkl}\\)
+The additional `broadcast_dimensions` operand is a slice of integers used to
+expand the rank of the lower-rank operand up to the rank of the higher-rank
+operand. `broadcast_dimensions` maps the dimensions of the lower-rank shape to
+the dimensions of the higher-rank shape. The unmapped dimensions of the expanded
+shape are filled with dimensions of size one. Degenerate-dimension broadcasting
+then broadcasts the shapes along these degenerate dimension to equalize the
+shapes of both operands. The semantics are described in detail on the
+@{$broadcasting$broadcasting page}.
 
-- Calculates batch variance \\(\sigma^2_l\\):
-\\(\sigma^2_l=\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h (x_{ijkl} - \mu_l)^2\\)
+## Element-wise comparison operations
 
-- Normalizes, scales and shifts:
-\\(y_{ijkl}=\frac{\gamma_l(x_{ijkl}-\mu_l)}{\sqrt[2]{\sigma^2_l+\epsilon}}+\beta_l\\)
+See also
+[`ComputationBuilder::Eq`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
 
-The epsilon value, usually a small number, is added to avoid divide-by-zero errors.
+A set of standard element-wise binary comparison operations is supported. Note
+that standard IEEE 754 floating-point comparison semantics apply when comparing
+floating-point types.
 
-The output type is a tuple of three ComputationDataHandles:
+<b> `Op(lhs, rhs)` </b>
 
-| Outputs      | Type                    | Semantics                            |
-| ------------ | ----------------------- | -------------------------------------|
-| `output`     | `ComputationDataHandle` | n dimensional array with the same    |
-:              :                         : shape as input `operand` (y)         :
-| `batch_mean` | `ComputationDataHandle` | 1 dimensional array (\\(\mu\\))      |
-| `batch_var`  | `ComputationDataHandle` | 1 dimensional array (\\(\sigma^2\\)) |
+Where `Op` is one of `Eq` (equal-to), `Ne` (not equal-to), `Ge`
+(greater-or-equal-than), `Gt` (greater-than), `Le` (less-or-equal-than), `Lt`
+(less-than).
 
-The `batch_mean` and `batch_var` are moments calculated across the batch and
-spatial dimensions using the formulas above.
+Arguments | Type                    | Semantics
+--------- | ----------------------- | ----------------------------------------
+`lhs`     | `ComputationDataHandle` | left-hand-side operand: array of type T
+`rhs`     | `ComputationDataHandle` | right-hand-side operand: array of type T
 
-## BatchNormInference
+The arguments' shapes have to be either similar or compatible. See the
+@{$broadcasting$broadcasting} documentation about what it means for shapes to
+be compatible. The result of an operation has a shape which is the result of
+broadcasting the two input arrays with the element type `PRED`. In this variant,
+operations between arrays of different ranks are *not* supported, unless one of
+the operands is a scalar.
 
-See also
-[`ComputationBuilder::BatchNormInference`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+An alternative variant with different-rank broadcasting support exists for these
+operations:
 
-<b> Warning: Not implemented yet. </b>
+<b> `Op(lhs, rhs, broadcast_dimensions)` </b>
 
-Normalizes an array across batch and spatial dimensions.
+Where `Op` is the same as above. This variant of the operation should be used
+for comparison operations between arrays of different ranks (such as adding a
+matrix to a vector).
 
-<b> `BatchNormInference(operand, scale, offset, mean, variance, epsilon, feature_index)` </b>
+The additional `broadcast_dimensions` operand is a slice of integers specifying
+the dimensions to use for broadcasting the operands. The semantics are described
+in detail on the @{$broadcasting$broadcasting page}.
 
-| Arguments       | Type                    | Semantics                       |
-| --------------  | ----------------------- | ------------------------------- |
-| `operand`       | `ComputationDataHandle` | n dimensional array to be       |
-:                 :                         : normalized                      :
-| `scale`         | `ComputationDataHandle` | 1 dimensional array             |
-| `offset`        | `ComputationDataHandle` | 1 dimensional array             |
-| `mean`          | `ComputationDataHandle` | 1 dimensional array             |
-| `variance`      | `ComputationDataHandle` | 1 dimensional array             |
-| `epsilon`       | `float`                 | Epsilon value                   |
-| `feature_index` | `int64`                 | Index to feature dimension in   |
-:                 :                         : `operand`                       :
+## Element-wise unary functions
 
-For each feature in the feature dimension (`feature_index` is the index for the
-feature dimension in `operand`), the operation calculates the mean and variance
-across all the other dimensions and use the mean and variance to normalize each
-element in `operand`. The `feature_index` must be a valid index for the feature
-dimension in `operand`.
+ComputationBuilder supports these element-wise unary functions:
 
-`BatchNormInference`  is equivalent to calling `BatchNormTraining` without
-computing `mean` and `variance` for each batch. It uses the input `mean` and
-`variance` instead as estimated values. The purpose of this op is to reduce
-latency in inference, hence the name `BatchNormInference`.
+<b>`Abs(operand)`</b> Element-wise abs `x -> |x|`.
 
-The output is a n dimensional, normalized array with the same shape as input
-`operand`.
+<b>`Ceil(operand)`</b> Element-wise ceil `x -> ⌈x⌉`.
 
-## BatchNormGrad
+<b>`Cos(operand)`</b> Element-wise cosine `x -> cos(x)`.
 
-See also
-[`ComputationBuilder::BatchNormGrad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+<b>`Exp(operand)`</b> Element-wise natural exponential `x -> e^x`.
 
-<b> Warning: Not implemented yet. </b>
+<b>`Floor(operand)`</b> Element-wise floor `x -> ⌊x⌋`.
 
-Calculates gradients of batch norm.
+<b>`IsFinite(operand)`</b> Tests whether each element of `operand` is finite,
+i.e., is not positive or negative infinity, and is not `NaN`. Returns an array
+of `PRED` values with the same shape as the input, where each element is `true`
+if and only if the corresponding input element is finite.
 
-<b> `BatchNormGrad(operand, scale, mean, variance, grad_output, epsilon, feature_index)` </b>
+<b>`Log(operand)`</b> Element-wise natural logarithm `x -> ln(x)`.
 
-| Arguments       | Type                    | Semantics                        |
-| --------------  | ----------------------- | -------------------------------- |
-| `operand`       | `ComputationDataHandle` | n dimensional array to be        |
-:                 :                         : normalized (x)                   :
-| `scale`         | `ComputationDataHandle` | 1 dimensional array              |
-:                 :                         : (\\(\gamma\\))                   :
-| `mean`          | `ComputationDataHandle` | 1 dimensional array (\\(\mu\\))  |
-| `variance`      | `ComputationDataHandle` | 1 dimensional array              |
-:                 :                         : (\\(\sigma^2\\))                 :
-| `grad_output`   | `ComputationDataHandle` | Gradients passed to              |
-:                 :                         : `BatchNormTraining`              :
-:                 :                         : (\\( \nabla y\\))                :
-| `epsilon`       | `float`                 | Epsilon value (\\(\epsilon\\))   |
-| `feature_index` | `int64`                 | Index to feature dimension in    |
-:                 :                         : `operand`                        :
+<b>`LogicalNot(operand)`</b> Element-wise logical not `x -> !(x)`.
 
-For each feature in the feature dimension (`feature_index` is the index for the
-feature dimension in `operand`), the operation calculates the gradients with
-respect to `operand`, `offset` and `scale` across all the other dimensions. The
-`feature_index` must be a valid index for the feature dimension in `operand`.
+<b>`Neg(operand)`</b> Element-wise negation `x -> -x`.
 
-The three gradients are defined by the following formulas:
+<b>`Sign(operand)`</b> Element-wise sign operation `x -> sgn(x)` where
 
-\\( \nabla x = \nabla y * \gamma * \sqrt{\sigma^2+\epsilon} \\)
+$$\text{sgn}(x) = \begin{cases} -1 & x < 0\\ 0 & x = 0\\ 1 & x > 0 \end{cases}$$
 
-\\( \nabla \gamma = sum(\nabla y * (x - \mu) * \sqrt{\sigma^2 + \epsilon}) \\)
+using the comparison operator of the element type of `operand`.
 
-\\( \nabla \beta = sum(\nabla y) \\)
+<b>`Tanh(operand)`</b> Element-wise hyperbolic tangent `x -> tanh(x)`.
 
-The inputs `mean` and `variance` represents moments value
-across batch and spatial dimensions.
 
-The output type is a tuple of three ComputationDataHandles:
+Arguments | Type                    | Semantics
+--------- | ----------------------- | ---------------------------
+`operand` | `ComputationDataHandle` | The operand to the function
 
-|Outputs       | Type                    | Semantics                           |
-|------------- | ----------------------- | ------------------------------------|
-|`grad_operand`| `ComputationDataHandle` | gradient with respect to input      |
-:              :                         : `operand`                           :
-|`grad_offset` | `ComputationDataHandle` | gradient with respect to input      |
-:              :                         : `offset`                            :
-|`grad_scale`  | `ComputationDataHandle` | gradient with respect to input      |
-:              :                         : `scale`                             :
+The function is applied to each element in the `operand` array, resulting in an
+array with the same shape. It is allowed for `operand` to be a scalar (rank 0).
 
 
 ## GetTupleElement
@@ -955,61 +1194,6 @@ transfer. The context is a tuple of {receive buffer (shape), request identifier
 Given a context created by a `Recv` instruction, waits for the data transfer to
 complete and returns the received data.
 
-## Send
-
-See also
-[`ComputationBuilder::Send`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
-
-<b> `Send(operand, channel_handle)` </b>
-
-| Arguments        | Type                    | Semantics                        |
-| ---------------- | ----------------------- | -------------------------------- |
-| `operand`        | `ComputationDataHandle` | data to send (array of type T)   |
-| `channel_handle` | `ChannelHandle`         | unique identifier for each send/recv pair |
-
-Sends the given operand data to a `Recv` instruction in another computation
-that shares the same channel handle. Does not return any data.
-
-Similar to the `Recv` operation, the client API of `Send` operation represents
-synchronous communication, and is internally decomposed into 2 HLO instructions
-(`Send` and `SendDone`) to enable asynchronous data transfers. See also
-[`HloInstruction::CreateSend` and `HloInstruction::CreateSendDone`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/hlo_instruction.h).
-
-<b>`Send(HloInstruction operand, int64 channel_id)`</b>
-
-Initiates an asynchronous transfer of the operand to the resources allocated by
-the `Recv` instruction with the same channel id. Returns a context, which is
-used by a following `SendDone` instruction to wait for the completion of the
-data transfer. The context is a tuple of {operand (shape), request identifier
-(U32)} and it can only be used by a `SendDone` instruction.
-
-<b> `SendDone(HloInstruction context)` </b>
-
-Given a context created by a `Send` instruction, waits for the data transfer to
-complete.  The instruction does not return any data.
-
-<b> Scheduling of channel instructions </b>
-
-The execution order of the 4 instructions for each channel (`Recv`, `RecvDone`,
-`Send`, `SendDone`) is as below.
-
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:70%" src="../../images/send_recv_order.png">
-</div>
-
-* `Recv` happens before `Send`
-* `Send` happens before `RecvDone`
-* `Recv` happens before `RecvDone`
-* `Send` happens before `SendDone`
-
-When the backend compilers generate a linear schedule for each computation that
-communicates via channel instructions, there must not be cycles across the
-computations. For example, below schedules lead to deadlocks.
-
-<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/send_recv_schedule.png">
-</div>
-
 ## Reduce
 
 See also
@@ -1163,7 +1347,6 @@ must have a non-negative number of mantissa bits.  The number of exponent or
 mantissa bits may exceed the corresponding value for type `T`; the corresponding
 portion of the conversion is then simply a no-op.
 
-
 ## ReduceWindow
 
 See also
@@ -1348,63 +1531,97 @@ the reversing dimensions, its index i is transformed into N - 1 - i).
 One use for the `Rev` operation is to reverse the convolution weight array along
 the two window dimensions during the gradient computation in neural networks.
 
-## RngBernoulli
+## RngNormal
 
 See also
-[`ComputationBuilder::RngBernoulli`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`ComputationBuilder::RngNormal`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
 
 Constructs an output of a given shape with random numbers generated following
-the Bernoulli distribution. The parameter needs to be a scalar valued F32
-operand while the output shape needs to have elemental type U32.
+the $$N(\mu, \sigma)$$ normal distribution. The parameters `mu` and `sigma`, and
+output shape have to have elemental type F32. The parameters furthermore have to
+be scalar valued.
 
-<b>`RngBernoulli(mean, shape)`</b>
+<b>`RngNormal(mean, sigma, shape)`</b>
 
-| Arguments | Type                    | Semantics                             |
-| --------- | ----------------------- | ------------------------------------- |
-| `mean`    | `ComputationDataHandle` | Scalar of type F32 specifying mean of |
-:           :                         : generated numbers                     :
-| `shape`   | `Shape`                 | Output shape of type U32              |
+| Arguments | Type                    | Semantics                              |
+| --------- | ----------------------- | -------------------------------------- |
+| `mu`      | `ComputationDataHandle` | Scalar of type F32 specifying mean of  |
+:           :                         : generated numbers                      :
+| `sigma`   | `ComputationDataHandle` | Scalar of type F32 specifying standard |
+:           :                         : deviation of generated numbers         :
+| `shape`   | `Shape`                 | Output shape of type F32               |
 
-## RngNormal
+## RngUniform
 
 See also
-[`ComputationBuilder::RngNormal`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`ComputationBuilder::RngUniform`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
 
 Constructs an output of a given shape with random numbers generated following
-the $$N(\mu, \sigma)$$ normal distribution. The parameters `mu` and `sigma`, and
-output shape have to have elemental type F32. The parameters furthermore have to
-be scalar valued.
+the uniform distribution over the interval $$[a,b)$$. The parameters and output
+shape may be either F32, S32 or U32, but the types have to be consistent.
+Furthermore, the parameters need to be scalar valued. If $$b <= a$$ the result
+is implementation-defined.
 
-<b>`RngNormal(mean, sigma, shape)`</b>
+<b>`RngUniform(a, b, shape)`</b>
+
+| Arguments | Type                    | Semantics                         |
+| --------- | ----------------------- | --------------------------------- |
+| `a`       | `ComputationDataHandle` | Scalar of type T specifying lower |
+:           :                         : limit of interval                 :
+| `b`       | `ComputationDataHandle` | Scalar of type T specifying upper |
+:           :                         : limit of interval                 :
+| `shape`   | `Shape`                 | Output shape of type T            |
+
+## Select
+
+See also
+[`ComputationBuilder::Select`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+
+Constructs an output array from elements of two input arrays, based on the
+values of a predicate array.
+
+<b> `Select(pred, on_true, on_false)` </b>
+
+Arguments  | Type                    | Semantics
+---------- | ----------------------- | ------------------
+`pred`     | `ComputationDataHandle` | array of type PRED
+`on_true`  | `ComputationDataHandle` | array of type T
+`on_false` | `ComputationDataHandle` | array of type T
+
+The arrays `on_true` and `on_false` must have the same shape. This is also the
+shape of the output array. The array `pred` must have the same dimensionality as
+`on_true` and `on_false`, with the `PRED` element type.
 
-| Arguments | Type                    | Semantics                              |
-| --------- | ----------------------- | -------------------------------------- |
-| `mu`      | `ComputationDataHandle` | Scalar of type F32 specifying mean of  |
-:           :                         : generated numbers                      :
-| `sigma`   | `ComputationDataHandle` | Scalar of type F32 specifying standard |
-:           :                         : deviation of generated numbers         :
-| `shape`   | `Shape`                 | Output shape of type F32               |
+For each element `P` of `pred`, the corresponding element of the output array is
+taken from `on_true` if the value of `P` is `true`, and from `on_false` if the
+value of `P` is `false`. As a restricted form of [broadcasting]
+(broadcasting.md), `pred` can be a scalar of type `PRED`. In this case, the
+output array is taken wholly from `on_true` if `pred` is `true`, and from
+`on_false` if `pred` is `false`.
 
-## RngUniform
+Example with non-scalar `pred`:
 
-See also
-[`ComputationBuilder::RngUniform`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+```
+let pred: PRED[4] = {true, false, false, true};
+let v1: s32[4] = {1, 2, 3, 4};
+let v2: s32[4] = {100, 200, 300, 400};
+==>
+Select(pred, v1, v2) = s32[4]{1, 200, 300, 4};
+```
 
-Constructs an output of a given shape with random numbers generated following
-the uniform distribution over the interval $$[a,b)$$. The parameters and output
-shape may be either F32, S32 or U32, but the types have to be consistent.
-Furthermore, the parameters need to be scalar valued. If $$b <= a$$ the result
-is implementation-defined.
+Example with scalar `pred`:
 
-<b>`RngUniform(a, b, shape)`</b>
+```
+let pred: PRED = true;
+let v1: s32[4] = {1, 2, 3, 4};
+let v2: s32[4] = {100, 200, 300, 400};
+==>
+Select(pred, v1, v2) = s32[4]{1, 2, 3, 4};
+```
 
-| Arguments | Type                    | Semantics                         |
-| --------- | ----------------------- | --------------------------------- |
-| `a`       | `ComputationDataHandle` | Scalar of type T specifying lower |
-:           :                         : limit of interval                 :
-| `b`       | `ComputationDataHandle` | Scalar of type T specifying upper |
-:           :                         : limit of interval                 :
-| `shape`   | `Shape`                 | Output shape of type T            |
+Selections between tuples are supported. Tuples are considered to be scalar
+types for this purpose. If `on_true` and `on_false` are tuples (which must have
+the same shape!) then `pred` has to be a scalar of type `PRED`.
 
 ## SelectAndScatter
 
@@ -1487,56 +1704,60 @@ non-deterministic. Therefore, the `scatter` function should not be overly
 sensitive to reassociation. See the discussion about associativity in the
 context of [`Reduce`](#reduce) for more details.
 
-## Select
+## Send
 
 See also
-[`ComputationBuilder::Select`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`ComputationBuilder::Send`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
 
-Constructs an output array from elements of two input arrays, based on the
-values of a predicate array.
+<b> `Send(operand, channel_handle)` </b>
 
-<b> `Select(pred, on_true, on_false)` </b>
+| Arguments        | Type                    | Semantics                        |
+| ---------------- | ----------------------- | -------------------------------- |
+| `operand`        | `ComputationDataHandle` | data to send (array of type T)   |
+| `channel_handle` | `ChannelHandle`         | unique identifier for each send/recv pair |
 
-Arguments  | Type                    | Semantics
----------- | ----------------------- | ------------------
-`pred`     | `ComputationDataHandle` | array of type PRED
-`on_true`  | `ComputationDataHandle` | array of type T
-`on_false` | `ComputationDataHandle` | array of type T
+Sends the given operand data to a `Recv` instruction in another computation
+that shares the same channel handle. Does not return any data.
 
-The arrays `on_true` and `on_false` must have the same shape. This is also the
-shape of the output array. The array `pred` must have the same dimensionality as
-`on_true` and `on_false`, with the `PRED` element type.
+Similar to the `Recv` operation, the client API of `Send` operation represents
+synchronous communication, and is internally decomposed into 2 HLO instructions
+(`Send` and `SendDone`) to enable asynchronous data transfers. See also
+[`HloInstruction::CreateSend` and `HloInstruction::CreateSendDone`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/hlo_instruction.h).
 
-For each element `P` of `pred`, the corresponding element of the output array is
-taken from `on_true` if the value of `P` is `true`, and from `on_false` if the
-value of `P` is `false`. As a restricted form of [broadcasting]
-(broadcasting.md), `pred` can be a scalar of type `PRED`. In this case, the
-output array is taken wholly from `on_true` if `pred` is `true`, and from
-`on_false` if `pred` is `false`.
+<b>`Send(HloInstruction operand, int64 channel_id)`</b>
 
-Example with non-scalar `pred`:
+Initiates an asynchronous transfer of the operand to the resources allocated by
+the `Recv` instruction with the same channel id. Returns a context, which is
+used by a following `SendDone` instruction to wait for the completion of the
+data transfer. The context is a tuple of {operand (shape), request identifier
+(U32)} and it can only be used by a `SendDone` instruction.
 
-```
-let pred: PRED[4] = {true, false, false, true};
-let v1: s32[4] = {1, 2, 3, 4};
-let v2: s32[4] = {100, 200, 300, 400};
-==>
-Select(pred, v1, v2) = s32[4]{1, 200, 300, 4};
-```
+<b> `SendDone(HloInstruction context)` </b>
 
-Example with scalar `pred`:
+Given a context created by a `Send` instruction, waits for the data transfer to
+complete.  The instruction does not return any data.
 
-```
-let pred: PRED = true;
-let v1: s32[4] = {1, 2, 3, 4};
-let v2: s32[4] = {100, 200, 300, 400};
-==>
-Select(pred, v1, v2) = s32[4]{1, 2, 3, 4};
-```
+<b> Scheduling of channel instructions </b>
 
-Selections between tuples are supported. Tuples are considered to be scalar
-types for this purpose. If `on_true` and `on_false` are tuples (which must have
-the same shape!) then `pred` has to be a scalar of type `PRED`.
+The execution order of the 4 instructions for each channel (`Recv`, `RecvDone`,
+`Send`, `SendDone`) is as below.
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:70%" src="../../images/send_recv_order.png">
+</div>
+
+* `Recv` happens before `Send`
+* `Send` happens before `RecvDone`
+* `Recv` happens before `RecvDone`
+* `Send` happens before `SendDone`
+
+When the backend compilers generate a linear schedule for each computation that
+communicates via channel instructions, there must not be cycles across the
+computations. For example, below schedules lead to deadlocks.
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src="../../images/send_recv_schedule.png">
+</div>
 
 ## Slice
 
@@ -1590,132 +1811,6 @@ Slice(b, {2, 1}, {4, 3}) produces:
     {10.0, 11.0} }
 ```
 
-## DynamicSlice
-
-See also
-[`ComputationBuilder::DynamicSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
-
-DynamicSlice extracts a sub-array from the input array at dynamic
-`start_indices`. The size of the slice in each dimension is passed in
-`size_indices`, which specify the end point of exclusive slice intervals in each
-dimension: [start, start + size). The shape of `start_indices` must be rank ==
-1, with dimension size equal to the rank of `operand`.
-Note: handling of out-of-bounds slice indices (generated by incorrect runtime
-calculation of 'start_indices') is currently implementation-defined. Currently,
-slice indices are computed modulo input dimension sizes to prevent out-of-bound
-array accesses, but this behavior may change in future implementations.
-
-<b> `DynamicSlice(operand, start_indices, size_indices)` </b>
-
-| Arguments       | Type                    | Semantics                        |
-| --------------- | ----------------------- | -------------------------------- |
-| `operand`       | `ComputationDataHandle` | N dimensional array of type T    |
-| `start_indices` | `ComputationDataHandle` | Rank 1 array of N integers       |
-:                 :                         : containing the starting indices  :
-:                 :                         : of the slice for each dimension. :
-:                 :                         : Value must be greater than or    :
-:                 :                         : equal to zero.                   :
-| `size_indices`  | `ArraySlice<int64>`     | List of N integers containing    |
-:                 :                         : the slice size for each          :
-:                 :                         : dimension. Each value must be    :
-:                 :                         : strictly greater than zero, and  :
-:                 :                         : start + size must be less than   :
-:                 :                         : or equal to the size of the      :
-:                 :                         : dimension to avoid wrapping      :
-:                 :                         : modulo dimension size.           :
-
-1-dimensional example:
-
-```
-let a = {0.0, 1.0, 2.0, 3.0, 4.0}
-let s = {2}
-
-DynamicSlice(a, s, {2}) produces:
-  {2.0, 3.0}
-```
-
-2-dimensional example:
-
-```
-let b =
- { {0.0,  1.0,  2.0},
-   {3.0,  4.0,  5.0},
-   {6.0,  7.0,  8.0},
-   {9.0, 10.0, 11.0} }
-let s = {2, 1}
-
-DynamicSlice(b, s, {2, 2}) produces:
-  { { 7.0,  8.0},
-    {10.0, 11.0} }
-```
-## DynamicUpdateSlice
-
-See also
-[`ComputationBuilder::DynamicUpdateSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
-
-DynamicUpdateSlice generates a result which is the value of the input array
-`operand`, with a slice `update` overwritten at `start_indices`.
-The shape of `update` determines the shape of the sub-array of the result which
-is updated.
-The shape of `start_indices` must be rank == 1, with dimension size equal to
-the rank of `operand`.
-Note: handling of out-of-bounds slice indices (generated by incorrect runtime
-calculation of 'start_indices') is currently implementation-defined. Currently,
-slice indices are computed modulo update dimension sizes to prevent out-of-bound
-array accesses, but this behavior may change in future implementations.
-
-<b> `DynamicUpdateSlice(operand, update, start_indices)` </b>
-
-| Arguments       | Type                    | Semantics                        |
-| --------------- | ----------------------- | -------------------------------- |
-| `operand`       | `ComputationDataHandle` | N dimensional array of type T    |
-| `update`        | `ComputationDataHandle` | N dimensional array of type T    |
-:                 :                         : containing the slice update.     :
-:                 :                         : Each dimension of update shape    :
-:                 :                         : must be strictly greater than    :
-:                 :                         : zero, and start + update must be :
-:                 :                         : less than operand size for each  :
-:                 :                         : dimension to avoid generating    :
-:                 :                         : out-of-bounds update indices.    :
-| `start_indices` | `ComputationDataHandle` | Rank 1 array of N integers       |
-:                 :                         : containing the starting indices  :
-:                 :                         : of the slice for each dimension. :
-:                 :                         : Value must be greater than or    :
-:                 :                         : equal to zero.                   :
-
-1-dimensional example:
-
-```
-let a = {0.0, 1.0, 2.0, 3.0, 4.0}
-let u = {5.0, 6.0}
-let s = {2}
-
-DynamicUpdateSlice(a, u, s) produces:
-  {0.0, 1.0, 5.0, 6.0, 4.0}
-```
-
-2-dimensional example:
-
-```
-let b =
- { {0.0,  1.0,  2.0},
-   {3.0,  4.0,  5.0},
-   {6.0,  7.0,  8.0},
-   {9.0, 10.0, 11.0} }
-let u =
- { {12.0,  13.0},
-   {14.0,  15.0},
-   {16.0,  17.0} }
-
-let s = {1, 1}
-
-DynamicUpdateSlice(b, u, s) produces:
- { {0.0,  1.0,  2.0},
-   {3.0, 12.0, 13.0},
-   {6.0, 14.0, 15.0},
-   {9.0, 16.0, 17.0} }
-```
-
 ## Sort
 
 See also
diff --git a/tensorflow/docs_src/programmers_guide/datasets.md b/tensorflow/docs_src/programmers_guide/datasets.md
index 9ced56f0f5b7de7c60dd1393fce95667b0c5303d..d19200e80cdfe6620789ddd273647660c10b2a60 100644
--- a/tensorflow/docs_src/programmers_guide/datasets.md
+++ b/tensorflow/docs_src/programmers_guide/datasets.md
@@ -1,16 +1,16 @@
 # Importing Data
 
-The @{tf.data.Dataset$`Dataset`} API enables you to build complex input pipelines from
+The @{tf.data} API enables you to build complex input pipelines from
 simple, reusable pieces. For example, the pipeline for an image model might
 aggregate data from files in a distributed file system, apply random
 perturbations to each image, and merge randomly selected images into a batch
 for training. The pipeline for a text model might involve extracting symbols
 from raw text data, converting them to embedding identifiers with a lookup
-table, and batching together sequences of different lengths. The `Dataset` API
+table, and batching together sequences of different lengths. The `tf.data` API
 makes it easy to deal with large amounts of data, different data formats, and
 complicated transformations.
 
-The `Dataset` API introduces two new abstractions to TensorFlow:
+The `tf.data` API introduces two new abstractions to TensorFlow:
 
 * A `tf.data.Dataset` represents a sequence of elements, in which
   each element contains one or more `Tensor` objects. For example, in an image
@@ -121,7 +121,7 @@ dataset3 = dataset3.filter(lambda x, (y, z): ...)
 ### Creating an iterator
 
 Once you have built a `Dataset` to represent your input data, the next step is to
-create an `Iterator` to access elements from that dataset.  The `Dataset` API
+create an `Iterator` to access elements from that dataset.  The `tf.data` API
 currently supports the following iterators, in increasing level of
 sophistication:
 
@@ -322,9 +322,10 @@ sess.run(iterator.initializer)
 next1, (next2, next3) = iterator.get_next()
 ```
 
-Note that evaluating *any* of `next1`, `next2`, or `next3` will advance the
-iterator for all components. A typical consumer of an iterator will include all
-components in a single expression.
+Note that `next1`, `next2`, and `next3` are tensors produced by the
+same op/node (created by `Iterator.get_next()`). Therefore,  evaluating *any* of
+these tensors will advance the iterator for all components. A typical consumer
+of an iterator will include all components in a single expression.
 
 ## Reading input data
 
@@ -379,7 +380,7 @@ sess.run(iterator.initializer, feed_dict={features_placeholder: features,
 
 ### Consuming TFRecord data
 
-The `Dataset` API supports a variety of file formats so that you can process
+The `tf.data` API supports a variety of file formats so that you can process
 large datasets that do not fit in memory. For example, the TFRecord file format
 is a simple record-oriented binary format that many TensorFlow applications use
 for training data. The `tf.data.TFRecordDataset` class enables you to
@@ -455,9 +456,6 @@ dataset = dataset.flat_map(
         .filter(lambda line: tf.not_equal(tf.substr(line, 0, 1), "#"))))
 ```
 
-For a full example of parsing a CSV file using datasets, see [`imports85.py`](https://www.tensorflow.org/code/tensorflow/examples/get_started/regression/imports85.py)
-in @{$get_started/linear_regression}.
-
 <!--
 TODO(mrry): Add these sections.
 
@@ -540,7 +538,7 @@ import cv2
 # Use a custom OpenCV function to read the image, instead of the standard
 # TensorFlow `tf.read_file()` operation.
 def _read_py_function(filename, label):
-  image_decoded = cv2.imread(image_string, cv2.IMREAD_GRAYSCALE)
+  image_decoded = cv2.imread(filename.decode(), cv2.IMREAD_GRAYSCALE)
   return image_decoded, label
 
 # Use standard TensorFlow operations to resize the image to a fixed shape.
@@ -628,7 +626,7 @@ TODO(mrry): Add this section.
 
 ### Processing multiple epochs
 
-The `Dataset` API offers two main ways to process multiple epochs of the same
+The `tf.data` API offers two main ways to process multiple epochs of the same
 data.
 
 The simplest way to iterate over a dataset in multiple epochs is to use the
@@ -693,7 +691,7 @@ dataset = dataset.repeat()
 The @{tf.train.MonitoredTrainingSession} API simplifies many aspects of running
 TensorFlow in a distributed setting. `MonitoredTrainingSession` uses the
 @{tf.errors.OutOfRangeError} to signal that training has completed, so to use it
-with the `Dataset` API, we recommend using
+with the `tf.data` API, we recommend using
 `Dataset.make_one_shot_iterator()`. For example:
 
 ```python
@@ -735,7 +733,7 @@ def dataset_input_fn():
     parsed = tf.parse_single_example(record, keys_to_features)
 
     # Perform additional preprocessing on the parsed data.
-    image = tf.decode_jpeg(parsed["image_data"])
+    image = tf.image.decode_jpeg(parsed["image_data"])
     image = tf.reshape(image, [299, 299, 1])
     label = tf.cast(parsed["label"], tf.int32)
 
diff --git a/tensorflow/docs_src/programmers_guide/debugger.md b/tensorflow/docs_src/programmers_guide/debugger.md
index 25cb72008d5a5418f46aa543871e97cee996ecb5..c8fdae6f60c33776b6d9a8c1a33666ce4ddb1cb2 100644
--- a/tensorflow/docs_src/programmers_guide/debugger.md
+++ b/tensorflow/docs_src/programmers_guide/debugger.md
@@ -1,4 +1,4 @@
-# Debugging TensorFlow Programs
+# TensorFlow Debugger
 
 <!-- [comment]: TODO(barryr): Links to and from sections on "Graphs" & "Monitoring Learning". -->
 
@@ -159,6 +159,7 @@ Try the following commands at the `tfdbg>` prompt (referencing the code at
 | | `-r <range>` | Highlight elements falling into specified numerical range. Multiple ranges can be used in conjunction. | `pt hidden/Relu:0 -a -r [[-inf,-1],[1,inf]]` |
 | | `-n <number>` | Print dump corresponding to specified 0-based dump number. Required for tensors with multiple dumps. | `pt -n 0 hidden/Relu:0` |
 | | `-s` | Include a summary of the numeric values of the tensor (applicable only to non-empty tensors with Boolean and numeric types such as `int*` and `float*`.) | `pt -s hidden/Relu:0[0:50,:]` |
+| | `-w` | Write the value of the tensor (possibly sliced) to a Numpy file using [`numpy.save()`](https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.save.html) | `pt -s hidden/Relu:0 -w /tmp/relu.npy` |
 | **`@[coordinates]`** | | Navigate to specified element in `pt` output. | `@[10,0]` or `@10,0` |
 | **`/regex`** | |  [less](https://linux.die.net/man/1/less)-style search for given regular expression. | `/inf` |
 | **`/`** | | Scroll to the next line with matches to the searched regex (if any). | `/` |
@@ -167,6 +168,7 @@ Try the following commands at the `tfdbg>` prompt (referencing the code at
 | **eval** | | **Evaluate arbitrary Python and numpy expression.** | |
 | | `eval <expression>` | Evaluate a Python / numpy expression, with numpy available as `np` and debug tensor names enclosed in backticks. | ``eval "np.matmul((`output/Identity:0` / `Softmax:0`).T, `Softmax:0`)"`` |
 | | `-a` | Print a large-sized evaluation result in its entirety, i.e., without using ellipses. | ``eval -a 'np.sum(`Softmax:0`, axis=1)'`` |
+| | `-w` | Write the result of the evaluation to a Numpy file using [`numpy.save()`](https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.save.html) | ``eval -a 'np.sum(`Softmax:0`, axis=1)' -w /tmp/softmax_sum.npy`` |
 | **`ni`** | | **Display node information.** | |
 | | `-a` | Include node attributes in the output. | `ni -a hidden/Relu` |
 | | `-d` | List the debug dumps available from the node. | `ni -d hidden/Relu` |
@@ -212,7 +214,7 @@ navigate between these screens by clicking the `<--` and
 ### Other Features of the tfdbg CLI
 
 In addition to the commands listed above, the tfdbg CLI provides the following
-addditional features:
+additional features:
 
 *   To navigate through previous tfdbg commands, type in a few characters
     followed by the Up or Down arrow keys. tfdbg will show you the history of
@@ -338,11 +340,11 @@ tfdbg> ni cross_entropy/Log
 ![tfdbg run-end UI: infs and nans](https://www.tensorflow.org/images/tfdbg_screenshot_run_end_node_info.png)
 
 You can see that this node has the op type `Log`
-and that its input is the node `softmax/Softmax`. Run the following command to
+and that its input is the node `Softmax`. Run the following command to
 take a closer look at the input tensor:
 
 ```none
-tfdbg> pt softmax/Softmax:0
+tfdbg> pt Softmax:0
 ```
 
 Examine the values in the input tensor, searching for zeros:
@@ -392,7 +394,7 @@ diff = -(y_ * tf.log(y))
 to the built-in, numerically-stable implementation of softmax cross-entropy:
 
 ```python
-diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits)
+diff = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=logits)
 ```
 
 Rerun with the `--debug` flag as follows:
diff --git a/tensorflow/docs_src/programmers_guide/embedding.md b/tensorflow/docs_src/programmers_guide/embedding.md
index 4095c6c97a4703bdf16e8feceaacdefaa50488b3..e8027fc12b368ddfbc51cc47441478901d7caec7 100644
--- a/tensorflow/docs_src/programmers_guide/embedding.md
+++ b/tensorflow/docs_src/programmers_guide/embedding.md
@@ -2,9 +2,10 @@
 
 This document introduces the concept of embeddings, gives a simple example of
 how to train an embedding in TensorFlow, and explains how to view embeddings
-with the TensorBoard Embedding Projector. The first two parts target newcomers
-to machine learning or TensorFlow, and the Embedding Projector how-to is for
-users at all levels.
+with the TensorBoard Embedding Projector
+([live example](http://projector.tensorflow.org)). The first two parts target
+newcomers to machine learning or TensorFlow, and the Embedding Projector how-to
+is for users at all levels.
 
 [TOC]
 
@@ -119,7 +120,7 @@ data set.
   text patterns.
 
 Further useful articles are
-[How to Use t-SNE Effectively](distill.pub/2016/misread-tsne/) and
+[How to Use t-SNE Effectively](https://distill.pub/2016/misread-tsne/) and
 [Principal Component Analysis Explained Visually](http://setosa.io/ev/principal-component-analysis/).
 
 ### Exploration
diff --git a/tensorflow/docs_src/programmers_guide/estimators.md b/tensorflow/docs_src/programmers_guide/estimators.md
index 6544a16f2bcb1ebbbe33489bd1a0974aa30f6a17..ffadf29ad7710de860a56253f279204d17cc318a 100644
--- a/tensorflow/docs_src/programmers_guide/estimators.md
+++ b/tensorflow/docs_src/programmers_guide/estimators.md
@@ -134,7 +134,7 @@ The heart of every Estimator--whether pre-made or custom--is its
 evaluation, and prediction. When you are using a pre-made Estimator,
 someone else has already implemented the model function. When relying
 on a custom Estimator, you must write the model function yourself. A
-@{$extend/estimators$companion document}
+@{$get_started/custom_estimators$companion document}
 explains how to write the model function.
 
 
@@ -186,9 +186,9 @@ est_inception_v3.train(input_fn=train_input_fn, steps=2000)
 ```
 Note that the names of feature columns and labels of a keras estimator come from
 the corresponding compiled keras model. For example, the input key names for
-@{$get_started/input_fn} in above `est_inception_v3` estimator can be obtained
-from `keras_inception_v3.input_names`, and similarily, the predicted output
-names can be obtained from `keras_inception_v3.output_names`.
+`train_input_fn` above can be obtained from `keras_inception_v3.input_names`,
+and similarly, the predicted output names can be obtained from
+`keras_inception_v3.output_names`.
 
 For more details, please refer to the documentation for
 @{tf.keras.estimator.model_to_estimator}.
diff --git a/tensorflow/docs_src/programmers_guide/faq.md b/tensorflow/docs_src/programmers_guide/faq.md
index 67ed0a9a607677242838199d346393439b48545d..70931f2862de98cb1e934f85919d558a3b36304a 100644
--- a/tensorflow/docs_src/programmers_guide/faq.md
+++ b/tensorflow/docs_src/programmers_guide/faq.md
@@ -68,14 +68,6 @@ dictionary that maps @{tf.Tensor} objects to
 numpy arrays (and some other types), which will be used as the values of those
 tensors in the execution of a step.
 
-Often, you have certain tensors, such as inputs, that will always be fed. The
-@{tf.placeholder} op allows you
-to define tensors that *must* be fed, and optionally allows you to constrain
-their shape as well. See the
-@{$beginners$beginners' MNIST tutorial} for an
-example of how placeholders and feeding can be used to provide the training data
-for a neural network.
-
 #### What is the difference between `Session.run()` and `Tensor.eval()`?
 
 If `t` is a @{tf.Tensor} object,
@@ -300,7 +292,7 @@ functions, methods, and properties. We also adhere to the
 [Google Python style guide](https://google.github.io/styleguide/pyguide.html).
 
 The TensorFlow C++ code base adheres to the
-[Google C++ style guide](http://google.github.io/styleguide/cppguide.html).
+[Google C++ style guide](https://google.github.io/styleguide/cppguide.html).
 
 (<sup>*</sup> With one exception: we use 2-space indentation instead of 4-space
 indentation.)
diff --git a/tensorflow/docs_src/get_started/graph_viz.md b/tensorflow/docs_src/programmers_guide/graph_viz.md
similarity index 98%
rename from tensorflow/docs_src/get_started/graph_viz.md
rename to tensorflow/docs_src/programmers_guide/graph_viz.md
index 06ec427b757d6a34270b646341786bc8925473d5..f581ae56dae45238d697196e8ad56c86f7309604 100644
--- a/tensorflow/docs_src/get_started/graph_viz.md
+++ b/tensorflow/docs_src/programmers_guide/graph_viz.md
@@ -248,8 +248,9 @@ The images below show the CIFAR-10 model with tensor shape information:
 Often it is useful to collect runtime metadata for a run, such as total memory
 usage, total compute time, and tensor shapes for nodes. The code example below
 is a snippet from the train and test section of a modification of the
-@{$beginners$simple MNIST tutorial},
-in which we have recorded summaries and runtime statistics. See the @{$summaries_and_tensorboard#serializing-the-data$Summaries Tutorial}
+@{$layers$simple MNIST tutorial}, in which we have recorded summaries and
+runtime statistics. See the
+@{$summaries_and_tensorboard#serializing-the-data$Summaries Tutorial}
 for details on how to record summaries.
 Full source is [here](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py).
 
diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md
index 984058297f9ae1ad25ea4c0ef036f0477a6ac024..9049a5a9f3d44e255188c6c41cdb12a619464379 100644
--- a/tensorflow/docs_src/programmers_guide/graphs.md
+++ b/tensorflow/docs_src/programmers_guide/graphs.md
@@ -125,14 +125,14 @@ an operation:
   @{tf.Tensor} accepts an optional `name` argument. For example,
   `tf.constant(42.0, name="answer")` creates a new @{tf.Operation} named
   `"answer"` and returns a @{tf.Tensor} named `"answer:0"`. If the default graph
-  already contained an operation named `"answer"`, the TensorFlow would append
+  already contains an operation named `"answer"`, then TensorFlow would append
   `"_1"`, `"_2"`, and so on to the name, in order to make it unique.
 
 * The @{tf.name_scope} function makes it possible to add a **name scope** prefix
   to all operations created in a particular context. The current name scope
   prefix is a `"/"`-delimited list of the names of all active @{tf.name_scope}
   context managers. If a name scope has already been used in the current
-  context, TensorFlow appens `"_1"`, `"_2"`, and so on. For example:
+  context, TensorFlow appends `"_1"`, `"_2"`, and so on. For example:
 
   ```python
   c_0 = tf.constant(0, name="c")  # => operation named "c"
@@ -487,7 +487,7 @@ subgraph inside.
 ![](../images/mnist_deep.png)
 
 For more information about visualizing your TensorFlow application with
-TensorBoard, see the [TensorBoard tutorial](TODO).
+TensorBoard, see the [TensorBoard tutorial](../get_started/summaries_and_tensorboard.md).
 
 ## Programming with multiple graphs
 
diff --git a/tensorflow/docs_src/programmers_guide/index.md b/tensorflow/docs_src/programmers_guide/index.md
index 2e2cf7c0818bb4854675726b1660f31fb73cb3d4..7a5e90081d9145ca934929f0af11f2a40cb2dcae 100644
--- a/tensorflow/docs_src/programmers_guide/index.md
+++ b/tensorflow/docs_src/programmers_guide/index.md
@@ -1,16 +1,24 @@
 # Programmer's Guide
 
-The documents in this unit dive into the details of writing TensorFlow
-code.  For TensorFlow 1.3, we revised this document extensively.
-The units are now as follows:
+The documents in this unit dive into the details of how TensorFlow
+works. The units are as follows:
 
-  * @{$programmers_guide/estimators$Estimators}, which introduces a high-level
+## High Level APIs
+
+  * @{$programmers_guide/estimators}, which introduces a high-level
     TensorFlow API that greatly simplifies ML programming.
-  * @{$programmers_guide/tensors$Tensors}, which explains how to create,
+  * @{$programmers_guide/datasets}, which explains how to
+    set up data pipelines to read data sets into your TensorFlow program.
+
+## Low Level APIs
+
+  * @{$programmers_guide/low_level_intro}, which introduces the
+    basics of how you can use TensorFlow outside of the high Level APIs.
+  * @{$programmers_guide/tensors}, which explains how to create,
     manipulate, and access Tensors--the fundamental object in TensorFlow.
-  * @{$programmers_guide/variables$Variables}, which details how
+  * @{$programmers_guide/variables}, which details how
     to represent shared, persistent state in your program.
-  * @{$programmers_guide/graphs$Graphs and Sessions}, which explains:
+  * @{$programmers_guide/graphs}, which explains:
       * dataflow graphs, which are TensorFlow's representation of computations
         as dependencies between operations.
       * sessions, which are TensorFlow's mechanism for running dataflow graphs
@@ -20,18 +28,40 @@ The units are now as follows:
     such as Estimators or Keras, the high-level API creates and manages
     graphs and sessions for you, but understanding graphs and sessions
     can still be helpful.
-  * @{$programmers_guide/saved_model$Saving and Restoring}, which
+  * @{$programmers_guide/saved_model}, which
     explains how to save and restore variables and models.
-  * @{$programmers_guide/datasets$Input Pipelines}, which explains how to
-    set up data pipelines to read data sets into your TensorFlow program.
-  * @{$programmers_guide/embedding$Embeddings}, which introduces the concept
+  * @{$using_gpu} explains how TensorFlow assigns operations to
+    devices and how you can change the arrangement manually.
+
+
+## ML Concepts
+
+  * @{$programmers_guide/embedding}, which introduces the concept
     of embeddings, provides a simple example of training an embedding in
     TensorFlow, and explains how to view embeddings with the TensorBoard
     Embedding Projector.
-  * @{$programmers_guide/debugger$Debugging TensorFlow Programs}, which
+
+## Debugging
+
+  * @{$programmers_guide/debugger}, which
     explains how to use the TensorFlow debugger (tfdbg).
-  * @{$programmers_guide/version_compat$TensorFlow Version Compatibility},
+
+## TensorBoard
+
+TensorBoard is a utility to visualize different aspects of machine learning.
+The following guides explain how to use TensorBoard:
+
+  * @{$programmers_guide/summaries_and_tensorboard},
+    which introduces TensorBoard.
+  * @{$programmers_guide/graph_viz}, which
+    explains how to visualize the computational graph.
+  * @{$programmers_guide/tensorboard_histograms} which demonstrates the how to
+    use TensorBoard's histogram dashboard.
+
+
+## Misc
+
+  * @{$programmers_guide/version_compat},
     which explains backward compatibility guarantees and non-guarantees.
-  * @{$programmers_guide/faq$FAQ}, which contains frequently asked
-    questions about TensorFlow. (We have not revised this document for v1.3,
-    except to remove some obsolete information.)
+  * @{$programmers_guide/faq}, which contains frequently asked
+    questions about TensorFlow.
diff --git a/tensorflow/docs_src/programmers_guide/leftnav_files b/tensorflow/docs_src/programmers_guide/leftnav_files
index 5adc7fad6ce2200d52e79d35234a209d87ad3d58..3fe4cb2ddaee40d9d6c6470bee171dedb27ad890 100644
--- a/tensorflow/docs_src/programmers_guide/leftnav_files
+++ b/tensorflow/docs_src/programmers_guide/leftnav_files
@@ -1,12 +1,31 @@
 index.md
+
+### High Level APIs
 estimators.md
+datasets.md
+
+### Low Level APIs
+low_level_intro.md
 tensors.md
 variables.md
 graphs.md
 saved_model.md
-datasets.md
+
+### Accelerators
+using_gpu.md
+using_tpu.md
+
+### ML Concepts
 embedding.md
+
+### Debugging
 debugger.md
-supervisor.md
+
+### TensorBoard
+summaries_and_tensorboard.md: Visualizing Learning
+graph_viz.md: Graphs
+tensorboard_histograms.md: Histograms
+
+### Misc
 version_compat.md
 faq.md
diff --git a/tensorflow/docs_src/programmers_guide/low_level_intro.md b/tensorflow/docs_src/programmers_guide/low_level_intro.md
new file mode 100644
index 0000000000000000000000000000000000000000..8f6d3fbd46d8b76d6033d95fd51c1df45733f5a3
--- /dev/null
+++ b/tensorflow/docs_src/programmers_guide/low_level_intro.md
@@ -0,0 +1,587 @@
+# Introduction
+
+This guide gets you started programming in the low-level TensorFlow APIs
+(TensorFlow Core), showing you how to:
+
+  * Manage your own TensorFlow program (a `tf.Graph`) and TensorFlow
+    runtime (a `tf.Session`), instead of relying on Estimators to manage them.
+  * Run TensorFlow operations, using a `tf.Session`.
+  * Use high level components ([datasets](#datasets), [layers](#layers), and
+    [feature_columns](#feature_columns)) in this low level environment.
+  * Build your own training loop, instead of using the one
+    @{$get_started/premade_estimators$provided by Estimators}.
+
+We recommend using the higher level APIs to build models when possible.
+Knowing TensorFlow Core is valuable for the following reasons:
+
+  * Experimentation and debugging are both more straight forward
+    when you can use low level TensorFlow operations directly.
+  * It gives you a mental model of how things work internally when
+    using the higher level APIs.
+
+## Setup
+
+Before using this guide, @{$install$install TensorFlow}.
+
+To get the most out of this guide, you should know the following:
+
+*   How to program in Python.
+*   At least a little bit about arrays.
+*   Ideally, something about machine learning.
+
+Feel free to launch `python` and follow along with this walkthrough.
+Run the following lines to set up your Python environment:
+
+```python
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import tensorflow as tf
+```
+
+## Tensor Values
+
+The central unit of data in TensorFlow is the **tensor**. A tensor consists of a
+set of primitive values shaped into an array of any number of dimensions. A
+tensor's **rank** is its number of dimensions, while its **shape** is a tuple
+of integers specifying the array's length along each dimension. Here are some
+examples of tensor values:
+
+```python
+3. # a rank 0 tensor; a scalar with shape [],
+[1., 2., 3.] # a rank 1 tensor; a vector with shape [3]
+[[1., 2., 3.], [4., 5., 6.]] # a rank 2 tensor; a matrix with shape [2, 3]
+[[[1., 2., 3.]], [[7., 8., 9.]]] # a rank 3 tensor with shape [2, 1, 3]
+```
+
+TensorFlow uses numpy arrays to represent tensor **values**.
+
+## TensorFlow Core Walkthrough
+
+You might think of TensorFlow Core programs as consisting of two discrete
+sections:
+
+1.  Building the computational graph (a @{tf.Graph}).
+2.  Running the computational graph (using a @{tf.Session}).
+
+### Graph
+
+A **computational graph** is a series of TensorFlow operations arranged into a
+graph. The graph is composed of two types of objects.
+
+  * @{tf.Operation$Operations} (or "ops"): The nodes of the graph.
+    Operations describe calculations that consume and produce tensors.
+  * @{tf.Tensor$Tensors}: The edges in the graph. These represent the values
+    that will flow through the graph. Most TensorFlow functions return
+    `tf.Tensors`.
+
+Important: `tf.Tensors` do not have values, they are just handles to elements
+in the computation graph.
+
+Let's build a simple computational graph. The most basic operation is a
+constant. The Python function that builds the operation takes a tensor value as
+input. The resulting operation takes no inputs. When run, it outputs the
+value that was passed to the constructor. We can create two floating point
+constants `a` and `b` as follows:
+
+```python
+a = tf.constant(3.0, dtype=tf.float32)
+b = tf.constant(4.0) # also tf.float32 implicitly
+total = a + b
+print(a)
+print(b)
+print(total)
+```
+
+The print statements produce:
+
+```
+Tensor("Const:0", shape=(), dtype=float32)
+Tensor("Const_1:0", shape=(), dtype=float32)
+Tensor("add:0", shape=(), dtype=float32)
+```
+
+Notice that printing the tensors does not output the values `3.0`, `4.0`, and
+`7.0` as you might expect. The above statements only build the computation
+graph. These `tf.Tensor` objects just represent the results of the operations
+that will be run.
+
+Each operation in a graph is given a unique name. This name is independent of
+the names the objects are assigned to in Python. Tensors are named after the
+operation that produces them followed by an output index, as in
+`"add:0"` above.
+
+### TensorBoard
+
+TensorFlow provides a utility called TensorBoard. One of TensorBoard's many
+capabilities is visualizing a computation graph. You can easily do this with
+a few simple commands.
+
+First you save the computation graph to a TensorBoard summary file as
+follows:
+
+```
+writer = tf.summary.FileWriter('.')
+writer.add_graph(tf.get_default_graph())
+```
+
+This will produce an `event` file in the current directory with a name in the
+following format:
+
+```
+events.out.tfevents.{timestamp}.{hostname}
+```
+
+Now, in a new terminal, launch TensorBoard with the following shell command:
+
+```bsh
+tensorboard --logdir .
+```
+
+Then open TensorBoard's [graphs page](http://localhost:6006/#graphs) in your
+browser, and you should see a graph similar to the following:
+
+![TensorBoard screenshot](https://www.tensorflow.org/images/getting_started_add.png)
+
+For more about TensorBoard's graph visualization tools see @{$graph_viz}.
+
+### Session
+
+To evaluate tensors, instantiate a @{tf.Session} object, informally known as a
+**session**. A session encapsulates the state of the TensorFlow runtime, and
+runs TensorFlow operations. If a `tf.Graph` is like a `.py` file, a `tf.Session`
+is like the `python` executable.
+
+The following code creates a `tf.Session` object and then invokes its `run`
+method to evaluate the `total` tensor we created above:
+
+```python
+sess = tf.Session()
+print(sess.run(total))
+```
+
+When you request the output of a node with `Session.run` TensorFlow backtracks
+through the graph and runs all the nodes that provide input to the requested
+output node. So this prints the expected value of 7.0:
+
+```
+7.0
+```
+
+You can pass multiple tensors to `tf.Session.run`. The `run` method
+transparently handles any combination of tuples or dictionaries, as in the
+following example:
+
+```python
+print(sess.run({'ab':(a, b), 'total':total}))
+```
+
+which returns the results in a structure of the same layout:
+
+``` None
+{'total': 7.0, 'ab': (3.0, 4.0)}
+```
+
+During a call to `tf.Session.run` any `tf.Tensor` only has a single value.
+For example, the following code calls `tf.random_uniform` to produce a
+`tf.Tensor` that generates a random 3-element vector (with values in `[0,1)`):
+
+```python
+vec = tf.random_uniform(shape=(3,))
+out1 = vec + 1
+out2 = vec + 2
+print(sess.run(vec))
+print(sess.run(vec))
+print(sess.run((out1, out2)))
+```
+
+The result shows a different random value on each call to `run`, but
+a consistent value during a single `run` (`out1` and `out2` receive the same
+random input):
+
+```
+[ 0.52917576  0.64076328  0.68353939]
+[ 0.66192627  0.89126778  0.06254101]
+(
+  array([ 1.88408756,  1.87149239,  1.84057522], dtype=float32),
+  array([ 2.88408756,  2.87149239,  2.84057522], dtype=float32)
+)
+```
+
+Some TensorFlow functions return `tf.Operations` instead of `tf.Tensors`.
+The result of calling `run` on an Operation is `None`. You run an operation
+to cause a side-effect, not to retrieve a value. Examples of this include the
+[initialization](#Initializing Layers), and [training](#Training) ops
+demonstrated later.
+
+### Feeding
+
+As it stands, this graph is not especially interesting because it always
+produces a constant result. A graph can be parameterized to accept external
+inputs, known as **placeholders**. A **placeholder** is a promise to provide a
+value later, like a function argument.
+
+```python
+x = tf.placeholder(tf.float32)
+y = tf.placeholder(tf.float32)
+z = x + y
+```
+
+The preceding three lines are a bit like a function in which we
+define two input parameters (`x` and `y`) and then an operation on them. We can
+evaluate this graph with multiple inputs by using the `feed_dict` argument of
+the @{tf.Session.run$run method} to feed concrete values to the placeholders:
+
+```python
+print(sess.run(z, feed_dict={x: 3, y: 4.5}))
+print(sess.run(z, feed_dict={x: [1, 3], y: [2, 4]}))
+```
+This results in the following output:
+
+```
+7.5
+[ 3.  7.]
+```
+
+Also note that the `feed_dict` argument can be used to overwrite any tensor in
+the graph. The only difference between placeholders and other `tf.Tensors` is
+that placeholders throw an error if no value is fed to them.
+
+## Datasets
+
+Placeholders work for simple experiments, but @{tf.data$Datasets} are the
+preferred method of streaming data into a model.
+
+To get a runnable `tf.Tensor` from a Dataset you must first convert it to a
+@{tf.data.Iterator}, and then call the Iterator's
+@{tf.data.Iterator.get_next$`get_next`} method.
+
+The simplest way to create an Iterator is with the
+@{tf.data.Dataset.make_one_shot_iterator$`make_one_shot_iterator`} method.
+For example, in the following code the `next_item` tensor will return a row from
+the `my_data` array on each `run` call:
+
+``` python
+my_data = [
+    [0, 1,],
+    [2, 3,],
+    [4, 5,],
+    [6, 7,],
+]
+slices = tf.data.Dataset.from_tensor_slices(my_data)
+next_item = slices.make_one_shot_iterator().get_next()
+```
+
+Reaching the end of the data stream causes `Dataset` to throw an
+@{tf.errors.OutOfRangeError$`OutOfRangeError`}. For example, the following code
+reads the `next_item` until there is no more data to read:
+
+``` python
+while True:
+  try:
+    print(sess.run(next_item))
+  except tf.errors.OutOfRangeError:
+    break
+```
+
+For more details on Datasets and Iterators see: @{$programmers_guide/datasets}.
+
+## Layers
+
+A trainable model must modify the values in the graph to get new outputs with
+the same input.  @{tf.layers$Layers} are the preferred way to add trainable
+parameters to a graph.
+
+Layers package together both the variables and the operations that act
+on them, . For example a
+[densely-connected layer](https://developers.google.com/machine-learning/glossary/#fully_connected_layer)
+performs a weighted sum across all inputs
+for each output and applies an optional
+[activation function](https://developers.google.com/machine-learning/glossary/#activation_function).
+The connection weights and biases are managed by the layer object.
+
+### Creating Layers
+
+The following code creates a @{tf.layers.Dense$`Dense`} layer that takes a
+batch of input vectors, and produces a single output value for each. To apply a
+layer to an input, call the layer as if it were a function. For example:
+
+```python
+x = tf.placeholder(tf.float32, shape=[None, 3])
+linear_model = tf.layers.Dense(units=1)
+y = linear_model(x)
+```
+
+The layer inspects its input to determine sizes for its internal variables. So
+here we must set the shape of the `x` placeholder so that the layer can
+build a weight matrix of the correct size.
+
+Now that we have defined the calculation of the output, `y`, there is one more
+detail we need to take care of before we run the calculation.
+
+### Initializing Layers
+
+The layer contains variables that must be **initialized** before they can be
+used. While it is possible to initialize variables individually, you can easily
+initialize all the variables in a TensorFlow graph as follows:
+
+```python
+init = tf.global_variables_initializer()
+sess.run(init)
+```
+
+Important: Calling `tf.global_variables_initializer` only
+creates and returns a handle to a TensorFlow operation. That op
+will initialize all the global variables when we run it with `tf.Session.run`.
+
+Also note that this `global_variables_initializer` only initializes variables
+that existed in the graph when the  initializer was created. So the initializer
+should be one of the last things added during graph construction.
+
+### Executing Layers
+
+Now that the layer is initialized, we can evaluate the `linear_model`'s output
+tensor as we would any other tensor. For example, the following code:
+
+```python
+print(sess.run(y, {x: [[1, 2, 3],[4, 5, 6]]}))
+```
+
+will generate a two-element output vector such as the following:
+
+```
+[[-3.41378999]
+ [-9.14999008]]
+```
+
+### Layer Function shortcuts
+
+For each layer class (like @{tf.layers.Dense}) TensorFlow also supplies a
+shortcut function (like @{tf.layers.dense}). The only difference is that the
+shortcut function versions create and run the layer in a single call. For
+example, the following code is equivalent to the earlier version:
+
+```python
+x = tf.placeholder(tf.float32, shape=[None, 3])
+y = tf.layers.dense(x, units=1)
+
+init = tf.global_variables_initializer()
+sess.run(init)
+
+print(sess.run(y, {x: [[1, 2, 3], [4, 5, 6]]}))
+```
+
+While convenient, this approach allows no access to the @{tf.layers.Layer}
+object. This makes introspection and debugging more difficult,
+and layer reuse impossible.
+
+## Feature columns
+
+The easiest way to experiment with feature columns is using the
+@{tf.feature_column.input_layer} function. This function only accepts
+@{$get_started/feature_columns$dense columns} as inputs, so to view the result
+of a categorical column you must wrap it in an
+@{tf.feature_column.indicator_column}. For example:
+
+``` python
+features = {
+    'sales' : [[5], [10], [8], [9]],
+    'department': ['sports', 'sports', 'gardening', 'gardening']}
+
+department_column = tf.feature_column.categorical_column_with_vocabulary_list(
+        'department', ['sports', 'gardening'])
+department_column = tf.feature_column.indicator_column(department_column)
+
+columns = [
+    tf.feature_column.numeric_column('sales'),
+    department_column
+]
+
+inputs = tf.feature_column.input_layer(features, columns)
+```
+
+Running the `inputs` tensor will parse the `features` into a batch of vectors.
+
+Feature columns can have internal state, like layers, so they often need to be
+initialized. Categorical columns use @{tf.contrib.lookup$lookup tables}
+internally and these require a separate initialization op,
+@{tf.tables_initializer}.
+
+``` python
+var_init = tf.global_variables_initializer()
+table_init = tf.tables_initializer()
+sess = tf.Session()
+sess.run((var_init, table_init))
+```
+
+Once the internal state has been initialized you can run `inputs` like any
+other `tf.Tensor`:
+
+```python
+print(sess.run(inputs))
+```
+
+This shows how the feature columns have packed the input vectors, with the
+one-hot "department" as the first two indices and "sales" as the third.
+
+```None
+[[  1.   0.   5.]
+ [  1.   0.  10.]
+ [  0.   1.   8.]
+ [  0.   1.   9.]]
+```
+
+## Training
+
+Now that you're familiar with the basics of core TensorFlow, let's train a
+small regression model manually.
+
+### Define the data
+
+First let's define some inputs, `x`, and the expected output for each input,
+`y_true`:
+
+```python
+x = tf.constant([[1], [2], [3], [4]], dtype=tf.float32)
+y_true = tf.constant([[0], [-1], [-2], [-3]], dtype=tf.float32)
+```
+
+### Define the model
+
+Next, build a simple linear model, with 1 output:
+
+``` python
+linear_model = tf.layers.Dense(units=1)
+
+y_pred = linear_model(x)
+```
+
+You can evaluate the predictions as follows:
+
+``` python
+sess = tf.Session()
+init = tf.global_variables_initializer()
+sess.run(init)
+
+print(sess.run(y_pred))
+```
+
+The model hasn't yet been trained, so the four "predicted" values aren't very
+good. Here's what we got; your own output will almost certainly differ:
+
+``` None
+[[ 0.02631879]
+ [ 0.05263758]
+ [ 0.07895637]
+ [ 0.10527515]]
+```
+
+### loss
+
+To optimize a model, you first need to define the loss. We'll use the mean
+square error, a standard loss for regression problems.
+
+While you could do this manually with lower level math operations,
+the @{tf.losses} module provides a set of common loss functions. You can use it
+to calculate the mean square error as follows:
+
+``` python
+loss = tf.losses.mean_squared_error(labels=y_true, predictions=y_pred)
+
+print(sess.run(loss))
+```
+This will produce a loss value, something like:
+
+``` None
+2.23962
+```
+
+### Training
+
+TensorFlow provides
+[**optimizers**](https://developers.google.com/machine-learning/glossary/#optimizer)
+implementing standard optimization algorithms. These are implemented as
+sub-classes of @{tf.train.Optimizer}. They incrementally change each
+variable in order to minimizethe loss. The simplest optimization algorithm is
+[**gradient descent**](https://developers.google.com/machine-learning/glossary/#gradient_descent),
+implemented by @{tf.train.GradientDescentOptimizer}. It modifies each
+variable according to the magnitude of the derivative of loss with respect to
+that variable. For example:
+
+```python
+optimizer = tf.train.GradientDescentOptimizer(0.01)
+train = optimizer.minimize(loss)
+```
+
+This code builds all the graph components necessary for the optimization, and
+returns a training operation. When run, the training op will update variables
+in the graph. You might run it as follows:
+
+```python
+for i in range(100):
+  _, loss_value = sess.run((train, loss))
+  print(loss_value)
+```
+
+Since `train` is an op, not a tensor, it doesn't return a value when run.
+To see the progression of the loss during training, we run the loss tensor at
+the same time, producing output like the following:
+
+``` None
+1.35659
+1.00412
+0.759167
+0.588829
+0.470264
+0.387626
+0.329918
+0.289511
+0.261112
+0.241046
+...
+```
+
+### Complete program
+
+```python
+x = tf.constant([[1], [2], [3], [4]], dtype=tf.float32)
+y_true = tf.constant([[0], [-1], [-2], [-3]], dtype=tf.float32)
+
+linear_model = tf.layers.Dense(units=1)
+
+y_pred = linear_model(x)
+loss = tf.losses.mean_squared_error(labels=y_true, predictions=y_pred)
+
+optimizer = tf.train.GradientDescentOptimizer(0.01)
+train = optimizer.minimize(loss)
+
+init = tf.global_variables_initializer()
+
+sess = tf.Session()
+sess.run(init)
+for i in range(100):
+  _, loss_value = sess.run((train, loss))
+  print(loss_value)
+
+print(sess.run(y_pred))
+```
+
+## Next steps
+
+To learn more about building models with TensorFlow consider the following:
+
+* @{$get_started/custom_estimators$Custom Estimators}, to learn how to build
+  customized models with TensorFlow. Your knowledge of TensorFlow Core will
+  help you understand and debug your own models.
+
+If you want to learn more about the inner workings of TensorFlow consider the
+following documents, which go into more depth on many of the topics discussed
+here:
+
+* @{$graphs}
+* @{$tensors}
+* @{$variables}
+
+
diff --git a/tensorflow/docs_src/programmers_guide/saved_model.md b/tensorflow/docs_src/programmers_guide/saved_model.md
index 54693f3d4d356da93e6e31595d04ed58e173e061..f27a658342b8d33407e1c6ed5799a10c2305a74c 100644
--- a/tensorflow/docs_src/programmers_guide/saved_model.md
+++ b/tensorflow/docs_src/programmers_guide/saved_model.md
@@ -285,7 +285,7 @@ with tf.Session(graph=tf.Graph()) as sess:
 ```
 
 
-### Loading a Savedmodel in C++
+### Loading a SavedModel in C++
 
 The C++ version of the SavedModel
 [loader](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/cc/saved_model/loader.h)
@@ -303,6 +303,30 @@ LoadSavedModel(session_options, run_options, export_dir, {kSavedModelTagTrain},
                &bundle);
 ```
 
+### Loading and Serving a SavedModel in TensorFlow Serving
+
+You can easily load and serve a SavedModel with the TensorFlow Serving Model
+Server binary. See [instructions](https://www.tensorflow.org/serving/setup#installing_using_apt-get)
+on how to install the server, or build it if you wish.
+
+Once you have the Model Server, run it with:
+```
+tensorflow_model_server --port=port-numbers --model_name=your-model-name --model_base_path=your_model_base_path
+```
+Set the port and model_name flags to values of your choosing. The
+model_base_path flag expects to be to a base directory, with each version of
+your model residing in a numerically named subdirectory. If you only have a
+single version of your model, simply place it in a subdirectory like so:
+* Place the model in /tmp/model/0001
+* Set model_base_path to /tmp/model
+
+Store different versions of your model in numerically named subdirectories of a
+common base directory. For example, suppose the base directory is `/tmp/model`.
+If you have only one version of your model, store it in `/tmp/model/0001`. If
+you have two versions of your model, store the second version in
+`/tmp/model/0002`, and so on.  Set the `--model-base_path` flag to the base
+directory (`/tmp/model`, in this example).  TensorFlow Model Server will serve
+the model in the highest numbered subdirectory of that base directory.
 
 ### Standard constants
 
@@ -349,10 +373,10 @@ SavedModel format. This section explains how to:
 
 ### Preparing serving inputs
 
-During training, an @{$input_fn$`input_fn()`} ingests data and prepares it for
-use by the model.  At serving time, similarly, a `serving_input_receiver_fn()`
-accepts inference requests and prepares them for the model.  This function
-has the following purposes:
+During training, an @{$premade_estimators#input_fn$`input_fn()`} ingests data
+and prepares it for use by the model.  At serving time, similarly, a
+`serving_input_receiver_fn()` accepts inference requests and prepares them for
+the model.  This function has the following purposes:
 
 *  To add placeholders to the graph that the serving system will feed
    with inference requests.
@@ -479,10 +503,10 @@ does not specify one.
 ### Serving the exported model locally
 
 For local deployment, you can serve your model using
-[TensorFlow Serving](http://github.com/tensorflow/serving), an open-source project that loads a
-SavedModel and exposes it as a [gRPC](http://www.grpc.io/) service.
+[TensorFlow Serving](https://github.com/tensorflow/serving), an open-source project that loads a
+SavedModel and exposes it as a [gRPC](https://www.grpc.io/) service.
 
-First, [install TensorFlow Serving](http://github.com/tensorflow/serving).
+First, [install TensorFlow Serving](https://github.com/tensorflow/serving).
 
 Then build and run the local model server, substituting `$export_dir_base` with
 the path to the SavedModel you exported above:
@@ -736,6 +760,7 @@ The `run` command provides the following two ways to pass inputs to the model:
 
 * `--inputs` option enables you to pass numpy ndarray in files.
 * `--input_exprs` option enables you to pass Python expressions.
+* `--input_examples` option enables you to pass `tf.train.Example`.
 
 
 #### `--inputs`
@@ -789,19 +814,31 @@ inputs that match the dtype and shape of the model's `SignatureDef`s.
 For example:
 
 ```bsh
-`input_key=[[1], [2], [3]]`
+`<input_key>=[[1],[2],[3]]`
 ```
 
 In addition to Python expressions, you may also pass numpy functions. For
 example:
 
 ```bsh
-input_key=np.ones((32, 32, 3))
+`<input_key>=np.ones((32,32,3))`
 ```
 
 (Note that the `numpy` module is already available to you as `np`.)
 
 
+#### `--inputs_examples`
+
+To pass `tf.train.Example` as inputs, specify the `--input_examples` option.
+For each input key, it takes a list of dictionary, where each dictionary is an
+instance of `tf.train.Example`. The dictionary keys are the features and the
+values are the value lists for each feature.
+For example:
+
+```bsh
+`<input_key>=[{"age":[22,24],"education":["BS","MS"]}]`
+```
+
 #### Save Output
 
 By default, the SavedModel CLI writes output to stdout. If a directory is
diff --git a/tensorflow/docs_src/get_started/summaries_and_tensorboard.md b/tensorflow/docs_src/programmers_guide/summaries_and_tensorboard.md
similarity index 96%
rename from tensorflow/docs_src/get_started/summaries_and_tensorboard.md
rename to tensorflow/docs_src/programmers_guide/summaries_and_tensorboard.md
index ce5db079ba3a502ffdec96191b03a8b951ac3db6..05dfdfdc4d2257fc680e7fa99b666ef86e3bef09 100644
--- a/tensorflow/docs_src/get_started/summaries_and_tensorboard.md
+++ b/tensorflow/docs_src/programmers_guide/summaries_and_tensorboard.md
@@ -76,7 +76,7 @@ data than you need, though. Instead, consider running the merged summary op
 every `n` steps.
 
 The code example below is a modification of the
-@{$beginners$simple MNIST tutorial},
+@{$layers$simple MNIST tutorial},
 in which we have added some summary ops, and run them every ten steps. If you
 run this and then launch `tensorboard --logdir=/tmp/tensorflow/mnist`, you'll be able
 to visualize statistics, such as how the weights or accuracy varied during
@@ -137,12 +137,10 @@ with tf.name_scope('cross_entropy'):
   #
   # can be numerically unstable.
   #
-  # So here we use tf.nn.softmax_cross_entropy_with_logits on the
-  # raw outputs of the nn_layer above, and then average across
-  # the batch.
-  diff = tf.nn.softmax_cross_entropy_with_logits(targets=y_, logits=y)
+  # So here we use tf.losses.sparse_softmax_cross_entropy on the
+  # raw logit outputs of the nn_layer above.
   with tf.name_scope('total'):
-    cross_entropy = tf.reduce_mean(diff)
+    cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=y)
 tf.summary.scalar('cross_entropy', cross_entropy)
 
 with tf.name_scope('train'):
diff --git a/tensorflow/docs_src/get_started/tensorboard_histograms.md b/tensorflow/docs_src/programmers_guide/tensorboard_histograms.md
similarity index 100%
rename from tensorflow/docs_src/get_started/tensorboard_histograms.md
rename to tensorflow/docs_src/programmers_guide/tensorboard_histograms.md
diff --git a/tensorflow/docs_src/programmers_guide/tensors.md b/tensorflow/docs_src/programmers_guide/tensors.md
index 47d4db2a568c9f8009982e44a85e44f0250860c1..58a80d533927e4f0d1458f87406914c1efa00605 100644
--- a/tensorflow/docs_src/programmers_guide/tensors.md
+++ b/tensorflow/docs_src/programmers_guide/tensors.md
@@ -112,8 +112,8 @@ For example, the following method programmatically determines the rank
 of the `tf.Tensor` defined in the previous section:
 
 ```python
-r = tf.rank(my3d)
-# After the graph runs, r will hold the value 3.
+r = tf.rank(my_image)
+# After the graph runs, r will hold the value 4.
 ```
 
 ### Referring to `tf.Tensor` slices
diff --git a/tensorflow/docs_src/tutorials/using_gpu.md b/tensorflow/docs_src/programmers_guide/using_gpu.md
similarity index 99%
rename from tensorflow/docs_src/tutorials/using_gpu.md
rename to tensorflow/docs_src/programmers_guide/using_gpu.md
index de8d88ce766cb1314cadd62e3f3e26f8cd36d1b9..c429ca4750753278e4736650a08fd0c71e0d9fad 100644
--- a/tensorflow/docs_src/tutorials/using_gpu.md
+++ b/tensorflow/docs_src/programmers_guide/using_gpu.md
@@ -172,7 +172,7 @@ If you would like to run TensorFlow on multiple GPUs, you can construct your
 model in a multi-tower fashion where each tower is assigned to a different GPU.
 For example:
 
-```
+``` python
 # Creates a graph.
 c = []
 for d in ['/device:GPU:2', '/device:GPU:3']:
diff --git a/tensorflow/docs_src/programmers_guide/using_tpu.md b/tensorflow/docs_src/programmers_guide/using_tpu.md
new file mode 100644
index 0000000000000000000000000000000000000000..d74d7f3181c9cf44e6c97e13742db682858f4694
--- /dev/null
+++ b/tensorflow/docs_src/programmers_guide/using_tpu.md
@@ -0,0 +1,396 @@
+# Using TPUs
+
+This document walks through the principal TensorFlow APIs necessary to make
+effective use of a [Cloud TPU](https://cloud.google.com/tpu/), and highlights
+the differences between regular TensorFlow usage, and usage on a TPU.
+
+This doc is aimed at users who:
+
+* Are familiar with TensorFlow's `Estimator` and `Dataset` APIs
+* Have maybe [tried out a Cloud TPU](https://cloud.google.com/tpu/docs/quickstart)
+  using an existing model.
+* Have, perhaps, skimmed the code of an example TPU model
+  [[1]](https://github.com/tensorflow/models/blob/master/official/mnist/mnist_tpu.py)
+  [[2]](https://github.com/tensorflow/tpu-demos/tree/master/cloud_tpu/models).
+* Are interested in porting an existing `Estimator` model to
+  run on Cloud TPUs
+
+## TPUEstimator
+
+@{tf.estimator.Estimator$Estimators} are TensorFlow's model-level abstraction.
+Standard `Estimators` can drive models on CPU and GPUs. You must use
+@{tf.contrib.tpu.TPUEstimator} to drive a model on TPUs.
+
+Refer to TensorFlow's Getting Started section for an introduction to the basics
+of using a @{$get_started/premade_estimators$pre-made `Estimator`}, and
+@{$get_started/custom_estimators$custom `Estimator`s}.
+
+The `TPUEstimator` class differs somewhat from the `Estimator` class.
+
+The simplest way to maintain a model that can be run both on CPU/GPU or on a
+Cloud TPU is to define the model's inference phase (from inputs to predictions)
+outside of the `model_fn`. Then maintain separate implementations of the
+`Estimator` setup and `model_fn`, both wrapping this inference step. For an
+example of this pattern compare the `mnist.py` and `mnist_tpu.py` implementation in
+[tensorflow/models](https://github.com/tensorflow/models/tree/master/official/mnist).
+
+### Running a `TPUEstimator` locally
+
+To create a standard `Estimator` you call the constructor, and pass it a
+`model_fn`, for example:
+
+```
+my_estimator = tf.estimator.Estimator(
+  model_fn=my_model_fn)
+```
+
+The changes required to use a @{tf.contrib.tpu.TPUEstimator} on your local
+machine are relatively minor. The constructor requires two additional arguments.
+You should set the `use_tpu` argument to `False`, and pass a
+@{tf.contrib.tpu.RunConfig} as the `config` argument, as shown below:
+
+``` python
+my_tpu_estimator = tf.contrib.tpu.TPUEstimator(
+    model_fn=my_model_fn,
+    config=tf.contrib.tpu.RunConfig()
+    use_tpu=False)
+```
+
+Just this simple change will allow you to run a `TPUEstimator` locally.
+The majority of example TPU models can be run in this local mode,
+by setting the command line flags as follows:
+
+
+```
+$> python mnist_tpu.py --use_tpu=false --master=''
+```
+
+Note: This `use_tpu=False` argument is useful for trying out the `TPUEstimator`
+API. It is not meant to be a complete TPU compatibility test. Successfully
+running a model locally in a `TPUEstimator` does not guarantee that it will
+work on a TPU.
+
+
+### Building a `tpu.RunConfig`
+
+While the default `RunConfig` is sufficient  for local training, these settings
+cannot be ignored in real usage.
+
+A more typical setup for a `RunConfig`, that can be switched to use a Cloud
+TPU, might be as follows:
+
+``` python
+import tempfile
+import subprocess
+
+class FLAGS(object):
+  use_tpu=False
+  tpu_name=None
+  # Use a local temporary path for the `model_dir`
+  model_dir = tempfile.mkdtemp()
+  # Number of training steps to run on the Cloud TPU before returning control.
+  iterations = 50
+  # A single Cloud TPU has 8 shards.
+  num_shards = 8
+
+if FLAGS.use_tpu:
+    my_project_name = subprocess.check_output([
+        'gcloud','config','get-value','project'])
+    my_zone = subprocess.check_output([
+        'gcloud','config','get-value','compute/zone'])
+    cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
+            tpu_names=[FLAGS.tpu_name],
+            zone=my_zone,
+            project=my_project)
+    master = tpu_cluster_resolver.get_master()
+else:
+    master = ''
+
+my_tpu_run_config = tf.contrib.tpu.RunConfig(
+    master=master,
+    evaluation_master=master,
+    model_dir=FLAGS.model_dir,
+    session_config=tf.ConfigProto(
+        allow_soft_placement=True, log_device_placement=True),
+    tpu_config=tf.contrib.tpu.TPUConfig(FLAGS.iterations,
+                                        FLAGS.num_shards),
+)
+```
+
+Then you must pass the @{tf.contrib.tpu.RunConfig} to the constructor:
+
+``` python
+my_tpu_estimator = tf.contrib.tpu.TPUEstimator(
+    model_fn=my_model_fn,
+    config = my_tpu_run_config,
+    use_tpu=FLAGS.use_tpu)
+```
+
+Typically the `FLAGS` would be set by command line arguments. To switch from
+training locally to training on a cloud TPU you would need to:
+
+  1) Set `FLAGS.use_tpu` to `True`
+  1) Set `FLAGS.tpu_name` so the
+     `tf.contrib.cluster_resolver.TPUClusterResolver` can find it
+  1) Set `FLAGS.model_dir` to a Google Cloud Storage bucket url (`gs://`).
+
+
+## Optimizer
+
+When training on a cloud TPU you **must** wrap the optimizer in a
+@{tf.contrib.tpu.CrossShardOptimizer}, which uses an `allreduce` to aggregate
+gradients and broadcast the result to each shard (each TPU core).
+
+The `CrossShardOptimizer` is not compatible with local training. So, to have
+the same code run both locally and on a Cloud TPU, add lines like the following:
+
+``` python
+optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
+if FLAGS.use_tpu:
+  optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
+```
+
+If you prefer to avoid a global `FLAGS` variable in your model code, one
+approach is to set the optimizer as one of the `Estimator`'s params,
+as follows:
+
+``` python
+my_tpu_estimator = tf.contrib.tpu.TPUEstimator(
+    model_fn=my_model_fn,
+    config = my_tpu_run_config,
+    use_tpu=FLAGS.use_tpu,
+    params={'optimizer':optimizer})
+```
+
+## Model Function
+
+This section details the changes you must make to the model function
+(`model_fn()`) to make it `TPUEstimator` compatible.
+
+### Static shapes
+
+During regular usage TensorFlow attempts to determine the shapes of each
+`tf.Tensor` during graph construction. During execution any unknown shape
+dimensions are determined dynamically,
+see @{$programmers_guide/tensors#shape$Tensor Shapes} for more details.
+
+To run on Cloud TPUs TensorFlow models are compiled using @{$xla$XLA}.
+XLA uses a similar system for determining shapes at compile time. XLA requires
+that all tensor dimensions be statically defined at compile time. All shapes
+must evaluate to a constant, and not depend on external data, or stateful
+operations like variables or a random number generator.
+
+
+### Summaries
+
+Remove any use of `tf.summary` from your model.
+
+@{$summaries_and_tensorboard$TensorBoard summaries} are a great way see inside
+your model. A minimal set of basic summaries are automatically recorded by the
+`TPUEstimator`, to `event` files in the `model_dir`. Custom summaries, however,
+are currently unsupported when training on a Cloud TPU. So while the
+`TPUEstimator` will still run locally with summaries, it will fail if used on a
+TPU.
+
+### Metrics
+
+Build your evaluation metrics dictionary in a stand-alone `metric_fn`.
+
+<!-- TODO(markdaoust) link to programmers_guide/metrics when it exists -->
+
+Evaluation metrics are an essential part of training a model. These are fully
+supported on Cloud TPUs, but with a slightly different syntax.
+
+A standard @{tf.metrics} returns two tensors. The first returns the running
+average of the metric value, while the second updates the running average and
+returns the value for this batch:
+
+```
+running_average, current_batch = tf.metrics.accuracy(labels, predictions)
+```
+
+In a standard `Estimator` you create a dictionary of these pairs, and return it
+as part of the `EstimatorSpec`.
+
+```python
+my_metrics = {'accuracy': tf.metrics.accuracy(labels, predictions)}
+
+return tf.estimator.EstimatorSpec(
+  ...
+  eval_metric_ops=my_metrics
+)
+```
+
+In a `TPUEstimator` you instead pass a function (which returns a metrics
+dictionary) and a list of argument tensors, as shown below:
+
+```python
+def my_metric_fn(labels, predictions):
+   return {'accuracy': tf.metrics.accuracy(labels, predictions)}
+
+return tf.contrib.tpu.TPUEstimatorSpec(
+  ...
+  eval_metrics=(my_metric_fn, [labels, predictions])
+)
+```
+
+### Use `TPUEstimatorSpec`
+
+`TPUEstimatorSpec` do not support hooks, and require function wrappers for
+some fields.
+
+An `Estimator`'s `model_fn` must return an `EstimatorSpec`. An `EstimatorSpec`
+is a simple structure of named fields containing all the `tf.Tensors` of the
+model that the `Estimator` may need to interact with.
+
+`TPUEstimators` use a @{tf.contrib.tpu.TPUEstimatorSpec}. There are a few
+differences between it and a standard @{tf.estimator.EstimatorSpec}:
+
+
+*  The `eval_metric_ops` must be wrapped into a `metrics_fn`, this field is
+   renamed `eval_metrics` ([see above](#metrics)).
+*  The @{tf.train.SessionRunHook$hooks} are unsupported, so these fields are
+   omitted.
+*  The @{tf.train.Scaffold$`scaffold`}, if used, must also be wrapped in a
+   function. This field is renamed to `scaffold_fn`.
+
+`Scaffold` and `Hooks` are for advanced usage, and can typically be omitted.
+
+## Input functions
+
+Input functions work mainly unchanged as they run on the host computer, not the
+Cloud TPU itself. This section explains the two necessary adjustments.
+
+### Params argument
+
+<!-- TODO(markdaoust) link to input_fn doc when it exists -->
+
+The `input_fn` for a standard `Estimator` _can_ include a
+`params` argument; the `input_fn` for a `TPUEstimator` *must* include a
+`params` argument. This is necessary to allow the estimator to set the batch
+size for each replica of the input stream. So the minimum signature for an
+`input_fn` for a `TPUEstimator` is:
+
+```
+def my_input_fn(params):
+  pass
+```
+
+Where `params['batch-size']` will contain the batch size.
+
+### Static shapes and batch size
+
+The input pipeline generated by your `input_fn` is run on CPU. So it is mostly
+free strict static shape requirements imposed by the XLA/TPU environment. The
+one requirement is that the batches of data fed from your input pipeline to
+the TPU have a static shape, as determined by the standard TensorFlow shape
+inference algorithm. Intermediate tensors are free to have a dynamic shapes.
+If shape inference has failed, but the shape is known it is possible to
+impose the correct shape using `tf.set_shape()`. 
+
+In the example below the shape
+inference algorithm fails, but it is corrected using `set_shape`:
+
+```
+>>> x = tf.zeros(tf.constant([1,2,3])+1)
+>>> x.shape
+
+TensorShape([Dimension(None), Dimension(None), Dimension(None)])
+
+>>> x.set_shape([2,3,4])
+```
+
+In many cases the batch size is the only unknown dimension.
+
+A typical input pipeline, using `tf.data`, will usually produce batches of a
+fixed size. The last batch of a finite `Dataset`, however, is typically smaller,
+containing just the remaining elements. Since a `Dataset` does not know its own
+length or finiteness, the standard @{tf.data.Dataset.batch$`batch`} method
+cannot determine if all batches will have a fixed size batch on its own:
+
+```
+>>> params = {'batch_size':32}
+>>> ds = tf.data.Dataset.from_tensors([0, 1, 2])
+>>> ds = ds.repeat().batch(params['batch-size'])
+>>> ds
+
+<BatchDataset shapes: (?, 3), types: tf.int32>
+```
+
+The most straightforward fix is to
+@{tf.data.Dataset.apply$apply} @{tf.contrib.data.batch_and_drop_remainder}
+as follows:
+
+```
+>>> params = {'batch_size':32}
+>>> ds = tf.data.Dataset.from_tensors([0, 1, 2])
+>>> ds = ds.repeat().apply(
+...     tf.contrib.data.batch_and_drop_remainder(params['batch-size']))
+>>> ds
+
+ <_RestructuredDataset shapes: (32, 3), types: tf.int32>
+```
+
+The one downside to this approach is that, as the name implies, this batching
+method throws out any fractional batch at the end of the dataset. This is fine
+for an infinitely repeating dataset being used for training, but could be a
+problem if you want to train for an exact number of epochs.
+
+To do an exact 1-epoch of _evaluation_ you can work around this by manually
+padding the length of the batches, and setting the padding entries to have zero
+weight when creating your `tf.metrics`.
+
+## Datasets
+
+Efficient use of the `tf.data.Dataset` API is critical when using a Cloud
+TPU, as it is impossible to use the Cloud TPU's unless you can feed it data
+quickly enough. See @{$datasets_performance} for details on dataset performance.
+
+For all but the simplest experimentation (using
+@{tf.data.Dataset.from_tensor_slices} or other in-graph data) you will need to
+store all data files read by the `TPUEstimator`'s `Dataset` in Google Cloud
+Storage Buckets.
+
+<!--TODO(markdaoust): link to the `TFRecord` doc when it exists.-->
+
+For most use-cases, we recommend converting your data into `TFRecord`
+format and using a @{tf.data.TFRecordDataset} to read it. This, however, is not
+a hard requirement and you can use other dataset readers
+(`FixedLengthRecordDataset` or `TextLineDataset`) if you prefer.
+
+Small datasets can be loaded entirely into memory using
+@{tf.data.Dataset.cache}.
+
+Regardless of the data format used, it is strongly recommended that you
+@{$performance_guide#use_large_files$use large files}, on the order of
+100MB. This is especially important in this networked setting as the overhead
+of opening a file is significantly higher.
+
+It is also important, regardless of the type of reader used, to enable buffering
+using the `buffer_size` argument to the constructor. This argument is specified
+in bytes. A minimum of a few MB (`buffer_size=8*1024*1024`) is recommended so
+that data is available when needed.
+
+The TPU-demos repo includes
+[a script](https://github.com/tensorflow/tpu-demos/blob/master/cloud_tpu/datasets/imagenet_to_gcs.py)
+for downloading the imagenet dataset and converting it to an appropriate format.
+This together with the imagenet
+[models](https://github.com/tensorflow/tpu-demos/tree/master/cloud_tpu/models)
+included in the repo demonstrate all of these best-practices.
+
+
+## What Next
+
+For details on how to actually set up and run a Cloud TPU see:
+
+ * [Google Cloud TPU Documentation](https://cloud.google.com/tpu/docs/)
+
+This document is by no means exhaustive. The best source of more detail on how
+to make a Cloud TPU compatible model are the example models published in:
+
+ * The [TPU Demos Repository.](https://github.com/tensorflow/tpu-demos/)
+
+For more information about tuning TensorFlow code for performance see:
+
+ * The @{$performance$Performance Section.}
+
diff --git a/tensorflow/docs_src/programmers_guide/variables.md b/tensorflow/docs_src/programmers_guide/variables.md
index 16753c931f151ea6d3ce7cd465bf98d23cde78ae..64250738056043e236b5eb236bcbf29375655260 100644
--- a/tensorflow/docs_src/programmers_guide/variables.md
+++ b/tensorflow/docs_src/programmers_guide/variables.md
@@ -205,7 +205,7 @@ methods:
 v = tf.get_variable("v", shape=(), initializer=tf.zeros_initializer())
 assignment = v.assign_add(1)
 tf.global_variables_initializer().run()
-assignment.run()
+sess.run(assignment)  # or assignment.op.run(), or assignment.eval()
 ```
 
 Most TensorFlow optimizers have specialized ops that efficiently update the
diff --git a/tensorflow/docs_src/programmers_guide/version_compat.md b/tensorflow/docs_src/programmers_guide/version_compat.md
index d3e8e425091aac52b435479c4086bf7a4043dd19..a28f1385c87c7a083ee96977c5ab268c6977e17e 100644
--- a/tensorflow/docs_src/programmers_guide/version_compat.md
+++ b/tensorflow/docs_src/programmers_guide/version_compat.md
@@ -60,7 +60,7 @@ patch versions.  The public APIs consist of
     * [`tensor_shape`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/tensor_shape.proto)
     * [`types`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/types.proto)
 
-## What is *not* covered
+## What is *not* covered {not_covered}
 
 Some API functions are explicitly marked as "experimental" and can change in
 backward incompatible ways between minor releases. These include:
diff --git a/tensorflow/docs_src/tutorials/audio_recognition.md b/tensorflow/docs_src/tutorials/audio_recognition.md
index 336f4d9c18b45cda2441bc7a83e9698bbd618d22..7d79f433c41b42a268816d8277ea69b0d62a04f3 100644
--- a/tensorflow/docs_src/tutorials/audio_recognition.md
+++ b/tensorflow/docs_src/tutorials/audio_recognition.md
@@ -246,7 +246,7 @@ results as in your server testing.
 The demo app updates its UI list of results automatically based on the labels
 text file you copy into assets alongside your frozen graph, which means you can
 easily try out different models without needing to make any code changes. You
-will need to updaye `LABEL_FILENAME` and `MODEL_FILENAME` to point to the files
+will need to update `LABEL_FILENAME` and `MODEL_FILENAME` to point to the files
 you've added if you change the paths though.
 
 ## How does this Model Work?
diff --git a/tensorflow/docs_src/tutorials/deep_cnn.md b/tensorflow/docs_src/tutorials/deep_cnn.md
index 6f802fd106d0e7cc8b2049af2548c51803b43195..679754020470dddfcffa76e62ca8f55a439ec4f5 100644
--- a/tensorflow/docs_src/tutorials/deep_cnn.md
+++ b/tensorflow/docs_src/tutorials/deep_cnn.md
@@ -195,9 +195,8 @@ The usual method for training a network to perform N-way classification is
 aka. *softmax regression*. Softmax regression applies a
 @{tf.nn.softmax$softmax} nonlinearity to the
 output of the network and calculates the
-@{tf.nn.softmax_cross_entropy_with_logits$cross-entropy}
-between the normalized predictions and a
-@{tf.sparse_to_dense$1-hot encoding} of the label.
+@{tf.nn.sparse_softmax_cross_entropy_with_logits$cross-entropy}
+between the normalized predictions and the label index.
 For regularization, we also apply the usual
 @{tf.nn.l2_loss$weight decay} losses to all learned
 variables.  The objective function for the model is the sum of the cross entropy
diff --git a/tensorflow/docs_src/tutorials/image_recognition.md b/tensorflow/docs_src/tutorials/image_recognition.md
index 32257f87d6662f44536f45510b6a7c82628de2ff..332bcf54f02e6e3c7d805746011dfab642943cfe 100644
--- a/tensorflow/docs_src/tutorials/image_recognition.md
+++ b/tensorflow/docs_src/tutorials/image_recognition.md
@@ -450,9 +450,7 @@ covering them.
 
 To find out more about implementing convolutional neural networks, you can jump
 to the TensorFlow @{$deep_cnn$deep convolutional networks tutorial},
-or start a bit more gently with our
-@{$beginners$ML beginner} or @{$pros$ML expert}
-MNIST starter tutorials. Finally, if you want to get up to speed on research
-in this area, you can
+or start a bit more gently with our @{$layers$MNIST starter tutorial}.
+Finally, if you want to get up to speed on research in this area, you can
 read the recent work of all the papers referenced in this tutorial.
 
diff --git a/tensorflow/docs_src/tutorials/image_retraining.md b/tensorflow/docs_src/tutorials/image_retraining.md
index ad565e6d8be5e1e1c0efe5993608a4c1083e562b..df15bc0a9c3763aa51c2fc8cf36ce9fc3544ae68 100644
--- a/tensorflow/docs_src/tutorials/image_retraining.md
+++ b/tensorflow/docs_src/tutorials/image_retraining.md
@@ -44,8 +44,14 @@ following command (these examples are not included in the installation):
 
 ```sh
 git clone https://github.com/tensorflow/tensorflow
+```
+
+Then checkout the version of the tensorflow repository matching your
+installation and this tutorial as follows:
 
+``` sh
 cd tensorflow
+git checkout {version}
 ```
 
 In the simplest cases the retrainer can then be run like this:
@@ -384,7 +390,7 @@ image size that your model expects, as follows:
 python tensorflow/examples/label_image/label_image.py \
 --graph=/tmp/output_graph.pb --labels=/tmp/output_labels.txt \
 --input_layer=input \
---output_layer=final_result:0 \
+--output_layer=final_result \
 --input_height=224 --input_width=224 \
 --input_mean=128 --input_std=128 \
 --image=$HOME/flower_photos/daisy/21652746_cc379e0eea_m.jpg
diff --git a/tensorflow/docs_src/tutorials/index.md b/tensorflow/docs_src/tutorials/index.md
index a34dbd69569be9cd234e98009ed148080fbbdb70..8c697e48e550c4e425db33bab7257532d209ac7a 100644
--- a/tensorflow/docs_src/tutorials/index.md
+++ b/tensorflow/docs_src/tutorials/index.md
@@ -1,53 +1,60 @@
 # Tutorials
 
+
 This section contains tutorials demonstrating how to do specific tasks
 in TensorFlow.  If you are new to TensorFlow, we recommend reading the
-documents in the "Get Started" section before reading these tutorials.
+documents in the "@{$get_started$Get Started}" section before reading
+these tutorials.
 
-The following tutorial explains the interaction of CPUs and GPUs on a
-TensorFlow system:
+## Images
 
-  * @{$using_gpu$Using GPUs}
+These tutorials cover different aspects of image recognition:
 
-The following tutorials cover different aspects of image recognition:
+  * @{$layers}, which introduces convolutional neural networks (CNNs) and
+    demonstrates how to build a CNN in TensorFlow.
+  * @{$image_recognition}, which introduces the field of image recognition and
+    uses a pre-trained model (Inception) for recognizing images.
+  * @{$image_retraining}, which has a wonderfully self-explanatory title.
+  * @{$deep_cnn}, which demonstrates how to build a small CNN for recognizing
+    images.  This tutorial is aimed at advanced TensorFlow users.
 
-  * @{$image_recognition$Image Recognition}, which introduces the field of
-    image recognition and a model (Inception) for recognizing images.
-  * @{$image_retraining$How to Retrain Inception's Final Layer for New Categories},
-    which has a wonderfully self-explanatory title.
-  * @{$layers$A Guide to TF Layers: Building a Convolutional Neural Network},
-    which introduces convolutional neural networks (CNNs) and demonstrates how
-    to build a CNN in TensorFlow.
-  * @{$deep_cnn$Convolutional Neural Networks}, which demonstrates how to
-    build a small CNN for recognizing images.  This tutorial is aimed at
-    advanced TensorFlow users.
 
-The following tutorials focus on machine learning problems in human language:
+## Sequences
 
-  * @{$word2vec$Vector Representations of Words}, which demonstrates how to
-    create an embedding for words.
-  * @{$recurrent$Recurrent Neural Networks}, which demonstrates how to use a
+These tutorials focus on machine learning problems dealing with sequence data.
+
+  * @{$recurrent}, which demonstrates how to use a
     recurrent neural network to predict the next word in a sentence.
-  * @{$seq2seq$Sequence-to-Sequence Models}, which demonstrates how to use a
+  * @{$seq2seq}, which demonstrates how to use a
     sequence-to-sequence model to translate text from English to French.
+  * @{$recurrent_quickdraw}
+    builds a classification model for drawings, directly from the sequence of
+    pen strokes.
+  * @{$audio_recognition}, which shows how to
+    build a basic speech recognition network.
 
-The following tutorials focus on linear models:
+## Data representation
 
-  * @{$linear$Large-Scale Linear Models with TensorFlow}, which introduces
-    linear models and demonstrates how to build them with the high-level API.
-  * @{$wide$TensorFlow Linear Model Tutorial}, which demonstrates how to solve
-    a binary classification problem in TensorFlow.
-  * @{$wide_and_deep$TensorFlow Wide & Deep Learning Tutorial}, which explains
-    how to use the high-level API to jointly train both a wide linear model
-    and a deep feed-forward neural network.
-  * @{$kernel_methods$Improving Linear Models Using Explicit Kernel Methods},
+These tutorials demonstrate various data representations that can be used in
+TensorFlow.
+
+  * @{$wide}, uses
+    @{tf.feature_column$feature columns} to feed a variety of data types
+    to linear model, to solve a classification problem.
+  * @{$wide_and_deep}, builds on the
+    above linear model tutorial, adding a deep feed-forward neural network
+    component and a DNN-compatible data representation.
+  * @{$word2vec}, which demonstrates how to
+    create an embedding for words.
+  * @{$kernel_methods},
     which shows how to improve the quality of a linear model by using explicit
     kernel mappings.
-  * @{$audio_recognition$Simple Audio Recognition}, which shows how to
-    build a basic speech recognition network.
 
-Although TensorFlow specializes in machine learning, you may also use
-TensorFlow to solve other kinds of math problems.  For example:
+## Non Machine Learning
+
+Although TensorFlow specializes in machine learning, the core of TensorFlow is
+a powerful numeric computation system which you can also use to solve other
+kinds of math problems.  For example:
 
-  * @{$mandelbrot$Mandelbrot Set}
-  * @{$pdes$Partial Differential Equations}
+  * @{$mandelbrot}
+  * @{$pdes}
diff --git a/tensorflow/docs_src/tutorials/kernel_methods.md b/tensorflow/docs_src/tutorials/kernel_methods.md
index 324c34fdfa84d922f298d87d77e8e1d635f876ae..63f408c2ca304d6345ffff459b799b011f8d8035 100644
--- a/tensorflow/docs_src/tutorials/kernel_methods.md
+++ b/tensorflow/docs_src/tutorials/kernel_methods.md
@@ -1,5 +1,10 @@
 # Improving Linear Models Using Explicit Kernel Methods
 
+Note: This document uses a deprecated version of ${tf.estimator},
+which has a ${tf.contrib.learn.estimator$different interface}.
+It also uses other `contrib` methods whose
+${$version_compat#not_covered$API may not be stable}.
+
 In this tutorial, we demonstrate how combining (explicit) kernel methods with
 linear models can drastically increase the latters' quality of predictions
 without significantly increasing training and inference times. Unlike dual
@@ -44,18 +49,18 @@ respectively. Each split contains one numpy array for images (with shape
 tutorial, we only use the train and validation splits to train and evaluate our
 models respectively.
 
-In order to feed data to a tf.contrib.learn Estimator, it is helpful to convert
+In order to feed data to a `tf.contrib.learn Estimator`, it is helpful to convert
 it to Tensors. For this, we will use an `input function` which adds Ops to the
 TensorFlow graph that, when executed, create mini-batches of Tensors to be used
 downstream. For more background on input functions, check
-@{$get_started/input_fn$Building Input Functions with tf.contrib.learn}. In this
-example, we will use the `tf.train.shuffle_batch` Op which, besides converting
-numpy arrays to Tensors, allows us to specify the batch_size and whether to
-randomize the input every time the input_fn Ops are executed (randomization
-typically expedites convergence during training). The full code for loading and
-preparing the data is shown in the snippet below. In this example, we use
-mini-batches of size 256 for training and the entire sample (5K entries) for
-evaluation. Feel free to experiment with different batch sizes.
+@{$get_started/premade_estimators#input_fn$this section on input functions}.
+In this example, we will use the `tf.train.shuffle_batch` Op which, besides
+converting numpy arrays to Tensors, allows us to specify the batch_size and
+whether to randomize the input every time the input_fn Ops are executed
+(randomization typically expedites convergence during training). The full code
+for loading and preparing the data is shown in the snippet below. In this
+example, we use mini-batches of size 256 for training and the entire sample
+(5K entries) for evaluation. Feel free to experiment with different batch sizes.
 
 ```python
 import numpy as np
diff --git a/tensorflow/docs_src/tutorials/layers.md b/tensorflow/docs_src/tutorials/layers.md
index e808a3677f2a3e89597ef82cc86dd3646775d693..b898cbe29c2bac9ade341fe3b3566e42e133fc5b 100644
--- a/tensorflow/docs_src/tutorials/layers.md
+++ b/tensorflow/docs_src/tutorials/layers.md
@@ -169,9 +169,7 @@ def cnn_model_fn(features, labels, mode):
     return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
 
   # Calculate Loss (for both TRAIN and EVAL modes)
-  onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
   # Configure the Training Op (for TRAIN mode)
   if mode == tf.estimator.ModeKeys.TRAIN:
@@ -192,7 +190,7 @@ def cnn_model_fn(features, labels, mode):
 The following sections (with headings corresponding to each code block above)
 dive deeper into the `tf.layers` code used to create each layer, as well as how
 to calculate loss, configure the training op, and generate predictions. If
-you're already experienced with CNNs and @{$extend/estimators$TensorFlow `Estimator`s},
+you're already experienced with CNNs and @{$get_started/custom_estimators$TensorFlow `Estimator`s},
 and find the above code intuitive, you may want to skim these sections or just
 skip ahead to ["Training and Evaluating the CNN MNIST
 Classifier"](#training-and-evaluating-the-cnn-mnist-classifier).
@@ -536,8 +534,8 @@ if mode == tf.estimator.ModeKeys.TRAIN:
 ```
 
 > Note: For a more in-depth look at configuring training ops for Estimator model
-> functions, see @{$extend/estimators#defining-the-training-op-for-the-model$"Defining
-> the training op for the model"} in the @{$extend/estimators$"Creating Estimations in
+> functions, see @{$get_started/custom_estimators#defining-the-training-op-for-the-model$"Defining
+> the training op for the model"} in the @{$get_started/custom_estimators$"Creating Estimations in
 > tf.estimator"} tutorial.
 
 ### Add evaluation metrics
@@ -601,7 +599,7 @@ be saved (here, we specify the temp directory `/tmp/mnist_convnet_model`, but
 feel free to change to another directory of your choice).
 
 > Note: For an in-depth walkthrough of the TensorFlow `Estimator` API, see the
-> tutorial @{$extend/estimators$"Creating Estimators in tf.estimator."}
+> tutorial @{$get_started/custom_estimators$"Creating Estimators in tf.estimator."}
 
 ### Set Up a Logging Hook {#set_up_a_logging_hook}
 
@@ -720,10 +718,9 @@ Here, we've achieved an accuracy of 97.3% on our test data set.
 To learn more about TensorFlow Estimators and CNNs in TensorFlow, see the
 following resources:
 
-*   @{$extend/estimators$Creating Estimators in tf.estimator}. An
-    introduction to the TensorFlow Estimator API, which walks through
+*   @{$get_started/custom_estimators$Creating Estimators in tf.estimator}
+    provides an introduction to the TensorFlow Estimator API. It walks through
     configuring an Estimator, writing a model function, calculating loss, and
     defining a training op.
-*   @{$pros#build-a-multilayer-convolutional-network$Deep MNIST for Experts: Building a Multilayer CNN}. Walks
-    through how to build a MNIST CNN classification model *without layers* using
-    lower-level TensorFlow operations.
+*   @{$deep_cnn} walks through how to build a MNIST CNN classification model
+    *without estimators* using lower-level TensorFlow operations.
diff --git a/tensorflow/docs_src/tutorials/leftnav_files b/tensorflow/docs_src/tutorials/leftnav_files
index 5a5d6ca558867e1c8f3dca221a98ca7c0a7ee986..888052428f951fa1a7cbd9c6d35497a056387097 100644
--- a/tensorflow/docs_src/tutorials/leftnav_files
+++ b/tensorflow/docs_src/tutorials/leftnav_files
@@ -1,16 +1,23 @@
 index.md
-using_gpu.md
-image_recognition.md
-image_retraining.md
-layers.md
+
+### Images
+layers.md: MNIST
+image_recognition.md: Image Recognition
+image_retraining.md: Image Retraining
 deep_cnn.md
-word2vec.md
+
+### Sequences
 recurrent.md
-seq2seq.md
-linear.md
-wide.md
-wide_and_deep.md
-kernel_methods.md
+seq2seq.md: Neural Machine Translation
+recurrent_quickdraw.md: Drawing Classification
 audio_recognition.md
+
+### Data Representation
+wide.md: Linear Models
+wide_and_deep.md: Wide & Deep Learning
+word2vec.md
+kernel_methods.md: Kernel Methods
+
+### Non-ML
 mandelbrot.md
 pdes.md
diff --git a/tensorflow/docs_src/tutorials/linear.md b/tensorflow/docs_src/tutorials/linear.md
index d333d01279067de47819410795505f731e14fed3..265ded877d1ff9fb0b1cc2ad678729a3b7247aa8 100644
--- a/tensorflow/docs_src/tutorials/linear.md
+++ b/tensorflow/docs_src/tutorials/linear.md
@@ -1,36 +1,37 @@
 # Large-scale Linear Models with TensorFlow
 
-The tf.estimator API provides (among other things) a rich set of tools for
+@{tf.estimator$Estimators} provides (among other things) a rich set of tools for
 working with linear models in TensorFlow. This document provides an overview of
 those tools. It explains:
 
-   * what a linear model is.
-   * why you might want to use a linear model.
-   * how tf.estimator makes it easy to build linear models in TensorFlow.
-   * how you can use tf.estimator to combine linear models with
-   deep learning to get the advantages of both.
+   * What a linear model is.
+   * Why you might want to use a linear model.
+   * How Estimators make it easy to build linear models in TensorFlow.
+   * How you can use Estimators to combine linear models with.
+     deep learning to get the advantages of both.
 
-Read this overview to decide whether the tf.estimator linear model tools might
+Read this overview to decide whether the Estimator's linear model tools  might
 be useful to you. Then do the @{$wide$Linear Models tutorial} to
 give it a try. This overview uses code samples from the tutorial, but the
 tutorial walks through the code in greater detail.
 
 To understand this overview it will help to have some familiarity
-with basic machine learning concepts, and also with @{$get_started/estimator$`tf.estimator`}.
+with basic machine learning concepts, and also with
+@{$get_started/premade_estimators$Estimators}.
 
 [TOC]
 
 ## What is a linear model?
 
-A *linear model* uses a single weighted sum of features to make a prediction.
+A **linear model** uses a single weighted sum of features to make a prediction.
 For example, if you have [data](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names)
 on age, years of education, and weekly hours of
-work for a population, you can learn weights for each of those numbers so that
+work for a population, a model can learn weights for each of those numbers so that
 their weighted sum estimates a person's salary. You can also use linear models
 for classification.
 
 Some linear models transform the weighted sum into a more convenient form. For
-example, *logistic regression* plugs the weighted sum into the logistic
+example, [**logistic regression**](https://developers.google.com/machine-learning/glossary/#logistic_regression) plugs the weighted sum into the logistic
 function to turn the output into a value between 0 and 1. But you still just
 have one weight for each input feature.
 
@@ -51,10 +52,10 @@ Linear models:
    * provide an excellent starting point for learning about machine learning.
    * are widely used in industry.
 
-## How does tf.estimator help you build linear models?
+## How do Estimators help you build linear models?
 
 You can build a linear model from scratch in TensorFlow without the help of a
-special API. But tf.estimator provides some tools that make it easier to build
+special API. But Estimators provides some tools that make it easier to build
 effective large-scale linear models.
 
 ### Feature columns and transformations
@@ -86,10 +87,10 @@ become [0, 1, 0] and 'green' would become [0, 0, 1]. These vectors are called
 "sparse" because they may be very long, with many zeros, when the set of
 possible values is very large (such as all English words).
 
-While you don't need to use categorical columns to use tf.estimator linear
-models, one of the strengths of linear models is their ability to deal with
-large sparse vectors. Sparse features are a primary use case for the
-tf.estimator linear model tools.
+While you don't need to use categorical columns to use the linear model tools
+provided by Estimators, one of the strengths of linear models is their ability
+to deal with large sparse vectors. Sparse features are a primary use case for
+the linear model tools provided by Estimators.
 
 ##### Encoding sparse columns
 
@@ -173,7 +174,7 @@ the data itself. You provide the data through an input function.
 The input function must return a dictionary of tensors. Each key corresponds to
 the name of a `FeatureColumn`. Each key's value is a tensor containing the
 values of that feature for all data instances. See
-@{$input_fn$Building Input Functions with tf.estimator} for a
+@{$premade_estimators#input_fn} for a
 more comprehensive look at input functions, and `input_fn` in the
 [linear models tutorial code](https://github.com/tensorflow/models/tree/master/official/wide_deep/wide_deep.py)
 for an example implementation of an input function.
@@ -220,7 +221,7 @@ for key in sorted(results):
 
 ### Wide and deep learning
 
-The tf.estimator API also provides an estimator class that lets you jointly
+The `tf.estimator` module also provides an estimator class that lets you jointly
 train a linear model and a deep neural network. This novel approach combines the
 ability of linear models to "memorize" key features with the generalization
 ability of neural nets. Use `tf.estimator.DNNLinearCombinedClassifier` to
diff --git a/tensorflow/docs_src/tutorials/recurrent.md b/tensorflow/docs_src/tutorials/recurrent.md
index 3bae9bb457a0696722d239e664207a4d8021f0d8..14da2c8785276abb34d6959d738f5b39e6c6a2e8 100644
--- a/tensorflow/docs_src/tutorials/recurrent.md
+++ b/tensorflow/docs_src/tutorials/recurrent.md
@@ -57,6 +57,7 @@ important to note that `current_batch_of_words` does not correspond to a
 TensorFlow will automatically sum the gradients of each batch for you.
 
 For example:
+
 ```
  t=0  t=1    t=2  t=3     t=4
 [The, brown, fox, is,     quick]
diff --git a/tensorflow/docs_src/tutorials/recurrent_quickdraw.md b/tensorflow/docs_src/tutorials/recurrent_quickdraw.md
new file mode 100644
index 0000000000000000000000000000000000000000..e22536adb6f0b893602ff79612cfb01e10586a18
--- /dev/null
+++ b/tensorflow/docs_src/tutorials/recurrent_quickdraw.md
@@ -0,0 +1,410 @@
+# Recurrent Neural Networks for Drawing Classification
+
+[Quick, Draw!]: http://quickdraw.withgoogle.com
+
+[Quick, Draw!] is a game where a player is challenged to draw a number of
+objects and see if a computer can recognize the drawing.
+
+The recognition in [Quick, Draw!] is performed by a classifier that takes the
+user input, given as a sequence of strokes of points in x and y, and recognizes
+the object category that the user tried to draw.
+
+In this tutorial we'll show how to build an RNN-based recognizer for this
+problem. The model will use a combination of convolutional layers, LSTM layers,
+and a softmax output layer to classify the drawings:
+
+<center> ![RNN model structure](../images/quickdraw_model.png) </center>
+
+The figure above shows the structure of the model that we will build in this
+tutorial. The input is a drawing that is encoded as a sequence of strokes of
+points in x, y, and n, where n indicates whether a the point is the first point
+in a new stroke.
+
+Then, a series of 1-dimensional convolutions is applied. Then LSTM layers are
+applied and the sum of the outputs of all LSTM steps is fed into a softmax layer
+to make a classification decision among the classes of drawings that we know.
+
+This tutorial uses the data from actual [Quick, Draw!] games [that is publicly
+available](https://quickdraw.withgoogle.com/data). This dataset contains of 50M
+drawings in 345 categories.
+
+## Run the tutorial code
+
+To try the code for this tutorial:
+
+1.  @{$install$Install TensorFlow} if you haven't already.
+1.  Download the [tutorial code]
+(https://github.com/tensorflow/models/tree/master/tutorials/rnn/quickdraw/train_model.py).
+1.  [Download the data](#download-the-data) in `TFRecord` format from
+    [here](http://download.tensorflow.org/data/quickdraw_tutorial_dataset_v1.tar.gz) and unzip it. More details about [how to
+    obtain the original Quick, Draw!
+    data](#optional-download-the-full-quick-draw-data) and [how to convert that
+    to `TFRecord` files](#optional-converting-the-data) is available below.
+
+1.  Execute the tutorial code with the following command to train the RNN-based
+    model described in this tutorial. Make sure to adjust the paths to point to
+    the unzipped data from the download in step 3.
+
+```shell
+  python train_model.py \
+    --training_data=rnn_tutorial_data/training.tfrecord-?????-of-????? \
+    --eval_data=rnn_tutorial_data/eval.tfrecord-?????-of-????? \
+    --classes_file=rnn_tutorial_data/training.tfrecord.classes
+```
+
+## Tutorial details
+
+### Download the data
+
+We make the data that we use in this tutorial available as `TFRecord` files
+containing `TFExamples`. You can download the data from here:
+
+http://download.tensorflow.org/data/quickdraw_tutorial_dataset_v1.tar.gz
+
+Alternatively you can download the original data in `ndjson` format from the
+Google cloud and convert it to the `TFRecord` files containing `TFExamples`
+yourself as described in the next section.
+
+### Optional: Download the full Quick Draw Data
+
+The full [Quick, Draw!](https://quickdraw.withgoogle.com)
+[dataset](https://quickdraw.withgoogle.com/data) is available on Google Cloud
+Storage as [ndjson](http://ndjson.org/) files separated by category. You can
+[browse the list of files in Cloud
+Console](https://console.cloud.google.com/storage/quickdraw_dataset).
+
+To download the data we recommend using
+[gsutil](https://cloud.google.com/storage/docs/gsutil_install#install) to
+download the entire dataset. Note that the original .ndjson files require
+downloading ~22GB.
+
+Then use the following command to check that your gsutil installation works and
+that you can access the data bucket:
+
+```shell
+gsutil ls -r "gs://quickdraw_dataset/full/simplified/*"
+```
+
+which will output a long list of files like the following:
+
+```shell
+gs://quickdraw_dataset/full/simplified/The Eiffel Tower.ndjson
+gs://quickdraw_dataset/full/simplified/The Great Wall of China.ndjson
+gs://quickdraw_dataset/full/simplified/The Mona Lisa.ndjson
+gs://quickdraw_dataset/full/simplified/aircraft carrier.ndjson
+...
+```
+
+Then create a folder and download the dataset there.
+
+```shell
+mkdir rnn_tutorial_data
+cd rnn_tutorial_data
+gsutil -m cp "gs://quickdraw_dataset/full/simplified/*" .
+```
+
+This download will take a while and download a bit more than 23GB of data.
+
+### Optional: Converting the data
+
+To convert the `ndjson` files to
+@{$python/python_io#tfrecords_format_details$TFRecord} files containing
+${tf.train.Example} protos run the following command.
+
+```shell
+   python create_dataset.py --ndjson_path rnn_tutorial_data \
+      --output_path rnn_tutorial_data
+```
+
+This will store the data in 10 shards of
+@{$python/python_io#tfrecords_format_details$TFRecord} files with 10000 items
+per class for the training data and 1000 items per class as eval data.
+
+This conversion process is described in more detail in the following.
+
+The original QuickDraw data is formatted as `ndjson` files where each line
+contains a JSON object like the following:
+
+```json
+{"word":"cat",
+ "countrycode":"VE",
+ "timestamp":"2017-03-02 23:25:10.07453 UTC",
+ "recognized":true,
+ "key_id":"5201136883597312",
+ "drawing":[
+   [
+     [130,113,99,109,76,64,55,48,48,51,59,86,133,154,170,203,214,217,215,208,186,176,162,157,132],
+     [72,40,27,79,82,88,100,120,134,152,165,184,189,186,179,152,131,114,100,89,76,0,31,65,70]
+   ],[
+     [76,28,7],
+     [136,128,128]
+   ],[
+     [76,23,0],
+     [160,164,175]
+   ],[
+     [87,52,37],
+     [175,191,204]
+   ],[
+     [174,220,246,251],
+     [134,132,136,139]
+   ],[
+     [175,255],
+     [147,168]
+   ],[
+     [171,208,215],
+     [164,198,210]
+   ],[
+     [130,110,108,111,130,139,139,119],
+     [129,134,137,144,148,144,136,130]
+   ],[
+     [107,106],
+     [96,113]
+   ]
+ ]
+}
+```
+
+For our purpose of building a classifier we only care about the fields "`word`"
+and "`drawing`". While parsing the ndjson files, we process them line by line
+using a function that converts the strokes from the `drawing` field into a
+tensor of size `[number of points, 3]` containing the differences of consecutive
+points. This function also returns the class name as a string.
+
+```python
+def parse_line(ndjson_line):
+  """Parse an ndjson line and return ink (as np array) and classname."""
+  sample = json.loads(ndjson_line)
+  class_name = sample["word"]
+  inkarray = sample["drawing"]
+  stroke_lengths = [len(stroke[0]) for stroke in inkarray]
+  total_points = sum(stroke_lengths)
+  np_ink = np.zeros((total_points, 3), dtype=np.float32)
+  current_t = 0
+  for stroke in inkarray:
+    for i in [0, 1]:
+      np_ink[current_t:(current_t + len(stroke[0])), i] = stroke[i]
+    current_t += len(stroke[0])
+    np_ink[current_t - 1, 2] = 1  # stroke_end
+  # Preprocessing.
+  # 1. Size normalization.
+  lower = np.min(np_ink[:, 0:2], axis=0)
+  upper = np.max(np_ink[:, 0:2], axis=0)
+  scale = upper - lower
+  scale[scale == 0] = 1
+  np_ink[:, 0:2] = (np_ink[:, 0:2] - lower) / scale
+  # 2. Compute deltas.
+  np_ink = np_ink[1:, 0:2] - np_ink[0:-1, 0:2]
+  return np_ink, class_name
+```
+
+Since we want the data to be shuffled for writing we read from each of the
+category files in random order and write to a random shard.
+
+For the training data we read the first 10000 items for each class and for the
+eval data we read the next 1000 items for each class.
+
+This data is then reformatted into a tensor of shape `[num_training_samples,
+max_length, 3]`. Then we determine the bounding box of the original drawing in
+screen coordinates and normalize the size such that the drawing has unit height.
+
+<center> ![Size normalization](../images/quickdraw_sizenormalization.png) </center>
+
+Finally, we compute the differences between consecutive points and store these
+as a `VarLenFeature` in a
+[tensorflow.Example](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
+under the key `ink`. In addition we store the `class_index` as a single entry
+`FixedLengthFeature` and the `shape` of the `ink` as a `FixedLengthFeature` of
+length 2.
+
+### Defining the model
+
+To define the model we create a new `Estimator`. If you want to read more about
+estimators, we recommend @{$get_started/custom_estimators$this tutorial}.
+
+To build the model, we:
+
+1.  reshape the input back into the original shape - where the mini batch is
+    padded to the maximal length of its contents. In addition to the ink data we
+    also have the lengths for each example and the target class. This happens in
+    the function [`_get_input_tensors`](#-get-input-tensors).
+
+1.  pass the input through to a series of convolution layers in
+    [`_add_conv_layers`](#-add-conv-layers).
+
+1.  pass the output of the convolutions into a series of bidirectional LSTM
+    layers in [`_add_rnn_layers`](#-add-rnn-layers). At the end of that, the
+    outputs for each time step are summed up to have a compact, fixed length
+    embedding of the input.
+
+1.  classify this embedding using a softmax layer in
+    [`_add_fc_layers`](#-add-fc-layers).
+
+In code this looks like:
+
+```python
+inks, lengths, targets = _get_input_tensors(features, targets)
+convolved = _add_conv_layers(inks)
+final_state = _add_rnn_layers(convolved, lengths)
+logits =_add_fc_layers(final_state)
+```
+
+### _get_input_tensors
+
+To obtain the input features we first obtain the shape from the features dict
+and then create a 1D tensor of size `[batch_size]` containing the lengths of the
+input sequences. The ink is stored as a SparseTensor in the features dict which
+we convert into a dense tensor and then reshape to be `[batch_size, ?, 3]`. And
+finally, if targets were passed in we make sure they are stored as a 1D tensor
+of size `[batch_size]`
+
+In code this looks like this:
+
+```python
+shapes = features["shape"]
+lengths = tf.squeeze(
+    tf.slice(shapes, begin=[0, 0], size=[params["batch_size"], 1]))
+inks = tf.reshape(
+    tf.sparse_tensor_to_dense(features["ink"]),
+    [params["batch_size"], -1, 3])
+if targets is not None:
+  targets = tf.squeeze(targets)
+```
+
+### _add_conv_layers
+
+The desired number of convolution layers and the lengths of the filters is
+configured through the parameters `num_conv` and `conv_len` in the `params`
+dict.
+
+The input is a sequence where each point has dimensionality 3. We are going to
+use 1D convolutions where we treat the 3 input features as channels. That means
+that the input is a `[batch_size, length, 3]` tensor and the output will be a
+`[batch_size, length, number_of_filters]` tensor.
+
+```python
+convolved = inks
+for i in range(len(params.num_conv)):
+  convolved_input = convolved
+  if params.batch_norm:
+    convolved_input = tf.layers.batch_normalization(
+        convolved_input,
+        training=(mode == tf.estimator.ModeKeys.TRAIN))
+  # Add dropout layer if enabled and not first convolution layer.
+  if i > 0 and params.dropout:
+    convolved_input = tf.layers.dropout(
+        convolved_input,
+        rate=params.dropout,
+        training=(mode == tf.estimator.ModeKeys.TRAIN))
+  convolved = tf.layers.conv1d(
+      convolved_input,
+      filters=params.num_conv[i],
+      kernel_size=params.conv_len[i],
+      activation=None,
+      strides=1,
+      padding="same",
+      name="conv1d_%d" % i)
+return convolved, lengths
+```
+
+### _add_rnn_layers
+
+We pass the output from the convolutions into bidirectional LSTM layers for
+which we use a helper function from contrib.
+
+```python
+outputs, _, _ = contrib_rnn.stack_bidirectional_dynamic_rnn(
+    cells_fw=[cell(params.num_nodes) for _ in range(params.num_layers)],
+    cells_bw=[cell(params.num_nodes) for _ in range(params.num_layers)],
+    inputs=convolved,
+    sequence_length=lengths,
+    dtype=tf.float32,
+    scope="rnn_classification")
+```
+
+see the code for more details and how to use `CUDA` accelerated implementations.
+
+To create a compact, fixed-length embedding, we sum up the output of the LSTMs.
+We first zero out the regions of the batch where the sequences have no data.
+
+```python
+mask = tf.tile(
+    tf.expand_dims(tf.sequence_mask(lengths, tf.shape(outputs)[1]), 2),
+    [1, 1, tf.shape(outputs)[2]])
+zero_outside = tf.where(mask, outputs, tf.zeros_like(outputs))
+outputs = tf.reduce_sum(zero_outside, axis=1)
+```
+
+### _add_fc_layers
+
+The embedding of the input is passed into a fully connected layer which we then
+use as a softmax layer.
+
+```python
+tf.layers.dense(final_state, params.num_classes)
+```
+
+### Loss, predictions, and optimizer
+
+Finally, we need to add a loss, a training op, and predictions to create the
+`ModelFn`:
+
+```python
+cross_entropy = tf.reduce_mean(
+    tf.nn.sparse_softmax_cross_entropy_with_logits(
+        labels=targets, logits=logits))
+# Add the optimizer.
+train_op = tf.contrib.layers.optimize_loss(
+    loss=cross_entropy,
+    global_step=tf.train.get_global_step(),
+    learning_rate=params.learning_rate,
+    optimizer="Adam",
+    # some gradient clipping stabilizes training in the beginning.
+    clip_gradients=params.gradient_clipping_norm,
+    summaries=["learning_rate", "loss", "gradients", "gradient_norm"])
+predictions = tf.argmax(logits, axis=1)
+return model_fn_lib.ModelFnOps(
+    mode=mode,
+    predictions={"logits": logits,
+                 "predictions": predictions},
+    loss=cross_entropy,
+    train_op=train_op,
+    eval_metric_ops={"accuracy": tf.metrics.accuracy(targets, predictions)})
+```
+
+### Training and evaluating the model
+
+To train and evaluate the model we can rely on the functionalities of the
+`Estimator` APIs and easily run training and evaluation with the `Experiment`
+APIs:
+
+```python
+  estimator = tf.estimator.Estimator(
+      model_fn=model_fn,
+      model_dir=output_dir,
+      config=config,
+      params=model_params)
+  # Train the model.
+  tf.contrib.learn.Experiment(
+      estimator=estimator,
+      train_input_fn=get_input_fn(
+          mode=tf.contrib.learn.ModeKeys.TRAIN,
+          tfrecord_pattern=FLAGS.training_data,
+          batch_size=FLAGS.batch_size),
+      train_steps=FLAGS.steps,
+      eval_input_fn=get_input_fn(
+          mode=tf.contrib.learn.ModeKeys.EVAL,
+          tfrecord_pattern=FLAGS.eval_data,
+          batch_size=FLAGS.batch_size),
+      min_eval_frequency=1000)
+```
+
+Note that this tutorial is just a quick example on a relatively small dataset to
+get you familiar with the APIs of recurrent neural networks and estimators. Such
+models can be even more powerful if you try them on a large dataset.
+
+When training the model for 1M steps you can expect to get an accuracy of
+approximately of approximately 70% on the top-1 candidate. Note that this
+accuracy is sufficient to build the quickdraw game because of the game dynamics
+the user will be able to adjust their drawing until it is ready. Also, the game
+does not use the top-1 candidate only but accepts a drawing as correct if the
+target category shows up with a score better than a fixed threshold.
diff --git a/tensorflow/docs_src/tutorials/wide.md b/tensorflow/docs_src/tutorials/wide.md
index 68dda1f2222b4175cd891d727065c93da6a5e68f..005dc020f94f666da295f4ff0342fae858121012 100644
--- a/tensorflow/docs_src/tutorials/wide.md
+++ b/tensorflow/docs_src/tutorials/wide.md
@@ -55,7 +55,7 @@ and continuous columns:
 
 Here's a list of columns available in the Census Income dataset:
 
-| Column Name    | Type        | Description                       | {.sortable}
+| Column Name    | Type        | Description                       |
 | -------------- | ----------- | --------------------------------- |
 | age            | Continuous  | The age of the individual         |
 | workclass      | Categorical | The type of employer the          |
@@ -82,7 +82,7 @@ Here's a list of columns available in the Census Income dataset:
 | hours_per_week | Continuous  | Hours worked per week.            |
 | native_country | Categorical | Country of origin of the          |
 :                :             : individual.                       :
-| income         | Categorical | ">50K" or "<=50K", meaning        |
+| income_bracket | Categorical | ">50K" or "<=50K", meaning        |
 :                :             : whether the person makes more     :
 :                :             : than $50,000 annually.            :
 
diff --git a/tensorflow/examples/android/BUILD b/tensorflow/examples/android/BUILD
index 46df5973e89a1a87c79bde95262d04b5be88f54e..12146477972a116903f731a03b9755aafd92acc1 100644
--- a/tensorflow/examples/android/BUILD
+++ b/tensorflow/examples/android/BUILD
@@ -92,7 +92,7 @@ android_binary(
 filegroup(
     name = "external_assets",
     srcs = [
-        "@inception5h//:model_files",
+        "@inception_v1//:model_files",
         "@mobile_ssd//:model_files",
         "@speech_commands//:model_files",
         "@stylize//:model_files",
diff --git a/tensorflow/examples/android/README.md b/tensorflow/examples/android/README.md
index 79202a38d7199033a9fefa8c6ba71e383aa0bf19..30a26d13c5734c5cf4a3b565c793db3e093c8271 100644
--- a/tensorflow/examples/android/README.md
+++ b/tensorflow/examples/android/README.md
@@ -126,6 +126,10 @@ the Android NDK and SDK must be installed on your system.
 2.  The Android NDK is required to build the native (C/C++) TensorFlow code. The
     current recommended version is 14b, which may be found
     [here](https://developer.android.com/ndk/downloads/older_releases.html#ndk-14b-downloads).
+
+      * NDK 16, the revision released in November 2017, is **incompatible** with
+        Bazel. See [here](https://github.com/tensorflow/tensorflow/issues/14918).
+
 3.  The Android SDK and build tools may be obtained
     [here](https://developer.android.com/tools/revisions/build-tools.html), or
     alternatively as part of [Android
@@ -133,8 +137,16 @@ the Android NDK and SDK must be installed on your system.
     23 is required to build the TF Android demo (though it will run on API >= 21
     devices).
 
+      - The Android Studio SDK Manager's NDK installer will install the latest
+        revision of the NDK, which is **incompatible** with Bazel. You'll need
+        to download an older version manually, as (2) suggests.
+
 ##### Edit WORKSPACE
 
+NOTE: As long as you have the SDK and NDK installed, the `./configure` script
+will create these rules for you. Answer "Yes" when the script asks to
+automatically configure the `./WORKSPACE`.
+
 The Android entries in
 [`<workspace_root>/WORKSPACE`](../../../WORKSPACE#L19-L36) must be uncommented
 with the paths filled in appropriately depending on where you installed the NDK
@@ -156,7 +168,7 @@ download-models.gradle.
 
 **Optional**: If you wish to place the models in your assets manually, remove
 all of the `model_files` entries from the `assets` list in `tensorflow_demo`
-found in the `[BUILD](BUILD)` file. Then download and extract the archives
+found in the [`BUILD`](BUILD#L92) file. Then download and extract the archives
 yourself to the `assets` directory in the source tree:
 
 ```bash
diff --git a/tensorflow/examples/android/build.gradle b/tensorflow/examples/android/build.gradle
index 48f566f825d2714fe5970531e3d9c9f0f7ca940e..0767726aa9a248fb073fbd4114f47d1b4ed6901b 100644
--- a/tensorflow/examples/android/build.gradle
+++ b/tensorflow/examples/android/build.gradle
@@ -28,8 +28,8 @@ buildscript {
     }
 
     dependencies {
-        classpath 'com.android.tools.build:gradle:2.3.0'
-        classpath 'org.apache.httpcomponents:httpclient:4.5.2'
+        classpath 'com.android.tools.build:gradle:3.0.1'
+        classpath 'org.apache.httpcomponents:httpclient:4.5.4'
     }
 }
 
@@ -56,10 +56,12 @@ def nativeOutDir = 'libs/' + cpuType
 def nativeBuildRule = 'buildNativeBazel'
 def demoLibPath = '../../../bazel-bin/tensorflow/examples/android/libtensorflow_demo.so'
 def inferenceLibPath = '../../../bazel-bin/tensorflow/contrib/android/libtensorflow_inference.so'
+
+// Override for Makefile builds.
 if (nativeBuildSystem == 'makefile') {
     nativeBuildRule = 'buildNativeMake'
-    demoLibPath = '../../../tensorflow/contrib/makefile/gen/lib/libtensorflow_demo.so'
-    inferenceLibPath = '../../../tensorflow/contrib/makefile/gen/lib/libtensorflow_inference.so'
+    demoLibPath = '../../../tensorflow/contrib/makefile/gen/lib/android_' + cpuType + '/libtensorflow_demo.so'
+    inferenceLibPath = '../../../tensorflow/contrib/makefile/gen/lib/android_' + cpuType + '/libtensorflow_inference.so'
 }
 
 // If building with Bazel, this is the location of the bazel binary.
@@ -75,7 +77,7 @@ apply plugin: 'com.android.application'
 
 android {
     compileSdkVersion 23
-    buildToolsVersion "25.0.2"
+    buildToolsVersion '26.0.2'
 
     if (nativeBuildSystem == 'cmake') {
         defaultConfig {
@@ -154,7 +156,8 @@ task buildNativeMake(type: Exec) {
          '-s',  \
          'tensorflow/contrib/makefile/sub_makefiles/android/Makefile.in',  \
          '-t',  \
-         'libtensorflow_inference.so libtensorflow_demo.so'  \
+         'libtensorflow_inference.so libtensorflow_demo.so all'  \
+         , '-a', cpuType  \
          //, '-T'  // Uncomment to skip protobuf and speed up subsequent builds.
 }
 
diff --git a/tensorflow/examples/android/download-models.gradle b/tensorflow/examples/android/download-models.gradle
index 0e2cf65f538f49779b851c3f84259bf839ea90ef..d3b67eab52bfbcf006755bb36396a0d71fb66f77 100644
--- a/tensorflow/examples/android/download-models.gradle
+++ b/tensorflow/examples/android/download-models.gradle
@@ -9,7 +9,7 @@
  */
 // hard coded model files
 // LINT.IfChange
-def models = ['inception5h.zip',
+def models = ['inception_v1.zip',
               'object_detection/ssd_mobilenet_v1_android_export.zip',
               'stylize_v1.zip',
               'speech_commands_conv_actions.zip']
diff --git a/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.jar b/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.jar
new file mode 100644
index 0000000000000000000000000000000000000000..13372aef5e24af05341d49695ee84e5f9b594659
Binary files /dev/null and b/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.jar differ
diff --git a/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.properties b/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.properties
new file mode 100644
index 0000000000000000000000000000000000000000..bd9ee87db3742e9f8c62df2ec9a7852550d9bbc9
--- /dev/null
+++ b/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.properties
@@ -0,0 +1,6 @@
+#Sat Nov 18 15:06:47 CET 2017
+distributionBase=GRADLE_USER_HOME
+distributionPath=wrapper/dists
+zipStoreBase=GRADLE_USER_HOME
+zipStorePath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-4.1-all.zip
diff --git a/tensorflow/examples/android/gradlew b/tensorflow/examples/android/gradlew
new file mode 100644
index 0000000000000000000000000000000000000000..9d82f78915133e1c35a6ea51252590fb38efac2f
--- /dev/null
+++ b/tensorflow/examples/android/gradlew
@@ -0,0 +1,160 @@
+#!/usr/bin/env bash
+
+##############################################################################
+##
+##  Gradle start up script for UN*X
+##
+##############################################################################
+
+# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+DEFAULT_JVM_OPTS=""
+
+APP_NAME="Gradle"
+APP_BASE_NAME=`basename "$0"`
+
+# Use the maximum available, or set MAX_FD != -1 to use that value.
+MAX_FD="maximum"
+
+warn ( ) {
+    echo "$*"
+}
+
+die ( ) {
+    echo
+    echo "$*"
+    echo
+    exit 1
+}
+
+# OS specific support (must be 'true' or 'false').
+cygwin=false
+msys=false
+darwin=false
+case "`uname`" in
+  CYGWIN* )
+    cygwin=true
+    ;;
+  Darwin* )
+    darwin=true
+    ;;
+  MINGW* )
+    msys=true
+    ;;
+esac
+
+# Attempt to set APP_HOME
+# Resolve links: $0 may be a link
+PRG="$0"
+# Need this for relative symlinks.
+while [ -h "$PRG" ] ; do
+    ls=`ls -ld "$PRG"`
+    link=`expr "$ls" : '.*-> \(.*\)$'`
+    if expr "$link" : '/.*' > /dev/null; then
+        PRG="$link"
+    else
+        PRG=`dirname "$PRG"`"/$link"
+    fi
+done
+SAVED="`pwd`"
+cd "`dirname \"$PRG\"`/" >/dev/null
+APP_HOME="`pwd -P`"
+cd "$SAVED" >/dev/null
+
+CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
+
+# Determine the Java command to use to start the JVM.
+if [ -n "$JAVA_HOME" ] ; then
+    if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
+        # IBM's JDK on AIX uses strange locations for the executables
+        JAVACMD="$JAVA_HOME/jre/sh/java"
+    else
+        JAVACMD="$JAVA_HOME/bin/java"
+    fi
+    if [ ! -x "$JAVACMD" ] ; then
+        die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+    fi
+else
+    JAVACMD="java"
+    which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+fi
+
+# Increase the maximum file descriptors if we can.
+if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
+    MAX_FD_LIMIT=`ulimit -H -n`
+    if [ $? -eq 0 ] ; then
+        if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
+            MAX_FD="$MAX_FD_LIMIT"
+        fi
+        ulimit -n $MAX_FD
+        if [ $? -ne 0 ] ; then
+            warn "Could not set maximum file descriptor limit: $MAX_FD"
+        fi
+    else
+        warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
+    fi
+fi
+
+# For Darwin, add options to specify how the application appears in the dock
+if $darwin; then
+    GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
+fi
+
+# For Cygwin, switch paths to Windows format before running java
+if $cygwin ; then
+    APP_HOME=`cygpath --path --mixed "$APP_HOME"`
+    CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
+    JAVACMD=`cygpath --unix "$JAVACMD"`
+
+    # We build the pattern for arguments to be converted via cygpath
+    ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
+    SEP=""
+    for dir in $ROOTDIRSRAW ; do
+        ROOTDIRS="$ROOTDIRS$SEP$dir"
+        SEP="|"
+    done
+    OURCYGPATTERN="(^($ROOTDIRS))"
+    # Add a user-defined pattern to the cygpath arguments
+    if [ "$GRADLE_CYGPATTERN" != "" ] ; then
+        OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
+    fi
+    # Now convert the arguments - kludge to limit ourselves to /bin/sh
+    i=0
+    for arg in "$@" ; do
+        CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
+        CHECK2=`echo "$arg"|egrep -c "^-"`                                 ### Determine if an option
+
+        if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then                    ### Added a condition
+            eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
+        else
+            eval `echo args$i`="\"$arg\""
+        fi
+        i=$((i+1))
+    done
+    case $i in
+        (0) set -- ;;
+        (1) set -- "$args0" ;;
+        (2) set -- "$args0" "$args1" ;;
+        (3) set -- "$args0" "$args1" "$args2" ;;
+        (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
+        (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
+        (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
+        (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
+        (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
+        (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
+    esac
+fi
+
+# Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
+function splitJvmOpts() {
+    JVM_OPTS=("$@")
+}
+eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
+JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
+
+exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
diff --git a/tensorflow/examples/android/gradlew.bat b/tensorflow/examples/android/gradlew.bat
new file mode 100644
index 0000000000000000000000000000000000000000..8a0b282aa6885fb573c106b3551f7275c5f17e8e
--- /dev/null
+++ b/tensorflow/examples/android/gradlew.bat
@@ -0,0 +1,90 @@
+@if "%DEBUG%" == "" @echo off
+@rem ##########################################################################
+@rem
+@rem  Gradle startup script for Windows
+@rem
+@rem ##########################################################################
+
+@rem Set local scope for the variables with windows NT shell
+if "%OS%"=="Windows_NT" setlocal
+
+@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+set DEFAULT_JVM_OPTS=
+
+set DIRNAME=%~dp0
+if "%DIRNAME%" == "" set DIRNAME=.
+set APP_BASE_NAME=%~n0
+set APP_HOME=%DIRNAME%
+
+@rem Find java.exe
+if defined JAVA_HOME goto findJavaFromJavaHome
+
+set JAVA_EXE=java.exe
+%JAVA_EXE% -version >NUL 2>&1
+if "%ERRORLEVEL%" == "0" goto init
+
+echo.
+echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:findJavaFromJavaHome
+set JAVA_HOME=%JAVA_HOME:"=%
+set JAVA_EXE=%JAVA_HOME%/bin/java.exe
+
+if exist "%JAVA_EXE%" goto init
+
+echo.
+echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:init
+@rem Get command-line arguments, handling Windowz variants
+
+if not "%OS%" == "Windows_NT" goto win9xME_args
+if "%@eval[2+2]" == "4" goto 4NT_args
+
+:win9xME_args
+@rem Slurp the command line arguments.
+set CMD_LINE_ARGS=
+set _SKIP=2
+
+:win9xME_args_slurp
+if "x%~1" == "x" goto execute
+
+set CMD_LINE_ARGS=%*
+goto execute
+
+:4NT_args
+@rem Get arguments from the 4NT Shell from JP Software
+set CMD_LINE_ARGS=%$
+
+:execute
+@rem Setup the command line
+
+set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
+
+@rem Execute Gradle
+"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
+
+:end
+@rem End local scope for the variables with windows NT shell
+if "%ERRORLEVEL%"=="0" goto mainEnd
+
+:fail
+rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
+rem the _cmd.exe /c_ return code!
+if  not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
+exit /b 1
+
+:mainEnd
+if "%OS%"=="Windows_NT" endlocal
+
+:omega
diff --git a/tensorflow/examples/android/jni/object_tracking/config.h b/tensorflow/examples/android/jni/object_tracking/config.h
index 86e9fc71b690f6dfda9658d9f081e990dbb9a612..47de2d2c15b3f7141182efb261a79a40e0da2e93 100644
--- a/tensorflow/examples/android/jni/object_tracking/config.h
+++ b/tensorflow/examples/android/jni/object_tracking/config.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_CONFIG_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_CONFIG_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_CONFIG_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_CONFIG_H_
 
 #include <math.h>
 
@@ -297,4 +297,4 @@ struct TrackerConfig {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_CONFIG_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_CONFIG_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/flow_cache.h b/tensorflow/examples/android/jni/object_tracking/flow_cache.h
index 8813ab6d71846f5ce2e13a2853594de43d95b0b7..b62e334ecd7de55a31e4904c655c0659b0507639 100644
--- a/tensorflow/examples/android/jni/object_tracking/flow_cache.h
+++ b/tensorflow/examples/android/jni/object_tracking/flow_cache.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FLOW_CACHE_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FLOW_CACHE_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FLOW_CACHE_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FLOW_CACHE_H_
 
 #include "tensorflow/examples/android/jni/object_tracking/geom.h"
 #include "tensorflow/examples/android/jni/object_tracking/utils.h"
@@ -303,4 +303,4 @@ class FlowCache {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FLOW_CACHE_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FLOW_CACHE_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/frame_pair.h b/tensorflow/examples/android/jni/object_tracking/frame_pair.h
index 8f409fe80612e0115ca03b01ccfd5f7dd8a5f110..6c8ac9be9810327505f0a4f8c80f7099f060a5da 100644
--- a/tensorflow/examples/android/jni/object_tracking/frame_pair.h
+++ b/tensorflow/examples/android/jni/object_tracking/frame_pair.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FRAME_PAIR_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FRAME_PAIR_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FRAME_PAIR_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FRAME_PAIR_H_
 
 #include "tensorflow/examples/android/jni/object_tracking/keypoint.h"
 
@@ -100,4 +100,4 @@ class FramePair {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FRAME_PAIR_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_FRAME_PAIR_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/geom.h b/tensorflow/examples/android/jni/object_tracking/geom.h
index 2819063616566a8f83b0cdb5beee48ebbb55e2f6..c975e40144b47337482dcbd4120d645f44fcaf7d 100644
--- a/tensorflow/examples/android/jni/object_tracking/geom.h
+++ b/tensorflow/examples/android/jni/object_tracking/geom.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GEOM_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GEOM_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GEOM_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GEOM_H_
 
 #include "tensorflow/examples/android/jni/object_tracking/logging.h"
 #include "tensorflow/examples/android/jni/object_tracking/utils.h"
@@ -316,4 +316,4 @@ inline BoundingSquare GetCenteredSquare(const BoundingBox& original_box) {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GEOM_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GEOM_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/gl_utils.h b/tensorflow/examples/android/jni/object_tracking/gl_utils.h
index bd5c233f4f31ad3a7d99b762911a9fb0acbcd36a..a29e677d3c534cacf41434e53f6ca286d4c1b17c 100755
--- a/tensorflow/examples/android/jni/object_tracking/gl_utils.h
+++ b/tensorflow/examples/android/jni/object_tracking/gl_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GL_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GL_UTILS_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GL_UTILS_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GL_UTILS_H_
 
 #include <GLES/gl.h>
 #include <GLES/glext.h>
@@ -52,4 +52,4 @@ inline static void MapWorldSquareToUnitSquare(const BoundingSquare& square) {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GL_UTILS_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_GL_UTILS_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/image-inl.h b/tensorflow/examples/android/jni/object_tracking/image-inl.h
index 9c4c389aa716e640f9dc7a9953266f65c3b997bd..61d69908b5508de3f2d2f670ba5f926e9901f751 100644
--- a/tensorflow/examples/android/jni/object_tracking/image-inl.h
+++ b/tensorflow/examples/android/jni/object_tracking/image-inl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_INL_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_INL_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_INL_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_INL_H_
 
 #include <stdint.h>
 
@@ -641,4 +641,4 @@ inline void Image<T>::FromArray(const T* const pixels, const int stride,
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_INL_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_INL_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/image.h b/tensorflow/examples/android/jni/object_tracking/image.h
index b7a2301f5e1fc0c29ea2b4dd7f539d3438a65871..a436f0e0a13a695e6713eeafaa565495f0353662 100644
--- a/tensorflow/examples/android/jni/object_tracking/image.h
+++ b/tensorflow/examples/android/jni/object_tracking/image.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_H_
 
 #include <stdint.h>
 
@@ -338,4 +338,4 @@ inline std::ostream& operator<<(std::ostream& stream, const Image<t>& image) {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/image_data.h b/tensorflow/examples/android/jni/object_tracking/image_data.h
index 445cdb57a310cddd6f3b7e4e01ee105080f3fdd9..c4f91d8cbd801db11ce740c23360a3c021e2b548 100644
--- a/tensorflow/examples/android/jni/object_tracking/image_data.h
+++ b/tensorflow/examples/android/jni/object_tracking/image_data.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_DATA_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_DATA_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_DATA_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_DATA_H_
 
 #include <stdint.h>
 #include <memory>
@@ -261,4 +261,4 @@ class ImageData {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_DATA_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_DATA_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/image_utils.h b/tensorflow/examples/android/jni/object_tracking/image_utils.h
index ac9ffd90f8a167199bbcc777df74c11630a1ef41..b4ad7000b3321e5b921187e0aa3cba69a2bfb2a6 100644
--- a/tensorflow/examples/android/jni/object_tracking/image_utils.h
+++ b/tensorflow/examples/android/jni/object_tracking/image_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_UTILS_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_UTILS_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_UTILS_H_
 
 #include <stdint.h>
 
@@ -295,4 +295,4 @@ inline void NormalizeImage(Image<float>* const image) {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_UTILS_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_IMAGE_UTILS_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/integral_image.h b/tensorflow/examples/android/jni/object_tracking/integral_image.h
index 8e82334abf684dba6de8247d013893baa2cda953..caf9b7d2ab88f17ee7fc614175165133c5513356 100755
--- a/tensorflow/examples/android/jni/object_tracking/integral_image.h
+++ b/tensorflow/examples/android/jni/object_tracking/integral_image.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_INTEGRAL_IMAGE_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_INTEGRAL_IMAGE_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_INTEGRAL_IMAGE_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_INTEGRAL_IMAGE_H_
 
 #include "tensorflow/examples/android/jni/object_tracking/geom.h"
 #include "tensorflow/examples/android/jni/object_tracking/image-inl.h"
@@ -184,4 +184,4 @@ class IntegralImage : public Image<uint32_t> {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_INTEGRAL_IMAGE_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_INTEGRAL_IMAGE_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/jni_utils.h b/tensorflow/examples/android/jni/object_tracking/jni_utils.h
index 21fbabb5211ad51ea4c77885c5e8e8135b8aa96e..b81d9e0c1262234cfc6f0c5ba6bdc9a16713283f 100644
--- a/tensorflow/examples/android/jni/object_tracking/jni_utils.h
+++ b/tensorflow/examples/android/jni/object_tracking/jni_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_JNI_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_JNI_UTILS_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_JNI_UTILS_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_JNI_UTILS_H_
 
 #include <stdint.h>
 
diff --git a/tensorflow/examples/android/jni/object_tracking/keypoint.h b/tensorflow/examples/android/jni/object_tracking/keypoint.h
index 719f9aff3f80a2328083aa8fe0bcfff587fb38c6..93405a5b2a83f4bb4ad7d97bef2ff361b3578b94 100644
--- a/tensorflow/examples/android/jni/object_tracking/keypoint.h
+++ b/tensorflow/examples/android/jni/object_tracking/keypoint.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_H_
 
 #include "tensorflow/examples/android/jni/object_tracking/geom.h"
 #include "tensorflow/examples/android/jni/object_tracking/image-inl.h"
@@ -45,4 +45,4 @@ inline std::ostream& operator<<(std::ostream& stream, const Keypoint keypoint) {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/keypoint_detector.h b/tensorflow/examples/android/jni/object_tracking/keypoint_detector.h
index 33d228128d64060123f7aab8b84b23eb87d6fc84..2e85b835a7067b0a1d37908d187680bbc0a91ca6 100644
--- a/tensorflow/examples/android/jni/object_tracking/keypoint_detector.h
+++ b/tensorflow/examples/android/jni/object_tracking/keypoint_detector.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_DETECTOR_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_DETECTOR_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_DETECTOR_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_DETECTOR_H_
 
 #include <stdint.h>
 #include <vector>
@@ -125,4 +125,4 @@ class KeypointDetector {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_DETECTOR_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_KEYPOINT_DETECTOR_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/logging.h b/tensorflow/examples/android/jni/object_tracking/logging.h
index dbc89af2f7ecd52cd1fff449665630fc0107b1af..852a7493993c104e0d0d7837774073dd8355e960 100644
--- a/tensorflow/examples/android/jni/object_tracking/logging.h
+++ b/tensorflow/examples/android/jni/object_tracking/logging.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_LOG_STREAMING_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_LOG_STREAMING_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_LOG_STREAMING_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_LOG_STREAMING_H_
 
 #include <android/log.h>
 #include <string.h>
@@ -118,4 +118,4 @@ void LogPrintF(const int severity, const char* format, ...);
 
 #endif
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_LOG_STREAMING_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_LOG_STREAMING_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/object_detector.h b/tensorflow/examples/android/jni/object_tracking/object_detector.h
index 252556767807a78b0dcbc68c940f5509618cae86..a65c7b0db70bd0fe57826deaab231f545a4fe510 100644
--- a/tensorflow/examples/android/jni/object_tracking/object_detector.h
+++ b/tensorflow/examples/android/jni/object_tracking/object_detector.h
@@ -20,8 +20,8 @@ limitations under the License.
 // Defines the ObjectDetector class that is the main interface for detecting
 // ObjectModelBases in frames.
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_DETECTOR_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_DETECTOR_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_DETECTOR_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_DETECTOR_H_
 
 #include <float.h>
 #include <map>
@@ -227,4 +227,4 @@ class ObjectDetector : public ObjectDetectorBase {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_DETECTOR_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_DETECTOR_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/object_model.h b/tensorflow/examples/android/jni/object_tracking/object_model.h
index be33aea638bf82df60ba151b64bca26fe261402c..5e81c4908080668849a654450cc10e95ec694889 100644
--- a/tensorflow/examples/android/jni/object_tracking/object_model.h
+++ b/tensorflow/examples/android/jni/object_tracking/object_model.h
@@ -19,8 +19,8 @@ limitations under the License.
 
 // Contains ObjectModelBase declaration.
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_DETECTION_OBJECT_MODEL_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_DETECTION_OBJECT_MODEL_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_DETECTION_OBJECT_MODEL_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_DETECTION_OBJECT_MODEL_H_
 
 #ifdef __RENDER_OPENGL__
 #include <GLES/gl.h>
@@ -99,4 +99,4 @@ class ObjectModel : public ObjectModelBase {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_DETECTION_OBJECT_MODEL_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_DETECTION_OBJECT_MODEL_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/object_tracker.h b/tensorflow/examples/android/jni/object_tracking/object_tracker.h
index eb281fad3726cf782c1b937c3a213ba7f926bf88..20c7627fc5f0c0718f67eb230d00a8582b637e2c 100644
--- a/tensorflow/examples/android/jni/object_tracking/object_tracker.h
+++ b/tensorflow/examples/android/jni/object_tracking/object_tracker.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_TRACKER_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_TRACKER_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_TRACKER_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_TRACKER_H_
 
 #include <map>
 #include <string>
@@ -267,4 +267,4 @@ inline std::ostream& operator<<(std::ostream& stream,
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_TRACKER_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OBJECT_TRACKER_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/optical_flow.h b/tensorflow/examples/android/jni/object_tracking/optical_flow.h
index 2206375bebd80e75a9fe2a52609c6d8b3875b65e..f98ae22bd646775871832a40e4c9c0e72916ca4a 100644
--- a/tensorflow/examples/android/jni/object_tracking/optical_flow.h
+++ b/tensorflow/examples/android/jni/object_tracking/optical_flow.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OPTICAL_FLOW_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OPTICAL_FLOW_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OPTICAL_FLOW_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OPTICAL_FLOW_H_
 
 #include "tensorflow/examples/android/jni/object_tracking/geom.h"
 #include "tensorflow/examples/android/jni/object_tracking/image-inl.h"
@@ -97,4 +97,4 @@ class OpticalFlow {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OPTICAL_FLOW_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_OPTICAL_FLOW_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/sprite.h b/tensorflow/examples/android/jni/object_tracking/sprite.h
index 05a13fea111941b2f36daa3694fba1a11ecd411a..b54a68458f108bf736a4daf237d34fc10742e1a6 100755
--- a/tensorflow/examples/android/jni/object_tracking/sprite.h
+++ b/tensorflow/examples/android/jni/object_tracking/sprite.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_SPRITE_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_SPRITE_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_SPRITE_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_SPRITE_H_
 
 #include <GLES/gl.h>
 #include <GLES/glext.h>
@@ -199,4 +199,4 @@ class Sprite {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_SPRITE_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_SPRITE_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/time_log.h b/tensorflow/examples/android/jni/object_tracking/time_log.h
index 60911da396c2e7ce0315e1b53a32773bd7b233c3..0073e115963ffc28ed22d5e50809d1e9f70094f4 100644
--- a/tensorflow/examples/android/jni/object_tracking/time_log.h
+++ b/tensorflow/examples/android/jni/object_tracking/time_log.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Utility functions for performance profiling.
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TIME_LOG_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TIME_LOG_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TIME_LOG_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TIME_LOG_H_
 
 #include <stdint.h>
 
@@ -134,4 +134,4 @@ inline static void TimeLog(const char* const str) {
 inline static void PrintTimeLog() {}
 #endif
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TIME_LOG_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TIME_LOG_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/tracked_object.h b/tensorflow/examples/android/jni/object_tracking/tracked_object.h
index cda14e19d26260703cbc213592c1795865e021a5..d7f1a7019bb2cb93e86d3de9122d597e6d907a7a 100644
--- a/tensorflow/examples/android/jni/object_tracking/tracked_object.h
+++ b/tensorflow/examples/android/jni/object_tracking/tracked_object.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TRACKED_OBJECT_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TRACKED_OBJECT_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TRACKED_OBJECT_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TRACKED_OBJECT_H_
 
 #ifdef __RENDER_OPENGL__
 #include "tensorflow/examples/android/jni/object_tracking/gl_utils.h"
@@ -183,4 +183,4 @@ inline std::ostream& operator<<(std::ostream& stream,
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TRACKED_OBJECT_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_TRACKED_OBJECT_H_
diff --git a/tensorflow/examples/android/jni/object_tracking/utils.h b/tensorflow/examples/android/jni/object_tracking/utils.h
index 51cdfcdcfb123b8d604d3f33db85628d6c67fb18..2e98734ec4e7e44894cb78e753ac7084d62c87a8 100644
--- a/tensorflow/examples/android/jni/object_tracking/utils.h
+++ b/tensorflow/examples/android/jni/object_tracking/utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_UTILS_H_
+#ifndef TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_UTILS_H_
+#define TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_UTILS_H_
 
 #include <math.h>
 #include <stdint.h>
@@ -378,4 +378,4 @@ inline bool Invert2x2(const T* const a, float* const a_inv) {
 
 }  // namespace tf_tracking
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_UTILS_H_
+#endif  // TENSORFLOW_EXAMPLES_ANDROID_JNI_OBJECT_TRACKING_UTILS_H_
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java
index 4e45f42d0c97ed9dad9f9702adc3c1efe658699f..8bd4abb154a8f8c74f2195d4acbb99d3d5d498ea 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java
@@ -333,8 +333,12 @@ public abstract class CameraActivity extends Activity
           continue;
         }
 
-        useCamera2API = isHardwareLevelSupported(characteristics,
-            CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL_FULL);
+        // Fallback to camera1 API for internal cameras that don't have full support.
+        // This should help with legacy situations where using the camera2 API causes
+        // distorted or otherwise broken previews.
+        useCamera2API = (facing == CameraCharacteristics.LENS_FACING_EXTERNAL)
+            || isHardwareLevelSupported(characteristics, 
+                                        CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL_FULL);
         LOGGER.i("Camera API lv2?: %s", useCamera2API);
         return cameraId;
       }
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java b/tensorflow/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java
index a317273acdff016c824031e06c413ecc01f82ec8..068c7b0d945669b8207097e81c03ade07bc7ca73 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java
@@ -81,8 +81,11 @@ public class LegacyCameraConnectionFragment extends Fragment {
 
           try {
             Camera.Parameters parameters = camera.getParameters();
-            parameters.setFocusMode(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE);
-
+            List<String> focusModes = parameters.getSupportedFocusModes();
+            if (focusModes != null
+                && focusModes.contains(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE)) {
+              parameters.setFocusMode(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE);
+            }
             List<Camera.Size> cameraSizes = parameters.getSupportedPreviewSizes();
             Size[] sizes = new Size[cameraSizes.size()];
             int i = 0;
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
index 2fe2ba539edc84e80baf36b6d1ac1e192bc92163..af6af2bc8f508a70aa7e44a7236f0e7ea5e3d71c 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
@@ -199,7 +199,7 @@ public class MultiBoxTracker {
       final int w,
       final int h,
       final int rowStride,
-      final int sensorOrienation,
+      final int sensorOrientation,
       final byte[] frame,
       final long timestamp) {
     if (objectTracker == null && !initialized) {
@@ -209,7 +209,7 @@ public class MultiBoxTracker {
       objectTracker = ObjectTracker.getInstance(w, h, rowStride, true);
       frameWidth = w;
       frameHeight = h;
-      this.sensorOrientation = sensorOrienation;
+      this.sensorOrientation = sensorOrientation;
       initialized = true;
 
       if (objectTracker == null) {
diff --git a/tensorflow/examples/how_tos/reading_data/convert_to_records.py b/tensorflow/examples/how_tos/reading_data/convert_to_records.py
index a402eac053cb474db0fd90876501a9c13906ea82..c89e83956322cb87a4cf41c6b7172f03d941b429 100644
--- a/tensorflow/examples/how_tos/reading_data/convert_to_records.py
+++ b/tensorflow/examples/how_tos/reading_data/convert_to_records.py
@@ -55,12 +55,15 @@ def convert_to(data_set, name):
   with tf.python_io.TFRecordWriter(filename) as writer:
     for index in range(num_examples):
       image_raw = images[index].tostring()
-      example = tf.train.Example(features=tf.train.Features(feature={
-          'height': _int64_feature(rows),
-          'width': _int64_feature(cols),
-          'depth': _int64_feature(depth),
-          'label': _int64_feature(int(labels[index])),
-          'image_raw': _bytes_feature(image_raw)}))
+      example = tf.train.Example(
+          features=tf.train.Features(
+              feature={
+                  'height': _int64_feature(rows),
+                  'width': _int64_feature(cols),
+                  'depth': _int64_feature(depth),
+                  'label': _int64_feature(int(labels[index])),
+                  'image_raw': _bytes_feature(image_raw)
+              }))
       writer.write(example.SerializeToString())
 
 
diff --git a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
index a9ed02dd1a60ad79c2943212155bad864a750a99..461fb1c5173f66278eb585d30bd8749a58fb6245 100644
--- a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
+++ b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Train and Eval the MNIST network.
 
 This version is like fully_connected_feed.py but uses data converted
@@ -45,9 +44,7 @@ TRAIN_FILE = 'train.tfrecords'
 VALIDATION_FILE = 'validation.tfrecords'
 
 
-def read_and_decode(filename_queue):
-  reader = tf.TFRecordReader()
-  _, serialized_example = reader.read(filename_queue)
+def decode(serialized_example):
   features = tf.parse_single_example(
       serialized_example,
       # Defaults are not specified since both keys are required.
@@ -60,19 +57,26 @@ def read_and_decode(filename_queue):
   # length mnist.IMAGE_PIXELS) to a uint8 tensor with shape
   # [mnist.IMAGE_PIXELS].
   image = tf.decode_raw(features['image_raw'], tf.uint8)
-  image.set_shape([mnist.IMAGE_PIXELS])
+  image.set_shape((mnist.IMAGE_PIXELS))
+
+  # Convert label from a scalar uint8 tensor to an int32 scalar.
+  label = tf.cast(features['label'], tf.int32)
 
+  return image, label
+
+
+def augment(image, label):
   # OPTIONAL: Could reshape into a 28x28 image and apply distortions
   # here.  Since we are not applying any distortions in this
   # example, and the next step expects the image to be flattened
   # into a vector, we don't bother.
+  return image, label
 
+
+def normalize(image, label):
   # Convert from [0, 255] -> [-0.5, 0.5] floats.
   image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
 
-  # Convert label from a scalar uint8 tensor to an int32 scalar.
-  label = tf.cast(features['label'], tf.int32)
-
   return image, label
 
 
@@ -91,31 +95,33 @@ def inputs(train, batch_size, num_epochs):
       in the range [-0.5, 0.5].
     * labels is an int32 tensor with shape [batch_size] with the true label,
       a number in the range [0, mnist.NUM_CLASSES).
-    Note that an tf.train.QueueRunner is added to the graph, which
-    must be run using e.g. tf.train.start_queue_runners().
+
+    This function creates a one_shot_iterator, meaning that it will only iterate
+    over the dataset once. On the other hand there is no special initialization
+    required.
   """
-  if not num_epochs: num_epochs = None
-  filename = os.path.join(FLAGS.train_dir,
-                          TRAIN_FILE if train else VALIDATION_FILE)
+  if not num_epochs:
+    num_epochs = None
+  filename = os.path.join(FLAGS.train_dir, TRAIN_FILE
+                          if train else VALIDATION_FILE)
 
   with tf.name_scope('input'):
-    filename_queue = tf.train.string_input_producer(
-        [filename], num_epochs=num_epochs)
+    # TFRecordDataset opens a protobuf and reads entries line by line
+    # could also be [list, of, filenames]
+    dataset = tf.data.TFRecordDataset(filename)
+    dataset = dataset.repeat(num_epochs)
 
-    # Even when reading in multiple threads, share the filename
-    # queue.
-    image, label = read_and_decode(filename_queue)
+    # map takes a python function and applies it to every sample
+    dataset = dataset.map(decode)
+    dataset = dataset.map(augment)
+    dataset = dataset.map(normalize)
 
-    # Shuffle the examples and collect them into batch_size batches.
-    # (Internally uses a RandomShuffleQueue.)
-    # We run this in two threads to avoid being a bottleneck.
-    images, sparse_labels = tf.train.shuffle_batch(
-        [image, label], batch_size=batch_size, num_threads=2,
-        capacity=1000 + 3 * batch_size,
-        # Ensures a minimum amount of shuffling of examples.
-        min_after_dequeue=1000)
+    #the parameter is the queue size
+    dataset = dataset.shuffle(1000 + 3 * batch_size)
+    dataset = dataset.batch(batch_size)
 
-    return images, sparse_labels
+    iterator = dataset.make_one_shot_iterator()
+  return iterator.get_next()
 
 
 def run_training():
@@ -124,16 +130,14 @@ def run_training():
   # Tell TensorFlow that the model will be built into the default Graph.
   with tf.Graph().as_default():
     # Input images and labels.
-    images, labels = inputs(train=True, batch_size=FLAGS.batch_size,
-                            num_epochs=FLAGS.num_epochs)
+    image_batch, label_batch = inputs(
+        train=True, batch_size=FLAGS.batch_size, num_epochs=FLAGS.num_epochs)
 
     # Build a Graph that computes predictions from the inference model.
-    logits = mnist.inference(images,
-                             FLAGS.hidden1,
-                             FLAGS.hidden2)
+    logits = mnist.inference(image_batch, FLAGS.hidden1, FLAGS.hidden2)
 
     # Add to the Graph the loss calculation.
-    loss = mnist.loss(logits, labels)
+    loss = mnist.loss(logits, label_batch)
 
     # Add to the Graph operations that train the model.
     train_op = mnist.training(loss, FLAGS.learning_rate)
@@ -143,45 +147,33 @@ def run_training():
                        tf.local_variables_initializer())
 
     # Create a session for running operations in the Graph.
-    sess = tf.Session()
-
-    # Initialize the variables (the trained variables and the
-    # epoch counter).
-    sess.run(init_op)
-
-    # Start input enqueue threads.
-    coord = tf.train.Coordinator()
-    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
-
-    try:
-      step = 0
-      while not coord.should_stop():
-        start_time = time.time()
-
-        # Run one step of the model.  The return values are
-        # the activations from the `train_op` (which is
-        # discarded) and the `loss` op.  To inspect the values
-        # of your ops or variables, you may include them in
-        # the list passed to sess.run() and the value tensors
-        # will be returned in the tuple from the call.
-        _, loss_value = sess.run([train_op, loss])
-
-        duration = time.time() - start_time
-
-        # Print an overview fairly often.
-        if step % 100 == 0:
-          print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value,
-                                                     duration))
-        step += 1
-    except tf.errors.OutOfRangeError:
-      print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
-    finally:
-      # When done, ask the threads to stop.
-      coord.request_stop()
-
-    # Wait for threads to finish.
-    coord.join(threads)
-    sess.close()
+    with tf.Session() as sess:
+      # Initialize the variables (the trained variables and the
+      # epoch counter).
+      sess.run(init_op)
+      try:
+        step = 0
+        while True:  #train until OutOfRangeError
+          start_time = time.time()
+
+          # Run one step of the model.  The return values are
+          # the activations from the `train_op` (which is
+          # discarded) and the `loss` op.  To inspect the values
+          # of your ops or variables, you may include them in
+          # the list passed to sess.run() and the value tensors
+          # will be returned in the tuple from the call.
+          _, loss_value = sess.run([train_op, loss])
+
+          duration = time.time() - start_time
+
+          # Print an overview fairly often.
+          if step % 100 == 0:
+            print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value,
+                                                       duration))
+          step += 1
+      except tf.errors.OutOfRangeError:
+        print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs,
+                                                          step))
 
 
 def main(_):
@@ -194,37 +186,27 @@ if __name__ == '__main__':
       '--learning_rate',
       type=float,
       default=0.01,
-      help='Initial learning rate.'
-  )
+      help='Initial learning rate.')
   parser.add_argument(
       '--num_epochs',
       type=int,
       default=2,
-      help='Number of epochs to run trainer.'
-  )
+      help='Number of epochs to run trainer.')
   parser.add_argument(
       '--hidden1',
       type=int,
       default=128,
-      help='Number of units in hidden layer 1.'
-  )
+      help='Number of units in hidden layer 1.')
   parser.add_argument(
       '--hidden2',
       type=int,
       default=32,
-      help='Number of units in hidden layer 2.'
-  )
-  parser.add_argument(
-      '--batch_size',
-      type=int,
-      default=100,
-      help='Batch size.'
-  )
+      help='Number of units in hidden layer 2.')
+  parser.add_argument('--batch_size', type=int, default=100, help='Batch size.')
   parser.add_argument(
       '--train_dir',
       type=str,
       default='/tmp/data',
-      help='Directory with the training data.'
-  )
+      help='Directory with the training data.')
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py
index ebddfb20f4b60986fba1cdbfe3fcb184149b0a99..58c5f87884e5a091300f128403d00fb90bad59fe 100644
--- a/tensorflow/examples/image_retraining/retrain.py
+++ b/tensorflow/examples/image_retraining/retrain.py
@@ -344,8 +344,8 @@ def maybe_download_and_extract(data_url):
     filepath, _ = urllib.request.urlretrieve(data_url, filepath, _progress)
     print()
     statinfo = os.stat(filepath)
-    tf.logging.info('Successfully downloaded', filename, statinfo.st_size,
-                    'bytes.')
+    tf.logging.info('Successfully downloaded %s %d bytes.',
+                    filename, statinfo.st_size)
     print('Extracting file from ', filepath)
     tarfile.open(filepath, 'r:gz').extractall(dest_directory)
   else:
@@ -539,10 +539,8 @@ def get_random_cached_bottlenecks(sess, image_lists, how_many, category,
           sess, image_lists, label_name, image_index, image_dir, category,
           bottleneck_dir, jpeg_data_tensor, decoded_image_tensor,
           resized_input_tensor, bottleneck_tensor, architecture)
-      ground_truth = np.zeros(class_count, dtype=np.float32)
-      ground_truth[label_index] = 1.0
       bottlenecks.append(bottleneck)
-      ground_truths.append(ground_truth)
+      ground_truths.append(label_index)
       filenames.append(image_name)
   else:
     # Retrieve all bottlenecks.
@@ -555,10 +553,8 @@ def get_random_cached_bottlenecks(sess, image_lists, how_many, category,
             sess, image_lists, label_name, image_index, image_dir, category,
             bottleneck_dir, jpeg_data_tensor, decoded_image_tensor,
             resized_input_tensor, bottleneck_tensor, architecture)
-        ground_truth = np.zeros(class_count, dtype=np.float32)
-        ground_truth[label_index] = 1.0
         bottlenecks.append(bottleneck)
-        ground_truths.append(ground_truth)
+        ground_truths.append(label_index)
         filenames.append(image_name)
   return bottlenecks, ground_truths, filenames
 
@@ -610,10 +606,8 @@ def get_random_distorted_bottlenecks(
     bottleneck_values = sess.run(bottleneck_tensor,
                                  {resized_input_tensor: distorted_image_data})
     bottleneck_values = np.squeeze(bottleneck_values)
-    ground_truth = np.zeros(class_count, dtype=np.float32)
-    ground_truth[label_index] = 1.0
     bottlenecks.append(bottleneck_values)
-    ground_truths.append(ground_truth)
+    ground_truths.append(label_index)
   return bottlenecks, ground_truths
 
 
@@ -774,9 +768,8 @@ def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor,
         shape=[None, bottleneck_tensor_size],
         name='BottleneckInputPlaceholder')
 
-    ground_truth_input = tf.placeholder(tf.float32,
-                                        [None, class_count],
-                                        name='GroundTruthInput')
+    ground_truth_input = tf.placeholder(
+        tf.int64, [None], name='GroundTruthInput')
 
   # Organizing the following ops as `final_training_ops` so they're easier
   # to see in TensorBoard
@@ -823,10 +816,8 @@ def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor,
   tf.summary.histogram('activations', final_tensor)
 
   with tf.name_scope('cross_entropy'):
-    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
+    cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy(
         labels=ground_truth_input, logits=logits)
-    with tf.name_scope('total'):
-      cross_entropy_mean = tf.reduce_mean(cross_entropy)
 
   tf.summary.scalar('cross_entropy', cross_entropy_mean)
 
@@ -852,8 +843,7 @@ def add_evaluation_step(result_tensor, ground_truth_tensor):
   with tf.name_scope('accuracy'):
     with tf.name_scope('correct_prediction'):
       prediction = tf.argmax(result_tensor, 1)
-      correct_prediction = tf.equal(
-          prediction, tf.argmax(ground_truth_tensor, 1))
+      correct_prediction = tf.equal(prediction, ground_truth_tensor)
     with tf.name_scope('accuracy'):
       evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
   tf.summary.scalar('accuracy', evaluation_step)
@@ -1178,7 +1168,7 @@ def main(_):
     if FLAGS.print_misclassified_test_images:
       tf.logging.info('=== MISCLASSIFIED TEST IMAGES ===')
       for i, test_filename in enumerate(test_filenames):
-        if predictions[i] != test_ground_truth[i].argmax():
+        if predictions[i] != test_ground_truth[i]:
           tf.logging.info('%70s  %s' %
                           (test_filename,
                            list(image_lists.keys())[predictions[i]]))
diff --git a/tensorflow/examples/image_retraining/retrain_test.py b/tensorflow/examples/image_retraining/retrain_test.py
index 2de4c4ec99f87544bfda9d0fe5977f60742d82a0..8b8dd45fd72e3d29bdb7f6291cc53b912adf3644 100644
--- a/tensorflow/examples/image_retraining/retrain_test.py
+++ b/tensorflow/examples/image_retraining/retrain_test.py
@@ -87,7 +87,7 @@ class ImageRetrainingTest(test_util.TensorFlowTestCase):
   def testAddEvaluationStep(self):
     with tf.Graph().as_default():
       final = tf.placeholder(tf.float32, [1], name='final')
-      gt = tf.placeholder(tf.float32, [1], name='gt')
+      gt = tf.placeholder(tf.int64, [1], name='gt')
       self.assertIsNotNone(retrain.add_evaluation_step(final, gt))
 
   def testAddJpegDecoding(self):
diff --git a/tensorflow/examples/ios/.gitignore b/tensorflow/examples/ios/.gitignore
index e572b3012c600ab856ac8e5bd71e4291b1ba7bcf..dbabfb33bf11e0436d8900ba9f2d1ba6195a9a47 100644
--- a/tensorflow/examples/ios/.gitignore
+++ b/tensorflow/examples/ios/.gitignore
@@ -2,3 +2,6 @@ project.xcworkspace
 xcuserdata
 imagenet_comp_graph_label_strings.txt
 tensorflow_inception_graph.pb
+simple/data/LICENSE
+camera/data/LICENSE
+benchmark/data/LICENSE
diff --git a/tensorflow/examples/label_image/BUILD b/tensorflow/examples/label_image/BUILD
index 9207fc6332db9870fbb5e2b4bd6b77a5a24fbb23..2abbe9dacca79b8d6e516550e28a9b203b18f123 100644
--- a/tensorflow/examples/label_image/BUILD
+++ b/tensorflow/examples/label_image/BUILD
@@ -51,6 +51,16 @@ tf_cc_binary(
     }),
 )
 
+py_binary(
+    name = "label_image_py",
+    srcs = ["label_image.py"],
+    main = "label_image.py",
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/examples/label_image/README.md b/tensorflow/examples/label_image/README.md
index a9e44745e5cfa673c19d2c2fb434251b12d7aad6..cfd0132a7ae385d1a79c4b8f26b54244dcd3a087 100644
--- a/tensorflow/examples/label_image/README.md
+++ b/tensorflow/examples/label_image/README.md
@@ -73,10 +73,23 @@ Python than the Python code mentioned in the
 [Inception tutorial](https://www.tensorflow.org/tutorials/image_recognition/).
 and could be easier to add visualization or debug code.
 
-With tensorflow python package installed, you can run it like:
+
+`bazel-bin/tensorflow/examples/label_image/label_image_py` should be there after
+```bash
+$ bazel build tensorflow/examples/label_image/...
+```
+
+Run
+
+```bash
+$ bazel-bin/tensorflow/examples/label_image/label_image_py
+```
+
+Or, with tensorflow python package installed, you can run it like:
 ```bash
 $ python3 tensorflow/examples/label_image/label_image.py
 ```
+
 And get result similar to this:
 ```
 military uniform 0.834305
diff --git a/tensorflow/examples/label_image/label_image.py b/tensorflow/examples/label_image/label_image.py
index 39d09813375687fc954cab3d55ce997f8684da17..fe5e0fc684abce08d3d7b7f3fa22bb5ba701c64a 100644
--- a/tensorflow/examples/label_image/label_image.py
+++ b/tensorflow/examples/label_image/label_image.py
@@ -18,11 +18,11 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
-import sys
 
 import numpy as np
 import tensorflow as tf
 
+
 def load_graph(model_file):
   graph = tf.Graph()
   graph_def = tf.GraphDef()
@@ -34,24 +34,28 @@ def load_graph(model_file):
 
   return graph
 
-def read_tensor_from_image_file(file_name, input_height=299, input_width=299,
-				input_mean=0, input_std=255):
+
+def read_tensor_from_image_file(file_name,
+                                input_height=299,
+                                input_width=299,
+                                input_mean=0,
+                                input_std=255):
   input_name = "file_reader"
   output_name = "normalized"
   file_reader = tf.read_file(file_name, input_name)
   if file_name.endswith(".png"):
-    image_reader = tf.image.decode_png(file_reader, channels = 3,
-                                       name='png_reader')
+    image_reader = tf.image.decode_png(
+        file_reader, channels=3, name="png_reader")
   elif file_name.endswith(".gif"):
-    image_reader = tf.squeeze(tf.image.decode_gif(file_reader,
-                                                  name='gif_reader'))
+    image_reader = tf.squeeze(
+        tf.image.decode_gif(file_reader, name="gif_reader"))
   elif file_name.endswith(".bmp"):
-    image_reader = tf.image.decode_bmp(file_reader, name='bmp_reader')
+    image_reader = tf.image.decode_bmp(file_reader, name="bmp_reader")
   else:
-    image_reader = tf.image.decode_jpeg(file_reader, channels = 3,
-                                        name='jpeg_reader')
+    image_reader = tf.image.decode_jpeg(
+        file_reader, channels=3, name="jpeg_reader")
   float_caster = tf.cast(image_reader, tf.float32)
-  dims_expander = tf.expand_dims(float_caster, 0);
+  dims_expander = tf.expand_dims(float_caster, 0)
   resized = tf.image.resize_bilinear(dims_expander, [input_height, input_width])
   normalized = tf.divide(tf.subtract(resized, [input_mean]), [input_std])
   sess = tf.Session()
@@ -59,6 +63,7 @@ def read_tensor_from_image_file(file_name, input_height=299, input_width=299,
 
   return result
 
+
 def load_labels(label_file):
   label = []
   proto_as_ascii_lines = tf.gfile.GFile(label_file).readlines()
@@ -66,6 +71,7 @@ def load_labels(label_file):
     label.append(l.rstrip())
   return label
 
+
 if __name__ == "__main__":
   file_name = "tensorflow/examples/label_image/data/grace_hopper.jpg"
   model_file = \
@@ -110,20 +116,22 @@ if __name__ == "__main__":
     output_layer = args.output_layer
 
   graph = load_graph(model_file)
-  t = read_tensor_from_image_file(file_name,
-                                  input_height=input_height,
-                                  input_width=input_width,
-                                  input_mean=input_mean,
-                                  input_std=input_std)
+  t = read_tensor_from_image_file(
+      file_name,
+      input_height=input_height,
+      input_width=input_width,
+      input_mean=input_mean,
+      input_std=input_std)
 
   input_name = "import/" + input_layer
   output_name = "import/" + output_layer
-  input_operation = graph.get_operation_by_name(input_name);
-  output_operation = graph.get_operation_by_name(output_name);
+  input_operation = graph.get_operation_by_name(input_name)
+  output_operation = graph.get_operation_by_name(output_name)
 
   with tf.Session(graph=graph) as sess:
-    results = sess.run(output_operation.outputs[0],
-                      {input_operation.outputs[0]: t})
+    results = sess.run(output_operation.outputs[0], {
+        input_operation.outputs[0]: t
+    })
   results = np.squeeze(results)
 
   top_k = results.argsort()[-5:][::-1]
diff --git a/tensorflow/examples/learn/iris_custom_decay_dnn.py b/tensorflow/examples/learn/iris_custom_decay_dnn.py
index 072357e51c418ae1163debe29516c31ccc367386..4a219694d10ef075e0e0403cdd7ed100c39ddadd 100644
--- a/tensorflow/examples/learn/iris_custom_decay_dnn.py
+++ b/tensorflow/examples/learn/iris_custom_decay_dnn.py
@@ -46,12 +46,8 @@ def my_model(features, labels, mode):
     }
     return tf.estimator.EstimatorSpec(mode, predictions=predictions)
 
-  # Convert the labels to a one-hot tensor of shape (length of features, 3) and
-  # with a on-value of 1 for each one-hot vector of length 3.
-  onehot_labels = tf.one_hot(labels, 3, 1, 0)
   # Compute loss.
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
   # Create training op with exponentially decaying learning rate.
   if mode == tf.estimator.ModeKeys.TRAIN:
diff --git a/tensorflow/examples/learn/iris_custom_model.py b/tensorflow/examples/learn/iris_custom_model.py
index 471a99ba76dd8012ba3b1a519d5d07fb378f89e7..c6bdb86ba52b9715b977909d9b7d0fbc59161a53 100644
--- a/tensorflow/examples/learn/iris_custom_model.py
+++ b/tensorflow/examples/learn/iris_custom_model.py
@@ -47,12 +47,8 @@ def my_model(features, labels, mode):
     }
     return tf.estimator.EstimatorSpec(mode, predictions=predictions)
 
-  # Convert the labels to a one-hot tensor of shape (length of features, 3) and
-  # with a on-value of 1 for each one-hot vector of length 3.
-  onehot_labels = tf.one_hot(labels, 3, 1, 0)
   # Compute loss.
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
   # Create training op.
   if mode == tf.estimator.ModeKeys.TRAIN:
diff --git a/tensorflow/examples/learn/mnist.py b/tensorflow/examples/learn/mnist.py
index 88425ea0d0bf72fb7e7d9cbab27da023f3ade122..98819b20bfea5021d52e2c50b004bccdaf1f25e7 100644
--- a/tensorflow/examples/learn/mnist.py
+++ b/tensorflow/examples/learn/mnist.py
@@ -77,9 +77,7 @@ def conv_model(features, labels, mode):
     return tf.estimator.EstimatorSpec(mode, predictions=predictions)
 
   # Compute loss.
-  onehot_labels = tf.one_hot(tf.cast(labels, tf.int32), N_DIGITS, 1, 0)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
   # Create training op.
   if mode == tf.estimator.ModeKeys.TRAIN:
diff --git a/tensorflow/examples/learn/multiple_gpu.py b/tensorflow/examples/learn/multiple_gpu.py
index a294950a386a7207858bbcff345f14de44ffb9ca..3bad22ddf66b7981930637d64cc8653e3fb29cdf 100644
--- a/tensorflow/examples/learn/multiple_gpu.py
+++ b/tensorflow/examples/learn/multiple_gpu.py
@@ -65,12 +65,8 @@ def my_model(features, labels, mode):
       }
       return tf.estimator.EstimatorSpec(mode, predictions=predictions)
 
-    # Convert the labels to a one-hot tensor of shape (length of features, 3)
-    # and with a on-value of 1 for each one-hot vector of length 3.
-    onehot_labels = tf.one_hot(labels, 3, 1, 0)
     # Compute loss.
-    loss = tf.losses.softmax_cross_entropy(
-        onehot_labels=onehot_labels, logits=logits)
+    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
     # Create training op.
     if mode == tf.estimator.ModeKeys.TRAIN:
diff --git a/tensorflow/examples/learn/resnet.py b/tensorflow/examples/learn/resnet.py
index 1e0966475b01d067330dc4797032d561857fd208..9542e552504580a6614f8bd2f43c38dfa795750f 100755
--- a/tensorflow/examples/learn/resnet.py
+++ b/tensorflow/examples/learn/resnet.py
@@ -151,9 +151,7 @@ def res_net_model(features, labels, mode):
     return tf.estimator.EstimatorSpec(mode, predictions=predictions)
 
   # Compute loss.
-  onehot_labels = tf.one_hot(tf.cast(labels, tf.int32), N_DIGITS, 1, 0)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
   # Create training op.
   if mode == tf.estimator.ModeKeys.TRAIN:
diff --git a/tensorflow/examples/learn/text_classification.py b/tensorflow/examples/learn/text_classification.py
index ba89c532be5fa0e13a2dcb1f7894be4c631507d7..e4e61862b02f9827f42c8d0052a7be8a57502dd8 100644
--- a/tensorflow/examples/learn/text_classification.py
+++ b/tensorflow/examples/learn/text_classification.py
@@ -34,8 +34,7 @@ MAX_LABEL = 15
 WORDS_FEATURE = 'words'  # Name of the input words feature.
 
 
-def estimator_spec_for_softmax_classification(
-    logits, labels, mode):
+def estimator_spec_for_softmax_classification(logits, labels, mode):
   """Returns EstimatorSpec instance for softmax classification."""
   predicted_classes = tf.argmax(logits, 1)
   if mode == tf.estimator.ModeKeys.PREDICT:
@@ -46,17 +45,15 @@ def estimator_spec_for_softmax_classification(
             'prob': tf.nn.softmax(logits)
         })
 
-  onehot_labels = tf.one_hot(labels, MAX_LABEL, 1, 0)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
   if mode == tf.estimator.ModeKeys.TRAIN:
     optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
     train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
     return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
 
   eval_metric_ops = {
-      'accuracy': tf.metrics.accuracy(
-          labels=labels, predictions=predicted_classes)
+      'accuracy':
+          tf.metrics.accuracy(labels=labels, predictions=predicted_classes)
   }
   return tf.estimator.EstimatorSpec(
       mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
@@ -69,8 +66,7 @@ def bag_of_words_model(features, labels, mode):
   bow_embedding_column = tf.feature_column.embedding_column(
       bow_column, dimension=EMBEDDING_SIZE)
   bow = tf.feature_column.input_layer(
-      features,
-      feature_columns=[bow_embedding_column])
+      features, feature_columns=[bow_embedding_column])
   logits = tf.layers.dense(bow, MAX_LABEL, activation=None)
 
   return estimator_spec_for_softmax_classification(
@@ -112,9 +108,9 @@ def main(unused_argv):
   # Prepare training and testing data
   dbpedia = tf.contrib.learn.datasets.load_dataset(
       'dbpedia', test_with_fake_data=FLAGS.test_with_fake_data)
-  x_train = pandas.Series(dbpedia.train.data[:,1])
+  x_train = pandas.Series(dbpedia.train.data[:, 1])
   y_train = pandas.Series(dbpedia.train.target)
-  x_test = pandas.Series(dbpedia.test.data[:,1])
+  x_test = pandas.Series(dbpedia.test.data[:, 1])
   y_test = pandas.Series(dbpedia.test.target)
 
   # Process vocabulary
@@ -154,10 +150,7 @@ def main(unused_argv):
 
   # Predict.
   test_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={WORDS_FEATURE: x_test},
-      y=y_test,
-      num_epochs=1,
-      shuffle=False)
+      x={WORDS_FEATURE: x_test}, y=y_test, num_epochs=1, shuffle=False)
   predictions = classifier.predict(input_fn=test_input_fn)
   y_predicted = np.array(list(p['class'] for p in predictions))
   y_predicted = y_predicted.reshape(np.array(y_test).shape)
diff --git a/tensorflow/examples/learn/text_classification_character_cnn.py b/tensorflow/examples/learn/text_classification_character_cnn.py
index 363ff003628e03be40c1be6b7b32e12a07533047..afda170e2a9c1b0281fdd3d7ed210a1bfcd4481b 100644
--- a/tensorflow/examples/learn/text_classification_character_cnn.py
+++ b/tensorflow/examples/learn/text_classification_character_cnn.py
@@ -88,9 +88,7 @@ def char_cnn_model(features, labels, mode):
             'prob': tf.nn.softmax(logits)
         })
 
-  onehot_labels = tf.one_hot(labels, MAX_LABEL, 1, 0)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
   if mode == tf.estimator.ModeKeys.TRAIN:
     optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
     train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
diff --git a/tensorflow/examples/learn/text_classification_character_rnn.py b/tensorflow/examples/learn/text_classification_character_rnn.py
index 86adc056add508c309b3a5b93e58e9c195995642..15733821fb17eb17269fea295020f6690bb62854 100644
--- a/tensorflow/examples/learn/text_classification_character_rnn.py
+++ b/tensorflow/examples/learn/text_classification_character_rnn.py
@@ -59,9 +59,7 @@ def char_rnn_model(features, labels, mode):
             'prob': tf.nn.softmax(logits)
         })
 
-  onehot_labels = tf.one_hot(labels, MAX_LABEL, 1, 0)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
   if mode == tf.estimator.ModeKeys.TRAIN:
     optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
     train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
diff --git a/tensorflow/examples/learn/text_classification_cnn.py b/tensorflow/examples/learn/text_classification_cnn.py
index be262285a3a7aa0d6b9430a2226b448fe674cd7f..9e21aee87f629835222ab367dc3ed55863f553e4 100644
--- a/tensorflow/examples/learn/text_classification_cnn.py
+++ b/tensorflow/examples/learn/text_classification_cnn.py
@@ -87,9 +87,7 @@ def cnn_model(features, labels, mode):
             'prob': tf.nn.softmax(logits)
         })
 
-  onehot_labels = tf.one_hot(labels, MAX_LABEL, 1, 0)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
   if mode == tf.estimator.ModeKeys.TRAIN:
     optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
     train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
diff --git a/tensorflow/examples/speech_commands/accuracy_utils.h b/tensorflow/examples/speech_commands/accuracy_utils.h
index 8d918cb64b064e10bd6f3e42e3e56d86d74242c6..eea048365bc9ff53bdd767be436fb657b43793c7 100644
--- a/tensorflow/examples/speech_commands/accuracy_utils.h
+++ b/tensorflow/examples/speech_commands/accuracy_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_ACCURACY_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_ACCURACY_UTILS_H_
+#ifndef TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_ACCURACY_UTILS_H_
+#define TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_ACCURACY_UTILS_H_
 
 #include <vector>
 
@@ -57,4 +57,4 @@ void PrintAccuracyStats(const StreamingAccuracyStats& stats);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_ACCURACY_UTILS_H_
+#endif  // TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_ACCURACY_UTILS_H_
diff --git a/tensorflow/examples/speech_commands/input_data.py b/tensorflow/examples/speech_commands/input_data.py
index 751652b330cd203efe216567172fd3dbb4a5b401..e7db9cddf02daf9a32d3ed859ee9bd35b2cae838 100644
--- a/tensorflow/examples/speech_commands/input_data.py
+++ b/tensorflow/examples/speech_commands/input_data.py
@@ -417,8 +417,7 @@ class AudioProcessor(object):
       sess: TensorFlow session that was active when processor was created.
 
     Returns:
-      List of sample data for the transformed samples, and list of labels in
-      one-hot form.
+      List of sample data for the transformed samples, and list of label indexes
     """
     # Pick one of the partitions to choose samples from.
     candidates = self.data_index[mode]
@@ -428,7 +427,7 @@ class AudioProcessor(object):
       sample_count = max(0, min(how_many, len(candidates) - offset))
     # Data and labels will be populated and returned.
     data = np.zeros((sample_count, model_settings['fingerprint_size']))
-    labels = np.zeros((sample_count, model_settings['label_count']))
+    labels = np.zeros(sample_count)
     desired_samples = model_settings['desired_samples']
     use_background = self.background_data and (mode == 'training')
     pick_deterministically = (mode != 'training')
@@ -483,7 +482,7 @@ class AudioProcessor(object):
       # Run the graph to produce the output audio.
       data[i - offset, :] = sess.run(self.mfcc_, feed_dict=input_dict).flatten()
       label_index = self.word_to_index[sample['label']]
-      labels[i - offset, label_index] = 1
+      labels[i - offset] = label_index
     return data, labels
 
   def get_unprocessed_data(self, how_many, model_settings, mode):
diff --git a/tensorflow/examples/speech_commands/label_wav_dir.py b/tensorflow/examples/speech_commands/label_wav_dir.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f305359e380e7192795851112c8261ea896c290
--- /dev/null
+++ b/tensorflow/examples/speech_commands/label_wav_dir.py
@@ -0,0 +1,136 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Runs a trained audio graph against WAVE files and reports the results.
+
+The model, labels and .wav files specified in the arguments will be loaded, and
+then the predictions from running the model against the audio data will be
+printed to the console. This is a useful script for sanity checking trained
+models, and as an example of how to use an audio model from Python.
+
+Here's an example of running it:
+
+python tensorflow/examples/speech_commands/label_wav_dir.py \
+--graph=/tmp/my_frozen_graph.pb \
+--labels=/tmp/speech_commands_train/conv_labels.txt \
+--wav_dir=/tmp/speech_dataset/left
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import sys
+import glob
+
+import tensorflow as tf
+
+# pylint: disable=unused-import
+from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
+# pylint: enable=unused-import
+
+FLAGS = None
+
+
+def load_graph(filename):
+  """Unpersists graph from file as default graph."""
+  with tf.gfile.FastGFile(filename, 'rb') as f:
+    graph_def = tf.GraphDef()
+    graph_def.ParseFromString(f.read())
+    tf.import_graph_def(graph_def, name='')
+
+
+def load_labels(filename):
+  """Read in labels, one label per line."""
+  return [line.rstrip() for line in tf.gfile.GFile(filename)]
+
+
+def run_graph(wav_dir, labels, input_layer_name, output_layer_name,
+              num_top_predictions):
+  """Runs the audio data through the graph and prints predictions."""
+  with tf.Session() as sess:
+    # Feed the audio data as input to the graph.
+    #   predictions  will contain a two-dimensional array, where one
+    #   dimension represents the input image count, and the other has
+    #   predictions per class
+    for wav_path in glob.glob(wav_dir + "/*.wav"):
+      if not wav_path or not tf.gfile.Exists(wav_path):
+        tf.logging.fatal('Audio file does not exist %s', wav_path)
+
+      with open(wav_path, 'rb') as wav_file:
+        wav_data = wav_file.read()
+
+      softmax_tensor = sess.graph.get_tensor_by_name(output_layer_name)
+      predictions, = sess.run(softmax_tensor, {input_layer_name: wav_data})
+
+      # Sort to show labels in order of confidence
+      print('\n%s' % (wav_path.split('/')[-1]))
+      top_k = predictions.argsort()[-num_top_predictions:][::-1]
+      for node_id in top_k:
+        human_string = labels[node_id]
+        score = predictions[node_id]
+        print('%s (score = %.5f)' % (human_string, score))
+
+    return 0
+
+
+def label_wav(wav_dir, labels, graph, input_name, output_name, how_many_labels):
+  """Loads the model and labels, and runs the inference to print predictions."""
+  if not labels or not tf.gfile.Exists(labels):
+    tf.logging.fatal('Labels file does not exist %s', labels)
+
+  if not graph or not tf.gfile.Exists(graph):
+    tf.logging.fatal('Graph file does not exist %s', graph)
+
+  labels_list = load_labels(labels)
+
+  # load graph, which is stored in the default session
+  load_graph(graph)
+
+  run_graph(wav_dir, labels_list, input_name, output_name, how_many_labels)
+
+
+def main(_):
+  """Entry point for script, converts flags to arguments."""
+  label_wav(FLAGS.wav_dir, FLAGS.labels, FLAGS.graph, FLAGS.input_name,
+            FLAGS.output_name, FLAGS.how_many_labels)
+
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      '--wav_dir', type=str, default='', help='Audio file to be identified.')
+  parser.add_argument(
+      '--graph', type=str, default='', help='Model to use for identification.')
+  parser.add_argument(
+      '--labels', type=str, default='', help='Path to file containing labels.')
+  parser.add_argument(
+      '--input_name',
+      type=str,
+      default='wav_data:0',
+      help='Name of WAVE data input node in model.')
+  parser.add_argument(
+      '--output_name',
+      type=str,
+      default='labels_softmax:0',
+      help='Name of node outputting a prediction in the model.')
+  parser.add_argument(
+      '--how_many_labels',
+      type=int,
+      default=3,
+      help='Number of results to show.')
+
+  FLAGS, unparsed = parser.parse_known_args()
+  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/speech_commands/recognize_commands.h b/tensorflow/examples/speech_commands/recognize_commands.h
index 7f8041f9ed39c4847b05b2ac748f8f526adbab44..a7cd194bec5612122cdf167aafda9b0786d770d8 100644
--- a/tensorflow/examples/speech_commands/recognize_commands.h
+++ b/tensorflow/examples/speech_commands/recognize_commands.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_RECOGNIZE_COMMANDS_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_RECOGNIZE_COMMANDS_H_
+#ifndef TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_RECOGNIZE_COMMANDS_H_
+#define TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_RECOGNIZE_COMMANDS_H_
 
 #include <deque>
 #include <unordered_set>
@@ -76,4 +76,4 @@ class RecognizeCommands {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_RECOGNIZE_COMMANDS_H_
+#endif  // TENSORFLOW_EXAMPLES_SPEECH_COMMANDS_RECOGNIZE_COMMANDS_H_
diff --git a/tensorflow/examples/speech_commands/train.py b/tensorflow/examples/speech_commands/train.py
index f46d5e59b46a9be8b261aade7dbeb4b41ba69b97..f084931215261f183f1ecfc5517ea9a5126db039 100644
--- a/tensorflow/examples/speech_commands/train.py
+++ b/tensorflow/examples/speech_commands/train.py
@@ -133,7 +133,7 @@ def main(_):
 
   # Define loss and optimizer
   ground_truth_input = tf.placeholder(
-      tf.float32, [None, label_count], name='groundtruth_input')
+      tf.int64, [None], name='groundtruth_input')
 
   # Optionally we can add runtime checks to spot when NaNs or other symptoms of
   # numerical errors start occurring during training.
@@ -144,9 +144,8 @@ def main(_):
 
   # Create the back propagation and training evaluation machinery in the graph.
   with tf.name_scope('cross_entropy'):
-    cross_entropy_mean = tf.reduce_mean(
-        tf.nn.softmax_cross_entropy_with_logits(
-            labels=ground_truth_input, logits=logits))
+    cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy(
+        labels=ground_truth_input, logits=logits)
   tf.summary.scalar('cross_entropy', cross_entropy_mean)
   with tf.name_scope('train'), tf.control_dependencies(control_dependencies):
     learning_rate_input = tf.placeholder(
@@ -154,13 +153,13 @@ def main(_):
     train_step = tf.train.GradientDescentOptimizer(
         learning_rate_input).minimize(cross_entropy_mean)
   predicted_indices = tf.argmax(logits, 1)
-  expected_indices = tf.argmax(ground_truth_input, 1)
-  correct_prediction = tf.equal(predicted_indices, expected_indices)
-  confusion_matrix = tf.confusion_matrix(expected_indices, predicted_indices, num_classes=label_count)
+  correct_prediction = tf.equal(predicted_indices, ground_truth_input)
+  confusion_matrix = tf.confusion_matrix(
+      ground_truth_input, predicted_indices, num_classes=label_count)
   evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
   tf.summary.scalar('accuracy', evaluation_step)
 
-  global_step = tf.contrib.framework.get_or_create_global_step()
+  global_step = tf.train.get_or_create_global_step()
   increment_global_step = tf.assign(global_step, global_step + 1)
 
   saver = tf.train.Saver(tf.global_variables())
@@ -358,12 +357,12 @@ if __name__ == '__main__':
       '--window_size_ms',
       type=float,
       default=30.0,
-      help='How long each spectrogram timeslice is',)
+      help='How long each spectrogram timeslice is.',)
   parser.add_argument(
       '--window_stride_ms',
       type=float,
       default=10.0,
-      help='How long each spectrogram timeslice is',)
+      help='How far to move in time between spectogram timeslices.',)
   parser.add_argument(
       '--dct_coefficient_count',
       type=int,
diff --git a/tensorflow/examples/tutorials/layers/cnn_mnist.py b/tensorflow/examples/tutorials/layers/cnn_mnist.py
index 2124843fcb21d0c4a28ef9a11aba012a5a116e84..1e8d7d05e1c6af08d788857e74c04134333d019c 100644
--- a/tensorflow/examples/tutorials/layers/cnn_mnist.py
+++ b/tensorflow/examples/tutorials/layers/cnn_mnist.py
@@ -97,9 +97,7 @@ def cnn_model_fn(features, labels, mode):
     return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
 
   # Calculate Loss (for both TRAIN and EVAL modes)
-  onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
   # Configure the Training Op (for TRAIN mode)
   if mode == tf.estimator.ModeKeys.TRAIN:
diff --git a/tensorflow/examples/tutorials/mnist/input_data.py b/tensorflow/examples/tutorials/mnist/input_data.py
index f1a7e1c4af57dba4f06326eb8b03c7eddae86b51..fa148ae3e6f44e140e3b4fb6a4204a601b6c0a24 100644
--- a/tensorflow/examples/tutorials/mnist/input_data.py
+++ b/tensorflow/examples/tutorials/mnist/input_data.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+# pylint: disable=unused-import
 import gzip
 import os
 import tempfile
@@ -27,3 +28,4 @@ from six.moves import urllib
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf
 from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets
+# pylint: enable=unused-import
diff --git a/tensorflow/examples/tutorials/mnist/mnist.py b/tensorflow/examples/tutorials/mnist/mnist.py
index 3585043a2a9f1920422c50cd60ce18fcfa646419..7cedd0e264f35ac4ab924c93032b019e2aae78cf 100644
--- a/tensorflow/examples/tutorials/mnist/mnist.py
+++ b/tensorflow/examples/tutorials/mnist/mnist.py
@@ -94,9 +94,7 @@ def loss(logits, labels):
     loss: Loss tensor of type float.
   """
   labels = tf.to_int64(labels)
-  cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
-      labels=labels, logits=logits, name='xentropy')
-  return tf.reduce_mean(cross_entropy, name='xentropy_mean')
+  return tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
 
 def training(loss, learning_rate):
diff --git a/tensorflow/examples/tutorials/mnist/mnist_deep.py b/tensorflow/examples/tutorials/mnist/mnist_deep.py
index a4dbab5123d49ee97445a5921a14bd1764593025..1e0294db27bc675870afceca77a2cdcd4b3f5ad3 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_deep.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_deep.py
@@ -125,27 +125,27 @@ def bias_variable(shape):
 
 def main(_):
   # Import data
-  mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
+  mnist = input_data.read_data_sets(FLAGS.data_dir)
 
   # Create the model
   x = tf.placeholder(tf.float32, [None, 784])
 
   # Define loss and optimizer
-  y_ = tf.placeholder(tf.float32, [None, 10])
+  y_ = tf.placeholder(tf.int64, [None])
 
   # Build the graph for the deep net
   y_conv, keep_prob = deepnn(x)
 
   with tf.name_scope('loss'):
-    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_,
-                                                            logits=y_conv)
+    cross_entropy = tf.losses.sparse_softmax_cross_entropy(
+        labels=y_, logits=y_conv)
   cross_entropy = tf.reduce_mean(cross_entropy)
 
   with tf.name_scope('adam_optimizer'):
     train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
 
   with tf.name_scope('accuracy'):
-    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
+    correct_prediction = tf.equal(tf.argmax(y_conv, 1), y_)
     correct_prediction = tf.cast(correct_prediction, tf.float32)
   accuracy = tf.reduce_mean(correct_prediction)
 
diff --git a/tensorflow/examples/tutorials/mnist/mnist_softmax.py b/tensorflow/examples/tutorials/mnist/mnist_softmax.py
index addd2d3810219f70ffb5f7c919f01de35dd816d9..47dd6a1947811765101529826c2b24d9798fef1f 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_softmax.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_softmax.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """A very simple MNIST classifier.
 
 See extensive documentation at
@@ -34,7 +33,7 @@ FLAGS = None
 
 def main(_):
   # Import data
-  mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
+  mnist = input_data.read_data_sets(FLAGS.data_dir)
 
   # Create the model
   x = tf.placeholder(tf.float32, [None, 784])
@@ -43,7 +42,7 @@ def main(_):
   y = tf.matmul(x, W) + b
 
   # Define loss and optimizer
-  y_ = tf.placeholder(tf.float32, [None, 10])
+  y_ = tf.placeholder(tf.int64, [None])
 
   # The raw formulation of cross-entropy,
   #
@@ -52,10 +51,9 @@ def main(_):
   #
   # can be numerically unstable.
   #
-  # So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
+  # So here we use tf.losses.sparse_softmax_cross_entropy on the raw
   # outputs of 'y', and then average across the batch.
-  cross_entropy = tf.reduce_mean(
-      tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
+  cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=y)
   train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
 
   sess = tf.InteractiveSession()
@@ -66,14 +64,21 @@ def main(_):
     sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
 
   # Test trained model
-  correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+  correct_prediction = tf.equal(tf.argmax(y, 1), y_)
   accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
-  print(sess.run(accuracy, feed_dict={x: mnist.test.images,
-                                      y_: mnist.test.labels}))
+  print(sess.run(
+      accuracy, feed_dict={
+          x: mnist.test.images,
+          y_: mnist.test.labels
+      }))
+
 
 if __name__ == '__main__':
   parser = argparse.ArgumentParser()
-  parser.add_argument('--data_dir', type=str, default='/tmp/tensorflow/mnist/input_data',
-                      help='Directory for storing input data')
+  parser.add_argument(
+      '--data_dir',
+      type=str,
+      default='/tmp/tensorflow/mnist/input_data',
+      help='Directory for storing input data')
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/tutorials/mnist/mnist_softmax_xla.py b/tensorflow/examples/tutorials/mnist/mnist_softmax_xla.py
index eaff05913af756c6ab0bf80e8f0893b1d239d60d..e89317494f9b7171a93b2706d9d612d456ddf937 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_softmax_xla.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_softmax_xla.py
@@ -32,7 +32,7 @@ FLAGS = None
 
 def main(_):
   # Import data
-  mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
+  mnist = input_data.read_data_sets(FLAGS.data_dir)
 
   # Create the model
   x = tf.placeholder(tf.float32, [None, 784])
@@ -41,7 +41,7 @@ def main(_):
   y = tf.matmul(x, w) + b
 
   # Define loss and optimizer
-  y_ = tf.placeholder(tf.float32, [None, 10])
+  y_ = tf.placeholder(tf.int64, [None])
 
   # The raw formulation of cross-entropy,
   #
@@ -50,10 +50,9 @@ def main(_):
   #
   # can be numerically unstable.
   #
-  # So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
-  # outputs of 'y', and then average across the batch.
-  cross_entropy = tf.reduce_mean(
-      tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
+  # So here we use tf.losses.sparse_softmax_cross_entropy on the raw
+  # logit outputs of 'y', and then average across the batch.
+  cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=y)
   train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
 
   config = tf.ConfigProto()
@@ -86,7 +85,7 @@ def main(_):
       sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
 
   # Test trained model
-  correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+  correct_prediction = tf.equal(tf.argmax(y, 1), y_)
   accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
   print(sess.run(accuracy,
                  feed_dict={x: mnist.test.images,
diff --git a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
index c401d09df8ca5132178ab31e3b14b3a5cf98e70d..7967e22d6a0319a530cb2f00e54872f022ac0095 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
@@ -38,7 +38,6 @@ FLAGS = None
 def train():
   # Import data
   mnist = input_data.read_data_sets(FLAGS.data_dir,
-                                    one_hot=True,
                                     fake_data=FLAGS.fake_data)
 
   sess = tf.InteractiveSession()
@@ -47,7 +46,7 @@ def train():
   # Input placeholders
   with tf.name_scope('input'):
     x = tf.placeholder(tf.float32, [None, 784], name='x-input')
-    y_ = tf.placeholder(tf.float32, [None, 10], name='y-input')
+    y_ = tf.placeholder(tf.int64, [None], name='y-input')
 
   with tf.name_scope('input_reshape'):
     image_shaped_input = tf.reshape(x, [-1, 28, 28, 1])
@@ -117,12 +116,12 @@ def train():
     #
     # can be numerically unstable.
     #
-    # So here we use tf.nn.softmax_cross_entropy_with_logits on the
-    # raw outputs of the nn_layer above, and then average across
+    # So here we use tf.losses.sparse_softmax_cross_entropy on the
+    # raw logit outputs of the nn_layer above, and then average across
     # the batch.
-    diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)
     with tf.name_scope('total'):
-      cross_entropy = tf.reduce_mean(diff)
+      cross_entropy = tf.losses.sparse_softmax_cross_entropy(
+          labels=y_, logits=y)
   tf.summary.scalar('cross_entropy', cross_entropy)
 
   with tf.name_scope('train'):
@@ -131,7 +130,7 @@ def train():
 
   with tf.name_scope('accuracy'):
     with tf.name_scope('correct_prediction'):
-      correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+      correct_prediction = tf.equal(tf.argmax(y, 1), y_)
     with tf.name_scope('accuracy'):
       accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
   tf.summary.scalar('accuracy', accuracy)
diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
index 142e45a2e8cb244bf1c7015b9001a463bf54b434..14ae7fbf35836ad7f5d56101ae0fc33a3f3fb9ba 100644
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@@ -21,6 +21,8 @@ from __future__ import print_function
 import collections
 import math
 import os
+import sys
+import argparse
 import random
 from tempfile import gettempdir
 import zipfile
@@ -30,6 +32,24 @@ from six.moves import urllib
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf
 
+from tensorflow.contrib.tensorboard.plugins import projector
+
+# Give a folder path as an argument with '--log_dir' to save
+# TensorBoard summaries. Default is a log folder in current directory.
+current_path = os.path.dirname(os.path.realpath(sys.argv[0]))
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    '--log_dir',
+    type=str,
+    default=os.path.join(current_path, 'log'),
+    help='The log directory for TensorBoard summaries.')
+FLAGS, unparsed = parser.parse_known_args()
+
+# Create the directory for TensorBoard variables if there is not.
+if not os.path.exists(FLAGS.log_dir):
+  os.makedirs(FLAGS.log_dir)
+
 # Step 1: Download the data.
 url = 'http://mattmahoney.net/dc/'
 
@@ -61,6 +81,7 @@ def read_data(filename):
     data = tf.compat.as_str(f.read(f.namelist()[0])).split()
   return data
 
+
 vocabulary = read_data(filename)
 print('Data size', len(vocabulary))
 
@@ -86,20 +107,22 @@ def build_dataset(words, n_words):
   reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
   return data, count, dictionary, reversed_dictionary
 
+
 # Filling 4 global variables:
 # data - list of codes (integers from 0 to vocabulary_size-1).
 #   This is the original text but words are replaced by their codes
 # count - map of words(strings) to count of occurrences
 # dictionary - map of words(strings) to their codes(integers)
 # reverse_dictionary - maps codes(integers) to words(strings)
-data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
-                                                            vocabulary_size)
+data, count, dictionary, reverse_dictionary = build_dataset(
+    vocabulary, vocabulary_size)
 del vocabulary  # Hint to reduce memory.
 print('Most common words (+UNK)', count[:5])
 print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
 
 data_index = 0
 
+
 # Step 3: Function to generate a training batch for the skip-gram model.
 def generate_batch(batch_size, num_skips, skip_window):
   global data_index
@@ -108,7 +131,7 @@ def generate_batch(batch_size, num_skips, skip_window):
   batch = np.ndarray(shape=(batch_size), dtype=np.int32)
   labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
   span = 2 * skip_window + 1  # [ skip_window target skip_window ]
-  buffer = collections.deque(maxlen=span)
+  buffer = collections.deque(maxlen=span)  # pylint: disable=redefined-builtin
   if data_index + span > len(data):
     data_index = 0
   buffer.extend(data[data_index:data_index + span])
@@ -120,7 +143,7 @@ def generate_batch(batch_size, num_skips, skip_window):
       batch[i * num_skips + j] = buffer[skip_window]
       labels[i * num_skips + j, 0] = buffer[context_word]
     if data_index == len(data):
-      buffer[:] = data[:span]
+      buffer.extend(data[0:span])
       data_index = span
     else:
       buffer.append(data[data_index])
@@ -129,96 +152,130 @@ def generate_batch(batch_size, num_skips, skip_window):
   data_index = (data_index + len(data) - span) % len(data)
   return batch, labels
 
+
 batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
 for i in range(8):
-  print(batch[i], reverse_dictionary[batch[i]],
-        '->', labels[i, 0], reverse_dictionary[labels[i, 0]])
+  print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
+        reverse_dictionary[labels[i, 0]])
 
 # Step 4: Build and train a skip-gram model.
 
 batch_size = 128
 embedding_size = 128  # Dimension of the embedding vector.
-skip_window = 1       # How many words to consider left and right.
-num_skips = 2         # How many times to reuse an input to generate a label.
-num_sampled = 64      # Number of negative examples to sample.
+skip_window = 1  # How many words to consider left and right.
+num_skips = 2  # How many times to reuse an input to generate a label.
+num_sampled = 64  # Number of negative examples to sample.
 
 # We pick a random validation set to sample nearest neighbors. Here we limit the
 # validation samples to the words that have a low numeric ID, which by
 # construction are also the most frequent. These 3 variables are used only for
 # displaying model accuracy, they don't affect calculation.
-valid_size = 16     # Random set of words to evaluate similarity on.
+valid_size = 16  # Random set of words to evaluate similarity on.
 valid_window = 100  # Only pick dev samples in the head of the distribution.
 valid_examples = np.random.choice(valid_window, valid_size, replace=False)
 
-
 graph = tf.Graph()
 
 with graph.as_default():
 
   # Input data.
-  train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
-  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
-  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
+  with tf.name_scope('inputs'):
+    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
+    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
+    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
 
   # Ops and variables pinned to the CPU because of missing GPU implementation
   with tf.device('/cpu:0'):
     # Look up embeddings for inputs.
-    embeddings = tf.Variable(
-        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
-    embed = tf.nn.embedding_lookup(embeddings, train_inputs)
+    with tf.name_scope('embeddings'):
+      embeddings = tf.Variable(
+          tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
+      embed = tf.nn.embedding_lookup(embeddings, train_inputs)
 
     # Construct the variables for the NCE loss
-    nce_weights = tf.Variable(
-        tf.truncated_normal([vocabulary_size, embedding_size],
-                            stddev=1.0 / math.sqrt(embedding_size)))
-    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
+    with tf.name_scope('weights'):
+      nce_weights = tf.Variable(
+          tf.truncated_normal(
+              [vocabulary_size, embedding_size],
+              stddev=1.0 / math.sqrt(embedding_size)))
+    with tf.name_scope('biases'):
+      nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
 
   # Compute the average NCE loss for the batch.
   # tf.nce_loss automatically draws a new sample of the negative labels each
   # time we evaluate the loss.
   # Explanation of the meaning of NCE loss:
   #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
-  loss = tf.reduce_mean(
-      tf.nn.nce_loss(weights=nce_weights,
-                     biases=nce_biases,
-                     labels=train_labels,
-                     inputs=embed,
-                     num_sampled=num_sampled,
-                     num_classes=vocabulary_size))
+  with tf.name_scope('loss'):
+    loss = tf.reduce_mean(
+        tf.nn.nce_loss(
+            weights=nce_weights,
+            biases=nce_biases,
+            labels=train_labels,
+            inputs=embed,
+            num_sampled=num_sampled,
+            num_classes=vocabulary_size))
+
+  # Add the loss value as a scalar to summary.
+  tf.summary.scalar('loss', loss)
 
   # Construct the SGD optimizer using a learning rate of 1.0.
-  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
+  with tf.name_scope('optimizer'):
+    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
 
   # Compute the cosine similarity between minibatch examples and all embeddings.
   norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
   normalized_embeddings = embeddings / norm
-  valid_embeddings = tf.nn.embedding_lookup(
-      normalized_embeddings, valid_dataset)
+  valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
+                                            valid_dataset)
   similarity = tf.matmul(
       valid_embeddings, normalized_embeddings, transpose_b=True)
 
+  # Merge all summaries.
+  merged = tf.summary.merge_all()
+
   # Add variable initializer.
   init = tf.global_variables_initializer()
 
+  # Create a saver.
+  saver = tf.train.Saver()
+
 # Step 5: Begin training.
 num_steps = 100001
 
 with tf.Session(graph=graph) as session:
+  # Open a writer to write summaries.
+  writer = tf.summary.FileWriter(FLAGS.log_dir, session.graph)
+
   # We must initialize all variables before we use them.
   init.run()
   print('Initialized')
 
   average_loss = 0
   for step in xrange(num_steps):
-    batch_inputs, batch_labels = generate_batch(
-        batch_size, num_skips, skip_window)
+    batch_inputs, batch_labels = generate_batch(batch_size, num_skips,
+                                                skip_window)
     feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
 
+    # Define metadata variable.
+    run_metadata = tf.RunMetadata()
+
     # We perform one update step by evaluating the optimizer op (including it
     # in the list of returned values for session.run()
-    _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
+    # Also, evaluate the merged op to get all summaries from the returned "summary" variable.
+    # Feed metadata variable to session for visualizing the graph in TensorBoard.
+    _, summary, loss_val = session.run(
+        [optimizer, merged, loss],
+        feed_dict=feed_dict,
+        run_metadata=run_metadata)
     average_loss += loss_val
 
+    # Add returned summaries to writer in each step.
+    writer.add_summary(summary, step)
+    # Add metadata to visualize the graph for the last run.
+    if step == (num_steps - 1):
+      writer.add_run_metadata(run_metadata, 'step%d' % step)
+
     if step % 2000 == 0:
       if step > 0:
         average_loss /= 2000
@@ -240,6 +297,23 @@ with tf.Session(graph=graph) as session:
         print(log_str)
   final_embeddings = normalized_embeddings.eval()
 
+  # Write corresponding labels for the embeddings.
+  with open(FLAGS.log_dir + '/metadata.tsv', 'w') as f:
+    for i in xrange(vocabulary_size):
+      f.write(reverse_dictionary[i] + '\n')
+
+  # Save the model for checkpoints.
+  saver.save(session, os.path.join(FLAGS.log_dir, 'model.ckpt'))
+
+  # Create a configuration for visualizing embeddings with the labels in TensorBoard.
+  config = projector.ProjectorConfig()
+  embedding_conf = config.embeddings.add()
+  embedding_conf.tensor_name = embeddings.name
+  embedding_conf.metadata_path = os.path.join(FLAGS.log_dir, 'metadata.tsv')
+  projector.visualize_embeddings(writer, config)
+
+writer.close()
+
 # Step 6: Visualize the embeddings.
 
 
@@ -251,21 +325,24 @@ def plot_with_labels(low_dim_embs, labels, filename):
   for i, label in enumerate(labels):
     x, y = low_dim_embs[i, :]
     plt.scatter(x, y)
-    plt.annotate(label,
-                 xy=(x, y),
-                 xytext=(5, 2),
-                 textcoords='offset points',
-                 ha='right',
-                 va='bottom')
+    plt.annotate(
+        label,
+        xy=(x, y),
+        xytext=(5, 2),
+        textcoords='offset points',
+        ha='right',
+        va='bottom')
 
   plt.savefig(filename)
 
+
 try:
   # pylint: disable=g-import-not-at-top
   from sklearn.manifold import TSNE
   import matplotlib.pyplot as plt
 
-  tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
+  tsne = TSNE(
+      perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
   plot_only = 500
   low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
   labels = [reverse_dictionary[i] for i in xrange(plot_only)]
diff --git a/tensorflow/examples/udacity/5_word2vec.ipynb b/tensorflow/examples/udacity/5_word2vec.ipynb
index 18c456cad787b2ed5b39d5791de649874bbe7ae3..3b43d1fb55ee5d7f6a91754a221962755f04190c 100644
--- a/tensorflow/examples/udacity/5_word2vec.ipynb
+++ b/tensorflow/examples/udacity/5_word2vec.ipynb
@@ -455,7 +455,7 @@
         "  \n",
         "  # Compute the similarity between minibatch examples and all embeddings.\n",
         "  # We use the cosine distance:\n",
-        "  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))\n",
+        "  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))\n",
         "  normalized_embeddings = embeddings / norm\n",
         "  valid_embeddings = tf.nn.embedding_lookup(\n",
         "    normalized_embeddings, valid_dataset)\n",
diff --git a/tensorflow/examples/udacity/Dockerfile b/tensorflow/examples/udacity/Dockerfile
index 3ca58566c1ddb4c2446f7d9b19ee31fb8b603909..00eb853e527c922121fae6dc5eab42c589b0b238 100644
--- a/tensorflow/examples/udacity/Dockerfile
+++ b/tensorflow/examples/udacity/Dockerfile
@@ -8,7 +8,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-RUN pip install scikit-learn pyreadline Pillow
+RUN pip install scikit-learn pyreadline Pillow imageio
 RUN rm -rf /notebooks/*
 ADD *.ipynb /notebooks/
 WORKDIR /notebooks
diff --git a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
index 1e375ed48edcc779509179d7eae0ff93bbc87b16..4a429837b7b997f0f6571060280a9a15543b9f54 100644
--- a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
+++ b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
@@ -53,7 +53,8 @@ tensorflow::Status WavToSpectrogram(const tensorflow::string& input_wav,
   //  - Scales, clamps, and converts that spectrogram to 0 to 255 uint8's.
   //  - Reshapes the tensor so that it's [height, width, 1] for imaging.
   //  - Encodes it as a PNG stream and saves it out to a file.
-  Output file_reader = ReadFile(root.WithOpName("input_wav"), input_wav);
+  Output file_reader =
+      tensorflow::ops::ReadFile(root.WithOpName("input_wav"), input_wav);
   DecodeWav wav_decoder =
       DecodeWav(root.WithOpName("wav_decoder"), file_reader);
   Output spectrogram = AudioSpectrogram(root.WithOpName("spectrogram"),
@@ -71,8 +72,8 @@ tensorflow::Status WavToSpectrogram(const tensorflow::string& input_wav,
   Output squeeze = Squeeze(root.WithOpName("squeeze"), expand_dims,
                            Squeeze::Attrs().Axis({0}));
   Output png_encoder = EncodePng(root.WithOpName("png_encoder"), squeeze);
-  WriteFile file_writer =
-      WriteFile(root.WithOpName("output_image"), output_image, png_encoder);
+  tensorflow::ops::WriteFile file_writer = tensorflow::ops::WriteFile(
+      root.WithOpName("output_image"), output_image, png_encoder);
   tensorflow::GraphDef graph;
   TF_RETURN_IF_ERROR(root.ToGraphDef(&graph));
 
diff --git a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h
index fa8cb0abe951957e621703b7e2b9a6774200ac33..eada07e06f95f5ad9b97c2e2a992435de3437da9 100644
--- a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h
+++ b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_WAV_TO_SPECTROGRAM_WAV_TO_SPECTROGRAM_H_
-#define THIRD_PARTY_TENSORFLOW_EXAMPLES_WAV_TO_SPECTROGRAM_WAV_TO_SPECTROGRAM_H_
+#ifndef TENSORFLOW_EXAMPLES_WAV_TO_SPECTROGRAM_WAV_TO_SPECTROGRAM_H_
+#define TENSORFLOW_EXAMPLES_WAV_TO_SPECTROGRAM_WAV_TO_SPECTROGRAM_H_
 
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
@@ -28,4 +28,4 @@ tensorflow::Status WavToSpectrogram(const tensorflow::string& input_wav,
                                     tensorflow::int32 stride, float brightness,
                                     const tensorflow::string& output_image);
 
-#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_WAV_TO_SPECTROGRAM_WAV_TO_SPECTROGRAM_H_
+#endif  // TENSORFLOW_EXAMPLES_WAV_TO_SPECTROGRAM_WAV_TO_SPECTROGRAM_H_
diff --git a/tensorflow/go/README.md b/tensorflow/go/README.md
index 376e22b38082f7ebeacf49edd44e85c12be2d95f..b1bd87eb0c3b3a498a1db45f11d9a48552e08079 100644
--- a/tensorflow/go/README.md
+++ b/tensorflow/go/README.md
@@ -26,9 +26,12 @@ from source.
     ([Linux](https://www.tensorflow.org/install/install_sources#PrepareLinux)
     or [OS
     X](https://www.tensorflow.org/install/install_sources#PrepareMac)).
-    If you don't need GPU support, then try the following: `sh # Linux sudo
-    apt-get install python swig python-numpy # OS X with homebrew brew install
-    swig`
+    If you don't need GPU support, then try the following:
+
+    ```sh
+    sudo apt-get install python swig python-numpy # Linux
+    brew install swig                             # OS X with homebrew
+    ```
 
 ### Build
 
diff --git a/tensorflow/go/genop/generate.sh b/tensorflow/go/genop/generate.sh
index 01fcfb9058378b49d1315ddbbcc08e6a5de09d7d..a894c87c2765d01d2310159b19092904ad50a8b3 100644
--- a/tensorflow/go/genop/generate.sh
+++ b/tensorflow/go/genop/generate.sh
@@ -19,6 +19,11 @@ set -e
 go get github.com/golang/protobuf/proto
 go get github.com/golang/protobuf/protoc-gen-go
 
+if [ -z "${GOPATH}" ]
+then
+  GOPATH=$(go env GOPATH)
+fi
+
 cd $(dirname $0)
 for g in $(echo "${GOPATH//:/ }"); do
     TF_DIR="${g}/src/github.com/tensorflow/tensorflow"
diff --git a/tensorflow/go/genop/internal/api_def_map.go b/tensorflow/go/genop/internal/api_def_map.go
new file mode 100644
index 0000000000000000000000000000000000000000..07b689dbba23a3aa991983f3b373fa8445c673e1
--- /dev/null
+++ b/tensorflow/go/genop/internal/api_def_map.go
@@ -0,0 +1,127 @@
+/*
+Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package internal
+
+/*
+#include <stdlib.h>
+#include <string.h>
+
+#include "tensorflow/c/c_api.h"
+*/
+import "C"
+
+import (
+	"errors"
+	"fmt"
+	"runtime"
+	"unsafe"
+
+	"github.com/golang/protobuf/proto"
+	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/tensorflow/core/framework"
+)
+
+// Encapsulates a collection of API definitions.
+//
+// apiDefMap represents a map from operation name to corresponding
+// ApiDef proto (see
+// https://www.tensorflow.org/code/tensorflow/core/framework/api_def.proto
+// for ApiDef proto definition).
+type apiDefMap struct {
+	c *C.TF_ApiDefMap
+}
+
+// Creates and returns a new apiDefMap instance.
+//
+// oplist is and OpList proto instance (see
+// https://www.tensorflow.org/code/tensorflow/core/framework/op_def.proto
+// for OpList proto definition).
+
+func newAPIDefMap(oplist *pb.OpList) (*apiDefMap, error) {
+	// Create a buffer containing the serialized OpList.
+	opdefSerialized, err := proto.Marshal(oplist)
+	if err != nil {
+		return nil, fmt.Errorf("could not serialize OpDef for %s", oplist.String())
+	}
+	data := C.CBytes(opdefSerialized)
+	defer C.free(data)
+
+	opbuf := C.TF_NewBuffer()
+	defer C.TF_DeleteBuffer(opbuf)
+	opbuf.data = data
+	opbuf.length = C.size_t(len(opdefSerialized))
+
+	// Create ApiDefMap.
+	status := C.TF_NewStatus()
+	defer C.TF_DeleteStatus(status)
+	capimap := C.TF_NewApiDefMap(opbuf, status)
+	if C.TF_GetCode(status) != C.TF_OK {
+		return nil, errors.New(C.GoString(C.TF_Message(status)))
+	}
+	apimap := &apiDefMap{capimap}
+	runtime.SetFinalizer(
+		apimap,
+		func(a *apiDefMap) {
+			C.TF_DeleteApiDefMap(a.c)
+		})
+	return apimap, nil
+}
+
+// Updates apiDefMap with the overrides specified in `data`.
+//
+// data - ApiDef text proto.
+func (m *apiDefMap) Put(data string) error {
+	cdata := C.CString(data)
+	defer C.free(unsafe.Pointer(cdata))
+	status := C.TF_NewStatus()
+	defer C.TF_DeleteStatus(status)
+	C.TF_ApiDefMapPut(m.c, cdata, C.size_t(len(data)), status)
+	if C.TF_GetCode(status) != C.TF_OK {
+		return errors.New(C.GoString(C.TF_Message(status)))
+	}
+	return nil
+}
+
+// Returns ApiDef proto instance for the TensorFlow operation
+// named `opname`.
+func (m *apiDefMap) Get(opname string) (*pb.ApiDef, error) {
+	cname := C.CString(opname)
+	defer C.free(unsafe.Pointer(cname))
+	status := C.TF_NewStatus()
+	defer C.TF_DeleteStatus(status)
+	apidefBuf := C.TF_ApiDefMapGet(
+		m.c, cname, C.size_t(len(opname)), status)
+	defer C.TF_DeleteBuffer(apidefBuf)
+	if C.TF_GetCode(status) != C.TF_OK {
+		return nil, errors.New(C.GoString(C.TF_Message(status)))
+	}
+	if apidefBuf == nil {
+		return nil, fmt.Errorf("could not find ApiDef for %s", opname)
+	}
+
+	var (
+		apidef = new(pb.ApiDef)
+		size   = int(apidefBuf.length)
+		// A []byte backed by C memory.
+		// See: https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices
+		data = (*[1 << 30]byte)(unsafe.Pointer(apidefBuf.data))[:size:size]
+		err  = proto.Unmarshal(data, apidef)
+	)
+	if err != nil {
+		return nil, err
+	}
+	return apidef, nil
+}
diff --git a/tensorflow/go/genop/internal/genop.go b/tensorflow/go/genop/internal/genop.go
index dec08dee1ca4f2d85f9bac834323889adad178d3..82f7510f2ed947e0a87e4d88cfce1ecaaa6362f8 100644
--- a/tensorflow/go/genop/internal/genop.go
+++ b/tensorflow/go/genop/internal/genop.go
@@ -29,12 +29,18 @@ limitations under the License.
 // encountered.
 package internal
 
-// #include "tensorflow/c/c_api.h"
+/*
+#include <stdlib.h>
+
+#include "tensorflow/c/c_api.h"
+*/
 import "C"
 
 import (
 	"fmt"
 	"io"
+	"io/ioutil"
+	"path"
 	"reflect"
 	"strings"
 	"text/template"
@@ -47,15 +53,23 @@ import (
 // GenerateFunctionsForRegisteredOps writes a Go source code file to w
 // containing functions for each TensorFlow operation registered in the address
 // space of the calling process.
-func GenerateFunctionsForRegisteredOps(w io.Writer) error {
-	ops, err := registeredOps()
+// apidefDirs should be a contain of directories containing api_def_*.pbtxt
+// files to load.
+func GenerateFunctionsForRegisteredOps(
+	w io.Writer, apidefDirs []string) error {
+	ops, apimap, err := registeredOps()
 	if err != nil {
 		return err
 	}
-	return generateFunctionsForOps(w, ops)
+	for _, dir := range apidefDirs {
+		if err = updateAPIDefs(apimap, dir); err != nil {
+			return err
+		}
+	}
+	return generateFunctionsForOps(w, ops, apimap)
 }
 
-func registeredOps() (*pb.OpList, error) {
+func registeredOps() (*pb.OpList, *apiDefMap, error) {
 	buf := C.TF_GetAllOpList()
 	defer C.TF_DeleteBuffer(buf)
 	var (
@@ -66,10 +80,31 @@ func registeredOps() (*pb.OpList, error) {
 		data = (*[1 << 30]byte)(unsafe.Pointer(buf.data))[:size:size]
 		err  = proto.Unmarshal(data, list)
 	)
-	return list, err
+	if err != nil {
+		return nil, nil, err
+	}
+	apimap, err := newAPIDefMap(list)
+	return list, apimap, err
+}
+
+func updateAPIDefs(m *apiDefMap, dir string) error {
+	files, err := ioutil.ReadDir(dir)
+	if err != nil {
+		return err
+	}
+	for _, file := range files {
+		data, err := ioutil.ReadFile(path.Join(dir, file.Name()))
+		if err != nil {
+			return fmt.Errorf("failed to read %q: %v", file.Name(), err)
+		}
+		if err = m.Put(string(data)); err != nil {
+			return fmt.Errorf("failed to process %q: %v", file.Name(), err)
+		}
+	}
+	return nil
 }
 
-func generateFunctionsForOps(w io.Writer, ops *pb.OpList) error {
+func generateFunctionsForOps(w io.Writer, ops *pb.OpList, apimap *apiDefMap) error {
 	thisPackage := reflect.TypeOf(tmplArgs{}).PkgPath()
 	if err := tmplHeader.Execute(w, thisPackage); err != nil {
 		return err
@@ -83,14 +118,18 @@ func generateFunctionsForOps(w io.Writer, ops *pb.OpList) error {
 		if blacklist[op.Name] {
 			continue
 		}
-		if err := generateFunctionForOp(w, op); err != nil {
+		apidef, err := apimap.Get(op.Name)
+		if err != nil {
+			return err
+		}
+		if err := generateFunctionForOp(w, op, apidef); err != nil {
 			return err
 		}
 	}
 	return nil
 }
 
-func generateFunctionForOp(w io.Writer, op *pb.OpDef) error {
+func generateFunctionForOp(w io.Writer, op *pb.OpDef, apidef *pb.ApiDef) error {
 	if strings.HasPrefix(op.Name, "_") { // Internal operation
 		return nil
 	}
@@ -112,12 +151,16 @@ func generateFunctionForOp(w io.Writer, op *pb.OpDef) error {
 			return nil
 		}
 	}
-	if op.Summary == "" {
+	if apidef.Summary == "" {
 		// Undocumented operation, perhaps a sign of not being ready to
 		// export.
 		return nil
 	}
-	return tmplOp.Execute(w, newTmplArgs(op))
+	tmplArgs, err := newTmplArgs(op, apidef)
+	if err != nil {
+		return err
+	}
+	return tmplOp.Execute(w, tmplArgs)
 }
 
 var (
@@ -172,7 +215,7 @@ func makeOutputList(op *tf.Operation, start int, output string) ([]tf.Output, in
 type {{.Op.Name}}Attr func(optionalAttr)
 
 {{range .OptionalAttrs}}
-// {{$.Op.Name}}{{CamelCase .Name}} sets the optional {{.Name}} attribute to value.
+// {{$.Op.Name}}{{CamelCase .RenameTo}} sets the optional {{.RenameTo}} attribute to value.
 {{- if .Description}}
 //
 // value: {{MakeComment .Description}}
@@ -180,9 +223,9 @@ type {{.Op.Name}}Attr func(optionalAttr)
 // If not specified, defaults to {{StripLeadingColon .DefaultValue}}
 {{- if .HasMinimum}}
 //
-// {{if IsListAttr .}}REQUIRES: len(value) >= {{.Minimum}}{{else}}REQUIRES: value >= {{.Minimum}}{{end}}
+// {{if .IsListAttr }}REQUIRES: len(value) >= {{.Minimum}}{{else}}REQUIRES: value >= {{.Minimum}}{{end}}
 {{- end}}
-func {{$.Op.Name}}{{CamelCase .Name}}(value {{GoType .Type}}) {{$.Op.Name}}Attr {
+func {{$.Op.Name}}{{CamelCase .RenameTo}}(value {{GoType .Type}}) {{$.Op.Name}}Attr {
 	return func(m optionalAttr) {
 		m[{{printf "%q" .Name}}] = value
 	}
@@ -192,14 +235,14 @@ func {{$.Op.Name}}{{CamelCase .Name}}(value {{GoType .Type}}) {{$.Op.Name}}Attr
 
 {{- /* Create a godoc friendly comment. */ -}}
 
-// {{MakeComment .Op.Summary}}
+// {{MakeComment .APIDef.Summary}}
 
 {{- with .Op.Deprecation}}
 //
 // DEPRECATED at GraphDef version {{.Version}}: {{.Explanation}}
 {{- end -}}
 
-{{- with .Op.Description}}
+{{- with .APIDef.Description}}
 //
 // {{MakeComment .}}
 {{- end -}}
@@ -207,11 +250,11 @@ func {{$.Op.Name}}{{CamelCase .Name}}(value {{GoType .Type}}) {{$.Op.Name}}Attr
 {{- if .DescribeArguments}}
 //
 // Arguments:
-{{- range .Op.InputArg}}
-//	{{if .Description}}{{Identifier .Name}}: {{MakeComment .Description}}{{end}}
+{{- range .InArgsReordered}}
+//	{{if .Description}}{{Identifier .RenameTo}}: {{MakeComment .Description}}{{end}}
 {{- end -}}
 {{- range .RequiredAttrs}}
-//	{{if .Description}}{{Identifier .Name}}: {{MakeComment .Description}}{{end}}
+//	{{if .Description}}{{Identifier .RenameTo}}: {{MakeComment .Description}}{{end}}
 {{- end -}}
 {{- end -}}
 
@@ -221,12 +264,12 @@ func {{$.Op.Name}}{{CamelCase .Name}}(value {{GoType .Type}}) {{$.Op.Name}}Attr
 {{- else }}
 {{- if .DescribeOutputs}}
 //
-{{- if ((len .Op.OutputArg) eq 1) }}
-// Returns {{range .Op.OutputArg}}{{MakeComment .Description}}{{end}}
+{{- if ((len .OutArgs) eq 1) }}
+// Returns {{range .OutArgs}}{{MakeComment .Description}}{{end}}
 {{- else }}
 // Returns:
-{{- range .Op.OutputArg}}
-//	{{Identifier .Name}}{{if .Description}}: {{MakeComment .Description}}{{end}}
+{{- range .OutArgs}}
+//	{{Identifier .RenameTo}}{{if .Description}}: {{MakeComment .Description}}{{end}}
 {{- end -}}
 {{- end -}}
 {{- end -}}
@@ -247,15 +290,15 @@ func {{.Op.Name}}
 */ -}}
 
 (scope *Scope
-{{- range $i, $a := .Op.InputArg}}, {{Identifier $a.Name}} {{if IsListArg $a}}[]{{end}}tf.Output{{end -}}
-{{range $i, $a := .RequiredAttrs}}, {{Identifier $a.Name}} {{GoType $a.Type}}{{end -}}
+{{- range $i, $a := .InArgsReordered}}, {{Identifier $a.RenameTo}} {{if $a.IsListArg}}[]{{end}}tf.Output{{end -}}
+{{range $i, $a := .RequiredAttrs}}, {{Identifier $a.RenameTo}} {{GoType $a.Type}}{{end -}}
 {{if .OptionalAttrs}}, optional ...{{.Op.Name}}Attr{{end -}}
 )
 
-{{- /* Construct outputs: len(OpDef.OutputArg) or a *tf.Operation */ -}}
+{{- /* Construct outputs: len(.OutArgs) or a *tf.Operation */ -}}
 
-{{if .Op.OutputArg -}}
-({{range $i,$a := .Op.OutputArg}}{{if $i}}, {{end}}{{Identifier $a.Name}} {{if IsListArg $a}}[]{{end}}tf.Output{{end -}})
+{{if .OutArgs -}}
+({{range $i,$a := .OutArgs}}{{if $i}}, {{end}}{{Identifier $a.RenameTo}} {{if $a.IsListArg}}[]{{end}}tf.Output{{end -}})
 {{- else -}}
 (o *tf.Operation)
 {{- end }} {
@@ -263,7 +306,7 @@ func {{.Op.Name}}
 		return
 	}
 	{{if .HasAttrs -}}
-	attrs := map[string]interface{}{ {{- range .RequiredAttrs}}{{printf "%q" .Name}}: {{Identifier .Name}},{{end}}}
+	attrs := map[string]interface{}{ {{- range .RequiredAttrs}}{{printf "%q" .Name}}: {{Identifier .RenameTo}},{{end}}}
 	{{if .OptionalAttrs -}}
 	for _, a := range optional {
 		a(attrs)
@@ -272,16 +315,16 @@ func {{.Op.Name}}
 	{{end -}}
 	opspec := tf.OpSpec{
 		Type: {{printf "%q" .Op.Name}},
-		{{if .Op.InputArg -}}
+		{{if .InArgs -}}
 		Input: []tf.Input{
-			{{range .Op.InputArg}}{{if IsListArg .}}tf.OutputList({{Identifier .Name}}){{else}}{{Identifier .Name}}{{end}}, {{end}}
+			{{range $i,$a := .InArgs}}{{if $a.IsListArg}}tf.OutputList({{Identifier $a.RenameTo}}){{else}}{{Identifier $a.RenameTo}}{{end}}, {{end}}
 		},
 		{{- end}}
 		{{- if .HasAttrs}}
 		Attrs: attrs,
 		{{- end}}
 	}
-	{{- if .Op.OutputArg}}
+	{{- if .OutArgs}}
 	{{- if .HasListOutput}}
 	op := scope.AddOperation(opspec)
 	if scope.Err() != nil {
@@ -289,43 +332,105 @@ func {{.Op.Name}}
 	}
 	var idx int
 	var err error
-	{{- range $i, $a := .Op.OutputArg}}
-	{{- if IsListArg $a}}
-	if {{Identifier .Name}}, idx, err = makeOutputList(op, idx, {{printf "%q" .Name}}); err != nil {
+	{{- range $i, $a := .OutArgs}}
+	{{- if $a.IsListArg}}
+	if {{Identifier .RenameTo}}, idx, err = makeOutputList(op, idx, {{printf "%q" .Name}}); err != nil {
 		scope.UpdateErr({{printf "%q" $.Op.Name}}, err)
 		return
 	}
 	{{- else }}
-	{{Identifier .Name}} = op.Output(idx)
+	{{Identifier .RenameTo}} = op.Output(idx)
 	{{- end }}{{- /* if IsListArg */}}
-	{{- end }}{{- /* range .Op.OutputArg */}}
-	return {{range $i, $a := .Op.OutputArg}}{{if $i}}, {{end}}{{Identifier .Name}}{{end}}
+	{{- end }}{{- /* range .OutArgs */}}
+	return {{range $i, $a := .OutArgs}}{{if $i}}, {{end}}{{Identifier .RenameTo}}{{end}}
 	{{- else }}
 	op := scope.AddOperation(opspec)
-	return {{range $i, $a := .Op.OutputArg}}{{if $i}}, {{end}}op.Output({{$i}}){{end}}
+	return {{range $i, $a := .OutArgs}}{{if $i}}, {{end}}op.Output({{$i}}){{end}}
 	{{- end }}{{- /* if .HasListOutput */}}
 	{{- else }}
 	return scope.AddOperation(opspec)
-	{{- end }}{{- /* if .Op.OutputArg */}}
+	{{- end }}{{- /* if .OutArgs */}}
 }
 `))
 )
 
+type attrWrapper struct {
+	op  *pb.OpDef_AttrDef
+	api *pb.ApiDef_Attr
+}
+
+func (a *attrWrapper) Name() string             { return a.api.Name }
+func (a *attrWrapper) RenameTo() string         { return a.api.RenameTo }
+func (a *attrWrapper) Description() string      { return a.api.Description }
+func (a *attrWrapper) Type() string             { return a.op.Type }
+func (a *attrWrapper) IsListAttr() bool         { return isListAttr(a.op) }
+func (a *attrWrapper) HasMinimum() bool         { return a.op.HasMinimum }
+func (a *attrWrapper) Minimum() int64           { return a.op.Minimum }
+func (a *attrWrapper) DefaultValue() interface{} { return a.api.DefaultValue }
+
+type argWrapper struct {
+	op  *pb.OpDef_ArgDef
+	api *pb.ApiDef_Arg
+}
+
+func (a *argWrapper) Name() string        { return a.api.Name }
+func (a *argWrapper) RenameTo() string    { return a.api.RenameTo }
+func (a *argWrapper) Description() string { return a.api.Description }
+func (a *argWrapper) IsListArg() bool     { return isListArg(a.op) }
+
 type tmplArgs struct {
-	Op *pb.OpDef
+	Op     *pb.OpDef
+	APIDef *pb.ApiDef
 	// Op.Attr is split into two categories
 	// (1) Required: These must be specified by the client and are thus
 	//     included in the function signature.
 	// (2) Optional: These need not be specified (as they have default
 	//     values) and thus do not appear in the function signature.
-	RequiredAttrs []*pb.OpDef_AttrDef
-	OptionalAttrs []*pb.OpDef_AttrDef
+	RequiredAttrs []*attrWrapper
+	OptionalAttrs []*attrWrapper
+	InArgs        []*argWrapper
+	// Input arguments ordered based on arg_order field of ApiDef.
+	InArgsReordered []*argWrapper
+	OutArgs         []*argWrapper
 }
 
-func newTmplArgs(op *pb.OpDef) *tmplArgs {
-	ret := tmplArgs{Op: op}
+func newTmplArgs(op *pb.OpDef, apidef *pb.ApiDef) (*tmplArgs, error) {
+	ret := tmplArgs{Op: op, APIDef: apidef}
+
+	// Setup InArgs field
+	for i, in := range op.InputArg {
+		argCombined := argWrapper{op: in, api: apidef.InArg[i]}
+		ret.InArgs = append(ret.InArgs, &argCombined)
+	}
+
+	// Setup OutArgs field
+	for i, out := range op.OutputArg {
+		argCombined := argWrapper{op: out, api: apidef.OutArg[i]}
+		ret.OutArgs = append(ret.OutArgs, &argCombined)
+	}
+
+	// Setup InArgsReordered field
+	for _, argName := range apidef.ArgOrder {
+		// Find the argument in op.InputArg
+		argIndex := -1
+		for i, in := range op.InputArg {
+			if in.Name == argName {
+				argIndex = i
+				break
+			}
+		}
+		if argIndex == -1 {
+			return nil, fmt.Errorf(
+				"couldn't find argument %s in ApiDef for op %s",
+				argName, op.Name)
+		}
+		argCombined := argWrapper{
+			op: op.InputArg[argIndex], api: apidef.InArg[argIndex]}
+		ret.InArgsReordered = append(ret.InArgsReordered, &argCombined)
+	}
+
 	if len(op.Attr) == 0 {
-		return &ret
+		return &ret, nil
 	}
 	// Attributes related to the InputArg's type are inferred automatically
 	// and are not exposed to the client.
@@ -341,28 +446,29 @@ func newTmplArgs(op *pb.OpDef) *tmplArgs {
 			inferred[in.NumberAttr] = true
 		}
 	}
-	for _, attr := range op.Attr {
+	for i, attr := range op.Attr {
 		if inferred[attr.Name] {
 			continue
 		}
+		attrCombined := attrWrapper{op: attr, api: apidef.Attr[i]}
 		if attr.DefaultValue == nil {
-			ret.RequiredAttrs = append(ret.RequiredAttrs, attr)
+			ret.RequiredAttrs = append(ret.RequiredAttrs, &attrCombined)
 		} else {
-			ret.OptionalAttrs = append(ret.OptionalAttrs, attr)
+			ret.OptionalAttrs = append(ret.OptionalAttrs, &attrCombined)
 		}
 	}
-	return &ret
+	return &ret, nil
 }
 
 func (a *tmplArgs) HasAttrs() bool { return len(a.RequiredAttrs)+len(a.OptionalAttrs) > 0 }
 func (a *tmplArgs) DescribeArguments() bool {
-	for _, arg := range a.Op.InputArg {
-		if arg.Description != "" {
+	for _, arg := range a.InArgs {
+		if arg.Description() != "" {
 			return true
 		}
 	}
 	for _, attr := range a.RequiredAttrs {
-		if attr.Description != "" {
+		if attr.Description() != "" {
 			return true
 		}
 	}
@@ -370,16 +476,16 @@ func (a *tmplArgs) DescribeArguments() bool {
 
 }
 func (a *tmplArgs) DescribeOutputs() bool {
-	for _, arg := range a.Op.OutputArg {
-		if arg.Description != "" {
+	for _, arg := range a.OutArgs {
+		if arg.Description() != "" {
 			return true
 		}
 	}
 	return false
 }
 func (a *tmplArgs) HasListOutput() bool {
-	for _, arg := range a.Op.OutputArg {
-		if isListArg(arg) {
+	for _, arg := range a.OutArgs {
+		if arg.IsListArg() {
 			return true
 		}
 	}
diff --git a/tensorflow/go/genop/internal/genop_test.go b/tensorflow/go/genop/internal/genop_test.go
index c984c0063a9f663d82dbb797a5acec1becb79e5f..b3a23dff102a690b1f7f08b675219929355f139f 100644
--- a/tensorflow/go/genop/internal/genop_test.go
+++ b/tensorflow/go/genop/internal/genop_test.go
@@ -25,19 +25,44 @@ import (
 	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/tensorflow/core/framework"
 )
 
+// Creates an ApiDef based on opdef and applies overrides
+// from apidefText (ApiDef text proto).
+func GetAPIDef(t *testing.T, opdef *pb.OpDef, apidefText string) *pb.ApiDef {
+	opdefList := &pb.OpList{Op: []*pb.OpDef{opdef}}
+	apimap, err := newAPIDefMap(opdefList)
+	if err != nil {
+		t.Fatal(err)
+	}
+	err = apimap.Put(apidefText)
+	if err != nil {
+		t.Fatal(err)
+	}
+	apidef, err := apimap.Get(opdef.Name)
+	if err != nil {
+		t.Fatal(err)
+	}
+	return apidef
+}
+
 func TestGenerateOp(t *testing.T) {
 	// TestGenerateOp validates the generated source code for an op.
 	// The OpDef for the test cases are simplified forms of real ops.
 	testdata := []struct {
 		tag    string
 		opdef  string
+		apidef string
 		wanted string
 	}{
 		{
 			tag: "NoOp",
 			opdef: `
 name: "NoOp"
+`,
+			apidef: `
+op: <
+graph_op_name: "NoOp"
 summary: "No. Op."
+>
 `,
 			wanted: `
 // No. Op.
@@ -80,8 +105,13 @@ attr: <
     >
   >
 >
+`,
+			apidef: `
+op: <
+graph_op_name: "Add"
 summary: "Returns x + y element-wise."
 description: "Blah blah",
+>
 `,
 			wanted: `
 // Returns x + y element-wise.
@@ -122,7 +152,12 @@ attr: <
   name: "DstT"
   type: "type"
 >
+`,
+			apidef: `
+op: <
+graph_op_name: "Cast"
 summary: "Cast x of type SrcT to y of DstT."
+>
 `,
 			wanted: `
 // Cast x of type SrcT to y of DstT.
@@ -149,12 +184,10 @@ func Cast(scope *Scope, x tf.Output, DstT tf.DataType) (y tf.Output) {
 name: "DecodeJpeg"
 input_arg: <
   name: "contents"
-  description: "0-D.  The JPEG-encoded image."
   type: DT_STRING
 >
 output_arg: <
   name: "image"
-  description: "3-D with shape [height, width, channels]"
   type: DT_UINT8
 >
 attr: <
@@ -163,7 +196,6 @@ attr: <
   default_value: <
     i: 0
   >
-  description: "Number of color channels for the decoded image."
 >
 attr: <
   name: "fancy_upscaling"
@@ -171,7 +203,6 @@ attr: <
   default_value: <
     b: true
   >
-  description: "If true use a slower but nicer upscaling of the\nchroma planes (yuv420/422 only)."
 >
 attr: <
   name: "acceptable_fraction"
@@ -179,10 +210,34 @@ attr: <
   default_value: <
     f: 1
   >
+>
+`,
+			apidef: `
+op: <
+graph_op_name: "DecodeJpeg"
+in_arg: <
+  name: "contents"
+  description: "0-D.  The JPEG-encoded image."
+>
+out_arg: <
+  name: "image"
+  description: "3-D with shape [height, width, channels]"
+>
+attr: <
+  name: "channels"
+  description: "Number of color channels for the decoded image."
+>
+attr: <
+  name: "fancy_upscaling"
+  description: "If true use a slower but nicer upscaling of the\nchroma planes (yuv420/422 only)."
+>
+attr: <
+  name: "acceptable_fraction"
   description: "The minimum required fraction of lines before a truncated\ninput is accepted."
 >
 summary: "Decode a JPEG-encoded image to a uint8 tensor."
 description: "Norna dorna fjord\nkajorna\nhahaha"
+>
 `,
 			wanted: `
 // DecodeJpegAttr is an optional argument to DecodeJpeg.
@@ -270,7 +325,12 @@ attr: <
   name: "T"
   type: "type"
 >
+`,
+			apidef: `
+op: <
+graph_op_name: "TwoOutputs"
 summary: "Op that produces multiple outputs"
+>
 `,
 			wanted: `
 // Op that produces multiple outputs
@@ -326,8 +386,13 @@ attr: <
     >
   >
 >
+`,
+			apidef: `
+op: <
+graph_op_name: "ShapeN"
 summary: "Returns shape of tensors."
 description: "Some description here."
+>
 `,
 			wanted: `
 // ShapeNAttr is an optional argument to ShapeN.
@@ -371,6 +436,102 @@ func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []t
 	}
 	return output
 }
+`,
+		},
+		{
+			tag: "ApiDefOverrides",
+			opdef: `
+name: "TestOp"
+input_arg: <
+  name: "a"
+  type: DT_STRING
+>
+input_arg: <
+  name: "b"
+  type: DT_STRING
+>
+output_arg: <
+  name: "c"
+  type: DT_UINT8
+>
+attr: <
+  name: "d"
+  type: "int"
+  default_value: <
+    i: 0
+  >
+>
+`,
+			apidef: `
+op: <
+graph_op_name: "TestOp"
+in_arg: <
+  name: "a"
+  rename_to: "aa"
+  description: "Description for aa."
+>
+in_arg: <
+  name: "b"
+  rename_to: "bb"
+  description: "Description for bb."
+>
+arg_order: "b"
+arg_order: "a"
+out_arg: <
+  name: "c"
+  rename_to: "cc"
+  description: "Description for cc."
+>
+attr: <
+  name: "d"
+  rename_to: "dd"
+  description: "Description for dd."
+>
+summary: "Summary for TestOp."
+description: "Description for TestOp."
+>
+`,
+			wanted: `
+// TestOpAttr is an optional argument to TestOp.
+type TestOpAttr func(optionalAttr)
+
+// TestOpDd sets the optional dd attribute to value.
+//
+// value: Description for dd.
+// If not specified, defaults to 0
+func TestOpDd(value int64) TestOpAttr {
+	return func(m optionalAttr) {
+		m["d"] = value
+	}
+}
+
+// Summary for TestOp.
+//
+// Description for TestOp.
+//
+// Arguments:
+//	bb: Description for bb.
+//	aa: Description for aa.
+//
+// Returns Description for cc.
+func TestOp(scope *Scope, bb tf.Output, aa tf.Output, optional ...TestOpAttr) (cc tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TestOp",
+		Input: []tf.Input{
+			aa, bb,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 `,
 		},
 	}
@@ -378,11 +539,13 @@ func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []t
 	for _, test := range testdata {
 		t.Run(test.tag, func(t *testing.T) {
 			var opdef pb.OpDef
+			var apidef *pb.ApiDef
 			var buf bytes.Buffer
 			if err := proto.UnmarshalText(test.opdef, &opdef); err != nil {
 				t.Fatal(err)
 			}
-			if err := generateFunctionForOp(&buf, &opdef); err != nil {
+			apidef = GetAPIDef(t, &opdef, test.apidef)
+			if err := generateFunctionForOp(&buf, &opdef, apidef); err != nil {
 				t.Fatal(err)
 			}
 			got, err := format.Source(buf.Bytes())
diff --git a/tensorflow/go/genop/main.go b/tensorflow/go/genop/main.go
index b6f8e2d5a8e30c4721b5c49f64b15f72cc70a794..4a53084ed13b39938ea9ee8b9479d2dd2481e706 100644
--- a/tensorflow/go/genop/main.go
+++ b/tensorflow/go/genop/main.go
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-//go:generate sh generate.sh
+//go:generate bash generate.sh
 
 // Command genop generates a Go source file with functions for TensorFlow ops.
 package main
@@ -27,15 +27,17 @@ import (
 	"log"
 	"os"
 	"path/filepath"
+	"strings"
 
 	"github.com/tensorflow/tensorflow/tensorflow/go/genop/internal"
 )
 
 func main() {
 	var (
-		filename = flag.String("outfile", "", "File to write generated source code to.")
-		header   = flag.String("header", "", "Path to a file whose contents will be copied into the generated file. Can be empty")
-		buf      bytes.Buffer
+		filename   = flag.String("outfile", "", "File to write generated source code to.")
+		header     = flag.String("header", "", "Path to a file whose contents will be copied into the generated file. Can be empty")
+		apiDefDirs = flag.String("api_def_dirs", "", "Comma-separated directories containing api_def_*.pbtxt files.")
+		buf        bytes.Buffer
 	)
 	flag.Parse()
 	if *filename == "" {
@@ -51,7 +53,13 @@ func main() {
 	}
 	os.MkdirAll(filepath.Dir(*filename), 0755)
 
-	if err := internal.GenerateFunctionsForRegisteredOps(&buf); err != nil {
+	apiDefDirsList := []string{}
+	if len(*apiDefDirs) > 0 {
+		apiDefDirsList = strings.Split(*apiDefDirs, ",")
+	}
+
+	if err := internal.GenerateFunctionsForRegisteredOps(
+		&buf, apiDefDirsList); err != nil {
 		log.Fatal(err)
 	}
 	formatted, err := format.Source(buf.Bytes())
diff --git a/tensorflow/go/graph.go b/tensorflow/go/graph.go
index 46c600eab17c6c467d0b3a3312f848541f382e80..08943a527cbdc072b12b066240c213be45ffd54c 100644
--- a/tensorflow/go/graph.go
+++ b/tensorflow/go/graph.go
@@ -20,6 +20,25 @@ package tensorflow
 //
 // #include <stdlib.h>
 // #include <string.h>
+//
+// void TF_SetAttrShapeList_Helper(TF_OperationDescription* desc,
+//                                 const char* attr_name,
+//                                 const int64_t* flat_dims,
+//                                 const int* num_dims,
+//                                 int num_shapes) {
+//  const int64_t** dims =
+//    (const int64_t**)malloc(sizeof(const int64_t*) * num_shapes);
+//  int i = 0;
+//  for (i = 0; i < num_shapes; i++) {
+//    dims[i] = flat_dims;
+//    if (num_dims[i] > 0) {
+//      // flat_dims will be NULL iff num_shapes is 0 or all elements in num_dims are <= 0.
+//      flat_dims += num_dims[i];
+//    }
+//  }
+//  TF_SetAttrShapeList(desc, attr_name, dims, num_dims, num_shapes);
+//  free(dims);
+// }
 import "C"
 
 import (
@@ -114,6 +133,20 @@ func (g *Graph) Operation(name string) *Operation {
 	return &Operation{cop, g}
 }
 
+// Operations returns a list of all operations in the graph
+func (g *Graph) Operations() []Operation {
+	var pos C.size_t = 0
+	ops := []Operation{}
+	for {
+		cop := C.TF_GraphNextOperation(g.c, &pos)
+		if cop == nil {
+			break
+		}
+		ops = append(ops, Operation{cop, g})
+	}
+	return ops
+}
+
 // OpSpec is the specification of an Operation to be added to a Graph
 // (using Graph.AddOperation).
 type OpSpec struct {
@@ -140,7 +173,11 @@ type OpSpec struct {
 	// operation.
 	Attrs map[string]interface{}
 
-	// Other possible fields: Device, ColocateWith, ControlInputs.
+	// Operations that must be executed before executing the operation
+	// being added.
+	ControlDependencies []*Operation
+
+	// Other possible fields: Device, ColocateWith.
 }
 
 // AddOperation adds an operation to g.
@@ -171,6 +208,9 @@ func (g *Graph) AddOperation(args OpSpec) (*Operation, error) {
 			}
 		}
 	}
+	for _, in := range args.ControlDependencies {
+		C.TF_AddControlInput(cdesc, in.c)
+	}
 	status := newStatus()
 	for name, value := range args.Attrs {
 		if err := setAttr(cdesc, status, name, value); err != nil {
@@ -289,41 +329,37 @@ func setAttr(cdesc *C.TF_OperationDescription, status *status, name string, valu
 			return fmt.Errorf("bad value for attribute %q: %v", name, err)
 		}
 	case Shape:
-		ndims, dims := cshape(value)
+		ndims := C.int(value.NumDimensions())
 		var dimsp *C.int64_t
 		if ndims > 0 {
+			dims := make([]C.int64_t, ndims)
+			for i, d := range value.dims {
+				dims[i] = C.int64_t(d)
+			}
 			dimsp = &dims[0]
 		}
 		C.TF_SetAttrShape(cdesc, cAttrName, dimsp, ndims)
 	case []Shape:
-		ndims := make([]C.int, len(value))
-		dims := make([][]C.int64_t, len(value))
-		dimsp := make([]*C.int64_t, len(value))
-		for i, s := range value {
-			ndims[i], dims[i] = cshape(s)
-			if ndims[i] > 0 {
-				dimsp[i] = &dims[i][0]
-			}
-		}
-		if len(value) > 0 {
-			C.TF_SetAttrShapeList(cdesc, cAttrName, &dimsp[0], &ndims[0], C.int(len(value)))
-		} else {
+		if len(value) == 0 {
 			C.TF_SetAttrShapeList(cdesc, cAttrName, nil, nil, 0)
+		} else {
+			var flatDims []C.int64_t
+			ndims := make([]C.int, len(value))
+			for i, s := range value {
+				nd := s.NumDimensions()
+				ndims[i] = C.int(nd)
+				for _, d := range s.dims {
+					flatDims = append(flatDims, C.int64_t(d))
+				}
+			}
+			var flatDimsp *C.int64_t
+			if len(flatDims) > 0 {
+				flatDimsp = &flatDims[0]
+			}
+			C.TF_SetAttrShapeList_Helper(cdesc, cAttrName, flatDimsp, &ndims[0], C.int(len(value)))
 		}
 	default:
 		return fmt.Errorf("attribute %q has a type (%T) which is not valid for operation attributes", name, value)
 	}
 	return nil
 }
-
-func cshape(s Shape) (C.int, []C.int64_t) {
-	ndims := C.int(s.NumDimensions())
-	if ndims < 0 {
-		return -1, nil
-	}
-	dims := make([]C.int64_t, ndims)
-	for i, s := range s.dims {
-		dims[i] = C.int64_t(s)
-	}
-	return ndims, dims
-}
diff --git a/tensorflow/go/graph_test.go b/tensorflow/go/graph_test.go
index c3120bc720308402b22884f29b7ff87ef035874b..b8d65c54f697153ad236f5e27d9f27d048c3a22e 100644
--- a/tensorflow/go/graph_test.go
+++ b/tensorflow/go/graph_test.go
@@ -29,10 +29,26 @@ func hasOperations(g *Graph, ops ...string) error {
 			missing = append(missing, op)
 		}
 	}
-	if len(missing) == 0 {
-		return nil
+	if len(missing) != 0 {
+		return fmt.Errorf("Graph does not have the operations %v", missing)
 	}
-	return fmt.Errorf("Graph does not have the operations %v", missing)
+
+	inList := map[string]bool{}
+	for _, op := range g.Operations() {
+		inList[op.Name()] = true
+	}
+
+	for _, op := range ops {
+		if !inList[op] {
+			missing = append(missing, op)
+		}
+	}
+
+	if len(missing) != 0 {
+		return fmt.Errorf("Operations %v are missing from graph.Operations()", missing)
+	}
+
+	return nil
 }
 
 func TestGraphWriteToAndImport(t *testing.T) {
diff --git a/tensorflow/go/op/generate.go b/tensorflow/go/op/generate.go
index 17ece1c7a2547ee872bf9b79c99f3ef1f9be1b2c..e5a9bea77091e438d572a2863216744b446095de 100644
--- a/tensorflow/go/op/generate.go
+++ b/tensorflow/go/op/generate.go
@@ -15,6 +15,6 @@ limitations under the License.
 */
 
 //go:generate go generate ../genop
-//go:generate go run ../genop/main.go -outfile wrappers.go
+//go:generate go run ../genop/main.go -outfile wrappers.go -api_def_dirs ../../core/api_def/base_api/
 
 package op
diff --git a/tensorflow/go/op/op_test.go b/tensorflow/go/op/op_test.go
index 2451ba360699a7ac24f64209339e7b4f92ffb548..842dee9ffe396c44cfa26bbc7fd34a598e62bf89 100644
--- a/tensorflow/go/op/op_test.go
+++ b/tensorflow/go/op/op_test.go
@@ -58,3 +58,76 @@ func TestAddOperationFailure(t *testing.T) {
 	_ = resize.Shape()
 	t.Errorf("resize.Shape() should have paniced since the underlying Operation was not created")
 }
+
+func TestShapeAttribute(t *testing.T) {
+	s := NewScope()
+	x := Placeholder(s.SubScope("x"), tf.Int32, PlaceholderShape(tf.MakeShape(1)))
+	y := Placeholder(s.SubScope("y"), tf.Int32, PlaceholderShape(tf.Shape{}))
+	z := Add(s, x, y)
+	graph, err := s.Finalize()
+	if err != nil {
+		t.Fatal(err)
+	}
+	sess, err := tf.NewSession(graph, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	value, err := tf.NewTensor([]int32{7})
+	if err != nil {
+		t.Fatal(err)
+	}
+	feeds := map[tf.Output]*tf.Tensor{
+		x: value,
+		y: value,
+	}
+	fetched, err := sess.Run(feeds, []tf.Output{z}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got, want := len(fetched), 1; got != want {
+		t.Fatalf("Fetched %d tensors, expected %d", got, want)
+	}
+	if got, want := fetched[0].Value().([]int32), []int32{14}; len(got) != len(want) || len(got) != 1 || got[0] != want[0] {
+		t.Fatalf("Got %v, want %v", got, want)
+	}
+}
+
+func TestDataset(t *testing.T) {
+	var (
+		s = NewScope()
+
+		// The use of a non-scalar here is inspired by
+		// https://github.com/tensorflow/tensorflow/issues/14891
+		c       = Const(s, []int32{21718, 31415})
+		types   = []tf.DataType{c.DataType()}
+		shapes  = []tf.Shape{c.Shape()}
+		dataset = TensorDataset(s, []tf.Output{c}, shapes)
+
+		iterator = Iterator(s, "", "", types, shapes)
+		next     = IteratorGetNext(s, iterator, types, shapes)
+		init     = MakeIterator(s, dataset, iterator)
+	)
+	graph, err := s.Finalize()
+	if err != nil {
+		t.Fatal(err)
+	}
+	sess, err := tf.NewSession(graph, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if _, err := sess.Run(nil, nil, []*tf.Operation{init}); err != nil {
+		t.Fatal(err)
+	}
+	results, err := sess.Run(nil, next, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	got := results[0].Value().([]int32)
+	if len(got) != 2 || got[0] != 21718 || got[1] != 31415 {
+		t.Errorf("Got %v, want {21718, 31415}", got)
+	}
+	if _, err := sess.Run(nil, next, nil); err == nil {
+		t.Errorf("Expected sess.Run() to fail since the iterator should have reached the end of the dataset")
+	}
+}
diff --git a/tensorflow/go/op/scope.go b/tensorflow/go/op/scope.go
index a9ec79463a00022bf85bf00032df9004648525ae..13de4294dc2ebdfff9bb68d277c09239d0bc8593 100644
--- a/tensorflow/go/op/scope.go
+++ b/tensorflow/go/op/scope.go
@@ -33,10 +33,11 @@ import (
 // A Scope object and all its derivates (e.g., obtained from Scope.SubScope)
 // are not safe for concurrent use by multiple goroutines.
 type Scope struct {
-	graph     *tf.Graph
-	namemap   map[string]int
-	namespace string
-	err       *scopeErr
+	graph               *tf.Graph
+	namemap             map[string]int
+	namespace           string
+	controlDependencies []*tf.Operation
+	err                 *scopeErr
 }
 
 // scopeErr is used to share errors between all derivatives of a root scope.
@@ -80,6 +81,7 @@ func (s *Scope) AddOperation(args tf.OpSpec) *tf.Operation {
 	if s.namespace != "" {
 		args.Name = s.namespace + "/" + args.Name
 	}
+	args.ControlDependencies = append(args.ControlDependencies, s.controlDependencies...)
 	op, err := s.graph.AddOperation(args)
 	if err != nil {
 		s.UpdateErr(args.Type, err)
@@ -103,6 +105,28 @@ func (s *Scope) SubScope(namespace string) *Scope {
 	}
 }
 
+// WithControlDependencies returns a new Scope which will cause all operations
+// added to the graph to execute only after all the provided operations have
+// executed first (in addition to any other control dependencies in s).
+func (s *Scope) WithControlDependencies(ops ...*tf.Operation) *Scope {
+	// Force a copy of the control dependencies into a new underlying array on
+	// every call.  We cannot alias the same underlying array as `ops`, otherwise
+	// the user could modify that array after calling s.WithControlDependencies,
+	// which would be confusing.  We cannot alias the same underlying array as the
+	// original `s.controlDependencies`, since Scopes form a logical tree, and
+	// other calls to s.WithControlDependencies could stomp on each other.
+	deps := make([]*tf.Operation, 0, len(s.controlDependencies)+len(ops))
+	deps = append(deps, s.controlDependencies...)
+	deps = append(deps, ops...)
+	return &Scope{
+		graph:               s.graph,
+		namemap:             s.namemap,
+		namespace:           s.namespace,
+		controlDependencies: deps,
+		err:                 s.err,
+	}
+}
+
 // Err returns the error, if any, encountered during the construction
 // of the Graph managed by s.
 //
diff --git a/tensorflow/go/op/scope_test.go b/tensorflow/go/op/scope_test.go
index 6fb5d32e503c7c9a5a48747844da15be81b1de2d..b58a61de98b0f5b04959e1eca35c6b6c4d77e42b 100644
--- a/tensorflow/go/op/scope_test.go
+++ b/tensorflow/go/op/scope_test.go
@@ -69,6 +69,49 @@ func TestScopeSubScopeErrors(t *testing.T) {
 	}
 }
 
+func TestControlDependencies(t *testing.T) {
+	var (
+		s        = NewScope()
+		zero     = Const(s.SubScope("zero"), int32(0))
+		one      = Const(s.SubScope("one"), int32(1))
+		variable = VarHandleOp(s, tf.Int32, tf.ScalarShape())
+		init     = AssignVariableOp(s, variable, zero)
+		update   = AssignAddVariableOp(s, variable, one)
+		readDeps = []*tf.Operation{update}
+	)
+	// We intend for `read` to have a control dependency on `update`.
+	s = s.WithControlDependencies(readDeps...)
+	// Ensure that Scope.WithControlDependencies makes a copy of the underlying
+	// array, rather than just holding a slice reference to the same user-supplied
+	// underlying array.  If the copy is correctly performed, overwriting
+	// readDeps[0] should have no effect on control dependencies for `read`.
+	readDeps[0] = init
+	read := ReadVariableOp(s, variable, tf.Int32)
+
+	graph, err := s.Finalize()
+	if err != nil {
+		t.Fatal(err)
+	}
+	sess, err := tf.NewSession(graph, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if _, err = sess.Run(nil, nil, []*tf.Operation{init}); err != nil {
+		t.Fatal(err)
+	}
+	// Without the control dependency, the read operation may not see the
+	// update.
+	for i := int32(0); i < 10; i++ {
+		out, err := sess.Run(nil, []tf.Output{read}, nil)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if got, want := out[0].Value().(int32), i+1; got != want {
+			t.Errorf("Got %d, want %d", got, want)
+		}
+	}
+}
+
 func TestScopeFinalize(t *testing.T) {
 	var (
 		root = NewScope()
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 664e37d3a15ef250e3ef90b3201504c108c5c55b..13f38dfb32a476477d306093bad6b56e1744a640 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -38,26 +38,80 @@ func makeOutputList(op *tf.Operation, start int, output string) ([]tf.Output, in
 	return list, start + size, nil
 }
 
-// Writes a `Summary` protocol buffer with scalar values.
+// WriteImageSummaryAttr is an optional argument to WriteImageSummary.
+type WriteImageSummaryAttr func(optionalAttr)
+
+// WriteImageSummaryMaxImages sets the optional max_images attribute to value.
 //
-// The input `tag` and `value` must have the scalars.
+// value: Max number of batch elements to generate images for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func WriteImageSummaryMaxImages(value int64) WriteImageSummaryAttr {
+	return func(m optionalAttr) {
+		m["max_images"] = value
+	}
+}
+
+// Writes a `Summary` protocol buffer with images.
+//
+// The summary has up to `max_images` summary values containing images. The
+// images are built from `tensor` which must be 4-D with shape `[batch_size,
+// height, width, channels]` and where `channels` can be:
+//
+// *  1: `tensor` is interpreted as Grayscale.
+// *  3: `tensor` is interpreted as RGB.
+// *  4: `tensor` is interpreted as RGBA.
+//
+// The images have the same number of channels as the input tensor. For float
+// input, the values are normalized one image at a time to fit in the range
+// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
+// normalization algorithms:
+//
+// *  If the input values are all positive, they are rescaled so the largest one
+//    is 255.
+//
+// *  If any input value is negative, the values are shifted so input value 0.0
+//    is at 127.  They are then rescaled so that either the smallest value is 0,
+//    or the largest one is 255.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
+// *  If `max_images` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+//
+// The `bad_color` argument is the color to use in the generated images for
+// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
+// Each element must be in the range `[0, 255]` (It represents the value of a
+// pixel in the output image).  Non-finite values in the input tensor are
+// replaced by this tensor in the output image.  The default value is the color
+// red.
 //
 // Arguments:
 //	writer: A handle to a summary writer.
 //	step: The step to write the summary for.
-//	tag: Tag for the summary.
-//	value: Value for the summary.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
+// `channels` is 1, 3, or 4.
+//	bad_color: Color to use for pixels with non-finite values.
 //
 // Returns the created operation.
-func WriteScalarSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, value tf.Output) (o *tf.Operation) {
+func WriteImageSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, tensor tf.Output, bad_color tf.Output, optional ...WriteImageSummaryAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "WriteScalarSummary",
+		Type: "WriteImageSummary",
 		Input: []tf.Input{
-			writer, step, tag, value,
+			writer, step, tag, tensor, bad_color,
 		},
+		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
@@ -109,42 +163,58 @@ func WriteSummary(scope *Scope, writer tf.Output, step tf.Output, tensor tf.Outp
 	return scope.AddOperation(opspec)
 }
 
-// Flushes and closes the summary writer.
+// Creates summary database writer accessible by given resource handle.
 //
-// Also removes it from the resource manager. To reopen, use another
-// CreateSummaryFileWriter op.
+// This can be used to write tensors from the execution graph directly
+// to a database. Only SQLite is supported right now. This function
+// will create the schema if it doesn't exist. Entries in the Users,
+// Experiments, and Runs tables will be created automatically if they
+// don't already exist.
 //
 // Arguments:
-//	writer: A handle to the summary writer resource.
+//	writer: Handle to SummaryWriter resource to overwrite.
+//	db_uri: For example "file:/tmp/foo.sqlite".
+//	experiment_name: Can't contain ASCII control characters or <>. Case
+// sensitive. If empty, then the Run will not be associated with any
+// Experiment.
+//	run_name: Can't contain ASCII control characters or <>. Case sensitive.
+// If empty, then each Tag will not be associated with any Run.
+//	user_name: Must be valid as both a DNS label and Linux username. If
+// empty, then the Experiment will not be associated with any User.
 //
 // Returns the created operation.
-func CloseSummaryWriter(scope *Scope, writer tf.Output) (o *tf.Operation) {
+func CreateSummaryDbWriter(scope *Scope, writer tf.Output, db_uri tf.Output, experiment_name tf.Output, run_name tf.Output, user_name tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "CloseSummaryWriter",
+		Type: "CreateSummaryDbWriter",
 		Input: []tf.Input{
-			writer,
+			writer, db_uri, experiment_name, run_name, user_name,
 		},
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Flushes the writer's unwritten events.
+// Creates a summary file writer accessible by the given resource handle.
 //
 // Arguments:
-//	writer: A handle to the summary writer resource.
+//	writer: A handle to the summary writer resource
+//	logdir: Directory where the event file will be written.
+//	max_queue: Size of the queue of pending events and summaries.
+//	flush_millis: How often, in milliseconds, to flush the pending events and
+// summaries to disk.
+//	filename_suffix: Every event file's name is suffixed with this suffix.
 //
 // Returns the created operation.
-func FlushSummaryWriter(scope *Scope, writer tf.Output) (o *tf.Operation) {
+func CreateSummaryFileWriter(scope *Scope, writer tf.Output, logdir tf.Output, max_queue tf.Output, flush_millis tf.Output, filename_suffix tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FlushSummaryWriter",
+		Type: "CreateSummaryFileWriter",
 		Input: []tf.Input{
-			writer,
+			writer, logdir, max_queue, flush_millis, filename_suffix,
 		},
 	}
 	return scope.AddOperation(opspec)
@@ -208,120 +278,174 @@ func FakeQuantWithMinMaxVarsPerChannelGradient(scope *Scope, gradients tf.Output
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
-type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr {
-	return func(m optionalAttr) {
-		m["narrow_range"] = value
-	}
-}
-
-// Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
+// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
 //
-// and `max` to 'outputs' tensor of same shape as `inputs`.
+// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
+// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
+// are placed in `outputs[i]` in lexicographic order of `js`, and the first
+// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
+// In detail,
 //
-// `[min; max]` define the clamping range for the `inputs` data.
-// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
-// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
-// then de-quantized and output as floats in `[min; max]` interval.
-// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+// ```python
+//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
 //
-// This operation has a gradient and thus allows for training `min` and `max`
-// values.
-func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
+//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
+// ```
+//
+// `data.shape` must start with `partitions.shape`.
+//
+// For example:
+//
+// ```python
+//     # Scalar partitions.
+//     partitions = 1
+//     num_partitions = 2
+//     data = [10, 20]
+//     outputs[0] = []  # Empty with shape [0, 2]
+//     outputs[1] = [[10, 20]]
+//
+//     # Vector partitions.
+//     partitions = [0, 0, 1, 1, 0]
+//     num_partitions = 2
+//     data = [10, 20, 30, 40, 50]
+//     outputs[0] = [10, 20, 50]
+//     outputs[1] = [30, 40]
+// ```
+//
+// See `dynamic_stitch` for an example on how to merge partitions back.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
+//	num_partitions: The number of partitions to output.
+func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"num_partitions": num_partitions}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVars",
+		Type: "DynamicPartition",
 		Input: []tf.Input{
-			inputs, min, max,
+			data, partitions,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("DynamicPartition", err)
+		return
+	}
+	return outputs
 }
 
-// QuantizedInstanceNormAttr is an optional argument to QuantizedInstanceNorm.
-type QuantizedInstanceNormAttr func(optionalAttr)
+// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2.
+type MutableHashTableOfTensorsV2Attr func(optionalAttr)
 
-// QuantizedInstanceNormOutputRangeGiven sets the optional output_range_given attribute to value.
+// MutableHashTableOfTensorsV2Container sets the optional container attribute to value.
 //
-// value: If True, `given_y_min` and `given_y_min`
-// and `given_y_max` are used as the output range. Otherwise,
-// the implementation computes the output range.
-// If not specified, defaults to false
-func QuantizedInstanceNormOutputRangeGiven(value bool) QuantizedInstanceNormAttr {
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr {
 	return func(m optionalAttr) {
-		m["output_range_given"] = value
+		m["container"] = value
 	}
 }
 
-// QuantizedInstanceNormGivenYMin sets the optional given_y_min attribute to value.
+// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value.
 //
-// value: Output in `y_min` if `output_range_given` is True.
-// If not specified, defaults to 0
-func QuantizedInstanceNormGivenYMin(value float32) QuantizedInstanceNormAttr {
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr {
 	return func(m optionalAttr) {
-		m["given_y_min"] = value
+		m["shared_name"] = value
 	}
 }
 
-// QuantizedInstanceNormGivenYMax sets the optional given_y_max attribute to value.
-//
-// value: Output in `y_max` if `output_range_given` is True.
-// If not specified, defaults to 0
-func QuantizedInstanceNormGivenYMax(value float32) QuantizedInstanceNormAttr {
+// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// If not specified, defaults to false
+func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr {
 	return func(m optionalAttr) {
-		m["given_y_max"] = value
+		m["use_node_name_sharing"] = value
 	}
 }
 
-// QuantizedInstanceNormVarianceEpsilon sets the optional variance_epsilon attribute to value.
-//
-// value: A small float number to avoid dividing by 0.
-// If not specified, defaults to 1e-05
-func QuantizedInstanceNormVarianceEpsilon(value float32) QuantizedInstanceNormAttr {
+// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value.
+// If not specified, defaults to <>
+func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr {
 	return func(m optionalAttr) {
-		m["variance_epsilon"] = value
+		m["value_shape"] = value
 	}
 }
 
-// QuantizedInstanceNormMinSeparation sets the optional min_separation attribute to value.
+// Creates an empty hash table.
 //
-// value: Minimum value of `y_max - y_min`
-// If not specified, defaults to 0.001
-func QuantizedInstanceNormMinSeparation(value float32) QuantizedInstanceNormAttr {
-	return func(m optionalAttr) {
-		m["min_separation"] = value
-	}
-}
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a vector. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
+//
+// Arguments:
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
+//
+// Returns Handle to a table.
+func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MutableHashTableOfTensorsV2",
 
-// Quantized Instance normalization.
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
+type ResourceApplyProximalAdagradAttr func(optionalAttr)
+
+// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
+//
+// accum += grad * grad
+// prox_v = var - lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
 //
 // Arguments:
-//	x: A 4D input Tensor.
-//	x_min: The value represented by the lowest quantized input.
-//	x_max: The value represented by the highest quantized input.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
 //
-// Returns A 4D Tensor.The value represented by the lowest quantized output.The value represented by the highest quantized output.
-func QuantizedInstanceNorm(scope *Scope, x tf.Output, x_min tf.Output, x_max tf.Output, optional ...QuantizedInstanceNormAttr) (y tf.Output, y_min tf.Output, y_max tf.Output) {
+// Returns the created operation.
+func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -330,1657 +454,1522 @@ func QuantizedInstanceNorm(scope *Scope, x tf.Output, x_min tf.Output, x_max tf.
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedInstanceNorm",
+		Type: "ResourceApplyProximalAdagrad",
 		Input: []tf.Input{
-			x, x_min, x_max,
+			var_, accum, lr, l1, l2, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
-type QuantizeAndDequantizeAttr func(optionalAttr)
+// MutableHashTableV2Attr is an optional argument to MutableHashTableV2.
+type MutableHashTableV2Attr func(optionalAttr)
 
-// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
+// MutableHashTableV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableHashTableV2Container(value string) MutableHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["signed_input"] = value
+		m["container"] = value
 	}
 }
 
-// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
+// MutableHashTableV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableHashTableV2SharedName(value string) MutableHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["num_bits"] = value
+		m["shared_name"] = value
 	}
 }
 
-// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
+// MutableHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+//
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
 // If not specified, defaults to false
-func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
+func MutableHashTableV2UseNodeNameSharing(value bool) MutableHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["range_given"] = value
+		m["use_node_name_sharing"] = value
 	}
 }
 
-// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
+// Creates an empty hash table.
+//
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
+//
+// Arguments:
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
+//
+// Returns Handle to a table.
+func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableV2Attr) (table_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MutableHashTableV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
+type MapUnstageNoKeyAttr func(optionalAttr)
+
+// MapUnstageNoKeyCapacity sets the optional capacity attribute to value.
 // If not specified, defaults to 0
-func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
+//
+// REQUIRES: value >= 0
+func MapUnstageNoKeyCapacity(value int64) MapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["input_min"] = value
+		m["capacity"] = value
 	}
 }
 
-// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
+// MapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
 // If not specified, defaults to 0
-func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
+//
+// REQUIRES: value >= 0
+func MapUnstageNoKeyMemoryLimit(value int64) MapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["input_max"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// Use QuantizeAndDequantizeV2 instead.
+// MapUnstageNoKeyContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapUnstageNoKeyContainer(value string) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapUnstageNoKeySharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapUnstageNoKeySharedName(value string) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns a random (key, value)
 //
-// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
-func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
+// from the underlying container.   If the underlying container
+// does not contain elements, the op will block until it does.
+func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantize",
+		Type: "MapUnstageNoKey",
 		Input: []tf.Input{
-			input,
+			indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	key = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapUnstageNoKey", err)
+		return
+	}
+	return key, values
 }
 
-// OneHotAttr is an optional argument to OneHot.
-type OneHotAttr func(optionalAttr)
+// HashTableV2Attr is an optional argument to HashTableV2.
+type HashTableV2Attr func(optionalAttr)
 
-// OneHotAxis sets the optional axis attribute to value.
+// HashTableV2Container sets the optional container attribute to value.
 //
-// value: The axis to fill (default: -1, a new inner-most axis).
-// If not specified, defaults to -1
-func OneHotAxis(value int64) OneHotAttr {
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func HashTableV2Container(value string) HashTableV2Attr {
 	return func(m optionalAttr) {
-		m["axis"] = value
+		m["container"] = value
 	}
 }
 
-// Returns a one-hot tensor.
-//
-// The locations represented by indices in `indices` take value `on_value`,
-// while all other locations take value `off_value`.
-//
-// If the input `indices` is rank `N`, the output will have rank `N+1`,
-// The new axis is created at dimension `axis` (default: the new axis is
-// appended at the end).
-//
-// If `indices` is a scalar the output shape will be a vector of length `depth`.
-//
-// If `indices` is a vector of length `features`, the output shape will be:
-// ```
-//   features x depth if axis == -1
-//   depth x features if axis == 0
-// ```
+// HashTableV2SharedName sets the optional shared_name attribute to value.
 //
-// If `indices` is a matrix (batch) with shape `[batch, features]`,
-// the output shape will be:
-// ```
-//   batch x features x depth if axis == -1
-//   batch x depth x features if axis == 1
-//   depth x batch x features if axis == 0
-// ```
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func HashTableV2SharedName(value string) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
 //
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
+// If not specified, defaults to false
+func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
+	}
+}
+
+// Creates a non-initialized hash table.
 //
-// Examples
-// =========
+// This op creates a hash table, specifying the type of its keys and values.
+// Before using the table you will have to initialize it.  After initialization the
+// table will be immutable.
 //
-// Suppose that
+// Arguments:
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-// ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 5.0
-//   off_value = 0.0
-//   axis = -1
-// ```
-//
-// Then output is `[4 x 3]`:
-//
-//     ```output =
-//       [5.0 0.0 0.0]  // one_hot(0)
-//       [0.0 0.0 5.0]  // one_hot(2)
-//       [0.0 0.0 0.0]  // one_hot(-1)
-//       [0.0 5.0 0.0]  // one_hot(1)
-//     ```
-//
-// Suppose that
-//
-// ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 0.0
-//   off_value = 3.0
-//   axis = 0
-// ```
-//
-// Then output is `[3 x 4]`:
+// Returns Handle to a table.
+func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "HashTableV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Replaces the contents of the table with the specified keys and values.
 //
-//     ```output =
-//       [0.0 3.0 3.0 3.0]
-//       [3.0 3.0 3.0 0.0]
-//       [3.0 3.0 3.0 3.0]
-//       [3.0 0.0 3.0 3.0]
-//     //  ^                one_hot(0)
-//     //      ^            one_hot(2)
-//     //          ^        one_hot(-1)
-//     //              ^    one_hot(1)
-//     ```
-// Suppose that
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
 //
-// ```
-//   indices = [[0, 2], [1, -1]]
-//   depth = 3
-//   on_value = 1.0
-//   off_value = 0.0
-//   axis = -1
-// ```
+// Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
 //
-// Then output is `[2 x 2 x 3]`:
+// Returns the created operation.
+func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LookupTableImportV2",
+		Input: []tf.Input{
+			table_handle, keys, values,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// MapPeekAttr is an optional argument to MapPeek.
+type MapPeekAttr func(optionalAttr)
+
+// MapPeekCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-//     ```output =
-//       [
-//         [1.0, 0.0, 0.0]  // one_hot(0)
-//         [0.0, 0.0, 1.0]  // one_hot(2)
-//       ][
-//         [0.0, 1.0, 0.0]  // one_hot(1)
-//         [0.0, 0.0, 0.0]  // one_hot(-1)
-//       ]```
+// REQUIRES: value >= 0
+func MapPeekCapacity(value int64) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapPeekMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Arguments:
-//	indices: A tensor of indices.
-//	depth: A scalar defining the depth of the one hot dimension.
-//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
-//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
+// REQUIRES: value >= 0
+func MapPeekMemoryLimit(value int64) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapPeekContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapPeekContainer(value string) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapPeekSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapPeekSharedName(value string) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op peeks at the values at the specified key.  If the
 //
-// Returns The one-hot tensor.
-func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
+// underlying container does not contain this key
+// this op will block until it does.
+func MapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapPeekAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OneHot",
+		Type: "MapPeek",
 		Input: []tf.Input{
-			indices, depth, on_value, off_value,
+			key, indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapPeek", err)
+		return
+	}
+	return values
 }
 
-// Bitcasts a tensor from one type to another without copying data.
+// Returns (x - y)(x - y) element-wise.
 //
-// Given a tensor `input`, this operation returns a tensor that has the same buffer
-// data as `input` with datatype `type`.
+// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SquaredDifference",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Forwards the input to the output.
 //
-// If the input datatype `T` is larger than the output datatype `type` then the
-// shape changes from [...] to [..., sizeof(`T`)/sizeof(`type`)].
+// This operator represents the loop termination condition used by the
+// "pivot" switches of a loop.
 //
-// If `T` is smaller than `type`, the operator requires that the rightmost
-// dimension be equal to sizeof(`type`)/sizeof(`T`). The shape then goes from
-// [..., sizeof(`type`)/sizeof(`T`)] to [...].
+// Arguments:
+//	input: A boolean scalar, representing the branch predicate of the Switch op.
 //
-// *NOTE*: Bitcast is implemented as a low-level cast, so machines with different
-// endian orderings will give different results.
-func Bitcast(scope *Scope, input tf.Output, type_ tf.DataType) (output tf.Output) {
+// Returns The same tensor as `input`.
+func LoopCond(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"type": type_}
 	opspec := tf.OpSpec{
-		Type: "Bitcast",
+		Type: "LoopCond",
 		Input: []tf.Input{
 			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Extract `patches` from `images` and put them in the "depth" output dimension.
+// QuantizedMulAttr is an optional argument to QuantizedMul.
+type QuantizedMulAttr func(optionalAttr)
+
+// QuantizedMulToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr {
+	return func(m optionalAttr) {
+		m["Toutput"] = value
+	}
+}
+
+// Returns x * y element-wise, working on quantized buffers.
 //
 // Arguments:
-//	images: 4-D Tensor with shape `[batch, in_rows, in_cols, depth]`.
-//	ksizes: The size of the sliding window for each dimension of `images`.
-//	strides: 1-D of length 4. How far the centers of two consecutive patches are in
-// the images. Must be: `[1, stride_rows, stride_cols, 1]`.
-//	rates: 1-D of length 4. Must be: `[1, rate_rows, rate_cols, 1]`. This is the
-// input stride, specifying how far two consecutive patch samples are in the
-// input. Equivalent to extracting patches with
-// `patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by
-// subsampling them spatially by a factor of `rates`. This is equivalent to
-// `rate` in dilated (a.k.a. Atrous) convolutions.
-//	padding: The type of padding algorithm to use.
 //
-// We specify the size-related attributes as:
 //
-// ```python
-//       ksizes = [1, ksize_rows, ksize_cols, 1]
-//       strides = [1, strides_rows, strides_cols, 1]
-//       rates = [1, rates_rows, rates_cols, 1]
-// ```
+//	min_x: The float value that the lowest quantized `x` value represents.
+//	max_x: The float value that the highest quantized `x` value represents.
+//	min_y: The float value that the lowest quantized `y` value represents.
+//	max_y: The float value that the highest quantized `y` value represents.
 //
-// Returns 4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows *
-// ksize_cols * depth]` containing image patches with size
-// `ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension. Note
-// `out_rows` and `out_cols` are the dimensions of the output patches.
-func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides []int64, rates []int64, padding string) (patches tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+//
+// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
+// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksizes": ksizes, "strides": strides, "rates": rates, "padding": padding}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ExtractImagePatches",
+		Type: "QuantizedMul",
 		Input: []tf.Input{
-			images,
+			x, y, min_x, max_x, min_y, max_y,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// BatchToSpace for N-D tensors of type T.
-//
-// This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
-// `block_shape + [batch]`, interleaves these blocks back into the grid defined by
-// the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
-// the input.  The spatial dimensions of this intermediate result are then
-// optionally cropped according to `crops` to produce the output.  This is the
-// reverse of SpaceToBatch.  See below for a precise description.
-//
-// Arguments:
-//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
-// where spatial_shape has M dimensions.
-//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
-//	crops: 2-D with shape `[M, 2]`, all values must be >= 0.
-//   `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
-//   dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
-//   required that
-//   `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
-//
-// This operation is equivalent to the following steps:
-//
-// 1. Reshape `input` to `reshaped` of shape:
-//      [block_shape[0], ..., block_shape[M-1],
-//       batch / prod(block_shape),
-//       input_shape[1], ..., input_shape[N-1]]
-//
-// 2. Permute dimensions of `reshaped` to produce `permuted` of shape
-//      [batch / prod(block_shape),
-//
-//       input_shape[1], block_shape[0],
-//       ...,
-//       input_shape[M], block_shape[M-1],
-//
-//       input_shape[M+1], ..., input_shape[N-1]]
-//
-// 3. Reshape `permuted` to produce `reshaped_permuted` of shape
-//      [batch / prod(block_shape),
-//
-//       input_shape[1] * block_shape[0],
-//       ...,
-//       input_shape[M] * block_shape[M-1],
-//
-//       input_shape[M+1],
-//       ...,
-//       input_shape[N-1]]
-//
-// 4. Crop the start and end of dimensions `[1, ..., M]` of
-//    `reshaped_permuted` according to `crops` to produce the output of shape:
-//      [batch / prod(block_shape),
-//
-//       input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
-//       ...,
-//       input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
-//
-//       input_shape[M+1], ..., input_shape[N-1]]
-//
-// Some examples:
-//
-// (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
-//
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 3]` and value:
-//
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
-//
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
+// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
+type QuantizedMatMulAttr func(optionalAttr)
+
+// QuantizedMatMulToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["Toutput"] = value
+	}
+}
+
+// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
 //
-// The output tensor has shape `[1, 4, 4, 1]` and value:
+// value: If true, `a` is transposed before multiplication.
+// If not specified, defaults to false
+func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_a"] = value
+	}
+}
+
+// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
 //
-// ```
-// x = [[[1],   [2],  [3],  [4]],
-//      [[5],   [6],  [7],  [8]],
-//      [[9],  [10], [11],  [12]],
-//      [[13], [14], [15],  [16]]]
-// ```
+// value: If true, `b` is transposed before multiplication.
+// If not specified, defaults to false
+func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
 //
-// (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [2, 0]]`:
+// value: The type of output produced by activation function
+// following this operation.
+// If not specified, defaults to DT_QUINT8
+func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["Tactivation"] = value
+	}
+}
+
+// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
 //
-// ```
-// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
-//      [[[0], [2], [4]]], [[[0], [10], [12]]],
-//      [[[0], [5], [7]]], [[[0], [13], [15]]],
-//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
-// ```
+// The inputs must be two-dimensional matrices and the inner dimension of
+// `a` (after being transposed if `transpose_a` is non-zero) must match the
+// outer dimension of `b` (after being transposed if `transposed_b` is
+// non-zero).
 //
-// The output tensor has shape `[2, 2, 4, 1]` and value:
+// Arguments:
+//	a: Must be a two-dimensional tensor.
+//	b: Must be a two-dimensional tensor.
+//	min_a: The float value that the lowest quantized `a` value represents.
+//	max_a: The float value that the highest quantized `a` value represents.
+//	min_b: The float value that the lowest quantized `b` value represents.
+//	max_b: The float value that the highest quantized `b` value represents.
 //
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]]],
-//      [[[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
-func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops tf.Output) (output tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BatchToSpaceND",
+		Type: "QuantizedMatMul",
 		Input: []tf.Input{
-			input, block_shape, crops,
+			a, b, min_a, max_a, min_b, max_b,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SpaceToBatch for 4-D tensors of type T.
-//
-// This is a legacy version of the more general SpaceToBatchND.
-//
-// Zero-pads and then rearranges (permutes) blocks of spatial data into batch.
-// More specifically, this op outputs a copy of the input tensor where values from
-// the `height` and `width` dimensions are moved to the `batch` dimension. After
-// the zero-padding, both `height` and `width` of the input must be divisible by the
-// block size.
+// A placeholder op that passes through `input` when its output is not fed.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, height, width, depth]`.
-//	paddings: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
-//   the padding of the input with zeros across the spatial dimensions as follows:
-//
-//       paddings = [[pad_top, pad_bottom], [pad_left, pad_right]]
-//
-//   The effective spatial dimensions of the zero-padded input tensor will be:
-//
-//       height_pad = pad_top + height + pad_bottom
-//       width_pad = pad_left + width + pad_right
-//
-// The attr `block_size` must be greater than one. It indicates the block size.
-//
-//   * Non-overlapping blocks of size `block_size x block size` in the height and
-//     width dimensions are rearranged into the batch dimension at each location.
-//   * The batch of the output tensor is `batch * block_size * block_size`.
-//   * Both height_pad and width_pad must be divisible by block_size.
-//
-// The shape of the output will be:
-//
-//     [batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
-//      depth]
-//
-// Some examples:
-//
-// (1) For the following input of shape `[1, 2, 2, 1]` and block_size of 2:
-//
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// The output tensor has shape `[4, 1, 1, 1]` and value:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// (2) For the following input of shape `[1, 2, 2, 3]` and block_size of 2:
-//
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// The output tensor has shape `[4, 1, 1, 3]` and value:
-//
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
-//
-// (3) For the following input of shape `[1, 4, 4, 1]` and block_size of 2:
-//
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]],
-//       [[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
-//
-// The output tensor has shape `[4, 2, 2, 1]` and value:
+//	input: The default value to produce when `output` is not fed.
+//	shape: The (possibly partial) shape of the tensor.
 //
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
+// Returns A placeholder tensor that defaults to `input` if it is not fed.
+func PlaceholderWithDefault(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"shape": shape}
+	opspec := tf.OpSpec{
+		Type: "PlaceholderWithDefault",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the complex conjugate of a complex number.
 //
-// (4) For the following input of shape `[2, 2, 4, 1]` and block_size of 2:
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// complex numbers that are the complex conjugate of each element in `input`. The
+// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
+// real part and *b* is the imaginary part.
 //
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]]],
-//      [[[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
+// The complex conjugate returned by this operation is of the form \\(a - bj\\).
 //
-// The output tensor has shape `[8, 1, 2, 1]` and value:
+// For example:
 //
 // ```
-// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
-//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
 // ```
-//
-// Among others, this operation is useful for reducing atrous convolution into
-// regular convolution.
-//
-func SpaceToBatch(scope *Scope, input tf.Output, paddings tf.Output, block_size int64) (output tf.Output) {
+func Conj(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"block_size": block_size}
 	opspec := tf.OpSpec{
-		Type: "SpaceToBatch",
+		Type: "Conj",
 		Input: []tf.Input{
-			input, paddings,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizeAndDequantizeV2Attr is an optional argument to QuantizeAndDequantizeV2.
-type QuantizeAndDequantizeV2Attr func(optionalAttr)
-
-// QuantizeAndDequantizeV2SignedInput sets the optional signed_input attribute to value.
-//
-// value: If the quantization is signed or unsigned.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV2SignedInput(value bool) QuantizeAndDequantizeV2Attr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
-	}
-}
+// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
+type ResourceSparseApplyMomentumAttr func(optionalAttr)
 
-// QuantizeAndDequantizeV2NumBits sets the optional num_bits attribute to value.
+// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// value: The bitwidth of the quantization.
-// If not specified, defaults to 8
-func QuantizeAndDequantizeV2NumBits(value int64) QuantizeAndDequantizeV2Attr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["num_bits"] = value
+		m["use_locking"] = value
 	}
 }
 
-// QuantizeAndDequantizeV2RangeGiven sets the optional range_given attribute to value.
+// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-// value: If the range is given or should be computed from the tensor.
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
 // If not specified, defaults to false
-func QuantizeAndDequantizeV2RangeGiven(value bool) QuantizeAndDequantizeV2Attr {
+func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["range_given"] = value
+		m["use_nesterov"] = value
 	}
 }
 
-// Quantizes then dequantizes a tensor.
-//
-// This op simulates the precision loss from the quantized forward pass by:
-// 1. Quantizing the tensor to fixed point numbers, which should match the target
-//    quantization method when it is used in inference.
-// 2. Dequantizing it back to floating point numbers for the following ops, most
-//    likely matmul.
+// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
 //
-// There are different ways to quantize. This version does not use the full range
-// of the output type, choosing to elide the lowest possible value for symmetry
-// (e.g., output range is -127 to 127, not -128 to 127 for signed 8 bit
-// quantization), so that 0.0 maps to 0.
+// Set use_nesterov = True if you want to use Nesterov momentum.
 //
-// To perform this op, we first find the range of values in our tensor. The range
-// we use is always centered on 0, so we find m such that
+// That is for rows we have grad for, we update var and accum as follows:
 //
-// 1. m = max(abs(input_min), abs(input_max)) if range_given is true,
-// 2. m = max(abs(min_elem(input)), abs(max_elem(input))) otherwise.
+// accum = accum * momentum + grad
+// var -= lr * accum
 //
-// Our input tensor range is then [-m, m].
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	momentum: Momentum. Must be a scalar.
 //
-// Next, we choose our fixed-point quantization buckets, [min_fixed, max_fixed].
-// If signed_input is true, this is
+// Returns the created operation.
+func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyMomentum",
+		Input: []tf.Input{
+			var_, accum, lr, grad, indices, momentum,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Creates a sequence of numbers.
 //
-//   [min_fixed, max_fixed ] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1].
+// This operation creates a sequence of numbers that begins at `start` and
+// extends by increments of `delta` up to but not including `limit`.
 //
-// Otherwise, if signed_input is false, the fixed-point range is
+// For example:
 //
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1].
+// ```
+// # 'start' is 3
+// # 'limit' is 18
+// # 'delta' is 3
+// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
+// ```
 //
-// From this we compute our scaling factor, s:
+// Arguments:
+//	start: 0-D (scalar). First entry in the sequence.
+//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
+//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
 //
-//   s = (max_fixed - min_fixed) / (2 * m).
+// Returns 1-D.
+func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Range",
+		Input: []tf.Input{
+			start, limit, delta,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes gradients for SparseSegmentSqrtN.
 //
-// Now we can quantize and dequantize the elements of our tensor.  An element e
-// is transformed into e':
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
 //
-//   e' = (e * s).round_to_nearest() / s.
+// Arguments:
+//	grad: gradient propagated to the SparseSegmentSqrtN op.
+//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
+func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentSqrtNGrad",
+		Input: []tf.Input{
+			grad, indices, segment_ids, output_dim0,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the mean along sparse segments of a tensor.
 //
-// Note that we have a different number of buckets in the signed vs. unsigned
-// cases.  For example, if num_bits == 8, we get 254 buckets in the signed case
-// vs. 255 in the unsigned case.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// For example, suppose num_bits = 8 and m = 1.  Then
+// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
 //
-//   [min_fixed, max_fixed] = [-127, 127], and
-//   s = (127 + 127) / 2 = 127.
+// Arguments:
 //
-// Given the vector {-1, -0.5, 0, 0.3}, this is quantized to
-// {-127, -63, 0, 38}, and dequantized to {-1, -63.0/127, 0, 38.0/127}.
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
 //
-// Arguments:
-//	input: Tensor to quantize and then dequantize.
-//	input_min: If range_given, this is the min of the range, otherwise this input
-// will be ignored.
-//	input_max: If range_given, this is the max of the range, otherwise this input
-// will be ignored.
-func QuantizeAndDequantizeV2(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, optional ...QuantizeAndDequantizeV2Attr) (output tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantizeV2",
+		Type: "SparseSegmentMean",
 		Input: []tf.Input{
-			input, input_min, input_max,
+			data, indices, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SpaceToBatch for N-D tensors of type T.
-//
-// This operation divides "spatial" dimensions `[1, ..., M]` of the input into a
-// grid of blocks of shape `block_shape`, and interleaves these blocks with the
-// "batch" dimension (0) such that in the output, the spatial dimensions
-// `[1, ..., M]` correspond to the position within the grid, and the batch
-// dimension combines both the position within a spatial block and the original
-// batch position.  Prior to division into blocks, the spatial dimensions of the
-// input are optionally zero padded according to `paddings`.  See below for a
-// precise description.
+// Pop the element at the top of the stack.
 //
 // Arguments:
-//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
-// where spatial_shape has `M` dimensions.
-//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
-//	paddings: 2-D with shape `[M, 2]`, all values must be >= 0.
-//   `paddings[i] = [pad_start, pad_end]` specifies the padding for input dimension
-//   `i + 1`, which corresponds to spatial dimension `i`.  It is required that
-//   `block_shape[i]` divides `input_shape[i + 1] + pad_start + pad_end`.
-//
-// This operation is equivalent to the following steps:
-//
-// 1. Zero-pad the start and end of dimensions `[1, ..., M]` of the
-//    input according to `paddings` to produce `padded` of shape `padded_shape`.
-//
-// 2. Reshape `padded` to `reshaped_padded` of shape:
-//
-//      [batch] +
-//      [padded_shape[1] / block_shape[0],
-//        block_shape[0],
-//       ...,
-//       padded_shape[M] / block_shape[M-1],
-//       block_shape[M-1]] +
-//      remaining_shape
-//
-// 3. Permute dimensions of `reshaped_padded` to produce
-//    `permuted_reshaped_padded` of shape:
-//
-//      block_shape +
-//      [batch] +
-//      [padded_shape[1] / block_shape[0],
-//       ...,
-//       padded_shape[M] / block_shape[M-1]] +
-//      remaining_shape
+//	handle: The handle to a stack.
+//	elem_type: The type of the elem that is popped.
 //
-// 4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the batch
-//    dimension, producing an output tensor of shape:
+// Returns The tensor that is popped from the top of the stack.
+func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"elem_type": elem_type}
+	opspec := tf.OpSpec{
+		Type: "StackPopV2",
+		Input: []tf.Input{
+			handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the sum along sparse segments of a tensor.
 //
-//      [batch * prod(block_shape)] +
-//      [padded_shape[1] / block_shape[0],
-//       ...,
-//       padded_shape[M] / block_shape[M-1]] +
-//      remaining_shape
+// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
 //
-// Some examples:
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// (1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and
-//     `paddings = [[0, 0], [0, 0]]`:
+// For example:
 //
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
 //
-// The output tensor has shape `[4, 1, 1, 1]` and value:
+// tf.sparse_segment_sum_with_num_segments(
+//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
+// # => [[0 0 0 0]
+// #     [0 0 0 0]
+// #     [0 0 0 0]]
 //
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// tf.sparse_segment_sum_with_num_segments(c,
+//                                         tf.constant([0, 1]),
+//                                         tf.constant([0, 2],
+//                                         num_segments=4))
+// # => [[ 1  2  3  4]
+// #     [ 0  0  0  0]
+// #     [-1 -2 -3 -4]
+// #     [ 0  0  0  0]]
 // ```
 //
-// (2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and
-//     `paddings = [[0, 0], [0, 0]]`:
+// Arguments:
 //
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
 //
-// The output tensor has shape `[4, 1, 1, 3]` and value:
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentSumWithNumSegments",
+		Input: []tf.Input{
+			data, indices, segment_ids, num_segments,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SparseToDenseAttr is an optional argument to SparseToDense.
+type SparseToDenseAttr func(optionalAttr)
+
+// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
 //
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
+// value: If true, indices are checked to make sure they are sorted in
+// lexicographic order and that there are no repeats.
+// If not specified, defaults to true
+func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Converts a sparse representation into a dense tensor.
 //
-// (3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and
-//     `paddings = [[0, 0], [0, 0]]`:
+// Builds an array `dense` with shape `output_shape` such that
 //
 // ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]],
-//       [[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
+// # If sparse_indices is scalar
+// dense[i] = (i == sparse_indices ? sparse_values : default_value)
 //
-// The output tensor has shape `[4, 2, 2, 1]` and value:
+// # If sparse_indices is a vector, then for each i
+// dense[sparse_indices[i]] = sparse_values[i]
 //
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
+// # If sparse_indices is an n by d matrix, then for each i in [0, n)
+// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
 // ```
 //
-// (4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and
-//     paddings = `[[0, 0], [2, 0]]`:
-//
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]]],
-//      [[[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
+// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
+// scalar, all sparse indices are set to this single value.
 //
-// The output tensor has shape `[8, 1, 3, 1]` and value:
+// Indices should be sorted in lexicographic order, and indices must not
+// contain any repeats. If `validate_indices` is true, these properties
+// are checked during execution.
 //
-// ```
-// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
-//      [[[0], [2], [4]]], [[[0], [10], [12]]],
-//      [[[0], [5], [7]]], [[[0], [13], [15]]],
-//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
-// ```
+// Arguments:
+//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
+// index where `sparse_values[i]` will be placed.
+//	output_shape: 1-D.  Shape of the dense output tensor.
+//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
+// or a scalar value to be used for all sparse indices.
+//	default_value: Scalar value to set for indices not specified in
+// `sparse_indices`.
 //
-// Among others, this operation is useful for reducing atrous convolution into
-// regular convolution.
-func SpaceToBatchND(scope *Scope, input tf.Output, block_shape tf.Output, paddings tf.Output) (output tf.Output) {
+// Returns Dense output tensor of shape `output_shape`.
+func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SpaceToBatchND",
+		Type: "SparseToDense",
 		Input: []tf.Input{
-			input, block_shape, paddings,
+			sparse_indices, output_shape, sparse_values, default_value,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SqueezeAttr is an optional argument to Squeeze.
-type SqueezeAttr func(optionalAttr)
-
-// SqueezeSqueezeDims sets the optional squeeze_dims attribute to value.
+// Counts the number of occurrences of each value in an integer array.
 //
-// value: If specified, only squeezes the dimensions listed. The dimension
-// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
-// be in the range `[-rank(input), rank(input))`.
-// If not specified, defaults to <>
+// Outputs a vector with length `size` and the same dtype as `weights`. If
+// `weights` are empty, then index `i` stores the number of times the value `i` is
+// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
+// the value in `weights` at each index where the corresponding value in `arr` is
+// `i`.
 //
-// REQUIRES: len(value) >= 0
-func SqueezeSqueezeDims(value []int64) SqueezeAttr {
-	return func(m optionalAttr) {
-		m["squeeze_dims"] = value
+// Values in `arr` outside of the range [0, size) are ignored.
+//
+// Arguments:
+//	arr: int32 `Tensor`.
+//	size: non-negative int32 scalar `Tensor`.
+//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
+// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
+// equal to 1.
+//
+// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
+// each value in the range [0, size).
+func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Bincount",
+		Input: []tf.Input{
+			arr, size, weights,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Removes dimensions of size 1 from the shape of a tensor.
+// Computes the sum along sparse segments of a tensor.
 //
-// Given a tensor `input`, this operation returns a tensor of the same type with
-// all dimensions of size 1 removed. If you don't want to remove all size 1
-// dimensions, you can remove specific size 1 dimensions by specifying
-// `squeeze_dims`.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
 //
 // For example:
 //
-// ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t)) ==> [2, 3]
-// ```
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
 //
-// Or, to remove specific size 1 dimensions:
+// # Select two rows, one segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
+// # => [[0 0 0 0]]
 //
-// ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
+// # Select two rows, two segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
+// # => [[ 1  2  3  4]
+// #     [-1 -2 -3 -4]]
+//
+// # Select all rows, two segments.
+// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
+// # => [[0 0 0 0]
+// #     [5 6 7 8]]
+//
+// # Which is equivalent to:
+// tf.segment_sum(c, tf.constant([0, 0, 1]))
 // ```
 //
 // Arguments:
-//	input: The `input` to squeeze.
 //
-// Returns Contains the same data as `input`, but has one or more dimensions of
-// size 1 removed.
-func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Squeeze",
+		Type: "SparseSegmentSum",
 		Input: []tf.Input{
-			input,
+			data, indices, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// A placeholder op for a value that will be fed into the computation.
-//
-// DEPRECATED at GraphDef version 23: Placeholder now behaves the same as PlaceholderV2.
-//
-// N.B. This operation will fail with an error if it is executed. It is
-// intended as a way to represent a value that will always be fed, and to
-// provide attrs that enable the fed value to be checked at runtime.
-//
-// Arguments:
-//	dtype: The type of elements in the tensor.
-//	shape: The shape of the tensor. The shape can be any partially-specified
-// shape.  To be unconstrained, pass in a shape with unknown rank.
-//
-// Returns A placeholder tensor that must be replaced using the feed mechanism.
-func PlaceholderV2(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
+// Computes hyperbolic sine of x element-wise.
+func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "PlaceholderV2",
-
-		Attrs: attrs,
+		Type: "Sinh",
+		Input: []tf.Input{
+			x,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Pads a tensor with mirrored values.
+// Computes the sum along segments of a tensor.
 //
-// This operation pads a `input` with mirrored values according to the `paddings`
-// you specify. `paddings` is an integer tensor with shape `[n, 2]`, where n is
-// the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many values to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many values to add after the contents of `input`
-// in that dimension. Both `paddings[D, 0]` and `paddings[D, 1]` must be no greater
-// than `input.dim_size(D)` (or `input.dim_size(D) - 1`) if `copy_border` is true
-// (if false, respectively).
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// The padded size of each dimension D of the output is:
+// Computes a tensor such that
+// `(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such
+// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
+// need not be sorted and need not cover all values in the full
+// range of valid values.
 //
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+// If the given segment ID `i` is negative, the value is dropped and will not be
+// added to the sum of the segment.
 //
-// For example:
+// `num_segments` should equal the number of distinct segment IDs.
 //
-// ```
-// # 't' is [[1, 2, 3], [4, 5, 6]].
-// # 'paddings' is [[1, 1]], [2, 2]].
-// # 'mode' is SYMMETRIC.
-// # rank of 't' is 2.
-// pad(t, paddings) ==> [[2, 1, 1, 2, 3, 3, 2]
-//                       [2, 1, 1, 2, 3, 3, 2]
-//                       [5, 4, 4, 5, 6, 6, 5]
-//                       [5, 4, 4, 5, 6, 6, 5]]
-// ```
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
+// </div>
 //
 // Arguments:
-//	input: The input tensor to be padded.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	mode: Either `REFLECT` or `SYMMETRIC`. In reflect mode the padded regions
-// do not include the borders, while in symmetric mode the padded regions
-// do include the borders. For example, if `input` is `[1, 2, 3]` and `paddings`
-// is `[0, 2]`, then the output is `[1, 2, 3, 2, 1]` in reflect mode, and
-// it is `[1, 2, 3, 3, 2]` in symmetric mode.
 //
-// Returns The padded tensor.
-func MirrorPad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) {
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode}
 	opspec := tf.OpSpec{
-		Type: "MirrorPad",
+		Type: "UnsortedSegmentSum",
 		Input: []tf.Input{
-			input, paddings,
+			data, segment_ids, num_segments,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Return the reduction indices for computing gradients of s0 op s1 with broadcast.
+// Returns which elements of x are finite.
 //
-// This is typically used by gradient computations for a broadcasting operation.
-func BroadcastGradientArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output, r1 tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.isfinite
+// @end_compatibility
+func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BroadcastGradientArgs",
+		Type: "IsFinite",
 		Input: []tf.Input{
-			s0, s1,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Return the shape of s0 op s1 with broadcast.
+// MatMulAttr is an optional argument to MatMul.
+type MatMulAttr func(optionalAttr)
+
+// MatMulTransposeA sets the optional transpose_a attribute to value.
 //
-// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
-// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
-func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
+// value: If true, "a" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeA(value bool) MatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_a"] = value
+	}
+}
+
+// MatMulTransposeB sets the optional transpose_b attribute to value.
+//
+// value: If true, "b" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeB(value bool) MatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// Multiply the matrix "a" by the matrix "b".
+//
+// The inputs must be two-dimensional matrices and the inner dimension of
+// "a" (after being transposed if transpose_a is true) must match the
+// outer dimension of "b" (after being transposed if transposed_b is
+// true).
+//
+// *Note*: The default kernel implementation for MatMul on GPUs uses
+// cublas.
+func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BroadcastArgs",
+		Type: "MatMul",
 		Input: []tf.Input{
-			s0, s1,
+			a, b,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns locations of nonzero / true values in a tensor.
+// Selects elements from `x` or `y`, depending on `condition`.
 //
-// This operation returns the coordinates of true elements in `input`. The
-// coordinates are returned in a 2-D tensor where the first dimension (rows)
-// represents the number of true elements, and the second dimension (columns)
-// represents the coordinates of the true elements. Keep in mind, the shape of
-// the output tensor can vary depending on how many true values there are in
-// `input`. Indices are output in row-major order.
+// The `x`, and `y` tensors must all have the same shape, and the
+// output will also have that shape.
 //
-// For example:
+// The `condition` tensor must be a scalar if `x` and `y` are scalars.
+// If `x` and `y` are vectors or higher rank, then `condition` must be either a
+// scalar, a vector with size matching the first dimension of `x`, or must have
+// the same shape as `x`.
 //
-// ```
-// # 'input' tensor is [[True, False]
-// #                    [True, False]]
-// # 'input' has two true values, so output has two coordinates.
-// # 'input' has rank of 2, so coordinates have two indices.
-// where(input) ==> [[0, 0],
-//                   [1, 0]]
+// The `condition` tensor acts as a mask that chooses, based on the value at each
+// element, whether the corresponding element / row in the output should be
+// taken from `x` (if true) or `y` (if false).
 //
-// # `input` tensor is [[[True, False]
-// #                     [True, False]]
-// #                    [[False, True]
-// #                     [False, True]]
-// #                    [[False, False]
-// #                     [False, True]]]
-// # 'input' has 5 true values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
+// If `condition` is a vector and `x` and `y` are higher rank matrices, then
+// it chooses which row (outer dimension) to copy from `x` and `y`.
+// If `condition` has the same shape as `x` and `y`, then it chooses which
+// element to copy from `x` and `y`.
+//
+// For example:
+//
+// ```python
+// # 'condition' tensor is [[True,  False]
+// #                        [False, True]]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e)  # => [[1, 6], [7, 4]]
 //
-// # `input` tensor is [[[1.5,  0.0]
-// #                     [-0.5, 0.0]]
-// #                    [[0.0,  0.25]
-// #                     [0.0,  0.75]]
-// #                    [[0.0,  0.0]
-// #                     [0.0,  0.01]]]
-// # 'input' has 5 nonzero values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
 //
-// # `input` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
-// #                     [0.0 + 0.5j, 0.0  + 0.0j]]
-// #                    [[0.0 + 0.0j, 0.25 + 1.5j]
-// #                     [0.0 + 0.0j, 0.75 + 0.0j]]
-// #                    [[0.0 + 0.0j, 0.0  + 0.0j]
-// #                     [0.0 + 0.0j, 0.01 + 0.0j]]]
-// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
+// # 'condition' tensor is [True, False]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e) ==> [[1, 2],
+//                              [7, 8]]
+//
 // ```
-func Where(scope *Scope, input tf.Output) (index tf.Output) {
+//
+// Arguments:
+//
+//	x: = A `Tensor` which may have the same shape as `condition`.
+// If `condition` is rank 1, `x` may have higher rank,
+// but its first dimension must match the size of `condition`.
+//	y: = A `Tensor` with the same type and shape as `x`.
+//
+// Returns = A `Tensor` with the same type and shape as `x` and `y`.
+func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Where",
+		Type: "Select",
 		Input: []tf.Input{
-			input,
+			condition, x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the gradient of `Tile`.
-//
-// DEPRECATED at GraphDef version 3: TileGrad has been replaced with reduce_sum
+// Returns the truth value of x OR y element-wise.
 //
-// Since `Tile` takes an input and repeats the input `multiples` times
-// along each dimension, `TileGrad` takes in `multiples` and aggregates
-// each repeated tile of `input` into `output`.
-func TileGrad(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
+// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TileGrad",
+		Type: "LogicalOr",
 		Input: []tf.Input{
-			input, multiples,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StridedSliceGradAttr is an optional argument to StridedSliceGrad.
-type StridedSliceGradAttr func(optionalAttr)
-
-// StridedSliceGradBeginMask sets the optional begin_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradBeginMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["begin_mask"] = value
-	}
-}
-
-// StridedSliceGradEndMask sets the optional end_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradEndMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["end_mask"] = value
-	}
-}
-
-// StridedSliceGradEllipsisMask sets the optional ellipsis_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradEllipsisMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
-	}
-}
-
-// StridedSliceGradNewAxisMask sets the optional new_axis_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradNewAxisMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
-	}
-}
-
-// StridedSliceGradShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradShrinkAxisMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
-	}
-}
-
-// Returns the gradient of `StridedSlice`.
+// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
 //
-// Since `StridedSlice` cuts out pieces of its `input` which is size
-// `shape`, its gradient will have the same shape (which is passed here
-// as `shape`). The gradient will be zero in any element that the slice
-// does not select.
+// The regularized incomplete beta integral is defined as:
 //
-// Arguments are the same as StridedSliceGrad with the exception that
-// `dy` is the input gradient to be propagated and `shape` is the
-// shape of `StridedSlice`'s `input`.
-func StridedSliceGrad(scope *Scope, shape tf.Output, begin tf.Output, end tf.Output, strides tf.Output, dy tf.Output, optional ...StridedSliceGradAttr) (output tf.Output) {
+//
+// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
+//
+// where
+//
+//
+// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
+//
+//
+// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
+// beta function.
+func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StridedSliceGrad",
+		Type: "Betainc",
 		Input: []tf.Input{
-			shape, begin, end, strides, dy,
+			a, b, x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Return a slice from 'input'.
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
 //
-// The output tensor is a tensor with dimensions described by 'size'
-// whose values are extracted from 'input' starting at the offsets in
-// 'begin'.
+// N is the size of the segment being reduced.
 //
-// *Requirements*:
-//   0 <= begin[i] <= begin[i] + size[i] <= Di  for i in [0, n)
+// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
 // Arguments:
 //
-//	begin: begin[i] specifies the offset into the 'i'th dimension of
-// 'input' to slice from.
-//	size: size[i] specifies the number of elements of the 'i'th dimension
-// of 'input' to slice. If size[i] is -1, all remaining elements in dimension
-// i are included in the slice (i.e. this is equivalent to setting
-// size[i] = input.dim_size(i) - begin[i]).
-func Slice(scope *Scope, input tf.Output, begin tf.Output, size tf.Output) (output tf.Output) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Slice",
+		Type: "SparseSegmentSqrtNWithNumSegments",
 		Input: []tf.Input{
-			input, begin, size,
+			data, indices, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Shuffle dimensions of x according to a permutation and conjugate the result.
+// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
 //
-// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-//   `y[i,j,k,...,s,t,u] == conj(x[perm[i], perm[j], perm[k],...,perm[s], perm[t], perm[u]])`
-func ConjugateTranspose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
+// The upper regularized incomplete Gamma function is defined as:
+//
+// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
+//
+// where
+//
+// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+//
+// is the upper incomplete Gama function.
+//
+// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
+// Gamma function.
+func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ConjugateTranspose",
+		Type: "Igammac",
 		Input: []tf.Input{
-			x, perm,
+			a, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Checks a tensor for NaN and Inf values.
+// FakeQuantWithMinMaxVarsGradientAttr is an optional argument to FakeQuantWithMinMaxVarsGradient.
+type FakeQuantWithMinMaxVarsGradientAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxVarsGradientNumBits sets the optional num_bits attribute to value.
 //
-// When run, reports an `InvalidArgument` error if `tensor` has any values
-// that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is.
+// value: The bitwidth of the quantization; between 2 and 8, inclusive.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsGradientNumBits(value int64) FakeQuantWithMinMaxVarsGradientAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// FakeQuantWithMinMaxVarsGradientNarrowRange sets the optional narrow_range attribute to value.
+//
+// value: Whether to quantize into 2^num_bits - 1 distinct values.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxVarsGradientNarrowRange(value bool) FakeQuantWithMinMaxVarsGradientAttr {
+	return func(m optionalAttr) {
+		m["narrow_range"] = value
+	}
+}
+
+// Compute gradients for a FakeQuantWithMinMaxVars operation.
 //
 // Arguments:
+//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation.
+//	inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation.
+// min, max: Quantization interval, scalar floats.
 //
-//	message: Prefix of the error message.
-func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Output) {
+//
+//
+// Returns Backpropagated gradients w.r.t. inputs:
+// `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter:
+// `sum(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter:
+// `sum(gradients * (inputs > max))`.
+func FakeQuantWithMinMaxVarsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsGradientAttr) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"message": message}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "CheckNumerics",
+		Type: "FakeQuantWithMinMaxVarsGradient",
 		Input: []tf.Input{
-			tensor,
+			gradients, inputs, min, max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// PreventGradientAttr is an optional argument to PreventGradient.
-type PreventGradientAttr func(optionalAttr)
+// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
+type LogUniformCandidateSamplerAttr func(optionalAttr)
 
-// PreventGradientMessage sets the optional message attribute to value.
+// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// value: Will be printed in the error when anyone tries to differentiate
-// this operation.
-// If not specified, defaults to ""
-func PreventGradientMessage(value string) PreventGradientAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["message"] = value
+		m["seed"] = value
 	}
 }
 
-// An identity op that triggers an error if a gradient is requested.
+// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// When executed in a graph, this op outputs its input tensor as-is.
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a log-uniform distribution.
 //
-// When building ops to compute gradients, the TensorFlow gradient system
-// will return an error when trying to lookup the gradient of this op,
-// because no gradient must ever be registered for this function.  This
-// op exists to prevent subtle bugs from silently returning unimplemented
-// gradients in some corner cases.
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	input: any tensor.
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
 //
-// Returns the same input tensor.
-func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PreventGradient",
+		Type: "LogUniformCandidateSampler",
 		Input: []tf.Input{
-			input,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Stops gradient computation.
-//
-// When executed in a graph, this op outputs its input tensor as-is.
-//
-// When building ops to compute gradients, this op prevents the contribution of
-// its inputs to be taken into account.  Normally, the gradient generator adds ops
-// to a graph to compute the derivatives of a specified 'loss' by recursively
-// finding out inputs that contributed to its computation.  If you insert this op
-// in the graph it inputs are masked from the gradient generator.  They are not
-// taken into account for computing gradients.
-//
-// This is useful any time you want to compute a value with TensorFlow but need
-// to pretend that the value was a constant. Some examples include:
-//
-// *  The *EM* algorithm where the *M-step* should not involve backpropagation
-//    through the output of the *E-step*.
-// *  Contrastive divergence training of Boltzmann machines where, when
-//    differentiating the energy function, the training must not backpropagate
-//    through the graph that generated the samples from the model.
-// *  Adversarial training, where no backprop should happen through the adversarial
-//    example generation process.
-func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
+// ApproximateEqualAttr is an optional argument to ApproximateEqual.
+type ApproximateEqualAttr func(optionalAttr)
+
+// ApproximateEqualTolerance sets the optional tolerance attribute to value.
+// If not specified, defaults to 1e-05
+func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
+	return func(m optionalAttr) {
+		m["tolerance"] = value
+	}
+}
+
+// Returns the truth value of abs(x-y) < tolerance element-wise.
+func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StopGradient",
+		Type: "ApproximateEqual",
 		Input: []tf.Input{
-			input,
+			x, y,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Gather slices from `params` into a Tensor with shape specified by `indices`.
-//
-// `indices` is an K-dimensional integer tensor, best thought of as a
-// (K-1)-dimensional tensor of indices into `params`, where each element defines a
-// slice of `params`:
-//
-//     output[i_0, ..., i_{K-2}] = params[indices[i0, ..., i_{K-2}]]
-//
-// Whereas in @{tf.gather} `indices` defines slices into the first
-// dimension of `params`, in `tf.gather_nd`, `indices` defines slices into the
-// first `N` dimensions of `params`, where `N = indices.shape[-1]`.
-//
-// The last dimension of `indices` can be at most the rank of
-// `params`:
-//
-//     indices.shape[-1] <= params.rank
-//
-// The last dimension of `indices` corresponds to elements
-// (if `indices.shape[-1] == params.rank`) or slices
-// (if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`
-// of `params`.  The output tensor has shape
-//
-//     indices.shape[:-1] + params.shape[indices.shape[-1]:]
-//
-// Some examples below.
-//
-// Simple indexing into a matrix:
-//
-// ```python
-//     indices = [[0, 0], [1, 1]]
-//     params = [['a', 'b'], ['c', 'd']]
-//     output = ['a', 'd']
-// ```
-//
-// Slice indexing into a matrix:
-//
-// ```python
-//     indices = [[1], [0]]
-//     params = [['a', 'b'], ['c', 'd']]
-//     output = [['c', 'd'], ['a', 'b']]
-// ```
-//
-// Indexing into a 3-tensor:
-//
-// ```python
-//     indices = [[1]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [[['a1', 'b1'], ['c1', 'd1']]]
-//
-//
-//     indices = [[0, 1], [1, 0]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [['c0', 'd0'], ['a1', 'b1']]
-//
-//
-//     indices = [[0, 0, 1], [1, 0, 1]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = ['b0', 'b1']
-// ```
-//
-// Batched indexing into a matrix:
-//
-// ```python
-//     indices = [[[0, 0]], [[0, 1]]]
-//     params = [['a', 'b'], ['c', 'd']]
-//     output = [['a'], ['b']]
-// ```
-//
-// Batched slice indexing into a matrix:
-//
-// ```python
-//     indices = [[[1]], [[0]]]
-//     params = [['a', 'b'], ['c', 'd']]
-//     output = [[['c', 'd']], [['a', 'b']]]
-// ```
-//
-// Batched indexing into a 3-tensor:
-//
-// ```python
-//     indices = [[[1]], [[0]]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [[[['a1', 'b1'], ['c1', 'd1']]],
-//               [[['a0', 'b0'], ['c0', 'd0']]]]
-//
-//     indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [[['c0', 'd0'], ['a1', 'b1']],
-//               [['a0', 'b0'], ['c1', 'd1']]]
-//
-//
-//     indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [['b0', 'b1'], ['d0', 'c1']]
-// ```
-//
-// Arguments:
-//	params: The tensor from which to gather values.
-//	indices: Index tensor.
+// Returns x / y element-wise.
 //
-// Returns Values from `params` gathered from indices given by `indices`, with
-// shape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`.
-func GatherNd(scope *Scope, params tf.Output, indices tf.Output) (output tf.Output) {
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GatherNd",
+		Type: "Div",
 		Input: []tf.Input{
-			params, indices,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EditDistanceAttr is an optional argument to EditDistance.
-type EditDistanceAttr func(optionalAttr)
-
-// EditDistanceNormalize sets the optional normalize attribute to value.
-//
-// value: boolean (if true, edit distances are normalized by length of truth).
-//
-// The output is:
-// If not specified, defaults to true
-func EditDistanceNormalize(value bool) EditDistanceAttr {
-	return func(m optionalAttr) {
-		m["normalize"] = value
-	}
-}
-
-// Computes the (possibly normalized) Levenshtein Edit Distance.
-//
-// The inputs are variable-length sequences provided by SparseTensors
-//   (hypothesis_indices, hypothesis_values, hypothesis_shape)
-// and
-//   (truth_indices, truth_values, truth_shape).
-//
-// The inputs are:
-//
-// Arguments:
-//	hypothesis_indices: The indices of the hypothesis list SparseTensor.
-// This is an N x R int64 matrix.
-//	hypothesis_values: The values of the hypothesis list SparseTensor.
-// This is an N-length vector.
-//	hypothesis_shape: The shape of the hypothesis list SparseTensor.
-// This is an R-length vector.
-//	truth_indices: The indices of the truth list SparseTensor.
-// This is an M x R int64 matrix.
-//	truth_values: The values of the truth list SparseTensor.
-// This is an M-length vector.
-//	truth_shape: truth indices, vector.
-//
-// Returns A dense float tensor with rank R - 1.
-//
-// For the example input:
-//
-//     // hypothesis represents a 2x1 matrix with variable-length values:
-//     //   (0,0) = ["a"]
-//     //   (1,0) = ["b"]
-//     hypothesis_indices = [[0, 0, 0],
-//                           [1, 0, 0]]
-//     hypothesis_values = ["a", "b"]
-//     hypothesis_shape = [2, 1, 1]
-//
-//     // truth represents a 2x2 matrix with variable-length values:
-//     //   (0,0) = []
-//     //   (0,1) = ["a"]
-//     //   (1,0) = ["b", "c"]
-//     //   (1,1) = ["a"]
-//     truth_indices = [[0, 1, 0],
-//                      [1, 0, 0],
-//                      [1, 0, 1],
-//                      [1, 1, 0]]
-//     truth_values = ["a", "b", "c", "a"]
-//     truth_shape = [2, 2, 2]
-//     normalize = true
-//
-// The output will be:
+// Returns x * y element-wise.
 //
-//     // output is a 2x2 matrix with edit distances normalized by truth lengths.
-//     output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
-//               [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
-func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) {
+// *NOTE*: `Multiply` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "EditDistance",
+		Type: "Mul",
 		Input: []tf.Input{
-			hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a batched matrix tensor with new batched diagonal values.
+// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
+type SparseReduceSumSparseAttr func(optionalAttr)
+
+// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
 //
-// Given `input` and `diagonal`, this operation returns a tensor with the
-// same shape and values as `input`, except for the main diagonal of the
-// innermost matrices.  These will be overwritten by the values in `diagonal`.
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the sum of elements across dimensions of a SparseTensor.
 //
-// The output is computed as follows:
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
+// SparseTensor.
 //
-// Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has
-// `k` dimensions `[I, J, K, ..., min(M, N)]`.  Then the output is a
-// tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where:
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
 //
-//   * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`.
-//   * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`.
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
 // Arguments:
-//	input: Rank `k+1`, where `k >= 1`.
-//	diagonal: Rank `k`, where `k >= 1`.
-//
-// Returns Rank `k+1`, with `output.shape = input.shape`.
-func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf.Output) {
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSetDiag",
+		Type: "SparseReduceSumSparse",
 		Input: []tf.Input{
-			input, diagonal,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns the diagonal part of the tensor.
-//
-// This operation returns a tensor with the `diagonal` part
-// of the `input`. The `diagonal` part is computed as follows:
-//
-// Assume `input` has dimensions `[D1,..., Dk, D1,..., Dk]`, then the output is a
-// tensor of rank `k` with dimensions `[D1,..., Dk]` where:
-//
-// `diagonal[i1,..., ik] = input[i1, ..., ik, i1,..., ik]`.
-//
-// For example:
+// BiasAddAttr is an optional argument to BiasAdd.
+type BiasAddAttr func(optionalAttr)
+
+// BiasAddDataFormat sets the optional data_format attribute to value.
 //
-// ```
-// # 'input' is [[1, 0, 0, 0]
-//               [0, 2, 0, 0]
-//               [0, 0, 3, 0]
-//               [0, 0, 0, 4]]
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the bias tensor will be added to the last dimension
+// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// The tensor will be added to "in_channels", the third-to-the-last
+//     dimension.
+// If not specified, defaults to "NHWC"
+func BiasAddDataFormat(value string) BiasAddAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Adds `bias` to `value`.
 //
-// tf.diag_part(input) ==> [1, 2, 3, 4]
-// ```
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
 //
 // Arguments:
-//	input: Rank k tensor where k is even and not zero.
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
 //
-// Returns The extracted diagonal.
-func DiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) {
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DiagPart",
+		Type: "BiasAdd",
 		Input: []tf.Input{
-			input,
+			value, bias,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DequantizeAttr is an optional argument to Dequantize.
-type DequantizeAttr func(optionalAttr)
+// BiasAddGradAttr is an optional argument to BiasAddGrad.
+type BiasAddGradAttr func(optionalAttr)
 
-// DequantizeMode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func DequantizeMode(value string) DequantizeAttr {
+// BiasAddGradDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the bias tensor will be added to the last dimension
+// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// The tensor will be added to "in_channels", the third-to-the-last
+//     dimension.
+// If not specified, defaults to "NHWC"
+func BiasAddGradDataFormat(value string) BiasAddGradAttr {
 	return func(m optionalAttr) {
-		m["mode"] = value
+		m["data_format"] = value
 	}
 }
 
-// Dequantize the 'input' tensor into a float Tensor.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.
-//
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-//
-// ```
-// if T == qint8, in[i] += (range(T) + 1)/ 2.0
-// out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
-// ```
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-//
-// *MIN_COMBINED Mode Example*
-//
-// If the input comes from a QuantizedRelu6, the output type is
-// quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
-// 0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
-// Dequantize on quint8 will take each value, cast to float, and multiply
-// by 6 / 255.
-// Note that if quantizedtype is qint8, the operation will additionally add
-// each value by 128 prior to casting.
-//
-// If the mode is 'MIN_FIRST', then this approach is used:
-//
-// ```c++
-// num_discrete_values = 1 << (# of bits in T)
-// range_adjust = num_discrete_values / (num_discrete_values - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = range / num_discrete_values
-// const double offset_input = static_cast<double>(input) - lowest_quantized;
-// result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
-// ```
-//
-// *SCALED mode Example*
-//
-// `SCALED` mode matches the quantization approach used in
-// `QuantizeAndDequantize{V2|V3}`.
-//
-// If the mode is `SCALED`, we do not use the full range of the output type,
-// choosing to elide the lowest possible value for symmetry (e.g., output range is
-// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-// 0.
-//
-// We first find the range of values in our tensor. The
-// range we use is always centered on 0, so we find m such that
-// ```c++
-//   m = max(abs(input_min), abs(input_max))
-// ```
-//
-// Our input tensor range is then `[-m, m]`.
-//
-// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-// If T is signed, this is
-// ```
-//   num_bits = sizeof(T) * 8
-//   [min_fixed, max_fixed] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-// ```
-//
-// Otherwise, if T is unsigned, the fixed-point range is
-// ```
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-// ```
-//
-// From this we compute our scaling factor, s:
-// ```c++
-//   s = (2 * m) / (max_fixed - min_fixed)
-// ```
+// The backward operation for "BiasAdd" on the "bias" tensor.
 //
-// Now we can dequantize the elements of our tensor:
-// ```c++
-// result = input * s
-// ```
+// It accumulates all the values from out_backprop into the feature dimension.
+// For NHWC data format, the feature dimension is the last. For NCHW data format,
+// the feature dimension is the third-to-last.
 //
 // Arguments:
+//	out_backprop: Any number of dimensions.
 //
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
-func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, optional ...DequantizeAttr) (output tf.Output) {
+// Returns 1-D with size the feature dimension of `out_backprop`.
+func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -1989,9 +1978,9 @@ func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Dequantize",
+		Type: "BiasAddGrad",
 		Input: []tf.Input{
-			input, min_range, max_range,
+			out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -1999,273 +1988,231 @@ func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf
 	return op.Output(0)
 }
 
-// Returns a tensor of zeros with the same shape and type as x.
-//
-// Arguments:
-//	x: a tensor of type T.
+// Returns x + y element-wise.
 //
-// Returns a tensor of the same shape and type as x but filled with zeros.
-func ZerosLike(scope *Scope, x tf.Output) (y tf.Output) {
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ZerosLike",
+		Type: "AddV2",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Splits a tensor into `num_split` tensors along one dimension.
-//
-// Arguments:
-//	value: The tensor to split.
-//	size_splits: list containing the sizes of each output tensor along the split
-// dimension. Must sum to the dimension of value along split_dim.
-// Can contain one -1 indicating that dimension is to be inferred.
-//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
-// `[-rank(value), rank(value))`.
-//
+	return op.Output(0)
+}
+
+// Returns x + y element-wise.
 //
-// Returns Tensors whose shape matches that of `value`
-// except along `split_dim`, where their sizes are
-// `size_splits[i]`.
-func SplitV(scope *Scope, value tf.Output, size_splits tf.Output, split_dim tf.Output, num_split int64) (output []tf.Output) {
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_split": num_split}
 	opspec := tf.OpSpec{
-		Type: "SplitV",
+		Type: "Add",
 		Input: []tf.Input{
-			value, size_splits, split_dim,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("SplitV", err)
-		return
+	return op.Output(0)
+}
+
+// NthElementAttr is an optional argument to NthElement.
+type NthElementAttr func(optionalAttr)
+
+// NthElementReverse sets the optional reverse attribute to value.
+//
+// value: When set to True, find the nth-largest value in the vector and vice
+// versa.
+// If not specified, defaults to false
+func NthElementReverse(value bool) NthElementAttr {
+	return func(m optionalAttr) {
+		m["reverse"] = value
 	}
-	return output
 }
 
-// Splits a tensor into `num_split` tensors along one dimension.
+// Finds values of the `n`-th order statistic for the last dimension.
+//
+// If the input is a vector (rank-1), finds the entries which is the nth-smallest
+// value in the vector and outputs their values as scalar tensor.
+//
+// For matrices (resp. higher rank input), computes the entries which is the
+// nth-smallest value in each row (resp. vector along the last dimension). Thus,
+//
+//     values.shape = input.shape[:-1]
 //
 // Arguments:
-//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
-// `[-rank(value), rank(value))`.
-//	value: The tensor to split.
-//	num_split: The number of ways to split.  Must evenly divide
-// `value.shape[split_dim]`.
+//	input: 1-D or higher with last dimension at least `n+1`.
+//	n: 0-D. Position of sorted vector to select along the last dimension (along
+// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
 //
-// Returns They are identically shaped tensors, whose shape matches that of `value`
-// except along `split_dim`, where their sizes are
-// `values.shape[split_dim] / num_split`.
-func Split(scope *Scope, split_dim tf.Output, value tf.Output, num_split int64) (output []tf.Output) {
+// Returns The `n`-th order statistic along each last dimensional slice.
+func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_split": num_split}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Split",
+		Type: "NthElement",
 		Input: []tf.Input{
-			split_dim, value,
+			input, n,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("Split", err)
-		return
-	}
-	return output
+	return op.Output(0)
 }
 
-// Computes offsets of concat inputs within its output.
+// Computes the Max along segments of a tensor.
 //
-// For example:
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// ```
-// # 'x' is [2, 2, 7]
-// # 'y' is [2, 3, 7]
-// # 'z' is [2, 5, 7]
-// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
-// ```
+// This operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the maximum
+// such that:
 //
-// This is typically used by gradient computations for a concat operation.
+// \\(output_i = \max_j data_j\\) where max is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,
+//  `output[i] = numeric_limits<T>::min()`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
+// </div>
 //
 // Arguments:
-//	concat_dim: The dimension along which to concatenate.
-//	shape: The `N` int32 vectors representing shape of tensors being concatenated.
 //
-// Returns The `N` int32 vectors representing the starting offset
-// of input tensors within the concatenated output.
-func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.
+//
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ConcatOffset",
+		Type: "UnsortedSegmentMax",
 		Input: []tf.Input{
-			concat_dim, tf.OutputList(shape),
+			data, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
-		scope.UpdateErr("ConcatOffset", err)
-		return
-	}
-	return offset
+	return op.Output(0)
 }
 
-// Writes a `Summary` protocol buffer with a histogram.
-//
-// The generated
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// has one summary value containing a histogram for `values`.
-//
-// This op reports an `InvalidArgument` error if any value is not finite.
-//
-// Arguments:
-//	writer: A handle to a summary writer.
-//	step: The step to write the summary for.
-//	tag: Scalar.  Tag to use for the `Summary.Value`.
-//	values: Any shape. Values to use to build the histogram.
-//
-// Returns the created operation.
-func WriteHistogramSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, values tf.Output) (o *tf.Operation) {
+// Computes exponential of x element-wise.  \\(y = e^x\\).
+func Exp(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "WriteHistogramSummary",
+		Type: "Exp",
 		Input: []tf.Input{
-			writer, step, tag, values,
+			x,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Concatenates tensors along one dimension.
+// Returns an element-wise indication of the sign of a number.
 //
-// Arguments:
-//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [0, rank(values)).
-//	values: The `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
+// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
 //
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.
-func Concat(scope *Scope, concat_dim tf.Output, values []tf.Output) (output tf.Output) {
+// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
+func Sign(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Concat",
+		Type: "Sign",
 		Input: []tf.Input{
-			concat_dim, tf.OutputList(values),
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Concatenates a list of `N` tensors along the first dimension.
-//
-// The input tensors are all required to have size 1 in the first dimension.
+// QuantizedAddAttr is an optional argument to QuantizedAdd.
+type QuantizedAddAttr func(optionalAttr)
+
+// QuantizedAddToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedAddToutput(value tf.DataType) QuantizedAddAttr {
+	return func(m optionalAttr) {
+		m["Toutput"] = value
+	}
+}
+
+// Returns x + y element-wise, working on quantized buffers.
 //
-// For example:
+// Arguments:
 //
-// ```
-// # 'x' is [[1, 4]]
-// # 'y' is [[2, 5]]
-// # 'z' is [[3, 6]]
-// parallel_concat([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
-// ```
 //
-// The difference between concat and parallel_concat is that concat requires all
-// of the inputs be computed before the operation will begin but doesn't require
-// that the input shapes be known during graph construction.  Parallel concat
-// will copy pieces of the input into the output as they become available, in
-// some situations this can provide a performance benefit.
+//	min_x: The float value that the lowest quantized `x` value represents.
+//	max_x: The float value that the highest quantized `x` value represents.
+//	min_y: The float value that the lowest quantized `y` value represents.
+//	max_y: The float value that the highest quantized `y` value represents.
 //
-// Arguments:
-//	values: Tensors to be concatenated. All must have size 1 in the first dimension
-// and same shape.
-//	shape: the final shape of the result; should be equal to the shapes of any input
-// but with the number of input values in the first dimension.
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
 //
-// Returns The concatenated tensor.
-func ParallelConcat(scope *Scope, values []tf.Output, shape tf.Shape) (output tf.Output) {
+// *NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about
+// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedAddAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape": shape}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ParallelConcat",
+		Type: "QuantizedAdd",
 		Input: []tf.Input{
-			tf.OutputList(values),
+			x, y, min_x, max_x, min_y, max_y,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// UniqueAttr is an optional argument to Unique.
-type UniqueAttr func(optionalAttr)
+// ArgMinAttr is an optional argument to ArgMin.
+type ArgMinAttr func(optionalAttr)
 
-// UniqueOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func UniqueOutIdx(value tf.DataType) UniqueAttr {
+// ArgMinOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMinOutputType(value tf.DataType) ArgMinAttr {
 	return func(m optionalAttr) {
-		m["out_idx"] = value
+		m["output_type"] = value
 	}
 }
-
-// Finds unique elements in a 1-D tensor.
-//
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. In other words:
-//
-// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
-//
-// For example:
-//
-// ```
-// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-// y, idx = unique(x)
-// y ==> [1, 2, 4, 7, 8]
-// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-// ```
+
+// Returns the index with the smallest value across dimensions of a tensor.
+//
+// Note that in case of ties the identity of the return value is not guaranteed.
 //
 // Arguments:
-//	x: 1-D.
 //
-// Returns 1-D.1-D.
-func Unique(scope *Scope, x tf.Output, optional ...UniqueAttr) (y tf.Output, idx tf.Output) {
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -2274,1024 +2221,885 @@ func Unique(scope *Scope, x tf.Output, optional ...UniqueAttr) (y tf.Output, idx
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Unique",
+		Type: "ArgMin",
 		Input: []tf.Input{
-			x,
+			input, dimension,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// DecodeWavAttr is an optional argument to DecodeWav.
-type DecodeWavAttr func(optionalAttr)
-
-// DecodeWavDesiredChannels sets the optional desired_channels attribute to value.
+// Convert the quantized 'input' tensor into a lower-precision 'output', using the
 //
-// value: Number of sample channels wanted.
-// If not specified, defaults to -1
-func DecodeWavDesiredChannels(value int64) DecodeWavAttr {
-	return func(m optionalAttr) {
-		m["desired_channels"] = value
-	}
-}
-
-// DecodeWavDesiredSamples sets the optional desired_samples attribute to value.
+// output range specified with 'requested_output_min' and 'requested_output_max'.
 //
-// value: Length of audio requested.
-// If not specified, defaults to -1
-func DecodeWavDesiredSamples(value int64) DecodeWavAttr {
-	return func(m optionalAttr) {
-		m["desired_samples"] = value
-	}
-}
-
-// Decode a 16-bit PCM WAV file to a float tensor.
+// [input_min, input_max] are scalar floats that specify the range for the float
+// interpretation of the 'input' data. For example, if input_min is -1.0f and
+// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
 //
-// The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
+// Arguments:
 //
-// When desired_channels is set, if the input contains fewer channels than this
-// then the last channel will be duplicated to give the requested number, else if
-// the input has more channels than requested then the additional channels will be
-// ignored.
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	requested_output_min: The float value that the minimum quantized output value represents.
+//	requested_output_max: The float value that the maximum quantized output value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
 //
-// If desired_samples is set, then the audio will be cropped or padded with zeroes
-// to the requested length.
+// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
+func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"out_type": out_type}
+	opspec := tf.OpSpec{
+		Type: "Requantize",
+		Input: []tf.Input{
+			input, input_min, input_max, requested_output_min, requested_output_max,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Computes the determinant of one or more square matrices.
 //
-// The first output contains a Tensor with the content of the audio samples. The
-// lowest dimension will be the number of channels, and the second will be the
-// number of samples. For example, a ten-sample-long stereo WAV file should give an
-// output shape of [10, 2].
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor containing the determinants
+// for all input submatrices `[..., :, :]`.
 //
 // Arguments:
-//	contents: The WAV-encoded audio, usually from a file.
+//	input: Shape is `[..., M, M]`.
 //
-// Returns 2-D with shape `[length, channels]`.Scalar holding the sample rate found in the WAV header.
-func DecodeWav(scope *Scope, contents tf.Output, optional ...DecodeWavAttr) (audio tf.Output, sample_rate tf.Output) {
+// Returns Shape is `[...]`.
+func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DecodeWav",
+		Type: "MatrixDeterminant",
 		Input: []tf.Input{
-			contents,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Elementwise computes the bitwise right-shift of `x` and `y`.
-//
-// Performs a logical shift for unsigned integer types, and an arithmetic shift
-// for signed integer types.
-//
-// If `y` is negative, or greater than or equal to than the width of `x` in bits
-// the result is implementation defined.
-func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Computes sin of x element-wise.
+func Sin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RightShift",
+		Type: "Sin",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Elementwise computes the bitwise left-shift of `x` and `y`.
-//
-// If `y` is negative, or greater than or equal to the width of `x` in bits the
-// result is implementation defined.
-func LeftShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Computes the complementary error function of `x` element-wise.
+func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LeftShift",
+		Type: "Erfc",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Elementwise computes the bitwise AND of `x` and `y`.
+// Computes Psi, the derivative of Lgamma (the log of the absolute value of
 //
-// The result will have those bits set, that are set in both `x` and `y`. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// `Gamma(x)`), element-wise.
+func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BitwiseAnd",
+		Type: "Digamma",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FixedUnigramCandidateSamplerAttr is an optional argument to FixedUnigramCandidateSampler.
-type FixedUnigramCandidateSamplerAttr func(optionalAttr)
+// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
+type Conv2DBackpropFilterAttr func(optionalAttr)
 
-// FixedUnigramCandidateSamplerVocabFile sets the optional vocab_file attribute to value.
-//
-// value: Each valid line in this file (which should have a CSV-like format)
-// corresponds to a valid word ID. IDs are in sequential order, starting from
-// num_reserved_ids. The last entry in each line is expected to be a value
-// corresponding to the count or relative probability. Exactly one of vocab_file
-// and unigrams needs to be passed to this op.
-// If not specified, defaults to ""
-func FixedUnigramCandidateSamplerVocabFile(value string) FixedUnigramCandidateSamplerAttr {
+// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["vocab_file"] = value
+		m["use_cudnn_on_gpu"] = value
 	}
 }
 
-// FixedUnigramCandidateSamplerDistortion sets the optional distortion attribute to value.
+// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
 //
-// value: The distortion is used to skew the unigram probability distribution.
-// Each weight is first raised to the distortion's power before adding to the
-// internal unigram distribution. As a result, distortion = 1.0 gives regular
-// unigram sampling (as defined by the vocab file), and distortion = 0.0 gives
-// a uniform distribution.
-// If not specified, defaults to 1
-func FixedUnigramCandidateSamplerDistortion(value float32) FixedUnigramCandidateSamplerAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["distortion"] = value
+		m["data_format"] = value
 	}
 }
 
-// FixedUnigramCandidateSamplerNumReservedIds sets the optional num_reserved_ids attribute to value.
+// Conv2DBackpropFilterDilations sets the optional dilations attribute to value.
 //
-// value: Optionally some reserved IDs can be added in the range [0,
-// ..., num_reserved_ids) by the users. One use case is that a special unknown
-// word token is used as ID 0. These IDs will have a sampling probability of 0.
-// If not specified, defaults to 0
-func FixedUnigramCandidateSamplerNumReservedIds(value int64) FixedUnigramCandidateSamplerAttr {
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["num_reserved_ids"] = value
+		m["dilations"] = value
 	}
 }
 
-// FixedUnigramCandidateSamplerNumShards sets the optional num_shards attribute to value.
+// Computes the gradients of convolution with respect to the filter.
 //
-// value: A sampler can be used to sample from a subset of the original range
-// in order to speed up the whole computation through parallelism. This parameter
-// (together with 'shard') indicates the number of partitions that are being
-// used in the overall computation.
-// If not specified, defaults to 1
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, out_channels]` tensor.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
+//	padding: The type of padding algorithm to use.
 //
-// REQUIRES: value >= 1
-func FixedUnigramCandidateSamplerNumShards(value int64) FixedUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["num_shards"] = value
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Conv2DBackpropFilter",
+		Input: []tf.Input{
+			input, filter_sizes, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the number of work units this Reader has finished processing.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderNumWorkUnitsCompletedV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns x / y element-wise for real types.
+//
+// If `x` and `y` are reals, this will return the floating-point division.
+//
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RealDiv",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FixedUnigramCandidateSamplerShard sets the optional shard attribute to value.
-//
-// value: A sampler can be used to sample from a subset of the original range
-// in order to speed up the whole computation through parallelism. This parameter
-// (together with 'num_shards') indicates the particular partition number of a
-// sampler op, when partitioning is being used.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func FixedUnigramCandidateSamplerShard(value int64) FixedUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["shard"] = value
+// Computes the log of the absolute value of `Gamma(x)` element-wise.
+func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Lgamma",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FixedUnigramCandidateSamplerUnigrams sets the optional unigrams attribute to value.
+// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
 //
-// value: A list of unigram counts or probabilities, one per ID in sequential
-// order. Exactly one of vocab_file and unigrams should be passed to this op.
-// If not specified, defaults to <>
-func FixedUnigramCandidateSamplerUnigrams(value []float32) FixedUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["unigrams"] = value
+// For an explanation see "Differentiation of the Cholesky algorithm" by
+// Iain Murray http://arxiv.org/abs/1602.07527.
+//
+// Arguments:
+//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
+//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
+//
+// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
+func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "CholeskyGrad",
+		Input: []tf.Input{
+			l, grad,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FixedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func FixedUnigramCandidateSamplerSeed(value int64) FixedUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// Computes inverse hyperbolic cosine of x element-wise.
+func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Acosh",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FixedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// SerializeManySparseAttr is an optional argument to SerializeManySparse.
+type SerializeManySparseAttr func(optionalAttr)
+
+// SerializeManySparseOutType sets the optional out_type attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func FixedUnigramCandidateSamplerSeed2(value int64) FixedUnigramCandidateSamplerAttr {
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["out_type"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// A unigram sampler could use a fixed unigram distribution read from a
-// file or passed in as an in-memory array instead of building up the distribution
-// from data on the fly. There is also an option to skew the distribution by
-// applying a distortion power to the weights.
-//
-// The vocabulary file should be in CSV-like format, with the last field
-// being the weight associated with the word.
+// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
+// is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The serialized
+// `SparseTensor` objects going into each row of `serialized_sparse` will have
+// rank `R-1`.
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// The minibatch size `N` is extracted from `sparse_shape[0]`.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
-//
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func FixedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...FixedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FixedUnigramCandidateSampler",
+		Type: "SerializeManySparse",
 		Input: []tf.Input{
-			true_classes,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// UniformCandidateSamplerAttr is an optional argument to UniformCandidateSampler.
-type UniformCandidateSamplerAttr func(optionalAttr)
+// TensorArrayV2Attr is an optional argument to TensorArrayV2.
+type TensorArrayV2Attr func(optionalAttr)
 
-// UniformCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func UniformCandidateSamplerSeed(value int64) UniformCandidateSamplerAttr {
+// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["element_shape"] = value
 	}
 }
 
-// UniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func UniformCandidateSamplerSeed2(value int64) UniformCandidateSamplerAttr {
+// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
+// If not specified, defaults to false
+func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["dynamic_size"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a uniform distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
-//
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
-//
-// Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
+// If not specified, defaults to true
+func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["clear_after_read"] = value
+	}
+}
+
+// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
+// If not specified, defaults to ""
+func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["tensor_array_name"] = value
+	}
+}
+
+// Deprecated. Use TensorArrayV3
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func UniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...UniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayV3
+func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "UniformCandidateSampler",
+		Type: "TensorArrayV2",
 		Input: []tf.Input{
-			true_classes,
+			size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// AbortAttr is an optional argument to Abort.
-type AbortAttr func(optionalAttr)
-
-// AbortErrorMsg sets the optional error_msg attribute to value.
+// Computes the mean along sparse segments of a tensor.
 //
-// value: A string which is the message associated with the exception.
-// If not specified, defaults to ""
-func AbortErrorMsg(value string) AbortAttr {
-	return func(m optionalAttr) {
-		m["error_msg"] = value
+// Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which has size
+// `num_segments`.
+func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentMeanWithNumSegments",
+		Input: []tf.Input{
+			data, indices, segment_ids, num_segments,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// AbortExitWithoutError sets the optional exit_without_error attribute to value.
-// If not specified, defaults to false
-func AbortExitWithoutError(value bool) AbortAttr {
-	return func(m optionalAttr) {
-		m["exit_without_error"] = value
+// Computes hyperbolic cosine of x element-wise.
+func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Cosh",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Raise a exception to abort the process when called.
-//
-// If exit_without_error is true, the process will exit normally,
-// otherwise it will exit with a SIGABORT signal.
-//
-// Returns nothing but an exception.
-//
-// Returns the created operation.
-func Abort(scope *Scope, optional ...AbortAttr) (o *tf.Operation) {
+// Creates a dataset that emits each dim-0 slice of `components` once.
+func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Abort",
-
+		Type: "TensorSliceDataset",
+		Input: []tf.Input{
+			tf.OutputList(components),
+		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SpaceToDepthAttr is an optional argument to SpaceToDepth.
-type SpaceToDepthAttr func(optionalAttr)
-
-// SpaceToDepthDataFormat sets the optional data_format attribute to value.
-// If not specified, defaults to "NHWC"
-func SpaceToDepthDataFormat(value string) SpaceToDepthAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+// Computes natural logarithm of (1 + x) element-wise.
+//
+// I.e., \\(y = \log_e (1 + x)\\).
+func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Log1p",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SpaceToDepth for tensors of type T.
-//
-// Rearranges blocks of spatial data, into depth. More specifically,
-// this op outputs a copy of the input tensor where values from the `height`
-// and `width` dimensions are moved to the `depth` dimension.
-// The attr `block_size` indicates the input block size.
-//
-//   * Non-overlapping blocks of size `block_size x block size` are rearranged
-//     into depth at each location.
-//   * The depth of the output tensor is `block_size * block_size * input_depth`.
-//   * The Y, X coordinates within each block of the input become the high order
-//     component of the output channel index.
-//   * The input tensor's height and width must be divisible by block_size.
-//
-// The `data_format` attr specifies the layout of the input and output tensors
-// with the following options:
-//   "NHWC": `[ batch, height, width, channels ]`
-//   "NCHW": `[ batch, channels, height, width ]`
-//   "NCHW_VECT_C":
-//       `qint8 [ batch, channels / 4, height, width, channels % 4 ]`
-//
-// It is useful to consider the operation as transforming a 6-D Tensor.
-// e.g. for data_format = NHWC,
-//      Each element in the input tensor can be specified via 6 coordinates,
-//      ordered by decreasing memory layout significance as:
-//      n,oY,bY,oX,bX,iC  (where n=batch index, oX, oY means X or Y coordinates
-//                         within the output image, bX, bY means coordinates
-//                         within the input block, iC means input channels).
-//      The output would be a transpose to the following layout:
-//      n,oY,oX,bY,bX,iC
-//
-// This operation is useful for resizing the activations between convolutions
-// (but keeping all data), e.g. instead of pooling. It is also useful for training
-// purely convolutional models.
-//
-// For example, given an input of shape `[1, 2, 2, 1]`, data_format = "NHWC" and
-// block_size = 2:
-//
-// ```
-// x = [[[[1], [2]],
-//       [[3], [4]]]]
-// ```
-//
-// This operation will output a tensor of shape `[1, 1, 1, 4]`:
-//
-// ```
-// [[[[1, 2, 3, 4]]]]
-// ```
-//
-// Here, the input has a batch of 1 and each batch element has shape `[2, 2, 1]`,
-// the corresponding output will have a single element (i.e. width and height are
-// both 1) and will have a depth of 4 channels (1 * block_size * block_size).
-// The output element shape is `[1, 1, 4]`.
-//
-// For an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g.
-//
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// This operation, for block_size of 2, will return the following tensor of shape
-// `[1, 1, 1, 12]`
-//
-// ```
-// [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
-// ```
-//
-// Similarly, for the following input of shape `[1 4 4 1]`, and a block size of 2:
-//
-// ```
-// x = [[[[1],   [2],  [5],  [6]],
-//       [[3],   [4],  [7],  [8]],
-//       [[9],  [10], [13],  [14]],
-//       [[11], [12], [15],  [16]]]]
-// ```
-//
-// the operator will return the following tensor of shape `[1 2 2 4]`:
-//
-// ```
-// x = [[[[1, 2, 3, 4],
-//        [5, 6, 7, 8]],
-//       [[9, 10, 11, 12],
-//        [13, 14, 15, 16]]]]
-// ```
+// Computes rectified linear 6 gradients for a Relu6 operation.
 //
 // Arguments:
+//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
+//	features: The features passed as input to the corresponding Relu6 operation, or
+// its output; using either one produces the same result.
 //
-//	block_size: The size of the spatial block.
-func SpaceToDepth(scope *Scope, input tf.Output, block_size int64, optional ...SpaceToDepthAttr) (output tf.Output) {
+// Returns The gradients:
+// `gradients * (features > 0) * (features < 6)`.
+func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"block_size": block_size}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SpaceToDepth",
+		Type: "Relu6Grad",
 		Input: []tf.Input{
-			input,
+			gradients, features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Scatter `updates` into a new (initially zero) tensor according to `indices`.
-//
-// Creates a new tensor by applying sparse `updates` to individual
-// values or slices within a zero tensor of the given `shape` according to
-// indices.  This operator is the inverse of the @{tf.gather_nd} operator which
-// extracts values or slices from a given tensor.
-//
-// **WARNING**: The order in which updates are applied is nondeterministic, so the
-// output will be nondeterministic if `indices` contains duplicates.
-//
-// `indices` is an integer tensor containing indices into a new tensor of shape
-// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
-//
-//     indices.shape[-1] <= shape.rank
-//
-// The last dimension of `indices` corresponds to indices into elements
-// (if `indices.shape[-1] = shape.rank`) or slices
-// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
-// `shape`.  `updates` is a tensor with shape
-//
-//     indices.shape[:-1] + shape[indices.shape[-1]:]
-//
-// The simplest form of scatter is to insert individual elements in a tensor by
-// index. For example, say we want to insert 4 scattered elements in a rank-1
-// tensor with 8 elements.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd1.png" alt>
-// </div>
-//
-// In Python, this scatter operation would look like this:
-//
-// ```python
-//     indices = tf.constant([[4], [3], [1], [7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     shape = tf.constant([8])
-//     scatter = tf.scatter_nd(indices, updates, shape)
-//     with tf.Session() as sess:
-//       print(sess.run(scatter))
-// ```
-//
-// The resulting tensor would look like this:
-//
-//     [0, 11, 0, 10, 9, 0, 0, 12]
-//
-// We can also, insert entire slices of a higher rank tensor all at once. For
-// example, if we wanted to insert two slices in the first dimension of a
-// rank-3 tensor with two matrices of new values.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd2.png" alt>
-// </div>
-//
-// In Python, this scatter operation would look like this:
-//
-// ```python
-//     indices = tf.constant([[0], [2]])
-//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
-//                             [7, 7, 7, 7], [8, 8, 8, 8]],
-//                            [[5, 5, 5, 5], [6, 6, 6, 6],
-//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
-//     shape = tf.constant([4, 4, 4])
-//     scatter = tf.scatter_nd(indices, updates, shape)
-//     with tf.Session() as sess:
-//       print(sess.run(scatter))
-// ```
+// ResizeBicubicAttr is an optional argument to ResizeBicubic.
+type ResizeBicubicAttr func(optionalAttr)
+
+// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
 //
-// The resulting tensor would look like this:
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Resize `images` to `size` using bicubic interpolation.
 //
-//     [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
-//      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
-//      [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
-//      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
+// Input images can be of different types but output images are always float.
 //
 // Arguments:
-//	indices: Index tensor.
-//	updates: Updates to scatter into output.
-//	shape: 1-D. The shape of the resulting tensor.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResizeBicubic",
+		Input: []tf.Input{
+			images, size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes natural logarithm of x element-wise.
 //
-// Returns A new tensor with the given shape and updates applied according
-// to the indices.
-func ScatterNd(scope *Scope, indices tf.Output, updates tf.Output, shape tf.Output) (output tf.Output) {
+// I.e., \\(y = \log_e x\\).
+func Log(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ScatterNd",
+		Type: "Log",
 		Input: []tf.Input{
-			indices, updates, shape,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Exits the current frame to its parent frame.
-//
-// Exit makes its input `data` available to the parent frame.
-//
-// Arguments:
-//	data: The tensor to be made available to the parent frame.
+// Rounds the values of a tensor to the nearest integer, element-wise.
 //
-// Returns The same tensor as `data`.
-func Exit(scope *Scope, data tf.Output) (output tf.Output) {
+// Rounds half to even.  Also known as bankers rounding. If you want to round
+// according to the current system rounding mode use std::cint.
+func Round(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Exit",
+		Type: "Round",
 		Input: []tf.Input{
-			data,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EnterAttr is an optional argument to Enter.
-type EnterAttr func(optionalAttr)
+// RecordInputAttr is an optional argument to RecordInput.
+type RecordInputAttr func(optionalAttr)
 
-// EnterIsConstant sets the optional is_constant attribute to value.
+// RecordInputFileRandomSeed sets the optional file_random_seed attribute to value.
 //
-// value: If true, the output is constant within the child frame.
-// If not specified, defaults to false
-func EnterIsConstant(value bool) EnterAttr {
+// value: Random seeds used to produce randomized records.
+// If not specified, defaults to 301
+func RecordInputFileRandomSeed(value int64) RecordInputAttr {
 	return func(m optionalAttr) {
-		m["is_constant"] = value
+		m["file_random_seed"] = value
 	}
 }
 
-// EnterParallelIterations sets the optional parallel_iterations attribute to value.
+// RecordInputFileShuffleShiftRatio sets the optional file_shuffle_shift_ratio attribute to value.
 //
-// value: The number of iterations allowed to run in parallel.
-// If not specified, defaults to 10
-func EnterParallelIterations(value int64) EnterAttr {
+// value: Shifts the list of files after the list is randomly
+// shuffled.
+// If not specified, defaults to 0
+func RecordInputFileShuffleShiftRatio(value float32) RecordInputAttr {
 	return func(m optionalAttr) {
-		m["parallel_iterations"] = value
+		m["file_shuffle_shift_ratio"] = value
 	}
 }
 
-// Creates or finds a child frame, and makes `data` available to the child frame.
-//
-// This op is used together with `Exit` to create loops in the graph.
-// The unique `frame_name` is used by the `Executor` to identify frames. If
-// `is_constant` is true, `output` is a constant in the child frame; otherwise
-// it may be changed in the child frame. At most `parallel_iterations` iterations
-// are run in parallel in the child frame.
-//
-// Arguments:
-//	data: The tensor to be made available to the child frame.
-//	frame_name: The name of the child frame.
+// RecordInputFileBufferSize sets the optional file_buffer_size attribute to value.
 //
-// Returns The same tensor as `data`.
-func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"frame_name": frame_name}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Enter",
-		Input: []tf.Input{
-			data,
-		},
-		Attrs: attrs,
+// value: The randomization shuffling buffer.
+// If not specified, defaults to 10000
+func RecordInputFileBufferSize(value int64) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["file_buffer_size"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Forwards `data` to the output port determined by `pred`.
-//
-// If `pred` is true, the `data` input is forwarded to `output_true`. Otherwise,
-// the data goes to `output_false`.
-//
-// See also `RefSwitch` and `Merge`.
-//
-// Arguments:
-//	data: The tensor to be forwarded to the appropriate output.
-//	pred: A scalar that specifies which output port will receive data.
+// RecordInputFileParallelism sets the optional file_parallelism attribute to value.
 //
-// Returns If `pred` is false, data will be forwarded to this output.If `pred` is true, data will be forwarded to this output.
-func Switch(scope *Scope, data tf.Output, pred tf.Output) (output_false tf.Output, output_true tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Switch",
-		Input: []tf.Input{
-			data, pred,
-		},
+// value: How many sstables are opened and concurrently iterated over.
+// If not specified, defaults to 16
+func RecordInputFileParallelism(value int64) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["file_parallelism"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
 }
 
-// CTCGreedyDecoderAttr is an optional argument to CTCGreedyDecoder.
-type CTCGreedyDecoderAttr func(optionalAttr)
-
-// CTCGreedyDecoderMergeRepeated sets the optional merge_repeated attribute to value.
+// RecordInputBatchSize sets the optional batch_size attribute to value.
 //
-// value: If True, merge repeated classes in output.
-// If not specified, defaults to false
-func CTCGreedyDecoderMergeRepeated(value bool) CTCGreedyDecoderAttr {
+// value: The batch size.
+// If not specified, defaults to 32
+func RecordInputBatchSize(value int64) RecordInputAttr {
 	return func(m optionalAttr) {
-		m["merge_repeated"] = value
+		m["batch_size"] = value
 	}
 }
 
-// Performs greedy decoding on the logits given in inputs.
-//
-// A note about the attribute merge_repeated: if enabled, when
-// consecutive logits' maximum indices are the same, only the first of
-// these is emitted.  Labeling the blank '*', the sequence "A B B * B B"
-// becomes "A B B" if merge_repeated = True and "A B B B B" if
-// merge_repeated = False.
+// RecordInputCompressionType sets the optional compression_type attribute to value.
 //
-// Regardless of the value of merge_repeated, if the maximum index of a given
-// time and batch corresponds to the blank, index `(num_classes - 1)`, no new
-// element is emitted.
+// value: The type of compression for the file. Currently ZLIB and
+// GZIP are supported. Defaults to none.
+// If not specified, defaults to ""
+func RecordInputCompressionType(value string) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["compression_type"] = value
+	}
+}
+
+// Emits randomized records.
 //
 // Arguments:
-//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-//	sequence_length: A vector containing sequence lengths, size `(batch_size)`.
+//	file_pattern: Glob pattern for the data files.
 //
-// Returns Indices matrix, size `(total_decoded_outputs x 2)`,
-// of a `SparseTensor<int64, 2>`.  The rows store: [batch, time].Values vector, size: `(total_decoded_outputs)`,
-// of a `SparseTensor<int64, 2>`.  The vector stores the decoded classes.Shape vector, size `(2)`, of the decoded SparseTensor.
-// Values are: `[batch_size, max_decoded_length]`.Matrix, size `(batch_size x 1)`, containing sequence
-// log-probabilities.
-func CTCGreedyDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, optional ...CTCGreedyDecoderAttr) (decoded_indices tf.Output, decoded_values tf.Output, decoded_shape tf.Output, log_probability tf.Output) {
+// Returns A tensor of shape [batch_size].
+func RecordInput(scope *Scope, file_pattern string, optional ...RecordInputAttr) (records tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"file_pattern": file_pattern}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CTCGreedyDecoder",
-		Input: []tf.Input{
-			inputs, sequence_length,
-		},
+		Type: "RecordInput",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	return op.Output(0)
 }
 
-// CTCLossAttr is an optional argument to CTCLoss.
-type CTCLossAttr func(optionalAttr)
-
-// CTCLossPreprocessCollapseRepeated sets the optional preprocess_collapse_repeated attribute to value.
+// Computes reciprocal of square root of x element-wise.
 //
-// value: Scalar, if true then repeated labels are
-// collapsed prior to the CTC calculation.
-// If not specified, defaults to false
-func CTCLossPreprocessCollapseRepeated(value bool) CTCLossAttr {
-	return func(m optionalAttr) {
-		m["preprocess_collapse_repeated"] = value
+// I.e., \\(y = 1 / \sqrt{x}\\).
+func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// CTCLossCtcMergeRepeated sets the optional ctc_merge_repeated attribute to value.
-//
-// value: Scalar.  If set to false, *during* CTC calculation
-// repeated non-blank labels will not be merged and are interpreted as
-// individual labels.  This is a simplified version of CTC.
-// If not specified, defaults to true
-func CTCLossCtcMergeRepeated(value bool) CTCLossAttr {
-	return func(m optionalAttr) {
-		m["ctc_merge_repeated"] = value
+	opspec := tf.OpSpec{
+		Type: "Rsqrt",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// CTCLossIgnoreLongerOutputsThanInputs sets the optional ignore_longer_outputs_than_inputs attribute to value.
+// Inserts a dimension of 1 into a tensor's shape.
 //
-// value: Scalar. If set to true, during CTC
-// calculation, items that have longer output sequences than input sequences
-// are skipped: they don't contribute to the loss term and have zero-gradient.
-// If not specified, defaults to false
-func CTCLossIgnoreLongerOutputsThanInputs(value bool) CTCLossAttr {
-	return func(m optionalAttr) {
-		m["ignore_longer_outputs_than_inputs"] = value
-	}
-}
-
-// Calculates the CTC Loss (log probability) for each batch entry.  Also calculates
+// Given a tensor `input`, this operation inserts a dimension of 1 at the
+// dimension index `axis` of `input`'s shape. The dimension index `axis` starts at
+// zero; if you specify a negative number for `axis` it is counted backward from
+// the end.
 //
-// the gradient.  This class performs the softmax operation for you, so inputs
-// should be e.g. linear projections of outputs by an LSTM.
+// This operation is useful if you want to add a batch dimension to a single
+// element. For example, if you have a single image of shape `[height, width,
+// channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,
+// which will make the shape `[1, height, width, channels]`.
 //
-// Arguments:
-//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-//	labels_indices: The indices of a `SparseTensor<int32, 2>`.
-// `labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for
-// `(batch b, time t)`.
-//	labels_values: The values (labels) associated with the given batch and time.
-//	sequence_length: A vector containing sequence lengths (batch).
+// Other examples:
+//
+// ```
+// # 't' is a tensor of shape [2]
+// shape(expand_dims(t, 0)) ==> [1, 2]
+// shape(expand_dims(t, 1)) ==> [2, 1]
+// shape(expand_dims(t, -1)) ==> [2, 1]
+//
+// # 't2' is a tensor of shape [2, 3, 5]
+// shape(expand_dims(t2, 0)) ==> [1, 2, 3, 5]
+// shape(expand_dims(t2, 2)) ==> [2, 3, 1, 5]
+// shape(expand_dims(t2, 3)) ==> [2, 3, 5, 1]
+// ```
+//
+// This operation requires that:
+//
+// `-1-input.dims() <= dim <= input.dims()`
+//
+// This operation is related to `squeeze()`, which removes dimensions of
+// size 1.
 //
-// Returns A vector (batch) containing log-probabilities.The gradient of `loss`.  3-D, shape:
-// `(max_time x batch_size x num_classes)`.
-func CTCLoss(scope *Scope, inputs tf.Output, labels_indices tf.Output, labels_values tf.Output, sequence_length tf.Output, optional ...CTCLossAttr) (loss tf.Output, gradient tf.Output) {
+// Arguments:
+//
+//	axis: 0-D (scalar). Specifies the dimension index at which to
+// expand the shape of `input`. Must be in the range
+// `[-rank(input) - 1, rank(input)]`.
+//
+// Returns Contains the same data as `input`, but its shape has an additional
+// dimension of size 1 added.
+func ExpandDims(scope *Scope, input tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "CTCLoss",
+		Type: "ExpandDims",
 		Input: []tf.Input{
-			inputs, labels_indices, labels_values, sequence_length,
+			input, axis,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// OrderedMapSizeAttr is an optional argument to OrderedMapSize.
-type OrderedMapSizeAttr func(optionalAttr)
+// MatrixInverseAttr is an optional argument to MatrixInverse.
+type MatrixInverseAttr func(optionalAttr)
 
-// OrderedMapSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapSizeCapacity(value int64) OrderedMapSizeAttr {
+// MatrixInverseAdjoint sets the optional adjoint attribute to value.
+// If not specified, defaults to false
+func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["adjoint"] = value
 	}
 }
 
-// OrderedMapSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Computes the inverse of one or more square invertible matrices or their
 //
-// REQUIRES: value >= 0
-func OrderedMapSizeMemoryLimit(value int64) OrderedMapSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapSizeContainer(value string) OrderedMapSizeAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// OrderedMapSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapSizeSharedName(value string) OrderedMapSizeAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op returns the number of elements in the underlying container.
-func OrderedMapSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapSizeAttr) (size tf.Output) {
+// adjoints (conjugate transposes).
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the inverse for all input submatrices `[..., :, :]`.
+//
+// The op uses LU decomposition with partial pivoting to compute the inverses.
+//
+// If a matrix is not invertible there is no guarantee what the op does. It
+// may detect the condition and raise an exception or it may simply return a
+// garbage result.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(numpy)
+// Equivalent to np.linalg.inv
+// @end_compatibility
+func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapSize",
-
+		Type: "MatrixInverse",
+		Input: []tf.Input{
+			input,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// OrderedMapUnstageAttr is an optional argument to OrderedMapUnstage.
-type OrderedMapUnstageAttr func(optionalAttr)
-
-// OrderedMapUnstageCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapUnstageCapacity(value int64) OrderedMapUnstageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// OrderedMapUnstageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Computes square of x element-wise.
 //
-// REQUIRES: value >= 0
-func OrderedMapUnstageMemoryLimit(value int64) OrderedMapUnstageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapUnstageContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageContainer(value string) OrderedMapUnstageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// I.e., \\(y = x * x = x^2\\).
+func Square(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// OrderedMapUnstageSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageSharedName(value string) OrderedMapUnstageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+	opspec := tf.OpSpec{
+		Type: "Square",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Op removes and returns the values associated with the key
+// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
 //
-// from the underlying container.   If the underlying container
-// does not contain this key, the op will block until it does.
-func OrderedMapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageAttr) (values []tf.Output) {
+// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
+// ](http://arxiv.org/abs/1511.07289)
+func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapUnstage",
+		Type: "Elu",
 		Input: []tf.Input{
-			key, indices,
+			features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the reciprocal of x element-wise.
+//
+// I.e., \\(y = 1 / x\\).
+func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapUnstage", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "Reciprocal",
+		Input: []tf.Input{
+			x,
+		},
 	}
-	return values
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MapIncompleteSizeAttr is an optional argument to MapIncompleteSize.
-type MapIncompleteSizeAttr func(optionalAttr)
+// OrderedMapClearAttr is an optional argument to OrderedMapClear.
+type OrderedMapClearAttr func(optionalAttr)
 
-// MapIncompleteSizeCapacity sets the optional capacity attribute to value.
+// OrderedMapClearCapacity sets the optional capacity attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func MapIncompleteSizeCapacity(value int64) MapIncompleteSizeAttr {
+func OrderedMapClearCapacity(value int64) OrderedMapClearAttr {
 	return func(m optionalAttr) {
 		m["capacity"] = value
 	}
 }
 
-// MapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
+// OrderedMapClearMemoryLimit sets the optional memory_limit attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func MapIncompleteSizeMemoryLimit(value int64) MapIncompleteSizeAttr {
+func OrderedMapClearMemoryLimit(value int64) OrderedMapClearAttr {
 	return func(m optionalAttr) {
 		m["memory_limit"] = value
 	}
 }
 
-// MapIncompleteSizeContainer sets the optional container attribute to value.
+// OrderedMapClearContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func MapIncompleteSizeContainer(value string) MapIncompleteSizeAttr {
+func OrderedMapClearContainer(value string) OrderedMapClearAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// MapIncompleteSizeSharedName sets the optional shared_name attribute to value.
+// OrderedMapClearSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func MapIncompleteSizeSharedName(value string) MapIncompleteSizeAttr {
+func OrderedMapClearSharedName(value string) OrderedMapClearAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Op returns the number of incomplete elements in the underlying container.
-func MapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...MapIncompleteSizeAttr) (size tf.Output) {
+// Op removes all elements in the underlying container.
+//
+// Returns the created operation.
+func OrderedMapClear(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapClearAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -3300,634 +3108,748 @@ func MapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...MapIncomp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapIncompleteSize",
+		Type: "OrderedMapClear",
 
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// MapSizeAttr is an optional argument to MapSize.
-type MapSizeAttr func(optionalAttr)
-
-// MapSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Computes the reciprocal of x element-wise.
 //
-// REQUIRES: value >= 0
-func MapSizeCapacity(value int64) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+// I.e., \\(y = 1 / x\\).
+func Inv(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MapSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapSizeMemoryLimit(value int64) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+	opspec := tf.OpSpec{
+		Type: "Inv",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MapSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapSizeContainer(value string) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
+// ComplexAbsAttr is an optional argument to ComplexAbs.
+type ComplexAbsAttr func(optionalAttr)
 
-// MapSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapSizeSharedName(value string) MapSizeAttr {
+// ComplexAbsTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["Tout"] = value
 	}
 }
 
-// Op returns the number of elements in the underlying container.
-func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size tf.Output) {
+// Computes the complex absolute value of a tensor.
+//
+// Given a tensor `x` of complex numbers, this operation returns a tensor of type
+// `float` or `double` that is the absolute value of each element in `x`. All
+// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
+// value is computed as \\( \sqrt{a^2 + b^2}\\).
+func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapSize",
-
+		Type: "ComplexAbs",
+		Input: []tf.Input{
+			x,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MapUnstageAttr is an optional argument to MapUnstage.
-type MapUnstageAttr func(optionalAttr)
-
-// MapUnstageCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Returns the truth value of x AND y element-wise.
 //
-// REQUIRES: value >= 0
-func MapUnstageCapacity(value int64) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MapUnstageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapUnstageMemoryLimit(value int64) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+	opspec := tf.OpSpec{
+		Type: "LogicalAnd",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MapUnstageContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapUnstageContainer(value string) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Cast x of type SrcT to y of DstT.
+func Cast(scope *Scope, x tf.Output, DstT tf.DataType) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"DstT": DstT}
+	opspec := tf.OpSpec{
+		Type: "Cast",
+		Input: []tf.Input{
+			x,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MapUnstageSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapUnstageSharedName(value string) MapUnstageAttr {
+// MaxAttr is an optional argument to Max.
+type MaxAttr func(optionalAttr)
+
+// MaxKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MaxKeepDims(value bool) MaxAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Op removes and returns the values associated with the key
+// Computes the maximum of elements across dimensions of a tensor.
 //
-// from the underlying container.   If the underlying container
-// does not contain this key, the op will block until it does.
-func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageAttr) (values []tf.Output) {
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
+//
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapUnstage",
+		Type: "Max",
 		Input: []tf.Input{
-			key, indices,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapUnstage", err)
-		return
-	}
-	return values
+	return op.Output(0)
 }
 
-// Forwards the value of an available tensor from `inputs` to `output`.
-//
-// `Merge` waits for at least one of the tensors in `inputs` to become available.
-// It is usually combined with `Switch` to implement branching.
+// Quantized Batch normalization.
 //
-// `Merge` forwards the first tensor to become available to `output`, and sets
-// `value_index` to its index in `inputs`.
+// This op is deprecated and will be removed in the future. Prefer
+// `tf.nn.batch_normalization`.
 //
 // Arguments:
-//	inputs: The input tensors, exactly one of which will become available.
+//	t: A 4D input Tensor.
+//	t_min: The value represented by the lowest quantized input.
+//	t_max: The value represented by the highest quantized input.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	m_min: The value represented by the lowest quantized mean.
+//	m_max: The value represented by the highest quantized mean.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v_min: The value represented by the lowest quantized variance.
+//	v_max: The value represented by the highest quantized variance.
+//	beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+//	beta_min: The value represented by the lowest quantized offset.
+//	beta_max: The value represented by the highest quantized offset.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+//	gamma_min: The value represented by the lowest quantized gamma.
+//	gamma_max: The value represented by the highest quantized gamma.
 //
-// Returns Will be set to the available input tensor.The index of the chosen input tensor in `inputs`.
-func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.Output) {
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min tf.Output, t_max tf.Output, m tf.Output, m_min tf.Output, m_max tf.Output, v tf.Output, v_min tf.Output, v_max tf.Output, beta tf.Output, beta_min tf.Output, beta_max tf.Output, gamma tf.Output, gamma_min tf.Output, gamma_max tf.Output, out_type tf.DataType, variance_epsilon float32, scale_after_normalization bool) (result tf.Output, result_min tf.Output, result_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type, "variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "Merge",
+		Type: "QuantizedBatchNormWithGlobalNormalization",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			t, t_min, t_max, m, m_min, m_max, v, v_min, v_max, beta, beta_min, beta_max, gamma, gamma_min, gamma_max,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MapPeekAttr is an optional argument to MapPeek.
-type MapPeekAttr func(optionalAttr)
+// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
+type HistogramFixedWidthAttr func(optionalAttr)
 
-// MapPeekCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapPeekCapacity(value int64) MapPeekAttr {
+// HistogramFixedWidthDtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_INT32
+func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["dtype"] = value
 	}
 }
 
-// MapPeekMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Return histogram of values.
 //
-// REQUIRES: value >= 0
-func MapPeekMemoryLimit(value int64) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+// Given the tensor `values`, this operation returns a rank 1 histogram counting
+// the number of entries in `values` that fall into every bin.  The bins are
+// equal width and determined by the arguments `value_range` and `nbins`.
+//
+// ```python
+// # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
+// nbins = 5
+// value_range = [0.0, 5.0]
+// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
+//
+// with tf.get_default_session() as sess:
+//   hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
+//   variables.global_variables_initializer().run()
+//   sess.run(hist) => [2, 1, 1, 0, 2]
+// ```
+//
+// Arguments:
+//	values: Numeric `Tensor`.
+//	value_range: Shape [2] `Tensor` of same `dtype` as `values`.
+// values <= value_range[0] will be mapped to hist[0],
+// values >= value_range[1] will be mapped to hist[-1].
+//	nbins: Scalar `int32 Tensor`.  Number of histogram bins.
+//
+// Returns A 1-D `Tensor` holding histogram of values.
+func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MapPeekContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapPeekContainer(value string) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "HistogramFixedWidth",
+		Input: []tf.Input{
+			values, value_range, nbins,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MapPeekSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapPeekSharedName(value string) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// Adds Tensor 'bias' to Tensor 'input' for Quantized types.
+//
+// Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
+//
+// Arguments:
+//
+//	bias: A 1D bias Tensor with size matching the last dimension of 'input'.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_bias: The float value that the lowest quantized bias value represents.
+//	max_bias: The float value that the highest quantized bias value represents.
+//
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_bias tf.Output, max_bias tf.Output, out_type tf.DataType) (output tf.Output, min_out tf.Output, max_out tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
+	opspec := tf.OpSpec{
+		Type: "QuantizedBiasAdd",
+		Input: []tf.Input{
+			input, bias, min_input, max_input, min_bias, max_bias,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Op peeks at the values at the specified key.  If the
+// Produces the average pool of the input tensor for quantized types.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, height, width, channels]`.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	ksize: The size of the window for each dimension of the input tensor.
+// The length must be 4 to match the number of dimensions of the input.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.  The length must be 4 to match the number of dimensions of the input.
+//	padding: The type of padding algorithm to use.
 //
-// underlying container does not contain this key
-// this op will block until it does.
-func MapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapPeekAttr) (values []tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedAvgPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "MapPeek",
+		Type: "QuantizedAvgPool",
 		Input: []tf.Input{
-			key, indices,
+			input, min_input, max_input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Updates the table to associates keys with values.
+//
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
+//
+// Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
+//
+// Returns the created operation.
+func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapPeek", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "LookupTableInsertV2",
+		Input: []tf.Input{
+			table_handle, keys, values,
+		},
 	}
-	return values
+	return scope.AddOperation(opspec)
 }
 
-// MapStageAttr is an optional argument to MapStage.
-type MapStageAttr func(optionalAttr)
+// FractionalAvgPoolAttr is an optional argument to FractionalAvgPool.
+type FractionalAvgPoolAttr func(optionalAttr)
 
-// MapStageCapacity sets the optional capacity attribute to value.
-//
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
-// If not specified, defaults to 0
+// FractionalAvgPoolPseudoRandom sets the optional pseudo_random attribute to value.
 //
-// REQUIRES: value >= 0
-func MapStageCapacity(value int64) MapStageAttr {
+// value: When set to True, generates the pooling sequence in a
+// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
+// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
+// difference between pseudorandom and random.
+// If not specified, defaults to false
+func FractionalAvgPoolPseudoRandom(value bool) FractionalAvgPoolAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["pseudo_random"] = value
 	}
 }
 
-// MapStageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// FractionalAvgPoolOverlapping sets the optional overlapping attribute to value.
 //
-// REQUIRES: value >= 0
-func MapStageMemoryLimit(value int64) MapStageAttr {
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [41/3, 26/3] for fractional avg pooling.
+// If not specified, defaults to false
+func FractionalAvgPoolOverlapping(value bool) FractionalAvgPoolAttr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["overlapping"] = value
 	}
 }
 
-// MapStageContainer sets the optional container attribute to value.
+// FractionalAvgPoolDeterministic sets the optional deterministic attribute to value.
 //
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
-// If not specified, defaults to ""
-func MapStageContainer(value string) MapStageAttr {
+// value: When set to True, a fixed pooling region will be used when
+// iterating over a FractionalAvgPool node in the computation graph. Mainly used
+// in unit test to make FractionalAvgPool deterministic.
+// If not specified, defaults to false
+func FractionalAvgPoolDeterministic(value bool) FractionalAvgPoolAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["deterministic"] = value
 	}
 }
 
-// MapStageSharedName sets the optional shared_name attribute to value.
+// FractionalAvgPoolSeed sets the optional seed attribute to value.
 //
-// value: It is necessary to match this name to the matching Unstage Op.
-// If not specified, defaults to ""
-func MapStageSharedName(value string) MapStageAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func FractionalAvgPoolSeed(value int64) FractionalAvgPoolAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["seed"] = value
 	}
 }
 
-// Stage (key, values) in the underlying container which behaves like a hashtable.
-//
-// Arguments:
-//	key: int64
-//
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
-//
+// FractionalAvgPoolSeed2 sets the optional seed2 attribute to value.
 //
-// Returns the created operation.
-func MapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...MapStageAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapStage",
-		Input: []tf.Input{
-			key, indices, tf.OutputList(values),
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// DepthToSpaceAttr is an optional argument to DepthToSpace.
-type DepthToSpaceAttr func(optionalAttr)
-
-// DepthToSpaceDataFormat sets the optional data_format attribute to value.
-// If not specified, defaults to "NHWC"
-func DepthToSpaceDataFormat(value string) DepthToSpaceAttr {
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func FractionalAvgPoolSeed2(value int64) FractionalAvgPoolAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["seed2"] = value
 	}
 }
 
-// DepthToSpace for tensors of type T.
-//
-// Rearranges data from depth into blocks of spatial data.
-// This is the reverse transformation of SpaceToDepth. More specifically,
-// this op outputs a copy of the input tensor where values from the `depth`
-// dimension are moved in spatial blocks to the `height` and `width` dimensions.
-// The attr `block_size` indicates the input block size and how the data is moved.
-//
-//   * Chunks of data of size `block_size * block_size` from depth are rearranged
-//     into non-overlapping blocks of size `block_size x block_size`
-//   * The width the output tensor is `input_depth * block_size`, whereas the
-//     height is `input_height * block_size`.
-//   * The Y, X coordinates within each block of the output image are determined
-//     by the high order component of the input channel index.
-//   * The depth of the input tensor must be divisible by
-//     `block_size * block_size`.
-//
-// The `data_format` attr specifies the layout of the input and output tensors
-// with the following options:
-//   "NHWC": `[ batch, height, width, channels ]`
-//   "NCHW": `[ batch, channels, height, width ]`
-//   "NCHW_VECT_C":
-//       `qint8 [ batch, channels / 4, height, width, channels % 4 ]`
-//
-// It is useful to consider the operation as transforming a 6-D Tensor.
-// e.g. for data_format = NHWC,
-//      Each element in the input tensor can be specified via 6 coordinates,
-//      ordered by decreasing memory layout significance as:
-//      n,iY,iX,bY,bX,oC  (where n=batch index, iX, iY means X or Y coordinates
-//                         within the input image, bX, bY means coordinates
-//                         within the output block, oC means output channels).
-//      The output would be the input transposed to the following layout:
-//      n,iY,bY,iX,bX,oC
-//
-// This operation is useful for resizing the activations between convolutions
-// (but keeping all data), e.g. instead of pooling. It is also useful for training
-// purely convolutional models.
-//
-// For example, given an input of shape `[1, 1, 1, 4]`, data_format = "NHWC" and
-// block_size = 2:
-//
-// ```
-// x = [[[[1, 2, 3, 4]]]]
-//
-// ```
-//
-// This operation will output a tensor of shape `[1, 2, 2, 1]`:
-//
-// ```
-//    [[[[1], [2]],
-//      [[3], [4]]]]
-// ```
-//
-// Here, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`,
-// the corresponding output will have 2x2 elements and will have a depth of
-// 1 channel (1 = `4 / (block_size * block_size)`).
-// The output element shape is `[2, 2, 1]`.
-//
-// For an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.
-//
-// ```
-// x = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
-// ```
-//
-// This operation, for block size of 2, will return the following tensor of shape
-// `[1, 2, 2, 3]`
-//
-// ```
-//    [[[[1, 2, 3], [4, 5, 6]],
-//      [[7, 8, 9], [10, 11, 12]]]]
-//
-// ```
-//
-// Similarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:
-//
-// ```
-// x =  [[[[1, 2, 3, 4],
-//        [5, 6, 7, 8]],
-//       [[9, 10, 11, 12],
-//        [13, 14, 15, 16]]]]
-// ```
-//
-// the operator will return the following tensor of shape `[1 4 4 1]`:
-//
-// ```
-// x = [[[ [1],   [2],  [5],  [6]],
-//       [ [3],   [4],  [7],  [8]],
-//       [ [9],  [10], [13],  [14]],
-//       [ [11], [12], [15],  [16]]]]
+// Performs fractional average pooling on the input.
 //
-// ```
+// Fractional average pooling is similar to Fractional max pooling in the pooling
+// region generation step. The only difference is that after pooling regions are
+// generated, a mean operation is performed instead of a max operation in each
+// pooling region.
 //
 // Arguments:
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
+// supports row and col dimension and should be >= 1.0. For example, a valid
+// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
+// must be 1.0 because we don't allow pooling on batch and channels
+// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
+// respectively.
 //
-//	block_size: The size of the spatial block, same as in Space2Depth.
-func DepthToSpace(scope *Scope, input tf.Output, block_size int64, optional ...DepthToSpaceAttr) (output tf.Output) {
+// Returns output tensor after fractional avg pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
+func FractionalAvgPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalAvgPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"block_size": block_size}
+	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthToSpace",
+		Type: "FractionalAvgPool",
 		Input: []tf.Input{
-			input,
+			value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// StagePeekAttr is an optional argument to StagePeek.
-type StagePeekAttr func(optionalAttr)
+// RandomCropAttr is an optional argument to RandomCrop.
+type RandomCropAttr func(optionalAttr)
 
-// StagePeekCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// RandomCropSeed sets the optional seed attribute to value.
 //
-// REQUIRES: value >= 0
-func StagePeekCapacity(value int64) StagePeekAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomCropSeed(value int64) RandomCropAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["seed"] = value
 	}
 }
 
-// StagePeekMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// RandomCropSeed2 sets the optional seed2 attribute to value.
 //
-// REQUIRES: value >= 0
-func StagePeekMemoryLimit(value int64) StagePeekAttr {
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomCropSeed2(value int64) RandomCropAttr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["seed2"] = value
 	}
 }
 
-// StagePeekContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StagePeekContainer(value string) StagePeekAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Randomly crop `image`.
+//
+// DEPRECATED at GraphDef version 8: Random crop is now pure Python
+//
+// `size` is a 1-D int64 tensor with 2 elements representing the crop height and
+// width.  The values must be non negative.
+//
+// This Op picks a random location in `image` and crops a `height` by `width`
+// rectangle from that location.  The random location is picked so the cropped
+// area will fit inside the original image.
+//
+// Arguments:
+//	image: 3-D of shape `[height, width, channels]`.
+//	size: 1-D of length 2 containing: `crop_height`, `crop_width`..
+//
+// Returns 3-D of shape `[crop_height, crop_width, channels].`
+func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...RandomCropAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomCrop",
+		Input: []tf.Input{
+			image, size,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StagePeekSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StagePeekSharedName(value string) StagePeekAttr {
+// TopKV2Attr is an optional argument to TopKV2.
+type TopKV2Attr func(optionalAttr)
+
+// TopKV2Sorted sets the optional sorted attribute to value.
+//
+// value: If true the resulting `k` elements will be sorted by the values in
+// descending order.
+// If not specified, defaults to true
+func TopKV2Sorted(value bool) TopKV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["sorted"] = value
 	}
 }
 
-// Op peeks at the values at the specified index.  If the
+// Finds values and indices of the `k` largest elements for the last dimension.
 //
-// underlying container does not contain sufficient elements
-// this op will block until it does.   This Op is optimized for
-// performance.
-func StagePeek(scope *Scope, index tf.Output, dtypes []tf.DataType, optional ...StagePeekAttr) (values []tf.Output) {
+// If the input is a vector (rank-1), finds the `k` largest entries in the vector
+// and outputs their values and indices as vectors.  Thus `values[j]` is the
+// `j`-th largest entry in `input`, and its index is `indices[j]`.
+//
+// For matrices (resp. higher rank input), computes the top `k` entries in each
+// row (resp. vector along the last dimension).  Thus,
+//
+//     values.shape = indices.shape = input.shape[:-1] + [k]
+//
+// If two elements are equal, the lower-index element appears first.
+//
+// Arguments:
+//	input: 1-D or higher with last dimension at least `k`.
+//	k: 0-D.  Number of top elements to look for along the last dimension (along each
+// row for matrices).
+//
+// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
+func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr) (values tf.Output, indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StagePeek",
+		Type: "TopKV2",
 		Input: []tf.Input{
-			index,
+			input, k,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Returns x // y element-wise.
+//
+// *NOTE*: `FloorDiv` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func FloorDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("StagePeek", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "FloorDiv",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
-	return values
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StageAttr is an optional argument to Stage.
-type StageAttr func(optionalAttr)
-
-// StageCapacity sets the optional capacity attribute to value.
+// Returns a batched diagonal tensor with a given batched diagonal values.
 //
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
-// If not specified, defaults to 0
+// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
+// everything else padded with zeros. The diagonal is computed as follows:
 //
-// REQUIRES: value >= 0
-func StageCapacity(value int64) StageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+// Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a
+// tensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:
+//
+// `output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.
+//
+// For example:
+//
+// ```
+// # 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]]
+//
+// and diagonal.shape = (2, 4)
+//
+// tf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0]
+//                                      [0, 2, 0, 0]
+//                                      [0, 0, 3, 0]
+//                                      [0, 0, 0, 4]],
+//                                     [[5, 0, 0, 0]
+//                                      [0, 6, 0, 0]
+//                                      [0, 0, 7, 0]
+//                                      [0, 0, 0, 8]]]
+//
+// which has shape (2, 4, 4)
+// ```
+//
+// Arguments:
+//	diagonal: Rank `k`, where `k >= 1`.
+//
+// Returns Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`.
+func MatrixDiag(scope *Scope, diagonal tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MatrixDiag",
+		Input: []tf.Input{
+			diagonal,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StageMemoryLimit sets the optional memory_limit attribute to value.
+// Says whether the targets are in the top `K` predictions.
 //
-// value: The maximum number of bytes allowed for Tensors in the Staging Area.
-// If > 0, inserts will block until sufficient space is available.
-// If not specified, defaults to 0
+// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+// prediction for the target class is among the top `k` predictions among
+// all predictions for example `i`. Note that the behavior of `InTopK` differs
+// from the `TopK` op in its handling of ties; if multiple classes have the
+// same prediction value and straddle the top-`k` boundary, all of those
+// classes are considered to be in the top `k`.
 //
-// REQUIRES: value >= 0
-func StageMemoryLimit(value int64) StageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+// More formally, let
+//
+//   \\(predictions_i\\) be the predictions for all classes for example `i`,
+//   \\(targets_i\\) be the target class for example `i`,
+//   \\(out_i\\) be the output for example `i`,
+//
+// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+//
+// Arguments:
+//	predictions: A `batch_size` x `classes` tensor.
+//	targets: A `batch_size` vector of class ids.
+//	k: Number of top elements to look at for computing precision.
+//
+// Returns Computed Precision at `k` as a `bool Tensor`.
+func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"k": k}
+	opspec := tf.OpSpec{
+		Type: "InTopK",
+		Input: []tf.Input{
+			predictions, targets,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StageContainer sets the optional container attribute to value.
+// Given a quantized tensor described by (input, input_min, input_max), outputs a
 //
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
-// If not specified, defaults to ""
-func StageContainer(value string) StageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// range that covers the actual values present in that tensor.  This op is
+// typically used to produce the requested_output_min and requested_output_max for
+// Requantize.
+//
+// Arguments:
+//
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//
+// Returns The computed min output.the computed max output.
+func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RequantizationRange",
+		Input: []tf.Input{
+			input, input_min, input_max,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// StageSharedName sets the optional shared_name attribute to value.
+// Returns the truth value of (x <= y) element-wise.
 //
-// value: It is necessary to match this name to the matching Unstage Op.
-// If not specified, defaults to ""
-func StageSharedName(value string) StageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LessEqual",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Stage values similar to a lightweight Enqueue.
+// Computes softmax activations.
 //
-// The basic functionality of this Op is similar to a queue with many
-// fewer capabilities and options.  This Op is optimized for performance.
+// For each batch `i` and class `j` we have
+//
+//     softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))
 //
 // Arguments:
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
+//	logits: 2-D with shape `[batch_size, num_classes]`.
 //
-// Returns the created operation.
-func Stage(scope *Scope, values []tf.Output, optional ...StageAttr) (o *tf.Operation) {
+// Returns Same shape as `logits`.
+func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Stage",
+		Type: "Softmax",
 		Input: []tf.Input{
-			tf.OutputList(values),
+			logits,
 		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// FakeQuantWithMinMaxArgsAttr is an optional argument to FakeQuantWithMinMaxArgs.
-type FakeQuantWithMinMaxArgsAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxArgsMin sets the optional min attribute to value.
-// If not specified, defaults to -6
-func FakeQuantWithMinMaxArgsMin(value float32) FakeQuantWithMinMaxArgsAttr {
-	return func(m optionalAttr) {
-		m["min"] = value
-	}
-}
-
-// FakeQuantWithMinMaxArgsMax sets the optional max attribute to value.
-// If not specified, defaults to 6
-func FakeQuantWithMinMaxArgsMax(value float32) FakeQuantWithMinMaxArgsAttr {
-	return func(m optionalAttr) {
-		m["max"] = value
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FakeQuantWithMinMaxArgsNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxArgsNumBits(value int64) FakeQuantWithMinMaxArgsAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
+// DecodeBmpAttr is an optional argument to DecodeBmp.
+type DecodeBmpAttr func(optionalAttr)
 
-// FakeQuantWithMinMaxArgsNarrowRange sets the optional narrow_range attribute to value.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxArgsNarrowRange(value bool) FakeQuantWithMinMaxArgsAttr {
+// DecodeBmpChannels sets the optional channels attribute to value.
+// If not specified, defaults to 0
+func DecodeBmpChannels(value int64) DecodeBmpAttr {
 	return func(m optionalAttr) {
-		m["narrow_range"] = value
+		m["channels"] = value
 	}
 }
 
-// Fake-quantize the 'inputs' tensor, type float to 'outputs' tensor of same type.
+// Decode the first frame of a BMP-encoded image to a uint8 tensor.
 //
-// Attributes `[min; max]` define the clamping range for the `inputs` data.
-// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
-// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
-// then de-quantized and output as floats in `[min; max]` interval.
-// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
 //
-// Quantization is called fake since the output is still in floating point.
-func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsAttr) (outputs tf.Output) {
+// Accepted values are:
+//
+// *   0: Use the number of channels in the BMP-encoded image.
+// *   3: output an RGB image.
+// *   4: output an RGBA image.
+//
+// Arguments:
+//	contents: 0-D.  The BMP-encoded image.
+//
+// Returns 3-D with shape `[height, width, channels]`. RGB order
+func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -3936,9 +3858,9 @@ func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQua
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxArgs",
+		Type: "DecodeBmp",
 		Input: []tf.Input{
-			inputs,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -3946,462 +3868,488 @@ func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQua
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArraySizeV3
-func TensorArraySizeV2(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
+// Computes softsign gradients for a softsign operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding softsign operation.
+//	features: The features passed as input to the corresponding softsign operation.
+//
+// Returns The gradients: `gradients / (1 + abs(features)) ** 2`.
+func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySizeV2",
+		Type: "SoftsignGrad",
 		Input: []tf.Input{
-			handle, flow_in,
+			gradients, features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayScatterV3
-func TensorArrayScatterV2(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// BatchMatMulAttr is an optional argument to BatchMatMul.
+type BatchMatMulAttr func(optionalAttr)
+
+// BatchMatMulAdjX sets the optional adj_x attribute to value.
+//
+// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjX(value bool) BatchMatMulAttr {
+	return func(m optionalAttr) {
+		m["adj_x"] = value
+	}
+}
+
+// BatchMatMulAdjY sets the optional adj_y attribute to value.
+//
+// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjY(value bool) BatchMatMulAttr {
+	return func(m optionalAttr) {
+		m["adj_y"] = value
+	}
+}
+
+// Multiplies slices of two tensors in batches.
+//
+// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
+// viewed as an element of a batch), and arranges the individual results
+// in a single output tensor of the same batch size. Each of the
+// individual slices can optionally be adjointed (to adjoint a matrix
+// means to transpose and conjugate it) before multiplication by setting
+// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
+//
+// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
+// and `[..., r_y, c_y]`.
+//
+// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
+//
+//     r_o = c_x if adj_x else r_x
+//     c_o = r_y if adj_y else c_y
+//
+// It is computed as:
+//
+//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
+//
+// Arguments:
+//	x: 2-D or higher with shape `[..., r_x, c_x]`.
+//	y: 2-D or higher with shape `[..., r_y, c_y]`.
+//
+// Returns 3-D or higher with shape `[..., r_o, c_o]`
+func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayScatterV2",
+		Type: "BatchMatMul",
 		Input: []tf.Input{
-			handle, indices, value, flow_in,
+			x, y,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayGradV3
-func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Pads a tensor.
+//
+// This operation pads `input` according to the `paddings` and `constant_values`
+// you specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is
+// the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many padding values to add before the contents of `input` in that dimension,
+// and `paddings[D, 1]` indicates how many padding values to add after the contents
+// of `input` in that dimension. `constant_values` is a scalar tensor of the same
+// type as `input` that indicates the value to use for padding `input`.
+//
+// The padded size of each dimension D of the output is:
+//
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+//
+// For example:
+//
+// ```
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # 'constant_values' is 0
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
+// ```
+func PadV2(scope *Scope, input tf.Output, paddings tf.Output, constant_values tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayWriteV2",
+		Type: "PadV2",
 		Input: []tf.Input{
-			handle, index, value, flow_in,
+			input, paddings, constant_values,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Get the current size of the TensorArray.
-//
-// Arguments:
-//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
-//	flow_in: A float scalar that enforces proper chaining of operations.
+// Returns which elements of x are NaN.
 //
-// Returns The current size of the TensorArray.
-func TensorArraySizeV3(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.isnan
+// @end_compatibility
+func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySizeV3",
+		Type: "IsNan",
 		Input: []tf.Input{
-			handle, flow_in,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LearnedUnigramCandidateSamplerAttr is an optional argument to LearnedUnigramCandidateSampler.
-type LearnedUnigramCandidateSamplerAttr func(optionalAttr)
+// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
+type FractionalAvgPoolGradAttr func(optionalAttr)
 
-// LearnedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func LearnedUnigramCandidateSamplerSeed(value int64) LearnedUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// LearnedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func LearnedUnigramCandidateSamplerSeed2(value int64) LearnedUnigramCandidateSamplerAttr {
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [41/3, 26/3] for fractional avg pooling.
+// If not specified, defaults to false
+func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["overlapping"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
+// Computes gradient of the FractionalAvgPool function.
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
+// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
+// out_backprop to those indices that form the same pooling cell. Therefore, we
+// just need to know the shape of original input tensor, instead of the whole
+// tensor.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_avg_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func LearnedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LearnedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
+func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LearnedUnigramCandidateSampler",
+		Type: "FractionalAvgPoolGrad",
 		Input: []tf.Input{
-			true_classes,
+			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Split the data from the input value into TensorArray elements.
-//
-// Assuming that `lengths` takes on values
-//
-//   ```(n0, n1, ..., n(T-1))```
-//
-// and that `value` has shape
-//
-//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```,
-//
-// this splits values into a TensorArray with T tensors.
-//
-// TensorArray index t will be the subtensor of values with starting position
-//
-//   ```(n0 + n1 + ... + n(t-1), 0, 0, ...)```
-//
-// and having size
-//
-//   ```nt x d0 x d1 x ...```
+// Computes gradients for the exponential linear (Elu) operation.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	value: The concatenated tensor to write to the TensorArray.
-//	lengths: The vector of lengths, how to split the rows of value into the
-// TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
+//	gradients: The backpropagated gradients to the corresponding Elu operation.
+//	outputs: The outputs of the corresponding Elu operation.
 //
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArraySplitV3(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
+// `gradients` otherwise.
+func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySplitV3",
+		Type: "EluGrad",
 		Input: []tf.Input{
-			handle, value, lengths, flow_in,
+			gradients, outputs,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a diagonal tensor with a given diagonal values.
-//
-// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
-// everything else padded with zeros. The diagonal is computed as follows:
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
-// rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
+// The hash function is deterministic on the content of the string within the
+// process.
 //
-// `output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
+// Note that the hash function may change from time to time.
+// This functionality will be deprecated and it's recommended to use
+// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
 //
-// For example:
+// Arguments:
 //
-// ```
-// # 'diagonal' is [1, 2, 3, 4]
-// tf.diag(diagonal) ==> [[1, 0, 0, 0]
-//                        [0, 2, 0, 0]
-//                        [0, 0, 3, 0]
-//                        [0, 0, 0, 4]]
-// ```
+//	num_buckets: The number of buckets.
 //
-// Arguments:
-//	diagonal: Rank k tensor where k is at most 1.
-func Diag(scope *Scope, diagonal tf.Output) (output tf.Output) {
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "Diag",
+		Type: "StringToHashBucket",
 		Input: []tf.Input{
-			diagonal,
+			string_tensor,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayConcatV3Attr is an optional argument to TensorArrayConcatV3.
-type TensorArrayConcatV3Attr func(optionalAttr)
-
-// TensorArrayConcatV3ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
-//
-// value: The expected shape of an element, if known,
-// excluding the first dimension. Used to validate the shapes of
-// TensorArray elements. If this shape is not fully specified, concatenating
-// zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayConcatV3ElementShapeExcept0(value tf.Shape) TensorArrayConcatV3Attr {
-	return func(m optionalAttr) {
-		m["element_shape_except0"] = value
-	}
-}
-
-// Concat the elements from the TensorArray into value `value`.
-//
-// Takes `T` elements of shapes
-//
-//   ```
-//   (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...)
-//   ```
-//
-// and concatenates them into a Tensor of shape:
+// Creates a dataset that contains `count` elements from the `input_dataset`.
 //
-//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```
+// Arguments:
 //
-// All elements must have the same shape (excepting the first dimension).
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be taken. A value of `-1` indicates that all of `input_dataset`
+// is taken.
 //
-// Arguments:
-//	handle: The handle to a TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
 //
-// Returns All of the elements in the TensorArray, concatenated along the first
-// axis.A vector of the row sizes of the original T elements in the
-// value output.  In the example above, this would be the values:
-// `(n1, n2, ..., n(T-1))`.
-func TensorArrayConcatV3(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV3Attr) (value tf.Output, lengths tf.Output) {
+func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayConcatV3",
+		Type: "TakeDataset",
 		Input: []tf.Input{
-			handle, flow_in,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Scatter the data from the input value into specific TensorArray elements.
-//
-// `indices` must be a vector, its length must match the first dim of `value`.
-//
-// Arguments:
-//	handle: The handle to a TensorArray.
-//	indices: The locations at which to write the tensor elements.
-//	value: The concatenated tensor to write to the TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArrayScatterV3(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Computes rectified linear 6: `min(max(features, 0), 6)`.
+func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayScatterV3",
+		Type: "Relu6",
 		Input: []tf.Input{
-			handle, indices, value, flow_in,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Push an element onto the tensor_array.
+// Computes rectified linear gradients for a Relu operation.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	index: The position to write to inside the TensorArray.
-//	value: The tensor to write to the TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
+//	gradients: The backpropagated gradients to the corresponding Relu operation.
+//	features: The features passed as input to the corresponding Relu operation, OR
+// the outputs of that operation (both work equivalently).
 //
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArrayWriteV3(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Returns `gradients * (features > 0)`.
+func ReluGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayWriteV3",
+		Type: "ReluGrad",
 		Input: []tf.Input{
-			handle, index, value, flow_in,
+			gradients, features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
-
-// Creates a TensorArray for storing the gradients of values in the given handle.
-//
-// If the given TensorArray gradient already exists, returns a reference to it.
-//
-// Locks the size of the original TensorArray by disabling its dynamic size flag.
-//
-// **A note about the input flow_in:**
-//
-// The handle flow_in forces the execution of the gradient lookup to occur
-// only after certain other operations have occurred.  For example, when
-// the forward TensorArray is dynamically sized, writes to this TensorArray
-// may resize the object.  The gradient TensorArray is statically sized based
-// on the size of the forward TensorArray when this operation executes.
-// Furthermore, the size of the forward TensorArray is frozen by this call.
-// As a result, the flow is used to ensure that the call to generate the gradient
-// TensorArray only happens after all writes are executed.
-//
-// In the case of dynamically sized TensorArrays, gradient computation should
-// only be performed on read operations that have themselves been chained via
-// flow to occur only after all writes have executed. That way the final size
-// of the forward TensorArray is known when this operation is called.
-//
-// **A note about the source attribute:**
-//
-// TensorArray gradient calls use an accumulator TensorArray object.  If
-// multiple gradients are calculated and run in the same session, the multiple
-// gradient nodes may accidentally flow through the same accumulator TensorArray.
-// This double counts and generally breaks the TensorArray gradient flow.
-//
-// The solution is to identify which gradient call this particular
-// TensorArray gradient is being called in.  This is performed by identifying
-// a unique string (e.g. "gradients", "gradients_1", ...) from the input
-// gradient Tensor's name.  This string is used as a suffix when creating
-// the TensorArray gradient object here (the attribute `source`).
-//
-// The attribute `source` is added as a suffix to the forward TensorArray's
-// name when performing the creation / lookup, so that each separate gradient
-// calculation gets its own TensorArray accumulator.
+
+// Computes the gradient of morphological 2-D dilation with respect to the input.
 //
 // Arguments:
-//	handle: The handle to the forward TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	source: The gradient source string, used to decide which gradient TensorArray
-// to return.
-func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
+func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"source": source}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGradV3",
+		Type: "Dilation2DBackpropInput",
 		Input: []tf.Input{
-			handle, flow_in,
+			input, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// StackPushV2Attr is an optional argument to StackPushV2.
-type StackPushV2Attr func(optionalAttr)
+// CTCBeamSearchDecoderAttr is an optional argument to CTCBeamSearchDecoder.
+type CTCBeamSearchDecoderAttr func(optionalAttr)
 
-// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
+// CTCBeamSearchDecoderMergeRepeated sets the optional merge_repeated attribute to value.
 //
-// value: Swap `elem` to CPU. Default to false.
-// If not specified, defaults to false
-func StackPushV2SwapMemory(value bool) StackPushV2Attr {
+// value: If true, merge repeated classes in output.
+// If not specified, defaults to true
+func CTCBeamSearchDecoderMergeRepeated(value bool) CTCBeamSearchDecoderAttr {
 	return func(m optionalAttr) {
-		m["swap_memory"] = value
+		m["merge_repeated"] = value
 	}
 }
 
-// Push an element onto the stack.
+// Performs beam search decoding on the logits given in input.
+//
+// A note about the attribute merge_repeated: For the beam search decoder,
+// this means that if consecutive entries in a beam are the same, only
+// the first of these is emitted.  That is, when the top path is "A B B B B",
+// "A B" is returned if merge_repeated = True but "A B B B B" is
+// returned if merge_repeated = False.
 //
 // Arguments:
-//	handle: The handle to a stack.
-//	elem: The tensor to be pushed onto the stack.
+//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+//	sequence_length: A vector containing sequence lengths, size `(batch)`.
+//	beam_width: A scalar >= 0 (beam search beam width).
+//	top_paths: A scalar >= 0, <= beam_width (controls output size).
 //
-// Returns The same tensor as the input 'elem'.
-func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
+// Returns A list (length: top_paths) of indices matrices.  Matrix j,
+// size `(total_decoded_outputs[j] x 2)`, has indices of a
+// `SparseTensor<int64, 2>`.  The rows store: [batch, time].A list (length: top_paths) of values vectors.  Vector j,
+// size `(length total_decoded_outputs[j])`, has the values of a
+// `SparseTensor<int64, 2>`.  The vector stores the decoded classes for beam j.A list (length: top_paths) of shape vector.  Vector j,
+// size `(2)`, stores the shape of the decoded `SparseTensor[j]`.
+// Its values are: `[batch_size, max_decoded_length[j]]`.A matrix, shaped: `(batch_size x top_paths)`.  The
+// sequence log-probabilities.
+func CTCBeamSearchDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, beam_width int64, top_paths int64, optional ...CTCBeamSearchDecoderAttr) (decoded_indices []tf.Output, decoded_values []tf.Output, decoded_shape []tf.Output, log_probability tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"beam_width": beam_width, "top_paths": top_paths}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StackPushV2",
+		Type: "CTCBeamSearchDecoder",
 		Input: []tf.Input{
-			handle, elem,
+			inputs, sequence_length,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if decoded_indices, idx, err = makeOutputList(op, idx, "decoded_indices"); err != nil {
+		scope.UpdateErr("CTCBeamSearchDecoder", err)
+		return
+	}
+	if decoded_values, idx, err = makeOutputList(op, idx, "decoded_values"); err != nil {
+		scope.UpdateErr("CTCBeamSearchDecoder", err)
+		return
+	}
+	if decoded_shape, idx, err = makeOutputList(op, idx, "decoded_shape"); err != nil {
+		scope.UpdateErr("CTCBeamSearchDecoder", err)
+		return
+	}
+	log_probability = op.Output(idx)
+	return decoded_indices, decoded_values, decoded_shape, log_probability
 }
 
-// StackV2Attr is an optional argument to StackV2.
-type StackV2Attr func(optionalAttr)
+// AudioSpectrogramAttr is an optional argument to AudioSpectrogram.
+type AudioSpectrogramAttr func(optionalAttr)
 
-// StackV2StackName sets the optional stack_name attribute to value.
+// AudioSpectrogramMagnitudeSquared sets the optional magnitude_squared attribute to value.
 //
-// value: Overrides the name used for the temporary stack resource. Default
-// value is the name of the 'Stack' op (which is guaranteed unique).
-// If not specified, defaults to ""
-func StackV2StackName(value string) StackV2Attr {
+// value: Whether to return the squared magnitude or just the
+// magnitude. Using squared magnitude can avoid extra calculations.
+// If not specified, defaults to false
+func AudioSpectrogramMagnitudeSquared(value bool) AudioSpectrogramAttr {
 	return func(m optionalAttr) {
-		m["stack_name"] = value
+		m["magnitude_squared"] = value
 	}
 }
 
-// A stack that produces elements in first-in last-out order.
+// Produces a visualization of audio data over time.
+//
+// Spectrograms are a standard way of representing audio information as a series of
+// slices of frequency information, one slice for each window of time. By joining
+// these together into a sequence, they form a distinctive fingerprint of the sound
+// over time.
+//
+// This op expects to receive audio data as an input, stored as floats in the range
+// -1 to 1, together with a window width in samples, and a stride specifying how
+// far to move the window between slices. From this it generates a three
+// dimensional output. The lowest dimension has an amplitude value for each
+// frequency during that time slice. The next dimension is time, with successive
+// frequency slices. The final dimension is for the channels in the input, so a
+// stereo audio input would have two here for example.
+//
+// This means the layout when converted and saved as an image is rotated 90 degrees
+// clockwise from a typical spectrogram. Time is descending down the Y axis, and
+// the frequency decreases from left to right.
+//
+// Each value in the result represents the square root of the sum of the real and
+// imaginary parts of an FFT on the current window of samples. In this way, the
+// lowest dimension represents the power of each frequency in the current window,
+// and adjacent windows are concatenated in the next dimension.
+//
+// To get a more intuitive and visual look at what this operation does, you can run
+// tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
+// resulting spectrogram as a PNG image.
 //
 // Arguments:
-//	max_size: The maximum size of the stack if non-negative. If negative, the stack
-// size is unlimited.
-//	elem_type: The type of the elements on the stack.
+//	input: Float representation of audio data.
+//	window_size: How wide the input window is in samples. For the highest efficiency
+// this should be a power of two, but other values are accepted.
+//	stride: How widely apart the center of adjacent sample windows should be.
 //
-// Returns The handle to the stack.
-func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) {
+// Returns 3D representation of the audio frequencies as an image.
+func AudioSpectrogram(scope *Scope, input tf.Output, window_size int64, stride int64, optional ...AudioSpectrogramAttr) (spectrogram tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"elem_type": elem_type}
+	attrs := map[string]interface{}{"window_size": window_size, "stride": stride}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StackV2",
+		Type: "AudioSpectrogram",
 		Input: []tf.Input{
-			max_size,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -4409,300 +4357,271 @@ func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional .
 	return op.Output(0)
 }
 
-// Returns the batched diagonal part of a batched tensor.
-//
-// This operation returns a tensor with the `diagonal` part
-// of the batched `input`. The `diagonal` part is computed as follows:
-//
-// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-// tensor of rank `k - 1` with dimensions `[I, J, K, ..., min(M, N)]` where:
+// Compute the polygamma function \\(\psi^{(n)}(x)\\).
 //
-// `diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`.
+// The polygamma function is defined as:
 //
-// The input must be at least a matrix.
 //
-// For example:
+// \\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
 //
-// ```
-// # 'input' is [[[1, 0, 0, 0]
-//                [0, 2, 0, 0]
-//                [0, 0, 3, 0]
-//                [0, 0, 0, 4]],
-//               [[5, 0, 0, 0]
-//                [0, 6, 0, 0]
-//                [0, 0, 7, 0]
-//                [0, 0, 0, 8]]]
+// where \\(\psi(x)\\) is the digamma function.
+func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Polygamma",
+		Input: []tf.Input{
+			a, x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes second-order gradients of the maxpooling function.
 //
-// and input.shape = (2, 4, 4)
+// Arguments:
+//	input: The original input.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+// input of `max_pool`.
+//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// tf.matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]]
+// Returns Gradients of gradients w.r.t. the input of `max_pool`.
+func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "MaxPoolGradGradWithArgmax",
+		Input: []tf.Input{
+			input, grad, argmax,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MaxPoolGradGradV2Attr is an optional argument to MaxPoolGradGradV2.
+type MaxPoolGradGradV2Attr func(optionalAttr)
+
+// MaxPoolGradGradV2DataFormat sets the optional data_format attribute to value.
 //
-// which has shape (2, 4)
-// ```
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradGradV2DataFormat(value string) MaxPoolGradGradV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	input: Rank `k` tensor where `k >= 2`.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The extracted diagonal(s) having shape
-// `diagonal.shape = input.shape[:-2] + [min(input.shape[-2:])]`.
-func MatrixDiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) {
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradGradV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MatrixDiagPart",
+		Type: "MaxPoolGradGradV2",
 		Input: []tf.Input{
-			input,
+			orig_input, orig_output, grad, ksize, strides,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns true if queue is closed.
+// Fast Fourier transform.
 //
-// This operation returns true if the queue is closed and false if the queue
-// is open.
+// Computes the 1-dimensional discrete Fourier transform over the inner-most
+// dimension of `input`.
 //
 // Arguments:
-//	handle: The handle to a queue.
-func QueueIsClosedV2(scope *Scope, handle tf.Output) (is_closed tf.Output) {
+//	input: A complex64 tensor.
+//
+// Returns A complex64 tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fft
+// @end_compatibility
+func FFT(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueIsClosedV2",
+		Type: "FFT",
 		Input: []tf.Input{
-			handle,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QueueCloseV2Attr is an optional argument to QueueCloseV2.
-type QueueCloseV2Attr func(optionalAttr)
+// MaxPoolAttr is an optional argument to MaxPool.
+type MaxPoolAttr func(optionalAttr)
 
-// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
+// MaxPoolDataFormat sets the optional data_format attribute to value.
 //
-// value: If true, all pending enqueue requests that are
-// blocked on the given queue will be canceled.
-// If not specified, defaults to false
-func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolDataFormat(value string) MaxPoolAttr {
 	return func(m optionalAttr) {
-		m["cancel_pending_enqueues"] = value
+		m["data_format"] = value
 	}
 }
 
-// Closes the given queue.
-//
-// This operation signals that no more elements will be enqueued in the
-// given queue. Subsequent Enqueue(Many) operations will fail.
-// Subsequent Dequeue(Many) operations will continue to succeed if
-// sufficient elements remain in the queue. Subsequent Dequeue(Many)
-// operations that would block will fail immediately.
+// Performs max pooling on the input.
 //
 // Arguments:
-//	handle: The handle to a queue.
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
+// Returns The max pooled output tensor.
+func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueCloseV2",
+		Type: "MaxPool",
 		Input: []tf.Input{
-			handle,
+			input,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// QueueDequeueUpToV2Attr is an optional argument to QueueDequeueUpToV2.
-type QueueDequeueUpToV2Attr func(optionalAttr)
-
-// QueueDequeueUpToV2TimeoutMs sets the optional timeout_ms attribute to value.
-//
-// value: If the queue has fewer than n elements, this operation
-// will block for up to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueUpToV2TimeoutMs(value int64) QueueDequeueUpToV2Attr {
-	return func(m optionalAttr) {
-		m["timeout_ms"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Dequeues `n` tuples of one or more tensors from the given queue.
-//
-// This operation is not supported by all queues.  If a queue does not support
-// DequeueUpTo, then an Unimplemented error is returned.
-//
-// If the queue is closed and there are more than 0 but less than `n`
-// elements remaining, then instead of returning an OutOfRange error like
-// QueueDequeueMany, less than `n` elements are returned immediately.  If
-// the queue is closed and there are 0 elements left in the queue, then
-// an OutOfRange error is returned just like in QueueDequeueMany.
-// Otherwise the behavior is identical to QueueDequeueMany:
+// Bucketizes 'input' based on 'boundaries'.
 //
-// This operation concatenates queue-element component tensors along the
-// 0th dimension to make a single component tensor.  All of the components
-// in the dequeued tuple will have size n in the 0th dimension.
+// For example, if the inputs are
+//     boundaries = [0, 10, 100]
+//     input = [[-5, 10000]
+//              [150,   10]
+//              [5,    100]]
 //
-// This operation has `k` outputs, where `k` is the number of components in
-// the tuples stored in the given queue, and output `i` is the ith
-// component of the dequeued tuple.
+// then the output will be
+//     output = [[0, 3]
+//               [3, 2]
+//               [1, 3]]
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	n: The number of tuples to dequeue.
-//	component_types: The type of each component in a tuple.
+//	input: Any shape of Tensor contains with int or float type.
+//	boundaries: A sorted list of floats gives the boundary of the buckets.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueUpToV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueUpToV2Attr) (components []tf.Output) {
+// Returns Same shape with 'input', each value of input replaced with bucket index.
+//
+// @compatibility(numpy)
+// Equivalent to np.digitize.
+// @end_compatibility
+func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"boundaries": boundaries}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueUpToV2",
+		Type: "Bucketize",
 		Input: []tf.Input{
-			handle, n,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueUpToV2", err)
-		return
-	}
-	return components
-}
-
-// Deprecated. Use TensorArrayCloseV3
-//
-// Returns the created operation.
-func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV2",
-		Input: []tf.Input{
-			handle,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// QueueDequeueManyV2Attr is an optional argument to QueueDequeueManyV2.
-type QueueDequeueManyV2Attr func(optionalAttr)
-
-// QueueDequeueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
-//
-// value: If the queue has fewer than n elements, this operation
-// will block for up to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueManyV2TimeoutMs(value int64) QueueDequeueManyV2Attr {
-	return func(m optionalAttr) {
-		m["timeout_ms"] = value
-	}
+	return op.Output(0)
 }
 
-// Dequeues `n` tuples of one or more tensors from the given queue.
-//
-// If the queue is closed and there are fewer than `n` elements, then an
-// OutOfRange error is returned.
-//
-// This operation concatenates queue-element component tensors along the
-// 0th dimension to make a single component tensor.  All of the components
-// in the dequeued tuple will have size `n` in the 0th dimension.
-//
-// This operation has `k` outputs, where `k` is the number of components in
-// the tuples stored in the given queue, and output `i` is the ith
-// component of the dequeued tuple.
-//
-// N.B. If the queue is empty, this operation will block until `n` elements
-// have been dequeued (or 'timeout_ms' elapses, if specified).
+// Computes gradients of the maxpooling function.
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	n: The number of tuples to dequeue.
-//	component_types: The type of each component in a tuple.
+//	input: The original input.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+// output of `max_pool`.
+//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueManyV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueManyV2Attr) (components []tf.Output) {
+// Returns Gradients w.r.t. the input of `max_pool`.
+func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueManyV2",
+		Type: "MaxPoolGradWithArgmax",
 		Input: []tf.Input{
-			handle, n,
+			input, grad, argmax,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueManyV2", err)
-		return
-	}
-	return components
+	return op.Output(0)
 }
 
-// QueueEnqueueV2Attr is an optional argument to QueueEnqueueV2.
-type QueueEnqueueV2Attr func(optionalAttr)
+// CriticalSectionOpAttr is an optional argument to CriticalSectionOp.
+type CriticalSectionOpAttr func(optionalAttr)
 
-// QueueEnqueueV2TimeoutMs sets the optional timeout_ms attribute to value.
+// CriticalSectionOpContainer sets the optional container attribute to value.
 //
-// value: If the queue is full, this operation will block for up to
-// timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueEnqueueV2TimeoutMs(value int64) QueueEnqueueV2Attr {
+// value: the container this critical section is placed in.
+// If not specified, defaults to ""
+func CriticalSectionOpContainer(value string) CriticalSectionOpAttr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["container"] = value
 	}
 }
 
-// Enqueues a tuple of one or more tensors in the given queue.
-//
-// The components input has k elements, which correspond to the components of
-// tuples stored in the given queue.
-//
-// N.B. If the queue is full, this operation will block until the given
-// element has been enqueued (or 'timeout_ms' elapses, if specified).
-//
-// Arguments:
-//	handle: The handle to a queue.
-//	components: One or more tensors from which the enqueued tensors should be taken.
+// CriticalSectionOpSharedName sets the optional shared_name attribute to value.
 //
-// Returns the created operation.
-func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueV2Attr) (o *tf.Operation) {
+// value: the name by which this critical section is referred to.
+// If not specified, defaults to ""
+func CriticalSectionOpSharedName(value string) CriticalSectionOpAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a handle to a CriticalSection resource.
+func CriticalSectionOp(scope *Scope, optional ...CriticalSectionOpAttr) (resource tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -4711,69 +4630,58 @@ func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, opti
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueEnqueueV2",
-		Input: []tf.Input{
-			handle, tf.OutputList(components),
-		},
+		Type: "CriticalSectionOp",
+
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
-type ResourceStridedSliceAssignAttr func(optionalAttr)
-
-// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["begin_mask"] = value
-	}
-}
+// FakeQuantWithMinMaxArgsGradientAttr is an optional argument to FakeQuantWithMinMaxArgsGradient.
+type FakeQuantWithMinMaxArgsGradientAttr func(optionalAttr)
 
-// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
+// FakeQuantWithMinMaxArgsGradientMin sets the optional min attribute to value.
+// If not specified, defaults to -6
+func FakeQuantWithMinMaxArgsGradientMin(value float32) FakeQuantWithMinMaxArgsGradientAttr {
 	return func(m optionalAttr) {
-		m["end_mask"] = value
+		m["min"] = value
 	}
 }
 
-// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
+// FakeQuantWithMinMaxArgsGradientMax sets the optional max attribute to value.
+// If not specified, defaults to 6
+func FakeQuantWithMinMaxArgsGradientMax(value float32) FakeQuantWithMinMaxArgsGradientAttr {
 	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
+		m["max"] = value
 	}
 }
 
-// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
+// FakeQuantWithMinMaxArgsGradientNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxArgsGradientNumBits(value int64) FakeQuantWithMinMaxArgsGradientAttr {
 	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
+		m["num_bits"] = value
 	}
 }
 
-// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
+// FakeQuantWithMinMaxArgsGradientNarrowRange sets the optional narrow_range attribute to value.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxArgsGradientNarrowRange(value bool) FakeQuantWithMinMaxArgsGradientAttr {
 	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
+		m["narrow_range"] = value
 	}
 }
 
-// Assign `value` to the sliced l-value reference of `ref`.
-//
-// The values of `value` are assigned to the positions in the variable
-// `ref` that are selected by the slice parameters. The slice parameters
-// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
+// Compute gradients for a FakeQuantWithMinMaxArgs operation.
 //
-// NOTE this op currently does not support broadcasting and so `value`'s
-// shape must be exactly the shape produced by the slice of `ref`.
+// Arguments:
+//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxArgs operation.
+//	inputs: Values passed as inputs to the FakeQuantWithMinMaxArgs operation.
 //
-// Returns the created operation.
-func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
+// Returns Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:
+// `gradients * (inputs >= min && inputs <= max)`.
+func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsGradientAttr) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -4782,350 +4690,360 @@ func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, en
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceStridedSliceAssign",
+		Type: "FakeQuantWithMinMaxArgsGradient",
 		Input: []tf.Input{
-			ref, begin, end, strides, value,
+			gradients, inputs,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// UnstageAttr is an optional argument to Unstage.
-type UnstageAttr func(optionalAttr)
+// AvgPool3DAttr is an optional argument to AvgPool3D.
+type AvgPool3DAttr func(optionalAttr)
 
-// UnstageCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// AvgPool3DDataFormat sets the optional data_format attribute to value.
 //
-// REQUIRES: value >= 0
-func UnstageCapacity(value int64) UnstageAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DDataFormat(value string) AvgPool3DAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["data_format"] = value
 	}
 }
 
-// UnstageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Performs 3D average pooling on the input.
 //
-// REQUIRES: value >= 0
-func UnstageMemoryLimit(value int64) UnstageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// UnstageContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func UnstageContainer(value string) UnstageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// UnstageSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func UnstageSharedName(value string) UnstageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op is similar to a lightweight Dequeue.
+// Arguments:
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// The basic functionality is similar to dequeue with many fewer
-// capabilities and options.  This Op is optimized for performance.
-func Unstage(scope *Scope, dtypes []tf.DataType, optional ...UnstageAttr) (values []tf.Output) {
+// Returns The average pooled output tensor.
+func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Unstage",
-
+		Type: "AvgPool3D",
+		Input: []tf.Input{
+			input,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns element-wise remainder of division. This emulates C semantics in that
+//
+// the result here is consistent with a truncating divide. E.g.
+// `tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.
+//
+// *NOTE*: `Mod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("Unstage", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "Mod",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
-	return values
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// PriorityQueueV2Attr is an optional argument to PriorityQueueV2.
-type PriorityQueueV2Attr func(optionalAttr)
-
-// PriorityQueueV2ComponentTypes sets the optional component_types attribute to value.
-//
-// value: The type of each component in a value.
-// If not specified, defaults to <>
+// Computes square root of x element-wise.
 //
-// REQUIRES: len(value) >= 0
-func PriorityQueueV2ComponentTypes(value []tf.DataType) PriorityQueueV2Attr {
-	return func(m optionalAttr) {
-		m["component_types"] = value
+// I.e., \\(y = \sqrt{x} = x^{1/2}\\).
+func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Sqrt",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// PriorityQueueV2Capacity sets the optional capacity attribute to value.
+// Computes the gradients of 3-D convolution with respect to the filter.
 //
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func PriorityQueueV2Capacity(value int64) PriorityQueueV2Attr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
+//
+// Arguments:
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "Conv3DBackpropFilter",
+		Input: []tf.Input{
+			input, filter, out_backprop,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// PriorityQueueV2Container sets the optional container attribute to value.
+// Computes the gradient for the rsqrt of `x` wrt its input.
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func PriorityQueueV2Container(value string) PriorityQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RsqrtGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// PriorityQueueV2SharedName sets the optional shared_name attribute to value.
+// ReverseSequenceAttr is an optional argument to ReverseSequence.
+type ReverseSequenceAttr func(optionalAttr)
+
+// ReverseSequenceBatchDim sets the optional batch_dim attribute to value.
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func PriorityQueueV2SharedName(value string) PriorityQueueV2Attr {
+// value: The dimension along which reversal is performed.
+// If not specified, defaults to 0
+func ReverseSequenceBatchDim(value int64) ReverseSequenceAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["batch_dim"] = value
 	}
 }
 
-// A queue that produces elements sorted by the first component value.
+// Reverses variable length slices.
 //
-// Note that the PriorityQueue requires the first component of any element
-// to be a scalar int64, in addition to the other elements declared by
-// component_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue
-// and DequeueMany) on a PriorityQueue will all require (resp. output) one extra
-// entry in their input (resp. output) lists.
+// This op first slices `input` along the dimension `batch_dim`, and for each
+// slice `i`, reverses the first `seq_lengths[i]` elements along
+// the dimension `seq_dim`.
+//
+// The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
+// and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
+//
+// The output slice `i` along dimension `batch_dim` is then given by input
+// slice `i`, with the first `seq_lengths[i]` slices along dimension
+// `seq_dim` reversed.
+//
+// For example:
+//
+// ```
+// # Given this:
+// batch_dim = 0
+// seq_dim = 1
+// input.dims = (4, 8, ...)
+// seq_lengths = [7, 2, 3, 5]
+//
+// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
+// output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]
+// output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]
+// output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]
+// output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]
+//
+// # while entries past seq_lens are copied through:
+// output[0, 7:, :, ...] = input[0, 7:, :, ...]
+// output[1, 2:, :, ...] = input[1, 2:, :, ...]
+// output[2, 3:, :, ...] = input[2, 3:, :, ...]
+// output[3, 2:, :, ...] = input[3, 2:, :, ...]
+// ```
+//
+// In contrast, if:
+//
+// ```
+// # Given this:
+// batch_dim = 2
+// seq_dim = 0
+// input.dims = (8, ?, 4, ...)
+// seq_lengths = [7, 2, 3, 5]
+//
+// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
+// output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]
+// output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]
+// output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]
+// output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]
+//
+// # while entries past seq_lens are copied through:
+// output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]
+// output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]
+// output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]
+// output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
+// ```
 //
 // Arguments:
-//	shapes: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
+//	input: The input to reverse.
+//	seq_lengths: 1-D with length `input.dims(batch_dim)` and
+// `max(seq_lengths) <= input.dims(seq_dim)`
+//	seq_dim: The dimension which is partially reversed.
 //
-// Returns The handle to the queue.
-func PriorityQueueV2(scope *Scope, shapes []tf.Shape, optional ...PriorityQueueV2Attr) (handle tf.Output) {
+// Returns The partially reversed input. It has the same shape as `input`.
+func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_dim int64, optional ...ReverseSequenceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shapes": shapes}
+	attrs := map[string]interface{}{"seq_dim": seq_dim}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PriorityQueueV2",
-
+		Type: "ReverseSequence",
+		Input: []tf.Input{
+			input, seq_lengths,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StridedSliceAttr is an optional argument to StridedSlice.
-type StridedSliceAttr func(optionalAttr)
-
-// StridedSliceBeginMask sets the optional begin_mask attribute to value.
-//
-// value: a bitmask where a bit i being 1 means to ignore the begin
-// value and instead use the largest interval possible. At runtime
-// begin[i] will be replaced with `[0, n-1) if `stride[i] > 0` or
-// `[-1, n-1]` if `stride[i] < 0`
-// If not specified, defaults to 0
-func StridedSliceBeginMask(value int64) StridedSliceAttr {
-	return func(m optionalAttr) {
-		m["begin_mask"] = value
-	}
-}
-
-// StridedSliceEndMask sets the optional end_mask attribute to value.
-//
-// value: analogous to `begin_mask`
-// If not specified, defaults to 0
-func StridedSliceEndMask(value int64) StridedSliceAttr {
-	return func(m optionalAttr) {
-		m["end_mask"] = value
-	}
-}
-
-// StridedSliceEllipsisMask sets the optional ellipsis_mask attribute to value.
-//
-// value: a bitmask where bit `i` being 1 means the `i`th
-// position is actually an ellipsis. One bit at most can be 1.
-// If `ellipsis_mask == 0`, then an implicit ellipsis mask of `1 << (m+1)`
-// is provided. This means that `foo[3:5] == foo[3:5, ...]`. An ellipsis
-// implicitly creates as many range specifications as necessary to fully
-// specify the sliced range for every dimension. For example for a 4-dimensional
-// tensor `foo` the slice `foo[2, ..., 5:8]` implies `foo[2, :, :, 5:8]`.
-// If not specified, defaults to 0
-func StridedSliceEllipsisMask(value int64) StridedSliceAttr {
-	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
-	}
-}
+// DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
+type DepthwiseConv2dNativeAttr func(optionalAttr)
 
-// StridedSliceNewAxisMask sets the optional new_axis_mask attribute to value.
+// DepthwiseConv2dNativeDataFormat sets the optional data_format attribute to value.
 //
-// value: a bitmask where bit `i` being 1 means the `i`th
-// specification creates a new shape 1 dimension. For example
-// `foo[:4, tf.newaxis, :2]` would produce a shape `(4, 1, 2)` tensor.
-// If not specified, defaults to 0
-func StridedSliceNewAxisMask(value int64) StridedSliceAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr {
 	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
+		m["data_format"] = value
 	}
 }
 
-// StridedSliceShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// DepthwiseConv2dNativeDilations sets the optional dilations attribute to value.
 //
-// value: a bitmask where bit `i` implies that the `i`th
-// specification should shrink the dimensionality. begin and end
-// must imply a slice of size 1 in the dimension. For example in
-// python one might do `foo[:, 3, :]` which would result in
-// `shrink_axis_mask` being 2.
-// If not specified, defaults to 0
-func StridedSliceShrinkAxisMask(value int64) StridedSliceAttr {
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr {
 	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
+		m["dilations"] = value
 	}
 }
 
-// Return a strided slice from `input`.
-//
-// Note, most python users will want to use the Python `Tensor.__getitem__`
-// or `Variable.__getitem__` rather than this op directly.
-//
-// The goal of this op is to produce a new tensor with a subset of
-// the elements from the `n` dimensional `input` tensor. The subset is chosen using
-// a sequence of `m` sparse range specifications encoded into the arguments
-// of this function. Note, in some cases
-// `m` could be equal to `n`, but this need not be the case. Each
-// range specification entry can be one of the following:
-//
-// - An ellipsis (...). Ellipses are used to imply zero or more
-//   dimensions of full-dimension selection and are produced using
-//   `ellipsis_mask`. For example, `foo[...]` is the identity slice.
-//
-// - A new axis. This is used to insert a new shape=1 dimension and is
-//   produced using `new_axis_mask`. For example, `foo[:, ...]` where
-//   `foo` is shape `(3, 4)` produces a `(1, 3, 4)` tensor.
-//
-//
-// - A range `begin:end:stride`. This is used to specify how much to choose from
-//   a given dimension. `stride` can be any integer but 0.  `begin` is an integer
-//   which represents the index of the first value to select while `end` represents
-//   the index of the last value to select. The number of values selected in each
-//   dimension is `end - begin` if `stride > 0` and `begin - end` if `stride < 0`.
-//   `begin` and `end` can be negative where `-1` is the last element, `-2` is
-//   the second to last. `begin_mask` controls whether to replace the explicitly
-//   given `begin` with an implicit effective value of `0` if `stride > 0` and
-//   `-1` if `stride < 0`. `end_mask` is analogous but produces the number
-//   required to create the largest open interval. For example, given a shape
-//   `(3,)` tensor `foo[:]`, the effective `begin` and `end` are `0` and `3`. Do
-//   not assume this is equivalent to `foo[0:-1]` which has an effective `begin`
-//   and `end` of `0` and `2`. Another example is `foo[-2::-1]` which reverses the
-//   first dimension of a tensor while dropping the last two (in the original
-//   order elements). For example `foo = [1,2,3,4]; foo[-2::-1]` is `[4,3]`.
-//
-// - A single index. This is used to keep only elements that have a given
-//   index. For example (`foo[2, :]` on a shape `(5,6)` tensor produces a
-//   shape `(6,)` tensor. This is encoded in `begin` and `end` and
-//   `shrink_axis_mask`.
+// Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors.
 //
-// Each conceptual range specification is encoded in the op's argument. This
-// encoding is best understand by considering a non-trivial example. In
-// particular,
-// `foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as
+// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+// and a filter / kernel tensor of shape
+// `[filter_height, filter_width, in_channels, channel_multiplier]`, containing
+// `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
+// a different filter to each input channel (expanding from 1 channel to
+// `channel_multiplier` channels for each), then concatenates the results
+// together. Thus, the output has `in_channels * channel_multiplier` channels.
 //
 // ```
-// begin = [1, 2, x, x, 0, x] # x denotes don't care (usually 0)
-// end = [2, 4, x, x, -3, x]
-// strides = [1, 1, x, x, -1, 1]
-// begin_mask = 1<<4 | 1 << 5 = 48
-// end_mask = 1<<5 = 32
-// ellipsis_mask = 1<<3 = 8
-// new_axis_mask = 1<<2 4
-// shrink_axis_mask = 1<<0
+// for k in 0..in_channels-1
+//   for q in 0..channel_multiplier-1
+//     output[b, i, j, k * channel_multiplier + q] =
+//       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
+//                         filter[di, dj, k, q]
 // ```
 //
-// In this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of
-// the slice becomes (2, 1, 5, 5, 2, 5).
-// Let us walk step by step through each argument specification.
-//
-// 1.  The first argument in the example slice is turned into `begin = 1` and
-// `end = begin + 1 = 2`. To disambiguate from the original spec `2:4` we
-// also set the appropriate bit in `shrink_axis_mask`.
-//
-// 2. `2:4` is contributes 2, 4, 1 to begin, end, and stride. All masks have
-// zero bits contributed.
+// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
 //
-// 3. None is a synonym for `tf.newaxis`. This means insert a dimension of size 1
-// dimension in the final shape. Dummy values are contributed to begin,
-// end and stride, while the new_axis_mask bit is set.
+// Arguments:
 //
-// 4. `...` grab the full ranges from as many dimensions as needed to
-// fully specify a slice for every dimension of the input shape.
 //
-// 5. `:-3:-1` shows the use of negative indices. A negative index `i` associated
-// with a dimension that has shape `s` is converted to a positive index
-// `s + i`. So `-1` becomes `s-1` (i.e. the last element). This conversion
-// is done internally so begin, end and strides receive x, -3, and -1.
-// The appropriate begin_mask bit is set to indicate the start range is the
-// full range (ignoring the x).
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`.
+//	padding: The type of padding algorithm to use.
+func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DepthwiseConv2dNative",
+		Input: []tf.Input{
+			input, filter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
+type TensorArrayGatherV3Attr func(optionalAttr)
+
+// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
 //
-// 6. `:` indicates that the entire contents of the corresponding dimension
-// is selected. This is equivalent to `::` or `0::1`. begin, end, and strides
-// receive 0, 0, and 1, respectively. The appropriate bits in `begin_mask` and
-// `end_mask` are also set.
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
+	}
+}
+
+// Gather specific elements from the TensorArray into output `value`.
 //
-// *Requirements*:
-//   `0 != strides[i] for i in [0, m)`
-//   `ellipsis_mask must be a power of two (only one ellipsis)`
+// All elements selected by `indices` must have the same shape.
 //
 // Arguments:
+//	handle: The handle to a TensorArray.
+//	indices: The locations in the TensorArray from which to read tensor elements.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
 //
-//	begin: `begin[k]` specifies the offset into the `k`th range specification.
-// The exact dimension this corresponds to will be determined by context.
-// Out-of-bounds values will be silently clamped. If the `k`th bit of
-// `begin_mask` then `begin[k]` is ignored and the full range of the
-// appropriate dimension is used instead. Negative values causes indexing
-// to start from the highest element e.g. If `foo==[1,2,3]` then `foo[-1]==3`.
-//	end: `end[i]` is like `begin` with the exception that `end_mask` is
-// used to determine full ranges.
-//	strides: `strides[i]` specifies the increment in the `i`th specification
-// after extracting a given element. Negative indices will reverse
-// the original order. Out or range values are
-// clamped to `[0,dim[i]) if slice[i]>0` or `[-1,dim[i]-1] if slice[i] < 0`
-func StridedSlice(scope *Scope, input tf.Output, begin tf.Output, end tf.Output, strides tf.Output, optional ...StridedSliceAttr) (output tf.Output) {
+// Returns All of the elements in the TensorArray, concatenated along a new
+// axis (the new dimension 0).
+func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StridedSlice",
+		Type: "TensorArrayGatherV3",
 		Input: []tf.Input{
-			input, begin, end, strides,
+			handle, indices, flow_in,
 		},
 		Attrs: attrs,
 	}
@@ -5133,107 +5051,155 @@ func StridedSlice(scope *Scope, input tf.Output, begin tf.Output, end tf.Output,
 	return op.Output(0)
 }
 
-// Interleave the values from the `data` tensors into a single tensor.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// Builds a merged tensor such that
+// The hash function is deterministic on the content of the string within the
+// process and will never change. However, it is not suitable for cryptography.
+// This function may be used when CPU time is scarce and inputs are trusted or
+// unimportant. There is a risk of adversaries constructing inputs that all hash
+// to the same bucket. To prevent this problem, use a strong hash function with
+// `tf.string_to_hash_bucket_strong`.
 //
-// ```python
-//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
-// ```
+// Arguments:
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
 //
-// For example, if each `indices[m]` is scalar or vector, we have
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
+	opspec := tf.OpSpec{
+		Type: "StringToHashBucketFast",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
 //
-// ```python
-//     # Scalar indices:
-//     merged[indices[m], ...] = data[m][...]
+// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Maximum",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Outputs all keys and values in the table.
 //
-//     # Vector indices:
-//     merged[indices[m][i], ...] = data[m][i, ...]
-// ```
+// Arguments:
+//	table_handle: Handle to the table.
 //
-// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
-// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
-// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
-// `constant`, the output shape is
 //
-//     merged.shape = [max(indices)] + constant
 //
-// Values may be merged in parallel, so if an index appears in both `indices[m][i]`
-// and `indices[n][j]`, the result may be invalid. This differs from the normal
-// DynamicStitch operator that defines the behavior in that case.
+// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
+func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
+	opspec := tf.OpSpec{
+		Type: "LookupTableExportV2",
+		Input: []tf.Input{
+			table_handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Real-valued fast Fourier transform.
 //
-// For example:
+// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most dimension of `input`.
 //
-// ```python
-//     indices[0] = 6
-//     indices[1] = [4, 1]
-//     indices[2] = [[5, 2], [0, 3]]
-//     data[0] = [61, 62]
-//     data[1] = [[41, 42], [11, 12]]
-//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
-//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
-//               [51, 52], [61, 62]]
-// ```
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
+// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
+// followed by the `fft_length / 2` positive-frequency terms.
 //
-// This method can be used to merge partitions created by `dynamic_partition`
-// as illustrated on the following example:
+// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
-// ```python
-//     # Apply function (increments x_i) on elements for which a certain condition
-//     # apply (x_i != -1 in this example).
-//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
-//     condition_mask=tf.not_equal(x,tf.constant(-1.))
-//     partitioned_data = tf.dynamic_partition(
-//         x, tf.cast(condition_mask, tf.int32) , 2)
-//     partitioned_data[1] = partitioned_data[1] + 1.0
-//     condition_indices = tf.dynamic_partition(
-//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
-//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
-//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
-//     # unchanged.
-// ```
+// Arguments:
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
-// </div>
-func ParallelDynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
+// Returns A complex64 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
+//   frequency components of its 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft
+// @end_compatibility
+func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ParallelDynamicStitch",
+		Type: "RFFT",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(data),
+			input, fft_length,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayGatherV2Attr is an optional argument to TensorArrayGatherV2.
-type TensorArrayGatherV2Attr func(optionalAttr)
+// ComplexAttr is an optional argument to Complex.
+type ComplexAttr func(optionalAttr)
 
-// TensorArrayGatherV2ElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayGatherV2ElementShape(value tf.Shape) TensorArrayGatherV2Attr {
+// ComplexTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_COMPLEX64
+func ComplexTout(value tf.DataType) ComplexAttr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["Tout"] = value
 	}
 }
 
-// Deprecated. Use TensorArrayGatherV3
-func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV2Attr) (value tf.Output) {
+// Converts two real numbers to a complex number.
+//
+// Given a tensor `real` representing the real part of a complex number, and a
+// tensor `imag` representing the imaginary part of a complex number, this
+// operation returns complex numbers elementwise of the form \\(a + bj\\), where
+// *a* represents the `real` part and *b* represents the `imag` part.
+//
+// The input tensors `real` and `imag` must have the same shape.
+//
+// For example:
+//
+// ```
+// # tensor 'real' is [2.25, 3.25]
+// # tensor `imag` is [4.75, 5.75]
+// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
+// ```
+func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGatherV2",
+		Type: "Complex",
 		Input: []tf.Input{
-			handle, indices, flow_in,
+			real, imag,
 		},
 		Attrs: attrs,
 	}
@@ -5241,256 +5207,283 @@ func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow
 	return op.Output(0)
 }
 
-// Interleave the values from the `data` tensors into a single tensor.
-//
-// Builds a merged tensor such that
-//
-// ```python
-//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
-// ```
-//
-// For example, if each `indices[m]` is scalar or vector, we have
-//
-// ```python
-//     # Scalar indices:
-//     merged[indices[m], ...] = data[m][...]
-//
-//     # Vector indices:
-//     merged[indices[m][i], ...] = data[m][i, ...]
-// ```
-//
-// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
-// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
-// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
-// `constant`, the output shape is
-//
-//     merged.shape = [max(indices)] + constant
+// ImagAttr is an optional argument to Imag.
+type ImagAttr func(optionalAttr)
+
+// ImagTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ImagTout(value tf.DataType) ImagAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Returns the imaginary part of a complex number.
 //
-// Values are merged in order, so if an index appears in both `indices[m][i]` and
-// `indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the
-// merged result. If you do not need this guarantee, ParallelDynamicStitch might
-// perform better on some devices.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the imaginary part of each element in `input`. All
+// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part returned by this operation.
 //
 // For example:
 //
-// ```python
-//     indices[0] = 6
-//     indices[1] = [4, 1]
-//     indices[2] = [[5, 2], [0, 3]]
-//     data[0] = [61, 62]
-//     data[1] = [[41, 42], [11, 12]]
-//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
-//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
-//               [51, 52], [61, 62]]
 // ```
-//
-// This method can be used to merge partitions created by `dynamic_partition`
-// as illustrated on the following example:
-//
-// ```python
-//     # Apply function (increments x_i) on elements for which a certain condition
-//     # apply (x_i != -1 in this example).
-//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
-//     condition_mask=tf.not_equal(x,tf.constant(-1.))
-//     partitioned_data = tf.dynamic_partition(
-//         x, tf.cast(condition_mask, tf.int32) , 2)
-//     partitioned_data[1] = partitioned_data[1] + 1.0
-//     condition_indices = tf.dynamic_partition(
-//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
-//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
-//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
-//     # unchanged.
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.imag(input) ==> [4.75, 5.75]
 // ```
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
-// </div>
-func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
+func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DynamicStitch",
+		Type: "Imag",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(data),
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Produces a summary of any statistics recorded by the given statistics manager.
-func StatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output) {
+// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
+//
+// The Hurwitz zeta function is defined as:
+//
+//
+// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
+func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "StatsAggregatorSummary",
+		Type: "Zeta",
 		Input: []tf.Input{
-			iterator,
+			x, q,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FIFOQueueV2Attr is an optional argument to FIFOQueueV2.
-type FIFOQueueV2Attr func(optionalAttr)
+// LRNGradAttr is an optional argument to LRNGrad.
+type LRNGradAttr func(optionalAttr)
 
-// FIFOQueueV2Shapes sets the optional shapes attribute to value.
-//
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
-// If not specified, defaults to <>
+// LRNGradDepthRadius sets the optional depth_radius attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func FIFOQueueV2Shapes(value []tf.Shape) FIFOQueueV2Attr {
+// value: A depth radius.
+// If not specified, defaults to 5
+func LRNGradDepthRadius(value int64) LRNGradAttr {
 	return func(m optionalAttr) {
-		m["shapes"] = value
+		m["depth_radius"] = value
 	}
 }
 
-// FIFOQueueV2Capacity sets the optional capacity attribute to value.
+// LRNGradBias sets the optional bias attribute to value.
 //
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func FIFOQueueV2Capacity(value int64) FIFOQueueV2Attr {
+// value: An offset (usually > 0 to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNGradBias(value float32) LRNGradAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["bias"] = value
 	}
 }
 
-// FIFOQueueV2Container sets the optional container attribute to value.
+// LRNGradAlpha sets the optional alpha attribute to value.
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func FIFOQueueV2Container(value string) FIFOQueueV2Attr {
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNGradAlpha(value float32) LRNGradAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["alpha"] = value
 	}
 }
 
-// FIFOQueueV2SharedName sets the optional shared_name attribute to value.
+// LRNGradBeta sets the optional beta attribute to value.
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func FIFOQueueV2SharedName(value string) FIFOQueueV2Attr {
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNGradBeta(value float32) LRNGradAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["beta"] = value
 	}
 }
 
-// A queue that produces elements in first-in first-out order.
+// Gradients for Local Response Normalization.
 //
 // Arguments:
-//	component_types: The type of each component in a value.
+//	input_grads: 4-D with shape `[batch, height, width, channels]`.
+//	input_image: 4-D with shape `[batch, height, width, channels]`.
+//	output_image: 4-D with shape `[batch, height, width, channels]`.
 //
-// Returns The handle to the queue.
-func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQueueV2Attr) (handle tf.Output) {
+// Returns The gradients for LRN.
+func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FIFOQueueV2",
-
+		Type: "LRNGrad",
+		Input: []tf.Input{
+			input_grads, input_image, output_image,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts the given `resource_handle` representing an iterator to a variant tensor.
+// AnyAttr is an optional argument to Any.
+type AnyAttr func(optionalAttr)
+
+// AnyKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func AnyKeepDims(value bool) AnyAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the "logical or" of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	resource_handle: A handle to an iterator resource.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns A variant tensor storing the state of the iterator contained in the
-// resource.
-func SerializeIterator(scope *Scope, resource_handle tf.Output) (serialized tf.Output) {
+// Returns The reduced tensor.
+func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SerializeIterator",
+		Type: "Any",
 		Input: []tf.Input{
-			resource_handle,
+			input, axis,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Return a tensor with the same shape and contents as the input tensor or value.
-func Identity(scope *Scope, input tf.Output) (output tf.Output) {
+// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
+type ResourceApplyFtrlAttr func(optionalAttr)
+
+// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the Ftrl-proximal scheme.
+//
+// accum_new = accum + grad * grad
+// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 regulariation. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Identity",
+		Type: "ResourceApplyFtrl",
 		Input: []tf.Input{
-			input,
+			var_, accum, linear, grad, lr, l1, l2, lr_power,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// IteratorFromStringHandleAttr is an optional argument to IteratorFromStringHandle.
-type IteratorFromStringHandleAttr func(optionalAttr)
+// RandomUniformAttr is an optional argument to RandomUniform.
+type RandomUniformAttr func(optionalAttr)
 
-// IteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
-//
-// value: If specified, defines the type of each tuple component in an
-// element produced by the resulting iterator.
-// If not specified, defaults to <>
+// RandomUniformSeed sets the optional seed attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromStringHandleAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformSeed(value int64) RandomUniformAttr {
 	return func(m optionalAttr) {
-		m["output_types"] = value
+		m["seed"] = value
 	}
 }
 
-// IteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
-//
-// value: If specified, defines the shape of each tuple component in an
-// element produced by the resulting iterator.
-// If not specified, defaults to <>
+// RandomUniformSeed2 sets the optional seed2 attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func IteratorFromStringHandleOutputShapes(value []tf.Shape) IteratorFromStringHandleAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformSeed2(value int64) RandomUniformAttr {
 	return func(m optionalAttr) {
-		m["output_shapes"] = value
+		m["seed2"] = value
 	}
 }
 
-// Converts the given string representing a handle to an iterator to a resource.
+// Outputs random values from a uniform distribution.
+//
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
 //
 // Arguments:
-//	string_handle: A string representation of the given handle.
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
 //
-// Returns A handle to an iterator resource.
-func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...IteratorFromStringHandleAttr) (resource_handle tf.Output) {
+// Returns A tensor of the specified shape filled with uniform random values.
+func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "IteratorFromStringHandle",
+		Type: "RandomUniform",
 		Input: []tf.Input{
-			string_handle,
+			shape,
 		},
 		Attrs: attrs,
 	}
@@ -5498,21 +5491,30 @@ func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ..
 	return op.Output(0)
 }
 
-// ShapeNAttr is an optional argument to ShapeN.
-type ShapeNAttr func(optionalAttr)
+// AssertAttr is an optional argument to Assert.
+type AssertAttr func(optionalAttr)
 
-// ShapeNOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func ShapeNOutType(value tf.DataType) ShapeNAttr {
+// AssertSummarize sets the optional summarize attribute to value.
+//
+// value: Print this many entries of each tensor.
+// If not specified, defaults to 3
+func AssertSummarize(value int64) AssertAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["summarize"] = value
 	}
 }
 
-// Returns shape of tensors.
+// Asserts that the given condition is true.
 //
-// This operation returns N 1-D integer tensors representing shape of `input[i]s`.
-func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []tf.Output) {
+// If `condition` evaluates to false, print the list of tensors in `data`.
+// `summarize` determines how many entries of the tensors to print.
+//
+// Arguments:
+//	condition: The condition to evaluate.
+//	data: The tensors to print out when condition is false.
+//
+// Returns the created operation.
+func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -5521,88 +5523,79 @@ func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ShapeN",
+		Type: "Assert",
 		Input: []tf.Input{
-			tf.OutputList(input),
+			condition, tf.OutputList(data),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("ShapeN", err)
-		return
-	}
-	return output
+	return scope.AddOperation(opspec)
 }
 
-// Converts the given `resource_handle` representing an iterator to a string.
+// Computes element-wise population count (a.k.a. popcount, bitsum, bitcount).
 //
-// Arguments:
-//	resource_handle: A handle to an iterator resource.
+// For each entry in `x`, calculates the number of `1` (on) bits in the binary
+// representation of that entry.
 //
-// Returns A string representation of the given handle.
-func IteratorToStringHandle(scope *Scope, resource_handle tf.Output) (string_handle tf.Output) {
+// **NOTE**: It is more efficient to first `tf.bitcast` your tensors into
+// `int32` or `int64` and perform the bitcount on the result, than to feed in
+// 8- or 16-bit inputs and then aggregate the resulting counts.
+func PopulationCount(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IteratorToStringHandle",
+		Type: "PopulationCount",
 		Input: []tf.Input{
-			resource_handle,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Outputs the single element from the given dataset.
+// Split a `SparseTensor` into `num_split` tensors along one dimension.
 //
-// Arguments:
-//	dataset: A handle to a dataset that contains a single element.
+// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
+// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
+// For example, if `split_dim = 1` and `num_split = 2` and the input is
 //
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
 //
+// Graphically the output tensors are:
 //
-// Returns The components of the single element of `input`.
-func DatasetToSingleElement(scope *Scope, dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "DatasetToSingleElement",
-		Input: []tf.Input{
-			dataset,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("DatasetToSingleElement", err)
-		return
-	}
-	return components
-}
-
-// Gets the next output from the given iterator.
-func IteratorGetNext(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+//     output_tensor[0] = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
+//
+//     output_tensor[1] = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
+//
+// Arguments:
+//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
+// `[0, rank(shape))`.
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
+//	num_split: The number of ways to split.
+//
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"num_split": num_split}
 	opspec := tf.OpSpec{
-		Type: "IteratorGetNext",
+		Type: "SparseSplit",
 		Input: []tf.Input{
-			iterator,
+			split_dim, indices, values, shape,
 		},
 		Attrs: attrs,
 	}
@@ -5612,354 +5605,265 @@ func IteratorGetNext(scope *Scope, iterator tf.Output, output_types []tf.DataTyp
 	}
 	var idx int
 	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("IteratorGetNext", err)
-		return
-	}
-	return components
-}
-
-// Makes a new iterator from the given `dataset` and stores it in `iterator`.
-//
-// This operation may be executed multiple times. Each execution will reset the
-// iterator in `iterator` to the first element of `dataset`.
-//
-// Returns the created operation.
-func MakeIterator(scope *Scope, dataset tf.Output, iterator tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MakeIterator",
-		Input: []tf.Input{
-			dataset, iterator,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Creates a dataset that emits the records from one or more TFRecord files.
-//
-// Arguments:
-//	filenames: A scalar or vector containing the name(s) of the file(s) to be
-// read.
-//	compression_type: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//	buffer_size: A scalar representing the number of bytes to buffer. A value of
-// 0 means no buffering will be performed.
-func TFRecordDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
+	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "TFRecordDataset",
-		Input: []tf.Input{
-			filenames, compression_type, buffer_size,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Concatenates quantized tensors along one dimension.
-//
-// Arguments:
-//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [0, rank(values)).
-//	values: The `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	input_mins: The minimum scalar values for each of the input tensors.
-//	input_maxes: The maximum scalar values for each of the input tensors.
-//
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
-	if scope.Err() != nil {
+	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedConcat",
-		Input: []tf.Input{
-			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Creates a dataset that emits the records from one or more binary files.
-//
-// Arguments:
-//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
-// read.
-//	header_bytes: A scalar representing the number of bytes to skip at the
-// beginning of a file.
-//	record_bytes: A scalar representing the number of bytes in each record.
-//	footer_bytes: A scalar representing the number of bytes to skip at the end
-// of a file.
-//	buffer_size: A scalar representing the number of bytes to buffer. Must be > 0.
-func FixedLengthRecordDataset(scope *Scope, filenames tf.Output, header_bytes tf.Output, record_bytes tf.Output, footer_bytes tf.Output, buffer_size tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
+	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "FixedLengthRecordDataset",
-		Input: []tf.Input{
-			filenames, header_bytes, record_bytes, footer_bytes, buffer_size,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return output_indices, output_values, output_shape
 }
 
-// Creates a dataset that executes a SQL query and emits rows of the result set.
-//
-// Arguments:
-//	driver_name: The database type. Currently, the only supported type is 'sqlite'.
-//	data_source_name: A connection string to connect to the database.
-//	query: A SQL query to execute.
-//
+// Returns the truth value of (x < y) element-wise.
 //
-func SqlDataset(scope *Scope, driver_name tf.Output, data_source_name tf.Output, query tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// *NOTE*: `Less` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SqlDataset",
+		Type: "Less",
 		Input: []tf.Input{
-			driver_name, data_source_name, query,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// PlaceholderAttr is an optional argument to Placeholder.
-type PlaceholderAttr func(optionalAttr)
+// QuantizedReluXAttr is an optional argument to QuantizedReluX.
+type QuantizedReluXAttr func(optionalAttr)
 
-// PlaceholderShape sets the optional shape attribute to value.
-//
-// value: (Optional) The shape of the tensor. If the shape has 0 dimensions, the
-// shape is unconstrained.
-// If not specified, defaults to <unknown_rank:true >
-func PlaceholderShape(value tf.Shape) PlaceholderAttr {
+// QuantizedReluXOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
 	return func(m optionalAttr) {
-		m["shape"] = value
+		m["out_type"] = value
 	}
 }
 
-// A placeholder op for a value that will be fed into the computation.
-//
-// N.B. This operation will fail with an error if it is executed. It is
-// intended as a way to represent a value that will always be fed, and to
-// provide attrs that enable the fed value to be checked at runtime.
+// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
 //
 // Arguments:
-//	dtype: The type of elements in the tensor.
 //
-// Returns A placeholder tensor that must be replaced using the feed mechanism.
-func Placeholder(scope *Scope, dtype tf.DataType, optional ...PlaceholderAttr) (output tf.Output) {
+//
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Placeholder",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that caches elements from `input_dataset`.
-//
-// A CacheDataset will iterate over the input_dataset, and store tensors. If the
-// cache already exists, the cache will be used. If the cache is inappropriate
-// (e.g. cannot be opened, contains tensors of the wrong shape / size), an error
-// will the returned when used.
-//
-// Arguments:
-//
-//	filename: A path on the filesystem where we should cache the dataset. Note: this
-// will be a directory.
-//
-//
-func CacheDataset(scope *Scope, input_dataset tf.Output, filename tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "CacheDataset",
+		Type: "QuantizedReluX",
 		Input: []tf.Input{
-			input_dataset, filename,
+			features, max_value, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Identity op for gradient debugging.
-//
-// This op is hidden from public in Python. It is used by TensorFlow Debugger to
-// register gradient tensors for gradient debugging.
-func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// SummaryWriterAttr is an optional argument to SummaryWriter.
+type SummaryWriterAttr func(optionalAttr)
+
+// SummaryWriterSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func SummaryWriterSharedName(value string) SummaryWriterAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "DebugGradientIdentity",
-		Input: []tf.Input{
-			input,
-		},
+}
+
+// SummaryWriterContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func SummaryWriterContainer(value string) SummaryWriterAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayGradV3
-func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
+// Returns a handle to be used to access a summary writer.
+//
+// The summary writer is an in-graph resource which can be used by ops to write
+// summaries to event files.
+//
+// Returns the summary writer resource. Scalar handle.
+func SummaryWriter(scope *Scope, optional ...SummaryWriterAttr) (writer tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"source": source}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGradV2",
-		Input: []tf.Input{
-			handle, flow_in,
-		},
+		Type: "SummaryWriter",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that yields a SparseTensor for each element of the input.
-//
-// Arguments:
-//	input_dataset: A handle to an input dataset. Must have a single component.
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	row_shape: A vector representing the dense shape of each row in the produced
-// SparseTensor. The shape may be partially specified, using `-1` to indicate
-// that a particular dimension should use the maximum size of all batch elements.
+// Computes gradients for SparseSegmentMean.
 //
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
 //
-func DenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, row_shape tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Arguments:
+//	grad: gradient propagated to the SparseSegmentMean op.
+//	indices: indices passed to the corresponding SparseSegmentMean op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
+func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "DenseToSparseBatchDataset",
+		Type: "SparseSegmentMeanGrad",
 		Input: []tf.Input{
-			input_dataset, batch_size, row_shape,
+			grad, indices, segment_ids, output_dim0,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that batches and pads `batch_size` elements from the input.
+// Applies softmax to a batched N-D `SparseTensor`.
 //
-// Arguments:
+// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
+// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	padded_shapes: A list of int64 tensors representing the desired padded shapes
-// of the corresponding output components. These shapes may be partially
-// specified, using `-1` to indicate that a particular dimension should be
-// padded to the maximum size of all batch elements.
-//	padding_values: A list of scalars containing the padding value to use for
-// each of the outputs.
+// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
+// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
+// zero elements do not participate*.  Specifically, the algorithm is equivalent
+// to the following:
 //
-func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
+//       with shape `[B, C]`, along the size-C dimension;
+//   (2) Masks out the original implicitly-zero locations;
+//   (3) Renormalizes the remaining elements.
+//
+// Hence, the `SparseTensor` result has exactly the same non-zero indices and
+// shape.
+//
+// Arguments:
+//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
+// SparseTensor, in canonical ordering.
+//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//
+// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
+func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "PaddedBatchDataset",
+		Type: "SparseSoftmax",
 		Input: []tf.Input{
-			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values),
+			sp_indices, sp_values, sp_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayConcatV2Attr is an optional argument to TensorArrayConcatV2.
-type TensorArrayConcatV2Attr func(optionalAttr)
+// RandomPoissonAttr is an optional argument to RandomPoisson.
+type RandomPoissonAttr func(optionalAttr)
 
-// TensorArrayConcatV2ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayConcatV2ElementShapeExcept0(value tf.Shape) TensorArrayConcatV2Attr {
+// RandomPoissonSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func RandomPoissonSeed(value int64) RandomPoissonAttr {
 	return func(m optionalAttr) {
-		m["element_shape_except0"] = value
+		m["seed"] = value
 	}
 }
 
-// Deprecated. Use TensorArrayConcatV3
-func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV2Attr) (value tf.Output, lengths tf.Output) {
+// RandomPoissonSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func RandomPoissonSeed2(value int64) RandomPoissonAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Use RandomPoissonV2 instead.
+//
+// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2
+func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayConcatV2",
+		Type: "RandomPoisson",
 		Input: []tf.Input{
-			handle, flow_in,
+			shape, rate,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Converts the given variant tensor to an iterator and stores it in the given resource.
-//
-// Arguments:
-//	resource_handle: A handle to an iterator resource.
-//	serialized: A variant tensor storing the state of the iterator contained in the
-// resource.
+// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2.
+type MaxPoolGradV2Attr func(optionalAttr)
+
+// MaxPoolGradV2DataFormat sets the optional data_format attribute to value.
 //
-// Returns the created operation.
-func DeserializeIterator(scope *Scope, resource_handle tf.Output, serialized tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DeserializeIterator",
-		Input: []tf.Input{
-			resource_handle, serialized,
-		},
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Records the latency of producing `input_dataset` elements in a StatsAggregator.
-func LatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Computes gradients of the maxpooling function.
+//
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "LatencyStatsDataset",
+		Type: "MaxPoolGradV2",
 		Input: []tf.Input{
-			input_dataset, tag,
+			orig_input, orig_output, grad, ksize, strides,
 		},
 		Attrs: attrs,
 	}
@@ -5967,223 +5871,164 @@ func LatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, o
 	return op.Output(0)
 }
 
-// Concatenates tensors along one dimension.
+// Restore a reader to a previously saved state.
+//
+// Not all Readers support being restored, so this can produce an
+// Unimplemented error.
 //
 // Arguments:
-//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [-rank(values), rank(values)).
+//	reader_handle: Handle to a Reader.
+//	state: Result of a ReaderSerializeState of a Reader with type
+// matching reader_handle.
 //
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.
-func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
+// Returns the created operation.
+func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ConcatV2",
+		Type: "ReaderRestoreStateV2",
 		Input: []tf.Input{
-			tf.OutputList(values), axis,
+			reader_handle, state,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset that contains the elements of `input_dataset` ignoring errors.
-func IgnoreErrorsDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "IgnoreErrorsDataset",
-		Input: []tf.Input{
-			input_dataset,
-		},
-		Attrs: attrs,
+// ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
+type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
+
+// ResourceSparseApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a dataset that concatenates `input_dataset` with `another_dataset`.
-func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+//
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
+//
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ConcatenateDataset",
+		Type: "ResourceSparseApplyFtrlV2",
 		Input: []tf.Input{
-			input_dataset, another_dataset,
+			var_, accum, linear, grad, indices, lr, l1, l2, l2_shrinkage, lr_power,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset that splits a SparseTensor into elements row-wise.
-func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
+// Associates the given iterator with the given statistics aggregator.
+//
+// Returns the created operation.
+func IteratorSetStatsAggregator(scope *Scope, iterator_handle tf.Output, stats_aggregator_handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorSliceDataset",
+		Type: "IteratorSetStatsAggregator",
 		Input: []tf.Input{
-			indices, values, dense_shape,
+			iterator_handle, stats_aggregator_handle,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Reshapes a tensor.
-//
-// Given `tensor`, this operation returns a tensor that has the same values
-// as `tensor` with shape `shape`.
-//
-// If one component of `shape` is the special value -1, the size of that dimension
-// is computed so that the total size remains constant.  In particular, a `shape`
-// of `[-1]` flattens into 1-D.  At most one component of `shape` can be -1.
-//
-// If `shape` is 1-D or higher, then the operation returns a tensor with shape
-// `shape` filled with the values of `tensor`. In this case, the number of elements
-// implied by `shape` must be the same as the number of elements in `tensor`.
-//
-// For example:
-//
-// ```
-// # tensor 't' is [1, 2, 3, 4, 5, 6, 7, 8, 9]
-// # tensor 't' has shape [9]
-// reshape(t, [3, 3]) ==> [[1, 2, 3],
-//                         [4, 5, 6],
-//                         [7, 8, 9]]
-//
-// # tensor 't' is [[[1, 1], [2, 2]],
-// #                [[3, 3], [4, 4]]]
-// # tensor 't' has shape [2, 2, 2]
-// reshape(t, [2, 4]) ==> [[1, 1, 2, 2],
-//                         [3, 3, 4, 4]]
-//
-// # tensor 't' is [[[1, 1, 1],
-// #                 [2, 2, 2]],
-// #                [[3, 3, 3],
-// #                 [4, 4, 4]],
-// #                [[5, 5, 5],
-// #                 [6, 6, 6]]]
-// # tensor 't' has shape [3, 2, 3]
-// # pass '[-1]' to flatten 't'
-// reshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]
-//
-// # -1 can also be used to infer the shape
-//
-// # -1 is inferred to be 9:
-// reshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
-//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
-// # -1 is inferred to be 2:
-// reshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
-//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
-// # -1 is inferred to be 3:
-// reshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1],
-//                               [2, 2, 2],
-//                               [3, 3, 3]],
-//                              [[4, 4, 4],
-//                               [5, 5, 5],
-//                               [6, 6, 6]]]
-//
-// # tensor 't' is [7]
-// # shape `[]` reshapes to a scalar
-// reshape(t, []) ==> 7
-// ```
-//
-// Arguments:
-//
-//	shape: Defines the shape of the output tensor.
-func Reshape(scope *Scope, tensor tf.Output, shape tf.Output) (output tf.Output) {
+// Returns element-wise smallest integer in not less than x.
+func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Reshape",
+		Type: "Ceil",
 		Input: []tf.Input{
-			tensor, shape,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
-//
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-//
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//
-//   selected_indices = tf.image.non_max_suppression_v2(
-//       boxes, scores, max_output_size, iou_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// Computes the number of elements in the given table.
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too much with respect to IOU.
+//	table_handle: Handle to the table.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output) (selected_indices tf.Output) {
+// Returns Scalar that contains number of elements in the table.
+func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionV2",
+		Type: "LookupTableSizeV2",
 		Input: []tf.Input{
-			boxes, scores, max_output_size, iou_threshold,
+			table_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StatsAggregatorHandleAttr is an optional argument to StatsAggregatorHandle.
-type StatsAggregatorHandleAttr func(optionalAttr)
-
-// StatsAggregatorHandleContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StatsAggregatorHandleContainer(value string) StatsAggregatorHandleAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
+// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
+type ResizeBilinearGradAttr func(optionalAttr)
 
-// StatsAggregatorHandleSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StatsAggregatorHandleSharedName(value string) StatsAggregatorHandleAttr {
+// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of grads and original_image. If false, rescale by
+// orig_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Creates a statistics manager resource.
-func StatsAggregatorHandle(scope *Scope, optional ...StatsAggregatorHandleAttr) (handle tf.Output) {
+// Computes the gradient of bilinear interpolation.
+//
+// Arguments:
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
+// The image tensor that was resized.
+//
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
+// Gradients with respect to the input image. Input image must have been
+// float or double.
+func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6192,109 +6037,82 @@ func StatsAggregatorHandle(scope *Scope, optional ...StatsAggregatorHandleAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatsAggregatorHandle",
-
+		Type: "ResizeBilinearGrad",
+		Input: []tf.Input{
+			grads, original_image,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes.
-type CropAndResizeGradBoxesAttr func(optionalAttr)
-
-// CropAndResizeGradBoxesMethod sets the optional method attribute to value.
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
 //
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeGradBoxesMethod(value string) CropAndResizeGradBoxesAttr {
-	return func(m optionalAttr) {
-		m["method"] = value
-	}
-}
-
-// Computes the gradient of the crop_and_resize op wrt the input boxes tensor.
+// N is the size of the segment being reduced.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
 // Arguments:
-//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-// Both `image_height` and `image_width` need to be positive.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
 //
-// Returns A 2-D tensor of shape `[num_boxes, 4]`.
-func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxes tf.Output, box_ind tf.Output, optional ...CropAndResizeGradBoxesAttr) (output tf.Output) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResizeGradBoxes",
+		Type: "SparseSegmentSqrtN",
 		Input: []tf.Input{
-			grads, image, boxes, box_ind,
+			data, indices, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ShuffleDatasetAttr is an optional argument to ShuffleDataset.
-type ShuffleDatasetAttr func(optionalAttr)
+// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
+type StatelessTruncatedNormalAttr func(optionalAttr)
 
-// ShuffleDatasetReshuffleEachIteration sets the optional reshuffle_each_iteration attribute to value.
+// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
 //
-// value: If true, each iterator over this dataset will be given
-// a different pseudorandomly generated seed, based on a sequence seeded by the
-// `seed` and `seed2` inputs. If false, each iterator will be given the same
-// seed, and repeated iteration over this dataset will yield the exact same
-// sequence of results.
-// If not specified, defaults to true
-func ShuffleDatasetReshuffleEachIteration(value bool) ShuffleDatasetAttr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["reshuffle_each_iteration"] = value
+		m["dtype"] = value
 	}
 }
 
-// Creates a dataset that shuffles elements from `input_dataset` pseudorandomly.
+// Outputs deterministic pseudorandom values from a truncated normal distribution.
 //
-// Arguments:
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
 //
-//	buffer_size: The number of output elements to buffer in an iterator over
-// this dataset. Compare with the `min_after_dequeue` attr when creating a
-// `RandomShuffleQueue`.
-//	seed: A scalar seed for the random number generator. If either seed or
-// seed2 is set to be non-zero, the random number generator is seeded
-// by the given seed.  Otherwise, a random seed is used.
-//	seed2: A second scalar seed to avoid seed collision.
+// The outputs are a deterministic function of `shape` and `seed`.
 //
+// Arguments:
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ShuffleDatasetAttr) (handle tf.Output) {
+// Returns Random values with specified shape.
+func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ShuffleDataset",
+		Type: "StatelessTruncatedNormal",
 		Input: []tf.Input{
-			input_dataset, buffer_size, seed, seed2,
+			shape, seed,
 		},
 		Attrs: attrs,
 	}
@@ -6302,54 +6120,51 @@ func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output
 	return op.Output(0)
 }
 
-// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
-type CropAndResizeGradImageAttr func(optionalAttr)
+// RestoreSliceAttr is an optional argument to RestoreSlice.
+type RestoreSliceAttr func(optionalAttr)
 
-// CropAndResizeGradImageMethod sets the optional method attribute to value.
+// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
 //
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
+// value: Index of file to open first if multiple files match
+// `file_pattern`. See the documentation for `Restore`.
+// If not specified, defaults to -1
+func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
 	return func(m optionalAttr) {
-		m["method"] = value
+		m["preferred_shard"] = value
 	}
 }
 
-// Computes the gradient of the crop_and_resize op wrt the input image tensor.
+// Restores a tensor from checkpoint files.
 //
-// Arguments:
-//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
-// containing the original image size. Both `image_height` and `image_width` need
-// to be positive.
+// This is like `Restore` except that restored tensor can be listed as filling
+// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
+// larger tensor and the slice that the restored tensor covers.
 //
+// The `shape_and_slice` input has the same format as the
+// elements of the `shapes_and_slices` input of the `SaveSlices` op.
 //
-// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
+// Arguments:
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	shape_and_slice: Scalar. The shapes and slice specifications to use when
+// restoring a tensors.
+//	dt: The type of the tensor to be restored.
+//
+// Returns The restored tensor.
+func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
+	attrs := map[string]interface{}{"dt": dt}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResizeGradImage",
+		Type: "RestoreSlice",
 		Input: []tf.Input{
-			grads, boxes, box_ind, image_size,
+			file_pattern, tensor_name, shape_and_slice,
 		},
 		Attrs: attrs,
 	}
@@ -6357,96 +6172,42 @@ func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_
 	return op.Output(0)
 }
 
-// A container for an iterator resource.
-//
-// Returns A handle to the iterator that can be passed to a "MakeIterator"
-// or "IteratorGetNext" op.
-func Iterator(scope *Scope, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "Iterator",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ExtractGlimpseAttr is an optional argument to ExtractGlimpse.
-type ExtractGlimpseAttr func(optionalAttr)
-
-// ExtractGlimpseCentered sets the optional centered attribute to value.
-//
-// value: indicates if the offset coordinates are centered relative to
-// the image, in which case the (0, 0) offset is relative to the center
-// of the input images. If false, the (0,0) offset corresponds to the
-// upper left corner of the input images.
-// If not specified, defaults to true
-func ExtractGlimpseCentered(value bool) ExtractGlimpseAttr {
-	return func(m optionalAttr) {
-		m["centered"] = value
-	}
-}
-
-// ExtractGlimpseNormalized sets the optional normalized attribute to value.
-//
-// value: indicates if the offset coordinates are normalized.
-// If not specified, defaults to true
-func ExtractGlimpseNormalized(value bool) ExtractGlimpseAttr {
-	return func(m optionalAttr) {
-		m["normalized"] = value
-	}
-}
+// UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
+type UniqueWithCountsAttr func(optionalAttr)
 
-// ExtractGlimpseUniformNoise sets the optional uniform_noise attribute to value.
-//
-// value: indicates if the noise should be generated using a
-// uniform distribution or a Gaussian distribution.
-// If not specified, defaults to true
-func ExtractGlimpseUniformNoise(value bool) ExtractGlimpseAttr {
+// UniqueWithCountsOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueWithCountsOutIdx(value tf.DataType) UniqueWithCountsAttr {
 	return func(m optionalAttr) {
-		m["uniform_noise"] = value
+		m["out_idx"] = value
 	}
 }
 
-// Extracts a glimpse from the input tensor.
+// Finds unique elements in a 1-D tensor.
 //
-// Returns a set of windows called glimpses extracted at location
-// `offsets` from the input tensor. If the windows only partially
-// overlaps the inputs, the non overlapping areas will be filled with
-// random noise.
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. Finally, it returns a third tensor `count` that
+// contains the count of each element of `y` in `x`. In other words:
 //
-// The result is a 4-D tensor of shape `[batch_size, glimpse_height,
-// glimpse_width, channels]`. The channels and batch dimensions are the
-// same as that of the input tensor. The height and width of the output
-// windows are specified in the `size` parameter.
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
 //
-// The argument `normalized` and `centered` controls how the windows are built:
+// For example:
 //
-// * If the coordinates are normalized but not centered, 0.0 and 1.0
-//   correspond to the minimum and maximum of each height and width
-//   dimension.
-// * If the coordinates are both normalized and centered, they range from
-//   -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
-//   left corner, the lower right corner is located at (1.0, 1.0) and the
-//   center is at (0, 0).
-// * If the coordinates are not normalized they are interpreted as
-//   numbers of pixels.
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx, count = unique_with_counts(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// count ==> [2, 1, 3, 1, 2]
+// ```
 //
 // Arguments:
-//	input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
-//	size: A 1-D tensor of 2 elements containing the size of the glimpses
-// to extract.  The glimpse height must be specified first, following
-// by the glimpse width.
-//	offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
-// the y, x locations of the center of each window.
+//	x: 1-D.
 //
-// Returns A tensor representing the glimpses `[batch_size,
-// glimpse_height, glimpse_width, channels]`.
-func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseAttr) (glimpse tf.Output) {
+// Returns 1-D.1-D.1-D.
+func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAttr) (y tf.Output, idx tf.Output, count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6455,142 +6216,41 @@ func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ExtractGlimpse",
+		Type: "UniqueWithCounts",
 		Input: []tf.Input{
-			input, size, offsets,
+			x,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SampleDistortedBoundingBoxV2Attr is an optional argument to SampleDistortedBoundingBoxV2.
-type SampleDistortedBoundingBoxV2Attr func(optionalAttr)
-
-// SampleDistortedBoundingBoxV2Seed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxV2Seed(value int64) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxV2Seed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxV2AspectRatioRange sets the optional aspect_ratio_range attribute to value.
-//
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value.
-//
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within in this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["area_range"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SampleDistortedBoundingBoxV2MaxAttempts sets the optional max_attempts attribute to value.
-//
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxV2MaxAttempts(value int64) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["max_attempts"] = value
-	}
-}
+// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
+type StatelessRandomNormalAttr func(optionalAttr)
 
-// SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+// StatelessRandomNormalDtype sets the optional dtype attribute to value.
 //
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
-// If not specified, defaults to false
-func SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxV2Attr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
 	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
+		m["dtype"] = value
 	}
 }
 
-// Generate a single randomly distorted bounding box for an image.
-//
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
-//
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
-//
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example,
-//
-// ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
-//
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.image_summary('images_with_box', image_with_box)
+// Outputs deterministic pseudorandom values from a normal distribution.
 //
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
-// ```
+// The generated values will have mean 0 and standard deviation 1.
 //
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
-//	min_object_covered: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, min_object_covered tf.Output, optional ...SampleDistortedBoundingBoxV2Attr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+// Returns Random values with specified shape.
+func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6599,189 +6259,202 @@ func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_b
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBoxV2",
+		Type: "StatelessRandomNormal",
 		Input: []tf.Input{
-			image_size, bounding_boxes, min_object_covered,
+			shape, seed,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Draw bounding boxes on a batch of images.
-//
-// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
-// boxes specified by the locations in `boxes`. The coordinates of the each
-// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example, if an image is 100 x 200 pixels (height x width) and the bounding
-// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
-// the bounding box will be `(40, 10)` to `(100, 50)` (in (x,y) coordinates).
+// Reshapes a quantized tensor as per the Reshape op.
 //
-// Parts of the bounding box may fall outside the image.
+// ```
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
-//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
-// boxes.
 //
-// Returns 4-D with the same shape as `images`. The batch of input images with
-// bounding boxes drawn on the images.
-func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
+//	shape: Defines the shape of the output tensor.
+//	input_min: The minimum value of the input.
+//	input_max: The maximum value of the input.
+//
+// Returns This value is copied from input_min.This value is copied from input_max.
+func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DrawBoundingBoxes",
+		Type: "QuantizedReshape",
 		Input: []tf.Input{
-			images, boxes,
+			tensor, shape, input_min, input_max,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Convert one or more images from HSV to RGB.
+// GatherAttr is an optional argument to Gather.
+type GatherAttr func(optionalAttr)
+
+// GatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func GatherValidateIndices(value bool) GatherAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Gather slices from `params` according to `indices`.
 //
-// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
 //
-// See `rgb_to_hsv` for a description of the HSV encoding.
+// ```python
+//     # Scalar indices
+//     output[:, ..., :] = params[indices, :, ... :]
 //
-// Arguments:
-//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
+//     # Vector indices
+//     output[i, :, ..., :] = params[indices[i], :, ... :]
 //
-// Returns `images` converted to RGB.
-func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
+//     # Higher rank indices
+//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
+// ```
+//
+// If `indices` is a permutation and `len(indices) == params.shape[0]` then
+// this operation will permute `params` accordingly.
+//
+// `validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in
+// `indices` are always validated to be within range. If assigned to GPU,
+// out-of-bound indices result in safe but unspecified behavior, which may include
+// raising an error.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
+// </div>
+func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...GatherAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "HSVToRGB",
+		Type: "Gather",
 		Input: []tf.Input{
-			images,
+			params, indices,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a list of tensors with the same shapes and contents as the input
-//
-// tensors.
-//
-// This op can be used to override the gradient for complicated functions. For
-// example, suppose y = f(x) and we wish to apply a custom function g for backprop
-// such that dx = g(dy). In Python,
-//
-// ```python
-// with tf.get_default_graph().gradient_override_map(
-//     {'IdentityN': 'OverrideGradientWithG'}):
-//   y, _ = identity_n([f(x), x])
+// Returns the truth value of (x != y) element-wise.
 //
-// @tf.RegisterGradient('OverrideGradientWithG')
-// def ApplyG(op, dy, _):
-//   return [None, g(dy)]  # Do not backprop to f(x).
-// ```
-func IdentityN(scope *Scope, input []tf.Output) (output []tf.Output) {
+// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IdentityN",
+		Type: "NotEqual",
 		Input: []tf.Input{
-			tf.OutputList(input),
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("IdentityN", err)
-		return
-	}
-	return output
+	return op.Output(0)
 }
 
-// Decode the first frame of a GIF-encoded image to a uint8 tensor.
+// Inverse 3D real-valued fast Fourier transform.
 //
-// GIF with frame or transparency compression are not supported
-// convert animated GIF from compressed to uncompressed by:
+// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 3 dimensions of `input`.
 //
-//     convert $src.gif -coalesce $dst.gif
+// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
 //
-// This op also supports decoding JPEGs and PNGs, though it is cleaner to use
-// `tf.image.decode_image`.
+// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	contents: 0-D.  The GIF-encoded image.
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
 //
-// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB order
-func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
+// Returns A float32 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 3D real Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.irfftn with 3 dimensions.
+// @end_compatibility
+func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeGif",
+		Type: "IRFFT3D",
 		Input: []tf.Input{
-			contents,
+			input, fft_length,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodePngAttr is an optional argument to DecodePng.
-type DecodePngAttr func(optionalAttr)
+// StringSplitAttr is an optional argument to StringSplit.
+type StringSplitAttr func(optionalAttr)
 
-// DecodePngChannels sets the optional channels attribute to value.
+// StringSplitSkipEmpty sets the optional skip_empty attribute to value.
 //
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodePngChannels(value int64) DecodePngAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// DecodePngDtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_UINT8
-func DecodePngDtype(value tf.DataType) DecodePngAttr {
+// value: A `bool`. If `True`, skip the empty strings from the result.
+// If not specified, defaults to true
+func StringSplitSkipEmpty(value bool) StringSplitAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["skip_empty"] = value
 	}
 }
 
-// Decode a PNG-encoded image to a uint8 or uint16 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
+// Split elements of `input` based on `delimiter` into a `SparseTensor`.
 //
-// Accepted values are:
+// Let N be the size of source (typically N will be the batch size). Split each
+// element of `input` based on `delimiter` and return a `SparseTensor`
+// containing the splitted tokens. Empty tokens are ignored.
 //
-// *   0: Use the number of channels in the PNG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-// *   4: output an RGBA image.
+// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
+//  empty string, each element of `input` is split into individual single-byte
+//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
+//  every character of `delimiter` is a potential split point.
 //
-// If needed, the PNG-encoded image is transformed to match the requested number
-// of color channels.
+// For example:
+//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
+//   will be
 //
-// This op also supports decoding JPEGs and non-animated GIFs since the interface
-// is the same, though it is cleaner to use `tf.image.decode_image`.
+//   indices = [0, 0;
+//              0, 1;
+//              1, 0;
+//              1, 1;
+//              1, 2]
+//   shape = [2, 3]
+//   values = ['hello', 'world', 'a', 'b', 'c']
 //
 // Arguments:
-//	contents: 0-D.  The PNG-encoded image.
+//	input: 1-D. Strings to split.
+//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
 //
-// Returns 3-D with shape `[height, width, channels]`.
-func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) {
+// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
+// tensor, where the first value is N and the second value is the maximum number
+// of tokens in a single input entry.
+func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ...StringSplitAttr) (indices tf.Output, values tf.Output, shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6790,152 +6463,143 @@ func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (ima
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodePng",
+		Type: "StringSplit",
 		Input: []tf.Input{
-			contents,
+			input, delimiter,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Adjust the contrast of one or more images.
+// WriteAudioSummaryAttr is an optional argument to WriteAudioSummary.
+type WriteAudioSummaryAttr func(optionalAttr)
+
+// WriteAudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
 //
-// `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
-// interpreted as `[height, width, channels]`.  The other dimensions only
-// represent a collection of images, such as `[batch, height, width, channels].`
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
 //
-// Contrast is adjusted independently for each channel of each image.
+// REQUIRES: value >= 1
+func WriteAudioSummaryMaxOutputs(value int64) WriteAudioSummaryAttr {
+	return func(m optionalAttr) {
+		m["max_outputs"] = value
+	}
+}
+
+// Writes a `Summary` protocol buffer with audio.
 //
-// For each channel, the Op first computes the mean of the image pixels in the
-// channel and then adjusts each component of each pixel to
-// `(x - mean) * contrast_factor + mean`.
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
 //
 // Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	contrast_factor: A float multiplier for adjusting contrast.
+//	writer: A handle to a summary writer.
+//	step: The step to write the summary for.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
 //
-// Returns The contrast-adjusted image or images.
-func AdjustContrastv2(scope *Scope, images tf.Output, contrast_factor tf.Output) (output tf.Output) {
+// Returns the created operation.
+func WriteAudioSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...WriteAudioSummaryAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "AdjustContrastv2",
+		Type: "WriteAudioSummary",
 		Input: []tf.Input{
-			images, contrast_factor,
+			writer, step, tag, tensor, sample_rate,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// PaddingFIFOQueueV2Attr is an optional argument to PaddingFIFOQueueV2.
-type PaddingFIFOQueueV2Attr func(optionalAttr)
-
-// PaddingFIFOQueueV2Shapes sets the optional shapes attribute to value.
-//
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types.
-// Shapes of fixed rank but variable size are allowed by setting
-// any shape dimension to -1.  In this case, the inputs' shape may vary along
-// the given dimension, and DequeueMany will pad the given dimension with
-// zeros up to the maximum shape of all elements in the given batch.
-// If the length of this attr is 0, different queue elements may have
-// different ranks and shapes, but only one element may be dequeued at a time.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func PaddingFIFOQueueV2Shapes(value []tf.Shape) PaddingFIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shapes"] = value
-	}
-}
-
-// PaddingFIFOQueueV2Capacity sets the optional capacity attribute to value.
-//
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func PaddingFIFOQueueV2Capacity(value int64) PaddingFIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
+	return scope.AddOperation(opspec)
 }
 
-// PaddingFIFOQueueV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func PaddingFIFOQueueV2Container(value string) PaddingFIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
+// ProdAttr is an optional argument to Prod.
+type ProdAttr func(optionalAttr)
 
-// PaddingFIFOQueueV2SharedName sets the optional shared_name attribute to value.
+// ProdKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func PaddingFIFOQueueV2SharedName(value string) PaddingFIFOQueueV2Attr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func ProdKeepDims(value bool) ProdAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// A queue that produces elements in first-in first-out order.
+// Computes the product of elements across dimensions of a tensor.
 //
-// Variable-size shapes are allowed by setting the corresponding shape dimensions
-// to 0 in the shape attr.  In this case DequeueMany will pad up to the maximum
-// size of any given element in the minibatch.  See below for details.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	component_types: The type of each component in a value.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns The handle to the queue.
-func PaddingFIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...PaddingFIFOQueueV2Attr) (handle tf.Output) {
+// Returns The reduced tensor.
+func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PaddingFIFOQueueV2",
-
+		Type: "Prod",
+		Input: []tf.Input{
+			input, axis,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ExtractJpegShapeAttr is an optional argument to ExtractJpegShape.
-type ExtractJpegShapeAttr func(optionalAttr)
+// ResizeBilinearAttr is an optional argument to ResizeBilinear.
+type ResizeBilinearAttr func(optionalAttr)
 
-// ExtractJpegShapeOutputType sets the optional output_type attribute to value.
+// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// value: (Optional) The output type of the operation (int32 or int64).
-// Defaults to int32.
-// If not specified, defaults to DT_INT32
-func ExtractJpegShapeOutputType(value tf.DataType) ExtractJpegShapeAttr {
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
 	return func(m optionalAttr) {
-		m["output_type"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Extract the shape information of a JPEG-encoded image.
+// Resize `images` to `size` using bilinear interpolation.
 //
-// This op only parses the image header, so it is much faster than DecodeJpeg.
+// Input images can be of different types but output images are always float.
 //
 // Arguments:
-//	contents: 0-D. The JPEG-encoded image.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns 1-D. The image shape with format [height, width, channels].
-func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegShapeAttr) (image_shape tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6944,9 +6608,9 @@ func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegS
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ExtractJpegShape",
+		Type: "ResizeBilinear",
 		Input: []tf.Input{
-			contents,
+			images, size,
 		},
 		Attrs: attrs,
 	}
@@ -6954,146 +6618,189 @@ func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegS
 	return op.Output(0)
 }
 
-// DecodeJpegAttr is an optional argument to DecodeJpeg.
-type DecodeJpegAttr func(optionalAttr)
-
-// DecodeJpegChannels sets the optional channels attribute to value.
-//
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeJpegChannels(value int64) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// DecodeJpegRatio sets the optional ratio attribute to value.
-//
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeJpegRatio(value int64) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["ratio"] = value
+// Computes softsign: `features / (abs(features) + 1)`.
+func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
-//
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
-// If not specified, defaults to true
-func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
+	opspec := tf.OpSpec{
+		Type: "Softsign",
+		Input: []tf.Input{
+			features,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
-//
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
-	}
-}
+// GenerateVocabRemappingAttr is an optional argument to GenerateVocabRemapping.
+type GenerateVocabRemappingAttr func(optionalAttr)
 
-// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+// GenerateVocabRemappingOldVocabSize sets the optional old_vocab_size attribute to value.
 //
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
-	}
-}
-
-// DecodeJpegDctMethod sets the optional dct_method attribute to value.
+// value: Number of entries in the old vocab file to consider.  If -1,
+// use the entire old vocabulary.
+// If not specified, defaults to -1
 //
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
-// If not specified, defaults to ""
-func DecodeJpegDctMethod(value string) DecodeJpegAttr {
+// REQUIRES: value >= -1
+func GenerateVocabRemappingOldVocabSize(value int64) GenerateVocabRemappingAttr {
 	return func(m optionalAttr) {
-		m["dct_method"] = value
+		m["old_vocab_size"] = value
 	}
 }
 
-// Decode a JPEG-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
+// Given a path to new and old vocabulary files, returns a remapping Tensor of
 //
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
+// length `num_new_vocab`, where `remapping[i]` contains the row number in the old
+// vocabulary that corresponds to row `i` in the new vocabulary (starting at line
+// `new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
+// in the new vocabulary is not in the old vocabulary.  The old vocabulary is
+// constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
+// default value of -1.
 //
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
+// `num_vocab_offset` enables
+// use in the partitioned variable case, and should generally be set through
+// examining partitioning info.  The format of the files should be a text file,
+// with each line containing a single entity within the vocabulary.
 //
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
+// For example, with `new_vocab_file` a text file containing each of the following
+// elements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],
+// `num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be
+// `[0, -1, 2]`.
 //
+// The op also returns a count of how many entries in the new vocabulary
+// were present in the old vocabulary, which is used to calculate the number of
+// values to initialize in a weight matrix remapping
 //
-// This op also supports decoding PNGs and non-animated GIFs since the interface is
-// the same, though it is cleaner to use `tf.image.decode_image`.
+// This functionality can be used to remap both row vocabularies (typically,
+// features) and column vocabularies (typically, classes) from TensorFlow
+// checkpoints.  Note that the partitioning logic relies on contiguous vocabularies
+// corresponding to div-partitioned variables.  Moreover, the underlying remapping
+// uses an IndexTable (as opposed to an inexact CuckooTable), so client code should
+// use the corresponding index_table_from_file() as the FeatureColumn framework
+// does (as opposed to tf.feature_to_id(), which uses a CuckooTable).
 //
 // Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
+//	new_vocab_file: Path to the new vocab file.
+//	old_vocab_file: Path to the old vocab file.
+//	new_vocab_offset: How many entries into the new vocab file to start reading.
+//	num_new_vocab: Number of entries in the new vocab file to remap.
 //
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
+// Returns A Tensor of length num_new_vocab where the element at index i
+// is equal to the old ID that maps to the new ID i.  This element is -1 for any
+// new ID that is not found in the old vocabulary.Number of new vocab entries found in old vocab.
+func GenerateVocabRemapping(scope *Scope, new_vocab_file tf.Output, old_vocab_file tf.Output, new_vocab_offset int64, num_new_vocab int64, optional ...GenerateVocabRemappingAttr) (remapping tf.Output, num_present tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"new_vocab_offset": new_vocab_offset, "num_new_vocab": num_new_vocab}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeJpeg",
+		Type: "GenerateVocabRemapping",
 		Input: []tf.Input{
-			contents,
+			new_vocab_file, old_vocab_file,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad.
-type ResizeNearestNeighborGradAttr func(optionalAttr)
+// Assigns sparse updates to the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] = updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterUpdate",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
 
-// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
+// CumsumAttr is an optional argument to Cumsum.
+type CumsumAttr func(optionalAttr)
+
+// CumsumExclusive sets the optional exclusive attribute to value.
 //
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
+// value: If `True`, perform exclusive cumsum.
 // If not specified, defaults to false
-func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
+func CumsumExclusive(value bool) CumsumAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["exclusive"] = value
 	}
 }
 
-// Computes the gradient of nearest neighbor interpolation.
+// CumsumReverse sets the optional reverse attribute to value.
+//
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumsumReverse(value bool) CumsumAttr {
+	return func(m optionalAttr) {
+		m["reverse"] = value
+	}
+}
+
+// Compute the cumulative sum of the tensor `x` along `axis`.
+//
+// By default, this op performs an inclusive cumsum, which means that the first
+// element of the input is identical to the first element of the output:
+//
+// ```python
+// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
+// ```
+//
+// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
+// performed instead:
+//
+// ```python
+// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
+// ```
+//
+// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
+// opposite direction:
+//
+// ```python
+// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
+// ```
+//
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+//
+// ```python
+// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
+// ```
 //
 // Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
-// original input size.
-//
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
-// with respect to the input image.
-func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) {
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7102,9 +6809,9 @@ func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, op
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeNearestNeighborGrad",
+		Type: "Cumsum",
 		Input: []tf.Input{
-			grads, size,
+			x, axis,
 		},
 		Attrs: attrs,
 	}
@@ -7112,31 +6819,26 @@ func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, op
 	return op.Output(0)
 }
 
-// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor.
-type ResizeNearestNeighborAttr func(optionalAttr)
+// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
+type QuantizedRelu6Attr func(optionalAttr)
 
-// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
+// QuantizedRelu6OutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["out_type"] = value
 	}
 }
 
-// Resize `images` to `size` using nearest neighbor interpolation.
+// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) {
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7145,95 +6847,99 @@ func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optio
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeNearestNeighbor",
+		Type: "QuantizedRelu6",
 		Input: []tf.Input{
-			images, size,
+			features, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResizeBicubicGradAttr is an optional argument to ResizeBicubicGrad.
-type ResizeBicubicGradAttr func(optionalAttr)
+// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
+type FixedLengthRecordReaderV2Attr func(optionalAttr)
 
-// ResizeBicubicGradAlignCorners sets the optional align_corners attribute to value.
+// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
 //
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeBicubicGradAlignCorners(value bool) ResizeBicubicGradAttr {
+// value: Number of bytes in the header, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["header_bytes"] = value
 	}
 }
 
-// Computes the gradient of bicubic interpolation.
-//
-// Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-// The image tensor that was resized.
+// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
-// Gradients with respect to the input image. Input image must have been
-// float or double.
-func ResizeBicubicGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBicubicGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+// value: Number of bytes in the footer, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["footer_bytes"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "ResizeBicubicGrad",
-		Input: []tf.Input{
-			grads, original_image,
-		},
-		Attrs: attrs,
+}
+
+// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
+//
+// value: Number of bytes to hop before each read. Default of 0 means using
+// record_bytes.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["hop_bytes"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// SummaryWriterAttr is an optional argument to SummaryWriter.
-type SummaryWriterAttr func(optionalAttr)
+// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
 
-// SummaryWriterSharedName sets the optional shared_name attribute to value.
+// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
 // If not specified, defaults to ""
-func SummaryWriterSharedName(value string) SummaryWriterAttr {
+func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// SummaryWriterContainer sets the optional container attribute to value.
+// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
+//
+// value: The type of encoding for the file. Currently ZLIB and GZIP
+// are supported. Defaults to none.
 // If not specified, defaults to ""
-func SummaryWriterContainer(value string) SummaryWriterAttr {
+func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["encoding"] = value
 	}
 }
 
-// Returns a handle to be used to access a summary writer.
+// A Reader that outputs fixed-length records from a file.
 //
-// The summary writer is an in-graph resource which can be used by ops to write
-// summaries to event files.
+// Arguments:
+//	record_bytes: Number of bytes in the record.
 //
-// Returns the summary writer resource. Scalar handle.
-func SummaryWriter(scope *Scope, optional ...SummaryWriterAttr) (writer tf.Output) {
+// Returns The handle to reference the Reader.
+func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"record_bytes": record_bytes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SummaryWriter",
+		Type: "FixedLengthRecordReaderV2",
 
 		Attrs: attrs,
 	}
@@ -7241,181 +6947,174 @@ func SummaryWriter(scope *Scope, optional ...SummaryWriterAttr) (writer tf.Outpu
 	return op.Output(0)
 }
 
-// Returns the set of files matching one or more glob patterns.
+// The gradient operator for the SparseAdd op.
 //
-// Note that this routine only supports wildcard characters in the
-// basename portion of the pattern, not in the directory portion.
+// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
+// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
+// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
+// values of A and B.
 //
 // Arguments:
-//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
+//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
+// the non-empty values of the sum.
+//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
+//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
+//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
+// `[nnz(sum), ndims]`.
 //
-// Returns A vector of matching filenames.
-func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
+// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
+// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
+// non-empty values of B.
+func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatchingFiles",
+		Type: "SparseAddGrad",
 		Input: []tf.Input{
-			pattern,
+			backprop_val_grad, a_indices, b_indices, sum_indices,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Store the input tensor in the state of the current session.
-//
-// Arguments:
-//	value: The tensor to be stored.
-//
-// Returns The handle for the tensor stored in the session state, represented
-// as a ResourceHandle object.
-func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
+// Computes atan of x element-wise.
+func Atan(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GetSessionHandleV2",
+		Type: "Atan",
 		Input: []tf.Input{
-			value,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Adjust the hue of one or more images.
+// Encode audio data using the WAV file format.
 //
-// `images` is a tensor of at least 3 dimensions.  The last dimension is
-// interpretted as channels, and must be three.
+// This operation will generate a string suitable to be saved out to create a .wav
+// audio file. It will be encoded in the 16-bit PCM format. It takes in float
+// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
+// that range.
 //
-// The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A delta is then applied all the hue values,
-// and then remapped back to RGB colorspace.
+// `audio` is a 2-D float Tensor of shape `[length, channels]`.
+// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
 //
 // Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	delta: A float delta to add to the hue.
+//	audio: 2-D with shape `[length, channels]`.
+//	sample_rate: Scalar containing the sample frequency.
 //
-// Returns The hue-adjusted image or images.
-func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
+// Returns 0-D. WAV-encoded file contents.
+func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustHue",
+		Type: "EncodeWav",
 		Input: []tf.Input{
-			images, delta,
+			audio, sample_rate,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Restore a Reader to its initial clean state.
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// Returns the created operation.
-func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderResetV2",
-		Input: []tf.Input{
-			reader_handle,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Returns up to `num_records` (key, value) pairs produced by a Reader.
+// The hash function is deterministic on the content of the string within the
+// process. The hash function is a keyed hash function, where attribute `key`
+// defines the key of the hash function. `key` is an array of 2 elements.
 //
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
-// It may return less than `num_records` even before the last batch.
+// A strong hash is important when inputs may be malicious, e.g. URLs with
+// additional components. Adversaries could try to make their inputs hash to the
+// same bucket for a denial-of-service attack or to skew the results. A strong
+// hash prevents this by making it difficult, if not infeasible, to compute inputs
+// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
+// time than `tf.string_to_hash_bucket_fast`.
 //
 // Arguments:
-//	reader_handle: Handle to a `Reader`.
-//	queue_handle: Handle to a `Queue`, with string work items.
-//	num_records: number of records to read from `Reader`.
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
+//	key: The key for the keyed hash function passed as a list of two uint64
+// elements.
 //
-// Returns A 1-D tensor.A 1-D tensor.
-func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
 	opspec := tf.OpSpec{
-		Type: "ReaderReadUpToV2",
+		Type: "StringToHashBucketStrong",
 		Input: []tf.Input{
-			reader_handle, queue_handle, num_records,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Returns the next record (key, value pair) produced by a Reader.
+// Generates values in an interval.
 //
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
+// A sequence of `num` evenly-spaced values are generated beginning at `start`.
+// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
+// so that the last one is exactly `stop`.
+//
+// For example:
+//
+// ```
+// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
+// ```
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-//	queue_handle: Handle to a Queue, with string work items.
+//	start: First entry in the range.
+//	stop: Last entry in the range.
+//	num: Number of values to generate.
 //
-// Returns A scalar.A scalar.
-func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
+// Returns 1-D. The generated values.
+func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderReadV2",
+		Type: "LinSpace",
 		Input: []tf.Input{
-			reader_handle, queue_handle,
+			start, stop, num,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
-type IdentityReaderV2Attr func(optionalAttr)
+// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
+type DestroyResourceOpAttr func(optionalAttr)
 
-// IdentityReaderV2Container sets the optional container attribute to value.
+// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
+// value: whether to ignore the error when the resource
+// doesn't exist.
+// If not specified, defaults to true
+func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["ignore_lookup_error"] = value
 	}
 }
 
-// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
+// Deletes the resource specified by the handle.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A Reader that outputs the queued work as both the key and value.
+// All subsequent operations using the resource will result in a NotFound
+// error status.
 //
-// To use, enqueue strings in a Queue.  ReaderRead will take the front
-// work string and output (work, work).
+// Arguments:
+//	resource: handle to the resource to delete.
 //
-// Returns The handle to reference the Reader.
-func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
+// Returns the created operation.
+func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7424,51 +7123,76 @@ func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_ha
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "IdentityReaderV2",
-
+		Type: "DestroyResourceOp",
+		Input: []tf.Input{
+			resource,
+		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
-type TFRecordReaderV2Attr func(optionalAttr)
+// CumprodAttr is an optional argument to Cumprod.
+type CumprodAttr func(optionalAttr)
 
-// TFRecordReaderV2Container sets the optional container attribute to value.
+// CumprodExclusive sets the optional exclusive attribute to value.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
+// value: If `True`, perform exclusive cumprod.
+// If not specified, defaults to false
+func CumprodExclusive(value bool) CumprodAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["exclusive"] = value
 	}
 }
 
-// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
+// CumprodReverse sets the optional reverse attribute to value.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
-// If not specified, defaults to ""
-func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumprodReverse(value bool) CumprodAttr {
 	return func(m optionalAttr) {
-		m["compression_type"] = value
+		m["reverse"] = value
 	}
 }
 
-// A Reader that outputs the records from a TensorFlow Records file.
+// Compute the cumulative product of the tensor `x` along `axis`.
 //
-// Returns The handle to reference the Reader.
-func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
+// By default, this op performs an inclusive cumprod, which means that the first
+// element of the input is identical to the first element of the output:
+//
+// ```python
+// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
+// ```
+//
+// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
+// performed instead:
+//
+// ```python
+// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
+// ```
+//
+// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
+// opposite direction:
+//
+// ```python
+// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
+// ```
+//
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+//
+// ```python
+// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7477,53 +7201,101 @@ func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_ha
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TFRecordReaderV2",
-
+		Type: "Cumprod",
+		Input: []tf.Input{
+			x, axis,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TextLineReaderV2Attr is an optional argument to TextLineReaderV2.
-type TextLineReaderV2Attr func(optionalAttr)
-
-// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value.
+// Computes the mean along segments of a tensor.
 //
-// value: Number of lines to skip from the beginning of every file.
-// If not specified, defaults to 0
-func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr {
-	return func(m optionalAttr) {
-		m["skip_header_lines"] = value
-	}
-}
-
-// TextLineReaderV2Container sets the optional container attribute to value.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func TextLineReaderV2Container(value string) TextLineReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Computes a tensor such that
+// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
+// over `j` such that `segment_ids[j] == i` and `N` is the total number of
+// values summed.
+//
+// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SegmentMean",
+		Input: []tf.Input{
+			data, segment_ids,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// TextLineReaderV2SharedName sets the optional shared_name attribute to value.
+// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
+type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
+
+// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr {
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["use_locking"] = value
 	}
 }
 
-// A Reader that outputs the lines of a file delimited by '\n'.
+// Update '*var' according to the centered RMSProp algorithm.
+//
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
+//
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
 //
-// Returns The handle to reference the Reader.
-func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) {
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
+//
+// Returns the created operation.
+func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7532,193 +7304,213 @@ func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_ha
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TextLineReaderV2",
-
+		Type: "ResourceSparseApplyCenteredRMSProp",
+		Input: []tf.Input{
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Generate a glob pattern matching all sharded file names.
-func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
+// Creates a dataset that batches `batch_size` elements from `input_dataset`.
+//
+// Arguments:
+//
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//
+//
+func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ShardedFilespec",
+		Type: "BatchDataset",
 		Input: []tf.Input{
-			basename, num_shards,
+			input_dataset, batch_size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Delete the stack from its resource container.
+// Inverse fast Fourier transform.
+//
+// Computes the inverse 1-dimensional discrete Fourier transform over the
+// inner-most dimension of `input`.
 //
 // Arguments:
-//	handle: The handle to a stack.
+//	input: A complex64 tensor.
 //
-// Returns the created operation.
-func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
+// Returns A complex64 tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its inverse 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft
+// @end_compatibility
+func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "StackCloseV2",
+		Type: "IFFT",
 		Input: []tf.Input{
-			handle,
+			input,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Generate a sharded filename. The filename is printf formatted as
+// LRNAttr is an optional argument to LRN.
+type LRNAttr func(optionalAttr)
+
+// LRNDepthRadius sets the optional depth_radius attribute to value.
 //
-//    %s-%05d-of-%05d, basename, shard, num_shards.
-func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ShardedFilename",
-		Input: []tf.Input{
-			basename, shard, num_shards,
-		},
+// value: 0-D.  Half-width of the 1-D normalization window.
+// If not specified, defaults to 5
+func LRNDepthRadius(value int64) LRNAttr {
+	return func(m optionalAttr) {
+		m["depth_radius"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Saves input tensors slices to disk.
+// LRNBias sets the optional bias attribute to value.
 //
-// This is like `Save` except that tensors can be listed in the saved file as being
-// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
-// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
-// have as many elements as `tensor_names`.
+// value: An offset (usually positive to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNBias(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["bias"] = value
+	}
+}
+
+// LRNAlpha sets the optional alpha attribute to value.
 //
-// Elements of the `shapes_and_slices` input must either be:
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNAlpha(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// LRNBeta sets the optional beta attribute to value.
 //
-// *  The empty string, in which case the corresponding tensor is
-//    saved normally.
-// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
-//    `dimI` are the dimensions of the larger tensor and `slice-spec`
-//    specifies what part is covered by the tensor to save.
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNBeta(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
+	}
+}
+
+// Local Response Normalization.
 //
-// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
-// where each `sliceI` is either:
+// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
+// dimension), and each vector is normalized independently.  Within a given vector,
+// each component is divided by the weighted, squared sum of inputs within
+// `depth_radius`.  In detail,
 //
-// *  The string `-` meaning that the slice covers all indices of this dimension
-// *  `start,length` where `start` and `length` are integers.  In that
-//    case the slice covers `length` indices starting at `start`.
+//     sqr_sum[a, b, c, d] =
+//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
+//     output = input / (bias + alpha * sqr_sum) ** beta
 //
-// See also `Save`.
+// For details, see [Krizhevsky et al., ImageNet classification with deep
+// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
 //
 // Arguments:
-//	filename: Must have a single element. The name of the file to which we write the
-// tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
-// saving the tensors.
-//	data: `N` tensors to save.
-//
-// Returns the created operation.
-func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
+//	input: 4-D.
+func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SaveSlices",
+		Type: "LRN",
 		Input: []tf.Input{
-			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
+			input,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
-type MergeV2CheckpointsAttr func(optionalAttr)
-
-// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
-//
-// value: see above.
-// If not specified, defaults to true
-func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
-	return func(m optionalAttr) {
-		m["delete_old_dirs"] = value
+// Creates a dataset that zips together `input_datasets`.
+func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ZipDataset",
+		Input: []tf.Input{
+			tf.OutputList(input_datasets),
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// V2 format specific: merges the metadata files of sharded checkpoints.  The
-//
-// result is one logical checkpoint, with one physical metadata file and renamed
-// data files.
-//
-// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
-//
-// If delete_old_dirs is true, attempts to delete recursively the dirname of each
-// path in the input checkpoint_prefixes.  This is useful when those paths are non
-// user-facing temporary locations.
+// Writes a `GraphDef` protocol buffer to a `SummaryWriter`.
 //
 // Arguments:
-//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
-//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
-// as one of the checkpoint_prefixes.
+//	writer: Handle of `SummaryWriter`.
+//	step: The step to write the summary for.
+//	tensor: A scalar string of the serialized tf.GraphDef proto.
 //
 // Returns the created operation.
-func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
+func WriteGraphSummary(scope *Scope, writer tf.Output, step tf.Output, tensor tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MergeV2Checkpoints",
+		Type: "WriteGraphSummary",
 		Input: []tf.Input{
-			checkpoint_prefixes, destination_prefix,
+			writer, step, tensor,
 		},
-		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
-type QueueEnqueueManyV2Attr func(optionalAttr)
+// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
+type ResourceSparseApplyAdagradAttr func(optionalAttr)
 
-// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
+// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: If the queue is too full, this operation will block for up
-// to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Enqueues zero or more tuples of one or more tensors in the given queue.
-//
-// This operation slices each component tensor along the 0th dimension to
-// make multiple queue elements. All of the tuple components must have the
-// same size in the 0th dimension.
-//
-// The components input has k elements, which correspond to the components of
-// tuples stored in the given queue.
+// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
 //
-// N.B. If the queue is full, this operation will block until the given
-// elements have been enqueued (or 'timeout_ms' elapses, if specified).
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	components: One or more tensors from which the enqueued tensors should
-// be taken.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
 // Returns the created operation.
-func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
+func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7727,66 +7519,87 @@ func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueEnqueueManyV2",
+		Type: "ResourceSparseApplyAdagrad",
 		Input: []tf.Input{
-			handle, tf.OutputList(components),
+			var_, accum, lr, grad, indices,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// SvdAttr is an optional argument to Svd.
-type SvdAttr func(optionalAttr)
-
-// SvdComputeUv sets the optional compute_uv attribute to value.
+// 2D real-valued fast Fourier transform.
 //
-// value: If true, left and right singular vectors will be
-// computed and returned in `u` and `v`, respectively.
-// If false, `u` and `v` are not set and should never referenced.
-// If not specified, defaults to true
-func SvdComputeUv(value bool) SvdAttr {
-	return func(m optionalAttr) {
-		m["compute_uv"] = value
+// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 2 dimensions of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
+//
+// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft2
+// @end_compatibility
+func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RFFT2D",
+		Input: []tf.Input{
+			input, fft_length,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SvdFullMatrices sets the optional full_matrices attribute to value.
+// ResizeAreaAttr is an optional argument to ResizeArea.
+type ResizeAreaAttr func(optionalAttr)
+
+// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, compute full-sized `u` and `v`. If false
-// (the default), compute only the leading `P` singular vectors.
-// Ignored if `compute_uv` is `False`.
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
 // If not specified, defaults to false
-func SvdFullMatrices(value bool) SvdAttr {
+func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
 	return func(m optionalAttr) {
-		m["full_matrices"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Computes the singular value decompositions of one or more matrices.
+// Resize `images` to `size` using area interpolation.
 //
-// Computes the SVD of each inner matrix in `input` such that
-// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
+// Input images can be of different types but output images are always float.
 //
-// ```python
-// # a is a tensor containing a batch of matrices.
-// # s is a tensor of singular values for each matrix.
-// # u is the tensor containing of left singular vectors for each matrix.
-// # v is the tensor containing of right singular vectors for each matrix.
-// s, u, v = svd(a)
-// s, _, _ = svd(a, compute_uv=False)
-// ```
+// Each output pixel is computed by first transforming the pixel's footprint into
+// the input tensor and then averaging the pixels that intersect the footprint. An
+// input pixel's contribution to the average is weighted by the fraction of its
+// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
 //
 // Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
-// Undefined if `compute_uv` is false.
-func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7795,103 +7608,91 @@ func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Svd",
+		Type: "ResizeArea",
 		Input: []tf.Input{
-			input,
+			images, size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Converts one or more images from RGB to HSV.
+// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
+type StatelessRandomUniformAttr func(optionalAttr)
+
+// StatelessRandomUniformDtype sets the optional dtype attribute to value.
 //
-// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs deterministic pseudorandom random values from a uniform distribution.
 //
-// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
-// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
-// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+//
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns `images` converted to HSV.
-func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
+// Returns Random values with specified shape.
+func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RGBToHSV",
+		Type: "StatelessRandomUniform",
 		Input: []tf.Input{
-			images,
+			shape, seed,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
-type MatrixSolveLsAttr func(optionalAttr)
+// AngleAttr is an optional argument to Angle.
+type AngleAttr func(optionalAttr)
 
-// MatrixSolveLsFast sets the optional fast attribute to value.
-// If not specified, defaults to true
-func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
+// AngleTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func AngleTout(value tf.DataType) AngleAttr {
 	return func(m optionalAttr) {
-		m["fast"] = value
+		m["Tout"] = value
 	}
 }
 
-// Solves one or more linear least-squares problems.
-//
-// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
-// type as `matrix` and shape `[..., M, K]`.
-// The output is a tensor shape `[..., N, K]` where each output matrix solves
-// each of the equations
-// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
-// in the least squares sense.
-//
-// We use the following notation for (complex) matrix and right-hand sides
-// in the batch:
+// Returns the argument of a complex number.
 //
-// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
-// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
-// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
-// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the argument of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part.
 //
-// If `fast` is `True`, then the solution is computed by solving the normal
-// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
-// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
-// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
-// \lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
-// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
-// minimum-norm solution to the under-determined linear system, i.e.
-// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
-// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
-// when \\(A\\) is numerically full rank and has a condition number
-// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
-// sufficiently large.
+// The argument returned by this operation is of the form \\(atan2(b, a)\\).
 //
-// If `fast` is `False` an algorithm based on the numerically robust complete
-// orthogonal decomposition is used. This computes the minimum-norm
-// least-squares solution, even when \\(A\\) is rank deficient. This path is
-// typically 6-7 times slower than the fast path. If `fast` is `False` then
-// `l2_regularizer` is ignored.
+// For example:
 //
-// Arguments:
-//	matrix: Shape is `[..., M, N]`.
-//	rhs: Shape is `[..., M, K]`.
-//	l2_regularizer: Scalar tensor.
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.angle(input) ==> [2.0132, 1.056]
+// ```
 //
 // @compatibility(numpy)
-// Equivalent to np.linalg.lstsq
+// Equivalent to np.angle.
 // @end_compatibility
-//
-// Returns Shape is `[..., N, K]`.
-func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
+func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7900,9 +7701,9 @@ func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolveLs",
+		Type: "Angle",
 		Input: []tf.Input{
-			matrix, rhs, l2_regularizer,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -7910,329 +7711,422 @@ func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer
 	return op.Output(0)
 }
 
-// Adjust the saturation of one or more images.
-//
-// `images` is a tensor of at least 3 dimensions.  The last dimension is
-// interpretted as channels, and must be three.
-//
-// The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A scale is then applied all the saturation
-// values, and then remapped back to RGB colorspace.
-//
-// Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	scale: A float scale to add to the saturation.
+// VarHandleOpAttr is an optional argument to VarHandleOp.
+type VarHandleOpAttr func(optionalAttr)
+
+// VarHandleOpContainer sets the optional container attribute to value.
 //
-// Returns The hue-adjusted image or images.
-func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AdjustSaturation",
-		Input: []tf.Input{
-			images, scale,
-		},
+// value: the container this variable is placed in.
+// If not specified, defaults to ""
+func VarHandleOpContainer(value string) VarHandleOpAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2.
-type SelfAdjointEigV2Attr func(optionalAttr)
-
-// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value.
-//
-// value: If `True` then eigenvectors will be computed and returned in `v`.
-// Otherwise, only the eigenvalues will be computed.
-// If not specified, defaults to true
-func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
+// VarHandleOpSharedName sets the optional shared_name attribute to value.
+//
+// value: the name by which this variable is referred to.
+// If not specified, defaults to ""
+func VarHandleOpSharedName(value string) VarHandleOpAttr {
 	return func(m optionalAttr) {
-		m["compute_v"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Computes the eigen decomposition of one or more square self-adjoint matrices.
-//
-// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
-// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`.
-//
-// ```python
-// # a is a tensor.
-// # e is a tensor of eigenvalues.
-// # v is a tensor of eigenvectors.
-// e, v = self_adjoint_eig(a)
-// e = self_adjoint_eig(a, compute_v=False)
-// ```
+// Creates a handle to a Variable resource.
 //
 // Arguments:
-//	input: `Tensor` input of shape `[N, N]`.
-//
-// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`.
-func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) {
+//	dtype: the type of this variable. Must agree with the dtypes
+// of all ops using this variable.
+//	shape: The (possibly partially specified) shape of this variable.
+func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SelfAdjointEigV2",
-		Input: []tf.Input{
-			input,
-		},
+		Type: "VarHandleOp",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
-//
-// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices, with the same constraints as the single matrix
-// SelfAdjointEig.
-//
-// The result is a [..., M+1, M] matrix with [..., 0,:] containing the
-// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors.
-//
-// Arguments:
-//	input: Shape is `[..., M, M]`.
+// Elementwise computes the bitwise XOR of `x` and `y`.
 //
-// Returns Shape is `[..., M+1, M]`.
-func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
+// The result will have those bits set, that are different in `x` and `y`. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseXor(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SelfAdjointEig",
+		Type: "BitwiseXor",
 		Input: []tf.Input{
-			input,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Writes contents to the file at input filename. Creates file and recursively
+// Deserialize `SparseTensor` objects.
 //
-// creates directory if not existing.
+// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
+// the last dimension stores serialized `SparseTensor` objects and the other N
+// dimensions (N >= 0) correspond to a batch. The ranks of the original
+// `SparseTensor` objects must all match. When the final `SparseTensor` is
+// created, its rank is the rank of the incoming `SparseTensor` objects plus N;
+// the sparse tensors have been concatenated along new dimensions, one for each
+// batch.
 //
-// Arguments:
-//	filename: scalar. The name of the file to which we write the contents.
-//	contents: scalar. The content to be written to the output file.
+// The output `SparseTensor` object's shape values for the original dimensions
+// are the max across the input `SparseTensor` objects' shape values for the
+// corresponding dimensions. The new dimensions match the size of the batch.
 //
-// Returns the created operation.
-func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) {
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
+//
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+//
+// and
+//
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+//
+// then the final deserialized `SparseTensor` will be:
+//
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+//
+// Arguments:
+//	serialized_sparse: The serialized `SparseTensor` objects. The last dimension
+// must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "WriteFile",
+		Type: "DeserializeSparse",
 		Input: []tf.Input{
-			filename, contents,
+			serialized_sparse,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the Cholesky decomposition of one or more square matrices.
+// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
+type ResourceApplyRMSPropAttr func(optionalAttr)
+
+// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices.
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the RMSProp algorithm.
 //
-// The input has to be symmetric and positive definite. Only the lower-triangular
-// part of the input will be used for this operation. The upper-triangular part
-// will not be read.
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
 //
-// The output is a tensor of the same shape as the input
-// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
 //
-// **Note**: The gradient computation on GPU is faster for large matrices but
-// not for large batch dimensions when the submatrices are small. In this
-// case it might be faster to use the CPU.
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
 //
-// Returns Shape is `[..., M, M]`.
-func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Cholesky",
+		Type: "ResourceApplyRMSProp",
 		Input: []tf.Input{
-			input,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the matrix exponential of one or more square matrices:
-//
-// exp(A) = \sum_{n=0}^\infty A^n/n!
-//
-// The exponential is computed using a combination of the scaling and squaring
-// method and the Pade approximation. Details can be founds in:
-// Nicholas J. Higham, "The scaling and squaring method for the matrix exponential
-// revisited," SIAM J. Matrix Anal. Applic., 26:1179-1193, 2005.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the exponential for all input submatrices `[..., :, :]`.
+// SizeAttr is an optional argument to Size.
+type SizeAttr func(optionalAttr)
+
+// SizeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func SizeOutType(value tf.DataType) SizeAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Returns the size of a tensor.
 //
-// Arguments:
-//	input: Shape is `[..., M, M]`.
+// This operation returns an integer representing the number of elements in
+// `input`.
 //
-// Returns Shape is `[..., M, M]`.
+// For example:
 //
-// @compatibility(scipy)
-// Equivalent to scipy.linalg.expm
-// @end_compatibility
-func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
+// ```
+// # 't' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]
+// size(t) ==> 12
+// ```
+func Size(scope *Scope, input tf.Output, optional ...SizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MatrixExponential",
+		Type: "Size",
 		Input: []tf.Input{
 			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Merges summaries.
+// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
+type ResourceScatterNdUpdateAttr func(optionalAttr)
+
+// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value.
 //
-// This op creates a
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// protocol buffer that contains the union of all the values in the input
-// summaries.
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
+// If not specified, defaults to true
+func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Applies sparse `updates` to individual values or slices within a given
 //
-// When the Op is run, it reports an `InvalidArgument` error if multiple values
-// in the summaries to merge use the same tag.
+// variable according to `indices`.
+//
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+//
+// `indices` must be integer tensor, containing indices into `ref`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+//
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+//
+// ```
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+// ```
+//
+// For example, say we want to update 4 scattered elements to a rank-1 tensor to
+// 8 elements. In Python, that update would look like this:
+//
+// ```python
+//     ref = tfe.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+//     indices = tf.constant([[4], [3], [1] ,[7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     update = tf.scatter_nd_update(ref, indices, updates)
+//     with tf.Session() as sess:
+//       print sess.run(update)
+// ```
+//
+// The resulting update to ref would look like this:
+//
+//     [1, 11, 3, 10, 9, 6, 7, 12]
+//
+// See @{tf.scatter_nd} for more details about how to make updates to
+// slices.
 //
 // Arguments:
-//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
-// buffers.
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of updated
+// values to add to ref.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
+// Returns the created operation.
+func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MergeSummary",
+		Type: "ResourceScatterNdUpdate",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			ref, indices, updates,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
-type AudioSummaryV2Attr func(optionalAttr)
+// StageSizeAttr is an optional argument to StageSize.
+type StageSizeAttr func(optionalAttr)
 
-// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
-//
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
+// StageSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// REQUIRES: value >= 1
-func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
+// REQUIRES: value >= 0
+func StageSizeCapacity(value int64) StageSizeAttr {
 	return func(m optionalAttr) {
-		m["max_outputs"] = value
+		m["capacity"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with audio.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
-//
-// Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
+// StageSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
+// REQUIRES: value >= 0
+func StageSizeMemoryLimit(value int64) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// StageSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StageSizeContainer(value string) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// StageSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StageSizeSharedName(value string) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of elements in the underlying container.
+func StageSize(scope *Scope, dtypes []tf.DataType, optional ...StageSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AudioSummaryV2",
-		Input: []tf.Input{
-			tag, tensor, sample_rate,
-		},
+		Type: "StageSize",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2.
-type Conv3DBackpropInputV2Attr func(optionalAttr)
+// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
+type NonMaxSuppressionAttr func(optionalAttr)
 
-// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value.
+// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr {
+// value: A float representing the threshold for deciding whether boxes
+// overlap too much with respect to IOU.
+// If not specified, defaults to 0.5
+func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["iou_threshold"] = value
 	}
 }
 
-// Computes the gradients of 3-D convolution with respect to the input.
+// Greedily selects a subset of bounding boxes in descending order of score,
+//
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	input_sizes: An integer vector representing the tensor shape of `input`,
-// where `input` is a 5-D
-// `[batch, depth, rows, cols, in_channels]` tensor.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) {
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInputV2",
+		Type: "NonMaxSuppression",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			boxes, scores, max_output_size,
 		},
 		Attrs: attrs,
 	}
@@ -8240,559 +8134,507 @@ func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output
 	return op.Output(0)
 }
 
-// Returns a tensor of ones with the same shape and type as x.
-//
-// Arguments:
-//	x: a tensor of type T.
-//
-// Returns a tensor of the same shape and type as x but filled with ones.
-func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
+// Creates a dataset that emits `components` as a tuple of tensors once.
+func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "OnesLike",
+		Type: "TensorDataset",
 		Input: []tf.Input{
-			x,
+			tf.OutputList(components),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics in that
+// Component-wise multiplies a SparseTensor by a dense Tensor.
 //
-// the result here is consistent with a truncating divide. E.g.
-// `tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.
+// The output locations corresponding to the implicitly zero elements in the sparse
+// tensor will be zero (i.e., will not take up storage space), regardless of the
+// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
 //
-// *NOTE*: `Mod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
+//
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Mod",
+		Type: "SparseDenseCwiseMul",
 		Input: []tf.Input{
-			x, y,
+			sp_indices, sp_values, sp_shape, dense,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradients of 3-D convolution with respect to the filter.
+// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
+type ResourceSparseApplyFtrlAttr func(optionalAttr)
+
+// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
 //
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+//
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// accum_new = accum + grad * grad
+// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilter",
+		Type: "ResourceSparseApplyFtrl",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
-
-// Computes the gradients of 3-D convolution with respect to the input.
-//
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
-//
-// Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
+
+// Returns which elements of x are Inf.
+//
+// @compatibility(numpy)
+// Equivalent to np.isinf
+// @end_compatibility
+func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInput",
+		Type: "IsInf",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ReverseSequenceAttr is an optional argument to ReverseSequence.
-type ReverseSequenceAttr func(optionalAttr)
+// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
+type ResourceSparseApplyRMSPropAttr func(optionalAttr)
 
-// ReverseSequenceBatchDim sets the optional batch_dim attribute to value.
+// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: The dimension along which reversal is performed.
-// If not specified, defaults to 0
-func ReverseSequenceBatchDim(value int64) ReverseSequenceAttr {
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
 	return func(m optionalAttr) {
-		m["batch_dim"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Reverses variable length slices.
-//
-// This op first slices `input` along the dimension `batch_dim`, and for each
-// slice `i`, reverses the first `seq_lengths[i]` elements along
-// the dimension `seq_dim`.
-//
-// The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
-// and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
-//
-// The output slice `i` along dimension `batch_dim` is then given by input
-// slice `i`, with the first `seq_lengths[i]` slices along dimension
-// `seq_dim` reversed.
-//
-// For example:
-//
-// ```
-// # Given this:
-// batch_dim = 0
-// seq_dim = 1
-// input.dims = (4, 8, ...)
-// seq_lengths = [7, 2, 3, 5]
-//
-// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
-// output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]
-// output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]
-// output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]
-// output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]
-//
-// # while entries past seq_lens are copied through:
-// output[0, 7:, :, ...] = input[0, 7:, :, ...]
-// output[1, 2:, :, ...] = input[1, 2:, :, ...]
-// output[2, 3:, :, ...] = input[2, 3:, :, ...]
-// output[3, 2:, :, ...] = input[3, 2:, :, ...]
-// ```
-//
-// In contrast, if:
+// Update '*var' according to the RMSProp algorithm.
 //
-// ```
-// # Given this:
-// batch_dim = 2
-// seq_dim = 0
-// input.dims = (8, ?, 4, ...)
-// seq_lengths = [7, 2, 3, 5]
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
 //
-// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
-// output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]
-// output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]
-// output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]
-// output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
 //
-// # while entries past seq_lens are copied through:
-// output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]
-// output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]
-// output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]
-// output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
-// ```
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	input: The input to reverse.
-//	seq_lengths: 1-D with length `input.dims(batch_dim)` and
-// `max(seq_lengths) <= input.dims(seq_dim)`
-//	seq_dim: The dimension which is partially reversed.
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
 //
-// Returns The partially reversed input. It has the same shape as `input`.
-func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_dim int64, optional ...ReverseSequenceAttr) (output tf.Output) {
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
+//
+// Returns the created operation.
+func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"seq_dim": seq_dim}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ReverseSequence",
+		Type: "ResourceSparseApplyRMSProp",
 		Input: []tf.Input{
-			input, seq_lengths,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the gradient for the rsqrt of `x` wrt its input.
+// Returns the truth value of (x > y) element-wise.
 //
-// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// *NOTE*: `Greater` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RsqrtGrad",
+		Type: "Greater",
 		Input: []tf.Input{
-			y, dy,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Makes its input available to the next iteration.
+// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
+type SampleDistortedBoundingBoxAttr func(optionalAttr)
+
+// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
 //
-// Arguments:
-//	data: The tensor to be made available to the next iteration.
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
 //
-// Returns The same tensor as `data`.
-func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "NextIteration",
-		Input: []tf.Input{
-			data,
-		},
+}
+
+// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
+//
+// value: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
+// If not specified, defaults to 0.1
+func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["min_object_covered"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Does nothing. Only useful as a placeholder for control edges.
+// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
 //
-// Returns the created operation.
-func NoOp(scope *Scope) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["aspect_ratio_range"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "NoOp",
+}
+
+// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+//
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within in this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["area_range"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
-type DepthwiseConv2dNativeAttr func(optionalAttr)
+// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
+//
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["max_attempts"] = value
+	}
+}
 
-// DepthwiseConv2dNativeDataFormat sets the optional data_format attribute to value.
+// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr {
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
+// If not specified, defaults to false
+func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["use_image_if_no_bounding_boxes"] = value
 	}
 }
 
-// Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors.
+// Generate a single randomly distorted bounding box for an image.
+//
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
+//
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
+//
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example,
+//
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
 //
-// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-// and a filter / kernel tensor of shape
-// `[filter_height, filter_width, in_channels, channel_multiplier]`, containing
-// `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
-// a different filter to each input channel (expanding from 1 channel to
-// `channel_multiplier` channels for each), then concatenates the results
-// together. Thus, the output has `in_channels * channel_multiplier` channels.
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.summary.image('images_with_box', image_with_box)
 //
-// ```
-// for k in 0..in_channels-1
-//   for q in 0..channel_multiplier-1
-//     output[b, i, j, k * channel_multiplier + q] =
-//       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
-//                         filter[di, dj, k, q]
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
 // ```
 //
-// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
 //
 // Arguments:
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
 //
-//
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`.
-//	padding: The type of padding algorithm to use.
-func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeAttr) (output tf.Output) {
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNative",
+		Type: "SampleDistortedBoundingBox",
 		Input: []tf.Input{
-			input, filter,
+			image_size, bounding_boxes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// CropAndResizeAttr is an optional argument to CropAndResize.
-type CropAndResizeAttr func(optionalAttr)
-
-// CropAndResizeMethod sets the optional method attribute to value.
-//
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeMethod(value string) CropAndResizeAttr {
-	return func(m optionalAttr) {
-		m["method"] = value
-	}
-}
-
-// CropAndResizeExtrapolationValue sets the optional extrapolation_value attribute to value.
-//
-// value: Value used for extrapolation, when applicable.
-// If not specified, defaults to 0
-func CropAndResizeExtrapolationValue(value float32) CropAndResizeAttr {
-	return func(m optionalAttr) {
-		m["extrapolation_value"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Extracts crops from the input image tensor and bilinearly resizes them (possibly
-//
-// with aspect ratio change) to a common output size specified by `crop_size`. This
-// is more general than the `crop_to_bounding_box` op which extracts a fixed size
-// slice from the input image and does not allow resizing or aspect ratio change.
-//
-// Returns a tensor with `crops` from the input `image` at positions defined at the
-// bounding box locations in `boxes`. The cropped boxes are all resized (with
-// bilinear interpolation) to a fixed `size = [crop_height, crop_width]`. The
-// result is a 4-D tensor `[num_boxes, crop_height, crop_width, depth]`. The
-// resizing is corner aligned. In particular, if `boxes = [[0, 0, 1, 1]]`, the
-// method will give identical results to using `tf.image.resize_bilinear()`
-// with `align_corners=True`.
+// Returns x / y element-wise for integer types.
 //
-// Arguments:
-//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-// Both `image_height` and `image_width` need to be positive.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1]` in image height coordinates. We do allow `y1` > `y2`, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//	crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
-// cropped image patches are resized to this size. The aspect ratio of the image
-// content is not preserved. Both `crop_height` and `crop_width` need to be
-// positive.
+// Truncation designates that negative numbers will round fractional quantities
+// toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different
+// than Python semantics. See `FloorDiv` for a division function that matches
+// Python Semantics.
 //
-// Returns A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Output, crop_size tf.Output, optional ...CropAndResizeAttr) (crops tf.Output) {
+// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResize",
+		Type: "TruncateDiv",
 		Input: []tf.Input{
-			image, boxes, box_ind, crop_size,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxPoolGradAttr is an optional argument to MaxPoolGrad.
-type MaxPoolGradAttr func(optionalAttr)
-
-// MaxPoolGradDataFormat sets the optional data_format attribute to value.
+// Restores tensors from a V2 checkpoint.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradDataFormat(value string) MaxPoolGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of the maxpooling function.
+// For backward compatibility with the V1 format, this Op currently allows
+// restoring from a V1 checkpoint as well:
+//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
+//     if found proceed to read it as a V2 checkpoint;
+//   - Otherwise the V1 read path is invoked.
+// Relying on this behavior is not recommended, as the ability to fall back to read
+// V1 might be deprecated and eventually removed.
+//
+// By default, restores the named tensors in full.  If the caller wishes to restore
+// specific slices of stored tensors, "shape_and_slices" should be non-empty
+// strings and correspondingly well-formed.
+//
+// Callers must ensure all the named tensors are indeed stored in the checkpoint.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
+//	tensor_names: shape {N}.  The names of the tensors to be restored.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
+// Empty strings indicate that they are non-partitioned tensors.
+//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
+// those stored in the checkpoint.
 //
-// Returns Gradients w.r.t. the input to `max_pool`.
-func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) {
+// Returns shape {N}.  The restored tensors, whose shapes are read from the
+// checkpoint directly.
+func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGrad",
+		Type: "RestoreV2",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			prefix, tensor_names, shape_and_slices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// EncodeJpegAttr is an optional argument to EncodeJpeg.
-type EncodeJpegAttr func(optionalAttr)
-
-// EncodeJpegFormat sets the optional format attribute to value.
-//
-// value: Per pixel image format.
-// If not specified, defaults to ""
-func EncodeJpegFormat(value string) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["format"] = value
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// EncodeJpegQuality sets the optional quality attribute to value.
-//
-// value: Quality of the compression from 0 to 100 (higher is better and slower).
-// If not specified, defaults to 95
-func EncodeJpegQuality(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["quality"] = value
+	var idx int
+	var err error
+	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
+		scope.UpdateErr("RestoreV2", err)
+		return
 	}
+	return tensors
 }
 
-// EncodeJpegProgressive sets the optional progressive attribute to value.
+// Decode web-safe base64-encoded strings.
 //
-// value: If True, create a JPEG that loads progressively (coarse to fine).
-// If not specified, defaults to false
-func EncodeJpegProgressive(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["progressive"] = value
-	}
-}
-
-// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
+// Input may or may not have padding at the end. See EncodeBase64 for padding.
+// Web-safe means that input must use - and _ instead of + and /.
 //
-// value: If True, spend CPU/RAM to reduce size with no quality change.
-// If not specified, defaults to false
-func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["optimize_size"] = value
-	}
-}
-
-// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
+// Arguments:
+//	input: Base64 strings to decode.
 //
-// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
-// If not specified, defaults to true
-func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["chroma_downsampling"] = value
+// Returns Decoded strings.
+func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
-//
-// value: Unit used to specify `x_density` and `y_density`:
-// pixels per inch (`'in'`) or centimeter (`'cm'`).
-// If not specified, defaults to "in"
-func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["density_unit"] = value
+	opspec := tf.OpSpec{
+		Type: "DecodeBase64",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeJpegXDensity sets the optional x_density attribute to value.
+// Store the input tensor in the state of the current session.
 //
-// value: Horizontal pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegXDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["x_density"] = value
-	}
-}
-
-// EncodeJpegYDensity sets the optional y_density attribute to value.
+// Arguments:
+//	value: The tensor to be stored.
 //
-// value: Vertical pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegYDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["y_density"] = value
+// Returns The handle for the tensor stored in the session state, represented
+// as a string.
+func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GetSessionHandle",
+		Input: []tf.Input{
+			value,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
+// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
+type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
+
+// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: If not empty, embed this XMP metadata in the image header.
-// If not specified, defaults to ""
-func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
 	return func(m optionalAttr) {
-		m["xmp_metadata"] = value
+		m["use_locking"] = value
 	}
 }
 
-// JPEG-encode an image.
-//
-// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
-//
-// The attr `format` can be used to override the color format of the encoded
-// output.  Values can be:
-//
-// *   `''`: Use a default format based on the number of channels in the image.
-// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
-//     of `image` must be 1.
-// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
-//     of `image` must be 3.
-//
-// If `format` is not specified or is the empty string, a default format is picked
-// in function of the number of channels in `image`:
+// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
 //
-// *   1: Output a grayscale image.
-// *   3: Output an RGB image.
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// prox_v = var
+// prox_v -= lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
 //
 // Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
-// Returns 0-D. JPEG-encoded image.
-func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8801,180 +8643,113 @@ func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (cont
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeJpeg",
+		Type: "ResourceSparseApplyProximalAdagrad",
 		Input: []tf.Input{
-			image,
+			var_, accum, lr, l1, l2, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Gradients for batch normalization.
-//
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
-//
-// This op is deprecated. See `tf.nn.batch_normalization`.
-//
-// Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this Tensor will be multiplied
-// with the normalized Tensor.
-//	backprop: 4D backprop Tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-//
-// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
-func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
+// Returns element-wise largest integer not greater than x.
+func Floor(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalizationGrad",
+		Type: "Floor",
 		Input: []tf.Input{
-			t, m, v, gamma, backprop,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// FusedBatchNormV2Attr is an optional argument to FusedBatchNormV2.
-type FusedBatchNormV2Attr func(optionalAttr)
-
-// FusedBatchNormV2Epsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormV2Epsilon(value float32) FusedBatchNormV2Attr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
+// Computes the Gauss error function of `x` element-wise.
+func Erf(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// FusedBatchNormV2DataFormat sets the optional data_format attribute to value.
-//
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormV2DataFormat(value string) FusedBatchNormV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+	opspec := tf.OpSpec{
+		Type: "Erf",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FusedBatchNormV2IsTraining sets the optional is_training attribute to value.
+// Reads the value of a variable.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormV2IsTraining(value bool) FusedBatchNormV2Attr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// Batch normalization.
+// The tensor returned by this operation is immutable.
 //
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// The value returned by this operation is guaranteed to be influenced by all the
+// writes on which this operation depends directly or indirectly, and to not be
+// influenced by any of the writes which depend directly or indirectly on this
+// operation.
 //
 // Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
-//
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be reused in the gradient computation.
-func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormV2Attr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+//	resource: handle to the resource in which to store the variable.
+//	dtype: the dtype of the value.
+func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormV2",
+		Type: "ReadVariableOp",
 		Input: []tf.Input{
-			x, scale, offset, mean, variance,
+			resource,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
-type Conv2DBackpropInputAttr func(optionalAttr)
-
-// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
-	}
-}
+// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
+type MaxPool3DGradAttr func(optionalAttr)
 
-// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
+// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Computes the gradients of convolution with respect to the input.
+// Computes gradients of max pooling function.
 //
 // Arguments:
-//	input_sizes: An integer vector representing the shape of `input`,
-// where `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
 //	padding: The type of padding algorithm to use.
-//
-// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
-// w.r.t. the input of the convolution.
-func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
+func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropInput",
+		Type: "MaxPool3DGrad",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
@@ -8982,60 +8757,43 @@ func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output,
 	return op.Output(0)
 }
 
-// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
-type FusedBatchNormAttr func(optionalAttr)
+// SparseReduceSumAttr is an optional argument to SparseReduceSum.
+type SparseReduceSumAttr func(optionalAttr)
 
-// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
+// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
 //
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
 	return func(m optionalAttr) {
-		m["epsilon"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// FusedBatchNormDataFormat sets the optional data_format attribute to value.
+// Computes the sum of elements across dimensions of a SparseTensor.
 //
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// FusedBatchNormIsTraining sets the optional is_training attribute to value.
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// Batch normalization.
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
 //
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
 // Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
 //
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be reused in the gradient computation.
-func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9044,648 +8802,739 @@ func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNorm",
+		Type: "SparseReduceSum",
 		Input: []tf.Input{
-			x, scale, offset, mean, variance,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
-type RandomStandardNormalAttr func(optionalAttr)
-
-// RandomStandardNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
+type ResourceApplyAdagradAttr func(optionalAttr)
 
-// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
+// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Outputs random values from a normal distribution.
+// Update '*var' according to the adagrad scheme.
 //
-// The generated values will have mean 0 and standard deviation 1.
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
 //
-// Returns A tensor of the specified shape filled with random normal values.
-func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomStandardNormal",
+		Type: "ResourceApplyAdagrad",
 		Input: []tf.Input{
-			shape,
+			var_, accum, lr, grad,
 		},
 		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
+}
+
+// Returns element-wise remainder of division. This emulates C semantics in that
+//
+// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
+// y + truncate_mod(x, y) = x`.
+//
+// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TruncateMod",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes sigmoid of `x` element-wise.
+// Inverse 2D real-valued fast Fourier transform.
 //
-// Specifically, `y = 1 / (1 + exp(-x))`.
-func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 2 dimensions of `input`.
+//
+// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
+//
+// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+//
+// Returns A float32 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft2
+// @end_compatibility
+func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sigmoid",
+		Type: "IRFFT2D",
 		Input: []tf.Input{
-			x,
+			input, fft_length,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
-type ComputeAccidentalHitsAttr func(optionalAttr)
-
-// ComputeAccidentalHitsSeed sets the optional seed attribute to value.
+// Transforms a vector of brain.Example protos (as strings) into typed tensors.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ComputeAccidentalHitsSeed(value int64) ComputeAccidentalHitsAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// Arguments:
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	names: A vector containing the names of the serialized protos.
+// May contain, for example, table key (descriptive) names for the
+// corresponding serialized protos.  These are purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty vector if no names are available.
+// If non-empty, this vector must be the same length as "serialized".
+//	sparse_keys: A list of Nsparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: A list of Ndense string Tensors (scalars).
+// The keys expected in the Examples' features associated with dense values.
+//	dense_defaults: A list of Ndense Tensors (some may be empty).
+// dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	sparse_types: A list of Nsparse types; the data types of data in each Feature
+// given in sparse_keys.
+// Currently the ParseExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
+// given in dense_keys.
+// The number of elements in the Feature corresponding to dense_key[j]
+// must always equal dense_shapes[j].NumEntries().
+// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
+// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
+// The dense outputs are just the inputs row-stacked by batch.
+// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
+// the shape of the output Tensor dense_values[j] will be
+// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
+// of elements of length D1 * .... * DN, across all minibatch entries
+// in the input.  Any minibatch entry with less than M blocks of elements of
+// length D1 * ... * DN will be padded with the corresponding default_value
+// scalar element along the second dimension.
+func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
+	opspec := tf.OpSpec{
+		Type: "ParseExample",
+		Input: []tf.Input{
+			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
 	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
 }
 
-// ComputeAccidentalHitsSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ComputeAccidentalHitsSeed2(value int64) ComputeAccidentalHitsAttr {
+// VariableShapeAttr is an optional argument to VariableShape.
+type VariableShapeAttr func(optionalAttr)
+
+// VariableShapeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func VariableShapeOutType(value tf.DataType) VariableShapeAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["out_type"] = value
 	}
 }
 
-// Computes the ids of the positions in sampled_candidates that match true_labels.
+// Returns the shape of the variable pointed to by `resource`.
 //
-// When doing log-odds NCE, the result of this op should be passed through a
-// SparseToDense op, then added to the logits of the sampled candidates. This has
-// the effect of 'removing' the sampled labels that match the true labels by
-// making the classifier sure that they are sampled labels.
+// This operation returns a 1-D integer tensor representing the shape of `input`.
 //
-// Arguments:
-//	true_classes: The true_classes output of UnpackSparseLabels.
-//	sampled_candidates: The sampled_candidates output of CandidateSampler.
-//	num_true: Number of true labels per context.
+// For example:
 //
-// Returns A vector of indices corresponding to rows of true_candidates.A vector of IDs of positions in sampled_candidates that match a true_label
-// for the row with the corresponding index in indices.A vector of the same length as indices and ids, in which each element
-// is -FLOAT_MAX.
-func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candidates tf.Output, num_true int64, optional ...ComputeAccidentalHitsAttr) (indices tf.Output, ids tf.Output, weights tf.Output) {
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ComputeAccidentalHits",
+		Type: "VariableShape",
 		Input: []tf.Input{
-			true_classes, sampled_candidates,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// StageClearAttr is an optional argument to StageClear.
-type StageClearAttr func(optionalAttr)
-
-// StageClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Fills empty rows in the input 2-D `SparseTensor` with a default value.
 //
-// REQUIRES: value >= 0
-func StageClearCapacity(value int64) StageClearAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// StageClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// The input `SparseTensor` is represented via the tuple of inputs
+// (`indices`, `values`, `dense_shape`).  The output `SparseTensor` has the
+// same `dense_shape` but with indices `output_indices` and values
+// `output_values`.
 //
-// REQUIRES: value >= 0
-func StageClearMemoryLimit(value int64) StageClearAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// StageClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StageClearContainer(value string) StageClearAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// StageClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StageClearSharedName(value string) StageClearAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes all elements in the underlying container.
+// This op inserts a single entry for every row that doesn't have any values.
+// The index is created as `[row, 0, ..., 0]` and the inserted value
+// is `default_value`.
 //
-// Returns the created operation.
-func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr) (o *tf.Operation) {
+// For example, suppose `sp_input` has shape `[5, 6]` and non-empty values:
+//
+//     [0, 1]: a
+//     [0, 3]: b
+//     [2, 0]: c
+//     [3, 1]: d
+//
+// Rows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values:
+//
+//     [0, 1]: a
+//     [0, 3]: b
+//     [1, 0]: default_value
+//     [2, 0]: c
+//     [3, 1]: d
+//     [4, 0]: default_value
+//
+// The output `SparseTensor` will be in row-major order and will have the
+// same shape as the input.
+//
+// This op also returns an indicator vector shaped `[dense_shape[0]]` such that
+//
+//     empty_row_indicator[i] = True iff row i was an empty row.
+//
+// And a reverse index map vector shaped `[indices.shape[0]]` that is used during
+// backpropagation,
+//
+//     reverse_index_map[j] = out_j s.t. indices[j, :] == output_indices[out_j, :]
+//
+// Arguments:
+//	indices: 2-D. the indices of the sparse tensor.
+//	values: 1-D. the values of the sparse tensor.
+//	dense_shape: 1-D. the shape of the sparse tensor.
+//	default_value: 0-D. default value to insert into location `[row, 0, ..., 0]`
+//   for rows missing from the input sparse tensor.
+// output indices: 2-D. the indices of the filled sparse tensor.
+//
+// Returns 1-D. the values of the filled sparse tensor.1-D. whether the dense row was missing in the
+// input sparse tensor.1-D. a map from the input indices to the output indices.
+func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, default_value tf.Output) (output_indices tf.Output, output_values tf.Output, empty_row_indicator tf.Output, reverse_index_map tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StageClear",
-
-		Attrs: attrs,
+		Type: "SparseFillEmptyRows",
+		Input: []tf.Input{
+			indices, values, dense_shape, default_value,
+		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
-type AvgPoolGradAttr func(optionalAttr)
-
-// AvgPoolGradDataFormat sets the optional data_format attribute to value.
+// Reverses specific dimensions of a tensor.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of the average pooling function.
+// Given a `tensor`, and a `bool` tensor `dims` representing the dimensions
+// of `tensor`, this operation reverses each dimension i of `tensor` where
+// `dims[i]` is `True`.
+//
+// `tensor` can have up to 8 dimensions. The number of dimensions
+// of `tensor` must equal the number of elements in `dims`. In other words:
+//
+// `rank(tensor) = size(dims)`
+//
+// For example:
+//
+// ```
+// # tensor 't' is [[[[ 0,  1,  2,  3],
+// #                  [ 4,  5,  6,  7],
+// #                  [ 8,  9, 10, 11]],
+// #                 [[12, 13, 14, 15],
+// #                  [16, 17, 18, 19],
+// #                  [20, 21, 22, 23]]]]
+// # tensor 't' shape is [1, 2, 3, 4]
+//
+// # 'dims' is [False, False, False, True]
+// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
+//                         [ 7,  6,  5,  4],
+//                         [ 11, 10, 9, 8]],
+//                        [[15, 14, 13, 12],
+//                         [19, 18, 17, 16],
+//                         [23, 22, 21, 20]]]]
+//
+// # 'dims' is [False, True, False, False]
+// reverse(t, dims) ==> [[[[12, 13, 14, 15],
+//                         [16, 17, 18, 19],
+//                         [20, 21, 22, 23]
+//                        [[ 0,  1,  2,  3],
+//                         [ 4,  5,  6,  7],
+//                         [ 8,  9, 10, 11]]]]
+//
+// # 'dims' is [False, False, True, False]
+// reverse(t, dims) ==> [[[[8, 9, 10, 11],
+//                         [4, 5, 6, 7],
+//                         [0, 1, 2, 3]]
+//                        [[20, 21, 22, 23],
+//                         [16, 17, 18, 19],
+//                         [12, 13, 14, 15]]]]
+// ```
 //
 // Arguments:
-//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
-// the output of `avg_pool`.
-//	ksize: The size of the sliding window for each dimension of the input.
-//	strides: The stride of the sliding window for each dimension of the input.
-//	padding: The type of padding algorithm to use.
+//	tensor: Up to 8-D.
+//	dims: 1-D. The dimensions to reverse.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
-func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
+// Returns The same shape as `tensor`.
+func Reverse(scope *Scope, tensor tf.Output, dims tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "AvgPoolGrad",
+		Type: "Reverse",
 		Input: []tf.Input{
-			orig_input_shape, grad,
+			tensor, dims,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the maximum along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
-// that `segment_ids[j] == i`.
+// Computes log softmax activations.
 //
-// If the max is empty for a given segment ID `i`, `output[i] = 0`.
+// For each batch `i` and class `j` we have
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
-// </div>
+//     logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
 //
 // Arguments:
-//
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+//	logits: 2-D with shape `[batch_size, num_classes]`.
+//
+// Returns Same shape as `logits`.
+func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMax",
+		Type: "LogSoftmax",
 		Input: []tf.Input{
-			data, segment_ids,
+			logits,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the rank of a tensor.
+// Computes the inverse permutation of a tensor.
 //
-// This operation returns an integer representing the rank of `input`.
+// This operation computes the inverse of an index permutation. It takes a 1-D
+// integer tensor `x`, which represents the indices of a zero-based array, and
+// swaps each value with its index position. In other words, for an output tensor
+// `y` and an input tensor `x`, this operation computes the following:
+//
+// `y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
+//
+// The values must include 0. There can be no duplicate values or negative values.
 //
 // For example:
 //
 // ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// # shape of tensor 't' is [2, 2, 3]
-// rank(t) ==> 3
+// # tensor `x` is [3, 4, 0, 2, 1]
+// invert_permutation(x) ==> [2, 4, 3, 0, 1]
 // ```
 //
-// **Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
-// of a tensor is the number of indices required to uniquely select each element
-// of the tensor. Rank is also known as "order", "degree", or "ndims."
-func Rank(scope *Scope, input tf.Output) (output tf.Output) {
+// Arguments:
+//	x: 1-D.
+//
+// Returns 1-D.
+func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Rank",
+		Type: "InvertPermutation",
 		Input: []tf.Input{
-			input,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeCSVAttr is an optional argument to DecodeCSV.
-type DecodeCSVAttr func(optionalAttr)
-
-// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
+// Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor.
 //
-// value: char delimiter to separate fields in a record.
-// If not specified, defaults to ","
-func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["field_delim"] = value
-	}
-}
-
-// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
+// This operation folds the padded areas of `input` by `MirrorPad` according to the
+// `paddings` you specify. `paddings` must be the same as `paddings` argument
+// given to the corresponding `MirrorPad` op.
 //
-// value: If false, treats double quotation marks as regular
-// characters inside of the string fields (ignoring RFC 4180, Section 2,
-// Bullet 5).
-// If not specified, defaults to true
-func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["use_quote_delim"] = value
-	}
-}
-
-// DecodeCSVNaValue sets the optional na_value attribute to value.
+// The folded size of each dimension D of the output is:
 //
-// value: Additional string to recognize as NA/NaN.
-// If not specified, defaults to ""
-func DecodeCSVNaValue(value string) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["na_value"] = value
-	}
-}
-
-// Convert CSV records to tensors. Each column maps to one tensor.
+// `input.dim_size(D) - paddings(D, 0) - paddings(D, 1)`
 //
-// RFC 4180 format is expected for the CSV records.
-// (https://tools.ietf.org/html/rfc4180)
-// Note that we allow leading and trailing spaces with int or float field.
+// For example:
+//
+// ```
+// # 't' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].
+// # 'paddings' is [[0, 1]], [0, 1]].
+// # 'mode' is SYMMETRIC.
+// # rank of 't' is 2.
+// pad(t, paddings) ==> [[ 1,  5]
+//                       [11, 28]]
+// ```
 //
 // Arguments:
-//	records: Each string is a record/row in the csv and all records should have
-// the same format.
-//	record_defaults: One tensor per column of the input record, with either a
-// scalar default value for that column or empty if the column is required.
+//	input: The input tensor to be folded.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	mode: The mode used in the `MirrorPad` op.
 //
-// Returns Each tensor will have the same shape as records.
-func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
+// Returns The folded tensor.
+func MirrorPadGrad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"mode": mode}
 	opspec := tf.OpSpec{
-		Type: "DecodeCSV",
+		Type: "MirrorPadGrad",
 		Input: []tf.Input{
-			records, tf.OutputList(record_defaults),
+			input, paddings,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("DecodeCSV", err)
-		return
-	}
-	return output
+	return op.Output(0)
 }
 
-// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
+// Computes softmax cross entropy cost and gradients to backpropagate.
+//
+// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
+// a matrix of label probabilities, but rather a single label per row
+// of features.  This label is considered to have probability 1.0 for the
+// given row.
+//
+// Inputs are the logits, not probabilities.
 //
 // Arguments:
-//	serialized: A scalar string containing a serialized TensorProto proto.
-//	out_type: The type of the serialized tensor.  The provided type must match the
-// type of the serialized tensor and no implicit conversion will take place.
+//	features: batch_size x num_classes matrix
+//	labels: batch_size vector with values in [0, num_classes).
+// This is the label for the given minibatch entry.
 //
-// Returns A Tensor of type `out_type`.
-func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "ParseTensor",
+		Type: "SparseSoftmaxCrossEntropyWithLogits",
 		Input: []tf.Input{
-			serialized,
+			features, labels,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes acos of x element-wise.
-func Acos(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Acos",
-		Input: []tf.Input{
-			x,
-		},
+// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
+type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
+
+// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Copy a tensor setting everything outside a central band in each innermost matrix
-//
-// to zero.
-//
-// The `band` part is computed as follows:
-// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-// tensor with the same shape where
-//
-// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
-//
-// The indicator function
-//
-// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
-//                  (num_upper < 0 || (n-m) <= num_upper)`.
-//
-// For example:
-//
-// ```
-// # if 'input' is [[ 0,  1,  2, 3]
-//                  [-1,  0,  1, 2]
-//                  [-2, -1,  0, 1]
-//                  [-3, -2, -1, 0]],
-//
-// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
-//                                        [-1,  0,  1, 2]
-//                                        [ 0, -1,  0, 1]
-//                                        [ 0,  0, -1, 0]],
-//
-// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
-//                                       [-1,  0,  1, 0]
-//                                       [-2, -1,  0, 1]
-//                                       [ 0, -2, -1, 0]]
-// ```
-//
-// Useful special cases:
-//
-// ```
-//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
-//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
-//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
-// ```
+// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
 //
 // Arguments:
-//	input: Rank `k` tensor.
-//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
-// lower triangle.
-//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
-// entire upper triangle.
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
 //
-// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
-func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MatrixBandPart",
+		Type: "ResourceSparseApplyAdagradDA",
 		Input: []tf.Input{
-			input, num_lower, num_upper,
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// DecodeRawAttr is an optional argument to DecodeRaw.
-type DecodeRawAttr func(optionalAttr)
-
-// DecodeRawLittleEndian sets the optional little_endian attribute to value.
-//
-// value: Whether the input `bytes` are in little-endian order.
-// Ignored for `out_type` values that are stored in a single byte like
-// `uint8`.
-// If not specified, defaults to true
-func DecodeRawLittleEndian(value bool) DecodeRawAttr {
-	return func(m optionalAttr) {
-		m["little_endian"] = value
+// Returns the truth value of NOT x element-wise.
+func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogicalNot",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Reinterpret the bytes of a string as a vector of numbers.
+// 3D real-valued fast Fourier transform.
+//
+// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 3 dimensions of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
+//
+// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	bytes: All the elements must have the same length.
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
 //
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the their 3D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
 //
-// Returns A Tensor with one more dimension than the input `bytes`.  The
-// added dimension will have size equal to the length of the elements
-// of `bytes` divided by the number of bytes to represent `out_type`.
-func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.fft.rfftn with 3 dimensions.
+// @end_compatibility
+func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DecodeRaw",
+		Type: "RFFT3D",
 		Input: []tf.Input{
-			bytes,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// OrderedMapIncompleteSizeAttr is an optional argument to OrderedMapIncompleteSize.
-type OrderedMapIncompleteSizeAttr func(optionalAttr)
+// TensorArrayV3Attr is an optional argument to TensorArrayV3.
+type TensorArrayV3Attr func(optionalAttr)
 
-// OrderedMapIncompleteSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// TensorArrayV3ElementShape sets the optional element_shape attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapIncompleteSizeCapacity(value int64) OrderedMapIncompleteSizeAttr {
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV3ElementShape(value tf.Shape) TensorArrayV3Attr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["element_shape"] = value
 	}
 }
 
-// OrderedMapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// TensorArrayV3DynamicSize sets the optional dynamic_size attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapIncompleteSizeMemoryLimit(value int64) OrderedMapIncompleteSizeAttr {
+// value: A boolean that determines whether writes to the TensorArray
+// are allowed to grow the size.  By default, this is not allowed.
+// If not specified, defaults to false
+func TensorArrayV3DynamicSize(value bool) TensorArrayV3Attr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["dynamic_size"] = value
 	}
 }
 
-// OrderedMapIncompleteSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapIncompleteSizeContainer(value string) OrderedMapIncompleteSizeAttr {
+// TensorArrayV3ClearAfterRead sets the optional clear_after_read attribute to value.
+//
+// value: If true (default), Tensors in the TensorArray are cleared
+// after being read.  This disables multiple read semantics but allows early
+// release of memory.
+// If not specified, defaults to true
+func TensorArrayV3ClearAfterRead(value bool) TensorArrayV3Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["clear_after_read"] = value
 	}
 }
 
-// OrderedMapIncompleteSizeSharedName sets the optional shared_name attribute to value.
+// TensorArrayV3IdenticalElementShapes sets the optional identical_element_shapes attribute to value.
+//
+// value: If true (default is false), then all
+// elements in the TensorArray will be expected to have have identical shapes.
+// This allows certain behaviors, like dynamically checking for
+// consistent shapes on write, and being able to fill in properly
+// shaped zero tensors on stack -- even if the element_shape attribute
+// is not fully defined.
+// If not specified, defaults to false
+func TensorArrayV3IdenticalElementShapes(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["identical_element_shapes"] = value
+	}
+}
+
+// TensorArrayV3TensorArrayName sets the optional tensor_array_name attribute to value.
+//
+// value: Overrides the name used for the temporary tensor_array
+// resource. Default value is the name of the 'TensorArray' op (which
+// is guaranteed unique).
 // If not specified, defaults to ""
-func OrderedMapIncompleteSizeSharedName(value string) OrderedMapIncompleteSizeAttr {
+func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["tensor_array_name"] = value
 	}
 }
 
-// Op returns the number of incomplete elements in the underlying container.
-func OrderedMapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapIncompleteSizeAttr) (size tf.Output) {
+// An array of Tensors of given size.
+//
+// Write data via Write and read via Read or Pack.
+//
+// Arguments:
+//	size: The size of the array.
+//	dtype: The type of the elements on the tensor_array.
+//
+// Returns The handle to the TensorArray.A scalar used to control gradient flow.
+func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV3Attr) (handle tf.Output, flow tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapIncompleteSize",
-
+		Type: "TensorArrayV3",
+		Input: []tf.Input{
+			size,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// RandomShuffleAttr is an optional argument to RandomShuffle.
-type RandomShuffleAttr func(optionalAttr)
-
-// RandomShuffleSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomShuffleSeed(value int64) RandomShuffleAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// MaxPool3DAttr is an optional argument to MaxPool3D.
+type MaxPool3DAttr func(optionalAttr)
 
-// RandomShuffleSeed2 sets the optional seed2 attribute to value.
+// MaxPool3DDataFormat sets the optional data_format attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomShuffleSeed2(value int64) RandomShuffleAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DDataFormat(value string) MaxPool3DAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// Randomly shuffles a tensor along its first dimension.
-//
-//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
-//   to one and only one `output[i]`. For example, a mapping that might occur for a
-//   3x2 tensor is:
-//
-// ```
-// [[1, 2],       [[5, 6],
-//  [3, 4],  ==>   [1, 2],
-//  [5, 6]]        [3, 4]]
-// ```
+// Performs 3D max pooling on the input.
 //
 // Arguments:
-//	value: The tensor to be shuffled.
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A tensor of same shape and type as `value`, shuffled along its first
-// dimension.
-func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
+// Returns The max pooled output tensor.
+func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomShuffle",
+		Type: "MaxPool3D",
 		Input: []tf.Input{
-			value,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -9693,151 +9542,152 @@ func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr)
 	return op.Output(0)
 }
 
-// FakeQuantWithMinMaxVarsPerChannelAttr is an optional argument to FakeQuantWithMinMaxVarsPerChannel.
-type FakeQuantWithMinMaxVarsPerChannelAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxVarsPerChannelNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxVarsPerChannelNumBits(value int64) FakeQuantWithMinMaxVarsPerChannelAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
+// Computes the gradients of 3-D convolution with respect to the input.
+//
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
+//
+// Arguments:
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// FakeQuantWithMinMaxVarsPerChannelNarrowRange sets the optional narrow_range attribute to value.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxVarsPerChannelNarrowRange(value bool) FakeQuantWithMinMaxVarsPerChannelAttr {
-	return func(m optionalAttr) {
-		m["narrow_range"] = value
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "Conv3DBackpropInput",
+		Input: []tf.Input{
+			input, filter, out_backprop,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Fake-quantize the 'inputs' tensor of type float and one of the shapes: `[d]`,
+// Inverse 2D fast Fourier transform.
 //
-// `[b, d]` `[b, h, w, d]` via per-channel floats `min` and `max` of shape `[d]`
-// to 'outputs' tensor of same shape as `inputs`.
+// Computes the inverse 2-dimensional discrete Fourier transform over the
+// inner-most 2 dimensions of `input`.
 //
-// `[min; max]` define the clamping range for the `inputs` data.
-// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
-// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
-// then de-quantized and output as floats in `[min; max]` interval.
-// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+// Arguments:
+//	input: A complex64 tensor.
 //
-// This operation has a gradient and thus allows for training `min` and `max`
-// values.
-func FakeQuantWithMinMaxVarsPerChannel(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsPerChannelAttr) (outputs tf.Output) {
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft2
+// @end_compatibility
+func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVarsPerChannel",
+		Type: "IFFT2D",
 		Input: []tf.Input{
-			inputs, min, max,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TruncatedNormalAttr is an optional argument to TruncatedNormal.
-type TruncatedNormalAttr func(optionalAttr)
-
-// TruncatedNormalSeed sets the optional seed attribute to value.
+// Creates a tensor filled with a scalar value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
+// This operation creates a tensor of shape `dims` and fills it with `value`.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// For example:
+//
+// ```
+// # Output tensor has shape [2, 3].
+// fill([2, 3], 9) ==> [[9, 9, 9]
+//                      [9, 9, 9]]
+// ```
+//
+// Arguments:
+//	dims: 1-D. Represents the shape of the output tensor.
+//	value: 0-D (scalar). Value to fill the returned tensor.
+//
+// @compatibility(numpy)
+// Equivalent to np.full
+// @end_compatibility
+func Fill(scope *Scope, dims tf.Output, value tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Fill",
+		Input: []tf.Input{
+			dims, value,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs random values from a truncated normal distribution.
+// 2D fast Fourier transform.
 //
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
+// Computes the 2-dimensional discrete Fourier transform over the inner-most
+// 2 dimensions of `input`.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	input: A complex64 tensor.
 //
-// Returns A tensor of the specified shape filled with random truncated normal
-// values.
-func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fft2
+// @end_compatibility
+func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TruncatedNormal",
+		Type: "FFT2D",
 		Input: []tf.Input{
-			shape,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
-type ResourceApplyFtrlV2Attr func(optionalAttr)
+// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent.
+type ResourceApplyProximalGradientDescentAttr func(optionalAttr)
 
-// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
+func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the Ftrl-proximal scheme.
+// Update '*var' as FOBOS algorithm with fixed learning rate.
 //
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// prox_v = var - alpha * delta
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
-//
-//	lr_power: Scaling factor. Must be a scalar.
+//	alpha: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	delta: The change.
 //
 // Returns the created operation.
-func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
+func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9846,177 +9696,154 @@ func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrlV2",
+		Type: "ResourceApplyProximalGradientDescent",
 		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
+			var_, alpha, l1, l2, delta,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// SkipgramAttr is an optional argument to Skipgram.
-type SkipgramAttr func(optionalAttr)
-
-// SkipgramWindowSize sets the optional window_size attribute to value.
-//
-// value: The number of words to predict to the left and right of the target.
-// If not specified, defaults to 5
-func SkipgramWindowSize(value int64) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["window_size"] = value
-	}
-}
-
-// SkipgramMinCount sets the optional min_count attribute to value.
+// Computes the gradient for the sqrt of `x` wrt its input.
 //
-// value: The minimum number of word occurrences for it to be included in the
-// vocabulary.
-// If not specified, defaults to 5
-func SkipgramMinCount(value int64) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["min_count"] = value
+// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// SkipgramSubsample sets the optional subsample attribute to value.
-//
-// value: Threshold for word occurrence. Words that appear with higher
-// frequency will be randomly down-sampled. Set to 0 to disable.
-// If not specified, defaults to 0.001
-func SkipgramSubsample(value float32) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["subsample"] = value
+	opspec := tf.OpSpec{
+		Type: "SqrtGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Parses a text file and creates a batch of examples.
-//
-// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
+// Get the value of the tensor specified by its handle.
 //
 // Arguments:
-//	filename: The corpus's text file name.
-//	batch_size: The size of produced batch.
+//	handle: The handle for a tensor stored in the session state.
+//	dtype: The type of the output value.
 //
-// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
-func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
+// Returns The tensor for the given handle.
+func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "Skipgram",
-
+		Type: "GetSessionTensor",
+		Input: []tf.Input{
+			handle,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
+	return op.Output(0)
 }
 
-// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
-type ParameterizedTruncatedNormalAttr func(optionalAttr)
-
-// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
+// Returns x - y element-wise.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// *NOTE*: `Subtract` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+	opspec := tf.OpSpec{
+		Type: "Sub",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs random values from a normal distribution. The parameters may each be a
+// Computes softmax cross entropy cost and gradients to backpropagate.
 //
-// scalar which applies to the entire output, or a vector of length shape[0] which
-// stores the parameters for each batch.
+// Inputs are the logits, not probabilities.
 //
 // Arguments:
-//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
-//	means: The mean parameter of each batch.
-//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
-//	minvals: The minimum cutoff. May be -infinity.
-//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
-// for each batch.
+//	features: batch_size x num_classes matrix
+//	labels: batch_size x num_classes matrix
+// The caller must ensure that each batch of labels represents a valid
+// probability distribution.
 //
-// Returns A matrix of shape num_batches x samples_per_batch, filled with random
-// truncated normal values using the parameters for each row.
-func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ParameterizedTruncatedNormal",
+		Type: "SoftmaxCrossEntropyWithLogits",
 		Input: []tf.Input{
-			shape, means, stdevs, minvals, maxvals,
+			features, labels,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// RandomUniformIntAttr is an optional argument to RandomUniformInt.
-type RandomUniformIntAttr func(optionalAttr)
+// ReduceJoinAttr is an optional argument to ReduceJoin.
+type ReduceJoinAttr func(optionalAttr)
 
-// RandomUniformIntSeed sets the optional seed attribute to value.
+// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
+// value: If `True`, retain reduced dimensions with length `1`.
+// If not specified, defaults to false
+func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
+// ReduceJoinSeparator sets the optional separator attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
+// value: The separator to use when joining.
+// If not specified, defaults to ""
+func ReduceJoinSeparator(value string) ReduceJoinAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["separator"] = value
 	}
 }
 
-// Outputs random integers from a uniform distribution.
+// Joins a string Tensor across the given dimensions.
 //
-// The generated values are uniform integers in the range `[minval, maxval)`.
-// The lower bound `minval` is included in the range, while the upper bound
-// `maxval` is excluded.
+// Computes the string join across dimensions in the given string Tensor of shape
+// `[d_0, d_1, ..., d_n-1]`.  Returns a new Tensor created by joining the input
+// strings with the given separator (default: empty string).  Negative indices are
+// counted backwards from the end, with `-1` being equivalent to `n - 1`.
 //
-// The random integers are slightly biased unless `maxval - minval` is an exact
-// power of two.  The bias is small for values of `maxval - minval` significantly
-// smaller than the range of the output (either `2^32` or `2^64`).
+// For example:
+//
+// ```python
+// # tensor `a` is [["a", "b"], ["c", "d"]]
+// tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
+// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
+// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
+// tf.reduce_join(a, [0, 1]) ==> ["acbd"]
+// tf.reduce_join(a, [1, 0]) ==> ["abcd"]
+// tf.reduce_join(a, []) ==> ["abcd"]
+// ```
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	minval: 0-D.  Inclusive lower bound on the generated integers.
-//	maxval: 0-D.  Exclusive upper bound on the generated integers.
+//	inputs: The input to be joined.  All reduced indices must have non-zero size.
+//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
+// order specified.  Omitting `reduction_indices` is equivalent to passing
+// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
 //
-// Returns A tensor of the specified shape filled with uniform random integers.
-func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
+// Returns Has shape equal to that of the input with reduced dimensions removed or
+// set to `1` depending on `keep_dims`.
+func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10025,9 +9852,9 @@ func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomUniformInt",
+		Type: "ReduceJoin",
 		Input: []tf.Input{
-			shape, minval, maxval,
+			inputs, reduction_indices,
 		},
 		Attrs: attrs,
 	}
@@ -10035,358 +9862,396 @@ func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf
 	return op.Output(0)
 }
 
-// Convert JSON-encoded Example records to binary protocol buffer strings.
-//
-// This op translates a tensor containing Example records, encoded using
-// the [standard JSON
-// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
-// into a tensor containing the same records encoded as binary protocol
-// buffers. The resulting tensor can then be fed to any of the other
-// Example-parsing ops.
-//
-// Arguments:
-//	json_examples: Each string is a JSON object serialized according to the JSON
-// mapping of the Example proto.
-//
-// Returns Each string is a binary Example protocol buffer corresponding
-// to the respective element of `json_examples`.
-func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
+// Computes cos of x element-wise.
+func Cos(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeJSONExample",
+		Type: "Cos",
 		Input: []tf.Input{
-			json_examples,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Adds sparse updates to the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] += updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] += updates[i, ...]
+// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
+type FusedBatchNormGradAttr func(optionalAttr)
+
+// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
 //
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
+	return func(m optionalAttr) {
+		m["epsilon"] = value
+	}
+}
+
+// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
 //
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions add.
+// value: The data format for y_backprop, x, x_backprop.
+// Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
 //
-// Requires `updates.shape = indices.shape + ref.shape[1:]`.
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Gradient for batch normalization.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+//	y_backprop: A 4D Tensor for the gradient with respect to y.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
+// mean to be reused in gradient computation. When is_training is
+// False, a 1D Tensor for the population mean to be reused in both
+// 1st and 2nd order gradient computation.
+//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
+// variance (inverted variance in the cuDNN case) to be reused in
+// gradient computation. When is_training is False, a 1D Tensor
+// for the population variance to be reused in both 1st and 2nd
+// order gradient computation.
 //
-// Returns the created operation.
-func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
+// in FusedBatchNorm.
+func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterAdd",
+		Type: "FusedBatchNormGrad",
 		Input: []tf.Input{
-			resource, indices, updates,
+			y_backprop, x, scale, reserve_space_1, reserve_space_2,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Delete the TensorArray from its resource container.
+// TopKAttr is an optional argument to TopK.
+type TopKAttr func(optionalAttr)
+
+// TopKSorted sets the optional sorted attribute to value.
 //
-// This enables the user to close and release the resource in the middle
-// of a step/run.
+// value: If true the resulting `k` elements will be sorted by the values in
+// descending order.
+// If not specified, defaults to true
+func TopKSorted(value bool) TopKAttr {
+	return func(m optionalAttr) {
+		m["sorted"] = value
+	}
+}
+
+// Finds values and indices of the `k` largest elements for the last dimension.
+//
+// DEPRECATED at GraphDef version 7: Use TopKV2 instead
+//
+// If the input is a vector (rank-1), finds the `k` largest entries in the vector
+// and outputs their values and indices as vectors.  Thus `values[j]` is the
+// `j`-th largest entry in `input`, and its index is `indices[j]`.
+//
+// For matrices (resp. higher rank input), computes the top `k` entries in each
+// row (resp. vector along the last dimension).  Thus,
+//
+//     values.shape = indices.shape = input.shape[:-1] + [k]
+//
+// If two elements are equal, the lower-index element appears first.
+//
+// If `k` varies dynamically, use `TopKV2` below.
 //
 // Arguments:
-//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+//	input: 1-D or higher with last dimension at least `k`.
+//	k: Number of top elements to look for along the last dimension (along each
+// row for matrices).
 //
-// Returns the created operation.
-func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
+// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
+func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"k": k}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV3",
+		Type: "TopK",
 		Input: []tf.Input{
-			handle,
+			input,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// ResourceGatherAttr is an optional argument to ResourceGather.
-type ResourceGatherAttr func(optionalAttr)
-
-// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// Gather slices from the variable pointed to by `resource` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
-//
-// ```python
-//     # Scalar indices
-//     output[:, ..., :] = params[indices, :, ... :]
+// Transforms a Tensor into a serialized TensorProto proto.
 //
-//     # Vector indices
-//     output[i, :, ..., :] = params[indices[i], :, ... :]
+// Arguments:
+//	tensor: A Tensor of type `T`.
 //
-//     # Higher rank indices
-//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-// ```
-func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
+// Returns A serialized TensorProto proto of the input tensor.
+func SerializeTensor(scope *Scope, tensor tf.Output) (serialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceGather",
+		Type: "SerializeTensor",
 		Input: []tf.Input{
-			resource, indices,
+			tensor,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
-type QuantizedConv2DAttr func(optionalAttr)
+// MatrixSolveAttr is an optional argument to MatrixSolve.
+type MatrixSolveAttr func(optionalAttr)
 
-// QuantizedConv2DOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
+// MatrixSolveAdjoint sets the optional adjoint attribute to value.
+//
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+// adjoint.
+// If not specified, defaults to false
+func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["adjoint"] = value
 	}
 }
 
-// Computes a 2D convolution given quantized 4D input and filter tensors.
+// Solves systems of linear equations.
 //
-// The inputs are quantized tensors where the lowest value represents the real
-// number of the associated minimum, and the highest represents the maximum.
-// This means that you can only interpret the quantized output in the same way, by
-// taking the returned minimum and maximum values into account.
+// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
+// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
+// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `True` then each output matrix satisfies
+// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
 //
 // Arguments:
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
 //
-//	filter: filter's input_depth dimension must match input's depth dimensions.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_filter: The float value that the lowest quantized filter value represents.
-//	max_filter: The float value that the highest quantized filter value represents.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// Returns Shape is `[..., M, K]`.
+func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedConv2D",
+		Type: "MatrixSolve",
 		Input: []tf.Input{
-			input, filter, min_input, max_input, min_filter, max_filter,
+			matrix, rhs,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
-type QueueDequeueV2Attr func(optionalAttr)
-
-// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
-//
-// value: If the queue is empty, this operation will block for up to
-// timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
-	return func(m optionalAttr) {
-		m["timeout_ms"] = value
-	}
+	return op.Output(0)
 }
 
-// Dequeues a tuple of one or more tensors from the given queue.
+// Looks up keys in a table, outputs the corresponding values.
 //
-// This operation has k outputs, where k is the number of components
-// in the tuples stored in the given queue, and output i is the ith
-// component of the dequeued tuple.
+// The tensor `keys` must of the same type as the keys of the table.
+// The output `values` is of the type of the table values.
 //
-// N.B. If the queue is empty, this operation will block until an element
-// has been dequeued (or 'timeout_ms' elapses, if specified).
+// The scalar `default_value` is the value output for keys not present in the
+// table. It must also be of the same type as the table values.
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	component_types: The type of each component in a tuple.
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
+//
+// Returns Same shape as `keys`.  Values found in the table, or `default_values`
+// for missing keys.
+func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueV2",
+		Type: "LookupTableFindV2",
 		Input: []tf.Input{
-			handle,
+			table_handle, keys, default_value,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Inverse 3D fast Fourier transform.
+//
+// Computes the inverse 3-dimensional discrete Fourier transform over the
+// inner-most 3 dimensions of `input`.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifftn with 3 dimensions.
+// @end_compatibility
+func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueV2", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "IFFT3D",
+		Input: []tf.Input{
+			input,
+		},
 	}
-	return components
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ParseSingleSequenceExampleAttr is an optional argument to ParseSingleSequenceExample.
-type ParseSingleSequenceExampleAttr func(optionalAttr)
-
-// ParseSingleSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
+// Adds `bias` to `value`.
 //
-// value: A list of Ncontext_sparse types; the data types of data in
-// each context Feature given in context_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
+// This is a deprecated version of BiasAdd and will be soon removed.
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleContextSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["context_sparse_types"] = value
-	}
-}
-
-// ParseSingleSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
-// If not specified, defaults to <>
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["feature_list_dense_types"] = value
+// Arguments:
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
+//
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BiasAddV1",
+		Input: []tf.Input{
+			value, bias,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ParseSingleSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
+// Reverses specific dimensions of a tensor.
 //
-// value: A list of Ncontext_dense shapes; the shapes of data in
-// each context Feature given in context_dense_keys.
-// The number of elements in the Feature corresponding to context_dense_key[j]
-// must always equal context_dense_shapes[j].NumEntries().
-// The shape of context_dense_values[j] will match context_dense_shapes[j].
-// If not specified, defaults to <>
+// NOTE `tf.reverse` has now changed behavior in preparation for 1.0.
+// `tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0.
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleContextDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["context_dense_shapes"] = value
+// Given a `tensor`, and a `int32` tensor `axis` representing the set of
+// dimensions of `tensor` to reverse. This operation reverses each dimension
+// `i` for which there exists `j` s.t. `axis[j] == i`.
+//
+// `tensor` can have up to 8 dimensions. The number of dimensions specified
+// in `axis` may be 0 or more entries. If an index is specified more than
+// once, a InvalidArgument error is raised.
+//
+// For example:
+//
+// ```
+// # tensor 't' is [[[[ 0,  1,  2,  3],
+// #                  [ 4,  5,  6,  7],
+// #                  [ 8,  9, 10, 11]],
+// #                 [[12, 13, 14, 15],
+// #                  [16, 17, 18, 19],
+// #                  [20, 21, 22, 23]]]]
+// # tensor 't' shape is [1, 2, 3, 4]
+//
+// # 'dims' is [3] or 'dims' is [-1]
+// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
+//                         [ 7,  6,  5,  4],
+//                         [ 11, 10, 9, 8]],
+//                        [[15, 14, 13, 12],
+//                         [19, 18, 17, 16],
+//                         [23, 22, 21, 20]]]]
+//
+// # 'dims' is '[1]' (or 'dims' is '[-3]')
+// reverse(t, dims) ==> [[[[12, 13, 14, 15],
+//                         [16, 17, 18, 19],
+//                         [20, 21, 22, 23]
+//                        [[ 0,  1,  2,  3],
+//                         [ 4,  5,  6,  7],
+//                         [ 8,  9, 10, 11]]]]
+//
+// # 'dims' is '[2]' (or 'dims' is '[-2]')
+// reverse(t, dims) ==> [[[[8, 9, 10, 11],
+//                         [4, 5, 6, 7],
+//                         [0, 1, 2, 3]]
+//                        [[20, 21, 22, 23],
+//                         [16, 17, 18, 19],
+//                         [12, 13, 14, 15]]]]
+// ```
+//
+// Arguments:
+//	tensor: Up to 8-D.
+//	axis: 1-D. The indices of the dimensions to reverse. Must be in the range
+// `[-rank(tensor), rank(tensor))`.
+//
+// Returns The same shape as `tensor`.
+func ReverseV2(scope *Scope, tensor tf.Output, axis tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReverseV2",
+		Input: []tf.Input{
+			tensor, axis,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ParseSingleSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
-//
-// value: A list of Nfeature_list_sparse types; the data types
-// of data in each FeatureList given in feature_list_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+// RealAttr is an optional argument to Real.
+type RealAttr func(optionalAttr)
+
+// RealTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func RealTout(value tf.DataType) RealAttr {
 	return func(m optionalAttr) {
-		m["feature_list_sparse_types"] = value
+		m["Tout"] = value
 	}
 }
 
-// ParseSingleSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
+// Returns the real part of a complex number.
 //
-// value: A list of Nfeature_list_dense shapes; the shapes of
-// data in each FeatureList given in feature_list_dense_keys.
-// The shape of each Feature in the FeatureList corresponding to
-// feature_list_dense_key[j] must always equal
-// feature_list_dense_shapes[j].NumEntries().
-// If not specified, defaults to <>
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the real part of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
+//  part returned by this operation and *b* is the imaginary part.
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["feature_list_dense_shapes"] = value
-	}
-}
-
-// Transforms a scalar brain.SequenceExample proto (as strings) into typed tensors.
+// For example:
 //
-// Arguments:
-//	serialized: A scalar containing a binary serialized SequenceExample proto.
-//	feature_list_dense_missing_assumed_empty: A vector listing the
-// FeatureList keys which may be missing from the SequenceExample.  If the
-// associated FeatureList is missing, it is treated as empty.  By default,
-// any FeatureList not listed in this vector must exist in the SequenceExample.
-//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with context_sparse
-// values.
-//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' context features associated with
-// dense values.
-//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
-// (scalars).  The keys expected in the FeatureLists associated with sparse
-// values.
-//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' feature_lists associated
-// with lists of dense values.
-//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
-// context_dense_defaults[j] provides default values
-// when the SequenceExample's context map lacks context_dense_key[j].
-// If an empty Tensor is provided for context_dense_defaults[j],
-// then the Feature context_dense_keys[j] is required.
-// The input type is inferred from context_dense_defaults[j], even when it's
-// empty.  If context_dense_defaults[j] is not empty, its shape must match
-// context_dense_shapes[j].
-//	debug_name: A scalar containing the name of the serialized proto.
-// May contain, for example, table key (descriptive) name for the
-// corresponding serialized proto.  This is purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty scalar if no name is available.
-func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list_dense_missing_assumed_empty tf.Output, context_sparse_keys []tf.Output, context_dense_keys []tf.Output, feature_list_sparse_keys []tf.Output, feature_list_dense_keys []tf.Output, context_dense_defaults []tf.Output, debug_name tf.Output, optional ...ParseSingleSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output) {
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.real(input) ==> [-2.25, 3.25]
+// ```
+func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10395,105 +10260,65 @@ func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ParseSingleSequenceExample",
+		Type: "Real",
 		Input: []tf.Input{
-			serialized, feature_list_dense_missing_assumed_empty, tf.OutputList(context_sparse_keys), tf.OutputList(context_dense_keys), tf.OutputList(feature_list_sparse_keys), tf.OutputList(feature_list_dense_keys), tf.OutputList(context_dense_defaults), debug_name,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values
+	return op.Output(0)
 }
 
-// RandomGammaAttr is an optional argument to RandomGamma.
-type RandomGammaAttr func(optionalAttr)
+// AudioSummaryAttr is an optional argument to AudioSummary.
+type AudioSummaryAttr func(optionalAttr)
 
-// RandomGammaSeed sets the optional seed attribute to value.
+// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomGammaSeed(value int64) RandomGammaAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomGammaSeed2 sets the optional seed2 attribute to value.
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomGammaSeed2(value int64) RandomGammaAttr {
+// REQUIRES: value >= 1
+func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["max_outputs"] = value
 	}
 }
 
-// Outputs random values from the Gamma distribution(s) described by alpha.
+// Outputs a `Summary` protocol buffer with audio.
 //
-// This op uses the algorithm by Marsaglia et al. to acquire samples via
-// transformation-rejection from pairs of uniform and normal random variables.
-// See http://dl.acm.org/citation.cfm?id=358414
+// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
+//
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
 //
 // Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in alpha.
-//	alpha: A tensor in which each scalar is a "shape" parameter describing the
-// associated gamma distribution.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
 //
-// Returns A tensor with shape `shape + shape(alpha)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
-func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"sample_rate": sample_rate}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomGamma",
+		Type: "AudioSummary",
 		Input: []tf.Input{
-			shape, alpha,
+			tag, tensor,
 		},
 		Attrs: attrs,
 	}
@@ -10501,71 +10326,70 @@ func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...Ran
 	return op.Output(0)
 }
 
-// Returns the element-wise sum of a list of tensors.
+// QrAttr is an optional argument to Qr.
+type QrAttr func(optionalAttr)
+
+// QrFullMatrices sets the optional full_matrices attribute to value.
 //
-// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
-// wait for all of its inputs to be ready before beginning to sum. This can
-// save memory if inputs are ready at different times, since minimum temporary
-// storage is proportional to the output size rather than the inputs size.
+// value: If true, compute full-sized `q` and `r`. If false
+// (the default), compute only the leading `P` columns of `q`.
+// If not specified, defaults to false
+func QrFullMatrices(value bool) QrAttr {
+	return func(m optionalAttr) {
+		m["full_matrices"] = value
+	}
+}
+
+// Computes the QR decompositions of one or more matrices.
 //
-// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
+// Computes the QR decomposition of each inner matrix in `tensor` such that
+// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
 //
-// Returns a `Tensor` of same shape and type as the elements of `inputs`.
+// ```python
+// # a is a tensor.
+// # q is a tensor of orthonormal matrices.
+// # r is a tensor of upper triangular matrices.
+// q, r = qr(a)
+// q_full, r_full = qr(a, full_matrices=True)
+// ```
 //
 // Arguments:
-//	inputs: A list of `Tensor` objects, each with same shape and type.
-//	shape: Shape of elements of `inputs`.
-func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"shape": shape}
-	opspec := tf.OpSpec{
-		Type: "AccumulateNV2",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the gradient for the inverse of `x` wrt its input.
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
 //
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
+// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
+// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
+func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReciprocalGrad",
+		Type: "Qr",
 		Input: []tf.Input{
-			y, dy,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Creates a dataset with a range of values. Corresponds to python's xrange.
-//
-// Arguments:
-//	start: corresponds to start in python's xrange().
-//	stop: corresponds to stop in python's xrange().
-//	step: corresponds to step in python's xrange().
-//
-//
-func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Records the bytes size of each element of `input_dataset` in a StatsAggregator.
+func BytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "RangeDataset",
+		Type: "BytesProducedStatsDataset",
 		Input: []tf.Input{
-			start, stop, step,
+			input_dataset, tag,
 		},
 		Attrs: attrs,
 	}
@@ -10573,87 +10397,80 @@ func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output,
 	return op.Output(0)
 }
 
-// Saves tensors in V2 checkpoint format.
+// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
+type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
+
+// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-// By default, saves the named tensors in full.  If the caller wishes to save
-// specific slices of full tensors, "shape_and_slices" should be non-empty strings
-// and correspondingly well-formed.
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
+//
+// That is for rows we have grad for, we update var as follows:
+// prox_v = var - alpha * grad
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
 //
 // Arguments:
-//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
-// write the tensors.
-//	tensor_names: shape {N}. The names of the tensors to be saved.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
-// Empty strings indicate that they are non-partitioned tensors.
-//	tensors: `N` tensors to save.
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
 // Returns the created operation.
-func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
+func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SaveV2",
+		Type: "ResourceSparseApplyProximalGradientDescent",
 		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
+			var_, alpha, l1, l2, grad, indices,
 		},
+		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
-type MatrixTriangularSolveAttr func(optionalAttr)
-
-// MatrixTriangularSolveLower sets the optional lower attribute to value.
-//
-// value: Boolean indicating whether the innermost matrices in `matrix` are
-// lower or upper triangular.
-// If not specified, defaults to true
-func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
-	return func(m optionalAttr) {
-		m["lower"] = value
-	}
-}
+// MeanAttr is an optional argument to Mean.
+type MeanAttr func(optionalAttr)
 
-// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
-//
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-//          adjoint.
+// MeanKeepDims sets the optional keep_dims attribute to value.
 //
-// @compatibility(numpy)
-// Equivalent to np.linalg.triangular_solve
-// @end_compatibility
+// value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
+func MeanKeepDims(value bool) MeanAttr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Solves systems of linear equations with upper or lower triangular matrices by
-//
-// backsubstitution.
-//
-// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
-// square matrices. If `lower` is `True` then the strictly upper triangular part
-// of each inner-most matrix is assumed to be zero and not accessed.
-// If `lower` is False then the strictly lower triangular part of each inner-most
-// matrix is assumed to be zero and not accessed.
-// `rhs` is a tensor of shape `[..., M, K]`.
+// Computes the mean of elements across dimensions of a tensor.
 //
-// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
-// `True` then the innermost matrices in `output` satisfy matrix equations
-// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `False` then the strictly then the  innermost matrices in
-// `output` satisfy matrix equations
-// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns Shape is `[..., M, K]`.
-func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
+// Returns The reduced tensor.
+func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10662,9 +10479,9 @@ func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, option
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixTriangularSolve",
+		Type: "Mean",
 		Input: []tf.Input{
-			matrix, rhs,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -10672,366 +10489,358 @@ func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, option
 	return op.Output(0)
 }
 
-// Computes fingerprints of the input strings.
+// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2.
+type InitializeTableFromTextFileV2Attr func(optionalAttr)
+
+// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value.
+//
+// value: Number of elements of the file, use -1 if unknown.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr {
+	return func(m optionalAttr) {
+		m["vocab_size"] = value
+	}
+}
+
+// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value.
+//
+// value: Delimiter to separate fields in a line.
+// If not specified, defaults to "\t"
+func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr {
+	return func(m optionalAttr) {
+		m["delimiter"] = value
+	}
+}
+
+// Initializes a table from a text file.
+//
+// It inserts one key-value pair into the table for each line of the file.
+// The key and value is extracted from the whole line content, elements from the
+// split line based on `delimiter` or the line number (starting from zero).
+// Where to extract the key and value from a line is specified by `key_index` and
+// `value_index`.
+//
+// - A value of -1 means use the line number(starting from zero), expects `int64`.
+// - A value of -2 means use the whole line content, expects `string`.
+// - A value >= 0 means use the index (starting at zero) of the split line based
+//   on `delimiter`.
 //
 // Arguments:
-//	input: vector of strings to compute fingerprints on.
+//	table_handle: Handle to a table which will be initialized.
+//	filename: Filename of a vocabulary text file.
+//	key_index: Column index in a line to get the table `key` values from.
+//	value_index: Column index that represents information of a line to get the table
+// `value` values from.
 //
-// Returns a (N,2) shaped matrix where N is the number of elements in the input
-// vector. Each row contains the low and high parts of the fingerprint.
-func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns the created operation.
+func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SdcaFprint",
+		Type: "InitializeTableFromTextFileV2",
 		Input: []tf.Input{
-			input,
+			table_handle, filename,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// SparseMatMulAttr is an optional argument to SparseMatMul.
-type SparseMatMulAttr func(optionalAttr)
+// QuantizedReluAttr is an optional argument to QuantizedRelu.
+type QuantizedReluAttr func(optionalAttr)
 
-// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
+// QuantizedReluOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
 	return func(m optionalAttr) {
-		m["transpose_a"] = value
+		m["out_type"] = value
 	}
 }
 
-// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
+// Computes Quantized Rectified Linear: `max(features, 0)`
+//
+// Arguments:
+//
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedRelu",
+		Input: []tf.Input{
+			features, min_features, max_features,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
-// If not specified, defaults to false
-func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["a_is_sparse"] = value
+// Reshapes a SparseTensor to represent values in a new dense shape.
+//
+// This operation has the same semantics as reshape on the represented dense
+// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
+//
+// If one component of `new_shape` is the special value -1, the size of that
+// dimension is computed so that the total dense size remains constant.  At
+// most one component of `new_shape` can be -1.  The number of dense elements
+// implied by `new_shape` must be the same as the number of dense elements
+// originally implied by `input_shape`.
+//
+// Reshaping does not affect the order of values in the SparseTensor.
+//
+// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
+// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
+// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
+// `output_shape` has length `R_out`.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
+// SparseTensor.
+//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
+//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
+//
+// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
+// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
+// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
+// filled in.
+func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
-// If not specified, defaults to false
-func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["b_is_sparse"] = value
+	opspec := tf.OpSpec{
+		Type: "SparseReshape",
+		Input: []tf.Input{
+			input_indices, input_shape, new_shape,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// Multiply matrix "a" by matrix "b".
-//
-// The inputs must be two-dimensional matrices and the inner dimension of "a" must
-// match the outer dimension of "b". This op is optimized for the case where at
-// least one of "a" or "b" is sparse. The breakeven for using this versus a dense
-// matrix multiply on one platform was 30% zero values in the sparse matrix.
+// Deprecated. Use TensorArraySplitV3
 //
-// The gradient computation of this operation will only take advantage of sparsity
-// in the input gradient when that gradient comes from a Relu.
-func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3
+func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SparseMatMul",
+		Type: "TensorArraySplitV2",
 		Input: []tf.Input{
-			a, b,
+			handle, value, lengths, flow_in,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
-type SdcaOptimizerAttr func(optionalAttr)
+// PackAttr is an optional argument to Pack.
+type PackAttr func(optionalAttr)
 
-// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
+// PackAxis sets the optional axis attribute to value.
 //
-// value: Whether to use Adapative SDCA for the inner loop.
-// If not specified, defaults to false
-func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
+// value: Dimension along which to pack.  Negative values wrap around, so the
+// valid range is `[-(R+1), R+1)`.
+// If not specified, defaults to 0
+func PackAxis(value int64) PackAttr {
 	return func(m optionalAttr) {
-		m["adaptative"] = value
+		m["axis"] = value
 	}
 }
 
-// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
+// Packs a list of `N` rank-`R` tensors into one rank-`(R+1)` tensor.
 //
-// linear models with L1 + L2 regularization. As global optimization objective is
-// strongly-convex, the optimizer optimizes the dual objective at each step. The
-// optimizer applies each update one example at a time. Examples are sampled
-// uniformly, and the optimizer is learning rate free and enjoys linear convergence
-// rate.
+// Packs the `N` tensors in `values` into a tensor with rank one higher than each
+// tensor in `values`, by packing them along the `axis` dimension.
+// Given a list of tensors of shape `(A, B, C)`;
 //
-// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
-// Shai Shalev-Shwartz, Tong Zhang. 2012
+// if `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`.
+// if `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`.
+// Etc.
 //
-// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
+// For example:
 //
-// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
-// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
-// Peter Richtarik, Martin Takac. 2015
+// ```
+// # 'x' is [1, 4]
+// # 'y' is [2, 5]
+// # 'z' is [3, 6]
+// pack([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
+// pack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]]
+// ```
 //
-// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
-// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
+// This is the opposite of `unpack`.
 //
 // Arguments:
-//	sparse_example_indices: a list of vectors which contain example indices.
-//	sparse_feature_indices: a list of vectors which contain feature indices.
-//	sparse_feature_values: a list of vectors which contains feature value
-// associated with each feature group.
-//	dense_features: a list of matrices which contains the dense feature values.
-//	example_weights: a vector which contains the weight associated with each
-// example.
-//	example_labels: a vector which contains the label/target associated with each
-// example.
-//	sparse_indices: a list of vectors where each value is the indices which has
-// corresponding weights in sparse_weights. This field maybe omitted for the
-// dense approach.
-//	sparse_weights: a list of vectors where each value is the weight associated with
-// a sparse feature group.
-//	dense_weights: a list of vectors where the values are the weights associated
-// with a dense feature group.
-//	example_state_data: a list of vectors containing the example state data.
-//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
-// squared and hinge losses.
-//	l1: Symmetric l1 regularization strength.
-//	l2: Symmetric l2 regularization strength.
-//	num_loss_partitions: Number of partitions of the global loss function.
-//	num_inner_iterations: Number of iterations per mini-batch.
+//	values: Must be of same shape and type.
 //
-// Returns a list of vectors containing the updated example state
-// data.a list of vectors where each value is the delta
-// weights associated with a sparse feature group.a list of vectors where the values are the delta
-// weights associated with a dense feature group.
-func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
+// Returns The packed tensor.
+func Pack(scope *Scope, values []tf.Output, optional ...PackAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SdcaOptimizer",
+		Type: "Pack",
 		Input: []tf.Input{
-			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
+			tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	out_example_state_data = op.Output(idx)
-	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
-	}
-	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
-	}
-	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
+	return op.Output(0)
 }
 
-// Computes the minimum along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// Reorders a SparseTensor into the canonical, row-major ordering.
 //
-// Computes a tensor such that
-// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
-// that `segment_ids[j] == i`.
+// Note that by convention, all sparse ops preserve the canonical ordering along
+// increasing dimension number. The only time ordering can be violated is during
+// manual manipulation of the indices and values vectors to add entries.
 //
-// If the min is empty for a given segment ID `i`, `output[i] = 0`.
+// Reordering does not affect the shape of the SparseTensor.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
-// </div>
+// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
+// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
 //
 // Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
+// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
+func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMin",
+		Type: "SparseReorder",
 		Input: []tf.Input{
-			data, segment_ids,
+			input_indices, input_values, input_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QuantizedResizeBilinearAttr is an optional argument to QuantizedResizeBilinear.
-type QuantizedResizeBilinearAttr func(optionalAttr)
-
-// QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
+	return op.Output(0), op.Output(1)
 }
 
-// Resize quantized `images` to `size` using quantized bilinear interpolation.
-//
-// Input images and output images must be quantized types.
-//
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//
-//
-//
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min tf.Output, max tf.Output, optional ...QuantizedResizeBilinearAttr) (resized_images tf.Output, out_min tf.Output, out_max tf.Output) {
+// Computes rectified linear: `max(features, 0)`.
+func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedResizeBilinear",
+		Type: "Relu",
 		Input: []tf.Input{
-			images, size, min, max,
+			features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// RestoreAttr is an optional argument to Restore.
-type RestoreAttr func(optionalAttr)
+// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
+type ResourceApplyAddSignAttr func(optionalAttr)
 
-// RestorePreferredShard sets the optional preferred_shard attribute to value.
+// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
 //
-// value: Index of file to open first if multiple files match
-// `file_pattern`.
-// If not specified, defaults to -1
-func RestorePreferredShard(value int64) RestoreAttr {
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
 	return func(m optionalAttr) {
-		m["preferred_shard"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Restores a tensor from checkpoint files.
-//
-// Reads a tensor stored in one or several files. If there are several files (for
-// instance because a tensor was saved as slices), `file_pattern` may contain
-// wildcard symbols (`*` and `?`) in the filename portion only, not in the
-// directory portion.
-//
-// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
-// in which file the requested tensor is likely to be found. This op will first
-// open the file at index `preferred_shard` in the list of matching files and try
-// to restore tensors from that file.  Only if some tensors or tensor slices are
-// not found in that first file, then the Op opens all the files. Setting
-// `preferred_shard` to match the value passed as the `shard` input
-// of a matching `Save` Op may speed up Restore.  This attribute only affects
-// performance, not correctness.  The default value -1 means files are processed in
-// order.
+// Update '*var' according to the AddSign update.
 //
-// See also `RestoreSlice`.
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
+// variable <- variable - lr_t * update
 //
 // Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	dt: The type of the tensor to be restored.
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	alpha: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
+//	grad: The gradient.
 //
-// Returns The restored tensor.
-func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
+// Returns the created operation.
+func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dt": dt}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Restore",
+		Type: "ResourceApplyAddSign",
 		Input: []tf.Input{
-			file_pattern, tensor_name,
+			var_, m, lr, alpha, sign_decay, beta, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// WriteAudioSummaryAttr is an optional argument to WriteAudioSummary.
-type WriteAudioSummaryAttr func(optionalAttr)
+// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
+type FractionalMaxPoolGradAttr func(optionalAttr)
 
-// WriteAudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
+// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
 //
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
 //
-// REQUIRES: value >= 1
-func WriteAudioSummaryMaxOutputs(value int64) WriteAudioSummaryAttr {
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
 	return func(m optionalAttr) {
-		m["max_outputs"] = value
+		m["overlapping"] = value
 	}
 }
 
-// Writes a `Summary` protocol buffer with audio.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+// Computes gradient of the FractionalMaxPool function.
 //
 // Arguments:
-//	writer: A handle to a summary writer.
-//	step: The step to write the summary for.
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
+//	orig_input: Original input for `fractional_max_pool`
+//	orig_output: Original output for `fractional_max_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_max_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
 //
-// Returns the created operation.
-func WriteAudioSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...WriteAudioSummaryAttr) (o *tf.Operation) {
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
+func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11040,130 +10849,107 @@ func WriteAudioSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "WriteAudioSummary",
+		Type: "FractionalMaxPoolGrad",
 		Input: []tf.Input{
-			writer, step, tag, tensor, sample_rate,
+			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
-type FusedResizeAndPadConv2DAttr func(optionalAttr)
+// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
+type ResourceApplyAdagradDAAttr func(optionalAttr)
 
-// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
+// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1),
-// which exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
+func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
 	return func(m optionalAttr) {
-		m["resize_align_corners"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Performs a resize and padding as a preprocess during a convolution.
-//
-// It's often possible to do spatial transformations more efficiently as part of
-// the packing stage of a convolution, so this op allows for an optimized
-// implementation where these stages are fused together. This prevents the need to
-// write out the intermediate results as whole tensors, reducing memory pressure,
-// and we can get some latency gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and defaults to
-// 'NHWC' order.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
+// Update '*var' according to the proximal adagrad scheme.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
 //
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
-//	padding: The type of padding algorithm to use.
-func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedResizeAndPadConv2D",
+		Type: "ResourceApplyAdagradDA",
 		Input: []tf.Input{
-			input, size, paddings, filter,
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
-type DenseToSparseSetOperationAttr func(optionalAttr)
+// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
+type SparseReduceMaxSparseAttr func(optionalAttr)
 
-// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
+// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
-//
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+// Computes the max of elements across dimensions of a SparseTensor.
 //
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
+// SparseTensor.
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set2`
-// indices.
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
 // Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
-// max set size across `n-1` dimensions.
-//
-//
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DenseToSparseSetOperation",
+		Type: "SparseReduceMaxSparse",
 		Input: []tf.Input{
-			set1, set2_indices, set2_values, set2_shape,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
@@ -11171,102 +10957,134 @@ func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Out
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Delete the tensor specified by its handle in the session.
+// Creates a dataset that emits the outputs of `input_dataset` `count` times.
 //
 // Arguments:
-//	handle: The handle for a tensor stored in the session state.
 //
-// Returns the created operation.
-func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) {
+//	count: A scalar representing the number of times that `input_dataset` should
+// be repeated. A value of `-1` indicates that it should be repeated infinitely.
+//
+//
+func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "DeleteSessionTensor",
+		Type: "RepeatDataset",
 		Input: []tf.Input{
-			handle,
+			input_dataset, count,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation.
-type DenseToDenseSetOperationAttr func(optionalAttr)
+// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
+type AddManySparseToTensorsMapAttr func(optionalAttr)
 
-// DenseToDenseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperationAttr {
+// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
+//
+// value: The container name for the `SparseTensorsMap` created by this op.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["container"] = value
 	}
 }
 
-// Applies set operation along last dimension of 2 `Tensor` inputs.
+// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+//
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
+//
+// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`, where
+//
+// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
+//
+// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
+// having a first `sparse_indices` column taking values between `[0, N)`, where
+// the minibatch size `N == sparse_shape[0]`.
 //
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+// The input `SparseTensor` must have rank `R` greater than 1, and the first
+// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The stored
+// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
+// will have rank `R-1`.
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// The `SparseTensor` values can then be read out as part of a minibatch by passing
+// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddManySparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
 //
 // Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+// The minibatch size `N == sparse_shape[0]`.
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_operation string, optional ...DenseToDenseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns 1-D.  The handles of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.  Shape: `[N]`.
+func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DenseToDenseSetOperation",
+		Type: "AddManySparseToTensorsMap",
 		Input: []tf.Input{
-			set1, set2,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// SumAttr is an optional argument to Sum.
-type SumAttr func(optionalAttr)
+// MinAttr is an optional argument to Min.
+type MinAttr func(optionalAttr)
 
-// SumKeepDims sets the optional keep_dims attribute to value.
+// MinKeepDims sets the optional keep_dims attribute to value.
 //
 // value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func SumKeepDims(value bool) SumAttr {
+func MinKeepDims(value bool) MinAttr {
 	return func(m optionalAttr) {
 		m["keep_dims"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a tensor.
+// Computes the minimum of elements across dimensions of a tensor.
 //
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
+// Reduces `input` along the dimensions given in `axis`. Unless
 // `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+// `axis`. If `keep_dims` is true, the reduced dimensions are
 // retained with length 1.
 //
 // Arguments:
 //	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce. Must be in the range
+//	axis: The dimensions to reduce. Must be in the range
 // `[-rank(input), rank(input))`.
 //
 // Returns The reduced tensor.
-func Sum(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...SumAttr) (output tf.Output) {
+func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11275,9 +11093,9 @@ func Sum(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ..
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Sum",
+		Type: "Min",
 		Input: []tf.Input{
-			input, reduction_indices,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -11285,77 +11103,87 @@ func Sum(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ..
 	return op.Output(0)
 }
 
-// Computes the sign and the log of the absolute value of the determinant of
-//
-// one or more square matrices.
-//
-// The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
-// form square matrices. The outputs are two tensors containing the signs and
-// absolute values of the log determinants for all N input submatrices
-// `[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
-// The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
-// is the LU decomposition of the input and P is the corresponding
-// permutation matrix.
-//
-// Arguments:
-//	input: Shape is `[N, M, M]`.
+// Shuffle dimensions of x according to a permutation.
 //
-// Returns The signs of the log determinants of the inputs. Shape is `[N]`.The logs of the absolute values of the determinants
-// of the N input matrices.  Shape is `[N]`.
-func LogMatrixDeterminant(scope *Scope, input tf.Output) (sign tf.Output, log_abs_determinant tf.Output) {
+// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
+//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
+func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogMatrixDeterminant",
+		Type: "Transpose",
 		Input: []tf.Input{
-			input,
+			x, perm,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// SetSizeAttr is an optional argument to SetSize.
-type SetSizeAttr func(optionalAttr)
+// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
+type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
 
-// SetSizeValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SetSizeValidateIndices(value bool) SetSizeAttr {
+// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["data_format"] = value
 	}
 }
 
-// Number of unique elements along last dimension of input `set`.
-//
-// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
-// and `set_shape`. The last dimension contains values in a set, duplicates are
-// allowed but ignored.
+// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set`
-// indices.
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of depthwise convolution with respect to the filter.
 //
 // Arguments:
-//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
-//	set_values: 1D `Tensor`, values of a `SparseTensor`.
-//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
+//	input: 4-D with shape based on `data_format`.  For example, if
+// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
+// in_width, in_channels]` tensor.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
 //
-// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
-// `n-1` dimensions as `set`. Each value is the number of unique elements in
-// the corresponding `[0...n-1]` dimension of `set`.
-func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SetSize",
+		Type: "DepthwiseConv2dNativeBackpropFilter",
 		Input: []tf.Input{
-			set_indices, set_values, set_shape,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -11363,191 +11191,158 @@ func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shap
 	return op.Output(0)
 }
 
-// The gradient of SparseFillEmptyRows.
-//
-// Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
-// shaped `[N_full]`, where `N_full >= N` and copies data into either
-// `d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
-// `d_default_value` is a scalar.
-//
-//   d_values[j] = grad_values[reverse_index_map[j]]
-//   d_default_value = sum_{k : 0 .. N_full - 1} (
-//      grad_values[k] * 1{k not in reverse_index_map})
+// Flushes the writer's unwritten events.
 //
 // Arguments:
-//	reverse_index_map: 1-D.  The reverse index map from SparseFillEmptyRows.
-//	grad_values: 1-D.  The gradients from backprop.
+//	writer: A handle to the summary writer resource.
 //
-// Returns 1-D.  The backprop into values.0-D.  The backprop into default_value.
-func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_values tf.Output) (d_values tf.Output, d_default_value tf.Output) {
+// Returns the created operation.
+func FlushSummaryWriter(scope *Scope, writer tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseFillEmptyRowsGrad",
+		Type: "FlushSummaryWriter",
 		Input: []tf.Input{
-			reverse_index_map, grad_values,
+			writer,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Assigns a new value to a variable.
-//
-// Any ReadVariableOp with a control dependency on this op is guaranteed to return
-// this value or a subsequent newer value of the variable.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value to set the new tensor to use.
-//
-// Returns the created operation.
-func AssignVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
+// QuantizeV2Attr is an optional argument to QuantizeV2.
+type QuantizeV2Attr func(optionalAttr)
+
+// QuantizeV2Mode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func QuantizeV2Mode(value string) QuantizeV2Attr {
+	return func(m optionalAttr) {
+		m["mode"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "AssignVariableOp",
-		Input: []tf.Input{
-			resource, value,
-		},
+}
+
+// QuantizeV2RoundMode sets the optional round_mode attribute to value.
+// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
+func QuantizeV2RoundMode(value string) QuantizeV2Attr {
+	return func(m optionalAttr) {
+		m["round_mode"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Says whether the targets are in the top `K` predictions.
-//
-// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-// prediction for the target class is among the top `k` predictions among
-// all predictions for example `i`. Note that the behavior of `InTopK` differs
-// from the `TopK` op in its handling of ties; if multiple classes have the
-// same prediction value and straddle the top-`k` boundary, all of those
-// classes are considered to be in the top `k`.
+// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
 //
-// More formally, let
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.  The
+// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
+// when rounding float values to their quantized equivalents.
 //
-//   \\(predictions_i\\) be the predictions for all classes for example `i`,
-//   \\(targets_i\\) be the target class for example `i`,
-//   \\(out_i\\) be the output for example `i`,
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 //
-// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+// ```
+// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
+// if T == qint8, out[i] -= (range(T) + 1) / 2.0
+// ```
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
 //
-// Arguments:
-//	predictions: A `batch_size` x `classes` tensor.
-//	targets: A `batch_size` vector of class ids.
-//	k: Number of top elements to look at for computing precision.
+// *MIN_COMBINED Mode Example*
 //
-// Returns Computed precision at `k` as a `bool Tensor`.
-func InTopKV2(scope *Scope, predictions tf.Output, targets tf.Output, k tf.Output) (precision tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "InTopKV2",
-		Input: []tf.Input{
-			predictions, targets, k,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
-type TakeManySparseFromTensorsMapAttr func(optionalAttr)
-
-// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
+// Assume the input is type float and has a possible range of [0.0, 6.0] and the
+// output type is quint8 ([0, 255]). The min_range and max_range values should be
+// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
+// value of the input by 255/6 and cast to quint8.
 //
-// value: The container name for the `SparseTensorsMap` read by this op.
-// If not specified, defaults to ""
-func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
+// If the output type was qint8 ([-128, 127]), the operation will additionally
+// subtract each value by 128 prior to casting, so that the range of values aligns
+// with the range of qint8.
 //
-// value: The shared name for the `SparseTensorsMap` read by this op.
-// It should not be blank; rather the `shared_name` or unique Operation name
-// of the Op that created the original `SparseTensorsMap` should be used.
-// If not specified, defaults to ""
-func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
+// If the mode is 'MIN_FIRST', then this approach is used:
 //
-// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
-// `N` is the minibatch size and the rows correspond to the output handles of
-// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
-// original `SparseTensor` objects that went into the given input ops must all
-// match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension on the left).
+// ```
+// num_discrete_values = 1 << (# of bits in T)
+// range_adjust = num_discrete_values / (num_discrete_values - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = num_discrete_values / range
+// quantized = round(input * range_scale) - round(range_min * range_scale) +
+//   numeric_limits<T>::min()
+// quantized = max(quantized, numeric_limits<T>::min())
+// quantized = min(quantized, numeric_limits<T>::max())
+// ```
 //
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
+// The biggest difference between this and MIN_COMBINED is that the minimum range
+// is rounded first, before it's subtracted from the rounded value. With
+// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
+// and dequantizing will introduce a larger and larger error.
 //
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
+// *SCALED mode Example*
 //
-// For example, if the handles represent an input, which is a `[2, 3]` matrix
-// representing two original `SparseTensor` objects:
+// `SCALED` mode matches the quantization approach used in
+// `QuantizeAndDequantize{V2|V3}`.
 //
-// ```
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
+// If the mode is `SCALED`, we do not use the full range of the output type,
+// choosing to elide the lowest possible value for symmetry (e.g., output range is
+// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+// 0.
+//
+// We first find the range of values in our tensor. The
+// range we use is always centered on 0, so we find m such that
+// ```c++
+//   m = max(abs(input_min), abs(input_max))
 // ```
 //
-// and
+// Our input tensor range is then `[-m, m]`.
 //
+// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+// If T is signed, this is
 // ```
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
+//   num_bits = sizeof(T) * 8
+//   [min_fixed, max_fixed] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
 // ```
 //
-// then the final `SparseTensor` will be:
+// Otherwise, if T is unsigned, the fixed-point range is
+// ```
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
+// ```
 //
+// From this we compute our scaling factor, s:
+// ```c++
+//   s = (max_fixed - min_fixed) / (2 * m)
 // ```
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
+//
+// Now we can quantize the elements of our tensor:
+// ```c++
+// result = round(input * s)
 // ```
 //
+// One thing to watch out for is that the operator may choose to adjust the
+// requested minimum and maximum values slightly during the quantization process,
+// so you should always use the output ports as the range for further calculations.
+// For example, if the requested minimum and maximum values are close to equal,
+// they will be separated by a small epsilon value to prevent ill-formed quantized
+// buffers from being created. Otherwise, you can end up with buffers where all the
+// quantized values map to the same float value, which causes problems for
+// operations that have to perform further calculations on them.
+//
 // Arguments:
-//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
-// Shape: `[N]`.
-//	dtype: The `dtype` of the `SparseTensor` objects stored in the
-// `SparseTensorsMap`.
 //
-// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
-func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+//
+//
+// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
+func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"T": T}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TakeManySparseFromTensorsMap",
+		Type: "QuantizeV2",
 		Input: []tf.Input{
-			sparse_handles,
+			input, min_range, max_range,
 		},
 		Attrs: attrs,
 	}
@@ -11555,130 +11350,76 @@ func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap.
-type AddSparseToTensorsMapAttr func(optionalAttr)
-
-// AddSparseToTensorsMapContainer sets the optional container attribute to value.
-//
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value.
-//
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
-// If not specified, defaults to ""
-func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Add a `SparseTensor` to a `SparseTensorsMap` return its handle.
-//
-// A `SparseTensor` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`.
-//
-// This operator takes the given `SparseTensor` and adds it to a container
-// object (a `SparseTensorsMap`).  A unique key within this container is generated
-// in the form of an `int64`, and this is the value that is returned.
+// Component-wise divides a SparseTensor by a dense Tensor.
 //
-// The `SparseTensor` can then be read out as part of a minibatch by passing
-// the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddSparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
 //
-// Returns 0-D.  The handle of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.
-func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) {
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "AddSparseToTensorsMap",
+		Type: "SparseDenseCwiseDiv",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			sp_indices, sp_values, sp_shape, dense,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FusedBatchNormGradV2Attr is an optional argument to FusedBatchNormGradV2.
-type FusedBatchNormGradV2Attr func(optionalAttr)
+// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
+type ResourceApplyMomentumAttr func(optionalAttr)
 
-// FusedBatchNormGradV2Epsilon sets the optional epsilon attribute to value.
+// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormGradV2Epsilon(value float32) FusedBatchNormGradV2Attr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["epsilon"] = value
+		m["use_locking"] = value
 	}
 }
 
-// FusedBatchNormGradV2DataFormat sets the optional data_format attribute to value.
+// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-// value: The data format for y_backprop, x, x_backprop.
-// Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormGradV2DataFormat(value string) FusedBatchNormGradV2Attr {
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
+// If not specified, defaults to false
+func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["use_nesterov"] = value
 	}
 }
 
-// FusedBatchNormGradV2IsTraining sets the optional is_training attribute to value.
+// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormGradV2IsTraining(value bool) FusedBatchNormGradV2Attr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// Gradient for batch normalization.
+// want to use Nesterov momentum.
 //
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// accum = accum * momentum + grad
+// var -= lr * accum
 //
 // Arguments:
-//	y_backprop: A 4D Tensor for the gradient with respect to y.
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
-// mean to be reused in gradient computation. When is_training is
-// False, a 1D Tensor for the population mean to be reused in both
-// 1st and 2nd order gradient computation.
-//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
-// variance (inverted variance in the cuDNN case) to be reused in
-// gradient computation. When is_training is False, a 1D Tensor
-// for the population variance to be reused in both 1st and 2nd
-// order gradient computation.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
+//	momentum: Momentum. Must be a scalar.
 //
-// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
-// in FusedBatchNorm.
-func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradV2Attr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+// Returns the created operation.
+func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11687,239 +11428,188 @@ func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormGradV2",
+		Type: "ResourceApplyMomentum",
 		Input: []tf.Input{
-			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+			var_, accum, lr, grad, momentum,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return scope.AddOperation(opspec)
 }
 
-// Constructs a tensor by tiling a given tensor.
-//
-// This operation creates a new tensor by replicating `input` `multiples` times.
-// The output tensor's i'th dimension has `input.dims(i) * multiples[i]` elements,
-// and the values of `input` are replicated `multiples[i]` times along the 'i'th
-// dimension. For example, tiling `[a b c d]` by `[2]` produces
-// `[a b c d a b c d]`.
+// Returns the truth value of (x >= y) element-wise.
 //
-// Arguments:
-//	input: 1-D or higher.
-//	multiples: 1-D. Length must be the same as the number of dimensions in `input`
-func Tile(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
+// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Tile",
+		Type: "GreaterEqual",
 		Input: []tf.Input{
-			input, multiples,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the element-wise min of two SparseTensors.
-//
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
-//
-// Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
-//
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSparseMinimum",
-		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
-type AllCandidateSamplerAttr func(optionalAttr)
+// Conv3DAttr is an optional argument to Conv3D.
+type Conv3DAttr func(optionalAttr)
 
-// AllCandidateSamplerSeed sets the optional seed attribute to value.
+// Conv3DDataFormat sets the optional data_format attribute to value.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DDataFormat(value string) Conv3DAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["data_format"] = value
 	}
 }
 
-// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// Conv3DDilations sets the optional dilations attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DDilations(value []int64) Conv3DAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["dilations"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// In signal processing, cross-correlation is a measure of similarity of
+// two waveforms as a function of a time-lag applied to one of them. This
+// is also known as a sliding dot product or sliding inner-product.
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// Our Conv3D implements a form of cross-correlation.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to produce.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
+//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
+// out_channels]`. `in_channels` must match between `input` and `filter`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AllCandidateSampler",
+		Type: "Conv3D",
 		Input: []tf.Input{
-			true_classes,
+			input, filter,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
-type DecodeAndCropJpegAttr func(optionalAttr)
-
-// DecodeAndCropJpegChannels sets the optional channels attribute to value.
+// Adds up a SparseTensor and a dense Tensor, using these special rules:
 //
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
+// (1) Broadcasts the dense side to have the same shape as the sparse side, if
+//     eligible;
+// (2) Then, only the dense values pointed to by the indices of the SparseTensor
+//     participate in the cwise addition.
 //
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["ratio"] = value
-	}
-}
-
-// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+// By these rules, the result is a logical SparseTensor with exactly the same
+// indices and shape, but possibly with different non-zero values.  The output of
+// this Op is the resultant non-zero values.
 //
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
-// If not specified, defaults to true
-func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
-	}
-}
-
-// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
 //
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "SparseDenseCwiseAdd",
+		Input: []tf.Input{
+			sp_indices, sp_values, sp_shape, dense,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+// Read an element from the TensorArray into output `value`.
 //
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
+// Arguments:
+//	handle: The handle to a TensorArray.
+//
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
+//
+// Returns The tensor that is read from the TensorArray.
+func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayReadV3",
+		Input: []tf.Input{
+			handle, index, flow_in,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
+// EncodePngAttr is an optional argument to EncodePng.
+type EncodePngAttr func(optionalAttr)
+
+// EncodePngCompression sets the optional compression attribute to value.
 //
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
-// If not specified, defaults to ""
-func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
+// value: Compression level.
+// If not specified, defaults to -1
+func EncodePngCompression(value int64) EncodePngAttr {
 	return func(m optionalAttr) {
-		m["dct_method"] = value
+		m["compression"] = value
 	}
 }
 
-// Decode and Crop a JPEG-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
-//
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-//
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
-//
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
+// PNG-encode an image.
 //
+// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
+// where `channels` is:
 //
-// It is equivalent to a combination of decode and crop, but much faster by only
-// decoding partial jpeg image.
+// *   1: for grayscale.
+// *   2: for grayscale + alpha.
+// *   3: for RGB.
+// *   4: for RGBA.
+//
+// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
+// default or a value from 0 to 9.  9 is the highest compression level, generating
+// the smallest output, but is slower.
 //
 // Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
-//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
+//	image: 3-D with shape `[height, width, channels]`.
 //
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
+// Returns 0-D. PNG-encoded image.
+func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11928,9 +11618,9 @@ func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeAndCropJpeg",
+		Type: "EncodePng",
 		Input: []tf.Input{
-			contents, crop_window,
+			image,
 		},
 		Attrs: attrs,
 	}
@@ -11938,61 +11628,38 @@ func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output,
 	return op.Output(0)
 }
 
-// RandomPoissonV2Attr is an optional argument to RandomPoissonV2.
-type RandomPoissonV2Attr func(optionalAttr)
+// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
+type DataFormatVecPermuteAttr func(optionalAttr)
 
-// RandomPoissonV2Seed sets the optional seed attribute to value.
+// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr {
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["src_format"] = value
 	}
 }
 
-// RandomPoissonV2Seed2 sets the optional seed2 attribute to value.
+// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// RandomPoissonV2Dtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr {
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["dst_format"] = value
 	}
 }
 
-// Outputs random values from the Poisson distribution(s) described by rate.
-//
-// This op uses two algorithms, depending on rate. If rate >= 10, then
-// the algorithm by Hormann is used to acquire samples via
-// transformation-rejection.
-// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
+// Returns the permuted vector/tensor in the destination data format given the
 //
-// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
-// random variables.
-// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
-// Programming, Volume 2. Addison Wesley
+// one in the source data format.
 //
 // Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in rate.
-//	rate: A tensor in which each scalar is a "rate" parameter describing the
-// associated poisson distribution.
+//	x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
 //
-// Returns A tensor with shape `shape + shape(rate)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `rate[i0, i1, ...iN]`.
-func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) {
+// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format.
+func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12001,9 +11668,9 @@ func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomPoissonV2",
+		Type: "DataFormatVecPermute",
 		Input: []tf.Input{
-			shape, rate,
+			x,
 		},
 		Attrs: attrs,
 	}
@@ -12011,51 +11678,75 @@ func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...
 	return op.Output(0)
 }
 
-// OrderedMapPeekAttr is an optional argument to OrderedMapPeek.
-type OrderedMapPeekAttr func(optionalAttr)
+// Returns element-wise integer closest to x.
+//
+// If the result is midway between two representable values,
+// the even representable is chosen.
+// For example:
+//
+// ```
+// rint(-1.5) ==> -2.0
+// rint(0.5000001) ==> 1.0
+// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
+// ```
+func Rint(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Rint",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// OrderedMapPeekCapacity sets the optional capacity attribute to value.
+// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey.
+type OrderedMapUnstageNoKeyAttr func(optionalAttr)
+
+// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func OrderedMapPeekCapacity(value int64) OrderedMapPeekAttr {
+func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
 		m["capacity"] = value
 	}
 }
 
-// OrderedMapPeekMemoryLimit sets the optional memory_limit attribute to value.
+// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func OrderedMapPeekMemoryLimit(value int64) OrderedMapPeekAttr {
+func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
 		m["memory_limit"] = value
 	}
 }
 
-// OrderedMapPeekContainer sets the optional container attribute to value.
+// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func OrderedMapPeekContainer(value string) OrderedMapPeekAttr {
+func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// OrderedMapPeekSharedName sets the optional shared_name attribute to value.
+// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func OrderedMapPeekSharedName(value string) OrderedMapPeekAttr {
+func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Op peeks at the values at the specified key.  If the
+// Op removes and returns the (key, value) element with the smallest
 //
-// underlying container does not contain this key
-// this op will block until it does.   This Op is optimized for
-// performance.
-func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapPeekAttr) (values []tf.Output) {
+// key from the underlying container.   If the underlying container
+// does not contain elements, the op will block until it does.
+func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12064,9 +11755,9 @@ func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapPeek",
+		Type: "OrderedMapUnstageNoKey",
 		Input: []tf.Input{
-			key, indices,
+			indices,
 		},
 		Attrs: attrs,
 	}
@@ -12076,144 +11767,56 @@ func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.
 	}
 	var idx int
 	var err error
+	key = op.Output(idx)
 	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapPeek", err)
-		return
-	}
-	return values
-}
-
-// Adds two `SparseTensor` objects to produce another `SparseTensor`.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in standard
-// lexicographic order.  If this is not the case, before this step run
-// `SparseReorder` to restore index ordering.
-//
-// By default, if two values sum to zero at some index, the output `SparseTensor`
-// would still include that particular location in its index, storing a zero in the
-// corresponding value slot.  To override this, callers can specify `thresh`,
-// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
-// corresponding value and index would then not be included.  In particular,
-// `thresh == 0` (default) means everything is kept and actual thresholding happens
-// only for a positive value.
-//
-// In the following shapes, `nnz` is the count after taking `thresh` into account.
-//
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
-//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
-//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
-//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
-// pair takes space.
-func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseAdd",
-		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Computes the gradient of the sigmoid of `x` wrt its input.
-//
-// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
-// `dy` is the corresponding input gradient.
-func SigmoidGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SigmoidGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Subtracts a value from the current value of a variable.
-//
-// Any ReadVariableOp which depends directly or indirectly on this assign is
-// guaranteed to see the incremented value or a subsequent newer one.
-//
-// Outputs the incremented value, which can be used to totally order the
-// increments to this variable.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
-//
-// Returns the created operation.
-func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
+		scope.UpdateErr("OrderedMapUnstageNoKey", err)
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "AssignSubVariableOp",
-		Input: []tf.Input{
-			resource, value,
-		},
-	}
-	return scope.AddOperation(opspec)
+	return key, values
 }
 
-// SparseReduceMaxAttr is an optional argument to SparseReduceMax.
-type SparseReduceMaxAttr func(optionalAttr)
+// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
+type MaxPool3DGradGradAttr func(optionalAttr)
 
-// SparseReduceMaxKeepDims sets the optional keep_dims attribute to value.
+// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceMaxKeepDims(value bool) SparseReduceMaxAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["data_format"] = value
 	}
 }
 
-// Computes the max of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
-// instead of a sparse one.
-//
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
-//
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns `R-K`-D.  The reduced Tensor.
-func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxAttr) (output tf.Output) {
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceMax",
+		Type: "MaxPool3DGradGrad",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
@@ -12238,6 +11841,20 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
 	}
 }
 
+// Conv3DBackpropFilterV2Dilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
 // Computes the gradients of 3-D convolution with respect to the filter.
 //
 // Arguments:
@@ -12312,181 +11929,289 @@ func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.Dat
 	return outputs
 }
 
-// Computes numerical negative value element-wise.
+// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
+type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
+
+// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// I.e., \\(y = -x\\).
-func Neg(scope *Scope, x tf.Output) (y tf.Output) {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a learned unigram distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Neg",
+		Type: "ThreadUnsafeUnigramCandidateSampler",
 		Input: []tf.Input{
-			x,
+			true_classes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
-type SparseToSparseSetOperationAttr func(optionalAttr)
+// MaxPoolV2Attr is an optional argument to MaxPoolV2.
+type MaxPoolV2Attr func(optionalAttr)
 
-// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
+// MaxPoolV2DataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["data_format"] = value
 	}
 }
 
-// Applies set operation along last dimension of 2 `SparseTensor` inputs.
-//
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+// Performs max pooling on the input.
 //
-// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
-// order and range of `set1` and `set2` indices.
+// Arguments:
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
-// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// Returns The max pooled output tensor.
+func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPoolV2",
+		Input: []tf.Input{
+			input, ksize, strides,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Deprecated. Use TensorArrayReadV3
 //
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// DEPRECATED at GraphDef version 26: Use TensorArrayReadV3
+func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayReadV2",
+		Input: []tf.Input{
+			handle, index, flow_in,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Does nothing. Serves as a control trigger for scheduling.
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set1`
-// and `set2` indices.
+// Only useful as a placeholder for control edges.
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// Returns the created operation.
+func ControlTrigger(scope *Scope) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ControlTrigger",
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Batch normalization.
 //
-// Arguments:
-//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
-// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
-// max set size across `0...n-1` dimensions.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
-// max set size across `0...n-1` dimensions.
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
 //
+// This op is deprecated. Prefer `tf.nn.batch_normalization`.
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Arguments:
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "SparseToSparseSetOperation",
+		Type: "BatchNormWithGlobalNormalization",
 		Input: []tf.Input{
-			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
+			t, m, v, beta, gamma,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
+type MutableDenseHashTableV2Attr func(optionalAttr)
+
+// MutableDenseHashTableV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
 }
 
-// Elementwise computes the bitwise OR of `x` and `y`.
-//
-// The result will have those bits set, that are set in `x`, `y` or both. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BitwiseOr",
-		Input: []tf.Input{
-			x, y,
-		},
+// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// If not specified, defaults to false
+func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
-//
-// This Op does not require `a_indices` be sorted in standard lexicographic order.
+// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
 //
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
-//	b: `ndims`-D Tensor.  With shape `a_shape`.
-func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseAdd",
-		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
-		},
+// value: The shape of each value.
+// If not specified, defaults to <>
+func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["value_shape"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// AvgPoolAttr is an optional argument to AvgPool.
-type AvgPoolAttr func(optionalAttr)
+// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
+//
+// value: The initial number of hash table buckets. Must be a power
+// to 2.
+// If not specified, defaults to 131072
+func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["initial_num_buckets"] = value
+	}
+}
 
-// AvgPoolDataFormat sets the optional data_format attribute to value.
+// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolDataFormat(value string) AvgPoolAttr {
+// value: The maximum ratio between number of entries and number of
+// buckets before growing the table. Must be between 0 and 1.
+// If not specified, defaults to 0.8
+func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["max_load_factor"] = value
 	}
 }
 
-// Performs average pooling on the input.
+// Creates an empty hash table that uses tensors as the backing store.
 //
-// Each entry in `output` is the mean of the corresponding size `ksize`
-// window in `value`.
+// It uses "open addressing" with quadratic reprobing to resolve
+// collisions.
+//
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
 // Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	ksize: The size of the sliding window for each dimension of `value`.
-//	strides: The stride of the sliding window for each dimension of `value`.
-//	padding: The type of padding algorithm to use.
+//	empty_key: The key used to represent empty key buckets internally. Must not
+// be used in insert or lookup operations.
+//	value_dtype: Type of the table values.
 //
-// Returns The average pooled output tensor.
-func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) {
+// Returns Handle to a table.
+func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool",
+		Type: "MutableDenseHashTableV2",
 		Input: []tf.Input{
-			value,
+			empty_key,
 		},
 		Attrs: attrs,
 	}
@@ -12494,433 +12219,260 @@ func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padd
 	return op.Output(0)
 }
 
-// Slice a `SparseTensor` based on the `start` and `size`.
-//
-// For example, if the input is
-//
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
-//
-// Graphically the output tensors are:
-//
-//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
-//
-//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
+// Produces the max pool of the input tensor for quantized types.
 //
 // Arguments:
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-//	start: 1-D. tensor represents the start of the slice.
-//	size: 1-D. tensor represents the size of the slice.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
+//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	ksize: The size of the window for each dimension of the input tensor.
+// The length must be 4 to match the number of dimensions of the input.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. The length must be 4 to match the number of dimensions of the input.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "SparseSlice",
+		Type: "QuantizedMaxPool",
 		Input: []tf.Input{
-			indices, values, shape, start, size,
+			input, min_input, max_input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ListDiffAttr is an optional argument to ListDiff.
-type ListDiffAttr func(optionalAttr)
-
-// ListDiffOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func ListDiffOutIdx(value tf.DataType) ListDiffAttr {
-	return func(m optionalAttr) {
-		m["out_idx"] = value
-	}
-}
-
-// Computes the difference between two lists of numbers or strings.
-//
-// Given a list `x` and a list `y`, this operation returns a list `out` that
-// represents all values that are in `x` but not in `y`. The returned list `out`
-// is sorted in the same order that the numbers appear in `x` (duplicates are
-// preserved). This operation also returns a list `idx` that represents the
-// position of each `out` element in `x`. In other words:
-//
-// `out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`
-//
-// For example, given this input:
-//
-// ```
-// x = [1, 2, 3, 4, 5, 6]
-// y = [1, 3, 5]
-// ```
-//
-// This operation would return:
-//
-// ```
-// out ==> [2, 4, 6]
-// idx ==> [1, 3, 5]
-// ```
-//
-// Arguments:
-//	x: 1-D. Values to keep.
-//	y: 1-D. Values to remove.
-//
-// Returns 1-D. Values present in `x` but not in `y`.1-D. Positions of `x` values preserved in `out`.
-func ListDiff(scope *Scope, x tf.Output, y tf.Output, optional ...ListDiffAttr) (out tf.Output, idx tf.Output) {
+// Computes softplus: `log(exp(features) + 1)`.
+func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ListDiff",
+		Type: "Softplus",
 		Input: []tf.Input{
-			x, y,
+			features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Generates sparse cross from a list of sparse and dense tensors.
-//
-// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
-// representing features of one feature column. It outputs a 2D `SparseTensor` with
-// the batchwise crosses of these features.
-//
-// For example, if the inputs are
-//
-//     inputs[0]: SparseTensor with shape = [2, 2]
-//     [0, 0]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-//     inputs[1]: SparseTensor with shape = [2, 1]
-//     [0, 0]: "d"
-//     [1, 0]: "e"
-//
-//     inputs[2]: Tensor [["f"], ["g"]]
-//
-// then the output will be
-//
-//     shape = [2, 2]
-//     [0, 0]: "a_X_d_X_f"
-//     [1, 0]: "b_X_e_X_g"
-//     [1, 1]: "c_X_e_X_g"
-//
-// if hashed_output=true then the output will be
-//
-//     shape = [2, 2]
-//     [0, 0]: FingerprintCat64(
-//                 Fingerprint64("f"), FingerprintCat64(
-//                     Fingerprint64("d"), Fingerprint64("a")))
-//     [1, 0]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("b")))
-//     [1, 1]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("c")))
-//
-// Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.   values of each `SparseTensor`.
-//	shapes: 1-D.   Shapes of each `SparseTensor`.
-//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
-//	hashed_output: If true, returns the hash of the cross instead of the string.
-// This will allow us avoiding string manipulations.
-//	num_buckets: It is used if hashed_output is true.
-// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
-//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
-// function to combine the crosses fingerprints.
-//
-//
-//
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
-// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Computes exponential of x - 1 element-wise.
+//
+// I.e., \\(y = (\exp x) - 1\\).
+func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
 	opspec := tf.OpSpec{
-		Type: "SparseCross",
+		Type: "Expm1",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// FractionalMaxPoolAttr is an optional argument to FractionalMaxPool.
-type FractionalMaxPoolAttr func(optionalAttr)
-
-// FractionalMaxPoolPseudoRandom sets the optional pseudo_random attribute to value.
+// Returns the number of records this Reader has produced.
 //
-// value: When set to True, generates the pooling sequence in a
-// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
-// difference between pseudorandom and random.
-// If not specified, defaults to false
-func FractionalMaxPoolPseudoRandom(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["pseudo_random"] = value
+// This is the same as the number of ReaderRead executions that have
+// succeeded.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderNumRecordsProducedV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalMaxPoolOverlapping sets the optional overlapping attribute to value.
+// Computes the sum along segments of a tensor.
 //
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// `index  0  1  2  3  4`
+// Computes a tensor such that
+// \\(output_i = \sum_j data_j\\) where sum is over `j` such
+// that `segment_ids[j] == i`.
 //
-// `value  20 5  16 3  7`
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
-// If not specified, defaults to false
-func FractionalMaxPoolOverlapping(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
-	}
-}
-
-// FractionalMaxPoolDeterministic sets the optional deterministic attribute to value.
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
+// </div>
 //
-// value: When set to True, a fixed pooling region will be used when
-// iterating over a FractionalMaxPool node in the computation graph. Mainly used
-// in unit test to make FractionalMaxPool deterministic.
-// If not specified, defaults to false
-func FractionalMaxPoolDeterministic(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["deterministic"] = value
-	}
-}
-
-// FractionalMaxPoolSeed sets the optional seed attribute to value.
+// Arguments:
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func FractionalMaxPoolSeed(value int64) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// FractionalMaxPoolSeed2 sets the optional seed2 attribute to value.
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func FractionalMaxPoolSeed2(value int64) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SegmentSum",
+		Input: []tf.Input{
+			data, segment_ids,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Performs fractional max pooling on the input.
-//
-// Fractional max pooling is slightly different than regular max pooling.  In
-// regular max pooling, you downsize an input set by taking the maximum value of
-// smaller N x N subsections of the set (often 2x2), and try to reduce the set by
-// a factor of N, where N is an integer.  Fractional max pooling, as you might
-// expect from the word "fractional", means that the overall reduction ratio N
-// does not have to be an integer.
-//
-// The sizes of the pooling regions are generated randomly but are fairly uniform.
-// For example, let's look at the height dimension, and the constraints on the
-// list of rows that will be pool boundaries.
-//
-// First we define the following:
-//
-// 1.  input_row_length : the number of rows from the input set
-// 2.  output_row_length : which will be smaller than the input
-// 3.  alpha = input_row_length / output_row_length : our reduction ratio
-// 4.  K = floor(alpha)
-// 5.  row_pooling_sequence : this is the result list of pool boundary rows
-//
-// Then, row_pooling_sequence should satisfy:
-//
-// 1.  a[0] = 0 : the first value of the sequence is 0
-// 2.  a[end] = input_row_length : the last value of the sequence is the size
-// 3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
-// 4.  length(row_pooling_sequence) = output_row_length+1
-//
-// For more details on fractional max pooling, see this paper:
-// [Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+// Creates a dataset that emits the lines of one or more text files.
 //
 // Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
-// supports row and col dimension and should be >= 1.0. For example, a valid
-// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
-// must be 1.0 because we don't allow pooling on batch and channels
-// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
-// respectively.
-//
-// Returns output tensor after fractional max pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
-func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalMaxPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+//	compression_type: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	buffer_size: A scalar containing the number of bytes to buffer.
+func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FractionalMaxPool",
+		Type: "TextLineDataset",
 		Input: []tf.Input{
-			value,
+			filenames, compression_type, buffer_size,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Concatenates a list of `SparseTensor` along the specified dimension.
-//
-// Concatenation is with respect to the dense versions of these sparse tensors.
-// It is assumed that each input is a `SparseTensor` whose elements are ordered
-// along increasing dimension number.
-//
-// All inputs' shapes must match, except for the concat dimension.  The
-// `indices`, `values`, and `shapes` lists must have the same length.
-//
-// The output shape is identical to the inputs', except along the concat
-// dimension, where it is the sum of the inputs' sizes along that dimension.
-//
-// The output elements will be resorted to preserve the sort order along
-// increasing dimension number.
-//
-// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
-// values across all inputs. This is due to the need for an internal sort in
-// order to concatenate efficiently across an arbitrary dimension.
-//
-// For example, if `concat_dim = 1` and the inputs are
-//
-//     sp_inputs[0]: shape = [2, 3]
-//     [0, 2]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-//     sp_inputs[1]: shape = [2, 4]
-//     [0, 1]: "d"
-//     [0, 2]: "e"
-//
-// then the output will be
-//
-//     shape = [2, 7]
-//     [0, 2]: "a"
-//     [0, 4]: "d"
-//     [0, 5]: "e"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-// Graphically this is equivalent to doing
-//
-//     [    a] concat [  d e  ] = [    a   d e  ]
-//     [b c  ]        [       ]   [b c          ]
+// Checks whether a resource handle-based variable has been initialized.
 //
 // Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.  Non-empty values of each `SparseTensor`.
-//	shapes: 1-D.  Shapes of each `SparseTensor`.
-//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
-// where rank is the number of dimensions in each input `SparseTensor`.
+//	resource: the input resource handle.
 //
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Returns a scalar boolean which is true if the variable has been
+// initialized.
+func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"concat_dim": concat_dim}
 	opspec := tf.OpSpec{
-		Type: "SparseConcat",
+		Type: "VarIsInitializedOp",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
+			resource,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Performs a padding as a preprocess during a convolution.
+// Pads a tensor with zeros.
 //
-// Similar to FusedResizeAndPadConv2d, this op allows for an optimized
-// implementation where the spatial padding transformation stage is fused with the
-// im2col lookup, but in this case without the bilinear filtering required for
-// resizing. Fusing the padding prevents the need to write out the intermediate
-// results as whole tensors, reducing memory pressure, and we can get some latency
-// gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
-// order is used instead.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
+// This operation pads a `input` with zeros according to the `paddings` you
+// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
+// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many zeros to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
+// in that dimension.
 //
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
+// The padded size of each dimension D of the output is:
 //
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
-//	padding: The type of padding algorithm to use.
-func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string) (output tf.Output) {
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+//
+// For example:
+//
+// ```
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
+// ```
+func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "FusedPadConv2D",
+		Type: "Pad",
 		Input: []tf.Input{
-			input, paddings, filter,
+			input, paddings,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns immutable tensor from memory region.
+// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
+type SparseTensorDenseMatMulAttr func(optionalAttr)
+
+// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
 //
-// The current implementation memmaps the tensor from a file.
+// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
+// is transpose(conj(A)).  Otherwise it's transpose(A).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_a"] = value
+	}
+}
+
+// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
+//
+// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
+// is transpose(conj(B)).  Otherwise it's transpose(B).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_b"] = value
+	}
+}
+
+// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
+//
+// No validity checking is performed on the indices of A.  However, the following
+// input format is recommended for optimal behavior:
+//
+// if adjoint_a == false:
+//   A should be sorted in lexicographically increasing order.  Use SparseReorder
+//   if you're not sure.
+// if adjoint_a == true:
+//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
+//   order instead of "row major" order).
 //
 // Arguments:
-//	dtype: Type of the returned tensor.
-//	shape: Shape of the returned tensor.
-//	memory_region_name: Name of readonly memory region used by the tensor, see
-// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
-func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
+//	b: 2-D.  A dense Matrix.
+func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ImmutableConst",
-
+		Type: "SparseTensorDenseMatMul",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
@@ -12991,282 +12543,416 @@ func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.D
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
-type SparseTensorDenseMatMulAttr func(optionalAttr)
+// StringJoinAttr is an optional argument to StringJoin.
+type StringJoinAttr func(optionalAttr)
 
-// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
+// StringJoinSeparator sets the optional separator attribute to value.
 //
-// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
-// is transpose(conj(A)).  Otherwise it's transpose(A).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
+// value: string, an optional join separator.
+// If not specified, defaults to ""
+func StringJoinSeparator(value string) StringJoinAttr {
 	return func(m optionalAttr) {
-		m["adjoint_a"] = value
+		m["separator"] = value
 	}
 }
 
-// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
+// Joins the strings in the given list of string tensors into one tensor;
 //
-// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
-// is transpose(conj(B)).  Otherwise it's transpose(B).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
-	return func(m optionalAttr) {
-		m["adjoint_b"] = value
+// with the given separator (default is an empty separator).
+//
+// Arguments:
+//	inputs: A list of string tensors.  The tensors must all have the same shape,
+// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
+// of non-scalar inputs.
+func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StringJoin",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
+// Returns immutable tensor from memory region.
 //
-// No validity checking is performed on the indices of A.  However, the following
-// input format is recommended for optimal behavior:
+// The current implementation memmaps the tensor from a file.
 //
-// if adjoint_a == false:
-//   A should be sorted in lexicographically increasing order.  Use SparseReorder
-//   if you're not sure.
-// if adjoint_a == true:
-//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
-//   order instead of "row major" order).
+// Arguments:
+//	dtype: Type of the returned tensor.
+//	shape: Shape of the returned tensor.
+//	memory_region_name: Name of readonly memory region used by the tensor, see
+// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
+func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
+	opspec := tf.OpSpec{
+		Type: "ImmutableConst",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Inverse real-valued fast Fourier transform.
+//
+// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most dimension of `input`.
+//
+// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
+// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
+// `fft_length` is not provided, it is computed from the size of the inner-most
+// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
+// compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
+//
+// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
+// than the corresponding dimension of `input`, the dimension is cropped. If it is
+// larger, the dimension is padded with zeros.
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
-//	b: 2-D.  A dense Matrix.
-func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
+//
+// Returns A float32 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length` samples of its inverse
+//   1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft
+// @end_compatibility
+func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+	opspec := tf.OpSpec{
+		Type: "IRFFT",
+		Input: []tf.Input{
+			input, fft_length,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Concatenates a list of `SparseTensor` along the specified dimension.
+//
+// Concatenation is with respect to the dense versions of these sparse tensors.
+// It is assumed that each input is a `SparseTensor` whose elements are ordered
+// along increasing dimension number.
+//
+// All inputs' shapes must match, except for the concat dimension.  The
+// `indices`, `values`, and `shapes` lists must have the same length.
+//
+// The output shape is identical to the inputs', except along the concat
+// dimension, where it is the sum of the inputs' sizes along that dimension.
+//
+// The output elements will be resorted to preserve the sort order along
+// increasing dimension number.
+//
+// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
+// values across all inputs. This is due to the need for an internal sort in
+// order to concatenate efficiently across an arbitrary dimension.
+//
+// For example, if `concat_dim = 1` and the inputs are
+//
+//     sp_inputs[0]: shape = [2, 3]
+//     [0, 2]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+//     sp_inputs[1]: shape = [2, 4]
+//     [0, 1]: "d"
+//     [0, 2]: "e"
+//
+// then the output will be
+//
+//     shape = [2, 7]
+//     [0, 2]: "a"
+//     [0, 4]: "d"
+//     [0, 5]: "e"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+// Graphically this is equivalent to doing
+//
+//     [    a] concat [  d e  ] = [    a   d e  ]
+//     [b c  ]        [       ]   [b c          ]
+//
+// Arguments:
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.  Non-empty values of each `SparseTensor`.
+//	shapes: 1-D.  Shapes of each `SparseTensor`.
+//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
+// where rank is the number of dimensions in each input `SparseTensor`.
+//
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	attrs := map[string]interface{}{"concat_dim": concat_dim}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseMatMul",
+		Type: "SparseConcat",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// WriteImageSummaryAttr is an optional argument to WriteImageSummary.
-type WriteImageSummaryAttr func(optionalAttr)
-
-// WriteImageSummaryMaxImages sets the optional max_images attribute to value.
-//
-// value: Max number of batch elements to generate images for.
-// If not specified, defaults to 3
+// Generates sparse cross from a list of sparse and dense tensors.
 //
-// REQUIRES: value >= 1
-func WriteImageSummaryMaxImages(value int64) WriteImageSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_images"] = value
-	}
-}
-
-// Writes a `Summary` protocol buffer with images.
+// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
+// representing features of one feature column. It outputs a 2D `SparseTensor` with
+// the batchwise crosses of these features.
 //
-// The summary has up to `max_images` summary values containing images. The
-// images are built from `tensor` which must be 4-D with shape `[batch_size,
-// height, width, channels]` and where `channels` can be:
+// For example, if the inputs are
 //
-// *  1: `tensor` is interpreted as Grayscale.
-// *  3: `tensor` is interpreted as RGB.
-// *  4: `tensor` is interpreted as RGBA.
+//     inputs[0]: SparseTensor with shape = [2, 2]
+//     [0, 0]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
 //
-// The images have the same number of channels as the input tensor. For float
-// input, the values are normalized one image at a time to fit in the range
-// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
-// normalization algorithms:
+//     inputs[1]: SparseTensor with shape = [2, 1]
+//     [0, 0]: "d"
+//     [1, 0]: "e"
 //
-// *  If the input values are all positive, they are rescaled so the largest one
-//    is 255.
+//     inputs[2]: Tensor [["f"], ["g"]]
 //
-// *  If any input value is negative, the values are shifted so input value 0.0
-//    is at 127.  They are then rescaled so that either the smallest value is 0,
-//    or the largest one is 255.
+// then the output will be
 //
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
+//     shape = [2, 2]
+//     [0, 0]: "a_X_d_X_f"
+//     [1, 0]: "b_X_e_X_g"
+//     [1, 1]: "c_X_e_X_g"
 //
-// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
-// *  If `max_images` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+// if hashed_output=true then the output will be
 //
-// The `bad_color` argument is the color to use in the generated images for
-// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
-// Each element must be in the range `[0, 255]` (It represents the value of a
-// pixel in the output image).  Non-finite values in the input tensor are
-// replaced by this tensor in the output image.  The default value is the color
-// red.
+//     shape = [2, 2]
+//     [0, 0]: FingerprintCat64(
+//                 Fingerprint64("f"), FingerprintCat64(
+//                     Fingerprint64("d"), Fingerprint64("a")))
+//     [1, 0]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("b")))
+//     [1, 1]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("c")))
 //
 // Arguments:
-//	writer: A handle to a summary writer.
-//	step: The step to write the summary for.
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
-// `channels` is 1, 3, or 4.
-//	bad_color: Color to use for pixels with non-finite values.
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.   values of each `SparseTensor`.
+//	shapes: 1-D.   Shapes of each `SparseTensor`.
+//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
+//	hashed_output: If true, returns the hash of the cross instead of the string.
+// This will allow us avoiding string manipulations.
+//	num_buckets: It is used if hashed_output is true.
+// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
+//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
+// function to combine the crosses fingerprints.
 //
-// Returns the created operation.
-func WriteImageSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, tensor tf.Output, bad_color tf.Output, optional ...WriteImageSummaryAttr) (o *tf.Operation) {
+//
+//
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
+// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
 	opspec := tf.OpSpec{
-		Type: "WriteImageSummary",
+		Type: "SparseCross",
 		Input: []tf.Input{
-			writer, step, tag, tensor, bad_color,
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Pads a tensor with zeros.
+// ListDiffAttr is an optional argument to ListDiff.
+type ListDiffAttr func(optionalAttr)
+
+// ListDiffOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func ListDiffOutIdx(value tf.DataType) ListDiffAttr {
+	return func(m optionalAttr) {
+		m["out_idx"] = value
+	}
+}
+
+// Computes the difference between two lists of numbers or strings.
 //
-// This operation pads a `input` with zeros according to the `paddings` you
-// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
-// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many zeros to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
-// in that dimension.
+// Given a list `x` and a list `y`, this operation returns a list `out` that
+// represents all values that are in `x` but not in `y`. The returned list `out`
+// is sorted in the same order that the numbers appear in `x` (duplicates are
+// preserved). This operation also returns a list `idx` that represents the
+// position of each `out` element in `x`. In other words:
 //
-// The padded size of each dimension D of the output is:
+// `out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`
 //
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+// For example, given this input:
 //
-// For example:
+// ```
+// x = [1, 2, 3, 4, 5, 6]
+// y = [1, 3, 5]
+// ```
+//
+// This operation would return:
 //
 // ```
-// # 't' is [[1, 1], [2, 2]]
-// # 'paddings' is [[1, 1], [2, 2]]
-// # rank of 't' is 2
-// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-//                       [0, 0, 1, 1, 0, 0]
-//                       [0, 0, 2, 2, 0, 0]
-//                       [0, 0, 0, 0, 0, 0]]
+// out ==> [2, 4, 6]
+// idx ==> [1, 3, 5]
 // ```
-func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Pad",
-		Input: []tf.Input{
-			input, paddings,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the number of elements in the given queue.
 //
 // Arguments:
-//	handle: The handle to a queue.
+//	x: 1-D. Values to keep.
+//	y: 1-D. Values to remove.
 //
-// Returns The number of elements in the given queue.
-func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
+// Returns 1-D. Values present in `x` but not in `y`.1-D. Positions of `x` values preserved in `out`.
+func ListDiff(scope *Scope, x tf.Output, y tf.Output, optional ...ListDiffAttr) (out tf.Output, idx tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "QueueSizeV2",
+		Type: "ListDiff",
 		Input: []tf.Input{
-			handle,
+			x, y,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Outputs a `Summary` protocol buffer with a histogram.
-//
-// The generated
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// has one summary value containing a histogram for `values`.
+// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
 //
-// This op reports an `InvalidArgument` error if any value is not finite.
+// This Op does not require `a_indices` be sorted in standard lexicographic order.
 //
 // Arguments:
-//	tag: Scalar.  Tag to use for the `Summary.Value`.
-//	values: Any shape. Values to use to build the histogram.
-//
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
+//	b: `ndims`-D Tensor.  With shape `a_shape`.
+func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "HistogramSummary",
+		Type: "SparseTensorDenseAdd",
 		Input: []tf.Input{
-			tag, values,
+			a_indices, a_values, a_shape, b,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that emits the lines of one or more text files.
-//
-// Arguments:
-//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
-// read.
-//	compression_type: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//	buffer_size: A scalar containing the number of bytes to buffer.
-func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TextLineDataset",
-		Input: []tf.Input{
-			filenames, compression_type, buffer_size,
-		},
+// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
+type SparseToSparseSetOperationAttr func(optionalAttr)
+
+// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the number of records this Reader has produced.
+// Applies set operation along last dimension of 2 `SparseTensor` inputs.
+//
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+//
+// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
+// order and range of `set1` and `set2` indices.
+//
+// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
+// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set1`
+// and `set2` indices.
 //
-// This is the same as the number of ReaderRead executions that have
-// succeeded.
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
+//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
+// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"set_operation": set_operation}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReaderNumRecordsProducedV2",
+		Type: "SparseToSparseSetOperation",
 		Input: []tf.Input{
-			reader_handle,
+			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes exponential of x - 1 element-wise.
+// Computes numerical negative value element-wise.
 //
-// I.e., \\(y = (\exp x) - 1\\).
-func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
+// I.e., \\(y = -x\\).
+func Neg(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Expm1",
+		Type: "Neg",
 		Input: []tf.Input{
 			x,
 		},
@@ -13275,134 +12961,118 @@ func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Batch normalization.
+// Writes a `Summary` protocol buffer with a histogram.
 //
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+// The generated
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// has one summary value containing a histogram for `values`.
 //
-// This op is deprecated. Prefer `tf.nn.batch_normalization`.
+// This op reports an `InvalidArgument` error if any value is not finite.
 //
 // Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	beta: A 1D beta Tensor with size matching the last dimension of t.
-// An offset to be added to the normalized tensor.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this tensor will be multiplied
-// with the normalized tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
+//	writer: A handle to a summary writer.
+//	step: The step to write the summary for.
+//	tag: Scalar.  Tag to use for the `Summary.Value`.
+//	values: Any shape. Values to use to build the histogram.
+//
+// Returns the created operation.
+func WriteHistogramSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalization",
+		Type: "WriteHistogramSummary",
 		Input: []tf.Input{
-			t, m, v, beta, gamma,
+			writer, step, tag, values,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// MaxPoolV2Attr is an optional argument to MaxPoolV2.
-type MaxPoolV2Attr func(optionalAttr)
-
-// MaxPoolV2DataFormat sets the optional data_format attribute to value.
+// Adds two `SparseTensor` objects to produce another `SparseTensor`.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs max pooling on the input.
+// The input `SparseTensor` objects' indices are assumed ordered in standard
+// lexicographic order.  If this is not the case, before this step run
+// `SparseReorder` to restore index ordering.
 //
-// Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+// By default, if two values sum to zero at some index, the output `SparseTensor`
+// would still include that particular location in its index, storing a zero in the
+// corresponding value slot.  To override this, callers can specify `thresh`,
+// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
+// corresponding value and index would then not be included.  In particular,
+// `thresh == 0` (default) means everything is kept and actual thresholding happens
+// only for a positive value.
 //
-// Returns The max pooled output tensor.
-func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
+// In the following shapes, `nnz` is the count after taking `thresh` into account.
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
+//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
+//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
+//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
+// pair takes space.
+func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolV2",
+		Type: "SparseAdd",
 		Input: []tf.Input{
-			input, ksize, strides,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey.
-type OrderedMapUnstageNoKeyAttr func(optionalAttr)
+// OrderedMapPeekAttr is an optional argument to OrderedMapPeek.
+type OrderedMapPeekAttr func(optionalAttr)
 
-// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value.
+// OrderedMapPeekCapacity sets the optional capacity attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr {
+func OrderedMapPeekCapacity(value int64) OrderedMapPeekAttr {
 	return func(m optionalAttr) {
 		m["capacity"] = value
 	}
 }
 
-// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
+// OrderedMapPeekMemoryLimit sets the optional memory_limit attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
+func OrderedMapPeekMemoryLimit(value int64) OrderedMapPeekAttr {
 	return func(m optionalAttr) {
 		m["memory_limit"] = value
 	}
 }
 
-// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
+// OrderedMapPeekContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
+func OrderedMapPeekContainer(value string) OrderedMapPeekAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
+// OrderedMapPeekSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
+func OrderedMapPeekSharedName(value string) OrderedMapPeekAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Op removes and returns the (key, value) element with the smallest
+// Op peeks at the values at the specified key.  If the
 //
-// key from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
+// underlying container does not contain this key
+// this op will block until it does.   This Op is optimized for
+// performance.
+func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapPeekAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13411,9 +13081,9 @@ func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataTyp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapUnstageNoKey",
+		Type: "OrderedMapPeek",
 		Input: []tf.Input{
-			indices,
+			key, indices,
 		},
 		Attrs: attrs,
 	}
@@ -13423,359 +13093,397 @@ func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataTyp
 	}
 	var idx int
 	var err error
-	key = op.Output(idx)
 	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapUnstageNoKey", err)
+		scope.UpdateErr("OrderedMapPeek", err)
 		return
 	}
-	return key, values
+	return values
 }
 
-// Read an element from the TensorArray into output `value`.
+// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
+type DecodeAndCropJpegAttr func(optionalAttr)
+
+// DecodeAndCropJpegChannels sets the optional channels attribute to value.
 //
-// Arguments:
-//	handle: The handle to a TensorArray.
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
+	}
+}
+
+// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
 //
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
+	}
+}
+
+// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
 //
-// Returns The tensor that is read from the TensorArray.
-func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["fancy_upscaling"] = value
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV3",
-		Input: []tf.Input{
-			handle, index, flow_in,
-		},
-		Attrs: attrs,
+}
+
+// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+//
+// value: If true try to recover an image from truncated input.
+// If not specified, defaults to false
+func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["try_recover_truncated"] = value
+	}
+}
+
+// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+//
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["acceptable_fraction"] = value
+	}
+}
+
+// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
+//
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["dct_method"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Adds up a SparseTensor and a dense Tensor, using these special rules:
+// Decode and Crop a JPEG-encoded image to a uint8 tensor.
 //
-// (1) Broadcasts the dense side to have the same shape as the sparse side, if
-//     eligible;
-// (2) Then, only the dense values pointed to by the indices of the SparseTensor
-//     participate in the cwise addition.
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+//
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
 //
-// By these rules, the result is a logical SparseTensor with exactly the same
-// indices and shape, but possibly with different non-zero values.  The output of
-// this Op is the resultant non-zero values.
+//
+// It is equivalent to a combination of decode and crop, but much faster by only
+// decoding partial jpeg image.
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+//	contents: 0-D.  The JPEG-encoded image.
+//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseAdd",
+		Type: "DecodeAndCropJpeg",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			contents, crop_window,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv3DAttr is an optional argument to Conv3D.
-type Conv3DAttr func(optionalAttr)
+// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
+type AllCandidateSamplerAttr func(optionalAttr)
 
-// Conv3DDataFormat sets the optional data_format attribute to value.
+// AllCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DDataFormat(value string) Conv3DAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["seed"] = value
 	}
 }
 
-// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
+// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// In signal processing, cross-correlation is a measure of similarity of
-// two waveforms as a function of a time-lag applied to one of them. This
-// is also known as a sliding dot product or sliding inner-product.
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a learned unigram distribution.
 //
-// Our Conv3D implements a form of cross-correlation.
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
-//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
-// out_channels]`. `in_channels` must match between `input` and `filter`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to produce.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3D",
+		Type: "AllCandidateSampler",
 		Input: []tf.Input{
-			input, filter,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns the truth value of (x >= y) element-wise.
+// Returns the element-wise min of two SparseTensors.
 //
-// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+//
+// Arguments:
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+//
+// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
+func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GreaterEqual",
+		Type: "SparseSparseMinimum",
 		Input: []tf.Input{
-			x, y,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
-type ResourceApplyMomentumAttr func(optionalAttr)
-
-// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
+// Constructs a tensor by tiling a given tensor.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
+// This operation creates a new tensor by replicating `input` `multiples` times.
+// The output tensor's i'th dimension has `input.dims(i) * multiples[i]` elements,
+// and the values of `input` are replicated `multiples[i]` times along the 'i'th
+// dimension. For example, tiling `[a b c d]` by `[2]` produces
+// `[a b c d a b c d]`.
 //
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
-// If not specified, defaults to false
-func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
+// Arguments:
+//	input: 1-D or higher.
+//	multiples: 1-D. Length must be the same as the number of dimensions in `input`
+func Tile(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "Tile",
+		Input: []tf.Input{
+			input, multiples,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
+// Saves the input tensors to disk.
 //
-// want to use Nesterov momentum.
+// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
+// is written to `filename` with name `tensor_names[i]`.
 //
-// accum = accum * momentum + grad
-// var -= lr * accum
+// See also `SaveSlices`.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
-//	momentum: Momentum. Must be a scalar.
+//	filename: Must have a single element. The name of the file to which we write
+// the tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	data: `N` tensors to save.
 //
 // Returns the created operation.
-func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
+func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyMomentum",
+		Type: "Save",
 		Input: []tf.Input{
-			var_, accum, lr, grad, momentum,
+			filename, tensor_names, tf.OutputList(data),
 		},
-		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Returns element-wise integer closest to x.
+// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
 //
-// If the result is midway between two representable values,
-// the even representable is chosen.
-// For example:
+// true, this follows Python semantics in that the result here is consistent
+// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
 //
-// ```
-// rint(-1.5) ==> -2.0
-// rint(0.5000001) ==> 1.0
-// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
-// ```
-func Rint(scope *Scope, x tf.Output) (y tf.Output) {
+// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Rint",
+		Type: "FloorMod",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizeV2Attr is an optional argument to QuantizeV2.
-type QuantizeV2Attr func(optionalAttr)
+// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
+type TakeManySparseFromTensorsMapAttr func(optionalAttr)
 
-// QuantizeV2Mode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func QuantizeV2Mode(value string) QuantizeV2Attr {
+// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
+//
+// value: The container name for the `SparseTensorsMap` read by this op.
+// If not specified, defaults to ""
+func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["mode"] = value
+		m["container"] = value
 	}
 }
 
-// QuantizeV2RoundMode sets the optional round_mode attribute to value.
-// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
-func QuantizeV2RoundMode(value string) QuantizeV2Attr {
+// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
+//
+// value: The shared name for the `SparseTensorsMap` read by this op.
+// It should not be blank; rather the `shared_name` or unique Operation name
+// of the Op that created the original `SparseTensorsMap` should be used.
+// If not specified, defaults to ""
+func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["round_mode"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.  The
-// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
-// when rounding float values to their quantized equivalents.
-//
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-//
-// ```
-// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
-// if T == qint8, out[i] -= (range(T) + 1) / 2.0
-// ```
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
+// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
 //
-// *MIN_COMBINED Mode Example*
+// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
+// `N` is the minibatch size and the rows correspond to the output handles of
+// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
+// original `SparseTensor` objects that went into the given input ops must all
+// match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension on the left).
 //
-// Assume the input is type float and has a possible range of [0.0, 6.0] and the
-// output type is quint8 ([0, 255]). The min_range and max_range values should be
-// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
-// value of the input by 255/6 and cast to quint8.
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
 //
-// If the output type was qint8 ([-128, 127]), the operation will additionally
-// subtract each value by 128 prior to casting, so that the range of values aligns
-// with the range of qint8.
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
 //
-// If the mode is 'MIN_FIRST', then this approach is used:
+// For example, if the handles represent an input, which is a `[2, 3]` matrix
+// representing two original `SparseTensor` objects:
 //
 // ```
-// num_discrete_values = 1 << (# of bits in T)
-// range_adjust = num_discrete_values / (num_discrete_values - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = num_discrete_values / range
-// quantized = round(input * range_scale) - round(range_min * range_scale) +
-//   numeric_limits<T>::min()
-// quantized = max(quantized, numeric_limits<T>::min())
-// quantized = min(quantized, numeric_limits<T>::max())
-// ```
-//
-// The biggest difference between this and MIN_COMBINED is that the minimum range
-// is rounded first, before it's subtracted from the rounded value. With
-// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
-// and dequantizing will introduce a larger and larger error.
-//
-// *SCALED mode Example*
-//
-// `SCALED` mode matches the quantization approach used in
-// `QuantizeAndDequantize{V2|V3}`.
-//
-// If the mode is `SCALED`, we do not use the full range of the output type,
-// choosing to elide the lowest possible value for symmetry (e.g., output range is
-// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-// 0.
-//
-// We first find the range of values in our tensor. The
-// range we use is always centered on 0, so we find m such that
-// ```c++
-//   m = max(abs(input_min), abs(input_max))
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
 // ```
 //
-// Our input tensor range is then `[-m, m]`.
+// and
 //
-// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-// If T is signed, this is
 // ```
-//   num_bits = sizeof(T) * 8
-//   [min_fixed, max_fixed] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
 // ```
 //
-// Otherwise, if T is unsigned, the fixed-point range is
-// ```
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-// ```
+// then the final `SparseTensor` will be:
 //
-// From this we compute our scaling factor, s:
-// ```c++
-//   s = (max_fixed - min_fixed) / (2 * m)
 // ```
-//
-// Now we can quantize the elements of our tensor:
-// ```c++
-// result = round(input * s)
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
 // ```
 //
-// One thing to watch out for is that the operator may choose to adjust the
-// requested minimum and maximum values slightly during the quantization process,
-// so you should always use the output ports as the range for further calculations.
-// For example, if the requested minimum and maximum values are close to equal,
-// they will be separated by a small epsilon value to prevent ill-formed quantized
-// buffers from being created. Otherwise, you can end up with buffers where all the
-// quantized values map to the same float value, which causes problems for
-// operations that have to perform further calculations on them.
-//
 // Arguments:
+//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
+// Shape: `[N]`.
+//	dtype: The `dtype` of the `SparseTensor` objects stored in the
+// `SparseTensorsMap`.
 //
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
-//
-//
-// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
-func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
+func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeV2",
+		Type: "TakeManySparseFromTensorsMap",
 		Input: []tf.Input{
-			input, min_range, max_range,
+			sparse_handles,
 		},
 		Attrs: attrs,
 	}
@@ -13783,154 +13491,164 @@ func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
-type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
-
-// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
+// Says whether the targets are in the top `K` predictions.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+// prediction for the target class is among the top `k` predictions among
+// all predictions for example `i`. Note that the behavior of `InTopK` differs
+// from the `TopK` op in its handling of ties; if multiple classes have the
+// same prediction value and straddle the top-`k` boundary, all of those
+// classes are considered to be in the top `k`.
+//
+// More formally, let
+//
+//   \\(predictions_i\\) be the predictions for all classes for example `i`,
+//   \\(targets_i\\) be the target class for example `i`,
+//   \\(out_i\\) be the output for example `i`,
+//
+// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+//
+// Arguments:
+//	predictions: A `batch_size` x `classes` tensor.
+//	targets: A `batch_size` vector of class ids.
+//	k: Number of top elements to look at for computing precision.
+//
+// Returns Computed precision at `k` as a `bool Tensor`.
+func InTopKV2(scope *Scope, predictions tf.Output, targets tf.Output, k tf.Output) (precision tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "InTopKV2",
+		Input: []tf.Input{
+			predictions, targets, k,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the gradients of depthwise convolution with respect to the filter.
+// Assigns a new value to a variable.
+//
+// Any ReadVariableOp with a control dependency on this op is guaranteed to return
+// this value or a subsequent newer value of the variable.
 //
 // Arguments:
-//	input: 4-D with shape based on `data_format`.  For example, if
-// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
-// in_width, in_channels]` tensor.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
+//	resource: handle to the resource in which to store the variable.
+//	value: the value to set the new tensor to use.
 //
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
+// Returns the created operation.
+func AssignVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
+	opspec := tf.OpSpec{
+		Type: "AssignVariableOp",
+		Input: []tf.Input{
+			resource, value,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Returns a tensor of ones with the same shape and type as x.
+//
+// Arguments:
+//	x: a tensor of type T.
+//
+// Returns a tensor of the same shape and type as x but filled with ones.
+func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropFilter",
+		Type: "OnesLike",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Shuffle dimensions of x according to a permutation.
+// The gradient of SparseFillEmptyRows.
+//
+// Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
+// shaped `[N_full]`, where `N_full >= N` and copies data into either
+// `d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
+// `d_default_value` is a scalar.
+//
+//   d_values[j] = grad_values[reverse_index_map[j]]
+//   d_default_value = sum_{k : 0 .. N_full - 1} (
+//      grad_values[k] * 1{k not in reverse_index_map})
 //
-// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
+// Arguments:
+//	reverse_index_map: 1-D.  The reverse index map from SparseFillEmptyRows.
+//	grad_values: 1-D.  The gradients from backprop.
+//
+// Returns 1-D.  The backprop into values.0-D.  The backprop into default_value.
+func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_values tf.Output) (d_values tf.Output, d_default_value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Transpose",
+		Type: "SparseFillEmptyRowsGrad",
 		Input: []tf.Input{
-			x, perm,
+			reverse_index_map, grad_values,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Reads and outputs the entire contents of the input filename.
-func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
+// Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
+//
+// if < 0, `scale * features` otherwise.
+//
+// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
+func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReadFile",
+		Type: "Selu",
 		Input: []tf.Input{
-			filename,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
-type AddManySparseToTensorsMapAttr func(optionalAttr)
-
-// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
-//
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
+// SetSizeAttr is an optional argument to SetSize.
+type SetSizeAttr func(optionalAttr)
 
-// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
-//
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
+// SetSizeValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func SetSizeValidateIndices(value bool) SetSizeAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
-//
-// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`, where
-//
-// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
-//
-// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
-// having a first `sparse_indices` column taking values between `[0, N)`, where
-// the minibatch size `N == sparse_shape[0]`.
+// Number of unique elements along last dimension of input `set`.
 //
-// The input `SparseTensor` must have rank `R` greater than 1, and the first
-// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The stored
-// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
-// will have rank `R-1`.
+// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
+// and `set_shape`. The last dimension contains values in a set, duplicates are
+// allowed but ignored.
 //
-// The `SparseTensor` values can then be read out as part of a minibatch by passing
-// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddManySparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+// If `validate_indices` is `True`, this op validates the order and range of `set`
+// indices.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-// The minibatch size `N == sparse_shape[0]`.
+//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
+//	set_values: 1D `Tensor`, values of a `SparseTensor`.
+//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
 //
-// Returns 1-D.  The handles of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.  Shape: `[N]`.
-func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
+// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
+// `n-1` dimensions as `set`. Each value is the number of unique elements in
+// the corresponding `[0...n-1]` dimension of `set`.
+func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13939,9 +13657,9 @@ func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_va
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AddManySparseToTensorsMap",
+		Type: "SetSize",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			set_indices, set_values, set_shape,
 		},
 		Attrs: attrs,
 	}
@@ -13949,65 +13667,64 @@ func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_va
 	return op.Output(0)
 }
 
-// Creates a dataset that emits the outputs of `input_dataset` `count` times.
+// Computes the sign and the log of the absolute value of the determinant of
 //
-// Arguments:
+// one or more square matrices.
 //
-//	count: A scalar representing the number of times that `input_dataset` should
-// be repeated. A value of `-1` indicates that it should be repeated infinitely.
+// The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
+// form square matrices. The outputs are two tensors containing the signs and
+// absolute values of the log determinants for all N input submatrices
+// `[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
+// The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
+// is the LU decomposition of the input and P is the corresponding
+// permutation matrix.
 //
+// Arguments:
+//	input: Shape is `[N, M, M]`.
 //
-func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns The signs of the log determinants of the inputs. Shape is `[N]`.The logs of the absolute values of the determinants
+// of the N input matrices.  Shape is `[N]`.
+func LogMatrixDeterminant(scope *Scope, input tf.Output) (sign tf.Output, log_abs_determinant tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "RepeatDataset",
+		Type: "LogMatrixDeterminant",
 		Input: []tf.Input{
-			input_dataset, count,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
-type SparseReduceMaxSparseAttr func(optionalAttr)
+// SumAttr is an optional argument to Sum.
+type SumAttr func(optionalAttr)
 
-// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value.
+// SumKeepDims sets the optional keep_dims attribute to value.
 //
 // value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr {
+func SumKeepDims(value bool) SumAttr {
 	return func(m optionalAttr) {
 		m["keep_dims"] = value
 	}
 }
 
-// Computes the max of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
-// SparseTensor.
+// Computes the sum of elements across dimensions of a tensor.
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// Reduces `input` along the dimensions given in `axis`. Unless
 // `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
-//
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14016,263 +13733,260 @@ func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceMaxSparse",
+		Type: "Sum",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
-type ResourceApplyAdagradDAAttr func(optionalAttr)
-
-// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
+	return op.Output(0)
 }
 
-// Update '*var' according to the proximal adagrad scheme.
+// Delete the tensor specified by its handle in the session.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
+//	handle: The handle for a tensor stored in the session state.
 //
 // Returns the created operation.
-func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
+func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagradDA",
+		Type: "DeleteSessionTensor",
 		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
+			handle,
 		},
-		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
-type FractionalMaxPoolGradAttr func(optionalAttr)
-
-// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
+// L2 Loss.
 //
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
+// Computes half the L2 norm of a tensor without the `sqrt`:
 //
-// `index  0  1  2  3  4`
+//     output = sum(t ** 2) / 2
 //
-// `value  20 5  16 3  7`
+// Arguments:
+//	t: Typically 2-D, but may have any dimensions.
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
-// If not specified, defaults to false
-func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
+// Returns 0-D.
+func L2Loss(scope *Scope, t tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "L2Loss",
+		Input: []tf.Input{
+			t,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
+type DenseToSparseSetOperationAttr func(optionalAttr)
+
+// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
 	return func(m optionalAttr) {
-		m["overlapping"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Computes gradient of the FractionalMaxPool function.
+// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
+//
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+//
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set2`
+// indices.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	orig_input: Original input for `fractional_max_pool`
-//	orig_output: Original output for `fractional_max_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_max_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
+// max set size across `n-1` dimensions.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
-func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"set_operation": set_operation}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalMaxPoolGrad",
+		Type: "DenseToSparseSetOperation",
 		Input: []tf.Input{
-			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			set1, set2_indices, set2_values, set2_shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Does nothing. Serves as a control trigger for scheduling.
-//
-// Only useful as a placeholder for control edges.
-//
-// Returns the created operation.
-func ControlTrigger(scope *Scope) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ControlTrigger",
-	}
-	return scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
-type ResourceApplyAddSignAttr func(optionalAttr)
+// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
+type FusedResizeAndPadConv2DAttr func(optionalAttr)
 
-// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
+// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
 //
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: If true, rescale input by (new_height - 1) / (height - 1),
+// which exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
 // If not specified, defaults to false
-func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
+func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["resize_align_corners"] = value
 	}
 }
 
-// Update '*var' according to the AddSign update.
+// Performs a resize and padding as a preprocess during a convolution.
 //
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
-// variable <- variable - lr_t * update
+// It's often possible to do spatial transformations more efficiently as part of
+// the packing stage of a convolution, so this op allows for an optimized
+// implementation where these stages are fused together. This prevents the need to
+// write out the intermediate results as whole tensors, reducing memory pressure,
+// and we can get some latency gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and defaults to
+// 'NHWC' order.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	alpha: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
-//	grad: The gradient.
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
 //
-// Returns the created operation.
-func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAddSign",
+		Type: "FusedResizeAndPadConv2D",
 		Input: []tf.Input{
-			var_, m, lr, alpha, sign_decay, beta, grad,
+			input, size, paddings, filter,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Reorders a SparseTensor into the canonical, row-major ordering.
-//
-// Note that by convention, all sparse ops preserve the canonical ordering along
-// increasing dimension number. The only time ordering can be violated is during
-// manual manipulation of the indices and values vectors to add entries.
+// Subtracts a value from the current value of a variable.
 //
-// Reordering does not affect the shape of the SparseTensor.
+// Any ReadVariableOp which depends directly or indirectly on this assign is
+// guaranteed to see the incremented value or a subsequent newer one.
 //
-// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
-// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
+// Outputs the incremented value, which can be used to totally order the
+// increments to this variable.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
 //
-// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
-// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
-func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// Returns the created operation.
+func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReorder",
+		Type: "AssignSubVariableOp",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape,
+			resource, value,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// PackAttr is an optional argument to Pack.
-type PackAttr func(optionalAttr)
+// RestoreAttr is an optional argument to Restore.
+type RestoreAttr func(optionalAttr)
 
-// PackAxis sets the optional axis attribute to value.
+// RestorePreferredShard sets the optional preferred_shard attribute to value.
 //
-// value: Dimension along which to pack.  Negative values wrap around, so the
-// valid range is `[-(R+1), R+1)`.
-// If not specified, defaults to 0
-func PackAxis(value int64) PackAttr {
+// value: Index of file to open first if multiple files match
+// `file_pattern`.
+// If not specified, defaults to -1
+func RestorePreferredShard(value int64) RestoreAttr {
 	return func(m optionalAttr) {
-		m["axis"] = value
+		m["preferred_shard"] = value
 	}
 }
 
-// Packs a list of `N` rank-`R` tensors into one rank-`(R+1)` tensor.
-//
-// Packs the `N` tensors in `values` into a tensor with rank one higher than each
-// tensor in `values`, by packing them along the `axis` dimension.
-// Given a list of tensors of shape `(A, B, C)`;
-//
-// if `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`.
-// if `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`.
-// Etc.
+// Restores a tensor from checkpoint files.
 //
-// For example:
+// Reads a tensor stored in one or several files. If there are several files (for
+// instance because a tensor was saved as slices), `file_pattern` may contain
+// wildcard symbols (`*` and `?`) in the filename portion only, not in the
+// directory portion.
 //
-// ```
-// # 'x' is [1, 4]
-// # 'y' is [2, 5]
-// # 'z' is [3, 6]
-// pack([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
-// pack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]]
-// ```
+// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
+// in which file the requested tensor is likely to be found. This op will first
+// open the file at index `preferred_shard` in the list of matching files and try
+// to restore tensors from that file.  Only if some tensors or tensor slices are
+// not found in that first file, then the Op opens all the files. Setting
+// `preferred_shard` to match the value passed as the `shard` input
+// of a matching `Save` Op may speed up Restore.  This attribute only affects
+// performance, not correctness.  The default value -1 means files are processed in
+// order.
 //
-// This is the opposite of `unpack`.
+// See also `RestoreSlice`.
 //
 // Arguments:
-//	values: Must be of same shape and type.
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	dt: The type of the tensor to be restored.
 //
-// Returns The packed tensor.
-func Pack(scope *Scope, values []tf.Output, optional ...PackAttr) (output tf.Output) {
+// Returns The restored tensor.
+func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dt": dt}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Pack",
+		Type: "Restore",
 		Input: []tf.Input{
-			tf.OutputList(values),
+			file_pattern, tensor_name,
 		},
 		Attrs: attrs,
 	}
@@ -14280,41 +13994,35 @@ func Pack(scope *Scope, values []tf.Output, optional ...PackAttr) (output tf.Out
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArraySplitV3
-func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArraySplitV2",
-		Input: []tf.Input{
-			handle, value, lengths, flow_in,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QuantizedReluAttr is an optional argument to QuantizedRelu.
-type QuantizedReluAttr func(optionalAttr)
+// QuantizedResizeBilinearAttr is an optional argument to QuantizedResizeBilinear.
+type QuantizedResizeBilinearAttr func(optionalAttr)
 
-// QuantizedReluOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
+// QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Computes Quantized Rectified Linear: `max(features, 0)`
+// Resize quantized `images` to `size` using quantized bilinear interpolation.
+//
+// Input images and output images must be quantized types.
 //
 // Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min tf.Output, max tf.Output, optional ...QuantizedResizeBilinearAttr) (resized_images tf.Output, out_min tf.Output, out_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14323,9 +14031,9 @@ func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedRelu",
+		Type: "QuantizedResizeBilinear",
 		Input: []tf.Input{
-			features, min_features, max_features,
+			images, size, min, max,
 		},
 		Attrs: attrs,
 	}
@@ -14333,126 +14041,193 @@ func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Records the bytes size of each element of `input_dataset` in a StatsAggregator.
-func BytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Computes the minimum along segments of a tensor.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Computes a tensor such that
+// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the min is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "BytesProducedStatsDataset",
+		Type: "SegmentMin",
 		Input: []tf.Input{
-			input_dataset, tag,
+			data, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QrAttr is an optional argument to Qr.
-type QrAttr func(optionalAttr)
+// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
+type SdcaOptimizerAttr func(optionalAttr)
 
-// QrFullMatrices sets the optional full_matrices attribute to value.
+// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
 //
-// value: If true, compute full-sized `q` and `r`. If false
-// (the default), compute only the leading `P` columns of `q`.
+// value: Whether to use Adapative SDCA for the inner loop.
 // If not specified, defaults to false
-func QrFullMatrices(value bool) QrAttr {
+func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
 	return func(m optionalAttr) {
-		m["full_matrices"] = value
+		m["adaptative"] = value
 	}
 }
 
-// Computes the QR decompositions of one or more matrices.
+// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
 //
-// Computes the QR decomposition of each inner matrix in `tensor` such that
-// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
+// linear models with L1 + L2 regularization. As global optimization objective is
+// strongly-convex, the optimizer optimizes the dual objective at each step. The
+// optimizer applies each update one example at a time. Examples are sampled
+// uniformly, and the optimizer is learning rate free and enjoys linear convergence
+// rate.
 //
-// ```python
-// # a is a tensor.
-// # q is a tensor of orthonormal matrices.
-// # r is a tensor of upper triangular matrices.
-// q, r = qr(a)
-// q_full, r_full = qr(a, full_matrices=True)
-// ```
+// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
+// Shai Shalev-Shwartz, Tong Zhang. 2012
+//
+// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
+//
+// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
+// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
+// Peter Richtarik, Martin Takac. 2015
+//
+// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
+// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
 //
 // Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//	sparse_example_indices: a list of vectors which contain example indices.
+//	sparse_feature_indices: a list of vectors which contain feature indices.
+//	sparse_feature_values: a list of vectors which contains feature value
+// associated with each feature group.
+//	dense_features: a list of matrices which contains the dense feature values.
+//	example_weights: a vector which contains the weight associated with each
+// example.
+//	example_labels: a vector which contains the label/target associated with each
+// example.
+//	sparse_indices: a list of vectors where each value is the indices which has
+// corresponding weights in sparse_weights. This field maybe omitted for the
+// dense approach.
+//	sparse_weights: a list of vectors where each value is the weight associated with
+// a sparse feature group.
+//	dense_weights: a list of vectors where the values are the weights associated
+// with a dense feature group.
+//	example_state_data: a list of vectors containing the example state data.
+//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
+// squared and hinge losses.
+//	l1: Symmetric l1 regularization strength.
+//	l2: Symmetric l2 regularization strength.
+//	num_loss_partitions: Number of partitions of the global loss function.
+//	num_inner_iterations: Number of iterations per mini-batch.
 //
-// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
-// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
-// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
-func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
+// Returns a list of vectors containing the updated example state
+// data.a list of vectors where each value is the delta
+// weights associated with a sparse feature group.a list of vectors where the values are the delta
+// weights associated with a dense feature group.
+func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Qr",
+		Type: "SdcaOptimizer",
 		Input: []tf.Input{
-			input,
+			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	out_example_state_data = op.Output(idx)
+	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
+	}
+	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
+	}
+	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
 }
 
-// AudioSummaryAttr is an optional argument to AudioSummary.
-type AudioSummaryAttr func(optionalAttr)
+// SparseMatMulAttr is an optional argument to SparseMatMul.
+type SparseMatMulAttr func(optionalAttr)
 
-// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
-//
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
+// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
 	return func(m optionalAttr) {
-		m["max_outputs"] = value
+		m["transpose_a"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with audio.
-//
-// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["a_is_sparse"] = value
+	}
+}
+
+// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["b_is_sparse"] = value
+	}
+}
+
+// Multiply matrix "a" by matrix "b".
 //
-// Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
+// The inputs must be two-dimensional matrices and the inner dimension of "a" must
+// match the outer dimension of "b". This op is optimized for the case where at
+// least one of "a" or "b" is sparse. The breakeven for using this versus a dense
+// matrix multiply on one platform was 30% zero values in the sparse matrix.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
+// The gradient computation of this operation will only take advantage of sparsity
+// in the input gradient when that gradient comes from a Relu.
+func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sample_rate": sample_rate}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AudioSummary",
+		Type: "SparseMatMul",
 		Input: []tf.Input{
-			tag, tensor,
+			a, b,
 		},
 		Attrs: attrs,
 	}
@@ -14460,124 +14235,146 @@ func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate flo
 	return op.Output(0)
 }
 
-// Reverses specific dimensions of a tensor.
-//
-// NOTE `tf.reverse` has now changed behavior in preparation for 1.0.
-// `tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0.
-//
-// Given a `tensor`, and a `int32` tensor `axis` representing the set of
-// dimensions of `tensor` to reverse. This operation reverses each dimension
-// `i` for which there exists `j` s.t. `axis[j] == i`.
-//
-// `tensor` can have up to 8 dimensions. The number of dimensions specified
-// in `axis` may be 0 or more entries. If an index is specified more than
-// once, a InvalidArgument error is raised.
+// Computes the power of one value to another.
 //
-// For example:
+// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
+// corresponding elements in `x` and `y`. For example:
 //
 // ```
-// # tensor 't' is [[[[ 0,  1,  2,  3],
-// #                  [ 4,  5,  6,  7],
-// #                  [ 8,  9, 10, 11]],
-// #                 [[12, 13, 14, 15],
-// #                  [16, 17, 18, 19],
-// #                  [20, 21, 22, 23]]]]
-// # tensor 't' shape is [1, 2, 3, 4]
+// # tensor 'x' is [[2, 2]], [3, 3]]
+// # tensor 'y' is [[8, 16], [2, 3]]
+// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
+// ```
+func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Pow",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ShapeAttr is an optional argument to Shape.
+type ShapeAttr func(optionalAttr)
+
+// ShapeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func ShapeOutType(value tf.DataType) ShapeAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Returns the shape of a tensor.
 //
-// # 'dims' is [3] or 'dims' is [-1]
-// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
-//                         [ 7,  6,  5,  4],
-//                         [ 11, 10, 9, 8]],
-//                        [[15, 14, 13, 12],
-//                         [19, 18, 17, 16],
-//                         [23, 22, 21, 20]]]]
+// This operation returns a 1-D integer tensor representing the shape of `input`.
 //
-// # 'dims' is '[1]' (or 'dims' is '[-3]')
-// reverse(t, dims) ==> [[[[12, 13, 14, 15],
-//                         [16, 17, 18, 19],
-//                         [20, 21, 22, 23]
-//                        [[ 0,  1,  2,  3],
-//                         [ 4,  5,  6,  7],
-//                         [ 8,  9, 10, 11]]]]
+// For example:
 //
-// # 'dims' is '[2]' (or 'dims' is '[-2]')
-// reverse(t, dims) ==> [[[[8, 9, 10, 11],
-//                         [4, 5, 6, 7],
-//                         [0, 1, 2, 3]]
-//                        [[20, 21, 22, 23],
-//                         [16, 17, 18, 19],
-//                         [12, 13, 14, 15]]]]
 // ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Shape",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes fingerprints of the input strings.
 //
 // Arguments:
-//	tensor: Up to 8-D.
-//	axis: 1-D. The indices of the dimensions to reverse. Must be in the range
-// `[-rank(tensor), rank(tensor))`.
+//	input: vector of strings to compute fingerprints on.
 //
-// Returns The same shape as `tensor`.
-func ReverseV2(scope *Scope, tensor tf.Output, axis tf.Output) (output tf.Output) {
+// Returns a (N,2) shaped matrix where N is the number of elements in the input
+// vector. Each row contains the low and high parts of the fingerprint.
+func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReverseV2",
+		Type: "SdcaFprint",
 		Input: []tf.Input{
-			tensor, axis,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
-type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
+// RandomPoissonV2Attr is an optional argument to RandomPoissonV2.
+type RandomPoissonV2Attr func(optionalAttr)
 
-// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+// RandomPoissonV2Seed sets the optional seed attribute to value.
 //
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["seed"] = value
 	}
 }
 
-// Update '*var' according to the centered RMSProp algorithm.
-//
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
-//
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
+// RandomPoissonV2Seed2 sets the optional seed2 attribute to value.
 //
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// RandomPoissonV2Dtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs random values from the Poisson distribution(s) described by rate.
 //
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+// This op uses two algorithms, depending on rate. If rate >= 10, then
+// the algorithm by Hormann is used to acquire samples via
+// transformation-rejection.
+// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
 //
-// mg <- rho * mg_{t-1} + (1-rho) * grad
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
-// var <- var - mom
+// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
+// random variables.
+// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
+// Programming, Volume 2. Addison Wesley
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in rate.
+//	rate: A tensor in which each scalar is a "rate" parameter describing the
+// associated poisson distribution.
 //
-// Returns the created operation.
-func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
+// Returns A tensor with shape `shape + shape(rate)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `rate[i0, i1, ...iN]`.
+func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14586,93 +14383,121 @@ func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyCenteredRMSProp",
+		Type: "RandomPoissonV2",
 		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
+			shape, rate,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Inverse 3D fast Fourier transform.
-//
-// Computes the inverse 3-dimensional discrete Fourier transform over the
-// inner-most 3 dimensions of `input`.
+// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
+type MatrixTriangularSolveAttr func(optionalAttr)
+
+// MatrixTriangularSolveLower sets the optional lower attribute to value.
 //
-// Arguments:
-//	input: A complex64 tensor.
+// value: Boolean indicating whether the innermost matrices in `matrix` are
+// lower or upper triangular.
+// If not specified, defaults to true
+func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
+	return func(m optionalAttr) {
+		m["lower"] = value
+	}
+}
+
+// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+//          adjoint.
 //
 // @compatibility(numpy)
-// Equivalent to np.fft.ifftn with 3 dimensions.
+// Equivalent to np.linalg.triangular_solve
 // @end_compatibility
-func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+// If not specified, defaults to false
+func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
+	return func(m optionalAttr) {
+		m["adjoint"] = value
+	}
+}
+
+// Solves systems of linear equations with upper or lower triangular matrices by
+//
+// backsubstitution.
+//
+// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
+// square matrices. If `lower` is `True` then the strictly upper triangular part
+// of each inner-most matrix is assumed to be zero and not accessed.
+// If `lower` is False then the strictly lower triangular part of each inner-most
+// matrix is assumed to be zero and not accessed.
+// `rhs` is a tensor of shape `[..., M, K]`.
+//
+// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
+// `True` then the innermost matrices in `output` satisfy matrix equations
+// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `False` then the strictly then the  innermost matrices in
+// `output` satisfy matrix equations
+// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+//
+// Arguments:
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
+//
+// Returns Shape is `[..., M, K]`.
+func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IFFT3D",
+		Type: "MatrixTriangularSolve",
 		Input: []tf.Input{
-			input,
+			matrix, rhs,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Increments variable pointed to by 'resource' until it reaches 'limit'.
-//
-// Arguments:
-//	resource: Should be from a scalar `Variable` node.
-//	limit: If incrementing ref would bring it above limit, instead generates an
-// 'OutOfRange' error.
-//
-//
-// Returns A copy of the input before increment. If nothing else modifies the
-// input, the values produced will all be distinct.
-func ResourceCountUpTo(scope *Scope, resource tf.Output, limit int64, T tf.DataType) (output tf.Output) {
+// Computes inverse hyperbolic sine of x element-wise.
+func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"limit": limit, "T": T}
 	opspec := tf.OpSpec{
-		Type: "ResourceCountUpTo",
+		Type: "Asinh",
 		Input: []tf.Input{
-			resource,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Looks up keys in a table, outputs the corresponding values.
-//
-// The tensor `keys` must of the same type as the keys of the table.
-// The output `values` is of the type of the table values.
-//
-// The scalar `default_value` is the value output for keys not present in the
-// table. It must also be of the same type as the table values.
+// Creates a dataset with a range of values. Corresponds to python's xrange.
 //
 // Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
+//	start: corresponds to start in python's xrange().
+//	stop: corresponds to stop in python's xrange().
+//	step: corresponds to step in python's xrange().
 //
 //
-// Returns Same shape as `keys`.  Values found in the table, or `default_values`
-// for missing keys.
-func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) {
+func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "LookupTableFindV2",
+		Type: "RangeDataset",
 		Input: []tf.Input{
-			table_handle, keys, default_value,
+			start, stop, step,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -14695,6 +14520,20 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN
 	}
 }
 
+// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
 // Computes the gradients of depthwise convolution with respect to the input.
 //
 // Arguments:
@@ -14734,130 +14573,142 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil
 	return op.Output(0)
 }
 
-// MatrixSolveAttr is an optional argument to MatrixSolve.
-type MatrixSolveAttr func(optionalAttr)
-
-// MatrixSolveAdjoint sets the optional adjoint attribute to value.
+// Adds sparse updates to the variable referenced by `resource`.
 //
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-// adjoint.
-// If not specified, defaults to false
-func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
-	return func(m optionalAttr) {
-		m["adjoint"] = value
-	}
-}
-
-// Solves systems of linear equations.
+// This operation computes
 //
-// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
-// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
-// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `True` then each output matrix satisfies
-// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
+//     # Scalar indices
+//     ref[indices, ...] += updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] += updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions add.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
 //
 // Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
 //
-// Returns Shape is `[..., M, K]`.
-func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterAdd",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Computes the gradient for the inverse of `x` wrt its input.
+//
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolve",
+		Type: "ReciprocalGrad",
 		Input: []tf.Input{
-			matrix, rhs,
+			y, dy,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Transforms a Tensor into a serialized TensorProto proto.
-//
-// Arguments:
-//	tensor: A Tensor of type `T`.
+// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
 //
-// Returns A serialized TensorProto proto of the input tensor.
-func SerializeTensor(scope *Scope, tensor tf.Output) (serialized tf.Output) {
+// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SerializeTensor",
+		Type: "Minimum",
 		Input: []tf.Input{
-			tensor,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
-type FusedBatchNormGradAttr func(optionalAttr)
+// MfccAttr is an optional argument to Mfcc.
+type MfccAttr func(optionalAttr)
 
-// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
+// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
 //
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
+// value: The highest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 4000
+func MfccUpperFrequencyLimit(value float32) MfccAttr {
 	return func(m optionalAttr) {
-		m["epsilon"] = value
+		m["upper_frequency_limit"] = value
 	}
 }
 
-// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
+// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
 //
-// value: The data format for y_backprop, x, x_backprop.
-// Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
+// value: The lowest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 20
+func MfccLowerFrequencyLimit(value float32) MfccAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["lower_frequency_limit"] = value
 	}
 }
 
-// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
+// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
+// value: Resolution of the Mel bank used internally.
+// If not specified, defaults to 40
+func MfccFilterbankChannelCount(value int64) MfccAttr {
 	return func(m optionalAttr) {
-		m["is_training"] = value
+		m["filterbank_channel_count"] = value
 	}
 }
 
-// Gradient for batch normalization.
+// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
 //
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// value: How many output channels to produce per time slice.
+// If not specified, defaults to 13
+func MfccDctCoefficientCount(value int64) MfccAttr {
+	return func(m optionalAttr) {
+		m["dct_coefficient_count"] = value
+	}
+}
+
+// Transforms a spectrogram into a form that's useful for speech recognition.
 //
-// Arguments:
-//	y_backprop: A 4D Tensor for the gradient with respect to y.
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
-// mean to be reused in gradient computation. When is_training is
-// False, a 1D Tensor for the population mean to be reused in both
-// 1st and 2nd order gradient computation.
-//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
-// variance (inverted variance in the cuDNN case) to be reused in
-// gradient computation. When is_training is False, a 1D Tensor
-// for the population variance to be reused in both 1st and 2nd
-// order gradient computation.
+// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
+// been effective as an input feature for machine learning. They are created by
+// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
+// higher frequencies that are less significant to the human ear. They have a long
+// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
+// is a good resource to learn more.
 //
-// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
-// in FusedBatchNorm.
-func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+// Arguments:
+//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
+// set to true.
+//	sample_rate: How many samples per second the source audio used.
+func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14866,77 +14717,135 @@ func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormGrad",
+		Type: "Mfcc",
 		Input: []tf.Input{
-			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+			spectrogram, sample_rate,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// Computes rectified linear: `max(features, 0)`.
-func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
+// Returns the element-wise sum of a list of tensors.
+//
+// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
+// wait for all of its inputs to be ready before beginning to sum. This can
+// save memory if inputs are ready at different times, since minimum temporary
+// storage is proportional to the output size rather than the inputs size.
+//
+// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
+//
+// Returns a `Tensor` of same shape and type as the elements of `inputs`.
+//
+// Arguments:
+//	inputs: A list of `Tensor` objects, each with same shape and type.
+//	shape: Shape of elements of `inputs`.
+func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "Relu",
+		Type: "AccumulateNV2",
 		Input: []tf.Input{
-			features,
+			tf.OutputList(inputs),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// L2 Loss.
+// Convert the quantized 'input' tensor into a lower-precision 'output', using the
 //
-// Computes half the L2 norm of a tensor without the `sqrt`:
+// actual distribution of the values to maximize the usage of the lower bit depth
+// and adjusting the output min and max ranges accordingly.
 //
-//     output = sum(t ** 2) / 2
+// [input_min, input_max] are scalar floats that specify the range for the float
+// interpretation of the 'input' data. For example, if input_min is -1.0f and
+// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+//
+// This operator tries to squeeze as much precision as possible into an output with
+// a lower bit depth by calculating the actual min and max values found in the
+// data. For example, maybe that quint16 input has no values lower than 16,384 and
+// none higher than 49,152. That means only half the range is actually needed, all
+// the float interpretations are between -0.5f and 0.5f, so if we want to compress
+// the data into a quint8 output, we can use that range rather than the theoretical
+// -1.0f to 1.0f that is suggested by the input min and max.
+//
+// In practice, this is most useful for taking output from operations like
+// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
+// may have large potential output ranges, but in practice have a distribution of
+// input values that only uses a small fraction of the possible range. By feeding
+// that output into this operator, we can reduce it from 32 bits down to 8 with
+// minimal loss of accuracy.
 //
 // Arguments:
-//	t: Typically 2-D, but may have any dimensions.
 //
-// Returns 0-D.
-func L2Loss(scope *Scope, t tf.Output) (output tf.Output) {
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//
+// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
+func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "L2Loss",
+		Type: "QuantizeDownAndShrinkRange",
 		Input: []tf.Input{
-			t,
+			input, input_min, input_max,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ShapeAttr is an optional argument to Shape.
-type ShapeAttr func(optionalAttr)
+// RandomGammaAttr is an optional argument to RandomGamma.
+type RandomGammaAttr func(optionalAttr)
+
+// RandomGammaSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomGammaSeed(value int64) RandomGammaAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
 
-// ShapeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func ShapeOutType(value tf.DataType) ShapeAttr {
+// RandomGammaSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomGammaSeed2(value int64) RandomGammaAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["seed2"] = value
 	}
 }
 
-// Returns the shape of a tensor.
+// Outputs random values from the Gamma distribution(s) described by alpha.
 //
-// This operation returns a 1-D integer tensor representing the shape of `input`.
+// This op uses the algorithm by Marsaglia et al. to acquire samples via
+// transformation-rejection from pairs of uniform and normal random variables.
+// See http://dl.acm.org/citation.cfm?id=358414
 //
-// For example:
+// Arguments:
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in alpha.
+//	alpha: A tensor in which each scalar is a "shape" parameter describing the
+// associated gamma distribution.
 //
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
-// ```
-func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
+// Returns A tensor with shape `shape + shape(alpha)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
+func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14945,9 +14854,9 @@ func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Shape",
+		Type: "RandomGamma",
 		Input: []tf.Input{
-			input,
+			shape, alpha,
 		},
 		Attrs: attrs,
 	}
@@ -14955,322 +14864,381 @@ func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Outp
 	return op.Output(0)
 }
 
-// Computes softmax cross entropy cost and gradients to backpropagate.
+// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
+type QuantizedConv2DAttr func(optionalAttr)
+
+// QuantizedConv2DOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// QuantizedConv2DDilations sets the optional dilations attribute to value.
 //
-// Inputs are the logits, not probabilities.
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes a 2D convolution given quantized 4D input and filter tensors.
+//
+// The inputs are quantized tensors where the lowest value represents the real
+// number of the associated minimum, and the highest represents the maximum.
+// This means that you can only interpret the quantized output in the same way, by
+// taking the returned minimum and maximum values into account.
 //
 // Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size x num_classes matrix
-// The caller must ensure that each batch of labels represents a valid
-// probability distribution.
 //
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+//	filter: filter's input_depth dimension must match input's depth dimensions.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_filter: The float value that the lowest quantized filter value represents.
+//	max_filter: The float value that the highest quantized filter value represents.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SoftmaxCrossEntropyWithLogits",
+		Type: "QuantizedConv2D",
 		Input: []tf.Input{
-			features, labels,
+			input, filter, min_input, max_input, min_filter, max_filter,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns x - y element-wise.
+// ResourceGatherAttr is an optional argument to ResourceGather.
+type ResourceGatherAttr func(optionalAttr)
+
+// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Gather slices from the variable pointed to by `resource` according to `indices`.
 //
-// *NOTE*: `Sub` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
+//
+// ```python
+//     # Scalar indices
+//     output[:, ..., :] = params[indices, :, ... :]
+//
+//     # Vector indices
+//     output[i, :, ..., :] = params[indices[i], :, ... :]
+//
+//     # Higher rank indices
+//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
+// ```
+func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Sub",
+		Type: "ResourceGather",
 		Input: []tf.Input{
-			x, y,
+			resource, indices,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Get the value of the tensor specified by its handle.
+// Delete the TensorArray from its resource container.
+//
+// This enables the user to close and release the resource in the middle
+// of a step/run.
 //
 // Arguments:
-//	handle: The handle for a tensor stored in the session state.
-//	dtype: The type of the output value.
+//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
 //
-// Returns The tensor for the given handle.
-func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
+// Returns the created operation.
+func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "GetSessionTensor",
+		Type: "TensorArrayCloseV3",
 		Input: []tf.Input{
 			handle,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent.
-type ResourceApplyProximalGradientDescentAttr func(optionalAttr)
+// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad.
+type MaxPoolGradGradAttr func(optionalAttr)
 
-// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+// MaxPoolGradGradDataFormat sets the optional data_format attribute to value.
 //
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["data_format"] = value
 	}
 }
 
-// Update '*var' as FOBOS algorithm with fixed learning rate.
-//
-// prox_v = var - alpha * delta
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	delta: The change.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) {
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalGradientDescent",
+		Type: "MaxPoolGradGrad",
 		Input: []tf.Input{
-			var_, alpha, l1, l2, delta,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// 2D fast Fourier transform.
-//
-// Computes the 2-dimensional discrete Fourier transform over the inner-most
-// 2 dimensions of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform.
+// RandomUniformIntAttr is an optional argument to RandomUniformInt.
+type RandomUniformIntAttr func(optionalAttr)
+
+// RandomUniformIntSeed sets the optional seed attribute to value.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fft2
-// @end_compatibility
-func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "FFT2D",
-		Input: []tf.Input{
-			input,
-		},
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a tensor filled with a scalar value.
-//
-// This operation creates a tensor of shape `dims` and fills it with `value`.
-//
-// For example:
-//
-// ```
-// # Output tensor has shape [2, 3].
-// fill([2, 3], 9) ==> [[9, 9, 9]
-//                      [9, 9, 9]]
-// ```
-//
-// Arguments:
-//	dims: 1-D. Represents the shape of the output tensor.
-//	value: 0-D (scalar). Value to fill the returned tensor.
+// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
 //
-// @compatibility(numpy)
-// Equivalent to np.full
-// @end_compatibility
-func Fill(scope *Scope, dims tf.Output, value tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Fill",
-		Input: []tf.Input{
-			dims, value,
-		},
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Inverse 2D fast Fourier transform.
+// Outputs random integers from a uniform distribution.
 //
-// Computes the inverse 2-dimensional discrete Fourier transform over the
-// inner-most 2 dimensions of `input`.
+// The generated values are uniform integers in the range `[minval, maxval)`.
+// The lower bound `minval` is included in the range, while the upper bound
+// `maxval` is excluded.
 //
-// Arguments:
-//	input: A complex64 tensor.
+// The random integers are slightly biased unless `maxval - minval` is an exact
+// power of two.  The bias is small for values of `maxval - minval` significantly
+// smaller than the range of the output (either `2^32` or `2^64`).
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
+// Arguments:
+//	shape: The shape of the output tensor.
+//	minval: 0-D.  Inclusive lower bound on the generated integers.
+//	maxval: 0-D.  Exclusive upper bound on the generated integers.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft2
-// @end_compatibility
-func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns A tensor of the specified shape filled with uniform random integers.
+func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IFFT2D",
+		Type: "RandomUniformInt",
 		Input: []tf.Input{
-			input,
+			shape, minval, maxval,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayV3Attr is an optional argument to TensorArrayV3.
-type TensorArrayV3Attr func(optionalAttr)
+// SkipgramAttr is an optional argument to Skipgram.
+type SkipgramAttr func(optionalAttr)
 
-// TensorArrayV3ElementShape sets the optional element_shape attribute to value.
+// SkipgramWindowSize sets the optional window_size attribute to value.
 //
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV3ElementShape(value tf.Shape) TensorArrayV3Attr {
+// value: The number of words to predict to the left and right of the target.
+// If not specified, defaults to 5
+func SkipgramWindowSize(value int64) SkipgramAttr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["window_size"] = value
 	}
 }
 
-// TensorArrayV3DynamicSize sets the optional dynamic_size attribute to value.
+// SkipgramMinCount sets the optional min_count attribute to value.
 //
-// value: A boolean that determines whether writes to the TensorArray
-// are allowed to grow the size.  By default, this is not allowed.
-// If not specified, defaults to false
-func TensorArrayV3DynamicSize(value bool) TensorArrayV3Attr {
+// value: The minimum number of word occurrences for it to be included in the
+// vocabulary.
+// If not specified, defaults to 5
+func SkipgramMinCount(value int64) SkipgramAttr {
 	return func(m optionalAttr) {
-		m["dynamic_size"] = value
+		m["min_count"] = value
 	}
 }
 
-// TensorArrayV3ClearAfterRead sets the optional clear_after_read attribute to value.
+// SkipgramSubsample sets the optional subsample attribute to value.
 //
-// value: If true (default), Tensors in the TensorArray are cleared
-// after being read.  This disables multiple read semantics but allows early
-// release of memory.
-// If not specified, defaults to true
-func TensorArrayV3ClearAfterRead(value bool) TensorArrayV3Attr {
+// value: Threshold for word occurrence. Words that appear with higher
+// frequency will be randomly down-sampled. Set to 0 to disable.
+// If not specified, defaults to 0.001
+func SkipgramSubsample(value float32) SkipgramAttr {
 	return func(m optionalAttr) {
-		m["clear_after_read"] = value
+		m["subsample"] = value
 	}
 }
 
-// TensorArrayV3IdenticalElementShapes sets the optional identical_element_shapes attribute to value.
+// Parses a text file and creates a batch of examples.
 //
-// value: If true (default is false), then all
-// elements in the TensorArray will be expected to have have identical shapes.
-// This allows certain behaviors, like dynamically checking for
-// consistent shapes on write, and being able to fill in properly
-// shaped zero tensors on stack -- even if the element_shape attribute
-// is not fully defined.
-// If not specified, defaults to false
-func TensorArrayV3IdenticalElementShapes(value bool) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["identical_element_shapes"] = value
+// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
+//
+// Arguments:
+//	filename: The corpus's text file name.
+//	batch_size: The size of produced batch.
+//
+// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
+func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Skipgram",
+
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
 }
 
-// TensorArrayV3TensorArrayName sets the optional tensor_array_name attribute to value.
+// StringToNumberAttr is an optional argument to StringToNumber.
+type StringToNumberAttr func(optionalAttr)
+
+// StringToNumberOutType sets the optional out_type attribute to value.
 //
-// value: Overrides the name used for the temporary tensor_array
-// resource. Default value is the name of the 'TensorArray' op (which
-// is guaranteed unique).
-// If not specified, defaults to ""
-func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
+// value: The numeric type to interpret each string in `string_tensor` as.
+// If not specified, defaults to DT_FLOAT
+func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
 	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
+		m["out_type"] = value
 	}
 }
 
-// An array of Tensors of given size.
-//
-// Write data via Write and read via Read or Pack.
+// Converts each string in the input Tensor to the specified numeric type.
 //
-// Arguments:
-//	size: The size of the array.
-//	dtype: The type of the elements on the tensor_array.
+// (Note that int32 overflow results in an error while float overflow
+// results in a rounded value.)
 //
-// Returns The handle to the TensorArray.A scalar used to control gradient flow.
-func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV3Attr) (handle tf.Output, flow tf.Output) {
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV3",
+		Type: "StringToNumber",
 		Input: []tf.Input{
-			size,
+			string_tensor,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent.
-type ResourceApplyGradientDescentAttr func(optionalAttr)
+// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
+type ResourceApplyFtrlV2Attr func(optionalAttr)
 
-// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr {
+func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' by subtracting 'alpha' * 'delta' from it.
+// Update '*var' according to the Ftrl-proximal scheme.
+//
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	delta: The change.
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
+//
+//	lr_power: Scaling factor. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) {
+func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15279,49 +15247,104 @@ func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyGradientDescent",
+		Type: "ResourceApplyFtrlV2",
 		Input: []tf.Input{
-			var_, alpha, delta,
+			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// MultinomialAttr is an optional argument to Multinomial.
-type MultinomialAttr func(optionalAttr)
+// TruncatedNormalAttr is an optional argument to TruncatedNormal.
+type TruncatedNormalAttr func(optionalAttr)
 
-// MultinomialSeed sets the optional seed attribute to value.
+// TruncatedNormalSeed sets the optional seed attribute to value.
 //
-// value: If either seed or seed2 is set to be non-zero, the internal random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
 // If not specified, defaults to 0
-func MultinomialSeed(value int64) MultinomialAttr {
+func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
 	return func(m optionalAttr) {
 		m["seed"] = value
 	}
 }
 
-// MultinomialSeed2 sets the optional seed2 attribute to value.
+// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
 //
 // value: A second seed to avoid seed collision.
 // If not specified, defaults to 0
-func MultinomialSeed2(value int64) MultinomialAttr {
+func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
 	return func(m optionalAttr) {
 		m["seed2"] = value
 	}
 }
 
-// Draws samples from a multinomial distribution.
+// Outputs random values from a truncated normal distribution.
+//
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
+//
+// Returns A tensor of the specified shape filled with random truncated normal
+// values.
+func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TruncatedNormal",
+		Input: []tf.Input{
+			shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// FakeQuantWithMinMaxVarsPerChannelAttr is an optional argument to FakeQuantWithMinMaxVarsPerChannel.
+type FakeQuantWithMinMaxVarsPerChannelAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxVarsPerChannelNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsPerChannelNumBits(value int64) FakeQuantWithMinMaxVarsPerChannelAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// FakeQuantWithMinMaxVarsPerChannelNarrowRange sets the optional narrow_range attribute to value.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxVarsPerChannelNarrowRange(value bool) FakeQuantWithMinMaxVarsPerChannelAttr {
+	return func(m optionalAttr) {
+		m["narrow_range"] = value
+	}
+}
+
+// Fake-quantize the 'inputs' tensor of type float and one of the shapes: `[d]`,
 //
-// Arguments:
-//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-// represents the unnormalized log probabilities for all classes.
-//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+// `[b, d]` `[b, h, w, d]` via per-channel floats `min` and `max` of shape `[d]`
+// to 'outputs' tensor of same shape as `inputs`.
 //
-// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-// contains the drawn class labels with range `[0, num_classes)`.
-func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
+// `[min; max]` define the clamping range for the `inputs` data.
+// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
+// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
+// then de-quantized and output as floats in `[min; max]` interval.
+// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+//
+// This operation has a gradient and thus allows for training `min` and `max`
+// values.
+func FakeQuantWithMinMaxVarsPerChannel(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsPerChannelAttr) (outputs tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15330,9 +15353,9 @@ func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Multinomial",
+		Type: "FakeQuantWithMinMaxVarsPerChannel",
 		Input: []tf.Input{
-			logits, num_samples,
+			inputs, min, max,
 		},
 		Attrs: attrs,
 	}
@@ -15340,998 +15363,927 @@ func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
-type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
+// RandomShuffleAttr is an optional argument to RandomShuffle.
+type RandomShuffleAttr func(optionalAttr)
 
-// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+// RandomShuffleSeed sets the optional seed attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomShuffleSeed(value int64) RandomShuffleAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["seed"] = value
 	}
 }
 
-// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
+// RandomShuffleSeed2 sets the optional seed2 attribute to value.
 //
-// Returns the created operation.
-func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagradDA",
-		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
-		},
-		Attrs: attrs,
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleSeed2(value int64) RandomShuffleAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Computes softmax cross entropy cost and gradients to backpropagate.
+// Randomly shuffles a tensor along its first dimension.
 //
-// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
-// a matrix of label probabilities, but rather a single label per row
-// of features.  This label is considered to have probability 1.0 for the
-// given row.
+//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
+//   to one and only one `output[i]`. For example, a mapping that might occur for a
+//   3x2 tensor is:
 //
-// Inputs are the logits, not probabilities.
+// ```
+// [[1, 2],       [[5, 6],
+//  [3, 4],  ==>   [1, 2],
+//  [5, 6]]        [3, 4]]
+// ```
 //
 // Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size vector with values in [0, num_classes).
-// This is the label for the given minibatch entry.
+//	value: The tensor to be shuffled.
 //
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+// Returns A tensor of same shape and type as `value`, shuffled along its first
+// dimension.
+func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSoftmaxCrossEntropyWithLogits",
+		Type: "RandomShuffle",
 		Input: []tf.Input{
-			features, labels,
+			value,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// TensorSummaryAttr is an optional argument to TensorSummary.
-type TensorSummaryAttr func(optionalAttr)
+// OrderedMapIncompleteSizeAttr is an optional argument to OrderedMapIncompleteSize.
+type OrderedMapIncompleteSizeAttr func(optionalAttr)
 
-// TensorSummaryDescription sets the optional description attribute to value.
+// OrderedMapIncompleteSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: A json-encoded SummaryDescription proto.
-// If not specified, defaults to ""
-func TensorSummaryDescription(value string) TensorSummaryAttr {
+// REQUIRES: value >= 0
+func OrderedMapIncompleteSizeCapacity(value int64) OrderedMapIncompleteSizeAttr {
 	return func(m optionalAttr) {
-		m["description"] = value
+		m["capacity"] = value
 	}
 }
 
-// TensorSummaryLabels sets the optional labels attribute to value.
+// OrderedMapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// value: An unused list of strings.
-// If not specified, defaults to <>
-func TensorSummaryLabels(value []string) TensorSummaryAttr {
+// REQUIRES: value >= 0
+func OrderedMapIncompleteSizeMemoryLimit(value int64) OrderedMapIncompleteSizeAttr {
 	return func(m optionalAttr) {
-		m["labels"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// TensorSummaryDisplayName sets the optional display_name attribute to value.
-//
-// value: An unused string.
+// OrderedMapIncompleteSizeContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func TensorSummaryDisplayName(value string) TensorSummaryAttr {
+func OrderedMapIncompleteSizeContainer(value string) OrderedMapIncompleteSizeAttr {
 	return func(m optionalAttr) {
-		m["display_name"] = value
+		m["container"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with a tensor.
-//
-// This op is being phased out in favor of TensorSummaryV2, which lets callers pass
-// a tag as well as a serialized SummaryMetadata proto string that contains
-// plugin-specific data. We will keep this op to maintain backwards compatibility.
-//
-// Arguments:
-//	tensor: A tensor to serialize.
-func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
+// OrderedMapIncompleteSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapIncompleteSizeSharedName(value string) OrderedMapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of incomplete elements in the underlying container.
+func OrderedMapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapIncompleteSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorSummary",
-		Input: []tf.Input{
-			tensor,
-		},
+		Type: "OrderedMapIncompleteSize",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor.
-//
-// This operation folds the padded areas of `input` by `MirrorPad` according to the
-// `paddings` you specify. `paddings` must be the same as `paddings` argument
-// given to the corresponding `MirrorPad` op.
-//
-// The folded size of each dimension D of the output is:
-//
-// `input.dim_size(D) - paddings(D, 0) - paddings(D, 1)`
-//
-// For example:
-//
-// ```
-// # 't' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].
-// # 'paddings' is [[0, 1]], [0, 1]].
-// # 'mode' is SYMMETRIC.
-// # rank of 't' is 2.
-// pad(t, paddings) ==> [[ 1,  5]
-//                       [11, 28]]
-// ```
-//
-// Arguments:
-//	input: The input tensor to be folded.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	mode: The mode used in the `MirrorPad` op.
+// DecodeRawAttr is an optional argument to DecodeRaw.
+type DecodeRawAttr func(optionalAttr)
+
+// DecodeRawLittleEndian sets the optional little_endian attribute to value.
 //
-// Returns The folded tensor.
-func MirrorPadGrad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"mode": mode}
-	opspec := tf.OpSpec{
-		Type: "MirrorPadGrad",
-		Input: []tf.Input{
-			input, paddings,
-		},
-		Attrs: attrs,
+// value: Whether the input `bytes` are in little-endian order.
+// Ignored for `out_type` values that are stored in a single byte like
+// `uint8`.
+// If not specified, defaults to true
+func DecodeRawLittleEndian(value bool) DecodeRawAttr {
+	return func(m optionalAttr) {
+		m["little_endian"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the inverse permutation of a tensor.
-//
-// This operation computes the inverse of an index permutation. It takes a 1-D
-// integer tensor `x`, which represents the indices of a zero-based array, and
-// swaps each value with its index position. In other words, for an output tensor
-// `y` and an input tensor `x`, this operation computes the following:
-//
-// `y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
-//
-// The values must include 0. There can be no duplicate values or negative values.
-//
-// For example:
-//
-// ```
-// # tensor `x` is [3, 4, 0, 2, 1]
-// invert_permutation(x) ==> [2, 4, 3, 0, 1]
-// ```
+// Reinterpret the bytes of a string as a vector of numbers.
 //
 // Arguments:
-//	x: 1-D.
+//	bytes: All the elements must have the same length.
 //
-// Returns 1-D.
-func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
+//
+// Returns A Tensor with one more dimension than the input `bytes`.  The
+// added dimension will have size equal to the length of the elements
+// of `bytes` divided by the number of bytes to represent `out_type`.
+func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "InvertPermutation",
+		Type: "DecodeRaw",
 		Input: []tf.Input{
-			x,
+			bytes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reverses specific dimensions of a tensor.
+// Copy a tensor setting everything outside a central band in each innermost matrix
 //
-// Given a `tensor`, and a `bool` tensor `dims` representing the dimensions
-// of `tensor`, this operation reverses each dimension i of `tensor` where
-// `dims[i]` is `True`.
+// to zero.
 //
-// `tensor` can have up to 8 dimensions. The number of dimensions
-// of `tensor` must equal the number of elements in `dims`. In other words:
+// The `band` part is computed as follows:
+// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+// tensor with the same shape where
 //
-// `rank(tensor) = size(dims)`
+// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
+//
+// The indicator function
+//
+// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
+//                  (num_upper < 0 || (n-m) <= num_upper)`.
 //
 // For example:
 //
 // ```
-// # tensor 't' is [[[[ 0,  1,  2,  3],
-// #                  [ 4,  5,  6,  7],
-// #                  [ 8,  9, 10, 11]],
-// #                 [[12, 13, 14, 15],
-// #                  [16, 17, 18, 19],
-// #                  [20, 21, 22, 23]]]]
-// # tensor 't' shape is [1, 2, 3, 4]
+// # if 'input' is [[ 0,  1,  2, 3]
+//                  [-1,  0,  1, 2]
+//                  [-2, -1,  0, 1]
+//                  [-3, -2, -1, 0]],
 //
-// # 'dims' is [False, False, False, True]
-// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
-//                         [ 7,  6,  5,  4],
-//                         [ 11, 10, 9, 8]],
-//                        [[15, 14, 13, 12],
-//                         [19, 18, 17, 16],
-//                         [23, 22, 21, 20]]]]
+// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
+//                                        [-1,  0,  1, 2]
+//                                        [ 0, -1,  0, 1]
+//                                        [ 0,  0, -1, 0]],
 //
-// # 'dims' is [False, True, False, False]
-// reverse(t, dims) ==> [[[[12, 13, 14, 15],
-//                         [16, 17, 18, 19],
-//                         [20, 21, 22, 23]
-//                        [[ 0,  1,  2,  3],
-//                         [ 4,  5,  6,  7],
-//                         [ 8,  9, 10, 11]]]]
+// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
+//                                       [-1,  0,  1, 0]
+//                                       [-2, -1,  0, 1]
+//                                       [ 0, -2, -1, 0]]
+// ```
 //
-// # 'dims' is [False, False, True, False]
-// reverse(t, dims) ==> [[[[8, 9, 10, 11],
-//                         [4, 5, 6, 7],
-//                         [0, 1, 2, 3]]
-//                        [[20, 21, 22, 23],
-//                         [16, 17, 18, 19],
-//                         [12, 13, 14, 15]]]]
+// Useful special cases:
+//
+// ```
+//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
+//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
+//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
 // ```
 //
 // Arguments:
-//	tensor: Up to 8-D.
-//	dims: 1-D. The dimensions to reverse.
+//	input: Rank `k` tensor.
+//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
+// lower triangle.
+//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
+// entire upper triangle.
 //
-// Returns The same shape as `tensor`.
-func Reverse(scope *Scope, tensor tf.Output, dims tf.Output) (output tf.Output) {
+// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
+func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Reverse",
+		Type: "MatrixBandPart",
 		Input: []tf.Input{
-			tensor, dims,
+			input, num_lower, num_upper,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Fills empty rows in the input 2-D `SparseTensor` with a default value.
-//
-// The input `SparseTensor` is represented via the tuple of inputs
-// (`indices`, `values`, `dense_shape`).  The output `SparseTensor` has the
-// same `dense_shape` but with indices `output_indices` and values
-// `output_values`.
-//
-// This op inserts a single entry for every row that doesn't have any values.
-// The index is created as `[row, 0, ..., 0]` and the inserted value
-// is `default_value`.
-//
-// For example, suppose `sp_input` has shape `[5, 6]` and non-empty values:
-//
-//     [0, 1]: a
-//     [0, 3]: b
-//     [2, 0]: c
-//     [3, 1]: d
-//
-// Rows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values:
-//
-//     [0, 1]: a
-//     [0, 3]: b
-//     [1, 0]: default_value
-//     [2, 0]: c
-//     [3, 1]: d
-//     [4, 0]: default_value
-//
-// The output `SparseTensor` will be in row-major order and will have the
-// same shape as the input.
-//
-// This op also returns an indicator vector shaped `[dense_shape[0]]` such that
+// DecodeCompressedAttr is an optional argument to DecodeCompressed.
+type DecodeCompressedAttr func(optionalAttr)
+
+// DecodeCompressedCompressionType sets the optional compression_type attribute to value.
 //
-//     empty_row_indicator[i] = True iff row i was an empty row.
+// value: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+// If not specified, defaults to ""
+func DecodeCompressedCompressionType(value string) DecodeCompressedAttr {
+	return func(m optionalAttr) {
+		m["compression_type"] = value
+	}
+}
+
+// Decompress strings.
 //
-// And a reverse index map vector shaped `[indices.shape[0]]` that is used during
-// backpropagation,
+// This op decompresses each element of the `bytes` input `Tensor`, which
+// is assumed to be compressed using the given `compression_type`.
 //
-//     reverse_index_map[j] = out_j s.t. indices[j, :] == output_indices[out_j, :]
+// The `output` is a string `Tensor` of the same shape as `bytes`,
+// each element containing the decompressed data from the corresponding
+// element in `bytes`.
 //
 // Arguments:
-//	indices: 2-D. the indices of the sparse tensor.
-//	values: 1-D. the values of the sparse tensor.
-//	dense_shape: 1-D. the shape of the sparse tensor.
-//	default_value: 0-D. default value to insert into location `[row, 0, ..., 0]`
-//   for rows missing from the input sparse tensor.
-// output indices: 2-D. the indices of the filled sparse tensor.
+//	bytes: A Tensor of string which is compressed.
 //
-// Returns 1-D. the values of the filled sparse tensor.1-D. whether the dense row was missing in the
-// input sparse tensor.1-D. a map from the input indices to the output indices.
-func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, default_value tf.Output) (output_indices tf.Output, output_values tf.Output, empty_row_indicator tf.Output, reverse_index_map tf.Output) {
+// Returns A Tensor with the same shape as input `bytes`, uncompressed
+// from bytes.
+func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompressedAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseFillEmptyRows",
+		Type: "DecodeCompressed",
 		Input: []tf.Input{
-			indices, values, dense_shape, default_value,
+			bytes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	return op.Output(0)
 }
 
-// Conv2DAttr is an optional argument to Conv2D.
-type Conv2DAttr func(optionalAttr)
+// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
+type WholeFileReaderV2Attr func(optionalAttr)
 
-// Conv2DUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr {
+// WholeFileReaderV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
 	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
+		m["container"] = value
 	}
 }
 
-// Conv2DDataFormat sets the optional data_format attribute to value.
+// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func Conv2DDataFormat(value string) Conv2DAttr {
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Computes a 2-D convolution given 4-D `input` and `filter` tensors.
-//
-// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-// and a filter / kernel tensor of shape
-// `[filter_height, filter_width, in_channels, out_channels]`, this op
-// performs the following:
-//
-// 1. Flattens the filter to a 2-D matrix with shape
-//    `[filter_height * filter_width * in_channels, output_channels]`.
-// 2. Extracts image patches from the input tensor to form a *virtual*
-//    tensor of shape `[batch, out_height, out_width,
-//    filter_height * filter_width * in_channels]`.
-// 3. For each patch, right-multiplies the filter matrix and the image patch
-//    vector.
-//
-// In detail, with the default NHWC format,
+// A Reader that outputs the entire contents of a file as a value.
 //
-//     output[b, i, j, k] =
-//         sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
-//                         filter[di, dj, q, k]
+// To use, enqueue filenames in a Queue.  The output of ReaderRead will
+// be a filename (key) and the contents of that file (value).
 //
-// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+// Returns The handle to reference the Reader.
+func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "WholeFileReaderV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Transforms a tf.Example proto (as a string) into typed tensors.
 //
 // Arguments:
-//	input: A 4-D tensor. The dimension order is interpreted according to the value
-// of `data_format`, see below for details.
-//	filter: A 4-D tensor of shape
-// `[filter_height, filter_width, in_channels, out_channels]`
-//	strides: 1-D tensor of length 4.  The stride of the sliding window for each
-// dimension of `input`. The dimension order is determined by the value of
-//   `data_format`, see below for details.
-//	padding: The type of padding algorithm to use.
-//
-// Returns A 4-D tensor. The dimension order is determined by the value of
-// `data_format`, see below for details.
-func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv2DAttr) (output tf.Output) {
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	dense_defaults: A list of Tensors (some may be empty), whose length matches
+// the length of `dense_keys`. dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	num_sparse: The number of sparse features to be parsed from the example. This
+// must match the lengths of `sparse_keys` and `sparse_types`.
+//	sparse_keys: A list of `num_sparse` strings.
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: The keys expected in the Examples' features associated with dense
+// values.
+//	sparse_types: A list of `num_sparse` types; the data types of data in each
+// Feature given in sparse_keys.
+// Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: The shapes of data in each Feature given in dense_keys.
+// The length of this list must match the length of `dense_keys`.  The
+// number of elements in the Feature corresponding to dense_key[j] must
+// always equal dense_shapes[j].NumEntries().  If dense_shapes[j] ==
+// (D0, D1, ..., DN) then the shape of output Tensor dense_values[j]
+// will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1,
+// ..., DN), the shape of the output Tensor dense_values[j] will be (M,
+// D1, .., DN), where M is the number of blocks of elements of length
+// D1 * .... * DN, in the input.
+func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.Output, num_sparse int64, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_sparse": num_sparse, "sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes}
+	opspec := tf.OpSpec{
+		Type: "ParseSingleExample",
+		Input: []tf.Input{
+			serialized, tf.OutputList(dense_defaults),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
+	var idx int
+	var err error
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
+}
+
+// Computes acos of x element-wise.
+func Acos(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2D",
+		Type: "Acos",
 		Input: []tf.Input{
-			input, filter,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// VariableShapeAttr is an optional argument to VariableShape.
-type VariableShapeAttr func(optionalAttr)
+// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
+type MaxPoolWithArgmaxAttr func(optionalAttr)
 
-// VariableShapeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func VariableShapeOutType(value tf.DataType) VariableShapeAttr {
+// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
+// If not specified, defaults to DT_INT64
+func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["Targmax"] = value
 	}
 }
 
-// Returns the shape of the variable pointed to by `resource`.
+// Performs max pooling on the input and outputs both max values and indices.
 //
-// This operation returns a 1-D integer tensor representing the shape of `input`.
+// The indices in `argmax` are flattened, so that a maximum value at position
+// `[b, y, x, c]` becomes flattened index
+// `((b * height + y) * width + x) * channels + c`.
 //
-// For example:
+// The indices returned are always in `[0, height) x [0, width)` before flattening,
+// even if padding is involved and the mathematically correct answer is outside
+// (either negative or too large).  This is a bug, but fixing it is difficult to do
+// in a safe backwards compatible way, especially due to flattening.
 //
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
-// ```
-func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) (output tf.Output) {
+// Arguments:
+//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
+func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "VariableShape",
+		Type: "MaxPoolWithArgmax",
 		Input: []tf.Input{
 			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
+//
+// Arguments:
+//	serialized: A scalar string containing a serialized TensorProto proto.
+//	out_type: The type of the serialized tensor.  The provided type must match the
+// type of the serialized tensor and no implicit conversion will take place.
+//
+// Returns A Tensor of type `out_type`.
+func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"out_type": out_type}
+	opspec := tf.OpSpec{
+		Type: "ParseTensor",
+		Input: []tf.Input{
+			serialized,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StringJoinAttr is an optional argument to StringJoin.
-type StringJoinAttr func(optionalAttr)
+// MapClearAttr is an optional argument to MapClear.
+type MapClearAttr func(optionalAttr)
 
-// StringJoinSeparator sets the optional separator attribute to value.
+// MapClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: string, an optional join separator.
-// If not specified, defaults to ""
-func StringJoinSeparator(value string) StringJoinAttr {
+// REQUIRES: value >= 0
+func MapClearCapacity(value int64) MapClearAttr {
 	return func(m optionalAttr) {
-		m["separator"] = value
+		m["capacity"] = value
 	}
 }
 
-// Joins the strings in the given list of string tensors into one tensor;
+// MapClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// with the given separator (default is an empty separator).
+// REQUIRES: value >= 0
+func MapClearMemoryLimit(value int64) MapClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapClearContainer(value string) MapClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapClearSharedName(value string) MapClearAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes all elements in the underlying container.
 //
-// Arguments:
-//	inputs: A list of string tensors.  The tensors must all have the same shape,
-// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
-// of non-scalar inputs.
-func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
+// Returns the created operation.
+func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringJoin",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
+		Type: "MapClear",
+
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Transforms a vector of brain.Example protos (as strings) into typed tensors.
+// DecodeCSVAttr is an optional argument to DecodeCSV.
+type DecodeCSVAttr func(optionalAttr)
+
+// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
+//
+// value: char delimiter to separate fields in a record.
+// If not specified, defaults to ","
+func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["field_delim"] = value
+	}
+}
+
+// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
+//
+// value: If false, treats double quotation marks as regular
+// characters inside of the string fields (ignoring RFC 4180, Section 2,
+// Bullet 5).
+// If not specified, defaults to true
+func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["use_quote_delim"] = value
+	}
+}
+
+// DecodeCSVNaValue sets the optional na_value attribute to value.
+//
+// value: Additional string to recognize as NA/NaN.
+// If not specified, defaults to ""
+func DecodeCSVNaValue(value string) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["na_value"] = value
+	}
+}
+
+// Convert CSV records to tensors. Each column maps to one tensor.
+//
+// RFC 4180 format is expected for the CSV records.
+// (https://tools.ietf.org/html/rfc4180)
+// Note that we allow leading and trailing spaces with int or float field.
 //
 // Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	names: A vector containing the names of the serialized protos.
-// May contain, for example, table key (descriptive) names for the
-// corresponding serialized protos.  These are purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty vector if no names are available.
-// If non-empty, this vector must be the same length as "serialized".
-//	sparse_keys: A list of Nsparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: A list of Ndense string Tensors (scalars).
-// The keys expected in the Examples' features associated with dense values.
-//	dense_defaults: A list of Ndense Tensors (some may be empty).
-// dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	sparse_types: A list of Nsparse types; the data types of data in each Feature
-// given in sparse_keys.
-// Currently the ParseExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
-// given in dense_keys.
-// The number of elements in the Feature corresponding to dense_key[j]
-// must always equal dense_shapes[j].NumEntries().
-// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
-// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
-// The dense outputs are just the inputs row-stacked by batch.
-// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
-// the shape of the output Tensor dense_values[j] will be
-// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
-// of elements of length D1 * .... * DN, across all minibatch entries
-// in the input.  Any minibatch entry with less than M blocks of elements of
-// length D1 * ... * DN will be padded with the corresponding default_value
-// scalar element along the second dimension.
-func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+//	records: Each string is a record/row in the csv and all records should have
+// the same format.
+//	record_defaults: One tensor per column of the input record, with either a
+// scalar default value for that column or empty if the column is required.
+//
+// Returns Each tensor will have the same shape as records.
+func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ParseExample",
+		Type: "DecodeCSV",
 		Input: []tf.Input{
-			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
+			records, tf.OutputList(record_defaults),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseExample", err)
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
 		return
 	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("DecodeCSV", err)
 		return
 	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
+	return output
 }
 
-// Compute the pairwise cross product.
+// Returns the rank of a tensor.
 //
-// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
-// or any shape where the innermost dimension is 3. In the latter case, each pair
-// of corresponding 3-element vectors is cross-multiplied independently.
+// This operation returns an integer representing the rank of `input`.
 //
-// Arguments:
-//	a: A tensor containing 3-element vectors.
-//	b: Another tensor, of same type and shape as `a`.
+// For example:
 //
-// Returns Pairwise cross product of the vectors in `a` and `b`.
-func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// # shape of tensor 't' is [2, 2, 3]
+// rank(t) ==> 3
+// ```
+//
+// **Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
+// of a tensor is the number of indices required to uniquely select each element
+// of the tensor. Rank is also known as "order", "degree", or "ndims."
+func Rank(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cross",
+		Type: "Rank",
 		Input: []tf.Input{
-			a, b,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse 2D real-valued fast Fourier transform.
-//
-// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 2 dimensions of `input`.
-//
-// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
-//
-// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
-//
-// Returns A float32 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 2D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft2
-// @end_compatibility
-func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Output a fact about factorials.
+func Fact(scope *Scope) (fact tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT2D",
-		Input: []tf.Input{
-			input, fft_length,
-		},
+		Type: "Fact",
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics in that
+// Makes its input available to the next iteration.
 //
-// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
-// y + truncate_mod(x, y) = x`.
+// Arguments:
+//	data: The tensor to be made available to the next iteration.
 //
-// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Returns The same tensor as `data`.
+func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TruncateMod",
+		Type: "NextIteration",
 		Input: []tf.Input{
-			x, y,
+			data,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
-type ResourceApplyAdagradAttr func(optionalAttr)
-
-// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
+// Creates a dataset that skips `count` elements from the `input_dataset`.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the adagrad scheme.
+// Arguments:
 //
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be skipped.  If count is -1, skips everything.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
 //
-// Returns the created operation.
-func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
+func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagrad",
+		Type: "SkipDataset",
 		Input: []tf.Input{
-			var_, accum, lr, grad,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// SparseReduceSumAttr is an optional argument to SparseReduceSum.
-type SparseReduceSumAttr func(optionalAttr)
-
-// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the sum of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
-// instead of a sparse one.
-//
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
-//
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
-//
-// Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-//
-// Returns `R-K`-D.  The reduced Tensor.
-func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
+// Computes hyperbolic tangent of `x` element-wise.
+func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceSum",
+		Type: "Tanh",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
-type MaxPool3DGradAttr func(optionalAttr)
-
-// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
+// Computes the maximum along segments of a tensor.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of max pooling function.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Computes a tensor such that
+// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the max is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
+// </div>
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3DGrad",
+		Type: "SegmentMax",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			data, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
-type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
-
-// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
+type AvgPoolGradAttr func(optionalAttr)
 
-// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// AvgPoolGradDataFormat sets the optional data_format attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
-//
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// Computes gradients of the average pooling function.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
+// the output of `avg_pool`.
+//	ksize: The size of the sliding window for each dimension of the input.
+//	strides: The stride of the sliding window for each dimension of the input.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
+func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ThreadUnsafeUnigramCandidateSampler",
+		Type: "AvgPoolGrad",
 		Input: []tf.Input{
-			true_classes,
+			orig_input_shape, grad,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
-type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
+// StageClearAttr is an optional argument to StageClear.
+type StageClearAttr func(optionalAttr)
 
-// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+// StageClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
+// REQUIRES: value >= 0
+func StageClearCapacity(value int64) StageClearAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["capacity"] = value
 	}
 }
 
-// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
-//
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// prox_v = var
-// prox_v -= lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+// StageClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+// REQUIRES: value >= 0
+func StageClearMemoryLimit(value int64) StageClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// StageClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StageClearContainer(value string) StageClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// StageClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StageClearSharedName(value string) StageClearAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes all elements in the underlying container.
 //
 // Returns the created operation.
-func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
+func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalAdagrad",
-		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad, indices,
-		},
+		Type: "StageClear",
+
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Store the input tensor in the state of the current session.
-//
-// Arguments:
-//	value: The tensor to be stored.
+// ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
+type ComputeAccidentalHitsAttr func(optionalAttr)
+
+// ComputeAccidentalHitsSeed sets the optional seed attribute to value.
 //
-// Returns The handle for the tensor stored in the session state, represented
-// as a string.
-func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ComputeAccidentalHitsSeed(value int64) ComputeAccidentalHitsAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "GetSessionHandle",
-		Input: []tf.Input{
-			value,
-		},
+}
+
+// ComputeAccidentalHitsSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ComputeAccidentalHitsSeed2(value int64) ComputeAccidentalHitsAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Decode web-safe base64-encoded strings.
+// Computes the ids of the positions in sampled_candidates that match true_labels.
 //
-// Input may or may not have padding at the end. See EncodeBase64 for padding.
-// Web-safe means that input must use - and _ instead of + and /.
+// When doing log-odds NCE, the result of this op should be passed through a
+// SparseToDense op, then added to the logits of the sampled candidates. This has
+// the effect of 'removing' the sampled labels that match the true labels by
+// making the classifier sure that they are sampled labels.
 //
 // Arguments:
-//	input: Base64 strings to decode.
+//	true_classes: The true_classes output of UnpackSparseLabels.
+//	sampled_candidates: The sampled_candidates output of CandidateSampler.
+//	num_true: Number of true labels per context.
 //
-// Returns Decoded strings.
-func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns A vector of indices corresponding to rows of true_candidates.A vector of IDs of positions in sampled_candidates that match a true_label
+// for the row with the corresponding index in indices.A vector of the same length as indices and ids, in which each element
+// is -FLOAT_MAX.
+func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candidates tf.Output, num_true int64, optional ...ComputeAccidentalHitsAttr) (indices tf.Output, ids tf.Output, weights tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_true": num_true}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DecodeBase64",
+		Type: "ComputeAccidentalHits",
 		Input: []tf.Input{
-			input,
+			true_classes, sampled_candidates,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes hyperbolic tangent of `x` element-wise.
-func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes sigmoid of `x` element-wise.
+//
+// Specifically, `y = 1 / (1 + exp(-x))`.
+func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Tanh",
+		Type: "Sigmoid",
 		Input: []tf.Input{
 			x,
 		},
@@ -16340,215 +16292,200 @@ func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Restores tensors from a V2 checkpoint.
-//
-// For backward compatibility with the V1 format, this Op currently allows
-// restoring from a V1 checkpoint as well:
-//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
-//     if found proceed to read it as a V2 checkpoint;
-//   - Otherwise the V1 read path is invoked.
-// Relying on this behavior is not recommended, as the ability to fall back to read
-// V1 might be deprecated and eventually removed.
-//
-// By default, restores the named tensors in full.  If the caller wishes to restore
-// specific slices of stored tensors, "shape_and_slices" should be non-empty
-// strings and correspondingly well-formed.
-//
-// Callers must ensure all the named tensors are indeed stored in the checkpoint.
-//
-// Arguments:
-//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
-//	tensor_names: shape {N}.  The names of the tensors to be restored.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
-// Empty strings indicate that they are non-partitioned tensors.
-//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
-// those stored in the checkpoint.
+// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
+type RandomStandardNormalAttr func(optionalAttr)
+
+// RandomStandardNormalSeed sets the optional seed attribute to value.
 //
-// Returns shape {N}.  The restored tensors, whose shapes are read from the
-// checkpoint directly.
-func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	opspec := tf.OpSpec{
-		Type: "RestoreV2",
-		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	var idx int
-	var err error
-	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
-		scope.UpdateErr("RestoreV2", err)
-		return
+}
+
+// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	return tensors
 }
 
-// Returns x / y element-wise for integer types.
+// Outputs random values from a normal distribution.
 //
-// Truncation designates that negative numbers will round fractional quantities
-// toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different
-// than Python semantics. See `FloorDiv` for a division function that matches
-// Python Semantics.
+// The generated values will have mean 0 and standard deviation 1.
 //
-// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
+//
+// Returns A tensor of the specified shape filled with random normal values.
+func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TruncateDiv",
+		Type: "RandomStandardNormal",
 		Input: []tf.Input{
-			x, y,
+			shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
-type SampleDistortedBoundingBoxAttr func(optionalAttr)
+// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
+type FusedBatchNormAttr func(optionalAttr)
 
-// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
+// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
 //
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["epsilon"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
+// FusedBatchNormDataFormat sets the optional data_format attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
+// FusedBatchNormIsTraining sets the optional is_training attribute to value.
 //
-// value: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
-// If not specified, defaults to 0.1
-func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
 	return func(m optionalAttr) {
-		m["min_object_covered"] = value
+		m["is_training"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
+// Batch normalization.
 //
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+//
+// Arguments:
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
+//
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be reused in the gradient computation.
+func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FusedBatchNorm",
+		Input: []tf.Input{
+			x, scale, offset, mean, variance,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+// Computes tan of x element-wise.
+func Tan(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Tan",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// FusedBatchNormV2Attr is an optional argument to FusedBatchNormV2.
+type FusedBatchNormV2Attr func(optionalAttr)
+
+// FusedBatchNormV2Epsilon sets the optional epsilon attribute to value.
 //
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within in this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormV2Epsilon(value float32) FusedBatchNormV2Attr {
 	return func(m optionalAttr) {
-		m["area_range"] = value
+		m["epsilon"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
+// FusedBatchNormV2DataFormat sets the optional data_format attribute to value.
 //
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormV2DataFormat(value string) FusedBatchNormV2Attr {
 	return func(m optionalAttr) {
-		m["max_attempts"] = value
+		m["data_format"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+// FusedBatchNormV2IsTraining sets the optional is_training attribute to value.
 //
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
-// If not specified, defaults to false
-func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormV2IsTraining(value bool) FusedBatchNormV2Attr {
 	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
+		m["is_training"] = value
 	}
 }
 
-// Generate a single randomly distorted bounding box for an image.
-//
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
-//
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
-//
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example,
-//
-// ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
-//
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.image_summary('images_with_box', image_with_box)
-//
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
-// ```
+// Batch normalization.
 //
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
 //
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be reused in the gradient computation.
+func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormV2Attr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16557,75 +16494,58 @@ func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_box
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBox",
+		Type: "FusedBatchNormV2",
 		Input: []tf.Input{
-			image_size, bounding_boxes,
+			x, scale, offset, mean, variance,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Returns the truth value of (x > y) element-wise.
+// MultinomialAttr is an optional argument to Multinomial.
+type MultinomialAttr func(optionalAttr)
+
+// MultinomialSeed sets the optional seed attribute to value.
 //
-// *NOTE*: `Greater` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If either seed or seed2 is set to be non-zero, the internal random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func MultinomialSeed(value int64) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Greater",
-		Input: []tf.Input{
-			x, y,
-		},
+}
+
+// MultinomialSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func MultinomialSeed2(value int64) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
-type ResourceSparseApplyRMSPropAttr func(optionalAttr)
-
-// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
+// MultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["output_dtype"] = value
 	}
 }
 
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
-//
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// Draws samples from a multinomial distribution.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
 //
-// Returns the created operation.
-func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16634,129 +16554,193 @@ func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyRMSProp",
+		Type: "Multinomial",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			logits, num_samples,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns which elements of x are Inf.
+// EncodeJpegAttr is an optional argument to EncodeJpeg.
+type EncodeJpegAttr func(optionalAttr)
+
+// EncodeJpegFormat sets the optional format attribute to value.
 //
-// @compatibility(numpy)
-// Equivalent to np.isinf
-// @end_compatibility
-func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: Per pixel image format.
+// If not specified, defaults to ""
+func EncodeJpegFormat(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["format"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "IsInf",
-		Input: []tf.Input{
-			x,
-		},
+}
+
+// EncodeJpegQuality sets the optional quality attribute to value.
+//
+// value: Quality of the compression from 0 to 100 (higher is better and slower).
+// If not specified, defaults to 95
+func EncodeJpegQuality(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["quality"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
-type ResourceSparseApplyFtrlAttr func(optionalAttr)
+// EncodeJpegProgressive sets the optional progressive attribute to value.
+//
+// value: If True, create a JPEG that loads progressively (coarse to fine).
+// If not specified, defaults to false
+func EncodeJpegProgressive(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["progressive"] = value
+	}
+}
 
-// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
+// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: If True, spend CPU/RAM to reduce size with no quality change.
 // If not specified, defaults to false
-func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
+func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["optimize_size"] = value
 	}
 }
 
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
 //
-// That is for rows we have grad for, we update var, accum and linear as follows:
-// accum_new = accum + grad * grad
-// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
+// If not specified, defaults to true
+func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["chroma_downsampling"] = value
+	}
+}
+
+// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
+// value: Unit used to specify `x_density` and `y_density`:
+// pixels per inch (`'in'`) or centimeter (`'cm'`).
+// If not specified, defaults to "in"
+func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["density_unit"] = value
+	}
+}
+
+// EncodeJpegXDensity sets the optional x_density attribute to value.
 //
-// Returns the created operation.
-func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
+// value: Horizontal pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegXDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["x_density"] = value
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+}
+
+// EncodeJpegYDensity sets the optional y_density attribute to value.
+//
+// value: Vertical pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegYDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["y_density"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrl",
-		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
-		},
-		Attrs: attrs,
+}
+
+// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
+//
+// value: If not empty, embed this XMP metadata in the image header.
+// If not specified, defaults to ""
+func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["xmp_metadata"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Component-wise multiplies a SparseTensor by a dense Tensor.
+// JPEG-encode an image.
 //
-// The output locations corresponding to the implicitly zero elements in the sparse
-// tensor will be zero (i.e., will not take up storage space), regardless of the
-// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
+// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
 //
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
+// The attr `format` can be used to override the color format of the encoded
+// output.  Values can be:
+//
+// *   `''`: Use a default format based on the number of channels in the image.
+// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
+//     of `image` must be 1.
+// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
+//     of `image` must be 3.
+//
+// If `format` is not specified or is the empty string, a default format is picked
+// in function of the number of channels in `image`:
+//
+// *   1: Output a grayscale image.
+// *   3: Output an RGB image.
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+//	image: 3-D with shape `[height, width, channels]`.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// Returns 0-D. JPEG-encoded image.
+func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseMul",
+		Type: "EncodeJpeg",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			image,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that emits `components` as a tuple of tensors once.
-func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+// MaxPoolGradAttr is an optional argument to MaxPoolGrad.
+type MaxPoolGradAttr func(optionalAttr)
+
+// MaxPoolGradDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradDataFormat(value string) MaxPoolGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of the maxpooling function.
+//
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients w.r.t. the input to `max_pool`.
+func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorDataset",
+		Type: "MaxPoolGrad",
 		Input: []tf.Input{
-			tf.OutputList(components),
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
@@ -16764,49 +16748,66 @@ func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shap
 	return op.Output(0)
 }
 
-// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
-type NonMaxSuppressionAttr func(optionalAttr)
+// CropAndResizeAttr is an optional argument to CropAndResize.
+type CropAndResizeAttr func(optionalAttr)
 
-// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
+// CropAndResizeMethod sets the optional method attribute to value.
 //
-// value: A float representing the threshold for deciding whether boxes
-// overlap too much with respect to IOU.
-// If not specified, defaults to 0.5
-func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeMethod(value string) CropAndResizeAttr {
 	return func(m optionalAttr) {
-		m["iou_threshold"] = value
+		m["method"] = value
 	}
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
+// CropAndResizeExtrapolationValue sets the optional extrapolation_value attribute to value.
 //
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression(
-//       boxes, scores, max_output_size, iou_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// value: Value used for extrapolation, when applicable.
+// If not specified, defaults to 0
+func CropAndResizeExtrapolationValue(value float32) CropAndResizeAttr {
+	return func(m optionalAttr) {
+		m["extrapolation_value"] = value
+	}
+}
+
+// Extracts crops from the input image tensor and bilinearly resizes them (possibly
+//
+// with aspect ratio change) to a common output size specified by `crop_size`. This
+// is more general than the `crop_to_bounding_box` op which extracts a fixed size
+// slice from the input image and does not allow resizing or aspect ratio change.
+//
+// Returns a tensor with `crops` from the input `image` at positions defined at the
+// bounding box locations in `boxes`. The cropped boxes are all resized (with
+// bilinear interpolation) to a fixed `size = [crop_height, crop_width]`. The
+// result is a 4-D tensor `[num_boxes, crop_height, crop_width, depth]`. The
+// resizing is corner aligned. In particular, if `boxes = [[0, 0, 1, 1]]`, the
+// method will give identical results to using `tf.image.resize_bilinear()`
+// with `align_corners=True`.
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
+//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+// Both `image_height` and `image_width` need to be positive.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1]` in image height coordinates. We do allow `y1` > `y2`, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
+// cropped image patches are resized to this size. The aspect ratio of the image
+// content is not preserved. Both `crop_height` and `crop_width` need to be
+// positive.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
+// Returns A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Output, crop_size tf.Output, optional ...CropAndResizeAttr) (crops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16815,9 +16816,9 @@ func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppression",
+		Type: "CropAndResize",
 		Input: []tf.Input{
-			boxes, scores, max_output_size,
+			image, boxes, box_ind, crop_size,
 		},
 		Attrs: attrs,
 	}
@@ -16825,38 +16826,38 @@ func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_outp
 	return op.Output(0)
 }
 
-// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
-type ResourceApplyAdadeltaAttr func(optionalAttr)
+// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
+type ResourceApplyPowerSignAttr func(optionalAttr)
 
-// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
 //
-// value: If True, updating of the var, accum and update_accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
+func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the adadelta scheme.
+// Update '*var' according to the AddSign update.
 //
-// accum = rho() * accum + (1 - rho()) * grad.square();
-// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
-// update_accum = rho() * update_accum + (1 - rho()) * update.square();
-// var -= update;
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
+// variable <- variable - lr_t * update
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	accum_update: Should be from a Variable().
+//	m: Should be from a Variable().
 //	lr: Scaling factor. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
+//	logbase: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
 //	grad: The gradient.
 //
 // Returns the created operation.
-func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
+func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16865,216 +16866,201 @@ func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdadelta",
+		Type: "ResourceApplyPowerSign",
 		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad,
+			var_, m, lr, logbase, sign_decay, beta, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// StageSizeAttr is an optional argument to StageSize.
-type StageSizeAttr func(optionalAttr)
+// Deprecated. Disallowed in GraphDef version >= 2.
+//
+// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
+func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AdjustContrast",
+		Input: []tf.Input{
+			images, contrast_factor, min_value, max_value,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// StageSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Table initializer that takes two tensors for keys and values respectively.
 //
-// REQUIRES: value >= 0
-func StageSizeCapacity(value int64) StageSizeAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+// Arguments:
+//	table_handle: Handle to a table which will be initialized.
+//	keys: Keys of type Tkey.
+//	values: Values of type Tval.
+//
+// Returns the created operation.
+func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "InitializeTableV2",
+		Input: []tf.Input{
+			table_handle, keys, values,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// StageSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// PrintAttr is an optional argument to Print.
+type PrintAttr func(optionalAttr)
+
+// PrintMessage sets the optional message attribute to value.
 //
-// REQUIRES: value >= 0
-func StageSizeMemoryLimit(value int64) StageSizeAttr {
+// value: A string, prefix of the error message.
+// If not specified, defaults to ""
+func PrintMessage(value string) PrintAttr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["message"] = value
 	}
 }
 
-// StageSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StageSizeContainer(value string) StageSizeAttr {
+// PrintFirstN sets the optional first_n attribute to value.
+//
+// value: Only log `first_n` number of times. -1 disables logging.
+// If not specified, defaults to -1
+func PrintFirstN(value int64) PrintAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["first_n"] = value
 	}
 }
 
-// StageSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StageSizeSharedName(value string) StageSizeAttr {
+// PrintSummarize sets the optional summarize attribute to value.
+//
+// value: Only print this many entries of each tensor.
+// If not specified, defaults to 3
+func PrintSummarize(value int64) PrintAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["summarize"] = value
 	}
 }
 
-// Op returns the number of elements in the underlying container.
-func StageSize(scope *Scope, dtypes []tf.DataType, optional ...StageSizeAttr) (size tf.Output) {
+// Prints a list of tensors.
+//
+// Passes `input` through to `output` and prints `data` when evaluating.
+//
+// Arguments:
+//	input: The tensor passed to `output`
+//	data: A list of tensors to print out when op is evaluated.
+//
+// Returns = The unmodified `input` tensor
+func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StageSize",
-
+		Type: "Print",
+		Input: []tf.Input{
+			input, tf.OutputList(data),
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SizeAttr is an optional argument to Size.
-type SizeAttr func(optionalAttr)
-
-// SizeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func SizeOutType(value tf.DataType) SizeAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Returns the size of a tensor.
-//
-// This operation returns an integer representing the number of elements in
-// `input`.
-//
-// For example:
+// Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
 //
-// ```
-// # 't' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]
-// size(t) ==> 12
-// ```
-func Size(scope *Scope, input tf.Output, optional ...SizeAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+// Arguments:
+//	tag: A string attached to this summary. Used for organization in TensorBoard.
+//	tensor: A tensor to serialize.
+//	serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin
+// data.
+func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_summary_metadata tf.Output) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Size",
+		Type: "TensorSummaryV2",
 		Input: []tf.Input{
-			input,
+			tag, tensor, serialized_summary_metadata,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
-type ResourceApplyRMSPropAttr func(optionalAttr)
-
-// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
-//
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// Creates a dataset that asynchronously prefetches elements from `input_dataset`.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
 //
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
+//	buffer_size: The maximum number of elements to buffer in an iterator over
+// this dataset.
 //
-// Returns the created operation.
-func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
+//
+func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyRMSProp",
+		Type: "PrefetchDataset",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad,
+			input_dataset, buffer_size,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
-type ResourceApplyAdamAttr func(optionalAttr)
+// TensorSummaryAttr is an optional argument to TensorSummary.
+type TensorSummaryAttr func(optionalAttr)
 
-// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
+// TensorSummaryDescription sets the optional description attribute to value.
 //
-// value: If `True`, updating of the var, m, and v tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
+// value: A json-encoded SummaryDescription proto.
+// If not specified, defaults to ""
+func TensorSummaryDescription(value string) TensorSummaryAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["description"] = value
 	}
 }
 
-// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
+// TensorSummaryLabels sets the optional labels attribute to value.
 //
-// value: If `True`, uses the nesterov update.
-// If not specified, defaults to false
-func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
+// value: An unused list of strings.
+// If not specified, defaults to <>
+func TensorSummaryLabels(value []string) TensorSummaryAttr {
 	return func(m optionalAttr) {
-		m["use_nesterov"] = value
+		m["labels"] = value
 	}
 }
 
-// Update '*var' according to the Adam algorithm.
+// TensorSummaryDisplayName sets the optional display_name attribute to value.
 //
-// lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
-// v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
-// variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
+// value: An unused string.
+// If not specified, defaults to ""
+func TensorSummaryDisplayName(value string) TensorSummaryAttr {
+	return func(m optionalAttr) {
+		m["display_name"] = value
+	}
+}
+
+// Outputs a `Summary` protocol buffer with a tensor.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	beta1_power: Must be a scalar.
-//	beta2_power: Must be a scalar.
-//	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
+// This op is being phased out in favor of TensorSummaryV2, which lets callers pass
+// a tag as well as a serialized SummaryMetadata proto string that contains
+// plugin-specific data. We will keep this op to maintain backwards compatibility.
 //
-// Returns the created operation.
-func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
+// Arguments:
+//	tensor: A tensor to serialize.
+func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17083,135 +17069,227 @@ func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, b
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdam",
+		Type: "TensorSummary",
 		Input: []tf.Input{
-			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
+			tensor,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// 3D fast Fourier transform.
+// Computes the gradient for the tanh of `x` wrt its input.
 //
-// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
-// dimensions of `input`.
+// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
+// is the corresponding input gradient.
+func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TanhGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Outputs a `Summary` protocol buffer with scalar values.
 //
-// Arguments:
-//	input: A complex64 tensor.
+// The input `tags` and `values` must have the same shape.  The generated summary
+// has a summary value for each tag-value pair in `tags` and `values`.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their 3D Fourier transform.
+// Arguments:
+//	tags: Tags for the summary.
+//	values: Same shape as `tags.  Values for the summary.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fftn with 3 dimensions.
-// @end_compatibility
-func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns Scalar.  Serialized `Summary` protocol buffer.
+func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FFT3D",
+		Type: "ScalarSummary",
 		Input: []tf.Input{
-			input,
+			tags, values,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deserialize `SparseTensor` from a (serialized) string 3-vector (1-D `Tensor`)
+// Outputs a `Summary` protocol buffer with a histogram.
 //
-// object.
+// The generated
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// has one summary value containing a histogram for `values`.
+//
+// This op reports an `InvalidArgument` error if any value is not finite.
 //
 // Arguments:
-//	serialized_sparse: 1-D, The serialized `SparseTensor` object. Must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` object.
-func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+//	tag: Scalar.  Tag to use for the `Summary.Value`.
+//	values: Any shape. Values to use to build the histogram.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "DeserializeSparse",
+		Type: "HistogramSummary",
 		Input: []tf.Input{
-			serialized_sparse,
+			tag, values,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Elementwise computes the bitwise XOR of `x` and `y`.
+// Computes the number of elements in the given queue.
 //
-// The result will have those bits set, that are different in `x` and `y`. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseXor(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	handle: The handle to a queue.
+//
+// Returns The number of elements in the given queue.
+func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BitwiseXor",
+		Type: "QueueSizeV2",
 		Input: []tf.Input{
-			x, y,
+			handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a summary file writer accessible by the given resource handle.
+// ImageSummaryAttr is an optional argument to ImageSummary.
+type ImageSummaryAttr func(optionalAttr)
+
+// ImageSummaryMaxImages sets the optional max_images attribute to value.
+//
+// value: Max number of batch elements to generate images for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
+	return func(m optionalAttr) {
+		m["max_images"] = value
+	}
+}
+
+// ImageSummaryBadColor sets the optional bad_color attribute to value.
+//
+// value: Color to use for pixels with non-finite values.
+// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
+func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
+	return func(m optionalAttr) {
+		m["bad_color"] = value
+	}
+}
+
+// Outputs a `Summary` protocol buffer with images.
+//
+// The summary has up to `max_images` summary values containing images. The
+// images are built from `tensor` which must be 4-D with shape `[batch_size,
+// height, width, channels]` and where `channels` can be:
+//
+// *  1: `tensor` is interpreted as Grayscale.
+// *  3: `tensor` is interpreted as RGB.
+// *  4: `tensor` is interpreted as RGBA.
+//
+// The images have the same number of channels as the input tensor. For float
+// input, the values are normalized one image at a time to fit in the range
+// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
+// normalization algorithms:
+//
+// *  If the input values are all positive, they are rescaled so the largest one
+//    is 255.
+//
+// *  If any input value is negative, the values are shifted so input value 0.0
+//    is at 127.  They are then rescaled so that either the smallest value is 0,
+//    or the largest one is 255.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
+// *  If `max_images` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+//
+// The `bad_color` argument is the color to use in the generated images for
+// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
+// Each element must be in the range `[0, 255]` (It represents the value of a
+// pixel in the output image).  Non-finite values in the input tensor are
+// replaced by this tensor in the output image.  The default value is the color
+// red.
 //
 // Arguments:
-//	writer: A handle to the summary writer resource
-//	logdir: Directory where the event file will be written.
-//	max_queue: Size of the queue of pending events and summaries.
-//	flush_millis: How often, in milliseconds, to flush the pending events and
-// summaries to disk.
-//	filename_suffix: Every event file's name is suffixed with this suffix.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
+// `channels` is 1, 3, or 4.
 //
-// Returns the created operation.
-func CreateSummaryFileWriter(scope *Scope, writer tf.Output, logdir tf.Output, max_queue tf.Output, flush_millis tf.Output, filename_suffix tf.Output) (o *tf.Operation) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "CreateSummaryFileWriter",
+		Type: "ImageSummary",
 		Input: []tf.Input{
-			writer, logdir, max_queue, flush_millis, filename_suffix,
+			tag, tensor,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeBase64Attr is an optional argument to EncodeBase64.
-type EncodeBase64Attr func(optionalAttr)
+// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
+type AudioSummaryV2Attr func(optionalAttr)
 
-// EncodeBase64Pad sets the optional pad attribute to value.
+// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
 //
-// value: Bool whether padding is applied at the ends.
-// If not specified, defaults to false
-func EncodeBase64Pad(value bool) EncodeBase64Attr {
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
 	return func(m optionalAttr) {
-		m["pad"] = value
+		m["max_outputs"] = value
 	}
 }
 
-// Encode strings into web-safe base64 format.
+// Outputs a `Summary` protocol buffer with audio.
 //
-// Refer to the following article for more information on base64 format:
-// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
-// end so that the encoded has length multiple of 4. See Padding section of the
-// link above.
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
 //
-// Web-safe means that the encoder uses - and _ instead of + and /.
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
 //
 // Arguments:
-//	input: Strings to be encoded.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
 //
-// Returns Input strings encoded in base64.
-func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17220,9 +17298,9 @@ func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeBase64",
+		Type: "AudioSummaryV2",
 		Input: []tf.Input{
-			input,
+			tag, tensor, sample_rate,
 		},
 		Attrs: attrs,
 	}
@@ -17230,90 +17308,161 @@ func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (
 	return op.Output(0)
 }
 
-// VarHandleOpAttr is an optional argument to VarHandleOp.
-type VarHandleOpAttr func(optionalAttr)
+// AvgPoolAttr is an optional argument to AvgPool.
+type AvgPoolAttr func(optionalAttr)
 
-// VarHandleOpContainer sets the optional container attribute to value.
+// AvgPoolDataFormat sets the optional data_format attribute to value.
 //
-// value: the container this variable is placed in.
-// If not specified, defaults to ""
-func VarHandleOpContainer(value string) VarHandleOpAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolDataFormat(value string) AvgPoolAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["data_format"] = value
 	}
 }
 
-// VarHandleOpSharedName sets the optional shared_name attribute to value.
+// Performs average pooling on the input.
 //
-// value: the name by which this variable is referred to.
-// If not specified, defaults to ""
-func VarHandleOpSharedName(value string) VarHandleOpAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Creates a handle to a Variable resource.
+// Each entry in `output` is the mean of the corresponding size `ksize`
+// window in `value`.
 //
 // Arguments:
-//	dtype: the type of this variable. Must agree with the dtypes
-// of all ops using this variable.
-//	shape: The (possibly partially specified) shape of this variable.
-func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	ksize: The size of the sliding window for each dimension of `value`.
+//	strides: The stride of the sliding window for each dimension of `value`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The average pooled output tensor.
+func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "VarHandleOp",
-
+		Type: "AvgPool",
+		Input: []tf.Input{
+			value,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Output a fact about factorials.
-func Fact(scope *Scope) (fact tf.Output) {
+// Merges summaries.
+//
+// This op creates a
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// protocol buffer that contains the union of all the values in the input
+// summaries.
+//
+// When the Op is run, it reports an `InvalidArgument` error if multiple values
+// in the summaries to merge use the same tag.
+//
+// Arguments:
+//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
+// buffers.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Fact",
+		Type: "MergeSummary",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
-type StatelessRandomUniformAttr func(optionalAttr)
+// Computes the gradient of morphological 2-D dilation with respect to the filter.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 3-D with shape `[filter_height, filter_width, depth]`.
+func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "Dilation2DBackpropFilter",
+		Input: []tf.Input{
+			input, filter, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// StatelessRandomUniformDtype sets the optional dtype attribute to value.
+// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap.
+type AddSparseToTensorsMapAttr func(optionalAttr)
+
+// AddSparseToTensorsMapContainer sets the optional container attribute to value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
+// value: The container name for the `SparseTensorsMap` created by this op.
+// If not specified, defaults to ""
+func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["container"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom random values from a uniform distribution.
+// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
+// If not specified, defaults to ""
+func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Add a `SparseTensor` to a `SparseTensorsMap` return its handle.
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// A `SparseTensor` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`.
+//
+// This operator takes the given `SparseTensor` and adds it to a container
+// object (a `SparseTensorsMap`).  A unique key within this container is generated
+// in the form of an `int64`, and this is the value that is returned.
+//
+// The `SparseTensor` can then be read out as part of a minibatch by passing
+// the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddSparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
 //
-// Returns Random values with specified shape.
-func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
+// Returns 0-D.  The handle of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.
+func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17322,9 +17471,9 @@ func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optio
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomUniform",
+		Type: "AddSparseToTensorsMap",
 		Input: []tf.Input{
-			shape, seed,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
@@ -17332,253 +17481,217 @@ func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optio
 	return op.Output(0)
 }
 
-// PrintAttr is an optional argument to Print.
-type PrintAttr func(optionalAttr)
-
-// PrintMessage sets the optional message attribute to value.
+// Writes a `Summary` protocol buffer with scalar values.
 //
-// value: A string, prefix of the error message.
-// If not specified, defaults to ""
-func PrintMessage(value string) PrintAttr {
-	return func(m optionalAttr) {
-		m["message"] = value
-	}
-}
-
-// PrintFirstN sets the optional first_n attribute to value.
+// The input `tag` and `value` must have the scalars.
+//
+// Arguments:
+//	writer: A handle to a summary writer.
+//	step: The step to write the summary for.
+//	tag: Tag for the summary.
+//	value: Value for the summary.
 //
-// value: Only log `first_n` number of times. -1 disables logging.
-// If not specified, defaults to -1
-func PrintFirstN(value int64) PrintAttr {
-	return func(m optionalAttr) {
-		m["first_n"] = value
+// Returns the created operation.
+func WriteScalarSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, value tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// PrintSummarize sets the optional summarize attribute to value.
-//
-// value: Only print this many entries of each tensor.
-// If not specified, defaults to 3
-func PrintSummarize(value int64) PrintAttr {
-	return func(m optionalAttr) {
-		m["summarize"] = value
+	opspec := tf.OpSpec{
+		Type: "WriteScalarSummary",
+		Input: []tf.Input{
+			writer, step, tag, value,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// Prints a list of tensors.
+// Computes the matrix exponential of one or more square matrices:
 //
-// Passes `input` through to `output` and prints `data` when evaluating.
+// exp(A) = \sum_{n=0}^\infty A^n/n!
+//
+// The exponential is computed using a combination of the scaling and squaring
+// method and the Pade approximation. Details can be founds in:
+// Nicholas J. Higham, "The scaling and squaring method for the matrix exponential
+// revisited," SIAM J. Matrix Anal. Applic., 26:1179-1193, 2005.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the exponential for all input submatrices `[..., :, :]`.
 //
 // Arguments:
-//	input: The tensor passed to `output`
-//	data: A list of tensors to print out when op is evaluated.
+//	input: Shape is `[..., M, M]`.
 //
-// Returns = The unmodified `input` tensor
-func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(scipy)
+// Equivalent to scipy.linalg.expm
+// @end_compatibility
+func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Print",
+		Type: "MatrixExponential",
 		Input: []tf.Input{
-			input, tf.OutputList(data),
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix.
-type LoadAndRemapMatrixAttr func(optionalAttr)
+// QueueDequeueUpToV2Attr is an optional argument to QueueDequeueUpToV2.
+type QueueDequeueUpToV2Attr func(optionalAttr)
 
-// LoadAndRemapMatrixMaxRowsInMemory sets the optional max_rows_in_memory attribute to value.
+// QueueDequeueUpToV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: The maximum number of rows to load from the checkpoint at
-// once. If less than or equal to 0, the entire matrix will be loaded into
-// memory. Setting this arg trades increased disk reads for lower memory usage.
+// value: If the queue has fewer than n elements, this operation
+// will block for up to timeout_ms milliseconds.
+// Note: This option is not supported yet.
 // If not specified, defaults to -1
-func LoadAndRemapMatrixMaxRowsInMemory(value int64) LoadAndRemapMatrixAttr {
+func QueueDequeueUpToV2TimeoutMs(value int64) QueueDequeueUpToV2Attr {
 	return func(m optionalAttr) {
-		m["max_rows_in_memory"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint
-//
-// at `ckpt_path` and potentially reorders its rows and columns using the
-// specified remappings.
-//
-// Most users should use one of the wrapper initializers (such as
-// `tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
-// function directly.
-//
-// The remappings are 1-D tensors with the following properties:
-//
-// * `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
-//   matrix will be initialized from the row corresponding to index
-//   `row_remapping[i]` in the old `Tensor` from the checkpoint.
-// * `col_remapping` must have either 0 entries (indicating that no column
-//   reordering is needed) or `num_cols` entries. If specified, column `j` of the
-//   output matrix will be initialized from the column corresponding to index
-//   `col_remapping[j]` in the old `Tensor` from the checkpoint.
-// * A value of -1 in either of the remappings signifies a "missing" entry. In that
-//   case, values from the `initializing_values` tensor will be used to fill that
-//   missing row or column. If `row_remapping` has `r` missing entries and
-//   `col_remapping` has `c` missing entries, then the following condition must be
-//   true:
+// Dequeues `n` tuples of one or more tensors from the given queue.
 //
-// `(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
+// This operation is not supported by all queues.  If a queue does not support
+// DequeueUpTo, then an Unimplemented error is returned.
 //
-// The remapping tensors can be generated using the GenerateVocabRemapping op.
+// If the queue is closed and there are more than 0 but less than `n`
+// elements remaining, then instead of returning an OutOfRange error like
+// QueueDequeueMany, less than `n` elements are returned immediately.  If
+// the queue is closed and there are 0 elements left in the queue, then
+// an OutOfRange error is returned just like in QueueDequeueMany.
+// Otherwise the behavior is identical to QueueDequeueMany:
 //
-// As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
-// initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
-// the value from row i, column j of the old tensor in the checkpoint, the output
-// matrix will look like the following:
+// This operation concatenates queue-element component tensors along the
+// 0th dimension to make a single component tensor.  All of the components
+// in the dequeued tuple will have size n in the 0th dimension.
 //
-// [[w(1, 0),  w(1, 2),  0.5],
-//  [w(0, 0),  w(0, 2), -0.5],
-//  [0.25,    -0.25,      42]]
+// This operation has `k` outputs, where `k` is the number of components in
+// the tuples stored in the given queue, and output `i` is the ith
+// component of the dequeued tuple.
 //
 // Arguments:
-//	ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from
-// which the old matrix `Tensor` will be loaded.
-//	old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
-//	row_remapping: An int `Tensor` of row remappings (generally created by
-// `generate_vocab_remapping`).  Even if no row remapping is needed, this must
-// still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted
-// index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`).
-//	col_remapping: An int `Tensor` of column remappings (generally created by
-// `generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping
-// is to be done (e.g. column ordering is the same).
-//	initializing_values: A float `Tensor` containing  values to fill in for cells
-// in the output matrix that are not loaded from the checkpoint. Length must be
-// exactly the same as the number of missing / new cells.
-//	num_rows: Number of rows (length of the 1st dimension) in the output matrix.
-//	num_cols: Number of columns (length of the 2nd dimension) in the output matrix.
+//	handle: The handle to a queue.
+//	n: The number of tuples to dequeue.
+//	component_types: The type of each component in a tuple.
 //
-// Returns Output matrix containing existing values loaded from the
-// checkpoint, and with any missing values filled in from initializing_values.
-func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Output, row_remapping tf.Output, col_remapping tf.Output, initializing_values tf.Output, num_rows int64, num_cols int64, optional ...LoadAndRemapMatrixAttr) (output_matrix tf.Output) {
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueUpToV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueUpToV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_rows": num_rows, "num_cols": num_cols}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LoadAndRemapMatrix",
+		Type: "QueueDequeueUpToV2",
 		Input: []tf.Input{
-			ckpt_path, old_tensor_name, row_remapping, col_remapping, initializing_values,
+			handle, n,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueUpToV2", err)
+		return
+	}
+	return components
 }
 
-// Checks whether a resource handle-based variable has been initialized.
+// Computes the Cholesky decomposition of one or more square matrices.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices.
+//
+// The input has to be symmetric and positive definite. Only the lower-triangular
+// part of the input will be used for this operation. The upper-triangular part
+// will not be read.
+//
+// The output is a tensor of the same shape as the input
+// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
+//
+// **Note**: The gradient computation on GPU is faster for large matrices but
+// not for large batch dimensions when the submatrices are small. In this
+// case it might be faster to use the CPU.
 //
 // Arguments:
-//	resource: the input resource handle.
+//	input: Shape is `[..., M, M]`.
 //
-// Returns a scalar boolean which is true if the variable has been
-// initialized.
-func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
+// Returns Shape is `[..., M, M]`.
+func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "VarIsInitializedOp",
+		Type: "Cholesky",
 		Input: []tf.Input{
-			resource,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeAreaAttr is an optional argument to ResizeArea.
-type ResizeAreaAttr func(optionalAttr)
-
-// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// Resize `images` to `size` using area interpolation.
-//
-// Input images can be of different types but output images are always float.
+// Writes contents to the file at input filename. Creates file and recursively
 //
-// Each output pixel is computed by first transforming the pixel's footprint into
-// the input tensor and then averaging the pixels that intersect the footprint. An
-// input pixel's contribution to the average is weighted by the fraction of its
-// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
+// creates directory if not existing.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+//	filename: scalar. The name of the file to which we write the contents.
+//	contents: scalar. The content to be written to the output file.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
+// Returns the created operation.
+func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResizeArea",
+		Type: "WriteFile",
 		Input: []tf.Input{
-			images, size,
+			filename, contents,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// RealAttr is an optional argument to Real.
-type RealAttr func(optionalAttr)
+// AllAttr is an optional argument to All.
+type AllAttr func(optionalAttr)
 
-// RealTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func RealTout(value tf.DataType) RealAttr {
+// AllKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func AllKeepDims(value bool) AllAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Returns the real part of a complex number.
+// Computes the "logical and" of elements across dimensions of a tensor.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the real part of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
-//  part returned by this operation and *b* is the imaginary part.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
-// For example:
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.real(input) ==> [-2.25, 3.25]
-// ```
-func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
+// Returns The reduced tensor.
+func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17587,9 +17700,9 @@ func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Real",
+		Type: "All",
 		Input: []tf.Input{
-			input,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -17597,103 +17710,66 @@ func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output
 	return op.Output(0)
 }
 
-// 2D real-valued fast Fourier transform.
+// Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
 //
-// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 2 dimensions of `input`.
+// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead.
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices, with the same constraints as the single matrix
+// SelfAdjointEig.
 //
-// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// The result is a [..., M+1, M] matrix with [..., 0,:] containing the
+// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors.
 //
 // Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
+//	input: Shape is `[..., M, M]`.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft2
-// @end_compatibility
-func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns Shape is `[..., M+1, M]`.
+func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RFFT2D",
+		Type: "SelfAdjointEig",
 		Input: []tf.Input{
-			input, fft_length,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
-type ResourceSparseApplyAdagradAttr func(optionalAttr)
-
-// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
-//
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
+// Computes softplus gradients for a softplus operation.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+//	gradients: The backpropagated gradients to the corresponding softplus operation.
+//	features: The features passed as input to the corresponding softplus operation.
 //
-// Returns the created operation.
-func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
+// Returns The gradients: `gradients / (1 + exp(-features))`.
+func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagrad",
+		Type: "SoftplusGrad",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices,
+			gradients, features,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Creates a dataset that zips together `input_datasets`.
-func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Creates a dataset that contains the unique elements of `input_dataset`.
+func UniqueDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ZipDataset",
+		Type: "UniqueDataset",
 		Input: []tf.Input{
-			tf.OutputList(input_datasets),
+			input_dataset,
 		},
 		Attrs: attrs,
 	}
@@ -17701,98 +17777,172 @@ func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.Data
 	return op.Output(0)
 }
 
-// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
-type MutableDenseHashTableV2Attr func(optionalAttr)
+// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2.
+type SelfAdjointEigV2Attr func(optionalAttr)
 
-// MutableDenseHashTableV2Container sets the optional container attribute to value.
+// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
+// value: If `True` then eigenvectors will be computed and returned in `v`.
+// Otherwise, only the eigenvalues will be computed.
+// If not specified, defaults to true
+func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["compute_v"] = value
 	}
 }
 
-// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
+// Computes the eigen decomposition of one or more square self-adjoint matrices.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
+// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`.
+//
+// ```python
+// # a is a tensor.
+// # e is a tensor of eigenvalues.
+// # v is a tensor of eigenvectors.
+// e, v = self_adjoint_eig(a)
+// e = self_adjoint_eig(a, compute_v=False)
+// ```
+//
+// Arguments:
+//	input: `Tensor` input of shape `[N, N]`.
+//
+// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`.
+func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-// If not specified, defaults to false
-func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SelfAdjointEigV2",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
+// Adjust the saturation of one or more images.
 //
-// value: The shape of each value.
-// If not specified, defaults to <>
-func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["value_shape"] = value
+// `images` is a tensor of at least 3 dimensions.  The last dimension is
+// interpretted as channels, and must be three.
+//
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A scale is then applied all the saturation
+// values, and then remapped back to RGB colorspace.
+//
+// Arguments:
+//	images: Images to adjust.  At least 3-D.
+//	scale: A float scale to add to the saturation.
+//
+// Returns The hue-adjusted image or images.
+func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AdjustSaturation",
+		Input: []tf.Input{
+			images, scale,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
+// Elementwise computes the bitwise OR of `x` and `y`.
 //
-// value: The initial number of hash table buckets. Must be a power
-// to 2.
-// If not specified, defaults to 131072
-func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["initial_num_buckets"] = value
+// The result will have those bits set, that are set in `x`, `y` or both. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "BitwiseOr",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
-//
-// value: The maximum ratio between number of entries and number of
-// buckets before growing the table. Must be between 0 and 1.
-// If not specified, defaults to 0.8
-func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
+// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
+type MatrixSolveLsAttr func(optionalAttr)
+
+// MatrixSolveLsFast sets the optional fast attribute to value.
+// If not specified, defaults to true
+func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
 	return func(m optionalAttr) {
-		m["max_load_factor"] = value
+		m["fast"] = value
 	}
 }
 
-// Creates an empty hash table that uses tensors as the backing store.
+// Solves one or more linear least-squares problems.
 //
-// It uses "open addressing" with quadratic reprobing to resolve
-// collisions.
+// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
+// type as `matrix` and shape `[..., M, K]`.
+// The output is a tensor shape `[..., N, K]` where each output matrix solves
+// each of the equations
+// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
+// in the least squares sense.
 //
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a scalar. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+// We use the following notation for (complex) matrix and right-hand sides
+// in the batch:
+//
+// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
+// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
+// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
+// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
+//
+// If `fast` is `True`, then the solution is computed by solving the normal
+// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
+// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
+// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
+// \lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
+// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
+// minimum-norm solution to the under-determined linear system, i.e.
+// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
+// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
+// when \\(A\\) is numerically full rank and has a condition number
+// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
+// sufficiently large.
+//
+// If `fast` is `False` an algorithm based on the numerically robust complete
+// orthogonal decomposition is used. This computes the minimum-norm
+// least-squares solution, even when \\(A\\) is rank deficient. This path is
+// typically 6-7 times slower than the fast path. If `fast` is `False` then
+// `l2_regularizer` is ignored.
 //
 // Arguments:
-//	empty_key: The key used to represent empty key buckets internally. Must not
-// be used in insert or lookup operations.
-//	value_dtype: Type of the table values.
+//	matrix: Shape is `[..., M, N]`.
+//	rhs: Shape is `[..., M, K]`.
+//	l2_regularizer: Scalar tensor.
 //
-// Returns Handle to a table.
-func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.linalg.lstsq
+// @end_compatibility
+//
+// Returns Shape is `[..., N, K]`.
+func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"value_dtype": value_dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MutableDenseHashTableV2",
+		Type: "MatrixSolveLs",
 		Input: []tf.Input{
-			empty_key,
+			matrix, rhs, l2_regularizer,
 		},
 		Attrs: attrs,
 	}
@@ -17800,66 +17950,109 @@ func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, value_dtype tf.D
 	return op.Output(0)
 }
 
-// LRNAttr is an optional argument to LRN.
-type LRNAttr func(optionalAttr)
+// SvdAttr is an optional argument to Svd.
+type SvdAttr func(optionalAttr)
 
-// LRNDepthRadius sets the optional depth_radius attribute to value.
+// SvdComputeUv sets the optional compute_uv attribute to value.
 //
-// value: 0-D.  Half-width of the 1-D normalization window.
-// If not specified, defaults to 5
-func LRNDepthRadius(value int64) LRNAttr {
+// value: If true, left and right singular vectors will be
+// computed and returned in `u` and `v`, respectively.
+// If false, `u` and `v` are not set and should never referenced.
+// If not specified, defaults to true
+func SvdComputeUv(value bool) SvdAttr {
 	return func(m optionalAttr) {
-		m["depth_radius"] = value
+		m["compute_uv"] = value
 	}
 }
 
-// LRNBias sets the optional bias attribute to value.
+// SvdFullMatrices sets the optional full_matrices attribute to value.
 //
-// value: An offset (usually positive to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNBias(value float32) LRNAttr {
+// value: If true, compute full-sized `u` and `v`. If false
+// (the default), compute only the leading `P` singular vectors.
+// Ignored if `compute_uv` is `False`.
+// If not specified, defaults to false
+func SvdFullMatrices(value bool) SvdAttr {
 	return func(m optionalAttr) {
-		m["bias"] = value
+		m["full_matrices"] = value
 	}
 }
 
-// LRNAlpha sets the optional alpha attribute to value.
+// Computes the singular value decompositions of one or more matrices.
 //
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNAlpha(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
+// Computes the SVD of each inner matrix in `input` such that
+// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
+//
+// ```python
+// # a is a tensor containing a batch of matrices.
+// # s is a tensor of singular values for each matrix.
+// # u is the tensor containing of left singular vectors for each matrix.
+// # v is the tensor containing of right singular vectors for each matrix.
+// s, u, v = svd(a)
+// s, _, _ = svd(a, compute_uv=False)
+// ```
+//
+// Arguments:
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//
+// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
+// Undefined if `compute_uv` is false.
+func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Svd",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// LRNBeta sets the optional beta attribute to value.
+// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
+type QueueEnqueueManyV2Attr func(optionalAttr)
+
+// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNBeta(value float32) LRNAttr {
+// value: If the queue is too full, this operation will block for up
+// to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
 	return func(m optionalAttr) {
-		m["beta"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// Local Response Normalization.
+// Enqueues zero or more tuples of one or more tensors in the given queue.
 //
-// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
-// dimension), and each vector is normalized independently.  Within a given vector,
-// each component is divided by the weighted, squared sum of inputs within
-// `depth_radius`.  In detail,
+// This operation slices each component tensor along the 0th dimension to
+// make multiple queue elements. All of the tuple components must have the
+// same size in the 0th dimension.
 //
-//     sqr_sum[a, b, c, d] =
-//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
-//     output = input / (bias + alpha * sqr_sum) ** beta
+// The components input has k elements, which correspond to the components of
+// tuples stored in the given queue.
 //
-// For details, see [Krizhevsky et al., ImageNet classification with deep
-// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+// N.B. If the queue is full, this operation will block until the given
+// elements have been enqueued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	input: 4-D.
-func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
+//	handle: The handle to a queue.
+//	components: One or more tensors from which the enqueued tensors should
+// be taken.
+//
+// Returns the created operation.
+func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17868,116 +18061,123 @@ func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LRN",
+		Type: "QueueEnqueueManyV2",
 		Input: []tf.Input{
-			input,
+			handle, tf.OutputList(components),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Inverse fast Fourier transform.
+// Computes the product along segments of a tensor.
 //
-// Computes the inverse 1-dimensional discrete Fourier transform over the
-// inner-most dimension of `input`.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Computes a tensor such that
+// \\(output_i = \prod_j data_j\\) where the product is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the product is empty for a given segment ID `i`, `output[i] = 1`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
+// </div>
 //
 // Arguments:
-//	input: A complex64 tensor.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its inverse 1D Fourier transform.
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft
-// @end_compatibility
-func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IFFT",
+		Type: "SegmentProd",
 		Input: []tf.Input{
-			input,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that batches `batch_size` elements from `input_dataset`.
+// Converts one or more images from RGB to HSV.
 //
-// Arguments:
+// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
+// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
+// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
+// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
 //
+// Arguments:
+//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
 //
-func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns `images` converted to HSV.
+func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "BatchDataset",
+		Type: "RGBToHSV",
 		Input: []tf.Input{
-			input_dataset, batch_size,
+			images,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
-type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
+// Does nothing. Only useful as a placeholder for control edges.
+//
+// Returns the created operation.
+func NoOp(scope *Scope) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "NoOp",
+	}
+	return scope.AddOperation(opspec)
+}
 
-// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
+type MergeV2CheckpointsAttr func(optionalAttr)
+
+// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
 //
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
+// value: see above.
+// If not specified, defaults to true
+func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["delete_old_dirs"] = value
 	}
 }
 
-// Update '*var' according to the centered RMSProp algorithm.
-//
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
-//
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
+// V2 format specific: merges the metadata files of sharded checkpoints.  The
 //
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+// result is one logical checkpoint, with one physical metadata file and renamed
+// data files.
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
+// If delete_old_dirs is true, attempts to delete recursively the dirname of each
+// path in the input checkpoint_prefixes.  This is useful when those paths are non
+// user-facing temporary locations.
 //
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
+// Arguments:
+//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
+//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
+// as one of the checkpoint_prefixes.
 //
 // Returns the created operation.
-func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
+func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17986,222 +18186,182 @@ func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyCenteredRMSProp",
+		Type: "MergeV2Checkpoints",
 		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			checkpoint_prefixes, destination_prefix,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Flips all bits elementwise.
+// Saves input tensors slices to disk.
 //
-// The result will have exactly those bits set, that are not set in `x`. The
-// computation is performed on the underlying representation of x.
-func Invert(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Invert",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the mean along segments of a tensor.
+// This is like `Save` except that tensors can be listed in the saved file as being
+// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
+// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
+// have as many elements as `tensor_names`.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// Elements of the `shapes_and_slices` input must either be:
 //
-// Computes a tensor such that
-// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
-// over `j` such that `segment_ids[j] == i` and `N` is the total number of
-// values summed.
+// *  The empty string, in which case the corresponding tensor is
+//    saved normally.
+// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
+//    `dimI` are the dimensions of the larger tensor and `slice-spec`
+//    specifies what part is covered by the tensor to save.
 //
-// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
+// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
+// where each `sliceI` is either:
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
-// </div>
+// *  The string `-` meaning that the slice covers all indices of this dimension
+// *  `start,length` where `start` and `length` are integers.  In that
+//    case the slice covers `length` indices starting at `start`.
 //
-// Arguments:
+// See also `Save`.
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+// Arguments:
+//	filename: Must have a single element. The name of the file to which we write the
+// tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
+// saving the tensors.
+//	data: `N` tensors to save.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns the created operation.
+func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMean",
+		Type: "SaveSlices",
 		Input: []tf.Input{
-			data, segment_ids,
+			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// CumprodAttr is an optional argument to Cumprod.
-type CumprodAttr func(optionalAttr)
-
-// CumprodExclusive sets the optional exclusive attribute to value.
-//
-// value: If `True`, perform exclusive cumprod.
-// If not specified, defaults to false
-func CumprodExclusive(value bool) CumprodAttr {
-	return func(m optionalAttr) {
-		m["exclusive"] = value
-	}
-}
+// DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation.
+type DenseToDenseSetOperationAttr func(optionalAttr)
 
-// CumprodReverse sets the optional reverse attribute to value.
-//
-// value: A `bool` (default: False).
-// If not specified, defaults to false
-func CumprodReverse(value bool) CumprodAttr {
+// DenseToDenseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperationAttr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Compute the cumulative product of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumprod, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
-// ```
-//
-// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
-// performed instead:
-//
-// ```python
-// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
-// ```
-//
-// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
-// opposite direction:
+// Applies set operation along last dimension of 2 `Tensor` inputs.
 //
-// ```python
-// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
-// ```
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
 //
-// This is more efficient than using separate `tf.reverse` ops.
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
-// The `reverse` and `exclusive` kwargs can also be combined:
+// Arguments:
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
 //
-// ```python
-// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
-// ```
 //
-// Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_operation string, optional ...DenseToDenseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"set_operation": set_operation}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Cumprod",
+		Type: "DenseToDenseSetOperation",
 		Input: []tf.Input{
-			x, axis,
+			set1, set2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
-type ResourceApplyPowerSignAttr func(optionalAttr)
-
-// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
+// Generate a sharded filename. The filename is printf formatted as
 //
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
+//    %s-%05d-of-%05d, basename, shard, num_shards.
+func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ShardedFilename",
+		Input: []tf.Input{
+			basename, shard, num_shards,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Update '*var' according to the AddSign update.
-//
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
-// variable <- variable - lr_t * update
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	logbase: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
+// Generate a glob pattern matching all sharded file names.
+func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyPowerSign",
+		Type: "ShardedFilespec",
 		Input: []tf.Input{
-			var_, m, lr, logbase, sign_decay, beta, grad,
+			basename, num_shards,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
-type DestroyResourceOpAttr func(optionalAttr)
+// TextLineReaderV2Attr is an optional argument to TextLineReaderV2.
+type TextLineReaderV2Attr func(optionalAttr)
 
-// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
+// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value.
 //
-// value: whether to ignore the error when the resource
-// doesn't exist.
-// If not specified, defaults to true
-func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
+// value: Number of lines to skip from the beginning of every file.
+// If not specified, defaults to 0
+func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr {
 	return func(m optionalAttr) {
-		m["ignore_lookup_error"] = value
+		m["skip_header_lines"] = value
+	}
+}
+
+// TextLineReaderV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TextLineReaderV2Container(value string) TextLineReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// TextLineReaderV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
 }
 
-// Deletes the resource specified by the handle.
-//
-// All subsequent operations using the resource will result in a NotFound
-// error status.
-//
-// Arguments:
-//	resource: handle to the resource to delete.
+// A Reader that outputs the lines of a file delimited by '\n'.
 //
-// Returns the created operation.
-func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
+// Returns The handle to reference the Reader.
+func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18210,221 +18370,240 @@ func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyReso
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DestroyResourceOp",
-		Input: []tf.Input{
-			resource,
-		},
+		Type: "TextLineReaderV2",
+
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process. The hash function is a keyed hash function, where attribute `key`
-// defines the key of the hash function. `key` is an array of 2 elements.
-//
-// A strong hash is important when inputs may be malicious, e.g. URLs with
-// additional components. Adversaries could try to make their inputs hash to the
-// same bucket for a denial-of-service attack or to skew the results. A strong
-// hash prevents this by making it difficult, if not infeasible, to compute inputs
-// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
-// time than `tf.string_to_hash_bucket_fast`.
-//
-// Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
-//	key: The key for the keyed hash function passed as a list of two uint64
-// elements.
+// LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix.
+type LoadAndRemapMatrixAttr func(optionalAttr)
+
+// LoadAndRemapMatrixMaxRowsInMemory sets the optional max_rows_in_memory attribute to value.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
-	opspec := tf.OpSpec{
-		Type: "StringToHashBucketStrong",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
+// value: The maximum number of rows to load from the checkpoint at
+// once. If less than or equal to 0, the entire matrix will be loaded into
+// memory. Setting this arg trades increased disk reads for lower memory usage.
+// If not specified, defaults to -1
+func LoadAndRemapMatrixMaxRowsInMemory(value int64) LoadAndRemapMatrixAttr {
+	return func(m optionalAttr) {
+		m["max_rows_in_memory"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Encode audio data using the WAV file format.
+// Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint
 //
-// This operation will generate a string suitable to be saved out to create a .wav
-// audio file. It will be encoded in the 16-bit PCM format. It takes in float
-// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
-// that range.
+// at `ckpt_path` and potentially reorders its rows and columns using the
+// specified remappings.
 //
-// `audio` is a 2-D float Tensor of shape `[length, channels]`.
-// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
+// Most users should use one of the wrapper initializers (such as
+// `tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
+// function directly.
+//
+// The remappings are 1-D tensors with the following properties:
+//
+// * `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
+//   matrix will be initialized from the row corresponding to index
+//   `row_remapping[i]` in the old `Tensor` from the checkpoint.
+// * `col_remapping` must have either 0 entries (indicating that no column
+//   reordering is needed) or `num_cols` entries. If specified, column `j` of the
+//   output matrix will be initialized from the column corresponding to index
+//   `col_remapping[j]` in the old `Tensor` from the checkpoint.
+// * A value of -1 in either of the remappings signifies a "missing" entry. In that
+//   case, values from the `initializing_values` tensor will be used to fill that
+//   missing row or column. If `row_remapping` has `r` missing entries and
+//   `col_remapping` has `c` missing entries, then the following condition must be
+//   true:
+//
+// `(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
+//
+// The remapping tensors can be generated using the GenerateVocabRemapping op.
+//
+// As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
+// initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
+// the value from row i, column j of the old tensor in the checkpoint, the output
+// matrix will look like the following:
+//
+// [[w(1, 0),  w(1, 2),  0.5],
+//  [w(0, 0),  w(0, 2), -0.5],
+//  [0.25,    -0.25,      42]]
 //
 // Arguments:
-//	audio: 2-D with shape `[length, channels]`.
-//	sample_rate: Scalar containing the sample frequency.
+//	ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from
+// which the old matrix `Tensor` will be loaded.
+//	old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
+//	row_remapping: An int `Tensor` of row remappings (generally created by
+// `generate_vocab_remapping`).  Even if no row remapping is needed, this must
+// still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted
+// index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`).
+//	col_remapping: An int `Tensor` of column remappings (generally created by
+// `generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping
+// is to be done (e.g. column ordering is the same).
+//	initializing_values: A float `Tensor` containing  values to fill in for cells
+// in the output matrix that are not loaded from the checkpoint. Length must be
+// exactly the same as the number of missing / new cells.
+//	num_rows: Number of rows (length of the 1st dimension) in the output matrix.
+//	num_cols: Number of columns (length of the 2nd dimension) in the output matrix.
 //
-// Returns 0-D. WAV-encoded file contents.
-func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
+// Returns Output matrix containing existing values loaded from the
+// checkpoint, and with any missing values filled in from initializing_values.
+func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Output, row_remapping tf.Output, col_remapping tf.Output, initializing_values tf.Output, num_rows int64, num_cols int64, optional ...LoadAndRemapMatrixAttr) (output_matrix tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_rows": num_rows, "num_cols": num_cols}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "EncodeWav",
+		Type: "LoadAndRemapMatrix",
 		Input: []tf.Input{
-			audio, sample_rate,
+			ckpt_path, old_tensor_name, row_remapping, col_remapping, initializing_values,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// The gradient operator for the SparseAdd op.
-//
-// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
-// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
-// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
-// values of A and B.
+// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
+type TFRecordReaderV2Attr func(optionalAttr)
+
+// TFRecordReaderV2Container sets the optional container attribute to value.
 //
-// Arguments:
-//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
-// the non-empty values of the sum.
-//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
-//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
-//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
-// `[nnz(sum), ndims]`.
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
-// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
-// non-empty values of B.
-func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "SparseAddGrad",
-		Input: []tf.Input{
-			backprop_val_grad, a_indices, b_indices, sum_indices,
-		},
+}
+
+// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
+// If not specified, defaults to ""
+func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["compression_type"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
 }
 
-// Adds `bias` to `value`.
-//
-// This is a deprecated version of BiasAdd and will be soon removed.
-//
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
-//
-// Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
+// A Reader that outputs the records from a TensorFlow Records file.
 //
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
+// Returns The handle to reference the Reader.
+func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BiasAddV1",
-		Input: []tf.Input{
-			value, bias,
-		},
+		Type: "TFRecordReaderV2",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
-type FixedLengthRecordReaderV2Attr func(optionalAttr)
+// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
+type QuantizeAndDequantizeV3Attr func(optionalAttr)
 
-// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
-//
-// value: Number of bytes in the header, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
+// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
 	return func(m optionalAttr) {
-		m["header_bytes"] = value
+		m["signed_input"] = value
 	}
 }
 
-// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
-//
-// value: Number of bytes in the footer, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
+// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
 	return func(m optionalAttr) {
-		m["footer_bytes"] = value
+		m["range_given"] = value
 	}
 }
 
-// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
+// Quantizes then dequantizes a tensor.
 //
-// value: Number of bytes to hop before each read. Default of 0 means using
-// record_bytes.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["hop_bytes"] = value
+// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
+// tensor, so its value can change during training.
+func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
+	opspec := tf.OpSpec{
+		Type: "QuantizeAndDequantizeV3",
+		Input: []tf.Input{
+			input, input_min, input_max, num_bits,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
+// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
+type IdentityReaderV2Attr func(optionalAttr)
+
+// IdentityReaderV2Container sets the optional container attribute to value.
 //
 // value: If non-empty, this reader is placed in the given container.
 // Otherwise, a default container is used.
 // If not specified, defaults to ""
-func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
+func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
+// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
 //
 // value: If non-empty, this reader is named in the given bucket
 // with this shared_name. Otherwise, the node name is used instead.
 // If not specified, defaults to ""
-func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
+func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
-//
-// value: The type of encoding for the file. Currently ZLIB and GZIP
-// are supported. Defaults to none.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["encoding"] = value
-	}
-}
-
-// A Reader that outputs fixed-length records from a file.
+// A Reader that outputs the queued work as both the key and value.
 //
-// Arguments:
-//	record_bytes: Number of bytes in the record.
+// To use, enqueue strings in a Queue.  ReaderRead will take the front
+// work string and output (work, work).
 //
 // Returns The handle to reference the Reader.
-func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
+func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"record_bytes": record_bytes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FixedLengthRecordReaderV2",
+		Type: "IdentityReaderV2",
 
 		Attrs: attrs,
 	}
@@ -18432,26 +18611,29 @@ func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...Fix
 	return op.Output(0)
 }
 
-// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
-type QuantizedRelu6Attr func(optionalAttr)
+// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent.
+type ResourceApplyGradientDescentAttr func(optionalAttr)
 
-// QuantizedRelu6OutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
+// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
+// Update '*var' by subtracting 'alpha' * 'delta' from it.
 //
 // Arguments:
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	delta: The change.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
-//
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// Returns the created operation.
+func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18460,157 +18642,219 @@ func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, ma
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedRelu6",
+		Type: "ResourceApplyGradientDescent",
 		Input: []tf.Input{
-			features, min_features, max_features,
+			var_, alpha, delta,
 		},
 		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
+}
+
+// Returns the next record (key, value pair) produced by a Reader.
+//
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+//	queue_handle: Handle to a Queue, with string work items.
+//
+// Returns A scalar.A scalar.
+func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderReadV2",
+		Input: []tf.Input{
+			reader_handle, queue_handle,
+		},
+	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
 
-// CumsumAttr is an optional argument to Cumsum.
-type CumsumAttr func(optionalAttr)
+// Returns up to `num_records` (key, value) pairs produced by a Reader.
+//
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
+// It may return less than `num_records` even before the last batch.
+//
+// Arguments:
+//	reader_handle: Handle to a `Reader`.
+//	queue_handle: Handle to a `Queue`, with string work items.
+//	num_records: number of records to read from `Reader`.
+//
+// Returns A 1-D tensor.A 1-D tensor.
+func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderReadUpToV2",
+		Input: []tf.Input{
+			reader_handle, queue_handle, num_records,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
 
-// CumsumExclusive sets the optional exclusive attribute to value.
+// Restore a Reader to its initial clean state.
 //
-// value: If `True`, perform exclusive cumsum.
+// Arguments:
+//	reader_handle: Handle to a Reader.
+//
+// Returns the created operation.
+func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderResetV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
+type ResourceApplyAdamAttr func(optionalAttr)
+
+// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func CumsumExclusive(value bool) CumsumAttr {
+func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
 	return func(m optionalAttr) {
-		m["exclusive"] = value
+		m["use_locking"] = value
 	}
 }
 
-// CumsumReverse sets the optional reverse attribute to value.
+// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
 //
-// value: A `bool` (default: False).
+// value: If `True`, uses the nesterov update.
 // If not specified, defaults to false
-func CumsumReverse(value bool) CumsumAttr {
+func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
+		m["use_nesterov"] = value
 	}
 }
 
-// Compute the cumulative sum of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumsum, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
-// ```
-//
-// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
-// performed instead:
+// Update '*var' according to the Adam algorithm.
 //
-// ```python
-// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
-// ```
+// lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
+// v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
+// variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
 //
-// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
-// opposite direction:
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	beta2_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
 //
-// ```python
-// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
-// ```
+// Returns the created operation.
+func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyAdam",
+		Input: []tf.Input{
+			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Store the input tensor in the state of the current session.
 //
-// This is more efficient than using separate `tf.reverse` ops.
+// Arguments:
+//	value: The tensor to be stored.
 //
-// The `reverse` and `exclusive` kwargs can also be combined:
+// Returns The handle for the tensor stored in the session state, represented
+// as a ResourceHandle object.
+func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GetSessionHandleV2",
+		Input: []tf.Input{
+			value,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the set of files matching one or more glob patterns.
 //
-// ```python
-// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
-// ```
+// Note that this routine only supports wildcard characters in the
+// basename portion of the pattern, not in the directory portion.
 //
-// Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
+// Arguments:
+//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
+//
+// Returns A vector of matching filenames.
+func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Cumsum",
+		Type: "MatchingFiles",
 		Input: []tf.Input{
-			x, axis,
+			pattern,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AsStringAttr is an optional argument to AsString.
-type AsStringAttr func(optionalAttr)
-
-// AsStringPrecision sets the optional precision attribute to value.
-//
-// value: The post-decimal precision to use for floating point numbers.
-// Only used if precision > -1.
-// If not specified, defaults to -1
-func AsStringPrecision(value int64) AsStringAttr {
-	return func(m optionalAttr) {
-		m["precision"] = value
-	}
-}
-
-// AsStringScientific sets the optional scientific attribute to value.
-//
-// value: Use scientific notation for floating point numbers.
-// If not specified, defaults to false
-func AsStringScientific(value bool) AsStringAttr {
-	return func(m optionalAttr) {
-		m["scientific"] = value
-	}
-}
+// ResizeBicubicGradAttr is an optional argument to ResizeBicubicGrad.
+type ResizeBicubicGradAttr func(optionalAttr)
 
-// AsStringShortest sets the optional shortest attribute to value.
+// ResizeBicubicGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: Use shortest representation (either scientific or standard) for
-// floating point numbers.
+// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of grads and original_image. If false, rescale by
+// orig_height / height. Treat similarly the width dimension.
 // If not specified, defaults to false
-func AsStringShortest(value bool) AsStringAttr {
-	return func(m optionalAttr) {
-		m["shortest"] = value
-	}
-}
-
-// AsStringWidth sets the optional width attribute to value.
-//
-// value: Pad pre-decimal numbers to this width.
-// Applies to both floating point and integer numbers.
-// Only used if width > -1.
-// If not specified, defaults to -1
-func AsStringWidth(value int64) AsStringAttr {
+func ResizeBicubicGradAlignCorners(value bool) ResizeBicubicGradAttr {
 	return func(m optionalAttr) {
-		m["width"] = value
+		m["align_corners"] = value
 	}
 }
 
-// AsStringFill sets the optional fill attribute to value.
+// Computes the gradient of bicubic interpolation.
 //
-// value: The value to pad if width > -1.  If empty, pads with spaces.
-// Another typical value is '0'.  String cannot be longer than 1 character.
-// If not specified, defaults to ""
-func AsStringFill(value string) AsStringAttr {
-	return func(m optionalAttr) {
-		m["fill"] = value
-	}
-}
-
-// Converts each entry in the given tensor to strings.  Supports many numeric
+// Arguments:
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
+// The image tensor that was resized.
 //
-// types and boolean.
-func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output tf.Output) {
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
+// Gradients with respect to the input image. Input image must have been
+// float or double.
+func ResizeBicubicGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBicubicGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18619,9 +18863,9 @@ func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AsString",
+		Type: "ResizeBicubicGrad",
 		Input: []tf.Input{
-			input,
+			grads, original_image,
 		},
 		Attrs: attrs,
 	}
@@ -18629,155 +18873,189 @@ func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output t
 	return op.Output(0)
 }
 
-// Assigns sparse updates to the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] = updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] = updates[i, ...]
+// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor.
+type ResizeNearestNeighborAttr func(optionalAttr)
+
+// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
 //
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Resize `images` to `size` using nearest neighbor interpolation.
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns the created operation.
-func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterUpdate",
+		Type: "ResizeNearestNeighbor",
 		Input: []tf.Input{
-			resource, indices, updates,
+			images, size,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// GenerateVocabRemappingAttr is an optional argument to GenerateVocabRemapping.
-type GenerateVocabRemappingAttr func(optionalAttr)
+// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad.
+type ResizeNearestNeighborGradAttr func(optionalAttr)
 
-// GenerateVocabRemappingOldVocabSize sets the optional old_vocab_size attribute to value.
-//
-// value: Number of entries in the old vocab file to consider.  If -1,
-// use the entire old vocabulary.
-// If not specified, defaults to -1
+// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
 //
-// REQUIRES: value >= -1
-func GenerateVocabRemappingOldVocabSize(value int64) GenerateVocabRemappingAttr {
+// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of grads and original_image. If false, rescale by
+// orig_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
 	return func(m optionalAttr) {
-		m["old_vocab_size"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Given a path to new and old vocabulary files, returns a remapping Tensor of
-//
-// length `num_new_vocab`, where `remapping[i]` contains the row number in the old
-// vocabulary that corresponds to row `i` in the new vocabulary (starting at line
-// `new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
-// in the new vocabulary is not in the old vocabulary.  The old vocabulary is
-// constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
-// default value of -1.
-//
-// `num_vocab_offset` enables
-// use in the partitioned variable case, and should generally be set through
-// examining partitioning info.  The format of the files should be a text file,
-// with each line containing a single entity within the vocabulary.
-//
-// For example, with `new_vocab_file` a text file containing each of the following
-// elements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],
-// `num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be
-// `[0, -1, 2]`.
-//
-// The op also returns a count of how many entries in the new vocabulary
-// were present in the old vocabulary, which is used to calculate the number of
-// values to initialize in a weight matrix remapping
-//
-// This functionality can be used to remap both row vocabularies (typically,
-// features) and column vocabularies (typically, classes) from TensorFlow
-// checkpoints.  Note that the partitioning logic relies on contiguous vocabularies
-// corresponding to div-partitioned variables.  Moreover, the underlying remapping
-// uses an IndexTable (as opposed to an inexact CuckooTable), so client code should
-// use the corresponding index_table_from_file() as the FeatureColumn framework
-// does (as opposed to tf.feature_to_id(), which uses a CuckooTable).
+// Computes the gradient of nearest neighbor interpolation.
 //
 // Arguments:
-//	new_vocab_file: Path to the new vocab file.
-//	old_vocab_file: Path to the old vocab file.
-//	new_vocab_offset: How many entries into the new vocab file to start reading.
-//	num_new_vocab: Number of entries in the new vocab file to remap.
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
+// original input size.
 //
-// Returns A Tensor of length num_new_vocab where the element at index i
-// is equal to the old ID that maps to the new ID i.  This element is -1 for any
-// new ID that is not found in the old vocabulary.Number of new vocab entries found in old vocab.
-func GenerateVocabRemapping(scope *Scope, new_vocab_file tf.Output, old_vocab_file tf.Output, new_vocab_offset int64, num_new_vocab int64, optional ...GenerateVocabRemappingAttr) (remapping tf.Output, num_present tf.Output) {
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
+// with respect to the input image.
+func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"new_vocab_offset": new_vocab_offset, "num_new_vocab": num_new_vocab}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "GenerateVocabRemapping",
+		Type: "ResizeNearestNeighborGrad",
 		Input: []tf.Input{
-			new_vocab_file, old_vocab_file,
+			grads, size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Computes softsign: `features / (abs(features) + 1)`.
-func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
+// DecodeJpegAttr is an optional argument to DecodeJpeg.
+type DecodeJpegAttr func(optionalAttr)
+
+// DecodeJpegChannels sets the optional channels attribute to value.
+//
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeJpegChannels(value int64) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Softsign",
-		Input: []tf.Input{
-			features,
-		},
+}
+
+// DecodeJpegRatio sets the optional ratio attribute to value.
+//
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeJpegRatio(value int64) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
+	}
+}
+
+// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+//
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["fancy_upscaling"] = value
+	}
+}
+
+// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+//
+// value: If true try to recover an image from truncated input.
+// If not specified, defaults to false
+func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["try_recover_truncated"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ResizeBilinearAttr is an optional argument to ResizeBilinear.
-type ResizeBilinearAttr func(optionalAttr)
+// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+//
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["acceptable_fraction"] = value
+	}
+}
 
-// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
+// DecodeJpegDctMethod sets the optional dct_method attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeJpegDctMethod(value string) DecodeJpegAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["dct_method"] = value
 	}
 }
 
-// Resize `images` to `size` using bilinear interpolation.
+// Decode a JPEG-encoded image to a uint8 tensor.
+//
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+//
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
 //
-// Input images can be of different types but output images are always float.
+//
+// This op also supports decoding PNGs and non-animated GIFs since the interface is
+// the same, though it is cleaner to use `tf.image.decode_image`.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+//	contents: 0-D.  The JPEG-encoded image.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18786,9 +19064,9 @@ func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinear",
+		Type: "DecodeJpeg",
 		Input: []tf.Input{
-			images, size,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -18796,33 +19074,29 @@ func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...
 	return op.Output(0)
 }
 
-// ProdAttr is an optional argument to Prod.
-type ProdAttr func(optionalAttr)
+// ExtractJpegShapeAttr is an optional argument to ExtractJpegShape.
+type ExtractJpegShapeAttr func(optionalAttr)
 
-// ProdKeepDims sets the optional keep_dims attribute to value.
+// ExtractJpegShapeOutputType sets the optional output_type attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func ProdKeepDims(value bool) ProdAttr {
+// value: (Optional) The output type of the operation (int32 or int64).
+// Defaults to int32.
+// If not specified, defaults to DT_INT32
+func ExtractJpegShapeOutputType(value tf.DataType) ExtractJpegShapeAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["output_type"] = value
 	}
 }
 
-// Computes the product of elements across dimensions of a tensor.
+// Extract the shape information of a JPEG-encoded image.
 //
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// This op only parses the image header, so it is much faster than DecodeJpeg.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	contents: 0-D. The JPEG-encoded image.
 //
-// Returns The reduced tensor.
-func Prod(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...ProdAttr) (output tf.Output) {
+// Returns 1-D. The image shape with format [height, width, channels].
+func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegShapeAttr) (image_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18831,9 +19105,9 @@ func Prod(scope *Scope, input tf.Output, reduction_indices tf.Output, optional .
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Prod",
+		Type: "ExtractJpegShape",
 		Input: []tf.Input{
-			input, reduction_indices,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -18841,167 +19115,227 @@ func Prod(scope *Scope, input tf.Output, reduction_indices tf.Output, optional .
 	return op.Output(0)
 }
 
-// StringSplitAttr is an optional argument to StringSplit.
-type StringSplitAttr func(optionalAttr)
+// PaddingFIFOQueueV2Attr is an optional argument to PaddingFIFOQueueV2.
+type PaddingFIFOQueueV2Attr func(optionalAttr)
 
-// StringSplitSkipEmpty sets the optional skip_empty attribute to value.
+// PaddingFIFOQueueV2Shapes sets the optional shapes attribute to value.
 //
-// value: A `bool`. If `True`, skip the empty strings from the result.
-// If not specified, defaults to true
-func StringSplitSkipEmpty(value bool) StringSplitAttr {
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types.
+// Shapes of fixed rank but variable size are allowed by setting
+// any shape dimension to -1.  In this case, the inputs' shape may vary along
+// the given dimension, and DequeueMany will pad the given dimension with
+// zeros up to the maximum shape of all elements in the given batch.
+// If the length of this attr is 0, different queue elements may have
+// different ranks and shapes, but only one element may be dequeued at a time.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func PaddingFIFOQueueV2Shapes(value []tf.Shape) PaddingFIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["skip_empty"] = value
+		m["shapes"] = value
 	}
 }
 
-// Split elements of `input` based on `delimiter` into a `SparseTensor`.
+// PaddingFIFOQueueV2Capacity sets the optional capacity attribute to value.
 //
-// Let N be the size of source (typically N will be the batch size). Split each
-// element of `input` based on `delimiter` and return a `SparseTensor`
-// containing the splitted tokens. Empty tokens are ignored.
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func PaddingFIFOQueueV2Capacity(value int64) PaddingFIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// PaddingFIFOQueueV2Container sets the optional container attribute to value.
 //
-// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
-//  empty string, each element of `input` is split into individual single-byte
-//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
-//  every character of `delimiter` is a potential split point.
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func PaddingFIFOQueueV2Container(value string) PaddingFIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// PaddingFIFOQueueV2SharedName sets the optional shared_name attribute to value.
 //
-// For example:
-//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
-//   will be
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func PaddingFIFOQueueV2SharedName(value string) PaddingFIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A queue that produces elements in first-in first-out order.
 //
-//   indices = [0, 0;
-//              0, 1;
-//              1, 0;
-//              1, 1;
-//              1, 2]
-//   shape = [2, 3]
-//   values = ['hello', 'world', 'a', 'b', 'c']
+// Variable-size shapes are allowed by setting the corresponding shape dimensions
+// to 0 in the shape attr.  In this case DequeueMany will pad up to the maximum
+// size of any given element in the minibatch.  See below for details.
 //
 // Arguments:
-//	input: 1-D. Strings to split.
-//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
+//	component_types: The type of each component in a value.
 //
-// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
-// tensor, where the first value is N and the second value is the maximum number
-// of tokens in a single input entry.
-func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ...StringSplitAttr) (indices tf.Output, values tf.Output, shape tf.Output) {
+// Returns The handle to the queue.
+func PaddingFIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...PaddingFIFOQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringSplit",
-		Input: []tf.Input{
-			input, delimiter,
-		},
+		Type: "PaddingFIFOQueueV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Inverse 3D real-valued fast Fourier transform.
+// DecodePngAttr is an optional argument to DecodePng.
+type DecodePngAttr func(optionalAttr)
+
+// DecodePngChannels sets the optional channels attribute to value.
 //
-// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 3 dimensions of `input`.
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodePngChannels(value int64) DecodePngAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
+	}
+}
+
+// DecodePngDtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_UINT8
+func DecodePngDtype(value tf.DataType) DecodePngAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Decode a PNG-encoded image to a uint8 or uint16 tensor.
 //
-// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
 //
-// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// Accepted values are:
 //
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+// *   0: Use the number of channels in the PNG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+// *   4: output an RGBA image.
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 3D real Fourier transform.
+// If needed, the PNG-encoded image is transformed to match the requested number
+// of color channels.
 //
-// @compatibility(numpy)
-// Equivalent to np.irfftn with 3 dimensions.
-// @end_compatibility
-func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// This op also supports decoding JPEGs and non-animated GIFs since the interface
+// is the same, though it is cleaner to use `tf.image.decode_image`.
+//
+// Arguments:
+//	contents: 0-D.  The PNG-encoded image.
+//
+// Returns 3-D with shape `[height, width, channels]`.
+func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT3D",
+		Type: "DecodePng",
 		Input: []tf.Input{
-			input, fft_length,
+			contents,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of (x != y) element-wise.
+// Decode the first frame of a GIF-encoded image to a uint8 tensor.
 //
-// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// GIF with frame or transparency compression are not supported
+// convert animated GIF from compressed to uncompressed by:
+//
+//     convert $src.gif -coalesce $dst.gif
+//
+// This op also supports decoding JPEGs and PNGs, though it is cleaner to use
+// `tf.image.decode_image`.
+//
+// Arguments:
+//	contents: 0-D.  The GIF-encoded image.
+//
+// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB order
+func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NotEqual",
+		Type: "DecodeGif",
 		Input: []tf.Input{
-			x, y,
+			contents,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// GatherAttr is an optional argument to Gather.
-type GatherAttr func(optionalAttr)
+// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
+type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
 
-// GatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func GatherValidateIndices(value bool) GatherAttr {
+// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Gather slices from `params` according to `indices`.
+// Update '*var' according to the centered RMSProp algorithm.
 //
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
 //
-// ```python
-//     # Scalar indices
-//     output[:, ..., :] = params[indices, :, ... :]
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
 //
-//     # Vector indices
-//     output[i, :, ..., :] = params[indices[i], :, ... :]
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
 //
-//     # Higher rank indices
-//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-// ```
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
 //
-// If `indices` is a permutation and `len(indices) == params.shape[0]` then
-// this operation will permute `params` accordingly.
+// mg <- rho * mg_{t-1} + (1-rho) * grad
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
+// var <- var - mom
 //
-// `validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in
-// `indices` are always validated to be within range. If assigned to GPU,
-// out-of-bound indices result in safe but unspecified behavior, which may include
-// raising an error.
+// Arguments:
+//	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
-// </div>
-func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...GatherAttr) (output tf.Output) {
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19010,212 +19344,225 @@ func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...Gathe
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Gather",
+		Type: "ResourceApplyCenteredRMSProp",
 		Input: []tf.Input{
-			params, indices,
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Produce a string tensor that encodes the state of a Reader.
+// Returns a list of tensors with the same shapes and contents as the input
 //
-// Not all Readers support being serialized, so this can produce an
-// Unimplemented error.
+// tensors.
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
+// This op can be used to override the gradient for complicated functions. For
+// example, suppose y = f(x) and we wish to apply a custom function g for backprop
+// such that dx = g(dy). In Python,
+//
+// ```python
+// with tf.get_default_graph().gradient_override_map(
+//     {'IdentityN': 'OverrideGradientWithG'}):
+//   y, _ = identity_n([f(x), x])
+//
+// @tf.RegisterGradient('OverrideGradientWithG')
+// def ApplyG(op, dy, _):
+//   return [None, g(dy)]  # Do not backprop to f(x).
+// ```
+func IdentityN(scope *Scope, input []tf.Output) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderSerializeStateV2",
+		Type: "IdentityN",
 		Input: []tf.Input{
-			reader_handle,
+			tf.OutputList(input),
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("IdentityN", err)
+		return
+	}
+	return output
 }
 
-// Return substrings from `Tensor` of strings.
-//
-// For each string in the input `Tensor`, creates a substring starting at index
-// `pos` with a total length of `len`.
-//
-// If `len` defines a substring that would extend beyond the length of the input
-// string, then as many characters as possible are used.
-//
-// If `pos` is negative or specifies a character index larger than any of the input
-// strings, then an `InvalidArgumentError` is thrown.
-//
-// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
-// Op creation.
-//
-// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
-// broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-//
-// ---
-//
-// Examples
-//
-// Using scalar `pos` and `len`:
-//
-// ```python
-// input = [b'Hello', b'World']
-// position = 1
-// length = 3
-//
-// output = [b'ell', b'orl']
-// ```
-//
-// Using `pos` and `len` with same shape as `input`:
-//
-// ```python
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen']]
-// position = [[1, 2, 3],
-//             [1, 2, 3],
-//             [1, 2, 3]]
-// length =   [[2, 3, 4],
-//             [4, 3, 2],
-//             [5, 5, 5]]
-//
-// output = [[b'en', b'eve', b'lve'],
-//           [b'hirt', b'urt', b'te'],
-//           [b'ixtee', b'vente', b'hteen']]
-// ```
-//
-// Broadcasting `pos` and `len` onto `input`:
-//
-// ```
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen'],
-//          [b'nineteen', b'twenty', b'twentyone']]
-// position = [1, 2, 3]
-// length =   [1, 2, 3]
-//
-// output = [[b'e', b'ev', b'lve'],
-//           [b'h', b'ur', b'tee'],
-//           [b'i', b've', b'hte'],
-//           [b'i', b'en', b'nty']]
-// ```
+// Computes the gradient of the sigmoid of `x` wrt its input.
 //
-// Broadcasting `input` onto `pos` and `len`:
+// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
+// `dy` is the corresponding input gradient.
+func SigmoidGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SigmoidGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Convert one or more images from HSV to RGB.
 //
-// ```
-// input = b'thirteen'
-// position = [1, 5, 7]
-// length =   [3, 2, 1]
+// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
 //
-// output = [b'hir', b'ee', b'n']
-// ```
+// See `rgb_to_hsv` for a description of the HSV encoding.
 //
 // Arguments:
-//	input: Tensor of strings
-//	pos: Scalar defining the position of first character in each substring
-//	len: Scalar defining the number of characters to include in each substring
+//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
 //
-// Returns Tensor of substrings
-func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output) (output tf.Output) {
+// Returns `images` converted to RGB.
+func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Substr",
+		Type: "HSVToRGB",
 		Input: []tf.Input{
-			input, pos, len,
+			images,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SampleDistortedBoundingBoxV2Attr is an optional argument to SampleDistortedBoundingBoxV2.
+type SampleDistortedBoundingBoxV2Attr func(optionalAttr)
+
+// SampleDistortedBoundingBoxV2Seed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxV2Seed(value int64) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
 }
 
-// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
-type StatelessRandomNormalAttr func(optionalAttr)
-
-// StatelessRandomNormalDtype sets the optional dtype attribute to value.
+// SampleDistortedBoundingBoxV2Seed2 sets the optional seed2 attribute to value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2Attr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["seed2"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom values from a normal distribution.
-//
-// The generated values will have mean 0 and standard deviation 1.
-//
-// The outputs are a deterministic function of `shape` and `seed`.
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+// SampleDistortedBoundingBoxV2AspectRatioRange sets the optional aspect_ratio_range attribute to value.
 //
-// Returns Random values with specified shape.
-func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["aspect_ratio_range"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "StatelessRandomNormal",
-		Input: []tf.Input{
-			shape, seed,
-		},
-		Attrs: attrs,
+}
+
+// SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value.
+//
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within in this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["area_range"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
-type UniqueWithCountsAttr func(optionalAttr)
+// SampleDistortedBoundingBoxV2MaxAttempts sets the optional max_attempts attribute to value.
+//
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxV2MaxAttempts(value int64) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["max_attempts"] = value
+	}
+}
 
-// UniqueWithCountsOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func UniqueWithCountsOutIdx(value tf.DataType) UniqueWithCountsAttr {
+// SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+//
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
+// If not specified, defaults to false
+func SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxV2Attr {
 	return func(m optionalAttr) {
-		m["out_idx"] = value
+		m["use_image_if_no_bounding_boxes"] = value
 	}
 }
 
-// Finds unique elements in a 1-D tensor.
+// Generate a single randomly distorted bounding box for an image.
 //
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. Finally, it returns a third tensor `count` that
-// contains the count of each element of `y` in `x`. In other words:
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
 //
-// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
 //
-// For example:
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
 //
+// For example,
+//
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
+//
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.summary.image('images_with_box', image_with_box)
+//
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
 // ```
-// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-// y, idx, count = unique_with_counts(x)
-// y ==> [1, 2, 4, 7, 8]
-// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-// count ==> [2, 1, 3, 1, 2]
-// ```
+//
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
 //
 // Arguments:
-//	x: 1-D.
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
+//	min_object_covered: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
 //
-// Returns 1-D.1-D.1-D.
-func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAttr) (y tf.Output, idx tf.Output, count tf.Output) {
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, min_object_covered tf.Output, optional ...SampleDistortedBoundingBoxV2Attr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19224,9 +19571,9 @@ func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAtt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "UniqueWithCounts",
+		Type: "SampleDistortedBoundingBoxV2",
 		Input: []tf.Input{
-			x,
+			image_size, bounding_boxes, min_object_covered,
 		},
 		Attrs: attrs,
 	}
@@ -19234,51 +19581,89 @@ func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAtt
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// RestoreSliceAttr is an optional argument to RestoreSlice.
-type RestoreSliceAttr func(optionalAttr)
+// ExtractGlimpseAttr is an optional argument to ExtractGlimpse.
+type ExtractGlimpseAttr func(optionalAttr)
 
-// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
+// ExtractGlimpseCentered sets the optional centered attribute to value.
 //
-// value: Index of file to open first if multiple files match
-// `file_pattern`. See the documentation for `Restore`.
-// If not specified, defaults to -1
-func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
+// value: indicates if the offset coordinates are centered relative to
+// the image, in which case the (0, 0) offset is relative to the center
+// of the input images. If false, the (0,0) offset corresponds to the
+// upper left corner of the input images.
+// If not specified, defaults to true
+func ExtractGlimpseCentered(value bool) ExtractGlimpseAttr {
 	return func(m optionalAttr) {
-		m["preferred_shard"] = value
+		m["centered"] = value
 	}
 }
 
-// Restores a tensor from checkpoint files.
+// ExtractGlimpseNormalized sets the optional normalized attribute to value.
 //
-// This is like `Restore` except that restored tensor can be listed as filling
-// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
-// larger tensor and the slice that the restored tensor covers.
+// value: indicates if the offset coordinates are normalized.
+// If not specified, defaults to true
+func ExtractGlimpseNormalized(value bool) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["normalized"] = value
+	}
+}
+
+// ExtractGlimpseUniformNoise sets the optional uniform_noise attribute to value.
 //
-// The `shape_and_slice` input has the same format as the
-// elements of the `shapes_and_slices` input of the `SaveSlices` op.
+// value: indicates if the noise should be generated using a
+// uniform distribution or a Gaussian distribution.
+// If not specified, defaults to true
+func ExtractGlimpseUniformNoise(value bool) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["uniform_noise"] = value
+	}
+}
+
+// Extracts a glimpse from the input tensor.
+//
+// Returns a set of windows called glimpses extracted at location
+// `offsets` from the input tensor. If the windows only partially
+// overlaps the inputs, the non overlapping areas will be filled with
+// random noise.
+//
+// The result is a 4-D tensor of shape `[batch_size, glimpse_height,
+// glimpse_width, channels]`. The channels and batch dimensions are the
+// same as that of the input tensor. The height and width of the output
+// windows are specified in the `size` parameter.
+//
+// The argument `normalized` and `centered` controls how the windows are built:
+//
+// * If the coordinates are normalized but not centered, 0.0 and 1.0
+//   correspond to the minimum and maximum of each height and width
+//   dimension.
+// * If the coordinates are both normalized and centered, they range from
+//   -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
+//   left corner, the lower right corner is located at (1.0, 1.0) and the
+//   center is at (0, 0).
+// * If the coordinates are not normalized they are interpreted as
+//   numbers of pixels.
 //
 // Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	shape_and_slice: Scalar. The shapes and slice specifications to use when
-// restoring a tensors.
-//	dt: The type of the tensor to be restored.
+//	input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
+//	size: A 1-D tensor of 2 elements containing the size of the glimpses
+// to extract.  The glimpse height must be specified first, following
+// by the glimpse width.
+//	offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
+// the y, x locations of the center of each window.
 //
-// Returns The restored tensor.
-func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
+// Returns A tensor representing the glimpses `[batch_size,
+// glimpse_height, glimpse_width, channels]`.
+func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseAttr) (glimpse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dt": dt}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RestoreSlice",
+		Type: "ExtractGlimpse",
 		Input: []tf.Input{
-			file_pattern, tensor_name, shape_and_slice,
+			input, size, offsets,
 		},
 		Attrs: attrs,
 	}
@@ -19286,44 +19671,66 @@ func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, s
 	return op.Output(0)
 }
 
-// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
-type StatelessTruncatedNormalAttr func(optionalAttr)
+// A container for an iterator resource.
+//
+// Returns A handle to the iterator that can be passed to a "MakeIterator"
+// or "IteratorGetNext" op.
+func Iterator(scope *Scope, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "Iterator",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ShuffleDatasetAttr is an optional argument to ShuffleDataset.
+type ShuffleDatasetAttr func(optionalAttr)
 
-// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
+// ShuffleDatasetReshuffleEachIteration sets the optional reshuffle_each_iteration attribute to value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
+// value: If true, each iterator over this dataset will be given
+// a different pseudorandomly generated seed, based on a sequence seeded by the
+// `seed` and `seed2` inputs. If false, each iterator will be given the same
+// seed, and repeated iteration over this dataset will yield the exact same
+// sequence of results.
+// If not specified, defaults to true
+func ShuffleDatasetReshuffleEachIteration(value bool) ShuffleDatasetAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["reshuffle_each_iteration"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom values from a truncated normal distribution.
+// Creates a dataset that shuffles elements from `input_dataset` pseudorandomly.
 //
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
+// Arguments:
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+//	buffer_size: The number of output elements to buffer in an iterator over
+// this dataset. Compare with the `min_after_dequeue` attr when creating a
+// `RandomShuffleQueue`.
+//	seed: A scalar seed for the random number generator. If either `seed` or
+// `seed2` is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
 //
-// Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
 //
-// Returns Random values with specified shape.
-func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
+func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ShuffleDatasetAttr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessTruncatedNormal",
+		Type: "ShuffleDataset",
 		Input: []tf.Input{
-			shape, seed,
+			input_dataset, buffer_size, seed, seed2,
 		},
 		Attrs: attrs,
 	}
@@ -19331,60 +19738,69 @@ func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, opt
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
-//
-// N is the size of the segment being reduced.
+// 3D fast Fourier transform.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
+// dimensions of `input`.
 //
 // Arguments:
+//	input: A complex64 tensor.
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their 3D Fourier transform.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.fft.fftn with 3 dimensions.
+// @end_compatibility
+func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtN",
+		Type: "FFT3D",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
-type ResizeBilinearGradAttr func(optionalAttr)
+// CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes.
+type CropAndResizeGradBoxesAttr func(optionalAttr)
 
-// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
+// CropAndResizeGradBoxesMethod sets the optional method attribute to value.
 //
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeGradBoxesMethod(value string) CropAndResizeGradBoxesAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["method"] = value
 	}
 }
 
-// Computes the gradient of bilinear interpolation.
+// Computes the gradient of the crop_and_resize op wrt the input boxes tensor.
 //
 // Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-// The image tensor that was resized.
+//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+// Both `image_height` and `image_width` need to be positive.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
-// Gradients with respect to the input image. Input image must have been
-// float or double.
-func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
+// Returns A 2-D tensor of shape `[num_boxes, 4]`.
+func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxes tf.Output, box_ind tf.Output, optional ...CropAndResizeGradBoxesAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19393,9 +19809,9 @@ func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinearGrad",
+		Type: "CropAndResizeGradBoxes",
 		Input: []tf.Input{
-			grads, original_image,
+			grads, image, boxes, box_ind,
 		},
 		Attrs: attrs,
 	}
@@ -19403,231 +19819,218 @@ func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output,
 	return op.Output(0)
 }
 
-// Computes the number of elements in the given table.
+// Saves tensors in V2 checkpoint format.
+//
+// By default, saves the named tensors in full.  If the caller wishes to save
+// specific slices of full tensors, "shape_and_slices" should be non-empty strings
+// and correspondingly well-formed.
 //
 // Arguments:
-//	table_handle: Handle to the table.
+//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
+// write the tensors.
+//	tensor_names: shape {N}. The names of the tensors to be saved.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
+// Empty strings indicate that they are non-partitioned tensors.
+//	tensors: `N` tensors to save.
 //
-// Returns Scalar that contains number of elements in the table.
-func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
+// Returns the created operation.
+func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableSizeV2",
+		Type: "SaveV2",
 		Input: []tf.Input{
-			table_handle,
+			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Component-wise divides a SparseTensor by a dense Tensor.
-//
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
-//
-// Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
-//
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// StatsAggregatorHandleAttr is an optional argument to StatsAggregatorHandle.
+type StatsAggregatorHandleAttr func(optionalAttr)
+
+// StatsAggregatorHandleContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StatsAggregatorHandleContainer(value string) StatsAggregatorHandleAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseDiv",
-		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
-		},
+}
+
+// StatsAggregatorHandleSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StatsAggregatorHandleSharedName(value string) StatsAggregatorHandleAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Reads the value of a variable.
-//
-// The tensor returned by this operation is immutable.
-//
-// The value returned by this operation is guaranteed to be influenced by all the
-// writes on which this operation depends directly or indirectly, and to not be
-// influenced by any of the writes which depend directly or indirectly on this
-// operation.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	dtype: the dtype of the value.
-func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
+// Creates a statistics manager resource.
+func StatsAggregatorHandle(scope *Scope, optional ...StatsAggregatorHandleAttr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReadVariableOp",
-		Input: []tf.Input{
-			resource,
-		},
+		Type: "StatsAggregatorHandle",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Associates the given iterator with the given statistics aggregator.
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// Returns the created operation.
-func IteratorSetStatsAggregator(scope *Scope, iterator_handle tf.Output, stats_aggregator_handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IteratorSetStatsAggregator",
-		Input: []tf.Input{
-			iterator_handle, stats_aggregator_handle,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
-type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
-
-// ResourceSparseApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
 //
-// That is for rows we have grad for, we update var, accum and linear as follows:
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+//   selected_indices = tf.image.non_max_suppression_v2(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
-//
-//	lr_power: Scaling factor. Must be a scalar.
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
 //
-// Returns the created operation.
-func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrlV2",
+		Type: "NonMaxSuppressionV2",
 		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, l2_shrinkage, lr_power,
+			boxes, scores, max_output_size, iou_threshold,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Restore a reader to a previously saved state.
+// Reshapes a tensor.
 //
-// Not all Readers support being restored, so this can produce an
-// Unimplemented error.
+// Given `tensor`, this operation returns a tensor that has the same values
+// as `tensor` with shape `shape`.
+//
+// If one component of `shape` is the special value -1, the size of that dimension
+// is computed so that the total size remains constant.  In particular, a `shape`
+// of `[-1]` flattens into 1-D.  At most one component of `shape` can be -1.
+//
+// If `shape` is 1-D or higher, then the operation returns a tensor with shape
+// `shape` filled with the values of `tensor`. In this case, the number of elements
+// implied by `shape` must be the same as the number of elements in `tensor`.
+//
+// For example:
+//
+// ```
+// # tensor 't' is [1, 2, 3, 4, 5, 6, 7, 8, 9]
+// # tensor 't' has shape [9]
+// reshape(t, [3, 3]) ==> [[1, 2, 3],
+//                         [4, 5, 6],
+//                         [7, 8, 9]]
+//
+// # tensor 't' is [[[1, 1], [2, 2]],
+// #                [[3, 3], [4, 4]]]
+// # tensor 't' has shape [2, 2, 2]
+// reshape(t, [2, 4]) ==> [[1, 1, 2, 2],
+//                         [3, 3, 4, 4]]
+//
+// # tensor 't' is [[[1, 1, 1],
+// #                 [2, 2, 2]],
+// #                [[3, 3, 3],
+// #                 [4, 4, 4]],
+// #                [[5, 5, 5],
+// #                 [6, 6, 6]]]
+// # tensor 't' has shape [3, 2, 3]
+// # pass '[-1]' to flatten 't'
+// reshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]
+//
+// # -1 can also be used to infer the shape
+//
+// # -1 is inferred to be 9:
+// reshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
+//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
+// # -1 is inferred to be 2:
+// reshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
+//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
+// # -1 is inferred to be 3:
+// reshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1],
+//                               [2, 2, 2],
+//                               [3, 3, 3]],
+//                              [[4, 4, 4],
+//                               [5, 5, 5],
+//                               [6, 6, 6]]]
+//
+// # tensor 't' is [7]
+// # shape `[]` reshapes to a scalar
+// reshape(t, []) ==> 7
+// ```
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-//	state: Result of a ReaderSerializeState of a Reader with type
-// matching reader_handle.
 //
-// Returns the created operation.
-func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
+//	shape: Defines the shape of the output tensor.
+func Reshape(scope *Scope, tensor tf.Output, shape tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderRestoreStateV2",
+		Type: "Reshape",
 		Input: []tf.Input{
-			reader_handle, state,
+			tensor, shape,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the absolute value of a tensor.
-//
-// Given a tensor `x`, this operation returns a tensor containing the absolute
-// value of each element in `x`. For example, if x is an input element and y is
-// an output element, this operation computes \\(y = |x|\\).
-func Abs(scope *Scope, x tf.Output) (y tf.Output) {
+// Creates a dataset that splits a SparseTensor into elements row-wise.
+func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Abs",
+		Type: "SparseTensorSliceDataset",
 		Input: []tf.Input{
-			x,
+			indices, values, dense_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RandomPoissonAttr is an optional argument to RandomPoisson.
-type RandomPoissonAttr func(optionalAttr)
-
-// RandomPoissonSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func RandomPoissonSeed(value int64) RandomPoissonAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomPoissonSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func RandomPoissonSeed2(value int64) RandomPoissonAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Use RandomPoissonV2 instead.
-//
-// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2
-func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
+// Creates a dataset that concatenates `input_dataset` with `another_dataset`.
+func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "RandomPoisson",
+		Type: "ConcatenateDataset",
 		Input: []tf.Input{
-			shape, rate,
+			input_dataset, another_dataset,
 		},
 		Attrs: attrs,
 	}
@@ -19635,363 +20038,231 @@ func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...Ra
 	return op.Output(0)
 }
 
-// Applies softmax to a batched N-D `SparseTensor`.
-//
-// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
-// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
-//
-// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
-// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
-// zero elements do not participate*.  Specifically, the algorithm is equivalent
-// to the following:
+// Adds a value to the current value of a variable.
 //
-//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
-//       with shape `[B, C]`, along the size-C dimension;
-//   (2) Masks out the original implicitly-zero locations;
-//   (3) Renormalizes the remaining elements.
+// Any ReadVariableOp which depends directly or indirectly on this assign is
+// guaranteed to see the incremented value or a subsequent newer one.
 //
-// Hence, the `SparseTensor` result has exactly the same non-zero indices and
-// shape.
+// Outputs the incremented value, which can be used to totally order the
+// increments to this variable.
 //
 // Arguments:
-//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
-// SparseTensor, in canonical ordering.
-//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
 //
-// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
-func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
+// Returns the created operation.
+func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSoftmax",
+		Type: "AssignAddVariableOp",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape,
+			resource, value,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Records the latency of producing `input_dataset` elements in a StatsAggregator.
+func LatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "LatencyStatsDataset",
+		Input: []tf.Input{
+			input_dataset, tag,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes gradients for SparseSegmentMean.
+// Convert JSON-encoded Example records to binary protocol buffer strings.
 //
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
+// This op translates a tensor containing Example records, encoded using
+// the [standard JSON
+// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
+// into a tensor containing the same records encoded as binary protocol
+// buffers. The resulting tensor can then be fed to any of the other
+// Example-parsing ops.
 //
 // Arguments:
-//	grad: gradient propagated to the SparseSegmentMean op.
-//	indices: indices passed to the corresponding SparseSegmentMean op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
-func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+//	json_examples: Each string is a JSON object serialized according to the JSON
+// mapping of the Example proto.
+//
+// Returns Each string is a binary Example protocol buffer corresponding
+// to the respective element of `json_examples`.
+func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanGrad",
+		Type: "DecodeJSONExample",
 		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
+			json_examples,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Applies sparse addition to `input` using individual values or slices
-//
-// from `updates` according to indices `indices`.  The updates are non-aliasing:
-// `input` is only modified in-place if no other operations will use it.
-// Otherwise, a copy of `input` is made.  This operation has a gradient with
-// respect to both `input` and `updates`.
-//
-// `input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-//
-// `indices` must be integer tensor, containing indices into `input`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-//
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or `(P-K)`-dimensional slices
-// (if `K < P`) along the `K`th dimension of `input`.
-//
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
-//
-// ```
-// [d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].
-// ```
+// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
 //
-// For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
-// elements. In Python, that addition would look like this:
+// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
+// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
+// input channel is processed independently of the others with its own structuring
+// function. The `output` tensor has shape
+// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
+// tensor depend on the `padding` algorithm. We currently only support the default
+// "NHWC" `data_format`.
 //
-//     input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])
-//     indices = tf.constant([[4], [3], [1], [7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     output = tf.scatter_nd_non_aliasing_add(input, indices, updates)
-//     with tf.Session() as sess:
-//       print(sess.run(output))
+// In detail, the grayscale morphological 2-D dilation is the max-sum correlation
+// (for consistency with `conv2d`, we use unmirrored filters):
 //
-// The resulting value `output` would look like this:
+//     output[b, y, x, c] =
+//        max_{dy, dx} input[b,
+//                           strides[1] * y + rates[1] * dy,
+//                           strides[2] * x + rates[2] * dx,
+//                           c] +
+//                     filter[dy, dx, c]
 //
-//     [1, 13, 3, 14, 14, 6, 7, 20]
+// Max-pooling is a special case when the filter has size equal to the pooling
+// kernel size and contains all zeros.
 //
-// See @{tf.scatter_nd} for more details about how to make updates to slices.
+// Note on duality: The dilation of `input` by the `filter` is equal to the
+// negation of the erosion of `-input` by the reflected `filter`.
 //
 // Arguments:
-//	input: A Tensor.
-//	indices: A Tensor. Must be one of the following types: `int32`, `int64`.
-// A tensor of indices into `input`.
-//	updates: A Tensor. Must have the same type as ref. A tensor of updated values
-// to add to `input`.
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: The input stride for atrous morphological dilation. Must be:
+// `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A `Tensor` with the same shape as `input`, containing values of `input`
-// updated with `updates`.
-func ScatterNdNonAliasingAdd(scope *Scope, input tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+// Returns 4-D with shape `[batch, out_height, out_width, depth]`.
+func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "ScatterNdNonAliasingAdd",
+		Type: "Dilation2D",
 		Input: []tf.Input{
-			input, indices, updates,
+			input, filter,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedReluXAttr is an optional argument to QuantizedReluX.
-type QuantizedReluXAttr func(optionalAttr)
-
-// QuantizedReluXOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
+// Converts the given variant tensor to an iterator and stores it in the given resource.
 //
 // Arguments:
+//	resource_handle: A handle to an iterator resource.
+//	serialized: A variant tensor storing the state of the iterator contained in the
+// resource.
 //
-//
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
-//
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// Returns the created operation.
+func DeserializeIterator(scope *Scope, resource_handle tf.Output, serialized tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedReluX",
+		Type: "DeserializeIterator",
 		Input: []tf.Input{
-			features, max_value, min_features, max_features,
+			resource_handle, serialized,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// UnpackAttr is an optional argument to Unpack.
-type UnpackAttr func(optionalAttr)
+// TensorArrayConcatV2Attr is an optional argument to TensorArrayConcatV2.
+type TensorArrayConcatV2Attr func(optionalAttr)
 
-// UnpackAxis sets the optional axis attribute to value.
-//
-// value: Dimension along which to unpack.  Negative values wrap around, so the
-// valid range is `[-R, R)`.
-// If not specified, defaults to 0
-func UnpackAxis(value int64) UnpackAttr {
+// TensorArrayConcatV2ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayConcatV2ElementShapeExcept0(value tf.Shape) TensorArrayConcatV2Attr {
 	return func(m optionalAttr) {
-		m["axis"] = value
+		m["element_shape_except0"] = value
 	}
 }
 
-// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
-//
-// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
-// For example, given a tensor of shape `(A, B, C, D)`;
-//
-// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
-//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
-//   dimension unpacked along is gone, unlike `split`).
-//
-// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
-//   and each tensor in `output` will have shape `(A, C, D)`.
-// Etc.
-//
-// This is the opposite of `pack`.
-//
-// Arguments:
-//	value: 1-D or higher, with `axis` dimension size equal to `num`.
-//
-//
-// Returns The list of tensors unpacked from `value`.
-func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
+// Deprecated. Use TensorArrayConcatV3
+func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV2Attr) (value tf.Output, lengths tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num": num}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Unpack",
+		Type: "TensorArrayConcatV2",
 		Input: []tf.Input{
-			value,
+			handle, flow_in,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("Unpack", err)
-		return
-	}
-	return output
+	return op.Output(0), op.Output(1)
 }
 
-// Split a `SparseTensor` into `num_split` tensors along one dimension.
-//
-// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
-// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
-// For example, if `split_dim = 1` and `num_split = 2` and the input is
-//
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
-//
-// Graphically the output tensors are:
-//
-//     output_tensor[0] = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
-//
-//     output_tensor[1] = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
+// Creates a dataset that batches and pads `batch_size` elements from the input.
 //
 // Arguments:
-//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
-// `[0, rank(shape))`.
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
-//	num_split: The number of ways to split.
 //
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	padded_shapes: A list of int64 tensors representing the desired padded shapes
+// of the corresponding output components. These shapes may be partially
+// specified, using `-1` to indicate that a particular dimension should be
+// padded to the maximum size of all batch elements.
+//	padding_values: A list of scalars containing the padding value to use for
+// each of the outputs.
+//
+func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_split": num_split}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SparseSplit",
+		Type: "PaddedBatchDataset",
 		Input: []tf.Input{
-			split_dim, indices, values, shape,
+			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	return output_indices, output_values, output_shape
-}
-
-// ReduceJoinAttr is an optional argument to ReduceJoin.
-type ReduceJoinAttr func(optionalAttr)
-
-// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If `True`, retain reduced dimensions with length `1`.
-// If not specified, defaults to false
-func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// ReduceJoinSeparator sets the optional separator attribute to value.
-//
-// value: The separator to use when joining.
-// If not specified, defaults to ""
-func ReduceJoinSeparator(value string) ReduceJoinAttr {
-	return func(m optionalAttr) {
-		m["separator"] = value
-	}
+	return op.Output(0)
 }
 
-// Joins a string Tensor across the given dimensions.
-//
-// Computes the string join across dimensions in the given string Tensor of shape
-// `[d_0, d_1, ..., d_n-1]`.  Returns a new Tensor created by joining the input
-// strings with the given separator (default: empty string).  Negative indices are
-// counted backwards from the end, with `-1` being equivalent to `n - 1`.
-//
-// For example:
+// Creates a dataset that batches input elements into a SparseTensor.
 //
-// ```python
-// # tensor `a` is [["a", "b"], ["c", "d"]]
-// tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
-// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
-// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
-// tf.reduce_join(a, [0, 1]) ==> ["acbd"]
-// tf.reduce_join(a, [1, 0]) ==> ["abcd"]
-// tf.reduce_join(a, []) ==> ["abcd"]
-// ```
+// Arguments:
+//	input_dataset: A handle to an input dataset. Must have a single component.
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	row_shape: A vector representing the dense shape of each row in the produced
+// SparseTensor. The shape may be partially specified, using `-1` to indicate
+// that a particular dimension should use the maximum size of all batch elements.
 //
-// Arguments:
-//	inputs: The input to be joined.  All reduced indices must have non-zero size.
-//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
-// order specified.  Omitting `reduction_indices` is equivalent to passing
-// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
 //
-// Returns Has shape equal to that of the input with reduced dimensions removed or
-// set to `1` depending on `keep_dims`.
-func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
+func DenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, row_shape tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ReduceJoin",
+		Type: "DenseToSparseBatchDataset",
 		Input: []tf.Input{
-			inputs, reduction_indices,
+			input_dataset, batch_size, row_shape,
 		},
 		Attrs: attrs,
 	}
@@ -19999,52 +20270,53 @@ func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, opt
 	return op.Output(0)
 }
 
-// Computes element-wise population count (a.k.a. popcount, bitsum, bitcount).
-//
-// For each entry in `x`, calculates the number of `1` (on) bits in the binary
-// representation of that entry.
+// Deprecated. Use TensorArrayGradV3
 //
-// **NOTE**: It is more efficient to first `tf.bitcast` your tensors into
-// `int32` or `int64` and perform the bitcount on the result, than to feed in
-// 8- or 16-bit inputs and then aggregate the resulting counts.
-func PopulationCount(scope *Scope, x tf.Output) (y tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayGradV3
+func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "PopulationCount",
+		Type: "TensorArrayGradV2",
 		Input: []tf.Input{
-			x,
+			handle, flow_in,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AssertAttr is an optional argument to Assert.
-type AssertAttr func(optionalAttr)
+// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
+type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
 
-// AssertSummarize sets the optional summarize attribute to value.
+// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
 //
-// value: Print this many entries of each tensor.
-// If not specified, defaults to 3
-func AssertSummarize(value int64) AssertAttr {
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
 	return func(m optionalAttr) {
-		m["summarize"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Asserts that the given condition is true.
-//
-// If `condition` evaluates to false, print the list of tensors in `data`.
-// `summarize` determines how many entries of the tensors to print.
+// var: Should be from a Variable().
 //
 // Arguments:
-//	condition: The condition to evaluate.
-//	data: The tensors to print out when condition is false.
+//
+//	accum: Should be from a Variable().
+//	accum_update: : Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
 // Returns the created operation.
-func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
+func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20053,62 +20325,148 @@ func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...Ass
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Assert",
+		Type: "ResourceSparseApplyAdadelta",
 		Input: []tf.Input{
-			condition, tf.OutputList(data),
+			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// RandomUniformAttr is an optional argument to RandomUniform.
-type RandomUniformAttr func(optionalAttr)
-
-// RandomUniformSeed sets the optional seed attribute to value.
+// Identity op for gradient debugging.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformSeed(value int64) RandomUniformAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// This op is hidden from public in Python. It is used by TensorFlow Debugger to
+// register gradient tensors for gradient debugging.
+// This op operates on non-reference-type tensors.
+func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DebugGradientIdentity",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RandomUniformSeed2 sets the optional seed2 attribute to value.
+// Return substrings from `Tensor` of strings.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformSeed2(value int64) RandomUniformAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// For each string in the input `Tensor`, creates a substring starting at index
+// `pos` with a total length of `len`.
+//
+// If `len` defines a substring that would extend beyond the length of the input
+// string, then as many characters as possible are used.
+//
+// If `pos` is negative or specifies a character index larger than any of the input
+// strings, then an `InvalidArgumentError` is thrown.
+//
+// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
+// Op creation.
+//
+// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
+// broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+//
+// ---
+//
+// Examples
+//
+// Using scalar `pos` and `len`:
+//
+// ```python
+// input = [b'Hello', b'World']
+// position = 1
+// length = 3
+//
+// output = [b'ell', b'orl']
+// ```
+//
+// Using `pos` and `len` with same shape as `input`:
+//
+// ```python
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen']]
+// position = [[1, 2, 3],
+//             [1, 2, 3],
+//             [1, 2, 3]]
+// length =   [[2, 3, 4],
+//             [4, 3, 2],
+//             [5, 5, 5]]
+//
+// output = [[b'en', b'eve', b'lve'],
+//           [b'hirt', b'urt', b'te'],
+//           [b'ixtee', b'vente', b'hteen']]
+// ```
+//
+// Broadcasting `pos` and `len` onto `input`:
+//
+// ```
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen'],
+//          [b'nineteen', b'twenty', b'twentyone']]
+// position = [1, 2, 3]
+// length =   [1, 2, 3]
+//
+// output = [[b'e', b'ev', b'lve'],
+//           [b'h', b'ur', b'tee'],
+//           [b'i', b've', b'hte'],
+//           [b'i', b'en', b'nty']]
+// ```
+//
+// Broadcasting `input` onto `pos` and `len`:
+//
+// ```
+// input = b'thirteen'
+// position = [1, 5, 7]
+// length =   [3, 2, 1]
+//
+// output = [b'hir', b'ee', b'n']
+// ```
+//
+// Arguments:
+//	input: Tensor of strings
+//	pos: Scalar defining the position of first character in each substring
+//	len: Scalar defining the number of characters to include in each substring
+//
+// Returns Tensor of substrings
+func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Substr",
+		Input: []tf.Input{
+			input, pos, len,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs random values from a uniform distribution.
-//
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// Creates a Dataset that returns pseudorandom numbers.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	seed: A scalar seed for the random number generator. If either seed or
+// seed2 is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
 //
-// Returns A tensor of the specified shape filled with uniform random values.
-func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
+//
+func RandomDataset(scope *Scope, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "RandomUniform",
+		Type: "RandomDataset",
 		Input: []tf.Input{
-			shape,
+			seed, seed2,
 		},
 		Attrs: attrs,
 	}
@@ -20116,544 +20474,614 @@ func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ..
 	return op.Output(0)
 }
 
-// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
-type ResourceApplyFtrlAttr func(optionalAttr)
-
-// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
+// Creates a dataset that shuffles and repeats elements from `input_dataset`
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
+// pseudorandomly.
+//
+// Arguments:
+//
+//	buffer_size: The number of output elements to buffer in an iterator over
+// this dataset. Compare with the `min_after_dequeue` attr when creating a
+// `RandomShuffleQueue`.
+//	seed: A scalar seed for the random number generator. If either `seed` or
+// `seed2` is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
+//	count: A scalar representing the number of times the underlying dataset
+// should be repeated. The default is `-1`, which results in infinite repetition.
+//
+//
+func ShuffleAndRepeatDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ShuffleAndRepeatDataset",
+		Input: []tf.Input{
+			input_dataset, buffer_size, seed, seed2, count,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Update '*var' according to the Ftrl-proximal scheme.
-//
-// accum_new = accum + grad * grad
-// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// Creates a dataset that caches elements from `input_dataset`.
+//
+// A CacheDataset will iterate over the input_dataset, and store tensors. If the
+// cache already exists, the cache will be used. If the cache is inappropriate
+// (e.g. cannot be opened, contains tensors of the wrong shape / size), an error
+// will the returned when used.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 regulariation. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
 //
-// Returns the created operation.
-func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
+//	filename: A path on the filesystem where we should cache the dataset. Note: this
+// will be a directory.
+//
+//
+func CacheDataset(scope *Scope, input_dataset tf.Output, filename tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrl",
+		Type: "CacheDataset",
 		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, lr_power,
+			input_dataset, filename,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// AnyAttr is an optional argument to Any.
-type AnyAttr func(optionalAttr)
+// PlaceholderAttr is an optional argument to Placeholder.
+type PlaceholderAttr func(optionalAttr)
 
-// AnyKeepDims sets the optional keep_dims attribute to value.
+// PlaceholderShape sets the optional shape attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func AnyKeepDims(value bool) AnyAttr {
+// value: (Optional) The shape of the tensor. If the shape has 0 dimensions, the
+// shape is unconstrained.
+// If not specified, defaults to <unknown_rank:true >
+func PlaceholderShape(value tf.Shape) PlaceholderAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["shape"] = value
 	}
 }
 
-// Computes the "logical or" of elements across dimensions of a tensor.
+// A placeholder op for a value that will be fed into the computation.
 //
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// N.B. This operation will fail with an error if it is executed. It is
+// intended as a way to represent a value that will always be fed, and to
+// provide attrs that enable the fed value to be checked at runtime.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	dtype: The type of elements in the tensor.
 //
-// Returns The reduced tensor.
-func Any(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...AnyAttr) (output tf.Output) {
+// Returns A placeholder tensor that must be replaced using the feed mechanism.
+func Placeholder(scope *Scope, dtype tf.DataType, optional ...PlaceholderAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Any",
-		Input: []tf.Input{
-			input, reduction_indices,
-		},
+		Type: "Placeholder",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
+// Creates a dataset that executes a SQL query and emits rows of the result set.
 //
-// The Hurwitz zeta function is defined as:
+// Arguments:
+//	driver_name: The database type. Currently, the only supported type is 'sqlite'.
+//	data_source_name: A connection string to connect to the database.
+//	query: A SQL query to execute.
 //
 //
-// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
-func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
+func SqlDataset(scope *Scope, driver_name tf.Output, data_source_name tf.Output, query tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Zeta",
+		Type: "SqlDataset",
 		Input: []tf.Input{
-			x, q,
+			driver_name, data_source_name, query,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that skips `count` elements from the `input_dataset`.
+// Creates a dataset that emits the records from one or more binary files.
 //
 // Arguments:
-//
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be skipped.  If count is -1, skips everything.
-//
-//
-func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+//	header_bytes: A scalar representing the number of bytes to skip at the
+// beginning of a file.
+//	record_bytes: A scalar representing the number of bytes in each record.
+//	footer_bytes: A scalar representing the number of bytes to skip at the end
+// of a file.
+//	buffer_size: A scalar representing the number of bytes to buffer. Must be > 0.
+func FixedLengthRecordDataset(scope *Scope, filenames tf.Output, header_bytes tf.Output, record_bytes tf.Output, footer_bytes tf.Output, buffer_size tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SkipDataset",
+		Type: "FixedLengthRecordDataset",
 		Input: []tf.Input{
-			input_dataset, count,
+			filenames, header_bytes, record_bytes, footer_bytes, buffer_size,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ImagAttr is an optional argument to Imag.
-type ImagAttr func(optionalAttr)
-
-// ImagTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ImagTout(value tf.DataType) ImagAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Returns the imaginary part of a complex number.
+// Slice a `SparseTensor` based on the `start` and `size`.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the imaginary part of each element in `input`. All
-// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part returned by this operation.
+// For example, if the input is
 //
-// For example:
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.imag(input) ==> [4.75, 5.75]
-// ```
-func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
+// Graphically the output tensors are:
+//
+//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
+//
+//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
+//
+// Arguments:
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+//	start: 1-D. tensor represents the start of the slice.
+//	size: 1-D. tensor represents the size of the slice.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
+//
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Imag",
+		Type: "SparseSlice",
 		Input: []tf.Input{
-			input,
+			indices, values, shape, start, size,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ComplexAttr is an optional argument to Complex.
-type ComplexAttr func(optionalAttr)
-
-// ComplexTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_COMPLEX64
-func ComplexTout(value tf.DataType) ComplexAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
+// Concatenates quantized tensors along one dimension.
+//
+// Arguments:
+//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [0, rank(values)).
+//	values: The `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	input_mins: The minimum scalar values for each of the input tensors.
+//	input_maxes: The maximum scalar values for each of the input tensors.
+//
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
+func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedConcat",
+		Input: []tf.Input{
+			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Converts two real numbers to a complex number.
+// Gradients for batch normalization.
 //
-// Given a tensor `real` representing the real part of a complex number, and a
-// tensor `imag` representing the imaginary part of a complex number, this
-// operation returns complex numbers elementwise of the form \\(a + bj\\), where
-// *a* represents the `real` part and *b* represents the `imag` part.
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
 //
-// The input tensors `real` and `imag` must have the same shape.
+// This op is deprecated. See `tf.nn.batch_normalization`.
 //
-// For example:
+// Arguments:
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this Tensor will be multiplied
+// with the normalized Tensor.
+//	backprop: 4D backprop Tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
 //
-// ```
-// # tensor 'real' is [2.25, 3.25]
-// # tensor `imag` is [4.75, 5.75]
-// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
-// ```
-func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
+// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
+func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
+	opspec := tf.OpSpec{
+		Type: "BatchNormWithGlobalNormalizationGrad",
+		Input: []tf.Input{
+			t, m, v, gamma, backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
+// Creates a dataset that emits the records from one or more TFRecord files.
+//
+// Arguments:
+//	filenames: A scalar or vector containing the name(s) of the file(s) to be
+// read.
+//	compression_type: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	buffer_size: A scalar representing the number of bytes to buffer. A value of
+// 0 means no buffering will be performed.
+func TFRecordDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Complex",
+		Type: "TFRecordDataset",
 		Input: []tf.Input{
-			real, imag,
+			filenames, compression_type, buffer_size,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse real-valued fast Fourier transform.
-//
-// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most dimension of `input`.
+// BatchToSpace for 4-D tensors of type T.
 //
-// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
-// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
-// `fft_length` is not provided, it is computed from the size of the inner-most
-// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
-// compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// This is a legacy version of the more general BatchToSpaceND.
 //
-// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
-// than the corresponding dimension of `input`, the dimension is cropped. If it is
-// larger, the dimension is padded with zeros.
+// Rearranges (permutes) data from batch into blocks of spatial data, followed by
+// cropping. This is the reverse transformation of SpaceToBatch. More specifically,
+// this op outputs a copy of the input tensor where values from the `batch`
+// dimension are moved in spatial blocks to the `height` and `width` dimensions,
+// followed by cropping along the `height` and `width` dimensions.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
+//	input: 4-D tensor with shape
+// `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
+//   depth]`. Note that the batch size of the input tensor must be divisible by
+// `block_size * block_size`.
+//	crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
+// how many elements to crop from the intermediate result across the spatial
+// dimensions as follows:
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length` samples of its inverse
-//   1D Fourier transform.
+//     crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft
-// @end_compatibility
-func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+//
+// Returns 4-D with shape `[batch, height, width, depth]`, where:
+//
+//       height = height_pad - crop_top - crop_bottom
+//       width = width_pad - crop_left - crop_right
+//
+// The attr `block_size` must be greater than one. It indicates the block size.
+//
+// Some examples:
+//
+// (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
+//
+// (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 3]` and value:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[1, 4, 4, 1]` and value:
+//
+// ```
+// x = [[[1],   [2],  [3],  [4]],
+//      [[5],   [6],  [7],  [8]],
+//      [[9],  [10], [11],  [12]],
+//      [[13], [14], [15],  [16]]]
+// ```
+//
+// (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
+//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[2, 2, 4, 1]` and value:
+//
+// ```
+// x = [[[[1], [3]], [[5], [7]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"block_size": block_size}
 	opspec := tf.OpSpec{
-		Type: "IRFFT",
+		Type: "BatchToSpace",
 		Input: []tf.Input{
-			input, fft_length,
+			input, crops,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Adds a value to the current value of a variable.
-//
-// Any ReadVariableOp which depends directly or indirectly on this assign is
-// guaranteed to see the incremented value or a subsequent newer one.
-//
-// Outputs the incremented value, which can be used to totally order the
-// increments to this variable.
+// Makes a new iterator from the given `dataset` and stores it in `iterator`.
 //
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
+// This operation may be executed multiple times. Each execution will reset the
+// iterator in `iterator` to the first element of `dataset`.
 //
 // Returns the created operation.
-func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+func MakeIterator(scope *Scope, dataset tf.Output, iterator tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AssignAddVariableOp",
+		Type: "MakeIterator",
 		Input: []tf.Input{
-			resource, value,
+			dataset, iterator,
 		},
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Computes inverse hyperbolic sine of x element-wise.
-func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Asinh",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Real-valued fast Fourier transform.
+// Adjust the contrast of one or more images.
 //
-// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most dimension of `input`.
+// `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
+// interpreted as `[height, width, channels]`.  The other dimensions only
+// represent a collection of images, such as `[batch, height, width, channels].`
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
-// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
-// followed by the `fft_length / 2` positive-frequency terms.
+// Contrast is adjusted independently for each channel of each image.
 //
-// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// For each channel, the Op first computes the mean of the image pixels in the
+// channel and then adjusts each component of each pixel to
+// `(x - mean) * contrast_factor + mean`.
 //
 // Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
-//   frequency components of its 1D Fourier transform.
+//	images: Images to adjust.  At least 3-D.
+//	contrast_factor: A float multiplier for adjusting contrast.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft
-// @end_compatibility
-func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns The contrast-adjusted image or images.
+func AdjustContrastv2(scope *Scope, images tf.Output, contrast_factor tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RFFT",
+		Type: "AdjustContrastv2",
 		Input: []tf.Input{
-			input, fft_length,
+			images, contrast_factor,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// OrderedMapStageAttr is an optional argument to OrderedMapStage.
-type OrderedMapStageAttr func(optionalAttr)
-
-// OrderedMapStageCapacity sets the optional capacity attribute to value.
-//
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+// Gets the next output from the given iterator.
+func IteratorGetNext(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "IteratorGetNext",
+		Input: []tf.Input{
+			iterator,
+		},
+		Attrs: attrs,
 	}
-}
-
-// OrderedMapStageContainer sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
-// If not specified, defaults to ""
-func OrderedMapStageContainer(value string) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// OrderedMapStageSharedName sets the optional shared_name attribute to value.
-//
-// value: It is necessary to match this name to the matching Unstage Op.
-// If not specified, defaults to ""
-func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("IteratorGetNext", err)
+		return
 	}
+	return components
 }
 
-// Stage (key, values) in the underlying container which behaves like a ordered
-//
-// associative container.   Elements are ordered by key.
+// Outputs the single element from the given dataset.
 //
 // Arguments:
-//	key: int64
+//	dataset: A handle to a dataset that contains a single element.
 //
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
 //
 //
-// Returns the created operation.
-func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
+// Returns The components of the single element of `input`.
+func DatasetToSingleElement(scope *Scope, dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapStage",
+		Type: "DatasetToSingleElement",
 		Input: []tf.Input{
-			key, indices, tf.OutputList(values),
+			dataset,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes the gradient for the tanh of `x` wrt its input.
-//
-// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
-// is the corresponding input gradient.
-func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	op := scope.AddOperation(opspec)
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "TanhGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("DatasetToSingleElement", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return components
 }
 
-// Outputs all keys and values in the table.
+// Converts the given `resource_handle` representing an iterator to a string.
 //
 // Arguments:
-//	table_handle: Handle to the table.
-//
-//
+//	resource_handle: A handle to an iterator resource.
 //
-// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
-func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
+// Returns A string representation of the given handle.
+func IteratorToStringHandle(scope *Scope, resource_handle tf.Output) (string_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
 	opspec := tf.OpSpec{
-		Type: "LookupTableExportV2",
+		Type: "IteratorToStringHandle",
 		Input: []tf.Input{
-			table_handle,
+			resource_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process and will never change. However, it is not suitable for cryptography.
-// This function may be used when CPU time is scarce and inputs are trusted or
-// unimportant. There is a risk of adversaries constructing inputs that all hash
-// to the same bucket. To prevent this problem, use a strong hash function with
-// `tf.string_to_hash_bucket_strong`.
-//
-// Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
+// ShapeNAttr is an optional argument to ShapeN.
+type ShapeNAttr func(optionalAttr)
+
+// ShapeNOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func ShapeNOutType(value tf.DataType) ShapeNAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Returns shape of tensors.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
+// This operation returns N 1-D integer tensors representing shape of `input[i]s`.
+func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucketFast",
+		Type: "ShapeN",
 		Input: []tf.Input{
-			input,
+			tf.OutputList(input),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("ShapeN", err)
+		return
+	}
+	return output
 }
 
-// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
-type TensorArrayGatherV3Attr func(optionalAttr)
+// IteratorFromStringHandleAttr is an optional argument to IteratorFromStringHandle.
+type IteratorFromStringHandleAttr func(optionalAttr)
 
-// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
+// IteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
 //
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
+// value: If specified, defines the type of each tuple component in an
+// element produced by the resulting iterator.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromStringHandleAttr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["output_types"] = value
 	}
 }
 
-// Gather specific elements from the TensorArray into output `value`.
+// IteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
 //
-// All elements selected by `indices` must have the same shape.
+// value: If specified, defines the shape of each tuple component in an
+// element produced by the resulting iterator.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func IteratorFromStringHandleOutputShapes(value []tf.Shape) IteratorFromStringHandleAttr {
+	return func(m optionalAttr) {
+		m["output_shapes"] = value
+	}
+}
+
+// Converts the given string representing a handle to an iterator to a resource.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	indices: The locations in the TensorArray from which to read tensor elements.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+//	string_handle: A string representation of the given handle.
 //
-// Returns All of the elements in the TensorArray, concatenated along a new
-// axis (the new dimension 0).
-func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
+// Returns A handle to an iterator resource.
+func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...IteratorFromStringHandleAttr) (resource_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGatherV3",
+		Type: "IteratorFromStringHandle",
 		Input: []tf.Input{
-			handle, indices, flow_in,
+			string_handle,
 		},
 		Attrs: attrs,
 	}
@@ -20661,340 +21089,257 @@ func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow
 	return op.Output(0)
 }
 
-// Deprecated. Disallowed in GraphDef version >= 2.
+// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
 //
-// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
-func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
+// This is the angle \( \theta \in [-\pi, \pi] \) such that
+// \[ x = r \cos(\theta) \]
+// and
+// \[ y = r \sin(\theta) \]
+// where \(r = \sqrt(x^2 + y^2) \).
+func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustContrast",
+		Type: "Atan2",
 		Input: []tf.Input{
-			images, contrast_factor, min_value, max_value,
+			y, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad.
-type MaxPoolGradGradAttr func(optionalAttr)
-
-// MaxPoolGradGradDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+// Return a tensor with the same shape and contents as the input tensor or value.
+func Identity(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Identity",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes second-order gradients of the maxpooling function.
+// Gather slices from `params` axis `axis` according to `indices`.
+//
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `params.shape[:axis] + indices.shape +
+// params.shape[axis + 1:]` where:
+//
+// ```python
+//     # Scalar indices (output is rank(params) - 1).
+//     output[a_0, ..., a_n, b_0, ..., b_n] =
+//       params[a_0, ..., a_n, indices, b_0, ..., b_n]
+//
+//     # Vector indices (output is rank(params)).
+//     output[a_0, ..., a_n, i, b_0, ..., b_n] =
+//       params[a_0, ..., a_n, indices[i], b_0, ..., b_n]
+//
+//     # Higher rank indices (output is rank(params) + rank(indices) - 1).
+//     output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =
+//       params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]
+// ```
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
+// </div>
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	params: The tensor from which to gather values. Must be at least rank
+// `axis + 1`.
+//	indices: Index tensor. Must be in range `[0, params.shape[axis])`.
+//	axis: The axis in `params` to gather `indices` from. Defaults to the first
+// dimension. Supports negative indexes.
 //
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
+// Returns Values from `params` gathered from indices given by `indices`, with
+// shape `params.shape[:axis] + indices.shape + params.shape[axis + 1:]`.
+func GatherV2(scope *Scope, params tf.Output, indices tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGrad",
+		Type: "GatherV2",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			params, indices, axis,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// 3D real-valued fast Fourier transform.
-//
-// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 3 dimensions of `input`.
-//
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
-//
-// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// Converts the given `resource_handle` representing an iterator to a variant tensor.
 //
 // Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the their 3D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
+//	resource_handle: A handle to an iterator resource.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfftn with 3 dimensions.
-// @end_compatibility
-func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns A variant tensor storing the state of the iterator contained in the
+// resource.
+func SerializeIterator(scope *Scope, resource_handle tf.Output) (serialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RFFT3D",
+		Type: "SerializeIterator",
 		Input: []tf.Input{
-			input, fft_length,
+			resource_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
-type QuantizeAndDequantizeV3Attr func(optionalAttr)
+// FIFOQueueV2Attr is an optional argument to FIFOQueueV2.
+type FIFOQueueV2Attr func(optionalAttr)
 
-// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
+// FIFOQueueV2Shapes sets the optional shapes attribute to value.
+//
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func FIFOQueueV2Shapes(value []tf.Shape) FIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["signed_input"] = value
+		m["shapes"] = value
 	}
 }
 
-// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
+// FIFOQueueV2Capacity sets the optional capacity attribute to value.
+//
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func FIFOQueueV2Capacity(value int64) FIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["range_given"] = value
+		m["capacity"] = value
 	}
 }
 
-// Quantizes then dequantizes a tensor.
+// FIFOQueueV2Container sets the optional container attribute to value.
 //
-// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
-// tensor, so its value can change during training.
-func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantizeV3",
-		Input: []tf.Input{
-			input, input_min, input_max, num_bits,
-		},
-		Attrs: attrs,
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func FIFOQueueV2Container(value string) FIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// AvgPool3DAttr is an optional argument to AvgPool3D.
-type AvgPool3DAttr func(optionalAttr)
-
-// AvgPool3DDataFormat sets the optional data_format attribute to value.
+// FIFOQueueV2SharedName sets the optional shared_name attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DDataFormat(value string) AvgPool3DAttr {
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func FIFOQueueV2SharedName(value string) FIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Performs 3D average pooling on the input.
+// A queue that produces elements in first-in first-out order.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+//	component_types: The type of each component in a value.
 //
-// Returns The average pooled output tensor.
-func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) {
+// Returns The handle to the queue.
+func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool3D",
-		Input: []tf.Input{
-			input,
-		},
+		Type: "FIFOQueueV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Produces the max pool of the input tensor for quantized types.
-//
-// Arguments:
-//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	ksize: The size of the window for each dimension of the input tensor.
-// The length must be 4 to match the number of dimensions of the input.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. The length must be 4 to match the number of dimensions of the input.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// Produces a summary of any statistics recorded by the given statistics manager.
+func StatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMaxPool",
+		Type: "StatsAggregatorSummary",
 		Input: []tf.Input{
-			input, min_input, max_input,
+			iterator,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
-type AvgPool3DGradAttr func(optionalAttr)
-
-// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
+// Compute the pairwise cross product.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of average pooling function.
+// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
+// or any shape where the innermost dimension is 3. In the latter case, each pair
+// of corresponding 3-element vectors is cross-multiplied independently.
 //
 // Arguments:
-//	orig_input_shape: The original input dimensions.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+//	a: A tensor containing 3-element vectors.
+//	b: Another tensor, of same type and shape as `a`.
 //
-// Returns The backprop for input.
-func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
+// Returns Pairwise cross product of the vectors in `a` and `b`.
+func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool3DGrad",
+		Type: "Cross",
 		Input: []tf.Input{
-			orig_input_shape, grad,
+			a, b,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Writes a `GraphDef` protocol buffer to a `SummaryWriter`.
-//
-// Arguments:
-//	writer: Handle of `SummaryWriter`.
-//	step: The step to write the summary for.
-//	tensor: A scalar string of the serialized tf.GraphDef proto.
-//
-// Returns the created operation.
-func WriteGraphSummary(scope *Scope, writer tf.Output, step tf.Output, tensor tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "WriteGraphSummary",
-		Input: []tf.Input{
-			writer, step, tensor,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
-type MaxPool3DGradGradAttr func(optionalAttr)
-
-// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
+// Performs a padding as a preprocess during a convolution.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes second-order gradients of the maxpooling function.
+// Similar to FusedResizeAndPadConv2d, this op allows for an optimized
+// implementation where the spatial padding transformation stage is fused with the
+// im2col lookup, but in this case without the bilinear filtering required for
+// resizing. Fusing the padding prevents the need to write out the intermediate
+// results as whole tensors, reducing memory pressure, and we can get some latency
+// gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
+// order is used instead.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
 //
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
+		return
 	}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3DGradGrad",
+		Type: "FusedPadConv2D",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			input, paddings, filter,
 		},
 		Attrs: attrs,
 	}
@@ -21002,61 +21347,73 @@ func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output
 	return op.Output(0)
 }
 
-// FakeQuantWithMinMaxArgsGradientAttr is an optional argument to FakeQuantWithMinMaxArgsGradient.
-type FakeQuantWithMinMaxArgsGradientAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxArgsGradientMin sets the optional min attribute to value.
-// If not specified, defaults to -6
-func FakeQuantWithMinMaxArgsGradientMin(value float32) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["min"] = value
-	}
-}
+// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
+type Conv2DBackpropInputAttr func(optionalAttr)
 
-// FakeQuantWithMinMaxArgsGradientMax sets the optional max attribute to value.
-// If not specified, defaults to 6
-func FakeQuantWithMinMaxArgsGradientMax(value float32) FakeQuantWithMinMaxArgsGradientAttr {
+// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["max"] = value
+		m["use_cudnn_on_gpu"] = value
 	}
 }
 
-// FakeQuantWithMinMaxArgsGradientNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxArgsGradientNumBits(value int64) FakeQuantWithMinMaxArgsGradientAttr {
+// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["num_bits"] = value
+		m["data_format"] = value
 	}
 }
 
-// FakeQuantWithMinMaxArgsGradientNarrowRange sets the optional narrow_range attribute to value.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxArgsGradientNarrowRange(value bool) FakeQuantWithMinMaxArgsGradientAttr {
+// Conv2DBackpropInputDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["narrow_range"] = value
+		m["dilations"] = value
 	}
 }
 
-// Compute gradients for a FakeQuantWithMinMaxArgs operation.
+// Computes the gradients of convolution with respect to the input.
 //
 // Arguments:
-//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxArgs operation.
-//	inputs: Values passed as inputs to the FakeQuantWithMinMaxArgs operation.
+//	input_sizes: An integer vector representing the shape of `input`,
+// where `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
+//	padding: The type of padding algorithm to use.
 //
-// Returns Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:
-// `gradients * (inputs >= min && inputs <= max)`.
-func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsGradientAttr) (backprops tf.Output) {
+// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
+// w.r.t. the input of the convolution.
+func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxArgsGradient",
+		Type: "Conv2DBackpropInput",
 		Input: []tf.Input{
-			gradients, inputs,
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -21064,140 +21421,405 @@ func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs t
 	return op.Output(0)
 }
 
-// Computes gradients of the maxpooling function.
+// Interleave the values from the `data` tensors into a single tensor.
 //
-// Arguments:
-//	input: The original input.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
-// output of `max_pool`.
-//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+// Builds a merged tensor such that
 //
-// Returns Gradients w.r.t. the input of `max_pool`.
-func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
+// ```python
+//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
+// ```
+//
+// For example, if each `indices[m]` is scalar or vector, we have
+//
+// ```python
+//     # Scalar indices:
+//     merged[indices[m], ...] = data[m][...]
+//
+//     # Vector indices:
+//     merged[indices[m][i], ...] = data[m][i, ...]
+// ```
+//
+// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
+// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
+// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
+// `constant`, the output shape is
+//
+//     merged.shape = [max(indices)] + constant
+//
+// Values are merged in order, so if an index appears in both `indices[m][i]` and
+// `indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the
+// merged result. If you do not need this guarantee, ParallelDynamicStitch might
+// perform better on some devices.
+//
+// For example:
+//
+// ```python
+//     indices[0] = 6
+//     indices[1] = [4, 1]
+//     indices[2] = [[5, 2], [0, 3]]
+//     data[0] = [61, 62]
+//     data[1] = [[41, 42], [11, 12]]
+//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
+//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
+//               [51, 52], [61, 62]]
+// ```
+//
+// This method can be used to merge partitions created by `dynamic_partition`
+// as illustrated on the following example:
+//
+// ```python
+//     # Apply function (increments x_i) on elements for which a certain condition
+//     # apply (x_i != -1 in this example).
+//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
+//     condition_mask=tf.not_equal(x,tf.constant(-1.))
+//     partitioned_data = tf.dynamic_partition(
+//         x, tf.cast(condition_mask, tf.int32) , 2)
+//     partitioned_data[1] = partitioned_data[1] + 1.0
+//     condition_indices = tf.dynamic_partition(
+//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
+//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
+//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
+//     # unchanged.
+// ```
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
+// </div>
+func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradWithArgmax",
+		Type: "DynamicStitch",
 		Input: []tf.Input{
-			input, grad, argmax,
+			tf.OutputList(indices), tf.OutputList(data),
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StringToNumberAttr is an optional argument to StringToNumber.
-type StringToNumberAttr func(optionalAttr)
-
-// StringToNumberOutType sets the optional out_type attribute to value.
+// Returns the truth value of (x == y) element-wise.
 //
-// value: The numeric type to interpret each string in `string_tensor` as.
-// If not specified, defaults to DT_FLOAT
-func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
+// *NOTE*: `Equal` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Equal",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// TensorArrayGatherV2Attr is an optional argument to TensorArrayGatherV2.
+type TensorArrayGatherV2Attr func(optionalAttr)
+
+// TensorArrayGatherV2ElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayGatherV2ElementShape(value tf.Shape) TensorArrayGatherV2Attr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["element_shape"] = value
 	}
 }
 
-// Converts each string in the input Tensor to the specified numeric type.
+// Deprecated. Use TensorArrayGatherV3
 //
-// (Note that int32 overflow results in an error while float overflow
-// results in a rounded value.)
+// DEPRECATED at GraphDef version 26: Use TensorArrayGatherV3
+func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV2Attr) (value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayGatherV2",
+		Input: []tf.Input{
+			handle, indices, flow_in,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Interleave the values from the `data` tensors into a single tensor.
+//
+// Builds a merged tensor such that
+//
+// ```python
+//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
+// ```
+//
+// For example, if each `indices[m]` is scalar or vector, we have
+//
+// ```python
+//     # Scalar indices:
+//     merged[indices[m], ...] = data[m][...]
+//
+//     # Vector indices:
+//     merged[indices[m][i], ...] = data[m][i, ...]
+// ```
+//
+// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
+// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
+// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
+// `constant`, the output shape is
+//
+//     merged.shape = [max(indices)] + constant
+//
+// Values may be merged in parallel, so if an index appears in both `indices[m][i]`
+// and `indices[n][j]`, the result may be invalid. This differs from the normal
+// DynamicStitch operator that defines the behavior in that case.
+//
+// For example:
+//
+// ```python
+//     indices[0] = 6
+//     indices[1] = [4, 1]
+//     indices[2] = [[5, 2], [0, 3]]
+//     data[0] = [61, 62]
+//     data[1] = [[41, 42], [11, 12]]
+//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
+//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
+//               [51, 52], [61, 62]]
+// ```
+//
+// This method can be used to merge partitions created by `dynamic_partition`
+// as illustrated on the following example:
+//
+// ```python
+//     # Apply function (increments x_i) on elements for which a certain condition
+//     # apply (x_i != -1 in this example).
+//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
+//     condition_mask=tf.not_equal(x,tf.constant(-1.))
+//     partitioned_data = tf.dynamic_partition(
+//         x, tf.cast(condition_mask, tf.int32) , 2)
+//     partitioned_data[1] = partitioned_data[1] + 1.0
+//     condition_indices = tf.dynamic_partition(
+//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
+//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
+//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
+//     # unchanged.
+// ```
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
+// </div>
+func ParallelDynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StringToNumber",
+		Type: "ParallelDynamicStitch",
 		Input: []tf.Input{
-			string_tensor,
+			tf.OutputList(indices), tf.OutputList(data),
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of NOT x element-wise.
-func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes the gradient for the inverse of `x` wrt its input.
+//
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogicalNot",
+		Type: "InvGrad",
 		Input: []tf.Input{
-			x,
+			y, dy,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LRNGradAttr is an optional argument to LRNGrad.
-type LRNGradAttr func(optionalAttr)
+// StridedSliceAttr is an optional argument to StridedSlice.
+type StridedSliceAttr func(optionalAttr)
 
-// LRNGradDepthRadius sets the optional depth_radius attribute to value.
+// StridedSliceBeginMask sets the optional begin_mask attribute to value.
 //
-// value: A depth radius.
-// If not specified, defaults to 5
-func LRNGradDepthRadius(value int64) LRNGradAttr {
+// value: a bitmask where a bit i being 1 means to ignore the begin
+// value and instead use the largest interval possible. At runtime
+// begin[i] will be replaced with `[0, n-1) if `stride[i] > 0` or
+// `[-1, n-1]` if `stride[i] < 0`
+// If not specified, defaults to 0
+func StridedSliceBeginMask(value int64) StridedSliceAttr {
 	return func(m optionalAttr) {
-		m["depth_radius"] = value
+		m["begin_mask"] = value
 	}
 }
 
-// LRNGradBias sets the optional bias attribute to value.
+// StridedSliceEndMask sets the optional end_mask attribute to value.
 //
-// value: An offset (usually > 0 to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNGradBias(value float32) LRNGradAttr {
+// value: analogous to `begin_mask`
+// If not specified, defaults to 0
+func StridedSliceEndMask(value int64) StridedSliceAttr {
 	return func(m optionalAttr) {
-		m["bias"] = value
+		m["end_mask"] = value
 	}
 }
 
-// LRNGradAlpha sets the optional alpha attribute to value.
+// StridedSliceEllipsisMask sets the optional ellipsis_mask attribute to value.
 //
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNGradAlpha(value float32) LRNGradAttr {
+// value: a bitmask where bit `i` being 1 means the `i`th
+// position is actually an ellipsis. One bit at most can be 1.
+// If `ellipsis_mask == 0`, then an implicit ellipsis mask of `1 << (m+1)`
+// is provided. This means that `foo[3:5] == foo[3:5, ...]`. An ellipsis
+// implicitly creates as many range specifications as necessary to fully
+// specify the sliced range for every dimension. For example for a 4-dimensional
+// tensor `foo` the slice `foo[2, ..., 5:8]` implies `foo[2, :, :, 5:8]`.
+// If not specified, defaults to 0
+func StridedSliceEllipsisMask(value int64) StridedSliceAttr {
 	return func(m optionalAttr) {
-		m["alpha"] = value
+		m["ellipsis_mask"] = value
 	}
 }
 
-// LRNGradBeta sets the optional beta attribute to value.
+// StridedSliceNewAxisMask sets the optional new_axis_mask attribute to value.
 //
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNGradBeta(value float32) LRNGradAttr {
+// value: a bitmask where bit `i` being 1 means the `i`th
+// specification creates a new shape 1 dimension. For example
+// `foo[:4, tf.newaxis, :2]` would produce a shape `(4, 1, 2)` tensor.
+// If not specified, defaults to 0
+func StridedSliceNewAxisMask(value int64) StridedSliceAttr {
 	return func(m optionalAttr) {
-		m["beta"] = value
+		m["new_axis_mask"] = value
 	}
 }
 
-// Gradients for Local Response Normalization.
+// StridedSliceShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+//
+// value: a bitmask where bit `i` implies that the `i`th
+// specification should shrink the dimensionality. begin and end
+// must imply a slice of size 1 in the dimension. For example in
+// python one might do `foo[:, 3, :]` which would result in
+// `shrink_axis_mask` being 2.
+// If not specified, defaults to 0
+func StridedSliceShrinkAxisMask(value int64) StridedSliceAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Return a strided slice from `input`.
+//
+// Note, most python users will want to use the Python `Tensor.__getitem__`
+// or `Variable.__getitem__` rather than this op directly.
+//
+// The goal of this op is to produce a new tensor with a subset of
+// the elements from the `n` dimensional `input` tensor. The subset is chosen using
+// a sequence of `m` sparse range specifications encoded into the arguments
+// of this function. Note, in some cases
+// `m` could be equal to `n`, but this need not be the case. Each
+// range specification entry can be one of the following:
+//
+// - An ellipsis (...). Ellipses are used to imply zero or more
+//   dimensions of full-dimension selection and are produced using
+//   `ellipsis_mask`. For example, `foo[...]` is the identity slice.
+//
+// - A new axis. This is used to insert a new shape=1 dimension and is
+//   produced using `new_axis_mask`. For example, `foo[:, ...]` where
+//   `foo` is shape `(3, 4)` produces a `(1, 3, 4)` tensor.
+//
+//
+// - A range `begin:end:stride`. This is used to specify how much to choose from
+//   a given dimension. `stride` can be any integer but 0.  `begin` is an integer
+//   which represents the index of the first value to select while `end` represents
+//   the index of the last value to select. The number of values selected in each
+//   dimension is `end - begin` if `stride > 0` and `begin - end` if `stride < 0`.
+//   `begin` and `end` can be negative where `-1` is the last element, `-2` is
+//   the second to last. `begin_mask` controls whether to replace the explicitly
+//   given `begin` with an implicit effective value of `0` if `stride > 0` and
+//   `-1` if `stride < 0`. `end_mask` is analogous but produces the number
+//   required to create the largest open interval. For example, given a shape
+//   `(3,)` tensor `foo[:]`, the effective `begin` and `end` are `0` and `3`. Do
+//   not assume this is equivalent to `foo[0:-1]` which has an effective `begin`
+//   and `end` of `0` and `2`. Another example is `foo[-2::-1]` which reverses the
+//   first dimension of a tensor while dropping the last two (in the original
+//   order elements). For example `foo = [1,2,3,4]; foo[-2::-1]` is `[4,3]`.
+//
+// - A single index. This is used to keep only elements that have a given
+//   index. For example (`foo[2, :]` on a shape `(5,6)` tensor produces a
+//   shape `(6,)` tensor. This is encoded in `begin` and `end` and
+//   `shrink_axis_mask`.
+//
+// Each conceptual range specification is encoded in the op's argument. This
+// encoding is best understand by considering a non-trivial example. In
+// particular,
+// `foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as
+//
+// ```
+// begin = [1, 2, x, x, 0, x] # x denotes don't care (usually 0)
+// end = [2, 4, x, x, -3, x]
+// strides = [1, 1, x, x, -1, 1]
+// begin_mask = 1<<4 | 1 << 5 = 48
+// end_mask = 1<<5 = 32
+// ellipsis_mask = 1<<3 = 8
+// new_axis_mask = 1<<2 4
+// shrink_axis_mask = 1<<0
+// ```
+//
+// In this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of
+// the slice becomes (2, 1, 5, 5, 2, 5).
+// Let us walk step by step through each argument specification.
+//
+// 1.  The first argument in the example slice is turned into `begin = 1` and
+// `end = begin + 1 = 2`. To disambiguate from the original spec `2:4` we
+// also set the appropriate bit in `shrink_axis_mask`.
+//
+// 2. `2:4` is contributes 2, 4, 1 to begin, end, and stride. All masks have
+// zero bits contributed.
+//
+// 3. None is a synonym for `tf.newaxis`. This means insert a dimension of size 1
+// dimension in the final shape. Dummy values are contributed to begin,
+// end and stride, while the new_axis_mask bit is set.
+//
+// 4. `...` grab the full ranges from as many dimensions as needed to
+// fully specify a slice for every dimension of the input shape.
+//
+// 5. `:-3:-1` shows the use of negative indices. A negative index `i` associated
+// with a dimension that has shape `s` is converted to a positive index
+// `s + i`. So `-1` becomes `s-1` (i.e. the last element). This conversion
+// is done internally so begin, end and strides receive x, -3, and -1.
+// The appropriate begin_mask bit is set to indicate the start range is the
+// full range (ignoring the x).
+//
+// 6. `:` indicates that the entire contents of the corresponding dimension
+// is selected. This is equivalent to `::` or `0::1`. begin, end, and strides
+// receive 0, 0, and 1, respectively. The appropriate bits in `begin_mask` and
+// `end_mask` are also set.
+//
+// *Requirements*:
+//   `0 != strides[i] for i in [0, m)`
+//   `ellipsis_mask must be a power of two (only one ellipsis)`
 //
 // Arguments:
-//	input_grads: 4-D with shape `[batch, height, width, channels]`.
-//	input_image: 4-D with shape `[batch, height, width, channels]`.
-//	output_image: 4-D with shape `[batch, height, width, channels]`.
 //
-// Returns The gradients for LRN.
-func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
+//	begin: `begin[k]` specifies the offset into the `k`th range specification.
+// The exact dimension this corresponds to will be determined by context.
+// Out-of-bounds values will be silently clamped. If the `k`th bit of
+// `begin_mask` then `begin[k]` is ignored and the full range of the
+// appropriate dimension is used instead. Negative values causes indexing
+// to start from the highest element e.g. If `foo==[1,2,3]` then `foo[-1]==3`.
+//	end: `end[i]` is like `begin` with the exception that `end_mask` is
+// used to determine full ranges.
+//	strides: `strides[i]` specifies the increment in the `i`th specification
+// after extracting a given element. Negative indices will reverse
+// the original order. Out or range values are
+// clamped to `[0,dim[i]) if slice[i]>0` or `[-1,dim[i]-1] if slice[i] < 0`
+func StridedSlice(scope *Scope, input tf.Output, begin tf.Output, end tf.Output, strides tf.Output, optional ...StridedSliceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -21206,9 +21828,9 @@ func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LRNGrad",
+		Type: "StridedSlice",
 		Input: []tf.Input{
-			input_grads, input_image, output_image,
+			input, begin, end, strides,
 		},
 		Attrs: attrs,
 	}
@@ -21216,559 +21838,487 @@ func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_
 	return op.Output(0)
 }
 
-// EncodePngAttr is an optional argument to EncodePng.
-type EncodePngAttr func(optionalAttr)
+// PriorityQueueV2Attr is an optional argument to PriorityQueueV2.
+type PriorityQueueV2Attr func(optionalAttr)
+
+// PriorityQueueV2ComponentTypes sets the optional component_types attribute to value.
+//
+// value: The type of each component in a value.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func PriorityQueueV2ComponentTypes(value []tf.DataType) PriorityQueueV2Attr {
+	return func(m optionalAttr) {
+		m["component_types"] = value
+	}
+}
+
+// PriorityQueueV2Capacity sets the optional capacity attribute to value.
+//
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func PriorityQueueV2Capacity(value int64) PriorityQueueV2Attr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// PriorityQueueV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func PriorityQueueV2Container(value string) PriorityQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
 
-// EncodePngCompression sets the optional compression attribute to value.
+// PriorityQueueV2SharedName sets the optional shared_name attribute to value.
 //
-// value: Compression level.
-// If not specified, defaults to -1
-func EncodePngCompression(value int64) EncodePngAttr {
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func PriorityQueueV2SharedName(value string) PriorityQueueV2Attr {
 	return func(m optionalAttr) {
-		m["compression"] = value
+		m["shared_name"] = value
 	}
 }
 
-// PNG-encode an image.
-//
-// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
-// where `channels` is:
-//
-// *   1: for grayscale.
-// *   2: for grayscale + alpha.
-// *   3: for RGB.
-// *   4: for RGBA.
+// A queue that produces elements sorted by the first component value.
 //
-// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
-// default or a value from 0 to 9.  9 is the highest compression level, generating
-// the smallest output, but is slower.
+// Note that the PriorityQueue requires the first component of any element
+// to be a scalar int64, in addition to the other elements declared by
+// component_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue
+// and DequeueMany) on a PriorityQueue will all require (resp. output) one extra
+// entry in their input (resp. output) lists.
 //
 // Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
+//	shapes: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
 //
-// Returns 0-D. PNG-encoded image.
-func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
+// Returns The handle to the queue.
+func PriorityQueueV2(scope *Scope, shapes []tf.Shape, optional ...PriorityQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"shapes": shapes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodePng",
-		Input: []tf.Input{
-			image,
-		},
+		Type: "PriorityQueueV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxPoolAttr is an optional argument to MaxPool.
-type MaxPoolAttr func(optionalAttr)
+// UnstageAttr is an optional argument to Unstage.
+type UnstageAttr func(optionalAttr)
 
-// MaxPoolDataFormat sets the optional data_format attribute to value.
+// UnstageCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolDataFormat(value string) MaxPoolAttr {
+// REQUIRES: value >= 0
+func UnstageCapacity(value int64) UnstageAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["capacity"] = value
 	}
 }
 
-// Performs max pooling on the input.
+// UnstageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+// REQUIRES: value >= 0
+func UnstageMemoryLimit(value int64) UnstageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// UnstageContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func UnstageContainer(value string) UnstageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// UnstageSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func UnstageSharedName(value string) UnstageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op is similar to a lightweight Dequeue.
 //
-// Returns The max pooled output tensor.
-func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
+// The basic functionality is similar to dequeue with many fewer
+// capabilities and options.  This Op is optimized for performance.
+func Unstage(scope *Scope, dtypes []tf.DataType, optional ...UnstageAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool",
-		Input: []tf.Input{
-			input,
-		},
+		Type: "Unstage",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Fast Fourier transform.
-//
-// Computes the 1-dimensional discrete Fourier transform over the inner-most
-// dimension of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its 1D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.fft
-// @end_compatibility
-func FFT(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "FFT",
-		Input: []tf.Input{
-			input,
-		},
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("Unstage", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return values
 }
 
-// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
-type MaxPoolWithArgmaxAttr func(optionalAttr)
+// ArgMaxAttr is an optional argument to ArgMax.
+type ArgMaxAttr func(optionalAttr)
 
-// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
+// ArgMaxOutputType sets the optional output_type attribute to value.
 // If not specified, defaults to DT_INT64
-func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
+func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
 	return func(m optionalAttr) {
-		m["Targmax"] = value
+		m["output_type"] = value
 	}
 }
 
-// Performs max pooling on the input and outputs both max values and indices.
-//
-// The indices in `argmax` are flattened, so that a maximum value at position
-// `[b, y, x, c]` becomes flattened index
-// `((b * height + y) * width + x) * channels + c`.
+// Returns the index with the largest value across dimensions of a tensor.
 //
-// The indices returned are always in `[0, height) x [0, width)` before flattening,
-// even if padding is involved and the mathematically correct answer is outside
-// (either negative or too large).  This is a bug, but fixing it is difficult to do
-// in a safe backwards compatible way, especially due to flattening.
+// Note that in case of ties the identity of the return value is not guaranteed.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
 //
-// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
-func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolWithArgmax",
+		Type: "ArgMax",
 		Input: []tf.Input{
-			input,
+			input, dimension,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// MaxPoolGradGradV2Attr is an optional argument to MaxPoolGradGradV2.
-type MaxPoolGradGradV2Attr func(optionalAttr)
+// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
+type ResourceStridedSliceAssignAttr func(optionalAttr)
 
-// MaxPoolGradGradV2DataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradGradV2DataFormat(value string) MaxPoolGradGradV2Attr {
+// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["begin_mask"] = value
 	}
 }
 
-// Computes second-order gradients of the maxpooling function.
-//
-// Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradGradV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"padding": padding}
-	for _, a := range optional {
-		a(attrs)
+// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["end_mask"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGradV2",
-		Input: []tf.Input{
-			orig_input, orig_output, grad, ksize, strides,
-		},
-		Attrs: attrs,
+}
+
+// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes second-order gradients of the maxpooling function.
-//
-// Arguments:
-//	input: The original input.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
-// input of `max_pool`.
-//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns Gradients of gradients w.r.t. the input of `max_pool`.
-func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["new_axis_mask"] = value
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGradWithArgmax",
-		Input: []tf.Input{
-			input, grad, argmax,
-		},
-		Attrs: attrs,
+}
+
+// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Compute the polygamma function \\(\psi^{(n)}(x)\\).
-//
-// The polygamma function is defined as:
+// Assign `value` to the sliced l-value reference of `ref`.
 //
+// The values of `value` are assigned to the positions in the variable
+// `ref` that are selected by the slice parameters. The slice parameters
+// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
 //
-// \\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
+// NOTE this op currently does not support broadcasting and so `value`'s
+// shape must be exactly the shape produced by the slice of `ref`.
 //
-// where \\(\psi(x)\\) is the digamma function.
-func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// Returns the created operation.
+func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Polygamma",
+		Type: "ResourceStridedSliceAssign",
 		Input: []tf.Input{
-			a, x,
+			ref, begin, end, strides, value,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
-//
-// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
-// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
-// input channel is processed independently of the others with its own structuring
-// function. The `output` tensor has shape
-// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
-// tensor depend on the `padding` algorithm. We currently only support the default
-// "NHWC" `data_format`.
-//
-// In detail, the grayscale morphological 2-D dilation is the max-sum correlation
-// (for consistency with `conv2d`, we use unmirrored filters):
+// QueueEnqueueV2Attr is an optional argument to QueueEnqueueV2.
+type QueueEnqueueV2Attr func(optionalAttr)
+
+// QueueEnqueueV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-//     output[b, y, x, c] =
-//        max_{dy, dx} input[b,
-//                           strides[1] * y + rates[1] * dy,
-//                           strides[2] * x + rates[2] * dx,
-//                           c] +
-//                     filter[dy, dx, c]
+// value: If the queue is full, this operation will block for up to
+// timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueEnqueueV2TimeoutMs(value int64) QueueEnqueueV2Attr {
+	return func(m optionalAttr) {
+		m["timeout_ms"] = value
+	}
+}
+
+// Enqueues a tuple of one or more tensors in the given queue.
 //
-// Max-pooling is a special case when the filter has size equal to the pooling
-// kernel size and contains all zeros.
+// The components input has k elements, which correspond to the components of
+// tuples stored in the given queue.
 //
-// Note on duality: The dilation of `input` by the `filter` is equal to the
-// negation of the erosion of `-input` by the reflected `filter`.
+// N.B. If the queue is full, this operation will block until the given
+// element has been enqueued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: The input stride for atrous morphological dilation. Must be:
-// `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+//	handle: The handle to a queue.
+//	components: One or more tensors from which the enqueued tensors should be taken.
 //
-// Returns 4-D with shape `[batch, out_height, out_width, depth]`.
-func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) {
+// Returns the created operation.
+func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Dilation2D",
+		Type: "QueueEnqueueV2",
 		Input: []tf.Input{
-			input, filter,
+			handle, tf.OutputList(components),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// AudioSpectrogramAttr is an optional argument to AudioSpectrogram.
-type AudioSpectrogramAttr func(optionalAttr)
+// QueueDequeueManyV2Attr is an optional argument to QueueDequeueManyV2.
+type QueueDequeueManyV2Attr func(optionalAttr)
 
-// AudioSpectrogramMagnitudeSquared sets the optional magnitude_squared attribute to value.
+// QueueDequeueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: Whether to return the squared magnitude or just the
-// magnitude. Using squared magnitude can avoid extra calculations.
-// If not specified, defaults to false
-func AudioSpectrogramMagnitudeSquared(value bool) AudioSpectrogramAttr {
+// value: If the queue has fewer than n elements, this operation
+// will block for up to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueManyV2TimeoutMs(value int64) QueueDequeueManyV2Attr {
 	return func(m optionalAttr) {
-		m["magnitude_squared"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// Produces a visualization of audio data over time.
-//
-// Spectrograms are a standard way of representing audio information as a series of
-// slices of frequency information, one slice for each window of time. By joining
-// these together into a sequence, they form a distinctive fingerprint of the sound
-// over time.
+// Dequeues `n` tuples of one or more tensors from the given queue.
 //
-// This op expects to receive audio data as an input, stored as floats in the range
-// -1 to 1, together with a window width in samples, and a stride specifying how
-// far to move the window between slices. From this it generates a three
-// dimensional output. The lowest dimension has an amplitude value for each
-// frequency during that time slice. The next dimension is time, with successive
-// frequency slices. The final dimension is for the channels in the input, so a
-// stereo audio input would have two here for example.
+// If the queue is closed and there are fewer than `n` elements, then an
+// OutOfRange error is returned.
 //
-// This means the layout when converted and saved as an image is rotated 90 degrees
-// clockwise from a typical spectrogram. Time is descending down the Y axis, and
-// the frequency decreases from left to right.
+// This operation concatenates queue-element component tensors along the
+// 0th dimension to make a single component tensor.  All of the components
+// in the dequeued tuple will have size `n` in the 0th dimension.
 //
-// Each value in the result represents the square root of the sum of the real and
-// imaginary parts of an FFT on the current window of samples. In this way, the
-// lowest dimension represents the power of each frequency in the current window,
-// and adjacent windows are concatenated in the next dimension.
+// This operation has `k` outputs, where `k` is the number of components in
+// the tuples stored in the given queue, and output `i` is the ith
+// component of the dequeued tuple.
 //
-// To get a more intuitive and visual look at what this operation does, you can run
-// tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
-// resulting spectrogram as a PNG image.
+// N.B. If the queue is empty, this operation will block until `n` elements
+// have been dequeued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	input: Float representation of audio data.
-//	window_size: How wide the input window is in samples. For the highest efficiency
-// this should be a power of two, but other values are accepted.
-//	stride: How widely apart the center of adjacent sample windows should be.
+//	handle: The handle to a queue.
+//	n: The number of tuples to dequeue.
+//	component_types: The type of each component in a tuple.
 //
-// Returns 3D representation of the audio frequencies as an image.
-func AudioSpectrogram(scope *Scope, input tf.Output, window_size int64, stride int64, optional ...AudioSpectrogramAttr) (spectrogram tf.Output) {
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueManyV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueManyV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"window_size": window_size, "stride": stride}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AudioSpectrogram",
+		Type: "QueueDequeueManyV2",
 		Input: []tf.Input{
-			input,
+			handle, n,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the gradient of morphological 2-D dilation with respect to the input.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
-//
-// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
-func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropInput",
-		Input: []tf.Input{
-			input, filter, out_backprop,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the truth value of (x == y) element-wise.
-//
-// *NOTE*: `Equal` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueManyV2", err)
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Equal",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return components
 }
 
-// Computes the gradient of morphological 2-D dilation with respect to the filter.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+// EncodeBase64Attr is an optional argument to EncodeBase64.
+type EncodeBase64Attr func(optionalAttr)
+
+// EncodeBase64Pad sets the optional pad attribute to value.
 //
-// Returns 3-D with shape `[filter_height, filter_width, depth]`.
-func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropFilter",
-		Input: []tf.Input{
-			input, filter, out_backprop,
-		},
-		Attrs: attrs,
+// value: Bool whether padding is applied at the ends.
+// If not specified, defaults to false
+func EncodeBase64Pad(value bool) EncodeBase64Attr {
+	return func(m optionalAttr) {
+		m["pad"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes rectified linear gradients for a Relu operation.
+// Encode strings into web-safe base64 format.
+//
+// Refer to the following article for more information on base64 format:
+// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
+// end so that the encoded has length multiple of 4. See Padding section of the
+// link above.
+//
+// Web-safe means that the encoder uses - and _ instead of + and /.
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding Relu operation.
-//	features: The features passed as input to the corresponding Relu operation, OR
-// the outputs of that operation (both work equivalently).
+//	input: Strings to be encoded.
 //
-// Returns `gradients * (features > 0)`.
-func ReluGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// Returns Input strings encoded in base64.
+func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "ReluGrad",
-		Input: []tf.Input{
-			gradients, features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes rectified linear 6: `min(max(features, 0), 6)`.
-func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Relu6",
+		Type: "EncodeBase64",
 		Input: []tf.Input{
-			features,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that contains `count` elements from the `input_dataset`.
-//
-// Arguments:
-//
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be taken. A value of `-1` indicates that all of `input_dataset`
-// is taken.
+// Deprecated. Use TensorArrayCloseV3
 //
+// DEPRECATED at GraphDef version 26: Use TensorArrayCloseV3
 //
-func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns the created operation.
+func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TakeDataset",
+		Type: "TensorArrayCloseV2",
 		Input: []tf.Input{
-			input_dataset, count,
+			handle,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process.
+// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
+type CropAndResizeGradImageAttr func(optionalAttr)
+
+// CropAndResizeGradImageMethod sets the optional method attribute to value.
 //
-// Note that the hash function may change from time to time.
-// This functionality will be deprecated and it's recommended to use
-// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
+	return func(m optionalAttr) {
+		m["method"] = value
+	}
+}
+
+// Computes the gradient of the crop_and_resize op wrt the input image tensor.
 //
 // Arguments:
+//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
+// containing the original image size. Both `image_height` and `image_width` need
+// to be positive.
 //
-//	num_buckets: The number of buckets.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
+// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
+	attrs := map[string]interface{}{"T": T}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucket",
+		Type: "CropAndResizeGradImage",
 		Input: []tf.Input{
-			string_tensor,
+			grads, boxes, box_ind, image_size,
 		},
 		Attrs: attrs,
 	}
@@ -21776,133 +22326,99 @@ func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64
 	return op.Output(0)
 }
 
-// Computes gradients for the exponential linear (Elu) operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Elu operation.
-//	outputs: The outputs of the corresponding Elu operation.
-//
-// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
-// `gradients` otherwise.
-func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+// Reads and outputs the entire contents of the input filename.
+func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "EluGrad",
+		Type: "ReadFile",
 		Input: []tf.Input{
-			gradients, outputs,
+			filename,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
-type FractionalAvgPoolGradAttr func(optionalAttr)
-
-// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
-//
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [41/3, 26/3] for fractional avg pooling.
-// If not specified, defaults to false
-func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
-	}
-}
-
-// Computes gradient of the FractionalAvgPool function.
-//
-// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
-// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
-// out_backprop to those indices that form the same pooling cell. Therefore, we
-// just need to know the shape of original input tensor, instead of the whole
-// tensor.
+// Concatenates tensors along one dimension.
 //
 // Arguments:
-//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_avg_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
+//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [-rank(values), rank(values)).
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
-func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.
+func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FractionalAvgPoolGrad",
+		Type: "ConcatV2",
 		Input: []tf.Input{
-			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			tf.OutputList(values), axis,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
+// Forwards the value of an available tensor from `inputs` to `output`.
 //
-// if < 0, `scale * features` otherwise.
+// `Merge` waits for at least one of the tensors in `inputs` to become available.
+// It is usually combined with `Switch` to implement branching.
 //
-// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
-func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
+// `Merge` forwards the first tensor to become available to `output`, and sets
+// `value_index` to its index in `inputs`.
+//
+// Arguments:
+//	inputs: The input tensors, exactly one of which will become available.
+//
+// Returns Will be set to the available input tensor.The index of the chosen input tensor in `inputs`.
+func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Selu",
+		Type: "Merge",
 		Input: []tf.Input{
-			features,
+			tf.OutputList(inputs),
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
-type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
+// QueueCloseV2Attr is an optional argument to QueueCloseV2.
+type QueueCloseV2Attr func(optionalAttr)
 
-// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// value: If true, all pending enqueue requests that are
+// blocked on the given queue will be canceled.
 // If not specified, defaults to false
-func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
+func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["cancel_pending_enqueues"] = value
 	}
 }
 
-// var: Should be from a Variable().
+// Closes the given queue.
 //
-// Arguments:
+// This operation signals that no more elements will be enqueued in the
+// given queue. Subsequent Enqueue(Many) operations will fail.
+// Subsequent Dequeue(Many) operations will continue to succeed if
+// sufficient elements remain in the queue. Subsequent Dequeue(Many)
+// operations that would block will fail immediately.
 //
-//	accum: Should be from a Variable().
-//	accum_update: : Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+// Arguments:
+//	handle: The handle to a queue.
 //
 // Returns the created operation.
-func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
+func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -21911,172 +22427,176 @@ func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdadelta",
+		Type: "QueueCloseV2",
+		Input: []tf.Input{
+			handle,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Computes inverse hyperbolic tangent of x element-wise.
+func Atanh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Atanh",
 		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
+			x,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns which elements of x are NaN.
+// Returns true if queue is closed.
 //
-// @compatibility(numpy)
-// Equivalent to np.isnan
-// @end_compatibility
-func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
+// This operation returns true if the queue is closed and false if the queue
+// is open.
+//
+// Arguments:
+//	handle: The handle to a queue.
+func QueueIsClosedV2(scope *Scope, handle tf.Output) (is_closed tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IsNan",
+		Type: "QueueIsClosedV2",
 		Input: []tf.Input{
-			x,
+			handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Pads a tensor.
+// Returns the batched diagonal part of a batched tensor.
 //
-// This operation pads `input` according to the `paddings` and `constant_values`
-// you specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is
-// the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many padding values to add before the contents of `input` in that dimension,
-// and `paddings[D, 1]` indicates how many padding values to add after the contents
-// of `input` in that dimension. `constant_values` is a scalar tensor of the same
-// type as `input` that indicates the value to use for padding `input`.
+// This operation returns a tensor with the `diagonal` part
+// of the batched `input`. The `diagonal` part is computed as follows:
 //
-// The padded size of each dimension D of the output is:
+// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+// tensor of rank `k - 1` with dimensions `[I, J, K, ..., min(M, N)]` where:
 //
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+// `diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`.
+//
+// The input must be at least a matrix.
 //
 // For example:
 //
 // ```
-// # 't' is [[1, 1], [2, 2]]
-// # 'paddings' is [[1, 1], [2, 2]]
-// # 'constant_values' is 0
-// # rank of 't' is 2
-// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-//                       [0, 0, 1, 1, 0, 0]
-//                       [0, 0, 2, 2, 0, 0]
-//                       [0, 0, 0, 0, 0, 0]]
+// # 'input' is [[[1, 0, 0, 0]
+//                [0, 2, 0, 0]
+//                [0, 0, 3, 0]
+//                [0, 0, 0, 4]],
+//               [[5, 0, 0, 0]
+//                [0, 6, 0, 0]
+//                [0, 0, 7, 0]
+//                [0, 0, 0, 8]]]
+//
+// and input.shape = (2, 4, 4)
+//
+// tf.matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]]
+//
+// which has shape (2, 4)
 // ```
-func PadV2(scope *Scope, input tf.Output, paddings tf.Output, constant_values tf.Output) (output tf.Output) {
+//
+// Arguments:
+//	input: Rank `k` tensor where `k >= 2`.
+//
+// Returns The extracted diagonal(s) having shape
+// `diagonal.shape = input.shape[:-2] + [min(input.shape[-2:])]`.
+func MatrixDiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "PadV2",
+		Type: "MatrixDiagPart",
 		Input: []tf.Input{
-			input, paddings, constant_values,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes gradients for the scaled exponential linear (Selu) operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Selu operation.
-//	outputs: The outputs of the corresponding Selu operation.
+// Computes the absolute value of a tensor.
 //
-// Returns The gradients: `gradients * (outputs + scale * alpha)`
-// if outputs < 0, `scale * gradients` otherwise.
-func SeluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+// Given a tensor `x`, this operation returns a tensor containing the absolute
+// value of each element in `x`. For example, if x is an input element and y is
+// an output element, this operation computes \\(y = |x|\\).
+func Abs(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SeluGrad",
+		Type: "Abs",
 		Input: []tf.Input{
-			gradients, outputs,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softplus: `log(exp(features) + 1)`.
-func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
+// Flushes and closes the summary writer.
+//
+// Also removes it from the resource manager. To reopen, use another
+// CreateSummaryFileWriter op.
+//
+// Arguments:
+//	writer: A handle to the summary writer resource.
+//
+// Returns the created operation.
+func CloseSummaryWriter(scope *Scope, writer tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Softplus",
+		Type: "CloseSummaryWriter",
 		Input: []tf.Input{
-			features,
+			writer,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// BatchMatMulAttr is an optional argument to BatchMatMul.
-type BatchMatMulAttr func(optionalAttr)
-
-// BatchMatMulAdjX sets the optional adj_x attribute to value.
-//
-// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjX(value bool) BatchMatMulAttr {
-	return func(m optionalAttr) {
-		m["adj_x"] = value
-	}
-}
+// StackV2Attr is an optional argument to StackV2.
+type StackV2Attr func(optionalAttr)
 
-// BatchMatMulAdjY sets the optional adj_y attribute to value.
+// StackV2StackName sets the optional stack_name attribute to value.
 //
-// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjY(value bool) BatchMatMulAttr {
+// value: Overrides the name used for the temporary stack resource. Default
+// value is the name of the 'Stack' op (which is guaranteed unique).
+// If not specified, defaults to ""
+func StackV2StackName(value string) StackV2Attr {
 	return func(m optionalAttr) {
-		m["adj_y"] = value
+		m["stack_name"] = value
 	}
 }
 
-// Multiplies slices of two tensors in batches.
-//
-// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
-// viewed as an element of a batch), and arranges the individual results
-// in a single output tensor of the same batch size. Each of the
-// individual slices can optionally be adjointed (to adjoint a matrix
-// means to transpose and conjugate it) before multiplication by setting
-// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
-//
-// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
-// and `[..., r_y, c_y]`.
-//
-// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
-//
-//     r_o = c_x if adj_x else r_x
-//     c_o = r_y if adj_y else c_y
-//
-// It is computed as:
-//
-//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
+// A stack that produces elements in first-in last-out order.
 //
 // Arguments:
-//	x: 2-D or higher with shape `[..., r_x, c_x]`.
-//	y: 2-D or higher with shape `[..., r_y, c_y]`.
+//	max_size: The maximum size of the stack if non-negative. If negative, the stack
+// size is unlimited.
+//	elem_type: The type of the elements on the stack.
 //
-// Returns 3-D or higher with shape `[..., r_o, c_o]`
-func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
+// Returns The handle to the stack.
+func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"elem_type": elem_type}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BatchMatMul",
+		Type: "StackV2",
 		Input: []tf.Input{
-			x, y,
+			max_size,
 		},
 		Attrs: attrs,
 	}
@@ -22084,442 +22604,460 @@ func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMul
 	return op.Output(0)
 }
 
-// Computes softplus gradients for a softplus operation.
+// OrderedMapStageAttr is an optional argument to OrderedMapStage.
+type OrderedMapStageAttr func(optionalAttr)
+
+// OrderedMapStageCapacity sets the optional capacity attribute to value.
 //
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding softplus operation.
-//	features: The features passed as input to the corresponding softplus operation.
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
+// If not specified, defaults to 0
 //
-// Returns The gradients: `gradients / (1 + exp(-features))`.
-func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SoftplusGrad",
-		Input: []tf.Input{
-			gradients, features,
-		},
+// REQUIRES: value >= 0
+func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes softsign gradients for a softsign operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding softsign operation.
-//	features: The features passed as input to the corresponding softsign operation.
+// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Returns The gradients: `gradients / (1 + abs(features)) ** 2`.
-func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SoftsignGrad",
-		Input: []tf.Input{
-			gradients, features,
-		},
+// REQUIRES: value >= 0
+func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// DecodeBmpAttr is an optional argument to DecodeBmp.
-type DecodeBmpAttr func(optionalAttr)
-
-// DecodeBmpChannels sets the optional channels attribute to value.
-// If not specified, defaults to 0
-func DecodeBmpChannels(value int64) DecodeBmpAttr {
+// OrderedMapStageContainer sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
+// If not specified, defaults to ""
+func OrderedMapStageContainer(value string) OrderedMapStageAttr {
 	return func(m optionalAttr) {
-		m["channels"] = value
+		m["container"] = value
 	}
 }
 
-// Decode the first frame of a BMP-encoded image to a uint8 tensor.
+// OrderedMapStageSharedName sets the optional shared_name attribute to value.
 //
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
+// value: It is necessary to match this name to the matching Unstage Op.
+// If not specified, defaults to ""
+func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Stage (key, values) in the underlying container which behaves like a ordered
 //
-// Accepted values are:
+// associative container.   Elements are ordered by key.
 //
-// *   0: Use the number of channels in the BMP-encoded image.
-// *   3: output an RGB image.
-// *   4: output an RGBA image.
+// Arguments:
+//	key: int64
 //
-// Arguments:
-//	contents: 0-D.  The BMP-encoded image.
+//	values: a list of tensors
+// dtypes A list of data types that inserted values should adhere to.
 //
-// Returns 3-D with shape `[height, width, channels]`. RGB order
-func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (image tf.Output) {
+//
+// Returns the created operation.
+func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeBmp",
+		Type: "OrderedMapStage",
 		Input: []tf.Input{
-			contents,
+			key, indices, tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes softmax activations.
-//
-// For each batch `i` and class `j` we have
+// StackPushV2Attr is an optional argument to StackPushV2.
+type StackPushV2Attr func(optionalAttr)
+
+// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
 //
-//     softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))
+// value: Swap `elem` to CPU. Default to false.
+// If not specified, defaults to false
+func StackPushV2SwapMemory(value bool) StackPushV2Attr {
+	return func(m optionalAttr) {
+		m["swap_memory"] = value
+	}
+}
+
+// Push an element onto the stack.
 //
 // Arguments:
-//	logits: 2-D with shape `[batch_size, num_classes]`.
+//	handle: The handle to a stack.
+//	elem: The tensor to be pushed onto the stack.
 //
-// Returns Same shape as `logits`.
-func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) {
+// Returns The same tensor as the input 'elem'.
+func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Softmax",
+		Type: "StackPushV2",
 		Input: []tf.Input{
-			logits,
+			handle, elem,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
-type RandomShuffleQueueV2Attr func(optionalAttr)
-
-// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
-//
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shapes"] = value
-	}
-}
-
-// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
-//
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
-//
-// value: Dequeue will block unless there would be this
-// many elements after the dequeue or the queue is closed. This
-// ensures a minimum level of mixing of elements.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["min_after_dequeue"] = value
-	}
-}
+// FusedBatchNormGradV2Attr is an optional argument to FusedBatchNormGradV2.
+type FusedBatchNormGradV2Attr func(optionalAttr)
 
-// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
+// FusedBatchNormGradV2Epsilon sets the optional epsilon attribute to value.
 //
-// value: If either seed or seed2 is set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormGradV2Epsilon(value float32) FusedBatchNormGradV2Attr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["epsilon"] = value
 	}
 }
 
-// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
+// FusedBatchNormGradV2DataFormat sets the optional data_format attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
+// value: The data format for y_backprop, x, x_backprop.
+// Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormGradV2DataFormat(value string) FusedBatchNormGradV2Attr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// RandomShuffleQueueV2Container sets the optional container attribute to value.
+// FusedBatchNormGradV2IsTraining sets the optional is_training attribute to value.
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormGradV2IsTraining(value bool) FusedBatchNormGradV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["is_training"] = value
 	}
 }
 
-// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
+// Gradient for batch normalization.
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A queue that randomizes the order of elements.
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	component_types: The type of each component in a value.
+//	y_backprop: A 4D Tensor for the gradient with respect to y.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
+// mean to be reused in gradient computation. When is_training is
+// False, a 1D Tensor for the population mean to be reused in both
+// 1st and 2nd order gradient computation.
+//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
+// variance (inverted variance in the cuDNN case) to be reused in
+// gradient computation. When is_training is False, a 1D Tensor
+// for the population variance to be reused in both 1st and 2nd
+// order gradient computation.
 //
-// Returns The handle to the queue.
-func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
+// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
+// in FusedBatchNorm.
+func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradV2Attr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomShuffleQueueV2",
-
+		Type: "FusedBatchNormGradV2",
+		Input: []tf.Input{
+			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Outputs a `Summary` protocol buffer with scalar values.
+// Creates a TensorArray for storing the gradients of values in the given handle.
 //
-// The input `tags` and `values` must have the same shape.  The generated summary
-// has a summary value for each tag-value pair in `tags` and `values`.
+// If the given TensorArray gradient already exists, returns a reference to it.
 //
-// Arguments:
-//	tags: Tags for the summary.
-//	values: Same shape as `tags.  Values for the summary.
+// Locks the size of the original TensorArray by disabling its dynamic size flag.
 //
-// Returns Scalar.  Serialized `Summary` protocol buffer.
-func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ScalarSummary",
-		Input: []tf.Input{
-			tags, values,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the truth value of (x <= y) element-wise.
+// **A note about the input flow_in:**
 //
-// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// The handle flow_in forces the execution of the gradient lookup to occur
+// only after certain other operations have occurred.  For example, when
+// the forward TensorArray is dynamically sized, writes to this TensorArray
+// may resize the object.  The gradient TensorArray is statically sized based
+// on the size of the forward TensorArray when this operation executes.
+// Furthermore, the size of the forward TensorArray is frozen by this call.
+// As a result, the flow is used to ensure that the call to generate the gradient
+// TensorArray only happens after all writes are executed.
+//
+// In the case of dynamically sized TensorArrays, gradient computation should
+// only be performed on read operations that have themselves been chained via
+// flow to occur only after all writes have executed. That way the final size
+// of the forward TensorArray is known when this operation is called.
+//
+// **A note about the source attribute:**
+//
+// TensorArray gradient calls use an accumulator TensorArray object.  If
+// multiple gradients are calculated and run in the same session, the multiple
+// gradient nodes may accidentally flow through the same accumulator TensorArray.
+// This double counts and generally breaks the TensorArray gradient flow.
+//
+// The solution is to identify which gradient call this particular
+// TensorArray gradient is being called in.  This is performed by identifying
+// a unique string (e.g. "gradients", "gradients_1", ...) from the input
+// gradient Tensor's name.  This string is used as a suffix when creating
+// the TensorArray gradient object here (the attribute `source`).
+//
+// The attribute `source` is added as a suffix to the forward TensorArray's
+// name when performing the creation / lookup, so that each separate gradient
+// calculation gets its own TensorArray accumulator.
+//
+// Arguments:
+//	handle: The handle to the forward TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	source: The gradient source string, used to decide which gradient TensorArray
+// to return.
+func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "LessEqual",
+		Type: "TensorArrayGradV3",
 		Input: []tf.Input{
-			x, y,
+			handle, flow_in,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes log softmax activations.
+// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
+//
+// Each comparison returns a boolean `true` (if `input_value > threshold`)
+// or and `false` otherwise.
+//
+// This operation is useful for Locality-Sensitive-Hashing (LSH) and other
+// algorithms that use hashing approximations of cosine and `L2` distances;
+// codes can be generated from an input via:
+//
+// ```python
+// codebook_size = 50
+// codebook_bits = codebook_size * 32
+// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
+//                            dtype=x.dtype,
+//                            initializer=tf.orthogonal_initializer())
+// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
+// codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
+// # now codes has shape x.shape[:-1] + [codebook_size]
+// ```
 //
-// For each batch `i` and class `j` we have
+// **NOTE**: Currently, the innermost dimension of the tensor must be divisible
+// by 8.
 //
-//     logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
+// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
+// a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
 //
 // Arguments:
-//	logits: 2-D with shape `[batch_size, num_classes]`.
+//	input: Values to compare against `threshold` and bitpack.
+//	threshold: Threshold to compare against.
 //
-// Returns Same shape as `logits`.
-func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
+// Returns The bitpacked comparisons.
+func CompareAndBitpack(scope *Scope, input tf.Output, threshold tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogSoftmax",
+		Type: "CompareAndBitpack",
 		Input: []tf.Input{
-			logits,
+			input, threshold,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Given a quantized tensor described by (input, input_min, input_max), outputs a
-//
-// range that covers the actual values present in that tensor.  This op is
-// typically used to produce the requested_output_min and requested_output_max for
-// Requantize.
+// Push an element onto the tensor_array.
 //
 // Arguments:
+//	handle: The handle to a TensorArray.
+//	index: The position to write to inside the TensorArray.
+//	value: The tensor to write to the TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//
-// Returns The computed min output.the computed max output.
-func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) {
+// Returns A float scalar that enforces proper chaining of operations.
+func TensorArrayWriteV3(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RequantizationRange",
+		Type: "TensorArrayWriteV3",
 		Input: []tf.Input{
-			input, input_min, input_max,
+			handle, index, value, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Says whether the targets are in the top `K` predictions.
-//
-// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-// prediction for the target class is among the top `k` predictions among
-// all predictions for example `i`. Note that the behavior of `InTopK` differs
-// from the `TopK` op in its handling of ties; if multiple classes have the
-// same prediction value and straddle the top-`k` boundary, all of those
-// classes are considered to be in the top `k`.
-//
-// More formally, let
-//
-//   \\(predictions_i\\) be the predictions for all classes for example `i`,
-//   \\(targets_i\\) be the target class for example `i`,
-//   \\(out_i\\) be the output for example `i`,
+// Scatter the data from the input value into specific TensorArray elements.
 //
-// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+// `indices` must be a vector, its length must match the first dim of `value`.
 //
 // Arguments:
-//	predictions: A `batch_size` x `classes` tensor.
-//	targets: A `batch_size` vector of class ids.
-//	k: Number of top elements to look at for computing precision.
+//	handle: The handle to a TensorArray.
+//	indices: The locations at which to write the tensor elements.
+//	value: The concatenated tensor to write to the TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
 //
-// Returns Computed Precision at `k` as a `bool Tensor`.
-func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
+// Returns A float scalar that enforces proper chaining of operations.
+func TensorArrayScatterV3(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"k": k}
 	opspec := tf.OpSpec{
-		Type: "InTopK",
+		Type: "TensorArrayScatterV3",
 		Input: []tf.Input{
-			predictions, targets,
+			handle, indices, value, flow_in,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a batched diagonal tensor with a given batched diagonal values.
-//
-// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
-// everything else padded with zeros. The diagonal is computed as follows:
-//
-// Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a
-// tensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:
+// TensorArrayConcatV3Attr is an optional argument to TensorArrayConcatV3.
+type TensorArrayConcatV3Attr func(optionalAttr)
+
+// TensorArrayConcatV3ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
 //
-// `output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.
+// value: The expected shape of an element, if known,
+// excluding the first dimension. Used to validate the shapes of
+// TensorArray elements. If this shape is not fully specified, concatenating
+// zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayConcatV3ElementShapeExcept0(value tf.Shape) TensorArrayConcatV3Attr {
+	return func(m optionalAttr) {
+		m["element_shape_except0"] = value
+	}
+}
+
+// Concat the elements from the TensorArray into value `value`.
 //
-// For example:
+// Takes `T` elements of shapes
 //
-// ```
-// # 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]]
+//   ```
+//   (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...)
+//   ```
 //
-// and diagonal.shape = (2, 4)
+// and concatenates them into a Tensor of shape:
 //
-// tf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0]
-//                                      [0, 2, 0, 0]
-//                                      [0, 0, 3, 0]
-//                                      [0, 0, 0, 4]],
-//                                     [[5, 0, 0, 0]
-//                                      [0, 6, 0, 0]
-//                                      [0, 0, 7, 0]
-//                                      [0, 0, 0, 8]]]
+//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```
 //
-// which has shape (2, 4, 4)
-// ```
+// All elements must have the same shape (excepting the first dimension).
 //
 // Arguments:
-//	diagonal: Rank `k`, where `k >= 1`.
+//	handle: The handle to a TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
 //
-// Returns Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`.
-func MatrixDiag(scope *Scope, diagonal tf.Output) (output tf.Output) {
+// Returns All of the elements in the TensorArray, concatenated along the first
+// axis.A vector of the row sizes of the original T elements in the
+// value output.  In the example above, this would be the values:
+// `(n1, n2, ..., n(T-1))`.
+func TensorArrayConcatV3(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV3Attr) (value tf.Output, lengths tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MatrixDiag",
+		Type: "TensorArrayConcatV3",
 		Input: []tf.Input{
-			diagonal,
+			handle, flow_in,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// MaxPool3DAttr is an optional argument to MaxPool3D.
-type MaxPool3DAttr func(optionalAttr)
+// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
+type ParameterizedTruncatedNormalAttr func(optionalAttr)
 
-// MaxPool3DDataFormat sets the optional data_format attribute to value.
+// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DDataFormat(value string) MaxPool3DAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["seed"] = value
 	}
 }
 
-// Performs 3D max pooling on the input.
+// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a normal distribution. The parameters may each be a
+//
+// scalar which applies to the entire output, or a vector of length shape[0] which
+// stores the parameters for each batch.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
+//	means: The mean parameter of each batch.
+//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
+//	minvals: The minimum cutoff. May be -infinity.
+//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
+// for each batch.
 //
-// Returns The max pooled output tensor.
-func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
+// Returns A matrix of shape num_batches x samples_per_batch, filled with random
+// truncated normal values using the parameters for each row.
+func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3D",
+		Type: "ParameterizedTruncatedNormal",
 		Input: []tf.Input{
-			input,
+			shape, means, stdevs, minvals, maxvals,
 		},
 		Attrs: attrs,
 	}
@@ -22527,174 +23065,105 @@ func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, pa
 	return op.Output(0)
 }
 
-// Returns x // y element-wise.
-//
-// *NOTE*: `FloorDiv` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func FloorDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "FloorDiv",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TopKAttr is an optional argument to TopK.
-type TopKAttr func(optionalAttr)
-
-// TopKSorted sets the optional sorted attribute to value.
-//
-// value: If true the resulting `k` elements will be sorted by the values in
-// descending order.
-// If not specified, defaults to true
-func TopKSorted(value bool) TopKAttr {
-	return func(m optionalAttr) {
-		m["sorted"] = value
-	}
-}
-
-// Finds values and indices of the `k` largest elements for the last dimension.
-//
-// DEPRECATED at GraphDef version 7: Use TopKV2 instead
+// Returns a diagonal tensor with a given diagonal values.
 //
-// If the input is a vector (rank-1), finds the `k` largest entries in the vector
-// and outputs their values and indices as vectors.  Thus `values[j]` is the
-// `j`-th largest entry in `input`, and its index is `indices[j]`.
+// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
+// everything else padded with zeros. The diagonal is computed as follows:
 //
-// For matrices (resp. higher rank input), computes the top `k` entries in each
-// row (resp. vector along the last dimension).  Thus,
+// Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
+// rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
 //
-//     values.shape = indices.shape = input.shape[:-1] + [k]
+// `output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
 //
-// If two elements are equal, the lower-index element appears first.
+// For example:
 //
-// If `k` varies dynamically, use `TopKV2` below.
+// ```
+// # 'diagonal' is [1, 2, 3, 4]
+// tf.diag(diagonal) ==> [[1, 0, 0, 0]
+//                        [0, 2, 0, 0]
+//                        [0, 0, 3, 0]
+//                        [0, 0, 0, 4]]
+// ```
 //
 // Arguments:
-//	input: 1-D or higher with last dimension at least `k`.
-//	k: Number of top elements to look for along the last dimension (along each
-// row for matrices).
-//
-// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
-func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
+//	diagonal: Rank k tensor where k is at most 1.
+func Diag(scope *Scope, diagonal tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"k": k}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TopK",
+		Type: "Diag",
 		Input: []tf.Input{
-			input,
+			diagonal,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// TopKV2Attr is an optional argument to TopKV2.
-type TopKV2Attr func(optionalAttr)
-
-// TopKV2Sorted sets the optional sorted attribute to value.
+// Split the data from the input value into TensorArray elements.
 //
-// value: If true the resulting `k` elements will be sorted by the values in
-// descending order.
-// If not specified, defaults to true
-func TopKV2Sorted(value bool) TopKV2Attr {
-	return func(m optionalAttr) {
-		m["sorted"] = value
-	}
-}
-
-// Finds values and indices of the `k` largest elements for the last dimension.
+// Assuming that `lengths` takes on values
 //
-// If the input is a vector (rank-1), finds the `k` largest entries in the vector
-// and outputs their values and indices as vectors.  Thus `values[j]` is the
-// `j`-th largest entry in `input`, and its index is `indices[j]`.
+//   ```(n0, n1, ..., n(T-1))```
 //
-// For matrices (resp. higher rank input), computes the top `k` entries in each
-// row (resp. vector along the last dimension).  Thus,
+// and that `value` has shape
 //
-//     values.shape = indices.shape = input.shape[:-1] + [k]
+//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```,
 //
-// If two elements are equal, the lower-index element appears first.
+// this splits values into a TensorArray with T tensors.
+//
+// TensorArray index t will be the subtensor of values with starting position
+//
+//   ```(n0 + n1 + ... + n(t-1), 0, 0, ...)```
+//
+// and having size
+//
+//   ```nt x d0 x d1 x ...```
 //
 // Arguments:
-//	input: 1-D or higher with last dimension at least `k`.
-//	k: 0-D.  Number of top elements to look for along the last dimension (along each
-// row for matrices).
+//	handle: The handle to a TensorArray.
+//	value: The concatenated tensor to write to the TensorArray.
+//	lengths: The vector of lengths, how to split the rows of value into the
+// TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
 //
-// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
-func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr) (values tf.Output, indices tf.Output) {
+// Returns A float scalar that enforces proper chaining of operations.
+func TensorArraySplitV3(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TopKV2",
+		Type: "TensorArraySplitV3",
 		Input: []tf.Input{
-			input, k,
+			handle, value, lengths, flow_in,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// RandomCropAttr is an optional argument to RandomCrop.
-type RandomCropAttr func(optionalAttr)
-
-// RandomCropSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomCropSeed(value int64) RandomCropAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// SerializeSparseAttr is an optional argument to SerializeSparse.
+type SerializeSparseAttr func(optionalAttr)
 
-// RandomCropSeed2 sets the optional seed2 attribute to value.
+// SerializeSparseOutType sets the optional out_type attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomCropSeed2(value int64) RandomCropAttr {
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeSparseOutType(value tf.DataType) SerializeSparseAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["out_type"] = value
 	}
 }
 
-// Randomly crop `image`.
-//
-// DEPRECATED at GraphDef version 8: Random crop is now pure Python
-//
-// `size` is a 1-D int64 tensor with 2 elements representing the crop height and
-// width.  The values must be non negative.
-//
-// This Op picks a random location in `image` and crops a `height` by `width`
-// rectangle from that location.  The random location is picked so the cropped
-// area will fit inside the original image.
+// Serialize a `SparseTensor` into a `[3]` `Tensor` object.
 //
 // Arguments:
-//	image: 3-D of shape `[height, width, channels]`.
-//	size: 1-D of length 2 containing: `crop_height`, `crop_width`..
-//
-// Returns 3-D of shape `[crop_height, crop_width, channels].`
-func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...RandomCropAttr) (output tf.Output) {
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeSparseAttr) (serialized_sparse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22703,9 +23172,9 @@ func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...Rando
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomCrop",
+		Type: "SerializeSparse",
 		Input: []tf.Input{
-			image, size,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
@@ -22713,323 +23182,214 @@ func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...Rando
 	return op.Output(0)
 }
 
-// FractionalAvgPoolAttr is an optional argument to FractionalAvgPool.
-type FractionalAvgPoolAttr func(optionalAttr)
+// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
+type RandomShuffleQueueV2Attr func(optionalAttr)
 
-// FractionalAvgPoolPseudoRandom sets the optional pseudo_random attribute to value.
+// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
 //
-// value: When set to True, generates the pooling sequence in a
-// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
-// difference between pseudorandom and random.
-// If not specified, defaults to false
-func FractionalAvgPoolPseudoRandom(value bool) FractionalAvgPoolAttr {
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["pseudo_random"] = value
+		m["shapes"] = value
 	}
 }
 
-// FractionalAvgPoolOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
+// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [41/3, 26/3] for fractional avg pooling.
-// If not specified, defaults to false
-func FractionalAvgPoolOverlapping(value bool) FractionalAvgPoolAttr {
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["overlapping"] = value
+		m["capacity"] = value
 	}
 }
 
-// FractionalAvgPoolDeterministic sets the optional deterministic attribute to value.
+// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
 //
-// value: When set to True, a fixed pooling region will be used when
-// iterating over a FractionalAvgPool node in the computation graph. Mainly used
-// in unit test to make FractionalAvgPool deterministic.
-// If not specified, defaults to false
-func FractionalAvgPoolDeterministic(value bool) FractionalAvgPoolAttr {
+// value: Dequeue will block unless there would be this
+// many elements after the dequeue or the queue is closed. This
+// ensures a minimum level of mixing of elements.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["deterministic"] = value
+		m["min_after_dequeue"] = value
 	}
 }
 
-// FractionalAvgPoolSeed sets the optional seed attribute to value.
+// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
+// value: If either seed or seed2 is set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
 // If not specified, defaults to 0
-func FractionalAvgPoolSeed(value int64) FractionalAvgPoolAttr {
+func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
 		m["seed"] = value
 	}
 }
 
-// FractionalAvgPoolSeed2 sets the optional seed2 attribute to value.
+// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
 //
-// value: An second seed to avoid seed collision.
+// value: A second seed to avoid seed collision.
 // If not specified, defaults to 0
-func FractionalAvgPoolSeed2(value int64) FractionalAvgPoolAttr {
+func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
 		m["seed2"] = value
 	}
 }
 
-// Performs fractional average pooling on the input.
-//
-// Fractional average pooling is similar to Fractional max pooling in the pooling
-// region generation step. The only difference is that after pooling regions are
-// generated, a mean operation is performed instead of a max operation in each
-// pooling region.
-//
-// Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
-// supports row and col dimension and should be >= 1.0. For example, a valid
-// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
-// must be 1.0 because we don't allow pooling on batch and channels
-// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
-// respectively.
+// RandomShuffleQueueV2Container sets the optional container attribute to value.
 //
-// Returns output tensor after fractional avg pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
-func FractionalAvgPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalAvgPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FractionalAvgPool",
-		Input: []tf.Input{
-			value,
-		},
-		Attrs: attrs,
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
 }
-
-// Updates the table to associates keys with values.
-//
-// The tensor `keys` must be of the same type as the keys of the table.
-// The tensor `values` must be of the type of the table values.
-//
-// Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//	values: Values to associate with keys.
-//
-// Returns the created operation.
-func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LookupTableInsertV2",
-		Input: []tf.Input{
-			table_handle, keys, values,
-		},
+
+// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Produces the average pool of the input tensor for quantized types.
+// A queue that randomizes the order of elements.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, height, width, channels]`.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	ksize: The size of the window for each dimension of the input tensor.
-// The length must be 4 to match the number of dimensions of the input.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.  The length must be 4 to match the number of dimensions of the input.
-//	padding: The type of padding algorithm to use.
+//	component_types: The type of each component in a value.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedAvgPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// Returns The handle to the queue.
+func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedAvgPool",
-		Input: []tf.Input{
-			input, min_input, max_input,
-		},
+		Type: "RandomShuffleQueueV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Adds Tensor 'bias' to Tensor 'input' for Quantized types.
+// Draw bounding boxes on a batch of images.
 //
-// Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
+// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
+// boxes specified by the locations in `boxes`. The coordinates of the each
+// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
 //
-// Arguments:
+// For example, if an image is 100 x 200 pixels (height x width) and the bounding
+// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
+// the bounding box will be `(40, 10)` to `(100, 50)` (in (x,y) coordinates).
 //
-//	bias: A 1D bias Tensor with size matching the last dimension of 'input'.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_bias: The float value that the lowest quantized bias value represents.
-//	max_bias: The float value that the highest quantized bias value represents.
+// Parts of the bounding box may fall outside the image.
 //
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
+//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
+// boxes.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_bias tf.Output, max_bias tf.Output, out_type tf.DataType) (output tf.Output, min_out tf.Output, max_out tf.Output) {
+// Returns 4-D with the same shape as `images`. The batch of input images with
+// bounding boxes drawn on the images.
+func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "QuantizedBiasAdd",
+		Type: "DrawBoundingBoxes",
 		Input: []tf.Input{
-			input, bias, min_input, max_input, min_bias, max_bias,
+			images, boxes,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Creates summary database writer accessible by given resource handle.
-//
-// This can be used to write tensors from the execution graph directly
-// to a database. Only SQLite is supported right now. This function
-// will create the schema if it doesn't exist. Entries in the Users,
-// Experiments, and Runs tables will be created automatically if they
-// don't already exist.
-//
-// Arguments:
-//	writer: Handle to SummaryWriter resource to overwrite.
-//	db_uri: For example "file:/tmp/foo.sqlite".
-//	experiment_name: Can't contain ASCII control characters or <>. Case
-// sensitive. If empty, then the Run will not be associated with any
-// Experiment.
-//	run_name: Can't contain ASCII control characters or <>. Case sensitive.
-// If empty, then each Tag will not be associated with any Run.
-//	user_name: Must be valid as both a DNS label and Linux username. If
-// empty, then the Experiment will not be associated with any User.
+// LearnedUnigramCandidateSamplerAttr is an optional argument to LearnedUnigramCandidateSampler.
+type LearnedUnigramCandidateSamplerAttr func(optionalAttr)
+
+// LearnedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// Returns the created operation.
-func CreateSummaryDbWriter(scope *Scope, writer tf.Output, db_uri tf.Output, experiment_name tf.Output, run_name tf.Output, user_name tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "CreateSummaryDbWriter",
-		Input: []tf.Input{
-			writer, db_uri, experiment_name, run_name, user_name,
-		},
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func LearnedUnigramCandidateSamplerSeed(value int64) LearnedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
-type HistogramFixedWidthAttr func(optionalAttr)
-
-// HistogramFixedWidthDtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_INT32
-func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr {
+// LearnedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func LearnedUnigramCandidateSamplerSeed2(value int64) LearnedUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["seed2"] = value
 	}
 }
 
-// Return histogram of values.
+// Generates labels for candidate sampling with a learned unigram distribution.
 //
-// Given the tensor `values`, this operation returns a rank 1 histogram counting
-// the number of entries in `values` that fall into every bin.  The bins are
-// equal width and determined by the arguments `value_range` and `nbins`.
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
 //
-// ```python
-// # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
-// nbins = 5
-// value_range = [0.0, 5.0]
-// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
+// For each batch, this op picks a single set of sampled candidate labels.
 //
-// with tf.get_default_session() as sess:
-//   hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
-//   variables.global_variables_initializer().run()
-//   sess.run(hist) => [2, 1, 1, 0, 2]
-// ```
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	values: Numeric `Tensor`.
-//	value_range: Shape [2] `Tensor` of same `dtype` as `values`.
-// values <= value_range[0] will be mapped to hist[0],
-// values >= value_range[1] will be mapped to hist[-1].
-//	nbins: Scalar `int32 Tensor`.  Number of histogram bins.
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
 //
-// Returns A 1-D `Tensor` holding histogram of values.
-func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func LearnedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LearnedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "HistogramFixedWidth",
-		Input: []tf.Input{
-			values, value_range, nbins,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Quantized Batch normalization.
-//
-// This op is deprecated and will be removed in the future. Prefer
-// `tf.nn.batch_normalization`.
-//
-// Arguments:
-//	t: A 4D input Tensor.
-//	t_min: The value represented by the lowest quantized input.
-//	t_max: The value represented by the highest quantized input.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	m_min: The value represented by the lowest quantized mean.
-//	m_max: The value represented by the highest quantized mean.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v_min: The value represented by the lowest quantized variance.
-//	v_max: The value represented by the highest quantized variance.
-//	beta: A 1D beta Tensor with size matching the last dimension of t.
-// An offset to be added to the normalized tensor.
-//	beta_min: The value represented by the lowest quantized offset.
-//	beta_max: The value represented by the highest quantized offset.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this tensor will be multiplied
-// with the normalized tensor.
-//	gamma_min: The value represented by the lowest quantized gamma.
-//	gamma_max: The value represented by the highest quantized gamma.
-//
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min tf.Output, t_max tf.Output, m tf.Output, m_min tf.Output, m_max tf.Output, v tf.Output, v_min tf.Output, v_max tf.Output, beta tf.Output, beta_min tf.Output, beta_max tf.Output, gamma tf.Output, gamma_min tf.Output, gamma_max tf.Output, out_type tf.DataType, variance_epsilon float32, scale_after_normalization bool) (result tf.Output, result_min tf.Output, result_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"out_type": out_type, "variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
-	opspec := tf.OpSpec{
-		Type: "QuantizedBatchNormWithGlobalNormalization",
+		Type: "LearnedUnigramCandidateSampler",
 		Input: []tf.Input{
-			t, t_min, t_max, m, m_min, m_max, v, v_min, v_max, beta, beta_min, beta_max, gamma, gamma_min, gamma_max,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
@@ -23037,122 +23397,103 @@ func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Add all input tensors element wise.
-//
-// Arguments:
-//	inputs: Must all be the same size and shape.
-func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AddN",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MaxAttr is an optional argument to Max.
-type MaxAttr func(optionalAttr)
-
-// MaxKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MaxKeepDims(value bool) MaxAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the maximum of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Computes gradients for the scaled exponential linear (Selu) operation.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	gradients: The backpropagated gradients to the corresponding Selu operation.
+//	outputs: The outputs of the corresponding Selu operation.
 //
-// Returns The reduced tensor.
-func Max(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MaxAttr) (output tf.Output) {
+// Returns The gradients: `gradients * (outputs + scale * alpha)`
+// if outputs < 0, `scale * gradients` otherwise.
+func SeluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Max",
+		Type: "SeluGrad",
 		Input: []tf.Input{
-			input, reduction_indices,
+			gradients, outputs,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Cast x of type SrcT to y of DstT.
-func Cast(scope *Scope, x tf.Output, DstT tf.DataType) (y tf.Output) {
+// Get the current size of the TensorArray.
+//
+// Arguments:
+//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//
+// Returns The current size of the TensorArray.
+func TensorArraySizeV3(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"DstT": DstT}
 	opspec := tf.OpSpec{
-		Type: "Cast",
+		Type: "TensorArraySizeV3",
 		Input: []tf.Input{
-			x,
+			handle, flow_in,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of x AND y element-wise.
+// Deprecated. Use TensorArrayGradV3
 //
-// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayWriteV3
+func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogicalAnd",
+		Type: "TensorArrayWriteV2",
 		Input: []tf.Input{
-			x, y,
+			handle, index, value, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ComplexAbsAttr is an optional argument to ComplexAbs.
-type ComplexAbsAttr func(optionalAttr)
+// SparseReduceMaxAttr is an optional argument to SparseReduceMax.
+type SparseReduceMaxAttr func(optionalAttr)
 
-// ComplexAbsTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
+// SparseReduceMaxKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceMaxKeepDims(value bool) SparseReduceMaxAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Computes the complex absolute value of a tensor.
+// Computes the max of elements across dimensions of a SparseTensor.
 //
-// Given a tensor `x` of complex numbers, this operation returns a tensor of type
-// `float` or `double` that is the absolute value of each element in `x`. All
-// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
-// value is computed as \\( \sqrt{a^2 + b^2}\\).
-func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23161,9 +23502,9 @@ func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Out
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ComplexAbs",
+		Type: "SparseReduceMax",
 		Input: []tf.Input{
-			x,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
@@ -23171,373 +23512,448 @@ func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Out
 	return op.Output(0)
 }
 
-// Computes the reciprocal of x element-wise.
-//
-// DEPRECATED at GraphDef version 17: Use Reciprocal
+// AsStringAttr is an optional argument to AsString.
+type AsStringAttr func(optionalAttr)
+
+// AsStringPrecision sets the optional precision attribute to value.
 //
-// I.e., \\(y = 1 / x\\).
-func Inv(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Inv",
-		Input: []tf.Input{
-			x,
-		},
+// value: The post-decimal precision to use for floating point numbers.
+// Only used if precision > -1.
+// If not specified, defaults to -1
+func AsStringPrecision(value int64) AsStringAttr {
+	return func(m optionalAttr) {
+		m["precision"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// OrderedMapClearAttr is an optional argument to OrderedMapClear.
-type OrderedMapClearAttr func(optionalAttr)
-
-// OrderedMapClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// AsStringScientific sets the optional scientific attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapClearCapacity(value int64) OrderedMapClearAttr {
+// value: Use scientific notation for floating point numbers.
+// If not specified, defaults to false
+func AsStringScientific(value bool) AsStringAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["scientific"] = value
 	}
 }
 
-// OrderedMapClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// AsStringShortest sets the optional shortest attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapClearMemoryLimit(value int64) OrderedMapClearAttr {
+// value: Use shortest representation (either scientific or standard) for
+// floating point numbers.
+// If not specified, defaults to false
+func AsStringShortest(value bool) AsStringAttr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["shortest"] = value
 	}
 }
 
-// OrderedMapClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapClearContainer(value string) OrderedMapClearAttr {
+// AsStringWidth sets the optional width attribute to value.
+//
+// value: Pad pre-decimal numbers to this width.
+// Applies to both floating point and integer numbers.
+// Only used if width > -1.
+// If not specified, defaults to -1
+func AsStringWidth(value int64) AsStringAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["width"] = value
 	}
 }
 
-// OrderedMapClearSharedName sets the optional shared_name attribute to value.
+// AsStringFill sets the optional fill attribute to value.
+//
+// value: The value to pad if width > -1.  If empty, pads with spaces.
+// Another typical value is '0'.  String cannot be longer than 1 character.
 // If not specified, defaults to ""
-func OrderedMapClearSharedName(value string) OrderedMapClearAttr {
+func AsStringFill(value string) AsStringAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["fill"] = value
 	}
 }
 
-// Op removes all elements in the underlying container.
+// Converts each entry in the given tensor to strings.  Supports many numeric
 //
-// Returns the created operation.
-func OrderedMapClear(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapClearAttr) (o *tf.Operation) {
+// types and boolean.
+func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapClear",
-
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Returns the element-wise max of two SparseTensors.
-//
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
-//
-// Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
-//
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSparseMaximum",
-		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Computes the gradient for the inverse of `x` wrt its input.
-//
-// DEPRECATED at GraphDef version 17: Use ReciprocalGrad
-//
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "InvGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the reciprocal of x element-wise.
-//
-// I.e., \\(y = 1 / x\\).
-func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Reciprocal",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
-//
-// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
-// ](http://arxiv.org/abs/1511.07289)
-func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Elu",
+		Type: "AsString",
 		Input: []tf.Input{
-			features,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes square of x element-wise.
+// Deprecated. Use TensorArrayScatterV3
 //
-// I.e., \\(y = x * x = x^2\\).
-func Square(scope *Scope, x tf.Output) (y tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayScatterV3
+func TensorArrayScatterV2(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Square",
+		Type: "TensorArrayScatterV2",
 		Input: []tf.Input{
-			x,
+			handle, indices, value, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
+// Applies sparse addition to `input` using individual values or slices
 //
-// true, this follows Python semantics in that the result here is consistent
-// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
+// from `updates` according to indices `indices`.  The updates are non-aliasing:
+// `input` is only modified in-place if no other operations will use it.
+// Otherwise, a copy of `input` is made.  This operation has a gradient with
+// respect to both `input` and `updates`.
 //
-// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// `input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+//
+// `indices` must be integer tensor, containing indices into `input`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+//
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or `(P-K)`-dimensional slices
+// (if `K < P`) along the `K`th dimension of `input`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+//
+// ```
+// [d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].
+// ```
+//
+// For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
+// elements. In Python, that addition would look like this:
+//
+//     input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     output = tf.scatter_nd_non_aliasing_add(input, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(output))
+//
+// The resulting value `output` would look like this:
+//
+//     [1, 13, 3, 14, 14, 6, 7, 20]
+//
+// See @{tf.scatter_nd} for more details about how to make updates to slices.
+//
+// Arguments:
+//	input: A Tensor.
+//	indices: A Tensor. Must be one of the following types: `int32`, `int64`.
+// A tensor of indices into `input`.
+//	updates: A Tensor. Must have the same type as ref. A tensor of updated values
+// to add to `input`.
+//
+// Returns A `Tensor` with the same shape as `input`, containing values of `input`
+// updated with `updates`.
+func ScatterNdNonAliasingAdd(scope *Scope, input tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FloorMod",
+		Type: "ScatterNdNonAliasingAdd",
 		Input: []tf.Input{
-			x, y,
+			input, indices, updates,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes square root of x element-wise.
+// FractionalMaxPoolAttr is an optional argument to FractionalMaxPool.
+type FractionalMaxPoolAttr func(optionalAttr)
+
+// FractionalMaxPoolPseudoRandom sets the optional pseudo_random attribute to value.
 //
-// I.e., \\(y = \sqrt{x} = x^{1/2}\\).
-func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Sqrt",
-		Input: []tf.Input{
-			x,
-		},
+// value: When set to True, generates the pooling sequence in a
+// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
+// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
+// difference between pseudorandom and random.
+// If not specified, defaults to false
+func FractionalMaxPoolPseudoRandom(value bool) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["pseudo_random"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// MatrixInverseAttr is an optional argument to MatrixInverse.
-type MatrixInverseAttr func(optionalAttr)
+// FractionalMaxPoolOverlapping sets the optional overlapping attribute to value.
+//
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolOverlapping(value bool) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
+	}
+}
 
-// MatrixInverseAdjoint sets the optional adjoint attribute to value.
+// FractionalMaxPoolDeterministic sets the optional deterministic attribute to value.
+//
+// value: When set to True, a fixed pooling region will be used when
+// iterating over a FractionalMaxPool node in the computation graph. Mainly used
+// in unit test to make FractionalMaxPool deterministic.
 // If not specified, defaults to false
-func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
+func FractionalMaxPoolDeterministic(value bool) FractionalMaxPoolAttr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["deterministic"] = value
 	}
 }
 
-// Computes the inverse of one or more square invertible matrices or their
+// FractionalMaxPoolSeed sets the optional seed attribute to value.
 //
-// adjoints (conjugate transposes).
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func FractionalMaxPoolSeed(value int64) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// FractionalMaxPoolSeed2 sets the optional seed2 attribute to value.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the inverse for all input submatrices `[..., :, :]`.
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func FractionalMaxPoolSeed2(value int64) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Performs fractional max pooling on the input.
 //
-// The op uses LU decomposition with partial pivoting to compute the inverses.
+// Fractional max pooling is slightly different than regular max pooling.  In
+// regular max pooling, you downsize an input set by taking the maximum value of
+// smaller N x N subsections of the set (often 2x2), and try to reduce the set by
+// a factor of N, where N is an integer.  Fractional max pooling, as you might
+// expect from the word "fractional", means that the overall reduction ratio N
+// does not have to be an integer.
 //
-// If a matrix is not invertible there is no guarantee what the op does. It
-// may detect the condition and raise an exception or it may simply return a
-// garbage result.
+// The sizes of the pooling regions are generated randomly but are fairly uniform.
+// For example, let's look at the height dimension, and the constraints on the
+// list of rows that will be pool boundaries.
 //
-// Arguments:
-//	input: Shape is `[..., M, M]`.
+// First we define the following:
 //
-// Returns Shape is `[..., M, M]`.
+// 1.  input_row_length : the number of rows from the input set
+// 2.  output_row_length : which will be smaller than the input
+// 3.  alpha = input_row_length / output_row_length : our reduction ratio
+// 4.  K = floor(alpha)
+// 5.  row_pooling_sequence : this is the result list of pool boundary rows
 //
-// @compatibility(numpy)
-// Equivalent to np.linalg.inv
-// @end_compatibility
-func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
+// Then, row_pooling_sequence should satisfy:
+//
+// 1.  a[0] = 0 : the first value of the sequence is 0
+// 2.  a[end] = input_row_length : the last value of the sequence is the size
+// 3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
+// 4.  length(row_pooling_sequence) = output_row_length+1
+//
+// For more details on fractional max pooling, see this paper:
+// [Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+//
+// Arguments:
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
+// supports row and col dimension and should be >= 1.0. For example, a valid
+// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
+// must be 1.0 because we don't allow pooling on batch and channels
+// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
+// respectively.
+//
+// Returns output tensor after fractional max pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
+func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalMaxPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixInverse",
+		Type: "FractionalMaxPool",
 		Input: []tf.Input{
-			input,
+			value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the gradient for the sqrt of `x` wrt its input.
+// Deprecated. Use TensorArraySizeV3
 //
-// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArraySizeV3
+func TensorArraySizeV2(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SqrtGrad",
+		Type: "TensorArraySizeV2",
 		Input: []tf.Input{
-			y, dy,
+			handle, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inserts a dimension of 1 into a tensor's shape.
-//
-// Given a tensor `input`, this operation inserts a dimension of 1 at the
-// dimension index `dim` of `input`'s shape. The dimension index `dim` starts at
-// zero; if you specify a negative number for `dim` it is counted backward from
-// the end.
+// Conv2DAttr is an optional argument to Conv2D.
+type Conv2DAttr func(optionalAttr)
+
+// Conv2DUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["use_cudnn_on_gpu"] = value
+	}
+}
+
+// Conv2DDataFormat sets the optional data_format attribute to value.
 //
-// This operation is useful if you want to add a batch dimension to a single
-// element. For example, if you have a single image of shape `[height, width,
-// channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,
-// which will make the shape `[1, height, width, channels]`.
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func Conv2DDataFormat(value string) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv2DDilations sets the optional dilations attribute to value.
 //
-// Other examples:
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DDilations(value []int64) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes a 2-D convolution given 4-D `input` and `filter` tensors.
 //
-// ```
-// # 't' is a tensor of shape [2]
-// shape(expand_dims(t, 0)) ==> [1, 2]
-// shape(expand_dims(t, 1)) ==> [2, 1]
-// shape(expand_dims(t, -1)) ==> [2, 1]
+// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+// and a filter / kernel tensor of shape
+// `[filter_height, filter_width, in_channels, out_channels]`, this op
+// performs the following:
 //
-// # 't2' is a tensor of shape [2, 3, 5]
-// shape(expand_dims(t2, 0)) ==> [1, 2, 3, 5]
-// shape(expand_dims(t2, 2)) ==> [2, 3, 1, 5]
-// shape(expand_dims(t2, 3)) ==> [2, 3, 5, 1]
-// ```
+// 1. Flattens the filter to a 2-D matrix with shape
+//    `[filter_height * filter_width * in_channels, output_channels]`.
+// 2. Extracts image patches from the input tensor to form a *virtual*
+//    tensor of shape `[batch, out_height, out_width,
+//    filter_height * filter_width * in_channels]`.
+// 3. For each patch, right-multiplies the filter matrix and the image patch
+//    vector.
 //
-// This operation requires that:
+// In detail, with the default NHWC format,
 //
-// `-1-input.dims() <= dim <= input.dims()`
+//     output[b, i, j, k] =
+//         sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
+//                         filter[di, dj, q, k]
 //
-// This operation is related to `squeeze()`, which removes dimensions of
-// size 1.
+// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
 //
 // Arguments:
+//	input: A 4-D tensor. The dimension order is interpreted according to the value
+// of `data_format`, see below for details.
+//	filter: A 4-D tensor of shape
+// `[filter_height, filter_width, in_channels, out_channels]`
+//	strides: 1-D tensor of length 4.  The stride of the sliding window for each
+// dimension of `input`. The dimension order is determined by the value of
+// `data_format`, see below for details.
+//	padding: The type of padding algorithm to use.
 //
-//	dim: 0-D (scalar). Specifies the dimension index at which to
-// expand the shape of `input`. Must be in the range
-// `[-rank(input) - 1, rank(input)]`.
-//
-// Returns Contains the same data as `input`, but its shape has an additional
-// dimension of size 1 added.
-func ExpandDims(scope *Scope, input tf.Output, dim tf.Output) (output tf.Output) {
+// Returns A 4-D tensor. The dimension order is determined by the value of
+// `data_format`, see below for details.
+func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv2DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ExpandDims",
+		Type: "Conv2D",
 		Input: []tf.Input{
-			input, dim,
+			input, filter,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AllAttr is an optional argument to All.
-type AllAttr func(optionalAttr)
+// FakeQuantWithMinMaxArgsAttr is an optional argument to FakeQuantWithMinMaxArgs.
+type FakeQuantWithMinMaxArgsAttr func(optionalAttr)
 
-// AllKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
+// FakeQuantWithMinMaxArgsMin sets the optional min attribute to value.
+// If not specified, defaults to -6
+func FakeQuantWithMinMaxArgsMin(value float32) FakeQuantWithMinMaxArgsAttr {
+	return func(m optionalAttr) {
+		m["min"] = value
+	}
+}
+
+// FakeQuantWithMinMaxArgsMax sets the optional max attribute to value.
+// If not specified, defaults to 6
+func FakeQuantWithMinMaxArgsMax(value float32) FakeQuantWithMinMaxArgsAttr {
+	return func(m optionalAttr) {
+		m["max"] = value
+	}
+}
+
+// FakeQuantWithMinMaxArgsNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxArgsNumBits(value int64) FakeQuantWithMinMaxArgsAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// FakeQuantWithMinMaxArgsNarrowRange sets the optional narrow_range attribute to value.
 // If not specified, defaults to false
-func AllKeepDims(value bool) AllAttr {
+func FakeQuantWithMinMaxArgsNarrowRange(value bool) FakeQuantWithMinMaxArgsAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["narrow_range"] = value
 	}
 }
 
-// Computes the "logical and" of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Fake-quantize the 'inputs' tensor, type float to 'outputs' tensor of same type.
 //
-// Arguments:
-//	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+// Attributes `[min; max]` define the clamping range for the `inputs` data.
+// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
+// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
+// then de-quantized and output as floats in `[min; max]` interval.
+// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
 //
-// Returns The reduced tensor.
-func All(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...AllAttr) (output tf.Output) {
+// Quantization is called fake since the output is still in floating point.
+func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsAttr) (outputs tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23546,9 +23962,9 @@ func All(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ..
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "All",
+		Type: "FakeQuantWithMinMaxArgs",
 		Input: []tf.Input{
-			input, reduction_indices,
+			inputs,
 		},
 		Attrs: attrs,
 	}
@@ -23556,277 +23972,212 @@ func All(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ..
 	return op.Output(0)
 }
 
-// CTCBeamSearchDecoderAttr is an optional argument to CTCBeamSearchDecoder.
-type CTCBeamSearchDecoderAttr func(optionalAttr)
+// StageAttr is an optional argument to Stage.
+type StageAttr func(optionalAttr)
 
-// CTCBeamSearchDecoderMergeRepeated sets the optional merge_repeated attribute to value.
+// StageCapacity sets the optional capacity attribute to value.
 //
-// value: If true, merge repeated classes in output.
-// If not specified, defaults to true
-func CTCBeamSearchDecoderMergeRepeated(value bool) CTCBeamSearchDecoderAttr {
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageCapacity(value int64) StageAttr {
 	return func(m optionalAttr) {
-		m["merge_repeated"] = value
+		m["capacity"] = value
 	}
 }
 
-// Performs beam search decoding on the logits given in input.
-//
-// A note about the attribute merge_repeated: For the beam search decoder,
-// this means that if consecutive entries in a beam are the same, only
-// the first of these is emitted.  That is, when the top path is "A B B B B",
-// "A B" is returned if merge_repeated = True but "A B B B B" is
-// returned if merge_repeated = False.
+// StageMemoryLimit sets the optional memory_limit attribute to value.
 //
-// Arguments:
-//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-//	sequence_length: A vector containing sequence lengths, size `(batch)`.
-//	beam_width: A scalar >= 0 (beam search beam width).
-//	top_paths: A scalar >= 0, <= beam_width (controls output size).
+// value: The maximum number of bytes allowed for Tensors in the Staging Area.
+// If > 0, inserts will block until sufficient space is available.
+// If not specified, defaults to 0
 //
-// Returns A list (length: top_paths) of indices matrices.  Matrix j,
-// size `(total_decoded_outputs[j] x 2)`, has indices of a
-// `SparseTensor<int64, 2>`.  The rows store: [batch, time].A list (length: top_paths) of values vectors.  Vector j,
-// size `(length total_decoded_outputs[j])`, has the values of a
-// `SparseTensor<int64, 2>`.  The vector stores the decoded classes for beam j.A list (length: top_paths) of shape vector.  Vector j,
-// size `(2)`, stores the shape of the decoded `SparseTensor[j]`.
-// Its values are: `[batch_size, max_decoded_length[j]]`.A matrix, shaped: `(batch_size x top_paths)`.  The
-// sequence log-probabilities.
-func CTCBeamSearchDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, beam_width int64, top_paths int64, optional ...CTCBeamSearchDecoderAttr) (decoded_indices []tf.Output, decoded_values []tf.Output, decoded_shape []tf.Output, log_probability tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"beam_width": beam_width, "top_paths": top_paths}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "CTCBeamSearchDecoder",
-		Input: []tf.Input{
-			inputs, sequence_length,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if decoded_indices, idx, err = makeOutputList(op, idx, "decoded_indices"); err != nil {
-		scope.UpdateErr("CTCBeamSearchDecoder", err)
-		return
+// REQUIRES: value >= 0
+func StageMemoryLimit(value int64) StageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	if decoded_values, idx, err = makeOutputList(op, idx, "decoded_values"); err != nil {
-		scope.UpdateErr("CTCBeamSearchDecoder", err)
-		return
+}
+
+// StageContainer sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
+// If not specified, defaults to ""
+func StageContainer(value string) StageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	if decoded_shape, idx, err = makeOutputList(op, idx, "decoded_shape"); err != nil {
-		scope.UpdateErr("CTCBeamSearchDecoder", err)
-		return
+}
+
+// StageSharedName sets the optional shared_name attribute to value.
+//
+// value: It is necessary to match this name to the matching Unstage Op.
+// If not specified, defaults to ""
+func StageSharedName(value string) StageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	log_probability = op.Output(idx)
-	return decoded_indices, decoded_values, decoded_shape, log_probability
 }
 
-// Computes reciprocal of square root of x element-wise.
+// Stage values similar to a lightweight Enqueue.
 //
-// I.e., \\(y = 1 / \sqrt{x}\\).
-func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
+// The basic functionality of this Op is similar to a queue with many
+// fewer capabilities and options.  This Op is optimized for performance.
+//
+// Arguments:
+//	values: a list of tensors
+// dtypes A list of data types that inserted values should adhere to.
+//
+// Returns the created operation.
+func Stage(scope *Scope, values []tf.Output, optional ...StageAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Rsqrt",
+		Type: "Stage",
 		Input: []tf.Input{
-			x,
+			tf.OutputList(values),
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// RecordInputAttr is an optional argument to RecordInput.
-type RecordInputAttr func(optionalAttr)
+// StagePeekAttr is an optional argument to StagePeek.
+type StagePeekAttr func(optionalAttr)
 
-// RecordInputFileRandomSeed sets the optional file_random_seed attribute to value.
+// StagePeekCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: Random seeds used to produce randomized records.
-// If not specified, defaults to 301
-func RecordInputFileRandomSeed(value int64) RecordInputAttr {
+// REQUIRES: value >= 0
+func StagePeekCapacity(value int64) StagePeekAttr {
 	return func(m optionalAttr) {
-		m["file_random_seed"] = value
+		m["capacity"] = value
 	}
 }
 
-// RecordInputFileShuffleShiftRatio sets the optional file_shuffle_shift_ratio attribute to value.
-//
-// value: Shifts the list of files after the list is randomly
-// shuffled.
+// StagePeekMemoryLimit sets the optional memory_limit attribute to value.
 // If not specified, defaults to 0
-func RecordInputFileShuffleShiftRatio(value float32) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["file_shuffle_shift_ratio"] = value
-	}
-}
-
-// RecordInputFileBufferSize sets the optional file_buffer_size attribute to value.
 //
-// value: The randomization shuffling buffer.
-// If not specified, defaults to 10000
-func RecordInputFileBufferSize(value int64) RecordInputAttr {
+// REQUIRES: value >= 0
+func StagePeekMemoryLimit(value int64) StagePeekAttr {
 	return func(m optionalAttr) {
-		m["file_buffer_size"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// RecordInputFileParallelism sets the optional file_parallelism attribute to value.
-//
-// value: How many sstables are opened and concurrently iterated over.
-// If not specified, defaults to 16
-func RecordInputFileParallelism(value int64) RecordInputAttr {
+// StagePeekContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StagePeekContainer(value string) StagePeekAttr {
 	return func(m optionalAttr) {
-		m["file_parallelism"] = value
+		m["container"] = value
 	}
 }
 
-// RecordInputBatchSize sets the optional batch_size attribute to value.
-//
-// value: The batch size.
-// If not specified, defaults to 32
-func RecordInputBatchSize(value int64) RecordInputAttr {
+// StagePeekSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StagePeekSharedName(value string) StagePeekAttr {
 	return func(m optionalAttr) {
-		m["batch_size"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Emits randomized records.
-//
-// Arguments:
-//	file_pattern: Glob pattern for the data files.
+// Op peeks at the values at the specified index.  If the
 //
-// Returns A tensor of shape [batch_size].
-func RecordInput(scope *Scope, file_pattern string, optional ...RecordInputAttr) (records tf.Output) {
+// underlying container does not contain sufficient elements
+// this op will block until it does.   This Op is optimized for
+// performance.
+func StagePeek(scope *Scope, index tf.Output, dtypes []tf.DataType, optional ...StagePeekAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"file_pattern": file_pattern}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RecordInput",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Rounds the values of a tensor to the nearest integer, element-wise.
-//
-// Rounds half to even.  Also known as bankers rounding. If you want to round
-// according to the current system rounding mode use std::cint.
-func Round(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Round",
+		Type: "StagePeek",
 		Input: []tf.Input{
-			x,
+			index,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Generates values in an interval.
-//
-// A sequence of `num` evenly-spaced values are generated beginning at `start`.
-// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
-// so that the last one is exactly `stop`.
-//
-// For example:
-//
-// ```
-// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
-// ```
-//
-// Arguments:
-//	start: First entry in the range.
-//	stop: Last entry in the range.
-//	num: Number of values to generate.
-//
-// Returns 1-D. The generated values.
-func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "LinSpace",
-		Input: []tf.Input{
-			start, stop, num,
-		},
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("StagePeek", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return values
 }
 
-// Computes natural logarithm of x element-wise.
+// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2.
+type Conv3DBackpropInputV2Attr func(optionalAttr)
+
+// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value.
 //
-// I.e., \\(y = \log_e x\\).
-func Log(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Log",
-		Input: []tf.Input{
-			x,
-		},
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ResizeBicubicAttr is an optional argument to ResizeBicubic.
-type ResizeBicubicAttr func(optionalAttr)
-
-// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
+// Conv3DBackpropInputV2Dilations sets the optional dilations attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["dilations"] = value
 	}
 }
 
-// Resize `images` to `size` using bicubic interpolation.
-//
-// Input images can be of different types but output images are always float.
+// Computes the gradients of 3-D convolution with respect to the input.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
+//	input_sizes: An integer vector representing the tensor shape of `input`,
+// where `input` is a 5-D
+// `[batch, depth, rows, cols, in_channels]` tensor.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBicubic",
+		Type: "Conv3DBackpropInputV2",
 		Input: []tf.Input{
-			images, size,
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -23834,56 +24185,124 @@ func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...R
 	return op.Output(0)
 }
 
-// Computes rectified linear 6 gradients for a Relu6 operation.
+// DepthToSpaceAttr is an optional argument to DepthToSpace.
+type DepthToSpaceAttr func(optionalAttr)
+
+// DepthToSpaceDataFormat sets the optional data_format attribute to value.
+// If not specified, defaults to "NHWC"
+func DepthToSpaceDataFormat(value string) DepthToSpaceAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// DepthToSpace for tensors of type T.
+//
+// Rearranges data from depth into blocks of spatial data.
+// This is the reverse transformation of SpaceToDepth. More specifically,
+// this op outputs a copy of the input tensor where values from the `depth`
+// dimension are moved in spatial blocks to the `height` and `width` dimensions.
+// The attr `block_size` indicates the input block size and how the data is moved.
+//
+//   * Chunks of data of size `block_size * block_size` from depth are rearranged
+//     into non-overlapping blocks of size `block_size x block_size`
+//   * The width the output tensor is `input_depth * block_size`, whereas the
+//     height is `input_height * block_size`.
+//   * The Y, X coordinates within each block of the output image are determined
+//     by the high order component of the input channel index.
+//   * The depth of the input tensor must be divisible by
+//     `block_size * block_size`.
+//
+// The `data_format` attr specifies the layout of the input and output tensors
+// with the following options:
+//   "NHWC": `[ batch, height, width, channels ]`
+//   "NCHW": `[ batch, channels, height, width ]`
+//   "NCHW_VECT_C":
+//       `qint8 [ batch, channels / 4, height, width, 4 ]`
+//
+// It is useful to consider the operation as transforming a 6-D Tensor.
+// e.g. for data_format = NHWC,
+//      Each element in the input tensor can be specified via 6 coordinates,
+//      ordered by decreasing memory layout significance as:
+//      n,iY,iX,bY,bX,oC  (where n=batch index, iX, iY means X or Y coordinates
+//                         within the input image, bX, bY means coordinates
+//                         within the output block, oC means output channels).
+//      The output would be the input transposed to the following layout:
+//      n,iY,bY,iX,bX,oC
+//
+// This operation is useful for resizing the activations between convolutions
+// (but keeping all data), e.g. instead of pooling. It is also useful for training
+// purely convolutional models.
+//
+// For example, given an input of shape `[1, 1, 1, 4]`, data_format = "NHWC" and
+// block_size = 2:
+//
+// ```
+// x = [[[[1, 2, 3, 4]]]]
+//
+// ```
+//
+// This operation will output a tensor of shape `[1, 2, 2, 1]`:
+//
+// ```
+//    [[[[1], [2]],
+//      [[3], [4]]]]
+// ```
+//
+// Here, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`,
+// the corresponding output will have 2x2 elements and will have a depth of
+// 1 channel (1 = `4 / (block_size * block_size)`).
+// The output element shape is `[2, 2, 1]`.
+//
+// For an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.
+//
+// ```
+// x = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
+// ```
+//
+// This operation, for block size of 2, will return the following tensor of shape
+// `[1, 2, 2, 3]`
+//
+// ```
+//    [[[[1, 2, 3], [4, 5, 6]],
+//      [[7, 8, 9], [10, 11, 12]]]]
+//
+// ```
+//
+// Similarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:
+//
+// ```
+// x =  [[[[1, 2, 3, 4],
+//        [5, 6, 7, 8]],
+//       [[9, 10, 11, 12],
+//        [13, 14, 15, 16]]]]
+// ```
+//
+// the operator will return the following tensor of shape `[1 4 4 1]`:
+//
+// ```
+// x = [[[ [1],   [2],  [5],  [6]],
+//       [ [3],   [4],  [7],  [8]],
+//       [ [9],  [10], [13],  [14]],
+//       [ [11], [12], [15],  [16]]]]
+//
+// ```
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
-//	features: The features passed as input to the corresponding Relu6 operation, or
-// its output; using either one produces the same result.
-//
-// Returns The gradients:
-// `gradients * (features > 0) * (features < 6)`.
-func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Relu6Grad",
-		Input: []tf.Input{
-			gradients, features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes natural logarithm of (1 + x) element-wise.
 //
-// I.e., \\(y = \log_e (1 + x)\\).
-func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
+//	block_size: The size of the spatial block, same as in Space2Depth.
+func DepthToSpace(scope *Scope, input tf.Output, block_size int64, optional ...DepthToSpaceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Log1p",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that emits each dim-0 slice of `components` once.
-func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{"block_size": block_size}
+	for _, a := range optional {
+		a(attrs)
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorSliceDataset",
+		Type: "DepthToSpace",
 		Input: []tf.Input{
-			tf.OutputList(components),
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -23891,79 +24310,64 @@ func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf
 	return op.Output(0)
 }
 
-// Computes tan of x element-wise.
-func Tan(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Tan",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes hyperbolic cosine of x element-wise.
-func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Cosh",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MapClearAttr is an optional argument to MapClear.
-type MapClearAttr func(optionalAttr)
+// MapStageAttr is an optional argument to MapStage.
+type MapStageAttr func(optionalAttr)
 
-// MapClearCapacity sets the optional capacity attribute to value.
+// MapStageCapacity sets the optional capacity attribute to value.
+//
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func MapClearCapacity(value int64) MapClearAttr {
+func MapStageCapacity(value int64) MapStageAttr {
 	return func(m optionalAttr) {
 		m["capacity"] = value
 	}
 }
 
-// MapClearMemoryLimit sets the optional memory_limit attribute to value.
+// MapStageMemoryLimit sets the optional memory_limit attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func MapClearMemoryLimit(value int64) MapClearAttr {
+func MapStageMemoryLimit(value int64) MapStageAttr {
 	return func(m optionalAttr) {
 		m["memory_limit"] = value
 	}
 }
 
-// MapClearContainer sets the optional container attribute to value.
+// MapStageContainer sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
 // If not specified, defaults to ""
-func MapClearContainer(value string) MapClearAttr {
+func MapStageContainer(value string) MapStageAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// MapClearSharedName sets the optional shared_name attribute to value.
+// MapStageSharedName sets the optional shared_name attribute to value.
+//
+// value: It is necessary to match this name to the matching Unstage Op.
 // If not specified, defaults to ""
-func MapClearSharedName(value string) MapClearAttr {
+func MapStageSharedName(value string) MapStageAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Op removes all elements in the underlying container.
+// Stage (key, values) in the underlying container which behaves like a hashtable.
+//
+// Arguments:
+//	key: int64
+//
+//	values: a list of tensors
+// dtypes A list of data types that inserted values should adhere to.
+//
 //
 // Returns the created operation.
-func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
+func MapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...MapStageAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23972,639 +24376,545 @@ func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapClear",
-
+		Type: "MapStage",
+		Input: []tf.Input{
+			key, indices, tf.OutputList(values),
+		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// TensorArrayV2Attr is an optional argument to TensorArrayV2.
-type TensorArrayV2Attr func(optionalAttr)
+// MapUnstageAttr is an optional argument to MapUnstage.
+type MapUnstageAttr func(optionalAttr)
 
-// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
+// MapUnstageCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapUnstageCapacity(value int64) MapUnstageAttr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["capacity"] = value
 	}
 }
 
-// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
-// If not specified, defaults to false
-func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
+// MapUnstageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapUnstageMemoryLimit(value int64) MapUnstageAttr {
 	return func(m optionalAttr) {
-		m["dynamic_size"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
-// If not specified, defaults to true
-func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
+// MapUnstageContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapUnstageContainer(value string) MapUnstageAttr {
 	return func(m optionalAttr) {
-		m["clear_after_read"] = value
+		m["container"] = value
 	}
 }
 
-// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
+// MapUnstageSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
+func MapUnstageSharedName(value string) MapUnstageAttr {
 	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Deprecated. Use TensorArrayV3
-func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
+// Op removes and returns the values associated with the key
+//
+// from the underlying container.   If the underlying container
+// does not contain this key, the op will block until it does.
+func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV2",
+		Type: "MapUnstage",
 		Input: []tf.Input{
-			size,
+			key, indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` string `Tensor`.
-//
-// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
-// is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The serialized
-// `SparseTensor` objects going into each row of `serialized_sparse` will have
-// rank `R-1`.
-//
-// The minibatch size `N` is extracted from `sparse_shape[0]`.
-//
-// Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) (serialized_sparse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "SerializeManySparse",
-		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
-		},
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapUnstage", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return values
 }
 
-// Computes inverse hyperbolic cosine of x element-wise.
-func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Acosh",
-		Input: []tf.Input{
-			x,
-		},
+// MapSizeAttr is an optional argument to MapSize.
+type MapSizeAttr func(optionalAttr)
+
+// MapSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapSizeCapacity(value int64) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
-//
-// For an explanation see "Differentiation of the Cholesky algorithm" by
-// Iain Murray http://arxiv.org/abs/1602.07527.
-//
-// Arguments:
-//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
-//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
+// MapSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
-func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// REQUIRES: value >= 0
+func MapSizeMemoryLimit(value int64) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "CholeskyGrad",
-		Input: []tf.Input{
-			l, grad,
-		},
+}
+
+// MapSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapSizeContainer(value string) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes inverse hyperbolic tangent of x element-wise.
-func Atanh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Atanh",
-		Input: []tf.Input{
-			x,
-		},
+// MapSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapSizeSharedName(value string) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the log of the absolute value of `Gamma(x)` element-wise.
-func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
+// Op returns the number of elements in the underlying container.
+func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Lgamma",
-		Input: []tf.Input{
-			x,
-		},
+		Type: "MapSize",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x / y element-wise for real types.
-//
-// If `x` and `y` are reals, this will return the floating-point division.
+// MapIncompleteSizeAttr is an optional argument to MapIncompleteSize.
+type MapIncompleteSizeAttr func(optionalAttr)
+
+// MapIncompleteSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RealDiv",
-		Input: []tf.Input{
-			x, y,
-		},
+// REQUIRES: value >= 0
+func MapIncompleteSizeCapacity(value int64) MapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the number of work units this Reader has finished processing.
+// MapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderNumWorkUnitsCompletedV2",
-		Input: []tf.Input{
-			reader_handle,
-		},
+// REQUIRES: value >= 0
+func MapIncompleteSizeMemoryLimit(value int64) MapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
-type Conv2DBackpropFilterAttr func(optionalAttr)
-
-// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
+// MapIncompleteSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapIncompleteSizeContainer(value string) MapIncompleteSizeAttr {
 	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
+		m["container"] = value
 	}
 }
 
-// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
+// MapIncompleteSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapIncompleteSizeSharedName(value string) MapIncompleteSizeAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Computes the gradients of convolution with respect to the filter.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, out_channels]` tensor.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
-//	padding: The type of padding algorithm to use.
-//
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
+// Op returns the number of incomplete elements in the underlying container.
+func MapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...MapIncompleteSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropFilter",
-		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
-		},
+		Type: "MapIncompleteSize",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MinAttr is an optional argument to Min.
-type MinAttr func(optionalAttr)
+// OrderedMapUnstageAttr is an optional argument to OrderedMapUnstage.
+type OrderedMapUnstageAttr func(optionalAttr)
 
-// MinKeepDims sets the optional keep_dims attribute to value.
+// OrderedMapUnstageCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MinKeepDims(value bool) MinAttr {
+// REQUIRES: value >= 0
+func OrderedMapUnstageCapacity(value int64) OrderedMapUnstageAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["capacity"] = value
 	}
 }
 
-// Computes the minimum of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// OrderedMapUnstageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Arguments:
-//	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+// REQUIRES: value >= 0
+func OrderedMapUnstageMemoryLimit(value int64) OrderedMapUnstageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapUnstageContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapUnstageContainer(value string) OrderedMapUnstageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapUnstageSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapUnstageSharedName(value string) OrderedMapUnstageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns the values associated with the key
 //
-// Returns The reduced tensor.
-func Min(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MinAttr) (output tf.Output) {
+// from the underlying container.   If the underlying container
+// does not contain this key, the op will block until it does.
+func OrderedMapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Min",
+		Type: "OrderedMapUnstage",
 		Input: []tf.Input{
-			input, reduction_indices,
+			key, indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("OrderedMapUnstage", err)
+		return
+	}
+	return values
 }
 
-// Computes Psi, the derivative of Lgamma (the log of the absolute value of
+// OrderedMapSizeAttr is an optional argument to OrderedMapSize.
+type OrderedMapSizeAttr func(optionalAttr)
+
+// OrderedMapSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// `Gamma(x)`), element-wise.
-func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+// REQUIRES: value >= 0
+func OrderedMapSizeCapacity(value int64) OrderedMapSizeAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Digamma",
-		Input: []tf.Input{
-			x,
-		},
+}
+
+// OrderedMapSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapSizeMemoryLimit(value int64) OrderedMapSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns element-wise largest integer not greater than x.
-func Floor(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+// OrderedMapSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapSizeContainer(value string) OrderedMapSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Floor",
-		Input: []tf.Input{
-			x,
-		},
+}
+
+// OrderedMapSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapSizeSharedName(value string) OrderedMapSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the Gauss error function of `x` element-wise.
-func Erf(scope *Scope, x tf.Output) (y tf.Output) {
+// Op returns the number of elements in the underlying container.
+func OrderedMapSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Erf",
-		Input: []tf.Input{
-			x,
-		},
+		Type: "OrderedMapSize",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Gather slices from `params` axis `axis` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `params.shape[:axis] + indices.shape +
-// params.shape[axis + 1:]` where:
+// CTCLossAttr is an optional argument to CTCLoss.
+type CTCLossAttr func(optionalAttr)
+
+// CTCLossPreprocessCollapseRepeated sets the optional preprocess_collapse_repeated attribute to value.
 //
-// ```python
-//     # Scalar indices (output is rank(params) - 1).
-//     output[a_0, ..., a_n, b_0, ..., b_n] =
-//       params[a_0, ..., a_n, indices, b_0, ..., b_n]
+// value: Scalar, if true then repeated labels are
+// collapsed prior to the CTC calculation.
+// If not specified, defaults to false
+func CTCLossPreprocessCollapseRepeated(value bool) CTCLossAttr {
+	return func(m optionalAttr) {
+		m["preprocess_collapse_repeated"] = value
+	}
+}
+
+// CTCLossCtcMergeRepeated sets the optional ctc_merge_repeated attribute to value.
 //
-//     # Vector indices (output is rank(params)).
-//     output[a_0, ..., a_n, i, b_0, ..., b_n] =
-//       params[a_0, ..., a_n, indices[i], b_0, ..., b_n]
+// value: Scalar.  If set to false, *during* CTC calculation
+// repeated non-blank labels will not be merged and are interpreted as
+// individual labels.  This is a simplified version of CTC.
+// If not specified, defaults to true
+func CTCLossCtcMergeRepeated(value bool) CTCLossAttr {
+	return func(m optionalAttr) {
+		m["ctc_merge_repeated"] = value
+	}
+}
+
+// CTCLossIgnoreLongerOutputsThanInputs sets the optional ignore_longer_outputs_than_inputs attribute to value.
 //
-//     # Higher rank indices (output is rank(params) + rank(indices) - 1).
-//     output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =
-//       params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]
-// ```
+// value: Scalar. If set to true, during CTC
+// calculation, items that have longer output sequences than input sequences
+// are skipped: they don't contribute to the loss term and have zero-gradient.
+// If not specified, defaults to false
+func CTCLossIgnoreLongerOutputsThanInputs(value bool) CTCLossAttr {
+	return func(m optionalAttr) {
+		m["ignore_longer_outputs_than_inputs"] = value
+	}
+}
+
+// Calculates the CTC Loss (log probability) for each batch entry.  Also calculates
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
-// </div>
+// the gradient.  This class performs the softmax operation for you, so inputs
+// should be e.g. linear projections of outputs by an LSTM.
 //
 // Arguments:
-//	params: The tensor from which to gather values. Must be at least rank
-// `axis + 1`.
-//	indices: Index tensor. Must be in range `[0, params.shape[axis])`.
-//	axis: The axis in `params` to gather `indices` from. Defaults to the first
-// dimension. Supports negative indexes.
+//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+//	labels_indices: The indices of a `SparseTensor<int32, 2>`.
+// `labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for
+// `(batch b, time t)`.
+//	labels_values: The values (labels) associated with the given batch and time.
+//	sequence_length: A vector containing sequence lengths (batch).
 //
-// Returns Values from `params` gathered from indices given by `indices`, with
-// shape `params.shape[:axis] + indices.shape + params.shape[axis + 1:]`.
-func GatherV2(scope *Scope, params tf.Output, indices tf.Output, axis tf.Output) (output tf.Output) {
+// Returns A vector (batch) containing log-probabilities.The gradient of `loss`.  3-D, shape:
+// `(max_time x batch_size x num_classes)`.
+func CTCLoss(scope *Scope, inputs tf.Output, labels_indices tf.Output, labels_values tf.Output, sequence_length tf.Output, optional ...CTCLossAttr) (loss tf.Output, gradient tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "GatherV2",
-		Input: []tf.Input{
-			params, indices, axis,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the complementary error function of `x` element-wise.
-func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Erfc",
+		Type: "CTCLoss",
 		Input: []tf.Input{
-			x,
+			inputs, labels_indices, labels_values, sequence_length,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes sin of x element-wise.
-func Sin(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Sin",
-		Input: []tf.Input{
-			x,
-		},
+// CTCGreedyDecoderAttr is an optional argument to CTCGreedyDecoder.
+type CTCGreedyDecoderAttr func(optionalAttr)
+
+// CTCGreedyDecoderMergeRepeated sets the optional merge_repeated attribute to value.
+//
+// value: If True, merge repeated classes in output.
+// If not specified, defaults to false
+func CTCGreedyDecoderMergeRepeated(value bool) CTCGreedyDecoderAttr {
+	return func(m optionalAttr) {
+		m["merge_repeated"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the determinant of one or more square matrices.
+// Performs greedy decoding on the logits given in inputs.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor containing the determinants
-// for all input submatrices `[..., :, :]`.
+// A note about the attribute merge_repeated: if enabled, when
+// consecutive logits' maximum indices are the same, only the first of
+// these is emitted.  Labeling the blank '*', the sequence "A B B * B B"
+// becomes "A B B" if merge_repeated = True and "A B B B B" if
+// merge_repeated = False.
+//
+// Regardless of the value of merge_repeated, if the maximum index of a given
+// time and batch corresponds to the blank, index `(num_classes - 1)`, no new
+// element is emitted.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
+//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+//	sequence_length: A vector containing sequence lengths, size `(batch_size)`.
 //
-// Returns Shape is `[...]`.
-func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns Indices matrix, size `(total_decoded_outputs x 2)`,
+// of a `SparseTensor<int64, 2>`.  The rows store: [batch, time].Values vector, size: `(total_decoded_outputs)`,
+// of a `SparseTensor<int64, 2>`.  The vector stores the decoded classes.Shape vector, size `(2)`, of the decoded SparseTensor.
+// Values are: `[batch_size, max_decoded_length]`.Matrix, size `(batch_size x 1)`, containing sequence
+// log-probabilities.
+func CTCGreedyDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, optional ...CTCGreedyDecoderAttr) (decoded_indices tf.Output, decoded_values tf.Output, decoded_shape tf.Output, log_probability tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "MatrixDeterminant",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes cos of x element-wise.
-func Cos(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Cos",
+		Type: "CTCGreedyDecoder",
 		Input: []tf.Input{
-			x,
+			inputs, sequence_length,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// BatchToSpace for 4-D tensors of type T.
-//
-// This is a legacy version of the more general BatchToSpaceND.
-//
-// Rearranges (permutes) data from batch into blocks of spatial data, followed by
-// cropping. This is the reverse transformation of SpaceToBatch. More specifically,
-// this op outputs a copy of the input tensor where values from the `batch`
-// dimension are moved in spatial blocks to the `height` and `width` dimensions,
-// followed by cropping along the `height` and `width` dimensions.
-//
-// Arguments:
-//	input: 4-D tensor with shape
-// `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
-//   depth]`. Note that the batch size of the input tensor must be divisible by
-// `block_size * block_size`.
-//	crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
-// how many elements to crop from the intermediate result across the spatial
-// dimensions as follows:
-//
-//     crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
-//
-//
-// Returns 4-D with shape `[batch, height, width, depth]`, where:
-//
-//       height = height_pad - crop_top - crop_bottom
-//       width = width_pad - crop_left - crop_right
-//
-// The attr `block_size` must be greater than one. It indicates the block size.
-//
-// Some examples:
-//
-// (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
-//
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 3]` and value:
-//
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
-//
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-//
-// The output tensor has shape `[1, 4, 4, 1]` and value:
-//
-// ```
-// x = [[[1],   [2],  [3],  [4]],
-//      [[5],   [6],  [7],  [8]],
-//      [[9],  [10], [11],  [12]],
-//      [[13], [14], [15],  [16]]]
-// ```
+// Forwards `data` to the output port determined by `pred`.
 //
-// (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
+// If `pred` is true, the `data` input is forwarded to `output_true`. Otherwise,
+// the data goes to `output_false`.
 //
-// ```
-// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
-//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
-// ```
+// See also `RefSwitch` and `Merge`.
 //
-// The output tensor has shape `[2, 2, 4, 1]` and value:
+// Arguments:
+//	data: The tensor to be forwarded to the appropriate output.
+//	pred: A scalar that specifies which output port will receive data.
 //
-// ```
-// x = [[[[1], [3]], [[5], [7]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int64) (output tf.Output) {
+// Returns If `pred` is false, data will be forwarded to this output.If `pred` is true, data will be forwarded to this output.
+func Switch(scope *Scope, data tf.Output, pred tf.Output) (output_false tf.Output, output_true tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"block_size": block_size}
 	opspec := tf.OpSpec{
-		Type: "BatchToSpace",
+		Type: "Switch",
 		Input: []tf.Input{
-			input, crops,
+			data, pred,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Add all input tensors element wise.
+//
+// Arguments:
+//	inputs: Must all be the same size and shape.
+func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AddN",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseToDenseAttr is an optional argument to SparseToDense.
-type SparseToDenseAttr func(optionalAttr)
+// EnterAttr is an optional argument to Enter.
+type EnterAttr func(optionalAttr)
 
-// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
+// EnterIsConstant sets the optional is_constant attribute to value.
 //
-// value: If true, indices are checked to make sure they are sorted in
-// lexicographic order and that there are no repeats.
-// If not specified, defaults to true
-func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
+// value: If true, the output is constant within the child frame.
+// If not specified, defaults to false
+func EnterIsConstant(value bool) EnterAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["is_constant"] = value
 	}
 }
 
-// Converts a sparse representation into a dense tensor.
-//
-// Builds an array `dense` with shape `output_shape` such that
-//
-// ```
-// # If sparse_indices is scalar
-// dense[i] = (i == sparse_indices ? sparse_values : default_value)
-//
-// # If sparse_indices is a vector, then for each i
-// dense[sparse_indices[i]] = sparse_values[i]
-//
-// # If sparse_indices is an n by d matrix, then for each i in [0, n)
-// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
-// ```
+// EnterParallelIterations sets the optional parallel_iterations attribute to value.
 //
-// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
-// scalar, all sparse indices are set to this single value.
+// value: The number of iterations allowed to run in parallel.
+// If not specified, defaults to 10
+func EnterParallelIterations(value int64) EnterAttr {
+	return func(m optionalAttr) {
+		m["parallel_iterations"] = value
+	}
+}
+
+// Creates or finds a child frame, and makes `data` available to the child frame.
 //
-// Indices should be sorted in lexicographic order, and indices must not
-// contain any repeats. If `validate_indices` is true, these properties
-// are checked during execution.
+// This op is used together with `Exit` to create loops in the graph.
+// The unique `frame_name` is used by the `Executor` to identify frames. If
+// `is_constant` is true, `output` is a constant in the child frame; otherwise
+// it may be changed in the child frame. At most `parallel_iterations` iterations
+// are run in parallel in the child frame.
 //
 // Arguments:
-//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
-// index where `sparse_values[i]` will be placed.
-//	output_shape: 1-D.  Shape of the dense output tensor.
-//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
-// or a scalar value to be used for all sparse indices.
-//	default_value: Scalar value to set for indices not specified in
-// `sparse_indices`.
+//	data: The tensor to be made available to the child frame.
+//	frame_name: The name of the child frame.
 //
-// Returns Dense output tensor of shape `output_shape`.
-func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
+// Returns The same tensor as `data`.
+func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"frame_name": frame_name}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseToDense",
+		Type: "Enter",
 		Input: []tf.Input{
-			sparse_indices, output_shape, sparse_values, default_value,
+			data,
 		},
 		Attrs: attrs,
 	}
@@ -24612,136 +24922,271 @@ func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Outpu
 	return op.Output(0)
 }
 
-// NthElementAttr is an optional argument to NthElement.
-type NthElementAttr func(optionalAttr)
-
-// NthElementReverse sets the optional reverse attribute to value.
+// Produce a string tensor that encodes the state of a Reader.
 //
-// value: When set to True, find the nth-largest value in the vector and vice
-// versa.
-// If not specified, defaults to false
-func NthElementReverse(value bool) NthElementAttr {
-	return func(m optionalAttr) {
-		m["reverse"] = value
+// Not all Readers support being serialized, so this can produce an
+// Unimplemented error.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderSerializeStateV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Finds values of the `n`-th order statistic for the last dimension.
-//
-// If the input is a vector (rank-1), finds the entries which is the nth-smallest
-// value in the vector and outputs their values as scalar tensor.
-//
-// For matrices (resp. higher rank input), computes the entries which is the
-// nth-smallest value in each row (resp. vector along the last dimension). Thus,
+// Exits the current frame to its parent frame.
 //
-//     values.shape = input.shape[:-1]
+// Exit makes its input `data` available to the parent frame.
 //
 // Arguments:
-//	input: 1-D or higher with last dimension at least `n+1`.
-//	n: 0-D. Position of sorted vector to select along the last dimension (along
-// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
+//	data: The tensor to be made available to the parent frame.
 //
-// Returns The `n`-th order statistic along each last dimensional slice.
-func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
+// Returns The same tensor as `data`.
+func Exit(scope *Scope, data tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "NthElement",
+		Type: "Exit",
 		Input: []tf.Input{
-			input, n,
+			data,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes asin of x element-wise.
-func Asin(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns a copy of the input tensor.
+func Snapshot(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Asin",
+		Type: "Snapshot",
 		Input: []tf.Input{
-			x,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
+// Scatter `updates` into a new (initially zero) tensor according to `indices`.
 //
-// output range specified with 'requested_output_min' and 'requested_output_max'.
+// Creates a new tensor by applying sparse `updates` to individual
+// values or slices within a zero tensor of the given `shape` according to
+// indices.  This operator is the inverse of the @{tf.gather_nd} operator which
+// extracts values or slices from a given tensor.
 //
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+// **WARNING**: The order in which updates are applied is nondeterministic, so the
+// output will be nondeterministic if `indices` contains duplicates.
 //
-// Arguments:
+// `indices` is an integer tensor containing indices into a new tensor of shape
+// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	requested_output_min: The float value that the minimum quantized output value represents.
-//	requested_output_max: The float value that the maximum quantized output value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//     indices.shape[-1] <= shape.rank
+//
+// The last dimension of `indices` corresponds to indices into elements
+// (if `indices.shape[-1] = shape.rank`) or slices
+// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+// `shape`.  `updates` is a tensor with shape
+//
+//     indices.shape[:-1] + shape[indices.shape[-1]:]
+//
+// The simplest form of scatter is to insert individual elements in a tensor by
+// index. For example, say we want to insert 4 scattered elements in a rank-1
+// tensor with 8 elements.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd1.png" alt>
+// </div>
+//
+// In Python, this scatter operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     shape = tf.constant([8])
+//     scatter = tf.scatter_nd(indices, updates, shape)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [0, 11, 0, 10, 9, 0, 0, 12]
+//
+// We can also, insert entire slices of a higher rank tensor all at once. For
+// example, if we wanted to insert two slices in the first dimension of a
+// rank-3 tensor with two matrices of new values.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd2.png" alt>
+// </div>
+//
+// In Python, this scatter operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[0], [2]])
+//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]],
+//                            [[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
+//     shape = tf.constant([4, 4, 4])
+//     scatter = tf.scatter_nd(indices, updates, shape)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+//      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
+//      [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+//      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
+//
+// Arguments:
+//	indices: Index tensor.
+//	updates: Updates to scatter into output.
+//	shape: 1-D. The shape of the resulting tensor.
 //
-// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
-func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns A new tensor with the given shape and updates applied according
+// to the indices.
+func ScatterNd(scope *Scope, indices tf.Output, updates tf.Output, shape tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "Requantize",
+		Type: "ScatterNd",
 		Input: []tf.Input{
-			input, input_min, input_max, requested_output_min, requested_output_max,
+			indices, updates, shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ArgMinAttr is an optional argument to ArgMin.
-type ArgMinAttr func(optionalAttr)
+// SpaceToDepthAttr is an optional argument to SpaceToDepth.
+type SpaceToDepthAttr func(optionalAttr)
 
-// ArgMinOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMinOutputType(value tf.DataType) ArgMinAttr {
+// SpaceToDepthDataFormat sets the optional data_format attribute to value.
+// If not specified, defaults to "NHWC"
+func SpaceToDepthDataFormat(value string) SpaceToDepthAttr {
 	return func(m optionalAttr) {
-		m["output_type"] = value
+		m["data_format"] = value
 	}
 }
 
-// Returns the index with the smallest value across dimensions of a tensor.
+// SpaceToDepth for tensors of type T.
 //
-// Note that in case of ties the identity of the return value is not guaranteed.
+// Rearranges blocks of spatial data, into depth. More specifically,
+// this op outputs a copy of the input tensor where values from the `height`
+// and `width` dimensions are moved to the `depth` dimension.
+// The attr `block_size` indicates the input block size.
+//
+//   * Non-overlapping blocks of size `block_size x block size` are rearranged
+//     into depth at each location.
+//   * The depth of the output tensor is `block_size * block_size * input_depth`.
+//   * The Y, X coordinates within each block of the input become the high order
+//     component of the output channel index.
+//   * The input tensor's height and width must be divisible by block_size.
+//
+// The `data_format` attr specifies the layout of the input and output tensors
+// with the following options:
+//   "NHWC": `[ batch, height, width, channels ]`
+//   "NCHW": `[ batch, channels, height, width ]`
+//   "NCHW_VECT_C":
+//       `qint8 [ batch, channels / 4, height, width, 4 ]`
+//
+// It is useful to consider the operation as transforming a 6-D Tensor.
+// e.g. for data_format = NHWC,
+//      Each element in the input tensor can be specified via 6 coordinates,
+//      ordered by decreasing memory layout significance as:
+//      n,oY,bY,oX,bX,iC  (where n=batch index, oX, oY means X or Y coordinates
+//                         within the output image, bX, bY means coordinates
+//                         within the input block, iC means input channels).
+//      The output would be a transpose to the following layout:
+//      n,oY,oX,bY,bX,iC
+//
+// This operation is useful for resizing the activations between convolutions
+// (but keeping all data), e.g. instead of pooling. It is also useful for training
+// purely convolutional models.
+//
+// For example, given an input of shape `[1, 2, 2, 1]`, data_format = "NHWC" and
+// block_size = 2:
+//
+// ```
+// x = [[[[1], [2]],
+//       [[3], [4]]]]
+// ```
+//
+// This operation will output a tensor of shape `[1, 1, 1, 4]`:
+//
+// ```
+// [[[[1, 2, 3, 4]]]]
+// ```
+//
+// Here, the input has a batch of 1 and each batch element has shape `[2, 2, 1]`,
+// the corresponding output will have a single element (i.e. width and height are
+// both 1) and will have a depth of 4 channels (1 * block_size * block_size).
+// The output element shape is `[1, 1, 4]`.
+//
+// For an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g.
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// This operation, for block_size of 2, will return the following tensor of shape
+// `[1, 1, 1, 12]`
+//
+// ```
+// [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
+// ```
+//
+// Similarly, for the following input of shape `[1 4 4 1]`, and a block size of 2:
+//
+// ```
+// x = [[[[1],   [2],  [5],  [6]],
+//       [[3],   [4],  [7],  [8]],
+//       [[9],  [10], [13],  [14]],
+//       [[11], [12], [15],  [16]]]]
+// ```
+//
+// the operator will return the following tensor of shape `[1 2 2 4]`:
+//
+// ```
+// x = [[[[1, 2, 3, 4],
+//        [5, 6, 7, 8]],
+//       [[9, 10, 11, 12],
+//        [13, 14, 15, 16]]]]
+// ```
 //
 // Arguments:
 //
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
+//	block_size: The size of the spatial block.
+func SpaceToDepth(scope *Scope, input tf.Output, block_size int64, optional ...SpaceToDepthAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"block_size": block_size}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ArgMin",
+		Type: "SpaceToDepth",
 		Input: []tf.Input{
-			input, dimension,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -24749,36 +25194,36 @@ func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgM
 	return op.Output(0)
 }
 
-// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
-type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
+// AbortAttr is an optional argument to Abort.
+type AbortAttr func(optionalAttr)
 
-// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+// AbortErrorMsg sets the optional error_msg attribute to value.
 //
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
+// value: A string which is the message associated with the exception.
+// If not specified, defaults to ""
+func AbortErrorMsg(value string) AbortAttr {
+	return func(m optionalAttr) {
+		m["error_msg"] = value
+	}
+}
+
+// AbortExitWithoutError sets the optional exit_without_error attribute to value.
 // If not specified, defaults to false
-func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
+func AbortExitWithoutError(value bool) AbortAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["exit_without_error"] = value
 	}
 }
 
-// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
+// Raise a exception to abort the process when called.
 //
-// That is for rows we have grad for, we update var as follows:
-// prox_v = var - alpha * grad
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+// If exit_without_error is true, the process will exit normally,
+// otherwise it will exit with a SIGABORT signal.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+// Returns nothing but an exception.
 //
 // Returns the created operation.
-func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
+func Abort(scope *Scope, optional ...AbortAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -24787,208 +25232,238 @@ func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, al
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalGradientDescent",
-		Input: []tf.Input{
-			var_, alpha, l1, l2, grad, indices,
-		},
+		Type: "Abort",
+
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2.
-type InitializeTableFromTextFileV2Attr func(optionalAttr)
+// UniformCandidateSamplerAttr is an optional argument to UniformCandidateSampler.
+type UniformCandidateSamplerAttr func(optionalAttr)
 
-// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value.
-//
-// value: Number of elements of the file, use -1 if unknown.
-// If not specified, defaults to -1
+// UniformCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// REQUIRES: value >= -1
-func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func UniformCandidateSamplerSeed(value int64) UniformCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["vocab_size"] = value
+		m["seed"] = value
 	}
 }
 
-// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value.
+// UniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// value: Delimiter to separate fields in a line.
-// If not specified, defaults to "\t"
-func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr {
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func UniformCandidateSamplerSeed2(value int64) UniformCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["delimiter"] = value
+		m["seed2"] = value
 	}
 }
 
-// Initializes a table from a text file.
+// Generates labels for candidate sampling with a uniform distribution.
 //
-// It inserts one key-value pair into the table for each line of the file.
-// The key and value is extracted from the whole line content, elements from the
-// split line based on `delimiter` or the line number (starting from zero).
-// Where to extract the key and value from a line is specified by `key_index` and
-// `value_index`.
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
 //
-// - A value of -1 means use the line number(starting from zero), expects `int64`.
-// - A value of -2 means use the whole line content, expects `string`.
-// - A value >= 0 means use the index (starting at zero) of the split line based
-//   on `delimiter`.
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	table_handle: Handle to a table which will be initialized.
-//	filename: Filename of a vocabulary text file.
-//	key_index: Column index in a line to get the table `key` values from.
-//	value_index: Column index that represents information of a line to get the table
-// `value` values from.
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
 //
-// Returns the created operation.
-func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "InitializeTableFromTextFileV2",
-		Input: []tf.Input{
-			table_handle, filename,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes atan of x element-wise.
-func Atan(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func UniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...UniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Atan",
+		Type: "UniformCandidateSampler",
 		Input: []tf.Input{
-			x,
+			true_classes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MfccAttr is an optional argument to Mfcc.
-type MfccAttr func(optionalAttr)
+// FixedUnigramCandidateSamplerAttr is an optional argument to FixedUnigramCandidateSampler.
+type FixedUnigramCandidateSamplerAttr func(optionalAttr)
 
-// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
+// FixedUnigramCandidateSamplerVocabFile sets the optional vocab_file attribute to value.
 //
-// value: The highest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 4000
-func MfccUpperFrequencyLimit(value float32) MfccAttr {
+// value: Each valid line in this file (which should have a CSV-like format)
+// corresponds to a valid word ID. IDs are in sequential order, starting from
+// num_reserved_ids. The last entry in each line is expected to be a value
+// corresponding to the count or relative probability. Exactly one of vocab_file
+// and unigrams needs to be passed to this op.
+// If not specified, defaults to ""
+func FixedUnigramCandidateSamplerVocabFile(value string) FixedUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["upper_frequency_limit"] = value
+		m["vocab_file"] = value
 	}
 }
 
-// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
+// FixedUnigramCandidateSamplerDistortion sets the optional distortion attribute to value.
 //
-// value: The lowest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 20
-func MfccLowerFrequencyLimit(value float32) MfccAttr {
+// value: The distortion is used to skew the unigram probability distribution.
+// Each weight is first raised to the distortion's power before adding to the
+// internal unigram distribution. As a result, distortion = 1.0 gives regular
+// unigram sampling (as defined by the vocab file), and distortion = 0.0 gives
+// a uniform distribution.
+// If not specified, defaults to 1
+func FixedUnigramCandidateSamplerDistortion(value float32) FixedUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["lower_frequency_limit"] = value
+		m["distortion"] = value
 	}
 }
 
-// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
+// FixedUnigramCandidateSamplerNumReservedIds sets the optional num_reserved_ids attribute to value.
 //
-// value: Resolution of the Mel bank used internally.
-// If not specified, defaults to 40
-func MfccFilterbankChannelCount(value int64) MfccAttr {
+// value: Optionally some reserved IDs can be added in the range [0,
+// ..., num_reserved_ids) by the users. One use case is that a special unknown
+// word token is used as ID 0. These IDs will have a sampling probability of 0.
+// If not specified, defaults to 0
+func FixedUnigramCandidateSamplerNumReservedIds(value int64) FixedUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["filterbank_channel_count"] = value
+		m["num_reserved_ids"] = value
 	}
 }
 
-// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
+// FixedUnigramCandidateSamplerNumShards sets the optional num_shards attribute to value.
 //
-// value: How many output channels to produce per time slice.
-// If not specified, defaults to 13
-func MfccDctCoefficientCount(value int64) MfccAttr {
+// value: A sampler can be used to sample from a subset of the original range
+// in order to speed up the whole computation through parallelism. This parameter
+// (together with 'shard') indicates the number of partitions that are being
+// used in the overall computation.
+// If not specified, defaults to 1
+//
+// REQUIRES: value >= 1
+func FixedUnigramCandidateSamplerNumShards(value int64) FixedUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["dct_coefficient_count"] = value
+		m["num_shards"] = value
 	}
 }
 
-// Transforms a spectrogram into a form that's useful for speech recognition.
+// FixedUnigramCandidateSamplerShard sets the optional shard attribute to value.
 //
-// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
-// been effective as an input feature for machine learning. They are created by
-// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
-// higher frequencies that are less significant to the human ear. They have a long
-// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
-// is a good resource to learn more.
+// value: A sampler can be used to sample from a subset of the original range
+// in order to speed up the whole computation through parallelism. This parameter
+// (together with 'num_shards') indicates the particular partition number of a
+// sampler op, when partitioning is being used.
+// If not specified, defaults to 0
 //
-// Arguments:
-//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
-// set to true.
-//	sample_rate: How many samples per second the source audio used.
-func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+// REQUIRES: value >= 0
+func FixedUnigramCandidateSamplerShard(value int64) FixedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["shard"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Mfcc",
-		Input: []tf.Input{
-			spectrogram, sample_rate,
-		},
-		Attrs: attrs,
+}
+
+// FixedUnigramCandidateSamplerUnigrams sets the optional unigrams attribute to value.
+//
+// value: A list of unigram counts or probabilities, one per ID in sequential
+// order. Exactly one of vocab_file and unigrams should be passed to this op.
+// If not specified, defaults to <>
+func FixedUnigramCandidateSamplerUnigrams(value []float32) FixedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["unigrams"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// QuantizedAddAttr is an optional argument to QuantizedAdd.
-type QuantizedAddAttr func(optionalAttr)
+// FixedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func FixedUnigramCandidateSamplerSeed(value int64) FixedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
 
-// QuantizedAddToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedAddToutput(value tf.DataType) QuantizedAddAttr {
+// FixedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func FixedUnigramCandidateSamplerSeed2(value int64) FixedUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["Toutput"] = value
+		m["seed2"] = value
 	}
 }
 
-// Returns x + y element-wise, working on quantized buffers.
+// Generates labels for candidate sampling with a learned unigram distribution.
 //
-// Arguments:
+// A unigram sampler could use a fixed unigram distribution read from a
+// file or passed in as an in-memory array instead of building up the distribution
+// from data on the fly. There is also an option to skew the distribution by
+// applying a distortion power to the weights.
 //
+// The vocabulary file should be in CSV-like format, with the last field
+// being the weight associated with the word.
 //
-//	min_x: The float value that the lowest quantized `x` value represents.
-//	max_x: The float value that the highest quantized `x` value represents.
-//	min_y: The float value that the lowest quantized `y` value represents.
-//	max_y: The float value that the highest quantized `y` value represents.
+// For each batch, this op picks a single set of sampled candidate labels.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
-// *NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about
-// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedAddAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func FixedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...FixedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedAdd",
+		Type: "FixedUnigramCandidateSampler",
 		Input: []tf.Input{
-			x, y, min_x, max_x, min_y, max_y,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
@@ -24996,106 +25471,55 @@ func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns an element-wise indication of the sign of a number.
-//
-// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
+// Elementwise computes the bitwise AND of `x` and `y`.
 //
-// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
-func Sign(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Sign",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns element-wise smallest integer in not less than x.
-func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Ceil",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes exponential of x element-wise.  \\(y = e^x\\).
-func Exp(scope *Scope, x tf.Output) (y tf.Output) {
+// The result will have those bits set, that are set in both `x` and `y`. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Exp",
+		Type: "BitwiseAnd",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the Max along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// This operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the maximum
-// such that:
-//
-// \\(output_i = \max_j data_j\\) where max is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,
-//  `output[i] = numeric_limits<T>::min()`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.
-//
+// Elementwise computes the bitwise left-shift of `x` and `y`.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `num_segments`.
-func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// If `y` is negative, or greater than or equal to the width of `x` in bits the
+// result is implementation defined.
+func LeftShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentMax",
+		Type: "LeftShift",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x + y element-wise.
+// Elementwise computes the bitwise right-shift of `x` and `y`.
 //
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Performs a logical shift for unsigned integer types, and an arithmetic shift
+// for signed integer types.
+//
+// If `y` is negative, or greater than or equal to than the width of `x` in bits
+// the result is implementation defined.
+func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Add",
+		Type: "RightShift",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -25104,81 +25528,191 @@ func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// Returns x + y element-wise.
+// Adjust the hue of one or more images.
 //
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// `images` is a tensor of at least 3 dimensions.  The last dimension is
+// interpretted as channels, and must be three.
+//
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A delta is then applied all the hue values,
+// and then remapped back to RGB colorspace.
+//
+// Arguments:
+//	images: Images to adjust.  At least 3-D.
+//	delta: A float delta to add to the hue.
+//
+// Returns The hue-adjusted image or images.
+func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AddV2",
+		Type: "AdjustHue",
 		Input: []tf.Input{
-			x, y,
+			images, delta,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Saves the input tensors to disk.
-//
-// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
-// is written to `filename` with name `tensor_names[i]`.
+// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
+type AvgPool3DGradAttr func(optionalAttr)
+
+// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
 //
-// See also `SaveSlices`.
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of average pooling function.
 //
 // Arguments:
-//	filename: Must have a single element. The name of the file to which we write
-// the tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	data: `N` tensors to save.
+//	orig_input_shape: The original input dimensions.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
+// Returns The backprop for input.
+func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Save",
+		Type: "AvgPool3DGrad",
 		Input: []tf.Input{
-			filename, tensor_names, tf.OutputList(data),
+			orig_input_shape, grad,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// BiasAddAttr is an optional argument to BiasAdd.
-type BiasAddAttr func(optionalAttr)
+// ParseSingleSequenceExampleAttr is an optional argument to ParseSingleSequenceExample.
+type ParseSingleSequenceExampleAttr func(optionalAttr)
 
-// BiasAddDataFormat sets the optional data_format attribute to value.
+// ParseSingleSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the bias tensor will be added to the last dimension
-// of the value tensor.
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// The tensor will be added to "in_channels", the third-to-the-last
-//     dimension.
-// If not specified, defaults to "NHWC"
-func BiasAddDataFormat(value string) BiasAddAttr {
+// value: A list of Ncontext_sparse types; the data types of data in
+// each context Feature given in context_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleContextSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["context_sparse_types"] = value
 	}
 }
 
-// Adds `bias` to `value`.
+// ParseSingleSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
+// If not specified, defaults to <>
 //
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_dense_types"] = value
+	}
+}
+
+// ParseSingleSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
 //
-// Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
+// value: A list of Ncontext_dense shapes; the shapes of data in
+// each context Feature given in context_dense_keys.
+// The number of elements in the Feature corresponding to context_dense_key[j]
+// must always equal context_dense_shapes[j].NumEntries().
+// The shape of context_dense_values[j] will match context_dense_shapes[j].
+// If not specified, defaults to <>
 //
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleContextDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["context_dense_shapes"] = value
+	}
+}
+
+// ParseSingleSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
+//
+// value: A list of Nfeature_list_sparse types; the data types
+// of data in each FeatureList given in feature_list_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_sparse_types"] = value
+	}
+}
+
+// ParseSingleSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
+//
+// value: A list of Nfeature_list_dense shapes; the shapes of
+// data in each FeatureList given in feature_list_dense_keys.
+// The shape of each Feature in the FeatureList corresponding to
+// feature_list_dense_key[j] must always equal
+// feature_list_dense_shapes[j].NumEntries().
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_dense_shapes"] = value
+	}
+}
+
+// Transforms a scalar brain.SequenceExample proto (as strings) into typed tensors.
+//
+// Arguments:
+//	serialized: A scalar containing a binary serialized SequenceExample proto.
+//	feature_list_dense_missing_assumed_empty: A vector listing the
+// FeatureList keys which may be missing from the SequenceExample.  If the
+// associated FeatureList is missing, it is treated as empty.  By default,
+// any FeatureList not listed in this vector must exist in the SequenceExample.
+//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with context_sparse
+// values.
+//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' context features associated with
+// dense values.
+//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
+// (scalars).  The keys expected in the FeatureLists associated with sparse
+// values.
+//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' feature_lists associated
+// with lists of dense values.
+//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
+// context_dense_defaults[j] provides default values
+// when the SequenceExample's context map lacks context_dense_key[j].
+// If an empty Tensor is provided for context_dense_defaults[j],
+// then the Feature context_dense_keys[j] is required.
+// The input type is inferred from context_dense_defaults[j], even when it's
+// empty.  If context_dense_defaults[j] is not empty, its shape must match
+// context_dense_shapes[j].
+//	debug_name: A scalar containing the name of the serialized proto.
+// May contain, for example, table key (descriptive) name for the
+// corresponding serialized proto.  This is purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty scalar if no name is available.
+func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list_dense_missing_assumed_empty tf.Output, context_sparse_keys []tf.Output, context_dense_keys []tf.Output, feature_list_sparse_keys []tf.Output, feature_list_dense_keys []tf.Output, context_dense_defaults []tf.Output, debug_name tf.Output, optional ...ParseSingleSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -25187,51 +25721,98 @@ func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddA
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BiasAdd",
+		Type: "ParseSingleSequenceExample",
 		Input: []tf.Input{
-			value, bias,
+			serialized, feature_list_dense_missing_assumed_empty, tf.OutputList(context_sparse_keys), tf.OutputList(context_dense_keys), tf.OutputList(feature_list_sparse_keys), tf.OutputList(feature_list_dense_keys), tf.OutputList(context_dense_defaults), debug_name,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values
 }
 
-// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
-type SparseReduceSumSparseAttr func(optionalAttr)
+// DecodeWavAttr is an optional argument to DecodeWav.
+type DecodeWavAttr func(optionalAttr)
 
-// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
+// DecodeWavDesiredChannels sets the optional desired_channels attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
+// value: Number of sample channels wanted.
+// If not specified, defaults to -1
+func DecodeWavDesiredChannels(value int64) DecodeWavAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["desired_channels"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a SparseTensor.
+// DecodeWavDesiredSamples sets the optional desired_samples attribute to value.
 //
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
-// SparseTensor.
+// value: Length of audio requested.
+// If not specified, defaults to -1
+func DecodeWavDesiredSamples(value int64) DecodeWavAttr {
+	return func(m optionalAttr) {
+		m["desired_samples"] = value
+	}
+}
+
+// Decode a 16-bit PCM WAV file to a float tensor.
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// When desired_channels is set, if the input contains fewer channels than this
+// then the last channel will be duplicated to give the requested number, else if
+// the input has more channels than requested then the additional channels will be
+// ignored.
+//
+// If desired_samples is set, then the audio will be cropped or padded with zeroes
+// to the requested length.
+//
+// The first output contains a Tensor with the content of the audio samples. The
+// lowest dimension will be the number of channels, and the second will be the
+// number of samples. For example, a ten-sample-long stereo WAV file should give an
+// output shape of [10, 2].
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+//	contents: The WAV-encoded audio, usually from a file.
+//
+// Returns 2-D with shape `[length, channels]`.Scalar holding the sample rate found in the WAV header.
+func DecodeWav(scope *Scope, contents tf.Output, optional ...DecodeWavAttr) (audio tf.Output, sample_rate tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -25240,65 +25821,50 @@ func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceSumSparse",
+		Type: "DecodeWav",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			contents,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Returns x * y element-wise.
-//
-// *NOTE*: `Mul` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Mul",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns x / y element-wise.
-//
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Div",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// ApproximateEqualAttr is an optional argument to ApproximateEqual.
-type ApproximateEqualAttr func(optionalAttr)
+// UniqueAttr is an optional argument to Unique.
+type UniqueAttr func(optionalAttr)
 
-// ApproximateEqualTolerance sets the optional tolerance attribute to value.
-// If not specified, defaults to 1e-05
-func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
+// UniqueOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueOutIdx(value tf.DataType) UniqueAttr {
 	return func(m optionalAttr) {
-		m["tolerance"] = value
+		m["out_idx"] = value
 	}
 }
 
-// Returns the truth value of abs(x-y) < tolerance element-wise.
-func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
+// Finds unique elements in a 1-D tensor.
+//
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. In other words:
+//
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+//
+// For example:
+//
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx = unique(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// ```
+//
+// Arguments:
+//	x: 1-D.
+//
+// Returns 1-D.1-D.
+func Unique(scope *Scope, x tf.Output, optional ...UniqueAttr) (y tf.Output, idx tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -25307,539 +25873,530 @@ func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...Approx
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ApproximateEqual",
+		Type: "Unique",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
-//
-// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Maximum",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
-type LogUniformCandidateSamplerAttr func(optionalAttr)
-
-// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
+	return op.Output(0), op.Output(1)
 }
 
-// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// Concatenates a list of `N` tensors along the first dimension.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Generates labels for candidate sampling with a log-uniform distribution.
+// The input tensors are all required to have size 1 in the first dimension.
 //
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// For example:
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// ```
+// # 'x' is [[1, 4]]
+// # 'y' is [[2, 5]]
+// # 'z' is [[3, 6]]
+// parallel_concat([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
+// ```
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// The difference between concat and parallel_concat is that concat requires all
+// of the inputs be computed before the operation will begin but doesn't require
+// that the input shapes be known during graph construction.  Parallel concat
+// will copy pieces of the input into the output as they become available, in
+// some situations this can provide a performance benefit.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//	values: Tensors to be concatenated. All must have size 1 in the first dimension
+// and same shape.
+//	shape: the final shape of the result; should be equal to the shapes of any input
+// but with the number of input values in the first dimension.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns The concatenated tensor.
+func ParallelConcat(scope *Scope, values []tf.Output, shape tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "LogUniformCandidateSampler",
+		Type: "ParallelConcat",
 		Input: []tf.Input{
-			true_classes,
+			tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Returns the truth value of (x < y) element-wise.
+// Concatenates tensors along one dimension.
 //
-// *NOTE*: `Less` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [0, rank(values)).
+//	values: The `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.
+func Concat(scope *Scope, concat_dim tf.Output, values []tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Less",
+		Type: "Concat",
 		Input: []tf.Input{
-			x, y,
+			concat_dim, tf.OutputList(values),
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FakeQuantWithMinMaxVarsGradientAttr is an optional argument to FakeQuantWithMinMaxVarsGradient.
-type FakeQuantWithMinMaxVarsGradientAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxVarsGradientNumBits sets the optional num_bits attribute to value.
+// Compute the lower regularized incomplete Gamma function `Q(a, x)`.
 //
-// value: The bitwidth of the quantization; between 2 and 8, inclusive.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxVarsGradientNumBits(value int64) FakeQuantWithMinMaxVarsGradientAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// FakeQuantWithMinMaxVarsGradientNarrowRange sets the optional narrow_range attribute to value.
+// The lower regularized incomplete Gamma function is defined as:
 //
-// value: Whether to quantize into 2^num_bits - 1 distinct values.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxVarsGradientNarrowRange(value bool) FakeQuantWithMinMaxVarsGradientAttr {
-	return func(m optionalAttr) {
-		m["narrow_range"] = value
-	}
-}
-
-// Compute gradients for a FakeQuantWithMinMaxVars operation.
 //
-// Arguments:
-//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation.
-//	inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation.
-// min, max: Quantization interval, scalar floats.
+// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
 //
+// where
 //
+// \\(gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\\)
 //
-// Returns Backpropagated gradients w.r.t. inputs:
-// `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter:
-// `sum(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter:
-// `sum(gradients * (inputs > max))`.
-func FakeQuantWithMinMaxVarsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsGradientAttr) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) {
+// is the lower incomplete Gamma function.
+//
+// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
+// Gamma function.
+func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVarsGradient",
+		Type: "Igamma",
 		Input: []tf.Input{
-			gradients, inputs, min, max,
+			a, x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2.
-type MaxPoolGradV2Attr func(optionalAttr)
-
-// MaxPoolGradV2DataFormat sets the optional data_format attribute to value.
+// Computes offsets of concat inputs within its output.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of the maxpooling function.
+// For example:
+//
+// ```
+// # 'x' is [2, 2, 7]
+// # 'y' is [2, 3, 7]
+// # 'z' is [2, 5, 7]
+// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
+// ```
+//
+// This is typically used by gradient computations for a concat operation.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	concat_dim: The dimension along which to concatenate.
+//	shape: The `N` int32 vectors representing shape of tensors being concatenated.
 //
-// Returns Gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) {
+// Returns The `N` int32 vectors representing the starting offset
+// of input tensors within the concatenated output.
+func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradV2",
+		Type: "ConcatOffset",
 		Input: []tf.Input{
-			orig_input, orig_output, grad, ksize, strides,
+			concat_dim, tf.OutputList(shape),
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
-//
-// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Minimum",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// BiasAddGradAttr is an optional argument to BiasAddGrad.
-type BiasAddGradAttr func(optionalAttr)
-
-// BiasAddGradDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the bias tensor will be added to the last dimension
-// of the value tensor.
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// The tensor will be added to "in_channels", the third-to-the-last
-//     dimension.
-// If not specified, defaults to "NHWC"
-func BiasAddGradDataFormat(value string) BiasAddGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+	var idx int
+	var err error
+	if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
+		scope.UpdateErr("ConcatOffset", err)
+		return
 	}
+	return offset
 }
 
-// The backward operation for "BiasAdd" on the "bias" tensor.
-//
-// It accumulates all the values from out_backprop into the feature dimension.
-// For NHWC data format, the feature dimension is the last. For NCHW data format,
-// the feature dimension is the third-to-last.
+// Splits a tensor into `num_split` tensors along one dimension.
 //
 // Arguments:
-//	out_backprop: Any number of dimensions.
+//	axis: 0-D.  The dimension along which to split.  Must be in the range
+// `[-rank(value), rank(value))`.
+//	value: The tensor to split.
+//	num_split: The number of ways to split.  Must evenly divide
+// `value.shape[split_dim]`.
 //
-// Returns 1-D with size the feature dimension of `out_backprop`.
-func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) {
+// Returns They are identically shaped tensors, whose shape matches that of `value`
+// except along `axis`, where their sizes are
+// `values.shape[split_dim] / num_split`.
+func Split(scope *Scope, axis tf.Output, value tf.Output, num_split int64) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"num_split": num_split}
 	opspec := tf.OpSpec{
-		Type: "BiasAddGrad",
+		Type: "Split",
 		Input: []tf.Input{
-			out_backprop,
+			axis, value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("Split", err)
+		return
+	}
+	return output
 }
 
-// Computes the power of one value to another.
+// Splits a tensor into `num_split` tensors along one dimension.
 //
-// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
-// corresponding elements in `x` and `y`. For example:
+// Arguments:
+//	value: The tensor to split.
+//	size_splits: list containing the sizes of each output tensor along the split
+// dimension. Must sum to the dimension of value along split_dim.
+// Can contain one -1 indicating that dimension is to be inferred.
+//	axis: 0-D.  The dimension along which to split.  Must be in the range
+// `[-rank(value), rank(value))`.
 //
-// ```
-// # tensor 'x' is [[2, 2]], [3, 3]]
-// # tensor 'y' is [[8, 16], [2, 3]]
-// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
-// ```
-func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+//
+// Returns Tensors whose shape matches that of `value`
+// except along `axis`, where their sizes are
+// `size_splits[i]`.
+func SplitV(scope *Scope, value tf.Output, size_splits tf.Output, axis tf.Output, num_split int64) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_split": num_split}
 	opspec := tf.OpSpec{
-		Type: "Pow",
+		Type: "SplitV",
 		Input: []tf.Input{
-			x, y,
+			value, size_splits, axis,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("SplitV", err)
+		return
+	}
+	return output
 }
 
-// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
-//
-// The upper regularized incomplete Gamma function is defined as:
-//
-// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
-//
-// where
+// Gives a guarantee to the TF runtime that the input tensor is a constant.
 //
-// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+// The runtime is then free to make optimizations based on this.
 //
-// is the upper incomplete Gama function.
+// Only accepts value typed tensors as inputs and rejects resource variable handles
+// as input.
 //
-// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
-// Gamma function.
-func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// Returns the input tensor without modification.
+func GuaranteeConst(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Igammac",
+		Type: "GuaranteeConst",
 		Input: []tf.Input{
-			a, x,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the lower regularized incomplete Gamma function `Q(a, x)`.
-//
-// The lower regularized incomplete Gamma function is defined as:
-//
-//
-// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
-//
-// where
-//
-// \\(gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\\)
+// Returns a tensor of zeros with the same shape and type as x.
 //
-// is the lower incomplete Gamma function.
+// Arguments:
+//	x: a tensor of type T.
 //
-// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
-// Gamma function.
-func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// Returns a tensor of the same shape and type as x but filled with zeros.
+func ZerosLike(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Igamma",
+		Type: "ZerosLike",
 		Input: []tf.Input{
-			a, x,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
+// Flips all bits elementwise.
 //
-// This is the angle \( \theta \in [-\pi, \pi] \) such that
-// \[ x = r \cos(\theta) \]
-// and
-// \[ y = r \sin(\theta) \]
-// where \(r = \sqrt(x^2 + y^2) \).
-func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
+// The result will have exactly those bits set, that are not set in `x`. The
+// computation is performed on the underlying representation of x.
+func Invert(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Atan2",
+		Type: "Invert",
 		Input: []tf.Input{
-			y, x,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
+// DequantizeAttr is an optional argument to Dequantize.
+type DequantizeAttr func(optionalAttr)
+
+// DequantizeMode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func DequantizeMode(value string) DequantizeAttr {
+	return func(m optionalAttr) {
+		m["mode"] = value
+	}
+}
+
+// Dequantize the 'input' tensor into a float Tensor.
 //
-// The regularized incomplete beta integral is defined as:
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.
 //
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 //
-// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
+// ```
+// if T == qint8, in[i] += (range(T) + 1)/ 2.0
+// out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
+// ```
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
 //
-// where
+// *MIN_COMBINED Mode Example*
 //
+// If the input comes from a QuantizedRelu6, the output type is
+// quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
+// 0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
+// Dequantize on quint8 will take each value, cast to float, and multiply
+// by 6 / 255.
+// Note that if quantizedtype is qint8, the operation will additionally add
+// each value by 128 prior to casting.
 //
-// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
+// If the mode is 'MIN_FIRST', then this approach is used:
 //
+// ```c++
+// num_discrete_values = 1 << (# of bits in T)
+// range_adjust = num_discrete_values / (num_discrete_values - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = range / num_discrete_values
+// const double offset_input = static_cast<double>(input) - lowest_quantized;
+// result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
+// ```
 //
-// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
-// beta function.
-func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
+// *SCALED mode Example*
+//
+// `SCALED` mode matches the quantization approach used in
+// `QuantizeAndDequantize{V2|V3}`.
+//
+// If the mode is `SCALED`, we do not use the full range of the output type,
+// choosing to elide the lowest possible value for symmetry (e.g., output range is
+// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+// 0.
+//
+// We first find the range of values in our tensor. The
+// range we use is always centered on 0, so we find m such that
+// ```c++
+//   m = max(abs(input_min), abs(input_max))
+// ```
+//
+// Our input tensor range is then `[-m, m]`.
+//
+// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+// If T is signed, this is
+// ```
+//   num_bits = sizeof(T) * 8
+//   [min_fixed, max_fixed] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
+// ```
+//
+// Otherwise, if T is unsigned, the fixed-point range is
+// ```
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
+// ```
+//
+// From this we compute our scaling factor, s:
+// ```c++
+//   s = (2 * m) / (max_fixed - min_fixed)
+// ```
+//
+// Now we can dequantize the elements of our tensor:
+// ```c++
+// result = input * s
+// ```
+//
+// Arguments:
+//
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, optional ...DequantizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Betainc",
+		Type: "Dequantize",
 		Input: []tf.Input{
-			a, b, x,
+			input, min_range, max_range,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of x OR y element-wise.
+// Returns the element-wise max of two SparseTensors.
 //
-// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+//
+// Arguments:
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+//
+// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
+func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogicalOr",
+		Type: "SparseSparseMaximum",
 		Input: []tf.Input{
-			x, y,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Selects elements from `t` or `e`, depending on `condition`.
-//
-// The `t`, and `e` tensors must all have the same shape, and the
-// output will also have that shape.
-//
-// The `condition` tensor must be a scalar if `t` and `e` are scalars.
-// If `t` and `e` are vectors or higher rank, then `condition` must be either a
-// scalar, a vector with size matching the first dimension of `t`, or must have
-// the same shape as `t`.
-//
-// The `condition` tensor acts as a mask that chooses, based on the value at each
-// element, whether the corresponding element / row in the output should be
-// taken from `t` (if true) or `e` (if false).
-//
-// If `condition` is a vector and `t` and `e` are higher rank matrices, then
-// it chooses which row (outer dimension) to copy from `t` and `e`.
-// If `condition` has the same shape as `t` and `e`, then it chooses which
-// element to copy from `t` and `e`.
-//
-// For example:
+// Returns a batched matrix tensor with new batched diagonal values.
 //
-// ```python
-// # 'condition' tensor is [[True,  False]
-// #                        [False, True]]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e)  # => [[1, 6], [7, 4]]
+// Given `input` and `diagonal`, this operation returns a tensor with the
+// same shape and values as `input`, except for the main diagonal of the
+// innermost matrices.  These will be overwritten by the values in `diagonal`.
 //
+// The output is computed as follows:
 //
-// # 'condition' tensor is [True, False]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e) ==> [[1, 2],
-//                              [7, 8]]
+// Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has
+// `k` dimensions `[I, J, K, ..., min(M, N)]`.  Then the output is a
+// tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where:
 //
-// ```
+//   * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`.
+//   * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`.
 //
 // Arguments:
+//	input: Rank `k+1`, where `k >= 1`.
+//	diagonal: Rank `k`, where `k >= 1`.
 //
-//	t: = A `Tensor` which may have the same shape as `condition`.
-// If `condition` is rank 1, `t` may have higher rank,
-// but its first dimension must match the size of `condition`.
-//	e: = A `Tensor` with the same type and shape as `t`.
-//
-// Returns = A `Tensor` with the same type and shape as `t` and `e`.
-func Select(scope *Scope, condition tf.Output, t tf.Output, e tf.Output) (output tf.Output) {
+// Returns Rank `k+1`, with `output.shape = input.shape`.
+func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Select",
+		Type: "MatrixSetDiag",
 		Input: []tf.Input{
-			condition, t, e,
+			input, diagonal,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatMulAttr is an optional argument to MatMul.
-type MatMulAttr func(optionalAttr)
+// EditDistanceAttr is an optional argument to EditDistance.
+type EditDistanceAttr func(optionalAttr)
 
-// MatMulTransposeA sets the optional transpose_a attribute to value.
+// EditDistanceNormalize sets the optional normalize attribute to value.
 //
-// value: If true, "a" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeA(value bool) MatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
-	}
-}
-
-// MatMulTransposeB sets the optional transpose_b attribute to value.
+// value: boolean (if true, edit distances are normalized by length of truth).
 //
-// value: If true, "b" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeB(value bool) MatMulAttr {
+// The output is:
+// If not specified, defaults to true
+func EditDistanceNormalize(value bool) EditDistanceAttr {
 	return func(m optionalAttr) {
-		m["transpose_b"] = value
+		m["normalize"] = value
 	}
 }
 
-// Multiply the matrix "a" by the matrix "b".
+// Computes the (possibly normalized) Levenshtein Edit Distance.
 //
-// The inputs must be two-dimensional matrices and the inner dimension of
-// "a" (after being transposed if transpose_a is true) must match the
-// outer dimension of "b" (after being transposed if transposed_b is
-// true).
+// The inputs are variable-length sequences provided by SparseTensors
+//   (hypothesis_indices, hypothesis_values, hypothesis_shape)
+// and
+//   (truth_indices, truth_values, truth_shape).
+//
+// The inputs are:
+//
+// Arguments:
+//	hypothesis_indices: The indices of the hypothesis list SparseTensor.
+// This is an N x R int64 matrix.
+//	hypothesis_values: The values of the hypothesis list SparseTensor.
+// This is an N-length vector.
+//	hypothesis_shape: The shape of the hypothesis list SparseTensor.
+// This is an R-length vector.
+//	truth_indices: The indices of the truth list SparseTensor.
+// This is an M x R int64 matrix.
+//	truth_values: The values of the truth list SparseTensor.
+// This is an M-length vector.
+//	truth_shape: truth indices, vector.
+//
+// Returns A dense float tensor with rank R - 1.
+//
+// For the example input:
+//
+//     // hypothesis represents a 2x1 matrix with variable-length values:
+//     //   (0,0) = ["a"]
+//     //   (1,0) = ["b"]
+//     hypothesis_indices = [[0, 0, 0],
+//                           [1, 0, 0]]
+//     hypothesis_values = ["a", "b"]
+//     hypothesis_shape = [2, 1, 1]
+//
+//     // truth represents a 2x2 matrix with variable-length values:
+//     //   (0,0) = []
+//     //   (0,1) = ["a"]
+//     //   (1,0) = ["b", "c"]
+//     //   (1,1) = ["a"]
+//     truth_indices = [[0, 1, 0],
+//                      [1, 0, 0],
+//                      [1, 0, 1],
+//                      [1, 1, 0]]
+//     truth_values = ["a", "b", "c", "a"]
+//     truth_shape = [2, 2, 2]
+//     normalize = true
 //
-// *Note*: The default kernel implementation for MatMul on GPUs uses
-// cublas.
-func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
+// The output will be:
+//
+//     // output is a 2x2 matrix with edit distances normalized by truth lengths.
+//     output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
+//               [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
+func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -25848,9 +26405,9 @@ func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (pro
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatMul",
+		Type: "EditDistance",
 		Input: []tf.Input{
-			a, b,
+			hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape,
 		},
 		Attrs: attrs,
 	}
@@ -25858,213 +26415,235 @@ func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (pro
 	return op.Output(0)
 }
 
-// MeanAttr is an optional argument to Mean.
-type MeanAttr func(optionalAttr)
-
-// MeanKeepDims sets the optional keep_dims attribute to value.
+// Gather slices from `params` into a Tensor with shape specified by `indices`.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MeanKeepDims(value bool) MeanAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the mean of elements across dimensions of a tensor.
+// `indices` is an K-dimensional integer tensor, best thought of as a
+// (K-1)-dimensional tensor of indices into `params`, where each element defines a
+// slice of `params`:
 //
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+//     output[i_0, ..., i_{K-2}] = params[indices[i0, ..., i_{K-2}]]
+//
+// Whereas in @{tf.gather} `indices` defines slices into the first
+// dimension of `params`, in `tf.gather_nd`, `indices` defines slices into the
+// first `N` dimensions of `params`, where `N = indices.shape[-1]`.
+//
+// The last dimension of `indices` can be at most the rank of
+// `params`:
+//
+//     indices.shape[-1] <= params.rank
+//
+// The last dimension of `indices` corresponds to elements
+// (if `indices.shape[-1] == params.rank`) or slices
+// (if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`
+// of `params`.  The output tensor has shape
+//
+//     indices.shape[:-1] + params.shape[indices.shape[-1]:]
+//
+// Some examples below.
+//
+// Simple indexing into a matrix:
+//
+// ```python
+//     indices = [[0, 0], [1, 1]]
+//     params = [['a', 'b'], ['c', 'd']]
+//     output = ['a', 'd']
+// ```
+//
+// Slice indexing into a matrix:
+//
+// ```python
+//     indices = [[1], [0]]
+//     params = [['a', 'b'], ['c', 'd']]
+//     output = [['c', 'd'], ['a', 'b']]
+// ```
+//
+// Indexing into a 3-tensor:
+//
+// ```python
+//     indices = [[1]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [[['a1', 'b1'], ['c1', 'd1']]]
+//
+//
+//     indices = [[0, 1], [1, 0]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [['c0', 'd0'], ['a1', 'b1']]
+//
+//
+//     indices = [[0, 0, 1], [1, 0, 1]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = ['b0', 'b1']
+// ```
+//
+// Batched indexing into a matrix:
+//
+// ```python
+//     indices = [[[0, 0]], [[0, 1]]]
+//     params = [['a', 'b'], ['c', 'd']]
+//     output = [['a'], ['b']]
+// ```
+//
+// Batched slice indexing into a matrix:
+//
+// ```python
+//     indices = [[[1]], [[0]]]
+//     params = [['a', 'b'], ['c', 'd']]
+//     output = [[['c', 'd']], [['a', 'b']]]
+// ```
+//
+// Batched indexing into a 3-tensor:
+//
+// ```python
+//     indices = [[[1]], [[0]]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [[[['a1', 'b1'], ['c1', 'd1']]],
+//               [[['a0', 'b0'], ['c0', 'd0']]]]
+//
+//     indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [[['c0', 'd0'], ['a1', 'b1']],
+//               [['a0', 'b0'], ['c1', 'd1']]]
+//
+//
+//     indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [['b0', 'b1'], ['d0', 'c1']]
+// ```
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	params: The tensor from which to gather values.
+//	indices: Index tensor.
 //
-// Returns The reduced tensor.
-func Mean(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MeanAttr) (output tf.Output) {
+// Returns Values from `params` gathered from indices given by `indices`, with
+// shape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`.
+func GatherNd(scope *Scope, params tf.Output, indices tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Mean",
+		Type: "GatherNd",
 		Input: []tf.Input{
-			input, reduction_indices,
+			params, indices,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns which elements of x are finite.
+// Eagerly executes a python function to compute func(input)->output. The
 //
-// @compatibility(numpy)
-// Equivalent to np.isfinite
-// @end_compatibility
-func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
+// semantics of the input, output, and attributes are the same as those for
+// PyFunc.
+func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"token": token, "Tout": Tout}
 	opspec := tf.OpSpec{
-		Type: "IsFinite",
+		Type: "EagerPyFunc",
 		Input: []tf.Input{
-			x,
+			tf.OutputList(input),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ArgMaxAttr is an optional argument to ArgMax.
-type ArgMaxAttr func(optionalAttr)
-
-// ArgMaxOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
-	return func(m optionalAttr) {
-		m["output_type"] = value
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("EagerPyFunc", err)
+		return
 	}
+	return output
 }
 
-// Returns the index with the largest value across dimensions of a tensor.
+// Stops gradient computation.
 //
-// Note that in case of ties the identity of the return value is not guaranteed.
+// When executed in a graph, this op outputs its input tensor as-is.
 //
-// Arguments:
+// When building ops to compute gradients, this op prevents the contribution of
+// its inputs to be taken into account.  Normally, the gradient generator adds ops
+// to a graph to compute the derivatives of a specified 'loss' by recursively
+// finding out inputs that contributed to its computation.  If you insert this op
+// in the graph it inputs are masked from the gradient generator.  They are not
+// taken into account for computing gradients.
 //
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
+// This is useful any time you want to compute a value with TensorFlow but need
+// to pretend that the value was a constant. Some examples include:
+//
+// *  The *EM* algorithm where the *M-step* should not involve backpropagation
+//    through the output of the *E-step*.
+// *  Contrastive divergence training of Boltzmann machines where, when
+//    differentiating the energy function, the training must not backpropagate
+//    through the graph that generated the samples from the model.
+// *  Adversarial training, where no backprop should happen through the adversarial
+//    example generation process.
+func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ArgMax",
+		Type: "StopGradient",
 		Input: []tf.Input{
-			input, dimension,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// \\(output_i = \sum_j data_j\\) where sum is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Computes asin of x element-wise.
+func Asin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentSum",
+		Type: "Asin",
 		Input: []tf.Input{
-			data, segment_ids,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ImageSummaryAttr is an optional argument to ImageSummary.
-type ImageSummaryAttr func(optionalAttr)
-
-// ImageSummaryMaxImages sets the optional max_images attribute to value.
-//
-// value: Max number of batch elements to generate images for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_images"] = value
-	}
-}
+// PreventGradientAttr is an optional argument to PreventGradient.
+type PreventGradientAttr func(optionalAttr)
 
-// ImageSummaryBadColor sets the optional bad_color attribute to value.
+// PreventGradientMessage sets the optional message attribute to value.
 //
-// value: Color to use for pixels with non-finite values.
-// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
-func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
+// value: Will be printed in the error when anyone tries to differentiate
+// this operation.
+// If not specified, defaults to ""
+func PreventGradientMessage(value string) PreventGradientAttr {
 	return func(m optionalAttr) {
-		m["bad_color"] = value
+		m["message"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with images.
-//
-// The summary has up to `max_images` summary values containing images. The
-// images are built from `tensor` which must be 4-D with shape `[batch_size,
-// height, width, channels]` and where `channels` can be:
-//
-// *  1: `tensor` is interpreted as Grayscale.
-// *  3: `tensor` is interpreted as RGB.
-// *  4: `tensor` is interpreted as RGBA.
-//
-// The images have the same number of channels as the input tensor. For float
-// input, the values are normalized one image at a time to fit in the range
-// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
-// normalization algorithms:
-//
-// *  If the input values are all positive, they are rescaled so the largest one
-//    is 255.
-//
-// *  If any input value is negative, the values are shifted so input value 0.0
-//    is at 127.  They are then rescaled so that either the smallest value is 0,
-//    or the largest one is 255.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
+// An identity op that triggers an error if a gradient is requested.
 //
-// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
-// *  If `max_images` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+// When executed in a graph, this op outputs its input tensor as-is.
 //
-// The `bad_color` argument is the color to use in the generated images for
-// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
-// Each element must be in the range `[0, 255]` (It represents the value of a
-// pixel in the output image).  Non-finite values in the input tensor are
-// replaced by this tensor in the output image.  The default value is the color
-// red.
+// When building ops to compute gradients, the TensorFlow gradient system
+// will return an error when trying to lookup the gradient of this op,
+// because no gradient must ever be registered for this function.  This
+// op exists to prevent subtle bugs from silently returning unimplemented
+// gradients in some corner cases.
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
-// `channels` is 1, 3, or 4.
+//	input: any tensor.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
+// Returns the same input tensor.
+func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26073,9 +26652,9 @@ func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...Ima
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ImageSummary",
+		Type: "PreventGradient",
 		Input: []tf.Input{
-			tag, tensor,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -26083,37 +26662,23 @@ func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...Ima
 	return op.Output(0)
 }
 
-// Bucketizes 'input' based on 'boundaries'.
-//
-// For example, if the inputs are
-//     boundaries = [0, 10, 100]
-//     input = [[-5, 10000]
-//              [150,   10]
-//              [5,    100]]
+// Checks a tensor for NaN and Inf values.
 //
-// then the output will be
-//     output = [[0, 3]
-//               [3, 2]
-//               [1, 3]]
+// When run, reports an `InvalidArgument` error if `tensor` has any values
+// that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is.
 //
 // Arguments:
-//	input: Any shape of Tensor contains with int or float type.
-//	boundaries: A sorted list of floats gives the boundary of the buckets.
-//
-// Returns Same shape with 'input', each value of input replaced with bucket index.
 //
-// @compatibility(numpy)
-// Equivalent to np.digitize.
-// @end_compatibility
-func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
+//	message: Prefix of the error message.
+func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"boundaries": boundaries}
+	attrs := map[string]interface{}{"message": message}
 	opspec := tf.OpSpec{
-		Type: "Bucketize",
+		Type: "CheckNumerics",
 		Input: []tf.Input{
-			input,
+			tensor,
 		},
 		Attrs: attrs,
 	}
@@ -26121,503 +26686,713 @@ func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.O
 	return op.Output(0)
 }
 
-// Reshapes a SparseTensor to represent values in a new dense shape.
+// Shuffle dimensions of x according to a permutation and conjugate the result.
 //
-// This operation has the same semantics as reshape on the represented dense
-// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
+// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
+//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
+//   `y[i,j,k,...,s,t,u] == conj(x[perm[i], perm[j], perm[k],...,perm[s], perm[t], perm[u]])`
+func ConjugateTranspose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ConjugateTranspose",
+		Input: []tf.Input{
+			x, perm,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// UniqueV2Attr is an optional argument to UniqueV2.
+type UniqueV2Attr func(optionalAttr)
+
+// UniqueV2OutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueV2OutIdx(value tf.DataType) UniqueV2Attr {
+	return func(m optionalAttr) {
+		m["out_idx"] = value
+	}
+}
+
+// Finds unique elements in a 1-D tensor.
 //
-// If one component of `new_shape` is the special value -1, the size of that
-// dimension is computed so that the total dense size remains constant.  At
-// most one component of `new_shape` can be -1.  The number of dense elements
-// implied by `new_shape` must be the same as the number of dense elements
-// originally implied by `input_shape`.
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. In other words:
 //
-// Reshaping does not affect the order of values in the SparseTensor.
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
 //
-// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
-// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
-// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
-// `output_shape` has length `R_out`.
+// For example:
+//
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx = unique(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// ```
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
-// SparseTensor.
-//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
-//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
+//	x: A `Tensor`.
+//	axis: A `Tensor` of type `int64` (default: 0). The axis of the Tensor to
+// find the unique elements.
 //
-// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
-// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
-// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
-// filled in.
-func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
+// Returns A `Tensor`. Unique elements along the `axis` of `Tensor` x.A 1-D Tensor. Has the same type as x that contains the index of each
+// value of x in the output y.
+func UniqueV2(scope *Scope, x tf.Output, axis tf.Output, optional ...UniqueV2Attr) (y tf.Output, idx tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseReshape",
+		Type: "UniqueV2",
 		Input: []tf.Input{
-			input_indices, input_shape, new_shape,
+			x, axis,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1)
 }
 
-// Computes the product along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// \\(output_i = \prod_j data_j\\) where the product is over `j` such
-// that `segment_ids[j] == i`.
+// Return a slice from 'input'.
 //
-// If the product is empty for a given segment ID `i`, `output[i] = 1`.
+// The output tensor is a tensor with dimensions described by 'size'
+// whose values are extracted from 'input' starting at the offsets in
+// 'begin'.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
-// </div>
+// *Requirements*:
+//   0 <= begin[i] <= begin[i] + size[i] <= Di  for i in [0, n)
 //
 // Arguments:
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+//	begin: begin[i] specifies the offset into the 'i'th dimension of
+// 'input' to slice from.
+//	size: size[i] specifies the number of elements of the 'i'th dimension
+// of 'input' to slice. If size[i] is -1, all remaining elements in dimension
+// i are included in the slice (i.e. this is equivalent to setting
+// size[i] = input.dim_size(i) - begin[i]).
+func Slice(scope *Scope, input tf.Output, begin tf.Output, size tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentProd",
+		Type: "Slice",
 		Input: []tf.Input{
-			data, segment_ids,
+			input, begin, size,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// `(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such
-// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
-// need not be sorted and need not cover all values in the full
-// range of valid values.
-//
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// `num_segments` should equal the number of distinct segment IDs.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+// StridedSliceGradAttr is an optional argument to StridedSliceGrad.
+type StridedSliceGradAttr func(optionalAttr)
+
+// StridedSliceGradBeginMask sets the optional begin_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradBeginMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["begin_mask"] = value
+	}
+}
+
+// StridedSliceGradEndMask sets the optional end_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradEndMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["end_mask"] = value
+	}
+}
+
+// StridedSliceGradEllipsisMask sets the optional ellipsis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradEllipsisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
+	}
+}
+
+// StridedSliceGradNewAxisMask sets the optional new_axis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradNewAxisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["new_axis_mask"] = value
+	}
+}
+
+// StridedSliceGradShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradShrinkAxisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Returns the gradient of `StridedSlice`.
 //
+// Since `StridedSlice` cuts out pieces of its `input` which is size
+// `shape`, its gradient will have the same shape (which is passed here
+// as `shape`). The gradient will be zero in any element that the slice
+// does not select.
 //
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Arguments are the same as StridedSliceGrad with the exception that
+// `dy` is the input gradient to be propagated and `shape` is the
+// shape of `StridedSlice`'s `input`.
+func StridedSliceGrad(scope *Scope, shape tf.Output, begin tf.Output, end tf.Output, strides tf.Output, dy tf.Output, optional ...StridedSliceGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentSum",
+		Type: "StridedSliceGrad",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			shape, begin, end, strides, dy,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes hyperbolic sine of x element-wise.
-func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns the gradient of `Tile`.
+//
+// DEPRECATED at GraphDef version 3: TileGrad has been replaced with reduce_sum
+//
+// Since `Tile` takes an input and repeats the input `multiples` times
+// along each dimension, `TileGrad` takes in `multiples` and aggregates
+// each repeated tile of `input` into `output`.
+func TileGrad(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sinh",
+		Type: "TileGrad",
 		Input: []tf.Input{
-			x,
+			input, multiples,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
-//
-// For example:
-//
-// ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
-//
-// # Select two rows, one segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
-// # => [[0 0 0 0]]
+// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
+type DataFormatDimMapAttr func(optionalAttr)
+
+// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
 //
-// # Select two rows, two segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
-// # => [[ 1  2  3  4]
-// #     [-1 -2 -3 -4]]
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
+	return func(m optionalAttr) {
+		m["src_format"] = value
+	}
+}
+
+// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
 //
-// # Select all rows, two segments.
-// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
-// # => [[0 0 0 0]
-// #     [5 6 7 8]]
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
+	return func(m optionalAttr) {
+		m["dst_format"] = value
+	}
+}
+
+// Returns the dimension index in the destination data format given the one in
 //
-// # Which is equivalent to:
-// tf.segment_sum(c, tf.constant([0, 0, 1]))
-// ```
+// the source data format.
 //
 // Arguments:
+//	x: A Tensor with each element as a dimension index in source data format.
+// Must be in the range [-4, 4).
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns A Tensor with each element as a dimension index in destination data format.
+func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSum",
+		Type: "DataFormatDimMap",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			x,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Counts the number of occurrences of each value in an integer array.
-//
-// Outputs a vector with length `size` and the same dtype as `weights`. If
-// `weights` are empty, then index `i` stores the number of times the value `i` is
-// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
-// the value in `weights` at each index where the corresponding value in `arr` is
-// `i`.
-//
-// Values in `arr` outside of the range [0, size) are ignored.
-//
-// Arguments:
-//	arr: int32 `Tensor`.
-//	size: non-negative int32 scalar `Tensor`.
-//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
-// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
-// equal to 1.
+// Return the shape of s0 op s1 with broadcast.
 //
-// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
-// each value in the range [0, size).
-func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
+// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
+// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
+func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Bincount",
+		Type: "BroadcastArgs",
 		Input: []tf.Input{
-			arr, size, weights,
+			s0, s1,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Pop the element at the top of the stack.
-//
-// Arguments:
-//	handle: The handle to a stack.
-//	elem_type: The type of the elem that is popped.
+// Return the reduction indices for computing gradients of s0 op s1 with broadcast.
 //
-// Returns The tensor that is popped from the top of the stack.
-func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
+// This is typically used by gradient computations for a broadcasting operation.
+func BroadcastGradientArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output, r1 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"elem_type": elem_type}
 	opspec := tf.OpSpec{
-		Type: "StackPopV2",
+		Type: "BroadcastGradientArgs",
 		Input: []tf.Input{
-			handle,
+			s0, s1,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
-type WholeFileReaderV2Attr func(optionalAttr)
-
-// WholeFileReaderV2Container sets the optional container attribute to value.
+// Pads a tensor with mirrored values.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
+// This operation pads a `input` with mirrored values according to the `paddings`
+// you specify. `paddings` is an integer tensor with shape `[n, 2]`, where n is
+// the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many values to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many values to add after the contents of `input`
+// in that dimension. Both `paddings[D, 0]` and `paddings[D, 1]` must be no greater
+// than `input.dim_size(D)` (or `input.dim_size(D) - 1`) if `copy_border` is true
+// (if false, respectively).
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A Reader that outputs the entire contents of a file as a value.
+// The padded size of each dimension D of the output is:
 //
-// To use, enqueue filenames in a Queue.  The output of ReaderRead will
-// be a filename (key) and the contents of that file (value).
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
 //
-// Returns The handle to reference the Reader.
-func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
+// For example:
+//
+// ```
+// # 't' is [[1, 2, 3], [4, 5, 6]].
+// # 'paddings' is [[1, 1]], [2, 2]].
+// # 'mode' is SYMMETRIC.
+// # rank of 't' is 2.
+// pad(t, paddings) ==> [[2, 1, 1, 2, 3, 3, 2]
+//                       [2, 1, 1, 2, 3, 3, 2]
+//                       [5, 4, 4, 5, 6, 6, 5]
+//                       [5, 4, 4, 5, 6, 6, 5]]
+// ```
+//
+// Arguments:
+//	input: The input tensor to be padded.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	mode: Either `REFLECT` or `SYMMETRIC`. In reflect mode the padded regions
+// do not include the borders, while in symmetric mode the padded regions
+// do include the borders. For example, if `input` is `[1, 2, 3]` and `paddings`
+// is `[0, 2]`, then the output is `[1, 2, 3, 2, 1]` in reflect mode, and
+// it is `[1, 2, 3, 3, 2]` in symmetric mode.
+//
+// Returns The padded tensor.
+func MirrorPad(scope *Scope, input tf.Output, paddings tf.Output, mode string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"mode": mode}
 	opspec := tf.OpSpec{
-		Type: "WholeFileReaderV2",
-
+		Type: "MirrorPad",
+		Input: []tf.Input{
+			input, paddings,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the mean along sparse segments of a tensor.
+// A placeholder op for a value that will be fed into the computation.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// DEPRECATED at GraphDef version 23: Placeholder now behaves the same as PlaceholderV2.
 //
-// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
+// N.B. This operation will fail with an error if it is executed. It is
+// intended as a way to represent a value that will always be fed, and to
+// provide attrs that enable the fed value to be checked at runtime.
 //
 // Arguments:
+//	dtype: The type of elements in the tensor.
+//	shape: The shape of the tensor. The shape can be any partially-specified
+// shape.  To be unconstrained, pass in a shape with unknown rank.
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns A placeholder tensor that must be replaced using the feed mechanism.
+func PlaceholderV2(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMean",
-		Input: []tf.Input{
-			data, indices, segment_ids,
-		},
+		Type: "PlaceholderV2",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reshapes a quantized tensor as per the Reshape op.
+// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
+type ResourceApplyAdadeltaAttr func(optionalAttr)
+
+// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
 //
-// ```
+// value: If True, updating of the var, accum and update_accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the adadelta scheme.
+//
+// accum = rho() * accum + (1 - rho()) * grad.square();
+// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
+// update_accum = rho() * update_accum + (1 - rho()) * update.square();
+// var -= update;
 //
 // Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	accum_update: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
 //
-//	shape: Defines the shape of the output tensor.
-//	input_min: The minimum value of the input.
-//	input_max: The maximum value of the input.
-//
-// Returns This value is copied from input_min.This value is copied from input_max.
-func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedReshape",
+		Type: "ResourceApplyAdadelta",
 		Input: []tf.Input{
-			tensor, shape, input_min, input_max,
+			var_, accum, accum_update, lr, rho, epsilon, grad,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Computes gradients for SparseSegmentSqrtN.
+// SqueezeAttr is an optional argument to Squeeze.
+type SqueezeAttr func(optionalAttr)
+
+// SqueezeAxis sets the optional axis attribute to value.
 //
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
+// value: If specified, only squeezes the dimensions listed. The dimension
+// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
+// be in the range `[-rank(input), rank(input))`.
+// If not specified, defaults to <>
 //
-// Arguments:
-//	grad: gradient propagated to the SparseSegmentSqrtN op.
-//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
-func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNGrad",
-		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
-		},
+// REQUIRES: len(value) >= 0
+func SqueezeAxis(value []int64) SqueezeAttr {
+	return func(m optionalAttr) {
+		m["squeeze_dims"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a sequence of numbers.
+// Removes dimensions of size 1 from the shape of a tensor.
 //
-// This operation creates a sequence of numbers that begins at `start` and
-// extends by increments of `delta` up to but not including `limit`.
+// Given a tensor `input`, this operation returns a tensor of the same type with
+// all dimensions of size 1 removed. If you don't want to remove all size 1
+// dimensions, you can remove specific size 1 dimensions by specifying
+// `axis`.
 //
 // For example:
 //
 // ```
-// # 'start' is 3
-// # 'limit' is 18
-// # 'delta' is 3
-// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t)) ==> [2, 3]
+// ```
+//
+// Or, to remove specific size 1 dimensions:
+//
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
 // ```
 //
 // Arguments:
-//	start: 0-D (scalar). First entry in the sequence.
-//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
-//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
+//	input: The `input` to squeeze.
 //
-// Returns 1-D.
-func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
+// Returns Contains the same data as `input`, but has one or more dimensions of
+// size 1 removed.
+func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Range",
+		Type: "Squeeze",
 		Input: []tf.Input{
-			start, limit, delta,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AngleAttr is an optional argument to Angle.
-type AngleAttr func(optionalAttr)
-
-// AngleTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func AngleTout(value tf.DataType) AngleAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Returns the argument of a complex number.
+// SpaceToBatch for N-D tensors of type T.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the argument of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part.
+// This operation divides "spatial" dimensions `[1, ..., M]` of the input into a
+// grid of blocks of shape `block_shape`, and interleaves these blocks with the
+// "batch" dimension (0) such that in the output, the spatial dimensions
+// `[1, ..., M]` correspond to the position within the grid, and the batch
+// dimension combines both the position within a spatial block and the original
+// batch position.  Prior to division into blocks, the spatial dimensions of the
+// input are optionally zero padded according to `paddings`.  See below for a
+// precise description.
 //
-// The argument returned by this operation is of the form \\(atan2(b, a)\\).
+// Arguments:
+//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+// where spatial_shape has `M` dimensions.
+//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
+//	paddings: 2-D with shape `[M, 2]`, all values must be >= 0.
+//   `paddings[i] = [pad_start, pad_end]` specifies the padding for input dimension
+//   `i + 1`, which corresponds to spatial dimension `i`.  It is required that
+//   `block_shape[i]` divides `input_shape[i + 1] + pad_start + pad_end`.
 //
-// For example:
+// This operation is equivalent to the following steps:
+//
+// 1. Zero-pad the start and end of dimensions `[1, ..., M]` of the
+//    input according to `paddings` to produce `padded` of shape `padded_shape`.
+//
+// 2. Reshape `padded` to `reshaped_padded` of shape:
+//
+//      [batch] +
+//      [padded_shape[1] / block_shape[0],
+//        block_shape[0],
+//       ...,
+//       padded_shape[M] / block_shape[M-1],
+//       block_shape[M-1]] +
+//      remaining_shape
+//
+// 3. Permute dimensions of `reshaped_padded` to produce
+//    `permuted_reshaped_padded` of shape:
+//
+//      block_shape +
+//      [batch] +
+//      [padded_shape[1] / block_shape[0],
+//       ...,
+//       padded_shape[M] / block_shape[M-1]] +
+//      remaining_shape
+//
+// 4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the batch
+//    dimension, producing an output tensor of shape:
+//
+//      [batch * prod(block_shape)] +
+//      [padded_shape[1] / block_shape[0],
+//       ...,
+//       padded_shape[M] / block_shape[M-1]] +
+//      remaining_shape
+//
+// Some examples:
+//
+// (1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and
+//     `paddings = [[0, 0], [0, 0]]`:
 //
 // ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.angle(input) ==> [2.0132, 1.056]
+// x = [[[[1], [2]], [[3], [4]]]]
 // ```
 //
-// @compatibility(numpy)
-// Equivalent to np.angle.
-// @end_compatibility
-func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Output) {
+// The output tensor has shape `[4, 1, 1, 1]` and value:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// (2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and
+//     `paddings = [[0, 0], [0, 0]]`:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// The output tensor has shape `[4, 1, 1, 3]` and value:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
+//
+// (3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and
+//     `paddings = [[0, 0], [0, 0]]`:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]],
+//       [[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+//
+// The output tensor has shape `[4, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// (4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and
+//     paddings = `[[0, 0], [2, 0]]`:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]]],
+//      [[[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+//
+// The output tensor has shape `[8, 1, 3, 1]` and value:
+//
+// ```
+// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+//      [[[0], [2], [4]]], [[[0], [10], [12]]],
+//      [[[0], [5], [7]]], [[[0], [13], [15]]],
+//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
+// ```
+//
+// Among others, this operation is useful for reducing atrous convolution into
+// regular convolution.
+func SpaceToBatchND(scope *Scope, input tf.Output, block_shape tf.Output, paddings tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Angle",
+		Type: "SpaceToBatchND",
 		Input: []tf.Input{
-			input,
+			input, block_shape, paddings,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// QuantizeAndDequantizeV2Attr is an optional argument to QuantizeAndDequantizeV2.
+type QuantizeAndDequantizeV2Attr func(optionalAttr)
+
+// QuantizeAndDequantizeV2SignedInput sets the optional signed_input attribute to value.
+//
+// value: If the quantization is signed or unsigned.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV2SignedInput(value bool) QuantizeAndDequantizeV2Attr {
+	return func(m optionalAttr) {
+		m["signed_input"] = value
+	}
 }
 
-// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
-type ResourceSparseApplyMomentumAttr func(optionalAttr)
-
-// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
+// QuantizeAndDequantizeV2NumBits sets the optional num_bits attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
+// value: The bitwidth of the quantization.
+// If not specified, defaults to 8
+func QuantizeAndDequantizeV2NumBits(value int64) QuantizeAndDequantizeV2Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["num_bits"] = value
 	}
 }
 
-// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
+// QuantizeAndDequantizeV2RangeGiven sets the optional range_given attribute to value.
 //
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
+// value: If the range is given or should be computed from the tensor.
 // If not specified, defaults to false
-func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
+func QuantizeAndDequantizeV2RangeGiven(value bool) QuantizeAndDequantizeV2Attr {
 	return func(m optionalAttr) {
-		m["use_nesterov"] = value
+		m["range_given"] = value
 	}
 }
 
-// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
+// Quantizes then dequantizes a tensor.
 //
-// Set use_nesterov = True if you want to use Nesterov momentum.
+// This op simulates the precision loss from the quantized forward pass by:
+// 1. Quantizing the tensor to fixed point numbers, which should match the target
+//    quantization method when it is used in inference.
+// 2. Dequantizing it back to floating point numbers for the following ops, most
+//    likely matmul.
 //
-// That is for rows we have grad for, we update var and accum as follows:
+// There are different ways to quantize. This version does not use the full range
+// of the output type, choosing to elide the lowest possible value for symmetry
+// (e.g., output range is -127 to 127, not -128 to 127 for signed 8 bit
+// quantization), so that 0.0 maps to 0.
 //
-// accum = accum * momentum + grad
-// var -= lr * accum
+// To perform this op, we first find the range of values in our tensor. The range
+// we use is always centered on 0, so we find m such that
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	momentum: Momentum. Must be a scalar.
+// 1. m = max(abs(input_min), abs(input_max)) if range_given is true,
+// 2. m = max(abs(min_elem(input)), abs(max_elem(input))) otherwise.
 //
-// Returns the created operation.
-func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
+// Our input tensor range is then [-m, m].
+//
+// Next, we choose our fixed-point quantization buckets, [min_fixed, max_fixed].
+// If signed_input is true, this is
+//
+//   [min_fixed, max_fixed ] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1].
+//
+// Otherwise, if signed_input is false, the fixed-point range is
+//
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1].
+//
+// From this we compute our scaling factor, s:
+//
+//   s = (max_fixed - min_fixed) / (2 * m).
+//
+// Now we can quantize and dequantize the elements of our tensor.  An element e
+// is transformed into e':
+//
+//   e' = (e * s).round_to_nearest() / s.
+//
+// Note that we have a different number of buckets in the signed vs. unsigned
+// cases.  For example, if num_bits == 8, we get 254 buckets in the signed case
+// vs. 255 in the unsigned case.
+//
+// For example, suppose num_bits = 8 and m = 1.  Then
+//
+//   [min_fixed, max_fixed] = [-127, 127], and
+//   s = (127 + 127) / 2 = 127.
+//
+// Given the vector {-1, -0.5, 0, 0.3}, this is quantized to
+// {-127, -63, 0, 38}, and dequantized to {-1, -63.0/127, 0, 38.0/127}.
+//
+// Arguments:
+//	input: Tensor to quantize and then dequantize.
+//	input_min: If range_given, this is the min of the range, otherwise this input
+// will be ignored.
+//	input_max: If range_given, this is the max of the range, otherwise this input
+// will be ignored.
+func QuantizeAndDequantizeV2(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, optional ...QuantizeAndDequantizeV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26626,546 +27401,617 @@ func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyMomentum",
+		Type: "QuantizeAndDequantizeV2",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices, momentum,
+			input, input_min, input_max,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the complex conjugate of a complex number.
+// SpaceToBatch for 4-D tensors of type T.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// complex numbers that are the complex conjugate of each element in `input`. The
-// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
-// real part and *b* is the imaginary part.
+// This is a legacy version of the more general SpaceToBatchND.
 //
-// The complex conjugate returned by this operation is of the form \\(a - bj\\).
+// Zero-pads and then rearranges (permutes) blocks of spatial data into batch.
+// More specifically, this op outputs a copy of the input tensor where values from
+// the `height` and `width` dimensions are moved to the `batch` dimension. After
+// the zero-padding, both `height` and `width` of the input must be divisible by the
+// block size.
 //
-// For example:
+// Arguments:
+//	input: 4-D with shape `[batch, height, width, depth]`.
+//	paddings: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
+//   the padding of the input with zeros across the spatial dimensions as follows:
+//
+//       paddings = [[pad_top, pad_bottom], [pad_left, pad_right]]
+//
+//   The effective spatial dimensions of the zero-padded input tensor will be:
+//
+//       height_pad = pad_top + height + pad_bottom
+//       width_pad = pad_left + width + pad_right
+//
+// The attr `block_size` must be greater than one. It indicates the block size.
+//
+//   * Non-overlapping blocks of size `block_size x block size` in the height and
+//     width dimensions are rearranged into the batch dimension at each location.
+//   * The batch of the output tensor is `batch * block_size * block_size`.
+//   * Both height_pad and width_pad must be divisible by block_size.
+//
+// The shape of the output will be:
+//
+//     [batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
+//      depth]
+//
+// Some examples:
+//
+// (1) For the following input of shape `[1, 2, 2, 1]` and block_size of 2:
 //
 // ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
+// x = [[[[1], [2]], [[3], [4]]]]
 // ```
-func Conj(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Conj",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// A placeholder op that passes through `input` when its output is not fed.
 //
-// Arguments:
-//	input: The default value to produce when `output` is not fed.
-//	shape: The (possibly partial) shape of the tensor.
+// The output tensor has shape `[4, 1, 1, 1]` and value:
 //
-// Returns A placeholder tensor that defaults to `input` if it is not fed.
-func PlaceholderWithDefault(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"shape": shape}
-	opspec := tf.OpSpec{
-		Type: "PlaceholderWithDefault",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Deprecated. Use TensorArrayReadV3
-func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV2",
-		Input: []tf.Input{
-			handle, index, flow_in,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
-type QuantizedMatMulAttr func(optionalAttr)
-
-// QuantizedMatMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["Toutput"] = value
-	}
-}
-
-// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
 //
-// value: If true, `a` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
-	}
-}
-
-// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
+// (2) For the following input of shape `[1, 2, 2, 3]` and block_size of 2:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// The output tensor has shape `[4, 1, 1, 3]` and value:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
+//
+// (3) For the following input of shape `[1, 4, 4, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]],
+//       [[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+//
+// The output tensor has shape `[4, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
 //
-// value: If true, `b` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
-	}
-}
-
-// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
+// (4) For the following input of shape `[2, 2, 4, 1]` and block_size of 2:
 //
-// value: The type of output produced by activation function
-// following this operation.
-// If not specified, defaults to DT_QUINT8
-func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["Tactivation"] = value
-	}
-}
-
-// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]]],
+//      [[[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
 //
-// The inputs must be two-dimensional matrices and the inner dimension of
-// `a` (after being transposed if `transpose_a` is non-zero) must match the
-// outer dimension of `b` (after being transposed if `transposed_b` is
-// non-zero).
+// The output tensor has shape `[8, 1, 2, 1]` and value:
 //
-// Arguments:
-//	a: Must be a two-dimensional tensor.
-//	b: Must be a two-dimensional tensor.
-//	min_a: The float value that the lowest quantized `a` value represents.
-//	max_a: The float value that the highest quantized `a` value represents.
-//	min_b: The float value that the lowest quantized `b` value represents.
-//	max_b: The float value that the highest quantized `b` value represents.
+// ```
+// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
+//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
+// ```
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
+// Among others, this operation is useful for reducing atrous convolution into
+// regular convolution.
+//
+func SpaceToBatch(scope *Scope, input tf.Output, paddings tf.Output, block_size int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"block_size": block_size}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMatMul",
+		Type: "SpaceToBatch",
 		Input: []tf.Input{
-			a, b, min_a, max_a, min_b, max_b,
+			input, paddings,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// QuantizedMulAttr is an optional argument to QuantizedMul.
-type QuantizedMulAttr func(optionalAttr)
+// UnpackAttr is an optional argument to Unpack.
+type UnpackAttr func(optionalAttr)
 
-// QuantizedMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr {
+// UnpackAxis sets the optional axis attribute to value.
+//
+// value: Dimension along which to unpack.  Negative values wrap around, so the
+// valid range is `[-R, R)`.
+// If not specified, defaults to 0
+func UnpackAxis(value int64) UnpackAttr {
 	return func(m optionalAttr) {
-		m["Toutput"] = value
+		m["axis"] = value
 	}
 }
 
-// Returns x * y element-wise, working on quantized buffers.
+// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
 //
-// Arguments:
+// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
+// For example, given a tensor of shape `(A, B, C, D)`;
 //
+// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
+//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
+//   dimension unpacked along is gone, unlike `split`).
 //
-//	min_x: The float value that the lowest quantized `x` value represents.
-//	max_x: The float value that the highest quantized `x` value represents.
-//	min_y: The float value that the lowest quantized `y` value represents.
-//	max_y: The float value that the highest quantized `y` value represents.
+// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
+//   and each tensor in `output` will have shape `(A, C, D)`.
+// Etc.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+// This is the opposite of `pack`.
 //
-// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
-// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
+// Arguments:
+//	value: 1-D or higher, with `axis` dimension size equal to `num`.
+//
+//
+// Returns The list of tensors unpacked from `value`.
+func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num": num}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMul",
+		Type: "Unpack",
 		Input: []tf.Input{
-			x, y, min_x, max_x, min_y, max_y,
+			value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("Unpack", err)
+		return
+	}
+	return output
 }
 
-// Forwards the input to the output.
-//
-// This operator represents the loop termination condition used by the
-// "pivot" switches of a loop.
+// Increments variable pointed to by 'resource' until it reaches 'limit'.
 //
 // Arguments:
-//	input: A boolean scalar, representing the branch predicate of the Switch op.
+//	resource: Should be from a scalar `Variable` node.
+//	limit: If incrementing ref would bring it above limit, instead generates an
+// 'OutOfRange' error.
 //
-// Returns The same tensor as `input`.
-func LoopCond(scope *Scope, input tf.Output) (output tf.Output) {
+//
+// Returns A copy of the input before increment. If nothing else modifies the
+// input, the values produced will all be distinct.
+func ResourceCountUpTo(scope *Scope, resource tf.Output, limit int64, T tf.DataType) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"limit": limit, "T": T}
 	opspec := tf.OpSpec{
-		Type: "LoopCond",
+		Type: "ResourceCountUpTo",
 		Input: []tf.Input{
-			input,
+			resource,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns (x - y)(x - y) element-wise.
+// Delete the stack from its resource container.
 //
-// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	handle: The handle to a stack.
+//
+// Returns the created operation.
+func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SquaredDifference",
+		Type: "StackCloseV2",
 		Input: []tf.Input{
-			x, y,
+			handle,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
+// BatchToSpace for N-D tensors of type T.
 //
-// actual distribution of the values to maximize the usage of the lower bit depth
-// and adjusting the output min and max ranges accordingly.
+// This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
+// `block_shape + [batch]`, interleaves these blocks back into the grid defined by
+// the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
+// the input.  The spatial dimensions of this intermediate result are then
+// optionally cropped according to `crops` to produce the output.  This is the
+// reverse of SpaceToBatch.  See below for a precise description.
 //
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+// Arguments:
+//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+// where spatial_shape has M dimensions.
+//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
+//	crops: 2-D with shape `[M, 2]`, all values must be >= 0.
+//   `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
+//   dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
+//   required that
+//   `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
+//
+// This operation is equivalent to the following steps:
+//
+// 1. Reshape `input` to `reshaped` of shape:
+//      [block_shape[0], ..., block_shape[M-1],
+//       batch / prod(block_shape),
+//       input_shape[1], ..., input_shape[N-1]]
+//
+// 2. Permute dimensions of `reshaped` to produce `permuted` of shape
+//      [batch / prod(block_shape),
+//
+//       input_shape[1], block_shape[0],
+//       ...,
+//       input_shape[M], block_shape[M-1],
+//
+//       input_shape[M+1], ..., input_shape[N-1]]
+//
+// 3. Reshape `permuted` to produce `reshaped_permuted` of shape
+//      [batch / prod(block_shape),
+//
+//       input_shape[1] * block_shape[0],
+//       ...,
+//       input_shape[M] * block_shape[M-1],
+//
+//       input_shape[M+1],
+//       ...,
+//       input_shape[N-1]]
+//
+// 4. Crop the start and end of dimensions `[1, ..., M]` of
+//    `reshaped_permuted` according to `crops` to produce the output of shape:
+//      [batch / prod(block_shape),
+//
+//       input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
+//       ...,
+//       input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
+//
+//       input_shape[M+1], ..., input_shape[N-1]]
+//
+// Some examples:
+//
+// (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
+//
+// (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 3]` and value:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[1, 4, 4, 1]` and value:
 //
-// This operator tries to squeeze as much precision as possible into an output with
-// a lower bit depth by calculating the actual min and max values found in the
-// data. For example, maybe that quint16 input has no values lower than 16,384 and
-// none higher than 49,152. That means only half the range is actually needed, all
-// the float interpretations are between -0.5f and 0.5f, so if we want to compress
-// the data into a quint8 output, we can use that range rather than the theoretical
-// -1.0f to 1.0f that is suggested by the input min and max.
+// ```
+// x = [[[1],   [2],  [3],  [4]],
+//      [[5],   [6],  [7],  [8]],
+//      [[9],  [10], [11],  [12]],
+//      [[13], [14], [15],  [16]]]
+// ```
 //
-// In practice, this is most useful for taking output from operations like
-// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
-// may have large potential output ranges, but in practice have a distribution of
-// input values that only uses a small fraction of the possible range. By feeding
-// that output into this operator, we can reduce it from 32 bits down to 8 with
-// minimal loss of accuracy.
+// (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [2, 0]]`:
 //
-// Arguments:
+// ```
+// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+//      [[[0], [2], [4]]], [[[0], [10], [12]]],
+//      [[[0], [5], [7]]], [[[0], [13], [15]]],
+//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
+// ```
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+// The output tensor has shape `[2, 2, 4, 1]` and value:
 //
-// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]]],
+//      [[[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "QuantizeDownAndShrinkRange",
+		Type: "BatchToSpaceND",
 		Input: []tf.Input{
-			input, input_min, input_max,
+			input, block_shape, crops,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
+// Extract `patches` from `images` and put them in the "depth" output dimension.
 //
-// Each comparison returns a boolean `true` (if `input_value > threshold`)
-// or and `false` otherwise.
+// Arguments:
+//	images: 4-D Tensor with shape `[batch, in_rows, in_cols, depth]`.
+//	ksizes: The size of the sliding window for each dimension of `images`.
+//	strides: 1-D of length 4. How far the centers of two consecutive patches are in
+// the images. Must be: `[1, stride_rows, stride_cols, 1]`.
+//	rates: 1-D of length 4. Must be: `[1, rate_rows, rate_cols, 1]`. This is the
+// input stride, specifying how far two consecutive patch samples are in the
+// input. Equivalent to extracting patches with
+// `patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by
+// subsampling them spatially by a factor of `rates`. This is equivalent to
+// `rate` in dilated (a.k.a. Atrous) convolutions.
+//	padding: The type of padding algorithm to use.
 //
-// This operation is useful for Locality-Sensitive-Hashing (LSH) and other
-// algorithms that use hashing approximations of cosine and `L2` distances;
-// codes can be generated from an input via:
+// We specify the size-related attributes as:
 //
 // ```python
-// codebook_size = 50
-// codebook_bits = codebook_size * 32
-// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
-//                            dtype=x.dtype,
-//                            initializer=tf.orthogonal_initializer())
-// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
-// codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
-// # now codes has shape x.shape[:-1] + [codebook_size]
+//       ksizes = [1, ksize_rows, ksize_cols, 1]
+//       strides = [1, strides_rows, strides_cols, 1]
+//       rates = [1, rates_rows, rates_cols, 1]
 // ```
 //
-// **NOTE**: Currently, the innermost dimension of the tensor must be divisible
-// by 8.
-//
-// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
-// a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
-//
-// Arguments:
-//	input: Values to compare against `threshold` and bitpack.
-//	threshold: Threshold to compare against.
-//
-// Returns The bitpacked comparisons.
-func CompareAndBitpack(scope *Scope, input tf.Output, threshold tf.Output) (output tf.Output) {
+// Returns 4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows *
+// ksize_cols * depth]` containing image patches with size
+// `ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension. Note
+// `out_rows` and `out_cols` are the dimensions of the output patches.
+func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides []int64, rates []int64, padding string) (patches tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksizes": ksizes, "strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "CompareAndBitpack",
+		Type: "ExtractImagePatches",
 		Input: []tf.Input{
-			input, threshold,
+			images,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Replaces the contents of the table with the specified keys and values.
+// Bitcasts a tensor from one type to another without copying data.
 //
-// The tensor `keys` must be of the same type as the keys of the table.
-// The tensor `values` must be of the type of the table values.
+// Given a tensor `input`, this operation returns a tensor that has the same buffer
+// data as `input` with datatype `type`.
 //
-// Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//	values: Values to associate with keys.
+// If the input datatype `T` is larger than the output datatype `type` then the
+// shape changes from [...] to [..., sizeof(`T`)/sizeof(`type`)].
 //
-// Returns the created operation.
-func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+// If `T` is smaller than `type`, the operator requires that the rightmost
+// dimension be equal to sizeof(`type`)/sizeof(`T`). The shape then goes from
+// [..., sizeof(`type`)/sizeof(`T`)] to [...].
+//
+// *NOTE*: Bitcast is implemented as a low-level cast, so machines with different
+// endian orderings will give different results.
+func Bitcast(scope *Scope, input tf.Output, type_ tf.DataType) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"type": type_}
 	opspec := tf.OpSpec{
-		Type: "LookupTableImportV2",
+		Type: "Bitcast",
 		Input: []tf.Input{
-			table_handle, keys, values,
+			input,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// HashTableV2Attr is an optional argument to HashTableV2.
-type HashTableV2Attr func(optionalAttr)
+// OneHotAttr is an optional argument to OneHot.
+type OneHotAttr func(optionalAttr)
 
-// HashTableV2Container sets the optional container attribute to value.
+// OneHotAxis sets the optional axis attribute to value.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func HashTableV2Container(value string) HashTableV2Attr {
+// value: The axis to fill (default: -1, a new inner-most axis).
+// If not specified, defaults to -1
+func OneHotAxis(value int64) OneHotAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["axis"] = value
 	}
 }
 
-// HashTableV2SharedName sets the optional shared_name attribute to value.
+// Returns a one-hot tensor.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func HashTableV2SharedName(value string) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// The locations represented by indices in `indices` take value `on_value`,
+// while all other locations take value `off_value`.
 //
-// value: If true and shared_name is empty, the table is shared
-// using the node name.
-// If not specified, defaults to false
-func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
-	}
-}
-
-// Creates a non-initialized hash table.
+// If the input `indices` is rank `N`, the output will have rank `N+1`,
+// The new axis is created at dimension `axis` (default: the new axis is
+// appended at the end).
 //
-// This op creates a hash table, specifying the type of its keys and values.
-// Before using the table you will have to initialize it.  After initialization the
-// table will be immutable.
+// If `indices` is a scalar the output shape will be a vector of length `depth`.
 //
-// Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+// If `indices` is a vector of length `features`, the output shape will be:
+// ```
+//   features x depth if axis == -1
+//   depth x features if axis == 0
+// ```
 //
-// Returns Handle to a table.
-func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "HashTableV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MutableHashTableV2Attr is an optional argument to MutableHashTableV2.
-type MutableHashTableV2Attr func(optionalAttr)
-
-// MutableHashTableV2Container sets the optional container attribute to value.
+// If `indices` is a matrix (batch) with shape `[batch, features]`,
+// the output shape will be:
+// ```
+//   batch x features x depth if axis == -1
+//   batch x depth x features if axis == 1
+//   depth x batch x features if axis == 0
+// ```
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableHashTableV2Container(value string) MutableHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MutableHashTableV2SharedName sets the optional shared_name attribute to value.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableHashTableV2SharedName(value string) MutableHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// MutableHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// Examples
+// =========
+//
+// Suppose that
+//
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 5.0
+//   off_value = 0.0
+//   axis = -1
+// ```
+//
+// Then output is `[4 x 3]`:
+//
+//     ```output =
+//       [5.0 0.0 0.0]  // one_hot(0)
+//       [0.0 0.0 5.0]  // one_hot(2)
+//       [0.0 0.0 0.0]  // one_hot(-1)
+//       [0.0 5.0 0.0]  // one_hot(1)
+//     ```
+//
+// Suppose that
+//
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 0.0
+//   off_value = 3.0
+//   axis = 0
+// ```
+//
+// Then output is `[3 x 4]`:
 //
-// value: If true and shared_name is empty, the table is shared
-// using the node name.
-// If not specified, defaults to false
-func MutableHashTableV2UseNodeNameSharing(value bool) MutableHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
-	}
-}
-
-// Creates an empty hash table.
+//     ```output =
+//       [0.0 3.0 3.0 3.0]
+//       [3.0 3.0 3.0 0.0]
+//       [3.0 3.0 3.0 3.0]
+//       [3.0 0.0 3.0 3.0]
+//     //  ^                one_hot(0)
+//     //      ^            one_hot(2)
+//     //          ^        one_hot(-1)
+//     //              ^    one_hot(1)
+//     ```
+// Suppose that
 //
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a scalar. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+// ```
+//   indices = [[0, 2], [1, -1]]
+//   depth = 3
+//   on_value = 1.0
+//   off_value = 0.0
+//   axis = -1
+// ```
+//
+// Then output is `[2 x 2 x 3]`:
+//
+//     ```output =
+//       [
+//         [1.0, 0.0, 0.0]  // one_hot(0)
+//         [0.0, 0.0, 1.0]  // one_hot(2)
+//       ][
+//         [0.0, 1.0, 0.0]  // one_hot(1)
+//         [0.0, 0.0, 0.0]  // one_hot(-1)
+//       ]```
 //
 // Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+//	indices: A tensor of indices.
+//	depth: A scalar defining the depth of the one hot dimension.
+//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
+//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
 //
-// Returns Handle to a table.
-func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableV2Attr) (table_handle tf.Output) {
+// Returns The one-hot tensor.
+func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MutableHashTableV2",
-
+		Type: "OneHot",
+		Input: []tf.Input{
+			indices, depth, on_value, off_value,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
-type MapUnstageNoKeyAttr func(optionalAttr)
+// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
+type QueueDequeueV2Attr func(optionalAttr)
 
-// MapUnstageNoKeyCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// REQUIRES: value >= 0
-func MapUnstageNoKeyCapacity(value int64) MapUnstageNoKeyAttr {
+// value: If the queue is empty, this operation will block for up to
+// timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// MapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Dequeues a tuple of one or more tensors from the given queue.
 //
-// REQUIRES: value >= 0
-func MapUnstageNoKeyMemoryLimit(value int64) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapUnstageNoKeyContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapUnstageNoKeyContainer(value string) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapUnstageNoKeySharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapUnstageNoKeySharedName(value string) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes and returns a random (key, value)
+// This operation has k outputs, where k is the number of components
+// in the tuples stored in the given queue, and output i is the ith
+// component of the dequeued tuple.
 //
-// from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
+// N.B. If the queue is empty, this operation will block until an element
+// has been dequeued (or 'timeout_ms' elapses, if specified).
+//
+// Arguments:
+//	handle: The handle to a queue.
+//	component_types: The type of each component in a tuple.
+//
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapUnstageNoKey",
+		Type: "QueueDequeueV2",
 		Input: []tf.Input{
-			indices,
+			handle,
 		},
 		Attrs: attrs,
 	}
@@ -27175,280 +28021,319 @@ func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, opti
 	}
 	var idx int
 	var err error
-	key = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapUnstageNoKey", err)
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueV2", err)
 		return
 	}
-	return key, values
+	return components
 }
 
-// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
-type ResourceApplyProximalAdagradAttr func(optionalAttr)
-
-// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+// Returns locations of nonzero / true values in a tensor.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
+// This operation returns the coordinates of true elements in `condition`. The
+// coordinates are returned in a 2-D tensor where the first dimension (rows)
+// represents the number of true elements, and the second dimension (columns)
+// represents the coordinates of the true elements. Keep in mind, the shape of
+// the output tensor can vary depending on how many true values there are in
+// `condition`. Indices are output in row-major order.
 //
-// accum += grad * grad
-// prox_v = var - lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+// For example:
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
+// ```
+// # 'input' tensor is [[True, False]
+// #                    [True, False]]
+// # 'input' has two true values, so output has two coordinates.
+// # 'input' has rank of 2, so coordinates have two indices.
+// where(input) ==> [[0, 0],
+//                   [1, 0]]
 //
-// Returns the created operation.
-func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
+// # `condition` tensor is [[[True, False]
+// #                     [True, False]]
+// #                    [[False, True]
+// #                     [False, True]]
+// #                    [[False, False]
+// #                     [False, True]]]
+// # 'input' has 5 true values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
+//
+// # `condition` tensor is [[[1.5,  0.0]
+// #                     [-0.5, 0.0]]
+// #                    [[0.0,  0.25]
+// #                     [0.0,  0.75]]
+// #                    [[0.0,  0.0]
+// #                     [0.0,  0.01]]]
+// # 'input' has 5 nonzero values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
+//
+// # `condition` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
+// #                     [0.0 + 0.5j, 0.0  + 0.0j]]
+// #                    [[0.0 + 0.0j, 0.25 + 1.5j]
+// #                     [0.0 + 0.0j, 0.75 + 0.0j]]
+// #                    [[0.0 + 0.0j, 0.0  + 0.0j]
+// #                     [0.0 + 0.0j, 0.01 + 0.0j]]]
+// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
+// ```
+func Where(scope *Scope, condition tf.Output) (index tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalAdagrad",
+		Type: "Where",
 		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad,
+			condition,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2.
-type MutableHashTableOfTensorsV2Attr func(optionalAttr)
+// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
+type QuantizeAndDequantizeAttr func(optionalAttr)
 
-// MutableHashTableOfTensorsV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr {
+// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["signed_input"] = value
 	}
 }
 
-// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr {
+// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["num_bits"] = value
 	}
 }
 
-// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
 // If not specified, defaults to false
-func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr {
+func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
 	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+		m["range_given"] = value
 	}
 }
 
-// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value.
-// If not specified, defaults to <>
-func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr {
+// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
+// If not specified, defaults to 0
+func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
 	return func(m optionalAttr) {
-		m["value_shape"] = value
+		m["input_min"] = value
 	}
 }
 
-// Creates an empty hash table.
-//
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a vector. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
-//
-// Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
+// If not specified, defaults to 0
+func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["input_max"] = value
+	}
+}
+
+// Use QuantizeAndDequantizeV2 instead.
 //
-// Returns Handle to a table.
-func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) {
+// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
+func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MutableHashTableOfTensorsV2",
-
+		Type: "QuantizeAndDequantize",
+		Input: []tf.Input{
+			input,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
-//
-// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
-// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
-// are placed in `outputs[i]` in lexicographic order of `js`, and the first
-// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
-// In detail,
+// Returns the diagonal part of the tensor.
 //
-// ```python
-//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
+// This operation returns a tensor with the `diagonal` part
+// of the `input`. The `diagonal` part is computed as follows:
 //
-//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
-// ```
+// Assume `input` has dimensions `[D1,..., Dk, D1,..., Dk]`, then the output is a
+// tensor of rank `k` with dimensions `[D1,..., Dk]` where:
 //
-// `data.shape` must start with `partitions.shape`.
+// `diagonal[i1,..., ik] = input[i1, ..., ik, i1,..., ik]`.
 //
 // For example:
 //
-// ```python
-//     # Scalar partitions.
-//     partitions = 1
-//     num_partitions = 2
-//     data = [10, 20]
-//     outputs[0] = []  # Empty with shape [0, 2]
-//     outputs[1] = [[10, 20]]
-//
-//     # Vector partitions.
-//     partitions = [0, 0, 1, 1, 0]
-//     num_partitions = 2
-//     data = [10, 20, 30, 40, 50]
-//     outputs[0] = [10, 20, 50]
-//     outputs[1] = [30, 40]
 // ```
+// # 'input' is [[1, 0, 0, 0]
+//               [0, 2, 0, 0]
+//               [0, 0, 3, 0]
+//               [0, 0, 0, 4]]
 //
-// See `dynamic_stitch` for an example on how to merge partitions back.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
-// </div>
+// tf.diag_part(input) ==> [1, 2, 3, 4]
+// ```
 //
 // Arguments:
+//	input: Rank k tensor where k is even and not zero.
 //
-//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
-//	num_partitions: The number of partitions to output.
-func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
+// Returns The extracted diagonal.
+func DiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_partitions": num_partitions}
 	opspec := tf.OpSpec{
-		Type: "DynamicPartition",
+		Type: "DiagPart",
 		Input: []tf.Input{
-			data, partitions,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("DynamicPartition", err)
-		return
-	}
-	return outputs
+	return op.Output(0)
 }
 
-// Serialize a `SparseTensor` into a string 3-vector (1-D `Tensor`) object.
+// QuantizedInstanceNormAttr is an optional argument to QuantizedInstanceNorm.
+type QuantizedInstanceNormAttr func(optionalAttr)
+
+// QuantizedInstanceNormOutputRangeGiven sets the optional output_range_given attribute to value.
 //
-// Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
-func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) (serialized_sparse tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If True, `given_y_min` and `given_y_min`
+// and `given_y_max` are used as the output range. Otherwise,
+// the implementation computes the output range.
+// If not specified, defaults to false
+func QuantizedInstanceNormOutputRangeGiven(value bool) QuantizedInstanceNormAttr {
+	return func(m optionalAttr) {
+		m["output_range_given"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "SerializeSparse",
-		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
-		},
+}
+
+// QuantizedInstanceNormGivenYMin sets the optional given_y_min attribute to value.
+//
+// value: Output in `y_min` if `output_range_given` is True.
+// If not specified, defaults to 0
+func QuantizedInstanceNormGivenYMin(value float32) QuantizedInstanceNormAttr {
+	return func(m optionalAttr) {
+		m["given_y_min"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Table initializer that takes two tensors for keys and values respectively.
+// QuantizedInstanceNormGivenYMax sets the optional given_y_max attribute to value.
 //
-// Arguments:
-//	table_handle: Handle to a table which will be initialized.
-//	keys: Keys of type Tkey.
-//	values: Values of type Tval.
+// value: Output in `y_max` if `output_range_given` is True.
+// If not specified, defaults to 0
+func QuantizedInstanceNormGivenYMax(value float32) QuantizedInstanceNormAttr {
+	return func(m optionalAttr) {
+		m["given_y_max"] = value
+	}
+}
+
+// QuantizedInstanceNormVarianceEpsilon sets the optional variance_epsilon attribute to value.
 //
-// Returns the created operation.
-func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
+// value: A small float number to avoid dividing by 0.
+// If not specified, defaults to 1e-05
+func QuantizedInstanceNormVarianceEpsilon(value float32) QuantizedInstanceNormAttr {
+	return func(m optionalAttr) {
+		m["variance_epsilon"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "InitializeTableV2",
-		Input: []tf.Input{
-			table_handle, keys, values,
-		},
+}
+
+// QuantizedInstanceNormMinSeparation sets the optional min_separation attribute to value.
+//
+// value: Minimum value of `y_max - y_min`
+// If not specified, defaults to 0.001
+func QuantizedInstanceNormMinSeparation(value float32) QuantizedInstanceNormAttr {
+	return func(m optionalAttr) {
+		m["min_separation"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset that asynchronously prefetches elements from `input_dataset`.
+// Quantized Instance normalization.
 //
 // Arguments:
+//	x: A 4D input Tensor.
+//	x_min: The value represented by the lowest quantized input.
+//	x_max: The value represented by the highest quantized input.
 //
-//	buffer_size: The maximum number of elements to buffer in an iterator over
-// this dataset.
-//
-//
-func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns A 4D Tensor.The value represented by the lowest quantized output.The value represented by the highest quantized output.
+func QuantizedInstanceNorm(scope *Scope, x tf.Output, x_min tf.Output, x_max tf.Output, optional ...QuantizedInstanceNormAttr) (y tf.Output, y_min tf.Output, y_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "PrefetchDataset",
+		Type: "QuantizedInstanceNorm",
 		Input: []tf.Input{
-			input_dataset, buffer_size,
+			x, x_min, x_max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
+// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
+type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr {
+	return func(m optionalAttr) {
+		m["narrow_range"] = value
+	}
+}
+
+// Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
 //
-// Arguments:
-//	tag: A string attached to this summary. Used for organization in TensorBoard.
-//	tensor: A tensor to serialize.
-//	serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin
-// data.
-func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_summary_metadata tf.Output) (summary tf.Output) {
+// and `max` to 'outputs' tensor of same shape as `inputs`.
+//
+// `[min; max]` define the clamping range for the `inputs` data.
+// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
+// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
+// then de-quantized and output as floats in `[min; max]` interval.
+// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+//
+// This operation has a gradient and thus allows for training `min` and `max`
+// values.
+func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorSummaryV2",
+		Type: "FakeQuantWithMinMaxVars",
 		Input: []tf.Input{
-			tag, tensor, serialized_summary_metadata,
+			inputs, min, max,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
diff --git a/tensorflow/go/session.go b/tensorflow/go/session.go
index fc914f86df365e7d328fe2fc95f032885c717b31..db6ae4f26cd92dcf5e542052e4bae561bbefe999 100644
--- a/tensorflow/go/session.go
+++ b/tensorflow/go/session.go
@@ -65,6 +65,51 @@ func NewSession(graph *Graph, options *SessionOptions) (*Session, error) {
 	return s, nil
 }
 
+// Device structure contains information about a device associated with a session, as returned by ListDevices()
+type Device struct {
+	Name, Type       string
+	MemoryLimitBytes int64
+}
+
+// Return list of devices associated with a Session
+func (s *Session) ListDevices() ([]Device, error) {
+	var devices []Device
+
+	status := newStatus()
+	devices_list := C.TF_SessionListDevices(s.c, status.c)
+	if err := status.Err(); err != nil {
+		return nil, fmt.Errorf("SessionListDevices() failed: %v", err)
+	}
+	defer C.TF_DeleteDeviceList(devices_list)
+
+	for i := 0; i < int(C.TF_DeviceListCount(devices_list)); i++ {
+		device_name := C.TF_DeviceListName(devices_list, C.int(i), status.c)
+		if err := status.Err(); err != nil {
+			return nil, fmt.Errorf("DeviceListName(index=%d) failed: %v", i, err)
+		}
+
+		device_type := C.TF_DeviceListType(devices_list, C.int(i), status.c)
+		if err := status.Err(); err != nil {
+			return nil, fmt.Errorf("DeviceListType(index=%d) failed: %v", i, err)
+		}
+
+		memory_limit_bytes := C.TF_DeviceListMemoryBytes(devices_list, C.int(i), status.c)
+		if err := status.Err(); err != nil {
+			return nil, fmt.Errorf("DeviceListMemoryBytes(index=%d) failed: %v", i, err)
+		}
+
+		device := Device{
+			Name:             C.GoString(device_name),
+			Type:             C.GoString(device_type),
+			MemoryLimitBytes: int64(memory_limit_bytes),
+		}
+
+		devices = append(devices, device)
+	}
+
+	return devices, nil
+}
+
 // Run the graph with the associated session starting with the supplied feeds
 // to compute the value of the requested fetches. Runs, but does not return
 // Tensors for operations specified in targets.
diff --git a/tensorflow/go/session_test.go b/tensorflow/go/session_test.go
index 73d78a8e5773d8bc25f349c9736bda4595bea64e..05ace99a2387c6884832427187525f2fb7d5aba2 100644
--- a/tensorflow/go/session_test.go
+++ b/tensorflow/go/session_test.go
@@ -283,3 +283,19 @@ func TestSessionConfig(t *testing.T) {
 		t.Fatalf("Got %v, want -1", output[0].Value())
 	}
 }
+
+func TestListDevices(t *testing.T) {
+	s, err := NewSession(NewGraph(), nil)
+	if err != nil {
+		t.Fatalf("NewSession(): %v", err)
+	}
+
+	devices, err := s.ListDevices()
+	if err != nil {
+		t.Fatalf("ListDevices(): %v", err)
+	}
+
+	if len(devices) == 0 {
+		t.Fatalf("no devices detected")
+	}
+}
diff --git a/tensorflow/go/tensor.go b/tensorflow/go/tensor.go
index cd6f4bc1f02326728320c1f307d3ce0fbd744b44..2d25c04dc9b1d0bc2ae831f98c0879e73a6bfafa 100644
--- a/tensorflow/go/tensor.go
+++ b/tensorflow/go/tensor.go
@@ -270,7 +270,7 @@ func typeOf(dt DataType, shape []int64) reflect.Type {
 		}
 	}
 	if ret == nil {
-		panic(bug("DataType %v is not supported", dt))
+		panic(bug("DataType %v is not supported (see https://www.tensorflow.org/code/tensorflow/core/framework/types.proto)", dt))
 	}
 	for range shape {
 		ret = reflect.SliceOf(ret)
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index c0563da06d99bcf06477c094b560ceff6a01eff0..9dee1aa72bf0d76ee35931f1e852bfd22556a540 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -14,6 +14,7 @@ load(
     "tf_copts",
     "tf_custom_op_library",
     "tf_java_test",
+    "tf_cc_test",
 )
 
 java_library(
@@ -97,10 +98,26 @@ tf_java_op_gen_srcjar(
 # file before making it an executable. See tf_java_op_gen_srcjar().
 cc_library(
     name = "java_op_gen_tool",
-    srcs = glob([
-        "src/gen/cc/*.h",
-        "src/gen/cc/*.cc",
-    ]),
+    srcs = [
+        "src/gen/cc/op_gen_main.cc",
+    ],
+    copts = tf_copts(),
+    deps = [
+        ":java_op_gen_lib",
+    ],
+)
+
+cc_library(
+    name = "java_op_gen_lib",
+    srcs = [
+        "src/gen/cc/op_generator.cc",
+        "src/gen/cc/source_writer.cc",
+    ],
+    hdrs = [
+        "src/gen/cc/java_defs.h",
+        "src/gen/cc/op_generator.h",
+        "src/gen/cc/source_writer.h",
+    ],
     copts = tf_copts(),
     deps = [
         "//tensorflow/core:framework",
@@ -280,21 +297,6 @@ tf_java_test(
     ],
 )
 
-#java_test(
-#    name = "OperatorProcessorTest",
-#    size = "small",
-#    srcs = ["src/test/java/org/tensorflow/processor/OperatorProcessorTest.java"],
-#    javacopts = JAVACOPTS,
-#    resources = [":processor_test_resources"],
-#    test_class = "org.tensorflow.processor.OperatorProcessorTest",
-#    deps = [
-#        ":processor_library",
-#        "//third_party/java/junit",
-#        "@com_google_testing_compile",
-#        "@com_google_truth",
-#    ],
-#)
-
 filegroup(
     name = "processor_test_resources",
     srcs = glob([
@@ -303,6 +305,20 @@ filegroup(
     ]),
 )
 
+tf_cc_test(
+    name = "source_writer_test",
+    size = "small",
+    srcs = [
+        "src/gen/cc/source_writer_test.cc",
+    ],
+    deps = [
+        ":java_op_gen_lib",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 filegroup(
     name = "libtensorflow_jni",
     srcs = select({
diff --git a/tensorflow/java/maven/.gitignore b/tensorflow/java/maven/.gitignore
index 0e11e83a0cb649425b2072f24b0d7106c08cff81..ff080515d5e730b308bf78f7e28244c6c799cdc3 100644
--- a/tensorflow/java/maven/.gitignore
+++ b/tensorflow/java/maven/.gitignore
@@ -5,7 +5,10 @@ libtensorflow/src
 libtensorflow/target
 libtensorflow_jni/src
 libtensorflow_jni/target
+libtensorflow_jni_gpu/src
+libtensorflow_jni_gpu/target
 tensorflow/src
 tensorflow/target
 proto/src
 proto/target
+pom.xml.versionsBackup
diff --git a/tensorflow/java/maven/README.md b/tensorflow/java/maven/README.md
index 622777536188df4462550b8dc471b64328ad204f..c7e8f0380629f492ade9ba47cdcb4bc286ac82bc 100644
--- a/tensorflow/java/maven/README.md
+++ b/tensorflow/java/maven/README.md
@@ -22,11 +22,12 @@ Hence, the process for building and uploading release artifacts is not a single
 
 ## Artifact Structure
 
-There are six artifacts and thus `pom.xml`s involved in this release:
+There are seven artifacts and thus `pom.xml`s involved in this release:
 
 1.  `tensorflow`: The single dependency for projects requiring TensorFlow for
-    Java. This convenience package depends on the two below, and is the one that
-    should typically be used in other programs.
+    Java. This convenience package depends on `libtensorflow` and
+    `libtensorflow_jni`. Typically, this is the single dependency that should
+    be used by client programs (unless GPU support is required).
 
 2.  `libtensorflow`: Java-only code for the [TensorFlow Java API](https://www.tensorflow.org/api_docs/java/reference/org/tensorflow/package-summary).
     The `.jar` itself has no native code, but requires the native code be either
@@ -36,15 +37,20 @@ There are six artifacts and thus `pom.xml`s involved in this release:
 3.  `libtensorflow_jni`: The native libraries required by `libtensorflow`.
     Native code for all supported platforms is packaged into a single `.jar`.
 
-4.  `proto`: Generated Java code for TensorFlow protocol buffers
+4.  `libtensorflow_jni_gpu`: The native libraries required by `libtensorflow`
+    with GPU (CUDA) support enabled. Programs requiring GPU-enabled TensorFlow
+    should add a dependency on `libtensorflow` and `libtensorflow_jni_gpu`.
+    As of January 2018, this artifact is *Linux only*.
+
+5.  `proto`: Generated Java code for TensorFlow protocol buffers
     (e.g., `MetaGraphDef`, `ConfigProto` etc.)
 
-5. `tensorflow-android`: A package geared towards
+6. `tensorflow-android`: A package geared towards
     supporting [TensorFlow on Android](../../contrib/android/README.md), and is
     a self-contained Android AAR library containing all necessary native and
     Java code.
 
-6.  [`parentpom`](https://maven.apache.org/pom/index.html): Common settings
+7.  [`parentpom`](https://maven.apache.org/pom/index.html): Common settings
     shared by all of the above.
 
 
diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index d365c39ef4a5b10f45f6045567082724510fab54..99add510696c852b224b40fbafd03620f2606cd3 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.4.0</version>
+    <version>1.6.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index 0111fc62a4d6bfb27e51fd40778edf37f8c2e501..7bb9879f6838c71f2132dd1e331fdb79ccde8527 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.4.0</version>
+    <version>1.6.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
new file mode 100644
index 0000000000000000000000000000000000000000..268e1bae1fe49b7270b37e1a625f3531a42f556b
--- /dev/null
+++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
@@ -0,0 +1,15 @@
+<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+  <modelVersion>4.0.0</modelVersion>
+  <description>Platform-dependent native code with GPU (CUDA) support for the TensorFlow Java library.</description>
+  <parent>
+    <groupId>org.tensorflow</groupId>
+    <artifactId>parentpom</artifactId>
+    <version>1.6.0-rc0</version>
+    <relativePath>../</relativePath>
+  </parent>
+  <artifactId>libtensorflow_jni_gpu</artifactId>
+  <packaging>jar</packaging>
+</project>
+
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index 06042216b4612e4a55f712b8f941b53c2bdf1daf..6a3abcbc1143598a9405fdd9b7ebf83e1f8196d6 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.tensorflow</groupId>
   <artifactId>parentpom</artifactId>
-  <version>1.4.0</version>
+  <version>1.6.0-rc0</version>
   <packaging>pom</packaging>
 
   <url>https://www.tensorflow.org</url>
@@ -29,6 +29,7 @@
   <modules>
     <module>libtensorflow</module>
     <module>libtensorflow_jni</module>
+    <module>libtensorflow_jni_gpu</module>
     <module>tensorflow</module>
     <module>proto</module>
   </modules>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index 2c9d76b563377c3fc4ecede0460ef4e53e27b417..54a4fd577a0e3242d4b7f89586b3283f11fca856 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.4.0</version>
+    <version>1.6.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>proto</artifactId>
diff --git a/tensorflow/java/maven/run_inside_container.sh b/tensorflow/java/maven/run_inside_container.sh
index a2ce097195450eff566f0be48ca4f1a6b99401cc..6136ccfdfb92d6a71c440b23dc0a13ebe86c52e6 100644
--- a/tensorflow/java/maven/run_inside_container.sh
+++ b/tensorflow/java/maven/run_inside_container.sh
@@ -44,7 +44,7 @@ clean() {
   # (though if run inside a clean docker container, there won't be any dirty
   # artifacts lying around)
   mvn -q clean
-  rm -rf libtensorflow_jni/src libtensorflow_jni/target libtensorflow/src libtensorflow/target tensorflow-android/target
+  rm -rf libtensorflow_jni/src libtensorflow_jni/target libtensorflow_jni_gpu/src libtensorflow_jni_gpu/target libtensorflow/src libtensorflow/target tensorflow-android/target
 }
 
 update_version_in_pom() {
@@ -119,6 +119,26 @@ download_libtensorflow_jni() {
   cd "${DIR}"
 }
 
+download_libtensorflow_jni_gpu() {
+  NATIVE_DIR="${DIR}/libtensorflow_jni_gpu/src/main/resources/org/tensorflow/native"
+  mkdir -p "${NATIVE_DIR}"
+  cd "${NATIVE_DIR}"
+
+  mkdir linux-x86_64
+
+  if [[ "${IS_SNAPSHOT}" == "true" ]]; then
+    # Nightly builds from http://ci.tensorflow.org/view/Nightly/job/nightly-libtensorflow/
+    # and http://ci.tensorflow.org/view/Nightly/job/nightly-libtensorflow-windows/
+    curl -L "http://ci.tensorflow.org/view/Nightly/job/nightly-libtensorflow/TYPE=gpu-linux/lastSuccessfulBuild/artifact/lib_package/libtensorflow_jni-gpu-linux-x86_64.tar.gz" | tar -xvz -C linux-x86_64
+  else
+    curl -L "${RELEASE_URL_PREFIX}/libtensorflow_jni-gpu-linux-x86_64-${TF_VERSION}.tar.gz" | tar -xvz -C linux-x86_64
+  fi
+
+  # Updated timestamps seem to be required to get Maven to pick up the file.
+  touch linux-x86_64/*
+  cd "${DIR}"
+}
+
 # Ideally, the .jar for generated Java code for TensorFlow protocol buffer files
 # would have been produced by bazel rules. However, protocol buffer library
 # support in bazel is in flux. Once
@@ -225,6 +245,7 @@ clean
 update_version_in_pom
 download_libtensorflow
 download_libtensorflow_jni
+download_libtensorflow_jni_gpu
 update_tensorflow_android
 generate_java_protos
 # Build the release artifacts
diff --git a/tensorflow/java/maven/tensorflow-android/update.py b/tensorflow/java/maven/tensorflow-android/update.py
index 7c250718347f5fdd65aaf8003aad75a87a19c96a..4ae666e4e5351f1bdaf79d1b5cfdb63b0f811e2b 100644
--- a/tensorflow/java/maven/tensorflow-android/update.py
+++ b/tensorflow/java/maven/tensorflow-android/update.py
@@ -95,7 +95,7 @@ def main():
     release_prefix = 'https://storage.googleapis.com/tensorflow/libtensorflow'
     info_url = '%s/android_buildinfo-%s.json' % (release_prefix, args.version)
     aar_url = '%s/tensorflow-%s.aar' % (release_prefix, args.version)
-    build_type = 'release-matrix-android'
+    build_type = 'release-matrix-android2'
 
   # Retrieve build information
   build_info = get_json(info_url)
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index 474a9adb9ae6cbedcc8f67abb0431710f2ecbef9..76e0fecae4a5134625d812379d7c9029f38d0324 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.4.0</version>
+    <version>1.6.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>tensorflow</artifactId>
diff --git a/tensorflow/java/src/gen/cc/java_defs.h b/tensorflow/java/src/gen/cc/java_defs.h
new file mode 100644
index 0000000000000000000000000000000000000000..615cdc165b36abdc3cf5e717ddb8b385367c067f
--- /dev/null
+++ b/tensorflow/java/src/gen/cc/java_defs.h
@@ -0,0 +1,273 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_JAVA_SRC_GEN_CC_JAVA_DEFS_H_
+#define TENSORFLOW_JAVA_SRC_GEN_CC_JAVA_DEFS_H_
+
+#include <string>
+#include <vector>
+#include <deque>
+
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+namespace java {
+
+// An enumeration of different modifiers commonly used in Java
+enum Modifier {
+  PUBLIC    = (1 << 0),
+  PROTECTED = (1 << 1),
+  PRIVATE   = (1 << 2),
+  STATIC    = (1 << 3),
+  FINAL     = (1 << 4),
+};
+
+class Annotation;
+
+// A definition of any kind of Java type (classes, interfaces...)
+//
+// Note that most of the data fields of this class are only useful in specific
+// contexts and are not required in many cases. For example, annotations and
+// supertypes are only useful when declaring a type.
+class Type {
+ public:
+  enum Kind {
+    PRIMITIVE, CLASS, INTERFACE, ENUM, GENERIC, ANNOTATION
+  };
+  static const Type Byte() {
+    return Type(Type::PRIMITIVE, "byte");
+  }
+  static const Type Char() {
+    return Type(Type::PRIMITIVE, "char");
+  }
+  static const Type Short() {
+    return Type(Type::PRIMITIVE, "short");
+  }
+  static const Type Int() {
+    return Type(Type::PRIMITIVE, "int");
+  }
+  static const Type Long() {
+    return Type(Type::PRIMITIVE, "long");
+  }
+  static const Type Float() {
+    return Type(Type::PRIMITIVE, "float");
+  }
+  static const Type Double() {
+    return Type(Type::PRIMITIVE, "double");
+  }
+  static const Type Boolean() {
+    return Type(Type::PRIMITIVE, "boolean");
+  }
+  static const Type Void() {
+    // For simplicity, we consider 'void' as a primitive type, like the Java
+    // Reflection API does
+    return Type(Type::PRIMITIVE, "void");
+  }
+  static Type Class(const string& name, const string& package = "") {
+    return Type(Type::CLASS, name, package);
+  }
+  static Type Interface(const string& name, const string& package = "") {
+    return Type(Type::INTERFACE, name, package);
+  }
+  static Type Enum(const string& name, const string& package = "") {
+    return Type(Type::ENUM, name, package);
+  }
+  static Type Generic(const string& name = "") {
+    return Type(Type::GENERIC, name);
+  }
+  static Type ClassOf(const Type& type) {
+    return Class("Class").add_parameter(type);
+  }
+  static Type ListOf(const Type& type) {
+    return Interface("List", "java.util").add_parameter(type);
+  }
+  static Type IterableOf(const Type& type) {
+    return Interface("Iterable").add_parameter(type);
+  }
+  const Kind& kind() const { return kind_; }
+  const string& name() const { return name_; }
+  const string& package() const { return package_; }
+  const string& description() const { return description_; }
+  Type& description(const string& description) {
+    description_ = description;
+    return *this;
+  }
+  const std::vector<Type>& parameters() const { return parameters_; }
+  Type& add_parameter(const Type& parameter) {
+    parameters_.push_back(parameter);
+    return *this;
+  }
+  const std::vector<Annotation>& annotations() const { return annotations_; }
+  Type& add_annotation(const Annotation& annotation) {
+    annotations_.push_back(annotation);
+    return *this;
+  }
+  const std::deque<Type>& supertypes() const { return supertypes_; }
+  Type& add_supertype(const Type& type) {
+    if (type.kind_ == CLASS) {
+      supertypes_.push_front(type);  // keep superclass at the front of the list
+    } else if (type.kind_ == INTERFACE) {
+      supertypes_.push_back(type);
+    }
+    return *this;
+  }
+  // Returns true if "type" is of a known collection type (only a few for now)
+  bool IsCollection() const {
+    return name_ == "List" || name_ == "Iterable";
+  }
+  // Returns true if this instance is a wildcard (<?>)
+  bool IsWildcard() const {
+    return kind_ == GENERIC && name_.empty();
+  }
+
+ protected:
+  Type(Kind kind, const string& name, const string& package = "")
+    : kind_(kind), name_(name), package_(package) {}
+
+ private:
+  Kind kind_;
+  string name_;
+  string package_;
+  string description_;
+  std::vector<Type> parameters_;
+  std::vector<Annotation> annotations_;
+  std::deque<Type> supertypes_;
+};
+
+// Definition of a Java annotation
+//
+// This class only defines the usage of an annotation in a specific context,
+// giving optionally a set of attributes to initialize.
+class Annotation : public Type {
+ public:
+  static Annotation Create(const string& type_name, const string& pkg = "") {
+    return Annotation(type_name, pkg);
+  }
+  const string& attributes() const { return attributes_; }
+  Annotation& attributes(const string& attributes) {
+    attributes_ = attributes;
+    return *this;
+  }
+
+ private:
+  string attributes_;
+
+  Annotation(const string& name, const string& package)
+    : Type(Kind::ANNOTATION, name, package) {}
+};
+
+// A definition of a Java variable
+//
+// This class declares an instance of a type, such as a class field or a
+// method argument, which can be documented.
+class Variable {
+ public:
+  static Variable Create(const string& name, const Type& type) {
+    return Variable(name, type, false);
+  }
+  static Variable Varargs(const string& name, const Type& type) {
+    return Variable(name, type, true);
+  }
+  const string& name() const { return name_; }
+  const Type& type() const { return type_; }
+  bool variadic() const { return variadic_; }
+  const string& description() const { return description_; }
+  Variable& description(const string& description) {
+    description_ = description;
+    return *this;
+  }
+ private:
+  string name_;
+  Type type_;
+  bool variadic_;
+  string description_;
+
+  Variable(const string& name, const Type& type, bool variadic)
+    : name_(name), type_(type), variadic_(variadic) {}
+};
+
+// A definition of a Java class method
+//
+// This class defines the signature of a method, including its name, return
+// type and arguments.
+class Method {
+ public:
+  static Method Create(const string& name, const Type& return_type) {
+    return Method(name, return_type, false);
+  }
+  static Method ConstructorFor(const Type& clazz) {
+    return Method(clazz.name(), clazz, true);
+  }
+  bool constructor() const { return constructor_; }
+  const string& name() const { return name_; }
+  const Type& return_type() const { return return_type_; }
+  const string& description() const { return description_; }
+  Method& description(const string& description) {
+    description_ = description;
+    return *this;
+  }
+  const string& return_description() const { return return_description_; }
+  Method& return_description(const string& description) {
+    return_description_ = description;
+    return *this;
+  }
+  const std::vector<Variable>& arguments() const { return arguments_; }
+  Method& add_arguments(const std::vector<Variable>& args) {
+    arguments_.insert(arguments_.cend(), args.cbegin(), args.cend());
+    return *this;
+  }
+  Method& add_argument(const Variable& var) {
+    arguments_.push_back(var);
+    return *this;
+  }
+  const std::vector<Annotation>& annotations() const { return annotations_; }
+  Method& add_annotation(const Annotation& annotation) {
+    annotations_.push_back(annotation);
+    return *this;
+  }
+
+ private:
+  string name_;
+  Type return_type_;
+  bool constructor_;
+  string description_;
+  string return_description_;
+  std::vector<Variable> arguments_;
+  std::vector<Annotation> annotations_;
+
+  Method(const string& name, const Type& return_type, bool constructor)
+    : name_(name), return_type_(return_type), constructor_(constructor) {}
+};
+
+// A piece of code to read from a file.
+class Snippet {
+ public:
+  static Snippet Create(const string& fname, Env* env = Env::Default()) {
+    return Snippet(fname, env);
+  }
+  const string& data() const { return data_; }
+
+ private:
+  string data_;
+
+  Snippet(const string& fname, Env* env) {
+    TF_CHECK_OK(ReadFileToString(env, fname, &data_));
+  }
+};
+
+}  // namespace java
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_JAVA_SRC_GEN_CC_JAVA_DEFS_H_
diff --git a/tensorflow/java/src/gen/cc/op_gen_main.cc b/tensorflow/java/src/gen/cc/op_gen_main.cc
index a7c66dda893a3109e0e0bfe76f5becef766afb0e..bea99f3d7f6bea1ebc7097cbd7aae1fa7e5a87fa 100644
--- a/tensorflow/java/src/gen/cc/op_gen_main.cc
+++ b/tensorflow/java/src/gen/cc/op_gen_main.cc
@@ -25,7 +25,7 @@
 #include "tensorflow/java/src/gen/cc/op_generator.h"
 
 namespace tensorflow {
-namespace op_gen {
+namespace java {
 
 const char kUsageHeader[] =
     "\n\nGenerator of operation wrappers in Java.\n\n"
@@ -51,7 +51,7 @@ const char kUsageHeader[] =
     "Finally, the '--base_package' overrides the default parent package "
     "under which the generated subpackage and classes are to be located.\n\n";
 
-}  // namespace op_gen
+}  // namespace java
 }  // namespace tensorflow
 
 int main(int argc, char* argv[]) {
@@ -67,13 +67,13 @@ int main(int argc, char* argv[]) {
       tensorflow::Flag(
           "base_package", &base_package,
           "Package parent to the generated subpackage and classes")};
-  tensorflow::string usage = tensorflow::op_gen::kUsageHeader;
+  tensorflow::string usage = tensorflow::java::kUsageHeader;
   usage += tensorflow::Flags::Usage(argv[0], flag_list);
   bool parsed_flags_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
   tensorflow::port::InitMain(usage.c_str(), &argc, &argv);
   QCHECK(parsed_flags_ok && !lib_name.empty() && !output_dir.empty()) << usage;
 
-  tensorflow::OpGenerator generator;
+  tensorflow::java::OpGenerator generator;
   tensorflow::OpList ops;
   tensorflow::OpRegistry::Global()->Export(true, &ops);
   tensorflow::Status status =
diff --git a/tensorflow/java/src/gen/cc/op_generator.cc b/tensorflow/java/src/gen/cc/op_generator.cc
index df130c32e6afcba157da282026280756b778f3ad..def06baf2db43e1fa42f03cf9619abd34785cea7 100644
--- a/tensorflow/java/src/gen/cc/op_generator.cc
+++ b/tensorflow/java/src/gen/cc/op_generator.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/java/src/gen/cc/op_generator.h"
 
 namespace tensorflow {
+namespace java {
 namespace {
 
 string CamelCase(const string& str, char delimiter, bool upper) {
@@ -63,4 +64,5 @@ Status OpGenerator::Run(const OpList& ops, const string& lib_name,
   return Status::OK();
 }
 
+}  // namespace java
 }  // namespace tensorflow
diff --git a/tensorflow/java/src/gen/cc/op_generator.h b/tensorflow/java/src/gen/cc/op_generator.h
index eec1082b5162298e68fbd05d82d5563777e865db..4b55ed3ed94f11c1f810c0a56989853ee1154587 100644
--- a/tensorflow/java/src/gen/cc/op_generator.h
+++ b/tensorflow/java/src/gen/cc/op_generator.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/platform/env.h"
 
 namespace tensorflow {
+namespace java {
 
 /// \brief A generator of Java operation wrappers.
 ///
@@ -46,6 +47,7 @@ class OpGenerator {
   Env* env;
 };
 
+}  // namespace java
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_JAVA_SRC_GEN_CC_OP_GENERATOR_H_
diff --git a/tensorflow/java/src/gen/cc/source_writer.cc b/tensorflow/java/src/gen/cc/source_writer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2da81f2911e60be6a47ac13fe8be6142fa283780
--- /dev/null
+++ b/tensorflow/java/src/gen/cc/source_writer.cc
@@ -0,0 +1,62 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+
+#include "tensorflow/java/src/gen/cc/source_writer.h"
+
+namespace tensorflow {
+
+SourceWriter& SourceWriter::Append(const StringPiece& str) {
+  if (!str.empty()) {
+    if (newline_) {
+      DoAppend(left_margin_ + line_prefix_);
+      newline_ = false;
+    }
+    DoAppend(str);
+  }
+  return *this;
+}
+
+SourceWriter& SourceWriter::Write(const string& str) {
+  size_t line_pos = 0;
+  do {
+    size_t start_pos = line_pos;
+    line_pos = str.find('\n', start_pos);
+    if (line_pos != string::npos) {
+      ++line_pos;
+      Append(StringPiece(str.data() + start_pos, line_pos - start_pos));
+      newline_ = true;
+    } else {
+      Append(StringPiece(str.data() + start_pos, str.size() - start_pos));
+    }
+  } while (line_pos != string::npos && line_pos < str.size());
+
+  return *this;
+}
+
+SourceWriter& SourceWriter::EndLine() {
+  Append("\n");
+  newline_ = true;
+  return *this;
+}
+
+SourceWriter& SourceWriter::Indent(int tab) {
+  left_margin_.resize(std::max(static_cast<int>(left_margin_.size() + tab), 0),
+                      ' ');
+  return *this;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/java/src/gen/cc/source_writer.h b/tensorflow/java/src/gen/cc/source_writer.h
new file mode 100644
index 0000000000000000000000000000000000000000..bff26eb185db0cf933632f33f916b87d8a757edd
--- /dev/null
+++ b/tensorflow/java/src/gen/cc/source_writer.h
@@ -0,0 +1,133 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_JAVA_SRC_GEN_CC_SOURCE_WRITER_H_
+#define TENSORFLOW_JAVA_SRC_GEN_CC_SOURCE_WRITER_H_
+
+#include <string>
+
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+
+// A utility class for writing source code, normally generated at
+// compile-time.
+//
+// Source writers are language-agnostic and therefore only expose generic
+// methods common to most languages. Extend or wrap this class to implement
+// language-specific features.
+//
+// Note: if you are looking to reuse this class for generating code in another
+// language than Java, please do by moving it at the '//tensorflow/core/lib/io'
+// level.
+class SourceWriter {
+ public:
+  virtual ~SourceWriter() = default;
+
+  // Returns true if the writer is at the beginnig of a new line
+  bool newline() const { return newline_; }
+
+  // Appends a piece of code or text.
+  //
+  // It is expected that no newline character is present in the data provided,
+  // otherwise Write() must be used.
+  SourceWriter& Append(const StringPiece& str);
+
+  // Writes a block of code or text.
+  //
+  // The data might potentially contain newline characters, therefore it will
+  // be scanned to ensure that each line is indented and prefixed properly,
+  // making it a bit slower than Append().
+  SourceWriter& Write(const string& text);
+
+  // Appends a newline character and start writing on a new line.
+  SourceWriter& EndLine();
+
+  // Indents following lines with white spaces.
+  //
+  // Indentation is cumulative, i.e. the provided tabulation is added to the
+  // current indentation value. If the tabulation is negative, the operation
+  // will outdent the source code, until the indentation reaches 0 again.
+  //
+  // For example, calling Indent(2) twice will indent code with 4 white
+  // spaces. Then calling Indent(-2) will outdent the code back to 2 white
+  // spaces.
+  SourceWriter& Indent(int tab);
+
+  // Prefixes following lines with provided character(s).
+  //
+  // A common use case of a prefix is for commenting or documenting the code.
+  //
+  // The prefix is written after the indentation, For example, invoking
+  // Indent(2)->Prefix("//") will result in prefixing lines with "  //".
+  //
+  // An empty value ("") will remove any line prefix that was previously set.
+  SourceWriter& Prefix(const char* line_prefix) {
+    line_prefix_ = line_prefix;
+    return *this;
+  }
+
+ protected:
+  virtual void DoAppend(const StringPiece& str) = 0;
+
+ private:
+  string left_margin_;
+  string line_prefix_;
+  bool newline_ = true;
+};
+
+// A writer that outputs source code into a file.
+//
+// Note: the writer does not acquire the ownership of the file being passed in
+// parameter.
+class SourceFileWriter : public SourceWriter {
+ public:
+  explicit SourceFileWriter(WritableFile* file) : file_(file) {}
+  virtual ~SourceFileWriter() = default;
+
+ protected:
+  void DoAppend(const StringPiece& str) override {
+    TF_CHECK_OK(file_->Append(str));
+  }
+
+ private:
+  WritableFile* file_;
+};
+
+// A writer that outputs source code into a string buffer.
+class SourceBufferWriter : public SourceWriter {
+ public:
+  SourceBufferWriter() : owns_buffer_(true), buffer_(new string()) {}
+  explicit SourceBufferWriter(string* buffer)
+      : owns_buffer_(false), buffer_(buffer) {}
+  virtual ~SourceBufferWriter() {
+    if (owns_buffer_) delete buffer_;
+  }
+  const string& str() { return *buffer_; }
+
+ protected:
+  void DoAppend(const StringPiece& str) override {
+    buffer_->append(str.begin(), str.end());
+  }
+
+ private:
+  bool owns_buffer_;
+  string* buffer_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_JAVA_SRC_GEN_CC_SOURCE_WRITER_H_
diff --git a/tensorflow/java/src/gen/cc/source_writer_test.cc b/tensorflow/java/src/gen/cc/source_writer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e9738957548184726395c4e6634ba12a5a9a0109
--- /dev/null
+++ b/tensorflow/java/src/gen/cc/source_writer_test.cc
@@ -0,0 +1,215 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/java/src/gen/cc/source_writer.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+TEST(AppendTest, SingleLineText) {
+  SourceBufferWriter writer;
+  writer.Append("You say goodbye and I say hello!");
+
+  const char* expected = "You say goodbye and I say hello!";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(AppendTest, MultiLineText) {
+  SourceBufferWriter writer;
+  writer.Append("You say goodbye\nand I say hello!");
+
+  const char* expected = "You say goodbye\nand I say hello!";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(AppendTest, MultiLineTextWithIndent) {
+  SourceBufferWriter writer;
+  writer.Indent(2).Append("You say goodbye\nand I say hello!");
+
+  const char* expected = "  You say goodbye\nand I say hello!";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(AppendTest, MultiLineTextWithPrefix) {
+  SourceBufferWriter writer;
+  writer.Prefix("--").Append("You say goodbye\nand I say hello!");
+
+  const char* expected = "--You say goodbye\nand I say hello!";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(AppendTest, MultiLineTextWithIndentAndPrefix) {
+  SourceBufferWriter writer;
+  writer.Indent(2).Prefix("--").Append("You say goodbye\nand I say hello!");
+
+  const char* expected = "  --You say goodbye\nand I say hello!";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteTest, SingleLineText) {
+  SourceBufferWriter writer;
+  writer.Write("You say goodbye and I say hello!");
+
+  const char* expected = "You say goodbye and I say hello!";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteTest, MultiLineText) {
+  SourceBufferWriter writer;
+  writer.Write("You say goodbye\nand I say hello!");
+
+  const char* expected = "You say goodbye\nand I say hello!";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteTest, MultiLineTextWithIndent) {
+  SourceBufferWriter writer;
+  writer.Indent(2).Write("You say goodbye\nand I say hello!");
+
+  const char* expected = "  You say goodbye\n  and I say hello!";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteTest, MultiLineTextWithPrefix) {
+  SourceBufferWriter writer;
+  writer.Prefix("--").Write("You say goodbye\nand I say hello!");
+
+  const char* expected = "--You say goodbye\n--and I say hello!";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(WriteTest, MultiLineTextWithIndentAndPrefix) {
+  SourceBufferWriter writer;
+  writer.Indent(2).Prefix("--").Write("You say goodbye\nand I say hello!");
+
+  const char* expected = "  --You say goodbye\n  --and I say hello!";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(MarginTest, Basic) {
+  SourceBufferWriter writer;
+  writer.Append("You say goodbye").EndLine().Append("and I say hello!");
+
+  const char* expected = "You say goodbye\nand I say hello!";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(MarginTest, Indent) {
+  SourceBufferWriter writer;
+  writer.Append("You say goodbye")
+      .EndLine()
+      .Indent(2)
+      .Append("and I say hello!");
+
+  const char* expected = "You say goodbye\n  and I say hello!";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(MarginTest, IndentAndOutdent) {
+  SourceBufferWriter writer;
+  writer.Append("You say goodbye")
+      .EndLine()
+      .Indent(2)
+      .Append("and I say hello!")
+      .EndLine()
+      .Indent(-2)
+      .Append("Hello, hello!");
+
+  const char* expected = "You say goodbye\n  and I say hello!\nHello, hello!";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(MarginTest, Prefix) {
+  SourceBufferWriter writer;
+  writer.Append("You say goodbye")
+      .EndLine()
+      .Prefix("--")
+      .Append("and I say hello!");
+
+  const char* expected = "You say goodbye\n--and I say hello!";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(MarginTest, PrefixAndRemovePrefix) {
+  SourceBufferWriter writer;
+  writer.Append("You say goodbye")
+      .EndLine()
+      .Prefix("--")
+      .Append("and I say hello!")
+      .EndLine()
+      .Prefix("")
+      .Append("Hello, hello!");
+
+  const char* expected = "You say goodbye\n--and I say hello!\nHello, hello!";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(MarginTest, IndentAndPrefixAndOutdentAndRemovePrefix) {
+  SourceBufferWriter writer;
+  writer.Append("You say goodbye")
+      .EndLine()
+      .Indent(2)
+      .Prefix("--")
+      .Append("and I say hello!")
+      .EndLine()
+      .Indent(-2)
+      .Prefix("")
+      .Append("Hello, hello!");
+
+  const char* expected = "You say goodbye\n  --and I say hello!\nHello, hello!";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(MarginTest, NegativeIndent) {
+  SourceBufferWriter writer;
+  writer.Append("You say goodbye")
+      .EndLine()
+      .Indent(-10)
+      .Append("and I say hello!");
+
+  const char* expected = "You say goodbye\nand I say hello!";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(MarginTest, CumulativeIndent) {
+  SourceBufferWriter writer;
+  writer.Append("You say goodbye")
+      .EndLine()
+      .Indent(2)
+      .Append("and I say hello!")
+      .EndLine()
+      .Indent(2)
+      .Append("Hello, hello!");
+
+  const char* expected =
+      "You say goodbye\n  and I say hello!\n    Hello, hello!";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+TEST(MarginTest, EmptyPrefix) {
+  SourceBufferWriter writer;
+  writer.Append("You say goodbye")
+      .EndLine()
+      .Prefix("")
+      .Append("and I say hello!");
+
+  const char* expected = "You say goodbye\nand I say hello!";
+  ASSERT_STREQ(expected, writer.str().data());
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/java/src/gen/gen_ops.bzl b/tensorflow/java/src/gen/gen_ops.bzl
index 28f0908ec4a7a02a2a66ab44577b36e7bb3b4a53..a6650fc4ea0b67bcea46e8d5e3ec84aaafef0f7a 100644
--- a/tensorflow/java/src/gen/gen_ops.bzl
+++ b/tensorflow/java/src/gen/gen_ops.bzl
@@ -52,7 +52,7 @@ def tf_java_op_gen_srcjar(name,
 
   # Generate a source archive containing generated code for these ops.
   gen_srcjar = out_dir + name + ".srcjar"
-  gen_cmds += ["$(location @local_jdk//:jar) cMf $(location :" + gen_srcjar + ") -C $(@D) ."]
+  gen_cmds += ["$(location @local_jdk//:jar) cMf $(location :" + gen_srcjar + ") -C $(@D) src"]
   gen_tools += ["@local_jdk//:jar"] + ["@local_jdk//:jdk"]
   gen_tools += tf_binary_additional_srcs()
   native.genrule(
diff --git a/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java b/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java
index 45e42878c770b3c19d96790e5b4bf2ed41a0de29..11fda4fc22aeec9c2d94b5e884c11ceb2a66d29e 100644
--- a/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java
+++ b/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java
@@ -77,7 +77,7 @@ public final class OperatorProcessor extends AbstractProcessor {
     TypeElement annotation = annotations.iterator().next();
     Set<? extends Element> annotated = roundEnv.getElementsAnnotatedWith(annotation);
 
-    // If there are no annotated elements, claim the annotion but do nothing.
+    // If there are no annotated elements, claim the annotation but do nothing.
     if (annotated.size() == 0) {
       return true;
     }
diff --git a/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java b/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java
index 499757e8cf4d6166e425d801ce20335bd8ad83e8..cf773e1686dea97f62f432be43f2c10b69fa8e24 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java
@@ -88,7 +88,7 @@ final class NativeLibrary {
       // Deletions are in the reverse order of requests, so we need to request that the directory be
       // deleted first, so that it is empty when the request is fulfilled.
       tempPath.deleteOnExit();
-      final String tempDirectory = tempPath.toString();
+      final String tempDirectory = tempPath.getCanonicalPath();
       if (frameworkResource != null) {
         extractResource(frameworkResource, frameworkLibName, tempDirectory);
       } else {
diff --git a/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java b/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
index beb3635585c33f5a3942e4f7d44ac597daf8ff72..a24150484e83dcccf3e1869155569431969b74cf 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
@@ -352,7 +352,8 @@ public final class OperationBuilder {
 
   private static native void setAttrShape(long handle, String name, long[] shape, int numDims);
 
-  private static native void setAttrShapeList(long handle, String name, long[] shapes, int[] numDims);
+  private static native void setAttrShapeList(
+      long handle, String name, long[] shapes, int[] numDims);
 
   private static native void setAttrStringList(long handle, String name, Object[] value);
 }
diff --git a/tensorflow/java/src/main/java/org/tensorflow/package-info.java b/tensorflow/java/src/main/java/org/tensorflow/package-info.java
index dd4859e1b14045e4123e7f15fbaff98e14d0b377..521c5c610c1f775cf9174664f5b786786ce1181d 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/package-info.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/package-info.java
@@ -35,5 +35,9 @@ limitations under the License.
  *   <li>Graph execution: Using a Session to execute the graphs and find the best label for an
  *       image.
  * </ul>
+ *
+ * <p>Additional examples can be found in the <a
+ * href="https://github.com/tensorflow/models/tree/master/samples/languages/java">tensorflow/models</a>
+ * GitHub repository.
  */
 package org.tensorflow;
diff --git a/tensorflow/java/src/main/native/BUILD b/tensorflow/java/src/main/native/BUILD
index 8e95ea4f7936672020f4f196f286ef73661cdcb1..49348daa94ed04990a657922a0fbb515b7721d82 100644
--- a/tensorflow/java/src/main/native/BUILD
+++ b/tensorflow/java/src/main/native/BUILD
@@ -67,6 +67,7 @@ genrule(
 genrule(
     name = "copy_jni_md_h",
     srcs = select({
+        "//tensorflow:windows": ["@bazel_tools//tools/jdk:jni_md_header-windows"],
         "//tensorflow:darwin": ["@bazel_tools//tools/jdk:jni_md_header-darwin"],
         "//conditions:default": ["@bazel_tools//tools/jdk:jni_md_header-linux"],
     }),
diff --git a/tensorflow/java/src/main/native/operation_builder_jni.cc b/tensorflow/java/src/main/native/operation_builder_jni.cc
index 71a451ad1309659a9f96d9b9eedf60a8b3fd9683..55d214a7c4b81a01e48121214e91397626652f11 100644
--- a/tensorflow/java/src/main/native/operation_builder_jni.cc
+++ b/tensorflow/java/src/main/native/operation_builder_jni.cc
@@ -275,15 +275,15 @@ JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrShapeList(
   if (num_dims_length > 0) {
     const int shapes_length = env->GetArrayLength(shapes);
     cshapes.reset(new int64_t[shapes_length]);
-    cdims.reset(new int64_t* [num_dims_length]);
+    cdims.reset(new int64_t*[num_dims_length]);
     cnum_dims.reset(new int[num_dims_length]);
     jlong* shapes_elems =
-        (jlong*) env->GetPrimitiveArrayCritical(shapes, nullptr);
+        static_cast<jlong*>(env->GetPrimitiveArrayCritical(shapes, nullptr));
     std::memcpy(cshapes.get(), shapes_elems, shapes_length << 3);
     env->ReleasePrimitiveArrayCritical(shapes, shapes_elems, JNI_ABORT);
     int64_t* cshapes_ptr = cshapes.get();
     jint* num_dims_elems =
-        (jint*) env->GetPrimitiveArrayCritical(num_dims, nullptr);
+        static_cast<jint*>(env->GetPrimitiveArrayCritical(num_dims, nullptr));
     for (int i = 0; i < num_dims_length; ++i) {
       cnum_dims[i] = static_cast<int>(num_dims_elems[i]);
       cdims[i] = cshapes_ptr;
diff --git a/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java b/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java
index 2430816725abdd664cd016cdfefa6c94b3d0b9b1..0a4a8cf4e3f65311ba887b4d47bc79080bfd5382 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java
@@ -151,10 +151,10 @@ public class OperationBuilderTest {
   @Test
   public void setAttrShapeList() {
     // Those shapes match tensors ones, so no exception is thrown
-    testSetAttrShapeList(new Shape[] { Shape.make(2, 2), Shape.make(2, 2, 2) });
+    testSetAttrShapeList(new Shape[] {Shape.make(2, 2), Shape.make(2, 2, 2)});
     try {
       // Those shapes do not match tensors ones, exception is thrown
-      testSetAttrShapeList(new Shape[] { Shape.make(2, 2), Shape.make(2, 2, 2, 2) });
+      testSetAttrShapeList(new Shape[] {Shape.make(2, 2), Shape.make(2, 2, 2, 2)});
       fail("Shapes are incompatible and an exception was expected");
     } catch (IllegalArgumentException e) {
       // expected
@@ -189,20 +189,23 @@ public class OperationBuilderTest {
   }
 
   private static void testSetAttrShapeList(Shape[] shapes) {
-    try (Graph g = new Graph(); Session s = new Session(g)) {
-      int[][] matrix = new int[][] { { 0, 0 }, { 0, 0 } };
-      Output<?> queue = g.opBuilder("FIFOQueue", "queue")
-          .setAttr("component_types", new DataType[] { DataType.INT32, DataType.INT32 }) 
-          .setAttr("shapes", shapes)
-          .build()
-          .output(0);
+    try (Graph g = new Graph();
+        Session s = new Session(g)) {
+      int[][] matrix = new int[][] {{0, 0}, {0, 0}};
+      Output<?> queue =
+          g.opBuilder("FIFOQueue", "queue")
+              .setAttr("component_types", new DataType[] {DataType.INT32, DataType.INT32})
+              .setAttr("shapes", shapes)
+              .build()
+              .output(0);
       assertTrue(hasNode(g, "queue"));
       Output<Integer> c1 = TestUtil.constant(g, "const1", matrix);
-      Output<Integer> c2 = TestUtil.constant(g, "const2", new int[][][] { matrix, matrix });
-      Operation enqueue = g.opBuilder("QueueEnqueue", "enqueue")
-          .addInput(queue)
-          .addInputList(new Output<?>[] { c1, c2 })
-          .build();
+      Output<Integer> c2 = TestUtil.constant(g, "const2", new int[][][] {matrix, matrix});
+      Operation enqueue =
+          g.opBuilder("QueueEnqueue", "enqueue")
+              .addInput(queue)
+              .addInputList(new Output<?>[] {c1, c2})
+              .build();
       assertTrue(hasNode(g, "enqueue"));
 
       s.runner().addTarget(enqueue).run();
diff --git a/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java b/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java
index a86b4dd117ede64d2b105ceb189220a5dd5d9740..e8cc76c2a6458193161a98e17483fe73de107b77 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java
@@ -151,7 +151,7 @@ public class SessionTest {
       s.close();
       try {
         s.runner().run();
-        fail("methods on a close()d session should fail");
+        fail("methods on a session should fail after close() is called");
       } catch (IllegalStateException e) {
         // expected exception
       }
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 23ad9bfa56fd974fb553e7581a86712b00d1b465..f563d32388d63808bb483530c73d7aa669abecd0 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -1,5 +1,8 @@
 # Description:
 # Python support for TensorFlow.
+#
+# Public targets:
+#  ":platform" - Low-level and platform-specific Python code.
 
 package(
     default_visibility = [
@@ -32,6 +35,7 @@ load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library_py")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_lib_deps")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_protos_grappler")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "tf_additional_plugin_deps")
 load("//tensorflow/python:build_defs.bzl", "tf_gen_op_wrapper_private_py")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "tf_additional_verbs_deps")
@@ -74,6 +78,8 @@ py_library(
         ":io_ops",
         ":layers",
         ":lib",
+        ":list_ops",
+        ":manip_ops",
         ":math_ops",
         ":metrics",
         ":nn",
@@ -129,6 +135,7 @@ py_library(
         ],
     ) + ["platform/build_info.py"],
     srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
     deps = [
         ":lib",
         ":pywrap_tensorflow",
@@ -171,7 +178,21 @@ tf_py_test(
     name = "flags_test",
     size = "small",
     srcs = ["platform/flags_test.py"],
-    additional_deps = [":platform"],
+    additional_deps = [
+        ":client_testlib",
+        ":platform",
+    ],
+)
+
+tf_py_test(
+    name = "stacktrace_handler_test",
+    size = "small",
+    srcs = ["platform/stacktrace_handler_test.py"],
+    additional_deps = [
+        ":client_testlib",
+        ":platform",
+    ],
+    tags = ["no_windows"],
 )
 
 tf_py_test(
@@ -179,10 +200,7 @@ tf_py_test(
     size = "small",
     srcs = ["platform/app_test.py"],
     additional_deps = [":platform"],
-    tags = [
-        "manual",
-        "notap",
-    ],
+    tags = ["notap"],
 )
 
 cc_library(
@@ -197,9 +215,8 @@ cc_library(
         "//tensorflow/core/grappler/costs:analytical_cost_estimator",
         "//tensorflow/core/grappler/costs:cost_estimator",
         "//tensorflow/core/grappler/costs:measuring_cost_estimator",
-        "//tensorflow/core/grappler/costs:op_performance_data_cc",
         "//tensorflow/core/grappler/costs:utils",
-    ],
+    ] + tf_protos_grappler(),
 )
 
 cc_library(
@@ -207,11 +224,11 @@ cc_library(
     srcs = ["grappler/model_analyzer.cc"],
     hdrs = ["grappler/model_analyzer.h"],
     deps = [
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler/costs:graph_properties",
-        "//tensorflow/core/grappler/costs:utils",
     ],
 )
 
@@ -227,11 +244,25 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "bfloat16_lib",
+    srcs = ["lib/core/bfloat16.cc"],
+    hdrs = ["lib/core/bfloat16.h"],
+    deps = [
+        ":numpy_lib",
+        ":safe_ptr",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//util/python:python_headers",
+    ],
+)
+
 cc_library(
     name = "ndarray_tensor_bridge",
     srcs = ["lib/core/ndarray_tensor_bridge.cc"],
     hdrs = ["lib/core/ndarray_tensor_bridge.h"],
     deps = [
+        ":bfloat16_lib",
         ":numpy_lib",
         "//tensorflow/c:c_api",
         "//tensorflow/core:lib",
@@ -268,10 +299,16 @@ cc_library(
     deps = [
         ":ndarray_tensor_bridge",
         ":numpy_lib",
+        ":py_util",
+        ":safe_ptr",
+        "//tensorflow/c:tf_status_helper",
+        "//tensorflow/c/eager:c_api",
+        "//tensorflow/c/eager:c_api_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:script_ops_op_lib",
+        "//tensorflow/python/eager:pywrap_tfe_lib",
         "//third_party/py/numpy:headers",
         "//util/python:python_headers",
     ],
@@ -293,6 +330,7 @@ cc_library(
     srcs = ["lib/core/ndarray_tensor.cc"],
     hdrs = ["lib/core/ndarray_tensor.h"],
     deps = [
+        ":bfloat16_lib",
         ":ndarray_tensor_bridge",
         ":numpy_lib",
         ":safe_ptr",
@@ -309,6 +347,7 @@ cc_library(
     hdrs = ["lib/core/py_seq_tensor.h"],
     deps = [
         ":numpy_lib",
+        ":py_util",
         ":safe_ptr",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -316,6 +355,17 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "py_util",
+    srcs = ["lib/core/py_util.cc"],
+    hdrs = ["lib/core/py_util.h"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:script_ops_op_lib",
+        "//util/python:python_headers",
+    ],
+)
+
 cc_library(
     name = "py_record_reader_lib",
     srcs = ["lib/io/py_record_reader.cc"],
@@ -352,6 +402,7 @@ tf_cc_shared_object(
     }),
     deps = [
         "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
         "@protobuf_archive//:protobuf_headers",
     ],
 )
@@ -530,6 +581,7 @@ py_library(
         ":pywrap_tensorflow",
         ":random_seed",
         ":sparse_tensor",
+        ":tensor_spec",
         ":tensor_util",
         ":util",
         "//tensorflow/python/eager:context",
@@ -586,6 +638,7 @@ py_library(
     srcs = ["framework/dtypes.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":pywrap_tensorflow",
         "//tensorflow/core:protos_all_py",
     ],
 )
@@ -676,6 +729,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":c_api_util",
+        ":control_flow_util",
         ":device",
         ":dtypes",
         ":op_def_registry",
@@ -732,6 +786,18 @@ py_library(
     ],
 )
 
+py_library(
+    name = "tensor_spec",
+    srcs = ["framework/tensor_spec.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":common_shapes",
+        ":dtypes",
+        ":tensor_shape",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_library(
     name = "tensor_util",
     srcs = ["framework/tensor_util.py"],
@@ -766,15 +832,23 @@ py_library(
     srcs = ["framework/test_util.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":array_ops",
         ":client",
         ":errors",
-        ":framework",
         ":framework_for_generated_wrappers",
         ":platform",
         ":platform_test",
         ":pywrap_tensorflow",
+        ":random_seed",
+        ":resource_variable_ops",
+        ":session",
         ":training",
         ":util",
+        ":variables",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:tape",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -1074,6 +1148,7 @@ py_test(
         ":variables",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:function",
     ],
 )
 
@@ -1091,6 +1166,21 @@ py_test(
     ],
 )
 
+py_test(
+    name = "framework_tensor_spec_test",
+    size = "small",
+    srcs = ["framework/tensor_spec_test.py"],
+    main = "framework/tensor_spec_test.py",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":framework_for_generated_wrappers",
+        ":framework_test_lib",
+        ":platform_test",
+        ":tensor_spec",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "framework_sparse_tensor_test",
     size = "small",
@@ -1180,6 +1270,12 @@ py_test(
         ":framework_test_lib",
         ":platform_test",
         ":random_ops",
+        ":resource_variable_ops",
+        ":session",
+        ":test_ops",
+        ":variables",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
     ],
 )
@@ -1190,6 +1286,7 @@ py_test(
     srcs = ["framework/dtypes_test.py"],
     main = "framework/dtypes_test.py",
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
@@ -1204,12 +1301,12 @@ py_test(
     name = "op_def_library_test",
     size = "small",
     srcs = ["framework/op_def_library_test.py"],
-    main = "framework/op_def_library_test.py",
     srcs_version = "PY2AND3",
     deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
+        ":test_ops",
     ],
 )
 
@@ -1258,7 +1355,10 @@ tf_gen_op_wrapper_private_py(
 
 tf_gen_op_wrapper_private_py(
     name = "control_flow_ops_gen",
-    visibility = ["//learning/brain/python/ops:__pkg__"],
+    visibility = [
+        "//learning/brain/python/ops:__pkg__",
+        "//tensorflow/python/kernel_tests:__pkg__",
+    ],
     deps = [
         "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:no_op_op_lib",
@@ -1321,6 +1421,21 @@ tf_gen_op_wrapper_private_py(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "batch_ops_gen",
+    visibility = [
+        "//tensorflow:__subpackages__",
+    ],
+)
+
+tf_gen_op_wrapper_private_py(
+    name = "manip_ops_gen",
+    visibility = [
+        "//learning/brain/python/ops:__pkg__",
+        "//tensorflow/python/kernel_tests:__pkg__",
+    ],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "math_ops_gen",
     visibility = [
@@ -1359,6 +1474,10 @@ tf_gen_op_wrapper_private_py(
     name = "resource_variable_ops_gen",
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "list_ops_gen",
+)
+
 tf_gen_op_wrapper_private_py(
     name = "script_ops_gen",
 )
@@ -1526,6 +1645,7 @@ py_library(
     deps = [
         ":control_flow_ops",
         ":control_flow_ops_gen",
+        ":control_flow_util",
         ":framework",
         ":framework_for_generated_wrappers",
         ":math_ops",
@@ -1542,6 +1662,7 @@ py_library(
         ":array_ops_gen",
         ":constant_op",
         ":control_flow_ops_gen",
+        ":control_flow_util",
         ":data_flow_ops_gen",
         ":dtypes",
         ":framework_ops",
@@ -1557,6 +1678,15 @@ py_library(
     ],
 )
 
+py_library(
+    name = "control_flow_util",
+    srcs = ["ops/control_flow_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":platform",
+    ],
+)
+
 py_library(
     name = "ctc_ops",
     srcs = ["ops/ctc_ops.py"],
@@ -1630,6 +1760,7 @@ py_library(
         ":bitwise_ops",
         ":control_flow_grad",
         ":control_flow_ops",
+        ":control_flow_util",
         ":framework",
         ":framework_for_generated_wrappers",
         ":functional_ops",
@@ -1637,6 +1768,8 @@ py_library(
         ":linalg_grad",
         ":linalg_ops",
         ":logging_ops",
+        ":manip_grad",
+        ":manip_ops",
         ":math_grad",
         ":math_ops",
         ":platform",
@@ -1706,6 +1839,7 @@ py_library(
         ":math_ops",
         ":nn_ops",
         ":random_ops",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -1758,6 +1892,29 @@ py_library(
     ],
 )
 
+py_library(
+    name = "manip_grad",
+    srcs = ["ops/manip_grad.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":control_flow_ops",
+        ":framework_for_generated_wrappers",
+        ":manip_ops",
+    ],
+)
+
+py_library(
+    name = "manip_ops",
+    srcs = ["ops/manip_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dtypes",
+        ":framework_ops",
+        ":manip_ops_gen",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_library(
     name = "logging_ops",
     srcs = ["ops/logging_ops.py"],
@@ -1863,6 +2020,16 @@ py_library(
     ],
 )
 
+py_library(
+    name = "list_ops",
+    srcs = ["ops/list_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":list_ops_gen",
+    ],
+)
+
 py_library(
     name = "nn",
     srcs = [
@@ -1982,6 +2149,7 @@ py_library(
     deps = [
         ":array_ops",
         ":control_flow_ops",
+        ":control_flow_util",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":rnn_cell",
@@ -2209,6 +2377,8 @@ py_library(
         ":linalg_ops",
         ":logging_ops",
         ":lookup_ops",
+        ":manip_grad",
+        ":manip_ops",
         ":math_grad",
         ":math_ops",
         ":numerics",
@@ -2288,6 +2458,8 @@ py_library(
         ":platform",
         ":util",
         ":variable_scope",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:function",
     ],
 )
 
@@ -2565,6 +2737,7 @@ cuda_py_test(
         ":nn_ops_gen",
         "//third_party/py/numpy",
     ],
+    shard_count = 4,
     tags = ["no_windows"],
 )
 
@@ -2581,7 +2754,7 @@ cuda_py_test(
         ":nn_grad",
         "//third_party/py/numpy",
     ],
-    shard_count = 4,
+    shard_count = 16,
 )
 
 cuda_py_test(
@@ -2966,12 +3139,14 @@ tf_cuda_library(
         ":safe_ptr",
         ":test_ops_kernels",
         "//tensorflow/c:c_api",
+        "//tensorflow/c:c_api_internal",
         "//tensorflow/c:tf_status_helper",
         "//tensorflow/core",
         "//tensorflow/core:all_kernels",
         "//tensorflow/core:direct_session",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//third_party/py/numpy:headers",
@@ -3007,12 +3182,14 @@ tf_py_wrap_cc(
         "grappler/item.i",
         "grappler/model_analyzer.i",
         "grappler/tf_optimizer.i",
+        "lib/core/bfloat16.i",
         "lib/core/py_func.i",
         "lib/core/strings.i",
         "lib/io/file_io.i",
         "lib/io/py_record_reader.i",
         "lib/io/py_record_writer.i",
         "platform/base.i",
+        "platform/stacktrace_handler.i",
         "pywrap_tfe.i",
         "training/quantize_training.i",
         "training/server_lib.i",
@@ -3025,6 +3202,7 @@ tf_py_wrap_cc(
         "util/util.i",
     ],
     deps = [
+        ":bfloat16_lib",
         ":cost_analyzer_lib",
         ":model_analyzer_lib",
         ":cpp_python_util",
@@ -3099,130 +3277,124 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "server_lib_test",
     size = "small",
     srcs = ["training/server_lib_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
         ":data_flow_ops",
         ":errors",
-        ":extra_py_tests_deps",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":training",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
     ],
+    grpc_enabled = True,
 )
 
-py_test(
+tf_py_test(
     name = "server_lib_multiple_containers_test",
     size = "small",
     srcs = ["training/server_lib_multiple_containers_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
         ":data_flow_ops",
         ":errors",
-        ":extra_py_tests_deps",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":training",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
     ],
+    grpc_enabled = True,
 )
 
-py_test(
+tf_py_test(
     name = "server_lib_same_variables_clear_container_test",
     size = "small",
     srcs = ["training/server_lib_same_variables_clear_container_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
         ":data_flow_ops",
         ":errors",
-        ":extra_py_tests_deps",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":training",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
     ],
+    grpc_enabled = True,
 )
 
-py_test(
+tf_py_test(
     name = "server_lib_same_variables_clear_test",
     size = "small",
     srcs = ["training/server_lib_same_variables_clear_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
         ":data_flow_ops",
         ":errors",
-        ":extra_py_tests_deps",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":training",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
     ],
+    grpc_enabled = True,
 )
 
-py_test(
+tf_py_test(
     name = "server_lib_same_variables_no_clear_test",
     size = "small",
     srcs = ["training/server_lib_same_variables_no_clear_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
         ":data_flow_ops",
         ":errors",
-        ":extra_py_tests_deps",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":training",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
     ],
+    grpc_enabled = True,
 )
 
-py_test(
+tf_py_test(
     name = "server_lib_sparse_job_test",
     size = "small",
     srcs = ["training/server_lib_sparse_job_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
         ":data_flow_ops",
         ":errors",
-        ":extra_py_tests_deps",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":training",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
     ],
+    grpc_enabled = True,
 )
 
 cuda_py_test(
@@ -3242,6 +3414,7 @@ cuda_py_test(
         ":variables",
         "//third_party/py/numpy",
     ],
+    grpc_enabled = True,
     tags = [
         "no_oss",  # Test flaky due to port collisions.
         "oss_serial",
@@ -3260,6 +3433,7 @@ tf_py_test(
         ":training",
         ":variables",
     ],
+    grpc_enabled = True,
     tags = [
         "no_oss",  # Test flaky due to port collisions.
         "notsan",  # data race due to b/62910646
@@ -3290,17 +3464,11 @@ tf_cuda_library(
     alwayslink = 1,
 )
 
-py_test(
+tf_py_test(
     name = "session_test",
     size = "small",
     srcs = ["client/session_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_gpu",
-        "no_pip_gpu",  # testInteractivePlacePrunedGraph fails on invalid assumption about GPU ops.
-        "no_windows",
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":control_flow_ops",
@@ -3318,21 +3486,19 @@ py_test(
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
+    grpc_enabled = True,
+    tags = [
+        "no_gpu",
+        "no_pip_gpu",  # testInteractivePlacePrunedGraph fails on invalid assumption about GPU ops.
+        "no_windows",
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "session_clusterspec_prop_test",
     size = "small",
     srcs = ["client/session_clusterspec_prop_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_gpu",
-        "no_oss",
-        "no_pip",
-        "no_pip_gpu",
-        "notap",
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
@@ -3347,37 +3513,40 @@ py_test(
         ":variables",
         "//third_party/py/numpy",
     ],
+    grpc_enabled = True,
+    tags = [
+        "no_gpu",
+        "no_oss",
+        "no_pip",
+        "no_pip_gpu",
+        "notap",
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "session_list_devices_test",
     size = "small",
     srcs = ["client/session_list_devices_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_gpu",
-        "no_pip_gpu",
-        "notsan",  # data race due to b/62910646
-    ],
-    deps = [
+    additional_deps = [
         ":client",
         ":framework",
         ":framework_test_lib",
         ":platform_test",
         ":training",
     ],
+    grpc_enabled = True,
+    tags = [
+        "no_gpu",
+        "no_pip_gpu",
+        "notsan",  # data race due to b/62910646
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "session_partial_run_test",
     size = "small",
     srcs = ["client/session_partial_run_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_gpu",
-        "no_windows",
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":errors",
@@ -3390,6 +3559,11 @@ py_test(
         ":util",
         "@six_archive//:six",
     ],
+    grpc_enabled = True,
+    tags = [
+        "no_gpu",
+        "no_windows",
+    ],
 )
 
 cuda_py_test(
@@ -3405,6 +3579,20 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "virtual_gpu_test",
+    size = "small",
+    srcs = ["client/virtual_gpu_test.py"],
+    additional_deps = [
+        ":client",
+        ":client_testlib",
+        ":framework_for_generated_wrappers",
+        ":math_ops",
+        "//tensorflow/core:protos_all_py",
+    ],
+    tags = ["noguitar"],
+)
+
 py_test(
     name = "graph_util_test",
     size = "small",
@@ -3422,6 +3610,19 @@ py_test(
     ],
 )
 
+py_test(
+    name = "bfloat16_test",
+    size = "small",
+    srcs = ["lib/core/bfloat16_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_windows"],
+    deps = [
+        ":client_testlib",
+        ":lib",
+        ":pywrap_tensorflow",
+    ],
+)
+
 py_test(
     name = "file_io_test",
     size = "small",
@@ -3575,7 +3776,9 @@ cuda_py_test(
         "//third_party/py/numpy",
         "@six_archive//:six",
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/data/ops:dataset_ops",
     ],
+    tags = ["multi_gpu"],
 )
 
 py_test(
@@ -3624,6 +3827,7 @@ cuda_py_test(
     srcs = ["training/session_manager_test.py"],
     additional_deps = [
         ":array_ops",
+        ":control_flow_ops",
         ":client",
         ":client_testlib",
         ":errors",
@@ -3632,20 +3836,18 @@ cuda_py_test(
         ":training",
         ":variables",
     ],
+    grpc_enabled = True,
     main = "training/session_manager_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "supervisor_test",
     size = "small",
     srcs = ["training/supervisor_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":errors",
-        ":extra_py_tests_deps",
         ":framework",
         ":framework_for_generated_wrappers",
         ":io_ops",
@@ -3656,6 +3858,8 @@ py_test(
         ":variables",
         "//tensorflow/core:protos_all_py",
     ],
+    grpc_enabled = True,
+    tags = ["no_windows"],
 )
 
 py_test(
@@ -4094,12 +4298,6 @@ filegroup(
     visibility = ["//tensorflow:__subpackages__"],
 )
 
-filegroup(
-    name = "hidden_ops",
-    srcs = ["ops/hidden_ops.txt"],
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cuda_py_test(
     name = "accumulate_n_benchmark",
     size = "large",
@@ -4269,6 +4467,7 @@ cuda_py_test(
         ":variables",
         "//third_party/py/numpy",
     ],
+    grpc_enabled = True,
     main = "client/session_benchmark.py",
 )
 
@@ -4305,7 +4504,10 @@ py_test(
         "grappler/item_test.py",
     ],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],  # tf_optimizer is not available in pip.
+    tags = [
+        "grappler",
+        "no_pip",  # tf_optimizer is not available in pip.
+    ],
     deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
@@ -4315,6 +4517,27 @@ py_test(
     ],
 )
 
+py_test(
+    name = "datasets_test",
+    size = "small",
+    srcs = [
+        "grappler/datasets_test.py",
+    ],
+    srcs_version = "PY2AND3",
+    tags = [
+        "grappler",
+        "no_pip",  # tf_optimizer is not available in pip.
+    ],
+    deps = [
+        ":array_ops",
+        ":client_testlib",
+        ":framework_for_generated_wrappers",
+        ":tf_item",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/data",
+    ],
+)
+
 py_library(
     name = "tf_cluster",
     srcs = [
@@ -4328,21 +4551,24 @@ py_library(
     ],
 )
 
-py_test(
+cuda_py_test(
     name = "cluster_test",
     size = "small",
     srcs = [
         "grappler/cluster_test.py",
     ],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],  # tf_optimizer is not available in pip.
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":tf_cluster",
         ":tf_item",
         "//tensorflow/core:protos_all_py",
     ],
+    shard_count = 10,
+    tags = [
+        "grappler",
+        "no_pip",  # tf_optimizer is not available in pip.
+    ],
 )
 
 py_library(
@@ -4362,12 +4588,13 @@ py_test(
     name = "tf_optimizer_test",
     size = "small",
     srcs = [
-        "grappler/cluster_test.py",
-        "grappler/item_test.py",
         "grappler/tf_optimizer_test.py",
     ],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],  # tf_optimizer is not available in pip.
+    tags = [
+        "grappler",
+        "no_pip",  # tf_optimizer is not available in pip.
+    ],
     deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
@@ -4385,6 +4612,9 @@ py_test(
         "grappler/memory_optimizer_test.py",
     ],
     srcs_version = "PY2AND3",
+    tags = [
+        "grappler",
+    ],
     deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
@@ -4419,13 +4649,17 @@ cuda_py_test(
         ":nn",
         ":ops",
         ":random_ops",
+        ":state_ops",
         ":tf_cluster",
         ":tf_optimizer",
         ":training",
         "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
     ],
-    tags = ["manual"],
+    shard_count = 10,
+    tags = [
+        "grappler",
+    ],
 )
 
 py_library(
@@ -4460,7 +4694,10 @@ py_test(
     size = "small",
     srcs = ["grappler/cost_analyzer_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "grappler",
+        "no_pip",
+    ],
     deps = [
         ":array_ops",
         ":client_testlib",
@@ -4492,7 +4729,10 @@ py_test(
     size = "small",
     srcs = ["grappler/model_analyzer_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "grappler",
+        "no_pip",
+    ],
     deps = [
         ":array_ops",
         ":client_testlib",
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index af34aca3e345ff6d12f471f289b77001b40c00bf..02ed5517ca895ab070a89f8810f77dadcff9212b 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -60,7 +60,7 @@ from tensorflow.core.protobuf.tensorflow_server_pb2 import *
 from tensorflow.core.util.event_pb2 import *
 
 # Framework
-from tensorflow.python.framework.framework_lib import *
+from tensorflow.python.framework.framework_lib import *  # pylint: disable=redefined-builtin
 from tensorflow.python.framework.versions import *
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import graph_util
@@ -84,6 +84,7 @@ from tensorflow.python.feature_column import feature_column_lib as feature_colum
 from tensorflow.python.layers import layers
 from tensorflow.python.ops import bitwise_ops as bitwise
 from tensorflow.python.ops import image_ops as image
+from tensorflow.python.ops import manip_ops as manip
 from tensorflow.python.ops import metrics
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import sets
@@ -115,6 +116,7 @@ from tensorflow.python.platform import test
 
 from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.all_util import make_all
+from tensorflow.python.util.tf_export import tf_export
 
 # Import modules whose docstrings contribute, for use by remove_undocumented
 # below.
@@ -166,6 +168,31 @@ _allowed_symbols = [
     'TensorInfo',  # Used for tf.saved_model functionality.
 ]
 
+# Export protos
+# pylint: disable=undefined-variable
+tf_export('AttrValue')(AttrValue)
+tf_export('ConfigProto')(ConfigProto)
+tf_export('Event', 'summary.Event')(Event)
+tf_export('GPUOptions')(GPUOptions)
+tf_export('GraphDef')(GraphDef)
+tf_export('GraphOptions')(GraphOptions)
+tf_export('HistogramProto')(HistogramProto)
+tf_export('LogMessage')(LogMessage)
+tf_export('MetaGraphDef')(MetaGraphDef)
+tf_export('NameAttrList')(NameAttrList)
+tf_export('NodeDef')(NodeDef)
+tf_export('OptimizerOptions')(OptimizerOptions)
+tf_export('RunMetadata')(RunMetadata)
+tf_export('RunOptions')(RunOptions)
+tf_export('SessionLog', 'summary.SessionLog')(SessionLog)
+tf_export('Summary', 'summary.Summary')(Summary)
+tf_export('summary.SummaryDescription')(SummaryDescription)
+tf_export('SummaryMetadata')(SummaryMetadata)
+tf_export('summary.TaggedRunMetadata')(TaggedRunMetadata)
+tf_export('TensorInfo')(TensorInfo)
+# pylint: enable=undefined-variable
+
+
 # The following symbols are kept for compatibility. It is our plan
 # to remove them in the future.
 _allowed_symbols.extend([
@@ -241,6 +268,7 @@ _allowed_symbols.extend([
     'linalg',
     'logging',
     'losses',
+    'manip',
     'metrics',
     'newaxis',
     'nn',
@@ -263,6 +291,7 @@ _allowed_symbols.extend([
     'GIT_VERSION',
     'COMPILER_VERSION',
     'CXX11_ABI_FLAG',
+    'MONOLITHIC_BUILD',
 ])
 
 # Remove all extra symbols that don't have a docstring or are not explicitly
@@ -282,6 +311,7 @@ _exported_dunders = set([
     '__git_version__',
     '__compiler_version__',
     '__cxx11_abi_flag__',
+    '__monolithic_build__',
 ])
 
 # Expose symbols minus dunders, unless they are whitelisted above.
diff --git a/tensorflow/python/build_defs.bzl b/tensorflow/python/build_defs.bzl
index 2d8625933f9ea4ab3bedf8d3157430d821f3e584..b9056f86e6d0465a8521f054a459c06eb5aeb37c 100644
--- a/tensorflow/python/build_defs.bzl
+++ b/tensorflow/python/build_defs.bzl
@@ -22,9 +22,12 @@ def tf_gen_op_wrapper_private_py(name, out=None, deps=[],
   bare_op_name = name[:-4] # Strip off the _gen
   tf_gen_op_wrapper_py(name=bare_op_name,
     out=out,
-    hidden_file="ops/hidden_ops.txt",
     visibility=visibility,
     deps=deps,
     require_shape_functions=require_shape_functions,
     generated_target_name=name,
+    api_def_srcs = [
+        "//tensorflow/core/api_def:base_api_def",
+        "//tensorflow/core/api_def:python_api_def",
+    ],
   )
diff --git a/tensorflow/python/client/device_lib_test.py b/tensorflow/python/client/device_lib_test.py
index 7bba10efacfbc7fbde402c665b3d55d852e36eae..aaf41626ab0078489026036d2b838f33a893a540 100644
--- a/tensorflow/python/client/device_lib_test.py
+++ b/tensorflow/python/client/device_lib_test.py
@@ -34,7 +34,8 @@ class DeviceLibTest(test_util.TensorFlowTestCase):
     # GPU test
     if test.is_gpu_available():
       self.assertGreater(len(devices), 1)
-      self.assertTrue("GPU" in [d.device_type for d in devices] or "SYCL" in [d.device_type for d in devices])
+      self.assertTrue("GPU" in [d.device_type for d in devices] or
+                      "SYCL" in [d.device_type for d in devices])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/client/events_writer.i b/tensorflow/python/client/events_writer.i
index de030fcb4282912475ed8853bae9d41cde2c085d..c72b76b8fa4a05588841466a836bc189bb64d154 100644
--- a/tensorflow/python/client/events_writer.i
+++ b/tensorflow/python/client/events_writer.i
@@ -23,6 +23,9 @@ limitations under the License.
 
 %nodefaultctor EventsWriter;
 
+%ignore tensorflow::Status::operator=;
+%include "tensorflow/core/lib/core/status.h"
+
 %ignoreall
 %unignore tensorflow;
 %unignore tensorflow::EventsWriter;
diff --git a/tensorflow/python/client/notebook.py b/tensorflow/python/client/notebook.py
index 8babe35b3230e7b46c0c9484ccddae4e5e22a335..4b6a0f71ae65aa28b70dd22ce6cffa82e9bc5973 100644
--- a/tensorflow/python/client/notebook.py
+++ b/tensorflow/python/client/notebook.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Notebook front-end to TensorFlow.
 
 When you run this binary, you'll see something like below, which indicates
@@ -43,10 +42,8 @@ from tensorflow.python.platform import app
 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "cpp"
 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION_VERSION"] = "2"
 
-
 FLAGS = None
 
-
 ORIG_ARGV = sys.argv
 # Main notebook process calls itself with argv[1]="kernel" to start kernel
 # subprocesses.
@@ -73,8 +70,8 @@ def main(unused_argv):
       notebookapp.ip = "0.0.0.0"
       notebookapp.password = passwd(FLAGS.password)
     else:
-      print ("\nNo password specified; Notebook server will only be available"
-             " on the local machine.\n")
+      print("\nNo password specified; Notebook server will only be available"
+            " on the local machine.\n")
     notebookapp.initialize(argv=["--notebook-dir", FLAGS.notebook_dir])
 
     if notebookapp.ip == "0.0.0.0":
@@ -125,8 +122,8 @@ if __name__ == "__main__":
   # kernel app.
   if IS_KERNEL:
     # Drop everything except --flagfile.
-    sys.argv = ([sys.argv[0]] +
-                [x for x in sys.argv[1:] if x.startswith("--flagfile")])
+    sys.argv = (
+        [sys.argv[0]] + [x for x in sys.argv[1:] if x.startswith("--flagfile")])
 
   FLAGS, unparsed = parser.parse_known_args()
   app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 759c36ad72e922671288b0d57fe9e442b915c144..f3c4fecdc0fde0436bea76cc774edaabe1bc07dd 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """A client interface for TensorFlow."""
 
 from __future__ import absolute_import
@@ -36,6 +35,7 @@ from tensorflow.python.ops import session_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 class SessionInterface(object):
@@ -71,8 +71,9 @@ def _get_indexed_slices_value_from_fetches(fetched_vals):
 
 
 def _get_feeds_for_indexed_slices(feed, feed_val):
-  return list(zip([feed.values, feed.indices] if feed.dense_shape is None else
-                  [feed.values, feed.indices, feed.dense_shape], feed_val))
+  return list(
+      zip([feed.values, feed.indices] if feed.dense_shape is None else
+          [feed.values, feed.indices, feed.dense_shape], feed_val))
 
 
 # List of extensions supported to convert run arguments into actual fetches and
@@ -124,10 +125,20 @@ _REGISTERED_EXPANSIONS = [
      lambda fetch: ([fetch], lambda fetched_vals: fetched_vals[0]),
      lambda feed, feed_val: [(feed, feed_val)],
      lambda feed: [feed])]
+
 # pylint: enable=g-long-lambda
 
-def register_session_run_conversion_functions(tensor_type, fetch_function,
-    feed_function=None, feed_function_for_partial_run=None):
+
+def _convert_to_numpy_obj(numpy_dtype, obj):
+  """Explicitly convert obj based on numpy type except for string type."""
+  return numpy_dtype(obj) if numpy_dtype is not object else str(obj)
+
+
+def register_session_run_conversion_functions(
+    tensor_type,
+    fetch_function,
+    feed_function=None,
+    feed_function_for_partial_run=None):
   """Register fetch and feed conversion functions for `tf.Session.run()`.
 
   This function registers a triple of conversion functions for fetching and/or
@@ -168,11 +179,11 @@ def register_session_run_conversion_functions(tensor_type, fetch_function,
   """
   for conversion_function in _REGISTERED_EXPANSIONS:
     if issubclass(conversion_function[0], tensor_type):
-      raise ValueError(
-          '%s has already been registered so ignore it.', tensor_type)
+      raise ValueError('%s has already been registered so ignore it.',
+                       tensor_type)
       return
-  _REGISTERED_EXPANSIONS.insert(0,
-    (tensor_type, fetch_function, feed_function, feed_function_for_partial_run))
+  _REGISTERED_EXPANSIONS.insert(0, (tensor_type, fetch_function, feed_function,
+                                    feed_function_for_partial_run))
 
 
 class _FetchMapper(object):
@@ -227,8 +238,8 @@ class _FetchMapper(object):
       An instance of a subclass of `_FetchMapper` that handles the shape.
     """
     if fetch is None:
-      raise TypeError('Fetch argument %r has invalid type %r' %
-                      (fetch, type(fetch)))
+      raise TypeError('Fetch argument %r has invalid type %r' % (fetch,
+                                                                 type(fetch)))
     elif isinstance(fetch, (list, tuple)):
       # NOTE(touts): This is also the code path for namedtuples.
       return _ListFetchMapper(fetch)
@@ -241,8 +252,8 @@ class _FetchMapper(object):
           fetches, contraction_fn = fetch_fn(fetch)
           return _ElementFetchMapper(fetches, contraction_fn)
     # Did not find anything.
-    raise TypeError('Fetch argument %r has invalid type %r' %
-                    (fetch, type(fetch)))
+    raise TypeError('Fetch argument %r has invalid type %r' % (fetch,
+                                                               type(fetch)))
 
 
 class _ElementFetchMapper(_FetchMapper):
@@ -271,8 +282,8 @@ class _ElementFetchMapper(_FetchMapper):
             fetch, allow_tensor=True, allow_operation=True))
       except TypeError as e:
         raise TypeError('Fetch argument %r has invalid type %r, '
-                        'must be a string or Tensor. (%s)'
-                        % (fetch, type(fetch), str(e)))
+                        'must be a string or Tensor. (%s)' %
+                        (fetch, type(fetch), str(e)))
       except ValueError as e:
         raise ValueError('Fetch argument %r cannot be interpreted as a '
                          'Tensor. (%s)' % (fetch, str(e)))
@@ -370,8 +381,9 @@ class _DictFetchMapper(_FetchMapper):
     """
     self._fetch_type = type(fetches)
     self._keys = fetches.keys()
-    self._mappers = [_FetchMapper.for_fetch(fetch)
-                     for fetch in fetches.values()]
+    self._mappers = [
+        _FetchMapper.for_fetch(fetch) for fetch in fetches.values()
+    ]
     self._unique_fetches, self._value_indices = _uniquify_fetches(self._mappers)
 
   def unique_fetches(self):
@@ -395,6 +407,7 @@ class _FetchHandler(object):
   result structure matching the user-provided structure for fetches, but
   containing the corresponding results.
   """
+
   # TODO(touts): Make this class also take care of destructuring the feed
   # dict instead of doing it in the callers.
 
@@ -545,8 +558,11 @@ class _DeviceAttributes(object):
     return self._memory_limit_bytes
 
   def __repr__(self):
-    return '_DeviceAttributes(%s, %s, %d)' % (self.name, self.device_type,
-                                              self.memory_limit_bytes,)
+    return '_DeviceAttributes(%s, %s, %d)' % (
+        self.name,
+        self.device_type,
+        self.memory_limit_bytes,
+    )
 
 
 class BaseSession(SessionInterface):
@@ -595,8 +611,8 @@ class BaseSession(SessionInterface):
 
     if config is not None:
       if not isinstance(config, config_pb2.ConfigProto):
-        raise TypeError('config must be a tf.ConfigProto, but got %s'
-                        % type(config))
+        raise TypeError(
+            'config must be a tf.ConfigProto, but got %s' % type(config))
       self._config = config
       self._add_shapes = config.graph_options.infer_shapes
     else:
@@ -970,8 +986,8 @@ class BaseSession(SessionInterface):
       for tensor_type, _, _, feed_fn in _REGISTERED_EXPANSIONS:
         if isinstance(feed, tensor_type):
           return feed_fn(feed)
-      raise TypeError('Feed argument %r has invalid type %r'
-                      % (feed, type(feed)))
+      raise TypeError('Feed argument %r has invalid type %r' % (feed,
+                                                                type(feed)))
 
     # Check session.
     if self._closed:
@@ -992,8 +1008,8 @@ class BaseSession(SessionInterface):
     for feed in feeds:
       for subfeed in _feed_fn(feed):
         try:
-          subfeed_t = self.graph.as_graph_element(subfeed, allow_tensor=True,
-                                                  allow_operation=False)
+          subfeed_t = self.graph.as_graph_element(
+              subfeed, allow_tensor=True, allow_operation=False)
           if self._created_with_new_api:
             # pylint: disable=protected-access
             feed_list.append(subfeed_t._as_tf_output())
@@ -1001,8 +1017,7 @@ class BaseSession(SessionInterface):
           else:
             feed_list.append(compat.as_bytes(subfeed_t.name))
         except Exception as e:
-          e.message = ('Cannot interpret feed_list key as Tensor: '
-                       + e.message)
+          e.message = ('Cannot interpret feed_list key as Tensor: ' + e.message)
           e.args = (e.message,)
           raise e
 
@@ -1035,12 +1050,13 @@ class BaseSession(SessionInterface):
 
   def _run(self, handle, fetches, feed_dict, options, run_metadata):
     """Perform either run or partial_run, depending the presence of `handle`."""
+
     def _feed_fn(feed, feed_val):
       for tensor_type, _, feed_fn, _ in _REGISTERED_EXPANSIONS:
         if isinstance(feed, tensor_type):
           return feed_fn(feed, feed_val)
-      raise TypeError('Feed argument %r has invalid type %r'
-                      % (feed, type(feed)))
+      raise TypeError('Feed argument %r has invalid type %r' % (feed,
+                                                                type(feed)))
 
     # Check session.
     if self._closed:
@@ -1060,11 +1076,11 @@ class BaseSession(SessionInterface):
       for feed, feed_val in feed_dict.items():
         for subfeed, subfeed_val in _feed_fn(feed, feed_val):
           try:
-            subfeed_t = self.graph.as_graph_element(subfeed, allow_tensor=True,
-                                                    allow_operation=False)
+            subfeed_t = self.graph.as_graph_element(
+                subfeed, allow_tensor=True, allow_operation=False)
           except Exception as e:
-            raise TypeError('Cannot interpret feed_dict key as Tensor: '
-                            + e.args[0])
+            raise TypeError(
+                'Cannot interpret feed_dict key as Tensor: ' + e.args[0])
 
           if isinstance(subfeed_val, ops.Tensor):
             raise TypeError('The value of a feed cannot be a tf.Tensor object. '
@@ -1072,12 +1088,13 @@ class BaseSession(SessionInterface):
                             'strings, lists, numpy ndarrays, or TensorHandles.')
 
           subfeed_dtype = subfeed_t.dtype.as_numpy_dtype
-          if isinstance(subfeed_val,
-                        int) and subfeed_dtype(subfeed_val) != subfeed_val:
+          if isinstance(subfeed_val, int) and _convert_to_numpy_obj(
+              subfeed_dtype, subfeed_val) != subfeed_val:
             raise TypeError(
-                'Type of feed value ' + str(subfeed_val) + ' is not'
-                ' compatible with Tensor type ' + str(subfeed_dtype) + '.'
-                ' Try explicitly setting the type of the feed tensor'
+                'Type of feed value ' + str(subfeed_val) + ' with type ' + str(
+                    type(subfeed_val)) +
+                ' is not compatible with Tensor type ' + str(subfeed_dtype) +
+                '. Try explicitly setting the type of the feed tensor'
                 ' to a larger type (e.g. int64).')
 
           is_tensor_handle_feed = isinstance(subfeed_val,
@@ -1090,10 +1107,10 @@ class BaseSession(SessionInterface):
 
           if (not is_tensor_handle_feed and
               not subfeed_t.get_shape().is_compatible_with(np_val.shape)):
-            raise ValueError(
-                'Cannot feed value of shape %r for Tensor %r, '
-                'which has shape %r'
-                % (np_val.shape, subfeed_t.name, str(subfeed_t.get_shape())))
+            raise ValueError('Cannot feed value of shape %r for Tensor %r, '
+                             'which has shape %r' %
+                             (np_val.shape, subfeed_t.name,
+                              str(subfeed_t.get_shape())))
           if not self.graph.is_feedable(subfeed_t):
             raise ValueError('Tensor %s may not be fed.' % subfeed_t)
 
@@ -1122,10 +1139,7 @@ class BaseSession(SessionInterface):
       results = []
     return fetch_handler.build_results(self, results)
 
-  def make_callable(self,
-                    fetches,
-                    feed_list=None,
-                    accept_options=False):
+  def make_callable(self, fetches, feed_list=None, accept_options=False):
     """Returns a Python callable that runs a particular step.
 
     The returned callable will take `len(feed_list)` arguments whose types
@@ -1160,9 +1174,6 @@ class BaseSession(SessionInterface):
       TypeError: If `fetches` or `feed_list` cannot be interpreted
         as arguments to @{tf.Session.run}.
     """
-    assert not self._created_with_new_api, ('session.make_callable() doesn\'t '
-                                            'work with C API')
-
     if feed_list is not None:
       if not isinstance(feed_list, (list, tuple)):
         raise TypeError('`feed_list` must be a list or tuple.')
@@ -1171,9 +1182,12 @@ class BaseSession(SessionInterface):
       # `Session._run()` so that we can convert the feeds to a list of
       # strings here.
       def _generic_run(*feed_args, **kwargs):
-        feed_dict = {feed: feed_val
-                     for feed, feed_val in zip(feed_list, feed_args)}
+        feed_dict = {
+            feed: feed_val
+            for feed, feed_val in zip(feed_list, feed_args)
+        }
         return self.run(fetches, feed_dict=feed_dict, **kwargs)
+
       return _generic_run
 
     # Ensure any changes to the graph are reflected in the runtime.
@@ -1184,24 +1198,34 @@ class BaseSession(SessionInterface):
 
     # Create a fetch handler to take care of the structure of fetches.
     fetch_handler = _FetchHandler(self._graph, fetches, {})
-    fetch_list_as_strings = _name_list(fetch_handler.fetches())
-    target_list_as_strings = _name_list(fetch_handler.targets())
-
-    def _callable_template_with_options_and_metadata(
-        fetch_list_as_strings,
-        target_list_as_strings,
-        fetch_handler,
-        options=None,
-        run_metadata=None):
+    if self._created_with_new_api:
+      # pylint: disable=protected-access
+      fetch_list = [t._as_tf_output() for t in fetch_handler.fetches()]
+      target_list = [op._c_op for op in fetch_handler.targets()]
+      # pylint: enable=protected-access
+    else:
+      fetch_list = _name_list(fetch_handler.fetches())
+      target_list = _name_list(fetch_handler.targets())
+
+    def _callable_template_with_options_and_metadata(fetch_list,
+                                                     target_list,
+                                                     fetch_handler,
+                                                     options=None,
+                                                     run_metadata=None):
       """Template callable that accepts RunOptions and RunMetadata."""
       options_ptr = tf_session.TF_NewBufferFromString(
           compat.as_bytes(options.SerializeToString())) if options else None
       run_metadata_ptr = tf_session.TF_NewBuffer() if run_metadata else None
       try:
         with errors.raise_exception_on_not_ok_status() as status:
-          results = tf_session.TF_Run(
-              self._session, options_ptr, {}, fetch_list_as_strings,
-              target_list_as_strings, status, run_metadata_ptr)
+          if self._created_with_new_api:
+            results = tf_session.TF_SessionRun_wrapper(
+                self._session, options_ptr, {}, fetch_list, target_list,
+                run_metadata_ptr, status)
+          else:
+            results = tf_session.TF_Run(self._session, options_ptr, {},
+                                        fetch_list, target_list, status,
+                                        run_metadata_ptr)
           if fetch_handler:
             results = fetch_handler.build_results(self, results)
           else:
@@ -1217,46 +1241,61 @@ class BaseSession(SessionInterface):
       return results
 
     if accept_options:
-      return functools.partial(
-          _callable_template_with_options_and_metadata, fetch_list_as_strings,
-          target_list_as_strings, fetch_handler)
+      return functools.partial(_callable_template_with_options_and_metadata,
+                               fetch_list, target_list, fetch_handler)
     elif isinstance(fetches, ops.Operation):
       # Special case for fetching a single operation, because the
       # function will have no return value.
-      assert not fetch_list_as_strings
-      assert len(target_list_as_strings) == 1
+      assert not fetch_list
+      assert len(target_list) == 1
+
       def _single_operation_run():
         with errors.raise_exception_on_not_ok_status() as status:
-          tf_session.TF_Run(self._session, None, {}, [],
-                            target_list_as_strings, status, None)
+          if self._created_with_new_api:
+            tf_session.TF_SessionRun_wrapper(self._session, None, {}, [],
+                                             target_list, None, status)
+          else:
+            tf_session.TF_Run(self._session, None, {}, [], target_list, status,
+                              None)
+
       return _single_operation_run
     elif isinstance(fetches, ops.Tensor):
       # Special case for fetching a single tensor, because the
       # function can return the result of `TF_Run()` directly.
-      assert len(fetch_list_as_strings) == 1
-      assert not target_list_as_strings
+      assert len(fetch_list) == 1
+      assert not target_list
+
       def _single_tensor_run():
         with errors.raise_exception_on_not_ok_status() as status:
-          results = tf_session.TF_Run(self._session, None, {},
-                                      fetch_list_as_strings, [], status, None)
+          if self._created_with_new_api:
+            results = tf_session.TF_SessionRun_wrapper(
+                self._session, None, {}, fetch_list, [], None, status)
+          else:
+            results = tf_session.TF_Run(self._session, None, {}, fetch_list, [],
+                                        status, None)
         return results[0]
+
       return _single_tensor_run
     else:
       # In all other cases, we must use `fetch_handler` to build the
       # results for us.
       def _fetch_handler_run():
         with errors.raise_exception_on_not_ok_status() as status:
-          results = tf_session.TF_Run(self._session, None, {},
-                                      fetch_list_as_strings,
-                                      target_list_as_strings, status, None)
+          if self._created_with_new_api:
+            results = tf_session.TF_SessionRun_wrapper(
+                self._session, None, {}, fetch_list, target_list, None, status)
+          else:
+            results = tf_session.TF_Run(self._session, None, {}, fetch_list,
+                                        target_list, status, None)
         return fetch_handler.build_results(self, results)
+
       return _fetch_handler_run
 
   # Captures the name of a node in an error status.
   _NODEDEF_NAME_RE = re.compile(r'\[\[Node: ([^ ]*?) =')
 
-  def _do_run(self, handle, target_list, fetch_list, feed_dict,
-              options, run_metadata):
+  def _do_run(self, handle, target_list, fetch_list, feed_dict, options,
+              run_metadata):
     """Runs a step based on the given fetches and feeds.
 
     Args:
@@ -1293,13 +1332,12 @@ class BaseSession(SessionInterface):
       self._extend_graph()
       with errors.raise_exception_on_not_ok_status() as status:
         if self._created_with_new_api:
-          return tf_session.TF_SessionRun_wrapper(
-              session, options, feed_dict, fetch_list, target_list,
-              run_metadata, status)
+          return tf_session.TF_SessionRun_wrapper(session, options, feed_dict,
+                                                  fetch_list, target_list,
+                                                  run_metadata, status)
         else:
-          return tf_session.TF_Run(session, options,
-                                   feed_dict, fetch_list, target_list,
-                                   status, run_metadata)
+          return tf_session.TF_Run(session, options, feed_dict, fetch_list,
+                                   target_list, status, run_metadata)
 
     def _prun_fn(session, handle, feed_dict, fetch_list):
       if target_list:
@@ -1338,20 +1376,20 @@ class BaseSession(SessionInterface):
   def _extend_graph(self):
     # Nothing to do if we're using the new session interface
     # TODO(skyewm): remove this function altogether eventually
-    if self._created_with_new_api: return
+    if self._created_with_new_api:
+      return
 
     # Ensure any changes to the graph are reflected in the runtime.
     with self._extend_lock:
       if self._graph.version > self._current_version:
         # pylint: disable=protected-access
         graph_def, self._current_version = self._graph._as_graph_def(
-            from_version=self._current_version,
-            add_shapes=self._add_shapes)
+            from_version=self._current_version, add_shapes=self._add_shapes)
         # pylint: enable=protected-access
 
         with errors.raise_exception_on_not_ok_status() as status:
-          tf_session.TF_ExtendGraph(
-              self._session, graph_def.SerializeToString(), status)
+          tf_session.TF_ExtendGraph(self._session,
+                                    graph_def.SerializeToString(), status)
         self._opened = True
 
   # The threshold to run garbage collection to delete dead tensors.
@@ -1371,9 +1409,8 @@ class BaseSession(SessionInterface):
       feeds = {}
       fetches = []
       for deleter_key, tensor_handle in enumerate(tensors_to_delete):
-        holder, deleter = session_ops._get_handle_deleter(self.graph,
-                                                          deleter_key,
-                                                          tensor_handle)
+        holder, deleter = session_ops._get_handle_deleter(
+            self.graph, deleter_key, tensor_handle)
         feeds[holder] = tensor_handle
         fetches.append(deleter)
       self.run(fetches, feed_dict=feeds)
@@ -1405,6 +1442,7 @@ class BaseSession(SessionInterface):
       return handles
 
 
+@tf_export('Session')
 class Session(BaseSession):
   """A class for running TensorFlow operations.
 
@@ -1444,7 +1482,8 @@ class Session(BaseSession):
     sess.run(...)
   ```
 
-  The [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
+  The
+  [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
   protocol buffer exposes various configuration options for a
   session. For example, to create a session that uses soft constraints
   for device placement, and log the resulting placement decisions,
@@ -1475,7 +1514,8 @@ class Session(BaseSession):
         @{$distributed$Distributed TensorFlow}
         for more examples.
       graph: (Optional.) The `Graph` to be launched (described above).
-      config: (Optional.) A [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
+      config: (Optional.) A
+        [`ConfigProto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)
         protocol buffer with configuration options for the session.
 
     """
@@ -1499,8 +1539,22 @@ class Session(BaseSession):
   def __exit__(self, exec_type, exec_value, exec_tb):
     if exec_type is errors.OpError:
       logging.error('Session closing due to OpError: %s', (exec_value,))
-    self._default_session_context_manager.__exit__(
-        exec_type, exec_value, exec_tb)
+    try:
+      self._default_session_context_manager.__exit__(exec_type, exec_value,
+                                                     exec_tb)
+    except RuntimeError as error:
+      if error == exec_value:
+        # NOTE(skyewm): for some reason, in Python3,
+        # _default_session_context_manager.__exit__ will re-raise the "not
+        # re-entrant" exception raised in __enter__ above (note that if we're
+        # here, we're in the outer session context manager, since __exit__ is
+        # not called when __enter__ raises an exception). We still want to
+        # continue cleaning up this context manager before the exception is
+        # further propagated, so we ignore it here (note that it'll continue
+        # being propagated after this method completes).
+        pass
+      else:
+        raise
     self._default_graph_context_manager.__exit__(exec_type, exec_value, exec_tb)
 
     self._default_session_context_manager = None
@@ -1543,6 +1597,7 @@ class Session(BaseSession):
     tf_session.TF_Reset(target, containers, config)
 
 
+@tf_export('InteractiveSession')
 class InteractiveSession(BaseSession):
   """A TensorFlow `Session` for use in interactive contexts, such as a shell.
 
diff --git a/tensorflow/python/client/session_benchmark.py b/tensorflow/python/client/session_benchmark.py
index 721bca91b71aa00479c27fad102d5888d58d35b1..da74855193dbfe3019f23c542d86c5e493e9ac7a 100644
--- a/tensorflow/python/client/session_benchmark.py
+++ b/tensorflow/python/client/session_benchmark.py
@@ -22,6 +22,7 @@ import time
 
 import numpy as np
 
+from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/python/client/session_clusterspec_prop_test.py b/tensorflow/python/client/session_clusterspec_prop_test.py
index c85b22eb156407fcb78302c43b9cb17b8f6b5e06..f1934241334e049c1d02e095d371927bec71be14 100644
--- a/tensorflow/python/client/session_clusterspec_prop_test.py
+++ b/tensorflow/python/client/session_clusterspec_prop_test.py
@@ -77,7 +77,8 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
     config = config_pb2.ConfigProto(cluster_def=cluster_def)
 
     with ops.Graph().as_default() as g, ops.device('/job:worker/task:1'):
-      const = constant_op.constant(17)
+      with ops.device('/cpu:0'):	 
+        const = constant_op.constant(17)
     sess = session.Session(server1.target, config=config, graph=g)
     run_options = config_pb2.RunOptions(
         trace_level=config_pb2.RunOptions.FULL_TRACE)
diff --git a/tensorflow/python/client/session_list_devices_test.py b/tensorflow/python/client/session_list_devices_test.py
index 584b1abe55c0df09afad0c432837646e75beb653..5a7413c12e9db92cb85d54a69602753ff6476425 100644
--- a/tensorflow/python/client/session_list_devices_test.py
+++ b/tensorflow/python/client/session_list_devices_test.py
@@ -39,7 +39,6 @@ class SessionListDevicesTestMethods(object):
       devices = sess.list_devices()
       self.assertTrue('/job:localhost/replica:0/task:0/device:CPU:0' in set(
           [d.name for d in devices]), devices)
-      self.assertGreaterEqual(1, len(devices), devices)
 
   def testInvalidDeviceNumber(self):
     opts = tf_session.TF_NewSessionOptions()
@@ -65,7 +64,6 @@ class SessionListDevicesTestMethods(object):
       devices = sess.list_devices()
       self.assertTrue('/job:local/replica:0/task:0/device:CPU:0' in set(
           [d.name for d in devices]), devices)
-      self.assertGreaterEqual(1, len(devices), devices)
 
   def testListDevicesClusterSpecPropagation(self):
     server1 = server_lib.Server.create_local_server()
@@ -84,7 +82,6 @@ class SessionListDevicesTestMethods(object):
           '/job:worker/replica:0/task:0/device:CPU:0' in device_names)
       self.assertTrue(
           '/job:worker/replica:0/task:1/device:CPU:0' in device_names)
-      self.assertGreaterEqual(2, len(devices), devices)
 
 
 class SessionListDevicesTest(SessionListDevicesTestMethods,
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index f4b02711955242085e222d341e04fb9fc409dd63..490572254b0be6a110ef06cea15d20d780f732cf 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Tests for tensorflow.python.client.session.Session."""
 from __future__ import absolute_import
 from __future__ import division
@@ -28,9 +27,10 @@ import numpy as np
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.core.framework import attr_value_pb2
+from tensorflow.core.framework import types_pb2
 from tensorflow.core.lib.core import error_codes_pb2
 from tensorflow.core.protobuf import config_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import constant_op
@@ -45,8 +45,8 @@ from tensorflow.python.framework import versions
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import gen_control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
 # Import resource_variable_ops for the variables-to-tensor implicit conversion.
 from tensorflow.python.ops import resource_variable_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import state_ops
@@ -55,13 +55,12 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
 
-ops._USE_C_API = True
-
 # NOTE(mrry): Dummy shape registration for ops used in the tests, since they
 # don't have C++ op registrations on which to attach C++ shape fns.
 ops.RegisterShape('ConstructionFails')(common_shapes.unknown_shape)
 
 
+@test_util.with_c_api
 class SessionTest(test_util.TensorFlowTestCase):
 
   def testUseExistingGraph(self):
@@ -93,14 +92,18 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(arr, copy_val)
       # Test without feed.
       copy_val = copy.eval()
-      self.assertAllEqual(np.asarray([[10.0, 10.0, 10.0], [10.0, 10.0, 10.0]],
-                                     dtype=np.float32), copy_val)
+      self.assertAllEqual(
+          np.asarray(
+              [[10.0, 10.0, 10.0], [10.0, 10.0, 10.0]], dtype=np.float32),
+          copy_val)
 
   def testManyCPUs(self):
     # TODO(keveman): Implement ListDevices and test for the number of
     # devices returned by ListDevices.
     with session.Session(
-        config=config_pb2.ConfigProto(device_count={'CPU': 2})):
+        config=config_pb2.ConfigProto(device_count={
+            'CPU': 2
+        })):
       inp = constant_op.constant(10.0, name='W1')
       self.assertAllEqual(inp.eval(), 10.0)
 
@@ -159,19 +162,23 @@ class SessionTest(test_util.TensorFlowTestCase):
       def exc_predicate(e):
         return (e.op is None and e.node_def is None and
                 e.error_code == error_codes_pb2.INVALID_ARGUMENT)
+
       with self.assertRaisesOpError(exc_predicate):
         # Run with a bogus handle.
         s.partial_run('foo', r1, feed_dict={a: 1, b: 2})
 
-  @test_util.disable_c_api  # No shape registration for 'ConstructionFails'
   def testOpConstructionErrorPayload(self):
+    if ops._USE_C_API:
+      return  # No shape registration for 'ConstructionFails'
+
     with session.Session():
       failing_op = ops.get_default_graph().create_op(
           'ConstructionFails', [], [], name='f')
 
       def exc_predicate(e):
-        return (e.op == failing_op
-                and e.error_code == error_codes_pb2.INVALID_ARGUMENT)
+        return (e.op == failing_op and
+                e.error_code == error_codes_pb2.INVALID_ARGUMENT)
+
       with self.assertRaisesOpError(exc_predicate):
         failing_op.run()
 
@@ -188,9 +195,9 @@ class SessionTest(test_util.TensorFlowTestCase):
       # pylint: enable=protected-access
 
       def exc_predicate(e):
-        return (e.op == c.op
-                and e.op._original_op == b.op
-                and e.op._original_op._original_op == a.op)
+        return (e.op == c.op and e.op._original_op == b.op and
+                e.op._original_op._original_op == a.op)
+
       with self.assertRaisesOpError(exc_predicate):
         c.eval()
 
@@ -206,7 +213,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       with self.assertRaises(TypeError):
         s.run({'a': a, 'b': None})
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testFetchSingleton(self):
     with session.Session() as sess:
       a = constant_op.constant(42.0)
@@ -229,7 +235,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       res = sess.run(a.op)  # An op, not a tensor.
       self.assertEqual(None, res)
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testFetchList(self):
     with session.Session() as sess:
       a = constant_op.constant(42.0)
@@ -245,7 +250,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertTrue(isinstance(res, list))
       self.assertEqual([42.0, None, 44.0, 42.0, None], res)
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testFetchTuple(self):
     with session.Session() as sess:
       a = constant_op.constant(42.0)
@@ -259,7 +263,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertTrue(isinstance(res, tuple))
       self.assertEqual((42.0, None, 44.0, 42.0), res)
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testFetchNamedTuple(self):
     # pylint: disable=invalid-name
     ABC = collections.namedtuple('ABC', ['a', 'b', 'c'])
@@ -342,8 +345,12 @@ class SessionTest(test_util.TensorFlowTestCase):
       b = control_flow_ops.no_op()  # An op, not a tensor.
       c = constant_op.constant(c_val)
       # List of lists, tuples, namedtuple, and dict
-      res = sess.run([[a, b, c], (a, b, c), ABC(a=a, b=b, c=c),
-                      {'a': a.name, 'c': c, 'b': b}])
+      res = sess.run([[a, b, c], (a, b, c),
+                      ABC(a=a, b=b, c=c), {
+                          'a': a.name,
+                          'c': c,
+                          'b': b
+                      }])
       self.assertTrue(isinstance(res, list))
       self.assertEqual(4, len(res))
       self.assertTrue(isinstance(res[0], list))
@@ -366,8 +373,11 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(b_val, res[3]['b'])
       self.assertEqual(c_val, res[3]['c'])
       # Tuple of lists, tuples, namedtuple, and dict
-      res = sess.run(([a, b, c], (a.name, b, c), ABC(a=a, b=b, c=c),
-                      {'a': a, 'c': c, 'b': b}))
+      res = sess.run(([a, b, c], (a.name, b, c), ABC(a=a, b=b, c=c), {
+          'a': a,
+          'c': c,
+          'b': b
+      }))
       self.assertTrue(isinstance(res, tuple))
       self.assertEqual(4, len(res))
       self.assertTrue(isinstance(res[0], list))
@@ -390,10 +400,16 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(b_val, res[3]['b'])
       self.assertEqual(c_val, res[3]['c'])
       # Namedtuple of lists, tuples, namedtuples, and dict
-      res = sess.run(DEFG(d=[a, b, c],
-                          e=(a, b, c),
-                          f=ABC(a=a.name, b=b, c=c),
-                          g={'a': a, 'c': c, 'b': b}))
+      res = sess.run(
+          DEFG(
+              d=[a, b, c],
+              e=(a, b, c),
+              f=ABC(a=a.name, b=b, c=c),
+              g={
+                  'a': a,
+                  'c': c,
+                  'b': b
+              }))
       self.assertTrue(isinstance(res, DEFG))
       self.assertTrue(isinstance(res.d, list))
       self.assertEqual(3, len(res.d))
@@ -415,10 +431,16 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(b_val, res.g['b'])
       self.assertEqual(c_val, res.g['c'])
       # Dict of lists, tuples, namedtuples, and dict
-      res = sess.run({'d': [a, b, c],
-                      'e': (a, b, c),
-                      'f': ABC(a=a, b=b, c=c),
-                      'g': {'a': a.name, 'c': c, 'b': b}})
+      res = sess.run({
+          'd': [a, b, c],
+          'e': (a, b, c),
+          'f': ABC(a=a, b=b, c=c),
+          'g': {
+              'a': a.name,
+              'c': c,
+              'b': b
+          }
+      })
       self.assertTrue(isinstance(res, dict))
       self.assertEqual(4, len(res))
       self.assertTrue(isinstance(res['d'], list))
@@ -517,8 +539,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       values = np.array([1.0, 2.0]).astype(np.float32)
       shape = np.array([7, 9, 2]).astype(np.int64)
       sp = sparse_tensor.SparseTensor(
-          constant_op.constant(indices),
-          constant_op.constant(values),
+          constant_op.constant(indices), constant_op.constant(values),
           constant_op.constant(shape))
       # Single fetch, use as tuple
       sp_out = s.run(sp)
@@ -588,14 +609,17 @@ class SessionTest(test_util.TensorFlowTestCase):
       sp = sparse_tensor.SparseTensor(
           array_ops.placeholder(dtype=np.int64, shape=(2, 3)),
           array_ops.placeholder(dtype=np.float32, shape=(2,)),
-          array_ops.placeholder(dtype=np.int64, shape=(3,)),)
+          array_ops.placeholder(dtype=np.int64, shape=(3,)),
+      )
       sp_indices = array_ops.identity(sp.indices)
       sp_values = array_ops.identity(sp.values)
       sp_shape = array_ops.identity(sp.dense_shape)
       sp2 = sparse_tensor.SparseTensor(sp_indices, sp_values, sp_shape)
       # Feed with tuple
       indices_out, values_out, shape_out = s.run(
-          [sp_indices, sp_values, sp_shape], {sp: (indices, values, shape)})
+          [sp_indices, sp_values, sp_shape], {
+              sp: (indices, values, shape)
+          })
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(shape_out, shape)
@@ -606,20 +630,23 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(sp_out.dense_shape, shape)
       # Feed with SparseTensorValue
       indices_out, values_out, shape_out = s.run(
-          [sp_indices, sp_values, sp_shape],
-          {sp: sparse_tensor.SparseTensorValue(indices, values, shape)})
+          [sp_indices, sp_values, sp_shape], {
+              sp: sparse_tensor.SparseTensorValue(indices, values, shape)
+          })
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(shape_out, shape)
       # Feed with SparseTensorValue, fetch SparseTensorValue
-      sp2_out = s.run(
-          sp2, {sp: sparse_tensor.SparseTensorValue(indices, values, shape)})
+      sp2_out = s.run(sp2, {
+          sp: sparse_tensor.SparseTensorValue(indices, values, shape)
+      })
       self.assertAllEqual(sp2_out.indices, indices)
       self.assertAllEqual(sp2_out.values, values)
       self.assertAllEqual(sp2_out.dense_shape, shape)
       # Feed SparseTensorValue and fetch sp directly.
-      sp_out = s.run(
-          sp, {sp: sparse_tensor.SparseTensorValue(indices, values, shape)})
+      sp_out = s.run(sp, {
+          sp: sparse_tensor.SparseTensorValue(indices, values, shape)
+      })
       self.assertAllEqual(sp_out.indices, indices)
       self.assertAllEqual(sp_out.values, values)
       self.assertAllEqual(sp_out.dense_shape, shape)
@@ -636,20 +663,24 @@ class SessionTest(test_util.TensorFlowTestCase):
       sp2 = sparse_tensor.SparseTensor(sp_indices, sp_values, sp_shape)
       # Feed with tuple
       indices_out, values_out, shape_out = s.run(
-          [sp_indices, sp_values, sp_shape], {sp: (indices, values, shape)})
+          [sp_indices, sp_values, sp_shape], {
+              sp: (indices, values, shape)
+          })
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(shape_out, shape)
       # Feed with SparseTensorValue
       indices_out, values_out, shape_out = s.run(
-          [sp_indices, sp_values, sp_shape],
-          {sp: sparse_tensor.SparseTensorValue(indices, values, shape)})
+          [sp_indices, sp_values, sp_shape], {
+              sp: sparse_tensor.SparseTensorValue(indices, values, shape)
+          })
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(shape_out, shape)
       # Feed with SparseTensorValue, fetch SparseTensorValue
-      sp2_out = s.run(
-          sp2, {sp: sparse_tensor.SparseTensorValue(indices, values, shape)})
+      sp2_out = s.run(sp2, {
+          sp: sparse_tensor.SparseTensorValue(indices, values, shape)
+      })
       self.assertAllEqual(sp2_out.indices, indices)
       self.assertAllEqual(sp2_out.values, values)
       self.assertAllEqual(sp2_out.dense_shape, shape)
@@ -667,20 +698,24 @@ class SessionTest(test_util.TensorFlowTestCase):
       sp2 = sparse_tensor.SparseTensor(sp_indices, sp_values, sp_shape)
       # Feed with tuple
       indices_out, values_out, shape_out = s.run(
-          [sp_indices, sp_values, sp_shape], {sp: (indices, values, shape)})
+          [sp_indices, sp_values, sp_shape], {
+              sp: (indices, values, shape)
+          })
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(shape_out, shape)
       # Feed with SparseTensorValue
       indices_out, values_out, shape_out = s.run(
-          [sp_indices, sp_values, sp_shape],
-          {sp: sparse_tensor.SparseTensorValue(indices, values, shape)})
+          [sp_indices, sp_values, sp_shape], {
+              sp: sparse_tensor.SparseTensorValue(indices, values, shape)
+          })
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(shape_out, shape)
       # Feed with SparseTensorValue, fetch SparseTensorValue
-      sp2_out = s.run(
-          sp2, {sp: sparse_tensor.SparseTensorValue(indices, values, shape)})
+      sp2_out = s.run(sp2, {
+          sp: sparse_tensor.SparseTensorValue(indices, values, shape)
+      })
       self.assertAllEqual(sp2_out.indices, indices)
       self.assertAllEqual(sp2_out.values, values)
       self.assertAllEqual(sp2_out.dense_shape, shape)
@@ -690,9 +725,8 @@ class SessionTest(test_util.TensorFlowTestCase):
       indices = np.array([[3, 2, 0], [4, 5, 1]]).astype(np.int64)
       values = np.array([1.0, 2.0]).astype(np.float32)
       shape = np.array([7, 9, 2]).astype(np.int64)
-      sp = array_ops.sparse_placeholder(dtype=np.float32,
-                                        shape=shape,
-                                        name='placeholder1')
+      sp = array_ops.sparse_placeholder(
+          dtype=np.float32, shape=shape, name='placeholder1')
       self.assertAllEqual(sp.dense_shape.eval(session=s), shape)
       self.assertAllEqual(tensor_util.constant_value(sp.dense_shape), shape)
       sp_indices = array_ops.identity(sp.indices)
@@ -700,7 +734,9 @@ class SessionTest(test_util.TensorFlowTestCase):
       sp_shape = array_ops.identity(sp.dense_shape)
       # Feed with tuple
       indices_out, values_out, shape_out = s.run(
-          [sp_indices, sp_values, sp_shape], {sp: (indices, values)})
+          [sp_indices, sp_values, sp_shape], {
+              sp: (indices, values)
+          })
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(shape_out, shape)
@@ -746,33 +782,34 @@ class SessionTest(test_util.TensorFlowTestCase):
       indices = np.array([[3, 2, 0], [4, 5, 1]]).astype(np.int64)
       dense_shape = np.array([7, 9, 2]).astype(np.int64)
       ind = ops.IndexedSlices(
-          array_ops.placeholder(dtype=np.float32,
-                                shape=(2,)),
-          array_ops.placeholder(dtype=np.int64,
-                                shape=(2, 3)),
-          array_ops.placeholder(dtype=np.int64,
-                                shape=(3,)),)
+          array_ops.placeholder(dtype=np.float32, shape=(2,)),
+          array_ops.placeholder(dtype=np.int64, shape=(2, 3)),
+          array_ops.placeholder(dtype=np.int64, shape=(3,)),
+      )
       ind_values = array_ops.identity(ind.values)
       ind_indices = array_ops.identity(ind.indices)
       ind_dense_shape = array_ops.identity(ind.dense_shape)
       ind2 = ops.IndexedSlices(ind_values, ind_indices, ind_dense_shape)
       # Feed with tuple
       values_out, indices_out, dense_shape_out = s.run(
-          [ind_values, ind_indices, ind_dense_shape],
-          {ind: (values, indices, dense_shape)})
+          [ind_values, ind_indices, ind_dense_shape], {
+              ind: (values, indices, dense_shape)
+          })
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(dense_shape_out, dense_shape)
       # Feed with IndexedSlicesValue
       values_out, indices_out, dense_shape_out = s.run(
-          [ind_values, ind_indices, ind_dense_shape],
-          {ind: ops.IndexedSlicesValue(values, indices, dense_shape)})
+          [ind_values, ind_indices, ind_dense_shape], {
+              ind: ops.IndexedSlicesValue(values, indices, dense_shape)
+          })
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(indices_out, indices)
       self.assertAllEqual(dense_shape_out, dense_shape)
       # Feed with IndexedSlicesValue, fetch IndexedSlicesValue
-      ind2_out = s.run(ind2, {ind: ops.IndexedSlicesValue(values, indices,
-                                                          dense_shape)})
+      ind2_out = s.run(ind2, {
+          ind: ops.IndexedSlicesValue(values, indices, dense_shape)
+      })
       self.assertAllEqual(ind2_out.values, values)
       self.assertAllEqual(ind2_out.indices, indices)
       self.assertAllEqual(ind2_out.dense_shape, dense_shape)
@@ -817,28 +854,27 @@ class SessionTest(test_util.TensorFlowTestCase):
       indices = np.array([[3, 2, 0], [4, 5, 1]]).astype(np.int64)
       dense_shape = None
       ind = ops.IndexedSlices(
-          array_ops.placeholder(dtype=np.float32,
-                                shape=(2,)),
-          array_ops.placeholder(dtype=np.int64,
-                                shape=(2, 3)),
-          None)
+          array_ops.placeholder(dtype=np.float32, shape=(2,)),
+          array_ops.placeholder(dtype=np.int64, shape=(2, 3)), None)
       ind_values = array_ops.identity(ind.values)
       ind_indices = array_ops.identity(ind.indices)
       ind2 = ops.IndexedSlices(ind_values, ind_indices)
       # Feed with tuple
-      values_out, indices_out = s.run(
-          [ind_values, ind_indices], {ind: (values, indices)})
+      values_out, indices_out = s.run([ind_values, ind_indices], {
+          ind: (values, indices)
+      })
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(indices_out, indices)
       # Feed with IndexedSlicesValue
-      values_out, indices_out = s.run(
-          [ind_values, ind_indices],
-          {ind: ops.IndexedSlicesValue(values, indices, dense_shape)})
+      values_out, indices_out = s.run([ind_values, ind_indices], {
+          ind: ops.IndexedSlicesValue(values, indices, dense_shape)
+      })
       self.assertAllEqual(values_out, values)
       self.assertAllEqual(indices_out, indices)
       # Feed with IndexedSlicesValue, fetch IndexedSlicesValue
-      ind2_out = s.run(ind2, {ind: ops.IndexedSlicesValue(values, indices,
-                                                          dense_shape)})
+      ind2_out = s.run(ind2, {
+          ind: ops.IndexedSlicesValue(values, indices, dense_shape)
+      })
       self.assertAllEqual(ind2_out.values, values)
       self.assertAllEqual(ind2_out.indices, indices)
       self.assertAllEqual(ind2_out.dense_shape, dense_shape)
@@ -987,8 +1023,9 @@ class SessionTest(test_util.TensorFlowTestCase):
     constructed_events = [threading.Event() for _ in range(10)]
     continue_event = threading.Event()
     for i, constructed_event in enumerate(constructed_events):
-      t = self.checkedThread(target=self._testDefaultGraphInThread,
-                             args=(constructed_event, continue_event, i))
+      t = self.checkedThread(
+          target=self._testDefaultGraphInThread,
+          args=(constructed_event, continue_event, i))
       threads.append(t)
     for t in threads:
       t.start()
@@ -1007,6 +1044,7 @@ class SessionTest(test_util.TensorFlowTestCase):
         ev.wait()
         val = c.eval(session=sess)
         self.assertEqual(val, 5.0)
+
       threads = [self.checkedThread(target=run_step) for _ in range(100)]
       for t in threads:
         t.start()
@@ -1039,11 +1077,10 @@ class SessionTest(test_util.TensorFlowTestCase):
 
   def testGraphDef(self):
     with session.Session() as sess:
-      self.assertProtoEquals(
-          'versions { producer: %d min_consumer: %d }' % (
-              versions.GRAPH_DEF_VERSION,
-              versions.GRAPH_DEF_VERSION_MIN_CONSUMER),
-          sess.graph_def)
+      self.assertProtoEquals('versions { producer: %d min_consumer: %d }' %
+                             (versions.GRAPH_DEF_VERSION,
+                              versions.GRAPH_DEF_VERSION_MIN_CONSUMER),
+                             sess.graph_def)
       c = constant_op.constant(5.0, name='c')
       self.assertEquals(len(sess.graph_def.node), 1)
       d = constant_op.constant(6.0, name='d')
@@ -1073,6 +1110,7 @@ class SessionTest(test_util.TensorFlowTestCase):
             lambda e: 'Attempted to use a closed Session.' in str(e)):
           while True:
             sess.run(c)
+
       t = threading.Thread(target=update_thread)
       t.start()
       time.sleep(0.1)
@@ -1176,20 +1214,13 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(b_val, [[2.0, 2.0, 2.0]])
       self.assertAllEqual(a2_val, [[1.0, 1.0]])
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testFeedAndFetch(self):
     with session.Session() as sess:
-      for dtype in [dtypes.float16,
-                    dtypes.float32,
-                    dtypes.float64,
-                    dtypes.int32,
-                    dtypes.uint8,
-                    dtypes.int16,
-                    dtypes.int8,
-                    dtypes.int64,
-                    dtypes.bool,
-                    dtypes.complex64,
-                    dtypes.complex128]:
+      for dtype in [
+          dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
+          dtypes.uint8, dtypes.int16, dtypes.int8, dtypes.int64, dtypes.bool,
+          dtypes.complex64, dtypes.complex128
+      ]:
         for shape in [(32, 4, 128), (37,), (2, 0, 6), (0, 0, 0)]:
           np_dtype = dtype.as_numpy_dtype
 
@@ -1208,13 +1239,19 @@ class SessionTest(test_util.TensorFlowTestCase):
             np_array = np_array.astype(np_dtype)
 
           self.assertAllEqual(np_array,
-                              sess.run(out_t, feed_dict={feed_t: np_array}))
+                              sess.run(out_t, feed_dict={
+                                  feed_t: np_array
+                              }))
           # Check that we can also get the feed back.
           self.assertAllEqual(np_array,
-                              sess.run(feed_t, feed_dict={feed_t: np_array}))
+                              sess.run(feed_t, feed_dict={
+                                  feed_t: np_array
+                              }))
           # Also check that we can get both back.
-          out_v, feed_v = sess.run([out_t, feed_t],
-                                   feed_dict={feed_t: np_array})
+          out_v, feed_v = sess.run(
+              [out_t, feed_t], feed_dict={
+                  feed_t: np_array
+              })
           self.assertAllEqual(np_array, out_v)
           self.assertAllEqual(np_array, feed_v)
 
@@ -1223,7 +1260,6 @@ class SessionTest(test_util.TensorFlowTestCase):
           self.assertAllEqual(np_array, out_v)
           self.assertAllEqual(np_array, feed_v)
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testMakeCallableOnTensorWithRunOptions(self):
     with session.Session() as sess:
       a = constant_op.constant(42.0)
@@ -1236,7 +1272,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(42.0, res)
       self.assertGreater(len(run_metadata.step_stats.dev_stats), 0)
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testMakeCallableOnOperationWithRunOptions(self):
     with session.Session() as sess:
       a = variables.Variable(42.0)
@@ -1251,7 +1286,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(43.0, sess.run(a))
       self.assertGreater(len(run_metadata.step_stats.dev_stats), 0)
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testMakeCallableWithFeedListAndRunOptions(self):
     with session.Session() as sess:
       ph = array_ops.placeholder(dtypes.float32)
@@ -1262,9 +1296,11 @@ class SessionTest(test_util.TensorFlowTestCase):
           trace_level=config_pb2.RunOptions.FULL_TRACE)
       run_metadata = config_pb2.RunMetadata()
       self.assertEqual(0, len(run_metadata.step_stats.dev_stats))
-      self.assertAllClose(
-          42.0,
-          tensor_runner(41.0, options=run_options, run_metadata=run_metadata))
+      self.assertAllClose(42.0,
+                          tensor_runner(
+                              41.0,
+                              options=run_options,
+                              run_metadata=run_metadata))
       self.assertGreater(len(run_metadata.step_stats.dev_stats), 0)
 
   def testFeedError(self):
@@ -1301,8 +1337,9 @@ class SessionTest(test_util.TensorFlowTestCase):
         size = 1
         for s in shape:
           size *= s
-        c_list = np.array([compat.as_bytes(str(i)) for i in xrange(size)],
-                          dtype=np.object).reshape(shape) if size > 0 else []
+        c_list = np.array(
+            [compat.as_bytes(str(i)) for i in xrange(size)],
+            dtype=np.object).reshape(shape) if size > 0 else []
         c = constant_op.constant(c_list)
         self.assertAllEqual(c.eval(), c_list)
 
@@ -1312,13 +1349,16 @@ class SessionTest(test_util.TensorFlowTestCase):
         size = 1
         for s in shape:
           size *= s
-        c_list = np.array([compat.as_bytes(str(i)) for i in xrange(size)],
-                          dtype=np.object).reshape(shape)
+        c_list = np.array(
+            [compat.as_bytes(str(i)) for i in xrange(size)],
+            dtype=np.object).reshape(shape)
         feed_t = array_ops.placeholder(dtype=dtypes.string, shape=shape)
         c = array_ops.identity(feed_t)
         self.assertAllEqual(sess.run(c, feed_dict={feed_t: c_list}), c_list)
-        self.assertAllEqual(sess.run(feed_t, feed_dict={feed_t: c_list}),
-                            c_list)
+        self.assertAllEqual(
+            sess.run(feed_t, feed_dict={
+                feed_t: c_list
+            }), c_list)
         c_v, feed_v = sess.run([c, feed_t], feed_dict={feed_t: c_list})
         self.assertAllEqual(c_v, c_list)
         self.assertAllEqual(feed_v, c_list)
@@ -1334,8 +1374,10 @@ class SessionTest(test_util.TensorFlowTestCase):
 
   def testStringFeedWithUnicode(self):
     with session.Session():
-      c_list = [u'\n\x01\x00', u'\n\x00\x01',
-                u'\u26a3 unicode', u'\U0001f60e deal with it']
+      c_list = [
+          u'\n\x01\x00', u'\n\x00\x01', u'\u26a3 unicode',
+          u'\U0001f60e deal with it'
+      ]
       feed_t = array_ops.placeholder(dtype=dtypes.string, shape=[len(c_list)])
       c = array_ops.identity(feed_t)
 
@@ -1428,9 +1470,10 @@ class SessionTest(test_util.TensorFlowTestCase):
         sess.run(constant_op.constant(1.0), run_metadata=run_metadata)
         self.assertTrue(not run_metadata.HasField('step_stats'))
 
-        sess.run(constant_op.constant(1.0),
-                 options=run_options,
-                 run_metadata=run_metadata)
+        sess.run(
+            constant_op.constant(1.0),
+            options=run_options,
+            run_metadata=run_metadata)
 
         self.assertTrue(run_metadata.HasField('step_stats'))
         self.assertEquals(len(run_metadata.step_stats.dev_stats), 1)
@@ -1444,23 +1487,27 @@ class SessionTest(test_util.TensorFlowTestCase):
       with session.Session() as sess:
         # all combinations are valid
         sess.run(constant_op.constant(1.0), options=None, run_metadata=None)
-        sess.run(constant_op.constant(1.0), options=None,
-                 run_metadata=run_metadata)
+        sess.run(
+            constant_op.constant(1.0), options=None, run_metadata=run_metadata)
         self.assertTrue(not run_metadata.HasField('step_stats'))
 
-        sess.run(constant_op.constant(1.0), options=run_options,
-                 run_metadata=None)
+        sess.run(
+            constant_op.constant(1.0), options=run_options, run_metadata=None)
         self.assertTrue(not run_metadata.HasField('step_stats'))
 
-        sess.run(constant_op.constant(1.0), options=run_options,
-                 run_metadata=run_metadata)
+        sess.run(
+            constant_op.constant(1.0),
+            options=run_options,
+            run_metadata=run_metadata)
 
         self.assertTrue(run_metadata.HasField('step_stats'))
         self.assertEquals(len(run_metadata.step_stats.dev_stats), 1)
 
-  # TODO(nolivia): C API doesn't yet handle marking nodes as not feedable.
-  @test_util.disable_c_api
   def testFeedShapeCompatibility(self):
+    # TODO(nolivia): C API doesn't yet handle marking nodes as not feedable.
+    if ops._USE_C_API:
+      return
+
     with session.Session() as sess:
       some_tensor = constant_op.constant([2.0, 2.0, 2.0, 2.0])
       new_shape = constant_op.constant([2, 2])
@@ -1503,8 +1550,11 @@ class SessionTest(test_util.TensorFlowTestCase):
         d = math_ops.multiply(c, c)
       for step in xrange(120):
         run_metadata = config_pb2.RunMetadata()
-        sess.run(d, feed_dict={a: 1.0},
-                 options=run_options, run_metadata=run_metadata)
+        sess.run(
+            d,
+            feed_dict={a: 1.0},
+            options=run_options,
+            run_metadata=run_metadata)
         if step == 99:
           self.assertTrue(run_metadata.HasField('cost_graph'))
         else:
@@ -1573,8 +1623,7 @@ class SessionTest(test_util.TensorFlowTestCase):
 
   def testTimeoutWithShortOperations(self):
     num_epochs = 5
-    q = data_flow_ops.FIFOQueue(
-        capacity=50, dtypes=[dtypes.int32], shapes=[()])
+    q = data_flow_ops.FIFOQueue(capacity=50, dtypes=[dtypes.int32], shapes=[()])
     enqueue_op = q.enqueue_many(constant_op.constant([1, 2]))
 
     # Use a 10-second timeout, which should be longer than any
@@ -1586,7 +1635,9 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(sess.run(q.size()), num_epochs * 2)
 
   def testRegisterFetchAndFeedConversionFunctions(self):
+
     class SquaredTensor(object):
+
       def __init__(self, tensor):
         self.sq = math_ops.square(tensor)
 
@@ -1595,24 +1646,27 @@ class SessionTest(test_util.TensorFlowTestCase):
     feed_fn2 = lambda feed: [feed.sq]
 
     session.register_session_run_conversion_functions(SquaredTensor, fetch_fn,
-        feed_fn1, feed_fn2)
+                                                      feed_fn1, feed_fn2)
     with self.assertRaises(ValueError):
-      session.register_session_run_conversion_functions(SquaredTensor,
-          fetch_fn, feed_fn1, feed_fn2)
+      session.register_session_run_conversion_functions(SquaredTensor, fetch_fn,
+                                                        feed_fn1, feed_fn2)
     with self.test_session() as sess:
       np1 = np.array([1.0, 1.5, 2.0, 2.5])
       np2 = np.array([3.0, 3.5, 4.0, 4.5])
       squared_tensor = SquaredTensor(np2)
       squared_eval = sess.run(squared_tensor)
       self.assertAllClose(np2 * np2, squared_eval)
-      squared_eval = sess.run(squared_tensor, feed_dict={
-        squared_tensor : np1 * np1})
+      squared_eval = sess.run(
+          squared_tensor, feed_dict={
+              squared_tensor: np1 * np1
+          })
       self.assertAllClose(np1 * np1, squared_eval)
       partial_run = sess.partial_run_setup([squared_tensor], [])
       squared_eval = sess.partial_run(partial_run, squared_tensor)
       self.assertAllClose(np2 * np2, squared_eval)
 
   def testDefaultLogDevicePlacement(self):
+
     class CaptureStderr(str):
       """Class to capture stderr from C++ shared library."""
 
@@ -1690,8 +1744,10 @@ class SessionTest(test_util.TensorFlowTestCase):
   def runTestBuildGraphError(self, sess):
     # Ensure that errors from building the graph get propagated.
     data = array_ops.placeholder(dtypes.float32, shape=[])
-    enter_1 = control_flow_ops.enter(data, 'foo_1', False)
-    enter_2 = control_flow_ops.enter(data, 'foo_2', False)
+    # pylint: disable=protected-access
+    enter_1 = gen_control_flow_ops._enter(data, 'foo_1', False)
+    enter_2 = gen_control_flow_ops._enter(data, 'foo_2', False)
+    # pylint: enable=protected-access
     res = math_ops.add(enter_1, enter_2)
     with self.assertRaisesOpError('has inputs from different frames'):
       sess.run(res, feed_dict={data: 1.0})
@@ -1723,6 +1779,7 @@ class SessionTest(test_util.TensorFlowTestCase):
 
   def runTestAddFunctionToSession(self, target=''):
     """Add a function to a session after the graph has already been run."""
+
     @function.Defun(dtypes.float32)
     def foo(x):
       return x + 1
@@ -1741,6 +1798,161 @@ class SessionTest(test_util.TensorFlowTestCase):
     server = server_lib.Server.create_local_server()
     self.runTestAddFunctionToSession(server.target)
 
+  def testOpenAndCloseGrpcSession(self):
+    server = server_lib.Server.create_local_server()
+    with session.Session(server.target):
+      pass
+
+  def testOpenAndCloseSession(self):
+    with session.Session():
+      pass
+
+  def testAutoConvertAndCheckData(self):
+    with self.test_session() as sess:
+      a = array_ops.placeholder(dtype=dtypes.string)
+      with self.assertRaisesRegexp(
+          TypeError, 'Type of feed value 1 with type <(\w+) \'int\'> is not'):
+        sess.run(a, feed_dict={a: 1})
+
+
+class GraphMutationTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    self._original_use_c_api_value = ops._USE_C_API
+    ops._USE_C_API = True
+    super(GraphMutationTest, self).setUp()
+
+  def tearDown(self):
+    ops._USE_C_API = self._original_use_c_api_value
+    super(GraphMutationTest, self).tearDown()
+
+  def testUpdateInputAfterRunning(self):
+    with ops.Graph().as_default() as g:
+      a = constant_op.constant(1.0)
+      b = constant_op.constant(2.0)
+      c = a + b
+
+    with session.Session(graph=g) as sess:
+      self.assertAllEqual(3.0, sess.run(c))
+      c.op._update_input(1, a)  # pylint: disable=protected-access
+      with self.assertRaisesRegexp(
+          errors.FailedPreconditionError,
+          'add.*was changed by updating input tensor after it was run'):
+        sess.run(c)
+
+      # Check that running the graph with a new session is fine
+      with session.Session(graph=g) as sess2:
+        self.assertAllEqual(2.0, sess2.run(c))
+
+  def testSetDeviceAfterRunning(self):
+    with ops.Graph().as_default() as g:
+      a = constant_op.constant(1.0)
+      b = constant_op.constant(2.0)
+      c = a + b
+
+    with session.Session(graph=g) as sess:
+      self.assertAllEqual(3.0, sess.run(c))
+      c.op._set_device('/cpu:0')  # pylint: disable=protected-access
+      with self.assertRaisesRegexp(
+          errors.FailedPreconditionError,
+          'add.*was changed by setting device after it was run'):
+        sess.run(c)
+
+  def testSetAttrAfterRunning(self):
+    with ops.Graph().as_default() as g:
+      a = constant_op.constant(1.0, dtype=dtypes.float32)
+      b = math_ops.cast(a, dtypes.float64)
+
+    with session.Session(graph=g) as sess:
+      self.assertAllEqual(1.0, sess.run(b))
+      b.op._set_attr('DstT', attr_value_pb2.AttrValue(type=types_pb2.DT_FLOAT))
+      with self.assertRaisesRegexp(
+          errors.FailedPreconditionError,
+          'Cast.*was changed by setting attribute after it was run'):
+        sess.run(b)
+
+  def testRunModifyRun(self):
+    with ops.Graph().as_default() as g:
+      a = constant_op.constant(1.0)
+      b = constant_op.constant(2.0)
+      c = a + b
+
+      with session.Session(graph=g) as sess:
+        self.assertAllEqual(3.0, sess.run(c))
+
+        d = b + c
+        d.op._update_input(0, a)  # pylint: disable=protected-access
+        self.assertAllEqual(3.0, sess.run(c))
+        self.assertAllEqual(4.0, sess.run(d))
+
+  def testRunModifyRunTwoSessions(self):
+    with ops.Graph().as_default() as g:
+      a = constant_op.constant(1.0)
+      b = constant_op.constant(2.0)
+      c = a + b
+
+      with session.Session(graph=g) as sess1:
+        with session.Session(graph=g) as sess2:
+          self.assertAllEqual(3.0, sess1.run(c))
+          self.assertAllEqual(3.0, sess2.run(c))
+
+          d = b + c
+          d.op._update_input(0, a)  # pylint: disable=protected-access
+          self.assertAllEqual(3.0, sess2.run(c))
+          self.assertAllEqual(4.0, sess2.run(d))
+
+          d.op._update_input(0, b)  # pylint: disable=protected-access
+          self.assertAllEqual(3.0, sess1.run(c))
+          self.assertAllEqual(5.0, sess1.run(d))
+
+          with self.assertRaisesRegexp(
+              errors.FailedPreconditionError,
+              'add.*was changed by updating input tensor after it was run'):
+            sess2.run(c)
+
+  def testTwoSessionsOneRunBeforeModification(self):
+    with ops.Graph().as_default() as g, ops.device('/cpu:0'):
+      a = constant_op.constant(1.0)
+      b = constant_op.constant(2.0)
+      c = a + b
+
+    with session.Session(graph=g) as sess1:
+      with session.Session(graph=g) as sess2:
+        sess1.run(c)
+
+        c.op._set_device('/cpu:0')  # pylint: disable=protected-access
+
+        with self.assertRaisesRegexp(
+            errors.FailedPreconditionError,
+            'add.*was changed by setting device after it was run'):
+          sess1.run(c)
+
+        # sess2 was not run before modification
+        self.assertAllEqual(3.0, sess2.run(c))
+
+  def testTwoSessionsBothRunBeforeModification(self):
+    with ops.Graph().as_default() as g, ops.device('/cpu:0'):
+      a = constant_op.constant(1.0)
+      b = constant_op.constant(2.0)
+      c = a + b
+
+    with session.Session(graph=g) as sess1:
+      with session.Session(graph=g) as sess2:
+        sess1.run(c)
+        sess2.run(c)
+
+        c.op._set_device('/cpu:0')  # pylint: disable=protected-access
+
+        with self.assertRaisesRegexp(
+            errors.FailedPreconditionError,
+            'add.*was changed by setting device after it was run'):
+          sess1.run(c)
+
+        with self.assertRaisesRegexp(
+            errors.FailedPreconditionError,
+            'add.*was changed by setting device after it was run'):
+          sess2.run(c)
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index 5fa1a7e8fc2388bf64670624de7c653318dcb981..1fd488e7b6388f7953a279dca8f93ab57a85f63d 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -67,6 +67,15 @@ PyObject* CreateWrappedTFOperation(TF_Operation* tf_operation) {
   return SWIG_NewPointerObj(tf_operation, SWIGTYPE_p_TF_Operation, 0);
 }
 
+// Helper function to convert a Python list of ints to a C++ vector of int64s
+void PyInt64ListToVector(PyObject* py_int_seq, std::vector<int64_t>* vec) {
+  int size = PySequence_Fast_GET_SIZE(py_int_seq);
+  for (int i = 0; i < size; ++i) {
+    PyObject* item = PySequence_Fast_GET_ITEM(py_int_seq, i);
+    vec->push_back(PyInt_AsLong(item));
+  }
+}
+
 %}
 
 %include "tensorflow/python/client/tf_sessionrun_wrapper.i"
@@ -91,6 +100,9 @@ tensorflow::ImportNumpy();
 // _GLIBCXX_USE_CXX11_ABI flag value
 %constant const int __cxx11_abi_flag__ = tf_cxx11_abi_flag();
 
+// Flag indicating whether the build is monolithic
+%constant const int __monolithic_build__ = tf_monolithic_build();
+
 // Release the Python GIL for the duration of most methods.
 %exception {
   Py_BEGIN_ALLOW_THREADS;
@@ -145,6 +157,25 @@ tensorflow::ImportNumpy();
   }
 }
 
+%ignore TF_OperationOutputConsumers;
+%unignore TF_OperationOutputConsumers_wrapper;
+// See comment for "%noexception TF_SessionRun_wrapper;"
+%noexception TF_OperationGetOutputConsumers_wrapper;
+
+// Build a Python list of unicode strings and return it. (Operation names are
+// always represented as unicode.)
+%typemap(out) std::vector<const char*>
+tensorflow::TF_OperationOutputConsumers_wrapper {
+  $result = PyList_New($1.size());
+  if (!$result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create list");
+  }
+
+  for (size_t i = 0; i < $1.size(); ++i) {
+    PyList_SET_ITEM($result, i, PyUnicode_FromString($1[i]));
+  }
+}
+
 %unignore GetOperationInputs;
 // See comment for "%noexception TF_SessionRun_wrapper;"
 %noexception GetOperationInputs;
@@ -157,13 +188,30 @@ tensorflow::ImportNumpy();
     SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create list");
   }
 
-  // Unwrap the generated SwigValueWrapper<std::vector<TF_Output>> via &
-  std::vector<TF_Output>* tf_outputs = &$1;
-  for (size_t i = 0; i < $1.size(); ++i) {
-    PyList_SET_ITEM($result, i, CreateWrappedTFOutput((*tf_outputs)[i]));
+  // Unwrap the generated SwigValueWrapper<std::vector<TF_Output>>
+  const std::vector<TF_Output>& tf_outputs = $1;
+  for (size_t i = 0; i < tf_outputs.size(); ++i) {
+    PyList_SET_ITEM($result, i, CreateWrappedTFOutput(tf_outputs[i]));
   }
 }
 
+%ignore TF_ImportGraphDefResultsMissingUnusedInputMappings;
+%unignore TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper;
+// See comment for "%noexception TF_SessionRun_wrapper;"
+%noexception TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper;
+
+%typemap(out) std::vector<string>
+TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper{
+  $result = PyList_New($1.size());
+  if (!$result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create list");
+  }
+  for (size_t i = 0; i < $1.size(); ++i) {
+    const string& input_str = $1[i];
+    PyList_SET_ITEM($result, i, PyBytes_FromStringAndSize(input_str.data(),
+                                                          input_str.size()));
+  }
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 // BEGIN TYPEMAPS FOR tensorflow::TF_Run_wrapper()
@@ -437,6 +485,7 @@ tensorflow::ImportNumpy();
 %unignore tensorflow;
 %unignore TF_Run;
 %unignore EqualGraphDefWrapper;
+%unignore EqualAttrValueWrapper;
 
 // Include the wrapper for TF_PRunSetup from tf_session_helper.h.
 
@@ -532,6 +581,144 @@ def TF_Reset(target, containers=None, config=None):
 %unignore TF_GraphGetTensorShapeHelper;
 %ignore TF_GraphGetTensorShape;
 
+// We use TF_GraphSetTensorShape_wrapper instead of
+// TF_GraphSetTensorShape
+%ignore TF_GraphSetTensorShape;
+%unignore tensorflow;
+%unignore TF_GraphSetTensorShape_wrapper;
+
+// $input is a Python list of ints to a vector<int> for TF_GraphSetTensorShape_wrapper
+%typemap(in) (const std::vector<int64_t>& dims)
+    (std::vector<int64_t> dims_local){
+  if ($input != Py_None) {
+    PyObject* py_int_seq = PySequence_Fast($input, tensorflow::strings::Printf(
+          "$symname: expected list but got %s ",
+          Py_TYPE($input)->tp_name).c_str());
+    if (py_int_seq == nullptr) {
+      SWIG_exception_fail(SWIG_RuntimeError, tensorflow::strings::Printf(
+          "$symname: PySequence_Fast returned NULL.").c_str());
+    }
+    PyInt64ListToVector(py_int_seq, &dims_local);
+    Py_DECREF(py_int_seq);
+    $1 = &dims_local;
+  } else {
+    $1 = nullptr;
+  }
+}
+
+// We use TF_GraphGetTensorShape_wrapper instead of
+// TF_GraphGetTensorShape
+%ignore TF_GraphGetTensorShape;
+%unignore tensorflow;
+%unignore TF_GraphGetTensorShape_wrapper;
+
+// Build a Python list of ints and return it.
+%typemap(out) std::vector<int64_t> tensorflow::TF_GraphGetTensorShape_wrapper {
+  $result = PyList_New($1.size());
+  if (!$result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create list");
+  }
+
+  for (size_t i = 0; i < $1.size(); ++i) {
+    PyList_SET_ITEM($result, i, PyInt_FromLong($1[i]));
+  }
+}
+
+// We use TF_GraphSetOutputHandleShapesAndTypes_wrapper instead of
+// TF_GraphSetOutputHandleShapesAndTypes
+%ignore TF_GraphSetOutputHandleShapesAndTypes;
+%unignore tensorflow;
+%unignore TF_GraphSetOutputHandleShapesAndTypes_wrapper;
+
+// The space between the double angle brackets below looks extraneous, but
+// our version of SWIG cannot parse ">>".
+%typemap(in) (const std::vector<std::vector<int64_t> >& shapes)
+    (std::vector<std::vector<int64_t> > shapes_local){
+  PyObject* seq = PySequence_Fast($input, tensorflow::strings::Printf(
+        "$symname: expected list but got %s ",
+        Py_TYPE($input)->tp_name).c_str());
+  if (seq == nullptr) {
+    SWIG_exception_fail(SWIG_RuntimeError, tensorflow::strings::Printf(
+        "$symname: PySequence_Fast returned NULL.").c_str());
+  }
+
+  int size = PySequence_Fast_GET_SIZE(seq);
+  if (size == 0) {
+    SWIG_exception_fail(SWIG_ValueError, tensorflow::strings::Printf(
+        "$symname: shapes list must be non-empty").c_str());
+  }
+
+  for (int i = 0; i < size; ++i) {
+    PyObject* item = PySequence_Fast_GET_ITEM(seq, i);
+    std::vector<int64_t> dims;
+    if (item != Py_None) {
+      PyObject* py_int_seq = PySequence_Fast(item, tensorflow::strings::Printf(
+            "$symname: expected list but got %s ",
+            Py_TYPE($input)->tp_name).c_str());
+      if (py_int_seq == nullptr) {
+        SWIG_exception_fail(SWIG_RuntimeError, tensorflow::strings::Printf(
+            "$symname: PySequence_Fast returned NULL.").c_str());
+      }
+      PyInt64ListToVector(py_int_seq, &dims);
+      Py_DECREF(py_int_seq);
+    }
+    shapes_local.push_back(dims);
+  }
+
+  Py_DECREF(seq);
+  $1 = &shapes_local;
+}
+
+%typemap(in) (const std::vector<int>& ranks)
+    (std::vector<int> ranks_local){
+  PyObject* seq = PySequence_Fast($input, tensorflow::strings::Printf(
+        "$symname: expected list but got %s ",
+        Py_TYPE($input)->tp_name).c_str());
+  if (seq == nullptr) {
+    SWIG_exception_fail(SWIG_RuntimeError, tensorflow::strings::Printf(
+        "$symname: PySequence_Fast returned NULL.").c_str());
+  }
+
+  int size = PySequence_Fast_GET_SIZE(seq);
+  if (size == 0) {
+    SWIG_exception_fail(SWIG_ValueError, tensorflow::strings::Printf(
+        "$symname: shapes list must be non-empty").c_str());
+  }
+
+  for (int i = 0; i < size; ++i) {
+    PyObject* item = PySequence_Fast_GET_ITEM(seq, i);
+    ranks_local.push_back((int) PyInt_AsLong(item));
+  }
+
+  Py_DECREF(seq);
+  $1 = &ranks_local;
+}
+
+%typemap(in) (const std::vector<TF_DataType>& types)
+    (std::vector<TF_DataType> types_local){
+  PyObject* seq = PySequence_Fast($input, tensorflow::strings::Printf(
+        "$symname: expected list but got %s ",
+        Py_TYPE($input)->tp_name).c_str());
+  if (seq == nullptr) {
+    SWIG_exception_fail(SWIG_RuntimeError, tensorflow::strings::Printf(
+        "$symname: PySequence_Fast returned NULL.").c_str());
+  }
+
+  int size = PySequence_Fast_GET_SIZE(seq);
+  if (size == 0) {
+    SWIG_exception_fail(SWIG_ValueError, tensorflow::strings::Printf(
+        "$symname: shapes list must be non-empty").c_str());
+  }
+
+  for (int i = 0; i < size; ++i) {
+    PyObject* item = PySequence_Fast_GET_ITEM(seq, i);
+    types_local.push_back((TF_DataType) PyInt_AsLong(item));
+  }
+
+  Py_DECREF(seq);
+  $1 = &types_local;
+}
+
 %include "tensorflow/python/client/tf_session_helper.h"
 
 %unignoreall
diff --git a/tensorflow/python/client/tf_session_helper.cc b/tensorflow/python/client/tf_session_helper.cc
index ad982e5dd8d4fc2b151ab5e246e8cff3b88304b6..361dbc22b097a9bc82f656d7416b88c4a3a1ec2d 100644
--- a/tensorflow/python/client/tf_session_helper.cc
+++ b/tensorflow/python/client/tf_session_helper.cc
@@ -18,11 +18,16 @@ limitations under the License.
 #include <cstring>
 
 #include "tensorflow/c/c_api.h"
+#include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/tf_status_helper.h"
 #include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/core/coding.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/equal_graph_def.h"
 #include "tensorflow/python/lib/core/ndarray_tensor.h"
@@ -299,6 +304,27 @@ string EqualGraphDefWrapper(const string& actual, const string& expected) {
   return EqualGraphDef(actual_def, expected_def, &diff) ? "" : diff;
 }
 
+string EqualAttrValueWrapper(const string& actual, const string& expected) {
+  AttrValue actual_attr_value;
+  if (!actual_attr_value.ParseFromString(actual)) {
+    return "actual is not a valid serialized AttrValue";
+  }
+
+  AttrValue expected_attr_value;
+  if (!expected_attr_value.ParseFromString(expected)) {
+    return "expected is not a valid serialized AttrValue";
+  }
+
+  string diff;
+  if (!AreAttrValuesEqual(actual_attr_value, expected_attr_value)) {
+    diff = strings::Printf(
+        "Actual AttrValue %s does not match Expected AttrValue %s.",
+        SummarizeAttrValue(actual_attr_value).c_str(),
+        SummarizeAttrValue(expected_attr_value).c_str());
+  }
+  return diff;
+}
+
 // Return value set to 6 inlined elements so it fits in a 64-byte cache line.
 tensorflow::gtl::InlinedVector<int64_t, 6> TF_GraphGetTensorShapeHelper(
     TF_Graph* graph, TF_Output output, TF_Status* out_status,
@@ -374,6 +400,19 @@ std::vector<TF_Operation*> TF_OperationGetControlInputs_wrapper(
   return control_inputs;
 }
 
+std::vector<const char*> TF_OperationOutputConsumers_wrapper(
+    TF_Output oper_out) {
+  int num_consumers = TF_OperationOutputNumConsumers(oper_out);
+  std::vector<TF_Input> consumers(num_consumers);
+  TF_OperationOutputConsumers(oper_out, consumers.data(), num_consumers);
+
+  std::vector<const char*> consumer_names(num_consumers);
+  for (int i = 0; i < num_consumers; ++i) {
+    consumer_names[i] = TF_OperationName(consumers[i].oper);
+  }
+  return consumer_names;
+}
+
 TF_Function* TF_GraphToFunction_wrapper(
     const TF_Graph* fn_body, const char* fn_name, bool append_hash_to_fn_name,
     const std::vector<TF_Operation*>* opers,
@@ -407,4 +446,51 @@ TF_Function* TF_GraphToFunction_wrapper(
                             opts, description, out_status);
 }
 
+void TF_GraphSetOutputHandleShapesAndTypes_wrapper(
+    TF_Graph* graph, TF_Output output,
+    const std::vector<std::vector<int64_t>>& shapes,
+    const std::vector<int>& ranks, const std::vector<TF_DataType>& types,
+    TF_Status* status) {
+  std::vector<const int64_t*> shapes_pointers(shapes.size());
+  for (int i = 0; i < shapes.size(); ++i) {
+    shapes_pointers[i] = ranks[i] <= 0 ? nullptr : &shapes[i][0];
+  }
+  TF_GraphSetOutputHandleShapesAndTypes(graph, output, shapes.size(),
+                                        shapes_pointers.data(), ranks.data(),
+                                        types.data(), status);
+}
+
+void TF_GraphSetTensorShape_wrapper(TF_Graph* graph, TF_Output output,
+                                    const std::vector<int64_t>& dims,
+                                    bool unknown_shape, TF_Status* status) {
+  if (unknown_shape) {
+    TF_GraphSetTensorShape(graph, output, nullptr, -1, status);
+    return;
+  }
+  TF_GraphSetTensorShape(graph, output, dims.data(), dims.size(), status);
+}
+
+std::vector<int64_t> TF_GraphGetTensorShape_wrapper(TF_Graph* graph,
+                                                    TF_Output output,
+                                                    int num_dims,
+                                                    TF_Status* status) {
+  std::vector<int64_t> dims(num_dims);
+  TF_GraphGetTensorShape(graph, output, dims.data(), num_dims, status);
+  return dims;
+}
+
+std::vector<string> TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
+    TF_ImportGraphDefResults* results) {
+  int num_missing_unused_input_mappings;
+  const char** src_names;
+  int* src_indexes;
+  TF_ImportGraphDefResultsMissingUnusedInputMappings(
+      results, &num_missing_unused_input_mappings, &src_names, &src_indexes);
+  std::vector<string> input_strs(num_missing_unused_input_mappings);
+  for (int i = 0; i < num_missing_unused_input_mappings; ++i) {
+    input_strs[i] = TensorId(src_names[i], src_indexes[i]).ToString();
+  }
+  return input_strs;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/python/client/tf_session_helper.h b/tensorflow/python/client/tf_session_helper.h
index 6ed08d3a5818163c03f2bb52157b8371312aba18..29d5b28f40a7c07c199eec8c8cd85de626f6b068 100644
--- a/tensorflow/python/client/tf_session_helper.h
+++ b/tensorflow/python/client/tf_session_helper.h
@@ -97,6 +97,13 @@ void TF_Reset_wrapper(const TF_SessionOptions* opt,
 // for no difference.
 string EqualGraphDefWrapper(const string& actual, const string& expected);
 
+// Convenience wrapper around AreAttrValuesEqual to make it easier to wrap.
+// The actual and expected strings must correspond to a serialized binary
+// representation of two AttrValue proto instances.
+// Returns an explanation if a difference is found, or the empty string
+// for no difference.
+string EqualAttrValueWrapper(const string& actual, const string& expected);
+
 // Gets shape from C API Graph object.
 //
 // If shape is known, returns shape vector where -1 means "unknown
@@ -160,6 +167,11 @@ std::vector<TF_Output> GetOperationInputs(TF_Operation* oper);
 std::vector<TF_Operation*> TF_OperationGetControlInputs_wrapper(
     TF_Operation* oper);
 
+// Retrieves the op names of the consumers of `oper_out`. The returned strings
+// have the lifetime of the underlying TF_Graph.
+std::vector<const char*> TF_OperationOutputConsumers_wrapper(
+    TF_Output oper_out);
+
 // `opers` equaling NULL are converted to `nopers = -1`.
 // `output_names` must be empty or have the same length as `outputs`.
 TF_Function* TF_GraphToFunction_wrapper(
@@ -168,6 +180,39 @@ TF_Function* TF_GraphToFunction_wrapper(
     const std::vector<TF_Output>& inputs, const std::vector<TF_Output>& outputs,
     const NameVector& output_names, const TF_FunctionOptions* opts,
     const char* description, TF_Status* out_status);
+
+// Set the shapes and types for the output's handle.
+//
+// The sizes of 'shapes', 'ranks', and 'types' must be equal; `shapes[i]`
+// contains the shape of the handle's i-th value, `ranks[i]` contains the i-th
+// shape's rank, and `types[i]` contains the i-th value's dtype. If the i-th
+// shape is unknown, then `ranks[i]` must be equal to -1.
+//
+// The space between the double angle brackets below looks extraneous, but
+// our version of SWIG cannot parse ">>".
+void TF_GraphSetOutputHandleShapesAndTypes_wrapper(
+    TF_Graph* graph, TF_Output output,
+    const std::vector<std::vector<int64_t> >& shapes,
+    const std::vector<int>& ranks, const std::vector<TF_DataType>& types,
+    TF_Status* status);
+
+// Set the shape of output. If unknown is true, `num_dims` must be set to
+// -1 and `dims` is set to nullptr.
+void TF_GraphSetTensorShape_wrapper(TF_Graph* graph, TF_Output output,
+                                    const std::vector<int64_t>& dims,
+                                    bool unknown_shape, TF_Status* status);
+
+// Return the shape of output. `num_dims` should be the output of
+// TF_GraphGetTensorNumDims. If `num_dims = -1`, this should not be called.
+std::vector<int64_t> TF_GraphGetTensorShape_wrapper(TF_Graph* graph,
+                                                    TF_Output output,
+                                                    int num_dims,
+                                                    TF_Status* status);
+
+// Returns the string representations of the missing unused input mappings.
+std::vector<string> TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
+    TF_ImportGraphDefResults* results);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_PYTHON_CLIENT_TF_SESSION_HELPER_H_
diff --git a/tensorflow/python/client/virtual_gpu_test.py b/tensorflow/python/client/virtual_gpu_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..addf63474c9ba213cf0c1eeffa9d31e94f15eac1
--- /dev/null
+++ b/tensorflow/python/client/virtual_gpu_test.py
@@ -0,0 +1,245 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for multiple virtual GPU support."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import random
+
+import numpy as np
+
+from google.protobuf import text_format
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
+
+
+class VirtualGpuTestUtil(object):
+
+  def __init__(self,
+               dim=1000,
+               num_ops=100,
+               virtual_devices_per_gpu=None,
+               device_probabilities=None):
+    self._dim = dim
+    self._num_ops = num_ops
+    if virtual_devices_per_gpu is None:
+      self._virtual_devices_per_gpu = [3]
+    else:
+      self._virtual_devices_per_gpu = virtual_devices_per_gpu
+    self._visible_device_list = [
+        i for i in range(len(self._virtual_devices_per_gpu))
+    ]
+    gpu_devices = [
+        ('/gpu:' + str(i)) for i in range(sum(self._virtual_devices_per_gpu))
+    ]
+    self.devices = ['/cpu:0'] + gpu_devices
+    self._num_devices = len(self.devices)
+    # Each virtual device gets 2GB memory.
+    self._mem_limits_mb = [
+        ([1 << 11] * i) for i in self._virtual_devices_per_gpu
+    ]
+    self.config = self._GetSessionConfig()
+
+    if device_probabilities is not None:
+      self._device_probabilities = list(device_probabilities)  # Deep copy
+      for i in range(1, self._num_devices):
+        self._device_probabilities[i] += self._device_probabilities[i - 1]
+    else:
+      # Each device gets same probability to be assigned an operation.
+      step = 1.0 / self._num_devices
+      self._device_probabilities = [
+          (x + 1) * step for x in range(self._num_devices)
+      ]
+    # To prevent rounding error causing problems.
+    self._device_probabilities[self._num_devices - 1] = 1.1
+
+    logging.info('dim: %d', self._dim)
+    logging.info('num_ops: %d', self._num_ops)
+    logging.info('visible_device_list: %s', str(self._visible_device_list))
+    logging.info('virtual_devices_per_gpu: %s',
+                 str(self._virtual_devices_per_gpu))
+    logging.info('mem_limits: %s', str(self._mem_limits_mb))
+    logging.info('devices: %s', str(self.devices))
+    logging.info('config: %s', text_format.MessageToString(self.config))
+    logging.info('device_probabilities: %s', str(self._device_probabilities))
+
+  # Creates virtual GPU devices
+  def _GetSessionConfig(self):
+    virtual_device_gpu_options = config_pb2.GPUOptions(
+        visible_device_list=','.join(str(d) for d in self._visible_device_list),
+        experimental=config_pb2.GPUOptions.Experimental(virtual_devices=[
+            config_pb2.GPUOptions.Experimental.VirtualDevices(
+                memory_limit_mb=i) for i in self._mem_limits_mb
+        ]))
+    return config_pb2.ConfigProto(gpu_options=virtual_device_gpu_options)
+
+  # Generates a list of 3-tuples, each tuple contains the source and destination
+  # device index for a binary operation like 'add', like:
+  # (src_devcie_1, src_device_2, dst_device)
+  def _GenerateOperationPlacement(self):
+    result = []
+    for unused_i in range(self._num_ops):
+      op_device = ()
+      for unused_j in range(3):
+        random_num = random.random()
+        for device_index in range(self._num_devices):
+          if self._device_probabilities[device_index] > random_num:
+            op_device += (device_index,)
+            break
+      result.append(op_device)
+    return result
+
+  # Logs part of the matrix for debugging purposes.
+  def _LogMatrix(self, mat, dim):
+    logging.info('---- printing the first 10*10 submatrix ----')
+    for i in range(min(10, dim)):
+      row = ''
+      for j in range(min(10, dim)):
+        row += ' ' + str(mat[i][j])
+      logging.info(row)
+
+  # Runs a list of 'add' operations where each operation satisfies the device
+  # placement constraints in `op_placement`, and returns the result.
+  def _TestRandomGraphWithDevices(self,
+                                  sess,
+                                  seed,
+                                  op_placement,
+                                  devices,
+                                  debug_mode=False):
+    data = []
+    shape = (self._dim, self._dim)
+    feed_dict = {}
+    # Initialize the matrices
+    for i in range(len(devices)):
+      with ops.device(devices[i]):
+        var = array_ops.placeholder(dtypes.float32, shape=shape)
+        np.random.seed(seed + i)
+        feed_dict[var] = np.random.uniform(
+            low=0, high=0.1, size=shape).astype(np.float32)
+        data.append(var)
+    # Run the 'add' operations on those matrices
+    for op in op_placement:
+      with ops.device(devices[op[2]]):
+        data[op[2]] = math_ops.add(data[op[0]], data[op[1]])
+    with ops.device('/cpu:0'):
+      s = data[0]
+      for i in range(1, len(data)):
+        s = math_ops.add(s, data[i])
+    if debug_mode:
+      logging.info(ops.get_default_graph().as_graph_def())
+    result = sess.run(s, feed_dict=feed_dict)
+    self._LogMatrix(result, self._dim)
+    return result
+
+  # Generates a random graph with `self._num_ops` 'add' operations with each
+  # operation placed on different virtual device, test that the result is
+  # identical to the result obtained by running the same graph on cpu only.
+  def TestRandomGraph(self, sess, op_placement=None, random_seed=None):
+    debug_mode = False
+    if op_placement is None:
+      op_placement = self._GenerateOperationPlacement()
+    else:
+      debug_mode = True
+    if random_seed is None:
+      random_seed = random.randint(0, 1 << 31)
+    else:
+      debug_mode = True
+    logging.info('Virtual gpu functional test for random graph...')
+    logging.info('operation placement: %s', str(op_placement))
+    logging.info('random seed: %d', random_seed)
+
+    # Run with multiple virtual gpus.
+    result_vgd = self._TestRandomGraphWithDevices(
+        sess, random_seed, op_placement, self.devices, debug_mode=debug_mode)
+    # Run with single cpu.
+    result_cpu = self._TestRandomGraphWithDevices(
+        sess,
+        random_seed,
+        op_placement, ['/cpu:0'] * self._num_devices,
+        debug_mode=debug_mode)
+    # Test the result
+    for i in range(self._dim):
+      for j in range(self._dim):
+        if result_vgd[i][j] != result_cpu[i][j]:
+          logging.error(
+              'Result mismatch at row %d column %d: expected %f, actual %f', i,
+              j, result_cpu[i][j], result_vgd[i][j])
+          logging.error('Devices: %s', self.devices)
+          logging.error('Memory limits (in MB): %s', self._mem_limits_mb)
+          return False
+    return True
+
+
+@test_util.with_c_api
+class VirtualGpuTest(test_util.TensorFlowTestCase):
+
+  def __init__(self, method_name):
+    super(VirtualGpuTest, self).__init__(method_name)
+    self._util = VirtualGpuTestUtil()
+
+  def testStatsContainAllDeviceNames(self):
+    with self.test_session(config=self._util.config) as sess:
+      # TODO(laigd): b/70811538. The is_gpu_available() call will invoke
+      # DeviceFactory::AddDevices() with a default SessionOption, which prevents
+      # adding virtual devices in the future, thus must be called within a
+      # context of a session within which virtual devices are created. Same in
+      # the following test case.
+      if not test.is_gpu_available(cuda_only=True):
+        self.skipTest('No GPU available')
+      run_options = config_pb2.RunOptions(
+          trace_level=config_pb2.RunOptions.FULL_TRACE)
+      run_metadata = config_pb2.RunMetadata()
+
+      mat_shape = [10, 10]
+      data = []
+      for d in self._util.devices:
+        with ops.device(d):
+          var = variables.Variable(random_ops.random_uniform(mat_shape))
+          sess.run(var.initializer)
+          data.append(var)
+      s = data[0]
+      for i in range(1, len(data)):
+        s = math_ops.add(s, data[i])
+      sess.run(s, options=run_options, run_metadata=run_metadata)
+
+    self.assertTrue(run_metadata.HasField('step_stats'))
+    step_stats = run_metadata.step_stats
+    devices = [d.device for d in step_stats.dev_stats]
+    self.assertTrue('/job:localhost/replica:0/task:0/device:CPU:0' in devices)
+    self.assertTrue('/job:localhost/replica:0/task:0/device:GPU:0' in devices)
+    self.assertTrue('/job:localhost/replica:0/task:0/device:GPU:1' in devices)
+    self.assertTrue('/job:localhost/replica:0/task:0/device:GPU:2' in devices)
+
+  def testLargeRandomGraph(self):
+    with self.test_session(config=self._util.config) as sess:
+      if not test.is_gpu_available(cuda_only=True):
+        self.skipTest('No GPU available')
+      for _ in range(10):
+        if not self._util.TestRandomGraph(sess):
+          return
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..8b8adefa65a5c54d40bc28d8f50953513cfd3605
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -0,0 +1,381 @@
+# Tests of TensorFlow kernels written using the Python API.
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+load("//tensorflow:tensorflow.bzl", "sycl_py_test")
+
+tf_py_test(
+    name = "batch_dataset_op_test",
+    size = "small",
+    srcs = ["batch_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "dataset_constructor_op_test",
+    size = "small",
+    srcs = ["dataset_constructor_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+    tags = [
+        "manual",
+        "nomac",  # b/62040583
+    ],
+)
+
+tf_py_test(
+    name = "dataset_from_generator_op_test",
+    size = "medium",
+    srcs = ["dataset_from_generator_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+tf_py_test(
+    name = "filter_dataset_op_test",
+    size = "small",
+    srcs = ["filter_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "flat_map_dataset_op_test",
+    size = "small",
+    srcs = ["flat_map_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:session",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:training",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+    grpc_enabled = True,
+)
+
+tf_py_test(
+    name = "list_files_dataset_op_test",
+    size = "small",
+    srcs = ["list_files_dataset_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "interleave_dataset_op_test",
+    size = "small",
+    srcs = ["interleave_dataset_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:session",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:training",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "map_dataset_op_test",
+    size = "small",
+    srcs = ["map_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:lookup_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "prefetch_dataset_op_test",
+    size = "small",
+    srcs = ["prefetch_dataset_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "range_dataset_op_test",
+    size = "small",
+    srcs = ["range_dataset_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+    ],
+)
+
+tf_py_test(
+    name = "reader_dataset_ops_test",
+    size = "small",
+    srcs = ["reader_dataset_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/ops:readers",
+    ],
+)
+
+tf_py_test(
+    name = "sequence_dataset_op_test",
+    size = "small",
+    srcs = ["sequence_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "shuffle_dataset_op_test",
+    size = "small",
+    srcs = ["shuffle_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+    ],
+)
+
+tf_py_test(
+    name = "shard_dataset_op_test",
+    size = "small",
+    srcs = ["shard_dataset_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "cache_dataset_op_test",
+    size = "small",
+    srcs = ["cache_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+    ],
+)
+
+tf_py_test(
+    name = "zip_dataset_op_test",
+    size = "small",
+    srcs = ["zip_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "concatenate_dataset_op_test",
+    size = "small",
+    srcs = ["concatenate_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+    ],
+)
+
+tf_py_test(
+    name = "iterator_ops_test",
+    size = "small",
+    srcs = ["iterator_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python/data/ops:readers",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:function",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:training",
+    ],
+    grpc_enabled = True,
+)
+
+tf_py_test(
+    name = "iterator_ops_cluster_test",
+    size = "small",
+    srcs = ["iterator_ops_cluster_test.py"],
+    additional_deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:function",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:lookup_ops",
+    ],
+    grpc_enabled = True,
+    tags = [
+        "no_oss",  # Test flaky due to port collisions.
+        "no_windows",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
similarity index 78%
rename from tensorflow/python/kernel_tests/batch_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
index 0546218601d6d1eac3658f86b58af43c2d5a5f04..bd80b9dbf561de16168b05facf0086dadcda6444 100644
--- a/tensorflow/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -50,8 +51,9 @@ class BatchDatasetTest(test.TestCase):
     def _map_fn(x, y, z):
       return math_ops.square(x), math_ops.square(y), math_ops.square(z)
 
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-                .repeat(count).batch(batch_size).make_initializable_iterator())
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
+        .repeat(count).batch(batch_size).make_initializable_iterator())
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -67,7 +69,7 @@ class BatchDatasetTest(test.TestCase):
         result = sess.run(get_next)
         for component, result_component in zip(components, result):
           for j in range(14):
-            self.assertAllEqual(component[(i*14 + j) % 7]**2,
+            self.assertAllEqual(component[(i * 14 + j) % 7]**2,
                                 result_component[j])
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
@@ -82,12 +84,12 @@ class BatchDatasetTest(test.TestCase):
         result = sess.run(get_next)
         for component, result_component in zip(components, result):
           for j in range(8):
-            self.assertAllEqual(component[(i*8 + j) % 7]**2,
+            self.assertAllEqual(component[(i * 8 + j) % 7]**2,
                                 result_component[j])
       result = sess.run(get_next)
       for component, result_component in zip(components, result):
         for j in range((14 * 7) % 8):
-          self.assertAllEqual(component[((num_batches - 1)*8 + j) % 7]**2,
+          self.assertAllEqual(component[((num_batches - 1) * 8 + j) % 7]**2,
                               result_component[j])
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
@@ -187,14 +189,35 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def testBatchShapeError(self):
+
+    def generator():
+      yield [1.0, 2.0, 3.0]
+      yield [4.0, 5.0, 6.0]
+      yield [7.0, 8.0, 9.0, 10.0]
+
+    iterator = (
+        dataset_ops.Dataset.from_generator(
+            generator, dtypes.float32, output_shapes=[None]).batch(3)
+        .make_initializable_iterator())
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(iterator.initializer)
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'Cannot batch tensors with different shapes in component 0. '
+          r'First element had shape \[3\] and element 2 had shape \[4\].'):
+        sess.run(next_element)
+
   def testPaddedBatchDataset(self):
     seq_lens = array_ops.placeholder(dtypes.int32, shape=[None])
     padded_shape = array_ops.placeholder(dtypes.int64, shape=[1])
 
-    iterator = (dataset_ops.Dataset.from_tensor_slices(seq_lens)
-                .map(lambda x: array_ops.fill([x], x)).padded_batch(
-                    4,
-                    padded_shapes=padded_shape).make_initializable_iterator())
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(seq_lens)
+        .map(lambda x: array_ops.fill([x], x)).padded_batch(
+            4, padded_shapes=padded_shape).make_initializable_iterator())
 
     init_op = iterator.initializer
     get_next = iterator.get_next()
@@ -202,35 +225,40 @@ class BatchDatasetTest(test.TestCase):
     with self.test_session() as sess:
       # Test with random sequence lengths, and max padding.
       random_seq_lens = np.random.randint(20, size=(32,)).astype(np.int32)
-      sess.run(init_op, feed_dict={padded_shape: [-1],
-                                   seq_lens: random_seq_lens})
+      sess.run(
+          init_op, feed_dict={
+              padded_shape: [-1],
+              seq_lens: random_seq_lens
+          })
       for i in range(8):
         result = sess.run(get_next)
         padded_len = np.max(result)
         self.assertEqual((4, padded_len), result.shape)
         for j in range(4):
-          seq_len = random_seq_lens[(i*4)+j]
+          seq_len = random_seq_lens[(i * 4) + j]
           self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
           self.assertAllEqual(result[j, seq_len:], [0] * (padded_len - seq_len))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
       # Test with random sequence lengths, and constant padding.
-      sess.run(init_op, feed_dict={padded_shape: [25],
-                                   seq_lens: random_seq_lens})
+      sess.run(
+          init_op, feed_dict={
+              padded_shape: [25],
+              seq_lens: random_seq_lens
+          })
       for i in range(8):
         result = sess.run(get_next)
         self.assertEqual((4, 25), result.shape)
         for j in range(4):
-          seq_len = random_seq_lens[(i*4)+j]
+          seq_len = random_seq_lens[(i * 4) + j]
           self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
           self.assertAllEqual(result[j, seq_len:], [0] * (25 - seq_len))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
       # Test correct handling of empty tensors.
-      sess.run(init_op, feed_dict={padded_shape: [-1],
-                                   seq_lens: [0, 0, 0, 0]})
+      sess.run(init_op, feed_dict={padded_shape: [-1], seq_lens: [0, 0, 0, 0]})
       result = sess.run(get_next)
       self.assertAllEqual([[], [], [], []], result)
       with self.assertRaises(errors.OutOfRangeError):
@@ -238,8 +266,7 @@ class BatchDatasetTest(test.TestCase):
 
       # Test error handling with constant sequence lengths, and
       # too-short padding.
-      sess.run(init_op, feed_dict={padded_shape: [5],
-                                   seq_lens: [6, 5, 5, 5]})
+      sess.run(init_op, feed_dict={padded_shape: [5], seq_lens: [6, 5, 5, 5]})
       with self.assertRaises(errors.DataLossError):
         result = sess.run(get_next)
 
@@ -250,11 +277,13 @@ class BatchDatasetTest(test.TestCase):
     def fill_tuple(x):
       filled = array_ops.fill([x], x)
       return (filled, string_ops.as_string(filled))
-    iterator = (dataset_ops.Dataset.from_tensor_slices(seq_lens).map(fill_tuple)
-                .padded_batch(
-                    4,
-                    padded_shapes=(padded_shape, padded_shape),
-                    padding_values=(-1, "<end>")).make_initializable_iterator())
+
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(seq_lens).map(fill_tuple)
+        .padded_batch(
+            4,
+            padded_shapes=(padded_shape, padded_shape),
+            padding_values=(-1, '<end>')).make_initializable_iterator())
 
     init_op = iterator.initializer
     get_next = iterator.get_next()
@@ -262,25 +291,46 @@ class BatchDatasetTest(test.TestCase):
     with self.test_session() as sess:
       # Test with random sequence lengths, and max padding.
       random_seq_lens = np.random.randint(20, size=(32,)).astype(np.int32)
-      sess.run(init_op, feed_dict={padded_shape: [-1],
-                                   seq_lens: random_seq_lens})
+      sess.run(
+          init_op, feed_dict={
+              padded_shape: [-1],
+              seq_lens: random_seq_lens
+          })
       for i in range(8):
         result = sess.run(get_next)
         padded_len = np.max(result[0])
         self.assertEqual((4, padded_len), result[0].shape)
         self.assertEqual((4, padded_len), result[1].shape)
         for j in range(4):
-          seq_len = random_seq_lens[(i*4)+j]
+          seq_len = random_seq_lens[(i * 4) + j]
           self.assertAllEqual(result[0][j, :seq_len], [seq_len] * seq_len)
           self.assertAllEqual(result[0][j, seq_len:],
                               [-1] * (padded_len - seq_len))
           self.assertAllEqual(result[1][j, :seq_len],
                               [compat.as_bytes(str(seq_len))] * seq_len)
           self.assertAllEqual(result[1][j, seq_len:],
-                              [b"<end>"] * (padded_len - seq_len))
+                              [b'<end>'] * (padded_len - seq_len))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def testPaddedBatchDatasetUnicode(self):
+    # See GitHub issue 16149
+    def generator():
+      data = [[u'Простой', u'тест', u'юникода'],
+              [u'никогда', u'не', u'бывает', u'простым']]
+
+      for seq in data:
+        yield seq, [0, 1, 2, 3]
+
+    dataset = dataset_ops.Dataset.from_generator(
+        generator, (dtypes.string, dtypes.int32),
+        (tensor_shape.TensorShape([None]), tensor_shape.TensorShape([None])))
+    padded_dataset = dataset.padded_batch(
+        2, padded_shapes=([None], [None]), padding_values=('', 0))
+    with self.test_session() as sess:
+      next_element = padded_dataset.make_one_shot_iterator().get_next()
+      sess.run(next_element)
+
   def testPaddedBatchDatasetShapeSpecifications(self):
     int_placeholder = array_ops.placeholder(dtypes.int32)
     float_placeholder = array_ops.placeholder(dtypes.float32)
@@ -304,15 +354,16 @@ class BatchDatasetTest(test.TestCase):
                        constant_op.constant([-1, -1], dtype=dtypes.int64),
                        constant_op.constant([37], dtype=dtypes.int64)))
 
-    for dataset in [dynamic_padding_from_tensor_shapes,
-                    dynamic_padding_from_lists,
-                    dynamic_padding_from_lists_with_minus_one,
-                    dynamic_padding_from_tensors]:
+    for dataset in [
+        dynamic_padding_from_tensor_shapes, dynamic_padding_from_lists,
+        dynamic_padding_from_lists_with_minus_one, dynamic_padding_from_tensors
+    ]:
       self.assertEqual([None, None], dataset.output_shapes[0].as_list())
       self.assertEqual([None, None, None], dataset.output_shapes[1].as_list())
       self.assertEqual([None, 37], dataset.output_shapes[2].as_list())
 
   def testPaddedBatchSparseError(self):
+
     def _map_fn(i):
       return sparse_tensor.SparseTensorValue(
           indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i
@@ -321,5 +372,5 @@ class BatchDatasetTest(test.TestCase):
       _ = dataset_ops.Dataset.range(10).map(_map_fn).padded_batch(10)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/kernel_tests/cache_dataset_op_test.py b/tensorflow/python/data/kernel_tests/cache_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/cache_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/cache_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/concatenate_dataset_op_test.py b/tensorflow/python/data/kernel_tests/concatenate_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/concatenate_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/concatenate_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/dataset_constructor_op_test.py b/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py
similarity index 77%
rename from tensorflow/python/kernel_tests/dataset_constructor_op_test.py
rename to tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py
index 9e2a62055051d7cea269fcbcdb697a895b33d821..14627810b57f68fd96e3e3cc7b51b4fbf7365299 100644
--- a/tensorflow/python/kernel_tests/dataset_constructor_op_test.py
+++ b/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py
@@ -17,6 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import time
+
 import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
@@ -37,7 +39,7 @@ from tensorflow.python.platform import test
 class DatasetConstructorTest(test.TestCase):
 
   def testFromTensors(self):
-    """Test an dataset that represents a single tuple of tensors."""
+    """Test a dataset that represents a single tuple of tensors."""
     components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
 
     iterator = (dataset_ops.Dataset.from_tensors(components)
@@ -62,7 +64,7 @@ class DatasetConstructorTest(test.TestCase):
     self.assertAllEqual(a.dense_shape, b.dense_shape)
 
   def testFromTensorsSparse(self):
-    """Test an dataset that represents a single tuple of tensors."""
+    """Test a dataset that represents a single tuple of tensors."""
     components = (sparse_tensor.SparseTensorValue(
         indices=np.array([[0]]),
         values=np.array([0]),
@@ -125,7 +127,7 @@ class DatasetConstructorTest(test.TestCase):
         sess.run(get_next)
 
   def testFromTensorSlices(self):
-    """Test an dataset that represents the slices from a tuple of tensors."""
+    """Test a dataset that represents the slices from a tuple of tensors."""
     components = (
         np.tile(np.array([[1], [2], [3], [4]]), 20), np.tile(
             np.array([[12], [13], [14], [15]]), 22),
@@ -150,7 +152,7 @@ class DatasetConstructorTest(test.TestCase):
         sess.run(get_next)
 
   def testFromTensorSlicesSparse(self):
-    """Test an dataset that represents the slices from a tuple of tensors."""
+    """Test a dataset that represents the slices from a tuple of tensors."""
     components = (sparse_tensor.SparseTensorValue(
         indices=np.array([[0, 0], [1, 0], [2, 0]]),
         values=np.array([0, 0, 0]),
@@ -206,7 +208,7 @@ class DatasetConstructorTest(test.TestCase):
         sess.run(get_next)
 
   def testFromTensorSlicesMixed(self):
-    """Test an dataset that represents the slices from a tuple of tensors."""
+    """Test a dataset that represents the slices from a tuple of tensors."""
     components = (np.tile(np.array([[1], [2], [3]]), 20),
                   np.tile(np.array([[12], [13], [14]]), 22),
                   np.array([37.0, 38.0, 39.0]),
@@ -484,11 +486,168 @@ class DatasetConstructorTest(test.TestCase):
       sess.run(var_1.initializer)
 
       iterator = dataset.make_initializable_iterator()
+      sess.run(iterator.initializer)
 
       with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          "Trying to access resource located in device"):
-        sess.run(iterator.initializer)
+          errors.FailedPreconditionError,
+          "Error while reading resource variable Variable"):
+        sess.run(iterator.get_next())
+
+
+class DatasetConstructorBenchmark(test.Benchmark):
+
+  def benchmarkSliceRepeatBatch(self):
+    input_size = 10000
+    batch_size = 100
+    num_epochs = 100
+
+    input_data = np.random.randn(input_size)
+
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(input_data)
+        .repeat(num_epochs + 1).batch(batch_size))
+    iterator = dataset.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    with session.Session() as sess:
+      sess.run(iterator.initializer)
+      # Run one whole epoch to burn in the computation.
+      for _ in range(input_size // batch_size):
+        sess.run(next_element)
+      deltas = []
+      try:
+        while True:
+          start = time.time()
+          sess.run(next_element)
+          deltas.append(time.time() - start)
+      except errors.OutOfRangeError:
+        pass
+
+    median_wall_time = np.median(deltas)
+    print("Slice/repeat/batch with sess.run() input size: %d batch size: %d "
+          "Median wall time per element: %f" % (input_size, batch_size,
+                                                median_wall_time))
+    self.report_benchmark(
+        iters=len(deltas),
+        wall_time=median_wall_time,
+        name="benchmark_slice_repeat_batch_input_%d_batch_%d" % (input_size,
+                                                                 batch_size))
+
+  def benchmarkSliceRepeatBatchCallable(self):
+    input_size = 10000
+    batch_size = 100
+    num_epochs = 100
+
+    input_data = np.random.randn(input_size)
+
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(input_data)
+        .repeat(num_epochs + 1).batch(batch_size))
+    iterator = dataset.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    with session.Session() as sess:
+      sess.run(iterator.initializer)
+      get_next_element = sess.make_callable(next_element)
+      # Run one whole epoch to burn in the computation.
+      for _ in range(input_size // batch_size):
+        get_next_element()
+      deltas = []
+      try:
+        while True:
+          start = time.time()
+          get_next_element()
+          deltas.append(time.time() - start)
+      except errors.OutOfRangeError:
+        pass
+
+    median_wall_time = np.median(deltas)
+    print(
+        "Slice/repeat/batch with callable input size: %d batch size: %d Median"
+        " wall time per element: %f" % (input_size, batch_size,
+                                        median_wall_time))
+    self.report_benchmark(
+        iters=len(deltas),
+        wall_time=median_wall_time,
+        name="benchmark_slice_repeat_batch_callable_input_%d_batch_%d" %
+        (input_size, batch_size))
+
+  def benchmarkReshapeSliceRepeatCallable(self):
+    input_size = 10000
+    batch_size = 100
+    num_epochs = 100
+
+    input_data = np.random.randn(input_size)
+
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(input_data.reshape(100, 100))
+        .repeat(num_epochs + 1))
+    iterator = dataset.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    with session.Session() as sess:
+      sess.run(iterator.initializer)
+      get_next_element = sess.make_callable(next_element)
+      # Run one whole epoch to burn in the computation.
+      for _ in range(input_size // batch_size):
+        get_next_element()
+      deltas = []
+      try:
+        while True:
+          start = time.time()
+          get_next_element()
+          deltas.append(time.time() - start)
+      except errors.OutOfRangeError:
+        pass
+
+    median_wall_time = np.median(deltas)
+    print("Reshape/slice/repeat with callable input size: %d batch size: %d "
+          "Median wall time per element: %f" % (input_size, batch_size,
+                                                median_wall_time))
+    self.report_benchmark(
+        iters=len(deltas),
+        wall_time=median_wall_time,
+        name="benchmark_reshape_slice_repeat_callable_input_%d_batch_%d" %
+        (input_size, batch_size))
+
+  def benchmarkSliceBatchCacheRepeatCallable(self):
+    input_size = 10000
+    batch_size = 100
+    num_epochs = 100
+
+    input_data = np.random.randn(input_size)
+
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(input_data).batch(batch_size)
+        .cache().repeat(num_epochs + 1))
+    iterator = dataset.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    with session.Session() as sess:
+      sess.run(iterator.initializer)
+      get_next_element = sess.make_callable(next_element)
+      # Run one whole epoch to burn in the computation.
+      for _ in range(input_size // batch_size):
+        get_next_element()
+      deltas = []
+      try:
+        while True:
+          start = time.time()
+          get_next_element()
+          deltas.append(time.time() - start)
+      except errors.OutOfRangeError:
+        pass
+
+    median_wall_time = np.median(deltas)
+    print(
+        "Slice/batch/cache/repeat with callable input size: %d batch size: %d "
+        "Median wall time per element: %f"
+        % (input_size, batch_size, median_wall_time))
+    self.report_benchmark(
+        iters=len(deltas),
+        wall_time=median_wall_time,
+        name="benchmark_slice_batch_cache_repeat_callable_input_%d_batch_%d" %
+        (input_size, batch_size))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/dataset_from_generator_op_test.py b/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/dataset_from_generator_op_test.py
rename to tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py
diff --git a/tensorflow/python/kernel_tests/filter_dataset_op_test.py b/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/filter_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/flat_map_dataset_op_test.py b/tensorflow/python/data/kernel_tests/flat_map_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/flat_map_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/flat_map_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/python/data/kernel_tests/interleave_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/interleave_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/interleave_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/iterator_ops_cluster_test.py b/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
similarity index 64%
rename from tensorflow/python/kernel_tests/iterator_ops_cluster_test.py
rename to tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
index 45dfa13720b09c7bba979b72a339c13dcd2d827b..25c91b42dc65f849a680e65fc7fc2548c1cea8ea 100644
--- a/tensorflow/python/kernel_tests/iterator_ops_cluster_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
@@ -17,10 +17,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import function
@@ -28,6 +31,9 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
 
@@ -103,6 +109,67 @@ class IteratorClusterTest(test.TestCase):
                                    "/job:worker/replica:0/task:1/cpu:0",
                                    workers[0].target)
 
+  def testCaptureHashTableInSharedIterator(self):
+    worker, _ = test_util.create_local_cluster(1, 1)
+
+    # NOTE(mrry): We must use the V2 variants of `HashTable`
+    # etc. because these produce a `tf.resource`-typed output that is
+    # compatible with the in-graph function implementation.
+    default_val = -1
+    keys = constant_op.constant(["brain", "salad", "surgery"])
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    table = lookup_ops.HashTable(
+        lookup_ops.KeyValueTensorInitializer(keys, values),
+        default_val,
+        shared_name="shared_table")
+
+    input_sentences = dataset_ops.Dataset.from_tensor_slices(
+        ["brain brain tank salad surgery", "surgery brain"])
+
+    iterator = (
+        input_sentences.map(lambda x: string_ops.string_split([x]).values).map(
+            table.lookup)
+        .make_initializable_iterator(shared_name="shared_iterator"))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with session.Session(worker[0].target) as sess:
+      sess.run(table.init)
+      sess.run(init_op)
+      self.assertAllEqual([0, 0, -1, 1, 2], sess.run(get_next))
+
+    with session.Session(worker[0].target) as sess:
+      self.assertAllEqual([2, 0], sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testImplicitDisposeParallelMapDataset(self):
+    # Tests whether a parallel map dataset will be cleaned up correctly when
+    # the pipeline does not run it until exhaustion.
+    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
+    # RepeatDataset(None) -> PrefetchDataset(100).
+    worker, _ = test_util.create_local_cluster(1, 1)
+
+    components = (np.arange(1000),
+                  np.array([[1, 2, 3]]) * np.arange(1000)[:, np.newaxis],
+                  np.array(37.0) * np.arange(1000))
+
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
+        .repeat(None).prefetch(10000))
+
+    iterator = dataset.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with session.Session(worker[0].target) as sess:
+      sess.run(init_op)
+      for _ in range(3):
+        sess.run(get_next)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/iterator_ops_test.py b/tensorflow/python/data/kernel_tests/iterator_ops_test.py
similarity index 98%
rename from tensorflow/python/kernel_tests/iterator_ops_test.py
rename to tensorflow/python/data/kernel_tests/iterator_ops_test.py
index 513c36d64fa3e8aa00410b7fd06fa2e061aec4c5..23c6d7385f8d4a12019fa514f349f2598d9629de 100644
--- a/tensorflow/python/kernel_tests/iterator_ops_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_ops_test.py
@@ -18,6 +18,8 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import warnings
+
 import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
@@ -633,6 +635,18 @@ class IteratorTest(test.TestCase):
         with self.assertRaises(errors.InvalidArgumentError):
           sess.run(restore_op)
 
+  def testRepeatedGetNextWarning(self):
+    iterator = dataset_ops.Dataset.range(10).make_one_shot_iterator()
+    warnings.simplefilter("always")
+    with warnings.catch_warnings(record=True) as w:
+      for _ in range(100):
+        iterator.get_next()
+    self.assertEqual(100 - iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD,
+                     len(w))
+    for warning in w:
+      self.assertTrue(
+          iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE in str(warning.message))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/list_files_dataset_op_test.py b/tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/list_files_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/map_dataset_op_test.py b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
similarity index 99%
rename from tensorflow/python/kernel_tests/map_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/map_dataset_op_test.py
index ad6bbc043db9e44ec7893cd9ae29898a8c7fedaa..04d1abdb254feea1df6f1b8cfc5a512802107224 100644
--- a/tensorflow/python/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
@@ -361,11 +361,12 @@ class MapDatasetTest(test.TestCase):
                 .map(lambda _: counter_var.assign_add(1))
                 .make_initializable_iterator())
     init_op = iterator.initializer
+    get_next = iterator.get_next()
 
     with self.test_session() as sess:
-      with self.assertRaisesRegexp(errors.FailedPreconditionError,
-                                   "Failed to capture resource"):
-        sess.run(init_op)
+      sess.run(init_op)
+      with self.assertRaises(errors.NotFoundError):
+        sess.run(get_next)
 
   def testSeededStatefulOperatorIsProperlyStateful(self):
     iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10)
diff --git a/tensorflow/python/kernel_tests/prefetch_dataset_op_test.py b/tensorflow/python/data/kernel_tests/prefetch_dataset_op_test.py
similarity index 94%
rename from tensorflow/python/kernel_tests/prefetch_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/prefetch_dataset_op_test.py
index edea9c9027e72db33074adc31af71dc74e578f3b..646324cb95df6fc1fa0a901ebdccc8d4ef74a66c 100644
--- a/tensorflow/python/kernel_tests/prefetch_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/prefetch_dataset_op_test.py
@@ -25,10 +25,11 @@ from tensorflow.python.platform import test
 
 
 class PrefetchDatasetTest(test.TestCase):
+
   def testBufferSize(self):
     buffer_size = array_ops.placeholder(dtypes.int64, shape=[])
     iterator = dataset_ops.Dataset.range(10).prefetch(
-      buffer_size=buffer_size).make_initializable_iterator()
+        buffer_size=buffer_size).make_initializable_iterator()
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -42,7 +43,7 @@ class PrefetchDatasetTest(test.TestCase):
   def testInvalidBufferSize(self):
     buffer_size = array_ops.placeholder(dtypes.int64, shape=[])
     iterator = dataset_ops.Dataset.range(10).prefetch(
-      buffer_size=buffer_size).make_initializable_iterator()
+        buffer_size=buffer_size).make_initializable_iterator()
     init_op = iterator.initializer
 
     with self.assertRaisesRegexp(errors.InvalidArgumentError, "buffer_size"):
diff --git a/tensorflow/python/kernel_tests/range_dataset_op_test.py b/tensorflow/python/data/kernel_tests/range_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/range_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/range_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py
similarity index 97%
rename from tensorflow/python/kernel_tests/reader_dataset_ops_test.py
rename to tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py
index c8e7333b4b9949b6b6ef5f7f6d63e5ff8c354c37..d7140088c310767d40bd2cf3413c899375acab15 100644
--- a/tensorflow/python/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py
@@ -272,6 +272,24 @@ class FixedLengthRecordReaderTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(iterator.get_next())
 
+  def testFixedLengthRecordDatasetWrongSize(self):
+    test_filenames = self._createFiles()
+    dataset = readers.FixedLengthRecordDataset(
+        test_filenames,
+        self._record_bytes + 1,  # Incorrect record length.
+        self._header_bytes,
+        self._footer_bytes,
+        buffer_size=10)
+    iterator = dataset.make_one_shot_iterator()
+
+    with self.test_session() as sess:
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r"Excluding the header \(5 bytes\) and footer \(2 bytes\), input "
+          r"file \".*fixed_length_record.0.txt\" has body length 21 bytes, "
+          r"which is not an exact multiple of the record length \(4 bytes\)."):
+        sess.run(iterator.get_next())
+
   def _iterator_checkpoint_path(self):
     return os.path.join(self.get_temp_dir(), "iterator")
 
diff --git a/tensorflow/python/kernel_tests/sequence_dataset_op_test.py b/tensorflow/python/data/kernel_tests/sequence_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/sequence_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/sequence_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/shard_dataset_op_test.py b/tensorflow/python/data/kernel_tests/shard_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/shard_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/shard_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/shuffle_dataset_op_test.py b/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/shuffle_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/zip_dataset_op_test.py b/tensorflow/python/data/kernel_tests/zip_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/zip_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/zip_dataset_op_test.py
diff --git a/tensorflow/python/data/ops/BUILD b/tensorflow/python/data/ops/BUILD
index 05acfe4de7855f398d4e14f7478f5909f3e20431..f12b358a7dc35c18338171e489fa88ba1a82d11b 100644
--- a/tensorflow/python/data/ops/BUILD
+++ b/tensorflow/python/data/ops/BUILD
@@ -21,6 +21,7 @@ py_library(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:tensor_util",
+        "//tensorflow/python:util",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:sparse",
         "//third_party/py/numpy",
@@ -33,11 +34,11 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":dataset_ops",
-        "//tensorflow/python:constant_op",
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/util:convert",
     ],
 )
 
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index dbe29c087a40b8797013067e63df3e6ad0a08889..b665443b7acb9eb266b6fcf36a002cfce54875f1 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -40,9 +40,11 @@ from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import gen_io_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
-from tensorflow.python.ops import sparse_ops
+from tensorflow.python.util import deprecation
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("data.Dataset")
 class Dataset(object):
   """Represents a potentially large set of elements.
 
@@ -201,7 +203,7 @@ class Dataset(object):
       tensors: A nested structure of tensors.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return TensorDataset(tensors)
 
@@ -214,11 +216,12 @@ class Dataset(object):
         0th dimension.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return TensorSliceDataset(tensors)
 
   @staticmethod
+  @deprecation.deprecated(None, "Use `tf.data.Dataset.from_tensor_slices()`.")
   def from_sparse_tensor_slices(sparse_tensor):
     """Splits each rank-N `tf.SparseTensor` in this dataset row-wise.
 
@@ -226,7 +229,7 @@ class Dataset(object):
       sparse_tensor: A `tf.SparseTensor`.
 
     Returns:
-      A `Dataset` of rank-(N-1) sparse tensors.
+      Dataset: A `Dataset` of rank-(N-1) sparse tensors.
     """
     return SparseTensorSliceDataset(sparse_tensor)
 
@@ -285,6 +288,23 @@ class Dataset(object):
     sess.run(value)  # (2, array([1, 1]))
     ```
 
+    NOTE: The current implementation of `Dataset.from_generator()` uses
+    @{tf.py_func} and inherits the same constraints. In particular, it
+    requires the `Dataset`- and `Iterator`-related operations to be placed
+    on a device in the same process as the Python program that called
+    `Dataset.from_generator()`. The body of `generator` will not be
+    serialized in a `GraphDef`, and you should not use this method if you
+    need to serialize your model and restore it in a different environment.
+
+    NOTE: If `generator` depends on mutable global variables or other external
+    state, be aware that the runtime may invoke `generator` multiple times
+    (in order to support repeating the `Dataset`) and at any time
+    between the call to `Dataset.from_generator()` and the production of the
+    first element from the generator. Mutating global variables or external
+    state can cause undefined behavior, and we recommend that you explicitly
+    cache any external state in `generator` before calling
+    `Dataset.from_generator()`.
+
     Args:
       generator: A callable object that takes no arguments and returns an
         object that supports the `iter()` protocol.
@@ -295,7 +315,7 @@ class Dataset(object):
         `generator`.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     if not callable(generator):
       raise TypeError("`generator` must be callable.")
@@ -438,7 +458,7 @@ class Dataset(object):
         len(args) == 3 -> start = args[0], stop = args[1, stop = args[2]
 
     Returns:
-      A `RangeDataset`.
+      Dataset: A `RangeDataset`.
 
     Raises:
       ValueError: if len(args) == 0.
@@ -482,7 +502,7 @@ class Dataset(object):
       datasets: A nested structure of datasets.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return ZipDataset(datasets)
 
@@ -508,7 +528,7 @@ class Dataset(object):
       dataset: `Dataset` to be concatenated.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return ConcatenateDataset(self, dataset)
 
@@ -520,7 +540,7 @@ class Dataset(object):
         maximum number elements that will be buffered when prefetching.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return PrefetchDataset(self, buffer_size)
 
@@ -538,12 +558,14 @@ class Dataset(object):
         - /path/to/dir/b.py
         - /path/to/dir/c.py
 
+    NOTE: The order of the file names returned can be non-deterministic.
+
     Args:
       file_pattern: A string or scalar string `tf.Tensor`, representing
         the filename pattern that will be matched.
 
     Returns:
-     A `Dataset` of strings corresponding to file names.
+     Dataset: A `Dataset` of strings corresponding to file names.
     """
     return Dataset.from_tensor_slices(gen_io_ops.matching_files(file_pattern))
 
@@ -560,7 +582,7 @@ class Dataset(object):
         indefinitely.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return RepeatDataset(self, count)
 
@@ -584,7 +606,7 @@ class Dataset(object):
         iterated over. (Defaults to `True`.)
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return ShuffleDataset(self, buffer_size, seed, reshuffle_each_iteration)
 
@@ -597,7 +619,7 @@ class Dataset(object):
         If a filename is not provided, the dataset will be cached in memory.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return CacheDataset(self, filename)
 
@@ -611,7 +633,7 @@ class Dataset(object):
         dataset, the new dataset will contain all elements of this dataset.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return TakeDataset(self, count)
 
@@ -626,7 +648,7 @@ class Dataset(object):
         is -1, skips the entire dataset.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return SkipDataset(self, count)
 
@@ -673,7 +695,7 @@ class Dataset(object):
       index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
 
     Raises:
       ValueError: if `num_shards` or `index` are illegal values. Note: error
@@ -706,12 +728,18 @@ class Dataset(object):
   def batch(self, batch_size):
     """Combines consecutive elements of this dataset into batches.
 
+    NOTE: If the number of elements (`N`) in this dataset is not an exact
+    multiple of `batch_size`, the final batch contain smaller tensors with
+    shape `N % batch_size` in the batch dimension. If your program depends on
+    the batches having the same shape, consider using the
+    @{tf.contrib.data.batch_and_drop_remainder} transformation instead.
+
     Args:
       batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
         consecutive elements of this dataset to combine in a single batch.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return BatchDataset(self, batch_size)
 
@@ -740,12 +768,12 @@ class Dataset(object):
         the empty string for string types.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return PaddedBatchDataset(self, batch_size, padded_shapes, padding_values)
 
   def map(self, map_func, num_parallel_calls=None):
-    """Maps `map_func` across this datset.
+    """Maps `map_func` across this dataset.
 
     Args:
       map_func: A function mapping a nested structure of tensors (having
@@ -756,7 +784,7 @@ class Dataset(object):
         specified, elements will be processed sequentially.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     if num_parallel_calls is None:
       return MapDataset(self, map_func)
@@ -772,7 +800,7 @@ class Dataset(object):
         `Dataset`.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return FlatMapDataset(self, map_func)
 
@@ -785,7 +813,7 @@ class Dataset(object):
     ```python
     # Preprocess 4 files concurrently, and interleave blocks of 16 records from
     # each file.
-    filenames = ["/var/data/file1.txt", "/var/data/file2.txt", ..."]
+    filenames = ["/var/data/file1.txt", "/var/data/file2.txt", ...]
     dataset = (Dataset.from_tensor_slices(filenames)
                .interleave(lambda x:
                    TextLineDataset(x).map(parse_fn, num_parallel_calls=1),
@@ -841,7 +869,7 @@ class Dataset(object):
         input element before cycling to another input element.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return InterleaveDataset(self, map_func, cycle_length, block_length)
 
@@ -854,7 +882,7 @@ class Dataset(object):
         scalar `tf.bool` tensor.
 
     Returns:
-      A `Dataset`.
+      Dataset: A `Dataset`.
     """
     return FilterDataset(self, predicate)
 
@@ -875,10 +903,11 @@ class Dataset(object):
 
     Args:
       transformation_func: A function that takes one `Dataset` argument and
-        returns a `Dataset`.
+          returns a `Dataset`.
 
     Returns:
-      The `Dataset` returned by applying `transformation_func` to this dataset.
+      Dataset: The `Dataset` returned by applying `transformation_func` to this
+          dataset.
     """
     dataset = transformation_func(self)
     if not isinstance(dataset, Dataset):
@@ -944,11 +973,7 @@ class TensorSliceDataset(Dataset):
     batch_dim = flat_tensors[0].get_shape()[0]
     for t in flat_tensors[1:]:
       batch_dim.assert_is_compatible_with(t.get_shape()[0])
-    self._tensors = nest.pack_sequence_as(tensors, [
-        sparse_ops.serialize_many_sparse(tensor)
-        if sparse_tensor_lib.is_sparse(tensor) else tensor
-        for tensor in nest.flatten(tensors)
-    ])
+    self._tensors = sparse.serialize_many_sparse_tensors(tensors)
     self._output_classes = sparse.get_classes(tensors)
     self._output_shapes = nest.pack_sequence_as(
         tensors, [t.get_shape()[1:] for t in nest.flatten(tensors)])
@@ -1233,7 +1258,26 @@ class ShuffleDataset(Dataset):
                buffer_size,
                seed=None,
                reshuffle_each_iteration=None):
-    """See `Dataset.shuffle()` for details."""
+    """Randomly shuffles the elements of this dataset.
+
+    Args:
+      input_dataset: The input dataset.
+      buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the
+        number of elements from this dataset from which the new
+        dataset will sample.
+      seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
+        random seed that will be used to create the distribution. See
+        @{tf.set_random_seed} for behavior.
+      reshuffle_each_iteration: (Optional.) A boolean, which if true indicates
+        that the dataset should be pseudorandomly reshuffled each time it is
+        iterated over. (Defaults to `True`.)
+
+    Returns:
+      A `Dataset`.
+
+    Raises:
+      ValueError: if invalid arguments are provided.
+    """
     super(ShuffleDataset, self).__init__()
     self._input_dataset = input_dataset
     self._buffer_size = ops.convert_to_tensor(
@@ -1415,6 +1459,19 @@ def _padding_value_to_tensor(value, output_type):
   return value
 
 
+def _default_padding(input_dataset):
+
+  def make_zero(t):
+    if t.base_dtype == dtypes.string:
+      return ""
+    elif t.base_dtype == dtypes.variant:
+      raise TypeError("Unable to create padding for field of type 'variant'")
+    else:
+      return np.zeros_like(t.as_numpy_dtype())
+
+  return nest.map_structure(make_zero, input_dataset.output_types)
+
+
 class PaddedBatchDataset(Dataset):
   """A `Dataset` that batches and pads contiguous elements from its input."""
 
@@ -1430,23 +1487,13 @@ class PaddedBatchDataset(Dataset):
         batch_size, dtype=dtypes.int64, name="batch_size")
     padding_values = (
         padding_values
-        if padding_values is not None else self._default_padding(input_dataset))
+        if padding_values is not None else _default_padding(input_dataset))
     self._padded_shapes = nest.map_structure_up_to(
         input_dataset.output_shapes, _partial_shape_to_tensor, padded_shapes)
     self._padding_values = nest.map_structure_up_to(
         input_dataset.output_shapes, _padding_value_to_tensor, padding_values,
         input_dataset.output_types)
 
-  def _default_padding(self, input_dataset):
-
-    def make_zero(t):
-      if t.base_dtype == dtypes.string:
-        return ""
-      else:
-        return np.zeros_like(t.as_numpy_dtype())
-
-    return nest.map_structure(make_zero, input_dataset.output_types)
-
   def _as_variant_tensor(self):
     return gen_dataset_ops.padded_batch_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py
index 663bed07b257b4ccdd657370e370c0f4e2fdf77b..4756ec74820bace5bea4e1f41ebe214420fe5c3d 100644
--- a/tensorflow/python/data/ops/iterator_ops.py
+++ b/tensorflow/python/data/ops/iterator_ops.py
@@ -17,14 +17,39 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import warnings
+
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_dataset_ops
-
-
+from tensorflow.python.util.tf_export import tf_export
+
+
+# NOTE(mrry): It is legitimate to call `Iterator.get_next()` multiple
+# times, e.g. when you are distributing different elements to multiple
+# devices in a single step. However, a common pitfall arises when
+# users call `Iterator.get_next()` in each iteration of their training
+# loop. `Iterator.get_next()` adds ops to the graph, and executing
+# each op allocates resources (including threads); as a consequence,
+# invoking it in every iteration of a training loop causes slowdown
+# and eventual resource exhaustion. To guard against this outcome, we
+# log a warning when the number of uses crosses a threshold of suspicion.
+GET_NEXT_CALL_WARNING_THRESHOLD = 32
+
+GET_NEXT_CALL_WARNING_MESSAGE = (
+    "An unusually high number of `Iterator.get_next()` calls was detected. "
+    "This often indicates that `Iterator.get_next()` is being called inside "
+    "a training loop, which will cause gradual slowdown and eventual resource "
+    "exhaustion. If this is the case, restructure your code to call "
+    "`next_element = iterator.get_next()` once outside the loop, and use "
+    "`next_element` as the input to some computation that is invoked inside "
+    "the loop.")
+
+
+@tf_export("data.Iterator")
 class Iterator(object):
   """Represents the state of iterating through a `Dataset`."""
 
@@ -56,6 +81,7 @@ class Iterator(object):
     self._output_shapes = output_shapes
     self._string_handle = gen_dataset_ops.iterator_to_string_handle(
         self._iterator_resource)
+    self._get_next_call_count = 0
 
   @staticmethod
   def from_structure(output_types,
@@ -142,8 +168,10 @@ class Iterator(object):
     iterator_resource = gen_dataset_ops.iterator(
         container="",
         shared_name=shared_name,
-        output_types=nest.flatten(output_types),
-        output_shapes=nest.flatten(output_shapes))
+        output_types=nest.flatten(
+            sparse.as_dense_types(output_types, output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(output_shapes, output_classes)))
     return Iterator(iterator_resource, None, output_types, output_shapes,
                     output_classes)
 
@@ -209,8 +237,10 @@ class Iterator(object):
     string_handle = ops.convert_to_tensor(string_handle, dtype=dtypes.string)
     iterator_resource = gen_dataset_ops.iterator_from_string_handle(
         string_handle,
-        output_types=nest.flatten(output_types),
-        output_shapes=nest.flatten(output_shapes))
+        output_types=nest.flatten(
+            sparse.as_dense_types(output_types, output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(output_shapes, output_classes)))
     return Iterator(iterator_resource, None, output_types, output_shapes,
                     output_classes)
 
@@ -274,7 +304,42 @@ class Iterator(object):
           dataset._as_variant_tensor(), self._iterator_resource, name=name)  # pylint: disable=protected-access
 
   def get_next(self, name=None):
-    """Returns a nested structure of `tf.Tensor`s containing the next element.
+    """Returns a nested structure of `tf.Tensor`s representing the next element.
+
+    In graph mode, you should typically call this method *once* and use its
+    result as the input to another computation. A typical loop will then call
+    @{tf.Session.run} on the result of that computation. The loop will terminate
+    when the `Iterator.get_next()` operation raises
+    @{tf.errors.OutOfRangeError}. The following skeleton shows how to use
+    this method when building a training loop:
+
+    ```python
+    dataset = ...  # A `tf.data.Dataset` object.
+    iterator = dataset.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    # Build a TensorFlow graph that does something with each element.
+    loss = model_function(next_element)
+    optimizer = ...  # A `tf.train.Optimizer` object.
+    train_op = optimizer.minimize(loss)
+
+    with tf.Session() as sess:
+      try:
+        while True:
+          sess.run(train_op)
+      except tf.errors.OutOfRangeError:
+        pass
+    ```
+
+    NOTE: It is legitimate to call `Iterator.get_next()` multiple times, e.g.
+    when you are distributing different elements to multiple devices in a single
+    step. However, a common pitfall arises when users call `Iterator.get_next()`
+    in each iteration of their training loop. `Iterator.get_next()` adds ops to
+    the graph, and executing each op allocates resources (including threads); as
+    a consequence, invoking it in every iteration of a training loop causes
+    slowdown and eventual resource exhaustion. To guard against this outcome, we
+    log a warning when the number of uses crosses a fixed threshold of
+    suspiciousness.
 
     Args:
       name: (Optional.) A name for the created operation.
@@ -282,6 +347,10 @@ class Iterator(object):
     Returns:
       A nested structure of `tf.Tensor` objects.
     """
+    self._get_next_call_count += 1
+    if self._get_next_call_count > GET_NEXT_CALL_WARNING_THRESHOLD:
+      warnings.warn(GET_NEXT_CALL_WARNING_MESSAGE)
+
     return sparse.deserialize_sparse_tensors(
         nest.pack_sequence_as(self._output_types,
                               gen_dataset_ops.iterator_get_next(
diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py
index c6fb8531aea13850524e6b9a83911d7afe950395..fa7601741b11f018e9b53ed3b77a7561be50d3f4 100644
--- a/tensorflow/python/data/ops/readers.py
+++ b/tensorflow/python/data/ops/readers.py
@@ -18,29 +18,19 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.ops.dataset_ops import Dataset
-from tensorflow.python.framework import constant_op
+from tensorflow.python.data.util import convert
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 # TODO(b/64974358): Increase default buffer size to 256 MB.
 _DEFAULT_READER_BUFFER_SIZE_BYTES = 256 * 1024  # 256 KB
 
 
-def _convert_optional_param_to_tensor(argument_name,
-                                      argument_value,
-                                      argument_default=0,
-                                      argument_dtype=dtypes.int64):
-  if argument_value is not None:
-    return ops.convert_to_tensor(
-        argument_value, dtype=argument_dtype, name=argument_name)
-  else:
-    return constant_op.constant(
-        argument_default, dtype=argument_dtype, name=argument_name)
-
-
+@tf_export("data.TextLineDataset")
 class TextLineDataset(Dataset):
   """A `Dataset` comprising lines from one or more text files."""
 
@@ -58,12 +48,12 @@ class TextLineDataset(Dataset):
     super(TextLineDataset, self).__init__()
     self._filenames = ops.convert_to_tensor(
         filenames, dtype=dtypes.string, name="filenames")
-    self._compression_type = _convert_optional_param_to_tensor(
+    self._compression_type = convert.optional_param_to_tensor(
         "compression_type",
         compression_type,
         argument_default="",
         argument_dtype=dtypes.string)
-    self._buffer_size = _convert_optional_param_to_tensor(
+    self._buffer_size = convert.optional_param_to_tensor(
         "buffer_size", buffer_size, _DEFAULT_READER_BUFFER_SIZE_BYTES)
 
   def _as_variant_tensor(self):
@@ -83,6 +73,7 @@ class TextLineDataset(Dataset):
     return dtypes.string
 
 
+@tf_export("data.TFRecordDataset")
 class TFRecordDataset(Dataset):
   """A `Dataset` comprising records from one or more TFRecord files."""
 
@@ -100,12 +91,12 @@ class TFRecordDataset(Dataset):
     # Force the type to string even if filenames is an empty list.
     self._filenames = ops.convert_to_tensor(
         filenames, dtypes.string, name="filenames")
-    self._compression_type = _convert_optional_param_to_tensor(
+    self._compression_type = convert.optional_param_to_tensor(
         "compression_type",
         compression_type,
         argument_default="",
         argument_dtype=dtypes.string)
-    self._buffer_size = _convert_optional_param_to_tensor(
+    self._buffer_size = convert.optional_param_to_tensor(
         "buffer_size",
         buffer_size,
         argument_default=_DEFAULT_READER_BUFFER_SIZE_BYTES)
@@ -127,6 +118,7 @@ class TFRecordDataset(Dataset):
     return dtypes.string
 
 
+@tf_export("data.FixedLengthRecordDataset")
 class FixedLengthRecordDataset(Dataset):
   """A `Dataset` of fixed-length records from one or more binary files."""
 
@@ -155,11 +147,11 @@ class FixedLengthRecordDataset(Dataset):
     self._record_bytes = ops.convert_to_tensor(
         record_bytes, dtype=dtypes.int64, name="record_bytes")
 
-    self._header_bytes = _convert_optional_param_to_tensor(
+    self._header_bytes = convert.optional_param_to_tensor(
         "header_bytes", header_bytes)
-    self._footer_bytes = _convert_optional_param_to_tensor(
+    self._footer_bytes = convert.optional_param_to_tensor(
         "footer_bytes", footer_bytes)
-    self._buffer_size = _convert_optional_param_to_tensor(
+    self._buffer_size = convert.optional_param_to_tensor(
         "buffer_size", buffer_size, _DEFAULT_READER_BUFFER_SIZE_BYTES)
 
   def _as_variant_tensor(self):
diff --git a/tensorflow/python/data/util/BUILD b/tensorflow/python/data/util/BUILD
index f7d7fe98d3eca10b6481e3c0f7d08b42e95ef81a..e32c7b54a48dd887c2748897c3ce3661aab9f497 100644
--- a/tensorflow/python/data/util/BUILD
+++ b/tensorflow/python/data/util/BUILD
@@ -62,6 +62,30 @@ py_test(
     ],
 )
 
+py_library(
+    name = "convert",
+    srcs = ["convert.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
+py_test(
+    name = "convert_test",
+    size = "small",
+    srcs = ["convert_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":convert",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:util",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/python/data/util/convert.py b/tensorflow/python/data/util/convert.py
new file mode 100644
index 0000000000000000000000000000000000000000..eeb1d700f3c67a1a2ab627aa8a291755bc2127e4
--- /dev/null
+++ b/tensorflow/python/data/util/convert.py
@@ -0,0 +1,34 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Helpers constructing Datasets."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+
+
+def optional_param_to_tensor(argument_name,
+                             argument_value,
+                             argument_default=0,
+                             argument_dtype=dtypes.int64):
+  if argument_value is not None:
+    return ops.convert_to_tensor(
+        argument_value, dtype=argument_dtype, name=argument_name)
+  else:
+    return constant_op.constant(
+        argument_default, dtype=argument_dtype, name=argument_name)
diff --git a/tensorflow/python/data/util/convert_test.py b/tensorflow/python/data/util/convert_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cb6488070eb422f6c8d56ca5d712cbdf09fa883
--- /dev/null
+++ b/tensorflow/python/data/util/convert_test.py
@@ -0,0 +1,53 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for utilities working with user input."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.util import convert
+from tensorflow.python.framework import dtypes
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+class ConvertTest(test.TestCase):
+
+  def testInteger(self):
+    resp = convert.optional_param_to_tensor("foo", 3)
+    with self.test_session() as sess:
+      self.assertEqual(3, sess.run(resp))
+
+  def testIntegerDefault(self):
+    resp = convert.optional_param_to_tensor("foo", None)
+    with self.test_session() as sess:
+      self.assertEqual(0, sess.run(resp))
+
+  def testStringDefault(self):
+    resp = convert.optional_param_to_tensor("bar", None, "default",
+                                            dtypes.string)
+    with self.test_session() as sess:
+      self.assertEqual(compat.as_bytes("default"), sess.run(resp))
+
+  def testString(self):
+    resp = convert.optional_param_to_tensor("bar", "value", "default",
+                                            dtypes.string)
+    with self.test_session() as sess:
+      self.assertEqual(compat.as_bytes("value"), sess.run(resp))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/util/nest.py b/tensorflow/python/data/util/nest.py
index bd7ab3d34f826450a1de8821286c1237a2b5dedd..e90ce3fb40af68fb68d6ee8bac6892848d8c5a79 100644
--- a/tensorflow/python/data/util/nest.py
+++ b/tensorflow/python/data/util/nest.py
@@ -266,7 +266,7 @@ def map_structure(func, *structure, **check_types_dict):
   and the return value will contain the results in the same structure.
 
   Args:
-    func: A callable that acceps as many arguments are there are structures.
+    func: A callable that accepts as many arguments are there are structures.
     *structure: scalar, or tuple or list of constructed scalars and/or other
       tuples/lists, or scalars.  Note: numpy arrays are considered scalars.
     **check_types_dict: only valid keyword argument is `check_types`. If set to
@@ -379,12 +379,12 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
     if check_types and isinstance(shallow_tree, dict):
       if set(input_tree) != set(shallow_tree):
         raise ValueError(
-          "The two structures don't have the same keys. Input "
-          "structure has keys %s, while shallow structure has keys %s."
-          % (list(_six.iterkeys(input_tree)),
+            "The two structures don't have the same keys. Input "
+            "structure has keys %s, while shallow structure has keys %s." %
+            (list(_six.iterkeys(input_tree)),
              list(_six.iterkeys(shallow_tree))))
-      input_tree = list(_six.iteritems(input_tree))
-      shallow_tree = list(_six.iteritems(shallow_tree))
+      input_tree = list(sorted(_six.iteritems(input_tree)))
+      shallow_tree = list(sorted(_six.iteritems(shallow_tree)))
 
     for shallow_branch, input_branch in zip(shallow_tree, input_tree):
       assert_shallow_structure(shallow_branch, input_branch,
@@ -479,8 +479,8 @@ def map_structure_up_to(shallow_tree, func, *inputs):
   The `inputs`, can be thought of as having the same structure as
   `shallow_tree`, but with leaf nodes that are themselves tree structures.
 
-  This function therefore will return something with the same base structure as
-  `shallow_tree`.
+  This function, therefore, will return something with the same base structure
+  as `shallow_tree`.
 
   Examples:
 
diff --git a/tensorflow/python/data/util/nest_test.py b/tensorflow/python/data/util/nest_test.py
index 8c84d9d1dfc1d398e1068a6ca2c13a99e9fadcb9..ff380815a4a32192de621888199e66355f9b4635 100644
--- a/tensorflow/python/data/util/nest_test.py
+++ b/tensorflow/python/data/util/nest_test.py
@@ -271,11 +271,16 @@ class NestTest(test.TestCase):
     inp_ab1 = {"a": (1, 1), "b": {"c": (2, 2)}}
     inp_ab2 = {"a": (1, 1), "b": {"d": (2, 2)}}
     expected_message = (
-        "The two structures don't have the same keys. Input "
-        "structure has keys \['c'\], while shallow structure has keys \['d'\].")
+        r"The two structures don't have the same keys. Input "
+        r"structure has keys \['c'\], while shallow structure has "
+        r"keys \['d'\].")
     with self.assertRaisesRegexp(ValueError, expected_message):
       nest.assert_shallow_structure(inp_ab2, inp_ab1)
 
+    inp_ab = collections.OrderedDict([("a", 1), ("b", (2, 3))])
+    inp_ba = collections.OrderedDict([("b", (2, 3)), ("a", 1)])
+    nest.assert_shallow_structure(inp_ab, inp_ba)
+
   def testFlattenUpTo(self):
     input_tree = (((2, 2), (3, 3)), ((4, 9), (5, 5)))
     shallow_tree = ((True, True), (False, True))
diff --git a/tensorflow/python/data/util/sparse.py b/tensorflow/python/data/util/sparse.py
index b4219198d3bce612c7fde926e780ad779f1076d9..5e6d22470978d97c5e73640e86d3f8b82cbc1b60 100644
--- a/tensorflow/python/data/util/sparse.py
+++ b/tensorflow/python/data/util/sparse.py
@@ -57,7 +57,7 @@ def as_dense_shapes(shapes, classes):
 
 
 def as_dense_types(types, classes):
-  """Converts sparse tensor types to `dtypes.string`.
+  """Converts sparse tensor types to `dtypes.variant`.
 
   Args:
     types: a structure of types to convert.
@@ -65,11 +65,11 @@ def as_dense_types(types, classes):
 
   Returns:
     a structure matching the nested structure of `types`, containing
-    `dtypes.string` at positions where `classes` contains `tf.SparseTensor` and
+    `dtypes.variant` at positions where `classes` contains `tf.SparseTensor` and
     matching contents of `types` otherwise
   """
   ret = nest.pack_sequence_as(types, [
-      dtypes.string if c is sparse_tensor.SparseTensor else ty
+      dtypes.variant if c is sparse_tensor.SparseTensor else ty
       for ty, c in zip(nest.flatten(types), nest.flatten(classes))
   ])
   return ret
@@ -116,6 +116,24 @@ def get_classes(tensors):
   ])
 
 
+def serialize_many_sparse_tensors(tensors):
+  """Serializes many sparse tensors into a batch.
+
+  Args:
+    tensors: a tensor structure to serialize.
+
+  Returns:
+    `tensors` with any sparse tensors replaced by the serialized batch.
+  """
+
+  ret = nest.pack_sequence_as(tensors, [
+      sparse_ops.serialize_many_sparse(tensor, out_type=dtypes.variant)
+      if sparse_tensor.is_sparse(tensor) else tensor
+      for tensor in nest.flatten(tensors)
+  ])
+  return ret
+
+
 def serialize_sparse_tensors(tensors):
   """Serializes sparse tensors.
 
@@ -123,11 +141,11 @@ def serialize_sparse_tensors(tensors):
     tensors: a tensor structure to serialize.
 
   Returns:
-    `tensors` with any sparse tensors replaced by the their serialized version.
+    `tensors` with any sparse tensors replaced by their serialized version.
   """
 
   ret = nest.pack_sequence_as(tensors, [
-      sparse_ops.serialize_sparse(tensor)
+      sparse_ops.serialize_sparse(tensor, out_type=dtypes.variant)
       if isinstance(tensor, sparse_tensor.SparseTensor) else tensor
       for tensor in nest.flatten(tensors)
   ])
diff --git a/tensorflow/python/data/util/sparse_test.py b/tensorflow/python/data/util/sparse_test.py
index a707570bab71357aa982aaed5c3d175e763f6b5f..d49b3ff34bd0ebd6beef1bea168dad22059317be 100644
--- a/tensorflow/python/data/util/sparse_test.py
+++ b/tensorflow/python/data/util/sparse_test.py
@@ -168,7 +168,7 @@ class SparseTest(test.TestCase):
         {
             "types": dtypes.int32,
             "classes": sparse_tensor.SparseTensor,
-            "expected": dtypes.string
+            "expected": dtypes.variant
         },
         {
             "types": (dtypes.int32),
@@ -178,7 +178,7 @@ class SparseTest(test.TestCase):
         {
             "types": (dtypes.int32),
             "classes": (sparse_tensor.SparseTensor),
-            "expected": (dtypes.string)
+            "expected": (dtypes.variant)
         },
         {
             "types": (dtypes.int32, ()),
@@ -193,12 +193,12 @@ class SparseTest(test.TestCase):
         {
             "types": (dtypes.int32, ()),
             "classes": (sparse_tensor.SparseTensor, ()),
-            "expected": (dtypes.string, ())
+            "expected": (dtypes.variant, ())
         },
         {
             "types": ((), dtypes.int32),
             "classes": ((), sparse_tensor.SparseTensor),
-            "expected": ((), dtypes.string)
+            "expected": ((), dtypes.variant)
         },
         {
             "types": (dtypes.int32, (), dtypes.int32),
@@ -209,7 +209,7 @@ class SparseTest(test.TestCase):
             "types": (dtypes.int32, (), dtypes.int32),
             "classes": (sparse_tensor.SparseTensor, (),
                         sparse_tensor.SparseTensor),
-            "expected": (dtypes.string, (), dtypes.string)
+            "expected": (dtypes.variant, (), dtypes.variant)
         },
         {
             "types": ((), dtypes.int32, ()),
@@ -219,7 +219,7 @@ class SparseTest(test.TestCase):
         {
             "types": ((), dtypes.int32, ()),
             "classes": ((), sparse_tensor.SparseTensor, ()),
-            "expected": ((), dtypes.string, ())
+            "expected": ((), dtypes.variant, ())
         },
     )
     for test_case in test_cases:
@@ -227,45 +227,6 @@ class SparseTest(test.TestCase):
           sparse.as_dense_types(test_case["types"], test_case["classes"]),
           test_case["expected"])
 
-  def assertSparseValuesEqual(self, a, b):
-    if not isinstance(a, sparse_tensor.SparseTensor):
-      self.assertFalse(isinstance(b, sparse_tensor.SparseTensor))
-      self.assertEqual(a, b)
-      return
-    self.assertTrue(isinstance(b, sparse_tensor.SparseTensor))
-    with self.test_session():
-      self.assertAllEqual(a.eval().indices, b.eval().indices)
-      self.assertAllEqual(a.eval().values, b.eval().values)
-      self.assertAllEqual(a.eval().dense_shape, b.eval().dense_shape)
-
-  def testSerializeDeserialize(self):
-    test_cases = (
-        (),
-        sparse_tensor.SparseTensor(
-            indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
-        sparse_tensor.SparseTensor(
-            indices=[[3, 4]], values=[-1], dense_shape=[4, 5]),
-        sparse_tensor.SparseTensor(
-            indices=[[0, 0], [3, 4]], values=[1, -1], dense_shape=[4, 5]),
-        (sparse_tensor.SparseTensor(
-            indices=[[0, 0]], values=[1], dense_shape=[1, 1])),
-        (sparse_tensor.SparseTensor(
-            indices=[[0, 0]], values=[1], dense_shape=[1, 1]), ()),
-        ((), sparse_tensor.SparseTensor(
-            indices=[[0, 0]], values=[1], dense_shape=[1, 1])),
-    )
-    for expected in test_cases:
-      classes = sparse.get_classes(expected)
-      shapes = nest.map_structure(lambda _: tensor_shape.TensorShape(None),
-                                  classes)
-      types = nest.map_structure(lambda _: dtypes.int32, classes)
-      actual = sparse.deserialize_sparse_tensors(
-          sparse.serialize_sparse_tensors(expected), types, shapes,
-          sparse.get_classes(expected))
-      nest.assert_same_structure(expected, actual)
-      for a, e in zip(nest.flatten(actual), nest.flatten(expected)):
-        self.assertSparseValuesEqual(a, e)
-
   def testGetClasses(self):
     s = sparse_tensor.SparseTensor(indices=[[0]], values=[1], dense_shape=[1])
     d = ops.Tensor
@@ -324,6 +285,75 @@ class SparseTest(test.TestCase):
       self.assertEqual(
           sparse.get_classes(test_case["classes"]), test_case["expected"])
 
+  def assertSparseValuesEqual(self, a, b):
+    if not isinstance(a, sparse_tensor.SparseTensor):
+      self.assertFalse(isinstance(b, sparse_tensor.SparseTensor))
+      self.assertEqual(a, b)
+      return
+    self.assertTrue(isinstance(b, sparse_tensor.SparseTensor))
+    with self.test_session():
+      self.assertAllEqual(a.eval().indices, b.eval().indices)
+      self.assertAllEqual(a.eval().values, b.eval().values)
+      self.assertAllEqual(a.eval().dense_shape, b.eval().dense_shape)
+
+  def testSerializeDeserialize(self):
+    test_cases = (
+        (),
+        sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
+        sparse_tensor.SparseTensor(
+            indices=[[3, 4]], values=[-1], dense_shape=[4, 5]),
+        sparse_tensor.SparseTensor(
+            indices=[[0, 0], [3, 4]], values=[1, -1], dense_shape=[4, 5]),
+        (sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1])),
+        (sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1]), ()),
+        ((),
+         sparse_tensor.SparseTensor(
+             indices=[[0, 0]], values=[1], dense_shape=[1, 1])),
+    )
+    for expected in test_cases:
+      classes = sparse.get_classes(expected)
+      shapes = nest.map_structure(lambda _: tensor_shape.TensorShape(None),
+                                  classes)
+      types = nest.map_structure(lambda _: dtypes.int32, classes)
+      actual = sparse.deserialize_sparse_tensors(
+          sparse.serialize_sparse_tensors(expected), types, shapes,
+          sparse.get_classes(expected))
+      nest.assert_same_structure(expected, actual)
+      for a, e in zip(nest.flatten(actual), nest.flatten(expected)):
+        self.assertSparseValuesEqual(a, e)
+
+  def testSerializeManyDeserialize(self):
+    test_cases = (
+        (),
+        sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
+        sparse_tensor.SparseTensor(
+            indices=[[3, 4]], values=[-1], dense_shape=[4, 5]),
+        sparse_tensor.SparseTensor(
+            indices=[[0, 0], [3, 4]], values=[1, -1], dense_shape=[4, 5]),
+        (sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1])),
+        (sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1]), ()),
+        ((),
+         sparse_tensor.SparseTensor(
+             indices=[[0, 0]], values=[1], dense_shape=[1, 1])),
+    )
+    for expected in test_cases:
+      classes = sparse.get_classes(expected)
+      shapes = nest.map_structure(lambda _: tensor_shape.TensorShape(None),
+                                  classes)
+      types = nest.map_structure(lambda _: dtypes.int32, classes)
+      actual = sparse.deserialize_sparse_tensors(
+          sparse.serialize_many_sparse_tensors(expected), types, shapes,
+          sparse.get_classes(expected))
+      nest.assert_same_structure(expected, actual)
+      for a, e in zip(nest.flatten(actual), nest.flatten(expected)):
+        self.assertSparseValuesEqual(a, e)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index 68b97ddbe3048b7aef18fcf8cc2b41ee545ee55f..f0e90f67772d114142ccc218ed9f42b723a1b556 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -31,6 +31,7 @@ py_library(
         ":debug_graphs",
         ":debug_utils",
         ":grpc_debug_server",
+        ":grpc_debug_test_server",
         ":hooks",
         ":local_cli_wrapper",
         "//tensorflow/python:util",
@@ -41,15 +42,23 @@ py_library(
 py_library(
     name = "debug_pip",
     deps = [
+        ":cli_test_utils",
         ":debug_py",
         ":grpc_debug_test_server",
         ":offline_analyzer",
         ":session_debug_testlib",
+        ":source_remote",
     ] + if_not_windows([
         ":debug_examples",
     ]),
 )
 
+py_library(
+    name = "common",
+    srcs = ["lib/common.py"],
+    srcs_version = "PY2AND3",
+)
+
 py_library(
     name = "debug_graphs",
     srcs = ["lib/debug_graphs.py"],
@@ -110,6 +119,18 @@ py_library(
     ],
 )
 
+py_library(
+    name = "source_remote",
+    srcs = ["lib/source_remote.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":common",
+        ":debug_service_pb2_grpc",
+        "//tensorflow/core/debug:debug_service_proto_py",
+        "//tensorflow/python/profiler:tfprof_logger",
+    ],
+)
+
 py_library(
     name = "stepper",
     srcs = ["lib/stepper.py"],
@@ -180,9 +201,11 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":command_parser",
+        ":common",
         ":debugger_cli_common",
         ":tensor_format",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
         "@six_archive//:six",
@@ -321,7 +344,11 @@ py_library(
     name = "grpc_wrapper",
     srcs = ["wrappers/grpc_wrapper.py"],
     srcs_version = "PY2AND3",
-    deps = [":framework"],
+    deps = [
+        ":common",
+        ":framework",
+        ":source_remote",
+    ],
 )
 
 py_library(
@@ -332,6 +359,7 @@ py_library(
         ":analyzer_cli",
         ":cli_shared",
         ":command_parser",
+        ":common",
         ":debug_data",
         ":debugger_cli_common",
         ":framework",
@@ -426,6 +454,20 @@ py_binary(
     ],
 )
 
+py_test(
+    name = "common_test",
+    size = "small",
+    srcs = ["lib/common_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":common",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 py_test(
     name = "debug_graphs_test",
     size = "small",
@@ -515,6 +557,32 @@ py_test(
     ],
 )
 
+py_test(
+    name = "source_remote_test",
+    size = "small",
+    srcs = ["lib/source_remote_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_windows",
+        "nomac",
+        "oss_serial",
+    ],
+    deps = [
+        ":grpc_debug_test_server",
+        ":source_remote",
+        ":source_utils",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variables",
+    ],
+)
+
 cuda_py_test(
     name = "stepper_test",
     size = "small",
@@ -574,8 +642,11 @@ py_test(
     size = "small",
     srcs = ["cli/curses_ui_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
+    tags = [
+        "no_windows",
+    ],
     deps = [
+        ":cli_test_utils",
         ":curses_ui",
         ":debugger_cli_common",
         ":tensor_format",
@@ -765,6 +836,7 @@ py_test(
     srcs = ["cli/tensor_format_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":cli_test_utils",
         ":debug_data",
         ":tensor_format",
         "//tensorflow/core:protos_all_py",
@@ -808,6 +880,12 @@ py_test(
     ],
 )
 
+py_library(
+    name = "cli_test_utils",
+    srcs = ["cli/cli_test_utils.py"],
+    srcs_version = "PY2AND3",
+)
+
 cuda_py_test(
     name = "analyzer_cli_test",
     size = "small",
@@ -815,6 +893,7 @@ cuda_py_test(
     additional_deps = [
         ":analyzer_cli",
         ":cli_config",
+        ":cli_test_utils",
         ":command_parser",
         ":debug_data",
         ":debug_utils",
@@ -924,6 +1003,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     data = ["//tensorflow/tools/dist_test/server:grpc_tensorflow_server"],
+    grpc_enabled = True,
     tags = [
         "no_oss",  # Incompatible with bazel_pip.
         "no_windows",
diff --git a/tensorflow/python/debug/README.md b/tensorflow/python/debug/README.md
index b26411cd1538250b61364b6c7257fd03d5b6278b..a2273b050bb1ecd5a35938c3de57fb8562f1d26d 100644
--- a/tensorflow/python/debug/README.md
+++ b/tensorflow/python/debug/README.md
@@ -28,7 +28,7 @@ models:
 
 * Easy access through session wrappers
 * Easy integration with common high-level APIs, such as
-  [tf-learn](https://www.tensorflow.org/get_started/tflearn) and
+  [TensorFlow Estimators](https://www.tensorflow.org/programmers_guide/estimators) and
   [Keras](https://keras.io/)
 * Inspection of runtime tensor values and node connections
 * Conditional breaking after runs that generate tensors satisfying given
diff --git a/tensorflow/python/debug/__init__.py b/tensorflow/python/debug/__init__.py
index 821350ee907c46aaa52b5f47ca763f34458eeb3e..34da44b60df9dbda836d6c91089c5ee90f11c584 100644
--- a/tensorflow/python/debug/__init__.py
+++ b/tensorflow/python/debug/__init__.py
@@ -30,6 +30,8 @@ See the @{$python/tfdbg} guide.
 @@GrpcDebugWrapperSession
 @@LocalCLIDebugHook
 @@LocalCLIDebugWrapperSession
+@@TensorBoardDebugHook
+@@TensorBoardDebugWrapperSession
 @@WatchOptions
 
 @@reconstruct_non_debug_graph_def
@@ -60,9 +62,11 @@ from tensorflow.python.debug.lib.debug_utils import watch_graph_with_blacklists
 from tensorflow.python.debug.wrappers.dumping_wrapper import DumpingDebugWrapperSession
 from tensorflow.python.debug.wrappers.framework import WatchOptions
 from tensorflow.python.debug.wrappers.grpc_wrapper import GrpcDebugWrapperSession
+from tensorflow.python.debug.wrappers.grpc_wrapper import TensorBoardDebugWrapperSession
 from tensorflow.python.debug.wrappers.hooks import DumpingDebugHook
 from tensorflow.python.debug.wrappers.hooks import GrpcDebugHook
 from tensorflow.python.debug.wrappers.hooks import LocalCLIDebugHook
+from tensorflow.python.debug.wrappers.hooks import TensorBoardDebugHook
 from tensorflow.python.debug.wrappers.local_cli_wrapper import LocalCLIDebugWrapperSession
 
 from tensorflow.python.util import all_util as _all_util
diff --git a/tensorflow/python/debug/cli/analyzer_cli.py b/tensorflow/python/debug/cli/analyzer_cli.py
index afa3363d99c1e7341f3901dfc8e79d07fb675cfd..156afdfd4c44f2f1a07ffdd1e68ad48bbbe31cba 100644
--- a/tensorflow/python/debug/cli/analyzer_cli.py
+++ b/tensorflow/python/debug/cli/analyzer_cli.py
@@ -402,6 +402,12 @@ class DebugAnalyzer(object):
         action="store_true",
         help="Print the tensor in its entirety, i.e., do not use ellipses "
         "(may be slow for large results).")
+    ap.add_argument(
+        "-w",
+        "--write_path",
+        default="",
+        help="Path of the numpy file to write the evaluation result to, "
+        "using numpy.save()")
     self._arg_parsers["eval"] = ap
 
   def add_tensor_filter(self, filter_name, filter_callable):
@@ -972,7 +978,8 @@ class DebugAnalyzer(object):
             print_all=parsed.print_all,
             tensor_slicing=tensor_slicing,
             highlight_options=highlight_options,
-            include_numeric_summary=parsed.numeric_summary)
+            include_numeric_summary=parsed.numeric_summary,
+            write_path=parsed.write_path)
       else:
         output = cli_shared.error(
             "Invalid number (%d) for tensor %s, which generated one dump." %
@@ -1018,7 +1025,8 @@ class DebugAnalyzer(object):
             np_printoptions,
             print_all=parsed.print_all,
             tensor_slicing=tensor_slicing,
-            highlight_options=highlight_options)
+            highlight_options=highlight_options,
+            write_path=parsed.write_path)
       _add_main_menu(output, node_name=node_name, enable_print_tensor=False)
 
     return output
@@ -1071,7 +1079,8 @@ class DebugAnalyzer(object):
         "from eval of expression '%s'" % parsed.expression,
         np_printoptions,
         print_all=parsed.print_all,
-        include_numeric_summary=True)
+        include_numeric_summary=True,
+        write_path=parsed.write_path)
 
   def _reconstruct_print_source_command(self,
                                         parsed,
diff --git a/tensorflow/python/debug/cli/analyzer_cli_test.py b/tensorflow/python/debug/cli/analyzer_cli_test.py
index 847f9ec401499abb8ec4f310fa4d5118b2afca7b..6b110fda9eba301f298e84b63d091bb300549bee 100644
--- a/tensorflow/python/debug/cli/analyzer_cli_test.py
+++ b/tensorflow/python/debug/cli/analyzer_cli_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.client import session
 from tensorflow.python.debug.cli import analyzer_cli
 from tensorflow.python.debug.cli import cli_config
 from tensorflow.python.debug.cli import cli_shared
+from tensorflow.python.debug.cli import cli_test_utils
 from tensorflow.python.debug.cli import command_parser
 from tensorflow.python.debug.cli import debugger_cli_common
 from tensorflow.python.debug.lib import debug_data
@@ -55,7 +56,8 @@ def no_rewrite_session_config():
   rewriter_config = rewriter_config_pb2.RewriterConfig(
       disable_model_pruning=True,
       constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
-      arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF)
+      arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
+      dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF)
 
   graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
   return config_pb2.ConfigProto(graph_options=graph_options)
@@ -1017,6 +1019,24 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         list_inputs_node_name=node_name,
         list_outputs_node_name=node_name)
 
+  def testPrintTensorAndWriteToNpyFile(self):
+    node_name = "simple_mul_add/matmul"
+    tensor_name = node_name + ":0"
+    npy_path = os.path.join(self._dump_root, "matmul.npy")
+    out = self._registry.dispatch_command(
+        "print_tensor", [tensor_name, "-w", npy_path],
+        screen_info={"cols": 80})
+
+    self.assertEqual([
+        "Tensor \"%s:DebugIdentity\":" % tensor_name,
+        "  dtype: float64",
+        "  shape: (2, 1)",
+        "",
+    ], out.lines[:4])
+    self.assertTrue(out.lines[4].startswith("Saved value to: %s (" % npy_path))
+    # Load the numpy file and verify its contents.
+    self.assertAllClose([[7.0], [-2.0]], np.load(npy_path))
+
   def testPrintTensorHighlightingRanges(self):
     node_name = "simple_mul_add/matmul"
     tensor_name = node_name + ":0"
@@ -1207,21 +1227,44 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         "eval", ["np.matmul(`%s`, `%s`.T)" % (tensor_name, tensor_name)],
         screen_info={"cols": 80})
 
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self,
+        ["Tensor \"from eval of expression "
+         "'np.matmul(`simple_mul_add/matmul:0`, "
+         "`simple_mul_add/matmul:0`.T)'\":",
+         "  dtype: float64",
+         "  shape: (2, 2)",
+         "",
+         "Numeric summary:",
+         "| - + | total |",
+         "| 2 2 |     4 |",
+         "|           min           max          mean           std |"],
+        out.lines[:8])
+    cli_test_utils.assert_array_lines_close(
+        self, [-14.0, 49.0, 6.25, 25.7524270701], out.lines[8:9])
+    cli_test_utils.assert_array_lines_close(
+        self, [[49.0, -14.0], [-14.0, 4.0]], out.lines[10:])
+
+  def testEvalExpressionAndWriteToNpyFile(self):
+    node_name = "simple_mul_add/matmul"
+    tensor_name = node_name + ":0"
+    npy_path = os.path.join(self._dump_root, "matmul_eval.npy")
+    out = self._registry.dispatch_command(
+        "eval",
+        ["np.matmul(`%s`, `%s`.T)" % (tensor_name, tensor_name), "-w",
+         npy_path], screen_info={"cols": 80})
+
     self.assertEqual([
         "Tensor \"from eval of expression "
         "'np.matmul(`simple_mul_add/matmul:0`, "
         "`simple_mul_add/matmul:0`.T)'\":",
         "  dtype: float64",
         "  shape: (2, 2)",
-        "",
-        "Numeric summary:",
-        "| - + | total |",
-        "| 2 2 |     4 |",
-        "|           min           max          mean           std |",
-        "|         -14.0          49.0          6.25 25.7524270701 |",
-        "",
-        "array([[ 49., -14.],",
-        "       [-14.,   4.]])"], out.lines)
+        ""], out.lines[:4])
+
+    self.assertTrue(out.lines[4].startswith("Saved value to: %s (" % npy_path))
+    # Load the numpy file and verify its contents.
+    self.assertAllClose([[49.0, -14.0], [-14.0, 4.0]], np.load(npy_path))
 
   def testAddGetTensorFilterLambda(self):
     analyzer = analyzer_cli.DebugAnalyzer(self._debug_dump,
diff --git a/tensorflow/python/debug/cli/cli_shared.py b/tensorflow/python/debug/cli/cli_shared.py
index df972eacf7332ed4b9a7ccf513da1db91e71fb4c..dea019fef58015fbd7982a81319dcabe4e5f4930 100644
--- a/tensorflow/python/debug/cli/cli_shared.py
+++ b/tensorflow/python/debug/cli/cli_shared.py
@@ -25,8 +25,10 @@ import six
 from tensorflow.python.debug.cli import command_parser
 from tensorflow.python.debug.cli import debugger_cli_common
 from tensorflow.python.debug.cli import tensor_format
+from tensorflow.python.debug.lib import common
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
 
 RL = debugger_cli_common.RichLine
 
@@ -151,7 +153,8 @@ def format_tensor(tensor,
                   print_all=False,
                   tensor_slicing=None,
                   highlight_options=None,
-                  include_numeric_summary=False):
+                  include_numeric_summary=False,
+                  write_path=None):
   """Generate formatted str to represent a tensor or its slices.
 
   Args:
@@ -171,6 +174,8 @@ def format_tensor(tensor,
       for more details.
     include_numeric_summary: Whether a text summary of the numeric values (if
       applicable) will be included.
+    write_path: A path to save the tensor value (after any slicing) to
+      (optional). `numpy.save()` is used to save the value.
 
   Returns:
     An instance of `debugger_cli_common.RichTextLines` representing the
@@ -185,6 +190,16 @@ def format_tensor(tensor,
     value = tensor
     sliced_name = tensor_name
 
+  auxiliary_message = None
+  if write_path:
+    with gfile.Open(write_path, "wb") as output_file:
+      np.save(output_file, value)
+    line = debugger_cli_common.RichLine("Saved value to: ")
+    line += debugger_cli_common.RichLine(write_path, font_attr="bold")
+    line += " (%sB)" % bytes_to_readable_str(gfile.Stat(write_path).length)
+    auxiliary_message = debugger_cli_common.rich_text_lines_from_rich_line_list(
+        [line, debugger_cli_common.RichLine("")])
+
   if print_all:
     np_printoptions["threshold"] = value.size
   else:
@@ -195,6 +210,7 @@ def format_tensor(tensor,
       sliced_name,
       include_metadata=True,
       include_numeric_summary=include_numeric_summary,
+      auxiliary_message=auxiliary_message,
       np_printoptions=np_printoptions,
       highlight_options=highlight_options)
 
@@ -214,51 +230,6 @@ def error(msg):
       RL("ERROR: " + msg, COLOR_RED)])
 
 
-def get_graph_element_name(elem):
-  """Obtain the name or string representation of a graph element.
-
-  If the graph element has the attribute "name", return name. Otherwise, return
-  a __str__ representation of the graph element. Certain graph elements, such as
-  `SparseTensor`s, do not have the attribute "name".
-
-  Args:
-    elem: The graph element in question.
-
-  Returns:
-    If the attribute 'name' is available, return the name. Otherwise, return
-    str(fetch).
-  """
-
-  return elem.name if hasattr(elem, "name") else str(elem)
-
-
-def _get_fetch_names(fetches):
-  """Get a flattened list of the names in run() call fetches.
-
-  Args:
-    fetches: Fetches of the `Session.run()` call. It maybe a Tensor, an
-      Operation or a Variable. It may also be nested lists, tuples or
-      dicts. See doc of `Session.run()` for more details.
-
-  Returns:
-    (list of str) A flattened list of fetch names from `fetches`.
-  """
-
-  lines = []
-  if isinstance(fetches, (list, tuple)):
-    for fetch in fetches:
-      lines.extend(_get_fetch_names(fetch))
-  elif isinstance(fetches, dict):
-    for key in fetches:
-      lines.extend(_get_fetch_names(fetches[key]))
-  else:
-    # This ought to be a Tensor, an Operation or a Variable, for which the name
-    # attribute should be available. (Bottom-out condition of the recursion.)
-    lines.append(get_graph_element_name(fetches))
-
-  return lines
-
-
 def _recommend_command(command, description, indent=2, create_link=False):
   """Generate a RichTextLines object that describes a recommended command.
 
@@ -327,14 +298,14 @@ def get_run_start_intro(run_call_count,
     (RichTextLines) Formatted intro message about the `Session.run()` call.
   """
 
-  fetch_lines = _get_fetch_names(fetches)
+  fetch_lines = common.get_flattened_names(fetches)
 
   if not feed_dict:
     feed_dict_lines = [debugger_cli_common.RichLine("  (Empty)")]
   else:
     feed_dict_lines = []
     for feed_key in feed_dict:
-      feed_key_name = get_graph_element_name(feed_key)
+      feed_key_name = common.get_graph_element_name(feed_key)
       feed_dict_line = debugger_cli_common.RichLine("  ")
       feed_dict_line += debugger_cli_common.RichLine(
           feed_key_name,
@@ -446,10 +417,10 @@ def get_run_short_description(run_call_count,
   description = "run #%d: " % run_call_count
 
   if isinstance(fetches, (ops.Tensor, ops.Operation, variables.Variable)):
-    description += "1 fetch (%s); " % get_graph_element_name(fetches)
+    description += "1 fetch (%s); " % common.get_graph_element_name(fetches)
   else:
     # Could be (nested) list, tuple, dict or namedtuple.
-    num_fetches = len(_get_fetch_names(fetches))
+    num_fetches = len(common.get_flattened_names(fetches))
     if num_fetches > 1:
       description += "%d fetches; " % num_fetches
     else:
diff --git a/tensorflow/python/debug/cli/cli_test_utils.py b/tensorflow/python/debug/cli/cli_test_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a963d8da53ff0cf543ac33b389633cb2b1916b2
--- /dev/null
+++ b/tensorflow/python/debug/cli/cli_test_utils.py
@@ -0,0 +1,65 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Testing utilities for tfdbg command-line interface."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+
+import numpy as np
+
+
+def assert_lines_equal_ignoring_whitespace(test, expected_lines, actual_lines):
+  """Assert equality in lines, ignoring all whitespace.
+
+  Args:
+    test: An instance of unittest.TestCase or its subtypes (e.g.,
+      TensorFlowTestCase).
+    expected_lines: Expected lines as an iterable of strings.
+    actual_lines: Actual lines as an iterable of strings.
+  """
+  test.assertEqual(
+      len(expected_lines), len(actual_lines),
+      "Mismatch in the number of lines: %d vs %d" % (
+          len(expected_lines), len(actual_lines)))
+  for expected_line, actual_line in zip(expected_lines, actual_lines):
+    test.assertEqual("".join(expected_line.split()),
+                     "".join(actual_line.split()))
+
+
+# Regular expression for separators between values in a string representation
+# of an ndarray, exclusing whitespace.
+_ARRAY_VALUE_SEPARATOR_REGEX = re.compile(r"(array|\(|\[|\]|\)|\||,)")
+
+
+def assert_array_lines_close(test, expected_array, array_lines):
+  """Assert that the array value represented by lines is close to expected.
+
+  Note that the shape of the array represented by the `array_lines` is ignored.
+
+  Args:
+    test: An instance of TensorFlowTestCase.
+    expected_array: Expected value of the array.
+    array_lines: A list of strings representing the array.
+      E.g., "array([[ 1.0, 2.0 ], [ 3.0, 4.0 ]])"
+      Assumes that values are separated by commas, parentheses, brackets, "|"
+      characters and whitespace.
+  """
+  elements = []
+  for line in array_lines:
+    line = re.sub(_ARRAY_VALUE_SEPARATOR_REGEX, " ", line)
+    elements.extend(float(s) for s in line.split())
+  test.assertAllClose(np.array(expected_array).flatten(), elements)
diff --git a/tensorflow/python/debug/cli/command_parser.py b/tensorflow/python/debug/cli/command_parser.py
index aabb3ed7b6008cf2a785d91b937620680d2942cc..e6e44b796dfff997b739d98ab4239cbf0c790404 100644
--- a/tensorflow/python/debug/cli/command_parser.py
+++ b/tensorflow/python/debug/cli/command_parser.py
@@ -540,4 +540,11 @@ def get_print_tensor_argparser(description):
       action="store_true",
       help="Include summary for non-empty tensors of numeric (int*, float*, "
       "complex*) and Boolean types.")
+  ap.add_argument(
+      "-w",
+      "--write_path",
+      type=str,
+      default="",
+      help="Path of the numpy file to write the tensor data to, using "
+      "numpy.save().")
   return ap
diff --git a/tensorflow/python/debug/cli/curses_ui_test.py b/tensorflow/python/debug/cli/curses_ui_test.py
index 4ca11e7e4104009e01d7db7f6b25912ad94c2118..02511cbe6a586879e259f59236f0d75c8d55bfba 100644
--- a/tensorflow/python/debug/cli/curses_ui_test.py
+++ b/tensorflow/python/debug/cli/curses_ui_test.py
@@ -25,6 +25,7 @@ import threading
 import numpy as np
 from six.moves import queue
 
+from tensorflow.python.debug.cli import cli_test_utils
 from tensorflow.python.debug.cli import curses_ui
 from tensorflow.python.debug.cli import debugger_cli_common
 from tensorflow.python.debug.cli import tensor_format
@@ -1056,13 +1057,10 @@ class CursesTest(test_util.TensorFlowTestCase):
     self.assertEqual(11, len(ui.scroll_messages))
 
     for i in range(11):
-      self.assertEqual([
-          "Tensor \"m\":", "", "array([[ 1.,  1.,  1.,  1.,  1.],",
-          "       [ 1.,  1.,  1.,  1.,  1.],",
-          "       [ 1.,  1.,  1.,  1.,  1.],",
-          "       [ 1.,  1.,  1.,  1.,  1.],",
-          "       [ 1.,  1.,  1.,  1.,  1.]])"
-      ], ui.unwrapped_outputs[i].lines)
+      cli_test_utils.assert_lines_equal_ignoring_whitespace(
+          self, ["Tensor \"m\":", ""], ui.unwrapped_outputs[i].lines[:2])
+      self.assertEqual(
+          repr(np.ones([5, 5])).split("\n"), ui.unwrapped_outputs[i].lines[2:])
 
     self.assertEqual({
         0: None,
@@ -1165,13 +1163,10 @@ class CursesTest(test_util.TensorFlowTestCase):
     self.assertEqual(4, len(ui.output_array_pointer_indices))
 
     for i in range(4):
-      self.assertEqual([
-          "Tensor \"m\":", "", "array([[ 1.,  1.,  1.,  1.,  1.],",
-          "       [ 1.,  1.,  1.,  1.,  1.],",
-          "       [ 1.,  1.,  1.,  1.,  1.],",
-          "       [ 1.,  1.,  1.,  1.,  1.],",
-          "       [ 1.,  1.,  1.,  1.,  1.]])"
-      ], ui.unwrapped_outputs[i].lines)
+      cli_test_utils.assert_lines_equal_ignoring_whitespace(
+          self, ["Tensor \"m\":", ""], ui.unwrapped_outputs[i].lines[:2])
+      self.assertEqual(
+          repr(np.ones([5, 5])).split("\n"), ui.unwrapped_outputs[i].lines[2:])
 
     self.assertEqual({
         0: None,
diff --git a/tensorflow/python/debug/cli/tensor_format.py b/tensorflow/python/debug/cli/tensor_format.py
index 05ccf93f15385566719abf8064296c2e1cfec027..9ba84e3f2261de277361d503e9189583494a5084 100644
--- a/tensorflow/python/debug/cli/tensor_format.py
+++ b/tensorflow/python/debug/cli/tensor_format.py
@@ -72,6 +72,7 @@ class HighlightOptions(object):
 def format_tensor(tensor,
                   tensor_label,
                   include_metadata=False,
+                  auxiliary_message=None,
                   include_numeric_summary=False,
                   np_printoptions=None,
                   highlight_options=None):
@@ -84,6 +85,8 @@ def format_tensor(tensor,
       suppress the tensor name line in the return value.
     include_metadata: Whether metadata such as dtype and shape are to be
       included in the formatted text.
+    auxiliary_message: An auxiliary message to display under the tensor label,
+      dtype and shape information lines.
     include_numeric_summary: Whether a text summary of the numeric values (if
       applicable) will be included.
     np_printoptions: A dictionary of keyword arguments that are passed to a
@@ -131,13 +134,16 @@ def format_tensor(tensor,
 
   if include_metadata:
     lines.append("  dtype: %s" % str(tensor.dtype))
-    lines.append("  shape: %s" % str(tensor.shape))
+    lines.append("  shape: %s" % str(tensor.shape).replace("L", ""))
 
   if lines:
     lines.append("")
   formatted = debugger_cli_common.RichTextLines(
       lines, font_attr_segs=font_attr_segs)
 
+  if auxiliary_message:
+    formatted.extend(auxiliary_message)
+
   if include_numeric_summary:
     formatted.append("Numeric summary:")
     formatted.extend(numeric_summary(tensor))
@@ -529,7 +535,7 @@ def numeric_summary(tensor):
   if not isinstance(tensor, np.ndarray) or not np.size(tensor):
     return debugger_cli_common.RichTextLines([
         "No numeric summary available due to empty tensor."])
-  elif (np.issubdtype(tensor.dtype, np.float) or
+  elif (np.issubdtype(tensor.dtype, np.floating) or
         np.issubdtype(tensor.dtype, np.complex) or
         np.issubdtype(tensor.dtype, np.integer)):
     counts = [
diff --git a/tensorflow/python/debug/cli/tensor_format_test.py b/tensorflow/python/debug/cli/tensor_format_test.py
index d3beb5f7bc8538a36437d1a322904cd141210985..18ddbb6437cf463afa2cc4e8f02cb592e016049a 100644
--- a/tensorflow/python/debug/cli/tensor_format_test.py
+++ b/tensorflow/python/debug/cli/tensor_format_test.py
@@ -17,12 +17,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import re
+
 import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.core.framework import tensor_pb2
 from tensorflow.core.framework import tensor_shape_pb2
 from tensorflow.core.framework import types_pb2
+from tensorflow.python.debug.cli import cli_test_utils
 from tensorflow.python.debug.cli import tensor_format
 from tensorflow.python.debug.lib import debug_data
 from tensorflow.python.framework import test_util
@@ -40,21 +42,109 @@ class RichTextLinesTest(test_util.TensorFlowTestCase):
         {"dtype": tensor.dtype, "shape": tensor.shape},
         annotations["tensor_metadata"])
 
-  def _checkBeginIndices(self, expected_indices, annot):
-    self.assertEqual({tensor_format.BEGIN_INDICES_KEY: expected_indices},
-                     annot)
-
-  def _checkOmittedIndices(self, expected_indices, annot):
-    self.assertEqual({tensor_format.OMITTED_INDICES_KEY: expected_indices},
-                     annot)
+  # Regular expression for text representation of float numbers, possibly in
+  # engineering notation.
+  _ELEMENT_REGEX = re.compile(
+      r"([+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?|nan|inf|-inf)")
+
+  def _checkBeginIndicesAnnotations(self, out, a):
+    """Check the beginning-index annotations of an ndarray representation.
+
+    Args:
+      out: An instance of RichTextLines representing a numpy.ndarray.
+      a: The numpy.ndarray being represented.
+
+    Raises:
+      ValueError: if any ellipses ("...") are found in the lines representing
+        the array.
+    """
+    begin_line_num = 0
+    while not out.lines[begin_line_num].startswith("array"):
+      begin_line_num += 1
+    element_index = 0
+    for line_num in range(begin_line_num, len(out.lines)):
+      line = out.lines[line_num]
+      if "..." in line:
+        raise ValueError("Unexpected found ellipses in line representing array")
+      matches = re.finditer(self._ELEMENT_REGEX, line)
+      for line_item_index, _ in enumerate(matches):
+        subscripts = list(np.unravel_index(element_index, a.shape))
+        if line_item_index == 0:
+          self.assertEqual({tensor_format.BEGIN_INDICES_KEY: subscripts},
+                           out.annotations[line_num])
+        element_index += 1
+    self.assertEqual(element_index, np.size(a))
+
+  def _checkTensorElementLocations(self, out, a):
+    """Check the results of locate_tensor_element on an ndarray representation.
+
+    that represents a numpy.ndaray.
+
+    Args:
+      out: An instance of RichTextLines representing a numpy.ndarray.
+      a: The numpy.ndarray being represented.
+
+    Raises:
+      ValueError: if any ellipses ("...") are found in the lines representing
+        the array.
+    """
+    # First, locate the beginning of the tensor value section.
+    begin_line_num = 0
+    while not out.lines[begin_line_num].startswith("array"):
+      begin_line_num += 1
+    # Second, find all matches to tensor-value regex.
+    element_index = 0
+    for line_num in range(begin_line_num, len(out.lines)):
+      line = out.lines[line_num]
+      if "..." in line:
+        raise ValueError("Unexpected found ellipses in line representing array")
+      matches = re.finditer(self._ELEMENT_REGEX, line)
+      for match in matches:
+        subscripts = list(np.unravel_index(element_index, a.shape))
+        is_omitted, row, start_col, end_col = (
+            tensor_format.locate_tensor_element(out, subscripts))
+        self.assertFalse(is_omitted)
+        self.assertEqual(line_num, row)
+        self.assertEqual(match.start(), start_col)
+        self.assertEqual(match.end(), end_col)
+        element_index += 1
+    self.assertEqual(element_index, np.size(a))
+
+  def _findFirst(self, lines, string):
+    """Find first occurrence of a string in a list of strings."""
+    for i, line in enumerate(lines):
+      find_index = line.find(string)
+      if find_index >= 0:
+        return i, find_index
+
+  def _extractBoldNumbers(self, out, start_line):
+    """Extract all numbers that have the bold font attribute.
+
+    Args:
+      out: An instance of RichTextLines.
+      start_line: 0-based index to start from.
+
+    Returns:
+      A list of floats.
+    """
+    floats = []
+    for i in range(start_line, len(out.lines)):
+      if i not in out.font_attr_segs:
+        continue
+      line_attrs = out.font_attr_segs[i]
+      for begin, end, attr_value in line_attrs:
+        if attr_value == "bold":
+          floats.append(float(out.lines[i][begin:end]))
+    return floats
 
   def testFormatZeroDimensionTensor(self):
-    a = np.array(42.0, dtype=np.float32)
+    a = np.array(42, dtype=np.int32)
 
     out = tensor_format.format_tensor(a, "a")
 
-    self.assertEqual(["Tensor \"a\":", "", "array(42.0, dtype=float32)"],
-                     out.lines)
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self, ["Tensor \"a\":", ""], out.lines[:2])
+    self.assertTrue(out.lines[2].startswith("array(42"))
     self._checkTensorMetadata(a, out.annotations)
 
   def testFormatTensorHighlightsTensorNameWithoutDebugOp(self):
@@ -81,82 +171,51 @@ class RichTextLinesTest(test_util.TensorFlowTestCase):
     out = tensor_format.format_tensor(
         a, "a", np_printoptions={"linewidth": 40})
 
-    self.assertEqual([
-        "Tensor \"a\":",
-        "",
-        "array([ 0.,  0.,  0.,  0.,  0.,  0.,",
-        "        0.,  0.,  0.,  0.,  0.,  0.,",
-        "        0.,  0.,  0.,  0.,  0.,  0.,",
-        "        0.,  0.])",
-    ], out.lines)
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self, ["Tensor \"a\":", ""], out.lines[:2])
+    self.assertEqual(repr(a).split("\n"), out.lines[2:])
 
     self._checkTensorMetadata(a, out.annotations)
 
     # Check annotations for beginning indices of the lines.
-    self._checkBeginIndices([0], out.annotations[2])
-    self._checkBeginIndices([6], out.annotations[3])
-    self._checkBeginIndices([12], out.annotations[4])
-    self._checkBeginIndices([18], out.annotations[5])
+    self._checkBeginIndicesAnnotations(out, a)
 
   def testFormatTensor2DNoEllipsisNoRowBreak(self):
     a = np.linspace(0.0, 1.0 - 1.0 / 16.0, 16).reshape([4, 4])
 
     out = tensor_format.format_tensor(a, "a")
 
-    self.assertEqual([
-        "Tensor \"a\":",
-        "",
-        "array([[ 0.    ,  0.0625,  0.125 ,  0.1875],",
-        "       [ 0.25  ,  0.3125,  0.375 ,  0.4375],",
-        "       [ 0.5   ,  0.5625,  0.625 ,  0.6875],",
-        "       [ 0.75  ,  0.8125,  0.875 ,  0.9375]])",
-    ], out.lines)
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self, ["Tensor \"a\":", ""], out.lines[:2])
+    self.assertEqual(repr(a).split("\n"), out.lines[2:])
 
     self._checkTensorMetadata(a, out.annotations)
-
-    # Check annotations for the beginning indices of the lines.
-    for i in xrange(2, 6):
-      self._checkBeginIndices([i  - 2, 0], out.annotations[i])
+    self._checkBeginIndicesAnnotations(out, a)
 
   def testFormatTensorSuppressingTensorName(self):
     a = np.linspace(0.0, 1.0 - 1.0 / 16.0, 16).reshape([4, 4])
 
     out = tensor_format.format_tensor(a, None)
-
-    self.assertEqual([
-        "array([[ 0.    ,  0.0625,  0.125 ,  0.1875],",
-        "       [ 0.25  ,  0.3125,  0.375 ,  0.4375],",
-        "       [ 0.5   ,  0.5625,  0.625 ,  0.6875],",
-        "       [ 0.75  ,  0.8125,  0.875 ,  0.9375]])",
-    ], out.lines)
+    self.assertEqual(repr(a).split("\n"), out.lines)
 
     self._checkTensorMetadata(a, out.annotations)
-
-    # Check annotations for the beginning indices of the lines.
-    for i in xrange(4):
-      self._checkBeginIndices([i, 0], out.annotations[i])
+    self._checkBeginIndicesAnnotations(out, a)
 
   def testFormatTensorWithMetadata(self):
     a = np.linspace(0.0, 1.0 - 1.0 / 16.0, 16).reshape([4, 4])
 
     out = tensor_format.format_tensor(a, "a", include_metadata=True)
 
-    self.assertEqual([
-        "Tensor \"a\":",
-        "  dtype: float64",
-        "  shape: (4, 4)",
-        "",
-        "array([[ 0.    ,  0.0625,  0.125 ,  0.1875],",
-        "       [ 0.25  ,  0.3125,  0.375 ,  0.4375],",
-        "       [ 0.5   ,  0.5625,  0.625 ,  0.6875],",
-        "       [ 0.75  ,  0.8125,  0.875 ,  0.9375]])",
-    ], out.lines)
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self,
+        ["Tensor \"a\":",
+         "  dtype: float64",
+         "  shape: (4, 4)",
+         ""], out.lines[:4])
+    self.assertEqual(repr(a).split("\n"), out.lines[4:])
 
     self._checkTensorMetadata(a, out.annotations)
-
-    # Check annotations for the beginning indices of the lines.
-    for i in xrange(4, 7):
-      self._checkBeginIndices([i  - 4, 0], out.annotations[i])
+    self._checkBeginIndicesAnnotations(out, a)
 
   def testFormatTensor2DNoEllipsisWithRowBreak(self):
     a = np.linspace(0.0, 1.0 - 1.0 / 40.0, 40).reshape([2, 20])
@@ -168,58 +227,26 @@ class RichTextLinesTest(test_util.TensorFlowTestCase):
         {"dtype": a.dtype, "shape": a.shape},
         out.annotations["tensor_metadata"])
 
-    self.assertEqual([
-        "Tensor \"a\":",
-        "",
-        "array([[ 0.   ,  0.025,  0.05 ,  0.075,  0.1  ,",
-        "         0.125,  0.15 ,  0.175,  0.2  ,  0.225,",
-        "         0.25 ,  0.275,  0.3  ,  0.325,  0.35 ,",
-        "         0.375,  0.4  ,  0.425,  0.45 ,  0.475],",
-        "       [ 0.5  ,  0.525,  0.55 ,  0.575,  0.6  ,",
-        "         0.625,  0.65 ,  0.675,  0.7  ,  0.725,",
-        "         0.75 ,  0.775,  0.8  ,  0.825,  0.85 ,",
-        "         0.875,  0.9  ,  0.925,  0.95 ,  0.975]])",
-    ], out.lines)
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self, ["Tensor \"a\":", ""], out.lines[:2])
+    self.assertEqual(repr(a).split("\n"), out.lines[2:])
 
     self._checkTensorMetadata(a, out.annotations)
 
     # Check annotations for the beginning indices of the lines.
-    self._checkBeginIndices([0, 0], out.annotations[2])
-    self._checkBeginIndices([0, 5], out.annotations[3])
-    self._checkBeginIndices([0, 10], out.annotations[4])
-    self._checkBeginIndices([0, 15], out.annotations[5])
-    self._checkBeginIndices([1, 0], out.annotations[6])
-    self._checkBeginIndices([1, 5], out.annotations[7])
-    self._checkBeginIndices([1, 10], out.annotations[8])
-    self._checkBeginIndices([1, 15], out.annotations[9])
-
-  def testFormatTensor3DNoEllipsis(self):  # TODO(cais): Test name.
+    self._checkBeginIndicesAnnotations(out, a)
+
+  def testFormatTensor3DNoEllipsis(self):
     a = np.linspace(0.0, 1.0 - 1.0 / 24.0, 24).reshape([2, 3, 4])
 
     out = tensor_format.format_tensor(a, "a")
 
-    self.assertEqual([
-        "Tensor \"a\":",
-        "",
-        "array([[[ 0.        ,  0.04166667,  0.08333333,  0.125     ],",
-        "        [ 0.16666667,  0.20833333,  0.25      ,  0.29166667],",
-        "        [ 0.33333333,  0.375     ,  0.41666667,  0.45833333]],",
-        "",
-        "       [[ 0.5       ,  0.54166667,  0.58333333,  0.625     ],",
-        "        [ 0.66666667,  0.70833333,  0.75      ,  0.79166667],",
-        "        [ 0.83333333,  0.875     ,  0.91666667,  0.95833333]]])",
-    ], out.lines)
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self, ["Tensor \"a\":", ""], out.lines[:2])
+    self.assertEqual(repr(a).split("\n"), out.lines[2:])
 
     self._checkTensorMetadata(a, out.annotations)
-
-    # Check annotations for beginning indices of the lines.
-    self._checkBeginIndices([0, 0, 0], out.annotations[2])
-    self._checkBeginIndices([0, 1, 0], out.annotations[3])
-    self._checkBeginIndices([0, 2, 0], out.annotations[4])
-    self.assertNotIn(5, out.annotations)
-    self._checkBeginIndices([1, 0, 0], out.annotations[6])
-    self._checkBeginIndices([1, 1, 0], out.annotations[7])
-    self._checkBeginIndices([1, 2, 0], out.annotations[8])
+    self._checkBeginIndicesAnnotations(out, a)
 
   def testFormatTensor3DNoEllipsisWithArgwhereHighlightWithMatches(self):
     a = np.linspace(0.0, 1.0 - 1.0 / 24.0, 24).reshape([2, 3, 4])
@@ -235,39 +262,22 @@ class RichTextLinesTest(test_util.TensorFlowTestCase):
     out = tensor_format.format_tensor(
         a, "a", highlight_options=highlight_options)
 
-    self.assertEqual([
-        "Tensor \"a\": "
-        "Highlighted(between 0.26 and 0.5): 5 of 24 element(s) (20.83%)",
-        "",
-        "array([[[ 0.        ,  0.04166667,  0.08333333,  0.125     ],",
-        "        [ 0.16666667,  0.20833333,  0.25      ,  0.29166667],",
-        "        [ 0.33333333,  0.375     ,  0.41666667,  0.45833333]],",
-        "",
-        "       [[ 0.5       ,  0.54166667,  0.58333333,  0.625     ],",
-        "        [ 0.66666667,  0.70833333,  0.75      ,  0.79166667],",
-        "        [ 0.83333333,  0.875     ,  0.91666667,  0.95833333]]])",
-    ], out.lines)
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self,
+        ["Tensor \"a\": "
+         "Highlighted(between 0.26 and 0.5): 5 of 24 element(s) (20.83%)",
+         ""],
+        out.lines[:2])
+    self.assertEqual(repr(a).split("\n"), out.lines[2:])
 
     self._checkTensorMetadata(a, out.annotations)
 
     # Check annotations for beginning indices of the lines.
-    self._checkBeginIndices([0, 0, 0], out.annotations[2])
-    self._checkBeginIndices([0, 1, 0], out.annotations[3])
-    self._checkBeginIndices([0, 2, 0], out.annotations[4])
-    self.assertNotIn(5, out.annotations)
-    self._checkBeginIndices([1, 0, 0], out.annotations[6])
-    self._checkBeginIndices([1, 1, 0], out.annotations[7])
-    self._checkBeginIndices([1, 2, 0], out.annotations[8])
+    self._checkBeginIndicesAnnotations(out, a)
 
-    # Check font attribute segments for highlighted elements.
-    self.assertNotIn(2, out.font_attr_segs)
-    self.assertEqual([(49, 59, "bold")], out.font_attr_segs[3])
-    self.assertEqual([(10, 20, "bold"), (23, 28, "bold"), (36, 46, "bold"),
-                      (49, 59, "bold")], out.font_attr_segs[4])
-    self.assertNotIn(5, out.font_attr_segs)
-    self.assertNotIn(6, out.font_attr_segs)
-    self.assertNotIn(7, out.font_attr_segs)
-    self.assertNotIn(8, out.font_attr_segs)
+    self.assertAllClose(
+        [0.29166667, 0.33333333, 0.375, 0.41666667, 0.45833333],
+        self._extractBoldNumbers(out, 2))
 
   def testFormatTensor3DNoEllipsisWithArgwhereHighlightWithNoMatches(self):
     a = np.linspace(0.0, 1.0 - 1.0 / 24.0, 24).reshape([2, 3, 4])
@@ -279,93 +289,54 @@ class RichTextLinesTest(test_util.TensorFlowTestCase):
     out = tensor_format.format_tensor(
         a, "a", highlight_options=highlight_options)
 
-    self.assertEqual([
-        "Tensor \"a\": Highlighted: 0 of 24 element(s) (0.00%)", "",
-        "array([[[ 0.        ,  0.04166667,  0.08333333,  0.125     ],",
-        "        [ 0.16666667,  0.20833333,  0.25      ,  0.29166667],",
-        "        [ 0.33333333,  0.375     ,  0.41666667,  0.45833333]],", "",
-        "       [[ 0.5       ,  0.54166667,  0.58333333,  0.625     ],",
-        "        [ 0.66666667,  0.70833333,  0.75      ,  0.79166667],",
-        "        [ 0.83333333,  0.875     ,  0.91666667,  0.95833333]]])"
-    ], out.lines)
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self,
+        ["Tensor \"a\": Highlighted: 0 of 24 element(s) (0.00%)", ""],
+        out.lines[:2])
+    self.assertEqual(repr(a).split("\n"), out.lines[2:])
 
     self._checkTensorMetadata(a, out.annotations)
-
-    # Check annotations for beginning indices of the lines.
-    self._checkBeginIndices([0, 0, 0], out.annotations[2])
-    self._checkBeginIndices([0, 1, 0], out.annotations[3])
-    self._checkBeginIndices([0, 2, 0], out.annotations[4])
-    self.assertNotIn(5, out.annotations)
-    self._checkBeginIndices([1, 0, 0], out.annotations[6])
-    self._checkBeginIndices([1, 1, 0], out.annotations[7])
-    self._checkBeginIndices([1, 2, 0], out.annotations[8])
+    self._checkBeginIndicesAnnotations(out, a)
 
     # Check font attribute segments for highlighted elements.
-    self.assertNotIn(2, out.font_attr_segs)
-    self.assertNotIn(3, out.font_attr_segs)
-    self.assertNotIn(4, out.font_attr_segs)
-    self.assertNotIn(5, out.font_attr_segs)
-    self.assertNotIn(6, out.font_attr_segs)
-    self.assertNotIn(7, out.font_attr_segs)
-    self.assertNotIn(8, out.font_attr_segs)
+    for i in range(2, len(out.lines)):
+      self.assertNotIn(i, out.font_attr_segs)
 
   def testFormatTensorWithEllipses(self):
-    a = np.zeros([11, 11, 11])
+    a = (np.arange(11 * 11 * 11) + 1000).reshape([11, 11, 11]).astype(np.int32)
 
     out = tensor_format.format_tensor(
         a, "a", False, np_printoptions={"threshold": 100, "edgeitems": 2})
 
-    self.assertEqual([
-        "Tensor \"a\":",
-        "",
-        "array([[[ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        ..., ",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.]],",
-        "",
-        "       [[ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        ..., ",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.]],",
-        "",
-        "       ..., ",
-        "       [[ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        ..., ",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.]],",
-        "",
-        "       [[ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        ..., ",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.]]])",
-    ], out.lines)
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self, ["Tensor \"a\":", ""], out.lines[:2])
+    self.assertEqual(repr(a).split("\n"), out.lines[2:])
 
     self._checkTensorMetadata(a, out.annotations)
 
     # Check annotations for beginning indices of the lines.
-    for i in xrange(2):
-      self._checkBeginIndices([i, 0, 0], out.annotations[i * 6 + 2])
-      self._checkBeginIndices([i, 1, 0], out.annotations[i * 6 + 3])
-      self._checkOmittedIndices([i, 2, 0], out.annotations[i * 6 + 4])
-      self._checkBeginIndices([i, 9, 0], out.annotations[i * 6 + 5])
-      self._checkBeginIndices([i, 10, 0], out.annotations[i * 6 + 6])
-      self.assertNotIn(i * 6 + 7, out.annotations)
-
-    p = 15
-    for i in xrange(2):
-      self._checkBeginIndices([9 + i, 0, 0], out.annotations[p + i * 6])
-      self._checkBeginIndices([9 + i, 1, 0], out.annotations[p + i * 6 + 1])
-      self._checkOmittedIndices(
-          [9 + i, 2, 0], out.annotations[p + i * 6 + 2])
-      self._checkBeginIndices([9 + i, 9, 0], out.annotations[p + i * 6 + 3])
-      self._checkBeginIndices([9 + i, 10, 0], out.annotations[p + i * 6 + 4])
-
-      if i < 1:
-        self.assertNotIn(p + i * 6 + 5, out.annotations)
+    actual_row_0_0_0, _ = self._findFirst(out.lines, "1000")
+    self.assertEqual({tensor_format.BEGIN_INDICES_KEY: [0, 0, 0]},
+                     out.annotations[actual_row_0_0_0])
+    actual_row_0_1_0, _ = self._findFirst(out.lines, "1011")
+    self.assertEqual({tensor_format.BEGIN_INDICES_KEY: [0, 1, 0]},
+                     out.annotations[actual_row_0_1_0])
+    # Find the first line that is completely omitted.
+    omitted_line = 2
+    while not out.lines[omitted_line].strip().startswith("..."):
+      omitted_line += 1
+    self.assertEqual({tensor_format.OMITTED_INDICES_KEY: [0, 2, 0]},
+                     out.annotations[omitted_line])
+
+    actual_row_10_10_0, _ = self._findFirst(out.lines, "2320")
+    self.assertEqual({tensor_format.BEGIN_INDICES_KEY: [10, 10, 0]},
+                     out.annotations[actual_row_10_10_0])
+    # Find the last line that is completely omitted.
+    omitted_line = len(out.lines) - 1
+    while not out.lines[omitted_line].strip().startswith("..."):
+      omitted_line -= 1
+    self.assertEqual({tensor_format.OMITTED_INDICES_KEY: [10, 2, 0]},
+                     out.annotations[omitted_line])
 
   def testFormatUninitializedTensor(self):
     tensor_proto = tensor_pb2.TensorProto(
@@ -396,63 +367,11 @@ class RichTextLinesTest(test_util.TensorFlowTestCase):
     out = tensor_format.format_tensor(
         a, "a", np_printoptions={"linewidth": 40})
 
-    self.assertEqual([
-        "Tensor \"a\":",
-        "",
-        "array([ 0.,  0.,  0.,  0.,  0.,  0.,",
-        "        0.,  0.,  0.,  0.,  0.,  0.,",
-        "        0.,  0.,  0.,  0.,  0.,  0.,",
-        "        0.,  0.])",
-    ], out.lines)
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self, ["Tensor \"a\":", ""], out.lines[:2])
+    self.assertEqual(repr(a).split("\n"), out.lines[2:])
 
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [0])
-    self.assertFalse(is_omitted)
-    self.assertEqual(2, row)
-    self.assertEqual(8, start_col)
-    self.assertEqual(10, end_col)
-
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [5])
-    self.assertFalse(is_omitted)
-    self.assertEqual(2, row)
-    self.assertEqual(33, start_col)
-    self.assertEqual(35, end_col)
-
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [6])
-    self.assertFalse(is_omitted)
-    self.assertEqual(3, row)
-    self.assertEqual(8, start_col)
-    self.assertEqual(10, end_col)
-
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [11])
-    self.assertFalse(is_omitted)
-    self.assertEqual(3, row)
-    self.assertEqual(33, start_col)
-    self.assertEqual(35, end_col)
-
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [12])
-    self.assertFalse(is_omitted)
-    self.assertEqual(4, row)
-    self.assertEqual(8, start_col)
-    self.assertEqual(10, end_col)
-
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [18])
-    self.assertFalse(is_omitted)
-    self.assertEqual(5, row)
-    self.assertEqual(8, start_col)
-    self.assertEqual(10, end_col)
-
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [19])
-    self.assertFalse(is_omitted)
-    self.assertEqual(5, row)
-    self.assertEqual(13, start_col)
-    self.assertEqual(15, end_col)
+    self._checkTensorElementLocations(out, a)
 
     with self.assertRaisesRegexp(
         ValueError, "Indices exceed tensor dimensions"):
@@ -472,49 +391,11 @@ class RichTextLinesTest(test_util.TensorFlowTestCase):
     out = tensor_format.format_tensor(
         a, "a", np_printoptions={"linewidth": 40})
 
-    self.assertEqual([
-        "Tensor \"a\":",
-        "",
-        "array([ 0.,  0.,  0.,  0.,  0.,  0.,",
-        "        0.,  0.,  0.,  0.,  0.,  0.,",
-        "        0.,  0.,  0.,  0.,  0.,  0.,",
-        "        0.,  0.])",
-    ], out.lines)
-
-    (are_omitted, rows, start_cols,
-     end_cols) = tensor_format.locate_tensor_element(out, [[0]])
-    self.assertEqual([False], are_omitted)
-    self.assertEqual([2], rows)
-    self.assertEqual([8], start_cols)
-    self.assertEqual([10], end_cols)
-
-    (are_omitted, rows, start_cols,
-     end_cols) = tensor_format.locate_tensor_element(out, [[0], [5]])
-    self.assertEqual([False, False], are_omitted)
-    self.assertEqual([2, 2], rows)
-    self.assertEqual([8, 33], start_cols)
-    self.assertEqual([10, 35], end_cols)
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self, ["Tensor \"a\":", ""], out.lines[:2])
+    self.assertEqual(repr(a).split("\n"), out.lines[2:])
 
-    (are_omitted, rows, start_cols,
-     end_cols) = tensor_format.locate_tensor_element(out, [[0], [6]])
-    self.assertEqual([False, False], are_omitted)
-    self.assertEqual([2, 3], rows)
-    self.assertEqual([8, 8], start_cols)
-    self.assertEqual([10, 10], end_cols)
-
-    (are_omitted, rows, start_cols,
-     end_cols) = tensor_format.locate_tensor_element(out, [[0], [5], [6]])
-    self.assertEqual([False, False, False], are_omitted)
-    self.assertEqual([2, 2, 3], rows)
-    self.assertEqual([8, 33, 8], start_cols)
-    self.assertEqual([10, 35, 10], end_cols)
-
-    (are_omitted, rows, start_cols,
-     end_cols) = tensor_format.locate_tensor_element(out, [[0], [5], [6], [19]])
-    self.assertEqual([False, False, False, False], are_omitted)
-    self.assertEqual([2, 2, 3, 5], rows)
-    self.assertEqual([8, 33, 8, 13], start_cols)
-    self.assertEqual([10, 35, 10, 15], end_cols)
+    self._checkTensorElementLocations(out, a)
 
   def testBatchModeWithErrors(self):
     a = np.zeros(20)
@@ -522,14 +403,9 @@ class RichTextLinesTest(test_util.TensorFlowTestCase):
     out = tensor_format.format_tensor(
         a, "a", np_printoptions={"linewidth": 40})
 
-    self.assertEqual([
-        "Tensor \"a\":",
-        "",
-        "array([ 0.,  0.,  0.,  0.,  0.,  0.,",
-        "        0.,  0.,  0.,  0.,  0.,  0.,",
-        "        0.,  0.,  0.,  0.,  0.,  0.,",
-        "        0.,  0.])",
-    ], out.lines)
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self, ["Tensor \"a\":", ""], out.lines[:2])
+    self.assertEqual(repr(a).split("\n"), out.lines[2:])
 
     with self.assertRaisesRegexp(ValueError, "Dimensions mismatch"):
       tensor_format.locate_tensor_element(out, [[0, 0], [0]])
@@ -554,104 +430,22 @@ class RichTextLinesTest(test_util.TensorFlowTestCase):
     out = tensor_format.format_tensor(
         a, "a", np_printoptions={"linewidth": 100})
 
-    self.assertEqual([
-        "Tensor \"a\":",
-        "",
-        "array([[  1.00000000e-08,   1.00000000e-08,   1.00000000e-08],",
-        "       [             nan,   1.00000000e-08,              inf],",
-        "       [  1.00000000e-08,   1.00000000e-08,   1.00000000e-08]])",
-    ], out.lines)
-
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [0, 0])
-    self.assertFalse(is_omitted)
-    self.assertEqual(2, row)
-    self.assertEqual(10, start_col)
-    self.assertEqual(24, end_col)
-
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [0, 2])
-    self.assertFalse(is_omitted)
-    self.assertEqual(2, row)
-    self.assertEqual(46, start_col)
-    self.assertEqual(60, end_col)
-
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [1, 0])
-    self.assertFalse(is_omitted)
-    self.assertEqual(3, row)
-    self.assertEqual(21, start_col)
-    self.assertEqual(24, end_col)
-
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [1, 1])
-    self.assertFalse(is_omitted)
-    self.assertEqual(3, row)
-    self.assertEqual(28, start_col)
-    self.assertEqual(42, end_col)
-
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [1, 2])
-    self.assertFalse(is_omitted)
-    self.assertEqual(3, row)
-    self.assertEqual(57, start_col)
-    self.assertEqual(60, end_col)
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self, ["Tensor \"a\":", ""], out.lines[:2])
+    self.assertEqual(repr(a).split("\n"), out.lines[2:])
 
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [2, 2])
-    self.assertFalse(is_omitted)
-    self.assertEqual(4, row)
-    self.assertEqual(46, start_col)
-    self.assertEqual(60, end_col)
+    self._checkTensorElementLocations(out, a)
 
   def testLocateTensorElement2DNoEllipsis(self):
     a = np.linspace(0.0, 1.0 - 1.0 / 16.0, 16).reshape([4, 4])
 
     out = tensor_format.format_tensor(a, "a")
 
-    self.assertEqual([
-        "Tensor \"a\":",
-        "",
-        "array([[ 0.    ,  0.0625,  0.125 ,  0.1875],",
-        "       [ 0.25  ,  0.3125,  0.375 ,  0.4375],",
-        "       [ 0.5   ,  0.5625,  0.625 ,  0.6875],",
-        "       [ 0.75  ,  0.8125,  0.875 ,  0.9375]])",
-    ], out.lines)
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self, ["Tensor \"a\":", ""], out.lines[:2])
+    self.assertEqual(repr(a).split("\n"), out.lines[2:])
 
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [0, 0])
-    self.assertFalse(is_omitted)
-    self.assertEqual(2, row)
-    self.assertEqual(9, start_col)
-    self.assertEqual(11, end_col)
-
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [0, 3])
-    self.assertFalse(is_omitted)
-    self.assertEqual(2, row)
-    self.assertEqual(36, start_col)
-    self.assertEqual(42, end_col)
-
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [1, 0])
-    self.assertFalse(is_omitted)
-    self.assertEqual(3, row)
-    self.assertEqual(9, start_col)
-    self.assertEqual(13, end_col)
-
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [1, 3])
-    self.assertFalse(is_omitted)
-    self.assertEqual(3, row)
-    self.assertEqual(36, start_col)
-    self.assertEqual(42, end_col)
-
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [3, 3])
-    self.assertFalse(is_omitted)
-    self.assertEqual(5, row)
-    self.assertEqual(36, start_col)
-    self.assertEqual(42, end_col)
+    self._checkTensorElementLocations(out, a)
 
     with self.assertRaisesRegexp(
         ValueError, "Indices exceed tensor dimensions"):
@@ -670,55 +464,20 @@ class RichTextLinesTest(test_util.TensorFlowTestCase):
 
     out = tensor_format.format_tensor(a, "a", include_numeric_summary=True)
 
-    self.assertEqual([
-        "Tensor \"a\":",
-        "",
-        "Numeric summary:",
-        "|  0  + | total |",
-        "|  1 15 |    16 |",
-        "|           min           max          mean           std |",
-        "|           0.0        0.9375       0.46875 0.28811076429 |",
-        "",
-        "array([[ 0.    ,  0.0625,  0.125 ,  0.1875],",
-        "       [ 0.25  ,  0.3125,  0.375 ,  0.4375],",
-        "       [ 0.5   ,  0.5625,  0.625 ,  0.6875],",
-        "       [ 0.75  ,  0.8125,  0.875 ,  0.9375]])",
-    ], out.lines)
-
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [0, 0])
-    self.assertFalse(is_omitted)
-    self.assertEqual(8, row)
-    self.assertEqual(9, start_col)
-    self.assertEqual(11, end_col)
-
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [0, 3])
-    self.assertFalse(is_omitted)
-    self.assertEqual(8, row)
-    self.assertEqual(36, start_col)
-    self.assertEqual(42, end_col)
-
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [1, 0])
-    self.assertFalse(is_omitted)
-    self.assertEqual(9, row)
-    self.assertEqual(9, start_col)
-    self.assertEqual(13, end_col)
-
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [1, 3])
-    self.assertFalse(is_omitted)
-    self.assertEqual(9, row)
-    self.assertEqual(36, start_col)
-    self.assertEqual(42, end_col)
-
-    is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
-        out, [3, 3])
-    self.assertFalse(is_omitted)
-    self.assertEqual(11, row)
-    self.assertEqual(36, start_col)
-    self.assertEqual(42, end_col)
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self,
+        ["Tensor \"a\":",
+         "",
+         "Numeric summary:",
+         "|  0  + | total |",
+         "|  1 15 |    16 |",
+         "|           min           max          mean           std |"],
+        out.lines[:6])
+    cli_test_utils.assert_array_lines_close(
+        self, [0.0, 0.9375, 0.46875, 0.28811076429], out.lines[6:7])
+    cli_test_utils.assert_array_lines_close(self, a, out.lines[8:])
+
+    self._checkTensorElementLocations(out, a)
 
     with self.assertRaisesRegexp(
         ValueError, "Indices exceed tensor dimensions"):
@@ -733,100 +492,75 @@ class RichTextLinesTest(test_util.TensorFlowTestCase):
       tensor_format.locate_tensor_element(out, [0])
 
   def testLocateTensorElement3DWithEllipses(self):
-    a = np.zeros([11, 11, 11])
+    a = (np.arange(11 * 11 * 11) + 1000).reshape([11, 11, 11]).astype(np.int32)
 
     out = tensor_format.format_tensor(
         a, "a", False, np_printoptions={"threshold": 100, "edgeitems": 2})
 
-    self.assertEqual([
-        "Tensor \"a\":",
-        "",
-        "array([[[ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        ..., ",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.]],",
-        "",
-        "       [[ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        ..., ",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.]],",
-        "",
-        "       ..., ",
-        "       [[ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        ..., ",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.]],",
-        "",
-        "       [[ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        ..., ",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.]]])",
-    ], out.lines)
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self, ["Tensor \"a\":", ""], out.lines[:2])
 
+    actual_row_0_0_0, actual_col_0_0_0 = self._findFirst(out.lines, "1000")
     is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
         out, [0, 0, 0])
     self.assertFalse(is_omitted)
-    self.assertEqual(2, row)
-    self.assertEqual(10, start_col)
-    self.assertEqual(12, end_col)
+    self.assertEqual(actual_row_0_0_0, row)
+    self.assertEqual(actual_col_0_0_0, start_col)
+    self.assertEqual(actual_col_0_0_0 + 4, end_col)
 
+    actual_row_0_0_10, _ = self._findFirst(out.lines, "1010")
     is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
         out, [0, 0, 10])
     self.assertFalse(is_omitted)
-    self.assertEqual(2, row)
+    self.assertEqual(actual_row_0_0_10, row)
     self.assertIsNone(start_col)  # Passes ellipsis.
     self.assertIsNone(end_col)
 
+    actual_row_0_1_0, actual_col_0_1_0 = self._findFirst(out.lines, "1011")
     is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
         out, [0, 1, 0])
     self.assertFalse(is_omitted)
-    self.assertEqual(3, row)
-    self.assertEqual(10, start_col)
-    self.assertEqual(12, end_col)
+    self.assertEqual(actual_row_0_1_0, row)
+    self.assertEqual(actual_col_0_1_0, start_col)
+    self.assertEqual(actual_col_0_1_0 + 4, end_col)
 
     is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
         out, [0, 2, 0])
     self.assertTrue(is_omitted)  # In omitted line.
-    self.assertEqual(4, row)
     self.assertIsNone(start_col)
     self.assertIsNone(end_col)
 
     is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
         out, [0, 2, 10])
     self.assertTrue(is_omitted)  # In omitted line.
-    self.assertEqual(4, row)
     self.assertIsNone(start_col)
     self.assertIsNone(end_col)
 
     is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
         out, [0, 8, 10])
     self.assertTrue(is_omitted)  # In omitted line.
-    self.assertEqual(4, row)
     self.assertIsNone(start_col)
     self.assertIsNone(end_col)
 
+    actual_row_0_10_1, actual_col_0_10_1 = self._findFirst(out.lines, "1111")
     is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
         out, [0, 10, 1])
     self.assertFalse(is_omitted)
-    self.assertEqual(6, row)
-    self.assertEqual(15, start_col)
-    self.assertEqual(17, end_col)
+    self.assertEqual(actual_row_0_10_1, row)
+    self.assertEqual(actual_col_0_10_1, start_col)
+    self.assertEqual(actual_col_0_10_1 + 4, end_col)
 
     is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
         out, [5, 1, 1])
     self.assertTrue(is_omitted)  # In omitted line.
-    self.assertEqual(14, row)
     self.assertIsNone(start_col)
     self.assertIsNone(end_col)
 
+    actual_row_10_10_10, _ = self._findFirst(out.lines, "2330")
     is_omitted, row, start_col, end_col = tensor_format.locate_tensor_element(
         out, [10, 10, 10])
     self.assertFalse(is_omitted)
-    self.assertEqual(25, row)
+    self.assertEqual(actual_row_10_10_10, row)
     self.assertIsNone(start_col)  # Past ellipsis.
     self.assertIsNone(end_col)
 
@@ -843,71 +577,50 @@ class RichTextLinesTest(test_util.TensorFlowTestCase):
       tensor_format.locate_tensor_element(out, [5, 5])
 
   def testLocateTensorElement3DWithEllipsesBatchMode(self):
-    a = np.zeros([11, 11, 11])
+    a = (np.arange(11 * 11 * 11) + 1000).reshape([11, 11, 11]).astype(np.int32)
 
     out = tensor_format.format_tensor(
         a, "a", False, np_printoptions={"threshold": 100,
                                         "edgeitems": 2})
 
-    self.assertEqual([
-        "Tensor \"a\":",
-        "",
-        "array([[[ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        ..., ",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.]],",
-        "",
-        "       [[ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        ..., ",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.]],",
-        "",
-        "       ..., ",
-        "       [[ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        ..., ",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.]],",
-        "",
-        "       [[ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        ..., ",
-        "        [ 0.,  0., ...,  0.,  0.],",
-        "        [ 0.,  0., ...,  0.,  0.]]])",
-    ], out.lines)
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self, ["Tensor \"a\":", ""], out.lines[:2])
+    self.assertEqual(repr(a).split("\n"), out.lines[2:])
+
+    actual_row_0_0_0, actual_col_0_0_0 = self._findFirst(out.lines, "1000")
+    actual_row_0_0_10, _ = self._findFirst(out.lines, "1010")
+    actual_row_10_10_10, _ = self._findFirst(out.lines, "2330")
 
     (are_omitted, rows, start_cols,
      end_cols) = tensor_format.locate_tensor_element(out, [[0, 0, 0]])
     self.assertEqual([False], are_omitted)
-    self.assertEqual([2], rows)
-    self.assertEqual([10], start_cols)
-    self.assertEqual([12], end_cols)
+    self.assertEqual([actual_row_0_0_0], rows)
+    self.assertEqual([actual_col_0_0_0], start_cols)
+    self.assertEqual([actual_col_0_0_0 + 4], end_cols)
 
     (are_omitted, rows, start_cols,
      end_cols) = tensor_format.locate_tensor_element(out,
                                                      [[0, 0, 0], [0, 0, 10]])
     self.assertEqual([False, False], are_omitted)
-    self.assertEqual([2, 2], rows)
-    self.assertEqual([10, None], start_cols)
-    self.assertEqual([12, None], end_cols)
+    self.assertEqual([actual_row_0_0_0, actual_row_0_0_10], rows)
+    self.assertEqual([actual_col_0_0_0, None], start_cols)
+    self.assertEqual([actual_col_0_0_0 + 4, None], end_cols)
 
     (are_omitted, rows, start_cols,
      end_cols) = tensor_format.locate_tensor_element(out,
                                                      [[0, 0, 0], [0, 2, 0]])
     self.assertEqual([False, True], are_omitted)
     self.assertEqual([2, 4], rows)
-    self.assertEqual([10, None], start_cols)
-    self.assertEqual([12, None], end_cols)
+    self.assertEqual(2, len(start_cols))
+    self.assertEqual(2, len(end_cols))
 
     (are_omitted, rows, start_cols,
      end_cols) = tensor_format.locate_tensor_element(out,
                                                      [[0, 0, 0], [10, 10, 10]])
     self.assertEqual([False, False], are_omitted)
-    self.assertEqual([2, 25], rows)
-    self.assertEqual([10, None], start_cols)
-    self.assertEqual([12, None], end_cols)
+    self.assertEqual([actual_row_0_0_0, actual_row_10_10_10], rows)
+    self.assertEqual([actual_col_0_0_0, None], start_cols)
+    self.assertEqual([actual_col_0_0_0 + 4, None], end_cols)
 
   def testLocateTensorElementAnnotationsUnavailable(self):
     tensor_proto = tensor_pb2.TensorProto(
@@ -931,41 +644,41 @@ class NumericSummaryTest(test_util.TensorFlowTestCase):
     x = np.array([np.nan, np.nan, -np.inf, np.inf, np.inf, np.inf, -2, -3, -4,
                   0, 1, 2, 2, 2, 2, 0, 0, 0, np.inf, np.inf, np.inf])
     out = tensor_format.numeric_summary(x)
-    self.assertEqual(
-        "|  nan -inf    -    0    + +inf | total |", out.lines[0])
-    self.assertEqual(
-        "|    2    1    3    4    5    6 |    21 |", out.lines[1])
-    self.assertEqual(
-        "|           min           max          mean           std |",
-        out.lines[2])
-    self.assertEqual(
-        "|          -4.0           2.0           0.0 1.95789002075 |",
-        out.lines[3])
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self,
+        ["|  nan -inf    -    0    + +inf | total |",
+         "|    2    1    3    4    5    6 |    21 |",
+         "|     min     max    mean    std |"], out.lines[:3])
+    cli_test_utils.assert_array_lines_close(
+        self, [-4.0, 2.0, 0.0, 1.95789002075], out.lines[3:4])
 
   def testNumericSummaryOnFloatMissingCategories(self):
     x = np.array([np.nan, np.nan])
     out = tensor_format.numeric_summary(x)
     self.assertEqual(2, len(out.lines))
-    self.assertEqual("| nan | total |", out.lines[0])
-    self.assertEqual("|   2 |     2 |", out.lines[1])
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self, ["| nan | total |", "|   2 |     2 |"], out.lines[:2])
 
     x = np.array([-np.inf, np.inf, 0, 0, np.inf, np.inf])
     out = tensor_format.numeric_summary(x)
-    self.assertEqual("| -inf    0 +inf | total |", out.lines[0])
-    self.assertEqual("|    1    2    3 |     6 |", out.lines[1])
-    self.assertEqual("|  min  max mean  std |", out.lines[2])
-    self.assertEqual("|  0.0  0.0  0.0  0.0 |", out.lines[3])
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self,
+        ["| -inf    0 +inf | total |",
+         "|    1    2    3 |     6 |",
+         "|  min  max mean  std |"], out.lines[:3])
+    cli_test_utils.assert_array_lines_close(
+        self, [0.0, 0.0, 0.0, 0.0], out.lines[3:4])
 
     x = np.array([-120, 120, 130])
     out = tensor_format.numeric_summary(x)
-    self.assertEqual("| - + | total |", out.lines[0])
-    self.assertEqual("| 1 2 |     3 |", out.lines[1])
-    self.assertEqual(
-        "|           min           max          mean           std |",
-        out.lines[2])
-    self.assertEqual(
-        "|          -120           130 43.3333333333 115.566238822 |",
-        out.lines[3])
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self,
+        ["| - + | total |",
+         "| 1 2 |     3 |",
+         "|       min       max     mean      std |"],
+        out.lines[:3])
+    cli_test_utils.assert_array_lines_close(
+        self, [-120, 130, 43.3333333333, 115.566238822], out.lines[3:4])
 
   def testNumericSummaryOnEmptyFloat(self):
     x = np.array([], dtype=np.float32)
@@ -976,33 +689,31 @@ class NumericSummaryTest(test_util.TensorFlowTestCase):
   def testNumericSummaryOnInt(self):
     x = np.array([-3] * 50 + [3] * 200 + [0], dtype=np.int32)
     out = tensor_format.numeric_summary(x)
-    self.assertEqual("|   -   0   + | total |", out.lines[0])
-    self.assertEqual("|  50   1 200 |   251 |", out.lines[1])
-    self.assertEqual(
-        "|           min           max          mean           std |",
-        out.lines[2])
-    self.assertEqual(
-        "|            -3             3 1.79282868526 2.39789673081 |",
-        out.lines[3])
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self,
+        ["|   -   0   + | total |",
+         "|  50   1 200 |   251 |",
+         "|      min     max    mean     std |"],
+        out.lines[:3])
+    cli_test_utils.assert_array_lines_close(
+        self, [-3, 3, 1.79282868526, 2.39789673081], out.lines[3:4])
 
   def testNumericSummaryOnBool(self):
     x = np.array([False, True, True, False], dtype=np.bool)
     out = tensor_format.numeric_summary(x)
-    self.assertEqual(2, len(out.lines))
-    self.assertEqual("| False  True | total |", out.lines[0])
-    self.assertEqual("|     2     2 |     4 |", out.lines[1])
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self,
+        ["| False  True | total |", "|     2     2 |     4 |"], out.lines)
 
     x = np.array([True] * 10, dtype=np.bool)
     out = tensor_format.numeric_summary(x)
-    self.assertEqual(2, len(out.lines))
-    self.assertEqual("| True | total |", out.lines[0])
-    self.assertEqual("|   10 |    10 |", out.lines[1])
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self, ["| True | total |", "|   10 |    10 |"], out.lines)
 
     x = np.array([False] * 10, dtype=np.bool)
     out = tensor_format.numeric_summary(x)
-    self.assertEqual(2, len(out.lines))
-    self.assertEqual("| False | total |", out.lines[0])
-    self.assertEqual("|    10 |    10 |", out.lines[1])
+    cli_test_utils.assert_lines_equal_ignoring_whitespace(
+        self, ["| False | total |", "|    10 |    10 |"], out.lines)
 
     x = np.array([], dtype=np.bool)
     out = tensor_format.numeric_summary(x)
diff --git a/tensorflow/python/debug/examples/debug_fibonacci.py b/tensorflow/python/debug/examples/debug_fibonacci.py
index 704dbda357d1208d0663da41eb7aef4b299dedb8..3821b393ec6847db71b7c4b7396b1ed448ae9538 100644
--- a/tensorflow/python/debug/examples/debug_fibonacci.py
+++ b/tensorflow/python/debug/examples/debug_fibonacci.py
@@ -44,6 +44,10 @@ def main(_):
   sess.run(tf.global_variables_initializer())
 
   # Wrap the TensorFlow Session object for debugging.
+  if FLAGS.debug and FLAGS.tensorboard_debug_address:
+    raise ValueError(
+        "The --debug and --tensorboard_debug_address flags are mutually "
+        "exclusive.")
   if FLAGS.debug:
     sess = tf_debug.LocalCLIDebugWrapperSession(sess)
 
@@ -52,6 +56,9 @@ def main(_):
 
     sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
     sess.add_tensor_filter("has_negative", has_negative)
+  elif FLAGS.tensorboard_debug_address:
+    sess = tf_debug.TensorBoardDebugWrapperSession(
+        sess, FLAGS.tensorboard_debug_address)
 
   print("Fibonacci number at position %d:\n%s" %
         (FLAGS.length, sess.run(n1)))
@@ -82,7 +89,15 @@ if __name__ == "__main__":
       "--debug",
       dest="debug",
       action="store_true",
-      help="Use TensorFlow Debugger (tfdbg).")
+      help="Use TensorFlow Debugger (tfdbg). Mutually exclusive with the "
+      "--tensorboard_debug_address flag.")
+  parser.add_argument(
+      "--tensorboard_debug_address",
+      type=str,
+      default=None,
+      help="Connect to the TensorBoard Debugger Plugin backend specified by "
+      "the gRPC address (e.g., localhost:1234). Mutually exclusive with the "
+      "--debug flag.")
 
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/debug/examples/debug_mnist.py b/tensorflow/python/debug/examples/debug_mnist.py
index 0a6dbf311d8e7a0377363d74b57ef2b1d7d00e1d..ab1c90371cd18bbaf278b72248bcc7e9e9c34b06 100644
--- a/tensorflow/python/debug/examples/debug_mnist.py
+++ b/tensorflow/python/debug/examples/debug_mnist.py
@@ -120,8 +120,15 @@ def main(_):
 
   sess.run(tf.global_variables_initializer())
 
+  if FLAGS.debug and FLAGS.tensorboard_debug_address:
+    raise ValueError(
+        "The --debug and --tensorboard_debug_address flags are mutually "
+        "exclusive.")
   if FLAGS.debug:
     sess = tf_debug.LocalCLIDebugWrapperSession(sess, ui_type=FLAGS.ui_type)
+  elif FLAGS.tensorboard_debug_address:
+    sess = tf_debug.TensorBoardDebugWrapperSession(
+        sess, FLAGS.tensorboard_debug_address)
 
   # Add this point, sess is a debug wrapper around the actual Session if
   # FLAGS.debug is true. In that case, calling run() will launch the CLI.
@@ -173,6 +180,14 @@ if __name__ == "__main__":
       nargs="?",
       const=True,
       default=False,
-      help="Use debugger to track down bad values during training")
+      help="Use debugger to track down bad values during training. "
+      "Mutually exclusive with the --tensorboard_debug_address flag.")
+  parser.add_argument(
+      "--tensorboard_debug_address",
+      type=str,
+      default=None,
+      help="Connect to the TensorBoard Debugger Plugin backend specified by "
+      "the gRPC address (e.g., localhost:1234). Mutually exclusive with the "
+      "--debug flag.")
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/debug/examples/debug_tflearn_iris.py b/tensorflow/python/debug/examples/debug_tflearn_iris.py
index 92314d8dd9f64f48ffe0bc921f99a4661c4c0e93..4f4666ee4fa51ef085d31ee8396dffaf9e38f49e 100644
--- a/tensorflow/python/debug/examples/debug_tflearn_iris.py
+++ b/tensorflow/python/debug/examples/debug_tflearn_iris.py
@@ -110,10 +110,16 @@ def main(_):
       model_dir=model_dir)
 
   hooks = None
+  if FLAGS.debug and FLAGS.tensorboard_debug_address:
+    raise ValueError(
+        "The --debug and --tensorboard_debug_address flags are mutually "
+        "exclusive.")
   if FLAGS.debug:
     debug_hook = tf_debug.LocalCLIDebugHook(ui_type=FLAGS.ui_type,
                                             dump_root=FLAGS.dump_root)
-    hooks = [debug_hook]
+  elif FLAGS.tensorboard_debug_address:
+    debug_hook = tf_debug.TensorBoardDebugHook(FLAGS.tensorboard_debug_address)
+  hooks = [debug_hook]
 
   if not FLAGS.use_experiment:
     # Fit model.
@@ -185,11 +191,19 @@ if __name__ == "__main__":
       nargs="?",
       const=True,
       default=False,
-      help="Use debugger to track down bad values during training")
+      help="Use debugger to track down bad values during training. "
+      "Mutually exclusive with the --tensorboard_debug_address flag.")
   parser.add_argument(
       "--dump_root",
       type=str,
       default="",
       help="Optional custom root directory for temporary debug dump data")
+  parser.add_argument(
+      "--tensorboard_debug_address",
+      type=str,
+      default=None,
+      help="Connect to the TensorBoard Debugger Plugin backend specified by "
+      "the gRPC address (e.g., localhost:1234). Mutually exclusive with the "
+      "--debug flag.")
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/debug/examples/examples_test.sh b/tensorflow/python/debug/examples/examples_test.sh
index 25916f1903cd41c7f714fd0eb7bad0329dde8ceb..2df6c0b6a2701022e3fed6648208b9708197bebc 100755
--- a/tensorflow/python/debug/examples/examples_test.sh
+++ b/tensorflow/python/debug/examples/examples_test.sh
@@ -23,6 +23,9 @@
 
 set -e
 
+# Filter out LOG(INFO)
+export TF_CPP_MIN_LOG_LEVEL=1
+
 IS_VIRTUALENV=0
 PYTHON_BIN_PATH=""
 while true; do
diff --git a/tensorflow/python/debug/lib/common.py b/tensorflow/python/debug/lib/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..19a0d8c5010021c02de26f5b401fea10c7563a58
--- /dev/null
+++ b/tensorflow/python/debug/lib/common.py
@@ -0,0 +1,87 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Common values and methods for TensorFlow Debugger."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import json
+
+GRPC_URL_PREFIX = "grpc://"
+
+# A key for a Session.run() call.
+RunKey = collections.namedtuple("RunKey", ["feed_names", "fetch_names"])
+
+
+def get_graph_element_name(elem):
+  """Obtain the name or string representation of a graph element.
+
+  If the graph element has the attribute "name", return name. Otherwise, return
+  a __str__ representation of the graph element. Certain graph elements, such as
+  `SparseTensor`s, do not have the attribute "name".
+
+  Args:
+    elem: The graph element in question.
+
+  Returns:
+    If the attribute 'name' is available, return the name. Otherwise, return
+    str(fetch).
+  """
+
+  return elem.name if hasattr(elem, "name") else str(elem)
+
+
+def get_flattened_names(feeds_or_fetches):
+  """Get a flattened list of the names in run() call feeds or fetches.
+
+  Args:
+    feeds_or_fetches: Feeds or fetches of the `Session.run()` call. It maybe
+      a Tensor, an Operation or a Variable. It may also be nested lists, tuples
+      or dicts. See doc of `Session.run()` for more details.
+
+  Returns:
+    (list of str) A flattened list of fetch names from `feeds_or_fetches`.
+  """
+
+  lines = []
+  if isinstance(feeds_or_fetches, (list, tuple)):
+    for item in feeds_or_fetches:
+      lines.extend(get_flattened_names(item))
+  elif isinstance(feeds_or_fetches, dict):
+    for key in feeds_or_fetches:
+      lines.extend(get_flattened_names(feeds_or_fetches[key]))
+  else:
+    # This ought to be a Tensor, an Operation or a Variable, for which the name
+    # attribute should be available. (Bottom-out condition of the recursion.)
+    lines.append(get_graph_element_name(feeds_or_fetches))
+
+  return lines
+
+
+def get_run_key(feed_dict, fetches):
+  """Summarize the names of feeds and fetches as a RunKey JSON string.
+
+  Args:
+    feed_dict: The feed_dict given to the `Session.run()` call.
+    fetches: The fetches from the `Session.run()` call.
+
+  Returns:
+    A JSON Array consisting of two items. They first items is a flattened
+    Array of the names of the feeds. The second item is a flattened Array of
+    the names of the fetches.
+  """
+  return json.dumps(RunKey(get_flattened_names(feed_dict),
+                           get_flattened_names(fetches)))
diff --git a/tensorflow/python/debug/lib/common_test.py b/tensorflow/python/debug/lib/common_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5af0dafcf9fd81763b30eb159a3e21ef8b7f9ac9
--- /dev/null
+++ b/tensorflow/python/debug/lib/common_test.py
@@ -0,0 +1,59 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit tests for common values and methods of TensorFlow Debugger."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+
+from tensorflow.python.debug.lib import common
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import googletest
+
+
+class CommonTest(test_util.TensorFlowTestCase):
+
+  def testOnFeedOneFetch(self):
+    a = constant_op.constant(10.0, name="a")
+    b = constant_op.constant(20.0, name="b")
+    run_key = common.get_run_key({"a": a}, [b])
+    loaded = json.loads(run_key)
+    self.assertItemsEqual(["a:0"], loaded[0])
+    self.assertItemsEqual(["b:0"], loaded[1])
+
+  def testGetRunKeyFlat(self):
+    a = constant_op.constant(10.0, name="a")
+    b = constant_op.constant(20.0, name="b")
+    run_key = common.get_run_key({"a": a}, [a, b])
+    loaded = json.loads(run_key)
+    self.assertItemsEqual(["a:0"], loaded[0])
+    self.assertItemsEqual(["a:0", "b:0"], loaded[1])
+
+  def testGetRunKeyNestedFetches(self):
+    a = constant_op.constant(10.0, name="a")
+    b = constant_op.constant(20.0, name="b")
+    c = constant_op.constant(30.0, name="c")
+    d = constant_op.constant(30.0, name="d")
+    run_key = common.get_run_key(
+        {}, {"set1": [a, b], "set2": {"c": c, "d": d}})
+    loaded = json.loads(run_key)
+    self.assertItemsEqual([], loaded[0])
+    self.assertItemsEqual(["a:0", "b:0", "c:0", "d:0"], loaded[1])
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/python/debug/lib/debug_data.py b/tensorflow/python/debug/lib/debug_data.py
index c4b13a1045dac4966b0e841155a2932216881d34..8d355aa27f6fa10a1889420a9087800be12a81ce 100644
--- a/tensorflow/python/debug/lib/debug_data.py
+++ b/tensorflow/python/debug/lib/debug_data.py
@@ -222,7 +222,7 @@ def has_inf_or_nan(datum, tensor):
     # Also return False for data types that cannot be represented as numpy
     # arrays.
     return False
-  elif (np.issubdtype(tensor.dtype, np.float) or
+  elif (np.issubdtype(tensor.dtype, np.floating) or
         np.issubdtype(tensor.dtype, np.complex) or
         np.issubdtype(tensor.dtype, np.integer)):
     return np.any(np.isnan(tensor)) or np.any(np.isinf(tensor))
diff --git a/tensorflow/python/debug/lib/debug_gradients.py b/tensorflow/python/debug/lib/debug_gradients.py
index b01a58719cb45b3a42052e0f3522f39a7c5c63c5..16f51a4b32f711b97077643cec669bb8970e0b21 100644
--- a/tensorflow/python/debug/lib/debug_gradients.py
+++ b/tensorflow/python/debug/lib/debug_gradients.py
@@ -156,9 +156,12 @@ class GradientsDebugger(object):
     # TODO(cais): Implement value_stack.
     grad_debug_op_name = _tensor_to_grad_debug_op_name(input_tensor, self._uuid)
     # pylint: disable=protected-access
-    debug_grad_identity = gen_array_ops._debug_gradient_identity(
-        input_tensor, name=grad_debug_op_name)
+    identity_op = (gen_array_ops._debug_gradient_ref_identity
+                   if input_tensor.dtype._is_ref_dtype
+                   else gen_array_ops._debug_gradient_identity)
+    debug_grad_identity = identity_op(input_tensor, name=grad_debug_op_name)
     # pylint: enable=protected-access
+    assert debug_grad_identity.dtype == input_tensor.dtype
     if debug_grad_identity.op.name != grad_debug_op_name:
       raise ValueError(
           "The graph already contains an op named %s" % grad_debug_op_name)
@@ -261,32 +264,22 @@ class GradientsDebugger(object):
       The GradientsDebugger instance itself.
     """
     tensor_name_pattern = re.compile(tensor_name_regex)
-
-    # pylint: disable=protected-access
     with graph.as_default():
       for op in graph.get_operations():
         for output in op.outputs:
           if tensor_name_pattern.match(output.name):
             debug_op = self.identify_gradient(output)
 
-            for consumer in output.consumers():
+            # Make a copy of output.consumers() since we'll modify the consumers
+            # TODO(skyewm): this is unnecessary once the C API is enabled
+            for consumer in list(output.consumers()):
               if consumer == debug_op.op:
                 continue
 
               # Locate the slot index of the original input.
-              input_slots = []
-              for i, consumer_input in enumerate(consumer._inputs):
+              for i, consumer_input in enumerate(consumer.inputs):
                 if consumer_input == output:
-                  input_slots.append(i)
-
-              for slot in input_slots:
-                consumer._inputs[slot] = debug_op
-                debug_op._consumers.append(consumer)
-
-            del output._consumers[:]
-            output._consumers.append(debug_op.op)
-    # pylint: enable=protected-access
-
+                  consumer._update_input(i, debug_op)  # pylint: disable=protected-access
     return self
 
   def _check_same_graph(self, tensor):
@@ -369,6 +362,12 @@ def _identify_gradient_grad(op, dy):
   return dy
 
 
+@ops.RegisterGradient("DebugGradientRefIdentity")
+def _identify_gradient_grad_ref(op, dy):
+  """Gradient function for the DebugIdentity op."""
+  return _identify_gradient_grad(op, dy)
+
+
 def gradient_values_from_dump(grad_debugger, x_tensor, dump):
   """Find gradient values from a `DebugDumpDir` object.
 
diff --git a/tensorflow/python/debug/lib/debug_gradients_test.py b/tensorflow/python/debug/lib/debug_gradients_test.py
index 966578320e22caba28344248cbc0562fdc3dfee2..01867fc69d0782b34edb1e8eb873b19f5dfc8529 100644
--- a/tensorflow/python/debug/lib/debug_gradients_test.py
+++ b/tensorflow/python/debug/lib/debug_gradients_test.py
@@ -22,6 +22,7 @@ import shutil
 import tempfile
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.debug.lib import debug_data
 from tensorflow.python.debug.lib import debug_gradients
@@ -38,8 +39,13 @@ from tensorflow.python.training import gradient_descent
 class IdentifyGradientTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
-    self.sess = session.Session()
-    with self.sess:
+    rewriter_config = rewriter_config_pb2.RewriterConfig(
+        disable_model_pruning=True,
+        dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF)
+    graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
+    config = config_pb2.ConfigProto(graph_options=graph_options)
+    self.sess = session.Session(config=config)
+    with self.sess.as_default():
       self.u = variables.Variable(2.0, name="u")
       self.v = variables.Variable(3.0, name="v")
       self.w = math_ops.multiply(self.u.value(), self.v.value(), name="w")
@@ -112,8 +118,8 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
   def testCallingIdentifyGradientTwiceWithTheSameGradientsDebuggerErrors(self):
     grad_debugger = debug_gradients.GradientsDebugger()
     grad_debugger.identify_gradient(self.w)
-    with self.assertRaisesRegexp(
-        ValueError, "The graph already contains an op named .*"):
+    with self.assertRaisesRegexp(ValueError,
+                                 "The graph already contains an op named .*"):
       grad_debugger.identify_gradient(self.w)
 
   def testIdentifyGradientWorksOnMultipleLosses(self):
@@ -139,10 +145,10 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     self.assertIsNot(dz1_dy, dz2_dy)
 
     self.sess.run(variables.global_variables_initializer())
-    self.assertAllClose(5.0 ** 2, self.sess.run(z1))
-    self.assertAllClose(5.0 ** 0.5, self.sess.run(z2))
+    self.assertAllClose(5.0**2, self.sess.run(z1))
+    self.assertAllClose(5.0**0.5, self.sess.run(z2))
     self.assertAllClose(2.0 * 5.0, self.sess.run(dz1_dy))
-    self.assertAllClose(0.5 * (5.0 ** -0.5), self.sess.run(dz2_dy))
+    self.assertAllClose(0.5 * (5.0**-0.5), self.sess.run(dz2_dy))
 
   def testIdentifyGradientRaisesLookupErrorForUnknownXTensor(self):
     grad_debugger_1 = debug_gradients.GradientsDebugger()
@@ -254,8 +260,8 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     self.sess.run(variables.global_variables_initializer())
     self.assertAllClose(3.0, self.sess.run(u_grad))
     self.assertAllClose(2.0, self.sess.run(v_grad))
-    self.assertAllClose(
-        3.0, self.sess.run(grad_debugger.gradient_tensor("u:0")))
+    self.assertAllClose(3.0, self.sess.run(
+        grad_debugger.gradient_tensor("u:0")))
 
   def testWatchGradientsWorksOnMultipleTensors(self):
     y = math_ops.add(self.w, -1.0, name="y")
@@ -272,10 +278,10 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     self.assertIsInstance(grad_debugger.gradient_tensor("w:0"), ops.Tensor)
 
     self.sess.run(variables.global_variables_initializer())
-    self.assertAllClose(
-        1.0, self.sess.run(grad_debugger.gradient_tensor("w:0")))
-    self.assertAllClose(
-        3.0, self.sess.run(grad_debugger.gradient_tensor("u:0")))
+    self.assertAllClose(1.0, self.sess.run(
+        grad_debugger.gradient_tensor("w:0")))
+    self.assertAllClose(3.0, self.sess.run(
+        grad_debugger.gradient_tensor("u:0")))
 
   def testWatchGradientsByXTensorsWorks(self):
     y = math_ops.add(self.w, -1.0, name="foo/y")
@@ -285,8 +291,8 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     # But we can still get the gradient tensors by using
     # watch_gradients_by_x_tensors().
     grad_debugger = debug_gradients.GradientsDebugger()
-    with grad_debugger.watch_gradients_by_tensors(
-        self.sess.graph, [self.w, self.u, y]):
+    with grad_debugger.watch_gradients_by_tensors(self.sess.graph,
+                                                  [self.w, self.u, y]):
       gradient_descent.GradientDescentOptimizer(0.1).minimize(z)
 
     self.assertEqual(3, len(grad_debugger.gradient_tensors()))
@@ -319,18 +325,18 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     self.assertIsNot(dz1_dy, dz2_dy)
 
     self.sess.run(variables.global_variables_initializer())
-    self.assertAllClose(5.0 ** 2, self.sess.run(z1))
-    self.assertAllClose(5.0 ** 0.5, self.sess.run(z2))
+    self.assertAllClose(5.0**2, self.sess.run(z1))
+    self.assertAllClose(5.0**0.5, self.sess.run(z2))
     self.assertAllClose(2.0 * 5.0, self.sess.run(dz1_dy))
-    self.assertAllClose(0.5 * (5.0 ** -0.5), self.sess.run(dz2_dy))
+    self.assertAllClose(0.5 * (5.0**-0.5), self.sess.run(dz2_dy))
 
   def testGradientsValuesFromDumpWorks(self):
     y = math_ops.add(self.w, -1.0, name="y")
     z = math_ops.square(y, name="z")
 
     grad_debugger = debug_gradients.GradientsDebugger()
-    with grad_debugger.watch_gradients_by_tensors(
-        self.sess.graph, [self.w, self.u, y]):
+    with grad_debugger.watch_gradients_by_tensors(self.sess.graph,
+                                                  [self.w, self.u, y]):
       train_op = gradient_descent.GradientDescentOptimizer(0.1).minimize(z)
 
     self.sess.run(variables.global_variables_initializer())
@@ -338,12 +344,11 @@ class IdentifyGradientTest(test_util.TensorFlowTestCase):
     run_options = config_pb2.RunOptions(output_partition_graphs=True)
     dump_dir = tempfile.mkdtemp()
     debug_url = "file://" + dump_dir
-    debug_utils.watch_graph(
-        run_options,
-        self.sess.graph,
-        debug_urls=debug_url)
+    debug_utils.watch_graph(run_options, self.sess.graph, debug_urls=debug_url)
     run_metadata = config_pb2.RunMetadata()
+    self.assertAllClose(2.0, self.sess.run(self.u))
     self.sess.run(train_op, options=run_options, run_metadata=run_metadata)
+    self.assertAllClose(-1.0, self.sess.run(self.u))
 
     dump = debug_data.DebugDumpDir(
         dump_dir, partition_graphs=run_metadata.partition_graphs)
diff --git a/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py b/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py
index cc1a3805385eb5097dc65738440bdefa28d5d3e3..bd00f738610627a4b3bc7c61476164188a7b460c 100644
--- a/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py
+++ b/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py
@@ -164,7 +164,7 @@ class ReconstructNonDebugGraphTest(test_util.TensorFlowTestCase):
       self._compareOriginalAndReconstructedGraphDefs(sess, loop)
 
   def testReconstructGraphWithGradients(self):
-    with session.Session() as sess:
+    with session.Session(config=self._no_rewrite_session_config()) as sess:
       u = variables.Variable(12.0, name="u")
       v = variables.Variable(30.0, name="v")
       x = constant_op.constant(1.1, name="x")
diff --git a/tensorflow/python/debug/lib/debug_service_pb2_grpc.py b/tensorflow/python/debug/lib/debug_service_pb2_grpc.py
index 98adc3284b94afc8190f7ee4240d7c5fbf37b4b5..16573eab6f0e61c12020c4becb72369c38f05b42 100755
--- a/tensorflow/python/debug/lib/debug_service_pb2_grpc.py
+++ b/tensorflow/python/debug/lib/debug_service_pb2_grpc.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 import grpc
 
 from tensorflow.core.debug import debug_service_pb2 as tensorflow_dot_core_dot_debug_dot_debug__service__pb2
+from tensorflow.core.protobuf import debug_pb2 as tensorflow_dot_core_dot_protobuf_dot_debug__pb2
 from tensorflow.core.util import event_pb2 as tensorflow_dot_core_dot_util_dot_event__pb2
 
 
@@ -42,6 +43,16 @@ class EventListenerStub(object):
         request_serializer=tensorflow_dot_core_dot_util_dot_event__pb2.Event.SerializeToString,
         response_deserializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.EventReply.FromString,
         )
+    self.SendTracebacks = channel.unary_unary(
+        '/tensorflow.EventListener/SendTracebacks',
+        request_serializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.CallTraceback.SerializeToString,
+        response_deserializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.EventReply.FromString,
+        )
+    self.SendSourceFiles = channel.unary_unary(
+        '/tensorflow.EventListener/SendSourceFiles',
+        request_serializer=tensorflow_dot_core_dot_protobuf_dot_debug__pb2.DebuggedSourceFiles.SerializeToString,
+        response_deserializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.EventReply.FromString,
+        )
 
 
 class EventListenerServicer(object):
@@ -62,6 +73,20 @@ class EventListenerServicer(object):
     context.set_details('Method not implemented!')
     raise NotImplementedError('Method not implemented!')
 
+  def SendTracebacks(self, request, context):
+    """Send the tracebacks of ops in a Python graph definition.
+    """
+    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+    context.set_details('Method not implemented!')
+    raise NotImplementedError('Method not implemented!')
+
+  def SendSourceFiles(self, request, context):
+    """Send a collection of source code files being debugged.
+    """
+    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+    context.set_details('Method not implemented!')
+    raise NotImplementedError('Method not implemented!')
+
 
 def add_EventListenerServicer_to_server(servicer, server):
   rpc_method_handlers = {
@@ -70,6 +95,16 @@ def add_EventListenerServicer_to_server(servicer, server):
           request_deserializer=tensorflow_dot_core_dot_util_dot_event__pb2.Event.FromString,
           response_serializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.EventReply.SerializeToString,
       ),
+      'SendTracebacks': grpc.unary_unary_rpc_method_handler(
+          servicer.SendTracebacks,
+          request_deserializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.CallTraceback.FromString,
+          response_serializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.EventReply.SerializeToString,
+      ),
+      'SendSourceFiles': grpc.unary_unary_rpc_method_handler(
+          servicer.SendSourceFiles,
+          request_deserializer=tensorflow_dot_core_dot_protobuf_dot_debug__pb2.DebuggedSourceFiles.FromString,
+          response_serializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.EventReply.SerializeToString,
+      ),
   }
   generic_handler = grpc.method_handlers_generic_handler(
       'tensorflow.EventListener', rpc_method_handlers)
diff --git a/tensorflow/python/debug/lib/grpc_debug_server.py b/tensorflow/python/debug/lib/grpc_debug_server.py
index 5ab910fb0c9d89bc31a15ecbec48516f07a02979..1b559f1f27538364d8e12339d321e41d33c52590 100644
--- a/tensorflow/python/debug/lib/grpc_debug_server.py
+++ b/tensorflow/python/debug/lib/grpc_debug_server.py
@@ -458,3 +458,36 @@ class EventListenerBaseServicer(debug_service_pb2_grpc.EventListenerServicer):
         `debug_op` as a `str`.
     """
     return list(self._gated_grpc_debug_watches)
+
+  def SendTracebacks(self, request, context):
+    """Base implementation of the handling of SendTracebacks calls.
+
+    The base implementation does nothing with the incoming request.
+    Override in an implementation of the server if necessary.
+
+    Args:
+      request: A `CallTraceback` proto, containing information about the
+        type (e.g., graph vs. eager execution) and source-code traceback of the
+        call and (any) associated `tf.Graph`s.
+      context: Server context.
+
+    Returns:
+      A `EventReply` proto.
+    """
+    return debug_service_pb2.EventReply()
+
+  def SendSourceFiles(self, request, context):
+    """Base implementation of the handling of SendSourceFiles calls.
+
+    The base implementation does nothing with the incoming request.
+    Override in an implementation of the server if necessary.
+
+    Args:
+      request: A `DebuggedSourceFiles` proto, containing the path, content, size
+        and last-modified timestamp of source files.
+      context: Server context.
+
+    Returns:
+      A `EventReply` proto.
+    """
+    return debug_service_pb2.EventReply()
diff --git a/tensorflow/python/debug/lib/grpc_debug_test_server.py b/tensorflow/python/debug/lib/grpc_debug_test_server.py
index 76e45c0bedbb463c872bfca466c6991c9d459e49..917004694845c752d1f6bf88cc2a203eb8f9ba73 100644
--- a/tensorflow/python/debug/lib/grpc_debug_test_server.py
+++ b/tensorflow/python/debug/lib/grpc_debug_test_server.py
@@ -238,6 +238,15 @@ class EventListenerTestServicer(grpc_debug_server.EventListenerBaseServicer):
         self, server_port,
         functools.partial(EventListenerTestStreamHandler, dump_dir, self))
 
+    # Members for storing the graph ops traceback and source files.
+    self._call_types = []
+    self._call_keys = []
+    self._origin_stacks = []
+    self._origin_id_to_strings = []
+    self._graph_tracebacks = []
+    self._graph_versions = []
+    self._source_files = None
+
   def _initialize_toggle_watch_state(self, toggle_watches):
     self._toggle_watches = toggle_watches
     self._toggle_watch_state = dict()
@@ -259,6 +268,100 @@ class EventListenerTestServicer(grpc_debug_server.EventListenerBaseServicer):
     self.core_metadata_json_strings = []
     self.partition_graph_defs = []
     self.debug_tensor_values = collections.defaultdict(list)
+    self._call_types = []
+    self._call_keys = []
+    self._origin_stacks = []
+    self._origin_id_to_strings = []
+    self._graph_tracebacks = []
+    self._graph_versions = []
+    self._source_files = None
+
+  def SendTracebacks(self, request, context):
+    self._call_types.append(request.call_type)
+    self._call_keys.append(request.call_key)
+    self._origin_stacks.append(request.origin_stack)
+    self._origin_id_to_strings.append(request.origin_id_to_string)
+    self._graph_tracebacks.append(request.graph_traceback)
+    self._graph_versions.append(request.graph_version)
+    return debug_service_pb2.EventReply()
+
+  def SendSourceFiles(self, request, context):
+    self._source_files = request
+    return debug_service_pb2.EventReply()
+
+  def query_op_traceback(self, op_name):
+    """Query the traceback of an op.
+
+    Args:
+      op_name: Name of the op to query.
+
+    Returns:
+      The traceback of the op, as a list of 3-tuples:
+        (filename, lineno, function_name)
+
+    Raises:
+      ValueError: If the op cannot be found in the tracebacks received by the
+        server so far.
+    """
+    for op_log_proto in self._graph_tracebacks:
+      for log_entry in op_log_proto.log_entries:
+        if log_entry.name == op_name:
+          return self._code_def_to_traceback(log_entry.code_def,
+                                             op_log_proto.id_to_string)
+    raise ValueError(
+        "Op '%s' does not exist in the tracebacks received by the debug "
+        "server." % op_name)
+
+  def query_origin_stack(self):
+    """Query the stack of the origin of the execution call.
+
+    Returns:
+      A `list` of all tracebacks. Each item corresponds to an execution call,
+        i.e., a `SendTracebacks` request. Each item is a `list` of 3-tuples:
+        (filename, lineno, function_name).
+    """
+    ret = []
+    for stack, id_to_string in zip(
+        self._origin_stacks, self._origin_id_to_strings):
+      ret.append(self._code_def_to_traceback(stack, id_to_string))
+    return ret
+
+  def query_call_types(self):
+    return self._call_types
+
+  def query_call_keys(self):
+    return self._call_keys
+
+  def query_graph_versions(self):
+    return self._graph_versions
+
+  def query_source_file_line(self, file_path, lineno):
+    """Query the content of a given line in a source file.
+
+    Args:
+      file_path: Path to the source file.
+      lineno: Line number as an `int`.
+
+    Returns:
+      Content of the line as a string.
+
+    Raises:
+      ValueError: If no source file is found at the given file_path.
+    """
+    if not self._source_files:
+      raise ValueError(
+          "This debug server has not received any source file contents yet.")
+    for source_file_proto in self._source_files.source_files:
+      if source_file_proto.file_path == file_path:
+        return source_file_proto.lines[lineno - 1]
+    raise ValueError(
+        "Source file at path %s has not been received by the debug server",
+        file_path)
+
+  def _code_def_to_traceback(self, code_def, id_to_string):
+    return [(id_to_string[trace.file_id],
+             trace.lineno,
+             id_to_string[trace.function_id]) for trace in code_def.traces]
 
 
 def start_server_on_separate_thread(dump_to_filesystem=True,
diff --git a/tensorflow/python/debug/lib/session_debug_grpc_test.py b/tensorflow/python/debug/lib/session_debug_grpc_test.py
index e1ddd4ee642f2a11cf4bb65b1d60b8f731b9c8f6..b623ee31c5dc59894373ec7952e53acd0f6e1126 100644
--- a/tensorflow/python/debug/lib/session_debug_grpc_test.py
+++ b/tensorflow/python/debug/lib/session_debug_grpc_test.py
@@ -54,7 +54,8 @@ from tensorflow.python.training import monitored_session
 def no_rewrite_session_config():
   rewriter_config = rewriter_config_pb2.RewriterConfig(
       disable_model_pruning=True,
-      arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF)
+      arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
+      dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF)
   graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
   return config_pb2.ConfigProto(graph_options=graph_options)
 
@@ -248,10 +249,79 @@ class SessionDebugGrpcTest(session_debug_testlib.SessionDebugTestBase):
     self.assertEqual(
         14, len(dump.get_tensors("v/read", 0, "DebugNumericSummary")[0]))
 
-  def testConstructGrpcDebugHookWithGrpcInUrlRaisesValueError(self):
-    """Tests that the hook raises an error if the URL starts with grpc://."""
+  def testTensorBoardDebugHookWorks(self):
+    u = variables.Variable(2.1, name="u")
+    v = variables.Variable(20.0, name="v")
+    w = math_ops.multiply(u, v, name="w")
+
+    sess = session.Session(config=no_rewrite_session_config())
+    sess.run(u.initializer)
+    sess.run(v.initializer)
+
+    grpc_debug_hook = hooks.TensorBoardDebugHook(
+        ["localhost:%d" % self._server_port])
+    sess = monitored_session._HookedSession(sess, [grpc_debug_hook])
+
+    # Activate watch point on a tensor before calling sess.run().
+    self._server.request_watch("u/read", 0, "DebugIdentity")
+    self.assertAllClose(42.0, sess.run(w))
+
+    # self.assertAllClose(42.0, sess.run(w))
+    dump = debug_data.DebugDumpDir(self._dump_root)
+    self.assertAllClose([2.1], dump.get_tensors("u/read", 0, "DebugIdentity"))
+
+    # Check that the server has received the stack trace.
+    self.assertTrue(self._server.query_op_traceback("u"))
+    self.assertTrue(self._server.query_op_traceback("u/read"))
+    self.assertTrue(self._server.query_op_traceback("v"))
+    self.assertTrue(self._server.query_op_traceback("v/read"))
+    self.assertTrue(self._server.query_op_traceback("w"))
+
+    # Check that the server has received the python file content.
+    # Query an arbitrary line to make sure that is the case.
+    with open(__file__, "rt") as this_source_file:
+      first_line = this_source_file.readline().strip()
+      self.assertEqual(
+          first_line, self._server.query_source_file_line(__file__, 1))
+
+    self._server.clear_data()
+    # Call sess.run() again, and verify that this time the traceback and source
+    # code is not sent, because the graph version is not newer.
+    self.assertAllClose(42.0, sess.run(w))
+    with self.assertRaises(ValueError):
+      self._server.query_op_traceback("delta_1")
     with self.assertRaises(ValueError):
-      hooks.GrpcDebugHook(["grpc://foo:42"])
+      self._server.query_source_file_line(__file__, 1)
+
+  def testTensorBoardDebugHookDisablingTracebackSourceCodeSendingWorks(self):
+    u = variables.Variable(2.1, name="u")
+    v = variables.Variable(20.0, name="v")
+    w = math_ops.multiply(u, v, name="w")
+
+    sess = session.Session(config=no_rewrite_session_config())
+    sess.run(variables.global_variables_initializer())
+
+    grpc_debug_hook = hooks.TensorBoardDebugHook(
+        ["localhost:%d" % self._server_port],
+        send_traceback_and_source_code=False)
+    sess = monitored_session._HookedSession(sess, [grpc_debug_hook])
+
+    # Activate watch point on a tensor before calling sess.run().
+    self._server.request_watch("u/read", 0, "DebugIdentity")
+    self.assertAllClose(42.0, sess.run(w))
+
+    # Check that the server has _not_ received any tracebacks, as a result of
+    # the disabling above.
+    with self.assertRaisesRegexp(
+        ValueError, r"Op .*u/read.* does not exist"):
+      self.assertTrue(self._server.query_op_traceback("u/read"))
+    with self.assertRaisesRegexp(
+        ValueError, r".* has not received any source file"):
+      self._server.query_source_file_line(__file__, 1)
+
+  def testConstructGrpcDebugHookWithOrWithouGrpcInUrlWorks(self):
+    hooks.GrpcDebugHook(["grpc://foo:42424"])
+    hooks.GrpcDebugHook(["foo:42424"])
 
 
 class LargeGraphAndLargeTensorsDebugTest(test_util.TensorFlowTestCase):
@@ -684,6 +754,112 @@ class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
           # to disable the breakpoint at delta:0:DebugIdentity.
           self.assertSetEqual(set(), self._server_1.breakpoints)
 
+  def testTensorBoardDebuggerWrapperToggleBreakpointsWorks(self):
+    with session.Session(config=no_rewrite_session_config()) as sess:
+      v_1 = variables.Variable(50.0, name="v_1")
+      v_2 = variables.Variable(-50.0, name="v_2")
+      delta_1 = constant_op.constant(5.0, name="delta_1")
+      delta_2 = constant_op.constant(-5.0, name="delta_2")
+      inc_v_1 = state_ops.assign_add(v_1, delta_1, name="inc_v_1")
+      inc_v_2 = state_ops.assign_add(v_2, delta_2, name="inc_v_2")
+
+      sess.run([v_1.initializer, v_2.initializer])
+
+      # The TensorBoardDebugWrapperSession should add a DebugIdentity debug op
+      # with attribute gated_grpc=True for every tensor in the graph.
+      sess = grpc_wrapper.TensorBoardDebugWrapperSession(
+          sess, self._debug_server_url_1)
+
+      for i in xrange(4):
+        self._server_1.clear_data()
+
+        if i in (0, 2):
+          # Enable breakpoint at delta_[1,2]:0:DebugIdentity in runs 0 and 2.
+          self._server_1.request_watch(
+              "delta_1", 0, "DebugIdentity", breakpoint=True)
+          self._server_1.request_watch(
+              "delta_2", 0, "DebugIdentity", breakpoint=True)
+        else:
+          # Disable the breakpoint in runs 1 and 3.
+          self._server_1.request_unwatch("delta_1", 0, "DebugIdentity")
+          self._server_1.request_unwatch("delta_2", 0, "DebugIdentity")
+
+        output = sess.run([inc_v_1, inc_v_2])
+        self.assertAllClose([50.0 + 5.0 * (i + 1), -50 - 5.0 * (i + 1)], output)
+
+        if i in (0, 2):
+          # During runs 0 and 2, the server should have received the published
+          # debug tensor delta:0:DebugIdentity. The breakpoint should have been
+          # unblocked by EventReply reponses from the server.
+          self.assertAllClose(
+              [5.0],
+              self._server_1.debug_tensor_values["delta_1:0:DebugIdentity"])
+          self.assertAllClose(
+              [-5.0],
+              self._server_1.debug_tensor_values["delta_2:0:DebugIdentity"])
+          # After the runs, the server should have properly registered the
+          # breakpoints.
+        else:
+          # After the end of runs 1 and 3, the server has received the requests
+          # to disable the breakpoint at delta:0:DebugIdentity.
+          self.assertSetEqual(set(), self._server_1.breakpoints)
+
+        if i == 0:
+          # Check that the server has received the stack trace.
+          self.assertTrue(self._server_1.query_op_traceback("delta_1"))
+          self.assertTrue(self._server_1.query_op_traceback("delta_2"))
+          self.assertTrue(self._server_1.query_op_traceback("inc_v_1"))
+          self.assertTrue(self._server_1.query_op_traceback("inc_v_2"))
+          # Check that the server has received the python file content.
+          # Query an arbitrary line to make sure that is the case.
+          with open(__file__, "rt") as this_source_file:
+            first_line = this_source_file.readline().strip()
+          self.assertEqual(
+              first_line, self._server_1.query_source_file_line(__file__, 1))
+        else:
+          # In later Session.run() calls, the traceback shouldn't have been sent
+          # because it is already sent in the 1st call. So calling
+          # query_op_traceback() should lead to an exception, because the test
+          # debug server clears the data at the beginning of every iteration.
+          with self.assertRaises(ValueError):
+            self._server_1.query_op_traceback("delta_1")
+          with self.assertRaises(ValueError):
+            self._server_1.query_source_file_line(__file__, 1)
+
+  def testTensorBoardDebuggerWrapperDisablingTracebackSourceSendingWorks(self):
+    with session.Session(config=no_rewrite_session_config()) as sess:
+      v_1 = variables.Variable(50.0, name="v_1")
+      v_2 = variables.Variable(-50.0, name="v_2")
+      delta_1 = constant_op.constant(5.0, name="delta_1")
+      delta_2 = constant_op.constant(-5.0, name="delta_2")
+      inc_v_1 = state_ops.assign_add(v_1, delta_1, name="inc_v_1")
+      inc_v_2 = state_ops.assign_add(v_2, delta_2, name="inc_v_2")
+
+      sess.run(variables.global_variables_initializer())
+
+      # Disable the sending of traceback and source code.
+      sess = grpc_wrapper.TensorBoardDebugWrapperSession(
+          sess, self._debug_server_url_1, send_traceback_and_source_code=False)
+
+      for i in xrange(4):
+        self._server_1.clear_data()
+
+        if i == 0:
+          self._server_1.request_watch(
+              "delta_1", 0, "DebugIdentity", breakpoint=True)
+
+        output = sess.run([inc_v_1, inc_v_2])
+        self.assertAllClose([50.0 + 5.0 * (i + 1), -50 - 5.0 * (i + 1)], output)
+
+        # No op traceback or source code should have been received by the debug
+        # server due to the disabling above.
+        with self.assertRaisesRegexp(
+            ValueError, r"Op .*delta_1.* does not exist"):
+          self.assertTrue(self._server_1.query_op_traceback("delta_1"))
+        with self.assertRaisesRegexp(
+            ValueError, r".* has not received any source file"):
+          self._server_1.query_source_file_line(__file__, 1)
+
   def testGetGrpcDebugWatchesReturnsCorrectAnswer(self):
     with session.Session() as sess:
       v = variables.Variable(50.0, name="v")
diff --git a/tensorflow/python/debug/lib/session_debug_testlib.py b/tensorflow/python/debug/lib/session_debug_testlib.py
index 20a40018bf9c67c5b743963489c8fc5616efa2db..f4fac1401918ccacd38aae5ad2ef8d686c9204b9 100644
--- a/tensorflow/python/debug/lib/session_debug_testlib.py
+++ b/tensorflow/python/debug/lib/session_debug_testlib.py
@@ -988,7 +988,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
   def testWatchingVariableUpdateOpsSeesUpdatedValues(self):
     """Watch output slots on Variable-updating ops, with no emitted edges."""
 
-    with session.Session() as sess:
+    with session.Session(config=no_rewrite_session_config()) as sess:
       u_init = constant_op.constant(10.0)
       u = variables.Variable(u_init, name="gdo/u")
       v_init = constant_op.constant(20.0)
diff --git a/tensorflow/python/debug/lib/source_remote.py b/tensorflow/python/debug/lib/source_remote.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b6b2b995ecd13cffddaa38bd2ec673e6b824574
--- /dev/null
+++ b/tensorflow/python/debug/lib/source_remote.py
@@ -0,0 +1,209 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Communicating tracebacks and source code with debug server."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import socket
+
+import grpc
+
+from tensorflow.core.debug import debug_service_pb2
+from tensorflow.core.protobuf import debug_pb2
+from tensorflow.python.debug.lib import common
+from tensorflow.python.debug.lib import debug_service_pb2_grpc
+from tensorflow.python.debug.lib import source_utils
+from tensorflow.python.platform import gfile
+from tensorflow.python.profiler import tfprof_logger
+
+
+def _load_debugged_source_file(file_path, source_file_proto):
+  file_stat = gfile.Stat(file_path)
+  source_file_proto.host = socket.gethostname()
+  source_file_proto.file_path = file_path
+  source_file_proto.last_modified = file_stat.mtime_nsec
+  source_file_proto.bytes = file_stat.length
+  try:
+    with gfile.Open(file_path, "r") as f:
+      source_file_proto.lines.extend(f.read().splitlines())
+  except IOError:
+    pass
+
+
+def _string_to_id(string, string_to_id):
+  if string not in string_to_id:
+    string_to_id[string] = len(string_to_id)
+  return string_to_id[string]
+
+
+def _format_origin_stack(origin_stack, call_traceback_proto):
+  """Format a traceback stack for a `CallTraceback` proto.
+
+  Args:
+    origin_stack: The stack list as returned by `traceback.extract_stack()`.
+    call_traceback_proto: A `CallTraceback` proto whose fields are to be
+      populated.
+  """
+  string_to_id = dict()
+  string_to_id[None] = 0
+  for frame in origin_stack:
+    file_path, lineno, func_name, line_text = frame
+    call_traceback_proto.origin_stack.traces.add(
+        file_id=_string_to_id(file_path, string_to_id),
+        lineno=lineno,
+        function_id=_string_to_id(func_name, string_to_id),
+        line_id=_string_to_id(line_text, string_to_id))
+
+  id_to_string = call_traceback_proto.origin_id_to_string
+  for key, value in string_to_id.items():
+    id_to_string[value] = key if key is not None else ""
+
+
+def _source_file_paths_outside_tensorflow_py_library(code_defs, id_to_string):
+  """Extract source file paths outside TensorFlow Python library.
+
+  Args:
+    code_defs: An iterable of `CodeDef` protos, i.e., an iterable of stack
+      traces.
+    id_to_string: A proto map from integer ids to strings.
+
+  Returns:
+    An iterable of source file paths outside the TensorFlow Python library.
+  """
+  file_ids = set()
+  for code_def in code_defs:
+    for trace in code_def.traces:
+      file_ids.add(trace.file_id)
+  non_tf_files = (id_to_string[file_id] for file_id in file_ids)
+  non_tf_files = (
+      f for f in non_tf_files
+      if not source_utils.guess_is_tensorflow_py_library(f) and gfile.Exists(f))
+  return non_tf_files
+
+
+def _send_call_tracebacks(destinations,
+                          origin_stack,
+                          is_eager_execution=False,
+                          call_key=None,
+                          graph=None,
+                          send_source=True):
+  """Send the tracebacks of a TensorFlow execution call.
+
+  To gRPC debug server(s). This applies to graph execution (`tf.Session.run()`)
+  calls and eager execution calls.
+
+  If `send_source`, also sends the underlying source files outside the
+  TensorFlow library.
+
+  Args:
+    destinations: gRPC destination addresses, a `str` or a `list` of `str`s,
+      e.g., "localhost:4242". If a `list`, gRPC requests containing the same
+      `CallTraceback` proto payload will be sent to all the destinations.
+    origin_stack: The traceback stack for the origin of the execution call. For
+      graph execution, this is the traceback of the `tf.Session.run()`
+      invocation. For eager execution, this is the traceback of the Python
+      line that executes the eager opertion.
+    is_eager_execution: (`bool`) whether an eager execution call (i.e., not a
+      `tf.Session.run` or derived methods) is being sent.
+    call_key: The key of the execution call, as a string. For graph execution,
+      this is a string describing the feeds, fetches (and targets) names of the
+      `tf.Session.run` call. For eager execution, this is ignored.
+    graph: A Python `tf.Graph` object (i.e., *not* a `tf.GraphDef`), which
+      contains op tracebacks, if applicable.
+    send_source: Whether the source files involved in the op tracebacks but
+      outside the TensorFlow library are to be sent.
+  """
+  if not isinstance(destinations, list):
+    destinations = [destinations]
+  # Strip grpc:// prefix, if any is present.
+  destinations = [
+      dest[len(common.GRPC_URL_PREFIX):]
+      if dest.startswith(common.GRPC_URL_PREFIX) else dest
+      for dest in destinations]
+
+  call_type = (debug_service_pb2.CallTraceback.EAGER_EXECUTION
+               if is_eager_execution
+               else debug_service_pb2.CallTraceback.GRAPH_EXECUTION)
+  graph_traceback = tfprof_logger.merge_default_with_oplog(
+      graph, add_trainable_var=False) if graph else None
+  call_traceback = debug_service_pb2.CallTraceback(
+      call_type=call_type, call_key=call_key, graph_traceback=graph_traceback,
+      graph_version=graph.version if graph else None)
+
+  _format_origin_stack(origin_stack, call_traceback)
+
+  if send_source:
+    source_file_paths = set()
+    source_file_paths.update(_source_file_paths_outside_tensorflow_py_library(
+        (log_entry.code_def for log_entry
+         in call_traceback.graph_traceback.log_entries),
+        call_traceback.graph_traceback.id_to_string))
+    source_file_paths.update(_source_file_paths_outside_tensorflow_py_library(
+        [call_traceback.origin_stack], call_traceback.origin_id_to_string))
+
+    debugged_source_files = debug_pb2.DebuggedSourceFiles()
+    for file_path in source_file_paths:
+      _load_debugged_source_file(
+          file_path, debugged_source_files.source_files.add())
+
+  for destination in destinations:
+    channel = grpc.insecure_channel(destination)
+    stub = debug_service_pb2_grpc.EventListenerStub(channel)
+    stub.SendTracebacks(call_traceback)
+    if send_source:
+      stub.SendSourceFiles(debugged_source_files)
+
+
+def send_graph_tracebacks(destinations,
+                          run_key,
+                          origin_stack,
+                          graph,
+                          send_source=True):
+  """Send the tracebacks of a graph execution call to debug server(s).
+
+  Args:
+    destinations: gRPC destination addresses, a `str` or a `list` of `str`s,
+      e.g., "localhost:4242". If a `list`, gRPC requests containing the same
+      `CallTraceback` proto payload will be sent to all the destinations.
+    run_key: A string describing the feeds, fetches (and targets) names of the
+      `tf.Session.run` call.
+    origin_stack: The traceback of the `tf.Session.run()` invocation.
+    graph: A Python `tf.Graph` object (i.e., *not* a `tf.GraphDef`), which
+      contains op tracebacks.
+    send_source: Whether the source files involved in the op tracebacks but
+      outside the TensorFlow library are to be sent.
+  """
+  _send_call_tracebacks(
+      destinations, origin_stack, is_eager_execution=False, call_key=run_key,
+      graph=graph, send_source=send_source)
+
+
+def send_eager_tracebacks(destinations,
+                          origin_stack,
+                          send_source=True):
+  """Send the tracebacks of an eager execution call to debug server(s).
+
+  Args:
+    destinations: gRPC destination addresses, a `str` or a `list` of `str`s,
+      e.g., "localhost:4242". If a `list`, gRPC requests containing the same
+    origin_stack: The traceback of the eager operation invocation.
+    send_source: Whether the source files involved in the op tracebacks but
+      outside the TensorFlow library are to be sent.
+  """
+  _send_call_tracebacks(
+      destinations, origin_stack, is_eager_execution=True,
+      send_source=send_source)
diff --git a/tensorflow/python/debug/lib/source_remote_test.py b/tensorflow/python/debug/lib/source_remote_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..27bafa45e1207513e46fd2ae0f92d5bfa686ffd5
--- /dev/null
+++ b/tensorflow/python/debug/lib/source_remote_test.py
@@ -0,0 +1,171 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit tests for source_remote."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import traceback
+
+from tensorflow.core.debug import debug_service_pb2
+from tensorflow.python.client import session
+from tensorflow.python.debug.lib import grpc_debug_test_server
+from tensorflow.python.debug.lib import source_remote
+from tensorflow.python.debug.lib import source_utils
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
+# Import resource_variable_ops for the variables-to-tensor implicit conversion.
+from tensorflow.python.ops import resource_variable_ops  # pylint: disable=unused-import
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+from tensorflow.python.util import tf_inspect
+
+
+def line_number_above():
+  return tf_inspect.stack()[1][2] - 1
+
+
+class SendTracebacksTest(test_util.TensorFlowTestCase):
+
+  @classmethod
+  def setUpClass(cls):
+    test_util.TensorFlowTestCase.setUpClass()
+    (cls._server_port, cls._debug_server_url, cls._server_dump_dir,
+     cls._server_thread,
+     cls._server) = grpc_debug_test_server.start_server_on_separate_thread()
+    cls._server_address = "localhost:%d" % cls._server_port
+    (cls._server_port_2, cls._debug_server_url_2, cls._server_dump_dir_2,
+     cls._server_thread_2,
+     cls._server_2) = grpc_debug_test_server.start_server_on_separate_thread()
+    cls._server_address_2 = "localhost:%d" % cls._server_port_2
+    cls._curr_file_path = os.path.normpath(os.path.abspath(__file__))
+
+  @classmethod
+  def tearDownClass(cls):
+    # Stop the test server and join the thread.
+    cls._server.stop_server().wait()
+    cls._server_thread.join()
+    cls._server_2.stop_server().wait()
+    cls._server_thread_2.join()
+    test_util.TensorFlowTestCase.tearDownClass()
+
+  def tearDown(self):
+    ops.reset_default_graph()
+    self._server.clear_data()
+    self._server_2.clear_data()
+    super(SendTracebacksTest, self).tearDown()
+
+  def _findFirstTraceInsideTensorFlowPyLibrary(self, op):
+    """Find the first trace of an op that belongs to the TF Python library."""
+    for trace in op.traceback:
+      if source_utils.guess_is_tensorflow_py_library(trace[0]):
+        return trace
+
+  def testSendGraphTracebacksToSingleDebugServer(self):
+    this_func_name = "testSendGraphTracebacksToSingleDebugServer"
+    with session.Session() as sess:
+      a = variables.Variable(21.0, name="a")
+      a_lineno = line_number_above()
+      b = variables.Variable(2.0, name="b")
+      b_lineno = line_number_above()
+      math_ops.add(a, b, name="x")
+      x_lineno = line_number_above()
+
+      send_stack = traceback.extract_stack()
+      send_lineno = line_number_above()
+      source_remote.send_graph_tracebacks(
+          self._server_address, "dummy_run_key", send_stack, sess.graph)
+
+      tb = self._server.query_op_traceback("a")
+      self.assertIn((self._curr_file_path, a_lineno, this_func_name), tb)
+      tb = self._server.query_op_traceback("b")
+      self.assertIn((self._curr_file_path, b_lineno, this_func_name), tb)
+      tb = self._server.query_op_traceback("x")
+      self.assertIn((self._curr_file_path, x_lineno, this_func_name), tb)
+
+      self.assertIn(
+          (self._curr_file_path, send_lineno, this_func_name),
+          self._server.query_origin_stack()[-1])
+
+      self.assertEqual(
+          "      a = variables.Variable(21.0, name=\"a\")",
+          self._server.query_source_file_line(__file__, a_lineno))
+      # Files in the TensorFlow code base shouldn not have been sent.
+      tf_trace_file_path = self._findFirstTraceInsideTensorFlowPyLibrary(a.op)
+      with self.assertRaises(ValueError):
+        self._server.query_source_file_line(tf_trace_file_path, 0)
+      self.assertEqual([debug_service_pb2.CallTraceback.GRAPH_EXECUTION],
+                       self._server.query_call_types())
+      self.assertEqual(["dummy_run_key"], self._server.query_call_keys())
+      self.assertEqual(
+          [sess.graph.version], self._server.query_graph_versions())
+
+  def testSendGraphTracebacksToTwoDebugServers(self):
+    this_func_name = "testSendGraphTracebacksToTwoDebugServers"
+    with session.Session() as sess:
+      a = variables.Variable(21.0, name="two/a")
+      a_lineno = line_number_above()
+      b = variables.Variable(2.0, name="two/b")
+      b_lineno = line_number_above()
+      x = math_ops.add(a, b, name="two/x")
+      x_lineno = line_number_above()
+
+      send_traceback = traceback.extract_stack()
+      send_lineno = line_number_above()
+      source_remote.send_graph_tracebacks(
+          [self._server_address, self._server_address_2],
+          "dummy_run_key", send_traceback, sess.graph)
+
+      servers = [self._server, self._server_2]
+      for server in servers:
+        tb = server.query_op_traceback("two/a")
+        self.assertIn((self._curr_file_path, a_lineno, this_func_name), tb)
+        tb = server.query_op_traceback("two/b")
+        self.assertIn((self._curr_file_path, b_lineno, this_func_name), tb)
+        tb = server.query_op_traceback("two/x")
+        self.assertIn((self._curr_file_path, x_lineno, this_func_name), tb)
+
+        self.assertIn(
+            (self._curr_file_path, send_lineno, this_func_name),
+            server.query_origin_stack()[-1])
+
+        self.assertEqual(
+            "      x = math_ops.add(a, b, name=\"two/x\")",
+            server.query_source_file_line(__file__, x_lineno))
+        tf_trace_file_path = self._findFirstTraceInsideTensorFlowPyLibrary(x.op)
+        with self.assertRaises(ValueError):
+          server.query_source_file_line(tf_trace_file_path, 0)
+        self.assertEqual([debug_service_pb2.CallTraceback.GRAPH_EXECUTION],
+                         server.query_call_types())
+        self.assertEqual(["dummy_run_key"], server.query_call_keys())
+        self.assertEqual([sess.graph.version], server.query_graph_versions())
+
+  def testSendEagerTracebacksToSingleDebugServer(self):
+    this_func_name = "testSendEagerTracebacksToSingleDebugServer"
+    send_traceback = traceback.extract_stack()
+    send_lineno = line_number_above()
+    source_remote.send_eager_tracebacks(self._server_address, send_traceback)
+
+    self.assertEqual([debug_service_pb2.CallTraceback.EAGER_EXECUTION],
+                     self._server.query_call_types())
+    self.assertIn((self._curr_file_path, send_lineno, this_func_name),
+                  self._server.query_origin_stack()[-1])
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/python/debug/lib/stepper.py b/tensorflow/python/debug/lib/stepper.py
index 1fa0b3dba2b547bf1d311e42e1005a8e501f9829..c27b3f51cddb51654b1ff5a35fd7d689fc4109c4 100644
--- a/tensorflow/python/debug/lib/stepper.py
+++ b/tensorflow/python/debug/lib/stepper.py
@@ -80,7 +80,7 @@ class NodeStepper(object):
   when they are required as data dependencies.
 
   The temporary directories are automatically clean when the NodeStepper
-  instance exits as a context mananger.
+  instance exits as a context manager.
 
   Once the tracing is complete, it will issue a run() call on the
   underlying session, using the aforementioned feed_dict prepared by the input
diff --git a/tensorflow/python/debug/wrappers/dumping_wrapper.py b/tensorflow/python/debug/wrappers/dumping_wrapper.py
index 962318e54a479069d58e06c09c141b097fd15782..3fac2e59717a828424a808b770812afc7772bfe2 100644
--- a/tensorflow/python/debug/wrappers/dumping_wrapper.py
+++ b/tensorflow/python/debug/wrappers/dumping_wrapper.py
@@ -73,6 +73,7 @@ class DumpingDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
         self, sess, watch_fn=watch_fn, thread_name_filter=thread_name_filter,
         pass_through_operrors=pass_through_operrors)
 
+    session_root = os.path.expanduser(session_root)
     if gfile.Exists(session_root):
       if not gfile.IsDirectory(session_root):
         raise ValueError(
diff --git a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
index acea9433e22203d56f4ceb6cd92b681e35876a09..254201c39371e2034b08fad927e98418c8086ea5 100644
--- a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
@@ -389,6 +389,11 @@ class DumpingDebugWrapperSessionTest(test_util.TensorFlowTestCase):
         r"mode\."):
       sess.invoke_node_stepper(node_stepper)
 
+  def testDumpingWrapperWithEmptyFetchWorks(self):
+    sess = dumping_wrapper.DumpingDebugWrapperSession(
+        self.sess, session_root=self.session_root, log_usage=False)
+    sess.run([])
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/debug/wrappers/framework.py b/tensorflow/python/debug/wrappers/framework.py
index 4e243cb6c9649a24009a0c9ac501c59eaac3bd79..c530204bbf6959f56a72c6e67add91f1e575f067 100644
--- a/tensorflow/python/debug/wrappers/framework.py
+++ b/tensorflow/python/debug/wrappers/framework.py
@@ -121,7 +121,9 @@ from tensorflow.python.debug.lib import debug_utils
 from tensorflow.python.debug.lib import stepper
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.platform import tf_logging
 from tensorflow.python.training import monitored_session
+from tensorflow.python.util import nest
 
 
 # Helper function.
@@ -439,7 +441,12 @@ class BaseDebugWrapperSession(session.SessionInterface):
             "callable_runner and fetches/feed_dict are mutually exclusive, but "
             "are used simultaneously.")
 
-    if self._is_disabled_thread():
+    empty_fetches = not nest.flatten(fetches)
+    if empty_fetches:
+      tf_logging.info(
+          "Due to empty fetches, tfdbg Session wrapper is letting a "
+          "Session.run pass through without any debugging actions.")
+    if self._is_disabled_thread() or empty_fetches:
       if callable_runner:
         return callable_runner(*callable_runner_args)
       else:
@@ -706,7 +713,8 @@ class BaseDebugWrapperSession(session.SessionInterface):
         exec_type, exec_value, exec_tb)
 
   def __del__(self):
-    self._sess.__del__()
+    if hasattr(self._sess, "__del__"):
+      self._sess.__del__()
 
   def close(self):
     self._sess.close()
diff --git a/tensorflow/python/debug/wrappers/grpc_wrapper.py b/tensorflow/python/debug/wrappers/grpc_wrapper.py
index 4062016607c8a56eb275fe4712a47c84bc7ed01c..fb9494f57636e46e54ef230cf4803dbb6ccad0c7 100644
--- a/tensorflow/python/debug/wrappers/grpc_wrapper.py
+++ b/tensorflow/python/debug/wrappers/grpc_wrapper.py
@@ -17,15 +17,56 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import signal
+import sys
+import traceback
+
 # Google-internal import(s).
+from tensorflow.python.debug.lib import common
 from tensorflow.python.debug.wrappers import framework
 
 
+def publish_traceback(debug_server_urls,
+                      graph,
+                      feed_dict,
+                      fetches,
+                      old_graph_version):
+  """Publish traceback and source code if graph version is new.
+
+  `graph.version` is compared with `old_graph_version`. If the former is higher
+  (i.e., newer), the graph traceback and the associated source code is sent to
+  the debug server at the specified gRPC URLs.
+
+  Args:
+    debug_server_urls: A single gRPC debug server URL as a `str` or a `list` of
+      debug server URLs.
+    graph: A Python `tf.Graph` object.
+    feed_dict: Feed dictionary given to the `Session.run()` call.
+    fetches: Fetches from the `Session.run()` call.
+    old_graph_version: Old graph version to compare to.
+
+  Returns:
+    If `graph.version > old_graph_version`, the new graph version as an `int`.
+    Else, the `old_graph_version` is returned.
+  """
+  # TODO(cais): Consider moving this back to the top, after grpc becomes a
+  # pip dependency of tensorflow or tf_debug.
+  # pylint:disable=g-import-not-at-top
+  from tensorflow.python.debug.lib import source_remote
+  # pylint:enable=g-import-not-at-top
+  if graph.version > old_graph_version:
+    run_key = common.get_run_key(feed_dict, fetches)
+    source_remote.send_graph_tracebacks(
+        debug_server_urls, run_key, traceback.extract_stack(), graph,
+        send_source=True)
+    return graph.version
+  else:
+    return old_graph_version
+
+
 class GrpcDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
   """Debug Session wrapper that send debug data to gRPC stream(s)."""
 
-  _GRPC_URL_PREFIX = "grpc://"
-
   def __init__(self,
                sess,
                grpc_debug_server_addresses,
@@ -38,7 +79,7 @@ class GrpcDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
       sess: The TensorFlow `Session` object being wrapped.
       grpc_debug_server_addresses: (`str` or `list` of `str`) Single or a list
         of the gRPC debug server addresses, in the format of
-        <host:port>, without the "grpc://" prefix. For example:
+        <host:port>, with or without the "grpc://" prefix. For example:
           "localhost:7000",
           ["localhost:7000", "192.168.0.2:8000"]
       watch_fn: (`Callable`) A Callable that can be used to define per-run
@@ -62,8 +103,7 @@ class GrpcDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
 
     if isinstance(grpc_debug_server_addresses, str):
       self._grpc_debug_server_urls = [
-          self._GRPC_URL_PREFIX + grpc_debug_server_addresses
-      ]
+          self._normalize_grpc_url(grpc_debug_server_addresses)]
     elif isinstance(grpc_debug_server_addresses, list):
       self._grpc_debug_server_urls = []
       for address in grpc_debug_server_addresses:
@@ -71,7 +111,7 @@ class GrpcDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
           raise TypeError(
               "Expected type str in list grpc_debug_server_addresses, "
               "received type %s" % type(address))
-        self._grpc_debug_server_urls.append(self._GRPC_URL_PREFIX + address)
+        self._grpc_debug_server_urls.append(self._normalize_grpc_url(address))
     else:
       raise TypeError(
           "Expected type str or list in grpc_debug_server_addresses, "
@@ -93,3 +133,100 @@ class GrpcDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
     """
 
     return self._grpc_debug_server_urls
+
+  def _normalize_grpc_url(self, address):
+    return (common.GRPC_URL_PREFIX + address
+            if not address.startswith(common.GRPC_URL_PREFIX) else address)
+
+
+def _signal_handler(unused_signal, unused_frame):
+  try:
+    input_func = raw_input
+  except NameError:
+    # Python 3 does not have raw_input.
+    input_func = input
+
+  while True:
+    response = input_func("\nSIGINT received. Quit program? (Y/n): ").strip()
+    if response in ("", "Y", "y"):
+      sys.exit(0)
+    elif response in ("N", "n"):
+      break
+
+
+def register_signal_handler():
+  try:
+    signal.signal(signal.SIGINT, _signal_handler)
+  except ValueError:
+    # This can happen if we are not in the MainThread.
+    pass
+
+
+class TensorBoardDebugWrapperSession(GrpcDebugWrapperSession):
+  """A tfdbg Session wrapper that can be used with TensorBoard Debugger Plugin.
+
+  This wrapper is the same as `GrpcDebugWrapperSession`, except that it uses a
+    predefined `watch_fn` that
+    1) uses `DebugIdentity` debug ops with the `gated_grpc` attribute set to
+        `True` to allow the interactive enabling and disabling of tensor
+       breakpoints.
+    2) watches all tensors in the graph.
+  This saves the need for the user to define a `watch_fn`.
+  """
+
+  def __init__(self,
+               sess,
+               grpc_debug_server_addresses,
+               thread_name_filter=None,
+               send_traceback_and_source_code=True,
+               log_usage=True):
+    """Constructor of TensorBoardDebugWrapperSession.
+
+    Args:
+      sess: The `tf.Session` instance to be wrapped.
+      grpc_debug_server_addresses: gRPC address(es) of debug server(s), as a
+        `str` or a `list` of `str`s. E.g., "localhost:2333",
+        "grpc://localhost:2333", ["192.168.0.7:2333", "192.168.0.8:2333"].
+      thread_name_filter: Optional filter for thread names.
+      send_traceback_and_source_code: Whether traceback of graph elements and
+        the source code are to be sent to the debug server(s).
+      log_usage: Whether the usage of this class is to be logged (if
+        applicable).
+    """
+    def _gated_grpc_watch_fn(fetches, feeds):
+      del fetches, feeds  # Unused.
+      return framework.WatchOptions(
+          debug_ops=["DebugIdentity(gated_grpc=true)"])
+
+    super(TensorBoardDebugWrapperSession, self).__init__(
+        sess,
+        grpc_debug_server_addresses,
+        watch_fn=_gated_grpc_watch_fn,
+        thread_name_filter=thread_name_filter,
+        log_usage=log_usage)
+
+    self._send_traceback_and_source_code = send_traceback_and_source_code
+    # Keeps track of the latest version of Python graph object that has been
+    # sent to the debug servers.
+    self._sent_graph_version = -1
+
+    register_signal_handler()
+
+  def run(self,
+          fetches,
+          feed_dict=None,
+          options=None,
+          run_metadata=None,
+          callable_runner=None,
+          callable_runner_args=None):
+    if self._send_traceback_and_source_code:
+      self._sent_graph_version = publish_traceback(
+          self._grpc_debug_server_urls, self.graph, feed_dict, fetches,
+          self._sent_graph_version)
+    return super(TensorBoardDebugWrapperSession, self).run(
+        fetches,
+        feed_dict=feed_dict,
+        options=options,
+        run_metadata=run_metadata,
+        callable_runner=callable_runner,
+        callable_runner_args=callable_runner_args)
diff --git a/tensorflow/python/debug/wrappers/hooks.py b/tensorflow/python/debug/wrappers/hooks.py
index 4efa97973eb893a0105ca6abce6d306c1f6867d8..6705cd31e291d2eab7aa8179e9b2b829f8970c18 100644
--- a/tensorflow/python/debug/wrappers/hooks.py
+++ b/tensorflow/python/debug/wrappers/hooks.py
@@ -27,9 +27,6 @@ from tensorflow.python.debug.wrappers import grpc_wrapper
 from tensorflow.python.debug.wrappers import local_cli_wrapper
 from tensorflow.python.training import session_run_hook
 
-# The prefix for GRPC endpoint URLs.
-_GRPC_ENDPOINT_PREFIX = "grpc://"
-
 
 class LocalCLIDebugHook(session_run_hook.SessionRunHook):
   """Command-line-interface debugger hook.
@@ -38,10 +35,7 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook):
   `tf.contrib.learn`'s `Estimator`s and `Experiment`s.
   """
 
-  def __init__(self,
-               ui_type="curses",
-               dump_root=None,
-               thread_name_filter=None):
+  def __init__(self, ui_type="curses", dump_root=None, thread_name_filter=None):
     """Create a local debugger command-line interface (CLI) hook.
 
     Args:
@@ -65,7 +59,8 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook):
     """Add a tensor filter.
 
     See doc of `LocalCLIDebugWrapperSession.add_tensor_filter()` for details.
-    Override default behavior to accommodate the possibility of this method being
+    Override default behavior to accommodate the possibility of this method
+    being
     called prior to the initialization of the underlying
     `LocalCLIDebugWrapperSession` object.
 
@@ -140,9 +135,7 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook):
       # pylint: enable=protected-access
 
       with stepper.NodeStepper(
-          run_context.session,
-          run_context.original_args.
-          fetches,
+          run_context.session, run_context.original_args.fetches,
           run_context.original_args.feed_dict) as node_stepper:
         self._session_wrapper.invoke_node_stepper(
             node_stepper, restore_variable_values_on_exit=True)
@@ -152,8 +145,8 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook):
   def after_run(self, run_context, run_values):
     # Adapt run_context and run_values to OnRunEndRequest and invoke superclass
     # on_run_end()
-    on_run_end_request = framework.OnRunEndRequest(
-        self._performed_action, run_values.run_metadata)
+    on_run_end_request = framework.OnRunEndRequest(self._performed_action,
+                                                   run_values.run_metadata)
     self._session_wrapper.on_run_end(on_run_end_request)
 
 
@@ -249,8 +242,8 @@ class GrpcDebugHook(session_run_hook.SessionRunHook):
 
     Args:
       grpc_debug_server_addresses: (`list` of `str`) A list of the gRPC debug
-        server addresses, in the format of <host:port>, without the "grpc://"
-        prefix. For example: ["localhost:7000", "192.168.0.2:8000"]
+        server addresses, in the format of <host:port>, with or without the
+        "grpc://" prefix. For example: ["localhost:7000", "192.168.0.2:8000"]
       watch_fn: A function that allows for customizing which ops to watch at
         which specific steps. See doc of
         `dumping_wrapper.DumpingDebugWrapperSession.__init__` for details.
@@ -258,23 +251,14 @@ class GrpcDebugHook(session_run_hook.SessionRunHook):
         wrapper session will be active. See doc of `BaseDebugWrapperSession` for
         more details.
       log_usage: (bool) Whether usage is to be logged.
-
-    Raises:
-      ValueError: if any debugger server addresses start with grpc://.
     """
-
-    for address in grpc_debug_server_addresses:
-      if address.startswith(_GRPC_ENDPOINT_PREFIX):
-        raise ValueError(
-            ("Debug server address %r starts with %r. It should not because "
-             "the hook already automatically adds the prefix.") % (
-                 address, _GRPC_ENDPOINT_PREFIX))
-
-    # A wrapper session responsible for GRPC communication.
     self._grpc_debug_wrapper_session = None
     self._thread_name_filter = thread_name_filter
+    self._grpc_debug_server_addresses = (
+        grpc_debug_server_addresses
+        if isinstance(grpc_debug_server_addresses, list) else
+        [grpc_debug_server_addresses])
 
-    self._grpc_debug_server_addresses = grpc_debug_server_addresses
     self._watch_fn = watch_fn
     self._log_usage = log_usage
 
@@ -315,3 +299,58 @@ class GrpcDebugHook(session_run_hook.SessionRunHook):
 
     return session_run_hook.SessionRunArgs(
         None, feed_dict=None, options=run_options)
+
+
+class TensorBoardDebugHook(GrpcDebugHook):
+  """A tfdbg hook that can be used with TensorBoard Debugger Plugin.
+
+  This hook is the same as `GrpcDebugHook`, except that it uses a predefined
+    `watch_fn` that
+    1) uses `DebugIdentity` debug ops with the `gated_grpc` attribute set to
+        `True`, to allow the interactive enabling and disabling of tensor
+       breakpoints.
+    2) watches all tensors in the graph.
+  This saves the need for the user to define a `watch_fn`.
+  """
+
+  def __init__(self,
+               grpc_debug_server_addresses,
+               thread_name_filter=None,
+               send_traceback_and_source_code=True,
+               log_usage=True):
+    """Constructor of TensorBoardDebugHook.
+
+    Args:
+      grpc_debug_server_addresses: gRPC address(es) of debug server(s), as a
+        `str` or a `list` of `str`s. E.g., "localhost:2333",
+        "grpc://localhost:2333", ["192.168.0.7:2333", "192.168.0.8:2333"].
+      thread_name_filter: Optional filter for thread names.
+      send_traceback_and_source_code: Whether traceback of graph elements and
+        the source code are to be sent to the debug server(s).
+      log_usage: Whether the usage of this class is to be logged (if
+        applicable).
+    """
+
+    def _gated_grpc_watch_fn(fetches, feeds):
+      del fetches, feeds  # Unused.
+      return framework.WatchOptions(
+          debug_ops=["DebugIdentity(gated_grpc=true)"])
+
+    super(TensorBoardDebugHook, self).__init__(
+        grpc_debug_server_addresses,
+        watch_fn=_gated_grpc_watch_fn,
+        thread_name_filter=thread_name_filter,
+        log_usage=log_usage)
+
+    self._grpc_debug_server_addresses = grpc_debug_server_addresses
+    self._send_traceback_and_source_code = send_traceback_and_source_code
+    self._sent_graph_version = -1
+    grpc_wrapper.register_signal_handler()
+
+  def before_run(self, run_context):
+    if self._send_traceback_and_source_code:
+      self._sent_graph_version = grpc_wrapper.publish_traceback(
+          self._grpc_debug_server_addresses, run_context.session.graph,
+          run_context.original_args.feed_dict,
+          run_context.original_args.fetches, self._sent_graph_version)
+    return super(TensorBoardDebugHook, self).before_run(run_context)
diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper.py b/tensorflow/python/debug/wrappers/local_cli_wrapper.py
index 5bf6d9d1f4a4533a04495be9a1bf8364c3bb3db1..1465cb72950c8fa6a453ebd4290bbf6382173ff8 100644
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper.py
@@ -31,6 +31,7 @@ from tensorflow.python.debug.cli import debugger_cli_common
 from tensorflow.python.debug.cli import profile_analyzer_cli
 from tensorflow.python.debug.cli import stepper_cli
 from tensorflow.python.debug.cli import ui_factory
+from tensorflow.python.debug.lib import common
 from tensorflow.python.debug.lib import debug_data
 from tensorflow.python.debug.wrappers import framework
 
@@ -81,6 +82,7 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
     if not dump_root:
       self._dump_root = tempfile.mktemp(prefix=_DUMP_ROOT_PREFIX)
     else:
+      dump_root = os.path.expanduser(dump_root)
       if os.path.isfile(dump_root):
         raise ValueError("dump_root path points to a file: %s" % dump_root)
       elif os.path.isdir(dump_root) and os.listdir(dump_root):
@@ -464,7 +466,7 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
     feed_key = None
     feed_value = None
     for key in self._feed_dict:
-      key_name = cli_shared.get_graph_element_name(key)
+      key_name = common.get_graph_element_name(key)
       if key_name == tensor_name:
         feed_key = key_name
         feed_value = self._feed_dict[key]
@@ -561,7 +563,7 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
                                            list(self._tensor_filters.keys()))
     if self._feed_dict:
       # Register tab completion for feed_dict keys.
-      feed_keys = [cli_shared.get_graph_element_name(key)
+      feed_keys = [common.get_graph_element_name(key)
                    for key in self._feed_dict.keys()]
       curses_cli.register_tab_comp_context(["print_feed", "pf"], feed_keys)
 
diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
index 770a496aa9d2f4bb8bee0f51526ba8c3d4278b81..490812c96d83791cdc20c56f16c968f1a1851af8 100644
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
@@ -664,6 +664,20 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
         [["run"], ["run"]], monitored_sess)
     self.assertFalse(wrapped_monitored_sess.should_stop())
 
+  def testRunsWithEmptyFetchWorks(self):
+    wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
+        [["run"]], self.sess, dump_root="")
+
+    run_output = wrapped_sess.run([])
+    self.assertEqual([], run_output)
+
+  def testRunsWithEmptyNestedFetchWorks(self):
+    wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
+        [["run"]], self.sess, dump_root="")
+
+    run_output = wrapped_sess.run({"foo": {"baz": []}, "bar": ()})
+    self.assertEqual({"foo": {"baz": []}, "bar": ()}, run_output)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index b491a637bacccd181cab0960f08a5306b719bdd0..ab81d40148476735492890f608315b19eaa0a33f 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -1,8 +1,7 @@
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "py_test", "tf_cc_binary")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load(
     "//tensorflow/tools/test:performance.bzl",
     "tf_py_logged_benchmark",
@@ -110,6 +109,7 @@ cuda_py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
+        "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:training",
@@ -144,6 +144,7 @@ cuda_py_test(
         ":test",
         "//tensorflow/python:clip_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
     ],
 )
 
@@ -205,29 +206,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "python_eager_op_gen_main",
-    srcs = [
-        "python_eager_op_gen_main.cc",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":python_eager_op_gen",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:op_gen_lib",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
-tf_cc_binary(
-    name = "python_eager_op_gen_demo",
-    deps = [
-        ":python_eager_op_gen_main",
-        "//tensorflow/core:ops",
-    ],
-)
-
 py_library(
     name = "custom_gradient",
     srcs = ["custom_gradient.py"],
@@ -415,11 +393,28 @@ cuda_py_test(
         "//tensorflow/python:layers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:tensor_shape",
     ],
 )
 
+py_test(
+    name = "pywrap_tfe_test",
+    srcs = ["pywrap_tfe_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backprop",
+        ":context",
+        ":test",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:pywrap_tensorflow",
+        "//tensorflow/python:random_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 # Google-internal targets.
 
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index 0144f3b1e59d733e951bc1f7408c803facab4eac..5c235382652811ff83ec800c0a28a3beccd45f0f 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import functools
 import operator
 import threading
@@ -42,6 +43,26 @@ from tensorflow.python.util import nest
 from tensorflow.python.util import tf_inspect
 
 
+class _TensorCache(object):
+  """Simple cache which evicts items based on length in a FIFO manner."""
+
+  def __init__(self, max_items=256):
+    self._data = collections.OrderedDict()
+    self._max_items = max_items if max_items else 256
+
+  def put(self, key, value):
+    self._data[key] = value
+
+    if len(self._data) > self._max_items:
+      self._data.popitem(last=False)
+
+  def get(self, key):
+    return self._data.get(key, None)
+
+  def flush(self):
+    self._data = {}
+
+
 _op_attr_type_cache = {}
 
 
@@ -157,6 +178,8 @@ _ops_which_dont_need_outputs = set([
     "SegmentMax",
     "UnsortedSegmentSum",
     "UnsortedSegmentMax",
+    "UnsortedSegmentMin",
+    "UnsortedSegmentProd",
     "Abs",
     "Neg",
     "ReciprocalGrad",
@@ -344,7 +367,7 @@ def implicit_val_and_grad(f):
 
   def grad_fn(*args):
     """Computes the gradient of the wrapped function."""
-    tape.push_new_tape()
+    this_tape = tape.push_new_tape()
     try:
       end_node = f(*args)
       if end_node is None:
@@ -352,15 +375,18 @@ def implicit_val_and_grad(f):
                          "did you forget to return a value from {}?".format(
                              f.__name__))
     finally:
-      popped_tape = tape.pop_tape()
-      variables = popped_tape.watched_variables()
+      tape.pop_tape(this_tape)
+    # Sorting variables by id, which is monotonically increasing in construction
+    # order. This ensures unique order across executions.
+    variables = list(sorted(this_tape.watched_variables(),
+                            key=lambda v: v.handle._id))  # pylint: disable=protected-access
     sources = [x.handle for x in variables]
 
     if not sources:
       raise ValueError("No trainable variables were accessed while the "
                        "function was being computed.")
     grad = imperative_grad.imperative_grad(_default_vspace,
-                                           popped_tape,
+                                           this_tape,
                                            nest.flatten(end_node),
                                            sources)
     return end_node, list(zip(grad, variables))
@@ -540,14 +566,14 @@ def _ensure_unique_tensor_objects(parameter_positions, args):
     if i in parameter_positions:
       tid = ops.tensor_id(t)
       if tid in s:
-        args[i] = args[i]._dup()  # pylint: disable=protected-access
+        args[i] = gen_array_ops.identity(args[i])
       else:
         s.add(tid)
   return args
 
 
 def val_and_grad_function(f, params=None):
-  """Returns a function that computes f and is derivative w.r.t. params.
+  """Returns a function that computes f and its derivative w.r.t. params.
 
   Example:
   ```python
@@ -649,7 +675,7 @@ def make_vjp(f, params=None):
     """Computes the value and gradient of the decorated function."""
     parameter_positions = _get_arg_spec(f, params, args)
     assert not kwds, "The gradient function can't take keyword arguments."
-    tape.push_new_tape()
+    this_tape = tape.push_new_tape()
     try:
       sources = []
       args = [
@@ -670,12 +696,12 @@ def make_vjp(f, params=None):
       flat_result = [gen_array_ops.identity(x) for x in flat_result]
       result = nest.pack_sequence_as(result, flat_result)
     finally:
-      t = tape.pop_tape()
+      tape.pop_tape(this_tape)
     def vjp(dy=None):
       if dy is not None:
         dy = [ops.convert_to_tensor(x) for x in nest.flatten(dy)]
       return imperative_grad.imperative_grad(
-          _default_vspace, t, nest.flatten(result), sources,
+          _default_vspace, this_tape, nest.flatten(result), sources,
           output_gradients=dy)
     return result, vjp
 
@@ -707,7 +733,7 @@ def _aggregate_grads(gradients):
       if isinstance(grad, ops.Tensor):
         indexed_slices = ops.IndexedSlices(
             grad,
-            constant_op.constant(range(grad.shape[0])),
+            math_ops.range(grad.shape[0]),
             constant_op.constant(grad.shape.as_list()))
         indexed_slices_list.append(indexed_slices)
       else:
@@ -731,8 +757,7 @@ def _num_elements(grad):
   raise ValueError("`grad` not a Tensor or IndexedSlices.")
 
 
-_last_shape_dtype = [None, None]
-_last_zero = [None]
+_zeros_cache = _TensorCache()
 
 
 def _fast_fill(value, shape, dtype):
@@ -741,13 +766,22 @@ def _fast_fill(value, shape, dtype):
 
 def _zeros(shape, dtype):
   """Wraps array_ops.zeros to cache last zero for a given shape and dtype."""
-  if [shape, dtype] != _last_shape_dtype:
-    _last_shape_dtype[:] = [shape, dtype]
-    _last_zero[0] = _fast_fill(0, shape, dtype)
-  return _last_zero[0]
+  device = context.context().device_name
+  if dtype == dtypes.variant:
+    # TODO(apassos): need to save enough information about variant tensors to do
+    # a zeros
+    return None
+  cache_key = shape, dtype, device
+  cached = _zeros_cache.get(cache_key)
+  if cached is None:
+    cached = _fast_fill(0, shape, dtype)
+    _zeros_cache.put(cache_key, cached)
+  return cached
 
 
 def _ones(shape, dtype):
+  if shape == ():  # pylint: disable=g-explicit-bool-comparison
+    return constant_op.constant(1, dtype=dtype)
   return _fast_fill(1, shape, dtype)
 
 
@@ -832,11 +866,11 @@ class GradientTape(object):
     self._persistent = persistent
 
   def __enter__(self):
-    tape.push_new_tape(persistent=self._persistent)
+    self._tape = tape.push_new_tape(persistent=self._persistent)
     return self
 
   def __exit__(self, typ, value, traceback):
-    self._tape = tape.pop_tape()
+    tape.pop_tape(self._tape)
 
   def watch(self, tensor):
     """Ensures that `tensor` is being traced by this tape.
@@ -849,13 +883,18 @@ class GradientTape(object):
         t = t.handle
       tape.watch(t)
 
-  def gradient(self, target, sources):
+  def watched_variables(self):
+    return self._tape.watched_variables()
+
+  def gradient(self, target, sources, output_gradients=None):
     """Computes the gradient using information traced by the tape.
 
     Args:
       target: the tensor to be differentiated.
       sources: a list of Tensors or Variables, the target will be
        differentiated with respect to the sources.
+      output_gradients: a list of gradients, one for each element of
+       target. Defaults to None.
 
     Returns:
       a list of Tensors (or IndexedSlices, or None), one for each element in
@@ -873,7 +912,8 @@ class GradientTape(object):
                else x
                for x in sources]
     grad = imperative_grad.imperative_grad(
-        _default_vspace, self._tape, [target], sources)
+        _default_vspace, self._tape, [target], sources,
+        output_gradients=output_gradients)
     if not self._persistent:
       self._tape = None
     return grad
diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py
index 9816dd022eb5b524888a8058ef550a107ef3a00d..a12113893ab3eac671e8138472bc95e9d8b89499 100644
--- a/tensorflow/python/eager/backprop_test.py
+++ b/tensorflow/python/eager/backprop_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import gradients
@@ -44,6 +45,7 @@ from tensorflow.python.training import training
 
 class BackpropTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def testAggregateGradients(self):
 
     def fn(x):
@@ -60,7 +62,7 @@ class BackpropTest(test.TestCase):
     var_np = np.random.rand(4, 2).astype(np.float32)
     var = constant_op.constant(var_np)
     grad = backprop.gradients_function(fn, [0])(var)[0]
-    grad = ops.convert_to_tensor(grad).numpy()
+    grad = self.evaluate(ops.convert_to_tensor(grad))
 
     with context.graph_mode(), self.test_session():
       tf_var = array_ops.constant(var_np, dtypes.float32)
@@ -151,6 +153,21 @@ class BackpropTest(test.TestCase):
     opt.apply_gradients([(grad, embedding)])
     self.assertAllClose(expected, embedding.read_value())
 
+  def testImplicitGradOrdering(self):
+    v0 = resource_variable_ops.ResourceVariable(1.0)
+    v1 = resource_variable_ops.ResourceVariable(2.0)
+
+    def f():
+      x = v1 * v1
+      y = v0 * v0
+      return x + y
+
+    grads = backprop.implicit_grad(f)()
+    ordered_variables = [x[1] for x in grads]
+    self.assertTrue(ordered_variables[0] is v0)
+    self.assertTrue(ordered_variables[1] is v1)
+
+  @test_util.assert_no_new_tensors
   def testGradientNone(self):
 
     def loss(x, l):
@@ -165,6 +182,7 @@ class BackpropTest(test.TestCase):
     g, = backprop.gradients_function(loss, [0])(logits, labels)
     self.assertAllEqual(g.numpy(), [[-0.5, 0.5]])
 
+  @test_util.assert_no_new_tensors
   def testSecondGrad(self):
 
     def first(x):
@@ -181,6 +199,7 @@ class BackpropTest(test.TestCase):
     grad = backprop.gradients_function(second, [0])(f)[0]
     self.assertAllEqual([[0.0]], grad)
 
+  @test_util.assert_no_new_tensors
   def testMakeVJP(self):
 
     def f(x):
@@ -191,6 +210,7 @@ class BackpropTest(test.TestCase):
     self.assertAllEqual(result, 9.0)
     self.assertAllEqual(vjp(2.0)[0], 12.0)
 
+  @test_util.assert_no_new_tensors
   def testGradGrad(self):
 
     def sq(x):
@@ -204,6 +224,7 @@ class BackpropTest(test.TestCase):
 
     self.assertAllEqual(gradgrad(constant_op.constant(3.0))[0], 2.0)
 
+  @test_util.assert_no_new_tensors
   def testGradGradExp(self):
 
     def grad(x):
@@ -214,6 +235,22 @@ class BackpropTest(test.TestCase):
 
     self.assertAllEqual(gradgrad(constant_op.constant(0.0))[0], 1.0)
 
+  @test_util.assert_no_new_tensors
+  def testStopGradient(self):
+    grad = backprop.gradients_function(
+        lambda x: array_ops.stop_gradient(math_ops.argmax(x)))
+    self.assertAllEqual(grad([0.0])[0], None)
+
+  @test_util.assert_no_new_tensors
+  def testArgmax(self):
+    def argmax(x):
+      i = math_ops.argmax(x)
+      return array_ops.stop_gradient(i)
+
+    grad = backprop.gradients_function(argmax)
+    self.assertAllEqual(grad([0.0])[0], None)
+
+  @test_util.assert_no_new_tensors
   def testGPU(self):
     if not context.context().num_gpus():
       self.skipTest('No GPUs found')
@@ -229,6 +266,7 @@ class BackpropTest(test.TestCase):
     grad = backprop.gradients_function(fn, [0])(constant_op.constant(1.0))[0]
     self.assertAllEqual(grad, 1.0)
 
+  @test_util.assert_no_new_tensors
   def testGPUImplicitGrad(self):
     if not context.context().num_gpus():
       self.skipTest('No GPU found')
@@ -244,6 +282,7 @@ class BackpropTest(test.TestCase):
     self.assertEqual(
         backprop.implicit_grad(f)()[0][0].cpu().numpy(), 1.0)
 
+  @test_util.assert_no_new_tensors
   def testCPU(self):
 
     def fn(x):
@@ -254,6 +293,7 @@ class BackpropTest(test.TestCase):
     grad = backprop.gradients_function(fn, [0])(constant_op.constant(1.0))[0]
     self.assertAllEqual(grad, 1.0)
 
+  @test_util.assert_no_new_tensors
   def testTensorCopyGPU2CPU2GPU(self):
     if not context.context().num_gpus():
       self.skipTest('No GPUs found')
@@ -268,6 +308,7 @@ class BackpropTest(test.TestCase):
     grad = backprop.gradients_function(f, [0])(a, b)[0]
     self.assertAllEqual(grad, 1.0)
 
+  @test_util.assert_no_new_tensors
   def testEmptyParams(self):
 
     def fn(a, b):
@@ -279,6 +320,7 @@ class BackpropTest(test.TestCase):
     self.assertAllEqual(dx, y.numpy())
     self.assertAllEqual(dy, x.numpy())
 
+  @test_util.assert_no_new_tensors
   def testUnconnectedNone(self):
     v = resource_variable_ops.ResourceVariable(
         1.0, name='testUnconnectedNone')
@@ -289,6 +331,7 @@ class BackpropTest(test.TestCase):
 
     self.assertEqual(backprop.implicit_grad(f)()[0][0], None)
 
+  @test_util.assert_no_new_tensors
   def testGradientTape(self):
     with backprop.GradientTape() as g:
       x = constant_op.constant(3.0)
@@ -303,6 +346,7 @@ class BackpropTest(test.TestCase):
     grad = g.gradient(y, [x])[0]
     self.assertEqual(grad.numpy(), 6.0)
 
+  @test_util.assert_no_new_tensors
   def testGradientTapeGradientCalledMultipleTimes(self):
     with backprop.GradientTape() as g:
       x = constant_op.constant(3.0)
@@ -314,6 +358,7 @@ class BackpropTest(test.TestCase):
         RuntimeError, 'GradientTape.gradient can only be called once'):
       g.gradient(y, [x])
 
+  @test_util.assert_no_new_tensors
   def testPersistentTape(self):
     with backprop.GradientTape(persistent=True) as g:
       x = constant_op.constant(3.0)
@@ -326,6 +371,7 @@ class BackpropTest(test.TestCase):
     self.assertEqual(dy_dx.numpy(), 2*3)
     del g
 
+  @test_util.assert_no_new_tensors
   def testPersistentNestedTape(self):
     with backprop.GradientTape(persistent=True) as g:
       x = constant_op.constant(3.0)
@@ -345,6 +391,7 @@ class BackpropTest(test.TestCase):
     self.assertEqual(grad.numpy(), 12.0)
     del g
 
+  @test_util.assert_no_new_tensors
   def testGradientTapeVariable(self):
     v = resource_variable_ops.ResourceVariable(1.0, name='v')
     with backprop.GradientTape() as g:
@@ -352,6 +399,7 @@ class BackpropTest(test.TestCase):
     grad = g.gradient(y, [v])[0]
     self.assertAllEqual(grad, 2.0)
 
+  @test_util.assert_no_new_tensors
   def testEmptyParamsForValueAndGradFunction(self):
     def fn(a, b):
       return a * b
@@ -364,6 +412,7 @@ class BackpropTest(test.TestCase):
     self.assertAllEqual(dx, y)
     self.assertAllEqual(dy, x)
 
+  @test_util.assert_no_new_tensors
   def testNonEmptyParamsForValueAndGradFunction(self):
     def fn(a, b):
       return a * b
@@ -376,6 +425,7 @@ class BackpropTest(test.TestCase):
     self.assertEqual(1, len(grads))
     self.assertAllEqual(grads[0], x)
 
+  @test_util.assert_no_new_tensors
   def testTensorCopyCPU2GPU2CPU(self):
     if not context.context().num_gpus():
       self.skipTest('No GPUs found')
@@ -460,6 +510,7 @@ class BackpropTest(test.TestCase):
 
     self.assertAllEqual(backprop.gradients_function(f)(1.0)[0], 3.0)
 
+  @test_util.assert_no_new_tensors
   def testExceptionSafety(self):
 
     def f(unused_x):
@@ -475,6 +526,7 @@ class BackpropTest(test.TestCase):
 
     self.assertAllEqual(backprop.gradients_function(real_f)(1.0)[0], 2.0)
 
+  @test_util.assert_no_new_tensors
   def testMultiValueConvertToTensor(self):
     x = resource_variable_ops.ResourceVariable(
         initial_value=array_ops.constant([1.0]), name='x')
@@ -535,6 +587,7 @@ class BackpropTest(test.TestCase):
         initial_value=1., name='testSameObjectForMultipleArguments.Variable')
     self.assertAllEqual([1., 1.], np_g(v, v))
 
+  @test_util.assert_no_new_tensors
   def testImplicitGradientsCustomGradientAndCachedVariableValue(self):
 
     @custom_gradient.custom_gradient
@@ -560,6 +613,7 @@ class BackpropTest(test.TestCase):
     self.assertAllEqual(7, grad)
     self.assertAllEqual(x, var)
 
+  @test_util.assert_no_new_tensors
   def testCustomGradient(self):
 
     @custom_gradient.custom_gradient
@@ -586,6 +640,7 @@ class BackpropTest(test.TestCase):
         var.assign_sub(lr*grad)
     self.assertAllEqual(losses, [4.0, 3., 2., 1., 0.])
 
+  @test_util.assert_no_new_tensors
   def testCustomGradientIdentity(self):
 
     @custom_gradient.custom_gradient
diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 9849f0f322eff2d909e7396158539a9663b95f29..b56cbe80a7ab6b90d715187b0f0a44847038fc37 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -28,11 +28,14 @@ from __future__ import print_function
 import time
 
 import numpy as np
+import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import backprop  # pylint: disable=unused-import
 from tensorflow.python.eager import context
+from tensorflow.python.eager import core
+from tensorflow.python.eager import execute
 from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import dtypes
@@ -41,12 +44,33 @@ from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-
+from tensorflow.python.ops import resource_variable_ops
 
 CPU = "/device:CPU:0"
 GPU = "/device:GPU:0"
 
 
+def c_tfe_py_fastpath_execute(a,
+                              b,
+                              transpose_a=False,
+                              transpose_b=False,
+                              name=None):
+  ctx = context.context()
+  assert not ctx.in_graph_mode(
+  ), "The prototype doesn't contain C code for graph construction"
+  try:
+    return pywrap_tensorflow.TFE_Py_FastPathExecute(
+        ctx._handle, ctx.device_name, "MatMul", execute.record_gradient, name,
+        ctx._post_execution_callbacks, a, b, "transpose_a", transpose_a,
+        "transpose_b", transpose_b)
+  except core._NotOkStatusException as e:
+    if name is not None:
+      message = e.message + " name: " + name
+    else:
+      message = e.message
+    six.raise_from(core._status_to_exception(e.code, message), None)
+
+
 class MicroBenchmarks(test.Benchmark):
 
   def __init__(self):
@@ -222,6 +246,14 @@ class MicroBenchmarks(test.Benchmark):
       gen_math_ops._mat_mul(m, m, transpose_b=transpose_b)
     self._run(func, num_iters)
 
+  def _benchmark_tfe_py_fastpath_execute_matmul(self, m, transpose_b,
+                                                num_iters):
+
+    def func():
+      c_tfe_py_fastpath_execute(m, m, transpose_b=transpose_b)
+
+    self._run(func, num_iters)
+
   def _benchmark_tfe_py_execute_matmul(self, m, transpose_b, num_iters):
     inputs = [m, m]
     # pylint: disable=protected-access
@@ -240,6 +272,14 @@ class MicroBenchmarks(test.Benchmark):
     func = lambda: f(m, m, transpose_b)
     self._run(func, num_iters)
 
+  def _benchmark_read_variable(self, m, num_iters):
+    self._run(m.value, num_iters)
+
+  def _benchmark_read_variable_with_tape(self, m, num_iters):
+    with backprop.GradientTape() as tape:
+      tape.watch(m)
+      self._run(m.value, num_iters)
+
   # Benchmarks for A^2, A of dimension 2 by 2.
   def benchmark_np_matmul_2_by_2(self):
     self._benchmark_np_matmul(
@@ -257,6 +297,12 @@ class MicroBenchmarks(test.Benchmark):
       self._benchmark_gen_math_ops_matmul(
           m, transpose_b=False, num_iters=self._num_iters_2_by_2)
 
+  def benchmark_tfe_py_fastpath_execute_matmul_2_by_2_CPU(self):
+    with context.device(CPU):
+      m = self._m_2_by_2.cpu()
+      self._benchmark_tfe_py_fastpath_execute_matmul(
+          m, transpose_b=False, num_iters=self._num_iters_2_by_2)
+
   def benchmark_tfe_py_execute_matmul_2_by_2_CPU(self):
     with context.device(CPU):
       m = self._m_2_by_2.cpu()
@@ -320,6 +366,12 @@ class MicroBenchmarks(test.Benchmark):
       self._benchmark_gen_math_ops_matmul(
           m, transpose_b=True, num_iters=self._num_iters_100_by_784)
 
+  def benchmark_tfe_py_fastpath_execute_matmul_100_by_784_CPU(self):
+    with context.device(CPU):
+      m = self._m_100_by_784.cpu()
+      self._benchmark_tfe_py_fastpath_execute_matmul(
+          m, transpose_b=True, num_iters=self._num_iters_100_by_784)
+
   def benchmark_tfe_py_execute_matmul_100_by_784_CPU(self):
     with context.device(CPU):
       m = self._m_100_by_784.cpu()
@@ -364,6 +416,32 @@ class MicroBenchmarks(test.Benchmark):
       self._benchmark_defun_matmul(
           m, transpose_b=True, num_iters=self._num_iters_100_by_784)
 
+  def benchmark_read_variable_op_2_by_2_CPU(self):
+    with context.device(CPU):
+      m = resource_variable_ops.ResourceVariable(self._m_2_by_2)
+      self._benchmark_read_variable(m, num_iters=self._num_iters_2_by_2)
+
+  def benchmark_read_variable_op_2_by_2_GPU(self):
+    if not context.num_gpus():
+      return
+    with context.device(GPU):
+      m = resource_variable_ops.ResourceVariable(self._m_2_by_2.gpu())
+      self._benchmark_read_variable(m, num_iters=self._num_iters_2_by_2)
+
+  def benchmark_read_variable_op_with_tape_2_by_2_CPU(self):
+    with context.device(CPU):
+      m = resource_variable_ops.ResourceVariable(self._m_2_by_2)
+      self._benchmark_read_variable_with_tape(
+          m, num_iters=self._num_iters_2_by_2)
+
+  def benchmark_read_variable_op_with_tape_2_by_2_GPU(self):
+    if not context.num_gpus():
+      return
+    with context.device(GPU):
+      m = resource_variable_ops.ResourceVariable(self._m_2_by_2.gpu())
+      self._benchmark_read_variable_with_tape(
+          m, num_iters=self._num_iters_2_by_2)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index 92f4e15c054bd8cf3886b8c22e414abdfccbdae5..0e9c21b221c64aaa445fde59514c7e50f8d8b773 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -18,14 +18,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import contextlib
 import copy
 import random
 import threading
 
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import errors
+from tensorflow.python.util import compat
+from tensorflow.python.util import is_in_graph_mode
 from tensorflow.python.util import tf_contextlib
 
 GRAPH_MODE = 0
@@ -45,6 +50,8 @@ _MAXINT32 = 2**31 - 1
 DEVICE_PLACEMENT_EXPLICIT = pywrap_tensorflow.TFE_DEVICE_PLACEMENT_EXPLICIT
 DEVICE_PLACEMENT_WARN = pywrap_tensorflow.TFE_DEVICE_PLACEMENT_WARN
 DEVICE_PLACEMENT_SILENT = pywrap_tensorflow.TFE_DEVICE_PLACEMENT_SILENT
+DEVICE_PLACEMENT_SILENT_FOR_INT32 = (
+    pywrap_tensorflow.TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32)
 
 
 # TODO(agarwal): better name ?
@@ -62,6 +69,41 @@ class _EagerContext(threading.local):
     self.scalar_cache = {}
 
 
+ContextStackEntry = collections.namedtuple(
+    "ContextStackEntry", ["is_building_function", "enter_context_fn"])
+
+
+class ContextStack(threading.local):
+  """A thread-local stack of context switches."""
+
+  def __init__(self):
+    super(ContextStack, self).__init__()
+    self.stack = []
+
+  def push(self, is_building_function, enter_context_fn):
+    """Push metadata about a context switch onto the stack.
+
+    A context switch can take one of two forms: installing a graph as the
+    default graph, or entering the eager context.
+
+    Args:
+      is_building_function: (bool.) Whether the context is building a function.
+      enter_context_fn: (function.) A callable that executes the context switch.
+        For example, `graph.as_default` or `eager_mode`.
+    """
+
+    self.stack.append(
+        ContextStackEntry(is_building_function, enter_context_fn))
+
+  def pop(self):
+    """Pop the stack."""
+
+    self.stack.pop()
+
+
+context_stack = ContextStack()
+
+
 # TODO(agarwal): rename to EagerContext / EagerRuntime ?
 # TODO(agarwal): consider keeping the corresponding Graph here.
 class Context(object):
@@ -83,6 +125,8 @@ class Context(object):
            right device but raises a warning.
          tfe.DEVICE_PLACEMENT_SILENT: silently copies the tensors. This might
            hide performance problems.
+         tfe.DEVICE_PLACEMENT_SILENT_FOR_INT32: silently copies int32 tensors,
+           raising errors on the other ones.
     """
     self._eager_context = _EagerContext()
     self._context_handle = None
@@ -97,6 +141,9 @@ class Context(object):
     """Set a global eager mode seed for random ops."""
     self._seed = seed
     self._rng = random.Random(self._seed)
+    # Also clear the kernel cache, to reset any existing seeds
+    if self._context_handle is not None:
+      pywrap_tensorflow.TFE_ContextClearCaches(self._context_handle)
 
   def _internal_operation_seed(self):
     """Returns a fake operation seed.
@@ -183,10 +230,14 @@ class Context(object):
     ctx = self._eager_context
     old_mode = ctx.mode
     ctx.mode = mode
+    if mode == EAGER_MODE:
+      context_stack.push(False, eager_mode)
     try:
       yield
     finally:
       ctx.mode = old_mode
+      if mode == EAGER_MODE:
+        context_stack.pop()
 
   def in_graph_mode(self):
     """Returns True if current thread is in GRAPH mode."""
@@ -288,6 +339,21 @@ class Context(object):
     self._initialize_handle_and_devices()
     return self._num_gpus
 
+  def add_function(self, fn):
+    """Add a function definition to the context.
+
+    Once added, the function (identified by its name) can be executed like any
+    other operation.
+
+    Args:
+      fn: A wrapped TF_Function (returned from TF_GraphToFunction_wrapper).
+    """
+    with errors.raise_exception_on_not_ok_status() as status:
+      pywrap_tensorflow.TFE_ContextAddFunction(
+          self._handle,  # pylint: disable=protected-access
+          fn,
+          status)
+
   def add_function_def(self, fdef):
     """Add a function definition to the context.
 
@@ -340,6 +406,56 @@ class Context(object):
     """Get the list of post-execution callbacks added to the context."""
     return self._post_execution_callbacks
 
+  def enable_run_metadata(self):
+    """Enables tracing of op execution via RunMetadata.
+
+    To retrieve the accumulated metadata call context.export_run_metadata()
+    and to stop tracing call context.disable_run_metadata().
+    """
+    if not self._context_handle:
+      self._initialize_handle_and_devices()
+    pywrap_tensorflow.TFE_ContextEnableRunMetadata(self._context_handle)
+
+  @tf_contextlib.contextmanager
+  def device_policy(self, policy):
+    if not self._context_handle:
+      self._initialize_handle_and_devices()
+    old = pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(
+        self._context_handle)
+    pywrap_tensorflow.TFE_ContextSetThreadLocalDevicePlacementPolicy(
+        self._handle, policy)
+    try:
+      yield
+    finally:
+      pywrap_tensorflow.TFE_ContextSetThreadLocalDevicePlacementPolicy(
+          self._handle, old)
+
+  def disable_run_metadata(self):
+    """Disables tracing of op execution via RunMetadata."""
+    if not self._context_handle:
+      return
+    pywrap_tensorflow.TFE_ContextDisableRunMetadata(self._context_handle)
+
+  def export_run_metadata(self):
+    """Returns a RunMetadata proto with accumulated information.
+
+    The returned protocol buffer contains information since the most recent call
+    to either enable_run_metadata or export_run_metadata.
+
+    Returns:
+      A RunMetadata protocol buffer. Or None if not enabled.
+    """
+    if not self._context_handle:
+      return None
+    with c_api_util.tf_buffer() as buffer_:
+      with errors.raise_exception_on_not_ok_status() as status:
+        pywrap_tensorflow.TFE_ContextExportRunMetadata(
+            self._context_handle, buffer_, status)
+      proto_data = pywrap_tensorflow.TF_GetBuffer(buffer_)
+    run_metadata = config_pb2.RunMetadata()
+    run_metadata.ParseFromString(compat.as_bytes(proto_data))
+    return run_metadata
+
 _context = None
 _context_lock = threading.Lock()
 
@@ -458,3 +574,36 @@ def num_gpus():
     The number of available GPU devices.
   """
   return context().num_gpus()
+
+
+def enable_run_metadata():
+  """Enables tracing of op execution via RunMetadata.
+
+  To retrieve the accumulated metadata call context.export_run_metadata()
+  and to stop tracing call context.disable_run_metadata().
+  """
+  context().enable_run_metadata()
+
+
+def disable_run_metadata():
+  """Disables tracing of op execution via RunMetadata."""
+  context().disable_run_metadata()
+
+
+def export_run_metadata():
+  """Returns a RunMetadata proto with accumulated information.
+
+  The returned protocol buffer contains information since the most recent call
+  to either enable_run_metadata or export_run_metadata.
+
+  Returns:
+    A RunMetadata protocol buffer.
+  """
+  return context().export_run_metadata()
+
+
+# Not every user creates a Context via context.context()
+# (for example, enable_eager_execution in python/framework/ops.py),
+# but they do all import this file.  Note that IS_IN_GRAPH_MODE and
+# in_graph_mode are both parameterless functions.
+is_in_graph_mode.IS_IN_GRAPH_MODE = in_graph_mode
diff --git a/tensorflow/python/eager/core.py b/tensorflow/python/eager/core.py
index 483b7172107838a0069831f2347b0c644c05c000..8fb69300209d74a164c38654d737432cdfb7884a 100644
--- a/tensorflow/python/eager/core.py
+++ b/tensorflow/python/eager/core.py
@@ -47,3 +47,17 @@ class _NotOkStatusException(Exception):
 
 
 pywrap_tensorflow.TFE_Py_RegisterExceptionClass(_NotOkStatusException)
+
+
+class _FallbackException(Exception):
+  """Exception class to handle fallback from the fastpath.
+
+  The fastpath that we refer to here is the one implemented to reduce per-op
+  overheads (TFE_Py_FastPathExecute_C). If the conditions for executing the op
+  on the fastpath are not met, we fallback to a safer (and more complete)
+  slowpath, and this Exception is raised to signal that transition.
+  """
+  pass
+
+
+pywrap_tensorflow.TFE_Py_RegisterFallbackExceptionClass(_FallbackException)
diff --git a/tensorflow/python/eager/core_test.py b/tensorflow/python/eager/core_test.py
index 2449162dcaa47cb71dde3be70675654709fec794..ee3c10633e1cb849e319f2f5490e5beb5dd15c80 100644
--- a/tensorflow/python/eager/core_test.py
+++ b/tensorflow/python/eager/core_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import nn_ops
 
 
 def execute(op_name, num_outputs, inputs, attrs=None):
@@ -84,6 +85,42 @@ class TFETest(test_util.TensorFlowTestCase):
     self.assertTrue(has_cpu_device)
     del ctx
 
+  def testRunMetadata(self):
+    context.enable_run_metadata()
+    t = constant_op.constant(1.0)
+    _ = t + t  # Runs an operation which will be in the RunMetadata
+    run_metadata = context.export_run_metadata()
+    context.disable_run_metadata()
+    step_stats = run_metadata.step_stats
+    self.assertGreater(len(step_stats.dev_stats), 0)
+    cpu_stats = step_stats.dev_stats[0]
+    self.assertEqual('/job:localhost/replica:0/task:0/device:CPU:0',
+                     cpu_stats.device)
+    self.assertEqual(len(cpu_stats.node_stats), 1)
+    self.assertEqual(cpu_stats.node_stats[0].node_name, 'Add')
+
+  def testContextStackContainsEagerMode(self):
+    # Eager execution has been enabled, and no other context
+    # switch has occurred, so `context_stack` should contain
+    # exactly one entry.
+    self.assertEqual(len(context.context_stack.stack), 1)
+    stack_entry = context.context_stack.stack[0]
+
+    # The entry should log that eager mode was entered.
+    self.assertIs(stack_entry.enter_context_fn, context.eager_mode)
+
+    # It is not possible to build a graph function when eager execution
+    # is enabled; the stack entry should reflect this fact.
+    self.assertFalse(stack_entry.is_building_function)
+
+  def testInt32GPU(self):
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found')
+    with ops.device('gpu:0'):
+      xent = nn_ops.sparse_softmax_cross_entropy_with_logits(
+          logits=[[0.0, 0.0]], labels=[0])
+    self.assertAllClose(xent, [0.69314718])
+
   def _runInThread(self, target, args):
     t = threading.Thread(target=target, args=args)
     try:
@@ -145,6 +182,15 @@ class TFETest(test_util.TensorFlowTestCase):
     with self.assertRaises(RuntimeError):
       x.gpu(context.context().num_gpus() + 1)
 
+  def testCopyScope(self):
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found')
+    constant = constant_op.constant(1.0)
+    with ops.device('gpu:0'):
+      with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+        c = constant + 1.0
+    self.assertAllEqual(c, 2.0)
+
   def testNumpyForceCPU(self):
     if not context.context().num_gpus():
       self.skipTest('No GPUs found')
diff --git a/tensorflow/python/eager/execute.py b/tensorflow/python/eager/execute.py
index 306cf07aabe1c214d02da5f077a57043cc1f4089..2ff5b8d8f489731c14d8abb81652a17026ed4935 100644
--- a/tensorflow/python/eager/execute.py
+++ b/tensorflow/python/eager/execute.py
@@ -72,7 +72,7 @@ def execute_with_callbacks(op_name, num_outputs, inputs, attrs, ctx, name=None):
   """Monkey-patch to execute to enable execution callbacks."""
   tensors = quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
   for callback in ctx.post_execution_callbacks:
-    callback(op_name, name, attrs, inputs, tensors)
+    callback(op_name, inputs, attrs, tensors, name)
 
   return tensors
 
diff --git a/tensorflow/python/eager/execution_callbacks.py b/tensorflow/python/eager/execution_callbacks.py
index 2f1654dda499583fe4766cbe2e330399defc96fd..535361498a9dd33003d0479051e97d7ff2553067 100644
--- a/tensorflow/python/eager/execution_callbacks.py
+++ b/tensorflow/python/eager/execution_callbacks.py
@@ -104,10 +104,10 @@ class InfOrNanError(Exception):
 
 
 def inf_nan_callback(op_type,
-                     op_name,
-                     attrs,
                      inputs,
+                     attrs,
                      outputs,
+                     op_name,
                      check_inf=True,
                      check_nan=True,
                      action=_DEFAULT_CALLBACK_ACTION):
@@ -121,14 +121,14 @@ def inf_nan_callback(op_type,
 
   Args:
     op_type: Name of the TFE operation type (e.g., `MatMul`).
-    op_name: Name of the TFE operation. This name is set by client and can be
-      `None` if it unset.
-    attrs: Attributes of the TFE operation, as a tuple of alternating attribute
-      names and attribute values.
     inputs: The `list` of input tensors to the operation, currently unused by
       this callback.
+    attrs: Attributes of the TFE operation, as a tuple of alternating attribute
+      names and attribute values.
     outputs: The `list` of output tensors from the operation, checked by this
       callback for `inf` and `nan` values.
+    op_name: Name of the TFE operation. This name is set by client and can be
+      `None` if it unset.
     check_inf: (`bool`) Whether this callback should check for `inf` values in
       the output tensor values.
     check_nan: (`bool`) Whether this callback should check for `nan` values in
@@ -153,7 +153,7 @@ def inf_nan_callback(op_type,
       continue
 
     numpy_dtype = output.dtype.as_numpy_dtype
-    if (np.issubdtype(numpy_dtype, np.float) or
+    if (np.issubdtype(numpy_dtype, np.floating) or
         np.issubdtype(numpy_dtype, np.complex) or
         np.issubdtype(numpy_dtype, np.integer)):
       try:
@@ -187,26 +187,38 @@ def inf_nan_callback(op_type,
 
 
 def inf_callback(op_type,
-                 op_name,
-                 attrs,
                  inputs,
+                 attrs,
                  outputs,
+                 op_name,
                  action=_DEFAULT_CALLBACK_ACTION):
   """A specialization of `inf_nan_callback` that checks for `inf`s only."""
   inf_nan_callback(
-      op_type, op_name, attrs, inputs, outputs, check_inf=True, check_nan=False,
+      op_type,
+      inputs,
+      attrs,
+      outputs,
+      op_name,
+      check_inf=True,
+      check_nan=False,
       action=action)
 
 
 def nan_callback(op_type,
-                 op_name,
-                 attrs,
                  inputs,
+                 attrs,
                  outputs,
+                 op_name,
                  action=_DEFAULT_CALLBACK_ACTION):
   """A specialization of `inf_nan_callback` that checks for `nan`s only."""
   inf_nan_callback(
-      op_type, op_name, attrs, inputs, outputs, check_inf=False, check_nan=True,
+      op_type,
+      inputs,
+      attrs,
+      outputs,
+      op_name,
+      check_inf=False,
+      check_nan=True,
       action=action)
 
 
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 9bcd9c23c7bad4d4e3b93fa4bb5fc2c316d5c828..28f5289ffc0ace6f9b6cad7cdd1160a184f882c7 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -25,15 +25,20 @@ import threading
 
 import numpy as np
 
+from tensorflow.core.framework import function_pb2
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.eager import execute
 from tensorflow.python.eager import tape
 from tensorflow.python.eager.graph_only_ops import graph_placeholder
+from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import graph_to_function_def
+from tensorflow.python.framework import dtypes as dtypes_module
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 
@@ -47,28 +52,6 @@ _scoped_captures = threading.local()
 _scoped_captures.tensors = None
 
 
-def make_function_def(graph, operations, inputs, outputs):
-  """Makes function def where accesses to resources are serialized."""
-  last_op_using_resource_tensor = {}
-
-  # TODO(apassos) probably control flow has to be handled delicately here as in
-  # if a resource is accessed inside a control flow context we need the control
-  # dependency to point to something outside the context which is guaranteed to
-  # happen after the access.
-  #
-  # TODO(apassos) this should do some form of alias analysis as ops which
-  # forward the resources such as Identity and Switch can cause serialization to
-  # fail.
-  for op in operations:
-    for t in op.inputs:
-      if t.dtype == dtypes.resource:
-        if t.name in last_op_using_resource_tensor:
-          op._add_control_input(last_op_using_resource_tensor[t.name])  # pylint: disable=protected-access
-        last_op_using_resource_tensor[t.name] = op
-  return graph_to_function_def.graph_to_function_def(
-      graph, operations, inputs, outputs)
-
-
 @contextlib.contextmanager
 def capture_tensors(captures):
   old = _scoped_captures.__dict__.get("tensors", None)
@@ -85,8 +68,25 @@ def capture_value(tensor_map, value, dtype, name):
   if captured_value is None:
     captured_value = graph_placeholder(
         dtype=dtype or value.dtype, shape=value.shape, name=name)
-    if captured_value.dtype == dtypes.resource:
-      captured_value._handle_data = value._handle_data  # pylint: disable=protected-access
+    if captured_value.dtype == dtypes_module.resource:
+      handle_data = value._handle_data  # pylint: disable=protected-access
+      captured_value._handle_data = handle_data  # pylint: disable=protected-access
+      if handle_data is not None and handle_data.is_set:
+        # Ensure that shapes and dtypes are propagated.
+        shapes, types = zip(*[(pair.shape, pair.dtype)
+                              for pair in handle_data.shape_and_type])
+        ranks = [len(s.dim) if not s.unknown_rank else -1 for s in shapes]
+        shapes = [[d.size for d in s.dim]
+                  if not s.unknown_rank else None for s in shapes]
+        with errors.raise_exception_on_not_ok_status() as status:
+          pywrap_tensorflow.TF_GraphSetOutputHandleShapesAndTypes_wrapper(
+              captured_value._op._graph._c_graph,  # pylint: disable=protected-access
+              captured_value._as_tf_output(),  # pylint: disable=protected-access
+              shapes,
+              ranks,
+              types,
+              status)
+
     tensor_map[ops.tensor_id(value)] = (value, captured_value)
   else:
     captured_value = captured_value[1]
@@ -101,7 +101,7 @@ def _convert_to_graph_tensor(value, dtype=None, name=None, as_ref=False):
   Arguments:
     value: A Tensor object.
     dtype: The datatype of the value produced by the node in the graph.
-    name:  Name of the node in the graph.
+    name:  str, Name of the node in the graph.
     as_ref: Ignored (required by register_tensor_conversion_function).
 
   Returns:
@@ -109,22 +109,47 @@ def _convert_to_graph_tensor(value, dtype=None, name=None, as_ref=False):
     is not enabled. A placeholder which will have the value of the
     tensor at runtime otherwise.
   """
+  del as_ref  # Unused.
+
   if context.in_eager_mode():
     return value
-  _ = as_ref
+
+  default_graph = ops.get_default_graph()
+  if not default_graph.building_function:
+    return value
+
   tensor_map = _scoped_captures.tensors
   if tensor_map is None:
     # Capturing is not enabled.
+    if value.dtype == dtypes_module.resource:
+      return value
     return constant_op.constant(value.numpy())
+  if type(value) == ops.Tensor and value.graph is default_graph:
+    # The tensor has already been converted and captured. The type check
+    # is intentional: we are checking that value is a Tensor and not an
+    # EagerTensor.
+    return value
   return capture_value(tensor_map, value, dtype, name)
 
 
 class CapturingGraph(ops.Graph):
+  """Graph used when constructing eager functions."""
 
   def __init__(self, captures):
     super(CapturingGraph, self).__init__()
     self._building_function = True
     self.captures = captures
+    # Map from resource tensor name to last op (in program order) which uses
+    # this tensor. Used to enforce that execution order matches program order
+    # for resource tensors.
+    self._last_op_using_resource_tensor = {}
+
+  # TODO(apassos) remove once the C API is used by default.
+  def _use_c_api_hack(self):
+    return True
+
+  def clear_resource_control_flow_state(self):
+    self._last_op_using_resource_tensor = {}
 
   def create_op(
       self,
@@ -137,12 +162,31 @@ class CapturingGraph(ops.Graph):
       op_def=None,
       compute_shapes=True,
       compute_device=True):
+    # TODO(apassos) probably control flow has to be handled delicately here as
+    # in if a resource is accessed inside a control flow context we need the
+    # control dependency to point to something outside the context which is
+    # guaranteed to happen after the access.
+    #
+    # TODO(apassos) this should do some form of alias analysis as ops which
+    # forward the resources such as Identity and Switch can cause serialization
+    # to fail.
+    resource_inputs = set()
+    control_inputs = set()
     for i, inp in enumerate(inputs):
       if inp.graph is not self:
         inputs[i] = capture_value(self.captures, inp, inp.dtype, inp.op.name)
-    return super(CapturingGraph, self).create_op(
-        op_type, inputs, dtypes, input_types, name, attrs, op_def,
-        compute_shapes, compute_device)
+      inp = inputs[i]
+      if inp.dtype == dtypes_module.resource:
+        if inp.name in self._last_op_using_resource_tensor:
+          control_inputs.add(self._last_op_using_resource_tensor[inp.name])
+        resource_inputs.add(inp.name)
+    with self.control_dependencies(list(control_inputs)):
+      op = super(CapturingGraph, self).create_op(
+          op_type, inputs, dtypes, input_types, name, attrs, op_def,
+          compute_shapes, compute_device)
+    for name in resource_inputs:
+      self._last_op_using_resource_tensor[name] = op
+    return op
 
 
 # TODO(apassos): it'd be really nice if we could scope this registration.
@@ -196,14 +240,52 @@ def _inference_name(n):
   return "__inference_%s_%s" % (n, ops.uid())
 
 
-class _DefinedFunction(object):
-  """Mocks the interface of tf _DefinedFunction."""
+# TODO(apassos) get rid of this by splitting framework.function._DefinedFunction
+# so it doesn't have the definition-generating logic and is just a container for
+# an already-defined function.
+class _EagerDefinedFunction(object):
+  """Function object with the interface of tf _DefinedFunction."""
 
-  def __init__(self, fdef):
-    self.definition = fdef
-    self.name = fdef.signature.name
+  def __init__(self, name, graph, operations, inputs, outputs):
+    """Initializes an eager defined function.
+
+    Args:
+      name: str, the name for the created function.
+      graph: Graph, the graph containing the operations in the function
+      operations: list of Operation; the subset of operations in the graph
+        which will be in the function
+      inputs: the tensors in the graph to be used as inputs to the function
+      outputs: the tensors in the graph which will be outputs to the function
+    """
+    with errors.raise_exception_on_not_ok_status() as status:
+      fn = pywrap_tensorflow.TF_GraphToFunction_wrapper(
+          graph._c_graph,  # pylint: disable=protected-access
+          compat.as_str(name),
+          False,
+          [o._c_op for o in operations],  # pylint: disable=protected-access
+          [t._as_tf_output() for t in inputs],  # pylint: disable=protected-access
+          [t._as_tf_output() for t in outputs],  # pylint: disable=protected-access
+          [],
+          None,
+          compat.as_str(""),
+          status)
+    # TODO(apassos) avoid creating a FunctionDef (specially to grab the
+    # signature, but also in general it's nice not to depend on it.
+    with c_api_util.tf_buffer() as buffer_:
+      with errors.raise_exception_on_not_ok_status() as status:
+        pywrap_tensorflow.TF_FunctionToFunctionDef(fn, buffer_, status)
+      proto_data = pywrap_tensorflow.TF_GetBuffer(buffer_)
+    function_def = function_pb2.FunctionDef()
+    function_def.ParseFromString(compat.as_bytes(proto_data))
+    if context.in_eager_mode():
+      _register(fn)
+    self.definition = function_def
+    self.name = function_def.signature.name
+    self.signature = function_def.signature
     self.grad_func_name = None
     self.python_grad_func = None
+    self._c_func = fn
+    self._grad_func = None
 
 
 def _map_sequence_obj_to_idx(sequence):
@@ -211,54 +293,71 @@ def _map_sequence_obj_to_idx(sequence):
   return {id(x): i for i, x in enumerate(sequence)}
 
 
+def _flatten(sequence):
+  """A wrapper around `nest.flatten` that also unpacks `IndexedSlices`."""
+  # TODO(akshayka): Support `SparseTensor` in a similar fashion.
+  flat_sequence = nest.flatten(sequence)
+  outputs = []
+  for item in flat_sequence:
+    if isinstance(item, ops.IndexedSlices):
+      if item.dense_shape is not None:
+        outputs.extend([item.values, item.indices, item.dense_shape])
+      else:
+        outputs.extend([item.values, item.indices])
+    else:
+      outputs.append(item)
+  return outputs
+
+
 class GraphModeFunction(object):
   """Callable object representing a graph-mode function.
 
   Args:
-    input_placeholders: list of placeholder values to feed when calling
-      the wrapped function.
+    name: str the name of the created function
+    input_placeholders: list of placeholder values (tensors) to feed when
+      calling the wrapped function.
     extra_inputs: Tensor inputs this function definition closed over which
       are passed as arguments. Need to track so gradients are supported
       correctly.
-    fdef: the function definition we want to call.
-    graph: the graph from which the fdef operations were pulled. Used as
+    graph: the Graph from which the operations will be pulled. Used as
       a context when computing gradients.
-    operations: the subset of operations in the graph used in the function
+    operations: the subset of Operations in the graph used in the function
       definition.
-    func_outputs: the python outputs of the graph-mode function, with
-      tensorflow.Tensor objects to be replaced by tfe values when called.
-    func_outputs_to_fdef_outputs: Maps id(obj) in func_outputs to index of
-      fdef's outputs. It allows mapping fdef output tensors to nested
-      func_outputs structure.
-    output_shapes: List of shapes of all tensors which are output by the
-      internal function.
+    outputs: a flat list of the Tensors in the graph used as outputs to the
+      function
+    func_outputs: a possibly nested python object which will be returned by
+      this function. The Tensors in this structure will be replaced by their
+      corresponding values in outputs.
+    output_shapes: List of shapes of all tensors in outputs
     variables: (optional) List of variables to watch during function execution.
   """
 
   def __init__(self,
+               name,
                input_placeholders,
                extra_inputs,
-               fdef,
                graph,
                operations,
+               outputs,
                func_outputs,
-               func_outputs_to_fdef_outputs,
                output_shapes,
                variables=None):
-    assert len(input_placeholders) == len(fdef.signature.input_arg), "%s %s" % (
-        len(input_placeholders), len(fdef.signature.input_arg))
+    defined_function = _EagerDefinedFunction(
+        name, graph, operations, input_placeholders, outputs)
+    if len(input_placeholders) != len(defined_function.signature.input_arg):
+      raise ValueError("Internal error: invalid lengths. %s %s" % (
+          len(input_placeholders), len(defined_function.signature.input_arg)))
     self._input_placeholders = input_placeholders
     self._extra_inputs = list(extra_inputs)
     self._graph = graph
-    self._has_backprop = False
-    self._func_name = fdef.signature.name
-    self._fdef = _DefinedFunction(fdef)
-    self._num_outputs = len(fdef.signature.output_arg)
+    self._backward_function = None
+    self._func_name = name
+    self._function_def = defined_function
+    self._num_outputs = len(defined_function.signature.output_arg)
     self._ops = operations
     self._func_outputs = func_outputs
     self._returns = [func_outputs] if isinstance(
-        func_outputs, (ops.Tensor, type(None))) else list(func_outputs)
-    self._returns_to_fedf_outputs = func_outputs_to_fdef_outputs
+        func_outputs, (ops.Tensor, type(None))) else _flatten(func_outputs)
     self._output_shapes = output_shapes
     self._variables = variables if variables is not None else []
 
@@ -266,55 +365,55 @@ class GraphModeFunction(object):
   def variables(self):
     return self._variables
 
-  def _compute_backprop(self):
-    """Computes the backprop function object for this function."""
-    self._has_backprop = True
+  def _construct_backprop_function(self):
+    """Constructs the backprop function object for this function."""
     with self._graph.as_default(), context.graph_mode():
       c = _CapturingContext()
       with c:
-        filtered_outputs = [
-            x for x in self._returns if x is not None
-        ]
+        filtered_outputs = [x for x in self._returns if x is not None]
         self._out_grad_placeholders = [
-            graph_placeholder(x.dtype, x.shape) for x in filtered_outputs
-        ]
+            graph_placeholder(x.dtype, x.shape) for x in filtered_outputs]
         in_gradients = gradients_impl.gradients(
             filtered_outputs,
             self._input_placeholders,
             grad_ys=self._out_grad_placeholders)
-        shapes = [x.shape for x in in_gradients if x is not None]
+
+    backward_outputs = tuple(
+        grad for grad in _flatten(in_gradients) if grad is not None)
+    output_shapes = tuple(grad.shape for grad in backward_outputs)
+
     captures = list(sorted(c.captured_tensors, key=lambda x: x.name))
-    forward_function_def = make_function_def(
-        self._graph, self._ops, self._input_placeholders,
+    forward_name = _forward_name(self._func_name)
+    self._forward_fdef = _EagerDefinedFunction(
+        forward_name, self._graph, self._ops, self._input_placeholders,
         filtered_outputs + captures)
-    self._forward_fdef = _DefinedFunction(forward_function_def)
-    _register_with_name(_forward_name(self._func_name), forward_function_def)
-    backward_outputs = [x for x in in_gradients if x is not None]
     all_inputs = self._out_grad_placeholders + captures
-    backward_function_def = make_function_def(
-        self._graph, [x.op for x in self._out_grad_placeholders
-                     ] + list(sorted(c.known_ops, key=lambda x: x.name)),
-        all_inputs, backward_outputs)
-    _register_with_name(_backward_name(self._func_name), backward_function_def)
+    # Excluding input ops from the body as we do not intend to execute these
+    # operations when the function is executed.
+    all_ignored_ops = frozenset(x.op for x in all_inputs)
+    # Enforce a deterministic order of operations in the generated graph. This
+    # means rerunning the function-defining code will always define the same
+    # function, which is useful if we serialize this etc.
+    function_def_ops = tuple(x
+                             for x in sorted(c.known_ops, key=lambda x: x.name)
+                             if x not in all_ignored_ops)
+    bname = _backward_name(self._func_name)
     self._backward_function = GraphModeFunction(
-        all_inputs, [], backward_function_def, self._graph, c.known_ops,
-        in_gradients, _map_sequence_obj_to_idx(backward_outputs), shapes)
+        bname, all_inputs, [], self._graph, function_def_ops,
+        backward_outputs, in_gradients, output_shapes)
 
   def _backprop_call(self, args):
     """Calls the wrapped function and records the result on a tape."""
     all_args = args + self._extra_inputs
-    signature = self._forward_fdef.definition.signature
+    signature = self._forward_fdef.signature
     ctx = context.context()
     if ctx.in_graph_mode():
       g = ops.get_default_graph()
       g._add_function(self._forward_fdef)  # pylint: disable=protected-access
-      def make_tensor(x):
-        if isinstance(x, ops.Tensor):
-          return x
-        return ops.internal_convert_to_tensor(x, ctx=ctx)
       op = g.create_op(
-          signature.name, [make_tensor(x) for x in all_args],
-          [dtypes.DType(x.type) for x in signature.output_arg],
+          signature.name,
+          [ops.internal_convert_to_tensor(x, ctx=ctx) for x in all_args],
+          tuple(dtypes_module.DType(x.type) for x in signature.output_arg),
           op_def=signature,
           name="FunctionCall",
           compute_shapes=False)
@@ -334,7 +433,7 @@ class GraphModeFunction(object):
     side_outputs = outputs[len(self._returns):]
 
     def backward_function(*args):
-      return self._backward_function(*(list(args) + side_outputs))
+      return self._backward_function(*(list(args) + side_outputs))  # pylint: disable=not-callable
 
     tape.record_operation(
         signature.name,
@@ -344,36 +443,71 @@ class GraphModeFunction(object):
 
     return self._build_call_outputs(real_outputs)
 
+  @property
+  def output_shapes(self):
+    """The function's output shapes."""
+    # TODO(ebrevdo): Should we only keep the output shapes associated
+    # with len(self._returns) outputs?
+    outputs_list = nest.flatten(self._func_outputs)
+    j = 0
+    for i, o in enumerate(outputs_list):
+      if o is not None:
+        if isinstance(o, ops.IndexedSlices):
+          # Extract the shape of the `IndexedSlices` object's `values` field.
+          outputs_list[i] = self._output_shapes[j]  # the `values` shape
+          if o.dense_shape is not None:
+            j += 3  # skip over shapes for `values`, `indices`, `dense_shape`
+          else:
+            j += 2  # skip over shapes for `values`, `indices`
+        else:
+          outputs_list[i] = self._output_shapes[j]
+          j += 1
+    return nest.pack_sequence_as(self._func_outputs, outputs_list)
+
+  @property
+  def output_dtypes(self):
+    return nest.map_structure(
+        lambda x: x.dtype if x is not None else None, self._func_outputs)
+
+  @property
+  def captured_inputs(self):
+    return self._extra_inputs
+
+  @property
+  def name(self):
+    """Returns the name of the function in Eager-compatible format."""
+    return self._function_def.name.encode("utf-8")
+
+  def add_to_graph(self, g):
+    if self._function_def.name not in g._functions:  # pylint: disable=protected-access
+      g._add_function(self._function_def)  # pylint: disable=protected-access
+    for f in self._graph._functions.values():  # pylint: disable=protected-access
+      if f.name not in g._functions:  # pylint: disable=protected-access
+        g._add_function(f)  # pylint: disable=protected-access
+
   def __call__(self, *args):
     """Executes the passed function in eager mode."""
     for v in self._variables:
       if v._trainable:  # pylint: disable=protected-access
         tape.watch_variable(v)
 
-    tensor_inputs = [
-        x for x in nest.flatten(args)
-        if isinstance(x, ops.Tensor)
-    ]
-
+    tensor_inputs = [x for x in nest.flatten(args) if isinstance(x, ops.Tensor)]
     if tape.should_record(tensor_inputs) or tape.should_record(
         self._extra_inputs):
-      if not self._has_backprop:
-        self._compute_backprop()
+      if self._backward_function is None:
+        self._construct_backprop_function()
       return self._backprop_call(tensor_inputs)
 
     ctx = context.context()
     if ctx.in_graph_mode():
       g = ops.get_default_graph()
-      if self._fdef.name not in g._functions:  # pylint: disable=protected-access
-        g._add_function(self._fdef)  # pylint: disable=protected-access
-      for f in self._graph._functions.values():  # pylint: disable=protected-access
-        if f.name not in g._functions:  # pylint: disable=protected-access
-          g._add_function(f)  # pylint: disable=protected-access
-      signature = self._fdef.definition.signature
+      self.add_to_graph(g)
+      signature = self._function_def.definition.signature
       args = list(tensor_inputs) + self._extra_inputs
       op = g.create_op(
-          signature.name, [ops.convert_to_tensor(x) for x in args],
-          [dtypes.DType(x.type) for x in signature.output_arg],
+          signature.name,
+          [ops.internal_convert_to_tensor(x, ctx=ctx) for x in args],
+          tuple(dtypes_module.DType(x.type) for x in signature.output_arg),
           op_def=signature,
           name="FunctionCall",
           compute_shapes=False)
@@ -402,39 +536,53 @@ class GraphModeFunction(object):
     """
     if self._func_outputs is None:
       return None
+    # Use `nest.flatten` instead of `_flatten` in order to preserve any
+    # IndexedSlices in `self._func_outputs`.
     outputs_list = nest.flatten(self._func_outputs)
     j = 0
     for i, o in enumerate(outputs_list):
       if o is not None:
-        outputs_list[i] = result[j]
-        j += 1
-    return nest.pack_sequence_as(self._func_outputs, outputs_list)
+        if isinstance(o, ops.IndexedSlices):
+          # Repack Tensors for IndexedSlices.
+          if o.dense_shape is not None:
+            outputs_list[i] = ops.IndexedSlices(
+                values=result[j],
+                indices=result[j + 1],
+                dense_shape=result[j + 2])
+            j += 3
+          else:
+            outputs_list[i] = ops.IndexedSlices(
+                values=result[j],
+                indices=result[j + 1])
+            j += 2
+        else:
+          outputs_list[i] = result[j]
+          j += 1
+    ret = nest.pack_sequence_as(self._func_outputs, outputs_list)
+    return ret
 
 
 def _get_defun_inputs(args):
   """Maps the inputs args to graph inputs."""
   ret = []
-  for a in args:
+  flat_args = nest.flatten(args)
+  for a in flat_args:
     if isinstance(a, ops.Tensor):
       ret.append(graph_placeholder(a.dtype, a.shape))
-    elif type(a) in (tuple, list):
-      ret.append(_get_defun_inputs(a))
     else:
       ret.append(a)
-  return tuple(ret) if type(args) is tuple else ret
+  return nest.pack_sequence_as(args, ret)
 
 
 def _defun_internal(name, func, args, kwds):
   """Defines and returns graph-mode version of func."""
-  container_prefix = ops.get_default_graph()._container_prefix  # pylint: disable=protected-access
+  graph_key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
   with context.graph_mode():
     captures = {}
     tmp_graph = CapturingGraph(captures)
-    # Inherit the container prefix, since this is used for error checking when
-    # isolating eager execution (the container prefix at creation must match the
-    # container prefix when used, and variables accessed in the defun will be
-    # used in the outside context).
-    tmp_graph._container_prefix = container_prefix  # pylint: disable=protected-access
+    # Inherit the graph key, since this is used for matching variables in
+    # optimizers.
+    tmp_graph._graph_key = graph_key  # pylint: disable=protected-access
     # Copy the graph collections to ensure summaries and other things work. This
     # lets the function access (but not mutate) collections of the containing
     # graph, such as the global step and the summary writer collections.
@@ -445,46 +593,53 @@ def _defun_internal(name, func, args, kwds):
     with tmp_graph.as_default():
       func_inputs = _get_defun_inputs(args)
 
+      def convert(x):
+        if x is None:
+          return None
+        return ops.convert_to_tensor_or_indexed_slices(x)
+
       with capture_tensors(captures):
-        tape.push_new_tape()
+        this_tape = tape.push_new_tape()
         try:
           func_outputs = func(*func_inputs, **kwds)
+          func_outputs = nest.map_structure(convert, func_outputs)
         finally:
-          variables = tape.pop_tape().watched_variables()
+          tape.pop_tape(this_tape)
+        variables = this_tape.watched_variables()
+
+        # Returning a closed-over tensor as an output does not trigger a
+        # call to convert_to_tensor, so we manually capture all such tensors.
+        outputs_list = _flatten(func_outputs)
+        func_def_outputs = [
+            _convert_to_graph_tensor(x) for x in outputs_list if x is not None
+        ]
+
       ids = list(sorted(captures.keys()))
       if ids:
         extra_inputs, extra_placeholders = zip(* [captures[x] for x in ids])
       else:
         extra_inputs = []
         extra_placeholders = []
-      outputs_list = nest.flatten(func_outputs)
-      output_shapes = [x.shape for x in outputs_list if x is not None]
+      output_shapes = tuple(
+          x.shape if isinstance(x, ops.Tensor) else None
+          for x in outputs_list)
 
-  flat_inputs = [
-      x for x in nest.flatten(func_inputs) if isinstance(x, ops.Tensor)
-  ]
+  flat_inputs = [x for x in nest.flatten(func_inputs)
+                 if isinstance(x, ops.Tensor)]
   all_inputs = flat_inputs + list(extra_placeholders)
-
-  func_def_outputs = [x for x in outputs_list if x is not None]
-  inference_function_def = make_function_def(
-      tmp_graph, tmp_graph.get_operations(), all_inputs, func_def_outputs)
+  all_ignored_ops = frozenset(x.op for x in all_inputs)
+  fname = _inference_name(name)
+  operations = tuple(x for x in tmp_graph.get_operations()
+                     if x not in all_ignored_ops)
   # Register any other functions defined in the graph
   # TODO(ashankar): Oh lord, forgive me for this lint travesty.
-  for f in tmp_graph._functions.values():  # pylint: disable=protected-access
-    # TODO(ashankar): What about the gradient registry?
-    _register_with_name(f.name, f.definition)
-  _register_with_name(_inference_name(name), inference_function_def)
-
+  if context.in_eager_mode():
+    for f in tmp_graph._functions.values():  # pylint: disable=protected-access
+      # TODO(ashankar): What about the gradient registry?
+      _register(f._c_func)  # pylint: disable=protected-access
   return GraphModeFunction(
-      all_inputs,
-      extra_inputs,
-      inference_function_def,
-      tmp_graph,
-      tmp_graph.get_operations(),
-      func_outputs,
-      _map_sequence_obj_to_idx(func_def_outputs),
-      output_shapes,
-      variables=variables)
+      fname, all_inputs, extra_inputs, tmp_graph, operations, func_def_outputs,
+      func_outputs, output_shapes, variables)
 
 
 # Defun uses this instead of Tensor as a cache key. Using dtype because
@@ -499,17 +654,30 @@ def _cache_key(x):
   """Cache key for tfe functions."""
   if isinstance(x, ops.Tensor):
     return _TensorDtype(x.dtype, x._shape_tuple())  # pylint: disable=protected-access
+  if isinstance(x, ops.IndexedSlices):
+    if x.dense_shape is not None:
+      return tuple([
+          _TensorDtype(x.values.dtype, x.values._shape_tuple()),  # pylint: disable=protected-access
+          _TensorDtype(x.indices.dtype, x.indices._shape_tuple()),  # pylint: disable=protected-access
+          _TensorDtype(x.dense_shape.dtype, x.dense_shape._shape_tuple())  # pylint: disable=protected-access
+      ])
+    else:
+      return tuple([
+          _TensorDtype(x.values.dtype, x.values._shape_tuple()),  # pylint: disable=protected-access
+          _TensorDtype(x.indices.dtype, x.indices._shape_tuple())  # pylint: disable=protected-access
+      ])
   if isinstance(x, np.ndarray):
     return ("array", x.shape, tuple(x.reshape(-1)))
-  if type(x) in (list, tuple):
+  if isinstance(x, (list, tuple)):
     return tuple([_cache_key(a) for a in x])
+  if isinstance(x, dict):
+    return tuple(tuple([_cache_key(k), _cache_key(v)]) for k, v in x.items())
   return x
 
 
-def _register_with_name(name, fdef):
-  """Registers the function `fdef` with the name `name`."""
-  fdef.signature.name = name
-  context.context().add_function_def(fdef)
+def _register(fn):
+  """Registers the function `fn`."""
+  context.context().add_function(fn)
 
 
 # TODO(apassos): better error messages for non-hashable arguments.
@@ -532,7 +700,8 @@ def named_defun(func, name):
     """Decorated version of func."""
     # Macroexpand on non-Tensor arguments
     cache_key = tuple(_cache_key(x) for x in args)
-    assert all(not isinstance(x, ops.EagerTensor) for x in kwds.values())
+    if any(isinstance(x, ops.EagerTensor) for x in kwds.values()):
+      raise ValueError("Tensor keyword arguments are not supported.")
     cache_key = (cache_key, tuple(kwds.items()))
 
     if cache_key not in arguments_to_functions:
@@ -594,4 +763,265 @@ def defun(func):
      or more Tensor objects).
   """
   # TODO(apassos): deal with captured global state. Deal with control flow.
-  return tf_decorator.make_decorator(func, named_defun(func, func.__name__))
+  try:
+    name = func.__name__
+  except AttributeError:
+    name = "function"
+  return tf_decorator.make_decorator(func, named_defun(func, name))
+
+
+def make_defun_op(func, *args, **kwds):
+  """Compile func into graph_mode, assuming func arguments are *args, **kwargs.
+
+  `make_defun_op` converts a function that constructs a TensorFlow graph into
+  a function object and attaches it to the graph.  The resulting function
+  object can be queried for its properties, and called directly with different
+  inputs to execute.
+
+  More details on use cases and limitations are available in the
+  documentation for `defun`.
+
+  Example:
+  ```python
+  def f(x, y):
+    return tf.reduce_mean(tf.multiply(x ** 2, 3) + y)
+
+  def g(x, y):
+    return tf.reduce_mean(tf.multiply(x ** 2, 3) + y)
+
+  z = tf.constant([[0.0, 0.0]])
+  g_op = make_defun_op(g, z, z)
+
+  assert g_op.output_shapes == tf.TensorShape([])
+  assert g_op.output_types == tf.float32
+
+  x = tf.constant([[2.0, 3.0]])
+  y = tf.constant([[3.0, -2.0]])
+
+  # The plain function and defun-compiled function should return the same value.
+  assert f(x, y).numpy() == g_op(x, y).numpy()
+  ```
+
+  Args:
+    func: function to be compiled.
+    *args: List arguments to pass to `func` when attaching to the graph.
+    **kwds: Keyword arguments to pass to `func` when attaching to the graph.
+
+  Returns:
+     A wrapper object which can be queried for its output properties,
+     and which can be called directly the way a `@defun` wrapped function
+     can.
+
+  Raises:
+    ValueError: if any of the keyword arguments to `func` are `EagerTensor`
+      objects (not yet supported).
+  """
+  name = func.__name__
+  if any(isinstance(x, ops.EagerTensor) for x in kwds.values()):
+    raise ValueError("Tensor keyword arguments are not supported.")
+  return _defun_internal(name, func, args, kwds)
+
+
+class AutomaticControlDependencies(object):
+  """Context manager to automatically add control dependencies.
+
+  Code under this context manager will act as if a sensible set of control
+  dependencies were present. More specifically:
+    1. All stateful ops in the scope will execute
+    2. Stateful ops which modify the same resource will execute in program order
+
+  Note: creating variables in an automatic control dependencies context is not
+  supported (the value of the variables will never change as they will keep
+  getting reinitialized).
+
+  NOT THREAD SAFE
+  """
+
+  def __init__(self):
+    self._returned_tensors = set()
+
+  def mark_as_return(self, tensor):
+    self._returned_tensors.add(tensor)
+
+  def __enter__(self):
+    if context.in_eager_mode():
+      return self
+    # This code assumes no other thread is adding ops to the graph while
+    # we're adding ops to the graph.
+    # TODO(apassos): Fix this by locking the graph or using a temporary
+    # graph (but that would mess up devices and collections at least,
+    # probably other things as well).
+    self._graph = ops.get_default_graph()
+    self._n_operations = len(self._graph.get_operations())
+    return self
+
+  def _process_switch(self, switch_op, ops_which_must_run,
+                      last_op_using_resource_tensor, merge_for_resource):
+    """Processes a switch node for a resource input.
+
+    When tensorflow creates a cond, it creates a control flow context for each
+    branch of the cond. Each external tensor accessed by that branch is routed
+    through a switch op, which gets created in the graph _after_ the op which
+    uses that tensor get created.
+
+    If the resource comes from another switch op we process that one first.
+
+    _process_switch creates a corresponding merge node for the switch node. This
+    merge node is added to the outer control flow context of the switch
+    node. We also ensure that:
+
+      1. The switch node executes after the previous op which used the resource
+         tensor
+
+      2. Any op which uses a resource output of the switch node executes before
+         the merge for the switch node.
+
+      3. The next op which uses the input resource to the switch node (which
+         might be another switch node for the other branch of the conditional)
+         will execute after the merge node is done.
+
+      4. The merge node is marked as must_run so it will run even if no
+         subsequent operation uses the resource.
+
+    Args:
+      switch_op: the switch op to be processed
+      ops_which_must_run: the set of ops which must run
+      last_op_using_resource_tensor: map from resource tensor to last op using
+        it
+      merge_for_resource: map from resource tensor to merge which must follow
+        all usages of it.
+    """
+    inp = switch_op.inputs[0]
+    if inp.dtype == dtypes_module.resource and inp.op.type == "Switch":
+      self._process_switch(inp.op, ops_which_must_run,
+                           last_op_using_resource_tensor, merge_for_resource)
+    if switch_op.outputs[0] in merge_for_resource:
+      return
+    new_merge = control_flow_ops.merge(switch_op.outputs,
+                                       name="artificial_merge")
+    new_merge[0].op._control_flow_context = (  # pylint: disable=protected-access
+        switch_op._control_flow_context.outer_context)  # pylint: disable=protected-access
+    # Ensures the merge always runs
+    ops_which_must_run.add(new_merge[0].op)
+    if inp in last_op_using_resource_tensor:
+      # Ensures the switch exectutes after the previous op using the resource.
+      switch_op._add_control_input(last_op_using_resource_tensor[inp])  # pylint: disable=protected-access
+    # Ensure the next op outside the cond happens after the merge.
+    last_op_using_resource_tensor[inp] = new_merge[0].op
+    if inp in merge_for_resource:
+      merge_for_resource[inp]._add_control_input(new_merge[0].op)  # pylint: disable=protected-access
+    for o in switch_op.outputs:
+      # Ensures the merge will execute after all ops inside the cond
+      merge_for_resource[o] = new_merge[0].op
+
+  def __exit__(self, unused_type, unused_value, unused_traceback):
+    if context.in_eager_mode():
+      return
+
+    if self._graph is not ops.get_default_graph():
+      raise RuntimeError(
+          "Graph changed while trying to add control dependencies.")
+
+    # map from resource tensor to the last op which used it
+    last_op_using_resource_tensor = {}
+    # set of conditional and loop exits
+    ops_which_must_run = set()
+    # merge which must depend on ops which use this resource
+    merge_for_resource = {}
+
+    new_operations = self._graph.get_operations()[self._n_operations:]
+
+    # Ensures that uses of resource tensors get serialized properly and all
+    # execute. This is done by keeping a map from resource tensor to the last op
+    # in graph-construction order which used it (last_op_using_resource_tensor).
+    #
+    # Conditionals are written in TensorFlow such that every external tensor
+    # accessed in the conditional goes through a switch op and every return
+    # tensor (it's guaranteed that there will be at least one) goes through a
+    # merge op.
+    #
+    # To handle conditionals, switches are handled in a special way (see
+    # comments for _process_switch). Merge nodes created by TF's conditional
+    # logic (as opposed to by _process_switch) are forced to run and also get a
+    # control dependency added to them to ensure all stateful ops inside their
+    # control flow context run.
+    #
+    # We also ensure that if an op is using a resource output by a switch node
+    # (that is, a resource tensor for which there's a value in
+    # merge_for_resource) this op will run before the merge for that resource.
+    #
+    # We try to add control inputs to nodes respecting their control flow
+    # contexts to avoid dead nodes propagating everywhere and leading to
+    # "retval[0] doesn't have value" errors. If a node gets a control dependency
+    # on a dead node (i.e. a note from an untaken control flow branch) that node
+    # will be marked as dead unless it's a merge node.
+    #
+    # TODO(apassos): serialize non-resource-taking stateful ops as well, and
+    # test that it works. Support while loops. Support init_scope escaping from
+    # this.
+    for op in new_operations:
+      control_inputs = set()
+      # Ensure stateful ops run
+      if self._graph._registered_ops[op.type].is_stateful:  # pylint: disable=protected-access
+        ops_which_must_run.add(op)
+      # Ignore switches (they're handled separately)
+      if op.type == "Switch" and op.inputs[0].dtype == dtypes_module.resource:
+        continue
+      # Make merges trigger all other computation which must run
+      if op.type == "Merge":
+        for o in ops_which_must_run:
+          op._add_control_input(o)  # pylint: disable=protected-access
+          for inp in o.inputs:
+            if inp in last_op_using_resource_tensor:
+              last_op_using_resource_tensor[inp] = op
+        ops_which_must_run = set([op])
+        continue
+      for inp in op.inputs:
+        if inp.dtype == dtypes_module.resource:
+          # Deal with switches, finally.
+          if inp.op.type == "Switch":
+            self._process_switch(inp.op, ops_which_must_run,
+                                 last_op_using_resource_tensor,
+                                 merge_for_resource)
+          # Ensure uses of resources are serialized
+          if inp in last_op_using_resource_tensor:
+            if (last_op_using_resource_tensor[inp]._control_flow_context  # pylint: disable=protected-access
+                is op._control_flow_context):  # pylint: disable=protected-access
+              control_inputs.add(last_op_using_resource_tensor[inp])
+          # Ensure merges happen after the closing of a cond block
+          if inp in merge_for_resource:
+            merge_for_resource[inp]._add_control_input(op)  # pylint: disable=protected-access
+          last_op_using_resource_tensor[inp] = op
+      control_inputs = [c for c in control_inputs
+                        if c._control_flow_context is op._control_flow_context]  # pylint: disable=protected-access
+      op._add_control_inputs(control_inputs)  # pylint: disable=protected-access
+
+    # Ensure all ops which must run do run
+    for r in self._returned_tensors:
+      r.op._add_control_inputs(  # pylint: disable=protected-access
+          [o for o in ops_which_must_run
+           if o._control_flow_context is r.op._control_flow_context])  # pylint: disable=protected-access
+
+
+def automatic_control_dependencies(f):
+  """Wraps f to automatically insert control dependencies.
+
+  The inserted dependencies ensure that:
+    1. All stateful ops in f run when the result of f runs
+    2. Updates to the same resources happen in order.
+
+  Args:
+    f: the function to be wrapped.
+
+  Returns:
+    The wrapped function.
+  """
+
+  def wrapper(*args, **kwds):
+    with AutomaticControlDependencies() as a:
+      result = f(*args, **kwds)
+      for t in nest.flatten(result):
+        a.mark_as_return(t)
+      return result
+
+  return tf_decorator.make_decorator(f, wrapper)
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index c55f2f1d5957cabfaf3bae617d88dca55f7b8e4b..431d9388c0ee97eda197142ec97b9448d985b04b 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -17,6 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
@@ -27,8 +29,10 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import function as tf_function
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
@@ -56,6 +60,20 @@ class FunctionTest(test.TestCase):
     out = sq(t)
     self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
 
+  def testNestedInputsGraphMode(self):
+    matmul = function.defun(math_ops.matmul)
+
+    pair = collections.namedtuple('pair', ['a', 'b'])
+
+    @function.defun
+    def a_times_b(inputs):
+      return matmul(inputs.a['a'], inputs.b['b'])
+
+    t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
+
+    out = a_times_b(pair({'a': t}, {'b': t}))
+    self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
+
   def testGraphModeWithGradients(self):
     v = resource_variable_ops.ResourceVariable(1.0, name='v')
 
@@ -68,6 +86,81 @@ class FunctionTest(test.TestCase):
 
     self.assertAllEqual(step(), 2.0)
 
+  def testBasicDefunOpGraphMode(self):
+    matmul = function.defun(math_ops.matmul)
+
+    def sq(a):
+      return matmul(a, a)
+
+    t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
+
+    sq_op = function.make_defun_op(sq, t)
+
+    self.assertEqual(sq_op.output_shapes, tensor_shape.TensorShape([2, 2]))
+    out = sq_op(t)
+    self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
+
+  def testNestedInputsDefunOpGraphMode(self):
+    matmul = function.defun(math_ops.matmul)
+
+    pair = collections.namedtuple('pair', ['a', 'b'])
+    def a_times_b(inputs):
+      return matmul(inputs.a['a'], inputs.b['b'])
+
+    t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
+
+    inputs = pair({'a': t}, {'b': t})
+    sq_op = function.make_defun_op(a_times_b, inputs)
+
+    self.assertEqual(sq_op.output_shapes, tensor_shape.TensorShape([2, 2]))
+    out = sq_op(inputs)
+    self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
+
+  def testNestedOutputDefunOpGraphMode(self):
+    matmul = function.defun(math_ops.matmul)
+
+    def sq(a):
+      return (matmul(a, a), {'b': constant_op.constant(1.0)})
+
+    t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
+
+    sq_op = function.make_defun_op(sq, t)
+
+    self.assertEqual(sq_op.output_shapes,
+                     (tensor_shape.TensorShape([2, 2]),
+                      {'b': tensor_shape.TensorShape([])}))
+    self.assertEqual(sq_op.output_dtypes,
+                     (dtypes.float32, {'b': dtypes.float32}))
+    (a, b) = sq_op(t)
+    self.assertAllEqual(a, math_ops.matmul(t, t).numpy())
+    self.assertAllEqual(b['b'].numpy(), 1.0)
+
+  def testDefunOpGraphModeWithGradients(self):
+    v = resource_variable_ops.ResourceVariable(1.0, name='v')
+
+    def step():
+      def inner():
+        return v * v
+
+      return backprop.implicit_grad(inner)()[0][0]
+
+    step_op = function.make_defun_op(step)
+
+    self.assertEqual(step_op.output_dtypes, dtypes.float32)
+    self.assertEqual(step_op.output_shapes, tensor_shape.TensorShape([]))
+    self.assertAllEqual(step_op(), 2.0)
+
+  def testDefunOpGraphModeNoneOutput(self):
+    def fn(unused_a, unused_b):
+      return None
+
+    x = constant_op.constant(1)
+    fn_op = function.make_defun_op(fn, x, x)
+
+    self.assertEqual(fn_op.output_dtypes, None)
+    self.assertEqual(fn_op.output_shapes, None)
+    self.assertAllEqual(fn_op(x, x), None)
+
   def testDefunReadVariable(self):
     v = resource_variable_ops.ResourceVariable(1.0)
 
@@ -87,6 +180,42 @@ class FunctionTest(test.TestCase):
 
     self.assertEqual(3.0, float(f()))
 
+  def testDefunShapeInferenceWithCapturedResourceVariable(self):
+    v = resource_variable_ops.ResourceVariable([[1, 2], [3, 4]])
+
+    def f():
+      x = constant_op.constant([[1, 2], [3, 4]])
+      out = math_ops.matmul(v, x)
+      self.assertEqual(out.get_shape(), tensor_shape.TensorShape([2, 2]))
+
+    compiled = function.defun(f)
+    compiled()
+
+  def testDefunShapeInferenceWithCapturedResourceVariableInGraphMode(self):
+    with context.graph_mode():
+      v = resource_variable_ops.ResourceVariable([[1, 2], [3, 4]])
+
+      def f():
+        x = constant_op.constant([[1, 2], [3, 4]])
+        out = math_ops.matmul(v, x)
+        self.assertEqual(out.get_shape(), tensor_shape.TensorShape([2, 2]))
+
+      compiled = function.defun(f)
+      compiled()
+
+  def testDefunShapeInferenceWithCapturedVariableInGraphMode(self):
+    with context.graph_mode():
+      v = variables.Variable([[1, 2], [3, 4]])
+
+      def f():
+        x = constant_op.constant([[1, 2], [3, 4]])
+        out = math_ops.matmul(v, x)
+        self.assertEqual(out.get_shape(), tensor_shape.TensorShape([2, 2]))
+
+      # Check that shape inference works while creating the defun
+      compiled = function.defun(f)
+      compiled()
+
   def testDefunDifferentiable(self):
     v = resource_variable_ops.ResourceVariable(1.0)
 
@@ -246,6 +375,78 @@ class FunctionTest(test.TestCase):
 
     self.assertAllEqual(f(constant_op.constant(1.0)), 2.0)
 
+  def testGradientOfGatherWithDefun(self):
+
+    v = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0])
+
+    def sum_gather():
+      return math_ops.reduce_sum(array_ops.gather(v, [1, 2]))
+
+    grad_fn = backprop.implicit_grad(sum_gather)
+    gradient = grad_fn()
+    defun_grad_fn = backprop.implicit_grad(function.defun(sum_gather))
+    defun_gradient = defun_grad_fn()
+    self.assertEqual(len(gradient), len(defun_gradient))
+
+    gradient = gradient[0][0]
+    defun_gradient = defun_gradient[0][0]
+    self.assertAllEqual(gradient.values, defun_gradient.values)
+    self.assertAllEqual(gradient.indices, defun_gradient.indices)
+    self.assertAllEqual(gradient.dense_shape, defun_gradient.dense_shape)
+
+  def testReturningIndexedSlicesWithDefun(self):
+
+    def validate(indexed_slice):
+      def f():
+        return indexed_slice
+
+      output = function.defun(f)()
+      self.assertTrue(isinstance(output, ops.IndexedSlices))
+      self.assertAllEqual(indexed_slice.values, output.values)
+      self.assertAllEqual(indexed_slice.indices, output.indices)
+      self.assertAllEqual(indexed_slice.dense_shape, output.dense_shape)
+
+      self.assertEqual(
+          function.make_defun_op(f).output_shapes, indexed_slice.values.shape)
+
+    arg = ops.IndexedSlices(
+        values=constant_op.constant([1, 2]),
+        indices=constant_op.constant([0, 1]),
+        dense_shape=constant_op.constant([2]))
+    validate(arg)
+
+    arg = ops.IndexedSlices(
+        values=constant_op.constant([1, 2]),
+        indices=constant_op.constant([0, 1]),
+        dense_shape=None)
+    validate(arg)
+
+  def testIndexedSliceAsArgumentWithDefun(self):
+
+    @function.defun
+    def f(indexed_slice):
+      return indexed_slice
+
+    def validate(arg):
+      output = f(arg)
+      self.assertTrue(isinstance(output, ops.IndexedSlices))
+      self.assertAllEqual(arg.values, output.values)
+      self.assertAllEqual(arg.indices, output.indices)
+      self.assertAllEqual(arg.dense_shape, output.dense_shape)
+
+    indexed_slice = ops.IndexedSlices(
+        values=constant_op.constant([1]),
+        indices=constant_op.constant([0]),
+        dense_shape=constant_op.constant([1]))
+    validate(indexed_slice)
+
+    # Test that `f` works even when `dense_shape` is None.
+    indexed_slice = ops.IndexedSlices(
+        values=constant_op.constant([1]),
+        indices=constant_op.constant([0]),
+        dense_shape=None)
+    validate(indexed_slice)
+
   def testFunctionOnDevice(self):
     if not context.context().num_gpus():
       self.skipTest('No GPUs found')
@@ -272,10 +473,11 @@ class FunctionTest(test.TestCase):
 
     # The Reshape op requires the shape tensor to be placed in host memory.
     reshape = function.defun(array_ops.reshape)
-    value = constant_op.constant([1., 2.]).gpu()
+    value = constant_op.constant([1., 2.])
     shape = constant_op.constant([2, 1]).gpu()
     with self.assertRaises(errors.InvalidArgumentError):
-      reshape(value, shape)
+      with ops.device('gpu:0'):
+        reshape(value, shape)
 
   def testDifferentiableFunctionNoneOutputs(self):
 
@@ -310,6 +512,38 @@ class FunctionTest(test.TestCase):
 
     self.assertAllEqual(3, add_one(constant_op.constant(2)))
 
+  def testVariableCaptureInNestedFunctions(self):
+    v = resource_variable_ops.ResourceVariable(1)
+
+    @function.defun
+    def read():
+      return v.read_value()
+
+    @function.defun
+    def outer():
+      return read()
+
+    self.assertEqual(1, int(outer()))
+
+  def testReturnCapturedEagerTensor(self):
+    t = constant_op.constant(1)
+
+    @function.defun
+    def read():
+      return t
+
+    self.assertEqual(1, int(read()))
+
+  def testReturnCapturedGraphTensor(self):
+    with context.graph_mode(), self.test_session():
+      t = constant_op.constant(1)
+
+      @function.defun
+      def read():
+        return t
+
+      self.assertEqual(1, int(self.evaluate(read())))
+
   def testSequenceInputs(self):
     clip_by_global_norm = function.defun(clip_ops.clip_by_global_norm)
     t_list = [constant_op.constant(1.0), constant_op.constant(2.0)]
@@ -343,6 +577,191 @@ class FunctionTest(test.TestCase):
     self.assertAllEqual(ret[0][2], 10)
     self.assertAllEqual(ret[1], 15)
 
+  def testVariableNamesRespectNameScopesWithDefun(self):
+    @function.defun
+    def create_variable():
+      with ops.name_scope('foo'):
+        v = resource_variable_ops.ResourceVariable(0.0, name='bar')
+      self.assertEqual(v.name, 'foo/bar:0')
+    create_variable()
+
+  def testVariableNamesRespectNameScopesWithDefunInGraph(self):
+    with context.graph_mode():
+      @function.defun
+      def create_variable():
+        with ops.name_scope('foo'):
+          v = resource_variable_ops.ResourceVariable([1.0, 2.0], name='bar')
+        self.assertEqual(v.name, 'foo/bar:0')
+      with ops.get_default_graph().as_default():
+        create_variable()
+
+
+class AutomaticControlDependenciesTest(test.TestCase):
+
+  def testBasic(self):
+    with context.graph_mode(), self.test_session():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      variables.global_variables_initializer().run()
+      with function.AutomaticControlDependencies() as c:
+        v.assign(v + 1)
+        v.assign(2 * v)
+        val = v.read_value()
+        c.mark_as_return(val)
+      self.assertAllEqual(val.eval(), 4.0)
+
+  def testCondMustRun(self):
+    with context.graph_mode(), self.test_session():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      variables.global_variables_initializer().run()
+      p = array_ops.placeholder(dtype=dtypes.bool)
+      with function.AutomaticControlDependencies() as c:
+
+        def true_fn():
+          v.assign(v + 1)
+          return 0.0
+
+        def false_fn():
+          v.assign(v + 4)
+          return 1.0
+
+        control_flow_ops.cond(p, true_fn, false_fn)
+        val = v.read_value()
+        c.mark_as_return(val)
+      self.assertAllEqual(val.eval(feed_dict={p: False}), 5.0)
+      self.assertAllEqual(val.eval(feed_dict={p: True}), 6.0)
+
+  def testCondMustRunSeparateRead(self):
+    with context.graph_mode(), self.test_session():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      variables.global_variables_initializer().run()
+      p = array_ops.placeholder(dtype=dtypes.bool)
+      with function.AutomaticControlDependencies() as c:
+
+        def true_fn():
+          v.assign(v + 1)
+          return 0.0
+
+        def false_fn():
+          v.assign(v + 4)
+          return 1.0
+
+        control_flow_ops.cond(p, true_fn, false_fn)
+        one = constant_op.constant(1.0)
+        c.mark_as_return(one)
+      one.eval(feed_dict={p: False})
+      self.assertAllEqual(v.read_value().eval(), 5.0)
+      one.eval(feed_dict={p: True})
+      self.assertAllEqual(v.read_value().eval(), 6.0)
+
+  def testCondNested(self):
+    with context.graph_mode(), self.test_session():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      variables.global_variables_initializer().run()
+      p = array_ops.placeholder(dtype=dtypes.bool)
+      q = array_ops.placeholder(dtype=dtypes.bool)
+      with function.AutomaticControlDependencies() as c:
+
+        def true_fn():
+          v.assign(v + 1, name='true')
+          return 1.0
+
+        def false_fn():
+
+          def inner_true_fn():
+            v.assign(v * 2, name='false_true')
+            return 2.0
+
+          def inner_false_fn():
+            v.assign(v * 3, name='false_false')
+            return 3.0
+
+          control_flow_ops.cond(q, inner_true_fn, inner_false_fn)
+          return 1.0
+
+        control_flow_ops.cond(p, true_fn, false_fn)
+        with ops.name_scope('final'):
+          val = v.read_value()
+        c.mark_as_return(val)
+      self.assertAllEqual(val.eval(feed_dict={p: False, q: False}), 3.0)
+      self.assertAllEqual(val.eval(feed_dict={p: False, q: True}), 6.0)
+      self.assertAllEqual(val.eval(feed_dict={p: True, q: True}), 7.0)
+      self.assertAllEqual(val.eval(feed_dict={p: True, q: False}), 8.0)
+
+  def testCondOneBranch(self):
+    with context.graph_mode(), self.test_session():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      variables.global_variables_initializer().run()
+      p = array_ops.placeholder(dtype=dtypes.bool)
+      with function.AutomaticControlDependencies() as c:
+
+        def true_fn():
+          return 0.0
+
+        def false_fn():
+          v.assign(v + 4)
+          return 1.0
+
+        control_flow_ops.cond(p, true_fn, false_fn)
+        val = v.read_value()
+        c.mark_as_return(val)
+      self.assertAllEqual(val.eval(feed_dict={p: False}), 5.0)
+      self.assertAllEqual(val.eval(feed_dict={p: True}), 5.0)
+
+  def testCondOneBranchUpdateBefore(self):
+    with context.graph_mode(), self.test_session():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      variables.global_variables_initializer().run()
+      p = array_ops.placeholder(dtype=dtypes.bool)
+      with function.AutomaticControlDependencies() as c:
+        v.assign(v * 2)
+
+        def true_fn():
+          return 0.0
+
+        def false_fn():
+          v.assign(v + 4)
+          return 1.0
+
+        control_flow_ops.cond(p, true_fn, false_fn)
+        val = v.read_value()
+        c.mark_as_return(val)
+      self.assertAllEqual(val.eval(feed_dict={p: False}), 6.0)
+      self.assertAllEqual(val.eval(feed_dict={p: True}), 12.0)
+
+  def testCondOneBranchUpdateAfter(self):
+    with context.graph_mode(), self.test_session():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      variables.global_variables_initializer().run()
+      p = array_ops.placeholder(dtype=dtypes.bool)
+      with function.AutomaticControlDependencies() as c:
+
+        def true_fn():
+          return 0.0
+
+        def false_fn():
+          v.assign(v + 4)
+          return 1.0
+
+        control_flow_ops.cond(p, true_fn, false_fn)
+        v.assign(v * 2)
+        val = v.read_value()
+        c.mark_as_return(val)
+      self.assertAllEqual(val.eval(feed_dict={p: False}), 10.0)
+      self.assertAllEqual(val.eval(feed_dict={p: True}), 20.0)
+
+  def testDecorator(self):
+    with context.graph_mode(), self.test_session():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      variables.global_variables_initializer().run()
+
+      @function.automatic_control_dependencies
+      def f():
+        v.assign(v + 1)
+        v.assign(2 * v)
+        return v.read_value()
+
+      self.assertAllEqual(f().eval(), 4.0)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/eager/gen_op.bzl b/tensorflow/python/eager/gen_op.bzl
deleted file mode 100644
index 1c99d342befaf04112ac83aeecce2b122eb361c5..0000000000000000000000000000000000000000
--- a/tensorflow/python/eager/gen_op.bzl
+++ /dev/null
@@ -1,49 +0,0 @@
-"""For eager-mode Python."""
-
-load("//tensorflow:tensorflow.bzl",
-     "clean_dep",
-     "tf_binary_additional_srcs",
-     "tf_copts",
-     "tf_cc_binary")
-
-def tfe_gen_op_wrapper_py(name,
-                          out=None,
-                          visibility=None,
-                          deps=[],
-                          generated_target_name=None):
-  """Generate an eager-mode Python op wrapper for an op library."""
-  # Construct a cc_binary containing the specified ops.
-  tool_name = "gen_" + name + "_py_wrappers_cc"
-  if not deps:
-    deps = [str(Label("//tensorflow/core:" + name + "_op_lib"))]
-  tf_cc_binary(
-      name=tool_name,
-      linkopts=["-lm"],
-      copts=tf_copts(),
-      linkstatic=1,
-      deps=([
-          clean_dep("//tensorflow/python/eager:python_eager_op_gen_main")
-      ] + deps),
-      visibility=[clean_dep("//visibility:public")],)
-
-  # Invoke the previous cc_binary to generate a python file.
-  if not out:
-    out = "gen_" + name + ".py"
-
-  native.genrule(
-      name=name + "_pygenrule",
-      outs=[out],
-      tools=[tool_name] + tf_binary_additional_srcs(),
-      cmd=("$(location " + tool_name + ")  > $@"))
-
-  # Make a py_library out of the generated python file.
-  if not generated_target_name:
-    generated_target_name = name
-  native.py_library(
-      name=generated_target_name,
-      srcs=[out],
-      srcs_version="PY2AND3",
-      visibility=visibility,
-      deps=[
-          clean_dep("//tensorflow/python/eager:framework_for_generated_wrappers"),
-      ],)
diff --git a/tensorflow/python/eager/graph_callable.py b/tensorflow/python/eager/graph_callable.py
index 837a75c808f94d4561a0eb68c8e77700d0e413da..62106bf0e2809e3c056e4a357f3d05251b7dca68 100644
--- a/tensorflow/python/eager/graph_callable.py
+++ b/tensorflow/python/eager/graph_callable.py
@@ -252,21 +252,17 @@ def _graph_callable_internal(func, shape_and_dtypes):
     Callable graph object.
   """
   container = tf_ops.get_default_graph()._container  # pylint: disable=protected-access
-  container_prefix = tf_ops.get_default_graph()._container_prefix  # pylint: disable=protected-access
+  graph_key = tf_ops.get_default_graph()._graph_key  # pylint: disable=protected-access
   with context.graph_mode():
     # This graph will store both the initialization and the call version of the
     # wrapped function. It will later be used by the backprop code to build the
     # backprop graph, if necessary.
     captures = {}
     tmp_graph = function.CapturingGraph(captures)
-    # Inherit the container from the original graph to create resources at user
-    # expected containers. Also inherits the container prefix, since this is
-    # used for error checking when isolating Eager execution (the container
-    # prefix at creation must match the container prefix when used, and
-    # variables returned from the graph callable will be used in the outside
-    # context).
+    # Inherit the graph key from the original graph to ensure optimizers don't
+    # misbehave.
     tmp_graph._container = container  # pylint: disable=protected-access
-    tmp_graph._container_prefix = container_prefix  # pylint: disable=protected-access
+    tmp_graph._graph_key = graph_key  # pylint: disable=protected-access
     with tmp_graph.as_default():
       # Placeholders for the non-variable inputs.
       func_inputs = _get_graph_callable_inputs(shape_and_dtypes)
@@ -296,6 +292,7 @@ def _graph_callable_internal(func, shape_and_dtypes):
       # Call the function again, now replacing usages of variables with
       # placeholders. This assumes the variable capturing scope created above
       # knows about all variables.
+      tmp_graph.clear_resource_control_flow_state()
       with variable_captures.capturing_scope(), function.capture_tensors(
           captures):
         captured_outputs = func(*func_inputs)
@@ -317,46 +314,33 @@ def _graph_callable_internal(func, shape_and_dtypes):
   placeholder_inputs = flat_inputs+ list(extra_placeholders)
 
   func_def_outputs = [x for x in outputs_list if isinstance(x, tf_ops.Tensor)]
-  initializer_function_def = function.make_function_def(
-      tmp_graph,
-      initializing_operations,
-      placeholder_inputs,
-      func_def_outputs)
+  initialization_name = function._inference_name(func.__name__)  # pylint: disable=protected-access
   # TODO(ashankar): Oh lord, forgive me for this lint travesty.
   # Also, what about the gradient registry of these functions? Those need to be
   # addressed as well.
   for f in tmp_graph._functions.values():  # pylint: disable=protected-access
-    function._register_with_name(f.name, f.definition)  # pylint: disable=protected-access
-  function._register_with_name(function._inference_name(func.__name__),  # pylint: disable=protected-access
-                               initializer_function_def)
+    function._register(f._c_func)  # pylint: disable=protected-access
   initializer_function = function.GraphModeFunction(
+      initialization_name,
       placeholder_inputs,
       extra_inputs,
-      initializer_function_def,
       tmp_graph,
       initializing_operations,
+      func_def_outputs,
       func_outputs,
-      function._map_sequence_obj_to_idx(func_def_outputs),  # pylint: disable=protected-access
       output_shapes)
 
   capture_func_def_outputs = [
       x for x in captured_outlist if isinstance(x, tf_ops.Tensor)]
-  captured_function_def = function.make_function_def(
-      tmp_graph,
-      capturing_operations,
-      placeholder_inputs,
-      capture_func_def_outputs)
-  function._register_with_name(function._inference_name(func.__name__),  # pylint: disable=protected-access
-                               captured_function_def)
-
+  captured_function_name = function._inference_name(func.__name__)  # pylint: disable=protected-access
   captured_function = function.GraphModeFunction(
+      captured_function_name,
       placeholder_inputs,
       extra_inputs,
-      captured_function_def,
       tmp_graph,
       capturing_operations,
+      capture_func_def_outputs,
       captured_outputs,
-      function._map_sequence_obj_to_idx(capture_func_def_outputs),  # pylint: disable=protected-access
       output_shapes,
       variables=[x.variable for x in sorted_variables])
 
diff --git a/tensorflow/python/eager/graph_callable_test.py b/tensorflow/python/eager/graph_callable_test.py
index 548e16a909f8fe846ea6d5a7a33c4247c5d90054..b9e6ca2a93ac6ff02b741051234dbdd8a55bf12b 100644
--- a/tensorflow/python/eager/graph_callable_test.py
+++ b/tensorflow/python/eager/graph_callable_test.py
@@ -152,7 +152,6 @@ class GraphCallableTest(test.TestCase):
     self.assertAllEqual(5, f(constant_op.constant(2)))
 
   def testNestedFunction(self):
-
     # TensorFlow function (which is what would be used in TensorFlow graph
     # construction).
     @function.Defun(dtypes.int32, dtypes.int32)
diff --git a/tensorflow/python/eager/ops_test.py b/tensorflow/python/eager/ops_test.py
index 70e23b9311792fd7e5243bbc9fd6e4009f1493a9..f2e70341d975fb06bce7f2ce6cba7d8c3bc9826c 100644
--- a/tensorflow/python/eager/ops_test.py
+++ b/tensorflow/python/eager/ops_test.py
@@ -24,7 +24,6 @@ from tensorflow.python.eager import execute
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
@@ -33,6 +32,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import sparse_ops
 
 
@@ -245,20 +245,12 @@ class OpsTest(test_util.TensorFlowTestCase):
     reshaped = array_ops.reshape(value, shape)
     self.assertAllEqual([[1], [2]], reshaped.cpu())
 
-    # And if the shape is in device memory, it should complain
-    # TODO(ashankar): Revisit this - perhaps instead of complaining,
-    # it should implicitly copy the tensor to host memory?
-    with self.assertRaisesRegexp(
-        errors.InvalidArgumentError,
-        'cannot compute Reshape as input #1 was expected to be on.*'
-        'using.*DEVICE_PLACEMENT_SILENT'):
-      reshaped = array_ops.reshape(value, shape.gpu())
-
-  def testInvalidInputDataType(self):
+  def testInt64(self):
     # Fill requires the first input to be an int32 tensor.
-    with self.assertRaisesRegexp(errors.InvalidArgumentError, 'int64'):
-      array_ops.fill(constant_op.constant([2], dtype=dtypes.int64),
-                     constant_op.constant(1))
+    self.assertAllEqual(
+        [1.0, 1.0],
+        array_ops.fill(constant_op.constant([2], dtype=dtypes.int64),
+                       constant_op.constant(1)))
 
   def testOutputOnHostMemory(self):
     if not context.context().num_gpus():
@@ -322,6 +314,13 @@ class OpsTest(test_util.TensorFlowTestCase):
   def testIdentity(self):
     self.assertAllEqual(2, array_ops.identity(2))
 
+  def testIdentityOnVariable(self):
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found')
+    with context.device('/gpu:0'):
+      v = resource_variable_ops.ResourceVariable(True)
+    self.assertAllEqual(True, array_ops.identity(v))
+
   def testIncompatibleSetShape(self):
     x = constant_op.constant(1)
     with self.assertRaises(ValueError):
diff --git a/tensorflow/python/eager/python_eager_op_gen.cc b/tensorflow/python/eager/python_eager_op_gen.cc
index 956fbdac50d05fbd23ab93ec97145645805ac5e7..e6d03297e0b85856ff165af310149c79e494ab36 100644
--- a/tensorflow/python/eager/python_eager_op_gen.cc
+++ b/tensorflow/python/eager/python_eager_op_gen.cc
@@ -42,6 +42,8 @@ namespace {
 
 const int kRightMargin = 78;
 
+constexpr char kEagerFallbackSuffix[] = "_eager_fallback";
+
 string AttrVarName(const string& attr_name,
                    std::unordered_map<string, string>* attr_expressions) {
   const string var = strings::StrCat("_attr_", attr_name);
@@ -49,11 +51,12 @@ string AttrVarName(const string& attr_name,
   return var;
 }
 
-void AddInferredAttr(const string& attr_name, const string& value_expression,
-                     string* result,
+void AddInferredAttr(const string& indentation, const string& attr_name,
+                     const string& value_expression, string* result,
                      std::unordered_map<string, string>* attr_expressions) {
-  strings::StrAppend(result, "  ", AttrVarName(attr_name, attr_expressions),
-                     " = ", value_expression, "\n");
+  strings::StrAppend(result, indentation,
+                     AttrVarName(attr_name, attr_expressions), " = ",
+                     value_expression, "\n");
 }
 
 string VectorToTuple(const std::vector<string>& l) {
@@ -99,6 +102,15 @@ string TensorPBString(const TensorProto& pb) {
   return strings::StrCat("\"\"\"", ProtoShortDebugString(pb), "\"\"\"");
 }
 
+const ApiDef::Arg* FindInputArg(StringPiece name, const ApiDef& api_def) {
+  for (int i = 0; i < api_def.in_arg_size(); ++i) {
+    if (api_def.in_arg(i).name() == name) {
+      return &api_def.in_arg(i);
+    }
+  }
+  return nullptr;
+}
+
 class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp {
  public:
   GenEagerPythonOp(const OpDef& op_def, const ApiDef& api_def,
@@ -112,11 +124,33 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp {
   string Code() override;
 
  protected:
-  void ExpectListArg(const string& arg_name);
-  void AddEagerInferredAttrs();
-  void AddEagerInputCasts();
-  void AddEagerAttrs();
-  void AddEagerExecute(const string& num_outputs_expr);
+  void HandleGraphMode(const string& function_setup);
+
+  string GetEagerNotAllowedError();
+  void ExpectListArg(const string& indentation, const string& arg_name,
+                     string* output);
+  bool GetEagerFunctionSetup(const string& indentation, string* function_setup);
+  void GetOutputSizesAndNumOutputsExpr(std::vector<string>* output_sizes,
+                                       string* num_outputs_expr);
+
+  void AddEagerFunctionTeardown(const string& indentation,
+                                const std::vector<string>& output_sizes,
+                                bool execute_record_gradient);
+
+  bool AddEagerFastPathAndGraphCode(const string& parameters,
+                                    const std::vector<string>& output_sizes,
+                                    const string& eager_not_allowed_error);
+  bool AddEagerFallbackCode(const string& parameters,
+                            const std::vector<string>& output_sizes,
+                            const string& num_outputs_expr,
+                            const string& eager_not_allowed_error);
+  void AddEagerFastPathExecute();
+
+  void AddEagerInferredAttrs(const string& indentation);
+  void AddEagerInputCasts(const string& indentation);
+  void AddEagerAttrs(const string& indentation);
+  void AddEagerExecute(const string& indentation,
+                       const string& num_outputs_expr);
 
   void AddAttrForArg(const string& attr, int arg_index) {
     gtl::InsertIfNotPresent(&inferred_attrs_, attr,
@@ -139,6 +173,13 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp {
   typedef std::unordered_map<string, std::vector<int>> AttrToArgMap;
   AttrToArgMap attr_to_args_;
   std::unordered_map<string, string> attr_expressions_;
+  // This has all the input args followed by those attrs that don't have
+  // defaults.
+  std::vector<python_op_gen_internal::ParamNames> params_no_default_;
+  // The parameters with defaults (these have to be listed after those without).
+  // No input args are included, just attrs.
+  std::vector<std::pair<python_op_gen_internal::ParamNames, string>>
+      params_with_default_;
 };
 
 string GetEagerPythonOp(const OpDef& op_def, const ApiDef& api_def,
@@ -164,14 +205,14 @@ string GenEagerPythonOp::FlattenInputs(
       } else if (inputs_state == WAS_LIST_INPUT) {
         strings::StrAppend(&inputs, " + ");
       }
-      strings::StrAppend(&inputs, "list(", param_names_[i], ")");
+      strings::StrAppend(&inputs, "list(", param_names_[i].GetRenameTo(), ")");
       inputs_state = WAS_LIST_INPUT;
       if (output_sizes != nullptr) {
         if (!arg.number_attr().empty()) {
           output_sizes->emplace_back(AttrVarName(arg.number_attr(), nullptr));
         } else {
           output_sizes->emplace_back(
-              strings::StrCat("len(", param_names_[i], ")"));
+              strings::StrCat("len(", param_names_[i].GetRenameTo(), ")"));
         }
       }
     } else {
@@ -182,7 +223,7 @@ string GenEagerPythonOp::FlattenInputs(
       } else {
         strings::StrAppend(&inputs, "[");
       }
-      strings::StrAppend(&inputs, param_names_[i]);
+      strings::StrAppend(&inputs, param_names_[i].GetRenameTo());
       inputs_state = WAS_SOLO_INPUT;
       if (output_sizes != nullptr) output_sizes->emplace_back();
     }
@@ -195,15 +236,15 @@ string GenEagerPythonOp::FlattenInputs(
 }
 
 string GenEagerPythonOp::Code() {
-  // This has all the input args followed by those attrs that don't have
-  // defaults.
-  std::vector<string> args_no_default;
-  // The parameters with defaults (these have to be listed after those without).
-  // No input args are included, just attrs.
-  std::vector<std::pair<string, string>> args_with_defaults;
-  for (int i = 0; i < op_def_.input_arg_size(); ++i) {
-    const auto& arg(op_def_.input_arg(i));
-    args_no_default.push_back(arg.name());
+  if (api_def_.visibility() == ApiDef::SKIP) {
+    return "";
+  }
+
+  for (int i = 0; i < api_def_.arg_order_size(); ++i) {
+    const auto& arg = *FindInputArg(api_def_.arg_order(i), op_def_);
+    const auto& api_def_arg = *FindInputArg(api_def_.arg_order(i), api_def_);
+    params_no_default_.emplace_back(api_def_arg.name(),
+                                    api_def_arg.rename_to());
     if (!arg.type_attr().empty()) {
       AddAttrForArg(arg.type_attr(), i);
     } else if (!arg.type_list_attr().empty()) {
@@ -215,31 +256,39 @@ string GenEagerPythonOp::Code() {
   }
   for (int i = 0; i < op_def_.attr_size(); ++i) {
     const auto& attr(op_def_.attr(i));
+    const auto& api_def_attr(api_def_.attr(i));
     // Do not add inferred attrs to the Python function signature.
     if (inferred_attrs_.find(attr.name()) == inferred_attrs_.end()) {
-      if (attr.has_default_value()) {
+      if (api_def_attr.has_default_value()) {
         if (attr.type() == "tensor") {
-          args_with_defaults.emplace_back(
-              attr.name(),
-              strings::StrCat("_execute.make_tensor(",
-                              TensorPBString(attr.default_value().tensor()),
-                              ", \"", attr.name(), "\")"));
+          params_with_default_.emplace_back(
+              python_op_gen_internal::ParamNames(api_def_attr.name(),
+                                                 api_def_attr.rename_to()),
+              strings::StrCat(
+                  "_execute.make_tensor(",
+                  TensorPBString(api_def_attr.default_value().tensor()), ", \"",
+                  api_def_attr.rename_to(), "\")"));
         } else if (attr.type() == "list(tensor)") {
           std::vector<string> pbtxt;
-          for (const auto& pb : attr.default_value().list().tensor()) {
+          for (const auto& pb : api_def_attr.default_value().list().tensor()) {
             pbtxt.emplace_back(TensorPBString(pb));
           }
-          args_with_defaults.emplace_back(
-              attr.name(),
-              strings::StrCat("[_execute.make_tensor(_pb, \"", attr.name(),
-                              "\") for _pb in ", VectorToTuple(pbtxt), "]"));
+          params_with_default_.emplace_back(
+              python_op_gen_internal::ParamNames(api_def_attr.name(),
+                                                 api_def_attr.rename_to()),
+              strings::StrCat("[_execute.make_tensor(_pb, \"",
+                              api_def_attr.rename_to(), "\") for _pb in ",
+                              VectorToTuple(pbtxt), "]"));
         } else {
-          args_with_defaults.emplace_back(
-              attr.name(), python_op_gen_internal::AttrValueToPython(
-                               attr.type(), attr.default_value(), "_dtypes."));
+          params_with_default_.emplace_back(
+              python_op_gen_internal::ParamNames(api_def_attr.name(),
+                                                 api_def_attr.rename_to()),
+              python_op_gen_internal::AttrValueToPython(
+                  attr.type(), api_def_attr.default_value(), "_dtypes."));
         }
       } else {
-        args_no_default.push_back(attr.name());
+        params_no_default_.emplace_back(api_def_attr.name(),
+                                        api_def_attr.rename_to());
       }
     }
   }
@@ -247,159 +296,79 @@ string GenEagerPythonOp::Code() {
   // Save the list of attr parameters (attrs that won't be inferred),
   // those with defaults go at the end.
   // Get the attrs in the order we want by taking the attrs without defaults
-  // from the end of args_no_default, and adding args_no_default.
-  attrs_.reserve(args_no_default.size() - op_def_.input_arg_size() +
-                 args_with_defaults.size());
-  attrs_.insert(attrs_.end(),
-                args_no_default.begin() + op_def_.input_arg_size(),
-                args_no_default.end());
-  for (const auto& a : args_with_defaults) {
-    attrs_.push_back(a.first);
+  // from the end of params_no_default_, and adding params_no_default_.
+  attrs_.reserve(params_no_default_.size() - op_def_.input_arg_size() +
+                 params_with_default_.size());
+  for (int i = op_def_.input_arg_size(); i < params_no_default_.size(); ++i) {
+    attrs_.push_back(params_no_default_[i].GetName());
+  }
+  for (const auto& p : params_with_default_) {
+    attrs_.push_back(p.first.GetName());
+  }
+
+  param_names_.reserve(params_no_default_.size() + params_with_default_.size());
+  param_names_.insert(param_names_.begin(), params_no_default_.begin(),
+                      params_no_default_.end());
+  for (const auto& param_and_default : params_with_default_) {
+    param_names_.push_back(param_and_default.first);
   }
 
-  param_names_.reserve(args_no_default.size() + args_with_defaults.size());
   string parameters;
-  for (const string& name : args_no_default) {
+  for (const auto& param : params_no_default_) {
     if (!parameters.empty()) strings::StrAppend(&parameters, ", ");
-    const string param = python_op_gen_internal::AvoidPythonReserved(name);
-    strings::StrAppend(&parameters, param);
-    param_names_.push_back(param);
+    strings::StrAppend(&parameters, param.GetRenameTo());
   }
-  for (const auto& name_default : args_with_defaults) {
+  for (const auto& param_and_default : params_with_default_) {
     if (!parameters.empty()) strings::StrAppend(&parameters, ", ");
-    const string param =
-        python_op_gen_internal::AvoidPythonReserved(name_default.first);
-    strings::StrAppend(&parameters, param, "=", name_default.second);
-    param_names_.push_back(param);
+    strings::StrAppend(&parameters, param_and_default.first.GetRenameTo(), "=",
+                       param_and_default.second);
   }
   if (!parameters.empty()) strings::StrAppend(&parameters, ", ");
   strings::StrAppend(&parameters, "name=None");
 
-  AddDefLine(parameters);
-  AddDocStringDescription();
-  AddDocStringArgs();
-  AddDocStringInputs();
-  AddDocStringAttrs();
-  AddDocStringNameArg();
-  AddOutputGlobals();
-  AddDocStringOutputs();
-  strings::StrAppend(&result_, "  \"\"\"\n");
-
-  // Function body.
-
-  // Validate list inputs, infer length attrs.
+  // Add attr_expressions_ for attrs that are params.
+  for (int i = 0; i < attrs_.size(); ++i) {
+    const string& attr_name = attrs_[i];
+    const string& attr_api_name =
+        param_names_[i + op_def_.input_arg_size()].GetRenameTo();
+    attr_expressions_[attr_name] = attr_api_name;
+  }
+  // Add attr_expressions_ for attrs that are inferred.
   for (int i = 0; i < op_def_.attr_size(); ++i) {
     const auto& attr(op_def_.attr(i));
     if (attr.type() == "int") {
       auto arg_list = attr_to_args_.find(attr.name());
       if (arg_list != attr_to_args_.end()) {
-        // Inferred int attrs are the lengths of inputs. Validate those
-        // inputs are lists and have the same length.
-        for (auto iter = arg_list->second.begin();
-             iter != arg_list->second.end(); ++iter) {
-          const string& arg_name = param_names_[*iter];
-          ExpectListArg(arg_name);
-          if (iter == arg_list->second.begin()) {
-            AddInferredAttr(attr.name(), strings::StrCat("len(", arg_name, ")"),
-                            &result_, &attr_expressions_);
-          } else {
-            const auto& attr_var = attr_expressions_[attr.name()];
-            strings::StrAppend(&result_, "  if len(", arg_name,
-                               ") != ", attr_var,
-                               ":\n"
-                               "    raise ValueError(\n"
-                               "        \"List argument '",
-                               arg_name, "' to '", op_name_,
-                               "' Op with length %d \"\n"
-                               "        \"must match length %d of argument '",
-                               inferred_attrs_[attr.name()],
-                               "'.\" %\n"
-                               "        (len(",
-                               arg_name, "), ", attr_var, "))\n");
-          }
-        }
+        AttrVarName(attr.name(), &attr_expressions_);
       }
     }
   }
 
-  // Values for non-inferred attrs.
-  for (int i = 0; i < attrs_.size(); ++i) {
-    const string& attr_name = attrs_[i];
-    const string& param = param_names_[i + op_def_.input_arg_size()];
-    const auto& attr = *FindAttr(attr_name, op_def_);
-    StringPiece attr_type = attr.type();
-    attr_expressions_[attr_name] = param;
-    const int default_index = i - (attrs_.size() - args_with_defaults.size());
-    if (default_index >= 0) {
-      const string& default_value = args_with_defaults[default_index].second;
-      strings::StrAppend(&result_, "  if ", param, " is None:\n");
-      strings::StrAppend(&result_, "    ", param, " = ", default_value, "\n");
-    }
-    if (attr_type.starts_with("list(")) {
-      ExpectListArg(param);
-    }
+  string num_outputs_expr;
+  std::vector<string> output_sizes(num_outs_);
+  GetOutputSizesAndNumOutputsExpr(&output_sizes, &num_outputs_expr);
 
-    if (attr_type == "string") {
-      strings::StrAppend(&result_, "  ", param, " = _execute.make_str(", param,
-                         ", \"", param, "\")\n");
-    } else if (attr_type == "list(string)") {
-      strings::StrAppend(&result_, "  ", param, " = [_execute.make_str(_s, \"",
-                         param, "\") for _s in ", param, "]\n");
-    } else if (attr_type == "int") {
-      strings::StrAppend(&result_, "  ", param, " = _execute.make_int(", param,
-                         ", \"", param, "\")\n");
-    } else if (attr_type == "list(int)") {
-      strings::StrAppend(&result_, "  ", param, " = [_execute.make_int(_i, \"",
-                         param, "\") for _i in ", param, "]\n");
-    } else if (attr_type == "float") {
-      strings::StrAppend(&result_, "  ", param, " = _execute.make_float(",
-                         param, ", \"", param, "\")\n");
-    } else if (attr_type == "list(float)") {
-      strings::StrAppend(&result_, "  ", param,
-                         " = [_execute.make_float(_f, \"", param,
-                         "\") for _f in ", param, "]\n");
-    } else if (attr_type == "bool") {
-      strings::StrAppend(&result_, "  ", param, " = _execute.make_bool(", param,
-                         ", \"", param, "\")\n");
-    } else if (attr_type == "list(bool)") {
-      strings::StrAppend(&result_, "  ", param, " = [_execute.make_bool(_b, \"",
-                         param, "\") for _b in ", param, "]\n");
-    } else if (attr_type == "type") {
-      strings::StrAppend(&result_, "  ", param, " = _execute.make_type(", param,
-                         ", \"", param, "\")\n");
-    } else if (attr_type == "list(type)") {
-      strings::StrAppend(&result_, "  ", param, " = [_execute.make_type(_t, \"",
-                         param, "\") for _t in ", param, "]\n");
-    } else if (attr_type == "shape") {
-      strings::StrAppend(&result_, "  ", param, " = _execute.make_shape(",
-                         param, ", \"", param, "\")\n");
-    } else if (attr_type == "list(shape)") {
-      strings::StrAppend(&result_, "  ", param,
-                         " = [_execute.make_shape(_s, \"", param,
-                         "\") for _s in ", param, "]\n");
-    } else if (attr_type == "tensor") {
-      strings::StrAppend(&result_, "  ", param, " = _execute.make_tensor(",
-                         param, ", \"", param, "\")\n");
-    } else if (attr_type == "list(tensor)") {
-      strings::StrAppend(&result_, "  ", param,
-                         " = [_execute.make_tensor(_t, \"", param,
-                         "\") for _t in ", param, "]\n");
-    } else if (attr_type != "func") {
-      return strings::StrCat("# No definition for ", function_name_,
-                             " since we don't support attrs with type\n"
-                             "# '",
-                             attr_type, "' right now.\n\n");
-    }
+  string eager_not_allowed_error = GetEagerNotAllowedError();
+
+  if (!AddEagerFastPathAndGraphCode(parameters, output_sizes,
+                                    eager_not_allowed_error)) {
+    return result_;
   }
 
-  // Figure out the list of inputs.
-  const string inputs = FlattenInputs(nullptr, nullptr);
+  if (!AddEagerFallbackCode(parameters, output_sizes, num_outputs_expr,
+                            eager_not_allowed_error)) {
+    return result_;
+  }
 
+  return prelude_ + result_;
+}
+
+void GenEagerPythonOp::HandleGraphMode(const string& function_setup) {
   // Handle graph-mode case
   strings::StrAppend(&result_,
                      "  _ctx = _context.context()\n"
-
-                     "  if _ctx.in_graph_mode():\n"
+                     "  if _ctx.in_graph_mode():\n",
+                     function_setup,
                      "    _, _, _op = _op_def_lib._apply_op_helper(\n");
   AddBodyNoReturn("        ");
   if (num_outs_ > 0) {
@@ -436,123 +405,383 @@ string GenEagerPythonOp::Code() {
   } else {
     strings::StrAppend(&result_, "    return _op\n");
   }
+}
 
-  // Handle eager-mode case
-  strings::StrAppend(&result_, "  else:\n");
+string GenEagerPythonOp::GetEagerNotAllowedError() {
+  bool eager_allowed = true;
+  string ref_arg;
+  for (int i = 0; i < op_def_.input_arg_size(); ++i) {
+    const auto& arg = op_def_.input_arg(i);
+    if (arg.is_ref()) {
+      eager_allowed = false;
+      DCHECK_EQ(op_def_.input_arg(i).name(), api_def_.in_arg(i).name());
+      ref_arg = api_def_.in_arg(i).rename_to();
+    }
+  }
+  for (int i = 0; i < op_def_.output_arg_size(); ++i) {
+    const auto& arg = op_def_.output_arg(i);
+    if (arg.is_ref()) {
+      eager_allowed = false;
+      DCHECK_EQ(op_def_.output_arg(i).name(), api_def_.out_arg(i).name());
+      ref_arg = api_def_.out_arg(i).rename_to();
+    }
+  }
+
+  if (eager_allowed) return "";
 
+  return strings::StrCat("raise RuntimeError(\"", op_name_,
+                         " op does not support eager execution. ", "Arg '",
+                         ref_arg, "' is a ref.\")\n");
+}
+
+void GenEagerPythonOp::ExpectListArg(const string& indentation,
+                                     const string& arg_name, string* output) {
+  strings::StrAppend(output, indentation, "if not isinstance(", arg_name,
+                     ", (list, tuple)):\n", indentation, "  raise TypeError(\n",
+                     indentation, "      \"Expected list for '", arg_name,
+                     "' argument to \"\n", indentation, "      \"'", op_name_,
+                     "' Op, not %r.\" % ", arg_name, ")\n");
+}
+
+bool GenEagerPythonOp::GetEagerFunctionSetup(const string& indentation,
+                                             string* function_setup) {
+  // Validate list inputs, infer length attrs.
+  for (int i = 0; i < op_def_.attr_size(); ++i) {
+    const auto& attr(op_def_.attr(i));
+    if (attr.type() == "int") {
+      auto arg_list = attr_to_args_.find(attr.name());
+      if (arg_list != attr_to_args_.end()) {
+        // Inferred int attrs are the lengths of inputs. Validate those
+        // inputs are lists and have the same length.
+        for (auto iter = arg_list->second.begin();
+             iter != arg_list->second.end(); ++iter) {
+          const string& arg_api_name = param_names_[*iter].GetRenameTo();
+          ExpectListArg(indentation, arg_api_name, function_setup);
+          if (iter == arg_list->second.begin()) {
+            AddInferredAttr(indentation, attr.name(),
+                            strings::StrCat("len(", arg_api_name, ")"),
+                            function_setup, &attr_expressions_);
+          } else {
+            const auto& attr_var = attr_expressions_[attr.name()];
+            strings::StrAppend(
+                function_setup, indentation, "if len(", arg_api_name,
+                ") != ", attr_var, ":\n", indentation, "  raise ValueError(\n",
+                indentation, "      \"List argument '", arg_api_name, "' to '",
+                op_name_, "' Op with length %d \"\n", indentation,
+                "      \"must match length %d of argument '",
+                inferred_attrs_[attr.name()], "'.\" %\n", indentation,
+                "      (len(", arg_api_name, "), ", attr_var, "))\n");
+          }
+        }
+      }
+    }
+  }
+
+  for (int i = 0; i < attrs_.size(); ++i) {
+    const string& attr_name = attrs_[i];
+    const auto& param = param_names_[i + op_def_.input_arg_size()];
+    const auto& attr = *FindAttr(attr_name, op_def_);
+    const string& attr_api_name = param.GetRenameTo();
+    StringPiece attr_type = attr.type();
+    attr_expressions_[attr_name] = attr_api_name;
+    const int default_index = i - (attrs_.size() - params_with_default_.size());
+    if (default_index >= 0) {
+      const string& default_value = params_with_default_[default_index].second;
+      strings::StrAppend(function_setup, indentation, "if ", attr_api_name,
+                         " is None:\n");
+      strings::StrAppend(function_setup, indentation, "  ", attr_api_name,
+                         " = ", default_value, "\n");
+    }
+    if (attr_type.starts_with("list(")) {
+      ExpectListArg(indentation, attr_api_name, function_setup);
+    }
+
+    if (attr_type == "string") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = _execute.make_str(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
+    } else if (attr_type == "list(string)") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = [_execute.make_str(_s, \"", attr_api_name,
+                         "\") for _s in ", attr_api_name, "]\n");
+    } else if (attr_type == "int") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = _execute.make_int(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
+    } else if (attr_type == "list(int)") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = [_execute.make_int(_i, \"", attr_api_name,
+                         "\") for _i in ", attr_api_name, "]\n");
+    } else if (attr_type == "float") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = _execute.make_float(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
+    } else if (attr_type == "list(float)") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = [_execute.make_float(_f, \"", attr_api_name,
+                         "\") for _f in ", attr_api_name, "]\n");
+    } else if (attr_type == "bool") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = _execute.make_bool(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
+    } else if (attr_type == "list(bool)") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = [_execute.make_bool(_b, \"", attr_api_name,
+                         "\") for _b in ", attr_api_name, "]\n");
+    } else if (attr_type == "type") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = _execute.make_type(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
+    } else if (attr_type == "list(type)") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = [_execute.make_type(_t, \"", attr_api_name,
+                         "\") for _t in ", attr_api_name, "]\n");
+    } else if (attr_type == "shape") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = _execute.make_shape(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
+    } else if (attr_type == "list(shape)") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = [_execute.make_shape(_s, \"", attr_api_name,
+                         "\") for _s in ", attr_api_name, "]\n");
+    } else if (attr_type == "tensor") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = _execute.make_tensor(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
+    } else if (attr_type == "list(tensor)") {
+      strings::StrAppend(function_setup, indentation, attr_api_name,
+                         " = [_execute.make_tensor(_t, \"", attr_api_name,
+                         "\") for _t in ", attr_api_name, "]\n");
+    } else if (attr_type != "func") {
+      *function_setup =
+          strings::StrCat("# No definition for ", function_name_,
+                          " since we don't support attrs with type\n"
+                          "# '",
+                          attr_type, "' right now.\n\n");
+      return false;
+    }
+  }
+  return true;
+}
+
+// If output i is list output, output_sizes[i] will be set to a
+// string with the python expression that will evaluate to its
+// length. output_sizes[i] is empty for non-list outputs.
+void GenEagerPythonOp::GetOutputSizesAndNumOutputsExpr(
+    std::vector<string>* output_sizes, string* num_outputs_expr) {
   // Expression representing the number of outputs.
   int num_fixed_outputs = 0;
-  string num_outputs_expr;
-  // If output i is list output, output_sizes[i] will be set to a
-  // string with the python expression that will evaluate to its
-  // length. output_sizes[i] is empty for non-list outputs.
-  std::vector<string> output_sizes(num_outs_);
   for (int i = 0; i < num_outs_; ++i) {
     const auto& arg(op_def_.output_arg(i));
     if (!arg.number_attr().empty()) {
-      if (!num_outputs_expr.empty()) {
-        strings::StrAppend(&num_outputs_expr, " + ");
+      if (!num_outputs_expr->empty()) {
+        strings::StrAppend(num_outputs_expr, " + ");
       }
-      output_sizes[i] = attr_expressions_[arg.number_attr()];
-      strings::StrAppend(&num_outputs_expr, output_sizes[i]);
+      (*output_sizes)[i] = attr_expressions_[arg.number_attr()];
+      strings::StrAppend(num_outputs_expr, (*output_sizes)[i]);
     } else if (!arg.type_list_attr().empty()) {
-      if (!num_outputs_expr.empty()) {
-        strings::StrAppend(&num_outputs_expr, " + ");
+      if (!num_outputs_expr->empty()) {
+        strings::StrAppend(num_outputs_expr, " + ");
       }
       // Have to be careful to use an expression that works in both
       // graph and eager paths here.
       const auto iter = inferred_attrs_.find(arg.type_list_attr());
       if (iter == inferred_attrs_.end()) {
-        output_sizes[i] = strings::StrCat(
+        (*output_sizes)[i] = strings::StrCat(
             "len(", attr_expressions_[arg.type_list_attr()], ")");
       } else {
-        output_sizes[i] = strings::StrCat("len(", iter->second, ")");
+        (*output_sizes)[i] = strings::StrCat("len(", iter->second, ")");
       }
-      strings::StrAppend(&num_outputs_expr, output_sizes[i]);
+      strings::StrAppend(num_outputs_expr, (*output_sizes)[i]);
     } else {
       ++num_fixed_outputs;
     }
   }
   if (num_fixed_outputs > 0) {
-    if (!num_outputs_expr.empty()) {
-      strings::StrAppend(&num_outputs_expr, " + ");
+    if (!num_outputs_expr->empty()) {
+      strings::StrAppend(num_outputs_expr, " + ");
     }
-    strings::StrAppend(&num_outputs_expr, num_fixed_outputs);
-  } else if (num_outputs_expr.empty()) {
-    num_outputs_expr = "0";
-  }
-
-  bool eager_allowed = true;
-  string ref_arg;
-  for (const auto& arg : op_def_.input_arg()) {
-    if (arg.is_ref()) {
-      eager_allowed = false;
-      ref_arg = arg.name();
-    }
-  }
-  for (const auto& arg : op_def_.output_arg()) {
-    if (arg.is_ref()) {
-      eager_allowed = false;
-      ref_arg = arg.name();
-    }
-  }
-
-  if (eager_allowed) {
-    AddEagerInferredAttrs();
-    AddEagerInputCasts();
-    strings::StrAppend(&result_, "    _inputs_flat = ", inputs, "\n");
-    AddEagerAttrs();
-    AddEagerExecute(num_outputs_expr);
-  } else {
-    strings::StrAppend(&result_,
-                       "    raise RuntimeError(\n"
-                       "        \"",
-                       op_name_, " op does not support eager execution. ",
-                       "Arg '", ref_arg, "'' is a ref.\")\n");
+    strings::StrAppend(num_outputs_expr, num_fixed_outputs);
+  } else if (num_outputs_expr->empty()) {
+    *num_outputs_expr = "0";
   }
+}
 
+void GenEagerPythonOp::AddEagerFunctionTeardown(
+    const string& indentation, const std::vector<string>& output_sizes,
+    bool execute_record_gradient) {
   if (num_outs_ > 0) {
-    strings::StrAppend(&result_, "  _execute.record_gradient(\n", "      \"",
-                       op_def_.name(),
-                       "\", _inputs_flat, _attrs, _result, name)\n");
+    if (execute_record_gradient) {
+      strings::StrAppend(&result_, indentation, "_execute.record_gradient(\n",
+                         "      \"", op_def_.name(),
+                         "\", _inputs_flat, _attrs, _result, name)\n");
+    }
     if (num_outs_ == 1 && !output_sizes[0].empty()) {
       // Single list result.
     } else if (num_outs_ == 1) {
       // Execute returns a single-element list which we need to destructure.
-      strings::StrAppend(&result_, "  _result, = _result\n");
+      strings::StrAppend(&result_, indentation, "_result, = _result\n");
     } else {
       // Have multiple outputs, so we will need to reformat the return
       // value of execute() to be a list with one entry per op output
       // (that entry will be a list of tensors if that output is of list
       // type).
       // For list outputs, convert the right subrange of _result into a list.
-      Unflatten("  ", output_sizes, "_result", &result_);
+      Unflatten(indentation, output_sizes, "_result", &result_);
       // Convert to a named tuple.
-      strings::StrAppend(&result_, "  _result = _", op_def_.name(),
+      strings::StrAppend(&result_, indentation, "_result = _", op_def_.name(),
                          "Output._make(_result)\n");
     }
   } else {
-    strings::StrAppend(&result_, "    _result = None\n");
+    strings::StrAppend(&result_, indentation, "_result = None\n");
   }
-  strings::StrAppend(&result_, "  return _result\n\n");
-  return prelude_ + result_;
+  strings::StrAppend(&result_, indentation, "return _result\n\n");
 }
 
-void GenEagerPythonOp::ExpectListArg(const string& arg_name) {
-  strings::StrAppend(&result_, "  if not isinstance(", arg_name,
-                     ", (list, tuple)):\n"
-                     "    raise TypeError(\n"
-                     "        \"Expected list for '",
-                     arg_name,
-                     "' argument to \"\n"
-                     "        \"'",
-                     op_name_, "' Op, not %r.\" % ", arg_name, ")\n");
+bool GenEagerPythonOp::AddEagerFastPathAndGraphCode(
+    const string& parameters, const std::vector<string>& output_sizes,
+    const string& eager_not_allowed_error) {
+  AddExport();
+  AddDefLine(function_name_, parameters);
+  AddDocStringDescription();
+  AddDocStringArgs();
+  AddDocStringInputs();
+  AddDocStringAttrs();
+  AddDocStringNameArg();
+  AddOutputGlobals();  // Added to prelude_
+  AddDocStringOutputs();
+  strings::StrAppend(&result_, "  \"\"\"\n");
+
+  // Handle graph-mode case
+  string function_setup;
+  if (!GetEagerFunctionSetup("    ", &function_setup)) {
+    result_ = function_setup;
+    return false;
+  }
+  HandleGraphMode(function_setup);
+  AddEagerFunctionTeardown("    ", output_sizes,
+                           true /* execute_record_gradient */);
+
+  // Handle eager-mode case
+  strings::StrAppend(&result_, "  else:\n");
+
+  if (eager_not_allowed_error.empty()) {
+    AddEagerFastPathExecute();
+  } else {
+    strings::StrAppend(&result_, "    ", eager_not_allowed_error);
+  }
+
+  strings::StrAppend(&result_, "\n\n");
+  return true;
 }
 
-void GenEagerPythonOp::AddEagerInferredAttrs() {
+bool GenEagerPythonOp::AddEagerFallbackCode(
+    const string& parameters, const std::vector<string>& output_sizes,
+    const string& num_outputs_expr, const string& eager_not_allowed_error) {
+  if (!eager_not_allowed_error.empty()) {
+    strings::StrAppend(&result_, "  ", eager_not_allowed_error);
+    return true;
+  }
+
+  AddDefLine(strings::StrCat(function_name_, kEagerFallbackSuffix), parameters);
+  strings::StrAppend(
+      &result_, "  r\"\"\"This is the slowpath function for Eager mode.\n");
+  strings::StrAppend(&result_, "  This is for function ", function_name_,
+                     "\n  \"\"\"\n");
+
+  strings::StrAppend(&result_, "  _ctx = _context.context()\n");
+
+  string function_setup;
+  if (!GetEagerFunctionSetup("  ", &function_setup)) {
+    result_ = function_setup;
+    return false;
+  }
+  strings::StrAppend(&result_, function_setup);
+
+  AddEagerInferredAttrs("  ");
+  AddEagerInputCasts("  ");
+  strings::StrAppend(
+      &result_, "  _inputs_flat = ", FlattenInputs(nullptr, nullptr), "\n");
+  AddEagerAttrs("  ");
+  AddEagerExecute("  ", num_outputs_expr);
+
+  AddEagerFunctionTeardown("  ", output_sizes,
+                           true /* execute_record_gradient */);
+
+  return true;
+}
+
+void GenEagerPythonOp::AddEagerFastPathExecute() {
+  string fastpath_execute_params = strings::StrCat(
+      "_ctx._handle, _ctx.device_name, \"", op_def_.name(), "\", ",
+      "_execute.record_gradient, name, _ctx._post_execution_callbacks");
+  string fallback_params;
+
+  for (int i = 0; i < api_def_.in_arg_size(); i++) {
+    const string param_name = param_names_[i].GetRenameTo();
+    strings::StrAppend(&fastpath_execute_params, ", ", param_name);
+    if (!fallback_params.empty()) strings::StrAppend(&fallback_params, ", ");
+    strings::StrAppend(&fallback_params, param_name);
+  }
+
+  for (const auto& attr : api_def_.attr()) {
+    if (inferred_attrs_.find(attr.name()) == inferred_attrs_.end()) {
+      strings::StrAppend(&fastpath_execute_params, ", \"", attr.name(), "\", ",
+                         attr.rename_to());
+
+      if (!fallback_params.empty()) strings::StrAppend(&fallback_params, ", ");
+      strings::StrAppend(&fallback_params, attr.rename_to(), "=",
+                         attr.rename_to());
+    }
+  }
+
+  if (!fallback_params.empty()) strings::StrAppend(&fallback_params, ", ");
+  strings::StrAppend(&fallback_params, "name=name");
+
+  strings::StrAppend(&result_, "    try:\n");
+  strings::StrAppend(
+      &result_, "      ",
+      "_result = _pywrap_tensorflow.TFE_Py_FastPathExecute(\n",
+      WordWrap(strings::StrCat("        "),
+               strings::StrCat(fastpath_execute_params, ")"), kRightMargin),
+      "\n");
+
+  if (op_def_.output_arg_size() > 1) {
+    const string output_tuple_name =
+        strings::StrCat("_", op_def_.name(), "Output");
+    strings::StrAppend(&result_, "      ", "_result = ", output_tuple_name,
+                       "._make(_result)\n");
+  }
+  strings::StrAppend(&result_, "      ", "return _result\n");
+
+  // Handle fallback.
+  strings::StrAppend(&result_, "    ", "except _core._FallbackException:\n");
+  strings::StrAppend(
+      &result_, "      ", "return ", function_name_, kEagerFallbackSuffix,
+      "(\n",
+      WordWrap(strings::StrCat("          "),
+               strings::StrCat(fallback_params, ")"), kRightMargin),
+      "\n");
+
+  // Any errors thrown from execute need to be unwrapped from
+  // _NotOkStatusException.
+  strings::StrAppend(&result_, "    ",
+                     "except _core._NotOkStatusException as e:\n");
+  strings::StrAppend(&result_, "      ", "if name is not None:\n");
+  strings::StrAppend(&result_, "        ",
+                     "message = e.message + \" name: \" + name\n");
+  strings::StrAppend(&result_, "      ", "else:\n");
+  strings::StrAppend(&result_, "        ", "message = e.message\n");
+  strings::StrAppend(
+      &result_, "      ",
+      "_six.raise_from(_core._status_to_exception(e.code, message), None)\n");
+}
+
+void GenEagerPythonOp::AddEagerInferredAttrs(const string& indentation) {
   // Figure out values for inferred attrs, and cast to eager tensors.
   for (int i = 0; i < op_def_.attr_size(); ++i) {
     const auto& attr(op_def_.attr(i));
+    const auto& api_def_attr(api_def_.attr(i));
     auto arg_list = attr_to_args_.find(attr.name());
     if (arg_list != attr_to_args_.end()) {
       if (attr.type() == "type") {
@@ -565,33 +794,34 @@ void GenEagerPythonOp::AddEagerInferredAttrs() {
           strings::StrAppend(
               &conversion, ", ",
               python_op_gen_internal::AttrValueToPython(
-                  attr.type(), attr.default_value(), "_dtypes."));
+                  attr.type(), api_def_attr.default_value(), "_dtypes."));
         }
         strings::StrAppend(&conversion, ")");
         const string var_name = AttrVarName(attr.name(), &attr_expressions_);
         if (output_sizes.size() == 1) {
           // Avoid creating a temporary variable in the case where
           // we can easily assign to the right value directly.
-          const string inputs_var = param_names_[arg_list->second.front()];
+          const string inputs_var =
+              param_names_[arg_list->second.front()].GetRenameTo();
           if (output_sizes.front().empty()) {
-            strings::StrAppend(&result_, "    ", var_name, ", (", inputs_var,
-                               ",) = ", conversion, "\n");
+            strings::StrAppend(&result_, indentation, var_name, ", (",
+                               inputs_var, ",) = ", conversion, "\n");
           } else {
-            strings::StrAppend(&result_, "    ", var_name, ", ", inputs_var,
-                               " = ", conversion, "\n");
+            strings::StrAppend(&result_, indentation, var_name, ", ",
+                               inputs_var, " = ", conversion, "\n");
           }
         } else {
           const string inputs_var = strings::StrCat("_inputs_", attr.name());
-          strings::StrAppend(&result_, "    ", var_name, ", ", inputs_var,
+          strings::StrAppend(&result_, indentation, var_name, ", ", inputs_var,
                              " = ", conversion, "\n");
           // Convert from a flat list of eager tensors back to the
           // parameter variables.
-          Unflatten("    ", output_sizes, inputs_var, &result_);
+          Unflatten(indentation, output_sizes, inputs_var, &result_);
           std::vector<string> p;
           for (int j : arg_list->second) {
-            p.emplace_back(param_names_[j]);
+            p.emplace_back(param_names_[j].GetRenameTo());
           }
-          strings::StrAppend(&result_, "    ", VectorToTuple(p), " = ",
+          strings::StrAppend(&result_, indentation, VectorToTuple(p), " = ",
                              inputs_var, "\n");
         }
       } else if (attr.type() == "list(type)") {
@@ -608,38 +838,38 @@ void GenEagerPythonOp::AddEagerInferredAttrs() {
           std::vector<string> lists;
           for (auto iter = arg_list->second.begin();
                iter != arg_list->second.end(); ++iter) {
-            lists.push_back(param_names_[*iter]);
+            lists.push_back(param_names_[*iter].GetRenameTo());
           }
           inputs_var = VectorToTuple(lists);
           conversion = "_execute.args_to_mixed_eager_tensors";
         } else {
           // For one list(tensor) argument, we just convert every
           // element of the list to an eager tensor.
-          inputs_var = param_names_[arg_list->second.front()];
+          inputs_var = param_names_[arg_list->second.front()].GetRenameTo();
           conversion = "_execute.convert_to_mixed_eager_tensors";
         }
-        strings::StrAppend(&result_, "    ", var_name, ", ", inputs_var, " = ",
-                           conversion, "(", inputs_var, ", _ctx)\n");
+        strings::StrAppend(&result_, indentation, var_name, ", ", inputs_var,
+                           " = ", conversion, "(", inputs_var, ", _ctx)\n");
       }
     }
   }
 }
 
-void GenEagerPythonOp::AddEagerInputCasts() {
+void GenEagerPythonOp::AddEagerInputCasts(const string& indentation) {
   // Cast remaining args to eager tensors
   for (int i = 0; i < op_def_.input_arg_size(); ++i) {
     const auto& arg(op_def_.input_arg(i));
     if (!arg.type_attr().empty() || !arg.type_list_attr().empty()) continue;
-    const string& param = param_names_[i];
+    const string& param = param_names_[i].GetRenameTo();
     const string fn = arg.number_attr().empty() ? "" : "n_";
     const string dtype =
         python_op_gen_internal::DataTypeToPython(arg.type(), "_dtypes.");
-    strings::StrAppend(&result_, "    ", param, " = _ops.convert_", fn,
+    strings::StrAppend(&result_, indentation, param, " = _ops.convert_", fn,
                        "to_tensor(", param, ", ", dtype, ")\n");
   }
 }
 
-void GenEagerPythonOp::AddEagerAttrs() {
+void GenEagerPythonOp::AddEagerAttrs(const string& indentation) {
   // Compute eager attrs
   if (op_def_.attr_size() > 0) {
     string attr_values;
@@ -651,14 +881,19 @@ void GenEagerPythonOp::AddEagerAttrs() {
     }
     strings::StrAppend(&attr_values, ")");
     strings::StrAppend(
-        &result_, WordWrap("    _attrs = (", attr_values, kRightMargin), "\n");
+        &result_,
+        WordWrap(indentation, strings::StrCat("_attrs = (", attr_values),
+                 kRightMargin),
+        "\n");
   } else {
-    strings::StrAppend(&result_, "    _attrs = None\n");
+    strings::StrAppend(&result_, indentation, "_attrs = None\n");
   }
 }
 
-void GenEagerPythonOp::AddEagerExecute(const string& num_outputs_expr) {
-  const string return_prefix = "    _result = _execute.execute(";
+void GenEagerPythonOp::AddEagerExecute(const string& indentation,
+                                       const string& num_outputs_expr) {
+  const string return_prefix =
+      strings::StrCat(indentation, "_result = _execute.execute(");
   const string return_args = strings::StrCat(
       "b\"", op_def_.name(), "\", ", num_outputs_expr,
       ", inputs=_inputs_flat, attrs=_attrs, ctx=_ctx, name=name)");
@@ -679,8 +914,8 @@ string GetEagerPythonOps(const OpList& ops, const ApiDefMap& api_defs,
 This file is MACHINE GENERATED! Do not edit.
 )");
 
-  // Mention the original source file so someone tracing back through generated
-  // Python code will know where to look next.
+  // Mention the original source file so someone tracing back through
+  // generated Python code will know where to look next.
   if (!source_file_name.empty()) {
     strings::StrAppend(&result, "Original C++ source file: ");
     strings::StrAppend(&result, source_file_name);
@@ -690,11 +925,14 @@ This file is MACHINE GENERATED! Do not edit.
   strings::StrAppend(&result, R"("""
 
 import collections as _collections
+import six as _six
 
-from tensorflow.python.eager import execute as _execute
+from tensorflow.python import pywrap_tensorflow as _pywrap_tensorflow
 from tensorflow.python.eager import context as _context
 from tensorflow.python.eager import core as _core
+from tensorflow.python.eager import execute as _execute
 from tensorflow.python.framework import dtypes as _dtypes
+from tensorflow.python.framework import errors as _errors
 from tensorflow.python.framework import tensor_shape as _tensor_shape
 
 from tensorflow.core.framework import op_def_pb2 as _op_def_pb2
@@ -712,11 +950,21 @@ from tensorflow.python.util.tf_export import tf_export
   auto out = cleaned_ops.mutable_op();
   out->Reserve(ops.op_size());
   for (const auto& op_def : ops.op()) {
-    bool is_hidden = false;
-    for (const string& hidden : hidden_ops) {
-      if (op_def.name() == hidden) {
-        is_hidden = true;
-        break;
+    const auto* api_def = api_defs.GetApiDef(op_def.name());
+
+    if (api_def->visibility() == ApiDef::SKIP) {
+      continue;
+    }
+
+    // An op is hidden if either its ApiDef visibility is HIDDEN
+    // or it is in the hidden_ops list.
+    bool is_hidden = api_def->visibility() == ApiDef::HIDDEN;
+    if (!is_hidden) {
+      for (const string& hidden : hidden_ops) {
+        if (op_def.name() == hidden) {
+          is_hidden = true;
+          break;
+        }
       }
     }
 
@@ -733,7 +981,6 @@ from tensorflow.python.util.tf_export import tf_export
       continue;
     }
 
-    const auto* api_def = api_defs.GetApiDef(op_def.name());
     strings::StrAppend(&result,
                        GetEagerPythonOp(op_def, *api_def, function_name));
 
diff --git a/tensorflow/python/eager/python_eager_op_gen.h b/tensorflow/python/eager/python_eager_op_gen.h
index f9dfdf0408f2ea0cf72631e67266ec445b98a868..d27b00139d129aba1c511a21afce749eae8b32ed 100644
--- a/tensorflow/python/eager/python_eager_op_gen.h
+++ b/tensorflow/python/eager/python_eager_op_gen.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef THIRD_PARTY_TENSORFLOW_PYTHON_EAGER_PYTHON_EAGER_OP_GEN_H_
-#define THIRD_PARTY_TENSORFLOW_PYTHON_EAGER_PYTHON_EAGER_OP_GEN_H_
+#ifndef TENSORFLOW_PYTHON_EAGER_PYTHON_EAGER_OP_GEN_H_
+#define TENSORFLOW_PYTHON_EAGER_PYTHON_EAGER_OP_GEN_H_
 
 #include <string>
 #include <vector>
@@ -40,4 +40,4 @@ string GetEagerPythonWrappers(const char* op_list_buf, size_t op_list_len);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_PYTHON_EAGER_PYTHON_EAGER_OP_GEN_H_
+#endif  // TENSORFLOW_PYTHON_EAGER_PYTHON_EAGER_OP_GEN_H_
diff --git a/tensorflow/python/eager/python_eager_op_gen_main.cc b/tensorflow/python/eager/python_eager_op_gen_main.cc
deleted file mode 100644
index cd74c438ec6f5cd7f807a7205f76eff7421aeb74..0000000000000000000000000000000000000000
--- a/tensorflow/python/eager/python_eager_op_gen_main.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/python/eager/python_eager_op_gen.h"
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_def.pb.h"
-#include "tensorflow/core/framework/op_gen_lib.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/init_main.h"
-
-namespace tensorflow {
-namespace {
-
-constexpr char kBaseApiDef[] =
-    "tensorflow/core/api_def/base_api/*.pbtxt";
-constexpr char kPythonApiDef[] =
-    "tensorflow/core/api_def/python_api/*.pbtxt";
-constexpr bool kUseApiDef = false;
-
-void PrintAllPythonOps(const std::vector<string>& hidden_ops) {
-  OpList ops;
-  OpRegistry::Global()->Export(false, &ops);
-
-  ApiDefMap api_def_map(ops);
-  if (kUseApiDef) {
-    Env* env = Env::Default();
-
-    std::vector<string> base_api_files;
-    std::vector<string> python_api_files;
-    TF_CHECK_OK(env->GetMatchingPaths(kBaseApiDef, &base_api_files));
-    TF_CHECK_OK(env->GetMatchingPaths(kPythonApiDef, &python_api_files));
-
-    TF_CHECK_OK(api_def_map.LoadFileList(env, base_api_files));
-    TF_CHECK_OK(api_def_map.LoadFileList(env, python_api_files));
-  }
-  PrintEagerPythonOps(ops, api_def_map, hidden_ops, true /* require_shapes */);
-}
-
-}  // namespace
-}  // namespace tensorflow
-
-int main(int argc, char* argv[]) {
-  tensorflow::port::InitMain(argv[0], &argc, &argv);
-
-  if (argc == 1) {
-    tensorflow::PrintAllPythonOps({});
-  } else {
-    return -1;
-  }
-  return 0;
-}
diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index 91192fea62dd3b0f94350a9b25ce8568e248e7e3..6fa076507d11ab9c88891cbeb0a4fb3959e4e99d 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -332,7 +332,7 @@ void EagerTensor_dealloc(EagerTensor* self) {
   tensorflow::ClearDecrefCache();
   auto id = self->id;
   Py_TYPE(self)->tp_free(self);
-  TFE_Py_TapeStackDeleteTrace(id);
+  TFE_Py_TapeSetDeleteTrace(id);
 }
 
 // Getter for `_id`.
diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h
index a33b17ada6f94e43ac16696c502be4b885e9d33a..16b7d1a119a409d1d0a77b220d5d0945b280b638 100644
--- a/tensorflow/python/eager/pywrap_tfe.h
+++ b/tensorflow/python/eager/pywrap_tfe.h
@@ -47,8 +47,18 @@ void TFE_Py_Execute(TFE_Context* ctx, const char* device_name,
 
 // Registers e as the Exception class for handling not ok Status. Returns
 // Py_None if registration succeeds, else throws a TypeError and returns NULL.
+//
+// This function is not thread-safe.
 PyObject* TFE_Py_RegisterExceptionClass(PyObject* e);
 
+// Registers e as the Exception to be raised when the conditions of
+// TFE_Py_FastPathExecute_C have not been met. When this exception is set, it
+// is a signal to the calling code that it should fall back to the safer (and
+// more complete) code path.
+//
+// This function is not thread-safe.
+PyObject* TFE_Py_RegisterFallbackExceptionClass(PyObject* e);
+
 // Returns 0 if 'status' is TF_OK. Otherwise, raises an exception (using
 // `exception` if not nullptr, else using the class registered via
 // TFE_Py_RegisterExceptionClass), and returns -1.
@@ -87,22 +97,25 @@ TFE_TensorHandle* EagerTensor_Handle(const PyObject* o);
 // newly created type, or nullptr on error.
 PyObject* TFE_Py_InitEagerTensor(PyObject* base_class);
 
-// Pushes a new tape into the thread-local stack.
-// `persistent` must be a PyBool_Type, i.e either Py_True or Py_False
-void TFE_Py_TapeStackPushNew(PyObject* persistent);
+// Creates a new tape and adds it to the active set. `persistent` must be a
+// PyBool_Type, i.e either Py_True or Py_False
+PyObject* TFE_Py_TapeSetNew(PyObject* persistent);
 
-// Pops the tape from the top of the stack and returns it.
-PyObject* TFE_Py_TapeStackPop();
-
-// Pushes an existing tape onto the stack.
-void TFE_Py_TapeStackPush(PyObject* tape);
+// Removes the passed tape from the set of active tapes.
+void TFE_Py_TapeSetRemove(PyObject* tape);
 
 // Returns true if the tape stack is empty.
-PyObject* TFE_Py_TapeStackIsEmpty();
+PyObject* TFE_Py_TapeSetIsEmpty();
+
+PyObject* TFE_Py_TapeSetShouldRecord(PyObject* tensors);
+void TFE_Py_TapeSetWatch(PyObject* tensor);
+void TFE_Py_TapeSetDeleteTrace(tensorflow::int64 tensor_id);
 
-PyObject* TFE_Py_TapeStackShouldRecord(PyObject* tensors);
-void TFE_Py_TapeStackWatch(PyObject* tensor);
-void TFE_Py_TapeStackDeleteTrace(tensorflow::int64 tensor_id);
+// Stops any gradient recording on the current thread.
+void TFE_Py_TapeSetStopOnThread();
+
+// Restarts gradient recording on the current thread.
+void TFE_Py_TapeSetRestartOnThread();
 
 // Records an operation in the gradient tape stack.type is a string for the
 // operation type, used in the backprop code. output_tensors should be a list of
@@ -111,13 +124,12 @@ void TFE_Py_TapeStackDeleteTrace(tensorflow::int64 tensor_id);
 // operation. backward_function should be the function to be called during
 // backprop to, given the gradients of the output tensors, produce the gradients
 // of the input tensors.
-void TFE_Py_TapeStackRecordOperation(PyObject* op_type,
-                                     PyObject* output_tensors,
-                                     PyObject* input_tensor_ids,
-                                     PyObject* backward_function);
+void TFE_Py_TapeSetRecordOperation(PyObject* op_type, PyObject* output_tensors,
+                                   PyObject* input_tensor_ids,
+                                   PyObject* backward_function);
 
 // Watches the given variable object on the given tape.
-void TFE_Py_TapeStackWatchVariable(PyObject* variable);
+void TFE_Py_TapeSetWatchVariable(PyObject* variable);
 
 // Computes a gradient based on information recorded on the tape.`tape` must
 // have been produced by TFE_Py_NewTape. `vspace` must be a
@@ -129,6 +141,30 @@ PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* vspace,
                               PyObject* target, PyObject* sources,
                               PyObject* output_gradients, TF_Status* status);
 
+// Execute a tensorflow operation assuming that all provided inputs are
+// correctly formatted (i.e. EagerTensors). If it doesn't find EagerTensors,
+// it will simply fail with a NotImplementedError.
+//
+// The first PyObject* is unused.
+// The "args" PyObject* is meant to be a tuple with the following structure:
+//  Item 1: The TFE Context
+//  Item 2: device_name: Name of the device on which to execute the operation,
+//          or NULL for automatic selection.
+//  Item 3: op_name: Name of the TensorFlow op to execute.
+//  Item 4: record_gradient_callback: Callback that records the gradient of the
+//          result. The callback takes (op_name, inputs, attrs, result, name)
+//          - all sequences and records the gradient.
+//  Item 5: name: An optional name for the operation.
+//  Item 6: List representing all callbacks to execute after successful
+//  op execute.
+//  Item 7 onwards: inputs - This is a list of inputs followed by a list of
+//        attrs. It is not necessary for type attrs to be present.
+//
+// This is named _C since there doesn't seem to be any way to make it visible
+// in the SWIG interface without renaming due to the use of the %native
+// directive.
+PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args);
+
 // Returns the set of variables watched by the given tape.
 PyObject* TFE_Py_TapeWatchedVariables(PyObject* tape);
 
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index ce823cb5679462e28efa925ab98100bbe864ad9b..cabbcc48fd56563a50591cc6adabc3af75918401 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -21,12 +21,18 @@ limitations under the License.
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/eager/c_api_internal.h"
 #include "tensorflow/c/eager/tape.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/gtl/compactptrset.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/python/eager/pywrap_tensor.h"
 
 using tensorflow::string;
+using tensorflow::strings::Printf;
 
 namespace {
 
@@ -52,10 +58,22 @@ PARSE_VALUE(ParseInt64Value, int64_t, PyLong_Check, PyLong_AsLong)
 #else
 PARSE_VALUE(ParseIntValue, int, PyInt_Check, PyInt_AsLong)
 PARSE_VALUE(ParseInt64Value, int64_t, PyInt_Check, PyInt_AsLong)
+PARSE_VALUE(ParseInt64LongValue, int64_t, PyLong_Check, PyLong_AsLong)
 #endif
 PARSE_VALUE(ParseFloatValue, float, PyFloat_Check, PyFloat_AsDouble)
 #undef PARSE_VALUE
 
+Py_ssize_t TensorShapeNumDims(PyObject* value) {
+  const auto size = PySequence_Size(value);
+  if (size == -1) {
+    // TensorShape.__len__ raises an error in the scenario where the shape is an
+    // unknown, which needs to be cleared.
+    // TODO(nareshmodi): ensure that this is actually a TensorShape.
+    PyErr_Clear();
+  }
+  return size;
+}
+
 bool ParseStringValue(const string& key, PyObject* py_value, TF_Status* status,
                       const char** value) {
   if (PyBytes_Check(py_value)) {
@@ -82,32 +100,40 @@ bool ParseBoolValue(const string& key, PyObject* py_value, TF_Status* status,
   return true;
 }
 
-const char* ParseProtoValue(const string& key, const char* proto_name,
-                            PyObject* py_value, size_t* size,
-                            TF_Status* status) {
-  char* output = nullptr;
-  Py_ssize_t py_size;
-  if (PyBytes_Check(py_value) &&
-      PyBytes_AsStringAndSize(py_value, &output, &py_size) >= 0) {
-    *size = static_cast<size_t>(py_size);
-    return output;
-  }
+bool IsInteger(PyObject* py_value) {
 #if PY_MAJOR_VERSION >= 3
-  if (PyUnicode_Check(py_value) &&
-      (output = PyUnicode_AsUTF8AndSize(py_value, &py_size)) != nullptr) {
-    *size = static_cast<size_t>(py_size);
-    return output;
-  }
+  return PyLong_Check(py_value);
+#else
+  return PyInt_Check(py_value);
 #endif
-  TF_SetStatus(status, TF_INVALID_ARGUMENT,
-               tensorflow::strings::StrCat("Expecting a string (serialized ",
-                                           proto_name, ") value for attr ", key)
-                   .c_str());
-  return nullptr;
 }
 
-bool SetOpAttrList(TFE_Op* op, const char* key, PyObject* py_list,
-                   TF_AttrType type, TF_Status* status) {
+// The passed in py_value is expected to be an object of the python type
+// dtypes.DType or an int.
+bool ParseTypeValue(const string& key, PyObject* py_value, TF_Status* status,
+                    int* value) {
+  if (IsInteger(py_value)) {
+    return ParseIntValue(key, py_value, status, value);
+  }
+
+  PyObject* py_type_enum = PyObject_GetAttrString(py_value, "_type_enum");
+  if (py_type_enum == nullptr) {
+    return false;
+  }
+
+  if (!ParseIntValue(key, py_type_enum, status, value)) {
+    Py_DECREF(py_type_enum);
+    return false;
+  }
+
+  Py_DECREF(py_type_enum);
+  return true;
+}
+
+bool SetOpAttrList(
+    TFE_Op* op, const char* key, PyObject* py_list, TF_AttrType type,
+    tensorflow::gtl::FlatMap<string, tensorflow::int64>* attr_list_sizes,
+    TF_Status* status) {
   if (!PySequence_Check(py_list)) {
     TF_SetStatus(
         status, TF_INVALID_ARGUMENT,
@@ -117,6 +143,7 @@ bool SetOpAttrList(TFE_Op* op, const char* key, PyObject* py_list,
     return false;
   }
   const int num_values = PySequence_Size(py_list);
+  if (attr_list_sizes != nullptr) (*attr_list_sizes)[key] = num_values;
 
 #define PARSE_LIST(c_type, parse_fn)                                \
   std::unique_ptr<c_type[]> values(new c_type[num_values]);         \
@@ -138,7 +165,7 @@ bool SetOpAttrList(TFE_Op* op, const char* key, PyObject* py_list,
     PARSE_LIST(unsigned char, ParseBoolValue);
     TFE_OpSetAttrBoolList(op, key, values.get(), num_values);
   } else if (type == TF_ATTR_TYPE) {
-    PARSE_LIST(int, ParseIntValue);
+    PARSE_LIST(int, ParseTypeValue);
     TFE_OpSetAttrTypeList(op, key,
                           reinterpret_cast<const TF_DataType*>(values.get()),
                           num_values);
@@ -158,8 +185,10 @@ bool SetOpAttrList(TFE_Op* op, const char* key, PyObject* py_list,
                   .c_str());
           return false;
         }
-        const auto size = PySequence_Size(py_value);
-        total_dims += size;
+        const auto size = TensorShapeNumDims(py_value);
+        if (size >= 0) {
+          total_dims += size;
+        }
       }
     }
     // Allocate a buffer that can fit all of the dims together.
@@ -175,7 +204,12 @@ bool SetOpAttrList(TFE_Op* op, const char* key, PyObject* py_list,
         dims[i] = nullptr;
         num_dims[i] = -1;
       } else {
-        const auto size = PySequence_Size(py_value);
+        const auto size = TensorShapeNumDims(py_value);
+        if (size == -1) {
+          dims[i] = nullptr;
+          num_dims[i] = -1;
+          continue;
+        }
         dims[i] = offset;
         num_dims[i] = size;
         for (int j = 0; j < size; ++j) {
@@ -203,8 +237,123 @@ bool SetOpAttrList(TFE_Op* op, const char* key, PyObject* py_list,
   return true;
 }
 
-bool SetOpAttrScalar(TFE_Context* ctx, TFE_Op* op, const char* key,
-                     PyObject* py_value, TF_AttrType type, TF_Status* status) {
+// This is only declared here since GetFunc makes a recursive call to
+// SetOpAttrScalarDefault.
+void SetOpAttrScalarDefault(
+    TFE_Context* ctx, TFE_Op* op, const tensorflow::AttrValue& default_value,
+    const char* attr_name,
+    tensorflow::gtl::FlatMap<string, tensorflow::int64>* attr_list_sizes,
+    TF_Status* status);
+
+TFE_Op* GetFunc(TFE_Context* ctx, const tensorflow::NameAttrList& func,
+                TF_Status* status) {
+  TFE_Op* func_op = TFE_NewOp(ctx, func.name().data(), status);
+  for (const auto& attr : func.attr()) {
+    if (TF_GetCode(status) != TF_OK) return nullptr;
+    SetOpAttrScalarDefault(ctx, func_op, attr.second, attr.first.data(),
+                           nullptr, status);
+    if (TF_GetCode(status) != TF_OK) return nullptr;
+  }
+  return func_op;
+}
+
+void SetOpAttrListDefault(
+    TFE_Context* ctx, TFE_Op* op, const tensorflow::OpDef::AttrDef& attr,
+    const char* key, TF_AttrType type,
+    tensorflow::gtl::FlatMap<string, tensorflow::int64>* attr_list_sizes,
+    TF_Status* status) {
+  if (type == TF_ATTR_STRING) {
+    int num_values = attr.default_value().list().s_size();
+    std::unique_ptr<const char* []> values(new const char*[num_values]);
+    (*attr_list_sizes)[key] = num_values;
+    for (int i = 0; i < num_values; i++) {
+      values[i] = attr.default_value().list().s(i).data();
+    }
+    TFE_OpSetAttrStringList(op, key, values.get(), num_values);
+  } else if (type == TF_ATTR_INT) {
+    int num_values = attr.default_value().list().i_size();
+    std::unique_ptr<int64_t[]> values(new int64_t[num_values]);
+    (*attr_list_sizes)[key] = num_values;
+    for (int i = 0; i < num_values; i++) {
+      values[i] = attr.default_value().list().i(i);
+    }
+    TFE_OpSetAttrIntList(op, key, values.get(), num_values);
+  } else if (type == TF_ATTR_FLOAT) {
+    int num_values = attr.default_value().list().f_size();
+    std::unique_ptr<float[]> values(new float[num_values]);
+    (*attr_list_sizes)[key] = num_values;
+    for (int i = 0; i < num_values; i++) {
+      values[i] = attr.default_value().list().f(i);
+    }
+    TFE_OpSetAttrFloatList(op, key, values.get(), num_values);
+  } else if (type == TF_ATTR_BOOL) {
+    int num_values = attr.default_value().list().b_size();
+    std::unique_ptr<unsigned char[]> values(new unsigned char[num_values]);
+    (*attr_list_sizes)[key] = num_values;
+    for (int i = 0; i < num_values; i++) {
+      values[i] = attr.default_value().list().b(i);
+    }
+    TFE_OpSetAttrBoolList(op, key, values.get(), num_values);
+  } else if (type == TF_ATTR_TYPE) {
+    int num_values = attr.default_value().list().type_size();
+    std::unique_ptr<int[]> values(new int[num_values]);
+    (*attr_list_sizes)[key] = num_values;
+    for (int i = 0; i < num_values; i++) {
+      values[i] = attr.default_value().list().type(i);
+    }
+    TFE_OpSetAttrTypeList(op, key,
+                          reinterpret_cast<const TF_DataType*>(values.get()),
+                          attr.default_value().list().type_size());
+  } else if (type == TF_ATTR_SHAPE) {
+    int num_values = attr.default_value().list().shape_size();
+    (*attr_list_sizes)[key] = num_values;
+    int total_dims = 0;
+    for (int i = 0; i < num_values; ++i) {
+      if (!attr.default_value().list().shape(i).unknown_rank()) {
+        total_dims += attr.default_value().list().shape(i).dim_size();
+      }
+    }
+    // Allocate a buffer that can fit all of the dims together.
+    std::unique_ptr<int64_t[]> buffer(new int64_t[total_dims]);
+    // Copy the input dims into the buffer and set dims to point to
+    // the start of each list's dims.
+    std::unique_ptr<const int64_t* []> dims(new const int64_t*[num_values]);
+    std::unique_ptr<int[]> num_dims(new int[num_values]);
+    int64_t* offset = buffer.get();
+    for (int i = 0; i < num_values; ++i) {
+      const auto& shape = attr.default_value().list().shape(i);
+      if (shape.unknown_rank()) {
+        dims[i] = nullptr;
+        num_dims[i] = -1;
+      } else {
+        for (int j = 0; j < shape.dim_size(); j++) {
+          *offset = shape.dim(j).size();
+          ++offset;
+        }
+      }
+    }
+    TFE_OpSetAttrShapeList(op, key, dims.get(), num_dims.get(), num_values,
+                           status);
+  } else if (type == TF_ATTR_FUNC) {
+    int num_values = attr.default_value().list().func_size();
+    (*attr_list_sizes)[key] = num_values;
+    std::unique_ptr<const TFE_Op* []> funcs(new const TFE_Op*[num_values]);
+    for (int i = 0; i < num_values; i++) {
+      funcs[i] = GetFunc(ctx, attr.default_value().list().func(i), status);
+    }
+    TFE_OpSetAttrFunctionList(op, key, funcs.get(), num_values);
+  } else {
+    TF_SetStatus(status, TF_UNIMPLEMENTED,
+                 "Lists of tensors are not yet implemented for default valued "
+                 "attributes for an operation.");
+  }
+}
+
+bool SetOpAttrScalar(
+    TFE_Context* ctx, TFE_Op* op, const char* key, PyObject* py_value,
+    TF_AttrType type,
+    tensorflow::gtl::FlatMap<string, tensorflow::int64>* attr_list_sizes,
+    TF_Status* status) {
   if (type == TF_ATTR_STRING) {
     const char* value;
     if (!ParseStringValue(key, py_value, status, &value)) return false;
@@ -213,6 +362,10 @@ bool SetOpAttrScalar(TFE_Context* ctx, TFE_Op* op, const char* key,
     int64_t value;
     if (!ParseInt64Value(key, py_value, status, &value)) return false;
     TFE_OpSetAttrInt(op, key, value);
+    // attr_list_sizes is set for all int attributes (since at this point we are
+    // not aware if that attribute might be used to calculate the size of an
+    // output list or not).
+    if (attr_list_sizes != nullptr) (*attr_list_sizes)[key] = value;
   } else if (type == TF_ATTR_FLOAT) {
     float value;
     if (!ParseFloatValue(key, py_value, status, &value)) return false;
@@ -223,7 +376,7 @@ bool SetOpAttrScalar(TFE_Context* ctx, TFE_Op* op, const char* key,
     TFE_OpSetAttrBool(op, key, value);
   } else if (type == TF_ATTR_TYPE) {
     int value;
-    if (!ParseIntValue(key, py_value, status, &value)) return false;
+    if (!ParseTypeValue(key, py_value, status, &value)) return false;
     TFE_OpSetAttrType(op, key, static_cast<TF_DataType>(value));
   } else if (type == TF_ATTR_SHAPE) {
     if (py_value == Py_None) {
@@ -237,7 +390,11 @@ bool SetOpAttrScalar(TFE_Context* ctx, TFE_Op* op, const char* key,
                          .c_str());
         return false;
       }
-      const auto num_dims = PySequence_Size(py_value);
+      const auto num_dims = TensorShapeNumDims(py_value);
+      if (num_dims == -1) {
+        TFE_OpSetAttrShape(op, key, nullptr, -1, status);
+        return true;
+      }
       std::unique_ptr<int64_t[]> dims(new int64_t[num_dims]);
       for (int i = 0; i < num_dims; ++i) {
         auto inner_py_value = PySequence_ITEM(py_value, i);
@@ -289,14 +446,71 @@ bool SetOpAttrScalar(TFE_Context* ctx, TFE_Op* op, const char* key,
   return true;
 }
 
-void SetOpAttrs(TFE_Context* ctx, TFE_Op* op, PyObject* attrs,
+void SetOpAttrScalarDefault(
+    TFE_Context* ctx, TFE_Op* op, const tensorflow::AttrValue& default_value,
+    const char* attr_name,
+    tensorflow::gtl::FlatMap<string, tensorflow::int64>* attr_list_sizes,
+    TF_Status* status) {
+  switch (default_value.value_case()) {
+    case tensorflow::AttrValue::kS:
+      TFE_OpSetAttrString(op, attr_name, default_value.s().data());
+      break;
+    case tensorflow::AttrValue::kI:
+      TFE_OpSetAttrInt(op, attr_name, static_cast<int64_t>(default_value.i()));
+      (*attr_list_sizes)[attr_name] = default_value.i();
+      break;
+    case tensorflow::AttrValue::kF:
+      TFE_OpSetAttrFloat(op, attr_name, default_value.f());
+      break;
+    case tensorflow::AttrValue::kB:
+      TFE_OpSetAttrBool(op, attr_name, default_value.b());
+      break;
+    case tensorflow::AttrValue::kType:
+      TFE_OpSetAttrType(op, attr_name,
+                        static_cast<TF_DataType>(default_value.type()));
+      break;
+    case tensorflow::AttrValue::kShape: {
+      const auto& tensor_shape = default_value.shape();
+      if (tensor_shape.unknown_rank()) {
+        TFE_OpSetAttrShape(op, attr_name, nullptr, -1, status);
+      } else {
+        const auto num_dims = tensor_shape.dim_size();
+        std::unique_ptr<int64_t[]> dims(new int64_t[num_dims]);
+        for (int i = 0; i < num_dims; ++i) {
+          dims[i] = tensor_shape.dim(i).size();
+        }
+        TFE_OpSetAttrShape(op, attr_name, dims.get(), num_dims, status);
+      }
+    } break;
+    case tensorflow::AttrValue::kFunc: {
+      const auto func_op = GetFunc(ctx, default_value.func(), status);
+      if (TF_GetCode(status) != TF_OK) return;
+      // TODO(nareshmodi): TFE_OpSetAttrFunction and TFE_OpSetAttrFunctionList
+      // require TFE_Op* and just convert it internally a NameAttrValue, so
+      // consider adding an overload to the C API to make this case easier.
+      TFE_OpSetAttrFunction(op, attr_name, func_op);
+    } break;
+    case tensorflow::AttrValue::kList:
+      TF_FALLTHROUGH_INTENDED;
+    case tensorflow::AttrValue::kTensor:
+      TF_FALLTHROUGH_INTENDED;
+    case tensorflow::AttrValue::kPlaceholder:
+      TF_FALLTHROUGH_INTENDED;
+    case tensorflow::AttrValue::VALUE_NOT_SET:
+      TF_SetStatus(
+          status, TF_UNIMPLEMENTED,
+          tensorflow::strings::StrCat("Unable to get setfor default value: ",
+                                      default_value.DebugString())
+              .data());
+  }
+}
+
+// start_index is the index at which the Tuple/List attrs will start getting
+// processed.
+void SetOpAttrs(TFE_Context* ctx, TFE_Op* op, PyObject* attrs, int start_index,
                 TF_Status* out_status) {
   if (attrs == Py_None) return;
-  if (!PyTuple_Check(attrs)) {
-    TF_SetStatus(out_status, TF_INVALID_ARGUMENT, "Expecting an attrs tuple.");
-    return;
-  }
-  Py_ssize_t len = PyTuple_GET_SIZE(attrs);
+  Py_ssize_t len = PyTuple_GET_SIZE(attrs) - start_index;
   if ((len & 1) != 0) {
     TF_SetStatus(out_status, TF_INVALID_ARGUMENT,
                  "Expecting attrs tuple to have even length.");
@@ -304,8 +518,8 @@ void SetOpAttrs(TFE_Context* ctx, TFE_Op* op, PyObject* attrs,
   }
   // Parse attrs
   for (Py_ssize_t i = 0; i < len; i += 2) {
-    PyObject* py_key = PyTuple_GET_ITEM(attrs, i);
-    PyObject* py_value = PyTuple_GET_ITEM(attrs, i + 1);
+    PyObject* py_key = PyTuple_GET_ITEM(attrs, start_index + i);
+    PyObject* py_value = PyTuple_GET_ITEM(attrs, start_index + i + 1);
 #if PY_MAJOR_VERSION >= 3
     const char* key = PyBytes_Check(py_key) ? PyBytes_AsString(py_key)
                                             : PyUnicode_AsUTF8(py_key);
@@ -316,9 +530,40 @@ void SetOpAttrs(TFE_Context* ctx, TFE_Op* op, PyObject* attrs,
     const TF_AttrType type = TFE_OpGetAttrType(op, key, &is_list, out_status);
     if (TF_GetCode(out_status) != TF_OK) return;
     if (is_list != 0) {
-      if (!SetOpAttrList(op, key, py_value, type, out_status)) return;
+      if (!SetOpAttrList(op, key, py_value, type, nullptr, out_status)) return;
+    } else {
+      if (!SetOpAttrScalar(ctx, op, key, py_value, type, nullptr, out_status))
+        return;
+    }
+  }
+}
+
+// This function will set the op attrs required. If an attr has the value of
+// None, then it will read the AttrDef to get the default value and set that
+// instead. Any failure in this function will simply fall back to the slow
+// path.
+void SetOpAttrWithDefaults(
+    TFE_Context* ctx, TFE_Op* op, const tensorflow::OpDef::AttrDef& attr,
+    const char* attr_name, PyObject* attr_value,
+    tensorflow::gtl::FlatMap<string, tensorflow::int64>* attr_list_sizes,
+    TF_Status* status) {
+  unsigned char is_list = 0;
+  const TF_AttrType type = TFE_OpGetAttrType(op, attr_name, &is_list, status);
+  if (TF_GetCode(status) != TF_OK) return;
+  if (attr_value == Py_None) {
+    if (is_list != 0) {
+      SetOpAttrListDefault(ctx, op, attr, attr_name, type, attr_list_sizes,
+                           status);
     } else {
-      if (!SetOpAttrScalar(ctx, op, key, py_value, type, out_status)) return;
+      SetOpAttrScalarDefault(ctx, op, attr.default_value(), attr_name,
+                             attr_list_sizes, status);
+    }
+  } else {
+    if (is_list != 0) {
+      SetOpAttrList(op, attr_name, attr_value, type, attr_list_sizes, status);
+    } else {
+      SetOpAttrScalar(ctx, op, attr_name, attr_value, type, attr_list_sizes,
+                      status);
     }
   }
 }
@@ -327,8 +572,11 @@ void SetOpAttrs(TFE_Context* ctx, TFE_Op* op, PyObject* attrs,
 tensorflow::mutex exception_class_mutex(tensorflow::LINKER_INITIALIZED);
 PyObject* exception_class GUARDED_BY(exception_class_mutex) = nullptr;
 
-static tensorflow::mutex _uid_mutex(tensorflow::LINKER_INITIALIZED);
-static tensorflow::int64 _uid GUARDED_BY(_uid_mutex) = 0;
+// Python subclass of Exception that is created to signal fallback.
+PyObject* fallback_exception_class = nullptr;
+
+tensorflow::mutex _uid_mutex(tensorflow::LINKER_INITIALIZED);
+tensorflow::int64 _uid GUARDED_BY(_uid_mutex) = 0;
 
 }  // namespace
 
@@ -346,7 +594,7 @@ void TFE_Py_Execute(TFE_Context* ctx, const char* device_name,
     }
   }
   if (TF_GetCode(out_status) == TF_OK) {
-    SetOpAttrs(ctx, op, attrs, out_status);
+    SetOpAttrs(ctx, op, attrs, 0, out_status);
   }
   Py_BEGIN_ALLOW_THREADS;
   if (TF_GetCode(out_status) == TF_OK) {
@@ -382,6 +630,37 @@ PyObject* TFE_Py_RegisterExceptionClass(PyObject* e) {
   }
 }
 
+PyObject* TFE_Py_RegisterFallbackExceptionClass(PyObject* e) {
+  if (fallback_exception_class != nullptr) {
+    Py_DECREF(fallback_exception_class);
+  }
+  if (PyObject_IsSubclass(e, PyExc_Exception) <= 0) {
+    fallback_exception_class = nullptr;
+    PyErr_SetString(PyExc_TypeError,
+                    "TFE_Py_RegisterFallbackExceptionClass: "
+                    "Registered class should be subclass of Exception.");
+    return nullptr;
+  } else {
+    Py_INCREF(e);
+    fallback_exception_class = e;
+    Py_RETURN_NONE;
+  }
+}
+
+void RaiseFallbackException(const char* message) {
+  if (fallback_exception_class != nullptr) {
+    PyErr_SetObject(fallback_exception_class, Py_BuildValue("s", message));
+    return;
+  }
+
+  PyErr_SetString(
+      PyExc_RuntimeError,
+      tensorflow::strings::StrCat(
+          "Fallback exception type not set, attempting to fallback due to ",
+          message)
+          .data());
+}
+
 int MaybeRaiseExceptionFromTFStatus(TF_Status* status, PyObject* exception) {
   if (TF_GetCode(status) == TF_OK) return 0;
   const char* msg = TF_Message(status);
@@ -472,9 +751,19 @@ class GradientTape
   explicit GradientTape(bool persistent)
       : tensorflow::eager::GradientTape<PyObject, PyObject>(persistent) {}
 
+  virtual ~GradientTape() {
+    for (PyObject* v : watched_variables_) {
+      Py_DECREF(v);
+    }
+  }
+
   void WatchVariable(PyObject* v) {
-    watched_variables_.insert(v);
-    Py_INCREF(v);
+    auto insert_result = watched_variables_.insert(v);
+    if (insert_result.second) {
+      // Only increment the reference count if we aren't already watching this
+      // variable.
+      Py_INCREF(v);
+    }
     PyObject* handle = PyObject_GetAttrString(v, "handle");
     if (handle == nullptr) {
       return;
@@ -528,65 +817,95 @@ static PyTypeObject TFE_Py_Tape_Type = {
     "TFE_Py_Tape objects",                        /* tp_doc */
 };
 
+// Note: in the current design no mutex is needed here because of the python
+// GIL, which is always held when any TFE_Py_* methods are called. We should
+// revisit this if/when decide to not hold the GIL while manipulating the tape
+// stack.
+static tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*>* tape_set = nullptr;
+tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*>* GetTapeSet() {
+  if (tape_set == nullptr) {
+    tape_set = new tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*>;
+  }
+  return tape_set;
+}
+
+// A safe copy of the current tapeset. Does not get affected by other python
+// threads changing the set of active tapes.
+class SafeTapeSet {
+ public:
+  SafeTapeSet() : tape_set_(*GetTapeSet()) {
+    for (auto* tape : tape_set_) {
+      Py_INCREF(tape);
+    }
+  }
+
+  ~SafeTapeSet() {
+    for (auto* tape : tape_set_) {
+      Py_DECREF(tape);
+    }
+  }
+
+  tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*>::const_iterator begin() {
+    return tape_set_.begin();
+  }
+
+  tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*>::const_iterator end() {
+    return tape_set_.end();
+  }
+
+ private:
+  tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*> tape_set_;
+};
+
 // xcode 7 doesn't define thread_local, so for compatibility we implement our
 // own. TODO(apassos) remove once we can deprecate xcode 7.
 #ifndef __APPLE__
-thread_local std::vector<TFE_Py_Tape*>* tape_stack = nullptr;
-std::vector<TFE_Py_Tape*>* GetTapeStack() {
-  if (tape_stack == nullptr) {
-    tape_stack = new std::vector<TFE_Py_Tape*>;
-  }
-  return tape_stack;
+bool* ThreadTapeIsStopped() {
+  thread_local bool thread_tape_is_stopped{false};
+  return &thread_tape_is_stopped;
 }
 #else
-static tensorflow::mutex stack_mu(tensorflow::LINKER_INITIALIZED);
-static std::unordered_map<std::thread::id, std::vector<TFE_Py_Tape*>*>*
-    tape_stack GUARDED_BY(stack_mu) = nullptr;
-std::vector<TFE_Py_Tape*>* GetTapeStack() {
-  tensorflow::mutex_lock ml(stack_mu);
-  if (tape_stack == nullptr) {
-    tape_stack =
-        new std::unordered_map<std::thread::id, std::vector<TFE_Py_Tape*>*>;
-  }
-  auto it = tape_stack->find(std::this_thread::get_id());
-  if (it != tape_stack->end()) {
-    return it->second;
-  }
-  return tape_stack
-      ->emplace(std::this_thread::get_id(), new std::vector<TFE_Py_Tape*>)
-      .first->second;
+static std::unordered_map<std::thread::id, bool>* tape_is_stopped = nullptr;
+bool* ThreadTapeIsStopped() {
+  if (tape_is_stopped == nullptr) {
+    tape_is_stopped = new std::unordered_map<std::thread::id, bool>;
+  }
+  auto it = tape_is_stopped->find(std::this_thread::get_id());
+  if (it != tape_is_stopped->end()) {
+    return &(it->second);
+  }
+  return &(tape_is_stopped->emplace(std::this_thread::get_id(), false)
+               .first->second);
 }
 #endif
 
-void TFE_Py_TapeStackPushNew(PyObject* persistent) {
+void TFE_Py_TapeSetStopOnThread() { *ThreadTapeIsStopped() = true; }
+
+void TFE_Py_TapeSetRestartOnThread() { *ThreadTapeIsStopped() = false; }
+
+PyObject* TFE_Py_TapeSetNew(PyObject* persistent) {
   TFE_Py_Tape_Type.tp_new = PyType_GenericNew;
-  if (PyType_Ready(&TFE_Py_Tape_Type) < 0) return;
+  if (PyType_Ready(&TFE_Py_Tape_Type) < 0) return nullptr;
   TFE_Py_Tape* tape = PyObject_NEW(TFE_Py_Tape, &TFE_Py_Tape_Type);
   tape->tape = new GradientTape(persistent == Py_True);
-  GetTapeStack()->push_back(tape);
-}
-
-void TFE_Py_TapeStackPush(PyObject* tape) {
   Py_INCREF(tape);
-  GetTapeStack()->push_back(reinterpret_cast<TFE_Py_Tape*>(tape));
+  GetTapeSet()->insert(reinterpret_cast<TFE_Py_Tape*>(tape));
+  return reinterpret_cast<PyObject*>(tape);
 }
 
-PyObject* TFE_Py_TapeStackIsEmpty() {
-  if (GetTapeStack()->empty()) {
+PyObject* TFE_Py_TapeSetIsEmpty() {
+  if (*ThreadTapeIsStopped() || GetTapeSet()->empty()) {
     Py_RETURN_TRUE;
   }
   Py_RETURN_FALSE;
 }
 
-PyObject* TFE_Py_TapeStackPop() {
-  auto* stack = GetTapeStack();
-  if (stack->empty()) {
-    PyErr_SetString(PyExc_RuntimeError, "tape stack is empty.");
-    return nullptr;
-  }
-  TFE_Py_Tape* top = stack->back();
-  stack->pop_back();
-  return reinterpret_cast<PyObject*>(top);
+void TFE_Py_TapeSetRemove(PyObject* tape) {
+  auto* stack = GetTapeSet();
+  stack->erase(reinterpret_cast<TFE_Py_Tape*>(tape));
+  // We kept a reference to the tape in the set to ensure it wouldn't get
+  // deleted under us; cleaning it up here.
+  Py_DECREF(tape);
 }
 
 static std::vector<tensorflow::int64> MakeIntList(PyObject* list) {
@@ -602,7 +921,11 @@ static std::vector<tensorflow::int64> MakeIntList(PyObject* list) {
   tensor_ids.reserve(len);
   for (int i = 0; i < len; ++i) {
     PyObject* item = PySequence_Fast_GET_ITEM(seq, i);
+#if PY_MAJOR_VERSION >= 3
     if (PyLong_Check(item)) {
+#else
+    if (PyLong_Check(item) || PyInt_Check(item)) {
+#endif
       tensorflow::int64 id = MakeInt(item);
       tensor_ids.push_back(id);
     } else {
@@ -613,12 +936,15 @@ static std::vector<tensorflow::int64> MakeIntList(PyObject* list) {
   return tensor_ids;
 }
 
-PyObject* TFE_Py_TapeStackShouldRecord(PyObject* tensors) {
+PyObject* TFE_Py_TapeSetShouldRecord(PyObject* tensors) {
   if (tensors == Py_None) {
     Py_RETURN_FALSE;
   }
-  auto* stack = GetTapeStack();
-  if (stack->empty()) {
+  if (*ThreadTapeIsStopped()) {
+    Py_RETURN_FALSE;
+  }
+  auto* tape_set_ptr = GetTapeSet();
+  if (tape_set_ptr->empty()) {
     Py_RETURN_FALSE;
   }
   PyObject* seq = PySequence_Fast(tensors, "expected a sequence");
@@ -635,7 +961,8 @@ PyObject* TFE_Py_TapeStackShouldRecord(PyObject* tensors) {
     tensor_ids.push_back(FastTensorId(item));
   }
   Py_DECREF(seq);
-  for (TFE_Py_Tape* tape : *stack) {
+  auto tape_set = *tape_set_ptr;
+  for (TFE_Py_Tape* tape : tape_set) {
     if (tape->tape->ShouldRecord(tensor_ids)) {
       Py_RETURN_TRUE;
     }
@@ -643,12 +970,15 @@ PyObject* TFE_Py_TapeStackShouldRecord(PyObject* tensors) {
   Py_RETURN_FALSE;
 }
 
-void TFE_Py_TapeStackWatch(PyObject* tensor) {
+void TFE_Py_TapeSetWatch(PyObject* tensor) {
+  if (*ThreadTapeIsStopped()) {
+    return;
+  }
   tensorflow::int64 tensor_id = FastTensorId(tensor);
   if (PyErr_Occurred()) {
     return;
   }
-  for (TFE_Py_Tape* tape : *GetTapeStack()) {
+  for (TFE_Py_Tape* tape : *GetTapeSet()) {
     tape->tape->Watch(tensor_id);
   }
 }
@@ -713,8 +1043,11 @@ std::vector<tensorflow::int64> MakeTensorIDList(PyObject* tensors) {
   return list;
 }
 
-void TFE_Py_TapeStackWatchVariable(PyObject* variable) {
-  for (TFE_Py_Tape* tape : *GetTapeStack()) {
+void TFE_Py_TapeSetWatchVariable(PyObject* variable) {
+  if (*ThreadTapeIsStopped()) {
+    return;
+  }
+  for (TFE_Py_Tape* tape : SafeTapeSet()) {
     tape->tape->WatchVariable(variable);
   }
 }
@@ -725,20 +1058,20 @@ PyObject* TFE_Py_TapeWatchedVariables(PyObject* tape) {
   PyObject* result = PySet_New(nullptr);
   for (PyObject* variable : watched_variables) {
     PySet_Add(result, variable);
-    Py_DECREF(variable);
   }
   return result;
 }
 
-void TFE_Py_TapeStackRecordOperation(PyObject* op_type,
-                                     PyObject* output_tensors,
-                                     PyObject* input_tensors,
-                                     PyObject* backward_function) {
-  auto* stack = GetTapeStack();
-  if (stack->empty()) {
+void TFE_Py_TapeSetRecordOperation(PyObject* op_type, PyObject* output_tensors,
+                                   PyObject* input_tensors,
+                                   PyObject* backward_function) {
+  if (GetTapeSet()->empty() || *ThreadTapeIsStopped()) {
     return;
   }
   std::vector<tensorflow::int64> input_ids = MakeTensorIDList(input_tensors);
+  if (PyErr_Occurred()) {
+    return;
+  }
   std::vector<tensorflow::eager::TapeTensor> output_info;
   PyObject* seq = PySequence_Fast(output_tensors,
                                   "expected a sequence of integer tensor ids");
@@ -770,7 +1103,7 @@ void TFE_Py_TapeStackRecordOperation(PyObject* op_type,
     return;
   }
 
-  for (TFE_Py_Tape* tape : *stack) {
+  for (TFE_Py_Tape* tape : SafeTapeSet()) {
     Py_INCREF(backward_function);
     tape->tape->RecordOperation(
         op_type_str, output_info, input_ids, backward_function,
@@ -778,8 +1111,8 @@ void TFE_Py_TapeStackRecordOperation(PyObject* op_type,
   }
 }
 
-void TFE_Py_TapeStackDeleteTrace(tensorflow::int64 tensor_id) {
-  for (TFE_Py_Tape* tape : *GetTapeStack()) {
+void TFE_Py_TapeSetDeleteTrace(tensorflow::int64 tensor_id) {
+  for (TFE_Py_Tape* tape : SafeTapeSet()) {
     tape->tape->DeleteTrace(tensor_id);
   }
 }
@@ -948,7 +1281,6 @@ std::vector<PyObject*> MakeTensorList(PyObject* tensors) {
   return list;
 }
 
-
 PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* vspace,
                               PyObject* target, PyObject* sources,
                               PyObject* output_gradients, TF_Status* status) {
@@ -1000,6 +1332,467 @@ PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* vspace,
     }
     return py_result;
   }
-  Py_INCREF(Py_None);
-  return Py_None;
+  return PyList_New(0);
+}
+
+namespace {
+static const int kFastPathExecuteInputStartIndex = 6;
+
+PyObject* GetPythonObjectFromString(const char* s) {
+#if PY_MAJOR_VERSION >= 3
+  return PyUnicode_FromString(s);
+#else
+  return PyBytes_FromString(s);
+#endif
+}
+
+bool CheckEagerTensors(PyObject* seq, int start_index,
+                       const tensorflow::OpDef& op_def) {
+  for (int i = 0; i < op_def.input_arg_size(); i++) {
+    PyObject* item = PyTuple_GET_ITEM(seq, i + start_index);
+    if (!op_def.input_arg(i).number_attr().empty() ||
+        !op_def.input_arg(i).type_list_attr().empty()) {
+      // This item should be a list input.
+      if (!PyList_Check(item)) return false;
+      for (Py_ssize_t j = 0; j < PyList_Size(item); j++) {
+        if (!EagerTensor_CheckExact(PyList_GET_ITEM(item, j))) return false;
+      }
+    } else if (!EagerTensor_CheckExact(item)) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Adds input and type attr to the op, and to the list of flattened
+// inputs/attrs.
+bool AddInputToOp(PyObject* input, const tensorflow::OpDef::ArgDef* input_arg,
+                  std::vector<PyObject*>* flattened_attrs,
+                  std::vector<PyObject*>* flattened_inputs, TFE_Op* op,
+                  TF_Status* status) {
+  TFE_TensorHandle* input_handle = EagerTensor_Handle(input);
+  if (input_arg != nullptr && !input_arg->type_attr().empty()) {
+    auto dtype = TFE_TensorHandleDataType(input_handle);
+    TFE_OpSetAttrType(op, input_arg->type_attr().data(), dtype);
+    if (flattened_attrs != nullptr) {
+      flattened_attrs->push_back(
+          GetPythonObjectFromString(input_arg->type_attr().data()));
+      flattened_attrs->push_back(PyLong_FromLong(dtype));
+    }
+  }
+
+  if (flattened_inputs != nullptr) {
+    flattened_inputs->push_back(input);
+  }
+  TFE_OpAddInput(op, input_handle, status);
+  if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) {
+    return false;
+  }
+  return true;
+}
+
+const tensorflow::OpDef* GetOpDef(PyObject* py_op_name) {
+  const char* op_name = TFE_GetPythonString(py_op_name);
+  if (op_name == nullptr) {
+    PyErr_SetString(PyExc_TypeError,
+                    Printf("expected a string for op_name, got %s instead",
+                           py_op_name->ob_type->tp_name)
+                        .c_str());
+    return nullptr;
+  }
+
+  const tensorflow::OpRegistrationData* op_reg_data = nullptr;
+  const tensorflow::Status lookup_status =
+      tensorflow::OpRegistry::Global()->LookUp(op_name, &op_reg_data);
+  if (MaybeRaiseExceptionFromStatus(lookup_status, nullptr)) {
+    return nullptr;
+  }
+  return &op_reg_data->op_def;
+}
+
+const char* GetDeviceName(PyObject* py_device_name) {
+  if (py_device_name != Py_None) {
+    return TFE_GetPythonString(py_device_name);
+  }
+  return nullptr;
+}
+
+bool RaiseIfNotPyList(PyObject* list, const string& attr_name) {
+  if (!PyList_Check(list)) {
+    PyErr_SetString(PyExc_TypeError,
+                    Printf("expected a list for attr %s, got %s instead",
+                           attr_name.data(), list->ob_type->tp_name)
+                        .data());
+
+    return false;
+  }
+  return true;
+}
+
+bool RunCallbacks(bool run_gradient_callback, bool run_post_exec_callbacks,
+                  const tensorflow::OpDef* op_def, PyObject* args,
+                  const std::vector<PyObject*>& flattened_inputs,
+                  const std::vector<PyObject*>& flattened_attrs,
+                  PyObject* flattened_result, PyObject* op_name, PyObject* name,
+                  PyObject* record_gradient_callback, PyObject* callbacks) {
+  PyObject* inputs = PyTuple_New(flattened_inputs.size());
+  for (int i = 0; i < flattened_inputs.size(); i++) {
+    PyObject* input = flattened_inputs[i];
+    Py_INCREF(input);
+    PyTuple_SET_ITEM(inputs, i, input);
+  }
+
+  int num_non_inferred_attrs = PyTuple_GET_SIZE(args) -
+                               op_def->input_arg_size() -
+                               kFastPathExecuteInputStartIndex;
+  int num_attrs = flattened_attrs.size() + num_non_inferred_attrs;
+  PyObject* attrs = PyTuple_New(num_attrs);
+
+  for (int i = 0; i < num_non_inferred_attrs; i++) {
+    auto* attr = PyTuple_GET_ITEM(
+        args, kFastPathExecuteInputStartIndex + op_def->input_arg_size() + i);
+    Py_INCREF(attr);
+    PyTuple_SET_ITEM(attrs, i, attr);
+  }
+  for (int i = num_non_inferred_attrs; i < num_attrs; i++) {
+    // Not INCREFing anything in flattened_attrs as each of those is a new
+    // reference, so allow the attrs tuple to steal the reference.
+    PyTuple_SET_ITEM(attrs, i, flattened_attrs.at(i - num_non_inferred_attrs));
+  }
+
+  PyObject* callback_args =
+      Py_BuildValue("OOOOO", op_name, inputs, attrs, flattened_result, name);
+
+  auto cleaner = tensorflow::gtl::MakeCleanup([inputs, attrs, callback_args] {
+    Py_DECREF(inputs);
+    Py_DECREF(attrs);
+    Py_DECREF(callback_args);
+  });
+
+  if (run_gradient_callback) {
+    if (!PyCallable_Check(record_gradient_callback)) {
+      PyErr_SetString(PyExc_TypeError,
+                      Printf("expected a function for "
+                             "record_gradient_callback, got %s instead",
+                             record_gradient_callback->ob_type->tp_name)
+                          .c_str());
+      return false;
+    }
+
+    PyObject* callback_result =
+        PyObject_CallObject(record_gradient_callback, callback_args);
+    if (!callback_result) {
+      return false;
+    }
+    Py_DECREF(callback_result);
+  }
+
+  if (run_post_exec_callbacks) {
+    for (Py_ssize_t i = 0; i < PyList_Size(callbacks); i++) {
+      PyObject* callback_fn = PyList_GET_ITEM(callbacks, i);
+      if (!PyCallable_Check(callback_fn)) {
+        PyErr_SetString(
+            PyExc_TypeError,
+            Printf("expected a function for "
+                   "post execution callback in index %ld, got %s instead",
+                   i, callback_fn->ob_type->tp_name)
+                .c_str());
+        return false;
+      }
+      PyObject* callback_result =
+          PyObject_CallObject(callback_fn, callback_args);
+      if (!callback_result) {
+        return false;
+      }
+      Py_DECREF(callback_result);
+    }
+  }
+
+  return true;
+}
+
+}  // namespace
+
+PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
+  Py_ssize_t args_size = PyTuple_GET_SIZE(args);
+  if (args_size < kFastPathExecuteInputStartIndex) {
+    PyErr_SetString(
+        PyExc_ValueError,
+        Printf("There must be at least %d items in the input tuple.",
+               kFastPathExecuteInputStartIndex)
+            .c_str());
+    return nullptr;
+  }
+
+  TFE_Context* ctx = reinterpret_cast<TFE_Context*>(
+      PyCapsule_GetPointer(PyTuple_GET_ITEM(args, 0), nullptr));
+  const char* device_name = GetDeviceName(PyTuple_GET_ITEM(args, 1));
+  PyObject* op_name = PyTuple_GET_ITEM(args, 2);
+  const tensorflow::OpDef* op_def = GetOpDef(op_name);
+  if (op_def == nullptr) return nullptr;
+  PyObject* record_gradient_callback = PyTuple_GET_ITEM(args, 3);
+  PyObject* name = PyTuple_GET_ITEM(args, 4);
+  PyObject* callbacks = PyTuple_GET_ITEM(args, 5);
+
+  if (args_size < kFastPathExecuteInputStartIndex + op_def->input_arg_size()) {
+    PyErr_SetString(
+        PyExc_ValueError,
+        Printf("Tuple size smaller than intended. Expected to be at least %d, "
+               "was %ld",
+               kFastPathExecuteInputStartIndex + op_def->input_arg_size(),
+               args_size)
+            .c_str());
+    return nullptr;
+  }
+
+  if (!CheckEagerTensors(args, kFastPathExecuteInputStartIndex, *op_def)) {
+    RaiseFallbackException(
+        "This function does not handle the case of the path where "
+        "all inputs are not already EagerTensors.");
+    return nullptr;
+  }
+
+  TF_Status* status = TF_NewStatus();
+  TFE_Op* op = TFE_NewOp(ctx, op_def->name().c_str(), status);
+  auto cleaner = tensorflow::gtl::MakeCleanup([status, op] {
+    TF_DeleteStatus(status);
+    TFE_DeleteOp(op);
+  });
+  if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) {
+    return nullptr;
+  }
+
+  // Mapping of attr name to size - used to calculate the number of values
+  // to be expected by the TFE_Execute run.
+  tensorflow::gtl::FlatMap<string, tensorflow::int64> attr_list_sizes;
+
+  // Set non-inferred attrs, including setting defaults if the attr is passed in
+  // as None.
+  for (int i = kFastPathExecuteInputStartIndex + op_def->input_arg_size();
+       i < args_size; i += 2) {
+    PyObject* py_attr_name = PyTuple_GET_ITEM(args, i);
+    const tensorflow::StringPiece attr_name(TFE_GetPythonString(py_attr_name));
+    PyObject* py_attr_value = PyTuple_GET_ITEM(args, i + 1);
+
+    // Not creating an index since most of the time there are not more than a
+    // few attrs.
+    // TODO(nareshmodi): Maybe include the index as part of the
+    // OpRegistrationData.
+    for (const auto& attr : op_def->attr()) {
+      if (attr_name == attr.name()) {
+        SetOpAttrWithDefaults(ctx, op, attr, attr_name.data(), py_attr_value,
+                              &attr_list_sizes, status);
+
+        if (TF_GetCode(status) != TF_OK) {
+          RaiseFallbackException(TF_Message(status));
+          return nullptr;
+        }
+
+        break;
+      }
+    }
+  }
+
+  TFE_OpSetDevice(op, device_name, status);
+  if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) {
+    return nullptr;
+  }
+
+  // TODO(nareshmodi): Add a benchmark for the fast-path with gradient callbacks
+  // (similar to benchmark_tf_gradient_function_*). Also consider using an
+  // InlinedVector for flattened_attrs and flattened_inputs if the benchmarks
+  // point out problems with heap allocs.
+  bool run_gradient_callback = !*ThreadTapeIsStopped() &&
+                               !GetTapeSet()->empty() &&
+                               record_gradient_callback != Py_None;
+  bool run_post_exec_callbacks =
+      callbacks != Py_None && PyList_Size(callbacks) > 0;
+  bool run_callbacks = run_gradient_callback || run_post_exec_callbacks;
+  // Flat attrs and inputs as required by the record_gradient call. The attrs
+  // here only contain inferred attrs (non-inferred attrs are added directly
+  // from the input args).
+  // All items in flattened_attrs contain new references.
+  // All items in flattened_inputs contain borrowed references.
+  // TODO(nareshmodi): figure out why PyList_New/PyList_Append don't work
+  // directly.
+  std::unique_ptr<std::vector<PyObject*>> flattened_attrs = nullptr;
+  std::unique_ptr<std::vector<PyObject*>> flattened_inputs = nullptr;
+
+  if (run_callbacks) {
+    flattened_attrs.reset(new std::vector<PyObject*>);
+    flattened_inputs.reset(new std::vector<PyObject*>);
+  }
+
+  // Add inferred attrs and inputs.
+  // The following code might set duplicate type attrs. This will result in
+  // the CacheKey for the generated AttrBuilder possibly differing from
+  // those where the type attrs are correctly set. Inconsistent CacheKeys
+  // for ops means that there might be unnecessarily duplicated kernels.
+  // TODO(nareshmodi): Fix this.
+  for (int i = 0; i < op_def->input_arg_size(); i++) {
+    const auto& input_arg = op_def->input_arg(i);
+
+    PyObject* input =
+        PyTuple_GET_ITEM(args, kFastPathExecuteInputStartIndex + i);
+    if (!input_arg.number_attr().empty()) {
+      // The item is a homogeneous list.
+      if (!RaiseIfNotPyList(input, input_arg.number_attr())) return nullptr;
+      Py_ssize_t len = PyList_Size(input);
+
+      TFE_OpSetAttrInt(op, input_arg.number_attr().data(), len);
+      if (run_callbacks) {
+        flattened_attrs->push_back(
+            GetPythonObjectFromString(input_arg.number_attr().data()));
+        flattened_attrs->push_back(PyLong_FromLong(len));
+      }
+      attr_list_sizes[input_arg.number_attr()] = len;
+
+      if (len > 0) {
+        // First item adds the type attr.
+        if (!AddInputToOp(PyList_GET_ITEM(input, 0), &input_arg,
+                          flattened_attrs.get(), flattened_inputs.get(), op,
+                          status)) {
+          return nullptr;
+        }
+
+        for (Py_ssize_t j = 1; j < len; j++) {
+          // Since the list is homogeneous, we don't need to re-add the attr.
+          if (!AddInputToOp(PyList_GET_ITEM(input, j), nullptr /* input_arg */,
+                            nullptr /* flattened_attrs */,
+                            flattened_inputs.get(), op, status)) {
+            return nullptr;
+          }
+        }
+      }
+    } else if (!input_arg.type_list_attr().empty()) {
+      // The item is a heterogeneous list.
+      if (!RaiseIfNotPyList(input, input_arg.type_list_attr())) return nullptr;
+      const string& attr_name = input_arg.type_list_attr();
+      Py_ssize_t len = PyList_Size(input);
+      tensorflow::gtl::InlinedVector<TF_DataType, 4> attr_value(len);
+      PyObject* py_attr_value = nullptr;
+      if (run_callbacks) {
+        py_attr_value = PyTuple_New(len);
+      }
+      for (Py_ssize_t j = 0; j < len; j++) {
+        PyObject* py_input = PyList_GET_ITEM(input, j);
+        TFE_TensorHandle* input_handle = EagerTensor_Handle(py_input);
+        attr_value[j] = TFE_TensorHandleDataType(input_handle);
+
+        TFE_OpAddInput(op, input_handle, status);
+        if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) {
+          return nullptr;
+        }
+
+        if (run_callbacks) {
+          flattened_inputs->push_back(py_input);
+
+          PyTuple_SET_ITEM(py_attr_value, j, PyLong_FromLong(attr_value[j]));
+        }
+      }
+      if (run_callbacks) {
+        flattened_attrs->push_back(GetPythonObjectFromString(attr_name.data()));
+        flattened_attrs->push_back(py_attr_value);
+      }
+      TFE_OpSetAttrTypeList(op, attr_name.data(), attr_value.data(),
+                            attr_value.size());
+      attr_list_sizes[attr_name] = len;
+    } else {
+      // The item is a single item.
+      if (!AddInputToOp(input, &input_arg, flattened_attrs.get(),
+                        flattened_inputs.get(), op, status)) {
+        return nullptr;
+      }
+    }
+  }
+
+  int num_retvals = 0;
+  for (int i = 0; i < op_def->output_arg_size(); i++) {
+    const auto& output_arg = op_def->output_arg(i);
+    if (!output_arg.number_attr().empty()) {
+      num_retvals += attr_list_sizes[output_arg.number_attr()];
+    } else if (!output_arg.type_list_attr().empty()) {
+      num_retvals += attr_list_sizes[output_arg.type_list_attr()];
+    } else {
+      num_retvals++;
+    }
+  }
+
+  tensorflow::gtl::InlinedVector<TFE_TensorHandle*, 2> retvals(num_retvals);
+
+  Py_BEGIN_ALLOW_THREADS;
+  TFE_Execute(op, retvals.data(), &num_retvals, status);
+  Py_END_ALLOW_THREADS;
+  if (TF_GetCode(status) != TF_OK) {
+    // Augment the status with the op_name for easier debugging similar to
+    // TFE_Py_Execute.
+    TF_SetStatus(status, TF_GetCode(status),
+                 tensorflow::strings::StrCat(TF_Message(status), " [Op:",
+                                             TFE_GetPythonString(op_name), "]")
+                     .c_str());
+
+    MaybeRaiseExceptionFromTFStatus(status, nullptr);
+    return nullptr;
+  }
+
+  PyObject* flat_result = PyList_New(num_retvals);
+  for (int i = 0; i < num_retvals; ++i) {
+    PyList_SET_ITEM(flat_result, i, EagerTensorFromHandle(retvals[i]));
+  }
+
+  if (run_callbacks &&
+      !RunCallbacks(run_gradient_callback, run_post_exec_callbacks, op_def,
+                    args, *flattened_inputs, *flattened_attrs, flat_result,
+                    op_name, name, record_gradient_callback, callbacks)) {
+    return nullptr;
+  }
+
+  // Unflatten results.
+  if (op_def->output_arg_size() == 0) {
+    Py_RETURN_NONE;
+  }
+
+  if (op_def->output_arg_size() == 1) {
+    if (!op_def->output_arg(0).number_attr().empty() ||
+        !op_def->output_arg(0).type_list_attr().empty()) {
+      return flat_result;
+    } else {
+      auto* result = PyList_GET_ITEM(flat_result, 0);
+      Py_INCREF(result);
+      Py_DECREF(flat_result);
+      return result;
+    }
+  }
+
+  // Correctly output the results that are made into a namedtuple.
+  PyObject* result = PyList_New(op_def->output_arg_size());
+  int flat_result_index = 0;
+  for (int i = 0; i < op_def->output_arg_size(); i++) {
+    if (!op_def->output_arg(i).number_attr().empty()) {
+      int list_length = attr_list_sizes[op_def->output_arg(i).number_attr()];
+      PyObject* inner_list = PyList_New(list_length);
+      for (int j = 0; j < list_length; j++) {
+        PyObject* obj = PyList_GET_ITEM(flat_result, flat_result_index++);
+        Py_INCREF(obj);
+        PyList_SET_ITEM(inner_list, j, obj);
+      }
+      PyList_SET_ITEM(result, i, inner_list);
+    } else if (!op_def->output_arg(i).type_list_attr().empty()) {
+      int list_length = attr_list_sizes[op_def->output_arg(i).type_list_attr()];
+      PyObject* inner_list = PyList_New(list_length);
+      for (int j = 0; j < list_length; j++) {
+        PyObject* obj = PyList_GET_ITEM(flat_result, flat_result_index++);
+        Py_INCREF(obj);
+        PyList_SET_ITEM(inner_list, j, obj);
+      }
+      PyList_SET_ITEM(result, i, inner_list);
+    } else {
+      PyObject* obj = PyList_GET_ITEM(flat_result, flat_result_index++);
+      Py_INCREF(obj);
+      PyList_SET_ITEM(result, i, obj);
+    }
+  }
+  Py_DECREF(flat_result);
+  return result;
 }
diff --git a/tensorflow/python/eager/pywrap_tfe_test.py b/tensorflow/python/eager/pywrap_tfe_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..49323e6640e664ef5f98b227964f9dd4e248ca39
--- /dev/null
+++ b/tensorflow/python/eager/pywrap_tfe_test.py
@@ -0,0 +1,169 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for low-level eager execution primitives."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import execute
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+
+
+class Tests(test.TestCase):
+
+  @test_util.assert_no_new_tensors
+  @test_util.assert_no_garbage_created
+  def testFastpathExecute_MatMulCorrectResponse(self):
+    a_2_by_2 = random_ops.random_uniform((2, 2))
+    b_2_by_2 = random_ops.random_uniform((2, 2))
+
+    a_100_by_784 = random_ops.random_uniform((100, 784))
+    b_100_by_784 = random_ops.random_uniform((100, 784))
+
+    ctx = context.context()
+
+    self.assertAllClose(
+        math_ops.matmul(a_2_by_2, b_2_by_2),
+        pywrap_tensorflow.TFE_Py_FastPathExecute(
+            ctx._handle, ctx.device_name, "MatMul", execute.record_gradient,
+            None, None, a_2_by_2, b_2_by_2, "transpose_a", False, "transpose_b",
+            False))
+    self.assertAllClose(
+        math_ops.matmul(a_100_by_784, b_100_by_784, transpose_b=True),
+        pywrap_tensorflow.TFE_Py_FastPathExecute(
+            ctx._handle, ctx.device_name, "MatMul", execute.record_gradient,
+            None, None, a_100_by_784, b_100_by_784, "transpose_a", False,
+            "transpose_b", True))
+
+  @test_util.assert_no_new_tensors
+  @test_util.assert_no_garbage_created
+  def testFastpathExecute_TapeWrite(self):
+    ctx = context.context()
+    with backprop.GradientTape(persistent=True) as tape:
+      a_2_by_2 = constant_op.constant(1.0, shape=[2, 2])
+      tape.watch(a_2_by_2)
+      z = pywrap_tensorflow.TFE_Py_FastPathExecute(
+          ctx._handle, ctx.device_name, "MatMul", execute.record_gradient, None,
+          None, a_2_by_2, a_2_by_2, "transpose_a", False, "transpose_b", False)
+    dz_dy = tape.gradient(z, [a_2_by_2])[0]
+    self.assertAllEqual(dz_dy.numpy(),
+                        constant_op.constant(4.0, shape=[2, 2]).numpy())
+
+  # Tests homogeneous list op
+  @test_util.assert_no_new_tensors
+  @test_util.assert_no_garbage_created
+  def testFastpathExecute_AddNCorrectResponse(self):
+    ctx = context.context()
+    a_2_by_2 = random_ops.random_uniform((2, 2))
+    b_2_by_2 = random_ops.random_uniform((2, 2))
+
+    self.assertAllClose(
+        math_ops.add_n([a_2_by_2, b_2_by_2]),
+        pywrap_tensorflow.TFE_Py_FastPathExecute(
+            ctx._handle, ctx.device_name, "AddN", execute.record_gradient, None,
+            None, [a_2_by_2, b_2_by_2]))
+
+  # Tests homogeneous list op
+  @test_util.assert_no_new_tensors
+  @test_util.assert_no_garbage_created
+  def testFastpathExecute_AddNTapeWrite(self):
+    ctx = context.context()
+    a_2_by_2 = random_ops.random_uniform((2, 2))
+    b_2_by_2 = random_ops.random_uniform((2, 2))
+
+    with backprop.GradientTape(persistent=True) as tape:
+      tape.watch(a_2_by_2)
+      tape.watch(b_2_by_2)
+      z1 = pywrap_tensorflow.TFE_Py_FastPathExecute(
+          ctx._handle, ctx.device_name, "AddN", execute.record_gradient, None,
+          None, [a_2_by_2, b_2_by_2])
+      z2 = math_ops.add_n([a_2_by_2, b_2_by_2])
+    dz1_dy = tape.gradient(z1, [a_2_by_2])[0]
+    dz2_dy = tape.gradient(z2, [a_2_by_2])[0]
+    self.assertAllEqual(dz1_dy.numpy(), dz2_dy.numpy())
+
+  # Tests heterogeneous list op
+  @test_util.assert_no_new_tensors
+  @test_util.assert_no_garbage_created
+  def testFastpathExecute_IdentityNCorrectResponse(self):
+    ctx = context.context()
+    a_2_by_2 = random_ops.random_uniform((2, 2))
+    b_2_by_2 = random_ops.random_uniform((2, 2))
+
+    self.assertAllClose(
+        array_ops.identity_n([a_2_by_2, b_2_by_2]),
+        pywrap_tensorflow.TFE_Py_FastPathExecute(
+            ctx._handle, ctx.device_name, "IdentityN", execute.record_gradient,
+            None, None, [a_2_by_2, b_2_by_2]))
+
+  # Tests heterogeneous list op
+  @test_util.assert_no_new_tensors
+  @test_util.assert_no_garbage_created
+  def testFastpathExecute_IdentityNTapeWrite(self):
+    ctx = context.context()
+    a_2_by_2 = random_ops.random_uniform((2, 2))
+    b_2_by_2 = random_ops.random_uniform((2, 2))
+
+    with backprop.GradientTape(persistent=True) as tape:
+      tape.watch(a_2_by_2)
+      tape.watch(b_2_by_2)
+      z1 = pywrap_tensorflow.TFE_Py_FastPathExecute(
+          ctx._handle, ctx.device_name, "IdentityN", execute.record_gradient,
+          None, None, [a_2_by_2, b_2_by_2])
+      z2 = array_ops.identity_n([a_2_by_2, b_2_by_2])
+    dz1_dy = tape.gradient(z1[0], [a_2_by_2])[0]
+    dz2_dy = tape.gradient(z2[0], [a_2_by_2])[0]
+    self.assertAllEqual(dz1_dy.numpy(), dz2_dy.numpy())
+
+  @test_util.assert_no_new_tensors
+  @test_util.assert_no_garbage_created
+  def testFastpathExecute_InvalidInputs(self):
+    a_2_by_2 = random_ops.random_uniform((2, 2))
+    ctx = context.context()
+    assert not ctx.in_graph_mode(
+    ), "The prototype doesn't contain C code for graph construction"
+    ctx_handle = ctx._handle  # pylint: disable=protected-access
+
+    # Not enough base params
+    with self.assertRaisesRegexp(ValueError,
+                                 "at least 6 items in the input tuple"):
+      pywrap_tensorflow.TFE_Py_FastPathExecute(ctx_handle, ctx.device_name,
+                                               "Identity")
+
+    # Not enough inputs
+    with self.assertRaisesRegexp(ValueError,
+                                 "Expected to be at least 7, was 6"):
+      pywrap_tensorflow.TFE_Py_FastPathExecute(
+          ctx_handle, ctx_handle, "Identity", backprop._record_gradient, None,
+          [])
+
+    # Bad type
+    with self.assertRaisesRegexp(TypeError, "expected a string for op_name"):
+      pywrap_tensorflow.TFE_Py_FastPathExecute(
+          ctx_handle, ctx.device_name, ctx_handle, backprop._record_gradient,
+          None, [], a_2_by_2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/eager/tape.py b/tensorflow/python/eager/tape.py
index 14b5238f74039ec23bd197699de68c4c0254e8d3..ad82266beca05d9f508a702124390fd934161ffd 100644
--- a/tensorflow/python/eager/tape.py
+++ b/tensorflow/python/eager/tape.py
@@ -35,7 +35,8 @@ class Tape(object):
 
 def push_new_tape(persistent=False):
   """Pushes a new tape onto the tape stack."""
-  pywrap_tensorflow.TFE_Py_TapeStackPushNew(persistent)
+  tape = pywrap_tensorflow.TFE_Py_TapeSetNew(persistent)
+  return Tape(tape)
 
 
 def watch(tensor):
@@ -44,7 +45,7 @@ def watch(tensor):
   Args:
     tensor: tensor to be watched.
   """
-  pywrap_tensorflow.TFE_Py_TapeStackWatch(tensor)
+  pywrap_tensorflow.TFE_Py_TapeSetWatch(tensor)
 
 
 def watch_variable(variable):
@@ -53,42 +54,39 @@ def watch_variable(variable):
   Args:
     variable: variable to be watched.
   """
-  pywrap_tensorflow.TFE_Py_TapeStackWatchVariable(variable)
+  pywrap_tensorflow.TFE_Py_TapeSetWatchVariable(variable)
 
 
-def pop_tape():
+def pop_tape(tape):
   """Pops the top tape in the stack, if any."""
-  return Tape(pywrap_tensorflow.TFE_Py_TapeStackPop())
+  pywrap_tensorflow.TFE_Py_TapeSetRemove(tape._tape)  # pylint: disable=protected-access
 
 
 @contextlib.contextmanager
 def stop_recording():
-  stack = []
-  while not pywrap_tensorflow.TFE_Py_TapeStackIsEmpty():
-    stack.append(pop_tape()._tape)  # pylint: disable=protected-access
   try:
+    pywrap_tensorflow.TFE_Py_TapeSetStopOnThread()
     yield
   finally:
-    for tape in reversed(stack):
-      pywrap_tensorflow.TFE_Py_TapeStackPush(tape)
+    pywrap_tensorflow.TFE_Py_TapeSetRestartOnThread()
 
 
 def should_record(tensors):
   """Returns true if any tape in the stack watches any of these tensors."""
-  return pywrap_tensorflow.TFE_Py_TapeStackShouldRecord(tensors)
+  return pywrap_tensorflow.TFE_Py_TapeSetShouldRecord(tensors)
 
 
 def record_operation(op_type, output_tensors, input_tensors, backward_function):
   """Records the operation on all tapes in the stack."""
-  pywrap_tensorflow.TFE_Py_TapeStackRecordOperation(
+  pywrap_tensorflow.TFE_Py_TapeSetRecordOperation(
       op_type, output_tensors, input_tensors, backward_function)
 
 
 def delete_trace(tensor_id):
   """Deletes traces for this Tensor from all tapes in the stack."""
-  pywrap_tensorflow.TFE_Py_TapeStackDeleteTrace(tensor_id)
+  pywrap_tensorflow.TFE_Py_TapeSetDeleteTrace(tensor_id)
 
 
 def could_possibly_record():
   """Returns True if any tape is active."""
-  return not pywrap_tensorflow.TFE_Py_TapeStackIsEmpty()
+  return not pywrap_tensorflow.TFE_Py_TapeSetIsEmpty()
diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py
index 7a4593ec464ab1834a555a131b8b717f5010de62..0bd5a5dbafd5ea8da21d4fb8a7dcae9fe23dd3d2 100644
--- a/tensorflow/python/eager/tensor_test.py
+++ b/tensorflow/python/eager/tensor_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import copy
+import re
 
 import numpy as np
 
@@ -106,6 +107,24 @@ class TFETensorTest(test_util.TensorFlowTestCase):
     t = _create_tensor(n)
     self.assertAllEqual([[1, 2], [3, 4]], t)
 
+  def testNumpyArrayDtype(self):
+    tensor = constant_op.constant([1.0, 2.0, 3.0])
+    numpy_tensor = np.asarray(tensor, dtype=np.int32)
+    self.assertAllEqual(numpy_tensor, [1, 2, 3])
+
+  def testNdimsAgreesWithNumpy(self):
+    numpy_tensor = np.asarray(1.0)
+    tensor = constant_op.constant(numpy_tensor)
+    self.assertAllEqual(numpy_tensor.ndim, tensor.ndim)
+
+    numpy_tensor = np.asarray([1.0, 2.0, 3.0])
+    tensor = constant_op.constant(numpy_tensor)
+    self.assertAllEqual(numpy_tensor.ndim, tensor.ndim)
+
+    numpy_tensor = np.asarray([[1.0, 2.0, 3.0], [1.0, 2.0, 3.0]])
+    tensor = constant_op.constant(numpy_tensor)
+    self.assertAllEqual(numpy_tensor.ndim, tensor.ndim)
+
   def testCopy(self):
     t = constant_op.constant(1.0)
     tt = copy.copy(t)
@@ -174,8 +193,8 @@ class TFETensorTest(test_util.TensorFlowTestCase):
     np.set_printoptions(threshold=2, edgeitems=1)
 
     t = _create_tensor(np.arange(10, dtype=np.int32))
-    self.assertIn("[0 ..., 9]", str(t))
-    self.assertIn("[0, ..., 9]", repr(t))
+    self.assertTrue(re.match(r".*\[.*0.*\.\.\..*9.*\]", str(t)))
+    self.assertTrue(re.match(r".*\[.*0.*\.\.\..*9.*\]", repr(t)))
 
     # Clean up: reset to previous printoptions.
     np.set_printoptions(threshold=orig_threshold, edgeitems=orig_edgeitems)
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 03f386e9cf885fb88cbb557a99b9d0abe78b3062..c519fd557a9319d6ef5522b26198e5b4202917fc 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -215,6 +215,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = [
         "no_pip",
+        "noasan",  # test flakily times out in asan mode.
         "notsan",  # b/67510291
     ],
     deps = [
@@ -266,6 +267,7 @@ py_library(
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
         "@six_archive//:six",
     ],
 )
@@ -281,6 +283,7 @@ py_library(
         ":model_fn",
         ":numpy_io",
         ":prediction_keys",
+        ":warm_starting_util",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
@@ -354,6 +357,7 @@ py_library(
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
         "@six_archive//:six",
     ],
 )
@@ -423,6 +427,7 @@ py_library(
         ":model_fn",
         ":run_config",
         ":util",
+        ":warm_starting_util",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client",
         "//tensorflow/python:control_flow_ops",
@@ -433,6 +438,7 @@ py_library(
         "//tensorflow/python:summary",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
+        "//tensorflow/python/data",
         "//tensorflow/python/saved_model:builder",
         "//tensorflow/python/saved_model:tag_constants",
         "//third_party/py/numpy",
@@ -598,6 +604,7 @@ py_library(
         ":metric_keys",
         ":model_fn",
         ":prediction_keys",
+        ":util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:control_flow_ops",
@@ -620,8 +627,9 @@ py_library(
 
 py_test(
     name = "head_test",
-    size = "small",
+    size = "medium",
     srcs = ["canned/head_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [
@@ -645,6 +653,7 @@ py_test(
         "//tensorflow/python:string_ops",
         "//tensorflow/python:training",
         "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
         "//tensorflow/python/saved_model:signature_constants",
         "//third_party/py/numpy",
         "@six_archive//:six",
@@ -674,6 +683,7 @@ py_library(
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
         "@six_archive//:six",
     ],
 )
diff --git a/tensorflow/python/estimator/canned/baseline.py b/tensorflow/python/estimator/canned/baseline.py
index 96e4ecd29fbcd4f4335077e9f81c5704ae2b9bec..3e92a77543e3d2162497e9f995f3adc2a01cb4dd 100644
--- a/tensorflow/python/estimator/canned/baseline.py
+++ b/tensorflow/python/estimator/canned/baseline.py
@@ -57,7 +57,9 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.training import training_util
+from tensorflow.python.util.tf_export import tf_export
 
 # The default learning rate of 0.3 is a historical artifact of the initial
 # implementation, but seems a reasonable choice.
@@ -172,6 +174,7 @@ def _baseline_model_fn(features, labels, mode, head, optimizer,
       train_op_fn=train_op_fn)
 
 
+@tf_export('estimator.BaselineClassifier')
 class BaselineClassifier(estimator.Estimator):
   """A classifier that can establish a simple baseline.
 
@@ -220,7 +223,8 @@ class BaselineClassifier(estimator.Estimator):
                weight_column=None,
                label_vocabulary=None,
                optimizer='Ftrl',
-               config=None):
+               config=None,
+               loss_reduction=losses.Reduction.SUM):
     """Initializes a BaselineClassifier instance.
 
     Args:
@@ -240,6 +244,8 @@ class BaselineClassifier(estimator.Estimator):
         optimizer to use for training. If not specified, will use
         `FtrlOptimizer` with a default learning rate of 0.3.
       config: `RunConfig` object to configure the runtime settings.
+      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
+        to reduce training loss over batch. Defaults to `SUM`.
     Returns:
       A `BaselineClassifier` estimator.
 
@@ -249,11 +255,13 @@ class BaselineClassifier(estimator.Estimator):
     if n_classes == 2:
       head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint: disable=protected-access
           weight_column=weight_column,
-          label_vocabulary=label_vocabulary)
+          label_vocabulary=label_vocabulary,
+          loss_reduction=loss_reduction)
     else:
       head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
           n_classes, weight_column=weight_column,
-          label_vocabulary=label_vocabulary)
+          label_vocabulary=label_vocabulary,
+          loss_reduction=loss_reduction)
     def _model_fn(features, labels, mode, config):
       return _baseline_model_fn(
           features=features,
@@ -269,6 +277,7 @@ class BaselineClassifier(estimator.Estimator):
         config=config)
 
 
+@tf_export('estimator.BaselineRegressor')
 class BaselineRegressor(estimator.Estimator):
   """A regressor that can establish a simple baseline.
 
@@ -311,7 +320,8 @@ class BaselineRegressor(estimator.Estimator):
                label_dimension=1,
                weight_column=None,
                optimizer='Ftrl',
-               config=None):
+               config=None,
+               loss_reduction=losses.Reduction.SUM):
     """Initializes a BaselineRegressor instance.
 
     Args:
@@ -328,13 +338,16 @@ class BaselineRegressor(estimator.Estimator):
         optimizer to use for training. If not specified, will use
         `FtrlOptimizer` with a default learning rate of 0.3.
       config: `RunConfig` object to configure the runtime settings.
+      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
+        to reduce training loss over batch. Defaults to `SUM`.
     Returns:
       A `BaselineRegressor` estimator.
     """
 
     head = head_lib._regression_head_with_mean_squared_error_loss(  # pylint: disable=protected-access
         label_dimension=label_dimension,
-        weight_column=weight_column)
+        weight_column=weight_column,
+        loss_reduction=loss_reduction)
     def _model_fn(features, labels, mode, config):
       return _baseline_model_fn(
           features=features,
diff --git a/tensorflow/python/estimator/canned/dnn.py b/tensorflow/python/estimator/canned/dnn.py
index 6f94b2288b999b8d4d3d9f6cb2b3cb4945c39e0d..7043da8de036e5be27d223271c37e065d9ffbcdd 100644
--- a/tensorflow/python/estimator/canned/dnn.py
+++ b/tensorflow/python/estimator/canned/dnn.py
@@ -30,8 +30,10 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.summary import summary
 from tensorflow.python.training import training_util
+from tensorflow.python.util.tf_export import tf_export
 
 # The default learning rate of 0.05 is a historical artifact of the initial
 # implementation, but seems a reasonable choice.
@@ -88,7 +90,6 @@ def _dnn_logit_fn_builder(units, hidden_units, feature_columns, activation_fn,
         partitioner=input_layer_partitioner):
       net = feature_column_lib.input_layer(
           features=features, feature_columns=feature_columns)
-
     for layer_id, num_hidden_units in enumerate(hidden_units):
       with variable_scope.variable_scope(
           'hiddenlayer_%d' % layer_id, values=(net,)) as hidden_layer_scope:
@@ -110,15 +111,23 @@ def _dnn_logit_fn_builder(units, hidden_units, feature_columns, activation_fn,
           kernel_initializer=init_ops.glorot_uniform_initializer(),
           name=logits_scope)
     _add_hidden_layer_summary(logits, logits_scope.name)
+
     return logits
 
   return dnn_logit_fn
 
 
-def _dnn_model_fn(
-    features, labels, mode, head, hidden_units, feature_columns,
-    optimizer='Adagrad', activation_fn=nn.relu, dropout=None,
-    input_layer_partitioner=None, config=None):
+def _dnn_model_fn(features,
+                  labels,
+                  mode,
+                  head,
+                  hidden_units,
+                  feature_columns,
+                  optimizer='Adagrad',
+                  activation_fn=nn.relu,
+                  dropout=None,
+                  input_layer_partitioner=None,
+                  config=None):
   """Deep Neural Net model_fn.
 
   Args:
@@ -141,9 +150,7 @@ def _dnn_model_fn(
     config: `RunConfig` object to configure the runtime settings.
 
   Returns:
-    predictions: A dict of `Tensor` objects.
-    loss: A scalar containing the loss of the step.
-    train_op: The op for training.
+    An `EstimatorSpec` instance.
 
   Raises:
     ValueError: If features has the wrong type.
@@ -151,6 +158,7 @@ def _dnn_model_fn(
   if not isinstance(features, dict):
     raise ValueError('features should be a dictionary of `Tensor`s. '
                      'Given type: {}'.format(type(features)))
+
   optimizer = optimizers.get_optimizer_instance(
       optimizer, learning_rate=_LEARNING_RATE)
   num_ps_replicas = config.num_ps_replicas if config else 0
@@ -189,6 +197,7 @@ def _dnn_model_fn(
         logits=logits)
 
 
+@tf_export('estimator.DNNClassifier')
 class DNNClassifier(estimator.Estimator):
   """A classifier for TensorFlow DNN models.
 
@@ -217,6 +226,12 @@ class DNNClassifier(estimator.Estimator):
         l1_regularization_strength=0.001
       ))
 
+  # Or estimator with warm-starting from a previous checkpoint.
+  estimator = DNNClassifier(
+      feature_columns=[categorical_feature_a_emb, categorical_feature_b_emb],
+      hidden_units=[1024, 512, 256],
+      warm_start_from="/path/to/checkpoint/dir")
+
   # Input builders
   def input_fn_train: # returns x, y
     pass
@@ -251,18 +266,22 @@ class DNNClassifier(estimator.Estimator):
   @end_compatibility
   """
 
-  def __init__(self,
-               hidden_units,
-               feature_columns,
-               model_dir=None,
-               n_classes=2,
-               weight_column=None,
-               label_vocabulary=None,
-               optimizer='Adagrad',
-               activation_fn=nn.relu,
-               dropout=None,
-               input_layer_partitioner=None,
-               config=None):
+  def __init__(
+      self,
+      hidden_units,
+      feature_columns,
+      model_dir=None,
+      n_classes=2,
+      weight_column=None,
+      label_vocabulary=None,
+      optimizer='Adagrad',
+      activation_fn=nn.relu,
+      dropout=None,
+      input_layer_partitioner=None,
+      config=None,
+      warm_start_from=None,
+      loss_reduction=losses.Reduction.SUM,
+  ):
     """Initializes a `DNNClassifier` instance.
 
     Args:
@@ -300,16 +319,27 @@ class DNNClassifier(estimator.Estimator):
       input_layer_partitioner: Optional. Partitioner for input layer. Defaults
         to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
       config: `RunConfig` object to configure the runtime settings.
+      warm_start_from: A string filepath to a checkpoint to warm-start from, or
+        a `WarmStartSettings` object to fully configure warm-starting.  If the
+        string filepath is provided instead of a `WarmStartSettings`, then all
+        weights are warm-started, and it is assumed that vocabularies and Tensor
+        names are unchanged.
+      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
+        to reduce training loss over batch. Defaults to `SUM`.
     """
     if n_classes == 2:
       head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint: disable=protected-access
           weight_column=weight_column,
-          label_vocabulary=label_vocabulary)
+          label_vocabulary=label_vocabulary,
+          loss_reduction=loss_reduction)
     else:
       head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
           n_classes, weight_column=weight_column,
-          label_vocabulary=label_vocabulary)
+          label_vocabulary=label_vocabulary,
+          loss_reduction=loss_reduction)
+
     def _model_fn(features, labels, mode, config):
+      """Call the defined shared _dnn_model_fn."""
       return _dnn_model_fn(
           features=features,
           labels=labels,
@@ -322,10 +352,13 @@ class DNNClassifier(estimator.Estimator):
           dropout=dropout,
           input_layer_partitioner=input_layer_partitioner,
           config=config)
+
     super(DNNClassifier, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir, config=config)
+        model_fn=_model_fn, model_dir=model_dir, config=config,
+        warm_start_from=warm_start_from)
 
 
+@tf_export('estimator.DNNRegressor')
 class DNNRegressor(estimator.Estimator):
   """A regressor for TensorFlow DNN models.
 
@@ -354,6 +387,12 @@ class DNNRegressor(estimator.Estimator):
         l1_regularization_strength=0.001
       ))
 
+  # Or estimator with warm-starting from a previous checkpoint.
+  estimator = DNNRegressor(
+      feature_columns=[categorical_feature_a_emb, categorical_feature_b_emb],
+      hidden_units=[1024, 512, 256],
+      warm_start_from="/path/to/checkpoint/dir")
+
   # Input builders
   def input_fn_train: # returns x, y
     pass
@@ -388,17 +427,21 @@ class DNNRegressor(estimator.Estimator):
   @end_compatibility
   """
 
-  def __init__(self,
-               hidden_units,
-               feature_columns,
-               model_dir=None,
-               label_dimension=1,
-               weight_column=None,
-               optimizer='Adagrad',
-               activation_fn=nn.relu,
-               dropout=None,
-               input_layer_partitioner=None,
-               config=None):
+  def __init__(
+      self,
+      hidden_units,
+      feature_columns,
+      model_dir=None,
+      label_dimension=1,
+      weight_column=None,
+      optimizer='Adagrad',
+      activation_fn=nn.relu,
+      dropout=None,
+      input_layer_partitioner=None,
+      config=None,
+      warm_start_from=None,
+      loss_reduction=losses.Reduction.SUM,
+  ):
     """Initializes a `DNNRegressor` instance.
 
     Args:
@@ -430,15 +473,25 @@ class DNNRegressor(estimator.Estimator):
       input_layer_partitioner: Optional. Partitioner for input layer. Defaults
         to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
       config: `RunConfig` object to configure the runtime settings.
+      warm_start_from: A string filepath to a checkpoint to warm-start from, or
+        a `WarmStartSettings` object to fully configure warm-starting.  If the
+        string filepath is provided instead of a `WarmStartSettings`, then all
+        weights are warm-started, and it is assumed that vocabularies and Tensor
+        names are unchanged.
+      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
+        to reduce training loss over batch. Defaults to `SUM`.
     """
+
     def _model_fn(features, labels, mode, config):
+      """Call the defined shared _dnn_model_fn."""
       return _dnn_model_fn(
           features=features,
           labels=labels,
           mode=mode,
           head=head_lib.  # pylint: disable=protected-access
           _regression_head_with_mean_squared_error_loss(
-              label_dimension=label_dimension, weight_column=weight_column),
+              label_dimension=label_dimension, weight_column=weight_column,
+              loss_reduction=loss_reduction),
           hidden_units=hidden_units,
           feature_columns=tuple(feature_columns or []),
           optimizer=optimizer,
@@ -446,5 +499,7 @@ class DNNRegressor(estimator.Estimator):
           dropout=dropout,
           input_layer_partitioner=input_layer_partitioner,
           config=config)
+
     super(DNNRegressor, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir, config=config)
+        model_fn=_model_fn, model_dir=model_dir, config=config,
+        warm_start_from=warm_start_from)
diff --git a/tensorflow/python/estimator/canned/dnn_linear_combined.py b/tensorflow/python/estimator/canned/dnn_linear_combined.py
index 3c61bd5b07ba04193f0ed9de3567264b898114cf..6d0fb96057ee93964ee3571bae3b878faad88882 100644
--- a/tensorflow/python/estimator/canned/dnn_linear_combined.py
+++ b/tensorflow/python/estimator/canned/dnn_linear_combined.py
@@ -33,9 +33,11 @@ from tensorflow.python.ops import nn
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.summary import summary
 from tensorflow.python.training import sync_replicas_optimizer
 from tensorflow.python.training import training_util
+from tensorflow.python.util.tf_export import tf_export
 
 # The default learning rates are a historical artifact of the initial
 # implementation.
@@ -74,12 +76,19 @@ def _add_layer_summary(value, tag):
   summary.histogram('%s/activation' % tag, value)
 
 
-def _dnn_linear_combined_model_fn(
-    features, labels, mode, head,
-    linear_feature_columns=None, linear_optimizer='Ftrl',
-    dnn_feature_columns=None, dnn_optimizer='Adagrad', dnn_hidden_units=None,
-    dnn_activation_fn=nn.relu, dnn_dropout=None,
-    input_layer_partitioner=None, config=None):
+def _dnn_linear_combined_model_fn(features,
+                                  labels,
+                                  mode,
+                                  head,
+                                  linear_feature_columns=None,
+                                  linear_optimizer='Ftrl',
+                                  dnn_feature_columns=None,
+                                  dnn_optimizer='Adagrad',
+                                  dnn_hidden_units=None,
+                                  dnn_activation_fn=nn.relu,
+                                  dnn_dropout=None,
+                                  input_layer_partitioner=None,
+                                  config=None):
   """Deep Neural Net and Linear combined model_fn.
 
   Args:
@@ -108,7 +117,7 @@ def _dnn_linear_combined_model_fn(
     config: `RunConfig` object to configure the runtime settings.
 
   Returns:
-    `ModelFnOps`
+    An `EstimatorSpec` instance.
 
   Raises:
     ValueError: If both `linear_feature_columns` and `dnn_features_columns`
@@ -121,6 +130,7 @@ def _dnn_linear_combined_model_fn(
   if not linear_feature_columns and not dnn_feature_columns:
     raise ValueError(
         'Either linear_feature_columns or dnn_feature_columns must be defined.')
+
   num_ps_replicas = config.num_ps_replicas if config else 0
   input_layer_partitioner = input_layer_partitioner or (
       partitioned_variables.min_max_variable_partitioner(
@@ -216,6 +226,7 @@ def _dnn_linear_combined_model_fn(
       logits=logits)
 
 
+@tf_export('estimator.DNNLinearCombinedClassifier')
 class DNNLinearCombinedClassifier(estimator.Estimator):
   """An estimator for TensorFlow Linear and DNN joined classification models.
 
@@ -243,7 +254,9 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
           categorical_feature_a_emb, categorical_feature_b_emb,
           numeric_feature],
       dnn_hidden_units=[1000, 500, 100],
-      dnn_optimizer=tf.train.ProximalAdagradOptimizer(...))
+      dnn_optimizer=tf.train.ProximalAdagradOptimizer(...),
+      # warm-start settings
+      warm_start_from="/path/to/checkpoint/dir")
 
   # To apply L1 and L2 regularization, you can set optimizers as follows:
   tf.train.ProximalAdagradOptimizer(
@@ -297,7 +310,9 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
                weight_column=None,
                label_vocabulary=None,
                input_layer_partitioner=None,
-               config=None):
+               config=None,
+               warm_start_from=None,
+               loss_reduction=losses.Reduction.SUM):
     """Initializes a DNNLinearCombinedClassifier instance.
 
     Args:
@@ -339,6 +354,13 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
       input_layer_partitioner: Partitioner for input layer. Defaults to
         `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
       config: RunConfig object to configure the runtime settings.
+      warm_start_from: A string filepath to a checkpoint to warm-start from, or
+        a `WarmStartSettings` object to fully configure warm-starting.  If the
+        string filepath is provided instead of a `WarmStartSettings`, then all
+        weights are warm-started, and it is assumed that vocabularies and Tensor
+        names are unchanged.
+      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
+        to reduce training loss over batch. Defaults to `SUM`.
 
     Raises:
       ValueError: If both linear_feature_columns and dnn_features_columns are
@@ -354,13 +376,17 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
     if n_classes == 2:
       head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint: disable=protected-access
           weight_column=weight_column,
-          label_vocabulary=label_vocabulary)
+          label_vocabulary=label_vocabulary,
+          loss_reduction=loss_reduction)
     else:
       head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
           n_classes,
           weight_column=weight_column,
-          label_vocabulary=label_vocabulary)
+          label_vocabulary=label_vocabulary,
+          loss_reduction=loss_reduction)
+
     def _model_fn(features, labels, mode, config):
+      """Call the _dnn_linear_combined_model_fn."""
       return _dnn_linear_combined_model_fn(
           features=features,
           labels=labels,
@@ -377,9 +403,11 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
           config=config)
 
     super(DNNLinearCombinedClassifier, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir, config=config)
+        model_fn=_model_fn, model_dir=model_dir, config=config,
+        warm_start_from=warm_start_from)
 
 
+@tf_export('estimator.DNNLinearCombinedRegressor')
 class DNNLinearCombinedRegressor(estimator.Estimator):
   """An estimator for TensorFlow Linear and DNN joined models for regression.
 
@@ -407,7 +435,9 @@ class DNNLinearCombinedRegressor(estimator.Estimator):
           categorical_feature_a_emb, categorical_feature_b_emb,
           numeric_feature],
       dnn_hidden_units=[1000, 500, 100],
-      dnn_optimizer=tf.train.ProximalAdagradOptimizer(...))
+      dnn_optimizer=tf.train.ProximalAdagradOptimizer(...),
+      # warm-start settings
+      warm_start_from="/path/to/checkpoint/dir")
 
   # To apply L1 and L2 regularization, you can set optimizers as follows:
   tf.train.ProximalAdagradOptimizer(
@@ -460,7 +490,9 @@ class DNNLinearCombinedRegressor(estimator.Estimator):
                label_dimension=1,
                weight_column=None,
                input_layer_partitioner=None,
-               config=None):
+               config=None,
+               warm_start_from=None,
+               loss_reduction=losses.Reduction.SUM):
     """Initializes a DNNLinearCombinedRegressor instance.
 
     Args:
@@ -496,6 +528,13 @@ class DNNLinearCombinedRegressor(estimator.Estimator):
       input_layer_partitioner: Partitioner for input layer. Defaults to
         `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
       config: RunConfig object to configure the runtime settings.
+      warm_start_from: A string filepath to a checkpoint to warm-start from, or
+        a `WarmStartSettings` object to fully configure warm-starting.  If the
+        string filepath is provided instead of a `WarmStartSettings`, then all
+        weights are warm-started, and it is assumed that vocabularies and Tensor
+        names are unchanged.
+      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
+        to reduce training loss over batch. Defaults to `SUM`.
 
     Raises:
       ValueError: If both linear_feature_columns and dnn_features_columns are
@@ -510,13 +549,15 @@ class DNNLinearCombinedRegressor(estimator.Estimator):
                        'must be defined.')
 
     def _model_fn(features, labels, mode, config):
+      """Call the _dnn_linear_combined_model_fn."""
       return _dnn_linear_combined_model_fn(
           features=features,
           labels=labels,
           mode=mode,
           head=head_lib.  # pylint: disable=protected-access
           _regression_head_with_mean_squared_error_loss(
-              label_dimension=label_dimension, weight_column=weight_column),
+              label_dimension=label_dimension, weight_column=weight_column,
+              loss_reduction=loss_reduction),
           linear_feature_columns=linear_feature_columns,
           linear_optimizer=linear_optimizer,
           dnn_feature_columns=dnn_feature_columns,
@@ -528,4 +569,5 @@ class DNNLinearCombinedRegressor(estimator.Estimator):
           config=config)
 
     super(DNNLinearCombinedRegressor, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir, config=config)
+        model_fn=_model_fn, model_dir=model_dir, config=config,
+        warm_start_from=warm_start_from)
diff --git a/tensorflow/python/estimator/canned/dnn_linear_combined_test.py b/tensorflow/python/estimator/canned/dnn_linear_combined_test.py
index 2151df8423774f0e6f9e51a114efe66472204962..84675bf2a4a1655026bbba37c5d7a63d2f788c46 100644
--- a/tensorflow/python/estimator/canned/dnn_linear_combined_test.py
+++ b/tensorflow/python/estimator/canned/dnn_linear_combined_test.py
@@ -26,6 +26,7 @@ import six
 
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
+from tensorflow.python.estimator import warm_starting_util
 from tensorflow.python.estimator.canned import dnn_linear_combined
 from tensorflow.python.estimator.canned import dnn_testing_utils
 from tensorflow.python.estimator.canned import linear_testing_utils
@@ -47,6 +48,7 @@ from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import input as input_lib
 from tensorflow.python.training import optimizer as optimizer_lib
 
+
 try:
   # pylint: disable=g-import-not-at-top
   import pandas as pd
@@ -731,5 +733,156 @@ class DNNLinearCombinedTests(test.TestCase):
         next(est.predict(input_fn=input_fn)))
 
 
+class DNNLinearCombinedWarmStartingTest(test.TestCase):
+
+  def setUp(self):
+    # Create a directory to save our old checkpoint and vocabularies to.
+    self._ckpt_and_vocab_dir = tempfile.mkdtemp()
+
+    # Make a dummy input_fn.
+    def _input_fn():
+      features = {
+          'age': [[23.], [31.]],
+          'city': [['Palo Alto'], ['Mountain View']],
+      }
+      return features, [0, 1]
+
+    self._input_fn = _input_fn
+
+  def tearDown(self):
+    # Clean up checkpoint / vocab dir.
+    writer_cache.FileWriterCache.clear()
+    shutil.rmtree(self._ckpt_and_vocab_dir)
+
+  def test_classifier_basic_warm_starting(self):
+    """Tests correctness of DNNLinearCombinedClassifier default warm-start."""
+    age = feature_column.numeric_column('age')
+    city = feature_column.embedding_column(
+        feature_column.categorical_column_with_vocabulary_list(
+            'city', vocabulary_list=['Mountain View', 'Palo Alto']),
+        dimension=5)
+
+    # Create a DNNLinearCombinedClassifier and train to save a checkpoint.
+    dnn_lc_classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
+        linear_feature_columns=[age],
+        dnn_feature_columns=[city],
+        dnn_hidden_units=[256, 128],
+        model_dir=self._ckpt_and_vocab_dir,
+        n_classes=4,
+        linear_optimizer='SGD',
+        dnn_optimizer='SGD')
+    dnn_lc_classifier.train(input_fn=self._input_fn, max_steps=1)
+
+    # Create a second DNNLinearCombinedClassifier, warm-started from the first.
+    # Use a learning_rate = 0.0 optimizer to check values (use SGD so we don't
+    # have accumulator values that change).
+    warm_started_dnn_lc_classifier = (
+        dnn_linear_combined.DNNLinearCombinedClassifier(
+            linear_feature_columns=[age],
+            dnn_feature_columns=[city],
+            dnn_hidden_units=[256, 128],
+            n_classes=4,
+            linear_optimizer=gradient_descent.GradientDescentOptimizer(
+                learning_rate=0.0),
+            dnn_optimizer=gradient_descent.GradientDescentOptimizer(
+                learning_rate=0.0),
+            warm_start_from=dnn_lc_classifier.model_dir))
+
+    warm_started_dnn_lc_classifier.train(input_fn=self._input_fn, max_steps=1)
+    for variable_name in warm_started_dnn_lc_classifier.get_variable_names():
+      self.assertAllClose(
+          dnn_lc_classifier.get_variable_value(variable_name),
+          warm_started_dnn_lc_classifier.get_variable_value(variable_name))
+
+  def test_regressor_basic_warm_starting(self):
+    """Tests correctness of DNNLinearCombinedRegressor default warm-start."""
+    age = feature_column.numeric_column('age')
+    city = feature_column.embedding_column(
+        feature_column.categorical_column_with_vocabulary_list(
+            'city', vocabulary_list=['Mountain View', 'Palo Alto']),
+        dimension=5)
+
+    # Create a DNNLinearCombinedRegressor and train to save a checkpoint.
+    dnn_lc_regressor = dnn_linear_combined.DNNLinearCombinedRegressor(
+        linear_feature_columns=[age],
+        dnn_feature_columns=[city],
+        dnn_hidden_units=[256, 128],
+        model_dir=self._ckpt_and_vocab_dir,
+        linear_optimizer='SGD',
+        dnn_optimizer='SGD')
+    dnn_lc_regressor.train(input_fn=self._input_fn, max_steps=1)
+
+    # Create a second DNNLinearCombinedRegressor, warm-started from the first.
+    # Use a learning_rate = 0.0 optimizer to check values (use SGD so we don't
+    # have accumulator values that change).
+    warm_started_dnn_lc_regressor = (
+        dnn_linear_combined.DNNLinearCombinedRegressor(
+            linear_feature_columns=[age],
+            dnn_feature_columns=[city],
+            dnn_hidden_units=[256, 128],
+            linear_optimizer=gradient_descent.GradientDescentOptimizer(
+                learning_rate=0.0),
+            dnn_optimizer=gradient_descent.GradientDescentOptimizer(
+                learning_rate=0.0),
+            warm_start_from=dnn_lc_regressor.model_dir))
+
+    warm_started_dnn_lc_regressor.train(input_fn=self._input_fn, max_steps=1)
+    for variable_name in warm_started_dnn_lc_regressor.get_variable_names():
+      self.assertAllClose(
+          dnn_lc_regressor.get_variable_value(variable_name),
+          warm_started_dnn_lc_regressor.get_variable_value(variable_name))
+
+  def test_warm_starting_selective_variables(self):
+    """Tests selecting variables to warm-start."""
+    age = feature_column.numeric_column('age')
+    city = feature_column.embedding_column(
+        feature_column.categorical_column_with_vocabulary_list(
+            'city', vocabulary_list=['Mountain View', 'Palo Alto']),
+        dimension=5)
+
+    # Create a DNNLinearCombinedClassifier and train to save a checkpoint.
+    dnn_lc_classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
+        linear_feature_columns=[age],
+        dnn_feature_columns=[city],
+        dnn_hidden_units=[256, 128],
+        model_dir=self._ckpt_and_vocab_dir,
+        n_classes=4,
+        linear_optimizer='SGD',
+        dnn_optimizer='SGD')
+    dnn_lc_classifier.train(input_fn=self._input_fn, max_steps=1)
+
+    # Create a second DNNLinearCombinedClassifier, warm-started from the first.
+    # Use a learning_rate = 0.0 optimizer to check values (use SGD so we don't
+    # have accumulator values that change).
+    warm_started_dnn_lc_classifier = (
+        dnn_linear_combined.DNNLinearCombinedClassifier(
+            linear_feature_columns=[age],
+            dnn_feature_columns=[city],
+            dnn_hidden_units=[256, 128],
+            n_classes=4,
+            linear_optimizer=gradient_descent.GradientDescentOptimizer(
+                learning_rate=0.0),
+            dnn_optimizer=gradient_descent.GradientDescentOptimizer(
+                learning_rate=0.0),
+            # The provided regular expression will only warm-start the deep
+            # portion of the model.
+            warm_start_from=warm_starting_util.WarmStartSettings(
+                ckpt_to_initialize_from=dnn_lc_classifier.model_dir,
+                vars_to_warm_start='.*(dnn).*')))
+
+    warm_started_dnn_lc_classifier.train(input_fn=self._input_fn, max_steps=1)
+    for variable_name in warm_started_dnn_lc_classifier.get_variable_names():
+      if 'dnn' in variable_name:
+        self.assertAllClose(
+            dnn_lc_classifier.get_variable_value(variable_name),
+            warm_started_dnn_lc_classifier.get_variable_value(variable_name))
+      elif 'linear' in variable_name:
+        linear_values = warm_started_dnn_lc_classifier.get_variable_value(
+            variable_name)
+        # Since they're not warm-started, the linear weights will be
+        # zero-initialized.
+        self.assertAllClose(np.zeros_like(linear_values), linear_values)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/estimator/canned/dnn_test.py b/tensorflow/python/estimator/canned/dnn_test.py
index e005cec263b93b90cb710e020aaf33d54cb69e2f..fc90b7c35e5e3f63d4592989e5ebb393fb245360 100644
--- a/tensorflow/python/estimator/canned/dnn_test.py
+++ b/tensorflow/python/estimator/canned/dnn_test.py
@@ -73,6 +73,15 @@ class DNNLogitFnTest(dnn_testing_utils.BaseDNNLogitFnTest, test.TestCase):
                                                   dnn._dnn_logit_fn_builder)
 
 
+class DNNWarmStartingTest(dnn_testing_utils.BaseDNNWarmStartingTest,
+                          test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    dnn_testing_utils.BaseDNNWarmStartingTest.__init__(self, _dnn_classifier_fn,
+                                                       _dnn_regressor_fn)
+
+
 class DNNClassifierEvaluateTest(
     dnn_testing_utils.BaseDNNClassifierEvaluateTest, test.TestCase):
 
diff --git a/tensorflow/python/estimator/canned/dnn_testing_utils.py b/tensorflow/python/estimator/canned/dnn_testing_utils.py
index 3ffca14261386b156771906fda80914971ea1c68..706575985ff9e0fef94f110825ec11af33031ea3 100644
--- a/tensorflow/python/estimator/canned/dnn_testing_utils.py
+++ b/tensorflow/python/estimator/canned/dnn_testing_utils.py
@@ -28,6 +28,7 @@ import six
 from tensorflow.core.framework import summary_pb2
 from tensorflow.python.client import session as tf_session
 from tensorflow.python.estimator import model_fn
+from tensorflow.python.estimator import warm_starting_util
 from tensorflow.python.estimator.canned import head as head_lib
 from tensorflow.python.estimator.canned import metric_keys
 from tensorflow.python.estimator.canned import prediction_keys
@@ -39,6 +40,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import partitioned_variables
@@ -49,6 +51,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.summary import summary as summary_lib
 from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import monitored_session
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import saver
@@ -64,6 +67,10 @@ HIDDEN_WEIGHTS_NAME_PATTERN = 'dnn/hiddenlayer_%d/kernel'
 HIDDEN_BIASES_NAME_PATTERN = 'dnn/hiddenlayer_%d/bias'
 LOGITS_WEIGHTS_NAME = 'dnn/logits/kernel'
 LOGITS_BIASES_NAME = 'dnn/logits/bias'
+OCCUPATION_EMBEDDING_NAME = ('dnn/input_from_feature_columns/input_layer/'
+                             'occupation_embedding/embedding_weights')
+CITY_EMBEDDING_NAME = ('dnn/input_from_feature_columns/input_layer/'
+                       'city_embedding/embedding_weights')
 
 
 def assert_close(expected, actual, rtol=1e-04, message='', name='assert_close'):
@@ -696,6 +703,301 @@ class BaseDNNLogitFnTest(object):
             self.assertAllClose(expected_logits, sess.run(logits))
 
 
+class BaseDNNWarmStartingTest(object):
+
+  def __init__(self, _dnn_classifier_fn, _dnn_regressor_fn):
+    self._dnn_classifier_fn = _dnn_classifier_fn
+    self._dnn_regressor_fn = _dnn_regressor_fn
+
+  def setUp(self):
+    # Create a directory to save our old checkpoint and vocabularies to.
+    self._ckpt_and_vocab_dir = tempfile.mkdtemp()
+
+    # Make a dummy input_fn.
+    def _input_fn():
+      features = {
+          'city': [['Palo Alto'], ['Mountain View']],
+          'locality': [['Palo Alto'], ['Mountain View']],
+          'occupation': [['doctor'], ['consultant']]
+      }
+      return features, [0, 1]
+
+    self._input_fn = _input_fn
+
+  def tearDown(self):
+    # Clean up checkpoint / vocab dir.
+    writer_cache.FileWriterCache.clear()
+    shutil.rmtree(self._ckpt_and_vocab_dir)
+
+  def assertAllNotClose(self, t1, t2):
+    """Helper assert for arrays."""
+    sum_of_abs_diff = 0.0
+    for x, y in zip(t1, t2):
+      try:
+        for a, b in zip(x, y):
+          sum_of_abs_diff += abs(b - a)
+      except TypeError:
+        sum_of_abs_diff += abs(y - x)
+    self.assertGreater(sum_of_abs_diff, 0)
+
+  def test_classifier_basic_warm_starting(self):
+    """Tests correctness of DNNClassifier default warm-start."""
+    city = feature_column.embedding_column(
+        feature_column.categorical_column_with_vocabulary_list(
+            'city', vocabulary_list=['Mountain View', 'Palo Alto']),
+        dimension=5)
+
+    # Create a DNNClassifier and train to save a checkpoint.
+    dnn_classifier = self._dnn_classifier_fn(
+        hidden_units=[256, 128],
+        feature_columns=[city],
+        model_dir=self._ckpt_and_vocab_dir,
+        n_classes=4,
+        optimizer='SGD')
+    dnn_classifier.train(input_fn=self._input_fn, max_steps=1)
+
+    # Create a second DNNClassifier, warm-started from the first.  Use a
+    # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
+    # accumulator values that change).
+    warm_started_dnn_classifier = self._dnn_classifier_fn(
+        hidden_units=[256, 128],
+        feature_columns=[city],
+        n_classes=4,
+        optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
+        warm_start_from=dnn_classifier.model_dir)
+
+    warm_started_dnn_classifier.train(input_fn=self._input_fn, max_steps=1)
+    for variable_name in warm_started_dnn_classifier.get_variable_names():
+      self.assertAllClose(
+          dnn_classifier.get_variable_value(variable_name),
+          warm_started_dnn_classifier.get_variable_value(variable_name))
+
+  def test_regressor_basic_warm_starting(self):
+    """Tests correctness of DNNRegressor default warm-start."""
+    city = feature_column.embedding_column(
+        feature_column.categorical_column_with_vocabulary_list(
+            'city', vocabulary_list=['Mountain View', 'Palo Alto']),
+        dimension=5)
+
+    # Create a DNNRegressor and train to save a checkpoint.
+    dnn_regressor = self._dnn_regressor_fn(
+        hidden_units=[256, 128],
+        feature_columns=[city],
+        model_dir=self._ckpt_and_vocab_dir,
+        optimizer='SGD')
+    dnn_regressor.train(input_fn=self._input_fn, max_steps=1)
+
+    # Create a second DNNRegressor, warm-started from the first.  Use a
+    # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
+    # accumulator values that change).
+    warm_started_dnn_regressor = self._dnn_regressor_fn(
+        hidden_units=[256, 128],
+        feature_columns=[city],
+        optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
+        warm_start_from=dnn_regressor.model_dir)
+
+    warm_started_dnn_regressor.train(input_fn=self._input_fn, max_steps=1)
+    for variable_name in warm_started_dnn_regressor.get_variable_names():
+      self.assertAllClose(
+          dnn_regressor.get_variable_value(variable_name),
+          warm_started_dnn_regressor.get_variable_value(variable_name))
+
+  def test_warm_starting_selective_variables(self):
+    """Tests selecting variables to warm-start."""
+    city = feature_column.embedding_column(
+        feature_column.categorical_column_with_vocabulary_list(
+            'city', vocabulary_list=['Mountain View', 'Palo Alto']),
+        dimension=5)
+
+    # Create a DNNClassifier and train to save a checkpoint.
+    dnn_classifier = self._dnn_classifier_fn(
+        hidden_units=[256, 128],
+        feature_columns=[city],
+        model_dir=self._ckpt_and_vocab_dir,
+        n_classes=4,
+        optimizer='SGD')
+    dnn_classifier.train(input_fn=self._input_fn, max_steps=1)
+
+    # Create a second DNNClassifier, warm-started from the first.  Use a
+    # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
+    # accumulator values that change).
+    warm_started_dnn_classifier = self._dnn_classifier_fn(
+        hidden_units=[256, 128],
+        feature_columns=[city],
+        n_classes=4,
+        optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
+        # The provided regular expression will only warm-start the city
+        # embedding, not the kernels and biases of the hidden weights.
+        warm_start_from=warm_starting_util.WarmStartSettings(
+            ckpt_to_initialize_from=dnn_classifier.model_dir,
+            vars_to_warm_start='.*(city).*'))
+
+    warm_started_dnn_classifier.train(input_fn=self._input_fn, max_steps=1)
+    for variable_name in warm_started_dnn_classifier.get_variable_names():
+      if 'city' in variable_name:
+        self.assertAllClose(
+            dnn_classifier.get_variable_value(variable_name),
+            warm_started_dnn_classifier.get_variable_value(variable_name))
+      elif 'bias' in variable_name:
+        # Hidden layer biases are zero-initialized.
+        bias_values = warm_started_dnn_classifier.get_variable_value(
+            variable_name)
+        self.assertAllClose(np.zeros_like(bias_values), bias_values)
+      elif 'kernel' in variable_name:
+        # We can't override the glorot uniform initializer used for the kernels
+        # in the dense layers, so just make sure we're not getting the same
+        # values from the old checkpoint.
+        self.assertAllNotClose(
+            dnn_classifier.get_variable_value(variable_name),
+            warm_started_dnn_classifier.get_variable_value(variable_name))
+
+  def test_warm_starting_with_vocab_remapping_and_partitioning(self):
+    """Tests warm-starting with vocab remapping and partitioning."""
+    vocab_list = ['doctor', 'lawyer', 'consultant']
+    vocab_file = os.path.join(self._ckpt_and_vocab_dir, 'occupation_vocab')
+    with open(vocab_file, 'w') as f:
+      f.write('\n'.join(vocab_list))
+    occupation = feature_column.embedding_column(
+        feature_column.categorical_column_with_vocabulary_file(
+            'occupation',
+            vocabulary_file=vocab_file,
+            vocabulary_size=len(vocab_list)),
+        dimension=2)
+
+    # Create a DNNClassifier and train to save a checkpoint.
+    partitioner = partitioned_variables.fixed_size_partitioner(num_shards=2)
+    dnn_classifier = self._dnn_classifier_fn(
+        hidden_units=[256, 128],
+        feature_columns=[occupation],
+        model_dir=self._ckpt_and_vocab_dir,
+        n_classes=4,
+        optimizer='SGD',
+        input_layer_partitioner=partitioner)
+    dnn_classifier.train(input_fn=self._input_fn, max_steps=1)
+
+    # Create a second DNNClassifier, warm-started from the first.  Use a
+    # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
+    # accumulator values that change).  Use a new FeatureColumn with a
+    # different vocabulary for occupation.
+    new_vocab_list = ['doctor', 'consultant', 'engineer']
+    new_vocab_file = os.path.join(self._ckpt_and_vocab_dir,
+                                  'new_occupation_vocab')
+    with open(new_vocab_file, 'w') as f:
+      f.write('\n'.join(new_vocab_list))
+    new_occupation = feature_column.embedding_column(
+        feature_column.categorical_column_with_vocabulary_file(
+            'occupation',
+            vocabulary_file=new_vocab_file,
+            vocabulary_size=len(new_vocab_list)),
+        dimension=2)
+    # We can create our VocabInfo object from the new and old occupation
+    # FeatureColumn's.
+    occupation_vocab_info = warm_starting_util.VocabInfo(
+        new_vocab=new_occupation.categorical_column.vocabulary_file,
+        new_vocab_size=new_occupation.categorical_column.vocabulary_size,
+        num_oov_buckets=new_occupation.categorical_column.num_oov_buckets,
+        old_vocab=occupation.categorical_column.vocabulary_file,
+        old_vocab_size=occupation.categorical_column.vocabulary_size,
+        # Can't use constant_initializer with load_and_remap.  In practice,
+        # use a truncated normal initializer.
+        backup_initializer=init_ops.random_uniform_initializer(
+            minval=0.39, maxval=0.39))
+    warm_started_dnn_classifier = self._dnn_classifier_fn(
+        hidden_units=[256, 128],
+        feature_columns=[occupation],
+        n_classes=4,
+        optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
+        warm_start_from=warm_starting_util.WarmStartSettings(
+            ckpt_to_initialize_from=dnn_classifier.model_dir,
+            var_name_to_vocab_info={
+                OCCUPATION_EMBEDDING_NAME: occupation_vocab_info
+            },
+            # Explicitly providing None here will only warm-start variables
+            # referenced in var_name_to_vocab_info (no hidden weights will be
+            # warmstarted).
+            vars_to_warm_start=None),
+        input_layer_partitioner=partitioner)
+
+    warm_started_dnn_classifier.train(input_fn=self._input_fn, max_steps=1)
+    # 'doctor' was ID-0 and still ID-0.
+    self.assertAllClose(
+        dnn_classifier.get_variable_value(OCCUPATION_EMBEDDING_NAME)[0, :],
+        warm_started_dnn_classifier.get_variable_value(
+            OCCUPATION_EMBEDDING_NAME)[0, :])
+    # 'consultant' was ID-2 and now ID-1.
+    self.assertAllClose(
+        dnn_classifier.get_variable_value(OCCUPATION_EMBEDDING_NAME)[2, :],
+        warm_started_dnn_classifier.get_variable_value(
+            OCCUPATION_EMBEDDING_NAME)[1, :])
+    # 'engineer' is a new entry and should be initialized with the
+    # backup_initializer in VocabInfo.
+    self.assertAllClose([0.39] * 2,
+                        warm_started_dnn_classifier.get_variable_value(
+                            OCCUPATION_EMBEDDING_NAME)[2, :])
+    for variable_name in warm_started_dnn_classifier.get_variable_names():
+      if 'bias' in variable_name:
+        # Hidden layer biases are zero-initialized.
+        bias_values = warm_started_dnn_classifier.get_variable_value(
+            variable_name)
+        self.assertAllClose(np.zeros_like(bias_values), bias_values)
+      elif 'kernel' in variable_name:
+        # We can't override the glorot uniform initializer used for the kernels
+        # in the dense layers, so just make sure we're not getting the same
+        # values from the old checkpoint.
+        self.assertAllNotClose(
+            dnn_classifier.get_variable_value(variable_name),
+            warm_started_dnn_classifier.get_variable_value(variable_name))
+
+  def test_warm_starting_with_naming_change(self):
+    """Tests warm-starting with a Tensor name remapping."""
+    locality = feature_column.embedding_column(
+        feature_column.categorical_column_with_vocabulary_list(
+            'locality', vocabulary_list=['Mountain View', 'Palo Alto']),
+        dimension=5)
+
+    # Create a DNNClassifier and train to save a checkpoint.
+    dnn_classifier = self._dnn_classifier_fn(
+        hidden_units=[256, 128],
+        feature_columns=[locality],
+        model_dir=self._ckpt_and_vocab_dir,
+        n_classes=4,
+        optimizer='SGD')
+    dnn_classifier.train(input_fn=self._input_fn, max_steps=1)
+
+    # Create a second DNNClassifier, warm-started from the first.  Use a
+    # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
+    # accumulator values that change).
+    city = feature_column.embedding_column(
+        feature_column.categorical_column_with_vocabulary_list(
+            'city', vocabulary_list=['Mountain View', 'Palo Alto']),
+        dimension=5)
+    warm_started_dnn_classifier = self._dnn_classifier_fn(
+        hidden_units=[256, 128],
+        feature_columns=[city],
+        n_classes=4,
+        optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
+        # The 'city' variable correspond to the 'locality' variable in the
+        # previous model.
+        warm_start_from=warm_starting_util.WarmStartSettings(
+            ckpt_to_initialize_from=dnn_classifier.model_dir,
+            var_name_to_prev_var_name={
+                CITY_EMBEDDING_NAME:
+                    CITY_EMBEDDING_NAME.replace('city', 'locality')
+            }))
+
+    warm_started_dnn_classifier.train(input_fn=self._input_fn, max_steps=1)
+    for variable_name in warm_started_dnn_classifier.get_variable_names():
+      if 'city' in variable_name:
+        self.assertAllClose(
+            dnn_classifier.get_variable_value(
+                CITY_EMBEDDING_NAME.replace('city', 'locality')),
+            warm_started_dnn_classifier.get_variable_value(CITY_EMBEDDING_NAME))
+      else:
+        self.assertAllClose(
+            dnn_classifier.get_variable_value(variable_name),
+            warm_started_dnn_classifier.get_variable_value(variable_name))
+
+
 class BaseDNNClassifierEvaluateTest(object):
 
   def __init__(self, dnn_classifier_fn):
diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
index fa5d02c4767f9c21e7d0a3a2dad917f3cbf22c02..cb9e3fc6ca116ac0f48a37cea92fa4119754f324 100644
--- a/tensorflow/python/estimator/canned/head.py
+++ b/tensorflow/python/estimator/canned/head.py
@@ -24,6 +24,7 @@ import collections
 import six
 
 from tensorflow.python.estimator import model_fn
+from tensorflow.python.estimator import util
 from tensorflow.python.estimator.canned import metric_keys
 from tensorflow.python.estimator.canned import prediction_keys
 from tensorflow.python.estimator.export import export_output
@@ -54,11 +55,13 @@ _PREDICT_SERVING_KEY = 'predict'
 
 
 # A LossSpec contains
-# * a scalar `Tensor` representing weighted, sum-reduced loss
-# * a scalar `Tensor` representing the sum of example weights
+# * a scalar `Tensor` representing reduced weighted training loss
+# * a scalar `Tensor` representing the unreduced unweighted loss
+# * a scalar `Tensor` representing the example weights
 # * possibly processed labels (e.g. vocabulary lookup, shape manipulation, etc)
 LossSpec = collections.namedtuple(
-    'LossSpec', ['weighted_sum_loss', 'example_weight_sum', 'processed_labels'])
+    'LossSpec', ['training_loss', 'unreduced_loss', 'weights',
+                 'processed_labels'])
 
 
 def _summary_key(head_name, val):
@@ -159,8 +162,9 @@ class _Head(object):
 
     Returns:
       A LossSpec that contains
-      * the scalar `Tensor` representing weighted, sum-reduced loss
-      * the scalar `Tensor` representing the sum of example weights
+      * the scalar `Tensor` representing reduced weighted training loss
+      * the scalar `Tensor` representing the unreduced unweighted loss
+      * the scalar `Tensor` representing the example weights
       * possibly processed labels (e.g. vocabulary lookup, shape manipulation,
         etc.)
 
@@ -170,7 +174,8 @@ class _Head(object):
 
   @abc.abstractmethod
   def create_estimator_spec(
-      self, features, mode, logits, labels=None, train_op_fn=None):
+      self, features, mode, logits, labels=None, train_op_fn=None,
+      regularization_losses=None):
     """Returns `EstimatorSpec` that a model_fn can return.
 
     Please note that,
@@ -182,10 +187,12 @@ class _Head(object):
       logits: logits `Tensor` to be used by the head.
       labels: Labels `Tensor`, or `dict` of same.
       train_op_fn: Function that takes a scalar loss `Tensor` and returns an op
-          to optimize the model with the loss. This is used in TRAIN mode and
-          must not be None. None is allowed in other modes. If you want to
-          optimize loss yourself you can pass `no_op_train_fn` and then use
-          EstimatorSpec.loss to compute and apply gradients.
+        to optimize the model with the loss. This is used in TRAIN mode and
+        must not be None. None is allowed in other modes. If you want to
+        optimize loss yourself you can pass `no_op_train_fn` and then use
+        EstimatorSpec.loss to compute and apply gradients.
+      regularization_losses: A list of additional scalar losses to be added to
+        the training loss, such as regularization losses.
 
     Returns:
       `EstimatorSpec`.
@@ -365,6 +372,64 @@ def _check_logits_final_dim(logits, expected_logits_dimension):
         return array_ops.identity(logits, name=scope)
 
 
+def _validate_loss_fn_args(loss_fn):
+  """Validates loss_fn arguments.
+
+  Required arguments: labels, logits.
+  Optional arguments: features.
+
+  Args:
+    loss_fn: The loss function.
+  Raises:
+    ValueError: If the signature is unexpected.
+  """
+  loss_fn_args = util.fn_args(loss_fn)
+  for required_arg in ['labels', 'logits']:
+    if required_arg not in loss_fn_args:
+      raise ValueError(
+          'loss_fn must contain argument: {}. '
+          'Given arguments: {}'.format(required_arg, loss_fn_args))
+  invalid_args = list(set(loss_fn_args) - set(['labels', 'logits', 'features']))
+  if invalid_args:
+    raise ValueError('loss_fn has unexpected args: {}'.format(invalid_args))
+
+
+def _call_loss_fn(loss_fn, labels, logits, features, expected_loss_dim=1):
+  """Calls loss_fn and checks the returned shape.
+
+  Args:
+    loss_fn: The loss function.
+    labels: Processed labels Tensor.
+    logits: Logits Tensor of shape [D0, D1, ... DN, logits_dimension].
+    features: Features dict.
+    expected_loss_dim: The expected last dimension of loss Tensor.
+  Returns:
+    Loss Tensor with shape [D0, D1, ... DN, expected_loss_dim].
+  """
+  loss_fn_args = util.fn_args(loss_fn)
+  kwargs = {}
+  if 'features' in loss_fn_args:
+    kwargs['features'] = features
+  with ops.name_scope(
+      None, 'call_loss_fn',
+      values=[labels, logits] + list(six.itervalues(features))):
+    unweighted_loss = loss_fn(labels=labels, logits=logits, **kwargs)
+    logits_shape = array_ops.shape(logits, name='logits_shape')
+    expected_loss_shape = array_ops.concat(
+        [logits_shape[:-1], [expected_loss_dim]], axis=0,
+        name='expected_loss_shape')
+    loss_shape = array_ops.shape(unweighted_loss, name='loss_shape')
+    check_loss_shape_op = control_flow_ops.Assert(
+        math_ops.reduce_all(math_ops.equal(loss_shape, expected_loss_shape)),
+        data=[
+            'loss_fn must return Tensor of shape '
+            '[D0, D1, ... DN, {}]. '.format(expected_loss_dim),
+            'logits_shape: ', logits_shape, 'loss_shape: ', loss_shape],
+        name='check_loss_shape')
+    with ops.control_dependencies([check_loss_shape_op]):
+      return array_ops.identity(unweighted_loss)
+
+
 def _indicator_labels_mean(labels, weights=None, name=None):
   with ops.name_scope(name, 'labels_mean', (labels, weights)) as scope:
     labels = math_ops.to_float(labels, name='labels')
@@ -456,10 +521,13 @@ def _recall_at_threshold(labels, predictions, weights, threshold, name=None):
     return array_ops.squeeze(precision_tensor), array_ops.squeeze(update_op)
 
 
-def _multi_class_head_with_softmax_cross_entropy_loss(n_classes,
-                                                      weight_column=None,
-                                                      label_vocabulary=None,
-                                                      name=None):
+def _multi_class_head_with_softmax_cross_entropy_loss(
+    n_classes,
+    weight_column=None,
+    label_vocabulary=None,
+    loss_reduction=losses.Reduction.SUM,
+    loss_fn=None,
+    name=None):
   """Creates a '_Head' for multi class classification.
 
   The head expects `logits` with shape `[D0, D1, ... DN, n_classes]`.
@@ -477,6 +545,12 @@ def _multi_class_head_with_softmax_cross_entropy_loss(n_classes,
   labels have shape `[batch_size, 1]`, the loss is the weighted sum over
   `batch_size`.
 
+  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
+  `(labels, logits, features)` as arguments and returns unreduced loss with
+  shape `[D0, D1, ... DN, 1]`. `loss_fn` must support integer `labels` with
+  shape `[D0, D1, ... DN, 1]`. Namely, the head applies `label_vocabulary` to
+  the input labels before passing them to `loss_fn`.
+
   Args:
     n_classes: Number of classes, must be greater than 2 (for 2 classes, use
       `_BinaryLogisticHeadWithSigmoidCrossEntropyLoss`).
@@ -489,6 +563,9 @@ def _multi_class_head_with_softmax_cross_entropy_loss(n_classes,
       integer within [0, n_classes). If given, labels must be of string type and
       have any value in `label_vocabulary`. Note that errors will be raised if
       `label_vocabulary` is not provided but labels are strings.
+    loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
+      reduce training loss over batch. Defaults to `SUM`.
+    loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
@@ -496,16 +573,26 @@ def _multi_class_head_with_softmax_cross_entropy_loss(n_classes,
     An instance of `_Head` for multi class classification.
 
   Raises:
-    ValueError: if `n_classes`, `metric_class_ids` or `label_keys` is invalid.
+    ValueError: If `n_classes`, `label_vocabulary` or `loss_reduction` is
+      invalid.
   """
   if label_vocabulary is not None and not isinstance(label_vocabulary,
                                                      (list, tuple)):
     raise ValueError(
         'label_vocabulary should be a list or a tuple. Given type: {}'.format(
             type(label_vocabulary)))
-
-  return _MultiClassHeadWithSoftmaxCrossEntropyLoss(n_classes, weight_column,
-                                                    label_vocabulary, name)
+  if (loss_reduction not in losses.Reduction.all() or
+      loss_reduction == losses.Reduction.NONE):
+    raise ValueError('Invalid loss_reduction: {}'.format(loss_reduction))
+  if loss_fn:
+    _validate_loss_fn_args(loss_fn)
+  return _MultiClassHeadWithSoftmaxCrossEntropyLoss(
+      n_classes=n_classes,
+      weight_column=weight_column,
+      label_vocabulary=label_vocabulary,
+      loss_reduction=loss_reduction,
+      loss_fn=loss_fn,
+      name=name)
 
 
 class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
@@ -515,12 +602,16 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
                n_classes,
                weight_column=None,
                label_vocabulary=None,
+               loss_reduction=losses.Reduction.SUM,
+               loss_fn=None,
                name=None):
     if (n_classes is None) or (n_classes <= 2):
       raise ValueError('n_classes must be > 2: %s.' % n_classes)
     self._n_classes = n_classes
     self._weight_column = weight_column
     self._label_vocabulary = label_vocabulary
+    self._loss_reduction = loss_reduction
+    self._loss_fn = loss_fn
     self._name = name
 
   @property
@@ -531,24 +622,20 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
   def logits_dimension(self):
     return self._n_classes
 
-  def _eval_metric_ops(self, labels, class_ids, weights, weighted_sum_loss,
-                       example_weight_sum):
+  def _eval_metric_ops(
+      self, labels, class_ids, weights, unreduced_loss, regularization_loss):
     """Returns the Eval metric ops."""
     with ops.name_scope(
         None, 'metrics',
-        (labels, class_ids, weights, weighted_sum_loss, example_weight_sum)):
+        (labels, class_ids, weights, unreduced_loss, regularization_loss)):
       keys = metric_keys.MetricKeys
       metric_ops = {
           # Estimator already adds a metric for loss.
           # TODO(xiejw): Any other metrics?
           _summary_key(self._name, keys.LOSS_MEAN):
               metrics_lib.mean(
-                  # Both values and weights here are reduced, scalar Tensors.
-                  # values is the actual mean we want -- weights represents the
-                  # total weight of the batch and is needed to calculate
-                  # update_op over many batches.
-                  values=(weighted_sum_loss / example_weight_sum),
-                  weights=example_weight_sum,
+                  values=unreduced_loss,
+                  weights=weights,
                   name=keys.LOSS_MEAN),
           _summary_key(self._name, keys.ACCURACY):
               metrics_lib.accuracy(
@@ -557,6 +644,11 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
                   weights=weights,
                   name=keys.ACCURACY),
       }
+      if regularization_loss is not None:
+        metric_ops[_summary_key(self._name, keys.LOSS_REGULARIZATION)] = (
+            metrics_lib.mean(
+                values=regularization_loss,
+                name=keys.LOSS_REGULARIZATION))
     return metric_ops
 
   def _label_ids(self, labels):
@@ -582,24 +674,28 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
     labels = _check_dense_labels_match_logits_and_reshape(
         labels=labels, logits=logits, expected_labels_dimension=1)
     label_ids = self._label_ids(labels)
-    unweighted_loss = losses.sparse_softmax_cross_entropy(
-        labels=label_ids, logits=logits, reduction=losses.Reduction.NONE)
-    # Restore the squeezed dim, so unweighted_loss matches the weights shape.
-    unweighted_loss = array_ops.expand_dims(unweighted_loss, axis=-1)
+    if self._loss_fn:
+      unweighted_loss = _call_loss_fn(
+          loss_fn=self._loss_fn, labels=label_ids, logits=logits,
+          features=features, expected_loss_dim=1)
+    else:
+      unweighted_loss = losses.sparse_softmax_cross_entropy(
+          labels=label_ids, logits=logits, reduction=losses.Reduction.NONE)
+      # Restore the squeezed dim, so unweighted_loss matches the weights shape.
+      unweighted_loss = array_ops.expand_dims(unweighted_loss, axis=-1)
     weights = _get_weights_and_check_match_logits(
         features=features, weight_column=self._weight_column, logits=logits)
-    weighted_sum_loss = losses.compute_weighted_loss(
-        unweighted_loss, weights=weights, reduction=losses.Reduction.SUM)
-    # _weights() can return 1.
-    example_weight_sum = math_ops.reduce_sum(
-        weights * array_ops.ones_like(unweighted_loss))
+    training_loss = losses.compute_weighted_loss(
+        unweighted_loss, weights=weights, reduction=self._loss_reduction)
     return LossSpec(
-        weighted_sum_loss=weighted_sum_loss,
-        example_weight_sum=example_weight_sum,
+        training_loss=training_loss,
+        unreduced_loss=unweighted_loss,
+        weights=weights,
         processed_labels=label_ids)
 
   def create_estimator_spec(
-      self, features, mode, logits, labels=None, train_op_fn=None):
+      self, features, mode, logits, labels=None, train_op_fn=None,
+      regularization_losses=None):
     """Returns an `EstimatorSpec`.
 
     Args:
@@ -608,10 +704,16 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
       logits: logits `Tensor` with shape `[D0, D1, ... DN, logits_dimension]`.
         For many applications, the shape is `[batch_size, logits_dimension]`.
       labels: Labels integer or string `Tensor` with shape matching `logits`,
-        namely `[D0, D1, ... DN, 1]`. `labels` is required argument when `mode`
-        equals `TRAIN` or `EVAL`.
+        namely `[D0, D1, ... DN, 1]` or `[D0, D1, ... DN]`. `labels` is
+        required argument when `mode` equals `TRAIN` or `EVAL`.
       train_op_fn: Function that takes a scalar loss `Tensor` and returns
         `train_op`. Required in TRAIN mode.
+      regularization_losses: A list of additional scalar losses to be added to
+        the training loss, such as regularization losses. These losses are
+        usually expressed as a batch average, so for best results users need to
+        set `loss_reduction=SUM_OVER_BATCH_SIZE` or
+        `loss_reduction=SUM_OVER_NONZERO_WEIGHTS` when creating the head to
+        avoid scaling errors.
     Returns:
       `EstimatorSpec`.
     Raises:
@@ -655,40 +757,66 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
                 _PREDICT_SERVING_KEY: export_output.PredictOutput(predictions)
             })
 
-      weighted_sum_loss, example_weight_sum, label_ids = self.create_loss(
+      training_loss, unreduced_loss, weights, label_ids = self.create_loss(
           features=features, mode=mode, logits=logits, labels=labels)
+      if regularization_losses:
+        regularization_loss = math_ops.add_n(regularization_losses)
+        regularized_training_loss = math_ops.add_n(
+            [training_loss, regularization_loss])
+      else:
+        regularization_loss = None
+        regularized_training_loss = training_loss
       # Eval.
       if mode == model_fn.ModeKeys.EVAL:
         return model_fn.EstimatorSpec(
             mode=model_fn.ModeKeys.EVAL,
             predictions=predictions,
-            loss=weighted_sum_loss,
+            loss=regularized_training_loss,
             eval_metric_ops=self._eval_metric_ops(
                 labels=label_ids,
                 class_ids=class_ids,
-                weights=_weights(features, self._weight_column),
-                weighted_sum_loss=weighted_sum_loss,
-                example_weight_sum=example_weight_sum))
+                weights=weights,
+                unreduced_loss=unreduced_loss,
+                regularization_loss=regularization_loss))
 
       # Train.
       if train_op_fn is None:
         raise ValueError('train_op_fn cannot be None.')
+      # Only summarize mean_loss for SUM reduction to preserve backwards
+      # compatibility. Otherwise skip it to avoid unnecessary computation.
+      if self._loss_reduction == losses.Reduction.SUM:
+        example_weight_sum = math_ops.reduce_sum(
+            weights * array_ops.ones_like(unreduced_loss))
+        mean_loss = training_loss / example_weight_sum
+      else:
+        mean_loss = None
     with ops.name_scope(''):
+      keys = metric_keys.MetricKeys
       summary.scalar(
-          _summary_key(self._name, metric_keys.MetricKeys.LOSS),
-          weighted_sum_loss)
-      summary.scalar(
-          _summary_key(self._name, metric_keys.MetricKeys.LOSS_MEAN),
-          weighted_sum_loss / example_weight_sum)
+          _summary_key(self._name, keys.LOSS),
+          regularized_training_loss)
+      if mean_loss is not None:
+        summary.scalar(
+            _summary_key(self._name, keys.LOSS_MEAN),
+            mean_loss)
+      if regularization_loss is not None:
+        summary.scalar(
+            _summary_key(self._name, keys.LOSS_REGULARIZATION),
+            regularization_loss)
     return model_fn.EstimatorSpec(
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
-        loss=weighted_sum_loss,
-        train_op=train_op_fn(weighted_sum_loss))
+        loss=regularized_training_loss,
+        train_op=train_op_fn(regularized_training_loss))
 
 
 def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
-    weight_column=None, thresholds=None, label_vocabulary=None, name=None):
+    weight_column=None,
+    thresholds=None,
+    label_vocabulary=None,
+    loss_reduction=losses.Reduction.SUM,
+    loss_fn=None,
+    name=None):
   """Creates a `_Head` for single label binary classification.
 
   This head uses `sigmoid_cross_entropy_with_logits` loss.
@@ -708,6 +836,12 @@ def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
   labels have shape `[batch_size, 1]`, the loss is the weighted sum over
   `batch_size`.
 
+  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
+  `(labels, logits, features)` as arguments and returns unreduced loss with
+  shape `[D0, D1, ... DN, 1]`. `loss_fn` must support float `labels` with
+  shape `[D0, D1, ... DN, 1]`. Namely, the head applies `label_vocabulary` to
+  the input labels before passing them to `loss_fn`.
+
   Args:
     weight_column: A string or a `_NumericColumn` created by
       `tf.feature_column.numeric_column` defining feature column representing
@@ -723,6 +857,9 @@ def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
       [0, 1]. If given, labels must be string type and have any value in
       `label_vocabulary`. Note that errors will be raised if `label_vocabulary`
       is not provided but labels are strings.
+    loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
+      reduce training loss over batch. Defaults to `SUM`.
+    loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
@@ -730,7 +867,8 @@ def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
     An instance of `_Head` for binary classification.
 
   Raises:
-    ValueError: if `thresholds` contains a value outside of `(0, 1)`.
+    ValueError: If `thresholds` contains a value outside of `(0, 1)`.
+    ValueError: If `loss_reduction` is invalid.
   """
   thresholds = tuple(thresholds) if thresholds else tuple()
   if label_vocabulary is not None and not isinstance(label_vocabulary,
@@ -742,10 +880,17 @@ def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
   for threshold in thresholds:
     if (threshold <= 0.0) or (threshold >= 1.0):
       raise ValueError('thresholds not in (0, 1): {}.'.format((thresholds,)))
+  if (loss_reduction not in losses.Reduction.all() or
+      loss_reduction == losses.Reduction.NONE):
+    raise ValueError('Invalid loss_reduction: {}'.format(loss_reduction))
+  if loss_fn:
+    _validate_loss_fn_args(loss_fn)
   return _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(
       weight_column=weight_column,
       thresholds=thresholds,
       label_vocabulary=label_vocabulary,
+      loss_reduction=loss_reduction,
+      loss_fn=loss_fn,
       name=name)
 
 
@@ -756,10 +901,14 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
                weight_column=None,
                thresholds=None,
                label_vocabulary=None,
+               loss_reduction=losses.Reduction.SUM,
+               loss_fn=None,
                name=None):
     self._weight_column = weight_column
     self._thresholds = thresholds
     self._label_vocabulary = label_vocabulary
+    self._loss_reduction = loss_reduction
+    self._loss_fn = loss_fn
     self._name = name
 
   @property
@@ -771,10 +920,10 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
     return 1
 
   def _eval_metric_ops(self, labels, logits, logistic, class_ids, weights,
-                       weighted_sum_loss, example_weight_sum):
+                       unreduced_loss, regularization_loss):
     with ops.name_scope(None, 'metrics',
                         (labels, logits, logistic, class_ids, weights,
-                         weighted_sum_loss, example_weight_sum)):
+                         unreduced_loss, regularization_loss)):
       keys = metric_keys.MetricKeys
       labels_mean = _indicator_labels_mean(
           labels=labels, weights=weights, name=keys.LABEL_MEAN)
@@ -782,12 +931,8 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
           # Estimator already adds a metric for loss.
           _summary_key(self._name, keys.LOSS_MEAN):
               metrics_lib.mean(
-                  # Both values and weights here are reduced, scalar Tensors.
-                  # values is the actual mean we want -- weights represents the
-                  # total weight of the batch and is needed to calculate
-                  # update_op over many batches.
-                  values=(weighted_sum_loss / example_weight_sum),
-                  weights=example_weight_sum,
+                  values=unreduced_loss,
+                  weights=weights,
                   name=keys.LOSS_MEAN),
           _summary_key(self._name, keys.ACCURACY):
               metrics_lib.accuracy(
@@ -818,6 +963,11 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
                   curve='PR',
                   name=keys.AUC_PR)
       }
+      if regularization_loss is not None:
+        metric_ops[_summary_key(self._name, keys.LOSS_REGULARIZATION)] = (
+            metrics_lib.mean(
+                values=regularization_loss,
+                name=keys.LOSS_REGULARIZATION))
       for threshold in self._thresholds:
         accuracy_key = keys.ACCURACY_AT_THRESHOLD % threshold
         metric_ops[_summary_key(self._name,
@@ -859,23 +1009,49 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
           name='class_id_lookup').lookup(labels)
     labels = math_ops.to_float(labels)
     labels = _assert_range(labels, 2)
-    unweighted_loss = nn.sigmoid_cross_entropy_with_logits(
-        labels=labels, logits=logits)
+    if self._loss_fn:
+      unweighted_loss = _call_loss_fn(
+          loss_fn=self._loss_fn, labels=labels, logits=logits,
+          features=features, expected_loss_dim=1)
+    else:
+      unweighted_loss = nn.sigmoid_cross_entropy_with_logits(
+          labels=labels, logits=logits)
     weights = _get_weights_and_check_match_logits(
         features=features, weight_column=self._weight_column, logits=logits)
-    weighted_sum_loss = losses.compute_weighted_loss(
-        unweighted_loss, weights=weights, reduction=losses.Reduction.SUM)
-    # _weights() can return 1.
-    example_weight_sum = math_ops.reduce_sum(
-        weights * array_ops.ones_like(unweighted_loss))
+    training_loss = losses.compute_weighted_loss(
+        unweighted_loss, weights=weights, reduction=self._loss_reduction)
     return LossSpec(
-        weighted_sum_loss=weighted_sum_loss,
-        example_weight_sum=example_weight_sum,
+        training_loss=training_loss,
+        unreduced_loss=unweighted_loss,
+        weights=weights,
         processed_labels=labels)
 
   def create_estimator_spec(
-      self, features, mode, logits, labels=None, train_op_fn=None):
-    """See `Head`."""
+      self, features, mode, logits, labels=None, train_op_fn=None,
+      regularization_losses=None):
+    """Returns an `EstimatorSpec`.
+
+    Args:
+      features: Input `dict` of `Tensor` or `SparseTensor` objects.
+      mode: Estimator's `ModeKeys`.
+      logits: logits `Tensor` with shape `[D0, D1, ... DN, 1]`. For many
+        applications, the shape is `[batch_size, 1]`.
+      labels: Labels integer or string `Tensor` with shape matching `logits`,
+        namely `[D0, D1, ... DN, 1]` or `[D0, D1, ... DN]`. `labels` is required
+        argument when `mode` equals `TRAIN` or `EVAL`.
+      train_op_fn: Function that takes a scalar loss `Tensor` and returns
+        `train_op`. Required in TRAIN mode.
+      regularization_losses: A list of additional scalar losses to be added to
+        the training loss, such as regularization losses. These losses are
+        usually expressed as a batch average, so for best results users need to
+        set `loss_reduction=SUM_OVER_BATCH_SIZE` or
+        `loss_reduction=SUM_OVER_NONZERO_WEIGHTS` when creating the head to
+        avoid scaling errors.
+    Returns:
+      `EstimatorSpec`.
+    Raises:
+      ValueError: If `train_op_fn` is `None` in TRAIN mode.
+    """
     # Predict.
     with ops.name_scope(self._name, 'head'):
       with ops.name_scope(None, 'predictions', (logits,)):
@@ -919,47 +1095,68 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
                 _PREDICT_SERVING_KEY: export_output.PredictOutput(predictions)
             })
 
-      (weighted_sum_loss, example_weight_sum,
-       processed_labels) = self.create_loss(
-           features=features, mode=mode, logits=logits, labels=labels)
+      (training_loss, unreduced_loss, weights, processed_labels) = (
+          self.create_loss(
+              features=features, mode=mode, logits=logits, labels=labels))
+      if regularization_losses:
+        regularization_loss = math_ops.add_n(regularization_losses)
+        regularized_training_loss = math_ops.add_n(
+            [training_loss, regularization_loss])
+      else:
+        regularization_loss = None
+        regularized_training_loss = training_loss
 
       # Eval.
       if mode == model_fn.ModeKeys.EVAL:
-        weights = _get_weights_and_check_match_logits(
-            features=features, weight_column=self._weight_column, logits=logits)
         return model_fn.EstimatorSpec(
             mode=model_fn.ModeKeys.EVAL,
             predictions=predictions,
-            loss=weighted_sum_loss,
+            loss=regularized_training_loss,
             eval_metric_ops=self._eval_metric_ops(
                 labels=processed_labels,
                 logits=logits,
                 logistic=logistic,
                 class_ids=class_ids,
                 weights=weights,
-                weighted_sum_loss=weighted_sum_loss,
-                example_weight_sum=example_weight_sum))
+                unreduced_loss=unreduced_loss,
+                regularization_loss=regularization_loss))
 
       # Train.
       if train_op_fn is None:
         raise ValueError('train_op_fn can not be None.')
+      # Only summarize mean_loss for SUM reduction to preserve backwards
+      # compatibility. Otherwise skip it to avoid unnecessary computation.
+      if self._loss_reduction == losses.Reduction.SUM:
+        example_weight_sum = math_ops.reduce_sum(
+            weights * array_ops.ones_like(unreduced_loss))
+        mean_loss = training_loss / example_weight_sum
+      else:
+        mean_loss = None
     with ops.name_scope(''):
+      keys = metric_keys.MetricKeys
       summary.scalar(
-          _summary_key(self._name, metric_keys.MetricKeys.LOSS),
-          weighted_sum_loss)
-      summary.scalar(
-          _summary_key(self._name, metric_keys.MetricKeys.LOSS_MEAN),
-          weighted_sum_loss / example_weight_sum)
+          _summary_key(self._name, keys.LOSS),
+          regularized_training_loss)
+      if mean_loss is not None:
+        summary.scalar(
+            _summary_key(self._name, keys.LOSS_MEAN), mean_loss)
+      if regularization_loss is not None:
+        summary.scalar(
+            _summary_key(self._name, keys.LOSS_REGULARIZATION),
+            regularization_loss)
     return model_fn.EstimatorSpec(
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
-        loss=weighted_sum_loss,
-        train_op=train_op_fn(weighted_sum_loss))
+        loss=regularized_training_loss,
+        train_op=train_op_fn(regularized_training_loss))
 
 
-def _regression_head_with_mean_squared_error_loss(weight_column=None,
-                                                  label_dimension=1,
-                                                  name=None):
+def _regression_head_with_mean_squared_error_loss(
+    weight_column=None,
+    label_dimension=1,
+    loss_reduction=losses.Reduction.SUM,
+    loss_fn=None,
+    name=None):
   """Creates a `_Head` for regression using the `mean_squared_error` loss.
 
   The loss is the weighted sum over all input dimensions. Namely, if the input
@@ -977,6 +1174,10 @@ def _regression_head_with_mean_squared_error_loss(weight_column=None,
   `[D0, D1, ... DN]`, `[D0, D1, ... DN, 1]` or
   `[D0, D1, ... DN, label_dimension]`.
 
+  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
+  `(labels, logits, features)` as arguments and returns unreduced loss with
+  shape `[D0, D1, ... DN, label_dimension]`.
+
   Args:
     weight_column: A string or a `_NumericColumn` created by
       `tf.feature_column.numeric_column` defining feature column representing
@@ -985,27 +1186,48 @@ def _regression_head_with_mean_squared_error_loss(weight_column=None,
     label_dimension: Number of regression labels per example. This is the size
       of the last dimension of the labels `Tensor` (typically, this has shape
       `[batch_size, label_dimension]`).
+    loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
+      reduce training loss over batch. Defaults to `SUM`.
+    loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
   Returns:
     An instance of `_Head` for linear regression.
+
+  Raises:
+    ValueError: If `label_dimension` or `loss_reduction` is invalid.
   """
+  if (loss_reduction not in losses.Reduction.all() or
+      loss_reduction == losses.Reduction.NONE):
+    raise ValueError('Invalid loss_reduction: {}'.format(loss_reduction))
+  if loss_fn:
+    _validate_loss_fn_args(loss_fn)
   return _RegressionHeadWithMeanSquaredErrorLoss(
       weight_column=weight_column,
       label_dimension=label_dimension,
+      loss_reduction=loss_reduction,
+      loss_fn=loss_fn,
       name=name)
 
 
 class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
   """`Head` for regression using the mean squared loss."""
 
-  def __init__(self, label_dimension, weight_column=None, name=None):
+  def __init__(
+      self,
+      label_dimension,
+      weight_column=None,
+      loss_reduction=losses.Reduction.SUM,
+      loss_fn=None,
+      name=None):
     """`Head` for regression."""
     if label_dimension < 1:
       raise ValueError('Invalid label_dimension %s.' % label_dimension)
     self._logits_dimension = label_dimension
     self._weight_column = weight_column
+    self._loss_reduction = loss_reduction
+    self._loss_fn = loss_fn
     self._name = name
 
   @property
@@ -1024,23 +1246,27 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
         labels=labels, logits=logits,
         expected_labels_dimension=self._logits_dimension)
     labels = math_ops.to_float(labels)
-    unweighted_loss = losses.mean_squared_error(
-        labels=labels, predictions=logits, reduction=losses.Reduction.NONE)
+    if self._loss_fn:
+      unweighted_loss = _call_loss_fn(
+          loss_fn=self._loss_fn, labels=labels, logits=logits,
+          features=features, expected_loss_dim=self._logits_dimension)
+    else:
+      unweighted_loss = losses.mean_squared_error(
+          labels=labels, predictions=logits, reduction=losses.Reduction.NONE)
     weights = _get_weights_and_check_match_logits(
         features=features, weight_column=self._weight_column, logits=logits,
         allow_per_logit_weights=True)
-    weighted_sum_loss = losses.compute_weighted_loss(
-        unweighted_loss, weights=weights, reduction=losses.Reduction.SUM)
-    # _weights() can return 1.
-    example_weight_sum = math_ops.reduce_sum(
-        weights * array_ops.ones_like(unweighted_loss))
+    training_loss = losses.compute_weighted_loss(
+        unweighted_loss, weights=weights, reduction=self._loss_reduction)
     return LossSpec(
-        weighted_sum_loss=weighted_sum_loss,
-        example_weight_sum=example_weight_sum,
+        training_loss=training_loss,
+        unreduced_loss=unweighted_loss,
+        weights=weights,
         processed_labels=labels)
 
   def create_estimator_spec(
-      self, features, mode, logits, labels=None, train_op_fn=None):
+      self, features, mode, logits, labels=None, train_op_fn=None,
+      regularization_losses=None):
     """Returns an `EstimatorSpec`.
 
     Args:
@@ -1054,6 +1280,12 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
         `mode` equals `TRAIN` or `EVAL`.
       train_op_fn: Function that takes a scalar loss `Tensor` and returns
         `train_op`. Required in TRAIN mode.
+      regularization_losses: A list of additional scalar losses to be added to
+        the training loss, such as regularization losses. These losses are
+        usually expressed as a batch average, so for best results users need to
+        set `loss_reduction=SUM_OVER_BATCH_SIZE` or
+        `loss_reduction=SUM_OVER_NONZERO_WEIGHTS` when creating the head to
+        avoid scaling errors.
     Returns:
       `EstimatorSpec`.
     Raises:
@@ -1074,43 +1306,66 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
                 _PREDICT_SERVING_KEY: export_output.PredictOutput(predictions)
             })
 
-      weighted_sum_loss, example_weight_sum, _ = self.create_loss(
+      training_loss, unreduced_loss, weights, _ = self.create_loss(
           features=features, mode=mode, logits=logits, labels=labels)
+      if regularization_losses:
+        regularization_loss = math_ops.add_n(regularization_losses)
+        regularized_training_loss = math_ops.add_n(
+            [training_loss, regularization_loss])
+      else:
+        regularization_loss = None
+        regularized_training_loss = training_loss
 
       # Eval.
       if mode == model_fn.ModeKeys.EVAL:
+        keys = metric_keys.MetricKeys
         # Estimator already adds a metric for loss.
         eval_metric_ops = {
-            _summary_key(self._name, metric_keys.MetricKeys.LOSS_MEAN):
+            _summary_key(self._name, keys.LOSS_MEAN):
                 metrics_lib.mean(
-                    # Both values and weights here are reduced, scalar Tensors.
-                    # values is the actual mean we want -- weights represents
-                    # the total weight of the batch and is needed to calculate
-                    # update_op over many batches.
-                    values=(weighted_sum_loss / example_weight_sum),
-                    weights=example_weight_sum)
+                    values=unreduced_loss,
+                    weights=weights)
         }
+        if regularization_loss is not None:
+          regularization_loss_key = _summary_key(
+              self._name, keys.LOSS_REGULARIZATION)
+          eval_metric_ops[regularization_loss_key] = metrics_lib.mean(
+              values=regularization_loss,
+              name=keys.LOSS_REGULARIZATION)
         return model_fn.EstimatorSpec(
             mode=model_fn.ModeKeys.EVAL,
             predictions=predictions,
-            loss=weighted_sum_loss,
+            loss=regularized_training_loss,
             eval_metric_ops=eval_metric_ops)
 
       # Train.
       if train_op_fn is None:
         raise ValueError('train_op_fn can not be None.')
+      # Only summarize mean_loss for SUM reduction to preserve backwards
+      # compatibility. Otherwise skip it to avoid unnecessary computation.
+      if self._loss_reduction == losses.Reduction.SUM:
+        example_weight_sum = math_ops.reduce_sum(
+            weights * array_ops.ones_like(unreduced_loss))
+        mean_loss = training_loss / example_weight_sum
+      else:
+        mean_loss = None
     with ops.name_scope(''):
+      keys = metric_keys.MetricKeys
       summary.scalar(
-          _summary_key(self._name, metric_keys.MetricKeys.LOSS),
-          weighted_sum_loss)
-      summary.scalar(
-          _summary_key(self._name, metric_keys.MetricKeys.LOSS_MEAN),
-          weighted_sum_loss / example_weight_sum)
+          _summary_key(self._name, keys.LOSS),
+          regularized_training_loss)
+      if mean_loss is not None:
+        summary.scalar(
+            _summary_key(self._name, keys.LOSS_MEAN), mean_loss)
+      if regularization_loss is not None:
+        summary.scalar(
+            _summary_key(self._name, keys.LOSS_REGULARIZATION),
+            regularization_loss)
     return model_fn.EstimatorSpec(
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
-        loss=weighted_sum_loss,
-        train_op=train_op_fn(weighted_sum_loss))
+        loss=regularized_training_loss,
+        train_op=train_op_fn(regularized_training_loss))
 
 
 def _assert_range(labels, n_classes, message=None):
diff --git a/tensorflow/python/estimator/canned/head_test.py b/tensorflow/python/estimator/canned/head_test.py
index f3afd84125d8758fec61d9afc08a64a0210c1f6d..3a03770af498981a054c3df9155e83a60c7f0350 100644
--- a/tensorflow/python/estimator/canned/head_test.py
+++ b/tensorflow/python/estimator/canned/head_test.py
@@ -39,6 +39,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import string_ops
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.training import monitored_session
@@ -100,6 +101,51 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
       head_lib._multi_class_head_with_softmax_cross_entropy_loss(
           n_classes=2)
 
+  def test_invalid_loss_reduction(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'Invalid loss_reduction: invalid_loss_reduction'):
+      head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+          n_classes=3, loss_reduction='invalid_loss_reduction')
+    with self.assertRaisesRegexp(
+        ValueError, r'Invalid loss_reduction: none'):
+      head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+          n_classes=3, loss_reduction=losses.Reduction.NONE)
+
+  def test_loss_fn_arg_labels_missing(self):
+    def _loss_fn(logits):
+      del logits  # Unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn must contain argument: labels\. '
+        r'Given arguments: \(\'logits\',\)'):
+      head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+          n_classes=3, loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_logits_missing(self):
+    def _loss_fn(labels):
+      del labels  # unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn must contain argument: logits\. '
+        r'Given arguments: \(\'labels\',\)'):
+      head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+          n_classes=3, loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_features_ok(self):
+    def _loss_fn(labels, logits, features):
+      del labels, logits, features  # Unused
+    head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3, loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_invalid(self):
+    def _loss_fn(labels, logits, name=None):
+      del labels, logits, name  # Unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn has unexpected args: \[\'name\'\]'):
+      head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+          n_classes=3, loss_fn=_loss_fn)
+
   def test_invalid_logits_shape(self):
     n_classes = 3
     head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
@@ -149,7 +195,7 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
     # Dynamic shape.
     labels_placeholder = array_ops.placeholder(dtype=dtypes.int64)
     logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    weighted_sum_loss = head.create_loss(
+    training_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.EVAL,
         logits=logits_placeholder,
@@ -158,7 +204,7 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
           r'\[expected_labels_shape: \] \[2 1\] \[labels_shape: \] \[2 2\]'):
-        weighted_sum_loss.eval({
+        training_loss.eval({
             logits_placeholder: logits_2x3,
             labels_placeholder: labels_2x2
         })
@@ -203,21 +249,21 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
 
     labels_placeholder = array_ops.placeholder(dtype=dtypes.int64)
     logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    weighted_sum_loss = head.create_loss(
+    training_loss = head.create_loss(
         features={'x': np.array(((42.,),))},
         mode=model_fn.ModeKeys.EVAL,
         logits=logits_placeholder,
         labels=labels_placeholder)[0]
     with self.test_session():
       with self.assertRaisesOpError('Label IDs must < n_classes'):
-        weighted_sum_loss.eval({
+        training_loss.eval({
             labels_placeholder: labels_2x1_with_large_id,
             logits_placeholder: logits_2x3
         })
 
     with self.test_session():
       with self.assertRaisesOpError('Label IDs must >= 0'):
-        weighted_sum_loss.eval({
+        training_loss.eval({
             labels_placeholder: labels_2x1_with_negative_id,
             logits_placeholder: logits_2x3
         })
@@ -264,7 +310,7 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
     # Dynamic shape.
     labels_placeholder = array_ops.placeholder(dtype=dtypes.int64)
     logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    weighted_sum_loss = head.create_loss(
+    training_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.EVAL,
         logits=logits_placeholder,
@@ -273,7 +319,7 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
           r'\[expected_labels_shape: \] \[2 1\] \[labels_shape: \] \[3 1\]'):
-        weighted_sum_loss.eval({
+        training_loss.eval({
             labels_placeholder: values_3x1,
             logits_placeholder: values_2x3
         })
@@ -383,9 +429,9 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
     labels = np.array(((1,), (1,)), dtype=np.int64)
     features = {'x': np.array(((42,),), dtype=np.int32)}
     # loss = cross_entropy(labels, logits) = [10, 0].
-    expected_weighted_sum_loss = 10.
+    expected_training_loss = 10.
     # Create loss.
-    weighted_sum_loss = head.create_loss(
+    training_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
@@ -393,10 +439,57 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_weighted_sum_loss,
-          weighted_sum_loss.eval(),
-          rtol=1e-2,
-          atol=1e-2)
+          expected_training_loss, training_loss.eval(), rtol=1e-2, atol=1e-2)
+
+  def test_eval_create_loss_loss_fn(self):
+    """Tests head.create_loss for eval mode and custom loss_fn."""
+    loss = np.array([[1.], [2.]], dtype=np.float32)
+    logits_input = np.array([[-10., 10., 0.], [-15., 10., 0]], dtype=np.float32)
+    labels_input = np.array([[1], [2]], dtype=np.int64)
+    def _loss_fn(labels, logits):
+      check_labels = control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(labels, labels_input)),
+          data=[labels])
+      check_logits = control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(logits, logits_input)),
+          data=[logits])
+      with ops.control_dependencies([check_labels, check_logits]):
+        return constant_op.constant(loss)
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3, loss_fn=_loss_fn)
+
+    actual_training_loss = head.create_loss(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits_input,
+        labels=labels_input)[0]
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      self.assertAllClose(np.sum(loss), actual_training_loss.eval())
+
+  def test_eval_create_loss_loss_fn_wrong_shape(self):
+    """Tests custom loss_fn that returns Tensor of unexpected shape."""
+    loss = np.array([1., 2.], dtype=np.float32)
+    def _loss_fn(labels, logits):
+      del labels, logits  # Unused
+      return constant_op.constant(loss)
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3, loss_fn=_loss_fn)
+
+    logits = np.array([[-10., 10., 0.], [-15., 10., 0.]], dtype=np.float32)
+    labels = np.array([[1], [2]], dtype=np.int64)
+    actual_training_loss = head.create_loss(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=labels)[0]
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[loss_fn must return Tensor of shape \[D0, D1, ... DN, 1\]\. \] '
+          r'\[logits_shape: \] \[2 3\] \[loss_shape: \] \[2\]'):
+        actual_training_loss.eval()
 
   def test_eval_labels_none(self):
     """Tests that error is raised when labels is None."""
@@ -476,6 +569,52 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
     ]
     self.assertItemsEqual(expected_metric_keys, spec.eval_metric_ops.keys())
 
+  def test_eval_with_regularization_losses(self):
+    n_classes = 3
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes, loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+    logits = np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32)
+    labels = np.array(((1,), (1,)), dtype=np.int64)
+    features = {'x': np.array(((42,),), dtype=np.int32)}
+    regularization_losses = [1.5, 0.5]
+    expected_regularization_loss = 2.
+    # unregularized_loss = sum(cross_entropy(labels, logits)) / batch_size
+    #                    = sum(10, 0) / 2 = 5.
+    expected_unregularized_loss = 5.
+    expected_regularized_loss = (
+        expected_unregularized_loss + expected_regularization_loss)
+    # Create estimator spec.
+    spec = head.create_estimator_spec(
+        features=features,
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=labels,
+        regularization_losses=regularization_losses)
+
+    keys = metric_keys.MetricKeys
+    expected_metrics = {
+        keys.LOSS_MEAN: expected_unregularized_loss,
+        keys.LOSS_REGULARIZATION: expected_regularization_loss,
+        keys.ACCURACY: 0.5,  # 1 of 2 labels is correct.
+    }
+
+    # Assert predictions, loss, and metrics.
+    tol = 1e-2
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNone(spec.scaffold.summary_op)
+      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
+      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
+      loss, metrics = sess.run((spec.loss, update_ops))
+      self.assertAllClose(expected_regularized_loss, loss, rtol=tol, atol=tol)
+      # Check results of both update (in `metrics`) and value ops.
+      self.assertAllClose(expected_metrics, metrics, rtol=tol, atol=tol)
+      self.assertAllClose(
+          expected_metrics, {k: value_ops[k].eval()
+                             for k in value_ops},
+          rtol=tol,
+          atol=tol)
+
   def test_eval_with_label_vocabulary_create_loss(self):
     n_classes = 3
     head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
@@ -484,8 +623,8 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
     labels = [[b'iroh'], [b'iroh']]
     features = {'x': np.array(((42,),), dtype=np.int32)}
     # loss = cross_entropy(labels, logits) = [10, 0].
-    expected_weighted_sum_loss = 10.
-    weighted_sum_loss = head.create_loss(
+    expected_training_loss = 10.
+    training_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
@@ -493,10 +632,7 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_weighted_sum_loss,
-          weighted_sum_loss.eval(),
-          rtol=1e-2,
-          atol=1e-2)
+          expected_training_loss, training_loss.eval(), rtol=1e-2, atol=1e-2)
 
   def test_eval_with_label_vocabulary(self):
     n_classes = 3
@@ -584,27 +720,61 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
           rtol=tol, atol=tol)
 
   def test_train_create_loss(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3)
 
     logits = np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32)
     labels = np.array(((1,), (1,)), dtype=np.int64)
     features = {'x': np.array(((42,),), dtype=np.int32)}
 
-    # loss = cross_entropy(labels, logits) = [10, 0].
-    expected_weighted_sum_loss = 10.
-    weighted_sum_loss = head.create_loss(
+    # unreduced_loss = cross_entropy(labels, logits) = [10, 0].
+    expected_unreduced_loss = [[10.], [0.]]
+    # Weights default to 1.
+    expected_weights = 1.
+    # training_loss = 1 * 10 + 1 * 0
+    expected_training_loss = 10.
+    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
-        labels=labels)[0]
+        labels=labels)
+    tol = 1e-2
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      self.assertAllClose(
+          expected_training_loss, training_loss.eval(), rtol=tol, atol=tol)
+      self.assertAllClose(
+          expected_unreduced_loss, unreduced_loss.eval(), rtol=tol, atol=tol)
+      self.assertAllClose(expected_weights, actual_weights)
+
+  def test_train_create_loss_loss_reduction(self):
+    """Tests create_loss with loss_reduction."""
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3, loss_reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+
+    logits = np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32)
+    labels = np.array(((1,), (1,)), dtype=np.int64)
+    features = {'x': np.array(((42,),), dtype=np.int32)}
+
+    # unreduced_loss = cross_entropy(labels, logits) = [10, 0].
+    expected_unreduced_loss = [[10.], [0.]]
+    # Weights default to 1.
+    expected_weights = 1.
+    # training_loss = 1 * 10 + 1 * 0 / num_nonzero_weights
+    expected_training_loss = 10. / 2.
+    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
+        features=features,
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels)
+    tol = 1e-2
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_weighted_sum_loss,
-          weighted_sum_loss.eval(),
-          rtol=1e-2,
-          atol=1e-2)
+          expected_training_loss, training_loss.eval(), rtol=tol, atol=tol)
+      self.assertAllClose(
+          expected_unreduced_loss, unreduced_loss.eval(), rtol=tol, atol=tol)
+      self.assertAllClose(expected_weights, actual_weights)
 
   def test_train_labels_none(self):
     """Tests that error is raised when labels is None."""
@@ -702,10 +872,55 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
               expected_loss / 2,
       }, summary_str, tol)
 
-  def test_train_with_one_dim_label_and_weights_create_loss(self):
+  def test_train_with_regularization_losses(self):
     n_classes = 3
     head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes, weight_column='label_weights')
+        n_classes, loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+
+    logits = np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32)
+    labels = np.array(((1,), (1,)), dtype=np.int64)
+    features = {'x': np.array(((42,),), dtype=np.int32)}
+    expected_train_result = 'my_train_op'
+    def _train_op_fn(loss):
+      return string_ops.string_join(
+          [constant_op.constant(expected_train_result),
+           string_ops.as_string(loss, precision=2)])
+
+    regularization_losses = [1.5, 0.5]
+    expected_regularization_loss = 2.
+    # unregularized_loss = sum(cross_entropy(labels, logits)) / batch_size
+    #                    = sum(10, 0) / 2 = 5.
+    # loss = unregularized_loss + regularization_loss = 7.
+    expected_loss = 7.
+    spec = head.create_estimator_spec(
+        features=features,
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_train_op_fn,
+        regularization_losses=regularization_losses)
+
+    # Assert predictions, loss, train_op, and summaries.
+    tol = 1e-2
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNotNone(spec.scaffold.summary_op)
+      loss, train_result, summary_str = sess.run((spec.loss, spec.train_op,
+                                                  spec.scaffold.summary_op))
+      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
+      self.assertEqual(
+          six.b('{0:s}{1:.2f}'.format(expected_train_result, expected_loss)),
+          train_result)
+      _assert_simple_summaries(self, {
+          metric_keys.MetricKeys.LOSS: expected_loss,
+          metric_keys.MetricKeys.LOSS_REGULARIZATION: (
+              expected_regularization_loss),
+      }, summary_str, tol)
+
+  def test_train_one_dim_create_loss(self):
+    """Tests create_loss with 1D labels and weights (shape [batch_size])."""
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3, weight_column='label_weights')
 
     logits = np.array(((10, 0, 0), (0, 10, 0), (0, 0, 10),), dtype=np.float32)
     labels_rank_1 = np.array((1, 2, 2,), dtype=np.int64)
@@ -715,33 +930,30 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
         'label_weights': weights_rank_1
     }
 
-    # loss = cross_entropy(labels, logits) = [10, 10, 0].
-    # weighted sum loss = 1 * 10 + 2 * 10 + 3 * 0 = 30.
-    expected_weighted_sum_loss = 30.
-    # example weight sum = 1 + 2 + 3
-    expected_example_weight_sum = 6.
-    weighted_sum_loss, example_weight_sum, _ = head.create_loss(
+    # unreduced_loss = cross_entropy(labels, logits) = [10, 10, 0].
+    expected_unreduced_loss = [[10.], [10.], [0.]]
+    # weights are reshaped to [3, 1] to match logits.
+    expected_weights = [[1.], [2.], [3.]]
+    # training_loss = 1 * 10 + 2 * 10 + 3 * 0 = 30.
+    expected_training_loss = 30.
+    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
         labels=labels_rank_1)
+    tol = 1e-2
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_weighted_sum_loss,
-          weighted_sum_loss.eval(),
-          rtol=1e-2,
-          atol=1e-2)
+          expected_training_loss, training_loss.eval(), rtol=tol, atol=tol)
       self.assertAllClose(
-          expected_example_weight_sum,
-          example_weight_sum.eval(),
-          rtol=1e-2,
-          atol=1e-2)
+          expected_unreduced_loss, unreduced_loss.eval(), rtol=tol, atol=tol)
+      self.assertAllClose(expected_weights, actual_weights.eval())
 
-  def test_train_with_one_dim_label_and_weights(self):
-    n_classes = 3
+  def test_train_one_dim(self):
+    """Tests train with 1D labels and weights (shape [batch_size])."""
     head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes, weight_column='label_weights')
+        n_classes=3, weight_column='label_weights')
 
     logits = np.array(((10, 0, 0), (0, 10, 0), (0, 0, 10),), dtype=np.float32)
     labels_rank_1 = np.array((1, 2, 2,), dtype=np.int64)
@@ -803,8 +1015,8 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
     labels = [[b'iroh'], [b'iroh']]
     features = {'x': np.array(((42,),), dtype=np.int32)}
     # loss = cross_entropy(labels, logits) = [10, 0].
-    expected_weighted_sum_loss = 10.
-    weighted_sum_loss = head.create_loss(
+    expected_training_loss = 10.
+    training_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
@@ -812,10 +1024,7 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_weighted_sum_loss,
-          weighted_sum_loss.eval(),
-          rtol=1e-2,
-          atol=1e-2)
+          expected_training_loss, training_loss.eval(), rtol=1e-2, atol=1e-2)
 
   def test_train_with_vocabulary(self):
     n_classes = 3
@@ -909,22 +1118,25 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
     labels = np.array([[[0], [1]], [[1], [2]]], dtype=np.int64)
     weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
 
-    # loss = cross_entropy(labels, logits) = [[0, 12], [0, 15]].
-    # weighted_sum_loss = 1*0 + 1.5*12 + 2*0 + 2.5*15 = 55.5
-    expected_weighted_sum_loss = 55.5
-    expected_example_weight_sum = np.sum(weights)
-    weighted_sum_loss, example_weight_sum, _ = head.create_loss(
+    # unreduced_loss = cross_entropy(labels, logits) = [[0, 12], [0, 15]].
+    expected_unreduced_loss = [[[0.], [12.]], [[0.], [15.]]]
+    # weights are reshaped to [2, 2, 1] to match logits.
+    expected_weights = [[[1.], [1.5]], [[2.], [2.5]]]
+    # training_loss = 1*0 + 1.5*12 + 2*0 + 2.5*15 = 55.5
+    expected_training_loss = 55.5
+    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
         features={'weights': weights},
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
         labels=labels)
+    tol = 1e-2
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_weighted_sum_loss, weighted_sum_loss.eval(),
-          rtol=1e-2, atol=1e-2)
+          expected_training_loss, training_loss.eval(), rtol=tol, atol=tol)
       self.assertAllClose(
-          expected_example_weight_sum, example_weight_sum.eval())
+          expected_unreduced_loss, unreduced_loss.eval(), rtol=tol, atol=tol)
+      self.assertAllClose(expected_weights, actual_weights.eval())
 
   def test_multi_dim_weighted_train(self):
     """Logits of shape [2, 2, 2], labels [2, 2, 1], weights [2, 2]."""
@@ -1067,6 +1279,51 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
       head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
           thresholds=(0.5, 1.))
 
+  def test_invalid_loss_reduction(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'Invalid loss_reduction: invalid_loss_reduction'):
+      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+          loss_reduction='invalid_loss_reduction')
+    with self.assertRaisesRegexp(
+        ValueError, r'Invalid loss_reduction: none'):
+      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+          loss_reduction=losses.Reduction.NONE)
+
+  def test_loss_fn_arg_labels_missing(self):
+    def _loss_fn(logits):
+      del logits  # Unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn must contain argument: labels\. '
+        r'Given arguments: \(\'logits\',\)'):
+      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+          loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_logits_missing(self):
+    def _loss_fn(labels):
+      del labels  # unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn must contain argument: logits\. '
+        r'Given arguments: \(\'labels\',\)'):
+      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+          loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_features_ok(self):
+    def _loss_fn(labels, logits, features):
+      del labels, logits, features  # Unused
+      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+          loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_invalid(self):
+    def _loss_fn(labels, logits, name=None):
+      del labels, logits, name  # Unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn has unexpected args: \[\'name\'\]'):
+      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+          loss_fn=_loss_fn)
+
   def test_invalid_logits_shape(self):
     head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
     self.assertEqual(1, head.logits_dimension)
@@ -1112,7 +1369,7 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     # Dynamic shape.
     labels_placeholder = array_ops.placeholder(dtype=dtypes.float32)
     logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    weighted_sum_loss = head.create_loss(
+    training_loss = head.create_loss(
         features={'x': np.array(((42.,),))},
         mode=model_fn.ModeKeys.EVAL,
         logits=logits_placeholder,
@@ -1121,7 +1378,7 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
           r'\[expected_labels_shape: \] \[2 1\] \[labels_shape: \] \[2 2\]'):
-        weighted_sum_loss.eval({
+        training_loss.eval({
             logits_placeholder: logits_2x1,
             labels_placeholder: labels_2x2
         })
@@ -1153,7 +1410,7 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     # Dynamic shape.
     labels_placeholder = array_ops.placeholder(dtype=dtypes.float32)
     logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    weighted_sum_loss = head.create_loss(
+    training_loss = head.create_loss(
         features={'x': values_2x1},
         mode=model_fn.ModeKeys.EVAL,
         logits=logits_placeholder,
@@ -1162,7 +1419,7 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
           r'\[expected_labels_shape: \] \[3 1\] \[labels_shape: \] \[2 1\]'):
-        weighted_sum_loss.eval({
+        training_loss.eval({
             labels_placeholder: values_2x1,
             logits_placeholder: values_3x1
         })
@@ -1170,7 +1427,7 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
           r'\[expected_labels_shape: \] \[2 1\] \[labels_shape: \] \[3 1\]'):
-        weighted_sum_loss.eval({
+        training_loss.eval({
             labels_placeholder: values_3x1,
             logits_placeholder: values_2x1
         })
@@ -1254,9 +1511,9 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     features = {'x': np.array(((42,),), dtype=np.int32)}
 
     # loss = cross_entropy(labels, logits) = [0, 41].
-    expected_weighted_sum_loss = 41.
+    expected_training_loss = 41.
     # Create loss.
-    weighted_sum_loss = head.create_loss(
+    training_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
@@ -1264,10 +1521,7 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_weighted_sum_loss,
-          weighted_sum_loss.eval(),
-          rtol=1e-2,
-          atol=1e-2)
+          expected_training_loss, training_loss.eval(), rtol=1e-2, atol=1e-2)
 
   def test_eval_labels_none(self):
     """Tests that error is raised when labels is None."""
@@ -1351,6 +1605,53 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     ]
     self.assertItemsEqual(expected_metric_keys, spec.eval_metric_ops.keys())
 
+  def test_eval_with_regularization_losses(self):
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+    logits = np.array(((45,), (-41,),), dtype=np.float32)
+    labels = np.array(((1,), (1,),), dtype=np.int32)
+    features = {'x': np.array(((42,),), dtype=np.int32)}
+    regularization_losses = [1.5, 0.5]
+    expected_regularization_loss = 2.
+    # unregularized_loss = sum(cross_entropy(labels, logits)) / batch_size
+    #                    = sum(0, 41) / 2 = 20.5
+    expected_unregularized_loss = 20.5
+    expected_regularized_loss = (
+        expected_unregularized_loss + expected_regularization_loss)
+
+    # Create estimator spec.
+    spec = head.create_estimator_spec(
+        features=features,
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=labels,
+        regularization_losses=regularization_losses)
+
+    keys = metric_keys.MetricKeys
+    expected_metrics = {
+        keys.LOSS_MEAN: expected_unregularized_loss,
+        keys.LOSS_REGULARIZATION: expected_regularization_loss,
+        keys.ACCURACY: 1./2,
+        keys.PREDICTION_MEAN: 1./2,
+        keys.LABEL_MEAN: 2./2,
+        keys.ACCURACY_BASELINE: 2./2,
+        keys.AUC: 0.,
+        keys.AUC_PR: 1.,
+    }
+
+    # Assert predictions, loss, and metrics.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNone(spec.scaffold.summary_op)
+      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
+      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
+      loss, metrics = sess.run((spec.loss, update_ops))
+      self.assertAllClose(expected_regularized_loss, loss)
+      # Check results of both update (in `metrics`) and value ops.
+      self.assertAllClose(expected_metrics, metrics)
+      self.assertAllClose(
+          expected_metrics, {k: value_ops[k].eval() for k in value_ops})
+
   def test_eval_with_vocabulary_list_create_loss(self):
     head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
         label_vocabulary=['aang', 'iroh'])
@@ -1358,14 +1659,14 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     labels = [[b'iroh'], [b'iroh']]
     features = {'x': np.array(((42,),), dtype=np.int32)}
     # Create loss.
-    weighted_sum_loss = head.create_loss(
+    training_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
         labels=labels)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(41., weighted_sum_loss.eval())
+      self.assertAllClose(41., training_loss.eval())
 
   def test_eval_with_vocabulary_list(self):
     head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
@@ -1401,9 +1702,9 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     # loss = -ln(probabilities[label[i]])) = [-ln(0.269), -ln(0.731)]
     #      = [1.31304389, 0.31334182]
     # weighted sum loss = 1.62638571
-    expected_weighted_sum_loss = 1.62638571
+    expected_training_loss = 1.62638571
     # Create loss.
-    weighted_sum_loss = head.create_loss(
+    training_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
@@ -1411,10 +1712,7 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_weighted_sum_loss,
-          weighted_sum_loss.eval(),
-          rtol=1e-2,
-          atol=1e-2)
+          expected_training_loss, training_loss.eval(), rtol=1e-2, atol=1e-2)
 
   def test_eval_with_thresholds(self):
     thresholds = [0.25, 0.5, 0.75]
@@ -1477,17 +1775,99 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     logits = np.array(((45,), (-41,),), dtype=np.float32)
     labels = np.array(((1,), (1,),), dtype=np.float64)
     features = {'x': np.array(((42,),), dtype=np.float32)}
-    # loss = cross_entropy(labels, logits) = [0, 41].
-    expected_weighted_sum_loss = 41.
+    # unreduced_loss = cross_entropy(labels, logits) = [0, 41]
+    expected_unreduced_loss = [[0.], [41.]]
+    # weights default to 1.
+    expected_weights = 1.
+    # training loss = 1 * 0 + 1 * 41
+    expected_training_loss = 41.
+    # Create loss.
+    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
+        features=features,
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels)
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      self.assertAllClose(expected_training_loss, training_loss.eval())
+      self.assertAllClose(expected_unreduced_loss, unreduced_loss.eval())
+      self.assertAllClose(expected_weights, actual_weights)
+
+  def test_train_create_loss_loss_reduction(self):
+    """Tests create_loss with loss_reduction."""
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+
+    logits = np.array(((45,), (-41,),), dtype=np.float32)
+    labels = np.array(((1,), (1,),), dtype=np.float64)
+    features = {'x': np.array(((42,),), dtype=np.float32)}
+    # unreduced_loss = cross_entropy(labels, logits) = [0, 41]
+    expected_unreduced_loss = [[0.], [41.]]
+    # weights default to 1.
+    expected_weights = 1.
+    # training loss = (1 * 0 + 1 * 41) / num_nonzero_weights
+    expected_training_loss = 41. / 2.
     # Create loss.
-    weighted_sum_loss = head.create_loss(
+    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
+        labels=labels)
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      self.assertAllClose(expected_training_loss, training_loss.eval())
+      self.assertAllClose(expected_unreduced_loss, unreduced_loss.eval())
+      self.assertAllClose(expected_weights, actual_weights)
+
+  def test_eval_create_loss_loss_fn(self):
+    """Tests head.create_loss for eval mode and custom loss_fn."""
+    loss = np.array([[1.], [2.]], dtype=np.float32)
+    logits_input = np.array([[-10.], [10.]], dtype=np.float32)
+    labels_input = np.array([[1], [0]], dtype=np.int64)
+    def _loss_fn(labels, logits):
+      check_labels = control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(labels, labels_input)),
+          data=[labels])
+      check_logits = control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(logits, logits_input)),
+          data=[logits])
+      with ops.control_dependencies([check_labels, check_logits]):
+        return constant_op.constant(loss)
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_fn=_loss_fn)
+
+    actual_training_loss = head.create_loss(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits_input,
+        labels=labels_input)[0]
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      self.assertAllClose(np.sum(loss), actual_training_loss.eval())
+
+  def test_eval_create_loss_loss_fn_wrong_shape(self):
+    """Tests custom loss_fn that returns Tensor of unexpected shape."""
+    loss = np.array([1., 2.], dtype=np.float32)
+    def _loss_fn(labels, logits):
+      del labels, logits  # Unused
+      return constant_op.constant(loss)
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_fn=_loss_fn)
+
+    logits = np.array([[-10.], [10.]], dtype=np.float32)
+    labels = np.array([[1], [0]], dtype=np.int64)
+    actual_training_loss = head.create_loss(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
         labels=labels)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(expected_weighted_sum_loss, weighted_sum_loss.eval())
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[loss_fn must return Tensor of shape \[D0, D1, ... DN, 1\]\. \] '
+          r'\[logits_shape: \] \[2 1\] \[loss_shape: \] \[2\]'):
+        actual_training_loss.eval()
 
   def test_train_labels_none(self):
     """Tests that error is raised when labels is None."""
@@ -1586,6 +1966,49 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
           },
           summary_str)
 
+  def test_train_with_regularization_losses(self):
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+
+    logits = np.array(((45,), (-41,),), dtype=np.float32)
+    labels = np.array(((1,), (1,),), dtype=np.float64)
+    expected_train_result = b'my_train_op'
+    features = {'x': np.array(((42,),), dtype=np.float32)}
+    regularization_losses = [1.5, 0.5]
+    expected_regularization_loss = 2.
+    # unregularized_loss = sum(cross_entropy(labels, logits)) / batch_size
+    #                    = sum(0, 41) / 2 = 20.5
+    # loss = unregularized_loss + regularization_loss = 7.
+    expected_loss = 22.5
+    def _train_op_fn(loss):
+      with ops.control_dependencies((check_ops.assert_equal(
+          math_ops.to_float(expected_loss), math_ops.to_float(loss),
+          name='assert_loss'),)):
+        return constant_op.constant(expected_train_result)
+
+    # Create estimator spec.
+    spec = head.create_estimator_spec(
+        features=features,
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_train_op_fn,
+        regularization_losses=regularization_losses)
+
+    # Assert predictions, loss, train_op, and summaries.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNotNone(spec.scaffold.summary_op)
+      loss, train_result, summary_str = sess.run((spec.loss, spec.train_op,
+                                                  spec.scaffold.summary_op))
+      self.assertAllClose(expected_loss, loss)
+      self.assertEqual(expected_train_result, train_result)
+      _assert_simple_summaries(self, {
+          metric_keys.MetricKeys.LOSS: expected_loss,
+          metric_keys.MetricKeys.LOSS_REGULARIZATION: (
+              expected_regularization_loss),
+      }, summary_str)
+
   def test_float_labels_train_create_loss(self):
     head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
 
@@ -1598,9 +2021,9 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     #         -0.4 * log(sigmoid(-0.3)) -0.6 * log(sigmoid(0.3))]
     #      = [0.57407698418, 0.67435524446]
     # weighted sum loss = 0.57407698418 + 0.67435524446
-    expected_weighted_sum_loss = 1.24843222864
+    expected_training_loss = 1.24843222864
     # Create loss.
-    weighted_sum_loss = head.create_loss(
+    training_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
@@ -1608,10 +2031,7 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_weighted_sum_loss,
-          weighted_sum_loss.eval(),
-          rtol=1e-2,
-          atol=1e-2)
+          expected_training_loss, training_loss.eval(), rtol=1e-2, atol=1e-2)
 
   def test_float_labels_train(self):
     head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
@@ -1658,9 +2078,9 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     #         -0.4 * log(sigmoid(-0.3)) -0.6 * log(sigmoid(0.3))]
     #      = [0.57407698418, 0.67435524446]
     # weighted sum loss = 0.57407698418 + 0.67435524446
-    expected_weighted_sum_loss = 1.24843222864
+    expected_training_loss = 1.24843222864
     # Create loss.
-    weighted_sum_loss = head.create_loss(
+    training_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
@@ -1668,10 +2088,7 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_weighted_sum_loss,
-          weighted_sum_loss.eval(),
-          rtol=1e-2,
-          atol=1e-2)
+          expected_training_loss, training_loss.eval(), rtol=1e-2, atol=1e-2)
 
   def test_float_labels_eval(self):
     head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
@@ -1790,8 +2207,8 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
       self.assertAllClose(
           expected_metrics, {k: value_ops[k].eval() for k in value_ops})
 
-  def test_train_with_one_dim_labels_and_weights_create_loss(self):
-    """3 examples, 1 batch."""
+  def test_train_one_dim_create_loss(self):
+    """Tests create_loss with 1D labels and weights (shape [batch_size])."""
     head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
         weight_column='label_weights')
 
@@ -1803,13 +2220,14 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
         'x': np.array(((42.,), (43.,), (44.,)), dtype=np.float32),
         'label_weights': weights_rank_1,
     }
-    # losses = cross_entropy(labels, logits) = [0, 41, 44]
-    # weighted sum loss = 1 * 0 + .1 * 41 + 1.5 * 44
-    expected_weighted_sum_loss = 70.1
-    # example weight sum = 1 + 0.1 + 1.5
-    expected_example_weight_sum = 2.6
+    # unreduced_loss = cross_entropy(labels, logits) = [0, 41, 44]
+    expected_unreduced_loss = [[0.], [41.], [44.]]
+    # weights are reshaped to [3, 1] to match logits.
+    expected_weights = [[1.], [.1], [1.5]]
+    # training loss = 1 * 0 + .1 * 41 + 1.5 * 44
+    expected_training_loss = 70.1
     # Create loss.
-    weighted_sum_loss, example_weight_sum, _ = head.create_loss(
+    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
@@ -1817,18 +2235,15 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_weighted_sum_loss,
-          weighted_sum_loss.eval(),
-          rtol=1e-2,
-          atol=1e-2)
+          expected_training_loss, training_loss.eval(),
+          rtol=1e-2, atol=1e-2)
       self.assertAllClose(
-          expected_example_weight_sum,
-          example_weight_sum.eval(),
-          rtol=1e-2,
-          atol=1e-2)
+          expected_unreduced_loss, unreduced_loss.eval(),
+          rtol=1e-2, atol=1e-2)
+      self.assertAllClose(expected_weights, actual_weights.eval())
 
-  def test_train_with_one_dim_labels_and_weights(self):
-    """3 examples, 1 batch."""
+  def test_train_one_dim(self):
+    """Tests train with 1D labels and weights (shape [batch_size])."""
     head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
         weight_column='label_weights')
 
@@ -1933,12 +2348,14 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     logits = np.array([[[10], [-10]], [[12], [-12]]], dtype=np.float32)
     labels = np.array([[[0], [0]], [[1], [1]]], dtype=np.float64)
     weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
-    # loss = cross_entropy(labels, logits) = [[10, 0], [0, 12]].
-    # weighted_sum_loss = 1*10 + 1.5*0 + 2*0 + 2.5*12 = 40
-    expected_weighted_sum_loss = 40.
-    expected_example_weight_sum = np.sum(weights)
+    # unreduced_loss = cross_entropy(labels, logits) = [[10, 0], [0, 12]].
+    expected_unreduced_loss = [[[10.], [0.]], [[0.], [12.]]]
+    # Weights are reshaped to [2, 2, 1] to match logits.
+    expected_weights = [[[1.], [1.5]], [[2.], [2.5]]]
+    # training_loss = 1*10 + 1.5*0 + 2*0 + 2.5*12 = 40
+    expected_training_loss = 40.
     # Create loss.
-    weighted_sum_loss, example_weight_sum, _ = head.create_loss(
+    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
         features={'weights': weights},
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
@@ -1947,10 +2364,12 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_weighted_sum_loss, weighted_sum_loss.eval(),
+          expected_training_loss, training_loss.eval(),
           rtol=tol, atol=tol)
       self.assertAllClose(
-          expected_example_weight_sum, example_weight_sum.eval())
+          expected_unreduced_loss, unreduced_loss.eval(),
+          rtol=tol, atol=tol)
+      self.assertAllClose(expected_weights, actual_weights.eval())
 
   def test_multi_dim_weighted_train(self):
     """Logits and labels of shape [2, 2, 1], weights [2, 2]."""
@@ -2096,6 +2515,47 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, r'Invalid label_dimension'):
       head_lib._regression_head_with_mean_squared_error_loss(label_dimension=0)
 
+  def test_invalid_loss_reduction(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'Invalid loss_reduction: invalid_loss_reduction'):
+      head_lib._regression_head_with_mean_squared_error_loss(
+          loss_reduction='invalid_loss_reduction')
+    with self.assertRaisesRegexp(
+        ValueError, r'Invalid loss_reduction: none'):
+      head_lib._regression_head_with_mean_squared_error_loss(
+          loss_reduction=losses.Reduction.NONE)
+
+  def test_loss_fn_arg_labels_missing(self):
+    def _loss_fn(logits):
+      del logits  # Unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn must contain argument: labels\. '
+        r'Given arguments: \(\'logits\',\)'):
+      head_lib._regression_head_with_mean_squared_error_loss(loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_logits_missing(self):
+    def _loss_fn(labels):
+      del labels  # unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn must contain argument: logits\. '
+        r'Given arguments: \(\'labels\',\)'):
+      head_lib._regression_head_with_mean_squared_error_loss(loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_features_ok(self):
+    def _loss_fn(labels, logits, features):
+      del labels, logits, features  # Unused
+      head_lib._regression_head_with_mean_squared_error_loss(loss_fn=_loss_fn)
+
+  def test_loss_fn_arg_invalid(self):
+    def _loss_fn(labels, logits, name=None):
+      del labels, logits, name  # Unused
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'loss_fn has unexpected args: \[\'name\'\]'):
+      head_lib._regression_head_with_mean_squared_error_loss(loss_fn=_loss_fn)
+
   def test_invalid_logits(self):
     head = head_lib._regression_head_with_mean_squared_error_loss(
         label_dimension=3)
@@ -2154,7 +2614,7 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
             labels_placeholder: values_3d,
             logits_placeholder: values_1d
         })
-    weighted_sum_loss = head.create_loss(
+    training_loss = head.create_loss(
         features={'x': values_1d},
         mode=model_fn.ModeKeys.EVAL,
         logits=logits_placeholder,
@@ -2163,7 +2623,7 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
           r'\[expected_labels_shape: \] \[2 3\] \[labels_shape: \] \[2 1\]'):
-        weighted_sum_loss.eval({
+        training_loss.eval({
             labels_placeholder: values_1d,
             logits_placeholder: values_3d
         })
@@ -2206,7 +2666,7 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
             labels_placeholder: values_3d,
             logits_placeholder: values_1d
         })
-    weighted_sum_loss = head.create_loss(
+    training_loss = head.create_loss(
         features={'x': values_1d},
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits_placeholder,
@@ -2215,7 +2675,7 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
           r'\[expected_labels_shape: \] \[2 3\] \[labels_shape: \] \[2 1\]'):
-        weighted_sum_loss.eval({
+        training_loss.eval({
             labels_placeholder: values_1d,
             logits_placeholder: values_3d
         })
@@ -2261,7 +2721,7 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
     labels = np.array(((43,), (44,),), dtype=np.int32)
     features = {'x': np.array(((42,),), dtype=np.float32)}
     # Create loss.
-    weighted_sum_loss = head.create_loss(
+    training_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
@@ -2269,7 +2729,57 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       # loss = [(43-45)^2, (44-41)] = [4, 9]
-      self.assertAllClose(13., weighted_sum_loss.eval())
+      self.assertAllClose(13., training_loss.eval())
+
+  def test_eval_create_loss_loss_fn(self):
+    """Tests head.create_loss for eval mode and custom loss_fn."""
+    loss = np.array([[0., 1.], [2., 3.]], dtype=np.float32)
+    logits_input = np.array([[-1., 1.], [-2., 2.]], dtype=np.float32)
+    labels_input = np.array([[1., 0.], [2., -1.]], dtype=np.float32)
+    def _loss_fn(labels, logits):
+      check_labels = control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(labels, labels_input)),
+          data=[labels])
+      check_logits = control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(logits, logits_input)),
+          data=[logits])
+      with ops.control_dependencies([check_labels, check_logits]):
+        return constant_op.constant(loss)
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        label_dimension=2, loss_fn=_loss_fn)
+
+    actual_training_loss = head.create_loss(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits_input,
+        labels=labels_input)[0]
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      self.assertAllClose(np.sum(loss), actual_training_loss.eval())
+
+  def test_eval_create_loss_loss_fn_wrong_shape(self):
+    """Tests custom loss_fn that returns Tensor of unexpected shape."""
+    loss = np.array([[1.], [2.]], dtype=np.float32)
+    def _loss_fn(labels, logits):
+      del labels, logits  # Unused
+      return constant_op.constant(loss)
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        label_dimension=2, loss_fn=_loss_fn)
+
+    logits = np.array([[-1., 1.], [-2., 2.]], dtype=np.float32)
+    labels = np.array([[1., 0.], [2., -1.]], dtype=np.float32)
+    actual_training_loss = head.create_loss(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=labels)[0]
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[loss_fn must return Tensor of shape \[D0, D1, ... DN, 2\]\. \] '
+          r'\[logits_shape: \] \[2 2\] \[loss_shape: \] \[2 1\]'):
+        actual_training_loss.eval()
 
   def test_eval_labels_none(self):
     """Tests that error is raised when labels is None."""
@@ -2343,21 +2853,98 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
     ]
     self.assertItemsEqual(expected_metric_keys, spec.eval_metric_ops.keys())
 
+  def test_eval_with_regularization_losses(self):
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+    self.assertEqual(1, head.logits_dimension)
+
+    logits = np.array(((45,), (41,),), dtype=np.float32)
+    labels = np.array(((43,), (44,),), dtype=np.int32)
+    features = {'x': np.array(((42,),), dtype=np.float32)}
+    regularization_losses = [1.5, 0.5]
+    expected_regularization_loss = 2.
+    # unregularized_loss = ((43-45)^2 + (44-41)^2) / batch_size
+    #                    = (4 + 9) / 2 = 6.5
+    expected_unregularized_loss = 6.5
+    expected_regularized_loss = (
+        expected_unregularized_loss + expected_regularization_loss)
+    # Create estimator spec.
+    spec = head.create_estimator_spec(
+        features=features,
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=labels,
+        regularization_losses=regularization_losses)
+
+    keys = metric_keys.MetricKeys
+    expected_metrics = {
+        keys.LOSS_MEAN: expected_unregularized_loss,
+        keys.LOSS_REGULARIZATION: expected_regularization_loss,
+    }
+
+    # Assert predictions, loss, and metrics.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNone(spec.scaffold.summary_op)
+      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
+      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
+      prediction_key = prediction_keys.PredictionKeys.PREDICTIONS
+      predictions, loss, metrics = sess.run((
+          spec.predictions[prediction_key], spec.loss, update_ops))
+      self.assertAllClose(logits, predictions)
+      self.assertAllClose(expected_regularized_loss, loss)
+      # Check results of both update (in `metrics`) and value ops.
+      self.assertAllClose(expected_metrics, metrics)
+      self.assertAllClose(
+          expected_metrics, {k: value_ops[k].eval() for k in value_ops})
+
   def test_train_create_loss(self):
     head = head_lib._regression_head_with_mean_squared_error_loss()
     logits = np.array(((45,), (41,),), dtype=np.float32)
     labels = np.array(((43,), (44,),), dtype=np.int32)
     features = {'x': np.array(((42,),), dtype=np.float32)}
+    # unreduced_loss = [(43-45)^2, (44-41)] = [4, 9]
+    expected_unreduced_loss = [[4.], [9.]]
+    # weights default to 1.
+    expected_weights = 1
+    # training_loss = 1 * 4 + 1 * 9 = 13
+    expected_training_loss = 13.
     # Create loss.
-    weighted_sum_loss = head.create_loss(
+    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
-        labels=labels)[0]
+        labels=labels)
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
-      # loss = [(43-45)^2, (44-41)] = [4, 9]
-      self.assertAllClose(13., weighted_sum_loss.eval())
+      self.assertAllClose(expected_training_loss, training_loss.eval())
+      self.assertAllClose(expected_unreduced_loss, unreduced_loss.eval())
+      self.assertAllClose(expected_weights, actual_weights)
+
+  def test_train_create_loss_loss_reduction(self):
+    """Tests create_loss with loss_reduction."""
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        loss_reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+    logits = np.array(((45,), (41,),), dtype=np.float32)
+    labels = np.array(((43,), (44,),), dtype=np.int32)
+    features = {'x': np.array(((42,),), dtype=np.float32)}
+    # unreduced_loss = [(43-45)^2, (44-41)] = [4, 9]
+    expected_unreduced_loss = [[4.], [9.]]
+    # weights default to 1.
+    expected_weights = 1
+    # training_loss = (1 * 4 + 1 * 9) / num_nonzero_weights
+    expected_training_loss = 13. / 2.
+    # Create loss.
+    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
+        features=features,
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels)
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      self.assertAllClose(expected_training_loss, training_loss.eval())
+      self.assertAllClose(expected_unreduced_loss, unreduced_loss.eval())
+      self.assertAllClose(expected_weights, actual_weights)
 
   def test_train_labels_none(self):
     """Tests that error is raised when labels is None."""
@@ -2465,6 +3052,53 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
           },
           summary_str)
 
+  def test_train_with_regularization_losses(self):
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+    self.assertEqual(1, head.logits_dimension)
+
+    # Create estimator spec.
+    logits = np.array(((45,), (41,),), dtype=np.float32)
+    labels = np.array(((43.,), (44.,),), dtype=np.float64)
+    expected_train_result = b'my_train_op'
+    features = {'x': np.array(((42.,),), dtype=np.float32)}
+    regularization_losses = [1.5, 0.5]
+    expected_regularization_loss = 2.
+    # unregularized_loss = ((43-45)^2 + (44-41)^2) / batch_size
+    #                    = (4 + 9) / 2 = 6.5
+    # loss = unregularized_loss + regularization_loss = 8.5
+    expected_loss = 8.5
+    def _train_op_fn(loss):
+      with ops.control_dependencies((check_ops.assert_equal(
+          math_ops.to_float(expected_loss), math_ops.to_float(loss),
+          name='assert_loss'),)):
+        return constant_op.constant(expected_train_result)
+
+    spec = head.create_estimator_spec(
+        features=features,
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_train_op_fn,
+        regularization_losses=regularization_losses)
+
+    # Assert predictions, loss, train_op, and summaries.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNotNone(spec.scaffold.summary_op)
+      prediction_key = prediction_keys.PredictionKeys.PREDICTIONS
+      predictions, loss, train_result, summary_str = sess.run((
+          spec.predictions[prediction_key], spec.loss, spec.train_op,
+          spec.scaffold.summary_op))
+      self.assertAllClose(logits, predictions)
+      self.assertAllClose(expected_loss, loss)
+      self.assertEqual(expected_train_result, train_result)
+      _assert_simple_summaries(self, {
+          metric_keys.MetricKeys.LOSS: expected_loss,
+          metric_keys.MetricKeys.LOSS_REGULARIZATION: (
+              expected_regularization_loss),
+      }, summary_str)
+
   def test_weighted_multi_example_eval(self):
     """1d label, 3 examples, 1 batch."""
     head = head_lib._regression_head_with_mean_squared_error_loss(
@@ -2588,34 +3222,35 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
           metric_keys.MetricKeys.LOSS_MEAN: 39.0769231,
       }, summary_str)
 
-  def test_test_with_one_dim_label_and_weight_create_loss(self):
-    """1d label, 3 examples, 1 batch."""
+  def test_train_one_dim_create_loss(self):
+    """Tests create_loss with 1D labels and weights (shape [batch_size])."""
     head = head_lib._regression_head_with_mean_squared_error_loss(
         weight_column='label_weights')
     logits = np.array(((45,), (41,), (44,)), dtype=np.float32)
     x_feature_rank_1 = np.array((42., 43., 44.,), dtype=np.float32)
     weight_rank_1 = np.array((1., .1, 1.5,), dtype=np.float64)
     labels_rank_1 = np.array((35., 42., 45.,))
-    # loss = [(35-45)^2, (42-41)^2, (45-44)^2] = [100, 1, 1].
-    # weighted sum loss = 100 * 1 + 1 * .1 + 1.5 * 1 = 101.6
-    expected_unreduced_loss = 101.6
-    # example weight sum = 1 + 0.1 + 1.5
-    expected_example_weight_sum = 2.6
+    # unreduced_loss = [(35-45)^2, (42-41)^2, (45-44)^2] = [100, 1, 1].
+    expected_unreduced_loss = [[100.], [1.], [1.]]
+    # weights are reshaped to [3, 1] to match logits.
+    expected_weights = [[1.], [.1], [1.5]]
+    # training_loss = 100 * 1 + 1 * .1 + 1.5 * 1 = 101.6
+    expected_training_loss = 101.6
     features = {'x': x_feature_rank_1, 'label_weights': weight_rank_1}
     # Create loss.
-    weighted_sum_loss, example_weight_sum, _ = head.create_loss(
+    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
         labels=labels_rank_1)
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(expected_unreduced_loss, weighted_sum_loss.eval())
-      self.assertAllClose(expected_example_weight_sum,
-                          example_weight_sum.eval())
+      self.assertAllClose(expected_training_loss, training_loss.eval())
+      self.assertAllClose(expected_unreduced_loss, unreduced_loss.eval())
+      self.assertAllClose(expected_weights, actual_weights.eval())
 
-  def test_with_one_dim_label_and_weight(self):
-    """1d label, 3 examples, 1 batch."""
+  def test_train_one_dim(self):
+    """Tests train with 1D labels and weights (shape [batch_size])."""
     head = head_lib._regression_head_with_mean_squared_error_loss(
         weight_column='label_weights')
     self.assertEqual(1, head.logits_dimension)
@@ -2683,7 +3318,7 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
         'label_weights': np.array(((1., .1, 1.5),))
     }
     # Create loss.
-    weighted_sum_loss = head.create_loss(
+    training_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
@@ -2692,7 +3327,7 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
       _initialize_variables(self, monitored_session.Scaffold())
       # loss = [(35-45)^2, (42-41)^2, (45-44)^2] = [100, 1, 1].
       # weighted sum loss = 1 * 100 + .1 * 1 + 1.5 * 1 = 101.6
-      self.assertAllClose(101.6, weighted_sum_loss.eval())
+      self.assertAllClose(101.6, training_loss.eval())
 
   def test_weighted_multi_value_eval(self):
     """3d label, 1 example, 1 batch."""
@@ -2752,7 +3387,7 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
         'label_weights': np.array(((1., .1, 1.5),))
     }
     # Create loss.
-    weighted_sum_loss = head.create_loss(
+    training_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
@@ -2761,7 +3396,7 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
       _initialize_variables(self, monitored_session.Scaffold())
       # loss = [(35-45)^2, (42-41)^2, (45-44)^2] = [100, 1, 1].
       # weighted sum loss = 1 * 100 + .1 * 1 + 1.5 * 1 = 101.6
-      self.assertAllClose(101.6, weighted_sum_loss.eval())
+      self.assertAllClose(101.6, training_loss.eval())
 
   def test_weighted_multi_value_train(self):
     """3d label, 1 example, 1 batch."""
@@ -2943,24 +3578,26 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
     labels = np.array([[[01., 02., 03.], [12., 13., 14.]],
                        [[23., 24., 25.], [34., 35., 36.]]])
     weights = np.array([[1., 1.5], [2., 2.5]])
-    expected_weighted_sum_loss = np.sum(
+    expected_unreduced_loss = [[[1., 1., 1.], [4., 4., 4.]],
+                               [[9., 9., 9.], [16., 16., 16.]]]
+    expected_training_loss = np.sum(
         np.array([[[1. * x for x in [1., 1., 1.]],
                    [1.5 * x for x in [4., 4., 4.]]],
                   [[2. * x for x in [9., 9., 9.]],
                    [2.5 * x for x in [16., 16., 16.]]]]))
-    # Weights are expanded to [2, 2, label_dimension].
-    expected_example_weight_sum = np.sum(weights) * label_dimension
+    # Weights are expanded to [2, 2, 1] to match logits.
+    expected_weights = [[[1.], [1.5]], [[2.], [2.5]]]
     # Create loss.
-    weighted_sum_loss, example_weight_sum, _ = head.create_loss(
+    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
         features={'label_weights': weights},
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
         labels=labels)
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(expected_weighted_sum_loss, weighted_sum_loss.eval())
-      self.assertAllClose(
-          expected_example_weight_sum, example_weight_sum.eval())
+      self.assertAllClose(expected_training_loss, training_loss.eval())
+      self.assertAllClose(expected_unreduced_loss, unreduced_loss.eval())
+      self.assertAllClose(expected_weights, actual_weights.eval())
 
   def test_multi_dim_weighted_train(self):
     """Logits, labels of shape [2, 2, 3], weight shape [2, 2]."""
diff --git a/tensorflow/python/estimator/canned/linear.py b/tensorflow/python/estimator/canned/linear.py
index 8658ee38e99a5a6ba16560774302a1d6de8bc49e..a2f24ef27044680fe93b176b5207593165d0d109 100644
--- a/tensorflow/python/estimator/canned/linear.py
+++ b/tensorflow/python/estimator/canned/linear.py
@@ -26,10 +26,15 @@ from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator.canned import head as head_lib
 from tensorflow.python.estimator.canned import optimizers
 from tensorflow.python.feature_column import feature_column as feature_column_lib
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.summary import summary
 from tensorflow.python.training import ftrl
 from tensorflow.python.training import training_util
+from tensorflow.python.util.tf_export import tf_export
 
 
 # The default learning rate of 0.2 is a historical artifact of the initial
@@ -42,6 +47,26 @@ def _get_default_optimizer(feature_columns):
   return ftrl.FtrlOptimizer(learning_rate=learning_rate)
 
 
+def _compute_fraction_of_zero(cols_to_vars):
+  """Given a linear cols_to_vars dict, compute the fraction of zero weights.
+
+  Args:
+    cols_to_vars: A dictionary mapping FeatureColumns to lists of tf.Variables
+      like one returned from feature_column_lib.linear_model.
+
+  Returns:
+    The fraction of zeros (sparsity) in the linear model.
+  """
+  all_weight_vars = []
+  for var_or_var_list in cols_to_vars.values():
+    # Skip empty-lists associated with columns that created no Variables.
+    if var_or_var_list:
+      all_weight_vars += [
+          array_ops.reshape(var, [-1]) for var in var_or_var_list
+      ]
+  return nn.zero_fraction(array_ops.concat(all_weight_vars, axis=0))
+
+
 def _linear_logit_fn_builder(units, feature_columns):
   """Function builder for a linear logit_fn.
 
@@ -66,8 +91,22 @@ def _linear_logit_fn_builder(units, feature_columns):
     Returns:
       A `Tensor` representing the logits.
     """
-    return feature_column_lib.linear_model(
-        features=features, feature_columns=feature_columns, units=units)
+    cols_to_vars = {}
+    logits = feature_column_lib.linear_model(
+        features=features,
+        feature_columns=feature_columns,
+        units=units,
+        cols_to_vars=cols_to_vars)
+    bias = cols_to_vars.pop('bias')
+    if units > 1:
+      summary.histogram('bias', bias)
+    else:
+      # If units == 1, the bias value is a length-1 list of a scalar Tensor,
+      # so we should provide a scalar summary.
+      summary.scalar('bias', bias[0][0])
+    summary.scalar('fraction_of_zero_weights',
+                   _compute_fraction_of_zero(cols_to_vars))
+    return logits
 
   return linear_logit_fn
 
@@ -98,6 +137,7 @@ def _linear_model_fn(features, labels, mode, head, feature_columns, optimizer,
   if not isinstance(features, dict):
     raise ValueError('features should be a dictionary of `Tensor`s. '
                      'Given type: {}'.format(type(features)))
+
   optimizer = optimizers.get_optimizer_instance(
       optimizer or _get_default_optimizer(feature_columns),
       learning_rate=_LEARNING_RATE)
@@ -131,6 +171,7 @@ def _linear_model_fn(features, labels, mode, head, feature_columns, optimizer,
         logits=logits)
 
 
+@tf_export('estimator.LinearClassifier')
 class LinearClassifier(estimator.Estimator):
   """Linear classifier model.
 
@@ -159,6 +200,13 @@ class LinearClassifier(estimator.Estimator):
         l1_regularization_strength=0.001
       ))
 
+  # Or estimator with warm-starting from a previous checkpoint.
+  estimator = LinearClassifier(
+      feature_columns=[categorical_column_a,
+                       categorical_feature_a_x_categorical_feature_b],
+      warm_start_from="/path/to/checkpoint/dir")
+
+
   # Input builders
   def input_fn_train: # returns x, y (where y represents label's class index).
     ...
@@ -198,7 +246,9 @@ class LinearClassifier(estimator.Estimator):
                label_vocabulary=None,
                optimizer='Ftrl',
                config=None,
-               partitioner=None):
+               partitioner=None,
+               warm_start_from=None,
+               loss_reduction=losses.Reduction.SUM):
     """Construct a `LinearClassifier` estimator object.
 
     Args:
@@ -230,6 +280,13 @@ class LinearClassifier(estimator.Estimator):
         to FTRL optimizer.
       config: `RunConfig` object to configure the runtime settings.
       partitioner: Optional. Partitioner for input layer.
+      warm_start_from: A string filepath to a checkpoint to warm-start from, or
+        a `WarmStartSettings` object to fully configure warm-starting.  If the
+        string filepath is provided instead of a `WarmStartSettings`, then all
+        weights and biases are warm-started, and it is assumed that vocabularies
+        and Tensor names are unchanged.
+      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
+        to reduce training loss over batch. Defaults to `SUM`.
 
     Returns:
       A `LinearClassifier` estimator.
@@ -240,12 +297,16 @@ class LinearClassifier(estimator.Estimator):
     if n_classes == 2:
       head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint: disable=protected-access
           weight_column=weight_column,
-          label_vocabulary=label_vocabulary)
+          label_vocabulary=label_vocabulary,
+          loss_reduction=loss_reduction)
     else:
       head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
           n_classes, weight_column=weight_column,
-          label_vocabulary=label_vocabulary)
+          label_vocabulary=label_vocabulary,
+          loss_reduction=loss_reduction)
+
     def _model_fn(features, labels, mode, config):
+      """Call the defined shared _linear_model_fn."""
       return _linear_model_fn(
           features=features,
           labels=labels,
@@ -255,12 +316,15 @@ class LinearClassifier(estimator.Estimator):
           optimizer=optimizer,
           partitioner=partitioner,
           config=config)
+
     super(LinearClassifier, self).__init__(
         model_fn=_model_fn,
         model_dir=model_dir,
-        config=config)
+        config=config,
+        warm_start_from=warm_start_from)
 
 
+@tf_export('estimator.LinearRegressor')
 class LinearRegressor(estimator.Estimator):
   """An estimator for TensorFlow Linear regression problems.
 
@@ -279,6 +343,13 @@ class LinearRegressor(estimator.Estimator):
       feature_columns=[categorical_column_a,
                        categorical_feature_a_x_categorical_feature_b])
 
+  # Or estimator with warm-starting from a previous checkpoint.
+  estimator = LinearRegressor(
+      feature_columns=[categorical_column_a,
+                       categorical_feature_a_x_categorical_feature_b],
+      warm_start_from="/path/to/checkpoint/dir")
+
+
   # Input builders
   def input_fn_train: # returns x, y
     ...
@@ -317,7 +388,9 @@ class LinearRegressor(estimator.Estimator):
                weight_column=None,
                optimizer='Ftrl',
                config=None,
-               partitioner=None):
+               partitioner=None,
+               warm_start_from=None,
+               loss_reduction=losses.Reduction.SUM):
     """Initializes a `LinearRegressor` instance.
 
     Args:
@@ -341,10 +414,20 @@ class LinearRegressor(estimator.Estimator):
         to FTRL optimizer.
       config: `RunConfig` object to configure the runtime settings.
       partitioner: Optional. Partitioner for input layer.
+      warm_start_from: A string filepath to a checkpoint to warm-start from, or
+        a `WarmStartSettings` object to fully configure warm-starting.  If the
+        string filepath is provided instead of a `WarmStartSettings`, then all
+        weights and biases are warm-started, and it is assumed that vocabularies
+        and Tensor names are unchanged.
+      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
+        to reduce training loss over batch. Defaults to `SUM`.
     """
     head = head_lib._regression_head_with_mean_squared_error_loss(  # pylint: disable=protected-access
-        label_dimension=label_dimension, weight_column=weight_column)
+        label_dimension=label_dimension, weight_column=weight_column,
+        loss_reduction=loss_reduction)
+
     def _model_fn(features, labels, mode, config):
+      """Call the defined shared _linear_model_fn."""
       return _linear_model_fn(
           features=features,
           labels=labels,
@@ -354,7 +437,9 @@ class LinearRegressor(estimator.Estimator):
           optimizer=optimizer,
           partitioner=partitioner,
           config=config)
+
     super(LinearRegressor, self).__init__(
         model_fn=_model_fn,
         model_dir=model_dir,
-        config=config)
+        config=config,
+        warm_start_from=warm_start_from)
diff --git a/tensorflow/python/estimator/canned/linear_test.py b/tensorflow/python/estimator/canned/linear_test.py
index 907ab4801f451985c67da7ddd9d945775976bd07..59a230417d1692664ac3555cbf40cfa039c95be9 100644
--- a/tensorflow/python/estimator/canned/linear_test.py
+++ b/tensorflow/python/estimator/canned/linear_test.py
@@ -119,8 +119,6 @@ class LinearClassifierIntegrationTest(
 
 
 # Tests for Linear logit_fn.
-
-
 class LinearLogitFnTest(linear_testing_utils.BaseLinearLogitFnTest,
                         test.TestCase):
 
@@ -129,5 +127,15 @@ class LinearLogitFnTest(linear_testing_utils.BaseLinearLogitFnTest,
     linear_testing_utils.BaseLinearLogitFnTest.__init__(self)
 
 
+# Tests for warm-starting with Linear logit_fn.
+class LinearWarmStartingTest(linear_testing_utils.BaseLinearWarmStartingTest,
+                             test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearWarmStartingTest.__init__(
+        self, _linear_classifier_fn, _linear_regressor_fn)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/estimator/canned/linear_testing_utils.py b/tensorflow/python/estimator/canned/linear_testing_utils.py
index 138b75a9d6b03bf29b94866a1024b3fb7ae7f075..3e9183cf1b633757074377472e9b4cac953e04a1 100644
--- a/tensorflow/python/estimator/canned/linear_testing_utils.py
+++ b/tensorflow/python/estimator/canned/linear_testing_utils.py
@@ -31,6 +31,7 @@ from tensorflow.core.example import feature_pb2
 from tensorflow.python.client import session as tf_session
 from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator import run_config
+from tensorflow.python.estimator import warm_starting_util
 from tensorflow.python.estimator.canned import linear
 from tensorflow.python.estimator.canned import metric_keys
 from tensorflow.python.estimator.export import export
@@ -43,17 +44,20 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
+from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import input as input_lib
-from tensorflow.python.training import optimizer
+from tensorflow.python.training import optimizer as optimizer_lib
 from tensorflow.python.training import queue_runner
 from tensorflow.python.training import saver
 from tensorflow.python.training import session_run_hook
@@ -74,6 +78,7 @@ except ImportError:
 # Names of variables created by model.
 AGE_WEIGHT_NAME = 'linear/linear_model/age/weights'
 HEIGHT_WEIGHT_NAME = 'linear/linear_model/height/weights'
+OCCUPATION_WEIGHT_NAME = 'linear/linear_model/occupation/weights'
 BIAS_NAME = 'linear/linear_model/bias_weights'
 LANGUAGE_WEIGHT_NAME = 'linear/linear_model/language/weights'
 
@@ -94,7 +99,7 @@ def assert_close(expected, actual, rtol=1e-04, name='assert_close'):
 
 
 def save_variables_to_ckpt(model_dir):
-  init_all_op = [variables.global_variables_initializer()]
+  init_all_op = [variables_lib.global_variables_initializer()]
   with tf_session.Session() as sess:
     sess.run(init_all_op)
     saver.Saver().save(sess, os.path.join(model_dir, 'model.ckpt'))
@@ -139,7 +144,7 @@ class CheckPartitionerVarHook(session_run_hook.SessionRunHook):
       partitioned_weight = variable_scope.get_variable(
           self._var_name, shape=(self._var_dim, 1))
       self._test_case.assertTrue(
-          isinstance(partitioned_weight, variables.PartitionedVariable))
+          isinstance(partitioned_weight, variables_lib.PartitionedVariable))
       for part in partitioned_weight:
         self._test_case.assertEqual(self._var_dim // self._partitions,
                                     part.get_shape()[0])
@@ -240,9 +245,9 @@ class BaseLinearRegressorEvaluationTest(object):
 
   def test_evaluation_for_simple_data(self):
     with ops.Graph().as_default():
-      variables.Variable([[11.0]], name=AGE_WEIGHT_NAME)
-      variables.Variable([2.0], name=BIAS_NAME)
-      variables.Variable(
+      variables_lib.Variable([[11.0]], name=AGE_WEIGHT_NAME)
+      variables_lib.Variable([2.0], name=BIAS_NAME)
+      variables_lib.Variable(
           100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
       save_variables_to_ckpt(self._model_dir)
 
@@ -262,9 +267,9 @@ class BaseLinearRegressorEvaluationTest(object):
   def test_evaluation_batch(self):
     """Tests evaluation for batch_size==2."""
     with ops.Graph().as_default():
-      variables.Variable([[11.0]], name=AGE_WEIGHT_NAME)
-      variables.Variable([2.0], name=BIAS_NAME)
-      variables.Variable(
+      variables_lib.Variable([[11.0]], name=AGE_WEIGHT_NAME)
+      variables_lib.Variable([2.0], name=BIAS_NAME)
+      variables_lib.Variable(
           100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
       save_variables_to_ckpt(self._model_dir)
 
@@ -287,9 +292,9 @@ class BaseLinearRegressorEvaluationTest(object):
   def test_evaluation_weights(self):
     """Tests evaluation with weights."""
     with ops.Graph().as_default():
-      variables.Variable([[11.0]], name=AGE_WEIGHT_NAME)
-      variables.Variable([2.0], name=BIAS_NAME)
-      variables.Variable(
+      variables_lib.Variable([[11.0]], name=AGE_WEIGHT_NAME)
+      variables_lib.Variable([2.0], name=BIAS_NAME)
+      variables_lib.Variable(
           100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
       save_variables_to_ckpt(self._model_dir)
 
@@ -318,10 +323,10 @@ class BaseLinearRegressorEvaluationTest(object):
     x_dim = 3
     label_dim = 2
     with ops.Graph().as_default():
-      variables.Variable(
+      variables_lib.Variable(
           [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], name=AGE_WEIGHT_NAME)
-      variables.Variable([7.0, 8.0], name=BIAS_NAME)
-      variables.Variable(100, name='global_step', dtype=dtypes.int64)
+      variables_lib.Variable([7.0, 8.0], name=BIAS_NAME)
+      variables_lib.Variable(100, name='global_step', dtype=dtypes.int64)
       save_variables_to_ckpt(self._model_dir)
 
     linear_regressor = self._linear_regressor_fn(
@@ -352,10 +357,10 @@ class BaseLinearRegressorEvaluationTest(object):
 
   def test_evaluation_for_multiple_feature_columns(self):
     with ops.Graph().as_default():
-      variables.Variable([[10.0]], name=AGE_WEIGHT_NAME)
-      variables.Variable([[2.0]], name=HEIGHT_WEIGHT_NAME)
-      variables.Variable([5.0], name=BIAS_NAME)
-      variables.Variable(
+      variables_lib.Variable([[10.0]], name=AGE_WEIGHT_NAME)
+      variables_lib.Variable([[2.0]], name=HEIGHT_WEIGHT_NAME)
+      variables_lib.Variable([5.0], name=BIAS_NAME)
+      variables_lib.Variable(
           100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
       save_variables_to_ckpt(self._model_dir)
 
@@ -401,9 +406,9 @@ class BaseLinearRegressorPredictTest(object):
   def test_1d(self):
     """Tests predict when all variables are one-dimensional."""
     with ops.Graph().as_default():
-      variables.Variable([[10.]], name='linear/linear_model/x/weights')
-      variables.Variable([.2], name=BIAS_NAME)
-      variables.Variable(100, name='global_step', dtype=dtypes.int64)
+      variables_lib.Variable([[10.]], name='linear/linear_model/x/weights')
+      variables_lib.Variable([.2], name=BIAS_NAME)
+      variables_lib.Variable(100, name='global_step', dtype=dtypes.int64)
       save_variables_to_ckpt(self._model_dir)
 
     linear_regressor = self._linear_regressor_fn(
@@ -428,12 +433,12 @@ class BaseLinearRegressorPredictTest(object):
     x_dim = 4
     feature_columns = (feature_column_lib.numeric_column('x', shape=(x_dim,)),)
     with ops.Graph().as_default():
-      variables.Variable(  # shape=[x_dim, label_dimension]
+      variables_lib.Variable(  # shape=[x_dim, label_dimension]
           [[1., 2., 3.], [2., 3., 4.], [3., 4., 5.], [4., 5., 6.]],
           name='linear/linear_model/x/weights')
-      variables.Variable(  # shape=[label_dimension]
+      variables_lib.Variable(  # shape=[label_dimension]
           [.2, .4, .6], name=BIAS_NAME)
-      variables.Variable(100, name='global_step', dtype=dtypes.int64)
+      variables_lib.Variable(100, name='global_step', dtype=dtypes.int64)
       save_variables_to_ckpt(self._model_dir)
 
     linear_regressor = self._linear_regressor_fn(
@@ -457,10 +462,10 @@ class BaseLinearRegressorPredictTest(object):
   def testTwoFeatureColumns(self):
     """Tests predict with two feature columns."""
     with ops.Graph().as_default():
-      variables.Variable([[10.]], name='linear/linear_model/x0/weights')
-      variables.Variable([[20.]], name='linear/linear_model/x1/weights')
-      variables.Variable([.2], name=BIAS_NAME)
-      variables.Variable(100, name='global_step', dtype=dtypes.int64)
+      variables_lib.Variable([[10.]], name='linear/linear_model/x0/weights')
+      variables_lib.Variable([[20.]], name='linear/linear_model/x1/weights')
+      variables_lib.Variable([.2], name=BIAS_NAME)
+      variables_lib.Variable(100, name='global_step', dtype=dtypes.int64)
       save_variables_to_ckpt(self._model_dir)
 
     linear_regressor = self._linear_regressor_fn(
@@ -690,8 +695,8 @@ class BaseLinearRegressorTrainingTest(object):
         return control_flow_ops.no_op()
 
     mock_optimizer = test.mock.NonCallableMock(
-        spec=optimizer.Optimizer,
-        wraps=optimizer.Optimizer(use_locking=False, name='my_optimizer'))
+        spec=optimizer_lib.Optimizer,
+        wraps=optimizer_lib.Optimizer(use_locking=False, name='my_optimizer'))
     mock_optimizer.minimize = test.mock.MagicMock(wraps=_minimize)
 
     # NOTE: Estimator.params performs a deepcopy, which wreaks havoc with mocks.
@@ -810,9 +815,9 @@ class BaseLinearRegressorTrainingTest(object):
     bias = 5.0
     initial_global_step = 100
     with ops.Graph().as_default():
-      variables.Variable([[age_weight]], name=AGE_WEIGHT_NAME)
-      variables.Variable([bias], name=BIAS_NAME)
-      variables.Variable(
+      variables_lib.Variable([[age_weight]], name=AGE_WEIGHT_NAME)
+      variables_lib.Variable([bias], name=BIAS_NAME)
+      variables_lib.Variable(
           initial_global_step,
           name=ops.GraphKeys.GLOBAL_STEP,
           dtype=dtypes.int64)
@@ -843,9 +848,9 @@ class BaseLinearRegressorTrainingTest(object):
     bias = 5.0
     initial_global_step = 100
     with ops.Graph().as_default():
-      variables.Variable([[age_weight]], name=AGE_WEIGHT_NAME)
-      variables.Variable([bias], name=BIAS_NAME)
-      variables.Variable(
+      variables_lib.Variable([[age_weight]], name=AGE_WEIGHT_NAME)
+      variables_lib.Variable([bias], name=BIAS_NAME)
+      variables_lib.Variable(
           initial_global_step,
           name=ops.GraphKeys.GLOBAL_STEP,
           dtype=dtypes.int64)
@@ -910,8 +915,8 @@ class BaseLinearClassifierTrainingTest(object):
         return state_ops.assign_add(global_step, 1).op
 
     mock_optimizer = test.mock.NonCallableMock(
-        spec=optimizer.Optimizer,
-        wraps=optimizer.Optimizer(use_locking=False, name='my_optimizer'))
+        spec=optimizer_lib.Optimizer,
+        wraps=optimizer_lib.Optimizer(use_locking=False, name='my_optimizer'))
     mock_optimizer.minimize = test.mock.MagicMock(wraps=_minimize)
 
     # NOTE: Estimator.params performs a deepcopy, which wreaks havoc with mocks.
@@ -1124,10 +1129,11 @@ class BaseLinearClassifierTrainingTest(object):
     bias = [-35.0] if n_classes == 2 else [-35.0] * n_classes
     initial_global_step = 100
     with ops.Graph().as_default():
-      variables.Variable(age_weight, name=AGE_WEIGHT_NAME)
-      variables.Variable(bias, name=BIAS_NAME)
-      variables.Variable(
-          initial_global_step, name=ops.GraphKeys.GLOBAL_STEP,
+      variables_lib.Variable(age_weight, name=AGE_WEIGHT_NAME)
+      variables_lib.Variable(bias, name=BIAS_NAME)
+      variables_lib.Variable(
+          initial_global_step,
+          name=ops.GraphKeys.GLOBAL_STEP,
           dtype=dtypes.int64)
       save_variables_to_ckpt(self._model_dir)
 
@@ -1184,10 +1190,11 @@ class BaseLinearClassifierTrainingTest(object):
     bias = [-35.0]
     initial_global_step = 100
     with ops.Graph().as_default():
-      variables.Variable(age_weight, name=AGE_WEIGHT_NAME)
-      variables.Variable(bias, name=BIAS_NAME)
-      variables.Variable(
-          initial_global_step, name=ops.GraphKeys.GLOBAL_STEP,
+      variables_lib.Variable(age_weight, name=AGE_WEIGHT_NAME)
+      variables_lib.Variable(bias, name=BIAS_NAME)
+      variables_lib.Variable(
+          initial_global_step,
+          name=ops.GraphKeys.GLOBAL_STEP,
           dtype=dtypes.int64)
       save_variables_to_ckpt(self._model_dir)
 
@@ -1228,10 +1235,11 @@ class BaseLinearClassifierTrainingTest(object):
     bias = [-35.0] if n_classes == 2 else [-35.0] * n_classes
     initial_global_step = 100
     with ops.Graph().as_default():
-      variables.Variable(age_weight, name=AGE_WEIGHT_NAME)
-      variables.Variable(bias, name=BIAS_NAME)
-      variables.Variable(
-          initial_global_step, name=ops.GraphKeys.GLOBAL_STEP,
+      variables_lib.Variable(age_weight, name=AGE_WEIGHT_NAME)
+      variables_lib.Variable(bias, name=BIAS_NAME)
+      variables_lib.Variable(
+          initial_global_step,
+          name=ops.GraphKeys.GLOBAL_STEP,
           dtype=dtypes.int64)
       save_variables_to_ckpt(self._model_dir)
 
@@ -1310,9 +1318,9 @@ class BaseLinearClassifierEvaluationTest(object):
     bias = [-30.0] if n_classes == 2 else [-30.0] * n_classes
 
     with ops.Graph().as_default():
-      variables.Variable(age_weight, name=AGE_WEIGHT_NAME)
-      variables.Variable(bias, name=BIAS_NAME)
-      variables.Variable(
+      variables_lib.Variable(age_weight, name=AGE_WEIGHT_NAME)
+      variables_lib.Variable(bias, name=BIAS_NAME)
+      variables_lib.Variable(
           100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
       save_variables_to_ckpt(self._model_dir)
 
@@ -1372,10 +1380,11 @@ class BaseLinearClassifierEvaluationTest(object):
     bias = [-35.0] if n_classes == 2 else [-35.0] * n_classes
     initial_global_step = 100
     with ops.Graph().as_default():
-      variables.Variable(age_weight, name=AGE_WEIGHT_NAME)
-      variables.Variable(bias, name=BIAS_NAME)
-      variables.Variable(
-          initial_global_step, name=ops.GraphKeys.GLOBAL_STEP,
+      variables_lib.Variable(age_weight, name=AGE_WEIGHT_NAME)
+      variables_lib.Variable(bias, name=BIAS_NAME)
+      variables_lib.Variable(
+          initial_global_step,
+          name=ops.GraphKeys.GLOBAL_STEP,
           dtype=dtypes.int64)
       save_variables_to_ckpt(self._model_dir)
 
@@ -1445,10 +1454,11 @@ class BaseLinearClassifierEvaluationTest(object):
     bias = [-35.0] if n_classes == 2 else [-35.0] * n_classes
     initial_global_step = 100
     with ops.Graph().as_default():
-      variables.Variable(age_weight, name=AGE_WEIGHT_NAME)
-      variables.Variable(bias, name=BIAS_NAME)
-      variables.Variable(
-          initial_global_step, name=ops.GraphKeys.GLOBAL_STEP,
+      variables_lib.Variable(age_weight, name=AGE_WEIGHT_NAME)
+      variables_lib.Variable(bias, name=BIAS_NAME)
+      variables_lib.Variable(
+          initial_global_step,
+          name=ops.GraphKeys.GLOBAL_STEP,
           dtype=dtypes.int64)
       save_variables_to_ckpt(self._model_dir)
 
@@ -1539,9 +1549,9 @@ class BaseLinearClassifierPredictTest(object):
     bias = [10.0] if n_classes == 2 else [10.0] * n_classes
 
     with ops.Graph().as_default():
-      variables.Variable(age_weight, name=AGE_WEIGHT_NAME)
-      variables.Variable(bias, name=BIAS_NAME)
-      variables.Variable(100, name='global_step', dtype=dtypes.int64)
+      variables_lib.Variable(age_weight, name=AGE_WEIGHT_NAME)
+      variables_lib.Variable(bias, name=BIAS_NAME)
+      variables_lib.Variable(100, name='global_step', dtype=dtypes.int64)
       save_variables_to_ckpt(self._model_dir)
 
     est = self._linear_classifier_fn(
@@ -1815,12 +1825,12 @@ class BaseLinearLogitFnTest(object):
     with ops.Graph().as_default():
       logit_fn = linear._linear_logit_fn_builder(units=2, feature_columns=[age])
       logits = logit_fn(features={'age': [[23.], [31.]]})
-      with variable_scope.variable_scope('linear_model', reuse=True):
-        bias_var = variable_scope.get_variable('bias_weights')
+      bias_var = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
+                                    'linear_model/bias_weights')[0]
       age_var = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
                                    'linear_model/age')[0]
       with tf_session.Session() as sess:
-        sess.run([variables.global_variables_initializer()])
+        sess.run([variables_lib.global_variables_initializer()])
         self.assertAllClose([[0., 0.], [0., 0.]], logits.eval())
         sess.run(bias_var.assign([10., 5.]))
         self.assertAllClose([[10., 5.], [10., 5.]], logits.eval())
@@ -1828,3 +1838,262 @@ class BaseLinearLogitFnTest(object):
         # [2 * 23 + 10, 3 * 23 + 5] = [56, 74].
         # [2 * 31 + 10, 3 * 31 + 5] = [72, 98]
         self.assertAllClose([[56., 74.], [72., 98.]], logits.eval())
+
+  def test_compute_fraction_of_zero(self):
+    """Tests the calculation of sparsity."""
+    age = feature_column_lib.numeric_column('age')
+    occupation = feature_column_lib.categorical_column_with_hash_bucket(
+        'occupation', hash_bucket_size=5)
+    with ops.Graph().as_default():
+      cols_to_vars = {}
+      feature_column_lib.linear_model(
+          features={
+              'age': [[23.], [31.]],
+              'occupation': [['doctor'], ['engineer']]
+          },
+          feature_columns=[age, occupation],
+          units=3,
+          cols_to_vars=cols_to_vars)
+      cols_to_vars.pop('bias')
+      fraction_zero = linear._compute_fraction_of_zero(cols_to_vars)
+      age_var = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
+                                   'linear_model/age')[0]
+      with tf_session.Session() as sess:
+        sess.run([variables_lib.global_variables_initializer()])
+        # Upon initialization, all variables will be zero.
+        self.assertAllClose(1, fraction_zero.eval())
+
+        sess.run(age_var.assign([[2.0, 0.0, -1.0]]))
+        # 1 of the 3 age weights are zero, and all of the 15 (5 hash buckets
+        # x 3-dim output) are zero.
+        self.assertAllClose(16. / 18., fraction_zero.eval())
+
+
+class BaseLinearWarmStartingTest(object):
+
+  def __init__(self, _linear_classifier_fn, _linear_regressor_fn):
+    self._linear_classifier_fn = _linear_classifier_fn
+    self._linear_regressor_fn = _linear_regressor_fn
+
+  def setUp(self):
+    # Create a directory to save our old checkpoint and vocabularies to.
+    self._ckpt_and_vocab_dir = tempfile.mkdtemp()
+
+    # Make a dummy input_fn.
+    def _input_fn():
+      features = {
+          'age': [[23.], [31.]],
+          'age_in_years': [[23.], [31.]],
+          'occupation': [['doctor'], ['consultant']]
+      }
+      return features, [0, 1]
+
+    self._input_fn = _input_fn
+
+  def tearDown(self):
+    # Clean up checkpoint / vocab dir.
+    writer_cache.FileWriterCache.clear()
+    shutil.rmtree(self._ckpt_and_vocab_dir)
+
+  def test_classifier_basic_warm_starting(self):
+    """Tests correctness of LinearClassifier default warm-start."""
+    age = feature_column_lib.numeric_column('age')
+
+    # Create a LinearClassifier and train to save a checkpoint.
+    linear_classifier = self._linear_classifier_fn(
+        feature_columns=[age],
+        model_dir=self._ckpt_and_vocab_dir,
+        n_classes=4,
+        optimizer='SGD')
+    linear_classifier.train(input_fn=self._input_fn, max_steps=1)
+
+    # Create a second LinearClassifier, warm-started from the first.  Use a
+    # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
+    # accumulator values that change).
+    warm_started_linear_classifier = self._linear_classifier_fn(
+        feature_columns=[age],
+        n_classes=4,
+        optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
+        warm_start_from=linear_classifier.model_dir)
+
+    warm_started_linear_classifier.train(input_fn=self._input_fn, max_steps=1)
+    for variable_name in warm_started_linear_classifier.get_variable_names():
+      self.assertAllClose(
+          linear_classifier.get_variable_value(variable_name),
+          warm_started_linear_classifier.get_variable_value(variable_name))
+
+  def test_regressor_basic_warm_starting(self):
+    """Tests correctness of LinearRegressor default warm-start."""
+    age = feature_column_lib.numeric_column('age')
+
+    # Create a LinearRegressor and train to save a checkpoint.
+    linear_regressor = self._linear_regressor_fn(
+        feature_columns=[age],
+        model_dir=self._ckpt_and_vocab_dir,
+        optimizer='SGD')
+    linear_regressor.train(input_fn=self._input_fn, max_steps=1)
+
+    # Create a second LinearRegressor, warm-started from the first.  Use a
+    # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
+    # accumulator values that change).
+    warm_started_linear_regressor = self._linear_regressor_fn(
+        feature_columns=[age],
+        optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
+        warm_start_from=linear_regressor.model_dir)
+
+    warm_started_linear_regressor.train(input_fn=self._input_fn, max_steps=1)
+    for variable_name in warm_started_linear_regressor.get_variable_names():
+      self.assertAllClose(
+          linear_regressor.get_variable_value(variable_name),
+          warm_started_linear_regressor.get_variable_value(variable_name))
+
+  def test_warm_starting_selective_variables(self):
+    """Tests selecting variables to warm-start."""
+    age = feature_column_lib.numeric_column('age')
+
+    # Create a LinearClassifier and train to save a checkpoint.
+    linear_classifier = self._linear_classifier_fn(
+        feature_columns=[age],
+        model_dir=self._ckpt_and_vocab_dir,
+        n_classes=4,
+        optimizer='SGD')
+    linear_classifier.train(input_fn=self._input_fn, max_steps=1)
+
+    # Create a second LinearClassifier, warm-started from the first.  Use a
+    # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
+    # accumulator values that change).
+    warm_started_linear_classifier = self._linear_classifier_fn(
+        feature_columns=[age],
+        n_classes=4,
+        optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
+        # The provided regular expression will only warm-start the age variable
+        # and not the bias.
+        warm_start_from=warm_starting_util.WarmStartSettings(
+            ckpt_to_initialize_from=linear_classifier.model_dir,
+            vars_to_warm_start='.*(age).*'))
+
+    warm_started_linear_classifier.train(input_fn=self._input_fn, max_steps=1)
+    self.assertAllClose(
+        linear_classifier.get_variable_value(AGE_WEIGHT_NAME),
+        warm_started_linear_classifier.get_variable_value(AGE_WEIGHT_NAME))
+    # Bias should still be zero from initialization.
+    self.assertAllClose(
+        [0.0] * 4, warm_started_linear_classifier.get_variable_value(BIAS_NAME))
+
+  def test_warm_starting_with_vocab_remapping_and_partitioning(self):
+    """Tests warm-starting with vocab remapping and partitioning."""
+    vocab_list = ['doctor', 'lawyer', 'consultant']
+    vocab_file = os.path.join(self._ckpt_and_vocab_dir, 'occupation_vocab')
+    with open(vocab_file, 'w') as f:
+      f.write('\n'.join(vocab_list))
+    occupation = feature_column_lib.categorical_column_with_vocabulary_file(
+        'occupation',
+        vocabulary_file=vocab_file,
+        vocabulary_size=len(vocab_list))
+
+    # Create a LinearClassifier and train to save a checkpoint.
+    partitioner = partitioned_variables.fixed_size_partitioner(num_shards=2)
+    linear_classifier = self._linear_classifier_fn(
+        feature_columns=[occupation],
+        model_dir=self._ckpt_and_vocab_dir,
+        n_classes=4,
+        optimizer='SGD',
+        partitioner=partitioner)
+    linear_classifier.train(input_fn=self._input_fn, max_steps=1)
+
+    # Create a second LinearClassifier, warm-started from the first.  Use a
+    # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
+    # accumulator values that change).  Use a new FeatureColumn with a
+    # different vocabulary for occupation.
+    new_vocab_list = ['doctor', 'consultant', 'engineer']
+    new_vocab_file = os.path.join(self._ckpt_and_vocab_dir,
+                                  'new_occupation_vocab')
+    with open(new_vocab_file, 'w') as f:
+      f.write('\n'.join(new_vocab_list))
+    new_occupation = feature_column_lib.categorical_column_with_vocabulary_file(
+        'occupation',
+        vocabulary_file=new_vocab_file,
+        vocabulary_size=len(new_vocab_list))
+    # We can create our VocabInfo object from the new and old occupation
+    # FeatureColumn's.
+    occupation_vocab_info = warm_starting_util.VocabInfo(
+        new_vocab=new_occupation.vocabulary_file,
+        new_vocab_size=new_occupation.vocabulary_size,
+        num_oov_buckets=new_occupation.num_oov_buckets,
+        old_vocab=occupation.vocabulary_file,
+        old_vocab_size=occupation.vocabulary_size,
+        # Can't use constant_initializer with load_and_remap.  In practice,
+        # use a truncated normal initializer.
+        backup_initializer=init_ops.random_uniform_initializer(
+            minval=0.39, maxval=0.39))
+    warm_started_linear_classifier = self._linear_classifier_fn(
+        feature_columns=[occupation],
+        n_classes=4,
+        optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
+        warm_start_from=warm_starting_util.WarmStartSettings(
+            ckpt_to_initialize_from=linear_classifier.model_dir,
+            var_name_to_vocab_info={
+                OCCUPATION_WEIGHT_NAME: occupation_vocab_info
+            },
+            # Explicitly providing None here will only warm-start variables
+            # referenced in var_name_to_vocab_info (the bias will not be
+            # warm-started).
+            vars_to_warm_start=None),
+        partitioner=partitioner)
+
+    warm_started_linear_classifier.train(input_fn=self._input_fn, max_steps=1)
+    # 'doctor' was ID-0 and still ID-0.
+    self.assertAllClose(
+        linear_classifier.get_variable_value(OCCUPATION_WEIGHT_NAME)[0, :],
+        warm_started_linear_classifier.get_variable_value(
+            OCCUPATION_WEIGHT_NAME)[0, :])
+    # 'consultant' was ID-2 and now ID-1.
+    self.assertAllClose(
+        linear_classifier.get_variable_value(OCCUPATION_WEIGHT_NAME)[2, :],
+        warm_started_linear_classifier.get_variable_value(
+            OCCUPATION_WEIGHT_NAME)[1, :])
+    # 'engineer' is a new entry and should be initialized with the
+    # backup_initializer in VocabInfo.
+    self.assertAllClose([0.39] * 4,
+                        warm_started_linear_classifier.get_variable_value(
+                            OCCUPATION_WEIGHT_NAME)[2, :])
+    # Bias should still be zero (from initialization logic).
+    self.assertAllClose(
+        [0.0] * 4, warm_started_linear_classifier.get_variable_value(BIAS_NAME))
+
+  def test_warm_starting_with_naming_change(self):
+    """Tests warm-starting with a Tensor name remapping."""
+    age_in_years = feature_column_lib.numeric_column('age_in_years')
+
+    # Create a LinearClassifier and train to save a checkpoint.
+    linear_classifier = self._linear_classifier_fn(
+        feature_columns=[age_in_years],
+        model_dir=self._ckpt_and_vocab_dir,
+        n_classes=4,
+        optimizer='SGD')
+    linear_classifier.train(input_fn=self._input_fn, max_steps=1)
+
+    # Create a second LinearClassifier, warm-started from the first.  Use a
+    # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
+    # accumulator values that change).
+    warm_started_linear_classifier = self._linear_classifier_fn(
+        feature_columns=[feature_column_lib.numeric_column('age')],
+        n_classes=4,
+        optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
+        # The 'age' variable correspond to the 'age_in_years' variable in the
+        # previous model.
+        warm_start_from=warm_starting_util.WarmStartSettings(
+            ckpt_to_initialize_from=linear_classifier.model_dir,
+            var_name_to_prev_var_name={
+                AGE_WEIGHT_NAME: AGE_WEIGHT_NAME.replace('age', 'age_in_years')
+            }))
+
+    warm_started_linear_classifier.train(input_fn=self._input_fn, max_steps=1)
+    self.assertAllClose(
+        linear_classifier.get_variable_value(
+            AGE_WEIGHT_NAME.replace('age', 'age_in_years')),
+        warm_started_linear_classifier.get_variable_value(AGE_WEIGHT_NAME))
+    # The bias is also warm-started (with no name remapping).
+    self.assertAllClose(
+        linear_classifier.get_variable_value(BIAS_NAME),
+        warm_started_linear_classifier.get_variable_value(BIAS_NAME))
diff --git a/tensorflow/python/estimator/canned/metric_keys.py b/tensorflow/python/estimator/canned/metric_keys.py
index 7dc4bfe5ffb5f762b56f4fc91b8a75ee4ba1796e..44eb680939203fea67e3391326a6f1013f022ad5 100644
--- a/tensorflow/python/estimator/canned/metric_keys.py
+++ b/tensorflow/python/estimator/canned/metric_keys.py
@@ -25,6 +25,7 @@ class MetricKeys(object):
   """Metric key strings."""
   LOSS = model_fn.LOSS_METRIC_KEY
   LOSS_MEAN = model_fn.AVERAGE_LOSS_METRIC_KEY
+  LOSS_REGULARIZATION = 'regularization_loss'
 
   ACCURACY = 'accuracy'
   # This is the best the model could do by always predicting one class.
diff --git a/tensorflow/python/estimator/canned/parsing_utils.py b/tensorflow/python/estimator/canned/parsing_utils.py
index f153272947ca427b25b00e6df4741d7ada5790df..74e5e5a1bed80229c68daa3ff33ee7af4004bf47 100644
--- a/tensorflow/python/estimator/canned/parsing_utils.py
+++ b/tensorflow/python/estimator/canned/parsing_utils.py
@@ -23,8 +23,10 @@ import six
 from tensorflow.python.feature_column import feature_column as fc
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import parsing_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('estimator.classifier_parse_example_spec')
 def classifier_parse_example_spec(feature_columns,
                                   label_key,
                                   label_dtype=dtypes.int64,
@@ -164,6 +166,7 @@ def classifier_parse_example_spec(feature_columns,
   return parsing_spec
 
 
+@tf_export('estimator.regressor_parse_example_spec')
 def regressor_parse_example_spec(feature_columns,
                                  label_key,
                                  label_dtype=dtypes.float32,
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index f267f4a54e541c8942fd6430a802798e430a5a47..e269b71f2ee007e2d36275c12e81ee44c30db694 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -30,10 +30,12 @@ from google.protobuf import message
 from tensorflow.core.framework import summary_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session as tf_session
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import run_config
 from tensorflow.python.estimator import util
+from tensorflow.python.estimator import warm_starting_util
 from tensorflow.python.estimator.export.export import build_all_signature_defs
 from tensorflow.python.estimator.export.export import get_temp_export_dir
 from tensorflow.python.estimator.export.export import get_timestamped_export_dir
@@ -53,13 +55,16 @@ from tensorflow.python.training import saver
 from tensorflow.python.training import training
 from tensorflow.python.training import training_util
 from tensorflow.python.util import compat
+from tensorflow.python.util import compat_internal
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 _VALID_MODEL_FN_ARGS = set(
     ['features', 'labels', 'mode', 'params', 'self', 'config'])
 
 
+@tf_export('estimator.Estimator')
 class Estimator(object):
   """Estimator class to train and evaluate TensorFlow models.
 
@@ -95,9 +100,22 @@ class Estimator(object):
   @end_compatibility
   """
 
-  def __init__(self, model_fn, model_dir=None, config=None, params=None):
+  def __init__(self, model_fn, model_dir=None, config=None, params=None,
+               warm_start_from=None):
     """Constructs an `Estimator` instance.
 
+    See @{$estimators} for more information. To warm-start an `Estimator`:
+
+    ```python
+    estimator = tf.estimator.DNNClassifier(
+        feature_columns=[categorical_feature_a_emb, categorical_feature_b_emb],
+        hidden_units=[1024, 512, 256],
+        warm_start_from="/path/to/checkpoint/dir")
+    ```
+
+    For more details on warm-start configuration, see
+    @{tf.estimator.WarmStartSettings$WarmStartSettings}.
+
     Args:
       model_fn: Model function. Follows the signature:
 
@@ -127,12 +145,19 @@ class Estimator(object):
 
       model_dir: Directory to save model parameters, graph and etc. This can
         also be used to load checkpoints from the directory into a estimator to
-        continue training a previously saved model. If `None`, the model_dir in
-        `config` will be used if set. If both are set, they must be same. If
-        both are `None`, a temporary directory will be used.
+        continue training a previously saved model. If `PathLike` object, the
+        path will be resolved. If `None`, the model_dir in `config` will be used
+        if set. If both are set, they must be same. If both are `None`, a
+        temporary directory will be used.
       config: Configuration object.
       params: `dict` of hyper parameters that will be passed into `model_fn`.
               Keys are names of parameters, values are basic python types.
+      warm_start_from: Optional string filepath to a checkpoint to warm-start
+                       from, or a `tf.estimator.WarmStartSettings` object to
+                       fully configure warm-starting.  If the string filepath is
+                       provided instead of a `WarmStartSettings`, then all
+                       variables are warm-started, and it is assumed that
+                       vocabularies and Tensor names are unchanged.
 
     Raises:
       RuntimeError: If eager execution is enabled.
@@ -157,6 +182,7 @@ class Estimator(object):
       self._config = config
 
     # Model directory.
+    model_dir = compat_internal.path_to_str(model_dir)
     if (model_dir is not None) and (self._config.model_dir is not None):
       if model_dir != self._config.model_dir:
         # TODO(alanyee): remove this suppression after it is no longer needed
@@ -189,6 +215,11 @@ class Estimator(object):
     self._model_fn = model_fn
     self._params = copy.deepcopy(params or {})
 
+    # pylint: disable=protected-access
+    self._warm_start_settings = (
+        warm_starting_util._get_default_warm_start_settings(warm_start_from))
+    # pylint: enable=protected-access
+
   @property
   def model_dir(self):
     return self._model_dir
@@ -261,9 +292,19 @@ class Estimator(object):
     """Trains a model given training data input_fn.
 
     Args:
-      input_fn: Input function returning a tuple of:
-          features - `Tensor` or dictionary of string feature name to `Tensor`.
-          labels - `Tensor` or dictionary of `Tensor` with labels.
+      input_fn: A function that provides input data for training as minibatches.
+        See @{$get_started/premade_estimators#create_input_functions} for more
+        information. The function should construct and return one of
+        the following:
+
+          * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
+            tuple (features, labels) with same constraints as below.
+          * A tuple (features, labels): Where features is a `Tensor` or a
+            dictionary of string feature name to `Tensor` and labels is a
+            `Tensor` or a dictionary of string label name to `Tensor`. Both
+            features and labels are consumed by `model_fn`. They should satisfy
+            the expectation of `model_fn` from inputs.
+
       hooks: List of `SessionRunHook` subclass instances. Used for callbacks
         inside the training loop.
       steps: Number of steps for which to train model. If `None`, train forever
@@ -331,10 +372,19 @@ class Estimator(object):
     `StopIteration`).
 
     Args:
-      input_fn: Input function returning a tuple of:
-          features - Dictionary of string feature name to `Tensor` or
-            `SparseTensor`.
-          labels - `Tensor` or dictionary of `Tensor` with labels.
+      input_fn: A function that constructs the input data for evaluation.
+        See @{$get_started/premade_estimators#create_input_functions} for more
+        information. The function should construct and return one of
+        the following:
+
+          * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
+            tuple (features, labels) with same constraints as below.
+          * A tuple (features, labels): Where features is a `Tensor` or a
+            dictionary of string feature name to `Tensor` and labels is a
+            `Tensor` or a dictionary of string label name to `Tensor`. Both
+            features and labels are consumed by `model_fn`. They should satisfy
+            the expectation of `model_fn` from inputs.
+
       steps: Number of steps for which to evaluate model. If `None`, evaluates
         until `input_fn` raises an end-of-input exception.
       hooks: List of `SessionRunHook` subclass instances. Used for callbacks
@@ -377,15 +427,25 @@ class Estimator(object):
               input_fn,
               predict_keys=None,
               hooks=None,
-              checkpoint_path=None):
+              checkpoint_path=None,
+              yield_single_examples=True):
     """Yields predictions for given features.
 
     Args:
-      input_fn: Input function returning features which is a dictionary of
-        string feature name to `Tensor` or `SparseTensor`. If it returns a
-        tuple, first item is extracted as features. Prediction continues until
-        `input_fn` raises an end-of-input exception (`OutOfRangeError` or
+      input_fn: A function that constructs the features. Prediction continues
+        until `input_fn` raises an end-of-input exception (`OutOfRangeError` or
         `StopIteration`).
+        See @{$get_started/premade_estimators#create_input_functions} for more
+        information. The function should construct and return one of
+        the following:
+
+          * A 'tf.data.Dataset' object: Outputs of `Dataset` object must have
+            same constraints as below.
+          * features: A `Tensor` or a dictionary of string feature name to
+            `Tensor`. features are consumed by `model_fn`. They should satisfy
+            the expectation of `model_fn` from inputs.
+          * A tuple, in which case the first item is extracted as features.
+
       predict_keys: list of `str`, name of the keys to predict. It is used if
         the `EstimatorSpec.predictions` is a `dict`. If `predict_keys` is used
         then rest of the predictions will be filtered from the dictionary. If
@@ -394,13 +454,18 @@ class Estimator(object):
         inside the prediction call.
       checkpoint_path: Path of a specific checkpoint to predict. If `None`, the
         latest checkpoint in `model_dir` is used.
+      yield_single_examples: If False, yield the whole batch as returned by the
+        model_fn instead of decomposing the batch into individual elements. This
+        is useful if model_fn return some tensor with first dimension not
+        equal to the batch size
 
     Yields:
       Evaluated values of `predictions` tensors.
 
     Raises:
       ValueError: Could not find a trained model in model_dir.
-      ValueError: if batch length of predictions are not same.
+      ValueError: if batch length of predictions are not same and
+        yield_single_examples is True.
       ValueError: If there is a conflict between `predict_keys` and
         `predictions`. For example if `predict_keys` is not `None` but
         `EstimatorSpec.predictions` is not a `dict`.
@@ -416,20 +481,26 @@ class Estimator(object):
     with ops.Graph().as_default() as g:
       random_seed.set_random_seed(self._config.tf_random_seed)
       self._create_and_assert_global_step(g)
-      features = self._get_features_from_input_fn(
+      features, input_hooks = self._get_features_from_input_fn(
           input_fn, model_fn_lib.ModeKeys.PREDICT)
       estimator_spec = self._call_model_fn(
           features, None, model_fn_lib.ModeKeys.PREDICT, self.config)
       predictions = self._extract_keys(estimator_spec.predictions, predict_keys)
+      all_hooks = list(input_hooks)
+      all_hooks.extend(hooks)
+      all_hooks.extend(list(estimator_spec.prediction_hooks or []))
       with training.MonitoredSession(
           session_creator=training.ChiefSessionCreator(
               checkpoint_filename_with_path=checkpoint_path,
+              master=self._config.master,
               scaffold=estimator_spec.scaffold,
               config=self._session_config),
-          hooks=hooks) as mon_sess:
+          hooks=all_hooks) as mon_sess:
         while not mon_sess.should_stop():
           preds_evaluated = mon_sess.run(predictions)
-          if not isinstance(predictions, dict):
+          if not yield_single_examples:
+            yield preds_evaluated
+          elif not isinstance(predictions, dict):
             for pred in preds_evaluated:
               yield pred
           else:
@@ -441,9 +512,11 @@ class Estimator(object):
 
   def _assert_members_are_not_overridden(self):
     """Asserts members of `Estimator` are not overridden."""
-    allowed_overrides = set(['_call_input_fn', '_create_global_step',
-                             '_convert_train_steps_to_hooks',
-                             '_convert_eval_steps_to_hooks'])
+    allowed_overrides = set([
+        '_call_input_fn', '_create_global_step',
+        '_convert_train_steps_to_hooks', '_convert_eval_steps_to_hooks',
+        '_tf_api_names'
+    ])
     estimator_members = set([m for m in Estimator.__dict__.keys()
                              if not m.startswith('__')])
     subclass_members = set(self.__class__.__dict__.keys())
@@ -460,7 +533,8 @@ class Estimator(object):
       self, export_dir_base, serving_input_receiver_fn,
       assets_extra=None,
       as_text=False,
-      checkpoint_path=None):
+      checkpoint_path=None,
+      strip_default_attrs=False):
     # pylint: disable=line-too-long
     """Exports inference graph as a SavedModel into given dir.
 
@@ -485,7 +559,7 @@ class Estimator(object):
     `ExportOutput`s, and the inputs are always the input receivers provided by
     the serving_input_receiver_fn.
 
-    Extra assets may be written into the SavedModel via the extra_assets
+    Extra assets may be written into the SavedModel via the assets_extra
     argument.  This should be a dict, where each key gives a destination path
     (including the filename) relative to the assets.extra directory.  The
     corresponding value gives the full path of the source file to be copied.
@@ -502,6 +576,9 @@ class Estimator(object):
       as_text: whether to write the SavedModel proto in text format.
       checkpoint_path: The checkpoint path to export.  If `None` (the default),
         the most recent checkpoint found within the model directory is chosen.
+      strip_default_attrs: Boolean. If `True`, default-valued attributes will be
+        removed from the NodeDefs. For a detailed guide, see
+        [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
 
     Returns:
       The string path to the exported directory.
@@ -548,7 +625,6 @@ class Estimator(object):
             sharded=True)
         saver_for_restore.restore(session, checkpoint_path)
 
-        # TODO(b/36111876): replace legacy_init_op with main_op mechanism
         # pylint: disable=protected-access
         local_init_op = (
             estimator_spec.scaffold.local_init_op or
@@ -562,7 +638,8 @@ class Estimator(object):
             signature_def_map=signature_def_map,
             assets_collection=ops.get_collection(
                 ops.GraphKeys.ASSET_FILEPATHS),
-            legacy_init_op=local_init_op)
+            legacy_init_op=local_init_op,
+            strip_default_attrs=strip_default_attrs)
         builder.save(as_text)
 
       # Add the extra assets
@@ -582,6 +659,11 @@ class Estimator(object):
   def _get_features_from_input_fn(self, input_fn, mode):
     """Extracts the `features` from return values of `input_fn`."""
     result = self._call_input_fn(input_fn, mode)
+    input_hooks = []
+    if isinstance(result, dataset_ops.Dataset):
+      iterator = result.make_initializable_iterator()
+      input_hooks.append(_DatasetInitializerHook(iterator))
+      result = iterator.get_next()
     if isinstance(result, (list, tuple)):
       # Unconditionally drop the label (the second element of result).
       result = result[0]
@@ -590,16 +672,22 @@ class Estimator(object):
       logging.warning('Input graph does not use tf.data.Dataset or contain a '
                       'QueueRunner. That means predict yields forever. '
                       'This is probably a mistake.')
-    return result
+    return result, input_hooks
 
   def _get_features_and_labels_from_input_fn(self, input_fn, mode):
+    """Extracts the `features` and labels from return values of `input_fn`."""
     result = self._call_input_fn(input_fn, mode)
+    input_hooks = []
+    if isinstance(result, dataset_ops.Dataset):
+      iterator = result.make_initializable_iterator()
+      input_hooks.append(_DatasetInitializerHook(iterator))
+      result = iterator.get_next()
     if isinstance(result, (list, tuple)):
       if len(result) != 2:
         raise ValueError(
-            'input_fn should return (feautures, labels) as a len 2 tuple.')
-      return result
-    return result, None
+            'input_fn should return (features, labels) as a len 2 tuple.')
+      return result[0], result[1], input_hooks
+    return result, None, input_hooks
 
   def _extract_batch_length(self, preds_evaluated):
     """Extracts batch length of predictions."""
@@ -671,9 +759,10 @@ class Estimator(object):
     Raises:
       ValueError: if input_fn takes invalid arguments.
     """
-    del mode  # unused
     input_fn_args = util.fn_args(input_fn)
     kwargs = {}
+    if 'mode' in input_fn_args:
+      kwargs['mode'] = mode
     if 'params' in input_fn_args:
       kwargs['params'] = self.params
     if 'config' in input_fn_args:
@@ -710,7 +799,10 @@ class Estimator(object):
       kwargs['params'] = self.params
     if 'config' in model_fn_args:
       kwargs['config'] = config
+
+    logging.info('Calling model_fn.')
     model_fn_results = self._model_fn(features=features, **kwargs)
+    logging.info('Done calling model_fn.')
 
     if not isinstance(model_fn_results, model_fn_lib.EstimatorSpec):
       raise ValueError('model_fn should return an EstimatorSpec.')
@@ -723,10 +815,19 @@ class Estimator(object):
       random_seed.set_random_seed(self._config.tf_random_seed)
       global_step_tensor = self._create_and_assert_global_step(g)
       training_util._get_or_create_global_step_read()  # pylint: disable=protected-access
-      features, labels = self._get_features_and_labels_from_input_fn(
-          input_fn, model_fn_lib.ModeKeys.TRAIN)
+      features, labels, input_hooks = (
+          self._get_features_and_labels_from_input_fn(
+              input_fn, model_fn_lib.ModeKeys.TRAIN))
+      worker_hooks.extend(input_hooks)
       estimator_spec = self._call_model_fn(
           features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
+
+      if self._warm_start_settings:
+        logging.info('Warm-starting with WarmStartSettings: %s' %
+                     (self._warm_start_settings,))
+        # pylint: disable=protected-access
+        warm_starting_util._warm_start(self._warm_start_settings)
+        # pylint: enable=protected-access
       # Check if the user created a loss summary, and add one if they didn't.
       # We assume here that the summary is called 'loss'. If it is not, we will
       # make another one with the name 'loss' to ensure it shows up in the right
@@ -822,8 +923,9 @@ class Estimator(object):
     with ops.Graph().as_default() as g:
       random_seed.set_random_seed(self._config.tf_random_seed)
       global_step_tensor = self._create_and_assert_global_step(g)
-      features, labels = self._get_features_and_labels_from_input_fn(
-          input_fn, model_fn_lib.ModeKeys.EVAL)
+      features, labels, input_hooks = (
+          self._get_features_and_labels_from_input_fn(
+              input_fn, model_fn_lib.ModeKeys.EVAL))
       estimator_spec = self._call_model_fn(
           features, labels, model_fn_lib.ModeKeys.EVAL, self.config)
 
@@ -844,7 +946,8 @@ class Estimator(object):
             'already defines a default metric with the same name.')
       eval_dict[ops.GraphKeys.GLOBAL_STEP] = global_step_tensor
 
-      all_hooks = list(hooks or [])
+      all_hooks = list(input_hooks)
+      all_hooks.extend(hooks)
       all_hooks.extend(list(estimator_spec.evaluation_hooks or []))
 
       eval_results = evaluation._evaluate_once(  # pylint: disable=protected-access
@@ -908,7 +1011,7 @@ def _get_replica_device_setter(config):
       'Variable', 'VariableV2', 'AutoReloadVariable', 'MutableHashTable',
       'MutableHashTableV2', 'MutableHashTableOfTensors',
       'MutableHashTableOfTensorsV2', 'MutableDenseHashTable',
-      'MutableDenseHashTableV2'
+      'MutableDenseHashTableV2', 'VarHandleOp'
   ]
 
   if config.task_type:
@@ -1039,3 +1142,16 @@ def _has_dataset_or_queue_runner(maybe_tensor):
 
   # Now, check queue.
   return ops.get_default_graph().get_collection(ops.GraphKeys.QUEUE_RUNNERS)
+
+
+class _DatasetInitializerHook(training.SessionRunHook):
+
+  def __init__(self, iterator):
+    self._iterator = iterator
+
+  def begin(self):
+    self._initializer = self._iterator.initializer
+
+  def after_create_session(self, session, coord):
+    del coord
+    session.run(self._initializer)
diff --git a/tensorflow/python/estimator/estimator_lib.py b/tensorflow/python/estimator/estimator_lib.py
index bed2b674192bd4054baa2ee5d30fc72c0e8d54ed..01699e7399c4089281e9ece76e534e1f82692257 100644
--- a/tensorflow/python/estimator/estimator_lib.py
+++ b/tensorflow/python/estimator/estimator_lib.py
@@ -41,6 +41,8 @@ from tensorflow.python.estimator.run_config import RunConfig
 from tensorflow.python.estimator.training import EvalSpec
 from tensorflow.python.estimator.training import train_and_evaluate
 from tensorflow.python.estimator.training import TrainSpec
+from tensorflow.python.estimator.warm_starting_util import VocabInfo
+from tensorflow.python.estimator.warm_starting_util import WarmStartSettings
 
 
 from tensorflow.python.util.all_util import remove_undocumented
@@ -76,6 +78,10 @@ _allowed_symbols = [
     'Exporter',
     'LatestExporter',
     'FinalExporter',
+
+    # Warm-starting
+    'WarmStartSettings',
+    'VocabInfo',
 ]
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index c1b773b8c408dbfe7df685d5dcf2748ae5428adf..7c7d913c32964f4400c3345474c6b6e1bffc5fe4 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -40,6 +40,7 @@ from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.layers import layers
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
@@ -51,12 +52,14 @@ from tensorflow.python.ops import metrics as metrics_lib
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import string_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import loader
+from tensorflow.python.saved_model import loader_impl
 from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.summary import summary
 from tensorflow.python.summary import summary_iterator
@@ -418,6 +421,7 @@ class EstimatorTrainTest(test.TestCase):
     self.assertEqual(1, model_fn_call_count[0])
 
   def test_callable_input_fn(self):
+    expected_mode = model_fn_lib.ModeKeys.TRAIN
     expected_params = {'batch_size': 10}
     expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
     input_fn_call_count = [0]
@@ -430,8 +434,9 @@ class EstimatorTrainTest(test.TestCase):
 
     class InputFn(object):
 
-      def __call__(self, params, config):
+      def __call__(self, mode, params, config):
         input_fn_call_count[0] += 1
+        test_self.assertEqual(expected_mode, mode)
         test_self.assertEqual(expected_params, params)
         test_self.assertEqual(4321, config.tf_random_seed)
         return dummy_input_fn()
@@ -444,6 +449,7 @@ class EstimatorTrainTest(test.TestCase):
     self.assertEqual(1, input_fn_call_count[0])
 
   def test_input_fn_args(self):
+    expected_mode = model_fn_lib.ModeKeys.TRAIN
     expected_params = {'batch_size': 10}
     expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
     input_fn_call_count = [0]
@@ -452,8 +458,9 @@ class EstimatorTrainTest(test.TestCase):
       del params, config
       return model_fn_global_step_incrementer(features, labels, mode)
 
-    def _input_fn(params, config):
+    def _input_fn(mode, params, config):
       input_fn_call_count[0] += 1
+      self.assertEqual(expected_mode, mode)
       self.assertEqual(expected_params, params)
       self.assertEqual(4321, config.tf_random_seed)
       return dummy_input_fn()
@@ -623,6 +630,33 @@ class EstimatorTrainTest(test.TestCase):
     self.assertEqual(
         10, estimator._load_global_step_from_checkpoint_dir(est.model_dir))
 
+  def test_warm_starts(self):
+    def _make_model_fn(x):
+      def _variable_creating_model_fn(features, labels, mode):
+        _, _ = features, labels
+        variable_scope.get_variable('x', initializer=x)
+        global_step = training.get_global_step()
+        return model_fn_lib.EstimatorSpec(
+            mode,
+            loss=constant_op.constant(1.),
+            train_op=state_ops.assign_add(global_step, 1))
+      return _variable_creating_model_fn
+
+    est = estimator.Estimator(model_fn=_make_model_fn(42.))
+    est.train(dummy_input_fn, steps=10)
+
+    warm_started_est = estimator.Estimator(
+        model_fn=_make_model_fn(36.),
+        warm_start_from=est.model_dir)
+    warm_started_est.train(dummy_input_fn, steps=5)
+    # warm_start is called after the model_fn, so x should have the value
+    # from the checkpoint.
+    self.assertEqual(42., warm_started_est.get_variable_value('x'))
+    # global_step should not be warm-started.
+    self.assertEqual(
+        5, estimator._load_global_step_from_checkpoint_dir(
+            warm_started_est.model_dir))
+
   def test_max_step(self):
     est = estimator.Estimator(model_fn=model_fn_global_step_incrementer)
     est.train(dummy_input_fn, max_steps=5)
@@ -913,9 +947,84 @@ class EstimatorGetVariablesTest(test.TestCase):
     self.assertEqual(3., est.get_variable_value('three'))
 
 
+class EstimatorDatasetIntegrationTest(test.TestCase):
+  """Tests dataset integration."""
+
+  def test_returned_by_input_fn(self):
+
+    def _input_fn():
+      return dataset_ops.Dataset.from_tensors(([1.], [2.]))
+
+    def _model_fn(features, labels, mode):
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          loss=features + labels,  # 1 + 2
+          train_op=state_ops.assign_add(training.get_global_step(), 1))
+
+    est = estimator.Estimator(model_fn=_model_fn)
+    est.train(_input_fn, steps=1)
+    scores = est.evaluate(_input_fn, steps=1)
+    self.assertEqual(3., scores[model_fn_lib.LOSS_METRIC_KEY])
+
+  def test_with_none_labels(self):
+
+    def _input_fn():
+      return dataset_ops.Dataset.from_tensors([7.])
+
+    def _model_fn(features, labels, mode):
+      self.assertIsNone(labels)
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          loss=features,  # 7
+          train_op=state_ops.assign_add(training.get_global_step(), 1))
+
+    est = estimator.Estimator(model_fn=_model_fn)
+    est.train(_input_fn, steps=1)
+    scores = est.evaluate(_input_fn, steps=1)
+    self.assertEqual(7., scores[model_fn_lib.LOSS_METRIC_KEY])
+
+  def test_with_predict(self):
+
+    def _input_fn():
+      return dataset_ops.Dataset.from_tensors([10.])
+
+    def _model_fn(features, labels, mode):
+      _ = labels
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          predictions=features,  # 10
+          loss=features,  # 10
+          train_op=state_ops.assign_add(training.get_global_step(), 1))
+
+    est = estimator.Estimator(model_fn=_model_fn)
+    est.train(_input_fn, steps=1)
+    self.assertEqual([10.], next(est.predict(input_fn=_input_fn)))
+
+  def test_batching(self):
+
+    def _input_fn():
+      return dataset_ops.Dataset.from_tensor_slices(([[1.], [2.]],
+                                                     [[10.], [20.]])).batch(1)
+
+    def _model_fn(features, labels, mode):
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          predictions=features,
+          loss=features + (0 if labels is None else labels),  # 11, 22
+          train_op=state_ops.assign_add(training.get_global_step(), 1))
+
+    est = estimator.Estimator(model_fn=_model_fn)
+    est.train(_input_fn)
+    scores = est.evaluate(_input_fn)
+    # (11 + 22)/2 = 16.5
+    self.assertEqual(16.5, scores[model_fn_lib.LOSS_METRIC_KEY])
+    self.assertEqual([1., 2.], list(est.predict(_input_fn)))
+
+
 class EstimatorEvaluateTest(test.TestCase):
 
   def test_input_fn_args(self):
+    expected_mode = model_fn_lib.ModeKeys.EVAL
     expected_params = {'batch_size': 10}
     expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
     input_fn_call_count = [0]
@@ -924,8 +1033,9 @@ class EstimatorEvaluateTest(test.TestCase):
       del params, config
       return model_fn_global_step_incrementer(features, labels, mode)
 
-    def _input_fn(params, config):
+    def _input_fn(mode, params, config):
       input_fn_call_count[0] += 1
+      self.assertEqual(expected_mode, mode)
       self.assertEqual(expected_params, params)
       self.assertEqual(4321, config.tf_random_seed)
       return dummy_input_fn()
@@ -1189,6 +1299,7 @@ class EstimatorEvaluateTest(test.TestCase):
 class EstimatorPredictTest(test.TestCase):
 
   def test_input_fn_args(self):
+    expected_mode = model_fn_lib.ModeKeys.PREDICT
     expected_params = {'batch_size': 10}
     expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
     input_fn_call_count = [0]
@@ -1201,8 +1312,9 @@ class EstimatorPredictTest(test.TestCase):
           train_op=state_ops.assign_add(training.get_global_step(), 1),
           predictions=constant_op.constant([[10.]]))
 
-    def _input_fn(params, config):
+    def _input_fn(mode, params, config):
       input_fn_call_count[0] += 1
+      self.assertEqual(expected_mode, mode)
       self.assertEqual(expected_params, params)
       self.assertEqual(4321, config.tf_random_seed)
       return dummy_input_fn()
@@ -1243,6 +1355,25 @@ class EstimatorPredictTest(test.TestCase):
     est.train(dummy_input_fn, steps=1)
     self.assertEqual(10., next(est.predict(dummy_input_fn)))
 
+  def test_predictionhooks_are_used(self):
+    hook = test.mock.MagicMock(
+        wraps=training.SessionRunHook(), spec=training.SessionRunHook)
+
+    def _model_fn_hooks(features, labels, mode):
+      _, _ = features, labels
+      return model_fn_lib.EstimatorSpec(
+          mode=mode,
+          loss=constant_op.constant(0.),
+          train_op=state_ops.assign_add(training.get_global_step(), 1),
+          predictions=constant_op.constant([[10.]]),
+          prediction_hooks=[hook])
+
+    est = estimator.Estimator(model_fn=_model_fn_hooks)
+    est.train(dummy_input_fn, steps=1)
+    self.assertFalse(hook.begin.called)
+    next(est.predict(dummy_input_fn))
+    self.assertTrue(hook.begin.called)
+
   def test_warn_if_no_queue_runner(self):
 
     def _model_fn(features, labels, mode):
@@ -1341,6 +1472,27 @@ class EstimatorPredictTest(test.TestCase):
                                  'Batch length of predictions should be same'):
       next(est.predict(dummy_input_fn))
 
+  def test_iterate_batches(self):
+
+    def _model_fn(features, labels, mode):
+      _, _ = features, labels
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          loss=constant_op.constant(0.),
+          train_op=state_ops.assign_add(training.get_global_step(), 1),
+          predictions={
+              # First dim is different but the prediction should still work
+              'y1': array_ops.zeros(shape=[3]),
+              'y2': array_ops.zeros(shape=[5, 3])
+          })
+
+    est = estimator.Estimator(model_fn=_model_fn)
+    est.train(dummy_input_fn, steps=1)
+
+    predictions = next(est.predict(dummy_input_fn, yield_single_examples=False))
+    self.assertAllEqual(predictions['y1'].shape, [3])
+    self.assertAllEqual(predictions['y2'].shape, [5, 3])
+
   def test_predict_keys_defined_for_tensor(self):
 
     def _model_fn(features, labels, mode):
@@ -1976,6 +2128,65 @@ class EstimatorExportTest(test.TestCase):
 
     gfile.DeleteRecursively(tmpdir)
 
+  def test_export_savedmodel_proto_strip_default_attrs(self):
+    tmpdir = tempfile.mkdtemp()
+    est = estimator.Estimator(model_fn=_model_fn_for_export_tests)
+    est.train(input_fn=dummy_input_fn, steps=1)
+    feature_spec = {'x': parsing_ops.VarLenFeature(dtype=dtypes.int64),
+                    'y': parsing_ops.VarLenFeature(dtype=dtypes.int64)}
+    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
+        feature_spec)
+
+    # Perform the export.
+    export_dir_base = os.path.join(
+        compat.as_bytes(tmpdir), compat.as_bytes('export'))
+    export_dir_stripped = est.export_savedmodel(
+        export_dir_base, serving_input_receiver_fn, strip_default_attrs=True)
+    export_dir_not_stripped = est.export_savedmodel(
+        export_dir_base, serving_input_receiver_fn, strip_default_attrs=False)
+
+    # Load the SavedModel from disk as-is to verify default attrs
+    # are stripped. Reimporting the SavedModel via the loader causes the
+    # default attrs to be populated in the NodeDefs.
+
+    # pylint: disable=protected-access
+    saved_model_stripped_pb = loader_impl._parse_saved_model(
+        export_dir_stripped)
+    saved_model_not_stripped_pb = loader_impl._parse_saved_model(
+        export_dir_not_stripped)
+    self.assertIsNotNone(saved_model_stripped_pb)
+    self.assertIsNotNone(saved_model_not_stripped_pb)
+    # pylint: enable=protected-access
+
+    meta_graph_def_stripped = [
+        x for x in saved_model_stripped_pb.meta_graphs
+        if x.meta_info_def.tags == [tag_constants.SERVING]][0]
+    meta_graph_def_not_stripped = [
+        x for x in saved_model_not_stripped_pb.meta_graphs
+        if x.meta_info_def.tags == [tag_constants.SERVING]][0]
+
+    # "weight" node in graph is a "Variable" Op with 2 default valued attrs.
+    #   o "container"    : "".
+    #   o "shared_name"  : "".
+
+    # saved_model_stripped_pb was exported with strip_default_attrs set to True.
+    # "weight" node shouldn't have attributes "container" and "shared_name".
+    node_def = test_util.get_node_def_from_graph(
+        'weight', meta_graph_def_stripped.graph_def)
+    self.assertNotIn('container', node_def.attr)
+    self.assertNotIn('shared_name', node_def.attr)
+
+    # saved_model_not_stripped_pb was exported with strip_default_attrs
+    # disabled. "weight" node should have attributes "container" and
+    # "shared_name".
+    node_def = test_util.get_node_def_from_graph(
+        'weight', meta_graph_def_not_stripped.graph_def)
+    self.assertIn('container', node_def.attr)
+    self.assertIn('shared_name', node_def.attr)
+
+    # Clean up.
+    gfile.DeleteRecursively(tmpdir)
+
 
 class EstimatorHookOrderingTest(test.TestCase):
 
diff --git a/tensorflow/python/estimator/export/export.py b/tensorflow/python/estimator/export/export.py
index 51075731ddc52a55799958c3bfa6140f77404541..83251c79fc561e16ebddb638668b92b3c69b8af4 100644
--- a/tensorflow/python/estimator/export/export.py
+++ b/tensorflow/python/estimator/export/export.py
@@ -36,12 +36,14 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
 _SINGLE_FEATURE_DEFAULT_NAME = 'feature'
 _SINGLE_RECEIVER_DEFAULT_NAME = 'input'
 
 
+@tf_export('estimator.export.ServingInputReceiver')
 class ServingInputReceiver(collections.namedtuple(
     'ServingInputReceiver',
     ['features', 'receiver_tensors', 'receiver_tensors_alternatives'])):
@@ -118,6 +120,7 @@ class ServingInputReceiver(collections.namedtuple(
         receiver_tensors_alternatives=receiver_tensors_alternatives)
 
 
+@tf_export('estimator.export.build_parsing_serving_input_receiver_fn')
 def build_parsing_serving_input_receiver_fn(feature_spec,
                                             default_batch_size=None):
   """Build a serving_input_receiver_fn expecting fed tf.Examples.
@@ -146,6 +149,7 @@ def build_parsing_serving_input_receiver_fn(feature_spec,
   return serving_input_receiver_fn
 
 
+@tf_export('estimator.export.build_raw_serving_input_receiver_fn')
 def build_raw_serving_input_receiver_fn(features, default_batch_size=None):
   """Build a serving_input_receiver_fn expecting feature Tensors.
 
diff --git a/tensorflow/python/estimator/export/export_output.py b/tensorflow/python/estimator/export/export_output.py
index 863af6d41d985043542b03375372fe564c283b82..87b964be37197dac99b8ce4398cbdaf3b4989c7f 100644
--- a/tensorflow/python/estimator/export/export_output.py
+++ b/tensorflow/python/estimator/export/export_output.py
@@ -26,8 +26,10 @@ import six
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.saved_model import signature_def_utils
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('estimator.export.ExportOutput')
 class ExportOutput(object):
   """Represents an output of a model that can be served.
 
@@ -50,6 +52,7 @@ class ExportOutput(object):
     pass
 
 
+@tf_export('estimator.export.ClassificationOutput')
 class ClassificationOutput(ExportOutput):
   """Represents the output of a classification head.
 
@@ -118,6 +121,7 @@ class ClassificationOutput(ExportOutput):
         examples, self.classes, self.scores)
 
 
+@tf_export('estimator.export.RegressionOutput')
 class RegressionOutput(ExportOutput):
   """Represents the output of a regression head."""
 
@@ -153,6 +157,7 @@ class RegressionOutput(ExportOutput):
 _SINGLE_OUTPUT_DEFAULT_NAME = 'output'
 
 
+@tf_export('estimator.export.PredictOutput')
 class PredictOutput(ExportOutput):
   """Represents the output of a generic prediction head.
 
diff --git a/tensorflow/python/estimator/exporter.py b/tensorflow/python/estimator/exporter.py
index c6f20d4a9e2a6b3384ba59ae2df67ff7a3464aa9..a3f04626d1e5ed7ca7fb09a5dcc2457a0cf5ab82 100644
--- a/tensorflow/python/estimator/exporter.py
+++ b/tensorflow/python/estimator/exporter.py
@@ -25,8 +25,10 @@ from tensorflow.python.estimator import gc
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('estimator.Exporter')
 class Exporter(object):
   """A class representing a type of model export."""
 
@@ -73,7 +75,8 @@ class _SavedModelExporter(Exporter):
                name,
                serving_input_receiver_fn,
                assets_extra=None,
-               as_text=False):
+               as_text=False,
+               strip_default_attrs=True):
     """Create an `Exporter` to use with `tf.estimator.EvalSpec`.
 
     Args:
@@ -90,6 +93,9 @@ class _SavedModelExporter(Exporter):
         `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.
       as_text: whether to write the SavedModel proto in text format. Defaults to
         `False`.
+      strip_default_attrs: Boolean. If set, default attrs in the `GraphDef` will
+        be stripped on write. This is the default behavior and recommended for
+        better forward compatibility of the resulting `SavedModel`.
 
     Raises:
       ValueError: if any arguments is invalid.
@@ -98,6 +104,7 @@ class _SavedModelExporter(Exporter):
     self._serving_input_receiver_fn = serving_input_receiver_fn
     self._assets_extra = assets_extra
     self._as_text = as_text
+    self._strip_default_attrs = strip_default_attrs
 
   @property
   def name(self):
@@ -112,11 +119,13 @@ class _SavedModelExporter(Exporter):
         self._serving_input_receiver_fn,
         assets_extra=self._assets_extra,
         as_text=self._as_text,
-        checkpoint_path=checkpoint_path)
+        checkpoint_path=checkpoint_path,
+        strip_default_attrs=self._strip_default_attrs)
 
     return export_result
 
 
+@tf_export('estimator.FinalExporter')
 class FinalExporter(Exporter):
   """This class exports the serving graph and checkpoints in the end.
 
@@ -168,6 +177,7 @@ class FinalExporter(Exporter):
                                              is_the_final_export)
 
 
+@tf_export('estimator.LatestExporter')
 class LatestExporter(Exporter):
   """This class regularly exports the serving graph and checkpoints.
 
@@ -197,8 +207,8 @@ class LatestExporter(Exporter):
       as_text: whether to write the SavedModel proto in text format. Defaults to
         `False`.
       exports_to_keep: Number of exports to keep.  Older exports will be
-       garbage-collected.  Defaults to 5.  Set to `None` to disable garbage
-       collection.
+        garbage-collected.  Defaults to 5.  Set to `None` to disable garbage
+        collection.
 
     Raises:
       ValueError: if any arguments is invalid.
diff --git a/tensorflow/python/estimator/exporter_test.py b/tensorflow/python/estimator/exporter_test.py
index 8e0f66cece754dea95987d136d90855e6818236b..70b5612804b2d91d66482d98ae080f42dfa17455 100644
--- a/tensorflow/python/estimator/exporter_test.py
+++ b/tensorflow/python/estimator/exporter_test.py
@@ -69,7 +69,8 @@ class LatestExporterTest(test.TestCase):
         _serving_input_receiver_fn,
         assets_extra={"from/path": "to/path"},
         as_text=False,
-        checkpoint_path="checkpoint_path")
+        checkpoint_path="checkpoint_path",
+        strip_default_attrs=True)
 
   def test_only_the_last_export_is_saved(self):
 
@@ -102,7 +103,8 @@ class LatestExporterTest(test.TestCase):
         _serving_input_receiver_fn,
         assets_extra={"from/path": "to/path"},
         as_text=False,
-        checkpoint_path="checkpoint_path")
+        checkpoint_path="checkpoint_path",
+        strip_default_attrs=True)
 
   def test_garbage_collect_exports(self):
     export_dir_base = tempfile.mkdtemp() + "export/"
diff --git a/tensorflow/python/estimator/inputs/numpy_io.py b/tensorflow/python/estimator/inputs/numpy_io.py
index 750af20e8a1e27c0f9c4fcf3ebf586c41bc9c66c..a6f471291008e3c27dea1aeea5865e334f76e5c8 100644
--- a/tensorflow/python/estimator/inputs/numpy_io.py
+++ b/tensorflow/python/estimator/inputs/numpy_io.py
@@ -19,8 +19,12 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+
+import numpy as np
 from six import string_types
+
 from tensorflow.python.estimator.inputs.queues import feeding_functions
+from tensorflow.python.util.tf_export import tf_export
 
 # Key name to pack the target into dict of `features`. See
 # `_get_unique_target_key` for details.
@@ -36,6 +40,13 @@ def _get_unique_target_key(features):
   temporarily and unpacked after calling the feeding function. Toward this goal,
   this function returns a key not existed in the `features` to pack the
   `target`.
+
+  Args:
+    features: OrderedDict of numpy arrays
+
+  Returns:
+    A unique key that can be used to insert the subsequent target into
+      features dict.
   """
   target_key = _TARGET_KEY
   while target_key in features:
@@ -43,6 +54,40 @@ def _get_unique_target_key(features):
   return target_key
 
 
+def _validate_and_convert_features(x):
+  """Type check input data and make a shadow copy as an ordered dict.
+
+  Args:
+    x: numpy array object or dict of numpy array objects. If an array,
+      the array will be treated as a single feature.
+
+  Returns:
+    OrderedDict copy of x.
+
+  Raises:
+    ValueError: if x is empty
+    TypeError: if x is an unknown type.
+  """
+  if isinstance(x, dict):
+    if not x:
+      raise ValueError('x cannot be an empty dict')
+    # Make a shadow copy and also ensure the order of iteration is consistent.
+    ordered_dict_data = collections.OrderedDict(
+        sorted(x.items(), key=lambda t: t[0]))
+  elif isinstance(x, np.ndarray):
+    if x.size == 0:
+      raise ValueError('x cannot be an empty array')
+
+    # Make a shadow copy and convert to dict to align with dict processing.
+    ordered_dict_data = collections.OrderedDict({'__direct_np_input__': x})
+  else:
+    x_type = type(x).__name__
+    raise TypeError('x must be a dict or array; got {}'.format(x_type))
+
+  return ordered_dict_data
+
+
+@tf_export('estimator.inputs.numpy_input_fn')
 def numpy_input_fn(x,
                    y=None,
                    batch_size=128,
@@ -70,7 +115,8 @@ def numpy_input_fn(x,
   ```
 
   Args:
-    x: dict of numpy array object.
+    x: numpy array object or dict of numpy array objects. If an array,
+      the array will be treated as a single feature.
     y: numpy array object or dict of numpy array object. `None` if absent.
     batch_size: Integer, size of batches to return.
     num_epochs: Integer, number of epochs to iterate over data. If `None` will
@@ -90,23 +136,19 @@ def numpy_input_fn(x,
       values in `x` have same shape).
     ValueError: if duplicate keys are in both `x` and `y` when `y` is a dict.
     ValueError: if x or y is an empty dict.
-    TypeError: `x` is not a dict or `shuffle` is not bool.
+    TypeError: `x` is not a dict or array, or if `shuffle` is not bool.
   """
-
   if not isinstance(shuffle, bool):
     raise TypeError('shuffle must be explicitly set as boolean; '
                     'got {}'.format(shuffle))
 
   def input_fn():
     """Numpy input function."""
-    if not isinstance(x, dict):
-      raise TypeError('x must be dict; got {}'.format(type(x).__name__))
-    if not x:
-      raise ValueError('x cannot be empty')
 
-    # Make a shadow copy and also ensure the order of iteration is consistent.
-    ordered_dict_data = collections.OrderedDict(
-        sorted(x.items(), key=lambda t: t[0]))
+    # Note that `x` should not be used after conversion to ordered_dict_data,
+    # as type could be either dict or array.
+    ordered_dict_data = _validate_and_convert_features(x)
+
     # Deep copy keys which is a view in python 3
     feature_keys = list(ordered_dict_data.keys())
 
@@ -161,7 +203,13 @@ def numpy_input_fn(x,
     if batch:
       batch.pop(0)
 
-    features = dict(zip(feature_keys, batch[:len(feature_keys)]))
+    if isinstance(x, np.ndarray):
+      # Return as the same type as original array.
+      features = batch[0]
+    else:
+      # Return as the original dict type
+      features = dict(zip(feature_keys, batch[:len(feature_keys)]))
+
     if target_keys is None:
       # TODO(martinwicke), return consistent result
       return features
diff --git a/tensorflow/python/estimator/inputs/numpy_io_test.py b/tensorflow/python/estimator/inputs/numpy_io_test.py
index 1374e3f7e12e76683f14737747b490c9a5e319eb..92d057e25da785cf5ee310ca1c80f67a5fbdb43a 100644
--- a/tensorflow/python/estimator/inputs/numpy_io_test.py
+++ b/tensorflow/python/estimator/inputs/numpy_io_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.framework import errors
 from tensorflow.python.platform import test
 from tensorflow.python.training import coordinator
+from tensorflow.python.training import monitored_session
 from tensorflow.python.training import queue_runner_impl
 
 
@@ -231,10 +232,10 @@ class NumpyIoTest(test.TestCase):
       coord.join(threads)
 
   def testNumpyInputFnWithXAsNonDict(self):
-    x = np.arange(32, 36)
+    x = list(range(32, 36))
     y = np.arange(4)
     with self.test_session():
-      with self.assertRaisesRegexp(TypeError, 'x must be dict'):
+      with self.assertRaisesRegexp(TypeError, 'x must be a dict or array'):
         failing_input_fn = numpy_io.numpy_input_fn(
             x, y, batch_size=2, shuffle=False, num_epochs=1)
         failing_input_fn()
@@ -243,7 +244,15 @@ class NumpyIoTest(test.TestCase):
     x = {}
     y = np.arange(4)
     with self.test_session():
-      with self.assertRaisesRegexp(ValueError, 'x cannot be empty'):
+      with self.assertRaisesRegexp(ValueError, 'x cannot be an empty'):
+        failing_input_fn = numpy_io.numpy_input_fn(x, y, shuffle=False)
+        failing_input_fn()
+
+  def testNumpyInputFnWithXIsEmptyArray(self):
+    x = np.array([[], []])
+    y = np.arange(4)
+    with self.test_session():
+      with self.assertRaisesRegexp(ValueError, 'x cannot be an empty'):
         failing_input_fn = numpy_io.numpy_input_fn(x, y, shuffle=False)
         failing_input_fn()
 
@@ -369,6 +378,82 @@ class NumpyIoTest(test.TestCase):
         failing_input_fn = numpy_io.numpy_input_fn(x, y, shuffle=False)
         failing_input_fn()
 
+  def testNumpyInputFnWithXIsArray(self):
+    x = np.arange(4) * 1.0
+    y = np.arange(-32, -28)
+
+    input_fn = numpy_io.numpy_input_fn(
+        x, y, batch_size=2, shuffle=False, num_epochs=1)
+    features, target = input_fn()
+
+    with monitored_session.MonitoredSession() as session:
+      res = session.run([features, target])
+      self.assertAllEqual(res[0], [0, 1])
+      self.assertAllEqual(res[1], [-32, -31])
+
+      session.run([features, target])
+      with self.assertRaises(errors.OutOfRangeError):
+        session.run([features, target])
+
+  def testNumpyInputFnWithXIsNDArray(self):
+    x = np.arange(16).reshape(4, 2, 2) * 1.0
+    y = np.arange(-48, -32).reshape(4, 2, 2)
+
+    input_fn = numpy_io.numpy_input_fn(
+        x, y, batch_size=2, shuffle=False, num_epochs=1)
+    features, target = input_fn()
+
+    with monitored_session.MonitoredSession() as session:
+      res = session.run([features, target])
+      self.assertAllEqual(res[0], [[[0, 1], [2, 3]], [[4, 5], [6, 7]]])
+      self.assertAllEqual(
+          res[1], [[[-48, -47], [-46, -45]], [[-44, -43], [-42, -41]]])
+
+      session.run([features, target])
+      with self.assertRaises(errors.OutOfRangeError):
+        session.run([features, target])
+
+  def testNumpyInputFnWithXIsArrayYIsDict(self):
+    x = np.arange(4) * 1.0
+    y = {'y1': np.arange(-32, -28)}
+
+    input_fn = numpy_io.numpy_input_fn(
+        x, y, batch_size=2, shuffle=False, num_epochs=1)
+    features_tensor, targets_tensor = input_fn()
+
+    with monitored_session.MonitoredSession() as session:
+      features, targets = session.run([features_tensor, targets_tensor])
+      self.assertEqual(len(features), 2)
+      self.assertAllEqual(features, [0, 1])
+      self.assertEqual(len(targets), 1)
+      self.assertAllEqual(targets['y1'], [-32, -31])
+
+      session.run([features_tensor, targets_tensor])
+      with self.assertRaises(errors.OutOfRangeError):
+        session.run([features_tensor, targets_tensor])
+
+  def testArrayAndDictGiveSameOutput(self):
+    a = np.arange(4) * 1.0
+    b = np.arange(32, 36)
+    x_arr = np.vstack((a, b))
+    x_dict = {'feature1': x_arr}
+    y = np.arange(-48, -40).reshape(2, 4)
+
+    input_fn_arr = numpy_io.numpy_input_fn(
+        x_arr, y, batch_size=2, shuffle=False, num_epochs=1)
+    features_arr, targets_arr = input_fn_arr()
+
+    input_fn_dict = numpy_io.numpy_input_fn(
+        x_dict, y, batch_size=2, shuffle=False, num_epochs=1)
+    features_dict, targets_dict = input_fn_dict()
+
+    with monitored_session.MonitoredSession() as session:
+      res_arr, res_dict = session.run([
+          (features_arr, targets_arr), (features_dict, targets_dict)])
+
+      self.assertAllEqual(res_arr[0], res_dict[0]['feature1'])
+      self.assertAllEqual(res_arr[1], res_dict[1])
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/estimator/inputs/pandas_io.py b/tensorflow/python/estimator/inputs/pandas_io.py
index 90d6145377d8f931b94793f8a912f77f1620f16e..bd06843021f47f81fc0c22d0fcee43530dc10098 100644
--- a/tensorflow/python/estimator/inputs/pandas_io.py
+++ b/tensorflow/python/estimator/inputs/pandas_io.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 import numpy as np
 from tensorflow.python.estimator.inputs.queues import feeding_functions
+from tensorflow.python.util.tf_export import tf_export
 
 try:
   # pylint: disable=g-import-not-at-top
@@ -34,6 +35,7 @@ except ImportError:
   HAS_PANDAS = False
 
 
+@tf_export('estimator.inputs.pandas_input_fn')
 def pandas_input_fn(x,
                     y=None,
                     batch_size=128,
diff --git a/tensorflow/python/estimator/inputs/queues/feeding_functions.py b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
index 75c0e61d47b37110b14aa57f6a185cab822a70bb..8e5d8141a1a15d8cb28aefc0f24c02495337245d 100644
--- a/tensorflow/python/estimator/inputs/queues/feeding_functions.py
+++ b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
@@ -47,10 +47,9 @@ except ImportError:
 
 
 def _fill_array(arr, seq, fillvalue=0):
-  """
-  Recursively fills padded arr with elements from seq.
-  If length of seq is less than arr padded length, fillvalue used.
+  """Recursively fills padded arr with elements from seq.
 
+  If length of seq is less than arr padded length, fillvalue used.
   Args:
     arr: Padded tensor of shape [batch_size, ..., max_padded_dim_len].
     seq: Non-padded list of data sampels of shape
@@ -84,28 +83,30 @@ def _pad_if_needed(batch_key_item, fillvalue=0):
   Raises:
     ValueError if data samples have different shapes (except last padded dim).
   """
-  shapes = [seq.shape[:-1] if len(seq.shape) > 0 else -1
-            for seq in batch_key_item]
+  shapes = [
+      seq.shape[:-1] if len(seq.shape) > 0 else -1 for seq in batch_key_item
+  ]
   if not all(shapes[0] == x for x in shapes):
     raise ValueError("Array shapes must match.")
 
-  last_length = [seq.shape[-1] if len(seq.shape) > 0 else 0
-                 for seq in batch_key_item]
+  last_length = [
+      seq.shape[-1] if len(seq.shape) > 0 else 0 for seq in batch_key_item
+  ]
   if all([x == last_length[0] for x in last_length]):
     return batch_key_item
 
   batch_size = len(batch_key_item)
   max_sequence_length = max(last_length)
   result_batch = np.zeros(
-    shape=[batch_size] + list(shapes[0]) + [max_sequence_length],
-    dtype=batch_key_item[0].dtype)
+      shape=[batch_size] + list(shapes[0]) + [max_sequence_length],
+      dtype=batch_key_item[0].dtype)
   _fill_array(result_batch, batch_key_item, fillvalue)
   return result_batch
 
 
-def _get_integer_indices_for_next_batch(
-    batch_indices_start, batch_size, epoch_end, array_length,
-    current_epoch, total_epochs):
+def _get_integer_indices_for_next_batch(batch_indices_start, batch_size,
+                                        epoch_end, array_length, current_epoch,
+                                        total_epochs):
   """Returns the integer indices for next batch.
 
   If total epochs is not None and current epoch is the final epoch, the end
@@ -135,8 +136,9 @@ def _get_integer_indices_for_next_batch(
                                  "Already emitted %s epochs." % current_epoch)
 
   batch_indices_end = batch_indices_start + batch_size
-  batch_indices = [j % array_length for j in
-                   range(batch_indices_start, batch_indices_end)]
+  batch_indices = [
+      j % array_length for j in range(batch_indices_start, batch_indices_end)
+  ]
   epoch_end_indices = [i for i, x in enumerate(batch_indices) if x == epoch_end]
   current_epoch += len(epoch_end_indices)
 
@@ -320,16 +322,20 @@ class _GeneratorFeedFn(object):
           raise KeyError("key mismatch between dicts emitted by GenFun "
                          "Expected {} keys; got {}".format(
                              self._keys, data_row.keys()))
-        list_dict.setdefault(self._col_placeholders[index],
-                             list()).append(data_row[key])
+        list_dict.setdefault(self._col_placeholders[index], list()).append(
+            data_row[key])
         list_dict_size += 1
 
     if self._pad_value is not None:
-      feed_dict = {key: np.asarray(_pad_if_needed(item, self._pad_value))
-                   for key, item in list(list_dict.items())}
+      feed_dict = {
+          key: np.asarray(_pad_if_needed(item, self._pad_value))
+          for key, item in list(list_dict.items())
+      }
     else:
-      feed_dict = {key: np.asarray(item)
-                   for key, item in list(list_dict.items())}
+      feed_dict = {
+          key: np.asarray(item)
+          for key, item in list(list_dict.items())
+      }
     return feed_dict
 
 
@@ -382,9 +388,8 @@ def _enqueue_data(data,
       queue_shapes = [(), data.shape[1:]]
       get_feed_fn = _ArrayFeedFn
     elif isinstance(data, collections.OrderedDict):
-      types = [dtypes.int64] + [
-          dtypes.as_dtype(col.dtype) for col in data.values()
-      ]
+      types = [dtypes.int64
+              ] + [dtypes.as_dtype(col.dtype) for col in data.values()]
       queue_shapes = [()] + [col.shape[1:] for col in data.values()]
       get_feed_fn = _OrderedDictNumpyFeedFn
     elif isinstance(data, tp.FunctionType):
@@ -447,11 +452,11 @@ def _enqueue_data(data,
           seed=seed)
     elif pad_data:
       min_after_dequeue = 0  # just for the summary text
-      queue_shapes = list(map(
-        lambda x: tuple(list(x[:-1]) + [None]) if len(x) > 0 else x,
-        queue_shapes))
+      queue_shapes = list(
+          map(lambda x: tuple(list(x[:-1]) + [None]) if len(x) > 0 else x,
+              queue_shapes))
       queue = data_flow_ops.PaddingFIFOQueue(
-        capacity, dtypes=types, shapes=queue_shapes)
+          capacity, dtypes=types, shapes=queue_shapes)
     else:
       min_after_dequeue = 0  # just for the summary text
       queue = data_flow_ops.FIFOQueue(
@@ -470,31 +475,35 @@ def _enqueue_data(data,
 
       if not pad_data:
         feed_fns.append(
-          get_feed_fn(
-              placeholders,
-              data,
-              enqueue_size,
-              random_start=shuffle,
-              seed=seed_i,
-              num_epochs=num_epochs))
+            get_feed_fn(
+                placeholders,
+                data,
+                enqueue_size,
+                random_start=shuffle,
+                seed=seed_i,
+                num_epochs=num_epochs))
       else:
         feed_fns.append(
-          get_feed_fn(
-              placeholders,
-              data,
-              enqueue_size,
-              random_start=shuffle,
-              seed=seed_i,
-              num_epochs=num_epochs,
-              pad_value=pad_value))
+            get_feed_fn(
+                placeholders,
+                data,
+                enqueue_size,
+                random_start=shuffle,
+                seed=seed_i,
+                num_epochs=num_epochs,
+                pad_value=pad_value))
 
     runner = fqr._FeedingQueueRunner(  # pylint: disable=protected-access
-        queue=queue, enqueue_ops=enqueue_ops, feed_fns=feed_fns)
+        queue=queue,
+        enqueue_ops=enqueue_ops,
+        feed_fns=feed_fns)
     queue_runner.add_queue_runner(runner)
 
-    full = (math_ops.cast(
-        math_ops.maximum(0, queue.size() - min_after_dequeue),
-        dtypes.float32) * (1. / (capacity - min_after_dequeue)))
+    full = (
+        math_ops.cast(
+            math_ops.maximum(0,
+                             queue.size() - min_after_dequeue), dtypes.float32)
+        * (1. / (capacity - min_after_dequeue)))
     # Note that name contains a '/' at the end so we intentionally do not place
     # a '/' after %s below.
     summary_name = ("queue/%sfraction_over_%d_of_%d_full" %
diff --git a/tensorflow/python/estimator/model_fn.py b/tensorflow/python/estimator/model_fn.py
index da202408c3680b397994620e221fa4937d7c65e4..8111ab564c017175b3f7bc1020d850db74587958 100644
--- a/tensorflow/python/estimator/model_fn.py
+++ b/tensorflow/python/estimator/model_fn.py
@@ -31,8 +31,10 @@ from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.training import monitored_session
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('estimator.ModeKeys')
 class ModeKeys(object):
   """Standard names for model modes.
 
@@ -52,11 +54,12 @@ LOSS_METRIC_KEY = 'loss'
 AVERAGE_LOSS_METRIC_KEY = 'average_loss'
 
 
+@tf_export('estimator.EstimatorSpec')
 class EstimatorSpec(
     collections.namedtuple('EstimatorSpec', [
         'mode', 'predictions', 'loss', 'train_op', 'eval_metric_ops',
         'export_outputs', 'training_chief_hooks', 'training_hooks', 'scaffold',
-        'evaluation_hooks'
+        'evaluation_hooks', 'prediction_hooks'
     ])):
   """Ops and objects returned from a `model_fn` and passed to an `Estimator`.
 
@@ -73,7 +76,8 @@ class EstimatorSpec(
               training_chief_hooks=None,
               training_hooks=None,
               scaffold=None,
-              evaluation_hooks=None):
+              evaluation_hooks=None,
+              prediction_hooks=None):
     """Creates a validated `EstimatorSpec` instance.
 
     Depending on the value of `mode`, different arguments are required. Namely
@@ -154,6 +158,8 @@ class EstimatorSpec(
         initialization, saver, and more to be used in training.
       evaluation_hooks: Iterable of `tf.train.SessionRunHook` objects to
         run during evaluation.
+      prediction_hooks: Iterable of `tf.train.SessionRunHook` objects to
+        run during predictions.
 
     Returns:
       A validated `EstimatorSpec` object.
@@ -282,7 +288,10 @@ class EstimatorSpec(
     training_chief_hooks = tuple(training_chief_hooks or [])
     training_hooks = tuple(training_hooks or [])
     evaluation_hooks = tuple(evaluation_hooks or [])
-    for hook in training_hooks + training_chief_hooks + evaluation_hooks:
+    prediction_hooks = tuple(prediction_hooks or [])
+
+    for hook in (training_hooks + training_chief_hooks + evaluation_hooks +
+                 prediction_hooks):
       if not isinstance(hook, session_run_hook.SessionRunHook):
         raise TypeError(
             'All hooks must be SessionRunHook instances, given: {}'.format(
@@ -305,7 +314,8 @@ class EstimatorSpec(
         training_chief_hooks=training_chief_hooks,
         training_hooks=training_hooks,
         scaffold=scaffold,
-        evaluation_hooks=evaluation_hooks)
+        evaluation_hooks=evaluation_hooks,
+        prediction_hooks=prediction_hooks)
 
   def _replace(self, **kwds):
     """Return a new EstimatorSpec replacing specified fields with new values."""
diff --git a/tensorflow/python/estimator/model_fn_test.py b/tensorflow/python/estimator/model_fn_test.py
index d67c4b716161816d941eef94a4b9aeb0643de55e..b7eeeb437cb4a624cdee552be3032364b18a8290 100644
--- a/tensorflow/python/estimator/model_fn_test.py
+++ b/tensorflow/python/estimator/model_fn_test.py
@@ -72,7 +72,8 @@ class EstimatorSpecTrainTest(test.TestCase):
           training_chief_hooks=[_FakeHook()],
           training_hooks=[_FakeHook()],
           scaffold=monitored_session.Scaffold(),
-          evaluation_hooks=[_FakeHook()])
+          evaluation_hooks=[_FakeHook()],
+          prediction_hooks=[_FakeHook()])
 
   def testLossNumber(self):
     """Tests that error is raised when loss is a number (not Tensor)."""
@@ -465,7 +466,17 @@ class EstimatorSpecInferTest(test.TestCase):
           training_chief_hooks=[_FakeHook()],
           training_hooks=[_FakeHook()],
           scaffold=monitored_session.Scaffold(),
-          evaluation_hooks=[_FakeHook()])
+          evaluation_hooks=[_FakeHook()],
+          prediction_hooks=[_FakeHook()])
+
+  def testPredictionHookInvalid(self):
+    with ops.Graph().as_default(), self.test_session():
+      with self.assertRaisesRegexp(
+          TypeError, 'All hooks must be SessionRunHook instances'):
+        model_fn.EstimatorSpec(
+            mode=model_fn.ModeKeys.PREDICT,
+            predictions=constant_op.constant(1.),
+            prediction_hooks=[_InvalidHook()])
 
   def testPredictionsMissing(self):
     with ops.Graph().as_default(), self.test_session():
diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index d71964d2ec8e8ce21934428c3fff88f65b2751da..3e021242c4cc914990c6b38736b8f725213b5b7e 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -27,6 +27,8 @@ import six
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
+from tensorflow.python.util import compat_internal
+from tensorflow.python.util.tf_export import tf_export
 
 
 _USE_DEFAULT = object()
@@ -54,35 +56,68 @@ _TASK_TYPE_KEY = 'type'
 _TASK_ID_KEY = 'index'
 _CLUSTER_KEY = 'cluster'
 _SERVICE_KEY = 'service'
+_SESSION_MASTER_KEY = 'session_master'
+_EVAL_SESSION_MASTER_KEY = 'eval_session_master'
+_MODEL_DIR_KEY = 'model_dir'
 _LOCAL_MASTER = ''
 _GRPC_SCHEME = 'grpc://'
 
 
-def _get_master(cluster_spec, task_type, task_id):
-  """Returns the appropriate string for the TensorFlow master."""
+def _get_session_master(cluster_spec, task_type, task_id, tf_config):
+  """Returns the appropriate address for TensorFlow master.
+
+  The order of precedence to deteremine the TF session master is as follows:
+  1. If `tf_session_master` is set in TF_CONFIG environment variable, takes it.
+  2. If the cluster has only one node, returns empty string ''.
+  3. Returns the grpc address according to the task type and id in the cluster.
+     This is between-graph replication.
+
+  Note: task_type and task_id must be validated. Typically, validated using
+  `_validate_task_type_and_task_id`.
+
+  Args:
+    cluster_spec: A `ClusterSpec` instance.
+    task_type: String. Task type for current node.
+    task_id: Int. Task id for current node.
+    tf_config: Dict. Python dict for the TF_CONFIG environment variable.
+
+  Raises:
+    RuntimeError: If `cluster_spec` is not set.
+
+  """
+  if _SESSION_MASTER_KEY in tf_config:
+    return tf_config[_SESSION_MASTER_KEY]
+
   if not cluster_spec:
-    raise RuntimeError(
-        'Internal error: `_get_master` does not expect empty cluster_spec.')
+    raise RuntimeError('Internal error: `_get_session_master` '
+                       'does not expect empty cluster_spec.')
 
   jobs = cluster_spec.jobs
+
+  # If there is only one node in the cluster, do things locally by setting
+  # master to ''.  If a service or user sets TF_CONFIG with a single node, it's
+  # more performant to use a direct master rather than an RPC service.
+  if len(jobs) == 1 and len(cluster_spec.job_tasks(jobs[0])) == 1:
+    return _LOCAL_MASTER
+
   # Lookup the master in cluster_spec using task_type and task_id,
   # if possible.
-  if task_type not in jobs:
-    raise ValueError(
-        '%s is not a valid task_type in the cluster_spec:\n'
-        '%s\n\n'
-        'Note that these values may be coming from the TF_CONFIG environment '
-        'variable.' % (task_type, cluster_spec))
   addresses = cluster_spec.job_tasks(task_type)
-  if not 0 <= task_id < len(addresses):
-    raise ValueError(
-        '%d is not a valid task_id for task_type %s in the cluster_spec:\n'
-        '%s\n\n'
-        'Note that these values may be coming from the TF_CONFIG environment '
-        'variable.' % (task_id, task_type, cluster_spec))
   return _GRPC_SCHEME + addresses[task_id]
 
 
+def _get_eval_session_master(task_type, tf_config):
+  """Returns the appropriate address for TensorFlow evaluation master."""
+  if task_type == TaskType.EVALUATOR:
+    return tf_config.get(_EVAL_SESSION_MASTER_KEY, _LOCAL_MASTER)
+
+  if _EVAL_SESSION_MASTER_KEY in tf_config:
+    raise ValueError('Key ({}) should not be set for task type other than {}. '
+                     'Task type: {}'.format(_EVAL_SESSION_MASTER_KEY,
+                                            TaskType.EVALUATOR, task_type))
+  return _LOCAL_MASTER
+
+
 def _count_ps(cluster_spec):
   """Counts the number of parameter servers in cluster_spec."""
   if not cluster_spec:
@@ -140,9 +175,56 @@ def _validate_task_type_and_task_id(cluster_spec, task_env, chief_task_type):
   # cluster spec, which will be checked later (when retrieving the `master`)
   if task_id < 0:
     raise ValueError('Task index must be non-negative number.')
+
+  # Evaluator is not part of the training cluster.
+  if task_type == TaskType.EVALUATOR:
+    return task_type, task_id
+
+  if task_type not in cluster_spec.jobs:
+    raise ValueError(
+        '%s is not a valid task_type in the cluster_spec:\n'
+        '%s\n\n'
+        'Note that these values may be coming from the TF_CONFIG environment '
+        'variable.' % (task_type, cluster_spec))
+  addresses = cluster_spec.job_tasks(task_type)
+  if not 0 <= task_id < len(addresses):
+    raise ValueError(
+        '%d is not a valid task_id for task_type %s in the cluster_spec:\n'
+        '%s\n\n'
+        'Note that these values may be coming from the TF_CONFIG environment '
+        'variable.' % (task_id, task_type, cluster_spec))
+
   return task_type, task_id
 
 
+def _get_global_id_in_cluster(
+    cluster_spec, task_type, task_id, chief_task_type):
+  """Returns the global id in cluster."""
+  # Note: This is implementation details, which user should not rely on.
+  # The first id is 0, which is always for the `chief` node. All other nodes,
+  # except `ps`, are ordered alphabetical based on task type (alphabetically)
+  # and task id (ascendingly). `ps` are ordered last.
+
+  # Sort task names in cluster
+  task_type_ordered_list = [chief_task_type]
+  task_type_ordered_list.extend([
+      t for t in sorted(cluster_spec.jobs)
+      if t != chief_task_type and t != TaskType.PS
+  ])
+  if TaskType.PS in cluster_spec.jobs:
+    task_type_ordered_list.append(TaskType.PS)
+
+  next_global_id = 0
+  for t in task_type_ordered_list:
+    if t == task_type:
+      return next_global_id + task_id
+    next_global_id += len(cluster_spec.job_tasks(t))
+
+  # This should never happen.
+  raise RuntimeError('Internal Error: `task_type` ({}) is not in '
+                     'cluster_spec ({}).'.format(task_type, cluster_spec))
+
+
 def _validate_save_ckpt_with_replaced_keys(new_copy, replaced_keys):
   """Validates the save ckpt properties."""
   # Ensure one (and only one) of save_steps and save_secs is not None.
@@ -205,6 +287,7 @@ class TaskType(object):
   EVALUATOR = 'evaluator'
 
 
+@tf_export('estimator.RunConfig')
 class RunConfig(object):
   """This class specifies the configurations for an `Estimator` run."""
 
@@ -319,7 +402,8 @@ class RunConfig(object):
 
     Args:
       model_dir: directory where model parameters, graph, etc are saved. If
-        `None`, will use a default value set by the Estimator.
+        `PathLike` object, the path will be resolved. If `None`, will use a
+        default value set by the Estimator.
       tf_random_seed: Random seed for TensorFlow initializers.
         Setting this value allows consistency between reruns.
       save_summary_steps: Save summaries every this many steps.
@@ -358,6 +442,13 @@ class RunConfig(object):
           save_checkpoints_secs is not None):
       raise ValueError(_SAVE_CKPT_ERR)
 
+    tf_config = json.loads(os.environ.get(_TF_CONFIG_ENV, '{}'))
+    if tf_config:
+      logging.info('TF_CONFIG environment variable: %s', tf_config)
+
+    model_dir = _get_model_dir(tf_config,
+                               compat_internal.path_to_str(model_dir))
+
     RunConfig._replace(
         self,
         allowed_properties_list=_DEFAULT_REPLACEABLE_LIST,
@@ -371,14 +462,10 @@ class RunConfig(object):
         keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
         log_step_count_steps=log_step_count_steps)
 
-    self._init_distributed_setting_from_environment_var()
+    self._init_distributed_setting_from_environment_var(tf_config)
 
-  def _init_distributed_setting_from_environment_var(self):
-    """Initialize distributed properties based on environment variable."""
-
-    tf_config = json.loads(os.environ.get(_TF_CONFIG_ENV) or '{}')
-    if tf_config:
-      logging.info('TF_CONFIG environment variable: %s', tf_config)
+  def _init_distributed_setting_from_environment_var(self, tf_config):
+    """Initialize distributed properties based on `tf_config`."""
 
     self._service = _validate_service(tf_config.get(_SERVICE_KEY))
     self._cluster_spec = server_lib.ClusterSpec(tf_config.get(_CLUSTER_KEY, {}))
@@ -393,24 +480,34 @@ class RunConfig(object):
       self._task_type, self._task_id = _validate_task_type_and_task_id(
           self._cluster_spec, task_env, TaskType.CHIEF)
 
+      self._evaluation_master = _get_eval_session_master(
+          self._task_type, tf_config)
+
       if self._task_type != TaskType.EVALUATOR:
-        self._master = _get_master(
-            self._cluster_spec, self._task_type, self._task_id)
+        self._master = _get_session_master(self._cluster_spec, self._task_type,
+                                           self._task_id, tf_config)
         self._num_ps_replicas = _count_ps(self._cluster_spec)
         self._num_worker_replicas = _count_worker(
             self._cluster_spec, chief_task_type=TaskType.CHIEF)
+        self._global_id_in_cluster = _get_global_id_in_cluster(
+            self._cluster_spec,
+            self._task_type,
+            self._task_id,
+            chief_task_type=TaskType.CHIEF)
       else:
         # Evaluator is not part of the training cluster.
         self._cluster_spec = server_lib.ClusterSpec({})
         self._master = _LOCAL_MASTER
         self._num_ps_replicas = 0
         self._num_worker_replicas = 0
+        self._global_id_in_cluster = None  # undefined
 
       self._is_chief = self._task_type == TaskType.CHIEF
     else:
       # Local mode.
       self._task_type = task_env.get(_TASK_TYPE_KEY, TaskType.WORKER)
       self._task_id = int(task_env.get(_TASK_ID_KEY, 0))
+      self._global_id_in_cluster = 0
 
       if self._task_type != TaskType.WORKER:
         raise ValueError(
@@ -419,7 +516,9 @@ class RunConfig(object):
         raise ValueError(
             'If "cluster" is not set in TF_CONFIG, task index must be 0.')
 
-      self._master = ''
+      self._master = tf_config.get(_SESSION_MASTER_KEY, _LOCAL_MASTER)
+      self._evaluation_master = tf_config.get(_EVAL_SESSION_MASTER_KEY,
+                                              _LOCAL_MASTER)
       self._is_chief = True
       self._num_ps_replicas = 0
       self._num_worker_replicas = 1
@@ -443,8 +542,16 @@ class RunConfig(object):
       raise ValueError('If `master` node exists in `cluster`, task_type '
                        '`evaluator` is not supported.')
 
-    self._master = _get_master(
-        self._cluster_spec, self._task_type, self._task_id)
+    self._global_id_in_cluster = _get_global_id_in_cluster(
+        self._cluster_spec,
+        self._task_type,
+        self._task_id,
+        chief_task_type=TaskType.MASTER)
+
+    self._master = _get_session_master(self._cluster_spec, self._task_type,
+                                       self._task_id, tf_config)
+    self._evaluation_master = _get_eval_session_master(self._task_type,
+                                                       tf_config)
     self._num_ps_replicas = _count_ps(self._cluster_spec)
     self._num_worker_replicas = _count_worker(
         self._cluster_spec, chief_task_type=TaskType.MASTER)
@@ -457,7 +564,7 @@ class RunConfig(object):
 
   @property
   def evaluation_master(self):
-    return ''
+    return self._evaluation_master
 
   @property
   def is_chief(self):
@@ -479,6 +586,46 @@ class RunConfig(object):
   def task_id(self):
     return self._task_id
 
+  @property
+  def global_id_in_cluster(self):
+    """The global id in the training cluster.
+
+    All global ids in the training cluster are assigned from an increasing
+    sequence of consecutive integers. The first id is 0.
+
+    Note: Task id (the property field `task_id`) is tracking the index of the
+    node among all nodes with the SAME task type. For example, given the cluster
+    definition as follows:
+
+    ```
+      cluster = {'chief': ['host0:2222'],
+                 'ps': ['host1:2222', 'host2:2222'],
+                 'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
+    ```
+
+    Nodes with task type `worker` can have id 0, 1, 2.  Nodes with task type
+    `ps` can have id, 0, 1. So, `task_id` is not unique, but the pair
+    (`task_type`, `task_id`) can uniquely determine a node in the cluster.
+
+    Global id, i.e., this field, is tracking the index of the node among ALL
+    nodes in the cluster. It is uniquely assigned.  For example, for the cluster
+    spec given above, the global ids are assigned as:
+    ```
+      task_type  | task_id  |  global_id
+      --------------------------------
+      chief      | 0        |  0
+      worker     | 0        |  1
+      worker     | 1        |  2
+      worker     | 2        |  3
+      ps         | 0        |  4
+      ps         | 1        |  5
+    ```
+
+    Returns:
+      An integer id.
+    """
+    return self._global_id_in_cluster
+
   @property
   def task_type(self):
     return self._task_type
@@ -593,3 +740,31 @@ class RunConfig(object):
     _validate_save_ckpt_with_replaced_keys(config, kwargs.keys())
     _validate_properties(config)
     return config
+
+
+def _get_model_dir(tf_config, model_dir):
+  """Returns `model_dir` based user provided `tf_config` or `model_dir`."""
+  # pylint: disable=g-explicit-bool-comparison
+
+  # Empty string is treated as False in Python condition check, which triggers
+  # some confusing error messages. For example, 'a or b' returns None if a is ''
+  # and b is None. `None` is allowed for model_dir but '' is not allowed. Here,
+  # explicitly check empty string to provide clear error message.
+  if model_dir == '':
+    raise ValueError('model_dir should be non-empty.')
+
+  model_dir_in_tf_config = tf_config.get('model_dir')
+  if model_dir_in_tf_config == '':
+    raise ValueError('model_dir in TF_CONFIG should be non-empty.')
+
+  if model_dir_in_tf_config:
+    if model_dir and model_dir_in_tf_config != model_dir:
+      raise ValueError(
+          '`model_dir` provided in RunConfig construct, if set, '
+          'must have the same value as the model_dir in TF_CONFIG. '
+          'model_dir: {}\nTF_CONFIG["model_dir"]: {}.\n'.format(
+              model_dir, model_dir_in_tf_config))
+
+    logging.info('Using model_dir in TF_CONFIG: %s', model_dir_in_tf_config)
+
+  return model_dir or model_dir_in_tf_config
diff --git a/tensorflow/python/estimator/run_config_test.py b/tensorflow/python/estimator/run_config_test.py
index ecc850d5405837e8bf803b9a7c8c156ff19b7a90..a3eef4c53fd90a1ce69f3067d0b5c15909f43cec 100644
--- a/tensorflow/python/estimator/run_config_test.py
+++ b/tensorflow/python/estimator/run_config_test.py
@@ -31,6 +31,10 @@ _SAVE_CKPT_ERR = (
     '`save_checkpoints_steps` and `save_checkpoints_secs` cannot be both set.'
 )
 _MODEL_DIR_ERR = 'model_dir should be non-empty'
+_MODEL_DIR_TF_CONFIG_ERR = 'model_dir in TF_CONFIG should be non-empty'
+_MODEL_DIR_MISMATCH_ERR = (
+    '`model_dir` provided in RunConfig construct, if set, '
+    'must have the same value as the model_dir in TF_CONFIG. ')
 _SAVE_SUMMARY_STEPS_ERR = 'save_summary_steps should be >= 0'
 _SAVE_CKPT_STEPS_ERR = 'save_checkpoints_steps should be >= 0'
 _SAVE_CKPT_SECS_ERR = 'save_checkpoints_secs should be >= 0'
@@ -40,6 +44,8 @@ _KEEP_CKPT_HOURS_ERR = 'keep_checkpoint_every_n_hours should be > 0'
 _TF_RANDOM_SEED_ERR = 'tf_random_seed must be integer'
 _ONE_CHIEF_ERR = 'The "cluster" in TF_CONFIG must have only one "chief" node.'
 _ONE_MASTER_ERR = 'The "cluster" in TF_CONFIG must have only one "master" node.'
+_INVALID_TASK_TYPE_FOR_EVAL_MASTER = (
+    'Key.*eval.*master.*should not be set for task type other than')
 _MISSING_CHIEF_ERR = 'If "cluster" is set .* it must have one "chief" node'
 _MISSING_TASK_TYPE_ERR = 'If "cluster" is set .* task type must be set'
 _MISSING_TASK_ID_ERR = 'If "cluster" is set .* task index must be set'
@@ -256,8 +262,9 @@ class RunConfigDistributedSettingTest(test.TestCase):
             'index': 0
         }
     }
+    run_config = _create_run_config_with_cluster_spec(tf_config)
     self._assert_distributed_properties(
-        run_config=_create_run_config_with_cluster_spec(tf_config),
+        run_config=run_config,
         expected_cluster_spec={},
         expected_task_type=run_config_lib.TaskType.WORKER,
         expected_task_id=0,
@@ -266,6 +273,33 @@ class RunConfigDistributedSettingTest(test.TestCase):
         expected_is_chief=True,
         expected_num_worker_replicas=1,
         expected_num_ps_replicas=0)
+    self.assertEqual(0, run_config.global_id_in_cluster)
+
+  def test_session_master_for_local(self):
+    tf_config = {'session_master': '_my_master'}
+    self._assert_distributed_properties(
+        run_config=_create_run_config_with_cluster_spec(tf_config),
+        expected_cluster_spec={},
+        expected_task_type=run_config_lib.TaskType.WORKER,
+        expected_task_id=0,
+        expected_master='_my_master',
+        expected_evaluation_master='',
+        expected_is_chief=True,
+        expected_num_worker_replicas=1,
+        expected_num_ps_replicas=0)
+
+  def test_eval_session_master_for_local(self):
+    tf_config = {'eval_session_master': '_my_eval_master'}
+    self._assert_distributed_properties(
+        run_config=_create_run_config_with_cluster_spec(tf_config),
+        expected_cluster_spec={},
+        expected_task_type=run_config_lib.TaskType.WORKER,
+        expected_task_id=0,
+        expected_master='',
+        expected_evaluation_master='_my_eval_master',
+        expected_is_chief=True,
+        expected_num_worker_replicas=1,
+        expected_num_ps_replicas=0)
 
   def test_invalid_task_type_for_local(self):
     tf_config = {
@@ -310,6 +344,50 @@ class RunConfigDistributedSettingTest(test.TestCase):
         expected_num_worker_replicas=4,
         expected_num_ps_replicas=2)
 
+  def test_session_master_from_single_node_tf_config(self):
+    tf_config = {
+        'cluster': {
+            run_config_lib.TaskType.CHIEF: ['host0:0'],
+        },
+        'task': {
+            'type': run_config_lib.TaskType.CHIEF,
+            'index': 0
+        },
+        'session_master': '_my_master'
+    }
+    self.assertEqual('_my_master',
+                     _create_run_config_with_cluster_spec(tf_config).master)
+
+  def test_session_master_from_multiple_nodes_tf_config(self):
+    tf_config = {
+        'cluster': {
+            run_config_lib.TaskType.CHIEF: ['host0:0'],
+            run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
+        },
+        'task': {
+            'type': run_config_lib.TaskType.CHIEF,
+            'index': 0
+        },
+        'session_master': '_my_master'
+    }
+    self.assertEqual('_my_master',
+                     _create_run_config_with_cluster_spec(tf_config).master)
+
+  def test_fail_with_eval_session_master_for_non_evaluator(self):
+    tf_config = {
+        'cluster': {
+            run_config_lib.TaskType.CHIEF: ['host0:0'],
+        },
+        'task': {
+            'type': run_config_lib.TaskType.CHIEF,
+            'index': 0
+        },
+        'eval_session_master': 'grpc://123',
+    }
+    with self.assertRaisesRegexp(
+        ValueError, _INVALID_TASK_TYPE_FOR_EVAL_MASTER):
+      _create_run_config_with_cluster_spec(tf_config)
+
   def test_fail_with_multiple_chief_nodes(self):
     tf_config = {
         'cluster': {
@@ -344,7 +422,7 @@ class RunConfigDistributedSettingTest(test.TestCase):
         expected_cluster_spec=tf_config['cluster'],
         expected_task_type=run_config_lib.TaskType.CHIEF,
         expected_task_id=0,
-        expected_master='grpc://host0:0',
+        expected_master='',
         expected_evaluation_master='',
         expected_is_chief=True,
         expected_num_worker_replicas=1,
@@ -468,8 +546,9 @@ class RunConfigDistributedSettingTest(test.TestCase):
             'index': 12
         }
     }
+    run_config = _create_run_config_with_cluster_spec(tf_config)
     self._assert_distributed_properties(
-        run_config=_create_run_config_with_cluster_spec(tf_config),
+        run_config=run_config,
         expected_cluster_spec={},
         expected_task_type=run_config_lib.TaskType.EVALUATOR,
         expected_task_id=12,
@@ -478,6 +557,23 @@ class RunConfigDistributedSettingTest(test.TestCase):
         expected_is_chief=False,  # evaluator is never chief.
         expected_num_worker_replicas=0,  # evaluator is not in training cluster.
         expected_num_ps_replicas=0)
+    self.assertIsNone(run_config.global_id_in_cluster)
+
+  def test_eval_master_for_evaluator(self):
+    tf_config = {
+        'cluster': {
+            run_config_lib.TaskType.CHIEF: ['host0:0'],
+            run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
+            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5']
+        },
+        'task': {
+            'type': run_config_lib.TaskType.EVALUATOR,
+            'index': 12
+        },
+        'eval_session_master': 'grpc://123',
+    }
+    run_config = _create_run_config_with_cluster_spec(tf_config)
+    self.assertEqual('grpc://123', run_config.evaluation_master)
 
   def test_fail_with_invalid_task_index_for_evaluator(self):
     tf_config = {
@@ -492,6 +588,71 @@ class RunConfigDistributedSettingTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, _NEGATIVE_TASK_INDEX_ERR):
       _create_run_config_with_cluster_spec(tf_config)
 
+  def test_global_id_in_cluster_for_chief(self):
+    tf_config = {
+        'cluster': {
+            run_config_lib.TaskType.CHIEF: ['host0:0'],
+            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5'],
+            run_config_lib.TaskType.PS: ['host6:3', 'host7:4', 'host8:5']
+        },
+        'task': {
+            'type': run_config_lib.TaskType.CHIEF,
+            'index': 0,
+        },
+    }
+    run_config = _create_run_config_with_cluster_spec(tf_config)
+    self.assertEqual(0, run_config.global_id_in_cluster)
+
+  def test_global_id_in_cluster_for_worker(self):
+    tf_config = {
+        'cluster': {
+            run_config_lib.TaskType.CHIEF: ['host0:0'],
+            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5'],
+            run_config_lib.TaskType.PS: ['host6:3', 'host7:4', 'host8:5']
+        },
+        'task': {
+            'type': run_config_lib.TaskType.WORKER,
+            'index': 2,
+        },
+    }
+    run_config = _create_run_config_with_cluster_spec(tf_config)
+    self.assertEqual(3, run_config.global_id_in_cluster)
+
+  def test_global_id_in_cluster_for_ps(self):
+    tf_config = {
+        'cluster': {
+            run_config_lib.TaskType.CHIEF: ['host0:0'],
+            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5'],
+            run_config_lib.TaskType.PS: ['host6:3', 'host7:4', 'host8:5']
+        },
+        'task': {
+            'type': run_config_lib.TaskType.PS,
+            'index': 1,
+        },
+    }
+    run_config = _create_run_config_with_cluster_spec(tf_config)
+    self.assertEqual(5, run_config.global_id_in_cluster)
+
+  def test_global_id_in_cluster_for_multipe_worker_types(self):
+    tf_config = {
+        'cluster': {
+            run_config_lib.TaskType.CHIEF: ['host0:0'],
+            'worker': ['host3:3', 'host4:4', 'host5:5'],
+            'other_type': ['host3:1', 'host4:2'],
+            run_config_lib.TaskType.PS: ['host6:3', 'host7:4', 'host8:5']
+        },
+        'task': {
+            'type': 'other_type',
+            'index': 1,
+        },
+    }
+    # Though 'other_type' is defined after 'worker', based on alphabetical
+    # order, the task type order should be 'chief', 'other_type', 'worker',
+    # 'ps', where 'chief' and 'ps' are predefined to be the top and last in the
+    # order list.
+    run_config = _create_run_config_with_cluster_spec(tf_config)
+    self.assertEqual(2, run_config.global_id_in_cluster)
+
 
 class RunConfigDistributedSettingWithMasterTest(test.TestCase):
 
@@ -524,7 +685,7 @@ class RunConfigDistributedSettingWithMasterTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, _INVALID_TASK_TYPE_FOR_LOCAL_ERR):
       _create_run_config_with_cluster_spec(tf_config)
 
-  def test_master_tf_config(self):
+  def test_master_node(self):
     tf_config = {
         'cluster': {
             run_config_lib.TaskType.MASTER: ['host0:0'],
@@ -547,6 +708,50 @@ class RunConfigDistributedSettingWithMasterTest(test.TestCase):
         expected_num_worker_replicas=4,
         expected_num_ps_replicas=2)
 
+  def test_session_master_in_single_node_tf_config(self):
+    tf_config = {
+        'cluster': {
+            run_config_lib.TaskType.MASTER: ['host0:0'],
+        },
+        'task': {
+            'type': run_config_lib.TaskType.MASTER,
+            'index': 0
+        },
+        'session_master': '_my_master'
+    }
+    self.assertEqual('_my_master',
+                     _create_run_config_with_cluster_spec(tf_config).master)
+
+  def test_session_master_in_multiple_nodes_tf_config(self):
+    tf_config = {
+        'cluster': {
+            run_config_lib.TaskType.MASTER: ['host0:0'],
+            run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
+        },
+        'task': {
+            'type': run_config_lib.TaskType.MASTER,
+            'index': 0
+        },
+        'session_master': '_my_master'
+    }
+    self.assertEqual('_my_master',
+                     _create_run_config_with_cluster_spec(tf_config).master)
+
+  def test_fail_with_eval_session_master(self):
+    tf_config = {
+        'cluster': {
+            run_config_lib.TaskType.MASTER: ['host0:0'],
+        },
+        'task': {
+            'type': run_config_lib.TaskType.MASTER,
+            'index': 0
+        },
+        'eval_session_master': 'grpc://123',
+    }
+    with self.assertRaisesRegexp(
+        ValueError, _INVALID_TASK_TYPE_FOR_EVAL_MASTER):
+      _create_run_config_with_cluster_spec(tf_config)
+
   def test_fail_with_multiple_master_nodes(self):
     tf_config = {
         'cluster': {
@@ -572,7 +777,7 @@ class RunConfigDistributedSettingWithMasterTest(test.TestCase):
         expected_cluster_spec=tf_config['cluster'],
         expected_task_type=run_config_lib.TaskType.MASTER,
         expected_task_id=0,
-        expected_master='grpc://host0:0',
+        expected_master='',
         expected_evaluation_master='',
         expected_is_chief=True,
         expected_num_worker_replicas=1,
@@ -716,6 +921,71 @@ class RunConfigDistributedSettingWithMasterTest(test.TestCase):
                                  _INVALID_CHIEF_IN_CLUSTER_WITH_MASTER_ERR):
       _create_run_config_with_cluster_spec(tf_config)
 
+  def test_global_id_in_cluster_for_master(self):
+    tf_config = {
+        'cluster': {
+            run_config_lib.TaskType.MASTER: ['host0:0'],
+            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5'],
+            run_config_lib.TaskType.PS: ['host6:3', 'host7:4', 'host8:5']
+        },
+        'task': {
+            'type': run_config_lib.TaskType.MASTER,
+            'index': 0,
+        },
+    }
+    run_config = _create_run_config_with_cluster_spec(tf_config)
+    self.assertEqual(0, run_config.global_id_in_cluster)
+
+  def test_global_id_in_cluster_for_worker(self):
+    tf_config = {
+        'cluster': {
+            run_config_lib.TaskType.MASTER: ['host0:0'],
+            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5'],
+            run_config_lib.TaskType.PS: ['host6:3', 'host7:4', 'host8:5']
+        },
+        'task': {
+            'type': run_config_lib.TaskType.WORKER,
+            'index': 2,
+        },
+    }
+    run_config = _create_run_config_with_cluster_spec(tf_config)
+    self.assertEqual(3, run_config.global_id_in_cluster)
+
+  def test_global_id_in_cluster_for_ps(self):
+    tf_config = {
+        'cluster': {
+            run_config_lib.TaskType.MASTER: ['host0:0'],
+            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5'],
+            run_config_lib.TaskType.PS: ['host6:3', 'host7:4', 'host8:5']
+        },
+        'task': {
+            'type': run_config_lib.TaskType.PS,
+            'index': 1,
+        },
+    }
+    run_config = _create_run_config_with_cluster_spec(tf_config)
+    self.assertEqual(5, run_config.global_id_in_cluster)
+
+  def test_global_id_in_cluster_for_multipe_worker_types(self):
+    tf_config = {
+        'cluster': {
+            run_config_lib.TaskType.MASTER: ['host0:0'],
+            'worker': ['host3:3', 'host4:4', 'host5:5'],
+            'other_type': ['host3:1', 'host4:2'],
+            run_config_lib.TaskType.PS: ['host6:3', 'host7:4', 'host8:5']
+        },
+        'task': {
+            'type': 'other_type',
+            'index': 1,
+        },
+    }
+    # Though 'other_type' is defined after 'worker', based on alphabetical
+    # order, the task type order should be 'chief', 'other_type', 'worker',
+    # 'ps', where 'chief' and 'ps' are predefined to be the top and last in the
+    # order list.
+    run_config = _create_run_config_with_cluster_spec(tf_config)
+    self.assertEqual(2, run_config.global_id_in_cluster)
+
 
 class RunConfigSaveCheckpointsTest(test.TestCase):
 
@@ -793,5 +1063,45 @@ class RunConfigServiceKeyTest(test.TestCase):
       _create_run_config_with_cluster_spec(tf_config)
 
 
+class RunConfigModelDirTest(test.TestCase):
+
+  def test_default(self):
+    run_config = run_config_lib.RunConfig()
+    self.assertIsNone(run_config.model_dir)
+
+  def test_model_dir_in_constructor(self):
+    run_config = run_config_lib.RunConfig(model_dir='/tmp/123')
+    self.assertEqual('/tmp/123', run_config.model_dir)
+
+  def test_model_dir_in_tf_config(self):
+    tf_config = {
+        'model_dir': '/tmp/123',
+    }
+    run_config = _create_run_config_with_cluster_spec(tf_config)
+    self.assertEqual('/tmp/123', run_config.model_dir)
+
+  def test_model_dir_both_set_in_both_constructor_and_tf_config(self):
+    model_dir = '/tmp/123'
+    tf_config = {'model_dir': model_dir}
+    kwargs = {'model_dir': model_dir}
+    run_config = _create_run_config_with_cluster_spec(tf_config, **kwargs)
+    self.assertEqual('/tmp/123', run_config.model_dir)
+
+  def test_model_dir_different_in_both_constructor_and_tf_config(self):
+    tf_config = {'model_dir': '/tmp/123'}
+    kwargs = {'model_dir': '/tmp/456'}
+    with self.assertRaisesRegexp(ValueError, _MODEL_DIR_MISMATCH_ERR):
+      _create_run_config_with_cluster_spec(tf_config, **kwargs)
+
+  def test_fail_with_empty_string_in_constructor(self):
+    with self.assertRaisesRegexp(ValueError, _MODEL_DIR_ERR):
+      run_config_lib.RunConfig(model_dir='')
+
+  def test_fail_with_empty_string_in_tf_config(self):
+    with self.assertRaisesRegexp(ValueError, _MODEL_DIR_TF_CONFIG_ERR):
+      tf_config = {'model_dir': ''}
+      _create_run_config_with_cluster_spec(tf_config)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/estimator/training.py b/tensorflow/python/estimator/training.py
index 1131995b3ef1a832c3312d27a46d8395d62cecc7..63328dcfb55646ce2aaf8929d5517c8522c418f2 100644
--- a/tensorflow/python/estimator/training.py
+++ b/tensorflow/python/estimator/training.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Classes and functions related to train_and_evaluate."""
 
 from __future__ import absolute_import
@@ -36,20 +35,21 @@ from tensorflow.python.training import basic_session_run_hooks
 from tensorflow.python.training import server_lib
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.util import compat
-
+from tensorflow.python.util.tf_export import tf_export
 
 _MAX_DELAY_SECS = 60
 _DELAY_SECS_PER_WORKER = 5
 _TF_CONFIG_ENV = 'TF_CONFIG'
 _ENVIRONMENT_KEY = 'environment'
 _ENVIRONMENT_GOOGLE_VALUE = 'google'
+_TRAINER_JOBS = (run_config_lib.TaskType.CHIEF, run_config_lib.TaskType.MASTER,
+                 run_config_lib.TaskType.WORKER)
 
 
 def _validate_input_fn(input_fn):
   """Validates the `input_fn`."""
   if not callable(input_fn):
-    raise TypeError(
-        '`input_fn` must be callable, given: {}'.format(input_fn))
+    raise TypeError('`input_fn` must be callable, given: {}'.format(input_fn))
 
 
 def _validate_hooks(hooks):
@@ -115,6 +115,7 @@ def _is_google_env():
   return tf_config.get(_ENVIRONMENT_KEY) == _ENVIRONMENT_GOOGLE_VALUE
 
 
+@tf_export('estimator.TrainSpec')
 class TrainSpec(
     collections.namedtuple('TrainSpec', ['input_fn', 'max_steps', 'hooks'])):
   """Configuration for the "train" part for the `train_and_evaluate` call.
@@ -123,10 +124,7 @@ class TrainSpec(
   duration. Optional hooks run at various stages of training.
   """
 
-  def __new__(cls,
-              input_fn,
-              max_steps=None,
-              hooks=None):
+  def __new__(cls, input_fn, max_steps=None, hooks=None):
     """Creates a validated `TrainSpec` instance.
 
     Args:
@@ -159,16 +157,14 @@ class TrainSpec(
     hooks = _validate_hooks(hooks)
 
     return super(TrainSpec, cls).__new__(
-        cls,
-        input_fn=input_fn,
-        max_steps=max_steps,
-        hooks=hooks)
+        cls, input_fn=input_fn, max_steps=max_steps, hooks=hooks)
 
 
+@tf_export('estimator.EvalSpec')
 class EvalSpec(
     collections.namedtuple('EvalSpec', [
-        'input_fn', 'steps', 'name', 'hooks', 'exporters',
-        'start_delay_secs', 'throttle_secs'
+        'input_fn', 'steps', 'name', 'hooks', 'exporters', 'start_delay_secs',
+        'throttle_secs'
     ])):
   """Configuration for the "eval" part for the `train_and_evaluate` call.
 
@@ -253,6 +249,7 @@ class EvalSpec(
         throttle_secs=throttle_secs)
 
 
+@tf_export('estimator.train_and_evaluate')
 def train_and_evaluate(estimator, train_spec, eval_spec):
   """Train and evaluate the `estimator`.
 
@@ -415,52 +412,17 @@ def train_and_evaluate(estimator, train_spec, eval_spec):
   Raises:
     ValueError: if environment variable `TF_CONFIG` is incorrectly set.
   """
+  executor = _TrainingExecutor(
+      estimator=estimator, train_spec=train_spec, eval_spec=eval_spec)
 
-  if not isinstance(estimator, estimator_lib.Estimator):
-    raise TypeError('`estimator` must have type `tf.estimator.Estimator`, '
-                    'given {}'.format(type(estimator)))
   config = estimator.config
-
-  executor = _TrainingExecutor(estimator=estimator, train_spec=train_spec,
-                               eval_spec=eval_spec)
-
-  if (not config.cluster_spec and
-      config.task_type != run_config_lib.TaskType.EVALUATOR):
-    logging.info('Running training and evaluation locally (non-distributed).')
-    executor.run_local()
-    return
-
-  # Distributed case.
-  if not config.task_type:
-    # TODO(xiejw): Improve the error message about how to set the TF_CONFIG
-    # correctly.
-    raise ValueError(
-        '`estimator.config` must have task_type set. This usually means '
-        'TF_CONFIG environment is not set correctly.')
-
-  if config.task_type == 'local':
-    raise ValueError(
-        '`task.type` in TF_CONFIG cannot be `local`. Leaving `cluster` and '
-        '`task` properties in TF_CONFIG absent triggers train and evaluate '
-        '`Estimator` locally (non-distributed).')
-
   if (config.task_type == run_config_lib.TaskType.EVALUATOR and
       config.task_id > 0):
     raise ValueError(
         'For distributed training, there can only be one `evaluator` task '
         '(with task id 0).  Given task id {}'.format(config.task_id))
 
-  # For task type foo, call executor.run_foo.
-  available_tasks = [x for x in dir(executor) if x.startswith('run_')
-                     and x != 'run_local'
-                     and callable(getattr(executor, x))]
-  task_to_run = 'run_' + config.task_type
-  if task_to_run not in available_tasks:
-    raise ValueError(
-        'Task type {} is not supported. Supported task types are {}'.format(
-            config.task_type, [x[len('run_'):] for x in available_tasks]))
-  getattr(executor, task_to_run)()
-  return
+  executor.run()
 
 
 class _StopAtSecsHook(session_run_hook.SessionRunHook):
@@ -486,7 +448,12 @@ class _TrainingExecutor(object):
   training and evaluation based on the setting in `tf.estimator.RunConfig`.
   """
 
-  def __init__(self, estimator, train_spec, eval_spec):
+  def __init__(self,
+               estimator,
+               train_spec,
+               eval_spec,
+               train_hooks=None,
+               continuous_eval_listener=None):
     if not isinstance(estimator, estimator_lib.Estimator):
       raise TypeError('`estimator` must have type `tf.estimator.Estimator`.')
     self._estimator = estimator
@@ -499,10 +466,65 @@ class _TrainingExecutor(object):
       raise TypeError('`eval_spec` must have type `tf.estimator.EvalSpec`.')
     self._eval_spec = eval_spec
 
+    self._train_hooks = _validate_hooks(train_hooks)
+
+    if (continuous_eval_listener and
+        not isinstance(continuous_eval_listener, _ContinuousEvalListener)):
+      raise TypeError('`continuous_eval_listener` must have type '
+                      '`_ContinuousEvalListener`.')
+    self._continuous_eval_listener = (
+        continuous_eval_listener or _ContinuousEvalListener())
+
   @property
   def estimator(self):
     return self._estimator
 
+  def run(self):
+    """Executes the run_foo for task type `foo`.
+
+    `_TrainingExecutor` predefines the procedure for task type 'chief',
+    'worker', 'ps', and 'evaluator'. For task type `foo`, the corresponding
+    procedure is `run_foo'. This `run` method invoke the procedure base on the
+    `RunConfig.task_type`.
+
+    Raises:
+      ValueError: if the estimator.config is mis-configured.
+    """
+    config = self._estimator.config
+
+    if (not config.cluster_spec and
+        config.task_type != run_config_lib.TaskType.EVALUATOR):
+      logging.info('Running training and evaluation locally (non-distributed).')
+      self.run_local()
+      return
+
+    # Distributed case.
+    if not config.task_type:
+      # TODO(xiejw): Improve the error message about how to set the TF_CONFIG
+      # correctly.
+      raise ValueError(
+          '`estimator.config` must have task_type set. This usually means '
+          'TF_CONFIG environment is not set correctly.')
+
+    if config.task_type == 'local':
+      raise ValueError(
+          '`task.type` in TF_CONFIG cannot be `local`. Leaving `cluster` and '
+          '`task` properties in TF_CONFIG absent triggers train and evaluate '
+          '`Estimator` locally (non-distributed).')
+
+    # For task type foo, call executor.run_foo.
+    available_tasks = [
+        x for x in dir(self)
+        if x.startswith('run_') and x != 'run_local' and
+        callable(getattr(self, x))
+    ]
+    task_to_run = 'run_' + config.task_type
+    if task_to_run not in available_tasks:
+      raise ValueError(
+          'Task type {} is not supported. Supported task types are {}'.format(
+              config.task_type, [x[len('run_'):] for x in available_tasks]))
+    getattr(self, task_to_run)()
+
   def run_chief(self):
     """Runs task chief."""
     # TODO(xiejw): To allow execution framework to add train hooks.
@@ -534,9 +556,8 @@ class _TrainingExecutor(object):
           self._timer.update_last_triggered_step(global_step_value)
           self._evaluator.evaluate_and_export()
         else:
-          logging.info(
-              'Skip the current checkpoint eval due to throttle secs '
-              '({} secs).'.format(self._eval_throttle_secs))
+          logging.info('Skip the current checkpoint eval due to throttle secs '
+                       '({} secs).'.format(self._eval_throttle_secs))
 
     # Final export signal: For any eval result with global_step >= train
     # max_steps, the evaluator will send the final export signal. There is a
@@ -549,8 +570,8 @@ class _TrainingExecutor(object):
     #
     # But here, throttle_secs will skip the next intermediate checkpoint and,
     # so, the double final export chance is very small.
-    evaluator = _TrainingExecutor._Evaluator(
-        self._estimator, self._eval_spec, self._train_spec.max_steps)
+    evaluator = _TrainingExecutor._Evaluator(self._estimator, self._eval_spec,
+                                             self._train_spec.max_steps)
 
     # When the underlying `Estimator` object saves a new checkpoint, we would
     # like this callback to be called so that evaluation and export can trigger.
@@ -590,11 +611,11 @@ class _TrainingExecutor(object):
       raise ValueError('eval_spec.throttle_secs should be positive, given: {}.'
                        'It is used do determine how long each training '
                        'iteration should go when train and evaluate '
-                       'locally.'.format(
-                           self._eval_spec.throttle_secs))
+                       'locally.'.format(self._eval_spec.throttle_secs))
 
     stop_hook = _StopAtSecsHook(self._eval_spec.throttle_secs)
-    train_hooks = list(self._train_spec.hooks) + [stop_hook]
+    train_hooks = (
+        list(self._train_spec.hooks) + [stop_hook] + list(self._train_hooks))
     logging.info('Start train and evaluate loop. The evaluate will happen '
                  'after {} secs (eval_spec.throttle_secs) or training is '
                  'finished.'.format(self._eval_spec.throttle_secs))
@@ -613,27 +634,56 @@ class _TrainingExecutor(object):
       # _should_stop_local_train will then end the while True as the stopping
       # condition is satisfied (both checks use the same global_step value,
       # i.e., no race condition)
-      metrics = evaluator.evaluate_and_export()
+      eval_result = evaluator.evaluate_and_export()
 
-      if not metrics:
-        #  This is unexpected. Training should always end with a new checkpoint.
-        raise RuntimeError('There was no new checkpoint after the training.')
+      if eval_result.status != _EvalStatus.EVALUATED:
+        #  This is unexpected; should never happen.
+        #  Training should always end with a new checkpoint.
+        raise RuntimeError('There was no new checkpoint after the training. '
+                           'Eval status: {}'.format(eval_result.status))
 
-      if _should_stop_local_train(metrics[ops.GraphKeys.GLOBAL_STEP]):
+      if _should_stop_local_train(
+          eval_result.metrics[ops.GraphKeys.GLOBAL_STEP]):
         break
 
   def _start_std_server(self, config):
     """Creates, starts, and returns a server_lib.Server."""
-    if (not config.cluster_spec or not config.task_type or not config.master or
+    if (not config.cluster_spec or not config.task_type or
         config.task_id is None):
       raise RuntimeError('Could not start server; be sure to specify '
-                         'cluster_spec, task_type, master, and task in '
+                         'cluster_spec, task_type, and task in '
                          'RunConfig or set the TF_CONFIG environment variable.')
+
+    if not config.master:
+      jobs = config.cluster_spec.jobs
+      if (len(jobs) == 1 and
+          len(config.cluster_spec.job_tasks(jobs[0])) == 1 and
+          config.task_type in _TRAINER_JOBS):
+        # For distributed training, config.master is empty if and only if it has
+        # a single node in the cluster spec. In this case, we should not start
+        # the server.
+        logging.info('Skip starting Tensorflow server as there is only one '
+                     'node in the cluster.')
+        return
+      else:
+        raise RuntimeError(
+            'Could not start server; be sure to specify master in '
+            'RunConfig or set the TF_CONFIG environment variable.')
+
+    logging.info('Start Tensorflow server.')
+
+    if config.session_config is None:
+      session_config = config_pb2.ConfigProto(log_device_placement=False)
+    else:
+      session_config = config_pb2.ConfigProto(
+          log_device_placement=False,
+          gpu_options=config.session_config.gpu_options)
+
     server = server_lib.Server(
         config.cluster_spec,
         job_name=config.task_type,
         task_index=config.task_id,
-        config=config_pb2.ConfigProto(log_device_placement=False),
+        config=session_config,
         start=False)
     server.start()
     return server
@@ -662,10 +712,11 @@ class _TrainingExecutor(object):
                    start_delay_secs)
       time.sleep(start_delay_secs)
 
-    self._estimator.train(input_fn=self._train_spec.input_fn,
-                          max_steps=self._train_spec.max_steps,
-                          hooks=self._train_spec.hooks,
-                          saving_listeners=saving_listeners)
+    self._estimator.train(
+        input_fn=self._train_spec.input_fn,
+        max_steps=self._train_spec.max_steps,
+        hooks=list(self._train_spec.hooks) + list(self._train_hooks),
+        saving_listeners=saving_listeners)
 
   def _start_continuous_evaluation(self):
     """Repeatedly calls `Estimator` evaluate and export until training ends."""
@@ -678,32 +729,57 @@ class _TrainingExecutor(object):
     evaluator = _TrainingExecutor._Evaluator(self._estimator, self._eval_spec,
                                              self._train_spec.max_steps)
 
-    while True:
-      if latest_eval_result:
-        global_step = latest_eval_result.get(ops.GraphKeys.GLOBAL_STEP)
+    should_early_stop = False
+    while not should_early_stop:
+      if (latest_eval_result and
+          latest_eval_result.status == _EvalStatus.EVALUATED):
+        global_step = latest_eval_result.metrics.get(ops.GraphKeys.GLOBAL_STEP)
         if (global_step and self._train_spec.max_steps and
             global_step >= self._train_spec.max_steps):
           logging.info(
               'Exiting evaluation, global_step=%s >= train max_steps=%s',
-              global_step,
-              self._train_spec.max_steps)
+              global_step, self._train_spec.max_steps)
           return
 
-      # Final export signal: For any eval result with global_step >= train
-      # max_steps, the evaluator will send the final export signal. The next
-      # iteration of while loop will end the continuous eval as the stopping
-      # condition is satisfied (both checks use the same global_step value,
-      # i.e., no race condition)
-      start = time.time()
-      latest_eval_result = evaluator.evaluate_and_export()
+      latest_eval_result, should_early_stop = self._execute_evaluator_once(
+          evaluator, self._continuous_eval_listener,
+          self._eval_spec.throttle_secs)
+
+  def _execute_evaluator_once(self, evaluator, continuous_eval_listener,
+                              throttle_secs):
+    """Executes the `evaluator`."""
+    start = time.time()
 
-      # Throttle if necessary.
-      elapsed_time = time.time() - start
-      difference = self._eval_spec.throttle_secs  - elapsed_time
-      if difference > 0:
-        logging.info('Waiting %f secs before starting next eval run.',
-                     difference)
-        time.sleep(difference)
+    eval_result = None
+    should_early_stop = False
+
+    if not continuous_eval_listener.before_eval():
+      logging.info('Exiting evaluation, as requested by '
+                   '_ContinuousEvalListener.before_eval.')
+      should_early_stop = True
+      return (eval_result, should_early_stop)
+
+    # Final export signal: For any eval result with global_step >= train
+    # max_steps, the evaluator will send the final export signal. The next
+    # iteration of while loop will end the continuous eval as the stopping
+    # condition is satisfied (both checks use the same global_step value,
+    # i.e., no race condition)
+    eval_result = evaluator.evaluate_and_export()
+
+    if not self._continuous_eval_listener.after_eval(eval_result):
+      logging.info('Exiting evaluation, as requested by '
+                   '_ContinuousEvalListener.after_eval.')
+      should_early_stop = True
+      return (eval_result, should_early_stop)
+
+    # Throttle if necessary.
+    elapsed_time = time.time() - start
+    difference = throttle_secs - elapsed_time
+    if difference > 0:
+      logging.info('Waiting %f secs before starting next eval run.', difference)
+      time.sleep(difference)
+
+    return (eval_result, should_early_stop)
 
   class _Evaluator(object):
     """A helper class to call `Estimator.evaluate` and export model."""
@@ -724,8 +800,7 @@ class _TrainingExecutor(object):
       """Evaluate and (maybe) export the current model.
 
       Returns:
-        Evaluation results. Returns `None` if current round of evaluation is
-        skipped.
+        An `EvalResult` instance.
 
       Raises:
         RuntimeError: for any unexpected internal error.
@@ -735,39 +810,32 @@ class _TrainingExecutor(object):
       if not latest_ckpt_path:
         self._log_err_msg('Estimator is not trained yet. Will start an '
                           'evaluation when a checkpoint is ready.')
-        return None
+        return _EvalResult(status=_EvalStatus.MISSING_CHECKPOINT)
 
       if latest_ckpt_path == self._previous_ckpt_path:
         self._log_err_msg(
             'No new checkpoint ready for evaluation. Skip the current '
             'evaluation pass as evaluation results are expected to be same '
             'for the same checkpoint.')
-        return None
-      eval_result = self._estimator.evaluate(
+        return _EvalResult(status=_EvalStatus.NO_NEW_CHECKPOINT)
+
+      metrics = self._estimator.evaluate(
           input_fn=self._eval_spec.input_fn,
           steps=self._eval_spec.steps,
           name=self._eval_spec.name,
           checkpoint_path=latest_ckpt_path,
           hooks=self._eval_spec.hooks)
 
-      if not eval_result:
-        raise RuntimeError(
-            'Internal error: `Estimator.evaluate` should never return empty '
-            'result.')
-      if not isinstance(eval_result, dict):
-        raise TypeError(
-            '`Estimator.evaluate` should return dict. Given {}.'.format(
-                type(eval_result)))
-      if ops.GraphKeys.GLOBAL_STEP not in eval_result:
-        raise RuntimeError(
-            'Internal error: `Estimator.evaluate` result should have '
-            '`global_step` in result. Given {}'.format(eval_result))
+      # _EvalResult validates the metrics.
+      eval_result = _EvalResult(
+          status=_EvalStatus.EVALUATED,
+          metrics=metrics,
+          checkpoint_path=latest_ckpt_path)
 
-      is_the_final_export = (eval_result[ops.GraphKeys.GLOBAL_STEP] >=
-                             self._max_training_steps
-                             if self._max_training_steps else False)
-      self._export_eval_result(eval_result, latest_ckpt_path,
-                               is_the_final_export)
+      is_the_final_export = (
+          eval_result.metrics[ops.GraphKeys.GLOBAL_STEP] >=
+          self._max_training_steps if self._max_training_steps else False)
+      self._export_eval_result(eval_result, is_the_final_export)
 
       if is_the_final_export:
         logging.debug('Calling exporter with the `is_the_final_export=True`.')
@@ -784,8 +852,7 @@ class _TrainingExecutor(object):
         logging.warning(message)
         self._last_warning_time = current_time
 
-    def _export_eval_result(self, eval_result, checkpoint_path,
-                            is_the_final_export):
+    def _export_eval_result(self, eval_result, is_the_final_export):
       """Export `eval_result` according to exporters in `EvalSpec`."""
       export_dir_base = os.path.join(
           compat.as_str_any(self._estimator.model_dir),
@@ -797,6 +864,114 @@ class _TrainingExecutor(object):
             export_path=os.path.join(
                 compat.as_str_any(export_dir_base),
                 compat.as_str_any(exporter.name)),
-            checkpoint_path=checkpoint_path,
-            eval_result=eval_result,
+            checkpoint_path=eval_result.checkpoint_path,
+            eval_result=eval_result.metrics,
             is_the_final_export=is_the_final_export)
+
+
+class _EvalStatus(object):
+  """The status of an evaluation event.
+
+  For local training and evaluation, the status can only be `EVALUATED` as
+  `Estimator.train` always generates a new checkpoint.
+
+  For distributed training and evaluation, a separated evaluator keeps looking
+  for new checkpoint. So, multiple situations might occur:
+
+  - EVALUATED: A new checkpoint is found since last evaluation.
+      `Estimator.evaluate` will be invoked.
+  - MISSING_CHECKPOINT: No checkpoint can be found. Typically, this means
+      the trainer has not yet produced any checkpoint.
+  - NO_NEW_CHECKPOINT: No new checkpoint can be found since last evaluation.
+      Typically, this means the trainer has not yet produced any new checkpoint.
+  """
+
+  EVALUATED = 'evaluated'
+  MISSING_CHECKPOINT = 'missing checkpoint'
+  NO_NEW_CHECKPOINT = 'no new checkpoint'
+
+
+class _EvalResult(
+    collections.namedtuple('EvalResult',
+                           ['status', 'metrics', 'checkpoint_path'])):
+  """_EvalResult holds the result of an evaluation event."""
+
+  def __new__(cls, status, metrics=None, checkpoint_path=None):
+    """Creates a validated `_EvalResult`.
+
+    Args:
+      status: See `_EvalStatus`.
+      metrics: The evaluation results returned by `Estimator.evaluate`. Only set
+          if status is `EVALUATED`.
+      checkpoint_path: The corresponding checkpoint path for the `metrics`. Only
+          set if status is `EVALUATED`.
+    Returns:
+      A validated `_EvalResult` object.
+
+    Raises:
+      ValueError: If validation fails.
+      TypeError: If any of the arguments is not the expected type.
+    """
+
+    if status != _EvalStatus.EVALUATED:
+      if metrics:
+        raise ValueError(
+            'metrics must be `None` if status is not {}; got status {},'
+            ' metrics {}'.format(_EvalStatus.EVALUATED, status, metrics))
+      if checkpoint_path:
+        raise ValueError(
+            'checkpoint must be `None` if status is not {}; got status {}, '
+            'checkpoint_path {}'.format(_EvalStatus.EVALUATED, status,
+                                        checkpoint_path))
+      return super(_EvalResult, cls).__new__(cls, status, metrics,
+                                             checkpoint_path)
+
+    # Now, evaluated case.
+    assert status == _EvalStatus.EVALUATED
+
+    # Validates metrics.
+    if not metrics:
+      raise ValueError(
+          'Internal error: `Estimator.evaluate` should never return empty '
+          'metrics.')
+    if not isinstance(metrics, dict):
+      raise TypeError(
+          '`Estimator.evaluate` should return dict. Given {}.'.format(
+              type(metrics)))
+    if ops.GraphKeys.GLOBAL_STEP not in metrics:
+      raise ValueError(
+          'Internal error: `Estimator.evaluate` result should have '
+          '`global_step` in result. Given {}'.format(metrics))
+
+    # Validates checkpoint_path.
+    if not checkpoint_path:
+      raise ValueError(
+          'Internal error: `checkpoint_path` should never be empty.')
+
+    return super(_EvalResult, cls).__new__(cls, status, metrics,
+                                           checkpoint_path)
+
+
+class _ContinuousEvalListener(object):
+  """Interface for listeners that take action before or after evaluation."""
+
+  def before_eval(self):
+    """Called before evaluation.
+
+    Returns:
+      `False` if you want to skip the current evaluation and early stop the
+      continuous evaluation; `True` otherwise.
+    """
+    return True
+
+  def after_eval(self, eval_result):
+    """Called after the evaluation is executed.
+
+    Args:
+      eval_result: An `_EvalResult` instance.
+
+    Returns:
+      False if you want to early stop continuous evaluation; `True` otherwise.
+    """
+    del eval_result
+    return True
diff --git a/tensorflow/python/estimator/training_test.py b/tensorflow/python/estimator/training_test.py
index 17d018aa8808266c273a282e4042817d1368bdfe..4f7da848086514b6241799645997c8c6a246631f 100644
--- a/tensorflow/python/estimator/training_test.py
+++ b/tensorflow/python/estimator/training_test.py
@@ -72,6 +72,7 @@ _NONE_EXPORTER_NAME_MSG = (
     'An Exporter cannot have a name that is `None` or empty.')
 _INVALID_TRAIN_SPEC_MSG = '`train_spec` must have type `tf.estimator.TrainSpec`'
 _INVALID_EVAL_SPEC_MSG = '`eval_spec` must have type `tf.estimator.EvalSpec`'
+_INVALID_EVAL_LISTENER_MSG = 'must have type `_ContinuousEvalListener`'
 _INVALID_CONFIG_FOR_STD_SERVER_MSG = 'Could not start server; .*TF_CONFIG'
 _INVALID_LOCAL_TASK_WITH_CLUSTER = '`task.type` in TF_CONFIG cannot be `local`'
 _INVALID_TASK_TYPE = '`estimator.config` must have task_type set.'
@@ -81,7 +82,7 @@ _INVALID_TASK_TYPE = '`estimator.config` must have task_type set.'
 _INVALID_TASK_TO_RUN = (
     'Task type .* is not supported. Supported task types are ((?!local).)*$')
 _INVALID_EMPTY_EVAL_RESULT_ERR = (
-    'Internal error: `Estimator.evaluate` should never return empty result')
+    'Internal error: `Estimator.evaluate` should never return empty metrics')
 _INVALID_EVAL_RESULT_TYPE_ERR = '`Estimator.evaluate` should return dict.'
 _MISSING_GLOBAL_STEP_IN_EVAL_RESULT_ERR = (
     'Internal error: `Estimator.evaluate` result should have `global_step`')
@@ -311,61 +312,21 @@ class EvalSpecTest(test.TestCase):
       training.EvalSpec(input_fn=lambda: 1, exporters=_create_exporter(None))
 
 
-class TrainAndEvaluteTest(test.TestCase):
+class TrainAndEvaluateTest(test.TestCase):
 
-  def _mock_executor_instance(self):
-    mock_instance = test.mock.Mock()
-    mock_instance.call_task = {}
-
-    def task_fn(name):
-      def _fn():
-        mock_instance.call_task[name] = 1
-      return _fn
-
-    mock_instance.run_chief = task_fn('chief')
-    mock_instance.run_master = task_fn('master')
-    mock_instance.run_ps = task_fn('ps')
-    mock_instance.run_evaluator = task_fn('evaluator')
-    mock_instance.run_worker = task_fn('worker')
-    mock_instance.run_local = task_fn('local')
-
-    return mock_instance
-
-  def _test_run_task_in_distributed_training(self, run_config):
+  def test_run_task(self):
     mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.config = run_config
     mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     with test.mock.patch.object(training, '_TrainingExecutor') as mock_executor:
-      mock_executor_instance = self._mock_executor_instance()
+      mock_executor_instance = test.mock.Mock()
       mock_executor.return_value = mock_executor_instance
       training.train_and_evaluate(mock_est, mock_train_spec, mock_eval_spec)
       mock_executor.assert_called_with(estimator=mock_est,
                                        train_spec=mock_train_spec,
                                        eval_spec=mock_eval_spec)
-      return mock_executor_instance
-
-  def test_run_chief(self):
-    mock_executor = self._test_run_task_in_distributed_training(
-        run_config=_create_run_config_with_cluster_spec(_TF_CONFIG_FOR_CHIEF))
-    self.assertEqual(1, mock_executor.call_task['chief'])
-
-  def test_run_worker(self):
-    mock_executor = self._test_run_task_in_distributed_training(
-        run_config=_create_run_config_with_cluster_spec(_TF_CONFIG_FOR_WORKER))
-    self.assertEqual(1, mock_executor.call_task['worker'])
-
-  def test_run_ps(self):
-    mock_executor = self._test_run_task_in_distributed_training(
-        run_config=_create_run_config_with_cluster_spec(_TF_CONFIG_FOR_PS))
-    self.assertEqual(1, mock_executor.call_task['ps'])
-
-  def test_run_evaluator(self):
-    mock_executor = self._test_run_task_in_distributed_training(
-        run_config=_create_run_config_with_cluster_spec(
-            _TF_CONFIG_FOR_EVALUATOR))
-    self.assertEqual(1, mock_executor.call_task['evaluator'])
+      self.assertTrue(mock_executor_instance.run.called)
 
   def test_error_out_if_evaluator_task_id_is_non_zero(self):
     tf_config = {
@@ -377,93 +338,15 @@ class TrainAndEvaluteTest(test.TestCase):
             'index': 1
         }
     }
-    with self.assertRaisesRegexp(ValueError, _INVALID_EVAL_TASK_ID_ERR):
-      self._test_run_task_in_distributed_training(
-          run_config=_create_run_config_with_cluster_spec(tf_config))
-
-  def test_run_local(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.config = run_config_lib.RunConfig()
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    with test.mock.patch.object(training, '_TrainingExecutor') as mock_executor:
-      mock_executor_instance = self._mock_executor_instance()
-      mock_executor.return_value = mock_executor_instance
-      training.train_and_evaluate(mock_est, mock_train_spec, mock_eval_spec)
-      self.assertEqual(1, mock_executor_instance.call_task['local'])
-
-      mock_executor.assert_called_with(estimator=mock_est,
-                                       train_spec=mock_train_spec,
-                                       eval_spec=mock_eval_spec)
 
-  def test_invalid_local_task(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-            'local': ['hos1:1'],
-        },
-        'task': {
-            'type': 'local',
-            'index': 0
-        }
-    }
     mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
     mock_est.config = _create_run_config_with_cluster_spec(tf_config)
     mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
-    with self.assertRaisesRegexp(ValueError, _INVALID_LOCAL_TASK_WITH_CLUSTER):
+    with self.assertRaisesRegexp(ValueError, _INVALID_EVAL_TASK_ID_ERR):
       training.train_and_evaluate(mock_est, mock_train_spec, mock_eval_spec)
 
-  def test_unsupported_task_due_to_missing_run_task(self):
-    unsupported_task = 'alloc'
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-            unsupported_task: ['hos1:1'],
-        },
-        'task': {
-            'type': unsupported_task,
-            'index': 0
-        }
-    }
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.config = _create_run_config_with_cluster_spec(tf_config)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    with test.mock.patch.object(training, '_TrainingExecutor') as mock_executor:
-      # mock_instance has no run_alloc method.
-      mock_instance = self._mock_executor_instance()
-      mock_executor.return_value = mock_instance
-      with self.assertRaisesRegexp(ValueError, _INVALID_TASK_TO_RUN):
-        training.train_and_evaluate(mock_est, mock_train_spec, mock_eval_spec)
-
-  def test_unsupported_task_due_to_not_callable(self):
-    unsupported_task = 'alloc'
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-            unsupported_task: ['hos1:1'],
-        },
-        'task': {
-            'type': unsupported_task,
-            'index': 0
-        }
-    }
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.config = _create_run_config_with_cluster_spec(tf_config)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    with test.mock.patch.object(training, '_TrainingExecutor') as mock_executor:
-      mock_instance = self._mock_executor_instance()
-      mock_instance.run_alloc = 123  # not callable
-      mock_executor.return_value = mock_instance
-      with self.assertRaisesRegexp(ValueError, _INVALID_TASK_TO_RUN):
-        training.train_and_evaluate(mock_est, mock_train_spec, mock_eval_spec)
-
   def test_invalid_estimator(self):
     invalid_estimator = object()
     mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
@@ -473,19 +356,6 @@ class TrainAndEvaluteTest(test.TestCase):
       training.train_and_evaluate(invalid_estimator, mock_train_spec,
                                   mock_eval_spec)
 
-  def test_invalid_task_type(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.config = test.mock.Mock()
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    mock_est.config = test.mock.Mock()
-    mock_est.config.cluster_spec = {'1': 'dummy'}
-    mock_est.config.task_type = ''
-
-    with self.assertRaisesRegexp(ValueError, _INVALID_TASK_TYPE):
-      training.train_and_evaluate(mock_est, mock_train_spec, mock_eval_spec)
-
 
 class TrainingExecutorConstructorTest(test.TestCase):
   """Tests constructor of _TrainingExecutor."""
@@ -522,6 +392,29 @@ class TrainingExecutorConstructorTest(test.TestCase):
     with self.assertRaisesRegexp(TypeError, _INVALID_EVAL_SPEC_MSG):
       training._TrainingExecutor(estimator, train_spec, invalid_eval_spec)
 
+  def test_invalid_train_hooks(self):
+    estimator = estimator_lib.Estimator(model_fn=lambda features: features)
+    train_spec = training.TrainSpec(input_fn=lambda: 1)
+    eval_spec = training.EvalSpec(input_fn=lambda: 1)
+    invalid_train_hooks = [object()]
+
+    with self.assertRaisesRegexp(TypeError, _INVALID_HOOK_MSG):
+      training._TrainingExecutor(
+          estimator, train_spec, eval_spec, train_hooks=invalid_train_hooks)
+
+  def test_invalid_continuous_eval_listener(self):
+    estimator = estimator_lib.Estimator(model_fn=lambda features: features)
+    train_spec = training.TrainSpec(input_fn=lambda: 1)
+    eval_spec = training.EvalSpec(input_fn=lambda: 1)
+    invalid_continuous_eval_listener = object()
+
+    with self.assertRaisesRegexp(TypeError, _INVALID_EVAL_LISTENER_MSG):
+      training._TrainingExecutor(
+          estimator,
+          train_spec,
+          eval_spec,
+          continuous_eval_listener=invalid_continuous_eval_listener)
+
 
 class _TrainingExecutorTrainingTest(object):
   """Tests training of _TrainingExecutor."""
@@ -530,6 +423,8 @@ class _TrainingExecutorTrainingTest(object):
     self._run_config = run_config
 
   def _run_task(self, executor):
+    # We should not call executor.run as the test here is intended to test
+    # run_foo explicitly (foo is the task type).
     return getattr(executor, 'run_' + self._run_config.task_type)()
 
   @test.mock.patch.object(time, 'sleep')
@@ -554,19 +449,40 @@ class _TrainingExecutorTrainingTest(object):
 
     self.assertTrue(mock_server_instance.start.called)
 
-    mock_est.train.assert_called_with(input_fn=train_spec.input_fn,
-                                      max_steps=train_spec.max_steps,
-                                      hooks=train_spec.hooks,
-                                      saving_listeners=test.mock.ANY)
+    mock_est.train.assert_called_with(
+        input_fn=train_spec.input_fn,
+        max_steps=train_spec.max_steps,
+        hooks=list(train_spec.hooks),
+        saving_listeners=test.mock.ANY)
     mock_est.evaluate.assert_not_called()
     mock_est.export_savedmodel.assert_not_called()
 
+  @test.mock.patch.object(time, 'sleep')
+  @test.mock.patch.object(server_lib, 'Server')
+  def test_train_with_train_hooks(self, unused_mock_server, unused_mock_sleep):
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_est.config = self._run_config
+    train_spec = training.TrainSpec(
+        input_fn=lambda: 1, max_steps=2, hooks=[_FakeHook()])
+    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
+    extra_hooks = [_FakeHook()]
+
+    executor = training._TrainingExecutor(
+        mock_est, train_spec, mock_eval_spec, train_hooks=extra_hooks)
+    self._run_task(executor)
+
+    mock_est.train.assert_called_with(
+        input_fn=train_spec.input_fn,
+        max_steps=train_spec.max_steps,
+        hooks=list(train_spec.hooks) + extra_hooks,
+        saving_listeners=test.mock.ANY)
+
   @test.mock.patch.object(time, 'sleep')
   @test.mock.patch.object(server_lib, 'Server')
   def test_no_server_startup_in_google(self, mock_server, unused_mock_sleep):
     mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
     mock_est.config = self._run_config
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec, hooks=[])
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     executor = training._TrainingExecutor(mock_est, mock_train_spec,
@@ -598,7 +514,8 @@ class _TrainingExecutorTrainingTest(object):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'worker': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec(
+        {'worker': ['dummy', 'dummy1']})
     mock_est.config.master = ''
     mock_est.config.task_type = 'worker'
     mock_est.config.task_id = 2
@@ -608,13 +525,33 @@ class _TrainingExecutorTrainingTest(object):
       self._run_task(training._TrainingExecutor(mock_est, mock_train_spec,
                                                 mock_eval_spec))
 
+  @test.mock.patch.object(time, 'sleep')
+  @test.mock.patch.object(server_lib, 'Server')
+  def test_single_worker_node_with_empty_tf_master(
+      self, mock_server, unused_mock_sleep):
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec, hooks=[])
+    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
+
+    mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
+    # Single node cluster.
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'worker': ['dummy']})
+    mock_est.config.master = ''
+    mock_est.config.task_type = 'worker'
+    mock_est.config.task_id = 2
+
+    self._run_task(training._TrainingExecutor(mock_est, mock_train_spec,
+                                              mock_eval_spec))
+    self.assertTrue(mock_est.train.called)
+    mock_server.assert_not_called()
+
   def test_fail_with_empty_task_type(self):
     mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
     mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'worker': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'worker': ['dummy']})
     mock_est.config.master = 'grpc://...'
     mock_est.config.task_type = ''
     mock_est.config.task_id = 2
@@ -630,7 +567,7 @@ class _TrainingExecutorTrainingTest(object):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'worker': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'worker': ['dummy']})
     mock_est.config.master = 'grpc://...'
     mock_est.config.task_type = 'worker'
     mock_est.config.task_id = None
@@ -655,7 +592,7 @@ class TrainingExecutorRunWorkerTest(_TrainingExecutorTrainingTest,
   def test_delay_for_worker(self, _):
     mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
     mock_est.config = self._run_config
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec, hooks=[])
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     executor = training._TrainingExecutor(mock_est, mock_train_spec,
@@ -682,7 +619,7 @@ class TrainingExecutorRunChiefTest(_TrainingExecutorTrainingTest,
   def test_no_delay_for_chief(self, _):
     mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
     mock_est.config = self._run_config
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec, hooks=[])
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     executor = training._TrainingExecutor(mock_est, mock_train_spec,
@@ -705,7 +642,8 @@ class TrainingExecutorRunMasterTest(test.TestCase):
     mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
     mock_est.evaluate = lambda *args, **kw: {ops.GraphKeys.GLOBAL_STEP: 123}
     mock_est.config = self._run_config
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec, max_steps=123)
+    mock_train_spec = test.mock.Mock(
+        spec=training.TrainSpec, max_steps=123, hooks=[])
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec, exporters=[])
 
     executor = training._TrainingExecutor(mock_est, mock_train_spec,
@@ -738,19 +676,42 @@ class TrainingExecutorRunMasterTest(test.TestCase):
 
     self.assertTrue(mock_server_instance.start.called)
 
-    mock_est.train.assert_called_with(input_fn=train_spec.input_fn,
-                                      max_steps=train_spec.max_steps,
-                                      hooks=train_spec.hooks,
-                                      saving_listeners=test.mock.ANY)
+    mock_est.train.assert_called_with(
+        input_fn=train_spec.input_fn,
+        max_steps=train_spec.max_steps,
+        hooks=list(train_spec.hooks),
+        saving_listeners=test.mock.ANY)
     mock_est.export_savedmodel.assert_not_called()
 
+  @test.mock.patch.object(time, 'sleep')
+  @test.mock.patch.object(server_lib, 'Server')
+  def test_train_with_train_hooks(self, mock_server, unused_mock_sleep):
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_est.evaluate = lambda *args, **kw: {ops.GraphKeys.GLOBAL_STEP: 123}
+    mock_est.config = self._run_config
+    train_spec = training.TrainSpec(
+        input_fn=lambda: 1, max_steps=2, hooks=[_FakeHook()])
+    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec, exporters=[])
+    extra_hooks = [_FakeHook()]
+
+    executor = training._TrainingExecutor(
+        mock_est, train_spec, mock_eval_spec, train_hooks=extra_hooks)
+    executor.run_master()
+
+    mock_est.train.assert_called_with(
+        input_fn=train_spec.input_fn,
+        max_steps=train_spec.max_steps,
+        hooks=list(train_spec.hooks) + extra_hooks,
+        saving_listeners=test.mock.ANY)
+
   @test.mock.patch.object(time, 'sleep')
   @test.mock.patch.object(server_lib, 'Server')
   def test_no_server_startup_in_google(self, mock_server, unused_mock_sleep):
     mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
     mock_est.evaluate = lambda *args, **kw: {ops.GraphKeys.GLOBAL_STEP: 123}
     mock_est.config = self._run_config
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec, max_steps=123)
+    mock_train_spec = test.mock.Mock(
+        spec=training.TrainSpec, max_steps=123, hooks=[])
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec, exporters=[])
 
     executor = training._TrainingExecutor(mock_est, mock_train_spec,
@@ -768,7 +729,7 @@ class TrainingExecutorRunMasterTest(test.TestCase):
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
     mock_est.config.cluster_spec = None
     mock_est.config.master = 'grpc://...'
-    mock_est.config.task_type = 'worker'
+    mock_est.config.task_type = 'master'
     mock_est.config.task_id = 2
 
     with self.assertRaisesRegexp(RuntimeError,
@@ -782,23 +743,49 @@ class TrainingExecutorRunMasterTest(test.TestCase):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'worker': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec(
+        {'master': ['dummy'], 'worker': ['dummy1']})
     mock_est.config.master = ''
-    mock_est.config.task_type = 'worker'
-    mock_est.config.task_id = 2
+    mock_est.config.task_type = 'master'
+    mock_est.config.task_id = 0
 
     with self.assertRaisesRegexp(RuntimeError,
                                  _INVALID_CONFIG_FOR_STD_SERVER_MSG):
       training._TrainingExecutor(
           mock_est, mock_train_spec, mock_eval_spec).run_master()
 
+  @test.mock.patch.object(time, 'sleep')
+  @test.mock.patch.object(server_lib, 'Server')
+  def test_single_master_node_with_empty_tf_master(
+      self, mock_server, unused_mock_sleep):
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_est.evaluate = lambda *args, **kw: {ops.GraphKeys.GLOBAL_STEP: 123}
+
+    mock_train_spec = test.mock.Mock(
+        spec=training.TrainSpec, max_steps=123, hooks=[])
+    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec, exporters=[])
+
+    mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
+    mock_est.config.cluster_spec = server_lib.ClusterSpec(
+        {'master': ['dummy']})
+    mock_est.config.master = ''
+    mock_est.config.task_type = 'master'
+    mock_est.config.task_id = 0
+
+    executor = training._TrainingExecutor(
+        mock_est, mock_train_spec, mock_eval_spec)
+    executor.run_master()
+
+    mock_server.assert_not_called()
+    self.assertTrue(mock_est.train.called)
+
   def test_fail_with_empty_task_type(self):
     mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
     mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'worker': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'master': ['dummy']})
     mock_est.config.master = 'grpc://...'
     mock_est.config.task_type = ''
     mock_est.config.task_id = 2
@@ -814,9 +801,9 @@ class TrainingExecutorRunMasterTest(test.TestCase):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'worker': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'master': ['dummy']})
     mock_est.config.master = 'grpc://...'
-    mock_est.config.task_type = 'worker'
+    mock_est.config.task_type = 'master'
     mock_est.config.task_id = None
 
     with self.assertRaisesRegexp(RuntimeError,
@@ -993,6 +980,28 @@ class TrainingExecutorRunEvaluatorTest(test.TestCase):
         hooks=eval_spec.hooks)
     self.assertFalse(mock_est.train.called)
 
+  def test_evaluate_with_train_hooks(self):
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_est.latest_checkpoint.return_value = 'latest_it_is'
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
+    self._set_up_mock_est_to_train_and_evaluate_once(mock_est, mock_train_spec)
+
+    eval_spec = training.EvalSpec(
+        input_fn=lambda: 1,
+        steps=2,
+        hooks=[_FakeHook()],
+        name='cont_eval',
+        start_delay_secs=0,
+        throttle_secs=0)
+
+    # The train_hooks will not be called during eval.
+    mock_hook = test.mock.Mock(spec=session_run_hook.SessionRunHook)
+    executor = training._TrainingExecutor(
+        mock_est, mock_train_spec, eval_spec, train_hooks=[mock_hook])
+    executor.run_evaluator()
+
+    mock_hook.begin.assert_not_called()
+
   def test_evaluate_multiple_times(self):
     training_max_step = 200
 
@@ -1036,6 +1045,88 @@ class TrainingExecutorRunEvaluatorTest(test.TestCase):
     self.assertEqual(2, mock_est.times_export_was_called)
     self.assertEqual(1, mock_est.times_final_export_was_called)
 
+  def test_evaluate_listener_before_eval(self):
+    training_max_step = 200
+
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_est.model_dir = compat.as_bytes(test.get_temp_dir())
+    # Without early stopping, this eval will be run twice.
+    mock_est.evaluate.side_effect = [{
+        _GLOBAL_STEP_KEY: training_max_step // 2
+    }, {
+        _GLOBAL_STEP_KEY: training_max_step
+    }]
+    mock_est.latest_checkpoint.side_effect = ['path_1', 'path_2']
+
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec, hooks=[])
+    mock_train_spec.max_steps = training_max_step
+
+    class _Listener(training._ContinuousEvalListener):
+
+      def __init__(self):
+        self.call_count = 0
+
+      def before_eval(self):
+        self.call_count += 1
+        return  self.call_count == 1
+
+    listener = _Listener()
+
+    eval_spec = training.EvalSpec(
+        input_fn=lambda: 1, start_delay_secs=0, throttle_secs=0)
+
+    training._TrainingExecutor(
+        mock_est, mock_train_spec, eval_spec,
+        continuous_eval_listener=listener).run_evaluator()
+
+    # Before_eval returns False during the second time, so, evaluate will be
+    # called once.
+    self.assertEqual(1, mock_est.evaluate.call_count)
+    self.assertEqual(2, listener.call_count)
+
+  def test_evaluate_listener_after_eval(self):
+    training_max_step = 200
+
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_est.model_dir = compat.as_bytes(test.get_temp_dir())
+    # Without early stopping, this eval will be run twice.
+    expected_eval_metrics = [{
+        _GLOBAL_STEP_KEY: training_max_step // 2
+    }, {
+        _GLOBAL_STEP_KEY: training_max_step
+    }]
+    mock_est.evaluate.side_effect = expected_eval_metrics
+    mock_est.latest_checkpoint.side_effect = ['path_1', 'path_2']
+
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
+    mock_train_spec.max_steps = training_max_step
+
+    class _Listener(training._ContinuousEvalListener):
+
+      def __init__(self):
+        self.call_count = 0
+
+      def after_eval(self, eval_result):
+        self.call_count += 1
+        self.eval_result = eval_result
+        return False
+
+    listener = _Listener()
+
+    eval_spec = training.EvalSpec(
+        input_fn=lambda: 1, start_delay_secs=0, throttle_secs=0)
+
+    training._TrainingExecutor(
+        mock_est, mock_train_spec, eval_spec,
+        continuous_eval_listener=listener).run_evaluator()
+
+    # after_eval returns False during the first time, so, evaluate will be
+    # called once.
+    self.assertEqual(1, mock_est.evaluate.call_count)
+    self.assertEqual(1, listener.call_count)
+    self.assertAllEqual(expected_eval_metrics[0], listener.eval_result.metrics)
+    self.assertEqual('path_1', listener.eval_result.checkpoint_path)
+
   def test_final_export_is_true_in_the_end(self):
     training_max_step = 200
 
@@ -1108,6 +1199,70 @@ class TrainingExecutorRunEvaluatorTest(test.TestCase):
     # successuful evaluation)
     self.assertEqual(2, mock_log.call_count)
 
+  def test_continuous_eval_listener_eval_result(self):
+    training_max_step = 200
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    expected_eval_metrics = [{
+        _GLOBAL_STEP_KEY: training_max_step // 2
+    }, {
+        _GLOBAL_STEP_KEY: training_max_step
+    }]
+    mock_est.evaluate.side_effect = expected_eval_metrics
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
+    mock_train_spec.max_steps = training_max_step
+
+    class _Listener(training._ContinuousEvalListener):
+
+      def __init__(self):
+        self.eval_results = []
+
+      def after_eval(self, eval_result):
+        self.eval_results.append(eval_result)
+        return True
+
+    continuous_eval_listener = _Listener()
+
+    self._set_up_mock_est_to_train_and_evaluate_once(mock_est, mock_train_spec)
+
+    # First two items are invalid, next two items are same.
+    mock_est.latest_checkpoint.side_effect = [
+        None, '', 'same', 'same', 'path_2'
+    ]
+    expected_eval_results = [
+        training._EvalResult(training._EvalStatus.MISSING_CHECKPOINT),
+        training._EvalResult(training._EvalStatus.MISSING_CHECKPOINT),
+        training._EvalResult(
+            training._EvalStatus.EVALUATED,
+            metrics=expected_eval_metrics[0],
+            checkpoint_path='same'),
+        training._EvalResult(training._EvalStatus.NO_NEW_CHECKPOINT),
+        training._EvalResult(
+            training._EvalStatus.EVALUATED,
+            metrics=expected_eval_metrics[1],
+            checkpoint_path='path_2'),
+    ]
+
+    eval_spec = training.EvalSpec(
+        input_fn=lambda: 1, start_delay_secs=0, throttle_secs=0)
+
+    executor = training._TrainingExecutor(
+        mock_est,
+        mock_train_spec,
+        eval_spec,
+        continuous_eval_listener=continuous_eval_listener)
+    executor.run_evaluator()
+
+    # Three checkpoint paths are invalid.
+    self.assertEqual(5, mock_est.latest_checkpoint.call_count)
+    self.assertEqual(2, mock_est.evaluate.call_count)
+
+    self.assertEqual(5, len(continuous_eval_listener.eval_results))
+    for i, result in enumerate(continuous_eval_listener.eval_results):
+      self.assertEqual(expected_eval_results[i].status, result.status)
+      self.assertAllEqual(expected_eval_results[i].metrics, result.metrics)
+      self.assertEqual(expected_eval_results[i].checkpoint_path,
+                       result.checkpoint_path)
+
   def test_sleep_start_delay_secs(self):
     training_max_step = 200
     start_delay_secs = 123
@@ -1184,7 +1339,7 @@ class TrainingExecutorRunEvaluatorTest(test.TestCase):
     mock_est.evaluate.return_value = {}
 
     executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    with self.assertRaisesRegexp(RuntimeError, _INVALID_EMPTY_EVAL_RESULT_ERR):
+    with self.assertRaisesRegexp(ValueError, _INVALID_EMPTY_EVAL_RESULT_ERR):
       executor.run_evaluator()
 
   def test_errors_out_if_evaluate_returns_non_dict(self):
@@ -1206,7 +1361,7 @@ class TrainingExecutorRunEvaluatorTest(test.TestCase):
     mock_est.evaluate.return_value = {'loss': 123}
 
     executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    with self.assertRaisesRegexp(RuntimeError,
+    with self.assertRaisesRegexp(ValueError,
                                  _MISSING_GLOBAL_STEP_IN_EVAL_RESULT_ERR):
       executor.run_evaluator()
 
@@ -1246,7 +1401,7 @@ class TrainingExecutorRunPsTest(test.TestCase):
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
     mock_est.config.cluster_spec = None
     mock_est.config.master = 'grpc://...'
-    mock_est.config.task_type = 'gs'
+    mock_est.config.task_type = 'ps'
     mock_est.config.task_id = 2
 
     with self.assertRaisesRegexp(RuntimeError,
@@ -1260,9 +1415,9 @@ class TrainingExecutorRunPsTest(test.TestCase):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'gs': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'ps': ['dummy']})
     mock_est.config.master = ''
-    mock_est.config.task_type = 'gs'
+    mock_est.config.task_type = 'ps'
     mock_est.config.task_id = 2
 
     with self.assertRaisesRegexp(RuntimeError,
@@ -1276,7 +1431,7 @@ class TrainingExecutorRunPsTest(test.TestCase):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'gs': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'ps': ['dummy']})
     mock_est.config.master = 'grpc://...'
     mock_est.config.task_type = ''
     mock_est.config.task_id = 2
@@ -1292,9 +1447,9 @@ class TrainingExecutorRunPsTest(test.TestCase):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'gs': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'ps': ['dummy']})
     mock_est.config.master = 'grpc://...'
-    mock_est.config.task_type = 'gs'
+    mock_est.config.task_type = 'ps'
     mock_est.config.task_id = None
 
     with self.assertRaisesRegexp(RuntimeError,
@@ -1480,6 +1635,26 @@ class TrainingExecutorRunLocalTest(test.TestCase):
     self.assertEqual(train_spec.input_fn, train_args['input_fn'])
     self.assertEqual(train_spec.max_steps, train_args['max_steps'])
 
+  def test_train_hooks(self):
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator, model_dir='path/')
+    mock_est.latest_checkpoint.return_value = 'checkpoint_path/'
+    train_spec = training.TrainSpec(
+        input_fn=lambda: 1, max_steps=300, hooks=[_FakeHook()])
+    eval_spec = training.EvalSpec(input_fn=lambda: 1, steps=2)
+    mock_est.evaluate.return_value = {_GLOBAL_STEP_KEY: train_spec.max_steps}
+    extra_hooks = [_FakeHook()]
+
+    executor = training._TrainingExecutor(
+        mock_est, train_spec, eval_spec, train_hooks=extra_hooks)
+    executor.run_local()
+
+    train_args = mock_est.train.call_args[1]
+    self.assertEqual(
+        list(train_spec.hooks) + extra_hooks, [
+            h for h in train_args['hooks']
+            if not isinstance(h, training._StopAtSecsHook)
+        ])
+
   def test_errors_out_if_throttle_secs_is_zero(self):
     mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
     train_spec = training.TrainSpec(input_fn=lambda: 1)
@@ -1527,7 +1702,7 @@ class TrainingExecutorRunLocalTest(test.TestCase):
     mock_est.evaluate.return_value = {}
 
     executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    with self.assertRaisesRegexp(RuntimeError, _INVALID_EMPTY_EVAL_RESULT_ERR):
+    with self.assertRaisesRegexp(ValueError, _INVALID_EMPTY_EVAL_RESULT_ERR):
       executor.run_local()
 
   def test_errors_out_if_evaluate_returns_non_dict(self):
@@ -1547,11 +1722,152 @@ class TrainingExecutorRunLocalTest(test.TestCase):
     mock_est.evaluate.return_value = {'loss': 123}
 
     executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    with self.assertRaisesRegexp(RuntimeError,
+    with self.assertRaisesRegexp(ValueError,
                                  _MISSING_GLOBAL_STEP_IN_EVAL_RESULT_ERR):
       executor.run_local()
 
 
+class TrainAndEvaluateRunTest(test.TestCase):
+
+  def _test_run_task_and_executor(self, run_config):
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_est.config = run_config
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
+    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
+
+    executor = training._TrainingExecutor(mock_est, mock_train_spec,
+                                          mock_eval_spec)
+
+    executor.call_task = {}
+
+    def task_fn(name):
+
+      def _fn():
+        executor.call_task[name] = 1
+
+      return _fn
+
+    executor.run_chief = task_fn('chief')
+    executor.run_master = task_fn('master')
+    executor.run_ps = task_fn('ps')
+    executor.run_evaluator = task_fn('evaluator')
+    executor.run_worker = task_fn('worker')
+    executor.run_local = task_fn('local')
+    return executor
+
+  def test_run_chief(self):
+    executor = self._test_run_task_and_executor(
+        run_config=_create_run_config_with_cluster_spec(_TF_CONFIG_FOR_CHIEF))
+    executor.run()
+    self.assertEqual(1, executor.call_task['chief'])
+
+  def test_run_worker(self):
+    executor = self._test_run_task_and_executor(
+        run_config=_create_run_config_with_cluster_spec(_TF_CONFIG_FOR_WORKER))
+    executor.run()
+    self.assertEqual(1, executor.call_task['worker'])
+
+  def test_run_ps(self):
+    executor = self._test_run_task_and_executor(
+        run_config=_create_run_config_with_cluster_spec(_TF_CONFIG_FOR_PS))
+    executor.run()
+    self.assertEqual(1, executor.call_task['ps'])
+
+  def test_run_evaluator(self):
+    executor = self._test_run_task_and_executor(
+        run_config=_create_run_config_with_cluster_spec(
+            _TF_CONFIG_FOR_EVALUATOR))
+    executor.run()
+    self.assertEqual(1, executor.call_task['evaluator'])
+
+  def test_run_local(self):
+    executor = self._test_run_task_and_executor(
+        run_config=run_config_lib.RunConfig())
+    executor.run()
+    self.assertEqual(1, executor.call_task['local'])
+
+  def test_invalid_local_task(self):
+    tf_config = {
+        'cluster': {
+            run_config_lib.TaskType.CHIEF: ['host0:0'],
+            'local': ['hos1:1'],
+        },
+        'task': {
+            'type': 'local',  # invalid task type.
+            'index': 0
+        }
+    }
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_est.config = _create_run_config_with_cluster_spec(tf_config)
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
+    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
+
+    executor = training._TrainingExecutor(mock_est, mock_train_spec,
+                                          mock_eval_spec)
+    with self.assertRaisesRegexp(ValueError, _INVALID_LOCAL_TASK_WITH_CLUSTER):
+      executor.run()
+
+  def test_unsupported_task_due_to_missing_run_task(self):
+    unsupported_task = 'alloc'
+    tf_config = {
+        'cluster': {
+            run_config_lib.TaskType.CHIEF: ['host0:0'],
+            unsupported_task: ['hos1:1'],
+        },
+        'task': {
+            'type': unsupported_task,
+            'index': 0
+        }
+    }
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_est.config = _create_run_config_with_cluster_spec(tf_config)
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
+    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
+
+    executor = training._TrainingExecutor(mock_est, mock_train_spec,
+                                          mock_eval_spec)
+    with self.assertRaisesRegexp(ValueError, _INVALID_TASK_TO_RUN):
+      executor.run()
+
+  def test_unsupported_task_due_to_not_callable(self):
+    unsupported_task = 'alloc'
+    tf_config = {
+        'cluster': {
+            run_config_lib.TaskType.CHIEF: ['host0:0'],
+            unsupported_task: ['hos1:1'],
+        },
+        'task': {
+            'type': unsupported_task,
+            'index': 0
+        }
+    }
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_est.config = _create_run_config_with_cluster_spec(tf_config)
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
+    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
+
+    executor = training._TrainingExecutor(mock_est, mock_train_spec,
+                                          mock_eval_spec)
+    executor.run_alloc = 123  # not callable
+    with self.assertRaisesRegexp(ValueError, _INVALID_TASK_TO_RUN):
+      executor.run()
+
+  def test_invalid_task_type(self):
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_est.config = test.mock.Mock()
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
+    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
+
+    mock_est.config = test.mock.Mock()
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'1': ['dummy']})
+    mock_est.config.task_type = ''
+
+    executor = training._TrainingExecutor(mock_est, mock_train_spec,
+                                          mock_eval_spec)
+    with self.assertRaisesRegexp(ValueError, _INVALID_TASK_TYPE):
+      executor.run()
+
+
 class TrainAndEvaluateIntegrationTest(test.TestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/estimator/util.py b/tensorflow/python/estimator/util.py
index 12f2592d848c3ce55777ffdae5cee7ac602ee87f..b7ba76d8714e6b13551bb3e18083f45e53d2afc3 100644
--- a/tensorflow/python/estimator/util.py
+++ b/tensorflow/python/estimator/util.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Utility to retrieve function args.."""
+"""Utility to retrieve function args."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,7 +21,6 @@ from __future__ import print_function
 
 import functools
 
-from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 
 
@@ -45,14 +44,13 @@ def fn_args(fn):
   Raises:
     ValueError: if partial function has positionally bound arguments
   """
-  _, fn = tf_decorator.unwrap(fn)
   if isinstance(fn, functools.partial):
     args = fn_args(fn.func)
     args = [a for a in args[len(fn.args):] if a not in (fn.keywords or [])]
   else:
     if _is_callable_object(fn):
       fn = fn.__call__
-    args = tf_inspect.getargspec(fn).args
+    args = tf_inspect.getfullargspec(fn).args
     if _is_bounded_method(fn):
       args.remove('self')
   return tuple(args)
diff --git a/tensorflow/python/estimator/warm_starting_util.py b/tensorflow/python/estimator/warm_starting_util.py
index e5655db08201601030c4473e3194e89ef89f5a68..adb013f5c653c4967a743047fef4e805946e0f59 100644
--- a/tensorflow/python/estimator/warm_starting_util.py
+++ b/tensorflow/python/estimator/warm_starting_util.py
@@ -21,101 +21,228 @@ from __future__ import print_function
 import collections
 import six
 
-from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
+from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_ops
 from tensorflow.python.training import checkpoint_utils
 from tensorflow.python.training import saver
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("estimator.VocabInfo")
+class VocabInfo(
+    collections.namedtuple("VocabInfo", [
+        "new_vocab",
+        "new_vocab_size",
+        "num_oov_buckets",
+        "old_vocab",
+        "old_vocab_size",
+        "backup_initializer",
+    ])):
+  """Vocabulary information for WarmStartSettings.
+
+  See @{tf.estimator.WarmStartSettings$WarmStartSettings} for examples of using
+  VocabInfo to warm-start.
 
+  Attributes:
+    new_vocab: [Required] A path to the new vocabulary file (used with the
+      model to be trained).
+    new_vocab_size: [Required] An integer indicating how many entries of the new
+      vocabulary will used in training.
+    num_oov_buckets: [Required] An integer indicating how many OOV buckets are
+      associated with the vocabulary.
+    old_vocab: [Required] A path to the old vocabulary file (used with the
+      checkpoint to be warm-started from).
+    old_vocab_size: [Optional] An integer indicating how many entries of the old
+      vocabulary were used in the creation of the checkpoint. If not provided,
+      the entire old vocabulary will be used.
+    backup_initializer: [Optional] A variable initializer used for variables
+      corresponding to new vocabulary entries and OOV. If not provided, these
+      entries will be zero-initialized.
+  """
 
-class _WarmStartSettings(
-    collections.namedtuple("_WarmStartSettings", [
+  def __new__(cls,
+              new_vocab,
+              new_vocab_size,
+              num_oov_buckets,
+              old_vocab,
+              old_vocab_size=-1,
+              backup_initializer=None):
+    return super(VocabInfo, cls).__new__(
+        cls,
+        new_vocab,
+        new_vocab_size,
+        num_oov_buckets,
+        old_vocab,
+        old_vocab_size,
+        backup_initializer,
+    )
+
+
+@tf_export("estimator.WarmStartSettings")
+class WarmStartSettings(
+    collections.namedtuple("WarmStartSettings", [
         "ckpt_to_initialize_from",
-        "col_to_prev_vocab",
-        "col_to_prev_tensor",
-        "exclude_columns",
+        "vars_to_warm_start",
+        "var_name_to_vocab_info",
+        "var_name_to_prev_var_name",
     ])):
-  """Settings for warm-starting input layer in models.
+  """Settings for warm-starting in Estimators.
+
+  Example Use with canned `DNNEstimator`:
+
+  ```
+  emb_vocab_file = tf.feature_column.embedding_column(
+      tf.feature_column.categorical_column_with_vocabulary_file(
+          "sc_vocab_file", "new_vocab.txt", vocab_size=100),
+      dimension=8)
+  emb_vocab_list = tf.feature_column.embedding_column(
+      tf.feature_column.categorical_column_with_vocabulary_list(
+          "sc_vocab_list", vocabulary_list=["a", "b"]),
+      dimension=8)
+  estimator = tf.estimator.DNNClassifier(
+    hidden_units=[128, 64], feature_columns=[emb_vocab_file, emb_vocab_list],
+    warm_start_from=ws)
+  ```
+
+  where `ws` could be defined as:
+
+  Warm-start all weights in the model (input layer and hidden weights).
+  Either the directory or a specific checkpoint can be provided (in the case
+  of the former, the latest checkpoint will be used):
+
+  ```
+  ws = WarmStartSettings(ckpt_to_initialize_from="/tmp")
+  ws = WarmStartSettings(ckpt_to_initialize_from="/tmp/model-1000")
+  ```
+
+  Warm-start only the embeddings (input layer):
+
+  ```
+  ws = WarmStartSettings(ckpt_to_initialize_from="/tmp",
+                         vars_to_warm_start=".*input_layer.*")
+  ```
+
+  Warm-start all weights but the embedding parameters corresponding to
+  `sc_vocab_file` have a different vocab from the one used in the current
+  model:
+
+  ```
+  vocab_info = ws_util.VocabInfo(
+      new_vocab=sc_vocab_file.vocabulary_file,
+      new_vocab_size=sc_vocab_file.vocabulary_size,
+      num_oov_buckets=sc_vocab_file.num_oov_buckets,
+      old_vocab="old_vocab.txt"
+  )
+  ws = WarmStartSettings(
+      ckpt_to_initialize_from="/tmp",
+      var_name_to_vocab_info={
+          "input_layer/sc_vocab_file_embedding/embedding_weights": vocab_info
+      })
+  ```
+
+  Warm-start only `sc_vocab_file` embeddings (and no other variables), which
+  have a different vocab from the one used in the current model:
+
+  ```
+  vocab_info = ws_util.VocabInfo(
+      new_vocab=sc_vocab_file.vocabulary_file,
+      new_vocab_size=sc_vocab_file.vocabulary_size,
+      num_oov_buckets=sc_vocab_file.num_oov_buckets,
+      old_vocab="old_vocab.txt"
+  )
+  ws = WarmStartSettings(
+      ckpt_to_initialize_from="/tmp",
+      vars_to_warm_start=None,
+      var_name_to_vocab_info={
+          "input_layer/sc_vocab_file_embedding/embedding_weights": vocab_info
+      })
+  ```
+
+  Warm-start all weights but the parameters corresponding to `sc_vocab_file`
+  have a different vocab from the one used in current checkpoint, and only
+  100 of those entries were used:
+
+  ```
+  vocab_info = ws_util.VocabInfo(
+      new_vocab=sc_vocab_file.vocabulary_file,
+      new_vocab_size=sc_vocab_file.vocabulary_size,
+      num_oov_buckets=sc_vocab_file.num_oov_buckets,
+      old_vocab="old_vocab.txt",
+      old_vocab_size=100
+  )
+  ws = WarmStartSettings(
+      ckpt_to_initialize_from="/tmp",
+      var_name_to_vocab_info={
+          "input_layer/sc_vocab_file_embedding/embedding_weights": vocab_info
+      })
+  ```
+
+  Warm-start all weights but the parameters corresponding to `sc_vocab_file`
+  have a different vocab from the one used in current checkpoint and the
+  parameters corresponding to `sc_vocab_list` have a different name from the
+  current checkpoint:
+
+  ```
+  vocab_info = ws_util.VocabInfo(
+      new_vocab=sc_vocab_file.vocabulary_file,
+      new_vocab_size=sc_vocab_file.vocabulary_size,
+      num_oov_buckets=sc_vocab_file.num_oov_buckets,
+      old_vocab="old_vocab.txt",
+      old_vocab_size=100
+  )
+  ws = WarmStartSettings(
+      ckpt_to_initialize_from="/tmp",
+      var_name_to_vocab_info={
+          "input_layer/sc_vocab_file_embedding/embedding_weights": vocab_info
+      },
+      var_name_to_prev_var_name={
+          "input_layer/sc_vocab_list_embedding/embedding_weights":
+              "old_tensor_name"
+      })
+  ```
 
   Attributes:
     ckpt_to_initialize_from: [Required] A string specifying the directory with
       checkpoint file(s) or path to checkpoint from which to warm-start the
       model parameters.
-    col_to_prev_vocab: [Optional] Dict of `FeatureColumn` to vocabularies used
-      for the `FeatureColumn` in `ckpt_to_initialize_from`.  Vocabularies can
-      be represented either by a string (path to vocabulary), or tuple of
-      (string, int), representing (path of the vocabulary, vocab_size) if only
-      `vocab_size` entries of the old vocabulary were used in the checkpoint. If
-      the dict is not explicitly provided, the vocabularies are assumed to be
-      same between previous and present checkpoints.
-    col_to_prev_tensor: [Optional] Dict of `FeatureColumn` to name of the
-      variable (corresponding to the `FeatureColumn`) in
-      `ckpt_to_initialize_from`. If not explicitly provided, the name of the
-      variable is assumed to be same between previous and present checkpoints.
-    exclude_columns: [Optional] List of `FeatureColumn`s that should not be
-      warm-started from provided checkpoint.
-
-  Example Uses:
-
-  # Feature columns defining transformations on inputs.
-  sc_vocab_file = tf.feature_column.categorical_column_with_vocabulary_file(
-      "sc_vocab_file", "new_vocab.txt", vocab_size=100)
-  sc_vocab_list = tf.feature_column.cateogorical_column_with_vocabulary_list(
-      "sc_vocab_list", vocabulary_list=["a", "b"])
-
-  # Warm-start all weights. The parameters corresponding to "sc_vocab_file" have
-  # the same name and same vocab as current checkpoint. The parameters
-  # corresponding to "sc_vocab_list" have the same name.
-  ws = _WarmStartSettings(ckpt_to_initialize_from="/tmp")
-
-  # Warm-start all weights but the parameters corresponding to "sc_vocab_file"
-  # have a different vocab from the one used in current checkpoint.
-  ws = _WarmStartSettings(ckpt_to_initialize_from="/tmp",
-                          col_to_prev_vocab={sc_vocab_file: "old_vocab.txt"})
-
-  # Warm-start all weights but the parameters corresponding to "sc_vocab_file"
-  # have a different vocab from the one used in current checkpoint, and only
-  # 100 of those entries were used.
-  ws = _WarmStartSettings(ckpt_to_initialize_from="/tmp",
-                          col_to_prev_vocab={sc_vocab_file:
-                                             ("old_vocab.txt", 100)})
-
-  # Warm-start all weights but the parameters corresponding to "sc_vocab_file"
-  # have a different vocab from the one used in current checkpoint and the
-  # parameters corresponding to "sc_vocab_list" have a different name from the
-  # current checkpoint.
-  ws = _WarmStartSettings(ckpt_to_initialize_from="/tmp",
-                          col_to_prev_vocab={sc_vocab_file: "old_vocab.txt"},
-                          col_to_prev_tensor={sc_vocab_list: "old_tensor_name"})
-
-  # Warm-start all weights except those corrresponding to "sc_vocab_file".
-  ws = _WarmStartSettings(ckpt_to_initialize_from="/tmp",
-                          exclude_columns=[sc_vocab_file])
+    vars_to_warm_start: [Optional] A regular expression that captures which
+      variables to warm-start (see tf.get_collection).  Defaults to `'.*'`,
+      which warm-starts all variables.  If `None` is explicitly given, only
+      variables specified in `var_name_to_vocab_info` will be warm-started.
+    var_name_to_vocab_info: [Optional] Dict of variable names (strings) to
+      VocabInfo. The variable names should be "full" variables, not the names
+      of the partitions.  If not explicitly provided, the variable is assumed to
+      have no vocabulary.
+    var_name_to_prev_var_name: [Optional] Dict of variable names (strings) to
+      name of the previously-trained variable in `ckpt_to_initialize_from`. If
+      not explicitly provided, the name of the variable is assumed to be same
+      between previous checkpoint and current model.
   """
 
   def __new__(cls,
               ckpt_to_initialize_from,
-              col_to_prev_vocab=None,
-              col_to_prev_tensor=None,
-              exclude_columns=None):
+              vars_to_warm_start=".*",
+              var_name_to_vocab_info=None,
+              var_name_to_prev_var_name=None):
     if not ckpt_to_initialize_from:
       raise ValueError(
-          "`ckpt_to_initialize_from` MUST be set in _WarmStartSettings")
-    return super(_WarmStartSettings, cls).__new__(
+          "`ckpt_to_initialize_from` MUST be set in WarmStartSettings")
+    return super(WarmStartSettings, cls).__new__(
         cls,
         ckpt_to_initialize_from,
-        col_to_prev_vocab or {},
-        col_to_prev_tensor or {},
-        exclude_columns or [],)
+        vars_to_warm_start,
+        var_name_to_vocab_info or {},
+        var_name_to_prev_var_name or {},
+    )
 
 
 def _is_variable(x):
-  return (isinstance(x, variables.Variable) or
+  return (isinstance(x, variables_lib.Variable) or
           isinstance(x, resource_variable_ops.ResourceVariable))
 
 
@@ -135,11 +262,12 @@ def _infer_var_name(var):
   """
   name_to_var_dict = saver.BaseSaverBuilder.OpListToDict(var)
   if len(name_to_var_dict) > 1:
-    raise TypeError("`var` passed as arg violates the constraints.")
+    raise TypeError("`var` = %s passed as arg violates the constraints.  "
+                    "name_to_var_dict = %s" % (var, name_to_var_dict))
   return list(name_to_var_dict.keys())[0]
 
 
-def _warmstart_var(var, prev_ckpt, prev_tensor_name=None):
+def _warm_start_var(var, prev_ckpt, prev_tensor_name=None):
   """Warm-starts given variable from `prev_tensor_name` tensor in `prev_ckpt`.
 
   Args:
@@ -147,69 +275,26 @@ def _warmstart_var(var, prev_ckpt, prev_tensor_name=None):
       Can be either of the following:
       (i) `Variable`
       (ii) `ResourceVariable`
-      (iii) `PartitionedVariable`
-      (iv) list of `Variable` and/or `PartitionedVariable`: The list may
-        contain one or more variables that has been sharded.  For example:
-        [Variable('a/part_0'), Variable('b/part_0'), Variable('a/part_1'),
-         PartitionedVariable([Variable('c/part_0'), Variable('c/part_1')])]
-        where we have three whole Variables represented ('a', 'b', and 'c').
+      (iii) list of `Variable`: The list must contain slices of the same larger
+        variable.
+      (iv) `PartitionedVariable`
     prev_ckpt: A string specifying the directory with checkpoint file(s) or path
       to checkpoint. The given checkpoint must have tensor with name
       `prev_tensor_name` (if not None) or tensor with name same as given `var`.
     prev_tensor_name: Name of the tensor to lookup in provided `prev_ckpt`. If
       None, we lookup tensor with same name as given `var`.
-
-  Raises:
-    ValueError: If prev_tensor_name is not None, but the given var represents
-      more than one Variable.
-    TypeError: If var is not one of the allowed types.
   """
   if _is_variable(var):
     current_var_name = _infer_var_name([var])
-  elif isinstance(var, variables.PartitionedVariable):
+  elif isinstance(var, list) and all(_is_variable(v) for v in var):
+    current_var_name = _infer_var_name(var)
+  elif isinstance(var, variables_lib.PartitionedVariable):
     current_var_name = _infer_var_name([var])
     var = var._get_variable_list()  # pylint: disable=protected-access
-  elif (isinstance(var, list) and all(
-      _is_variable(v) or isinstance(v, variables.PartitionedVariable)
-      for v in var)):
-    # Convert length-1 lists of vars to single tf.Variables.  This ensures that
-    # checkpoint_utils.init_from_checkpoint() doesn't incorrectly assume
-    # slice info is present.
-    if len(var) == 1:
-      current_var_name = _infer_var_name(var)
-      var = var[0]
-    else:
-      # If we have multiple elements in var, we cannot assume they all
-      # represent the same Variable.
-      name_to_var_dict = saver.BaseSaverBuilder.OpListToDict(
-          var, convert_variable_to_tensor=False)
-      if prev_tensor_name:
-        # Providing a prev_tensor_name is only viable if var representes a
-        # single Variable.
-        if len(name_to_var_dict) > 1:
-          raise ValueError("var represented more than one Variable, but "
-                           "prev_tensor_name was provided.")
-        checkpoint_utils.init_from_checkpoint(prev_ckpt, {
-            prev_tensor_name: var
-        })
-      else:
-        # OpListToDict gives us roughly what we need, but
-        # the values in the dict may be PartitionedVariables (which
-        # init_from_checkpoint does not expect) that we need to convert to
-        # lists.
-        name_to_var_dict_fixed = {}
-        for name, var in six.iteritems(name_to_var_dict):
-          if isinstance(var, variables.PartitionedVariable):
-            name_to_var_dict_fixed[name] = var._get_variable_list()  # pylint: disable=protected-access
-          else:
-            name_to_var_dict_fixed[name] = var
-        checkpoint_utils.init_from_checkpoint(prev_ckpt, name_to_var_dict_fixed)
-      return
   else:
     raise TypeError(
-        "var MUST be one of the following: a Variable, PartitionedVariable, or "
-        "list of Variable's and/or PartitionedVariable's, but is {}".format(
-            type(var)))
+        "var MUST be one of the following: a Variable, list of Variable or "
+        "PartitionedVariable, but is {}".format(type(var)))
   if not prev_tensor_name:
     # Assume tensor name remains the same.
     prev_tensor_name = current_var_name
@@ -219,15 +304,15 @@ def _warmstart_var(var, prev_ckpt, prev_tensor_name=None):
 # pylint: disable=protected-access
 # Accesses protected members of tf.Variable to reset the variable's internal
 # state.
-def _warmstart_var_with_vocab(var,
-                              current_vocab_path,
-                              current_vocab_size,
-                              prev_ckpt,
-                              prev_vocab_path,
-                              previous_vocab_size=-1,
-                              current_oov_buckets=0,
-                              prev_tensor_name=None,
-                              initializer=None):
+def _warm_start_var_with_vocab(var,
+                               current_vocab_path,
+                               current_vocab_size,
+                               prev_ckpt,
+                               prev_vocab_path,
+                               previous_vocab_size=-1,
+                               current_oov_buckets=0,
+                               prev_tensor_name=None,
+                               initializer=None):
   """Warm-starts given variable from `prev_tensor_name` tensor in `prev_ckpt`.
 
   Use this method when the `var` is backed by vocabulary. This method stitches
@@ -270,7 +355,7 @@ def _warmstart_var_with_vocab(var,
     var = [var]
   elif isinstance(var, list) and all(_is_variable(v) for v in var):
     var = var
-  elif isinstance(var, variables.PartitionedVariable):
+  elif isinstance(var, variables_lib.PartitionedVariable):
     var = var._get_variable_list()
   else:
     raise TypeError(
@@ -290,10 +375,10 @@ def _warmstart_var_with_vocab(var,
           full_shape=slice_info.full_shape,
           var_offset=slice_info.var_offset)
 
-    # TODO(vihanjain): Support _WarmstartSettings where class vocabularies need
+    # TODO(eddz): Support WarmStartSettings where class vocabularies need
     # remapping too.
     init = checkpoint_ops._load_and_remap_matrix_initializer(
-        ckpt_path=saver.latest_checkpoint(prev_ckpt),
+        ckpt_path=checkpoint_utils._get_checkpoint_filename(prev_ckpt),
         old_tensor_name=prev_tensor_name,
         new_row_vocab_size=current_vocab_size,
         new_col_vocab_size=v_shape[1],
@@ -311,114 +396,129 @@ def _warmstart_var_with_vocab(var,
 # pylint: enable=protected-access
 
 
-def _warmstart_input_layer(cols_to_vars, warmstart_settings):
-  """Warm-starts input layer of a model using given settings.
+def _warm_start(warm_start_settings):
+  """Warm-starts a model using the given settings.
 
-  Args:
-    cols_to_vars: Dict of feature columns to corresponding graph variables.
-    warmstart_settings: An object of `_WarmStartSettings`.
-
-    Typical usage example:
-
-    ```python
-    tfcl = tf.contrib.layers
-    # Define features and transformations.
-    sc_vocab_list = tf.feature_column.categorical_column_with_vocabulary_list(
-        "sc_vocab_list", vocabulary_list=["a", "b"])
-    sc_vocab_file = tf.feature_column.categorical_column_with_vocabulary_file(
-        "sc_vocab_file", "new_vocab.txt", vocab_size=100)
-    cross = tf.feature_column.crossed_column(
-      [sc_vocab_list, sc_vocab_file], hash_bucket_size=5000)
-
-    all_cols = set(sc_vocab_list, sc_vocab_file, cross)
-    batch_features = tf.parse_example(
-        serialized=serialized_examples,
-        features=tf.contrib.layers.create_feature_spec_for_parsing(all_cols))
-
-    cols_to_vars = {}
-    tf.feature_column.linear_model(
-        features=batch_features,
-        feature_columns=all_cols,
-        units=1,
-        cols_to_vars=cols_to_vars)
-
-    # Warm-start entire input layer.
-    ws_settings = _WarmStartSettings(
-        "/tmp/prev_model_dir",
-        col_to_prev_vocab={sc_vocab_file: "old_vocab.txt"})
-    _warmstart_input_layer(cols_to_vars, ws_settings)
-    # Warm-start bias too.
-    _warmstart_var(cols_to_vars['bias'], ws_settings.ckpt_to_initialize_from)
-    ```
-
-    The above example effectively warm-starts full linear model.
+  If you are using a tf.estimator.Estimator, this will automatically be called
+  during training.
 
+  Args:
+    warm_start_settings: An object of `WarmStartSettings`.
   Raises:
-    ValueError: If a column in cols_to_vars has an entry in
-      warmstart_settings.cols_to_prev_vocab, but is not an instance of
-      _VocabularyFileCategoricalColumn or _EmbeddingColumn.
+    ValueError: If the WarmStartSettings contains prev_var_name or VocabInfo
+      configuration for variable names that are not used.  This is to ensure
+      a stronger check for variable configuration than relying on users to
+      examine the logs.
   """
-  for col, var in six.iteritems(cols_to_vars):
-    if not isinstance(col, feature_column._FeatureColumn):  # pylint: disable=protected-access
-      raise TypeError(
-          "Keys in dict `cols_to_vars` must be of type FeatureColumn. Found "
-          "key of type: {}".format(type(col)))
-    if col in warmstart_settings.exclude_columns:
-      logging.info("Skipping warm-starting column: {}".format(col.name))
-      continue
-
-    prev_tensor_name = warmstart_settings.col_to_prev_tensor.get(col)
-    # pylint: disable=protected-access
-    is_sparse_vocab_column = isinstance(
-        col, feature_column._VocabularyFileCategoricalColumn)
-    is_embedding_vocab_column = (
-        isinstance(col, feature_column._EmbeddingColumn) and
-        isinstance(col.categorical_column,
-                   feature_column._VocabularyFileCategoricalColumn))
-    if is_sparse_vocab_column or is_embedding_vocab_column:
-      # pylint: enable=protected-access
-      initializer = None
-      if is_embedding_vocab_column:
-        initializer = col.initializer
-        vocabulary_file = col.categorical_column.vocabulary_file
-        vocabulary_size = col.categorical_column.vocabulary_size
-        num_oov_buckets = col.categorical_column.num_oov_buckets
-      else:
-        vocabulary_file = col.vocabulary_file
-        vocabulary_size = col.vocabulary_size
-        num_oov_buckets = col.num_oov_buckets
-      prev_vocab = warmstart_settings.col_to_prev_vocab.get(
-          col, vocabulary_file)
-      if isinstance(prev_vocab, str):
-        prev_vocab_path = prev_vocab
-        previous_vocab_size = -1
-        logging.info(
-            "Warm-starting column: {}; prev_vocab: {}; "
-            "prev_tensor: {}".format(col.name, prev_vocab_path,
-                                     (prev_tensor_name or "Unchanged")))
-      elif isinstance(prev_vocab, tuple):
-        prev_vocab_path = prev_vocab[0]
-        previous_vocab_size = prev_vocab[1]
-        logging.info("Warm-starting column: {}; prev_vocab: {} (first {} "
-                     "entries); prev_tensor: {}".format(
-                         col.name, prev_vocab_path, previous_vocab_size,
-                         (prev_tensor_name or "Unchanged")))
-
-      _warmstart_var_with_vocab(
-          var,
-          current_vocab_path=vocabulary_file,
-          current_vocab_size=vocabulary_size,
-          prev_ckpt=warmstart_settings.ckpt_to_initialize_from,
-          prev_vocab_path=prev_vocab_path,
-          previous_vocab_size=previous_vocab_size,
-          current_oov_buckets=num_oov_buckets,
-          prev_tensor_name=prev_tensor_name,
-          initializer=initializer)
+  logging.info("Warm-starting from: %s",
+               (warm_start_settings.ckpt_to_initialize_from,))
+  # We have to deal with partitioned variables, since get_collection flattens
+  # out the list.
+  grouped_variables = {}
+  # Both warm_start_settings.vars_to_warm_start = '.*' and
+  # warm_start_settings.vars_to_warm_start = None will match everything here.
+  for v in ops.get_collection(
+      # TODO(eddz): Allow for different collections here (to support
+      # warm-starting accumulators).
+      ops.GraphKeys.TRAINABLE_VARIABLES,
+      scope=warm_start_settings.vars_to_warm_start):
+    if not isinstance(v, list):
+      var_name = _infer_var_name([v])
     else:
-      if col in warmstart_settings.col_to_prev_vocab:
-        raise ValueError("Vocabulary provided for column %s which is not a "
-                         "_VocabularyFileCategoricalColumn or _EmbeddingColumn")
-      logging.info("Warm-starting column: {}; prev_tensor: {}".format(
-          col.name, prev_tensor_name or "Unchanged"))
-      _warmstart_var(var, warmstart_settings.ckpt_to_initialize_from,
-                     prev_tensor_name)
+      var_name = _infer_var_name(v)
+    grouped_variables.setdefault(var_name, []).append(v)
+
+  # Keep track of which var_names in var_name_to_prev_var_name and
+  # var_name_to_vocab_info have been used.  Err on the safer side by throwing an
+  # exception if any are unused by the end of the loop.  It is easy to misname
+  # a variable during this configuration, in which case without this check, we
+  # would fail to warm-start silently.
+  prev_var_name_used = set()
+  vocab_info_used = set()
+
+  for var_name, variable in six.iteritems(grouped_variables):
+    prev_var_name = warm_start_settings.var_name_to_prev_var_name.get(var_name)
+    if prev_var_name:
+      prev_var_name_used.add(var_name)
+    vocab_info = warm_start_settings.var_name_to_vocab_info.get(var_name)
+    if vocab_info:
+      vocab_info_used.add(var_name)
+      logging.info(
+          "Warm-starting variable: {}; current_vocab: {} current_vocab_size: {}"
+          " prev_vocab: {} prev_vocab_size: {} current_oov: {} prev_tensor: {}"
+          " initializer: {}".format(
+              var_name,
+              vocab_info.new_vocab,
+              vocab_info.new_vocab_size,
+              vocab_info.old_vocab,
+              (vocab_info.old_vocab_size if vocab_info.old_vocab_size > 0
+               else "All"),
+              vocab_info.num_oov_buckets,
+              prev_var_name or "Unchanged",
+              vocab_info.backup_initializer or "zero-initialized"))
+      _warm_start_var_with_vocab(
+          variable,
+          current_vocab_path=vocab_info.new_vocab,
+          current_vocab_size=vocab_info.new_vocab_size,
+          prev_ckpt=warm_start_settings.ckpt_to_initialize_from,
+          prev_vocab_path=vocab_info.old_vocab,
+          previous_vocab_size=vocab_info.old_vocab_size,
+          current_oov_buckets=vocab_info.num_oov_buckets,
+          prev_tensor_name=prev_var_name,
+          initializer=vocab_info.backup_initializer)
+    else:
+      # For the special value of warm_start_settings.vars_to_warm_start = None,
+      # we only warm-start variables with explicitly specified vocabularies.
+      if warm_start_settings.vars_to_warm_start:
+        logging.info("Warm-starting variable: {}; prev_var_name: {}".format(
+            var_name, prev_var_name or "Unchanged"))
+        # Because we use a default empty list in grouped_variables, single
+        # unpartitioned variables will be lists here, which we rectify in order
+        # for init_from_checkpoint logic to work correctly.
+        if len(variable) == 1:
+          variable = variable[0]
+        _warm_start_var(variable, warm_start_settings.ckpt_to_initialize_from,
+                        prev_var_name)
+
+  prev_var_name_not_used = set(
+      warm_start_settings.var_name_to_prev_var_name.keys()) - prev_var_name_used
+  vocab_info_not_used = set(
+      warm_start_settings.var_name_to_vocab_info.keys()) - vocab_info_used
+
+  if prev_var_name_not_used:
+    raise ValueError(
+        "You provided the following variables in "
+        "warm_start_settings.var_name_to_prev_var_name that were not used: "
+        "{0}.  Perhaps you misspelled them?  Here is the list of viable "
+        "variable names: {1}".format(prev_var_name_not_used,
+                                     grouped_variables.keys()))
+  if vocab_info_not_used:
+    raise ValueError(
+        "You provided the following variables in "
+        "warm_start_settings.var_name_to_vocab_info that were not used: {0}. "
+        " Perhaps you misspelled them?  Here is the list of viable variable "
+        "names: {1}".format(vocab_info_not_used, grouped_variables.keys()))
+
+
+def _get_default_warm_start_settings(warm_start_from):
+  """Returns default WarmStartSettings.
+
+  Args:
+    warm_start_from: Either a string representing the filepath of a checkpoint
+      to initialize from, or an instance of WarmStartSettings.
+
+  Returns:
+    Either None or an instance of WarmStartSettings.
+
+  Raises:
+    ValueError: If warm_start_from is not None but is neither a string nor an
+      instance of WarmStartSettings.
+  """
+  if warm_start_from is None:
+    return None
+  if isinstance(warm_start_from, six.string_types):
+    return WarmStartSettings(ckpt_to_initialize_from=warm_start_from)
+  elif isinstance(warm_start_from, WarmStartSettings):
+    return warm_start_from
+  else:
+    raise ValueError("warm_start_from must be a string or a WarmStartSettings")
diff --git a/tensorflow/python/estimator/warm_starting_util_test.py b/tensorflow/python/estimator/warm_starting_util_test.py
index a05dbfd7449c9e108649da9ec5a40fe220233953..3985d9ebd04e6963339fcf9999f6367fe4dadc1a 100644
--- a/tensorflow/python/estimator/warm_starting_util_test.py
+++ b/tensorflow/python/estimator/warm_starting_util_test.py
@@ -50,9 +50,7 @@ class WarmStartingUtilTest(test.TestCase):
     sess.run(variables.global_variables_initializer())
     saver = saver_lib.Saver()
     ckpt_prefix = os.path.join(self.get_temp_dir(), "model")
-    ckpt_state_name = "checkpoint"
-    saver.save(
-        sess, ckpt_prefix, global_step=0, latest_filename=ckpt_state_name)
+    saver.save(sess, ckpt_prefix, global_step=0)
 
   def _create_prev_run_var(self,
                            var_name,
@@ -72,36 +70,6 @@ class WarmStartingUtilTest(test.TestCase):
           var = var._get_variable_list()
         return var, sess.run(var)
 
-  def _create_prev_run_multiple_vars(self,
-                                     var_names,
-                                     initializers,
-                                     shapes=None,
-                                     partitioners=None):
-    if not shapes:
-      shapes = [None] * len(var_names)
-    if not partitioners:
-      partitioners = [None] * len(var_names)
-    with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
-        var_list = []
-        for var_name, shape, initializer, partitioner in zip(
-            var_names, shapes, initializers, partitioners):
-          var_list.append(
-              variable_scope.get_variable(
-                  var_name,
-                  shape=shape,
-                  initializer=initializer,
-                  partitioner=partitioner))
-        self._write_checkpoint(sess)
-        run_vars = []
-        for var, partitioner in zip(var_list, partitioners):
-          if partitioner:
-            self.assertTrue(isinstance(var, variables.PartitionedVariable))
-            run_vars.append(sess.run(var._get_variable_list()))
-          else:
-            run_vars.append(sess.run(var))
-        return var_list, run_vars
-
   def _create_dummy_inputs(self):
     return {
         "sc_int": array_ops.sparse_placeholder(dtypes.int32),
@@ -120,9 +88,7 @@ class WarmStartingUtilTest(test.TestCase):
           feature_columns=feature_cols,
           units=1,
           cols_to_vars=cols_to_vars)
-    # Return a dictionary mapping each column to its variable, dropping the
-    # 'bias' key that's also filled.
-    cols_to_vars.pop("bias")
+    # Return a dictionary mapping each column to its variable.
     return cols_to_vars
 
   def _assert_cols_to_vars(self, cols_to_vars, cols_to_expected_values, sess):
@@ -138,7 +104,7 @@ class WarmStartingUtilTest(test.TestCase):
       with self.test_session(graph=g) as sess:
         fruit_weights = variable_scope.get_variable(
             "fruit_weights", initializer=[[0.], [0.], [0.], [0.]])
-        ws_util._warmstart_var(fruit_weights, self.get_temp_dir())
+        ws_util._warm_start_var(fruit_weights, self.get_temp_dir())
         sess.run(variables.global_variables_initializer())
         self.assertAllEqual(prev_val, fruit_weights.eval(sess))
 
@@ -154,7 +120,7 @@ class WarmStartingUtilTest(test.TestCase):
       with self.test_session(graph=g) as sess:
         fruit_weights = variable_scope.get_variable(
             "fruit_weights", initializer=[[0.], [0.], [0.], [0.]])
-        ws_util._warmstart_var(fruit_weights, self.get_temp_dir())
+        ws_util._warm_start_var(fruit_weights, self.get_temp_dir())
         sess.run(variables.global_variables_initializer())
         self.assertAllEqual(prev_val, fruit_weights.eval(sess))
 
@@ -171,7 +137,7 @@ class WarmStartingUtilTest(test.TestCase):
             partitioner=lambda shape, dtype: [2, 1])
         self.assertTrue(
             isinstance(fruit_weights, variables.PartitionedVariable))
-        ws_util._warmstart_var(fruit_weights, self.get_temp_dir())
+        ws_util._warm_start_var(fruit_weights, self.get_temp_dir())
         sess.run(variables.global_variables_initializer())
         fruit_weights = fruit_weights._get_variable_list()
         new_val = np.concatenate(
@@ -195,7 +161,7 @@ class WarmStartingUtilTest(test.TestCase):
             partitioner=lambda shape, dtype: [2, 1])
         self.assertTrue(
             isinstance(fruit_weights, variables.PartitionedVariable))
-        ws_util._warmstart_var(
+        ws_util._warm_start_var(
             fruit_weights,
             self.get_temp_dir(),
             prev_tensor_name="old_scope/fruit_weights")
@@ -205,103 +171,10 @@ class WarmStartingUtilTest(test.TestCase):
             [fruit_weights[0].eval(sess), fruit_weights[1].eval(sess)], axis=0)
         self.assertAllEqual(prev_val, new_val)
 
-  def testWarmStartVarMultipleVars(self):
-    _, prev_vals = self._create_prev_run_multiple_vars(
-        var_names=["fruit_weights", "other_weights"],
-        initializers=[[[0.5], [1.], [1.5], [2.]], [[.05], [.1], [.15], [.2]]])
-
-    with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
-        fruit_weights = variable_scope.get_variable(
-            "fruit_weights", initializer=[[0.], [0.], [0.], [0.]])
-        other_weights = variable_scope.get_variable(
-            "other_weights", initializer=[[0.], [0.], [0.], [0.]])
-        ws_util._warmstart_var([fruit_weights, other_weights],
-                               self.get_temp_dir())
-        sess.run(variables.global_variables_initializer())
-        self.assertAllEqual(prev_vals[0], fruit_weights.eval(sess))
-        self.assertAllEqual(prev_vals[1], other_weights.eval(sess))
-
-  def testWarmStartVarMultipleVarsBothPartitioned(self):
-    _, prev_vals = self._create_prev_run_multiple_vars(
-        var_names=["fruit_weights", "other_weights"],
-        shapes=[[4, 1], [4, 1]],
-        initializers=[[[0.5], [1.], [1.5], [2.]], [[.05], [.1], [.15], [.2]]],
-        partitioners=[lambda shape, dtype: [2, 1], lambda shape, dtype: [2, 1]])
-
-    with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
-        fruit_weights = variable_scope.get_variable(
-            "fruit_weights",
-            shape=[4, 1],
-            initializer=[[0.], [0.], [0.], [0.]],
-            partitioner=lambda shape, dtype: [2, 1])
-        other_weights = variable_scope.get_variable(
-            "other_weights",
-            shape=[4, 1],
-            initializer=[[0.], [0.], [0.], [0.]],
-            partitioner=lambda shape, dtype: [2, 1])
-        ws_util._warmstart_var([fruit_weights, other_weights],
-                               self.get_temp_dir())
-        sess.run(variables.global_variables_initializer())
-        fruit_weights = fruit_weights._get_variable_list()
-        new_fruit_weights_val = np.concatenate(
-            [fruit_weights[0].eval(sess), fruit_weights[1].eval(sess)], axis=0)
-        other_weights = other_weights._get_variable_list()
-        new_other_weights_val = np.concatenate(
-            [other_weights[0].eval(sess), other_weights[1].eval(sess)], axis=0)
-        self.assertAllEqual(
-            np.concatenate(prev_vals[0], axis=0), new_fruit_weights_val)
-        self.assertAllEqual(
-            np.concatenate(prev_vals[1], axis=0), new_other_weights_val)
-
-  def testWarmStartVarMultipleVarsMixOfPartitions(self):
-    # First is not partitioned, but the second two are.
-    _, prev_vals = self._create_prev_run_multiple_vars(
-        var_names=["fruit_weights", "other_weights", "veggie_weights"],
-        shapes=[None, [4, 1], [4, 1]],
-        initializers=[[[0.5], [1.], [1.5], [2.]], [[.05], [.1], [.15], [.2]],
-                      [[5.], [10.], [15.], [20.]]],
-        partitioners=[
-            None, lambda shape, dtype: [2, 1], lambda shape, dtype: [2, 1]
-        ])
-
-    with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
-        fruit_weights = variable_scope.get_variable(
-            "fruit_weights", initializer=[[0.], [0.], [0.], [0.]])
-        other_weights = variable_scope.get_variable(
-            "other_weights",
-            shape=[4, 1],
-            initializer=[[0.], [0.], [0.], [0.]],
-            partitioner=lambda shape, dtype: [2, 1])
-        veggie_weights = variable_scope.get_variable(
-            "veggie_weights",
-            shape=[4, 1],
-            initializer=[[0.], [0.], [0.], [0.]],
-            partitioner=lambda shape, dtype: [2, 1])
-        # Flatten one of the partitioned variables.
-        ws_util._warmstart_var([fruit_weights, other_weights] +
-                               veggie_weights._get_variable_list(),
-                               self.get_temp_dir())
-        sess.run(variables.global_variables_initializer())
-        veggie_weights = veggie_weights._get_variable_list()
-        new_veggie_weights_val = np.concatenate(
-            [veggie_weights[0].eval(sess), veggie_weights[1].eval(sess)],
-            axis=0)
-        other_weights = other_weights._get_variable_list()
-        new_other_weights_val = np.concatenate(
-            [other_weights[0].eval(sess), other_weights[1].eval(sess)], axis=0)
-        self.assertAllEqual(prev_vals[0], fruit_weights.eval(sess))
-        self.assertAllEqual(
-            np.concatenate(prev_vals[1], axis=0), new_other_weights_val)
-        self.assertAllEqual(
-            np.concatenate(prev_vals[2], axis=0), new_veggie_weights_val)
-
   def testWarmStartVarWithVocab(self):
     prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                         "old_vocab")
-    _, _ = self._create_prev_run_var(
+    self._create_prev_run_var(
         "fruit_weights", initializer=[[0.5], [1.], [1.5], [2.]])
 
     # New vocab with elements in reverse order and one new element.
@@ -312,8 +185,8 @@ class WarmStartingUtilTest(test.TestCase):
       with self.test_session(graph=g) as sess:
         fruit_weights = variable_scope.get_variable(
             "fruit_weights", initializer=[[0.], [0.], [0.], [0.], [0.]])
-        ws_util._warmstart_var_with_vocab(fruit_weights, new_vocab_path, 5,
-                                          self.get_temp_dir(), prev_vocab_path)
+        ws_util._warm_start_var_with_vocab(fruit_weights, new_vocab_path, 5,
+                                           self.get_temp_dir(), prev_vocab_path)
         sess.run(variables.global_variables_initializer())
         self.assertAllEqual([[2.], [1.5], [1.], [0.5], [0.]],
                             fruit_weights.eval(sess))
@@ -321,7 +194,7 @@ class WarmStartingUtilTest(test.TestCase):
   def testWarmStartVarWithVocabConstrainedOldVocabSize(self):
     prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                         "old_vocab")
-    _, _ = self._create_prev_run_var(
+    self._create_prev_run_var(
         "fruit_weights", initializer=[[0.5], [1.], [1.5], [2.]])
 
     # New vocab with elements in reverse order and one new element.
@@ -332,7 +205,7 @@ class WarmStartingUtilTest(test.TestCase):
       with self.test_session(graph=g) as sess:
         fruit_weights = variable_scope.get_variable(
             "fruit_weights", initializer=[[0.], [0.], [0.], [0.], [0.]])
-        ws_util._warmstart_var_with_vocab(
+        ws_util._warm_start_var_with_vocab(
             fruit_weights,
             new_vocab_path,
             5,
@@ -347,7 +220,7 @@ class WarmStartingUtilTest(test.TestCase):
   def testWarmStartVarWithVocabPrevVarPartitioned(self):
     prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                         "old_vocab")
-    _, _ = self._create_prev_run_var(
+    self._create_prev_run_var(
         "fruit_weights",
         shape=[4, 1],
         initializer=[[0.5], [1.], [1.5], [2.]],
@@ -361,8 +234,8 @@ class WarmStartingUtilTest(test.TestCase):
       with self.test_session(graph=g) as sess:
         fruit_weights = variable_scope.get_variable(
             "fruit_weights", initializer=[[0.], [0.], [0.], [0.], [0.]])
-        ws_util._warmstart_var_with_vocab(fruit_weights, new_vocab_path, 5,
-                                          self.get_temp_dir(), prev_vocab_path)
+        ws_util._warm_start_var_with_vocab(fruit_weights, new_vocab_path, 5,
+                                           self.get_temp_dir(), prev_vocab_path)
         sess.run(variables.global_variables_initializer())
         self.assertAllEqual([[2.], [1.5], [1.], [0.5], [0.]],
                             fruit_weights.eval(sess))
@@ -370,7 +243,7 @@ class WarmStartingUtilTest(test.TestCase):
   def testWarmStartVarWithVocabCurrentVarPartitioned(self):
     prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                         "old_vocab")
-    _, _ = self._create_prev_run_var(
+    self._create_prev_run_var(
         "fruit_weights", initializer=[[0.5], [1.], [1.5], [2.]])
 
     # New vocab with elements in reverse order and one new element.
@@ -384,7 +257,7 @@ class WarmStartingUtilTest(test.TestCase):
             shape=[6, 1],
             initializer=[[0.], [0.], [0.], [0.], [0.], [0.]],
             partitioner=lambda shape, dtype: [2, 1])
-        ws_util._warmstart_var_with_vocab(
+        ws_util._warm_start_var_with_vocab(
             fruit_weights,
             new_vocab_path,
             5,
@@ -403,7 +276,7 @@ class WarmStartingUtilTest(test.TestCase):
   def testWarmStartVarWithVocabBothVarsPartitioned(self):
     prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                         "old_vocab")
-    _, _ = self._create_prev_run_var(
+    self._create_prev_run_var(
         "fruit_weights",
         shape=[4, 1],
         initializer=[[0.5], [1.], [1.5], [2.]],
@@ -421,8 +294,8 @@ class WarmStartingUtilTest(test.TestCase):
             shape=[6, 1],
             initializer=[[0.], [0.], [0.], [0.], [0.], [0.]],
             partitioner=lambda shape, dtype: [2, 1])
-        ws_util._warmstart_var_with_vocab(fruit_weights, new_vocab_path, 6,
-                                          self.get_temp_dir(), prev_vocab_path)
+        ws_util._warm_start_var_with_vocab(fruit_weights, new_vocab_path, 6,
+                                           self.get_temp_dir(), prev_vocab_path)
         sess.run(variables.global_variables_initializer())
         self.assertTrue(
             isinstance(fruit_weights, variables.PartitionedVariable))
@@ -432,7 +305,7 @@ class WarmStartingUtilTest(test.TestCase):
         self.assertAllEqual([[0.5], [0.], [0.]],
                             fruit_weights_vars[1].eval(sess))
 
-  def testWarmStartInputLayer_SparseColumnIntegerized(self):
+  def testWarmStart_SparseColumnIntegerized(self):
     # Create feature column.
     sc_int = fc.categorical_column_with_identity("sc_int", num_buckets=10)
 
@@ -443,28 +316,28 @@ class WarmStartingUtilTest(test.TestCase):
     self.assertAllEqual(np.ones([10, 1]), prev_int_val)
 
     partitioner = lambda shape, dtype: [1] * len(shape)
-    # New graph, new session WITHOUT warmstarting.
+    # New graph, new session WITHOUT warm-starting.
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_int], partitioner)
         sess.run(variables.global_variables_initializer())
-        # Without warmstarting, the weights should be initialized using default
+        # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {sc_int: [np.zeros([10, 1])]},
                                   sess)
 
-    # New graph, new session with warmstarting.
+    # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_int], partitioner)
-        ws_util._warmstart_input_layer(cols_to_vars,
-                                       ws_util._WarmStartSettings(
-                                           self.get_temp_dir()))
+        ws_util._warm_start(
+            ws_util.WarmStartSettings(
+                self.get_temp_dir(), vars_to_warm_start=".*sc_int.*"))
         sess.run(variables.global_variables_initializer())
-        # Verify weights were correctly warmstarted.
+        # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {sc_int: [prev_int_val]}, sess)
 
-  def testWarmStartInputLayer_SparseColumnHashed(self):
+  def testWarmStart_SparseColumnHashed(self):
     # Create feature column.
     sc_hash = fc.categorical_column_with_hash_bucket(
         "sc_hash", hash_bucket_size=15)
@@ -474,29 +347,66 @@ class WarmStartingUtilTest(test.TestCase):
         "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
 
     partitioner = lambda shape, dtype: [1] * len(shape)
-    # New graph, new session WITHOUT warmstarting.
+    # New graph, new session WITHOUT warm-starting.
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_hash], partitioner)
         sess.run(variables.global_variables_initializer())
-        # Without warmstarting, the weights should be initialized using default
+        # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {sc_hash: [np.zeros([15, 1])]},
                                   sess)
 
-    # New graph, new session with warmstarting.
+    # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_hash], partitioner)
-        ws_util._warmstart_input_layer(cols_to_vars,
-                                       ws_util._WarmStartSettings(
-                                           self.get_temp_dir()))
+        ws_util._warm_start(
+            ws_util.WarmStartSettings(
+                self.get_temp_dir(), vars_to_warm_start=".*sc_hash.*"))
         sess.run(variables.global_variables_initializer())
-        # Verify weights were correctly warmstarted.
+        # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {sc_hash: [prev_hash_val]},
                                   sess)
 
-  def testWarmStartInputLayer_SparseColumnVocabulary(self):
+  def testWarmStart_SparseColumnVocabulary(self):
+    # Create vocab for sparse column "sc_vocab".
+    vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
+                                   "vocab")
+    # Create feature column.
+    sc_vocab = fc.categorical_column_with_vocabulary_file(
+        "sc_vocab", vocabulary_file=vocab_path, vocabulary_size=4)
+
+    # Save checkpoint from which to warm-start.
+    _, prev_vocab_val = self._create_prev_run_var(
+        "linear_model/sc_vocab/weights", shape=[4, 1], initializer=ones())
+
+    partitioner = lambda shape, dtype: [1] * len(shape)
+    # New graph, new session WITHOUT warm-starting.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
+        sess.run(variables.global_variables_initializer())
+        # Without warm-starting, the weights should be initialized using default
+        # initializer (which is init_ops.zeros_initializer).
+        self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [np.zeros([4, 1])]},
+                                  sess)
+
+    # New graph, new session with warm-starting.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
+        # Since old vocab is not explicitly set in WarmStartSettings, the old
+        # vocab is assumed to be same as new vocab.
+        ws_util._warm_start(
+            ws_util.WarmStartSettings(
+                self.get_temp_dir(), vars_to_warm_start=".*sc_vocab.*"))
+        sess.run(variables.global_variables_initializer())
+        # Verify weights were correctly warm-started.
+        self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [prev_vocab_val]},
+                                  sess)
+
+  def testWarmStart_ExplicitCheckpointFile(self):
     # Create vocab for sparse column "sc_vocab".
     vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                    "vocab")
@@ -509,31 +419,33 @@ class WarmStartingUtilTest(test.TestCase):
         "linear_model/sc_vocab/weights", shape=[4, 1], initializer=ones())
 
     partitioner = lambda shape, dtype: [1] * len(shape)
-    # New graph, new session WITHOUT warmstarting.
+    # New graph, new session WITHOUT warm-starting.
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
         sess.run(variables.global_variables_initializer())
-        # Without warmstarting, the weights should be initialized using default
+        # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [np.zeros([4, 1])]},
                                   sess)
 
-    # New graph, new session with warmstarting.
+    # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
         # Since old vocab is not explicitly set in WarmStartSettings, the old
         # vocab is assumed to be same as new vocab.
-        ws_util._warmstart_input_layer(cols_to_vars,
-                                       ws_util._WarmStartSettings(
-                                           self.get_temp_dir()))
+        ws_util._warm_start(
+            ws_util.WarmStartSettings(
+                # Explicitly provide the file prefix instead of just the dir.
+                os.path.join(self.get_temp_dir(), "model-0"),
+                vars_to_warm_start=".*sc_vocab.*"))
         sess.run(variables.global_variables_initializer())
-        # Verify weights were correctly warmstarted.
+        # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [prev_vocab_val]},
                                   sess)
 
-  def testWarmStartInputLayer_SparseColumnVocabularyConstrainedVocabSizes(self):
+  def testWarmStart_SparseColumnVocabularyConstrainedVocabSizes(self):
     # Create old vocabulary, and use a size smaller than the total number of
     # entries.
     old_vocab_path = self._write_vocab(["apple", "guava", "banana"],
@@ -553,32 +465,39 @@ class WarmStartingUtilTest(test.TestCase):
         "linear_model/sc_vocab/weights", shape=[2, 1], initializer=ones())
 
     partitioner = lambda shape, dtype: [1] * len(shape)
-    # New graph, new session WITHOUT warmstarting.
+    # New graph, new session WITHOUT warm-starting.
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
         sess.run(variables.global_variables_initializer())
-        # Without warmstarting, the weights should be initialized using default
+        # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [np.zeros([2, 1])]},
                                   sess)
 
-    # New graph, new session with warmstarting.
+    # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
-        warmstart_settings = ws_util._WarmStartSettings(
+        vocab_info = ws_util.VocabInfo(
+            new_vocab=sc_vocab.vocabulary_file,
+            new_vocab_size=sc_vocab.vocabulary_size,
+            num_oov_buckets=sc_vocab.num_oov_buckets,
+            old_vocab=old_vocab_path,
+            old_vocab_size=old_vocab_size)
+        warm_start_settings = ws_util.WarmStartSettings(
             ckpt_to_initialize_from=self.get_temp_dir(),
-            col_to_prev_vocab={
-                sc_vocab: (old_vocab_path, old_vocab_size)
+            vars_to_warm_start=".*sc_vocab.*",
+            var_name_to_vocab_info={
+                "linear_model/sc_vocab/weights": vocab_info
             })
-        ws_util._warmstart_input_layer(cols_to_vars, warmstart_settings)
+        ws_util._warm_start(warm_start_settings)
         sess.run(variables.global_variables_initializer())
-        # Verify weights were correctly warmstarted.  'banana' isn't in the
+        # Verify weights were correctly warm-started.  'banana' isn't in the
         # first two entries of the old vocabulary, so it's newly initialized.
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [[[1], [0]]]}, sess)
 
-  def testWarmStartInputLayer_BucketizedColumn(self):
+  def testWarmStart_BucketizedColumn(self):
     # Create feature column.
     real = fc.numeric_column("real")
     real_bucket = fc.bucketized_column(real, boundaries=[0., 1., 2., 3.])
@@ -590,29 +509,29 @@ class WarmStartingUtilTest(test.TestCase):
         initializer=norms())
 
     partitioner = lambda shape, dtype: [1] * len(shape)
-    # New graph, new session WITHOUT warmstarting.
+    # New graph, new session WITHOUT warm-starting.
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([real_bucket], partitioner)
         sess.run(variables.global_variables_initializer())
-        # Without warmstarting, the weights should be initialized using default
+        # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars,
                                   {real_bucket: [np.zeros([5, 1])]}, sess)
 
-    # New graph, new session with warmstarting.
+    # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([real_bucket], partitioner)
-        ws_util._warmstart_input_layer(cols_to_vars,
-                                       ws_util._WarmStartSettings(
-                                           self.get_temp_dir()))
+        ws_util._warm_start(
+            ws_util.WarmStartSettings(
+                self.get_temp_dir(), vars_to_warm_start=".*real_bucketized.*"))
         sess.run(variables.global_variables_initializer())
-        # Verify weights were correctly warmstarted.
+        # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars,
                                   {real_bucket: [prev_bucket_val]}, sess)
 
-  def testWarmStartInputLayer_MultipleCols(self):
+  def testWarmStart_MultipleCols(self):
     # Create vocab for sparse column "sc_vocab".
     vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                    "vocab")
@@ -630,7 +549,8 @@ class WarmStartingUtilTest(test.TestCase):
     cross = fc.crossed_column([sc_keys, sc_vocab], hash_bucket_size=20)
     all_linear_cols = [sc_int, sc_hash, sc_keys, sc_vocab, real_bucket, cross]
 
-    # Save checkpoint from which to warm-start.
+    # Save checkpoint from which to warm-start.  Also create a bias variable,
+    # so we can check that it's also warm-started.
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         sc_int_weights = variable_scope.get_variable(
@@ -649,22 +569,24 @@ class WarmStartingUtilTest(test.TestCase):
             "linear_model/sc_keys_X_sc_vocab/weights",
             shape=[20, 1],
             initializer=rand())
+        bias = variable_scope.get_variable(
+            "linear_model/bias_weights",
+            shape=[1],
+            initializer=rand())
         self._write_checkpoint(sess)
         (prev_int_val, prev_hash_val, prev_keys_val, prev_vocab_val,
-         prev_bucket_val, prev_cross_val) = sess.run([
+         prev_bucket_val, prev_cross_val, prev_bias_val) = sess.run([
              sc_int_weights, sc_hash_weights, sc_keys_weights, sc_vocab_weights,
-             real_bucket_weights, cross_weights
+             real_bucket_weights, cross_weights, bias
          ])
-        # Verify we initialized the values correctly.
-        self.assertAllEqual(np.ones([10, 1]), prev_int_val)
 
     partitioner = lambda shape, dtype: [1] * len(shape)
-    # New graph, new session WITHOUT warmstarting.
+    # New graph, new session WITHOUT warm-starting.
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
         sess.run(variables.global_variables_initializer())
-        # Without warmstarting, all weights should be initialized using default
+        # Without warm-starting, all weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {
             sc_int: [np.zeros([10, 1])],
@@ -675,15 +597,23 @@ class WarmStartingUtilTest(test.TestCase):
             cross: [np.zeros([20, 1])],
         }, sess)
 
-    # New graph, new session with warmstarting.
+    # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
-        ws_util._warmstart_input_layer(cols_to_vars,
-                                       ws_util._WarmStartSettings(
-                                           self.get_temp_dir()))
+        vocab_info = ws_util.VocabInfo(
+            new_vocab=sc_vocab.vocabulary_file,
+            new_vocab_size=sc_vocab.vocabulary_size,
+            num_oov_buckets=sc_vocab.num_oov_buckets,
+            old_vocab=vocab_path)
+        ws_util._warm_start(
+            ws_util.WarmStartSettings(
+                self.get_temp_dir(),
+                var_name_to_vocab_info={
+                    "linear_model/sc_vocab/weights": vocab_info
+                }))
         sess.run(variables.global_variables_initializer())
-        # Verify weights were correctly warmstarted.
+        # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {
             sc_int: [prev_int_val],
             sc_hash: [prev_hash_val],
@@ -691,9 +621,10 @@ class WarmStartingUtilTest(test.TestCase):
             sc_vocab: [prev_vocab_val],
             real_bucket: [prev_bucket_val],
             cross: [prev_cross_val],
+            "bias": [prev_bias_val],
         }, sess)
 
-  def testWarmStartInputLayerMoreSettings(self):
+  def testWarmStartMoreSettings(self):
     # Create old and new vocabs for sparse column "sc_vocab".
     prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                         "old_vocab")
@@ -712,11 +643,11 @@ class WarmStartingUtilTest(test.TestCase):
     # Save checkpoint from which to warm-start.
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
-        _ = variable_scope.get_variable(
+        variable_scope.get_variable(
             "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
         sc_keys_weights = variable_scope.get_variable(
             "some_other_name", shape=[4, 1], initializer=rand())
-        _ = variable_scope.get_variable(
+        variable_scope.get_variable(
             "linear_model/sc_vocab/weights",
             initializer=[[0.5], [1.], [2.], [3.]])
         self._write_checkpoint(sess)
@@ -728,20 +659,30 @@ class WarmStartingUtilTest(test.TestCase):
       partitions[0] = min(2, shape[0].value)
       return partitions
 
-    # New graph, new session with warmstarting.
+    # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         cols_to_vars = self._create_linear_model(all_linear_cols, _partitioner)
-        ws_settings = ws_util._WarmStartSettings(
+        vocab_info = ws_util.VocabInfo(
+            new_vocab=sc_vocab.vocabulary_file,
+            new_vocab_size=sc_vocab.vocabulary_size,
+            num_oov_buckets=sc_vocab.num_oov_buckets,
+            old_vocab=prev_vocab_path)
+        ws_settings = ws_util.WarmStartSettings(
             self.get_temp_dir(),
-            col_to_prev_vocab={sc_vocab: prev_vocab_path},
-            col_to_prev_tensor={sc_keys: "some_other_name"},
-            exclude_columns=[sc_hash])
-        ws_util._warmstart_input_layer(cols_to_vars, ws_settings)
+            vars_to_warm_start=".*(sc_keys|sc_vocab).*",
+            var_name_to_vocab_info={
+                ws_util._infer_var_name(cols_to_vars[sc_vocab]): vocab_info
+            },
+            var_name_to_prev_var_name={
+                ws_util._infer_var_name(cols_to_vars[sc_keys]):
+                    "some_other_name"
+            })
+        ws_util._warm_start(ws_settings)
         sess.run(variables.global_variables_initializer())
-        # Verify weights were correctly warmstarted.  Var corresponding to
+        # Verify weights were correctly warm-started.  Var corresponding to
         # sc_hash should not be warm-started.  Var corresponding to sc_vocab
-        # should be correctly warmstarted after vocab remapping.
+        # should be correctly warm-started after vocab remapping.
         self._assert_cols_to_vars(cols_to_vars, {
             sc_keys:
                 np.split(prev_keys_val, 2),
@@ -752,7 +693,140 @@ class WarmStartingUtilTest(test.TestCase):
             ]
         }, sess)
 
-  def testWarmStartInputLayerEmbeddingColumn(self):
+  def testWarmStartMoreSettingsNoPartitioning(self):
+    # Create old and new vocabs for sparse column "sc_vocab".
+    prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
+                                        "old_vocab")
+    new_vocab_path = self._write_vocab(
+        ["orange", "guava", "banana", "apple", "raspberry",
+         "blueberry"], "new_vocab")
+    # Create feature columns.
+    sc_hash = fc.categorical_column_with_hash_bucket(
+        "sc_hash", hash_bucket_size=15)
+    sc_keys = fc.categorical_column_with_vocabulary_list(
+        "sc_keys", vocabulary_list=["a", "b", "c", "e"])
+    sc_vocab = fc.categorical_column_with_vocabulary_file(
+        "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
+    all_linear_cols = [sc_hash, sc_keys, sc_vocab]
+
+    # Save checkpoint from which to warm-start.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        variable_scope.get_variable(
+            "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
+        sc_keys_weights = variable_scope.get_variable(
+            "some_other_name", shape=[4, 1], initializer=rand())
+        variable_scope.get_variable(
+            "linear_model/sc_vocab/weights",
+            initializer=[[0.5], [1.], [2.], [3.]])
+        self._write_checkpoint(sess)
+        prev_keys_val = sess.run(sc_keys_weights)
+
+    # New graph, new session with warm-starting.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        cols_to_vars = self._create_linear_model(all_linear_cols,
+                                                 partitioner=None)
+        vocab_info = ws_util.VocabInfo(
+            new_vocab=sc_vocab.vocabulary_file,
+            new_vocab_size=sc_vocab.vocabulary_size,
+            num_oov_buckets=sc_vocab.num_oov_buckets,
+            old_vocab=prev_vocab_path)
+        ws_settings = ws_util.WarmStartSettings(
+            self.get_temp_dir(),
+            vars_to_warm_start=".*(sc_keys|sc_vocab).*",
+            var_name_to_vocab_info={
+                ws_util._infer_var_name(cols_to_vars[sc_vocab]): vocab_info
+            },
+            var_name_to_prev_var_name={
+                ws_util._infer_var_name(cols_to_vars[sc_keys]):
+                    "some_other_name"
+            })
+        ws_util._warm_start(ws_settings)
+        sess.run(variables.global_variables_initializer())
+        # Verify weights were correctly warm-started.  Var corresponding to
+        # sc_hash should not be warm-started.  Var corresponding to sc_vocab
+        # should be correctly warm-started after vocab remapping.
+        self._assert_cols_to_vars(cols_to_vars, {
+            sc_keys: [prev_keys_val],
+            sc_hash: [np.zeros([15, 1])],
+            sc_vocab: [np.array([[3.], [2.], [1.], [0.5], [0.], [0.]])]
+        }, sess)
+
+  def testWarmStartVarsToWarmstartIsNone(self):
+    # Create old and new vocabs for sparse column "sc_vocab".
+    prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
+                                        "old_vocab")
+    new_vocab_path = self._write_vocab(
+        ["orange", "guava", "banana", "apple", "raspberry",
+         "blueberry"], "new_vocab")
+    # Create feature columns.
+    sc_hash = fc.categorical_column_with_hash_bucket(
+        "sc_hash", hash_bucket_size=15)
+    sc_keys = fc.categorical_column_with_vocabulary_list(
+        "sc_keys", vocabulary_list=["a", "b", "c", "e"])
+    sc_vocab = fc.categorical_column_with_vocabulary_file(
+        "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
+    all_linear_cols = [sc_hash, sc_keys, sc_vocab]
+
+    # Save checkpoint from which to warm-start.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        variable_scope.get_variable(
+            "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
+        variable_scope.get_variable(
+            "some_other_name", shape=[4, 1], initializer=rand())
+        variable_scope.get_variable(
+            "linear_model/sc_vocab/weights",
+            initializer=[[0.5], [1.], [2.], [3.]])
+        self._write_checkpoint(sess)
+
+    def _partitioner(shape, dtype):  # pylint:disable=unused-argument
+      # Partition each var into 2 equal slices.
+      partitions = [1] * len(shape)
+      partitions[0] = min(2, shape[0].value)
+      return partitions
+
+    # New graph, new session with warm-starting.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        cols_to_vars = self._create_linear_model(all_linear_cols, _partitioner)
+        vocab_info = ws_util.VocabInfo(
+            new_vocab=sc_vocab.vocabulary_file,
+            new_vocab_size=sc_vocab.vocabulary_size,
+            num_oov_buckets=sc_vocab.num_oov_buckets,
+            old_vocab=prev_vocab_path)
+        ws_settings = ws_util.WarmStartSettings(
+            self.get_temp_dir(),
+            # The special value of None here will ensure that only the variable
+            # specified in var_name_to_vocab_info (sc_vocab embedding) is
+            # warm-started.
+            vars_to_warm_start=None,
+            var_name_to_vocab_info={
+                ws_util._infer_var_name(cols_to_vars[sc_vocab]): vocab_info
+            },
+            # Even though this is provided, the None value for
+            # vars_to_warm_start overrides the logic, and this will not be
+            # warm-started.
+            var_name_to_prev_var_name={
+                ws_util._infer_var_name(cols_to_vars[sc_keys]):
+                    "some_other_name"
+            })
+        ws_util._warm_start(ws_settings)
+        sess.run(variables.global_variables_initializer())
+        # Verify weights were correctly warm-started.  Var corresponding to
+        # sc_vocab should be correctly warm-started after vocab remapping,
+        # and neither of the other two should be warm-started..
+        self._assert_cols_to_vars(cols_to_vars, {
+            sc_keys: [np.zeros([2, 1]), np.zeros([2, 1])],
+            sc_hash: [np.zeros([8, 1]), np.zeros([7, 1])],
+            sc_vocab: [
+                np.array([[3.], [2.], [1.]]),
+                np.array([[0.5], [0.], [0.]])
+            ]
+        }, sess)
+
+  def testWarmStartEmbeddingColumn(self):
     # Create old and new vocabs for embedding column "sc_vocab".
     prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                         "old_vocab")
@@ -763,7 +837,7 @@ class WarmStartingUtilTest(test.TestCase):
     # Save checkpoint from which to warm-start.
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
-        _ = variable_scope.get_variable(
+        variable_scope.get_variable(
             "input_layer/sc_vocab_embedding/embedding_weights",
             initializer=[[0.5, 0.4], [1., 1.1], [2., 2.2], [3., 3.3]])
         self._write_checkpoint(sess)
@@ -774,58 +848,167 @@ class WarmStartingUtilTest(test.TestCase):
       partitions[0] = min(2, shape[0].value)
       return partitions
 
+    # Create feature columns.
+    sc_vocab = fc.categorical_column_with_vocabulary_file(
+        "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
+    emb_vocab_column = fc.embedding_column(
+        categorical_column=sc_vocab,
+        dimension=2)
+    all_deep_cols = [emb_vocab_column]
+    # New graph, new session with warm-starting.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        cols_to_vars = {}
+        with variable_scope.variable_scope("", partitioner=_partitioner):
+          # Create the variables.
+          fc.input_layer(
+              features=self._create_dummy_inputs(),
+              feature_columns=all_deep_cols,
+              cols_to_vars=cols_to_vars)
+        vocab_info = ws_util.VocabInfo(
+            new_vocab=sc_vocab.vocabulary_file,
+            new_vocab_size=sc_vocab.vocabulary_size,
+            num_oov_buckets=sc_vocab.num_oov_buckets,
+            old_vocab=prev_vocab_path,
+            # Can't use constant_initializer with load_and_remap.  In practice,
+            # use a truncated normal initializer.
+            backup_initializer=init_ops.random_uniform_initializer(
+                minval=0.42, maxval=0.42))
+        ws_settings = ws_util.WarmStartSettings(
+            self.get_temp_dir(),
+            var_name_to_vocab_info={
+                ws_util._infer_var_name(cols_to_vars[emb_vocab_column]):
+                    vocab_info
+            })
+        ws_util._warm_start(ws_settings)
+        sess.run(variables.global_variables_initializer())
+        # Verify weights were correctly warm-started. Var corresponding to
+        # emb_vocab_column should be correctly warm-started after vocab
+        # remapping. Missing values are filled in with the EmbeddingColumn's
+        # initializer.
+        self._assert_cols_to_vars(
+            cols_to_vars, {
+                emb_vocab_column: [
+                    np.array([[3., 3.3], [2., 2.2], [1., 1.1]]),
+                    np.array([[0.5, 0.4], [0.42, 0.42], [0.42, 0.42]])
+                ]
+            }, sess)
+
+  def testWarmStartEmbeddingColumnLinearModel(self):
+    # Create old and new vocabs for embedding column "sc_vocab".
+    prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
+                                        "old_vocab")
+    new_vocab_path = self._write_vocab(
+        ["orange", "guava", "banana", "apple", "raspberry", "blueberry"],
+        "new_vocab")
+
+    # Save checkpoint from which to warm-start.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        variable_scope.get_variable(
+            "linear_model/sc_vocab_embedding/embedding_weights",
+            initializer=[[0.5, 0.4], [1., 1.1], [2., 2.2], [3., 3.3]])
+        variable_scope.get_variable(
+            "linear_model/sc_vocab_embedding/weights",
+            initializer=[[0.69], [0.71]])
+        self._write_checkpoint(sess)
+
+    def _partitioner(shape, dtype):  # pylint:disable=unused-argument
+      # Partition each var into 2 equal slices.
+      partitions = [1] * len(shape)
+      partitions[0] = min(2, shape[0].value)
+      return partitions
+
     # Create feature columns.
     sc_vocab = fc.categorical_column_with_vocabulary_file(
         "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
     emb_vocab = fc.embedding_column(
         categorical_column=sc_vocab,
-        dimension=2,
-        # Can't use constant_initializer with load_and_remap.  In practice,
-        # use a truncated normal initializer.
-        initializer=init_ops.random_uniform_initializer(
-            minval=0.42, maxval=0.42))
+        dimension=2)
     all_deep_cols = [emb_vocab]
-    # New graph, new session with warmstarting.
+    # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         cols_to_vars = {}
         with variable_scope.variable_scope("", partitioner=_partitioner):
           # Create the variables.
-          fc.input_layer(
+          fc.linear_model(
               features=self._create_dummy_inputs(),
               feature_columns=all_deep_cols,
               cols_to_vars=cols_to_vars)
-        ws_settings = ws_util._WarmStartSettings(
-            self.get_temp_dir(), col_to_prev_vocab={
-                emb_vocab: prev_vocab_path
+
+        # Construct the vocab_info for the embedding weight.
+        vocab_info = ws_util.VocabInfo(
+            new_vocab=sc_vocab.vocabulary_file,
+            new_vocab_size=sc_vocab.vocabulary_size,
+            num_oov_buckets=sc_vocab.num_oov_buckets,
+            old_vocab=prev_vocab_path,
+            # Can't use constant_initializer with load_and_remap.  In practice,
+            # use a truncated normal initializer.
+            backup_initializer=init_ops.random_uniform_initializer(
+                minval=0.42, maxval=0.42))
+        ws_settings = ws_util.WarmStartSettings(
+            self.get_temp_dir(),
+            vars_to_warm_start=".*sc_vocab.*",
+            var_name_to_vocab_info={
+                "linear_model/sc_vocab_embedding/embedding_weights": vocab_info
             })
-        ws_util._warmstart_input_layer(cols_to_vars, ws_settings)
+        ws_util._warm_start(ws_settings)
         sess.run(variables.global_variables_initializer())
-        # Verify weights were correctly warmstarted. Var corresponding to
-        # emb_vocab should be correctly warmstarted after vocab remapping.
+        # Verify weights were correctly warm-started. Var corresponding to
+        # emb_vocab should be correctly warm-started after vocab remapping.
         # Missing values are filled in with the EmbeddingColumn's initializer.
         self._assert_cols_to_vars(
             cols_to_vars, {
                 emb_vocab: [
+                    # embedding_weights part 0.
                     np.array([[3., 3.3], [2., 2.2], [1., 1.1]]),
-                    np.array([[0.5, 0.4], [0.42, 0.42], [0.42, 0.42]])
+                    # embedding_weights part 1.
+                    np.array([[0.5, 0.4], [0.42, 0.42], [0.42, 0.42]]),
+                    # linear weights part 0.
+                    np.array([[0.69]]),
+                    # linear weights part 1.
+                    np.array([[0.71]])
                 ]
             }, sess)
 
   def testErrorConditions(self):
-    self.assertRaises(ValueError, ws_util._WarmStartSettings, None)
+    self.assertRaises(ValueError, ws_util.WarmStartSettings, None)
     x = variable_scope.get_variable(
         "x",
         shape=[4, 1],
         initializer=ones(),
         partitioner=lambda shape, dtype: [2, 1])
 
-    # List of PartitionedVariable is invalid type when warmstarting with vocab.
-    self.assertRaises(TypeError, ws_util._warmstart_var_with_vocab, [x], "/tmp",
-                      5, "/tmp", "/tmp")
+    # List of PartitionedVariable is invalid type when warm-starting with vocab.
+    self.assertRaises(TypeError, ws_util._warm_start_var_with_vocab, [x],
+                      "/tmp", 5, "/tmp", "/tmp")
     # Keys of type other than FeatureColumn.
-    self.assertRaises(TypeError, ws_util._warmstart_input_layer,
-                      {"StringType": x}, ws_util._WarmStartSettings("/tmp"))
+    self.assertRaises(TypeError, ws_util._warm_start, {"StringType": x},
+                      ws_util.WarmStartSettings("/tmp"))
+
+    # Unused variable names raises ValueError.
+    with ops.Graph().as_default():
+      with self.test_session() as sess:
+        x = variable_scope.get_variable(
+            "x",
+            shape=[4, 1],
+            initializer=ones(),
+            partitioner=lambda shape, dtype: [2, 1])
+        self._write_checkpoint(sess)
+
+    self.assertRaises(ValueError, ws_util._warm_start,
+                      ws_util.WarmStartSettings(
+                          self.get_temp_dir(),
+                          var_name_to_vocab_info={
+                              "y": ws_util.VocabInfo("", 1, 0, "")
+                          }))
+    self.assertRaises(ValueError, ws_util._warm_start,
+                      ws_util.WarmStartSettings(
+                          self.get_temp_dir(),
+                          var_name_to_prev_var_name={
+                              "y": "y2"
+                          }))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD
index b1c81dd58c7d2d9cf95821ea78eda2e7ee675d25..a758f8a4fc4898713772c4e919acda48b0f6ad0b 100644
--- a/tensorflow/python/feature_column/BUILD
+++ b/tensorflow/python/feature_column/BUILD
@@ -48,6 +48,7 @@ py_library(
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
+        "//tensorflow/python:template",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
@@ -84,6 +85,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:lookup_ops",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:partitioned_variables",
@@ -92,6 +94,8 @@ py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/eager:context",
         "//tensorflow/python/estimator:numpy_io",
     ],
 )
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 0686480ca48adab5766f25a83cb0de15678cf617..c416881c3119c160d28f4b8e37cd2aeb22f239a6 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -134,7 +134,7 @@ import math
 import numpy as np
 import six
 
-from tensorflow.python.eager import context
+
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
@@ -150,14 +150,68 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import string_ops
+from tensorflow.python.ops import template
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_utils
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import tf_export
+
 
+def _internal_input_layer(features,
+                          feature_columns,
+                          weight_collections=None,
+                          trainable=True,
+                          cols_to_vars=None,
+                          scope=None):
+  """See input_layer. `scope` is a name or variable scope to use."""
 
+  feature_columns = _clean_feature_columns(feature_columns)
+  for column in feature_columns:
+    if not isinstance(column, _DenseColumn):
+      raise ValueError(
+          'Items of feature_columns must be a _DenseColumn. '
+          'You can wrap a categorical column with an '
+          'embedding_column or indicator_column. Given: {}'.format(column))
+  weight_collections = list(weight_collections or [])
+  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
+    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
+  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
+    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
+
+  # a non-None `scope` can allow for variable reuse, when, e.g., this function
+  # is wrapped by a `make_template`.
+  with variable_scope.variable_scope(
+      scope, default_name='input_layer', values=features.values()):
+    builder = _LazyBuilder(features)
+    output_tensors = []
+    ordered_columns = []
+    for column in sorted(feature_columns, key=lambda x: x.name):
+      ordered_columns.append(column)
+      with variable_scope.variable_scope(
+          None, default_name=column._var_scope_name):  # pylint: disable=protected-access
+        tensor = column._get_dense_tensor(  # pylint: disable=protected-access
+            builder,
+            weight_collections=weight_collections,
+            trainable=trainable)
+        num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
+        batch_size = array_ops.shape(tensor)[0]
+        output_tensors.append(
+            array_ops.reshape(tensor, shape=(batch_size, num_elements)))
+        if cols_to_vars is not None:
+          # Retrieve any variables created (some _DenseColumn's don't create
+          # variables, in which case an empty list is returned).
+          cols_to_vars[column] = ops.get_collection(
+              ops.GraphKeys.GLOBAL_VARIABLES,
+              scope=variable_scope.get_variable_scope().name)
+    _verify_static_batch_size_equality(output_tensors, ordered_columns)
+    return array_ops.concat(output_tensors, 1)
+
+
+@tf_export('feature_column.input_layer')
 def input_layer(features,
                 feature_columns,
                 weight_collections=None,
@@ -194,7 +248,7 @@ def input_layer(features,
       `bucketized_column`, `indicator_column`. If you have categorical features,
       you can wrap them with an `embedding_column` or `indicator_column`.
     weight_collections: A list of collection names to which the Variable will be
-      added. Note that, variables will also be added to collections
+      added. Note that variables will also be added to collections
       `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
     trainable: If `True` also add the variable to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
@@ -216,54 +270,69 @@ def input_layer(features,
   Raises:
     ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
   """
-  feature_columns = _clean_feature_columns(feature_columns)
-  for column in feature_columns:
-    if not isinstance(column, _DenseColumn):
-      raise ValueError(
-          'Items of feature_columns must be a _DenseColumn. '
-          'You can wrap a categorical column with an '
-          'embedding_column or indicator_column. Given: {}'.format(column))
-  weight_collections = list(weight_collections or [])
-  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
-    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
-  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
-    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
-  with variable_scope.variable_scope(
-      None, default_name='input_layer', values=features.values()):
-    builder = _LazyBuilder(features)
-    output_tensors = []
-    ordered_columns = []
-    for column in sorted(feature_columns, key=lambda x: x.name):
-      ordered_columns.append(column)
-      with variable_scope.variable_scope(
-          None, default_name=column._var_scope_name):  # pylint: disable=protected-access
-        if column._var_scope_name == column.name:  # pylint: disable=protected-access
-          tensor = _get_dense_tensor(
-              column=column,
-              builder=builder,
-              weight_collections=weight_collections,
-              trainable=trainable)
-        else:
-          # This is typically the case for shared_embedding_columns. The
-          # embedding weights variable will be under the common variable_scope,
-          # but the ops for each column will be under a separate name_scope.
-          with ops.name_scope(column.name):
-            tensor = _get_dense_tensor(
-                column=column,
-                builder=builder,
-                weight_collections=weight_collections,
-                trainable=trainable)
-        output_tensors.append(tensor)
-        if cols_to_vars is not None:
-          # Retrieve any variables created (some _DenseColumn's don't create
-          # variables, in which case an empty list is returned).
-          cols_to_vars[column] = ops.get_collection(
-              ops.GraphKeys.GLOBAL_VARIABLES,
-              scope=variable_scope.get_variable_scope().name)
-    _verify_static_batch_size_equality(output_tensors, ordered_columns)
-    return array_ops.concat(output_tensors, 1)
+  return _internal_input_layer(features, feature_columns, weight_collections,
+                               trainable, cols_to_vars)
+
+
+# TODO(akshayka): InputLayer should be a subclass of Layer, and it
+# should implement the logic in input_layer using Layer's build-and-call
+# paradigm; input_layer should create an instance of InputLayer and
+# return the result of inovking its apply method, just as functional layers do.
+class InputLayer(object):
+  """An object-oriented version of `input_layer` that reuses variables."""
+
+  def __init__(self,
+               feature_columns,
+               weight_collections=None,
+               trainable=True,
+               cols_to_vars=None):
+    """See `input_layer`."""
+
+    self._feature_columns = feature_columns
+    self._weight_collections = weight_collections
+    self._trainable = trainable
+    self._cols_to_vars = cols_to_vars
+    self._input_layer_template = template.make_template(
+        'feature_column_input_layer',
+        _internal_input_layer,
+        create_scope_now_=True)
+    self._scope = self._input_layer_template.variable_scope
+
+  def __call__(self, features):
+    return self._input_layer_template(
+        features=features,
+        feature_columns=self._feature_columns,
+        weight_collections=self._weight_collections,
+        trainable=self._trainable,
+        cols_to_vars=None,
+        scope=self._scope)
+
+  @property
+  def non_trainable_variables(self):
+    return self._input_layer_template.non_trainable_variables
+
+  @property
+  def non_trainable_weights(self):
+    return self._input_layer_template.non_trainable_weights
+
+  @property
+  def trainable_variables(self):
+    return self._input_layer_template.trainable_variables
+
+  @property
+  def trainable_weights(self):
+    return self._input_layer_template.trainable_weights
 
+  @property
+  def variables(self):
+    return self._input_layer_template.variables
+
+  @property
+  def weights(self):
+    return self._input_layer_template.weights
 
+
+@tf_export('feature_column.linear_model')
 def linear_model(features,
                  feature_columns,
                  units=1,
@@ -355,26 +424,13 @@ def linear_model(features,
       with variable_scope.variable_scope(
           None, default_name=column._var_scope_name):  # pylint: disable=protected-access
         ordered_columns.append(column)
-        if column._var_scope_name == column.name:  # pylint: disable=protected-access
-          weighted_sum = _create_weighted_sum(
-              column=column,
-              builder=builder,
-              units=units,
-              sparse_combiner=sparse_combiner,
-              weight_collections=weight_collections,
-              trainable=trainable)
-        else:
-          # This is typically the case for shared_embedding_columns. The
-          # embedding weights variable will be under the common variable_scope,
-          # but the ops for each column will be under a separate name_scope.
-          with ops.name_scope(column.name):
-            weighted_sum = _create_weighted_sum(
-                column=column,
-                builder=builder,
-                units=units,
-                sparse_combiner=sparse_combiner,
-                weight_collections=weight_collections,
-                trainable=trainable)
+        weighted_sum = _create_weighted_sum(
+            column=column,
+            builder=builder,
+            units=units,
+            sparse_combiner=sparse_combiner,
+            weight_collections=weight_collections,
+            trainable=trainable)
         weighted_sums.append(weighted_sum)
         if cols_to_vars is not None:
           # Retrieve the variables created.
@@ -446,6 +502,7 @@ def _transform_features(features, feature_columns):
   return outputs
 
 
+@tf_export('feature_column.make_parse_example_spec')
 def make_parse_example_spec(feature_columns):
   """Creates parsing spec dictionary from input feature_columns.
 
@@ -455,6 +512,7 @@ def make_parse_example_spec(feature_columns):
 
   ```python
   # Define features and transformations
+  feature_a = categorical_column_with_vocabulary_file(...)
   feature_b = numeric_column(...)
   feature_c_bucketized = bucketized_column(numeric_column("feature_c"), ...)
   feature_a_x_feature_c = crossed_column(
@@ -505,6 +563,7 @@ def make_parse_example_spec(feature_columns):
   return result
 
 
+@tf_export('feature_column.embedding_column')
 def embedding_column(
     categorical_column, dimension, combiner='mean', initializer=None,
     ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None,
@@ -579,10 +638,6 @@ def embedding_column(
       is specified.
     ValueError: if `initializer` is specified and is not callable.
     RuntimeError: If eager execution is enabled.
-
-  @compatibility(eager)
-  Not compatible with eager execution.
-  @end_compatibility
   """
   if (dimension is None) or (dimension < 1):
     raise ValueError('Invalid dimension {}.'.format(dimension))
@@ -594,8 +649,6 @@ def embedding_column(
     raise ValueError('initializer must be callable if specified. '
                      'Embedding of column_name: {}'.format(
                          categorical_column.name))
-  if not context.in_graph_mode():
-    raise RuntimeError('Embedding_column not supported in eager mode.')
   if initializer is None:
     initializer = init_ops.truncated_normal_initializer(
         mean=0.0, stddev=1 / math.sqrt(dimension))
@@ -605,18 +658,18 @@ def embedding_column(
       dimension=dimension,
       combiner=combiner,
       initializer=initializer,
-      shared_embedding_collection_name=None,
       ckpt_to_load_from=ckpt_to_load_from,
       tensor_name_in_ckpt=tensor_name_in_ckpt,
       max_norm=max_norm,
       trainable=trainable)
 
 
-def _shared_embedding_columns(
+@tf_export('feature_column.shared_embedding_columns')
+def shared_embedding_columns(
     categorical_columns, dimension, combiner='mean', initializer=None,
     shared_embedding_collection_name=None, ckpt_to_load_from=None,
     tensor_name_in_ckpt=None, max_norm=None, trainable=True):
-  """List of `_DenseColumn`s that convert from sparse, categorical input.
+  """List of dense columns that convert from sparse, categorical input.
 
   This is similar to `embedding_column`, except that that it produces a list of
   embedding columns that share the same embedding weights.
@@ -625,7 +678,7 @@ def _shared_embedding_columns(
   impression video IDs that share the same vocabulary), and you want to convert
   them to a dense representation (e.g., to feed to a DNN).
 
-  Inputs must be a list of `_CategoricalColumn` created by any of the
+  Inputs must be a list of categorical columns created by any of the
   `categorical_column_*` function. They must all be of the same type and have
   the same arguments except `key`. E.g. they can be
   categorical_column_with_vocabulary_file with the same vocabulary_file. Some or
@@ -669,7 +722,7 @@ def _shared_embedding_columns(
   ```
 
   Args:
-    categorical_columns: List of `_CategoricalColumn`s created by a
+    categorical_columns: List of categorical columns created by a
       `categorical_column_with_*` function. These columns produce the sparse IDs
       that are inputs to the embedding lookup. All columns must be of the same
       type and have the same arguments except `key`. E.g. they can be
@@ -699,7 +752,7 @@ def _shared_embedding_columns(
     trainable: Whether or not the embedding is trainable. Default is True.
 
   Returns:
-    A list of `_DenseColumn`s that converts from sparse input. The order of
+    A list of dense columns that converts from sparse input. The order of
     results follows the ordering of `categorical_columns`.
 
   Raises:
@@ -749,7 +802,7 @@ def _shared_embedding_columns(
 
   result = []
   for column in categorical_columns:
-    result.append(_EmbeddingColumn(
+    result.append(_SharedEmbeddingColumn(
         categorical_column=column,
         dimension=dimension,
         combiner=combiner,
@@ -762,6 +815,7 @@ def _shared_embedding_columns(
   return result
 
 
+@tf_export('feature_column.numeric_column')
 def numeric_column(key,
                    shape=(1,),
                    default_value=None,
@@ -836,6 +890,7 @@ def numeric_column(key,
       normalizer_fn=normalizer_fn)
 
 
+@tf_export('feature_column.bucketized_column')
 def bucketized_column(source_column, boundaries):
   """Represents discretized dense input.
 
@@ -925,6 +980,7 @@ def _assert_string_or_int(dtype, prefix):
         '{} dtype must be string or integer. dtype: {}.'.format(prefix, dtype))
 
 
+@tf_export('feature_column.categorical_column_with_hash_bucket')
 def categorical_column_with_hash_bucket(key,
                                         hash_bucket_size,
                                         dtype=dtypes.string):
@@ -981,6 +1037,7 @@ def categorical_column_with_hash_bucket(key,
   return _HashedCategoricalColumn(key, hash_bucket_size, dtype)
 
 
+@tf_export('feature_column.categorical_column_with_vocabulary_file')
 def categorical_column_with_vocabulary_file(key,
                                             vocabulary_file,
                                             vocabulary_size=None,
@@ -1100,6 +1157,7 @@ def categorical_column_with_vocabulary_file(key,
       dtype=dtype)
 
 
+@tf_export('feature_column.categorical_column_with_vocabulary_list')
 def categorical_column_with_vocabulary_list(
     key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
   """A `_CategoricalColumn` with in-memory vocabulary.
@@ -1210,6 +1268,7 @@ def categorical_column_with_vocabulary_list(
       default_value=default_value, num_oov_buckets=num_oov_buckets)
 
 
+@tf_export('feature_column.categorical_column_with_identity')
 def categorical_column_with_identity(key, num_buckets, default_value=None):
   """A `_CategoricalColumn` that returns identity values.
 
@@ -1277,6 +1336,7 @@ def categorical_column_with_identity(key, num_buckets, default_value=None):
       key=key, num_buckets=num_buckets, default_value=default_value)
 
 
+@tf_export('feature_column.indicator_column')
 def indicator_column(categorical_column):
   """Represents multi-hot representation of given categorical column.
 
@@ -1305,6 +1365,7 @@ def indicator_column(categorical_column):
   return _IndicatorColumn(categorical_column)
 
 
+@tf_export('feature_column.weighted_categorical_column')
 def weighted_categorical_column(
     categorical_column, weight_feature_key, dtype=dtypes.float32):
   """Applies weight values to a `_CategoricalColumn`.
@@ -1379,6 +1440,7 @@ def weighted_categorical_column(
       dtype=dtype)
 
 
+@tf_export('feature_column.crossed_column')
 def crossed_column(keys, hash_bucket_size, hash_key=None):
   """Returns a column for performing crosses of categorical features.
 
@@ -1623,21 +1685,6 @@ class _DenseColumn(_FeatureColumn):
     pass
 
 
-def _get_dense_tensor(
-    column,
-    builder,
-    weight_collections,
-    trainable):
-  """Creates a dense Tensor for a _DenseColumn for input_layer."""
-  tensor = column._get_dense_tensor(  # pylint: disable=protected-access
-      builder,
-      weight_collections=weight_collections,
-      trainable=trainable)
-  num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
-  batch_size = array_ops.shape(tensor)[0]
-  return array_ops.reshape(tensor, shape=(batch_size, num_elements))
-
-
 def _create_weighted_sum(
     column,
     builder,
@@ -1648,11 +1695,19 @@ def _create_weighted_sum(
   """Creates a weighted sum for a dense or sparse column for linear_model."""
   if isinstance(column, _CategoricalColumn):
     return _create_categorical_column_weighted_sum(
-        column, builder, units, sparse_combiner, weight_collections,
-        trainable)
+        column=column,
+        builder=builder,
+        units=units,
+        sparse_combiner=sparse_combiner,
+        weight_collections=weight_collections,
+        trainable=trainable)
   else:
     return _create_dense_column_weighted_sum(
-        column, builder, units, weight_collections, trainable)
+        column=column,
+        builder=builder,
+        units=units,
+        weight_collections=weight_collections,
+        trainable=trainable)
 
 
 def _create_dense_column_weighted_sum(
@@ -1920,29 +1975,26 @@ def _to_sparse_input(input_tensor, ignore_value=None):
   if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
     return input_tensor
   with ops.name_scope(None, 'to_sparse_input', (input_tensor, ignore_value,)):
-    input_rank = input_tensor.get_shape().ndims
-    if input_rank is None:
-      # TODO(b/32318825): Implement dense_to_sparse_tensor for undefined rank.
-      raise ValueError('Undefined input_tensor shape.')
     if ignore_value is None:
-      ignore_value = '' if input_tensor.dtype == dtypes.string else -1
-    dense_shape = math_ops.cast(array_ops.shape(input_tensor), dtypes.int64)
-    indices = array_ops.where(math_ops.not_equal(
-        input_tensor, math_ops.cast(ignore_value, input_tensor.dtype)))
-    # Flattens the tensor and indices for use with gather.
-    flat_tensor = array_ops.reshape(input_tensor, [-1])
-    flat_indices = indices[:, input_rank - 1]
-    # Computes the correct flattened indices for 2d (or higher) tensors.
-    if input_rank > 1:
-      higher_dims = indices[:, :input_rank - 1]
-      shape_offsets = array_ops.stack(
-          _shape_offsets(array_ops.unstack(dense_shape)[1:]))
-      offsets = math_ops.reduce_sum(
-          math_ops.multiply(higher_dims, shape_offsets),
-          reduction_indices=[1])
-      flat_indices = math_ops.add(flat_indices, offsets)
-    values = array_ops.gather(flat_tensor, flat_indices)
-    return sparse_tensor_lib.SparseTensor(indices, values, dense_shape)
+      if input_tensor.dtype == dtypes.string:
+        # Exception due to TF strings are converted to numpy objects by default.
+        ignore_value = ''
+      elif input_tensor.dtype.is_integer:
+        ignore_value = -1  # -1 has a special meaning of missing feature
+      else:
+        # NOTE: `as_numpy_dtype` is a property, so with the parentheses this is
+        # constructing a new numpy object of the given type, which yields the
+        # default value for that type.
+        ignore_value = input_tensor.dtype.as_numpy_dtype()
+    ignore_value = math_ops.cast(
+        ignore_value, input_tensor.dtype, name='ignore_value')
+    indices = array_ops.where(
+        math_ops.not_equal(input_tensor, ignore_value), name='indices')
+    return sparse_tensor_lib.SparseTensor(
+        indices=indices,
+        values=array_ops.gather_nd(input_tensor, indices, name='values'),
+        dense_shape=array_ops.shape(
+            input_tensor, out_type=dtypes.int64, name='dense_shape'))
 
 
 def _clean_feature_columns(feature_columns):
@@ -2103,24 +2155,16 @@ class _EmbeddingColumn(
     _DenseColumn,
     collections.namedtuple('_EmbeddingColumn', (
         'categorical_column', 'dimension', 'combiner', 'initializer',
-        'shared_embedding_collection_name', 'ckpt_to_load_from',
-        'tensor_name_in_ckpt', 'max_norm', 'trainable'
+        'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'
     ))):
   """See `embedding_column`."""
 
   @property
   def name(self):
     if not hasattr(self, '_name'):
-      if self.shared_embedding_collection_name:
-        self._name = '{}_shared_embedding'.format(self.categorical_column.name)
-      else:
-        self._name = '{}_embedding'.format(self.categorical_column.name)
+      self._name = '{}_embedding'.format(self.categorical_column.name)
     return self._name
 
-  @property
-  def _var_scope_name(self):
-    return self.shared_embedding_collection_name or self.name
-
   @property
   def _parse_example_spec(self):
     return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
@@ -2142,7 +2186,75 @@ class _EmbeddingColumn(
     sparse_weights = sparse_tensors.weight_tensor
 
     embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
-    if self.shared_embedding_collection_name:
+    embedding_weights = variable_scope.get_variable(
+        name='embedding_weights',
+        shape=embedding_shape,
+        dtype=dtypes.float32,
+        initializer=self.initializer,
+        trainable=self.trainable and trainable,
+        collections=weight_collections)
+    if self.ckpt_to_load_from is not None:
+      to_restore = embedding_weights
+      if isinstance(to_restore, variables.PartitionedVariable):
+        to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
+      checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, {
+          self.tensor_name_in_ckpt: to_restore
+      })
+
+    # Return embedding lookup result.
+    return _safe_embedding_lookup_sparse(
+        embedding_weights=embedding_weights,
+        sparse_ids=sparse_ids,
+        sparse_weights=sparse_weights,
+        combiner=self.combiner,
+        name='%s_weights' % self.name,
+        max_norm=self.max_norm)
+
+
+class _SharedEmbeddingColumn(
+    _DenseColumn,
+    collections.namedtuple('_SharedEmbeddingColumn', (
+        'categorical_column', 'dimension', 'combiner', 'initializer',
+        'shared_embedding_collection_name', 'ckpt_to_load_from',
+        'tensor_name_in_ckpt', 'max_norm', 'trainable'
+    ))):
+  """See `embedding_column`."""
+
+  @property
+  def name(self):
+    if not hasattr(self, '_name'):
+      self._name = '{}_shared_embedding'.format(self.categorical_column.name)
+    return self._name
+
+  @property
+  def _var_scope_name(self):
+    return self.shared_embedding_collection_name
+
+  @property
+  def _parse_example_spec(self):
+    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
+
+  def _transform_feature(self, inputs):
+    return inputs.get(self.categorical_column)
+
+  @property
+  def _variable_shape(self):
+    if not hasattr(self, '_shape'):
+      self._shape = tensor_shape.vector(self.dimension)
+    return self._shape
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    # This method is called from a variable_scope with name _var_scope_name,
+    # which is shared among all shared embeddings. Open a name_scope here, so
+    # that the ops for different columns have distinct names.
+    with ops.name_scope(None, default_name=self.name):
+      # Get sparse IDs and weights.
+      sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
+          inputs, weight_collections=weight_collections, trainable=trainable)
+      sparse_ids = sparse_tensors.id_tensor
+      sparse_weights = sparse_tensors.weight_tensor
+
+      embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
       shared_embedding_collection = ops.get_collection(
           self.shared_embedding_collection_name)
       if shared_embedding_collection:
@@ -2154,7 +2266,7 @@ class _EmbeddingColumn(
               'The feature_column library already adds a variable under the '
               'hood.'.format(shared_embedding_collection))
         embedding_weights = shared_embedding_collection[0]
-        if embedding_weights.shape != embedding_shape:
+        if embedding_weights.get_shape() != embedding_shape:
           raise ValueError(
               'Shared embedding collection {} contains variable {} of '
               'unexpected shape {}. Expected shape is {}. '
@@ -2163,7 +2275,7 @@ class _EmbeddingColumn(
               'The feature_column library already adds a variable under the '
               'hood.'.format(
                   self.shared_embedding_collection_name, embedding_weights.name,
-                  embedding_weights.shape, embedding_shape))
+                  embedding_weights.get_shape(), embedding_shape))
       else:
         embedding_weights = variable_scope.get_variable(
             name='embedding_weights',
@@ -2174,30 +2286,22 @@ class _EmbeddingColumn(
             collections=weight_collections)
         ops.add_to_collection(
             self.shared_embedding_collection_name, embedding_weights)
-    else:
-      embedding_weights = variable_scope.get_variable(
-          name='embedding_weights',
-          shape=embedding_shape,
-          dtype=dtypes.float32,
-          initializer=self.initializer,
-          trainable=self.trainable and trainable,
-          collections=weight_collections)
-    if self.ckpt_to_load_from is not None:
-      to_restore = embedding_weights
-      if isinstance(to_restore, variables.PartitionedVariable):
-        to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
-      checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, {
-          self.tensor_name_in_ckpt: to_restore
-      })
-
-    # Return embedding lookup result.
-    return _safe_embedding_lookup_sparse(
-        embedding_weights=embedding_weights,
-        sparse_ids=sparse_ids,
-        sparse_weights=sparse_weights,
-        combiner=self.combiner,
-        name='%s_weights' % self.name,
-        max_norm=self.max_norm)
+      if self.ckpt_to_load_from is not None:
+        to_restore = embedding_weights
+        if isinstance(to_restore, variables.PartitionedVariable):
+          to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
+        checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, {
+            self.tensor_name_in_ckpt: to_restore
+        })
+
+      # Return embedding lookup result.
+      return _safe_embedding_lookup_sparse(
+          embedding_weights=embedding_weights,
+          sparse_ids=sparse_ids,
+          sparse_weights=sparse_weights,
+          combiner=self.combiner,
+          name='%s_weights' % self.name,
+          max_norm=self.max_norm)
 
 
 def _create_tuple(shape, value):
diff --git a/tensorflow/python/feature_column/feature_column_lib.py b/tensorflow/python/feature_column/feature_column_lib.py
index 8a57986764f9f5e2cff788817cc7706089dc73b0..505a1408d271e9262226b2ea4cff234345e2f3b6 100644
--- a/tensorflow/python/feature_column/feature_column_lib.py
+++ b/tensorflow/python/feature_column/feature_column_lib.py
@@ -29,6 +29,7 @@ _allowed_symbols = [
     'linear_model',
     'make_parse_example_spec',
     'embedding_column',
+    'shared_embedding_columns',
     'crossed_column',
     'numeric_column',
     'bucketized_column',
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index d974f14b8a35df7c86c0ab548c74772142fbbca4..6f366e77229577b1a6a5363f882daa07203f525c 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -26,19 +26,22 @@ import numpy as np
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
 from tensorflow.python.client import session
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
 from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.feature_column import feature_column as fc_lib
 from tensorflow.python.feature_column import feature_column_lib as fc
 from tensorflow.python.feature_column.feature_column import _CategoricalColumn
 from tensorflow.python.feature_column.feature_column import _DenseColumn
 from tensorflow.python.feature_column.feature_column import _FeatureColumn
 from tensorflow.python.feature_column.feature_column import _LazyBuilder
 from tensorflow.python.feature_column.feature_column import _transform_features
+from tensorflow.python.feature_column.feature_column import InputLayer
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import parsing_ops
@@ -1068,6 +1071,7 @@ def get_linear_model_column_var(column):
                             'linear_model/' + column.name)[0]
 
 
+@test_util.with_c_api
 class LinearModelTest(test.TestCase):
 
   def test_raises_if_empty_feature_columns(self):
@@ -1321,10 +1325,16 @@ class LinearModelTest(test.TestCase):
     price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
-      predictions = fc.linear_model(features, [price])
-      with _initialized_session():
-        with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
-          predictions.eval()
+      if ops._USE_C_API:
+        with self.assertRaisesRegexp(
+            Exception,
+            r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
+          predictions = fc.linear_model(features, [price])
+      else:
+        predictions = fc.linear_model(features, [price])
+        with _initialized_session():
+          with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
+            predictions.eval()
 
   def test_dense_reshaping(self):
     price = fc.numeric_column('price', shape=[1, 2])
@@ -1646,8 +1656,9 @@ class LinearModelTest(test.TestCase):
         indices=((0,), (1,)),
         values=('sedan', 'hardtop'),
         dense_shape=(2,))
+    country_data = np.array(['US', 'CA'])
 
-    net = fc.linear_model(features, [price_buckets, body_style])
+    net = fc.linear_model(features, [price_buckets, body_style, country])
     bias = get_linear_model_bias()
     price_buckets_var = get_linear_model_column_var(price_buckets)
     body_style_var = get_linear_model_column_var(body_style)
@@ -1656,15 +1667,14 @@ class LinearModelTest(test.TestCase):
       sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
       sess.run(bias.assign([5.]))
 
-      self.assertAllClose(
-          [[10 - 1000 + 5.], [1000 - 10 + 5.]],
-          sess.run(net, feed_dict={
-              features['price']: price_data,
-              features['body-style']: body_style_data}))
-
-    # Dense categorical_column with unknown shape is not allowed.
-    with self.assertRaisesRegexp(ValueError, 'Undefined input_tensor shape.'):
-      fc.linear_model(features, [price_buckets, body_style, country])
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]],
+                          sess.run(
+                              net,
+                              feed_dict={
+                                  features['price']: price_data,
+                                  features['body-style']: body_style_data,
+                                  features['country']: country_data
+                              }))
 
   def test_with_rank_0_feature(self):
     price = fc.numeric_column('price')
@@ -1690,6 +1700,106 @@ class LinearModelTest(test.TestCase):
 
 class InputLayerTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
+  def test_retrieving_input(self):
+    features = {'a': [0.]}
+    input_layer = InputLayer(fc.numeric_column('a'))
+    inputs = self.evaluate(input_layer(features))
+    self.assertAllClose([[0.]], inputs)
+
+  def test_reuses_variables(self):
+    with context.eager_mode():
+      sparse_input = sparse_tensor.SparseTensor(
+          indices=((0, 0), (1, 0), (2, 0)),
+          values=(0, 1, 2),
+          dense_shape=(3, 3))
+
+      # Create feature columns (categorical and embedding).
+      categorical_column = fc.categorical_column_with_identity(key='a',
+                                                               num_buckets=3)
+      embedding_dimension = 2
+      def _embedding_column_initializer(shape, dtype, partition_info):
+        del shape  # unused
+        del dtype  # unused
+        del partition_info  # unused
+        embedding_values = (
+            (1, 0),  # id 0
+            (0, 1),  # id 1
+            (1, 1))  # id 2
+        return embedding_values
+      embedding_column = fc.embedding_column(
+          categorical_column,
+          dimension=embedding_dimension,
+          initializer=_embedding_column_initializer)
+
+      input_layer = InputLayer([embedding_column])
+      features = {'a': sparse_input}
+
+      inputs = input_layer(features)
+      variables = input_layer.variables
+
+      # Sanity check: test that the inputs are correct.
+      self.assertAllEqual([[1, 0], [0, 1], [1, 1]], inputs)
+
+      # Check that only one variable was created.
+      self.assertEqual(1, len(variables))
+
+      # Check that invoking input_layer on the same features does not create
+      # additional variables
+      _ = input_layer(features)
+      self.assertEqual(1, len(variables))
+      self.assertEqual(variables[0], input_layer.variables[0])
+
+  def test_feature_column_input_layer_gradient(self):
+    with context.eager_mode():
+      sparse_input = sparse_tensor.SparseTensor(
+          indices=((0, 0), (1, 0), (2, 0)),
+          values=(0, 1, 2),
+          dense_shape=(3, 3))
+
+      # Create feature columns (categorical and embedding).
+      categorical_column = fc.categorical_column_with_identity(key='a',
+                                                               num_buckets=3)
+      embedding_dimension = 2
+
+      def _embedding_column_initializer(shape, dtype, partition_info):
+        del shape  # unused
+        del dtype  # unused
+        del partition_info  # unused
+        embedding_values = (
+            (1, 0),  # id 0
+            (0, 1),  # id 1
+            (1, 1))  # id 2
+        return embedding_values
+
+      embedding_column = fc.embedding_column(
+          categorical_column,
+          dimension=embedding_dimension,
+          initializer=_embedding_column_initializer)
+
+      input_layer = InputLayer([embedding_column])
+      features = {'a': sparse_input}
+
+      def scale_matrix():
+        matrix = input_layer(features)
+        return 2 * matrix
+
+      # Sanity check: Verify that scale_matrix returns the correct output.
+      self.assertAllEqual([[2, 0], [0, 2], [2, 2]], scale_matrix())
+
+      # Check that the returned gradient is correct.
+      grad_function = backprop.implicit_grad(scale_matrix)
+      grads_and_vars = grad_function()
+      indexed_slice = grads_and_vars[0][0]
+      gradient = grads_and_vars[0][0].values
+
+      self.assertAllEqual([0, 1, 2], indexed_slice.indices)
+      self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient)
+
+
+@test_util.with_c_api
+class FunctionalInputLayerTest(test.TestCase):
+
   def test_raises_if_empty_feature_columns(self):
     with self.assertRaisesRegexp(ValueError,
                                  'feature_columns must not be empty'):
@@ -1752,10 +1862,16 @@ class InputLayerTest(test.TestCase):
     price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
-      net = fc.input_layer(features, [price])
-      with _initialized_session():
-        with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
-          net.eval()
+      if ops._USE_C_API:
+        with self.assertRaisesRegexp(
+            Exception,
+            r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
+          net = fc.input_layer(features, [price])
+      else:
+        net = fc.input_layer(features, [price])
+        with _initialized_session():
+          with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
+            net.eval()
 
   def test_reshaping(self):
     price = fc.numeric_column('price', shape=[1, 2])
@@ -2016,9 +2132,9 @@ class InputLayerTest(test.TestCase):
 
   def test_with_1d_unknown_shape_sparse_tensor(self):
     embedding_values = (
-        (1., 2., 3., 4., 5.),  # id 0
-        (6., 7., 8., 9., 10.),  # id 1
-        (11., 12., 13., 14., 15.)  # id 2
+        (1., 2.),  # id 0
+        (6., 7.),  # id 1
+        (11., 12.)  # id 2
     )
     def _initializer(shape, dtype, partition_info):
       del shape, dtype, partition_info
@@ -2035,8 +2151,8 @@ class InputLayerTest(test.TestCase):
     # embedded_body_style has 5 dims in input_layer.
     country = fc.categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
-    embedded_country = fc.embedding_column(country, dimension=5,
-                                           initializer=_initializer)
+    embedded_country = fc.embedding_column(
+        country, dimension=2, initializer=_initializer)
 
     # Provides 1-dim tensor and dense tensor.
     features = {
@@ -2054,22 +2170,24 @@ class InputLayerTest(test.TestCase):
         indices=((0,), (1,)),
         values=('sedan', 'hardtop'),
         dense_shape=(2,))
+    country_data = np.array([['US'], ['CA']])
 
-    # Dense categorical_column with unknown shape is not allowed.
-    with self.assertRaisesRegexp(ValueError, 'Undefined input_tensor shape.'):
-      fc.input_layer(features, [price, one_hot_body_style, embedded_country])
-
-    net = fc.input_layer(features, [price, one_hot_body_style])
-    self.assertEqual(1 + 3, net.shape[1])
+    net = fc.input_layer(features,
+                         [price, one_hot_body_style, embedded_country])
+    self.assertEqual(1 + 3 + 2, net.shape[1])
     with _initialized_session() as sess:
 
       # Each row is formed by concatenating `embedded_body_style`,
       # `one_hot_body_style`, and `price` in order.
       self.assertAllEqual(
-          [[0., 0., 1., 11.], [1., 0., 0., 12.]],
-          sess.run(net, feed_dict={
-              features['price']: price_data,
-              features['body-style']: body_style_data}))
+          [[0., 0., 1., 1., 2., 11.], [1., 0., 0., 11., 12., 12.]],
+          sess.run(
+              net,
+              feed_dict={
+                  features['price']: price_data,
+                  features['body-style']: body_style_data,
+                  features['country']: country_data
+              }))
 
   def test_with_rank_0_feature(self):
     # price has 1 dimension in input_layer
@@ -3446,7 +3564,6 @@ class EmbeddingColumnTest(test.TestCase):
     self.assertEqual('mean', embedding_column.combiner)
     self.assertIsNotNone(embedding_column.initializer)
     self.assertIsNone(embedding_column.ckpt_to_load_from)
-    self.assertIsNone(embedding_column.shared_embedding_collection_name)
     self.assertIsNone(embedding_column.tensor_name_in_ckpt)
     self.assertIsNone(embedding_column.max_norm)
     self.assertTrue(embedding_column.trainable)
@@ -3471,7 +3588,6 @@ class EmbeddingColumnTest(test.TestCase):
     self.assertEqual(embedding_dimension, embedding_column.dimension)
     self.assertEqual('my_combiner', embedding_column.combiner)
     self.assertEqual('my_initializer', embedding_column.initializer())
-    self.assertIsNone(embedding_column.shared_embedding_collection_name)
     self.assertEqual('my_ckpt', embedding_column.ckpt_to_load_from)
     self.assertEqual('my_ckpt_tensor', embedding_column.tensor_name_in_ckpt)
     self.assertEqual(42., embedding_column.max_norm)
@@ -3503,7 +3619,6 @@ class EmbeddingColumnTest(test.TestCase):
       self.assertEqual(embedding_dimension, embedding_column.dimension)
       self.assertEqual('my_combiner', embedding_column.combiner)
       self.assertEqual('my_initializer', embedding_column.initializer())
-      self.assertIsNone(embedding_column.shared_embedding_collection_name)
       self.assertEqual('my_ckpt', embedding_column.ckpt_to_load_from)
       self.assertEqual('my_ckpt_tensor', embedding_column.tensor_name_in_ckpt)
       self.assertEqual(42., embedding_column.max_norm)
@@ -4035,7 +4150,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
     categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    embedding_column_b, embedding_column_a = fc_lib._shared_embedding_columns(
+    embedding_column_b, embedding_column_a = fc.shared_embedding_columns(
         [categorical_column_b, categorical_column_a],
         dimension=embedding_dimension)
     self.assertIs(categorical_column_a, embedding_column_a.categorical_column)
@@ -4081,7 +4196,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
     categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    embedding_column_a, embedding_column_b = fc_lib._shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         combiner='my_combiner',
@@ -4134,7 +4249,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
     categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    original_a, _ = fc_lib._shared_embedding_columns(
+    original_a, _ = fc.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         combiner='my_combiner',
@@ -4172,7 +4287,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
     categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)
     with self.assertRaisesRegexp(ValueError, 'initializer must be callable'):
-      fc_lib._shared_embedding_columns(
+      fc.shared_embedding_columns(
           [categorical_column_a, categorical_column_b], dimension=2,
           initializer='not_fn')
 
@@ -4187,7 +4302,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         ValueError,
         'all categorical_columns must have the same type.*'
         '_IdentityCategoricalColumn.*_HashedCategoricalColumn'):
-      fc_lib._shared_embedding_columns(
+      fc.shared_embedding_columns(
           [categorical_column_a, categorical_column_b, categorical_column_c],
           dimension=2)
 
@@ -4200,11 +4315,11 @@ class SharedEmbeddingColumnTest(test.TestCase):
         key='bbb', num_buckets=3)
     weighted_categorical_column_b = fc.weighted_categorical_column(
         categorical_column_b, weight_feature_key='bbb_weights')
-    fc_lib._shared_embedding_columns(
+    fc.shared_embedding_columns(
         [weighted_categorical_column_a, categorical_column_b], dimension=2)
-    fc_lib._shared_embedding_columns(
+    fc.shared_embedding_columns(
         [categorical_column_a, weighted_categorical_column_b], dimension=2)
-    fc_lib._shared_embedding_columns(
+    fc.shared_embedding_columns(
         [weighted_categorical_column_a, weighted_categorical_column_b],
         dimension=2)
 
@@ -4213,7 +4328,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     b = fc.categorical_column_with_vocabulary_list(
         key='bbb', vocabulary_list=('omar', 'stringer', 'marlo'))
-    a_embedded, b_embedded = fc_lib._shared_embedding_columns(
+    a_embedded, b_embedded = fc.shared_embedding_columns(
         [a, b], dimension=2)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
@@ -4245,25 +4360,256 @@ class SharedEmbeddingColumnTest(test.TestCase):
               dense_shape=[1, 2]),
           features['bbb'].eval())
 
-  def test_input_layer(self):
+  def test_transform_feature(self):
+    a = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    b = fc.categorical_column_with_identity(key='bbb', num_buckets=3)
+    a_embedded, b_embedded = fc.shared_embedding_columns(
+        [a, b], dimension=2)
+    features = {
+        'aaa': sparse_tensor.SparseTensor(
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=(0, 1, 0),
+            dense_shape=(2, 2)),
+        'bbb': sparse_tensor.SparseTensor(
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=(1, 2, 1),
+            dense_shape=(2, 2)),
+    }
+    outputs = _transform_features(features, [a, a_embedded, b, b_embedded])
+    output_a = outputs[a]
+    output_a_embedded = outputs[a_embedded]
+    output_b = outputs[b]
+    output_b_embedded = outputs[b_embedded]
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self, output_a.eval(), output_a_embedded.eval())
+      _assert_sparse_tensor_value(
+          self, output_b.eval(), output_b_embedded.eval())
+
+  def test_get_dense_tensor(self):
+    # Inputs.
+    vocabulary_size = 3
+    # -1 values are ignored.
+    input_a = np.array(
+        [[2, -1, -1],  # example 0, ids [2]
+         [0, 1, -1]])  # example 1, ids [0, 1]
+    input_b = np.array(
+        [[0, -1, -1],  # example 0, ids [0]
+         [-1, -1, -1]])  # example 1, ids []
+    input_features = {
+        'aaa': input_a,
+        'bbb': input_b
+    }
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups_a = (
+        # example 0:
+        (7., 11.),  # ids [2], embedding = [7, 11]
+        # example 1:
+        (2., 3.5),  # ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+    )
+    expected_lookups_b = (
+        # example 0:
+        (1., 2.),  # ids [0], embedding = [1, 2]
+        # example 1:
+        (0., 0.),  # ids [], embedding = [0, 0]
+    )
+
+    # Build columns.
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension, initializer=_initializer)
+
+    # Provide sparse input and get dense result.
+    embedding_lookup_a = embedding_column_a._get_dense_tensor(
+        _LazyBuilder(input_features))
+    embedding_lookup_b = embedding_column_b._get_dense_tensor(
+        _LazyBuilder(input_features))
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    embedding_var = global_vars[0]
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, embedding_var.eval())
+      self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
+      self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())
+
+  def test_get_dense_tensor_placeholder_inputs(self):
+    # Inputs.
+    vocabulary_size = 3
+    # -1 values are ignored.
+    input_a = np.array(
+        [[2, -1, -1],  # example 0, ids [2]
+         [0, 1, -1]])  # example 1, ids [0, 1]
+    input_b = np.array(
+        [[0, -1, -1],  # example 0, ids [0]
+         [-1, -1, -1]])  # example 1, ids []
+    # Specify shape, because dense input must have rank specified.
+    input_a_placeholder = array_ops.placeholder(
+        dtype=dtypes.int64, shape=[None, 3])
+    input_b_placeholder = array_ops.placeholder(
+        dtype=dtypes.int64, shape=[None, 3])
+    input_features = {
+        'aaa': input_a_placeholder,
+        'bbb': input_b_placeholder,
+    }
+    feed_dict = {
+        input_a_placeholder: input_a,
+        input_b_placeholder: input_b,
+    }
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Build columns.
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension, initializer=_initializer)
+
+    # Provide sparse input and get dense result.
+    embedding_lookup_a = embedding_column_a._get_dense_tensor(
+        _LazyBuilder(input_features))
+    embedding_lookup_b = embedding_column_b._get_dense_tensor(
+        _LazyBuilder(input_features))
+
+    with _initialized_session() as sess:
+      sess.run([embedding_lookup_a, embedding_lookup_b], feed_dict=feed_dict)
+
+  def test_linear_model(self):
+    # Inputs.
+    batch_size = 2
+    vocabulary_size = 3
+    # -1 values are ignored.
+    input_a = np.array(
+        [[2, -1, -1],  # example 0, ids [2]
+         [0, 1, -1]])  # example 1, ids [0, 1]
+    input_b = np.array(
+        [[0, -1, -1],  # example 0, ids [0]
+         [-1, -1, -1]])  # example 1, ids []
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_shape = (vocabulary_size, embedding_dimension)
+    zeros_embedding_values = np.zeros(embedding_shape)
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual(embedding_shape, shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return zeros_embedding_values
+
+    # Build columns.
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension, initializer=_initializer)
+
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          categorical_column_a.name: input_a,
+          categorical_column_b.name: input_b,
+      }, (embedding_column_a, embedding_column_b))
+      # Linear weights do not follow the column name. But this is a rare use
+      # case, and fixing it would add too much complexity to the code.
+      expected_var_names = (
+          'linear_model/bias_weights:0',
+          'linear_model/aaa_bbb_shared_embedding/weights:0',
+          'linear_model/aaa_bbb_shared_embedding/embedding_weights:0',
+          'linear_model/aaa_bbb_shared_embedding_1/weights:0',
+      )
+      self.assertItemsEqual(
+          expected_var_names,
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+      trainable_vars = {
+          v.name: v for v in ops.get_collection(
+              ops.GraphKeys.TRAINABLE_VARIABLES)
+      }
+      self.assertItemsEqual(expected_var_names, trainable_vars.keys())
+      bias = trainable_vars['linear_model/bias_weights:0']
+      embedding_weights = trainable_vars[
+          'linear_model/aaa_bbb_shared_embedding/embedding_weights:0']
+      linear_weights_a = trainable_vars[
+          'linear_model/aaa_bbb_shared_embedding/weights:0']
+      linear_weights_b = trainable_vars[
+          'linear_model/aaa_bbb_shared_embedding_1/weights:0']
+      with _initialized_session():
+        # Predictions with all zero weights.
+        self.assertAllClose(np.zeros((1,)), bias.eval())
+        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), linear_weights_a.eval())
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), linear_weights_b.eval())
+        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+
+        # Predictions with all non-zero weights.
+        embedding_weights.assign((
+            (1., 2.),  # id 0
+            (3., 5.),  # id 1
+            (7., 11.)  # id 2
+        )).eval()
+        linear_weights_a.assign(((4.,), (6.,))).eval()
+        # example 0, ids [2], embedding[0] = [7, 11]
+        # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+        # sum(embeddings * linear_weights)
+        # = [4*7 + 6*11, 4*2 + 6*3.5] = [94, 29]
+        linear_weights_b.assign(((3.,), (5.,))).eval()
+        # example 0, ids [0], embedding[0] = [1, 2]
+        # example 1, ids [], embedding[1] = 0, 0]
+        # sum(embeddings * linear_weights)
+        # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
+        self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
+
+  def _test_input_layer(self, trainable=True):
     # Inputs.
     vocabulary_size = 3
     sparse_input_a = sparse_tensor.SparseTensorValue(
         # example 0, ids [2]
         # example 1, ids [0, 1]
-        # example 2, ids []
-        # example 3, ids [1]
-        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
-        values=(2, 0, 1, 1),
-        dense_shape=(4, 5))
+        indices=((0, 0), (1, 0), (1, 4)),
+        values=(2, 0, 1),
+        dense_shape=(2, 5))
     sparse_input_b = sparse_tensor.SparseTensorValue(
         # example 0, ids [0]
         # example 1, ids []
-        # example 2, ids []
-        # example 3, ids [1]
-        indices=((0, 0), (3, 0)),
-        values=(0, 1),
-        dense_shape=(4, 5))
+        indices=((0, 0),),
+        values=(0,),
+        dense_shape=(2, 5))
 
     # Embedding variable.
     embedding_dimension = 2
@@ -4288,14 +4634,6 @@ class SharedEmbeddingColumnTest(test.TestCase):
         # A ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
         # B ids [], embedding = [0, 0]
         (2., 3.5, 0., 0.),
-        # example 2:
-        # A ids [], embedding = [0, 0]
-        # B ids [], embedding = [0, 0]
-        (0., 0., 0., 0.),
-        # example 3:
-        # A ids [1], embedding = [3, 5]
-        # B ids [1], embedding = [3, 5]
-        (3., 5., 3., 5.),
     )
 
     # Build columns.
@@ -4303,9 +4641,10 @@ class SharedEmbeddingColumnTest(test.TestCase):
         key='aaa', num_buckets=vocabulary_size)
     categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc_lib._shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
-        dimension=embedding_dimension, initializer=_initializer)
+        dimension=embedding_dimension, initializer=_initializer,
+        trainable=trainable)
 
     # Provide sparse input and get dense result.
     input_layer = fc.input_layer(
@@ -4318,17 +4657,26 @@ class SharedEmbeddingColumnTest(test.TestCase):
         ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
         tuple([v.name for v in global_vars]))
     trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
-    self.assertItemsEqual(
-        ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
-        tuple([v.name for v in trainable_vars]))
+    if trainable:
+      self.assertItemsEqual(
+          ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
+          tuple([v.name for v in trainable_vars]))
+    else:
+      self.assertItemsEqual([], tuple([v.name for v in trainable_vars]))
     shared_embedding_vars = ops.get_collection('aaa_bbb_shared_embedding')
     self.assertItemsEqual(
         ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
         tuple([v.name for v in shared_embedding_vars]))
     with _initialized_session():
-      self.assertAllEqual(embedding_values, trainable_vars[0].eval())
+      self.assertAllEqual(embedding_values, shared_embedding_vars[0].eval())
       self.assertAllEqual(expected_lookups, input_layer.eval())
 
+  def test_input_layer(self):
+    self._test_input_layer()
+
+  def test_input_layer_no_trainable(self):
+    self._test_input_layer(trainable=False)
+
 
 class WeightedCategoricalColumnTest(test.TestCase):
 
diff --git a/tensorflow/python/framework/constant_op.py b/tensorflow/python/framework/constant_op.py
index bf3be34d85120f3d873367aa55948d27d34977cf..d3d8c9c154fbfcc9613acce4e1bdab7df2e7d56d 100644
--- a/tensorflow/python/framework/constant_op.py
+++ b/tensorflow/python/framework/constant_op.py
@@ -45,12 +45,14 @@ import numpy as np
 import six
 
 from tensorflow.core.framework import attr_value_pb2
+from tensorflow.core.framework import types_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.eager import execute
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _eager_reshape(tensor, shape, ctx):
@@ -58,7 +60,6 @@ def _eager_reshape(tensor, shape, ctx):
   attr_t = tensor._datatype_enum()  # pylint: disable=protected-access
   attr_tshape, (shape,) = execute.args_to_matching_eager(
       [shape], ctx, dtypes.int32)
-  attr_tshape = attr_tshape
   inputs_flat = [tensor, shape]
   attrs = ("T", attr_t, "Tshape", attr_tshape)
   result, = execute.execute(
@@ -71,7 +72,7 @@ def _eager_fill(dims, value, ctx):
   attr_t = value.dtype.as_datatype_enum
   dims = convert_to_eager_tensor(dims, ctx, dtypes.int32)
   inputs_flat = [dims, value]
-  attrs = ("T", attr_t)
+  attrs = ("T", attr_t, "index_type", types_pb2.DT_INT32)
   result, = execute.execute(
       b"Fill", 1, inputs=inputs_flat, attrs=attrs, ctx=ctx)
   return result
@@ -130,6 +131,7 @@ def convert_to_eager_tensor(value, ctx, dtype=None):
     return ops.EagerTensor(value, context=handle, device=device, dtype=dtype)
 
 
+@tf_export("constant")
 def constant(value, dtype=None, shape=None, name="Const", verify_shape=False):
   """Creates a constant tensor.
 
diff --git a/tensorflow/python/framework/cpp_shape_inference.h b/tensorflow/python/framework/cpp_shape_inference.h
index afca7277c775062a8efa7052f789f5146636f4b9..c6ab6b106f5ea335424701afebbabba72ece8660 100644
--- a/tensorflow/python/framework/cpp_shape_inference.h
+++ b/tensorflow/python/framework/cpp_shape_inference.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_PYTHON_FRAMEWORK_CPP_SHAPE_INFERENCE_H_
-#define THIRD_PARTY_TENSORFLOW_PYTHON_FRAMEWORK_CPP_SHAPE_INFERENCE_H_
+#ifndef TENSORFLOW_PYTHON_FRAMEWORK_CPP_SHAPE_INFERENCE_H_
+#define TENSORFLOW_PYTHON_FRAMEWORK_CPP_SHAPE_INFERENCE_H_
 
 // Must be included first
 #include "tensorflow/python/lib/core/numpy.h"
@@ -51,4 +51,4 @@ std::vector<string> RunCppShapeInference(
 }  // namespace swig
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_PYTHON_FRAMEWORK_CPP_SHAPE_INFERENCE_H_
+#endif  // TENSORFLOW_PYTHON_FRAMEWORK_CPP_SHAPE_INFERENCE_H_
diff --git a/tensorflow/python/framework/device.py b/tensorflow/python/framework/device.py
index 8f5125dcfef004bcbd5a581c5ff9dea1d85cf57e..ab06a2babf3976347714a98a50f95c07cbb6fdda 100644
--- a/tensorflow/python/framework/device.py
+++ b/tensorflow/python/framework/device.py
@@ -19,8 +19,10 @@ from __future__ import division
 from __future__ import print_function
 
 import copy
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("DeviceSpec")
 class DeviceSpec(object):
   """Represents a (possibly partial) specification for a TensorFlow device.
 
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index db124ab12acdfb9724f9800f5be36b9f1d45f323..99ae8b24f11c4955379ae532ba7b921ebec63385 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Library of dtypes (Tensor element types)."""
 from __future__ import absolute_import
 from __future__ import division
@@ -21,8 +20,13 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.core.framework import types_pb2
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.util.tf_export import tf_export
+
+_np_bfloat16 = pywrap_tensorflow.TF_bfloat16_type()
 
 
+@tf_export("DType")
 class DType(object):
   """Represents the type of the elements in a `Tensor`.
 
@@ -76,8 +80,8 @@ class DType(object):
     # TODO(mrry): Make the necessary changes (using __new__) to ensure
     # that calling this returns one of the interned values.
     type_enum = int(type_enum)
-    if (type_enum not in types_pb2.DataType.values()
-        or type_enum == types_pb2.DT_INVALID):
+    if (type_enum not in types_pb2.DataType.values() or
+        type_enum == types_pb2.DT_INVALID):
       raise TypeError(
           "type_enum is not a valid types_pb2.DataType: %s" % type_enum)
     self._type_enum = type_enum
@@ -116,10 +120,10 @@ class DType(object):
 
   @property
   def is_numpy_compatible(self):
-    numpy_incompatible = [types_pb2.DT_VARIANT,
-                          types_pb2.DT_VARIANT_REF,
-                          types_pb2.DT_RESOURCE,
-                          types_pb2.DT_RESOURCE_REF]
+    numpy_incompatible = [
+        types_pb2.DT_VARIANT, types_pb2.DT_VARIANT_REF, types_pb2.DT_RESOURCE,
+        types_pb2.DT_RESOURCE_REF
+    ]
     return self._type_enum not in numpy_incompatible
 
   @property
@@ -146,8 +150,9 @@ class DType(object):
   @property
   def is_floating(self):
     """Returns whether this is a (non-quantized, real) floating point type."""
-    return self.is_numpy_compatible and np.issubdtype(self.as_numpy_dtype,
-                                                      np.floating)
+    return ((self.is_numpy_compatible and
+             np.issubdtype(self.as_numpy_dtype, np.floating)) or
+            self.base_dtype == bfloat16)
 
   @property
   def is_complex(self):
@@ -157,7 +162,7 @@ class DType(object):
   @property
   def is_quantized(self):
     """Returns whether this is a quantized data type."""
-    return self.base_dtype in [qint8, quint8, qint16, quint16, qint32, bfloat16]
+    return self.base_dtype in [qint8, quint8, qint16, quint16, qint32]
 
   @property
   def is_unsigned(self):
@@ -182,8 +187,8 @@ class DType(object):
       TypeError: if this is a non-numeric, unordered, or quantized type.
 
     """
-    if (self.is_quantized or self.base_dtype in
-        (bool, string, complex64, complex128)):
+    if (self.is_quantized or
+        self.base_dtype in (bool, string, complex64, complex128)):
       raise TypeError("Cannot find minimum value of %s." % self)
 
     # there is no simple way to get the min value of a dtype, we have to check
@@ -194,6 +199,8 @@ class DType(object):
       try:
         return np.iinfo(self.as_numpy_dtype()).min
       except:
+        if self.base_dtype == bfloat16:
+          return _np_bfloat16(float.fromhex("-0x1.FEp127"))
         raise TypeError("Cannot find minimum value of %s." % self)
 
   @property
@@ -204,8 +211,8 @@ class DType(object):
       TypeError: if this is a non-numeric, unordered, or quantized type.
 
     """
-    if (self.is_quantized or self.base_dtype in
-        (bool, string, complex64, complex128)):
+    if (self.is_quantized or
+        self.base_dtype in (bool, string, complex64, complex128)):
       raise TypeError("Cannot find maximum value of %s." % self)
 
     # there is no simple way to get the max value of a dtype, we have to check
@@ -216,6 +223,8 @@ class DType(object):
       try:
         return np.iinfo(self.as_numpy_dtype()).max
       except:
+        if self.base_dtype == bfloat16:
+          return _np_bfloat16(float.fromhex("0x1.FEp127"))
         raise TypeError("Cannot find maximum value of %s." % self)
 
   @property
@@ -229,9 +238,9 @@ class DType(object):
       min, max : tuple
         Lower and upper intensity limits.
     """
-    min, max = dtype_range[self.as_numpy_dtype]
+    min, max = dtype_range[self.as_numpy_dtype]  # pylint: disable=redefined-builtin
     if clip_negative:
-      min = 0
+      min = 0  # pylint: disable=redefined-builtin
     return min, max
 
   def is_compatible_with(self, other):
@@ -254,8 +263,8 @@ class DType(object):
       this `DType`.
     """
     other = as_dtype(other)
-    return self._type_enum in (
-        other.as_datatype_enum, other.base_dtype.as_datatype_enum)
+    return self._type_enum in (other.as_datatype_enum,
+                               other.base_dtype.as_datatype_enum)
 
   def __eq__(self, other):
     """Returns True iff this DType refers to the same type as `other`."""
@@ -295,48 +304,74 @@ class DType(object):
       return 1
     return np.dtype(self.as_numpy_dtype).itemsize
 
+
 # Define data type range of numpy dtype
-dtype_range = {np.bool_: (False, True),
-               np.bool8: (False, True),
-               np.uint8: (0, 255),
-               np.uint16: (0, 65535),
-               np.int8: (-128, 127),
-               np.int16: (-32768, 32767),
-               np.int64: (-2**63, 2**63 - 1),
-               np.uint64: (0, 2**64 - 1),
-               np.int32: (-2**31, 2**31 - 1),
-               np.uint32: (0, 2**32 - 1),
-               np.float32: (-1, 1),
-               np.float64: (-1, 1)}
+dtype_range = {
+    np.bool_: (False, True),
+    np.bool8: (False, True),
+    np.uint8: (0, 255),
+    np.uint16: (0, 65535),
+    np.int8: (-128, 127),
+    np.int16: (-32768, 32767),
+    np.int64: (-2**63, 2**63 - 1),
+    np.uint64: (0, 2**64 - 1),
+    np.int32: (-2**31, 2**31 - 1),
+    np.uint32: (0, 2**32 - 1),
+    np.float32: (-1, 1),
+    np.float64: (-1, 1)
+}
 
 # Define standard wrappers for the types_pb2.DataType enum.
 resource = DType(types_pb2.DT_RESOURCE)
+tf_export("resource").export_constant(__name__, "resource")
 variant = DType(types_pb2.DT_VARIANT)
+tf_export("variant").export_constant(__name__, "variant")
 float16 = DType(types_pb2.DT_HALF)
+tf_export("float16").export_constant(__name__, "float16")
 half = float16
+tf_export("half").export_constant(__name__, "half")
 float32 = DType(types_pb2.DT_FLOAT)
+tf_export("float32").export_constant(__name__, "float32")
 float64 = DType(types_pb2.DT_DOUBLE)
+tf_export("float64").export_constant(__name__, "float64")
 double = float64
+tf_export("double").export_constant(__name__, "double")
 int32 = DType(types_pb2.DT_INT32)
+tf_export("int32").export_constant(__name__, "int32")
 uint8 = DType(types_pb2.DT_UINT8)
+tf_export("uint8").export_constant(__name__, "uint8")
 uint16 = DType(types_pb2.DT_UINT16)
+tf_export("uint16").export_constant(__name__, "uint16")
 uint32 = DType(types_pb2.DT_UINT32)
 uint64 = DType(types_pb2.DT_UINT64)
 int16 = DType(types_pb2.DT_INT16)
+tf_export("int16").export_constant(__name__, "int16")
 int8 = DType(types_pb2.DT_INT8)
+tf_export("int8").export_constant(__name__, "int8")
 string = DType(types_pb2.DT_STRING)
+tf_export("string").export_constant(__name__, "string")
 complex64 = DType(types_pb2.DT_COMPLEX64)
+tf_export("complex64").export_constant(__name__, "complex64")
 complex128 = DType(types_pb2.DT_COMPLEX128)
+tf_export("complex128").export_constant(__name__, "complex128")
 int64 = DType(types_pb2.DT_INT64)
-bool = DType(types_pb2.DT_BOOL)
+tf_export("int64").export_constant(__name__, "int64")
+bool = DType(types_pb2.DT_BOOL)  # pylint: disable=redefined-builtin
+tf_export("bool").export_constant(__name__, "bool")
 qint8 = DType(types_pb2.DT_QINT8)
+tf_export("qint8").export_constant(__name__, "qint8")
 quint8 = DType(types_pb2.DT_QUINT8)
+tf_export("quint8").export_constant(__name__, "quint8")
 qint16 = DType(types_pb2.DT_QINT16)
+tf_export("qint16").export_constant(__name__, "qint16")
 quint16 = DType(types_pb2.DT_QUINT16)
+tf_export("quint16").export_constant(__name__, "quint16")
 qint32 = DType(types_pb2.DT_QINT32)
+tf_export("qint32").export_constant(__name__, "qint32")
 resource_ref = DType(types_pb2.DT_RESOURCE_REF)
 variant_ref = DType(types_pb2.DT_VARIANT_REF)
 bfloat16 = DType(types_pb2.DT_BFLOAT16)
+tf_export("bfloat16").export_constant(__name__, "bfloat16")
 float16_ref = DType(types_pb2.DT_HALF_REF)
 half_ref = float16_ref
 float32_ref = DType(types_pb2.DT_FLOAT_REF)
@@ -361,7 +396,6 @@ quint16_ref = DType(types_pb2.DT_QUINT16_REF)
 qint32_ref = DType(types_pb2.DT_QINT32_REF)
 bfloat16_ref = DType(types_pb2.DT_BFLOAT16_REF)
 
-
 # Maintain an intern table so that we don't have to create a large
 # number of small objects.
 _INTERN_TABLE = {
@@ -413,7 +447,6 @@ _INTERN_TABLE = {
     types_pb2.DT_VARIANT_REF: variant_ref,
 }
 
-
 # Standard mappings between types_pb2.DataType values and string names.
 _TYPE_TO_STRING = {
     types_pb2.DT_HALF: "float16",
@@ -463,8 +496,10 @@ _TYPE_TO_STRING = {
     types_pb2.DT_RESOURCE_REF: "resource_ref",
     types_pb2.DT_VARIANT_REF: "variant_ref",
 }
-_STRING_TO_TF = {value: _INTERN_TABLE[key]
-                 for key, value in _TYPE_TO_STRING.items()}
+_STRING_TO_TF = {
+    value: _INTERN_TABLE[key]
+    for key, value in _TYPE_TO_STRING.items()
+}
 # Add non-canonical aliases.
 _STRING_TO_TF["half"] = float16
 _STRING_TO_TF["half_ref"] = float16_ref
@@ -473,7 +508,6 @@ _STRING_TO_TF["float_ref"] = float32_ref
 _STRING_TO_TF["double"] = float64
 _STRING_TO_TF["double_ref"] = float64_ref
 
-
 # Numpy representation for quantized dtypes.
 #
 # These are magic strings that are used in the swig wrapper to identify
@@ -486,6 +520,8 @@ _np_qint16 = np.dtype([("qint16", np.int16, 1)])
 _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
 _np_qint32 = np.dtype([("qint32", np.int32, 1)])
 
+# _np_bfloat16 is defined by a module import.
+
 # Custom struct dtype for directly-fed ResourceHandles of supported type(s).
 np_resource = np.dtype([("resource", np.ubyte, 1)])
 
@@ -511,70 +547,115 @@ _NP_TO_TF = frozenset([
     (_np_qint16, qint16),
     (_np_quint16, quint16),
     (_np_qint32, qint32),
-    # NOTE(touts): Intentionally no way to feed a DT_BFLOAT16.
+    (_np_bfloat16, bfloat16),
 ])
 _TF_TO_NP = {
-    types_pb2.DT_HALF: np.float16,
-    types_pb2.DT_FLOAT: np.float32,
-    types_pb2.DT_DOUBLE: np.float64,
-    types_pb2.DT_INT32: np.int32,
-    types_pb2.DT_UINT8: np.uint8,
-    types_pb2.DT_UINT16: np.uint16,
-    types_pb2.DT_UINT32: np.uint32,
-    types_pb2.DT_UINT64: np.uint64,
-    types_pb2.DT_INT16: np.int16,
-    types_pb2.DT_INT8: np.int8,
+    types_pb2.DT_HALF:
+        np.float16,
+    types_pb2.DT_FLOAT:
+        np.float32,
+    types_pb2.DT_DOUBLE:
+        np.float64,
+    types_pb2.DT_INT32:
+        np.int32,
+    types_pb2.DT_UINT8:
+        np.uint8,
+    types_pb2.DT_UINT16:
+        np.uint16,
+    types_pb2.DT_UINT32:
+        np.uint32,
+    types_pb2.DT_UINT64:
+        np.uint64,
+    types_pb2.DT_INT16:
+        np.int16,
+    types_pb2.DT_INT8:
+        np.int8,
     # NOTE(touts): For strings we use np.object as it supports variable length
     # strings.
-    types_pb2.DT_STRING: np.object,
-    types_pb2.DT_COMPLEX64: np.complex64,
-    types_pb2.DT_COMPLEX128: np.complex128,
-    types_pb2.DT_INT64: np.int64,
-    types_pb2.DT_BOOL: np.bool,
-    types_pb2.DT_QINT8: _np_qint8,
-    types_pb2.DT_QUINT8: _np_quint8,
-    types_pb2.DT_QINT16: _np_qint16,
-    types_pb2.DT_QUINT16: _np_quint16,
-    types_pb2.DT_QINT32: _np_qint32,
-    types_pb2.DT_BFLOAT16: np.uint16,
+    types_pb2.DT_STRING:
+        np.object,
+    types_pb2.DT_COMPLEX64:
+        np.complex64,
+    types_pb2.DT_COMPLEX128:
+        np.complex128,
+    types_pb2.DT_INT64:
+        np.int64,
+    types_pb2.DT_BOOL:
+        np.bool,
+    types_pb2.DT_QINT8:
+        _np_qint8,
+    types_pb2.DT_QUINT8:
+        _np_quint8,
+    types_pb2.DT_QINT16:
+        _np_qint16,
+    types_pb2.DT_QUINT16:
+        _np_quint16,
+    types_pb2.DT_QINT32:
+        _np_qint32,
+    types_pb2.DT_BFLOAT16:
+        _np_bfloat16,
 
     # Ref types
-    types_pb2.DT_HALF_REF: np.float16,
-    types_pb2.DT_FLOAT_REF: np.float32,
-    types_pb2.DT_DOUBLE_REF: np.float64,
-    types_pb2.DT_INT32_REF: np.int32,
-    types_pb2.DT_UINT32_REF: np.uint32,
-    types_pb2.DT_UINT8_REF: np.uint8,
-    types_pb2.DT_UINT16_REF: np.uint16,
-    types_pb2.DT_INT16_REF: np.int16,
-    types_pb2.DT_INT8_REF: np.int8,
-    types_pb2.DT_STRING_REF: np.object,
-    types_pb2.DT_COMPLEX64_REF: np.complex64,
-    types_pb2.DT_COMPLEX128_REF: np.complex128,
-    types_pb2.DT_INT64_REF: np.int64,
-    types_pb2.DT_UINT64_REF: np.uint64,
-    types_pb2.DT_BOOL_REF: np.bool,
-    types_pb2.DT_QINT8_REF: _np_qint8,
-    types_pb2.DT_QUINT8_REF: _np_quint8,
-    types_pb2.DT_QINT16_REF: _np_qint16,
-    types_pb2.DT_QUINT16_REF: _np_quint16,
-    types_pb2.DT_QINT32_REF: _np_qint32,
-    types_pb2.DT_BFLOAT16_REF: np.uint16,
+    types_pb2.DT_HALF_REF:
+        np.float16,
+    types_pb2.DT_FLOAT_REF:
+        np.float32,
+    types_pb2.DT_DOUBLE_REF:
+        np.float64,
+    types_pb2.DT_INT32_REF:
+        np.int32,
+    types_pb2.DT_UINT32_REF:
+        np.uint32,
+    types_pb2.DT_UINT8_REF:
+        np.uint8,
+    types_pb2.DT_UINT16_REF:
+        np.uint16,
+    types_pb2.DT_INT16_REF:
+        np.int16,
+    types_pb2.DT_INT8_REF:
+        np.int8,
+    types_pb2.DT_STRING_REF:
+        np.object,
+    types_pb2.DT_COMPLEX64_REF:
+        np.complex64,
+    types_pb2.DT_COMPLEX128_REF:
+        np.complex128,
+    types_pb2.DT_INT64_REF:
+        np.int64,
+    types_pb2.DT_UINT64_REF:
+        np.uint64,
+    types_pb2.DT_BOOL_REF:
+        np.bool,
+    types_pb2.DT_QINT8_REF:
+        _np_qint8,
+    types_pb2.DT_QUINT8_REF:
+        _np_quint8,
+    types_pb2.DT_QINT16_REF:
+        _np_qint16,
+    types_pb2.DT_QUINT16_REF:
+        _np_quint16,
+    types_pb2.DT_QINT32_REF:
+        _np_qint32,
+    types_pb2.DT_BFLOAT16_REF:
+        _np_bfloat16,
 }
 
-
-QUANTIZED_DTYPES = frozenset(
-    [qint8, quint8, qint16, quint16, qint32, qint8_ref, quint8_ref, qint16_ref,
-     quint16_ref, qint32_ref])
+QUANTIZED_DTYPES = frozenset([
+    qint8, quint8, qint16, quint16, qint32, qint8_ref, quint8_ref, qint16_ref,
+    quint16_ref, qint32_ref
+])
+tf_export("QUANTIZED_DTYPES").export_constant(__name__, "QUANTIZED_DTYPES")
 
 
+@tf_export("as_dtype")
 def as_dtype(type_value):
   """Converts the given `type_value` to a `DType`.
 
   Args:
     type_value: A value that can be converted to a `tf.DType`
       object. This may currently be a `tf.DType` object, a
-      [`DataType` enum](https://www.tensorflow.org/code/tensorflow/core/framework/types.proto),
+      [`DataType`
+        enum](https://www.tensorflow.org/code/tensorflow/core/framework/types.proto),
       a string type name, or a `numpy.dtype`.
 
   Returns:
@@ -611,5 +692,4 @@ def as_dtype(type_value):
     except TypeError as e:
       raise TypeError("Cannot convert {} to a dtype. {}".format(type_value, e))
 
-  raise TypeError(
-      "Cannot convert value %r to a TensorFlow DType." % type_value)
+  raise TypeError("Cannot convert value %r to a TensorFlow DType." % type_value)
diff --git a/tensorflow/python/framework/dtypes_test.py b/tensorflow/python/framework/dtypes_test.py
index 67842e14b1077fdf69aa3405f4f43fc92e499b4d..e49e2fda5d84da4f8f87fae73874351afe0a20f2 100644
--- a/tensorflow/python/framework/dtypes_test.py
+++ b/tensorflow/python/framework/dtypes_test.py
@@ -176,7 +176,7 @@ class TypesTest(test_util.TensorFlowTestCase):
     self.assertEqual(dtypes.as_dtype("float64").is_floating, True)
     self.assertEqual(dtypes.as_dtype("string").is_floating, False)
     self.assertEqual(dtypes.as_dtype("bool").is_floating, False)
-    self.assertEqual(dtypes.as_dtype("bfloat16").is_integer, False)
+    self.assertEqual(dtypes.as_dtype("bfloat16").is_floating, True)
     self.assertEqual(dtypes.as_dtype("qint8").is_floating, False)
     self.assertEqual(dtypes.as_dtype("qint16").is_floating, False)
     self.assertEqual(dtypes.as_dtype("qint32").is_floating, False)
@@ -276,6 +276,9 @@ class TypesTest(test_util.TensorFlowTestCase):
       if numpy_dtype in (np.float16, np.float32, np.float64):
         self.assertEquals(dtype.min, np.finfo(numpy_dtype).min)
         self.assertEquals(dtype.max, np.finfo(numpy_dtype).max)
+      if numpy_dtype == dtypes.bfloat16.as_numpy_dtype:
+        self.assertEquals(dtype.min, float.fromhex("-0x1.FEp127"))
+        self.assertEquals(dtype.max, float.fromhex("0x1.FEp127"))
 
   def testRepr(self):
     for enum, name in dtypes._TYPE_TO_STRING.items():
diff --git a/tensorflow/python/framework/errors_impl.py b/tensorflow/python/framework/errors_impl.py
index c3b2c498c3118087ed57d825ae5c4e66703d8174..2a40316d51c023df9c664d0dd79a0df3b2ac5041 100644
--- a/tensorflow/python/framework/errors_impl.py
+++ b/tensorflow/python/framework/errors_impl.py
@@ -25,8 +25,10 @@ from tensorflow.core.lib.core import error_codes_pb2
 from tensorflow.python import pywrap_tensorflow as c_api
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("OpError", "errors.OpError")
 class OpError(Exception):
   """A generic error that is raised when TensorFlow execution fails.
 
@@ -133,25 +135,48 @@ class OpError(Exception):
 
 
 OK = error_codes_pb2.OK
+tf_export("errors.OK").export_constant(__name__, "OK")
 CANCELLED = error_codes_pb2.CANCELLED
+tf_export("errors.CANCELLED").export_constant(__name__, "CANCELLED")
 UNKNOWN = error_codes_pb2.UNKNOWN
+tf_export("errors.UNKNOWN").export_constant(__name__, "UNKNOWN")
 INVALID_ARGUMENT = error_codes_pb2.INVALID_ARGUMENT
+tf_export("errors.INVALID_ARGUMENT").export_constant(__name__,
+                                                     "INVALID_ARGUMENT")
 DEADLINE_EXCEEDED = error_codes_pb2.DEADLINE_EXCEEDED
+tf_export("errors.DEADLINE_EXCEEDED").export_constant(__name__,
+                                                      "DEADLINE_EXCEEDED")
 NOT_FOUND = error_codes_pb2.NOT_FOUND
+tf_export("errors.NOT_FOUND").export_constant(__name__, "NOT_FOUND")
 ALREADY_EXISTS = error_codes_pb2.ALREADY_EXISTS
+tf_export("errors.ALREADY_EXISTS").export_constant(__name__, "ALREADY_EXISTS")
 PERMISSION_DENIED = error_codes_pb2.PERMISSION_DENIED
+tf_export("errors.PERMISSION_DENIED").export_constant(__name__,
+                                                      "PERMISSION_DENIED")
 UNAUTHENTICATED = error_codes_pb2.UNAUTHENTICATED
+tf_export("errors.UNAUTHENTICATED").export_constant(__name__, "UNAUTHENTICATED")
 RESOURCE_EXHAUSTED = error_codes_pb2.RESOURCE_EXHAUSTED
+tf_export("errors.RESOURCE_EXHAUSTED").export_constant(__name__,
+                                                       "RESOURCE_EXHAUSTED")
 FAILED_PRECONDITION = error_codes_pb2.FAILED_PRECONDITION
+tf_export("errors.FAILED_PRECONDITION").export_constant(__name__,
+                                                        "FAILED_PRECONDITION")
 ABORTED = error_codes_pb2.ABORTED
+tf_export("errors.ABORTED").export_constant(__name__, "ABORTED")
 OUT_OF_RANGE = error_codes_pb2.OUT_OF_RANGE
+tf_export("errors.OUT_OF_RANGE").export_constant(__name__, "OUT_OF_RANGE")
 UNIMPLEMENTED = error_codes_pb2.UNIMPLEMENTED
+tf_export("errors.UNIMPLEMENTED").export_constant(__name__, "UNIMPLEMENTED")
 INTERNAL = error_codes_pb2.INTERNAL
+tf_export("errors.INTERNAL").export_constant(__name__, "INTERNAL")
 UNAVAILABLE = error_codes_pb2.UNAVAILABLE
+tf_export("errors.UNAVAILABLE").export_constant(__name__, "UNAVAILABLE")
 DATA_LOSS = error_codes_pb2.DATA_LOSS
+tf_export("errors.DATA_LOSS").export_constant(__name__, "DATA_LOSS")
 
 
 # pylint: disable=line-too-long
+@tf_export("errors.CancelledError")
 class CancelledError(OpError):
   """Raised when an operation or step is cancelled.
 
@@ -172,6 +197,7 @@ class CancelledError(OpError):
 # pylint: enable=line-too-long
 
 
+@tf_export("errors.UnknownError")
 class UnknownError(OpError):
   """Unknown error.
 
@@ -189,6 +215,7 @@ class UnknownError(OpError):
     super(UnknownError, self).__init__(node_def, op, message, error_code)
 
 
+@tf_export("errors.InvalidArgumentError")
 class InvalidArgumentError(OpError):
   """Raised when an operation receives an invalid argument.
 
@@ -209,6 +236,7 @@ class InvalidArgumentError(OpError):
                                                INVALID_ARGUMENT)
 
 
+@tf_export("errors.DeadlineExceededError")
 class DeadlineExceededError(OpError):
   """Raised when a deadline expires before an operation could complete.
 
@@ -223,6 +251,7 @@ class DeadlineExceededError(OpError):
                                                 DEADLINE_EXCEEDED)
 
 
+@tf_export("errors.NotFoundError")
 class NotFoundError(OpError):
   """Raised when a requested entity (e.g., a file or directory) was not found.
 
@@ -239,6 +268,7 @@ class NotFoundError(OpError):
     super(NotFoundError, self).__init__(node_def, op, message, NOT_FOUND)
 
 
+@tf_export("errors.AlreadyExistsError")
 class AlreadyExistsError(OpError):
   """Raised when an entity that we attempted to create already exists.
 
@@ -256,6 +286,7 @@ class AlreadyExistsError(OpError):
                                              ALREADY_EXISTS)
 
 
+@tf_export("errors.PermissionDeniedError")
 class PermissionDeniedError(OpError):
   """Raised when the caller does not have permission to run an operation.
 
@@ -273,6 +304,7 @@ class PermissionDeniedError(OpError):
                                                 PERMISSION_DENIED)
 
 
+@tf_export("errors.UnauthenticatedError")
 class UnauthenticatedError(OpError):
   """The request does not have valid authentication credentials.
 
@@ -287,6 +319,7 @@ class UnauthenticatedError(OpError):
                                                UNAUTHENTICATED)
 
 
+@tf_export("errors.ResourceExhaustedError")
 class ResourceExhaustedError(OpError):
   """Some resource has been exhausted.
 
@@ -302,6 +335,7 @@ class ResourceExhaustedError(OpError):
                                                  RESOURCE_EXHAUSTED)
 
 
+@tf_export("errors.FailedPreconditionError")
 class FailedPreconditionError(OpError):
   """Operation was rejected because the system is not in a state to execute it.
 
@@ -318,6 +352,7 @@ class FailedPreconditionError(OpError):
                                                   FAILED_PRECONDITION)
 
 
+@tf_export("errors.AbortedError")
 class AbortedError(OpError):
   """The operation was aborted, typically due to a concurrent action.
 
@@ -335,6 +370,7 @@ class AbortedError(OpError):
     super(AbortedError, self).__init__(node_def, op, message, ABORTED)
 
 
+@tf_export("errors.OutOfRangeError")
 class OutOfRangeError(OpError):
   """Raised when an operation iterates past the valid input range.
 
@@ -353,6 +389,7 @@ class OutOfRangeError(OpError):
                                           OUT_OF_RANGE)
 
 
+@tf_export("errors.UnimplementedError")
 class UnimplementedError(OpError):
   """Raised when an operation has not been implemented.
 
@@ -371,6 +408,7 @@ class UnimplementedError(OpError):
                                              UNIMPLEMENTED)
 
 
+@tf_export("errors.InternalError")
 class InternalError(OpError):
   """Raised when the system experiences an internal error.
 
@@ -385,6 +423,7 @@ class InternalError(OpError):
     super(InternalError, self).__init__(node_def, op, message, INTERNAL)
 
 
+@tf_export("errors.UnavailableError")
 class UnavailableError(OpError):
   """Raised when the runtime is currently unavailable.
 
@@ -399,6 +438,7 @@ class UnavailableError(OpError):
                                            UNAVAILABLE)
 
 
+@tf_export("errors.DataLossError")
 class DataLossError(OpError):
   """Raised when unrecoverable data loss or corruption is encountered.
 
@@ -437,10 +477,12 @@ _EXCEPTION_CLASS_TO_CODE = dict((
     (class_, code) for (code, class_) in _CODE_TO_EXCEPTION_CLASS.items()))
 
 
+@tf_export("errors.exception_type_from_error_code")
 def exception_type_from_error_code(error_code):
   return _CODE_TO_EXCEPTION_CLASS[error_code]
 
 
+@tf_export("errors.error_code_from_exception_type")
 def error_code_from_exception_type(cls):
   return _EXCEPTION_CLASS_TO_CODE[cls]
 
@@ -457,7 +499,8 @@ def _make_specific_exception(node_def, op, message, error_code):
 # Named like a function for backwards compatibility with the
 # @tf_contextlib.contextmanager version, which was switched to a class to avoid
 # some object creation overhead.
-class raise_exception_on_not_ok_status(object):  # pylint: disable=invalid-name
+@tf_export("errors.raise_exception_on_not_ok_status")  # pylint: disable=invalid-name
+class raise_exception_on_not_ok_status(object):
   """Context manager to check for C API status."""
 
   def __enter__(self):
diff --git a/tensorflow/python/framework/framework_lib.py b/tensorflow/python/framework/framework_lib.py
index d16fe979e6ef9a41063c3a2b3e8a3e18de2aa9d7..3172f3c2c3d259d2c3f2b340b101aef043d0fc33 100644
--- a/tensorflow/python/framework/framework_lib.py
+++ b/tensorflow/python/framework/framework_lib.py
@@ -118,7 +118,7 @@ from tensorflow.python.framework.ops import register_tensor_conversion_function
 
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
-from tensorflow.python.framework.dtypes import *
+from tensorflow.python.framework.dtypes import *  # pylint: disable=redefined-builtin
 
 # Load a TensorFlow plugin
 from tensorflow.python.framework.load_library import *
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 29cf2237244810a888d53927f44889b4a4e9704e..cba225e749d88a45c43266e45172a7335a8e0b71 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -82,8 +82,8 @@ class Defun(object):
     return x + y, x - y
 
   # Building the graph.
-  a = tf.Constant([1.0])
-  b = tf.Constant([2.0])
+  a = tf.constant([1.0])
+  b = tf.constant([2.0])
   c, d = MyFunc(a, b, name='mycall')
   ```
   """
@@ -417,7 +417,7 @@ class _DefinedFunction(object):
       if self._func_name:
         assert self._func_name == self._op_def.name
       else:
-        self._func_name = self._op_def.name
+        self._func_name = compat.as_str(self._op_def.name)
 
   def _set_c_attrs(self, attrs):
     """Sets `attrs` as attributes of self._c_func.
@@ -682,7 +682,7 @@ class _FuncGraph(ops.Graph):
 
   def create_op(self, op_type, inputs, data_types, **kwargs):
     for i, x in enumerate(inputs):
-      if x.graph is not self:
+      if isinstance(x, ops.EagerTensor) or x.graph is not self:
         # Referring to a tensor from other graph.
         if x in self._captured:
           # Captured already.
@@ -692,7 +692,10 @@ class _FuncGraph(ops.Graph):
         else:
           # Substitute with a placeholder.
           self.extra_inputs.append(x)
-          ph = array_ops.placeholder(x.dtype, shape=x.get_shape())
+          # Hoist the new input placeholder out of any control flow context
+          # we're currently in.
+          with ops.control_dependencies(None):
+            ph = array_ops.placeholder(x.dtype, shape=x.get_shape())
           # pylint: disable=protected-access
           ph._handle_data = x._handle_data
           # pylint: enable=protected-access
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index ba43e9199b4764fef4b86056a1ae57bd9070003e..301a7f682dde8dbeccd1e81675b0059433990a09 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -19,12 +19,14 @@ from __future__ import division
 from __future__ import print_function
 
 import re
+import sys
 import time
 
 import numpy as np
 
 from tensorflow.core.framework import function_pb2
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -84,6 +86,21 @@ class FunctionTest(test.TestCase):
       with session.Session() as sess:
         self.assertAllEqual([18.0], sess.run(call))
 
+  def testIdentityImplicitDeref(self):
+
+    @function.Defun(dtypes.float32, func_name="MyIdentity")
+    def MyIdentityFunc(a):
+      return a
+
+    with ops.Graph().as_default():
+      var = variables.Variable([18.0])
+      call = MyIdentityFunc(var._ref())  # pylint: disable=protected-access
+      self.assertEqual("MyIdentity", call.op.name)
+      for cfg in _OptimizerOptions():
+        with session.Session(config=cfg) as sess:
+          sess.run(var.initializer)
+          self.assertAllEqual([18.0], sess.run(call))
+
   def testIdentityOutputName(self):
 
     @function.Defun(
@@ -450,13 +467,17 @@ class FunctionTest(test.TestCase):
                                          lambda y: AssertFail(y), [x])
       # pylint: enable=unnecessary-lambda
 
+    rewriter_config = rewriter_config_pb2.RewriterConfig(
+        dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF)
     # Enables inlining.
-    config = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
-        optimizer_options=config_pb2.OptimizerOptions(
-            opt_level=config_pb2.OptimizerOptions.L0,
-            do_common_subexpression_elimination=True,
-            do_function_inlining=True,
-            do_constant_folding=True)))
+    config = config_pb2.ConfigProto(
+        graph_options=config_pb2.GraphOptions(
+            optimizer_options=config_pb2.OptimizerOptions(
+                opt_level=config_pb2.OptimizerOptions.L0,
+                do_common_subexpression_elimination=True,
+                do_function_inlining=True,
+                do_constant_folding=True),
+            rewrite_options=rewriter_config))
 
     with session.Session(config=config) as sess:
       # Since the 'False' branch is not taken, the assertion should not fire.
@@ -724,6 +745,38 @@ class FunctionTest(test.TestCase):
         # NOTE: We still do not support capturing control deps.
         _ = Foo(x)
 
+  def testCaptureInWhileLoop(self):
+    g = ops.Graph()
+    with g.as_default():
+      x = constant_op.constant(1)
+
+      @function.Defun()
+      def Foo():
+        return control_flow_ops.while_loop(lambda i: i < 10,
+                                           lambda i: i + x,
+                                           [0])
+      y = Foo()
+
+    with self.test_session(graph=g) as sess:
+      self.assertEqual(sess.run(y), 10)
+
+  def testCaptureInCond(self):
+    g = ops.Graph()
+    with g.as_default():
+      x = constant_op.constant(1)
+
+      @function.Defun(dtypes.bool)
+      def Foo(pred):
+        return control_flow_ops.cond(pred,
+                                     lambda: x,
+                                     lambda: x + 1)
+      y = Foo(True)
+      z = Foo(False)
+
+    with self.test_session(graph=g) as sess:
+      self.assertEqual(sess.run(y), 1)
+      self.assertEqual(sess.run(z), 2)
+
   def testStableName(self):
 
     @function.Defun()
@@ -733,8 +786,12 @@ class FunctionTest(test.TestCase):
     # We added more randomness to function names in C API.
     # TODO(iga): Remove this if statement when we switch to C API.
     if ops._USE_C_API:  # pylint: disable=protected-access
-      self.assertEqual("Foo_aCYSbwBkR5A",
-                       Foo.instantiate([dtypes.float32] * 3).name)
+      if sys.byteorder == "big":
+        self.assertEqual("Foo_kEdkAG8SJvg",
+                         Foo.instantiate([dtypes.float32] * 3).name)
+      else:
+        self.assertEqual("Foo_aCYSbwBkR5A",
+                         Foo.instantiate([dtypes.float32] * 3).name)
     else:
       self.assertEqual("Foo_d643acf7",
                        Foo.instantiate([dtypes.float32] * 3).name)
@@ -882,6 +939,94 @@ class FunctionTest(test.TestCase):
           np.array([1.0, 0.0]).astype(np.float32),
           sess.run(dinp, {inp: x}))
 
+  def testFunctionMarkedStateful(self):
+
+    @function.Defun(dtypes.int32, dtypes.float32)
+    def Foo(t, x):
+      return x[t]
+
+    @function.Defun(dtypes.int64)
+    def Bar(x):
+      return x
+
+    # NOTE(mrry): All functions are currently considered stateless by the
+    # runtime, so we simulate a "stateful" function.
+    # TODO(b/70565970): Remove this hack when we are able to build stateful
+    # functions using the API.
+    # pylint: disable=protected-access
+    Foo._signature.is_stateful = True
+    Bar._signature.is_stateful = True
+    # pylint: enable=protected-access
+
+    result_1 = Foo(3, [1.0, 2.0, 3.0, 4.0])
+    result_2 = Bar(constant_op.constant(100, dtype=dtypes.int64))
+
+    with session.Session() as sess:
+      self.assertEqual(4.0, sess.run(result_1))
+      self.assertEqual(100, sess.run(result_2))
+      self.assertEqual((4.0, 100), sess.run((result_1, result_2)))
+
+  def testStatefulFunction(self):
+
+    @function.Defun()
+    def FunctionWithStatelessOp():
+      return constant_op.constant(42.0)
+
+    @function.Defun()
+    def FunctionWithStatefulOp():
+      return random_ops.random_uniform([100], maxval=10, dtype=dtypes.int32)
+
+    @function.Defun()
+    def FunctionWithStatelessFunctionCall():
+      return FunctionWithStatelessOp()
+
+    @function.Defun()
+    def FunctionWithStatefulFunctionCall():
+      return FunctionWithStatefulOp()
+
+    # Test that the `is_stateful` bit is propagated.
+    self.assertFalse(FunctionWithStatelessOp.definition.signature.is_stateful)
+    self.assertTrue(FunctionWithStatefulOp.definition.signature.is_stateful)
+    self.assertFalse(
+        FunctionWithStatelessFunctionCall.definition.signature.is_stateful)
+    self.assertTrue(
+        FunctionWithStatefulFunctionCall.definition.signature.is_stateful)
+
+    # Ensure that two invocations of the same random-number-generating
+    # function produce different results.
+    result1 = FunctionWithStatefulFunctionCall()
+    result2 = FunctionWithStatefulFunctionCall()
+
+    # Statefulness affects how the function is treated by the various
+    # optimization passes, so run the test in each optimizer
+    # configuration.
+    for config in _OptimizerOptions():
+      with session.Session(config=config) as sess:
+        val1, val2 = sess.run((result1, result2))
+        self.assertFalse(all(val1 == val2))
+        val3, val4 = sess.run((result1, result2))
+        self.assertFalse(all(val3 == val1))
+        self.assertFalse(all(val4 == val2))
+
+  def testSameFunctionOnTwoDevices(self):
+
+    @function.Defun(dtypes.float32)
+    def AddOne(x):
+      return x + 1.0
+
+    with ops.device("/cpu:0"):
+      f_0 = AddOne(41.0)
+
+    with ops.device("/cpu:1"):
+      f_1 = AddOne(43.0)
+
+    for config in _OptimizerOptions():
+      config.device_count["CPU"] = 2
+      with session.Session(config=config) as sess:
+        self.assertEqual(42.0, sess.run(f_0))
+        self.assertEqual(44.0, sess.run(f_1))
+        self.assertEqual((42.0, 44.0), sess.run((f_0, f_1)))
+
 
 @test_util.with_c_api
 class FunctionsFromProtos(test.TestCase):
@@ -1313,7 +1458,7 @@ class FunctionInlineControlTest(test.TestCase):
       def Cell(v):
         # If v is a vector [n, 1], x is a big square matrix.
         x = math_ops.tanh(v + array_ops.transpose(v, [1, 0]))
-        return math_ops.reduce_sum(x, 1, keep_dims=True)
+        return math_ops.reduce_sum(x, 1, keepdims=True)
 
       @function.Defun(dtype)
       def Forward(x):
diff --git a/tensorflow/python/framework/graph_io.py b/tensorflow/python/framework/graph_io.py
index a0ea4ad48eb84b22f42ea840513ebefbf6b4abbe..be30b16f5f0a76469226687fc1a419882b96f133 100644
--- a/tensorflow/python/framework/graph_io.py
+++ b/tensorflow/python/framework/graph_io.py
@@ -24,8 +24,10 @@ import os.path
 from google.protobuf import text_format
 from tensorflow.python.framework import ops
 from tensorflow.python.lib.io import file_io
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('train.write_graph')
 def write_graph(graph_or_graph_def, logdir, name, as_text=True):
   """Writes a graph proto to a file.
 
diff --git a/tensorflow/python/framework/graph_to_function_def.py b/tensorflow/python/framework/graph_to_function_def.py
index 448f87aa6ee31127113ed10aee8e4e0fa06482f1..5bf30ee68491c5c0686cc9572f024299dbfe587a 100644
--- a/tensorflow/python/framework/graph_to_function_def.py
+++ b/tensorflow/python/framework/graph_to_function_def.py
@@ -58,7 +58,7 @@ def _is_in_placeholders(op, func_arg_placeholders):
 
 
 def _get_node_def(op):
-  return op._node_def  # pylint: disable=protected-access
+  return op.node_def  # pylint: disable=protected-access
 
 
 def _get_op_def(op):
@@ -110,6 +110,13 @@ def _add_op_node(op, func, input_dict):
                                                (node_def.input[i],
                                                 input_dict.items()))
       node_def.input[i] = input_dict[node_def.input[i]]
+  # The function is stateful if any of its operations are stateful.
+  # NOTE(mrry): The "Const" node typically does not have an `OpDef` associated
+  # with it, so we assume any nodes without an `OpDef` are stateless.
+  # TODO(skyewm): Remove the `is not None` test after we transition to the C
+  # API.
+  if op.op_def is not None and op.op_def.is_stateful:
+    func.signature.is_stateful = True
 
 
 def graph_to_function_def(graph, operations, inputs, outputs, out_names=None):
diff --git a/tensorflow/python/framework/graph_util_impl.py b/tensorflow/python/framework/graph_util_impl.py
index 6c7b4553881637ce0b2ec63449bde0a397ef2d72..5a543317e665a940841714fd72d834a430f8406a 100644
--- a/tensorflow/python/framework/graph_util_impl.py
+++ b/tensorflow/python/framework/graph_util_impl.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 _VARIABLE_OPS = {
     "Assign",
@@ -49,6 +50,7 @@ def _is_variable_op(op):
   return op in _VARIABLE_OPS
 
 
+@tf_export("graph_util.must_run_on_cpu")
 def must_run_on_cpu(node, pin_variables_on_cpu=False):
   """Returns True if the given node_def must run on CPU, otherwise False.
 
@@ -147,6 +149,7 @@ def _bfs_for_reachable_nodes(target_nodes, name_to_input_name):
   return nodes_to_keep
 
 
+@tf_export("graph_util.extract_sub_graph")
 def extract_sub_graph(graph_def, dest_nodes):
   """Extract the subgraph that can reach any of the nodes in 'dest_nodes'.
 
@@ -184,6 +187,7 @@ def extract_sub_graph(graph_def, dest_nodes):
   return out
 
 
+@tf_export("graph_util.tensor_shape_from_node_def_name")
 def tensor_shape_from_node_def_name(graph, input_name):
   """Convenience function to get a shape from a NodeDef's input string."""
   # To get a tensor, the name must be in the form <input>:<port>, for example
@@ -198,6 +202,7 @@ def tensor_shape_from_node_def_name(graph, input_name):
   return shape
 
 
+@tf_export("graph_util.convert_variables_to_constants")
 def convert_variables_to_constants(sess,
                                    input_graph_def,
                                    output_node_names,
@@ -270,6 +275,7 @@ def convert_variables_to_constants(sess,
   return output_graph_def
 
 
+@tf_export("graph_util.remove_training_nodes")
 def remove_training_nodes(input_graph, protected_nodes=None):
   """Prunes out nodes that aren't needed for inference.
 
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index 434cbda7ad6ede8f6d07dc2ecfb74ea42cad2d46..6ecc1a40ae14760dd39242aaf595b32a9decdc9f 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """A utility function for importing TensorFlow graphs."""
 from __future__ import absolute_import
 from __future__ import division
@@ -36,14 +35,15 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.util import compat
 from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util.tf_export import tf_export
 
 
 # TODO(josh11b): SWIG the code from node_def_util instead of duplicating
 # the logic here.
 def _GetNodeAttr(node_def, attr_name):
   if attr_name not in node_def.attr:
-    raise ValueError('Expected one attr with name %r in %s.'
-                     % (attr_name, str(node_def)))
+    raise ValueError('Expected one attr with name %r in %s.' % (attr_name,
+                                                                str(node_def)))
   return node_def.attr[attr_name]
 
 
@@ -150,7 +150,7 @@ def _MaybeDevice(device):
     yield
 
 
-def _ProcessGraphDefParam(graph_def):
+def _ProcessGraphDefParam(graph_def, op_dict):
   """Type-checks and possibly canonicalizes `graph_def`."""
   if not isinstance(graph_def, graph_pb2.GraphDef):
     # `graph_def` could be a dynamically-created message, so try a duck-typed
@@ -161,6 +161,22 @@ def _ProcessGraphDefParam(graph_def):
       graph_def.MergeFrom(old_graph_def)
     except TypeError:
       raise TypeError('graph_def must be a GraphDef proto.')
+  else:
+    # If we're using the graph_def provided by the caller, modify graph_def
+    # in-place to add attr defaults to the NodeDefs (this is visible to the
+    # caller).
+    # NOTE(skyewm): this is undocumented behavior that at least meta_graph.py
+    # depends on. It might make sense to move this to meta_graph.py and have
+    # import_graph_def not modify the graph_def argument (we'd have to make sure
+    # this doesn't break anything else.)
+    for node in graph_def.node:
+      if node.op not in op_dict:
+        # Assume unrecognized ops are functions for now. TF_ImportGraphDef will
+        # report an error if the op is actually missing.
+        continue
+      op_def = op_dict[node.op]
+      _SetDefaultAttrValues(node, op_def)
+
   return graph_def
 
 
@@ -169,9 +185,8 @@ def _ProcessInputMapParam(input_map):
   if input_map is None:
     input_map = {}
   else:
-    if not (isinstance(input_map, dict)
-            and all(isinstance(k, compat.bytes_or_text_types)
-                    for k in input_map.keys())):
+    if not (isinstance(input_map, dict) and all(
+        isinstance(k, compat.bytes_or_text_types) for k in input_map.keys())):
       raise TypeError('input_map must be a dictionary mapping strings to '
                       'Tensor objects.')
   return input_map
@@ -179,12 +194,12 @@ def _ProcessInputMapParam(input_map):
 
 def _ProcessReturnElementsParam(return_elements):
   """Type-checks and possibly canonicalizes `return_elements`."""
-  if return_elements is not None:
-    return_elements = tuple(return_elements)
-    if not all(isinstance(x, compat.bytes_or_text_types)
-               for x in return_elements):
-      raise TypeError('return_elements must be a list of strings.')
-  return return_elements
+  if return_elements is None:
+    return None
+  if not all(
+      isinstance(x, compat.bytes_or_text_types) for x in return_elements):
+    raise TypeError('return_elements must be a list of strings.')
+  return tuple(compat.as_str(x) for x in return_elements)
 
 
 def _FindAttrInOpDef(attr_name, op_def):
@@ -194,24 +209,150 @@ def _FindAttrInOpDef(attr_name, op_def):
   return None
 
 
-def _PopulateTFImportGraphDefOptions(options, prefix, return_elements):
+def _RemoveDefaultAttrs(op_dict, producer_op_list, graph_def):
+  """Removes unknown default attrs according to `producer_op_list`.
+
+  Removes any unknown attrs in `graph_def` (i.e. attrs that do not appear in
+  the OpDefs in `op_dict`) that have a default value in `producer_op_list`.
+
+  Args:
+    op_dict: dict mapping operation name to OpDef.
+    producer_op_list: OpList proto.
+    graph_def: GraphDef proto
+  """
+  producer_op_dict = {op.name: op for op in producer_op_list.op}
+  for node in graph_def.node:
+    # Remove any default attr values that aren't in op_def.
+    if node.op in producer_op_dict:
+      op_def = op_dict[node.op]
+      producer_op_def = producer_op_dict[node.op]
+      # We make a copy of node.attr to iterate through since we may modify
+      # node.attr inside the loop.
+      for key in list(node.attr):
+        if _FindAttrInOpDef(key, op_def) is None:
+          # No attr_def in consumer, look in producer.
+          attr_def = _FindAttrInOpDef(key, producer_op_def)
+          if (attr_def and attr_def.HasField('default_value') and
+              node.attr[key] == attr_def.default_value):
+            # Unknown attr had default value in producer, delete it so it can be
+            # understood by consumer.
+            del node.attr[key]
+
+
+def _ConvertInputMapValues(name, input_map):
+  """Ensures all input map values are tensors.
+
+  This should be called from inside the import name scope.
+
+  Args:
+    name: the `name` argument passed to import_graph_def
+    input_map: the `input_map` argument passed to import_graph_def.
+
+  Returns:
+    An possibly-updated version of `input_map`.
+
+  Raises:
+    ValueError: if input map values cannot be converted due to empty name scope.
+  """
+  if not all(isinstance(v, ops.Tensor) for v in input_map.values()):
+    if name == '':  # pylint: disable=g-explicit-bool-comparison
+      raise ValueError(
+          'tf.import_graph_def() requires a non-empty `name` if `input_map` '
+          'contains non-Tensor values. Try calling tf.convert_to_tensor() on '
+          '`input_map` values before calling tf.import_graph_def().')
+    with ops.name_scope('_inputs'):
+      input_map = {k: ops.convert_to_tensor(v) for k, v in input_map.items()}
+  return input_map
+
+
+def _PopulateTFImportGraphDefOptions(options, prefix, input_map,
+                                     return_elements):
   """Populates the TF_ImportGraphDefOptions `options`."""
   c_api.TF_ImportGraphDefOptionsSetPrefix(options, prefix)
-
+  c_api.TF_ImportGraphDefOptionsSetUniquifyNames(options, True)
+
+  for input_src, input_dst in input_map.items():
+    input_src = compat.as_str(input_src)
+    if input_src.startswith('^'):
+      src_name = compat.as_bytes(input_src[1:])
+      dst_op = input_dst._as_tf_output().oper  # pylint: disable=protected-access
+      c_api.TF_ImportGraphDefOptionsRemapControlDependency(
+          options, src_name, dst_op)
+    else:
+      src_name, src_idx = _ParseTensorName(input_src)
+      src_name = compat.as_str(src_name)
+      dst_output = input_dst._as_tf_output()  # pylint: disable=protected-access
+      c_api.TF_ImportGraphDefOptionsAddInputMapping(options, src_name, src_idx,
+                                                    dst_output)
   for name in return_elements or []:
     if ':' in name:
       op_name, index = _ParseTensorName(name)
+      op_name = compat.as_str(op_name)
       c_api.TF_ImportGraphDefOptionsAddReturnOutput(options, op_name, index)
     else:
-      c_api.TF_ImportGraphDefOptionsAddReturnOperation(options, name)
+      c_api.TF_ImportGraphDefOptionsAddReturnOperation(options,
+                                                       compat.as_str(name))
 
 
 def _ProcessNewOps(graph):
   """Processes the newly-added TF_Operations in `graph`."""
-  for c_op in c_api_util.new_tf_operations(graph):
-    graph._create_op_from_tf_operation(c_op)  # pylint: disable=protected-access
-
-  # TODO(skyewm): colocation logic
+  # Maps from a node to the names of the ops it's colocated with, if colocation
+  # is specified in the attributes.
+  colocation_pairs = {}
+
+  for new_op in graph._add_new_tf_operations(compute_devices=False):  # pylint: disable=protected-access
+    colocation_names = _GetColocationNames(new_op)
+    if colocation_names:
+      colocation_pairs[new_op] = colocation_names
+      # Don't apply this op's device function, since colocation constraints
+      # override device functions. Note that this op's device may still be set
+      # by the loop below.
+    else:
+      with _MaybeDevice(new_op.device):
+        graph._apply_device_functions(new_op)  # pylint: disable=protected-access
+
+  # The following loop populates the device field of ops that are colocated
+  # with another op.  This is implied by the colocation attribute, but we
+  # propagate the device field for completeness.
+  for op, coloc_op_list in colocation_pairs.items():
+    coloc_device = None
+    # Find any device in the list of colocated ops that have a device, if it
+    # exists.  We assume that if multiple ops have devices, they refer to the
+    # same device.  Otherwise, a runtime error will occur since the colocation
+    # property cannot be guaranteed.
+    #
+    # One possible improvement is to try to check for compatibility of all
+    # devices in this list at import time here, which would require
+    # implementing a compatibility function for device specs in python.
+    for coloc_op_name in coloc_op_list:
+      try:
+        coloc_op = graph._get_operation_by_name_unsafe(coloc_op_name)  # pylint: disable=protected-access
+      except KeyError:
+        raise ValueError('Specified colocation to an op that '
+                         'does not exist during import: %s in %s' %
+                         (coloc_op_name, op.name))
+      if coloc_op.device:
+        coloc_device = pydev.DeviceSpec.from_string(coloc_op.device)
+        break
+    if coloc_device:
+      op._set_device(coloc_device)  # pylint: disable=protected-access
+
+
+def _GetColocationNames(op):
+  """Returns names of the ops that `op` should be colocated with."""
+  colocation_names = []
+  try:
+    class_values = op.get_attr('_class')
+  except ValueError:
+    # No _class attr
+    return
+  for val in class_values:
+    val = compat.as_str(val)
+    if val.startswith('loc:@'):
+      colocation_node_name = val[len('loc:@'):]
+      if colocation_node_name != op.name:
+        colocation_names.append(colocation_node_name)
+  return colocation_names
 
 
 def _GatherReturnElements(requested_return_elements, graph, results):
@@ -243,12 +384,27 @@ def _GatherReturnElements(requested_return_elements, graph, results):
   return combined_return_elements
 
 
+def _SetDefaultAttrValues(node_def, op_def):
+  """Set any default attr values in `node_def` that aren't present."""
+  assert node_def.op == op_def.name
+  for attr_def in op_def.attr:
+    key = attr_def.name
+    if attr_def.HasField('default_value'):
+      value = node_def.attr[key]
+      if value is None or value.WhichOneof('value') is None:
+        node_def.attr[key].CopyFrom(attr_def.default_value)
+
+
+@tf_export('import_graph_def')
 @deprecated_args(None, 'Please file an issue at '
                  'https://github.com/tensorflow/tensorflow/issues if you depend'
-                 ' on this feature.',
-                 'op_dict')
-def import_graph_def(graph_def, input_map=None, return_elements=None,
-                     name=None, op_dict=None, producer_op_list=None):
+                 ' on this feature.', 'op_dict')
+def import_graph_def(graph_def,
+                     input_map=None,
+                     return_elements=None,
+                     name=None,
+                     op_dict=None,
+                     producer_op_list=None):
   """Imports the graph from `graph_def` into the current default `Graph`.
 
   This function provides a way to import a serialized TensorFlow
@@ -290,16 +446,15 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
       do not appear in `graph_def`, or `graph_def` is not well-formed (e.g.
       it refers to an unknown tensor).
   """
-  graph_def = _ProcessGraphDefParam(graph_def)
+  op_dict = op_def_registry.get_registered_ops()
+
+  graph_def = _ProcessGraphDefParam(graph_def, op_dict)
   input_map = _ProcessInputMapParam(input_map)
   return_elements = _ProcessReturnElementsParam(return_elements)
 
-  op_dict = op_def_registry.get_registered_ops()
-
-  if producer_op_list is None:
-    producer_op_dict = None
-  else:
-    producer_op_dict = {op.name: op for op in producer_op_list.op}
+  if producer_op_list is not None:
+    # TODO(skyewm): make a copy of graph_def so we're not mutating the argument?
+    _RemoveDefaultAttrs(op_dict, producer_op_list, graph_def)
 
   graph = ops.get_default_graph()
 
@@ -312,17 +467,54 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
       else:
         prefix = ''
 
+      # Generate any input map tensors inside name scope
+      input_map = _ConvertInputMapValues(name, input_map)
+
     scoped_options = c_api_util.ScopedTFImportGraphDefOptions()
     options = scoped_options.options
-    _PopulateTFImportGraphDefOptions(options, prefix, return_elements)
+    _PopulateTFImportGraphDefOptions(options, prefix, input_map,
+                                     return_elements)
 
     with c_api_util.tf_buffer(graph_def.SerializeToString()) as serialized:
-      with errors.raise_exception_on_not_ok_status() as status:
-        results = c_api.TF_GraphImportGraphDefWithResults(
-            graph._c_graph, serialized, options, status)  # pylint: disable=protected-access
+      try:
+        with errors.raise_exception_on_not_ok_status() as status:
+          results = c_api.TF_GraphImportGraphDefWithResults(
+              graph._c_graph, serialized, options, status)  # pylint: disable=protected-access
+      except errors.InvalidArgumentError as e:
+        # Convert to ValueError for backwards compatibility.
+        raise ValueError(str(e))
 
     _ProcessNewOps(graph)
 
+    # Create _DefinedFunctions for any imported functions.
+    #
+    # We do this by creating _DefinedFunctions directly from `graph_def`, and
+    # adding them to `graph`. Adding an existing function to a TF_Graph is a
+    # no-op, so this only has the effect of updating the Python state (usually
+    # _DefinedFunction.add_to_graph also adds the function to the TF_Graph).
+    #
+    # TODO(skyewm): fetch the TF_Functions directly from the TF_Graph
+    # TODO(skyewm): avoid sending serialized FunctionDefs back to the TF_Graph
+    if graph_def.library and graph_def.library.function:
+      # pylint: disable=protected-access
+      functions = function._from_library(graph_def.library)
+      for f in functions:
+        f.add_to_graph(graph)
+      # pylint: enable=protected-access
+
+    # Treat input mappings that don't appear in the graph as an error, because
+    # they are likely to be due to a typo.
+    missing_unused_input_keys = (
+        c_api.TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
+            results))
+    if missing_unused_input_keys:
+      missing_unused_input_keys = [
+          compat.as_str(s) for s in missing_unused_input_keys
+      ]
+      raise ValueError(
+          'Attempted to map inputs that were not found in graph_def: [%s]' %
+          ', '.join(missing_unused_input_keys))
+
     if return_elements is None:
       return None
     else:
@@ -359,16 +551,7 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
       # more nuanced.
       g.graph_def_versions.CopyFrom(graph_def.versions)
 
-      if not all(isinstance(v, ops.Tensor) for v in input_map.values()):
-        if not scope:
-          # The caller must have passed `name=''`.
-          raise ValueError(
-              'tf.import_graph_def() requires a non-empty `name` if `input_map`'
-              ' contains non-Tensor values. Try calling tf.convert_to_tensor() '
-              'on `input_map` values before calling tf.import_graph_def().')
-        with ops.name_scope('_inputs'):
-          input_map = {k: ops.convert_to_tensor(v)
-                       for k, v in input_map.items()}
+      input_map = _ConvertInputMapValues(name, input_map)
 
       # NOTE(mrry): We do this in two passes, because there may be a cycle in
       # `graph_def`.
@@ -378,31 +561,9 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
         # Check to see if this op's name matches a previously seen op
         if node.name in name_to_op:
           raise ValueError('Duplicate name \'%s\' in GraphDef.' % node.name)
-        # Set any default attr values that aren't present.
         if node.op not in op_dict:
           raise ValueError('No op named %s in defined operations.' % node.op)
         op_def = op_dict[node.op]
-        for attr_def in op_def.attr:
-          key = attr_def.name
-          if attr_def.HasField('default_value'):
-            value = node.attr[key]
-            if value is None or value.WhichOneof('value') is None:
-              node.attr[key].CopyFrom(attr_def.default_value)
-        if producer_op_dict:
-          # Remove any default attr values that aren't in op_def.
-          if node.op in producer_op_dict:
-            producer_op_def = producer_op_dict[node.op]
-            # We make a copy of node.attr to iterate through since we
-            # may modify node.attr inside the loop.
-            for key in list(node.attr):
-              if _FindAttrInOpDef(key, op_def) is None:
-                # No attr_def in consumer, look in producer.
-                attr_def = _FindAttrInOpDef(key, producer_op_def)
-                if (attr_def and attr_def.HasField('default_value') and
-                    node.attr[key] == attr_def.default_value):
-                  # Unknown attr had default value in producer, delete it
-                  # so it can be understood by consumer.
-                  del node.attr[key]
 
         output_types = _OutputTypes(node, op_dict)
         name_to_op[node.name] = g.create_op(
@@ -505,13 +666,13 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
                   node, 'Input tensor %r %s' % (input_name, te)))
 
         # pylint: disable=protected-access
-        if op._input_dtypes != input_types:
+        if op._input_types != input_types:
           raise ValueError(
               _InvalidNodeMessage(
                   node,
                   'Input types mismatch (expected %r but got %r)'
                   % (', '.join(dtypes.as_dtype(x).name for x in input_types),
-                     ', '.join(x.name for x in op._input_dtypes))))
+                     ', '.join(x.name for x in op._input_types))))
         # pylint: enable=protected-access
 
         if not g._is_function(op.type):  # pylint: disable=protected-access
diff --git a/tensorflow/python/framework/importer_test.py b/tensorflow/python/framework/importer_test.py
index 5a6187c8a6df1d4c076722c7655d4bd1b276c211..bf5d9fe0936882c242198bdc7118f9f3a4e79260 100644
--- a/tensorflow/python/framework/importer_test.py
+++ b/tensorflow/python/framework/importer_test.py
@@ -34,6 +34,7 @@ from tensorflow.python.framework import test_ops  # pylint: disable=unused-impor
 from tensorflow.python.framework import test_util
 from tensorflow.python.framework import versions
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
@@ -111,8 +112,6 @@ class ImportGraphDefTest(test.TestCase):
       self.assertNotEqual(None, a.op_def)
 
   def testMultipleImport(self):
-    if ops._USE_C_API: return  # TODO(skyewm): set uniquify_names
-
     graph_def = self._MakeGraphDef("""
     node { name: 'A' op: 'IntOutput' }
     node { name: 'B' op: 'IntInput' input: 'A:0' }
@@ -155,17 +154,36 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(b3.name, "A_3/B")
       self.assertEqual(list(b3.inputs), [a3.outputs[0]])
 
+      # Import with an already-used name but with a '/' to indicate an
+      # "absolute" name scope (see the Graph.name_scope docstring).
+      a_a, a_b = importer.import_graph_def(
+          graph_def,
+          return_elements=["A", "B"],
+          name="A/")
+      self.assertEqual(a_a.name, "A/A")
+      self.assertEqual(a_b.name, "A/B")
+      self.assertEqual(list(a_b.inputs), [a_a.outputs[0]])
+
+      # Repeat the same import.
+      a_a1, a_b1 = importer.import_graph_def(
+          graph_def,
+          return_elements=["A", "B"],
+          name="A/")
+      self.assertEqual(a_a1.name, "A/A_1")
+      self.assertEqual(a_b1.name, "A/B_1")
+      self.assertEqual(list(a_b1.inputs), [a_a1.outputs[0]])
+
       # Import with existing de-duped node names
-      a4, b4 = importer.import_graph_def(
+      a1_1, b1_1 = importer.import_graph_def(
           self._MakeGraphDef("""
           node { name: 'A_1' op: 'IntOutput' }
           node { name: 'B_1' op: 'IntInput' input: 'A_1:0' }
           """),
           return_elements=["A_1", "B_1"],
           name="")
-      self.assertEqual(a4.name, "A_1_1")
-      self.assertEqual(b4.name, "B_1_1")
-      self.assertEqual(list(b4.inputs), [a4.outputs[0]])
+      self.assertEqual(a1_1.name, "A_1_1")
+      self.assertEqual(b1_1.name, "B_1_1")
+      self.assertEqual(list(b1_1.inputs), [a1_1.outputs[0]])
 
       # Create a name scope and then import node with same name
       with ops.name_scope("foo"):
@@ -201,8 +219,6 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(outer_inner_c.name, "outer/inner/c_1")
 
   def testInputMap(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
     with ops.Graph().as_default():
       feed_a_0 = constant_op.constant(0, dtype=dtypes.int32)
       feed_b_1 = constant_op.constant(1, dtype=dtypes.int32)
@@ -230,8 +246,6 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(d.inputs[1], feed_b_1)
 
   def testInputMapBytes(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
     with ops.Graph().as_default():
       feed_a_0 = constant_op.constant(0, dtype=dtypes.int32)
       feed_b_1 = constant_op.constant(1, dtype=dtypes.int32)
@@ -259,8 +273,6 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(d.inputs[1], feed_b_1)
 
   def testInputMapUnicode(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
     with ops.Graph().as_default():
       feed_a_0 = constant_op.constant(0, dtype=dtypes.int32)
       feed_b_1 = constant_op.constant(1, dtype=dtypes.int32)
@@ -299,8 +311,6 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(b.inputs[0], a.outputs[0])
 
   def testInputMapImplicitZerothOutput(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
     with ops.Graph().as_default():
       feed_a_0 = constant_op.constant(0, dtype=dtypes.int32)
       b, = importer.import_graph_def(
@@ -341,47 +351,46 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(d.inputs[1], b.outputs[0])
 
       self.assertEqual(a.outputs[0].dtype, dtypes.int32_ref)
-      self.assertEqual(c._input_dtypes, [dtypes.int32, dtypes.int32])
+      self.assertEqual(c._input_types, [dtypes.int32, dtypes.int32])
       self.assertEqual(c.outputs, [])
-      self.assertEqual(d._input_dtypes, [dtypes.int32_ref, dtypes.int32])
+      self.assertEqual(d._input_types, [dtypes.int32_ref, dtypes.int32])
       self.assertEqual(d.outputs, [])
 
-  def testCyclic(self):
-    # Importing cycles not supported with C API enabled (this test will
-    # eventually be deleted).
-    # TODO(skyewm): write while loop test
-    if ops._USE_C_API: return
+  def testWhileLoop(self):
+    # Produce GraphDef containing while loop.
+    graph = ops.Graph()
+    with graph.as_default():
+      r = control_flow_ops.while_loop(lambda i: i < 10, lambda i: i + 1, [0])
+      # Add an op that consumes the while loop output.
+      math_ops.add(r, 1)
+    graph_def = graph.as_graph_def()
 
+    # Import the GraphDef and make sure it runs.
     with ops.Graph().as_default():
-      a, b = importer.import_graph_def(
-          self._MakeGraphDef("""
-          node { name: 'A' op: 'Unary'
-                 attr { key: 'T' value { type: DT_INT32 } } input: 'B:0' }
-          node { name: 'B' op: 'Unary'
-                 attr { key: 'T' value { type: DT_INT32 } } input: 'A:0' }
-          """),
-          return_elements=["A", "B"])
-
-      self.assertEqual(a.inputs[0], b.outputs[0])
-      self.assertEqual(b.inputs[0], a.outputs[0])
+      imported_r, = importer.import_graph_def(graph_def,
+                                              return_elements=[r.name])
+      self.assertEqual(imported_r.name, "import/" + r.name)
+      with self.test_session() as sess:
+        self.assertEqual(sess.run(imported_r), 10)
 
   def testTypeMismatchInGraphDef(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
+    if ops._USE_C_API:
+      # TODO(skyewm): improve error message
+      error_msg = ("Input 0 of node import/B was passed int32 from import/A:0 "
+                   "incompatible with expected float.")
+    else:
+      error_msg = ("Cannot convert a tensor of type int32 to an input of type "
+                   "float")
 
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             node { name: 'B' op: 'FloatInput' input: 'A:0' }
             """))
-      self.assertTrue(
-          "Cannot convert a tensor of type int32 to an input of type float" in
-          str(e.exception))
 
   def testShapeWhitelist(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
     # Barrier's shape is an output vector of 2, but the
     # graph says it's a scalar.  This is currently whitelisted.
     with ops.Graph().as_default():
@@ -389,14 +398,14 @@ class ImportGraphDefTest(test.TestCase):
           self._MakeGraphDef("""
           node { name: 'A' op: 'Barrier'
                  attr { key: '_output_shapes'
-                        value { list { shape { } } } } }
+                        value { list { shape { } } } }
+                 attr { key: 'component_types'
+                        value { list { type: DT_FLOAT } } } }
           """),
           return_elements=["A"],
           name="import")
 
   def testShapeWhitelistViolation(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
     # L2 loss produces a scalar shape, but the graph
     # has the wrong shape, so raise an error.
     with ops.Graph().as_default():
@@ -416,45 +425,51 @@ class ImportGraphDefTest(test.TestCase):
             "Shapes () and (43,) are not compatible" in str(e.exception))
 
   def testInvalidSignatureTooManyInputsInGraphDef(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
+    if ops._USE_C_API:
+      # TODO(skyewm): improve error message
+      error_msg = "NodeDef expected inputs '' do not match 1 inputs specified"
+    else:
+      error_msg = r"More inputs specified \('A:0'\) than the op expects"
 
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             node { name: 'B' op: 'None' input: 'A:0' }
             """))
-      self.assertTrue("More inputs specified ('A:0') than the op expects" in
-                      str(e.exception))
 
   def testInvalidSignatureNotEnoughInputsInGraphDef(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
+    if ops._USE_C_API:
+      # TODO(skyewm): improve error message
+      error_msg = ("NodeDef expected inputs 'int32, float' do not match 1 "
+                   "inputs specified")
+    else:
+      error_msg = (r"Input types mismatch \(expected 'int32, float32' but "
+                   r"got 'int32'\)")
 
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             node { name: 'B' op: 'IntInputFloatInput' input: 'A:0' }
             """))
-      self.assertTrue("Input types mismatch (expected 'int32, float32' but "
-                      "got 'int32')" in str(e.exception))
 
   def testMissingInputOpInGraphDef(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
+    if ops._USE_C_API:
+      error_msg = "Node 'B': Unknown input node 'A:0'"
+    else:
+      error_msg = "Input tensor 'A:0' not found"
 
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'B' op: 'FloatInput' input: 'A:0' }
             """))
-      self.assertTrue("Input tensor 'A:0' not found" in str(e.exception))
 
   def testMissingInputOpInGraphDefButAppearsInInputMap(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
     with ops.Graph().as_default():
       feed_a_0 = constant_op.constant(5.0)
       b, = importer.import_graph_def(
@@ -466,111 +481,124 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(b.inputs[0], feed_a_0)
 
   def testMissingInputTensorInGraphDef(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
+    if ops._USE_C_API:
+      error_msg = ("Node 'B': Connecting to invalid output 1 of source node A "
+                   "which has 1 outputs")
+    else:
+      error_msg = "Input tensor 'A:1' not found"
 
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'FloatOutput' }
             node { name: 'B' op: 'FloatInput' input: 'A:1' }
             """))
-      self.assertTrue("Input tensor 'A:1' not found" in str(e.exception))
 
   def testMissingControlInputInGraphDef(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
+    if ops._USE_C_API:
+      error_msg = r"Node 'B': Unknown input node '\^A'"
+    else:
+      error_msg = r"Control input '\^A' not found"
 
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'B' op: 'None' input: '^A' }
             """))
-      self.assertTrue("Control input '^A' not found" in str(e.exception))
 
   def testInvalidTensorNameOutputIndexInGraphDef(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
+    if ops._USE_C_API:
+      error_msg = "Node 'B': Unknown input node 'A:B'"
+    else:
+      error_msg = "Cannot convert 'A:B' to a tensor name."
 
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'B' op: 'None' input: 'A:B' }
             """))
-      self.assertEqual("Cannot convert 'A:B' to a tensor name.",
-                       str(e.exception))
 
   def testInvalidTensorNameInGraphDef(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
+    if ops._USE_C_API:
+      error_msg = "Node 'B': Unknown input node 'A:B:0'"
+    else:
+      error_msg = "Cannot convert 'A:B:0' to a tensor name."
 
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'B' op: 'None' input: 'A:B:0' }
             """))
-      self.assertEqual("Cannot convert 'A:B:0' to a tensor name.",
-                       str(e.exception))
 
   def testMissingReturnOperation(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
+    if ops._USE_C_API:
+      error_msg = "Requested return node 'B' not found in graph def"
+    else:
+      error_msg = "return_element 'B' not found in graph_def."
 
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'None' }
             """),
             return_elements=["B"])
-      self.assertTrue(
-          "return_element 'B' not found in graph_def." in str(e.exception))
 
   def testMissingReturnTensor(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
+    if ops._USE_C_API:
+      error_msg = (r"Invalid return output 1 of node 'A', which has 1 "
+                   r"output\(s\)")
+    else:
+      error_msg = "return_element 'A:1' not found in graph_def."
 
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             """),
             return_elements=["A:1"])
-      self.assertTrue(
-          "return_element 'A:1' not found in graph_def." in str(e.exception))
 
-      with self.assertRaises(ValueError) as e:
+      if ops._USE_C_API:
+        error_msg = "Requested return tensor 'B:0' not found in graph def"
+      else:
+        error_msg = "return_element 'B:0' not found in graph_def."
+
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             """),
             return_elements=["B:0"])
-      self.assertTrue(
-          "return_element 'B:0' not found in graph_def." in str(e.exception))
 
-      with self.assertRaises(ValueError) as e:
+      if ops._USE_C_API:
+        error_msg = "Cannot convert 'A:B:0' to a tensor name."
+      else:
+        error_msg = "return_element 'A:B:0' not found in graph_def."
+
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             """),
             return_elements=["A:B:0"])
-      self.assertTrue(
-          "return_element 'A:B:0' not found in graph_def." in str(e.exception))
 
   def testMissingInputMap(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"Attempted to map inputs that were not found in graph_def: \[B:0\]"):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'None' }
             """),
             input_map={"B:0": constant_op.constant(5.0)})
-      self.assertTrue("not found in graph_def: [B:0]" in str(e.exception))
 
   def testInputMapUnusedAsInput(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
     with ops.Graph().as_default():
       # Mapping an unused node output should succeed.
       importer.import_graph_def(
@@ -580,28 +608,30 @@ class ImportGraphDefTest(test.TestCase):
           input_map={"A:0": constant_op.constant(5.0)})
 
       # Mapping a non-existent output of an existing node should fail.
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"Attempted to map inputs that were not found in graph_def: \[A:2\]"):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             """),
             input_map={"A:2": constant_op.constant(5.0)})
-      self.assertTrue("not found in graph_def: [A:2]" in str(e.exception))
 
   def testInputMapTypeMismatch(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
+    if ops._USE_C_API:
+      error_msg = ("Input 0 of node import/B was passed float from Const:0 "
+                   "incompatible with expected int32.")
+    else:
+      error_msg = ("Cannot convert a tensor of type float32 to an input of "
+                   "type int32.")
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             node { name: 'B' op: 'IntInput' input: 'A:0' }
             """),
             input_map={"A:0": constant_op.constant(5.0)})
-      self.assertTrue(
-          "Cannot convert a tensor of type float32 to an input of type int32."
-          in str(e.exception))
 
   def testNoReturns(self):
     with ops.Graph().as_default() as g:
@@ -651,8 +681,6 @@ class ImportGraphDefTest(test.TestCase):
           b.node_def.attr["_class"])
 
   def testColocationWithDeviceFn(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
     original_graph_def = self._MakeGraphDef("""
           node { name: 'A' op: 'None' attr {
             key: '_class'
@@ -674,23 +702,17 @@ class ImportGraphDefTest(test.TestCase):
 
     with ops.Graph().as_default():
       with ops.device(CustomDeviceFn):
-        b, = importer.import_graph_def(
-            original_graph_def, return_elements=["B"], name="imported_graph")
-
-      self.assertProtoEqualsVersion("""
-          node { name: 'imported_graph/A' op: 'None' device: "/device:A:0"
-                attr {
-                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
-                }
-          }
-          node { name: 'imported_graph/B' op: 'None' device: "/device:A:0"
-                attr {
-                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
-          } }""", b.graph.as_graph_def())
-
-    # Test a scenario where 'A' doesn't get a device; 'A' should
-    # not have a device, but during runtime will get colocated with
-    # 'B' because of the colocation attribute.
+        a, b = importer.import_graph_def(original_graph_def,
+                                         return_elements=["A", "B"],
+                                         name="imported_graph")
+      self.assertEqual(a.device, "/device:A:0")
+      self.assertEqual(b.device, "/device:A:0")
+      self.assertEqual(a.colocation_groups(), [b"loc:@imported_graph/A"])
+      self.assertEqual(b.colocation_groups(), [b"loc:@imported_graph/A"])
+
+    # Test a scenario where 'A' doesn't get a device; 'A' should not have a
+    # device, but during runtime will get colocated with 'B' because of the
+    # colocation attribute. B's device function is still overridden by A.
     def BDeviceFn(op):
       if "B" in op.name:
         return "/device:B:0"
@@ -698,19 +720,13 @@ class ImportGraphDefTest(test.TestCase):
 
     with ops.Graph().as_default():
       with ops.device(BDeviceFn):
-        b, = importer.import_graph_def(
-            original_graph_def, return_elements=["B"], name="imported_graph")
-
-      self.assertProtoEqualsVersion("""
-          node { name: 'imported_graph/A' op: 'None'
-                attr {
-                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
-                }
-          }
-          node { name: 'imported_graph/B' op: 'None'
-                attr {
-                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
-          } }""", b.graph.as_graph_def())
+        a, b = importer.import_graph_def(original_graph_def,
+                                         return_elements=["A", "B"],
+                                         name="imported_graph")
+      self.assertEqual(a.device, "")
+      self.assertEqual(b.device, "")
+      self.assertEqual(a.colocation_groups(), [b"loc:@imported_graph/A"])
+      self.assertEqual(b.colocation_groups(), [b"loc:@imported_graph/A"])
 
     # Only A gets a device, so B inherits it implicitly.
     def ADeviceFn(op):
@@ -720,23 +736,15 @@ class ImportGraphDefTest(test.TestCase):
 
     with ops.Graph().as_default():
       with ops.device(ADeviceFn):
-        b, = importer.import_graph_def(
-            original_graph_def, return_elements=["B"], name="imported_graph")
-
-      self.assertProtoEqualsVersion("""
-          node { name: 'imported_graph/A' op: 'None' device: "/device:A:0"
-                attr {
-                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
-                }
-          }
-          node { name: 'imported_graph/B' op: 'None' device: "/device:A:0"
-                attr {
-                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
-          } }""", b.graph.as_graph_def())
+        a, b = importer.import_graph_def(original_graph_def,
+                                         return_elements=["A", "B"],
+                                         name="imported_graph")
+      self.assertEqual(a.device, "/device:A:0")
+      self.assertEqual(b.device, "/device:A:0")
+      self.assertEqual(a.colocation_groups(), [b"loc:@imported_graph/A"])
+      self.assertEqual(b.colocation_groups(), [b"loc:@imported_graph/A"])
 
   def testMultipleColocationWithDeviceFn(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
     original_graph_def = self._MakeGraphDef("""
           node { name: 'A' op: 'None'}
           node { name: 'B' op: 'None'}
@@ -757,24 +765,18 @@ class ImportGraphDefTest(test.TestCase):
 
     with ops.Graph().as_default():
       with ops.device(CustomDeviceFn):
-        c, = importer.import_graph_def(
-            original_graph_def, return_elements=["C"], name="imported_graph")
-
-      self.assertProtoEqualsVersion("""
-          node { name: 'imported_graph/A' op: 'None' }
-          node { name: 'imported_graph/B' op: 'None' device: "/device:B:0" }
-          node { name: 'imported_graph/C' op: 'None' device: "/device:B:0"
-                 attr {
-                   key: '_class' value {
-                     list { s: 'loc:@imported_graph/A'
-                            s: 'loc:@imported_graph/B' }
-                   }
-                 }
-               }""", c.graph.as_graph_def())
+        a, b, c = importer.import_graph_def(original_graph_def,
+                                            return_elements=["A", "B", "C"],
+                                            name="imported_graph")
+      self.assertEqual(a.device, "")
+      self.assertEqual(b.device, "/device:B:0")
+      self.assertEqual(c.device, "/device:B:0")
+      self.assertEqual(a.colocation_groups(), [b"loc:@imported_graph/A"])
+      self.assertEqual(b.colocation_groups(), [b"loc:@imported_graph/B"])
+      self.assertEqual(c.colocation_groups(),
+                       [b"loc:@imported_graph/A", b"loc:@imported_graph/B"])
 
   def testNamePrefixColocationAttrsMultipleImport(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
     original_graph_def = self._MakeGraphDef("""
           node { name: 'A' op: 'None' }
           node { name: 'B' op: 'None'  attr {
@@ -783,32 +785,33 @@ class ImportGraphDefTest(test.TestCase):
           } }""")
 
     with ops.Graph().as_default():
-      b, = importer.import_graph_def(
-          original_graph_def, return_elements=["B"], name="")
-      _, = importer.import_graph_def(
-          original_graph_def, return_elements=["B"], name="")
-      self.assertProtoEqualsVersion("""
-          node { name: 'A' op: 'None' }
-          node { name: 'B' op: 'None'  attr {
-            key: '_class'
-            value { list { s: 'loc:@A' } }
-          } }
-          node { name: 'A_1' op: 'None' }
-          node { name: 'B_1' op: 'None'  attr {
-            key: '_class'
-            value { list { s: 'loc:@A_1' } }
-          } }""", b.graph.as_graph_def())
+      a, b = importer.import_graph_def(
+          original_graph_def, return_elements=["A", "B"], name="")
+      a_1, b_1 = importer.import_graph_def(
+          original_graph_def, return_elements=["A", "B"], name="")
 
-  def testNamePrefixColocationAttrsNotFound(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
+      self.assertEqual(a.name, "A")
+      self.assertEqual(b.name, "B")
+      self.assertEqual(b.colocation_groups(), [b"loc:@A"])
 
+      self.assertEqual(a_1.name, "A_1")
+      self.assertEqual(b_1.name, "B_1")
+      self.assertEqual(b_1.colocation_groups(), [b"loc:@A_1"])
+
+  def testNamePrefixColocationAttrsNotFound(self):
     original_graph_def = self._MakeGraphDef("""
           node { name: 'B' op: 'None'  attr {
             key: '_class'
             value { list { s: 'loc:@A' } }
           } }""")
+
+    if ops._USE_C_API:
+      error_msg = "Node 'B' expects to be colocated with unknown node 'A'"
+    else:
+      error_msg = "does not exist during import"
+
     with ops.Graph().as_default():
-      with self.assertRaisesRegexp(ValueError, "does not exist during import"):
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             original_graph_def, return_elements=["B"], name="imported_graph")
 
@@ -825,8 +828,6 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual("graph_def must be a GraphDef proto.", str(e.exception))
 
   def testInvalidInputForInputMap(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
     with ops.Graph().as_default():
       with self.assertRaises(TypeError) as e:
         importer.import_graph_def(
@@ -858,10 +859,9 @@ class ImportGraphDefTest(test.TestCase):
 
   def testInvalidInputForReturnOperations(self):
     with ops.Graph().as_default():
-      with self.assertRaises(TypeError) as e:
+      with self.assertRaisesRegexp(
+          TypeError, "return_elements must be a list of strings."):
         importer.import_graph_def(self._MakeGraphDef(""), return_elements=[7])
-      self.assertEqual("return_elements must be a list of strings.",
-                       str(e.exception))
 
       if ops._USE_C_API:
         error_msg = "Cannot convert 'a:b:c' to a tensor name."
@@ -872,17 +872,19 @@ class ImportGraphDefTest(test.TestCase):
                                   return_elements=["a:b:c"])
 
   def testDuplicateOperationNames(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
+    if ops._USE_C_API:
+      error_msg = "Node 'A' is not unique"
+    else:
+      error_msg = "Duplicate name 'A' in GraphDef."
 
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             node { name: 'B' op: 'IntOutput' }
             node { name: 'A' op: 'IntOutput' }
             """))
-      self.assertEqual("Duplicate name 'A' in GraphDef.", str(e.exception))
 
   def testWithExtensionAndAttr(self):
     with ops.Graph().as_default() as g:
@@ -895,8 +897,6 @@ class ImportGraphDefTest(test.TestCase):
       self.assertAllEqual(pack.outputs[0].eval(), [5.0, 5.0])
 
   def testWithDevice(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
     with ops.Graph().as_default() as g:
       # No device.
       a = constant_op.constant(3.0, name="a")
@@ -940,8 +940,6 @@ class ImportGraphDefTest(test.TestCase):
         self.assertEqual(c.device + "/device:GPU:0", c5.device)
 
   def testWithDeviceFunctionDependingOnInputs(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
     with ops.Graph().as_default() as g:
       with ops.device("/job:ps"):
         v1 = constant_op.constant(1.0)
@@ -967,8 +965,6 @@ class ImportGraphDefTest(test.TestCase):
     self.assertEqual(2, len(ops_with_two_inputs))
 
   def testGradient(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
     with ops.Graph().as_default() as g:
       inputs = array_ops.placeholder(
           dtypes.float32, shape=[None, 100], name="input")
@@ -1046,23 +1042,26 @@ class ImportGraphDefTest(test.TestCase):
             sess.run(x)
 
   def testVersionHigh(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
     with ops.Graph().as_default() as g:
       pat = (r"GraphDef min consumer version %d above current version %d "
              r"for TensorFlow \S+\.  Please upgrade TensorFlow\.$" %
              (1 << 30, versions.GRAPH_DEF_VERSION))
-      importer.import_graph_def(self._MakeGraphDef("", min_consumer=1 << 30))
-      x = constant_op.constant(
-          7)  # Need at least one op to get a C++ graph generated
-      with self.test_session(graph=g) as sess:
-        with self.assertRaisesRegexp(Exception, pat):
-          sess.run(x)
+
+      if ops._USE_C_API:
+        with self.assertRaisesRegexp(ValueError, pat):
+          importer.import_graph_def(self._MakeGraphDef("",
+                                                       min_consumer=1 << 30))
+      else:
+        # Python API only throws when graph is run
+        importer.import_graph_def(self._MakeGraphDef("", min_consumer=1 << 30))
+        x = constant_op.constant(
+            7)  # Need at least one op to get a C++ graph generated
+        with self.test_session(graph=g) as sess:
+          with self.assertRaisesRegexp(Exception, pat):
+            sess.run(x)
 
   def testVersionAppliesToOpConstruction(self):
     """These tests rely on shape fns in test_ops.cc."""
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
     with ops.Graph().as_default():
       importer.import_graph_def(
           self._MakeGraphDef(
@@ -1089,8 +1088,6 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(123.0, a[0].get_attr("default_float"))
 
   def testDefaultAttrsRemoved(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
     producer_op_list = op_def_pb2.OpList()
     text_format.Merge("""
       op {
@@ -1107,23 +1104,28 @@ class ImportGraphDefTest(test.TestCase):
           """),
           return_elements=["A"],
           producer_op_list=producer_op_list)
-      with self.assertRaisesRegexp(ValueError, "No attr named 'default_int'"):
+      if ops._USE_C_API:
+        error_msg = "Operation 'import/A' has no attr named 'default_int'."
+      else:
+        error_msg = "No attr named 'default_int'"
+      with self.assertRaisesRegexp(ValueError, error_msg):
         a[0].get_attr("default_int")
 
-    # Attr only in producer_op_list with non-default value is preserved.
-    with ops.Graph().as_default():
-      a = importer.import_graph_def(
-          self._MakeGraphDef("""
-          node { name: 'A' op: 'OpWithFutureDefaultAttr'
-                 attr { key: 'default_int' value { i: 987 } } }
-          """),
-          return_elements=["A"],
-          producer_op_list=producer_op_list)
-      self.assertEqual(987, a[0].get_attr("default_int"))
+    # Unknown attrs cannot be imported using C API. This test will eventually be
+    # deleted.
+    if not ops._USE_C_API:
+      # Attr only in producer_op_list with non-default value is preserved.
+      with ops.Graph().as_default():
+        a = importer.import_graph_def(
+            self._MakeGraphDef("""
+            node { name: 'A' op: 'OpWithFutureDefaultAttr'
+                   attr { key: 'default_int' value { i: 987 } } }
+            """),
+            return_elements=["A"],
+            producer_op_list=producer_op_list)
+        self.assertEqual(987, a[0].get_attr("default_int"))
 
   def testFunctions(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
     dtype = dtypes.float32
     @function.Defun(dtype, dtype, dtype, dtype)
     def Grad(x, y, dout1, dout2):  # pylint: disable=unused-argument
@@ -1201,8 +1203,6 @@ class ImportGraphDefTest(test.TestCase):
         self.assertEqual(sess.run("outer:0"), 21)
 
   def testImportInsideDefun(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
     g = ops.Graph()
     with g.as_default():
       @function.Defun()
@@ -1226,8 +1226,6 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(z_val, -2.0)
 
   def testImportGraphWithFunctionTwice(self):
-    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
-
     g = ops.Graph()
     with g.as_default():
       @function.Defun()
diff --git a/tensorflow/python/framework/load_library.py b/tensorflow/python/framework/load_library.py
index 909e6d4c7be76743211d4c9045706fce62d4910e..1f2aa264c110930b318f30e3a24010a96ebce47e 100644
--- a/tensorflow/python/framework/load_library.py
+++ b/tensorflow/python/framework/load_library.py
@@ -21,15 +21,17 @@ from __future__ import print_function
 import hashlib
 import imp
 import sys
-import threading
+import threading  # pylint: disable=unused-import
 
 from tensorflow.core.framework import op_def_pb2
-from tensorflow.core.lib.core import error_codes_pb2
+from tensorflow.core.lib.core import error_codes_pb2  # pylint: disable=unused-import
 from tensorflow.python import pywrap_tensorflow as py_tf
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('load_op_library')
 def load_op_library(library_filename):
   """Loads a TensorFlow plugin, containing custom ops and kernels.
 
@@ -79,6 +81,7 @@ def load_op_library(library_filename):
   return module
 
 
+@tf_export('load_file_system_library')
 def load_file_system_library(library_filename):
   """Loads a TensorFlow plugin, containing file system implementation.
 
diff --git a/tensorflow/python/framework/meta_graph.py b/tensorflow/python/framework/meta_graph.py
index 44ddc013b2817956b95bf4da068b2fb77f87a07c..8c03a5f19dee31a6609590e46d608af9a686c5fe 100644
--- a/tensorflow/python/framework/meta_graph.py
+++ b/tensorflow/python/framework/meta_graph.py
@@ -31,6 +31,7 @@ from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import op_def_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import saver_pb2
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.framework import graph_io
 from tensorflow.python.framework import importer
@@ -86,6 +87,10 @@ def _node_def(from_node_def, export_scope, unbound_inputs, clear_devices=False):
                compat.as_str(s).split("@")[1].startswith(export_scope)]
       node_def.attr[k].CopyFrom(attr_value_pb2.AttrValue(
           list=attr_value_pb2.AttrValue.ListValue(s=new_s)))
+    elif node_def.op in ("Enter", "RefEnter") and k == "frame_name":
+      if not export_scope or compat.as_str(v.s).startswith(export_scope):
+        new_s = compat.as_bytes(ops.strip_name_scope(v.s, export_scope))
+      node_def.attr[k].CopyFrom(attr_value_pb2.AttrValue(s=new_s))
     else:
       node_def.attr[k].CopyFrom(v)
 
@@ -442,6 +447,67 @@ def add_collection_def(meta_graph_def, key, graph=None,
     return
 
 
+def _is_default_attr_value(op_def, attr_name, attr_value):
+  """Checks if given attribute matches the default value in the op def."""
+  for attr_def in op_def.attr:
+    if attr_def.name == attr_name:
+      if not attr_def.HasField("default_value"):
+        return False
+      # pywrap_tensorflow.EqualAttrValueWrapper returns an empty string
+      # if both arguments represent an equivalent AttrValue instance.
+      return not pywrap_tensorflow.EqualAttrValueWrapper(
+          attr_value.SerializeToString(),
+          attr_def.default_value.SerializeToString())
+  return False
+
+
+def _strip_graph_default_valued_attrs(meta_graph_def):
+  """Strips default valued attributes for node defs in given MetaGraphDef.
+
+  This method also sets `meta_info_def.stripped_default_attrs` in the given
+  `MetaGraphDef` proto to True.
+
+  Args:
+    meta_graph_def: `MetaGraphDef` protocol buffer
+
+  Returns:
+    None.
+  """
+  # Map function op names to their function definitions.
+  op_name_to_function = {}
+  for function_def in meta_graph_def.graph_def.library.function:
+    op_name_to_function[function_def.signature.name] = function_def
+
+  # Get all registered ops.
+  registered_ops = op_def_registry.get_registered_ops()
+
+  def _strip_node_default_valued_attrs(node_def):
+    """Removes default valued attributes from a single node def."""
+    if node_def.op in op_name_to_function or node_def.op not in registered_ops:
+      return
+    op_def = registered_ops[node_def.op]
+
+    attrs_to_strip = set()
+    for attr_name, attr_value in node_def.attr.items():
+      if _is_default_attr_value(op_def, attr_name, attr_value):
+        attrs_to_strip.add(attr_name)
+
+    for attr in attrs_to_strip:
+      del node_def.attr[attr]
+
+  # Process all NodeDef instances in graph_def.
+  for node_def in meta_graph_def.graph_def.node:
+    _strip_node_default_valued_attrs(node_def)
+
+  # Process all NodeDef instances in graph_def.library.function.
+  for function_def in meta_graph_def.graph_def.library.function:
+    for function_node_def in function_def.node_def:
+      _strip_node_default_valued_attrs(function_node_def)
+
+  # Tell consumers of this graph that default valued attrs have been stripped.
+  meta_graph_def.meta_info_def.stripped_default_attrs = True
+
+
 def create_meta_graph_def(meta_info_def=None,
                           graph_def=None,
                           saver_def=None,
@@ -449,7 +515,9 @@ def create_meta_graph_def(meta_info_def=None,
                           graph=None,
                           export_scope=None,
                           exclude_nodes=None,
-                          clear_extraneous_savers=False):
+                          clear_extraneous_savers=False,
+                          strip_default_attrs=False):
+  # pylint: disable=line-too-long
   """Construct and returns a `MetaGraphDef` protocol buffer.
 
   Args:
@@ -464,12 +532,17 @@ def create_meta_graph_def(meta_info_def=None,
     clear_extraneous_savers: Remove any preexisting SaverDefs from the SAVERS
         collection.  Note this method does not alter the graph, so any
         extraneous Save/Restore ops should have been removed already, as needed.
+    strip_default_attrs: Boolean. If `True`, default-valued attributes will be
+        removed from the NodeDefs. For a detailed guide, see
+        [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+
   Returns:
     MetaGraphDef protocol buffer.
 
   Raises:
     TypeError: If the arguments are not of the correct proto buffer type.
   """
+  # pylint: enable=line-too-long
   # Type check.
   if graph and not isinstance(graph, ops.Graph):
     raise TypeError("graph must be of type Graph, not %s", type(graph))
@@ -511,6 +584,10 @@ def create_meta_graph_def(meta_info_def=None,
         stripped_op_list_for_graph(meta_graph_def.graph_def))
   # pylint: enable=g-explicit-length-test
 
+  # Strip default valued attributes in graph_def.
+  if strip_default_attrs:
+    _strip_graph_default_valued_attrs(meta_graph_def)
+
   # Adds saver_def.
   if saver_def:
     meta_graph_def.saver_def.MergeFrom(saver_def)
@@ -655,13 +732,14 @@ def import_scoped_meta_graph(meta_graph_or_file,
     if clear_devices:
       for node in input_graph_def.node:
         node.device = ""
+
+    scope_to_prepend_to_names = graph.unique_name(
+        import_scope or "", mark_as_used=False)
+
     importer.import_graph_def(
         input_graph_def, name=(import_scope or ""), input_map=input_map,
         producer_op_list=producer_op_list)
 
-    scope_to_prepend_to_names = "/".join(
-        [part for part in [graph.get_name_scope(), import_scope] if part])
-
     # Restores all the other collections.
     for key, col_def in sorted(meta_graph_def.collection_def.items()):
       # Don't add unbound_inputs to the new graph.
@@ -724,6 +802,7 @@ def export_scoped_meta_graph(filename=None,
                              clear_devices=False,
                              saver_def=None,
                              clear_extraneous_savers=False,
+                             strip_default_attrs=False,
                              **kwargs):
   """Returns `MetaGraphDef` proto. Optionally writes it to filename.
 
@@ -752,6 +831,8 @@ def export_scoped_meta_graph(filename=None,
     clear_extraneous_savers: Remove any Saver-related information from the
         graph (both Save/Restore ops and SaverDefs) that are not associated
         with the provided SaverDef.
+    strip_default_attrs: Set to true if default valued attributes must be
+        removed while exporting the GraphDef.
     **kwargs: Optional keyed arguments, including meta_info_def and
         collection_list.
 
@@ -773,6 +854,7 @@ def export_scoped_meta_graph(filename=None,
     if graph_def:
       new_graph_def = graph_pb2.GraphDef()
       new_graph_def.versions.CopyFrom(graph_def.versions)
+      new_graph_def.library.CopyFrom(graph_def.library)
 
       if clear_extraneous_savers:
         exclude_nodes = _find_extraneous_saver_nodes(graph_def, saver_def)
@@ -799,7 +881,7 @@ def export_scoped_meta_graph(filename=None,
                                 export_scope,
                                 exclude_nodes):
           value = graph._nodes_by_id[key]
-      # pylint: enable=protected-access
+          # pylint: enable=protected-access
           node_def = _node_def(value.node_def, export_scope, unbound_inputs,
                                clear_devices=clear_devices)
           graph_def.node.extend([node_def])
@@ -810,6 +892,9 @@ def export_scoped_meta_graph(filename=None,
           bytesize += value.node_def.ByteSize()
           if bytesize >= (1 << 31) or bytesize < 0:
             raise ValueError("GraphDef cannot be larger than 2GB.")
+
+      graph._copy_functions_to_graph_def(graph_def, bytesize)  # pylint: disable=protected-access
+
     # It's possible that not all the inputs are in the export_scope.
     # If we would like such information included in the exported meta_graph,
     # add them to a special unbound_inputs collection.
@@ -833,6 +918,7 @@ def export_scoped_meta_graph(filename=None,
       exclude_nodes=exclude_nodes,
       clear_extraneous_savers=clear_extraneous_savers,
       saver_def=saver_def,
+      strip_default_attrs=strip_default_attrs,
       **kwargs)
 
   if filename:
diff --git a/tensorflow/python/framework/meta_graph_test.py b/tensorflow/python/framework/meta_graph_test.py
index 4c22c913b850685bd6e50b03b5fbb09a01441b68..f2f1e83da15eacdbb4f194967b51559d279ae1a4 100644
--- a/tensorflow/python/framework/meta_graph_test.py
+++ b/tensorflow/python/framework/meta_graph_test.py
@@ -24,6 +24,8 @@ import random
 import shutil
 
 from tensorflow.core.framework import graph_pb2
+from tensorflow.core.protobuf import meta_graph_pb2
+from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
@@ -33,6 +35,7 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics
 from tensorflow.python.ops import nn_ops
@@ -58,6 +61,7 @@ def _TestDir(test_name):
 # pylint: enable=invalid-name
 
 
+@test_util.with_c_api
 class SimpleMetaGraphTest(test.TestCase):
 
   def testNoVariables(self):
@@ -102,7 +106,8 @@ class SimpleMetaGraphTest(test.TestCase):
       # Re-exports the current graph state for comparison to the original.
       new_meta_graph_def, _ = meta_graph.export_scoped_meta_graph(filename +
                                                                   "_new")
-      self.assertProtoEquals(meta_graph_def, new_meta_graph_def)
+      test_util.assert_meta_graph_protos_equal(self, meta_graph_def,
+                                               new_meta_graph_def)
 
       # Ensures that we can still get a reference to our graph collections.
       new_input_tensor = ops.get_collection("input_tensor")[0]
@@ -154,7 +159,110 @@ class SimpleMetaGraphTest(test.TestCase):
     op_list = meta_graph.stripped_op_list_for_graph(graph)
     self.assertEqual(["Const"], [op.name for op in op_list.op])
 
+  def testDefaultAttrStripping(self):
+    """Verifies that default attributes are stripped from a graph def."""
 
+    # Complex Op has 2 attributes with defaults:
+    #   o "T"    : float32.
+    #   o "Tout" : complex64.
+
+    # When inputs to the Complex Op are float32 instances, "T" maps to float32
+    # and "Tout" maps to complex64. Since these attr values map to their
+    # defaults, they must be stripped unless stripping of default attrs is
+    # disabled.
+    with self.test_session():
+      real_num = constant_op.constant(1.0, dtype=dtypes.float32, name="real")
+      imag_num = constant_op.constant(2.0, dtype=dtypes.float32, name="imag")
+      math_ops.complex(real_num, imag_num, name="complex")
+
+      # strip_default_attrs is enabled.
+      meta_graph_def, _ = meta_graph.export_scoped_meta_graph(
+          graph_def=ops.get_default_graph().as_graph_def(),
+          strip_default_attrs=True)
+      node_def = test_util.get_node_def_from_graph("complex",
+                                                   meta_graph_def.graph_def)
+      self.assertNotIn("T", node_def.attr)
+      self.assertNotIn("Tout", node_def.attr)
+      self.assertTrue(meta_graph_def.meta_info_def.stripped_default_attrs)
+
+      # strip_default_attrs is disabled.
+      meta_graph_def, _ = meta_graph.export_scoped_meta_graph(
+          graph_def=ops.get_default_graph().as_graph_def(),
+          strip_default_attrs=False)
+      node_def = test_util.get_node_def_from_graph("complex",
+                                                   meta_graph_def.graph_def)
+      self.assertIn("T", node_def.attr)
+      self.assertIn("Tout", node_def.attr)
+      self.assertFalse(meta_graph_def.meta_info_def.stripped_default_attrs)
+
+    # When inputs to the Complex Op are float64 instances, "T" maps to float64
+    # and "Tout" maps to complex128. Since these attr values don't map to their
+    # defaults, they must not be stripped.
+    with self.test_session(graph=ops.Graph()):
+      real_num = constant_op.constant(1.0, dtype=dtypes.float64, name="real")
+      imag_num = constant_op.constant(2.0, dtype=dtypes.float64, name="imag")
+      math_ops.complex(real_num, imag_num, name="complex")
+      meta_graph_def, _ = meta_graph.export_scoped_meta_graph(
+          graph_def=ops.get_default_graph().as_graph_def(),
+          strip_default_attrs=True)
+      node_def = test_util.get_node_def_from_graph("complex",
+                                                   meta_graph_def.graph_def)
+      self.assertEqual(node_def.attr["T"].type, dtypes.float64)
+      self.assertEqual(node_def.attr["Tout"].type, dtypes.complex128)
+      self.assertTrue(meta_graph_def.meta_info_def.stripped_default_attrs)
+
+  def testDefaultAttrStrippingNestedFunctions(self):
+    """Verifies that default attributes are stripped from function node defs."""
+    with self.test_session():
+      @function.Defun(dtypes.float32, dtypes.float32)
+      def f0(i, j):
+        return math_ops.complex(i, j, name="double_nested_complex")
+
+      @function.Defun(dtypes.float32, dtypes.float32)
+      def f1(i, j):
+        return f0(i, j)
+
+      _ = f1(constant_op.constant(1.0), constant_op.constant(2.0))
+      meta_graph_def, _ = meta_graph.export_scoped_meta_graph(
+          graph_def=ops.get_default_graph().as_graph_def(),
+          strip_default_attrs=True)
+
+      double_nested_complex_node_def = None
+      for function_def in meta_graph_def.graph_def.library.function:
+        for node_def in function_def.node_def:
+          if node_def.name.startswith("double_nested_complex"):
+            double_nested_complex_node_def = node_def
+            break
+        if double_nested_complex_node_def:
+          break
+
+      self.assertIsNotNone(double_nested_complex_node_def)
+      self.assertNotIn("T", double_nested_complex_node_def.attr)
+      self.assertNotIn("Tout", double_nested_complex_node_def.attr)
+      self.assertTrue(meta_graph_def.meta_info_def.stripped_default_attrs)
+
+  def testDefaultAttrStrippingUnregisteredOps(self):
+    """Verifies that nodes with un-registered ops are not stripped."""
+    graph_def = graph_pb2.GraphDef()
+    node = graph_def.node.add()
+    node.name = "node_with_unreg_op"
+    node.op = "unreg_op"
+    node.attr["attr_1"].i = 1
+
+    meta_info_def = meta_graph_pb2.MetaGraphDef.MetaInfoDef()
+    meta_info_def.stripped_op_list.op.add()
+
+    with self.test_session():
+      meta_graph_def = meta_graph.create_meta_graph_def(
+          meta_info_def=meta_info_def, graph_def=graph_def,
+          strip_default_attrs=True)
+      node_def = test_util.get_node_def_from_graph("node_with_unreg_op",
+                                                   meta_graph_def.graph_def)
+      self.assertEqual(node_def.attr["attr_1"].i, 1)
+      self.assertTrue(meta_graph_def.meta_info_def.stripped_default_attrs)
+
+
+@test_util.with_c_api
 class ScopedMetaGraphTest(test.TestCase):
 
   def _testScopedExport(self, test_dir, exported_filenames):
@@ -332,12 +440,65 @@ class ScopedMetaGraphTest(test.TestCase):
     ]
     orig_meta_graphs = self._testScopedExport(test_dir, filenames)
     new_meta_graphs = self._testScopedImport(test_dir, filenames)
-    # Delete the unbound_inputs to allow directly calling ProtoEqual.
-    del orig_meta_graphs[0].collection_def["unbound_inputs"]
-    del new_meta_graphs[0].collection_def["unbound_inputs"]
     for a, b in zip(orig_meta_graphs, new_meta_graphs):
+      # The unbound input strings are slightly different with the C API enabled
+      # ("images" vs "images:0") due to the original import_graph_def code
+      # vs. ImportGraphDef in C++.
+      # TODO(skyewm): update the pbtxts once _USE_C_API is removed.
+      del a.collection_def["unbound_inputs"]
+      del b.collection_def["unbound_inputs"]
       test_util.assert_meta_graph_protos_equal(self, a, b)
 
+  def testWhileLoopGradients(self):
+    # Create a simple while loop.
+    with ops.Graph().as_default():
+      with ops.name_scope("export"):
+        var = variables.Variable(0)
+        var_name = var.name
+        _, output = control_flow_ops.while_loop(lambda i, x: i < 5,
+                                                lambda i, x: (i + 1, x + i),
+                                                [0, var])
+        output_name = output.name
+
+      # Generate a MetaGraphDef containing the while loop with an export scope.
+      meta_graph_def, _ = meta_graph.export_scoped_meta_graph(
+          export_scope="export")
+
+      # Build and run the gradients of the while loop. We use this below to
+      # verify that the gradients are correct with the imported MetaGraphDef.
+      init_op = variables.global_variables_initializer()
+      grad = gradients_impl.gradients([output], [var])
+      with session.Session() as sess:
+        sess.run(init_op)
+        expected_grad_value = sess.run(grad)
+
+    # Restore the MetaGraphDef into a new Graph with an import scope.
+    with ops.Graph().as_default():
+      meta_graph.import_scoped_meta_graph(meta_graph_def, import_scope="import")
+
+      # Re-export and make sure we get the same MetaGraphDef.
+      new_meta_graph_def, _ = meta_graph.export_scoped_meta_graph(
+          export_scope="import")
+      test_util.assert_meta_graph_protos_equal(
+          self, meta_graph_def, new_meta_graph_def)
+
+      # Make sure we can still build gradients and get the same result.
+
+      def new_name(tensor_name):
+        base_tensor_name = tensor_name.replace("export/", "")
+        return "import/" + base_tensor_name
+
+      var = ops.get_default_graph().get_tensor_by_name(new_name(var_name))
+      output = ops.get_default_graph().get_tensor_by_name(new_name(output_name))
+      grad = gradients_impl.gradients([output], [var])
+
+      init_op = variables.global_variables_initializer()
+
+      with session.Session() as sess:
+        sess.run(init_op)
+        actual_grad_value = sess.run(grad)
+        self.assertEqual(expected_grad_value, actual_grad_value)
+
   def testScopedImportUnderNameScope(self):
     graph = ops.Graph()
     with graph.as_default():
@@ -353,6 +514,19 @@ class ScopedMetaGraphTest(test.TestCase):
         self.assertEqual(list(imported_variables.values())[0].name,
                          "foo/bar/myvar:0")
 
+  def testImportsUsingSameScopeName(self):
+    with ops.Graph().as_default():
+      variables.Variable(0, name="v")
+      meta_graph_def, _ = meta_graph.export_scoped_meta_graph()
+    with ops.Graph().as_default():
+      for suffix in ["", "_1"]:
+        imported_variables = meta_graph.import_scoped_meta_graph(
+            meta_graph_def, import_scope="s")
+        self.assertEqual(len(imported_variables), 1)
+        self.assertEqual(list(imported_variables.keys())[0], "v:0")
+        self.assertEqual(list(imported_variables.values())[0].name,
+                         "s" + suffix + "/v:0")
+
   def testScopedImportWithSelectedCollections(self):
     meta_graph_filename = os.path.join(
         _TestDir("selected_collections_import"), "meta_graph.pb")
@@ -456,7 +630,8 @@ class ScopedMetaGraphTest(test.TestCase):
                                                       "exported_queue1.pbtxt")
     new_meta_graph = self._testScopedImportWithQueue(
         test_dir, "exported_queue1.pbtxt", "exported_new_queue1.pbtxt")
-    self.assertProtoEquals(orig_meta_graph, new_meta_graph)
+    test_util.assert_meta_graph_protos_equal(self, orig_meta_graph,
+                                             new_meta_graph)
 
   # Verifies that we can export a subgraph in a nested name scope containing a
   # "hidden1/hidden2" and import it into "new_hidden1/new_hidden2" in a new
@@ -602,6 +777,7 @@ class ScopedMetaGraphTest(test.TestCase):
     self.assertEqual("", str(graph2.as_graph_element("matmul").device))
 
 
+@test_util.with_c_api
 class MetaGraphWithVariableScopeTest(test.TestCase):
 
   def testMetricsCollection(self):
@@ -659,6 +835,7 @@ class MetaGraphWithVariableScopeTest(test.TestCase):
         initializer = variables.local_variables_initializer()
 
 
+@test_util.with_c_api
 class ExportImportAcrossScopesTest(test.TestCase):
 
   def testPartionedVariables(self):
@@ -729,7 +906,7 @@ class ExportImportAcrossScopesTest(test.TestCase):
             if shared_name_value.s:
               node.attr[shared_name_attr].s = b""
 
-    self.assertProtoEquals(expected, result)
+    test_util.assert_meta_graph_protos_equal(self, expected, result)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/framework/op_def_library_test.py b/tensorflow/python/framework/op_def_library_test.py
index 715e863b787b41f81a0f3a8ac9e4f6b48f349e2a..84ca062ade3b32c37212ba2d5b7eb9c64fb1dfa5 100644
--- a/tensorflow/python/framework/op_def_library_test.py
+++ b/tensorflow/python/framework/op_def_library_test.py
@@ -26,8 +26,8 @@ from tensorflow.core.framework import tensor_shape_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_ops
 from tensorflow.python.framework import test_util
-from tensorflow.python.framework.op_def_library import OpDefLibrary
 from tensorflow.python.platform import googletest
 
 
@@ -36,75 +36,13 @@ def _unknown_shape(op):
   return [tensor_shape.unknown_shape() for _ in op.outputs]
 
 
-# NOTE(mrry): Dummy shape registrations for ops used in the tests, since they
-# don't have C++ op registrations on which to attach C++ shape fns.
-ops.RegisterShape("Attr")(_unknown_shape)
-ops.RegisterShape("AttrBool")(_unknown_shape)
-ops.RegisterShape("AttrBoolList")(_unknown_shape)
-ops.RegisterShape("AttrDefault")(_unknown_shape)
-ops.RegisterShape("AttrEmptyListDefault")(_unknown_shape)
-ops.RegisterShape("AttrEnum")(_unknown_shape)
-ops.RegisterShape("AttrEnumList")(_unknown_shape)
-ops.RegisterShape("AttrFloat")(_unknown_shape)
-ops.RegisterShape("AttrListDefault")(_unknown_shape)
-ops.RegisterShape("AttrListMin")(_unknown_shape)
-ops.RegisterShape("AttrMin")(_unknown_shape)
-ops.RegisterShape("AttrShape")(_unknown_shape)
-ops.RegisterShape("AttrShapeList")(_unknown_shape)
-ops.RegisterShape("AttrPartialShape")(_unknown_shape)
-ops.RegisterShape("AttrPartialShapeList")(_unknown_shape)
-ops.RegisterShape("AttrTypeDefault")(_unknown_shape)
-ops.RegisterShape("AttrListTypeDefault")(_unknown_shape)
-ops.RegisterShape("Binary")(_unknown_shape)
-ops.RegisterShape("ComplexStruct")(_unknown_shape)
-ops.RegisterShape("InPolymorphicTwice")(_unknown_shape)
-ops.RegisterShape("MixedStruct")(_unknown_shape)
-ops.RegisterShape("NInPolymorphicTwice")(_unknown_shape)
-ops.RegisterShape("NInTwice")(_unknown_shape)
-ops.RegisterShape("NInTwoTypeVariables")(_unknown_shape)
-ops.RegisterShape("NIntsIn")(_unknown_shape)
-ops.RegisterShape("NIntsOut")(_unknown_shape)
-ops.RegisterShape("NIntsOutDefault")(_unknown_shape)
-ops.RegisterShape("NPolymorphicIn")(_unknown_shape)
-ops.RegisterShape("NPolymorphicOut")(_unknown_shape)
-ops.RegisterShape("NPolymorphicOutDefault")(_unknown_shape)
-ops.RegisterShape("NPolymorphicRestrictIn")(_unknown_shape)
-ops.RegisterShape("NPolymorphicRestrictOut")(_unknown_shape)
-ops.RegisterShape("OutT")(_unknown_shape)
-ops.RegisterShape("OutTypeList")(_unknown_shape)
-ops.RegisterShape("OutTypeListRestrict")(_unknown_shape)
-ops.RegisterShape("Polymorphic")(_unknown_shape)
-ops.RegisterShape("PolymorphicDefaultOut")(_unknown_shape)
-ops.RegisterShape("PolymorphicOut")(_unknown_shape)
-ops.RegisterShape("RefIn")(_unknown_shape)
-ops.RegisterShape("RefOut")(_unknown_shape)
-ops.RegisterShape("ReservedAttr")(_unknown_shape)
-ops.RegisterShape("ReservedInput")(_unknown_shape)
-ops.RegisterShape("Restrict")(_unknown_shape)
-ops.RegisterShape("Simple")(_unknown_shape)
-ops.RegisterShape("SimpleStruct")(_unknown_shape)
-ops.RegisterShape("TwoRefsIn")(_unknown_shape)
-ops.RegisterShape("TypeList")(_unknown_shape)
-ops.RegisterShape("TypeListRestrict")(_unknown_shape)
-ops.RegisterShape("TypeListTwice")(_unknown_shape)
-
-
+@test_util.with_c_api
 class OpDefLibraryTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
-    self._lib = OpDefLibrary()
-    self._g = ops.Graph()
-    self._default_graph_controller = self._g.as_default()
-    self._default_graph_controller.__enter__()
-    self._add_op("name: 'Simple' input_arg { name: 'a' type: DT_INT32 } "
-                 "output_arg { name: 'out' type: DT_FLOAT }")
-    self._add_op("name: 'OutT' output_arg { name: 'a' type_attr: 'T' } "
-                 "attr { name: 'T' type: 'type' }")
-
-  def tearDown(self):
-    self._default_graph_controller.__exit__(None, None, None)
-
-  def _add_op(self, ascii):
+    self._lib = test_ops._op_def_lib
+
+  def _add_op(self, ascii):  # pylint: disable=redefined-builtin
     op_def = op_def_pb2.OpDef()
     text_format.Merge(ascii, op_def)
     self._lib.add_op(op_def)
@@ -177,1376 +115,1228 @@ class OpDefLibraryTest(test_util.TensorFlowTestCase):
                      "Arg 'a' of 'NoTypes' must have one type field not 0")
 
   def testSimple(self):
-    out = self._lib.apply_op("Simple", a=3)
-    self.assertEqual(dtypes.float32, out.dtype)
-    self.assertProtoEquals("""
-      name: 'Simple' op: 'Simple' input: 'Simple/a'
-      """, out.op.node_def)
-
-    out = self._lib.apply_op("Simple", a=4)
-    self.assertProtoEquals("""
-      name: 'Simple_1' op: 'Simple' input: 'Simple_1/a'
-      """, out.op.node_def)
-
-    out = self._lib.apply_op("Simple", a=5, name="named")
-    self.assertProtoEquals("""
-      name: 'named' op: 'Simple' input: 'named/a'
-      """, out.op.node_def)
-
-    out = self._lib.apply_op("Simple", a=[[1, 2, 3], [4, 5, 6]], name="two_d")
-    self.assertProtoEquals("""
-      name: 'two_d' op: 'Simple' input: 'two_d/a'
-      """, out.op.node_def)
+    with ops.Graph().as_default():
+      out = self._lib.apply_op("Simple", a=3)
+      self.assertEqual(dtypes.float32, out.dtype)
+      self.assertProtoEquals("""
+        name: 'Simple' op: 'Simple' input: 'Simple/a'
+        """, out.op.node_def)
+
+      out = self._lib.apply_op("Simple", a=4)
+      self.assertProtoEquals("""
+        name: 'Simple_1' op: 'Simple' input: 'Simple_1/a'
+        """, out.op.node_def)
+
+      out = self._lib.apply_op("Simple", a=5, name="named")
+      self.assertProtoEquals("""
+        name: 'named' op: 'Simple' input: 'named/a'
+        """, out.op.node_def)
+
+      out = self._lib.apply_op("Simple", a=[[1, 2, 3], [4, 5, 6]], name="two_d")
+      self.assertProtoEquals("""
+        name: 'two_d' op: 'Simple' input: 'two_d/a'
+        """, out.op.node_def)
 
   def testSimpleFailures(self):
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("Simple", a="Bad string")
-    self.assertEqual(str(cm.exception),
-                     "Expected int32 passed to parameter 'a' of op 'Simple', "
-                     "got 'Bad string' of type 'str' instead.")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("Simple", a=self.Tensor(dtypes.string))
-    self.assertEqual(str(cm.exception),
-                     "Input 'a' of 'Simple' Op has type string "
-                     "that does not match expected type of int32.")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("Simple", a=6, extra="bogus")
-    self.assertEqual(str(cm.exception),
-                     "apply_op() got unexpected keyword arguments: extra")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("Simple", a=6, extra1="bogus", extra2="also_bogus")
-    self.assertEqual(str(cm.exception),
-                     "apply_op() got unexpected keyword arguments: extra1, "
-                     "extra2")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("Simple")
-    self.assertEqual(str(cm.exception), "No argument for input a")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("Simple", wrong=7)
-    self.assertEqual(str(cm.exception), "No argument for input a")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("Simple", a={"label": 1})
-    self.assertEqual(str(cm.exception),
-                     "Expected int32 passed to parameter 'a' of op 'Simple', "
-                     "got {'label': 1} of type 'dict' instead.")
+    with ops.Graph().as_default():
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("Simple", a="Bad string")
+      self.assertEqual(str(cm.exception),
+                       "Expected int32 passed to parameter 'a' of op 'Simple', "
+                       "got 'Bad string' of type 'str' instead.")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("Simple", a=self.Tensor(dtypes.string))
+      self.assertEqual(str(cm.exception),
+                       "Input 'a' of 'Simple' Op has type string "
+                       "that does not match expected type of int32.")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("Simple", a=6, extra="bogus")
+      self.assertEqual(str(cm.exception),
+                       "apply_op() got unexpected keyword arguments: extra")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("Simple", a=6, extra1="bogus", extra2="also_bogus")
+      self.assertEqual(str(cm.exception),
+                       "apply_op() got unexpected keyword arguments: extra1, "
+                       "extra2")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("Simple")
+      self.assertEqual(str(cm.exception), "No argument for input a")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("Simple", wrong=7)
+      self.assertEqual(str(cm.exception), "No argument for input a")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("Simple", a={"label": 1})
+      self.assertEqual(str(cm.exception),
+                       "Expected int32 passed to parameter 'a' of op 'Simple', "
+                       "got {'label': 1} of type 'dict' instead.")
 
   def testReservedInput(self):
-    self._add_op("name: 'ReservedInput' "
-                 "input_arg { name: 'input' type: DT_INT32 } ")
-    op = self._lib.apply_op("ReservedInput", input_=7, name="x")
-    self.assertProtoEquals("""
-      name: 'x' op: 'ReservedInput' input: 'x/input'
-      """, op.node_def)
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("ReservedInput", input_=7, name="x")
+      self.assertProtoEquals("""
+        name: 'x' op: 'ReservedInput' input: 'x/input'
+        """, op.node_def)
 
   def testPolymorphic(self):
-    self._add_op("name: 'Polymorphic' "
-                 "input_arg { name: 'a' type_attr: 'T' } "
-                 "output_arg { name: 'out' type_attr: 'T' } "
-                 "attr { name: 'T' type: 'type' }")
-
-    out = self._lib.apply_op("Polymorphic", a=7, name="p")
-    self.assertEqual(dtypes.int32, out.dtype)
-    self.assertProtoEquals("""
-      name: 'p' op: 'Polymorphic' input: 'p/a'
-      attr { key: 'T' value { type: DT_INT32 } }
-      """, out.op.node_def)
-
-    out = self._lib.apply_op("Polymorphic", a="s", name="q")
-    self.assertEqual(dtypes.string, out.dtype)
-    self.assertProtoEquals("""
-      name: 'q' op: 'Polymorphic' input: 'q/a'
-      attr { key: 'T' value { type: DT_STRING } }
-      """, out.op.node_def)
-
-    out = self._lib.apply_op("Polymorphic", a=["s", "t", "u"], name="r")
-    self.assertEqual(dtypes.string, out.dtype)
-    self.assertProtoEquals("""
-      name: 'r' op: 'Polymorphic' input: 'r/a'
-      attr { key: 'T' value { type: DT_STRING } }
-      """, out.op.node_def)
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("Polymorphic", a="s", T=dtypes.string)
-    self.assertEqual(str(cm.exception),
-                     "Should not specify value for inferred attr 'T'.")
+    with ops.Graph().as_default():
+      out = self._lib.apply_op("Polymorphic", a=7, name="p")
+      self.assertEqual(dtypes.int32, out.dtype)
+      self.assertProtoEquals("""
+        name: 'p' op: 'Polymorphic' input: 'p/a'
+        attr { key: 'T' value { type: DT_INT32 } }
+        """, out.op.node_def)
+
+      out = self._lib.apply_op("Polymorphic", a="s", name="q")
+      self.assertEqual(dtypes.string, out.dtype)
+      self.assertProtoEquals("""
+        name: 'q' op: 'Polymorphic' input: 'q/a'
+        attr { key: 'T' value { type: DT_STRING } }
+        """, out.op.node_def)
+
+      out = self._lib.apply_op("Polymorphic", a=["s", "t", "u"], name="r")
+      self.assertEqual(dtypes.string, out.dtype)
+      self.assertProtoEquals("""
+        name: 'r' op: 'Polymorphic' input: 'r/a'
+        attr { key: 'T' value { type: DT_STRING } }
+        """, out.op.node_def)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("Polymorphic", a="s", T=dtypes.string)
+      self.assertEqual(str(cm.exception),
+                       "Should not specify value for inferred attr 'T'.")
 
   def testPolymorphicOut(self):
-    self._add_op("name: 'PolymorphicOut' "
-                 "output_arg { name: 'out' type_attr: 'T' } "
-                 "attr { name: 'T' type: 'type' }")
-
-    out = self._lib.apply_op("PolymorphicOut", T=dtypes.int32, name="p")
-    self.assertEqual(dtypes.int32, out.dtype)
-    self.assertProtoEquals("""
-      name: 'p' op: 'PolymorphicOut'
-      attr { key: 'T' value { type: DT_INT32 } }
-      """, out.op.node_def)
-
-    out = self._lib.apply_op("PolymorphicOut", T=dtypes.bool, name="q")
-    self.assertEqual(dtypes.bool, out.dtype)
-    self.assertProtoEquals("""
-      name: 'q' op: 'PolymorphicOut'
-      attr { key: 'T' value { type: DT_BOOL } }
-      """, out.op.node_def)
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("PolymorphicOut")
-    self.assertEqual(str(cm.exception),
-                     "No argument for attr T")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("PolymorphicOut", T=None)
-    self.assertEqual(str(cm.exception),
-                     "Expected DataType for argument 'T' not None.")
+    with ops.Graph().as_default():
+      out = self._lib.apply_op("PolymorphicOut", T=dtypes.int32, name="p")
+      self.assertEqual(dtypes.int32, out.dtype)
+      self.assertProtoEquals("""
+        name: 'p' op: 'PolymorphicOut'
+        attr { key: 'T' value { type: DT_INT32 } }
+        """, out.op.node_def)
+
+      out = self._lib.apply_op("PolymorphicOut", T=dtypes.bool, name="q")
+      self.assertEqual(dtypes.bool, out.dtype)
+      self.assertProtoEquals("""
+        name: 'q' op: 'PolymorphicOut'
+        attr { key: 'T' value { type: DT_BOOL } }
+        """, out.op.node_def)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("PolymorphicOut")
+      self.assertEqual(str(cm.exception),
+                       "No argument for attr T")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("PolymorphicOut", T=None)
+      self.assertEqual(str(cm.exception),
+                       "Expected DataType for argument 'T' not None.")
 
   def testPolymorphicDefaultOut(self):
-    self._add_op("name: 'PolymorphicDefaultOut' "
-                 "output_arg { name: 'out' type_attr: 'T' } "
-                 "attr { name: 'T' type: 'type' "
-                 "  default_value { type: DT_STRING } }")
-
-    out = self._lib.apply_op("PolymorphicDefaultOut", T=None, name="p")
-    self.assertEqual(dtypes.string, out.dtype)
-    self.assertProtoEquals("""
-      name: 'p' op: 'PolymorphicDefaultOut'
-      attr { key: 'T' value { type: DT_STRING } }
-      """, out.op.node_def)
-
-    out = self._lib.apply_op("PolymorphicDefaultOut", T=dtypes.bool, name="q")
-    self.assertEqual(dtypes.bool, out.dtype)
-    self.assertProtoEquals("""
-      name: 'q' op: 'PolymorphicDefaultOut'
-      attr { key: 'T' value { type: DT_BOOL } }
-      """, out.op.node_def)
+    with ops.Graph().as_default():
+      out = self._lib.apply_op("PolymorphicDefaultOut", T=None, name="p")
+      self.assertEqual(dtypes.string, out.dtype)
+      self.assertProtoEquals("""
+        name: 'p' op: 'PolymorphicDefaultOut'
+        attr { key: 'T' value { type: DT_STRING } }
+        """, out.op.node_def)
+
+      out = self._lib.apply_op("PolymorphicDefaultOut", T=dtypes.bool, name="q")
+      self.assertEqual(dtypes.bool, out.dtype)
+      self.assertProtoEquals("""
+        name: 'q' op: 'PolymorphicDefaultOut'
+        attr { key: 'T' value { type: DT_BOOL } }
+        """, out.op.node_def)
 
   def testBinary(self):
-    self._add_op("name: 'Binary' "
-                 "input_arg { name: 'a' type_attr: 'T' } "
-                 "input_arg { name: 'b' type_attr: 'T' } "
-                 "output_arg { name: 'out' type_attr: 'T' } "
-                 "attr { name: 'T' type: 'type' }")
-
-    out = self._lib.apply_op("Binary", a=8, b=9, name="b")
-    self.assertEqual(dtypes.int32, out.dtype)
-    self.assertProtoEquals("""
-      name: 'b' op: 'Binary' input: 'b/a' input: 'b/b'
-      attr { key: 'T' value { type: DT_INT32 } }
-      """, out.op.node_def)
-
-    out = self._lib.apply_op("Binary", a="left", b="right", name="c")
-    self.assertEqual(dtypes.string, out.dtype)
-    self.assertProtoEquals("""
-      name: 'c' op: 'Binary' input: 'c/a' input: 'c/b'
-      attr { key: 'T' value { type: DT_STRING } }
-      """, out.op.node_def)
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("Binary", a="left", b=12)
-    self.assertEqual(str(cm.exception),
-                     "Expected string passed to parameter 'b' of op 'Binary', "
-                     "got 12 of type 'int' instead.")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("Binary",
-                         a=self.Tensor(dtypes.string),
-                         b=self.Tensor(dtypes.int32))
-    self.assertEqual(str(cm.exception),
-                     "Input 'b' of 'Binary' Op has type int32 "
-                     "that does not match type string of argument 'a'.")
+    with ops.Graph().as_default():
+      out = self._lib.apply_op("Binary", a=8, b=9, name="b")
+      self.assertEqual(dtypes.int32, out.dtype)
+      self.assertProtoEquals("""
+        name: 'b' op: 'Binary' input: 'b/a' input: 'b/b'
+        attr { key: 'T' value { type: DT_INT32 } }
+        """, out.op.node_def)
+
+      out = self._lib.apply_op("Binary", a="left", b="right", name="c")
+      self.assertEqual(dtypes.string, out.dtype)
+      self.assertProtoEquals("""
+        name: 'c' op: 'Binary' input: 'c/a' input: 'c/b'
+        attr { key: 'T' value { type: DT_STRING } }
+        """, out.op.node_def)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("Binary", a="left", b=12)
+      self.assertEqual(str(cm.exception),
+                       "Expected string passed to parameter 'b' of op 'Binary',"
+                       " got 12 of type 'int' instead.")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("Binary",
+                           a=self.Tensor(dtypes.string),
+                           b=self.Tensor(dtypes.int32))
+      self.assertEqual(str(cm.exception),
+                       "Input 'b' of 'Binary' Op has type int32 "
+                       "that does not match type string of argument 'a'.")
 
   def testRestrict(self):
-    self._add_op("name: 'Restrict' "
-                 "input_arg { name: 'a' type_attr: 'T' } "
-                 "output_arg { name: 'out' type_attr: 'T' } "
-                 "attr { name: 'T' type: 'type' allowed_values { list { "
-                 "  type: DT_STRING type: DT_BOOL } } }")
-
-    out = self._lib.apply_op("Restrict", a="foo", name="g")
-    self.assertEqual(dtypes.string, out.dtype)
-    self.assertProtoEquals("""
-      name: 'g' op: 'Restrict' input: 'g/a'
-      attr { key: 'T' value { type: DT_STRING } }
-      """, out.op.node_def)
-
-    out = self._lib.apply_op("Restrict", a=True, name="h")
-    self.assertEqual(dtypes.bool, out.dtype)
-    self.assertProtoEquals("""
-      name: 'h' op: 'Restrict' input: 'h/a'
-      attr { key: 'T' value { type: DT_BOOL } }
-      """, out.op.node_def)
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("Restrict", a=17)
-    self.assertEqual(str(cm.exception),
-                     "Value passed to parameter 'a' has DataType int32 "
-                     "not in list of allowed values: string, bool")
+    with ops.Graph().as_default():
+      out = self._lib.apply_op("Restrict", a="foo", name="g")
+      self.assertEqual(dtypes.string, out.dtype)
+      self.assertProtoEquals("""
+        name: 'g' op: 'Restrict' input: 'g/a'
+        attr { key: 'T' value { type: DT_STRING } }
+        """, out.op.node_def)
+
+      out = self._lib.apply_op("Restrict", a=True, name="h")
+      self.assertEqual(dtypes.bool, out.dtype)
+      self.assertProtoEquals("""
+        name: 'h' op: 'Restrict' input: 'h/a'
+        attr { key: 'T' value { type: DT_BOOL } }
+        """, out.op.node_def)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("Restrict", a=17)
+      self.assertEqual(str(cm.exception),
+                       "Value passed to parameter 'a' has DataType int32 "
+                       "not in list of allowed values: string, bool")
 
   def testTypeList(self):
-    self._add_op("name: 'TypeList' "
-                 "input_arg { name: 'a' type_list_attr: 'T' } "
-                 "attr { name: 'T' type: 'list(type)' }")
-
-    op = self._lib.apply_op("TypeList", a=["foo"], name="z")
-    self.assertProtoEquals("""
-      name: 'z' op: 'TypeList' input: 'z/a_0'
-      attr { key: 'T' value { list { type: DT_STRING } } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("TypeList", a=[True, 12], name="y")
-    self.assertProtoEquals("""
-      name: 'y' op: 'TypeList' input: 'y/a_0' input: 'y/a_1'
-      attr { key: 'T' value { list { type: DT_BOOL type: DT_INT32 } } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("TypeList", a=[], name="empty")
-    self.assertProtoEquals("""
-      name: 'empty' op: 'TypeList' attr { key: 'T' value { list { } } }
-      """, op.node_def)
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("TypeList", a=17)
-    self.assertStartsWith(str(cm.exception),
-                          "Expected list for 'a' "
-                          "argument to 'TypeList' Op, not ")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("TypeList", a=[self.Tensor(dtypes.int32), None])
-    self.assertStartsWith(str(cm.exception),
-                          "Tensors in list passed to 'a' of 'TypeList' Op "
-                          "have types [int32, <NOT CONVERTIBLE TO TENSOR>]")
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("TypeList", a=["foo"], name="z")
+      self.assertProtoEquals("""
+        name: 'z' op: 'TypeList' input: 'z/a_0'
+        attr { key: 'T' value { list { type: DT_STRING } } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("TypeList", a=[True, 12], name="y")
+      self.assertProtoEquals("""
+        name: 'y' op: 'TypeList' input: 'y/a_0' input: 'y/a_1'
+        attr { key: 'T' value { list { type: DT_BOOL type: DT_INT32 } } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("TypeList", a=[], name="empty")
+      self.assertProtoEquals("""
+        name: 'empty' op: 'TypeList' attr { key: 'T' value { list { } } }
+        """, op.node_def)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("TypeList", a=17)
+      self.assertStartsWith(str(cm.exception),
+                            "Expected list for 'a' "
+                            "argument to 'TypeList' Op, not ")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("TypeList", a=[self.Tensor(dtypes.int32), None])
+      self.assertStartsWith(str(cm.exception),
+                            "Tensors in list passed to 'a' of 'TypeList' Op "
+                            "have types [int32, <NOT CONVERTIBLE TO TENSOR>]")
 
   def testTypeListTwice(self):
-    self._add_op("name: 'TypeListTwice' "
-                 "input_arg { name: 'a' type_list_attr: 'T' } "
-                 "input_arg { name: 'b' type_list_attr: 'T' } "
-                 "attr { name: 'T' type: 'list(type)' }")
-
-    op = self._lib.apply_op("TypeListTwice",
-                            a=["foo", True],
-                            b=["bar", False],
-                            name="z")
-    self.assertProtoEquals("""
-      name: 'z' op: 'TypeListTwice'
-      input: 'z/a_0' input: 'z/a_1' input: 'z/b_0' input: 'z/b_1'
-      attr { key: 'T' value { list { type: DT_STRING type: DT_BOOL } } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("TypeListTwice", a=[], b=[], name="empty")
-    self.assertProtoEquals("""
-      name: 'empty' op: 'TypeListTwice' attr { key: 'T' value { list { } } }
-      """, op.node_def)
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("TypeListTwice", a=["foo", True], b=["bar", 6])
-    self.assertEqual(str(cm.exception),
-                     "Input 'b' of 'TypeListTwice' Op has type list of "
-                     "string, int32 that does not match type list "
-                     "string, bool of argument 'a'.")
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("TypeListTwice",
+                              a=["foo", True],
+                              b=["bar", False],
+                              name="z")
+      self.assertProtoEquals("""
+        name: 'z' op: 'TypeListTwice'
+        input: 'z/a_0' input: 'z/a_1' input: 'z/b_0' input: 'z/b_1'
+        attr { key: 'T' value { list { type: DT_STRING type: DT_BOOL } } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("TypeListTwice", a=[], b=[], name="empty")
+      self.assertProtoEquals("""
+        name: 'empty' op: 'TypeListTwice' attr { key: 'T' value { list { } } }
+        """, op.node_def)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("TypeListTwice", a=["foo", True], b=["bar", 6])
+      self.assertEqual(str(cm.exception),
+                       "Input 'b' of 'TypeListTwice' Op has type list of "
+                       "string, int32 that does not match type list "
+                       "string, bool of argument 'a'.")
 
   def testOutTypeList(self):
-    self._add_op("name: 'OutTypeList' "
-                 "output_arg { name: 'out' type_list_attr: 'T' } "
-                 "attr { name: 'T' type: 'list(type)' }")
-
-    out, = self._lib.apply_op("OutTypeList", T=[dtypes.float32], name="x")
-    self.assertEqual(dtypes.float32, out.dtype)
-    self.assertProtoEquals("""
-      name: 'x' op: 'OutTypeList'
-      attr { key: 'T' value { list { type: DT_FLOAT } } }
-      """, out.op.node_def)
-
-    out1, out2 = self._lib.apply_op("OutTypeList",
-                                    T=[dtypes.int32, dtypes.bool],
-                                    name="w")
-    self.assertEqual(dtypes.int32, out1.dtype)
-    self.assertEqual(dtypes.bool, out2.dtype)
-    self.assertProtoEquals("""
-      name: 'w' op: 'OutTypeList'
-      attr { key: 'T' value { list { type: DT_INT32 type: DT_BOOL } } }
-      """, out1.op.node_def)
-
-    out = self._lib.apply_op("OutTypeList", T=[], name="empty")
-    self.assertEqual([], out)
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("OutTypeList", T=dtypes.int32)
-    self.assertEqual(str(cm.exception), "Expected list for attr T")
+    with ops.Graph().as_default():
+      out, = self._lib.apply_op("OutTypeList", T=[dtypes.float32], name="x")
+      self.assertEqual(dtypes.float32, out.dtype)
+      self.assertProtoEquals("""
+        name: 'x' op: 'OutTypeList'
+        attr { key: 'T' value { list { type: DT_FLOAT } } }
+        """, out.op.node_def)
+
+      out1, out2 = self._lib.apply_op("OutTypeList",
+                                      T=[dtypes.int32, dtypes.bool],
+                                      name="w")
+      self.assertEqual(dtypes.int32, out1.dtype)
+      self.assertEqual(dtypes.bool, out2.dtype)
+      self.assertProtoEquals("""
+        name: 'w' op: 'OutTypeList'
+        attr { key: 'T' value { list { type: DT_INT32 type: DT_BOOL } } }
+        """, out1.op.node_def)
+
+      out = self._lib.apply_op("OutTypeList", T=[], name="empty")
+      self.assertEqual([], out)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("OutTypeList", T=dtypes.int32)
+      self.assertEqual(str(cm.exception), "Expected list for attr T")
 
   def testTypeListRestrict(self):
-    self._add_op("name: 'TypeListRestrict' "
-                 "input_arg { name: 'a' type_list_attr: 'T' } "
-                 "attr { name: 'T' type: 'list(type)' allowed_values { list { "
-                 "  type: DT_STRING type: DT_BOOL } } }")
-
-    op = self._lib.apply_op("TypeListRestrict", a=["foo", False], name="v")
-    self.assertProtoEquals("""
-      name: 'v' op: 'TypeListRestrict' input: 'v/a_0' input: 'v/a_1'
-      attr { key: 'T' value { list { type: DT_STRING type: DT_BOOL } } }
-      """, op.node_def)
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("TypeListRestrict", a=[True, 12])
-    self.assertEqual(str(cm.exception),
-                     "Value passed to parameter 'a' has DataType int32 "
-                     "not in list of allowed values: string, bool")
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("TypeListRestrict", a=["foo", False], name="v")
+      self.assertProtoEquals("""
+        name: 'v' op: 'TypeListRestrict' input: 'v/a_0' input: 'v/a_1'
+        attr { key: 'T' value { list { type: DT_STRING type: DT_BOOL } } }
+        """, op.node_def)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("TypeListRestrict", a=[True, 12])
+      self.assertEqual(str(cm.exception),
+                       "Value passed to parameter 'a' has DataType int32 "
+                       "not in list of allowed values: string, bool")
 
   def testOutTypeListRestrict(self):
-    self._add_op("name: 'OutTypeListRestrict' "
-                 "output_arg { name: 'out' type_list_attr: 't' } "
-                 "attr { name: 't' type: 'list(type)' allowed_values { list { "
-                 "  type: DT_STRING type: DT_BOOL } } }")
-
-    out1, out2 = self._lib.apply_op("OutTypeListRestrict",
-                                    t=[dtypes.bool, dtypes.string],
-                                    name="u")
-    self.assertEqual(dtypes.bool, out1.dtype)
-    self.assertEqual(dtypes.string, out2.dtype)
-    self.assertProtoEquals("""
-      name: 'u' op: 'OutTypeListRestrict'
-      attr { key: 't' value { list { type: DT_BOOL type: DT_STRING } } }
-      """, out1.op.node_def)
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("OutTypeListRestrict", t=[dtypes.string, dtypes.int32])
-    self.assertEqual(str(cm.exception),
-                     "Value passed to parameter 't' has DataType int32 "
-                     "not in list of allowed values: string, bool")
+    with ops.Graph().as_default():
+      out1, out2 = self._lib.apply_op("OutTypeListRestrict",
+                                      t=[dtypes.bool, dtypes.string],
+                                      name="u")
+      self.assertEqual(dtypes.bool, out1.dtype)
+      self.assertEqual(dtypes.string, out2.dtype)
+      self.assertProtoEquals("""
+        name: 'u' op: 'OutTypeListRestrict'
+        attr { key: 't' value { list { type: DT_BOOL type: DT_STRING } } }
+        """, out1.op.node_def)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("OutTypeListRestrict",
+                           t=[dtypes.string, dtypes.int32])
+      self.assertEqual(str(cm.exception),
+                       "Value passed to parameter 't' has DataType int32 "
+                       "not in list of allowed values: string, bool")
 
   def testAttr(self):
-    self._add_op("name: 'Attr' attr { name: 'a' type: 'int' }")
-    op = self._lib.apply_op("Attr", a=12, name="t")
-    self.assertProtoEquals("""
-      name: 't' op: 'Attr' attr { key: 'a' value { i: 12 } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("Attr", a=tensor_shape.Dimension(13), name="u")
-    self.assertProtoEquals("""
-      name: 'u' op: 'Attr' attr { key: 'a' value { i: 13 } }
-      """, op.node_def)
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("Attr", a="bad")
-    self.assertEqual(str(cm.exception),
-                     "Expected int for argument 'a' not 'bad'.")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("Attr", a=[12])
-    self.assertEqual(str(cm.exception),
-                     "Expected int for argument 'a' not [12].")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("Attr", a=None)
-    self.assertEqual(str(cm.exception),
-                     "Expected int for argument 'a' not None.")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("Attr")
-    self.assertEqual(str(cm.exception), "No argument for attr a")
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("Attr", a=12, name="t")
+      self.assertProtoEquals("""
+        name: 't' op: 'Attr' attr { key: 'a' value { i: 12 } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("Attr", a=tensor_shape.Dimension(13), name="u")
+      self.assertProtoEquals("""
+        name: 'u' op: 'Attr' attr { key: 'a' value { i: 13 } }
+        """, op.node_def)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("Attr", a="bad")
+      self.assertEqual(str(cm.exception),
+                       "Expected int for argument 'a' not 'bad'.")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("Attr", a=[12])
+      self.assertEqual(str(cm.exception),
+                       "Expected int for argument 'a' not [12].")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("Attr", a=None)
+      self.assertEqual(str(cm.exception),
+                       "Expected int for argument 'a' not None.")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("Attr")
+      self.assertEqual(str(cm.exception), "No argument for attr a")
 
   def testAttrFloat(self):
-    self._add_op("name: 'AttrFloat' attr { name: 'a' type: 'float' }")
-
-    op = self._lib.apply_op("AttrFloat", a=1.2, name="t")
-    self.assertProtoEquals("""
-      name: 't' op: 'AttrFloat' attr { key: 'a' value { f: 1.2 } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("AttrFloat", a=12, name="u")
-    self.assertProtoEquals("""
-      name: 'u' op: 'AttrFloat' attr { key: 'a' value { f: 12 } }
-      """, op.node_def)
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("AttrFloat", a="bad")
-    self.assertEqual(str(cm.exception),
-                     "Expected float for argument 'a' not 'bad'.")
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("AttrFloat", a=1.2, name="t")
+      self.assertProtoEquals("""
+        name: 't' op: 'AttrFloat' attr { key: 'a' value { f: 1.2 } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("AttrFloat", a=12, name="u")
+      self.assertProtoEquals("""
+        name: 'u' op: 'AttrFloat' attr { key: 'a' value { f: 12 } }
+        """, op.node_def)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("AttrFloat", a="bad")
+      self.assertEqual(str(cm.exception),
+                       "Expected float for argument 'a' not 'bad'.")
 
   def testAttrBool(self):
-    self._add_op("name: 'AttrBool' attr { name: 'a' type: 'bool' }")
-
-    op = self._lib.apply_op("AttrBool", a=True, name="t")
-    self.assertProtoEquals("""
-      name: 't' op: 'AttrBool' attr { key: 'a' value { b: true } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("AttrBool", a=False, name="u")
-    self.assertProtoEquals("""
-      name: 'u' op: 'AttrBool' attr { key: 'a' value { b: false } }
-      """, op.node_def)
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("AttrBool", a=0)
-    self.assertEqual(str(cm.exception),
-                     "Expected bool for argument 'a' not 0.")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("AttrBool", a=1)
-    self.assertEqual(str(cm.exception),
-                     "Expected bool for argument 'a' not 1.")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("AttrBool", a=[])
-    self.assertEqual(str(cm.exception),
-                     "Expected bool for argument 'a' not [].")
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("AttrBool", a=True, name="t")
+      self.assertProtoEquals("""
+        name: 't' op: 'AttrBool' attr { key: 'a' value { b: true } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("AttrBool", a=False, name="u")
+      self.assertProtoEquals("""
+        name: 'u' op: 'AttrBool' attr { key: 'a' value { b: false } }
+        """, op.node_def)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("AttrBool", a=0)
+      self.assertEqual(str(cm.exception),
+                       "Expected bool for argument 'a' not 0.")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("AttrBool", a=1)
+      self.assertEqual(str(cm.exception),
+                       "Expected bool for argument 'a' not 1.")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("AttrBool", a=[])
+      self.assertEqual(str(cm.exception),
+                       "Expected bool for argument 'a' not [].")
 
   def testAttrBoolList(self):
-    self._add_op("name: 'AttrBoolList' attr { name: 'a' type: 'list(bool)' }")
-
-    op = self._lib.apply_op("AttrBoolList", a=[True, False, True], name="t")
-    self.assertProtoEquals("""
-      name: 't' op: 'AttrBoolList'
-      attr { key: 'a' value { list { b: true b: false b:true } } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("AttrBoolList", a=[], name="u")
-    self.assertProtoEquals("""
-      name: 'u' op: 'AttrBoolList' attr { key: 'a' value { list { } } }
-      """, op.node_def)
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("AttrBoolList", a=[0])
-    self.assertEqual(str(cm.exception),
-                     "Expected bool for argument 'a' not 0.")
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("AttrBoolList", a=[True, False, True], name="t")
+      self.assertProtoEquals("""
+        name: 't' op: 'AttrBoolList'
+        attr { key: 'a' value { list { b: true b: false b:true } } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("AttrBoolList", a=[], name="u")
+      self.assertProtoEquals("""
+        name: 'u' op: 'AttrBoolList' attr { key: 'a' value { list { } } }
+        """, op.node_def)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("AttrBoolList", a=[0])
+      self.assertEqual(str(cm.exception),
+                       "Expected bool for argument 'a' not 0.")
 
   def testAttrMin(self):
-    self._add_op("name: 'AttrMin' attr { name: 'a' type: 'int' "
-                 "has_minimum: true minimum: 5 }")
-    op = self._lib.apply_op("AttrMin", a=12, name="s")
-    self.assertProtoEquals("""
-      name: 's' op: 'AttrMin' attr { key: 'a' value { i: 12 } }
-      """, op.node_def)
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("AttrMin", a=12, name="s")
+      self.assertProtoEquals("""
+        name: 's' op: 'AttrMin' attr { key: 'a' value { i: 12 } }
+        """, op.node_def)
 
-    with self.assertRaises(ValueError) as cm:
-      self._lib.apply_op("AttrMin", a=2)
-    self.assertEqual(str(cm.exception),
-                     "Attr 'a' of 'AttrMin' Op passed 2 less than minimum 5.")
+      with self.assertRaises(ValueError) as cm:
+        self._lib.apply_op("AttrMin", a=2)
+      self.assertEqual(str(cm.exception),
+                       "Attr 'a' of 'AttrMin' Op passed 2 less than minimum 5.")
 
   def testAttrListMin(self):
-    self._add_op("name: 'AttrListMin' attr { name: 'a' type: 'list(int)' "
-                 "has_minimum: true minimum: 2 }")
-
-    op = self._lib.apply_op("AttrListMin", a=[1, 2], name="r")
-    self.assertProtoEquals("""
-      name: 'r' op: 'AttrListMin'
-      attr { key: 'a' value { list { i: 1 i: 2 } } }
-      """, op.node_def)
-
-    with self.assertRaises(ValueError) as cm:
-      self._lib.apply_op("AttrListMin", a=[17])
-    self.assertEqual(str(cm.exception),
-                     "Attr 'a' of 'AttrListMin' Op "
-                     "passed list of length 1 less than minimum 2.")
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("AttrListMin", a=[1, 2], name="r")
+      self.assertProtoEquals("""
+        name: 'r' op: 'AttrListMin'
+        attr { key: 'a' value { list { i: 1 i: 2 } } }
+        """, op.node_def)
+
+      with self.assertRaises(ValueError) as cm:
+        self._lib.apply_op("AttrListMin", a=[17])
+      self.assertEqual(str(cm.exception),
+                       "Attr 'a' of 'AttrListMin' Op "
+                       "passed list of length 1 less than minimum 2.")
 
   def testAttrEnum(self):
-    self._add_op("name: 'AttrEnum' "
-                 "attr { name: 'a' type: 'string' "
-                 "  allowed_values { list { s: 'apples' s: 'oranges' } } }")
-
-    op = self._lib.apply_op("AttrEnum", a="oranges", name="e")
-    self.assertProtoEquals("""
-      name: 'e' op: 'AttrEnum' attr { key: 'a' value { s: 'oranges' } }
-      """, op.node_def)
-
-    with self.assertRaises(ValueError) as cm:
-      self._lib.apply_op("AttrEnum", a="invalid")
-    self.assertEqual(str(cm.exception),
-                     'Attr \'a\' of \'AttrEnum\' Op '
-                     'passed string \'invalid\' not in: '
-                     '"apples", "oranges".')
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("AttrEnum", a="oranges", name="e")
+      self.assertProtoEquals("""
+        name: 'e' op: 'AttrEnum' attr { key: 'a' value { s: 'oranges' } }
+        """, op.node_def)
+
+      with self.assertRaises(ValueError) as cm:
+        self._lib.apply_op("AttrEnum", a="invalid")
+      self.assertEqual(str(cm.exception),
+                       'Attr \'a\' of \'AttrEnum\' Op '
+                       'passed string \'invalid\' not in: '
+                       '"apples", "oranges".')
 
   def testAttrEnumList(self):
-    self._add_op("name: 'AttrEnumList' "
-                 "attr { name: 'a' type: 'list(string)' "
-                 "  allowed_values { list { s: 'apples' s: 'oranges' } } }")
-
-    op = self._lib.apply_op("AttrEnumList", a=["oranges", "apples"], name="f")
-    self.assertProtoEquals("""
-      name: 'f' op: 'AttrEnumList'
-      attr { key: 'a' value { list { s: 'oranges' s: 'apples' } } }
-      """, op.node_def)
-
-    with self.assertRaises(ValueError) as cm:
-      self._lib.apply_op("AttrEnumList", a=["apples", "invalid", "oranges"])
-    self.assertEqual(str(cm.exception),
-                     'Attr \'a\' of \'AttrEnumList\' Op '
-                     'passed string \'invalid\' not '
-                     'in: "apples", "oranges".')
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("AttrEnumList", a=["oranges", "apples"], name="f")
+      self.assertProtoEquals("""
+        name: 'f' op: 'AttrEnumList'
+        attr { key: 'a' value { list { s: 'oranges' s: 'apples' } } }
+        """, op.node_def)
+
+      with self.assertRaises(ValueError) as cm:
+        self._lib.apply_op("AttrEnumList", a=["apples", "invalid", "oranges"])
+      self.assertEqual(str(cm.exception),
+                       'Attr \'a\' of \'AttrEnumList\' Op '
+                       'passed string \'invalid\' not '
+                       'in: "apples", "oranges".')
 
   def testAttrShape(self):
-    self._add_op("name: 'AttrShape' attr { name: 'a' type: 'shape' }")
-
-    op = self._lib.apply_op("AttrShape", a=[5], name="s1")
-    self.assertProtoEquals("""
-      name: 's1' op: 'AttrShape'
-      attr { key: 'a' value { shape { dim { size: 5 } } } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("AttrShape", a=(4, 3, 2), name="s2")
-    self.assertProtoEquals("""
-      name: 's2' op: 'AttrShape'
-      attr { key: 'a' value {
-        shape { dim { size: 4 } dim { size: 3 } dim { size: 2 } } } }
-      """, op.node_def)
-
-    op = self._lib.apply_op(
-        "AttrShape", a=tensor_shape.TensorShape([3, 2]), name="s3")
-    self.assertProtoEquals("""
-      name: 's3' op: 'AttrShape'
-      attr { key: 'a' value {
-        shape { dim { size: 3 } dim { size: 2 } } } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("AttrShape", a=[], name="s4")
-    self.assertProtoEquals("""
-      name: 's4' op: 'AttrShape' attr { key: 'a' value { shape { } } }
-      """, op.node_def)
-
-    shape = tensor_shape_pb2.TensorShapeProto()
-    shape.dim.add().size = 6
-    shape.dim.add().size = 3
-    op = self._lib.apply_op("AttrShape", a=shape, name="s5")
-    self.assertProtoEquals("""
-      name: 's5' op: 'AttrShape'
-      attr { key: 'a' value { shape { dim { size: 6 } dim { size: 3 } } } }
-      """, op.node_def)
-
-    # TODO(josh11b): Re-enable this test once we stop promoting scalars to shapes.
-    # with self.assertRaises(TypeError) as cm:
-    #   self._lib.apply_op("AttrShape", a=5)
-    # self.assertEqual(str(cm.exception),
-    #                  "Don't know how to convert 5 to a TensorShapeProto for "
-    #                  "argument 'a'")
-
-    with self.assertRaises(TypeError):
-      self._lib.apply_op("AttrShape", a="ABC")
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("AttrShape", a=[5], name="s1")
+      self.assertProtoEquals("""
+        name: 's1' op: 'AttrShape'
+        attr { key: 'a' value { shape { dim { size: 5 } } } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("AttrShape", a=(4, 3, 2), name="s2")
+      self.assertProtoEquals("""
+        name: 's2' op: 'AttrShape'
+        attr { key: 'a' value {
+          shape { dim { size: 4 } dim { size: 3 } dim { size: 2 } } } }
+        """, op.node_def)
+
+      op = self._lib.apply_op(
+          "AttrShape", a=tensor_shape.TensorShape([3, 2]), name="s3")
+      self.assertProtoEquals("""
+        name: 's3' op: 'AttrShape'
+        attr { key: 'a' value {
+          shape { dim { size: 3 } dim { size: 2 } } } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("AttrShape", a=[], name="s4")
+      self.assertProtoEquals("""
+        name: 's4' op: 'AttrShape' attr { key: 'a' value { shape { } } }
+        """, op.node_def)
+
+      shape = tensor_shape_pb2.TensorShapeProto()
+      shape.dim.add().size = 6
+      shape.dim.add().size = 3
+      op = self._lib.apply_op("AttrShape", a=shape, name="s5")
+      self.assertProtoEquals("""
+        name: 's5' op: 'AttrShape'
+        attr { key: 'a' value { shape { dim { size: 6 } dim { size: 3 } } } }
+        """, op.node_def)
+
+      # TODO(josh11b): Re-enable this test once we stop promoting scalars to
+      # shapes.
+      # with self.assertRaises(TypeError) as cm:
+      #   self._lib.apply_op("AttrShape", a=5)
+      # self.assertEqual(str(cm.exception),
+      #                  "Don't know how to convert 5 to a TensorShapeProto for"
+      #                  " argument 'a'")
+
+      with self.assertRaises(TypeError):
+        self._lib.apply_op("AttrShape", a="ABC")
 
   def testAttrShapeList(self):
-    self._add_op("name: 'AttrShapeList' attr { name: 'a' type: 'list(shape)' }")
-
-    op = self._lib.apply_op("AttrShapeList", a=[[3, 2], [6, 5, 4]], name="sl")
-    self.assertProtoEquals("""
-      name: 'sl' op: 'AttrShapeList'
-      attr { key: 'a' value { list {
-        shape { dim { size: 3 } dim { size: 2 } }
-        shape { dim { size: 6 } dim { size: 5 } dim { size: 4 } } } } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("AttrShapeList", a=[], name="esl")
-    self.assertProtoEquals("""
-      name: 'esl' op: 'AttrShapeList' attr { key: 'a' value { list { } } }
-      """, op.node_def)
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("AttrShapeList", a=[[3, 2], [6, 5, 4]], name="sl")
+      self.assertProtoEquals("""
+        name: 'sl' op: 'AttrShapeList'
+        attr { key: 'a' value { list {
+          shape { dim { size: 3 } dim { size: 2 } }
+          shape { dim { size: 6 } dim { size: 5 } dim { size: 4 } } } } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("AttrShapeList", a=[], name="esl")
+      self.assertProtoEquals("""
+        name: 'esl' op: 'AttrShapeList' attr { key: 'a' value { list { } } }
+        """, op.node_def)
 
   def testAttrPartialShape(self):
-    self._add_op(
-        "name: 'AttrPartialShape' attr { name: 'a' type: 'shape' }")
-
-    op = self._lib.apply_op("AttrPartialShape", a=[5], name="s1")
-    self.assertProtoEquals("""
-      name: 's1' op: 'AttrPartialShape'
-      attr { key: 'a' value { shape { dim { size: 5 } } } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("AttrPartialShape", a=(4, None, 2), name="s2")
-    self.assertProtoEquals("""
-      name: 's2' op: 'AttrPartialShape'
-      attr { key: 'a' value {
-        shape { dim { size: 4 } dim { size: -1 } dim { size: 2 } } } }
-      """, op.node_def)
-
-    op = self._lib.apply_op(
-        "AttrPartialShape", a=tensor_shape.TensorShape([3, None]), name="s3")
-    self.assertProtoEquals("""
-      name: 's3' op: 'AttrPartialShape'
-      attr { key: 'a' value {
-        shape { dim { size: 3 } dim { size: -1 } } } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("AttrPartialShape", a=[], name="s4")
-    self.assertProtoEquals("""
-      name: 's4' op: 'AttrPartialShape'
-      attr { key: 'a' value { shape { } } }
-      """, op.node_def)
-
-    shape = tensor_shape_pb2.TensorShapeProto()
-    shape.dim.add().size = -1
-    shape.dim.add().size = 3
-    op = self._lib.apply_op("AttrPartialShape", a=shape, name="s5")
-    self.assertProtoEquals("""
-      name: 's5' op: 'AttrPartialShape'
-      attr { key: 'a' value {
-        shape { dim { size: -1 } dim { size: 3 } } } }
-      """, op.node_def)
-
-    # TODO(ebrevdo): Re-enable once we stop promoting scalars to shapes.
-    # with self.assertRaises(TypeError) as cm:
-    #   self._lib.apply_op("AttrPartialShape", a=5)
-    # self.assertEqual(str(cm.exception),
-    #                  "Don't know how to convert 5 to a TensorShapeProto for "
-    #                  "argument 'a'")
-
-    with self.assertRaises(TypeError):
-      self._lib.apply_op("AttrPartialShape", a="ABC")
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("AttrPartialShape", a=[5], name="s1")
+      self.assertProtoEquals("""
+        name: 's1' op: 'AttrPartialShape'
+        attr { key: 'a' value { shape { dim { size: 5 } } } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("AttrPartialShape", a=(4, None, 2), name="s2")
+      self.assertProtoEquals("""
+        name: 's2' op: 'AttrPartialShape'
+        attr { key: 'a' value {
+          shape { dim { size: 4 } dim { size: -1 } dim { size: 2 } } } }
+        """, op.node_def)
+
+      op = self._lib.apply_op(
+          "AttrPartialShape", a=tensor_shape.TensorShape([3, None]), name="s3")
+      self.assertProtoEquals("""
+        name: 's3' op: 'AttrPartialShape'
+        attr { key: 'a' value {
+          shape { dim { size: 3 } dim { size: -1 } } } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("AttrPartialShape", a=[], name="s4")
+      self.assertProtoEquals("""
+        name: 's4' op: 'AttrPartialShape'
+        attr { key: 'a' value { shape { } } }
+        """, op.node_def)
+
+      shape = tensor_shape_pb2.TensorShapeProto()
+      shape.dim.add().size = -1
+      shape.dim.add().size = 3
+      op = self._lib.apply_op("AttrPartialShape", a=shape, name="s5")
+      self.assertProtoEquals("""
+        name: 's5' op: 'AttrPartialShape'
+        attr { key: 'a' value {
+          shape { dim { size: -1 } dim { size: 3 } } } }
+        """, op.node_def)
+
+      # TODO(ebrevdo): Re-enable once we stop promoting scalars to shapes.
+      # with self.assertRaises(TypeError) as cm:
+      #   self._lib.apply_op("AttrPartialShape", a=5)
+      # self.assertEqual(str(cm.exception),
+      #                  "Don't know how to convert 5 to a TensorShapeProto for"
+      #                  " argument 'a'")
+
+      with self.assertRaises(TypeError):
+        self._lib.apply_op("AttrPartialShape", a="ABC")
 
   def testAttrPartialShapeList(self):
-    self._add_op("""
-      name: 'AttrPartialShapeList'
-      attr { name: 'a' type: 'list(shape)' }
-    """)
-
-    op = self._lib.apply_op(
-        "AttrPartialShapeList", a=[[3, 2], [6, None, 4]], name="sl")
-    self.assertProtoEquals("""
-      name: 'sl' op: 'AttrPartialShapeList'
-      attr { key: 'a' value { list {
-        shape { dim { size: 3 } dim { size: 2 } }
-        shape { dim { size: 6 } dim { size: -1 } dim { size: 4 } } } } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("AttrPartialShapeList", a=[], name="esl")
-    self.assertProtoEquals("""
-      name: 'esl' op: 'AttrPartialShapeList' attr {
-        key: 'a' value { list { } } }
-      """, op.node_def)
+    with ops.Graph().as_default():
+      op = self._lib.apply_op(
+          "AttrPartialShapeList", a=[[3, 2], [6, None, 4]], name="sl")
+      self.assertProtoEquals("""
+        name: 'sl' op: 'AttrPartialShapeList'
+        attr { key: 'a' value { list {
+          shape { dim { size: 3 } dim { size: 2 } }
+          shape { dim { size: 6 } dim { size: -1 } dim { size: 4 } } } } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("AttrPartialShapeList", a=[], name="esl")
+      self.assertProtoEquals("""
+        name: 'esl' op: 'AttrPartialShapeList' attr {
+          key: 'a' value { list { } } }
+        """, op.node_def)
 
   def testAttrDefault(self):
-    self._add_op("name: 'AttrDefault' "
-                 "attr { name: 'a' type: 'string' "
-                 "  default_value { s: 'banana' } }")
-
-    op = self._lib.apply_op("AttrDefault", a=None, name="d")
-    self.assertProtoEquals("""
-      name: 'd' op: 'AttrDefault' attr { key: 'a' value { s: 'banana' } }
-      """, op.node_def)
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("AttrDefault", a=None, name="d")
+      self.assertProtoEquals("""
+        name: 'd' op: 'AttrDefault' attr { key: 'a' value { s: 'banana' } }
+        """, op.node_def)
 
-    op = self._lib.apply_op("AttrDefault", a="kiwi", name="c")
-    self.assertProtoEquals("""
-      name: 'c' op: 'AttrDefault' attr { key: 'a' value { s: 'kiwi' } }
-      """, op.node_def)
+      op = self._lib.apply_op("AttrDefault", a="kiwi", name="c")
+      self.assertProtoEquals("""
+        name: 'c' op: 'AttrDefault' attr { key: 'a' value { s: 'kiwi' } }
+        """, op.node_def)
 
   def testAttrListDefault(self):
-    self._add_op("name: 'AttrListDefault' "
-                 "attr { name: 'a' type: 'list(int)' "
-                 "  default_value { list { i: 5 i: 15 } } }")
-
-    op = self._lib.apply_op("AttrListDefault", a=None, name="b")
-    self.assertProtoEquals("""
-      name: 'b' op: 'AttrListDefault'
-      attr { key: 'a' value { list { i: 5 i: 15 } } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("AttrListDefault", a=[3], name="a")
-    self.assertProtoEquals("""
-      name: 'a' op: 'AttrListDefault'
-      attr { key: 'a' value { list { i: 3 } } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("AttrListDefault", a=[], name="empty")
-    self.assertProtoEquals("""
-      name: 'empty' op: 'AttrListDefault'
-      attr { key: 'a' value { list { } } }
-      """, op.node_def)
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("AttrListDefault", a=None, name="b")
+      self.assertProtoEquals("""
+        name: 'b' op: 'AttrListDefault'
+        attr { key: 'a' value { list { i: 5 i: 15 } } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("AttrListDefault", a=[3], name="a")
+      self.assertProtoEquals("""
+        name: 'a' op: 'AttrListDefault'
+        attr { key: 'a' value { list { i: 3 } } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("AttrListDefault", a=[], name="empty")
+      self.assertProtoEquals("""
+        name: 'empty' op: 'AttrListDefault'
+        attr { key: 'a' value { list { } } }
+        """, op.node_def)
 
   def testAttrEmptyListDefault(self):
-    self._add_op("name: 'AttrEmptyListDefault' "
-                 "attr { name: 'a' type: 'list(float)' "
-                 "       default_value { list { } } }")
-
-    op = self._lib.apply_op("AttrEmptyListDefault", a=None, name="b")
-    self.assertProtoEquals("""
-      name: 'b' op: 'AttrEmptyListDefault'
-      attr { key: 'a' value { list { } } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("AttrEmptyListDefault", a=[3], name="a")
-    self.assertProtoEquals("""
-      name: 'a' op: 'AttrEmptyListDefault'
-      attr { key: 'a' value { list { f: 3 } } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("AttrEmptyListDefault", a=[], name="empty")
-    self.assertProtoEquals("""
-      name: 'empty' op: 'AttrEmptyListDefault'
-      attr { key: 'a' value { list { } } }
-      """, op.node_def)
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("AttrEmptyListDefault", a=None, name="b")
+      self.assertProtoEquals("""
+        name: 'b' op: 'AttrEmptyListDefault'
+        attr { key: 'a' value { list { } } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("AttrEmptyListDefault", a=[3], name="a")
+      self.assertProtoEquals("""
+        name: 'a' op: 'AttrEmptyListDefault'
+        attr { key: 'a' value { list { f: 3 } } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("AttrEmptyListDefault", a=[], name="empty")
+      self.assertProtoEquals("""
+        name: 'empty' op: 'AttrEmptyListDefault'
+        attr { key: 'a' value { list { } } }
+        """, op.node_def)
 
   def testReservedAttr(self):
-    self._add_op("name: 'ReservedAttr' "
-                 "attr { name: 'range' type: 'int' } ")
-    op = self._lib.apply_op("ReservedAttr", range_=7, name="x")
-    self.assertProtoEquals("""
-      name: 'x' op: 'ReservedAttr' attr { key: 'range' value { i: 7 } }
-      """, op.node_def)
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("ReservedAttr", range_=7, name="x")
+      self.assertProtoEquals("""
+        name: 'x' op: 'ReservedAttr' attr { key: 'range' value { i: 7 } }
+        """, op.node_def)
 
   def testDefaultAttrType(self):
-    self._add_op("name: 'AttrTypeDefault' "
-                 "input_arg { name: 'a' type_attr: 'T' } "
-                 "attr { name: 'T' type: 'type' "
-                 "       default_value { type: DT_INT32 } }")
-
-    # Give an input whose type has no obvious output type.
-    op = self._lib.apply_op("AttrTypeDefault", a=[], name="n")
-    self.assertProtoEquals("""
-      name: 'n' op: 'AttrTypeDefault' input: 'n/a'
-      attr { key: 'T' value { type: DT_INT32 } }
-      """, op.node_def)
-
-    # Give an input whose type can be inferred as different
-    # than the default.
-    op = self._lib.apply_op("AttrTypeDefault", a=[1.0], name="f")
-    self.assertProtoEquals("""
-      name: 'f' op: 'AttrTypeDefault' input: 'f/a'
-      attr { key: 'T' value { type: DT_FLOAT } }
-      """, op.node_def)
+    with ops.Graph().as_default():
+      # Give an input whose type has no obvious output type.
+      op = self._lib.apply_op("AttrTypeDefault", a=[], name="n")
+      self.assertProtoEquals("""
+        name: 'n' op: 'AttrTypeDefault' input: 'n/a'
+        attr { key: 'T' value { type: DT_INT32 } }
+        """, op.node_def)
+
+      # Give an input whose type can be inferred as different
+      # than the default.
+      op = self._lib.apply_op("AttrTypeDefault", a=[1.0], name="f")
+      self.assertProtoEquals("""
+        name: 'f' op: 'AttrTypeDefault' input: 'f/a'
+        attr { key: 'T' value { type: DT_FLOAT } }
+        """, op.node_def)
 
   def testDefaultListAttrType(self):
-    self._add_op("name: 'AttrListTypeDefault' "
-                 "input_arg { name: 'a' type_attr: 'T' number_attr: 'N' } "
-                 "input_arg { name: 'b' type_attr: 'T' number_attr: 'N' } "
-                 "attr { name: 'T' type: 'type' "
-                 "       default_value { type: DT_INT32 } }"
-                 "attr { name: 'N' type: 'int' }")
-
-    # Give an input whose type can be inferred as different
-    # than the default.
-    op = self._lib.apply_op("AttrListTypeDefault", a=[1.0], b=[2.0], name="n")
-    self.assertProtoEquals("""
-      name: 'n' op: 'AttrListTypeDefault' input: 'n/a_0' input: 'n/b_0'
-      attr { key: 'T' value { type: DT_FLOAT } }
-      attr { key: 'N' value { i: 1 } }
-      """, op.node_def)
+    with ops.Graph().as_default():
+      # Give an input whose type can be inferred as different
+      # than the default.
+      op = self._lib.apply_op("AttrListTypeDefault", a=[1.0], b=[2.0], name="n")
+      self.assertProtoEquals("""
+        name: 'n' op: 'AttrListTypeDefault' input: 'n/a_0' input: 'n/b_0'
+        attr { key: 'T' value { type: DT_FLOAT } }
+        attr { key: 'N' value { i: 1 } }
+        """, op.node_def)
 
   def testNIntsIn(self):
-    self._add_op("name: 'NIntsIn' "
-                 "input_arg { name: 'a' type: DT_INT32 number_attr: 'N' } "
-                 "attr { name: 'N' type: 'int' has_minimum: true minimum: 2 }")
-
-    op = self._lib.apply_op("NIntsIn", a=[1, 2], name="n")
-    self.assertProtoEquals("""
-      name: 'n' op: 'NIntsIn' input: 'n/a_0' input: 'n/a_1'
-      attr { key: 'N' value { i: 2 } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("NIntsIn", a=[5, 4, 3, 2, 1], name="o")
-    self.assertProtoEquals("""
-      name: 'o' op: 'NIntsIn'
-      input: 'o/a_0' input: 'o/a_1' input: 'o/a_2' input: 'o/a_3' input: 'o/a_4'
-      attr { key: 'N' value { i: 5 } }
-      """, op.node_def)
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("NIntsIn", a=["foo", "bar"])
-    self.assertEqual(str(cm.exception),
-                     "Tensors in list passed to 'a' of 'NIntsIn' Op have types "
-                     "[string, string] that do not match expected type int32.")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("NIntsIn",
-                         a=[self.Tensor(dtypes.string),
-                            self.Tensor(dtypes.string)])
-    self.assertEqual(str(cm.exception),
-                     "Tensors in list passed to 'a' of 'NIntsIn' Op have "
-                     "types [string, string] that do not match expected type "
-                     "int32.")
-
-    with self.assertRaises(ValueError) as cm:
-      self._lib.apply_op("NIntsIn", a=[99])
-    self.assertEqual(str(cm.exception),
-                     "List argument 'a' to 'NIntsIn' Op "
-                     "with length 1 shorter than "
-                     "minimum length 2.")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("NIntsIn", a=[38, "bar"])
-    self.assertEqual(str(cm.exception),
-                     "Tensors in list passed to 'a' of 'NIntsIn' Op have types "
-                     "[int32, string] that do not match expected type int32.")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("NIntsIn",
-                         a=[self.Tensor(dtypes.int32),
-                            self.Tensor(dtypes.string)])
-    self.assertEqual(str(cm.exception),
-                     "Tensors in list passed to 'a' of 'NIntsIn' Op "
-                     "have types [int32, string] that do not match expected "
-                     "type int32.")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("NIntsIn", a=17)
-    self.assertStartsWith(str(cm.exception),
-                          "Expected list for 'a' argument "
-                          "to 'NIntsIn' Op, not ")
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("NIntsIn", a=[1, 2], name="n")
+      self.assertProtoEquals("""
+        name: 'n' op: 'NIntsIn' input: 'n/a_0' input: 'n/a_1'
+        attr { key: 'N' value { i: 2 } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("NIntsIn", a=[5, 4, 3, 2, 1], name="o")
+      self.assertProtoEquals("""
+        name: 'o' op: 'NIntsIn'
+        input: 'o/a_0' input: 'o/a_1' input: 'o/a_2' input: 'o/a_3' input: 'o/a_4'
+        attr { key: 'N' value { i: 5 } }
+        """, op.node_def)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("NIntsIn", a=["foo", "bar"])
+      self.assertEqual(
+          str(cm.exception),
+          "Tensors in list passed to 'a' of 'NIntsIn' Op have types "
+          "[string, string] that do not match expected type int32.")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("NIntsIn",
+                           a=[self.Tensor(dtypes.string),
+                              self.Tensor(dtypes.string)])
+      self.assertEqual(str(cm.exception),
+                       "Tensors in list passed to 'a' of 'NIntsIn' Op have "
+                       "types [string, string] that do not match expected type "
+                       "int32.")
+
+      with self.assertRaises(ValueError) as cm:
+        self._lib.apply_op("NIntsIn", a=[99])
+      self.assertEqual(str(cm.exception),
+                       "List argument 'a' to 'NIntsIn' Op "
+                       "with length 1 shorter than "
+                       "minimum length 2.")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("NIntsIn", a=[38, "bar"])
+      self.assertEqual(
+          str(cm.exception),
+          "Tensors in list passed to 'a' of 'NIntsIn' Op have types "
+          "[int32, string] that do not match expected type int32.")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("NIntsIn",
+                           a=[self.Tensor(dtypes.int32),
+                              self.Tensor(dtypes.string)])
+      self.assertEqual(str(cm.exception),
+                       "Tensors in list passed to 'a' of 'NIntsIn' Op "
+                       "have types [int32, string] that do not match expected "
+                       "type int32.")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("NIntsIn", a=17)
+      self.assertStartsWith(str(cm.exception),
+                            "Expected list for 'a' argument "
+                            "to 'NIntsIn' Op, not ")
 
   def testNPolymorphicIn(self):
-    self._add_op("name: 'NPolymorphicIn' "
-                 "input_arg { name: 'a' type_attr: 'T' number_attr: 'N' } "
-                 "attr { name: 'T' type: 'type' } "
-                 "attr { name: 'N' type: 'int' has_minimum: true minimum: 2 }")
-
-    op = self._lib.apply_op("NPolymorphicIn", a=[1, 2], name="n")
-    self.assertProtoEquals("""
-      name: 'n' op: 'NPolymorphicIn' input: 'n/a_0' input: 'n/a_1'
-      attr { key: 'T' value { type: DT_INT32 } }
-      attr { key: 'N' value { i: 2 } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("NPolymorphicIn", a=[5, 4, 3, 2, 1], name="o")
-    self.assertProtoEquals("""
-      name: 'o' op: 'NPolymorphicIn'
-      input: 'o/a_0' input: 'o/a_1' input: 'o/a_2' input: 'o/a_3' input: 'o/a_4'
-      attr { key: 'T' value { type: DT_INT32 } }
-      attr { key: 'N' value { i: 5 } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("NPolymorphicIn", a=["foo", "bar"], name="p")
-    self.assertProtoEquals("""
-      name: 'p' op: 'NPolymorphicIn' input: 'p/a_0' input: 'p/a_1'
-      attr { key: 'T' value { type: DT_STRING } }
-      attr { key: 'N' value { i: 2 } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("NPolymorphicIn",
-                            a=[1, self.Tensor(dtypes.float32, name="x")],
-                            name="q")
-    self.assertProtoEquals("""
-      name: 'q' op: 'NPolymorphicIn' input: 'q/a_0' input: 'x'
-      attr { key: 'T' value { type: DT_FLOAT } }
-      attr { key: 'N' value { i: 2 } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("NPolymorphicIn",
-                            a=[self.Tensor(dtypes.float32, name="y"),
-                               self.Tensor(dtypes.float32_ref, name="z")],
-                            name="r")
-    self.assertProtoEquals("""
-      name: 'r' op: 'NPolymorphicIn' input: 'y' input: 'z'
-      attr { key: 'T' value { type: DT_FLOAT } }
-      attr { key: 'N' value { i: 2 } }
-      """, op.node_def)
-
-    with self.assertRaises(ValueError) as cm:
-      self._lib.apply_op("NPolymorphicIn", a=[99])
-    self.assertEqual(str(cm.exception),
-                     "List argument 'a' to 'NPolymorphicIn' Op with length 1 "
-                     "shorter than minimum length 2.")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("NPolymorphicIn", a=[38, "bar"])
-    self.assertEqual(str(cm.exception),
-                     "Tensors in list passed to 'a' of 'NPolymorphicIn' Op "
-                     "have types [int32, string] that don't all match.")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("NPolymorphicIn", a=[38, self.Tensor(dtypes.string)])
-    self.assertEqual(str(cm.exception),
-                     "Tensors in list passed to 'a' of 'NPolymorphicIn' Op "
-                     "have types [int32, string] that don't all match.")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("NPolymorphicIn", a=[38, None])
-    self.assertEqual(str(cm.exception),
-                     "Tensors in list passed to 'a' of 'NPolymorphicIn' Op "
-                     "have types [int32, <NOT CONVERTIBLE TO TENSOR>] that "
-                     "don't all match.")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("NPolymorphicIn",
-                         a=["abcd", self.Tensor(dtypes.int32)])
-    self.assertEqual(str(cm.exception),
-                     "Tensors in list passed to 'a' of 'NPolymorphicIn' Op "
-                     "have types [string, int32] that don't all match.")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("NPolymorphicIn", a=17)
-    self.assertStartsWith(str(cm.exception),
-                          "Expected list for 'a' argument "
-                          "to 'NPolymorphicIn' Op, not ")
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("NPolymorphicIn", a=[1, 2], name="n")
+      self.assertProtoEquals("""
+        name: 'n' op: 'NPolymorphicIn' input: 'n/a_0' input: 'n/a_1'
+        attr { key: 'T' value { type: DT_INT32 } }
+        attr { key: 'N' value { i: 2 } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("NPolymorphicIn", a=[5, 4, 3, 2, 1], name="o")
+      self.assertProtoEquals("""
+        name: 'o' op: 'NPolymorphicIn'
+        input: 'o/a_0' input: 'o/a_1' input: 'o/a_2' input: 'o/a_3' input: 'o/a_4'
+        attr { key: 'T' value { type: DT_INT32 } }
+        attr { key: 'N' value { i: 5 } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("NPolymorphicIn", a=["foo", "bar"], name="p")
+      self.assertProtoEquals("""
+        name: 'p' op: 'NPolymorphicIn' input: 'p/a_0' input: 'p/a_1'
+        attr { key: 'T' value { type: DT_STRING } }
+        attr { key: 'N' value { i: 2 } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("NPolymorphicIn",
+                              a=[1, self.Tensor(dtypes.float32, name="x")],
+                              name="q")
+      self.assertProtoEquals("""
+        name: 'q' op: 'NPolymorphicIn' input: 'q/a_0' input: 'x'
+        attr { key: 'T' value { type: DT_FLOAT } }
+        attr { key: 'N' value { i: 2 } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("NPolymorphicIn",
+                              a=[self.Tensor(dtypes.float32, name="y"),
+                                 self.Tensor(dtypes.float32_ref, name="z")],
+                              name="r")
+      self.assertProtoEquals("""
+        name: 'r' op: 'NPolymorphicIn' input: 'y' input: 'z'
+        attr { key: 'T' value { type: DT_FLOAT } }
+        attr { key: 'N' value { i: 2 } }
+        """, op.node_def)
+
+      with self.assertRaises(ValueError) as cm:
+        self._lib.apply_op("NPolymorphicIn", a=[99])
+      self.assertEqual(str(cm.exception),
+                       "List argument 'a' to 'NPolymorphicIn' Op with length 1 "
+                       "shorter than minimum length 2.")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("NPolymorphicIn", a=[38, "bar"])
+      self.assertEqual(str(cm.exception),
+                       "Tensors in list passed to 'a' of 'NPolymorphicIn' Op "
+                       "have types [int32, string] that don't all match.")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("NPolymorphicIn", a=[38, self.Tensor(dtypes.string)])
+      self.assertEqual(str(cm.exception),
+                       "Tensors in list passed to 'a' of 'NPolymorphicIn' Op "
+                       "have types [int32, string] that don't all match.")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("NPolymorphicIn", a=[38, None])
+      self.assertEqual(str(cm.exception),
+                       "Tensors in list passed to 'a' of 'NPolymorphicIn' Op "
+                       "have types [int32, <NOT CONVERTIBLE TO TENSOR>] that "
+                       "don't all match.")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("NPolymorphicIn",
+                           a=["abcd", self.Tensor(dtypes.int32)])
+      self.assertEqual(str(cm.exception),
+                       "Tensors in list passed to 'a' of 'NPolymorphicIn' Op "
+                       "have types [string, int32] that don't all match.")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("NPolymorphicIn", a=17)
+      self.assertStartsWith(str(cm.exception),
+                            "Expected list for 'a' argument "
+                            "to 'NPolymorphicIn' Op, not ")
 
   def testNPolymorphicRestrictIn(self):
-    self._add_op("name: 'NPolymorphicRestrictIn' "
-                 "input_arg { name: 'a' type_attr: 'T' number_attr: 'N' } "
-                 "attr { name: 'T' type: 'type' allowed_values { "
-                 "  list { type: DT_STRING type: DT_BOOL } } } "
-                 "attr { name: 'N' type: 'int' has_minimum: true minimum: 2 }")
-
-    op = self._lib.apply_op("NPolymorphicRestrictIn", a=["foo", "bar"],
-                            name="p")
-    self.assertProtoEquals("""
-      name: 'p' op: 'NPolymorphicRestrictIn' input: 'p/a_0' input: 'p/a_1'
-      attr { key: 'T' value { type: DT_STRING } }
-      attr { key: 'N' value { i: 2 } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("NPolymorphicRestrictIn",
-                            a=[False, True, False],
-                            name="b")
-    self.assertProtoEquals("""
-      name: 'b' op: 'NPolymorphicRestrictIn'
-      input: 'b/a_0' input: 'b/a_1' input: 'b/a_2'
-      attr { key: 'T' value { type: DT_BOOL } }
-      attr { key: 'N' value { i: 3 } }
-      """, op.node_def)
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("NPolymorphicRestrictIn", a=[1, 2])
-    self.assertEqual(str(cm.exception),
-                     "Value passed to parameter 'a' has DataType int32 not in "
-                     "list of allowed values: string, bool")
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("NPolymorphicRestrictIn", a=["foo", "bar"],
+                              name="p")
+      self.assertProtoEquals("""
+        name: 'p' op: 'NPolymorphicRestrictIn' input: 'p/a_0' input: 'p/a_1'
+        attr { key: 'T' value { type: DT_STRING } }
+        attr { key: 'N' value { i: 2 } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("NPolymorphicRestrictIn",
+                              a=[False, True, False],
+                              name="b")
+      self.assertProtoEquals("""
+        name: 'b' op: 'NPolymorphicRestrictIn'
+        input: 'b/a_0' input: 'b/a_1' input: 'b/a_2'
+        attr { key: 'T' value { type: DT_BOOL } }
+        attr { key: 'N' value { i: 3 } }
+        """, op.node_def)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("NPolymorphicRestrictIn", a=[1, 2])
+      self.assertEqual(
+          str(cm.exception),
+          "Value passed to parameter 'a' has DataType int32 not in "
+          "list of allowed values: string, bool")
 
   def testNInTwice(self):
-    self._add_op("name: 'NInTwice' "
-                 "input_arg { name: 'a' type: DT_INT32 number_attr: 'N' } "
-                 "input_arg { name: 'b' type: DT_STRING number_attr: 'N' } "
-                 "attr { name: 'N' type: 'int' has_minimum: true minimum: 0 }")
-
-    op = self._lib.apply_op("NInTwice", a=[1, 2], b=["one", "two"], name="n")
-    self.assertProtoEquals("""
-      name: 'n' op: 'NInTwice'
-      input: 'n/a_0' input: 'n/a_1' input: 'n/b_0' input: 'n/b_1'
-      attr { key: 'N' value { i: 2 } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("NInTwice", a=[], b=[], name="o")
-    self.assertProtoEquals("""
-      name: 'o' op: 'NInTwice' attr { key: 'N' value { i: 0 } }
-      """, op.node_def)
-
-    with self.assertRaises(ValueError) as cm:
-      self._lib.apply_op("NInTwice", a=[1, 2, 3], b=["too short"])
-    self.assertEqual(str(cm.exception),
-                     "List argument 'b' to 'NInTwice' Op "
-                     "with length 1 must match "
-                     "length 3 of argument 'a'.")
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("NInTwice", a=[1, 2], b=["one", "two"], name="n")
+      self.assertProtoEquals("""
+        name: 'n' op: 'NInTwice'
+        input: 'n/a_0' input: 'n/a_1' input: 'n/b_0' input: 'n/b_1'
+        attr { key: 'N' value { i: 2 } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("NInTwice", a=[], b=[], name="o")
+      self.assertProtoEquals("""
+        name: 'o' op: 'NInTwice' attr { key: 'N' value { i: 0 } }
+        """, op.node_def)
+
+      with self.assertRaises(ValueError) as cm:
+        self._lib.apply_op("NInTwice", a=[1, 2, 3], b=["too short"])
+      self.assertEqual(str(cm.exception),
+                       "List argument 'b' to 'NInTwice' Op "
+                       "with length 1 must match "
+                       "length 3 of argument 'a'.")
 
   def testNInPolymorphicTwice(self):
-    self._add_op("name: 'NInPolymorphicTwice' "
-                 "input_arg { name: 'a' type_attr: 'T' number_attr: 'N' } "
-                 "input_arg { name: 'b' type_attr: 'T' number_attr: 'N' } "
-                 "attr { name: 'T' type: 'type' } "
-                 "attr { name: 'N' type: 'int' has_minimum: true minimum: 0 }")
-
-    op = self._lib.apply_op("NInPolymorphicTwice", a=[1, 2], b=[3, 4], name="n")
-    self.assertProtoEquals("""
-      name: 'n' op: 'NInPolymorphicTwice'
-      input: 'n/a_0' input: 'n/a_1' input: 'n/b_0' input: 'n/b_1'
-      attr { key: 'T' value { type: DT_INT32 } }
-      attr { key: 'N' value { i: 2 } }
-      """, op.node_def)
-
-    with self.assertRaises(ValueError) as cm:
-      self._lib.apply_op("NInPolymorphicTwice", a=[1, 2, 3], b=[5])
-    self.assertEqual(str(cm.exception),
-                     "List argument 'b' to 'NInPolymorphicTwice' Op "
-                     "with length 1 "
-                     "must match length 3 of argument 'a'.")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("NInPolymorphicTwice", a=[1, 2], b=["one", "two"])
-    self.assertEqual(str(cm.exception),
-                     "Tensors in list passed to 'b' of 'NInPolymorphicTwice' "
-                     "Op have types [string, string] that do not match type "
-                     "int32 inferred from earlier arguments.")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("NInPolymorphicTwice",
-                         a=[self.Tensor(dtypes.int32)],
-                         b=[self.Tensor(dtypes.string)])
-    self.assertEqual(str(cm.exception),
-                     "Tensors in list passed to 'b' of "
-                     "'NInPolymorphicTwice' Op have types [string] that do not "
-                     "match type int32 inferred from earlier arguments.")
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("NInPolymorphicTwice", a=[1, 2], b=[3, 4],
+                              name="n")
+      self.assertProtoEquals("""
+        name: 'n' op: 'NInPolymorphicTwice'
+        input: 'n/a_0' input: 'n/a_1' input: 'n/b_0' input: 'n/b_1'
+        attr { key: 'T' value { type: DT_INT32 } }
+        attr { key: 'N' value { i: 2 } }
+        """, op.node_def)
+
+      with self.assertRaises(ValueError) as cm:
+        self._lib.apply_op("NInPolymorphicTwice", a=[1, 2, 3], b=[5])
+      self.assertEqual(str(cm.exception),
+                       "List argument 'b' to 'NInPolymorphicTwice' Op "
+                       "with length 1 "
+                       "must match length 3 of argument 'a'.")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("NInPolymorphicTwice", a=[1, 2], b=["one", "two"])
+      self.assertEqual(str(cm.exception),
+                       "Tensors in list passed to 'b' of 'NInPolymorphicTwice' "
+                       "Op have types [string, string] that do not match type "
+                       "int32 inferred from earlier arguments.")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("NInPolymorphicTwice",
+                           a=[self.Tensor(dtypes.int32)],
+                           b=[self.Tensor(dtypes.string)])
+      self.assertEqual(str(cm.exception),
+                       "Tensors in list passed to 'b' of "
+                       "'NInPolymorphicTwice' Op have types [string] that do "
+                       "not match type int32 inferred from earlier arguments.")
 
   def testNInTwoTypeVariables(self):
-    self._add_op("name: 'NInTwoTypeVariables' "
-                 "input_arg { name: 'a' type_attr: 'S' number_attr: 'N' } "
-                 "input_arg { name: 'b' type_attr: 'T' number_attr: 'N' } "
-                 "attr { name: 'S' type: 'type' } "
-                 "attr { name: 'T' type: 'type' } "
-                 "attr { name: 'N' type: 'int' has_minimum: true minimum: 0 }")
-
-    op = self._lib.apply_op("NInTwoTypeVariables",
-                            a=[1, 2],
-                            b=[True, False],
-                            name="n")
-    self.assertProtoEquals("""
-      name: 'n' op: 'NInTwoTypeVariables'
-      input: 'n/a_0' input: 'n/a_1' input: 'n/b_0' input: 'n/b_1'
-      attr { key: 'S' value { type: DT_INT32 } }
-      attr { key: 'T' value { type: DT_BOOL } }
-      attr { key: 'N' value { i: 2 } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("NInTwoTypeVariables", a=[1, 2], b=[3, 4], name="o")
-    self.assertProtoEquals("""
-      name: 'o' op: 'NInTwoTypeVariables'
-      input: 'o/a_0' input: 'o/a_1' input: 'o/b_0' input: 'o/b_1'
-      attr { key: 'S' value { type: DT_INT32 } }
-      attr { key: 'T' value { type: DT_INT32 } }
-      attr { key: 'N' value { i: 2 } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("NInTwoTypeVariables",
-                            a=[self.Tensor(dtypes.int32, name="q")],
-                            b=[self.Tensor(dtypes.string, name="r")],
-                            name="p")
-    self.assertProtoEquals("""
-      name: 'p' op: 'NInTwoTypeVariables' input: 'q' input: 'r'
-      attr { key: 'S' value { type: DT_INT32 } }
-      attr { key: 'T' value { type: DT_STRING } }
-      attr { key: 'N' value { i: 1 } }
-      """, op.node_def)
-
-    with self.assertRaises(ValueError) as cm:
-      self._lib.apply_op("NInTwoTypeVariables", a=[1, 2, 3], b=["5"])
-    self.assertEqual(str(cm.exception),
-                     "List argument 'b' to 'NInTwoTypeVariables' Op "
-                     "with length 1 "
-                     "must match length 3 of argument 'a'.")
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("NInTwoTypeVariables",
+                              a=[1, 2],
+                              b=[True, False],
+                              name="n")
+      self.assertProtoEquals("""
+        name: 'n' op: 'NInTwoTypeVariables'
+        input: 'n/a_0' input: 'n/a_1' input: 'n/b_0' input: 'n/b_1'
+        attr { key: 'S' value { type: DT_INT32 } }
+        attr { key: 'T' value { type: DT_BOOL } }
+        attr { key: 'N' value { i: 2 } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("NInTwoTypeVariables", a=[1, 2], b=[3, 4],
+                              name="o")
+      self.assertProtoEquals("""
+        name: 'o' op: 'NInTwoTypeVariables'
+        input: 'o/a_0' input: 'o/a_1' input: 'o/b_0' input: 'o/b_1'
+        attr { key: 'S' value { type: DT_INT32 } }
+        attr { key: 'T' value { type: DT_INT32 } }
+        attr { key: 'N' value { i: 2 } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("NInTwoTypeVariables",
+                              a=[self.Tensor(dtypes.int32, name="q")],
+                              b=[self.Tensor(dtypes.string, name="r")],
+                              name="p")
+      self.assertProtoEquals("""
+        name: 'p' op: 'NInTwoTypeVariables' input: 'q' input: 'r'
+        attr { key: 'S' value { type: DT_INT32 } }
+        attr { key: 'T' value { type: DT_STRING } }
+        attr { key: 'N' value { i: 1 } }
+        """, op.node_def)
+
+      with self.assertRaises(ValueError) as cm:
+        self._lib.apply_op("NInTwoTypeVariables", a=[1, 2, 3], b=["5"])
+      self.assertEqual(str(cm.exception),
+                       "List argument 'b' to 'NInTwoTypeVariables' Op "
+                       "with length 1 "
+                       "must match length 3 of argument 'a'.")
 
   def testInPolymorphicTwice(self):
-    self._add_op("name: 'InPolymorphicTwice' "
-                 "input_arg { name: 'a' type_attr: 'T' number_attr: 'N' } "
-                 "input_arg { name: 'b' type_attr: 'T' number_attr: 'M' } "
-                 "attr { name: 'T' type: 'type' } "
-                 "attr { name: 'N' type: 'int' has_minimum: true minimum: 0 } "
-                 "attr { name: 'M' type: 'int' has_minimum: true minimum: 0 } ")
-
-    op = self._lib.apply_op("InPolymorphicTwice", a=[8], b=[3, 4, 5], name="n")
-    self.assertProtoEquals("""
-      name: 'n' op: 'InPolymorphicTwice'
-      input: 'n/a_0' input: 'n/b_0' input: 'n/b_1' input: 'n/b_2'
-      attr { key: 'T' value { type: DT_INT32 } }
-      attr { key: 'N' value { i: 1 } }
-      attr { key: 'M' value { i: 3 } }
-      """, op.node_def)
-
-    op = self._lib.apply_op("InPolymorphicTwice", a=[8], b=[], name="o")
-    self.assertProtoEquals("""
-      name: 'o' op: 'InPolymorphicTwice' input: 'o/a_0'
-      attr { key: 'T' value { type: DT_INT32 } }
-      attr { key: 'N' value { i: 1 } }
-      attr { key: 'M' value { i: 0 } }
-      """, op.node_def)
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("InPolymorphicTwice", a=[], b=[3, 4, 5])
-    self.assertEqual(str(cm.exception),
-                     "Don't know how to infer type variable from empty input "
-                     "list passed to input 'a' of 'InPolymorphicTwice' Op.")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("InPolymorphicTwice", a=[1, 2], b=["one", "two"])
-    self.assertEqual(str(cm.exception),
-                     "Tensors in list passed to 'b' of 'InPolymorphicTwice' Op "
-                     "have types [string, string] that do not match type int32 "
-                     "inferred from earlier arguments.")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("InPolymorphicTwice",
-                         a=[self.Tensor(dtypes.int32)],
-                         b=[self.Tensor(dtypes.string)])
-    self.assertEqual(str(cm.exception),
-                     "Tensors in list passed to 'b' of 'InPolymorphicTwice' "
-                     "Op have types [string] that do not match type int32 "
-                     "inferred from earlier arguments.")
+    with ops.Graph().as_default():
+      op = self._lib.apply_op("InPolymorphicTwice", a=[8], b=[3, 4, 5],
+                              name="n")
+      self.assertProtoEquals("""
+        name: 'n' op: 'InPolymorphicTwice'
+        input: 'n/a_0' input: 'n/b_0' input: 'n/b_1' input: 'n/b_2'
+        attr { key: 'T' value { type: DT_INT32 } }
+        attr { key: 'N' value { i: 1 } }
+        attr { key: 'M' value { i: 3 } }
+        """, op.node_def)
+
+      op = self._lib.apply_op("InPolymorphicTwice", a=[8], b=[], name="o")
+      self.assertProtoEquals("""
+        name: 'o' op: 'InPolymorphicTwice' input: 'o/a_0'
+        attr { key: 'T' value { type: DT_INT32 } }
+        attr { key: 'N' value { i: 1 } }
+        attr { key: 'M' value { i: 0 } }
+        """, op.node_def)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("InPolymorphicTwice", a=[], b=[3, 4, 5])
+      self.assertEqual(str(cm.exception),
+                       "Don't know how to infer type variable from empty input "
+                       "list passed to input 'a' of 'InPolymorphicTwice' Op.")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("InPolymorphicTwice", a=[1, 2], b=["one", "two"])
+      self.assertEqual(
+          str(cm.exception),
+          "Tensors in list passed to 'b' of 'InPolymorphicTwice' Op "
+          "have types [string, string] that do not match type int32 "
+          "inferred from earlier arguments.")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("InPolymorphicTwice",
+                           a=[self.Tensor(dtypes.int32)],
+                           b=[self.Tensor(dtypes.string)])
+      self.assertEqual(str(cm.exception),
+                       "Tensors in list passed to 'b' of 'InPolymorphicTwice' "
+                       "Op have types [string] that do not match type int32 "
+                       "inferred from earlier arguments.")
 
   def testNIntsOut(self):
-    self._add_op("name: 'NIntsOut' "
-                 "output_arg { name: 'a' type: DT_INT32 number_attr: 'N' } "
-                 "attr { name: 'N' type: 'int' has_minimum: true minimum: 2 }")
-
-    out1, out2 = self._lib.apply_op("NIntsOut", N=2, name="n")
-    self.assertEqual(dtypes.int32, out1.dtype)
-    self.assertEqual(dtypes.int32, out2.dtype)
-    self.assertProtoEquals("""
-      name: 'n' op: 'NIntsOut' attr { key: 'N' value { i: 2 } }
-      """, out1.op.node_def)
-
-    out1, out2, out3, out4, out5 = self._lib.apply_op(
-        "NIntsOut", N=5, name="o")
-    self.assertEqual(dtypes.int32, out1.dtype)
-    self.assertEqual(dtypes.int32, out2.dtype)
-    self.assertEqual(dtypes.int32, out3.dtype)
-    self.assertEqual(dtypes.int32, out4.dtype)
-    self.assertEqual(dtypes.int32, out5.dtype)
-    self.assertProtoEquals("""
-      name: 'o' op: 'NIntsOut' attr { key: 'N' value { i: 5 } }
-      """, out5.op.node_def)
-
-    with self.assertRaises(ValueError) as cm:
-      self._lib.apply_op("NIntsOut", N=1)
-    self.assertEqual(str(cm.exception),
-                     "Attr 'N' of 'NIntsOut' Op passed 1 less than minimum 2.")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("NIntsOut", N=[3])
-    self.assertEqual(str(cm.exception),
-                     "Expected int for argument 'N' not [3].")
+    with ops.Graph().as_default():
+      out1, out2 = self._lib.apply_op("NIntsOut", N=2, name="n")
+      self.assertEqual(dtypes.int32, out1.dtype)
+      self.assertEqual(dtypes.int32, out2.dtype)
+      self.assertProtoEquals("""
+        name: 'n' op: 'NIntsOut' attr { key: 'N' value { i: 2 } }
+        """, out1.op.node_def)
+
+      out1, out2, out3, out4, out5 = self._lib.apply_op(
+          "NIntsOut", N=5, name="o")
+      self.assertEqual(dtypes.int32, out1.dtype)
+      self.assertEqual(dtypes.int32, out2.dtype)
+      self.assertEqual(dtypes.int32, out3.dtype)
+      self.assertEqual(dtypes.int32, out4.dtype)
+      self.assertEqual(dtypes.int32, out5.dtype)
+      self.assertProtoEquals("""
+        name: 'o' op: 'NIntsOut' attr { key: 'N' value { i: 5 } }
+        """, out5.op.node_def)
+
+      with self.assertRaises(ValueError) as cm:
+        self._lib.apply_op("NIntsOut", N=1)
+      self.assertEqual(
+          str(cm.exception),
+          "Attr 'N' of 'NIntsOut' Op passed 1 less than minimum 2.")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("NIntsOut", N=[3])
+      self.assertEqual(str(cm.exception),
+                       "Expected int for argument 'N' not [3].")
 
   def testNIntsOutDefault(self):
-    self._add_op("name: 'NIntsOutDefault' "
-                 "output_arg { name: 'a' type: DT_INT32 number_attr: 'N' } "
-                 "attr { name: 'N' type: 'int' has_minimum: true minimum: 2"
-                 "  default_value { i:3 } }")
-
-    out1, out2, out3 = self._lib.apply_op(
-        "NIntsOutDefault", N=None, name="z")
-    self.assertEqual(dtypes.int32, out1.dtype)
-    self.assertEqual(dtypes.int32, out2.dtype)
-    self.assertEqual(dtypes.int32, out3.dtype)
-    self.assertProtoEquals("""
-      name: 'z' op: 'NIntsOutDefault' attr { key: 'N' value { i: 3 } }
-      """, out1.op.node_def)
-
-    out1, out2 = self._lib.apply_op("NIntsOutDefault", N=2, name="y")
-    self.assertEqual(dtypes.int32, out1.dtype)
-    self.assertEqual(dtypes.int32, out2.dtype)
-    self.assertProtoEquals("""
-      name: 'y' op: 'NIntsOutDefault' attr { key: 'N' value { i: 2 } }
-      """, out2.op.node_def)
+    with ops.Graph().as_default():
+      out1, out2, out3 = self._lib.apply_op(
+          "NIntsOutDefault", N=None, name="z")
+      self.assertEqual(dtypes.int32, out1.dtype)
+      self.assertEqual(dtypes.int32, out2.dtype)
+      self.assertEqual(dtypes.int32, out3.dtype)
+      self.assertProtoEquals("""
+        name: 'z' op: 'NIntsOutDefault' attr { key: 'N' value { i: 3 } }
+        """, out1.op.node_def)
+
+      out1, out2 = self._lib.apply_op("NIntsOutDefault", N=2, name="y")
+      self.assertEqual(dtypes.int32, out1.dtype)
+      self.assertEqual(dtypes.int32, out2.dtype)
+      self.assertProtoEquals("""
+        name: 'y' op: 'NIntsOutDefault' attr { key: 'N' value { i: 2 } }
+        """, out2.op.node_def)
 
   def testNPolymorphicOut(self):
-    self._add_op("name: 'NPolymorphicOut' "
-                 "output_arg { name: 'a' type_attr: 'T' number_attr: 'N' } "
-                 "attr { name: 'T' type: 'type' } "
-                 "attr { name: 'N' type: 'int' has_minimum: true minimum: 2 }")
-
-    out1, out2 = self._lib.apply_op("NPolymorphicOut",
-                                    N=2,
-                                    T=dtypes.int32,
-                                    name="n")
-    self.assertEqual(dtypes.int32, out1.dtype)
-    self.assertEqual(dtypes.int32, out2.dtype)
-    self.assertProtoEquals("""
-      name: 'n' op: 'NPolymorphicOut'
-      attr { key: 'T' value { type: DT_INT32 } }
-      attr { key: 'N' value { i: 2 } }
-      """, out1.op.node_def)
-
-    out1, out2, out3 = self._lib.apply_op(
-        "NPolymorphicOut", T=dtypes.string, N=3, name="o")
-    self.assertEqual(dtypes.string, out1.dtype)
-    self.assertEqual(dtypes.string, out2.dtype)
-    self.assertEqual(dtypes.string, out3.dtype)
-    self.assertProtoEquals("""
-      name: 'o' op: 'NPolymorphicOut'
-      attr { key: 'T' value { type: DT_STRING } }
-      attr { key: 'N' value { i: 3 } }
-      """, out3.op.node_def)
-
-    with self.assertRaises(ValueError) as cm:
-      self._lib.apply_op("NPolymorphicOut", N=1, T=dtypes.string)
-    self.assertEqual(str(cm.exception),
-                     "Attr 'N' of 'NPolymorphicOut' Op "
-                     "passed 1 less than minimum 2.")
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("NPolymorphicOut", N=3, T=[dtypes.string])
-    self.assertEqual(
-        str(cm.exception),
-        "Expected DataType for argument 'T' not [tf.string].")
+    with ops.Graph().as_default():
+      out1, out2 = self._lib.apply_op("NPolymorphicOut",
+                                      N=2,
+                                      T=dtypes.int32,
+                                      name="n")
+      self.assertEqual(dtypes.int32, out1.dtype)
+      self.assertEqual(dtypes.int32, out2.dtype)
+      self.assertProtoEquals("""
+        name: 'n' op: 'NPolymorphicOut'
+        attr { key: 'T' value { type: DT_INT32 } }
+        attr { key: 'N' value { i: 2 } }
+        """, out1.op.node_def)
+
+      out1, out2, out3 = self._lib.apply_op(
+          "NPolymorphicOut", T=dtypes.string, N=3, name="o")
+      self.assertEqual(dtypes.string, out1.dtype)
+      self.assertEqual(dtypes.string, out2.dtype)
+      self.assertEqual(dtypes.string, out3.dtype)
+      self.assertProtoEquals("""
+        name: 'o' op: 'NPolymorphicOut'
+        attr { key: 'T' value { type: DT_STRING } }
+        attr { key: 'N' value { i: 3 } }
+        """, out3.op.node_def)
+
+      with self.assertRaises(ValueError) as cm:
+        self._lib.apply_op("NPolymorphicOut", N=1, T=dtypes.string)
+      self.assertEqual(str(cm.exception),
+                       "Attr 'N' of 'NPolymorphicOut' Op "
+                       "passed 1 less than minimum 2.")
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("NPolymorphicOut", N=3, T=[dtypes.string])
+      self.assertEqual(
+          str(cm.exception),
+          "Expected DataType for argument 'T' not [tf.string].")
 
   def testNPolymorphicOutDefault(self):
-    self._add_op("name: 'NPolymorphicOutDefault' "
-                 "output_arg { name: 'a' type_attr: 'T' number_attr: 'N' } "
-                 "attr { name: 'T' type: 'type'"
-                 "  default_value { type: DT_BOOL } } "
-                 "attr { name: 'N' type: 'int' has_minimum: true minimum: 2 "
-                 "  default_value { i: 2 } }")
-
-    out1, out2 = self._lib.apply_op(
-        "NPolymorphicOutDefault", N=None, T=None, name="r")
-    self.assertEqual(dtypes.bool, out1.dtype)
-    self.assertEqual(dtypes.bool, out2.dtype)
-    self.assertProtoEquals("""
-      name: 'r' op: 'NPolymorphicOutDefault'
-      attr { key: 'T' value { type: DT_BOOL } }
-      attr { key: 'N' value { i: 2 } }
-      """, out1.op.node_def)
-
-    out1, out2, out3 = self._lib.apply_op(
-        "NPolymorphicOutDefault", N=3, T=None, name="s")
-    self.assertEqual(dtypes.bool, out1.dtype)
-    self.assertEqual(dtypes.bool, out2.dtype)
-    self.assertEqual(dtypes.bool, out3.dtype)
-    self.assertProtoEquals("""
-      name: 's' op: 'NPolymorphicOutDefault'
-      attr { key: 'T' value { type: DT_BOOL } }
-      attr { key: 'N' value { i: 3 } }
-      """, out1.op.node_def)
-
-    out1, out2 = self._lib.apply_op(
-        "NPolymorphicOutDefault", N=None, T=dtypes.int32, name="t")
-    self.assertEqual(dtypes.int32, out1.dtype)
-    self.assertEqual(dtypes.int32, out2.dtype)
-    self.assertProtoEquals("""
-      name: 't' op: 'NPolymorphicOutDefault'
-      attr { key: 'T' value { type: DT_INT32 } }
-      attr { key: 'N' value { i: 2 } }
-      """, out1.op.node_def)
-
-    out1, out2, out3 = self._lib.apply_op(
-        "NPolymorphicOutDefault", N=3, T=dtypes.int32, name="u")
-    self.assertEqual(dtypes.int32, out1.dtype)
-    self.assertEqual(dtypes.int32, out2.dtype)
-    self.assertEqual(dtypes.int32, out3.dtype)
-    self.assertProtoEquals("""
-      name: 'u' op: 'NPolymorphicOutDefault'
-      attr { key: 'T' value { type: DT_INT32 } }
-      attr { key: 'N' value { i: 3 } }
-      """, out1.op.node_def)
+    with ops.Graph().as_default():
+      out1, out2 = self._lib.apply_op(
+          "NPolymorphicOutDefault", N=None, T=None, name="r")
+      self.assertEqual(dtypes.bool, out1.dtype)
+      self.assertEqual(dtypes.bool, out2.dtype)
+      self.assertProtoEquals("""
+        name: 'r' op: 'NPolymorphicOutDefault'
+        attr { key: 'T' value { type: DT_BOOL } }
+        attr { key: 'N' value { i: 2 } }
+        """, out1.op.node_def)
+
+      out1, out2, out3 = self._lib.apply_op(
+          "NPolymorphicOutDefault", N=3, T=None, name="s")
+      self.assertEqual(dtypes.bool, out1.dtype)
+      self.assertEqual(dtypes.bool, out2.dtype)
+      self.assertEqual(dtypes.bool, out3.dtype)
+      self.assertProtoEquals("""
+        name: 's' op: 'NPolymorphicOutDefault'
+        attr { key: 'T' value { type: DT_BOOL } }
+        attr { key: 'N' value { i: 3 } }
+        """, out1.op.node_def)
+
+      out1, out2 = self._lib.apply_op(
+          "NPolymorphicOutDefault", N=None, T=dtypes.int32, name="t")
+      self.assertEqual(dtypes.int32, out1.dtype)
+      self.assertEqual(dtypes.int32, out2.dtype)
+      self.assertProtoEquals("""
+        name: 't' op: 'NPolymorphicOutDefault'
+        attr { key: 'T' value { type: DT_INT32 } }
+        attr { key: 'N' value { i: 2 } }
+        """, out1.op.node_def)
+
+      out1, out2, out3 = self._lib.apply_op(
+          "NPolymorphicOutDefault", N=3, T=dtypes.int32, name="u")
+      self.assertEqual(dtypes.int32, out1.dtype)
+      self.assertEqual(dtypes.int32, out2.dtype)
+      self.assertEqual(dtypes.int32, out3.dtype)
+      self.assertProtoEquals("""
+        name: 'u' op: 'NPolymorphicOutDefault'
+        attr { key: 'T' value { type: DT_INT32 } }
+        attr { key: 'N' value { i: 3 } }
+        """, out1.op.node_def)
 
   def testNPolymorphicRestrictOut(self):
-    self._add_op("name: 'NPolymorphicRestrictOut' "
-                 "output_arg { name: 'a' type_attr: 'T' number_attr: 'N' } "
-                 "attr { name: 'T' type: 'type' allowed_values { "
-                 "  list { type: DT_STRING type: DT_BOOL } } } "
-                 "attr { name: 'N' type: 'int' has_minimum: true minimum: 2 }")
-
-    out1, out2, out3 = self._lib.apply_op(
-        "NPolymorphicRestrictOut", N=3, T=dtypes.bool, name="u")
-    self.assertEqual(dtypes.bool, out1.dtype)
-    self.assertEqual(dtypes.bool, out2.dtype)
-    self.assertEqual(dtypes.bool, out3.dtype)
-    self.assertProtoEquals("""
-      name: 'u' op: 'NPolymorphicRestrictOut'
-      attr { key: 'T' value { type: DT_BOOL } }
-      attr { key: 'N' value { i: 3 } }
-      """, out1.op.node_def)
-
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("NPolymorphicRestrictOut", N=2, T=dtypes.int32)
-    self.assertEqual(str(cm.exception),
-                     "Value passed to parameter 'T' has DataType int32 "
-                     "not in list of allowed values: string, bool")
+    with ops.Graph().as_default():
+      out1, out2, out3 = self._lib.apply_op(
+          "NPolymorphicRestrictOut", N=3, T=dtypes.bool, name="u")
+      self.assertEqual(dtypes.bool, out1.dtype)
+      self.assertEqual(dtypes.bool, out2.dtype)
+      self.assertEqual(dtypes.bool, out3.dtype)
+      self.assertProtoEquals("""
+        name: 'u' op: 'NPolymorphicRestrictOut'
+        attr { key: 'T' value { type: DT_BOOL } }
+        attr { key: 'N' value { i: 3 } }
+        """, out1.op.node_def)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("NPolymorphicRestrictOut", N=2, T=dtypes.int32)
+      self.assertEqual(str(cm.exception),
+                       "Value passed to parameter 'T' has DataType int32 "
+                       "not in list of allowed values: string, bool")
 
   def testRef(self):
-    self._add_op("name: 'RefIn' "
-                 "input_arg { name: 'a' type_attr: 'T' is_ref: true } "
-                 "attr { name: 'T' type: 'type' } ")
-    self._add_op("name: 'TwoRefsIn' "
-                 "input_arg { name: 'a' type_attr: 'T' is_ref: true } "
-                 "input_arg { name: 'b' type_attr: 'T' is_ref: true } "
-                 "attr { name: 'T' type: 'type' } ")
-    self._add_op("name: 'RefOut' "
-                 "output_arg { name: 'a' type_attr: 'T' is_ref: true } "
-                 "attr { name: 'T' type: 'type' } ")
-
-    out = self._lib.apply_op("RefOut", T=dtypes.bool, name="o")
-    self.assertEqual(dtypes.bool_ref, out.dtype)
-    self.assertProtoEquals("""
-      name: 'o' op: 'RefOut'
-      attr { key: 'T' value { type: DT_BOOL } }
-      """, out.op.node_def)
-
-    op = self._lib.apply_op("RefIn", a=out, name="i")
-    self.assertProtoEquals("""
-      name: 'i' op: 'RefIn' input: 'o'
-      attr { key: 'T' value { type: DT_BOOL } }
-      attr { key: "_class" value { list { s: "loc:@o" } } }
-      """, op.node_def)
-
-    # Can pass ref to non-ref input.
-    out = self._lib.apply_op("RefOut", T=dtypes.int32, name="r")
-    out = self._lib.apply_op("Simple", a=out, name="s")
-    self.assertProtoEquals("""
-      name: 's' op: 'Simple' input: 'r'
-      """, out.op.node_def)
-
-    # Can't pass non-ref to ref input.
-    with self.assertRaises(TypeError) as cm:
-      self._lib.apply_op("RefIn", a=2)
-    self.assertEqual(str(cm.exception),
-                     "'RefIn' Op requires that input 'a' be a mutable tensor " +
-                     "(e.g.: a tf.Variable)")
-
-    input_a = self._lib.apply_op("RefOut", T=dtypes.int32, name="t")
-    input_b = self._lib.apply_op("RefOut", T=dtypes.int32, name="u")
-    op = self._lib.apply_op("TwoRefsIn", a=input_a, b=input_b, name="v")
-    # NOTE(mrry): The order of colocation constraints is an implementation
-    # detail.
-    self.assertProtoEquals("""
-      name: 'v' op: 'TwoRefsIn' input: 't' input: 'u'
-      attr { key: 'T' value { type: DT_INT32 } }
-      attr { key: "_class" value { list { s: "loc:@t" s: "loc:@u" } } }
-      """, op.node_def)
+    with ops.Graph().as_default():
+      out = self._lib.apply_op("RefOut", T=dtypes.bool, name="o")
+      self.assertEqual(dtypes.bool_ref, out.dtype)
+      self.assertProtoEquals("""
+        name: 'o' op: 'RefOut'
+        attr { key: 'T' value { type: DT_BOOL } }
+        """, out.op.node_def)
+
+      op = self._lib.apply_op("RefIn", a=out, name="i")
+      self.assertProtoEquals("""
+        name: 'i' op: 'RefIn' input: 'o'
+        attr { key: 'T' value { type: DT_BOOL } }
+        attr { key: "_class" value { list { s: "loc:@o" } } }
+        """, op.node_def)
+
+      # Can pass ref to non-ref input.
+      out = self._lib.apply_op("RefOut", T=dtypes.int32, name="r")
+      out = self._lib.apply_op("Simple", a=out, name="s")
+      self.assertProtoEquals("""
+        name: 's' op: 'Simple' input: 'r'
+        """, out.op.node_def)
+
+      # Can't pass non-ref to ref input.
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("RefIn", a=2)
+      self.assertEqual(
+          str(cm.exception),
+          "'RefIn' Op requires that input 'a' be a mutable tensor " +
+          "(e.g.: a tf.Variable)")
+
+      input_a = self._lib.apply_op("RefOut", T=dtypes.int32, name="t")
+      input_b = self._lib.apply_op("RefOut", T=dtypes.int32, name="u")
+      op = self._lib.apply_op("TwoRefsIn", a=input_a, b=input_b, name="v")
+      # NOTE(mrry): The order of colocation constraints is an implementation
+      # detail.
+      self.assertProtoEquals("""
+        name: 'v' op: 'TwoRefsIn' input: 't' input: 'u'
+        attr { key: 'T' value { type: DT_INT32 } }
+        attr { key: "_class" value { list { s: "loc:@t" s: "loc:@u" } } }
+        """, op.node_def)
 
   def testSpecifyDevice(self):
-    with self._g.device("/job:ADevice"):
-      self._lib.apply_op("Simple", a=3)
-    # We look at the whole graph here to make sure the Const op is also given
-    # the specified device.
-    graph_def = self._g.as_graph_def()
-    self.assertEqual(len(graph_def.node), 2)
-    for node in graph_def.node:
-      self.assertDeviceEqual(node.device, "/job:ADevice")
+    graph = ops.Graph()
+    with graph.as_default():
+      with graph.device("/job:ADevice"):
+        self._lib.apply_op("Simple", a=3)
+      # We look at the whole graph here to make sure the Const op is also given
+      # the specified device.
+      graph_def = graph.as_graph_def()
+      self.assertEqual(len(graph_def.node), 2)
+      for node in graph_def.node:
+        self.assertDeviceEqual(node.device, "/job:ADevice")
 
   def testStructuredOutputSingleList(self):
-    self._add_op("name: 'SimpleStruct' "
-                 "output_arg { name: 'a' type: DT_INT32 number_attr: 'n_a' } "
-                 "attr { name: 'n_a' type: 'int' }")
-    for n_a in [0, 1, 3]:
-      a = self._lib.apply_op("SimpleStruct", n_a=n_a)
-      self.assertTrue(isinstance(a, list))
-      self.assertEqual(n_a, len(a))
+    with ops.Graph().as_default():
+      for n_a in [0, 1, 3]:
+        a = self._lib.apply_op("SimpleStruct", n_a=n_a)
+        self.assertTrue(isinstance(a, list))
+        self.assertEqual(n_a, len(a))
 
   def testStructuredOutputListAndSingle(self):
-    self._add_op("name: 'MixedStruct' "
-                 "output_arg { name: 'a' type: DT_INT32 number_attr: 'n_a' } "
-                 "output_arg { name: 'b' type: DT_FLOAT } "
-                 "attr { name: 'n_a' type: 'int' }")
-    for n_a in [0, 1, 3]:
-      a, b = self._lib.apply_op("MixedStruct", n_a=n_a)
-      self.assertTrue(isinstance(a, list))
-      self.assertEqual(n_a, len(a))
-      self.assertTrue(all(x.dtype == dtypes.int32 for x in a))
-      self.assertTrue(isinstance(b, ops.Tensor))
-      self.assertEqual(dtypes.float32, b.dtype)
+    with ops.Graph().as_default():
+      for n_a in [0, 1, 3]:
+        a, b = self._lib.apply_op("MixedStruct", n_a=n_a)
+        self.assertTrue(isinstance(a, list))
+        self.assertEqual(n_a, len(a))
+        self.assertTrue(all(x.dtype == dtypes.int32 for x in a))
+        self.assertTrue(isinstance(b, ops.Tensor))
+        self.assertEqual(dtypes.float32, b.dtype)
 
   def testStructuredOutputMultipleLists(self):
-    self._add_op("name: 'ComplexStruct' "
-                 "output_arg { name: 'a' type: DT_INT32 number_attr: 'n_a' } "
-                 "output_arg { name: 'b' type: DT_INT64 number_attr: 'n_b' } "
-                 "output_arg { name: 'c' type_list_attr: 't_c' } "
-                 "attr { name: 'n_a' type: 'int' } "
-                 "attr { name: 'n_b' type: 'int' } "
-                 "attr { name: 't_c' type: 'list(type)' }")
-    for n_a in [0, 1, 3]:
-      for n_b in [0, 1, 3]:
-        for t_c in [[],
-                    [dtypes.int32],
-                    [dtypes.int32, dtypes.float32]]:
-          a, b, c = self._lib.apply_op("ComplexStruct",
-                                       n_a=n_a,
-                                       n_b=n_b,
-                                       t_c=t_c)
-
-          self.assertEqual(n_a, len(a))
-          self.assertTrue(all(x.dtype == dtypes.int32 for x in a))
-          self.assertEqual(n_b, len(b))
-          self.assertTrue(all(x.dtype == dtypes.int64 for x in b))
-          self.assertEqual(t_c, [x.dtype for x in c])
-
-
+    with ops.Graph().as_default():
+      for n_a in [0, 1, 3]:
+        for n_b in [0, 1, 3]:
+          for t_c in [[],
+                      [dtypes.int32],
+                      [dtypes.int32, dtypes.float32]]:
+            a, b, c = self._lib.apply_op("ComplexStruct",
+                                         n_a=n_a,
+                                         n_b=n_b,
+                                         t_c=t_c)
+
+            self.assertEqual(n_a, len(a))
+            self.assertTrue(all(x.dtype == dtypes.int32 for x in a))
+            self.assertEqual(n_b, len(b))
+            self.assertTrue(all(x.dtype == dtypes.int64 for x in b))
+            self.assertEqual(t_c, [x.dtype for x in c])
+
+
+@test_util.with_c_api
 class OpDefLibraryGraphTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
-    self._lib = OpDefLibrary()
-    self._g = ops.Graph()
-    self._add_op("name: 'Simple' input_arg { name: 'a' type: DT_INT32 } "
-                 "output_arg { name: 'out' type: DT_FLOAT }")
-    self._add_op("name: 'Binary' "
-                 "input_arg { name: 'a' type_attr: 'T' } "
-                 "input_arg { name: 'b' type_attr: 'T' } "
-                 "output_arg { name: 'out' type_attr: 'T' } "
-                 "attr { name: 'T' type: 'type' }")
-
-  def _add_op(self, ascii):
+    self._lib = test_ops._op_def_lib
+
+  def _add_op(self, ascii):  # pylint: disable=redefined-builtin
     op_def = op_def_pb2.OpDef()
     text_format.Merge(ascii, op_def)
     self._lib.add_op(op_def)
@@ -1556,15 +1346,15 @@ class OpDefLibraryGraphTest(test_util.TensorFlowTestCase):
     self.assertEqual(out.graph, ops.get_default_graph())
 
   def testDefaultGraph(self):
-    with self._g.as_default():
+    graph = ops.Graph()
+    with graph.as_default():
       out = self._lib.apply_op("Simple", a=3)
-      self.assertEqual(out.graph, self._g)
+      self.assertEqual(out.graph, graph)
 
   def testDifferentGraphFails(self):
-    with self._g.as_default():
+    with ops.Graph().as_default():
       a = self._lib.apply_op("Simple", a=3)
-    other_g = ops.Graph()
-    with other_g.as_default():
+    with ops.Graph().as_default():
       b = self._lib.apply_op("Simple", a=4)
     with self.assertRaises(ValueError) as cm:
       self._lib.apply_op("Binary", a=a, b=b)
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 60df8f82f0dcfb011a98802d358b2644727d7a00..398b3f67e20660dc23f8fb339774ad0e3b2eff9d 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import collections
 import copy
 import linecache
+import os
 import re
 import sys
 import threading
@@ -35,6 +36,7 @@ from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
 from tensorflow.core.framework import op_def_pb2
 from tensorflow.core.framework import versions_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow as c_api
 from tensorflow.python.eager import context
 from tensorflow.python.eager import core
@@ -47,28 +49,20 @@ from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import registry
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import versions
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.platform import app
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import decorator_utils
 from tensorflow.python.util import tf_contextlib
+from tensorflow.python.util.tf_export import tf_export
+
 
 # Temporary global switch determining if we should enable the work-in-progress
 # calls to the C API. Currently disabled by default but can be manually enabled
-# e.g. in tests. This will be removed once all functionality is supported and
-# there's no performance penalty with it enabled.
-#
-# TODO(skyewm) before we can remove this:
-# - functions
-# - import_graph_def() incrementally adds inputs to ops (i.e. creates an
-#   Operation and then calls _add_input()). The current code requires that all
-#   inputs be specified when creating the Operation (since we call
-#   TF_FinishOperation()).
-# - ops_test.py (and others?) create unregistered op types
-# - while loop
-# - performance (e.g. delete/refactor redundant Python functionality, switch to
-#   new session API)
-_USE_C_API = False
+# in code or via the environment variable. This will be removed once all
+# functionality is supported and there's no performance penalty with it enabled.
+_USE_C_API = os.getenv("TF_C_API_GRAPH_CONSTRUCTION", "0") is not "0"
 
 
 def tensor_id(tensor):
@@ -198,6 +192,7 @@ class _TensorLike(object):
   pass
 
 
+@tf_export("Tensor")
 class Tensor(_TensorLike):
   """Represents one of the outputs of an `Operation`.
 
@@ -292,7 +287,7 @@ class Tensor(_TensorLike):
     self._op = op
     self._value_index = value_index
     self._dtype = dtypes.as_dtype(dtype)
-    self._shape = tensor_shape.unknown_shape()
+    self._shape_val = tensor_shape.unknown_shape()
     # List of operations that use this Tensor as input.  We maintain this list
     # to easily navigate a computation graph.
     self._consumers = []
@@ -373,7 +368,31 @@ class Tensor(_TensorLike):
       A `TensorShape` representing the shape of this tensor.
 
     """
-    return self._shape
+    if _USE_C_API:
+      graph = self._op._graph._c_graph  # pylint: disable=protected-access
+      with errors.raise_exception_on_not_ok_status() as status:
+        num_dims = c_api.TF_GraphGetTensorNumDims(graph, self._as_tf_output(),
+                                                  status)
+      if num_dims == -1:
+        dim_list = None
+      else:
+        with errors.raise_exception_on_not_ok_status() as status:
+          dim_list = c_api.TF_GraphGetTensorShape_wrapper(
+              graph, self._as_tf_output(), num_dims, status)
+        dim_list = [None if i == -1 else i for i in dim_list]
+      return tensor_shape.TensorShape(dim_list)
+    return self._shape_val
+
+  @property
+  def _shape(self):
+    logging.warning("Tensor._shape is private, use Tensor.shape "
+                    "instead. Tensor._shape will eventually be removed.")
+    return self.shape
+
+  @_shape.setter
+  def _shape(self, value):
+    raise ValueError(
+        "Tensor._shape cannot be assigned, use Tensor.set_shape instead.")
 
   def __iter__(self):
     if context.in_graph_mode():
@@ -392,8 +411,8 @@ class Tensor(_TensorLike):
       yield self[i]
 
   def _shape_as_list(self):
-    if self._shape.ndims is not None:
-      return [dim.value for dim in self._shape.dims]
+    if self.shape.ndims is not None:
+      return [dim.value for dim in self.shape.dims]
     else:
       return None
 
@@ -409,7 +428,7 @@ class Tensor(_TensorLike):
     Returns:
       Integer rank or None
     """
-    return self._shape.ndims
+    return self.shape.ndims
 
   def get_shape(self):
     """Alias of Tensor.shape."""
@@ -440,14 +459,39 @@ class Tensor(_TensorLike):
     ```
 
     Args:
-      shape: A `TensorShape` representing the shape of this tensor.
+      shape: A `TensorShape` representing the shape of this tensor, a
+      `TensorShapeProto`, a list, a tuple, or None.
 
     Raises:
       ValueError: If `shape` is not compatible with the current shape of
         this tensor.
     """
-    # TODO(skyewm): call C API
-    self._shape = self._shape.merge_with(shape)
+    if not _USE_C_API:
+      self._shape_val = self._shape_val.merge_with(shape)
+      return
+    if not isinstance(shape, tensor_shape.TensorShape):
+      shape = tensor_shape.TensorShape(shape)
+    dim_list = []
+    if shape.dims is None:
+      unknown_shape = True
+    else:
+      unknown_shape = False
+      for dim in shape.dims:
+        if dim.value is None:
+          dim_list.append(-1)
+        else:
+          dim_list.append(dim.value)
+    try:
+      with errors.raise_exception_on_not_ok_status() as status:
+        c_api.TF_GraphSetTensorShape_wrapper(
+            self._op._graph._c_graph,  # pylint: disable=protected-access
+            self._as_tf_output(),
+            dim_list,
+            unknown_shape,
+            status)
+    except errors.InvalidArgumentError as e:
+      # Convert to ValueError for backwards compatibility.
+      raise ValueError(str(e))
 
   @property
   def value_index(self):
@@ -460,7 +504,17 @@ class Tensor(_TensorLike):
     Returns:
       A list of `Operation`s.
     """
-    return self._consumers
+    if self._op._c_op:  # pylint: disable=protected-access
+      consumer_names = c_api.TF_OperationOutputConsumers_wrapper(
+          self._as_tf_output())
+      # pylint: disable=protected-access
+      return [
+          self.graph._get_operation_by_name_unsafe(name)
+          for name in consumer_names
+      ]
+      # pylint: enable=protected-access
+    else:
+      return self._consumers
 
   def _add_consumer(self, consumer):
     """Add a consumer to this tensor.
@@ -471,6 +525,9 @@ class Tensor(_TensorLike):
     Raises:
       TypeError: if the consumer is not an Operation.
     """
+    # pylint: disable=protected-access
+    assert not self._op._c_op, "Tensor._add_consumer doesn't work with C API"
+    # pylint: enable=protected-access
     if not isinstance(consumer, Operation):
       raise TypeError("Consumer must be an Operation: %s" % consumer)
     self._consumers.append(consumer)
@@ -598,11 +655,6 @@ class Tensor(_TensorLike):
     """
     return _eval_using_default_session(self, feed_dict, self.graph, session)
 
-  def _dup(self):
-    ret = copy.copy(self)
-    ret._id = uid()  # pylint: disable=protected-access
-    return ret
-
 
 # TODO(agarwal): consider getting rid of this.
 class _EagerTensorBase(Tensor):
@@ -641,8 +693,8 @@ class _EagerTensorBase(Tensor):
   def __float__(self):
     return float(self.numpy())
 
-  def __array__(self):
-    return np.array(self.numpy())
+  def __array__(self, dtype=None):
+    return np.array(self.numpy(), dtype=dtype)
 
   def __format__(self, format_spec):
     return self.numpy().__format__(format_spec)
@@ -728,9 +780,6 @@ class _EagerTensorBase(Tensor):
     return new_tensor
     # pylint: enable=protected-access
 
-  def _dup(self):
-    return self._copy(device_name=self.device)
-
   @property
   def shape(self):
     return tensor_shape.TensorShape(self._shape_tuple())
@@ -743,6 +792,11 @@ class _EagerTensorBase(Tensor):
     """The shape of the tensor as a list."""
     return list(self._shape_tuple())
 
+  @property
+  def ndim(self):
+    """Returns the number of Tensor dimensions."""
+    return self.shape.ndims
+
   def cpu(self):
     """A copy of this Tensor with contents backed by host memory."""
     return self._copy(context.context(), "CPU:0")
@@ -834,6 +888,7 @@ _tensor_conversion_func_lock = threading.Lock()
 register_dense_tensor_like_type(Tensor)
 
 
+@tf_export("convert_to_tensor")
 def convert_to_tensor(value, dtype=None, name=None, preferred_dtype=None):
   """Converts the given `value` to a `Tensor`.
 
@@ -938,7 +993,7 @@ def internal_convert_to_tensor(value,
     # Fast path for EagerTensors that don't need any conversion.
     if isinstance(value, EagerTensor):
       # Note that we don't check that value's dtype matches the dtype
-      # argument.  We exepct that the C runtime will do that checking
+      # argument.  We expect that the C runtime will do that checking
       # when we execute the kernel.
       return value
 
@@ -1079,6 +1134,7 @@ def convert_n_to_tensor(values, dtype=None, name=None, preferred_dtype=None):
       as_ref=False)
 
 
+@tf_export("convert_to_tensor_or_indexed_slices")
 def convert_to_tensor_or_indexed_slices(value, dtype=None, name=None):
   """Converts the given object to a `Tensor` or an `IndexedSlices`.
 
@@ -1209,6 +1265,7 @@ def convert_n_to_tensor_or_indexed_slices(values, dtype=None, name=None):
 
 
 # TODO(josh11b): Add ctx argument to conversion_func() signature.
+@tf_export("register_tensor_conversion_function")
 def register_tensor_conversion_function(base_type,
                                         conversion_func,
                                         priority=100):
@@ -1269,6 +1326,7 @@ def register_tensor_conversion_function(base_type,
     _tensor_conversion_func_cache = {}
 
 
+@tf_export("IndexedSlices")
 class IndexedSlices(_TensorLike):
   """A sparse representation of a set of tensor slices at given indices.
 
@@ -1449,6 +1507,7 @@ def _create_c_op(graph, node_def, inputs, control_inputs):
   return c_op
 
 
+@tf_export("Operation")
 class Operation(object):
   """Represents a graph node that performs computation on tensors.
 
@@ -1520,7 +1579,7 @@ class Operation(object):
     # an Operation for that op. This is useful for creating Operations for ops
     # indirectly created by C API methods, e.g. the ops created by
     # TF_ImportGraphDef. When `node_def` is a TF_Operation, all optional fields
-    # except `control_inputs` should be None.
+    # should be None.
 
     if isinstance(node_def, node_def_pb2.NodeDef):
       if node_def.ByteSize() >= (1 << 31) or node_def.ByteSize() < 0:
@@ -1528,15 +1587,14 @@ class Operation(object):
             "Cannot create a tensor proto whose content is larger than 2GB.")
       if not _VALID_OP_NAME_REGEX.match(node_def.name):
         raise ValueError("'%s' is not a valid node name" % node_def.name)
-      self._node_def = copy.deepcopy(node_def)
       c_op = None
     elif type(node_def).__name__ == "SwigPyObject":
       assert inputs is None
       assert output_types is None
+      assert control_inputs is None
       assert input_types is None
       assert original_op is None
       assert op_def is None
-      self._node_def = None
       c_op = node_def
     else:
       raise TypeError("node_def needs to be a NodeDef: %s" % node_def)
@@ -1544,28 +1602,27 @@ class Operation(object):
     if not isinstance(g, Graph):
       raise TypeError("g needs to be a Graph: %s" % g)
     self._graph = g
+
     if inputs is None:
       inputs = []
     elif not isinstance(inputs, list):
       raise TypeError("inputs needs to be a list of Tensors: %s" % inputs)
-    self._inputs = list(inputs)  # Defensive copy.
-    for a in self._inputs:
+    for a in inputs:
       if not isinstance(a, Tensor):
         raise TypeError("input needs to be a Tensor: %s" % a)
     if input_types is None:
-      input_types = [i.dtype.base_dtype for i in self._inputs]
+      input_types = [i.dtype.base_dtype for i in inputs]
     else:
       if not all(
           x.is_compatible_with(i.dtype)
-          for i, x in zip(self._inputs, input_types)):
+          for i, x in zip(inputs, input_types)):
         raise TypeError("In op '%s', input types (%s) are not compatible "
                         "with expected types (%s)" %
-                        (self.node_def.name, [i.dtype for i in self._inputs],
+                        (node_def.name, [i.dtype for i in inputs],
                          input_types))
-    self._input_types_val = input_types
 
     # Build the list of control inputs.
-    self._control_inputs = []
+    control_input_ops = []
     if control_inputs:
       for c in control_inputs:
         control_op = None
@@ -1576,36 +1633,45 @@ class Operation(object):
         else:
           raise TypeError("Control input must be an Operation, "
                           "a Tensor, or IndexedSlices: %s" % c)
-        self._control_inputs.append(control_op)
+        control_input_ops.append(control_op)
+
+    # Don't set private fields with C API enabled to catch users who need to
+    # switch to public API.
+    # TODO(skyewm): delete these fields once we remove _USE_C_API
+    if not self._graph._c_graph:
+      self._inputs_val = list(inputs)  # Defensive copy.
+      self._input_types_val = input_types
+      self._control_inputs_val = control_input_ops
+      self._node_def_val = copy.deepcopy(node_def)
+      self._op_def_val = op_def
 
+    self._id_value = self._graph._next_id()  # pylint: disable=protected-access
     self._original_op = original_op
-    self._op_def = op_def
     self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access
+    self._control_flow_context = self.graph._get_control_flow_context()  # pylint: disable=protected-access
 
     # Initialize self._c_op.
     if c_op:
       # TODO(skyewm): remove this assert when we remove USE_C_API
       assert self._graph._c_graph  # pylint: disable=protected-access
       self._c_op = c_op
-      self._add_control_inputs(self._control_inputs)
     elif self._graph._c_graph:  # pylint: disable=protected-access
-      if self._op_def:
-        # TODO(skyewm): op_def_library.apply_op() flattens the incoming
-        # inputs. Refactor so we don't have to do this here.
-        grouped_inputs = self._reconstruct_sequence_inputs(
-            self._op_def, self._inputs, self._node_def.attr)
-      else:
-        # If no OpDef is specified, assume all inputs are scalar.
-        grouped_inputs = self._inputs
-
-      self._c_op = _create_c_op(self._graph, self._node_def, grouped_inputs,
-                                self._control_inputs)
+      if op_def is None:
+        op_def = self._graph._get_op_def(node_def.op)
+      # TODO(skyewm): op_def_library.apply_op() flattens the incoming inputs.
+      # Refactor so we don't have to do this here.
+      grouped_inputs = self._reconstruct_sequence_inputs(
+          op_def, inputs, node_def.attr)
+      self._c_op = _create_c_op(self._graph, node_def, grouped_inputs,
+                                control_input_ops)
     else:
       self._c_op = None
 
-    # Mark that we consume the inputs.
-    for input_tensor in self.inputs:
-      input_tensor._add_consumer(self)  # pylint: disable=protected-access
+    # Mark that we consume the inputs. This is unnecessary and unsupported with
+    # the C API enabled, since the C API tracks the tensor consumers instead.
+    if not self._c_op:
+      for input_tensor in self._inputs_val:
+        input_tensor._add_consumer(self)  # pylint: disable=protected-access
 
     # Initialize self._outputs.
     if self._c_op:
@@ -1622,20 +1688,21 @@ class Operation(object):
         for i, output_type in enumerate(output_types)
     ]
 
-    # Add this op to the current control flow context.
-    self._control_flow_context = g._get_control_flow_context()  # pylint: disable=protected-access
+    if not c_op:
+      self._control_flow_post_processing()
+
+  def _control_flow_post_processing(self):
+    """Add this op to its control flow context.
+
+    This may add new ops and change this op's inputs. self.inputs must be
+    available before calling this method.
+    """
+    for input_tensor in self.inputs:
+      control_flow_util.CheckInputFromValidContext(self, input_tensor.op)
     if self._control_flow_context is not None:
       self._control_flow_context.AddOp(self)
-    # NOTE(keveman): Control flow context's AddOp could be creating new ops and
-    # setting op.inputs[index] = new_op. Thus the new ops' id could be larger
-    # than this op's id even though this op depend on them. Therefore, delaying
-    # assigning id to this op until all ops this could be dependent on are
-    # created.
-    self._id_value = self._graph._next_id()  # pylint: disable=protected-access
     self._recompute_node_def()
 
-    self._graph._add_op(self)  # pylint: disable=protected-access
-
   def _reconstruct_sequence_inputs(self, op_def, inputs, attrs):
     """Regroups a flat list of input tensors into scalar and sequence inputs.
 
@@ -1718,7 +1785,7 @@ class Operation(object):
     if self._c_op:
       return c_api.TF_OperationName(self._c_op)
     else:
-      return self._node_def.name
+      return self._node_def_val.name
 
   @property
   def _id(self):
@@ -1737,7 +1804,7 @@ class Operation(object):
     if self._c_op:
       return c_api.TF_OperationDevice(self._c_op)
     else:
-      return self._node_def.device
+      return self._node_def_val.device
 
   @property
   def _output_types(self):
@@ -1795,9 +1862,9 @@ class Operation(object):
       c_api.SetRequestedDevice(
           self._graph._c_graph,  # pylint: disable=protected-access
           self._c_op,  # pylint: disable=protected-access
-          _device_string(device))
+          compat.as_str(_device_string(device)))
     else:
-      self._node_def.device = _device_string(device)
+      self._node_def_val.device = _device_string(device)
 
   def _add_input(self, tensor, dtype=None):
     """Add a new input to this operation.
@@ -1825,7 +1892,7 @@ class Operation(object):
         raise TypeError(
             "Cannot convert a tensor of type %s to an input of type %s" %
             (tensor.dtype.name, dtype.name))
-    self._inputs.append(tensor)
+    self._inputs_val.append(tensor)
     self._input_types_val.append(dtype)
     tensor._add_consumer(self)  # pylint: disable=protected-access
     self._recompute_node_def()
@@ -1855,8 +1922,8 @@ class Operation(object):
             self._tf_input(index),
             status)
     else:
-      self._inputs[index].consumers().remove(self)
-      self._inputs[index] = tensor
+      self._inputs_val[index].consumers().remove(self)
+      self._inputs_val[index] = tensor
       self._input_types_val[index] = tensor.dtype
       tensor._add_consumer(self)  # pylint: disable=protected-access
       self._recompute_node_def()
@@ -1882,7 +1949,7 @@ class Operation(object):
           if not isinstance(op, Operation):
             raise TypeError("op must be an Operation: %s" % op)
           _assert_same_graph(self, op)
-          self._control_inputs.append(op)
+          self._control_inputs_val.append(op)
         self._recompute_node_def()
 
   def _add_control_input(self, op):
@@ -1902,18 +1969,26 @@ class Operation(object):
     else:
       self._add_control_inputs([op])
 
+  def _remove_all_control_inputs(self):
+    """Removes any control inputs to this operation."""
+    if self._c_op:
+      c_api.RemoveAllControlInputs(self._graph._c_graph, self._c_op)  # pylint: disable=protected-access
+    else:
+      del self.control_inputs[:]
+
   # Methods below are used when building the NodeDef and Graph proto.
   def _recompute_node_def(self):
     # TODO(skyewm): remove this function when we switch to C API
     if self._c_op: return
 
-    del self._node_def.input[:]
+    del self._node_def_val.input[:]
     # pylint: disable=protected-access
-    self._node_def.input.extend([t._as_node_def_input() for t in self._inputs])
+    self._node_def_val.input.extend(
+        [t._as_node_def_input() for t in self._inputs_val])
     # pylint: enable=protected-access
-    if self._control_inputs:
-      self._node_def.input.extend(
-          ["^%s" % op.name for op in self._control_inputs])
+    if self._control_inputs_val:
+      self._node_def_val.input.extend(
+          ["^%s" % op.name for op in self._control_inputs_val])
 
   def __str__(self):
     return str(self.node_def)
@@ -1931,23 +2006,23 @@ class Operation(object):
   class _InputList(object):
     """Immutable input list wrapper."""
 
-    def __init__(self, op):
-      self._op = op
+    def __init__(self, inputs):
+      self._inputs = inputs
 
     def __iter__(self):
-      return iter(self._op._inputs)
+      return iter(self._inputs)
 
     def __len__(self):
-      return len(self._op._inputs)
+      return len(self._inputs)
 
     def __bool__(self):
-      return bool(self._op._inputs)
+      return bool(self._inputs)
 
     # Python 3 wants __bool__, Python 2.7 wants __nonzero__
     __nonzero__ = __bool__
 
     def __getitem__(self, i):
-      return self._op._inputs[i]
+      return self._inputs[i]
 
 # pylint: enable=protected-access
 
@@ -1956,17 +2031,24 @@ class Operation(object):
     """The list of `Tensor` objects representing the data inputs of this op."""
     if self._c_op:
       tf_outputs = c_api.GetOperationInputs(self._c_op)
-      # TODO(skyewm): return Operation._InputList
       # pylint: disable=protected-access
-      return [self.graph._get_tensor_by_tf_output(tf_output)
-              for tf_output in tf_outputs]
+      retval = [
+          self.graph._get_tensor_by_tf_output(tf_output)
+          for tf_output in tf_outputs
+      ]
       # pylint: enable=protected-access
-    else:
-      return Operation._InputList(self)
+      return Operation._InputList(retval)
+    return Operation._InputList(self._inputs_val)
 
   @property
-  def _input_dtypes(self):
-    return self._input_types
+  def _inputs(self):
+    logging.warning("Operation._inputs is private, use Operation.inputs "
+                    "instead. Operation._inputs will eventually be removed.")
+    return self.inputs
+
+  @_inputs.setter
+  def _inputs(self, value):
+    raise ValueError("Cannot assign _inputs")
 
   @property
   def _input_types(self):
@@ -1980,6 +2062,10 @@ class Operation(object):
     else:
       return self._input_types_val
 
+  @_input_types.setter
+  def _input_types(self, value):
+    raise ValueError("Cannot assign _input_types")
+
   @property
   def control_inputs(self):
     """The `Operation` objects on which this op has a control dependency.
@@ -2003,7 +2089,26 @@ class Operation(object):
       ]
       # pylint: enable=protected-access
     else:
-      return self._control_inputs
+      return self._control_inputs_val
+
+  @property
+  def _control_inputs(self):
+    logging.warning("Operation._control_inputs is private, use "
+                    "Operation.control_inputs instead. "
+                    "Operation._control_inputs will eventually be removed.")
+    return self.control_inputs
+
+  @_control_inputs.setter
+  def _control_inputs(self, value):
+    logging.warning("Operation._control_inputs is private, use "
+                    "Operation.control_inputs instead. "
+                    "Operation._control_inputs will eventually be removed.")
+    # Copy value because it may be self._control_inputs_val (in particular if
+    # this is called from self._control_inputs += ...), and we don't want to
+    # clear value below.
+    value = copy.copy(value)
+    self._remove_all_control_inputs()
+    self._add_control_inputs(value)
 
   @property
   def type(self):
@@ -2012,7 +2117,7 @@ class Operation(object):
       op_type = c_api.TF_OperationOpType(self._c_op)
       return op_type
     else:
-      return self._node_def.op
+      return self._node_def_val.op
 
   @property
   def graph(self):
@@ -2039,7 +2144,13 @@ class Operation(object):
       node_def.ParseFromString(compat.as_bytes(data))
       return node_def
     else:
-      return self._node_def
+      return self._node_def_val
+
+  @property
+  def _node_def(self):
+    logging.warning("Operation._node_def is private, use Operation.node_def "
+                    "instead. Operation._node_def will eventually be removed.")
+    return self.node_def
 
   @property
   def op_def(self):
@@ -2053,18 +2164,15 @@ class Operation(object):
     """
     # pylint: enable=line-too-long
     if self._c_op:
-      with c_api_util.tf_buffer() as buf:
-        with errors.raise_exception_on_not_ok_status() as status:
-          # pylint: disable=protected-access
-          c_api.TF_GraphGetOpDef(self._graph._c_graph,
-                                 compat.as_bytes(self.type), buf, status)
-          # pylint: enable=protected-access
-        data = c_api.TF_GetBuffer(buf)
-      op_def = op_def_pb2.OpDef()
-      op_def.ParseFromString(compat.as_bytes(data))
-      return op_def
+      return self._graph._get_op_def(self.type)
     else:
-      return self._op_def
+      return self._op_def_val
+
+  @property
+  def _op_def(self):
+    logging.warning("Operation._op_def is private, use Operation.op_def "
+                    "instead. Operation._op_def will eventually be removed.")
+    return self.op_def
 
   @property
   def traceback(self):
@@ -2084,7 +2192,7 @@ class Operation(object):
 
   def _set_attr(self, attr_name, attr_value):
     """Private method used to set an attribute in the node_def."""
-    if _USE_C_API:
+    if self._c_op:
       buf = c_api.TF_NewBufferFromString(
           compat.as_bytes(attr_value.SerializeToString()))
       try:
@@ -2096,7 +2204,7 @@ class Operation(object):
       finally:
         c_api.TF_DeleteBuffer(buf)
     else:
-      self._node_def.attr[attr_name].CopyFrom(attr_value)
+      self._node_def_val.attr[attr_name].CopyFrom(attr_value)
 
   def get_attr(self, name):
     """Returns the value of the attr of this op with the given `name`.
@@ -2123,10 +2231,10 @@ class Operation(object):
       x = attr_value_pb2.AttrValue()
       x.ParseFromString(data)
     else:
-      if name not in self._node_def.attr:
+      if name not in self._node_def_val.attr:
         raise ValueError(
-            "No attr named '" + name + "' in " + str(self._node_def))
-      x = self._node_def.attr[name]
+            "No attr named '" + name + "' in " + str(self._node_def_val))
+      x = self._node_def_val.attr[name]
 
     # Treat an empty oneof value as an empty list.
     if not x.WhichOneof("value"):
@@ -2170,6 +2278,7 @@ class Operation(object):
 _gradient_registry = registry.Registry("gradient")
 
 
+@tf_export("RegisterGradient")
 class RegisterGradient(object):
   """A decorator for registering the gradient function for an op type.
 
@@ -2212,6 +2321,7 @@ class RegisterGradient(object):
     return f
 
 
+@tf_export("NoGradient", "NotDifferentiable")
 def NotDifferentiable(op_type):
   """Specifies that ops of type `op_type` is not differentiable.
 
@@ -2531,6 +2641,7 @@ def _name_from_scope_name(name):
   return name[:-1] if (name and name[-1] == "/") else name
 
 
+@tf_export("Graph")
 class Graph(object):
   """A TensorFlow computation, represented as a dataflow graph.
 
@@ -2640,23 +2751,26 @@ class Graph(object):
     self._handle_movers = {}
     # A map from tensor handle to its delete op.
     self._handle_deleters = {}
-    # Resource container.
-    if context.in_graph_mode():
-      self._container_prefix = ""
-    else:
-      # In Eager mode, isolate resources (particularly ResourceVariables) in
-      # Graphs by default. This prevents unintended variable sharing. Graph mode
-      # gets this kind of isolation from Sessions.
-      self._container_prefix = "eager-execution-%d/" % (uid(),)
-    self._container = self._container_prefix
+    # Allow optimizers and other objects to pseudo-uniquely key graphs (this key
+    # will be shared when defining function graphs, for example, so optimizers
+    # being called inside function definitions behave as if they were seeing the
+    # actual outside graph).
+    self._graph_key = "grap-key-%d/" % (uid(),)
+    self._container = ""
     self._registered_ops = op_def_registry.get_registered_ops()
 
     # TODO(skyewm): fold as much of the above as possible into the C
     # implementation
-    if _USE_C_API:
+    if _USE_C_API or self._use_c_api_hack():
       self._scoped_c_graph = c_api_util.ScopedTFGraph()
     else:
       self._scoped_c_graph = None
+    self._variable_creator_stack = []
+
+  # TODO(apassos) remove once the C API is used by default.
+  def _use_c_api_hack(self):
+    """Temporary hack; can be overridden to force C API usage."""
+    return False
 
   def _convert_stack(self, stack, include_func_start_lineno=False):
     """Converts a stack extracted using _extract_stack() to a traceback stack.
@@ -2688,6 +2802,22 @@ class Graph(object):
         ret.append((filename, lineno, name, line))
     return ret
 
+  # Note: this method is private because the API of tf.Graph() is public and
+  # frozen, and this functionality is still not ready for public visibility.
+  @tf_contextlib.contextmanager
+  def _variable_creator_scope(self, creator):
+    old = list(self._variable_creator_stack)
+    self._variable_creator_stack.append(creator)
+    try:
+      yield
+    finally:
+      self._variable_creator_stack = old
+
+  # Note: this method is private because the API of tf.Graph() is public and
+  # frozen, and this functionality is still not ready for public visibility.
+  def _get_variable_creator_stack(self):
+    return list(self._variable_creator_stack)
+
   def _extract_stack(self):
     """A lightweight, extensible re-implementation of traceback.extract_stack.
 
@@ -2858,6 +2988,20 @@ class Graph(object):
     """
     self._control_flow_context = ctx
 
+  def _copy_functions_to_graph_def(self, graph_def, starting_bytesize):
+    """If this graph contains functions, copy them to `graph_def`."""
+    bytesize = starting_bytesize
+    for f in self._functions.values():
+      bytesize += f.definition.ByteSize()
+      if bytesize >= (1 << 31) or bytesize < 0:
+        raise ValueError("GraphDef cannot be larger than 2GB.")
+      graph_def.library.function.extend([f.definition])
+      if f.grad_func_name:
+        grad_def = function_pb2.GradientDef()
+        grad_def.function_name = f.name
+        grad_def.gradient_func = f.grad_func_name
+        graph_def.library.gradient.extend([grad_def])
+
   def _as_graph_def(self, from_version=None, add_shapes=False):
     # pylint: disable=line-too-long
     """Returns a serialized `GraphDef` representation of this graph.
@@ -2886,33 +3030,42 @@ class Graph(object):
 
     """
     # pylint: enable=line-too-long
-    with self._lock:
-      graph = graph_pb2.GraphDef()
-      graph.versions.CopyFrom(self._graph_def_versions)
-      bytesize = 0
-      for op_id in sorted(self._nodes_by_id):
-        op = self._nodes_by_id[op_id]
-        if from_version is None or op_id > from_version:
-          graph.node.extend([op.node_def])
-          if op.outputs and add_shapes:
-            assert "_output_shapes" not in graph.node[-1].attr
-            graph.node[-1].attr["_output_shapes"].list.shape.extend(
-                [output.get_shape().as_proto() for output in op.outputs])
-          bytesize += op.node_def.ByteSize()
-          if bytesize >= (1 << 31) or bytesize < 0:
-            raise ValueError("GraphDef cannot be larger than 2GB.")
-      if self._functions:
-        for f in self._functions.values():
-          bytesize += f.definition.ByteSize()
-          if bytesize >= (1 << 31) or bytesize < 0:
-            raise ValueError("GraphDef cannot be larger than 2GB.")
-          graph.library.function.extend([f.definition])
-          if f.grad_func_name:
-            grad_def = function_pb2.GradientDef()
-            grad_def.function_name = f.name
-            grad_def.gradient_func = f.grad_func_name
-            graph.library.gradient.extend([grad_def])
-      return graph, self._version
+    if _USE_C_API:
+      with self._lock:
+        with c_api_util.tf_buffer() as buf:
+          with errors.raise_exception_on_not_ok_status() as status:
+            c_api.TF_GraphToGraphDef(self._c_graph, buf, status)
+          data = c_api.TF_GetBuffer(buf)
+        graph = graph_pb2.GraphDef()
+        graph.ParseFromString(compat.as_bytes(data))
+        # Strip the experimental library field iff it's empty.
+        if not graph.library.function:
+          graph.ClearField("library")
+
+        if add_shapes:
+          for node in graph.node:
+            op = self._nodes_by_name[node.name]
+            if op.outputs:
+              node.attr["_output_shapes"].list.shape.extend(
+                  [output.get_shape().as_proto() for output in op.outputs])
+    else:
+      with self._lock:
+        graph = graph_pb2.GraphDef()
+        graph.versions.CopyFrom(self._graph_def_versions)
+        bytesize = 0
+        for op_id in sorted(self._nodes_by_id):
+          op = self._nodes_by_id[op_id]
+          if from_version is None or op_id > from_version:
+            graph.node.extend([op.node_def])
+            if op.outputs and add_shapes:
+              assert "_output_shapes" not in graph.node[-1].attr
+              graph.node[-1].attr["_output_shapes"].list.shape.extend(
+                  [output.get_shape().as_proto() for output in op.outputs])
+            bytesize += op.node_def.ByteSize()
+            if bytesize >= (1 << 31) or bytesize < 0:
+              raise ValueError("GraphDef cannot be larger than 2GB.")
+        self._copy_functions_to_graph_def(graph, bytesize)
+    return graph, self._version
 
   def as_graph_def(self, from_version=None, add_shapes=False):
     # pylint: disable=line-too-long
@@ -2986,9 +3139,14 @@ class Graph(object):
     # Add function to graph
     # pylint: disable=protected-access
     if self._c_graph:
-      assert function._c_func, (
-          "Cannot add function created without C API support to graph "
-          "created with C API support")
+      # Handle functions created without using the C API. TODO(apassos,skyewm)
+      # remove this when all functions are generated using the C API by default
+      # as this will be unnecessary.
+      if not function._c_func:
+        with errors.raise_exception_on_not_ok_status() as status:
+          serialized = function.definition.SerializeToString()
+          function._c_func = c_api.TF_FunctionImportFunctionDef(
+              serialized, status)
       with errors.raise_exception_on_not_ok_status() as status:
         gradient = function._grad_func._c_func if function._grad_func else None
         c_api.TF_GraphCopyFunction(self._c_graph, function._c_func, gradient,
@@ -3099,12 +3257,11 @@ class Graph(object):
         input_types=input_types,
         original_op=self._default_original_op,
         op_def=op_def)
-
     self._create_op_helper(ret, compute_shapes=compute_shapes,
                            compute_device=compute_device)
     return ret
 
-  def _create_op_from_tf_operation(self, c_op):
+  def _create_op_from_tf_operation(self, c_op, compute_device=True):
     """Creates an `Operation` in this graph from the supplied TF_Operation.
 
     This method is like create_op() except the new Operation is constructed
@@ -3112,19 +3269,23 @@ class Graph(object):
     field. This is used to create Operation objects around TF_Operations created
     indirectly by the C API (e.g. by TF_ImportGraphDef, TF_FinishWhile).
 
+    This function does not call Operation._control_flow_post_processing or
+    Graph._control_dependencies_for_inputs (since the inputs may not be
+    available yet). The caller is responsible for calling these methods.
+
     Args:
       c_op: a wrapped TF_Operation
+      compute_device: (Optional.) If True, device functions will be executed
+        to compute the device property of the Operation.
 
     Returns:
       An `Operation` object.
     """
     self._check_not_finalized()
-    tf_outputs = c_api.GetOperationInputs(c_op)
-    input_ops = set(self._get_operation_by_tf_operation(output.oper)
-                    for output in tf_outputs)
-    control_inputs = self._control_dependencies_for_inputs(input_ops)
-    ret = Operation(c_op, self, control_inputs=control_inputs)
-    self._create_op_helper(ret)
+    ret = Operation(c_op, self)
+    assert ret.name not in self._names_in_use
+    self._names_in_use[ret.name] = 1
+    self._create_op_helper(ret, compute_device=compute_device)
     return ret
 
   def _create_op_helper(self, op, compute_shapes=True, compute_device=True):
@@ -3138,6 +3299,8 @@ class Graph(object):
     # compute_shapes argument.
     if op._c_op or compute_shapes:  # pylint: disable=protected-access
       set_shapes_for_outputs(op)
+    # TODO(b/XXXX): move to Operation.__init__ once _USE_C_API flag is removed.
+    self._add_op(op)
 
     # Apply any additional attributes requested. Do not overwrite any existing
     # attributes.
@@ -3205,8 +3368,8 @@ class Graph(object):
     # (2) "is_stateful" is set in OpDef
     # (3) "container" attribute is in OpDef
     # (4) "container" attribute is None
-    if (self._container and op.type in self._registered_ops and
-        self._registered_ops[op.type].is_stateful):
+    # TODO(skyewm): remove op.op_def check when _USE_C_API is removed.
+    if self._container and op.op_def and op.op_def.is_stateful:
       try:
         container_attr = op.get_attr("container")
       except ValueError:
@@ -3217,6 +3380,37 @@ class Graph(object):
           op._set_attr("container", attr_value_pb2.AttrValue(  # pylint: disable=protected-access
               s=compat.as_bytes(self._container)))
 
+  def _add_new_tf_operations(self, compute_devices=True):
+    """Creates `Operations` in this graph for any new TF_Operations.
+
+    This is useful for when TF_Operations are indirectly created by the C API
+    outside of the Operation constructor (e.g. by TF_ImportGraphDef,
+    TF_FinishWhile). This ensures there are corresponding Operations for all
+    TF_Operations in the underlying TF_Graph.
+
+    Args:
+      compute_devices: (Optional.) If True, device functions will be executed
+        to compute the device properties of each new Operation.
+
+    Returns:
+      A list of the new `Operation` objects.
+    """
+    # Create all Operation objects before accessing their inputs since an op may
+    # be created before its inputs.
+    new_ops = [
+        self._create_op_from_tf_operation(c_op, compute_device=compute_devices)
+        for c_op in c_api_util.new_tf_operations(self)
+    ]
+
+    for op in new_ops:
+      new_control_inputs = self._control_dependencies_for_inputs(op.inputs)
+      # pylint: disable=protected-access
+      op._add_control_inputs(new_control_inputs)
+      op._control_flow_post_processing()
+      # pylint: enable=protected-access
+
+    return new_ops
+
   def as_graph_element(self, obj, allow_tensor=True, allow_operation=True):
     """Returns the object referred to by `obj`, as an `Operation` or `Tensor`.
 
@@ -3455,6 +3649,22 @@ class Graph(object):
   def _last_id(self):
     return self._next_id_counter
 
+  def _get_op_def(self, type):  # pylint: disable=redefined-builtin
+    """Returns the `OpDef` proto for `type`. `type` is a string."""
+    if self._c_graph:
+      with c_api_util.tf_buffer() as buf:
+        with errors.raise_exception_on_not_ok_status() as status:
+          # pylint: disable=protected-access
+          c_api.TF_GraphGetOpDef(self._c_graph,
+                                 compat.as_bytes(type), buf, status)
+          # pylint: enable=protected-access
+        data = c_api.TF_GetBuffer(buf)
+      op_def = op_def_pb2.OpDef()
+      op_def.ParseFromString(compat.as_bytes(data))
+      return op_def
+    else:
+      return self._registered_ops[type]
+
   def as_default(self):
     """Returns a context manager that makes this `Graph` the default graph.
 
@@ -3727,6 +3937,9 @@ class Graph(object):
         above.
     """
     if name:
+      if isinstance(name, compat.bytes_or_text_types):
+        name = compat.as_str(name)
+
       if self._name_stack:
         # Scopes created in a nested scope may have initial characters
         # that are illegal as the initial character of an op name
@@ -4020,7 +4233,7 @@ class Graph(object):
     """
     original_container = self._container
     try:
-      self._container = self._container_prefix + container_name
+      self._container = container_name
       yield self._container
     finally:
       self._container = original_container
@@ -4054,10 +4267,10 @@ class Graph(object):
       """
       self._graph = graph
       if control_inputs is None:
-        self._control_inputs = []
+        self._control_inputs_val = []
         self._new_stack = True
       else:
-        self._control_inputs = control_inputs
+        self._control_inputs_val = control_inputs
         self._new_stack = False
       self._seen_nodes = set()
       self._old_stack = None
@@ -4085,7 +4298,7 @@ class Graph(object):
 
     @property
     def control_inputs(self):
-      return self._control_inputs
+      return self._control_inputs_val
 
     def add_op(self, op):
       self._seen_nodes.add(op)
@@ -4459,6 +4672,9 @@ class Graph(object):
 
 # TODO(agarwal): currently device directives in an outer eager scope will not
 # apply to inner graph mode code. Fix that.
+
+
+@tf_export("device")
 def device(device_name_or_function):
   """Wrapper for `Graph.device()` using the default graph.
 
@@ -4488,6 +4704,7 @@ def device(device_name_or_function):
     return context.device(device_name_or_function)
 
 
+@tf_export("container")
 def container(container_name):
   """Wrapper for `Graph.container()` using the default graph.
 
@@ -4501,6 +4718,7 @@ def container(container_name):
   return get_default_graph().container(container_name)
 
 
+@tf_export("colocate_with")
 def colocate_with(op, ignore_existing=False):
   if context.in_graph_mode():
     return get_default_graph().colocate_with(op, ignore_existing)
@@ -4511,21 +4729,18 @@ def colocate_with(op, ignore_existing=False):
       return _NullContextmanager()
 
 
+@tf_export("control_dependencies")
 def control_dependencies(control_inputs):
   """Wrapper for `Graph.control_dependencies()` using the default graph.
 
   See @{tf.Graph.control_dependencies}
   for more details.
 
-  When eager execution is enabled, any callable object in the `control_inputs`
-  list will be called.
-
   Args:
     control_inputs: A list of `Operation` or `Tensor` objects which
       must be executed or computed before running the operations
       defined in the context.  Can also be `None` to clear the control
-      dependencies. If eager execution is enabled, any callable object in the
-      `control_inputs` list will be called.
+      dependencies.
 
   Returns:
    A context manager that specifies control dependencies for all
@@ -4534,11 +4749,6 @@ def control_dependencies(control_inputs):
   if context.in_graph_mode():
     return get_default_graph().control_dependencies(control_inputs)
   else:
-    if control_inputs:
-      # Excute any pending callables.
-      for control in control_inputs:
-        if callable(control):
-          control()
     return _NullContextmanager()
 
 
@@ -4637,6 +4847,7 @@ def default_session(session):
   return _default_session_stack.get_controller(session)
 
 
+@tf_export("get_default_session")
 def get_default_session():
   """Returns the default session for the current thread.
 
@@ -4756,10 +4967,85 @@ class _DefaultGraphStack(_DefaultStack):  # pylint: disable=protected-access
     super(_DefaultGraphStack, self).reset()
     self._global_default_graph = None
 
+  @tf_contextlib.contextmanager
+  def get_controller(self, default):
+    try:
+      context.context_stack.push(default.building_function, default.as_default)
+      with super(_DefaultGraphStack, self).get_controller(default) as g:
+        yield g
+    finally:
+      context.context_stack.pop()
+
 
 _default_graph_stack = _DefaultGraphStack()
 
 
+# pylint: disable=g-doc-return-or-yield,line-too-long
+@tf_contextlib.contextmanager
+def init_scope():
+  """A context manager that lifts ops out of control-flow scopes and function-building graphs.
+
+  There is often a need to lift variable initialization ops out of control-flow
+  scopes, function-building graphs, and gradient tapes. Entering an
+  `init_scope` is a mechanism for satisfying these desiderata. In particular,
+  entering an `init_scope` has three effects:
+
+    (1) All control dependencies are cleared the moment the scope is entered;
+        this is equivalent to entering the context manager returned from
+        `control_dependencies(None)`, which has the side-effect of exiting
+        control-flow scopes like `tf.cond` and `tf.while_loop`.
+
+    (2) All operations that are created while the scope is active are lifted
+        into the lowest context on the `context_stack` that is not building a
+        graph function. Here, a context is defined as either a graph or an eager
+        context. Every context switch, i.e., every installation of a graph as
+        the default graph and every switch into eager mode, is logged in a
+        thread-local stack called the `context_stack`; the log entry for a
+        context switch is popped from the stack when the context is exited.
+        Entering an `init_scope` is equivalent to crawling up the
+        `context_stack`, finding the first context that is not building a graph
+        function, and entering it. A caveat is that if graph mode is enabled
+        but the default graph stack is empty, then entering an `init_scope`
+        will simply install a fresh graph as the default one.
+
+    (3) The gradient tape is paused while the scope is active.
+  """
+  # pylint: enable=g-doc-return-or-yield,line-too-long
+
+  in_graph_mode = context.in_graph_mode()
+  # Retrieve the active name scope: entering an `init_scope` preserves
+  # the name scope of the current context.
+  if in_graph_mode:
+    default_graph = get_default_graph()
+    scope = default_graph.get_name_scope()
+  else:
+    scope = context.context().scope_name
+  if scope and scope[-1] != '/':
+    # Names that end with trailing slashes are treated by `name_scope` as
+    # absolute.
+    scope = scope + '/'
+
+  outer_context = None
+  if in_graph_mode and not _default_graph_stack.stack:
+    outer_context = default_graph.as_default
+  else:
+    for stack_entry in reversed(context.context_stack.stack):
+      if not stack_entry.is_building_function:
+        outer_context = stack_entry.enter_context_fn
+        break
+
+  if outer_context is None:
+    raise AssertionError("All graphs are building functions, and no "
+                         "eager context was previously active.")
+
+  try:
+    with outer_context(), name_scope(scope), control_dependencies(
+        None), tape.stop_recording():
+      yield
+  finally:
+    pass
+
+
 def enable_eager_execution(config=None, device_policy=None):
   """Enables, for the rest of the lifetime of this program, eager execution.
 
@@ -4788,12 +5074,24 @@ def enable_eager_execution(config=None, device_policy=None):
          right device but raises a warning.
        tfe.DEVICE_PLACEMENT_SILENT: silently copies the tensors. This might
          hide performance problems.
+       tfe.DEVICE_PLACEMENT_SILENT_FOR_INT32: silently copies int32 tensors,
+         raising errors on the other ones.
 
   Raises:
     ValueError: If trying to create a context after using graph operations
      or if trying to create a context with nontrivial options which differ
      from those of the existing context.
   """
+  if config is not None and not isinstance(config, config_pb2.ConfigProto):
+    raise TypeError(
+        "config must be a tf.ConfigProto, but got %s" % type(config))
+  if device_policy not in (None, context.DEVICE_PLACEMENT_EXPLICIT,
+                           context.DEVICE_PLACEMENT_WARN,
+                           context.DEVICE_PLACEMENT_SILENT,
+                           context.DEVICE_PLACEMENT_SILENT_FOR_INT32):
+    raise ValueError(
+        "device_policy must be one of None, tfe.DEVICE_PLACEMENT_*"
+    )
   # pylint: disable=protected-access
   if context._default_mode == context.GRAPH_MODE:
     graph_mode_has_been_used = (
@@ -4806,6 +5104,13 @@ def enable_eager_execution(config=None, device_policy=None):
   if context._context is None:
     context._context = context.Context(config=config,
                                        device_policy=device_policy)
+    if context.context_stack.stack:
+      raise AssertionError("Invariant violated: The context stack must "
+                           "be empty when eager execution is enabled.")
+    # Log that eager execution has been enabled by pushing an entry onto the
+    # context stack; this entry won't ever be popped, as it's impossible to
+    # disable eager execution
+    context.context_stack.push(False, context.eager_mode)
   elif ((config is not None and config is not context._context._config)
         or (device_policy is not None
             and device_policy is not context._context._device_policy)):
@@ -4848,6 +5153,7 @@ def eager_run(main=None, argv=None):
   app.run(main, argv)
 
 
+@tf_export("reset_default_graph")
 def reset_default_graph():
   """Clears the default graph stack and resets the global default graph.
 
@@ -4866,6 +5172,7 @@ def reset_default_graph():
   _default_graph_stack.reset()
 
 
+@tf_export("get_default_graph")
 def get_default_graph():
   """Returns the default graph for the current thread.
 
@@ -4986,6 +5293,7 @@ def _get_graph_from_inputs(op_input_list, graph=None):
   return graph or get_default_graph()
 
 
+@tf_export("GraphKeys")
 class GraphKeys(object):
   """Standard names to use for graph collections.
 
@@ -5134,6 +5442,7 @@ class GraphKeys(object):
     return cls.GLOBAL_VARIABLES
 
 
+@tf_export("add_to_collection")
 def add_to_collection(name, value):
   """Wrapper for `Graph.add_to_collection()` using the default graph.
 
@@ -5170,6 +5479,7 @@ def add_to_collections(names, value):
   get_default_graph().add_to_collections(names, value)
 
 
+@tf_export("get_collection_ref")
 def get_collection_ref(key):
   """Wrapper for `Graph.get_collection_ref()` using the default graph.
 
@@ -5193,6 +5503,7 @@ def get_collection_ref(key):
   return get_default_graph().get_collection_ref(key)
 
 
+@tf_export("get_collection")
 def get_collection(key, scope=None):
   """Wrapper for `Graph.get_collection()` using the default graph.
 
@@ -5226,9 +5537,13 @@ def get_all_collection_keys():
   return get_default_graph().get_all_collection_keys()
 
 
+name_scope_cache = {}
+
+
 # Named like a function for backwards compatibility with the
 # @tf_contextlib.contextmanager version, which was switched to a class to avoid
 # some object creation overhead.
+@tf_export("name_scope", "keras.backend.name_scope")
 class name_scope(object):  # pylint: disable=invalid-name
   """A context manager for use when defining a Python op.
 
@@ -5281,11 +5596,23 @@ class name_scope(object):  # pylint: disable=invalid-name
     """
     if self._in_eager_mode:
       self._old_name = self._ctx.scope_name
-      if self._name:
-        scope_name = (self._old_name + self._name + "/"
-                      if self._old_name else self._name + "/")
-      else:
+      if not self._name:
         scope_name = ""
+      else:
+        cache_key = self._name, self._old_name, self._default_name
+        if cache_key in name_scope_cache:
+          self._ctx.scope_name = name_scope_cache[cache_key]
+          return self._ctx.scope_name
+        elif self._name[-1] == "/":
+          # A trailing slash breaks out of nested name scopes, indicating a
+          # fully specified scope name, for compatibility with Graph.name_scope.
+          scope_name = self._name
+        else:
+          name_with_trailing_slash = self._name + "/"
+          scope_name = (
+              self._old_name + name_with_trailing_slash
+              if self._old_name else name_with_trailing_slash)
+        name_scope_cache[cache_key] = scope_name
       self._ctx.scope_name = scope_name
       return scope_name
     else:
@@ -5301,8 +5628,12 @@ class name_scope(object):  # pylint: disable=invalid-name
       g = _get_graph_from_inputs(self._values)
       self._g_manager = g.as_default()
       self._g_manager.__enter__()
-      self._name_scope = g.name_scope(self._name)
-      return self._name_scope.__enter__()
+      try:
+        self._name_scope = g.name_scope(self._name)
+        return self._name_scope.__enter__()
+      except:
+        self._g_manager.__exit__(*sys.exc_info())
+        raise
 
   def __exit__(self, type_arg, value_arg, traceback_arg):
     if self._in_eager_mode:
@@ -5364,6 +5695,7 @@ def prepend_name_scope(name, import_scope):
 
 # pylint: disable=g-doc-return-or-yield
 # pylint: disable=not-context-manager
+@tf_export("op_scope")
 @tf_contextlib.contextmanager
 def op_scope(values, name, default_name=None):
   """DEPRECATED. Same as name_scope above, just different argument order."""
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index cd296ccdc5ef372038fb62f0311a056cfc5ceaae..c6deafd89eb1bdc4892a65ba3ab8c7900915390f 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -26,6 +26,7 @@ from tensorflow.core.framework import types_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function as eager_function
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import device as pydev
@@ -43,6 +44,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import resources
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
@@ -203,13 +205,13 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(dtypes.float32, float_t.dtype)
     self.assertEqual(op, float_t.op)
     self.assertEqual(0, float_t._value_index)
-    self.assertEqual(0, len(float_t._consumers))
+    self.assertEqual(0, len(float_t.consumers()))
     self.assertEqual("myop", float_t._as_node_def_input())
 
     self.assertEqual(dtypes.string, label_str_t.dtype)
     self.assertEqual(op, label_str_t.op)
     self.assertEqual(1, label_str_t._value_index)
-    self.assertEqual(0, len(label_str_t._consumers))
+    self.assertEqual(0, len(label_str_t.consumers()))
     self.assertEqual("myop:1", label_str_t._as_node_def_input())
 
     self.assertProtoEquals("op:'FloatOutputStringOutput' name:'myop'",
@@ -223,8 +225,8 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(1, len(op2.inputs))
     self.assertIs(float_t, op2.inputs[0])
 
-    self.assertEqual(1, len(float_t._consumers))
-    self.assertEqual(op2, float_t._consumers[0])
+    self.assertEqual(1, len(float_t.consumers()))
+    self.assertEqual(op2, float_t.consumers()[0])
 
     self.assertProtoEquals("op:'FloatOutput' name:'myop1'", op1.node_def)
     self.assertProtoEquals("op:'FloatInput' name:'myop2' input:'myop1'",
@@ -243,14 +245,14 @@ class OperationTest(test_util.TensorFlowTestCase):
     op3 = test_ops.foo2(float1_t, label2_str_t, label2_str_t, name="myop3").d.op
     self.assertEqual(2, len(op3.values()))
 
-    self.assertEqual(1, len(float1_t._consumers))
-    self.assertEqual(op3, float1_t._consumers[0])
+    self.assertEqual(1, len(float1_t.consumers()))
+    self.assertEqual(op3, float1_t.consumers()[0])
 
-    self.assertEqual(0, len(float2_t._consumers))
+    self.assertEqual(0, len(float2_t.consumers()))
 
-    self.assertEqual(2, len(label2_str_t._consumers))
-    self.assertEqual(op3, label2_str_t._consumers[0])
-    self.assertEqual(op3, label2_str_t._consumers[1])
+    self.assertEqual(2, len(label2_str_t.consumers()))
+    self.assertEqual(op3, label2_str_t.consumers()[0])
+    self.assertEqual(op3, label2_str_t.consumers()[1])
 
     self.assertProtoEquals("""
     op:'Foo2' name:'myop3'
@@ -274,6 +276,7 @@ class OperationTest(test_util.TensorFlowTestCase):
     op1 = ops.Operation(
         ops._NodeDef("RefOutputFloatOutput", "op1"), g, [],
         [dtypes.float32_ref, dtypes.float32])
+    g._add_op(op1)
     self.assertProtoEquals("op:'RefOutputFloatOutput' name:'op1'", op1.node_def)
     self.assertEquals([], list(op1.inputs))
     ref_t, nonref_t = op1.values()
@@ -282,12 +285,14 @@ class OperationTest(test_util.TensorFlowTestCase):
         ops._NodeDef("RefInputFloatInput", "op2"),
         g, [ref_t, nonref_t], [],
         input_types=[dtypes.float32_ref, dtypes.float32])
+    g._add_op(op2)
     self.assertProtoEquals(
         "op:'RefInputFloatInput' name:'op2' input:'op1' input:'op1:1'",
         op2.node_def)
     self.assertEquals([ref_t, nonref_t], list(op2.inputs))
     op3 = ops.Operation(
         ops._NodeDef("TwoFloatInputs", "op3"), g, [ref_t, nonref_t], [])
+    g._add_op(op3)
     self.assertProtoEquals(
         "op:'TwoFloatInputs' name:'op3' input:'op1' input:'op1:1'",
         op3.node_def)
@@ -482,6 +487,30 @@ class OperationTest(test_util.TensorFlowTestCase):
     z._add_control_inputs([x, y, y])  # pylint: disable=protected-access
     self.assertEqual(z.control_inputs, [x, y])
 
+  def testRemoveAllControlInputs(self):
+    a = constant_op.constant(1)
+    with ops.control_dependencies([a]):
+      b = constant_op.constant(2)
+    c = constant_op.constant(3)
+    d = constant_op.constant(4)
+    e = constant_op.constant(5)
+    with ops.control_dependencies([a, c]):
+      f = d + e
+
+    self.assertEqual(a.op.control_inputs, [])
+    self.assertEqual(b.op.control_inputs, [a.op])
+    self.assertEqual(f.op.control_inputs, [a.op, c.op])
+
+    a.op._remove_all_control_inputs()  # pylint: disable=protected-access
+    self.assertEqual(a.op.control_inputs, [])
+
+    b.op._remove_all_control_inputs()  # pylint: disable=protected-access
+    self.assertEqual(b.op.control_inputs, [])
+
+    f.op._remove_all_control_inputs()  # pylint: disable=protected-access
+    self.assertEqual(f.op.control_inputs, [])
+    self.assertEqual(list(f.op.inputs), [d, e])
+
   def testControlInputCycle(self):
     # Non-C API path has a different error message
     if not ops._USE_C_API: return
@@ -508,16 +537,22 @@ class OperationTest(test_util.TensorFlowTestCase):
 
     z.op._update_input(0, y)  # pylint: disable=protected-access
     self.assertEquals(list(z.op.inputs), [y, y])
+    self.assertEquals(x.consumers(), [])
+    self.assertEquals(y.consumers(), [z.op, z.op])
     with session.Session(graph=g) as sess:
       self.assertEquals(sess.run(z), 4)
 
     z.op._update_input(0, x)  # pylint: disable=protected-access
     self.assertEquals(list(z.op.inputs), [x, y])
+    self.assertEquals(x.consumers(), [z.op])
+    self.assertEquals(y.consumers(), [z.op])
     with session.Session(graph=g) as sess:
       self.assertEquals(sess.run(z), 3)
 
     z.op._update_input(1, y)  # pylint: disable=protected-access
     self.assertEquals(list(z.op.inputs), [x, y])
+    self.assertEquals(x.consumers(), [z.op])
+    self.assertEquals(y.consumers(), [z.op])
     with session.Session(graph=g) as sess:
       self.assertEquals(sess.run(z), 3)
 
@@ -625,6 +660,15 @@ class OperationTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(ValueError, "must be from the same graph"):
         y * x  # pylint: disable=pointless-statement
 
+  def testInputsAreImmutable(self):
+    g = ops.Graph()
+    with g.as_default():
+      x = test_ops.int_output()
+      op = test_ops.int_input_int_output(x, name="myop").op
+    with self.assertRaisesRegexp(
+        AttributeError, "'_InputList' object has no attribute 'append'"):
+      op.inputs.append(None)
+
 
 @test_util.with_c_api
 class CreateOpTest(test_util.TensorFlowTestCase):
@@ -734,6 +778,29 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(len(op.outputs), 1)
     self.assertEqual(op.outputs[0].shape, tensor_shape.matrix(2, 3))
 
+  def testUniqueName(self):
+    g = ops.Graph()
+    with g.as_default():
+      if ops._USE_C_API:
+        c_op = ops._create_c_op(g, ops._NodeDef("IntOutput", "myop"), [], [])
+        c_op2 = ops._create_c_op(g, ops._NodeDef("IntOutput", "myop_1"), [], [])
+        op = g._create_op_from_tf_operation(c_op)
+        op2 = g._create_op_from_tf_operation(c_op2)
+      else:
+        # Test pure-Python version to make sure C API has same behavior.
+        op = test_ops.int_output(name="myop").op
+        op2 = test_ops.int_output(name="myop_1").op
+
+      # Create ops with same names as op1 and op2. We expect the new names to be
+      # uniquified.
+      op3 = test_ops.int_output(name="myop").op
+      op4 = test_ops.int_output(name="myop_1").op
+
+    self.assertEqual(op.name, "myop")
+    self.assertEqual(op2.name, "myop_1")
+    self.assertEqual(op3.name, "myop_2")
+    self.assertEqual(op4.name, "myop_1_1")
+
   def testCond(self):
     g = ops.Graph()
     with g.as_default():
@@ -741,10 +808,10 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
 
       def true_fn():
         if ops._USE_C_API:
-          c_op = ops._create_c_op(ops.get_default_graph(),
-                                  ops._NodeDef("IntInput", "cond/myop"), [x],
-                                  [])
-          ops.get_default_graph()._create_op_from_tf_operation(c_op)
+          ops._create_c_op(ops.get_default_graph(),
+                           ops._NodeDef("IntInput", "cond/myop"), [x], [])
+          new_ops = g._add_new_tf_operations()
+          self.assertEqual(len(new_ops), 1)
         else:
           # Test pure-Python version to make sure C API has same behavior.
           test_ops.int_input(x, name="myop")
@@ -774,10 +841,10 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
 
       def body(i):
         if ops._USE_C_API:
-          c_op = ops._create_c_op(ops.get_default_graph(),
-                                  ops._NodeDef("IntInput", "myloop/myop"), [x],
-                                  [])
-          ops.get_default_graph()._create_op_from_tf_operation(c_op)
+          ops._create_c_op(ops.get_default_graph(),
+                           ops._NodeDef("IntInput", "myloop/myop"), [x], [])
+          new_ops = g._add_new_tf_operations()
+          self.assertEqual(len(new_ops), 1)
         else:
           # Test pure-Python version to make sure C API has same behavior.
           test_ops.int_input(x, name="myop")
@@ -808,11 +875,11 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
       def body(i):
         c = constant_op.constant(1.0, name="c")
         if ops._USE_C_API:
-          c_op = ops._create_c_op(ops.get_default_graph(),
-                                  ops._NodeDef("IntInput", "myloop/myop"), [x],
-                                  [])
+          ops._create_c_op(ops.get_default_graph(),
+                           ops._NodeDef("IntInput", "myloop/myop"), [x], [])
           with ops.control_dependencies([c]):
-            ops.get_default_graph()._create_op_from_tf_operation(c_op)
+            new_ops = g._add_new_tf_operations()
+            self.assertEqual(len(new_ops), 1)
         else:
           with ops.control_dependencies([c]):
             test_ops.int_input(x, name="myop")
@@ -828,10 +895,6 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(op.control_inputs, [c])
 
   def testWhileLoopWithExternalControlDep(self):
-    # TODO(skyewm): enable once ControlFlowContext._RemoveExternalControlEdges
-    # works with C API enabled
-    if ops._USE_C_API: self.skipTest("Not yet implemented with C API enabled")
-
     g = ops.Graph()
     with g.as_default():
       x = test_ops.int_output()
@@ -839,11 +902,11 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
 
       def body(i):
         if ops._USE_C_API:
-          c_op = ops._create_c_op(ops.get_default_graph(),
-                                  ops._NodeDef("IntInput", "myloop/myop"), [x],
-                                  [])
+          ops._create_c_op(ops.get_default_graph(),
+                           ops._NodeDef("IntInput", "myloop/myop"), [x], [])
           with ops.control_dependencies([c]):
-            ops.get_default_graph()._create_op_from_tf_operation(c_op)
+            new_ops = g._add_new_tf_operations()
+            self.assertEqual(len(new_ops), 1)
         else:
           with ops.control_dependencies([c]):
             test_ops.int_input(x, name="myop")
@@ -853,7 +916,6 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
 
     op = g.get_operation_by_name("myloop/myop")
     self.assertIsNotNone(op)
-    self.assertEqual(len(op.control_inputs), 1)
     # External control dep is removed and replaced with internal control dep
     self.assertNotEqual(op.control_inputs[0], c.op)
     self.assertIsNotNone(op.control_inputs[0]._get_control_flow_context())
@@ -1537,7 +1599,7 @@ class ControlDependenciesTest(test_util.TensorFlowTestCase):
       self.assertEqual(future.calls, 1)
     else:
       a = constant_op.constant(1.0)
-      b = future
+      b = future()
       with ops.control_dependencies([a, b]):
         c = constant_op.constant(3.0)
       self.assertEqual(future.calls, 1)
@@ -1705,6 +1767,37 @@ class ControlDependenciesTest(test_util.TensorFlowTestCase):
 @test_util.with_c_api
 class OpScopeTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testNames(self):
+    with ops.name_scope("foo") as foo:
+      self.assertEqual("foo/", foo)
+      with ops.name_scope("foo2") as foo2:
+        self.assertEqual("foo/foo2/", foo2)
+      with ops.name_scope(None) as empty1:
+        self.assertEqual("", empty1)
+        with ops.name_scope("foo3") as foo3:
+          self.assertEqual("foo3/", foo3)
+      with ops.name_scope("") as empty2:
+        self.assertEqual("", empty2)
+    with ops.name_scope("foo/") as outer_foo:
+      self.assertEqual("foo/", outer_foo)
+      with ops.name_scope("") as empty3:
+        self.assertEqual("", empty3)
+      with ops.name_scope("foo4") as foo4:
+        self.assertEqual("foo/foo4/", foo4)
+      with ops.name_scope("foo5//") as foo5:
+        self.assertEqual("foo5//", foo5)
+        with ops.name_scope("foo6") as foo6:
+          self.assertEqual("foo5//foo6/", foo6)
+      with ops.name_scope("/") as foo7:
+        self.assertEqual("/", foo7)
+      with ops.name_scope("//") as foo8:
+        self.assertEqual("//", foo8)
+      with ops.name_scope("a//b/c") as foo9:
+        self.assertEqual("foo/a//b/c/", foo9)
+    with ops.name_scope("a//b/c") as foo10:
+      self.assertEqual("a//b/c/", foo10)
+
   @test_util.run_in_graph_and_eager_modes()
   def testEagerDefaultScopeName(self):
     with ops.name_scope(None, "default") as scope:
@@ -1785,6 +1878,228 @@ class OpScopeTest(test_util.TensorFlowTestCase):
     self._testGraphElements([a, variable, b])
 
 
+class InitScopeTest(test_util.TensorFlowTestCase):
+
+  def testClearsControlDependencies(self):
+    g = ops.Graph()
+    a_1 = _apply_op(g, "FloatOutput", [], [dtypes.float32])
+    a_2 = _apply_op(g, "FloatOutput", [], [dtypes.float32])
+    a_3 = _apply_op(g, "FloatOutput", [], [dtypes.float32])
+    a_4 = _apply_op(g, "FloatOutput", [], [dtypes.float32])
+
+    with g.as_default():
+      with g.control_dependencies([a_1]):
+        with g.control_dependencies([a_2]):
+          with ops.init_scope():
+            with g.control_dependencies([a_3]):
+              with g.control_dependencies([a_4]):
+                # deps [a_3, a_4]
+                b_3_4 = _apply_op(g, "FloatOutput", [], [dtypes.float32])
+              # deps = [a_3]
+              b_3 = _apply_op(g, "FloatOutput", [], [dtypes.float32])
+            # deps back to None
+            b_none = _apply_op(g, "FloatOutput", [], [dtypes.float32])
+          # deps back to [a_1, a_2]
+          b_1_2 = _apply_op(g, "FloatOutput", [], [dtypes.float32])
+        # deps back to [a_1]
+        b_1 = _apply_op(g, "FloatOutput", [], [dtypes.float32])
+        with ops.init_scope():
+          # deps are None again
+          b_none2 = _apply_op(g, "FloatOutput", [], [dtypes.float32])
+
+    self.assertItemsEqual([a_3.op, a_4.op], b_3_4.op.control_inputs)
+    self.assertItemsEqual([a_3.op], b_3.op.control_inputs)
+    self.assertItemsEqual([], b_none.op.control_inputs)
+    self.assertItemsEqual([a_1.op, a_2.op], b_1_2.op.control_inputs)
+    self.assertItemsEqual([a_1.op], b_1.op.control_inputs)
+    self.assertItemsEqual([], b_none2.op.control_inputs)
+
+  def testLiftsOpsFromFunctions(self):
+    g0 = ops.Graph()
+    g1 = ops.Graph()
+    g1._building_function = True  # pylint: disable=protected-access
+    g2 = ops.Graph()
+    g2._building_function = True  # pylint: disable=protected-access
+
+    with g0.as_default():
+      with g1.as_default():
+        with g2.as_default():
+          with ops.init_scope():
+            _ = constant_op.constant(1.0)
+
+    self.assertEqual(len(g2.get_operations()), 0)
+    self.assertEqual(len(g1.get_operations()), 0)
+    self.assertEqual(len(g0.get_operations()), 1)
+
+  def testComposes(self):
+    g0 = ops.Graph()
+    g1 = ops.Graph()
+    g1._building_function = True  # pylint: disable=protected-access
+    g2 = ops.Graph()
+    g2._building_function = True  # pylint: disable=protected-access
+    g3 = ops.Graph()
+    g3._building_function = False  # pylint: disable=protected-access
+
+    with g0.as_default():
+      with g1.as_default():
+        with ops.init_scope():
+          # This op should be lifted into g0.
+          _ = constant_op.constant(1.0)
+          self.assertIs(g0, ops.get_default_graph())
+          self.assertEqual(len(g2.get_operations()), 0)
+          self.assertEqual(len(g1.get_operations()), 0)
+          self.assertEqual(len(g0.get_operations()), 1)
+        with g2.as_default():
+          with ops.init_scope():
+            # This op should be lifted into g0.
+            _ = constant_op.constant(1.0)
+            self.assertIs(g0, ops.get_default_graph())
+            with g3.as_default():
+              with ops.init_scope():
+                # This op should be lifted into g3, because g3 is not building a
+                # function.
+                _ = constant_op.constant(1.0)
+                self.assertIs(g3, ops.get_default_graph())
+
+    self.assertEqual(len(g3.get_operations()), 1)
+    self.assertEqual(len(g2.get_operations()), 0)
+    self.assertEqual(len(g1.get_operations()), 0)
+    self.assertEqual(len(g0.get_operations()), 2)
+
+  def testEscapesToEagerContext(self):
+    g = ops.Graph()
+    g._building_function = True  # pylint: disable=protected-access
+    with context.eager_mode():
+      with context.graph_mode():
+        with g.as_default():
+          with ops.init_scope():
+            # Because g is building a function, init_scope should
+            # escape out to the eager context.
+            self.assertTrue(context.in_eager_mode())
+          # g should be reinstated as the default graph, and the
+          # graph context should be re-entered.
+          self.assertIs(g, ops.get_default_graph())
+          self.assertTrue(context.in_graph_mode())
+
+  def testAllGraphsBuildingFunctionsRaisesError(self):
+    g = ops.Graph()
+    g._building_function = True  # pylint: disable=protected-access
+    with g.as_default():
+      with self.assertRaises(AssertionError):
+        with ops.init_scope():
+          pass
+
+  def testStaysInEagerWhenOnlyEagerContextActive(self):
+    with context.eager_mode():
+      with ops.init_scope():
+        self.assertTrue(context.eager_mode())
+      self.assertTrue(context.eager_mode())
+
+  def testEscapesDefunWhenInEagerMode(self):
+
+    def function_with_variables():
+      with ops.init_scope():
+        v = resource_variable_ops.ResourceVariable(3)
+      return v.assign_add(1)
+
+    with context.eager_mode():
+      # Each invocation of function_with_variables recreates a variable.
+      self.assertEqual(4, int(function_with_variables()))
+      self.assertEqual(4, int(function_with_variables()))
+
+      compiled = eager_function.defun(function_with_variables)
+      # The init_scope in function_with_variables lifts the variable out
+      # of the graph function constructed by defun; hence,
+      # compiled now appears to be stateful.
+      self.assertEqual(4, int(compiled()))
+      self.assertEqual(5, int(compiled()))
+
+  def testEscapesDefunWhenInGraphMode(self):
+    def function_with_variables(name):
+      with ops.init_scope():
+        _ = variable_scope.get_variable(name, shape=(1,))
+
+    g = ops.Graph()
+    with g.as_default():
+      with self.test_session():
+        # First ensure that graphs that are not building functions are
+        # not escaped.
+        function_with_variables("foo")
+        with self.assertRaisesRegexp(ValueError,
+                                     r"Variable foo already exists.*"):
+          # This will fail because reuse is not set to True.
+          function_with_variables("foo")
+
+        compiled = eager_function.defun(function_with_variables)
+        compiled("bar")
+        self.assertEqual(
+            len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)), 2)
+
+        # The second call to `compiled` should not create variables: the
+        # init_scope has lifted the variable creation code out of the defun.
+        compiled("bar")
+        self.assertEqual(
+            len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)), 2)
+
+  def testEscapesNestedDefun(self):
+
+    def inner_function():
+      with ops.init_scope():
+        v = resource_variable_ops.ResourceVariable(1)
+      return v.assign_add(2)
+
+    def outer_function(inner=None):
+      with ops.init_scope():
+        v0 = resource_variable_ops.ResourceVariable(0)
+      return v0.assign_add(1) + inner()
+
+    with context.eager_mode():
+      # Each invocation of outer_function recreates variables.
+      self.assertEqual(4, int(outer_function(inner=inner_function)))
+      self.assertEqual(4, int(outer_function(inner=inner_function)))
+
+      compiled_inner = eager_function.defun(inner_function)
+      compiled_outer = eager_function.defun(outer_function)
+      # The init_scope lifts variables out of the graph functions
+      # constructed by defun; hence, compiled_outer should now appear to be
+      # stateful.
+      self.assertEqual(4, int(compiled_outer(inner=compiled_inner)))
+      self.assertEqual(7, int(compiled_outer(inner=compiled_inner)))
+
+  def testInstallsDefaultGraphWhenGraphStackIsEmptyInGraphMode(self):
+    with context.graph_mode():
+      # pylint: disable=protected-access
+      self.assertEqual(len(ops._default_graph_stack.stack), 0)
+      with ops.init_scope():
+        self.assertGreater(len(ops._default_graph_stack.stack), 0)
+      self.assertEqual(len(ops._default_graph_stack.stack), 0)
+      # pylint: enable=protected-access
+
+  def testPreservesNameScopeInGraphConstruction(self):
+    with ops.Graph().as_default():
+      function_graph = ops.Graph()
+      with function_graph.as_default():
+        with ops.name_scope("inner"), ops.init_scope():
+          self.assertEqual(ops.get_name_scope(), "inner")
+      self.assertEqual(ops.get_name_scope(), "")
+
+  def testPreservesNameScopeInEagerExecution(self):
+    with context.eager_mode():
+      def foo():
+        with ops.name_scope("inner"), ops.init_scope():
+          if context.in_graph_mode():
+            self.assertEqual(ops.get_name_scope(), "inner")
+          else:
+            # A trailing slash is always appended when eager execution is
+            # enabled.
+            self.assertEqual(context.context().scope_name, "inner/")
+      foo()
+      self.assertEqual(ops.get_name_scope(), "")
+      foo_compiled = eager_function.defun(foo)
+      foo_compiled()
+      self.assertEqual(ops.get_name_scope(), "")
+
+
 @test_util.with_c_api
 class GraphTest(test_util.TensorFlowTestCase):
 
@@ -1876,6 +2191,24 @@ class GraphTest(test_util.TensorFlowTestCase):
     gc.collect()
     self.assertIsNone(g_ref())
 
+  def testRunnableAfterInvalidShape(self):
+    with ops.Graph().as_default():
+      with self.assertRaises(ValueError):
+        math_ops.add([1, 2], [1, 2, 3])
+      a = constant_op.constant(1)
+      with session.Session() as sess:
+        sess.run(a)
+
+  def testRunnableAfterInvalidShapeWithKernelLabelMap(self):
+    g = ops.Graph()
+    with g.as_default():
+      with g._kernel_label_map({"KernelLabelRequired": "overload_1"}):
+        with self.assertRaises(ValueError):
+          test_ops.kernel_label_required(1)
+      a = constant_op.constant(1)
+      with session.Session() as sess:
+        sess.run(a)
+
 
 @test_util.with_c_api
 class AttrScopeTest(test_util.TensorFlowTestCase):
@@ -1890,7 +2223,6 @@ class AttrScopeTest(test_util.TensorFlowTestCase):
       b = compat.as_text(x.get_attr("_B"))
     except ValueError:
       b = None
-    print(a, b)
     return (a, b)
 
   def testNoLabel(self):
@@ -1981,6 +2313,8 @@ class AsGraphDefTest(test_util.TensorFlowTestCase):
       t4.set_shape([43, 37])
       t5.set_shape([43, None])
 
+      b = constant_op.constant(1.0)  # pylint: disable=unused-variable
+
       gd = g.as_graph_def(add_shapes=True)
       self.assertProtoEqualsVersion("""
       node { name: "FiveFloatOutputs" op: "FiveFloatOutputs"
@@ -1997,6 +2331,26 @@ class AsGraphDefTest(test_util.TensorFlowTestCase):
           }
         }
       }
+    node { name: "Const" op: "Const"
+      attr {
+        key: "_output_shapes"
+        value {
+          list {
+            shape { }
+          }
+        }
+      }
+      attr {
+        key: "dtype"
+        value { type: DT_FLOAT }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape { }
+         float_val: 1.0  } } } }
       """, gd)
 
 
@@ -2279,6 +2633,18 @@ class NameScopeTest(test_util.TensorFlowTestCase):
         self.assertEqual("scope1", g.get_name_scope())
       self.assertEqual("", g.get_name_scope())
 
+  def testTwoGraphs(self):
+
+    def f():
+      g1 = ops.Graph()
+      g2 = ops.Graph()
+      with g1.as_default():
+        with g2.as_default():
+          with ops.name_scope("_"):
+            pass
+
+    self.assertRaisesRegexp(ValueError, "'_' is not a valid scope name", f)
+
 
 @test_util.with_c_api
 class TracebackTest(test_util.TensorFlowTestCase):
@@ -2336,7 +2702,7 @@ class OutputTypesTest(test_util.TensorFlowTestCase):
     with g.as_default():
       x = constant_op.constant([1, 1, 2, 4, 4, 4, 7, 8, 8],
                                dtype=dtypes.double)
-      y, _ = gen_array_ops.unique(x)
+      y, _ = gen_array_ops._unique(x)
       self.assertEqual([types_pb2.DT_DOUBLE, types_pb2.DT_INT32],
                        y.op._output_types)  # pylint: disable=protected-access
 
@@ -2353,47 +2719,14 @@ class OutputTypesTest(test_util.TensorFlowTestCase):
 
 
 @test_util.with_c_api
-class InputTypesTest(test_util.TensorFlowTestCase):
-  """Tests Operation._input_dtypes and Operation._input_types properties.
-
-  This test should not exist as _input_types is a private property.
-  This property is used by many tests that would normally cover its
-  behavior. However, we can't yet run these tests in C
-  API mode because they use _set_device method. This test will be deleted
-  once we port _set_device.
-  """
-  # TODO(iga): Remove this test
-
-  def setUp(self):
-    self.prev_use_c_api = ops._USE_C_API  # pylint: disable=protected-access
-    ops._USE_C_API = True  # pylint: disable=protected-access
-
-  def tearDown(self):
-    ops._USE_C_API = self.prev_use_c_api  # pylint: disable=protected-access
-
-  def testZeroInputs(self):
-    g = ops.Graph()
-    with g.as_default():
-      # Using a constant because creating unregistered ops
-      # doesn't work with the C API.
-      op = constant_op.constant(12, dtype=dtypes.uint16).op
-      # pylint: disable=protected-access
-      self.assertEqual([], op._input_types)
-      self.assertEqual([], op._input_dtypes)
-      # pylint: enable=protected-access
-
-  def testTwoInputs(self):
-    g = ops.Graph()
-    with g.as_default():
-      x = constant_op.constant(1.0, dtype=dtypes.double)
-      y = constant_op.constant(2.0, dtype=dtypes.double)
-      z = math_ops.multiply(x, y)
-      # pylint: disable=protected-access
-      self.assertTrue(isinstance(z.op._input_types[0], dtypes.DType))
-      self.assertTrue(isinstance(z.op._input_types[1], dtypes.DType))
-      self.assertEqual([dtypes.double, dtypes.double], z.op._input_types)
-      self.assertEqual([dtypes.double, dtypes.double], z.op._input_dtypes)
-      # pylint: enable=protected-access
+class EnableEagerExecutionTest(test_util.TensorFlowTestCase):
+
+  def testBadArgumentsToEnableEagerExecution(self):
+    with self.assertRaisesRegexp(TypeError, "config must be a tf.ConfigProto"):
+      ops.enable_eager_execution(context.DEVICE_PLACEMENT_SILENT)
+    with self.assertRaisesRegexp(ValueError, "device_policy must be one of"):
+      c = config_pb2.ConfigProto()
+      ops.enable_eager_execution(c, c)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index c57f0a98421fa88e5faa870157116c1617c19620..c95149d177990e364c3d6b9daeae5dc535cf0070 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <stdio.h>
 #include <sstream>
 #include <unordered_map>
+#include "tensorflow/core/framework/api_def.pb.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb_text.h"
@@ -475,20 +476,17 @@ GenPythonOp::GenPythonOp(const OpDef& op_def, const ApiDef& api_def,
 GenPythonOp::~GenPythonOp() {}
 
 string GenPythonOp::Code() {
-  if (api_def_.visibility() == ApiDef::SKIP) {
-    return "";
-  }
   // This has all the input args followed by those attrs that don't have
   // defaults.
-  std::vector<string> args_no_default;
+  std::vector<ParamNames> params_no_default;
   // The parameters with defaults (these have to be listed after those without).
   // No input args are included, just attrs.
-  std::vector<string> args_with_defaults;
+  std::vector<ParamNames> params_with_default;
 
   for (int i = 0; i < api_def_.arg_order_size(); ++i) {
     const auto& arg = *FindInputArg(api_def_.arg_order(i), op_def_);
     const auto& api_def_arg = *FindInputArg(api_def_.arg_order(i), api_def_);
-    args_no_default.push_back(api_def_arg.rename_to());
+    params_no_default.emplace_back(api_def_arg.name(), api_def_arg.rename_to());
     if (!arg.type_attr().empty()) {
       gtl::InsertIfNotPresent(&inferred_attrs_, arg.type_attr(), arg.name());
     } else if (!arg.type_list_attr().empty()) {
@@ -504,9 +502,9 @@ string GenPythonOp::Code() {
     // Do not add inferred attrs to the Python function signature.
     if (inferred_attrs_.find(attr.name()) == inferred_attrs_.end()) {
       if (attr.has_default_value()) {
-        args_with_defaults.push_back(attr.rename_to());
+        params_with_default.emplace_back(attr.name(), attr.rename_to());
       } else {
-        args_no_default.push_back(attr.rename_to());
+        params_no_default.emplace_back(attr.name(), attr.rename_to());
       }
     }
   }
@@ -515,27 +513,30 @@ string GenPythonOp::Code() {
   // those with defaults go at the end.
   // Get the attrs in the order we want by taking the attrs without defaults
   // from the end of args_no_default, and adding args_no_default.
-  attrs_.reserve(args_no_default.size() - op_def_.input_arg_size() +
-                 args_with_defaults.size());
-  attrs_.insert(attrs_.end(),
-                args_no_default.begin() + op_def_.input_arg_size(),
-                args_no_default.end());
-  attrs_.insert(attrs_.end(), args_with_defaults.begin(),
-                args_with_defaults.end());
-
-  param_names_.reserve(args_no_default.size() + args_with_defaults.size());
+  attrs_.reserve(params_no_default.size() - op_def_.input_arg_size() +
+                 params_with_default.size());
+  for (int i = op_def_.input_arg_size(); i < params_no_default.size(); ++i) {
+    attrs_.push_back(params_no_default[i].GetName());
+  }
+  for (int i = 0; i < params_with_default.size(); ++i) {
+    attrs_.push_back(params_with_default[i].GetName());
+  }
+
+  param_names_.reserve(params_no_default.size() + params_with_default.size());
+  param_names_.insert(param_names_.begin(), params_no_default.begin(),
+                      params_no_default.end());
+  for (const auto& param : params_with_default) {
+    param_names_.push_back(param);
+  }
+
   string parameters;
-  for (const string& name : args_no_default) {
+  for (const auto& param : params_no_default) {
     AddDelimiter(&parameters, ", ");
-    const string param = AvoidPythonReserved(name);
-    strings::StrAppend(&parameters, param);
-    param_names_.push_back(param);
+    strings::StrAppend(&parameters, param.GetRenameTo());
   }
-  for (const string& name : args_with_defaults) {
+  for (const auto& param_and_default : params_with_default) {
     AddDelimiter(&parameters, ", ");
-    const string param = AvoidPythonReserved(name);
-    strings::StrAppend(&parameters, param, "=None");
-    param_names_.push_back(param);
+    strings::StrAppend(&parameters, param_and_default.GetRenameTo(), "=None");
   }
   AddDelimiter(&parameters, ", ");
   strings::StrAppend(&parameters, "name=None");
@@ -557,10 +558,11 @@ string GenPythonOp::Code() {
 }
 
 void GenPythonOp::AddExport() {
-  if (api_def_.visibility() != api_def_.VISIBLE) {
+  if (api_def_.visibility() != ApiDef::VISIBLE) {
     return;
   }
-  strings::StrAppend(&result_, "tf_export(");
+
+  strings::StrAppend(&result_, "@tf_export(");
 
   // Add all endpoint names to tf_export.
   bool first_endpoint = true;
@@ -570,13 +572,21 @@ void GenPythonOp::AddExport() {
     } else {
       first_endpoint = false;
     }
-    strings::StrAppend(&result_, "'", endpoint.name(), "'");
+    string endpoint_name;
+    python_op_gen_internal::GenerateLowerCaseOpName(endpoint.name(),
+                                                    &endpoint_name);
+    strings::StrAppend(&result_, "'", endpoint_name, "'");
   }
   strings::StrAppend(&result_, ")\n");
 }
 
+void GenPythonOp::AddDefLine(const string& function_name,
+                             const string& parameters) {
+  strings::StrAppend(&result_, "def ", function_name, "(", parameters, "):\n");
+}
+
 void GenPythonOp::AddDefLine(const string& parameters) {
-  strings::StrAppend(&result_, "def ", function_name_, "(", parameters, "):\n");
+  AddDefLine(function_name_, parameters);
 }
 
 void GenPythonOp::AddDocStringDescription() {
@@ -603,9 +613,9 @@ void GenPythonOp::AddDocStringInputs() {
     StringPiece description = api_def_arg.description();
     string desc;
     if (ConsumeEquals(&description)) {  // Skip the generated type info.
-      desc = strings::StrCat(param_names_[i], ": ");
+      desc = strings::StrCat(param_names_[i].GetRenameTo(), ": ");
     } else {
-      desc = strings::StrCat(param_names_[i], ": ",
+      desc = strings::StrCat(param_names_[i].GetRenameTo(), ": ",
                              ArgTypeName(op_def_, arg, inferred_attrs_, false));
     }
     if (!description.empty()) {
@@ -750,7 +760,8 @@ void GenPythonOp::AddBody(const string& prefix) {
 void GenPythonOp::AddBodyNoReturn(const string& apply_prefix) {
   string args = strings::StrCat("\"", op_def_.name(), "\", ");
   for (size_t i = 0; i < param_names_.size(); ++i) {
-    strings::StrAppend(&args, param_names_[i], "=", param_names_[i], ", ");
+    strings::StrAppend(&args, AvoidPythonReserved(param_names_[i].GetName()),
+                       "=", param_names_[i].GetRenameTo(), ", ");
   }
   strings::StrAppend(&args, "name=name)");
 
@@ -796,11 +807,21 @@ from tensorflow.python.util.tf_export import tf_export
   auto out = cleaned_ops.mutable_op();
   out->Reserve(ops.op_size());
   for (const auto& op_def : ops.op()) {
-    bool is_hidden = false;
-    for (const string& hidden : hidden_ops) {
-      if (op_def.name() == hidden) {
-        is_hidden = true;
-        break;
+    const auto* api_def = api_defs.GetApiDef(op_def.name());
+
+    if (api_def->visibility() == ApiDef::SKIP) {
+      continue;
+    }
+
+    // An op is hidden if either its ApiDef visibility is HIDDEN
+    // or it is in the hidden_ops list.
+    bool is_hidden = api_def->visibility() == ApiDef::HIDDEN;
+    if (!is_hidden) {
+      for (const string& hidden : hidden_ops) {
+        if (op_def.name() == hidden) {
+          is_hidden = true;
+          break;
+        }
       }
     }
 
@@ -817,7 +838,6 @@ from tensorflow.python.util.tf_export import tf_export
       continue;
     }
 
-    const auto* api_def = api_defs.GetApiDef(op_def.name());
     strings::StrAppend(&result, GetPythonOp(op_def, *api_def, function_name));
 
     if (!require_shapes) {
diff --git a/tensorflow/python/framework/python_op_gen_internal.h b/tensorflow/python/framework/python_op_gen_internal.h
index c1efbf9be2277dbc047868dde5110b5505fc9e23..4319e5a7820b33283df8153fdc76e0e567813a17 100644
--- a/tensorflow/python/framework/python_op_gen_internal.h
+++ b/tensorflow/python/framework/python_op_gen_internal.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_PYTHON_FRAMEWORK_PYTHON_OP_GEN_INTERNAL_H_
-#define THIRD_PARTY_TENSORFLOW_PYTHON_FRAMEWORK_PYTHON_OP_GEN_INTERNAL_H_
+#ifndef TENSORFLOW_PYTHON_FRAMEWORK_PYTHON_OP_GEN_INTERNAL_H_
+#define TENSORFLOW_PYTHON_FRAMEWORK_PYTHON_OP_GEN_INTERNAL_H_
 
 #include <unordered_map>
 
@@ -41,6 +41,28 @@ void GenerateLowerCaseOpName(const string& str, string* result);
 
 string DataTypeToPython(DataType dtype, const string& dtype_module);
 
+// Names that corresponds to a single input parameter.
+class ParamNames {
+ public:
+  // Create param based on Arg.
+  ParamNames(const string& name, const string& rename_to) : name_(name) {
+    rename_to_ = AvoidPythonReserved(rename_to);
+  }
+
+  // Get original parameter name.
+  string GetName() const { return name_; }
+
+  // Get the name to rename the parameter to. Note that AvoidPythonReserved
+  // has already been applied.
+  string GetRenameTo() const { return rename_to_; }
+
+ private:
+  // Original parameter name.
+  string name_;
+  // API name for this parameter.
+  string rename_to_;
+};
+
 class GenPythonOp {
  public:
   GenPythonOp(const OpDef& op_def, const ApiDef& api_def,
@@ -51,6 +73,7 @@ class GenPythonOp {
 
  protected:
   // Print: def Function(parameters):
+  void AddDefLine(const string& function_name, const string& parameters);
   void AddDefLine(const string& parameters);
 
   // Format the Op's descriptions so that it can be a Python docstring.
@@ -84,10 +107,10 @@ class GenPythonOp {
 
   // All parameters, including inputs & non-inferred attrs, required and those
   // with defaults, except "name"
-  std::vector<string> param_names_;
+  std::vector<ParamNames> param_names_;
 };
 
 }  // namespace python_op_gen_internal
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_PYTHON_FRAMEWORK_PYTHON_OP_GEN_INTERNAL_H_
+#endif  // TENSORFLOW_PYTHON_FRAMEWORK_PYTHON_OP_GEN_INTERNAL_H_
diff --git a/tensorflow/python/framework/python_op_gen_main.cc b/tensorflow/python/framework/python_op_gen_main.cc
index 61b1d02a5e85f40c884ffe77104b425b3554b796..bc5ca195da50499c6fbab822a9a093be3f0277e0 100644
--- a/tensorflow/python/framework/python_op_gen_main.cc
+++ b/tensorflow/python/framework/python_op_gen_main.cc
@@ -34,12 +34,6 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-constexpr char kBaseApiDef[] =
-    "tensorflow/core/api_def/base_api/*.pbtxt";
-constexpr char kPythonApiDef[] =
-    "tensorflow/core/api_def/python_api/*.pbtxt";
-constexpr bool kUseApiDef = false;
-
 Status ReadOpListFromFile(const string& filename,
                           std::vector<string>* op_list) {
   std::unique_ptr<RandomAccessFile> file;
@@ -110,22 +104,23 @@ string InferSourceFileName(const char* argv_zero) {
 }
 
 void PrintAllPythonOps(const std::vector<string>& op_list,
+                       const std::vector<string>& api_def_dirs,
                        const string& source_file_name, bool require_shapes,
                        bool op_list_is_whitelist) {
   OpList ops;
   OpRegistry::Global()->Export(false, &ops);
 
   ApiDefMap api_def_map(ops);
-  if (kUseApiDef) {
+  if (!api_def_dirs.empty()) {
     Env* env = Env::Default();
 
-    std::vector<string> base_api_files;
-    std::vector<string> python_api_files;
-    TF_CHECK_OK(env->GetMatchingPaths(kBaseApiDef, &base_api_files));
-    TF_CHECK_OK(env->GetMatchingPaths(kPythonApiDef, &python_api_files));
-
-    TF_CHECK_OK(api_def_map.LoadFileList(env, base_api_files));
-    TF_CHECK_OK(api_def_map.LoadFileList(env, python_api_files));
+    for (const auto& api_def_dir : api_def_dirs) {
+      std::vector<string> api_files;
+      TF_CHECK_OK(env->GetMatchingPaths(io::JoinPath(api_def_dir, "*.pbtxt"),
+                                        &api_files));
+      TF_CHECK_OK(api_def_map.LoadFileList(env, api_files));
+    }
+    api_def_map.UpdateDocs();
   }
 
   if (op_list_is_whitelist) {
@@ -154,23 +149,30 @@ int main(int argc, char* argv[]) {
       tensorflow::InferSourceFileName(argv[0]);
 
   // Usage:
-  //   gen_main [ @FILENAME | OpName[,OpName]* ] (0 | 1) [0 | 1]
-  if (argc == 2) {
-    tensorflow::PrintAllPythonOps({}, source_file_name,
-                                  tensorflow::string(argv[1]) == "1",
-                                  false /* op_list_is_whitelist */);
-  } else if (argc == 3) {
-    std::vector<tensorflow::string> hidden_ops;
-    TF_CHECK_OK(tensorflow::ParseOpListCommandLine(argv[1], &hidden_ops));
-    tensorflow::PrintAllPythonOps(hidden_ops, source_file_name,
+  //   gen_main api_def_dir1,api_def_dir2,...
+  //       [ @FILENAME | OpName[,OpName]* ] (0 | 1) [0 | 1]
+  if (argc < 3) {
+    return -1;
+  }
+  std::vector<tensorflow::string> api_def_dirs = tensorflow::str_util::Split(
+      argv[1], ",", tensorflow::str_util::SkipEmpty());
+
+  if (argc == 3) {
+    tensorflow::PrintAllPythonOps({}, api_def_dirs, source_file_name,
                                   tensorflow::string(argv[2]) == "1",
                                   false /* op_list_is_whitelist */);
   } else if (argc == 4) {
+    std::vector<tensorflow::string> hidden_ops;
+    TF_CHECK_OK(tensorflow::ParseOpListCommandLine(argv[2], &hidden_ops));
+    tensorflow::PrintAllPythonOps(hidden_ops, api_def_dirs, source_file_name,
+                                  tensorflow::string(argv[3]) == "1",
+                                  false /* op_list_is_whitelist */);
+  } else if (argc == 5) {
     std::vector<tensorflow::string> op_list;
-    TF_CHECK_OK(tensorflow::ParseOpListCommandLine(argv[1], &op_list));
-    tensorflow::PrintAllPythonOps(op_list, source_file_name,
-                                  tensorflow::string(argv[2]) == "1",
-                                  tensorflow::string(argv[3]) == "1");
+    TF_CHECK_OK(tensorflow::ParseOpListCommandLine(argv[2], &op_list));
+    tensorflow::PrintAllPythonOps(op_list, api_def_dirs, source_file_name,
+                                  tensorflow::string(argv[3]) == "1",
+                                  tensorflow::string(argv[4]) == "1");
   } else {
     return -1;
   }
diff --git a/tensorflow/python/framework/random_seed.py b/tensorflow/python/framework/random_seed.py
index 5f1130570d2ec9bd964abeb7526ab03f14e067a3..1e74a790a3fb0c72b7c0fb1127ffac95f386d85e 100644
--- a/tensorflow/python/framework/random_seed.py
+++ b/tensorflow/python/framework/random_seed.py
@@ -22,6 +22,7 @@ from __future__ import print_function
 
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 DEFAULT_GRAPH_SEED = 87654321
@@ -32,6 +33,7 @@ def _truncate_seed(seed):
   return seed % _MAXINT32  # Truncate to fit into 32-bit integer
 
 
+@tf_export('get_seed')
 def get_seed(op_seed):
   """Returns the local seeds an operation should use given an op-specific seed.
 
@@ -78,6 +80,7 @@ def get_seed(op_seed):
   return seeds
 
 
+@tf_export('set_random_seed')
 def set_random_seed(seed):
   """Sets the graph-level random seed.
 
diff --git a/tensorflow/python/framework/sparse_tensor.py b/tensorflow/python/framework/sparse_tensor.py
index 6218cc34cad50aa6e291dcffcf352c717e0d85f0..1fe81e5f17a7de0a113596d920d63e5d9474c7c1 100644
--- a/tensorflow/python/framework/sparse_tensor.py
+++ b/tensorflow/python/framework/sparse_tensor.py
@@ -23,6 +23,7 @@ import collections
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.util.tf_export import tf_export
 
 # pylint: disable=protected-access
 _TensorLike = ops._TensorLike
@@ -31,6 +32,7 @@ _override_helper = ops._override_helper
 # pylint: enable=protected-access
 
 
+@tf_export("SparseTensor")
 class SparseTensor(_TensorLike):
   """Represents a sparse tensor.
 
@@ -222,8 +224,10 @@ class SparseTensor(_TensorLike):
 
 SparseTensorValue = collections.namedtuple(
     "SparseTensorValue", ["indices", "values", "dense_shape"])
+tf_export("SparseTensorValue")(SparseTensorValue)
 
 
+@tf_export("convert_to_tensor_or_sparse_tensor")
 def convert_to_tensor_or_sparse_tensor(value, dtype=None, name=None):
   """Converts value to a `SparseTensor` or `Tensor`.
 
diff --git a/tensorflow/python/framework/subscribe.py b/tensorflow/python/framework/subscribe.py
index cdcb74e88fd30ee4ff6abbc4b0fbb3c37d5f785e..7797d991da7c1c3a429bbf9e60772f0a1952c723 100644
--- a/tensorflow/python/framework/subscribe.py
+++ b/tensorflow/python/framework/subscribe.py
@@ -137,11 +137,18 @@ def _subscribe_new(tensor, side_effects, control_cache):
     # are subscribed at the same time, we remove the control dependency from
     # the original op only once and we add the dependencies to all the
     # new identities.
+    if ops._USE_C_API:  # pylint: disable=protected-access
+      new_control_inputs = consumer_op.control_inputs
+    else:
+      # Make a copy so we don't modify the actual control inputs (this is fixed
+      # in the C API).
+      new_control_inputs = list(consumer_op.control_inputs)
+    if tensor.op in new_control_inputs:
+      new_control_inputs.remove(tensor.op)
+    new_control_inputs.append(out.op)
     # pylint: disable=protected-access
-    if tensor.op in consumer_op._control_inputs:
-      consumer_op._control_inputs.remove(tensor.op)
-    consumer_op._control_inputs.append(out.op)
-    consumer_op._recompute_node_def()
+    consumer_op._remove_all_control_inputs()
+    consumer_op._add_control_inputs(new_control_inputs)
     # pylint: enable=protected-access
   return out
 
@@ -167,12 +174,8 @@ def _subscribe_extend(tensor, side_effects):
     for s in side_effects:
       outs += s(source_tensor)
 
-  for out in outs:
-    out_type = type(out)
-    if out_type is ops.Tensor:
-      out = out.op
-    tensor.op._control_inputs.append(out)  # pylint: disable=protected-access
-  tensor.op._recompute_node_def()  # pylint: disable=protected-access
+  out_ops = [out.op if isinstance(out, ops.Tensor) else out for out in outs]
+  tensor.op._add_control_inputs(out_ops)  # pylint: disable=protected-access
 
   return tensor
 
diff --git a/tensorflow/python/framework/subscribe_test.py b/tensorflow/python/framework/subscribe_test.py
index 01df20241dd8687ea41217ceddd1be8b9f975978..8b95b25e82a1886c43e08f47a612300750643fb1 100644
--- a/tensorflow/python/framework/subscribe_test.py
+++ b/tensorflow/python/framework/subscribe_test.py
@@ -36,6 +36,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
+@test_util.with_c_api
 class SubscribeTest(test_util.TensorFlowTestCase):
 
   def _ExpectSubscribedIdentities(self, container):
@@ -58,12 +59,12 @@ class SubscribeTest(test_util.TensorFlowTestCase):
       return t
 
     c0 = c
-    self.assertTrue(c0.op in d.op._control_inputs)
+    self.assertTrue(c0.op in d.op.control_inputs)
     c = subscribe.subscribe(c,
                             lambda t: script_ops.py_func(sub, [t], [t.dtype]))
     # Verify that control dependencies are correctly moved to the subscription.
-    self.assertFalse(c0.op in d.op._control_inputs)
-    self.assertTrue(c.op in d.op._control_inputs)
+    self.assertFalse(c0.op in d.op.control_inputs)
+    self.assertTrue(c.op in d.op.control_inputs)
 
     with self.test_session() as sess:
       c_out = sess.run([c])
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index 54ec15ea66d637b3ef00c38d089e8cbd1c75444c..222071cb9e87aa0fdd9788d1c72df4c66ea61547 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -19,8 +19,10 @@ from __future__ import print_function
 
 from tensorflow.core.framework import tensor_shape_pb2
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("Dimension")
 class Dimension(object):
   """Represents the value of one dimension in a TensorShape."""
 
@@ -397,6 +399,7 @@ def as_dimension(value):
     return Dimension(value)
 
 
+@tf_export("TensorShape")
 class TensorShape(object):
   """Represents the shape of a `Tensor`.
 
diff --git a/tensorflow/python/framework/tensor_spec.py b/tensorflow/python/framework/tensor_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0411bc3d9b4b2b87e5a31e9f201154f28ccf1cc
--- /dev/null
+++ b/tensorflow/python/framework/tensor_spec.py
@@ -0,0 +1,201 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A TensorSpec class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import common_shapes
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+
+
+class TensorSpec(object):
+  """Describes a tf.Tensor.
+
+  A TensorSpec allows an API to describe the Tensors that it accepts or
+  returns, before that Tensor exists. This allows dynamic and flexible graph
+  construction and configuration.
+  """
+
+  __slots__ = ["_shape", "_dtype", "_name"]
+
+  def __init__(self, shape, dtype, name=None):
+    """Creates a TensorSpec.
+
+    Args:
+      shape: Value convertible to `tf.TensorShape`. The shape of the tensor.
+      dtype: Value convertible to `tf.DType`. The type of the tensor values.
+      name: Optional name for the Tensor.
+
+    Raises:
+      TypeError: If shape is not convertible to a `tf.TensorShape`, or dtype is
+        not convertible to a `tf.DType`.
+    """
+    self._shape = tensor_shape.TensorShape(shape)
+    self._dtype = dtypes.as_dtype(dtype)
+    self._name = name
+
+  @classmethod
+  def from_spec(cls, spec, name=None):
+    return cls(spec.shape, spec.dtype, name or spec.name)
+
+  @classmethod
+  def from_tensor(cls, tensor, name=None):
+    if isinstance(tensor, ops.EagerTensor):
+      return TensorSpec(tensor.shape, tensor.dtype, name)
+    elif isinstance(tensor, ops.Tensor):
+      return TensorSpec(tensor.shape, tensor.dtype, name or tensor.op.name)
+    else:
+      raise ValueError("`tensor` should be a tf.Tensor")
+
+  @property
+  def shape(self):
+    """Returns the `TensorShape` that represents the shape of the tensor."""
+    return self._shape
+
+  @property
+  def dtype(self):
+    """Returns the `dtype` of elements in the tensor."""
+    return self._dtype
+
+  @property
+  def name(self):
+    """Returns the name of the described tensor."""
+    return self._name
+
+  def is_compatible_with(self, spec_or_tensor):
+    """True if the shape and dtype of `spec_or_tensor` are compatible."""
+    return (self._dtype.is_compatible_with(spec_or_tensor.dtype) and
+            self._shape.is_compatible_with(spec_or_tensor.shape))
+
+  def __repr__(self):
+    return "TensorSpec(shape={}, dtype={}, name={})".format(
+        self.shape, repr(self.dtype), repr(self.name))
+
+  def __eq__(self, other):
+    return self.shape == other.shape and self.dtype == other.dtype
+
+  def __ne__(self, other):
+    return not self == other
+
+
+class BoundedTensorSpec(TensorSpec):
+  """A `TensorSpec` that specifies minimum and maximum values.
+
+  Example usage:
+  ```python
+  spec = tensor_spec.BoundedTensorSpec((1, 2, 3), tf.float32, 0, (5, 5, 5))
+  tf_minimum = tf.convert_to_tensor(spec.minimum, dtype=spec.dtype)
+  tf_maximum = tf.convert_to_tensor(spec.maximum, dtype=spec.dtype)
+  ```
+
+  Bounds are meant to be inclusive. This is especially important for
+  integer types. The following spec will be satisfied by tensors
+  with values in the set {0, 1, 2}:
+  ```python
+  spec = tensor_spec.BoundedTensorSpec((3, 5), tf.int32, 0, 2)
+  ```
+  """
+
+  __slots__ = ("_minimum", "_maximum")
+
+  def __init__(self, shape, dtype, minimum, maximum, name=None):
+    """Initializes a new `BoundedTensorSpec`.
+
+    Args:
+      shape: Value convertible to `tf.TensorShape`. The shape of the tensor.
+      dtype: Value convertible to `tf.DType`. The type of the tensor values.
+      minimum: Number or sequence specifying the minimum element bounds
+        (inclusive). Must be broadcastable to `shape`.
+      maximum: Number or sequence specifying the maximum element bounds
+        (inclusive). Must be broadcastable to `shape`.
+      name: Optional string containing a semantic name for the corresponding
+        array. Defaults to `None`.
+
+    Raises:
+      ValueError: If `minimum` or `maximum` are not provided or not
+        broadcastable to `shape`.
+      TypeError: If the shape is not an iterable or if the `dtype` is an invalid
+        numpy dtype.
+    """
+    super(BoundedTensorSpec, self).__init__(shape, dtype, name)
+
+    if minimum is None or maximum is None:
+      raise ValueError("minimum and maximum must be provided; but saw "
+                       "'%s' and '%s'" % (minimum, maximum))
+
+    try:
+      minimum_shape = np.shape(minimum)
+      common_shapes.broadcast_shape(
+          tensor_shape.TensorShape(minimum_shape), self.shape)
+    except ValueError as exception:
+      raise ValueError("minimum is not compatible with shape. "
+                       "Message: {!r}.".format(exception))
+
+    try:
+      maximum_shape = np.shape(maximum)
+      common_shapes.broadcast_shape(
+          tensor_shape.TensorShape(maximum_shape), self.shape)
+    except ValueError as exception:
+      raise ValueError("maximum is not compatible with shape. "
+                       "Message: {!r}.".format(exception))
+
+    self._minimum = np.array(minimum, dtype=self.dtype.as_numpy_dtype())
+    self._minimum.setflags(write=False)
+
+    self._maximum = np.array(maximum, dtype=self.dtype.as_numpy_dtype())
+    self._maximum.setflags(write=False)
+
+  @classmethod
+  def from_spec(cls, spec):
+    dtype = dtypes.as_dtype(spec.dtype)
+    if dtype in [dtypes.float64, dtypes.float32]:
+      # Avoid under/over-flow for `dtype.maximum - dtype.minimum`.
+      low = dtype.min / 2
+      high = dtype.max / 2
+    else:
+      low = dtype.min
+      high = dtype.max
+
+    minimum = getattr(spec, "minimum", low)
+    maximum = getattr(spec, "maximum", high)
+    return BoundedTensorSpec(spec.shape, dtype, minimum, maximum, spec.name)
+
+  @property
+  def minimum(self):
+    """Returns a NumPy array specifying the minimum bounds (inclusive)."""
+    return self._minimum
+
+  @property
+  def maximum(self):
+    """Returns a NumPy array specifying the maximum bounds (inclusive)."""
+    return self._maximum
+
+  def __repr__(self):
+    s = "BoundedTensorSpec(shape={}, dtype={}, name={}, minimum={}, maximum={})"
+    return s.format(self.shape, repr(self.dtype), repr(self.name),
+                    repr(self.minimum), repr(self.maximum))
+
+  def __eq__(self, other):
+    tensor_spec_eq = super(BoundedTensorSpec, self).__eq__(other)
+    return (tensor_spec_eq and np.allclose(self.minimum, other.minimum) and
+            np.allclose(self.maximum, other.maximum))
+
+
diff --git a/tensorflow/python/framework/tensor_spec_test.py b/tensorflow/python/framework/tensor_spec_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..54ca4d9a19c2e1c879c05cfb828085951bdd8444
--- /dev/null
+++ b/tensorflow/python/framework/tensor_spec_test.py
@@ -0,0 +1,227 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensor_spec."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import googletest
+
+
+class TensorSpecTest(test_util.TensorFlowTestCase):
+
+  def testAcceptsNumpyDType(self):
+    desc = tensor_spec.TensorSpec([1], np.float32)
+    self.assertEqual(desc.dtype, dtypes.float32)
+
+  def testAcceptsTensorShape(self):
+    desc = tensor_spec.TensorSpec(tensor_shape.TensorShape([1]), dtypes.float32)
+    self.assertEqual(desc.shape, tensor_shape.TensorShape([1]))
+
+  def testUnknownShape(self):
+    desc = tensor_spec.TensorSpec(shape=None, dtype=dtypes.float32)
+    self.assertEqual(desc.shape, tensor_shape.TensorShape(None))
+
+  def testShapeCompatibility(self):
+    unknown = array_ops.placeholder(dtypes.int64)
+    partial = array_ops.placeholder(dtypes.int64, shape=[None, 1])
+    full = array_ops.placeholder(dtypes.int64, shape=[2, 3])
+    rank3 = array_ops.placeholder(dtypes.int64, shape=[4, 5, 6])
+
+    desc_unknown = tensor_spec.TensorSpec(None, dtypes.int64)
+    self.assertTrue(desc_unknown.is_compatible_with(unknown))
+    self.assertTrue(desc_unknown.is_compatible_with(partial))
+    self.assertTrue(desc_unknown.is_compatible_with(full))
+    self.assertTrue(desc_unknown.is_compatible_with(rank3))
+
+    desc_partial = tensor_spec.TensorSpec([2, None], dtypes.int64)
+    self.assertTrue(desc_partial.is_compatible_with(unknown))
+    self.assertTrue(desc_partial.is_compatible_with(partial))
+    self.assertTrue(desc_partial.is_compatible_with(full))
+    self.assertFalse(desc_partial.is_compatible_with(rank3))
+
+    desc_full = tensor_spec.TensorSpec([2, 3], dtypes.int64)
+    self.assertTrue(desc_full.is_compatible_with(unknown))
+    self.assertFalse(desc_full.is_compatible_with(partial))
+    self.assertTrue(desc_full.is_compatible_with(full))
+    self.assertFalse(desc_full.is_compatible_with(rank3))
+
+    desc_rank3 = tensor_spec.TensorSpec([4, 5, 6], dtypes.int64)
+    self.assertTrue(desc_rank3.is_compatible_with(unknown))
+    self.assertFalse(desc_rank3.is_compatible_with(partial))
+    self.assertFalse(desc_rank3.is_compatible_with(full))
+    self.assertTrue(desc_rank3.is_compatible_with(rank3))
+
+  def testTypeCompatibility(self):
+    floats = array_ops.placeholder(dtypes.float32, shape=[10, 10])
+    ints = array_ops.placeholder(dtypes.int32, shape=[10, 10])
+    desc = tensor_spec.TensorSpec(shape=(10, 10), dtype=dtypes.float32)
+    self.assertTrue(desc.is_compatible_with(floats))
+    self.assertFalse(desc.is_compatible_with(ints))
+
+  def testName(self):
+    desc = tensor_spec.TensorSpec([1], dtypes.float32, name="beep")
+    self.assertEqual(desc.name, "beep")
+
+  def testRepr(self):
+    desc1 = tensor_spec.TensorSpec([1], dtypes.float32, name="beep")
+    self.assertEqual(
+        repr(desc1),
+        "TensorSpec(shape=(1,), dtype=tf.float32, name='beep')")
+    desc2 = tensor_spec.TensorSpec([1, None], dtypes.int32)
+    self.assertEqual(
+        repr(desc2),
+        "TensorSpec(shape=(1, ?), dtype=tf.int32, name=None)")
+
+  def testFromTensorSpec(self):
+    spec_1 = tensor_spec.TensorSpec((1, 2), dtypes.int32)
+    spec_2 = tensor_spec.TensorSpec.from_spec(spec_1)
+    self.assertEqual(spec_1, spec_2)
+
+  def testFromTensor(self):
+    zero = constant_op.constant(0)
+    spec = tensor_spec.TensorSpec.from_tensor(zero)
+    self.assertEqual(spec.dtype, dtypes.int32)
+    self.assertEqual(spec.shape, [])
+    self.assertEqual(spec.name, "Const")
+
+  def testFromPlaceholder(self):
+    unknown = array_ops.placeholder(dtypes.int64, name="unknown")
+    partial = array_ops.placeholder(dtypes.float32,
+                                    shape=[None, 1],
+                                    name="partial")
+    spec_1 = tensor_spec.TensorSpec.from_tensor(unknown)
+    self.assertEqual(spec_1.dtype, dtypes.int64)
+    self.assertEqual(spec_1.shape, None)
+    self.assertEqual(spec_1.name, "unknown")
+    spec_2 = tensor_spec.TensorSpec.from_tensor(partial)
+    self.assertEqual(spec_2.dtype, dtypes.float32)
+    self.assertEqual(spec_2.shape.as_list(), [None, 1])
+    self.assertEqual(spec_2.name, "partial")
+
+  def testFromBoundedTensorSpec(self):
+    bounded_spec = tensor_spec.BoundedTensorSpec((1, 2), dtypes.int32, 0, 1)
+    spec = tensor_spec.TensorSpec.from_spec(bounded_spec)
+    self.assertEqual(bounded_spec.shape, spec.shape)
+    self.assertEqual(bounded_spec.dtype, spec.dtype)
+    self.assertEqual(bounded_spec.name, spec.name)
+
+
+class BoundedTensorSpecTest(test_util.TensorFlowTestCase):
+
+  def testInvalidMinimum(self):
+    with self.assertRaisesRegexp(ValueError, "not compatible"):
+      tensor_spec.BoundedTensorSpec((3, 5), dtypes.uint8, (0, 0, 0), (1, 1))
+
+  def testInvalidMaximum(self):
+    with self.assertRaisesRegexp(ValueError, "not compatible"):
+      tensor_spec.BoundedTensorSpec((3, 5), dtypes.uint8, 0, (1, 1, 1))
+
+  def testMinimumMaximumAttributes(self):
+    spec = tensor_spec.BoundedTensorSpec(
+        (1, 2, 3), dtypes.float32, 0, (5, 5, 5))
+    self.assertEqual(type(spec.minimum), np.ndarray)
+    self.assertEqual(type(spec.maximum), np.ndarray)
+    self.assertAllEqual(spec.minimum, np.array(0, dtype=np.float32))
+    self.assertAllEqual(spec.maximum, np.array([5, 5, 5], dtype=np.float32))
+
+  def testNotWriteableNP(self):
+    spec = tensor_spec.BoundedTensorSpec(
+        (1, 2, 3), dtypes.float32, 0, (5, 5, 5))
+    with self.assertRaisesRegexp(ValueError, "read-only"):
+      spec.minimum[0] = -1
+    with self.assertRaisesRegexp(ValueError, "read-only"):
+      spec.maximum[0] = 100
+
+  def testReuseSpec(self):
+    spec_1 = tensor_spec.BoundedTensorSpec((1, 2), dtypes.int32,
+                                           minimum=0, maximum=1)
+    spec_2 = tensor_spec.BoundedTensorSpec(
+        spec_1.shape, spec_1.dtype, spec_1.minimum, spec_1.maximum)
+    self.assertEqual(spec_1, spec_2)
+
+  def testScalarBounds(self):
+    spec = tensor_spec.BoundedTensorSpec(
+        (), dtypes.float32, minimum=0.0, maximum=1.0)
+
+    self.assertIsInstance(spec.minimum, np.ndarray)
+    self.assertIsInstance(spec.maximum, np.ndarray)
+
+    # Sanity check that numpy compares correctly to a scalar for an empty shape.
+    self.assertEqual(0.0, spec.minimum)
+    self.assertEqual(1.0, spec.maximum)
+
+    # Check that the spec doesn't fail its own input validation.
+    _ = tensor_spec.BoundedTensorSpec(
+        spec.shape, spec.dtype, spec.minimum, spec.maximum)
+
+  def testFromBoundedTensorSpec(self):
+    spec_1 = tensor_spec.BoundedTensorSpec((1, 2), dtypes.int32,
+                                           minimum=0, maximum=1)
+    spec_2 = tensor_spec.BoundedTensorSpec.from_spec(spec_1)
+    self.assertEqual(spec_1, spec_2)
+
+  def testEquality(self):
+    spec_1_1 = tensor_spec.BoundedTensorSpec((1, 2, 3), dtypes.float32,
+                                             0, (5, 5, 5))
+    spec_1_2 = tensor_spec.BoundedTensorSpec((1, 2, 3), dtypes.float32,
+                                             0.00000001,
+                                             (5, 5, 5.00000000000000001))
+    spec_2_1 = tensor_spec.BoundedTensorSpec((1, 2, 3), dtypes.float32,
+                                             1, (5, 5, 5))
+    spec_2_2 = tensor_spec.BoundedTensorSpec((1, 2, 3), dtypes.float32,
+                                             (1, 1, 1), (5, 5, 5))
+    spec_2_3 = tensor_spec.BoundedTensorSpec((1, 2, 3), dtypes.float32,
+                                             (1, 1, 1), 5)
+    spec_3_1 = tensor_spec.BoundedTensorSpec((1, 2, 3), dtypes.float32,
+                                             (2, 1, 1), (5, 5, 5))
+
+    self.assertEqual(spec_1_1, spec_1_2)
+    self.assertEqual(spec_1_2, spec_1_1)
+
+    self.assertNotEqual(spec_1_1, spec_2_2)
+    self.assertNotEqual(spec_1_1, spec_2_1)
+    self.assertNotEqual(spec_2_2, spec_1_1)
+    self.assertNotEqual(spec_2_1, spec_1_1)
+
+    self.assertEqual(spec_2_1, spec_2_2)
+    self.assertEqual(spec_2_2, spec_2_1)
+    self.assertEqual(spec_2_2, spec_2_3)
+
+    self.assertNotEqual(spec_1_1, spec_3_1)
+    self.assertNotEqual(spec_2_1, spec_3_1)
+    self.assertNotEqual(spec_2_2, spec_3_1)
+
+  def testFromTensorSpec(self):
+    spec = tensor_spec.TensorSpec((1, 2), dtypes.int32)
+    bounded_spec = tensor_spec.BoundedTensorSpec.from_spec(spec)
+    self.assertEqual(spec.shape, bounded_spec.shape)
+    self.assertEqual(spec.dtype, bounded_spec.dtype)
+    self.assertEqual(spec.dtype.min, bounded_spec.minimum)
+    self.assertEqual(spec.dtype.max, bounded_spec.maximum)
+    self.assertEqual(spec.name, bounded_spec.name)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index 9fc0e494633d239c1343a88eb0bbbb5a8ee75cb8..0e5f696111ae7f74b41f8af21a5190fc2617e51a 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Utilities to create TensorProtos."""
 from __future__ import absolute_import
 from __future__ import division
@@ -38,6 +37,8 @@ except ImportError:
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.util.tf_export import tf_export
+
 # pylint: enable=g-import-not-at-top
 
 
@@ -46,29 +47,57 @@ def ExtractBitsFromFloat16(x):
 
 
 def SlowAppendFloat16ArrayToTensorProto(tensor_proto, proto_values):
-  tensor_proto.half_val.extend([
-      ExtractBitsFromFloat16(x) for x in proto_values])
+  tensor_proto.half_val.extend(
+      [ExtractBitsFromFloat16(x) for x in proto_values])
+
+
+def ExtractBitsFromBFloat16(x):
+  return np.asscalar(
+      np.asarray(x, dtype=dtypes.bfloat16.as_numpy_dtype).view(np.uint16))
+
+
+def SlowAppendBFloat16ArrayToTensorProto(tensor_proto, proto_values):
+  tensor_proto.half_val.extend(
+      [ExtractBitsFromBFloat16(x) for x in proto_values])
+
 
 if _FAST_TENSOR_UTIL_AVAILABLE:
   _NP_TO_APPEND_FN = {
+      dtypes.bfloat16.as_numpy_dtype:
+          SlowAppendBFloat16ArrayToTensorProto,
       # TODO(sesse): We should have a
       # fast_tensor_util.AppendFloat16ArrayToTensorProto,
       # but it seems np.float16_t doesn't exist?
-      np.float16: SlowAppendFloat16ArrayToTensorProto,
-      np.float32: fast_tensor_util.AppendFloat32ArrayToTensorProto,
-      np.float64: fast_tensor_util.AppendFloat64ArrayToTensorProto,
-      np.int32: fast_tensor_util.AppendInt32ArrayToTensorProto,
-      np.int64: fast_tensor_util.AppendInt64ArrayToTensorProto,
-      np.uint8: fast_tensor_util.AppendUInt8ArrayToTensorProto,
-      np.uint16: fast_tensor_util.AppendUInt16ArrayToTensorProto,
-      np.uint32: fast_tensor_util.AppendUInt32ArrayToTensorProto,
-      np.uint64: fast_tensor_util.AppendUInt64ArrayToTensorProto,
-      np.int8: fast_tensor_util.AppendInt8ArrayToTensorProto,
-      np.int16: fast_tensor_util.AppendInt16ArrayToTensorProto,
-      np.complex64: fast_tensor_util.AppendComplex64ArrayToTensorProto,
-      np.complex128: fast_tensor_util.AppendComplex128ArrayToTensorProto,
-      np.object: fast_tensor_util.AppendObjectArrayToTensorProto,
-      np.bool: fast_tensor_util.AppendBoolArrayToTensorProto,
+      np.float16:
+          SlowAppendFloat16ArrayToTensorProto,
+      np.float32:
+          fast_tensor_util.AppendFloat32ArrayToTensorProto,
+      np.float64:
+          fast_tensor_util.AppendFloat64ArrayToTensorProto,
+      np.int32:
+          fast_tensor_util.AppendInt32ArrayToTensorProto,
+      np.int64:
+          fast_tensor_util.AppendInt64ArrayToTensorProto,
+      np.uint8:
+          fast_tensor_util.AppendUInt8ArrayToTensorProto,
+      np.uint16:
+          fast_tensor_util.AppendUInt16ArrayToTensorProto,
+      np.uint32:
+          fast_tensor_util.AppendUInt32ArrayToTensorProto,
+      np.uint64:
+          fast_tensor_util.AppendUInt64ArrayToTensorProto,
+      np.int8:
+          fast_tensor_util.AppendInt8ArrayToTensorProto,
+      np.int16:
+          fast_tensor_util.AppendInt16ArrayToTensorProto,
+      np.complex64:
+          fast_tensor_util.AppendComplex64ArrayToTensorProto,
+      np.complex128:
+          fast_tensor_util.AppendComplex128ArrayToTensorProto,
+      np.object:
+          fast_tensor_util.AppendObjectArrayToTensorProto,
+      np.bool:
+          fast_tensor_util.AppendBoolArrayToTensorProto,
       dtypes.qint8.as_numpy_dtype:
           fast_tensor_util.AppendInt8ArrayToTensorProto,
       dtypes.quint8.as_numpy_dtype:
@@ -105,14 +134,12 @@ else:
     tensor_proto.uint64_val.extend([np.asscalar(x) for x in proto_values])
 
   def SlowAppendComplex64ArrayToTensorProto(tensor_proto, proto_values):
-    tensor_proto.scomplex_val.extend([np.asscalar(v)
-                                      for x in proto_values
-                                      for v in [x.real, x.imag]])
+    tensor_proto.scomplex_val.extend(
+        [np.asscalar(v) for x in proto_values for v in [x.real, x.imag]])
 
   def SlowAppendComplex128ArrayToTensorProto(tensor_proto, proto_values):
-    tensor_proto.dcomplex_val.extend([np.asscalar(v)
-                                      for x in proto_values
-                                      for v in [x.real, x.imag]])
+    tensor_proto.dcomplex_val.extend(
+        [np.asscalar(v) for x in proto_values for v in [x.real, x.imag]])
 
   def SlowAppendObjectArrayToTensorProto(tensor_proto, proto_values):
     tensor_proto.string_val.extend([compat.as_bytes(x) for x in proto_values])
@@ -121,6 +148,7 @@ else:
     tensor_proto.bool_val.extend([np.asscalar(x) for x in proto_values])
 
   _NP_TO_APPEND_FN = {
+      dtypes.bfloat16.as_numpy_dtype: SlowAppendBFloat16ArrayToTensorProto,
       np.float16: SlowAppendFloat16ArrayToTensorProto,
       np.float32: SlowAppendFloat32ArrayToTensorProto,
       np.float64: SlowAppendFloat64ArrayToTensorProto,
@@ -238,15 +266,16 @@ def _FilterTuple(v):
       return None
   if isinstance(v, list):
     if not any(isinstance(x, (list, tuple)) for x in v):
-      return _FirstNotNone([None if isinstance(x, (list, tuple)) else x for x in v])
+      return _FirstNotNone(
+          [None if isinstance(x, (list, tuple)) else x for x in v])
   return _FirstNotNone([_FilterTuple(x) for x in v])
 
 
 def _FilterInt(v):
   if isinstance(v, (list, tuple)):
     return _FirstNotNone([_FilterInt(x) for x in v])
-  return None if isinstance(v, (compat.integral_types,
-                                tensor_shape.Dimension)) else _NotNone(v)
+  return None if isinstance(
+      v, (compat.integral_types, tensor_shape.Dimension)) else _NotNone(v)
 
 
 def _FilterFloat(v):
@@ -315,6 +344,7 @@ def _AssertCompatible(values, dtype):
                       (dtype.name, repr(mismatch), type(mismatch).__name__))
 
 
+@tf_export("make_tensor_proto")
 def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
   """Create a TensorProto.
 
@@ -365,8 +395,11 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
   if dtype:
     dtype = dtypes.as_dtype(dtype)
 
-  is_quantized = (dtype in [dtypes.qint8, dtypes.quint8, dtypes.qint16,
-                            dtypes.quint16, dtypes.qint32])
+  is_quantized = (
+      dtype in [
+          dtypes.qint8, dtypes.quint8, dtypes.qint16, dtypes.quint16,
+          dtypes.qint32
+      ])
 
   # We first convert value to a numpy array or scalar.
   if isinstance(values, (np.ndarray, np.generic)):
@@ -404,9 +437,9 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
       if (list(nparray.shape) != _GetDenseDimensions(values) and
           not is_quantized):
         raise ValueError("""Argument must be a dense tensor: %s"""
-                         """ - got shape %s, but wanted %s.""" % (
-                             values, list(nparray.shape),
-                             _GetDenseDimensions(values)))
+                         """ - got shape %s, but wanted %s.""" %
+                         (values, list(nparray.shape),
+                          _GetDenseDimensions(values)))
 
     # python/numpy default float type is float64. We prefer float32 instead.
     if (nparray.dtype == np.float64) and dtype is None:
@@ -431,8 +464,8 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
 
   if dtype is not None and (not hasattr(dtype, "base_dtype") or
                             dtype.base_dtype != numpy_dtype.base_dtype):
-    raise TypeError("Incompatible types: %s vs. %s. Value is %s"
-                    % (dtype, nparray.dtype, values))
+    raise TypeError("Incompatible types: %s vs. %s. Value is %s" %
+                    (dtype, nparray.dtype, values))
 
   # If shape is not given, get the shape from the numpy array.
   if shape is None:
@@ -495,13 +528,14 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
 
   append_fn = GetNumpyAppendFn(proto_values.dtype)
   if append_fn is None:
-    raise TypeError("Element type not supported in TensorProto: %s" %
-                    numpy_dtype.name)
+    raise TypeError(
+        "Element type not supported in TensorProto: %s" % numpy_dtype.name)
   append_fn(tensor_proto, proto_values)
 
   return tensor_proto
 
 
+@tf_export("make_ndarray")
 def MakeNdarray(tensor):
   """Create a numpy ndarray from a tensor.
 
@@ -537,19 +571,23 @@ def MakeNdarray(tensor):
       return tmp.reshape(shape)
   elif tensor_dtype == dtypes.float32:
     if len(tensor.float_val) == 1:
-      return np.repeat(np.array(tensor.float_val[0], dtype=dtype),
-                       num_elements).reshape(shape)
+      return np.repeat(
+          np.array(tensor.float_val[0], dtype=dtype),
+          num_elements).reshape(shape)
     else:
       return np.fromiter(tensor.float_val, dtype=dtype).reshape(shape)
   elif tensor_dtype == dtypes.float64:
     if len(tensor.double_val) == 1:
-      return np.repeat(np.array(tensor.double_val[0], dtype=dtype),
-                       num_elements).reshape(shape)
+      return np.repeat(
+          np.array(tensor.double_val[0], dtype=dtype),
+          num_elements).reshape(shape)
     else:
       return np.fromiter(tensor.double_val, dtype=dtype).reshape(shape)
-  elif tensor_dtype in [dtypes.int32, dtypes.uint8, dtypes.uint16, dtypes.int16,
-                        dtypes.int8, dtypes.qint32, dtypes.quint8, dtypes.qint8,
-                        dtypes.qint16, dtypes.quint16, dtypes.bfloat16]:
+  elif tensor_dtype in [
+      dtypes.int32, dtypes.uint8, dtypes.uint16, dtypes.int16, dtypes.int8,
+      dtypes.qint32, dtypes.quint8, dtypes.qint8, dtypes.qint16, dtypes.quint16,
+      dtypes.bfloat16
+  ]:
     if len(tensor.int_val) == 1:
       return np.repeat(np.array(tensor.int_val[0], dtype=dtype),
                        num_elements).reshape(shape)
@@ -557,35 +595,41 @@ def MakeNdarray(tensor):
       return np.fromiter(tensor.int_val, dtype=dtype).reshape(shape)
   elif tensor_dtype == dtypes.int64:
     if len(tensor.int64_val) == 1:
-      return np.repeat(np.array(tensor.int64_val[0], dtype=dtype),
-                       num_elements).reshape(shape)
+      return np.repeat(
+          np.array(tensor.int64_val[0], dtype=dtype),
+          num_elements).reshape(shape)
     else:
       return np.fromiter(tensor.int64_val, dtype=dtype).reshape(shape)
   elif tensor_dtype == dtypes.string:
     if len(tensor.string_val) == 1:
-      return np.repeat(np.array(tensor.string_val[0], dtype=dtype),
-                       num_elements).reshape(shape)
+      return np.repeat(
+          np.array(tensor.string_val[0], dtype=dtype),
+          num_elements).reshape(shape)
     else:
-      return np.array([x for x in tensor.string_val],
-                      dtype=dtype).reshape(shape)
+      return np.array(
+          [x for x in tensor.string_val], dtype=dtype).reshape(shape)
   elif tensor_dtype == dtypes.complex64:
     it = iter(tensor.scomplex_val)
     if len(tensor.scomplex_val) == 2:
-      return np.repeat(np.array(complex(tensor.scomplex_val[0],
-                                        tensor.scomplex_val[1]), dtype=dtype),
-                       num_elements).reshape(shape)
+      return np.repeat(
+          np.array(
+              complex(tensor.scomplex_val[0], tensor.scomplex_val[1]),
+              dtype=dtype), num_elements).reshape(shape)
     else:
-      return np.array([complex(x[0], x[1]) for x in zip(it, it)],
-                      dtype=dtype).reshape(shape)
+      return np.array(
+          [complex(x[0], x[1]) for x in zip(it, it)],
+          dtype=dtype).reshape(shape)
   elif tensor_dtype == dtypes.complex128:
     it = iter(tensor.dcomplex_val)
     if len(tensor.dcomplex_val) == 2:
-      return np.repeat(np.array(complex(tensor.dcomplex_val[0],
-                                        tensor.dcomplex_val[1]), dtype=dtype),
-                       num_elements).reshape(shape)
+      return np.repeat(
+          np.array(
+              complex(tensor.dcomplex_val[0], tensor.dcomplex_val[1]),
+              dtype=dtype), num_elements).reshape(shape)
     else:
-      return np.array([complex(x[0], x[1]) for x in zip(it, it)],
-                      dtype=dtype).reshape(shape)
+      return np.array(
+          [complex(x[0], x[1]) for x in zip(it, it)],
+          dtype=dtype).reshape(shape)
   elif tensor_dtype == dtypes.bool:
     if len(tensor.bool_val) == 1:
       return np.repeat(np.array(tensor.bool_val[0], dtype=dtype),
@@ -629,8 +673,9 @@ def _ConstantValue(tensor, partial):
   elif tensor.op.type == "Shape":
     input_shape = tensor.op.inputs[0].get_shape()
     if input_shape.is_fully_defined():
-      return np.array([dim.value for dim in input_shape.dims],
-                      dtype=tensor.dtype.as_numpy_dtype)
+      return np.array(
+          [dim.value for dim in input_shape.dims],
+          dtype=tensor.dtype.as_numpy_dtype)
     else:
       return None
   elif tensor.op.type == "Size":
@@ -642,8 +687,10 @@ def _ConstantValue(tensor, partial):
   elif tensor.op.type == "Rank":
     input_shape = tensor.op.inputs[0].get_shape()
     if input_shape.ndims is not None:
-      return np.ndarray(shape=(), buffer=np.array([input_shape.ndims], dtype=np.int32),
-                        dtype=np.int32)
+      return np.ndarray(
+          shape=(),
+          buffer=np.array([input_shape.ndims], dtype=np.int32),
+          dtype=np.int32)
     else:
       return None
   elif tensor.op.type == "Range":
@@ -845,8 +892,8 @@ def constant_value_as_shape(tensor):  # pylint: disable=invalid-name
         new_axis_mask = tensor.op.get_attr("new_axis_mask")
         shrink_axis_mask = tensor.op.get_attr("shrink_axis_mask")
         valid_attributes = (not ellipsis_mask and not new_axis_mask and
-                            not shrink_axis_mask and
-                            (not begin_mask or (begin_mask == 1)) and
+                            not shrink_axis_mask and (not begin_mask or
+                                                      (begin_mask == 1)) and
                             (not end_mask or (end_mask == 1)))
         if valid_attributes:  # additional inputs not supported
           prev = constant_value_as_shape(tensor.op.inputs[0])
@@ -862,8 +909,8 @@ def constant_value_as_shape(tensor):  # pylint: disable=invalid-name
   ret = tensor_shape.unknown_shape(shape[0].value)
   value = constant_value(tensor)
   if value is not None:
-    ret = ret.merge_with(tensor_shape.TensorShape(
-        [d if d >= 0 else None for d in value]))
+    ret = ret.merge_with(
+        tensor_shape.TensorShape([d if d >= 0 else None for d in value]))
   return ret
 
 
diff --git a/tensorflow/python/framework/test_ops.cc b/tensorflow/python/framework/test_ops.cc
index 25bb7af20cfce6d96e8a877f370142dc00ecb9ca..070b5ac11f563443a97b304ddcdaabd2f4338445 100644
--- a/tensorflow/python/framework/test_ops.cc
+++ b/tensorflow/python/framework/test_ops.cc
@@ -26,6 +26,16 @@ REGISTER_OP("KernelLabel")
     .Output("result: string")
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("KernelLabelRequired")
+    .Input("input: int32")
+    .Output("result: string")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &out));
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
 REGISTER_OP("GraphDefVersion")
     .Output("version: int32")
     .SetIsStateful()
@@ -66,6 +76,11 @@ REGISTER_OP("TestStringOutput")
     .Output("output2: string")
     .SetShapeFn(shape_inference::UnknownShape);
 
+REGISTER_OP("TestAttr")
+    .Output("out: T")
+    .Attr("T: {float, double}")
+    .SetShapeFn(shape_inference::UnknownShape);
+
 namespace {
 enum KernelLabel { DEFAULT_LABEL, OVERLOAD_1_LABEL, OVERLOAD_2_LABEL };
 }  // namespace
@@ -104,6 +119,14 @@ REGISTER_KERNEL_BUILDER(Name("KernelLabel")
                             .Label("overload_2"),
                         KernelLabelOp<OVERLOAD_2_LABEL>);
 
+// All "KernelLabelRequired" kernels have labels
+REGISTER_KERNEL_BUILDER(
+    Name("KernelLabelRequired").Device(DEVICE_CPU).Label("overload_1"),
+    KernelLabelOp<OVERLOAD_1_LABEL>);
+REGISTER_KERNEL_BUILDER(
+    Name("KernelLabelRequired").Device(DEVICE_CPU).Label("overload_2"),
+    KernelLabelOp<OVERLOAD_2_LABEL>);
+
 class GraphDefVersionOp : public OpKernel {
  public:
   explicit GraphDefVersionOp(OpKernelConstruction* ctx)
@@ -170,6 +193,20 @@ class ResourceUsingOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("ResourceUsingOp").Device(DEVICE_CPU),
                         ResourceUsingOp);
 
+class TestAttrOp : public OpKernel {
+ public:
+  explicit TestAttrOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    Tensor* output;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    output->scalar<float>()() = 1.0;
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("TestAttr").Device(DEVICE_CPU).TypeConstraint<float>("T"), TestAttrOp);
+
 // Various test ops without kernels. These are used to test graph construction.
 
 REGISTER_OP("A")
@@ -369,4 +406,255 @@ REGISTER_OP("FuncAttr")
     .Attr("f: func")
     .SetShapeFn(shape_inference::UnknownShape);
 
+REGISTER_OP("Simple")
+    .Input("a: int32")
+    .Output("out: float")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("OutT").Output("a: T").Attr("T: type").SetShapeFn(
+    shape_inference::UnknownShape);
+
+REGISTER_OP("ReservedInput")
+    .Input("input: int32")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("Polymorphic")
+    .Input("a: T")
+    .Output("out: T")
+    .Attr("T: type")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("PolymorphicOut")
+    .Output("out: T")
+    .Attr("T: type")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("PolymorphicDefaultOut")
+    .Output("out: T")
+    .Attr("T: type = DT_STRING")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("Binary")
+    .Input("a: T")
+    .Input("b: T")
+    .Output("out: T")
+    .Attr("T: type")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("Restrict")
+    .Input("a: T")
+    .Output("out: T")
+    .Attr("T: {string, bool}")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("TypeList")
+    .Input("a: T")
+    .Attr("T: list(type) >= 0")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("TypeListTwice")
+    .Input("a: T")
+    .Input("b: T")
+    .Attr("T: list(type) >= 0")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("OutTypeList")
+    .Output("out: T")
+    .Attr("T: list(type) >= 0")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("TypeListRestrict")
+    .Input("a: T")
+    .Attr("T: list({string, bool})")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("OutTypeListRestrict")
+    .Output("out: t")
+    .Attr("t: list({string, bool})")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("Attr").Attr("a: int").SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("AttrFloat")
+    .Attr("a: float")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("AttrBool")
+    .Attr("a: bool")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("AttrBoolList")
+    .Attr("a: list(bool)")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("AttrMin")
+    .Attr("a: int >= 5")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("AttrListMin")
+    .Attr("a: list(int) >= 2")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("AttrEnum")
+    .Attr("a: {'apples', 'oranges'}")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("AttrEnumList")
+    .Attr("a: list({'apples', 'oranges'})")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("AttrShape")
+    .Attr("a: shape")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("AttrShapeList")
+    .Attr("a: list(shape)")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("AttrPartialShape")
+    .Attr("a: shape")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("AttrPartialShapeList")
+    .Attr("a: list(shape)")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("AttrDefault")
+    .Attr("a: string = 'banana'")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("AttrListDefault")
+    .Attr("a: list(int) = [5, 15]")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("AttrEmptyListDefault")
+    .Attr("a: list(float) = []")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("ReservedAttr")
+    .Attr("range: int")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("AttrTypeDefault")
+    .Input("a: T")
+    .Attr("T: type = DT_INT32")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("AttrListTypeDefault")
+    .Input("a: N * T")
+    .Input("b: N * T")
+    .Attr("T: type = DT_INT32")
+    .Attr("N: int")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("NIntsIn")
+    .Input("a: N * int32")
+    .Attr("N: int >= 2")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("NPolymorphicIn")
+    .Input("a: N * T")
+    .Attr("T: type")
+    .Attr("N: int >= 2")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("NPolymorphicRestrictIn")
+    .Input("a: N * T")
+    .Attr("T: {string, bool}")
+    .Attr("N: int >= 2")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("NInTwice")
+    .Input("a: N * int32")
+    .Input("b: N * string")
+    .Attr("N: int >= 0")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("NInPolymorphicTwice")
+    .Input("a: N * T")
+    .Input("b: N * T")
+    .Attr("T: type")
+    .Attr("N: int >= 0")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("NInTwoTypeVariables")
+    .Input("a: N * S")
+    .Input("b: N * T")
+    .Attr("S: type")
+    .Attr("T: type")
+    .Attr("N: int >= 0")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("InPolymorphicTwice")
+    .Input("a: N * T")
+    .Input("b: M * T")
+    .Attr("T: type")
+    .Attr("N: int >= 0")
+    .Attr("M: int >= 0")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("NIntsOut")
+    .Output("a: N * int32")
+    .Attr("N: int >= 2")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("NIntsOutDefault")
+    .Output("a: N * int32")
+    .Attr("N: int >= 2 = 3")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("NPolymorphicOut")
+    .Output("a: N * T")
+    .Attr("T: type")
+    .Attr("N: int >= 2")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("NPolymorphicOutDefault")
+    .Output("a: N * T")
+    .Attr("T: type = DT_BOOL")
+    .Attr("N: int >= 2 = 2")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("NPolymorphicRestrictOut")
+    .Output("a: N * T")
+    .Attr("T: {string, bool}")
+    .Attr("N: int >= 2")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("RefIn")
+    .Input("a: Ref(T)")
+    .Attr("T: type")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("TwoRefsIn")
+    .Input("a: Ref(T)")
+    .Input("b: Ref(T)")
+    .Attr("T: type")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("RefOut")
+    .Output("a: Ref(T)")
+    .Attr("T: type")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("SimpleStruct")
+    .Output("a: n_a * int32")
+    .Attr("n_a: int >= 0")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("MixedStruct")
+    .Output("a: n_a * int32")
+    .Output("b: float")
+    .Attr("n_a: int >= 0")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("ComplexStruct")
+    .Output("a: n_a * int32")
+    .Output("b: n_b * int64")
+    .Output("c: t_c")
+    .Attr("n_a: int >= 0")
+    .Attr("n_b: int >= 0")
+    .Attr("t_c: list(type) >= 0")
+    .SetShapeFn(shape_inference::UnknownShape);
+
 }  // end namespace tensorflow
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 4c026590c2da8d5db9fda92ed7586729868ff895..310bd75d4ee6624617fa9e45d7f2c97f03f982e6 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -47,22 +47,29 @@ from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import device_lib
 from tensorflow.python.client import session
+from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
-from tensorflow.python.eager import tape
+from tensorflow.python.eager import tape  # pylint: disable=unused-import
 from tensorflow.python.framework import device as pydev
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import versions
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
+from tensorflow.python.util import nest
 from tensorflow.python.util.protobuf import compare
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("test.gpu_device_name")
 def gpu_device_name():
   """Returns the name of a GPU device if available or the empty string."""
   for x in device_lib.list_local_devices():
@@ -97,6 +104,7 @@ def assert_ops_in_graph(expected_ops, graph):
   return actual_ops
 
 
+@tf_export("test.assert_equal_graph_def")
 def assert_equal_graph_def(actual, expected, checkpoint_v2=False):
   """Asserts that two `GraphDef`s are (mostly) the same.
 
@@ -115,11 +123,11 @@ def assert_equal_graph_def(actual, expected, checkpoint_v2=False):
     TypeError: If either argument is not a `GraphDef`.
   """
   if not isinstance(actual, graph_pb2.GraphDef):
-    raise TypeError("Expected tf.GraphDef for actual, got %s" %
-                    type(actual).__name__)
+    raise TypeError(
+        "Expected tf.GraphDef for actual, got %s" % type(actual).__name__)
   if not isinstance(expected, graph_pb2.GraphDef):
-    raise TypeError("Expected tf.GraphDef for expected, got %s" %
-                    type(expected).__name__)
+    raise TypeError(
+        "Expected tf.GraphDef for expected, got %s" % type(expected).__name__)
 
   if checkpoint_v2:
     _strip_checkpoint_v2_randomized(actual)
@@ -144,11 +152,10 @@ def assert_meta_graph_protos_equal(tester, a, b):
       a_proto = proto_type()
       b_proto = proto_type()
       # Number of entries in the collections is the same
-      tester.assertEqual(len(a_value.bytes_list.value),
-                         len(b_value.bytes_list.value))
-      for (a_value_item, b_value_item) in zip(
-          a_value.bytes_list.value,
-          b_value.bytes_list.value):
+      tester.assertEqual(
+          len(a_value.bytes_list.value), len(b_value.bytes_list.value))
+      for (a_value_item, b_value_item) in zip(a_value.bytes_list.value,
+                                              b_value.bytes_list.value):
         a_proto.ParseFromString(a_value_item)
         b_proto.ParseFromString(b_value_item)
         tester.assertProtoEquals(a_proto, b_proto)
@@ -158,6 +165,16 @@ def assert_meta_graph_protos_equal(tester, a, b):
   # proto comparison below.
   a.ClearField("collection_def")
   b.ClearField("collection_def")
+
+  # Check the graph_defs.
+  assert_equal_graph_def(a.graph_def, b.graph_def, checkpoint_v2=True)
+  # Check graph_def versions (ignored by assert_equal_graph_def).
+  tester.assertProtoEquals(a.graph_def.versions, b.graph_def.versions)
+  # Compared the fields directly, remove their raw values from the
+  # proto comparison below.
+  a.ClearField("graph_def")
+  b.ClearField("graph_def")
+
   tester.assertProtoEquals(a, b)
 
 
@@ -174,7 +191,7 @@ def _strip_checkpoint_v2_randomized(graph_def):
       if attr_tensor_value and len(attr_tensor_value.string_val) == 1:
         attr_tensor_string_value = attr_tensor_value.string_val[0]
         if (attr_tensor_string_value and
-            re.match(_SHARDED_SAVE_OP_PATTERN, attr_tensor_string_value)):
+            re.match(_SHARDED_SAVE_OP_PATTERN, str(attr_tensor_string_value))):
           delete_keys.append(attr_key)
     for attr_key in delete_keys:
       del node.attr[attr_key]
@@ -188,6 +205,10 @@ def CudaSupportsHalfMatMulAndConv():
   return pywrap_tensorflow.CudaSupportsHalfMatMulAndConv()
 
 
+def InstallStackTraceHandler():
+  pywrap_tensorflow.InstallStacktraceHandler()
+
+
 def NHWCToNCHW(input_tensor):
   """Converts the input from the NHWC format to NCHW.
 
@@ -198,10 +219,7 @@ def NHWCToNCHW(input_tensor):
     converted tensor or shape array
   """
   # tensor dim -> new axis order
-  new_axes = {
-      4: [0, 3, 1, 2],
-      5: [0, 4, 1, 2, 3]
-  }
+  new_axes = {4: [0, 3, 1, 2], 5: [0, 4, 1, 2, 3]}
   if isinstance(input_tensor, ops.Tensor):
     ndims = input_tensor.shape.ndims
     return array_ops.transpose(input_tensor, new_axes[ndims])
@@ -228,8 +246,9 @@ def NHWCToNCHW_VECT_C(input_shape_or_tensor):
   """
   permutations = {5: [0, 3, 1, 2, 4], 6: [0, 4, 1, 2, 3, 5]}
   is_tensor = isinstance(input_shape_or_tensor, ops.Tensor)
-  temp_shape = (input_shape_or_tensor.shape.as_list()
-                if is_tensor else input_shape_or_tensor)
+  temp_shape = (
+      input_shape_or_tensor.shape.as_list()
+      if is_tensor else input_shape_or_tensor)
   if temp_shape[-1] % 4 != 0:
     raise ValueError(
         "Last dimension of input must be evenly divisible by 4 to convert to "
@@ -261,8 +280,9 @@ def NCHW_VECT_CToNHWC(input_shape_or_tensor):
   """
   permutations = {5: [0, 2, 3, 1, 4], 6: [0, 2, 3, 4, 1, 5]}
   is_tensor = isinstance(input_shape_or_tensor, ops.Tensor)
-  input_shape = (input_shape_or_tensor.shape.as_list()
-                 if is_tensor else input_shape_or_tensor)
+  input_shape = (
+      input_shape_or_tensor.shape.as_list()
+      if is_tensor else input_shape_or_tensor)
   if input_shape[-1] != 4:
     raise ValueError("Last dimension of NCHW_VECT_C must be 4.")
   permutation = permutations[len(input_shape)]
@@ -285,10 +305,7 @@ def NCHWToNHWC(input_tensor):
     converted tensor or shape array
   """
   # tensor dim -> new axis order
-  new_axes = {
-      4: [0, 2, 3, 1],
-      5: [0, 2, 3, 4, 1]
-  }
+  new_axes = {4: [0, 2, 3, 1], 5: [0, 2, 3, 4, 1]}
   if isinstance(input_tensor, ops.Tensor):
     ndims = input_tensor.shape.ndims
     return array_ops.transpose(input_tensor, new_axes[ndims])
@@ -303,10 +320,17 @@ def _use_c_api_wrapper(fn, use_c_api, *args, **kwargs):
   prev_value = ops._USE_C_API
   ops._USE_C_API = use_c_api
   try:
-    with ops.Graph().as_default():
-      fn(*args, **kwargs)
+    # Reset the default graph so it has the C API enabled. We call
+    # reset_default_graph() instead of creating a new default Graph context to
+    # make this robust to tests that call reset_default_graph(), which requires
+    # that the current default graph isn't nested.
+    ops.reset_default_graph()
+    fn(*args, **kwargs)
   finally:
     ops._USE_C_API = prev_value
+    # Make sure default graph reflects prev_value in case next test doesn't call
+    # reset_default_graph().
+    ops.reset_default_graph()
 # pylint: disable=protected-access
 
 
@@ -323,7 +347,9 @@ def skip_if(condition):
   Returns:
     The wrapped function
   """
+
   def real_skip_if(fn):
+
     def wrapper(*args, **kwargs):
       if callable(condition):
         skip = condition()
@@ -331,7 +357,9 @@ def skip_if(condition):
         skip = condition
       if not skip:
         fn(*args, **kwargs)
+
     return wrapper
+
   return real_skip_if
 
 
@@ -348,8 +376,10 @@ def disable_c_api(fn):
   Returns:
     The wrapped function
   """
+
   def wrapper(*args, **kwargs):
     _use_c_api_wrapper(fn, False, *args, **kwargs)
+
   return wrapper
 
 
@@ -366,8 +396,10 @@ def enable_c_api(fn):
   Returns:
     The wrapped function
   """
+
   def wrapper(*args, **kwargs):
     _use_c_api_wrapper(fn, True, *args, **kwargs)
+
   return wrapper
 
 
@@ -393,64 +425,58 @@ def with_c_api(cls):
   return cls
 
 
-class IsolateTest(object):
-  """A context manager which isolates resources in its block.
+def assert_no_new_tensors(f):
+  """Decorator for asserting that no new Tensors persist after a test.
 
-  Provides an Eager-agnostic abstraction for preventing the sharing of
-  variables and other resources.
+  Mainly useful for checking that code using the Python C API has correctly
+  manipulated reference counts.
 
-  In graph mode, resource handle ops are only executed in a particular Session,
-  isolating them from resources with the same name in other Graphs. In Eager,
-  separate Sessions do not exist, so resources (particularly ResourceVariables)
-  would be shared implicitly if a resource of the same name were created
-  anywhere in a Python process. Multiple handles to the same resource would
-  cause several issues, and so this type of sharing will raise an exception.
+  Clears the caches that it knows about, runs the garbage collector, then checks
+  that there are no Tensor or Tensor-like objects still around. This includes
+  Tensors to which something still has a reference (e.g. from missing
+  Py_DECREFs) and uncollectable cycles (i.e. Python reference cycles where one
+  of the objects has __del__ defined).
 
-  Using resources with the same name in a single Python process may be useful
-  (especially for unit tests), so this context manager provides an abstraction
-  for isolating resources. Using a resource created in one Isolation environment
-  in another is an error.
-
-  Example usage in Eager mode:
-
-  ```python
-  import tensorflow as tf
-  # Import subject to change
-  from tensorflow.contrib.eager.python import tfe
-
-  tfe.enable_eager_execution()
+  Args:
+    f: The test case to run.
+  Returns:
+    The decorated test case.
+  """
 
-  for hyperparameter in [1, 2, 3]:
-    with tfe.IsolateTest():
-      v = tfe.Variable(name="v", initial_value=hyperparameter)
-      # train model, test results ...
-  ```
+  def decorator(self, **kwargs):
+    """Finds existing Tensors, runs the test, checks for new Tensors."""
 
-  IsolateTest is currently exposed through contrib.eager, but it creates a new
-  default Graph and provides equivalent safety in graph mode.
-  """
+    def _is_tensor(obj):
+      try:
+        return (isinstance(obj, ops.Tensor) or
+                isinstance(obj, variables.Variable))
+      except ReferenceError:
+        # If the object no longer exists, we don't care about it.
+        return False
+
+    tensors_before = set(id(obj) for obj in gc.get_objects() if _is_tensor(obj))
+    outside_graph_key = ops.get_default_graph()._graph_key
+    with ops.Graph().as_default():
+      # Run the test in a new graph so that collections get cleared when it's
+      # done, but inherit the graph key so optimizers behave.
+      ops.get_default_graph()._graph_key = outside_graph_key
+      f(self, **kwargs)
+    # Make an effort to clear caches, which would otherwise look like leaked
+    # Tensors.
+    backprop._zeros_cache.flush()
+    context.get_default_context().scalar_cache().clear()
+    gc.collect()
+    tensors_after = [
+        obj for obj in gc.get_objects()
+        if _is_tensor(obj) and id(obj) not in tensors_before
+    ]
+    if tensors_after:
+      raise AssertionError(("%d Tensors not deallocated after test: %s" % (
+          len(tensors_after),
+          str(tensors_after),
+      )))
 
-  def __init__(self):
-    if context.in_eager_mode() and tape.could_possibly_record():
-      raise ValueError("Cannot isolate Eager execution with an active tape.")
-    # In Eager, Graphs set a container which isolates resources, and maintain a
-    # VariableStore which caches ResourceVariable objects created through
-    # get_variable. So setting the default Graph has the side effect of
-    # isolating Eager resources.
-    with context.eager_mode():
-      # Create the graph in Eager mode, as this provides stricter semantics
-      # (i.e. has a unique container prefix). This prevents implicit sharing
-      # when a Graph-mode graph is created and then Eager mode is enabled (an
-      # error through enable_eager_execution, but common with context managers
-      # in unit tests).
-      self._graph_as_default_context_manager = ops.Graph().as_default()
-
-  def __enter__(self):
-    self._graph_as_default_context_manager.__enter__()
-
-  def __exit__(self, type_arg, value_arg, traceback_arg):
-    return self._graph_as_default_context_manager.__exit__(
-        type_arg, value_arg, traceback_arg)
+  return decorator
 
 
 def assert_no_garbage_created(f):
@@ -483,13 +509,17 @@ def assert_no_garbage_created(f):
     # not hold on to every object in other tests.
     gc.set_debug(previous_debug_flags)
     gc.enable()
+
   return decorator
 
 
-def run_in_graph_and_eager_modes(
-    __unused__=None, graph=None, config=None,
-    use_gpu=False, force_gpu=False,
-    reset_test=True, assert_no_eager_garbage=False):
+def run_in_graph_and_eager_modes(__unused__=None,
+                                 graph=None,
+                                 config=None,
+                                 use_gpu=False,
+                                 force_gpu=False,
+                                 reset_test=True,
+                                 assert_no_eager_garbage=False):
   """Runs the test in both graph and eager modes.
 
   Args:
@@ -507,7 +537,8 @@ def run_in_graph_and_eager_modes(
       garbage for legitimate reasons (e.g. they define a class which inherits
       from `object`), and because DEBUG_SAVEALL is sticky in some Python
       interpreters (meaning that tests which rely on objects being collected
-      elsewhere in the unit test file will not work).
+      elsewhere in the unit test file will not work). Additionally, checks that
+      nothing still has a reference to Tensors that the test allocated.
   Returns:
     Returns a decorator that will run the decorated test function
         using both a graph and using eager execution.
@@ -517,6 +548,7 @@ def run_in_graph_and_eager_modes(
 
   def decorator(f):
     """Test method decorator."""
+
     def decorated(self, **kwargs):
       """Decorated the test method."""
       with context.graph_mode():
@@ -544,16 +576,19 @@ def run_in_graph_and_eager_modes(
             f(self, **kwargs)
 
       if assert_no_eager_garbage:
-        run_eager_mode = assert_no_garbage_created(run_eager_mode)
+        run_eager_mode = assert_no_new_tensors(
+            assert_no_garbage_created(run_eager_mode))
 
       with context.eager_mode():
-        with IsolateTest():
+        with ops.Graph().as_default():
           run_eager_mode(self, **kwargs)
 
     return decorated
+
   return decorator
 
 
+@tf_export("test.is_gpu_available")
 def is_gpu_available(cuda_only=False, min_cuda_compute_capability=None):
   """Returns whether TensorFlow can access a GPU.
 
@@ -602,6 +637,7 @@ def device(use_gpu):
     yield
 
 
+@tf_export("test.TestCase")
 class TensorFlowTestCase(googletest.TestCase):
   """Base class for tests that need to test TensorFlow.
   """
@@ -655,7 +691,7 @@ class TensorFlowTestCase(googletest.TestCase):
       self._tempdir = tempfile.mkdtemp(dir=googletest.GetTempDir())
     return self._tempdir
 
-  def _AssertProtoEquals(self, a, b):
+  def _AssertProtoEquals(self, a, b, msg=None):
     """Asserts that a and b are the same proto.
 
     Uses ProtoEq() first, as it returns correct results
@@ -665,11 +701,12 @@ class TensorFlowTestCase(googletest.TestCase):
     Args:
       a: a proto.
       b: another proto.
+      msg: Optional message to report on failure.
     """
     if not compare.ProtoEq(a, b):
-      compare.assertProtoEqual(self, a, b, normalize_numbers=True)
+      compare.assertProtoEqual(self, a, b, normalize_numbers=True, msg=msg)
 
-  def assertProtoEquals(self, expected_message_maybe_ascii, message):
+  def assertProtoEquals(self, expected_message_maybe_ascii, message, msg=None):
     """Asserts that message is same as parsed expected_message_ascii.
 
     Creates another prototype of message, reads the ascii message into it and
@@ -678,29 +715,33 @@ class TensorFlowTestCase(googletest.TestCase):
     Args:
       expected_message_maybe_ascii: proto message in original or ascii form.
       message: the message to validate.
+      msg: Optional message to report on failure.
     """
-
+    msg = msg if msg else ""
     if isinstance(expected_message_maybe_ascii, type(message)):
       expected_message = expected_message_maybe_ascii
       self._AssertProtoEquals(expected_message, message)
     elif isinstance(expected_message_maybe_ascii, str):
       expected_message = type(message)()
-      text_format.Merge(expected_message_maybe_ascii, expected_message,
-                        descriptor_pool=descriptor_pool.Default())
-      self._AssertProtoEquals(expected_message, message)
+      text_format.Merge(
+          expected_message_maybe_ascii,
+          expected_message,
+          descriptor_pool=descriptor_pool.Default())
+      self._AssertProtoEquals(expected_message, message, msg=msg)
     else:
-      assert False, ("Can't compare protos of type %s and %s" %
-                     (type(expected_message_maybe_ascii), type(message)))
+      assert False, ("Can't compare protos of type %s and %s. %s" %
+                     (type(expected_message_maybe_ascii), type(message), msg))
 
   def assertProtoEqualsVersion(
       self,
       expected,
       actual,
       producer=versions.GRAPH_DEF_VERSION,
-      min_consumer=versions.GRAPH_DEF_VERSION_MIN_CONSUMER):
+      min_consumer=versions.GRAPH_DEF_VERSION_MIN_CONSUMER,
+      msg=None):
     expected = "versions { producer: %d min_consumer: %d };\n%s" % (
         producer, min_consumer, expected)
-    self.assertProtoEquals(expected, actual)
+    self.assertProtoEquals(expected, actual, msg=msg)
 
   def assertStartsWith(self, actual, expected_start, msg=None):
     """Assert that actual.startswith(expected_start) is True.
@@ -715,25 +756,22 @@ class TensorFlowTestCase(googletest.TestCase):
       fail_msg += " : %r" % (msg) if msg else ""
       self.fail(fail_msg)
 
-  def _eval_helper(self, tensors):
-    if isinstance(tensors, ops.EagerTensor):
-      return tensors.numpy()
-    if isinstance(tensors, resource_variable_ops.ResourceVariable):
-      return tensors.read_value().numpy()
-
-    if isinstance(tensors, tuple):
-      return tuple([self._eval_helper(t) for t in tensors])
-    elif isinstance(tensors, list):
-      return [self._eval_helper(t) for t in tensors]
-    elif isinstance(tensors, dict):
-      assert not tensors, "Only support empty dict now."
-      return dict()
-    elif tensors is None:
+  def _eval_tensor(self, tensor):
+    if tensor is None:
       return None
-    elif callable(tensors):
-      return self._eval_helper(tensors())
+    elif isinstance(tensor, ops.EagerTensor):
+      return tensor.numpy()
+    elif isinstance(tensor, resource_variable_ops.ResourceVariable):
+      return tensor.read_value().numpy()
+    elif callable(tensor):
+      return self._eval_helper(tensor())
     else:
-      raise ValueError("Unsupported type %s." % type(tensors))
+      raise ValueError("Unsupported type %s." % type(tensor))
+
+  def _eval_helper(self, tensors):
+    if tensors is None:
+      return None
+    return nest.map_structure(self._eval_tensor, tensors)
 
   def evaluate(self, tensors):
     """Evaluates tensors and returns numpy values.
@@ -773,7 +811,8 @@ class TensorFlowTestCase(googletest.TestCase):
     trigger the creation of a new session.
 
     Use the `use_gpu` and `force_gpu` options to control where ops are run. If
-    `force_gpu` is True, all ops are pinned to `/device:GPU:0`. Otherwise, if `use_gpu`
+    `force_gpu` is True, all ops are pinned to `/device:GPU:0`. Otherwise, if
+    `use_gpu`
     is True, TensorFlow tries to run as many ops on the GPU as possible. If both
     `force_gpu and `use_gpu` are False, all ops are pinned to the CPU.
 
@@ -972,6 +1011,7 @@ class TensorFlowTestCase(googletest.TestCase):
     self._threads.append(ret)
     return ret
 
+
 # pylint: enable=invalid-name
 
   def assertNear(self, f1, f2, err, msg=None):
@@ -991,7 +1031,7 @@ class TensorFlowTestCase(googletest.TestCase):
                     "%f != %f +/- %f%s" % (f1, f2, err, " (%s)" % msg
                                            if msg is not None else ""))
 
-  def assertArrayNear(self, farray1, farray2, err):
+  def assertArrayNear(self, farray1, farray2, err, msg=None):
     """Asserts that two float arrays are near each other.
 
     Checks that for all elements of farray1 and farray2
@@ -1001,23 +1041,25 @@ class TensorFlowTestCase(googletest.TestCase):
       farray1: a list of float values.
       farray2: a list of float values.
       err: a float value.
+      msg: Optional message to report on failure.
     """
-    self.assertEqual(len(farray1), len(farray2))
+    self.assertEqual(len(farray1), len(farray2), msg=msg)
     for f1, f2 in zip(farray1, farray2):
-      self.assertNear(float(f1), float(f2), err)
+      self.assertNear(float(f1), float(f2), err, msg=msg)
 
   def _NDArrayNear(self, ndarray1, ndarray2, err):
     return np.linalg.norm(ndarray1 - ndarray2) < err
 
-  def assertNDArrayNear(self, ndarray1, ndarray2, err):
+  def assertNDArrayNear(self, ndarray1, ndarray2, err, msg=None):
     """Asserts that two numpy arrays have near values.
 
     Args:
       ndarray1: a numpy ndarray.
       ndarray2: a numpy ndarray.
       err: a float. The maximum absolute difference allowed.
+      msg: Optional message to report on failure.
     """
-    self.assertTrue(self._NDArrayNear(ndarray1, ndarray2, err))
+    self.assertTrue(self._NDArrayNear(ndarray1, ndarray2, err), msg=msg)
 
   def _GetNdArray(self, a):
     if not isinstance(a, np.ndarray):
@@ -1039,7 +1081,8 @@ class TensorFlowTestCase(googletest.TestCase):
       # the absolute difference between a and b.  Here, we want to
       # print out which elements violate such conditions.
       cond = np.logical_or(
-          np.abs(a - b) > atol + rtol * np.abs(b), np.isnan(a) != np.isnan(b))
+          np.abs(a - b) > atol + rtol * np.abs(b),
+          np.isnan(a) != np.isnan(b))
       if a.ndim:
         x = a[np.where(cond)]
         y = b[np.where(cond)]
@@ -1052,37 +1095,99 @@ class TensorFlowTestCase(googletest.TestCase):
       print("not close dif = ", np.abs(x - y))
       print("not close tol = ", atol + rtol * np.abs(y))
       print("dtype = %s, shape = %s" % (a.dtype, a.shape))
-      np.testing.assert_allclose(a, b, rtol=rtol, atol=atol, err_msg=msg)
-
-  def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6):
-    """Asserts that two numpy arrays, or dicts of same, have near values.
-
-    This does not support nested dicts.
+      # TODO(xpan): There seems to be a bug:
+      # tensorflow/compiler/tests:binary_ops_test pass with float32
+      # nan even though the equal_nan is False by default internally.
+      np.testing.assert_allclose(
+          a, b, rtol=rtol, atol=atol, err_msg=msg, equal_nan=True)
+
+  def _assertAllCloseRecursive(self, a, b, rtol=1e-6, atol=1e-6, path=None,
+                               msg=None):
+    path = path or []
+    path_str = (("[" + "][".join([str(p) for p in path]) + "]") if path else "")
+    msg = msg if msg else ""
+
+    # Check if a and/or b are namedtuples.
+    if hasattr(a, "_asdict"):
+      a = a._asdict()
+    if hasattr(b, "_asdict"):
+      b = b._asdict()
+    a_is_dict = isinstance(a, dict)
+    if a_is_dict != isinstance(b, dict):
+      raise ValueError("Can't compare dict to non-dict, a%s vs b%s. %s" %
+                       (path_str, path_str, msg))
+    if a_is_dict:
+      self.assertItemsEqual(
+          a.keys(),
+          b.keys(),
+          msg="mismatched keys: a%s has keys %s, but b%s has keys %s. %s" %
+          (path_str, a.keys(), path_str, b.keys(), msg))
+      for k in a:
+        path.append(k)
+        self._assertAllCloseRecursive(
+            a[k], b[k], rtol=rtol, atol=atol, path=path, msg=msg)
+        del path[-1]
+    elif isinstance(a, (list, tuple)):
+      # Try to directly compare a, b as ndarrays; if not work, then traverse
+      # through the sequence, which is more expensive.
+      try:
+        a_as_ndarray = np.array(a)
+        b_as_ndarray = np.array(b)
+        self._assertArrayLikeAllClose(
+            a_as_ndarray,
+            b_as_ndarray,
+            rtol=rtol,
+            atol=atol,
+            msg="Mismatched value: a%s is different from b%s. %s" %
+            (path_str, path_str, msg))
+      except (ValueError, TypeError) as e:
+        if len(a) != len(b):
+          raise ValueError(
+              "Mismatched length: a%s has %d items, but b%s has %d items. %s" %
+              (path_str, len(a), path_str, len(b), msg))
+        for idx, (a_ele, b_ele) in enumerate(zip(a, b)):
+          path.append(str(idx))
+          self._assertAllCloseRecursive(
+              a_ele, b_ele, rtol=rtol, atol=atol, path=path, msg=msg)
+          del path[-1]
+    # a and b are ndarray like objects
+    else:
+      try:
+        self._assertArrayLikeAllClose(
+            a,
+            b,
+            rtol=rtol,
+            atol=atol,
+            msg="Mismatched value: a%s is different from b%s." % (path_str,
+                                                                  path_str))
+      except TypeError as e:
+        msg = "Error: a%s has %s, but b%s has %s" % (
+            path_str, type(a), path_str, type(b))
+        e.args = ((e.args[0] + ' : ' + msg,) + e.args[1:])
+        raise
+
+  def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6, msg=None):
+    """Asserts that two structures of numpy arrays, have near values.
+
+    `a` and `b` can be arbitrarily nested structures. A layer of a nested
+    structure can be a `dict`, `namedtuple`, `tuple` or `list`.
 
     Args:
-      a: The expected numpy ndarray (or anything can be converted to one), or
-        dict of same. Must be a dict iff `b` is a dict.
-      b: The actual numpy ndarray (or anything can be converted to one), or
-        dict of same. Must be a dict iff `a` is a dict.
+      a: The expected numpy `ndarray`, or anything that can be converted into a
+          numpy `ndarray`, or any arbitrarily nested of structure of these.
+      b: The actual numpy `ndarray`, or anything that can be converted into a
+          numpy `ndarray`, or any arbitrarily nested of structure of these.
       rtol: relative tolerance.
       atol: absolute tolerance.
+      msg: Optional message to report on failure.
 
     Raises:
-      ValueError: if only one of `a` and `b` is a dict.
+      ValueError: if only one of `a[p]` and `b[p]` is a dict or
+          `a[p]` and `b[p]` have different length, where `[p]` denotes a path
+          to the nested structure, e.g. given `a = [(1, 1), {'d': (6, 7)}]` and
+          `[p] = [1]['d']`, then `a[p] = (6, 7)`.
     """
-    is_a_dict = isinstance(a, dict)
-    if is_a_dict != isinstance(b, dict):
-      raise ValueError("Can't compare dict to non-dict, %s vs %s." % (a, b))
-    if is_a_dict:
-      self.assertItemsEqual(
-          a.keys(), b.keys(),
-          msg="mismatched keys, expected %s, got %s" % (a.keys(), b.keys()))
-      for k in a:
-        self._assertArrayLikeAllClose(
-            a[k], b[k], rtol=rtol, atol=atol,
-            msg="%s: expected %s, got %s." % (k, a, b))
-    else:
-      self._assertArrayLikeAllClose(a, b, rtol=rtol, atol=atol)
+    self._assertAllCloseRecursive(a, b, rtol=rtol, atol=atol, msg=msg)
 
   def assertAllCloseAccordingToType(self,
                                     a,
@@ -1092,7 +1197,10 @@ class TensorFlowTestCase(googletest.TestCase):
                                     float_rtol=1e-6,
                                     float_atol=1e-6,
                                     half_rtol=1e-3,
-                                    half_atol=1e-3):
+                                    half_atol=1e-3,
+                                    bfloat16_rtol=1e-2,
+                                    bfloat16_atol=1e-2,
+                                    msg=None):
     """Like assertAllClose, but also suitable for comparing fp16 arrays.
 
     In particular, the tolerance is reduced to 1e-3 if at least
@@ -1107,9 +1215,13 @@ class TensorFlowTestCase(googletest.TestCase):
       float_atol: absolute tolerance for float32.
       half_rtol: relative tolerance for float16.
       half_atol: absolute tolerance for float16.
+      bfloat16_rtol: relative tolerance for bfloat16.
+      bfloat16_atol: absolute tolerance for bfloat16.
+      msg: Optional message to report on failure.
     """
     a = self._GetNdArray(a)
     b = self._GetNdArray(b)
+    # types with lower tol are put later to overwrite previous ones.
     if (a.dtype == np.float32 or b.dtype == np.float32 or
         a.dtype == np.complex64 or b.dtype == np.complex64):
       rtol = max(rtol, float_rtol)
@@ -1117,20 +1229,26 @@ class TensorFlowTestCase(googletest.TestCase):
     if a.dtype == np.float16 or b.dtype == np.float16:
       rtol = max(rtol, half_rtol)
       atol = max(atol, half_atol)
+    if (a.dtype == dtypes.bfloat16.as_numpy_dtype or
+        b.dtype == dtypes.bfloat16.as_numpy_dtype):
+      rtol = max(rtol, bfloat16_rtol)
+      atol = max(atol, bfloat16_atol)
 
-    self.assertAllClose(a, b, rtol=rtol, atol=atol)
+    self.assertAllClose(a, b, rtol=rtol, atol=atol, msg=msg)
 
-  def assertAllEqual(self, a, b):
+  def assertAllEqual(self, a, b, msg=None):
     """Asserts that two numpy arrays have the same values.
 
     Args:
       a: the expected numpy ndarray or anything can be converted to one.
       b: the actual numpy ndarray or anything can be converted to one.
+      msg: Optional message to report on failure.
     """
+    msg = msg if msg else ""
     a = self._GetNdArray(a)
     b = self._GetNdArray(b)
-    self.assertEqual(a.shape, b.shape, "Shape mismatch: expected %s, got %s." %
-                     (a.shape, b.shape))
+    self.assertEqual(a.shape, b.shape, "Shape mismatch: expected %s, got %s."
+                                       " %s" % (a.shape, b.shape, msg))
     same = (a == b)
 
     if a.dtype == np.float32 or a.dtype == np.float64:
@@ -1147,7 +1265,7 @@ class TensorFlowTestCase(googletest.TestCase):
         x, y = a, b
       print("not equal lhs = ", x)
       print("not equal rhs = ", y)
-      np.testing.assert_array_equal(a, b)
+      np.testing.assert_array_equal(a, b, err_msg=msg)
 
   # pylint: disable=g-doc-return-or-yield
   @contextlib.contextmanager
@@ -1197,12 +1315,13 @@ class TensorFlowTestCase(googletest.TestCase):
     return self.assertRaisesWithPredicateMatch(errors.OpError,
                                                expected_err_re_or_predicate)
 
-  def assertShapeEqual(self, np_array, tf_tensor):
+  def assertShapeEqual(self, np_array, tf_tensor, msg=None):
     """Asserts that a Numpy ndarray and a TensorFlow tensor have the same shape.
 
     Args:
       np_array: A Numpy ndarray or Numpy scalar.
       tf_tensor: A Tensor.
+      msg: Optional message to report on failure.
 
     Raises:
       TypeError: If the arguments have the wrong type.
@@ -1211,19 +1330,22 @@ class TensorFlowTestCase(googletest.TestCase):
       raise TypeError("np_array must be a Numpy ndarray or Numpy scalar")
     if not isinstance(tf_tensor, ops.Tensor):
       raise TypeError("tf_tensor must be a Tensor")
-    self.assertAllEqual(np_array.shape, tf_tensor.get_shape().as_list())
+    self.assertAllEqual(np_array.shape, tf_tensor.get_shape().as_list(),
+                        msg=msg)
 
-  def assertDeviceEqual(self, device1, device2):
+  def assertDeviceEqual(self, device1, device2, msg=None):
     """Asserts that the two given devices are the same.
 
     Args:
       device1: A string device name or TensorFlow `DeviceSpec` object.
       device2: A string device name or TensorFlow `DeviceSpec` object.
+      msg: Optional message to report on failure.
     """
     device1 = pydev.canonical_name(device1)
     device2 = pydev.canonical_name(device2)
     self.assertEqual(device1, device2,
-                     "Devices %s and %s are not equal" % (device1, device2))
+                     "Devices %s and %s are not equal. %s" % 
+                     (device1, device2, msg))
 
   # Fix Python 3 compatibility issues
   if six.PY3:
@@ -1238,8 +1360,12 @@ class TensorFlowTestCase(googletest.TestCase):
     # pylint: enable=invalid-name
 
 
-def create_local_cluster(num_workers, num_ps, protocol="grpc",
-                         worker_config=None, ps_config=None):
+@tf_export("test.create_local_cluster")
+def create_local_cluster(num_workers,
+                         num_ps,
+                         protocol="grpc",
+                         worker_config=None,
+                         ps_config=None):
   """Create and start local servers and return the associated `Server` objects.
 
   Example:
@@ -1289,15 +1415,50 @@ def create_local_cluster(num_workers, num_ps, protocol="grpc",
 
   workers = [
       server_lib.Server(
-          cs, job_name="worker", protocol=protocol, task_index=ix,
-          config=worker_config, start=True)
-      for ix in range(num_workers)
+          cs,
+          job_name="worker",
+          protocol=protocol,
+          task_index=ix,
+          config=worker_config,
+          start=True) for ix in range(num_workers)
   ]
   ps_servers = [
       server_lib.Server(
-          cs, job_name="ps", protocol=protocol, task_index=ix,
-          config=ps_config, start=True)
-      for ix in range(num_ps)
+          cs,
+          job_name="ps",
+          protocol=protocol,
+          task_index=ix,
+          config=ps_config,
+          start=True) for ix in range(num_ps)
   ]
 
   return workers, ps_servers
+
+
+def get_node_def_from_graph(node_name, graph_def):
+  """Returns the `NodeDef` instance for given node name in the graph def.
+
+  This method explores only the NodeDefs in `graph_def.node`.
+
+  Args:
+    node_name: Name of the NodeDef to search for.
+    graph_def: An instance of `GraphDef` proto.
+
+  Returns:
+    the `NodeDef` instance whose name field matches the given node_name or None.
+  """
+  for node_def in graph_def.node:
+    if node_def.name == node_name:
+      return node_def
+  return None
+
+
+def set_producer_version(graph, producer_version):
+  """Sets graph.graph_def_versions.producer to `producer_version`."""
+  # The C API doesn't expose altering GraphDefVersions. We can indirectly set
+  # it via import_graph_def though.
+  graph_def = graph_pb2.GraphDef()
+  graph_def.versions.producer = producer_version
+  with graph.as_default():
+    importer.import_graph_def(graph_def)
+  assert graph.graph_def_versions.producer, producer_version
diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py
index 9aed3457a60c97867a03bd75aa9357fe93c26cc5..a717eb39513ac3369ae133b6090ff82597f12eb7 100644
--- a/tensorflow/python/framework/test_util_test.py
+++ b/tensorflow/python/framework/test_util_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+import copy
 import random
 import threading
 
@@ -27,19 +29,19 @@ from google.protobuf import text_format
 
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
-from tensorflow.python.client import session
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_ops  # pylint: disable=unused-import
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
+@test_util.with_c_api
 class TestUtilTest(test_util.TensorFlowTestCase):
 
   def test_assert_ops_in_graph(self):
@@ -184,8 +186,8 @@ class TestUtilTest(test_util.TensorFlowTestCase):
   def _WeMustGoDeeper(self, msg):
     with self.assertRaisesOpError(msg):
       with ops.Graph().as_default():
-        node_def = ops._NodeDef("op_type", "name")
-        node_def_orig = ops._NodeDef("op_type_orig", "orig")
+        node_def = ops._NodeDef("IntOutput", "name")
+        node_def_orig = ops._NodeDef("IntOutput", "orig")
         op_orig = ops.Operation(node_def_orig, ops.get_default_graph())
         op = ops.Operation(node_def, ops.get_default_graph(),
                            original_op=op_orig)
@@ -210,6 +212,18 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(ValueError, r"Can't compare dict to non-dict"):
       self.assertAllClose({"a": 1}, 1)
 
+  def testAllCloseNamedtuples(self):
+    a = 7
+    b = (2., 3.)
+    c = np.ones((3, 2, 4)) * 7.
+    expected = {"a": a, "b": b, "c": c}
+    my_named_tuple = collections.namedtuple("MyNamedTuple", ["a", "b", "c"])
+
+    # Identity.
+    self.assertAllClose(expected, my_named_tuple(a=a, b=b, c=c))
+    self.assertAllClose(
+        my_named_tuple(a=a, b=b, c=c), my_named_tuple(a=a, b=b, c=c))
+
   def testAllCloseDicts(self):
     a = 7
     b = (2., 3.)
@@ -237,12 +251,30 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(AssertionError, r"Not equal to tolerance"):
       self.assertAllClose(expected, {"a": a, "b": b, "c": c_copy})
 
-  def testAllCloseNestedDicts(self):
-    a = {"a": 1, "b": 2, "nested": {"d": 3, "e": 4}}
-    with self.assertRaisesRegexp(
-        TypeError,
-        r"inputs could not be safely coerced to any supported types"):
-      self.assertAllClose(a, a)
+  def testAllCloseListOfNamedtuples(self):
+    my_named_tuple = collections.namedtuple("MyNamedTuple", ["x", "y"])
+    l1 = [
+        my_named_tuple(x=np.array([[2.3, 2.5]]), y=np.array([[0.97, 0.96]])),
+        my_named_tuple(x=np.array([[3.3, 3.5]]), y=np.array([[0.98, 0.99]]))
+    ]
+    l2 = [
+        ([[2.3, 2.5]], [[0.97, 0.96]]),
+        ([[3.3, 3.5]], [[0.98, 0.99]]),
+    ]
+    self.assertAllClose(l1, l2)
+
+  def testAllCloseNestedStructure(self):
+    a = {"x": np.ones((3, 2, 4)) * 7, "y": (2, [{"nested": {"m": 3, "n": 4}}])}
+    self.assertAllClose(a, a)
+
+    b = copy.deepcopy(a)
+    self.assertAllClose(a, b)
+
+    # Test mismatched values
+    b["y"][1][0]["nested"]["n"] = 4.2
+    with self.assertRaisesRegexp(AssertionError,
+                                 r"\[y\]\[1\]\[0\]\[nested\]\[n\]"):
+      self.assertAllClose(a, b)
 
   def testArrayNear(self):
     a = [1, 2]
@@ -267,6 +299,9 @@ class TestUtilTest(test_util.TensorFlowTestCase):
         control_flow_ops.Assert(x, y).run()
 
   def testAssertAllCloseAccordingToType(self):
+    # test plain int
+    self.assertAllCloseAccordingToType(1, 1, rtol=1e-8, atol=1e-8)
+
     # test float64
     self.assertAllCloseAccordingToType(
         np.asarray([1e-8], dtype=np.float64),
@@ -316,6 +351,10 @@ class TestUtilTest(test_util.TensorFlowTestCase):
       )
 
   def testRandomSeed(self):
+    # Call setUp again for WithCApi case (since it makes a new defeault graph
+    # after setup).
+    # TODO(skyewm): remove this when C API is permanently enabled.
+    self.setUp()
     a = random.randint(1, 1000)
     a_np_rand = np.random.rand(1)
     with self.test_session():
@@ -339,7 +378,25 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     with context.eager_mode():
       self.assertEqual(2, self.evaluate(model))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def test_nested_tensors_evaluate(self):
+    expected = {"a": 1, "b": 2, "nested": {"d": 3, "e": 4}}
+    nested = {"a": constant_op.constant(1),
+              "b": constant_op.constant(2),
+              "nested": {"d": constant_op.constant(3),
+                         "e": constant_op.constant(4)}}
 
+    self.assertEqual(expected, self.evaluate(nested))
+
+  def test_get_node_def_from_graph(self):
+    graph_def = graph_pb2.GraphDef()
+    node_foo = graph_def.node.add()
+    node_foo.name = "foo"
+    self.assertIs(test_util.get_node_def_from_graph("foo", graph_def), node_foo)
+    self.assertIsNone(test_util.get_node_def_from_graph("bar", graph_def))
+
+
+@test_util.with_c_api
 class GarbageCollectionTest(test_util.TensorFlowTestCase):
 
   def test_no_reference_cycle_decorator(self):
@@ -363,72 +420,26 @@ class GarbageCollectionTest(test_util.TensorFlowTestCase):
 
     ReferenceCycleTest().test_has_no_cycle()
 
+  def test_no_leaked_tensor_decorator(self):
 
-@test_util.with_c_api
-class IsolationTest(test_util.TensorFlowTestCase):
+    class LeakedTensorTest(object):
 
-  @test_util.run_in_graph_and_eager_modes()
-  def test_variable_reuse_exception(self):
-    with test_util.IsolateTest(), session.Session():
-      first_container_variable = resource_variable_ops.ResourceVariable(
-          name="first_container_variable",
-          initial_value=1)
-      if context.in_graph_mode():
-        self.evaluate([variables.global_variables_initializer()])
-    with test_util.IsolateTest():
-      if context.in_graph_mode():
-        with self.assertRaises(RuntimeError):
-          self.evaluate(first_container_variable.read_value())
-      else:
-        with self.assertRaises(ValueError):
-          first_container_variable.read_value()
+      def __init__(inner_self):  # pylint: disable=no-self-argument
+        inner_self.assertEqual = self.assertEqual  # pylint: disable=invalid-name
 
-  @test_util.run_in_graph_and_eager_modes()
-  def test_variable_reuse_exception_nested(self):
-    with test_util.IsolateTest(), session.Session():
-      first_container_variable = resource_variable_ops.ResourceVariable(
-          name="first_container_variable",
-          initial_value=1)
-      if context.in_graph_mode():
-        self.evaluate([variables.global_variables_initializer()])
-      with test_util.IsolateTest(), session.Session():
-        if context.in_graph_mode():
-          with self.assertRaises(RuntimeError):
-            self.evaluate(first_container_variable.read_value())
-        else:
-          with self.assertRaises(ValueError):
-            first_container_variable.read_value()
+      @test_util.assert_no_new_tensors
+      def test_has_leak(self):
+        self.a = constant_op.constant([3.])
+
+      @test_util.assert_no_new_tensors
+      def test_has_no_leak(self):
+        constant_op.constant([3.])
+
+    with self.assertRaisesRegexp(AssertionError, "Tensors not deallocated"):
+      LeakedTensorTest().test_has_leak()
+
+    LeakedTensorTest().test_has_no_leak()
 
-  @test_util.run_in_graph_and_eager_modes()
-  def test_no_sharing(self):
-    with test_util.IsolateTest(), session.Session():
-      first_container_variable = resource_variable_ops.ResourceVariable(
-          name="same_name",
-          initial_value=1)
-      if context.in_graph_mode():
-        self.evaluate([variables.global_variables_initializer()])
-      with test_util.IsolateTest(), session.Session():
-        second_container_variable = resource_variable_ops.ResourceVariable(
-            name="same_name",
-            initial_value=2)
-        if context.in_graph_mode():
-          self.evaluate([variables.global_variables_initializer()])
-        self.assertEqual(
-            2, self.evaluate(second_container_variable.read_value()))
-      self.assertEqual(1, self.evaluate(first_container_variable.read_value()))
-
-  def test_graph_mode_isolation(self):
-    with context.graph_mode():
-      # Even if we've (accidentally) called IsolateTest in Graph mode, it should
-      # provide Eager isolation.
-      with test_util.IsolateTest():
-        with context.eager_mode():
-          first_container_variable = resource_variable_ops.ResourceVariable(
-              name="first_container_variable",
-              initial_value=1)
-      with context.eager_mode():
-        with self.assertRaises(ValueError):
-          first_container_variable.read_value()
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/framework/versions.py b/tensorflow/python/framework/versions.py
index 81529e2b1e06e70fb2839c037c555ef41bcdd291..06955b885852a641bc814f88c99838effe03bfd4 100644
--- a/tensorflow/python/framework/versions.py
+++ b/tensorflow/python/framework/versions.py
@@ -20,28 +20,42 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.util.tf_export import tf_export
 
 __version__ = pywrap_tensorflow.__version__
 __git_version__ = pywrap_tensorflow.__git_version__
 __compiler_version__ = pywrap_tensorflow.__compiler_version__
 __cxx11_abi_flag__ = pywrap_tensorflow.__cxx11_abi_flag__
+__monolithic_build__ = pywrap_tensorflow.__monolithic_build__
 
 VERSION = __version__
+tf_export("VERSION").export_constant(__name__, "VERSION")
 GIT_VERSION = __git_version__
+tf_export("GIT_VERSION").export_constant(__name__, "GIT_VERSION")
 COMPILER_VERSION = __compiler_version__
+tf_export("COMPILER_VERSION").export_constant(__name__, "COMPILER_VERSION")
 CXX11_ABI_FLAG = __cxx11_abi_flag__
+tf_export("CXX11_ABI_FLAG").export_constant(__name__, "CXX11_ABI_FLAG")
+MONOLITHIC_BUILD = __monolithic_build__
+tf_export("MONOLITHIC_BUILD").export_constant(__name__, "MONOLITHIC_BUILD")
 
 GRAPH_DEF_VERSION = pywrap_tensorflow.GRAPH_DEF_VERSION
+tf_export("GRAPH_DEF_VERSION").export_constant(__name__, "GRAPH_DEF_VERSION")
 GRAPH_DEF_VERSION_MIN_CONSUMER = (
     pywrap_tensorflow.GRAPH_DEF_VERSION_MIN_CONSUMER)
+tf_export("GRAPH_DEF_VERSION_MIN_CONSUMER").export_constant(
+    __name__, "GRAPH_DEF_VERSION_MIN_CONSUMER")
 GRAPH_DEF_VERSION_MIN_PRODUCER = (
     pywrap_tensorflow.GRAPH_DEF_VERSION_MIN_PRODUCER)
+tf_export("GRAPH_DEF_VERSION_MIN_PRODUCER").export_constant(
+    __name__, "GRAPH_DEF_VERSION_MIN_PRODUCER")
 
 __all__ = [
     "__version__",
     "__git_version__",
     "__compiler_version__",
     "__cxx11_abi_flag__",
+    "__monolithic_build__",
     "COMPILER_VERSION",
     "CXX11_ABI_FLAG",
     "GIT_VERSION",
@@ -49,4 +63,5 @@ __all__ = [
     "GRAPH_DEF_VERSION_MIN_CONSUMER",
     "GRAPH_DEF_VERSION_MIN_PRODUCER",
     "VERSION",
+    "MONOLITHIC_BUILD",
 ]
diff --git a/tensorflow/python/grappler/cluster.i b/tensorflow/python/grappler/cluster.i
index 18fda345e6dc55dd89ea9071f30ab998fda5ee76..8079cb307bb1f5904b71bae891d5ef5f1e749e66 100644
--- a/tensorflow/python/grappler/cluster.i
+++ b/tensorflow/python/grappler/cluster.i
@@ -54,6 +54,23 @@ bool _PyObjAs(PyObject *input, tensorflow::NamedDevice *out) {
   $1 = &temp;
 }
 
+%typemap(in) const tensorflow::NamedDevice& (tensorflow::NamedDevice temp) {
+  char* c_string;
+  Py_ssize_t py_size;
+  if (PyBytes_AsStringAndSize($input, &c_string, &py_size) == -1) {
+    // Python has raised an error (likely TypeError or UnicodeEncodeError).
+    SWIG_fail;
+  }
+
+  if (!temp.ParseFromString(string(c_string, py_size))) {
+    PyErr_SetString(
+        PyExc_TypeError,
+        "The NamedDevice could not be parsed as a valid protocol buffer");
+    SWIG_fail;
+  }
+  $1 = &temp;
+}
+
 %typemap(in) const tensorflow::RunMetadata& (tensorflow::RunMetadata temp) {
   char* c_string;
   Py_ssize_t py_size;
@@ -83,6 +100,7 @@ bool _PyObjAs(PyObject *input, tensorflow::NamedDevice *out) {
 #include <memory>
 #include <vector>
 #include "tensorflow/core/grappler/devices.h"
+#include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/clusters/single_machine.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/costs/graph_memory.h"
@@ -90,6 +108,8 @@ bool _PyObjAs(PyObject *input, tensorflow::NamedDevice *out) {
 #include "tensorflow/core/grappler/costs/measuring_cost_estimator.h"
 #include "tensorflow/core/grappler/costs/utils.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"
+#include "tensorflow/core/framework/kernel_def.pb.h"
+#include "tensorflow/core/framework/memory_types.h"
 
 // Provide the implementation of the GCluster struct here.
 struct GCluster {
@@ -120,6 +140,7 @@ static GCluster TF_NewCluster(bool allow_soft_placement,
           timeout_s, num_cpu_cores, num_gpus);
   cluster_->DisableDetailedStats(disable_detailed_stats);
   cluster_->AllowSoftPlacement(allow_soft_placement);
+  cluster_->SetNumWarmupSteps(10);
   tensorflow::Status status = cluster_->Provision();
   tensorflow::Set_TF_Status_from_Status(out_status, status);
   return GCluster(cluster_);
@@ -134,13 +155,17 @@ static GCluster TF_NewVirtualCluster(
   }
   tensorflow::grappler::Cluster*cluster_ =
       new tensorflow::grappler::VirtualCluster(devices);
+  PyGILState_STATE gstate = PyGILState_Ensure();
   tensorflow::Status status = cluster_->Provision();
+  PyGILState_Release(gstate);
   tensorflow::Set_TF_Status_from_Status(out_status, status);
   return GCluster(cluster_);
 }
 
 static void TF_ShutdownCluster(GCluster cluster) {
+  PyGILState_STATE gstate = PyGILState_Ensure();
   cluster->Shutdown();
+  PyGILState_Release(gstate);
 }
 
 tensorflow::Status _GetOpPerformanceDataAndRunTime(
@@ -181,6 +206,106 @@ static PyObject* TF_ListDevices(GCluster cluster) {
   return result;
 }
 
+static std::vector<string> TF_ListAvailableOps() {
+  tensorflow::OpRegistry* registry = tensorflow::OpRegistry::Global();
+  std::vector<tensorflow::OpDef> ops;
+  registry->GetRegisteredOps(&ops);
+  std::vector<string> op_names;
+  for (const tensorflow::OpDef& op : ops) {
+    op_names.push_back(op.name());
+  }
+  std::sort(op_names.begin(), op_names.end());
+  return op_names;
+}
+
+static PyObject* TF_GetSupportedDevices(GCluster cluster, GItem item) {
+  if (cluster.is_none() || item.is_none()) {
+    Py_RETURN_NONE;
+  }
+  const std::unordered_map<string, tensorflow::DeviceProperties>& devices = cluster->GetDevices();
+  std::unordered_map<string, std::vector<string>> device_types;
+  for (const auto& dev : devices) {
+    device_types[dev.second.type()].push_back(dev.first);
+  }
+
+  std::unordered_map<string, std::set<string>> supported_device_types;
+  std::unordered_map<string, std::set<string>> device_restrictions;
+
+  for (const auto& node : item->graph.node()) {
+    for (const auto& dev : device_types) {
+      const string& type = dev.first;
+      if (cluster->type() != "single_machine") {
+        // The actual kernel may not be linked in this binary.
+        supported_device_types[node.name()].insert(type);
+      } else {
+        // Check the kernel capabilities
+        const tensorflow::DeviceType dev_type(type);
+        tensorflow::Status s = tensorflow::FindKernelDef(dev_type, node, nullptr, nullptr);
+        if (s.ok()) {
+          supported_device_types[node.name()].insert(type);
+
+          // Check which inputs are restricted to reside on the host.
+          // TODO: extends this to support outputs as well
+          tensorflow::MemoryTypeVector inp_mtypes;
+          tensorflow::MemoryTypeVector out_mtypes;
+          s = tensorflow::MemoryTypesForNode(tensorflow::OpRegistry::Global(), dev_type, node,
+                                             &inp_mtypes, &out_mtypes);
+          if (s.ok()) {
+            for (int i = 0; i < inp_mtypes.size(); ++i) {
+              if (inp_mtypes[i] == tensorflow::HOST_MEMORY) {
+                device_restrictions[tensorflow::grappler::NodeName(node.input(i))].insert("CPU");
+                break;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  PyGILState_STATE gstate = PyGILState_Ensure();
+  PyObject* result = PyDict_New();
+
+  for (const auto& supported_dev : supported_device_types) {
+    const string& node = supported_dev.first;
+    std::set<string> feasible;
+    const auto it = device_restrictions.find(node);
+    if (it != device_restrictions.end()) {
+      const std::set<string>& candidates = supported_dev.second;
+      const std::set<string>& valid = it->second;
+      std::set_intersection(candidates.begin(), candidates.end(), valid.begin(), valid.end(),
+                            std::inserter(feasible, feasible.begin()));
+    } else {
+      feasible = supported_dev.second;
+    }
+
+    std::vector<string> device_names;
+    for (const string& type : feasible) {
+      auto it = device_types.find(type);
+      CHECK(it != device_types.end());
+      for (const string& name : it->second) {
+        device_names.push_back(name);
+      }
+    }
+
+    PyObject* dev = PyList_New(device_names.size());
+    for (int i = 0; i < device_names.size(); ++i) {
+      PyList_SetItem(dev, i, PyString_FromString(device_names[i].c_str()));
+    }
+    CHECK_EQ(0, PyDict_SetItem(result, PyString_FromString(node.c_str()), dev));
+  }
+  PyGILState_Release(gstate);
+  return result;
+}
+
+
+static double TF_EstimatePerformance(const tensorflow::NamedDevice& device) {
+  tensorflow::grappler::OpLevelCostEstimator estimator;
+  tensorflow::grappler::OpLevelCostEstimator::DeviceInfo info =
+      estimator.GetDeviceInfo(device.properties());
+  return info.gigaops;
+}
+
 static PyObject* TF_MeasureCosts(
     GItem item,
     GCluster cluster,
@@ -307,10 +432,12 @@ static GCluster TF_NewVirtualCluster(
     TF_Status* out_status);
 static void TF_ShutdownCluster(GCluster cluster);
 static PyObject* TF_ListDevices(GCluster cluster);
+static std::vector<string> TF_ListAvailableOps();
+static PyObject* TF_GetSupportedDevices(GCluster cluster, GItem item);
+static float TF_EstimatePerformance(const tensorflow::NamedDevice& device);
 static PyObject* TF_MeasureCosts(
     GItem item, GCluster cluster,
     bool generate_timeline, TF_Status* out_status);
 static PyObject* TF_DeterminePeakMemoryUsage(
     GItem item, GCluster cluster,
     TF_Status* out_status);
-
diff --git a/tensorflow/python/grappler/cluster.py b/tensorflow/python/grappler/cluster.py
index cf795fddb71cd9f6119f8b27e17464cb44e77d1e..079d07115b31da86600821a098aec08ec60bf436 100644
--- a/tensorflow/python/grappler/cluster.py
+++ b/tensorflow/python/grappler/cluster.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import contextlib
+
 from tensorflow.core.framework import step_stats_pb2
 from tensorflow.core.grappler.costs import op_performance_data_pb2
 from tensorflow.core.protobuf import device_properties_pb2
@@ -56,9 +58,13 @@ class Cluster(object):
         self._tf_cluster = tf_cluster.TF_NewVirtualCluster(
             devices_serialized, status)
 
-  def __del__(self):
+  def Shutdown(self):
     if self._tf_cluster is not None:
       tf_cluster.TF_ShutdownCluster(self._tf_cluster)
+      self._tf_cluster = None
+
+  def __del__(self):
+    self.Shutdown()
 
   @property
   def tf_cluster(self):
@@ -74,6 +80,18 @@ class Cluster(object):
         devices.append(device_properties_pb2.NamedDevice.FromString(raw_dev))
     return devices
 
+  def ListAvailableOps(self):
+    """Returns a list of all the available operations (sorted alphatically)."""
+    return tf_cluster.TF_ListAvailableOps()
+
+  def GetSupportedDevices(self, item):
+    return tf_cluster.TF_GetSupportedDevices(self._tf_cluster, item.tf_item)
+
+  def EstimatePerformance(self, device):
+    """Estimate the performance of the specified device."""
+    serialized = device.SerializeToString()
+    return tf_cluster.TF_EstimatePerformance(serialized)
+
   def MeasureCosts(self, item):
     """Returns the cost of running the specified item.
 
@@ -108,3 +126,14 @@ class Cluster(object):
           item.tf_item, self._tf_cluster, status)
 
     return ret_from_swig
+
+
+@contextlib.contextmanager
+def Provision(allow_soft_placement=True,
+              disable_detailed_stats=True,
+              disable_timeline=True,
+              devices=None):
+  cluster = Cluster(allow_soft_placement, disable_detailed_stats,
+                    disable_timeline, devices)
+  yield cluster
+  cluster.Shutdown()
diff --git a/tensorflow/python/grappler/cluster_test.py b/tensorflow/python/grappler/cluster_test.py
index f1f02963de0fac8f2a04075e9efd97aec429da9d..10d515a36458d4025060cf4900251cd493f40795 100644
--- a/tensorflow/python/grappler/cluster_test.py
+++ b/tensorflow/python/grappler/cluster_test.py
@@ -23,6 +23,8 @@ from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.grappler import cluster
 from tensorflow.python.grappler import item
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 
@@ -43,7 +45,7 @@ class ClusterTest(test.TestCase):
       op_perfs, run_time, step_stats = grappler_cluster.MeasureCosts(
           grappler_item)
       self.assertTrue(run_time > 0)
-      self.assertEqual(len(op_perfs), 10)
+      self.assertEqual(len(op_perfs), 7)
       self.assertTrue(step_stats.dev_stats)
 
   def testNoDetailedStats(self):
@@ -65,7 +67,7 @@ class ClusterTest(test.TestCase):
 
   def testMemoryEstimates(self):
     with ops.Graph().as_default() as g:
-      with ops.device('/job:localhost/replica:0/task:0/cpu:0'):
+      with ops.device('/job:localhost/replica:0/task:0/device:CPU:0'):
         a = random_ops.random_uniform(shape=())
         b = random_ops.random_uniform(shape=())
         c = a + b
@@ -77,7 +79,7 @@ class ClusterTest(test.TestCase):
             disable_detailed_stats=True, disable_timeline=True)
         peak_mem = grappler_cluster.DeterminePeakMemoryUsage(grappler_item)
         self.assertLessEqual(1, len(peak_mem))
-        snapshot = peak_mem['/job:localhost/replica:0/task:0/cpu:0']
+        snapshot = peak_mem['/job:localhost/replica:0/task:0/device:CPU:0']
         peak_usage = snapshot[0]
         self.assertEqual(52, peak_usage)
         live_tensors = snapshot[1]
@@ -106,6 +108,77 @@ class ClusterTest(test.TestCase):
       self.assertGreater(run_time, 0)
       self.assertEqual(len(op_perfs), 15)
 
+      estimated_perf = grappler_cluster.EstimatePerformance(named_device)
+      self.assertEqual(7680.0, estimated_perf)
+
+  def testContext(self):
+    with ops.Graph().as_default() as g:
+      a = random_ops.random_uniform(shape=())
+      b = random_ops.random_uniform(shape=())
+      c = a + b
+      train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+      train_op.append(c)
+      mg = meta_graph.create_meta_graph_def(graph=g)
+      grappler_item = item.Item(mg)
+
+    with cluster.Provision(
+        disable_detailed_stats=False, disable_timeline=False) as gcluster:
+      op_perfs, run_time, step_stats = gcluster.MeasureCosts(grappler_item)
+      self.assertTrue(run_time > 0)
+      self.assertEqual(len(op_perfs), 7)
+      self.assertTrue(step_stats.dev_stats)
+
+  def testAvailableOps(self):
+    with cluster.Provision() as gcluster:
+      op_names = gcluster.ListAvailableOps()
+      self.assertTrue(b'Add' in op_names)
+      self.assertTrue(b'MatMul' in op_names)
+      self.assertEqual(op_names, sorted(op_names))
+
+  def testSupportDevices(self):
+    with ops.Graph().as_default() as g:
+      a = random_ops.random_uniform(shape=(2, 3))
+      b = random_ops.random_uniform(shape=(2, 3))
+      c = a + b
+      dims = math_ops.range(0, array_ops.rank(c), 1)
+      d = math_ops.reduce_sum(a, axis=dims)
+      train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+      train_op.append(d)
+      mg = meta_graph.create_meta_graph_def(graph=g)
+      grappler_item = item.Item(mg)
+
+      device_properties = device_properties_pb2.DeviceProperties(
+          type='GPU', frequency=1000, num_cores=60)
+      named_gpu = device_properties_pb2.NamedDevice(
+          properties=device_properties, name='/GPU:0')
+      device_properties = device_properties_pb2.DeviceProperties(
+          type='CPU', frequency=3000, num_cores=6)
+      named_cpu = device_properties_pb2.NamedDevice(
+          properties=device_properties, name='/CPU:0')
+      virtual_cluster = cluster.Cluster(devices=[named_cpu, named_gpu])
+      supported_dev = virtual_cluster.GetSupportedDevices(grappler_item)
+      self.assertEqual(supported_dev['add'], ['/CPU:0', '/GPU:0'])
+      self.assertEqual(supported_dev['Sum'], ['/CPU:0', '/GPU:0'])
+      self.assertEqual(supported_dev['range'], ['/CPU:0', '/GPU:0'])
+
+      real_cluster = cluster.Cluster()
+      supported_dev = real_cluster.GetSupportedDevices(grappler_item)
+      if test.is_gpu_available():
+        self.assertEqual(supported_dev['add'], [
+            '/job:localhost/replica:0/task:0/device:CPU:0',
+            '/job:localhost/replica:0/task:0/device:GPU:0'
+        ])
+        self.assertEqual(supported_dev['Sum'], [
+            '/job:localhost/replica:0/task:0/device:CPU:0',
+            '/job:localhost/replica:0/task:0/device:GPU:0'
+        ])
+        # The axis tensor must reside on the host
+        self.assertEqual(supported_dev['range'],
+                         ['/job:localhost/replica:0/task:0/device:CPU:0'])
+      else:
+        self.assertEqual(supported_dev['add'],
+                         ['/job:localhost/replica:0/task:0/device:CPU:0'])
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/grappler/cost_analyzer_test.py b/tensorflow/python/grappler/cost_analyzer_test.py
index f4933a45149ac1d003e24e1ebc6b20d0ad708b6c..511908c79ce47d6849bf97d11bc42f2f1bb13f18 100644
--- a/tensorflow/python/grappler/cost_analyzer_test.py
+++ b/tensorflow/python/grappler/cost_analyzer_test.py
@@ -89,13 +89,10 @@ class CostAnalysisTest(test.TestCase):
     self.assertTrue(b"MatMul" in report)
     self.assertTrue(b"ApplyAdam" in report)
     self.assertTrue(b"Conv2D" in report)
-    self.assertTrue(b"Conv2DBackpropInput" in report)
     self.assertTrue(b"Conv2DBackpropFilter" in report)
     self.assertTrue(b"Softmax" in report)
 
-    for op_type in [
-        b"MatMul", b"Conv2D", b"Conv2DBackpropInput", b"Conv2DBackpropFilter"
-    ]:
+    for op_type in [b"MatMul", b"Conv2D", b"Conv2DBackpropFilter"]:
       matcher = re.compile(
           br"\s+" + op_type + br",\s*(\d+),\s*(\d+),\s*([\d\.eE+-]+)%,\s*" +
           br"([\d\.eE+-]+)%,\s*(-?\d+),\s*(\d+),", re.MULTILINE)
@@ -130,7 +127,8 @@ class CostAnalysisTest(test.TestCase):
 
     # Check the report
     self.assertTrue(
-        "Peak usage for device /job:localhost/replica:0/task:0/cpu:0: 16 bytes"
+        "Peak usage for device /job:localhost/replica:0/task:0/device:CPU:0: "
+        "16 bytes"
         in report)
     self.assertTrue("  a:0 uses 4 bytes" in report)
     self.assertTrue("  b:0 uses 4 bytes" in report)
diff --git a/tensorflow/python/grappler/cost_analyzer_tool.py b/tensorflow/python/grappler/cost_analyzer_tool.py
index 146bb4311cb5a44d5739821db19f33a41e6e9ce2..51b77b471b09d59f1a63b5cc3b736a8f2462351d 100644
--- a/tensorflow/python/grappler/cost_analyzer_tool.py
+++ b/tensorflow/python/grappler/cost_analyzer_tool.py
@@ -22,34 +22,72 @@ import argparse
 import sys
 
 from google.protobuf import text_format
-
+from tensorflow.contrib.fused_conv.ops import gen_fused_conv2d_bias_activation_op  # pylint: disable=unused-import
+from tensorflow.core.framework import graph_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
 from tensorflow.python.grappler import cost_analyzer
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.platform import app
 from tensorflow.python.platform import gfile
+from tensorflow.python.training import saver
 
 
 def main(_):
-  with gfile.GFile(FLAGS.input) as input_file:
-    metagraph = meta_graph_pb2.MetaGraphDef()
-    metagraph.ParseFromString(input_file.read())
+  if FLAGS.metagraphdef:
+    with gfile.GFile(FLAGS.metagraphdef) as meta_file:
+      metagraph = meta_graph_pb2.MetaGraphDef()
+      metagraph.ParseFromString(meta_file.read())
+  else:
+    with gfile.GFile(FLAGS.graphdef) as graph_file:
+      graph_def = graph_pb2.GraphDef()
+      if FLAGS.graphdef.endswith(".pbtxt"):
+        text_format.Merge(graph_file.read(), graph_def)
+      else:
+        graph_def.ParseFromString(graph_file.read())
+      importer.import_graph_def(graph_def, name="")
+      graph = ops.get_default_graph()
+      fetch = graph.get_operation_by_name(FLAGS.fetch)
+      graph.add_to_collection("train_op", fetch)
+      metagraph = saver.export_meta_graph(
+          graph_def=graph.as_graph_def(), graph=graph)
 
+  rewriter_config = rewriter_config_pb2.RewriterConfig()
   if FLAGS.rewriter_config is not None:
-    rewriter_config = rewriter_config_pb2.RewriterConfig()
     text_format.Merge(FLAGS.rewriter_config, rewriter_config)
-    optimized_graph = tf_optimizer.OptimizeGraph(rewriter_config, metagraph)
-    metagraph.graph_def.CopyFrom(optimized_graph)
+  optimized_graph = tf_optimizer.OptimizeGraph(rewriter_config, metagraph)
+  metagraph.graph_def.CopyFrom(optimized_graph)
 
   report = cost_analyzer.GenerateCostReport(metagraph, FLAGS.per_node_report)
   print(report)
+  report = cost_analyzer.GenerateMemoryReport(metagraph)
+  print(report)
 
 
 if __name__ == "__main__":
   parser = argparse.ArgumentParser()
   parser.add_argument(
-      "--input", type=str, default=None, help="Input .meta file path.")
+      "--metagraphdef",
+      type=str,
+      default=None,
+      help="Input .meta MetaGraphDef file path.")
+  parser.add_argument(
+      "--graphdef",
+      type=str,
+      default=None,
+      help="Input .pb GraphDef file path.")
+  # Consider making flag fetch work together with flag metagraphdef. As some
+  # MetaGraphDef files don't have collection train_op.
+  parser.add_argument(
+      "--fetch",
+      type=str,
+      default=None,
+      help=
+      "The name of the fetch node. This flag is ignored if flag "
+      "metagraphdef is used."
+  )
   parser.add_argument(
       "--rewriter_config",
       type=str,
diff --git a/tensorflow/python/grappler/datasets_test.py b/tensorflow/python/grappler/datasets_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d942af597c180576ebe65e26ad39923754092f3
--- /dev/null
+++ b/tensorflow/python/grappler/datasets_test.py
@@ -0,0 +1,447 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the datasets shape inference."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import meta_graph
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.grappler import item
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class GrapplerTest(test.TestCase):
+
+  def testFromTensors(self):
+    test_cases = [{
+        'tensor': 0,
+        'shape': tensor_shape.TensorShape([])
+    }, {
+        'tensor': np.array([1, 2, 3]),
+        'shape': tensor_shape.TensorShape([3])
+    }, {
+        'tensor': np.array([[1, 2, 3]]),
+        'shape': tensor_shape.TensorShape([1, 3])
+    }]
+
+    for test_case in test_cases:
+      with ops.Graph().as_default() as g:
+        dataset = dataset_ops.Dataset.from_tensors(test_case['tensor'])
+        iterator = dataset.make_one_shot_iterator()
+        get_next = iterator.get_next()
+        train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+        train_op.append(get_next)
+        mg = meta_graph.create_meta_graph_def(graph=g)
+        grappler_item = item.Item(mg)
+        op_properties = grappler_item.GetOpProperties()
+        self.assertEqual(test_case['shape'],
+                         op_properties['IteratorGetNext'][0].shape)
+
+  def testFromTensorSlices(self):
+    test_cases = [{
+        'tensor': np.array([1, 2, 3]),
+        'shape': tensor_shape.TensorShape([])
+    }, {
+        'tensor': np.array([[1, 2, 3]]),
+        'shape': tensor_shape.TensorShape([3])
+    }, {
+        'tensor': np.array([[[1, 2, 3]]]),
+        'shape': tensor_shape.TensorShape([1, 3])
+    }]
+
+    for test_case in test_cases:
+      with ops.Graph().as_default() as g:
+        dataset = dataset_ops.Dataset.from_tensor_slices(test_case['tensor'])
+        iterator = dataset.make_one_shot_iterator()
+        get_next = iterator.get_next()
+        train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+        train_op.append(get_next)
+        mg = meta_graph.create_meta_graph_def(graph=g)
+        grappler_item = item.Item(mg)
+        op_properties = grappler_item.GetOpProperties()
+        self.assertEqual(test_case['shape'],
+                         op_properties['IteratorGetNext'][0].shape)
+
+  def testFromGenerator(self):
+    test_cases = [{
+        'tensor': 0,
+        'shape': tensor_shape.TensorShape([])
+    }, {
+        'tensor': np.array([1, 2, 3]),
+        'shape': tensor_shape.TensorShape([3])
+    }, {
+        'tensor': np.array([[1, 2, 3]]),
+        'shape': tensor_shape.TensorShape([1, 3])
+    }]
+
+    for test_case in test_cases:
+
+      def make_generator(tensor):
+
+        def generator():
+          yield tensor
+
+        return generator
+
+      with ops.Graph().as_default() as g:
+        dataset = dataset_ops.Dataset.from_generator(
+            make_generator(test_case['tensor']),
+            dtypes.int64,
+            output_shapes=test_case['shape'])
+        iterator = dataset.make_one_shot_iterator()
+        get_next = iterator.get_next()
+        train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+        train_op.append(get_next)
+        mg = meta_graph.create_meta_graph_def(graph=g)
+        grappler_item = item.Item(mg)
+        op_properties = grappler_item.GetOpProperties()
+        self.assertEqual(test_case['shape'],
+                         op_properties['IteratorGetNext'][0].shape)
+
+  def testRange(self):
+    with ops.Graph().as_default() as g:
+      dataset = dataset_ops.Dataset.range(42)
+      iterator = dataset.make_one_shot_iterator()
+      get_next = iterator.get_next()
+      train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+      train_op.append(get_next)
+      mg = meta_graph.create_meta_graph_def(graph=g)
+      grappler_item = item.Item(mg)
+      op_properties = grappler_item.GetOpProperties()
+      self.assertEqual(tensor_shape.scalar(),
+                       op_properties['IteratorGetNext'][0].shape)
+
+  def _testTransformation(self, fn):
+    test_cases = [{
+        'tensor': 0,
+        'shape': tensor_shape.TensorShape({})
+    }, {
+        'tensor': np.array([1, 2, 3]),
+        'shape': tensor_shape.TensorShape([3])
+    }, {
+        'tensor': np.array([[1, 2, 3]]),
+        'shape': tensor_shape.TensorShape([1, 3])
+    }]
+
+    for test_case in test_cases:
+      with ops.Graph().as_default() as g:
+        dataset = dataset_ops.Dataset.from_tensors(test_case['tensor'])
+        dataset = fn(dataset, test_case['tensor'], test_case['shape'])
+        iterator = dataset.make_one_shot_iterator()
+        get_next = iterator.get_next()
+        train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+        train_op.append(get_next)
+        mg = meta_graph.create_meta_graph_def(graph=g)
+        grappler_item = item.Item(mg)
+        op_properties = grappler_item.GetOpProperties()
+        self.assertEqual(test_case['shape'],
+                         op_properties['IteratorGetNext'][0].shape)
+
+  def testConcatenate(self):
+
+    def fn(dataset, tensor, shape):
+      del shape
+      return dataset.concatenate(dataset_ops.Dataset.from_tensors(tensor))
+
+    self._testTransformation(fn)
+
+  def testPrefetch(self):
+
+    def fn(dataset, tensor, shape):
+      del tensor, shape
+      return dataset.prefetch(42)
+
+    self._testTransformation(fn)
+
+  def testRepeat(self):
+
+    def fn(dataset, tensor, shape):
+      del tensor, shape
+      return dataset.repeat(42)
+
+    self._testTransformation(fn)
+
+  def testShuffle(self):
+
+    def fn(dataset, tensor, shape):
+      del tensor, shape
+      return dataset.shuffle(42)
+
+    self._testTransformation(fn)
+
+  def testCache(self):
+
+    def fn(dataset, tensor, shape):
+      del tensor, shape
+      return dataset.cache()
+
+    self._testTransformation(fn)
+
+  def testTake(self):
+
+    def fn(dataset, tensor, shape):
+      del tensor, shape
+      return dataset.take(42)
+
+    self._testTransformation(fn)
+
+  def testSkip(self):
+
+    def fn(dataset, tensor, shape):
+      del tensor, shape
+      return dataset.skip(42)
+
+    self._testTransformation(fn)
+
+  def testShard(self):
+
+    def fn(dataset, tensor, shape):
+      del tensor, shape
+      return dataset.shard(42, 0)
+
+    self._testTransformation(fn)
+
+  def testFilter(self):
+
+    def fn(dataset, tensor, shape):
+      del tensor, shape
+      return dataset.filter(lambda x: True)
+
+    self._testTransformation(fn)
+
+  def as_tensor_shape(self, proto_with_symbolic_values):
+    for i in range(len(proto_with_symbolic_values.dim)):
+      if proto_with_symbolic_values.dim[i].size < -1:
+        proto_with_symbolic_values.dim[i].size = -1
+    return tensor_shape.TensorShape(proto_with_symbolic_values)
+
+  def testBatch(self):
+    test_cases = [{
+        'tensor': 0,
+        'shape': tensor_shape.TensorShape([None])
+    }, {
+        'tensor': np.array([1, 2, 3]),
+        'shape': tensor_shape.TensorShape([None, 3])
+    }, {
+        'tensor': np.array([[1, 2, 3]]),
+        'shape': tensor_shape.TensorShape([None, 1, 3])
+    }]
+
+    for test_case in test_cases:
+      with ops.Graph().as_default() as g:
+        dataset = dataset_ops.Dataset.from_tensors(test_case['tensor'])
+        dataset = dataset.batch(42)
+        iterator = dataset.make_one_shot_iterator()
+        get_next = iterator.get_next()
+        train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+        train_op.append(get_next)
+        mg = meta_graph.create_meta_graph_def(graph=g)
+        grappler_item = item.Item(mg)
+        op_properties = grappler_item.GetOpProperties()
+        inferred_shape = self.as_tensor_shape(
+            op_properties['IteratorGetNext'][0].shape)
+        self.assertTrue(test_case['shape'][0].is_compatible_with(
+            inferred_shape[0]))
+        self.assertEqual(test_case['shape'][1:], inferred_shape[1:])
+
+  def testPaddedBatch(self):
+    test_cases = [{
+        'tensor': 0,
+        'shape': tensor_shape.TensorShape([None])
+    }, {
+        'tensor': np.array([1, 2, 3]),
+        'shape': tensor_shape.TensorShape([None, 4])
+    }, {
+        'tensor': np.array([[1, 2, 3]]),
+        'shape': tensor_shape.TensorShape([None, 2, 4])
+    }]
+
+    for test_case in test_cases:
+      with ops.Graph().as_default() as g:
+        dataset = dataset_ops.Dataset.from_tensors(test_case['tensor'])
+        dataset = dataset.padded_batch(42, padded_shapes=test_case['shape'][1:])
+        iterator = dataset.make_one_shot_iterator()
+        get_next = iterator.get_next()
+        train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+        train_op.append(get_next)
+        mg = meta_graph.create_meta_graph_def(graph=g)
+        grappler_item = item.Item(mg)
+        op_properties = grappler_item.GetOpProperties()
+        inferred_shape = self.as_tensor_shape(
+            op_properties['IteratorGetNext'][0].shape)
+        self.assertTrue(test_case['shape'][0].is_compatible_with(
+            inferred_shape[0]))
+        self.assertEqual(test_case['shape'][1:], inferred_shape[1:])
+
+  def testFlatMap(self):
+    test_cases = [{
+        'tensor': 0,
+        'shape': tensor_shape.TensorShape([])
+    }, {
+        'tensor': np.array([1, 2, 3]),
+        'shape': tensor_shape.TensorShape([3])
+    }, {
+        'tensor': np.array([[1, 2, 3]]),
+        'shape': tensor_shape.TensorShape([1, 3])
+    }]
+
+    for test_case in test_cases:
+      with ops.Graph().as_default() as g:
+        dataset = dataset_ops.Dataset.range(42)
+
+        def make_dataset(tensor):
+
+          def dataset_fn(n):
+            return dataset_ops.Dataset.from_tensors(tensor).repeat(n)
+
+          return dataset_fn
+
+        dataset = dataset.flat_map(make_dataset(test_case['tensor']))
+        iterator = dataset.make_one_shot_iterator()
+        get_next = iterator.get_next()
+        train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+        train_op.append(get_next)
+        mg = meta_graph.create_meta_graph_def(graph=g)
+        grappler_item = item.Item(mg)
+        op_properties = grappler_item.GetOpProperties()
+        self.assertEqual(test_case['shape'],
+                         op_properties['IteratorGetNext'][0].shape)
+
+  def testInterleave(self):
+    test_cases = [{
+        'tensor': 0,
+        'shape': tensor_shape.TensorShape([])
+    }, {
+        'tensor': np.array([1, 2, 3]),
+        'shape': tensor_shape.TensorShape([3])
+    }, {
+        'tensor': np.array([[1, 2, 3]]),
+        'shape': tensor_shape.TensorShape([1, 3])
+    }]
+
+    for test_case in test_cases:
+      with ops.Graph().as_default() as g:
+        dataset = dataset_ops.Dataset.range(42)
+
+        def make_dataset(tensor):
+
+          def dataset_fn(n):
+            return dataset_ops.Dataset.from_tensors(tensor).repeat(n)
+
+          return dataset_fn
+
+        dataset = dataset.interleave(
+            make_dataset(test_case['tensor']), cycle_length=42)
+        iterator = dataset.make_one_shot_iterator()
+        get_next = iterator.get_next()
+        train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+        train_op.append(get_next)
+        mg = meta_graph.create_meta_graph_def(graph=g)
+        grappler_item = item.Item(mg)
+        op_properties = grappler_item.GetOpProperties()
+        self.assertEqual(test_case['shape'],
+                         op_properties['IteratorGetNext'][0].shape)
+
+  def testMap(self):
+    test_cases = [{
+        'tensor': 0,
+        'shape': tensor_shape.TensorShape([])
+    }, {
+        'tensor': np.array([1, 2, 3]),
+        'shape': tensor_shape.TensorShape([3])
+    }, {
+        'tensor': np.array([[1, 2, 3]]),
+        'shape': tensor_shape.TensorShape([3, 1])
+    }, {
+        'tensor': np.array([[[1, 2, 3], [4, 5, 6]]]),
+        'shape': tensor_shape.TensorShape([3, 2, 1])
+    }]
+
+    for test_case in test_cases:
+      with ops.Graph().as_default() as g:
+        dataset = dataset_ops.Dataset.from_tensors(test_case['tensor'])
+        dataset = dataset.map(array_ops.transpose)
+        iterator = dataset.make_one_shot_iterator()
+        get_next = iterator.get_next()
+        train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+        train_op.append(get_next)
+        mg = meta_graph.create_meta_graph_def(graph=g)
+        grappler_item = item.Item(mg)
+        op_properties = grappler_item.GetOpProperties()
+        self.assertEqual(test_case['shape'],
+                         op_properties['IteratorGetNext'][0].shape)
+
+  def testFromStructure(self):
+    test_cases = [{
+        'shape': tensor_shape.TensorShape([])
+    }, {
+        'shape': tensor_shape.TensorShape([3])
+    }, {
+        'shape': tensor_shape.TensorShape([1, 2])
+    }, {
+        'shape': tensor_shape.TensorShape([1, 2, 3])
+    }]
+
+    for test_case in test_cases:
+      with ops.Graph().as_default() as g:
+        iterator = iterator_ops.Iterator.from_structure(
+            dtypes.int64, output_shapes=test_case['shape'])
+        get_next = iterator.get_next()
+        train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+        train_op.append(get_next)
+        mg = meta_graph.create_meta_graph_def(graph=g)
+        grappler_item = item.Item(mg)
+        op_properties = grappler_item.GetOpProperties()
+        self.assertEqual(test_case['shape'],
+                         op_properties['IteratorGetNext'][0].shape)
+
+  def testFromStringHandle(self):
+    test_cases = [{
+        'shape': tensor_shape.TensorShape([])
+    }, {
+        'shape': tensor_shape.TensorShape([3])
+    }, {
+        'shape': tensor_shape.TensorShape([1, 2])
+    }, {
+        'shape': tensor_shape.TensorShape([1, 2, 3])
+    }]
+
+    for test_case in test_cases:
+      with ops.Graph().as_default() as g:
+        iterator = iterator_ops.Iterator.from_structure(dtypes.int64)
+        handle = iterator.string_handle()
+        iterator = iterator_ops.Iterator.from_string_handle(
+            handle, dtypes.int64, output_shapes=test_case['shape'])
+        get_next = iterator.get_next()
+        train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+        train_op.append(get_next)
+        mg = meta_graph.create_meta_graph_def(graph=g)
+        grappler_item = item.Item(mg)
+        op_properties = grappler_item.GetOpProperties()
+        self.assertEqual(test_case['shape'],
+                         op_properties['IteratorGetNext'][0].shape)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/grappler/item.i b/tensorflow/python/grappler/item.i
index 7dd79f7c82c35a1bb84881a350d31eb00e9c40f7..d0fc1a04f220e0a053257e0206bb07b25f3767c6 100644
--- a/tensorflow/python/grappler/item.i
+++ b/tensorflow/python/grappler/item.i
@@ -42,9 +42,12 @@ struct GItem {
 #include <unordered_set>
 #include <map>
 #include "tensorflow/c/tf_status_helper.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
 #include "tensorflow/core/grappler/grappler_item_builder.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
@@ -80,6 +83,7 @@ static GItem TF_NewItem(
   tensorflow::grappler::ItemConfig cfg;
   cfg.ignore_user_placement = ignore_user_placement;
   cfg.ignore_colocation = ignore_colocation;
+  cfg.inline_functions = true;
   std::unique_ptr<tensorflow::grappler::GrapplerItem> item =
       tensorflow::grappler::GrapplerItemFromMetaGraphDef("item", meta_graph, cfg);
   if (!item) {
@@ -92,7 +96,8 @@ static GItem TF_NewItem(
   return GItem(item.release());
 }
 
-static std::vector<string> TF_IdentifyImportantOps(GItem item) {
+static std::vector<string> TF_IdentifyImportantOps(GItem item, bool sort_topologically,
+                                                   TF_Status* status) {
   if (item.is_none()) {
     return {};
   }
@@ -108,8 +113,23 @@ static std::vector<string> TF_IdentifyImportantOps(GItem item) {
   }
 
   std::vector<string> ops;
-  for (const auto& op_name : op_names) {
-    ops.push_back(op_name);
+  if (sort_topologically) {
+    tensorflow::GraphDef subgraph;
+    for (const tensorflow::NodeDef& node : item->graph.node()) {
+      if (op_names.find(node.name()) != op_names.end()) {
+        *subgraph.add_node() = node;
+      }
+    }
+    tensorflow::Status s = tensorflow::grappler::TopologicalSort(&subgraph);
+    tensorflow::Set_TF_Status_from_Status(status, s);
+    for (const tensorflow::NodeDef& node : subgraph.node()) {
+      ops.push_back(node.name());
+    }
+  }
+  else {
+    for (const auto& op_name : op_names) {
+      ops.push_back(op_name);
+    }
   }
 
   return ops;
@@ -120,7 +140,7 @@ static PyObject* TF_GetOpProperties(GItem item) {
     Py_RETURN_NONE;
   }
   tensorflow::grappler::GraphProperties properties(*item);
-  tensorflow::Status status = properties.InferStatically();
+  tensorflow::Status status = properties.InferStatically(false);
   if (!status.ok()) {
     Py_RETURN_NONE;
   }
@@ -145,6 +165,139 @@ static PyObject* TF_GetOpProperties(GItem item) {
   return props;
 }
 
+class ColocationGroups {
+public:
+  void Group(const string& x, const string& y) {
+    Rep* x_root = Find(x);
+    Rep* y_root = Find(y);
+
+    // x and y are already in the same set
+    if (x_root == y_root) {
+      return;
+    }
+    // x and y are not in same set, so we merge them
+    // Use the occasion to strengthen what we know about the handle by merging the
+    // information about the 2 subsets.
+    if (x_root->rank < y_root->rank) {
+      x_root->parent = y_root;
+    } else if (x_root->rank > y_root->rank) {
+      y_root->parent = x_root;
+    } else {
+      // Arbitrarily make one root the new parent
+      y_root->parent = x_root;
+      x_root->rank = x_root->rank + 1;
+    }
+  }
+
+  void ExtractGroups(std::vector<std::vector<string>>* groups) {
+    groups->reserve(nodes_.size());
+    std::unordered_map<const Rep*, int> group_ids;
+    for (const auto& rep : nodes_) {
+      Rep* r = Find(rep.first);
+      auto it = group_ids.find(r);
+      std::vector<string>* g;
+      if (it == group_ids.end()) {
+        int id = group_ids.size();
+        group_ids[r] = id;
+        groups->resize(id+1);
+        g = &groups->back();
+      } else {
+        int id = it->second;
+        g = &((*groups)[id]);
+      }
+      g->push_back(rep.first);
+    }
+  }
+
+private:
+  struct Rep {
+    // Parent in the tree used to encode the set.
+    Rep* parent;
+    // Rank in the tree, used to figure out how to compress the path to the root
+    // of the tree.
+    int rank;
+    // The node.
+    string value;
+  };
+
+  Rep* Find(const string& n) {
+    auto it = nodes_.find(n);
+    if (it == nodes_.end()) {
+      // This is the first time we process this handle, create an entry for it.
+      Rep* node = new Rep;
+      node->parent = node;
+      node->rank = 0;
+      node->value = n;
+      nodes_[n] = node;
+      return node;
+    }
+    // Return the representative for the set, which is the root of the tree. Apply
+    // path compression to speedup future queries.
+    Rep* node = it->second;
+    Rep* root = node->parent;
+    while (root != root->parent) {
+      root = root->parent;
+    }
+    while (node->parent != root) {
+      Rep* next = node->parent;
+      node->parent = root;
+      node = next;
+    }
+    return root;
+  }
+
+  std::unordered_map<string, Rep*> nodes_;
+};
+
+static PyObject* TF_GetColocationGroups(GItem item) {
+  if (item.is_none()) {
+    Py_RETURN_NONE;
+  }
+  ColocationGroups groupings;
+  tensorflow::OpRegistry* registry = tensorflow::OpRegistry::Global();
+  for (const auto& node : item->graph.node()) {
+    const tensorflow::OpDef* op_def;
+    tensorflow::Status s = registry->LookUpOpDef(node.op(), &op_def);
+    if (!s.ok()) {
+      continue;
+    }
+    tensorflow::NameRangeMap inputs;
+    tensorflow::NameRangeMap outputs;
+    s = tensorflow::NameRangesForNode(node, *op_def, &inputs, &outputs);
+    if (!s.ok()) {
+      continue;
+    }
+    int i = 0;
+    for (const auto& arg : op_def->input_arg()) {
+      if (!arg.is_ref()) {
+        continue;
+      }
+      const auto& range = inputs[arg.name()];
+      for (int i = range.first; i < range.second; ++i) {
+        string input = tensorflow::grappler::NodeName(node.input(i));
+        groupings.Group(node.name(), input);
+      }
+    }
+  }
+
+  std::vector<std::vector<string>> groups;
+  groupings.ExtractGroups(&groups);
+
+  PyGILState_STATE gstate = PyGILState_Ensure();
+  PyObject* result = PyList_New(groups.size());
+  for (int i = 0; i < groups.size(); ++i) {
+    const std::vector<string>& group = groups[i];
+    PyObject* g = PyTuple_New(group.size());
+    for (int j = 0; j < group.size(); ++j) {
+      const string& node_name = group[j];
+      PyTuple_SetItem(g, j, PyString_FromString(node_name.c_str()));
+    }
+    PyList_SetItem(result, i, g);
+  }
+  PyGILState_Release(gstate);
+  return result;
+}
+
 %}
 
 
@@ -152,5 +305,7 @@ static PyObject* TF_GetOpProperties(GItem item) {
 static GItem TF_NewItem(
     const tensorflow::MetaGraphDef& meta_graph, bool ignore_colocation,
     bool ignore_user_placement, TF_Status* out_status);
-static std::vector<string> TF_IdentifyImportantOps(GItem item);
+static std::vector<string> TF_IdentifyImportantOps(GItem item, bool sort_topologically,
+                                                   TF_Status* status);
 static PyObject* TF_GetOpProperties(GItem item);
+static PyObject* TF_GetColocationGroups(GItem item);
diff --git a/tensorflow/python/grappler/item.py b/tensorflow/python/grappler/item.py
index 4fc94ec9680464aea17875189ac4a749f3fa11dc..4a083849bd39f606877069419396d8c42ef077eb 100644
--- a/tensorflow/python/grappler/item.py
+++ b/tensorflow/python/grappler/item.py
@@ -50,8 +50,10 @@ class Item(object):
     self._tf_item = None
     self._BuildTFItem()
 
-  def IdentifyImportantOps(self):
-    return tf_item.TF_IdentifyImportantOps(self.tf_item)
+  def IdentifyImportantOps(self, sort_topologically=False):
+    with errors.raise_exception_on_not_ok_status() as status:
+      return tf_item.TF_IdentifyImportantOps(self.tf_item, sort_topologically,
+                                             status)
 
   def GetOpProperties(self):
     ret_from_swig = tf_item.TF_GetOpProperties(self.tf_item)
@@ -64,6 +66,17 @@ class Item(object):
       properties[key] = prop
     return properties
 
+  def GetColocationGroups(self):
+    """Return a list of hard colocation constraints.
+
+    All the nodes in a colocation tuple must be placed on the same device for
+    the model to work.
+
+    Returns:
+      A list of colocation tuples.
+    """
+    return tf_item.TF_GetColocationGroups(self.tf_item)
+
   @property
   def metagraph(self):
     return self._metagraph
diff --git a/tensorflow/python/grappler/item_test.py b/tensorflow/python/grappler/item_test.py
index 71c68d25cd928d5cb2dc5028ed331d468c5b9826..cd70e2fdecc74f9d99240ac566f3c28e900a06c2 100644
--- a/tensorflow/python/grappler/item_test.py
+++ b/tensorflow/python/grappler/item_test.py
@@ -26,6 +26,9 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.grappler import item
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -53,7 +56,7 @@ class ItemTest(test.TestCase):
       mg = meta_graph.create_meta_graph_def(graph=g)
       grappler_item = item.Item(mg)
       op_list = grappler_item.IdentifyImportantOps()
-      self.assertEqual([b'Const', b'Const_1', b'add'], op_list)
+      self.assertItemsEqual([b'Const', b'Const_1', b'add'], op_list)
 
   def testOpProperties(self):
     with ops.Graph().as_default() as g:
@@ -104,6 +107,21 @@ class ItemTest(test.TestCase):
     newest_tf_item = grappler_item.tf_item
     self.assertEqual(new_tf_item, newest_tf_item)
 
+  def testColocationContraints(self):
+    with ops.Graph().as_default() as g:
+      c = constant_op.constant([10])
+      v = variables.Variable([3], dtype=dtypes.int32)
+      i = gen_array_ops._ref_identity(v)
+      a = state_ops.assign(i, c)
+      train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+      train_op.append(a)
+      mg = meta_graph.create_meta_graph_def(graph=g)
+      grappler_item = item.Item(mg)
+      groups = grappler_item.GetColocationGroups()
+      self.assertEqual(len(groups), 1)
+      self.assertItemsEqual(
+          groups[0], ['Assign', 'RefIdentity', 'Variable', 'Variable/Assign'])
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py
index 626e0502cbb87bb5cb16c4be3098102b5c29af19..b04bbb0daa64ebcec38972248ff9311faad090fe 100644
--- a/tensorflow/python/grappler/layout_optimizer_test.py
+++ b/tensorflow/python/grappler/layout_optimizer_test.py
@@ -34,9 +34,13 @@ from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.layers import convolutional as conv_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import gradient_descent
@@ -78,6 +82,36 @@ def _two_layer_model(x):
   return h_pool2
 
 
+def _model_with_second_port():
+  random_seed.set_random_seed(0)
+  x = random_ops.truncated_normal([2, 5, 5, 4], seed=0)
+  scale = constant_op.constant(0.1, shape=[4])
+  offset = constant_op.constant(0.3, shape=[4])
+  y, mean, _ = nn.fused_batch_norm(x, scale, offset)
+  mul = math_ops.add(y, mean)
+  output = array_ops.identity(mul)
+  return output
+
+
+def _model_with_branch(x):
+  x_image = array_ops.reshape(x, [-1, 28, 28, 1])
+  w_conv1 = _weight([5, 5, 1, 32])
+  w_conv2 = _weight([5, 5, 1, 32])
+  c_conv1 = _conv2d(x_image, w_conv1)
+  c_conv2 = _conv2d(x_image, w_conv2)
+  add = math_ops.add(c_conv1, c_conv2)
+  return add
+
+
+def _model_with_vec_and_4d(x):
+  x_image = array_ops.reshape(x, [-1, 28, 28, 1])
+  w_conv1 = _weight([5, 5, 1, 32])
+  c_conv1 = _conv2d(x_image, w_conv1)
+  vector = constant_op.constant(6.4, shape=[32])
+  add = math_ops.add(c_conv1, vector)
+  return add
+
+
 def _loop():
   random_seed.set_random_seed(0)
   x1 = random_ops.truncated_normal([1, 784], seed=0)
@@ -89,6 +123,30 @@ def _loop():
   return outputs
 
 
+def _loop_with_branch():
+  random_seed.set_random_seed(0)
+  x1 = random_ops.truncated_normal([1, 784], seed=0)
+  x2 = random_ops.truncated_normal([1, 784], seed=0)
+  x3 = random_ops.truncated_normal([1, 784], seed=0)
+  x4 = random_ops.truncated_normal([1, 784], seed=0)
+  elems = (x1, x2, x3, x4)
+  outputs = functional_ops.map_fn(
+      _model_with_branch, elems, dtype=dtypes.float32)
+  return outputs
+
+
+def _loop_with_vec_and_4d():
+  random_seed.set_random_seed(0)
+  x1 = random_ops.truncated_normal([1, 784], seed=0)
+  x2 = random_ops.truncated_normal([1, 784], seed=0)
+  x3 = random_ops.truncated_normal([1, 784], seed=0)
+  x4 = random_ops.truncated_normal([1, 784], seed=0)
+  elems = (x1, x2, x3, x4)
+  outputs = functional_ops.map_fn(
+      _model_with_vec_and_4d, elems, dtype=dtypes.float32)
+  return outputs
+
+
 def _get_config(layout_optimizer=True):
   if layout_optimizer:
     rewrite_options = rewriter_config_pb2.RewriterConfig(
@@ -99,6 +157,7 @@ def _get_config(layout_optimizer=True):
   graph_options = config_pb2.GraphOptions(
       rewrite_options=rewrite_options, build_cost_model=1)
   config = config_pb2.ConfigProto(graph_options=graph_options)
+  config.graph_options.optimizer_options.opt_level = -1
   return config
 
 
@@ -121,14 +180,41 @@ def _get_cluster():
   named_device = device_properties_pb2.NamedDevice()
   named_device.name = '/GPU:0'
   named_device.properties.type = 'GPU'
+  named_device.properties.num_cores = 24
+  named_device.properties.frequency = 1000
   named_device.properties.environment['architecture'] = '4'
   cluster = gcluster.Cluster(devices=[named_device])
   return cluster
 
 
+def _is_transpose(node):
+  return node.endswith('TransposeNHWCToNCHW-LayoutOptimizer') or node.endswith(
+      'TransposeNCHWToNHWC-LayoutOptimizer')
+
+
+def _is_permute(node):
+  return node.endswith('VecPermuteNHWCToNCHW-LayoutOptimizer') or node.endswith(
+      'VecPermuteNCHWToNHWC-LayoutOptimizer')
+
+
 class LayoutOptimizerTest(test.TestCase):
   """Tests the Grappler layout optimizer."""
 
+  def _assert_trans_nchw_to_nhwc(self, name, nodes):
+    self.assertIn(name + '-TransposeNCHWToNHWC-LayoutOptimizer', nodes)
+
+  def _assert_trans_nhwc_to_nchw(self, name, nodes):
+    self.assertIn(name + '-TransposeNHWCToNCHW-LayoutOptimizer', nodes)
+
+  def _assert_map_nhwc_to_nchw(self, name, nodes):
+    self.assertIn(name + '-DimMapNHWCToNCHW-LayoutOptimizer', nodes)
+
+  def _assert_vec_nchw_to_nhwc(self, name, nodes):
+    self.assertIn(name + '-VecPermuteNCHWToNHWC-LayoutOptimizer', nodes)
+
+  def _assert_vec_nhwc_to_nchw(self, name, nodes):
+    self.assertIn(name + '-VecPermuteNHWCToNCHW-LayoutOptimizer', nodes)
+
   def _train(self, checkpoint_path, layout_optimizer=False, restore=False):
     ops.reset_default_graph()
     graph = ops.get_default_graph()
@@ -180,7 +266,7 @@ class LayoutOptimizerTest(test.TestCase):
       nodes = []
       num_transposes = 0
       for node in metadata.cost_graph.node:
-        if node.name.startswith('LayoutOptimizerTranspose'):
+        if _is_transpose(node.name):
           num_transposes += 1
         nodes.append(node.name)
 
@@ -188,16 +274,328 @@ class LayoutOptimizerTest(test.TestCase):
       # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
       expected_num_transposes = 2
       self.assertEqual(expected_num_transposes, num_transposes)
-      self.assertIn('LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Reshape-0',
-                    nodes)
-      self.assertIn('LayoutOptimizerTransposeNCHWToNHWC-Relu_1-MaxPool_1',
-                    nodes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('Relu_1-0-0', nodes)
 
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
-  def testLoop(self):
+  def testSplitWithNonConstAxis(self):
     if test.is_gpu_available(cuda_only=True):
-      output = _loop()
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      dim = array_ops.placeholder(dtype='int32')
+      split = array_ops.split(conv, 2, axis=dim)
+      scale = constant_op.constant(0.1, shape=[32])
+      offset = constant_op.constant(0.3, shape=[32])
+      bn0 = nn.fused_batch_norm(split[0], scale, offset)
+      bn1 = nn.fused_batch_norm(split[1], scale, offset)
+      add = bn0[0] + bn1[0]
+      output = array_ops.identity(add)
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output, feed_dict={dim: 3})
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata, feed_dict={dim: 3})
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('add_2-0-0', nodes)
+      self._assert_map_nhwc_to_nchw('split-0', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testSplitVWithNonConstAxis(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      dim = array_ops.placeholder(dtype='int32')
+      sizes = constant_op.constant([50, 10, 4], shape=[3])
+      split = gen_array_ops._split_v(
+          value=conv, size_splits=sizes, axis=dim, num_split=3)
+      output = math_ops.reduce_sum(split[0])
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output, feed_dict={dim: 3})
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata, feed_dict={dim: 3})
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('SplitV-0-0', nodes)
+      self._assert_map_nhwc_to_nchw('SplitV-2', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testPadWithConstPaddings(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      paddings_val = [[1, 2], [3, 4], [5, 6], [7, 8]]
+      paddings = constant_op.constant(
+          paddings_val, dtype='int32', name='PaddingsConst')
+      pad = array_ops.pad(conv, paddings)
+      output = array_ops.identity(pad)
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('Pad-0-0', nodes)
+      self.assertIn('Pad-1-LayoutOptimizer', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testReduceSum(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      reduce_sum = math_ops.reduce_sum(conv)
+      output = array_ops.identity(reduce_sum)
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Three transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 1
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testCast(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      cast = math_ops.cast(conv, dtype='bool')
+      output = array_ops.identity(cast)
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('Cast-0-0', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testSqueeze(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      reduce_sum = math_ops.reduce_sum(conv, axis=[1, 2])
+      squeeze = array_ops.squeeze(reduce_sum)
+      output = array_ops.identity(squeeze)
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Three transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 1
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testSqueezeAlongHW(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      reduce_sum = math_ops.reduce_sum(conv, axis=[1, 2], keep_dims=True)
+      squeeze = array_ops.squeeze(reduce_sum, axis=[1, 2])
+      output = array_ops.identity(squeeze)
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Three transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 1
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testSqueezeAlongNHW(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      reduce_sum = math_ops.reduce_sum(conv, axis=[0, 1, 2], keep_dims=True)
+      squeeze = array_ops.squeeze(reduce_sum, axis=[0, 1, 2])
+      output = array_ops.identity(squeeze)
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Three transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 1
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testReduceSumAlongHWC(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      reduce_sum = math_ops.reduce_sum(conv, axis=[1, 2, 3])
+      output = array_ops.identity(reduce_sum)
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Three transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 1
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testReduceSumAlongNHW(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      reduce_sum = math_ops.reduce_sum(conv, axis=[0, 1, 2])
+      output = array_ops.identity(reduce_sum)
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Three transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 1
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testReduceSumAlongC(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      reduce_sum = math_ops.reduce_sum(conv, axis=[3])
+      output = array_ops.identity(reduce_sum)
 
       with session.Session() as sess:
         output_val_ref = sess.run(output)
@@ -206,6 +604,778 @@ class LayoutOptimizerTest(test.TestCase):
         metadata = config_pb2.RunMetadata()
         output_val = sess.run(output, run_metadata=metadata)
 
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Three transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 1
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testReduceSumAlongCKeepDims(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      reduce_sum = math_ops.reduce_sum(conv, axis=[3], keep_dims=True)
+      output = array_ops.identity(reduce_sum)
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('Sum-0-0', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testConcatWithControlDependency(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      axis = constant_op.constant(3)
+      var = variables.Variable(3)
+      assign = state_ops.assign(var, 6)
+      with ops.control_dependencies([assign]):
+        concat = array_ops.concat([conv, conv], axis)
+      output = array_ops.identity(concat)
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('concat-0-0', nodes)
+      self.assertIn('concat-2-LayoutOptimizer', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testFill(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = array_ops.placeholder(dtype='float32')
+      conv = _two_layer_model(x)
+      shape = array_ops.shape(conv)
+      scalar = array_ops.constant(5.7)
+      fill = array_ops.fill(shape, scalar)
+      output = array_ops.identity(fill)
+
+      x_val = [3.4] * 784
+      with session.Session() as sess:
+        output_val_ref = sess.run(output, feed_dict={x: x_val})
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(
+            output, run_metadata=metadata, feed_dict={
+                x: x_val
+            })
+
+      nodes = []
+      num_transposes = 0
+      num_vec_permute = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        if _is_permute(node.name):
+          num_vec_permute += 1
+        nodes.append(node.name)
+
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      # Two vector permute nodes were initially added in the Expand phase of
+      # LayoutOptimizer; they cancelled out each other in the Collapse phase.
+      expected_vec_permute = 0
+      self.assertEqual(expected_vec_permute, num_vec_permute)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('Fill-0-0', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testTile(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      multiple = array_ops.placeholder(dtype='int32')
+      tile = array_ops.tile(conv, multiple)
+      output = array_ops.identity(tile)
+
+      multiple_val = [2, 3, 4, 1]
+      with session.Session() as sess:
+        output_val_ref = sess.run(output, feed_dict={multiple: multiple_val})
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(
+            output, run_metadata=metadata, feed_dict={
+                multiple: multiple_val
+            })
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('Tile-0-0', nodes)
+      self._assert_vec_nhwc_to_nchw('Tile-1', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testReverseWithConstDims(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      dims = constant_op.constant([3, 1], name='DimsConst')
+      reverse = array_ops.reverse(conv, dims)
+      output = array_ops.identity(reverse)
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('ReverseV2-0-0', nodes)
+      self.assertIn('ReverseV2-1-LayoutOptimizer', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testReverseWithNonConstDims(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      dims = array_ops.placeholder(dtype='int32')
+      reverse = array_ops.reverse(conv, dims)
+      output = array_ops.identity(reverse)
+
+      dims_val = [2, 3]
+      with session.Session() as sess:
+        output_val_ref = sess.run(output, feed_dict={dims: dims_val})
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(
+            output, run_metadata=metadata, feed_dict={
+                dims: dims_val
+            })
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('ReverseV2-0-0', nodes)
+      self._assert_map_nhwc_to_nchw('ReverseV2-1', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testSelectOp(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      add = math_ops.add(conv, conv)
+      mean = math_ops.reduce_mean(conv)
+      condition = math_ops.less(conv, mean)
+      select = gen_math_ops._select(condition, conv, add)
+      output = array_ops.identity(select)
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('Select-0-0', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testSelectOpConditionUnknownShape(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      add = math_ops.add(conv, conv)
+      condition = array_ops.placeholder(dtype='bool')
+      select = gen_math_ops._select(condition, conv, add)
+      output = array_ops.identity(select)
+
+      condition_val = np.zeros((1, 7, 7, 64))
+      with session.Session() as sess:
+        output_val_ref = sess.run(output, feed_dict={condition: condition_val})
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(
+            output, run_metadata=metadata, feed_dict={condition: condition_val})
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      expected_num_transposes = 3
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testSelectOpScalarCondition(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      add = math_ops.add(conv, conv)
+      condition = constant_op.constant(True)
+      select = gen_math_ops._select(condition, conv, add)
+      output = array_ops.identity(select)
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('Select-0-0', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testPadWithNonConstPaddings(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      paddings = array_ops.placeholder(dtype='int32')
+      pad = array_ops.pad(conv, paddings)
+      output = array_ops.identity(pad)
+
+      paddings_val = [[1, 2], [3, 4], [5, 6], [7, 8]]
+      with session.Session() as sess:
+        output_val_ref = sess.run(output, feed_dict={paddings: paddings_val})
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(
+            output, run_metadata=metadata, feed_dict={
+                paddings: paddings_val
+            })
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('Pad-0-0', nodes)
+      self._assert_vec_nhwc_to_nchw('Pad-1', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testMaxPoolV2(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      ksize = constant_op.constant([1, 2, 3, 1], shape=[4])
+      strides = array_ops.placeholder(dtype='int32', shape=[4])
+      max_pool = gen_nn_ops._max_pool_v2(conv, ksize, strides, 'VALID')
+      output = array_ops.identity(max_pool)
+
+      strides_val = [1, 3, 2, 1]
+      with session.Session() as sess:
+        output_val_ref = sess.run(output, feed_dict={strides: strides_val})
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(
+            output, run_metadata=metadata, feed_dict={
+                strides: strides_val
+            })
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('MaxPoolV2-0-0', nodes)
+      self._assert_vec_nhwc_to_nchw('MaxPoolV2-2', nodes)
+      self.assertIn('MaxPoolV2-1-LayoutOptimizer', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testMaxPoolGradV2(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      ksize = constant_op.constant([1, 2, 3, 1], shape=[4])
+      strides = array_ops.placeholder(dtype='int32', shape=[4])
+      max_pool_grad = gen_nn_ops.max_pool_grad_v2(conv, conv, conv, ksize,
+                                                  strides, 'VALID')
+      output = array_ops.identity(max_pool_grad)
+
+      strides_val = [1, 3, 2, 1]
+      with session.Session() as sess:
+        output_val_ref = sess.run(output, feed_dict={strides: strides_val})
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(
+            output, run_metadata=metadata, feed_dict={
+                strides: strides_val
+            })
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('MaxPoolGradV2-0-0', nodes)
+      self._assert_vec_nhwc_to_nchw('MaxPoolGradV2-4', nodes)
+      self.assertIn('MaxPoolGradV2-3-LayoutOptimizer', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testSliceWithNonConstAxis(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      size = array_ops.placeholder(dtype='int32')
+      s = array_ops.slice(conv, [0, 0, 0, 0], size)
+      output = array_ops.identity(s)
+
+      size_val = [1, 2, 3, 4]
+      with session.Session() as sess:
+        output_val_ref = sess.run(output, feed_dict={size: size_val})
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(
+            output, run_metadata=metadata, feed_dict={
+                size: size_val
+            })
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('Slice-0-0', nodes)
+      self._assert_vec_nhwc_to_nchw('Slice-2', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testStridedSliceWithNonConstAxis(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      end = array_ops.placeholder(dtype='int32')
+      s = array_ops.strided_slice(conv, [0, 0, 0, 0], end, strides=[1, 2, 3, 1])
+      output = array_ops.identity(s)
+
+      end_val = [1, 2, 3, 4]
+      with session.Session() as sess:
+        output_val_ref = sess.run(output, feed_dict={end: end_val})
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(
+            output, run_metadata=metadata, feed_dict={
+                end: end_val
+            })
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('StridedSlice-0-0', nodes)
+      self._assert_vec_nhwc_to_nchw('StridedSlice-2', nodes)
+      self.assertIn('StridedSlice-1-LayoutOptimizer', nodes)
+      self.assertIn('StridedSlice-3-LayoutOptimizer', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testStridedSliceWithMask1011(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      # This will generate a StridedSlice op with begin mask and
+      # end mask 11(1011).
+      s = conv[:, :, 1:-1, :]
+      output = array_ops.identity(s)
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('strided_slice-0-0', nodes)
+      self.assertIn('strided_slice-1-LayoutOptimizer', nodes)
+      self.assertIn('strided_slice-2-LayoutOptimizer', nodes)
+      self.assertIn('strided_slice-3-LayoutOptimizer', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testStridedSliceWithMask0111(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      # This will generate a StridedSlice op with begin mask and
+      # end mask 7(0111).
+      s = conv[:, :, :, 1:-1]
+      output = array_ops.identity(s)
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('strided_slice-0-0', nodes)
+      self.assertIn('strided_slice-1-LayoutOptimizer', nodes)
+      self.assertIn('strided_slice-2-LayoutOptimizer', nodes)
+      self.assertIn('strided_slice-3-LayoutOptimizer', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testStridedSliceGradWithNonConstAxis(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      end = array_ops.placeholder(dtype='int32')
+      shape = array_ops.shape(conv)
+      end_val = [1, 2, 3, 4]
+      s = array_ops.strided_slice(
+          conv, [0, 0, 0, 0], end_val, strides=[1, 2, 3, 1])
+      s_grad = array_ops.strided_slice_grad(shape, [0, 0, 0, 0], end,
+                                            [1, 2, 3, 1], s)
+      output = array_ops.identity(s_grad)
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output, feed_dict={end: end_val})
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(
+            output, run_metadata=metadata, feed_dict={
+                end: end_val
+            })
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('StridedSliceGrad-0-0', nodes)
+      self._assert_vec_nhwc_to_nchw('StridedSliceGrad-2', nodes)
+      self.assertIn('StridedSlice-1-LayoutOptimizer', nodes)
+      self.assertIn('StridedSlice-2-LayoutOptimizer', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testShapeN(self):
+    if test.is_gpu_available(cuda_only=True):
+      x = array_ops.placeholder(dtype='float32')
+      conv = _two_layer_model(x)
+      shapen = array_ops.shape_n([conv, conv])
+      output = math_ops.add(shapen[0], shapen[1])
+
+      x_val = [1.7] * 784
+      with session.Session() as sess:
+        output_val_ref = sess.run(output, feed_dict={x: x_val})
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(
+            output, run_metadata=metadata, feed_dict={
+                x: x_val
+            })
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      expected_num_transposes = 1
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self._assert_vec_nchw_to_nhwc('ShapeN-0-0', nodes)
+      self.assertAllEqual(output_val_ref, output_val)
+
+  def testShapeNFollowedByNotConvertibleNodeReshape(self):
+    if test.is_gpu_available(cuda_only=True):
+      x = array_ops.placeholder(dtype='float32')
+      conv = _two_layer_model(x)
+      conv_reshape = array_ops.reshape(conv, [1, 1, 1, -1])
+      shapen = array_ops.shape_n([conv, conv_reshape])
+      shape = array_ops.identity(shapen[1])
+      ones = array_ops.ones(shape)
+      output = math_ops.add_n([conv_reshape, ones])
+
+      x_val = [1.7] * 784
+      with session.Session() as sess:
+        output_val_ref = sess.run(output, feed_dict={x: x_val})
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(
+            output, run_metadata=metadata, feed_dict={x: x_val})
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
+      self.assertAllEqual(output_val_ref, output_val)
+
+  def testLoop(self):
+    if test.is_gpu_available(cuda_only=True):
+      output = _loop()
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('map/while/Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('map/while/MaxPool_1-0-2', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testLoopWithBranch(self):
+    if test.is_gpu_available(cuda_only=True):
+      output = _loop_with_branch()
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      expected_num_transposes = 3
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('map/while/Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('map/while/Add-0-2', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testLoopWithVecAnd4D(self):
+    if test.is_gpu_available(cuda_only=True):
+      output = _loop_with_vec_and_4d()
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('map/while/Conv2D-0', nodes)
+      self._assert_trans_nchw_to_nhwc('map/while/Add-0-2', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testBinaryOpSecondPort(self):
+    if test.is_gpu_available(cuda_only=True):
+      output = _model_with_second_port()
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output)
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata)
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if _is_transpose(node.name):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self._assert_trans_nhwc_to_nchw('FusedBatchNorm-0', nodes)
+      self._assert_trans_nchw_to_nhwc('Add-0-0', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
   def testGradient(self):
@@ -219,7 +1389,7 @@ class LayoutOptimizerTest(test.TestCase):
     for node in optimized_graph.node:
       if node.op in ['Conv2D', 'Conv2DBackpropFilter', 'Conv2DBackpropInput']:
         found += 1
-        self.assertEqual(node.attr['data_format'].s, 'NCHW')
+        self.assertEqual(node.attr['data_format'].s, b'NCHW')
     self.assertEqual(found, 5)
 
   def testDepthwise(self):
@@ -236,7 +1406,7 @@ class LayoutOptimizerTest(test.TestCase):
           'DepthwiseConv2dNativeBackpropInput'
       ]:
         found += 1
-        self.assertEqual(node.attr['data_format'].s, 'NCHW')
+        self.assertEqual(node.attr['data_format'].s, b'NCHW')
     self.assertEqual(found, 6)
 
   def testCheckpointCompatibility(self):
diff --git a/tensorflow/python/grappler/memory_optimizer_test.py b/tensorflow/python/grappler/memory_optimizer_test.py
index 9fbadeceb3b1a8c9f949bc59a5ec75c5b7420cac..948911f099674af4c6dd19bfdac75e5fc1f75c78 100644
--- a/tensorflow/python/grappler/memory_optimizer_test.py
+++ b/tensorflow/python/grappler/memory_optimizer_test.py
@@ -18,12 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
@@ -33,6 +35,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.training import training as train
 
 
+@test_util.with_c_api
 class MemoryOptimizerSwapTest(test.TestCase):
   """Tests the Grappler memory optimizer."""
 
@@ -66,7 +69,7 @@ class MemoryOptimizerSwapTest(test.TestCase):
     train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
     train_op.append(d)
 
-    d.op.node_def.attr['_swap_to_host'].i = 0
+    d.op._set_attr('_swap_to_host', attr_value_pb2.AttrValue(i=0))
 
     mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
     graph_size = len(mg.graph_def.node)
@@ -92,6 +95,7 @@ class MemoryOptimizerSwapTest(test.TestCase):
         self.assertEqual('c', node.input[1])
 
 
+@test_util.with_c_api
 class MemoryOptimizerRecomputeTest(test.TestCase):
   """Tests the Python interface to recomputation rewrites.
 
@@ -128,6 +132,7 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
         rewriter_config_pb2.RewriterConfig(
             disable_model_pruning=True,
             constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
+            dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF,
             layout_optimizer=rewriter_config_pb2.RewriterConfig.OFF,
             arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
             memory_optimization=rewriter_config_pb2.RewriterConfig.
@@ -152,6 +157,7 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
         rewriter_config_pb2.RewriterConfig(
             disable_model_pruning=True,
             constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
+            dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF,
             layout_optimizer=rewriter_config_pb2.RewriterConfig.OFF,
             arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
             memory_optimization=rewriter_config_pb2.RewriterConfig.
@@ -209,6 +215,66 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
         loss_op_name=loss_op_name)
     self.assertAllClose(original_loss, memory_optimized_loss, rtol=1e-4)
 
+  def _annotated_graph(self):
+    graph = ops.Graph()
+    with graph.as_default():
+      random_seed.set_random_seed(2)
+      current_activation = variable_scope.get_variable(
+          name='start', shape=[1, 2, 2, 5])
+      conv_filter = variable_scope.get_variable(
+          name='filter', shape=[5, 5, 5, 5])
+      for layer_number in range(3):
+        with variable_scope.variable_scope('layer_{}'.format(layer_number)):
+          after_conv = nn.conv2d(current_activation, conv_filter, [1, 1, 1, 1],
+                                 'SAME')
+          current_activation = 2. * after_conv
+          current_activation.op._set_attr(
+              '_recompute_hint',
+              # The value of the attribute does not matter; just that the key
+              # exists in the op's attributes.
+              attr_value_pb2.AttrValue(i=1))
+          current_activation += 5.
+          current_activation.op._set_attr(
+              '_recompute_hint', attr_value_pb2.AttrValue(i=0))
+          current_activation = nn.relu(current_activation)
+          current_activation.op._set_attr(
+              '_recompute_hint', attr_value_pb2.AttrValue(i=1))
+      loss = math_ops.reduce_mean(current_activation)
+      optimizer = train.AdamOptimizer(0.001)
+      train_op = optimizer.minimize(loss)
+      init_op = variables.global_variables_initializer()
+    return graph, init_op, train_op
+
+  def testHintNoMetaGraph(self):
+    # Closer to expected usage, but does not check that a re-write actually
+    # happens; see testHintDoesRewrite.
+    graph, init_op, train_op = self._annotated_graph()
+    with graph.as_default():
+      manual_memory_config = rewriter_config_pb2.RewriterConfig(
+          memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
+      graph_options = config_pb2.GraphOptions(
+          rewrite_options=manual_memory_config)
+      session_config = config_pb2.ConfigProto(graph_options=graph_options)
+      with session.Session(config=session_config) as sess:
+        sess.run(init_op)
+        sess.run(train_op)
+
+  def testHintDoesRewrite(self):
+    graph = self._annotated_graph()[0]
+    with graph.as_default():
+      metagraph = train.export_meta_graph()
+    self.assertEqual(
+        0,
+        len([node for node in metagraph.graph_def.node
+             if 'Recomputed/' in node.name]))
+    rewritten_graph_def = tf_optimizer.OptimizeGraph(
+        rewriter_config_pb2.RewriterConfig(
+            memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL),
+        metagraph)
+    self.assertEqual(
+        9,
+        len([node for node in rewritten_graph_def.node
+             if 'Recomputed/' in node.name]))
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/grappler/model_analyzer.cc b/tensorflow/python/grappler/model_analyzer.cc
index 7d365c3be923e216b44149921b76d734c2b9a82f..d23eb811ac2b0a6a8802979b4d966b5617c8a8d9 100644
--- a/tensorflow/python/grappler/model_analyzer.cc
+++ b/tensorflow/python/grappler/model_analyzer.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/python/grappler/model_analyzer.h"
 
 #include <iomanip>
+#include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -25,26 +26,26 @@ namespace grappler {
 
 ModelAnalyzer::ModelAnalyzer(const GrapplerItem& item) : item_(item) {}
 
-Status ModelAnalyzer::GenerateReport(std::ostream& os) {
+Status ModelAnalyzer::GenerateReport(bool debug, std::ostream& os) {
   GraphProperties properties(item_);
-  TF_RETURN_IF_ERROR(properties.InferStatically());
+  TF_RETURN_IF_ERROR(properties.InferStatically(false));
 
   for (const auto& node : item_.MainOpsFanin()) {
-    PrintNodeInfo(node, properties, os);
+    PrintNodeInfo(node, properties, debug, os);
   }
   for (const auto& node : item_.EnqueueOpsFanin()) {
-    PrintNodeInfo(node, properties, os);
+    PrintNodeInfo(node, properties, debug, os);
   }
 
   return Status::OK();
 }
 
 void ModelAnalyzer::PrintNodeInfo(const NodeDef* node,
-                                  const GraphProperties& properties,
+                                  const GraphProperties& properties, bool debug,
                                   std::ostream& os) const {
   os << node->name() << " [" << node->op() << "]" << std::endl;
   if (properties.HasOutputProperties(node->name())) {
-    std::vector<OpInfo::TensorProperties> props =
+    const std::vector<OpInfo::TensorProperties>& props =
         properties.GetOutputProperties(node->name());
     for (int i = 0; i < props.size(); ++i) {
       const OpInfo::TensorProperties& prop = props[i];
@@ -75,6 +76,27 @@ void ModelAnalyzer::PrintNodeInfo(const NodeDef* node,
       os << std::endl;
     }
   }
+
+  if (debug) {
+    const OpRegistrationData* op_reg_data;
+    Status status = OpRegistry::Global()->LookUp(node->op(), &op_reg_data);
+    if (!status.ok()) {
+      os << "\tCouldn't find op registration for " << node->op() << std::endl;
+    } else if (!op_reg_data->shape_inference_fn) {
+      os << "\tCouldn't find shape function for op " << node->op() << std::endl;
+    } else if (properties.HasInputProperties(node->name())) {
+      const std::vector<OpInfo::TensorProperties>& props =
+          properties.GetInputProperties(node->name());
+      for (int i = 0; i < props.size(); ++i) {
+        const OpInfo::TensorProperties& prop = props[i];
+        if (prop.has_value()) {
+          os << "\t"
+             << "input " << i << " (" << DataTypeString(prop.dtype())
+             << ") has known value" << std::endl;
+        }
+      }
+    }
+  }
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/python/grappler/model_analyzer.h b/tensorflow/python/grappler/model_analyzer.h
index a14034103ca70e59ac24d88318edc198e7d1c5f4..5bc551927d88db723e21b29903d6f5b941048139 100644
--- a/tensorflow/python/grappler/model_analyzer.h
+++ b/tensorflow/python/grappler/model_analyzer.h
@@ -31,11 +31,11 @@ class GraphProperties;
 class ModelAnalyzer {
  public:
   explicit ModelAnalyzer(const GrapplerItem& item);
-  Status GenerateReport(std::ostream& os);
+  Status GenerateReport(bool debug, std::ostream& os);
 
  private:
   void PrintNodeInfo(const NodeDef* node, const GraphProperties& properties,
-                     std::ostream& os) const;
+                     bool debug, std::ostream& os) const;
 
   const GrapplerItem& item_;
 };
diff --git a/tensorflow/python/grappler/model_analyzer.i b/tensorflow/python/grappler/model_analyzer.i
index 726143a0bb4db28538f4338eb3773d85332dc122..7c3a692d0efc501341ff1dff3cf24b8a4830ec84 100644
--- a/tensorflow/python/grappler/model_analyzer.i
+++ b/tensorflow/python/grappler/model_analyzer.i
@@ -40,7 +40,7 @@ limitations under the License.
 %}
 
 %{
-string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph) {
+string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph, bool debug) {
   tensorflow::grappler::ItemConfig cfg;
   cfg.apply_optimizations = false;
   std::unique_ptr<tensorflow::grappler::GrapplerItem> item =
@@ -53,10 +53,10 @@ string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph) {
   tensorflow::grappler::ModelAnalyzer analyzer(*item);
 
   std::stringstream os;
-  analyzer.GenerateReport(os);
+  analyzer.GenerateReport(debug, os);
   return os.str();
 }
 
 %}
 
-string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph);
+string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph, bool debug);
diff --git a/tensorflow/python/grappler/model_analyzer.py b/tensorflow/python/grappler/model_analyzer.py
index c852d71ad8b047f5437ca62c49a5500bc29cec60..535889e1c4034952562a05e4d044fcafeddbc0ca 100644
--- a/tensorflow/python/grappler/model_analyzer.py
+++ b/tensorflow/python/grappler/model_analyzer.py
@@ -22,16 +22,18 @@ from tensorflow.python import pywrap_tensorflow as tf_wrap
 from tensorflow.python.framework import errors
 
 
-def GenerateModelReport(metagraph):
+def GenerateModelReport(metagraph, debug=False):
   """Report what's known statically about each node in the provided metagraph.
 
   Args:
     metagraph: A TensorFlow MetaGraphDef.
+    debug: Add some information useful for debugging.
 
   Returns:
     A string containing the report.
   """
   with errors.raise_exception_on_not_ok_status():
-    ret_from_swig = tf_wrap.GenerateModelReport(metagraph.SerializeToString())
+    ret_from_swig = tf_wrap.GenerateModelReport(metagraph.SerializeToString(),
+                                                debug)
 
   return ret_from_swig
diff --git a/tensorflow/python/grappler/model_analyzer_test.py b/tensorflow/python/grappler/model_analyzer_test.py
index b59d1650f4b5e4c7239c2275213e9a26c3aafafe..ec172755f1ae43fc7581e97c6a18471da45f9100 100644
--- a/tensorflow/python/grappler/model_analyzer_test.py
+++ b/tensorflow/python/grappler/model_analyzer_test.py
@@ -49,6 +49,24 @@ class PyWrapOptimizeGraphTest(test.TestCase):
     # Also print the report to make it easier to debug
     print("{}".format(report))
 
+  def testDebugMode(self):
+    """Make sure arguments can be passed correctly."""
+    a = constant_op.constant([10, 11], name="a")
+    b = constant_op.constant([10], name="b")
+    c = math_ops.add(a, b, name="c")
+    train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+    train_op.append(c)
+    mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
+
+    report = model_analyzer.GenerateModelReport(mg, debug=True)
+
+    # Check the report headers
+    self.assertTrue(b"input 0 (int32) has known value" in report)
+    self.assertTrue(b"input 1 (int32) has known value" in report)
+
+    # Also print the report to make it easier to debug
+    print("{}".format(report))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/grappler/tf_optimizer.i b/tensorflow/python/grappler/tf_optimizer.i
index f0dd4483a635ddf39e7f51ad0008390c1feb2e13..1b657983a4690dd0ddb7f569ce514b08cb10400a 100644
--- a/tensorflow/python/grappler/tf_optimizer.i
+++ b/tensorflow/python/grappler/tf_optimizer.i
@@ -103,6 +103,11 @@ PyObject* TF_OptimizeGraph(
     std::unique_ptr<tensorflow::grappler::GrapplerItem> grappler_item =
         tensorflow::grappler::GrapplerItemFromMetaGraphDef(graph_id, metagraph, item_config);
 
+    if (!grappler_item) {
+      TF_SetStatus(out_status, TF_INVALID_ARGUMENT, "Failed to import metagraph, check error log for more info.");
+      return nullptr;
+    }
+
     tensorflow::DeviceBase* cpu_device = nullptr;
     tensorflow::GraphDef out_graph;
     tensorflow::grappler::MetaOptimizer optimizer(cpu_device, rewriter_config);
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 5959659a40ccdfbc6448f425ce776fae4d0bcd79..d97a035256aa5eb927106646ebb9d35c654a2c82 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -3,6 +3,8 @@
 
 licenses(["notice"])  # Apache 2.0
 
+exports_files(["LICENSE"])
+
 package(default_visibility = ["//visibility:public"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
@@ -14,10 +16,12 @@ py_library(
         "_impl/keras/__init__.py",
         "_impl/keras/activations.py",
         "_impl/keras/applications/__init__.py",
+        "_impl/keras/applications/densenet.py",
         "_impl/keras/applications/imagenet_utils.py",
         "_impl/keras/applications/inception_resnet_v2.py",
         "_impl/keras/applications/inception_v3.py",
         "_impl/keras/applications/mobilenet.py",
+        "_impl/keras/applications/nasnet.py",
         "_impl/keras/applications/resnet50.py",
         "_impl/keras/applications/vgg16.py",
         "_impl/keras/applications/vgg19.py",
@@ -37,6 +41,7 @@ py_library(
         "_impl/keras/engine/__init__.py",
         "_impl/keras/engine/topology.py",
         "_impl/keras/engine/training.py",
+        "_impl/keras/engine/training_eager.py",
         "_impl/keras/estimator.py",
         "_impl/keras/initializers.py",
         "_impl/keras/layers/__init__.py",
@@ -76,9 +81,11 @@ py_library(
         "_impl/keras/wrappers/scikit_learn.py",
         "activations/__init__.py",
         "applications/__init__.py",
+        "applications/densenet/__init__.py",
         "applications/inception_resnet_v2/__init__.py",
         "applications/inception_v3/__init__.py",
         "applications/mobilenet/__init__.py",
+        "applications/nasnet/__init__.py",
         "applications/resnet50/__init__.py",
         "applications/vgg16/__init__.py",
         "applications/vgg19/__init__.py",
@@ -256,6 +263,18 @@ py_test(
     ],
 )
 
+py_test(
+    name = "densenet_test",
+    size = "large",
+    srcs = ["_impl/keras/applications/densenet_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "inception_resnet_v2_test",
     size = "medium",
@@ -292,6 +311,18 @@ py_test(
     ],
 )
 
+py_test(
+    name = "nasnet_test",
+    size = "large",
+    srcs = ["_impl/keras/applications/nasnet_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "resnet50_test",
     size = "small",
@@ -392,7 +423,7 @@ py_test(
 
 py_test(
     name = "core_test",
-    size = "small",
+    size = "medium",
     srcs = ["_impl/keras/layers/core_test.py"],
     srcs_version = "PY2AND3",
     deps = [
@@ -453,6 +484,7 @@ py_test(
     size = "small",
     srcs = ["_impl/keras/layers/normalization_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["notsan"],
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
@@ -504,7 +536,7 @@ py_test(
 
 py_test(
     name = "recurrent_test",
-    size = "small",
+    size = "medium",
     srcs = ["_impl/keras/layers/recurrent_test.py"],
     srcs_version = "PY2AND3",
     deps = [
@@ -527,7 +559,7 @@ py_test(
 
 py_test(
     name = "wrappers_test",
-    size = "small",
+    size = "medium",
     srcs = ["_impl/keras/layers/wrappers_test.py"],
     srcs_version = "PY2AND3",
     tags = ["notsan"],
@@ -553,7 +585,7 @@ py_test(
 
 py_test(
     name = "data_utils_test",
-    size = "small",
+    size = "medium",
     srcs = ["_impl/keras/utils/data_utils_test.py"],
     srcs_version = "PY2AND3",
     tags = [
@@ -691,6 +723,32 @@ py_test(
     ],
 )
 
+py_test(
+    name = "training_eager_test",
+    size = "medium",
+    srcs = ["_impl/keras/engine/training_eager_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["notsan"],
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "model_subclassing_test",
+    size = "medium",
+    srcs = ["_impl/keras/model_subclassing_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["notsan"],
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "topology_test",
     size = "small",
diff --git a/tensorflow/python/keras/_impl/keras/__init__.py b/tensorflow/python/keras/_impl/keras/__init__.py
index 74cc9d0488c88de04bf29aafcd0e23895c59826a..73113539329c5493141db243b85254062f7b8f88 100644
--- a/tensorflow/python/keras/_impl/keras/__init__.py
+++ b/tensorflow/python/keras/_impl/keras/__init__.py
@@ -40,4 +40,4 @@ from tensorflow.python.keras._impl.keras.layers import Input
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.models import Sequential
 
-__version__ = '2.1.1-tf'
+__version__ = '2.1.3-tf'
diff --git a/tensorflow/python/keras/_impl/keras/activations.py b/tensorflow/python/keras/_impl/keras/activations.py
index f017d2ae85548211070ececf48e977dd7d2f6a25..236e17653e1b762e1e6962f453b714d1bf7bcbf7 100644
--- a/tensorflow/python/keras/_impl/keras/activations.py
+++ b/tensorflow/python/keras/_impl/keras/activations.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Keras built-in activation functions.
+"""Built-in activation functions.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -24,8 +24,10 @@ from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.layers.base import Layer
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.activations.softmax')
 def softmax(x, axis=-1):
   """Softmax activation function.
 
@@ -50,10 +52,12 @@ def softmax(x, axis=-1):
     raise ValueError('Cannot apply softmax to a tensor that is 1D')
 
 
+@tf_export('keras.activations.elu')
 def elu(x, alpha=1.0):
   return K.elu(x, alpha)
 
 
+@tf_export('keras.activations.selu')
 def selu(x):
   """Scaled Exponential Linear Unit. (Klambauer et al., 2017).
 
@@ -61,48 +65,59 @@ def selu(x):
       x: A tensor or variable to compute the activation function for.
 
   Returns:
-    Tensor with the same shape and dtype as `x`.
+      Tensor with the same shape and dtype as `x`.
+
+  # Note
+      - To be used together with the initialization "lecun_normal".
+      - To be used together with the dropout variant "AlphaDropout".
 
-  References:
-      - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
   """
   alpha = 1.6732632423543772848170429916717
   scale = 1.0507009873554804934193349852946
   return scale * K.elu(x, alpha)
 
 
+@tf_export('keras.activations.softplus')
 def softplus(x):
   return K.softplus(x)
 
 
+@tf_export('keras.activations.softsign')
 def softsign(x):
   return K.softsign(x)
 
 
+@tf_export('keras.activations.relu')
 def relu(x, alpha=0., max_value=None):
   return K.relu(x, alpha=alpha, max_value=max_value)
 
 
+@tf_export('keras.activations.tanh')
 def tanh(x):
   return K.tanh(x)
 
 
+@tf_export('keras.activations.sigmoid')
 def sigmoid(x):
   return K.sigmoid(x)
 
 
+@tf_export('keras.activations.hard_sigmoid')
 def hard_sigmoid(x):
   return K.hard_sigmoid(x)
 
 
+@tf_export('keras.activations.linear')
 def linear(x):
   return x
 
 
+@tf_export('keras.activations.serialize')
 def serialize(activation):
   return activation.__name__
 
 
+@tf_export('keras.activations.deserialize')
 def deserialize(name, custom_objects=None):
   return deserialize_keras_object(
       name,
@@ -111,6 +126,7 @@ def deserialize(name, custom_objects=None):
       printable_module_name='activation function')
 
 
+@tf_export('keras.activations.get')
 def get(identifier):
   if identifier is None:
     return linear
diff --git a/tensorflow/python/keras/_impl/keras/applications/__init__.py b/tensorflow/python/keras/_impl/keras/applications/__init__.py
index c11c52b71e9bff1cfd595a9dbc0e86dcaa8506c8..206a769b377483c65a78b76fe44055eb50bdc7c4 100644
--- a/tensorflow/python/keras/_impl/keras/applications/__init__.py
+++ b/tensorflow/python/keras/_impl/keras/applications/__init__.py
@@ -18,9 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet121
+from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet169
+from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet201
 from tensorflow.python.keras._impl.keras.applications.inception_resnet_v2 import InceptionResNetV2
 from tensorflow.python.keras._impl.keras.applications.inception_v3 import InceptionV3
 from tensorflow.python.keras._impl.keras.applications.mobilenet import MobileNet
+from tensorflow.python.keras._impl.keras.applications.nasnet import NASNetLarge
+from tensorflow.python.keras._impl.keras.applications.nasnet import NASNetMobile
 from tensorflow.python.keras._impl.keras.applications.resnet50 import ResNet50
 from tensorflow.python.keras._impl.keras.applications.vgg16 import VGG16
 from tensorflow.python.keras._impl.keras.applications.vgg19 import VGG19
diff --git a/tensorflow/python/keras/_impl/keras/applications/densenet.py b/tensorflow/python/keras/_impl/keras/applications/densenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..6521f8410435fd13393b9991d3ee9a6342a912d0
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/applications/densenet.py
@@ -0,0 +1,354 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=invalid-name
+# pylint: disable=unused-import
+"""DenseNet models for Keras.
+
+# Reference paper
+
+- [Densely Connected Convolutional Networks]
+  (https://arxiv.org/abs/1608.06993) (CVPR 2017 Best Paper Award)
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras.applications import imagenet_utils
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
+from tensorflow.python.keras._impl.keras.layers import Activation
+from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
+from tensorflow.python.keras._impl.keras.layers import BatchNormalization
+from tensorflow.python.keras._impl.keras.layers import Concatenate
+from tensorflow.python.keras._impl.keras.layers import Conv2D
+from tensorflow.python.keras._impl.keras.layers import Dense
+from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
+from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
+from tensorflow.python.keras._impl.keras.layers import Input
+from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
+from tensorflow.python.keras._impl.keras.layers import ZeroPadding2D
+from tensorflow.python.keras._impl.keras.models import Model
+from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.util.tf_export import tf_export
+
+
+DENSENET121_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet121_weights_tf_dim_ordering_tf_kernels.h5'
+DENSENET121_WEIGHT_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5'
+DENSENET169_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet169_weights_tf_dim_ordering_tf_kernels.h5'
+DENSENET169_WEIGHT_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet169_weights_tf_dim_ordering_tf_kernels_notop.h5'
+DENSENET201_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet201_weights_tf_dim_ordering_tf_kernels.h5'
+DENSENET201_WEIGHT_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet201_weights_tf_dim_ordering_tf_kernels_notop.h5'
+
+
+def dense_block(x, blocks, name):
+  """A dense block.
+
+  Arguments:
+      x: input tensor.
+      blocks: integer, the number of building blocks.
+      name: string, block label.
+
+  Returns:
+      output tensor for the block.
+  """
+  for i in range(blocks):
+    x = conv_block(x, 32, name=name + '_block' + str(i + 1))
+  return x
+
+
+def transition_block(x, reduction, name):
+  """A transition block.
+
+  Arguments:
+      x: input tensor.
+      reduction: float, compression rate at transition layers.
+      name: string, block label.
+
+  Returns:
+      output tensor for the block.
+  """
+  bn_axis = 3 if K.image_data_format() == 'channels_last' else 1
+  x = BatchNormalization(axis=bn_axis, epsilon=1.001e-5, name=name + '_bn')(x)
+  x = Activation('relu', name=name + '_relu')(x)
+  x = Conv2D(
+      int(K.int_shape(x)[bn_axis] * reduction),
+      1,
+      use_bias=False,
+      name=name + '_conv')(
+          x)
+  x = AveragePooling2D(2, strides=2, name=name + '_pool')(x)
+  return x
+
+
+def conv_block(x, growth_rate, name):
+  """A building block for a dense block.
+
+  Arguments:
+      x: input tensor.
+      growth_rate: float, growth rate at dense layers.
+      name: string, block label.
+
+  Returns:
+      output tensor for the block.
+  """
+  bn_axis = 3 if K.image_data_format() == 'channels_last' else 1
+  x1 = BatchNormalization(
+      axis=bn_axis, epsilon=1.001e-5, name=name + '_0_bn')(
+          x)
+  x1 = Activation('relu', name=name + '_0_relu')(x1)
+  x1 = Conv2D(4 * growth_rate, 1, use_bias=False, name=name + '_1_conv')(x1)
+  x1 = BatchNormalization(
+      axis=bn_axis, epsilon=1.001e-5, name=name + '_1_bn')(
+          x1)
+  x1 = Activation('relu', name=name + '_1_relu')(x1)
+  x1 = Conv2D(
+      growth_rate, 3, padding='same', use_bias=False, name=name + '_2_conv')(
+          x1)
+  x = Concatenate(axis=bn_axis, name=name + '_concat')([x, x1])
+  return x
+
+
+def DenseNet(blocks,
+             include_top=True,
+             weights='imagenet',
+             input_tensor=None,
+             input_shape=None,
+             pooling=None,
+             classes=1000):
+  """Instantiates the DenseNet architecture.
+
+  Optionally loads weights pre-trained
+  on ImageNet. Note that when using TensorFlow,
+  for best performance you should set
+  `image_data_format='channels_last'` in your Keras config
+  at ~/.keras/keras.json.
+
+  The model and the weights are compatible with
+  TensorFlow, Theano, and CNTK. The data format
+  convention used by the model is the one
+  specified in your Keras config file.
+
+  Arguments:
+      blocks: numbers of building blocks for the four dense layers.
+      include_top: whether to include the fully-connected
+          layer at the top of the network.
+      weights: one of `None` (random initialization),
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
+      input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
+          to use as image input for the model.
+      input_shape: optional shape tuple, only to be specified
+          if `include_top` is False (otherwise the input shape
+          has to be `(224, 224, 3)` (with `channels_last` data format)
+          or `(3, 224, 224)` (with `channels_first` data format).
+          It should have exactly 3 inputs channels.
+      pooling: optional pooling mode for feature extraction
+          when `include_top` is `False`.
+          - `None` means that the output of the model will be
+              the 4D tensor output of the
+              last convolutional layer.
+          - `avg` means that global average pooling
+              will be applied to the output of the
+              last convolutional layer, and thus
+              the output of the model will be a 2D tensor.
+          - `max` means that global max pooling will
+              be applied.
+      classes: optional number of classes to classify images
+          into, only to be specified if `include_top` is True, and
+          if no `weights` argument is specified.
+
+  Returns:
+      A Keras model instance.
+
+  Raises:
+      ValueError: in case of invalid argument for `weights`,
+          or invalid input shape.
+  """
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
+    raise ValueError('The `weights` argument should be either '
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
+
+  if weights == 'imagenet' and include_top and classes != 1000:
+    raise ValueError('If using `weights` as imagenet with `include_top`'
+                     ' as true, `classes` should be 1000')
+
+  # Determine proper input shape
+  input_shape = _obtain_input_shape(
+      input_shape,
+      default_size=224,
+      min_size=221,
+      data_format=K.image_data_format(),
+      require_flatten=include_top,
+      weights=weights)
+
+  if input_tensor is None:
+    img_input = Input(shape=input_shape)
+  else:
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
+
+  bn_axis = 3 if K.image_data_format() == 'channels_last' else 1
+
+  x = ZeroPadding2D(padding=((3, 3), (3, 3)))(img_input)
+  x = Conv2D(64, 7, strides=2, use_bias=False, name='conv1/conv')(x)
+  x = BatchNormalization(axis=bn_axis, epsilon=1.001e-5, name='conv1/bn')(x)
+  x = Activation('relu', name='conv1/relu')(x)
+  x = ZeroPadding2D(padding=((1, 1), (1, 1)))(x)
+  x = MaxPooling2D(3, strides=2, name='pool1')(x)
+
+  x = dense_block(x, blocks[0], name='conv2')
+  x = transition_block(x, 0.5, name='pool2')
+  x = dense_block(x, blocks[1], name='conv3')
+  x = transition_block(x, 0.5, name='pool3')
+  x = dense_block(x, blocks[2], name='conv4')
+  x = transition_block(x, 0.5, name='pool4')
+  x = dense_block(x, blocks[3], name='conv5')
+
+  x = BatchNormalization(axis=bn_axis, epsilon=1.001e-5, name='bn')(x)
+
+  if include_top:
+    x = GlobalAveragePooling2D(name='avg_pool')(x)
+    x = Dense(classes, activation='softmax', name='fc1000')(x)
+  else:
+    if pooling == 'avg':
+      x = GlobalAveragePooling2D(name='avg_pool')(x)
+    elif pooling == 'max':
+      x = GlobalMaxPooling2D(name='max_pool')(x)
+
+  # Ensure that the model takes into account
+  # any potential predecessors of `input_tensor`.
+  if input_tensor is not None:
+    inputs = get_source_inputs(input_tensor)
+  else:
+    inputs = img_input
+
+  # Create model.
+  if blocks == [6, 12, 24, 16]:
+    model = Model(inputs, x, name='densenet121')
+  elif blocks == [6, 12, 32, 32]:
+    model = Model(inputs, x, name='densenet169')
+  elif blocks == [6, 12, 48, 32]:
+    model = Model(inputs, x, name='densenet201')
+  else:
+    model = Model(inputs, x, name='densenet')
+
+  # Load weights.
+  if weights == 'imagenet':
+    if include_top:
+      if blocks == [6, 12, 24, 16]:
+        weights_path = get_file(
+            'densenet121_weights_tf_dim_ordering_tf_kernels.h5',
+            DENSENET121_WEIGHT_PATH,
+            cache_subdir='models',
+            file_hash='0962ca643bae20f9b6771cb844dca3b0')
+      elif blocks == [6, 12, 32, 32]:
+        weights_path = get_file(
+            'densenet169_weights_tf_dim_ordering_tf_kernels.h5',
+            DENSENET169_WEIGHT_PATH,
+            cache_subdir='models',
+            file_hash='bcf9965cf5064a5f9eb6d7dc69386f43')
+      elif blocks == [6, 12, 48, 32]:
+        weights_path = get_file(
+            'densenet201_weights_tf_dim_ordering_tf_kernels.h5',
+            DENSENET201_WEIGHT_PATH,
+            cache_subdir='models',
+            file_hash='7bb75edd58cb43163be7e0005fbe95ef')
+    else:
+      if blocks == [6, 12, 24, 16]:
+        weights_path = get_file(
+            'densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5',
+            DENSENET121_WEIGHT_PATH_NO_TOP,
+            cache_subdir='models',
+            file_hash='4912a53fbd2a69346e7f2c0b5ec8c6d3')
+      elif blocks == [6, 12, 32, 32]:
+        weights_path = get_file(
+            'densenet169_weights_tf_dim_ordering_tf_kernels_notop.h5',
+            DENSENET169_WEIGHT_PATH_NO_TOP,
+            cache_subdir='models',
+            file_hash='50662582284e4cf834ce40ab4dfa58c6')
+      elif blocks == [6, 12, 48, 32]:
+        weights_path = get_file(
+            'densenet201_weights_tf_dim_ordering_tf_kernels_notop.h5',
+            DENSENET201_WEIGHT_PATH_NO_TOP,
+            cache_subdir='models',
+            file_hash='1c2de60ee40562448dbac34a0737e798')
+    model.load_weights(weights_path)
+  elif weights is not None:
+    model.load_weights(weights)
+
+  return model
+
+
+@tf_export('keras.applications.DenseNet121',
+           'keras.applications.densenet.DenseNet121')
+def DenseNet121(include_top=True,
+                weights='imagenet',
+                input_tensor=None,
+                input_shape=None,
+                pooling=None,
+                classes=1000):
+  return DenseNet([6, 12, 24, 16], include_top, weights, input_tensor,
+                  input_shape, pooling, classes)
+
+
+@tf_export('keras.applications.DenseNet169',
+           'keras.applications.densenet.DenseNet169')
+def DenseNet169(include_top=True,
+                weights='imagenet',
+                input_tensor=None,
+                input_shape=None,
+                pooling=None,
+                classes=1000):
+  return DenseNet([6, 12, 32, 32], include_top, weights, input_tensor,
+                  input_shape, pooling, classes)
+
+
+@tf_export('keras.applications.DenseNet201',
+           'keras.applications.densenet.DenseNet201')
+def DenseNet201(include_top=True,
+                weights='imagenet',
+                input_tensor=None,
+                input_shape=None,
+                pooling=None,
+                classes=1000):
+  return DenseNet([6, 12, 48, 32], include_top, weights, input_tensor,
+                  input_shape, pooling, classes)
+
+
+@tf_export('keras.applications.densenet.preprocess_input')
+def preprocess_input(x, data_format=None):
+  """Preprocesses a numpy array encoding a batch of images.
+
+  Arguments:
+      x: a 3D or 4D numpy array consists of RGB values within [0, 255].
+      data_format: data format of the image tensor.
+
+  Returns:
+      Preprocessed array.
+  """
+  return imagenet_utils.preprocess_input(x, data_format, mode='torch')
+
+
+setattr(DenseNet121, '__doc__', DenseNet.__doc__)
+setattr(DenseNet169, '__doc__', DenseNet.__doc__)
+setattr(DenseNet201, '__doc__', DenseNet.__doc__)
diff --git a/tensorflow/python/keras/_impl/keras/applications/densenet_test.py b/tensorflow/python/keras/_impl/keras/applications/densenet_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b92287a1e77a944c069a6c234e11e4a79ad7d32
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/applications/densenet_test.py
@@ -0,0 +1,101 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for DenseNet application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras._impl import keras
+from tensorflow.python.platform import test
+
+
+class DenseNet121Test(test.TestCase):
+
+  def test_with_top(self):
+    model = keras.applications.DenseNet121(weights=None)
+    self.assertEqual(model.output_shape, (None, 1000))
+
+  def test_no_top(self):
+    model = keras.applications.DenseNet121(weights=None, include_top=False)
+    self.assertEqual(model.output_shape, (None, None, None, 1024))
+
+  def test_with_pooling(self):
+    model = keras.applications.DenseNet121(weights=None,
+                                           include_top=False,
+                                           pooling='avg')
+    self.assertEqual(model.output_shape, (None, 1024))
+
+  def test_weight_loading(self):
+    with self.assertRaises(ValueError):
+      keras.applications.DenseNet121(weights='unknown',
+                                     include_top=False)
+    with self.assertRaises(ValueError):
+      keras.applications.DenseNet121(weights='imagenet',
+                                     classes=2000)
+
+
+class DenseNet169Test(test.TestCase):
+
+  def test_with_top(self):
+    model = keras.applications.DenseNet169(weights=None)
+    self.assertEqual(model.output_shape, (None, 1000))
+
+  def test_no_top(self):
+    model = keras.applications.DenseNet169(weights=None, include_top=False)
+    self.assertEqual(model.output_shape, (None, None, None, 1664))
+
+  def test_with_pooling(self):
+    model = keras.applications.DenseNet169(weights=None,
+                                           include_top=False,
+                                           pooling='max')
+    self.assertEqual(model.output_shape, (None, 1664))
+
+  def test_weight_loading(self):
+    with self.assertRaises(ValueError):
+      keras.applications.DenseNet169(weights='unknown',
+                                     include_top=False)
+    with self.assertRaises(ValueError):
+      keras.applications.DenseNet169(weights='imagenet',
+                                     classes=2000)
+
+
+class DenseNet201(test.TestCase):
+
+  def test_with_top(self):
+    model = keras.applications.DenseNet201(weights=None)
+    self.assertEqual(model.output_shape, (None, 1000))
+
+  def test_no_top(self):
+    model = keras.applications.DenseNet201(weights=None, include_top=False)
+    self.assertEqual(model.output_shape, (None, None, None, 1920))
+
+  def test_with_pooling(self):
+    model = keras.applications.DenseNet201(weights=None,
+                                           include_top=False,
+                                           pooling='avg')
+    self.assertEqual(model.output_shape, (None, 1920))
+
+  def test_weight_loading(self):
+    with self.assertRaises(ValueError):
+      keras.applications.DenseNet201(weights='unknown',
+                                     include_top=False)
+    with self.assertRaises(ValueError):
+      keras.applications.DenseNet201(weights='imagenet',
+                                     classes=2000)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
index 58841e5db06229727ea088388a901633216aa6fe..d9cb726137409f899bc75e3c19f6bffeb3ca4e34 100644
--- a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
+++ b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utilities used by models pre-trained on ImageNet.
+"""Utilities for ImageNet data preprocessing & prediction decoding.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -20,71 +20,198 @@ from __future__ import print_function
 
 import json
 
+import numpy as np
+
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 CLASS_INDEX = None
 CLASS_INDEX_PATH = 'https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json'
 
+# Global tensor of imagenet mean for preprocessing symbolic inputs
+_IMAGENET_MEAN = None
 
-def preprocess_input(x, data_format=None, mode='caffe'):
+
+def _preprocess_numpy_input(x, data_format, mode):
+  """Preprocesses a Numpy array encoding a batch of images.
+
+  Arguments:
+      x: Input array, 3D or 4D.
+      data_format: Data format of the image array.
+      mode: One of "caffe", "tf" or "torch".
+          - caffe: will convert the images from RGB to BGR,
+              then will zero-center each color channel with
+              respect to the ImageNet dataset,
+              without scaling.
+          - tf: will scale pixels between -1 and 1,
+              sample-wise.
+          - torch: will scale pixels between 0 and 1 and then
+              will normalize each channel with respect to the
+              ImageNet dataset.
+
+  Returns:
+      Preprocessed Numpy array.
+  """
+  if mode == 'tf':
+    x /= 127.5
+    x -= 1.
+    return x
+
+  if mode == 'torch':
+    x /= 255.
+    mean = [0.485, 0.456, 0.406]
+    std = [0.229, 0.224, 0.225]
+  else:
+    if data_format == 'channels_first':
+      # 'RGB'->'BGR'
+      if x.ndim == 3:
+        x = x[::-1, ...]
+      else:
+        x = x[:, ::-1, ...]
+    else:
+      # 'RGB'->'BGR'
+      x = x[..., ::-1]
+    mean = [103.939, 116.779, 123.68]
+    std = None
+
+  # Zero-center by mean pixel
+  if data_format == 'channels_first':
+    if x.ndim == 3:
+      x[0, :, :] -= mean[0]
+      x[1, :, :] -= mean[1]
+      x[2, :, :] -= mean[2]
+      if std is not None:
+        x[0, :, :] /= std[0]
+        x[1, :, :] /= std[1]
+        x[2, :, :] /= std[2]
+    else:
+      x[:, 0, :, :] -= mean[0]
+      x[:, 1, :, :] -= mean[1]
+      x[:, 2, :, :] -= mean[2]
+      if std is not None:
+        x[:, 0, :, :] /= std[0]
+        x[:, 1, :, :] /= std[1]
+        x[:, 2, :, :] /= std[2]
+  else:
+    x[..., 0] -= mean[0]
+    x[..., 1] -= mean[1]
+    x[..., 2] -= mean[2]
+    if std is not None:
+      x[..., 0] /= std[0]
+      x[..., 1] /= std[1]
+      x[..., 2] /= std[2]
+  return x
+
+
+def _preprocess_symbolic_input(x, data_format, mode):
   """Preprocesses a tensor encoding a batch of images.
 
   Arguments:
-      x: input Numpy tensor, 4D.
-      data_format: data format of the image tensor.
-      mode: One of "caffe", "tf".
+      x: Input tensor, 3D or 4D.
+      data_format: Data format of the image tensor.
+      mode: One of "caffe", "tf" or "torch".
           - caffe: will convert the images from RGB to BGR,
               then will zero-center each color channel with
               respect to the ImageNet dataset,
               without scaling.
           - tf: will scale pixels between -1 and 1,
               sample-wise.
+          - torch: will scale pixels between 0 and 1 and then
+              will normalize each channel with respect to the
+              ImageNet dataset.
 
   Returns:
       Preprocessed tensor.
   """
-  if data_format is None:
-    data_format = K.image_data_format()
-  assert data_format in {'channels_last', 'channels_first'}
+  global _IMAGENET_MEAN
 
   if mode == 'tf':
-    x /= 255.
-    x -= 0.5
-    x *= 2.
+    x /= 127.5
+    x -= 1.
     return x
 
-  if data_format == 'channels_first':
-    if x.ndim == 3:
+  if mode == 'torch':
+    x /= 255.
+    mean = [0.485, 0.456, 0.406]
+    std = [0.229, 0.224, 0.225]
+  else:
+    if data_format == 'channels_first':
       # 'RGB'->'BGR'
-      x = x[::-1, ...]
-      # Zero-center by mean pixel
-      x[0, :, :] -= 103.939
-      x[1, :, :] -= 116.779
-      x[2, :, :] -= 123.68
+      if K.ndim(x) == 3:
+        x = x[::-1, ...]
+      else:
+        x = x[:, ::-1, ...]
     else:
-      x = x[:, ::-1, ...]
-      x[:, 0, :, :] -= 103.939
-      x[:, 1, :, :] -= 116.779
-      x[:, 2, :, :] -= 123.68
+      # 'RGB'->'BGR'
+      x = x[..., ::-1]
+    mean = [103.939, 116.779, 123.68]
+    std = None
+
+  if _IMAGENET_MEAN is None:
+    _IMAGENET_MEAN = K.constant(-np.array(mean))
+
+  # Zero-center by mean pixel
+  if K.dtype(x) != K.dtype(_IMAGENET_MEAN):
+    x = K.bias_add(x, K.cast(_IMAGENET_MEAN, K.dtype(x)), data_format)
   else:
-    # 'RGB'->'BGR'
-    x = x[..., ::-1]
-    # Zero-center by mean pixel
-    x[..., 0] -= 103.939
-    x[..., 1] -= 116.779
-    x[..., 2] -= 123.68
+    x = K.bias_add(x, _IMAGENET_MEAN, data_format)
+  if std is not None:
+    x /= std
   return x
 
 
+@tf_export('keras.applications.resnet50.preprocess_input',
+           'keras.applications.vgg19.preprocess_input',
+           'keras.applications.vgg16.preprocess_input')
+def preprocess_input(x, data_format=None, mode='caffe'):
+  """Preprocesses a tensor or Numpy array encoding a batch of images.
+
+  Arguments:
+      x: Input Numpy or symbolic tensor, 3D or 4D.
+      data_format: Data format of the image tensor/array.
+      mode: One of "caffe", "tf".
+          - caffe: will convert the images from RGB to BGR,
+              then will zero-center each color channel with
+              respect to the ImageNet dataset,
+              without scaling.
+          - tf: will scale pixels between -1 and 1,
+              sample-wise.
+
+  Returns:
+      Preprocessed tensor or Numpy array.
+
+  Raises:
+      ValueError: In case of unknown `data_format` argument.
+  """
+  if data_format is None:
+    data_format = K.image_data_format()
+  if data_format not in {'channels_first', 'channels_last'}:
+    raise ValueError('Unknown data_format ' + str(data_format))
+
+  if isinstance(x, np.ndarray):
+    return _preprocess_numpy_input(x, data_format=data_format, mode=mode)
+  else:
+    return _preprocess_symbolic_input(x, data_format=data_format, mode=mode)
+
+
+@tf_export('keras.applications.nasnet.decode_predictions',
+           'keras.applications.resnet50.decode_predictions',
+           'keras.applications.vgg19.decode_predictions',
+           'keras.applications.vgg16.decode_predictions',
+           'keras.applications.inception_resnet_v2.decode_predictions',
+           'keras.applications.inception_v3.decode_predictions',
+           'keras.applications.densenet.decode_predictions',
+           'keras.applications.mobilenet.decode_predictions',
+           'keras.applications.xception.decode_predictions')
 def decode_predictions(preds, top=5):
   """Decodes the prediction of an ImageNet model.
 
   Arguments:
       preds: Numpy tensor encoding a batch of predictions.
-      top: integer, how many top-guesses to return.
+      top: Integer, how many top-guesses to return.
 
   Returns:
       A list of lists of top class prediction tuples
@@ -92,7 +219,7 @@ def decode_predictions(preds, top=5):
       One list of tuples per sample in batch input.
 
   Raises:
-      ValueError: in case of invalid shape of the `pred` array
+      ValueError: In case of invalid shape of the `pred` array
           (must be 2D).
   """
   global CLASS_INDEX
@@ -102,10 +229,11 @@ def decode_predictions(preds, top=5):
                      '(i.e. a 2D array of shape (samples, 1000)). '
                      'Found array with shape: ' + str(preds.shape))
   if CLASS_INDEX is None:
-    fpath = get_file('imagenet_class_index.json',
-                     CLASS_INDEX_PATH,
-                     cache_subdir='models',
-                     file_hash='c2c37ea517e94d9795004a39431a14cb')
+    fpath = get_file(
+        'imagenet_class_index.json',
+        CLASS_INDEX_PATH,
+        cache_subdir='models',
+        file_hash='c2c37ea517e94d9795004a39431a14cb')
     CLASS_INDEX = json.load(open(fpath))
   results = []
   for pred in preds:
@@ -122,17 +250,17 @@ def _obtain_input_shape(input_shape,
                         data_format,
                         require_flatten,
                         weights=None):
-  """Internal utility to compute/validate an ImageNet model's input shape.
+  """Internal utility to compute/validate a model's input shape.
 
   Arguments:
-      input_shape: either None (will return the default network input shape),
+      input_shape: Either None (will return the default network input shape),
           or a user-provided shape to be validated.
-      default_size: default input width/height for the model.
-      min_size: minimum input width/height accepted by the model.
-      data_format: image data format to use.
-      require_flatten: whether the model is expected to
+      default_size: Default input width/height for the model.
+      min_size: Minimum input width/height accepted by the model.
+      data_format: Image data format to use.
+      require_flatten: Whether the model is expected to
           be linked to a classifier via a Flatten layer.
-      weights: one of `None` (random initialization)
+      weights: One of `None` (random initialization)
           or 'imagenet' (pre-training on ImageNet).
           If weights='imagenet' input channels must be equal to 3.
 
@@ -140,7 +268,7 @@ def _obtain_input_shape(input_shape,
       An integer shape tuple (may include None entries).
 
   Raises:
-      ValueError: in case of invalid argument values.
+      ValueError: In case of invalid argument values.
   """
   if weights != 'imagenet' and input_shape and len(input_shape) == 3:
     if data_format == 'channels_first':
@@ -177,8 +305,8 @@ def _obtain_input_shape(input_shape,
                            '`input_shape=' + str(input_shape) + '`')
         if ((input_shape[1] is not None and input_shape[1] < min_size) or
             (input_shape[2] is not None and input_shape[2] < min_size)):
-          raise ValueError('Input size must be at least ' + str(min_size) + 'x'
-                           + str(min_size) + '; got '
+          raise ValueError('Input size must be at least ' + str(min_size) +
+                           'x' + str(min_size) + '; got '
                            '`input_shape=' + str(input_shape) + '`')
     else:
       if input_shape is not None:
@@ -189,8 +317,8 @@ def _obtain_input_shape(input_shape,
                            '`input_shape=' + str(input_shape) + '`')
         if ((input_shape[0] is not None and input_shape[0] < min_size) or
             (input_shape[1] is not None and input_shape[1] < min_size)):
-          raise ValueError('Input size must be at least ' + str(min_size) + 'x'
-                           + str(min_size) + '; got '
+          raise ValueError('Input size must be at least ' + str(min_size) +
+                           'x' + str(min_size) + '; got '
                            '`input_shape=' + str(input_shape) + '`')
   else:
     if require_flatten:
diff --git a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils_test.py b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils_test.py
index 517ba91219fc0ec0b61ccd673b420021a0db483d..d843dace59f1c88744217fbaee605d2ac859ec55 100644
--- a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils_test.py
+++ b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.keras._impl import keras
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input
 from tensorflow.python.platform import test
 
 
@@ -29,22 +30,62 @@ class ImageNetUtilsTest(test.TestCase):
   def test_preprocess_input(self):
     # Test batch of images
     x = np.random.uniform(0, 255, (2, 10, 10, 3))
-    self.assertEqual(
-        keras.applications.imagenet_utils.preprocess_input(x).shape, x.shape)
-    out1 = keras.applications.imagenet_utils.preprocess_input(
-        x, 'channels_last')
-    out2 = keras.applications.imagenet_utils.preprocess_input(
-        np.transpose(x, (0, 3, 1, 2)), 'channels_first')
+    self.assertEqual(preprocess_input(x).shape, x.shape)
+    out1 = preprocess_input(x, 'channels_last')
+    out2 = preprocess_input(np.transpose(x, (0, 3, 1, 2)), 'channels_first')
     self.assertAllClose(out1, out2.transpose(0, 2, 3, 1))
 
     # Test single image
     x = np.random.uniform(0, 255, (10, 10, 3))
-    self.assertEqual(
-        keras.applications.imagenet_utils.preprocess_input(x).shape, x.shape)
-    out1 = keras.applications.imagenet_utils.preprocess_input(
-        x, 'channels_last')
-    out2 = keras.applications.imagenet_utils.preprocess_input(
-        np.transpose(x, (2, 0, 1)), 'channels_first')
+    self.assertEqual(preprocess_input(x).shape, x.shape)
+    out1 = preprocess_input(x, 'channels_last')
+    out2 = preprocess_input(np.transpose(x, (2, 0, 1)), 'channels_first')
+    self.assertAllClose(out1, out2.transpose(1, 2, 0))
+
+  def test_preprocess_input_symbolic(self):
+    # Test image batch
+    x = np.random.uniform(0, 255, (2, 10, 10, 3))
+    inputs = keras.layers.Input(shape=x.shape[1:])
+    outputs = keras.layers.Lambda(
+        preprocess_input, output_shape=x.shape[1:])(inputs)
+    model = keras.models.Model(inputs, outputs)
+    assert model.predict(x).shape == x.shape
+    # pylint: disable=g-long-lambda
+    outputs1 = keras.layers.Lambda(lambda x:
+                                   preprocess_input(x, 'channels_last'),
+                                   output_shape=x.shape[1:])(inputs)
+    model1 = keras.models.Model(inputs, outputs1)
+    out1 = model1.predict(x)
+    x2 = np.transpose(x, (0, 3, 1, 2))
+    inputs2 = keras.layers.Input(shape=x2.shape[1:])
+    # pylint: disable=g-long-lambda
+    outputs2 = keras.layers.Lambda(lambda x:
+                                   preprocess_input(x, 'channels_first'),
+                                   output_shape=x2.shape[1:])(inputs2)
+    model2 = keras.models.Model(inputs2, outputs2)
+    out2 = model2.predict(x2)
+    self.assertAllClose(out1, out2.transpose(0, 2, 3, 1))
+
+    # Test single image
+    x = np.random.uniform(0, 255, (10, 10, 3))
+    inputs = keras.layers.Input(shape=x.shape)
+    outputs = keras.layers.Lambda(preprocess_input,
+                                  output_shape=x.shape)(inputs)
+    model = keras.models.Model(inputs, outputs)
+    assert model.predict(x[np.newaxis])[0].shape == x.shape
+    # pylint: disable=g-long-lambda
+    outputs1 = keras.layers.Lambda(lambda x:
+                                   preprocess_input(x, 'channels_last'),
+                                   output_shape=x.shape)(inputs)
+    model1 = keras.models.Model(inputs, outputs1)
+    out1 = model1.predict(x[np.newaxis])[0]
+    x2 = np.transpose(x, (2, 0, 1))
+    inputs2 = keras.layers.Input(shape=x2.shape)
+    outputs2 = keras.layers.Lambda(lambda x:
+                                   preprocess_input(x, 'channels_first'),
+                                   output_shape=x2.shape)(inputs2)  # pylint: disable=g-long-lambda
+    model2 = keras.models.Model(inputs2, outputs2)
+    out2 = model2.predict(x2[np.newaxis])[0]
     self.assertAllClose(out1, out2.transpose(1, 2, 0))
 
   def test_obtain_input_shape(self):
diff --git a/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py b/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
index de29b92575e48410614d3b32520d99436891344a..bf3901fc54419c2b401bf9c4d6311b39a18f1aba 100644
--- a/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
+++ b/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=invalid-name
+# pylint: disable=unused-import
 """Inception-ResNet V2 model for Keras.
 
 # Reference
@@ -23,10 +25,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.applications import imagenet_utils
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
@@ -41,10 +45,14 @@ from tensorflow.python.keras._impl.keras.layers import Lambda
 from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
+
 
 BASE_WEIGHT_URL = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.7/'
 
 
+@tf_export('keras.applications.inception_resnet_v2.preprocess_input')
 def preprocess_input(x):
   """Preprocesses a numpy array encoding a batch of images.
 
@@ -114,7 +122,8 @@ def inception_resnet_block(x, scale, block_type, block_idx, activation='relu'):
       scale: scaling factor to scale the residuals (i.e., the output of
           passing `x` through an inception module) before adding them
           to the shortcut branch. Let `r` be the output from the residual
-          branch, the output of this block will be `x + scale * r`.
+            branch,
+          the output of this block will be `x + scale * r`.
       block_type: `'block35'`, `'block17'` or `'block8'`, determines
           the network structure in the residual branch.
       block_idx: an `int` used for generating layer names. The Inception-ResNet
@@ -126,8 +135,7 @@ def inception_resnet_block(x, scale, block_type, block_idx, activation='relu'):
           will have `block_type='block35', block_idx=0`, ane the layer names
             will have
           a common prefix `'block35_0'`.
-      activation: activation function to use at the end of the block
-          (see [activations](../activations.md)).
+      activation: activation function to use at the end of the block.
           When `activation=None`, no activation is applied
           (i.e., "linear" activation: `a(x) = x`).
 
@@ -176,6 +184,7 @@ def inception_resnet_block(x, scale, block_type, block_idx, activation='relu'):
 
   x = Lambda(
       lambda inputs, scale: inputs[0] + inputs[1] * scale,
+      output_shape=K.int_shape(x)[1:],
       arguments={'scale': scale},
       name=block_name)([x, up])
   if activation is not None:
@@ -183,7 +192,9 @@ def inception_resnet_block(x, scale, block_type, block_idx, activation='relu'):
   return x
 
 
-def InceptionResNetV2(include_top=True,  # pylint: disable=invalid-name
+@tf_export('keras.applications.InceptionResNetV2',
+           'keras.applications.inception_resnet_v2.InceptionResNetV2')
+def InceptionResNetV2(include_top=True,
                       weights='imagenet',
                       input_tensor=None,
                       input_shape=None,
@@ -208,8 +219,9 @@ def InceptionResNetV2(include_top=True,  # pylint: disable=invalid-name
   Arguments:
       include_top: whether to include the fully-connected
           layer at the top of the network.
-      weights: one of `None` (random initialization)
-          or `'imagenet'` (pre-training on ImageNet).
+      weights: one of `None` (random initialization),
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
@@ -239,10 +251,11 @@ def InceptionResNetV2(include_top=True,  # pylint: disable=invalid-name
       ValueError: in case of invalid argument for `weights`,
           or invalid input shape.
   """
-  if weights not in {'imagenet', None}:
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization) or `imagenet` '
-                     '(pre-training on ImageNet).')
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
 
   if weights == 'imagenet' and include_top and classes != 1000:
     raise ValueError('If using `weights` as imagenet with `include_top`'
@@ -365,5 +378,7 @@ def InceptionResNetV2(include_top=True,  # pylint: disable=invalid-name
           cache_subdir='models',
           file_hash='d19885ff4a710c122648d3b5c3b684e4')
     model.load_weights(weights_path)
+  elif weights is not None:
+    model.load_weights(weights)
 
   return model
diff --git a/tensorflow/python/keras/_impl/keras/applications/inception_v3.py b/tensorflow/python/keras/_impl/keras/applications/inception_v3.py
index d4fea4fbb0223d079149224e2d3d89487834ca40..e268e97bc663773a218f01b958b08f8e43c74ee2 100644
--- a/tensorflow/python/keras/_impl/keras/applications/inception_v3.py
+++ b/tensorflow/python/keras/_impl/keras/applications/inception_v3.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
+# pylint: disable=unused-import
 """Inception V3 model for Keras.
 
 Note that the input image format for this model is different than for
@@ -29,11 +30,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import layers
 from tensorflow.python.keras._impl.keras.applications import imagenet_utils
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
@@ -46,6 +49,8 @@ from tensorflow.python.keras._impl.keras.layers import Input
 from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.5/inception_v3_weights_tf_dim_ordering_tf_kernels.h5'
@@ -90,12 +95,15 @@ def conv2d_bn(x,
       strides=strides,
       padding=padding,
       use_bias=False,
-      name=conv_name)(x)
+      name=conv_name)(
+          x)
   x = BatchNormalization(axis=bn_axis, scale=False, name=bn_name)(x)
   x = Activation('relu', name=name)(x)
   return x
 
 
+@tf_export('keras.applications.InceptionV3',
+           'keras.applications.inception_v3.InceptionV3')
 def InceptionV3(include_top=True,
                 weights='imagenet',
                 input_tensor=None,
@@ -107,7 +115,7 @@ def InceptionV3(include_top=True,
   Optionally loads weights pre-trained
   on ImageNet. Note that when using TensorFlow,
   for best performance you should set
-  `image_data_format="channels_last"` in your Keras config
+  `image_data_format='channels_last'` in your Keras config
   at ~/.keras/keras.json.
   The model and the weights are compatible with both
   TensorFlow and Theano. The data format
@@ -118,15 +126,16 @@ def InceptionV3(include_top=True,
   Arguments:
       include_top: whether to include the fully-connected
           layer at the top of the network.
-      weights: one of `None` (random initialization)
-          or "imagenet" (pre-training on ImageNet).
+      weights: one of `None` (random initialization),
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
           if `include_top` is False (otherwise the input shape
           has to be `(299, 299, 3)` (with `channels_last` data format)
           or `(3, 299, 299)` (with `channels_first` data format).
-          It should have exactly 3 input channels,
+          It should have exactly 3 inputs channels,
           and width and height should be no smaller than 139.
           E.g. `(150, 150, 3)` would be one valid value.
       pooling: Optional pooling mode for feature extraction
@@ -151,10 +160,11 @@ def InceptionV3(include_top=True,
       ValueError: in case of invalid argument for `weights`,
           or invalid input shape.
   """
-  if weights not in {'imagenet', None}:
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization) or `imagenet` '
-                     '(pre-training on ImageNet).')
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
 
   if weights == 'imagenet' and include_top and classes != 1000:
     raise ValueError('If using `weights` as imagenet with `include_top`'
@@ -172,7 +182,10 @@ def InceptionV3(include_top=True,
   if input_tensor is None:
     img_input = Input(shape=input_shape)
   else:
-    img_input = Input(tensor=input_tensor, shape=input_shape)
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
 
   if K.image_data_format() == 'channels_first':
     channel_axis = 1
@@ -383,9 +396,14 @@ def InceptionV3(include_top=True,
           cache_subdir='models',
           file_hash='bcbd6486424b2319ff4ef7d526e38f63')
     model.load_weights(weights_path)
+  elif weights is not None:
+    model.load_weights(weights)
+
   return model
 
 
+@tf_export('keras.applications.nasnet.preprocess_input',
+           'keras.applications.inception_v3.preprocess_input')
 def preprocess_input(x):
   """Preprocesses a numpy array encoding a batch of images.
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
index 653bd8c09f2d7a4ac2f6cb5e6c792b2285b378cc..027ae26113a42782fbbee27d993b85cb3aebbf23 100644
--- a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
+++ b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=invalid-name
+# pylint: disable=unused-import
 """MobileNet v1 models for Keras.
 
 MobileNet is a general architecture and can be used for multiple use cases.
@@ -56,7 +58,7 @@ the 100 % MobileNet on various input sizes:
 ------------------------------------------------------------------------
 
 The weights for all 16 models are obtained and translated
-from Tensorflow checkpoints found at
+from TensorFlow checkpoints found at
 https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md
 
 # Reference
@@ -67,7 +69,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import warnings
+import os
 
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
@@ -75,9 +77,10 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.applications import imagenet_utils
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import BatchNormalization
 from tensorflow.python.keras._impl.keras.layers import Conv2D
@@ -89,6 +92,9 @@ from tensorflow.python.keras._impl.keras.layers import Reshape
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils import conv_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
+
 
 BASE_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.6/'
 
@@ -97,6 +103,7 @@ def relu6(x):
   return K.relu(x, max_value=6)
 
 
+@tf_export('keras.applications.mobilenet.preprocess_input')
 def preprocess_input(x):
   """Preprocesses a numpy array encoding a batch of images.
 
@@ -129,7 +136,7 @@ class DepthwiseConv2D(Conv2D):
           all spatial dimensions.
           Specifying any stride value != 1 is incompatible with specifying
           any `dilation_rate` value != 1.
-      padding: one of `"valid"` or `"same"` (case-insensitive).
+      padding: one of `'valid'` or `'same'` (case-insensitive).
       depth_multiplier: The number of depthwise convolution output channels
           for each input channel.
           The total number of depthwise convolution output
@@ -143,29 +150,21 @@ class DepthwiseConv2D(Conv2D):
           `(batch, channels, height, width)`.
           It defaults to the `image_data_format` value found in your
           Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
-      activation: Activation function to use
-          (see [activations](../activations.md)).
+          If you never set it, then it will be 'channels_last'.
+      activation: Activation function to use.
           If you don't specify anything, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
+          (ie. 'linear' activation: `a(x) = x`).
       use_bias: Boolean, whether the layer uses a bias vector.
-      depthwise_initializer: Initializer for the depthwise kernel matrix
-          (see [initializers](../initializers.md)).
-      bias_initializer: Initializer for the bias vector
-          (see [initializers](../initializers.md)).
+      depthwise_initializer: Initializer for the depthwise kernel matrix.
+      bias_initializer: Initializer for the bias vector.
       depthwise_regularizer: Regularizer function applied to
-          the depthwise kernel matrix
-          (see [regularizer](../regularizers.md)).
-      bias_regularizer: Regularizer function applied to the bias vector
-          (see [regularizer](../regularizers.md)).
+          the depthwise kernel matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
       activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation").
-          (see [regularizer](../regularizers.md)).
+          the output of the layer (its 'activation')..
       depthwise_constraint: Constraint function applied to
-          the depthwise kernel matrix
-          (see [constraints](../constraints.md)).
-      bias_constraint: Constraint function applied to the bias vector
-          (see [constraints](../constraints.md)).
+          the depthwise kernel matrix.
+      bias_constraint: Constraint function applied to the bias vector.
 
   Input shape:
       4D tensor with shape:
@@ -215,6 +214,7 @@ class DepthwiseConv2D(Conv2D):
     self.depthwise_constraint = constraints.get(depthwise_constraint)
     self.bias_initializer = initializers.get(bias_initializer)
 
+  @shape_type_conversion
   def build(self, input_shape):
     if len(input_shape) < 4:
       raise ValueError('Inputs to `DepthwiseConv2D` should have rank 4. '
@@ -268,6 +268,7 @@ class DepthwiseConv2D(Conv2D):
 
     return outputs
 
+  @shape_type_conversion
   def compute_output_shape(self, input_shape):
     if self.data_format == 'channels_first':
       rows = input_shape[2]
@@ -304,7 +305,9 @@ class DepthwiseConv2D(Conv2D):
     return config
 
 
-def MobileNet(input_shape=None,  # pylint: disable=invalid-name
+@tf_export('keras.applications.MobileNet',
+           'keras.applications.mobilenet.MobileNet')
+def MobileNet(input_shape=None,
               alpha=1.0,
               depth_multiplier=1,
               dropout=1e-3,
@@ -333,7 +336,7 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
           if `include_top` is False (otherwise the input shape
           has to be `(224, 224, 3)` (with `channels_last` data format)
           or (3, 224, 224) (with `channels_first` data format).
-          It should have exactly 3 input channels,
+          It should have exactly 3 inputs channels,
           and width and height should be no smaller than 32.
           E.g. `(200, 200, 3)` would be one valid value.
       alpha: controls the width of the network.
@@ -348,8 +351,9 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
       dropout: dropout rate
       include_top: whether to include the fully-connected
           layer at the top of the network.
-      weights: `None` (random initialization) or
-          `imagenet` (ImageNet weights)
+      weights: one of `None` (random initialization),
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of
           `layers.Input()`)
           to use as image input for the model.
@@ -384,16 +388,17 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
                        'as other backends do not support '
                        'depthwise convolution.')
 
-  if weights not in {'imagenet', None}:
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization) or `imagenet` '
-                     '(pre-training on ImageNet).')
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
 
   if weights == 'imagenet' and include_top and classes != 1000:
     raise ValueError('If using `weights` as ImageNet with `include_top` '
                      'as true, `classes` should be 1000')
 
-  # Determine proper input shape.
+  # Determine proper input shape and default size.
   if input_shape is None:
     default_size = 224
   else:
@@ -403,10 +408,12 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
     else:
       rows = input_shape[0]
       cols = input_shape[1]
+
     if rows == cols and rows in [128, 160, 192, 224]:
       default_size = rows
     else:
       default_size = 224
+
   input_shape = _obtain_input_shape(
       input_shape,
       default_size=default_size,
@@ -414,6 +421,7 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
       data_format=K.image_data_format(),
       require_flatten=include_top,
       weights=weights)
+
   if K.image_data_format() == 'channels_last':
     row_axis, col_axis = (0, 1)
   else:
@@ -438,15 +446,15 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
                        ' Input shape provided = %s' % (input_shape,))
 
   if K.image_data_format() != 'channels_last':
-    warnings.warn('The MobileNet family of models is only available '
-                  'for the input data format "channels_last" '
-                  '(width, height, channels). '
-                  'However your settings specify the default '
-                  'data format "channels_first" (channels, width, height).'
-                  ' You should set `image_data_format="channels_last"` '
-                  'in your Keras config located at ~/.keras/keras.json. '
-                  'The model being returned right now will expect inputs '
-                  'to follow the "channels_last" data format.')
+    logging.warning('The MobileNet family of models is only available '
+                    'for the input data format "channels_last" '
+                    '(width, height, channels). '
+                    'However your settings specify the default '
+                    'data format "channels_first" (channels, width, height).'
+                    ' You should set `image_data_format="channels_last"` '
+                    'in your Keras config located at ~/.keras/keras.json. '
+                    'The model being returned right now will expect inputs '
+                    'to follow the "channels_last" data format.')
     K.set_image_data_format('channels_last')
     old_data_format = 'channels_first'
   else:
@@ -534,6 +542,8 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
       weigh_path = BASE_WEIGHT_PATH + model_name
       weights_path = get_file(model_name, weigh_path, cache_subdir='models')
     model.load_weights(weights_path)
+  elif weights is not None:
+    model.load_weights(weights)
 
   if old_data_format:
     K.set_image_data_format(old_data_format)
@@ -594,7 +604,8 @@ def _conv_block(inputs, filters, alpha, kernel=(3, 3), strides=(1, 1)):
       padding='same',
       use_bias=False,
       strides=strides,
-      name='conv1')(inputs)
+      name='conv1')(
+          inputs)
   x = BatchNormalization(axis=channel_axis, name='conv1_bn')(x)
   return Activation(relu6, name='conv1_relu')(x)
 
@@ -661,7 +672,8 @@ def _depthwise_conv_block(inputs,
       depth_multiplier=depth_multiplier,
       strides=strides,
       use_bias=False,
-      name='conv_dw_%d' % block_id)(inputs)
+      name='conv_dw_%d' % block_id)(
+          inputs)
   x = BatchNormalization(axis=channel_axis, name='conv_dw_%d_bn' % block_id)(x)
   x = Activation(relu6, name='conv_dw_%d_relu' % block_id)(x)
 
@@ -670,6 +682,7 @@ def _depthwise_conv_block(inputs,
       padding='same',
       use_bias=False,
       strides=(1, 1),
-      name='conv_pw_%d' % block_id)(x)
+      name='conv_pw_%d' % block_id)(
+          x)
   x = BatchNormalization(axis=channel_axis, name='conv_pw_%d_bn' % block_id)(x)
   return Activation(relu6, name='conv_pw_%d_relu' % block_id)(x)
diff --git a/tensorflow/python/keras/_impl/keras/applications/nasnet.py b/tensorflow/python/keras/_impl/keras/applications/nasnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..08dae57f006c64021cbca26404770cd89b1ce176
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/applications/nasnet.py
@@ -0,0 +1,788 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=line-too-long
+# pylint: disable=invalid-name
+# pylint: disable=unused-import
+"""NASNet-A models for Keras.
+
+NASNet refers to Neural Architecture Search Network, a family of models
+that were designed automatically by learning the model architectures
+directly on the dataset of interest.
+
+Here we consider NASNet-A, the highest performance model that was found
+for the CIFAR-10 dataset, and then extended to ImageNet 2012 dataset,
+obtaining state of the art performance on CIFAR-10 and ImageNet 2012.
+Only the NASNet-A models, and their respective weights, which are suited
+for ImageNet 2012 are provided.
+
+The below table describes the performance on ImageNet 2012:
+--------------------------------------------------------------------------------
+      Architecture       | Top-1 Acc | Top-5 Acc |  Multiply-Adds |  Params (M)
+--------------------------------------------------------------------------------
+|   NASNet-A (4 @ 1056)  |   74.0 %  |   91.6 %  |       564 M    |     5.3    |
+|   NASNet-A (6 @ 4032)  |   82.7 %  |   96.2 %  |      23.8 B    |    88.9    |
+--------------------------------------------------------------------------------
+
+References:
+ - [Learning Transferable Architectures for Scalable Image Recognition]
+    (https://arxiv.org/abs/1707.07012)
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras._impl.keras.applications.inception_v3 import preprocess_input
+from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
+from tensorflow.python.keras._impl.keras.layers import Activation
+from tensorflow.python.keras._impl.keras.layers import add
+from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
+from tensorflow.python.keras._impl.keras.layers import BatchNormalization
+from tensorflow.python.keras._impl.keras.layers import concatenate
+from tensorflow.python.keras._impl.keras.layers import Conv2D
+from tensorflow.python.keras._impl.keras.layers import Cropping2D
+from tensorflow.python.keras._impl.keras.layers import Dense
+from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
+from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
+from tensorflow.python.keras._impl.keras.layers import Input
+from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
+from tensorflow.python.keras._impl.keras.layers import SeparableConv2D
+from tensorflow.python.keras._impl.keras.layers import ZeroPadding2D
+from tensorflow.python.keras._impl.keras.models import Model
+from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
+
+
+NASNET_MOBILE_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/NASNet-mobile.h5'
+NASNET_MOBILE_WEIGHT_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/NASNet-mobile-no-top.h5'
+NASNET_LARGE_WEIGHT_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/NASNet-large.h5'
+NASNET_LARGE_WEIGHT_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/NASNet-large-no-top.h5'
+
+
+def NASNet(input_shape=None,
+           penultimate_filters=4032,
+           num_blocks=6,
+           stem_block_filters=96,
+           skip_reduction=True,
+           filter_multiplier=2,
+           include_top=True,
+           weights=None,
+           input_tensor=None,
+           pooling=None,
+           classes=1000,
+           default_size=None):
+  """Instantiates a NASNet model.
+
+  Note that only TensorFlow is supported for now,
+  therefore it only works with the data format
+  `image_data_format='channels_last'` in your Keras config
+  at `~/.keras/keras.json`.
+
+  Arguments:
+      input_shape: Optional shape tuple, only to be specified
+          if `include_top` is False (otherwise the input shape
+          has to be `(331, 331, 3)` for NASNetLarge or
+          `(224, 224, 3)` for NASNetMobile
+          It should have exactly 3 inputs channels,
+          and width and height should be no smaller than 32.
+          E.g. `(224, 224, 3)` would be one valid value.
+      penultimate_filters: Number of filters in the penultimate layer.
+          NASNet models use the notation `NASNet (N @ P)`, where:
+              -   N is the number of blocks
+              -   P is the number of penultimate filters
+      num_blocks: Number of repeated blocks of the NASNet model.
+          NASNet models use the notation `NASNet (N @ P)`, where:
+              -   N is the number of blocks
+              -   P is the number of penultimate filters
+      stem_block_filters: Number of filters in the initial stem block
+      skip_reduction: Whether to skip the reduction step at the tail
+          end of the network. Set to `False` for CIFAR models.
+      filter_multiplier: Controls the width of the network.
+          - If `filter_multiplier` < 1.0, proportionally decreases the number
+              of filters in each layer.
+          - If `filter_multiplier` > 1.0, proportionally increases the number
+              of filters in each layer.
+          - If `filter_multiplier` = 1, default number of filters from the
+               paper are used at each layer.
+      include_top: Whether to include the fully-connected
+          layer at the top of the network.
+      weights: `None` (random initialization) or
+          `imagenet` (ImageNet weights)
+      input_tensor: Optional Keras tensor (i.e. output of
+          `layers.Input()`)
+          to use as image input for the model.
+      pooling: Optional pooling mode for feature extraction
+          when `include_top` is `False`.
+          - `None` means that the output of the model
+              will be the 4D tensor output of the
+              last convolutional layer.
+          - `avg` means that global average pooling
+              will be applied to the output of the
+              last convolutional layer, and thus
+              the output of the model will be a
+              2D tensor.
+          - `max` means that global max pooling will
+              be applied.
+      classes: Optional number of classes to classify images
+          into, only to be specified if `include_top` is True, and
+          if no `weights` argument is specified.
+      default_size: Specifies the default image size of the model
+
+  Returns:
+      A Keras model instance.
+
+  Raises:
+      ValueError: In case of invalid argument for `weights`,
+          invalid input shape or invalid `penultimate_filters` value.
+      RuntimeError: If attempting to run this model with a
+          backend that does not support separable convolutions.
+  """
+  if K.backend() != 'tensorflow':
+    raise RuntimeError('Only Tensorflow backend is currently supported, '
+                       'as other backends do not support '
+                       'separable convolution.')
+
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
+    raise ValueError('The `weights` argument should be either '
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
+
+  if weights == 'imagenet' and include_top and classes != 1000:
+    raise ValueError('If using `weights` as ImageNet with `include_top` '
+                     'as true, `classes` should be 1000')
+
+  if default_size is None:
+    default_size = 331
+
+  # Determine proper input shape and default size.
+  input_shape = _obtain_input_shape(
+      input_shape,
+      default_size=default_size,
+      min_size=32,
+      data_format=K.image_data_format(),
+      require_flatten=include_top or weights,
+      weights=weights)
+
+  if K.image_data_format() != 'channels_last':
+    logging.warning('The NASNet family of models is only available '
+                    'for the input data format "channels_last" '
+                    '(width, height, channels). '
+                    'However your settings specify the default '
+                    'data format "channels_first" (channels, width, height).'
+                    ' You should set `image_data_format="channels_last"` '
+                    'in your Keras config located at ~/.keras/keras.json. '
+                    'The model being returned right now will expect inputs '
+                    'to follow the "channels_last" data format.')
+    K.set_image_data_format('channels_last')
+    old_data_format = 'channels_first'
+  else:
+    old_data_format = None
+
+  if input_tensor is None:
+    img_input = Input(shape=input_shape)
+  else:
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
+
+  if penultimate_filters % 24 != 0:
+    raise ValueError(
+        'For NASNet-A models, the value of `penultimate_filters` '
+        'needs to be divisible by 24. Current value: %d' % penultimate_filters)
+
+  channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+  filters = penultimate_filters // 24
+
+  if not skip_reduction:
+    x = Conv2D(
+        stem_block_filters, (3, 3),
+        strides=(2, 2),
+        padding='valid',
+        use_bias=False,
+        name='stem_conv1',
+        kernel_initializer='he_normal')(
+            img_input)
+  else:
+    x = Conv2D(
+        stem_block_filters, (3, 3),
+        strides=(1, 1),
+        padding='same',
+        use_bias=False,
+        name='stem_conv1',
+        kernel_initializer='he_normal')(
+            img_input)
+
+  x = BatchNormalization(
+      axis=channel_dim, momentum=0.9997, epsilon=1e-3, name='stem_bn1')(
+          x)
+
+  p = None
+  if not skip_reduction:  # imagenet / mobile mode
+    x, p = _reduction_a_cell(
+        x, p, filters // (filter_multiplier**2), block_id='stem_1')
+    x, p = _reduction_a_cell(
+        x, p, filters // filter_multiplier, block_id='stem_2')
+
+  for i in range(num_blocks):
+    x, p = _normal_a_cell(x, p, filters, block_id='%d' % (i))
+
+  x, p0 = _reduction_a_cell(
+      x, p, filters * filter_multiplier, block_id='reduce_%d' % (num_blocks))
+
+  p = p0 if not skip_reduction else p
+
+  for i in range(num_blocks):
+    x, p = _normal_a_cell(
+        x, p, filters * filter_multiplier, block_id='%d' % (num_blocks + i + 1))
+
+  x, p0 = _reduction_a_cell(
+      x,
+      p,
+      filters * filter_multiplier**2,
+      block_id='reduce_%d' % (2 * num_blocks))
+
+  p = p0 if not skip_reduction else p
+
+  for i in range(num_blocks):
+    x, p = _normal_a_cell(
+        x,
+        p,
+        filters * filter_multiplier**2,
+        block_id='%d' % (2 * num_blocks + i + 1))
+
+  x = Activation('relu')(x)
+
+  if include_top:
+    x = GlobalAveragePooling2D()(x)
+    x = Dense(classes, activation='softmax', name='predictions')(x)
+  else:
+    if pooling == 'avg':
+      x = GlobalAveragePooling2D()(x)
+    elif pooling == 'max':
+      x = GlobalMaxPooling2D()(x)
+
+  # Ensure that the model takes into account
+  # any potential predecessors of `input_tensor`.
+  if input_tensor is not None:
+    inputs = get_source_inputs(input_tensor)
+  else:
+    inputs = img_input
+
+  model = Model(inputs, x, name='NASNet')
+
+  # load weights
+  if weights == 'imagenet':
+    if default_size == 224:  # mobile version
+      if include_top:
+        weight_path = NASNET_MOBILE_WEIGHT_PATH
+        model_name = 'nasnet_mobile.h5'
+      else:
+        weight_path = NASNET_MOBILE_WEIGHT_PATH_NO_TOP
+        model_name = 'nasnet_mobile_no_top.h5'
+
+      weights_file = get_file(model_name, weight_path, cache_subdir='models')
+      model.load_weights(weights_file)
+
+    elif default_size == 331:  # large version
+      if include_top:
+        weight_path = NASNET_LARGE_WEIGHT_PATH
+        model_name = 'nasnet_large.h5'
+      else:
+        weight_path = NASNET_LARGE_WEIGHT_PATH_NO_TOP
+        model_name = 'nasnet_large_no_top.h5'
+
+      weights_file = get_file(model_name, weight_path, cache_subdir='models')
+      model.load_weights(weights_file)
+    else:
+      raise ValueError('ImageNet weights can only be loaded with NASNetLarge'
+                       ' or NASNetMobile')
+  elif weights is not None:
+    model.load_weights(weights)
+
+  if old_data_format:
+    K.set_image_data_format(old_data_format)
+
+  return model
+
+
+@tf_export('keras.applications.NASNetLarge',
+           'keras.applications.nasnet.NASNetLarge')
+def NASNetLarge(input_shape=None,
+                include_top=True,
+                weights='imagenet',
+                input_tensor=None,
+                pooling=None,
+                classes=1000):
+  """Instantiates a NASNet model in ImageNet mode.
+
+  Note that only TensorFlow is supported for now,
+  therefore it only works with the data format
+  `image_data_format='channels_last'` in your Keras config
+  at `~/.keras/keras.json`.
+
+  Arguments:
+      input_shape: Optional shape tuple, only to be specified
+          if `include_top` is False (otherwise the input shape
+          has to be `(331, 331, 3)` for NASNetLarge.
+          It should have exactly 3 inputs channels,
+          and width and height should be no smaller than 32.
+          E.g. `(224, 224, 3)` would be one valid value.
+      include_top: Whether to include the fully-connected
+          layer at the top of the network.
+      weights: `None` (random initialization) or
+          `imagenet` (ImageNet weights)
+      input_tensor: Optional Keras tensor (i.e. output of
+          `layers.Input()`)
+          to use as image input for the model.
+      pooling: Optional pooling mode for feature extraction
+          when `include_top` is `False`.
+          - `None` means that the output of the model
+              will be the 4D tensor output of the
+              last convolutional layer.
+          - `avg` means that global average pooling
+              will be applied to the output of the
+              last convolutional layer, and thus
+              the output of the model will be a
+              2D tensor.
+          - `max` means that global max pooling will
+              be applied.
+      classes: Optional number of classes to classify images
+          into, only to be specified if `include_top` is True, and
+          if no `weights` argument is specified.
+
+  Returns:
+      A Keras model instance.
+
+  Raises:
+      ValueError: in case of invalid argument for `weights`,
+          or invalid input shape.
+      RuntimeError: If attempting to run this model with a
+          backend that does not support separable convolutions.
+  """
+  return NASNet(
+      input_shape,
+      penultimate_filters=4032,
+      num_blocks=6,
+      stem_block_filters=96,
+      skip_reduction=False,
+      filter_multiplier=2,
+      include_top=include_top,
+      weights=weights,
+      input_tensor=input_tensor,
+      pooling=pooling,
+      classes=classes,
+      default_size=331)
+
+
+@tf_export('keras.applications.NASNetMobile',
+           'keras.applications.nasnet.NASNetMobile')
+def NASNetMobile(input_shape=None,
+                 include_top=True,
+                 weights='imagenet',
+                 input_tensor=None,
+                 pooling=None,
+                 classes=1000):
+  """Instantiates a Mobile NASNet model in ImageNet mode.
+
+  Note that only TensorFlow is supported for now,
+  therefore it only works with the data format
+  `image_data_format='channels_last'` in your Keras config
+  at `~/.keras/keras.json`.
+
+  Arguments:
+      input_shape: Optional shape tuple, only to be specified
+          if `include_top` is False (otherwise the input shape
+          has to be `(224, 224, 3)` for NASNetMobile
+          It should have exactly 3 inputs channels,
+          and width and height should be no smaller than 32.
+          E.g. `(224, 224, 3)` would be one valid value.
+      include_top: Whether to include the fully-connected
+          layer at the top of the network.
+      weights: `None` (random initialization) or
+          `imagenet` (ImageNet weights)
+      input_tensor: Optional Keras tensor (i.e. output of
+          `layers.Input()`)
+          to use as image input for the model.
+      pooling: Optional pooling mode for feature extraction
+          when `include_top` is `False`.
+          - `None` means that the output of the model
+              will be the 4D tensor output of the
+              last convolutional layer.
+          - `avg` means that global average pooling
+              will be applied to the output of the
+              last convolutional layer, and thus
+              the output of the model will be a
+              2D tensor.
+          - `max` means that global max pooling will
+              be applied.
+      classes: Optional number of classes to classify images
+          into, only to be specified if `include_top` is True, and
+          if no `weights` argument is specified.
+
+  Returns:
+      A Keras model instance.
+
+  Raises:
+      ValueError: In case of invalid argument for `weights`,
+          or invalid input shape.
+      RuntimeError: If attempting to run this model with a
+          backend that does not support separable convolutions.
+  """
+  return NASNet(
+      input_shape,
+      penultimate_filters=1056,
+      num_blocks=4,
+      stem_block_filters=32,
+      skip_reduction=False,
+      filter_multiplier=2,
+      include_top=include_top,
+      weights=weights,
+      input_tensor=input_tensor,
+      pooling=pooling,
+      classes=classes,
+      default_size=224)
+
+
+def _separable_conv_block(ip,
+                          filters,
+                          kernel_size=(3, 3),
+                          strides=(1, 1),
+                          block_id=None):
+  """Adds 2 blocks of [relu-separable conv-batchnorm].
+
+  Arguments:
+      ip: Input tensor
+      filters: Number of output filters per layer
+      kernel_size: Kernel size of separable convolutions
+      strides: Strided convolution for downsampling
+      block_id: String block_id
+
+  Returns:
+      A Keras tensor
+  """
+  channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+
+  with K.name_scope('separable_conv_block_%s' % block_id):
+    x = Activation('relu')(ip)
+    x = SeparableConv2D(
+        filters,
+        kernel_size,
+        strides=strides,
+        name='separable_conv_1_%s' % block_id,
+        padding='same',
+        use_bias=False,
+        kernel_initializer='he_normal')(
+            x)
+    x = BatchNormalization(
+        axis=channel_dim,
+        momentum=0.9997,
+        epsilon=1e-3,
+        name='separable_conv_1_bn_%s' % (block_id))(
+            x)
+    x = Activation('relu')(x)
+    x = SeparableConv2D(
+        filters,
+        kernel_size,
+        name='separable_conv_2_%s' % block_id,
+        padding='same',
+        use_bias=False,
+        kernel_initializer='he_normal')(
+            x)
+    x = BatchNormalization(
+        axis=channel_dim,
+        momentum=0.9997,
+        epsilon=1e-3,
+        name='separable_conv_2_bn_%s' % (block_id))(
+            x)
+  return x
+
+
+def _adjust_block(p, ip, filters, block_id=None):
+  """Adjusts the input `previous path` to match the shape of the `input`.
+
+  Used in situations where the output number of filters needs to be changed.
+
+  Arguments:
+      p: Input tensor which needs to be modified
+      ip: Input tensor whose shape needs to be matched
+      filters: Number of output filters to be matched
+      block_id: String block_id
+
+  Returns:
+      Adjusted Keras tensor
+  """
+  channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+  img_dim = 2 if K.image_data_format() == 'channels_first' else -2
+
+  ip_shape = K.int_shape(ip)
+
+  if p is not None:
+    p_shape = K.int_shape(p)
+
+  with K.name_scope('adjust_block'):
+    if p is None:
+      p = ip
+
+    elif p_shape[img_dim] != ip_shape[img_dim]:
+      with K.name_scope('adjust_reduction_block_%s' % block_id):
+        p = Activation('relu', name='adjust_relu_1_%s' % block_id)(p)
+
+        p1 = AveragePooling2D(
+            (1, 1),
+            strides=(2, 2),
+            padding='valid',
+            name='adjust_avg_pool_1_%s' % block_id)(
+                p)
+        p1 = Conv2D(
+            filters // 2, (1, 1),
+            padding='same',
+            use_bias=False,
+            name='adjust_conv_1_%s' % block_id,
+            kernel_initializer='he_normal')(
+                p1)
+
+        p2 = ZeroPadding2D(padding=((0, 1), (0, 1)))(p)
+        p2 = Cropping2D(cropping=((1, 0), (1, 0)))(p2)
+        p2 = AveragePooling2D(
+            (1, 1),
+            strides=(2, 2),
+            padding='valid',
+            name='adjust_avg_pool_2_%s' % block_id)(
+                p2)
+        p2 = Conv2D(
+            filters // 2, (1, 1),
+            padding='same',
+            use_bias=False,
+            name='adjust_conv_2_%s' % block_id,
+            kernel_initializer='he_normal')(
+                p2)
+
+        p = concatenate([p1, p2], axis=channel_dim)
+        p = BatchNormalization(
+            axis=channel_dim,
+            momentum=0.9997,
+            epsilon=1e-3,
+            name='adjust_bn_%s' % block_id)(
+                p)
+
+    elif p_shape[channel_dim] != filters:
+      with K.name_scope('adjust_projection_block_%s' % block_id):
+        p = Activation('relu')(p)
+        p = Conv2D(
+            filters, (1, 1),
+            strides=(1, 1),
+            padding='same',
+            name='adjust_conv_projection_%s' % block_id,
+            use_bias=False,
+            kernel_initializer='he_normal')(
+                p)
+        p = BatchNormalization(
+            axis=channel_dim,
+            momentum=0.9997,
+            epsilon=1e-3,
+            name='adjust_bn_%s' % block_id)(
+                p)
+  return p
+
+
+def _normal_a_cell(ip, p, filters, block_id=None):
+  """Adds a Normal cell for NASNet-A (Fig. 4 in the paper).
+
+  Arguments:
+      ip: Input tensor `x`
+      p: Input tensor `p`
+      filters: Number of output filters
+      block_id: String block_id
+
+  Returns:
+      A Keras tensor
+  """
+  channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+
+  with K.name_scope('normal_A_block_%s' % block_id):
+    p = _adjust_block(p, ip, filters, block_id)
+
+    h = Activation('relu')(ip)
+    h = Conv2D(
+        filters, (1, 1),
+        strides=(1, 1),
+        padding='same',
+        name='normal_conv_1_%s' % block_id,
+        use_bias=False,
+        kernel_initializer='he_normal')(
+            h)
+    h = BatchNormalization(
+        axis=channel_dim,
+        momentum=0.9997,
+        epsilon=1e-3,
+        name='normal_bn_1_%s' % block_id)(
+            h)
+
+    with K.name_scope('block_1'):
+      x1_1 = _separable_conv_block(
+          h, filters, kernel_size=(5, 5), block_id='normal_left1_%s' % block_id)
+      x1_2 = _separable_conv_block(
+          p, filters, block_id='normal_right1_%s' % block_id)
+      x1 = add([x1_1, x1_2], name='normal_add_1_%s' % block_id)
+
+    with K.name_scope('block_2'):
+      x2_1 = _separable_conv_block(
+          p, filters, (5, 5), block_id='normal_left2_%s' % block_id)
+      x2_2 = _separable_conv_block(
+          p, filters, (3, 3), block_id='normal_right2_%s' % block_id)
+      x2 = add([x2_1, x2_2], name='normal_add_2_%s' % block_id)
+
+    with K.name_scope('block_3'):
+      x3 = AveragePooling2D(
+          (3, 3),
+          strides=(1, 1),
+          padding='same',
+          name='normal_left3_%s' % (block_id))(
+              h)
+      x3 = add([x3, p], name='normal_add_3_%s' % block_id)
+
+    with K.name_scope('block_4'):
+      x4_1 = AveragePooling2D(
+          (3, 3),
+          strides=(1, 1),
+          padding='same',
+          name='normal_left4_%s' % (block_id))(
+              p)
+      x4_2 = AveragePooling2D(
+          (3, 3),
+          strides=(1, 1),
+          padding='same',
+          name='normal_right4_%s' % (block_id))(
+              p)
+      x4 = add([x4_1, x4_2], name='normal_add_4_%s' % block_id)
+
+    with K.name_scope('block_5'):
+      x5 = _separable_conv_block(
+          h, filters, block_id='normal_left5_%s' % block_id)
+      x5 = add([x5, h], name='normal_add_5_%s' % block_id)
+
+    x = concatenate(
+        [p, x1, x2, x3, x4, x5],
+        axis=channel_dim,
+        name='normal_concat_%s' % block_id)
+  return x, ip
+
+
+def _reduction_a_cell(ip, p, filters, block_id=None):
+  """Adds a Reduction cell for NASNet-A (Fig. 4 in the paper).
+
+  Arguments:
+      ip: Input tensor `x`
+      p: Input tensor `p`
+      filters: Number of output filters
+      block_id: String block_id
+
+  Returns:
+      A Keras tensor
+  """
+  channel_dim = 1 if K.image_data_format() == 'channels_first' else -1
+
+  with K.name_scope('reduction_A_block_%s' % block_id):
+    p = _adjust_block(p, ip, filters, block_id)
+
+    h = Activation('relu')(ip)
+    h = Conv2D(
+        filters, (1, 1),
+        strides=(1, 1),
+        padding='same',
+        name='reduction_conv_1_%s' % block_id,
+        use_bias=False,
+        kernel_initializer='he_normal')(
+            h)
+    h = BatchNormalization(
+        axis=channel_dim,
+        momentum=0.9997,
+        epsilon=1e-3,
+        name='reduction_bn_1_%s' % block_id)(
+            h)
+
+    with K.name_scope('block_1'):
+      x1_1 = _separable_conv_block(
+          h,
+          filters, (5, 5),
+          strides=(2, 2),
+          block_id='reduction_left1_%s' % block_id)
+      x1_2 = _separable_conv_block(
+          p,
+          filters, (7, 7),
+          strides=(2, 2),
+          block_id='reduction_1_%s' % block_id)
+      x1 = add([x1_1, x1_2], name='reduction_add_1_%s' % block_id)
+
+    with K.name_scope('block_2'):
+      x2_1 = MaxPooling2D(
+          (3, 3),
+          strides=(2, 2),
+          padding='same',
+          name='reduction_left2_%s' % block_id)(
+              h)
+      x2_2 = _separable_conv_block(
+          p,
+          filters, (7, 7),
+          strides=(2, 2),
+          block_id='reduction_right2_%s' % block_id)
+      x2 = add([x2_1, x2_2], name='reduction_add_2_%s' % block_id)
+
+    with K.name_scope('block_3'):
+      x3_1 = AveragePooling2D(
+          (3, 3),
+          strides=(2, 2),
+          padding='same',
+          name='reduction_left3_%s' % block_id)(
+              h)
+      x3_2 = _separable_conv_block(
+          p,
+          filters, (5, 5),
+          strides=(2, 2),
+          block_id='reduction_right3_%s' % block_id)
+      x3 = add([x3_1, x3_2], name='reduction_add3_%s' % block_id)
+
+    with K.name_scope('block_4'):
+      x4 = AveragePooling2D(
+          (3, 3),
+          strides=(1, 1),
+          padding='same',
+          name='reduction_left4_%s' % block_id)(
+              x1)
+      x4 = add([x2, x4])
+
+    with K.name_scope('block_5'):
+      x5_1 = _separable_conv_block(
+          x1, filters, (3, 3), block_id='reduction_left4_%s' % block_id)
+      x5_2 = MaxPooling2D(
+          (3, 3),
+          strides=(2, 2),
+          padding='same',
+          name='reduction_right5_%s' % block_id)(
+              h)
+      x5 = add([x5_1, x5_2], name='reduction_add4_%s' % block_id)
+
+    x = concatenate(
+        [x2, x3, x4, x5],
+        axis=channel_dim,
+        name='reduction_concat_%s' % block_id)
+    return x, ip
diff --git a/tensorflow/python/keras/_impl/keras/applications/nasnet_test.py b/tensorflow/python/keras/_impl/keras/applications/nasnet_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa1dec670cb995e47bdcf88bd69594c532781b18
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/applications/nasnet_test.py
@@ -0,0 +1,76 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Nasnet application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras._impl import keras
+from tensorflow.python.platform import test
+
+
+class NASNetMobileTest(test.TestCase):
+
+  def test_with_top(self):
+    model = keras.applications.NASNetMobile(weights=None)
+    self.assertEqual(model.output_shape, (None, 1000))
+
+  def test_no_top(self):
+    model = keras.applications.NASNetMobile(weights=None, include_top=False)
+    self.assertEqual(model.output_shape, (None, None, None, 1056))
+
+  def test_with_pooling(self):
+    model = keras.applications.NASNetMobile(weights=None,
+                                            include_top=False,
+                                            pooling='avg')
+    self.assertEqual(model.output_shape, (None, 1056))
+
+  def test_weight_loading(self):
+    with self.assertRaises(ValueError):
+      keras.applications.NASNetMobile(weights='unknown',
+                                      include_top=False)
+    with self.assertRaises(ValueError):
+      keras.applications.NASNetMobile(weights='imagenet',
+                                      classes=2000)
+
+
+class NASNetLargeTest(test.TestCase):
+
+  def test_with_top(self):
+    model = keras.applications.NASNetLarge(weights=None)
+    self.assertEqual(model.output_shape, (None, 1000))
+
+  def test_no_top(self):
+    model = keras.applications.NASNetLarge(weights=None, include_top=False)
+    self.assertEqual(model.output_shape, (None, None, None, 4032))
+
+  def test_with_pooling(self):
+    model = keras.applications.NASNetLarge(weights=None,
+                                           include_top=False,
+                                           pooling='avg')
+    self.assertEqual(model.output_shape, (None, 4032))
+
+  def test_weight_loading(self):
+    with self.assertRaises(ValueError):
+      keras.applications.NASNetLarge(weights='unknown',
+                                     include_top=False)
+    with self.assertRaises(ValueError):
+      keras.applications.NASNetLarge(weights='imagenet',
+                                     classes=2000)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/_impl/keras/applications/resnet50.py b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
index 717b626fdc3c65d510cf190e53b4b1c04a89ebfa..a47dd657bb9ea0627d82831b7ee5d0b33788b5b7 100644
--- a/tensorflow/python/keras/_impl/keras/applications/resnet50.py
+++ b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
+# pylint: disable=unused-import
 """ResNet50 model for Keras.
 
 # Reference:
@@ -26,11 +27,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import layers
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
@@ -43,7 +46,10 @@ from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
 from tensorflow.python.keras._impl.keras.layers import Input
 from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
 from tensorflow.python.keras._impl.keras.models import Model
+from tensorflow.python.keras._impl.keras.utils import layer_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels.h5'
@@ -76,7 +82,8 @@ def identity_block(input_tensor, kernel_size, filters, stage, block):
   x = Activation('relu')(x)
 
   x = Conv2D(
-      filters2, kernel_size, padding='same', name=conv_name_base + '2b')(x)
+      filters2, kernel_size, padding='same', name=conv_name_base + '2b')(
+          x)
   x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2b')(x)
   x = Activation('relu')(x)
 
@@ -90,7 +97,7 @@ def identity_block(input_tensor, kernel_size, filters, stage, block):
 
 def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2,
                                                                           2)):
-  """conv_block is the block that has a conv layer at shortcut.
+  """A block that has a conv layer at shortcut.
 
   Arguments:
       input_tensor: input tensor
@@ -98,14 +105,14 @@ def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2,
       filters: list of integers, the filters of 3 conv layer at main path
       stage: integer, current stage label, used for generating layer names
       block: 'a','b'..., current block label, used for generating layer names
-      strides: Tuple of integers.
+      strides: Strides for the first conv layer in the block.
 
   Returns:
       Output tensor for the block.
 
-  Note that from stage 3, the first conv layer at main path is with
-  strides=(2,2)
-  And the shortcut should have strides=(2,2) as well
+  Note that from stage 3,
+  the first conv layer at main path is with strides=(2, 2)
+  And the shortcut should have strides=(2, 2) as well
   """
   filters1, filters2, filters3 = filters
   if K.image_data_format() == 'channels_last':
@@ -116,13 +123,14 @@ def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2,
   bn_name_base = 'bn' + str(stage) + block + '_branch'
 
   x = Conv2D(
-      filters1, (1, 1), strides=strides,
-      name=conv_name_base + '2a')(input_tensor)
+      filters1, (1, 1), strides=strides, name=conv_name_base + '2a')(
+          input_tensor)
   x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2a')(x)
   x = Activation('relu')(x)
 
   x = Conv2D(
-      filters2, kernel_size, padding='same', name=conv_name_base + '2b')(x)
+      filters2, kernel_size, padding='same', name=conv_name_base + '2b')(
+          x)
   x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2b')(x)
   x = Activation('relu')(x)
 
@@ -130,8 +138,8 @@ def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2,
   x = BatchNormalization(axis=bn_axis, name=bn_name_base + '2c')(x)
 
   shortcut = Conv2D(
-      filters3, (1, 1), strides=strides,
-      name=conv_name_base + '1')(input_tensor)
+      filters3, (1, 1), strides=strides, name=conv_name_base + '1')(
+          input_tensor)
   shortcut = BatchNormalization(axis=bn_axis, name=bn_name_base + '1')(shortcut)
 
   x = layers.add([x, shortcut])
@@ -139,6 +147,8 @@ def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2,
   return x
 
 
+@tf_export('keras.applications.ResNet50',
+           'keras.applications.resnet50.ResNet50')
 def ResNet50(include_top=True,
              weights='imagenet',
              input_tensor=None,
@@ -150,7 +160,7 @@ def ResNet50(include_top=True,
   Optionally loads weights pre-trained
   on ImageNet. Note that when using TensorFlow,
   for best performance you should set
-  `image_data_format="channels_last"` in your Keras config
+  `image_data_format='channels_last'` in your Keras config
   at ~/.keras/keras.json.
 
   The model and the weights are compatible with both
@@ -161,15 +171,16 @@ def ResNet50(include_top=True,
   Arguments:
       include_top: whether to include the fully-connected
           layer at the top of the network.
-      weights: one of `None` (random initialization)
-          or "imagenet" (pre-training on ImageNet).
+      weights: one of `None` (random initialization),
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
           if `include_top` is False (otherwise the input shape
           has to be `(224, 224, 3)` (with `channels_last` data format)
           or `(3, 224, 224)` (with `channels_first` data format).
-          It should have exactly 3 input channels,
+          It should have exactly 3 inputs channels,
           and width and height should be no smaller than 197.
           E.g. `(200, 200, 3)` would be one valid value.
       pooling: Optional pooling mode for feature extraction
@@ -194,10 +205,11 @@ def ResNet50(include_top=True,
       ValueError: in case of invalid argument for `weights`,
           or invalid input shape.
   """
-  if weights not in {'imagenet', None}:
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization) or `imagenet` '
-                     '(pre-training on ImageNet).')
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
 
   if weights == 'imagenet' and include_top and classes != 1000:
     raise ValueError('If using `weights` as imagenet with `include_top`'
@@ -215,15 +227,18 @@ def ResNet50(include_top=True,
   if input_tensor is None:
     img_input = Input(shape=input_shape)
   else:
-    img_input = Input(tensor=input_tensor, shape=input_shape)
-
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
   if K.image_data_format() == 'channels_last':
     bn_axis = 3
   else:
     bn_axis = 1
 
-  x = Conv2D(64, (7, 7),
-             strides=(2, 2), padding='same', name='conv1')(img_input)
+  x = Conv2D(
+      64, (7, 7), strides=(2, 2), padding='same', name='conv1')(
+          img_input)
   x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x)
   x = Activation('relu')(x)
   x = MaxPooling2D((3, 3), strides=(2, 2))(x)
@@ -283,4 +298,7 @@ def ResNet50(include_top=True,
           cache_subdir='models',
           md5_hash='a268eb855778b3df3c7506639542a6af')
     model.load_weights(weights_path)
+  elif weights is not None:
+    model.load_weights(weights)
+
   return model
diff --git a/tensorflow/python/keras/_impl/keras/applications/vgg16.py b/tensorflow/python/keras/_impl/keras/applications/vgg16.py
index a0862e6407747cd0ad3d698c63da77b17c272e1b..9da74253abc2124844ab89b7727ddda4f754d8e2 100644
--- a/tensorflow/python/keras/_impl/keras/applications/vgg16.py
+++ b/tensorflow/python/keras/_impl/keras/applications/vgg16.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
+# pylint: disable=unused-import
 """VGG16 model for Keras.
 
 # Reference
@@ -25,10 +26,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Conv2D
 from tensorflow.python.keras._impl.keras.layers import Dense
@@ -40,12 +43,15 @@ from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils import layer_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels.h5'
 WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5'
 
 
+@tf_export('keras.applications.VGG16', 'keras.applications.vgg16.VGG16')
 def VGG16(include_top=True,
           weights='imagenet',
           input_tensor=None,
@@ -57,7 +63,7 @@ def VGG16(include_top=True,
   Optionally loads weights pre-trained
   on ImageNet. Note that when using TensorFlow,
   for best performance you should set
-  `image_data_format="channels_last"` in your Keras config
+  `image_data_format='channels_last'` in your Keras config
   at ~/.keras/keras.json.
 
   The model and the weights are compatible with both
@@ -68,8 +74,9 @@ def VGG16(include_top=True,
   Arguments:
       include_top: whether to include the 3 fully-connected
           layers at the top of the network.
-      weights: one of `None` (random initialization)
-          or "imagenet" (pre-training on ImageNet).
+      weights: one of `None` (random initialization),
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
@@ -101,10 +108,11 @@ def VGG16(include_top=True,
       ValueError: in case of invalid argument for `weights`,
           or invalid input shape.
   """
-  if weights not in {'imagenet', None}:
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization) or `imagenet` '
-                     '(pre-training on ImageNet).')
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
 
   if weights == 'imagenet' and include_top and classes != 1000:
     raise ValueError('If using `weights` as imagenet with `include_top`'
@@ -121,48 +129,62 @@ def VGG16(include_top=True,
   if input_tensor is None:
     img_input = Input(shape=input_shape)
   else:
-    img_input = Input(tensor=input_tensor, shape=input_shape)
-
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
   # Block 1
   x = Conv2D(
-      64, (3, 3), activation='relu', padding='same',
-      name='block1_conv1')(img_input)
+      64, (3, 3), activation='relu', padding='same', name='block1_conv1')(
+          img_input)
   x = Conv2D(
-      64, (3, 3), activation='relu', padding='same', name='block1_conv2')(x)
+      64, (3, 3), activation='relu', padding='same', name='block1_conv2')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool')(x)
 
   # Block 2
   x = Conv2D(
-      128, (3, 3), activation='relu', padding='same', name='block2_conv1')(x)
+      128, (3, 3), activation='relu', padding='same', name='block2_conv1')(
+          x)
   x = Conv2D(
-      128, (3, 3), activation='relu', padding='same', name='block2_conv2')(x)
+      128, (3, 3), activation='relu', padding='same', name='block2_conv2')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool')(x)
 
   # Block 3
   x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv1')(x)
+      256, (3, 3), activation='relu', padding='same', name='block3_conv1')(
+          x)
   x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv2')(x)
+      256, (3, 3), activation='relu', padding='same', name='block3_conv2')(
+          x)
   x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv3')(x)
+      256, (3, 3), activation='relu', padding='same', name='block3_conv3')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool')(x)
 
   # Block 4
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv1')(x)
+      512, (3, 3), activation='relu', padding='same', name='block4_conv1')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv2')(x)
+      512, (3, 3), activation='relu', padding='same', name='block4_conv2')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv3')(x)
+      512, (3, 3), activation='relu', padding='same', name='block4_conv3')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool')(x)
 
   # Block 5
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv1')(x)
+      512, (3, 3), activation='relu', padding='same', name='block5_conv1')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv2')(x)
+      512, (3, 3), activation='relu', padding='same', name='block5_conv2')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv3')(x)
+      512, (3, 3), activation='relu', padding='same', name='block5_conv3')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block5_pool')(x)
 
   if include_top:
@@ -211,4 +233,8 @@ def VGG16(include_top=True,
         dense = model.get_layer(name='fc1')
         layer_utils.convert_dense_weights_data_format(dense, shape,
                                                       'channels_first')
+
+  elif weights is not None:
+    model.load_weights(weights)
+
   return model
diff --git a/tensorflow/python/keras/_impl/keras/applications/vgg19.py b/tensorflow/python/keras/_impl/keras/applications/vgg19.py
index cfa1c95336e8ab798e4d5bd67f9c7f89e4705ca7..961c1f991893dbc0df858e9f72b61202c9fee500 100644
--- a/tensorflow/python/keras/_impl/keras/applications/vgg19.py
+++ b/tensorflow/python/keras/_impl/keras/applications/vgg19.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
+# pylint: disable=unused-import
 """VGG19 model for Keras.
 
 # Reference
@@ -25,10 +26,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import preprocess_input
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Conv2D
 from tensorflow.python.keras._impl.keras.layers import Dense
@@ -40,12 +43,15 @@ from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils import layer_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg19_weights_tf_dim_ordering_tf_kernels.h5'
 WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5'
 
 
+@tf_export('keras.applications.VGG19', 'keras.applications.vgg19.VGG19')
 def VGG19(include_top=True,
           weights='imagenet',
           input_tensor=None,
@@ -57,7 +63,7 @@ def VGG19(include_top=True,
   Optionally loads weights pre-trained
   on ImageNet. Note that when using TensorFlow,
   for best performance you should set
-  `image_data_format="channels_last"` in your Keras config
+  `image_data_format='channels_last'` in your Keras config
   at ~/.keras/keras.json.
 
   The model and the weights are compatible with both
@@ -68,15 +74,16 @@ def VGG19(include_top=True,
   Arguments:
       include_top: whether to include the 3 fully-connected
           layers at the top of the network.
-      weights: one of `None` (random initialization)
-          or "imagenet" (pre-training on ImageNet).
+      weights: one of `None` (random initialization),
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
           if `include_top` is False (otherwise the input shape
           has to be `(224, 224, 3)` (with `channels_last` data format)
           or `(3, 224, 224)` (with `channels_first` data format).
-          It should have exactly 3 input channels,
+          It should have exactly 3 inputs channels,
           and width and height should be no smaller than 48.
           E.g. `(200, 200, 3)` would be one valid value.
       pooling: Optional pooling mode for feature extraction
@@ -101,10 +108,11 @@ def VGG19(include_top=True,
       ValueError: in case of invalid argument for `weights`,
           or invalid input shape.
   """
-  if weights not in {'imagenet', None}:
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization) or `imagenet` '
-                     '(pre-training on ImageNet).')
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
 
   if weights == 'imagenet' and include_top and classes != 1000:
     raise ValueError('If using `weights` as imagenet with `include_top`'
@@ -121,54 +129,71 @@ def VGG19(include_top=True,
   if input_tensor is None:
     img_input = Input(shape=input_shape)
   else:
-    img_input = Input(tensor=input_tensor, shape=input_shape)
-
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
   # Block 1
   x = Conv2D(
-      64, (3, 3), activation='relu', padding='same',
-      name='block1_conv1')(img_input)
+      64, (3, 3), activation='relu', padding='same', name='block1_conv1')(
+          img_input)
   x = Conv2D(
-      64, (3, 3), activation='relu', padding='same', name='block1_conv2')(x)
+      64, (3, 3), activation='relu', padding='same', name='block1_conv2')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool')(x)
 
   # Block 2
   x = Conv2D(
-      128, (3, 3), activation='relu', padding='same', name='block2_conv1')(x)
+      128, (3, 3), activation='relu', padding='same', name='block2_conv1')(
+          x)
   x = Conv2D(
-      128, (3, 3), activation='relu', padding='same', name='block2_conv2')(x)
+      128, (3, 3), activation='relu', padding='same', name='block2_conv2')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool')(x)
 
   # Block 3
   x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv1')(x)
+      256, (3, 3), activation='relu', padding='same', name='block3_conv1')(
+          x)
   x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv2')(x)
+      256, (3, 3), activation='relu', padding='same', name='block3_conv2')(
+          x)
   x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv3')(x)
+      256, (3, 3), activation='relu', padding='same', name='block3_conv3')(
+          x)
   x = Conv2D(
-      256, (3, 3), activation='relu', padding='same', name='block3_conv4')(x)
+      256, (3, 3), activation='relu', padding='same', name='block3_conv4')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool')(x)
 
   # Block 4
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv1')(x)
+      512, (3, 3), activation='relu', padding='same', name='block4_conv1')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv2')(x)
+      512, (3, 3), activation='relu', padding='same', name='block4_conv2')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv3')(x)
+      512, (3, 3), activation='relu', padding='same', name='block4_conv3')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block4_conv4')(x)
+      512, (3, 3), activation='relu', padding='same', name='block4_conv4')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool')(x)
 
   # Block 5
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv1')(x)
+      512, (3, 3), activation='relu', padding='same', name='block5_conv1')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv2')(x)
+      512, (3, 3), activation='relu', padding='same', name='block5_conv2')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv3')(x)
+      512, (3, 3), activation='relu', padding='same', name='block5_conv3')(
+          x)
   x = Conv2D(
-      512, (3, 3), activation='relu', padding='same', name='block5_conv4')(x)
+      512, (3, 3), activation='relu', padding='same', name='block5_conv4')(
+          x)
   x = MaxPooling2D((2, 2), strides=(2, 2), name='block5_pool')(x)
 
   if include_top:
@@ -217,4 +242,8 @@ def VGG19(include_top=True,
         dense = model.get_layer(name='fc1')
         layer_utils.convert_dense_weights_data_format(dense, shape,
                                                       'channels_first')
+
+  elif weights is not None:
+    model.load_weights(weights)
+
   return model
diff --git a/tensorflow/python/keras/_impl/keras/applications/xception.py b/tensorflow/python/keras/_impl/keras/applications/xception.py
index 14f6ad809015aae451f8ddc13fa64166b06995a6..7e7ca5a18a31622ac79d61ab01ce65341a4a46c5 100644
--- a/tensorflow/python/keras/_impl/keras/applications/xception.py
+++ b/tensorflow/python/keras/_impl/keras/applications/xception.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=invalid-name
+# pylint: disable=unused-import
 """Xception V1 model for Keras.
 
 On ImageNet, this model gets to a top-1 validation accuracy of 0.790
@@ -36,11 +37,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import layers
 from tensorflow.python.keras._impl.keras.applications import imagenet_utils
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
-from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import BatchNormalization
@@ -54,12 +57,15 @@ from tensorflow.python.keras._impl.keras.layers import SeparableConv2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 TF_WEIGHTS_PATH = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.4/xception_weights_tf_dim_ordering_tf_kernels.h5'
 TF_WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.4/xception_weights_tf_dim_ordering_tf_kernels_notop.h5'
 
 
+@tf_export('keras.applications.Xception',
+           'keras.applications.xception.Xception')
 def Xception(include_top=True,
              weights='imagenet',
              input_tensor=None,
@@ -72,7 +78,7 @@ def Xception(include_top=True,
   on ImageNet. This model is available for TensorFlow only,
   and can only be used with inputs following the TensorFlow
   data format `(width, height, channels)`.
-  You should set `image_data_format="channels_last"` in your Keras config
+  You should set `image_data_format='channels_last'` in your Keras config
   located at ~/.keras/keras.json.
 
   Note that the default input image size for this model is 299x299.
@@ -80,14 +86,15 @@ def Xception(include_top=True,
   Arguments:
       include_top: whether to include the fully-connected
           layer at the top of the network.
-      weights: one of `None` (random initialization)
-          or "imagenet" (pre-training on ImageNet).
+      weights: one of `None` (random initialization),
+            'imagenet' (pre-training on ImageNet),
+            or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
           if `include_top` is False (otherwise the input shape
           has to be `(299, 299, 3)`.
-          It should have exactly 3 input channels,
+          It should have exactly 3 inputs channels,
           and width and height should be no smaller than 71.
           E.g. `(150, 150, 3)` would be one valid value.
       pooling: Optional pooling mode for feature extraction
@@ -114,18 +121,16 @@ def Xception(include_top=True,
       RuntimeError: If attempting to run this model with a
           backend that does not support separable convolutions.
   """
-  if weights not in {'imagenet', None}:
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization) or `imagenet` '
-                     '(pre-training on ImageNet).')
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
 
   if weights == 'imagenet' and include_top and classes != 1000:
     raise ValueError('If using `weights` as imagenet with `include_top`'
                      ' as true, `classes` should be 1000')
 
-  if K.backend() != 'tensorflow':
-    raise RuntimeError('The Xception model is only available with '
-                       'the TensorFlow backend.')
   if K.image_data_format() != 'channels_last':
     logging.warning(
         'The Xception model is only available for the '
@@ -154,11 +159,14 @@ def Xception(include_top=True,
   if input_tensor is None:
     img_input = Input(shape=input_shape)
   else:
-    img_input = Input(tensor=input_tensor, shape=input_shape)
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
 
   x = Conv2D(
-      32, (3, 3), strides=(2, 2), use_bias=False,
-      name='block1_conv1')(img_input)
+      32, (3, 3), strides=(2, 2), use_bias=False, name='block1_conv1')(
+          img_input)
   x = BatchNormalization(name='block1_conv1_bn')(x)
   x = Activation('relu', name='block1_conv1_act')(x)
   x = Conv2D(64, (3, 3), use_bias=False, name='block1_conv2')(x)
@@ -166,53 +174,65 @@ def Xception(include_top=True,
   x = Activation('relu', name='block1_conv2_act')(x)
 
   residual = Conv2D(
-      128, (1, 1), strides=(2, 2), padding='same', use_bias=False)(x)
+      128, (1, 1), strides=(2, 2), padding='same', use_bias=False)(
+          x)
   residual = BatchNormalization()(residual)
 
   x = SeparableConv2D(
-      128, (3, 3), padding='same', use_bias=False, name='block2_sepconv1')(x)
+      128, (3, 3), padding='same', use_bias=False, name='block2_sepconv1')(
+          x)
   x = BatchNormalization(name='block2_sepconv1_bn')(x)
   x = Activation('relu', name='block2_sepconv2_act')(x)
   x = SeparableConv2D(
-      128, (3, 3), padding='same', use_bias=False, name='block2_sepconv2')(x)
+      128, (3, 3), padding='same', use_bias=False, name='block2_sepconv2')(
+          x)
   x = BatchNormalization(name='block2_sepconv2_bn')(x)
 
   x = MaxPooling2D(
-      (3, 3), strides=(2, 2), padding='same', name='block2_pool')(x)
+      (3, 3), strides=(2, 2), padding='same', name='block2_pool')(
+          x)
   x = layers.add([x, residual])
 
   residual = Conv2D(
-      256, (1, 1), strides=(2, 2), padding='same', use_bias=False)(x)
+      256, (1, 1), strides=(2, 2), padding='same', use_bias=False)(
+          x)
   residual = BatchNormalization()(residual)
 
   x = Activation('relu', name='block3_sepconv1_act')(x)
   x = SeparableConv2D(
-      256, (3, 3), padding='same', use_bias=False, name='block3_sepconv1')(x)
+      256, (3, 3), padding='same', use_bias=False, name='block3_sepconv1')(
+          x)
   x = BatchNormalization(name='block3_sepconv1_bn')(x)
   x = Activation('relu', name='block3_sepconv2_act')(x)
   x = SeparableConv2D(
-      256, (3, 3), padding='same', use_bias=False, name='block3_sepconv2')(x)
+      256, (3, 3), padding='same', use_bias=False, name='block3_sepconv2')(
+          x)
   x = BatchNormalization(name='block3_sepconv2_bn')(x)
 
   x = MaxPooling2D(
-      (3, 3), strides=(2, 2), padding='same', name='block3_pool')(x)
+      (3, 3), strides=(2, 2), padding='same', name='block3_pool')(
+          x)
   x = layers.add([x, residual])
 
   residual = Conv2D(
-      728, (1, 1), strides=(2, 2), padding='same', use_bias=False)(x)
+      728, (1, 1), strides=(2, 2), padding='same', use_bias=False)(
+          x)
   residual = BatchNormalization()(residual)
 
   x = Activation('relu', name='block4_sepconv1_act')(x)
   x = SeparableConv2D(
-      728, (3, 3), padding='same', use_bias=False, name='block4_sepconv1')(x)
+      728, (3, 3), padding='same', use_bias=False, name='block4_sepconv1')(
+          x)
   x = BatchNormalization(name='block4_sepconv1_bn')(x)
   x = Activation('relu', name='block4_sepconv2_act')(x)
   x = SeparableConv2D(
-      728, (3, 3), padding='same', use_bias=False, name='block4_sepconv2')(x)
+      728, (3, 3), padding='same', use_bias=False, name='block4_sepconv2')(
+          x)
   x = BatchNormalization(name='block4_sepconv2_bn')(x)
 
   x = MaxPooling2D(
-      (3, 3), strides=(2, 2), padding='same', name='block4_pool')(x)
+      (3, 3), strides=(2, 2), padding='same', name='block4_pool')(
+          x)
   x = layers.add([x, residual])
 
   for i in range(8):
@@ -221,46 +241,52 @@ def Xception(include_top=True,
 
     x = Activation('relu', name=prefix + '_sepconv1_act')(x)
     x = SeparableConv2D(
-        728, (3, 3), padding='same', use_bias=False,
-        name=prefix + '_sepconv1')(x)
+        728, (3, 3), padding='same', use_bias=False, name=prefix + '_sepconv1')(
+            x)
     x = BatchNormalization(name=prefix + '_sepconv1_bn')(x)
     x = Activation('relu', name=prefix + '_sepconv2_act')(x)
     x = SeparableConv2D(
-        728, (3, 3), padding='same', use_bias=False,
-        name=prefix + '_sepconv2')(x)
+        728, (3, 3), padding='same', use_bias=False, name=prefix + '_sepconv2')(
+            x)
     x = BatchNormalization(name=prefix + '_sepconv2_bn')(x)
     x = Activation('relu', name=prefix + '_sepconv3_act')(x)
     x = SeparableConv2D(
-        728, (3, 3), padding='same', use_bias=False,
-        name=prefix + '_sepconv3')(x)
+        728, (3, 3), padding='same', use_bias=False, name=prefix + '_sepconv3')(
+            x)
     x = BatchNormalization(name=prefix + '_sepconv3_bn')(x)
 
     x = layers.add([x, residual])
 
   residual = Conv2D(
-      1024, (1, 1), strides=(2, 2), padding='same', use_bias=False)(x)
+      1024, (1, 1), strides=(2, 2), padding='same', use_bias=False)(
+          x)
   residual = BatchNormalization()(residual)
 
   x = Activation('relu', name='block13_sepconv1_act')(x)
   x = SeparableConv2D(
-      728, (3, 3), padding='same', use_bias=False, name='block13_sepconv1')(x)
+      728, (3, 3), padding='same', use_bias=False, name='block13_sepconv1')(
+          x)
   x = BatchNormalization(name='block13_sepconv1_bn')(x)
   x = Activation('relu', name='block13_sepconv2_act')(x)
   x = SeparableConv2D(
-      1024, (3, 3), padding='same', use_bias=False, name='block13_sepconv2')(x)
+      1024, (3, 3), padding='same', use_bias=False, name='block13_sepconv2')(
+          x)
   x = BatchNormalization(name='block13_sepconv2_bn')(x)
 
   x = MaxPooling2D(
-      (3, 3), strides=(2, 2), padding='same', name='block13_pool')(x)
+      (3, 3), strides=(2, 2), padding='same', name='block13_pool')(
+          x)
   x = layers.add([x, residual])
 
   x = SeparableConv2D(
-      1536, (3, 3), padding='same', use_bias=False, name='block14_sepconv1')(x)
+      1536, (3, 3), padding='same', use_bias=False, name='block14_sepconv1')(
+          x)
   x = BatchNormalization(name='block14_sepconv1_bn')(x)
   x = Activation('relu', name='block14_sepconv1_act')(x)
 
   x = SeparableConv2D(
-      2048, (3, 3), padding='same', use_bias=False, name='block14_sepconv2')(x)
+      2048, (3, 3), padding='same', use_bias=False, name='block14_sepconv2')(
+          x)
   x = BatchNormalization(name='block14_sepconv2_bn')(x)
   x = Activation('relu', name='block14_sepconv2_act')(x)
 
@@ -297,12 +323,15 @@ def Xception(include_top=True,
           cache_subdir='models',
           file_hash='b0042744bf5b25fce3cb969f33bebb97')
     model.load_weights(weights_path)
+  elif weights is not None:
+    model.load_weights(weights)
 
   if old_data_format:
     K.set_image_data_format(old_data_format)
   return model
 
 
+@tf_export('keras.applications.xception.preprocess_input')
 def preprocess_input(x):
   """Preprocesses a numpy array encoding a batch of images.
 
diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py
index ec7a5dcffd0c0f0dda90bbc92de54af82680b607..a4caa420158a53044c3e295e881afdad949ee795 100644
--- a/tensorflow/python/keras/_impl/keras/backend.py
+++ b/tensorflow/python/keras/_impl/keras/backend.py
@@ -29,6 +29,7 @@ import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session as session_module
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import ops
@@ -47,6 +48,7 @@ from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
@@ -54,6 +56,7 @@ from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables as variables_module
 from tensorflow.python.training import moving_averages
 from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.tf_export import tf_export
 
 
 py_all = all
@@ -85,7 +88,7 @@ _MANUAL_VAR_INIT = False
 _FLOATX = 'float32'
 
 # Epsilon fuzz factor used throughout the codebase.
-_EPSILON = 10e-8
+_EPSILON = 1e-7
 
 # Default image data format, one of "channels_last", "channels_first".
 _IMAGE_DATA_FORMAT = 'channels_last'
@@ -96,6 +99,7 @@ _IMAGE_DATA_FORMAT = 'channels_last'
 _LOCAL_DEVICES = None
 
 
+@tf_export('keras.backend.backend')
 def backend():
   """Publicly accessible method for determining the current backend.
 
@@ -107,6 +111,7 @@ def backend():
   return 'tensorflow'
 
 
+@tf_export('keras.backend.epsilon')
 def epsilon():
   """Returns the value of the fuzz factor used in numeric expressions.
 
@@ -116,12 +121,13 @@ def epsilon():
   Example:
   ```python
       >>> keras.backend.epsilon()
-      1e-08
+      1e-07
   ```
   """
   return _EPSILON
 
 
+@tf_export('keras.backend.set_epsilon')
 def set_epsilon(value):
   """Sets the value of the fuzz factor used in numeric expressions.
 
@@ -132,7 +138,7 @@ def set_epsilon(value):
   ```python
       >>> from keras import backend as K
       >>> K.epsilon()
-      1e-08
+      1e-07
       >>> K.set_epsilon(1e-05)
       >>> K.epsilon()
       1e-05
@@ -142,6 +148,7 @@ def set_epsilon(value):
   _EPSILON = value
 
 
+@tf_export('keras.backend.floatx')
 def floatx():
   """Returns the default float type, as a string.
 
@@ -159,6 +166,7 @@ def floatx():
   return _FLOATX
 
 
+@tf_export('keras.backend.set_floatx')
 def set_floatx(value):
   """Sets the default float type.
 
@@ -184,6 +192,7 @@ def set_floatx(value):
   _FLOATX = str(value)
 
 
+@tf_export('keras.backend.cast_to_floatx')
 def cast_to_floatx(x):
   """Cast a Numpy array to the default Keras float type.
 
@@ -211,6 +220,7 @@ def cast_to_floatx(x):
   return np.asarray(x, dtype=_FLOATX)
 
 
+@tf_export('keras.backend.image_data_format')
 def image_data_format():
   """Returns the default image data format convention.
 
@@ -226,6 +236,7 @@ def image_data_format():
   return _IMAGE_DATA_FORMAT
 
 
+@tf_export('keras.backend.set_image_data_format')
 def set_image_data_format(data_format):
   """Sets the value of the image data format convention.
 
@@ -251,6 +262,7 @@ def set_image_data_format(data_format):
   _IMAGE_DATA_FORMAT = str(data_format)
 
 
+@tf_export('keras.backend.get_uid')
 def get_uid(prefix=''):
   """Associates a string prefix with an integer counter in a TensorFlow graph.
 
@@ -278,6 +290,7 @@ def get_uid(prefix=''):
   return layer_name_uids[prefix]
 
 
+@tf_export('keras.backend.reset_uids')
 def reset_uids():
   per_graph_layer_name_uids = tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS
   keys = list(per_graph_layer_name_uids.keys())
@@ -285,6 +298,7 @@ def reset_uids():
     del per_graph_layer_name_uids[key]
 
 
+@tf_export('keras.backend.clear_session')
 def clear_session():
   """Destroys the current TF graph and creates a new one.
 
@@ -295,11 +309,13 @@ def clear_session():
   ops.reset_default_graph()
   reset_uids()
   _SESSION = None
-  phase = array_ops.placeholder(dtype='bool', name='keras_learning_phase')
+  phase = array_ops.placeholder_with_default(
+      False, shape=(), name='keras_learning_phase')
   _GRAPH_LEARNING_PHASES = {}
   _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = phase
 
 
+@tf_export('keras.backend.manual_variable_initialization')
 def manual_variable_initialization(value):
   """Sets the manual variable initialization flag.
 
@@ -316,6 +332,7 @@ def manual_variable_initialization(value):
   _MANUAL_VAR_INIT = value
 
 
+@tf_export('keras.backend.learning_phase')
 def learning_phase():
   """Returns the learning phase flag.
 
@@ -325,14 +342,25 @@ def learning_phase():
 
   Returns:
       Learning phase (scalar integer tensor or Python integer).
+
+  Raises:
+      ValueError: If called when Eager execution is enabled.
   """
+  if context.in_eager_mode():
+    if 'eager' not in _GRAPH_LEARNING_PHASES:
+      # Fallback to inference mode as default.
+      return 0
+    return _GRAPH_LEARNING_PHASES['eager']
+
   graph = ops.get_default_graph()
   if graph not in _GRAPH_LEARNING_PHASES:
-    phase = array_ops.placeholder(dtype='bool', name='keras_learning_phase')
+    phase = array_ops.placeholder_with_default(
+        False, shape=(), name='keras_learning_phase')
     _GRAPH_LEARNING_PHASES[graph] = phase
   return _GRAPH_LEARNING_PHASES[graph]
 
 
+@tf_export('keras.backend.set_learning_phase')
 def set_learning_phase(value):
   """Sets the learning phase to a fixed value.
 
@@ -345,9 +373,13 @@ def set_learning_phase(value):
   global _GRAPH_LEARNING_PHASES  # pylint: disable=global-variable-not-assigned
   if value not in {0, 1}:
     raise ValueError('Expected learning phase to be ' '0 or 1.')
-  _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = value
+  if context.in_eager_mode():
+    _GRAPH_LEARNING_PHASES['eager'] = value
+  else:
+    _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = value
 
 
+@tf_export('keras.backend.get_session')
 def get_session():
   """Returns the TF session to be used by the backend.
 
@@ -383,6 +415,7 @@ def get_session():
   return session
 
 
+@tf_export('keras.backend.set_session')
 def set_session(session):
   """Sets the global TensorFlow session.
 
@@ -485,6 +518,7 @@ def _to_tensor(x, dtype):
   return ops.convert_to_tensor(x, dtype=dtype)
 
 
+@tf_export('keras.backend.is_sparse')
 def is_sparse(tensor):
   """Returns whether a tensor is a sparse tensor.
 
@@ -508,6 +542,7 @@ def is_sparse(tensor):
   return isinstance(tensor, sparse_tensor.SparseTensor)
 
 
+@tf_export('keras.backend.to_dense')
 def to_dense(tensor):
   """Converts a sparse tensor into a dense tensor and returns it.
 
@@ -537,6 +572,7 @@ def to_dense(tensor):
 name_scope = ops.name_scope
 
 
+@tf_export('keras.backend.variable')
 def variable(value, dtype=None, name=None, constraint=None):
   """Instantiates a variable and returns it.
 
@@ -575,7 +611,7 @@ def variable(value, dtype=None, name=None, constraint=None):
     v._keras_shape = sparse_coo.shape
     v._uses_learning_phase = False
     return v
-  v = variables_module.Variable(
+  v = resource_variable_ops.ResourceVariable(
       value,
       dtype=dtypes_module.as_dtype(dtype),
       name=name,
@@ -609,6 +645,7 @@ def _initialize_variables(session):
       session.run(variables_module.variables_initializer(uninitialized_vars))
 
 
+@tf_export('keras.backend.constant')
 def constant(value, dtype=None, shape=None, name=None):
   """Creates a constant tensor.
 
@@ -677,6 +714,7 @@ def is_keras_tensor(x):
   return hasattr(x, '_keras_history')
 
 
+@tf_export('keras.backend.placeholder')
 def placeholder(shape=None, ndim=None, dtype=None, sparse=False, name=None):
   """Instantiates a placeholder tensor and returns it.
 
@@ -729,6 +767,7 @@ def is_placeholder(x):
     return False
 
 
+@tf_export('keras.backend.shape')
 def shape(x):
   """Returns the symbolic shape of a tensor or variable.
 
@@ -761,6 +800,7 @@ def shape(x):
   return array_ops.shape(x)
 
 
+@tf_export('keras.backend.int_shape')
 def int_shape(x):
   """Returns the shape of tensor or variable as a tuple of int or None entries.
 
@@ -788,6 +828,7 @@ def int_shape(x):
     return None
 
 
+@tf_export('keras.backend.ndim')
 def ndim(x):
   """Returns the number of axes in a tensor, as an integer.
 
@@ -815,6 +856,7 @@ def ndim(x):
   return None
 
 
+@tf_export('keras.backend.dtype')
 def dtype(x):
   """Returns the dtype of a Keras tensor or variable, as a string.
 
@@ -845,6 +887,7 @@ def dtype(x):
   return x.dtype.base_dtype.name
 
 
+@tf_export('keras.backend.eval')
 def eval(x):
   """Evaluates the value of a variable.
 
@@ -866,6 +909,7 @@ def eval(x):
   return to_dense(x).eval(session=get_session())
 
 
+@tf_export('keras.backend.zeros')
 def zeros(shape, dtype=None, name=None):
   """Instantiates an all-zeros variable and returns it.
 
@@ -876,6 +920,8 @@ def zeros(shape, dtype=None, name=None):
 
   Returns:
       A variable (including Keras metadata), filled with `0.0`.
+      Note that if `shape` was symbolic, we cannot return a variable,
+      and will return a dynamically-shaped tensor instead.
 
   Example:
   ```python
@@ -890,12 +936,15 @@ def zeros(shape, dtype=None, name=None):
   if dtype is None:
     dtype = floatx()
   tf_dtype = dtypes_module.as_dtype(dtype)
-  return variable(
-      init_ops.constant_initializer(0., dtype=tf_dtype)(shape), dtype, name)
+  v = array_ops.zeros(shape=shape, dtype=tf_dtype, name=name)
+  if py_all(v.get_shape().as_list()):
+    return variable(v, dtype=dtype, name=name)
+  return v
 
 
+@tf_export('keras.backend.ones')
 def ones(shape, dtype=None, name=None):
-  """Instantiates an all-ones tensor variable and returns it.
+  """Instantiates an all-ones variable and returns it.
 
   Arguments:
       shape: Tuple of integers, shape of returned Keras variable.
@@ -904,6 +953,8 @@ def ones(shape, dtype=None, name=None):
 
   Returns:
       A Keras variable, filled with `1.0`.
+      Note that if `shape` was symbolic, we cannot return a variable,
+      and will return a dynamically-shaped tensor instead.
 
   Example:
   ```python
@@ -918,10 +969,13 @@ def ones(shape, dtype=None, name=None):
   if dtype is None:
     dtype = floatx()
   tf_dtype = dtypes_module.as_dtype(dtype)
-  return variable(
-      init_ops.constant_initializer(1., dtype=tf_dtype)(shape), dtype, name)
+  v = array_ops.ones(shape=shape, dtype=tf_dtype, name=name)
+  if py_all(v.get_shape().as_list()):
+    return variable(v, dtype=dtype, name=name)
+  return v
 
 
+@tf_export('keras.backend.eye')
 def eye(size, dtype=None, name=None):
   """Instantiate an identity matrix and returns it.
 
@@ -950,6 +1004,7 @@ def eye(size, dtype=None, name=None):
   return variable(linalg_ops.eye(size, dtype=tf_dtype), dtype, name)
 
 
+@tf_export('keras.backend.zeros_like')
 def zeros_like(x, dtype=None, name=None):
   """Instantiates an all-zeros variable of the same shape as another tensor.
 
@@ -975,6 +1030,7 @@ def zeros_like(x, dtype=None, name=None):
   return array_ops.zeros_like(x, dtype=dtype, name=name)
 
 
+@tf_export('keras.backend.ones_like')
 def ones_like(x, dtype=None, name=None):
   """Instantiates an all-ones variable of the same shape as another tensor.
 
@@ -1013,6 +1069,7 @@ def identity(x, name=None):
   return array_ops.identity(x, name=name)
 
 
+@tf_export('keras.backend.random_uniform_variable')
 def random_uniform_variable(shape, low, high, dtype=None, name=None, seed=None):
   """Instantiates a variable with values drawn from a uniform distribution.
 
@@ -1049,6 +1106,7 @@ def random_uniform_variable(shape, low, high, dtype=None, name=None, seed=None):
   return variable(value, dtype=dtype, name=name)
 
 
+@tf_export('keras.backend.random_normal_variable')
 def random_normal_variable(shape, mean, scale, dtype=None, name=None,
                            seed=None):
   """Instantiates a variable with values drawn from a normal distribution.
@@ -1086,6 +1144,7 @@ def random_normal_variable(shape, mean, scale, dtype=None, name=None,
   return variable(value, dtype=dtype, name=name)
 
 
+@tf_export('keras.backend.count_params')
 def count_params(x):
   """Returns the static number of elements in a variable or tensor.
 
@@ -1108,6 +1167,7 @@ def count_params(x):
   return np.prod(x.get_shape().as_list())
 
 
+@tf_export('keras.backend.cast')
 def cast(x, dtype):
   """Casts a tensor to a different dtype and returns it.
 
@@ -1143,10 +1203,12 @@ def cast(x, dtype):
 # UPDATES OPS
 
 
+@tf_export('keras.backend.update')
 def update(x, new_x):
   return state_ops.assign(x, new_x)
 
 
+@tf_export('keras.backend.update_add')
 def update_add(x, increment):
   """Update the value of `x` by adding `increment`.
 
@@ -1160,6 +1222,7 @@ def update_add(x, increment):
   return state_ops.assign_add(x, increment)
 
 
+@tf_export('keras.backend.update_sub')
 def update_sub(x, decrement):
   """Update the value of `x` by subtracting `decrement`.
 
@@ -1173,6 +1236,7 @@ def update_sub(x, decrement):
   return state_ops.assign_sub(x, decrement)
 
 
+@tf_export('keras.backend.moving_average_update')
 def moving_average_update(x, value, momentum):
   """Compute the moving average of a variable.
 
@@ -1185,12 +1249,13 @@ def moving_average_update(x, value, momentum):
       An Operation to update the variable.
   """
   return moving_averages.assign_moving_average(
-      x, value, momentum, zero_debias=False)
+      x, value, momentum, zero_debias=True)
 
 
 # LINEAR ALGEBRA
 
 
+@tf_export('keras.backend.dot')
 def dot(x, y):
   """Multiplies 2 tensors (and/or variables) and returns a *tensor*.
 
@@ -1262,6 +1327,7 @@ def dot(x, y):
   return out
 
 
+@tf_export('keras.backend.batch_dot')
 def batch_dot(x, y, axes=None):
   """Batchwise dot product.
 
@@ -1354,6 +1420,7 @@ def batch_dot(x, y, axes=None):
   return out
 
 
+@tf_export('keras.backend.transpose')
 def transpose(x):
   """Transposes a tensor and returns it.
 
@@ -1389,6 +1456,7 @@ def transpose(x):
   return array_ops.transpose(x)
 
 
+@tf_export('keras.backend.gather')
 def gather(reference, indices):
   """Retrieves the elements of indices `indices` in the tensor `reference`.
 
@@ -1405,6 +1473,7 @@ def gather(reference, indices):
 # ELEMENT-WISE OPERATIONS
 
 
+@tf_export('keras.backend.max')
 def max(x, axis=None, keepdims=False):
   """Maximum value in a tensor.
 
@@ -1419,9 +1488,10 @@ def max(x, axis=None, keepdims=False):
   Returns:
       A tensor with maximum values of `x`.
   """
-  return math_ops.reduce_max(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_max(x, axis, keepdims)
 
 
+@tf_export('keras.backend.min')
 def min(x, axis=None, keepdims=False):
   """Minimum value in a tensor.
 
@@ -1436,9 +1506,10 @@ def min(x, axis=None, keepdims=False):
   Returns:
       A tensor with miminum values of `x`.
   """
-  return math_ops.reduce_min(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_min(x, axis, keepdims)
 
 
+@tf_export('keras.backend.sum')
 def sum(x, axis=None, keepdims=False):
   """Sum of the values in a tensor, alongside the specified axis.
 
@@ -1453,9 +1524,10 @@ def sum(x, axis=None, keepdims=False):
   Returns:
       A tensor with sum of `x`.
   """
-  return math_ops.reduce_sum(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_sum(x, axis, keepdims)
 
 
+@tf_export('keras.backend.prod')
 def prod(x, axis=None, keepdims=False):
   """Multiplies the values in a tensor, alongside the specified axis.
 
@@ -1470,7 +1542,7 @@ def prod(x, axis=None, keepdims=False):
   Returns:
       A tensor with the product of elements of `x`.
   """
-  return math_ops.reduce_prod(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_prod(x, axis, keepdims)
 
 
 def cumsum(x, axis=0):
@@ -1499,6 +1571,7 @@ def cumprod(x, axis=0):
   return math_ops.cumprod(x, axis=axis)
 
 
+@tf_export('keras.backend.var')
 def var(x, axis=None, keepdims=False):
   """Variance of a tensor, alongside the specified axis.
 
@@ -1515,12 +1588,13 @@ def var(x, axis=None, keepdims=False):
   """
   if x.dtype.base_dtype == dtypes_module.bool:
     x = math_ops.cast(x, floatx())
-  m = math_ops.reduce_mean(x, axis=axis, keep_dims=True)
+  m = math_ops.reduce_mean(x, axis, True)
   devs_squared = math_ops.square(x - m)
   return math_ops.reduce_mean(
-      devs_squared, axis=axis, keep_dims=keepdims)
+      devs_squared, axis, keepdims)
 
 
+@tf_export('keras.backend.std')
 def std(x, axis=None, keepdims=False):
   """Standard deviation of a tensor, alongside the specified axis.
 
@@ -1538,6 +1612,7 @@ def std(x, axis=None, keepdims=False):
   return math_ops.sqrt(var(x, axis=axis, keepdims=keepdims))
 
 
+@tf_export('keras.backend.mean')
 def mean(x, axis=None, keepdims=False):
   """Mean of a tensor, alongside the specified axis.
 
@@ -1546,7 +1621,7 @@ def mean(x, axis=None, keepdims=False):
       axis: A list of integer. Axes to compute the mean.
       keepdims: A boolean, whether to keep the dimensions or not.
           If `keepdims` is `False`, the rank of the tensor is reduced
-          by 1 for each entry in `axis`. If `keep_dims` is `True`,
+          by 1 for each entry in `axis`. If `keepdims` is `True`,
           the reduced dimensions are retained with length 1.
 
   Returns:
@@ -1554,9 +1629,10 @@ def mean(x, axis=None, keepdims=False):
   """
   if x.dtype.base_dtype == dtypes_module.bool:
     x = math_ops.cast(x, floatx())
-  return math_ops.reduce_mean(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_mean(x, axis, keepdims)
 
 
+@tf_export('keras.backend.any')
 def any(x, axis=None, keepdims=False):
   """Bitwise reduction (logical OR).
 
@@ -1569,9 +1645,10 @@ def any(x, axis=None, keepdims=False):
       A uint8 tensor (0s and 1s).
   """
   x = math_ops.cast(x, dtypes_module.bool)
-  return math_ops.reduce_any(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_any(x, axis, keepdims)
 
 
+@tf_export('keras.backend.all')
 def all(x, axis=None, keepdims=False):
   """Bitwise reduction (logical AND).
 
@@ -1584,9 +1661,10 @@ def all(x, axis=None, keepdims=False):
       A uint8 tensor (0s and 1s).
   """
   x = math_ops.cast(x, dtypes_module.bool)
-  return math_ops.reduce_all(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_all(x, axis, keepdims)
 
 
+@tf_export('keras.backend.argmax')
 def argmax(x, axis=-1):
   """Returns the index of the maximum value along an axis.
 
@@ -1600,6 +1678,7 @@ def argmax(x, axis=-1):
   return math_ops.argmax(x, axis)
 
 
+@tf_export('keras.backend.argmin')
 def argmin(x, axis=-1):
   """Returns the index of the minimum value along an axis.
 
@@ -1613,6 +1692,7 @@ def argmin(x, axis=-1):
   return math_ops.argmin(x, axis)
 
 
+@tf_export('keras.backend.square')
 def square(x):
   """Element-wise square.
 
@@ -1625,6 +1705,7 @@ def square(x):
   return math_ops.square(x)
 
 
+@tf_export('keras.backend.abs')
 def abs(x):
   """Element-wise absolute value.
 
@@ -1637,6 +1718,7 @@ def abs(x):
   return math_ops.abs(x)
 
 
+@tf_export('keras.backend.sqrt')
 def sqrt(x):
   """Element-wise square root.
 
@@ -1652,6 +1734,7 @@ def sqrt(x):
   return math_ops.sqrt(x)
 
 
+@tf_export('keras.backend.exp')
 def exp(x):
   """Element-wise exponential.
 
@@ -1664,6 +1747,7 @@ def exp(x):
   return math_ops.exp(x)
 
 
+@tf_export('keras.backend.log')
 def log(x):
   """Element-wise log.
 
@@ -1694,9 +1778,10 @@ def logsumexp(x, axis=None, keepdims=False):
   Returns:
       The reduced tensor.
   """
-  return math_ops.reduce_logsumexp(x, axis=axis, keep_dims=keepdims)
+  return math_ops.reduce_logsumexp(x, axis, keepdims)
 
 
+@tf_export('keras.backend.round')
 def round(x):
   """Element-wise rounding to the closest integer.
 
@@ -1711,6 +1796,7 @@ def round(x):
   return math_ops.round(x)
 
 
+@tf_export('keras.backend.sign')
 def sign(x):
   """Element-wise sign.
 
@@ -1723,6 +1809,7 @@ def sign(x):
   return math_ops.sign(x)
 
 
+@tf_export('keras.backend.pow')
 def pow(x, a):
   """Element-wise exponentiation.
 
@@ -1736,6 +1823,7 @@ def pow(x, a):
   return math_ops.pow(x, a)
 
 
+@tf_export('keras.backend.clip')
 def clip(x, min_value, max_value):
   """Element-wise value clipping.
 
@@ -1756,6 +1844,7 @@ def clip(x, min_value, max_value):
   return clip_ops.clip_by_value(x, min_value, max_value)
 
 
+@tf_export('keras.backend.equal')
 def equal(x, y):
   """Element-wise equality between two tensors.
 
@@ -1769,6 +1858,7 @@ def equal(x, y):
   return math_ops.equal(x, y)
 
 
+@tf_export('keras.backend.not_equal')
 def not_equal(x, y):
   """Element-wise inequality between two tensors.
 
@@ -1782,6 +1872,7 @@ def not_equal(x, y):
   return math_ops.not_equal(x, y)
 
 
+@tf_export('keras.backend.greater')
 def greater(x, y):
   """Element-wise truth value of (x > y).
 
@@ -1795,6 +1886,7 @@ def greater(x, y):
   return math_ops.greater(x, y)
 
 
+@tf_export('keras.backend.greater_equal')
 def greater_equal(x, y):
   """Element-wise truth value of (x >= y).
 
@@ -1808,6 +1900,7 @@ def greater_equal(x, y):
   return math_ops.greater_equal(x, y)
 
 
+@tf_export('keras.backend.less')
 def less(x, y):
   """Element-wise truth value of (x < y).
 
@@ -1821,6 +1914,7 @@ def less(x, y):
   return math_ops.less(x, y)
 
 
+@tf_export('keras.backend.less_equal')
 def less_equal(x, y):
   """Element-wise truth value of (x <= y).
 
@@ -1834,6 +1928,7 @@ def less_equal(x, y):
   return math_ops.less_equal(x, y)
 
 
+@tf_export('keras.backend.maximum')
 def maximum(x, y):
   """Element-wise maximum of two tensors.
 
@@ -1847,6 +1942,7 @@ def maximum(x, y):
   return math_ops.maximum(x, y)
 
 
+@tf_export('keras.backend.minimum')
 def minimum(x, y):
   """Element-wise minimum of two tensors.
 
@@ -1860,6 +1956,7 @@ def minimum(x, y):
   return math_ops.minimum(x, y)
 
 
+@tf_export('keras.backend.sin')
 def sin(x):
   """Computes sin of x element-wise.
 
@@ -1872,6 +1969,7 @@ def sin(x):
   return math_ops.sin(x)
 
 
+@tf_export('keras.backend.cos')
 def cos(x):
   """Computes cos of x element-wise.
 
@@ -1884,6 +1982,109 @@ def cos(x):
   return math_ops.cos(x)
 
 
+def _regular_normalize_batch_in_training(x,
+                                         gamma,
+                                         beta,
+                                         reduction_axes,
+                                         epsilon=1e-3):
+  """Non-fused version of `normalize_batch_in_training`.
+
+  Arguments:
+      x: Input tensor or variable.
+      gamma: Tensor by which to scale the input.
+      beta: Tensor with which to center the input.
+      reduction_axes: iterable of integers,
+          axes over which to normalize.
+      epsilon: Fuzz factor.
+
+  Returns:
+      A tuple length of 3, `(normalized_tensor, mean, variance)`.
+  """
+  mean, var = nn.moments(x, reduction_axes, None, None, False)
+  normed = nn.batch_normalization(x, mean, var, beta, gamma, epsilon)
+  return normed, mean, var
+
+
+def _broadcast_normalize_batch_in_training(x,
+                                           gamma,
+                                           beta,
+                                           reduction_axes,
+                                           epsilon=1e-3):
+  """Non-fused, broadcast version of `normalize_batch_in_training`.
+
+  Arguments:
+      x: Input tensor or variable.
+      gamma: Tensor by which to scale the input.
+      beta: Tensor with which to center the input.
+      reduction_axes: iterable of integers,
+          axes over which to normalize.
+      epsilon: Fuzz factor.
+
+  Returns:
+      A tuple length of 3, `(normalized_tensor, mean, variance)`.
+  """
+  mean, var = nn.moments(x, reduction_axes, None, None, False)
+  target_shape = []
+  for axis in range(ndim(x)):
+    if axis in reduction_axes:
+      target_shape.append(1)
+    else:
+      target_shape.append(array_ops.shape(x)[axis])
+  target_shape = array_ops.stack(target_shape)
+
+  broadcast_mean = array_ops.reshape(mean, target_shape)
+  broadcast_var = array_ops.reshape(var, target_shape)
+  if gamma is None:
+    broadcast_gamma = None
+  else:
+    broadcast_gamma = array_ops.reshape(gamma, target_shape)
+  if beta is None:
+    broadcast_beta = None
+  else:
+    broadcast_beta = array_ops.reshape(beta, target_shape)
+
+  normed = nn.batch_normalization(x, broadcast_mean, broadcast_var,
+                                  broadcast_beta, broadcast_gamma, epsilon)
+  return normed, mean, var
+
+
+def _fused_normalize_batch_in_training(x,
+                                       gamma,
+                                       beta,
+                                       reduction_axes,
+                                       epsilon=1e-3):
+  """Fused version of `normalize_batch_in_training`.
+
+  Arguments:
+      x: Input tensor or variable.
+      gamma: Tensor by which to scale the input.
+      beta: Tensor with which to center the input.
+      reduction_axes: iterable of integers,
+          axes over which to normalize.
+      epsilon: Fuzz factor.
+
+  Returns:
+      A tuple length of 3, `(normalized_tensor, mean, variance)`.
+  """
+  if list(reduction_axes) == [0, 1, 2]:
+    normalization_axis = 3
+    tf_data_format = 'NHWC'
+  else:
+    normalization_axis = 1
+    tf_data_format = 'NCHW'
+
+  if gamma is None:
+    gamma = constant_op.constant(
+        1.0, dtype=x.dtype, shape=[x.get_shape()[normalization_axis]])
+  if beta is None:
+    beta = constant_op.constant(
+        0.0, dtype=x.dtype, shape=[x.get_shape()[normalization_axis]])
+
+  return nn.fused_batch_norm(
+      x, gamma, beta, epsilon=epsilon, data_format=tf_data_format)
+
+
+@tf_export('keras.backend.normalize_batch_in_training')
 def normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3):
   """Computes mean and std for batch then apply batch_normalization on batch.
 
@@ -1898,35 +2099,22 @@ def normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3):
   Returns:
       A tuple length of 3, `(normalized_tensor, mean, variance)`.
   """
-  mean, var = nn.moments(
-      x, reduction_axes, shift=None, name=None, keep_dims=False)
-  if sorted(reduction_axes) == list(range(ndim(x)))[:-1]:
-    normed = nn.batch_normalization(x, mean, var, beta, gamma, epsilon)
+  if ndim(x) == 4 and list(reduction_axes) in [[0, 1, 2], [0, 2, 3]]:
+    if not _has_nchw_support() and list(reduction_axes) == [0, 2, 3]:
+      return _broadcast_normalize_batch_in_training(
+          x, gamma, beta, reduction_axes, epsilon=epsilon)
+    return _fused_normalize_batch_in_training(
+        x, gamma, beta, reduction_axes, epsilon=epsilon)
   else:
-    # need broadcasting
-    target_shape = []
-    for axis in range(ndim(x)):
-      if axis in reduction_axes:
-        target_shape.append(1)
-      else:
-        target_shape.append(array_ops.shape(x)[axis])
-    target_shape = array_ops.stack(target_shape)
-
-    broadcast_mean = array_ops.reshape(mean, target_shape)
-    broadcast_var = array_ops.reshape(var, target_shape)
-    if gamma is None:
-      broadcast_gamma = None
+    if sorted(reduction_axes) == list(range(ndim(x)))[:-1]:
+      return _regular_normalize_batch_in_training(
+          x, gamma, beta, reduction_axes, epsilon=epsilon)
     else:
-      broadcast_gamma = array_ops.reshape(gamma, target_shape)
-    if beta is None:
-      broadcast_beta = None
-    else:
-      broadcast_beta = array_ops.reshape(beta, target_shape)
-    normed = nn.batch_normalization(x, broadcast_mean, broadcast_var,
-                                    broadcast_beta, broadcast_gamma, epsilon)
-  return normed, mean, var
+      return _broadcast_normalize_batch_in_training(
+          x, gamma, beta, reduction_axes, epsilon=epsilon)
 
 
+@tf_export('keras.backend.batch_normalization')
 def batch_normalization(x, mean, var, beta, gamma, epsilon=1e-3):
   """Applies batch normalization on x given mean, var, beta and gamma.
 
@@ -1950,6 +2138,7 @@ def batch_normalization(x, mean, var, beta, gamma, epsilon=1e-3):
 # SHAPE OPERATIONS
 
 
+@tf_export('keras.backend.concatenate')
 def concatenate(tensors, axis=-1):
   """Concatenates a list of tensors alongside the specified axis.
 
@@ -1973,6 +2162,7 @@ def concatenate(tensors, axis=-1):
     return array_ops.concat([to_dense(x) for x in tensors], axis)
 
 
+@tf_export('keras.backend.reshape')
 def reshape(x, shape):
   """Reshapes a tensor to the specified shape.
 
@@ -1986,6 +2176,7 @@ def reshape(x, shape):
   return array_ops.reshape(x, shape)
 
 
+@tf_export('keras.backend.permute_dimensions')
 def permute_dimensions(x, pattern):
   """Permutes axes in a tensor.
 
@@ -2000,6 +2191,7 @@ def permute_dimensions(x, pattern):
   return array_ops.transpose(x, perm=pattern)
 
 
+@tf_export('keras.backend.resize_images')
 def resize_images(x, height_factor, width_factor, data_format):
   """Resizes the images contained in a 4D tensor.
 
@@ -2044,6 +2236,7 @@ def resize_images(x, height_factor, width_factor, data_format):
     raise ValueError('Invalid data_format:', data_format)
 
 
+@tf_export('keras.backend.resize_volumes')
 def resize_volumes(x, depth_factor, height_factor, width_factor, data_format):
   """Resizes the volume contained in a 5D tensor.
 
@@ -2075,6 +2268,7 @@ def resize_volumes(x, depth_factor, height_factor, width_factor, data_format):
     raise ValueError('Invalid data_format:', data_format)
 
 
+@tf_export('keras.backend.repeat_elements')
 def repeat_elements(x, rep, axis):
   """Repeats the elements of a tensor along an axis, like `np.repeat`.
 
@@ -2127,6 +2321,7 @@ def repeat_elements(x, rep, axis):
   return x_rep
 
 
+@tf_export('keras.backend.repeat')
 def repeat(x, n):
   """Repeats a 2D tensor.
 
@@ -2146,6 +2341,7 @@ def repeat(x, n):
   return array_ops.tile(x, pattern)
 
 
+@tf_export('keras.backend.arange')
 def arange(start, stop=None, step=1, dtype='int32'):
   """Creates a 1D tensor containing a sequence of integers.
 
@@ -2191,6 +2387,7 @@ def tile(x, n):
   return array_ops.tile(x, n)
 
 
+@tf_export('keras.backend.flatten')
 def flatten(x):
   """Flatten a tensor.
 
@@ -2203,6 +2400,7 @@ def flatten(x):
   return array_ops.reshape(x, [-1])
 
 
+@tf_export('keras.backend.batch_flatten')
 def batch_flatten(x):
   """Turn a nD tensor into a 2D tensor with same 0th dimension.
 
@@ -2218,6 +2416,7 @@ def batch_flatten(x):
   return x
 
 
+@tf_export('keras.backend.expand_dims')
 def expand_dims(x, axis=-1):
   """Adds a 1-sized dimension at index "axis".
 
@@ -2231,6 +2430,7 @@ def expand_dims(x, axis=-1):
   return array_ops.expand_dims(x, axis)
 
 
+@tf_export('keras.backend.squeeze')
 def squeeze(x, axis):
   """Removes a 1-dimension from the tensor at index "axis".
 
@@ -2244,6 +2444,7 @@ def squeeze(x, axis):
   return array_ops.squeeze(x, [axis])
 
 
+@tf_export('keras.backend.temporal_padding')
 def temporal_padding(x, padding=(1, 1)):
   """Pads the middle dimension of a 3D tensor.
 
@@ -2260,6 +2461,7 @@ def temporal_padding(x, padding=(1, 1)):
   return array_ops.pad(x, pattern)
 
 
+@tf_export('keras.backend.spatial_2d_padding')
 def spatial_2d_padding(x, padding=((1, 1), (1, 1)), data_format=None):
   """Pads the 2nd and 3rd dimensions of a 4D tensor.
 
@@ -2290,6 +2492,7 @@ def spatial_2d_padding(x, padding=((1, 1), (1, 1)), data_format=None):
   return array_ops.pad(x, pattern)
 
 
+@tf_export('keras.backend.spatial_3d_padding')
 def spatial_3d_padding(x, padding=((1, 1), (1, 1), (1, 1)), data_format=None):
   """Pads 5D tensor with zeros along the depth, height, width dimensions.
 
@@ -2333,6 +2536,7 @@ def spatial_3d_padding(x, padding=((1, 1), (1, 1), (1, 1)), data_format=None):
   return array_ops.pad(x, pattern)
 
 
+@tf_export('keras.backend.stack')
 def stack(x, axis=0):
   """Stacks a list of rank `R` tensors into a rank `R+1` tensor.
 
@@ -2346,6 +2550,7 @@ def stack(x, axis=0):
   return array_ops.stack(x, axis=axis)
 
 
+@tf_export('keras.backend.one_hot')
 def one_hot(indices, num_classes):
   """Computes the one-hot representation of an integer tensor.
 
@@ -2364,6 +2569,7 @@ def one_hot(indices, num_classes):
   return array_ops.one_hot(indices, depth=num_classes, axis=-1)
 
 
+@tf_export('keras.backend.reverse')
 def reverse(x, axes):
   """Reverse a tensor along the specified axes.
 
@@ -2383,6 +2589,7 @@ def reverse(x, axes):
 # VALUE MANIPULATION
 
 
+@tf_export('keras.backend.get_value')
 def get_value(x):
   """Returns the value of a variable.
 
@@ -2392,9 +2599,12 @@ def get_value(x):
   Returns:
       A Numpy array.
   """
+  if context.in_eager_mode():
+    return x.numpy()
   return x.eval(session=get_session())
 
 
+@tf_export('keras.backend.batch_get_value')
 def batch_get_value(tensors):
   """Returns the value of more than one tensor variable.
 
@@ -2404,12 +2614,15 @@ def batch_get_value(tensors):
   Returns:
       A list of Numpy arrays.
   """
+  if context.in_eager_mode():
+    return [x.numpy() for x in tensors]
   if tensors:
     return get_session().run(tensors)
   else:
     return []
 
 
+@tf_export('keras.backend.set_value')
 def set_value(x, value):
   """Sets the value of a variable, from a Numpy array.
 
@@ -2419,18 +2632,22 @@ def set_value(x, value):
           (of the same shape).
   """
   value = np.asarray(value, dtype=dtype(x))
-  tf_dtype = dtypes_module.as_dtype(x.dtype.name.split('_')[0])
-  if hasattr(x, '_assign_placeholder'):
-    assign_placeholder = x._assign_placeholder
-    assign_op = x._assign_op
+  if context.in_eager_mode():
+    x.assign(value)
   else:
-    assign_placeholder = array_ops.placeholder(tf_dtype, shape=value.shape)
-    assign_op = x.assign(assign_placeholder)
-    x._assign_placeholder = assign_placeholder
-    x._assign_op = assign_op
-  get_session().run(assign_op, feed_dict={assign_placeholder: value})
+    tf_dtype = dtypes_module.as_dtype(x.dtype.name.split('_')[0])
+    if hasattr(x, '_assign_placeholder'):
+      assign_placeholder = x._assign_placeholder
+      assign_op = x._assign_op
+    else:
+      assign_placeholder = array_ops.placeholder(tf_dtype, shape=value.shape)
+      assign_op = x.assign(assign_placeholder)
+      x._assign_placeholder = assign_placeholder
+      x._assign_op = assign_op
+    get_session().run(assign_op, feed_dict={assign_placeholder: value})
 
 
+@tf_export('keras.backend.batch_set_value')
 def batch_set_value(tuples):
   """Sets the values of many tensor variables at once.
 
@@ -2438,25 +2655,31 @@ def batch_set_value(tuples):
       tuples: a list of tuples `(tensor, value)`.
           `value` should be a Numpy array.
   """
-  if tuples:
-    assign_ops = []
-    feed_dict = {}
+  if context.in_eager_mode():
     for x, value in tuples:
-      value = np.asarray(value, dtype=dtype(x))
-      tf_dtype = dtypes_module.as_dtype(x.dtype.name.split('_')[0])
-      if hasattr(x, '_assign_placeholder'):
-        assign_placeholder = x._assign_placeholder
-        assign_op = x._assign_op
-      else:
-        assign_placeholder = array_ops.placeholder(tf_dtype, shape=value.shape)
-        assign_op = x.assign(assign_placeholder)
-        x._assign_placeholder = assign_placeholder
-        x._assign_op = assign_op
-      assign_ops.append(assign_op)
-      feed_dict[assign_placeholder] = value
-    get_session().run(assign_ops, feed_dict=feed_dict)
+      x.assign(np.asarray(value, dtype=dtype(x)))
+  else:
+    if tuples:
+      assign_ops = []
+      feed_dict = {}
+      for x, value in tuples:
+        value = np.asarray(value, dtype=dtype(x))
+        tf_dtype = dtypes_module.as_dtype(x.dtype.name.split('_')[0])
+        if hasattr(x, '_assign_placeholder'):
+          assign_placeholder = x._assign_placeholder
+          assign_op = x._assign_op
+        else:
+          assign_placeholder = array_ops.placeholder(tf_dtype,
+                                                     shape=value.shape)
+          assign_op = x.assign(assign_placeholder)
+          x._assign_placeholder = assign_placeholder
+          x._assign_op = assign_op
+        assign_ops.append(assign_op)
+        feed_dict[assign_placeholder] = value
+      get_session().run(assign_ops, feed_dict=feed_dict)
 
 
+@tf_export('keras.backend.print_tensor')
 def print_tensor(x, message=''):
   """Prints `message` and the tensor value when evaluated.
 
@@ -2554,6 +2777,7 @@ class Function(object):
     return updated[:len(self.outputs)]
 
 
+@tf_export('keras.backend.function')
 def function(inputs, outputs, updates=None, **kwargs):
   """Instantiates a Keras function.
 
@@ -2579,6 +2803,7 @@ def function(inputs, outputs, updates=None, **kwargs):
   return Function(inputs, outputs, updates=updates, **kwargs)
 
 
+@tf_export('keras.backend.gradients')
 def gradients(loss, variables):
   """Returns the gradients of `variables` w.r.t. `loss`.
 
@@ -2593,6 +2818,7 @@ def gradients(loss, variables):
       loss, variables, colocate_gradients_with_ops=True)
 
 
+@tf_export('keras.backend.stop_gradient')
 def stop_gradient(variables):
   """Returns `variables` but with zero gradient w.r.t. every other variable.
 
@@ -2613,13 +2839,15 @@ def stop_gradient(variables):
 # CONTROL FLOW
 
 
+@tf_export('keras.backend.rnn')
 def rnn(step_function,
         inputs,
         initial_states,
         go_backwards=False,
         mask=None,
         constants=None,
-        unroll=False):
+        unroll=False,
+        input_length=None):
   """Iterates over the time dimension of a tensor.
 
   Arguments:
@@ -2648,6 +2876,7 @@ def rnn(step_function,
       constants: a list of constant values passed at each step.
       unroll: whether to unroll the RNN or to use a symbolic loop
           (`while_loop` or `scan` depending on backend).
+      input_length: Unused; exists for API compatibility.
 
   Returns:
       A tuple, `(last_output, outputs, new_states)`.
@@ -2665,9 +2894,11 @@ def rnn(step_function,
       ValueError: if `mask` is provided (not `None`) but states is not provided
           (`len(states)` == 0).
   """
+  del input_length
   ndim = len(inputs.get_shape())
   if ndim < 3:
     raise ValueError('Input should be at least 3D.')
+  inputs_shape = inputs.get_shape()
   axes = [1, 0] + list(range(2, ndim))
   inputs = array_ops.transpose(inputs, (axes))
 
@@ -2852,10 +3083,18 @@ def rnn(step_function,
 
   axes = [1, 0] + list(range(2, len(outputs.get_shape())))
   outputs = array_ops.transpose(outputs, axes)
+
+  # Static shape inference: (samples, time, ...)
+  outputs_shape = outputs.get_shape().as_list()
+  outputs_shape[0] = inputs_shape[0]
+  outputs_shape[1] = inputs_shape[1]
+  outputs.set_shape(outputs_shape)
+
   last_output._uses_learning_phase = uses_learning_phase
   return last_output, outputs, new_states
 
 
+@tf_export('keras.backend.switch')
 def switch(condition, then_expression, else_expression):
   """Switches between two operations depending on a scalar value.
 
@@ -2919,6 +3158,7 @@ def switch(condition, then_expression, else_expression):
   return x
 
 
+@tf_export('keras.backend.in_train_phase')
 def in_train_phase(x, alt, training=None):
   """Selects `x` in train phase, and `alt` otherwise.
 
@@ -2962,6 +3202,7 @@ def in_train_phase(x, alt, training=None):
   return x
 
 
+@tf_export('keras.backend.in_test_phase')
 def in_test_phase(x, alt, training=None):
   """Selects `x` in test phase, and `alt` otherwise.
 
@@ -2985,6 +3226,7 @@ def in_test_phase(x, alt, training=None):
 # NN OPERATIONS
 
 
+@tf_export('keras.backend.relu')
 def relu(x, alpha=0., max_value=None):
   """Rectified linear unit.
 
@@ -3011,12 +3253,13 @@ def relu(x, alpha=0., max_value=None):
   return x
 
 
+@tf_export('keras.backend.elu')
 def elu(x, alpha=1.):
   """Exponential linear unit.
 
   Arguments:
       x: A tensor or variable to compute the activation function for.
-      alpha: A scalar, slope of positive section.
+      alpha: A scalar, slope of negative section.
 
   Returns:
       A tensor.
@@ -3028,6 +3271,7 @@ def elu(x, alpha=1.):
     return array_ops.where(x > 0, res, alpha * res)
 
 
+@tf_export('keras.backend.softmax')
 def softmax(x):
   """Softmax of a tensor.
 
@@ -3040,6 +3284,7 @@ def softmax(x):
   return nn.softmax(x)
 
 
+@tf_export('keras.backend.softplus')
 def softplus(x):
   """Softplus of a tensor.
 
@@ -3052,6 +3297,7 @@ def softplus(x):
   return nn.softplus(x)
 
 
+@tf_export('keras.backend.softsign')
 def softsign(x):
   """Softsign of a tensor.
 
@@ -3064,6 +3310,7 @@ def softsign(x):
   return nn.softsign(x)
 
 
+@tf_export('keras.backend.categorical_crossentropy')
 def categorical_crossentropy(target, output, from_logits=False):
   """Categorical crossentropy between an output tensor and a target tensor.
 
@@ -3082,8 +3329,8 @@ def categorical_crossentropy(target, output, from_logits=False):
   # expects logits, Keras expects probabilities.
   if not from_logits:
     # scale preds so that the class probas of each sample sum to 1
-    output /= math_ops.reduce_sum(
-        output, axis=len(output.get_shape()) - 1, keep_dims=True)
+    output = output / math_ops.reduce_sum(  # pylint: disable=g-no-augmented-assignment
+        output, len(output.get_shape()) - 1, True)
     # manual computation of crossentropy
     epsilon_ = _to_tensor(epsilon(), output.dtype.base_dtype)
     output = clip_ops.clip_by_value(output, epsilon_, 1. - epsilon_)
@@ -3094,6 +3341,7 @@ def categorical_crossentropy(target, output, from_logits=False):
     return nn.softmax_cross_entropy_with_logits(labels=target, logits=output)
 
 
+@tf_export('keras.backend.sparse_categorical_crossentropy')
 def sparse_categorical_crossentropy(target, output, from_logits=False):
   """Categorical crossentropy with integer targets.
 
@@ -3120,13 +3368,14 @@ def sparse_categorical_crossentropy(target, output, from_logits=False):
   logits = array_ops.reshape(output, [-1, int(output_shape[-1])])
   res = nn.sparse_softmax_cross_entropy_with_logits(
       labels=targets, logits=logits)
-  if len(output_shape) == 3:
-    # if our output includes timesteps we need to reshape
+  if len(output_shape) >= 3:
+    # If our output includes timesteps or spatial dimensions we need to reshape
     return array_ops.reshape(res, array_ops.shape(output)[:-1])
   else:
     return res
 
 
+@tf_export('keras.backend.binary_crossentropy')
 def binary_crossentropy(target, output, from_logits=False):
   """Binary crossentropy between an output tensor and a target tensor.
 
@@ -3150,6 +3399,7 @@ def binary_crossentropy(target, output, from_logits=False):
   return nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output)
 
 
+@tf_export('keras.backend.sigmoid')
 def sigmoid(x):
   """Element-wise sigmoid.
 
@@ -3162,6 +3412,7 @@ def sigmoid(x):
   return nn.sigmoid(x)
 
 
+@tf_export('keras.backend.hard_sigmoid')
 def hard_sigmoid(x):
   """Segment-wise linear approximation of sigmoid.
 
@@ -3182,6 +3433,7 @@ def hard_sigmoid(x):
   return x
 
 
+@tf_export('keras.backend.tanh')
 def tanh(x):
   """Element-wise tanh.
 
@@ -3194,6 +3446,7 @@ def tanh(x):
   return nn.tanh(x)
 
 
+@tf_export('keras.backend.dropout')
 def dropout(x, level, noise_shape=None, seed=None):
   """Sets entries in `x` to zero at random, while scaling the entire tensor.
 
@@ -3216,6 +3469,7 @@ def dropout(x, level, noise_shape=None, seed=None):
   return nn.dropout(x * 1., retain_prob, noise_shape, seed=seed)
 
 
+@tf_export('keras.backend.l2_normalize')
 def l2_normalize(x, axis=None):
   """Normalizes a tensor wrt the L2 norm alongside the specified axis.
 
@@ -3229,6 +3483,7 @@ def l2_normalize(x, axis=None):
   return nn.l2_normalize(x, dim=axis)
 
 
+@tf_export('keras.backend.in_top_k')
 def in_top_k(predictions, targets, k):
   """Returns whether the `targets` are in the top `k` `predictions`.
 
@@ -3248,6 +3503,25 @@ def in_top_k(predictions, targets, k):
 # CONVOLUTIONS
 
 
+def _preprocess_conv1d_input(x, data_format):
+  """Transpose and cast the input before the conv1d.
+
+  Arguments:
+      x: input tensor.
+      data_format: string, `"channels_last"` or `"channels_first"`.
+
+  Returns:
+      A tensor.
+  """
+  tf_data_format = 'NHWC'  # to pass TF Conv2dNative operations
+  if data_format == 'channels_first':
+    if not _has_nchw_support():
+      x = array_ops.transpose(x, (0, 2, 1))  # NCW -> NWC
+    else:
+      tf_data_format = 'NCHW'
+  return x, tf_data_format
+
+
 def _preprocess_conv2d_input(x, data_format):
   """Transpose and cast the input before the conv2d.
 
@@ -3307,6 +3581,7 @@ def _preprocess_padding(padding):
   return padding
 
 
+@tf_export('keras.backend.conv1d')
 def conv1d(x,
            kernel,
            strides=1,
@@ -3356,6 +3631,7 @@ def conv1d(x,
   return x
 
 
+@tf_export('keras.backend.conv2d')
 def conv2d(x,
            kernel,
            strides=(1, 1),
@@ -3400,6 +3676,7 @@ def conv2d(x,
   return x
 
 
+@tf_export('keras.backend.conv2d_transpose')
 def conv2d_transpose(x,
                      kernel,
                      output_shape,
@@ -3461,6 +3738,67 @@ def conv2d_transpose(x,
   return x
 
 
+def separable_conv1d(x,
+                     depthwise_kernel,
+                     pointwise_kernel,
+                     strides=1,
+                     padding='valid',
+                     data_format=None,
+                     dilation_rate=1):
+  """1D convolution with separable filters.
+
+  Arguments:
+      x: input tensor
+      depthwise_kernel: convolution kernel for the depthwise convolution.
+      pointwise_kernel: kernel for the 1x1 convolution.
+      strides: stride integer.
+      padding: string, `"same"` or `"valid"`.
+      data_format: string, `"channels_last"` or `"channels_first"`.
+      dilation_rate: integer dilation rate.
+
+  Returns:
+      Output tensor.
+
+  Raises:
+      ValueError: if `data_format` is neither `channels_last` or
+      `channels_first`.
+  """
+  if data_format is None:
+    data_format = image_data_format()
+  if data_format not in {'channels_first', 'channels_last'}:
+    raise ValueError('Unknown data_format ' + str(data_format))
+
+  x, tf_data_format = _preprocess_conv1d_input(x, data_format)
+  padding = _preprocess_padding(padding)
+  if tf_data_format == 'NHWC':
+    spatial_start_dim = 1
+    strides = (1, 1) + strides + (1,)
+  else:
+    spatial_start_dim = 2
+    strides = (1, 1, 1) + strides
+  x = array_ops.expand_dims(x, spatial_start_dim)
+  depthwise_kernel = array_ops.expand_dims(depthwise_kernel, 0)
+  pointwise_kernel = array_ops.expand_dims(pointwise_kernel, 0)
+  dilation_rate = (1,) + dilation_rate
+
+  x = nn.separable_conv2d(
+      x,
+      depthwise_kernel,
+      pointwise_kernel,
+      strides=strides,
+      padding=padding,
+      rate=dilation_rate,
+      data_format=tf_data_format)
+
+  x = array_ops.squeeze(x, [spatial_start_dim])
+
+  if data_format == 'channels_first' and tf_data_format == 'NHWC':
+    x = array_ops.transpose(x, (0, 2, 1))  # NWC -> NCW
+
+  return x
+
+
+@tf_export('keras.backend.separable_conv2d')
 def separable_conv2d(x,
                      depthwise_kernel,
                      pointwise_kernel,
@@ -3560,6 +3898,7 @@ def depthwise_conv2d(x,
   return x
 
 
+@tf_export('keras.backend.conv3d')
 def conv3d(x,
            kernel,
            strides=(1, 1, 1),
@@ -3665,6 +4004,7 @@ def conv3d_transpose(x,
   return x
 
 
+@tf_export('keras.backend.pool2d')
 def pool2d(x,
            pool_size,
            strides=(1, 1),
@@ -3717,6 +4057,7 @@ def pool2d(x,
   return x
 
 
+@tf_export('keras.backend.pool3d')
 def pool3d(x,
            pool_size,
            strides=(1, 1, 1),
@@ -3880,6 +4221,7 @@ def local_conv2d(inputs,
   return output
 
 
+@tf_export('keras.backend.bias_add')
 def bias_add(x, bias, data_format=None):
   """Adds a bias vector to a tensor.
 
@@ -3907,47 +4249,53 @@ def bias_add(x, bias, data_format=None):
     raise ValueError(
         'Unexpected bias dimensions %d, expect to be 1 or %d dimensions' %
         (len(bias_shape), ndim(x)))
+  # pylint: disable=g-no-augmented-assignment
   if ndim(x) == 5:
     if data_format == 'channels_first':
       if len(bias_shape) == 1:
-        x += reshape(bias, (1, bias_shape[0], 1, 1, 1))
+        x = x + reshape(bias, (1, bias_shape[0], 1, 1, 1))
       else:
-        x += reshape(bias, (1, bias_shape[3]) + bias_shape[:3])
+        x = x + reshape(bias, (1, bias_shape[3]) + bias_shape[:3])
     elif data_format == 'channels_last':
       if len(bias_shape) == 1:
-        x += reshape(bias, (1, 1, 1, bias_shape[0]))
+        x = x + reshape(bias, (1, 1, 1, bias_shape[0]))
       else:
-        x += reshape(bias, (1,) + bias_shape)
+        x = x + reshape(bias, (1,) + bias_shape)
   elif ndim(x) == 4:
     if data_format == 'channels_first':
       if len(bias_shape) == 1:
-        x += reshape(bias, (1, bias_shape[0], 1, 1))
+        if _has_nchw_support():
+          x = nn.bias_add(x, bias, data_format='NCHW')
+        else:
+          x = x + reshape(bias, (1, bias_shape[0], 1, 1))
       else:
-        x += reshape(bias, (1, bias_shape[2]) + bias_shape[:2])
+        x = x + reshape(bias, (1, bias_shape[2]) + bias_shape[:2])
     elif data_format == 'channels_last':
       if len(bias_shape) == 1:
         x = nn.bias_add(x, bias, data_format='NHWC')
       else:
-        x += reshape(bias, (1,) + bias_shape)
+        x = x + reshape(bias, (1,) + bias_shape)
   elif ndim(x) == 3:
     if data_format == 'channels_first':
       if len(bias_shape) == 1:
-        x += reshape(bias, (1, bias_shape[0], 1))
+        x = x + reshape(bias, (1, bias_shape[0], 1))
       else:
-        x += reshape(bias, (1, bias_shape[1], bias_shape[0]))
+        x = x + reshape(bias, (1, bias_shape[1], bias_shape[0]))
     elif data_format == 'channels_last':
       if len(bias_shape) == 1:
-        x += reshape(bias, (1, 1, bias_shape[0]))
+        x = x + reshape(bias, (1, 1, bias_shape[0]))
       else:
-        x += reshape(bias, (1,) + bias_shape)
+        x = x + reshape(bias, (1,) + bias_shape)
   else:
     x = nn.bias_add(x, bias)
+  # pylint: enable=g-no-augmented-assignment
   return x
 
 
 # RANDOMNESS
 
 
+@tf_export('keras.backend.random_normal')
 def random_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None):
   """Returns a tensor with normal distribution of values.
 
@@ -3970,6 +4318,7 @@ def random_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None):
       shape, mean=mean, stddev=stddev, dtype=dtype, seed=seed)
 
 
+@tf_export('keras.backend.random_uniform')
 def random_uniform(shape, minval=0.0, maxval=1.0, dtype=None, seed=None):
   """Returns a tensor with uniform distribution of values.
 
@@ -3993,6 +4342,7 @@ def random_uniform(shape, minval=0.0, maxval=1.0, dtype=None, seed=None):
       shape, minval=minval, maxval=maxval, dtype=dtype, seed=seed)
 
 
+@tf_export('keras.backend.random_binomial')
 def random_binomial(shape, p=0.0, dtype=None, seed=None):
   """Returns a tensor with random binomial distribution of values.
 
@@ -4014,6 +4364,7 @@ def random_binomial(shape, p=0.0, dtype=None, seed=None):
       array_ops.ones(shape, dtype=dtype), array_ops.zeros(shape, dtype=dtype))
 
 
+@tf_export('keras.backend.truncated_normal')
 def truncated_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None):
   """Returns a tensor with truncated random normal distribution of values.
 
@@ -4047,6 +4398,7 @@ def truncated_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None):
 # in TensorFlow's CTC implementation
 
 
+@tf_export('keras.backend.ctc_label_dense_to_sparse')
 def ctc_label_dense_to_sparse(labels, label_lengths):
   """Converts CTC labels from dense to sparse.
 
@@ -4091,6 +4443,7 @@ def ctc_label_dense_to_sparse(labels, label_lengths):
       math_ops.to_int64(indices), vals_sparse, math_ops.to_int64(label_shape))
 
 
+@tf_export('keras.backend.ctc_batch_cost')
 def ctc_batch_cost(y_true, y_pred, input_length, label_length):
   """Runs CTC loss algorithm on each batch element.
 
@@ -4113,13 +4466,14 @@ def ctc_batch_cost(y_true, y_pred, input_length, label_length):
   sparse_labels = math_ops.to_int32(
       ctc_label_dense_to_sparse(y_true, label_length))
 
-  y_pred = math_ops.log(array_ops.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)
+  y_pred = math_ops.log(array_ops.transpose(y_pred, perm=[1, 0, 2]) + epsilon())
 
   return array_ops.expand_dims(
       ctc.ctc_loss(
           inputs=y_pred, labels=sparse_labels, sequence_length=input_length), 1)
 
 
+@tf_export('keras.backend.ctc_decode')
 def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1):
   """Decodes the output of a softmax.
 
@@ -4148,7 +4502,7 @@ def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1):
           Tensor `(top_paths, )` that contains
               the log probability of each decoded sequence.
   """
-  y_pred = math_ops.log(array_ops.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)
+  y_pred = math_ops.log(array_ops.transpose(y_pred, perm=[1, 0, 2]) + epsilon())
   input_length = math_ops.to_int32(input_length)
 
   if greedy:
@@ -4171,6 +4525,7 @@ def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1):
 # HIGH ORDER FUNCTIONS
 
 
+@tf_export('keras.backend.map_fn')
 def map_fn(fn, elems, name=None, dtype=None):
   """Map the function fn over the elements elems and return the outputs.
 
@@ -4186,6 +4541,7 @@ def map_fn(fn, elems, name=None, dtype=None):
   return functional_ops.map_fn(fn, elems, name=name, dtype=dtype)
 
 
+@tf_export('keras.backend.foldl')
 def foldl(fn, elems, initializer=None, name=None):
   """Reduce elems using fn to combine them from left to right.
 
@@ -4202,6 +4558,7 @@ def foldl(fn, elems, initializer=None, name=None):
   return functional_ops.foldl(fn, elems, initializer=initializer, name=name)
 
 
+@tf_export('keras.backend.foldr')
 def foldr(fn, elems, initializer=None, name=None):
   """Reduce elems using fn to combine them from right to left.
 
diff --git a/tensorflow/python/keras/_impl/keras/backend_test.py b/tensorflow/python/keras/_impl/keras/backend_test.py
index e45e566dcac62a2d91c8e6d68caa5c15d8d80244..f29ca49378bc43385b9e90d3f1cefb7937df64cd 100644
--- a/tensorflow/python/keras/_impl/keras/backend_test.py
+++ b/tensorflow/python/keras/_impl/keras/backend_test.py
@@ -22,6 +22,7 @@ import scipy.sparse
 
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.keras._impl import keras
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.util import tf_inspect
 
@@ -114,12 +115,19 @@ class BackendUtilsTest(test.TestCase):
     self.assertEqual(keras.backend.get_uid('foo'), 1)
 
   def test_learning_phase(self):
-    with self.test_session():
+    with self.test_session() as sess:
       keras.backend.set_learning_phase(1)
       self.assertEqual(keras.backend.learning_phase(), 1)
       with self.assertRaises(ValueError):
         keras.backend.set_learning_phase(2)
 
+      # Test running with a learning-phase-consuming layer
+      keras.backend.set_learning_phase(0)
+      x = keras.Input((3,))
+      y = keras.layers.BatchNormalization()(x)
+      sess.run(variables.global_variables_initializer())
+      sess.run(y, feed_dict={x: np.random.random((2, 3))})
+
   def test_int_shape(self):
     x = keras.backend.placeholder(shape=(3, 4))
     self.assertEqual(keras.backend.int_shape(x), (3, 4))
@@ -907,6 +915,15 @@ class BackendNNOpsTest(test.TestCase):
         last_output, outputs, new_states = keras.backend.rnn(rnn_fn, inputs,
                                                              initial_states,
                                                              **kwargs)
+        # check static shape inference
+        self.assertEquals(last_output.get_shape().as_list(),
+                          [num_samples, output_dim])
+        self.assertEquals(outputs.get_shape().as_list(),
+                          [num_samples, timesteps, output_dim])
+        for state in new_states:
+          self.assertEquals(state.get_shape().as_list(),
+                            [num_samples, output_dim])
+
         last_output_list[i].append(keras.backend.eval(last_output))
         outputs_list[i].append(keras.backend.eval(outputs))
         self.assertEqual(len(new_states), 1)
@@ -946,7 +963,6 @@ class BackendNNOpsTest(test.TestCase):
     x = keras.backend.variable(val)
     reduction_axes = (0, 2, 3)
 
-    # case: need broadcasting
     g_val = np.random.random((3,))
     b_val = np.random.random((3,))
     gamma = keras.backend.variable(g_val)
@@ -957,17 +973,6 @@ class BackendNNOpsTest(test.TestCase):
     self.assertEqual(mean.get_shape().as_list(), [3,])
     self.assertEqual(var.get_shape().as_list(), [3,])
 
-    # case: doesn't need broadcasting
-    g_val = np.random.random((1, 3, 1, 1))
-    b_val = np.random.random((1, 3, 1, 1))
-    gamma = keras.backend.variable(g_val)
-    beta = keras.backend.variable(b_val)
-    normed, mean, var = keras.backend.normalize_batch_in_training(
-        x, gamma, beta, reduction_axes, epsilon=1e-3)
-    self.assertEqual(normed.get_shape().as_list(), [10, 3, 10, 10])
-    self.assertEqual(mean.get_shape().as_list(), [3,])
-    self.assertEqual(var.get_shape().as_list(), [3,])
-
     # case: gamma=None
     gamma = None
     normed, mean, var = keras.backend.normalize_batch_in_training(
diff --git a/tensorflow/python/keras/_impl/keras/callbacks.py b/tensorflow/python/keras/_impl/keras/callbacks.py
index 16109b52b3ad05c1f5dd46f05bef493ce15f4295..de013c7c3f22c5279dfd590c660f108172d5977d 100644
--- a/tensorflow/python/keras/_impl/keras/callbacks.py
+++ b/tensorflow/python/keras/_impl/keras/callbacks.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Keras callbacks: utilities called at certain points during model training.
+# pylint: disable=g-import-not-at-top
+"""Callbacks: utilities called at certain points during model training.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -34,14 +35,13 @@ from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary as tf_summary
+from tensorflow.python.util.tf_export import tf_export
 
 
-# pylint: disable=g-import-not-at-top
 try:
   import requests
 except ImportError:
   requests = None
-# pylint: enable=g-import-not-at-top
 
 
 class CallbackList(object):
@@ -109,9 +109,9 @@ class CallbackList(object):
     delta_t_median = np.median(self._delta_ts_batch_begin)
     if (self._delta_t_batch > 0. and
         delta_t_median > 0.95 * self._delta_t_batch and delta_t_median > 0.1):
-      logging.warning(
-          'Method on_batch_begin() is slow compared '
-          'to the batch update (%f). Check your callbacks.' % delta_t_median)
+      logging.warning('Method on_batch_begin() is slow compared '
+                      'to the batch update (%f). Check your callbacks.',
+                      delta_t_median)
     self._t_enter_batch = time.time()
 
   def on_batch_end(self, batch, logs=None):
@@ -132,9 +132,9 @@ class CallbackList(object):
     delta_t_median = np.median(self._delta_ts_batch_end)
     if (self._delta_t_batch > 0. and
         (delta_t_median > 0.95 * self._delta_t_batch and delta_t_median > 0.1)):
-      logging.warning(
-          'Method on_batch_end() is slow compared '
-          'to the batch update (%f). Check your callbacks.' % delta_t_median)
+      logging.warning('Method on_batch_end() is slow compared '
+                      'to the batch update (%f). Check your callbacks.',
+                      delta_t_median)
 
   def on_train_begin(self, logs=None):
     """Called at the beginning of training.
@@ -160,6 +160,7 @@ class CallbackList(object):
     return iter(self.callbacks)
 
 
+@tf_export('keras.callbacks.Callback')
 class Callback(object):
   """Abstract base class used to build new callbacks.
 
@@ -189,6 +190,7 @@ class Callback(object):
 
   def __init__(self):
     self.validation_data = None
+    self.model = None
 
   def set_params(self, params):
     self.params = params
@@ -215,6 +217,7 @@ class Callback(object):
     pass
 
 
+@tf_export('keras.callbacks.BaseLogger')
 class BaseLogger(Callback):
   """Callback that accumulates epoch averages of metrics.
 
@@ -244,8 +247,10 @@ class BaseLogger(Callback):
           logs[k] = self.totals[k] / self.seen
 
 
+@tf_export('keras.callbacks.TerminateOnNaN')
 class TerminateOnNaN(Callback):
-  """Callback that terminates training when a NaN loss is encountered."""
+  """Callback that terminates training when a NaN loss is encountered.
+  """
 
   def __init__(self):
     super(TerminateOnNaN, self).__init__()
@@ -259,6 +264,7 @@ class TerminateOnNaN(Callback):
         self.model.stop_training = True
 
 
+@tf_export('keras.callbacks.ProgbarLogger')
 class ProgbarLogger(Callback):
   """Callback that prints metrics to stdout.
 
@@ -322,9 +328,10 @@ class ProgbarLogger(Callback):
       if k in logs:
         self.log_values.append((k, logs[k]))
     if self.verbose:
-      self.progbar.update(self.seen, self.log_values, force=True)
+      self.progbar.update(self.seen, self.log_values)
 
 
+@tf_export('keras.callbacks.History')
 class History(Callback):
   """Callback that records events into a `History` object.
 
@@ -344,6 +351,7 @@ class History(Callback):
       self.history.setdefault(k, []).append(v)
 
 
+@tf_export('keras.callbacks.ModelCheckpoint')
 class ModelCheckpoint(Callback):
   """Save the model after every epoch.
 
@@ -395,7 +403,7 @@ class ModelCheckpoint(Callback):
 
     if mode not in ['auto', 'min', 'max']:
       logging.warning('ModelCheckpoint mode %s is unknown, '
-                      'fallback to auto mode.' % mode)
+                      'fallback to auto mode.', (mode), RuntimeWarning)
       mode = 'auto'
 
     if mode == 'min':
@@ -422,11 +430,11 @@ class ModelCheckpoint(Callback):
         current = logs.get(self.monitor)
         if current is None:
           logging.warning('Can save best model only with %s available, '
-                          'skipping.' % (self.monitor))
+                          'skipping.', self.monitor, RuntimeWarning)
         else:
           if self.monitor_op(current, self.best):
             if self.verbose > 0:
-              print('Epoch %05d: %s improved from %0.5f to %0.5f,'
+              print('\nEpoch %05d: %s improved from %0.5f to %0.5f,'
                     ' saving model to %s' % (epoch + 1, self.monitor, self.best,
                                              current, filepath))
             self.best = current
@@ -436,17 +444,18 @@ class ModelCheckpoint(Callback):
               self.model.save(filepath, overwrite=True)
           else:
             if self.verbose > 0:
-              print('Epoch %05d: %s did not improve' % (epoch + 1,
-                                                        self.monitor))
+              print('\nEpoch %05d: %s did not improve' % (epoch + 1,
+                                                          self.monitor))
       else:
         if self.verbose > 0:
-          print('Epoch %05d: saving model to %s' % (epoch + 1, filepath))
+          print('\nEpoch %05d: saving model to %s' % (epoch + 1, filepath))
         if self.save_weights_only:
           self.model.save_weights(filepath, overwrite=True)
         else:
           self.model.save(filepath, overwrite=True)
 
 
+@tf_export('keras.callbacks.EarlyStopping')
 class EarlyStopping(Callback):
   """Stop training when a monitored quantity has stopped improving.
 
@@ -485,7 +494,7 @@ class EarlyStopping(Callback):
 
     if mode not in ['auto', 'min', 'max']:
       logging.warning('EarlyStopping mode %s is unknown, '
-                      'fallback to auto mode.' % mode)
+                      'fallback to auto mode.', mode, RuntimeWarning)
       mode = 'auto'
 
     if mode == 'min':
@@ -513,8 +522,8 @@ class EarlyStopping(Callback):
     current = logs.get(self.monitor)
     if current is None:
       logging.warning('Early stopping conditioned on metric `%s` '
-                      'which is not available. Available metrics are: %s' %
-                      (self.monitor, ','.join(list(logs.keys()))))
+                      'which is not available. Available metrics are: %s',
+                      self.monitor, ','.join(list(logs.keys())), RuntimeWarning)
       return
     if self.monitor_op(current - self.min_delta, self.best):
       self.best = current
@@ -530,6 +539,7 @@ class EarlyStopping(Callback):
       print('Epoch %05d: early stopping' % (self.stopped_epoch + 1))
 
 
+@tf_export('keras.callbacks.RemoteMonitor')
 class RemoteMonitor(Callback):
   """Callback used to stream events to a server.
 
@@ -543,8 +553,6 @@ class RemoteMonitor(Callback):
       path: String; path relative to `root` to which the events will be sent.
       field: String; JSON field under which the data will be stored.
       headers: Dictionary; optional custom HTTP headers.
-          Defaults to:
-          `{'Accept': 'application/json', 'Content-Type': 'application/json'}`
   """
 
   def __init__(self,
@@ -553,11 +561,7 @@ class RemoteMonitor(Callback):
                field='data',
                headers=None):
     super(RemoteMonitor, self).__init__()
-    if headers is None:
-      headers = {
-          'Accept': 'application/json',
-          'Content-Type': 'application/json'
-      }
+
     self.root = root
     self.path = path
     self.field = field
@@ -580,6 +584,7 @@ class RemoteMonitor(Callback):
                       'root server at ' + str(self.root))
 
 
+@tf_export('keras.callbacks.LearningRateScheduler')
 class LearningRateScheduler(Callback):
   """Learning rate scheduler.
 
@@ -587,11 +592,13 @@ class LearningRateScheduler(Callback):
       schedule: a function that takes an epoch index as input
           (integer, indexed from 0) and returns a new
           learning rate as output (float).
+      verbose: int. 0: quiet, 1: update messages.
   """
 
-  def __init__(self, schedule):
+  def __init__(self, schedule, verbose=0):
     super(LearningRateScheduler, self).__init__()
     self.schedule = schedule
+    self.verbose = verbose
 
   def on_epoch_begin(self, epoch, logs=None):
     if not hasattr(self.model.optimizer, 'lr'):
@@ -601,8 +608,12 @@ class LearningRateScheduler(Callback):
       raise ValueError('The output of the "schedule" function '
                        'should be float.')
     K.set_value(self.model.optimizer.lr, lr)
+    if self.verbose > 0:
+      print('\nEpoch %05d: LearningRateScheduler reducing learning '
+            'rate to %s.' % (epoch + 1, lr))
 
 
+@tf_export('keras.callbacks.TensorBoard')
 class TensorBoard(Callback):
   # pylint: disable=line-too-long
   """Tensorboard basic visualizations.
@@ -772,6 +783,7 @@ class TensorBoard(Callback):
     self.writer.close()
 
 
+@tf_export('keras.callbacks.ReduceLROnPlateau')
 class ReduceLROnPlateau(Callback):
   """Reduce learning rate when a metric has stopped improving.
 
@@ -841,7 +853,7 @@ class ReduceLROnPlateau(Callback):
     """
     if self.mode not in ['auto', 'min', 'max']:
       logging.warning('Learning Rate Plateau Reducing mode %s is unknown, '
-                      'fallback to auto mode.' % (self.mode))
+                      'fallback to auto mode.', self.mode, RuntimeWarning)
       self.mode = 'auto'
     if (self.mode == 'min' or
         (self.mode == 'auto' and 'acc' not in self.monitor)):
@@ -852,7 +864,6 @@ class ReduceLROnPlateau(Callback):
       self.best = -np.Inf
     self.cooldown_counter = 0
     self.wait = 0
-    self.lr_epsilon = self.min_lr * 1e-4
 
   def on_train_begin(self, logs=None):
     self._reset()
@@ -863,8 +874,9 @@ class ReduceLROnPlateau(Callback):
     current = logs.get(self.monitor)
     if current is None:
       logging.warning('Reduce LR on plateau conditioned on metric `%s` '
-                      'which is not available. Available metrics are: %s' %
-                      (self.monitor, ','.join(list(logs.keys()))))
+                      'which is not available. Available metrics are: %s',
+                      self.monitor, ','.join(list(logs.keys())), RuntimeWarning)
+
     else:
       if self.in_cooldown():
         self.cooldown_counter -= 1
@@ -876,13 +888,13 @@ class ReduceLROnPlateau(Callback):
       elif not self.in_cooldown():
         if self.wait >= self.patience:
           old_lr = float(K.get_value(self.model.optimizer.lr))
-          if old_lr > self.min_lr + self.lr_epsilon:
+          if old_lr > self.min_lr:
             new_lr = old_lr * self.factor
             new_lr = max(new_lr, self.min_lr)
             K.set_value(self.model.optimizer.lr, new_lr)
             if self.verbose > 0:
-              print('\nEpoch %05d: reducing learning rate to %s.' % (epoch,
-                                                                     new_lr))
+              print('\nEpoch %05d: ReduceLROnPlateau reducing learning '
+                    'rate to %s.' % (epoch + 1, new_lr))
             self.cooldown_counter = self.cooldown
             self.wait = 0
         self.wait += 1
@@ -891,6 +903,7 @@ class ReduceLROnPlateau(Callback):
     return self.cooldown_counter > 0
 
 
+@tf_export('keras.callbacks.CSVLogger')
 class CSVLogger(Callback):
   """Callback that streams epoch results to a csv file.
 
@@ -898,10 +911,11 @@ class CSVLogger(Callback):
   including 1D iterables such as np.ndarray.
 
   Example:
-      ```python
-      csv_logger = CSVLogger('training.log')
-      model.fit(X_train, Y_train, callbacks=[csv_logger])
-      ```
+
+  ```python
+  csv_logger = CSVLogger('training.log')
+  model.fit(X_train, Y_train, callbacks=[csv_logger])
+  ```
 
   Arguments:
       filename: filename of the csv file, e.g. 'run/log.csv'.
@@ -941,12 +955,14 @@ class CSVLogger(Callback):
       else:
         return k
 
+    if self.keys is None:
+      self.keys = sorted(logs.keys())
+
     if self.model.stop_training:
       # We set NA so that csv parsers do not fail for this last epoch.
       logs = dict([(k, logs[k]) if k in logs else (k, 'NA') for k in self.keys])
 
     if not self.writer:
-      self.keys = sorted(logs.keys())
 
       class CustomDialect(csv.excel):
         delimiter = self.sep
@@ -968,6 +984,7 @@ class CSVLogger(Callback):
     self.writer = None
 
 
+@tf_export('keras.callbacks.LambdaCallback')
 class LambdaCallback(Callback):
   r"""Callback for creating simple, custom callbacks on-the-fly.
 
@@ -992,32 +1009,32 @@ class LambdaCallback(Callback):
 
   Example:
 
-      ```python
-      # Print the batch number at the beginning of every batch.
-      batch_print_callback = LambdaCallback(
-          on_batch_begin=lambda batch,logs: print(batch))
-
-      # Stream the epoch loss to a file in JSON format. The file content
-      # is not well-formed JSON but rather has a JSON object per line.
-      import json
-      json_log = open('loss_log.json', mode='wt', buffering=1)
-      json_logging_callback = LambdaCallback(
-          on_epoch_end=lambda epoch, logs: json_log.write(
-              json.dumps({'epoch': epoch, 'loss': logs['loss']}) + '\n'),
-          on_train_end=lambda logs: json_log.close()
-      )
-
-      # Terminate some processes after having finished model training.
-      processes = ...
-      cleanup_callback = LambdaCallback(
-          on_train_end=lambda logs: [
-              p.terminate() for p in processes if p.is_alive()])
-
-      model.fit(...,
-                callbacks=[batch_print_callback,
-                           json_logging_callback,
-                           cleanup_callback])
-      ```
+  ```python
+  # Print the batch number at the beginning of every batch.
+  batch_print_callback = LambdaCallback(
+      on_batch_begin=lambda batch,logs: print(batch))
+
+  # Stream the epoch loss to a file in JSON format. The file content
+  # is not well-formed JSON but rather has a JSON object per line.
+  import json
+  json_log = open('loss_log.json', mode='wt', buffering=1)
+  json_logging_callback = LambdaCallback(
+      on_epoch_end=lambda epoch, logs: json_log.write(
+          json.dumps({'epoch': epoch, 'loss': logs['loss']}) + '\n'),
+      on_train_end=lambda logs: json_log.close()
+  )
+
+  # Terminate some processes after having finished model training.
+  processes = ...
+  cleanup_callback = LambdaCallback(
+      on_train_end=lambda logs: [
+          p.terminate() for p in processes if p.is_alive()])
+
+  model.fit(...,
+            callbacks=[batch_print_callback,
+                       json_logging_callback,
+                       cleanup_callback])
+  ```
   """
 
   def __init__(self,
diff --git a/tensorflow/python/keras/_impl/keras/callbacks_test.py b/tensorflow/python/keras/_impl/keras/callbacks_test.py
index 9c17fbb4a7eb318a91f04a6de8e956c8b2c17545..79dfcd1bb669db09de0cbaa103914efaaf19c6fb 100644
--- a/tensorflow/python/keras/_impl/keras/callbacks_test.py
+++ b/tensorflow/python/keras/_impl/keras/callbacks_test.py
@@ -685,8 +685,8 @@ class KerasCallbacksTest(test.TestCase):
       # fit w/o validation data should raise ValueError if histogram_freq > 0
       cbs = callbacks_factory(histogram_freq=1)
       with self.assertRaises(ValueError):
-        model.fit(x_train, y_train, batch_size=BATCH_SIZE,
-                  callbacks=cbs, epochs=3)
+        model.fit(
+            x_train, y_train, batch_size=BATCH_SIZE, callbacks=cbs, epochs=3)
 
       for cb in cbs:
         cb.on_train_end()
@@ -695,8 +695,8 @@ class KerasCallbacksTest(test.TestCase):
       # histogram_freq > 0
       cbs = callbacks_factory(histogram_freq=1)
       with self.assertRaises(ValueError):
-        model.fit_generator(data_generator(True), len(x_train), epochs=2,
-                            callbacks=cbs)
+        model.fit_generator(
+            data_generator(True), len(x_train), epochs=2, callbacks=cbs)
 
       for cb in cbs:
         cb.on_train_end()
@@ -705,10 +705,13 @@ class KerasCallbacksTest(test.TestCase):
       # histogram_freq > 0
       cbs = callbacks_factory(histogram_freq=1)
       with self.assertRaises(ValueError):
-        model.fit_generator(data_generator(True), len(x_train), epochs=2,
-                            validation_data=data_generator(False),
-                            validation_steps=1,
-                            callbacks=cbs)
+        model.fit_generator(
+            data_generator(True),
+            len(x_train),
+            epochs=2,
+            validation_data=data_generator(False),
+            validation_steps=1,
+            callbacks=cbs)
 
       for cb in cbs:
         cb.on_train_end()
diff --git a/tensorflow/python/keras/_impl/keras/constraints.py b/tensorflow/python/keras/_impl/keras/constraints.py
index e58e3b0377b4b0fcad923095177c54d9c3ee1c0b..ab62d575e34c1a43d4b02bf5e4ce7962229ce15a 100644
--- a/tensorflow/python/keras/_impl/keras/constraints.py
+++ b/tensorflow/python/keras/_impl/keras/constraints.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Constraints: functions that impose constraints on weights values.
+# pylint: disable=invalid-name
+"""Constraints: functions that impose constraints on weight values.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -23,8 +24,10 @@ import six
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.constraints.Constraint')
 class Constraint(object):
 
   def __call__(self, w):
@@ -34,6 +37,7 @@ class Constraint(object):
     return {}
 
 
+@tf_export('keras.constraints.MaxNorm', 'keras.constraints.max_norm')
 class MaxNorm(Constraint):
   """MaxNorm weight constraint.
 
@@ -54,10 +58,6 @@ class MaxNorm(Constraint):
           to constrain the weights of each filter tensor of size
           `(rows, cols, input_depth)`.
 
-  References:
-      - [Dropout: A Simple Way to Prevent Neural Networks from Overfitting
-        Srivastava, Hinton, et al.
-        2014](http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf)
   """
 
   def __init__(self, max_value=2, axis=0):
@@ -67,22 +67,22 @@ class MaxNorm(Constraint):
   def __call__(self, w):
     norms = K.sqrt(K.sum(K.square(w), axis=self.axis, keepdims=True))
     desired = K.clip(norms, 0, self.max_value)
-    w *= (desired / (K.epsilon() + norms))
-    return w
+    return w * (desired / (K.epsilon() + norms))
 
   def get_config(self):
     return {'max_value': self.max_value, 'axis': self.axis}
 
 
+@tf_export('keras.constraints.NonNeg', 'keras.constraints.non_neg')
 class NonNeg(Constraint):
   """Constrains the weights to be non-negative.
   """
 
   def __call__(self, w):
-    w *= K.cast(w >= 0., K.floatx())
-    return w
+    return w * K.cast(K.greater_equal(w, 0.), K.floatx())
 
 
+@tf_export('keras.constraints.UnitNorm', 'keras.constraints.unit_norm')
 class UnitNorm(Constraint):
   """Constrains the weights incident to each hidden unit to have unit norm.
 
@@ -111,6 +111,7 @@ class UnitNorm(Constraint):
     return {'axis': self.axis}
 
 
+@tf_export('keras.constraints.MinMaxNorm', 'keras.constraints.min_max_norm')
 class MinMaxNorm(Constraint):
   """MinMaxNorm weight constraint.
 
@@ -132,7 +133,7 @@ class MinMaxNorm(Constraint):
           has shape `(input_dim, output_dim)`,
           set `axis` to `0` to constrain each weight vector
           of length `(input_dim,)`.
-          In a `Conv2D` layer with `dim_ordering="channels_last"`,
+          In a `Conv2D` layer with `data_format="channels_last"`,
           the weight tensor has shape
           `(rows, cols, input_depth, output_depth)`,
           set `axis` to `[0, 1, 2]`
@@ -148,10 +149,10 @@ class MinMaxNorm(Constraint):
 
   def __call__(self, w):
     norms = K.sqrt(K.sum(K.square(w), axis=self.axis, keepdims=True))
-    desired = (self.rate * K.clip(norms, self.min_value, self.max_value) +
-               (1 - self.rate) * norms)
-    w *= (desired / (K.epsilon() + norms))
-    return w
+    desired = (
+        self.rate * K.clip(norms, self.min_value, self.max_value) +
+        (1 - self.rate) * norms)
+    return w * (desired / (K.epsilon() + norms))
 
   def get_config(self):
     return {
@@ -164,19 +165,23 @@ class MinMaxNorm(Constraint):
 
 # Aliases.
 
-# pylint: disable=invalid-name
 max_norm = MaxNorm
 non_neg = NonNeg
 unit_norm = UnitNorm
 min_max_norm = MinMaxNorm
 
-# pylint: enable=invalid-name
+# Legacy aliases.
+maxnorm = max_norm
+nonneg = non_neg
+unitnorm = unit_norm
 
 
+@tf_export('keras.constraints.serialize')
 def serialize(constraint):
   return serialize_keras_object(constraint)
 
 
+@tf_export('keras.constraints.deserialize')
 def deserialize(config, custom_objects=None):
   return deserialize_keras_object(
       config,
@@ -185,6 +190,7 @@ def deserialize(config, custom_objects=None):
       printable_module_name='constraint')
 
 
+@tf_export('keras.constraints.get')
 def get(identifier):
   if identifier is None:
     return None
diff --git a/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py b/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py
index 4359be89280f7ffa3479af38cd66ebd3aaf6c30e..13fa9aed2b8da124af4e9f68c779e08d3094cb5d 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py
@@ -21,34 +21,36 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.util.tf_export import tf_export
 
 
-def load_data(path='boston_housing.npz', seed=113, test_split=0.2):
+@tf_export('keras.datasets.boston_housing.load_data')
+def load_data(path='boston_housing.npz', test_split=0.2, seed=113):
   """Loads the Boston Housing dataset.
 
   Arguments:
       path: path where to cache the dataset locally
           (relative to ~/.keras/datasets).
+      test_split: fraction of the data to reserve as test set.
       seed: Random seed for shuffling the data
           before computing the test split.
-      test_split: fraction of the data to reserve as test set.
 
   Returns:
       Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
   """
   assert 0 <= test_split < 1
-  fh = 'f553886a1f8d56431e820c5b82552d9d95cfcb96d1e678153f8839538947dff5'
   path = get_file(
       path,
       origin='https://s3.amazonaws.com/keras-datasets/boston_housing.npz',
-      file_hash=fh)
+      file_hash=
+      'f553886a1f8d56431e820c5b82552d9d95cfcb96d1e678153f8839538947dff5')
   f = np.load(path)
   x = f['x']
   y = f['y']
   f.close()
 
   np.random.seed(seed)
-  indices = np.arrange(len(x))
+  indices = np.arange(len(x))
   np.random.shuffle(indices)
   x = x[indices]
   y = y[indices]
diff --git a/tensorflow/python/keras/_impl/keras/datasets/cifar.py b/tensorflow/python/keras/_impl/keras/datasets/cifar.py
index 564709c0eed6778b9809eb8c23556cac3c4702d9..7ada3340a59e114d73095068ec476da5973b67fb 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/cifar.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/cifar.py
@@ -12,9 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utilities used by the CIFAR10 and CIFAR100 datasets.
+"""Utilities common to CIFAR10 and CIFAR100 datasets.
 """
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
diff --git a/tensorflow/python/keras/_impl/keras/datasets/cifar10.py b/tensorflow/python/keras/_impl/keras/datasets/cifar10.py
index 7905da66c1e619153c75d7e05cad748710d63849..6b772433822474c06efcce1701226a4a67abe361 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/cifar10.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/cifar10.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""CIFAR10 small image classification dataset.
+"""CIFAR10 small images classification dataset.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -25,8 +25,10 @@ import numpy as np
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.datasets.cifar import load_batch
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.datasets.cifar10.load_data')
 def load_data():
   """Loads CIFAR10 dataset.
 
diff --git a/tensorflow/python/keras/_impl/keras/datasets/cifar100.py b/tensorflow/python/keras/_impl/keras/datasets/cifar100.py
index b69c0724c58d6d60a291c69db3de926605d90954..28d74116a50979abab207dbec88e384210dfc070 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/cifar100.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/cifar100.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""CIFAR100 small image classification dataset.
+"""CIFAR100 small images classification dataset.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -25,8 +25,10 @@ import numpy as np
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.datasets.cifar import load_batch
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.datasets.cifar100.load_data')
 def load_data(label_mode='fine'):
   """Loads CIFAR100 dataset.
 
@@ -40,7 +42,7 @@ def load_data(label_mode='fine'):
       ValueError: in case of invalid `label_mode`.
   """
   if label_mode not in ['fine', 'coarse']:
-    raise ValueError('label_mode must be one of "fine" "coarse".')
+    raise ValueError('`label_mode` must be one of `"fine"`, `"coarse"`.')
 
   dirname = 'cifar-100-python'
   origin = 'https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'
diff --git a/tensorflow/python/keras/_impl/keras/datasets/fashion_mnist.py b/tensorflow/python/keras/_impl/keras/datasets/fashion_mnist.py
index 17be684e4f8bdb800c6b0883649da25f18fa0402..b9ae41a0d4d0e8d9df70e3fc1952e81c5f57e8d9 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/fashion_mnist.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/fashion_mnist.py
@@ -20,7 +20,9 @@ from __future__ import print_function
 
 import gzip
 import os
+
 import numpy as np
+
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
 
 
@@ -38,9 +40,8 @@ def load_data():
   ]
 
   paths = []
-  for given_file in files:
-    paths.append(
-        get_file(given_file, origin=base + given_file, cache_subdir=dirname))
+  for fname in files:
+    paths.append(get_file(fname, origin=base + fname, cache_subdir=dirname))
 
   with gzip.open(paths[0], 'rb') as lbpath:
     y_train = np.frombuffer(lbpath.read(), np.uint8, offset=8)
diff --git a/tensorflow/python/keras/_impl/keras/datasets/imdb.py b/tensorflow/python/keras/_impl/keras/datasets/imdb.py
index 0e83473899c303e3ad96d253cf31a1def476fa52..e2dddf7730f2a922b09de4dadb4dd282b05caf21 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/imdb.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/imdb.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""IMDB movie review sentiment classification dataset.
+"""IMDB sentiment classification dataset.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -21,11 +21,14 @@ from __future__ import print_function
 import json
 
 import numpy as np
-from six.moves import zip  # pylint: disable=redefined-builtin
 
+from tensorflow.python.keras._impl.keras.preprocessing.sequence import _remove_long_seq
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.datasets.imdb.load_data')
 def load_data(path='imdb.npz',
               num_words=None,
               skip_top=0,
@@ -33,7 +36,8 @@ def load_data(path='imdb.npz',
               seed=113,
               start_char=1,
               oov_char=2,
-              index_from=3):
+              index_from=3,
+              **kwargs):
   """Loads the IMDB dataset.
 
   Arguments:
@@ -43,13 +47,14 @@ def load_data(path='imdb.npz',
           the most frequent words are kept
       skip_top: skip the top N most frequently occurring words
           (which may not be informative).
-      maxlen: truncate sequences after this length.
+      maxlen: sequences longer than this will be filtered out.
       seed: random seed for sample shuffling.
       start_char: The start of a sequence will be marked with this character.
           Set to 1 because 0 is usually the padding character.
       oov_char: words that were cut out because of the `num_words`
           or `skip_top` limit will be replaced with this character.
       index_from: index actual words with this index and higher.
+      **kwargs: Used for backwards compatibility.
 
   Returns:
       Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
@@ -64,22 +69,29 @@ def load_data(path='imdb.npz',
   Words that were not seen in the training set but are in the test set
   have simply been skipped.
   """
+  # Legacy support
+  if 'nb_words' in kwargs:
+    logging.warning('The `nb_words` argument in `load_data` '
+                    'has been renamed `num_words`.')
+    num_words = kwargs.pop('nb_words')
+  if kwargs:
+    raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
+
   path = get_file(
       path,
       origin='https://s3.amazonaws.com/text-datasets/imdb.npz',
       file_hash='599dadb1135973df5b59232a0e9a887c')
-  f = np.load(path)
-  x_train, labels_train = f['x_train'], f['y_train']
-  x_test, labels_test = f['x_test'], f['y_test']
-  f.close()
+  with np.load(path) as f:
+    x_train, labels_train = f['x_train'], f['y_train']
+    x_test, labels_test = f['x_test'], f['y_test']
 
   np.random.seed(seed)
-  indices = np.arrange(len(x_train))
+  indices = np.arange(len(x_train))
   np.random.shuffle(indices)
   x_train = x_train[indices]
   labels_train = labels_train[indices]
 
-  indices = np.arrange(len(x_test))
+  indices = np.arange(len(x_test))
   np.random.shuffle(indices)
   x_test = x_test[indices]
   labels_test = labels_test[indices]
@@ -93,14 +105,7 @@ def load_data(path='imdb.npz',
     xs = [[w + index_from for w in x] for x in xs]
 
   if maxlen:
-    new_xs = []
-    new_labels = []
-    for x, y in zip(xs, labels):
-      if len(x) < maxlen:
-        new_xs.append(x)
-        new_labels.append(y)
-    xs = new_xs
-    labels = new_labels
+    xs, labels = _remove_long_seq(maxlen, xs, labels)
     if not xs:
       raise ValueError('After filtering for sequences shorter than maxlen=' +
                        str(maxlen) + ', no sequence was kept. '
@@ -112,27 +117,20 @@ def load_data(path='imdb.npz',
   # reserve 'index_from' (=3 by default) characters:
   # 0 (padding), 1 (start), 2 (OOV)
   if oov_char is not None:
-    xs = [[oov_char if (w >= num_words or w < skip_top) else w for w in x]
-          for x in xs]
+    xs = [
+        [w if (skip_top <= w < num_words) else oov_char for w in x] for x in xs
+    ]
   else:
-    new_xs = []
-    for x in xs:
-      nx = []
-      for w in x:
-        if skip_top <= w < num_words:
-          nx.append(w)
-      new_xs.append(nx)
-    xs = new_xs
-
-  x_train = np.array(xs[:len(x_train)])
-  y_train = np.array(labels[:len(x_train)])
+    xs = [[w for w in x if skip_top <= w < num_words] for x in xs]
 
-  x_test = np.array(xs[len(x_train):])
-  y_test = np.array(labels[len(x_train):])
+  idx = len(x_train)
+  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
+  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])
 
   return (x_train, y_train), (x_test, y_test)
 
 
+@tf_export('keras.datasets.imdb.get_word_index')
 def get_word_index(path='imdb_word_index.json'):
   """Retrieves the dictionary mapping word indices back to words.
 
@@ -144,7 +142,8 @@ def get_word_index(path='imdb_word_index.json'):
   """
   path = get_file(
       path,
-      origin='https://s3.amazonaws.com/text-datasets/imdb_word_index.json')
+      origin='https://s3.amazonaws.com/text-datasets/imdb_word_index.json',
+      file_hash='bfafd718b763782e994055a2d397834f')
   f = open(path)
   data = json.load(f)
   f.close()
diff --git a/tensorflow/python/keras/_impl/keras/datasets/mnist.py b/tensorflow/python/keras/_impl/keras/datasets/mnist.py
index e98f29537f4e29c649d0a1879e75505b050d6639..e30691373e9aafad61b101476e21d6860527ce98 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/mnist.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/mnist.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""MNIST handwritten digits classification dataset.
+"""MNIST handwritten digits dataset.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -21,8 +21,10 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.datasets.mnist.load_data')
 def load_data(path='mnist.npz'):
   """Loads the MNIST dataset.
 
@@ -38,9 +40,7 @@ def load_data(path='mnist.npz'):
       origin='https://s3.amazonaws.com/img-datasets/mnist.npz',
       file_hash='8a61469f7ea1b51cbae51d4f78837e45')
   f = np.load(path)
-  x_train = f['x_train']
-  y_train = f['y_train']
-  x_test = f['x_test']
-  y_test = f['y_test']
+  x_train, y_train = f['x_train'], f['y_train']
+  x_test, y_test = f['x_test'], f['y_test']
   f.close()
   return (x_train, y_train), (x_test, y_test)
diff --git a/tensorflow/python/keras/_impl/keras/datasets/reuters.py b/tensorflow/python/keras/_impl/keras/datasets/reuters.py
index d05eb0ef8caed93963b0059a023a06172d4e9ddb..b711696b5eecf9ba07a66cef25c1811c182b3b60 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/reuters.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/reuters.py
@@ -12,9 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Reuters newswire topic classification dataset.
+"""Reuters topic classification dataset.
 """
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -22,11 +21,14 @@ from __future__ import print_function
 import json
 
 import numpy as np
-from six.moves import zip  # pylint: disable=redefined-builtin
 
+from tensorflow.python.keras._impl.keras.preprocessing.sequence import _remove_long_seq
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.datasets.reuters.load_data')
 def load_data(path='reuters.npz',
               num_words=None,
               skip_top=0,
@@ -35,7 +37,8 @@ def load_data(path='reuters.npz',
               seed=113,
               start_char=1,
               oov_char=2,
-              index_from=3):
+              index_from=3,
+              **kwargs):
   """Loads the Reuters newswire classification dataset.
 
   Arguments:
@@ -53,6 +56,7 @@ def load_data(path='reuters.npz',
       oov_char: words that were cut out because of the `num_words`
           or `skip_top` limit will be replaced with this character.
       index_from: index actual words with this index and higher.
+      **kwargs: Used for backwards compatibility.
 
   Returns:
       Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
@@ -63,37 +67,34 @@ def load_data(path='reuters.npz',
   Words that were not seen in the training set but are in the test set
   have simply been skipped.
   """
+  # Legacy support
+  if 'nb_words' in kwargs:
+    logging.warning('The `nb_words` argument in `load_data` '
+                    'has been renamed `num_words`.')
+    num_words = kwargs.pop('nb_words')
+  if kwargs:
+    raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
+
   path = get_file(
       path,
       origin='https://s3.amazonaws.com/text-datasets/reuters.npz',
       file_hash='87aedbeb0cb229e378797a632c1997b6')
-  npzfile = np.load(path)
-  xs = npzfile['x']
-  labels = npzfile['y']
-  npzfile.close()
+  with np.load(path) as f:
+    xs, labels = f['x'], f['y']
 
   np.random.seed(seed)
-  indices = np.arrange(len(xs))
+  indices = np.arange(len(xs))
   np.random.shuffle(indices)
   xs = xs[indices]
   labels = labels[indices]
 
-  np.random.shuffle(labels)
-
   if start_char is not None:
     xs = [[start_char] + [w + index_from for w in x] for x in xs]
   elif index_from:
     xs = [[w + index_from for w in x] for x in xs]
 
   if maxlen:
-    new_xs = []
-    new_labels = []
-    for x, y in zip(xs, labels):
-      if len(x) < maxlen:
-        new_xs.append(x)
-        new_labels.append(y)
-    xs = new_xs
-    labels = new_labels
+    xs, labels = _remove_long_seq(maxlen, xs, labels)
 
   if not num_words:
     num_words = max([max(x) for x in xs])
@@ -102,27 +103,18 @@ def load_data(path='reuters.npz',
   # reserve 'index_from' (=3 by default) characters:
   # 0 (padding), 1 (start), 2 (OOV)
   if oov_char is not None:
-    xs = [[oov_char if (w >= num_words or w < skip_top) else w for w in x]
-          for x in xs]
+    xs = [[w if skip_top <= w < num_words else oov_char for w in x] for x in xs]
   else:
-    new_xs = []
-    for x in xs:
-      nx = []
-      for w in x:
-        if skip_top <= w < num_words:
-          nx.append(w)
-      new_xs.append(nx)
-    xs = new_xs
-
-  x_train = np.array(xs[:int(len(xs) * (1 - test_split))])
-  y_train = np.array(labels[:int(len(xs) * (1 - test_split))])
+    xs = [[w for w in x if skip_top <= w < num_words] for x in xs]
 
-  x_test = np.array(xs[int(len(xs) * (1 - test_split)):])
-  y_test = np.array(labels[int(len(xs) * (1 - test_split)):])
+  idx = int(len(xs) * (1 - test_split))
+  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
+  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])
 
   return (x_train, y_train), (x_test, y_test)
 
 
+@tf_export('keras.datasets.reuters.get_word_index')
 def get_word_index(path='reuters_word_index.json'):
   """Retrieves the dictionary mapping word indices back to words.
 
diff --git a/tensorflow/python/keras/_impl/keras/engine/topology.py b/tensorflow/python/keras/_impl/keras/engine/topology.py
index 4a7bb2e83894f06c433964409ccb2bd3ebfed128..b267fac7df6e4ce24bf5c307048eaa1c09ac80f1 100644
--- a/tensorflow/python/keras/_impl/keras/engine/topology.py
+++ b/tensorflow/python/keras/_impl/keras/engine/topology.py
@@ -39,6 +39,7 @@ from tensorflow.python.layers import base as tf_base_layers
 from tensorflow.python.layers import network as tf_network
 from tensorflow.python.layers import utils as tf_layers_util
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 # pylint: disable=g-import-not-at-top
@@ -60,6 +61,7 @@ TFBaseLayer = tf_base_layers.Layer
 # pylint: enable=invalid-name
 
 
+@tf_export('keras.layers.Layer')
 class Layer(tf_base_layers.Layer):
   """Abstract base layer class.
 
@@ -109,7 +111,7 @@ class Layer(tf_base_layers.Layer):
       set_weights(weights)
       get_config()
       count_params()
-      _compute_output_shape(input_shape)
+      compute_output_shape(input_shape)
       compute_mask(x, mask)
       get_input_at(node_index)
       get_output_at(node_index)
@@ -259,6 +261,10 @@ class Layer(tf_base_layers.Layer):
     if context.in_eager_mode():
       return output
 
+    # Un-built subclassed network: build it
+    if isinstance(self, Network) and not self.inputs:
+      self._set_inputs(inputs)
+
     # Update learning phase info.
     output_tensors = _to_list(output)
     uses_lp = any(
@@ -274,7 +280,7 @@ class Layer(tf_base_layers.Layer):
       del self._initial_weights
     return output
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     """Computes the output shape of the layer.
 
     Assumes that the layer will be built
@@ -289,10 +295,13 @@ class Layer(tf_base_layers.Layer):
     Returns:
         An input shape tuple.
     """
-    if isinstance(input_shape, list):
-      return [tensor_shape.TensorShape(shape) for shape in input_shape]
-    else:
-      return tensor_shape.TensorShape(input_shape)
+    logging.warning(
+        'All custom layers should implement the '
+        '`compute_output_shape` method. This layer (' + self.name + ') '
+        'is relying on the base `Layer.compute_output_shape` implementation, '
+        'which will start raising a `NotImplementedError` '
+        'as of July 1st, 2018.')
+    return input_shape
 
   def compute_mask(self, inputs, mask=None):  # pylint: disable=unused-argument
     """Computes an output mask tensor.
@@ -487,6 +496,7 @@ class Layer(tf_base_layers.Layer):
     self._activity_regularizer = activity_regularizer
 
 
+@tf_export('keras.layers.InputLayer')
 class InputLayer(tf_network.InputLayer, Layer):
   """Layer to be used as an entry point into a graph.
 
@@ -549,6 +559,7 @@ class InputLayer(tf_network.InputLayer, Layer):
     return config
 
 
+@tf_export('keras.layers.Input', 'keras.Input')
 def Input(  # pylint: disable=invalid-name
     shape=None,
     batch_size=None,
@@ -674,10 +685,26 @@ class Network(tf_network.GraphNetwork, Layer):
       from_config
   """
 
-  def __init__(self, inputs, outputs, name=None):
+  def __init__(self, *args, **kwargs):  # pylint: disable=super-init-not-called
+    # Signature detection
+    if (len(args) == 2 or
+        len(args) == 1 and 'outputs' in kwargs or
+        'inputs' in kwargs and 'outputs' in kwargs):
+      # Graph network
+      self._init_graph_network(*args, **kwargs)
+    else:
+      # Subclassed network
+      self._init_subclassed_network(**kwargs)
+
+  def _init_graph_network(self, inputs, outputs, name=None):
+    # TODO(fchollet): merge back tf.layers.Network and tf.keras.Network
+    # into a single class tf.keras.Network
     super(Network, self).__init__(inputs, outputs, name=name)
 
+    self._is_compiled = False
     self.supports_masking = False
+    self.optimizer = None
+
     # Fill in the output mask cache.
     masks = []
     for x in self.inputs:
@@ -705,13 +732,63 @@ class Network(tf_network.GraphNetwork, Layer):
       self.input_names.append(layer.name)
       if layer.is_placeholder:
         self._feed_input_names.append(layer.name)
-        self._feed_inputs.append(layer.input)
         self._feed_input_shapes.append(K.int_shape(self.inputs[i]))
+        # layer.input gives an error in eager mode
+        if context.in_graph_mode():
+          self._feed_inputs.append(layer.input)
     for layer in self._output_layers:
       self.output_names.append(layer.name)
 
-    self.internal_input_shapes = [K.int_shape(x) for x in self.inputs]
-    self.internal_output_shapes = [K.int_shape(x) for x in self.outputs]
+  def _init_subclassed_network(self, name=None):
+    self._init_set_name(name)
+    self._layers = []
+    self._is_graph_network = False
+    self._is_compiled = False
+    self.outputs = None
+    self.inputs = None
+    self.trainable = True
+    self.supports_masking = False
+    self.built = False
+    self.optimizer = None
+
+    # Not used, exists for compatibility purposes due to implementation of
+    # the base layer tf.layers.Layer - TODO(fchollet): clean up when refactoring
+    self._scope = None
+    self._reuse = None
+    self._dtype = None
+    self._graph = None
+    self._activity_regularizer = None
+
+    # Used in symbolic mode only
+    self._updates = []
+    self._losses = []
+
+    # Used in symbolic mode only, only in conjonction with graph-networks
+    self._outbound_nodes = []
+    self._inbound_nodes = []
+
+  def __setattr__(self, name, value):
+    if isinstance(value, (tf_base_layers.Layer, Network)):
+      try:
+        is_graph_network = self._is_graph_network
+      except AttributeError:
+        raise RuntimeError('It looks like you are subclassing `Model` and you '
+                           'forgot to call `super(YourClass, self).__init__()`.'
+                           ' Always start with this line.')
+      if not is_graph_network:
+        if value not in self._layers:
+          self._layers.append(value)
+    super(Network, self).__setattr__(name, value)
+
+  def add_variable(self, name, shape, dtype=None, initializer=None,
+                   regularizer=None, trainable=True, constraint=None):
+    raise NotImplementedError('`add_variable` is not supported on Networks')
+
+  def add_loss(self, *args, **kwargs):
+    if context.in_eager_mode():
+      raise NotImplementedError('`add_loss` is not supported in eager-mode '
+                                'on Networks')
+    super(Network, self).add_loss(*args, **kwargs)
 
   @property
   def uses_learning_phase(self):
@@ -774,13 +851,16 @@ class Network(tf_network.GraphNetwork, Layer):
     K.batch_set_value(tuples)
 
   def compute_mask(self, inputs, mask):
+    if not self._is_graph_network:
+      return None
+
     inputs = _to_list(inputs)
     if mask is None:
       masks = [None for _ in range(len(inputs))]
     else:
       masks = _to_list(mask)
-    cache_key = ','.join([str(id(x)) for x in inputs])
-    cache_key += '_' + ','.join([str(id(x)) for x in masks])
+    cache_key = (tf_layers_util.object_list_uid(inputs)
+                 + '_' + tf_layers_util.object_list_uid(masks))
     if cache_key in self._output_mask_cache:
       return self._output_mask_cache[cache_key]
     else:
@@ -788,6 +868,9 @@ class Network(tf_network.GraphNetwork, Layer):
       return output_masks
 
   def get_config(self):
+    if not self._is_graph_network:
+      raise NotImplementedError
+
     config = {
         'name': self.name,
     }
@@ -1037,6 +1120,9 @@ class Network(tf_network.GraphNetwork, Layer):
     model = load_model('my_model.h5')
     ```
     """
+    if not self._is_graph_network:
+      raise NotImplementedError
+
     from tensorflow.python.keras._impl.keras.models import save_model  # pylint: disable=g-import-not-at-top
     save_model(self, filepath, overwrite, include_optimizer)
 
@@ -1139,6 +1225,8 @@ class Network(tf_network.GraphNetwork, Layer):
     Returns:
         A JSON string.
     """
+    if not self._is_graph_network:
+      raise NotImplementedError
 
     def get_json_type(obj):
       # If obj is any numpy type
@@ -1174,6 +1262,9 @@ class Network(tf_network.GraphNetwork, Layer):
     Raises:
         ImportError: if yaml module is not found.
     """
+    if not self._is_graph_network:
+      raise NotImplementedError
+
     if yaml is None:
       raise ImportError('Requires yaml module installed.')
     return yaml.dump(self._updated_config(), **kwargs)
@@ -1301,18 +1392,17 @@ def preprocess_weights_for_loading(layer,
   Returns:
       A list of weights values (Numpy arrays).
   """
-  if original_keras_version == '1':
-    if layer.__class__.__name__ == 'Bidirectional':
-      num_weights_per_layer = len(weights) // 2
-
-      forward_weights = preprocess_weights_for_loading(
-          layer.forward_layer, weights[:num_weights_per_layer],
-          original_keras_version, original_backend)
-      backward_weights = preprocess_weights_for_loading(
-          layer.backward_layer, weights[num_weights_per_layer:],
-          original_keras_version, original_backend)
-      weights = forward_weights + backward_weights
+  if layer.__class__.__name__ == 'Bidirectional':
+    num_weights_per_layer = len(weights) // 2
+    forward_weights = preprocess_weights_for_loading(
+        layer.forward_layer, weights[:num_weights_per_layer],
+        original_keras_version, original_backend)
+    backward_weights = preprocess_weights_for_loading(
+        layer.backward_layer, weights[num_weights_per_layer:],
+        original_keras_version, original_backend)
+    weights = forward_weights + backward_weights
 
+  if original_keras_version == '1':
     if layer.__class__.__name__ == 'TimeDistributed':
       weights = preprocess_weights_for_loading(
           layer.layer, weights, original_keras_version, original_backend)
@@ -1416,7 +1506,7 @@ def preprocess_weights_for_loading(layer,
 
   conv_layers = ['Conv1D', 'Conv2D', 'Conv3D', 'Conv2DTranspose', 'ConvLSTM2D']
   if layer.__class__.__name__ in conv_layers:
-    if original_backend and K.backend() != original_backend:
+    if original_backend == 'theano':
       weights[0] = conv_utils.convert_kernel(weights[0])
       if layer.__class__.__name__ == 'ConvLSTM2D':
         weights[1] = conv_utils.convert_kernel(weights[1])
@@ -1425,11 +1515,11 @@ def preprocess_weights_for_loading(layer,
       if layer.__class__.__name__ == 'ConvLSTM2D':
         weights[1] = np.transpose(weights[1], (3, 2, 0, 1))
 
-  # convert the weights of CuDNNLSTM so that they could be loaded into LSTM
-  if layer.__class__.__name__ == 'LSTM':
-    # determine if we're loading a CuDNNLSTM layer from the number of bias
-    # weights:
+  # Convert the weights of CuDNNLSTM so that they could be loaded into LSTM
+  if layer.__class__.__name__ == 'LSTM' and len(weights) == 3:
+    # Determine if loading a CuDNNLSTM layer from the number of bias weights:
     # CuDNNLSTM has (units * 8) weights; while LSTM has (units * 4)
+    # if there's no bias weight in the file, skip this conversion
     units = weights[1].shape[0]
     bias = weights[2]
     if len(bias) == units * 8:
@@ -1569,3 +1659,31 @@ def load_weights_from_hdf5_group_by_name(f, layers):
       for i in range(len(weight_values)):
         weight_value_tuples.append((symbolic_weights[i], weight_values[i]))
   K.batch_set_value(weight_value_tuples)
+
+
+def shape_type_conversion(fn):
+  """Decorator that handles tuple/TensorShape conversion.
+
+  Used in `compute_output_shape` and `build`.
+
+  Arguments:
+    fn: function to wrap.
+
+  Returns:
+    Wrapped function.
+  """
+
+  def wrapper(instance, input_shape):
+    if input_shape is not None:
+      if isinstance(input_shape, list):
+        input_shape = [
+            tuple(tensor_shape.TensorShape(x).as_list()) for x in input_shape]
+      else:
+        input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
+    output_shape = fn(instance, input_shape)
+    if output_shape is not None:
+      if isinstance(output_shape, list):
+        return [tensor_shape.TensorShape(x) for x in output_shape]
+      return tensor_shape.TensorShape(output_shape)
+
+  return wrapper
diff --git a/tensorflow/python/keras/_impl/keras/engine/topology_test.py b/tensorflow/python/keras/_impl/keras/engine/topology_test.py
index 32e692ba7c22007fc717792f05d5511dc6317cca..0673e4237674cf01c3df5ab7dc8e13f1de03e477 100644
--- a/tensorflow/python/keras/_impl/keras/engine/topology_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/topology_test.py
@@ -26,6 +26,8 @@ import numpy as np
 from tensorflow.python.framework import dtypes
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import test
 
 try:
@@ -42,22 +44,28 @@ except ImportError:
 class TopologyConstructionTest(test.TestCase):
 
   def test_get_updates_for(self):
-    a = keras.layers.Input(shape=(2,))
+    a = keras.layers.Input(shape=(1,))
     dense_layer = keras.layers.Dense(1)
-    dense_layer.add_update(0, inputs=a)
-    dense_layer.add_update(1, inputs=None)
+    dense_layer.build((None, 1))
+    update_1 = state_ops.assign_add(dense_layer.kernel, a)
+    update_2 = state_ops.assign_add(dense_layer.kernel, [[1.]])
+    dense_layer.add_update(update_1, inputs=a)
+    dense_layer.add_update(update_2, inputs=None)
 
-    self.assertListEqual(dense_layer.get_updates_for(a), [0])
-    self.assertListEqual(dense_layer.get_updates_for(None), [1])
+    self.assertListEqual(dense_layer.get_updates_for(a), [update_1])
+    self.assertListEqual(dense_layer.get_updates_for(None), [update_2])
 
   def test_get_losses_for(self):
-    a = keras.layers.Input(shape=(2,))
+    a = keras.layers.Input(shape=(1,))
     dense_layer = keras.layers.Dense(1)
-    dense_layer.add_loss(0, inputs=a)
-    dense_layer.add_loss(1, inputs=None)
+    dense_layer.build((None, 1))
+    loss_1 = math_ops.reduce_sum(a)
+    loss_2 = math_ops.reduce_sum(dense_layer.kernel)
+    dense_layer.add_loss(loss_1, inputs=a)
+    dense_layer.add_loss(loss_2, inputs=None)
 
-    self.assertListEqual(dense_layer.get_losses_for(a), [0])
-    self.assertListEqual(dense_layer.get_losses_for(None), [1])
+    self.assertListEqual(dense_layer.get_losses_for(a), [loss_1])
+    self.assertListEqual(dense_layer.get_losses_for(None), [loss_2])
 
   def test_trainable_weights(self):
     a = keras.layers.Input(shape=(2,))
@@ -279,7 +287,7 @@ class TopologyConstructionTest(test.TestCase):
 
       model = keras.models.Model(inputs=[a, b], outputs=[c, d], name='model')
       self.assertEqual(len(model.layers), 6)
-      output_shapes = model._compute_output_shape([(None, 32), (None, 32)])
+      output_shapes = model.compute_output_shape([(None, 32), (None, 32)])
       self.assertListEqual(output_shapes[0].as_list(), [None, 64])
       self.assertListEqual(output_shapes[1].as_list(), [None, 5])
       self.assertListEqual(
@@ -340,6 +348,7 @@ class TopologyConstructionTest(test.TestCase):
       e = keras.layers.Input(shape=(32,), name='input_e')
       f = keras.layers.Input(shape=(32,), name='input_f')
       g, h = model([e, f])
+      self.assertEqual(g.name, 'model_1/dense_2/BiasAdd:0')
 
       self.assertListEqual(g.get_shape().as_list(), c.get_shape().as_list())
       self.assertListEqual(h.get_shape().as_list(), d.get_shape().as_list())
@@ -360,8 +369,8 @@ class TopologyConstructionTest(test.TestCase):
       self.assertListEqual(
           model.compute_mask([e, f], [None, None]), [None, None])
       self.assertListEqual(
-          final_model._compute_output_shape([(10, 32), (10, 32)]), [(10, 7),
-                                                                    (10, 64)])
+          final_model.compute_output_shape([(10, 32), (10, 32)]), [(10, 7),
+                                                                   (10, 64)])
 
       # run recursive model
       fn = keras.backend.function(final_model.inputs, final_model.outputs)
diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py
index b4205bf4a397690ce6dd3424e0dd4076d9860e9d..a71f371b8e28d0ca1a9d17edf16e647279a87a78 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training.py
@@ -12,9 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Keras training and evaluation routines.
+"""Training-related part of the Keras engine.
 """
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -23,17 +22,31 @@ import copy
 
 import numpy as np
 
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import callbacks as cbks
 from tensorflow.python.keras._impl.keras import losses
 from tensorflow.python.keras._impl.keras import metrics as metrics_module
 from tensorflow.python.keras._impl.keras import optimizers
+from tensorflow.python.keras._impl.keras.engine import training_eager
 from tensorflow.python.keras._impl.keras.engine.topology import Network
 from tensorflow.python.keras._impl.keras.utils.data_utils import GeneratorEnqueuer
 from tensorflow.python.keras._impl.keras.utils.data_utils import OrderedEnqueuer
 from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
+from tensorflow.python.keras._impl.keras.utils.generic_utils import make_batches
 from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
+from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
+from tensorflow.python.layers.base import _DeferredTensor
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import optimizer as tf_optimizer_module
+from tensorflow.python.util.tf_export import tf_export
+
+try:
+  from scipy.sparse import issparse  # pylint: disable=g-import-not-at-top
+except ImportError:
+  issparse = None
 
 
 def _standardize_input_data(data,
@@ -70,89 +83,75 @@ def _standardize_input_data(data,
     return []
   if data is None:
     return [None for _ in range(len(names))]
+
   if isinstance(data, dict):
-    for key, value in data.items():
-      if value.__class__.__name__ == 'DataFrame':
-        data[key] = value.values
-    arrays = []
-    for name in names:
-      if name not in data:
-        raise ValueError('No data provided for "' + name +
-                         '". Need data for each key in: ' + str(names))
-      arrays.append(data[name])
+    try:
+      data = [
+          data[x].values
+          if data[x].__class__.__name__ == 'DataFrame' else data[x]
+          for x in names
+      ]
+    except KeyError as e:
+      raise ValueError('No data provided for "' + e.args[0] + '". Need data '
+                       'for each key in: ' + str(names))
   elif isinstance(data, list):
-    for key, value in enumerate(data):
-      if value.__class__.__name__ == 'DataFrame':
-        data[key] = value.values
-    if len(data) != len(names):
-      if data and hasattr(data[0], 'shape'):
-        raise ValueError(
-            'Error when checking model ' + exception_prefix +
-            ': the list of Numpy arrays '
-            'that you are passing to your model '
-            'is not the size the model expected. '
-            'Expected to see ' + str(len(names)) + ' array(s), but instead got '
-            'the following list of ' + str(len(data)) + ' arrays: ' +
-            str(data)[:200] + '...')
-      else:
-        if len(names) == 1:
-          data = [np.asarray(data)]
-        else:
-          raise ValueError('Error when checking model ' + exception_prefix +
-                           ': you are passing a list as '
-                           'input to your model, '
-                           'but the model expects '
-                           'a list of ' + str(len(names)) +
-                           ' Numpy arrays instead. '
-                           'The list you passed was: ' + str(data)[:200])
-    arrays = data
-  elif data.__class__.__name__ == 'DataFrame':
-    # test if data is a DataFrame, without pandas installed
-    arrays = data.values
+    if isinstance(data[0], list):
+      data = [np.asarray(d) for d in data]
+    elif len(names) == 1 and isinstance(data[0], (float, int)):
+      data = [np.asarray(data)]
+    else:
+      data = [
+          x.values if x.__class__.__name__ == 'DataFrame' else x for x in data
+      ]
   else:
-    if not hasattr(data, 'shape'):
+    data = data.values if data.__class__.__name__ == 'DataFrame' else data
+    data = [data]
+  data = [
+      np.expand_dims(x, 1) if x is not None and x.ndim == 1 else x for x in data
+  ]
+
+  if len(data) != len(names):
+    if data and hasattr(data[0], 'shape'):
+      raise ValueError('Error when checking model ' + exception_prefix +
+                       ': the list of Numpy arrays that you are passing to '
+                       'your model is not the size the model expected. '
+                       'Expected to see ' + str(len(names)) + ' array(s), '
+                       'but instead got the following list of ' +
+                       str(len(data)) + ' arrays: ' + str(data)[:200] + '...')
+    elif len(names) > 1:
+      raise ValueError(
+          'Error when checking model ' + exception_prefix +
+          ': you are passing a list as input to your model, '
+          'but the model expects a list of ' + str(len(names)) +
+          ' Numpy arrays instead. The list you passed was: ' + str(data)[:200])
+    elif len(data) == 1 and not hasattr(data[0], 'shape'):
       raise TypeError('Error when checking model ' + exception_prefix +
-                      ': data should be a Numpy array, '
-                      'or list/dict of Numpy arrays. '
-                      'Found: ' + str(data)[:200] + '...')
-    if len(names) > 1:
-      # Case: model expects multiple inputs but only received
-      # a single Numpy array.
-      raise ValueError('The model expects ' + str(len(names)) + ' ' +
-                       exception_prefix +
-                       ' arrays, but only received one array. '
-                       'Found: array with shape ' + str(data.shape))
-    arrays = [data]
-
-  # Make arrays at least 2D.
-  for i in range(len(names)):
-    array = arrays[i]
-    if len(array.shape) == 1:
-      array = np.expand_dims(array, 1)
-      arrays[i] = array
+                      ': data should be a Numpy array, or list/dict of '
+                      'Numpy arrays. Found: ' + str(data)[:200] + '...')
+    elif len(names) == 1:
+      data = [np.asarray(data)]
 
   # Check shapes compatibility.
   if shapes:
     for i in range(len(names)):
-      if shapes[i] is None:
-        continue
-      array = arrays[i]
-      if len(array.shape) != len(shapes[i]):
-        raise ValueError(
-            'Error when checking ' + exception_prefix + ': expected ' + names[i]
-            + ' to have ' + str(len(shapes[i])) +
-            ' dimensions, but got array with shape ' + str(array.shape))
-      for j, (dim, ref_dim) in enumerate(zip(array.shape, shapes[i])):
-        if not j and not check_batch_axis:
-          # skip the first axis
-          continue
-        if ref_dim:
-          if ref_dim != dim:
-            raise ValueError('Error when checking ' + exception_prefix +
-                             ': expected ' + names[i] + ' to have shape ' +
-                             str(shapes[i]) + ' but got array with shape ' +
-                             str(array.shape))
-  return arrays
+      if shapes[i] is not None:
+        data_shape = data[i].shape
+        shape = shapes[i]
+        if data[i].ndim != len(shape):
+          raise ValueError('Error when checking ' + exception_prefix +
+                           ': expected ' + names[i] + ' to have ' +
+                           str(len(shape)) + ' dimensions, but got array '
+                           'with shape ' + str(data_shape))
+        if not check_batch_axis:
+          data_shape = data_shape[1:]
+          shape = shape[1:]
+        for dim, ref_dim in zip(data_shape, shape):
+          if ref_dim != dim and ref_dim:
+            raise ValueError(
+                'Error when checking ' + exception_prefix + ': expected ' +
+                names[i] + ' to have shape ' + str(shape) +
+                ' but got array with shape ' + str(data_shape))
+  return data
 
 
 def _standardize_sample_or_class_weights(x_weight, output_names, weight_type):
@@ -193,10 +192,10 @@ def _standardize_sample_or_class_weights(x_weight, output_names, weight_type):
       x_weights.append(x_weight.get(name))
     return x_weights
   else:
-    raise TypeError('The model has multiple outputs, so `' + weight_type + '` '
-                    'should be either a list of a dict. '
-                    'Provided `' + weight_type + '` type not understood: ' +
-                    str(x_weight))
+    raise TypeError(
+        'The model has multiple outputs, so `' + weight_type + '` '
+        'should be either a list or a dict. '
+        'Provided `' + weight_type + '` type not understood: ' + str(x_weight))
 
 
 def _standardize_class_weights(class_weight, output_names):
@@ -234,12 +233,12 @@ def _check_array_lengths(inputs, targets, weights=None):
   set_w = set_of_lengths(weights)
   if len(set_x) > 1:
     raise ValueError('All input arrays (x) should have '
-                     'the same number of samples. Got array shapes: ' + str(
-                         [x.shape for x in inputs]))
+                     'the same number of samples. Got array shapes: ' +
+                     str([x.shape for x in inputs]))
   if len(set_y) > 1:
     raise ValueError('All target arrays (y) should have '
-                     'the same number of samples. Got array shapes: ' + str(
-                         [y.shape for y in targets]))
+                     'the same number of samples. Got array shapes: ' +
+                     str([y.shape for y in targets]))
   if set_x and set_y and list(set_x)[0] != list(set_y)[0]:
     raise ValueError('Input arrays should have '
                      'the same number of samples as target arrays. '
@@ -247,8 +246,8 @@ def _check_array_lengths(inputs, targets, weights=None):
                      'and ' + str(list(set_y)[0]) + ' target samples.')
   if len(set_w) > 1:
     raise ValueError('All sample_weight arrays should have '
-                     'the same number of samples. Got array shapes: ' + str(
-                         [w.shape for w in weights]))
+                     'the same number of samples. Got array shapes: ' +
+                     str([w.shape for w in weights]))
   if set_y and set_w and list(set_y)[0] != list(set_w)[0]:
     raise ValueError('Sample_weight arrays should have '
                      'the same number of samples as target arrays. Got ' +
@@ -365,62 +364,6 @@ def _batch_shuffle(index_array, batch_size):
   return np.append(index_array, last_batch)
 
 
-def _make_batches(size, batch_size):
-  """Returns a list of batch indices (tuples of indices).
-
-  Arguments:
-      size: Integer, total size of the data to slice into batches.
-      batch_size: Integer, batch size.
-
-  Returns:
-      A list of tuples of array indices.
-  """
-  num_batches = int(np.ceil(size / float(batch_size)))
-  return [(i * batch_size, min(size, (i + 1) * batch_size))
-          for i in range(num_batches)]
-
-
-def _slice_arrays(arrays, start=None, stop=None):
-  """Slice an array or list of arrays.
-
-  This takes an array-like, or a list of
-  array-likes, and outputs:
-      - arrays[start:stop] if `arrays` is an array-like
-      - [x[start:stop] for x in arrays] if `arrays` is a list
-
-  Can also work on list/array of indices: `_slice_arrays(x, indices)`
-
-  Arguments:
-      arrays: Single array or list of arrays.
-      start: can be an integer index (start index)
-          or a list/array of indices
-      stop: integer (stop index); should be None if
-          `start` was a list.
-
-  Returns:
-      A slice of the array(s).
-  """
-  if arrays is None:
-    return [None]
-  elif isinstance(arrays, list):
-    if hasattr(start, '__len__'):
-      # hdf5 datasets only support list objects as indices
-      if hasattr(start, 'shape'):
-        start = start.tolist()
-      return [None if x is None else x[start] for x in arrays]
-    else:
-      return [None if x is None else x[start:stop] for x in arrays]
-  else:
-    if hasattr(start, '__len__'):
-      if hasattr(start, 'shape'):
-        start = start.tolist()
-      return arrays[start]
-    elif hasattr(start, '__getitem__'):
-      return arrays[start:stop]
-    else:
-      return [None]
-
-
 def _weighted_masked_objective(fn):
   """Adds support for masking and sample-weighting to an objective function.
 
@@ -528,16 +471,16 @@ def _standardize_weights(y,
 
   if sample_weight is not None:
     if len(sample_weight.shape) > len(y.shape):
-      raise ValueError('Found a sample_weight with shape' +
-                       str(sample_weight.shape) + '.'
-                       'Expected sample_weight with rank '
-                       'less than or equal to ' + str(len(y.shape)))
+      raise ValueError(
+          'Found a sample_weight with shape' + str(sample_weight.shape) + '.'
+          'Expected sample_weight with rank '
+          'less than or equal to ' + str(len(y.shape)))
 
     if y.shape[:sample_weight.ndim] != sample_weight.shape:
-      raise ValueError('Found a sample_weight array with shape ' +
-                       str(sample_weight.shape) + ' for an input with shape ' +
-                       str(y.shape) + '. '
-                       'sample_weight cannot be broadcast.')
+      raise ValueError(
+          'Found a sample_weight array with shape ' + str(sample_weight.shape) +
+          ' for an input with shape ' + str(y.shape) + '. '
+          'sample_weight cannot be broadcast.')
     return sample_weight
   elif isinstance(class_weight, dict):
     if len(y.shape) > 2:
@@ -569,6 +512,7 @@ def _standardize_weights(y,
       return np.ones((y.shape[0], y.shape[1]), dtype=K.floatx())
 
 
+@tf_export('keras.models.Model', 'keras.Model')
 class Model(Network):
   """The `Model` class adds training & evaluation routines to a `Network`.
   """
@@ -631,20 +575,39 @@ class Model(Network):
             `optimizer`, `loss`, `metrics` or `sample_weight_mode`.
     """
     loss = loss or {}
+    if context.in_eager_mode() and  not isinstance(
+        optimizer, (tf_optimizer_module.Optimizer, optimizers.TFOptimizer)):
+      raise ValueError('Only TF native optimizers are supported in Eager mode.')
+
     self.optimizer = optimizers.get(optimizer)
-    self.sample_weight_mode = sample_weight_mode
     self.loss = loss
+    self.metrics = metrics
     self.loss_weights = loss_weights
+    if context.in_eager_mode() and sample_weight_mode is not None:
+      raise ValueError('sample_weight_mode is not supported in Eager mode.')
     self.sample_weight_mode = sample_weight_mode
+    if context.in_eager_mode() and weighted_metrics is not None:
+      raise ValueError('weighted_metrics is not supported in Eager mode.')
+    self.weighted_metrics = weighted_metrics
+    if context.in_eager_mode() and target_tensors is not None:
+      raise ValueError('target_tensors is not supported in Eager mode.')
+    self.target_tensors = target_tensors
+
+    if not self.built:
+      # Model is not compilable because it does not know its number of inputs
+      # and outputs, nor their shapes and names. We will compile after the first
+      # time the model gets called on training data.
+      return
+    self._is_compiled = True
 
     # Prepare loss functions.
     if isinstance(loss, dict):
       for name in loss:
         if name not in self.output_names:
-          raise ValueError('Unknown entry in loss '
-                           'dictionary: "' + name + '". '
-                           'Only expected the following keys: ' +
-                           str(self.output_names))
+          raise ValueError(
+              'Unknown entry in loss '
+              'dictionary: "' + name + '". '
+              'Only expected the following keys: ' + str(self.output_names))
       loss_functions = []
       for name in self.output_names:
         if name not in loss:
@@ -665,6 +628,7 @@ class Model(Network):
       loss_function = losses.get(loss)
       loss_functions = [loss_function for _ in range(len(self.outputs))]
     self.loss_functions = loss_functions
+
     weighted_losses = [_weighted_masked_objective(fn) for fn in loss_functions]
     skip_target_indices = []
     skip_target_weighing_indices = []
@@ -678,11 +642,12 @@ class Model(Network):
         skip_target_weighing_indices.append(i)
 
     # Prepare output masks.
-    masks = self.compute_mask(self.inputs, mask=None)
-    if masks is None:
-      masks = [None for _ in self.outputs]
-    if not isinstance(masks, list):
-      masks = [masks]
+    if context.in_graph_mode():
+      masks = self.compute_mask(self.inputs, mask=None)
+      if masks is None:
+        masks = [None for _ in self.outputs]
+      if not isinstance(masks, list):
+        masks = [masks]
 
     # Prepare loss weights.
     if loss_weights is None:
@@ -690,57 +655,81 @@ class Model(Network):
     elif isinstance(loss_weights, dict):
       for name in loss_weights:
         if name not in self.output_names:
-          raise ValueError('Unknown entry in loss_weights '
-                           'dictionary: "' + name + '". '
-                           'Only expected the following keys: ' +
-                           str(self.output_names))
+          raise ValueError(
+              'Unknown entry in loss_weights '
+              'dictionary: "' + name + '". '
+              'Only expected the following keys: ' + str(self.output_names))
       loss_weights_list = []
       for name in self.output_names:
         loss_weights_list.append(loss_weights.get(name, 1.))
     elif isinstance(loss_weights, list):
       if len(loss_weights) != len(self.outputs):
-        raise ValueError('When passing a list as loss_weights, '
-                         'it should have one entry per model outputs. '
-                         'The model has ' + str(len(self.outputs)) +
-                         ' outputs, but you passed loss_weights=' +
-                         str(loss_weights))
+        raise ValueError(
+            'When passing a list as loss_weights, '
+            'it should have one entry per model output. '
+            'The model has ' + str(len(self.outputs)) +
+            ' outputs, but you passed loss_weights=' + str(loss_weights))
       loss_weights_list = loss_weights
     else:
       raise TypeError('Could not interpret loss_weights argument: ' +
                       str(loss_weights) + ' - expected a list of dicts.')
+    self.loss_weights_list = loss_weights_list
+
+    # initialization for Eager mode execution
+    if context.in_eager_mode():
+      if target_tensors is not None:
+        raise ValueError('target_tensors are not currently supported in Eager'
+                         'mode.')
+      self.total_loss = None
+      self.metrics_tensors = []
+      self.metrics_names = ['loss']
+      for i in range(len(self.outputs)):
+        if len(self.outputs) > 1:
+          self.metrics_names.append(self.output_names[i] + '_loss')
+      self.nested_metrics = _collect_metrics(metrics, self.output_names)
+      self._feed_sample_weight_modes = []
+      for i in range(len(self.outputs)):
+        self._feed_sample_weight_modes.append(None)
+      self.sample_weights = []
+      self.targets = []
+      for i in range(len(self.outputs)):
+        self._feed_output_names.append(self.output_names[i])
+      self._collected_trainable_weights = self.trainable_weights
+      return
 
     # Prepare targets of model.
     self.targets = []
     self._feed_targets = []
-    if target_tensors is not None:
+    if target_tensors not in (None, []):
       if isinstance(target_tensors, list):
         if len(target_tensors) != len(self.outputs):
-          raise ValueError('When passing a list as `target_tensors`, '
-                           'it should have one entry per model outputs. '
-                           'The model has ' + str(len(self.outputs)) +
-                           ' outputs, but you passed target_tensors=' +
-                           str(target_tensors))
+          raise ValueError(
+              'When passing a list as `target_tensors`, '
+              'it should have one entry per model output. '
+              'The model has ' + str(len(self.outputs)) +
+              ' outputs, but you passed target_tensors=' + str(target_tensors))
       elif isinstance(target_tensors, dict):
         for name in target_tensors:
           if name not in self.output_names:
-            raise ValueError('Unknown entry in `target_tensors` '
-                             'dictionary: "' + name + '". '
-                             'Only expected the following keys: ' +
-                             str(self.output_names))
-        target_tensors_ = []
+            raise ValueError(
+                'Unknown entry in `target_tensors` '
+                'dictionary: "' + name + '". '
+                'Only expected the following keys: ' + str(self.output_names))
+        tmp_target_tensors = []
         for name in self.output_names:
-          target_tensors_.append(target_tensors.get(name, None))
-        target_tensors = target_tensors_
+          tmp_target_tensors.append(target_tensors.get(name, None))
+        target_tensors = tmp_target_tensors
       else:
         raise TypeError('Expected `target_tensors` to be '
                         'a list or dict, but got:', target_tensors)
+
     for i in range(len(self.outputs)):
       if i in skip_target_indices:
         self.targets.append(None)
       else:
-        shape = self.internal_output_shapes[i]
+        shape = K.int_shape(self.outputs[i])
         name = self.output_names[i]
-        if target_tensors is not None:
+        if target_tensors not in (None, []):
           target = target_tensors[i]
         else:
           target = None
@@ -766,30 +755,30 @@ class Model(Network):
     if isinstance(sample_weight_mode, dict):
       for name in sample_weight_mode:
         if name not in self.output_names:
-          raise ValueError('Unknown entry in '
-                           'sample_weight_mode dictionary: "' + name + '". '
-                           'Only expected the following keys: ' +
-                           str(self.output_names))
+          raise ValueError(
+              'Unknown entry in '
+              'sample_weight_mode dictionary: "' + name + '". '
+              'Only expected the following keys: ' + str(self.output_names))
       for i, name in enumerate(self.output_names):
         if i in skip_target_weighing_indices:
           weight = None
           sample_weight_modes.append(None)
         else:
           if name not in sample_weight_mode:
-            raise ValueError('Output "' + name +
-                             '" missing from sample_weight_modes '
-                             'dictionary')
+            raise ValueError(
+                'Output "' + name + '" missing from sample_weight_modes '
+                'dictionary')
           if sample_weight_mode.get(name) == 'temporal':
             weight = K.placeholder(ndim=2, name=name + '_sample_weights')
             sample_weight_modes.append('temporal')
           else:
-            weight = K.placeholder(ndim=1, name=name + '_sample_weights')
+            weight = K.placeholder(ndim=1, name=name + 'sample_weights')
             sample_weight_modes.append(None)
         sample_weights.append(weight)
     elif isinstance(sample_weight_mode, list):
       if len(sample_weight_mode) != len(self.outputs):
         raise ValueError('When passing a list as sample_weight_mode, '
-                         'it should have one entry per model outputs. '
+                         'it should have one entry per model output. '
                          'The model has ' + str(len(self.outputs)) +
                          ' outputs, but you passed '
                          'sample_weight_mode=' + str(sample_weight_mode))
@@ -894,23 +883,36 @@ class Model(Network):
           metric_name_prefix = 'weighted_' if weights is not None else ''
 
           for metric in metrics:
-            if metric == 'accuracy' or metric == 'acc':
-              # custom handling of accuracy
+            if metric in ('accuracy', 'acc', 'crossentropy', 'ce'):
+              # custom handling of accuracy/crossentropy
               # (because of class mode duality)
-              output_shape = self.internal_output_shapes[i]
+              output_shape = K.int_shape(self.outputs[i])
               if (output_shape[-1] == 1 or
                   self.loss_functions[i] == losses.binary_crossentropy):
-                # case: binary accuracy
-                acc_fn = metrics_module.binary_accuracy
+                # case: binary accuracy/crossentropy
+                if metric in ('accuracy', 'acc'):
+                  acc_fn = metrics_module.binary_accuracy
+                elif metric in ('crossentropy', 'ce'):
+                  acc_fn = metrics_module.binary_crossentropy
               elif self.loss_functions[
                   i] == losses.sparse_categorical_crossentropy:
-                # case: categorical accuracy with sparse targets
-                acc_fn = metrics_module.sparse_categorical_accuracy
+                # case: categorical accuracy/crossentropy with sparse targets
+                if metric in ('accuracy', 'acc'):
+                  acc_fn = metrics_module.sparse_categorical_accuracy
+                elif metric in ('crossentropy', 'ce'):
+                  acc_fn = metrics_module.sparse_categorical_crossentropy
               else:
-                acc_fn = metrics_module.categorical_accuracy
-
+                # case: categorical accuracy/crossentropy
+                if metric in ('accuracy', 'acc'):
+                  acc_fn = metrics_module.categorical_accuracy
+                elif metric in ('crossentropy', 'ce'):
+                  acc_fn = metrics_module.categorical_crossentropy
+              if metric in ('accuracy', 'acc'):
+                suffix = 'acc'
+              elif metric in ('crossentropy', 'ce'):
+                suffix = 'ce'
               weighted_metric_fn = _weighted_masked_objective(acc_fn)
-              metric_name = metric_name_prefix + 'acc'
+              metric_name = metric_name_prefix + suffix
             else:
               metric_fn = metrics_module.get(metric)
               weighted_metric_fn = _weighted_masked_objective(metric_fn)
@@ -930,7 +932,7 @@ class Model(Network):
     self._feed_sample_weights = []
     for i in range(len(self.sample_weights)):
       if i not in skip_target_weighing_indices:
-        self._feed_sample_weights.append(sample_weights[i])
+        self._feed_sample_weights.append(self.sample_weights[i])
 
     # Functions for train, test and predict will
     # be compiled lazily when required.
@@ -949,7 +951,7 @@ class Model(Network):
     """Check trainable weights count consistency.
 
     This will raise a warning if `trainable_weights` and
-    `_collected_trainable_weights` are consistent (i.e. have the same
+    `_collected_trainable_weights` are inconsistent (i.e. have different
     number of parameters).
     Inconsistency will typically arise when one modifies `model.trainable`
     without calling `model.compile` again.
@@ -959,9 +961,10 @@ class Model(Network):
 
     if len(self.trainable_weights) != len(self._collected_trainable_weights):
       logging.warning(
-          'Discrepancy between trainable weights and collected trainable'
-          ' weights, did you set `model.trainable` without calling'
-          ' `model.compile` after ?')
+          UserWarning(
+              'Discrepancy between trainable weights and collected trainable'
+              ' weights, did you set `model.trainable` without calling'
+              ' `model.compile` after ?'))
 
   def _make_train_function(self):
     if not hasattr(self, 'train_function'):
@@ -976,9 +979,13 @@ class Model(Network):
 
       with K.name_scope('training'):
         with K.name_scope(self.optimizer.__class__.__name__):
-          training_updates = self.optimizer.get_updates(
+          # Training updates
+          updates = self.optimizer.get_updates(
               params=self._collected_trainable_weights, loss=self.total_loss)
-        updates = self.updates + training_updates
+        # Unconditional updates
+        updates += self.get_updates_for(None)
+        # Conditional updates relevant to this model
+        updates += self.get_updates_for(self._feed_inputs)
         # Gets loss and metrics. Updates weights at each call.
         self.train_function = K.function(
             inputs, [self.total_loss] + self.metrics_tensors,
@@ -1050,18 +1057,21 @@ class Model(Network):
         processed based on the size of the first dimension of the
         first input numpy array. When steps is not `None` and
         `batch_size` is `None`, returns `None`.
+
+    Raises:
+        ValueError: In case of invalid arguments.
     """
     if steps is not None:
       num_samples = None
       if batch_size is not None:
-        raise ValueError('If ' + steps_name +
-                         ' is set, the `batch_size` must be None.')
+        raise ValueError(
+            'If ' + steps_name + ' is set, the `batch_size` must be None.')
     elif ins and hasattr(ins[0], 'shape'):
       num_samples = ins[0].shape[0]
     else:
-      raise ValueError('Either the input data should have '
-                       'a defined shape, or ' + steps_name +
-                       ' should be specified.')
+      raise ValueError(
+          'Either the input data should have '
+          'a defined shape, or ' + steps_name + ' should be specified.')
     return num_samples
 
   def _fit_loop(self,
@@ -1104,31 +1114,33 @@ class Model(Network):
         steps_per_epoch: Total number of steps (batches of samples)
             before declaring one epoch finished and starting the
             next epoch. Ignored with the default value of `None`.
-        validation_steps: Number of steps to run validation for (only if doing
-          validation from data tensors). Ignored with default value of `None`.
+        validation_steps: Number of steps to run validation for
+            (only if doing validation from data tensors).
+            Ignored with the default value of `None`.
 
     Returns:
         `History` object.
 
     Raises:
-      ValueError: In case of invalid argument values.
+        ValueError: in case of invalid arguments.
     """
     do_validation = False
     if val_f and val_ins:
       do_validation = True
-      if (verbose and ins and
-          hasattr(ins[0], 'shape') and hasattr(val_ins[0], 'shape')):
+      if verbose and ins and hasattr(ins[0], 'shape') and hasattr(
+          val_ins[0], 'shape'):
         print('Train on %d samples, validate on %d samples' %
               (ins[0].shape[0], val_ins[0].shape[0]))
     if validation_steps:
-      if steps_per_epoch is None:
-        raise ValueError('Can only use `validation_steps` when doing step-wise '
-                         'training, i.e. `steps_per_epoch` must be set.')
       do_validation = True
+      if steps_per_epoch is None:
+        raise ValueError('Can only use `validation_steps` '
+                         'when doing step-wise '
+                         'training, i.e. `steps_per_epoch` '
+                         'must be set.')
 
     num_train_samples = self._check_num_samples(
         ins, batch_size, steps_per_epoch, 'steps_per_epoch')
-
     if num_train_samples is not None:
       index_array = np.arange(num_train_samples)
 
@@ -1151,6 +1163,7 @@ class Model(Network):
       callback_model = self
 
     callbacks.set_model(callback_model)
+
     callbacks.set_params({
         'batch_size': batch_size,
         'epochs': epochs,
@@ -1165,6 +1178,13 @@ class Model(Network):
     for cbk in callbacks:
       cbk.validation_data = val_ins
 
+    # To prevent a slowdown, we find beforehand the arrays that need conversion.
+    feed = self._feed_inputs + self._feed_targets + self._feed_sample_weights
+    indices_for_conversion_to_dense = []
+    for i in range(len(feed)):
+      if issparse is not None and issparse(ins[i]) and not K.is_sparse(feed[i]):
+        indices_for_conversion_to_dense.append(i)
+
     for epoch in range(initial_epoch, epochs):
       callbacks.on_epoch_begin(epoch)
       epoch_logs = {}
@@ -1203,15 +1223,16 @@ class Model(Network):
         elif shuffle:
           np.random.shuffle(index_array)
 
-        batches = _make_batches(num_train_samples, batch_size)
+        batches = make_batches(num_train_samples, batch_size)
+
         for batch_index, (batch_start, batch_end) in enumerate(batches):
           batch_ids = index_array[batch_start:batch_end]
           try:
             if isinstance(ins[-1], float):
               # Do not slice the training phase flag.
-              ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
+              ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
             else:
-              ins_batch = _slice_arrays(ins, batch_ids)
+              ins_batch = slice_arrays(ins, batch_ids)
           except TypeError:
             raise TypeError('TypeError while preparing batch. '
                             'If using HDF5 input data, '
@@ -1220,6 +1241,9 @@ class Model(Network):
           batch_logs['batch'] = batch_index
           batch_logs['size'] = len(batch_ids)
           callbacks.on_batch_begin(batch_index, batch_logs)
+          for i in indices_for_conversion_to_dense:
+            ins_batch[i] = ins_batch[i].toarray()
+
           outs = f(ins_batch)
           if not isinstance(outs, list):
             outs = [outs]
@@ -1268,6 +1292,13 @@ class Model(Network):
         progbar = Progbar(target=steps)
       else:
         progbar = Progbar(target=num_samples)
+
+    indices_for_conversion_to_dense = []
+    for i in range(len(self._feed_inputs)):
+      if (issparse is not None and issparse(ins[i]) and
+          not K.is_sparse(self._feed_inputs[i])):
+        indices_for_conversion_to_dense.append(i)
+
     if steps is not None:
       # Step-based predictions.
       # Since we do not know how many samples
@@ -1296,15 +1327,18 @@ class Model(Network):
     else:
       # Sample-based predictions.
       outs = []
-      batches = _make_batches(num_samples, batch_size)
+      batches = make_batches(num_samples, batch_size)
       index_array = np.arange(num_samples)
       for batch_index, (batch_start, batch_end) in enumerate(batches):
         batch_ids = index_array[batch_start:batch_end]
         if ins and isinstance(ins[-1], float):
           # Do not slice the training phase flag.
-          ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
+          ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
         else:
-          ins_batch = _slice_arrays(ins, batch_ids)
+          ins_batch = slice_arrays(ins, batch_ids)
+        for i in indices_for_conversion_to_dense:
+          ins_batch[i] = ins_batch[i].toarray()
+
         batch_outs = f(ins_batch)
         if not isinstance(batch_outs, list):
           batch_outs = [batch_outs]
@@ -1341,12 +1375,19 @@ class Model(Network):
     """
     num_samples = self._check_num_samples(ins, batch_size, steps, 'steps')
     outs = []
-
     if verbose == 1:
       if steps is not None:
         progbar = Progbar(target=steps)
       else:
         progbar = Progbar(target=num_samples)
+
+    # To prevent a slowdown, we find beforehand the arrays that need conversion.
+    feed = self._feed_inputs + self._feed_targets + self._feed_sample_weights
+    indices_for_conversion_to_dense = []
+    for i in range(len(feed)):
+      if issparse is not None and issparse(ins[i]) and not K.is_sparse(feed[i]):
+        indices_for_conversion_to_dense.append(i)
+
     if steps is not None:
       for step in range(steps):
         batch_outs = f(ins)
@@ -1365,19 +1406,20 @@ class Model(Network):
       for i in range(len(outs)):
         outs[i] /= steps
     else:
-      if verbose == 1:
-        progbar = Progbar(target=num_samples)
-      batches = _make_batches(num_samples, batch_size)
+      batches = make_batches(num_samples, batch_size)
       index_array = np.arange(num_samples)
       for batch_index, (batch_start, batch_end) in enumerate(batches):
         batch_ids = index_array[batch_start:batch_end]
         if isinstance(ins[-1], float):
           # Do not slice the training phase flag.
-          ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
+          ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
         else:
-          ins_batch = _slice_arrays(ins, batch_ids)
+          ins_batch = slice_arrays(ins, batch_ids)
+        for i in indices_for_conversion_to_dense:
+          ins_batch[i] = ins_batch[i].toarray()
 
         batch_outs = f(ins_batch)
+
         if isinstance(batch_outs, list):
           if batch_index == 0:
             for batch_out in enumerate(batch_outs):
@@ -1388,7 +1430,6 @@ class Model(Network):
           if batch_index == 0:
             outs.append(0.)
           outs[0] += batch_outs * len(batch_ids)
-
         if verbose == 1:
           progbar.update(batch_end)
       for i in range(len(outs)):
@@ -1399,48 +1440,213 @@ class Model(Network):
 
   def _standardize_user_data(self,
                              x,
-                             y,
+                             y=None,
                              sample_weight=None,
                              class_weight=None,
-                             check_batch_axis=True,
                              batch_size=None):
-    if not hasattr(self, 'optimizer'):
-      raise RuntimeError('You must compile a model before '
-                         'training/testing. '
-                         'Use `model.compile(optimizer, loss)`.')
-
-    output_shapes = []
-    for output_shape, loss_fn in zip(self._feed_output_shapes,
-                                     self._feed_loss_fns):
-      if loss_fn is losses.sparse_categorical_crossentropy:
-        output_shapes.append(output_shape[:-1] + (1,))
+    """Runs validation checks on input and target data passed by the user.
+
+    Also standardizes the data to lists of arrays, in order.
+
+    Also builds and compiles the model on the fly if it is a subclassed model
+    that has never been called before (and thus has no inputs/outputs).
+
+    This is a purely internal method, subject to refactoring at any time.
+
+    Args:
+      x: An array or list of arrays, to be used as input data. If the model
+       has known, named inputs, this could also be a dict mapping input names
+       to the corresponding array.
+      y: An array or list of arrays, to be used as target data. If the model
+       has known, named outputs, this could also be a dict mapping output names
+       to the corresponding array.
+      sample_weight: An optional sample-weight array passed by the user to
+        weight the importance of each sample in `x`.
+      class_weight: An optional class-weight array by the user to
+        weight the importance of samples in `x` based on the class they belong
+        to, as conveyed by `y`.
+      batch_size: Integer batch size. If provided, it is used to run additional
+        validation checks on stateful models.
+
+    Returns:
+      A tuple of 3 lists: input arrays, target arrays, sample-weight arrays.
+      If the model's input and targets are symbolic, these lists are empty
+      (since the model takes no user-provided data, instead the data comes
+      from the symbolic inputs/targets).
+
+    Raises:
+      ValueError: In case of invalid user-provided data.
+      RuntimeError: If the model was never compiled.
+    """
+    # First, we build/compile the model on the fly if necessary.
+    all_inputs = []
+    if not self.built:
+      # We need to use `x` to set the model inputs.
+      # We type-check that `x` and `y` are either single arrays
+      # or lists of arrays.
+      if isinstance(x, (list, tuple)):
+        if not all(isinstance(v, np.ndarray) or
+                   tensor_util.is_tensor(v) for v in x):
+          raise ValueError('Please provide as model inputs either a single '
+                           'array or a list of arrays. You passed: x=' + str(x))
+        all_inputs += list(x)
+      elif isinstance(x, dict):
+        raise ValueError('Please do not pass a dictionary as model inputs.')
       else:
-        output_shapes.append(output_shape)
+        if not isinstance(x, np.ndarray) and not tensor_util.is_tensor(x):
+          raise ValueError('Please provide as model inputs either a single '
+                           'array or a list of arrays. You passed: x=' + str(x))
+        all_inputs.append(x)
+
+      # Build the model using the retrieved inputs (value or symbolic).
+      # If values, then in symbolic-mode placeholders will be created
+      # to match the value shapes.
+      if not self.inputs:
+        self._set_inputs(x)
+
+    if y is not None:
+      if not self.optimizer:
+        raise RuntimeError('You must compile a model before '
+                           'training/testing. '
+                           'Use `model.compile(optimizer, loss)`.')
+      if not self._is_compiled:
+        # On-the-fly compilation of the model.
+        # We need to use `y` to set the model targets.
+        if isinstance(y, (list, tuple)):
+          if not all(isinstance(v, np.ndarray) or
+                     tensor_util.is_tensor(v) for v in y):
+            raise ValueError('Please provide as model targets either a single '
+                             'array or a list of arrays. '
+                             'You passed: y=' + str(y))
+        elif isinstance(y, dict):
+          raise ValueError('Please do not pass a dictionary as model targets.')
+        else:
+          if not isinstance(y, np.ndarray) and not tensor_util.is_tensor(y):
+            raise ValueError('Please provide as model targets either a single '
+                             'array or a list of arrays. '
+                             'You passed: y=' + str(y))
+
+        # Typecheck that all inputs are *either* value *or* symbolic.
+        # TODO(fchollet): this check could be removed in Eager mode?
+        if y is not None:
+          if isinstance(y, (list, tuple)):
+            all_inputs += list(y)
+          else:
+            all_inputs.append(y)
+        if any(tensor_util.is_tensor(v) for v in all_inputs):
+          if not all(tensor_util.is_tensor(v) for v in all_inputs):
+            raise ValueError('Do not pass inputs that mix Numpy arrays and '
+                             'TensorFlow tensors. '
+                             'You passed: x=' + str(x) + '; y=' + str(y))
+
+        if context.in_graph_mode():
+          # Handle target tensors if any passed.
+          if not isinstance(y, (list, tuple)):
+            y = [y]
+          target_tensors = [v for v in y if tensor_util.is_tensor(v)]
+        else:
+          target_tensors = None
+        self.compile(optimizer=self.optimizer,
+                     loss=self.loss,
+                     metrics=self.metrics,
+                     loss_weights=self.loss_weights,
+                     target_tensors=target_tensors)
+
+    # If `x` and `y` were all symbolic, then no model should not be fed any
+    # inputs and targets.
+    # Note: in this case, `any` and `all` are equivalent since we disallow
+    # mixed symbolic/value inputs.
+    if any(tensor_util.is_tensor(v) for v in all_inputs):
+      return [], [], []
+
+    # What follows is input validation and standardization to list format,
+    # in the case where all inputs are value arrays.
+
+    if context.in_eager_mode():
+      # In eager mode, do not do shape validation.
+      feed_input_names = self.input_names
+      feed_input_shapes = None
+    elif not self._is_graph_network:
+      # Case: symbolic-mode subclassed network. Do not do shape validation.
+      feed_input_names = self._feed_input_names
+      feed_input_shapes = None
+    else:
+      # Case: symbolic-mode graph network.
+      # In this case, we run extensive shape validation checks.
+      feed_input_names = self._feed_input_names
+      feed_input_shapes = self._feed_input_shapes
+
+    # Standardize the inputs.
     x = _standardize_input_data(
         x,
-        self._feed_input_names,
-        self._feed_input_shapes,
-        check_batch_axis=False,
+        feed_input_names,
+        feed_input_shapes,
+        check_batch_axis=False,  # Don't enforce the batch size.
         exception_prefix='input')
-    y = _standardize_input_data(
-        y,
-        self._feed_output_names,
-        output_shapes,
-        check_batch_axis=False,
-        exception_prefix='target')
-    sample_weights = _standardize_sample_weights(sample_weight,
-                                                 self._feed_output_names)
-    class_weights = _standardize_class_weights(class_weight,
-                                               self._feed_output_names)
-    sample_weights = [
-        _standardize_weights(ref, sw, cw, mode)
-        for (ref, sw, cw, mode) in zip(y, sample_weights, class_weights,
-                                       self._feed_sample_weight_modes)
-    ]
-    _check_array_lengths(x, y, sample_weights)
-    _check_loss_and_target_compatibility(y, self._feed_loss_fns,
-                                         self._feed_output_shapes)
+
+    if y is not None:
+      if context.in_eager_mode():
+        feed_output_names = self.output_names
+        feed_output_shapes = None
+        # Sample weighting not supported in this case.
+        # TODO(fchollet): consider supporting it.
+        feed_sample_weight_modes = [None for _ in self.outputs]
+      elif not self._is_graph_network:
+        feed_output_names = self._feed_output_names
+        feed_output_shapes = None
+        # Sample weighting not supported in this case.
+        # TODO(fchollet): consider supporting it.
+        feed_sample_weight_modes = [None for _ in self.outputs]
+      else:
+        feed_output_names = self._feed_output_names
+        feed_sample_weight_modes = self._feed_sample_weight_modes
+        feed_output_shapes = []
+        for output_shape, loss_fn in zip(self._feed_output_shapes,
+                                         self._feed_loss_fns):
+          if loss_fn is losses.sparse_categorical_crossentropy:
+            feed_output_shapes.append(output_shape[:-1] + (1,))
+          elif (not hasattr(loss_fn, '__name__') or
+                getattr(losses, loss_fn.__name__, None) is None):
+            # If `loss_fn` is not a function (e.g. callable class)
+            # or if it not in the `losses` module, then
+            # it is a user-defined loss and we make no assumptions
+            # about it.
+            feed_output_shapes.append(None)
+          else:
+            feed_output_shapes.append(output_shape)
+
+      # Standardize the outputs.
+      y = _standardize_input_data(
+          y,
+          feed_output_names,
+          feed_output_shapes,
+          check_batch_axis=False,  # Don't enforce the batch size.
+          exception_prefix='target')
+
+      # Generate sample-wise weight values given the `sample_weight` and
+      # `class_weight` arguments.
+      sample_weights = _standardize_sample_weights(sample_weight,
+                                                   feed_output_names)
+      class_weights = _standardize_class_weights(class_weight,
+                                                 feed_output_names)
+      sample_weights = [
+          _standardize_weights(ref, sw, cw, mode)
+          for (ref, sw, cw, mode) in zip(y, sample_weights, class_weights,
+                                         feed_sample_weight_modes)
+      ]
+      # Check that all arrays have the same length.
+      _check_array_lengths(x, y, sample_weights)
+      if self._is_graph_network and not context.in_eager_mode():
+        # Additional checks to avoid users mistakenly using improper loss fns.
+        _check_loss_and_target_compatibility(y, self._feed_loss_fns,
+                                             feed_output_shapes)
+    else:
+      y = []
+      sample_weights = []
+
     if self.stateful and batch_size:
+      # Check that for stateful networks, number of samples is a multiple
+      # of the static batch size.
       if x[0].shape[0] % batch_size != 0:
         raise ValueError('In a stateful network, '
                          'you should only pass inputs with '
@@ -1463,6 +1669,140 @@ class Model(Network):
       deduped_out_labels.append(new_label)
     return deduped_out_labels
 
+  def _set_inputs(self, inputs):
+    """Set model's input and output specs based on the input data received.
+
+    This is to be used for Model subclasses, which do not know at instantiation
+    time what their inputs look like.
+
+    Args:
+      inputs: Single array, or list of arrays. The arrays could be placeholders,
+        Numpy arrays, or data tensors.
+        - if placeholders: the model is built on top of these placeholders,
+          and we expect Numpy data to be fed for them when calling `fit`/etc.
+        - if Numpy data: we create placeholders matching the shape of the Numpy
+          arrays. We expect Numpy data to be fed for these placeholders
+          when calling `fit`/etc.
+        - if data tensors: the model is built on top of these tensors.
+          We do not expect any Numpy data to be provided when calling `fit`/etc.
+    """
+    if context.in_eager_mode():
+      self._eager_set_inputs(inputs)
+    else:
+      self._symbolic_set_inputs(inputs)
+
+  def _eager_set_inputs(self, inputs):
+    """Set model's input and output specs based on the input data received.
+
+    This is to be used for Model subclasses, which do not know at instantiation
+    time what their inputs look like.
+
+    We assume the number and ndim of outputs
+    does not change over different calls.
+
+    Args:
+      inputs: Argument `x` (input data) passed by the user upon first model use.
+
+    Raises:
+      ValueError: If the model's inputs are already set.
+    """
+    assert context.in_eager_mode()
+    if self.inputs:
+      raise ValueError('Model inputs are already set.')
+    # On-the-fly setting of model inputs/outputs as DeferredTensors,
+    # to keep track of number of inputs and outputs and their ndim.
+    if isinstance(inputs, (list, tuple)):
+      dummy_output_values = self.call(
+          [ops.convert_to_tensor(v, dtype=K.floatx()) for v in inputs])
+      dummy_input_values = list(inputs)
+    else:
+      dummy_output_values = self.call(
+          ops.convert_to_tensor(inputs, dtype=K.floatx()))
+      dummy_input_values = [inputs]
+    if isinstance(dummy_output_values, (list, tuple)):
+      dummy_output_values = list(dummy_output_values)
+    else:
+      dummy_output_values = [dummy_output_values]
+    self.outputs = [
+        _DeferredTensor(shape=(None for _ in v.shape),
+                        dtype=v.dtype) for v in dummy_output_values]
+    self.inputs = [
+        _DeferredTensor(shape=(None for _ in v.shape),
+                        dtype=v.dtype) for v in dummy_input_values]
+    self.input_names = [
+        'input_%d' % (i + 1) for i in range(len(dummy_input_values))]
+    self.output_names = [
+        'output_%d' % (i + 1) for i in range(len(dummy_output_values))]
+    self.built = True
+
+  def _symbolic_set_inputs(self, inputs):
+    """Set model's inputs based on the input data received from the user.
+
+    This is to be used for Model subclasses, which do not know at instantiation
+    time what their inputs look like.
+
+    Args:
+      inputs: Argument `x` (input data) passed by the user upon first model use.
+
+    Raises:
+      ValueError: If the model's inputs are already set.
+    """
+    assert context.in_graph_mode()
+    if self.inputs:
+      raise ValueError('Model inputs are already set.')
+
+    # On-the-fly setting of symbolic model inputs (either by using the tensor
+    # provided, or by creating a placeholder if Numpy data was provided).
+    self.inputs = []
+    self.input_names = []
+    self._feed_inputs = []
+    self._feed_input_names = []
+    self._feed_input_shapes = []
+    if isinstance(inputs, (list, tuple)):
+      inputs = list(inputs)
+    else:
+      inputs = [inputs]
+
+    for i, v in enumerate(inputs):
+      name = 'input_%d' % (i + 1)
+      self.input_names.append(name)
+      if isinstance(v, list):
+        v = np.asarray(v)
+        if v.ndim == 1:
+          v = np.expand_dims(v, 1)
+      if isinstance(v, (np.ndarray)):
+        # We fix the placeholder shape except the batch size.
+        # This is suboptimal, but it is the best we can do with the info
+        # we have. The user should call `model._set_inputs(placeholders)`
+        # to specify custom placeholders if the need arises.
+        shape = (None,) + v.shape[1:]
+        placeholder = K.placeholder(shape=shape, name=name)
+        self.inputs.append(placeholder)
+        self._feed_inputs.append(placeholder)
+        self._feed_input_names.append(name)
+        self._feed_input_shapes.append(shape)
+      else:
+        # Assumed tensor - TODO(fchollet) additional type check?
+        self.inputs.append(v)
+        if K.is_placeholder(v):
+          self._feed_inputs.append(v)
+          self._feed_input_names.append(name)
+          self._feed_input_shapes.append(K.int_shape(v))
+
+    # Obtain symbolic outputs by calling the model.
+    if len(self.inputs) == 1:
+      outputs = self.call(self.inputs[0])
+    else:
+      outputs = self.call(self.inputs)
+    if isinstance(outputs, (list, tuple)):
+      outputs = list(outputs)
+    else:
+      outputs = [outputs]
+    self.outputs = outputs
+    self.output_names = [
+        'output_%d' % (i + 1) for i in range(len(self.outputs))]
+    self.built = True
+
   def fit(self,
           x=None,
           y=None,
@@ -1477,7 +1817,8 @@ class Model(Network):
           sample_weight=None,
           initial_epoch=0,
           steps_per_epoch=None,
-          validation_steps=None):
+          validation_steps=None,
+          **kwargs):
     """Trains the model for a fixed number of epochs (iterations on a dataset).
 
     Arguments:
@@ -1494,10 +1835,9 @@ class Model(Network):
             dictionary mapping output names to Numpy arrays.
             `y` can be `None` (default) if feeding from
             TensorFlow data tensors.
-            Can be `None` (default) if feeding from framework-native tensors.
         batch_size: Integer or `None`.
             Number of samples per gradient update.
-            If unspecified, it will default to 32.
+            If unspecified, `batch_size` will default to 32.
         epochs: Integer. Number of epochs to train the model.
             An epoch is an iteration over the entire `x` and `y`
             data provided.
@@ -1506,7 +1846,7 @@ class Model(Network):
             The model is not trained for a number of iterations
             given by `epochs`, but merely until the epoch
             of index `epochs` is reached.
-        verbose: 0, 1, or 2. Verbosity mode.
+        verbose: Integer. 0, 1, or 2. Verbosity mode.
             0 = silent, 1 = progress bar, 2 = one line per epoch.
         callbacks: List of `keras.callbacks.Callback` instances.
             List of callbacks to apply during training.
@@ -1523,7 +1863,7 @@ class Model(Network):
             `(x_val, y_val, val_sample_weights)` on which to evaluate
             the loss and any model metrics at the end of each epoch.
             The model will not be trained on this data.
-            This will override `validation_split`.
+            `validation_data` will override `validation_split`.
         shuffle: Boolean (whether to shuffle the training data
             before each epoch) or str (for 'batch').
             'batch' is a special option for dealing with the
@@ -1546,17 +1886,20 @@ class Model(Network):
             to apply a different weight to every timestep of every sample.
             In this case you should make sure to specify
             `sample_weight_mode="temporal"` in `compile()`.
-        initial_epoch: Epoch at which to start training
+        initial_epoch: Integer.
+            Epoch at which to start training
             (useful for resuming a previous training run).
-        steps_per_epoch: Total number of steps (batches of samples)
+        steps_per_epoch: Integer or `None`.
+            Total number of steps (batches of samples)
             before declaring one epoch finished and starting the
             next epoch. When training with input tensors such as
             TensorFlow data tensors, the default `None` is equal to
-            the number of unique samples in your dataset divided by
+            the number of samples in your dataset divided by
             the batch size, or 1 if that cannot be determined.
         validation_steps: Only relevant if `steps_per_epoch`
             is specified. Total number of steps (batches of samples)
             to validate before stopping.
+        **kwargs: Used for backwards compatibility.
 
     Returns:
         A `History` object. Its `History.history` attribute is
@@ -1565,25 +1908,36 @@ class Model(Network):
         and validation metrics values (if applicable).
 
     Raises:
+        RuntimeError: If the model was never compiled.
         ValueError: In case of mismatch between the provided input data
             and what the model expects.
     """
+    # TODO(fchollet): this method may be creating reference cycles, which would
+    # lead to accumulating garbage in memory when called in a loop. Investigate.
+
     # Backwards compatibility
     if batch_size is None and steps_per_epoch is None:
       batch_size = 32
+    # Legacy support
+    if 'nb_epoch' in kwargs:
+      logging.warning(
+          'The `nb_epoch` argument in `fit` '
+          'has been renamed `epochs`.')
+      epochs = kwargs.pop('nb_epoch')
+    if kwargs:
+      raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
     if x is None and y is None and steps_per_epoch is None:
       raise ValueError('If fitting from data tensors, '
                        'you should specify the `steps_per_epoch` '
                        'argument.')
+
     # Validate user data.
     x, y, sample_weights = self._standardize_user_data(
         x,
         y,
         sample_weight=sample_weight,
         class_weight=class_weight,
-        check_batch_axis=False,
         batch_size=batch_size)
-
     # Prepare validation data.
     do_validation = False
     val_ins = []
@@ -1605,7 +1959,6 @@ class Model(Network):
           val_x,
           val_y,
           sample_weight=val_sample_weight,
-          check_batch_axis=False,
           batch_size=batch_size)
       if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
         val_ins = val_x + val_y + val_sample_weights + [0.]
@@ -1618,10 +1971,10 @@ class Model(Network):
         split_at = int(x[0].shape[0] * (1. - validation_split))
       else:
         split_at = int(len(x[0]) * (1. - validation_split))
-      x, val_x = (_slice_arrays(x, 0, split_at), _slice_arrays(x, split_at))
-      y, val_y = (_slice_arrays(y, 0, split_at), _slice_arrays(y, split_at))
-      sample_weights, val_sample_weights = (_slice_arrays(
-          sample_weights, 0, split_at), _slice_arrays(sample_weights, split_at))
+      x, val_x = (slice_arrays(x, 0, split_at), slice_arrays(x, split_at))
+      y, val_y = (slice_arrays(y, 0, split_at), slice_arrays(y, split_at))
+      sample_weights, val_sample_weights = (slice_arrays(
+          sample_weights, 0, split_at), slice_arrays(sample_weights, split_at))
       if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
         val_ins = val_x + val_y + val_sample_weights + [0.]
       else:
@@ -1637,38 +1990,65 @@ class Model(Network):
       ins = x + y + sample_weights + [1.]
     else:
       ins = x + y + sample_weights
-    self._make_train_function()
-    f = self.train_function
 
     # Prepare display labels.
     out_labels = self._get_deduped_metrics_names()
 
-    if do_validation:
-      self._make_test_function()
-      val_f = self.test_function
-      callback_metrics = copy.copy(out_labels) + [
-          'val_' + n for n in out_labels
-      ]
+    if context.in_eager_mode():
+      if do_validation:
+        callback_metrics = copy.copy(out_labels) + [
+            'val_' + n for n in out_labels
+        ]
+      else:
+        callback_metrics = copy.copy(out_labels)
+
+      return training_eager.fit_loop(
+          self,
+          ins,
+          out_labels=out_labels,
+          batch_size=batch_size,
+          epochs=epochs,
+          verbose=verbose,
+          callbacks=callbacks,
+          val_ins=val_ins,
+          shuffle=shuffle,
+          callback_metrics=callback_metrics,
+          initial_epoch=initial_epoch,
+          steps_per_epoch=steps_per_epoch,
+          validation_steps=validation_steps)
     else:
-      val_f = None
-      callback_metrics = copy.copy(out_labels)
-
-    # Delegate logic to `_fit_loop`.
-    return self._fit_loop(
-        f,
-        ins,
-        out_labels=out_labels,
-        batch_size=batch_size,
-        epochs=epochs,
-        verbose=verbose,
-        callbacks=callbacks,
-        val_f=val_f,
-        val_ins=val_ins,
-        shuffle=shuffle,
-        callback_metrics=callback_metrics,
-        initial_epoch=initial_epoch,
-        steps_per_epoch=steps_per_epoch,
-        validation_steps=validation_steps)
+      self._make_train_function()
+      f = self.train_function
+
+      if do_validation:
+        if context.in_graph_mode():
+          self._make_test_function()
+          val_f = self.test_function
+        else:
+          val_f = None
+        callback_metrics = copy.copy(out_labels) + [
+            'val_' + n for n in out_labels
+        ]
+      else:
+        val_f = None
+        callback_metrics = copy.copy(out_labels)
+
+      # Delegate logic to `_fit_loop`.
+      return self._fit_loop(
+          f,
+          ins,
+          out_labels=out_labels,
+          batch_size=batch_size,
+          epochs=epochs,
+          verbose=verbose,
+          callbacks=callbacks,
+          val_f=val_f,
+          val_ins=val_ins,
+          shuffle=shuffle,
+          callback_metrics=callback_metrics,
+          initial_epoch=initial_epoch,
+          steps_per_epoch=steps_per_epoch,
+          validation_steps=validation_steps)
 
   def evaluate(self,
                x=None,
@@ -1687,14 +2067,14 @@ class Model(Network):
             If input layers in the model are named, you can also pass a
             dictionary mapping input names to Numpy arrays.
             `x` can be `None` (default) if feeding from
-            framework-native tensors (e.g. TensorFlow data tensors).
+            TensorFlow data tensors.
         y: Numpy array of target (label) data
             (if the model has a single output),
             or list of Numpy arrays (if the model has multiple outputs).
             If output layers in the model are named, you can also pass a
             dictionary mapping output names to Numpy arrays.
             `y` can be `None` (default) if feeding from
-            framework-native tensors (e.g. TensorFlow data tensors).
+            TensorFlow data tensors.
         batch_size: Integer or `None`.
             Number of samples per evaluation step.
             If unspecified, `batch_size` will default to 32.
@@ -1714,8 +2094,7 @@ class Model(Network):
         steps: Integer or `None`.
             Total number of steps (batches of samples)
             before declaring the evaluation round finished.
-            The default `None` is equal to the number of unique samples in
-            your dataset divided by the batch size.
+            Ignored with the default value of `None`.
 
     Returns:
         Scalar test loss (if the model has a single output and no metrics)
@@ -1724,7 +2103,7 @@ class Model(Network):
         the display labels for the scalar outputs.
 
     Raises:
-      ValueError: In case of invalid arguments.
+        ValueError: in case of invalid arguments.
     """
     # Backwards compatibility.
     if batch_size is None and steps is None:
@@ -1733,22 +2112,27 @@ class Model(Network):
       raise ValueError('If evaluating from data tensors, '
                        'you should specify the `steps` '
                        'argument.')
+
     # Validate user data.
     x, y, sample_weights = self._standardize_user_data(
         x,
         y,
         sample_weight=sample_weight,
-        check_batch_axis=False,
         batch_size=batch_size)
     # Prepare inputs, delegate logic to `_test_loop`.
     if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
       ins = x + y + sample_weights + [0.]
     else:
       ins = x + y + sample_weights
-    self._make_test_function()
-    f = self.test_function
-    return self._test_loop(
-        f, ins, batch_size=batch_size, verbose=verbose, steps=steps)
+
+    if context.in_eager_mode():
+      return training_eager.test_loop(
+          self, ins, batch_size=batch_size, verbose=verbose, steps=steps)
+    else:
+      self._make_test_function()
+      f = self.test_function
+      return self._test_loop(
+          f, ins, batch_size=batch_size, verbose=verbose, steps=steps)
 
   def predict(self, x, batch_size=None, verbose=0, steps=None):
     """Generates output predictions for the input samples.
@@ -1780,30 +2164,23 @@ class Model(Network):
       raise ValueError('If predicting from data tensors, '
                        'you should specify the `steps` '
                        'argument.')
-    # Validate user data.
-    x = _standardize_input_data(
-        x,
-        self._feed_input_names,
-        self._feed_input_shapes,
-        check_batch_axis=False)
-    if self.stateful:
-      if x[0].shape[0] > batch_size and x[0].shape[0] % batch_size != 0:
-        raise ValueError('In a stateful network, '
-                         'you should only pass inputs with '
-                         'a number of samples that can be '
-                         'divided by the batch size. Found: ' +
-                         str(x[0].shape[0]) + ' samples. '
-                         'Batch size: ' + str(batch_size) + '.')
+    x, _, _ = self._standardize_user_data(x)
 
     # Prepare inputs, delegate logic to `_predict_loop`.
     if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
       ins = x + [0.]
     else:
       ins = x
-    self._make_predict_function()
-    f = self.predict_function
-    return self._predict_loop(
-        f, ins, batch_size=batch_size, verbose=verbose, steps=steps)
+
+    if context.in_eager_mode():
+      return training_eager.predict_loop(
+          self, ins, batch_size=batch_size, verbose=verbose, steps=steps)
+    else:
+      self._make_predict_function()
+      f = self.predict_function
+
+      return self._predict_loop(
+          f, ins, batch_size=batch_size, verbose=verbose, steps=steps)
 
   def train_on_batch(self, x, y, sample_weight=None, class_weight=None):
     """Runs a single gradient update on a single batch of data.
@@ -1839,19 +2216,24 @@ class Model(Network):
         or list of scalars (if the model has multiple outputs
         and/or metrics). The attribute `model.metrics_names` will give you
         the display labels for the scalar outputs.
+
     """
     x, y, sample_weights = self._standardize_user_data(
         x,
         y,
         sample_weight=sample_weight,
-        class_weight=class_weight,
-        check_batch_axis=True)
+        class_weight=class_weight)
     if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
       ins = x + y + sample_weights + [1.]
     else:
       ins = x + y + sample_weights
-    self._make_train_function()
-    outputs = self.train_function(ins)
+
+    if context.in_eager_mode():
+      outputs = training_eager.train_on_batch(self, ins)
+    else:
+      self._make_train_function()
+      outputs = self.train_function(ins)
+
     if len(outputs) == 1:
       return outputs[0]
     return outputs
@@ -1883,15 +2265,23 @@ class Model(Network):
         or list of scalars (if the model has multiple outputs
         and/or metrics). The attribute `model.metrics_names` will give you
         the display labels for the scalar outputs.
+
+    Raises:
+        ValueError: in case of invalid arguments.
     """
     x, y, sample_weights = self._standardize_user_data(
-        x, y, sample_weight=sample_weight, check_batch_axis=True)
+        x, y, sample_weight=sample_weight)
     if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
       ins = x + y + sample_weights + [0.]
     else:
       ins = x + y + sample_weights
-    self._make_test_function()
-    outputs = self.test_function(ins)
+
+    if context.in_eager_mode():
+      outputs = training_eager.test_on_batch(self, ins)
+    else:
+      self._make_test_function()
+      outputs = self.test_function(ins)
+
     if len(outputs) == 1:
       return outputs[0]
     return outputs
@@ -1904,22 +2294,37 @@ class Model(Network):
 
     Returns:
         Numpy array(s) of predictions.
+
     """
-    x = _standardize_input_data(x, self._feed_input_names,
-                                self._feed_input_shapes)
+    x, _, _ = self._standardize_user_data(x)
+
     if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
       ins = x + [0.]
     else:
       ins = x
-    self._make_predict_function()
-    outputs = self.predict_function(ins)
-    if len(outputs) == 1:
-      return outputs[0]
-    return outputs
+
+    if context.in_eager_mode():
+      ins_batch_converted = []
+      for ib in ins:
+        ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
+
+      eager_model_inputs = []
+      for i in range(len(self.inputs)):
+        eager_model_inputs.append(ins_batch_converted[i])
+
+      outs = self(eager_model_inputs)  # pylint: disable=not-callable
+      return outs
+
+    if context.in_graph_mode():
+      self._make_predict_function()
+      outputs = self.predict_function(ins)
+      if len(outputs) == 1:
+        return outputs[0]
+      return outputs
 
   def fit_generator(self,
                     generator,
-                    steps_per_epoch,
+                    steps_per_epoch=None,
                     epochs=1,
                     verbose=1,
                     callbacks=None,
@@ -1930,8 +2335,7 @@ class Model(Network):
                     workers=1,
                     use_multiprocessing=False,
                     shuffle=True,
-                    initial_epoch=0,
-                    **kwargs):
+                    initial_epoch=0):
     """Fits the model on data yielded batch-by-batch by a Python generator.
 
     The generator is run in parallel to the model, for efficiency.
@@ -1943,20 +2347,31 @@ class Model(Network):
     using `use_multiprocessing=True`.
 
     Arguments:
-        generator: A generator or an instance of Sequence (keras.utils.Sequence)
-            object in order to avoid duplicate data when using multiprocessing.
+        generator: A generator or an instance of `Sequence`
+          (`keras.utils.Sequence`)
+            object in order to avoid duplicate data
+            when using multiprocessing.
             The output of the generator must be either
-            - a tuple (inputs, targets)
-            - a tuple (inputs, targets, sample_weights).
-            All arrays should contain the same number of samples.
+            - a tuple `(inputs, targets)`
+            - a tuple `(inputs, targets, sample_weights)`.
+            This tuple (a single output of the generator) makes a single batch.
+            Therefore, all arrays in this tuple must have the same length (equal
+            to the size of this batch). Different batches may have different
+              sizes.
+            For example, the last batch of the epoch is commonly smaller than
+              the
+            others, if the size of the dataset is not divisible by the batch
+              size.
             The generator is expected to loop over its data
             indefinitely. An epoch finishes when `steps_per_epoch`
             batches have been seen by the model.
         steps_per_epoch: Total number of steps (batches of samples)
             to yield from `generator` before declaring one epoch
             finished and starting the next epoch. It should typically
-            be equal to the number of unique samples of your dataset
-            divided by the batch size. Not used if using `Sequence`.
+            be equal to the number of samples of your dataset
+            divided by the batch size.
+            Optional for `Sequence`: if unspecified, will use
+            the `len(generator)` as a number of steps.
         epochs: Integer, total number of iterations on the data.
         verbose: Verbosity mode, 0, 1, or 2.
         callbacks: List of callbacks to be called during training.
@@ -1967,24 +2382,29 @@ class Model(Network):
         validation_steps: Only relevant if `validation_data`
             is a generator. Total number of steps (batches of samples)
             to yield from `generator` before stopping.
+            Optional for `Sequence`: if unspecified, will use
+            the `len(validation_data)` as a number of steps.
         class_weight: Dictionary mapping class indices to a weight
             for the class.
-        max_queue_size: Maximum size for the generator queue
-        workers: Maximum number of processes to spin up
-            when using process-based threading.
-        use_multiprocessing: If True, use process based threading.
+        max_queue_size: Integer. Maximum size for the generator queue.
+            If unspecified, `max_queue_size` will default to 10.
+        workers: Integer. Maximum number of processes to spin up
+            when using process based threading.
+            If unspecified, `workers` will default to 1. If 0, will
+            execute the generator on the main thread.
+        use_multiprocessing: Boolean. If True, use process based threading.
+            If unspecified, `workers` will default to False.
             Note that because
             this implementation relies on multiprocessing,
             you should not pass
             non picklable arguments to the generator
             as they can't be passed
             easily to children processes.
-        shuffle: Whether to shuffle the data at the beginning of each
-            epoch. Only used with instances of `Sequence`
-            (`keras.utils.Sequence`).
+        shuffle: Whether to shuffle the order of the batches at
+            the beginning of each epoch. Only used with instances
+            of `Sequence` (keras.utils.Sequence).
         initial_epoch: Epoch at which to start training
             (useful for resuming a previous training run)
-        **kwargs: support for legacy arguments.
 
     Returns:
         A `History` object.
@@ -2005,23 +2425,13 @@ class Model(Network):
         model.fit_generator(generate_arrays_from_file('/my_file.txt'),
                             steps_per_epoch=10000, epochs=10)
     ```
-
     Raises:
         ValueError: In case the generator yields
             data in an invalid format.
     """
-    # Legacy support
-    if 'max_q_size' in kwargs:
-      max_queue_size = kwargs.pop('max_q_size')
-      logging.warning('The argument `max_q_size` has been renamed '
-                      '`max_queue_size`. Update your method calls accordingly.')
-    if 'pickle_safe' in kwargs:
-      use_multiprocessing = kwargs.pop('pickle_safe')
-      logging.warning('The argument `pickle_safe` has been renamed '
-                      '`use_multiprocessing`. '
-                      'Update your method calls accordingly.')
-    if kwargs:
-      raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
+    if not self._is_graph_network:
+      raise NotImplementedError(
+          '`fit_generator` is not yet enabled for Model subclasses')
 
     wait_time = 0.01  # in seconds
     epoch = initial_epoch
@@ -2031,15 +2441,34 @@ class Model(Network):
     if do_validation:
       self._make_test_function()
 
+    is_sequence = isinstance(generator, Sequence)
+    if not is_sequence and use_multiprocessing and workers > 1:
+      logging.warning(
+          UserWarning('Using a generator with `use_multiprocessing=True`'
+                      ' and multiple workers may duplicate your data.'
+                      ' Please consider using the`keras.utils.Sequence'
+                      ' class.'))
+    if steps_per_epoch is None:
+      if is_sequence:
+        steps_per_epoch = len(generator)
+      else:
+        raise ValueError('`steps_per_epoch=None` is only valid for a'
+                         ' generator based on the `keras.utils.Sequence`'
+                         ' class. Please specify `steps_per_epoch` or use'
+                         ' the `keras.utils.Sequence` class.')
+
     # python 2 has 'next', 3 has '__next__'
     # avoid any explicit version checks
-    val_gen = (hasattr(validation_data, 'next') or
-               hasattr(validation_data, '__next__') or
-               isinstance(validation_data, Sequence))
-    if val_gen and not validation_steps:
-      raise ValueError('When using a generator for validation data, '
-                       'you must specify a value for '
-                       '`validation_steps`.')
+    val_gen = (
+        hasattr(validation_data, 'next') or
+        hasattr(validation_data, '__next__') or
+        isinstance(validation_data, Sequence))
+    if (val_gen and not isinstance(validation_data, Sequence) and
+        not validation_steps):
+      raise ValueError('`validation_steps=None` is only valid for a'
+                       ' generator based on the `keras.utils.Sequence`'
+                       ' class. Please specify `validation_steps` or use'
+                       ' the `keras.utils.Sequence` class.')
 
     # Prepare display labels.
     out_labels = self._get_deduped_metrics_names()
@@ -2067,47 +2496,66 @@ class Model(Network):
     })
     callbacks.on_train_begin()
 
-    if do_validation and not val_gen:
-      if len(validation_data) == 2:
-        val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
-        val_sample_weight = None
-      elif len(validation_data) == 3:
-        val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
-      else:
-        raise ValueError('`validation_data` should be a tuple '
-                         '`(val_x, val_y, val_sample_weight)` '
-                         'or `(val_x, val_y)`. Found: ' + str(validation_data))
-      val_x, val_y, val_sample_weights = self._standardize_user_data(
-          val_x, val_y, val_sample_weight)
-      val_data = val_x + val_y + val_sample_weights
-      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-        val_data += [0.]
-      for cbk in callbacks:
-        cbk.validation_data = val_data
-    is_sequence = isinstance(generator, Sequence)
-    if not is_sequence and use_multiprocessing and workers > 1:
-      logging.warning(
-          logging.warning('Using a generator with `use_multiprocessing=True`'
-                          ' and multiple workers may duplicate your data.'
-                          ' Please consider using the`keras.utils.Sequence'
-                          ' class.'))
-    if is_sequence:
-      steps_per_epoch = len(generator)
     enqueuer = None
+    val_enqueuer = None
 
     try:
-      if is_sequence:
-        enqueuer = OrderedEnqueuer(
-            generator, use_multiprocessing=use_multiprocessing, shuffle=shuffle)
+      if do_validation:
+        if val_gen:
+          if workers > 0:
+            if isinstance(validation_data, Sequence):
+              val_enqueuer = OrderedEnqueuer(
+                  validation_data, use_multiprocessing=use_multiprocessing)
+              if validation_steps is None:
+                validation_steps = len(validation_data)
+            else:
+              val_enqueuer = GeneratorEnqueuer(
+                  validation_data,
+                  use_multiprocessing=use_multiprocessing,
+                  wait_time=wait_time)
+            val_enqueuer.start(workers=workers, max_queue_size=max_queue_size)
+            validation_generator = val_enqueuer.get()
+          else:
+            validation_generator = validation_data
+        else:
+          if len(validation_data) == 2:
+            val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
+            val_sample_weight = None
+          elif len(validation_data) == 3:
+            val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
+          else:
+            raise ValueError(
+                '`validation_data` should be a tuple '
+                '`(val_x, val_y, val_sample_weight)` '
+                'or `(val_x, val_y)`. Found: ' + str(validation_data))
+          val_x, val_y, val_sample_weights = self._standardize_user_data(
+              val_x, val_y, val_sample_weight)
+          val_data = val_x + val_y + val_sample_weights
+          if self.uses_learning_phase and not isinstance(
+              K.learning_phase(), int):
+            val_data += [0.]
+          for cbk in callbacks:
+            cbk.validation_data = val_data
+
+      if workers > 0:
+        if is_sequence:
+          enqueuer = OrderedEnqueuer(
+              generator,
+              use_multiprocessing=use_multiprocessing,
+              shuffle=shuffle)
+        else:
+          enqueuer = GeneratorEnqueuer(
+              generator,
+              use_multiprocessing=use_multiprocessing,
+              wait_time=wait_time)
+        enqueuer.start(workers=workers, max_queue_size=max_queue_size)
+        output_generator = enqueuer.get()
       else:
-        enqueuer = GeneratorEnqueuer(
-            generator,
-            use_multiprocessing=use_multiprocessing,
-            wait_time=wait_time)
-      enqueuer.start(workers=workers, max_queue_size=max_queue_size)
-      output_generator = enqueuer.get()
+        output_generator = generator
 
       callback_model.stop_training = False
+      # Construct epoch logs.
+      epoch_logs = {}
       while epoch < epochs:
         callbacks.on_epoch_begin(epoch)
         steps_done = 0
@@ -2119,6 +2567,7 @@ class Model(Network):
             raise ValueError('Output of generator should be '
                              'a tuple `(x, y, sample_weight)` '
                              'or `(x, y)`. Found: ' + str(generator_output))
+
           if len(generator_output) == 2:
             x, y = generator_output
             sample_weight = None
@@ -2150,8 +2599,6 @@ class Model(Network):
 
           callbacks.on_batch_end(batch_index, batch_logs)
 
-          # Construct epoch logs.
-          epoch_logs = {}
           batch_index += 1
           steps_done += 1
 
@@ -2159,11 +2606,7 @@ class Model(Network):
           if steps_done >= steps_per_epoch and do_validation:
             if val_gen:
               val_outs = self.evaluate_generator(
-                  validation_data,
-                  validation_steps,
-                  max_queue_size=max_queue_size,
-                  workers=workers,
-                  use_multiprocessing=use_multiprocessing)
+                  validation_generator, validation_steps, workers=0)
             else:
               # No need for try/except because
               # data has already been validated.
@@ -2188,19 +2631,22 @@ class Model(Network):
           break
 
     finally:
-      if enqueuer is not None:
-        enqueuer.stop()
+      try:
+        if enqueuer is not None:
+          enqueuer.stop()
+      finally:
+        if val_enqueuer is not None:
+          val_enqueuer.stop()
 
     callbacks.on_train_end()
     return self.history
 
   def evaluate_generator(self,
                          generator,
-                         steps,
+                         steps=None,
                          max_queue_size=10,
                          workers=1,
-                         use_multiprocessing=False,
-                         **kwargs):
+                         use_multiprocessing=False):
     """Evaluates the model on a data generator.
 
     The generator should return the same kind of data
@@ -2214,10 +2660,13 @@ class Model(Network):
             when using multiprocessing.
         steps: Total number of steps (batches of samples)
             to yield from `generator` before stopping.
-            Not used if using `Sequence`.
+            Optional for `Sequence`: if unspecified, will use
+            the `len(generator)` as a number of steps.
         max_queue_size: maximum size for the generator queue
-        workers: maximum number of processes to spin up
-            when using process-based threading.
+        workers: Integer. Maximum number of processes to spin up
+            when using process based threading.
+            If unspecified, `workers` will default to 1. If 0, will
+            execute the generator on the main thread.
         use_multiprocessing: if True, use process based threading.
             Note that because
             this implementation relies on multiprocessing,
@@ -2225,7 +2674,6 @@ class Model(Network):
             non picklable arguments to the generator
             as they can't be passed
             easily to children processes.
-        **kwargs: support for legacy arguments.
 
     Returns:
         Scalar test loss (if the model has a single output and no metrics)
@@ -2233,22 +2681,16 @@ class Model(Network):
         and/or metrics). The attribute `model.metrics_names` will give you
         the display labels for the scalar outputs.
 
+    Raises:
+        ValueError: in case of invalid arguments.
+
     Raises:
         ValueError: In case the generator yields
             data in an invalid format.
     """
-    # Legacy support
-    if 'max_q_size' in kwargs:
-      max_queue_size = kwargs.pop('max_q_size')
-      logging.warning('The argument `max_q_size` has been renamed '
-                      '`max_queue_size`. Update your method calls accordingly.')
-    if 'pickle_safe' in kwargs:
-      use_multiprocessing = kwargs.pop('pickle_safe')
-      logging.warning('The argument `pickle_safe` has been renamed '
-                      '`use_multiprocessing`. '
-                      'Update your method calls accordingly.')
-    if kwargs:
-      raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
+    if not self._is_graph_network:
+      raise NotImplementedError(
+          '`evaluate_generator` is not yet enabled for Model subclasses')
 
     self._make_test_function()
 
@@ -2259,25 +2701,34 @@ class Model(Network):
     is_sequence = isinstance(generator, Sequence)
     if not is_sequence and use_multiprocessing and workers > 1:
       logging.warning(
-          logging.warning('Using a generator with `use_multiprocessing=True`'
-                          ' and multiple workers may duplicate your data.'
-                          ' Please consider using the`keras.utils.Sequence'
-                          ' class.'))
-    if is_sequence:
-      steps = len(generator)
+          UserWarning('Using a generator with `use_multiprocessing=True`'
+                      ' and multiple workers may duplicate your data.'
+                      ' Please consider using the`keras.utils.Sequence'
+                      ' class.'))
+    if steps is None:
+      if is_sequence:
+        steps = len(generator)
+      else:
+        raise ValueError('`steps=None` is only valid for a generator'
+                         ' based on the `keras.utils.Sequence` class.'
+                         ' Please specify `steps` or use the'
+                         ' `keras.utils.Sequence` class.')
     enqueuer = None
 
     try:
-      if is_sequence:
-        enqueuer = OrderedEnqueuer(
-            generator, use_multiprocessing=use_multiprocessing)
+      if workers > 0:
+        if is_sequence:
+          enqueuer = OrderedEnqueuer(
+              generator, use_multiprocessing=use_multiprocessing)
+        else:
+          enqueuer = GeneratorEnqueuer(
+              generator,
+              use_multiprocessing=use_multiprocessing,
+              wait_time=wait_time)
+        enqueuer.start(workers=workers, max_queue_size=max_queue_size)
+        output_generator = enqueuer.get()
       else:
-        enqueuer = GeneratorEnqueuer(
-            generator,
-            use_multiprocessing=use_multiprocessing,
-            wait_time=wait_time)
-      enqueuer.start(workers=workers, max_queue_size=max_queue_size)
-      output_generator = enqueuer.get()
+        output_generator = generator
 
       while steps_done < steps:
         generator_output = next(output_generator)
@@ -2297,11 +2748,11 @@ class Model(Network):
         outs = self.test_on_batch(x, y, sample_weight=sample_weight)
 
         if isinstance(x, list):
-          batch_size = len(x[0])
+          batch_size = x[0].shape[0]
         elif isinstance(x, dict):
-          batch_size = len(list(x.values())[0])
+          batch_size = list(x.values())[0].shape[0]
         else:
-          batch_size = len(x)
+          batch_size = x.shape[0]
         if batch_size == 0:
           raise ValueError('Received an empty batch. '
                            'Batches should at least contain one item.')
@@ -2325,12 +2776,11 @@ class Model(Network):
 
   def predict_generator(self,
                         generator,
-                        steps,
+                        steps=None,
                         max_queue_size=10,
                         workers=1,
                         use_multiprocessing=False,
-                        verbose=0,
-                        **kwargs):
+                        verbose=0):
     """Generates predictions for the input samples from a data generator.
 
     The generator should return the same kind of data as accepted by
@@ -2338,15 +2788,18 @@ class Model(Network):
 
     Arguments:
         generator: Generator yielding batches of input samples
-                or an instance of Sequence (keras.utils.Sequence)
-                object in order to avoid duplicate data
-                when using multiprocessing.
+            or an instance of Sequence (keras.utils.Sequence)
+            object in order to avoid duplicate data
+            when using multiprocessing.
         steps: Total number of steps (batches of samples)
             to yield from `generator` before stopping.
+            Optional for `Sequence`: if unspecified, will use
+            the `len(generator)` as a number of steps.
         max_queue_size: Maximum size for the generator queue.
-          Not used if using `Sequence`.
-        workers: Maximum number of processes to spin up
-            when using process-based threading.
+        workers: Integer. Maximum number of processes to spin up
+            when using process based threading.
+            If unspecified, `workers` will default to 1. If 0, will
+            execute the generator on the main thread.
         use_multiprocessing: If `True`, use process based threading.
             Note that because
             this implementation relies on multiprocessing,
@@ -2355,7 +2808,6 @@ class Model(Network):
             as they can't be passed
             easily to children processes.
         verbose: verbosity mode, 0 or 1.
-        **kwargs: support for legacy arguments.
 
     Returns:
         Numpy array(s) of predictions.
@@ -2364,16 +2816,9 @@ class Model(Network):
         ValueError: In case the generator yields
             data in an invalid format.
     """
-    # Legacy support
-    if 'max_q_size' in kwargs:
-      max_queue_size = kwargs.pop('max_q_size')
-      logging.warning('The argument `max_q_size` has been renamed '
-                      '`max_queue_size`. Update your method calls accordingly.')
-    if 'pickle_safe' in kwargs:
-      use_multiprocessing = kwargs.pop('pickle_safe')
-      logging.warning('The argument `pickle_safe` has been renamed '
-                      '`use_multiprocessing`. '
-                      'Update your method calls accordingly.')
+    if not self._is_graph_network:
+      raise NotImplementedError(
+          '`predict_generator` is not yet enabled for Model subclasses')
 
     self._make_predict_function()
 
@@ -2383,25 +2828,34 @@ class Model(Network):
     is_sequence = isinstance(generator, Sequence)
     if not is_sequence and use_multiprocessing and workers > 1:
       logging.warning(
-          logging.warning('Using a generator with `use_multiprocessing=True`'
-                          ' and multiple workers may duplicate your data.'
-                          ' Please consider using the`keras.utils.Sequence'
-                          ' class.'))
-    if is_sequence:
-      steps = len(generator)
+          UserWarning('Using a generator with `use_multiprocessing=True`'
+                      ' and multiple workers may duplicate your data.'
+                      ' Please consider using the`keras.utils.Sequence'
+                      ' class.'))
+    if steps is None:
+      if is_sequence:
+        steps = len(generator)
+      else:
+        raise ValueError('`steps=None` is only valid for a generator'
+                         ' based on the `keras.utils.Sequence` class.'
+                         ' Please specify `steps` or use the'
+                         ' `keras.utils.Sequence` class.')
     enqueuer = None
 
     try:
-      if is_sequence:
-        enqueuer = OrderedEnqueuer(
-            generator, use_multiprocessing=use_multiprocessing)
+      if workers > 0:
+        if is_sequence:
+          enqueuer = OrderedEnqueuer(
+              generator, use_multiprocessing=use_multiprocessing)
+        else:
+          enqueuer = GeneratorEnqueuer(
+              generator,
+              use_multiprocessing=use_multiprocessing,
+              wait_time=wait_time)
+        enqueuer.start(workers=workers, max_queue_size=max_queue_size)
+        output_generator = enqueuer.get()
       else:
-        enqueuer = GeneratorEnqueuer(
-            generator,
-            use_multiprocessing=use_multiprocessing,
-            wait_time=wait_time)
-      enqueuer.start(workers=workers, max_queue_size=max_queue_size)
-      output_generator = enqueuer.get()
+        output_generator = generator
 
       if verbose == 1:
         progbar = Progbar(target=steps)
@@ -2448,6 +2902,6 @@ class Model(Network):
       else:
         return np.concatenate(all_outs[0])
     if steps_done == 1:
-      return [out for out in all_outs]
+      return [out[0] for out in all_outs]
     else:
       return [np.concatenate(out) for out in all_outs]
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager.py b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
new file mode 100644
index 0000000000000000000000000000000000000000..477bb2fe7ac44f1f52191a113c495360400b8d75
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
@@ -0,0 +1,612 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras training and evaluation routines.
+"""
+# pylint: disable=protected-access
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+from tensorflow.python.eager.backprop import GradientTape
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import callbacks as cbks
+from tensorflow.python.keras._impl.keras import losses
+from tensorflow.python.keras._impl.keras import metrics as metrics_module
+from tensorflow.python.keras._impl.keras.utils.generic_utils import make_batches
+from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
+from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
+
+
+def _get_metrics_info(metric, internal_output_shapes=None, loss_func=None):
+  if metric == 'accuracy' or metric == 'acc':
+    # custom handling of accuracy
+    # (because of class mode duality)
+    output_shape = internal_output_shapes
+    if output_shape[-1] == 1 or loss_func == losses.binary_crossentropy:
+      # case: binary accuracy
+      acc_fn = metrics_module.binary_accuracy
+    elif loss_func == losses.sparse_categorical_crossentropy:
+      # case: categorical accuracy with sparse targets
+      acc_fn = metrics_module.sparse_categorical_accuracy
+    else:
+      acc_fn = metrics_module.categorical_accuracy
+
+    metric_name = 'acc'
+    return metric_name, acc_fn
+  else:
+    metric_fn = metrics_module.get(metric)
+    metric_name = metric_fn.__name__
+    return metric_name, metric_fn
+
+
+def _eager_loss_fn(outputs, targets, loss_fn, output_name):
+  with K.name_scope(output_name + '_loss'):
+    loss = loss_fn(targets, outputs)
+  return loss
+
+
+def _eager_metrics_fn(model, outputs, targets):
+  """Calculates the metrics for each output of the given model.
+
+  Arguments:
+      model: The model on which metrics are being calculated.
+      outputs: The outputs of the given model.
+      targets: The predictions or targets of the given model.
+
+  Returns:
+      Returns the metric names and metric results for each output of the model.
+  """
+  metric_names = []
+  metric_results = []
+  if not isinstance(outputs, list):
+    outputs = [outputs]
+
+  if not isinstance(targets, list):
+    targets = [targets]
+
+  for i in range(len(model.outputs)):
+    output_metrics = model.nested_metrics[i]
+    for nested_output_metric in output_metrics:
+      metric_name, metric_fn = _get_metrics_info(
+          nested_output_metric, K.int_shape(model.outputs[i]),
+          model.loss_functions[i])
+
+      if len(model.output_names) > 1:
+        metric_name = model.output_names[i] + '_' + metric_name
+        if metric_name not in model.metrics_names:
+          model.metrics_names.append(metric_name)
+
+      with K.name_scope(metric_name):
+        metric_result = metric_fn(outputs[i], targets[i])
+        metric_names.append(metric_name)
+        metric_results.append(K.mean(metric_result))
+
+  return metric_names, metric_results
+
+
+def _model_loss(model, inputs, targets):
+  """Calculates the loss for a given model.
+
+  Arguments:
+     model: The model on which metrics are being calculated.
+     inputs: The inputs of the given model. This is typically the mini batch of
+              data that is fed to the model.
+     targets: The predictions or targets of the given model.
+
+  Returns:
+     Returns the model output, total loss and loss value calculated using the
+     specified loss function. The total loss includes regularization losses and
+     applies masking and sample weighting to the loss value.
+  """
+  total_loss = 0
+  if len(inputs) == 1:
+    outs = model.call(inputs[0])
+  else:
+    outs = model.call(inputs)
+  if not isinstance(outs, list):
+    outs = [outs]
+
+  if not isinstance(targets, list):
+    targets = [targets]
+
+  loss_metrics = []
+  with K.name_scope('loss'):
+    for i, loss_fn in enumerate(model.loss_functions):
+      # compute the loss
+      output_loss = _eager_loss_fn(outs[i], targets[i], loss_fn,
+                                   model.output_names[i])
+      loss_metrics.append(K.mean(output_loss))
+
+      mask = outs[i]._keras_mask
+      # adapted from weighted_loss_fn
+      if mask is not None:
+        # mask should have the same shape as output_loss
+        output_loss *= mask
+        #  the loss per batch should be proportional
+        #  to the number of unmasked samples.
+        output_loss /= K.mean(mask)
+
+      # adapted from weighted_loss_fn
+      # apply sample weighting
+      if model.sample_weights:
+        # reduce score_array to same ndim as weight array
+        ndim = K.ndim(output_loss)
+        weight_ndim = K.ndim(model.sample_weights)
+        output_loss = K.mean(output_loss, axis=list(range(weight_ndim, ndim)))
+        output_loss *= model.sample_weights
+        output_loss /= K.mean(K.cast(K.not_equal(model.sample_weights, 0),
+                                     K.floatx()))
+        output_loss = K.mean(output_loss)
+
+      loss_weight = model.loss_weights_list[i]
+      if total_loss is None:
+        total_loss = loss_weight * output_loss
+      else:
+        total_loss += loss_weight * output_loss
+
+    total_loss = K.mean(total_loss)
+    # Add regularization losses
+    custom_losses = []
+    for layer in model.layers:
+      if layer.losses:
+        custom_losses += layer.losses
+
+    if custom_losses:
+      total_loss += sum(custom_losses)
+
+  return outs, total_loss, loss_metrics
+
+
+def _process_single_batch(eager_model_inputs, eager_model_outputs, model,
+                          training=True):
+  """Calculate the loss and gradient for one input batch.
+
+     The model weights are updated if training is set to True.
+
+  Arguments:
+      eager_model_inputs: Input batch data.
+      eager_model_outputs: Output batch data.
+      model: Model whose loss has to be calculated.
+      training: The boolean represents if the weights of the model are updated.
+              'fit' methods will set this to True while 'evaluate' methods will
+              set this to False.
+
+  Returns:
+      output of the model, total loss and the loss associated with each output.
+
+  Raises:
+      ValueError: If the model loss is 0 or if the trainable weights list is
+                  empty when the trainable parameter is set to True.
+  """
+  K.set_learning_phase(training)
+  with GradientTape() as tape:
+    outs, loss, loss_metrics = _model_loss(model, eager_model_inputs,
+                                           eager_model_outputs)
+    if loss is None:
+      raise ValueError('The model cannot be run '
+                       'because it has no loss to optimize.')
+  if training:
+    if not model._collected_trainable_weights:
+      raise ValueError('The list of trainable weights is empty. Make sure that '
+                       'you are not setting model.trainable to False before '
+                       'compiling the model.')
+    grads = tape.gradient(loss, model._collected_trainable_weights)
+    model.optimizer.apply_gradients(zip(grads,
+                                        model._collected_trainable_weights))
+  return outs, loss, loss_metrics
+
+
+def train_on_batch(model, ins):
+  """Calculates the loss and gradient updates for one input batch.
+
+  Arguments:
+      model: Given model on which loss and gradients are calculated.
+      ins: Input and output batch numpy arrays.
+
+  Returns:
+      total loss and the loss associated with each output.
+  """
+  ins_batch_converted = []
+  for ib in ins:
+    ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
+  eager_model_inputs = []
+  eager_model_outputs = []
+  for i in range(len(model.inputs)):
+    eager_model_inputs.append(ins_batch_converted[i])
+  for i in range(len(model.inputs), len(ins_batch_converted)):
+    eager_model_outputs.append(ins_batch_converted[i])
+  outs, loss, _ = _process_single_batch(
+      eager_model_inputs, eager_model_outputs, model)
+  if not isinstance(outs, list):
+    outs = [outs]
+  _, metrics_results = _eager_metrics_fn(
+      model, outs, eager_model_outputs)
+  if not isinstance(loss, list):
+    loss = [loss]
+  return loss + metrics_results
+
+
+def test_on_batch(model, ins):
+  """Calculates the loss for one input batch.
+
+  Arguments:
+      model: Given model on which loss is calculated.
+      ins: Input and output batch numpy arrays.
+
+  Returns:
+      total loss, loss and metrics associated with each output.
+  """
+  ins_batch_converted = []
+  for ib in ins:
+    ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
+  eager_model_inputs = []
+  eager_model_outputs = []
+  for i in range(len(model.inputs)):
+    eager_model_inputs.append(ins_batch_converted[i])
+  for i in range(len(model.inputs), len(ins_batch_converted)):
+    eager_model_outputs.append(ins_batch_converted[i])
+  outs, loss, loss_metrics = _process_single_batch(
+      eager_model_inputs, eager_model_outputs, model, training=False)
+  if not isinstance(outs, list):
+    outs = [outs]
+  metric_names, metrics_results = _eager_metrics_fn(
+      model, outs, eager_model_outputs)
+  model.metrics_names.append(metric_names)
+  if not isinstance(loss, list):
+    loss = [loss]
+  return loss + loss_metrics + metrics_results
+
+
+def fit_loop(
+    model,
+    ins,
+    out_labels=None,
+    batch_size=None,
+    epochs=100,
+    verbose=1,
+    callbacks=None,
+    val_ins=None,
+    shuffle=True,
+    callback_metrics=None,
+    initial_epoch=0,
+    steps_per_epoch=None,
+    validation_steps=None):
+  """Abstract fit function for `f(ins)`.
+
+  Assume that f returns a list, labeled by out_labels.
+
+  Arguments:
+      model: Instance of the model that is being executed in Eager mode.
+      ins: List of tensors to be fed to `f`
+      out_labels: List of strings, display names of
+          the outputs of `f`
+      batch_size: Integer batch size or None if unknown.
+      epochs: Number of times to iterate over the data
+      verbose: Verbosity mode, 0, 1 or 2
+      callbacks: List of callbacks to be called during training
+      val_ins: List of tensors to be fed to `val_f`
+      shuffle: Whether to shuffle the data at the beginning of each epoch
+      callback_metrics: List of strings, the display names of the metrics
+          passed to the callbacks. They should be the
+          concatenation of list the display names of the outputs of
+           `f` and the list of display names of the outputs of `f_val`.
+      initial_epoch: Epoch at which to start training
+          (useful for resuming a previous training run)
+      steps_per_epoch: Total number of steps (batches of samples)
+          before declaring one epoch finished and starting the
+          next epoch. Ignored with the default value of `None`.
+      validation_steps: Number of steps to run validation for (only if doing
+        validation from data tensors). Ignored with default value of `None`.
+
+  Returns:
+      `History` object.
+
+  Raises:
+    ValueError: In case of invalid argument values.
+  """
+  # Required for Eager mode
+  K.set_learning_phase(True)
+
+  do_validation = False
+  if val_ins:
+    do_validation = True
+    if (verbose and ins and hasattr(ins[0], 'shape') and
+        hasattr(val_ins[0], 'shape')):
+      print('Train on %d samples, validate on %d samples' %
+            (ins[0].shape[0], val_ins[0].shape[0]))
+  if validation_steps:
+    if steps_per_epoch is None:
+      raise ValueError('Can only use `validation_steps` when doing step-wise '
+                       'training, i.e. `steps_per_epoch` must be set.')
+    do_validation = True
+
+  num_train_samples = model._check_num_samples(
+      ins, batch_size, steps_per_epoch, 'steps_per_epoch')
+
+  if num_train_samples is not None:
+    index_array = np.arange(num_train_samples)
+
+  model.history = cbks.History()
+  callbacks = [cbks.BaseLogger()] + (callbacks or []) + [model.history]
+  if verbose:
+    if steps_per_epoch is not None:
+      count_mode = 'steps'
+    else:
+      count_mode = 'samples'
+    callbacks += [cbks.ProgbarLogger(count_mode)]
+  callbacks = cbks.CallbackList(callbacks)
+  out_labels = out_labels or []
+
+  # it's possible to callback a different model than self
+  # (used by Sequential models)
+  if hasattr(model, 'callback_model') and model.callback_model:
+    callback_model = model.callback_model
+  else:
+    callback_model = model
+
+  callbacks.set_model(callback_model)
+
+  callbacks.set_params({
+      'batch_size': batch_size,
+      'epochs': epochs,
+      'steps': steps_per_epoch,
+      'samples': num_train_samples,
+      'verbose': verbose,
+      'do_validation': do_validation,
+      'metrics': callback_metrics or [],
+  })
+  callbacks.on_train_begin()
+  callback_model.stop_training = False
+  for cbk in callbacks:
+    cbk.validation_data = val_ins
+
+  for epoch in range(initial_epoch, epochs):
+    callbacks.on_epoch_begin(epoch)
+    epoch_logs = {}
+    if shuffle == 'batch':
+      index_array = model._batch_shuffle(index_array, batch_size)
+    elif shuffle:
+      np.random.shuffle(index_array)
+
+    batches = make_batches(num_train_samples, batch_size)
+
+    for batch_index, (batch_start, batch_end) in enumerate(batches):
+      batch_ids = index_array[batch_start:batch_end]
+      try:
+        if isinstance(ins[-1], float):
+          # Do not slice the training phase flag.
+          ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
+        else:
+          ins_batch = slice_arrays(ins, batch_ids)
+      except TypeError:
+        raise TypeError('TypeError while preparing batch. '
+                        'If using HDF5 input data, '
+                        'pass shuffle="batch".')
+      batch_logs = {}
+      batch_logs['batch'] = batch_index
+      batch_logs['size'] = len(batch_ids)
+
+      callbacks.on_batch_begin(batch_index, batch_logs)
+
+      ins_batch_converted = []
+      for ib in ins_batch:
+        ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
+      eager_model_inputs = []
+      eager_model_outputs = []
+      for i in range(len(model.inputs)):
+        eager_model_inputs.append(ins_batch_converted[i])
+
+      for i in range(len(model.inputs), len(ins_batch_converted)):
+        eager_model_outputs.append(ins_batch_converted[i])
+
+      outs, loss, loss_metrics = _process_single_batch(eager_model_inputs,
+                                                       eager_model_outputs,
+                                                       model)
+
+      if not isinstance(outs, list):
+        outs = [outs]
+
+      for l, o in zip(out_labels, outs):
+        batch_logs[l] = o
+      # Required for Eager mode
+      metrics_names, metrics_results = _eager_metrics_fn(model, outs,
+                                                         eager_model_outputs)
+      batch_logs['loss'] = tensor_util.constant_value(K.mean(loss))
+
+      # TODO(anjalisridhar): Move this to compile to avoid duplicate code.
+      # In graph mode we set the metric names in compile. However in
+      # Eager mode we calculate the metrics for each batch in fit_loop.
+      # We could calculate the metric names and functions in compile.
+      # This would avoid setting the callback parameters separately.
+      # We need to do this for the first iteration alone
+      for m in metrics_names:
+        if m not in callback_metrics:
+          callback_metrics.append(m)
+
+      callbacks.set_params({
+          'batch_size': batch_size,
+          'epochs': epochs,
+          'steps': steps_per_epoch,
+          'samples': num_train_samples,
+          'verbose': verbose,
+          'do_validation': do_validation,
+          'metrics': callback_metrics or [],
+      })
+
+      for k, v in zip(model.metrics_names,
+                      [K.mean(loss)] + loss_metrics + metrics_results):
+        batch_logs[k] = tensor_util.constant_value(v)
+
+      callbacks.on_batch_end(batch_index, batch_logs)
+      if callback_model.stop_training:
+        break
+
+      if batch_index == len(batches) - 1:  # Last batch.
+        if do_validation:
+          val_outs = test_loop(
+              model, val_ins, batch_size=batch_size, verbose=0)
+          if not isinstance(val_outs, list):
+            val_outs = [val_outs]
+          # Same labels assumed.
+          for l, o in zip(out_labels, val_outs):
+            epoch_logs['val_' + l] = o
+    callbacks.on_epoch_end(epoch, epoch_logs)
+    if callback_model.stop_training:
+      break
+  callbacks.on_train_end()
+  return model.history
+
+
+def test_loop(model, ins, batch_size=None, verbose=0, steps=None):
+  """Abstract method to loop over some data in batches.
+
+  Arguments:
+      model: Model instance that is being evaluated in Eager mode.
+      ins: list of tensors to be fed to `f`.
+      batch_size: integer batch size or `None`.
+      verbose: verbosity mode.
+      steps: Total number of steps (batches of samples)
+          before declaring predictions finished.
+          Ignored with the default value of `None`.
+
+  Returns:
+      Scalar loss (if the model has a single output and no metrics)
+      or list of scalars (if the model has multiple outputs
+      and/or metrics). The attribute `model.metrics_names` will give you
+      the display labels for the scalar outputs.
+  """
+  K.set_learning_phase(False)
+  num_samples = model._check_num_samples(ins, batch_size, steps, 'steps')
+  outs = []
+  if verbose == 1:
+    progbar = Progbar(target=num_samples)
+  batches = make_batches(num_samples, batch_size)
+  index_array = np.arange(num_samples)
+  for batch_index, (batch_start, batch_end) in enumerate(batches):
+    batch_ids = index_array[batch_start:batch_end]
+    if isinstance(ins[-1], float):
+      # Do not slice the training phase flag.
+      ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
+    else:
+      ins_batch = slice_arrays(ins, batch_ids)
+
+    ins_batch_converted = []
+    for ib in ins_batch:
+      ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
+
+    eager_model_inputs = []
+    eager_model_outputs = []
+    for i in range(len(model.inputs)):
+      eager_model_inputs.append(ins_batch_converted[i])
+
+    for i in range(len(model.inputs), len(ins_batch_converted)):
+      eager_model_outputs.append(ins_batch_converted[i])
+
+    loss_outs, loss, loss_metrics = _model_loss(model, eager_model_inputs,
+                                                eager_model_outputs)
+    _, metrics_results = _eager_metrics_fn(model, loss_outs,
+                                           eager_model_outputs)
+    batch_outs = []
+    for _, v in zip(model.metrics_names,
+                    [K.mean(loss)] + loss_metrics + metrics_results):
+      batch_outs.append(tensor_util.constant_value(v))
+
+    if isinstance(batch_outs, list):
+      if batch_index == 0:
+        for batch_out in enumerate(batch_outs):
+          outs.append(0.)
+      for i, batch_out in enumerate(batch_outs):
+        outs[i] += batch_out * len(batch_ids)
+    else:
+      if batch_index == 0:
+        outs.append(0.)
+      outs[0] += batch_outs * len(batch_ids)
+
+    if verbose == 1:
+      progbar.update(batch_end)
+  for i in range(len(outs)):
+    outs[i] /= num_samples
+  if len(outs) == 1:
+    return outs[0]
+  return outs
+
+
+def predict_loop(model, ins, batch_size=32, verbose=0, steps=None):
+  """Abstract method to loop over some data in batches.
+
+  Arguments:
+      model:
+      ins: list of tensors to be fed to `f`.
+      batch_size: integer batch size.
+      verbose: verbosity mode.
+      steps: Total number of steps (batches of samples)
+          before declaring `_predict_loop` finished.
+          Ignored with the default value of `None`.
+
+  Returns:
+      Array of predictions (if the model has a single output)
+      or list of arrays of predictions
+      (if the model has multiple outputs).
+  """
+  K.set_learning_phase(False)
+  num_samples = model._check_num_samples(ins, batch_size, steps, 'steps')
+  if verbose == 1:
+    if steps is not None:
+      progbar = Progbar(target=steps)
+    else:
+      progbar = Progbar(target=num_samples)
+
+  outs = []
+  batches = make_batches(num_samples, batch_size)
+  index_array = np.arange(num_samples)
+  for batch_index, (batch_start, batch_end) in enumerate(batches):
+    batch_ids = index_array[batch_start:batch_end]
+    if ins and isinstance(ins[-1], float):
+      # Do not slice the training phase flag.
+      ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
+    else:
+      ins_batch = slice_arrays(ins, batch_ids)
+
+    ins_batch_converted = []
+    for ib in ins_batch:
+      ins_batch_converted.append(ops.convert_to_tensor(ib, dtype=K.floatx()))
+
+    eager_model_inputs = []
+    for i in range(len(model.inputs)):
+      eager_model_inputs.append(ins_batch_converted[i])
+
+    if len(eager_model_inputs) == 1:
+      batch_outs = model.call(eager_model_inputs[0])
+    else:
+      batch_outs = model.call(eager_model_inputs)
+
+    if not isinstance(batch_outs, list):
+      batch_outs = [batch_outs]
+    if batch_index == 0:
+      # Pre-allocate the results arrays.
+      for batch_out in batch_outs:
+        dims = batch_out.shape[1:].dims
+        dims_list = [d.value for d in dims]
+        shape = (num_samples,) + tuple(dims_list)
+        outs.append(np.zeros(shape, dtype=batch_out.dtype.as_numpy_dtype))
+    for i, batch_out in enumerate(batch_outs):
+      outs[i][batch_start:batch_end] = batch_out
+    if verbose == 1:
+      progbar.update(batch_end)
+  if len(outs) == 1:
+    return outs[0]
+  return outs
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..45601f964a090fd927a22eb525d3c1c154fd71db
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
@@ -0,0 +1,756 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for training routines."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import numpy as np
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras._impl import keras
+from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
+from tensorflow.python.platform import test
+from tensorflow.python.training.rmsprop import RMSPropOptimizer
+
+
+class TrainingTest(test.TestCase):
+
+  def test_fit_on_arrays(self):
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
+
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+    model = keras.models.Model([a, b], [d, e])
+
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    loss_weights = [1., 0.5]
+    metrics = ['mae']
+    model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
+
+    input_a_np = np.random.random((10, 3))
+    input_b_np = np.random.random((10, 3))
+
+    output_d_np = np.random.random((10, 4))
+    output_e_np = np.random.random((10, 4))
+
+    # Test fit at different verbosity
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=1,
+        batch_size=5,
+        verbose=1)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=2,
+        batch_size=5,
+        verbose=2)
+
+    # Test with validation data
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        validation_data=([input_a_np, input_b_np], [output_d_np,
+                                                    output_e_np]),
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        validation_data=([input_a_np, input_b_np], [output_d_np,
+                                                    output_e_np]),
+        epochs=2,
+        batch_size=5,
+        verbose=1)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        validation_data=([input_a_np, input_b_np], [output_d_np,
+                                                    output_e_np]),
+        epochs=2,
+        batch_size=5,
+        verbose=2)
+    model.train_on_batch([input_a_np, input_b_np], [output_d_np, output_e_np])
+
+  # Test with validation split
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=2,
+        batch_size=5,
+        verbose=0,
+        validation_split=0.2)
+
+    # Test with dictionary inputs
+    model.fit(
+        {
+            'input_a': input_a_np,
+            'input_b': input_b_np
+        }, {'dense': output_d_np,
+            'dropout': output_e_np},
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    model.fit(
+        {
+            'input_a': input_a_np,
+            'input_b': input_b_np
+        }, {'dense': output_d_np,
+            'dropout': output_e_np},
+        epochs=1,
+        batch_size=5,
+        verbose=1)
+    model.fit(
+        {
+            'input_a': input_a_np,
+            'input_b': input_b_np
+        }, {'dense': output_d_np,
+            'dropout': output_e_np},
+        validation_data=({'input_a': input_a_np,
+                          'input_b': input_b_np
+                         },
+                         {
+                             'dense': output_d_np,
+                             'dropout': output_e_np
+                         }),
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    model.train_on_batch({
+        'input_a': input_a_np,
+        'input_b': input_b_np
+    }, {'dense': output_d_np,
+        'dropout': output_e_np})
+    # Test with lists for loss, metrics
+    loss = ['mae', 'mse']
+    metrics = ['acc', 'mae']
+    model.compile(optimizer, loss, metrics=metrics)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+
+    # Test with dictionaries for loss, metrics, loss weights
+    loss = {'dense': 'mse', 'dropout': 'mae'}
+    loss_weights = {'dense': 1., 'dropout': 0.5}
+    metrics = {'dense': 'mse', 'dropout': 'mae'}
+    model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
+    model.fit(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+
+    # Invalid use cases
+    with self.assertRaises(AttributeError):
+      model.fit(
+          [input_a_np, input_b_np], [output_d_np, output_e_np],
+          epochs=1,
+          validation_data=([input_a_np, input_b_np], 0, 0),
+          verbose=0)
+    with self.assertRaises(ValueError):
+      model.train_on_batch({'input_a': input_a_np},
+                           [output_d_np, output_e_np])
+    with self.assertRaises(ValueError):
+      model.train_on_batch([input_a_np], [output_d_np, output_e_np])
+    with self.assertRaises(AttributeError):
+      model.train_on_batch(1, [output_d_np, output_e_np])
+    with self.assertRaises(ValueError):
+      model.train_on_batch(input_a_np, [output_d_np, output_e_np])
+    with self.assertRaises(ValueError):
+      bad_input = np.random.random((11, 3))
+      model.train_on_batch([bad_input, input_b_np],
+                           [output_d_np, output_e_np])
+    with self.assertRaises(ValueError):
+      bad_target = np.random.random((11, 4))
+      model.train_on_batch([input_a_np, input_b_np],
+                           [bad_target, output_e_np])
+
+    # Build single-input model
+    x = keras.layers.Input(shape=(3,), name='input_a')
+    y = keras.layers.Dense(4)(x)
+    model = keras.models.Model(x, y)
+    model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse')
+    # This will work
+    model.fit([input_a_np], output_d_np, epochs=1)
+    with self.assertRaises(ValueError):
+      model.fit([input_a_np, input_a_np], output_d_np, epochs=1)
+
+  def test_evaluate_predict_on_arrays(self):
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
+
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+    model = keras.models.Model([a, b], [d, e])
+
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    loss_weights = [1., 0.5]
+    metrics = ['mae']
+    model.compile(
+        optimizer,
+        loss,
+        metrics=metrics,
+        loss_weights=loss_weights,
+        sample_weight_mode=None)
+
+    input_a_np = np.random.random((10, 3))
+    input_b_np = np.random.random((10, 3))
+
+    output_d_np = np.random.random((10, 4))
+    output_e_np = np.random.random((10, 4))
+
+    # Test evaluate at different verbosity
+    out = model.evaluate(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        batch_size=5,
+        verbose=0)
+    self.assertEqual(len(out), 5)
+    out = model.evaluate(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        batch_size=5,
+        verbose=1)
+    self.assertEqual(len(out), 5)
+    out = model.evaluate(
+        [input_a_np, input_b_np], [output_d_np, output_e_np],
+        batch_size=5,
+        verbose=2)
+    self.assertEqual(len(out), 5)
+    out = model.test_on_batch([input_a_np, input_b_np],
+                              [output_d_np, output_e_np])
+    self.assertEqual(len(out), 5)
+
+    # Test evaluate with dictionary inputs
+    model.evaluate(
+        {
+            'input_a': input_a_np,
+            'input_b': input_b_np
+        }, {'dense': output_d_np,
+            'dropout': output_e_np},
+        batch_size=5,
+        verbose=0)
+    model.evaluate(
+        {
+            'input_a': input_a_np,
+            'input_b': input_b_np
+        }, {'dense': output_d_np,
+            'dropout': output_e_np},
+        batch_size=5,
+        verbose=1)
+
+    # Test predict
+    out = model.predict([input_a_np, input_b_np], batch_size=5)
+    self.assertEqual(len(out), 2)
+    out = model.predict({'input_a': input_a_np, 'input_b': input_b_np})
+    self.assertEqual(len(out), 2)
+    out = model.predict_on_batch({
+        'input_a': input_a_np,
+        'input_b': input_b_np
+    })
+    self.assertEqual(len(out), 2)
+
+  def test_invalid_loss_or_metrics(self):
+    num_classes = 5
+    train_samples = 1000
+    test_samples = 1000
+    input_dim = 5
+
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(10, input_shape=(input_dim,)))
+    model.add(keras.layers.Activation('relu'))
+    model.add(keras.layers.Dense(num_classes))
+    model.add(keras.layers.Activation('softmax'))
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=RMSPropOptimizer(learning_rate=0.001))
+    np.random.seed(1337)
+
+    (x_train, y_train), (_, _) = testing_utils.get_test_data(
+        train_samples=train_samples,
+        test_samples=test_samples,
+        input_shape=(input_dim,),
+        num_classes=num_classes)
+
+    with self.assertRaises(ValueError):
+      model.fit(x_train, np.concatenate([y_train, y_train], axis=-1))
+
+    with self.assertRaises(TypeError):
+      model.compile(loss='categorical_crossentropy',
+                    optimizer=RMSPropOptimizer(learning_rate=0.001),
+                    metrics=set(0))
+
+    with self.assertRaises(ValueError):
+      model.compile(loss=None,
+                    optimizer='rms')
+
+
+class LossWeightingTest(test.TestCase):
+
+  def test_class_weights(self):
+    num_classes = 5
+    batch_size = 5
+    epochs = 5
+    weighted_class = 3
+    train_samples = 3000
+    test_samples = 3000
+    input_dim = 5
+
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(10, input_shape=(input_dim,)))
+    model.add(keras.layers.Activation('relu'))
+    model.add(keras.layers.Dense(num_classes))
+    model.add(keras.layers.Activation('softmax'))
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=RMSPropOptimizer(learning_rate=0.001))
+
+    np.random.seed(1337)
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=train_samples,
+        test_samples=test_samples,
+        input_shape=(input_dim,),
+        num_classes=num_classes)
+    int_y_test = y_test.copy()
+    int_y_train = y_train.copy()
+    # convert class vectors to binary class matrices
+    y_train = keras.utils.to_categorical(y_train, num_classes)
+    y_test = keras.utils.to_categorical(y_test, num_classes)
+    test_ids = np.where(int_y_test == np.array(weighted_class))[0]
+
+    class_weight = dict([(i, 1.) for i in range(num_classes)])
+    class_weight[weighted_class] = 2.
+
+    sample_weight = np.ones((y_train.shape[0]))
+    sample_weight[int_y_train == weighted_class] = 2.
+
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=batch_size,
+        epochs=epochs // 3,
+        verbose=0,
+        class_weight=class_weight,
+        validation_data=(x_train, y_train, sample_weight))
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=batch_size,
+        epochs=epochs // 2,
+        verbose=0,
+        class_weight=class_weight)
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=batch_size,
+        epochs=epochs // 2,
+        verbose=0,
+        class_weight=class_weight,
+        validation_split=0.1)
+
+    model.train_on_batch(
+        x_train[:batch_size], y_train[:batch_size], class_weight=class_weight)
+    ref_score = model.evaluate(x_test, y_test, verbose=0)
+    score = model.evaluate(
+        x_test[test_ids, :], y_test[test_ids, :], verbose=0)
+    self.assertLess(score, ref_score)
+
+  def test_sample_weights(self):
+    num_classes = 5
+    batch_size = 5
+    epochs = 5
+    weighted_class = 3
+    train_samples = 3000
+    test_samples = 3000
+    input_dim = 5
+
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(10, input_shape=(input_dim,)))
+    model.add(keras.layers.Activation('relu'))
+    model.add(keras.layers.Dense(num_classes))
+    model.add(keras.layers.Activation('softmax'))
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=RMSPropOptimizer(learning_rate=0.001))
+
+    np.random.seed(43)
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=train_samples,
+        test_samples=test_samples,
+        input_shape=(input_dim,),
+        num_classes=num_classes)
+    int_y_test = y_test.copy()
+    int_y_train = y_train.copy()
+    # convert class vectors to binary class matrices
+    y_train = keras.utils.to_categorical(y_train, num_classes)
+    y_test = keras.utils.to_categorical(y_test, num_classes)
+    test_ids = np.where(int_y_test == np.array(weighted_class))[0]
+
+    class_weight = dict([(i, 1.) for i in range(num_classes)])
+    class_weight[weighted_class] = 2.
+
+    sample_weight = np.ones((y_train.shape[0]))
+    sample_weight[int_y_train == weighted_class] = 2.
+
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=batch_size,
+        epochs=epochs // 3,
+        verbose=0,
+        sample_weight=sample_weight)
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=batch_size,
+        epochs=epochs // 3,
+        verbose=0,
+        sample_weight=sample_weight,
+        validation_split=0.1)
+    model.train_on_batch(
+        x_train[:batch_size],
+        y_train[:batch_size],
+        sample_weight=sample_weight[:batch_size])
+    model.test_on_batch(
+        x_train[:batch_size],
+        y_train[:batch_size],
+        sample_weight=sample_weight[:batch_size])
+
+  def test_temporal_sample_weights(self):
+    num_classes = 5
+    weighted_class = 3
+    train_samples = 1000
+    test_samples = 1000
+    input_dim = 5
+    timesteps = 3
+
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.TimeDistributed(
+            keras.layers.Dense(num_classes),
+            input_shape=(timesteps, input_dim)))
+    model.add(keras.layers.Activation('softmax'))
+
+    np.random.seed(1337)
+    (_, y_train), _ = testing_utils.get_test_data(
+        train_samples=train_samples,
+        test_samples=test_samples,
+        input_shape=(input_dim,),
+        num_classes=num_classes)
+    int_y_train = y_train.copy()
+    # convert class vectors to binary class matrices
+    y_train = keras.utils.to_categorical(y_train, num_classes)
+
+    class_weight = dict([(i, 1.) for i in range(num_classes)])
+    class_weight[weighted_class] = 2.
+
+    sample_weight = np.ones((y_train.shape[0]))
+    sample_weight[int_y_train == weighted_class] = 2.
+    with self.assertRaises(ValueError):
+      model.compile(
+          loss='binary_crossentropy',
+          optimizer=RMSPropOptimizer(learning_rate=0.001),
+          sample_weight_mode='temporal')
+
+  def test_class_weight_invalid_use_case(self):
+    num_classes = 5
+    train_samples = 1000
+    test_samples = 1000
+    input_dim = 5
+    timesteps = 3
+
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.TimeDistributed(
+            keras.layers.Dense(num_classes),
+            input_shape=(timesteps, input_dim)))
+    model.add(keras.layers.Activation('softmax'))
+    model.compile(
+        loss='binary_crossentropy',
+        optimizer=RMSPropOptimizer(learning_rate=0.001))
+
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=train_samples,
+        test_samples=test_samples,
+        input_shape=(input_dim,),
+        num_classes=num_classes)
+    # convert class vectors to binary class matrices
+    y_train = keras.utils.to_categorical(y_train, num_classes)
+    class_weight = dict([(i, 1.) for i in range(num_classes)])
+
+    del class_weight[1]
+    with self.assertRaises(ValueError):
+      model.fit(x_train, y_train,
+                epochs=0, verbose=0, class_weight=class_weight)
+
+    with self.assertRaises(ValueError):
+      model.compile(
+          loss='binary_crossentropy',
+          optimizer=RMSPropOptimizer(learning_rate=0.001),
+          sample_weight_mode=[])
+
+    # Build multi-output model
+    x = keras.Input((3,))
+    y1 = keras.layers.Dense(4, name='1')(x)
+    y2 = keras.layers.Dense(4, name='2')(x)
+    model = keras.models.Model(x, [y1, y2])
+    model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse')
+    x_np = np.random.random((10, 3))
+    y_np = np.random.random((10, 4))
+    w_np = np.random.random((10,))
+    # This will work
+    model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': w_np})
+    # These will not
+    with self.assertRaises(ValueError):
+      model.fit(x_np, [y_np, y_np], epochs=1, sample_weight=[w_np])
+    with self.assertRaises(TypeError):
+      model.fit(x_np, [y_np, y_np], epochs=1, sample_weight=w_np)
+    with self.assertRaises(ValueError):
+      bad_w_np = np.random.random((11,))
+      model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np})
+    with self.assertRaises(ValueError):
+      bad_w_np = np.random.random((10, 2))
+      model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np})
+    with self.assertRaises(ValueError):
+      bad_w_np = np.random.random((10, 2, 2))
+      model.fit(x_np, [y_np, y_np], epochs=1, sample_weight={'1': bad_w_np})
+
+
+class TestDynamicTrainability(test.TestCase):
+
+  def test_trainable_warning(self):
+    x = np.random.random((5, 3))
+    y = np.random.random((5, 2))
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(2, input_dim=3))
+    model.trainable = False
+    model.compile(RMSPropOptimizer(learning_rate=0.001), 'mse')
+    model.trainable = True
+    with self.assertRaises(ValueError):
+      model.train_on_batch(x, y)
+
+  def test_trainable_argument(self):
+    x = np.random.random((5, 3))
+    y = np.random.random((5, 2))
+
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(2, input_dim=3, trainable=False))
+    model.compile(RMSPropOptimizer(learning_rate=0.001), 'mse')
+    out = model.predict(x)
+    with self.assertRaises(ValueError):
+      model.train_on_batch(x, y)
+    out_2 = model.predict(x)
+    self.assertAllClose(out, out_2)
+
+    # test with nesting
+    inputs = keras.layers.Input(shape=(3,))
+    output = model(inputs)
+    model = keras.models.Model(inputs, output)
+    model.compile(RMSPropOptimizer(learning_rate=0.001), 'mse')
+    out = model.predict(x)
+    with self.assertRaises(ValueError):
+      model.train_on_batch(x, y)
+    out_2 = model.predict(x)
+    self.assertAllClose(out, out_2)
+
+  def test_layer_trainability_switch(self):
+    # with constructor argument, in Sequential
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(2, trainable=False, input_dim=1))
+    self.assertListEqual(model.trainable_weights, [])
+
+    # by setting the `trainable` argument, in Sequential
+    model = keras.models.Sequential()
+    layer = keras.layers.Dense(2, input_dim=1)
+    model.add(layer)
+    self.assertListEqual(model.trainable_weights, layer.trainable_weights)
+    layer.trainable = False
+    self.assertListEqual(model.trainable_weights, [])
+
+    # with constructor argument, in Model
+    x = keras.layers.Input(shape=(1,))
+    y = keras.layers.Dense(2, trainable=False)(x)
+    model = keras.models.Model(x, y)
+    self.assertListEqual(model.trainable_weights, [])
+
+    # by setting the `trainable` argument, in Model
+    x = keras.layers.Input(shape=(1,))
+    layer = keras.layers.Dense(2)
+    y = layer(x)
+    model = keras.models.Model(x, y)
+    self.assertListEqual(model.trainable_weights, layer.trainable_weights)
+    layer.trainable = False
+    self.assertListEqual(model.trainable_weights, [])
+
+  def test_model_trainability_switch(self):
+    # a non-trainable model has no trainable weights
+    x = keras.layers.Input(shape=(1,))
+    y = keras.layers.Dense(2)(x)
+    model = keras.models.Model(x, y)
+    model.trainable = False
+    self.assertListEqual(model.trainable_weights, [])
+
+    # same for Sequential
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(2, input_dim=1))
+    model.trainable = False
+    self.assertListEqual(model.trainable_weights, [])
+
+  def test_nested_model_trainability(self):
+
+    # a Sequential inside a Model
+    inner_model = keras.models.Sequential()
+    inner_model.add(keras.layers.Dense(2, input_dim=1))
+
+    x = keras.layers.Input(shape=(1,))
+    y = inner_model(x)
+    outer_model = keras.models.Model(x, y)
+    self.assertListEqual(outer_model.trainable_weights,
+                         inner_model.trainable_weights)
+    inner_model.trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+    inner_model.trainable = True
+    inner_model.layers[-1].trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+
+    # a Sequential inside a Sequential
+    inner_model = keras.models.Sequential()
+    inner_model.add(keras.layers.Dense(2, input_dim=1))
+    outer_model = keras.models.Sequential()
+    outer_model.add(inner_model)
+    self.assertListEqual(outer_model.trainable_weights,
+                         inner_model.trainable_weights)
+    inner_model.trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+    inner_model.trainable = True
+    inner_model.layers[-1].trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+
+    # a Model inside a Model
+    x = keras.layers.Input(shape=(1,))
+    y = keras.layers.Dense(2)(x)
+    inner_model = keras.models.Model(x, y)
+    x = keras.layers.Input(shape=(1,))
+    y = inner_model(x)
+    outer_model = keras.models.Model(x, y)
+    self.assertListEqual(outer_model.trainable_weights,
+                         inner_model.trainable_weights)
+    inner_model.trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+    inner_model.trainable = True
+    inner_model.layers[-1].trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+
+    # a Model inside a Sequential
+    x = keras.layers.Input(shape=(1,))
+    y = keras.layers.Dense(2)(x)
+    inner_model = keras.models.Model(x, y)
+    outer_model = keras.models.Sequential()
+    outer_model.add(inner_model)
+    self.assertListEqual(outer_model.trainable_weights,
+                         inner_model.trainable_weights)
+    inner_model.trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+    inner_model.trainable = True
+    inner_model.layers[-1].trainable = False
+    self.assertListEqual(outer_model.trainable_weights, [])
+
+
+class TestTrainingUtils(test.TestCase):
+
+  def test_check_array_lengths(self):
+    keras.engine.training._check_array_lengths(None, None, None)
+    a_np = np.random.random((4, 3, 3))
+    keras.engine.training._check_array_lengths(a_np, a_np, a_np)
+    keras.engine.training._check_array_lengths(
+        [a_np, a_np], [a_np, a_np], [a_np, a_np])
+    keras.engine.training._check_array_lengths([None], [None], [None])
+
+    b_np = np.random.random((3, 4))
+    with self.assertRaises(ValueError):
+      keras.engine.training._check_array_lengths(a_np, None, None)
+    with self.assertRaises(ValueError):
+      keras.engine.training._check_array_lengths(a_np, a_np, None)
+    with self.assertRaises(ValueError):
+      keras.engine.training._check_array_lengths([a_np], [None], None)
+    with self.assertRaises(ValueError):
+      keras.engine.training._check_array_lengths([a_np], [b_np], None)
+    with self.assertRaises(ValueError):
+      keras.engine.training._check_array_lengths([a_np], None, [b_np])
+
+  def test_slice_arrays(self):
+    input_a = np.random.random((10, 3))
+    slice_arrays(None)
+    slice_arrays(input_a, 0)
+    slice_arrays(input_a, 0, 1)
+    slice_arrays(input_a, stop=2)
+    input_a = [None, [1, 1], None, [1, 1]]
+    slice_arrays(input_a, 0)
+    slice_arrays(input_a, 0, 1)
+    slice_arrays(input_a, stop=2)
+    input_a = [None]
+    slice_arrays(input_a, 0)
+    slice_arrays(input_a, 0, 1)
+    slice_arrays(input_a, stop=2)
+    input_a = None
+    slice_arrays(input_a, 0)
+    slice_arrays(input_a, 0, 1)
+    slice_arrays(input_a, stop=2)
+
+  def test_fit_with_BatchNorm(self):
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(10, input_dim=4))
+    model.add(keras.layers.BatchNormalization())
+    model.add(keras.layers.Activation('tanh'))
+    model.add(keras.layers.Dropout(0.2))
+
+    input_a_np = np.random.random((10, 4))
+    output_b_np = np.random.random((10, 10))
+
+    model.compile(loss='binary_crossentropy', optimizer=RMSPropOptimizer(0.001))
+    model.fit(input_a_np, output_b_np, epochs=1, batch_size=5, verbose=0)
+
+  def test_fit_with_regularization(self):
+    model = keras.models.Sequential()
+    with self.assertRaises(ValueError):
+      model.add(
+          keras.layers.Dense(4, input_dim=3,
+                             kernel_regularizer=keras.regularizers.l2(0.01),
+                             activity_regularizer=keras.regularizers.l1(0.01)))
+
+
+if __name__ == '__main__':
+  # Bazel sets these environment variables to very long paths.
+  # Tempfile uses them to create long paths, and in turn multiprocessing
+  # library tries to create sockets named after paths. Delete whatever bazel
+  # writes to these to avoid tests failing due to socket addresses being too
+  # long.
+  for var in ('TMPDIR', 'TMP', 'TEMP'):
+    if var in os.environ:
+      del os.environ[var]
+
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_test.py b/tensorflow/python/keras/_impl/keras/engine/training_test.py
index 17a26f978e24776baee77182e1f901e3ee1091c8..9651eb9f14f1275dc79c8d3b1fb54690772086a1 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_test.py
@@ -26,8 +26,14 @@ import numpy as np
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.keras._impl.keras.engine.training import _weighted_masked_objective
+from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
 from tensorflow.python.platform import test
 
+try:
+  import scipy.sparse as scipy_sparse  # pylint: disable=g-import-not-at-top
+except ImportError:
+  scipy_sparse = None
+
 
 class TrainingTest(test.TestCase):
 
@@ -73,6 +79,14 @@ class TrainingTest(test.TestCase):
           verbose=2)
       model.train_on_batch([input_a_np, input_b_np], [output_d_np, output_e_np])
 
+      # Test model with input data as a list of lists
+      model.fit(
+          [np.ndarray.tolist(input_a_np), np.ndarray.tolist(input_b_np)],
+          [output_d_np, output_e_np],
+          epochs=2,
+          batch_size=5,
+          verbose=2)
+
       # Test with validation data
       model.fit(
           [input_a_np, input_b_np], [output_d_np, output_e_np],
@@ -169,7 +183,7 @@ class TrainingTest(test.TestCase):
       with self.assertRaises(ValueError):
         model.train_on_batch({'input_a': input_a_np},
                              [output_d_np, output_e_np])
-      with self.assertRaises(TypeError):
+      with self.assertRaises(AttributeError):
         model.fit(
             [input_a_np, input_b_np], [output_d_np, output_e_np],
             epochs=1,
@@ -177,7 +191,7 @@ class TrainingTest(test.TestCase):
             verbose=0)
       with self.assertRaises(ValueError):
         model.train_on_batch([input_a_np], [output_d_np, output_e_np])
-      with self.assertRaises(TypeError):
+      with self.assertRaises(AttributeError):
         model.train_on_batch(1, [output_d_np, output_e_np])
       with self.assertRaises(ValueError):
         model.train_on_batch(input_a_np, [output_d_np, output_e_np])
@@ -200,6 +214,16 @@ class TrainingTest(test.TestCase):
       with self.assertRaises(ValueError):
         model.fit([input_a_np, input_a_np], output_d_np, epochs=1)
 
+      # Test model on a list of floats
+      input_a_np = np.random.random((10, 3))
+      input_b_np = np.random.random((10, 4))
+
+      model.fit([np.ndarray.tolist(input_a_np)],
+                [np.ndarray.tolist(input_b_np)],
+                epochs=2,
+                batch_size=5,
+                verbose=2)
+
   def test_evaluate_predict_on_arrays(self):
     with self.test_session():
       a = keras.layers.Input(shape=(3,), name='input_a')
@@ -312,6 +336,63 @@ class TrainingTest(test.TestCase):
         model.compile(loss=None,
                       optimizer='rmsprop')
 
+  def test_training_on_sparse_data_with_dense_placeholders(self):
+    if scipy_sparse is None:
+      return
+
+    test_inputs = [
+        scipy_sparse.random(6, 3, density=0.25).tocsr() for _ in range(2)]
+    test_outputs = [
+        scipy_sparse.random(6, i, density=0.25).tocsr() for i in range(3, 5)]
+    in1 = keras.layers.Input(shape=(3,))
+    in2 = keras.layers.Input(shape=(3,))
+    out1 = keras.layers.Dropout(0.5, name='dropout')(in1)
+    out2 = keras.layers.Dense(4, name='dense_1')(in2)
+    model = keras.Model([in1, in2], [out1, out2])
+    model.predict(test_inputs, batch_size=2)
+    model.compile('rmsprop', 'mse')
+    model.fit(test_inputs, test_outputs,
+              epochs=1, batch_size=2, validation_split=0.5)
+    model.evaluate(test_inputs, test_outputs, batch_size=2)
+
+  def test_that_trainable_disables_updates(self):
+    val_a = np.random.random((10, 4))
+    val_out = np.random.random((10, 4))
+
+    with self.test_session():
+      a = keras.layers.Input(shape=(4,))
+      layer = keras.layers.BatchNormalization(input_shape=(4,))
+      b = layer(a)
+      model = keras.Model(a, b)
+
+      model.trainable = False
+      assert not model.updates
+
+      model.compile('sgd', 'mse')
+      assert not model.updates
+
+      x1 = model.predict(val_a)
+      model.train_on_batch(val_a, val_out)
+      x2 = model.predict(val_a)
+      self.assertAllClose(x1, x2, atol=1e-7)
+
+      model.trainable = True
+      model.compile('sgd', 'mse')
+      assert model.updates
+
+      model.train_on_batch(val_a, val_out)
+      x2 = model.predict(val_a)
+      assert np.abs(np.sum(x1 - x2)) > 1e-5
+
+      layer.trainable = False
+      model.compile('sgd', 'mse')
+      assert not model.updates
+
+      x1 = model.predict(val_a)
+      model.train_on_batch(val_a, val_out)
+      x2 = model.predict(val_a)
+      self.assertAllClose(x1, x2, atol=1e-7)
+
 
 class LossWeightingTest(test.TestCase):
 
@@ -399,7 +480,7 @@ class LossWeightingTest(test.TestCase):
       model.add(keras.layers.Activation('softmax'))
       model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
 
-      np.random.seed(1337)
+      np.random.seed(43)
       (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
           train_samples=train_samples,
           test_samples=test_samples,
@@ -836,6 +917,11 @@ class TestGeneratorMethods(test.TestCase):
                             use_multiprocessing=False,
                             validation_data=custom_generator(),
                             validation_steps=10)
+        model.fit_generator(custom_generator(),
+                            steps_per_epoch=5,
+                            validation_data=custom_generator(),
+                            validation_steps=1,
+                            workers=0)
         model.predict_generator(custom_generator(),
                                 steps=5,
                                 max_queue_size=10,
@@ -845,6 +931,10 @@ class TestGeneratorMethods(test.TestCase):
                                 steps=5,
                                 max_queue_size=10,
                                 use_multiprocessing=False)
+        model.predict_generator(custom_generator(),
+                                steps=5,
+                                max_queue_size=10,
+                                workers=0)
         model.evaluate_generator(custom_generator(),
                                  steps=5,
                                  max_queue_size=10,
@@ -854,25 +944,11 @@ class TestGeneratorMethods(test.TestCase):
                                  steps=5,
                                  max_queue_size=10,
                                  use_multiprocessing=False)
-
-        # Test legacy API
-        model.fit_generator(custom_generator(),
-                            steps_per_epoch=5,
-                            epochs=1,
-                            verbose=1,
-                            max_q_size=10,
-                            workers=4,
-                            pickle_safe=True)
-        model.predict_generator(custom_generator(),
-                                steps=5,
-                                max_q_size=10,
-                                workers=2,
-                                pickle_safe=True)
         model.evaluate_generator(custom_generator(),
                                  steps=5,
-                                 max_q_size=10,
-                                 workers=2,
-                                 pickle_safe=True)
+                                 max_queue_size=10,
+                                 use_multiprocessing=False,
+                                 workers=0)
 
   def test_generator_methods_with_sample_weights(self):
     arr_data = np.random.random((50, 2))
@@ -946,7 +1022,7 @@ class TestGeneratorMethods(test.TestCase):
                             use_multiprocessing=False,
                             validation_data=custom_generator(),
                             validation_steps=10)
-      with self.assertRaises(TypeError):
+      with self.assertRaises(AttributeError):
         model.predict_generator(custom_generator(),
                                 steps=5,
                                 max_queue_size=10,
@@ -982,22 +1058,22 @@ class TestTrainingUtils(test.TestCase):
 
   def test_slice_arrays(self):
     input_a = np.random.random((10, 3))
-    keras.engine.training._slice_arrays(None)
-    keras.engine.training._slice_arrays(input_a, 0)
-    keras.engine.training._slice_arrays(input_a, 0, 1)
-    keras.engine.training._slice_arrays(input_a, stop=2)
+    slice_arrays(input_a, 0)
+    slice_arrays(None)
+    slice_arrays(input_a, 0, 1)
+    slice_arrays(input_a, stop=2)
     input_a = [None, [1, 1], None, [1, 1]]
-    keras.engine.training._slice_arrays(input_a, 0)
-    keras.engine.training._slice_arrays(input_a, 0, 1)
-    keras.engine.training._slice_arrays(input_a, stop=2)
+    slice_arrays(input_a, 0)
+    slice_arrays(input_a, 0, 1)
+    slice_arrays(input_a, stop=2)
     input_a = [None]
-    keras.engine.training._slice_arrays(input_a, 0)
-    keras.engine.training._slice_arrays(input_a, 0, 1)
-    keras.engine.training._slice_arrays(input_a, stop=2)
+    slice_arrays(input_a, 0)
+    slice_arrays(input_a, 0, 1)
+    slice_arrays(input_a, stop=2)
     input_a = None
-    keras.engine.training._slice_arrays(input_a, 0)
-    keras.engine.training._slice_arrays(input_a, 0, 1)
-    keras.engine.training._slice_arrays(input_a, stop=2)
+    slice_arrays(input_a, 0)
+    slice_arrays(input_a, 0, 1)
+    slice_arrays(input_a, stop=2)
 
 
 class TestTrainingWithDataTensors(test.TestCase):
@@ -1439,4 +1515,13 @@ class TestTrainingWithDataTensors(test.TestCase):
 
 
 if __name__ == '__main__':
+  # Bazel sets these environment variables to very long paths.
+  # Tempfile uses them to create long paths, and in turn multiprocessing
+  # library tries to create sockets named after paths. Delete whatever bazel
+  # writes to these to avoid tests failing due to socket addresses being too
+  # long.
+  for var in ('TMPDIR', 'TMP', 'TEMP'):
+    if var in os.environ:
+      del os.environ[var]
+
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/estimator.py b/tensorflow/python/keras/_impl/keras/estimator.py
index a1dfa81a79a445d900347aa55e2c848c70d83085..db0140c2df4d20f9e18e6c1401c6c6aa197bcf1f 100644
--- a/tensorflow/python/keras/_impl/keras/estimator.py
+++ b/tensorflow/python/keras/_impl/keras/estimator.py
@@ -23,23 +23,33 @@ import os
 
 from tensorflow.python.client import session
 from tensorflow.python.estimator import estimator as estimator_lib
-from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import export as export_lib
+from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import models
 from tensorflow.python.keras._impl.keras.utils.generic_utils import CustomObjectScope
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_module
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import training_util
-from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.util.tf_export import tf_export
 
 _DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
 
 
+def _cast_tensor_to_floatx(x):
+  """Cast tensor to keras's floatx dtype if it is not already the same dtype."""
+  if x.dtype == K.floatx():
+    return x
+  else:
+    return math_ops.cast(x, K.floatx())
+
+
 def _create_ordered_io(keras_model, estimator_io_dict, is_input=True):
   """Create a list of tensors from IO dictionary based on Keras IO order.
 
@@ -68,7 +78,7 @@ def _create_ordered_io(keras_model, estimator_io_dict, is_input=True):
                                         ', '.join(keras_io_names)))
   tensors = []
   for io_name in keras_io_names:
-    tensors.append(estimator_io_dict[io_name])
+    tensors.append(_cast_tensor_to_floatx(estimator_io_dict[io_name]))
   return tensors
 
 
@@ -116,7 +126,8 @@ def _clone_and_build_model(mode,
       target_tensors = _create_ordered_io(keras_model, labels, is_input=False)
     else:
       target_tensors = [
-          sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(labels)
+          _cast_tensor_to_floatx(
+              sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(labels))
       ]
 
     model.compile(
@@ -191,7 +202,8 @@ def _create_keras_model_fn(keras_model, custom_objects=None):
         train_op=train_op,
         eval_metric_ops=eval_metric_ops,
         export_outputs={
-            _DEFAULT_SERVING_KEY: export_lib.export_output.PredictOutput(predictions)
+            _DEFAULT_SERVING_KEY:
+            export_lib.export_output.PredictOutput(predictions)
         })
 
   return model_fn
@@ -233,6 +245,7 @@ def _save_first_checkpoint(keras_model, estimator, custom_objects,
         saver.save(sess, os.path.join(estimator.model_dir, 'keras_model.ckpt'))
 
 
+@tf_export('keras.estimator.model_to_estimator')
 def model_to_estimator(keras_model=None,
                        keras_model_path=None,
                        custom_objects=None,
diff --git a/tensorflow/python/keras/_impl/keras/estimator_test.py b/tensorflow/python/keras/_impl/keras/estimator_test.py
index a7ea3b48a33d4e2d485dd5ca40e39a6f3387facb..9fc48b4117e7ee2c717d5418754254aa02b82869 100644
--- a/tensorflow/python/keras/_impl/keras/estimator_test.py
+++ b/tensorflow/python/keras/_impl/keras/estimator_test.py
@@ -25,8 +25,6 @@ import numpy as np
 
 from tensorflow.python.estimator import run_config as run_config_lib
 from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
@@ -80,22 +78,17 @@ def get_resource_for_simple_model(is_sequential, is_evaluate):
   y_test = keras.utils.to_categorical(y_test)
 
   train_input_fn = numpy_io.numpy_input_fn(
-      x={input_name: np.array(x_train, dtype=np.float32)},
-      y=np.array(y_train, dtype=np.float32),
+      x={input_name: x_train},
+      y=y_train,
       shuffle=False,
       num_epochs=None,
       batch_size=16)
 
   evaluate_input_fn = numpy_io.numpy_input_fn(
-      x={input_name: np.array(x_test, dtype=np.float32)},
-      y=np.array(y_test, dtype=np.float32),
-      num_epochs=1,
-      shuffle=False)
+      x={input_name: x_test}, y=y_test, num_epochs=1, shuffle=False)
 
   predict_input_fn = numpy_io.numpy_input_fn(
-      x={input_name: np.array(x_test, dtype=np.float32)},
-      num_epochs=1,
-      shuffle=False)
+      x={input_name: x_test}, num_epochs=1, shuffle=False)
 
   inference_input_fn = evaluate_input_fn if is_evaluate else predict_input_fn
 
@@ -243,41 +236,13 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
     d_test = keras.utils.to_categorical(d_test)
 
     def train_input_fn():
-      input_dict = {
-          'input_a':
-              ops.convert_to_tensor(
-                  np.array(a_train, dtype=np.float32), dtype=dtypes.float32),
-          'input_b':
-              ops.convert_to_tensor(
-                  np.array(b_train, dtype=np.float32), dtype=dtypes.float32)
-      }
-      output_dict = {
-          'dense_2':
-              ops.convert_to_tensor(
-                  np.array(c_train, dtype=np.float32), dtype=dtypes.float32),
-          'dense_3':
-              ops.convert_to_tensor(
-                  np.array(d_train, dtype=np.float32), dtype=dtypes.float32)
-      }
+      input_dict = {'input_a': a_train, 'input_b': b_train}
+      output_dict = {'dense_2': c_train, 'dense_3': d_train}
       return input_dict, output_dict
 
     def eval_input_fn():
-      input_dict = {
-          'input_a':
-              ops.convert_to_tensor(
-                  np.array(a_test, dtype=np.float32), dtype=dtypes.float32),
-          'input_b':
-              ops.convert_to_tensor(
-                  np.array(b_test, dtype=np.float32), dtype=dtypes.float32)
-      }
-      output_dict = {
-          'dense_2':
-              ops.convert_to_tensor(
-                  np.array(c_test, dtype=np.float32), dtype=dtypes.float32),
-          'dense_3':
-              ops.convert_to_tensor(
-                  np.array(d_test, dtype=np.float32), dtype=dtypes.float32)
-      }
+      input_dict = {'input_a': a_test, 'input_b': b_test}
+      output_dict = {'dense_2': c_test, 'dense_3': d_test}
       return input_dict, output_dict
 
     with self.test_session():
@@ -347,26 +312,12 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
     y_train = keras.utils.to_categorical(y_train)
 
     def invald_input_name_input_fn():
-      input_dict = {
-          'invalid_input_name':
-              ops.convert_to_tensor(
-                  np.array(x_train, dtype=np.float32), dtype=dtypes.float32),
-      }
-      output = ops.convert_to_tensor(
-          np.array(y_train, dtype=np.float32), dtype=dtypes.float32)
-      return input_dict, output
+      input_dict = {'invalid_input_name': x_train}
+      return input_dict, y_train
 
     def invald_output_name_input_fn():
-      input_dict = {
-          'input_1':
-              ops.convert_to_tensor(
-                  np.array(x_train, dtype=np.float32), dtype=dtypes.float32),
-      }
-      output_dict = {
-          'invalid_output_name':
-              ops.convert_to_tensor(
-                  np.array(y_train, dtype=np.float32), dtype=dtypes.float32),
-      }
+      input_dict = {'input_1': x_train}
+      output_dict = {'invalid_output_name': y_train}
       return input_dict, output_dict
 
     model = simple_functional_model()
diff --git a/tensorflow/python/keras/_impl/keras/initializers.py b/tensorflow/python/keras/_impl/keras/initializers.py
index 8752faa534a3d6094ce530e490571ff939f86dbb..338c669f97736ace721f1d7e47a79426713ccfce 100644
--- a/tensorflow/python/keras/_impl/keras/initializers.py
+++ b/tensorflow/python/keras/_impl/keras/initializers.py
@@ -32,8 +32,10 @@ from tensorflow.python.ops.init_ops import RandomUniform
 from tensorflow.python.ops.init_ops import TruncatedNormal
 from tensorflow.python.ops.init_ops import VarianceScaling
 from tensorflow.python.ops.init_ops import Zeros
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.initializers.lecun_normal')
 def lecun_normal(seed=None):
   """LeCun normal initializer.
 
@@ -56,6 +58,7 @@ def lecun_normal(seed=None):
       scale=1., mode='fan_in', distribution='normal', seed=seed)
 
 
+@tf_export('keras.initializers.lecun_uniform')
 def lecun_uniform(seed=None):
   """LeCun uniform initializer.
 
@@ -77,6 +80,7 @@ def lecun_uniform(seed=None):
       scale=1., mode='fan_in', distribution='uniform', seed=seed)
 
 
+@tf_export('keras.initializers.glorot_normal')
 def glorot_normal(seed=None):
   """Glorot normal initializer, also called Xavier normal initializer.
 
@@ -99,6 +103,7 @@ def glorot_normal(seed=None):
       scale=1., mode='fan_avg', distribution='normal', seed=seed)
 
 
+@tf_export('keras.initializers.glorot_uniform')
 def glorot_uniform(seed=None):
   """Glorot uniform initializer, also called Xavier uniform initializer.
 
@@ -121,6 +126,7 @@ def glorot_uniform(seed=None):
       scale=1., mode='fan_avg', distribution='uniform', seed=seed)
 
 
+@tf_export('keras.initializers.he_normal')
 def he_normal(seed=None):
   """He normal initializer.
 
@@ -141,6 +147,7 @@ def he_normal(seed=None):
       scale=2., mode='fan_in', distribution='normal', seed=seed)
 
 
+@tf_export('keras.initializers.he_uniform')
 def he_uniform(seed=None):
   """He uniform variance scaling initializer.
 
@@ -178,10 +185,12 @@ orthogonal = Orthogonal
 # Utility functions
 
 
+@tf_export('keras.initializers.serialize')
 def serialize(initializer):
   return serialize_keras_object(initializer)
 
 
+@tf_export('keras.initializers.deserialize')
 def deserialize(config, custom_objects=None):
   return deserialize_keras_object(
       config,
@@ -190,6 +199,7 @@ def deserialize(config, custom_objects=None):
       printable_module_name='initializer')
 
 
+@tf_export('keras.initializers.get')
 def get(identifier):
   if isinstance(identifier, dict):
     return deserialize(identifier)
diff --git a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py b/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
index 1cb881a13f348fedc55ee48518a54b852d680876..7cac17c51a9adcf8fc62154b6633de60bab18387 100644
--- a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
+++ b/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
@@ -14,20 +14,22 @@
 # ==============================================================================
 """Layers that act as activation functions.
 """
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import activations
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.layers.LeakyReLU')
 class LeakyReLU(Layer):
   """Leaky version of a Rectified Linear Unit.
 
@@ -61,7 +63,12 @@ class LeakyReLU(Layer):
     base_config = super(LeakyReLU, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
 
+@tf_export('keras.layers.PReLU')
 class PReLU(Layer):
   """Parametric Rectified Linear Unit.
 
@@ -111,9 +118,9 @@ class PReLU(Layer):
     else:
       self.shared_axes = list(shared_axes)
 
+  @shape_type_conversion
   def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    param_shape = input_shape[1:]
+    param_shape = list(input_shape[1:])
     self.param_broadcast = [False] * len(param_shape)
     if self.shared_axes is not None:
       for i in self.shared_axes:
@@ -137,8 +144,9 @@ class PReLU(Layer):
   def call(self, inputs, mask=None):
     pos = K.relu(inputs)
     if K.backend() == 'theano':
-      neg = (K.pattern_broadcast(self.alpha, self.param_broadcast) *
-             (inputs - K.abs(inputs)) * 0.5)
+      neg = (
+          K.pattern_broadcast(self.alpha, self.param_broadcast) *
+          (inputs - K.abs(inputs)) * 0.5)
     else:
       neg = -self.alpha * K.relu(-inputs)
     return pos + neg
@@ -153,7 +161,12 @@ class PReLU(Layer):
     base_config = super(PReLU, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    return input_shape
 
+
+@tf_export('keras.layers.ELU')
 class ELU(Layer):
   """Exponential Linear Unit.
 
@@ -187,7 +200,12 @@ class ELU(Layer):
     base_config = super(ELU, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
 
+@tf_export('keras.layers.ThresholdedReLU')
 class ThresholdedReLU(Layer):
   """Thresholded Rectified Linear Unit.
 
@@ -214,9 +232,47 @@ class ThresholdedReLU(Layer):
     self.theta = K.cast_to_floatx(theta)
 
   def call(self, inputs, mask=None):
-    return inputs * K.cast(inputs > self.theta, K.floatx())
+    return inputs * K.cast(K.greater(inputs, self.theta), K.floatx())
 
   def get_config(self):
     config = {'theta': float(self.theta)}
     base_config = super(ThresholdedReLU, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
+
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
+
+@tf_export('keras.layers.Softmax')
+class Softmax(Layer):
+  """Softmax activation function.
+
+  Input shape:
+      Arbitrary. Use the keyword argument `input_shape`
+      (tuple of integers, does not include the samples axis)
+      when using this layer as the first layer in a model.
+
+  Output shape:
+      Same shape as the input.
+
+  Arguments:
+      axis: Integer, axis along which the softmax normalization is applied.
+  """
+
+  def __init__(self, axis=-1, **kwargs):
+    super(Softmax, self).__init__(**kwargs)
+    self.supports_masking = True
+    self.axis = axis
+
+  def call(self, inputs):
+    return activations.softmax(inputs, axis=self.axis)
+
+  def get_config(self):
+    config = {'axis': self.axis}
+    base_config = super(Softmax, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    return input_shape
diff --git a/tensorflow/python/keras/_impl/keras/layers/advanced_activations_test.py b/tensorflow/python/keras/_impl/keras/layers/advanced_activations_test.py
index 91efab30edf99901b25dc0085b7d49e70d1b6d6d..343b7949accf3f0c9ddc5245910aa5faad8335c6 100644
--- a/tensorflow/python/keras/_impl/keras/layers/advanced_activations_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/advanced_activations_test.py
@@ -56,6 +56,12 @@ class AdvancedActivationsTest(test.TestCase):
                                kwargs={'theta': 0.5},
                                input_shape=(2, 3, 4))
 
+  def test_softmax(self):
+    with self.test_session():
+      testing_utils.layer_test(keras.layers.Softmax,
+                               kwargs={'axis': 1},
+                               input_shape=(2, 3, 4))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional.py b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
index 1cbae9126317479c808730ad89e86d42ae201bc6..bc43451114a0c2396b687a7734bb48391139a914 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
@@ -38,8 +38,10 @@ from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling3D
 # pylint: enable=unused-import
 from tensorflow.python.keras._impl.keras.utils import conv_utils
 from tensorflow.python.layers import convolutional as tf_convolutional_layers
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.layers.Conv1D', 'keras.layers.Convolution1D')
 class Conv1D(tf_convolutional_layers.Conv1D, Layer):
   """1D convolution layer (e.g. temporal convolution).
 
@@ -153,6 +155,7 @@ class Conv1D(tf_convolutional_layers.Conv1D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Conv2D', 'keras.layers.Convolution2D')
 class Conv2D(tf_convolutional_layers.Conv2D, Layer):
   """2D convolution layer (e.g. spatial convolution over images).
 
@@ -286,6 +289,7 @@ class Conv2D(tf_convolutional_layers.Conv2D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Conv3D', 'keras.layers.Convolution3D')
 class Conv3D(tf_convolutional_layers.Conv3D, Layer):
   """3D convolution layer (e.g. spatial convolution over volumes).
 
@@ -426,6 +430,8 @@ class Conv3D(tf_convolutional_layers.Conv3D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Conv2DTranspose',
+           'keras.layers.Convolution2DTranspose')
 class Conv2DTranspose(tf_convolutional_layers.Conv2DTranspose, Layer):
   """Transposed convolution layer (sometimes called Deconvolution).
 
@@ -563,7 +569,9 @@ class Conv2DTranspose(tf_convolutional_layers.Conv2DTranspose, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-class Conv3DTranspose(tf_convolutional_layers.Conv3D, Layer):
+@tf_export('keras.layers.Conv3DTranspose',
+           'keras.layers.Convolution3DTranspose')
+class Conv3DTranspose(tf_convolutional_layers.Conv3DTranspose, Layer):
   """Transposed convolution layer (sometimes called Deconvolution).
 
   The need for transposed convolutions generally arises
@@ -711,6 +719,148 @@ class Conv3DTranspose(tf_convolutional_layers.Conv3D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.SeparableConv1D',
+           'keras.layers.SeparableConvolution1D')
+class SeparableConv1D(tf_convolutional_layers.SeparableConv1D, Layer):
+  """Depthwise separable 1D convolution.
+
+  This layer performs a depthwise convolution that acts separately on
+  channels, followed by a pointwise convolution that mixes channels.
+  If `use_bias` is True and a bias initializer is provided,
+  it adds a bias vector to the output.
+  It then optionally applies an activation function to produce the final output.
+
+  Arguments:
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: A single integer specifying the spatial
+      dimensions of the filters.
+    strides: A single integer specifying the strides
+      of the convolution.
+      Specifying any `stride` value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, length, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, length)`.
+    dilation_rate: A single integer, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    depth_multiplier: The number of depthwise convolution output channels for
+      each input channel. The total number of depthwise convolution output
+      channels will be equal to `num_filters_in * depth_multiplier`.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    use_bias: Boolean, whether the layer uses a bias.
+    depthwise_initializer: An initializer for the depthwise convolution kernel.
+    pointwise_initializer: An initializer for the pointwise convolution kernel.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
+    depthwise_regularizer: Optional regularizer for the depthwise
+      convolution kernel.
+    pointwise_regularizer: Optional regularizer for the pointwise
+      convolution kernel.
+    bias_regularizer: Optional regularizer for the bias vector.
+    activity_regularizer: Optional regularizer function for the output.
+    depthwise_constraint: Optional projection function to be applied to the
+        depthwise kernel after being updated by an `Optimizer` (e.g. used for
+        norm constraints or value constraints for layer weights). The function
+        must take as input the unprojected variable and must return the
+        projected variable (which must have the same shape). Constraints are
+        not safe to use when doing asynchronous distributed training.
+    pointwise_constraint: Optional projection function to be applied to the
+        pointwise kernel after being updated by an `Optimizer`.
+    bias_constraint: Optional projection function to be applied to the
+        bias after being updated by an `Optimizer`.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self,
+               filters,
+               kernel_size,
+               strides=1,
+               padding='valid',
+               data_format=None,
+               dilation_rate=1,
+               depth_multiplier=1,
+               activation=None,
+               use_bias=True,
+               depthwise_initializer='glorot_uniform',
+               pointwise_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               depthwise_regularizer=None,
+               pointwise_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               depthwise_constraint=None,
+               pointwise_constraint=None,
+               bias_constraint=None,
+               **kwargs):
+    if data_format is None:
+      data_format = K.image_data_format()
+    super(SeparableConv1D, self).__init__(
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        activation=activations.get(activation),
+        use_bias=use_bias,
+        depthwise_initializer=initializers.get(depthwise_initializer),
+        pointwise_initializer=initializers.get(pointwise_initializer),
+        bias_initializer=initializers.get(bias_initializer),
+        depthwise_regularizer=regularizers.get(depthwise_regularizer),
+        pointwise_regularizer=regularizers.get(pointwise_regularizer),
+        bias_regularizer=regularizers.get(bias_regularizer),
+        activity_regularizer=regularizers.get(activity_regularizer),
+        depthwise_constraint=constraints.get(depthwise_constraint),
+        pointwise_constraint=constraints.get(pointwise_constraint),
+        bias_constraint=constraints.get(bias_constraint),
+        **kwargs)
+
+  def get_config(self):
+    config = {
+        'filters': self.filters,
+        'kernel_size': self.kernel_size,
+        'strides': self.strides,
+        'padding': self.padding,
+        'data_format': self.data_format,
+        'dilation_rate': self.dilation_rate,
+        'activation': activations.serialize(self.activation),
+        'use_bias': self.use_bias,
+        'depthwise_initializer':
+            initializers.serialize(self.depthwise_initializer),
+        'pointwise_initializer':
+            initializers.serialize(self.pointwise_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'depthwise_regularizer':
+            regularizers.serialize(self.depthwise_regularizer),
+        'pointwise_regularizer':
+            regularizers.serialize(self.pointwise_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'depthwise_constraint':
+            constraints.serialize(self.depthwise_constraint),
+        'pointwise_constraint':
+            constraints.serialize(self.pointwise_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint)
+    }
+    base_config = super(SeparableConv1D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+@tf_export('keras.layers.SeparableConv2D',
+           'keras.layers.SeparableConvolution2D')
 class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
   """Depthwise separable 2D convolution.
 
@@ -874,6 +1024,7 @@ class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.UpSampling1D')
 class UpSampling1D(Layer):
   """Upsampling layer for 1D inputs.
 
@@ -894,7 +1045,7 @@ class UpSampling1D(Layer):
     self.size = int(size)
     self.input_spec = InputSpec(ndim=3)
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     size = self.size * input_shape[1] if input_shape[1] is not None else None
     return tensor_shape.TensorShape([input_shape[0], size, input_shape[2]])
@@ -909,6 +1060,7 @@ class UpSampling1D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.UpSampling2D')
 class UpSampling2D(Layer):
   """Upsampling layer for 2D inputs.
 
@@ -950,7 +1102,7 @@ class UpSampling2D(Layer):
     self.size = conv_utils.normalize_tuple(size, 2, 'size')
     self.input_spec = InputSpec(ndim=4)
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_first':
       height = self.size[0] * input_shape[
@@ -976,6 +1128,7 @@ class UpSampling2D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.UpSampling3D')
 class UpSampling3D(Layer):
   """Upsampling layer for 3D inputs.
 
@@ -1017,7 +1170,7 @@ class UpSampling3D(Layer):
     self.input_spec = InputSpec(ndim=5)
     super(UpSampling3D, self).__init__(**kwargs)
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_first':
       dim1 = self.size[0] * input_shape[
@@ -1048,6 +1201,7 @@ class UpSampling3D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.ZeroPadding1D')
 class ZeroPadding1D(Layer):
   """Zero-padding layer for 1D input (e.g. temporal sequence).
 
@@ -1072,7 +1226,7 @@ class ZeroPadding1D(Layer):
     self.padding = conv_utils.normalize_tuple(padding, 2, 'padding')
     self.input_spec = InputSpec(ndim=3)
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     if input_shape[1] is not None:
       length = input_shape[1] + self.padding[0] + self.padding[1]
     else:
@@ -1088,6 +1242,7 @@ class ZeroPadding1D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.ZeroPadding2D')
 class ZeroPadding2D(Layer):
   """Zero-padding layer for 2D input (e.g. picture).
 
@@ -1154,7 +1309,7 @@ class ZeroPadding2D(Layer):
                        'Found: ' + str(padding))
     self.input_spec = InputSpec(ndim=4)
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_first':
       if input_shape[2] is not None:
@@ -1189,6 +1344,7 @@ class ZeroPadding2D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.ZeroPadding3D')
 class ZeroPadding3D(Layer):
   """Zero-padding layer for 3D data (spatial or spatio-temporal).
 
@@ -1263,7 +1419,7 @@ class ZeroPadding3D(Layer):
           'Found: ' + str(padding))
     self.input_spec = InputSpec(ndim=5)
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_first':
       if input_shape[2] is not None:
@@ -1306,6 +1462,7 @@ class ZeroPadding3D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Cropping1D')
 class Cropping1D(Layer):
   """Cropping layer for 1D input (e.g. temporal sequence).
 
@@ -1330,7 +1487,7 @@ class Cropping1D(Layer):
     self.cropping = conv_utils.normalize_tuple(cropping, 2, 'cropping')
     self.input_spec = InputSpec(ndim=3)
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if input_shape[1] is not None:
       length = input_shape[1] - self.cropping[0] - self.cropping[1]
@@ -1350,6 +1507,7 @@ class Cropping1D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Cropping2D')
 class Cropping2D(Layer):
   """Cropping layer for 2D input (e.g. picture).
 
@@ -1428,7 +1586,7 @@ class Cropping2D(Layer):
                        'Found: ' + str(cropping))
     self.input_spec = InputSpec(ndim=4)
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     # pylint: disable=invalid-unary-operand-type
     if self.data_format == 'channels_first':
@@ -1481,6 +1639,7 @@ class Cropping2D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Cropping3D')
 class Cropping3D(Layer):
   """Cropping layer for 3D data (e.g.
 
@@ -1560,7 +1719,7 @@ class Cropping3D(Layer):
           'Found: ' + str(cropping))
     self.input_spec = InputSpec(ndim=5)
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     # pylint: disable=invalid-unary-operand-type
     if self.data_format == 'channels_first':
@@ -1663,6 +1822,7 @@ class Cropping3D(Layer):
 Convolution1D = Conv1D
 Convolution2D = Conv2D
 Convolution3D = Conv3D
+SeparableConvolution1D = SeparableConv1D
 SeparableConvolution2D = SeparableConv2D
 Convolution2DTranspose = Conv2DTranspose
 Convolution3DTranspose = Conv3DTranspose
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
index c88122ce1887c4cb93efadc82f504792c862941d..a04c3a24bfb1d7b4dc6e388ebee14147b3f89461 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
@@ -20,15 +20,16 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import activations
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 from tensorflow.python.keras._impl.keras.layers.recurrent import Recurrent
 from tensorflow.python.keras._impl.keras.utils import conv_utils
+from tensorflow.python.util.tf_export import tf_export
 
 
 class ConvRecurrent2D(Recurrent):
@@ -127,10 +128,10 @@ class ConvRecurrent2D(Recurrent):
     self.input_spec = [InputSpec(ndim=5)]
     self.state_spec = None
 
-  def _compute_output_shape(self, input_shape):
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_first':
       rows = input_shape[3]
       cols = input_shape[4]
@@ -151,30 +152,28 @@ class ConvRecurrent2D(Recurrent):
         dilation=self.dilation_rate[1])
     if self.return_sequences:
       if self.data_format == 'channels_first':
-        output_shape = [input_shape[0], input_shape[1],
-                        self.filters, rows, cols]
+        output_shape = (input_shape[0], input_shape[1], self.filters, rows,
+                        cols)
       elif self.data_format == 'channels_last':
-        output_shape = [input_shape[0], input_shape[1],
-                        rows, cols, self.filters]
+        output_shape = (input_shape[0], input_shape[1], rows, cols,
+                        self.filters)
     else:
       if self.data_format == 'channels_first':
-        output_shape = [input_shape[0], self.filters, rows, cols]
+        output_shape = (input_shape[0], self.filters, rows, cols)
       elif self.data_format == 'channels_last':
-        output_shape = [input_shape[0], rows, cols, self.filters]
+        output_shape = (input_shape[0], rows, cols, self.filters)
 
     if self.return_state:
       if self.data_format == 'channels_first':
-        output_shapes = [output_shape] + [(input_shape[0],
-                                           self.filters,
-                                           rows,
-                                           cols) for _ in range(2)]
+        output_shape = [output_shape] + [
+            (input_shape[0], self.filters, rows, cols) for _ in range(2)
+        ]
       elif self.data_format == 'channels_last':
-        output_shapes = [output_shape] + [(input_shape[0],
-                                           rows,
-                                           cols,
-                                           self.filters) for _ in range(2)]
-      return [tensor_shape.TensorShape(shape) for shape in output_shapes]
-    return tensor_shape.TensorShape(output_shape)
+        output_shape = [output_shape] + [
+            (input_shape[0], rows, cols, self.filters) for _ in range(2)
+        ]
+
+    return output_shape
 
   def get_config(self):
     config = {
@@ -192,6 +191,7 @@ class ConvRecurrent2D(Recurrent):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.ConvLSTM2D')
 class ConvLSTM2D(ConvRecurrent2D):
   """Convolutional LSTM.
 
@@ -294,11 +294,6 @@ class ConvLSTM2D(ConvRecurrent2D):
   Raises:
       ValueError: in case of invalid constructor arguments.
 
-  References:
-      - [Convolutional LSTM Network: A Machine Learning Approach for
-      Precipitation Nowcasting](http://arxiv.org/abs/1506.04214v1)
-      The current implementation does not include the feedback loop on the
-      cells output
   """
 
   def __init__(self,
@@ -338,7 +333,6 @@ class ConvLSTM2D(ConvRecurrent2D):
         return_sequences=return_sequences,
         go_backwards=go_backwards,
         stateful=stateful,
-        activity_regularizer=regularizers.get(activity_regularizer),
         **kwargs)
     self.activation = activations.get(activation)
     self.recurrent_activation = activations.get(recurrent_activation)
@@ -352,6 +346,7 @@ class ConvLSTM2D(ConvRecurrent2D):
     self.kernel_regularizer = regularizers.get(kernel_regularizer)
     self.recurrent_regularizer = regularizers.get(recurrent_regularizer)
     self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
 
     self.kernel_constraint = constraints.get(kernel_constraint)
     self.recurrent_constraint = constraints.get(recurrent_constraint)
@@ -361,13 +356,12 @@ class ConvLSTM2D(ConvRecurrent2D):
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
     self.state_spec = [InputSpec(ndim=4), InputSpec(ndim=4)]
 
+  @shape_type_conversion
   def build(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
-    input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
     batch_size = input_shape[0] if self.stateful else None
     self.input_spec[0] = InputSpec(shape=(batch_size, None) + input_shape[2:])
-
     if self.stateful:
       self.reset_states()
     else:
@@ -467,9 +461,9 @@ class ConvLSTM2D(ConvRecurrent2D):
                        'Got input shape: ' + str(input_shape))
 
     if self.return_state:
-      output_shape = tuple(self._compute_output_shape(input_shape)[0].as_list())
+      output_shape = tuple(self.compute_output_shape(input_shape)[0].as_list())
     else:
-      output_shape = tuple(self._compute_output_shape(input_shape).as_list())
+      output_shape = tuple(self.compute_output_shape(input_shape).as_list())
     if self.return_sequences:
       output_shape = (input_shape[0],) + output_shape[2:]
     else:
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
index be7da6f2b409aa57e3f1328441f0e37ede924c11..39c9d4f0fb2751b0eef3b28f6d5b8cb0a93e22e5 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
@@ -311,6 +311,72 @@ class Conv3DTransposeTest(test.TestCase):
       self.assertEqual(layer.bias.constraint, b_constraint)
 
 
+class SeparableConv1DTest(test.TestCase):
+
+  def test_separable_conv_1d(self):
+    num_samples = 2
+    filters = 6
+    stack_size = 3
+    length = 7
+    strides = 1
+
+    for padding in ['valid', 'same']:
+      for multiplier in [1, 2]:
+        if padding == 'same' and strides != 1:
+          continue
+
+        with self.test_session(use_gpu=True):
+          testing_utils.layer_test(
+              keras.layers.SeparableConv1D,
+              kwargs={
+                  'filters': filters,
+                  'kernel_size': 3,
+                  'padding': padding,
+                  'strides': strides,
+                  'depth_multiplier': multiplier
+              },
+              input_shape=(num_samples, length, stack_size))
+
+  def test_separable_conv1d_regularizers(self):
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'depthwise_regularizer': 'l2',
+        'pointwise_regularizer': 'l2',
+        'bias_regularizer': 'l2',
+        'activity_regularizer': 'l2',
+        'strides': 1
+    }
+    with self.test_session(use_gpu=True):
+      layer = keras.layers.SeparableConv1D(**kwargs)
+      layer.build((None, 5, 2))
+      self.assertEqual(len(layer.losses), 3)
+      layer(keras.backend.variable(np.ones((1, 5, 2))))
+      self.assertEqual(len(layer.losses), 4)
+
+  def test_separable_conv1d_constraints(self):
+    d_constraint = lambda x: x
+    p_constraint = lambda x: x
+    b_constraint = lambda x: x
+
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'pointwise_constraint': p_constraint,
+        'depthwise_constraint': d_constraint,
+        'bias_constraint': b_constraint,
+        'strides': 1
+    }
+    with self.test_session(use_gpu=True):
+      layer = keras.layers.SeparableConv1D(**kwargs)
+      layer.build((None, 5, 2))
+      self.assertEqual(layer.depthwise_kernel.constraint, d_constraint)
+      self.assertEqual(layer.pointwise_kernel.constraint, p_constraint)
+      self.assertEqual(layer.bias.constraint, b_constraint)
+
+
 class SeparableConv2DTest(test.TestCase):
 
   def test_separable_conv_2d(self):
diff --git a/tensorflow/python/keras/_impl/keras/layers/core.py b/tensorflow/python/keras/_impl/keras/layers/core.py
index 517129fab05a504245725032e715b624a3b975a7..50a197c80c3d97f47a071a24297301dddf78a27e 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core.py
@@ -23,6 +23,7 @@ import types as python_types
 
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import activations
 from tensorflow.python.keras._impl.keras import backend as K
@@ -36,8 +37,10 @@ from tensorflow.python.keras._impl.keras.utils.generic_utils import func_dump
 from tensorflow.python.keras._impl.keras.utils.generic_utils import func_load
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
 from tensorflow.python.layers import core as tf_core_layers
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.layers.Masking')
 class Masking(Layer):
   """Masks a sequence by using a mask value to skip timesteps.
 
@@ -79,12 +82,16 @@ class Masking(Layer):
         K.not_equal(inputs, self.mask_value), axis=-1, keepdims=True)
     return inputs * K.cast(boolean_mask, inputs.dtype)
 
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
   def get_config(self):
     config = {'mask_value': self.mask_value}
     base_config = super(Masking, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Dropout')
 class Dropout(tf_core_layers.Dropout, Layer):
   """Applies Dropout to the input.
 
@@ -104,19 +111,20 @@ class Dropout(tf_core_layers.Dropout, Layer):
   """
 
   def __init__(self, rate, noise_shape=None, seed=None, **kwargs):
-    self.supports_masking = True
     # Inheritance call order:
     # 1) tf.layers.Dropout, 2) keras.layers.Layer, 3) tf.layers.Layer
     super(Dropout, self).__init__(rate=rate,
                                   noise_shape=noise_shape,
                                   seed=seed,
                                   **kwargs)
+    self.supports_masking = True
 
   def call(self, inputs, training=None):
     if training is None:
       training = K.learning_phase()
     output = super(Dropout, self).call(inputs, training=training)
-    if training is K.learning_phase():
+    # EagerTensor object has no attribute _uses_learning_phase
+    if not context.in_eager_mode() and training is K.learning_phase():
       output._uses_learning_phase = True  # pylint: disable=protected-access
     return output
 
@@ -130,6 +138,7 @@ class Dropout(tf_core_layers.Dropout, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.SpatialDropout1D')
 class SpatialDropout1D(Dropout):
   """Spatial 1D version of Dropout.
 
@@ -166,6 +175,7 @@ class SpatialDropout1D(Dropout):
     return noise_shape
 
 
+@tf_export('keras.layers.SpatialDropout2D')
 class SpatialDropout2D(Dropout):
   """Spatial 2D version of Dropout.
 
@@ -219,6 +229,7 @@ class SpatialDropout2D(Dropout):
       return (input_shape[0], 1, 1, input_shape[3])
 
 
+@tf_export('keras.layers.SpatialDropout3D')
 class SpatialDropout3D(Dropout):
   """Spatial 3D version of Dropout.
 
@@ -271,6 +282,7 @@ class SpatialDropout3D(Dropout):
       return (input_shape[0], 1, 1, 1, input_shape[4])
 
 
+@tf_export('keras.layers.Activation')
 class Activation(Layer):
   """Applies an activation function to an output.
 
@@ -295,12 +307,16 @@ class Activation(Layer):
   def call(self, inputs):
     return self.activation(inputs)
 
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
   def get_config(self):
     config = {'activation': activations.serialize(self.activation)}
     base_config = super(Activation, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Reshape')
 class Reshape(Layer):
   """Reshapes an output to a certain shape.
 
@@ -385,7 +401,7 @@ class Reshape(Layer):
       raise ValueError(msg)
     return output_shape
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if None in input_shape[1:]:
       output_shape = [input_shape[0]]
@@ -406,6 +422,7 @@ class Reshape(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Permute')
 class Permute(Layer):
   """Permutes the dimensions of the input according to a given pattern.
 
@@ -441,7 +458,7 @@ class Permute(Layer):
     self.dims = tuple(dims)
     self.input_spec = InputSpec(ndim=len(self.dims) + 1)
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     output_shape = copy.copy(input_shape)
     for i, dim in enumerate(self.dims):
@@ -458,6 +475,7 @@ class Permute(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Flatten')
 class Flatten(tf_core_layers.Flatten, Layer):
   """Flattens the input. Does not affect the batch size.
 
@@ -477,6 +495,7 @@ class Flatten(tf_core_layers.Flatten, Layer):
   pass
 
 
+@tf_export('keras.layers.RepeatVector')
 class RepeatVector(Layer):
   """Repeats the input n times.
 
@@ -507,7 +526,7 @@ class RepeatVector(Layer):
     self.n = n
     self.input_spec = InputSpec(ndim=2)
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     return tensor_shape.TensorShape([input_shape[0], self.n, input_shape[1]])
 
@@ -520,6 +539,7 @@ class RepeatVector(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Lambda')
 class Lambda(Layer):
   """Wraps arbitrary expression as a `Layer` object.
 
@@ -547,8 +567,19 @@ class Lambda(Layer):
   Arguments:
       function: The function to be evaluated.
           Takes input tensor as first argument.
+      output_shape: Expected output shape from function.
+            This argument can be inferred if not explicitly provided.
+            Can be a tuple or function.
+            If a tuple, it only specifies the first dimension onward;
+                 sample dimension is assumed either the same as the input:
+                 `output_shape = (input_shape[0], ) + output_shape`
+                 or, the input is `None` and
+                 the sample dimension is also `None`:
+                 `output_shape = (None, ) + output_shape`
+            If a function, it specifies the entire shape as a function of the
+            input shape: `output_shape = f(input_shape)`
       arguments: optional dictionary of keyword arguments to be passed
-          to the function.
+            to the function.
 
   Input shape:
       Arbitrary. Use the keyword argument input_shape
@@ -557,16 +588,52 @@ class Lambda(Layer):
 
   Output shape:
       Specified by `output_shape` argument
-      (or auto-inferred when using TensorFlow).
   """
 
-  def __init__(self, function, mask=None, arguments=None, **kwargs):
+  def __init__(self, function, output_shape=None, mask=None, arguments=None,
+               **kwargs):
     super(Lambda, self).__init__(**kwargs)
     self.function = function
     self.arguments = arguments if arguments else {}
     if mask is not None:
       self.supports_masking = True
     self.mask = mask
+    if output_shape is None:
+      self._output_shape = None
+    elif isinstance(output_shape, (tuple, list)):
+      self._output_shape = tuple(output_shape)
+    else:
+      if not callable(output_shape):
+        raise TypeError('In Lambda, `output_shape` '
+                        'must be a list, a tuple, or a function.')
+      self._output_shape = output_shape
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
+
+    if self._output_shape is None:
+      x = K.placeholder(shape=input_shape)
+      x = self.call(x)
+      if isinstance(x, list):
+        return [tensor_shape.TensorShape(K.int_shape(x_elem)) for x_elem in x]
+      else:
+        return tensor_shape.TensorShape(K.int_shape(x))
+    elif isinstance(self._output_shape, (tuple, list)):
+      if isinstance(input_shape, list):
+        num_samples = input_shape[0][0]
+      else:
+        num_samples = input_shape[0] if input_shape else None
+      return tensor_shape.TensorShape((num_samples,) +
+                                      tuple(self._output_shape))
+    else:
+      shape = self._output_shape(input_shape)
+      if not isinstance(shape, (list, tuple)):
+        raise ValueError(
+            '`output_shape` function must return a tuple or a list of tuples.')
+      if isinstance(shape, list):
+        if isinstance(shape[0], int) or shape[0] is None:
+          shape = tuple(shape)
+      return tensor_shape.TensorShape(shape)
 
   def call(self, inputs, mask=None):
     arguments = self.arguments
@@ -587,9 +654,21 @@ class Lambda(Layer):
       function = self.function.__name__
       function_type = 'function'
 
+    if isinstance(self._output_shape, python_types.LambdaType):
+      output_shape = func_dump(self._output_shape)
+      output_shape_type = 'lambda'
+    elif callable(self._output_shape):
+      output_shape = self._output_shape.__name__
+      output_shape_type = 'function'
+    else:
+      output_shape = self._output_shape
+      output_shape_type = 'raw'
+
     config = {
         'function': function,
         'function_type': function_type,
+        'output_shape': output_shape,
+        'output_shape_type': output_shape_type,
         'arguments': self.arguments
     }
     base_config = super(Lambda, self).get_config()
@@ -614,6 +693,19 @@ class Lambda(Layer):
     else:
       raise TypeError('Unknown function type:', function_type)
 
+    output_shape_type = config.pop('output_shape_type')
+    if output_shape_type == 'function':
+      # Simple lookup in custom objects
+      output_shape = deserialize_keras_object(
+          config['output_shape'],
+          custom_objects=custom_objects,
+          printable_module_name='output_shape function in Lambda layer')
+    elif output_shape_type == 'lambda':
+      # Unsafe deserialization from bytecode
+      output_shape = func_load(config['output_shape'], globs=globs)
+    else:
+      output_shape = config['output_shape']
+
     # If arguments were numpy array, they have been saved as
     # list. We need to recover the ndarray
     if 'arguments' in config:
@@ -625,9 +717,11 @@ class Lambda(Layer):
             config['arguments'][key] = np.array(arg_dict['value'])
 
     config['function'] = function
+    config['output_shape'] = output_shape
     return cls(**config)
 
 
+@tf_export('keras.layers.Dense')
 class Dense(tf_core_layers.Dense, Layer):
   """Just your regular densely-connected NN layer.
 
@@ -732,6 +826,7 @@ class Dense(tf_core_layers.Dense, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.ActivityRegularization')
 class ActivityRegularization(Layer):
   """Layer that applies an update to the cost function based input activity.
 
@@ -755,6 +850,9 @@ class ActivityRegularization(Layer):
     self.l1 = l1
     self.l2 = l2
 
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
   def get_config(self):
     config = {'l1': self.l1, 'l2': self.l2}
     base_config = super(ActivityRegularization, self).get_config()
diff --git a/tensorflow/python/keras/_impl/keras/layers/core_test.py b/tensorflow/python/keras/_impl/keras/layers/core_test.py
index dd768dc268ef6b39f64b522fd88393610c832287..bdb99c91c289cf808fec7b891376dbfcf5504aca 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core_test.py
@@ -47,6 +47,11 @@ class CoreLayersTest(test.TestCase):
                   'noise_shape': [3, 1]},
           input_shape=(3, 2))
 
+    # https://github.com/tensorflow/tensorflow/issues/14819
+    with self.test_session():
+      dropout = keras.layers.Dropout(0.5)
+      self.assertEqual(True, dropout.supports_masking)
+
     with self.test_session():
       testing_utils.layer_test(
           keras.layers.SpatialDropout1D,
@@ -220,6 +225,34 @@ class CoreLayersTest(test.TestCase):
       self.assertEqual(1, len(layer.losses))
       _ = layer.get_config()
 
+  def test_lambda_output_shape(self):
+    with self.test_session():
+      l = keras.layers.Lambda(lambda x: x + 1, output_shape=(1, 1))
+      l(keras.backend.variable(np.ones((1, 1))))
+      self.assertEqual((1, 1), l.get_config()['output_shape'])
+
+  def test_lambda_output_shape_function(self):
+    def get_output_shape(input_shape):
+      return 1 * input_shape
+
+    with self.test_session():
+      l = keras.layers.Lambda(lambda x: x + 1, output_shape=get_output_shape)
+      l(keras.backend.variable(np.ones((1, 1))))
+      self.assertEqual('lambda', l.get_config()['output_shape_type'])
+
+  def test_lambda_config_serialization(self):
+    with self.test_session():
+      # test serialization with output_shape and output_shape_type
+      layer = keras.layers.Lambda(lambda x: x + 1, output_shape=(1, 1))
+      layer(keras.backend.variable(np.ones((1, 1))))
+      config = layer.get_config()
+      layer = keras.layers.deserialize({
+          'class_name': 'Lambda',
+          'config': config
+      })
+
+      layer = keras.layers.Lambda.from_config(config)
 
 if __name__ == '__main__':
   test.main()
+
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings.py b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
index 3ac5e5661e192f9583b228df05a5e37545d388fe..ca92899a455cd28a756e9efff63655d7c43c9f45 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
@@ -18,14 +18,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.layers.Embedding')
 class Embedding(Layer):
   """Turns positive integers (indexes) into dense vectors of fixed size.
 
@@ -58,13 +60,13 @@ class Embedding(Layer):
     output_dim: int >= 0. Dimension of the dense embedding.
     embeddings_initializer: Initializer for the `embeddings` matrix.
     embeddings_regularizer: Regularizer function applied to
-          the `embeddings` matrix.
+        the `embeddings` matrix.
     embeddings_constraint: Constraint function applied to
-          the `embeddings` matrix.
+        the `embeddings` matrix.
     mask_zero: Whether or not the input value 0 is a special "padding"
         value that should be masked out.
-        This is useful when using recurrent layers,
-        which may take variable length inputs.
+        This is useful when using recurrent layers
+        which may take variable length input.
         If this is `True` then all subsequent layers
         in the model need to support masking or an exception will be raised.
         If mask_zero is set to True, as a consequence, index 0 cannot be
@@ -81,9 +83,6 @@ class Embedding(Layer):
   Output shape:
       3D tensor with shape: `(batch_size, sequence_length, output_dim)`.
 
-  References:
-      - [A Theoretically Grounded Application of Dropout in Recurrent Neural
-        Networks](http://arxiv.org/abs/1512.05287)
   """
 
   def __init__(self,
@@ -101,19 +100,19 @@ class Embedding(Layer):
         kwargs['input_shape'] = (input_length,)
       else:
         kwargs['input_shape'] = (None,)
-    super(Embedding, self).__init__(
-        activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
+    super(Embedding, self).__init__(**kwargs)
 
     self.input_dim = input_dim
     self.output_dim = output_dim
     self.embeddings_initializer = initializers.get(embeddings_initializer)
     self.embeddings_regularizer = regularizers.get(embeddings_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
     self.embeddings_constraint = constraints.get(embeddings_constraint)
     self.mask_zero = mask_zero
     self.input_length = input_length
 
+  @shape_type_conversion
   def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
     self.embeddings = self.add_weight(
         shape=(self.input_dim, self.output_dim),
         initializer=self.embeddings_initializer,
@@ -129,10 +128,10 @@ class Embedding(Layer):
     else:
       return K.not_equal(inputs, 0)
 
-  def _compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
     if self.input_length is None:
-      return tensor_shape.TensorShape(input_shape + [self.output_dim])
+      return input_shape + (self.output_dim,)
     else:
       # input_length can be tuple if input is 3D or higher
       if isinstance(self.input_length, (list, tuple)):
@@ -149,8 +148,7 @@ class Embedding(Layer):
                        (str(self.input_length), str(input_shape)))
           elif s1 is None:
             in_lens[i] = s2
-      return tensor_shape.TensorShape(
-          (input_shape[0],) + tuple(in_lens) + (self.output_dim,))
+      return (input_shape[0],) + tuple(in_lens) + (self.output_dim,)
 
   def call(self, inputs):
     if K.dtype(inputs) != 'int32':
diff --git a/tensorflow/python/keras/_impl/keras/layers/local.py b/tensorflow/python/keras/_impl/keras/layers/local.py
index bf1d495b9dda6302f95094fbda40fc5a6b5f79ed..798ac236a30a438107caed939f7650f51b62ef42 100644
--- a/tensorflow/python/keras/_impl/keras/layers/local.py
+++ b/tensorflow/python/keras/_impl/keras/layers/local.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import activations
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
@@ -26,9 +25,12 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 from tensorflow.python.keras._impl.keras.utils import conv_utils
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.layers.LocallyConnected1D')
 class LocallyConnected1D(Layer):
   """Locally-connected layer for 1D inputs.
 
@@ -98,8 +100,7 @@ class LocallyConnected1D(Layer):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
-    super(LocallyConnected1D, self).__init__(
-        activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
+    super(LocallyConnected1D, self).__init__(**kwargs)
     self.filters = filters
     self.kernel_size = conv_utils.normalize_tuple(kernel_size, 1, 'kernel_size')
     self.strides = conv_utils.normalize_tuple(strides, 1, 'strides')
@@ -114,12 +115,13 @@ class LocallyConnected1D(Layer):
     self.bias_initializer = initializers.get(bias_initializer)
     self.kernel_regularizer = regularizers.get(kernel_regularizer)
     self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
     self.kernel_constraint = constraints.get(kernel_constraint)
     self.bias_constraint = constraints.get(bias_constraint)
     self.input_spec = InputSpec(ndim=3)
 
+  @shape_type_conversion
   def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
     input_dim = input_shape[2]
     if input_dim is None:
       raise ValueError('Axis 2 of input should be fully-defined. '
@@ -146,15 +148,14 @@ class LocallyConnected1D(Layer):
     self.input_spec = InputSpec(ndim=3, axes={2: input_dim})
     self.built = True
 
-  def _compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
     length = conv_utils.conv_output_length(input_shape[1], self.kernel_size[0],
                                            self.padding, self.strides[0])
-    return tensor_shape.TensorShape([input_shape[0], length, self.filters])
+    return (input_shape[0], length, self.filters)
 
   def call(self, inputs):
     output = K.local_conv1d(inputs, self.kernel, self.kernel_size, self.strides)
-
     if self.use_bias:
       output = K.bias_add(output, self.bias)
     if self.activation is not None:
@@ -163,25 +164,38 @@ class LocallyConnected1D(Layer):
 
   def get_config(self):
     config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'filters':
+            self.filters,
+        'kernel_size':
+            self.kernel_size,
+        'strides':
+            self.strides,
+        'padding':
+            self.padding,
+        'activation':
+            activations.serialize(self.activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint)
     }
     base_config = super(LocallyConnected1D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.LocallyConnected2D')
 class LocallyConnected2D(Layer):
   """Locally-connected layer for 2D inputs.
 
@@ -273,8 +287,7 @@ class LocallyConnected2D(Layer):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
-    super(LocallyConnected2D, self).__init__(
-        activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
+    super(LocallyConnected2D, self).__init__(**kwargs)
     self.filters = filters
     self.kernel_size = conv_utils.normalize_tuple(kernel_size, 2, 'kernel_size')
     self.strides = conv_utils.normalize_tuple(strides, 2, 'strides')
@@ -289,12 +302,13 @@ class LocallyConnected2D(Layer):
     self.bias_initializer = initializers.get(bias_initializer)
     self.kernel_regularizer = regularizers.get(kernel_regularizer)
     self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
     self.kernel_constraint = constraints.get(kernel_constraint)
     self.bias_constraint = constraints.get(bias_constraint)
     self.input_spec = InputSpec(ndim=4)
 
+  @shape_type_conversion
   def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_last':
       input_row, input_col = input_shape[1:-1]
       input_filter = input_shape[3]
@@ -306,7 +320,6 @@ class LocallyConnected2D(Layer):
                        ' a LocallyConnected2D layer '
                        'should be fully-defined, but layer received '
                        'the inputs shape ' + str(input_shape))
-
     output_row = conv_utils.conv_output_length(input_row, self.kernel_size[0],
                                                self.padding, self.strides[0])
     output_col = conv_utils.conv_output_length(input_col, self.kernel_size[1],
@@ -337,33 +350,30 @@ class LocallyConnected2D(Layer):
       self.input_spec = InputSpec(ndim=4, axes={-1: input_filter})
     self.built = True
 
-  def _compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
     if self.data_format == 'channels_first':
       rows = input_shape[2]
       cols = input_shape[3]
     elif self.data_format == 'channels_last':
       rows = input_shape[1]
       cols = input_shape[2]
+
     rows = conv_utils.conv_output_length(rows, self.kernel_size[0],
                                          self.padding, self.strides[0])
     cols = conv_utils.conv_output_length(cols, self.kernel_size[1],
                                          self.padding, self.strides[1])
 
     if self.data_format == 'channels_first':
-      return tensor_shape.TensorShape(
-          [input_shape[0], self.filters, rows, cols])
+      return (input_shape[0], self.filters, rows, cols)
     elif self.data_format == 'channels_last':
-      return tensor_shape.TensorShape(
-          [input_shape[0], rows, cols, self.filters])
+      return (input_shape[0], rows, cols, self.filters)
 
   def call(self, inputs):
-    output = K.local_conv2d(inputs,
-                            self.kernel,
-                            self.kernel_size,
-                            self.strides,
+    output = K.local_conv2d(inputs, self.kernel, self.kernel_size, self.strides,
                             (self.output_row, self.output_col),
                             self.data_format)
+
     if self.use_bias:
       output = K.bias_add(output, self.bias, data_format=self.data_format)
 
@@ -372,21 +382,34 @@ class LocallyConnected2D(Layer):
 
   def get_config(self):
     config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'filters':
+            self.filters,
+        'kernel_size':
+            self.kernel_size,
+        'strides':
+            self.strides,
+        'padding':
+            self.padding,
+        'data_format':
+            self.data_format,
+        'activation':
+            activations.serialize(self.activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint)
     }
     base_config = super(LocallyConnected2D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/python/keras/_impl/keras/layers/lstm_test.py b/tensorflow/python/keras/_impl/keras/layers/lstm_test.py
index 8d359bf17cdb80c98aeeed6d69e301962609ce59..deb1d7c0c685e51ed756cbcdd5aec81ee60b5f96 100644
--- a/tensorflow/python/keras/_impl/keras/layers/lstm_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/lstm_test.py
@@ -39,6 +39,23 @@ class LSTMLayerTest(test.TestCase):
                   'return_sequences': True},
           input_shape=(num_samples, timesteps, embedding_dim))
 
+  def test_static_shape_inference_LSTM(self):
+    # Github issue: 15165
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+
+    model = keras.models.Sequential()
+    inputs = keras.layers.Dense(embedding_dim,
+                                input_shape=(timesteps, embedding_dim))
+    model.add(inputs)
+    layer = keras.layers.LSTM(units, return_sequences=True)
+    model.add(layer)
+    outputs = model.layers[-1].output
+    self.assertEquals(outputs.get_shape().as_list(),
+                      [None, timesteps, units])
+
   def test_dynamic_behavior_LSTM(self):
     num_samples = 2
     timesteps = 3
diff --git a/tensorflow/python/keras/_impl/keras/layers/merge.py b/tensorflow/python/keras/_impl/keras/layers/merge.py
index 888be2736934c314474bdc9259498fa2b415a4db..cdf2878e83e32147d30d6b29742b7e9013a1facb 100644
--- a/tensorflow/python/keras/_impl/keras/layers/merge.py
+++ b/tensorflow/python/keras/_impl/keras/layers/merge.py
@@ -14,15 +14,16 @@
 # ==============================================================================
 # pylint: disable=not-callable
 # pylint: disable=redefined-builtin
-"""Layers can merge several input tensors into a single output tensor.
+"""Layers that can merge several inputs into one.
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine.topology import Layer
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
+from tensorflow.python.util.tf_export import tf_export
 
 
 class _Merge(Layer):
@@ -73,12 +74,13 @@ class _Merge(Layer):
         output_shape.append(i)
       else:
         if i != j:
-          raise ValueError('Operands could not be broadcast '
-                           'together with shapes ' + str(shape1) + ' ' +
-                           str(shape2))
+          raise ValueError(
+              'Operands could not be broadcast '
+              'together with shapes ' + str(shape1) + ' ' + str(shape2))
         output_shape.append(i)
     return tuple(output_shape)
 
+  @shape_type_conversion
   def build(self, input_shape):
     # Used purely for shape validation.
     if not isinstance(input_shape, list):
@@ -87,14 +89,13 @@ class _Merge(Layer):
       raise ValueError('A merge layer should be called '
                        'on a list of at least 2 inputs. '
                        'Got ' + str(len(input_shape)) + ' inputs.')
-    input_shape = [tensor_shape.TensorShape(s).as_list() for s in input_shape]
     batch_sizes = [s[0] for s in input_shape if s is not None]
     batch_sizes = set(batch_sizes)
     batch_sizes -= set([None])
     if len(batch_sizes) > 1:
-      raise ValueError('Can not merge tensors with different '
-                       'batch sizes. Got tensors with shapes : ' +
-                       str(input_shape))
+      raise ValueError(
+          'Can not merge tensors with different '
+          'batch sizes. Got tensors with shapes : ' + str(input_shape))
     if input_shape[0] is None:
       output_shape = None
     else:
@@ -111,9 +112,10 @@ class _Merge(Layer):
       self._reshape_required = False
     else:
       self._reshape_required = True
-    self.built = True
 
   def call(self, inputs):
+    if not isinstance(inputs, list):
+      raise ValueError('A merge layer should be called ' 'on a list of inputs.')
     if self._reshape_required:
       reshaped_inputs = []
       input_ndims = list(map(K.ndim, inputs))
@@ -172,7 +174,8 @@ class _Merge(Layer):
     else:
       return self._merge_function(inputs)
 
-  def _compute_output_shape(self, input_shape):
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
     if input_shape[0] is None:
       output_shape = None
     else:
@@ -208,12 +211,29 @@ class _Merge(Layer):
     return K.all(K.concatenate(masks, axis=0), axis=0, keepdims=False)
 
 
+@tf_export('keras.layers.Add')
 class Add(_Merge):
   """Layer that adds a list of inputs.
 
   It takes as input a list of tensors,
   all of the same shape, and returns
   a single tensor (also of the same shape).
+
+  Examples:
+
+  ```python
+      import keras
+
+      input1 = keras.layers.Input(shape=(16,))
+      x1 = keras.layers.Dense(8, activation='relu')(input1)
+      input2 = keras.layers.Input(shape=(32,))
+      x2 = keras.layers.Dense(8, activation='relu')(input2)
+      added = keras.layers.Add()([x1, x2])  # equivalent to added =
+      keras.layers.add([x1, x2])
+
+      out = keras.layers.Dense(4)(added)
+      model = keras.models.Model(inputs=[input1, input2], outputs=out)
+  ```
   """
 
   def _merge_function(self, inputs):
@@ -247,13 +267,21 @@ class Subtract(_Merge):
   ```
   """
 
+  @shape_type_conversion
+  def build(self, input_shape):
+    super(Subtract, self).build(input_shape)
+    if len(input_shape) != 2:
+      raise ValueError('A `Subtract` layer should be called '
+                       'on exactly 2 inputs')
+
   def _merge_function(self, inputs):
     if len(inputs) != 2:
-      raise ValueError('`Subtract` layer should be called '
-                       'on exactly 2 inputs. Received: %s' % inputs)
+      raise ValueError('A `Subtract` layer should be called '
+                       'on exactly 2 inputs')
     return inputs[0] - inputs[1]
 
 
+@tf_export('keras.layers.Multiply')
 class Multiply(_Merge):
   """Layer that multiplies (element-wise) a list of inputs.
 
@@ -269,6 +297,7 @@ class Multiply(_Merge):
     return output
 
 
+@tf_export('keras.layers.Average')
 class Average(_Merge):
   """Layer that averages a list of inputs.
 
@@ -284,6 +313,7 @@ class Average(_Merge):
     return output / len(inputs)
 
 
+@tf_export('keras.layers.Maximum')
 class Maximum(_Merge):
   """Layer that computes the maximum (element-wise) a list of inputs.
 
@@ -314,6 +344,7 @@ class Minimum(_Merge):
     return output
 
 
+@tf_export('keras.layers.Concatenate')
 class Concatenate(_Merge):
   """Layer that concatenates a list of inputs.
 
@@ -330,47 +361,43 @@ class Concatenate(_Merge):
     super(Concatenate, self).__init__(**kwargs)
     self.axis = axis
     self.supports_masking = True
+    self._reshape_required = False
 
+  @shape_type_conversion
   def build(self, input_shape):
     # Used purely for shape validation.
-    if not (isinstance(input_shape, list) and len(input_shape) > 1):
-      raise ValueError('`Concatenate` layer should be called '
-                       'on a list containing at least two inputs')
+    if not isinstance(input_shape, list) or len(input_shape) < 2:
+      raise ValueError('A `Concatenate` layer should be called '
+                       'on a list of at least 2 inputs')
     if all([shape is None for shape in input_shape]):
       return
-    reduced_inputs_shapes = [
-        tensor_shape.TensorShape(shape).as_list() for shape in input_shape
-    ]
+    reduced_inputs_shapes = [list(shape) for shape in input_shape]
     shape_set = set()
     for i in range(len(reduced_inputs_shapes)):
       del reduced_inputs_shapes[i][self.axis]
       shape_set.add(tuple(reduced_inputs_shapes[i]))
     if len(shape_set) > 1:
-      raise ValueError('`Concatenate` layer requires '
+      raise ValueError('A `Concatenate` layer requires '
                        'inputs with matching shapes '
                        'except for the concat axis. '
                        'Got inputs shapes: %s' % (input_shape))
-    self.built = True
 
-  def call(self, inputs):
-    if not isinstance(inputs, list):
-      raise ValueError('A `Concatenate` layer should be called '
-                       'on a list of inputs.')
+  def _merge_function(self, inputs):
     return K.concatenate(inputs, axis=self.axis)
 
-  def _compute_output_shape(self, input_shape):
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
     if not isinstance(input_shape, list):
       raise ValueError('A `Concatenate` layer should be called '
                        'on a list of inputs.')
     input_shapes = input_shape
-    output_shape = tensor_shape.TensorShape(input_shapes[0]).as_list()
+    output_shape = list(input_shapes[0])
     for shape in input_shapes[1:]:
-      shape = tensor_shape.TensorShape(shape).as_list()
       if output_shape[self.axis] is None or shape[self.axis] is None:
         output_shape[self.axis] = None
         break
       output_shape[self.axis] += shape[self.axis]
-    return tensor_shape.TensorShape(output_shape)
+    return tuple(output_shape)
 
   def compute_mask(self, inputs, mask=None):
     if mask is None:
@@ -390,7 +417,7 @@ class Concatenate(_Merge):
     masks = []
     for input_i, mask_i in zip(inputs, mask):
       if mask_i is None:
-        # Input is unmasked. Append all 1s to masks
+        # Input is unmasked. Append all 1s to masks,
         masks.append(K.ones_like(input_i, dtype='bool'))
       elif K.ndim(mask_i) < K.ndim(input_i):
         # Mask is smaller than the input, expand it
@@ -408,6 +435,7 @@ class Concatenate(_Merge):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.Dot')
 class Dot(_Merge):
   """Layer that computes a dot product between samples in two tensors.
 
@@ -441,14 +469,16 @@ class Dot(_Merge):
     self.axes = axes
     self.normalize = normalize
     self.supports_masking = True
+    self._reshape_required = False
 
+  @shape_type_conversion
   def build(self, input_shape):
     # Used purely for shape validation.
     if not isinstance(input_shape, list) or len(input_shape) != 2:
       raise ValueError('A `Dot` layer should be called '
                        'on a list of 2 inputs.')
-    shape1 = tensor_shape.TensorShape(input_shape[0]).as_list()
-    shape2 = tensor_shape.TensorShape(input_shape[1]).as_list()
+    shape1 = input_shape[0]
+    shape2 = input_shape[1]
     if shape1 is None or shape2 is None:
       return
     if isinstance(self.axes, int):
@@ -462,9 +492,10 @@ class Dot(_Merge):
       raise ValueError('Dimension incompatibility '
                        '%s != %s. ' % (shape1[axes[0]], shape2[axes[1]]) +
                        'Layer shapes: %s, %s' % (shape1, shape2))
-    self.built = True
 
-  def call(self, inputs):
+  def _merge_function(self, inputs):
+    if len(inputs) != 2:
+      raise ValueError('A `Dot` layer should be called ' 'on exactly 2 inputs')
     x1 = inputs[0]
     x2 = inputs[1]
     if isinstance(self.axes, int):
@@ -485,12 +516,13 @@ class Dot(_Merge):
     output = K.batch_dot(x1, x2, axes)
     return output
 
-  def _compute_output_shape(self, input_shape):
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
     if not isinstance(input_shape, list) or len(input_shape) != 2:
       raise ValueError('A `Dot` layer should be called '
                        'on a list of 2 inputs.')
-    shape1 = tensor_shape.TensorShape(input_shape[0]).as_list()
-    shape2 = tensor_shape.TensorShape(input_shape[1]).as_list()
+    shape1 = list(input_shape[0])
+    shape2 = list(input_shape[1])
     if isinstance(self.axes, int):
       if self.axes < 0:
         axes = [self.axes % len(shape1), self.axes % len(shape2)]
@@ -504,7 +536,7 @@ class Dot(_Merge):
     output_shape = shape1 + shape2
     if len(output_shape) == 1:
       output_shape += [1]
-    return tensor_shape.TensorShape(output_shape)
+    return tuple(output_shape)
 
   def compute_mask(self, inputs, mask=None):
     return None
@@ -518,6 +550,7 @@ class Dot(_Merge):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.add')
 def add(inputs, **kwargs):
   """Functional interface to the `Add` layer.
 
@@ -527,6 +560,21 @@ def add(inputs, **kwargs):
 
   Returns:
       A tensor, the sum of the inputs.
+
+  Examples:
+
+  ```python
+      import keras
+
+      input1 = keras.layers.Input(shape=(16,))
+      x1 = keras.layers.Dense(8, activation='relu')(input1)
+      input2 = keras.layers.Input(shape=(32,))
+      x2 = keras.layers.Dense(8, activation='relu')(input2)
+      added = keras.layers.add([x1, x2])
+
+      out = keras.layers.Dense(4)(added)
+      model = keras.models.Model(inputs=[input1, input2], outputs=out)
+  ```
   """
   return Add(**kwargs)(inputs)
 
@@ -559,6 +607,7 @@ def subtract(inputs, **kwargs):
   return Subtract(**kwargs)(inputs)
 
 
+@tf_export('keras.layers.multiply')
 def multiply(inputs, **kwargs):
   """Functional interface to the `Multiply` layer.
 
@@ -572,6 +621,7 @@ def multiply(inputs, **kwargs):
   return Multiply(**kwargs)(inputs)
 
 
+@tf_export('keras.layers.average')
 def average(inputs, **kwargs):
   """Functional interface to the `Average` layer.
 
@@ -585,6 +635,7 @@ def average(inputs, **kwargs):
   return Average(**kwargs)(inputs)
 
 
+@tf_export('keras.layers.maximum')
 def maximum(inputs, **kwargs):
   """Functional interface to the `Maximum` layer.
 
@@ -611,6 +662,7 @@ def minimum(inputs, **kwargs):
   return Minimum(**kwargs)(inputs)
 
 
+@tf_export('keras.layers.concatenate')
 def concatenate(inputs, axis=-1, **kwargs):
   """Functional interface to the `Concatenate` layer.
 
@@ -625,6 +677,7 @@ def concatenate(inputs, axis=-1, **kwargs):
   return Concatenate(axis=axis, **kwargs)(inputs)
 
 
+@tf_export('keras.layers.dot')
 def dot(inputs, axes, normalize=False, **kwargs):
   """Functional interface to the `Dot` layer.
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/merge_test.py b/tensorflow/python/keras/_impl/keras/layers/merge_test.py
index 1f34c367e4b7593a9a7c7d320cdc1d8d75c4959e..bb03dda1fc645222c1ced97cfce8d459586dd89d 100644
--- a/tensorflow/python/keras/_impl/keras/layers/merge_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/merge_test.py
@@ -188,9 +188,9 @@ class MergeLayersTest(test.TestCase):
       self.assertEqual(out.shape, (2, 1))
       self.assertAllClose(out, expected, atol=1e-4)
 
-      # test _compute_output_shape
+      # test compute_output_shape
       layer = keras.layers.Dot(axes=-1)
-      self.assertEqual(layer._compute_output_shape([(4, 5), (4, 5)]), (4, 1))
+      self.assertEqual(layer.compute_output_shape([(4, 5), (4, 5)]), (4, 1))
 
   def test_dot_errors(self):
     i1 = keras.layers.Input(shape=(4, 5))
@@ -206,7 +206,7 @@ class MergeLayersTest(test.TestCase):
       keras.layers.dot([i1, i2, i3], axes=-1)
     with self.assertRaises(ValueError):
       dot = keras.layers.Dot(1)
-      dot._compute_output_shape(1)
+      dot.compute_output_shape(1)
 
   def test_merge_subtract(self):
     i1 = keras.layers.Input(shape=(4, 5))
diff --git a/tensorflow/python/keras/_impl/keras/layers/noise.py b/tensorflow/python/keras/_impl/keras/layers/noise.py
index 9caa8b7024aa31118802a5bac5edac756dccc0f9..9010f4961585af58b7eae43dcd224e0c39606239 100644
--- a/tensorflow/python/keras/_impl/keras/layers/noise.py
+++ b/tensorflow/python/keras/_impl/keras/layers/noise.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Layers for regularization models via the addition of noise.
+"""Layers that operate regularization via the addition of noise.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -22,8 +22,11 @@ import numpy as np
 
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.layers.GaussianNoise')
 class GaussianNoise(Layer):
   """Apply additive zero-centered Gaussian noise.
 
@@ -64,7 +67,12 @@ class GaussianNoise(Layer):
     base_config = super(GaussianNoise, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    return input_shape
 
+
+@tf_export('keras.layers.GaussianDropout')
 class GaussianDropout(Layer):
   """Apply multiplicative 1-centered Gaussian noise.
 
@@ -83,10 +91,6 @@ class GaussianDropout(Layer):
   Output shape:
       Same shape as input.
 
-  References:
-      - [Dropout: A Simple Way to Prevent Neural Networks from Overfitting
-        Srivastava, Hinton, et al.
-        2014](http://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf)
   """
 
   def __init__(self, rate, **kwargs):
@@ -110,7 +114,12 @@ class GaussianDropout(Layer):
     base_config = super(GaussianDropout, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
 
+@tf_export('keras.layers.AlphaDropout')
 class AlphaDropout(Layer):
   """Applies Alpha Dropout to the input.
 
@@ -134,8 +143,6 @@ class AlphaDropout(Layer):
   Output shape:
       Same shape as input.
 
-  References:
-      - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
   """
 
   def __init__(self, rate, noise_shape=None, seed=None, **kwargs):
@@ -151,17 +158,24 @@ class AlphaDropout(Layer):
   def call(self, inputs, training=None):
     if 0. < self.rate < 1.:
       noise_shape = self._get_noise_shape(inputs)
-      alpha = 1.6732632423543772848170429916717
-      scale = 1.0507009873554804934193349852946
 
-      def dropped_inputs(inputs=inputs, rate=self.rate, seed=self.seed):
+      def dropped_inputs(inputs=inputs, rate=self.rate, seed=self.seed):  # pylint: disable=missing-docstring
+        alpha = 1.6732632423543772848170429916717
+        scale = 1.0507009873554804934193349852946
         alpha_p = -alpha * scale
-        kept_idx = K.greater_equal(K.random_uniform(noise_shape, seed=seed),
-                                   rate)
+
+        kept_idx = K.greater_equal(
+            K.random_uniform(noise_shape, seed=seed), rate)
         kept_idx = K.cast(kept_idx, K.floatx())
-        a = ((1 - rate) * (1 + rate * alpha_p ** 2)) ** -0.5
+
+        # Get affine transformation params
+        a = ((1 - rate) * (1 + rate * alpha_p**2))**-0.5
         b = -a * alpha_p * rate
+
+        # Apply mask
         x = inputs * kept_idx + alpha_p * (1 - kept_idx)
+
+        # Do affine transformation
         return a * x + b
 
       return K.in_train_phase(dropped_inputs, inputs, training=training)
@@ -171,3 +185,7 @@ class AlphaDropout(Layer):
     config = {'rate': self.rate}
     base_config = super(AlphaDropout, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
+
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    return input_shape
diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization.py b/tensorflow/python/keras/_impl/keras/layers/normalization.py
index 965ef70e6e6cb488aa4832462da4a2cb43e964a6..0dedd5e8daa2974038c90ae2e8c68ca6516ba725 100644
--- a/tensorflow/python/keras/_impl/keras/layers/normalization.py
+++ b/tensorflow/python/keras/_impl/keras/layers/normalization.py
@@ -18,14 +18,17 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.layers import normalization as tf_normalization_layers
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.layers.BatchNormalization')
 class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer):
   """Batch normalization layer (Ioffe and Szegedy, 2014).
 
@@ -108,7 +111,7 @@ class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer):
     if training is None:
       training = K.learning_phase()
     output = super(BatchNormalization, self).call(inputs, training=training)
-    if training is K.learning_phase():
+    if context.in_graph_mode() and training is K.learning_phase():
       output._uses_learning_phase = True  # pylint: disable=protected-access
     return output
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization_test.py b/tensorflow/python/keras/_impl/keras/layers/normalization_test.py
index 39a90e597089b30d110f26f074eba5d6895e52df..2b3628c3f1023612297465bdf3286246261992a2 100644
--- a/tensorflow/python/keras/_impl/keras/layers/normalization_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/normalization_test.py
@@ -132,13 +132,19 @@ class NormalizationLayersTest(test.TestCase):
       model.compile('sgd', 'mse')
       model.train_on_batch(x, x)
 
-      assert len(model.updates) == 2
+      self.assertEqual(len(bn.updates), 4)
+      self.assertEqual(len(model.updates), 2)
+      self.assertEqual(len(model.get_updates_for(x1)), 0)
+      self.assertEqual(len(model.get_updates_for(x2)), 2)
 
       # Test model-level reuse
       x3 = keras.layers.Input(shape=(10,))
       y3 = model(x3)
-      new_model = keras.models.Model(x3, y3)
-      assert len(model.updates) == 2
+      new_model = keras.models.Model(x3, y3, name='new_model')
+
+      self.assertEqual(len(new_model.updates), 2)
+      self.assertEqual(len(model.updates), 4)
+      self.assertEqual(len(new_model.get_updates_for(x3)), 2)
       new_model.compile('sgd', 'mse')
       new_model.train_on_batch(x, x)
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/pooling.py b/tensorflow/python/keras/_impl/keras/layers/pooling.py
index afe4ebfdc5305a91dc287203d56a9b389b468663..15d53379769d8142f5b2755a07479f60751346d2 100644
--- a/tensorflow/python/keras/_impl/keras/layers/pooling.py
+++ b/tensorflow/python/keras/_impl/keras/layers/pooling.py
@@ -24,8 +24,10 @@ from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.utils import conv_utils
 from tensorflow.python.layers import pooling as tf_pooling_layers
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.layers.MaxPool1D', 'keras.layers.MaxPooling1D')
 class MaxPooling1D(tf_pooling_layers.MaxPooling1D, Layer):
   """Max pooling operation for temporal data.
 
@@ -58,6 +60,7 @@ class MaxPooling1D(tf_pooling_layers.MaxPooling1D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.AveragePooling1D', 'keras.layers.AvgPool1D')
 class AveragePooling1D(tf_pooling_layers.AveragePooling1D, Layer):
   """Average pooling for temporal data.
 
@@ -91,6 +94,7 @@ class AveragePooling1D(tf_pooling_layers.AveragePooling1D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.MaxPool2D', 'keras.layers.MaxPooling2D')
 class MaxPooling2D(tf_pooling_layers.MaxPooling2D, Layer):
   """Max pooling operation for spatial data.
 
@@ -156,6 +160,7 @@ class MaxPooling2D(tf_pooling_layers.MaxPooling2D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.AveragePooling2D', 'keras.layers.AvgPool2D')
 class AveragePooling2D(tf_pooling_layers.AveragePooling2D, Layer):
   """Average pooling operation for spatial data.
 
@@ -221,6 +226,7 @@ class AveragePooling2D(tf_pooling_layers.AveragePooling2D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.MaxPool3D', 'keras.layers.MaxPooling3D')
 class MaxPooling3D(tf_pooling_layers.MaxPooling3D, Layer):
   """Max pooling operation for 3D data (spatial or spatio-temporal).
 
@@ -282,6 +288,7 @@ class MaxPooling3D(tf_pooling_layers.MaxPooling3D, Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.AveragePooling3D', 'keras.layers.AvgPool3D')
 class AveragePooling3D(tf_pooling_layers.AveragePooling3D, Layer):
   """Average pooling operation for 3D data (spatial or spatio-temporal).
 
@@ -351,7 +358,7 @@ class _GlobalPooling1D(Layer):
     super(_GlobalPooling1D, self).__init__(**kwargs)
     self.input_spec = InputSpec(ndim=3)
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     return tensor_shape.TensorShape([input_shape[0], input_shape[2]])
 
@@ -359,6 +366,8 @@ class _GlobalPooling1D(Layer):
     raise NotImplementedError
 
 
+@tf_export('keras.layers.GlobalAveragePooling1D',
+           'keras.layers.GlobalAvgPool1D')
 class GlobalAveragePooling1D(_GlobalPooling1D):
   """Global average pooling operation for temporal data.
 
@@ -374,6 +383,7 @@ class GlobalAveragePooling1D(_GlobalPooling1D):
     return K.mean(inputs, axis=1)
 
 
+@tf_export('keras.layers.GlobalMaxPool1D', 'keras.layers.GlobalMaxPooling1D')
 class GlobalMaxPooling1D(_GlobalPooling1D):
   """Global max pooling operation for temporal data.
 
@@ -398,7 +408,7 @@ class _GlobalPooling2D(Layer):
     self.data_format = conv_utils.normalize_data_format(data_format)
     self.input_spec = InputSpec(ndim=4)
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_last':
       return tensor_shape.TensorShape([input_shape[0], input_shape[3]])
@@ -414,6 +424,8 @@ class _GlobalPooling2D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.GlobalAveragePooling2D',
+           'keras.layers.GlobalAvgPool2D')
 class GlobalAveragePooling2D(_GlobalPooling2D):
   """Global average pooling operation for spatial data.
 
@@ -449,6 +461,7 @@ class GlobalAveragePooling2D(_GlobalPooling2D):
       return K.mean(inputs, axis=[2, 3])
 
 
+@tf_export('keras.layers.GlobalMaxPool2D', 'keras.layers.GlobalMaxPooling2D')
 class GlobalMaxPooling2D(_GlobalPooling2D):
   """Global max pooling operation for spatial data.
 
@@ -493,7 +506,7 @@ class _GlobalPooling3D(Layer):
     self.data_format = conv_utils.normalize_data_format(data_format)
     self.input_spec = InputSpec(ndim=5)
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_last':
       return tensor_shape.TensorShape([input_shape[0], input_shape[4]])
@@ -509,6 +522,8 @@ class _GlobalPooling3D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.layers.GlobalAveragePooling3D',
+           'keras.layers.GlobalAvgPool3D')
 class GlobalAveragePooling3D(_GlobalPooling3D):
   """Global Average pooling operation for 3D data.
 
@@ -544,6 +559,7 @@ class GlobalAveragePooling3D(_GlobalPooling3D):
       return K.mean(inputs, axis=[2, 3, 4])
 
 
+@tf_export('keras.layers.GlobalMaxPool3D', 'keras.layers.GlobalMaxPooling3D')
 class GlobalMaxPooling3D(_GlobalPooling3D):
   """Global Max pooling operation for 3D data.
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent.py b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
index 8df1840b4cbfddd3d31708da5eb3a57333d621ef..45f6711c77224875328ba346e6297fad3a681cb6 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,12 +13,13 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=protected-access
-"""Recurrent layers.
+"""Recurrent layers and their base classes.
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numbers
 import numpy as np
 
 from tensorflow.python.framework import tensor_shape
@@ -29,10 +30,13 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.layers.StackedRNNCells')
 class StackedRNNCells(Layer):
   """Wrapper allowing a stack of RNN cells to behave as a single cell.
 
@@ -109,6 +113,7 @@ class StackedRNNCells(Layer):
       states += cell_states
     return inputs, states
 
+  @shape_type_conversion
   def build(self, input_shape):
     for cell in self.cells:
       if isinstance(cell, Layer):
@@ -117,7 +122,7 @@ class StackedRNNCells(Layer):
         output_dim = cell.state_size[0]
       else:
         output_dim = cell.state_size
-      input_shape = (input_shape[0], input_shape[1], output_dim)
+      input_shape = (input_shape[0], output_dim)
     self.built = True
 
   def get_config(self):
@@ -198,19 +203,19 @@ class StackedRNNCells(Layer):
     losses = []
     for cell in self.cells:
       if isinstance(cell, Layer):
-        cell_losses = cell.losses
-        losses += cell_losses
-    return losses
+        losses += cell.losses
+    return losses + self._losses
 
-  def get_losses_for(self, inputs=None):
-    losses = []
+  @property
+  def updates(self):
+    updates = []
     for cell in self.cells:
       if isinstance(cell, Layer):
-        cell_losses = cell.get_losses_for(inputs)
-        losses += cell_losses
-    return losses
+        updates += cell.updates
+    return updates + self._updates
 
 
+@tf_export('keras.layers.RNN')
 class RNN(Layer):
   """Base class for recurrent layers.
 
@@ -262,8 +267,7 @@ class RNN(Layer):
           (e.g. via the `input_shape` argument)
 
   Input shape:
-      3D tensor with shape `(batch_size, timesteps, input_dim)`,
-      (Optional) 2D tensors with shape `(batch_size, output_dim)`.
+      3D tensor with shape `(batch_size, timesteps, input_dim)`.
 
   Output shape:
       - if `return_state`: a list of tensors. The first tensor is
@@ -370,7 +374,6 @@ class RNN(Layer):
                go_backwards=False,
                stateful=False,
                unroll=False,
-               activity_regularizer=None,
                **kwargs):
     if isinstance(cell, (list, tuple)):
       cell = StackedRNNCells(cell)
@@ -382,8 +385,7 @@ class RNN(Layer):
                        'an attribute `state_size` '
                        '(tuple of integers, '
                        'one integer per RNN state).')
-    super(RNN, self).__init__(
-        activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
+    super(RNN, self).__init__(**kwargs)
     self.cell = cell
     self.return_sequences = return_sequences
     self.return_state = return_state
@@ -401,7 +403,7 @@ class RNN(Layer):
   @property
   def states(self):
     if self._states is None:
-      if isinstance(self.cell.state_size, int):
+      if isinstance(self.cell.state_size, numbers.Integral):
         num_states = 1
       else:
         num_states = len(self.cell.state_size)
@@ -412,15 +414,16 @@ class RNN(Layer):
   def states(self, states):
     self._states = states
 
-  def _compute_output_shape(self, input_shape):
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
 
     if hasattr(self.cell.state_size, '__len__'):
-      output_dim = self.cell.state_size[0]
+      state_size = self.cell.state_size
     else:
-      output_dim = self.cell.state_size
+      state_size = [self.cell.state_size]
+    output_dim = state_size[0]
 
     if self.return_sequences:
       output_shape = (input_shape[0], input_shape[1], output_dim)
@@ -428,11 +431,10 @@ class RNN(Layer):
       output_shape = (input_shape[0], output_dim)
 
     if self.return_state:
-      state_shape = [(input_shape[0], output_dim) for _ in self.states]
-      output_shape = [output_shape] + state_shape
+      state_shape = [(input_shape[0], dim) for dim in state_size]
+      return [output_shape] + state_shape
     else:
-      output_shape = output_shape
-    return tensor_shape.TensorShape(output_shape)
+      return output_shape
 
   def compute_mask(self, inputs, mask):
     if isinstance(mask, list):
@@ -444,6 +446,7 @@ class RNN(Layer):
     else:
       return output_mask
 
+  @shape_type_conversion
   def build(self, input_shape):
     # Note input_shape will be list of shapes of initial states and
     # constants if these are passed in __call__.
@@ -454,7 +457,6 @@ class RNN(Layer):
 
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
-    input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
 
     batch_size = input_shape[0] if self.stateful else None
     input_dim = input_shape[-1]
@@ -478,9 +480,9 @@ class RNN(Layer):
       # initial_state was passed in call, check compatibility
       if [spec.shape[-1] for spec in self.state_spec] != state_size:
         raise ValueError(
-            'An initial_state was passed that is not compatible with '
+            'An `initial_state` was passed that is not compatible with '
             '`cell.state_size`. Received `state_spec`={}; '
-            'However `cell.state_size` is '
+            'however `cell.state_size` is '
             '{}'.format(self.state_spec, self.cell.state_size))
     else:
       self.state_spec = [InputSpec(shape=(None, dim)) for dim in state_size]
@@ -610,11 +612,12 @@ class RNN(Layer):
         constants=constants,
         go_backwards=self.go_backwards,
         mask=mask,
-        unroll=self.unroll)
+        unroll=self.unroll,
+        input_length=timesteps)
     if self.stateful:
       updates = []
       for i in range(len(states)):
-        updates.append((self.states[i], states[i]))
+        updates.append(K.update(self.states[i], states[i]))
       self.add_update(updates, inputs)
 
     if self.return_sequences:
@@ -625,6 +628,8 @@ class RNN(Layer):
     # Properly set learning phase
     if getattr(last_output, '_uses_learning_phase', False):
       output._uses_learning_phase = True
+      for state in states:
+        state._uses_learning_phase = True
 
     if self.return_state:
       if not isinstance(states, (list, tuple)):
@@ -636,7 +641,7 @@ class RNN(Layer):
       return output
 
   def _standardize_args(self, inputs, initial_state, constants):
-    """Standardize `__call__` arguments to a single list of tensor inputs.
+    """Standardize `__call__` to a single list of tensor inputs.
 
     When running a model loaded from file, the input tensors
     `initial_state` and `constants` can be passed to `RNN.__call__` as part
@@ -688,7 +693,7 @@ class RNN(Layer):
                        'a `batch_input_shape` '
                        'argument to your first layer.\n'
                        '- If using the functional API, specify '
-                       'the time dimension by passing a '
+                       'the batch size by passing a '
                        '`batch_shape` argument to your Input layer.')
     # initialize state if None
     if self.states[0] is None:
@@ -772,52 +777,45 @@ class RNN(Layer):
 
   @property
   def losses(self):
+    losses = []
     if isinstance(self.cell, Layer):
-      return self.cell.losses
-    return []
+      losses += self.cell.losses
+    return losses + self._losses
 
-  def get_losses_for(self, inputs=None):
+  @property
+  def updates(self):
+    updates = []
     if isinstance(self.cell, Layer):
-      cell_losses = self.cell.get_losses_for(inputs)
-      return cell_losses + super(RNN, self).get_losses_for(inputs)
-    return super(RNN, self).get_losses_for(inputs)
+      updates += self.cell.updates
+    return updates + self._updates
 
 
+@tf_export('keras.layers.SimpleRNNCell')
 class SimpleRNNCell(Layer):
   """Cell class for SimpleRNN.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use
-          (see [activations](../activations.md)).
+      activation: Activation function to use.
           If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
           used for the linear transformation of the inputs.
-          (see [initializers](../initializers.md)).
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
           used for the linear transformation of the recurrent state.
-          (see [initializers](../initializers.md)).
-      bias_initializer: Initializer for the bias vector
-          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector.
       kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
+          the `kernel` weights matrix.
       recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
-      bias_regularizer: Regularizer function applied to the bias vector
-          (see [regularizer](../regularizers.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
       kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix
-          (see [constraints](../constraints.md)).
+          the `kernel` weights matrix.
       recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix
-          (see [constraints](../constraints.md)).
-      bias_constraint: Constraint function applied to the bias vector
-          (see [constraints](../constraints.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
@@ -865,6 +863,7 @@ class SimpleRNNCell(Layer):
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
 
+  @shape_type_conversion
   def build(self, input_shape):
     self.kernel = self.add_weight(
         shape=(input_shape[-1], self.units),
@@ -889,33 +888,21 @@ class SimpleRNNCell(Layer):
       self.bias = None
     self.built = True
 
-  def _generate_dropout_mask(self, inputs, training=None):
-    if 0 < self.dropout < 1:
-      ones = K.ones_like(K.squeeze(inputs[:, 0:1, :], axis=1))
-
-      def dropped_inputs():
-        return K.dropout(ones, self.dropout)
-
-      self._dropout_mask = K.in_train_phase(
-          dropped_inputs, ones, training=training)
-    else:
-      self._dropout_mask = None
-
-  def _generate_recurrent_dropout_mask(self, inputs, training=None):
-    if 0 < self.recurrent_dropout < 1:
-      ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
-      ones = K.tile(ones, (1, self.units))
-
-      def dropped_inputs():
-        return K.dropout(ones, self.dropout)
-
-      self._recurrent_dropout_mask = K.in_train_phase(
-          dropped_inputs, ones, training=training)
-    else:
-      self._recurrent_dropout_mask = None
-
   def call(self, inputs, states, training=None):
     prev_output = states[0]
+    if 0 < self.dropout < 1 and self._dropout_mask is None:
+      self._dropout_mask = _generate_dropout_mask(
+          _generate_dropout_ones(inputs,
+                                 K.shape(inputs)[-1]),
+          self.dropout,
+          training=training)
+    if (0 < self.recurrent_dropout < 1 and
+        self._recurrent_dropout_mask is None):
+      self._recurrent_dropout_mask = _generate_dropout_mask(
+          _generate_dropout_ones(inputs, self.units),
+          self.recurrent_dropout,
+          training=training)
+
     dp_mask = self._dropout_mask
     rec_dp_mask = self._recurrent_dropout_mask
 
@@ -938,45 +925,69 @@ class SimpleRNNCell(Layer):
         output._uses_learning_phase = True
     return output, [output]
 
+  def get_config(self):
+    config = {
+        'units':
+            self.units,
+        'activation':
+            activations.serialize(self.activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'recurrent_initializer':
+            initializers.serialize(self.recurrent_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'recurrent_regularizer':
+            regularizers.serialize(self.recurrent_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'recurrent_constraint':
+            constraints.serialize(self.recurrent_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint),
+        'dropout':
+            self.dropout,
+        'recurrent_dropout':
+            self.recurrent_dropout
+    }
+    base_config = super(SimpleRNNCell, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
+@tf_export('keras.layers.SimpleRNN')
 class SimpleRNN(RNN):
   """Fully-connected RNN where the output is to be fed back to input.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use
-          (see [activations](../activations.md)).
+      activation: Activation function to use.
           If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
           used for the linear transformation of the inputs.
-          (see [initializers](../initializers.md)).
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
           used for the linear transformation of the recurrent state.
-          (see [initializers](../initializers.md)).
-      bias_initializer: Initializer for the bias vector
-          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector.
       kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
+          the `kernel` weights matrix.
       recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
-      bias_regularizer: Regularizer function applied to the bias vector
-          (see [regularizer](../regularizers.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
       activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation").
-          (see [regularizer](../regularizers.md)).
+          the output of the layer (its "activation")..
       kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix
-          (see [constraints](../constraints.md)).
+          the `kernel` weights matrix.
       recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix
-          (see [constraints](../constraints.md)).
-      bias_constraint: Constraint function applied to the bias vector
-          (see [constraints](../constraints.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
@@ -1050,12 +1061,12 @@ class SimpleRNN(RNN):
         go_backwards=go_backwards,
         stateful=stateful,
         unroll=unroll,
-        activity_regularizer=regularizers.get(activity_regularizer),
         **kwargs)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
 
   def call(self, inputs, mask=None, training=None, initial_state=None):
-    self.cell._generate_dropout_mask(inputs, training=training)
-    self.cell._generate_recurrent_dropout_mask(inputs, training=training)
+    self.cell._dropout_mask = None
+    self.cell._recurrent_dropout_mask = None
     return super(SimpleRNN, self).call(
         inputs, mask=mask, training=training, initial_state=initial_state)
 
@@ -1117,25 +1128,36 @@ class SimpleRNN(RNN):
 
   def get_config(self):
     config = {
-        'units': self.units,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'units':
+            self.units,
+        'activation':
+            activations.serialize(self.activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
         'recurrent_initializer':
             initializers.serialize(self.recurrent_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
         'recurrent_regularizer':
             regularizers.serialize(self.recurrent_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
         'recurrent_constraint':
             constraints.serialize(self.recurrent_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint),
-        'dropout': self.dropout,
-        'recurrent_dropout': self.recurrent_dropout
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint),
+        'dropout':
+            self.dropout,
+        'recurrent_dropout':
+            self.recurrent_dropout
     }
     base_config = super(SimpleRNN, self).get_config()
     del base_config['cell']
@@ -1148,44 +1170,34 @@ class SimpleRNN(RNN):
     return cls(**config)
 
 
+@tf_export('keras.layers.GRUCell')
 class GRUCell(Layer):
   """Cell class for the GRU layer.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use
-          (see [activations](../activations.md)).
+      activation: Activation function to use.
           If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       recurrent_activation: Activation function to use
-          for the recurrent step
-          (see [activations](../activations.md)).
+          for the recurrent step.
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
           used for the linear transformation of the inputs.
-          (see [initializers](../initializers.md)).
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
           used for the linear transformation of the recurrent state.
-          (see [initializers](../initializers.md)).
-      bias_initializer: Initializer for the bias vector
-          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector.
       kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
+          the `kernel` weights matrix.
       recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
-      bias_regularizer: Regularizer function applied to the bias vector
-          (see [regularizer](../regularizers.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
       kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix
-          (see [constraints](../constraints.md)).
+          the `kernel` weights matrix.
       recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix
-          (see [constraints](../constraints.md)).
-      bias_constraint: Constraint function applied to the bias vector
-          (see [constraints](../constraints.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
@@ -1243,6 +1255,7 @@ class GRUCell(Layer):
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
 
+  @shape_type_conversion
   def build(self, input_shape):
     input_dim = input_shape[-1]
     self.kernel = self.add_weight(
@@ -1286,38 +1299,24 @@ class GRUCell(Layer):
       self.bias_h = None
     self.built = True
 
-  def _generate_dropout_mask(self, inputs, training=None):
-    if 0 < self.dropout < 1:
-      ones = K.ones_like(K.squeeze(inputs[:, 0:1, :], axis=1))
-
-      def dropped_inputs():
-        return K.dropout(ones, self.dropout)
-
-      self._dropout_mask = [
-          K.in_train_phase(dropped_inputs, ones, training=training)
-          for _ in range(3)
-      ]
-    else:
-      self._dropout_mask = None
-
-  def _generate_recurrent_dropout_mask(self, inputs, training=None):
-    if 0 < self.recurrent_dropout < 1:
-      ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
-      ones = K.tile(ones, (1, self.units))
-
-      def dropped_inputs():
-        return K.dropout(ones, self.dropout)
-
-      self._recurrent_dropout_mask = [
-          K.in_train_phase(dropped_inputs, ones, training=training)
-          for _ in range(3)
-      ]
-    else:
-      self._recurrent_dropout_mask = None
-
   def call(self, inputs, states, training=None):
     h_tm1 = states[0]  # previous memory
 
+    if 0 < self.dropout < 1 and self._dropout_mask is None:
+      self._dropout_mask = _generate_dropout_mask(
+          _generate_dropout_ones(inputs,
+                                 K.shape(inputs)[-1]),
+          self.dropout,
+          training=training,
+          count=3)
+    if (0 < self.recurrent_dropout < 1 and
+        self._recurrent_dropout_mask is None):
+      self._recurrent_dropout_mask = _generate_dropout_mask(
+          _generate_dropout_ones(inputs, self.units),
+          self.recurrent_dropout,
+          training=training,
+          count=3)
+
     # dropout matrices for input units
     dp_mask = self._dropout_mask
     # dropout matrices for recurrent units
@@ -1381,51 +1380,77 @@ class GRUCell(Layer):
         h._uses_learning_phase = True
     return h, [h]
 
+  def get_config(self):
+    config = {
+        'units':
+            self.units,
+        'activation':
+            activations.serialize(self.activation),
+        'recurrent_activation':
+            activations.serialize(self.recurrent_activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'recurrent_initializer':
+            initializers.serialize(self.recurrent_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'recurrent_regularizer':
+            regularizers.serialize(self.recurrent_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'recurrent_constraint':
+            constraints.serialize(self.recurrent_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint),
+        'dropout':
+            self.dropout,
+        'recurrent_dropout':
+            self.recurrent_dropout,
+        'implementation':
+            self.implementation
+    }
+    base_config = super(GRUCell, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
+@tf_export('keras.layers.GRU')
 class GRU(RNN):
-  # pylint: disable=line-too-long
   """Gated Recurrent Unit - Cho et al.
 
   2014.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use
-          (see [activations](../activations.md)).
+      activation: Activation function to use.
           If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       recurrent_activation: Activation function to use
-          for the recurrent step
-          (see [activations](../activations.md)).
+          for the recurrent step.
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
           used for the linear transformation of the inputs.
-          (see [initializers](../initializers.md)).
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
           used for the linear transformation of the recurrent state.
-          (see [initializers](../initializers.md)).
-      bias_initializer: Initializer for the bias vector
-          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector.
       kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
+          the `kernel` weights matrix.
       recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
-      bias_regularizer: Regularizer function applied to the bias vector
-          (see [regularizer](../regularizers.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
       activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation").
-          (see [regularizer](../regularizers.md)).
+          the output of the layer (its "activation")..
       kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix
-          (see [constraints](../constraints.md)).
+          the `kernel` weights matrix.
       recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix
-          (see [constraints](../constraints.md)).
-      bias_constraint: Constraint function applied to the bias vector
-          (see [constraints](../constraints.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
@@ -1455,12 +1480,7 @@ class GRU(RNN):
           although it tends to be more memory-intensive.
           Unrolling is only suitable for short sequences.
 
-  References:
-      - [On the Properties of Neural Machine Translation: Encoder-Decoder Approaches](https://arxiv.org/abs/1409.1259)
-      - [Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling](http://arxiv.org/abs/1412.3555v1)
-      - [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](http://arxiv.org/abs/1512.05287)
   """
-  # pylint: enable=line-too-long
 
   def __init__(self,
                units,
@@ -1518,8 +1538,8 @@ class GRU(RNN):
     self.activity_regularizer = regularizers.get(activity_regularizer)
 
   def call(self, inputs, mask=None, training=None, initial_state=None):
-    self.cell._generate_dropout_mask(inputs, training=training)
-    self.cell._generate_recurrent_dropout_mask(inputs, training=training)
+    self.cell._dropout_mask = None
+    self.cell._recurrent_dropout_mask = None
     return super(GRU, self).call(
         inputs, mask=mask, training=training, initial_state=initial_state)
 
@@ -1589,28 +1609,40 @@ class GRU(RNN):
 
   def get_config(self):
     config = {
-        'units': self.units,
-        'activation': activations.serialize(self.activation),
+        'units':
+            self.units,
+        'activation':
+            activations.serialize(self.activation),
         'recurrent_activation':
             activations.serialize(self.recurrent_activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
         'recurrent_initializer':
             initializers.serialize(self.recurrent_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
         'recurrent_regularizer':
             regularizers.serialize(self.recurrent_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
         'recurrent_constraint':
             constraints.serialize(self.recurrent_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint),
-        'dropout': self.dropout,
-        'recurrent_dropout': self.recurrent_dropout,
-        'implementation': self.implementation
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint),
+        'dropout':
+            self.dropout,
+        'recurrent_dropout':
+            self.recurrent_dropout,
+        'implementation':
+            self.implementation
     }
     base_config = super(GRU, self).get_config()
     del base_config['cell']
@@ -1623,49 +1655,39 @@ class GRU(RNN):
     return cls(**config)
 
 
+@tf_export('keras.layers.LSTMCell')
 class LSTMCell(Layer):
   """Cell class for the LSTM layer.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use
-          (see [activations](../activations.md)).
+      activation: Activation function to use.
           If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       recurrent_activation: Activation function to use
-          for the recurrent step
-          (see [activations](../activations.md)).
+          for the recurrent step.
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
           used for the linear transformation of the inputs.
-          (see [initializers](../initializers.md)).
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
           used for the linear transformation of the recurrent state.
-          (see [initializers](../initializers.md)).
-      bias_initializer: Initializer for the bias vector
-          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector.
       unit_forget_bias: Boolean.
           If True, add 1 to the bias of the forget gate at initialization.
           Setting it to true will also force `bias_initializer="zeros"`.
           This is recommended in [Jozefowicz et
             al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
       kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
+          the `kernel` weights matrix.
       recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
-      bias_regularizer: Regularizer function applied to the bias vector
-          (see [regularizer](../regularizers.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
       kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix
-          (see [constraints](../constraints.md)).
+          the `kernel` weights matrix.
       recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix
-          (see [constraints](../constraints.md)).
-      bias_constraint: Constraint function applied to the bias vector
-          (see [constraints](../constraints.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
@@ -1725,6 +1747,7 @@ class LSTMCell(Layer):
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
 
+  @shape_type_conversion
   def build(self, input_shape):
     input_dim = input_shape[-1]
     self.kernel = self.add_weight(
@@ -1784,36 +1807,22 @@ class LSTMCell(Layer):
       self.bias_o = None
     self.built = True
 
-  def _generate_dropout_mask(self, inputs, training=None):
-    if 0 < self.dropout < 1:
-      ones = K.ones_like(K.squeeze(inputs[:, 0:1, :], axis=1))
-
-      def dropped_inputs():
-        return K.dropout(ones, self.dropout)
-
-      self._dropout_mask = [
-          K.in_train_phase(dropped_inputs, ones, training=training)
-          for _ in range(4)
-      ]
-    else:
-      self._dropout_mask = None
-
-  def _generate_recurrent_dropout_mask(self, inputs, training=None):
-    if 0 < self.recurrent_dropout < 1:
-      ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
-      ones = K.tile(ones, (1, self.units))
-
-      def dropped_inputs():
-        return K.dropout(ones, self.dropout)
-
-      self._recurrent_dropout_mask = [
-          K.in_train_phase(dropped_inputs, ones, training=training)
-          for _ in range(4)
-      ]
-    else:
-      self._recurrent_dropout_mask = None
-
   def call(self, inputs, states, training=None):
+    if 0 < self.dropout < 1 and self._dropout_mask is None:
+      self._dropout_mask = _generate_dropout_mask(
+          _generate_dropout_ones(inputs,
+                                 K.shape(inputs)[-1]),
+          self.dropout,
+          training=training,
+          count=4)
+    if (0 < self.recurrent_dropout < 1 and
+        self._recurrent_dropout_mask is None):
+      self._recurrent_dropout_mask = _generate_dropout_mask(
+          _generate_dropout_ones(inputs, self.units),
+          self.recurrent_dropout,
+          training=training,
+          count=4)
+
     # dropout matrices for input units
     dp_mask = self._dropout_mask
     # dropout matrices for recurrent units
@@ -1887,54 +1896,82 @@ class LSTMCell(Layer):
         h._uses_learning_phase = True
     return h, [h, c]
 
+  def get_config(self):
+    config = {
+        'units':
+            self.units,
+        'activation':
+            activations.serialize(self.activation),
+        'recurrent_activation':
+            activations.serialize(self.recurrent_activation),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
+        'recurrent_initializer':
+            initializers.serialize(self.recurrent_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'unit_forget_bias':
+            self.unit_forget_bias,
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
+        'recurrent_regularizer':
+            regularizers.serialize(self.recurrent_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
+        'recurrent_constraint':
+            constraints.serialize(self.recurrent_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint),
+        'dropout':
+            self.dropout,
+        'recurrent_dropout':
+            self.recurrent_dropout,
+        'implementation':
+            self.implementation
+    }
+    base_config = super(LSTMCell, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
+@tf_export('keras.layers.LSTM')
 class LSTM(RNN):
-  # pylint: disable=line-too-long
   """Long-Short Term Memory layer - Hochreiter 1997.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use
-          (see [activations](../activations.md)).
+      activation: Activation function to use.
           If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       recurrent_activation: Activation function to use
-          for the recurrent step
-          (see [activations](../activations.md)).
+          for the recurrent step.
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
-          used for the linear transformation of the inputs.
-          (see [initializers](../initializers.md)).
+          used for the linear transformation of the inputs..
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
-          used for the linear transformation of the recurrent state.
-          (see [initializers](../initializers.md)).
-      bias_initializer: Initializer for the bias vector
-          (see [initializers](../initializers.md)).
+          used for the linear transformation of the recurrent state..
+      bias_initializer: Initializer for the bias vector.
       unit_forget_bias: Boolean.
           If True, add 1 to the bias of the forget gate at initialization.
           Setting it to true will also force `bias_initializer="zeros"`.
           This is recommended in [Jozefowicz et
             al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
       kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
+          the `kernel` weights matrix.
       recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix
-          (see [regularizer](../regularizers.md)).
-      bias_regularizer: Regularizer function applied to the bias vector
-          (see [regularizer](../regularizers.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
       activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation").
-          (see [regularizer](../regularizers.md)).
+          the output of the layer (its "activation")..
       kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix
-          (see [constraints](../constraints.md)).
+          the `kernel` weights matrix.
       recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix
-          (see [constraints](../constraints.md)).
-      bias_constraint: Constraint function applied to the bias vector
-          (see [constraints](../constraints.md)).
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
@@ -1964,13 +2001,7 @@ class LSTM(RNN):
           although it tends to be more memory-intensive.
           Unrolling is only suitable for short sequences.
 
-  References:
-      - [Long short-term memory](http://www.bioinf.jku.at/publications/older/2604.pdf)
-      - [Learning to forget: Continual prediction with LSTM](http://www.mitpressjournals.org/doi/pdf/10.1162/089976600300015015)
-      - [Supervised sequence labeling with recurrent neural networks](http://www.cs.toronto.edu/~graves/preprint.pdf)
-      - [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](http://arxiv.org/abs/1512.05287)
   """
-  # pylint: enable=line-too-long
 
   def __init__(self,
                units,
@@ -2030,8 +2061,8 @@ class LSTM(RNN):
     self.activity_regularizer = regularizers.get(activity_regularizer)
 
   def call(self, inputs, mask=None, training=None, initial_state=None):
-    self.cell._generate_dropout_mask(inputs, training=training)
-    self.cell._generate_recurrent_dropout_mask(inputs, training=training)
+    self.cell._dropout_mask = None
+    self.cell._recurrent_dropout_mask = None
     return super(LSTM, self).call(
         inputs, mask=mask, training=training, initial_state=initial_state)
 
@@ -2105,29 +2136,42 @@ class LSTM(RNN):
 
   def get_config(self):
     config = {
-        'units': self.units,
-        'activation': activations.serialize(self.activation),
+        'units':
+            self.units,
+        'activation':
+            activations.serialize(self.activation),
         'recurrent_activation':
             activations.serialize(self.recurrent_activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'use_bias':
+            self.use_bias,
+        'kernel_initializer':
+            initializers.serialize(self.kernel_initializer),
         'recurrent_initializer':
             initializers.serialize(self.recurrent_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'unit_forget_bias': self.unit_forget_bias,
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'unit_forget_bias':
+            self.unit_forget_bias,
+        'kernel_regularizer':
+            regularizers.serialize(self.kernel_regularizer),
         'recurrent_regularizer':
             regularizers.serialize(self.recurrent_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'kernel_constraint':
+            constraints.serialize(self.kernel_constraint),
         'recurrent_constraint':
             constraints.serialize(self.recurrent_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint),
-        'dropout': self.dropout,
-        'recurrent_dropout': self.recurrent_dropout,
-        'implementation': self.implementation
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint),
+        'dropout':
+            self.dropout,
+        'recurrent_dropout':
+            self.recurrent_dropout,
+        'implementation':
+            self.implementation
     }
     base_config = super(LSTM, self).get_config()
     del base_config['cell']
@@ -2140,6 +2184,23 @@ class LSTM(RNN):
     return cls(**config)
 
 
+def _generate_dropout_ones(inputs, dims):
+  return K.ones((K.shape(inputs)[0], dims))
+
+
+def _generate_dropout_mask(ones, rate, training=None, count=1):
+
+  def dropped_inputs():
+    return K.dropout(ones, rate)
+
+  if count > 1:
+    return [
+        K.in_train_phase(dropped_inputs, ones, training=training)
+        for _ in range(count)
+    ]
+  return K.in_train_phase(dropped_inputs, ones, training=training)
+
+
 class Recurrent(Layer):
   """Deprecated abstract base class for recurrent layers.
 
@@ -2266,7 +2327,8 @@ class Recurrent(Layer):
     self.dropout = 0
     self.recurrent_dropout = 0
 
-  def _compute_output_shape(self, input_shape):
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
@@ -2403,7 +2465,7 @@ class Recurrent(Layer):
     if self.stateful:
       updates = []
       for i in range(len(states)):
-        updates.append((self.states[i], states[i]))
+        updates.append(K.update(self.states[i], states[i]))
       self.add_update(updates, inputs)
 
     # Properly set learning phase
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
index 7dc4c1db9b4b71775bd3c52a863752b34d9dc3ea..ab48a63e3544534567ee3205bb74174cda6e1769 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
@@ -353,13 +353,10 @@ class RNNTest(test.TestCase):
       self.assertAllClose(y_np, y_np_3, atol=1e-4)
 
   def test_stacked_rnn_attributes(self):
-    cells = [keras.layers.LSTMCell(3),
-             keras.layers.LSTMCell(3, kernel_regularizer='l2')]
+    cells = [keras.layers.LSTMCell(1),
+             keras.layers.LSTMCell(1)]
     layer = keras.layers.RNN(cells)
-    layer.build((None, None, 5))
-
-    # Test regularization losses
-    self.assertEqual(len(layer.losses), 1)
+    layer.build((None, None, 1))
 
     # Test weights
     self.assertEqual(len(layer.trainable_weights), 6)
@@ -367,11 +364,32 @@ class RNNTest(test.TestCase):
     self.assertEqual(len(layer.trainable_weights), 3)
     self.assertEqual(len(layer.non_trainable_weights), 3)
 
-    # Test `get_losses_for`
-    x = keras.Input((None, 5))
-    y = keras.backend.sum(x)
-    cells[0].add_loss(y, inputs=x)
-    self.assertEqual(layer.get_losses_for(x), [y])
+    # Test `get_losses_for` and `losses`
+    x = keras.Input((None, 1))
+    loss_1 = keras.backend.sum(x)
+    loss_2 = keras.backend.sum(cells[0].kernel)
+    cells[0].add_loss(loss_1, inputs=x)
+    cells[0].add_loss(loss_2)
+    self.assertEqual(len(layer.losses), 2)
+    self.assertEqual(layer.get_losses_for(None), [loss_2])
+    self.assertEqual(layer.get_losses_for(x), [loss_1])
+
+    # Test `get_updates_for` and `updates`
+    cells = [keras.layers.LSTMCell(1),
+             keras.layers.LSTMCell(1)]
+    layer = keras.layers.RNN(cells)
+    layer.build((None, None, 1))
+
+    x = keras.Input((None, 1))
+    update_1 = keras.backend.update_add(
+        cells[0].kernel, x[0, 0, 0] * cells[0].kernel)
+    update_2 = keras.backend.update_add(
+        cells[0].kernel, keras.backend.ones_like(cells[0].kernel))
+    cells[0].add_update(update_1, inputs=x)
+    cells[0].add_update(update_2)
+    self.assertEqual(len(layer.updates), 2)
+    self.assertEqual(layer.get_updates_for(None), [update_2])
+    self.assertEqual(layer.get_updates_for(x), [update_1])
 
   def test_rnn_dynamic_trainability(self):
     layer_class = keras.layers.SimpleRNN
@@ -392,6 +410,105 @@ class RNNTest(test.TestCase):
     self.assertEqual(len(layer.trainable_weights), 3)
     self.assertEqual(len(layer.non_trainable_weights), 0)
 
+  def test_state_reuse_with_dropout(self):
+    layer_class = keras.layers.SimpleRNN
+    embedding_dim = 4
+    units = 3
+    timesteps = 2
+    num_samples = 2
+
+    with self.test_session():
+      input1 = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
+      layer = layer_class(units,
+                          return_state=True,
+                          return_sequences=True,
+                          dropout=0.2)
+      state = layer(input1)[1:]
+
+      input2 = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
+      output = layer_class(units)(input2, initial_state=state)
+      model = keras.Model([input1, input2], output)
+
+      inputs = [np.random.random((num_samples, timesteps, embedding_dim)),
+                np.random.random((num_samples, timesteps, embedding_dim))]
+      model.predict(inputs)
+
+  def test_builtin_rnn_cell_serialization(self):
+    for cell_class in [keras.layers.SimpleRNNCell,
+                       keras.layers.GRUCell,
+                       keras.layers.LSTMCell]:
+      with self.test_session():
+        # Test basic case.
+        x = keras.Input((None, 5))
+        cell = cell_class(32)
+        layer = keras.layers.RNN(cell)
+        y = layer(x)
+        model = keras.models.Model(x, y)
+        model.compile(optimizer='rmsprop', loss='mse')
+
+        # Test basic case serialization.
+        x_np = np.random.random((6, 5, 5))
+        y_np = model.predict(x_np)
+        weights = model.get_weights()
+        config = layer.get_config()
+        layer = keras.layers.RNN.from_config(config)
+        y = layer(x)
+        model = keras.models.Model(x, y)
+        model.set_weights(weights)
+        y_np_2 = model.predict(x_np)
+        self.assertAllClose(y_np, y_np_2, atol=1e-4)
+
+        # Test stacking.
+        cells = [cell_class(8),
+                 cell_class(12),
+                 cell_class(32)]
+        layer = keras.layers.RNN(cells)
+        y = layer(x)
+        model = keras.models.Model(x, y)
+        model.compile(optimizer='rmsprop', loss='mse')
+
+        # Test stacked RNN serialization.
+        x_np = np.random.random((6, 5, 5))
+        y_np = model.predict(x_np)
+        weights = model.get_weights()
+        config = layer.get_config()
+        layer = keras.layers.RNN.from_config(config)
+        y = layer(x)
+        model = keras.models.Model(x, y)
+        model.set_weights(weights)
+        y_np_2 = model.predict(x_np)
+        self.assertAllClose(y_np, y_np_2, atol=1e-4)
+
+  def test_stacked_rnn_dropout(self):
+    cells = [keras.layers.LSTMCell(3, dropout=0.1, recurrent_dropout=0.1),
+             keras.layers.LSTMCell(3, dropout=0.1, recurrent_dropout=0.1)]
+    layer = keras.layers.RNN(cells)
+
+    with self.test_session():
+      x = keras.Input((None, 5))
+      y = layer(x)
+      model = keras.models.Model(x, y)
+      model.compile('sgd', 'mse')
+      x_np = np.random.random((6, 5, 5))
+      y_np = np.random.random((6, 3))
+      model.train_on_batch(x_np, y_np)
+
+  def test_stacked_rnn_compute_output_shape(self):
+    cells = [keras.layers.LSTMCell(3),
+             keras.layers.LSTMCell(6)]
+    embedding_dim = 4
+    timesteps = 2
+    layer = keras.layers.RNN(cells, return_state=True, return_sequences=True)
+    output_shape = layer.compute_output_shape((None, timesteps, embedding_dim))
+    expected_output_shape = [(None, timesteps, 6),
+                             (None, 6),
+                             (None, 6),
+                             (None, 3),
+                             (None, 3)]
+    self.assertEqual(
+        [tuple(o.as_list()) for o in output_shape],
+        expected_output_shape)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/layers/wrappers.py b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
index aefa5a1c020b490991708056d609ae1efa8d4a9a..f053aa1d09570e76aa0b6b9733c0b0bb438e24a0 100644
--- a/tensorflow/python/keras/_impl/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
@@ -25,10 +25,13 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.engine.topology import shape_type_conversion
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
 from tensorflow.python.layers import utils as tf_layers_util
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.layers.Wrapper')
 class Wrapper(Layer):
   """Abstract wrapper base class.
 
@@ -68,34 +71,11 @@ class Wrapper(Layer):
 
   @property
   def updates(self):
-    if hasattr(self.layer, 'updates'):
-      return self.layer.updates
-    return []
-
-  def get_updates_for(self, inputs=None):
-    # If the wrapper modifies the inputs, use the modified inputs to
-    # get the updates from the inner layer.
-    inner_inputs = inputs
-    if inputs is not None:
-      uid = tf_layers_util.object_list_uid(inputs)
-      if uid in self._input_map:
-        inner_inputs = self._input_map[uid]
-
-    updates = self.layer.get_updates_for(inner_inputs)
-    updates += super(Wrapper, self).get_updates_for(inputs)
-    return updates
+    return self.layer.updates + self._updates
 
   @property
   def losses(self):
-    if hasattr(self.layer, 'losses'):
-      return self.layer.losses
-    return []
-
-  def get_losses_for(self, inputs=None):
-    if inputs is None:
-      losses = self.layer.get_losses_for(None)
-      return losses + super(Wrapper, self).get_losses_for(None)
-    return super(Wrapper, self).get_losses_for(inputs)
+    return self.layer.losses + self._losses
 
   def get_weights(self):
     return self.layer.get_weights()
@@ -121,6 +101,7 @@ class Wrapper(Layer):
     return cls(layer, **config)
 
 
+@tf_export('keras.layers.TimeDistributed')
 class TimeDistributed(Wrapper):
   """This wrapper allows to apply a layer to every temporal slice of an input.
 
@@ -181,11 +162,11 @@ class TimeDistributed(Wrapper):
     super(TimeDistributed, self).build()
     self.built = True
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     child_input_shape = tensor_shape.TensorShape([input_shape[0]] +
                                                  input_shape[2:])
-    child_output_shape = self.layer._compute_output_shape(  # pylint: disable=protected-access
+    child_output_shape = self.layer.compute_output_shape(
         child_input_shape).as_list()
     timesteps = input_shape[1]
     return tensor_shape.TensorShape([child_output_shape[0], timesteps] +
@@ -231,7 +212,7 @@ class TimeDistributed(Wrapper):
       if hasattr(y, '_uses_learning_phase'):
         uses_learning_phase = y._uses_learning_phase
       # Shape: (num_samples, timesteps, ...)
-      output_shape = self._compute_output_shape(input_shape).as_list()
+      output_shape = self.compute_output_shape(input_shape).as_list()
       y = K.reshape(y, (-1, input_length) + tuple(output_shape[2:]))
 
     # Apply activity regularizer if any:
@@ -245,6 +226,7 @@ class TimeDistributed(Wrapper):
     return y
 
 
+@tf_export('keras.layers.Bidirectional')
 class Bidirectional(Wrapper):
   """Bidirectional wrapper for RNNs.
 
@@ -291,6 +273,7 @@ class Bidirectional(Wrapper):
       self.backward_layer.initial_weights = weights[nw // 2:]
     self.stateful = layer.stateful
     self.return_sequences = layer.return_sequences
+    self.return_state = layer.return_state
     self.supports_masking = True
 
   def get_weights(self):
@@ -301,27 +284,54 @@ class Bidirectional(Wrapper):
     self.forward_layer.set_weights(weights[:nw // 2])
     self.backward_layer.set_weights(weights[nw // 2:])
 
-  def _compute_output_shape(self, input_shape):
-    input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
-    if self.merge_mode in ['sum', 'ave', 'mul']:
-      return self.forward_layer._compute_output_shape(input_shape)  # pylint: disable=protected-access
-    elif self.merge_mode == 'concat':
-      shape = self.forward_layer._compute_output_shape(input_shape).as_list()  # pylint: disable=protected-access
-      shape[-1] *= 2
-      return tensor_shape.TensorShape(shape)
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    output_shape = tuple(self.forward_layer.compute_output_shape(
+        input_shape).as_list())
+    if self.return_state:
+      state_shape = output_shape[1:]
+      output_shape = output_shape[0]
+
+    if self.merge_mode == 'concat':
+      output_shape = list(output_shape)
+      output_shape[-1] *= 2
+      output_shape = tuple(output_shape)
     elif self.merge_mode is None:
-      shape = self.forward_layer._compute_output_shape(input_shape)  # pylint: disable=protected-access
-      return [shape, copy.copy(shape)]
+      output_shape = [output_shape, copy.copy(output_shape)]
 
-  def call(self, inputs, training=None, mask=None):
+    if self.return_state:
+      if self.merge_mode is None:
+        return output_shape + state_shape + copy.copy(state_shape)
+      return [output_shape] + state_shape + copy.copy(state_shape)
+    return output_shape
+
+  def call(self, inputs, training=None, mask=None, initial_state=None):
     kwargs = {}
     if has_arg(self.layer.call, 'training'):
       kwargs['training'] = training
     if has_arg(self.layer.call, 'mask'):
       kwargs['mask'] = mask
 
-    y = self.forward_layer.call(inputs, **kwargs)
-    y_rev = self.backward_layer.call(inputs, **kwargs)
+    if initial_state is not None and has_arg(self.layer.call, 'initial_state'):
+      if not isinstance(initial_state, list):
+        raise ValueError(
+            'When passing `initial_state` to a Bidirectional RNN, the state '
+            'should be a list containing the states of the underlying RNNs. '
+            'Found: ' + str(initial_state))
+      forward_state = initial_state[:len(initial_state) // 2]
+      backward_state = initial_state[len(initial_state) // 2:]
+      y = self.forward_layer.call(inputs, initial_state=forward_state, **kwargs)
+      y_rev = self.backward_layer.call(
+          inputs, initial_state=backward_state, **kwargs)
+    else:
+      y = self.forward_layer.call(inputs, **kwargs)
+      y_rev = self.backward_layer.call(inputs, **kwargs)
+
+    if self.return_state:
+      states = y[1:] + y_rev[1:]
+      y = y[0]
+      y_rev = y_rev[0]
+
     if self.return_sequences:
       y_rev = K.reverse(y_rev, 1)
     if self.merge_mode == 'concat':
@@ -343,6 +353,11 @@ class Bidirectional(Wrapper):
           out._uses_learning_phase = True
       else:
         output._uses_learning_phase = True
+
+    if self.return_state:
+      if self.merge_mode is None:
+        return output + states
+      return [output] + states
     return output
 
   def reset_states(self):
diff --git a/tensorflow/python/keras/_impl/keras/layers/wrappers_test.py b/tensorflow/python/keras/_impl/keras/layers/wrappers_test.py
index a0951b8240dac5162161962456c34df4c2a16595..f48c8919a148403874758b618aaa9a662e511240 100644
--- a/tensorflow/python/keras/_impl/keras/layers/wrappers_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/wrappers_test.py
@@ -158,7 +158,7 @@ class BidirectionalTest(test.TestCase):
 
         # test compute output shape
         ref_shape = model.layers[-1].output.get_shape()
-        shape = model.layers[-1]._compute_output_shape(
+        shape = model.layers[-1].compute_output_shape(
             (None, timesteps, dim))
         self.assertListEqual(shape.as_list(), ref_shape.as_list())
 
@@ -238,6 +238,131 @@ class BidirectionalTest(test.TestCase):
       model.compile(loss='mse', optimizer='sgd')
       model.fit(x, y, epochs=1, batch_size=1)
 
+  def test_Bidirectional_merged_value(self):
+    rnn = keras.layers.LSTM
+    samples = 2
+    dim = 5
+    timesteps = 3
+    units = 3
+    x = [np.random.rand(samples, timesteps, dim)]
+
+    with self.test_session():
+      for merge_mode in ['sum', 'mul', 'ave', 'concat', None]:
+        if merge_mode == 'sum':
+          merge_func = lambda y, y_rev: y + y_rev
+        elif merge_mode == 'mul':
+          merge_func = lambda y, y_rev: y * y_rev
+        elif merge_mode == 'ave':
+          merge_func = lambda y, y_rev: (y + y_rev) / 2
+        elif merge_mode == 'concat':
+          merge_func = lambda y, y_rev: np.concatenate((y, y_rev), axis=-1)
+        else:
+          merge_func = lambda y, y_rev: [y, y_rev]
+
+        # basic case
+        inputs = keras.Input((timesteps, dim))
+        layer = keras.layers.Bidirectional(
+            rnn(units, return_sequences=True), merge_mode=merge_mode)
+        f_merged = keras.backend.function([inputs], _to_list(layer(inputs)))
+        f_forward = keras.backend.function([inputs],
+                                           [layer.forward_layer.call(inputs)])
+        f_backward = keras.backend.function(
+            [inputs],
+            [keras.backend.reverse(layer.backward_layer.call(inputs), 1)])
+
+        y_merged = f_merged(x)
+        y_expected = _to_list(merge_func(f_forward(x)[0], f_backward(x)[0]))
+        assert len(y_merged) == len(y_expected)
+        for x1, x2 in zip(y_merged, y_expected):
+          self.assertAllClose(x1, x2, atol=1e-5)
+
+        # test return_state
+        inputs = keras.Input((timesteps, dim))
+        layer = keras.layers.Bidirectional(
+            rnn(units, return_state=True), merge_mode=merge_mode)
+        f_merged = keras.backend.function([inputs], layer(inputs))
+        f_forward = keras.backend.function([inputs],
+                                           layer.forward_layer.call(inputs))
+        f_backward = keras.backend.function([inputs],
+                                            layer.backward_layer.call(inputs))
+        n_states = len(layer.layer.states)
+
+        y_merged = f_merged(x)
+        y_forward = f_forward(x)
+        y_backward = f_backward(x)
+        y_expected = _to_list(merge_func(y_forward[0], y_backward[0]))
+        assert len(y_merged) == len(y_expected) + n_states * 2
+        for x1, x2 in zip(y_merged, y_expected):
+          self.assertAllClose(x1, x2, atol=1e-5)
+
+        y_merged = y_merged[-n_states * 2:]
+        y_forward = y_forward[-n_states:]
+        y_backward = y_backward[-n_states:]
+        for state_birnn, state_inner in zip(y_merged, y_forward + y_backward):
+          self.assertAllClose(state_birnn, state_inner, atol=1e-5)
+
+  def test_Bidirectional_dropout(self):
+    rnn = keras.layers.LSTM
+    samples = 2
+    dim = 5
+    timesteps = 3
+    units = 3
+    merge_mode = 'sum'
+    x = [np.random.rand(samples, timesteps, dim)]
+
+    with self.test_session():
+      inputs = keras.Input((timesteps, dim))
+      wrapped = keras.layers.Bidirectional(
+          rnn(units, dropout=0.2, recurrent_dropout=0.2), merge_mode=merge_mode)
+      outputs = _to_list(wrapped(inputs, training=True))
+      assert all(not getattr(x, '_uses_learning_phase') for x in outputs)
+
+      inputs = keras.Input((timesteps, dim))
+      wrapped = keras.layers.Bidirectional(
+          rnn(units, dropout=0.2, return_state=True), merge_mode=merge_mode)
+      outputs = _to_list(wrapped(inputs))
+      assert all(x._uses_learning_phase for x in outputs)
+
+      model = keras.Model(inputs, outputs)
+      assert model.uses_learning_phase
+      y1 = _to_list(model.predict(x))
+      y2 = _to_list(model.predict(x))
+      for x1, x2 in zip(y1, y2):
+        self.assertAllClose(x1, x2, atol=1e-5)
+
+  def test_Bidirectional_state_reuse(self):
+    rnn = keras.layers.LSTM
+    samples = 2
+    dim = 5
+    timesteps = 3
+    units = 3
+
+    with self.test_session():
+      inputs = keras.Input((timesteps, dim))
+      layer = keras.layers.Bidirectional(
+          rnn(units, return_state=True, return_sequences=True))
+      outputs = layer(inputs)
+      output, state = outputs[0], outputs[1:]
+
+      # test passing invalid initial_state: passing a tensor
+      with self.assertRaises(ValueError):
+        output = keras.layers.Bidirectional(
+            rnn(units))(output, initial_state=state[0])
+
+      # test valid usage: passing a list
+      output = keras.layers.Bidirectional(
+          rnn(units))(output, initial_state=state)
+      model = keras.Model(inputs, output)
+      inputs = np.random.rand(samples, timesteps, dim)
+      outputs = model.predict(inputs)
+
+
+def _to_list(ls):
+  if isinstance(ls, list):
+    return ls
+  else:
+    return [ls]
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/losses.py b/tensorflow/python/keras/_impl/keras/losses.py
index 19212aeee8cd4fbc723ba3e47c9d3e226ec339a9..1576ed7b999f65992f46b357c8ebeda8935c68d0 100644
--- a/tensorflow/python/keras/_impl/keras/losses.py
+++ b/tensorflow/python/keras/_impl/keras/losses.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Built-in Keras loss functions.
+# pylint: disable=unused-import
+"""Built-in loss functions.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -23,72 +24,108 @@ import six
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.metrics.mean_squared_error',
+           'keras.losses.mean_squared_error')
 def mean_squared_error(y_true, y_pred):
   return K.mean(K.square(y_pred - y_true), axis=-1)
 
 
+@tf_export('keras.metrics.mean_absolute_error',
+           'keras.losses.mean_absolute_error')
 def mean_absolute_error(y_true, y_pred):
   return K.mean(K.abs(y_pred - y_true), axis=-1)
 
 
+@tf_export('keras.metrics.mean_absolute_percentage_error',
+           'keras.losses.mean_absolute_percentage_error')
 def mean_absolute_percentage_error(y_true, y_pred):
-  # Equivalent to MAE, but sometimes easier to interpret.
   diff = K.abs((y_true - y_pred) / K.clip(K.abs(y_true), K.epsilon(), None))
   return 100. * K.mean(diff, axis=-1)
 
 
+@tf_export('keras.metrics.mean_squared_logarithmic_error',
+           'keras.losses.mean_squared_logarithmic_error')
 def mean_squared_logarithmic_error(y_true, y_pred):
   first_log = K.log(K.clip(y_pred, K.epsilon(), None) + 1.)
   second_log = K.log(K.clip(y_true, K.epsilon(), None) + 1.)
   return K.mean(K.square(first_log - second_log), axis=-1)
 
 
+@tf_export('keras.metrics.squared_hinge', 'keras.losses.squared_hinge')
 def squared_hinge(y_true, y_pred):
   return K.mean(K.square(K.maximum(1. - y_true * y_pred, 0.)), axis=-1)
 
 
+@tf_export('keras.metrics.hinge', 'keras.losses.hinge')
 def hinge(y_true, y_pred):
   return K.mean(K.maximum(1. - y_true * y_pred, 0.), axis=-1)
 
 
+@tf_export('keras.losses.categorical_hinge')
 def categorical_hinge(y_true, y_pred):
   pos = K.sum(y_true * y_pred, axis=-1)
   neg = K.max((1. - y_true) * y_pred, axis=-1)
-  return K.maximum(neg - pos + 1., 0.)
+  return K.maximum(0., neg - pos + 1.)
 
 
+@tf_export('keras.losses.logcosh')
 def logcosh(y_true, y_pred):
+  """Logarithm of the hyperbolic cosine of the prediction error.
 
-  def cosh(x):
-    return (K.exp(x) + K.exp(-x)) / 2
+  `log(cosh(x))` is approximately equal to `(x ** 2) / 2` for small `x` and
+  to `abs(x) - log(2)` for large `x`. This means that 'logcosh' works mostly
+  like the mean squared error, but will not be so strongly affected by the
+  occasional wildly incorrect prediction.
 
-  return K.mean(K.log(cosh(y_pred - y_true)), axis=-1)
+  Arguments:
+      y_true: tensor of true targets.
+      y_pred: tensor of predicted targets.
 
+  Returns:
+      Tensor with one scalar loss entry per sample.
+  """
 
+  def _logcosh(x):
+    return x + K.softplus(-2. * x) - K.log(2.)
+
+  return K.mean(_logcosh(y_pred - y_true), axis=-1)
+
+
+@tf_export('keras.metrics.categorical_crossentropy',
+           'keras.losses.categorical_crossentropy')
 def categorical_crossentropy(y_true, y_pred):
   return K.categorical_crossentropy(y_true, y_pred)
 
 
+@tf_export('keras.metrics.sparse_categorical_crossentropy',
+           'keras.losses.sparse_categorical_crossentropy')
 def sparse_categorical_crossentropy(y_true, y_pred):
   return K.sparse_categorical_crossentropy(y_true, y_pred)
 
 
+@tf_export('keras.metrics.binary_crossentropy',
+           'keras.losses.binary_crossentropy')
 def binary_crossentropy(y_true, y_pred):
   return K.mean(K.binary_crossentropy(y_true, y_pred), axis=-1)
 
 
+@tf_export('keras.metrics.kullback_leibler_divergence',
+           'keras.losses.kullback_leibler_divergence')
 def kullback_leibler_divergence(y_true, y_pred):
   y_true = K.clip(y_true, K.epsilon(), 1)
   y_pred = K.clip(y_pred, K.epsilon(), 1)
   return K.sum(y_true * K.log(y_true / y_pred), axis=-1)
 
 
+@tf_export('keras.metrics.poisson', 'keras.losses.poisson')
 def poisson(y_true, y_pred):
   return K.mean(y_pred - y_true * K.log(y_pred + K.epsilon()), axis=-1)
 
 
+@tf_export('keras.metrics.cosine_proximity', 'keras.losses.cosine_proximity')
 def cosine_proximity(y_true, y_pred):
   y_true = K.l2_normalize(y_true, axis=-1)
   y_pred = K.l2_normalize(y_pred, axis=-1)
@@ -105,10 +142,12 @@ kld = KLD = kullback_leibler_divergence
 cosine = cosine_proximity
 
 
+@tf_export('keras.losses.serialize')
 def serialize(loss):
   return serialize_keras_object(loss)
 
 
+@tf_export('keras.losses.deserialize')
 def deserialize(name, custom_objects=None):
   return deserialize_keras_object(
       name,
@@ -117,6 +156,7 @@ def deserialize(name, custom_objects=None):
       printable_module_name='loss function')
 
 
+@tf_export('keras.losses.get')
 def get(identifier):
   if identifier is None:
     return None
diff --git a/tensorflow/python/keras/_impl/keras/metrics.py b/tensorflow/python/keras/_impl/keras/metrics.py
index 202048f26d2ad201b4762d3b2b32638f9d041e88..0e2fb6365a2d9fda987d1326d8a48f40b55672f4 100644
--- a/tensorflow/python/keras/_impl/keras/metrics.py
+++ b/tensorflow/python/keras/_impl/keras/metrics.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Built-in Keras metrics functions.
+# pylint: disable=unused-import
+"""Built-in metrics.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -21,7 +22,6 @@ from __future__ import print_function
 import six
 
 from tensorflow.python.keras._impl.keras import backend as K
-# pylint: disable=unused-import
 from tensorflow.python.keras._impl.keras.losses import binary_crossentropy
 from tensorflow.python.keras._impl.keras.losses import categorical_crossentropy
 from tensorflow.python.keras._impl.keras.losses import cosine_proximity
@@ -35,14 +35,16 @@ from tensorflow.python.keras._impl.keras.losses import mean_squared_logarithmic_
 from tensorflow.python.keras._impl.keras.losses import poisson
 from tensorflow.python.keras._impl.keras.losses import sparse_categorical_crossentropy
 from tensorflow.python.keras._impl.keras.losses import squared_hinge
-# pylint: disable=unused-import
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.metrics.binary_accuracy')
 def binary_accuracy(y_true, y_pred):
   return K.mean(K.equal(y_true, K.round(y_pred)), axis=-1)
 
 
+@tf_export('keras.metrics.categorical_accuracy')
 def categorical_accuracy(y_true, y_pred):
   return K.cast(
       K.equal(K.argmax(y_true, axis=-1), K.argmax(y_pred, axis=-1)), K.floatx())
@@ -55,13 +57,15 @@ def sparse_categorical_accuracy(y_true, y_pred):
                                          K.floatx())), K.floatx())
 
 
+@tf_export('keras.metrics.top_k_categorical_accuracy')
 def top_k_categorical_accuracy(y_true, y_pred, k=5):
   return K.mean(K.in_top_k(y_pred, K.argmax(y_true, axis=-1), k), axis=-1)
 
 
+@tf_export('keras.metrics.sparse_top_k_categorical_accuracy')
 def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5):
-  return K.mean(K.in_top_k(y_pred,
-                           K.cast(K.max(y_true, axis=-1), 'int32'), k), axis=-1)
+  return K.mean(
+      K.in_top_k(y_pred, K.cast(K.max(y_true, axis=-1), 'int32'), k), axis=-1)
 
 
 # Aliases
@@ -73,10 +77,12 @@ msle = MSLE = mean_squared_logarithmic_error
 cosine = cosine_proximity
 
 
+@tf_export('keras.metrics.serialize')
 def serialize(metric):
   return metric.__name__
 
 
+@tf_export('keras.metrics.deserialize')
 def deserialize(name, custom_objects=None):
   return deserialize_keras_object(
       name,
@@ -85,6 +91,7 @@ def deserialize(name, custom_objects=None):
       printable_module_name='metric function')
 
 
+@tf_export('keras.metrics.get')
 def get(identifier):
   if isinstance(identifier, six.string_types):
     identifier = str(identifier)
diff --git a/tensorflow/python/keras/_impl/keras/model_subclassing_test.py b/tensorflow/python/keras/_impl/keras/model_subclassing_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..275985aa36fc6d85768ae05f14cf65e710ad7353
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/model_subclassing_test.py
@@ -0,0 +1,558 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Model subclassing."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tempfile
+
+import numpy as np
+
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras._impl import keras
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training.rmsprop import RMSPropOptimizer
+
+try:
+  import h5py  # pylint:disable=g-import-not-at-top
+except ImportError:
+  h5py = None
+
+
+class SimpleTestModel(keras.Model):
+
+  def __init__(self, use_bn=False, use_dp=False, num_classes=10):
+    super(SimpleTestModel, self).__init__(name='test_model')
+    self.use_bn = use_bn
+    self.use_dp = use_dp
+    self.num_classes = num_classes
+
+    self.dense1 = keras.layers.Dense(32, activation='relu')
+    self.dense2 = keras.layers.Dense(num_classes, activation='softmax')
+    if self.use_dp:
+      self.dp = keras.layers.Dropout(0.5)
+    if self.use_bn:
+      self.bn = keras.layers.BatchNormalization(axis=-1)
+
+  def call(self, inputs):
+    x = self.dense1(inputs)
+    if self.use_dp:
+      x = self.dp(x)
+    if self.use_bn:
+      x = self.bn(x)
+    return self.dense2(x)
+
+
+class MultiIOTestModel(keras.Model):
+
+  def __init__(self, use_bn=False, use_dp=False, num_classes=(2, 3)):
+    super(MultiIOTestModel, self).__init__(name='test_model')
+    self.use_bn = use_bn
+    self.use_dp = use_dp
+    self.num_classes = num_classes
+
+    self.dense1 = keras.layers.Dense(32, activation='relu')
+    self.dense2 = keras.layers.Dense(num_classes[0], activation='softmax')
+    self.dense3 = keras.layers.Dense(num_classes[1], activation='softmax')
+    if use_dp:
+      self.dp = keras.layers.Dropout(0.5)
+    if use_bn:
+      self.bn = keras.layers.BatchNormalization()
+
+  def call(self, inputs):
+    x1, x2 = inputs
+    x1 = self.dense1(x1)
+    x2 = self.dense1(x2)
+    if self.use_dp:
+      x1 = self.dp(x1)
+    if self.use_bn:
+      x2 = self.bn(x2)
+    return [self.dense2(x1), self.dense3(x2)]
+
+
+class NestedTestModel1(keras.Model):
+  """A model subclass nested inside a model subclass.
+  """
+
+  def __init__(self, num_classes=2):
+    super(NestedTestModel1, self).__init__(name='nested_model_1')
+    self.num_classes = num_classes
+    self.dense1 = keras.layers.Dense(32, activation='relu')
+    self.dense2 = keras.layers.Dense(num_classes, activation='relu')
+    self.bn = keras.layers.BatchNormalization()
+    self.test_net = SimpleTestModel(num_classes=4,
+                                    use_bn=True,
+                                    use_dp=True)
+
+  def call(self, inputs):
+    x = self.dense1(inputs)
+    x = self.bn(x)
+    x = self.test_net(x)  # pylint: disable=not-callable
+    return self.dense2(x)
+
+
+def get_functional_graph_model(input_dim, num_classes):
+  # A simple functional-API model (a.k.a. graph network)
+  inputs = keras.Input(shape=(input_dim,))
+  x = keras.layers.Dense(32, activation='relu')(inputs)
+  x = keras.layers.BatchNormalization()(x)
+  outputs = keras.layers.Dense(num_classes)(x)
+  return keras.Model(inputs, outputs)
+
+
+class NestedTestModel2(keras.Model):
+  """A model subclass with a functional-API graph network inside.
+  """
+
+  def __init__(self, num_classes=2):
+    super(NestedTestModel2, self).__init__(name='nested_model_2')
+    self.num_classes = num_classes
+    self.dense1 = keras.layers.Dense(32, activation='relu')
+    self.dense2 = keras.layers.Dense(num_classes, activation='relu')
+    self.bn = self.bn = keras.layers.BatchNormalization()
+    self.test_net = get_functional_graph_model(32, 4)
+
+  def call(self, inputs):
+    x = self.dense1(inputs)
+    x = self.bn(x)
+    x = self.test_net(x)
+    return self.dense2(x)
+
+
+def get_nested_model_3(input_dim, num_classes):
+  # A functional-API model with a subclassed model inside.
+  # NOTE: this requires the inner subclass to implement `compute_output_shape`.
+
+  inputs = keras.Input(shape=(input_dim,))
+  x = keras.layers.Dense(32, activation='relu')(inputs)
+  x = keras.layers.BatchNormalization()(x)
+
+  class Inner(keras.Model):
+
+    def __init__(self):
+      super(Inner, self).__init__()
+      self.dense1 = keras.layers.Dense(32, activation='relu')
+      self.dense2 = keras.layers.Dense(5, activation='relu')
+      self.bn = keras.layers.BatchNormalization()
+
+    def call(self, inputs):
+      x = self.dense1(inputs)
+      x = self.dense2(x)
+      return self.bn(x)
+
+    def compute_output_shape(self, input_shape):
+      return tensor_shape.TensorShape((input_shape[0], 5))
+
+  test_model = Inner()
+  x = test_model(x)  # pylint: disable=not-callable
+  outputs = keras.layers.Dense(num_classes)(x)
+  return keras.Model(inputs, outputs, name='nested_model_3')
+
+
+class ModelSubclassingTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_single_io_workflow_with_np_arrays(self):
+    num_classes = 2
+    num_samples = 100
+    input_dim = 50
+
+    with self.test_session():
+      model = SimpleTestModel(num_classes=num_classes,
+                              use_dp=True,
+                              use_bn=True)
+      model.compile(loss='mse',
+                    optimizer=RMSPropOptimizer(learning_rate=0.001),
+                    metrics=['acc'])
+
+      x = np.ones((num_samples, input_dim))
+      y = np.zeros((num_samples, num_classes))
+
+      model.fit(x, y, epochs=2, batch_size=32, verbose=0)
+      _ = model.evaluate(x, y, verbose=0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_multi_io_workflow_with_np_arrays(self):
+    num_classes = (2, 3)
+    num_samples = 1000
+    input_dim = 50
+
+    with self.test_session():
+      model = MultiIOTestModel(num_classes=num_classes,
+                               use_dp=True,
+                               use_bn=True)
+      model.compile(loss='mse',
+                    optimizer=RMSPropOptimizer(learning_rate=0.001),
+                    metrics=['acc'])
+
+      x1 = np.ones((num_samples, input_dim))
+      x2 = np.ones((num_samples, input_dim))
+      y1 = np.zeros((num_samples, num_classes[0]))
+      y2 = np.zeros((num_samples, num_classes[1]))
+
+      model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0)
+      _ = model.evaluate([x1, x2], [y1, y2], verbose=0)
+
+  def test_single_io_workflow_with_tensors(self):
+
+    num_classes = 2
+    num_samples = 10
+    input_dim = 50
+
+    with self.test_session():
+      model = SimpleTestModel(num_classes=num_classes,
+                              use_dp=True,
+                              use_bn=True)
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+
+      x = array_ops.ones((num_samples, input_dim))
+      y = array_ops.zeros((num_samples, num_classes))
+
+      model.fit(x, y, epochs=2, steps_per_epoch=10, verbose=0)
+      _ = model.evaluate(steps=10, verbose=0)
+
+  def test_multi_io_workflow_with_tensors(self):
+
+    num_classes = (2, 3)
+    num_samples = 10
+    input_dim = 50
+
+    with self.test_session():
+      model = MultiIOTestModel(num_classes=num_classes,
+                               use_dp=True,
+                               use_bn=True)
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+
+      x1 = array_ops.ones((num_samples, input_dim))
+      x2 = array_ops.ones((num_samples, input_dim))
+      y1 = array_ops.zeros((num_samples, num_classes[0]))
+      y2 = array_ops.zeros((num_samples, num_classes[1]))
+
+      model.fit([x1, x2], [y1, y2], epochs=2, steps_per_epoch=10, verbose=0)
+      _ = model.evaluate(steps=10, verbose=0)
+
+  def test_multi_io_workflow_with_numpy_arrays_and_custom_placeholders(self):
+
+    num_classes = (2, 3)
+    num_samples = 1000
+    input_dim = 50
+
+    with self.test_session():
+      model = MultiIOTestModel(num_classes=num_classes,
+                               use_dp=True,
+                               use_bn=True)
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+
+      x1 = np.ones((num_samples, input_dim))
+      x2 = np.ones((num_samples, input_dim))
+      y1 = np.zeros((num_samples, num_classes[0]))
+      y2 = np.zeros((num_samples, num_classes[1]))
+
+      x2_placeholder = array_ops.placeholder(
+          dtype='float32', shape=(None, input_dim))
+      model._set_inputs([x1, x2_placeholder])
+
+      model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0)
+      _ = model.evaluate([x1, x2], [y1, y2], verbose=0)
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def test_attributes(self):
+    # layers, weights, trainable_weights, non_trainable_weights, inputs, outputs
+
+    num_classes = (2, 3)
+    num_samples = 100
+    input_dim = 50
+
+    model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
+
+    x1 = np.ones((num_samples, input_dim))
+    x2 = np.ones((num_samples, input_dim))
+    y1 = np.zeros((num_samples, num_classes[0]))
+    y2 = np.zeros((num_samples, num_classes[1]))
+
+    self.assertEqual(model.name, 'test_model')
+    self.assertEqual(model.built, False)
+    self.assertEqual(len(model.weights), 0)
+
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.train_on_batch([x1, x2], [y1, y2])
+
+    self.assertEqual(model.built, True)
+    self.assertEqual(len(model.layers), 4)
+    self.assertEqual(len(model.weights), 10)
+    self.assertEqual(len(model.trainable_weights), 8)
+    self.assertEqual(len(model.non_trainable_weights), 2)
+    self.assertEqual(len(model.inputs), 2)
+    self.assertEqual(len(model.outputs), 2)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_updates(self):
+    # test that updates get run during training
+    num_samples = 100
+    input_dim = 50
+
+    class BNNet(keras.Model):
+
+      def __init__(self):
+        super(BNNet, self).__init__()
+        self.bn = keras.layers.BatchNormalization(beta_initializer='ones',
+                                                  gamma_initializer='ones')
+
+      def call(self, inputs):
+        return self.bn(inputs)
+
+    x = np.ones((num_samples, input_dim))
+    y = np.ones((num_samples, input_dim))
+
+    with self.test_session():
+      model = BNNet()
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      y_ref = model.predict(x)
+
+      model.train_on_batch(x, y)
+      y_new = model.predict(x)
+      self.assertGreater(np.sum(np.abs(y_ref - y_new)), 0.1)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_training_and_inference_behavior(self):
+    # test that dropout is applied in training and not inference
+
+    num_samples = 100
+    input_dim = 50
+
+    class DPNet(keras.Model):
+
+      def __init__(self):
+        super(DPNet, self).__init__()
+        self.dp = keras.layers.Dropout(0.5)
+        self.dense = keras.layers.Dense(1,
+                                        use_bias=False,
+                                        kernel_initializer='ones')
+
+      def call(self, inputs):
+        x = self.dp(inputs)
+        return self.dense(x)
+
+    with self.test_session():
+      model = DPNet()
+      x = np.ones((num_samples, input_dim))
+      y = model.predict(x)
+      self.assertEqual(np.sum(y), np.sum(x))
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      loss = model.train_on_batch(x, y)
+      self.assertGreater(loss, 0.1)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_training_methods(self):
+    # test fit, train_on_batch
+    # on different input types: list, dict
+
+    num_classes = (2, 3)
+    num_samples = 100
+    input_dim = 50
+
+    x1 = np.ones((num_samples, input_dim))
+    x2 = np.ones((num_samples, input_dim))
+    y1 = np.zeros((num_samples, num_classes[0]))
+    y2 = np.zeros((num_samples, num_classes[1]))
+
+    with self.test_session():
+      model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32)
+      model.fit({'input_1': x1, 'input_2': x2},
+                {'output_1': y1, 'output_2': y2},
+                epochs=2, batch_size=32)
+      model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32,
+                validation_data=([x1, x2], [y1, y2]))
+
+      model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.train_on_batch([x1, x2], [y1, y2])
+      model.train_on_batch({'input_1': x1, 'input_2': x2},
+                           {'output_1': y1, 'output_2': y2})
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def test_inference_methods(self):
+    # test predict, evaluate, test_on_batch, predict_on_batch
+    # on different input types: list, dict
+    num_classes = (2, 3)
+    num_samples = 100
+    input_dim = 50
+
+    x1 = np.ones((num_samples, input_dim))
+    x2 = np.ones((num_samples, input_dim))
+    y1 = np.zeros((num_samples, num_classes[0]))
+    y2 = np.zeros((num_samples, num_classes[1]))
+
+    with self.test_session():
+      model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.evaluate([x1, x2], [y1, y2])
+      model.test_on_batch([x1, x2], [y1, y2])
+
+      model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
+      model.predict([x1, x2])
+
+      model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
+      model.predict_on_batch([x1, x2])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_trainable_mutation(self):
+    # test that you can change `trainable` on a model or layer, and that
+    # it freezes the model state during training
+    # TODO(fchollet): add test after we unify BN behavior in eager and symbolic.
+    pass
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_saving(self):
+    if h5py is None:
+      return  # Skip test if models cannot be saved.
+
+    num_classes = (2, 3)
+    num_samples = 100
+    input_dim = 50
+
+    x1 = np.ones((num_samples, input_dim))
+    x2 = np.ones((num_samples, input_dim))
+    y1 = np.zeros((num_samples, num_classes[0]))
+    y2 = np.zeros((num_samples, num_classes[1]))
+
+    with self.test_session():
+      model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32)
+      y_ref_1, y_ref_2 = model.predict([x1, x2])
+
+      fd, fname = tempfile.mkstemp('.h5')
+      model.save_weights(fname)
+
+      model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
+      # need to build the model before loading weights
+      # (otherwise no weights to load)
+      model._set_inputs([x1, x2])
+      model.load_weights(fname)
+
+      y1, y2 = model.predict([x1, x2])
+      self.assertAllClose(y_ref_1, y1, atol=1e-5)
+      self.assertAllClose(y_ref_2, y2, atol=1e-5)
+      os.close(fd)
+      os.remove(fname)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_summary(self):
+
+    class ToString(object):
+
+      def __init__(self):
+        self.contents = ''
+
+      def __call__(self, msg):
+        self.contents += msg + '\n'
+
+    # Single-io
+    model = SimpleTestModel(num_classes=4, use_bn=True, use_dp=True)
+    model._set_inputs(np.ones((3, 4)))  # need to build model first
+    print_fn = ToString()
+    model.summary(print_fn=print_fn)
+    self.assertTrue('Trainable params: 356' in print_fn.contents)
+
+    # Multi-io
+    model = MultiIOTestModel(num_classes=(5, 6), use_bn=True, use_dp=True)
+    model._set_inputs([np.ones((3, 4)),
+                       np.ones((3, 4))])  # need to build model first
+    print_fn = ToString()
+    model.summary(print_fn=print_fn)
+    self.assertTrue('Trainable params: 587' in print_fn.contents)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_subclass_nested_in_subclass(self):
+    num_classes = 2
+    num_samples = 100
+    input_dim = 50
+
+    with self.test_session():
+      model = NestedTestModel1(num_classes=num_classes)
+      model.compile(loss='mse',
+                    optimizer=RMSPropOptimizer(learning_rate=0.001),
+                    metrics=['acc'])
+
+      x = np.ones((num_samples, input_dim))
+      y = np.zeros((num_samples, num_classes))
+
+      model.fit(x, y, epochs=2, batch_size=32, verbose=0)
+      _ = model.evaluate(x, y, verbose=0)
+
+      self.assertEqual(len(model.weights), 8 + len(model.test_net.weights))
+      self.assertEqual(len(model.non_trainable_weights),
+                       2 + len(model.test_net.non_trainable_weights))
+      self.assertEqual(len(model.trainable_weights),
+                       6 + len(model.test_net.trainable_weights))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_graph_nested_in_subclass(self):
+    num_classes = 2
+    num_samples = 100
+    input_dim = 50
+
+    with self.test_session():
+      model = NestedTestModel2(num_classes=num_classes)
+      model.compile(loss='mse',
+                    optimizer=RMSPropOptimizer(learning_rate=0.001),
+                    metrics=['acc'])
+
+      x = np.ones((num_samples, input_dim))
+      y = np.zeros((num_samples, num_classes))
+
+      model.fit(x, y, epochs=2, batch_size=32, verbose=0)
+      _ = model.evaluate(x, y, verbose=0)
+
+      self.assertEqual(len(model.weights), 8 + len(model.test_net.weights))
+      self.assertEqual(len(model.non_trainable_weights),
+                       2 + len(model.test_net.non_trainable_weights))
+      self.assertEqual(len(model.trainable_weights),
+                       6 + len(model.test_net.trainable_weights))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_subclass_nested_in_graph(self):
+    num_classes = 2
+    num_samples = 100
+    input_dim = 50
+
+    with self.test_session():
+      model = get_nested_model_3(input_dim=input_dim, num_classes=num_classes)
+      model.compile(loss='mse',
+                    optimizer=RMSPropOptimizer(learning_rate=0.001),
+                    metrics=['acc'])
+
+      x = np.ones((num_samples, input_dim))
+      y = np.zeros((num_samples, num_classes))
+
+      model.fit(x, y, epochs=2, batch_size=32, verbose=0)
+      _ = model.evaluate(x, y, verbose=0)
+
+      self.assertEqual(len(model.weights), 16)
+      self.assertEqual(
+          len(model.non_trainable_weights), 4)
+      self.assertEqual(len(model.trainable_weights), 12)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/_impl/keras/models.py b/tensorflow/python/keras/_impl/keras/models.py
index ba202827ce3fca397ab487f58c01667b9b0c4444..4c3ec7dbe458bfb78d38950b1bad7a474bb55ad3 100644
--- a/tensorflow/python/keras/_impl/keras/models.py
+++ b/tensorflow/python/keras/_impl/keras/models.py
@@ -38,6 +38,7 @@ from tensorflow.python.keras._impl.keras.engine.training import Model
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
 from tensorflow.python.keras._impl.keras.utils.io_utils import ask_to_proceed_with_overwrite
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 # pylint: disable=g-import-not-at-top
@@ -53,6 +54,7 @@ except ImportError:
 # pylint: enable=g-import-not-at-top
 
 
+@tf_export('keras.models.save_model')
 def save_model(model, filepath, overwrite=True, include_optimizer=True):
   """Save a model to a HDF5 file.
 
@@ -183,6 +185,7 @@ def save_model(model, filepath, overwrite=True, include_optimizer=True):
     f.flush()
 
 
+@tf_export('keras.models.load_model')
 def load_model(filepath, custom_objects=None, compile=True):  # pylint: disable=redefined-builtin
   """Loads a model saved via `save_model`.
 
@@ -302,6 +305,7 @@ def load_model(filepath, custom_objects=None, compile=True):  # pylint: disable=
   return model
 
 
+@tf_export('keras.models.model_from_config')
 def model_from_config(config, custom_objects=None):
   """Instantiates a Keras model from its config.
 
@@ -324,6 +328,7 @@ def model_from_config(config, custom_objects=None):
   return layer_module.deserialize(config, custom_objects=custom_objects)
 
 
+@tf_export('keras.models.model_from_yaml')
 def model_from_yaml(yaml_string, custom_objects=None):
   """Parses a yaml model configuration file and returns a model instance.
 
@@ -345,6 +350,7 @@ def model_from_yaml(yaml_string, custom_objects=None):
   return layer_module.deserialize(config, custom_objects=custom_objects)
 
 
+@tf_export('keras.models.model_from_json')
 def model_from_json(json_string, custom_objects=None):
   """Parses a JSON model configuration file and returns a model instance.
 
@@ -361,6 +367,7 @@ def model_from_json(json_string, custom_objects=None):
   return layer_module.deserialize(config, custom_objects=custom_objects)
 
 
+@tf_export('keras.models.Sequential', 'keras.Sequential')
 class Sequential(Model):
   """Linear stack of layers.
 
@@ -399,7 +406,9 @@ class Sequential(Model):
   """
 
   def __init__(self, layers=None, name=None):
-    self.layers = []  # Stack of layers.
+    self._is_graph_network = True
+    self._is_compiled = False
+    self._layers = []  # Stack of layers.
     self.model = None  # Internal Model instance.
     self.inputs = []  # List of input tensors
     self.outputs = []  # List of length 1: the output tensor (unique).
@@ -421,8 +430,6 @@ class Sequential(Model):
     # Used by Layer base class.
     self._dtype = None
     self._activity_regularizer = None
-    self._per_input_losses = {}
-    self._per_input_updates = {}
 
     # The following properties are not actually used by Keras;
     # they exist for compatibility with TF's variable scoping mechanism.
@@ -492,13 +499,13 @@ class Sequential(Model):
         # to the input layer we just created.
         layer(x)
 
-      if len(layer.inbound_nodes[-1].output_tensors) != 1:
+      if len(layer._inbound_nodes[-1].output_tensors) != 1:
         raise ValueError('All layers in a Sequential model '
                          'should have a single output tensor. '
                          'For multi-output layers, '
                          'use the functional API.')
 
-      self.outputs = [layer.inbound_nodes[-1].output_tensors[0]]
+      self.outputs = [layer._inbound_nodes[-1].output_tensors[0]]
       self.inputs = topology.get_source_inputs(self.outputs[0])
 
       # We create an input node, which we will keep updated
@@ -522,7 +529,7 @@ class Sequential(Model):
       self._inbound_nodes[0].output_tensors = self.outputs
       self._inbound_nodes[0].output_shapes = [K.int_shape(self.outputs[0])]
 
-    self.layers.append(layer)
+    self._layers.append(layer)
     self.built = False
 
   def pop(self):
@@ -636,34 +643,6 @@ class Sequential(Model):
       return trainable_weights + weights
     return weights
 
-  @property
-  def updates(self):
-    if not self.built:
-      self.build()
-    return self.model.updates
-
-  @property
-  def state_updates(self):
-    if not self.built:
-      self.build()
-    return self.model.state_updates
-
-  def get_updates_for(self, inputs):
-    if not self.built:
-      self.build()
-    return self.model.get_updates_for(inputs)
-
-  @property
-  def losses(self):
-    if not self.built:
-      self.build()
-    return self.model.losses
-
-  def get_losses_for(self, inputs):
-    if not self.built:
-      self.build()
-    return self.model.get_losses_for(inputs)
-
   @property
   def regularizers(self):
     if not self.built:
@@ -1070,7 +1049,7 @@ class Sequential(Model):
 
   def fit_generator(self,
                     generator,
-                    steps_per_epoch,
+                    steps_per_epoch=None,
                     epochs=1,
                     verbose=1,
                     callbacks=None,
@@ -1101,8 +1080,10 @@ class Sequential(Model):
         steps_per_epoch: Total number of steps (batches of samples)
             to yield from `generator` before declaring one epoch
             finished and starting the next epoch. It should typically
-            be equal to the number of unique samples of your dataset
+            be equal to the number of samples of your dataset
             divided by the batch size.
+            Optional for `Sequence`: if unspecified, will use
+            the `len(generator)` as a number of steps.
         epochs: Integer, total number of iterations on the data.
             Note that in conjunction with initial_epoch, the parameter
             epochs is to be understood as "final epoch". The model is
@@ -1118,8 +1099,10 @@ class Sequential(Model):
             is a generator.
             Number of steps to yield from validation generator
             at the end of every epoch. It should typically
-            be equal to the number of unique samples of your
+            be equal to the number of samples of your
             validation dataset divided by the batch size.
+            Optional for `Sequence`: if unspecified, will use
+            the `len(validation_data)` as a number of steps.
         class_weight: Dictionary mapping class indices to a weight
             for the class.
         max_queue_size: Maximum size for the generator queue
@@ -1195,7 +1178,7 @@ class Sequential(Model):
 
   def evaluate_generator(self,
                          generator,
-                         steps,
+                         steps=None,
                          max_queue_size=10,
                          workers=1,
                          use_multiprocessing=False,
@@ -1210,6 +1193,8 @@ class Sequential(Model):
             or (inputs, targets, sample_weights)
         steps: Total number of steps (batches of samples)
             to yield from `generator` before stopping.
+            Optional for `Sequence`: if unspecified, will use
+            the `len(generator)` as a number of steps.
         max_queue_size: maximum size for the generator queue
         workers: maximum number of processes to spin up
         use_multiprocessing: if True, use process based threading.
@@ -1254,7 +1239,7 @@ class Sequential(Model):
 
   def predict_generator(self,
                         generator,
-                        steps,
+                        steps=None,
                         max_queue_size=10,
                         workers=1,
                         use_multiprocessing=False,
@@ -1269,6 +1254,8 @@ class Sequential(Model):
         generator: generator yielding batches of input samples.
         steps: Total number of steps (batches of samples)
             to yield from `generator` before stopping.
+            Optional for `Sequence`: if unspecified, will use
+            the `len(generator)` as a number of steps.
         max_queue_size: maximum size for the generator queue
         workers: maximum number of processes to spin up
         use_multiprocessing: if True, use process based threading.
diff --git a/tensorflow/python/keras/_impl/keras/models_test.py b/tensorflow/python/keras/_impl/keras/models_test.py
index 61938066b98b9f6bb48e7e68870d15ed60ad3dd9..04017e4b28b27e52f88a7746fc44510c29edffce 100644
--- a/tensorflow/python/keras/_impl/keras/models_test.py
+++ b/tensorflow/python/keras/_impl/keras/models_test.py
@@ -306,7 +306,7 @@ class TestSequential(test.TestCase):
         def call(self, inputs):
           return [3 * inputs, 2 * inputs]
 
-        def _compute_output_shape(self, input_shape):
+        def compute_output_shape(self, input_shape):
           return [input_shape, input_shape]
 
       with self.assertRaises(ValueError):
@@ -340,6 +340,35 @@ class TestSequential(test.TestCase):
     inner_model.trainable = True
     self.assertEqual(len(model.trainable_weights), 4)
 
+  def test_sequential_update_disabling(self):
+    val_a = np.random.random((10, 4))
+    val_out = np.random.random((10, 4))
+
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.BatchNormalization(input_shape=(4,)))
+
+      model.trainable = False
+      assert not model.updates
+
+      model.compile('sgd', 'mse')
+      assert not model.updates
+      assert not model.model.updates
+
+      x1 = model.predict(val_a)
+      model.train_on_batch(val_a, val_out)
+      x2 = model.predict(val_a)
+      self.assertAllClose(x1, x2, atol=1e-7)
+
+      model.trainable = True
+      model.compile('sgd', 'mse')
+      assert model.updates
+      assert model.model.updates
+
+      model.train_on_batch(val_a, val_out)
+      x2 = model.predict(val_a)
+      assert np.abs(np.sum(x1 - x2)) > 1e-5
+
 
 class TestModelCloning(test.TestCase):
 
diff --git a/tensorflow/python/keras/_impl/keras/optimizers.py b/tensorflow/python/keras/_impl/keras/optimizers.py
index a08073fa86442e0564aa63052bb87b92dc64cdf6..76a97156ed7d9ca89b0d94f31bed3a23eca9609d 100644
--- a/tensorflow/python/keras/_impl/keras/optimizers.py
+++ b/tensorflow/python/keras/_impl/keras/optimizers.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Keras optimizer classes (will eventually be replaced with core optimizers).
+# pylint: disable=invalid-name
+"""Built-in optimizer classes.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -31,6 +32,7 @@ from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_ke
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer as tf_optimizer_module
+from tensorflow.python.util.tf_export import tf_export
 
 
 def clip_norm(g, c, n):
@@ -64,6 +66,7 @@ def clip_norm(g, c, n):
   return g
 
 
+@tf_export('keras.optimizers.Optimizer')
 class Optimizer(object):
   """Abstract optimizer base class.
 
@@ -121,9 +124,9 @@ class Optimizer(object):
     param_values = K.batch_get_value(params)
     for pv, p, w in zip(param_values, params, weights):
       if pv.shape != w.shape:
-        raise ValueError('Optimizer weight shape ' + str(pv.shape) +
-                         ' not compatible with '
-                         'provided weight shape ' + str(w.shape))
+        raise ValueError(
+            'Optimizer weight shape ' + str(pv.shape) + ' not compatible with '
+            'provided weight shape ' + str(w.shape))
       weight_value_tuples.append((p, w))
     K.batch_set_value(weight_value_tuples)
 
@@ -148,6 +151,7 @@ class Optimizer(object):
     return cls(**config)
 
 
+@tf_export('keras.optimizers.SGD')
 class SGD(Optimizer):
   """Stochastic gradient descent optimizer.
 
@@ -156,7 +160,8 @@ class SGD(Optimizer):
 
   Arguments:
       lr: float >= 0. Learning rate.
-      momentum: float >= 0. Parameter updates momentum.
+      momentum: float >= 0. Parameter that accelerates SGD
+          in the relevant direction and dampens oscillations.
       decay: float >= 0. Learning rate decay over each update.
       nesterov: boolean. Whether to apply Nesterov momentum.
   """
@@ -177,9 +182,9 @@ class SGD(Optimizer):
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. / (1. + self.decay * K.cast(self.iterations,
-                                            K.dtype(self.decay))))
-
+      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
+                 (1. + self.decay * K.cast(self.iterations,
+                                           K.dtype(self.decay))))
     # momentum
     shapes = [K.int_shape(p) for p in params]
     moments = [K.zeros(shape) for shape in shapes]
@@ -211,6 +216,7 @@ class SGD(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.optimizers.RMSprop')
 class RMSprop(Optimizer):
   """RMSProp optimizer.
 
@@ -224,32 +230,34 @@ class RMSprop(Optimizer):
   Arguments:
       lr: float >= 0. Learning rate.
       rho: float >= 0.
-      epsilon: float >= 0. Fuzz factor.
+      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
       decay: float >= 0. Learning rate decay over each update.
+
   """
 
-  def __init__(self, lr=0.001, rho=0.9, epsilon=1e-8, decay=0., **kwargs):
+  def __init__(self, lr=0.001, rho=0.9, epsilon=None, decay=0., **kwargs):
     super(RMSprop, self).__init__(**kwargs)
     with K.name_scope(self.__class__.__name__):
       self.lr = K.variable(lr, name='lr')
       self.rho = K.variable(rho, name='rho')
       self.decay = K.variable(decay, name='decay')
       self.iterations = K.variable(0, dtype='int64', name='iterations')
+    if epsilon is None:
+      epsilon = K.epsilon()
     self.epsilon = epsilon
     self.initial_decay = decay
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    accumulators = [
-        K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params
-    ]
+    accumulators = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
     self.weights = accumulators
     self.updates = [K.update_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. / (1. + self.decay * K.cast(self.iterations,
-                                            K.dtype(self.decay))))
+      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
+                 (1. + self.decay * K.cast(self.iterations,
+                                           K.dtype(self.decay))))
 
     for p, g, a in zip(params, grads, accumulators):
       # update accumulator
@@ -275,6 +283,7 @@ class RMSprop(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.optimizers.Adagrad')
 class Adagrad(Optimizer):
   """Adagrad optimizer.
 
@@ -283,20 +292,19 @@ class Adagrad(Optimizer):
 
   Arguments:
       lr: float >= 0. Learning rate.
-      epsilon: float >= 0.
+      epsilon: float >= 0. If `None`, defaults to `K.epsilon()`.
       decay: float >= 0. Learning rate decay over each update.
 
-  References:
-      - [Adaptive Subgradient Methods for Online Learning and Stochastic
-        Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
   """
 
-  def __init__(self, lr=0.01, epsilon=1e-8, decay=0., **kwargs):
+  def __init__(self, lr=0.01, epsilon=None, decay=0., **kwargs):
     super(Adagrad, self).__init__(**kwargs)
     with K.name_scope(self.__class__.__name__):
       self.lr = K.variable(lr, name='lr')
       self.decay = K.variable(decay, name='decay')
       self.iterations = K.variable(0, dtype='int64', name='iterations')
+    if epsilon is None:
+      epsilon = K.epsilon()
     self.epsilon = epsilon
     self.initial_decay = decay
 
@@ -309,8 +317,9 @@ class Adagrad(Optimizer):
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. / (1. + self.decay * K.cast(self.iterations,
-                                            K.dtype(self.decay))))
+      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
+                 (1. + self.decay * K.cast(self.iterations,
+                                           K.dtype(self.decay))))
 
     for p, g, a in zip(params, grads, accumulators):
       new_a = a + K.square(g)  # update accumulator
@@ -334,6 +343,7 @@ class Adagrad(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.optimizers.Adadelta')
 class Adadelta(Optimizer):
   """Adadelta optimizer.
 
@@ -344,20 +354,19 @@ class Adadelta(Optimizer):
       lr: float >= 0. Learning rate.
           It is recommended to leave it at the default value.
       rho: float >= 0.
-      epsilon: float >= 0. Fuzz factor.
+      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
       decay: float >= 0. Learning rate decay over each update.
 
-  References:
-      - [Adadelta - an adaptive learning rate
-        method](http://arxiv.org/abs/1212.5701)
   """
 
-  def __init__(self, lr=1.0, rho=0.95, epsilon=1e-8, decay=0., **kwargs):
+  def __init__(self, lr=1.0, rho=0.95, epsilon=None, decay=0., **kwargs):
     super(Adadelta, self).__init__(**kwargs)
     with K.name_scope(self.__class__.__name__):
       self.lr = K.variable(lr, name='lr')
       self.decay = K.variable(decay, name='decay')
       self.iterations = K.variable(0, dtype='int64', name='iterations')
+    if epsilon is None:
+      epsilon = K.epsilon()
     self.rho = rho
     self.epsilon = epsilon
     self.initial_decay = decay
@@ -372,8 +381,9 @@ class Adadelta(Optimizer):
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. / (1. + self.decay * K.cast(self.iterations,
-                                            K.dtype(self.decay))))
+      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
+                 (1. + self.decay * K.cast(self.iterations,
+                                           K.dtype(self.decay))))
 
     for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
       # update accumulator
@@ -406,6 +416,7 @@ class Adadelta(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.optimizers.Adam')
 class Adam(Optimizer):
   """Adam optimizer.
 
@@ -415,20 +426,21 @@ class Adam(Optimizer):
       lr: float >= 0. Learning rate.
       beta_1: float, 0 < beta < 1. Generally close to 1.
       beta_2: float, 0 < beta < 1. Generally close to 1.
-      epsilon: float >= 0. Fuzz factor.
+      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
       decay: float >= 0. Learning rate decay over each update.
+      amsgrad: boolean. Whether to apply the AMSGrad variant of this
+          algorithm from the paper "On the Convergence of Adam and
+          Beyond".
 
-  References:
-      - [Adam - A Method for Stochastic
-        Optimization](http://arxiv.org/abs/1412.6980v8)
   """
 
   def __init__(self,
                lr=0.001,
                beta_1=0.9,
                beta_2=0.999,
-               epsilon=1e-8,
+               epsilon=None,
                decay=0.,
+               amsgrad=False,
                **kwargs):
     super(Adam, self).__init__(**kwargs)
     with K.name_scope(self.__class__.__name__):
@@ -437,8 +449,11 @@ class Adam(Optimizer):
       self.beta_1 = K.variable(beta_1, name='beta_1')
       self.beta_2 = K.variable(beta_2, name='beta_2')
       self.decay = K.variable(decay, name='decay')
+    if epsilon is None:
+      epsilon = K.epsilon()
     self.epsilon = epsilon
     self.initial_decay = decay
+    self.amsgrad = amsgrad
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
@@ -446,21 +461,31 @@ class Adam(Optimizer):
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. / (1. + self.decay * K.cast(self.iterations,
-                                            K.dtype(self.decay))))
+      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
+                 (1. + self.decay * K.cast(self.iterations,
+                                           K.dtype(self.decay))))
 
     t = K.cast(self.iterations, K.floatx()) + 1
-    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
-                 (1. - K.pow(self.beta_1, t)))
+    lr_t = lr * (
+        K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t)))
 
     ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
     vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
-    self.weights = [self.iterations] + ms + vs
+    if self.amsgrad:
+      vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
+    else:
+      vhats = [K.zeros(1) for _ in params]
+    self.weights = [self.iterations] + ms + vs + vhats
 
-    for p, g, m, v in zip(params, grads, ms, vs):
+    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
       m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
       v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
-      p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
+      if self.amsgrad:
+        vhat_t = K.maximum(vhat, v_t)
+        p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
+        self.updates.append(K.update(vhat, vhat_t))
+      else:
+        p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
 
       self.updates.append(K.update(m, m_t))
       self.updates.append(K.update(v, v_t))
@@ -479,12 +504,14 @@ class Adam(Optimizer):
         'beta_1': float(K.get_value(self.beta_1)),
         'beta_2': float(K.get_value(self.beta_2)),
         'decay': float(K.get_value(self.decay)),
-        'epsilon': self.epsilon
+        'epsilon': self.epsilon,
+        'amsgrad': self.amsgrad
     }
     base_config = super(Adam, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.optimizers.Adamax')
 class Adamax(Optimizer):
   """Adamax optimizer from Adam paper's Section 7.
 
@@ -494,19 +521,16 @@ class Adamax(Optimizer):
   Arguments:
       lr: float >= 0. Learning rate.
       beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
-      epsilon: float >= 0. Fuzz factor.
+      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
       decay: float >= 0. Learning rate decay over each update.
 
-  References:
-      - [Adam - A Method for Stochastic
-        Optimization](http://arxiv.org/abs/1412.6980v8)
   """
 
   def __init__(self,
                lr=0.002,
                beta_1=0.9,
                beta_2=0.999,
-               epsilon=1e-8,
+               epsilon=None,
                decay=0.,
                **kwargs):
     super(Adamax, self).__init__(**kwargs)
@@ -516,6 +540,8 @@ class Adamax(Optimizer):
       self.beta_1 = K.variable(beta_1, name='beta_1')
       self.beta_2 = K.variable(beta_2, name='beta_2')
       self.decay = K.variable(decay, name='decay')
+    if epsilon is None:
+      epsilon = K.epsilon()
     self.epsilon = epsilon
     self.initial_decay = decay
 
@@ -525,8 +551,9 @@ class Adamax(Optimizer):
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr *= (1. / (1. + self.decay * K.cast(self.iterations,
-                                            K.dtype(self.decay))))
+      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
+                 (1. + self.decay * K.cast(self.iterations,
+                                           K.dtype(self.decay))))
 
     t = K.cast(self.iterations, K.floatx()) + 1
     lr_t = lr / (1. - K.pow(self.beta_1, t))
@@ -567,6 +594,7 @@ class Adamax(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.optimizers.Nadam')
 class Nadam(Optimizer):
   """Nesterov Adam optimizer.
 
@@ -580,19 +608,15 @@ class Nadam(Optimizer):
   Arguments:
       lr: float >= 0. Learning rate.
       beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
-      epsilon: float >= 0. Fuzz factor.
+      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
 
-  References:
-      - [Nadam report](http://cs229.stanford.edu/proj2015/054_report.pdf)
-      - [On the importance of initialization and momentum in deep
-        learning](http://www.cs.toronto.edu/~fritz/absps/momentum.pdf)
   """
 
   def __init__(self,
                lr=0.002,
                beta_1=0.9,
                beta_2=0.999,
-               epsilon=1e-8,
+               epsilon=None,
                schedule_decay=0.004,
                **kwargs):
     super(Nadam, self).__init__(**kwargs)
@@ -602,12 +626,15 @@ class Nadam(Optimizer):
       self.lr = K.variable(lr, name='lr')
       self.beta_1 = K.variable(beta_1, name='beta_1')
       self.beta_2 = K.variable(beta_2, name='beta_2')
+    if epsilon is None:
+      epsilon = K.epsilon()
     self.epsilon = epsilon
     self.schedule_decay = schedule_decay
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
     self.updates = [K.update_add(self.iterations, 1)]
+
     t = K.cast(self.iterations, K.floatx()) + 1
 
     # Due to the recommendations in [2], i.e. warming momentum schedule
@@ -670,6 +697,12 @@ class TFOptimizer(Optimizer):
     with K.name_scope(self.__class__.__name__):
       self.iterations = K.variable(0, dtype='int64', name='iterations')
 
+  def apply_gradients(self, grads):
+    self.optimizer.apply_gradients(grads)
+
+  def get_grads(self, loss, params):
+    return self.optimizer.compute_gradients(loss, params)
+
   def get_updates(self, loss, params):
     grads = self.optimizer.compute_gradients(loss, params)
     self.updates = [K.update_add(self.iterations, 1)]
@@ -691,7 +724,6 @@ class TFOptimizer(Optimizer):
 
 # Aliases.
 
-# pylint: disable=invalid-name
 sgd = SGD
 rmsprop = RMSprop
 adagrad = Adagrad
@@ -700,13 +732,13 @@ adam = Adam
 adamax = Adamax
 nadam = Nadam
 
-# pylint: enable=invalid-name
-
 
+@tf_export('keras.optimizers.serialize')
 def serialize(optimizer):
   return serialize_keras_object(optimizer)
 
 
+@tf_export('keras.optimizers.deserialize')
 def deserialize(config, custom_objects=None):
   """Inverse of the `serialize` function.
 
@@ -740,6 +772,7 @@ def deserialize(config, custom_objects=None):
       printable_module_name='optimizer')
 
 
+@tf_export('keras.optimizers.get')
 def get(identifier):
   """Retrieves a Keras Optimizer instance.
 
diff --git a/tensorflow/python/keras/_impl/keras/optimizers_test.py b/tensorflow/python/keras/_impl/keras/optimizers_test.py
index 6e9e4e6c99a6ffb0684d20ca001bba98b0d799bc..57636afbf089f27c00cc56c46fdb3ea50f89cc6b 100644
--- a/tensorflow/python/keras/_impl/keras/optimizers_test.py
+++ b/tensorflow/python/keras/_impl/keras/optimizers_test.py
@@ -102,6 +102,7 @@ class KerasOptimizersTest(test.TestCase):
     with self.test_session():
       _test_optimizer(keras.optimizers.Adam())
       _test_optimizer(keras.optimizers.Adam(decay=1e-3))
+      _test_optimizer(keras.optimizers.Adam(amsgrad=True))
 
   def test_adamax(self):
     with self.test_session():
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/image.py b/tensorflow/python/keras/_impl/keras/preprocessing/image.py
index 12dc718cd791d0a5829c4809474a83783ed561f9..d12f10863921ee7d635930f34e8bc701c89864e8 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/image.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/image.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=g-import-not-at-top
 """Fairly basic set of tools for real-time data augmentation on image data.
 
 Can easily be extended to include new transformations,
@@ -28,25 +29,23 @@ import re
 import threading
 
 import numpy as np
-from six.moves import range  # pylint: disable=redefined-builtin
-
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
-
-# pylint: disable=g-import-not-at-top
-try:
-  from PIL import Image as pil_image
-except ImportError:
-  pil_image = None
 try:
   from scipy import linalg
   import scipy.ndimage as ndi
 except ImportError:
   linalg = None
   ndi = None
-# pylint: enable=g-import-not-at-top
+
+
+try:
+  from PIL import Image as pil_image
+except ImportError:
+  pil_image = None
 
 if pil_image is not None:
   _PIL_INTERPOLATION_METHODS = {
@@ -64,6 +63,7 @@ if pil_image is not None:
     _PIL_INTERPOLATION_METHODS['lanczos'] = pil_image.LANCZOS
 
 
+@tf_export('keras.preprocessing.image.random_rotation')
 def random_rotation(x,
                     rg,
                     row_axis=1,
@@ -88,7 +88,7 @@ def random_rotation(x,
   Returns:
       Rotated Numpy image tensor.
   """
-  theta = np.pi / 180 * np.random.uniform(-rg, rg)
+  theta = np.deg2rad(np.random.uniform(-rg, rg))
   rotation_matrix = np.array([[np.cos(theta), -np.sin(theta), 0],
                               [np.sin(theta), np.cos(theta), 0], [0, 0, 1]])
 
@@ -98,6 +98,7 @@ def random_rotation(x,
   return x
 
 
+@tf_export('keras.preprocessing.image.random_shift')
 def random_shift(x,
                  wrg,
                  hrg,
@@ -134,6 +135,7 @@ def random_shift(x,
   return x
 
 
+@tf_export('keras.preprocessing.image.random_shear')
 def random_shear(x,
                  intensity,
                  row_axis=1,
@@ -145,7 +147,7 @@ def random_shear(x,
 
   Arguments:
       x: Input tensor. Must be 3D.
-      intensity: Transformation intensity.
+      intensity: Transformation intensity in degrees.
       row_axis: Index of axis for rows in the input tensor.
       col_axis: Index of axis for columns in the input tensor.
       channel_axis: Index of axis for channels in the input tensor.
@@ -158,7 +160,7 @@ def random_shear(x,
   Returns:
       Sheared Numpy image tensor.
   """
-  shear = np.random.uniform(-intensity, intensity)
+  shear = np.deg2rad(np.random.uniform(-intensity, intensity))
   shear_matrix = np.array([[1, -np.sin(shear), 0], [0, np.cos(shear), 0],
                            [0, 0, 1]])
 
@@ -168,6 +170,7 @@ def random_shear(x,
   return x
 
 
+@tf_export('keras.preprocessing.image.random_zoom')
 def random_zoom(x,
                 zoom_range,
                 row_axis=1,
@@ -188,8 +191,10 @@ def random_zoom(x,
           (one of `{'constant', 'nearest', 'reflect', 'wrap'}`).
       cval: Value used for points outside the boundaries
           of the input if `mode='constant'`.
+
   Returns:
       Zoomed Numpy image tensor.
+
   Raises:
       ValueError: if `zoom_range` isn't a tuple.
   """
@@ -209,6 +214,7 @@ def random_zoom(x,
   return x
 
 
+@tf_export('keras.preprocessing.image.random_channel_shift')
 def random_channel_shift(x, intensity, channel_axis=0):
   x = np.rollaxis(x, channel_axis, 0)
   min_x, max_x = np.min(x), np.max(x)
@@ -230,6 +236,7 @@ def transform_matrix_offset_center(matrix, x, y):
   return transform_matrix
 
 
+@tf_export('keras.preprocessing.image.apply_transform')
 def apply_transform(x,
                     transform_matrix,
                     channel_axis=0,
@@ -267,6 +274,7 @@ def apply_transform(x,
   return x
 
 
+@tf_export('keras.preprocessing.image.flip_axis')
 def flip_axis(x, axis):
   x = np.asarray(x).swapaxes(axis, 0)
   x = x[::-1, ...]
@@ -274,6 +282,7 @@ def flip_axis(x, axis):
   return x
 
 
+@tf_export('keras.preprocessing.image.array_to_img')
 def array_to_img(x, data_format=None, scale=True):
   """Converts a 3D Numpy array to a PIL Image instance.
 
@@ -324,6 +333,7 @@ def array_to_img(x, data_format=None, scale=True):
     raise ValueError('Unsupported channel number: ', x.shape[2])
 
 
+@tf_export('keras.preprocessing.image.img_to_array')
 def img_to_array(img, data_format=None):
   """Converts a PIL Image instance to a Numpy array.
 
@@ -358,6 +368,7 @@ def img_to_array(img, data_format=None):
   return x
 
 
+@tf_export('keras.preprocessing.image.load_img')
 def load_img(path, grayscale=False, target_size=None, interpolation='nearest'):
   """Loads an image into PIL format.
 
@@ -366,7 +377,7 @@ def load_img(path, grayscale=False, target_size=None, interpolation='nearest'):
       grayscale: Boolean, whether to load the image as grayscale.
       target_size: Either `None` (default to original size)
           or tuple of ints `(img_height, img_width)`.
-     interpolation: Interpolation method used to resample the image if the
+      interpolation: Interpolation method used to resample the image if the
           target size is different from that of the loaded image.
           Supported methods are "nearest", "bilinear", and "bicubic".
           If PIL version 1.1.3 or newer is installed, "lanczos" is also
@@ -394,11 +405,9 @@ def load_img(path, grayscale=False, target_size=None, interpolation='nearest'):
     width_height_tuple = (target_size[1], target_size[0])
     if img.size != width_height_tuple:
       if interpolation not in _PIL_INTERPOLATION_METHODS:
-        raise ValueError(
-            'Invalid interpolation method {} specified. Supported '
-            'methods are {}'.format(
-                interpolation,
-                ', '.join(_PIL_INTERPOLATION_METHODS.keys())))
+        raise ValueError('Invalid interpolation method {} specified. Supported '
+                         'methods are {}'.format(interpolation, ', '.join(
+                             _PIL_INTERPOLATION_METHODS.keys())))
       resample = _PIL_INTERPOLATION_METHODS[interpolation]
       img = img.resize(width_height_tuple, resample)
   return img
@@ -407,11 +416,13 @@ def load_img(path, grayscale=False, target_size=None, interpolation='nearest'):
 def list_pictures(directory, ext='jpg|jpeg|bmp|png|ppm'):
   return [
       os.path.join(root, f)
-      for root, _, files in os.walk(directory) for f in files
+      for root, _, files in os.walk(directory)
+      for f in files
       if re.match(r'([\w]+\.(?:' + ext + '))', f)
   ]
 
 
+@tf_export('keras.preprocessing.image.ImageDataGenerator')
 class ImageDataGenerator(object):
   """Generate minibatches of image data with real-time data augmentation.
 
@@ -423,9 +434,9 @@ class ImageDataGenerator(object):
       zca_whitening: apply ZCA whitening.
       zca_epsilon: epsilon for ZCA whitening. Default is 1e-6.
       rotation_range: degrees (0 to 180).
-      width_shift_range: fraction of total width.
-      height_shift_range: fraction of total height.
-      shear_range: shear intensity (shear angle in radians).
+      width_shift_range: fraction of total width, if < 1, or pixels if >= 1.
+      height_shift_range: fraction of total height, if < 1, or pixels if >= 1.
+      shear_range: shear intensity (shear angle in degrees).
       zoom_range: amount of zoom. if scalar z, zoom will be randomly picked
           in the range [1-z, 1+z]. A sequence of two can be passed instead
           to select this range.
@@ -433,6 +444,12 @@ class ImageDataGenerator(object):
       fill_mode: points outside the boundaries are filled according to the
           given mode ('constant', 'nearest', 'reflect' or 'wrap'). Default
           is 'nearest'.
+          Points outside the boundaries of the input are filled according to the
+            given mode:
+              'constant': kkkkkkkk|abcd|kkkkkkkk (cval=k)
+              'nearest':  aaaaaaaa|abcd|dddddddd
+              'reflect':  abcddcba|abcd|dcbaabcd
+              'wrap':  abcdabcd|abcd|abcdabcd
       cval: value used for points outside the boundaries when fill_mode is
           'constant'. Default is 0.
       horizontal_flip: whether to randomly flip images horizontally.
@@ -522,6 +539,32 @@ class ImageDataGenerator(object):
       raise ValueError('`zoom_range` should be a float or '
                        'a tuple or list of two floats. '
                        'Received arg: ', zoom_range)
+    if zca_whitening:
+      if not featurewise_center:
+        self.featurewise_center = True
+        logging.warning('This ImageDataGenerator specifies '
+                        '`zca_whitening`, which overrides '
+                        'setting of `featurewise_center`.')
+      if featurewise_std_normalization:
+        self.featurewise_std_normalization = False
+        logging.warning('This ImageDataGenerator specifies '
+                        '`zca_whitening` '
+                        'which overrides setting of'
+                        '`featurewise_std_normalization`.')
+    if featurewise_std_normalization:
+      if not featurewise_center:
+        self.featurewise_center = True
+        logging.warning('This ImageDataGenerator specifies '
+                        '`featurewise_std_normalization`, '
+                        'which overrides setting of '
+                        '`featurewise_center`.')
+    if samplewise_std_normalization:
+      if not samplewise_center:
+        self.samplewise_center = True
+        logging.warning('This ImageDataGenerator specifies '
+                        '`samplewise_std_normalization`, '
+                        'which overrides setting of '
+                        '`samplewise_center`.')
 
   def flow(self,
            x,
@@ -556,7 +599,8 @@ class ImageDataGenerator(object):
                           save_to_dir=None,
                           save_prefix='',
                           save_format='png',
-                          follow_links=False):
+                          follow_links=False,
+                          interpolation='nearest'):
     return DirectoryIterator(
         directory,
         self,
@@ -571,7 +615,8 @@ class ImageDataGenerator(object):
         save_to_dir=save_to_dir,
         save_prefix=save_prefix,
         save_format=save_format,
-        follow_links=follow_links)
+        follow_links=follow_links,
+        interpolation=interpolation)
 
   def standardize(self, x):
     """Apply the normalization configuration to a batch of inputs.
@@ -589,22 +634,22 @@ class ImageDataGenerator(object):
     if self.samplewise_center:
       x -= np.mean(x, keepdims=True)
     if self.samplewise_std_normalization:
-      x /= np.std(x, keepdims=True) + 1e-7
+      x /= (np.std(x, keepdims=True) + K.epsilon())
 
     if self.featurewise_center:
       if self.mean is not None:
         x -= self.mean
       else:
         logging.warning('This ImageDataGenerator specifies '
-                        '`featurewise_center`, but it hasn\'t'
+                        '`featurewise_center`, but it hasn\'t '
                         'been fit on any training data. Fit it '
                         'first by calling `.fit(numpy_data)`.')
     if self.featurewise_std_normalization:
       if self.std is not None:
-        x /= (self.std + 1e-7)
+        x /= (self.std + K.epsilon())
       else:
         logging.warning('This ImageDataGenerator specifies '
-                        '`featurewise_std_normalization`, but it hasn\'t'
+                        '`featurewise_std_normalization`, but it hasn\'t '
                         'been fit on any training data. Fit it '
                         'first by calling `.fit(numpy_data)`.')
     if self.zca_whitening:
@@ -614,7 +659,7 @@ class ImageDataGenerator(object):
         x = np.reshape(whitex, x.shape)
       else:
         logging.warning('This ImageDataGenerator specifies '
-                        '`zca_whitening`, but it hasn\'t'
+                        '`zca_whitening`, but it hasn\'t '
                         'been fit on any training data. Fit it '
                         'first by calling `.fit(numpy_data)`.')
     return x
@@ -634,7 +679,6 @@ class ImageDataGenerator(object):
     """
     if ndi is None:
       raise ImportError('Scipy is required for image transformations.')
-
     # x is a single image, so it doesn't have image number at index 0
     img_row_axis = self.row_axis - 1
     img_col_axis = self.col_axis - 1
@@ -646,25 +690,27 @@ class ImageDataGenerator(object):
     # use composition of homographies
     # to generate final transform that needs to be applied
     if self.rotation_range:
-      theta = np.pi / 180 * np.random.uniform(-self.rotation_range,
-                                              self.rotation_range)
+      theta = np.deg2rad(
+          np.random.uniform(-self.rotation_range, self.rotation_range))
     else:
       theta = 0
 
     if self.height_shift_range:
-      tx = np.random.uniform(-self.height_shift_range,
-                             self.height_shift_range) * x.shape[img_row_axis]
+      tx = np.random.uniform(-self.height_shift_range, self.height_shift_range)
+      if self.height_shift_range < 1:
+        tx *= x.shape[img_row_axis]
     else:
       tx = 0
 
     if self.width_shift_range:
-      ty = np.random.uniform(-self.width_shift_range,
-                             self.width_shift_range) * x.shape[img_col_axis]
+      ty = np.random.uniform(-self.width_shift_range, self.width_shift_range)
+      if self.width_shift_range < 1:
+        ty *= x.shape[img_col_axis]
     else:
       ty = 0
 
     if self.shear_range:
-      shear = np.random.uniform(-self.shear_range, self.shear_range)
+      shear = np.deg2rad(np.random.uniform(-self.shear_range, self.shear_range))
     else:
       shear = 0
 
@@ -742,7 +788,7 @@ class ImageDataGenerator(object):
     if x.ndim != 4:
       raise ValueError('Input to `.fit()` should have rank 4. '
                        'Got array with shape: ' + str(x.shape))
-    if x.shape[self.channel_axis] not in {3, 4}:
+    if x.shape[self.channel_axis] not in {1, 3, 4}:
       logging.warning(
           'Expected input to be images (as Numpy array) '
           'following the data format convention "' + self.data_format + '" '
@@ -782,12 +828,15 @@ class ImageDataGenerator(object):
         raise ImportError('Scipy is required for zca_whitening.')
 
       flat_x = np.reshape(x, (x.shape[0], x.shape[1] * x.shape[2] * x.shape[3]))
-      sigma = np.dot(flat_x.T, flat_x) / flat_x.shape[0]
-      u, s, _ = linalg.svd(sigma)
-      self.principal_components = np.dot(
-          np.dot(u, np.diag(1. / np.sqrt(s + self.zca_epsilon))), u.T)
+      num_examples = flat_x.shape[0]
+      _, s, vt = linalg.svd(flat_x / np.sqrt(num_examples))
+      s_expand = np.hstack(
+          (s, np.zeros(vt.shape[0] - num_examples, dtype=flat_x.dtype)))
+      self.principal_components = (
+          vt.T / np.sqrt(s_expand**2 + self.zca_epsilon)).dot(vt)
 
 
+@tf_export('keras.preprocessing.image.Iterator')
 class Iterator(Sequence):
   """Base class for image data iterators.
 
@@ -795,10 +844,10 @@ class Iterator(Sequence):
   method.
 
   Arguments:
-    n: Integer, total number of samples in the dataset to loop over.
-    batch_size: Integer, size of a batch.
-    shuffle: Boolean, whether to shuffle the data between epochs.
-    seed: Random seeding for data shuffling.
+      n: Integer, total number of samples in the dataset to loop over.
+      batch_size: Integer, size of a batch.
+      shuffle: Boolean, whether to shuffle the data between epochs.
+      seed: Random seeding for data shuffling.
   """
 
   def __init__(self, n, batch_size, shuffle, seed):
@@ -821,20 +870,18 @@ class Iterator(Sequence):
     if idx >= len(self):
       raise ValueError('Asked to retrieve element {idx}, '
                        'but the Sequence '
-                       'has length {length}'.format(idx=idx,
-                                                    length=len(self)))
+                       'has length {length}'.format(idx=idx, length=len(self)))
     if self.seed is not None:
       np.random.seed(self.seed + self.total_batches_seen)
     self.total_batches_seen += 1
     if self.index_array is None:
       self._set_index_array()
-    index_array = self.index_array[self.batch_size * idx:self.batch_size *
-                                   (idx + 1)]
+    index_array = self.index_array[self.batch_size * idx:self.batch_size * (
+        idx + 1)]
     return self._get_batches_of_transformed_samples(index_array)
 
   def __len__(self):
-    length = int(np.ceil(self.n / float(self.batch_size)))
-    return np.maximum(length, 0)
+    return (self.n + self.batch_size - 1) // self.batch_size  # round up
 
   def on_epoch_end(self):
     self._set_index_array()
@@ -872,12 +919,14 @@ class Iterator(Sequence):
 
     Arguments:
         index_array: array of sample indices to include in batch.
+
     Returns:
         A batch of transformed samples.
     """
     raise NotImplementedError
 
 
+@tf_export('keras.preprocessing.image.NumpyArrayIterator')
 class NumpyArrayIterator(Iterator):
   """Iterator yielding data from a Numpy array.
 
@@ -947,8 +996,8 @@ class NumpyArrayIterator(Iterator):
                                              seed)
 
   def _get_batches_of_transformed_samples(self, index_array):
-    batch_x = np.zeros(tuple([len(index_array)] + list(self.x.shape)[1:]),
-                       dtype=K.floatx())
+    batch_x = np.zeros(
+        tuple([len(index_array)] + list(self.x.shape)[1:]), dtype=K.floatx())
     for i, j in enumerate(index_array):
       x = self.x[j]
       x = self.image_data_generator.random_transform(x.astype(K.floatx()))
@@ -958,7 +1007,9 @@ class NumpyArrayIterator(Iterator):
       for i, j in enumerate(index_array):
         img = array_to_img(batch_x[i], self.data_format, scale=True)
         fname = '{prefix}_{index}_{hash}.{format}'.format(
-            prefix=self.save_prefix, index=j, hash=np.random.randint(1e4),
+            prefix=self.save_prefix,
+            index=j,
+            hash=np.random.randint(1e4),
             format=self.save_format)
         img.save(os.path.join(self.save_to_dir, fname))
     if self.y is None:
@@ -983,10 +1034,11 @@ class NumpyArrayIterator(Iterator):
 
 def _count_valid_files_in_directory(directory, white_list_formats,
                                     follow_links):
-  """Count files with extension in `white_list_formats` in a directory.
+  """Count files with extension in `white_list_formats` contained in directory.
 
   Arguments:
-      directory: absolute path to the directory containing files to be counted
+      directory: absolute path to the directory
+          containing files to be counted
       white_list_formats: set of strings containing allowed extensions for
           the files to be counted.
       follow_links: boolean.
@@ -1002,7 +1054,7 @@ def _count_valid_files_in_directory(directory, white_list_formats,
 
   samples = 0
   for _, _, files in _recursive_list(directory):
-    for fname in sorted(files):
+    for fname in files:
       is_valid = False
       for extension in white_list_formats:
         if fname.lower().endswith('.' + extension):
@@ -1042,7 +1094,7 @@ def _list_valid_filenames_in_directory(directory, white_list_formats,
   subdir = os.path.basename(directory)
   basedir = os.path.dirname(directory)
   for root, _, files in _recursive_list(directory):
-    for fname in files:
+    for fname in sorted(files):
       is_valid = False
       for extension in white_list_formats:
         if fname.lower().endswith('.' + extension):
@@ -1056,6 +1108,7 @@ def _list_valid_filenames_in_directory(directory, white_list_formats,
   return classes, filenames
 
 
+@tf_export('keras.preprocessing.image.DirectoryIterator')
 class DirectoryIterator(Iterator):
   """Iterator capable of reading images from a directory on disk.
 
@@ -1091,6 +1144,12 @@ class DirectoryIterator(Iterator):
           images (if `save_to_dir` is set).
       save_format: Format to use for saving sample images
           (if `save_to_dir` is set).
+      interpolation: Interpolation method used to resample the image if the
+          target size is different from that of the loaded image.
+          Supported methods are "nearest", "bilinear", and "bicubic".
+          If PIL version 1.1.3 or newer is installed, "lanczos" is also
+          supported. If PIL version 3.4.0 or newer is installed, "box" and
+          "hamming" are also supported. By default, "nearest" is used.
   """
 
   def __init__(self,
@@ -1107,7 +1166,8 @@ class DirectoryIterator(Iterator):
                save_to_dir=None,
                save_prefix='',
                save_format='png',
-               follow_links=False):
+               follow_links=False,
+               interpolation='nearest'):
     if data_format is None:
       data_format = K.image_data_format()
     self.directory = directory
@@ -1138,6 +1198,7 @@ class DirectoryIterator(Iterator):
     self.save_to_dir = save_to_dir
     self.save_prefix = save_prefix
     self.save_format = save_format
+    self.interpolation = interpolation
 
     white_list_formats = {'png', 'jpg', 'jpeg', 'bmp', 'ppm'}
 
@@ -1158,8 +1219,8 @@ class DirectoryIterator(Iterator):
         white_list_formats=white_list_formats,
         follow_links=follow_links)
     self.samples = sum(
-        pool.map(function_partial, (os.path.join(directory, subdir)
-                                    for subdir in classes)))
+        pool.map(function_partial,
+                 (os.path.join(directory, subdir) for subdir in classes)))
 
     print('Found %d images belonging to %d classes.' % (self.samples,
                                                         self.num_classes))
@@ -1172,8 +1233,9 @@ class DirectoryIterator(Iterator):
     i = 0
     for dirpath in (os.path.join(directory, subdir) for subdir in classes):
       results.append(
-          pool.apply_async(_list_valid_filenames_in_directory, (
-              dirpath, white_list_formats, self.class_indices, follow_links)))
+          pool.apply_async(
+              _list_valid_filenames_in_directory,
+              (dirpath, white_list_formats, self.class_indices, follow_links)))
     for res in results:
       classes, filenames = res.get()
       self.classes[i:i + len(classes)] = classes
@@ -1190,9 +1252,11 @@ class DirectoryIterator(Iterator):
     # build batch of image data
     for i, j in enumerate(index_array):
       fname = self.filenames[j]
-      img = load_img(os.path.join(self.directory, fname),
-                     grayscale=grayscale,
-                     target_size=self.target_size)
+      img = load_img(
+          os.path.join(self.directory, fname),
+          grayscale=grayscale,
+          target_size=self.target_size,
+          interpolation=self.interpolation)
       x = img_to_array(img, data_format=self.data_format)
       x = self.image_data_generator.random_transform(x)
       x = self.image_data_generator.standardize(x)
@@ -1202,7 +1266,9 @@ class DirectoryIterator(Iterator):
       for i, j in enumerate(index_array):
         img = array_to_img(batch_x[i], self.data_format, scale=True)
         fname = '{prefix}_{index}_{hash}.{format}'.format(
-            prefix=self.save_prefix, index=j, hash=np.random.randint(1e7),
+            prefix=self.save_prefix,
+            index=j,
+            hash=np.random.randint(1e7),
             format=self.save_format)
         img.save(os.path.join(self.save_to_dir, fname))
     # build batch of labels
@@ -1231,4 +1297,3 @@ class DirectoryIterator(Iterator):
     # The transformation of images is not under thread lock
     # so it can be done in parallel
     return self._get_batches_of_transformed_samples(index_array)
-
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py b/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
index 642f4f2face5bd56cdc1ed7b4f6d6621c6d1b210..a423d96d3d8578df347b7ee36fb53dfd335e0d65 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Preprocessing utilities for sequence data.
+"""Utilities for preprocessing sequence data.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -22,8 +22,10 @@ import random
 
 import numpy as np
 from six.moves import range  # pylint: disable=redefined-builtin
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.preprocessing.sequence.pad_sequences')
 def pad_sequences(sequences,
                   maxlen=None,
                   dtype='int32',
@@ -104,6 +106,7 @@ def pad_sequences(sequences,
   return x
 
 
+@tf_export('keras.preprocessing.sequence.make_sampling_table')
 def make_sampling_table(size, sampling_factor=1e-5):
   """Generates a word rank-based probabilistic sampling table.
 
@@ -129,7 +132,7 @@ def make_sampling_table(size, sampling_factor=1e-5):
       is the probability that a word of rank i should be sampled.
   """
   gamma = 0.577
-  rank = np.array(list(range(size)))
+  rank = np.arange(size)
   rank[0] = 1
   inv_fq = rank * (np.log(rank) + gamma) + 0.5 - 1. / (12. * rank)
   f = sampling_factor * inv_fq
@@ -137,6 +140,7 @@ def make_sampling_table(size, sampling_factor=1e-5):
   return np.minimum(1., f / np.sqrt(f))
 
 
+@tf_export('keras.preprocessing.sequence.skipgrams')
 def skipgrams(sequence,
               vocabulary_size,
               window_size=4,
@@ -170,7 +174,7 @@ def skipgrams(sequence,
           if True labels will be categorical eg. [[1,0],[0,1],[0,1] .. ]
       sampling_table: 1D array of size `vocabulary_size` where the entry i
           encodes the probability to sample a word of rank i.
-      seed: Random seed.
+      seed: random seed.
 
   Returns:
       couples, labels: where `couples` are int pairs and
@@ -224,3 +228,22 @@ def skipgrams(sequence,
     random.shuffle(labels)
 
   return couples, labels
+
+
+def _remove_long_seq(maxlen, seq, label):
+  """Removes sequences that exceed the maximum length.
+
+  Arguments:
+      maxlen: int, maximum length
+      seq: list of lists where each sublist is a sequence
+      label: list where each element is an integer
+
+  Returns:
+      new_seq, new_label: shortened lists for `seq` and `label`.
+  """
+  new_seq, new_label = [], []
+  for x, y in zip(seq, label):
+    if len(x) < maxlen:
+      new_seq.append(x)
+      new_label.append(y)
+  return new_seq, new_label
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/text.py b/tensorflow/python/keras/_impl/keras/preprocessing/text.py
index 47e5aa064fd806196fc9457fc90bc1a26e55ebf3..1e3828ccf1e3bf9c443691e1c1da5697bedb4653 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/text.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/text.py
@@ -13,8 +13,6 @@
 # limitations under the License.
 # ==============================================================================
 """Utilities for text input preprocessing.
-
-May benefit from a fast Cython rewrite.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -29,12 +27,17 @@ import numpy as np
 from six.moves import range  # pylint: disable=redefined-builtin
 from six.moves import zip  # pylint: disable=redefined-builtin
 
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
+
+
 if sys.version_info < (3,):
   maketrans = string.maketrans
 else:
   maketrans = str.maketrans
 
 
+@tf_export('keras.preprocessing.text.text_to_word_sequence')
 def text_to_word_sequence(text,
                           filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                           lower=True,
@@ -63,11 +66,27 @@ def text_to_word_sequence(text,
   return [i for i in seq if i]
 
 
+@tf_export('keras.preprocessing.text.one_hot')
 def one_hot(text,
             n,
             filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
             lower=True,
             split=' '):
+  """One-hot encodes a text into a list of word indexes of size n.
+
+  This is a wrapper to the `hashing_trick` function using `hash` as the
+  hashing function; unicity of word to index mapping non-guaranteed.
+
+  Arguments:
+      text: Input text (string).
+      n: Dimension of the hashing space.
+      filters: Sequence of characters to filter out.
+      lower: Whether to convert the input to lowercase.
+      split: Sentence split marker (string).
+
+  Returns:
+      A list of integer word indices (unicity non-guaranteed).
+  """
   return hashing_trick(
       text, n, hash_function=hash, filters=filters, lower=lower, split=split)
 
@@ -99,6 +118,10 @@ def hashing_trick(text,
 
   Two or more words may be assigned to the same index, due to possible
   collisions by the hashing function.
+  The
+  probability
+  of a collision is in relation to the dimension of the hashing space and
+  the number of distinct objects.
   """
   if hash_function is None:
     hash_function = hash
@@ -109,6 +132,7 @@ def hashing_trick(text,
   return [(hash_function(w) % (n - 1) + 1) for w in seq]
 
 
+@tf_export('keras.preprocessing.text.Tokenizer')
 class Tokenizer(object):
   """Text tokenization utility class.
 
@@ -127,6 +151,8 @@ class Tokenizer(object):
       lower: boolean. Whether to convert the texts to lowercase.
       split: character or string to use for token splitting.
       char_level: if True, every character will be treated as a token.
+      oov_token: if given, it will be added to word_index and used to
+          replace out-of-vocabulary words during text_to_sequence calls
 
   By default, all punctuation is removed, turning the texts into
   space-separated sequences of words
@@ -141,7 +167,17 @@ class Tokenizer(object):
                filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                lower=True,
                split=' ',
-               char_level=False):
+               char_level=False,
+               oov_token=None,
+               **kwargs):
+    # Legacy support
+    if 'nb_words' in kwargs:
+      logging.warning('The `nb_words` argument in `Tokenizer` '
+                      'has been renamed `num_words`.')
+      num_words = kwargs.pop('nb_words')
+    if kwargs:
+      raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
+
     self.word_counts = OrderedDict()
     self.word_docs = {}
     self.filters = filters
@@ -150,6 +186,7 @@ class Tokenizer(object):
     self.num_words = num_words
     self.document_count = 0
     self.char_level = char_level
+    self.oov_token = oov_token
 
   def fit_on_texts(self, texts):
     """Updates internal vocabulary based on a list of texts.
@@ -181,7 +218,13 @@ class Tokenizer(object):
     sorted_voc = [wc[0] for wc in wcounts]
     # note that index 0 is reserved, never assigned to an existing word
     self.word_index = dict(
-        list(zip(sorted_voc, list(range(1, len(sorted_voc) + 1)))))
+        list(zip(sorted_voc, list(range(1,
+                                        len(sorted_voc) + 1)))))
+
+    if self.oov_token is not None:
+      i = self.word_index.get(self.oov_token)
+      if i is None:
+        self.word_index[self.oov_token] = len(self.word_index) + 1
 
     self.index_docs = {}
     for w, c in list(self.word_docs.items()):
@@ -248,6 +291,10 @@ class Tokenizer(object):
             continue
           else:
             vect.append(i)
+        elif self.oov_token is not None:
+          i = self.word_index.get(self.oov_token)
+          if i is not None:
+            vect.append(i)
       yield vect
 
   def texts_to_matrix(self, texts, mode='binary'):
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/text_test.py b/tensorflow/python/keras/_impl/keras/preprocessing/text_test.py
index 17ab48ba3fc9dfd553f8f425579c0a37ff42eb84..a934e331c4a14d9bd170258b6b6183e6a15bb561 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/text_test.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/text_test.py
@@ -76,6 +76,22 @@ class TestText(test.TestCase):
     self.assertLessEqual(np.max(encoded), 4)
     self.assertGreaterEqual(np.min(encoded), 1)
 
+  def test_tokenizer_oov_flag(self):
+    x_train = ['This text has only known words']
+    x_test = ['This text has some unknown words']  # 2 OOVs: some, unknown
+
+    # Defalut, without OOV flag
+    tokenizer = keras.preprocessing.text.Tokenizer()
+    tokenizer.fit_on_texts(x_train)
+    x_test_seq = tokenizer.texts_to_sequences(x_test)
+    assert len(x_test_seq[0]) == 4  # discards 2 OOVs
+
+    # With OOV feature
+    tokenizer = keras.preprocessing.text.Tokenizer(oov_token='<unk>')
+    tokenizer.fit_on_texts(x_train)
+    x_test_seq = tokenizer.texts_to_sequences(x_test)
+    assert len(x_test_seq[0]) == 6  # OOVs marked in place
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/regularizers.py b/tensorflow/python/keras/_impl/keras/regularizers.py
index 161ff9bf5bf12b3521fe444f1d68bd62b6e8c71d..2c30844647acdb78d1ca31d052ec7e5ecc6dcc2a 100644
--- a/tensorflow/python/keras/_impl/keras/regularizers.py
+++ b/tensorflow/python/keras/_impl/keras/regularizers.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Keras built-in regularizers.
+"""Built-in regularizers.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -23,8 +23,10 @@ import six
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.regularizers.Regularizer')
 class Regularizer(object):
   """Regularizer base class.
   """
@@ -37,6 +39,7 @@ class Regularizer(object):
     return cls(**config)
 
 
+@tf_export('keras.regularizers.L1L2')
 class L1L2(Regularizer):
   """Regularizer for L1 and L2 regularization.
 
@@ -64,22 +67,27 @@ class L1L2(Regularizer):
 # Aliases.
 
 
+@tf_export('keras.regularizers.l1')
 def l1(l=0.01):
   return L1L2(l1=l)
 
 
+@tf_export('keras.regularizers.l2')
 def l2(l=0.01):
   return L1L2(l2=l)
 
 
+@tf_export('keras.regularizers.l1_l2')
 def l1_l2(l1=0.01, l2=0.01):  # pylint: disable=redefined-outer-name
   return L1L2(l1=l1, l2=l2)
 
 
+@tf_export('keras.regularizers.serialize')
 def serialize(regularizer):
   return serialize_keras_object(regularizer)
 
 
+@tf_export('keras.regularizers.deserialize')
 def deserialize(config, custom_objects=None):
   return deserialize_keras_object(
       config,
@@ -88,6 +96,7 @@ def deserialize(config, custom_objects=None):
       printable_module_name='regularizer')
 
 
+@tf_export('keras.regularizers.get')
 def get(identifier):
   if identifier is None:
     return None
diff --git a/tensorflow/python/keras/_impl/keras/testing_utils.py b/tensorflow/python/keras/_impl/keras/testing_utils.py
index f204a5df3e654eebd5c0165f383f2c418961f5ba..b889e311b37d48732641205a90ca83af34ea4489 100644
--- a/tensorflow/python/keras/_impl/keras/testing_utils.py
+++ b/tensorflow/python/keras/_impl/keras/testing_utils.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.util import tf_inspect
 
@@ -109,7 +110,8 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
   # check shape inference
   model = keras.models.Model(x, y)
   expected_output_shape = tuple(
-      layer._compute_output_shape(input_shape).as_list())  # pylint: disable=protected-access
+      layer.compute_output_shape(
+          tensor_shape.TensorShape(input_shape)).as_list())
   actual_output = model.predict(input_data)
   actual_output_shape = actual_output.shape
   for expected_dim, actual_dim in zip(expected_output_shape,
diff --git a/tensorflow/python/keras/_impl/keras/utils/data_utils.py b/tensorflow/python/keras/_impl/keras/utils/data_utils.py
index 1f2e9ac44076582c7aea083203b13fddaa597474..e87c8f48ef0967d561db1ab841a669d783f9b1ec 100644
--- a/tensorflow/python/keras/_impl/keras/utils/data_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/data_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,12 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=g-import-not-at-top
 """Utilities for file download and caching."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 from abc import abstractmethod
+from contextlib import closing
 import hashlib
 import multiprocessing
 from multiprocessing.pool import ThreadPool
@@ -28,6 +30,7 @@ import sys
 import tarfile
 import threading
 import time
+import traceback
 import zipfile
 
 import numpy as np
@@ -37,11 +40,13 @@ from six.moves.urllib.error import URLError
 from six.moves.urllib.request import urlopen
 
 from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
+from tensorflow.python.util.tf_export import tf_export
+
 
 try:
-  import queue  # pylint:disable=g-import-not-at-top
+  import queue
 except ImportError:
-  import Queue as queue  # pylint:disable=g-import-not-at-top
+  import Queue as queue
 
 
 if sys.version_info[0] == 2:
@@ -85,7 +90,7 @@ if sys.version_info[0] == 2:
       for chunk in chunk_read(response, reporthook=reporthook):
         fd.write(chunk)
 else:
-  from six.moves.urllib.request import urlretrieve  # pylint: disable=g-import-not-at-top
+  from six.moves.urllib.request import urlretrieve
 
 
 def _extract_archive(file_path, path='.', archive_format='auto'):
@@ -134,6 +139,7 @@ def _extract_archive(file_path, path='.', archive_format='auto'):
   return False
 
 
+@tf_export('keras.utils.get_file')
 def get_file(fname,
              origin,
              untar=False,
@@ -185,7 +191,7 @@ def get_file(fname,
       Path to the downloaded file
   """
   if cache_dir is None:
-    cache_dir = os.path.expanduser(os.path.join('~', '.keras'))
+    cache_dir = os.path.join(os.path.expanduser('~'), '.keras')
   if md5_hash is not None and file_hash is None:
     file_hash = md5_hash
     hash_algorithm = 'md5'
@@ -314,36 +320,47 @@ def validate_file(fpath, file_hash, algorithm='auto', chunk_size=65535):
     return False
 
 
+@tf_export('keras.utils.Sequence')
 class Sequence(object):
   """Base object for fitting to a sequence of data, such as a dataset.
 
   Every `Sequence` must implements the `__getitem__` and the `__len__` methods.
   If you want to modify your dataset between epochs you may implement
-  `on_epoch_end`. The method `__getitem__` should return a complete batch.
+  `on_epoch_end`.
+  The method `__getitem__` should return a complete batch.
+
+  # Notes
 
-  Notes:
   `Sequence` are a safer way to do multiprocessing. This structure guarantees
-   that the network will only train once on each sample per epoch which is not
-   the case with generators.
+  that the network will only train once
+   on each sample per epoch which is not the case with generators.
+
   Examples:
+
   ```python
       from skimage.io import imread
       from skimage.transform import resize
       import numpy as np
       import math
+
       # Here, `x_set` is list of path to the images
       # and `y_set` are the associated classes.
+
       class CIFAR10Sequence(Sequence):
+
           def __init__(self, x_set, y_set, batch_size):
               self.x, self.y = x_set, y_set
               self.batch_size = batch_size
+
           def __len__(self):
               return math.ceil(len(self.x) / self.batch_size)
+
           def __getitem__(self, idx):
               batch_x = self.x[idx * self.batch_size:(idx + 1) *
-                        self.batch_size]
+              self.batch_size]
               batch_y = self.y[idx * self.batch_size:(idx + 1) *
-                        self.batch_size]
+              self.batch_size]
+
               return np.array([
                   resize(imread(file_name), (200, 200))
                      for file_name in batch_x]), np.array(batch_y)
@@ -371,7 +388,6 @@ class Sequence(object):
     """
     raise NotImplementedError
 
-  @abstractmethod
   def on_epoch_end(self):
     """Method called at the end of every epoch.
     """
@@ -401,6 +417,7 @@ def get_index(uid, i):
   return _SHARED_SEQUENCES[uid][i]
 
 
+@tf_export('keras.utils.SequenceEnqueuer')
 class SequenceEnqueuer(object):
   """Base class to enqueue inputs.
 
@@ -469,25 +486,36 @@ class OrderedEnqueuer(SequenceEnqueuer):
 
   Arguments:
       sequence: A `keras.utils.data_utils.Sequence` object.
-      use_multiprocessing: Use multiprocessing if True, otherwise threading
-      shuffle: Whether to shuffle the data at the beginning of each epoch
+      use_multiprocessing: use multiprocessing if True, otherwise threading
+      shuffle: whether to shuffle the data at the beginning of each epoch
   """
 
   def __init__(self, sequence, use_multiprocessing=False, shuffle=False):
     self.sequence = sequence
+    self.use_multiprocessing = use_multiprocessing
 
-    # Doing Multiprocessing.Value += x is not process-safe.
     global _SEQUENCE_COUNTER
     if _SEQUENCE_COUNTER is None:
-      _SEQUENCE_COUNTER = multiprocessing.Value('i', 0)
+      try:
+        _SEQUENCE_COUNTER = multiprocessing.Value('i', 0)
+      except OSError:
+        # In this case the OS does not allow us to use
+        # multiprocessing. We resort to an int
+        # for enqueuer indexing.
+        _SEQUENCE_COUNTER = 0
+
+    if isinstance(_SEQUENCE_COUNTER, int):
+      self.uid = _SEQUENCE_COUNTER
+      _SEQUENCE_COUNTER += 1
+    else:
+      # Doing Multiprocessing.Value += x is not process-safe.
+      with _SEQUENCE_COUNTER.get_lock():
+        self.uid = _SEQUENCE_COUNTER.value
+        _SEQUENCE_COUNTER.value += 1
 
-    with _SEQUENCE_COUNTER.get_lock():
-      self.uid = _SEQUENCE_COUNTER.value
-      _SEQUENCE_COUNTER.value += 1
-    self.use_multiprocessing = use_multiprocessing
     self.shuffle = shuffle
     self.workers = 0
-    self.executor = None
+    self.executor_fn = None
     self.queue = None
     self.run_thread = None
     self.stop_signal = None
@@ -504,9 +532,9 @@ class OrderedEnqueuer(SequenceEnqueuer):
             (when full, workers could block on `put()`)
     """
     if self.use_multiprocessing:
-      self.executor = multiprocessing.Pool(workers)
+      self.executor_fn = lambda: multiprocessing.Pool(workers)
     else:
-      self.executor = ThreadPool(workers)
+      self.executor_fn = lambda: ThreadPool(workers)
     self.workers = workers
     self.queue = queue.Queue(max_queue_size)
     self.stop_signal = threading.Event()
@@ -522,24 +550,26 @@ class OrderedEnqueuer(SequenceEnqueuer):
         return
 
   def _run(self):
-    """Function to submit request to the executor & queue `Future` objects."""
+    """Submits request to the executor and queue the `Future` objects."""
     sequence = list(range(len(self.sequence)))
     self._send_sequence()  # Share the initial sequence
     while True:
       if self.shuffle:
         random.shuffle(sequence)
-      for i in sequence:
-        if self.stop_signal.is_set():
-          return
-        self.queue.put(
-            self.executor.apply_async(get_index, (self.uid, i)), block=True)
 
-      # Done with the current epoch, waiting for the final batches
-      self._wait_queue()
+      with closing(self.executor_fn()) as executor:
+        for i in sequence:
+          if self.stop_signal.is_set():
+            return
+          self.queue.put(
+              executor.apply_async(get_index, (self.uid, i)), block=True)
 
-      if self.stop_signal.is_set():
-        # We're done
-        return
+        # Done with the current epoch, waiting for the final batches
+        self._wait_queue()
+
+        if self.stop_signal.is_set():
+          # We're done
+          return
 
       # Call the internal on epoch end.
       self.sequence.on_epoch_end()
@@ -551,8 +581,9 @@ class OrderedEnqueuer(SequenceEnqueuer):
     Skip the data if it is `None`.
 
     Yields:
-        Tuples (inputs, targets)
-        or (inputs, targets, sample_weights)
+        The next element in the queue, i.e. a tuple
+        `(inputs, targets)` or
+        `(inputs, targets, sample_weights)`.
     """
     try:
       while self.is_running():
@@ -560,20 +591,14 @@ class OrderedEnqueuer(SequenceEnqueuer):
         self.queue.task_done()
         if inputs is not None:
           yield inputs
-    except Exception as e:
+    except Exception as e:  # pylint: disable=broad-except
       self.stop()
-      raise StopIteration(e)
+      six.raise_from(StopIteration(e), e)
 
   def _send_sequence(self):
     """Send current Sequence to all workers."""
-    _SHARED_SEQUENCES[
-        self.uid] = self.sequence  # For new processes that may spawn
-
-    self._close_pool()
-    if self.use_multiprocessing:
-      self.executor = multiprocessing.Pool(self.workers)
-    else:
-      self.executor = ThreadPool(self.workers)
+    # For new processes that may spawn
+    _SHARED_SEQUENCES[self.uid] = self.sequence
 
   def stop(self, timeout=None):
     """Stops running threads and wait for them to exit, if necessary.
@@ -588,15 +613,11 @@ class OrderedEnqueuer(SequenceEnqueuer):
       self.queue.queue.clear()
       self.queue.unfinished_tasks = 0
       self.queue.not_full.notify()
-    self._close_pool()
     self.run_thread.join(timeout)
     _SHARED_SEQUENCES[self.uid] = None
 
-  def _close_pool(self):
-    self.executor.close()
-    self.executor.join()
-
 
+@tf_export('keras.utils.GeneratorEnqueuer')
 class GeneratorEnqueuer(SequenceEnqueuer):
   """Builds a queue out of a data generator.
 
@@ -620,41 +641,88 @@ class GeneratorEnqueuer(SequenceEnqueuer):
                seed=None):
     self.wait_time = wait_time
     self._generator = generator
-    self._use_multiprocessing = use_multiprocessing
+    if os.name is 'nt' and use_multiprocessing is True:
+      # On Windows, avoid **SYSTEMATIC** error in `multiprocessing`:
+      # `TypeError: can't pickle generator objects`
+      # => Suggest multithreading instead of multiprocessing on Windows
+      raise ValueError('Using a generator with `use_multiprocessing=True`'
+                       ' is not supported on Windows (no marshalling of'
+                       ' generators across process boundaries). Instead,'
+                       ' use single thread/process or multithreading.')
+    else:
+      self._use_multiprocessing = use_multiprocessing
     self._threads = []
     self._stop_event = None
+    self._manager = None
     self.queue = None
     self.seed = seed
 
-  def start(self, workers=1, max_queue_size=10):
-    """Kicks off threads which add data from the generator into the queue.
-
-    Arguments:
-        workers: number of worker threads
-        max_queue_size: queue size
-            (when full, threads could block on `put()`)
-    """
-
-    def data_generator_task():
+  def _data_generator_task(self):
+    if self._use_multiprocessing is False:
+      while not self._stop_event.is_set():
+        with self.genlock:
+          try:
+            if (self.queue is not None and
+                self.queue.qsize() < self.max_queue_size):
+              # On all OSes, avoid **SYSTEMATIC** error
+              # in multithreading mode:
+              # `ValueError: generator already executing`
+              # => Serialize calls to
+              # infinite iterator/generator's next() function
+              generator_output = next(self._generator)
+              self.queue.put((True, generator_output))
+            else:
+              time.sleep(self.wait_time)
+          except StopIteration:
+            break
+          except Exception as e:  # pylint: disable=broad-except
+            # Can't pickle tracebacks.
+            # As a compromise, print the traceback and pickle None instead.
+            if not hasattr(e, '__traceback__'):
+              setattr(e, '__traceback__', sys.exc_info()[2])
+            self.queue.put((False, e))
+            self._stop_event.set()
+            break
+    else:
       while not self._stop_event.is_set():
         try:
-          if self._use_multiprocessing or self.queue.qsize() < max_queue_size:
+          if (self.queue is not None and
+              self.queue.qsize() < self.max_queue_size):
             generator_output = next(self._generator)
-            self.queue.put(generator_output)
+            self.queue.put((True, generator_output))
           else:
             time.sleep(self.wait_time)
         except StopIteration:
           break
-        except Exception:
+        except Exception as e:  # pylint: disable=broad-except
+          # Can't pickle tracebacks.
+          # As a compromise, print the traceback and pickle None instead.
+          traceback.print_exc()
+          setattr(e, '__traceback__', None)
+          self.queue.put((False, e))
           self._stop_event.set()
-          raise
+          break
+
+  def start(self, workers=1, max_queue_size=10):
+    """Kicks off threads which add data from the generator into the queue.
 
+    Arguments:
+        workers: number of worker threads
+        max_queue_size: queue size
+            (when full, threads could block on `put()`)
+    """
     try:
+      self.max_queue_size = max_queue_size
       if self._use_multiprocessing:
-        self.queue = multiprocessing.Queue(maxsize=max_queue_size)
+        self._manager = multiprocessing.Manager()
+        self.queue = self._manager.Queue(maxsize=max_queue_size)
         self._stop_event = multiprocessing.Event()
       else:
-        self.queue = queue.Queue()
+        # On all OSes, avoid **SYSTEMATIC** error in multithreading mode:
+        # `ValueError: generator already executing`
+        # => Serialize calls to infinite iterator/generator's next() function
+        self.genlock = threading.Lock()
+        self.queue = queue.Queue(maxsize=max_queue_size)
         self._stop_event = threading.Event()
 
       for _ in range(workers):
@@ -662,12 +730,12 @@ class GeneratorEnqueuer(SequenceEnqueuer):
           # Reset random seed else all children processes
           # share the same seed
           np.random.seed(self.seed)
-          thread = multiprocessing.Process(target=data_generator_task)
+          thread = multiprocessing.Process(target=self._data_generator_task)
           thread.daemon = True
           if self.seed is not None:
             self.seed += 1
         else:
-          thread = threading.Thread(target=data_generator_task)
+          thread = threading.Thread(target=self._data_generator_task)
         self._threads.append(thread)
         thread.start()
     except:
@@ -689,15 +757,18 @@ class GeneratorEnqueuer(SequenceEnqueuer):
       self._stop_event.set()
 
     for thread in self._threads:
-      if thread.is_alive():
-        if self._use_multiprocessing:
+      if self._use_multiprocessing:
+        if thread.is_alive():
           thread.terminate()
-        else:
-          thread.join(timeout)
+      else:
+        # The thread.is_alive() test is subject to a race condition:
+        # the thread could terminate right after the test and before the
+        # join, rendering this test meaningless -> Call thread.join()
+        # always, which is ok no matter what the status of the thread.
+        thread.join(timeout)
 
-    if self._use_multiprocessing:
-      if self.queue is not None:
-        self.queue.close()
+    if self._manager:
+      self._manager.shutdown()
 
     self._threads = []
     self._stop_event = None
@@ -709,16 +780,28 @@ class GeneratorEnqueuer(SequenceEnqueuer):
     Skip the data if it is `None`.
 
     Yields:
-        Data arrays.
+        The next element in the queue, i.e. a tuple
+        `(inputs, targets)` or
+        `(inputs, targets, sample_weights)`.
     """
     while self.is_running():
       if not self.queue.empty():
-        inputs = self.queue.get()
-        if inputs is not None:
-          yield inputs
+        success, value = self.queue.get()
+        # Rethrow any exceptions found in the queue
+        if not success:
+          six.reraise(value.__class__, value, value.__traceback__)
+        # Yield regular values
+        if value is not None:
+          yield value
       else:
         all_finished = all([not thread.is_alive() for thread in self._threads])
         if all_finished and self.queue.empty():
           raise StopIteration()
         else:
           time.sleep(self.wait_time)
+
+    # Make sure to rethrow the first exception in the queue, if any
+    while not self.queue.empty():
+      success, value = self.queue.get()
+      if not success:
+        six.reraise(value.__class__, value, value.__traceback__)
diff --git a/tensorflow/python/keras/_impl/keras/utils/data_utils_test.py b/tensorflow/python/keras/_impl/keras/utils/data_utils_test.py
index 47c5b4cff06c083f8ebd699b5cb9da85b74116e0..677e98e871d4a148b13c1aa22696917ed8dc90f9 100644
--- a/tensorflow/python/keras/_impl/keras/utils/data_utils_test.py
+++ b/tensorflow/python/keras/_impl/keras/utils/data_utils_test.py
@@ -186,7 +186,7 @@ class TestEnqueuers(test.TestCase):
         use_multiprocessing=False)
     enqueuer.start(3, 10)
     gen_output = enqueuer.get()
-    with self.assertRaises(StopIteration):
+    with self.assertRaises(IndexError):
       next(gen_output)
 
   @unittest.skipIf(
@@ -198,7 +198,7 @@ class TestEnqueuers(test.TestCase):
         use_multiprocessing=True)
     enqueuer.start(3, 10)
     gen_output = enqueuer.get()
-    with self.assertRaises(StopIteration):
+    with self.assertRaises(IndexError):
       next(gen_output)
 
   def test_ordered_enqueuer_threads(self):
@@ -299,4 +299,13 @@ class TestEnqueuers(test.TestCase):
 
 
 if __name__ == '__main__':
+  # Bazel sets these environment variables to very long paths.
+  # Tempfile uses them to create long paths, and in turn multiprocessing
+  # library tries to create sockets named after paths. Delete whatever bazel
+  # writes to these to avoid tests failing due to socket addresses being too
+  # long.
+  for var in ('TMPDIR', 'TMP', 'TEMP'):
+    if var in os.environ:
+      del os.environ[var]
+
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/utils/generic_utils.py b/tensorflow/python/keras/_impl/keras/utils/generic_utils.py
index 025e5d30a597c560804293b12b0bd063764c87fe..462d600bf827768b0f2e6265aebdaad48e70fcd9 100644
--- a/tensorflow/python/keras/_impl/keras/utils/generic_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/generic_utils.py
@@ -17,6 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import binascii
+import codecs
 import marshal
 import os
 import sys
@@ -28,10 +30,12 @@ import six
 
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.tf_export import tf_export
 
 _GLOBAL_CUSTOM_OBJECTS = {}
 
 
+@tf_export('keras.utils.CustomObjectScope')
 class CustomObjectScope(object):
   """Provides a scope that changes to `_GLOBAL_CUSTOM_OBJECTS` cannot escape.
 
@@ -67,6 +71,7 @@ class CustomObjectScope(object):
     _GLOBAL_CUSTOM_OBJECTS.update(self.backup)
 
 
+@tf_export('keras.utils.custom_object_scope')
 def custom_object_scope(*args):
   """Provides a scope that changes to `_GLOBAL_CUSTOM_OBJECTS` cannot escape.
 
@@ -97,6 +102,7 @@ def custom_object_scope(*args):
   return CustomObjectScope(*args)
 
 
+@tf_export('keras.utils.get_custom_objects')
 def get_custom_objects():
   """Retrieves a live reference to the global dictionary of custom objects.
 
@@ -117,6 +123,7 @@ def get_custom_objects():
   return _GLOBAL_CUSTOM_OBJECTS
 
 
+@tf_export('keras.utils.serialize_keras_object')
 def serialize_keras_object(instance):
   _, instance = tf_decorator.unwrap(instance)
   if instance is None:
@@ -132,6 +139,7 @@ def serialize_keras_object(instance):
     raise ValueError('Cannot serialize', instance)
 
 
+@tf_export('keras.utils.deserialize_keras_object')
 def deserialize_keras_object(identifier,
                              module_objects=None,
                              custom_objects=None,
@@ -197,10 +205,11 @@ def func_dump(func):
       A tuple `(code, defaults, closure)`.
   """
   if os.name == 'nt':
-    code = marshal.dumps(
-        func.__code__).replace(b'\\', b'/').decode('raw_unicode_escape')
+    raw_code = marshal.dumps(func.__code__).replace(b'\\', b'/')
+    code = codecs.encode(raw_code, 'base64').decode('ascii')
   else:
-    code = marshal.dumps(func.__code__).decode('raw_unicode_escape')
+    raw_code = marshal.dumps(func.__code__)
+    code = codecs.encode(raw_code, 'base64').decode('ascii')
   defaults = func.__defaults__
   if func.__closure__:
     closure = tuple(c.cell_contents for c in func.__closure__)
@@ -225,7 +234,33 @@ def func_load(code, defaults=None, closure=None, globs=None):
     code, defaults, closure = code
     if isinstance(defaults, list):
       defaults = tuple(defaults)
-  code = marshal.loads(code.encode('raw_unicode_escape'))
+
+  def ensure_value_to_cell(value):
+    """Ensures that a value is converted to a python cell object.
+
+    Arguments:
+        value: Any value that needs to be casted to the cell type
+
+    Returns:
+        A value wrapped as a cell object (see function "func_load")
+    """
+    def dummy_fn():
+      # pylint: disable=pointless-statement
+      value  # just access it so it gets captured in .__closure__
+
+    cell_value = dummy_fn.__closure__[0]
+    if not isinstance(value, type(cell_value)):
+      return cell_value
+    else:
+      return value
+
+  if closure is not None:
+    closure = tuple(ensure_value_to_cell(_) for _ in closure)
+  try:
+    raw_code = codecs.decode(code.encode('ascii'), 'base64')
+  except (UnicodeEncodeError, binascii.Error):
+    raw_code = code.encode('raw_unicode_escape')
+  code = marshal.loads(raw_code)
   if globs is None:
     globs = globals()
   return python_types.FunctionType(
@@ -250,60 +285,79 @@ def has_arg(fn, name, accept_all=False):
   return name in arg_spec.args
 
 
+@tf_export('keras.utils.Progbar')
 class Progbar(object):
   """Displays a progress bar.
 
   Arguments:
       target: Total number of steps expected, None if unknown.
+      width: Progress bar width on screen.
+      verbose: Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose)
+      stateful_metrics: Iterable of string names of metrics that
+          should *not* be averaged over time. Metrics in this list
+          will be displayed as-is. All others will be averaged
+          by the progbar before display.
       interval: Minimum visual progress update interval (in seconds).
   """
 
-  def __init__(self, target, width=30, verbose=1, interval=0.05):
-    self.width = width
-    if target is None:
-      target = -1
+  def __init__(self, target, width=30, verbose=1, interval=0.05,
+               stateful_metrics=None):
     self.target = target
-    self.sum_values = {}
-    self.unique_values = []
-    self.start = time.time()
-    self.last_update = 0
-    self.interval = interval
-    self.total_width = 0
-    self.seen_so_far = 0
+    self.width = width
     self.verbose = verbose
+    self.interval = interval
+    if stateful_metrics:
+      self.stateful_metrics = set(stateful_metrics)
+    else:
+      self.stateful_metrics = set()
+
     self._dynamic_display = ((hasattr(sys.stdout, 'isatty') and
                               sys.stdout.isatty()) or
-                             'ipykernel' in sys.modules)
-
-  def update(self, current, values=None, force=False):
+                             'ipykernel' in sys.modules or
+                             'posix' in sys.modules)
+    self._total_width = 0
+    self._seen_so_far = 0
+    # We use a dict + list to avoid garbage collection
+    # issues found in OrderedDict
+    self._values = {}
+    self._values_order = []
+    self._start = time.time()
+    self._last_update = 0
+
+  def update(self, current, values=None):
     """Updates the progress bar.
 
     Arguments:
         current: Index of current step.
-        values: List of tuples (name, value_for_last_step).
-            The progress bar will display averages for these values.
-        force: Whether to force visual progress update.
+        values: List of tuples:
+            `(name, value_for_last_step)`.
+            If `name` is in `stateful_metrics`,
+            `value_for_last_step` will be displayed as-is.
+            Else, an average of the metric over time will be displayed.
     """
     values = values or []
     for k, v in values:
-      if k not in self.sum_values:
-        self.sum_values[k] = [
-            v * (current - self.seen_so_far), current - self.seen_so_far
-        ]
-        self.unique_values.append(k)
+      if k not in self._values_order:
+        self._values_order.append(k)
+      if k not in self.stateful_metrics:
+        if k not in self._values:
+          self._values[k] = [v * (current - self._seen_so_far),
+                             current - self._seen_so_far]
+        else:
+          self._values[k][0] += v * (current - self._seen_so_far)
+          self._values[k][1] += (current - self._seen_so_far)
       else:
-        self.sum_values[k][0] += v * (current - self.seen_so_far)
-        self.sum_values[k][1] += (current - self.seen_so_far)
-    self.seen_so_far = current
+        self._values[k] = v
+    self._seen_so_far = current
 
     now = time.time()
-    info = ' - %.0fs' % (now - self.start)
+    info = ' - %.0fs' % (now - self._start)
     if self.verbose == 1:
-      if (not force and (now - self.last_update) < self.interval and
-          current < self.target):
+      if (now - self._last_update < self.interval and
+          self.target is not None and current < self.target):
         return
 
-      prev_total_width = self.total_width
+      prev_total_width = self._total_width
       if self._dynamic_display:
         sys.stdout.write('\b' * prev_total_width)
         sys.stdout.write('\r')
@@ -324,22 +378,21 @@ class Progbar(object):
             bar += '='
         bar += ('.' * (self.width - prog_width))
         bar += ']'
-        sys.stdout.write(bar)
-        self.total_width = len(bar)
       else:
         bar = '%7d/Unknown' % current
 
-      self.total_width = len(bar)
+      self._total_width = len(bar)
       sys.stdout.write(bar)
 
       if current:
-        time_per_unit = (now - self.start) / current
+        time_per_unit = (now - self._start) / current
       else:
         time_per_unit = 0
       if self.target is not None and current < self.target:
         eta = time_per_unit * (self.target - current)
         if eta > 3600:
-          eta_format = '%d:%02d:%02d' % (eta // 3600, (eta % 3600) // 60,
+          eta_format = '%d:%02d:%02d' % (eta // 3600,
+                                         (eta % 3600) // 60,
                                          eta % 60)
         elif eta > 60:
           eta_format = '%d:%02d' % (eta // 60, eta % 60)
@@ -355,35 +408,32 @@ class Progbar(object):
         else:
           info += ' %.0fus/step' % (time_per_unit * 1e6)
 
-      for k in self.unique_values:
+      for k in self._values_order:
         info += ' - %s:' % k
-        if isinstance(self.sum_values[k], list):
-          avg = np.mean(self.sum_values[k][0] / max(1, self.sum_values[k][1]))
+        if isinstance(self._values[k], list):
+          avg = np.mean(self._values[k][0] / max(1, self._values[k][1]))
           if abs(avg) > 1e-3:
             info += ' %.4f' % avg
           else:
             info += ' %.4e' % avg
         else:
-          info += ' %s' % self.sum_values[k]
+          info += ' %s' % self._values[k]
+
+      self._total_width += len(info)
+      if prev_total_width > self._total_width:
+        info += (' ' * (prev_total_width - self._total_width))
 
-      self.total_width += len(info)
-      if prev_total_width > self.total_width:
-        info += (' ' * (prev_total_width - self.total_width))
       if self.target is not None and current >= self.target:
         info += '\n'
 
       sys.stdout.write(info)
       sys.stdout.flush()
 
-      if current >= self.target:
-        sys.stdout.write('\n')
-
     elif self.verbose == 2:
       if self.target is None or current >= self.target:
-        for k in self.unique_values:
+        for k in self._values_order:
           info += ' - %s:' % k
-          avg = np.mean(
-              self.sum_values[k][0] / max(1, self.sum_values[k][1]))
+          avg = np.mean(self._values[k][0] / max(1, self._values[k][1]))
           if avg > 1e-3:
             info += ' %.4f' % avg
           else:
@@ -393,7 +443,69 @@ class Progbar(object):
         sys.stdout.write(info)
         sys.stdout.flush()
 
-    self.last_update = now
+    self._last_update = now
 
   def add(self, n, values=None):
-    self.update(self.seen_so_far + n, values)
+    self.update(self._seen_so_far + n, values)
+
+
+def make_batches(size, batch_size):
+  """Returns a list of batch indices (tuples of indices).
+
+  Arguments:
+      size: Integer, total size of the data to slice into batches.
+      batch_size: Integer, batch size.
+
+  Returns:
+      A list of tuples of array indices.
+  """
+  num_batches = int(np.ceil(size / float(batch_size)))
+  return [(i * batch_size, min(size, (i + 1) * batch_size))
+          for i in range(0, num_batches)]
+
+
+def slice_arrays(arrays, start=None, stop=None):
+  """Slice an array or list of arrays.
+
+  This takes an array-like, or a list of
+  array-likes, and outputs:
+      - arrays[start:stop] if `arrays` is an array-like
+      - [x[start:stop] for x in arrays] if `arrays` is a list
+
+  Can also work on list/array of indices: `slice_arrays(x, indices)`
+
+  Arguments:
+      arrays: Single array or list of arrays.
+      start: can be an integer index (start index)
+          or a list/array of indices
+      stop: integer (stop index); should be None if
+          `start` was a list.
+
+  Returns:
+      A slice of the array(s).
+
+  Raises:
+      ValueError: If the value of start is a list and stop is not None.
+  """
+  if arrays is None:
+    return [None]
+  if isinstance(start, list) and stop is not None:
+    raise ValueError('The stop argument has to be None if the value of start is'
+                     'a list.')
+  elif isinstance(arrays, list):
+    if hasattr(start, '__len__'):
+      # hdf5 datasets only support list objects as indices
+      if hasattr(start, 'shape'):
+        start = start.tolist()
+      return [None if x is None else x[start] for x in arrays]
+    else:
+      return [None if x is None else x[start:stop] for x in arrays]
+  else:
+    if hasattr(start, '__len__'):
+      if hasattr(start, 'shape'):
+        start = start.tolist()
+      return arrays[start]
+    elif hasattr(start, '__getitem__'):
+      return arrays[start:stop]
+    else:
+      return [None]
diff --git a/tensorflow/python/keras/_impl/keras/utils/io_utils.py b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
index 2003e19a0a759d84ec1785d9bab6bde560ba030a..bbf1d2a3d9c3948271780ec3fad3316b4e6d53c3 100644
--- a/tensorflow/python/keras/_impl/keras/utils/io_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=g-import-not-at-top
 """Utilities related to disk I/O."""
 from __future__ import absolute_import
 from __future__ import division
@@ -21,14 +22,16 @@ from collections import defaultdict
 import sys
 
 import numpy as np
+from tensorflow.python.util.tf_export import tf_export
 
 
 try:
-  import h5py  # pylint:disable=g-import-not-at-top
+  import h5py
 except ImportError:
   h5py = None
 
 
+@tf_export('keras.utils.HDF5Matrix')
 class HDF5Matrix(object):
   """Representation of HDF5 dataset to be used instead of a Numpy array.
 
@@ -63,11 +66,11 @@ class HDF5Matrix(object):
                         'HDF5 and h5py installed.')
 
     if datapath not in list(self.refs.keys()):
-      self._f = h5py.File(datapath)
-      self.refs[datapath] = self._f
+      f = h5py.File(datapath)
+      self.refs[datapath] = f
     else:
-      self._f = self.refs[datapath]
-    self.data = self._f[dataset]
+      f = self.refs[datapath]
+    self.data = f[dataset]
     self.start = start
     if end is None:
       self.end = self.data.shape[0]
@@ -78,9 +81,6 @@ class HDF5Matrix(object):
   def __len__(self):
     return self.end - self.start
 
-  def  __del__(self):
-    self._f.close()
-
   def __getitem__(self, key):
     if isinstance(key, slice):
       start, stop = key.start, key.stop
diff --git a/tensorflow/python/keras/_impl/keras/utils/layer_utils.py b/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
index 053c0600a33d6ab0151ecc8879cbc68fe731dbe5..4c8009dfd80e1aec457fa03687f2840c7fe4607b 100644
--- a/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utilities related to Keras layers.
+# pylint: disable=protected-access
+"""Utilities related to layer/model functionality.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -22,16 +23,17 @@ import numpy as np
 
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.conv_utils import convert_kernel
+from tensorflow.python.util.tf_export import tf_export
 
 
 def count_params(weights):
   """Count the total number of scalars composing the weights.
 
   Arguments:
-    weights: An iterable containing the weights on which to compute params
+      weights: An iterable containing the weights on which to compute params
 
   Returns:
-    The total number of scalars composing the weights
+      The total number of scalars composing the weights
   """
   return int(np.sum([K.count_params(p) for p in set(weights)]))
 
@@ -46,24 +48,30 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
           terminal window sizes).
       positions: Relative or absolute positions of log elements in each line.
           If not provided, defaults to `[.33, .55, .67, 1.]`.
-      print_fn: Print function to use (defaults to `print`).
+      print_fn: Print function to use.
           It will be called on each line of the summary.
           You can set it to a custom function
           in order to capture the string summary.
+          It defaults to `print` (prints to stdout).
   """
   if print_fn is None:
     print_fn = print
 
   if model.__class__.__name__ == 'Sequential':
     sequential_like = True
+  elif not model._is_graph_network:
+    # We treat subclassed models as a simple sequence of layers, for logging
+    # purposes.
+    sequential_like = True
   else:
     sequential_like = True
-    nodes_by_depth = model._nodes_by_depth.values()  # pylint: disable=protected-access
+    nodes_by_depth = model._nodes_by_depth.values()
     nodes = []
     for v in nodes_by_depth:
       if (len(v) > 1) or (len(v) == 1 and len(v[0].inbound_layers) > 1):
-        # If the model has multiple nodes or if the nodes have
-        # multiple inbound_layers, the model is no longer sequential.
+        # if the model has multiple nodes
+        # or if the nodes have multiple inbound_layers
+        # the model is no longer sequential
         sequential_like = False
         break
       nodes += v
@@ -71,7 +79,7 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
       # search for shared layers
       for layer in model.layers:
         flag = False
-        for node in layer.inbound_nodes:
+        for node in layer._inbound_nodes:
           if node in nodes:
             if flag:
               sequential_like = False
@@ -96,7 +104,7 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
     # header names for the different log elements
     to_display = ['Layer (type)', 'Output Shape', 'Param #', 'Connected to']
     relevant_nodes = []
-    for v in model._nodes_by_depth.values():  # pylint: disable=protected-access
+    for v in model._nodes_by_depth.values():
       relevant_nodes += v
 
   def print_row(fields, positions):
@@ -114,17 +122,24 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
   print_fn('=' * line_length)
 
   def print_layer_summary(layer):
+    """Prints a summary for a single layer.
+
+    Arguments:
+        layer: target layer.
+    """
     try:
       output_shape = layer.output_shape
     except AttributeError:
       output_shape = 'multiple'
+    except RuntimeError:  # output_shape unknown in Eager mode.
+      output_shape = '?'
     name = layer.name
     cls_name = layer.__class__.__name__
     fields = [name + ' (' + cls_name + ')', output_shape, layer.count_params()]
     print_row(fields, positions)
 
   def print_layer_summary_with_connections(layer):
-    """Prints a summary for a single layer.
+    """Prints a summary for a single layer (including topological connections).
 
     Arguments:
         layer: target layer.
@@ -134,7 +149,7 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
     except AttributeError:
       output_shape = 'multiple'
     connections = []
-    for node in layer._inbound_nodes:  # pylint: disable=protected-access
+    for node in layer._inbound_nodes:
       if relevant_nodes and node not in relevant_nodes:
         # node is not part of the current network
         continue
@@ -142,8 +157,8 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
         inbound_layer = node.inbound_layers[i].name
         inbound_node_index = node.node_indices[i]
         inbound_tensor_index = node.tensor_indices[i]
-        connections.append(inbound_layer + '[' + str(inbound_node_index) + ']['
-                           + str(inbound_tensor_index) + ']')
+        connections.append(inbound_layer + '[' + str(inbound_node_index) +
+                           '][' + str(inbound_tensor_index) + ']')
 
     name = layer.name
     cls_name = layer.__class__.__name__
@@ -172,9 +187,9 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
     else:
       print_fn('_' * line_length)
 
-  model._check_trainable_weights_consistency()  # pylint: disable=protected-access
+  model._check_trainable_weights_consistency()
   if hasattr(model, '_collected_trainable_weights'):
-    trainable_count = count_params(model._collected_trainable_weights)  # pylint: disable=protected-access
+    trainable_count = count_params(model._collected_trainable_weights)
   else:
     trainable_count = count_params(model.trainable_weights)
 
@@ -187,6 +202,7 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
   print_fn('_' * line_length)
 
 
+@tf_export('keras.utils.convert_all_kernels_in_model')
 def convert_all_kernels_in_model(model):
   """Converts all convolution kernels in a model from Theano to TensorFlow.
 
diff --git a/tensorflow/python/keras/_impl/keras/utils/np_utils.py b/tensorflow/python/keras/_impl/keras/utils/np_utils.py
index 896016d4d8bb48192e32ab094f7b7a0e6799921c..a611be08aaed824ebb278b4b28ef52ea1872563b 100644
--- a/tensorflow/python/keras/_impl/keras/utils/np_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/np_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,8 +18,10 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('keras.utils.to_categorical')
 def to_categorical(y, num_classes=None):
   """Converts a class vector (integers) to binary class matrix.
 
@@ -35,7 +37,7 @@ def to_categorical(y, num_classes=None):
   """
   y = np.array(y, dtype='int')
   input_shape = y.shape
-  if input_shape and input_shape[-1] == 1:
+  if input_shape and input_shape[-1] == 1 and len(input_shape) > 1:
     input_shape = tuple(input_shape[:-1])
   y = y.ravel()
   if not num_classes:
@@ -48,6 +50,7 @@ def to_categorical(y, num_classes=None):
   return categorical
 
 
+@tf_export('keras.utils.normalize')
 def normalize(x, axis=-1, order=2):
   """Normalizes a Numpy array.
 
diff --git a/tensorflow/python/keras/_impl/keras/utils/np_utils_test.py b/tensorflow/python/keras/_impl/keras/utils/np_utils_test.py
index 9680c295cd31c40114726a919d4e327c07ddd240..1e974c2ef2aee3b6a83ad777673505f8c75b2b58 100644
--- a/tensorflow/python/keras/_impl/keras/utils/np_utils_test.py
+++ b/tensorflow/python/keras/_impl/keras/utils/np_utils_test.py
@@ -28,8 +28,9 @@ class TestNPUtils(test.TestCase):
 
   def test_to_categorical(self):
     num_classes = 5
-    shapes = [(3,), (4, 3), (5, 4, 3), (3, 1), (3, 2, 1)]
-    expected_shapes = [(3, num_classes),
+    shapes = [(1,), (3,), (4, 3), (5, 4, 3), (3, 1), (3, 2, 1)]
+    expected_shapes = [(1, num_classes),
+                       (3, num_classes),
                        (4, 3, num_classes),
                        (5, 4, 3, num_classes),
                        (3, num_classes)]
diff --git a/tensorflow/python/keras/_impl/keras/utils/training_utils.py b/tensorflow/python/keras/_impl/keras/utils/training_utils.py
index 8939c814cf3f9c6fa2f2af79e71919c6666e5561..ce7402e9d279278eaaf5aab58a3973eec6de8e99 100644
--- a/tensorflow/python/keras/_impl/keras/utils/training_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/training_utils.py
@@ -21,6 +21,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine.training import Model
 from tensorflow.python.ops import array_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _get_available_devices():
@@ -32,6 +33,7 @@ def _normalize_device_name(name):
   return name
 
 
+@tf_export('keras.utils.multi_gpu_model')
 def multi_gpu_model(model, gpus):
   """Replicates a model on different GPUs.
 
@@ -112,12 +114,22 @@ def multi_gpu_model(model, gpus):
   from tensorflow.python.keras._impl.keras.layers.core import Lambda
   from tensorflow.python.keras._impl.keras.layers.merge import concatenate
 
-  if gpus <= 1:
-    raise ValueError('For multi-gpu usage to be effective, '
-                     'call `multi_gpu_model` with `gpus >= 2`. '
-                     'Received: `gpus=%d`' % gpus)
-
-  target_devices = ['/cpu:0'] + ['/gpu:%d' % i for i in range(gpus)]
+  if isinstance(gpus, (list, tuple)):
+    if len(gpus) <= 1:
+      raise ValueError('For multi-gpu usage to be effective, '
+                       'call `multi_gpu_model` with `len(gpus) >= 2`. '
+                       'Received: `gpus=%s`' % gpus)
+    num_gpus = len(gpus)
+    target_gpu_ids = gpus
+  else:
+    if gpus <= 1:
+      raise ValueError('For multi-gpu usage to be effective, '
+                       'call `multi_gpu_model` with `gpus >= 2`. '
+                       'Received: `gpus=%d`' % gpus)
+    num_gpus = gpus
+    target_gpu_ids = range(num_gpus)
+
+  target_devices = ['/cpu:0'] + ['/gpu:%d' % i for i in target_gpu_ids]
   available_devices = _get_available_devices()
   available_devices = [
       _normalize_device_name(name) for name in available_devices
@@ -145,7 +157,7 @@ def multi_gpu_model(model, gpus):
     batch_size = shape[:1]
     input_shape = shape[1:]
     step = batch_size // parts
-    if i == gpus - 1:
+    if i == num_gpus - 1:
       size = batch_size - step * i
     else:
       size = step
@@ -160,9 +172,9 @@ def multi_gpu_model(model, gpus):
 
   # Place a copy of the model on each GPU,
   # each getting a slice of the inputs.
-  for i in range(gpus):
-    with ops.device('/gpu:%d' % i):
-      with ops.name_scope('replica_%d' % i):
+  for i, gpu_id in enumerate(target_gpu_ids):
+    with ops.device('/gpu:%d' % gpu_id):
+      with ops.name_scope('replica_%d' % gpu_id):
         inputs = []
         # Retrieve a slice of the input.
         for x in model.inputs:
@@ -172,8 +184,9 @@ def multi_gpu_model(model, gpus):
               output_shape=input_shape,
               arguments={
                   'i': i,
-                  'parts': gpus
-              })(x)
+                  'parts': num_gpus
+              })(
+                  x)
           inputs.append(slice_i)
 
         # Apply model on slice
@@ -189,6 +202,6 @@ def multi_gpu_model(model, gpus):
   # Merge outputs on CPU.
   with ops.device('/cpu:0'):
     merged = []
-    for outputs in all_outputs:
-      merged.append(concatenate(outputs, axis=0))
+    for name, outputs in zip(model.output_names, all_outputs):
+      merged.append(concatenate(outputs, axis=0, name=name))
     return Model(model.inputs, merged)
diff --git a/tensorflow/python/keras/_impl/keras/utils/training_utils_test.py b/tensorflow/python/keras/_impl/keras/utils/training_utils_test.py
index 51fbd041a4943b1837c5f725a06c0c08fb9cb216..12354c49ca72cddc0f395bcfcfabab18c1189227 100644
--- a/tensorflow/python/keras/_impl/keras/utils/training_utils_test.py
+++ b/tensorflow/python/keras/_impl/keras/utils/training_utils_test.py
@@ -33,6 +33,7 @@ class TestMultiGPUModel(test.TestCase):
     output_dim = 1
     hidden_dim = 10
     epochs = 2
+    target_gpu_id = [0, 2, 4]
 
     with self.test_session():
       model = keras.models.Sequential()
@@ -42,8 +43,12 @@ class TestMultiGPUModel(test.TestCase):
 
       x = np.random.random((num_samples, input_dim))
       y = np.random.random((num_samples, output_dim))
+
       parallel_model = keras.utils.multi_gpu_model(model, gpus=gpus)
+      parallel_model.compile(loss='mse', optimizer='rmsprop')
+      parallel_model.fit(x, y, epochs=epochs)
 
+      parallel_model = keras.utils.multi_gpu_model(model, gpus=target_gpu_id)
       parallel_model.compile(loss='mse', optimizer='rmsprop')
       parallel_model.fit(x, y, epochs=epochs)
 
@@ -56,6 +61,7 @@ class TestMultiGPUModel(test.TestCase):
     output_dim_b = 2
     hidden_dim = 10
     epochs = 2
+    target_gpu_id = [0, 2, 4]
 
     with self.test_session():
       input_a = keras.Input((input_dim_a,))
@@ -76,6 +82,10 @@ class TestMultiGPUModel(test.TestCase):
       parallel_model.compile(loss='mse', optimizer='rmsprop')
       parallel_model.fit([a_x, b_x], [a_y, b_y], epochs=epochs)
 
+      parallel_model = keras.utils.multi_gpu_model(model, gpus=target_gpu_id)
+      parallel_model.compile(loss='mse', optimizer='rmsprop')
+      parallel_model.fit([a_x, b_x], [a_y, b_y], epochs=epochs)
+
   def multi_gpu_test_invalid_devices(self):
     with self.test_session():
       input_shape = (1000, 10)
@@ -92,3 +102,16 @@ class TestMultiGPUModel(test.TestCase):
         parallel_model = keras.utils.multi_gpu_model(
             model, gpus=len(keras.backend._get_available_gpus()) + 1)
         parallel_model.fit(x, y, epochs=2)
+
+      with self.assertRaises(ValueError):
+        parallel_model = keras.utils.multi_gpu_model(
+            model, gpus=[0, 2, 4, 6, 8])
+        parallel_model.fit(x, y, epochs=2)
+
+      with self.assertRaises(ValueError):
+        parallel_model = keras.utils.multi_gpu_model(model, gpus=1)
+        parallel_model.fit(x, y, epochs=2)
+
+      with self.assertRaises(ValueError):
+        parallel_model = keras.utils.multi_gpu_model(model, gpus=[0])
+        parallel_model.fit(x, y, epochs=2)
diff --git a/tensorflow/python/keras/_impl/keras/utils/vis_utils.py b/tensorflow/python/keras/_impl/keras/utils/vis_utils.py
index d56c4484ce35d0c6af08d6199867b7845f367c88..45c1b92075c50956fee004409e98898411e83d27 100644
--- a/tensorflow/python/keras/_impl/keras/utils/vis_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/vis_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,30 +12,30 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=protected-access
+# pylint: disable=g-import-not-at-top
 """Utilities related to model visualization."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import os
-import sys
+from tensorflow.python.util.tf_export import tf_export
+
 
 try:
   # pydot-ng is a fork of pydot that is better maintained.
-  import pydot_ng as pydot  # pylint: disable=g-import-not-at-top
+  import pydot_ng as pydot
 except ImportError:
-  # Fall back on pydot if necessary.
-  # Silence a `print` statement that occurs in case of import error,
-  # by temporarily replacing sys.stdout.
-  _stdout = sys.stdout
-  sys.stdout = sys.stderr
+  # pydotplus is an improved version of pydot
   try:
-    import pydot  # pylint: disable=g-import-not-at-top
+    import pydotplus as pydot
   except ImportError:
-    pydot = None
-  finally:
-    # Restore sys.stdout.
-    sys.stdout = _stdout
+    # Fall back on pydot if necessary.
+    try:
+      import pydot
+    except ImportError:
+      pydot = None
 
 
 def _check_pydot():
@@ -65,8 +65,8 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
   Returns:
       A `pydot.Dot` instance representing the Keras model.
   """
-  from tensorflow.python.keras._impl.keras.layers.wrappers import Wrapper  # pylint: disable=g-import-not-at-top
-  from tensorflow.python.keras._impl.keras.models import Sequential  # pylint: disable=g-import-not-at-top
+  from tensorflow.python.keras._impl.keras.layers.wrappers import Wrapper
+  from tensorflow.python.keras._impl.keras.models import Sequential
 
   _check_pydot()
   dot = pydot.Dot()
@@ -118,9 +118,9 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
   # Connect nodes with edges.
   for layer in layers:
     layer_id = str(id(layer))
-    for i, node in enumerate(layer._inbound_nodes):  # pylint: disable=protected-access
+    for i, node in enumerate(layer._inbound_nodes):
       node_key = layer.name + '_ib-' + str(i)
-      if node_key in model._network_nodes:  # pylint: disable=protected-access
+      if node_key in model._container_nodes:
         for inbound_layer in node.inbound_layers:
           inbound_layer_id = str(id(inbound_layer))
           layer_id = str(id(layer))
@@ -128,6 +128,7 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
   return dot
 
 
+@tf_export('keras.utils.plot_model')
 def plot_model(model,
                to_file='model.png',
                show_shapes=False,
diff --git a/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py b/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py
index 31ef4773ad6481264aea09c72f955a5a6ef8a11d..2884dc84cc5d99511947e6f0f97b0bf8a505221f 100644
--- a/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py
+++ b/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""API wrapper allowing to use certain Keras models with the Scikit-Learn API.
+"""Wrapper for using the Scikit-Learn API with Keras models.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -24,8 +24,9 @@ import types
 import numpy as np
 
 from tensorflow.python.keras._impl.keras.models import Sequential
+from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
 from tensorflow.python.keras._impl.keras.utils.np_utils import to_categorical
-from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.tf_export import tf_export
 
 
 class BaseWrapper(object):
@@ -38,18 +39,18 @@ class BaseWrapper(object):
       build_fn: callable function or class instance
       **sk_params: model parameters & fitting parameters
 
-  The build_fn should construct, compile and return a Keras model, which
+  The `build_fn` should construct, compile and return a Keras model, which
   will then be used to fit/predict. One of the following
-  three values could be passed to build_fn:
+  three values could be passed to `build_fn`:
   1. A function
-  2. An instance of a class that implements the __call__ method
+  2. An instance of a class that implements the `__call__` method
   3. None. This means you implement a class that inherits from either
-  `KerasClassifier` or `KerasRegressor`. The __call__ method of the
-  present class will then be treated as the default build_fn.
+  `KerasClassifier` or `KerasRegressor`. The `__call__` method of the
+  present class will then be treated as the default `build_fn`.
 
   `sk_params` takes both model parameters and fitting parameters. Legal model
   parameters are the arguments of `build_fn`. Note that like all other
-  estimators in scikit-learn, 'build_fn' should provide default values for
+  estimators in scikit-learn, `build_fn` should provide default values for
   its arguments, so that you could create the estimator without passing any
   values to `sk_params`.
 
@@ -75,7 +76,7 @@ class BaseWrapper(object):
     self.check_params(sk_params)
 
   def check_params(self, params):
-    """Checks for user typos in "params".
+    """Checks for user typos in `params`.
 
     Arguments:
         params: dictionary; the parameters to be checked
@@ -95,13 +96,11 @@ class BaseWrapper(object):
     else:
       legal_params_fns.append(self.build_fn)
 
-    legal_params = []
-    for fn in legal_params_fns:
-      legal_params += tf_inspect.getargspec(fn)[0]
-    legal_params = set(legal_params)
-
     for params_name in params:
-      if params_name not in legal_params:
+      for fn in legal_params_fns:
+        if has_arg(fn, params_name):
+          break
+      else:
         if params_name != 'nb_epoch':
           raise ValueError('{} is not a legal parameter'.format(params_name))
 
@@ -136,10 +135,10 @@ class BaseWrapper(object):
 
     Arguments:
         x : array-like, shape `(n_samples, n_features)`
-            Training samples where n_samples in the number of samples
-            and n_features is the number of features.
+            Training samples where `n_samples` is the number of samples
+            and `n_features` is the number of features.
         y : array-like, shape `(n_samples,)` or `(n_samples, n_outputs)`
-            True labels for X.
+            True labels for `x`.
         **kwargs: dictionary arguments
             Legal arguments are the arguments of `Sequential.fit`
 
@@ -170,26 +169,26 @@ class BaseWrapper(object):
     return history
 
   def filter_sk_params(self, fn, override=None):
-    """Filters `sk_params` and return those in `fn`'s arguments.
+    """Filters `sk_params` and returns those in `fn`'s arguments.
 
     Arguments:
         fn : arbitrary function
-        override: dictionary, values to override sk_params
+        override: dictionary, values to override `sk_params`
 
     Returns:
-        res : dictionary dictionary containing variables
-            in both sk_params and fn's arguments.
+        res : dictionary containing variables
+            in both `sk_params` and `fn`'s arguments.
     """
     override = override or {}
     res = {}
-    fn_args = tf_inspect.getargspec(fn)[0]
     for name, value in self.sk_params.items():
-      if name in fn_args:
+      if has_arg(fn, name):
         res.update({name: value})
     res.update(override)
     return res
 
 
+@tf_export('keras.wrappers.scikit_learn.KerasClassifier')
 class KerasClassifier(BaseWrapper):
   """Implementation of the scikit-learn classifier API for Keras.
   """
@@ -199,10 +198,10 @@ class KerasClassifier(BaseWrapper):
 
     Arguments:
         x : array-like, shape `(n_samples, n_features)`
-            Training samples where n_samples in the number of samples
-            and n_features is the number of features.
+            Training samples where `n_samples` is the number of samples
+            and `n_features` is the number of features.
         y : array-like, shape `(n_samples,)` or `(n_samples, n_outputs)`
-            True labels for X.
+            True labels for `x`.
         **kwargs: dictionary arguments
             Legal arguments are the arguments of `Sequential.fit`
 
@@ -229,8 +228,8 @@ class KerasClassifier(BaseWrapper):
 
     Arguments:
         x: array-like, shape `(n_samples, n_features)`
-            Test samples where n_samples in the number of samples
-            and n_features is the number of features.
+            Test samples where `n_samples` is the number of samples
+            and `n_features` is the number of features.
         **kwargs: dictionary arguments
             Legal arguments are the arguments
             of `Sequential.predict_classes`.
@@ -248,8 +247,8 @@ class KerasClassifier(BaseWrapper):
 
     Arguments:
         x: array-like, shape `(n_samples, n_features)`
-            Test samples where n_samples in the number of samples
-            and n_features is the number of features.
+            Test samples where `n_samples` is the number of samples
+            and `n_features` is the number of features.
         **kwargs: dictionary arguments
             Legal arguments are the arguments
             of `Sequential.predict_classes`.
@@ -258,8 +257,8 @@ class KerasClassifier(BaseWrapper):
         proba: array-like, shape `(n_samples, n_outputs)`
             Class probability estimates.
             In the case of binary classification,
-            tp match the scikit-learn API,
-            will return an array of shape '(n_samples, 2)'
+            to match the scikit-learn API,
+            will return an array of shape `(n_samples, 2)`
             (instead of `(n_sample, 1)` as in Keras).
     """
     kwargs = self.filter_sk_params(Sequential.predict_proba, kwargs)
@@ -276,16 +275,16 @@ class KerasClassifier(BaseWrapper):
 
     Arguments:
         x: array-like, shape `(n_samples, n_features)`
-            Test samples where n_samples in the number of samples
-            and n_features is the number of features.
+            Test samples where `n_samples` is the number of samples
+            and `n_features` is the number of features.
         y: array-like, shape `(n_samples,)` or `(n_samples, n_outputs)`
-            True labels for x.
+            True labels for `x`.
         **kwargs: dictionary arguments
             Legal arguments are the arguments of `Sequential.evaluate`.
 
     Returns:
         score: float
-            Mean accuracy of predictions on X wrt. y.
+            Mean accuracy of predictions on `x` wrt. `y`.
 
     Raises:
         ValueError: If the underlying model isn't configured to
@@ -312,6 +311,7 @@ class KerasClassifier(BaseWrapper):
                      'the `model.compile()` method.')
 
 
+@tf_export('keras.wrappers.scikit_learn.KerasRegressor')
 class KerasRegressor(BaseWrapper):
   """Implementation of the scikit-learn regressor API for Keras.
   """
@@ -321,8 +321,8 @@ class KerasRegressor(BaseWrapper):
 
     Arguments:
         x: array-like, shape `(n_samples, n_features)`
-            Test samples where n_samples in the number of samples
-            and n_features is the number of features.
+            Test samples where `n_samples` is the number of samples
+            and `n_features` is the number of features.
         **kwargs: dictionary arguments
             Legal arguments are the arguments of `Sequential.predict`.
 
@@ -338,16 +338,16 @@ class KerasRegressor(BaseWrapper):
 
     Arguments:
         x: array-like, shape `(n_samples, n_features)`
-            Test samples where n_samples in the number of samples
-            and n_features is the number of features.
+            Test samples where `n_samples` is the number of samples
+            and `n_features` is the number of features.
         y: array-like, shape `(n_samples,)`
-            True labels for X.
+            True labels for `x`.
         **kwargs: dictionary arguments
             Legal arguments are the arguments of `Sequential.evaluate`.
 
     Returns:
         score: float
-            Mean accuracy of predictions on X wrt. y.
+            Mean accuracy of predictions on `x` wrt. `y`.
     """
     kwargs = self.filter_sk_params(Sequential.evaluate, kwargs)
     loss = self.model.evaluate(x, y, **kwargs)
diff --git a/tensorflow/python/keras/applications/__init__.py b/tensorflow/python/keras/applications/__init__.py
index 34f1435ffb6b65ef0e1399fb6893c3b791616f79..fccedf919a7b261bb30f332172b1388db9da1939 100644
--- a/tensorflow/python/keras/applications/__init__.py
+++ b/tensorflow/python/keras/applications/__init__.py
@@ -18,16 +18,23 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.keras.applications import densenet
 from tensorflow.python.keras.applications import inception_resnet_v2
 from tensorflow.python.keras.applications import inception_v3
 from tensorflow.python.keras.applications import mobilenet
+from tensorflow.python.keras.applications import nasnet
 from tensorflow.python.keras.applications import resnet50
 from tensorflow.python.keras.applications import vgg16
 from tensorflow.python.keras.applications import vgg19
 from tensorflow.python.keras.applications import xception
+from tensorflow.python.keras.applications.densenet import DenseNet121
+from tensorflow.python.keras.applications.densenet import DenseNet169
+from tensorflow.python.keras.applications.densenet import DenseNet201
 from tensorflow.python.keras.applications.inception_resnet_v2 import InceptionResNetV2
 from tensorflow.python.keras.applications.inception_v3 import InceptionV3
 from tensorflow.python.keras.applications.mobilenet import MobileNet
+from tensorflow.python.keras.applications.nasnet import NASNetLarge
+from tensorflow.python.keras.applications.nasnet import NASNetMobile
 from tensorflow.python.keras.applications.resnet50 import ResNet50
 from tensorflow.python.keras.applications.vgg16 import VGG16
 from tensorflow.python.keras.applications.vgg19 import VGG19
diff --git a/tensorflow/python/keras/applications/densenet/__init__.py b/tensorflow/python/keras/applications/densenet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b8ea83920733a3a442171616ab460ffaf831521
--- /dev/null
+++ b/tensorflow/python/keras/applications/densenet/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""DenseNet Keras applications."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras._impl.keras.applications.densenet import decode_predictions
+from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet121
+from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet169
+from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet201
+from tensorflow.python.keras._impl.keras.applications.densenet import preprocess_input
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/python/keras/applications/nasnet/__init__.py b/tensorflow/python/keras/applications/nasnet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..94eb145b85b85b2e52ca37e7aebc681c1f054e16
--- /dev/null
+++ b/tensorflow/python/keras/applications/nasnet/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""NASNet Keras applications."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras._impl.keras.applications.nasnet import decode_predictions
+from tensorflow.python.keras._impl.keras.applications.nasnet import NASNetLarge
+from tensorflow.python.keras._impl.keras.applications.nasnet import NASNetMobile
+from tensorflow.python.keras._impl.keras.applications.nasnet import preprocess_input
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py
index b94bf8f0f67a7a8ddbb351d13cb17ccdbf283260..84ee5040dcd7b118a5c63b6532135913fe238797 100644
--- a/tensorflow/python/keras/layers/__init__.py
+++ b/tensorflow/python/keras/layers/__init__.py
@@ -30,6 +30,7 @@ from tensorflow.python.keras._impl.keras.layers.advanced_activations import Leak
 from tensorflow.python.keras._impl.keras.layers.advanced_activations import PReLU
 from tensorflow.python.keras._impl.keras.layers.advanced_activations import ELU
 from tensorflow.python.keras._impl.keras.layers.advanced_activations import ThresholdedReLU
+from tensorflow.python.keras._impl.keras.layers.advanced_activations import Softmax
 
 # Convolution layers.
 from tensorflow.python.keras._impl.keras.layers.convolutional import Conv1D
@@ -37,6 +38,7 @@ from tensorflow.python.keras._impl.keras.layers.convolutional import Conv2D
 from tensorflow.python.keras._impl.keras.layers.convolutional import Conv3D
 from tensorflow.python.keras._impl.keras.layers.convolutional import Conv2DTranspose
 from tensorflow.python.keras._impl.keras.layers.convolutional import Conv3DTranspose
+from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConv1D
 from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConv2D
 
 # Convolution layer aliases.
@@ -45,6 +47,7 @@ from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution
 from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution3D
 from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution2DTranspose
 from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution3DTranspose
+from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConvolution1D
 from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConvolution2D
 
 # Image processing layers.
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 2ec162578cebf4b36dc0ebc68f8ac26b6df8f422..d4ceb2e489c8a20d26eaf9d89b12992d2b8673d7 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -81,6 +81,23 @@ tf_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "list_ops_test",
+    size = "small",
+    srcs = ["list_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:list_ops",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:client_testlib",
+    ],
+    grpc_enabled = True,
+)
+
 cuda_py_test(
     name = "benchmark_test",
     size = "small",
@@ -294,6 +311,19 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "decode_compressed_op_test",
+    size = "small",
+    srcs = ["decode_compressed_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:parsing_ops",
+    ],
+)
+
 cuda_py_test(
     name = "determinant_op_test",
     size = "small",
@@ -356,6 +386,7 @@ tf_py_test(
     srcs = ["fractional_avg_pool_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:nn_grad",
@@ -370,6 +401,7 @@ tf_py_test(
     srcs = ["fractional_max_pool_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:nn_grad",
@@ -483,6 +515,7 @@ tf_py_test(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
     ],
+    grpc_enabled = True,
 )
 
 tf_py_test(
@@ -517,6 +550,18 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "matrix_logarithm_op_test",
+    size = "small",
+    srcs = ["matrix_logarithm_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:linalg_ops",
+    ],
+)
+
 cuda_py_test(
     name = "matrix_inverse_op_test",
     size = "small",
@@ -602,6 +647,23 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "parse_single_example_op_test",
+    size = "small",
+    srcs = ["parse_single_example_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:platform",
+    ],
+)
+
 tf_py_test(
     name = "partitioned_variables_test",
     size = "small",
@@ -676,6 +738,7 @@ cuda_py_test(
         "//tensorflow/python:gradients",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:variables",
+        "//tensorflow/python:resource_variable_ops",
     ],
     tags = ["noasan"],  # http://b/32635055
 )
@@ -980,6 +1043,7 @@ tf_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
+        "//tensorflow/python/eager:function",
     ],
 )
 
@@ -1230,7 +1294,9 @@ cuda_py_test(
 
 cuda_py_test(
     name = "control_flow_ops_py_test",
-    size = "small",
+    # TODO(b/70473603): change this back to "small" once the C API is
+    # permanently enabled
+    size = "medium",
     srcs = ["control_flow_ops_py_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -1264,6 +1330,19 @@ cuda_py_test(
     ],
 )
 
+tf_py_test(
+    name = "control_flow_util_test",
+    size = "small",
+    srcs = ["control_flow_util_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:control_flow_ops_gen",
+        "//tensorflow/python:control_flow_util",
+        "//tensorflow/python:test_ops",
+    ],
+)
+
 cuda_py_test(
     name = "conv1d_test",
     size = "small",
@@ -1370,7 +1449,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "dynamic_partition_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["dynamic_partition_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -1427,6 +1506,7 @@ cuda_py_test(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
+    grpc_enabled = True,
     tags = ["no_windows"],
 )
 
@@ -1521,6 +1601,19 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "manip_ops_test",
+    size = "small",
+    srcs = ["manip_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:manip_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+    ],
+    tags = ["no_windows_gpu"],
+)
+
 cuda_py_test(
     name = "matmul_op_test",
     size = "small",
@@ -1631,6 +1724,8 @@ cuda_py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:script_ops",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:function",
     ],
     tags = ["no_windows"],
 )
@@ -2055,7 +2150,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "transpose_op_test",
-    size = "medium",
+    size = "large",
     srcs = ["transpose_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -2063,6 +2158,11 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    shard_count = 2,
+    tags = [
+        "no_gpu",
+        "no_oss",
+    ],
 )
 
 cuda_py_test(
@@ -2143,6 +2243,7 @@ cuda_py_test(
     srcs = ["atrous_convolution_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:nn_grad",
@@ -2403,6 +2504,7 @@ cuda_py_test(
         "//tensorflow/python:sparse_ops",
     ],
     shard_count = 5,
+    tags = ["noasan"],
 )
 
 cuda_py_test(
@@ -2733,7 +2835,7 @@ tf_py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:variables",
     ],
-    shard_count = 3,
+    shard_count = 10,
     tags = ["no_windows_gpu"],
 )
 
@@ -2776,101 +2878,6 @@ tf_py_test(
     ],
 )
 
-tf_py_test(
-    name = "batch_dataset_op_test",
-    size = "small",
-    srcs = ["batch_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:string_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "dataset_constructor_op_test",
-    size = "small",
-    srcs = ["dataset_constructor_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
-    ],
-    tags = [
-        "manual",
-        "nomac",  # b/62040583
-    ],
-)
-
-tf_py_test(
-    name = "dataset_from_generator_op_test",
-    size = "small",
-    srcs = ["dataset_from_generator_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:sparse",
-    ],
-)
-
-tf_py_test(
-    name = "filter_dataset_op_test",
-    size = "small",
-    srcs = ["filter_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "flat_map_dataset_op_test",
-    size = "small",
-    srcs = ["flat_map_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:session",
-        "//tensorflow/python:sparse_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:training",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
 tf_py_test(
     name = "garbage_collection_test",
     size = "small",
@@ -2885,263 +2892,6 @@ tf_py_test(
     ],
 )
 
-tf_py_test(
-    name = "list_files_dataset_op_test",
-    size = "small",
-    srcs = ["list_files_dataset_op_test.py"],
-    additional_deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "interleave_dataset_op_test",
-    size = "small",
-    srcs = ["interleave_dataset_op_test.py"],
-    additional_deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:session",
-        "//tensorflow/python:sparse_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:training",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "map_dataset_op_test",
-    size = "small",
-    srcs = ["map_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:data_flow_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python:lookup_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:script_ops",
-        "//tensorflow/python:sparse_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:string_ops",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "prefetch_dataset_op_test",
-    size = "small",
-    srcs = ["prefetch_dataset_op_test.py"],
-    additional_deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "range_dataset_op_test",
-    size = "small",
-    srcs = ["range_dataset_op_test.py"],
-    additional_deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-    ],
-)
-
-tf_py_test(
-    name = "reader_dataset_ops_test",
-    size = "small",
-    srcs = ["reader_dataset_ops_test.py"],
-    additional_deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python:lib",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:iterator_ops",
-        "//tensorflow/python/data/ops:readers",
-    ],
-)
-
-tf_py_test(
-    name = "sequence_dataset_op_test",
-    size = "small",
-    srcs = ["sequence_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "shuffle_dataset_op_test",
-    size = "small",
-    srcs = ["shuffle_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-    ],
-)
-
-tf_py_test(
-    name = "shard_dataset_op_test",
-    size = "small",
-    srcs = ["shard_dataset_op_test.py"],
-    additional_deps = [
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "cache_dataset_op_test",
-    size = "small",
-    srcs = ["cache_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-    ],
-)
-
-tf_py_test(
-    name = "zip_dataset_op_test",
-    size = "small",
-    srcs = ["zip_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "concatenate_dataset_op_test",
-    size = "small",
-    srcs = ["concatenate_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-    ],
-)
-
-tf_py_test(
-    name = "iterator_ops_test",
-    size = "small",
-    srcs = ["iterator_ops_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python/data/ops:readers",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-        "//tensorflow/python/data/util:sparse",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:function",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:script_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:training",
-    ],
-)
-
-tf_py_test(
-    name = "iterator_ops_cluster_test",
-    size = "small",
-    srcs = ["iterator_ops_cluster_test.py"],
-    additional_deps = [
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:function",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-    ],
-    tags = [
-        "no_oss",  # Test flaky due to port collisions.
-        "no_windows",
-    ],
-)
-
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index 1bf2b70c1beb25739fddebb3a552c368dc7a48be..1e2ea829884f2f97ab2203b54228365d85a9dea0 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 import time
+import unittest
 
 import numpy as np
 
@@ -33,10 +34,13 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test as test_lib
 
@@ -79,7 +83,9 @@ class BatchMatrixTransposeTest(test_util.TensorFlowTestCase):
       matrix_ph = array_ops.placeholder(dtypes.int32)
       transposed = array_ops.matrix_transpose(matrix_ph)
       self.assertAllEqual(
-          expected_transposed, transposed.eval(feed_dict={matrix_ph: matrix}))
+          expected_transposed, transposed.eval(feed_dict={
+              matrix_ph: matrix
+          }))
 
   def testBatchMatrixDynamicallyDefined(self):
     matrix_0 = [[1, 2, 3], [4, 5, 6]]
@@ -93,7 +99,9 @@ class BatchMatrixTransposeTest(test_util.TensorFlowTestCase):
       transposed = array_ops.matrix_transpose(batch_matrix_ph)
       self.assertAllEqual(
           expected_transposed,
-          transposed.eval(feed_dict={batch_matrix_ph: batch_matrix}))
+          transposed.eval(feed_dict={
+              batch_matrix_ph: batch_matrix
+          }))
 
   def testTensorWithStaticRankLessThanTwoRaisesBecauseNotAMatrix(self):
     vector = [1, 2, 3]
@@ -200,8 +208,10 @@ class BooleanMaskTest(test_util.TensorFlowTestCase):
 
       masked_tensor = sess.run(
           array_ops.boolean_mask(ph_tensor, ph_mask),
-          feed_dict={ph_tensor: arr,
-                     ph_mask: mask})
+          feed_dict={
+              ph_tensor: arr,
+              ph_mask: mask
+          })
       np.testing.assert_allclose(masked_tensor, arr[mask])
 
   def testMaskDimensionsSetToNoneRaises(self):
@@ -274,26 +284,36 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
     x_np = np.array([1, 200, 3, 40, 5], dtype=np_dtype)
 
     for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
-        x_tf = array_ops.reverse_v2(x_np, [0]).eval()
-        self.assertAllEqual(x_tf, np.asarray(x_np)[::-1])
+      for axis_dtype in [dtypes.int32, dtypes.int64]:
+        with self.test_session(use_gpu=use_gpu):
+          x_tf = array_ops.reverse_v2(x_np,
+                                      constant_op.constant(
+                                          [0], dtype=axis_dtype)).eval()
+          self.assertAllEqual(x_tf, np.asarray(x_np)[::-1])
 
   def _reverse2DimAuto(self, np_dtype):
     x_np = np.array([[1, 200, 3], [4, 5, 60]], dtype=np_dtype)
 
     for reverse_f in [array_ops.reverse_v2, array_ops.reverse]:
       for use_gpu in [False, True]:
-        with self.test_session(use_gpu=use_gpu):
-          x_tf_1 = reverse_f(x_np, [0]).eval()
-          x_tf_2 = reverse_f(x_np, [-2]).eval()
-          x_tf_3 = reverse_f(x_np, [1]).eval()
-          x_tf_4 = reverse_f(x_np, [-1]).eval()
-          x_tf_5 = reverse_f(x_np, [1, 0]).eval()
-          self.assertAllEqual(x_tf_1, np.asarray(x_np)[::-1, :])
-          self.assertAllEqual(x_tf_2, np.asarray(x_np)[::-1, :])
-          self.assertAllEqual(x_tf_3, np.asarray(x_np)[:, ::-1])
-          self.assertAllEqual(x_tf_4, np.asarray(x_np)[:, ::-1])
-          self.assertAllEqual(x_tf_5, np.asarray(x_np)[::-1, ::-1])
+        for axis_dtype in [dtypes.int32, dtypes.int64]:
+          with self.test_session(use_gpu=use_gpu):
+            x_tf_1 = reverse_f(x_np, constant_op.constant(
+                [0], dtype=axis_dtype)).eval()
+            x_tf_2 = reverse_f(x_np, constant_op.constant(
+                [-2], dtype=axis_dtype)).eval()
+            x_tf_3 = reverse_f(x_np, constant_op.constant(
+                [1], dtype=axis_dtype)).eval()
+            x_tf_4 = reverse_f(x_np, constant_op.constant(
+                [-1], dtype=axis_dtype)).eval()
+            x_tf_5 = reverse_f(x_np,
+                               constant_op.constant([1, 0],
+                                                    dtype=axis_dtype)).eval()
+            self.assertAllEqual(x_tf_1, np.asarray(x_np)[::-1, :])
+            self.assertAllEqual(x_tf_2, np.asarray(x_np)[::-1, :])
+            self.assertAllEqual(x_tf_3, np.asarray(x_np)[:, ::-1])
+            self.assertAllEqual(x_tf_4, np.asarray(x_np)[:, ::-1])
+            self.assertAllEqual(x_tf_5, np.asarray(x_np)[::-1, ::-1])
 
   # This is the version of reverse that uses axis indices rather than
   # bool tensors
@@ -313,18 +333,16 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
 
   def testReverse1DimAuto(self):
     for dtype in [
-        np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64,
-        np.bool, np.float16, np.float32,
-        np.float64, np.complex64, np.complex128,
+        np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64, np.bool,
+        np.float16, np.float32, np.float64, np.complex64, np.complex128,
         np.array(b"").dtype.type
     ]:
       self._reverse1DimAuto(dtype)
 
   def testReverse2DimAuto(self):
     for dtype in [
-        np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64,
-        np.bool, np.float16, np.float32,
-        np.float64, np.complex64, np.complex128,
+        np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64, np.bool,
+        np.float16, np.float32, np.float64, np.complex64, np.complex128,
         np.array(b"").dtype.type
     ]:
       self._reverse2DimAuto(dtype)
@@ -397,7 +415,7 @@ class MeshgridTest(test_util.TensorFlowTestCase):
   def _compareDiffType(self, n, np_dtype, use_gpu):
     inputs = []
     for index in ("ij", "xy"):
-      for i in range(n):
+      for _ in range(n):
         x = np.linspace(-10, 10, 5).astype(np_dtype)
         if np_dtype in (np.complex64, np.complex128):
           x += 1j
@@ -405,8 +423,8 @@ class MeshgridTest(test_util.TensorFlowTestCase):
       numpy_out = np.meshgrid(*inputs, indexing=index)
       with self.test_session(use_gpu=use_gpu):
         tf_out = array_ops.meshgrid(*inputs, indexing=index)
-        for X, _X in zip(numpy_out, tf_out):
-          self.assertAllEqual(X, _X.eval())
+        for x_np, x_tf in zip(numpy_out, tf_out):
+          self.assertAllEqual(x_np, x_tf.eval())
 
   def testCompare(self):
     for t in (np.float16, np.float32, np.float64, np.int32, np.int64,
@@ -700,8 +718,8 @@ class GradSliceChecker(object):
     slice_val_grad2, = gradients_impl.gradients(
         slice_val_grad, dy, grad_ys=self.var)
     self.sess.run(assign)
-    slice_val_grad_evaled, slice_val_grad2_evaled = (self.sess.run(
-        [slice_val_grad, slice_val_grad2]))
+    slice_val_grad_evaled, slice_val_grad2_evaled = (
+        self.sess.run([slice_val_grad, slice_val_grad2]))
     analytic_grad2_evaled = analytic_grad2.eval()
     self.test.assertAllEqual(slice_val_grad2_evaled, analytic_grad2_evaled)
 
@@ -935,6 +953,32 @@ class SliceAssignTest(test_util.TensorFlowTestCase):
         v = variables.Variable([1, 2])
         sess.run(v[:].assign([1, 2]))
 
+  def testTypeError(self):
+    init_val = constant_op.constant([1, 2], dtype=dtypes.int32)
+    too_small_val = constant_op.constant([3, 4], dtype=dtypes.int8)
+    too_large_val = constant_op.constant([3, 4], dtype=dtypes.int64)
+    v = variables.Variable(init_val)
+    with self.assertRaises(TypeError):
+      v[:].assign(too_small_val)
+    with self.assertRaises(TypeError):
+      v[:].assign(too_large_val)
+
+  def testTypeErrorResource(self):
+    init_val = constant_op.constant([1, 2], dtype=dtypes.int32)
+    too_small_val = constant_op.constant([3, 4], dtype=dtypes.int8)
+    too_large_val = constant_op.constant([3, 4], dtype=dtypes.int64)
+    v = resource_variable_ops.ResourceVariable(init_val)
+    with self.test_session() as sess:
+      sess.run(v.initializer)
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "l-value dtype int32 does not match r-value dtype int64"):
+        sess.run(v[:].assign(too_large_val))
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "l-value dtype int32 does not match r-value dtype int8"):
+        sess.run(v[:].assign(too_small_val))
+
 
 class ShapeSizeRankTest(test_util.TensorFlowTestCase):
 
@@ -964,6 +1008,7 @@ class ShapeSizeRankTest(test_util.TensorFlowTestCase):
       self.assertEqual(2, array_ops.rank(sp).eval())
 
 
+@test_util.with_c_api
 class SequenceMaskTest(test_util.TensorFlowTestCase):
 
   def testExceptions(self):
@@ -971,40 +1016,66 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(ValueError, "maxlen must be scalar"):
         array_ops.sequence_mask([10, 20], [10, 20])
 
-  def testOneDimensional(self):
+  def testOneDimensionalWithMaxlen(self):
     with self.test_session():
       res = array_ops.sequence_mask(constant_op.constant([1, 3, 2]), 5)
       self.assertAllEqual(res.get_shape(), [3, 5])
-      self.assertAllEqual(res.eval(), [[True, False, False, False, False],
-                                       [True, True, True, False, False],
-                                       [True, True, False, False, False]])
+      self.assertAllEqual(
+          res.eval(),
+          [[True, False, False, False, False], [True, True, True, False, False],
+           [True, True, False, False, False]])
 
+  def testOneDimensionalDtypeWithoutMaxlen(self):
+    with self.test_session():
       # test dtype and default maxlen:
+      res = array_ops.sequence_mask(constant_op.constant([0, 1, 4]),
+                                    dtype=dtypes.float32)
+      if ops._USE_C_API:
+        self.assertAllEqual(res.get_shape().as_list(), [3, 4])
+      else:
+        self.assertAllEqual(res.get_shape().as_list(), [3, None])
+      self.assertAllEqual(
+          res.eval(),
+          [[0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0]])
+
+  def testOneDimensionalWithoutMaxlen(self):
+    with self.test_session():
       res = array_ops.sequence_mask(
-          constant_op.constant([0, 1, 4]), dtype=dtypes.float32)
-      self.assertAllEqual(res.get_shape().as_list(), [3, None])
-      self.assertAllEqual(res.eval(), [[0.0, 0.0, 0.0,
-                                        0.0], [1.0, 0.0, 0.0, 0.0],
-                                       [1.0, 1.0, 1.0, 1.0]])
+          constant_op.constant([0, 1, 4]))
+      if ops._USE_C_API:
+        self.assertAllEqual(res.get_shape().as_list(), [3, 4])
+      else:
+        self.assertAllEqual(res.get_shape().as_list(), [3, None])
+      self.assertAllEqual(
+          res.eval(),
+          [[False, False, False, False],
+           [True, False, False, False],
+           [True, True, True, True]])
 
   def testTwoDimensional(self):
     with self.test_session():
       res = array_ops.sequence_mask(constant_op.constant([[1, 3, 2]]), 5)
       self.assertAllEqual(res.get_shape(), [1, 3, 5])
-      self.assertAllEqual(res.eval(), [[[True, False, False, False, False],
-                                        [True, True, True, False, False],
-                                        [True, True, False, False, False]]])
+      self.assertAllEqual(res.eval(), [[[True, False, False, False, False], [
+          True, True, True, False, False
+      ], [True, True, False, False, False]]])
 
       # test dtype and default maxlen:
       res = array_ops.sequence_mask(
           constant_op.constant([[0, 1, 4], [1, 2, 3]]), dtype=dtypes.float32)
-      self.assertAllEqual(res.get_shape().as_list(), [2, 3, None])
-      self.assertAllEqual(res.eval(), [[[0.0, 0.0, 0.0, 0.0],
-                                        [1.0, 0.0, 0.0, 0.0],
-                                        [1.0, 1.0, 1.0, 1.0]],
-                                       [[1.0, 0.0, 0.0, 0.0],
-                                        [1.0, 1.0, 0.0, 0.0],
-                                        [1.0, 1.0, 1.0, 0.0]]])
+      if ops._USE_C_API:
+        self.assertAllEqual(res.get_shape().as_list(), [2, 3, 4])
+      else:
+        self.assertAllEqual(res.get_shape().as_list(), [2, 3, None])
+      self.assertAllEqual(
+          res.eval(),
+          [[[0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0]],
+           [[1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 1.0, 0.0]]])
+
+  def testUnknownShape(self):
+    lengths = array_ops.placeholder(dtype=dtypes.int32)
+    res = array_ops.sequence_mask(lengths)
+    self.assertEqual(res.shape, None)
 
   def testDtypes(self):
 
@@ -1013,9 +1084,10 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
           constant_op.constant([1, 3, 2], dtype=lengths_dtype),
           constant_op.constant(5, dtype=maxlen_dtype))
       self.assertAllEqual(res.get_shape(), [3, 5])
-      self.assertAllEqual(res.eval(), [[True, False, False, False, False],
-                                       [True, True, True, False, False],
-                                       [True, True, False, False, False]])
+      self.assertAllEqual(
+          res.eval(),
+          [[True, False, False, False, False], [True, True, True, False, False],
+           [True, True, False, False, False]])
 
     with self.test_session():
       check_dtypes(dtypes.int32, dtypes.int32)
@@ -1070,13 +1142,14 @@ class PadTest(test_util.TensorFlowTestCase):
   def testEager(self):
     with context.eager_mode():
       t = constant_op.constant([[1, 2, 3], [4, 5, 6]])
-      paddings = constant_op.constant([[1, 1,], [2, 2]])
+      paddings = constant_op.constant([[
+          1,
+          1,
+      ], [2, 2]])
       padded = array_ops.pad(t, paddings, "CONSTANT")
       self.assertAllEqual(padded.numpy(),
-                          [[0, 0, 0, 0, 0, 0, 0],
-                           [0, 0, 1, 2, 3, 0, 0],
-                           [0, 0, 4, 5, 6, 0, 0],
-                           [0, 0, 0, 0, 0, 0, 0]])
+                          [[0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 2, 3, 0, 0],
+                           [0, 0, 4, 5, 6, 0, 0], [0, 0, 0, 0, 0, 0, 0]])
 
 
 class InvertPermutationTest(test_util.TensorFlowTestCase):
@@ -1090,5 +1163,70 @@ class InvertPermutationTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(y.eval(), [2, 4, 3, 0, 1])
 
 
+class UnravelIndexTest(test_util.TensorFlowTestCase):
+
+  # TODO(b/73086570): Reenable test.
+  @unittest.skip("Test does not pass internally.")
+  def testUnravelIndex(self):
+    with self.test_session():
+      for dtype in [dtypes.int32, dtypes.int64]:
+        indices_1 = constant_op.constant(1621, dtype=dtype)
+        dims_1 = constant_op.constant([6, 7, 8, 9], dtype=dtype)
+        out_1 = array_ops.unravel_index(indices_1, dims_1)
+        self.assertAllEqual(out_1.eval(), [3, 1, 4, 1])
+
+        indices_2 = constant_op.constant([1621], dtype=dtype)
+        dims_2 = constant_op.constant([6, 7, 8, 9], dtype=dtype)
+        out_2 = array_ops.unravel_index(indices_2, dims_2)
+        self.assertAllEqual(out_2.eval(), [[3], [1], [4], [1]])
+
+        indices_3 = constant_op.constant([22, 41, 37], dtype=dtype)
+        dims_3 = constant_op.constant([7, 6], dtype=dtype)
+        out_3 = array_ops.unravel_index(indices_3, dims_3)
+        self.assertAllEqual(out_3.eval(), [[3, 6, 6], [4, 5, 1]])
+
+
+class GuaranteeConstOpTest(test_util.TensorFlowTestCase):
+
+  def testSimple(self):
+    with self.test_session():
+      a = array_ops.constant(10)
+      guarantee_a = array_ops.guarantee_const(a)
+      self.assertEqual(10, guarantee_a.eval())
+
+  def testVariables(self):
+    with self.test_session() as sess:
+      for use_resource in [False, True]:
+        a = variable_scope.get_variable(
+            "var_{}".format(use_resource), [],
+            initializer=init_ops.constant_initializer(10.0),
+            use_resource=use_resource)
+        guarantee_a = array_ops.guarantee_const(a)
+        sess.run(variables.global_variables_initializer())
+        self.assertEqual(10.0, guarantee_a.eval())
+
+  def testResourceRejection(self):
+    with self.test_session() as sess:
+      a = variable_scope.get_variable(
+          "resource_var", [],
+          initializer=init_ops.constant_initializer(10.0),
+          use_resource=True)
+      guarantee_a = array_ops.guarantee_const(a.handle)
+      sess.run(variables.global_variables_initializer())
+      with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
+                                               "cannot be a resource variable"):
+        guarantee_a.eval()
+
+
+class SnapshotOpTest(test_util.TensorFlowTestCase):
+
+  def testInvertPermutation(self):
+    for dtype in [dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([0, 1, 2, 3], dtype=dtype)
+        y = gen_array_ops._snapshot(x)
+        self.assertAllEqual(y.eval(), [0, 1, 2, 3])
+
+
 if __name__ == "__main__":
   test_lib.main()
diff --git a/tensorflow/python/kernel_tests/atrous_convolution_test.py b/tensorflow/python/kernel_tests/atrous_convolution_test.py
index 3ac27d11c57062b8d7c4c1d9b89bc576c6959dec..2d1b3d9b7e836591646a2d0e59742bf6139446d1 100644
--- a/tensorflow/python/kernel_tests/atrous_convolution_test.py
+++ b/tensorflow/python/kernel_tests/atrous_convolution_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
@@ -80,6 +81,7 @@ class AtrousConvolutionTest(test.TestCase):
       otherwise, it's delayed after the context.
     """
     checks = []
+
     def add_check(check, *args, **kwargs):
       if context.in_eager_mode():
         args_val, kwargs_val = self.evaluate([args, kwargs])
@@ -95,12 +97,12 @@ class AtrousConvolutionTest(test.TestCase):
 
   def _test_atrous_convolution(self, add_check, input_shape, filter_shape,
                                dilation_rate, **kwargs):
-    filters = np.arange(np.prod(filter_shape),
-                        dtype=np.float32).reshape(filter_shape)
+    filters = np.arange(
+        np.prod(filter_shape), dtype=np.float32).reshape(filter_shape)
     filters_upsampled = upsample_filters(filters, dilation_rate)
     x = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape)
-    y1 = nn_ops.convolution(input=x, filter=filters,
-                            dilation_rate=dilation_rate, **kwargs)
+    y1 = nn_ops.convolution(
+        input=x, filter=filters, dilation_rate=dilation_rate, **kwargs)
     y2 = nn_ops.convolution(input=x, filter=filters_upsampled, **kwargs)
 
     def check(y1_eval, y2_eval):
@@ -108,6 +110,20 @@ class AtrousConvolutionTest(test.TestCase):
 
     add_check(check, y1, y2)
 
+  def test_unknown_spatial_dims_for_channel_last_format(self):
+    x = array_ops.placeholder(dtypes.float32, [1, None, None, 10])
+    w = array_ops.zeros([3, 3, 10, 20])
+    y = nn_ops.convolution(
+        x, w, "VALID", dilation_rate=[2, 2], data_format="NHWC")
+    self.assertEqual(y.shape.as_list(), [1, None, None, 20])
+
+  def test_unknown_spatial_dims_for_channel_first_format(self):
+    x = array_ops.placeholder(dtypes.float32, [1, 10, None, None])
+    w = array_ops.zeros([3, 3, 10, 20])
+    y = nn_ops.convolution(
+        x, w, "VALID", dilation_rate=[2, 2], data_format="NCHW")
+    self.assertEqual(y.shape.as_list(), [1, 20, None, None])
+
   @test_util.run_in_graph_and_eager_modes()
   def testAtrousConvolution2D(self):
     with self._delay_checks() as add_check:
@@ -202,28 +218,35 @@ class AtrousConvolutionTest(test.TestCase):
 
               def combined_op(converted_input, num_spatial_dims, padding_arg):  # pylint: disable=unused-argument
                 # pylint: disable=cell-var-from-loop
-                result = nn_ops.convolution(input=converted_input, filter=f1,
-                                            padding=padding)
-                result = nn_ops.convolution(input=result, filter=f2,
-                                            padding=padding)
+                result = nn_ops.convolution(
+                    input=converted_input, filter=f1, padding=padding)
+                result = nn_ops.convolution(
+                    input=result, filter=f2, padding=padding)
                 # pylint: enable=cell-var-from-loop
                 return result
 
               for rate_height in range(2, 4):
                 for rate_width in range(2, 4):
                   dilation_rate = [rate_height, rate_width]
-                  y1 = nn_ops.convolution(input=x, filter=f1, padding=padding,
-                                          dilation_rate=dilation_rate)
-                  y1 = nn_ops.convolution(input=y1, filter=f2,
-                                          padding=padding,
-                                          dilation_rate=dilation_rate)
+                  y1 = nn_ops.convolution(
+                      input=x,
+                      filter=f1,
+                      padding=padding,
+                      dilation_rate=dilation_rate)
+                  y1 = nn_ops.convolution(
+                      input=y1,
+                      filter=f2,
+                      padding=padding,
+                      dilation_rate=dilation_rate)
                   y2 = nn_ops.with_space_to_batch(
-                      input=x, dilation_rate=dilation_rate, op=combined_op,
+                      input=x,
+                      dilation_rate=dilation_rate,
+                      op=combined_op,
                       padding="VALID")
 
                   def check(y1_eval, y2_eval):
-                    self.assertAllClose(y1_eval, y2_eval, rtol=1e-2,
-                                        atol=1e-2)
+                    self.assertAllClose(y1_eval, y2_eval, rtol=1e-2, atol=1e-2)
+
                   add_check(check, y1, y2)
 
   def _test_gradient(self, x_shape, f_shape, dilation_rate, padding):
diff --git a/tensorflow/python/kernel_tests/bcast_ops_test.py b/tensorflow/python/kernel_tests/bcast_ops_test.py
index 7c18044c5c5072ef03681165f6194b73a29392ca..9e512346053a4c3af089170f47313606c4a307c2 100644
--- a/tensorflow/python/kernel_tests/bcast_ops_test.py
+++ b/tensorflow/python/kernel_tests/bcast_ops_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops.gen_array_ops import _broadcast_args
 from tensorflow.python.ops.gen_array_ops import _broadcast_gradient_args
 from tensorflow.python.platform import test
@@ -135,6 +137,19 @@ class BcastOpsTest(test.TestCase):
     self.assertAllEqual(r0, [0, 1, 3])
     self.assertAllEqual(r1, [])
 
+  def testDataTypes(self):
+    for dtype in [dtypes.int32, dtypes.int64]:
+      r = self._GetBroadcastShape(
+          constant_op.constant([2, 3, 5], dtype=dtype),
+          constant_op.constant([1], dtype=dtype))
+      self.assertAllEqual(r, [2, 3, 5])
+
+      r0, r1 = self._GetGradientArgs(
+          constant_op.constant([2, 3, 5], dtype=dtype),
+          constant_op.constant([1], dtype=dtype))
+      self.assertAllEqual(r0, [])
+      self.assertAllEqual(r1, [0, 1, 2])
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py b/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py
index 88b3f20469a6a8d8e8181e8d5a3876ae22fb9c06..28b3dc45e9c5fd9aee0b4b7f71a5dc1b93c057ed 100644
--- a/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py
+++ b/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py
@@ -80,7 +80,7 @@ class RangeSamplerOpsTest(test.TestCase):
     with self.test_session():
       true_classes = constant_op.constant(
           [[1, 2], [0, 4], [3, 3]], dtype=dtypes.int64)
-      _, _, sampled_expected_count = candidate_sampling_ops.all_candidate_sampler(
+      _, _, sampled_expected_count = candidate_sampling_ops.all_candidate_sampler(  # pylint: disable=line-too-long
           true_classes, self.NUM_TRUE, self.NUM_SAMPLED, True)
       sampled_log_expected_count = math_ops.log(sampled_expected_count)
       result = sampled_log_expected_count.eval()
diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py
index 7ce0f1e7b8a4df7c8c3acb36c0d46f60cbf0f703..2e94603a3f3d4ca9074320cfb4e9bf06b6640e82 100644
--- a/tensorflow/python/kernel_tests/check_ops_test.py
+++ b/tensorflow/python/kernel_tests/check_ops_test.py
@@ -117,7 +117,7 @@ class AssertEqualTest(test.TestCase):
   def test_error_message_eager(self):
     expected_error_msg_full = r"""big does not equal small
 Condition x == y did not hold.
-Indices of first 6 different values:
+Indices of first 3 different values:
 \[\[0 0\]
  \[1 1\]
  \[2 0\]\]
@@ -129,6 +129,21 @@ First 6 elements of x:
 \[2 2 3 3 6 6\]
 First 6 elements of y:
 \[20  2  3 30 60  6\]
+"""
+    expected_error_msg_default = r"""big does not equal small
+Condition x == y did not hold.
+Indices of first 3 different values:
+\[\[0 0\]
+ \[1 1\]
+ \[2 0\]\]
+Corresponding x values:
+\[2 3 6\]
+Corresponding y values:
+\[20 30 60\]
+First 3 elements of x:
+\[2 2 3\]
+First 3 elements of y:
+\[20  2  3\]
 """
     expected_error_msg_short = r"""big does not equal small
 Condition x == y did not hold.
@@ -151,6 +166,9 @@ First 2 elements of y:
                                    expected_error_msg_full):
         check_ops.assert_equal(big, small, message="big does not equal small",
                                summarize=10)
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   expected_error_msg_default):
+        check_ops.assert_equal(big, small, message="big does not equal small")
       with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                    expected_error_msg_short):
         check_ops.assert_equal(big, small, message="big does not equal small",
@@ -270,6 +288,118 @@ class AssertNoneEqualTest(test.TestCase):
       assert x is None
 
 
+class AssertAllCloseTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_doesnt_raise_when_equal(self):
+    x = constant_op.constant(1., name="x")
+    y = constant_op.constant(1., name="y")
+    with ops.control_dependencies(
+        [check_ops.assert_near(x, y, message="failure message")]):
+      out = array_ops.identity(x)
+      self.evaluate(out)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_doesnt_raise_when_close_enough_32_bit_due_to_default_rtol(self):
+    eps = np.finfo(np.float32).eps
+    # Default rtol/atol is 10*eps
+    x = constant_op.constant(1., name="x")
+    y = constant_op.constant(1. + 2 * eps, name="y", dtype=np.float32)
+    with ops.control_dependencies(
+        [check_ops.assert_near(x, y, atol=0., message="failure message")]):
+      out = array_ops.identity(x)
+      self.evaluate(out)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_doesnt_raise_when_close_enough_32_bit_due_to_default_atol(self):
+    eps = np.finfo(np.float32).eps
+    # Default rtol/atol is 10*eps
+    x = constant_op.constant(0., name="x")
+    y = constant_op.constant(0. + 2 * eps, name="y", dtype=np.float32)
+    with ops.control_dependencies(
+        [check_ops.assert_near(x, y, rtol=0., message="failure message")]):
+      out = array_ops.identity(x)
+      self.evaluate(out)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_doesnt_raise_when_close_enough_64_bit_due_to_default_rtol(self):
+    eps = np.finfo(np.float64).eps
+    # Default rtol/atol is 10*eps
+    x = constant_op.constant(1., name="x", dtype=np.float64)
+    y = constant_op.constant(1. + 2 * eps, name="y", dtype=np.float64)
+    with ops.control_dependencies(
+        [check_ops.assert_near(x, y, atol=0., message="failure message")]):
+      out = array_ops.identity(x)
+      self.evaluate(out)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_doesnt_raise_when_close_enough_64_bit_due_to_default_atol(self):
+    eps = np.finfo(np.float64).eps
+    # Default rtol/atol is 10*eps
+    x = constant_op.constant(0., name="x", dtype=np.float64)
+    y = constant_op.constant(0. + 2 * eps, name="y", dtype=np.float64)
+    with ops.control_dependencies(
+        [check_ops.assert_near(x, y, rtol=0., message="failure message")]):
+      out = array_ops.identity(x)
+      self.evaluate(out)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_doesnt_raise_when_close_enough_due_to_custom_rtol(self):
+    x = constant_op.constant(1., name="x")
+    y = constant_op.constant(1.1, name="y")
+    with ops.control_dependencies(
+        [check_ops.assert_near(x, y, atol=0., rtol=0.5,
+                               message="failure message")]):
+      out = array_ops.identity(x)
+      self.evaluate(out)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_doesnt_raise_when_close_enough_due_to_custom_atol(self):
+    x = constant_op.constant(0., name="x")
+    y = constant_op.constant(0.1, name="y", dtype=np.float32)
+    with ops.control_dependencies(
+        [check_ops.assert_near(x, y, atol=0.5, rtol=0.,
+                               message="failure message")]):
+      out = array_ops.identity(x)
+      self.evaluate(out)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_doesnt_raise_when_both_empty(self):
+    larry = constant_op.constant([])
+    curly = constant_op.constant([])
+    with ops.control_dependencies([check_ops.assert_near(larry, curly)]):
+      out = array_ops.identity(larry)
+    self.evaluate(out)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_raises_when_atol_violated(self):
+    x = constant_op.constant(10., name="x")
+    y = constant_op.constant(10.2, name="y")
+    with self.assertRaisesOpError("x and y not equal to tolerance"):
+      with ops.control_dependencies(
+          [check_ops.assert_near(x, y, atol=0.1,
+                                 message="failure message")]):
+        out = array_ops.identity(x)
+        self.evaluate(out)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_raises_when_default_rtol_violated(self):
+    x = constant_op.constant(0.1, name="x")
+    y = constant_op.constant(0.0, name="y")
+    with self.assertRaisesOpError("x and y not equal to tolerance"):
+      with ops.control_dependencies(
+          [check_ops.assert_near(x, y, message="failure message")]):
+        out = array_ops.identity(x)
+        self.evaluate(out)
+
+  def test_returns_none_with_eager(self):
+    with context.eager_mode():
+      t1 = constant_op.constant([1., 2.])
+      t2 = constant_op.constant([1., 2.])
+      x = check_ops.assert_near(t1, t2)
+      assert x is None
+
+
 class AssertLessTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes()
diff --git a/tensorflow/python/kernel_tests/concat_op_test.py b/tensorflow/python/kernel_tests/concat_op_test.py
index a5fd3bc3345f41d9d3f07278dc7979c1103b597f..127bc6bb20ae6b415da94672de68cc4b8ceaa287 100644
--- a/tensorflow/python/kernel_tests/concat_op_test.py
+++ b/tensorflow/python/kernel_tests/concat_op_test.py
@@ -495,9 +495,9 @@ class ConcatOpTest(test.TestCase):
         p = []
         shape = np.array([7, 13])
         if test.is_gpu_available():
-          num_tensors = 10000
+          num_tensors = 5000
         else:
-          num_tensors = 1000
+          num_tensors = 500
         for i in np.arange(num_tensors):
           input_shape = shape
           placeholder = array_ops.placeholder(dtypes.float32, shape=input_shape)
diff --git a/tensorflow/python/kernel_tests/constant_op_eager_test.py b/tensorflow/python/kernel_tests/constant_op_eager_test.py
index 3b71586b55451df86bf214437be3ceec8a4265eb..8e9d75667d49bf9e377ccb9290a3a91786b5a1cb 100644
--- a/tensorflow/python/kernel_tests/constant_op_eager_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_eager_test.py
@@ -237,6 +237,39 @@ class ConstantTest(test.TestCase):
     self._testAll((1, x))
     self._testAll((x, 1))
 
+  def testInvalidLength(self):
+
+    class BadList(list):
+
+      def __init__(self):
+        super(BadList, self).__init__([1, 2, 3])  # pylint: disable=invalid-length-returned
+
+      def __len__(self):
+        return -1
+
+    with self.assertRaisesRegexp(ValueError, "should return >= 0"):
+      constant_op.constant([BadList()])
+    with self.assertRaisesRegexp(ValueError, "mixed types"):
+      constant_op.constant([1, 2, BadList()])
+    with self.assertRaisesRegexp(ValueError, "should return >= 0"):
+      constant_op.constant(BadList())
+    with self.assertRaisesRegexp(ValueError, "should return >= 0"):
+      constant_op.constant([[BadList(), 2], 3])
+    with self.assertRaisesRegexp(ValueError, "should return >= 0"):
+      constant_op.constant([BadList(), [1, 2, 3]])
+    with self.assertRaisesRegexp(ValueError, "should return >= 0"):
+      constant_op.constant([BadList(), []])
+
+    # TODO(allenl, josh11b): These cases should return exceptions rather than
+    # working (currently shape checking only checks the first element of each
+    # sequence recursively). Maybe the first one is fine, but the second one
+    # silently truncating is rather bad.
+
+    # with self.assertRaisesRegexp(ValueError, "should return >= 0"):
+    #   constant_op.constant([[3, 2, 1], BadList()])
+    # with self.assertRaisesRegexp(ValueError, "should return >= 0"):
+    #   constant_op.constant([[], BadList()])
+
   def testSparseValuesRaiseErrors(self):
     with self.assertRaisesRegexp(ValueError, "non-rectangular Python sequence"):
       constant_op.constant([[1, 2], [3]], dtype=dtypes_lib.int32)
diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py
index 68817cc2566847255d289f822aa69308e9c2e329..16e56349c45dd56a335f6f881826d975e24bd110 100644
--- a/tensorflow/python/kernel_tests/constant_op_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_test.py
@@ -44,7 +44,8 @@ class ConstantTest(test.TestCase):
     np_ans = np.array(x)
     with self.test_session(use_gpu=False):
       tf_ans = ops.convert_to_tensor(x).eval()
-    if np_ans.dtype in [np.float32, np.float64, np.complex64, np.complex128]:
+    dtype = dtypes_lib.as_dtype(np_ans.dtype)
+    if dtype.is_floating or dtype.is_complex:
       self.assertAllClose(np_ans, tf_ans)
     else:
       self.assertAllEqual(np_ans, tf_ans)
@@ -53,7 +54,8 @@ class ConstantTest(test.TestCase):
     np_ans = np.array(x)
     with self.test_session(use_gpu=True):
       tf_ans = ops.convert_to_tensor(x).eval()
-    if np_ans.dtype in [np.float32, np.float64, np.complex64, np.complex128]:
+    dtype = dtypes_lib.as_dtype(np_ans.dtype)
+    if dtype.is_floating or dtype.is_complex:
       self.assertAllClose(np_ans, tf_ans)
     else:
       self.assertAllEqual(np_ans, tf_ans)
@@ -62,6 +64,19 @@ class ConstantTest(test.TestCase):
     self._testCpu(x)
     self._testGpu(x)
 
+  def testBFloat16(self):
+    bfloat16 = dtypes_lib.bfloat16.as_numpy_dtype
+    self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(bfloat16))
+    self._testAll(
+        np.random.normal(size=30).reshape([2, 3, 5]).astype(bfloat16))
+    self._testAll(np.empty((2, 0, 5)).astype(bfloat16))
+
+  def testHalf(self):
+    self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(np.float16))
+    self._testAll(
+        np.random.normal(size=30).reshape([2, 3, 5]).astype(np.float16))
+    self._testAll(np.empty((2, 0, 5)).astype(np.float16))
+
   def testFloat(self):
     self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(np.float32))
     self._testAll(
@@ -439,18 +454,19 @@ class ZerosLikeTest(test.TestCase):
 
   def testZerosLikeCPU(self):
     for dtype in [
-        dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int8,
-        dtypes_lib.uint8, dtypes_lib.int16, dtypes_lib.uint16, dtypes_lib.int32,
-        dtypes_lib.int64, dtypes_lib.bool, dtypes_lib.complex64,
-        dtypes_lib.complex128, dtypes_lib.string
+        dtypes_lib.half, dtypes_lib.float32, dtypes_lib.float64,
+        dtypes_lib.int8, dtypes_lib.uint8, dtypes_lib.int16, dtypes_lib.uint16,
+        dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.bool,
+        dtypes_lib.complex64, dtypes_lib.complex128, dtypes_lib.string
     ]:
       self._compareZeros(dtype, fully_defined_shape=False, use_gpu=False)
       self._compareZeros(dtype, fully_defined_shape=True, use_gpu=False)
 
   def testZerosLikeGPU(self):
     for dtype in [
-        dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int32,
-        dtypes_lib.bool, dtypes_lib.int64, dtypes_lib.string
+        dtypes_lib.half, dtypes_lib.float32, dtypes_lib.float64,
+        dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.complex64,
+        dtypes_lib.complex128, dtypes_lib.bool
     ]:
       self._compareZeros(dtype, fully_defined_shape=False, use_gpu=True)
       self._compareZeros(dtype, fully_defined_shape=True, use_gpu=True)
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 1b7f9b110c2c6f86a1c22ed6eeae2d966cabdad3..15ff0ec09b65a8ba242473fb7b25ee00424e0926 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -38,11 +38,13 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import gen_control_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import gen_logging_ops
 from tensorflow.python.ops import gen_state_ops
@@ -68,16 +70,6 @@ from tensorflow.python.training import gradient_descent
 from tensorflow.python.util import nest
 
 
-def check_op_order(graph):
-  """Sanity check on the ordering of op id."""
-
-  for op in graph.get_operations():
-    for v in op.inputs:
-      assert v.op._id < op._id or op.type == "Merge", (
-          "The id of %s must be less than the id of %s" % (v.op.name, op.name))
-  return True
-
-
 def check_consumers(graph):
   """Sanity check on the consumer list of the tensors."""
 
@@ -122,14 +114,16 @@ def opt_cfg():
               do_constant_folding=True)))
 
 
-def isum(s):
+def isum(s, maximum_iterations=None):
   i = constant_op.constant(0, name="i")
   c = lambda i, s: math_ops.less(i, 10)
   b = lambda i, s: [math_ops.add(i, 1), math_ops.add(i, s)]
-  _, r_s = control_flow_ops.while_loop(c, b, [i, s])
+  _, r_s = control_flow_ops.while_loop(
+      c, b, [i, s], maximum_iterations=maximum_iterations)
   return r_s
 
 
+@test_util.with_c_api
 class ControlFlowTest(test.TestCase):
 
   def testRefIdentity(self):
@@ -140,7 +134,6 @@ class ControlFlowTest(test.TestCase):
       op = state_ops.assign(v, 9)
       v2 = control_flow_ops.with_dependencies([op], v)
 
-      self.assertTrue(check_op_order(v.graph))
       self.assertTrue(isinstance(v2, ops.Tensor))
       variables.global_variables_initializer().run()
       self.assertEqual(9, v2.eval())
@@ -151,7 +144,7 @@ class ControlFlowTest(test.TestCase):
 
       enter_v = control_flow_ops._Enter(v, "foo_1", is_constant=True)
       nine = constant_op.constant(9)
-      enter_nine = control_flow_ops.enter(nine, "foo_1")
+      enter_nine = gen_control_flow_ops._enter(nine, "foo_1")
       op = state_ops.assign(enter_v, enter_nine)
       v2 = control_flow_ops.with_dependencies([op], enter_v)
       v3 = control_flow_ops.exit(v2)
@@ -171,9 +164,9 @@ class ControlFlowTest(test.TestCase):
   def testEnterMulExit(self):
     with self.test_session():
       data = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")
-      enter_data = control_flow_ops.enter(data, "foo_1", False)
+      enter_data = gen_control_flow_ops._enter(data, "foo_1", False)
       five = constant_op.constant(5)
-      enter_five = control_flow_ops.enter(five, "foo_1", False)
+      enter_five = gen_control_flow_ops._enter(five, "foo_1", False)
       mul_op = math_ops.multiply(enter_data, enter_five)
       exit_op = control_flow_ops.exit(mul_op)
 
@@ -185,12 +178,13 @@ class ControlFlowTest(test.TestCase):
       v = variables.Variable([0.0, 0.0], dtype=dtypes.float32)
 
       # If is_constant=True, the shape information should be propagated.
-      enter_v_constant = control_flow_ops.enter(v, "frame1", is_constant=True)
+      enter_v_constant = gen_control_flow_ops._enter(
+          v, "frame1", is_constant=True)
       self.assertEqual(enter_v_constant.shape, [2])
 
       # Otherwise, the shape should be unknown.
-      enter_v_non_constant = control_flow_ops.enter(v, "frame2",
-                                                    is_constant=False)
+      enter_v_non_constant = gen_control_flow_ops._enter(
+          v, "frame2", is_constant=False)
       self.assertEqual(enter_v_non_constant.shape, None)
 
   def testSwitchMergeIndexedSlices(self):
@@ -263,8 +257,8 @@ class ControlFlowTest(test.TestCase):
       false = ops.convert_to_tensor(False)
       n = constant_op.constant(10)
 
-      enter_false = control_flow_ops.enter(false, "foo_1", False)
-      enter_n = control_flow_ops.enter(n, "foo_1", False)
+      enter_false = gen_control_flow_ops._enter(false, "foo_1", False)
+      enter_n = gen_control_flow_ops._enter(n, "foo_1", False)
 
       merge_n = control_flow_ops.merge([enter_n, enter_n], name="merge_n")[0]
       switch_n = control_flow_ops.switch(merge_n, enter_false)
@@ -281,9 +275,9 @@ class ControlFlowTest(test.TestCase):
       one = constant_op.constant(1)
       n = constant_op.constant(10)
 
-      enter_i = control_flow_ops.enter(zero, "foo", False)
-      enter_one = control_flow_ops.enter(one, "foo", True)
-      enter_n = control_flow_ops.enter(n, "foo", True)
+      enter_i = gen_control_flow_ops._enter(zero, "foo", False)
+      enter_one = gen_control_flow_ops._enter(one, "foo", True)
+      enter_n = gen_control_flow_ops._enter(n, "foo", True)
 
       with ops.device(test.gpu_device_name()):
         merge_i = control_flow_ops.merge([enter_i, enter_i])[0]
@@ -307,9 +301,9 @@ class ControlFlowTest(test.TestCase):
       one = constant_op.constant(1)
       n = constant_op.constant(10)
 
-      enter_i = control_flow_ops.enter(zero, "foo", False)
-      enter_one = control_flow_ops.enter(one, "foo", True)
-      enter_n = control_flow_ops.enter(n, "foo", True)
+      enter_i = gen_control_flow_ops._enter(zero, "foo", False)
+      enter_one = gen_control_flow_ops._enter(one, "foo", True)
+      enter_n = gen_control_flow_ops._enter(n, "foo", True)
 
       merge_i = control_flow_ops.merge([enter_i, enter_i])[0]
 
@@ -330,8 +324,8 @@ class ControlFlowTest(test.TestCase):
   def testDifferentFrame(self):
     with self.test_session():
       data = array_ops.placeholder(dtypes.float32, shape=[])
-      enter_1 = control_flow_ops.enter(data, "foo_1", False)
-      enter_2 = control_flow_ops.enter(data, "foo_2", False)
+      enter_1 = gen_control_flow_ops._enter(data, "foo_1", False)
+      enter_2 = gen_control_flow_ops._enter(data, "foo_2", False)
       res = math_ops.add(enter_1, enter_2)
       with self.assertRaisesOpError("has inputs from different frames"):
         res.eval(feed_dict={data: 1.0})
@@ -396,7 +390,6 @@ class ControlFlowTest(test.TestCase):
 
       val = r.values.eval()
       ind = r.indices.eval()
-    self.assertTrue(check_op_order(x.values.graph))
     self.assertAllEqual(11, val)
     self.assertAllEqual(0, ind)
 
@@ -443,7 +436,6 @@ class ControlFlowTest(test.TestCase):
 
       val = r.values.eval()
       ind = r.indices.eval()
-    self.assertTrue(check_op_order(x.values.graph))
     self.assertAllEqual(11, val)
     self.assertAllEqual(0, ind)
     self.assertTrue(ind.dtype == np.int64)
@@ -472,7 +464,6 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(pred, fn1, fn2)
 
       result = r.eval()
-    self.assertTrue(check_op_order(x.graph))
     self.assertAllEqual(11, result)
 
   def testCond_1(self):
@@ -486,7 +477,6 @@ class ControlFlowTest(test.TestCase):
           math_ops.less(1, 0), lambda: math_ops.add(x, 1),
           lambda: math_ops.subtract(x, 1))
       result = r.eval()
-    self.assertTrue(check_op_order(x.graph))
     self.assertAllEqual(9, result)
 
   def testCond_3(self):
@@ -499,7 +489,6 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(pred, fn3, fn2)
 
       result = r.eval()
-    self.assertTrue(check_op_order(x.graph))
     self.assertAllEqual(12, result)
 
   def testCond_4(self):
@@ -518,7 +507,6 @@ class ControlFlowTest(test.TestCase):
       variables.global_variables_initializer().run()
       self.assertEqual(len(r), 2)
       result = r[1].eval()
-      self.assertTrue(check_op_order(age.graph))
       self.assertAllEqual(True, result)
       self.assertAllEqual(7, v1.eval())
       self.assertAllEqual(2, v2.eval())
@@ -716,6 +704,36 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [n], parallel_iterations=20)
       self.assertEqual(10000, r.eval())
 
+  def testWhileExternalControlDependencies(self):
+    with self.test_session():
+      v = variables.Variable(0.0)
+      v.initializer.run()
+      increment = v.assign_add(1.0)
+
+      def body_fn(i):
+        with ops.control_dependencies([increment]):
+          return i + i
+
+      result = control_flow_ops.while_loop(cond=lambda i: i < 1,
+                                           body=body_fn, loop_vars=[1])
+      result.eval()
+      self.assertAllEqual(v.eval(), 1.0)
+
+  def testWhileExternalControlDependenciesNoInput(self):
+    with self.test_session():
+      v = variables.Variable(0.0)
+      v.initializer.run()
+      increment = v.assign_add(1.0)
+
+      def body_fn(unused_i):
+        with ops.control_dependencies([increment]):
+          return constant_op.constant(5, name="five")
+
+      result = control_flow_ops.while_loop(cond=lambda i: i < 5,
+                                           body=body_fn, loop_vars=[0])
+      result.eval()
+      self.assertAllEqual(v.eval(), 1.0)
+
   def testWhileWithRefs_1(self):
     with self.test_session() as sess:
       x = variables.Variable(0)._ref()  # pylint: disable=protected-access
@@ -746,6 +764,184 @@ class ControlFlowTest(test.TestCase):
       r = isum(s)
       self.assertAllEqual(45, r.eval())
 
+  def testWhileWithMaximumIterations(self):
+    with self.test_session():
+      s = constant_op.constant([1, 2, 3, 4, 5])
+      r = isum(s, maximum_iterations=3)
+      self.assertAllEqual([1 + 3, 2 + 3, 3 + 3, 4 + 3, 5 + 3], r.eval())
+
+  def testWhileWithMaximumIterationsAndSingleArgument(self):
+    with self.test_session():
+      r = control_flow_ops.while_loop(
+          lambda i: i < 3, lambda i: i + 1, [0], maximum_iterations=1)
+      self.assertEqual(1, r.eval())
+
+  def testSingleNestedMaximumIterationsWhileLoopGradientInXLAContext(self):
+    v = constant_op.constant(1.0)
+
+    def training_loop_with_gradient(i):
+      out = control_flow_ops.while_loop(
+          lambda i_, _: i_ < 3,
+          lambda i_, j: [i_ + 1, j * v], [0, 1.0],
+          maximum_iterations=i)
+      g = gradients_impl.gradients(out, v)
+      with ops.control_dependencies(g):
+        return i + 1
+
+    xla_context = control_flow_ops.XLAControlFlowContext()
+    xla_context.Enter()
+    # Create training loop, ensure we can call gradient() of
+    # while_loop inside the training loop.
+    loop = control_flow_ops.while_loop(lambda i: i < 3,
+                                       training_loop_with_gradient, [0])
+    xla_context.Exit()
+
+    loop_execute = array_ops.identity(loop)  # Because loop is not fetchable.
+
+    # Should execute without issue.
+    self.assertEqual(3, self.evaluate(loop_execute))
+
+  def testInvalidMaximumIterationsWhileLoopGradientInXLAContext(self):
+    v = constant_op.constant(1.0)
+
+    def inner_body(i, x):
+      out = control_flow_ops.while_loop(
+          lambda i, _: i < 3,
+          lambda i, j: [i + 1, j * v], [0, x],
+          maximum_iterations=i)
+      return out
+
+    def create_while_loop(maximum_iterations=None):
+      return control_flow_ops.while_loop(
+          lambda i, _: i < 3,
+          inner_body, [0, 1.0],
+          maximum_iterations=maximum_iterations)
+
+    loop_no_xla = create_while_loop(maximum_iterations=5)
+    # maximum_iterations is fine outside of an XLA scope
+    gs = gradients_impl.gradients(loop_no_xla, v)
+    self.evaluate(gs)  # This should execute without error.
+
+    xla_context = control_flow_ops.XLAControlFlowContext()
+    xla_context.Enter()
+    loop_no_maxiter = create_while_loop()
+    loop_with_maxiter = create_while_loop(maximum_iterations=2)
+    xla_context.Exit()
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        r"Cannot create a gradient accumulator for tensor '.+' inside "
+        r"XLA while_loop because maximum_iterations was not passed to "
+        r"the tf.while_loop call \('.+'\)."):
+      _ = gradients_impl.gradients(loop_no_maxiter, v)
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        r"Cannot create a gradient accumulator for tensor '.+' inside XLA "
+        r"while_loop. maximum_iterations tensor '.+' for while_loop context "
+        r"'.+' must be statically known \(e.g. a constant value or known "
+        r"shape dimension\), or be defined at or outside the while loop "
+        r"context '.*' \(currently defined in '.*'\)"):
+      _ = gradients_impl.gradients(loop_with_maxiter, v)
+
+  def testInvalidMaximumIterationsFromSiblingContextWhileLoopInXLAContext(self):
+    v = constant_op.constant(1.0)
+
+    def create_while_loop():
+      max_iter_holder = []
+
+      def create_mi():
+        max_iter_holder.append(array_ops.placeholder(dtypes.int32, shape=()))
+        return 1.0
+
+      _ = control_flow_ops.cond(
+          constant_op.constant(True), create_mi, create_mi)
+
+      return control_flow_ops.while_loop(
+          lambda i, _: i < 3,
+          lambda i, x: (i + 1, v * x), (0, 1.0),
+          maximum_iterations=max_iter_holder[0])
+
+    xla_context = control_flow_ops.XLAControlFlowContext()
+    xla_context.Enter()
+    loop = create_while_loop()
+    xla_context.Exit()
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        r"Cannot create a gradient accumulator for tensor '.+' inside XLA "
+        r"while_loop. maximum_iterations tensor '.*Placeholder:0' for "
+        r"while_loop context '.+' must be statically known \(e.g. a constant "
+        r"value or known shape dimension\), or be defined at or outside the "
+        r"while loop context '' \(currently defined in 'cond/.+'\)"):
+      _ = gradients_impl.gradients(loop, v)
+
+  def testNestedWhileLoopWithMaxItersFromOuterContextInXLAContext(self):
+    v = constant_op.constant(1.0)
+
+    p = array_ops.placeholder(dtype=dtypes.int32)
+
+    def mid_body_builder(iterations):
+
+      def mid_body(i, x):
+        r = control_flow_ops.while_loop(
+            lambda *_: True,
+            lambda i, x: (i + 1, v * x), (0, x),
+            maximum_iterations=iterations,
+            name="inner")
+        return (i + 1, gradients_impl.gradients(x + r[1], v)[0])
+
+      return mid_body
+
+    def outer_body(i, x):
+      iterations = array_ops.size(p, name="iterations")
+      return (i + 1, x + control_flow_ops.while_loop(
+          lambda *_: True,
+          mid_body_builder(iterations), (0, x),
+          maximum_iterations=iterations,
+          name="mid")[1])
+
+    def create_while_loop():
+      with ops.device("/cpu:0"):
+        r = control_flow_ops.while_loop(
+            lambda *_: True,
+            outer_body, (0, 1.0),
+            maximum_iterations=5,
+            name="outer")
+        return array_ops.identity(r[1])
+
+    xla_context = control_flow_ops.XLAControlFlowContext()
+    xla_context.Enter()
+    final_with_xla_context = create_while_loop()
+    xla_context.Exit()
+
+    final_without_xla_context = create_while_loop()
+
+    with self.test_session(use_gpu=False) as sess:
+      opts = config_pb2.RunOptions(trace_level=config_pb2.RunOptions.FULL_TRACE)
+      run_metadata = config_pb2.RunMetadata()
+
+      final_value_without_xla_context = sess.run(
+          final_without_xla_context, feed_dict={
+              p: [0, 0, 0]
+          })
+
+      final_value_with_xla_context = sess.run(
+          final_with_xla_context,
+          feed_dict={p: [0, 0, 0]},
+          options=opts,
+          run_metadata=run_metadata)
+
+      node_stats = run_metadata.step_stats.dev_stats[0].node_stats
+      stack_push_count = len(
+          [x for x in node_stats if x.node_name.endswith("StackPushV2")])
+      # Pushes to the stack = product of maximum_iterations values;
+      # the last two "3"s comes from size(p), when p == [0, 0, 0].
+      self.assertEqual(stack_push_count, 5 * 3 * 3)
+
+      self.assertAllClose(final_value_with_xla_context,
+                          final_value_without_xla_context)
+
   # Have more than 10 parallel iterations and hence exercise k-bound
   # most of the time.
   def testWhile_3(self):
@@ -766,7 +962,6 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(lambda i, m, c, o: math_ops.less(i, d),
                                       compute, [i, m, c, o])
       result = r[3].eval()
-    self.assertTrue(check_op_order(i.graph))
     self.assertAllEqual(10100, result)
 
   def testWhile_4(self):
@@ -788,15 +983,13 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(lambda i, m, c, o: math_ops.less(i, s),
                                       compute, [i, m, c, o])
       result = r[3].eval()
-    self.assertTrue(check_op_order(i.graph))
     self.assertAllEqual(42, result)
 
   def testWhile_5(self):
     with self.test_session():
 
       def compute(i, c, o):
-        c = array_ops.strided_slice(x,
-                                    array_ops.expand_dims(i, 0),
+        c = array_ops.strided_slice(x, array_ops.expand_dims(i, 0),
                                     [1] + array_ops.expand_dims(i, 0))
         o = array_ops.concat([o, c], 0)
         i = math_ops.add(i, 1)
@@ -807,13 +1000,13 @@ class ControlFlowTest(test.TestCase):
       o = ops.convert_to_tensor([0])
       x = ops.convert_to_tensor([1, 2, 3, 4, 5, 6])
       s = array_ops.size(x)
-      r = control_flow_ops.while_loop(
-          lambda i, c, o: math_ops.less(i, s), compute, [i, c, o], [
-              i.get_shape(), tensor_shape.unknown_shape(),
-              tensor_shape.unknown_shape()
-          ])
+      r = control_flow_ops.while_loop(lambda i, c, o: math_ops.less(i, s),
+                                      compute, [i, c, o], [
+                                          i.get_shape(),
+                                          tensor_shape.unknown_shape(),
+                                          tensor_shape.unknown_shape()
+                                      ])
       result = r[2].eval()
-    self.assertTrue(check_op_order(i.graph))
     self.assertAllEqual(np.array([0, 1, 2, 3, 4, 5, 6]), result)
 
   def testBufferForwarding(self):
@@ -878,7 +1071,8 @@ class ControlFlowTest(test.TestCase):
         return [new_i, new_j]
 
       r = control_flow_ops.while_loop(
-          c, _b, [i, m], [i.get_shape(), tensor_shape.unknown_shape()])
+          c, _b, [i, m],
+          [i.get_shape(), tensor_shape.unknown_shape()])
       r = r[1] * array_ops.ones([8, 8])
       self.assertAllEqual(np.ones((8, 8)), r.eval())
 
@@ -910,11 +1104,18 @@ class ControlFlowTest(test.TestCase):
         return [new_i, new_j]
 
       r = control_flow_ops.while_loop(
-          c, b, [i, m], [i.get_shape(), tensor_shape.TensorShape([None, 2])])
+          c, b, [i, m],
+          [i.get_shape(), tensor_shape.TensorShape([None, 2])])
       self.assertTrue(r[1].get_shape()[0].value is None)
       self.assertEqual(r[1].get_shape()[1], tensor_shape.Dimension(2))
 
-      with self.assertRaisesRegexp(ValueError, "not an invariant for"):
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"The shape for while_1/Merge_1:0 is not an invariant for the loop. "
+          r"It enters the loop with shape \(2, 2\), but has shape \(4, 2\) "
+          r"after one iteration. Provide shape invariants using either the "
+          r"`shape_invariants` argument of tf.while_loop or set_shape\(\) on "
+          r"the loop variables."):
         r = control_flow_ops.while_loop(c, b, [i, m])
 
   def testWhileShapeInferenceSparseTensor(self):
@@ -931,20 +1132,22 @@ class ControlFlowTest(test.TestCase):
 
       def b(i, x):
         return [
-            i + 1, sparse_tensor.SparseTensor(x.indices, x.values * 2.0,
-                                              x.dense_shape)
+            i + 1,
+            sparse_tensor.SparseTensor(x.indices, x.values * 2.0, x.dense_shape)
         ]
 
       _, r = control_flow_ops.while_loop(c, b, [i, x])
       self.assertEqual(r.dense_shape.get_shape()[0].value, 1)
 
       _, r = control_flow_ops.while_loop(
-          c, b, [i, x], [i.get_shape(), tensor_shape.TensorShape([None])])
+          c, b, [i, x],
+          [i.get_shape(), tensor_shape.TensorShape([None])])
       self.assertTrue(r.dense_shape.get_shape()[0].value is None)
 
       with self.assertRaisesRegexp(ValueError, "is not compatible with"):
         _, r = control_flow_ops.while_loop(
-            c, b, [i, x], [i.get_shape(), tensor_shape.TensorShape([5])])
+            c, b, [i, x],
+            [i.get_shape(), tensor_shape.TensorShape([5])])
 
   def testWhileShapeInferenceIndexedSlices(self):
     with self.test_session():
@@ -959,7 +1162,8 @@ class ControlFlowTest(test.TestCase):
 
       def b(i, x):
         return [
-            i + 1, ops.IndexedSlices(x.values * 2.0, x.indices, x.dense_shape)
+            i + 1,
+            ops.IndexedSlices(x.values * 2.0, x.indices, x.dense_shape)
         ]
 
       _, r = control_flow_ops.while_loop(c, b, [i, x])
@@ -967,14 +1171,16 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(r.values.get_shape(), tensor_shape.TensorShape([2, 2]))
 
       _, r = control_flow_ops.while_loop(
-          c, b, [i, x], [i.get_shape(), tensor_shape.TensorShape([None, 2])])
+          c, b, [i, x],
+          [i.get_shape(), tensor_shape.TensorShape([None, 2])])
       self.assertEqual(r.dense_shape.get_shape()[0].value, 2)
       self.assertTrue(r.values.get_shape()[0].value is None)
       self.assertEqual(r.values.get_shape()[1].value, 2)
 
       with self.assertRaisesRegexp(ValueError, "is not compatible with"):
         _, r = control_flow_ops.while_loop(
-            c, b, [i, x], [i.get_shape(), tensor_shape.TensorShape([None, 5])])
+            c, b, [i, x],
+            [i.get_shape(), tensor_shape.TensorShape([None, 5])])
 
   def _testNestedWhile_1(self, use_gpu):
     with self.test_session(use_gpu=use_gpu):
@@ -1115,16 +1321,17 @@ class ControlFlowTest(test.TestCase):
           "v", [], initializer=init_ops.constant_initializer(2))
       i0 = constant_op.constant(0)
       with ops.control_dependencies([i0]):
+
         def loop_condition(i):
           return i < 4
 
         def loop_body(i):
           some_cond = control_flow_ops.cond(
               constant_op.constant(True),
-              lambda: state_ops.assign(v, math_ops.square(v)),
-              lambda: v)
+              lambda: state_ops.assign(v, math_ops.square(v)), lambda: v)
           with ops.control_dependencies([some_cond]):
             return i + 1
+
       r = control_flow_ops.while_loop(loop_condition, loop_body, (i0,))
       variables.global_variables_initializer().run()
       self.assertEqual(4, r.eval())
@@ -1253,7 +1460,6 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(
           loop_iterator, loop_body, [n], parallel_iterations=1)
-      self.assertTrue(check_op_order(n.graph))
       variables.global_variables_initializer().run()
       self.assertEqual(3, r.eval())
       result = select.eval()
@@ -1278,7 +1484,6 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(
           loop_iterator, loop_body, [n], parallel_iterations=1)
-      self.assertTrue(check_op_order(n.graph))
       variables.global_variables_initializer().run()
       self.assertEqual(3, r.eval())
       result1 = select1.eval()
@@ -1305,7 +1510,6 @@ class ControlFlowTest(test.TestCase):
           parallel_iterations=1)
       variables.global_variables_initializer().run()
       result = r[1].eval()
-    self.assertTrue(check_op_order(n.graph))
     self.assertAllClose(np.array([10.0, 10.0, 10.0]), result)
 
   # b/24814703
@@ -1442,7 +1646,8 @@ class ControlFlowTest(test.TestCase):
 
       _, rx = control_flow_ops.while_loop(
           c1,
-          b1, [r, x], [r.get_shape(), tensor_shape.unknown_shape()],
+          b1, [r, x],
+          [r.get_shape(), tensor_shape.unknown_shape()],
           parallel_iterations=1)
       self.assertEqual(45, rx.eval())
 
@@ -1450,7 +1655,8 @@ class ControlFlowTest(test.TestCase):
     gpu_dev_name = test.gpu_device_name() if test.is_gpu_available(
     ) else "/device:GPU:0"
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    graph = ops.Graph()
+    with graph.as_default():
       v = constant_op.constant(2.0, name="v")
       c = lambda v: math_ops.less(v, 100.0)
 
@@ -1461,7 +1667,8 @@ class ControlFlowTest(test.TestCase):
       loop = control_flow_ops.while_loop(c, b, [v], parallel_iterations=1)
       r = gradients_impl.gradients(
           loop, v, colocate_gradients_with_ops=colocate)[0]
-    r_ops = r.graph.get_operations()
+
+    r_ops = graph.get_operations()
     r_devices = [(op.name, op.device) for op in r_ops]
 
     self.assertTrue(any("Square" in op.name for op in r_ops))
@@ -1475,7 +1682,9 @@ class ControlFlowTest(test.TestCase):
         self.assertTrue(gpu_dev_name in dev)
       else:
         self.assertFalse(gpu_dev_name in dev)
-    self.assertAllClose(1024.0, sess.run(r))
+
+    with self.test_session(graph=graph) as sess:
+      self.assertAllClose(1024.0, sess.run(r))
 
   def testWhileGrad_ColocateGradients(self):
     self._testWhileGrad_ColocateGradients(colocate=False)
@@ -1501,7 +1710,8 @@ class ControlFlowTest(test.TestCase):
       b = lambda i, v: [i + 1, math_ops.multiply(x, v)]
       r = control_flow_ops.while_loop(
           c,
-          b, [n, v], [n.get_shape(), tensor_shape.unknown_shape()],
+          b, [n, v],
+          [n.get_shape(), tensor_shape.unknown_shape()],
           parallel_iterations=1)
 
       r = gradients_impl.gradients(r[1], x)[0]
@@ -1635,8 +1845,8 @@ class ControlFlowTest(test.TestCase):
       named = collections.namedtuple("named", ("a", "b"))
       loop_vars = [
           named(a=constant_op.constant(0.0), b=constant_op.constant(1.0)),
-          (constant_op.constant(2.0),
-           constant_op.constant(3.0)), constant_op.constant(4.0)
+          (constant_op.constant(2.0), constant_op.constant(3.0)),
+          constant_op.constant(4.0)
       ]
       c = lambda lv0, _1, _2: lv0.a < 100.0
 
@@ -1662,8 +1872,8 @@ class ControlFlowTest(test.TestCase):
       named = collections.namedtuple("named", ("a", "b"))
       loop_vars = [
           named(a=constant_op.constant(0.0), b=constant_op.constant(1.0)),
-          (constant_op.constant(2.0),
-           constant_op.constant(3.0)), constant_op.constant(4.0)
+          (constant_op.constant(2.0), constant_op.constant(3.0)),
+          constant_op.constant(4.0)
       ]
       c = lambda lv0, _1, _2: lv0.a < 100.0
 
@@ -2014,7 +2224,8 @@ class ControlFlowTest(test.TestCase):
 
       def b(i, x):
         return [
-            i + 1, ops.IndexedSlices(x.values * 2.0, x.indices, x.dense_shape)
+            i + 1,
+            ops.IndexedSlices(x.values * 2.0, x.indices, x.dense_shape)
         ]
 
       _, r = control_flow_ops.while_loop(c, b, [i, x])
@@ -2035,8 +2246,8 @@ class ControlFlowTest(test.TestCase):
 
       def b(i, x):
         return [
-            i + 1, sparse_tensor.SparseTensor(x.indices, x.values * 2.0,
-                                              x.dense_shape)
+            i + 1,
+            sparse_tensor.SparseTensor(x.indices, x.values * 2.0, x.dense_shape)
         ]
 
       _, r = control_flow_ops.while_loop(c, b, [i, x])
@@ -2058,8 +2269,8 @@ class ControlFlowTest(test.TestCase):
         x1 = x + gradients_impl.gradients(data, params)[0]
         return i + 1, x1
 
-      output_grad = control_flow_ops.while_loop(c, b,
-                                                [i0, constant_op.constant(0.0)])
+      output_grad = control_flow_ops.while_loop(
+          c, b, [i0, constant_op.constant(0.0)])
       self.assertAllClose(600.0, sess.run(output_grad)[1])
 
   def testWhileAndTensorArray(self):
@@ -2197,9 +2408,12 @@ class ControlFlowTest(test.TestCase):
 
   def testStopGradMultiFlows(self):
     with self.test_session():
+
       def body(i, y, r):
         x = variable_scope.get_variable(
-            "x", shape=(), dtype=dtypes.float32,
+            "x",
+            shape=(),
+            dtype=dtypes.float32,
             initializer=init_ops.ones_initializer())
         y *= x
         return [i + 1, y, r + math_ops.reduce_sum(y)]
@@ -2274,8 +2488,7 @@ class ControlFlowTest(test.TestCase):
       # Duplicate events cause an error if exclusive = True
       r4 = control_flow_ops.case(
           [(x < y, f1), (x < y, f2)], default=f3, exclusive=True)
-      with self.assertRaisesOpError(
-          "More than one condition evaluated as True but exclusive=True."):
+      with self.assertRaisesOpError("Input error:"):
         r4.eval()
 
       # Check that the default is called if none of the others are
@@ -2612,7 +2825,8 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(
           lambda i, v: i < 2, lambda i, v: [i + 1, func(v)],
           [constant_op.constant(0), x],
-          [tensor_shape.unknown_shape(), tensor_shape.unknown_shape()])
+          [tensor_shape.unknown_shape(),
+           tensor_shape.unknown_shape()])
       self.assertEqual(r[1].eval(), 65536.0)
 
       r = gradients_impl.gradients(r, x)[0]
@@ -2622,6 +2836,127 @@ class ControlFlowTest(test.TestCase):
           1)
 
 
+@test_util.with_c_api
+class ControlFlowContextCheckTest(test.TestCase):
+
+  def _getWhileTensor(self):
+    """Creates and returns a tensor from a while context."""
+    tensor = []
+
+    def body(i):
+      if not tensor:
+        tensor.append(constant_op.constant(1))
+      return i + tensor[0]
+
+    control_flow_ops.while_loop(lambda i: i < 10, body, [0])
+    return tensor[0]
+
+  def _getCondTensor(self):
+    cond_tensor = []
+
+    def true_fn():
+      if not cond_tensor:
+        cond_tensor.append(constant_op.constant(1))
+      return cond_tensor[0]
+
+    control_flow_ops.cond(
+        math_ops.less(1, 2), true_fn, lambda: constant_op.constant(0))
+    return cond_tensor[0]
+
+  def testInvalidContext(self):
+    # Accessing a while loop tensor outside of control flow is illegal.
+    while_tensor = self._getWhileTensor()
+    with self.assertRaisesRegexp(
+        ValueError,
+        "Cannot use 'while/Const_1' as input to 'Add' because 'while/Const_1' "
+        "is in a while loop. See info log for more details."):
+      math_ops.add(1, while_tensor)
+
+  def testInvalidContextInCond(self):
+    # Accessing a while loop tensor in cond is illegal.
+    while_tensor = self._getWhileTensor()
+    with self.assertRaisesRegexp(
+        ValueError, "Cannot use 'while/Const_1' as input to 'cond/Add' because "
+        "'while/Const_1' is in a while loop. See info log for more details."):
+      # TODO(skyewm): this passes if we return while_tensor directly instead
+      # of using it as input to another op.
+      control_flow_ops.cond(
+          math_ops.less(1, 2), lambda: math_ops.add(1, while_tensor),
+          lambda: constant_op.constant(0))
+
+  def testInvalidContextInWhile(self):
+    # Accessing a while loop tensor in a different while loop is illegal.
+    while_tensor = self._getWhileTensor()
+    with self.assertRaisesRegexp(
+        ValueError,
+        "Cannot use 'while_1/Add' as input to 'while/Const_1' because they are "
+        "in different while loops. See info log for more details."):
+      control_flow_ops.while_loop(lambda i: i < 10,
+                                  lambda x: math_ops.add(1, while_tensor), [0])
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        "Cannot use 'while_2/NextIteration' as input to 'while/Const_1' "
+        "because they are in different while loops. See info log for more "
+        "details."):
+      control_flow_ops.while_loop(lambda i: i < 10, lambda i: while_tensor, [0])
+
+  def testValidCondContext(self):
+    # Accessing a tensor from a cond context is OK (although dangerous).
+    cond_tensor = self._getCondTensor()
+    math_ops.add(1, cond_tensor)
+
+  def testValidCondContextBranches(self):
+    # Accessing a tensor from a cond context from the other branch's cond
+    # context is OK (although dangerous).
+    cond_tensor = []
+
+    def branch_fn():
+      if not cond_tensor:
+        cond_tensor.append(constant_op.constant(1))
+      return cond_tensor[0]
+
+    control_flow_ops.cond(math_ops.less(1, 2), branch_fn, branch_fn)
+
+  def testValidWhileContext(self):
+    # Accessing a tensor in a nested while is OK.
+    def body(_):
+      c = constant_op.constant(1)
+      return control_flow_ops.while_loop(lambda i: i < 3, lambda i: i + c, [0])
+
+    control_flow_ops.while_loop(lambda i: i < 5, body, [0])
+
+  def testValidNestedContexts(self):
+    # Accessing a tensor from a cond context in a while context, all inside an
+    # outer while context, is OK.
+    def body(_):
+      cond_tensor = self._getCondTensor()
+      # Create another cond containing the while loop for good measure
+      return control_flow_ops.cond(
+          math_ops.less(1, 2),
+          lambda: control_flow_ops.while_loop(lambda i: i < 3,
+                                              lambda i: i + cond_tensor, [0]),
+          lambda: constant_op.constant(0))
+
+    control_flow_ops.while_loop(lambda i: i < 5, body, [0])
+
+  def testInvalidNestedContexts(self):
+    # Accessing a tensor from a while context in a different while context, all
+    # inside a cond context, is illegal.
+    def true_fn():
+      while_tensor = self._getWhileTensor()
+      return control_flow_ops.while_loop(lambda i: i < 3,
+                                         lambda i: i + while_tensor, [0])
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        "Cannot use 'cond/while_1/add' as input to 'cond/while/Const_1' because"
+        " they are in different while loops. See info log for more details."):
+      control_flow_ops.cond(
+          math_ops.less(1, 2), true_fn, lambda: constant_op.constant(0))
+
+
+@test_util.with_c_api
 class TupleTest(test.TestCase):
 
   def testTensors(self):
@@ -2707,6 +3042,7 @@ class TupleTest(test.TestCase):
       self.assertEquals(1, var.eval())
 
 
+@test_util.with_c_api
 class AssertTest(test.TestCase):
 
   def testGuardedAssertDoesNotCopyWhenTrue(self):
@@ -2725,11 +3061,13 @@ class AssertTest(test.TestCase):
       sess.run(unguarded_assert, options=opts, run_metadata=unguarded_metadata)
       guarded_nodestat_names = [
           n.node_name
-          for d in guarded_metadata.step_stats.dev_stats for n in d.node_stats
+          for d in guarded_metadata.step_stats.dev_stats
+          for n in d.node_stats
       ]
       unguarded_nodestat_names = [
           n.node_name
-          for d in unguarded_metadata.step_stats.dev_stats for n in d.node_stats
+          for d in unguarded_metadata.step_stats.dev_stats
+          for n in d.node_stats
       ]
       guarded_memcpy_nodestat_names = [
           n for n in guarded_nodestat_names if "MEMCPYDtoH" in n
@@ -2744,6 +3082,7 @@ class AssertTest(test.TestCase):
       self.assertEqual([], guarded_memcpy_nodestat_names)
 
 
+@test_util.with_c_api
 class WhileOpBenchmark(test.Benchmark):
   """Evaluate the performance of while_loop op."""
 
@@ -2785,6 +3124,7 @@ class WhileOpBenchmark(test.Benchmark):
     Returns:
       The duration of the run in seconds.
     """
+
     def loop_body(i, x):
       with ops.device("/gpu:0"):
         # Always put loop body on GPU.
@@ -2826,7 +3166,7 @@ class WhileOpBenchmark(test.Benchmark):
       start_time = time.time()
       for _ in xrange(num_iters):
         sess.run(r)
-      return (time.time() - start_time)/num_iters
+      return (time.time() - start_time) / num_iters
 
   def benchmarkWhileOpCrossDevicePlacement(self):
     iters = 10
@@ -2857,6 +3197,7 @@ class WhileOpBenchmark(test.Benchmark):
         name="unroll_same_device", iters=iters, wall_time=duration)
 
 
+@test_util.with_c_api
 class EagerTest(test.TestCase):
 
   def testCond(self):
@@ -2872,8 +3213,21 @@ class EagerTest(test.TestCase):
   def testWhileLoop(self):
     with context.eager_mode():
       tensor = constant_op.constant([1, 2, 3, 4, 5])
-      self.assertAllEqual(isum(tensor).numpy(),
-                          [46, 47, 48, 49, 50])
+      self.assertAllEqual(isum(tensor).numpy(), [46, 47, 48, 49, 50])
+
+  def testWhileLoopWithMaxIterations(self):
+    with context.eager_mode():
+      tensor = constant_op.constant([1, 2, 3, 4, 5])
+      self.assertAllEqual(
+          isum(tensor, maximum_iterations=3).numpy(),
+          [1 + 3, 2 + 3, 3 + 3, 4 + 3, 5 + 3])
+
+  def testWhileWithMaximumIterationsAndSingleArgument(self):
+    with context.eager_mode():
+      tensor = constant_op.constant(0)
+      r = control_flow_ops.while_loop(
+          lambda i: i < 3, lambda i: i + 1, [tensor], maximum_iterations=1)
+      self.assertEqual(1, r.numpy())
 
   def testWithDependencies(self):
     with context.eager_mode():
@@ -2899,9 +3253,10 @@ class EagerTest(test.TestCase):
       f2 = lambda: constant_op.constant(23)
       f3 = lambda: constant_op.constant(-1)
 
-      r1 = control_flow_ops.case([(x < y, f1), (x > z, f2)],
-                                 default=f3, exclusive=True)
+      r1 = control_flow_ops.case(
+          [(x < y, f1), (x > z, f2)], default=f3, exclusive=True)
       self.assertAllEqual(r1.numpy(), 17)
 
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/control_flow_util_test.py b/tensorflow/python/kernel_tests/control_flow_util_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..23185eaeece0d56fd83ecdf9e02c778712420465
--- /dev/null
+++ b/tensorflow/python/kernel_tests/control_flow_util_test.py
@@ -0,0 +1,71 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for tensorflow.python.ops.control_flow_util."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import test_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import gen_control_flow_ops
+from tensorflow.python.platform import test
+
+
+class ControlFlowUtilTest(test.TestCase):
+
+  def testIsSwitch(self):
+    switch_false, _ = control_flow_ops.switch(1, True)
+    switch = switch_false.op
+    self.assertTrue(control_flow_util.IsSwitch(switch))
+
+    ref_switch_false, _ = control_flow_ops.ref_switch(test_ops.ref_output(),
+                                                      True)
+    ref_switch = ref_switch_false.op
+    self.assertTrue(control_flow_util.IsSwitch(ref_switch))
+
+    self.assertFalse(control_flow_util.IsSwitch(test_ops.int_output().op))
+
+  def testIsLoopEnter(self):
+    enter = gen_control_flow_ops._enter(1, frame_name="name").op
+    self.assertTrue(control_flow_util.IsLoopEnter(enter))
+    self.assertFalse(control_flow_util.IsLoopConstantEnter(enter))
+
+    ref_enter = gen_control_flow_ops._ref_enter(test_ops.ref_output(),
+                                                frame_name="name").op
+    self.assertTrue(control_flow_util.IsLoopEnter(ref_enter))
+    self.assertFalse(control_flow_util.IsLoopConstantEnter(ref_enter))
+
+    const_enter = gen_control_flow_ops._enter(1, frame_name="name",
+                                              is_constant=True).op
+    self.assertTrue(control_flow_util.IsLoopEnter(const_enter))
+    self.assertTrue(control_flow_util.IsLoopConstantEnter(const_enter))
+
+    self.assertFalse(control_flow_util.IsLoopEnter(test_ops.int_output().op))
+
+  def testIsLoopExit(self):
+    exit_op = control_flow_ops.exit(1).op
+    self.assertTrue(control_flow_util.IsLoopExit(exit_op))
+
+    ref_exit = control_flow_ops.exit(test_ops.ref_output()).op
+    self.assertTrue(control_flow_util.IsLoopExit(ref_exit))
+
+    self.assertFalse(control_flow_util.IsLoopExit(test_ops.int_output().op))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/conv1d_test.py b/tensorflow/python/kernel_tests/conv1d_test.py
index d92797a7d38cbe359d8166ea9ad7c25bd9cd1f4b..e2e6205911caa06b52f21658a91a53d60a0130ff 100644
--- a/tensorflow/python/kernel_tests/conv1d_test.py
+++ b/tensorflow/python/kernel_tests/conv1d_test.py
@@ -30,27 +30,29 @@ from tensorflow.python.platform import test
 class Conv1DTest(test.TestCase):
 
   def testBasic(self):
-    """Test that argument passing to conv2d is handled properly."""
-
-    x = constant_op.constant([1, 2, 3, 4], dtype=dtypes.float32)
-    x = array_ops.expand_dims(x, 0)  # Add batch dimension
-    x = array_ops.expand_dims(x, 2)  # And depth dimension
-    filters = constant_op.constant([2, 1], dtype=dtypes.float32)
-    filters = array_ops.expand_dims(filters, 1)  # in_channels
-    filters = array_ops.expand_dims(filters, 2)  # out_channels
-    # Filters is 2x1x1
-    for stride in [1, 2]:
-      with self.test_session(use_gpu=test.is_gpu_available()):
-        c = nn_ops.conv1d(x, filters, stride, padding="VALID")
-        reduced = array_ops.squeeze(c)
-        output = reduced.eval()
-        if stride == 1:
-          self.assertEqual(len(output), 3)
-          self.assertAllClose(output,
-                              [2 * 1 + 1 * 2, 2 * 2 + 1 * 3, 2 * 3 + 1 * 4])
-        else:
-          self.assertEqual(len(output), 2)
-          self.assertAllClose(output, [2 * 1 + 1 * 2, 2 * 3 + 1 * 4])
+    """Test that argument passing to conv1d is handled properly."""
+    # TODO(yongtang): dtypes.float64 can only be enabled once conv2d support
+    # dtypes.float64, as conv1d implicitly calls conv2d after expand_dims.
+    for dtype in [dtypes.float16, dtypes.float32]:
+      x = constant_op.constant([1, 2, 3, 4], dtype=dtype)
+      x = array_ops.expand_dims(x, 0)  # Add batch dimension
+      x = array_ops.expand_dims(x, 2)  # And depth dimension
+      filters = constant_op.constant([2, 1], dtype=dtype)
+      filters = array_ops.expand_dims(filters, 1)  # in_channels
+      filters = array_ops.expand_dims(filters, 2)  # out_channels
+      # Filters is 2x1x1
+      for stride in [1, 2]:
+        with self.test_session(use_gpu=test.is_gpu_available()):
+          c = nn_ops.conv1d(x, filters, stride, padding="VALID")
+          reduced = array_ops.squeeze(c)
+          output = reduced.eval()
+          if stride == 1:
+            self.assertEqual(len(output), 3)
+            self.assertAllClose(output,
+                                [2 * 1 + 1 * 2, 2 * 2 + 1 * 3, 2 * 3 + 1 * 4])
+          else:
+            self.assertEqual(len(output), 2)
+            self.assertAllClose(output, [2 * 1 + 1 * 2, 2 * 3 + 1 * 4])
 
   def testConv1DTranspose(self):
     with self.test_session():
diff --git a/tensorflow/python/kernel_tests/conv2d_backprop_filter_grad_test.py b/tensorflow/python/kernel_tests/conv2d_backprop_filter_grad_test.py
index 1679857bd5b9c5a9a1fbf89f207befc4382223b1..be299beee48cd8fb058393840eddfe08da1d6d99 100644
--- a/tensorflow/python/kernel_tests/conv2d_backprop_filter_grad_test.py
+++ b/tensorflow/python/kernel_tests/conv2d_backprop_filter_grad_test.py
@@ -42,17 +42,21 @@ class Conv2DBackpropFilterGradTest(test.TestCase):
           filter_shape = [3, 3, 4, 6]
           # Make a convolution op with the current settings, just to easily get
           # the shape of the output.
-          conv_out = nn_ops.conv2d(in_val,
-                                   array_ops.zeros(filter_shape),
-                                   [1, stride, stride, 1], padding)
+          conv_out = nn_ops.conv2d(
+              in_val,
+              array_ops.zeros(filter_shape),
+              strides=[1, stride, stride, 1],
+              padding=padding)
           out_backprop_shape = conv_out.get_shape().as_list()
           out_backprop_val = constant_op.constant(
               2 * np.random.random_sample(out_backprop_shape) - 1,
               dtype=dtypes.float32)
-          output = nn_ops.conv2d_backprop_filter(in_val, filter_shape,
-                                                 out_backprop_val,
-                                                 [1, stride, stride, 1],
-                                                 padding)
+          output = nn_ops.conv2d_backprop_filter(
+              in_val,
+              filter_shape,
+              out_backprop_val,
+              strides=[1, stride, stride, 1],
+              padding=padding)
           err = gradient_checker.compute_gradient_error(
               [in_val, out_backprop_val], [in_shape, out_backprop_shape],
               output, filter_shape)
@@ -60,6 +64,42 @@ class Conv2DBackpropFilterGradTest(test.TestCase):
           err_tolerance = 2e-3
           self.assertLess(err, err_tolerance)
 
+  def testGradientDilatedConv(self):
+    if test.is_gpu_available(cuda_only=True):
+      with self.test_session(use_gpu=True):
+        for padding in ["SAME", "VALID"]:
+          for stride in [1, 2]:
+            np.random.seed(1)
+            in_shape = [5, 8, 6, 4]
+            in_val = constant_op.constant(
+                2 * np.random.random_sample(in_shape) - 1, dtype=dtypes.float32)
+            filter_shape = [3, 3, 4, 6]
+            # Make a convolution op with the current settings,
+            # just to easily get the shape of the output.
+            conv_out = nn_ops.conv2d(
+                in_val,
+                array_ops.zeros(filter_shape),
+                dilations=[1, 2, 2, 1],
+                strides=[1, stride, stride, 1],
+                padding=padding)
+            out_backprop_shape = conv_out.get_shape().as_list()
+            out_backprop_val = constant_op.constant(
+                2 * np.random.random_sample(out_backprop_shape) - 1,
+                dtype=dtypes.float32)
+            output = nn_ops.conv2d_backprop_filter(
+                in_val,
+                filter_shape,
+                out_backprop_val,
+                dilations=[1, 2, 2, 1],
+                strides=[1, stride, stride, 1],
+                padding=padding)
+            err = gradient_checker.compute_gradient_error(
+                [in_val, out_backprop_val], [in_shape, out_backprop_shape],
+                output, filter_shape)
+            print("conv2d_backprop_filter gradient err = %g " % err)
+            err_tolerance = 2e-3
+            self.assertLess(err, err_tolerance)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/conv2d_transpose_test.py b/tensorflow/python/kernel_tests/conv2d_transpose_test.py
index 7d0bc54b6993daff0298f9d76e9e67dfcbfa5711..b692d3da609fd97a55b8f5fce3334b8e9d97c827 100644
--- a/tensorflow/python/kernel_tests/conv2d_transpose_test.py
+++ b/tensorflow/python/kernel_tests/conv2d_transpose_test.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
-from tensorflow.python.client import device_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
@@ -175,7 +174,7 @@ class Conv2DTransposeTest(test.TestCase):
     self.assertLess(err, err_tolerance)
 
   def testConv2DTransposeSingleStrideNCHW(self):
-    # `NCHW` data fomat is only supported for CUDA device.
+    # `NCHW` data format is only supported for CUDA device.
     if test.is_gpu_available(cuda_only=True):
       with self.test_session(use_gpu=True):
         strides = [1, 1, 1, 1]
@@ -210,7 +209,7 @@ class Conv2DTransposeTest(test.TestCase):
                 self.assertAllClose(target, value[n, k, h, w])
 
   def testConv2DTransposeSameNCHW(self):
-    # `NCHW` data fomat is only supported for CUDA device.
+    # `NCHW` data format is only supported for CUDA device.
     if test.is_gpu_available(cuda_only=True):
       with self.test_session(use_gpu=True):
         strides = [1, 1, 2, 2]
@@ -246,7 +245,7 @@ class Conv2DTransposeTest(test.TestCase):
                 self.assertAllClose(target, value[n, k, h, w])
 
   def testConv2DTransposeValidNCHW(self):
-    # `NCHW` data fomat is only supported for CUDA device.
+    # `NCHW` data format is only supported for CUDA device.
     if test.is_gpu_available(cuda_only=True):
       with self.test_session(use_gpu=True):
         strides = [1, 1, 2, 2]
diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py
index 22e5400c3745a735d783fef761276694dc830c32..edfb20d6a2b80cec930ddf696e8f0f69623a4de7 100644
--- a/tensorflow/python/kernel_tests/conv_ops_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_test.py
@@ -18,11 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import os
 import time
 
 import numpy as np
 
+from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.contrib import layers
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import constant_op
@@ -32,6 +34,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_impl
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
@@ -162,8 +165,8 @@ class Conv2DTest(test.TestCase):
       # as we will be using its gradients as reference for fp16 gradients.
       return [dtypes.float32, dtypes.float16]
 
-  def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, strides,
-                            padding, data_format, dtype, use_gpu):
+  def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, dilations,
+                            strides, padding, data_format, dtype, use_gpu):
     """Verifies the output values of the convolution function.
 
     Args:
@@ -171,6 +174,7 @@ class Conv2DTest(test.TestCase):
         [batch, input_rows, input_cols, input_depth].
       filter_in_sizes: Filter tensor dimensions in
         [kernel_rows, kernel_cols, input_depth, output_depth].
+      dilations: Dilated rate: [col_dilation, row_dilation]
       strides: Stride: [col_stride, row_stride]
       padding: Padding type.
       data_format: Format of the data tensors.
@@ -194,11 +198,18 @@ class Conv2DTest(test.TestCase):
       t1 = constant_op.constant(x1, shape=tensor_in_sizes, dtype=dtype)
       t2 = constant_op.constant(x2, shape=filter_in_sizes, dtype=dtype)
       strides = [1] + strides + [1]
+      dilations = [1] + dilations + [1]
       if data_format == "NCHW":
         t1 = test_util.NHWCToNCHW(t1)
         strides = test_util.NHWCToNCHW(strides)
+        dilations = test_util.NHWCToNCHW(dilations)
       conv = nn_ops.conv2d(
-          t1, t2, strides=strides, padding=padding, data_format=data_format)
+          t1,
+          t2,
+          dilations=dilations,
+          strides=strides,
+          padding=padding,
+          data_format=data_format)
       if data_format == "NCHW":
         conv = test_util.NCHWToNHWC(conv)
 
@@ -240,14 +251,87 @@ class Conv2DTest(test.TestCase):
     for i in range(1, len(values)):
       self.assertAllClose(values[0], values[i], rtol=1e-5, atol=1e-5)
 
+  def _ComputeReferenceDilatedConv(self, tensor_in_sizes, filter_in_sizes,
+                                   stride, dilation, padding, data_format,
+                                   use_gpu):
+    total_size_1 = 1
+    total_size_2 = 1
+    for s in tensor_in_sizes:
+      total_size_1 *= s
+    for s in filter_in_sizes:
+      total_size_2 *= s
+
+    # Initializes the input tensor with array containing incrementing
+    # numbers from 1.
+    x1 = [f * 1.0 for f in range(1, total_size_1 + 1)]
+    x2 = [f * 1.0 for f in range(1, total_size_2 + 1)]
+    with test_util.device(use_gpu):
+      t1 = constant_op.constant(x1, shape=tensor_in_sizes)
+      t2 = constant_op.constant(x2, shape=filter_in_sizes)
+      if isinstance(stride, collections.Iterable):
+        strides = list(stride)
+      else:
+        strides = [stride, stride]
+      if data_format == "NCHW":
+        t1 = test_util.NHWCToNCHW(t1)
+        full_strides = [1, 1] + strides
+        full_dilation = [1, 1] + dilation
+      else:
+        full_strides = [1] + strides + [1]
+        full_dilation = [1] + dilation + [1]
+      expected = nn_ops.convolution(
+          t1,
+          t2,
+          padding=padding,
+          strides=strides,
+          dilation_rate=dilation,
+          data_format=data_format)
+      computed = nn_ops.conv2d(
+          t1,
+          t2,
+          strides=full_strides,
+          dilations=full_dilation,
+          padding=padding,
+          data_format=data_format)
+      if data_format == "NCHW":
+        expected = test_util.NCHWToNHWC(expected)
+        computed = test_util.NCHWToNHWC(computed)
+    return expected, computed
+
+  def _VerifyDilatedConvValues(self, tensor_in_sizes, filter_in_sizes, strides,
+                               padding, dilations):
+    expected_results = []
+    computed_results = []
+    default_dilations = (dilations[0] == 1 and dilations[1] == 1)
+    for data_format, use_gpu in GetTestConfigs():
+      # If any dilation rate is larger than 1, only do test on the GPU
+      # because we currently do not have a CPU implementation for arbitrary
+      # dilation rates.
+      if default_dilations or use_gpu:
+        expected, computed = self._ComputeReferenceDilatedConv(
+            tensor_in_sizes, filter_in_sizes, strides, dilations, padding,
+            data_format, use_gpu)
+        expected_results.append(expected)
+        computed_results.append(computed)
+        tolerance = 1e-2 if use_gpu else 1e-5
+        expected_values = self.evaluate(expected_results)
+        computed_values = self.evaluate(computed_results)
+        for e_value, c_value in zip(expected_values, computed_values):
+          print("expected = ", e_value)
+          print("actual = ", c_value)
+          self.assertAllClose(
+              e_value.flatten(), c_value.flatten(), atol=tolerance, rtol=1e-4)
+
   def _VerifyValues(self, tensor_in_sizes, filter_in_sizes, strides, padding,
                     expected):
     tensors = []
+    dilations = [1, 1]
     for (data_format, use_gpu) in GetTestConfigs():
       for dtype in self._DtypesToTest(use_gpu):
         result = self._SetupValuesForDevice(
             tensor_in_sizes,
             filter_in_sizes,
+            dilations,
             strides,
             padding,
             data_format,
@@ -279,6 +363,16 @@ class Conv2DTest(test.TestCase):
         padding="VALID",
         expected=expected_output)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Filter2x1Dilation(self):
+    if test.is_gpu_available(cuda_only=True):
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=[1, 4, 4, 1],
+          filter_in_sizes=[2, 2, 1, 1],
+          strides=[1, 1],
+          dilations=[2, 1],
+          padding="VALID")
+
   @test_util.run_in_graph_and_eager_modes()
   def testConv2DEmpty(self):
     expected_output = []
@@ -289,6 +383,16 @@ class Conv2DTest(test.TestCase):
         padding="VALID",
         expected=expected_output)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2DEmptyDilation(self):
+    if test.is_gpu_available(cuda_only=True):
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=[0, 2, 3, 3],
+          filter_in_sizes=[1, 1, 3, 3],
+          strides=[1, 1],
+          dilations=[2, 1],
+          padding="VALID")
+
   @test_util.run_in_graph_and_eager_modes()
   def testConv2D2x2Filter(self):
     # The outputs are computed using third_party/py/IPython/notebook.
@@ -300,6 +404,16 @@ class Conv2DTest(test.TestCase):
         padding="VALID",
         expected=expected_output)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2FilterDilation(self):
+    if test.is_gpu_available(cuda_only=True):
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=[1, 2, 3, 3],
+          filter_in_sizes=[2, 2, 3, 3],
+          strides=[1, 1],
+          dilations=[1, 2],
+          padding="VALID")
+
   @test_util.run_in_graph_and_eager_modes()
   def testConv2D1x2Filter(self):
     # The outputs are computed using third_party/py/IPython/notebook.
@@ -314,6 +428,16 @@ class Conv2DTest(test.TestCase):
         padding="VALID",
         expected=expected_output)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D1x2FilterDilation(self):
+    if test.is_gpu_available(cuda_only=True):
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=[1, 2, 3, 3],
+          filter_in_sizes=[1, 2, 3, 3],
+          strides=[1, 1],
+          dilations=[2, 1],
+          padding="VALID")
+
   @test_util.run_in_graph_and_eager_modes()
   def testConv2D2x2FilterStride2(self):
     expected_output = [2271.0, 2367.0, 2463.0]
@@ -386,13 +510,23 @@ class Conv2DTest(test.TestCase):
         padding="VALID",
         expected=[50, 60])
 
-    # TODO this currently fails.
-    # self._VerifyValues(tensor_in_sizes=[1, 8, 8, 1],
-    #                   filter_in_sizes=[2, 2, 1, 1],
-    #                   strides=[4, 4], padding="SAME",
-    #                   expected=[72, 112, 392, 432])
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2DKernelSizeMatchesInputSizeDilation(self):
+    if test.is_gpu_available(cuda_only=True):
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=[1, 3, 3, 1],
+          filter_in_sizes=[2, 2, 1, 2],
+          strides=[1, 1],
+          dilations=[2, 2],
+          padding="VALID")
 
-    # Testing for backprops
+  # TODO(yzhwang): this currently fails.
+  # self._VerifyValues(tensor_in_sizes=[1, 8, 8, 1],
+  #                   filter_in_sizes=[2, 2, 1, 1],
+  #                   strides=[4, 4], padding="SAME",
+  #                   expected=[72, 112, 392, 432])
+
+  # Testing for backprops
   def _RunAndVerifyBackpropInput(self, input_sizes, filter_sizes, output_sizes,
                                  strides, padding, expected, data_format,
                                  use_gpu, err):
@@ -663,6 +797,20 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2DBackpropFilterWithEmptyInput(self):
+    expected = [0, 0, 0, 0]
+    for (data_format, use_gpu) in GetTestConfigs():
+      self._RunAndVerifyBackpropFilter(
+          input_sizes=[0, 2, 3, 1],
+          filter_sizes=[2, 2, 1, 1],
+          output_sizes=[0, 1, 2, 1],
+          strides=[1, 1],
+          padding="VALID",
+          expected=expected,
+          data_format=data_format,
+          use_gpu=use_gpu)
+
   @test_util.run_in_graph_and_eager_modes()
   def testConv2D2x2Depth3ValidBackpropFilter(self):
     expected = [
@@ -724,6 +872,255 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  # Testing for backprops
+  def _RunAndVerifyBackpropInputDilation(self, input_sizes, filter_sizes,
+                                         output_sizes, strides, dilations,
+                                         padding, data_format, use_gpu, err):
+    total_input_size = 1
+    total_filter_size = 1
+    for s in input_sizes:
+      total_input_size *= s
+    for s in filter_sizes:
+      total_filter_size *= s
+    # Initializes the input tensor with array containing incrementing
+    # numbers from 1.
+    x1 = [f * 1.0 for f in range(1, total_input_size + 1)]
+    x2 = [f * 1.0 for f in range(1, total_filter_size + 1)]
+    default_dilations = (dilations[0] == 1 and dilations[1] == 1)
+    if default_dilations or use_gpu:
+      with self.test_session(use_gpu=use_gpu) as sess:
+        if data_format == "NCHW":
+          input_sizes = test_util.NHWCToNCHW(input_sizes)
+        t1 = constant_op.constant(x1, shape=input_sizes)
+        t2 = constant_op.constant(x2, shape=filter_sizes)
+        full_strides = [1] + strides + [1]
+        full_dilations = [1] + dilations + [1]
+        if data_format == "NCHW":
+          full_strides = test_util.NHWCToNCHW(full_strides)
+          full_dilations = test_util.NHWCToNCHW(full_dilations)
+        conv_forward = nn_ops.conv2d(
+            t1,
+            t2,
+            strides=full_strides,
+            dilations=full_dilations,
+            padding=padding,
+            data_format=data_format)
+        conv_forward_2 = nn_ops.convolution(
+            t1,
+            t2,
+            padding=padding,
+            strides=strides,
+            dilation_rate=dilations,
+            data_format=data_format)
+        if data_format == "NCHW":
+          conv_forward = test_util.NCHWToNHWC(conv_forward)
+          conv_forward_2 = test_util.NCHWToNHWC(conv_forward_2)
+        conv = gradients_impl.gradients(conv_forward, t1)[0]
+        conv_2 = gradients_impl.gradients(conv_forward_2, t1)[0]
+        # "values" consists of two tensors for two backprops
+        value = sess.run(conv)
+        value_2 = sess.run(conv_2)
+        self.assertShapeEqual(value, conv)
+        self.assertShapeEqual(value_2, conv_2)
+      print("expected = ", value_2)
+      print("actual = ", value)
+      self.assertArrayNear(value_2.flatten(), value.flatten(), err)
+
+  # Testing for backprops
+  def _RunAndVerifyBackpropFilterDilation(self, input_sizes, filter_sizes,
+                                          output_sizes, strides, dilations,
+                                          padding, data_format, use_gpu, err):
+    total_input_size = 1
+    total_filter_size = 1
+    for s in input_sizes:
+      total_input_size *= s
+    for s in filter_sizes:
+      total_filter_size *= s
+    # Initializes the input tensor with array containing incrementing
+    # numbers from 1.
+    x1 = [f * 1.0 for f in range(1, total_input_size + 1)]
+    x2 = [f * 1.0 for f in range(1, total_filter_size + 1)]
+    default_dilations = (dilations[0] == 1 and dilations[1] == 1)
+    if default_dilations or use_gpu:
+      with self.test_session(use_gpu=use_gpu) as sess:
+        if data_format == "NCHW":
+          input_sizes = test_util.NHWCToNCHW(input_sizes)
+        t1 = constant_op.constant(x1, shape=input_sizes)
+        t2 = constant_op.constant(x2, shape=filter_sizes)
+        full_strides = [1] + strides + [1]
+        full_dilations = [1] + dilations + [1]
+        if data_format == "NCHW":
+          full_strides = test_util.NHWCToNCHW(full_strides)
+          full_dilations = test_util.NHWCToNCHW(full_dilations)
+        conv_forward = nn_ops.conv2d(
+            t1,
+            t2,
+            strides=full_strides,
+            dilations=full_dilations,
+            padding=padding,
+            data_format=data_format)
+        conv_forward_2 = nn_ops.convolution(
+            t1,
+            t2,
+            padding=padding,
+            strides=strides,
+            dilation_rate=dilations,
+            data_format=data_format)
+        if data_format == "NCHW":
+          conv_forward = test_util.NCHWToNHWC(conv_forward)
+          conv_forward_2 = test_util.NCHWToNHWC(conv_forward_2)
+        conv = gradients_impl.gradients(conv_forward, t2)[0]
+        conv_2 = gradients_impl.gradients(conv_forward, t2)[0]
+        value = sess.run(conv)
+        value_2 = sess.run(conv_2)
+        self.assertShapeEqual(value, conv)
+        self.assertShapeEqual(value_2, conv_2)
+      print("expected = ", value_2)
+      print("actual = ", value)
+      self.assertArrayNear(value_2.flatten(), value.flatten(), err)
+
+  def testConv2D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropFilterDilation(
+            input_sizes=[1, 3, 6, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 1, 5, 1],
+            strides=[1, 1],
+            dilations=[2, 1],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2D2x2Depth1ValidBackpropFilterDilation1x2(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropFilterDilation(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 1, 2, 1],
+            strides=[1, 1],
+            dilations=[1, 2],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2DEmptyBackpropFilterDilation1x2(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropFilterDilation(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 0],
+            output_sizes=[1, 1, 2, 0],
+            strides=[1, 1],
+            dilations=[1, 2],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2D2x2Depth3ValidBackpropFilterDilation2x2(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropFilterDilation(
+            input_sizes=[1, 3, 4, 3],
+            filter_sizes=[2, 2, 3, 3],
+            output_sizes=[1, 1, 2, 3],
+            strides=[1, 1],
+            dilations=[2, 2],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2DKernelSizeMatchesInputSizeBackpropFilterDilation2x2(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropFilterDilation(
+            input_sizes=[1, 3, 3, 1],
+            filter_sizes=[2, 2, 1, 2],
+            output_sizes=[1, 1, 1, 2],
+            strides=[1, 1],
+            dilations=[2, 2],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2D2x2Depth3ValidBackpropInputStride1x1Dilation2x1(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropInputDilation(
+            input_sizes=[1, 3, 6, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 1, 5, 1],
+            strides=[1, 1],
+            dilations=[2, 1],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2D2x2Depth1ValidBackpropInputDilation1x2(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropInputDilation(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 1, 2, 1],
+            strides=[1, 1],
+            dilations=[1, 2],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2DEmptyBackpropInputDilation1x2(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropInputDilation(
+            input_sizes=[0, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[0, 1, 2, 1],
+            strides=[1, 1],
+            dilations=[1, 2],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2D2x2Depth3ValidBackpropInputDilation2x1(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        # The GPU version of this test is not very stable. So adjusting the
+        # error threshold to 1e-4.
+        self._RunAndVerifyBackpropInputDilation(
+            input_sizes=[1, 3, 2, 3],
+            filter_sizes=[2, 2, 3, 3],
+            output_sizes=[1, 1, 2, 3],
+            strides=[1, 1],
+            dilations=[2, 1],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-4)
+
+  def testConv2DKernelSizeMatchesInputSizeBackpropInputDilation2x2(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropInputDilation(
+            input_sizes=[1, 3, 3, 1],
+            filter_sizes=[2, 2, 1, 2],
+            output_sizes=[1, 1, 1, 2],
+            strides=[1, 1],
+            dilations=[2, 2],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
   # Gradient checkers
   def ConstructAndTestGradient(self, batch, input_rows, input_cols, filter_rows,
                                filter_cols, in_depth, out_depth, stride_rows,
@@ -1126,6 +1523,36 @@ class Conv2DTest(test.TestCase):
                 strides=[1, 1, 1, 1],
                 padding="VALID"))
 
+  def testCPUConv2DNCHWUnimplemented(self):
+    with self.test_session(use_gpu=False):
+      with self.assertRaisesRegexp(errors_impl.UnimplementedError,
+                                   "NHWC tensor format for now"):
+        conv = self._SetupValuesForDevice(
+            tensor_in_sizes=[1, 4, 4, 1],
+            filter_in_sizes=[2, 2, 1, 1],
+            dilations=[1, 1],
+            strides=[1, 1],
+            padding="VALID",
+            data_format="NCHW",
+            dtype=dtypes.float32,
+            use_gpu=False)
+        self.evaluate(conv)
+
+  def testCPUConv2DDilatedUnimplemented(self):
+    with self.test_session(use_gpu=False):
+      with self.assertRaisesRegexp(errors_impl.UnimplementedError,
+                                   "dilated rate of 1 for now"):
+        conv = self._SetupValuesForDevice(
+            tensor_in_sizes=[1, 4, 4, 1],
+            filter_in_sizes=[2, 2, 1, 1],
+            dilations=[2, 1],
+            strides=[1, 1],
+            padding="VALID",
+            data_format="NHWC",
+            dtype=dtypes.float32,
+            use_gpu=False)
+        self.evaluate(conv)
+
 
 class DepthwiseConv2DTest(test.TestCase):
 
@@ -1457,6 +1884,22 @@ def GetInceptionFwdTest(input_size, filter_size, stride, padding,
   return Test
 
 
+def GetInceptionFwdDilatedConvTest(input_size, filter_size, stride, padding):
+
+  def Test(self):
+    if test.is_gpu_available(cuda_only=True) and stride == 1:
+      tf_logging.info("Testing InceptionFwd with dilations %s",
+                      (input_size, filter_size, stride, padding))
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=input_size,
+          filter_in_sizes=filter_size,
+          strides=[stride, stride],
+          dilations=[2, 2],
+          padding=padding)
+
+  return Test
+
+
 def GetInceptionBackInputTest(input_size, filter_size, output_size, stride,
                               padding,
                               gpu_only=False):
@@ -1497,6 +1940,10 @@ if __name__ == "__main__":
             test_util.run_in_graph_and_eager_modes()(
                 GetInceptionFwdTest(input_size_, filter_size_, stride_,
                                     padding_)))
+    setattr(
+        Conv2DTest, "testInceptionFwdDilatedConv_" + str(index),
+        test_util.run_in_graph_and_eager_modes()(GetInceptionFwdDilatedConvTest(
+            input_size_, filter_size_, stride_, padding_)))
     setattr(Conv2DTest, "testInceptionBackInput_" + str(index),
             test_util.run_in_graph_and_eager_modes()(
                 GetInceptionBackInputTest(input_size_, filter_size_,
@@ -1519,6 +1966,9 @@ if __name__ == "__main__":
   setattr(Conv2DTest, "testInceptionFwd_No_Winograd_Nonfused",
           test_util.run_in_graph_and_eager_modes()(
               GetInceptionFwdTest(ishape, fshape, 1, "SAME", gpu_only=True)))
+  setattr(Conv2DTest, "testInceptionFwdDilatedConv_No_Winograd_Nonfused",
+          test_util.run_in_graph_and_eager_modes()(
+              GetInceptionFwdDilatedConvTest(ishape, fshape, 1, "SAME")))
   setattr(Conv2DTest, "testInceptionBackInput_No_Winograd_Nonfused",
           test_util.run_in_graph_and_eager_modes()(
               GetInceptionBackInputTest(ishape, fshape, oshape, 1, "SAME",
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index e0c53950e6ccb22f47a1c5a19a62b8373fbe4445..0d9b46c30dbbed20dd940e0427fbf6f6d5415106 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_lib
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
@@ -70,6 +71,7 @@ def _sparsify(x, thresh=0.5, index_dtype=np.int64):
   return sparse_tensor.SparseTensor(
       indices=x_indices, values=x_values, dense_shape=x_shape), x_values
 
+
 def _default_tolerance(dtype):
   """Returns a sensible default tolerance for comparing results of a given
   type"""
@@ -80,7 +82,7 @@ def _default_tolerance(dtype):
   elif dtype in (np.float64, np.complex128):
     return 1e-5
   else:
-    return None # Fail fast for unexpected types
+    return None  # Fail fast for unexpected types
 
 
 class UnaryOpTest(test.TestCase):
@@ -232,10 +234,10 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(k, np.arccos, math_ops.acos)
     self._compareBoth(x, np.arctan, math_ops.atan)
     self._compareBoth(x, np.tan, math_ops.tan)
-    self._compareBoth(
-        y,
-        np.vectorize(self._replace_domain_error_with_inf(math.lgamma)),
-        math_ops.lgamma)
+    self._compareBoth(y,
+                      np.vectorize(
+                          self._replace_domain_error_with_inf(math.lgamma)),
+                      math_ops.lgamma)
     self._compareBoth(x, np.vectorize(math.erf), math_ops.erf)
     self._compareBoth(x, np.vectorize(math.erfc), math_ops.erfc)
 
@@ -297,8 +299,8 @@ class UnaryOpTest(test.TestCase):
     w = x - x.min() + 1.02  # all greater than 1
     y = (x + .5).astype(np.float64)  # no zero
     z = (x + 15.5).astype(np.float64)  # all positive
-    k = np.arange(-0.90, 0.90, 0.35).reshape(1, 3, 2).astype(
-        np.float64)  # between -1 and 1
+    k = np.arange(-0.90, 0.90,
+                  0.35).reshape(1, 3, 2).astype(np.float64)  # between -1 and 1
     self._compareBoth(x, np.abs, math_ops.abs)
     self._compareBoth(x, np.abs, _ABS)
     self._compareBoth(x, np.negative, math_ops.negative)
@@ -321,10 +323,10 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(y, np.sign, math_ops.sign)
     self._compareBoth(x, np.sin, math_ops.sin)
     self._compareBoth(x, np.cos, math_ops.cos)
-    self._compareBoth(
-        y,
-        np.vectorize(self._replace_domain_error_with_inf(math.lgamma)),
-        math_ops.lgamma)
+    self._compareBoth(y,
+                      np.vectorize(
+                          self._replace_domain_error_with_inf(math.lgamma)),
+                      math_ops.lgamma)
     self._compareBoth(x, np.vectorize(math.erf), math_ops.erf)
     self._compareBoth(x, np.vectorize(math.erfc), math_ops.erfc)
     self._compareBoth(x, np.arctan, math_ops.atan)
@@ -361,10 +363,10 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(y, np.sign, math_ops.sign)
     self._compareBoth(x, np.sin, math_ops.sin)
     self._compareBoth(x, np.cos, math_ops.cos)
-    self._compareBoth(
-        y,
-        np.vectorize(self._replace_domain_error_with_inf(math.lgamma)),
-        math_ops.lgamma)
+    self._compareBoth(y,
+                      np.vectorize(
+                          self._replace_domain_error_with_inf(math.lgamma)),
+                      math_ops.lgamma)
     self._compareBoth(x, np.vectorize(math.erf), math_ops.erf)
     self._compareBoth(x, np.vectorize(math.erfc), math_ops.erfc)
 
@@ -405,8 +407,8 @@ class UnaryOpTest(test.TestCase):
     self._compareBothSparse(x, np.sign, math_ops.sign)
 
   def testComplex64Basic(self):
-    x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3,
-                                                    2).astype(np.complex64)
+    x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3, 2).astype(
+        np.complex64)
     y = x + np.complex(0.5, 0.5)  # no zeros
     self._compareBoth(x, np.abs, math_ops.abs)
     self._compareBoth(x, np.abs, _ABS)
@@ -416,7 +418,7 @@ class UnaryOpTest(test.TestCase):
     self._compareCpu(x, np.square, math_ops.square)
     self._compareCpu(y, np.sqrt, math_ops.sqrt)
     self._compareCpu(y, self._rsqrt, math_ops.rsqrt)
-    self._compareCpu(x, np.exp, math_ops.exp)
+    self._compareBoth(x, np.exp, math_ops.exp)
     self._compareCpu(x, np.expm1, math_ops.expm1)
     self._compareCpu(y, np.log, math_ops.log)
     self._compareCpu(y, np.log1p, math_ops.log1p)
@@ -449,8 +451,8 @@ class UnaryOpTest(test.TestCase):
     self._compareBothSparse(y, complex_sign, math_ops.sign)
 
   def testComplex128Basic(self):
-    x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3,
-                                                    2).astype(np.complex128)
+    x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3, 2).astype(
+        np.complex128)
     y = x + np.complex(0.5, 0.5)  # no zeros
     self._compareBoth(x, np.abs, math_ops.abs)
     self._compareBoth(x, np.abs, _ABS)
@@ -460,7 +462,7 @@ class UnaryOpTest(test.TestCase):
     self._compareCpu(x, np.square, math_ops.square)
     self._compareCpu(y, np.sqrt, math_ops.sqrt)
     self._compareCpu(y, self._rsqrt, math_ops.rsqrt)
-    self._compareCpu(x, np.exp, math_ops.exp)
+    self._compareBoth(x, np.exp, math_ops.exp)
     self._compareCpu(x, np.expm1, math_ops.expm1)
     self._compareCpu(y, np.log, math_ops.log)
     self._compareCpu(y, np.log1p, math_ops.log1p)
@@ -804,10 +806,10 @@ class BinaryOpTest(test.TestCase):
     self._compareBoth(x, y, np.mod, _MOD)
 
   def testComplex64Basic(self):
-    x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape(
-        1, 3, 2).astype(np.complex64)
-    y = np.complex(1, 1) * np.linspace(20, -20, 6).reshape(
-        1, 3, 2).astype(np.complex64)
+    x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape(1, 3, 2).astype(
+        np.complex64)
+    y = np.complex(1, 1) * np.linspace(20, -20, 6).reshape(1, 3, 2).astype(
+        np.complex64)
     self._compareBoth(x, y, np.add, math_ops.add)
     self._compareBoth(x, y, np.subtract, math_ops.subtract)
     self._compareBoth(x, y, np.multiply, math_ops.multiply)
@@ -818,10 +820,10 @@ class BinaryOpTest(test.TestCase):
     self._compareBoth(x, y + 0.1, np.true_divide, _TRUEDIV)
 
   def testComplex128Basic(self):
-    x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape(
-        1, 3, 2).astype(np.complex128)
-    y = np.complex(1, 1) * np.linspace(20, -20, 6).reshape(
-        1, 3, 2).astype(np.complex128)
+    x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape(1, 3, 2).astype(
+        np.complex128)
+    y = np.complex(1, 1) * np.linspace(20, -20, 6).reshape(1, 3, 2).astype(
+        np.complex128)
     self._compareBoth(x, y, np.add, math_ops.add)
     self._compareBoth(x, y, np.subtract, math_ops.subtract)
     self._compareBoth(x, y, np.multiply, math_ops.multiply)
@@ -1126,8 +1128,8 @@ class BinaryOpTest(test.TestCase):
 
   def testMismatchedDimensions(self):
     for func in [
-        math_ops.add, math_ops.subtract, math_ops.multiply, math_ops.div,
-        _ADD, _SUB, _MUL, _TRUEDIV, _FLOORDIV
+        math_ops.add, math_ops.subtract, math_ops.multiply, math_ops.div, _ADD,
+        _SUB, _MUL, _TRUEDIV, _FLOORDIV
     ]:
       with self.assertRaisesWithPredicateMatch(
           ValueError, lambda e: "Dimensions must" in str(e)):
@@ -1160,14 +1162,40 @@ class BinaryOpTest(test.TestCase):
                    (1.2345, float("inf")), (1.2345, -float("inf")),
                    (-4.321, float("inf")), (-4.125, -float("inf")),
                    (float("inf"), float("inf")), (float("inf"), -float("inf")),
-                   (-float("inf"), float("inf")), (-float("inf"),
-                                                   -float("inf")))
+                   (-float("inf"), float("inf")),
+                   (-float("inf"), -float("inf")))
     for dtype in np.float32, np.float64:
       x1 = np.array(x1l).astype(dtype)
       x2 = np.array(x2l).astype(dtype)
       self._compareCpu(x1, x2, np.arctan2, math_ops.atan2)
       self._compareGpu(x1, x2, np.arctan2, math_ops.atan2)
 
+  def testPowNegativeExponent(self):
+    for dtype in [np.int32, np.int64]:
+      with self.test_session(use_gpu=False) as sess:
+        with self.assertRaisesRegexp(
+            errors_impl.InvalidArgumentError,
+            "Integers to negative integer powers are not allowed"):
+          x = np.array([5, 2]).astype(dtype)
+          y = np.array([-2, 3]).astype(dtype)
+          sess.run(math_ops.pow(x, y))
+
+      with self.test_session(use_gpu=False) as sess:
+        with self.assertRaisesRegexp(
+            errors_impl.InvalidArgumentError,
+            "Integers to negative integer powers are not allowed"):
+          x = np.array([5, 2]).astype(dtype)
+          y = np.array([2, -3]).astype(dtype)
+          sess.run(math_ops.pow(x, y))
+
+      with self.test_session(use_gpu=False) as sess:
+        with self.assertRaisesRegexp(
+            errors_impl.InvalidArgumentError,
+            "Integers to negative integer powers are not allowed"):
+          x = np.array([5, 2]).astype(dtype)
+          y = -3
+          sess.run(math_ops.pow(x, y))
+
 
 class ComparisonOpTest(test.TestCase):
 
@@ -1186,22 +1214,22 @@ class ComparisonOpTest(test.TestCase):
       for x in data:
         for y in data:
           self.assertEqual(self._compareScalar(math_ops.less, x, y, t), x < y)
-          self.assertEqual(self._compareScalar(math_ops.less_equal, x, y, t),
-                           x <= y)
-          self.assertEqual(self._compareScalar(math_ops.greater, x, y, t),
-                           x > y)
+          self.assertEqual(
+              self._compareScalar(math_ops.less_equal, x, y, t), x <= y)
+          self.assertEqual(
+              self._compareScalar(math_ops.greater, x, y, t), x > y)
           self.assertEqual(
               self._compareScalar(math_ops.greater_equal, x, y, t), x >= y)
           self.assertEqual(self._compareScalar(math_ops.equal, x, y, t), x == y)
-          self.assertEqual(self._compareScalar(math_ops.not_equal, x, y, t),
-                           x != y)
+          self.assertEqual(
+              self._compareScalar(math_ops.not_equal, x, y, t), x != y)
     data = [-1, 0, 1, -1j, 1j, 1 + 1j, 1 - 1j]
     for t in [np.complex64, np.complex128]:
       for x in data:
         for y in data:
           self.assertEqual(self._compareScalar(math_ops.equal, x, y, t), x == y)
-          self.assertEqual(self._compareScalar(math_ops.not_equal, x, y, t),
-                           x != y)
+          self.assertEqual(
+              self._compareScalar(math_ops.not_equal, x, y, t), x != y)
 
   def _compare(self, x, y, np_func, tf_func):
     np_ans = np_func(x, y)
@@ -1284,8 +1312,8 @@ class ComparisonOpTest(test.TestCase):
     self._testBCastByFunc(np.equal, math_ops.equal, include_complex=True)
 
   def testBCastNotEqual(self):
-    self._testBCastByFunc(np.not_equal, math_ops.not_equal,
-                          include_complex=True)
+    self._testBCastByFunc(
+        np.not_equal, math_ops.not_equal, include_complex=True)
 
   def testShapeMismatch(self):
     dtypes = [np.float16, np.float32, np.float64, np.int32, np.int64]
@@ -1744,9 +1772,8 @@ class MathOpsOverloadTest(test.TestCase):
   def _compareUnary(self, x, dtype, np_func, tf_func):
     np_ans = np_func(x).astype(dtype.as_numpy_dtype)
     with self.test_session(use_gpu=False):
-      self.assertAllClose(
-          np_ans, tf_func(ops.convert_to_tensor(
-              x, dtype=dtype)).eval())
+      self.assertAllClose(np_ans,
+                          tf_func(ops.convert_to_tensor(x, dtype=dtype)).eval())
 
   def testOverload(self):
     dtypes = [
@@ -1768,8 +1795,8 @@ class MathOpsOverloadTest(test.TestCase):
     ]
     for dtype in dtypes:
       for np_func, tf_func in funcs:
-        if dtype in (dtypes_lib.complex64, dtypes_lib.complex128
-                    ) and tf_func == _FLOORDIV:
+        if dtype in (dtypes_lib.complex64,
+                     dtypes_lib.complex128) and tf_func == _FLOORDIV:
           continue  # floordiv makes no sense for complex
         self._compareBinary(10, 5, dtype, np_func, tf_func)
     # Mod only works for int32 and int64.
@@ -1981,7 +2008,8 @@ class ComplexMakeRealImagTest(test.TestCase):
     # self._compareAngle(cplx, use_gpu=True)
 
   def testRealReal(self):
-    for dtype in dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float32, dtypes_lib.float64:
+    for dtype in (dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float32,
+                  dtypes_lib.float64):
       x = array_ops.placeholder(dtype)
       y = math_ops.real(x)
       self.assertEqual(x, y)
@@ -2010,15 +2038,16 @@ class ComplexMakeRealImagTest(test.TestCase):
     self._compareConj(cplx, use_gpu=True)
 
   def testConjReal(self):
-    for dtype in dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float16, dtypes_lib.float32, dtypes_lib.float64:
+    for dtype in (dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float16,
+                  dtypes_lib.float32, dtypes_lib.float64):
       x = array_ops.placeholder(dtype)
       y = math_ops.conj(x)
       self.assertEqual(x, y)
 
   def testConjString(self):
     x = array_ops.placeholder(dtypes_lib.string)
-    with self.assertRaisesRegexp(
-        TypeError, r"Expected numeric or variant tensor"):
+    with self.assertRaisesRegexp(TypeError,
+                                 r"Expected numeric or variant tensor"):
       math_ops.conj(x)
 
   def _compareGradient(self, x):
@@ -2033,8 +2062,9 @@ class ComplexMakeRealImagTest(test.TestCase):
       real, imag = array_ops.reshape(real, [-1]), array_ops.reshape(imag, [-1])
       cplx = math_ops.complex(real, imag)
       cplx = math_ops.conj(cplx)
-      loss = math_ops.reduce_sum(math_ops.square(math_ops.real(
-          cplx))) + math_ops.reduce_sum(math_ops.square(math_ops.imag(cplx)))
+      loss = math_ops.reduce_sum(math_ops.square(
+          math_ops.real(cplx))) + math_ops.reduce_sum(
+              math_ops.square(math_ops.imag(cplx)))
       epsilon = 1e-3
       jacob_t, jacob_n = gradient_checker.compute_gradient(
           inx, list(x.shape), loss, [1], x_init_value=x, delta=epsilon)
@@ -2098,8 +2128,8 @@ class AccumulateTest(test.TestCase):
           np.random.rand(16, 16, 16, 16).astype(np.float32) for _ in range(20)
       ]
       random_tensors = [
-          ops.convert_to_tensor(
-              x, dtype=dtypes_lib.float32) for x in random_arrays
+          ops.convert_to_tensor(x, dtype=dtypes_lib.float32)
+          for x in random_arrays
       ]
       tf_val = math_ops.accumulate_n(random_tensors)
       np_val = random_arrays[0]
diff --git a/tensorflow/python/kernel_tests/decode_bmp_op_test.py b/tensorflow/python/kernel_tests/decode_bmp_op_test.py
index c086f4617064241da98138888e2ce1659d1b3821..35f8f76991a679e4164da4c63bacbe79fb5cd2c2 100644
--- a/tensorflow/python/kernel_tests/decode_bmp_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_bmp_op_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors_impl
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import image_ops
 from tensorflow.python.platform import test
@@ -68,28 +67,68 @@ class DecodeBmpOpTest(test.TestCase):
   def testGrayscale(self):
     img_bytes = [[[255], [0]], [[255], [0]]]
     encoded_bytes = [
-        0x42, 0x40,
-        0x3d, 0, 0, 0,
-        0, 0,
-        0, 0,
-        0x36, 0, 0, 0,
-        0x28, 0, 0, 0,
-        0x2, 0, 0, 0,
-        0x2, 0, 0, 0,
-        0x1, 0,
-        0x8, 0,
-        0, 0, 0, 0,
-        0x10, 0, 0, 0,
-        0x13, 0xb, 0, 0,
-        0x13, 0xb, 0, 0,
-        0, 0, 0, 0,
-        0, 0, 0, 0,
+        0x42,
+        0x40,
+        0x3d,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0x36,
+        0,
+        0,
+        0,
+        0x28,
+        0,
+        0,
+        0,
+        0x2,
+        0,
+        0,
+        0,
+        0x2,
+        0,
+        0,
+        0,
+        0x1,
+        0,
+        0x8,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0x10,
+        0,
+        0,
+        0,
+        0x13,
+        0xb,
+        0,
+        0,
+        0x13,
+        0xb,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
         0xff,
         0,
-        0, 0,
+        0,
+        0,
         0xff,
         0,
-        0, 0,
+        0,
+        0,
     ]
 
     byte_string = bytes(bytearray(encoded_bytes))
@@ -100,54 +139,6 @@ class DecodeBmpOpTest(test.TestCase):
       decoded = decode.eval()
       self.assertAllEqual(decoded, img_bytes)
 
-  def testIncompleteHeader(self):
-    # Encoded BMP bytes from Wikipedia
-    encoded_bytes = [
-        0x42, 0x40,
-        0x46, 0, 0, 0,
-    ]
-
-    byte_string = bytes(bytearray(encoded_bytes))
-    img_in = constant_op.constant(byte_string, dtype=dtypes.string)
-    decode = array_ops.squeeze(image_ops.decode_bmp(img_in))
-
-    with self.test_session():
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-          "requires at least 32 bytes to find the header"):
-        decoded = decode.eval()
-
-  def testIncompleteBody(self):
-    # Encoded BMP bytes from Wikipedia
-    encoded_bytes = [
-        0x42, 0x40,
-        0x46, 0, 0, 0,
-        0, 0,
-        0, 0,
-        0x36, 0, 0, 0,
-        0x28, 0, 0, 0,
-        0x2, 0, 0, 0,
-        0x2, 0, 0, 0,
-        0x1, 0,
-        0x18, 0,
-        0, 0, 0, 0,
-        0x10, 0, 0, 0,
-        0x13, 0xb, 0, 0,
-        0x13, 0xb, 0, 0,
-        0, 0, 0, 0,
-        0, 0, 0, 0,
-        0, 0, 0xff,
-        0xff, 0xff, 0xff,
-        0, 0,
-    ]
-
-    byte_string = bytes(bytearray(encoded_bytes))
-    img_in = constant_op.constant(byte_string, dtype=dtypes.string)
-    decode = array_ops.squeeze(image_ops.decode_bmp(img_in))
-
-    with self.test_session():
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-          "requires at least 68 bytes, got 62 bytes"):
-        decoded = decode.eval()
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/decode_compressed_op_test.py b/tensorflow/python/kernel_tests/decode_compressed_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9bda58ca747670861b89917a4ef1cc14eac4132
--- /dev/null
+++ b/tensorflow/python/kernel_tests/decode_compressed_op_test.py
@@ -0,0 +1,73 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for DecodeRaw op from parsing_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gzip
+import zlib
+
+from six import BytesIO
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.platform import test
+
+
+class DecodeCompressedOpTest(test.TestCase):
+
+  def _compress(self, bytes_in, compression_type):
+    if not compression_type:
+      return bytes_in
+    elif compression_type == "ZLIB":
+      return zlib.compress(bytes_in)
+    else:
+      out = BytesIO()
+      with gzip.GzipFile(fileobj=out, mode="wb") as f:
+        f.write(bytes_in)
+      return out.getvalue()
+
+  def testDecompress(self):
+    for compression_type in ["ZLIB", "GZIP", ""]:
+      with self.test_session():
+        in_bytes = array_ops.placeholder(dtypes.string, shape=[2])
+        decompressed = parsing_ops.decode_compressed(
+            in_bytes, compression_type=compression_type)
+        self.assertEqual([2], decompressed.get_shape().as_list())
+
+        result = decompressed.eval(
+            feed_dict={in_bytes: [self._compress(b"AaAA", compression_type),
+                                  self._compress(b"bBbb", compression_type)]})
+        self.assertAllEqual([b"AaAA", b"bBbb"], result)
+
+  def testDecompressWithRaw(self):
+    for compression_type in ["ZLIB", "GZIP", ""]:
+      with self.test_session():
+        in_bytes = array_ops.placeholder(dtypes.string, shape=[None])
+        decompressed = parsing_ops.decode_compressed(
+            in_bytes, compression_type=compression_type)
+        decode = parsing_ops.decode_raw(decompressed, out_type=dtypes.int16)
+
+        result = decode.eval(
+            feed_dict={in_bytes: [self._compress(b"AaBC", compression_type)]})
+        self.assertAllEqual(
+            [[ord("A") + ord("a") * 256, ord("B") + ord("C") * 256]], result)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/decode_jpeg_op_test.py b/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
index ead55cd03b656a18d622b9d35c1b94f9cf2f5107..510daf79dc4252c3e2943e2ba23c1012370bf456 100644
--- a/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import os
 import time
 
+from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.python.client import session
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
diff --git a/tensorflow/python/kernel_tests/decode_raw_op_test.py b/tensorflow/python/kernel_tests/decode_raw_op_test.py
index 009f3ea4b311c6c30e917362b9561b170e3e2068..122a9ed46967fc9c02c59ea3047216cb73a72293 100644
--- a/tensorflow/python/kernel_tests/decode_raw_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_raw_op_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-import sys
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
@@ -90,8 +89,9 @@ class DecodeRawOpTest(test.TestCase):
       in_bytes = array_ops.placeholder(dtypes.string, shape=[None])
       decode = parsing_ops.decode_raw(in_bytes, out_type=dtypes.float16)
 
-      result = decode.eval(feed_dict={in_bytes: [""]})
-      self.assertEqual(len(result), 1)
+      for num_inputs in range(3):
+        result = decode.eval(feed_dict={in_bytes: [""] * num_inputs})
+        self.assertEqual((num_inputs, 0), result.shape)
 
   def testToUInt16(self):
     with self.test_session():
diff --git a/tensorflow/python/kernel_tests/diag_op_test.py b/tensorflow/python/kernel_tests/diag_op_test.py
index 6cfa9b37fe0e40f4f0e5e2ad2686819e5f6d4f12..0825d8fc6bea008532fd7428236dfb569f2a471e 100644
--- a/tensorflow/python/kernel_tests/diag_op_test.py
+++ b/tensorflow/python/kernel_tests/diag_op_test.py
@@ -84,11 +84,8 @@ class MatrixSetDiagTest(test.TestCase):
   def testSquare(self):
     with self.test_session(use_gpu=True):
       v = np.array([1.0, 2.0, 3.0])
-      mat = np.array([[0.0, 1.0, 0.0],
-                      [1.0, 0.0, 1.0],
-                      [1.0, 1.0, 1.0]])
-      mat_set_diag = np.array([[1.0, 1.0, 0.0],
-                               [1.0, 2.0, 1.0],
+      mat = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0]])
+      mat_set_diag = np.array([[1.0, 1.0, 0.0], [1.0, 2.0, 1.0],
                                [1.0, 1.0, 3.0]])
       output = array_ops.matrix_set_diag(mat, v)
       self.assertEqual((3, 3), output.get_shape())
@@ -135,19 +132,12 @@ class MatrixSetDiagTest(test.TestCase):
 
   def testRectangularBatch(self):
     with self.test_session(use_gpu=True):
-      v_batch = np.array([[-1.0, -2.0],
-                          [-4.0, -5.0]])
-      mat_batch = np.array(
-          [[[1.0, 0.0, 3.0],
-            [0.0, 2.0, 0.0]],
-           [[4.0, 0.0, 4.0],
-            [0.0, 5.0, 0.0]]])
-
-      mat_set_diag_batch = np.array(
-          [[[-1.0, 0.0, 3.0],
-            [0.0, -2.0, 0.0]],
-           [[-4.0, 0.0, 4.0],
-            [0.0, -5.0, 0.0]]])
+      v_batch = np.array([[-1.0, -2.0], [-4.0, -5.0]])
+      mat_batch = np.array([[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0]],
+                            [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0]]])
+
+      mat_set_diag_batch = np.array([[[-1.0, 0.0, 3.0], [0.0, -2.0, 0.0]],
+                                     [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0]]])
       output = array_ops.matrix_set_diag(mat_batch, v_batch)
       self.assertEqual((2, 2, 3), output.get_shape())
       self.assertAllEqual(mat_set_diag_batch, output.eval())
@@ -178,10 +168,14 @@ class MatrixSetDiagTest(test.TestCase):
             np.random.rand(*diag_shape), dtype=dtypes_lib.float32)
         y = array_ops.matrix_set_diag(x, x_diag)
         error_x = gradient_checker.compute_gradient_error(
-            x, x.get_shape().as_list(), y, y.get_shape().as_list())
+            x,
+            x.get_shape().as_list(), y,
+            y.get_shape().as_list())
         self.assertLess(error_x, 1e-4)
         error_x_diag = gradient_checker.compute_gradient_error(
-            x_diag, x_diag.get_shape().as_list(), y, y.get_shape().as_list())
+            x_diag,
+            x_diag.get_shape().as_list(), y,
+            y.get_shape().as_list())
         self.assertLess(error_x_diag, 1e-4)
 
   def testGradWithNoShapeInformation(self):
@@ -192,12 +186,13 @@ class MatrixSetDiagTest(test.TestCase):
       output = array_ops.matrix_set_diag(mat, v)
       grads = gradients_impl.gradients(output, [mat, v], grad_ys=grad_input)
       grad_input_val = np.random.rand(3, 3).astype(np.float32)
-      grad_vals = sess.run(grads,
-                           feed_dict={
-                               v: 2 * np.ones(3),
-                               mat: np.ones((3, 3)),
-                               grad_input: grad_input_val
-                           })
+      grad_vals = sess.run(
+          grads,
+          feed_dict={
+              v: 2 * np.ones(3),
+              mat: np.ones((3, 3)),
+              grad_input: grad_input_val
+          })
       self.assertAllEqual(np.diag(grad_input_val), grad_vals[1])
       self.assertAllEqual(grad_input_val - np.diag(np.diag(grad_input_val)),
                           grad_vals[0])
@@ -242,13 +237,9 @@ class MatrixDiagPartTest(test.TestCase):
 
   def testRectangularBatch(self):
     with self.test_session(use_gpu=True):
-      v_batch = np.array([[1.0, 2.0],
-                          [4.0, 5.0]])
-      mat_batch = np.array(
-          [[[1.0, 0.0, 0.0],
-            [0.0, 2.0, 0.0]],
-           [[4.0, 0.0, 0.0],
-            [0.0, 5.0, 0.0]]])
+      v_batch = np.array([[1.0, 2.0], [4.0, 5.0]])
+      mat_batch = np.array([[[1.0, 0.0, 0.0], [0.0, 2.0, 0.0]],
+                            [[4.0, 0.0, 0.0], [0.0, 5.0, 0.0]]])
       self.assertEqual(mat_batch.shape, (2, 2, 3))
       mat_batch_diag = array_ops.matrix_diag_part(mat_batch)
       self.assertEqual((2, 2), mat_batch_diag.get_shape())
@@ -301,19 +292,13 @@ class DiagTest(test.TestCase):
 
   def testRankOneIntTensor(self):
     x = np.array([1, 2, 3])
-    expected_ans = np.array(
-        [[1, 0, 0],
-         [0, 2, 0],
-         [0, 0, 3]])
+    expected_ans = np.array([[1, 0, 0], [0, 2, 0], [0, 0, 3]])
     self.diagOp(x, np.int32, expected_ans)
     self.diagOp(x, np.int64, expected_ans)
 
   def testRankOneFloatTensor(self):
     x = np.array([1.1, 2.2, 3.3])
-    expected_ans = np.array(
-        [[1.1, 0, 0],
-         [0, 2.2, 0],
-         [0, 0, 3.3]])
+    expected_ans = np.array([[1.1, 0, 0], [0, 2.2, 0], [0, 0, 3.3]])
     self.diagOp(x, np.float32, expected_ans)
     self.diagOp(x, np.float64, expected_ans)
 
@@ -321,123 +306,105 @@ class DiagTest(test.TestCase):
     for dtype in [np.complex64, np.complex128]:
       x = np.array([1.1 + 1.1j, 2.2 + 2.2j, 3.3 + 3.3j], dtype=dtype)
       expected_ans = np.array(
-          [[1.1 + 1.1j, 0 + 0j, 0 + 0j],
-           [0 + 0j, 2.2 + 2.2j, 0 + 0j],
-           [0 + 0j, 0 + 0j, 3.3 + 3.3j]], dtype=dtype)
+          [[1.1 + 1.1j, 0 + 0j, 0 + 0j], [0 + 0j, 2.2 + 2.2j, 0 + 0j],
+           [0 + 0j, 0 + 0j, 3.3 + 3.3j]],
+          dtype=dtype)
       self.diagOp(x, dtype, expected_ans)
 
   def testRankTwoIntTensor(self):
     x = np.array([[1, 2, 3], [4, 5, 6]])
-    expected_ans = np.array(
-        [[[[1, 0, 0], [0, 0, 0]],
-          [[0, 2, 0], [0, 0, 0]],
-          [[0, 0, 3], [0, 0, 0]]],
-         [[[0, 0, 0], [4, 0, 0]],
-          [[0, 0, 0], [0, 5, 0]],
-          [[0, 0, 0], [0, 0, 6]]]])
+    expected_ans = np.array([[[[1, 0, 0], [0, 0, 0]], [[0, 2, 0], [0, 0, 0]],
+                              [[0, 0, 3], [0, 0, 0]]],
+                             [[[0, 0, 0], [4, 0, 0]], [[0, 0, 0], [0, 5, 0]],
+                              [[0, 0, 0], [0, 0, 6]]]])
     self.diagOp(x, np.int32, expected_ans)
     self.diagOp(x, np.int64, expected_ans)
 
   def testRankTwoFloatTensor(self):
     x = np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]])
     expected_ans = np.array(
-        [[[[1.1, 0, 0], [0, 0, 0]],
-          [[0, 2.2, 0], [0, 0, 0]],
-          [[0, 0, 3.3], [0, 0, 0]]],
-         [[[0, 0, 0], [4.4, 0, 0]],
-          [[0, 0, 0], [0, 5.5, 0]],
-          [[0, 0, 0], [0, 0, 6.6]]]])
+        [[[[1.1, 0, 0], [0, 0, 0]], [[0, 2.2, 0], [0, 0, 0]],
+          [[0, 0, 3.3], [0, 0, 0]]], [[[0, 0, 0], [4.4, 0, 0]],
+                                      [[0, 0, 0], [0, 5.5, 0]], [[0, 0, 0],
+                                                                 [0, 0, 6.6]]]])
     self.diagOp(x, np.float32, expected_ans)
     self.diagOp(x, np.float64, expected_ans)
 
   def testRankTwoComplexTensor(self):
     for dtype in [np.complex64, np.complex128]:
-      x = np.array([[1.1 + 1.1j, 2.2 + 2.2j, 3.3 + 3.3j],
-                    [4.4 + 4.4j, 5.5 + 5.5j, 6.6 + 6.6j]], dtype=dtype)
+      x = np.array(
+          [[1.1 + 1.1j, 2.2 + 2.2j, 3.3 + 3.3j],
+           [4.4 + 4.4j, 5.5 + 5.5j, 6.6 + 6.6j]],
+          dtype=dtype)
       expected_ans = np.array(
-          [[[[1.1 + 1.1j, 0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j, 0 + 0j]],
-            [[0 + 0j, 2.2 + 2.2j, 0 + 0j], [0 + 0j, 0 + 0j, 0 + 0j]],
-            [[0 + 0j, 0 + 0j, 3.3 + 3.3j], [0 + 0j, 0 + 0j, 0 + 0j]]],
-           [[[0 + 0j, 0 + 0j, 0 + 0j], [4.4 + 4.4j, 0 + 0j, 0 + 0j]],
-            [[0 + 0j, 0 + 0j, 0 + 0j], [0 + 0j, 5.5 + 5.5j, 0 + 0j]],
-            [[0 + 0j, 0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j, 6.6 + 6.6j]]]],
-           dtype=dtype)
+          [[[[1.1 + 1.1j, 0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j, 0 + 0j]], [
+              [0 + 0j, 2.2 + 2.2j, 0 + 0j], [0 + 0j, 0 + 0j, 0 + 0j]
+          ], [[0 + 0j, 0 + 0j, 3.3 + 3.3j], [0 + 0j, 0 + 0j, 0 + 0j]]], [[
+              [0 + 0j, 0 + 0j, 0 + 0j], [4.4 + 4.4j, 0 + 0j, 0 + 0j]
+          ], [[0 + 0j, 0 + 0j, 0 + 0j], [0 + 0j, 5.5 + 5.5j, 0 + 0j]
+             ], [[0 + 0j, 0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j, 6.6 + 6.6j]]]],
+          dtype=dtype)
       self.diagOp(x, dtype, expected_ans)
 
   def testRankThreeFloatTensor(self):
-    x = np.array([[[1.1, 2.2], [3.3, 4.4]],
-                  [[5.5, 6.6], [7.7, 8.8]]])
-    expected_ans = np.array(
-        [[[[[[1.1, 0], [0, 0]], [[0, 0], [0, 0]]],
-           [[[0, 2.2], [0, 0]], [[0, 0], [0, 0]]]],
-          [[[[0, 0], [3.3, 0]], [[0, 0], [0, 0]]],
-           [[[0, 0], [0, 4.4]], [[0, 0], [0, 0]]]]],
-         [[[[[0, 0], [0, 0]], [[5.5, 0], [0, 0]]],
-           [[[0, 0], [0, 0]], [[0, 6.6], [0, 0]]]],
-          [[[[0, 0], [0, 0]], [[0, 0], [7.7, 0]]],
-           [[[0, 0], [0, 0]], [[0, 0], [0, 8.8]]]]]])
+    x = np.array([[[1.1, 2.2], [3.3, 4.4]], [[5.5, 6.6], [7.7, 8.8]]])
+    expected_ans = np.array([[[[[[1.1, 0], [0, 0]], [[0, 0], [0, 0]]],
+                               [[[0, 2.2], [0, 0]], [[0, 0], [0, 0]]]],
+                              [[[[0, 0], [3.3, 0]], [[0, 0], [0, 0]]],
+                               [[[0, 0], [0, 4.4]], [[0, 0], [0, 0]]]]],
+                             [[[[[0, 0], [0, 0]], [[5.5, 0], [0, 0]]],
+                               [[[0, 0], [0, 0]], [[0, 6.6], [0, 0]]]],
+                              [[[[0, 0], [0, 0]], [[0, 0], [7.7, 0]]],
+                               [[[0, 0], [0, 0]], [[0, 0], [0, 8.8]]]]]])
     self.diagOp(x, np.float32, expected_ans)
     self.diagOp(x, np.float64, expected_ans)
 
   def testRankThreeComplexTensor(self):
     for dtype in [np.complex64, np.complex128]:
-      x = np.array([[[1.1 + 1.1j, 2.2 + 2.2j], [3.3 + 3.3j, 4.4 + 4.4j]],
-                    [[5.5 + 5.5j, 6.6 + 6.6j], [7.7 + 7.7j, 8.8 + 8.8j]]],
-                    dtype=dtype)
+      x = np.array(
+          [[[1.1 + 1.1j, 2.2 + 2.2j], [3.3 + 3.3j, 4.4 + 4.4j]],
+           [[5.5 + 5.5j, 6.6 + 6.6j], [7.7 + 7.7j, 8.8 + 8.8j]]],
+          dtype=dtype)
       expected_ans = np.array(
-          [[[[[[1.1 + 1.1j, 0 + 0j], [0 + 0j, 0 + 0j]],
-              [[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]]],
-             [[[0 + 0j, 2.2 + 2.2j], [0 + 0j, 0 + 0j]],
-              [[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]]]],
-            [[[[0 + 0j, 0 + 0j], [3.3 + 3.3j, 0 + 0j]],
-              [[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]]],
-             [[[0 + 0j, 0 + 0j], [0 + 0j, 4.4 + 4.4j]],
-              [[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]]]]],
-           [[[[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]],
-              [[5.5 + 5.5j, 0 + 0j], [0 + 0j, 0 + 0j]]],
-             [[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]],
-              [[0 + 0j, 6.6 + 6.6j], [0 + 0j, 0 + 0j]]]],
-            [[[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]],
-              [[0 + 0j, 0 + 0j], [7.7 + 7.7j, 0 + 0j]]],
-             [[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]],
-              [[0 + 0j, 0 + 0j], [0 + 0j, 8.8 + 8.8j]]]]]],
+          [[[[[[1.1 + 1.1j, 0 + 0j], [0 + 0j, 0 + 0j]], [[0 + 0j, 0 + 0j], [
+              0 + 0j, 0 + 0j
+          ]]], [[[0 + 0j, 2.2 + 2.2j], [0 + 0j, 0 + 0j]], [[0 + 0j, 0 + 0j], [
+              0 + 0j, 0 + 0j
+          ]]]], [[[[0 + 0j, 0 + 0j], [3.3 + 3.3j, 0 + 0j]], [[0 + 0j, 0 + 0j], [
+              0 + 0j, 0 + 0j
+          ]]], [[[0 + 0j, 0 + 0j], [0 + 0j, 4.4 + 4.4j]], [[0 + 0j, 0 + 0j], [
+              0 + 0j, 0 + 0j
+          ]]]]], [[[[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]], [
+              [5.5 + 5.5j, 0 + 0j], [0 + 0j, 0 + 0j]
+          ]], [[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]], [[0 + 0j, 6.6 + 6.6j], [
+              0 + 0j, 0 + 0j
+          ]]]], [[[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]], [[0 + 0j, 0 + 0j], [
+              7.7 + 7.7j, 0 + 0j
+          ]]], [[[0 + 0j, 0 + 0j], [0 + 0j, 0 + 0j]],
+                [[0 + 0j, 0 + 0j], [0 + 0j, 8.8 + 8.8j]]]]]],
           dtype=dtype)
       self.diagOp(x, dtype, expected_ans)
 
   def testRankFourNumberTensor(self):
     for dtype in [np.float32, np.float64, np.int64, np.int32]:
       # Input with shape [2, 1, 2, 3]
-      x = np.array([[[[ 1,  2,  3],
-                      [ 4,  5,  6]]],
-                    [[[ 7,  8,  9],
-                      [10, 11, 12]]]], dtype=dtype)
+      x = np.array(
+          [[[[1, 2, 3], [4, 5, 6]]], [[[7, 8, 9], [10, 11, 12]]]], dtype=dtype)
       # Output with shape [2, 1, 2, 3, 2, 1, 2, 3]
       expected_ans = np.array(
-          [[[[[[[[1, 0, 0], [0, 0, 0]]],
-               [[[0, 0, 0], [0, 0, 0]]]],
-              [[[[0, 2, 0], [0, 0, 0]]],
-               [[[0, 0, 0], [0, 0, 0]]]],
-              [[[[0, 0, 3], [0, 0, 0]]],
-               [[[0, 0, 0], [0, 0, 0]]]]],
-             [[[[[0, 0, 0], [4, 0, 0]]],
-               [[[0, 0, 0], [0, 0, 0]]]],
-              [[[[0, 0, 0], [0, 5, 0]]],
-               [[[0, 0, 0], [0, 0, 0]]]],
-              [[[[0, 0, 0], [0, 0, 6]]],
-               [[[0, 0, 0], [0, 0, 0]]]]]]],
-
-           [[[[[[[0, 0, 0], [0, 0, 0]]],
-               [[[7, 0, 0], [0, 0, 0]]]],
-              [[[[0, 0, 0], [0, 0, 0]]],
-               [[[0, 8, 0], [0, 0, 0]]]],
-              [[[[0, 0, 0], [0, 0, 0]]],
-               [[[0, 0, 9], [0, 0, 0]]]]],
-             [[[[[0, 0, 0], [0, 0, 0]]],
-               [[[0, 0, 0], [10, 0, 0]]]],
-              [[[[0, 0, 0], [0, 0, 0]]],
-               [[[0, 0, 0], [0, 11, 0]]]],
-              [[[[0, 0, 0], [0, 0, 0]]],
-               [[[0, 0, 0], [0, 0, 12]]]]]]]], dtype=dtype)
+          [[[[[[[[1, 0, 0], [0, 0, 0]]], [[[0, 0, 0], [0, 0, 0]]]], [
+              [[[0, 2, 0], [0, 0, 0]]], [[[0, 0, 0], [0, 0, 0]]]
+          ], [[[[0, 0, 3], [0, 0, 0]]], [[[0, 0, 0], [0, 0, 0]]]]], [[
+              [[[0, 0, 0], [4, 0, 0]]], [[[0, 0, 0], [0, 0, 0]]]
+          ], [[[[0, 0, 0], [0, 5, 0]]], [[[0, 0, 0], [0, 0, 0]]]], [
+              [[[0, 0, 0], [0, 0, 6]]], [[[0, 0, 0], [0, 0, 0]]]
+          ]]]], [[[[[[[0, 0, 0], [0, 0, 0]]], [[[7, 0, 0], [0, 0, 0]]]], [
+              [[[0, 0, 0], [0, 0, 0]]], [[[0, 8, 0], [0, 0, 0]]]
+          ], [[[[0, 0, 0], [0, 0, 0]]], [[[0, 0, 9], [0, 0, 0]]]]], [[
+              [[[0, 0, 0], [0, 0, 0]]], [[[0, 0, 0], [10, 0, 0]]]
+          ], [[[[0, 0, 0], [0, 0, 0]]], [[[0, 0, 0], [0, 11, 0]]]
+             ], [[[[0, 0, 0], [0, 0, 0]]], [[[0, 0, 0], [0, 0, 12]]]]]]]],
+          dtype=dtype)
       self.diagOp(x, dtype, expected_ans)
 
   def testInvalidRank(self):
@@ -537,7 +504,9 @@ class DiagGradOpTest(test.TestCase):
           x1 = constant_op.constant(np.random.rand(*shape), dtype=dtype)
           y = array_ops.diag(x1)
           error = gradient_checker.compute_gradient_error(
-              x1, x1.get_shape().as_list(), y, y.get_shape().as_list())
+              x1,
+              x1.get_shape().as_list(), y,
+              y.get_shape().as_list())
           tf_logging.info("error = %f", error)
           self.assertLess(error, 1e-4)
 
@@ -555,7 +524,9 @@ class DiagGradPartOpTest(test.TestCase):
           x1 = constant_op.constant(np.random.rand(*shape), dtype=dtype)
           y = array_ops.diag_part(x1)
           error = gradient_checker.compute_gradient_error(
-              x1, x1.get_shape().as_list(), y, y.get_shape().as_list())
+              x1,
+              x1.get_shape().as_list(), y,
+              y.get_shape().as_list())
           tf_logging.info("error = %f", error)
           self.assertLess(error, 1e-4)
 
diff --git a/tensorflow/python/kernel_tests/distributions/bernoulli_test.py b/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
index a269d722737866fa5e6ae9feee919be0db71bcf1..09812db8166567403dc966ac9cb4304be0740e50 100644
--- a/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
+++ b/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
@@ -25,7 +25,6 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import bernoulli
 from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.platform import test
@@ -291,12 +290,6 @@ class BernoulliTest(test.TestCase):
                [np.sqrt(var(0.5)), np.sqrt(var(0.4))]],
               dtype=np.float32))
 
-  def testBernoulliWithSigmoidProbs(self):
-    p = np.array([8.3, 4.2])
-    dist = bernoulli.BernoulliWithSigmoidProbs(logits=p)
-    with self.test_session():
-      self.assertAllClose(math_ops.sigmoid(p).eval(), dist.probs.eval())
-
   def testBernoulliBernoulliKL(self):
     with self.test_session() as sess:
       batch_size = 6
diff --git a/tensorflow/python/kernel_tests/distributions/beta_test.py b/tensorflow/python/kernel_tests/distributions/beta_test.py
index 91a451f033ffbb01d54c3dacce952b406564b7b4..ab5041a6eb477ce231acbd1e6041c354ee17409b 100644
--- a/tensorflow/python/kernel_tests/distributions/beta_test.py
+++ b/tensorflow/python/kernel_tests/distributions/beta_test.py
@@ -107,8 +107,10 @@ class BetaTest(test.TestCase):
         dist.prob([-1., 0.1, 0.5]).eval()
       with self.assertRaisesOpError("sample must be positive"):
         dist.prob([0., 0.1, 0.5]).eval()
-      with self.assertRaisesOpError("sample must be no larger than `1`"):
+      with self.assertRaisesOpError("sample must be less than `1`"):
         dist.prob([.1, .2, 1.2]).eval()
+      with self.assertRaisesOpError("sample must be less than `1`"):
+        dist.prob([.1, .2, 1.0]).eval()
 
   def testPdfTwoBatches(self):
     with self.test_session():
diff --git a/tensorflow/python/kernel_tests/distributions/categorical_test.py b/tensorflow/python/kernel_tests/distributions/categorical_test.py
index 019c1bc353a9891da6967a7ce9114b58226a980a..ca2358fe99934e110ba743c6085d1f25ff0f5e5e 100644
--- a/tensorflow/python/kernel_tests/distributions/categorical_test.py
+++ b/tensorflow/python/kernel_tests/distributions/categorical_test.py
@@ -100,6 +100,10 @@ class CategoricalTest(test.TestCase):
     self.assertEqual(
         dist.logits.dtype, dist.log_prob(np.array(
             0, dtype=np.int64)).dtype)
+    for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
+      dist = make_categorical([], 5, dtype=dtype)
+      self.assertEqual(dist.dtype, dtype)
+      self.assertEqual(dist.dtype, dist.sample(5).dtype)
 
   def testUnknownShape(self):
     with self.test_session():
diff --git a/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py b/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
index b1d8da771612fe42a153a1a11b6cb26bdcb983a0..d0fa1fe98996fd234f457bd0199fad5efc2547dc 100644
--- a/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
+++ b/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
@@ -59,13 +59,21 @@ class KLTest(test.TestCase):
     # pylint: disable=unused-argument,unused-variable
 
     with self.test_session():
-      a = MyDistException(loc=0.0, scale=1.0)
+      a = MyDistException(loc=0.0, scale=1.0, allow_nan_stats=False)
       kl = kullback_leibler.kl_divergence(a, a, allow_nan_stats=False)
       with self.assertRaisesOpError(
           "KL calculation between .* and .* returned NaN values"):
         kl.eval()
+      with self.assertRaisesOpError(
+          "KL calculation between .* and .* returned NaN values"):
+        a.kl_divergence(a).eval()
+      a = MyDistException(loc=0.0, scale=1.0, allow_nan_stats=True)
       kl_ok = kullback_leibler.kl_divergence(a, a)
       self.assertAllEqual([float("nan")], kl_ok.eval())
+      self_kl_ok = a.kl_divergence(a)
+      self.assertAllEqual([float("nan")], self_kl_ok.eval())
+      cross_ok = a.cross_entropy(a)
+      self.assertAllEqual([float("nan")], cross_ok.eval())
 
   def testRegistrationFailures(self):
 
@@ -86,16 +94,22 @@ class KLTest(test.TestCase):
     for (k, v) in _DIVERGENCES.items():
       self.assertEqual(v, _registered_kl(*k))
 
-  def testIndirectRegistration(self):
+  def _testIndirectRegistration(self, fn):
 
     class Sub1(normal.Normal):
-      pass
+
+      def entropy(self):
+        return ""
 
     class Sub2(normal.Normal):
-      pass
+
+      def entropy(self):
+        return ""
 
     class Sub11(Sub1):
-      pass
+
+      def entropy(self):
+        return ""
 
     # pylint: disable=unused-argument,unused-variable
     @kullback_leibler.RegisterKL(Sub1, Sub1)
@@ -116,16 +130,30 @@ class KLTest(test.TestCase):
     sub2 = Sub2(loc=0.0, scale=1.0)
     sub11 = Sub11(loc=0.0, scale=1.0)
 
-    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub1, sub1))
-    self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub1, sub2))
-    self.assertEqual("sub2-1", kullback_leibler.kl_divergence(sub2, sub1))
-    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub11))
-    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub1))
-    self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub11, sub2))
-    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub1))
-    self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub11, sub2))
-    self.assertEqual("sub2-1", kullback_leibler.kl_divergence(sub2, sub11))
-    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub1, sub11))
+    self.assertEqual("sub1-1", fn(sub1, sub1))
+    self.assertEqual("sub1-2", fn(sub1, sub2))
+    self.assertEqual("sub2-1", fn(sub2, sub1))
+    self.assertEqual("sub1-1", fn(sub11, sub11))
+    self.assertEqual("sub1-1", fn(sub11, sub1))
+    self.assertEqual("sub1-2", fn(sub11, sub2))
+    self.assertEqual("sub1-1", fn(sub11, sub1))
+    self.assertEqual("sub1-2", fn(sub11, sub2))
+    self.assertEqual("sub2-1", fn(sub2, sub11))
+    self.assertEqual("sub1-1", fn(sub1, sub11))
+
+  def testIndirectRegistrationKLFun(self):
+    self._testIndirectRegistration(kullback_leibler.kl_divergence)
+
+  def testIndirectRegistrationKLSelf(self):
+    self._testIndirectRegistration(
+        lambda p, q: p.kl_divergence(q))
+
+  def testIndirectRegistrationCrossEntropy(self):
+    self._testIndirectRegistration(
+        lambda p, q: p.cross_entropy(q))
+
+  def testFunctionCrossEntropy(self):
+    self._testIndirectRegistration(kullback_leibler.cross_entropy)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/distributions/util_test.py b/tensorflow/python/kernel_tests/distributions/util_test.py
index 8fd26a1c9afe0ab701db199147e2de7c3ded3211..f54f146e0ac102cf25d8a66f021e8c7af9901c93 100644
--- a/tensorflow/python/kernel_tests/distributions/util_test.py
+++ b/tensorflow/python/kernel_tests/distributions/util_test.py
@@ -25,6 +25,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
@@ -55,6 +56,7 @@ def _logit(x):
   return np.log(x) - np.log1p(-x)
 
 
+@test_util.with_c_api
 class AssertCloseTest(test.TestCase):
 
   def testAssertCloseIntegerDtype(self):
@@ -145,6 +147,7 @@ class AssertCloseTest(test.TestCase):
           array_ops.identity(w).eval(feed_dict=feed_dict)
 
 
+@test_util.with_c_api
 class GetLogitsAndProbsTest(test.TestCase):
 
   def testImproperArguments(self):
@@ -298,6 +301,7 @@ class GetLogitsAndProbsTest(test.TestCase):
         logit.eval(feed_dict={l: np.ones([int(2**11+1)])})
 
 
+@test_util.with_c_api
 class EmbedCheckCategoricalEventShapeTest(test.TestCase):
 
   def testTooSmall(self):
@@ -335,6 +339,7 @@ class EmbedCheckCategoricalEventShapeTest(test.TestCase):
         du.embed_check_categorical_event_shape(param)
 
 
+@test_util.with_c_api
 class EmbedCheckIntegerCastingClosedTest(test.TestCase):
 
   def testCorrectlyAssertsNonnegative(self):
@@ -370,6 +375,7 @@ class EmbedCheckIntegerCastingClosedTest(test.TestCase):
         x_checked.eval(feed_dict={x: np.array([1, -1], dtype=np.int32)})
 
 
+@test_util.with_c_api
 class LogCombinationsTest(test.TestCase):
 
   def testLogCombinationsBinomial(self):
@@ -400,6 +406,7 @@ class LogCombinationsTest(test.TestCase):
       self.assertEqual([2, 2], log_binom.get_shape())
 
 
+@test_util.with_c_api
 class DynamicShapeTest(test.TestCase):
 
   def testSameDynamicShape(self):
@@ -504,6 +511,7 @@ class DynamicShapeTest(test.TestCase):
               }))
 
 
+@test_util.with_c_api
 class RotateTransposeTest(test.TestCase):
 
   def _np_rotate_transpose(self, x, shift):
@@ -537,6 +545,7 @@ class RotateTransposeTest(test.TestCase):
                                   shift: shift_value}))
 
 
+@test_util.with_c_api
 class PickVectorTest(test.TestCase):
 
   def testCorrectlyPicksVector(self):
@@ -557,6 +566,128 @@ class PickVectorTest(test.TestCase):
                               constant_op.constant(False), x, y))  # No eval.
 
 
+@test_util.with_c_api
+class PreferStaticRankTest(test.TestCase):
+
+  def testNonEmptyConstantTensor(self):
+    x = array_ops.zeros((2, 3, 4))
+    rank = du.prefer_static_rank(x)
+    self.assertIsInstance(rank, np.ndarray)
+    self.assertEqual(3, rank)
+
+  def testEmptyConstantTensor(self):
+    x = constant_op.constant([])
+    rank = du.prefer_static_rank(x)
+    self.assertIsInstance(rank, np.ndarray)
+    self.assertEqual(1, rank)
+
+  def testScalarTensor(self):
+    x = constant_op.constant(1.)
+    rank = du.prefer_static_rank(x)
+    self.assertIsInstance(rank, np.ndarray)
+    self.assertEqual(0, rank)
+
+  def testDynamicRankEndsUpBeingNonEmpty(self):
+    x = array_ops.placeholder(np.float64, shape=None)
+    rank = du.prefer_static_rank(x)
+    with self.test_session():
+      self.assertAllEqual(2, rank.eval(feed_dict={x: np.zeros((2, 3))}))
+
+  def testDynamicRankEndsUpBeingEmpty(self):
+    x = array_ops.placeholder(np.int32, shape=None)
+    rank = du.prefer_static_rank(x)
+    with self.test_session():
+      self.assertAllEqual(1, rank.eval(feed_dict={x: []}))
+
+  def testDynamicRankEndsUpBeingScalar(self):
+    x = array_ops.placeholder(np.int32, shape=None)
+    rank = du.prefer_static_rank(x)
+    with self.test_session():
+      self.assertAllEqual(0, rank.eval(feed_dict={x: 1}))
+
+
+@test_util.with_c_api
+class PreferStaticShapeTest(test.TestCase):
+
+  def testNonEmptyConstantTensor(self):
+    x = array_ops.zeros((2, 3, 4))
+    shape = du.prefer_static_shape(x)
+    self.assertIsInstance(shape, np.ndarray)
+    self.assertAllEqual(np.array([2, 3, 4]), shape)
+
+  def testEmptyConstantTensor(self):
+    x = constant_op.constant([])
+    shape = du.prefer_static_shape(x)
+    self.assertIsInstance(shape, np.ndarray)
+    self.assertAllEqual(np.array([0]), shape)
+
+  def testScalarTensor(self):
+    x = constant_op.constant(1.)
+    shape = du.prefer_static_shape(x)
+    self.assertIsInstance(shape, np.ndarray)
+    self.assertAllEqual(np.array([]), shape)
+
+  def testDynamicShapeEndsUpBeingNonEmpty(self):
+    x = array_ops.placeholder(np.float64, shape=None)
+    shape = du.prefer_static_shape(x)
+    with self.test_session():
+      self.assertAllEqual((2, 3), shape.eval(feed_dict={x: np.zeros((2, 3))}))
+
+  def testDynamicShapeEndsUpBeingEmpty(self):
+    x = array_ops.placeholder(np.int32, shape=None)
+    shape = du.prefer_static_shape(x)
+    with self.test_session():
+      self.assertAllEqual(np.array([0]), shape.eval(feed_dict={x: []}))
+
+  def testDynamicShapeEndsUpBeingScalar(self):
+    x = array_ops.placeholder(np.int32, shape=None)
+    shape = du.prefer_static_shape(x)
+    with self.test_session():
+      self.assertAllEqual(np.array([]), shape.eval(feed_dict={x: 1}))
+
+
+@test_util.with_c_api
+class PreferStaticValueTest(test.TestCase):
+
+  def testNonEmptyConstantTensor(self):
+    x = array_ops.zeros((2, 3, 4))
+    value = du.prefer_static_value(x)
+    self.assertIsInstance(value, np.ndarray)
+    self.assertAllEqual(np.zeros((2, 3, 4)), value)
+
+  def testEmptyConstantTensor(self):
+    x = constant_op.constant([])
+    value = du.prefer_static_value(x)
+    self.assertIsInstance(value, np.ndarray)
+    self.assertAllEqual(np.array([]), value)
+
+  def testScalarTensor(self):
+    x = constant_op.constant(1.)
+    value = du.prefer_static_value(x)
+    self.assertIsInstance(value, np.ndarray)
+    self.assertAllEqual(np.array(1.), value)
+
+  def testDynamicValueEndsUpBeingNonEmpty(self):
+    x = array_ops.placeholder(np.float64, shape=None)
+    value = du.prefer_static_value(x)
+    with self.test_session():
+      self.assertAllEqual(np.zeros((2, 3)),
+                          value.eval(feed_dict={x: np.zeros((2, 3))}))
+
+  def testDynamicValueEndsUpBeingEmpty(self):
+    x = array_ops.placeholder(np.int32, shape=None)
+    value = du.prefer_static_value(x)
+    with self.test_session():
+      self.assertAllEqual(np.array([]), value.eval(feed_dict={x: []}))
+
+  def testDynamicValueEndsUpBeingScalar(self):
+    x = array_ops.placeholder(np.int32, shape=None)
+    value = du.prefer_static_value(x)
+    with self.test_session():
+      self.assertAllEqual(np.array(1), value.eval(feed_dict={x: 1}))
+
+
+@test_util.with_c_api
 class FillTriangularTest(test.TestCase):
 
   def setUp(self):
@@ -587,7 +718,7 @@ class FillTriangularTest(test.TestCase):
     x_ = np.asarray(x_)
     with self.test_session() as sess:
       static_shape = None if use_deferred_shape else x_.shape
-      x_pl = array_ops.placeholder(dtype=x_.dtype, shape=static_shape)
+      x_pl = array_ops.placeholder_with_default(x_, shape=static_shape)
       # Add `zeros_like(x)` such that x's value and gradient are identical. We
       # do this so we can ensure each gradient value is mapped to the right
       # gradient location.  (Not doing this means the gradient wrt `x` is simple
@@ -651,6 +782,7 @@ class FillTriangularTest(test.TestCase):
     self._run_test(self._rng.randn(2, 3, int(7*8/2)), upper=True)
 
 
+@test_util.with_c_api
 class ReduceWeightedLogSumExp(test.TestCase):
 
   def _reduce_weighted_logsumexp(self, logx, w, axis, keep_dims=False):
@@ -747,6 +879,7 @@ class ReduceWeightedLogSumExp(test.TestCase):
           du.reduce_weighted_logsumexp(x, w, axis=[0, 1]).eval())
 
 
+@test_util.with_c_api
 class GenNewSeedTest(test.TestCase):
 
   def testOnlyNoneReturnsNone(self):
@@ -757,6 +890,7 @@ class GenNewSeedTest(test.TestCase):
 # TODO(jvdillon): Merge this test back into:
 # tensorflow/python/kernel_tests/softplus_op_test.py
 # once TF core is accepting new ops.
+@test_util.with_c_api
 class SoftplusTest(test.TestCase):
 
   def _npSoftplus(self, np_features):
diff --git a/tensorflow/python/kernel_tests/dynamic_partition_op_test.py b/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
index b4fb5aa41175ba61ace0bff9a15d91ec4ee3ac55..fedbf9e696923a34968e7a907e4099c520d1447b 100644
--- a/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
+++ b/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import unittest
+
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
@@ -219,6 +221,7 @@ class DynamicPartitionTest(test.TestCase):
     self.assertAllEqual([], partition_vals[0])
     self.assertAllEqual([], partition_vals[1])
 
+  @unittest.skip("Fails on windows.")
   def testGPUTooManyParts(self):
     # This test only makes sense on the GPU. There we do not check
     # for errors. In this case, we should discard all but the first
@@ -239,6 +242,7 @@ class DynamicPartitionTest(test.TestCase):
     self.assertAllEqual([6], partition_vals[0])
     self.assertAllEqual([5], partition_vals[1])
 
+  @unittest.skip("Fails on windows.")
   def testGPUPartsTooLarge(self):
     # This test only makes sense on the GPU. There we do not check
     # for errors. In this case, we should discard all the values
@@ -262,6 +266,7 @@ class DynamicPartitionTest(test.TestCase):
     self.assertAllEqual([], partition_vals[3])
     self.assertAllEqual([], partition_vals[4])
 
+  @unittest.skip("Fails on windows.")
   def testGPUAllIndicesBig(self):
     # This test only makes sense on the GPU. There we do not check
     # for errors. In this case, we should discard all the values
diff --git a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
index cf723f5eec3c31c93d67fd6a34a21c8377b74c84..a4b30e4319527c6f3354ac83bf0e3a5114eb45e8 100644
--- a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
+++ b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
@@ -48,8 +48,10 @@ class DynamicStitchTestBase(object):
 
   def testShapeInferenceForScalarWithNonConstantIndices(self):
     with self.test_session(use_gpu=True):
-      indices = [array_ops.placeholder(dtype=dtypes.int32),
-                 constant_op.constant(1)]
+      indices = [
+          array_ops.placeholder(dtype=dtypes.int32),
+          constant_op.constant(1)
+      ]
       data = [constant_op.constant(40), constant_op.constant(60)]
       for step in -1, 1:
         stitched_t = self.stitch_op(indices[::step], data)
@@ -61,7 +63,8 @@ class DynamicStitchTestBase(object):
   def testSimpleOneDimensional(self):
     with self.test_session(use_gpu=True):
       indices = [
-          constant_op.constant([0, 4, 7]), constant_op.constant([1, 6, 2, 3, 5])
+          constant_op.constant([0, 4, 7]),
+          constant_op.constant([1, 6, 2, 3, 5])
       ]
       data = [
           constant_op.constant([0, 40, 70]),
@@ -86,7 +89,8 @@ class DynamicStitchTestBase(object):
   def testSimpleTwoDimensional(self):
     with self.test_session(use_gpu=True):
       indices = [
-          constant_op.constant([0, 4, 7]), constant_op.constant([1, 6]),
+          constant_op.constant([0, 4, 7]),
+          constant_op.constant([1, 6]),
           constant_op.constant([2, 3, 5])
       ]
       data = [
@@ -104,7 +108,8 @@ class DynamicStitchTestBase(object):
   def testHigherRank(self):
     with self.test_session(use_gpu=True) as sess:
       indices = [
-          constant_op.constant(6), constant_op.constant([4, 1]),
+          constant_op.constant(6),
+          constant_op.constant([4, 1]),
           constant_op.constant([[5, 2], [0, 3]])
       ]
       data = [
@@ -127,7 +132,8 @@ class DynamicStitchTestBase(object):
 
   def testErrorIndicesMultiDimensional(self):
     indices = [
-        constant_op.constant([0, 4, 7]), constant_op.constant([[1, 6, 2, 3, 5]])
+        constant_op.constant([0, 4, 7]),
+        constant_op.constant([[1, 6, 2, 3, 5]])
     ]
     data = [
         constant_op.constant([[0, 40, 70]]),
@@ -138,7 +144,8 @@ class DynamicStitchTestBase(object):
 
   def testErrorDataNumDimsMismatch(self):
     indices = [
-        constant_op.constant([0, 4, 7]), constant_op.constant([1, 6, 2, 3, 5])
+        constant_op.constant([0, 4, 7]),
+        constant_op.constant([1, 6, 2, 3, 5])
     ]
     data = [
         constant_op.constant([0, 40, 70]),
@@ -149,7 +156,8 @@ class DynamicStitchTestBase(object):
 
   def testErrorDataDimSizeMismatch(self):
     indices = [
-        constant_op.constant([0, 4, 5]), constant_op.constant([1, 6, 2, 3])
+        constant_op.constant([0, 4, 5]),
+        constant_op.constant([1, 6, 2, 3])
     ]
     data = [
         constant_op.constant([[0], [40], [70]]),
@@ -160,7 +168,8 @@ class DynamicStitchTestBase(object):
 
   def testErrorDataAndIndicesSizeMismatch(self):
     indices = [
-        constant_op.constant([0, 4, 7]), constant_op.constant([1, 6, 2, 3, 5])
+        constant_op.constant([0, 4, 7]),
+        constant_op.constant([1, 6, 2, 3, 5])
     ]
     data = [
         constant_op.constant([0, 40, 70]),
@@ -235,13 +244,15 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
   def testHigherRankGPU(self):
     with self.test_session() as sess:
       indices = [
-          constant_op.constant(6), constant_op.constant([4, 1]),
+          constant_op.constant(6),
+          constant_op.constant([4, 1]),
           constant_op.constant([[5, 2], [0, 3]])
       ]
       data = [
           constant_op.constant([61, 62], dtype=dtypes.float32),
           constant_op.constant([[41, 42], [11, 12]], dtype=dtypes.float32),
-          constant_op.constant([[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32)
+          constant_op.constant(
+              [[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32)
       ]
       stitched_t = data_flow_ops.dynamic_stitch(indices, data)
       stitched_val = stitched_t.eval()
diff --git a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
index 5c7624f1f6be4da91ca74d4ef2ed81a21890b35c..6ea9f1badc3b8fac06fe6328f95714b93de97c0e 100644
--- a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
+++ b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
@@ -84,7 +84,7 @@ class ExtractImagePatches(test.TestCase):
           patches=patches)
 
   def testKsize2x2Stride1x1Rate1x1Valid(self):
-    """Test for 1x1 kernel ."""
+    """Test for 2x2 kernel with VALID padding."""
     # [1, 2, 2, 1]
     image = [[[[1], [2]], [[3], [4]]]]
     # [1, 1, 1, 4]
@@ -98,7 +98,7 @@ class ExtractImagePatches(test.TestCase):
         patches=patches)
 
   def testKsize2x2Stride1x1Rate1x1Same(self):
-    """Test for 1x1 kernel ."""
+    """Test for 2x2 kernel with SAME padding."""
     # [1, 2, 2, 1]
     image = [[[[1], [2]], [[3], [4]]]]
     # [1, 2, 2, 4]
@@ -111,6 +111,20 @@ class ExtractImagePatches(test.TestCase):
         padding="SAME",
         patches=patches)
 
+  def testKsize2x2Stride1x1Rate2x2Valid(self):
+    """Test for 2x2 kernel with 2x2 dilation."""
+    # [1, 2, 2, 1]
+    image = np.arange(16).reshape(1, 4, 4, 1).astype(np.float32)
+    # [1, 2, 2, 4]
+    patches = [[[[0, 2, 8, 10], [1, 3, 9, 11]],
+                [[4, 6, 12, 14], [5, 7, 13, 15]]]]
+    self._VerifyValues(
+        image,
+        ksizes=[2, 2],
+        strides=[1, 1],
+        rates=[2, 2],
+        padding="VALID",
+        patches=patches)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/fifo_queue_test.py b/tensorflow/python/kernel_tests/fifo_queue_test.py
index 748135440ec5e8ad387f910e1433f638abf2260a..ce73e7ad3e5f822363c697609dfa163b6f13751a 100644
--- a/tensorflow/python/kernel_tests/fifo_queue_test.py
+++ b/tensorflow/python/kernel_tests/fifo_queue_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import random
-import re
 import time
 
 import numpy as np
diff --git a/tensorflow/python/kernel_tests/fractional_avg_pool_op_test.py b/tensorflow/python/kernel_tests/fractional_avg_pool_op_test.py
index 48a51c8072416f3d494129f18912d67491fa5281..feec9934e459590bb1dd0bc5c7cf40013d3d8b88 100644
--- a/tensorflow/python/kernel_tests/fractional_avg_pool_op_test.py
+++ b/tensorflow/python/kernel_tests/fractional_avg_pool_op_test.py
@@ -23,6 +23,8 @@ import math
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_ops
@@ -310,6 +312,35 @@ class FractionalAvgTest(test.TestCase):
     self._ValidateFractionalAvgPoolResult(rand_mat, [1, 2, 2, 1], pseudo_random,
                                           overlapping)
 
+  def testDifferentInputTensorShape(self):
+    """Runs the operation in one session with different input tensor shapes."""
+    with self.test_session() as sess:
+      input_holder = array_ops.placeholder(dtypes.float32,
+                                           [None, None, None, 3])
+      pooling_ratio = [1, 1.5, 1.5, 1]
+      pseudo_random = False
+      overlapping = False
+      p, r, c = nn_ops.fractional_avg_pool(
+          input_holder,
+          pooling_ratio,
+          pseudo_random,
+          overlapping,
+          deterministic=True,
+          seed=self._SEED,
+          seed2=self._SEED2)
+      # First run.
+      input_a = np.zeros([3, 32, 32, 3])
+      actual, row_seq, col_seq = sess.run([p, r, c], {input_holder: input_a})
+      expected = self._GetExpectedFractionalAvgPoolResult(
+          input_a, row_seq, col_seq, overlapping)
+      self.assertSequenceEqual(expected.shape, actual.shape)
+      # Second run.
+      input_b = np.zeros([4, 60, 60, 3])
+      actual, row_seq, col_seq = sess.run([p, r, c], {input_holder: input_b})
+      expected = self._GetExpectedFractionalAvgPoolResult(
+          input_b, row_seq, col_seq, overlapping)
+      self.assertSequenceEqual(expected.shape, actual.shape)
+
 
 class FractionalAvgPoolGradTest(test.TestCase):
   """Tests for FractionalAvgPoolGrad.
diff --git a/tensorflow/python/kernel_tests/fractional_max_pool_op_test.py b/tensorflow/python/kernel_tests/fractional_max_pool_op_test.py
index d380c31de35510c415420b3302fe1d4ff07877d2..5983ae7759dbf3eb2db9867def829ce8dbeb4b73 100644
--- a/tensorflow/python/kernel_tests/fractional_max_pool_op_test.py
+++ b/tensorflow/python/kernel_tests/fractional_max_pool_op_test.py
@@ -23,6 +23,8 @@ import math
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_ops
@@ -281,6 +283,35 @@ class FractionalMaxPoolTest(test.TestCase):
     self._ValidateFractionalMaxPoolResult(rand_mat, [1, 2, 2, 1], pseudo_random,
                                           overlapping)
 
+  def testDifferentInputTensorShape(self):
+    """Runs the operation in one session with different input tensor shapes."""
+    with self.test_session() as sess:
+      input_holder = array_ops.placeholder(dtypes.float32,
+                                           [None, None, None, 3])
+      pooling_ratio = [1, 1.5, 1.5, 1]
+      pseudo_random = False
+      overlapping = False
+      p, r, c = nn_ops.fractional_max_pool(
+          input_holder,
+          pooling_ratio,
+          pseudo_random,
+          overlapping,
+          deterministic=True,
+          seed=self._SEED,
+          seed2=self._SEED2)
+      # First run.
+      input_a = np.zeros([3, 32, 32, 3])
+      actual, row_seq, col_seq = sess.run([p, r, c], {input_holder: input_a})
+      expected = self._GetExpectedFractionalMaxPoolResult(
+          input_a, row_seq, col_seq, overlapping)
+      self.assertSequenceEqual(expected.shape, actual.shape)
+      # Second run.
+      input_b = np.zeros([4, 45, 45, 3])
+      actual, row_seq, col_seq = sess.run([p, r, c], {input_holder: input_b})
+      expected = self._GetExpectedFractionalMaxPoolResult(
+          input_b, row_seq, col_seq, overlapping)
+      self.assertSequenceEqual(expected.shape, actual.shape)
+
 
 class FractionalMaxPoolGradTest(test.TestCase):
   """Tests for FractionalMaxPoolGrad.
diff --git a/tensorflow/python/kernel_tests/gather_nd_op_test.py b/tensorflow/python/kernel_tests/gather_nd_op_test.py
index 5109ed98c92002917a5dfa3b4cd79953fd950af8..91ebe8de9921268b2a3c5ad645585e1fe83c7419 100644
--- a/tensorflow/python/kernel_tests/gather_nd_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_nd_op_test.py
@@ -255,6 +255,35 @@ class GatherNdTest(test.TestCase):
     with self.test_session(use_gpu=True):
       self.assertAllEqual(expected_grads, grads.eval())
 
+  def testGradientsRank7Elements(self):
+    # Shape [1,1,2,1,1,2,2]
+    indices = constant_op.constant(
+        [[[
+            [[[[0, 0, 0, 0, 0, 1], [0, 0, 1, 0, 0, 0]]]],
+            [[[[0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 1]]]]
+        ]]],
+        dtype=dtypes.int32)
+    inputs = constant_op.constant(
+        [[[
+            [[[[1, 3], [5, 7]]]],
+            [[[[2, 4], [6, 8]]]]
+        ]]], dtype=dtypes.float64)
+    outputs = array_ops.gather_nd(inputs, indices)
+
+    grad_vals = constant_op.constant(
+        [[[
+            [[[[1, 2], [3, 4]]]],
+            [[[[5, 6], [7, 8]]]]
+        ]]], dtype=dtypes.float64)
+    grads = gradients_impl.gradients([outputs], [inputs], [grad_vals])[0]
+    expected_grads = np.array(
+        [[[
+            [[[[5, 6], [1, 2]]]],
+            [[[[3, 4], [7, 8]]]]
+        ]]], dtype=np.float64)
+    with self.test_session(use_gpu=True):
+      self.assertAllEqual(expected_grads, grads.eval())
+
   def testGradientsInt64Indices(self):
     indices = constant_op.constant(
         [[[0, 1], [1, 0]], [[0, 0], [1, 1]]], dtype=dtypes.int64)
diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py
index 157c0935403955d629b680eb6fc7627603ecbbf3..19a7d2f9d51fff46ee817ad03ef62383f6727791 100644
--- a/tensorflow/python/kernel_tests/init_ops_test.py
+++ b/tensorflow/python/kernel_tests/init_ops_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
@@ -146,6 +147,18 @@ class ConstantInitializersTest(test.TestCase):
       self.assertEqual(x.dtype.base_dtype, dtypes.int32)
       self.assertAllEqual(x.eval(), 7 * np.ones(shape, dtype=np.int32))
 
+  def testConstantTupleInitializer(self):
+    with self.test_session(use_gpu=True):
+      shape = [3]
+      x = variable_scope.get_variable(
+          "x",
+          shape=shape,
+          dtype=dtypes.int32,
+          initializer=init_ops.constant_initializer((10, 20, 30)))
+      x.initializer.run()
+      self.assertEqual(x.dtype.base_dtype, dtypes.int32)
+      self.assertAllEqual(x.eval(), [10, 20, 30])
+
   def _testNDimConstantInitializer(self, name, value, shape, expected):
     with self.test_session(use_gpu=True):
       init = init_ops.constant_initializer(value, dtype=dtypes.int32)
@@ -214,6 +227,16 @@ class ConstantInitializersTest(test.TestCase):
     self._testNDimConstantInitializerMoreValues(
         np.asarray(value).reshape(tuple([2, 4])), shape)
 
+  def testInvalidValueTypeForConstantInitializerCausesTypeError(self):
+    c = constant_op.constant([1.0, 2.0, 3.0])
+    with self.assertRaisesRegexp(
+        TypeError, r"Invalid type for initial value: .*Tensor.*"):
+      init_ops.constant_initializer(c, dtype=dtypes.float32)
+    v = variables.Variable([3.0, 2.0, 1.0])
+    with self.assertRaisesRegexp(
+        TypeError, r"Invalid type for initial value: .*Variable.*"):
+      init_ops.constant_initializer(v, dtype=dtypes.float32)
+
 
 class RandomNormalInitializationTest(test.TestCase):
 
diff --git a/tensorflow/python/kernel_tests/io_ops_test.py b/tensorflow/python/kernel_tests/io_ops_test.py
index f91875c6f0c1a7bfa388ec1b1a58f06b65889c3e..61944f7e3197844d00cbc001459e48b50c9003b4 100644
--- a/tensorflow/python/kernel_tests/io_ops_test.py
+++ b/tensorflow/python/kernel_tests/io_ops_test.py
@@ -1,4 +1,4 @@
-﻿# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD
index 4e18eaa4e8281c799e4669b2d6083c00bc1e2863..fd1b5bab6f5aa072c8821eb053bd8d39391be4d4 100644
--- a/tensorflow/python/kernel_tests/linalg/BUILD
+++ b/tensorflow/python/kernel_tests/linalg/BUILD
@@ -39,6 +39,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    shard_count = 5,
     tags = ["noasan"],  # times out b/63678675
 )
 
@@ -57,6 +58,7 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:random_ops",
     ],
+    shard_count = 5,
 )
 
 cuda_py_test(
@@ -73,6 +75,7 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:random_ops",
     ],
+    shard_count = 5,
 )
 
 cuda_py_test(
@@ -88,6 +91,7 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
     ],
+    shard_count = 5,
 )
 
 cuda_py_test(
@@ -134,6 +138,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    shard_count = 5,
 )
 
 filegroup(
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1577b7bc8021a326eb720bdf059b8d1c568c0cc1
--- /dev/null
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -0,0 +1,202 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ops which manipulate lists of tensors."""
+
+# pylint: disable=g-bad-name
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np  # pylint: disable=unused-import
+
+from tensorflow.python.client import session
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import list_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import server_lib
+
+
+def scalar_shape():
+  return ops.convert_to_tensor([], dtype=dtypes.int32)
+
+
+class ListOpsTest(test_util.TensorFlowTestCase):
+
+  def testPushPop(self):
+    l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
+                                   element_shape=scalar_shape())
+    l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
+    l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(e, 1.0)
+
+  def testPushPopGPU(self):
+    if not context.num_gpus():
+      return
+    with context.device("gpu:0"):
+      self.testPushPop()
+
+  def testStack(self):
+    l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
+                                   element_shape=scalar_shape())
+    l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
+    l = list_ops.tensor_list_push_back(l, constant_op.constant(2.0))
+    t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(t, [1.0, 2.0])
+
+  def testStackGPU(self):
+    if not context.num_gpus():
+      return
+    with context.device("gpu:0"):
+      self.testStack()
+
+  def testTensorListFromTensor(self):
+    t = constant_op.constant([1.0, 2.0])
+    l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
+    l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(e, 2.0)
+    l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(e, 1.0)
+    self.assertAllEqual(list_ops.tensor_list_length(l), 0)
+
+  def testFromTensorGPU(self):
+    if not context.num_gpus():
+      return
+    with context.device("gpu:0"):
+      self.testTensorListFromTensor()
+
+  def testGetSetItem(self):
+    t = constant_op.constant([1.0, 2.0])
+    l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
+    e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+    self.assertAllEqual(e0, 1.0)
+    l = list_ops.tensor_list_set_item(l, 0, 3.0)
+    t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(t, [3.0, 2.0])
+
+  def testGetSetGPU(self):
+    if not context.num_gpus():
+      return
+    with context.device("gpu:0"):
+      self.testGetSetItem()
+
+  def testUnknownShape(self):
+    l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
+                                   element_shape=-1)
+    l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
+    l = list_ops.tensor_list_push_back(l, constant_op.constant([1.0, 2.0]))
+    _, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(e, [1.0, 2.0])
+
+  def testCPUGPUCopy(self):
+    if not context.num_gpus():
+      return
+    t = constant_op.constant([1.0, 2.0])
+    l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
+    with context.device("gpu:0"):
+      l_gpu = array_ops.identity(l)
+      self.assertAllEqual(
+          list_ops.tensor_list_pop_back(
+              l_gpu, element_dtype=dtypes.float32)[1],
+          2.0)
+    l_cpu = array_ops.identity(l_gpu)
+    self.assertAllEqual(
+        list_ops.tensor_list_pop_back(
+            l_cpu, element_dtype=dtypes.float32)[1],
+        2.0)
+
+  def testSerialize(self):
+    # pylint: disable=g-import-not-at-top
+    try:
+      import portpicker
+    except ImportError:
+      return
+    with context.graph_mode():
+      worker_port = portpicker.pick_unused_port()
+      ps_port = portpicker.pick_unused_port()
+      cluster_dict = {
+          "worker": ["localhost:%s" % worker_port],
+          "ps": ["localhost:%s" % ps_port]
+      }
+      cs = server_lib.ClusterSpec(cluster_dict)
+
+      worker = server_lib.Server(
+          cs, job_name="worker", protocol="grpc", task_index=0, start=True)
+      unused_ps = server_lib.Server(
+          cs, job_name="ps", protocol="grpc", task_index=0, start=True)
+      with ops.Graph().as_default(), session.Session(target=worker.target):
+        with ops.device("/job:worker"):
+          t = constant_op.constant([[1.0], [2.0]])
+          l = list_ops.tensor_list_from_tensor(t, element_shape=[1])
+        with ops.device("/job:ps"):
+          l_ps = array_ops.identity(l)
+          l_ps, e = list_ops.tensor_list_pop_back(
+              l_ps, element_dtype=dtypes.float32)
+        with ops.device("/job:worker"):
+          worker_e = array_ops.identity(e)
+        self.assertAllEqual(worker_e.eval(), [2.0])
+
+  def testPushPopGradients(self):
+    with backprop.GradientTape() as tape:
+      l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
+                                     element_shape=scalar_shape())
+      c = constant_op.constant(1.0)
+      tape.watch(c)
+      l = list_ops.tensor_list_push_back(l, c)
+      l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+      e = 2 * e
+    self.assertAllEqual(tape.gradient(e, [c])[0], 2.0)
+
+  def testStackFromTensorGradients(self):
+    with backprop.GradientTape() as tape:
+      c = constant_op.constant([1.0, 2.0])
+      tape.watch(c)
+      l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+      c2 = list_ops.tensor_list_stack(
+          l, element_dtype=dtypes.float32)
+      result = c2 * 2.0
+    self.assertAllEqual(tape.gradient(result, [c])[0], [2.0, 2.0])
+
+  def testGetSetGradients(self):
+    with backprop.GradientTape() as tape:
+      c = constant_op.constant([1.0, 2.0])
+      tape.watch(c)
+      l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+      c2 = constant_op.constant(3.0)
+      tape.watch(c2)
+      l = list_ops.tensor_list_set_item(l, 0, c2)
+      e = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      ee = list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)
+      y = e * e + ee * ee
+    grad_c, grad_c2 = tape.gradient(y, [c, c2])
+    self.assertAllEqual(grad_c, [0.0, 4.0])
+    self.assertAllEqual(grad_c2, 6.0)
+
+  def testSetOutOfBounds(self):
+    c = constant_op.constant([1.0, 2.0])
+    l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+    with self.assertRaises(errors.InvalidArgumentError):
+      list_ops.tensor_list_set_item(l, 20, 3.0)
+
+
+if __name__ == "__main__":
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/python/kernel_tests/lookup_ops_test.py b/tensorflow/python/kernel_tests/lookup_ops_test.py
index e4c799cb1cfce35143b887feb9ae1af6455d7b25..5f08339fe5f15d12f08b0e18df03d612402d86f8 100644
--- a/tensorflow/python/kernel_tests/lookup_ops_test.py
+++ b/tensorflow/python/kernel_tests/lookup_ops_test.py
@@ -618,15 +618,20 @@ class IndexToStringTableFromFileTest(test.TestCase):
     return vocabulary_file
 
   def test_index_to_string_table(self):
-    vocabulary_file = self._createVocabFile("i2f_vocab1.txt")
-    with self.test_session():
-      table = lookup_ops.index_to_string_table_from_file(
-          vocabulary_file=vocabulary_file)
-      features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
-      self.assertRaises(errors_impl.OpError, features.eval)
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
-                          features.eval())
+    vocabulary_path = self._createVocabFile("i2f_vocab1.txt")
+    # vocabulary_file supports string and tensor
+    type_funcs = [str, constant_op.constant]
+    for type_func in type_funcs:
+      vocabulary_file = type_func(vocabulary_path)
+      with self.test_session():
+        table = lookup_ops.index_to_string_table_from_file(
+            vocabulary_file=vocabulary_file)
+        features = table.lookup(
+            constant_op.constant([0, 1, 2, 3], dtypes.int64))
+        self.assertRaises(errors_impl.OpError, features.eval)
+        lookup_ops.tables_initializer().run()
+        self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
+                            features.eval())
 
   def test_index_to_string_table_from_multicolumn_file(self):
     vocabulary_file = self._createVocabFile(
diff --git a/tensorflow/python/kernel_tests/losses_test.py b/tensorflow/python/kernel_tests/losses_test.py
index da57f918ac286bb59e0525a02482b672dc40dc89..197dbf44afaea2cfaf5a1ffebb6ac0a6be09d165 100644
--- a/tensorflow/python/kernel_tests/losses_test.py
+++ b/tensorflow/python/kernel_tests/losses_test.py
@@ -953,14 +953,14 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
     # Compute the expected loss 'manually'.
     total = np.zeros((batch_size,))
     for b in range(batch_size):
-      for i in range(dims):
-        for j in range(dims):
+      for i in range(dims - 1):
+        for j in range(i + 1, dims):
           x = self._predictions[b, i].item() - self._predictions[b, j].item()
           y = self._labels[b, i].item() - self._labels[b, j].item()
           diff = (x - y)
           total[b] += (diff * diff)
 
-    self._expected_losses = np.divide(total, 9.0)
+    self._expected_losses = np.divide(total, 3.0)
 
   def testValueErrorThrownWhenWeightIsNone(self):
     with self.test_session():
@@ -1059,8 +1059,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
         [[4, 8, 12], [1, 2, 3], [4, 5, 6]],
         [[8, 1, 3], [7, 8, 9], [10, 11, 12]],
     ])
-    self._test_valid_weights(
-        labels, predictions, expected_loss=122.22222)
+    self._test_valid_weights(labels, predictions, expected_loss=137.5)
 
   def test3dWeightedScalar(self):
     labels = np.array([
@@ -1073,8 +1072,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
     ])
     weight = 3.0
     self._test_valid_weights(
-        labels, predictions, expected_loss=weight * 122.22222,
-        weights=weight)
+        labels, predictions, expected_loss=weight * 137.5, weights=weight)
 
   def _test_invalid_weights(
       self, labels, predictions, weights=1.0):
@@ -1124,7 +1122,9 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
     ])
     self._test_valid_weights(
         # TODO(ptucker): This doesn't look right.
-        labels, predictions, expected_loss=9 * 122.22222,
+        labels,
+        predictions,
+        expected_loss=9 * 137.5,
         weights=np.ones((2, 3, 3)))
 
   def testLossWithAllZeroBatchSpecificWeights(self):
@@ -1340,11 +1340,39 @@ class ComputeWeightedLossTest(test.TestCase):
               self.assertAllClose(
                   np.sum(self._raw_losses), unweighted_loss.eval())
             else:
-              # reduction one of losses.Reduction.MEAN and
-              # losses.Reduction.SUM_BY_NONZERO_WEIGHTS.
+              # reduction one of MEAN, SUM_OVER_NONZERO_WEIGHTS,
+              # SUM_BY_NONZERO_WEIGHTS or SUM_OVER_BATCH_SIZE.
               self.assertAllClose(
                   np.mean(self._raw_losses), unweighted_loss.eval())
 
+  def testUnweightedFromPlaceholder(self):
+    for reduction in losses.Reduction.all():
+      with ops.Graph().as_default() as g:
+        self.assertEqual(0, len(util.get_losses()))
+        raw_losses = array_ops.placeholder(dtype=dtypes.float32)
+        feed_dict = {raw_losses: self._raw_losses}
+        unweighted_losses = (
+            losses.compute_weighted_loss(raw_losses, reduction=reduction),
+            losses.compute_weighted_loss(
+                raw_losses, weights=np.ones((1, 1, 1)), reduction=reduction),
+            losses.compute_weighted_loss(
+                raw_losses, weights=np.ones((1, 1, 4)), reduction=reduction),
+        )
+        self.assertEqual(3, len(util.get_losses()))
+        with self.test_session(g):
+          for unweighted_loss in unweighted_losses:
+            if reduction == losses.Reduction.NONE:
+              self.assertAllClose(
+                  self._raw_losses, unweighted_loss.eval(feed_dict))
+            elif reduction == losses.Reduction.SUM:
+              self.assertAllClose(
+                  np.sum(self._raw_losses), unweighted_loss.eval(feed_dict))
+            else:
+              # reduction one of MEAN, SUM_OVER_NONZERO_WEIGHTS,
+              # SUM_BY_NONZERO_WEIGHTS or SUM_OVER_BATCH_SIZE.
+              self.assertAllClose(
+                  np.mean(self._raw_losses), unweighted_loss.eval(feed_dict))
+
   def testScalarWeight(self):
     with ops.Graph().as_default():
       self.assertEqual(0, len(util.get_losses()))
@@ -1435,10 +1463,15 @@ class ComputeWeightedLossTest(test.TestCase):
               self.assertAllClose(
                   weighted_sum / np.sum(broadcast_weights),
                   weighted_loss.eval())
-            elif reduction == losses.Reduction.SUM_BY_NONZERO_WEIGHTS:
+            elif (reduction == losses.Reduction.SUM_OVER_NONZERO_WEIGHTS or
+                  reduction == losses.Reduction.SUM_BY_NONZERO_WEIGHTS):
               self.assertAllClose(
                   weighted_sum / np.count_nonzero(broadcast_weights),
                   weighted_loss.eval())
+            elif reduction == losses.Reduction.SUM_OVER_BATCH_SIZE:
+              self.assertAllClose(
+                  weighted_sum / self._raw_losses.size,
+                  weighted_loss.eval())
 
   def test1x1x1Weight(self):
     self._test_valid_weights((((17.0,),),))
diff --git a/tensorflow/python/kernel_tests/manip_ops_test.py b/tensorflow/python/kernel_tests/manip_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8200ac0cb1e4315a56181779c70da1126d8fc15
--- /dev/null
+++ b/tensorflow/python/kernel_tests/manip_ops_test.py
@@ -0,0 +1,138 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for manip_ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import manip_ops
+from tensorflow.python.platform import test as test_lib
+
+# pylint: disable=g-import-not-at-top
+try:
+  from distutils.version import StrictVersion as Version
+  # numpy.roll for multiple shifts was introduced in numpy version 1.12.0
+  NP_ROLL_CAN_MULTISHIFT = Version(np.version.version) >= Version("1.12.0")
+except ImportError:
+  NP_ROLL_CAN_MULTISHIFT = False
+# pylint: enable=g-import-not-at-top
+
+
+class RollTest(test_util.TensorFlowTestCase):
+
+  def _testRoll(self, np_input, shift, axis):
+    expected_roll = np.roll(np_input, shift, axis)
+    with self.test_session():
+      roll = manip_ops.roll(np_input, shift, axis)
+      self.assertAllEqual(roll.eval(), expected_roll)
+
+  def _testGradient(self, np_input, shift, axis):
+    with self.test_session():
+      inx = constant_op.constant(np_input.tolist())
+      xs = list(np_input.shape)
+      y = manip_ops.roll(inx, shift, axis)
+      # Expected y's shape to be the same
+      ys = xs
+      jacob_t, jacob_n = gradient_checker.compute_gradient(
+          inx, xs, y, ys, x_init_value=np_input)
+      self.assertAllClose(jacob_t, jacob_n, rtol=1e-5, atol=1e-5)
+
+  def _testAll(self, np_input, shift, axis):
+    self._testRoll(np_input, shift, axis)
+    if np_input.dtype == np.float32:
+      self._testGradient(np_input, shift, axis)
+
+  def testIntTypes(self):
+    for t in [np.int32, np.int64]:
+      self._testAll(np.random.randint(-100, 100, (5)).astype(t), 3, 0)
+      if NP_ROLL_CAN_MULTISHIFT:
+        self._testAll(
+            np.random.randint(-100, 100, (4, 4, 3)).astype(t), [1, -2, 3],
+            [0, 1, 2])
+        self._testAll(
+            np.random.randint(-100, 100, (4, 2, 1, 3)).astype(t), [0, 1, -2],
+            [1, 2, 3])
+
+  def testFloatTypes(self):
+    for t in [np.float32, np.float64]:
+      self._testAll(np.random.rand(5).astype(t), 2, 0)
+      if NP_ROLL_CAN_MULTISHIFT:
+        self._testAll(np.random.rand(3, 4).astype(t), [1, 2], [1, 0])
+        self._testAll(np.random.rand(1, 3, 4).astype(t), [1, 0, -3], [0, 1, 2])
+
+  def testComplexTypes(self):
+    for t in [np.complex64, np.complex128]:
+      x = np.random.rand(4, 4).astype(t)
+      self._testAll(x + 1j * x, 2, 0)
+      if NP_ROLL_CAN_MULTISHIFT:
+        x = np.random.rand(2, 5).astype(t)
+        self._testAll(x + 1j * x, [1, 2], [1, 0])
+        x = np.random.rand(3, 2, 1, 1).astype(t)
+        self._testAll(x + 1j * x, [2, 1, 1, 0], [0, 3, 1, 2])
+
+  def testRollInputMustVectorHigherRaises(self):
+    tensor = 7
+    shift = 1
+    axis = 0
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "input must be 1-D or higher"):
+        manip_ops.roll(tensor, shift, axis).eval()
+
+  def testRollAxisMustBeScalarOrVectorRaises(self):
+    tensor = [[1, 2], [3, 4]]
+    shift = 1
+    axis = [[0, 1]]
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "axis must be a scalar or a 1-D vector"):
+        manip_ops.roll(tensor, shift, axis).eval()
+
+  def testRollShiftMustBeScalarOrVectorRaises(self):
+    tensor = [[1, 2], [3, 4]]
+    shift = [[0, 1]]
+    axis = 1
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "shift must be a scalar or a 1-D vector"):
+        manip_ops.roll(tensor, shift, axis).eval()
+
+  def testRollShiftAndAxisMustBeSameSizeRaises(self):
+    tensor = [[1, 2], [3, 4]]
+    shift = [1]
+    axis = [0, 1]
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "shift and axis must have the same size"):
+        manip_ops.roll(tensor, shift, axis).eval()
+
+  def testRollAxisOutOfRangeRaises(self):
+    tensor = [1, 2]
+    shift = 1
+    axis = 1
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "is out of range"):
+        manip_ops.roll(tensor, shift, axis).eval()
+
+
+if __name__ == "__main__":
+  test_lib.main()
diff --git a/tensorflow/python/kernel_tests/map_stage_op_test.py b/tensorflow/python/kernel_tests/map_stage_op_test.py
index 8b669450590f1fce0f14a9e5d64e1055dbe23f4e..acfafde9e0f74d4e3ad6f2ee8ada9da3df94f5b9 100644
--- a/tensorflow/python/kernel_tests/map_stage_op_test.py
+++ b/tensorflow/python/kernel_tests/map_stage_op_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.platform import test
 
 TIMEOUT = 1
 
+
 class MapStageTest(test.TestCase):
 
   def testSimple(self):
@@ -83,7 +84,7 @@ class MapStageTest(test.TestCase):
             [dtypes.float32, dtypes.float32],
             shapes=[[], [128, 128]],
             names=['x', 'v'])
-        stage = stager.put(pi,{'x': x, 'v': v})
+        stage = stager.put(pi, {'x': x, 'v': v})
         key, ret = stager.get(gi)
         z = ret['x']
         y = ret['v']
@@ -128,8 +129,11 @@ class MapStageTest(test.TestCase):
         gi = array_ops.placeholder(dtypes.int64)
         p = array_ops.placeholder(dtypes.int32, name='p')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.MapStagingArea([dtypes.int32, ], shapes=[[]])
-        stage = stager.put(pi,[x], [0])
+        stager = data_flow_ops.MapStagingArea(
+            [
+                dtypes.int32,
+            ], shapes=[[]])
+        stage = stager.put(pi, [x], [0])
         peek = stager.peek(gi)
         size = stager.size()
 
@@ -158,7 +162,7 @@ class MapStageTest(test.TestCase):
             [dtypes.float32, dtypes.float32],
             shapes=[[], [128, 128]],
             names=['x', 'v'])
-        stage = stager.put(pi,{'x': x, 'v': v})
+        stage = stager.put(pi, {'x': x, 'v': v})
         size = stager.size()
         clear = stager.clear()
 
@@ -172,7 +176,6 @@ class MapStageTest(test.TestCase):
       sess.run(clear)
       self.assertEqual(sess.run(size), 0)
 
-
   def testCapacity(self):
     capacity = 3
 
@@ -182,8 +185,10 @@ class MapStageTest(test.TestCase):
         pi = array_ops.placeholder(dtypes.int64, name='pi')
         gi = array_ops.placeholder(dtypes.int64, name='gi')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.MapStagingArea([dtypes.int32, ],
-          capacity=capacity, shapes=[[]])
+        stager = data_flow_ops.MapStagingArea(
+            [
+                dtypes.int32,
+            ], capacity=capacity, shapes=[[]])
 
       stage = stager.put(pi, [x], [0])
       get = stager.get()
@@ -222,9 +227,8 @@ class MapStageTest(test.TestCase):
         self.fail("Expected to timeout on iteration '{}' "
                   "but instead timed out on iteration '{}' "
                   "Staging Area size is '{}' and configured "
-                  "capacity is '{}'.".format(capacity, i,
-                                            sess.run(size),
-                                            capacity))
+                  "capacity is '{}'.".format(capacity, i, sess.run(size),
+                                             capacity))
 
       # Should have capacity elements in the staging area
       self.assertTrue(sess.run(size) == capacity)
@@ -236,8 +240,8 @@ class MapStageTest(test.TestCase):
       self.assertTrue(sess.run(size) == 0)
 
   def testMemoryLimit(self):
-    memory_limit = 512*1024  # 512K
-    chunk = 200*1024 # 256K
+    memory_limit = 512 * 1024  # 512K
+    chunk = 200 * 1024  # 256K
     capacity = memory_limit // chunk
 
     with ops.Graph().as_default() as G:
@@ -246,8 +250,8 @@ class MapStageTest(test.TestCase):
         pi = array_ops.placeholder(dtypes.int64, name='pi')
         gi = array_ops.placeholder(dtypes.int64, name='gi')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.MapStagingArea([dtypes.uint8],
-          memory_limit=memory_limit, shapes=[[]])
+        stager = data_flow_ops.MapStagingArea(
+            [dtypes.uint8], memory_limit=memory_limit, shapes=[[]])
         stage = stager.put(pi, [x], [0])
         get = stager.get()
         size = stager.size()
@@ -287,9 +291,8 @@ class MapStageTest(test.TestCase):
         self.fail("Expected to timeout on iteration '{}' "
                   "but instead timed out on iteration '{}' "
                   "Staging Area size is '{}' and configured "
-                  "capacity is '{}'.".format(capacity, i,
-                                            sess.run(size),
-                                            capacity))
+                  "capacity is '{}'.".format(capacity, i, sess.run(size),
+                                             capacity))
 
       # Should have capacity elements in the staging area
       self.assertTrue(sess.run(size) == capacity)
@@ -310,8 +313,10 @@ class MapStageTest(test.TestCase):
         pi = array_ops.placeholder(dtypes.int64, name='pi')
         gi = array_ops.placeholder(dtypes.int64, name='gi')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.MapStagingArea([dtypes.int32, ],
-          shapes=[[]], ordered=True)
+        stager = data_flow_ops.MapStagingArea(
+            [
+                dtypes.int32,
+            ], shapes=[[]], ordered=True)
         stage = stager.put(pi, [x], [0])
         get = stager.get()
         size = stager.size()
@@ -349,7 +354,7 @@ class MapStageTest(test.TestCase):
         stager = data_flow_ops.MapStagingArea(
             [dtypes.float32, dtypes.float32, dtypes.float32],
             names=['x', 'v', 'f'])
-        stage_xf = stager.put(pi,{'x': x, 'f': f})
+        stage_xf = stager.put(pi, {'x': x, 'f': f})
         stage_v = stager.put(pi, {'v': v})
         key, ret = stager.get(gi)
         size = stager.size()
@@ -373,12 +378,13 @@ class MapStageTest(test.TestCase):
       self.assertTrue(sess.run([size, isize]) == [1, 1])
       # We can now obtain tuple associated with key 0
       self.assertTrue(
-          sess.run([key, ret],
-                   feed_dict={gi: 0}) == [0, {
-                       'x': 1,
-                       'f': 2,
-                       'v': 1
-                   }])
+          sess.run([key, ret], feed_dict={
+              gi: 0
+          }) == [0, {
+              'x': 1,
+              'f': 2,
+              'v': 1
+          }])
 
       # 0 complete and 1 incomplete entry
       self.assertTrue(sess.run([size, isize]) == [0, 1])
@@ -386,12 +392,13 @@ class MapStageTest(test.TestCase):
       sess.run(stage_v, feed_dict={pi: 1, v: 3})
       # We can now obtain tuple associated with key 1
       self.assertTrue(
-          sess.run([key, ret],
-                   feed_dict={gi: 1}) == [1, {
-                       'x': 1,
-                       'f': 2,
-                       'v': 3
-                   }])
+          sess.run([key, ret], feed_dict={
+              gi: 1
+          }) == [1, {
+              'x': 1,
+              'f': 2,
+              'v': 3
+          }])
 
   def testPartialIndexInsert(self):
     with ops.Graph().as_default() as G:
@@ -450,7 +457,7 @@ class MapStageTest(test.TestCase):
         stager = data_flow_ops.MapStagingArea(
             [dtypes.float32, dtypes.float32, dtypes.float32],
             names=['x', 'v', 'f'])
-        stage_xf = stager.put(pi,{'x': x, 'f': f})
+        stage_xf = stager.put(pi, {'x': x, 'f': f})
         stage_v = stager.put(pi, {'v': v})
         peek_xf = stager.peek(pei, ['x', 'f'])
         peek_v = stager.peek(pei, ['v'])
@@ -487,11 +494,12 @@ class MapStageTest(test.TestCase):
 
       # We can now obtain 'x' and 'f' values associated with key 0
       self.assertTrue(
-          sess.run([key_xf, get_xf],
-                   feed_dict={gi: 0}) == [0, {
-                       'x': 1,
-                       'f': 2
-                   }])
+          sess.run([key_xf, get_xf], feed_dict={
+              gi: 0
+          }) == [0, {
+              'x': 1,
+              'f': 2
+          }])
       # Still have 1 complete and 1 incomplete entry
       self.assertTrue(sess.run([size, isize]) == [1, 1])
 
@@ -499,14 +507,15 @@ class MapStageTest(test.TestCase):
       with self.assertRaises(errors.InvalidArgumentError) as cm:
         sess.run([key_xf, get_xf], feed_dict={gi: 0})
 
-      exc_str = ("Tensor at index '0' for key '0' "
-                "has already been removed.")
+      exc_str = ("Tensor at index '0' for key '0' " 'has already been removed.')
 
       self.assertTrue(exc_str in cm.exception.message)
 
       # Obtain 'v' value associated with key 0
       self.assertTrue(
-          sess.run([key_v, get_v], feed_dict={gi: 0}) == [0, {
+          sess.run([key_v, get_v], feed_dict={
+              gi: 0
+          }) == [0, {
               'v': 1
           }])
       # 0 complete and 1 incomplete entry
@@ -523,7 +532,9 @@ class MapStageTest(test.TestCase):
       self.assertTrue(sess.run([size, isize]) == [1, 0])
       # We can now obtain 'x' and 'f' values associated with key 1
       self.assertTrue(
-          sess.run([pop_key_v, pop_v], feed_dict={pi: 1}) == [1, {
+          sess.run([pop_key_v, pop_v], feed_dict={
+              pi: 1
+          }) == [1, {
               'v': 1
           }])
       # Nothing is left
@@ -557,18 +568,20 @@ class MapStageTest(test.TestCase):
       self.assertTrue(sess.run([size, isize]) == [1, 0])
 
       # Partial get using indices
-      self.assertTrue(sess.run([key_xf, get_xf],
-            feed_dict={gi: 0}) == [0, [1, 2]])
+      self.assertTrue(
+          sess.run([key_xf, get_xf], feed_dict={
+              gi: 0
+          }) == [0, [1, 2]])
 
       # Still some of key 0 left
       self.assertTrue(sess.run([size, isize]) == [1, 0])
 
       # Partial get of remaining index
-      self.assertTrue(sess.run([key_v, get_v],
-            feed_dict={gi: 0}) == [0, [3]])
+      self.assertTrue(sess.run([key_v, get_v], feed_dict={gi: 0}) == [0, [3]])
 
       # All gone
       self.assertTrue(sess.run([size, isize]) == [0, 0])
 
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/kernel_tests/matrix_band_part_op_test.py b/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
index 317b8dc05beac7642c384bf89e6d154be50f6992..68d626de2c5cdd91ee332247c05ddce2a558a35e 100644
--- a/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
@@ -21,6 +21,7 @@ import numpy as np
 
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -54,9 +55,13 @@ def _GetMatrixBandPartTest(dtype_, batch_shape_, shape_):
           band_np = np.tril(band_np, upper)
         if batch_shape_ is not ():
           band_np = np.tile(band_np, batch_shape_ + (1, 1))
-        with self.test_session(use_gpu=False):
-          band = array_ops.matrix_band_part(batch_mat, lower, upper)
-          self.assertAllEqual(band_np, band.eval())
+        for index_dtype in [dtypes_lib.int32, dtypes_lib.int64]:
+          with self.test_session(use_gpu=False):
+            band = array_ops.matrix_band_part(
+                batch_mat,
+                constant_op.constant(lower, index_dtype),
+                constant_op.constant(upper, index_dtype))
+            self.assertAllEqual(band_np, band.eval())
 
   return Test
 
diff --git a/tensorflow/python/kernel_tests/matrix_exponential_op_test.py b/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
index c5a7a3ba99ba161c197643a3e3c5aed5d37e9d2b..6203a412d7faec4fe9f6179141301579b5900291 100644
--- a/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
@@ -46,10 +46,8 @@ def np_expm(x):
 class ExponentialOpTest(test.TestCase):
 
   def _verifyExponential(self, x, np_type):
-    # TODO(pfau): add matrix logarithm and test that it is inverse of expm.
     inp = x.astype(np_type)
     with self.test_session(use_gpu=True):
-      # Verify that x^{-1} * x == Identity matrix.
       tf_ans = gen_linalg_ops._matrix_exponential(inp)
       if x.size == 0:
         np_ans = np.empty(x.shape, dtype=np_type)
@@ -121,7 +119,7 @@ class ExponentialOpTest(test.TestCase):
       gen_linalg_ops._matrix_exponential(np.array([[1., 2., 3.], [3., 4., 5.]]))
 
   def testWrongDimensions(self):
-    # The input to the inverse should be at least a 2-dimensional tensor.
+    # The input to the exponential should be at least a 2-dimensional tensor.
     tensor3 = constant_op.constant([1., 2.])
     with self.assertRaises(ValueError):
       gen_linalg_ops._matrix_exponential(tensor3)
diff --git a/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py b/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..18ed59828c15f5ad21fe054cd6e40991c02bb356
--- /dev/null
+++ b/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
@@ -0,0 +1,166 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.gen_linalg_ops.matrix_logarithm."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class LogarithmOpTest(test.TestCase):
+
+  def _verifyLogarithm(self, x, np_type):
+    inp = x.astype(np_type)
+    with self.test_session(use_gpu=True):
+      # Verify that expm(logm(A)) == A.
+      tf_ans = gen_linalg_ops._matrix_exponential(
+          gen_linalg_ops._matrix_logarithm(inp))
+      out = tf_ans.eval()
+      self.assertAllClose(inp, out, rtol=1e-4, atol=1e-3)
+
+  def _verifyLogarithmComplex(self, x):
+    for np_type in [np.complex64, np.complex128]:
+      self._verifyLogarithm(x, np_type)
+
+  def _makeBatch(self, matrix1, matrix2):
+    matrix_batch = np.concatenate(
+        [np.expand_dims(matrix1, 0),
+         np.expand_dims(matrix2, 0)])
+    matrix_batch = np.tile(matrix_batch, [2, 3, 1, 1])
+    return matrix_batch
+
+  def testNonsymmetric(self):
+    # 2x2 matrices
+    matrix1 = np.array([[1., 2.], [3., 4.]])
+    matrix2 = np.array([[1., 3.], [3., 5.]])
+    matrix1 = matrix1.astype(np.complex64)
+    matrix1 += 1j * matrix1
+    matrix2 = matrix2.astype(np.complex64)
+    matrix2 += 1j * matrix2
+    self._verifyLogarithmComplex(matrix1)
+    self._verifyLogarithmComplex(matrix2)
+    # Complex batch
+    self._verifyLogarithmComplex(self._makeBatch(matrix1, matrix2))
+
+  def testSymmetricPositiveDefinite(self):
+    # 2x2 matrices
+    matrix1 = np.array([[2., 1.], [1., 2.]])
+    matrix2 = np.array([[3., -1.], [-1., 3.]])
+    matrix1 = matrix1.astype(np.complex64)
+    matrix1 += 1j * matrix1
+    matrix2 = matrix2.astype(np.complex64)
+    matrix2 += 1j * matrix2
+    self._verifyLogarithmComplex(matrix1)
+    self._verifyLogarithmComplex(matrix2)
+    # Complex batch
+    self._verifyLogarithmComplex(self._makeBatch(matrix1, matrix2))
+
+  def testNonSquareMatrix(self):
+    # When the logarithm of a non-square matrix is attempted we should return
+    # an error
+    with self.assertRaises(ValueError):
+      gen_linalg_ops._matrix_logarithm(
+          np.array([[1., 2., 3.], [3., 4., 5.]], dtype=np.complex64))
+
+  def testWrongDimensions(self):
+    # The input to the logarithm should be at least a 2-dimensional tensor.
+    tensor3 = constant_op.constant([1., 2.], dtype=dtypes.complex64)
+    with self.assertRaises(ValueError):
+      gen_linalg_ops._matrix_logarithm(tensor3)
+
+  def testEmpty(self):
+    self._verifyLogarithmComplex(np.empty([0, 2, 2], dtype=np.complex64))
+    self._verifyLogarithmComplex(np.empty([2, 0, 0], dtype=np.complex64))
+
+  def testRandomSmallAndLarge(self):
+    np.random.seed(42)
+    for dtype in np.complex64, np.complex128:
+      for batch_dims in [(), (1,), (3,), (2, 2)]:
+        for size in 8, 31, 32:
+          shape = batch_dims + (size, size)
+          matrix = np.random.uniform(
+              low=-1.0, high=1.0,
+              size=np.prod(shape)).reshape(shape).astype(dtype)
+          self._verifyLogarithmComplex(matrix)
+
+  def testConcurrentExecutesWithoutError(self):
+    with self.test_session(use_gpu=True) as sess:
+      matrix1 = math_ops.cast(
+          random_ops.random_normal([5, 5], seed=42), dtypes.complex64)
+      matrix2 = math_ops.cast(
+          random_ops.random_normal([5, 5], seed=42), dtypes.complex64)
+      logm1 = gen_linalg_ops._matrix_logarithm(matrix1)
+      logm2 = gen_linalg_ops._matrix_logarithm(matrix2)
+      logm = sess.run([logm1, logm2])
+      self.assertAllEqual(logm[0], logm[1])
+
+
+class MatrixLogarithmBenchmark(test.Benchmark):
+
+  shapes = [
+      (4, 4),
+      (10, 10),
+      (16, 16),
+      (101, 101),
+      (256, 256),
+      (1000, 1000),
+      (1024, 1024),
+      (2048, 2048),
+      (513, 4, 4),
+      (513, 16, 16),
+      (513, 256, 256),
+  ]
+
+  def _GenerateMatrix(self, shape):
+    batch_shape = shape[:-2]
+    shape = shape[-2:]
+    assert shape[0] == shape[1]
+    n = shape[0]
+    matrix = np.ones(shape).astype(np.complex64) / (
+        2.0 * n) + np.diag(np.ones(n).astype(np.complex64))
+    return variables.Variable(np.tile(matrix, batch_shape + (1, 1)))
+
+  def benchmarkMatrixLogarithmOp(self):
+    for shape in self.shapes:
+      with ops.Graph().as_default(), \
+          session.Session() as sess, \
+          ops.device("/cpu:0"):
+        matrix = self._GenerateMatrix(shape)
+        logm = gen_linalg_ops._matrix_logarithm(matrix)
+        variables.global_variables_initializer().run()
+        self.run_op_benchmark(
+            sess,
+            control_flow_ops.group(logm),
+            min_iters=25,
+            name="matrix_logarithm_cpu_{shape}".format(
+                shape=shape))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/metrics_test.py b/tensorflow/python/kernel_tests/metrics_test.py
index 3358b78efd22f86b455041d72e6ff663f74acdd8..e0e752147cdf8690d22fa782aca2561b2935fa8e 100644
--- a/tensorflow/python/kernel_tests/metrics_test.py
+++ b/tensorflow/python/kernel_tests/metrics_test.py
@@ -3628,7 +3628,8 @@ class MeanPerClassAccuracyTest(test.TestCase):
         predictions=array_ops.ones([10, 1]),
         labels=array_ops.ones([10, 1]),
         num_classes=2)
-    _assert_metric_variables(self, ('mean_accuracy/total_confusion_matrix:0',))
+    _assert_metric_variables(self, ('mean_accuracy/count:0',
+                                    'mean_accuracy/total:0'))
 
   def testMetricsCollections(self):
     my_collection_name = '__metrics__'
@@ -3797,23 +3798,6 @@ class MeanPerClassAccuracyTest(test.TestCase):
       desired_output = np.mean([1.0 / 2.0, 2.0 / 3.0, 0.])
       self.assertAlmostEqual(desired_output, mean_accuracy.eval())
 
-  def testUpdateOpEvalIsAccumulatedConfusionMatrix(self):
-    predictions = array_ops.concat([
-        constant_op.constant(0, shape=[5]), constant_op.constant(1, shape=[5])
-    ], 0)
-    labels = array_ops.concat([
-        constant_op.constant(0, shape=[3]), constant_op.constant(1, shape=[7])
-    ], 0)
-    num_classes = 2
-    with self.test_session() as sess:
-      mean_accuracy, update_op = metrics.mean_per_class_accuracy(
-          labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
-      confusion_matrix = update_op.eval()
-      self.assertAllEqual([[3, 0], [2, 5]], confusion_matrix)
-      desired_mean_accuracy = np.mean([3. / 3., 5. / 7.])
-      self.assertAlmostEqual(desired_mean_accuracy, mean_accuracy.eval())
-
   def testAllCorrect(self):
     predictions = array_ops.zeros([40])
     labels = array_ops.zeros([40])
@@ -3822,7 +3806,7 @@ class MeanPerClassAccuracyTest(test.TestCase):
       mean_accuracy, update_op = metrics.mean_per_class_accuracy(
           labels, predictions, num_classes)
       sess.run(variables.local_variables_initializer())
-      self.assertEqual(40, update_op.eval()[0])
+      self.assertEqual(1.0, update_op.eval()[0])
       self.assertEqual(1.0, mean_accuracy.eval())
 
   def testAllWrong(self):
@@ -3833,7 +3817,7 @@ class MeanPerClassAccuracyTest(test.TestCase):
       mean_accuracy, update_op = metrics.mean_per_class_accuracy(
           labels, predictions, num_classes)
       sess.run(variables.local_variables_initializer())
-      self.assertAllEqual([[0, 0], [40, 0]], update_op.eval())
+      self.assertAllEqual([0.0, 0.0], update_op.eval())
       self.assertEqual(0., mean_accuracy.eval())
 
   def testResultsWithSomeMissing(self):
@@ -3852,8 +3836,9 @@ class MeanPerClassAccuracyTest(test.TestCase):
       mean_accuracy, update_op = metrics.mean_per_class_accuracy(
           labels, predictions, num_classes, weights=weights)
       sess.run(variables.local_variables_initializer())
-      self.assertAllEqual([[2, 0], [2, 4]], update_op.eval())
-      desired_mean_accuracy = np.mean([2. / 2., 4. / 6.])
+      desired_accuracy = np.array([2. / 2., 4. / 6.], dtype=np.float32)
+      self.assertAllEqual(desired_accuracy, update_op.eval())
+      desired_mean_accuracy = np.mean(desired_accuracy)
       self.assertAlmostEqual(desired_mean_accuracy, mean_accuracy.eval())
 
 
diff --git a/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
index 30795eed8a063076a69ec2ec7851788775fe4dc6..d8ce9fffbd2bc0d18033339a02e0ad84f8f4c952 100644
--- a/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
@@ -148,7 +148,7 @@ class DepthwiseConv2DTest(test.TestCase):
     print("depthwise conv_2d: ", tensor_in_sizes, "*", filter_in_sizes,
           ", stride:", stride, ", padding: ", padding, ", max diff: ",
           np.amax(np.absolute(native_result - interface_result)))
-    self.assertArrayNear(
+    self.assertAllClose(
         np.ravel(native_result), np.ravel(interface_result), 1e-5)
     self.assertShapeEqual(native_result, conv_native)
     self.assertShapeEqual(native_result, conv_interface)
@@ -213,7 +213,7 @@ class DepthwiseConv2DTest(test.TestCase):
             t1, t2, strides=[1, stride, stride, 1], padding=padding)
         value = sess.run(conv)
     print("value = ", value)
-    self.assertArrayNear(expected, np.ravel(value), 1e-5)
+    self.assertAllClose(expected, np.ravel(value), 1e-5)
     self.assertShapeEqual(value, conv)
 
   def testConv2D2x2Filter(self):
diff --git a/tensorflow/python/kernel_tests/parse_single_example_op_test.py b/tensorflow/python/kernel_tests/parse_single_example_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf4c89b368c98c54001ad533dec847c41f5b3e43
--- /dev/null
+++ b/tensorflow/python/kernel_tests/parse_single_example_op_test.py
@@ -0,0 +1,937 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.parsing_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+import numpy as np
+
+from tensorflow.core.example import example_pb2
+from tensorflow.core.example import feature_pb2
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+# Helpers for creating Example objects
+example = example_pb2.Example
+feature = feature_pb2.Feature
+features = lambda d: feature_pb2.Features(feature=d)
+bytes_feature = lambda v: feature(bytes_list=feature_pb2.BytesList(value=v))
+int64_feature = lambda v: feature(int64_list=feature_pb2.Int64List(value=v))
+float_feature = lambda v: feature(float_list=feature_pb2.FloatList(value=v))
+# Helpers for creating SequenceExample objects
+feature_list = lambda l: feature_pb2.FeatureList(feature=l)
+feature_lists = lambda d: feature_pb2.FeatureLists(feature_list=d)
+sequence_example = example_pb2.SequenceExample
+
+
+def empty_sparse(dtype, shape=None):
+  if shape is None:
+    shape = [0]
+  return (np.empty(shape=(0, len(shape)), dtype=np.int64),
+          np.array([], dtype=dtype), np.array(shape, dtype=np.int64))
+
+
+def flatten(list_of_lists):
+  """Flatten one level of nesting."""
+  return itertools.chain.from_iterable(list_of_lists)
+
+
+def flatten_values_tensors_or_sparse(tensors_list):
+  """Flatten each SparseTensor object into 3 Tensors for session.run()."""
+  return list(
+      flatten([[v.indices, v.values, v.dense_shape] if isinstance(
+          v, sparse_tensor.SparseTensor) else [v] for v in tensors_list]))
+
+
+def _compare_output_to_expected(tester, dict_tensors, expected_tensors,
+                                flat_output):
+  tester.assertEqual(set(dict_tensors.keys()), set(expected_tensors.keys()))
+
+  i = 0  # Index into the flattened output of session.run()
+  for k, v in dict_tensors.items():
+    expected_v = expected_tensors[k]
+    tf_logging.info("Comparing key: %s", k)
+    if isinstance(v, sparse_tensor.SparseTensor):
+      # Three outputs for SparseTensor : indices, values, shape.
+      tester.assertEqual([k, len(expected_v)], [k, 3])
+      tester.assertAllEqual(expected_v[0], flat_output[i])
+      tester.assertAllEqual(expected_v[1], flat_output[i + 1])
+      tester.assertAllEqual(expected_v[2], flat_output[i + 2])
+      i += 3
+    else:
+      # One output for standard Tensor.
+      tester.assertAllEqual(expected_v, flat_output[i])
+      i += 1
+
+
+class ParseExampleTest(test.TestCase):
+
+  def _test(self, kwargs, expected_values=None, expected_err=None):
+    with self.test_session() as sess:
+      if expected_err:
+        with self.assertRaisesWithPredicateMatch(expected_err[0],
+                                                 expected_err[1]):
+          out = parsing_ops.parse_single_example(**kwargs)
+          sess.run(flatten_values_tensors_or_sparse(out.values()))
+        return
+      else:
+        # Returns dict w/ Tensors and SparseTensors.
+        out = parsing_ops.parse_single_example(**kwargs)
+        # Also include a test with the example names specified to retain
+        # code coverage of the unfused version, and ensure that the two
+        # versions produce the same results.
+        out_with_example_name = parsing_ops.parse_single_example(
+            example_names="name", **kwargs)
+        for result_dict in [out, out_with_example_name]:
+          result = flatten_values_tensors_or_sparse(result_dict.values())
+          # Check values.
+          tf_result = sess.run(result)
+          _compare_output_to_expected(self, result_dict, expected_values,
+                                      tf_result)
+
+      for k, f in kwargs["features"].items():
+        if isinstance(f, parsing_ops.FixedLenFeature) and f.shape is not None:
+          self.assertEqual(tuple(out[k].get_shape().as_list()), f.shape)
+        elif isinstance(f, parsing_ops.VarLenFeature):
+          self.assertEqual(
+              tuple(out[k].indices.get_shape().as_list()), (None, 1))
+          self.assertEqual(tuple(out[k].values.get_shape().as_list()), (None,))
+          self.assertEqual(
+              tuple(out[k].dense_shape.get_shape().as_list()), (1,))
+
+  def testEmptySerializedWithAllDefaults(self):
+    sparse_name = "st_a"
+    a_name = "a"
+    b_name = "b"
+    c_name = "c:has_a_tricky_name"
+    a_default = [0, 42, 0]
+    b_default = np.random.rand(3, 3).astype(bytes)
+    c_default = np.random.rand(2).astype(np.float32)
+
+    expected_st_a = (  # indices, values, shape
+        np.empty((0, 1), dtype=np.int64),  # indices
+        np.empty((0,), dtype=np.int64),  # sp_a is DT_INT64
+        np.array([0], dtype=np.int64))  # max_elems = 0
+
+    expected_output = {
+        sparse_name: expected_st_a,
+        a_name: np.array([a_default]),
+        b_name: np.array(b_default),
+        c_name: np.array(c_default),
+    }
+
+    self._test({
+        "serialized": ops.convert_to_tensor(""),
+        "features": {
+            sparse_name:
+                parsing_ops.VarLenFeature(dtypes.int64),
+            a_name:
+                parsing_ops.FixedLenFeature(
+                    (1, 3), dtypes.int64, default_value=a_default),
+            b_name:
+                parsing_ops.FixedLenFeature(
+                    (3, 3), dtypes.string, default_value=b_default),
+            c_name:
+                parsing_ops.FixedLenFeature(
+                    (2,), dtypes.float32, default_value=c_default),
+        }
+    }, expected_output)
+
+  def testEmptySerializedWithoutDefaultsShouldFail(self):
+    input_features = {
+        "st_a":
+            parsing_ops.VarLenFeature(dtypes.int64),
+        "a":
+            parsing_ops.FixedLenFeature(
+                (1, 3), dtypes.int64, default_value=[0, 42, 0]),
+        "b":
+            parsing_ops.FixedLenFeature(
+                (3, 3),
+                dtypes.string,
+                default_value=np.random.rand(3, 3).astype(bytes)),
+        # Feature "c" is missing a default, this gap will cause failure.
+        "c":
+            parsing_ops.FixedLenFeature(
+                (2,), dtype=dtypes.float32),
+    }
+
+    # Edge case where the key is there but the feature value is empty
+    original = example(features=features({"c": feature()}))
+    self._test(
+        {
+            "serialized": original.SerializeToString(),
+            "features": input_features,
+        },
+        expected_err=(errors_impl.OpError,
+                      "Feature: c \\(data type: float\\) is required"))
+
+    # Standard case of missing key and value.
+    self._test(
+        {
+            "serialized": "",
+            "features": input_features,
+        },
+        expected_err=(errors_impl.OpError,
+                      "Feature: c \\(data type: float\\) is required"))
+
+  def testDenseNotMatchingShapeShouldFail(self):
+    original = example(features=features({
+        "a": float_feature([-1, -1]),
+    }))
+
+    serialized = original.SerializeToString()
+
+    self._test(
+        {
+            "serialized": ops.convert_to_tensor(serialized),
+            "features": {
+                "a": parsing_ops.FixedLenFeature((1, 3), dtypes.float32)
+            }
+        },
+        # TODO(mrry): Consider matching the `tf.parse_example()` error message.
+        expected_err=(errors_impl.OpError, "Key: a."))
+
+  def testDenseDefaultNoShapeShouldFail(self):
+    original = example(features=features({
+        "a": float_feature([1, 1, 3]),
+    }))
+
+    serialized = original.SerializeToString()
+
+    self._test(
+        {
+            "serialized": ops.convert_to_tensor(serialized),
+            "features": {
+                "a": parsing_ops.FixedLenFeature(None, dtypes.float32)
+            }
+        },
+        expected_err=(ValueError, "Missing shape for feature a"))
+
+  def testSerializedContainingSparse(self):
+    original = [
+        example(features=features({
+            "st_c": float_feature([3, 4])
+        })),
+        example(features=features({
+            "st_c": float_feature([]),  # empty float list
+        })),
+        example(features=features({
+            "st_d": feature(),  # feature with nothing in it
+        })),
+        example(features=features({
+            "st_c": float_feature([1, 2, -1]),
+            "st_d": bytes_feature([b"hi"])
+        }))
+    ]
+
+    expected_outputs = [{
+        "st_c": (np.array([[0], [1]], dtype=np.int64),
+                 np.array([3.0, 4.0], dtype=np.float32),
+                 np.array([2], dtype=np.int64)),
+        "st_d":
+            empty_sparse(bytes)
+    }, {
+        "st_c": empty_sparse(np.float32),
+        "st_d": empty_sparse(bytes)
+    }, {
+        "st_c": empty_sparse(np.float32),
+        "st_d": empty_sparse(bytes)
+    }, {
+        "st_c": (np.array([[0], [1], [2]], dtype=np.int64),
+                 np.array([1.0, 2.0, -1.0], dtype=np.float32),
+                 np.array([3], dtype=np.int64)),
+        "st_d": (np.array([[0]], dtype=np.int64), np.array(["hi"], dtype=bytes),
+                 np.array([1], dtype=np.int64))
+    }]
+
+    for proto, expected_output in zip(original, expected_outputs):
+      self._test({
+          "serialized": ops.convert_to_tensor(proto.SerializeToString()),
+          "features": {
+              "st_c": parsing_ops.VarLenFeature(dtypes.float32),
+              "st_d": parsing_ops.VarLenFeature(dtypes.string)
+          },
+      }, expected_output)
+
+  def testSerializedContainingSparseFeature(self):
+    original = [
+        example(features=features({
+            "val": float_feature([3, 4]),
+            "idx": int64_feature([5, 10])
+        })),
+        example(features=features({
+            "val": float_feature([]),  # empty float list
+            "idx": int64_feature([])
+        })),
+        example(features=features({
+            "val": feature(),  # feature with nothing in it
+            # missing idx feature
+        })),
+        example(features=features({
+            "val": float_feature([1, 2, -1]),
+            "idx":
+                int64_feature([0, 9, 3])  # unsorted
+        }))
+    ]
+
+    expected_outputs = [{
+        "sp": (np.array([[5], [10]], dtype=np.int64),
+               np.array([3.0, 4.0], dtype=np.float32),
+               np.array([13], dtype=np.int64))
+    }, {
+        "sp": empty_sparse(np.float32, shape=[13])
+    }, {
+        "sp": empty_sparse(np.float32, shape=[13])
+    }, {
+        "sp": (np.array([[0], [3], [9]], dtype=np.int64),
+               np.array([1.0, -1.0, 2.0], dtype=np.float32),
+               np.array([13], dtype=np.int64))
+    }]
+
+    for proto, expected_output in zip(original, expected_outputs):
+      self._test({
+          "serialized": ops.convert_to_tensor(proto.SerializeToString()),
+          "features": {
+              "sp":
+                  parsing_ops.SparseFeature(["idx"], "val", dtypes.float32,
+                                            [13])
+          }
+      }, expected_output)
+
+  def testSerializedContainingSparseFeatureReuse(self):
+    original = [
+        example(features=features({
+            "val1": float_feature([3, 4]),
+            "val2": float_feature([5, 6]),
+            "idx": int64_feature([5, 10])
+        })),
+        example(features=features({
+            "val1": float_feature([]),  # empty float list
+            "idx": int64_feature([])
+        })),
+    ]
+
+    expected_outputs = [{
+        "sp1": (np.array([[5], [10]], dtype=np.int64),
+                np.array([3.0, 4.0], dtype=np.float32),
+                np.array([13], dtype=np.int64)),
+        "sp2": (np.array([[5], [10]], dtype=np.int64),
+                np.array([5.0, 6.0], dtype=np.float32),
+                np.array([7], dtype=np.int64))
+    }, {
+        "sp1": empty_sparse(np.float32, shape=[13]),
+        "sp2": empty_sparse(np.float32, shape=[7])
+    }]
+
+    for proto, expected_output in zip(original, expected_outputs):
+      self._test({
+          "serialized": ops.convert_to_tensor(proto.SerializeToString()),
+          "features": {
+              "sp1":
+                  parsing_ops.SparseFeature("idx", "val1", dtypes.float32, 13),
+              "sp2":
+                  parsing_ops.SparseFeature(
+                      "idx",
+                      "val2",
+                      dtypes.float32,
+                      size=7,
+                      already_sorted=True)
+          }
+      }, expected_output)
+
+  def testSerializedContaining3DSparseFeature(self):
+    original = [
+        example(features=features({
+            "val": float_feature([3, 4]),
+            "idx0": int64_feature([5, 10]),
+            "idx1": int64_feature([0, 2]),
+        })),
+        example(features=features({
+            "val": float_feature([]),  # empty float list
+            "idx0": int64_feature([]),
+            "idx1": int64_feature([]),
+        })),
+        example(features=features({
+            "val": feature(),  # feature with nothing in it
+            # missing idx feature
+        })),
+        example(features=features({
+            "val": float_feature([1, 2, -1]),
+            "idx0": int64_feature([0, 9, 3]),  # unsorted
+            "idx1": int64_feature([1, 0, 2]),
+        }))
+    ]
+
+    expected_outputs = [{
+        "sp": (np.array([[5, 0], [10, 2]], dtype=np.int64),
+               np.array([3.0, 4.0], dtype=np.float32),
+               np.array([13, 3], dtype=np.int64))
+    }, {
+        "sp": empty_sparse(np.float32, shape=[13, 3])
+    }, {
+        "sp": empty_sparse(np.float32, shape=[13, 3])
+    }, {
+        "sp": (np.array([[0, 1], [3, 2], [9, 0]], dtype=np.int64),
+               np.array([1.0, -1.0, 2.0], dtype=np.float32),
+               np.array([13, 3], dtype=np.int64))
+    }]
+
+    for proto, expected_output in zip(original, expected_outputs):
+      self._test({
+          "serialized": ops.convert_to_tensor(proto.SerializeToString()),
+          "features": {
+              "sp":
+                  parsing_ops.SparseFeature(["idx0", "idx1"], "val",
+                                            dtypes.float32, [13, 3])
+          }
+      }, expected_output)
+
+  def testSerializedContainingDense(self):
+    aname = "a"
+    bname = "b*has+a:tricky_name"
+    original = [
+        example(features=features({
+            aname: float_feature([1, 1]),
+            bname: bytes_feature([b"b0_str"]),
+        })), example(features=features({
+            aname: float_feature([-1, -1]),
+            bname: bytes_feature([b""]),
+        }))
+    ]
+
+    expected_outputs = [{
+        aname: np.array([1, 1], dtype=np.float32).reshape(1, 2, 1),
+        bname: np.array(["b0_str"], dtype=bytes).reshape(1, 1, 1, 1)
+    }, {
+        aname: np.array([-1, -1], dtype=np.float32).reshape(1, 2, 1),
+        bname: np.array([""], dtype=bytes).reshape(1, 1, 1, 1)
+    }]
+
+    for proto, expected_output in zip(original, expected_outputs):
+      # No defaults, values required
+      self._test({
+          "serialized": ops.convert_to_tensor(proto.SerializeToString()),
+          "features": {
+              aname:
+                  parsing_ops.FixedLenFeature((1, 2, 1), dtype=dtypes.float32),
+              bname:
+                  parsing_ops.FixedLenFeature(
+                      (1, 1, 1, 1), dtype=dtypes.string),
+          }
+      }, expected_output)
+
+  # This test is identical as the previous one except
+  # for the creation of 'serialized'.
+  def testSerializedContainingDenseWithConcat(self):
+    aname = "a"
+    bname = "b*has+a:tricky_name"
+    # TODO(lew): Feature appearing twice should be an error in future.
+    original = [
+        (example(features=features({
+            aname: float_feature([10, 10]),
+        })), example(features=features({
+            aname: float_feature([1, 1]),
+            bname: bytes_feature([b"b0_str"]),
+        }))),
+        (
+            example(features=features({
+                bname: bytes_feature([b"b100"]),
+            })),
+            example(features=features({
+                aname: float_feature([-1, -1]),
+                bname: bytes_feature([b"b1"]),
+            })),),
+    ]
+
+    expected_outputs = [{
+        aname: np.array([1, 1], dtype=np.float32).reshape(1, 2, 1),
+        bname: np.array(["b0_str"], dtype=bytes).reshape(1, 1, 1, 1)
+    }, {
+        aname: np.array([-1, -1], dtype=np.float32).reshape(1, 2, 1),
+        bname: np.array(["b1"], dtype=bytes).reshape(1, 1, 1, 1)
+    }]
+
+    for (m, n), expected_output in zip(original, expected_outputs):
+      # No defaults, values required
+      self._test({
+          "serialized":
+              ops.convert_to_tensor(
+                  m.SerializeToString() + n.SerializeToString()),
+          "features": {
+              aname:
+                  parsing_ops.FixedLenFeature((1, 2, 1), dtype=dtypes.float32),
+              bname:
+                  parsing_ops.FixedLenFeature(
+                      (1, 1, 1, 1), dtype=dtypes.string),
+          }
+      }, expected_output)
+
+  def testSerializedContainingDenseScalar(self):
+    original = [
+        example(features=features({
+            "a": float_feature([1]),
+        })), example(features=features({}))
+    ]
+
+    expected_outputs = [{
+        "a": np.array([1], dtype=np.float32)
+    }, {
+        "a": np.array([-1], dtype=np.float32)
+    }]
+
+    for proto, expected_output in zip(original, expected_outputs):
+      self._test({
+          "serialized": ops.convert_to_tensor(proto.SerializeToString()),
+          "features": {
+              "a":
+                  parsing_ops.FixedLenFeature(
+                      (1,), dtype=dtypes.float32, default_value=-1),
+          }
+      }, expected_output)
+
+  def testSerializedContainingDenseWithDefaults(self):
+    original = [
+        example(features=features({
+            "a": float_feature([1, 1]),
+        })),
+        example(features=features({
+            "b": bytes_feature([b"b1"]),
+        })),
+        example(features=features({
+            "b": feature()
+        })),
+    ]
+
+    expected_outputs = [{
+        "a": np.array([1, 1], dtype=np.float32).reshape(1, 2, 1),
+        "b": np.array("tmp_str", dtype=bytes).reshape(1, 1, 1, 1)
+    }, {
+        "a": np.array([3, -3], dtype=np.float32).reshape(1, 2, 1),
+        "b": np.array("b1", dtype=bytes).reshape(1, 1, 1, 1)
+    }, {
+        "a": np.array([3, -3], dtype=np.float32).reshape(1, 2, 1),
+        "b": np.array("tmp_str", dtype=bytes).reshape(1, 1, 1, 1)
+    }]
+
+    for proto, expected_output in zip(original, expected_outputs):
+      self._test({
+          "serialized": ops.convert_to_tensor(proto.SerializeToString()),
+          "features": {
+              "a":
+                  parsing_ops.FixedLenFeature(
+                      (1, 2, 1),
+                      dtype=dtypes.float32,
+                      default_value=[3.0, -3.0]),
+              "b":
+                  parsing_ops.FixedLenFeature(
+                      (1, 1, 1, 1),
+                      dtype=dtypes.string,
+                      default_value="tmp_str"),
+          }
+      }, expected_output)
+
+  def testSerializedContainingSparseAndSparseFeatureAndDenseWithNoDefault(self):
+    original = [
+        example(features=features({
+            "c": float_feature([3, 4]),
+            "val": bytes_feature([b"a", b"b"]),
+            "idx": int64_feature([0, 3])
+        })), example(features=features({
+            "c": float_feature([1, 2]),
+            "val": bytes_feature([b"c"]),
+            "idx": int64_feature([7])
+        }))
+    ]
+
+    a_default = np.array([[1, 2, 3]], dtype=np.int64)
+    b_default = np.random.rand(3, 3).astype(bytes)
+
+    expected_st_a = empty_sparse(np.int64)
+
+    expected_outputs = [{
+        "st_a":
+            expected_st_a,
+        "sp": (np.array([[0], [3]], dtype=np.int64),
+               np.array(["a", "b"], dtype=bytes), np.array(
+                   [13], dtype=np.int64)),
+        "a":
+            a_default,
+        "b":
+            b_default,
+        "c":
+            np.array([3, 4], dtype=np.float32)
+    }, {
+        "st_a":
+            expected_st_a,
+        "sp": (np.array([[7]], dtype=np.int64), np.array(["c"], dtype=bytes),
+               np.array([13], dtype=np.int64)),
+        "a":
+            a_default,
+        "b":
+            b_default,
+        "c":
+            np.array([1, 2], dtype=np.float32)
+    }]
+
+    for proto, expected_output in zip(original, expected_outputs):
+      self._test(
+          {
+              "serialized": ops.convert_to_tensor(proto.SerializeToString()),
+              "features": {
+                  "st_a":
+                      parsing_ops.VarLenFeature(dtypes.int64),
+                  "sp":
+                      parsing_ops.SparseFeature("idx", "val", dtypes.string, 13
+                                               ),
+                  "a":
+                      parsing_ops.FixedLenFeature(
+                          (1, 3), dtypes.int64, default_value=a_default),
+                  "b":
+                      parsing_ops.FixedLenFeature(
+                          (3, 3), dtypes.string, default_value=b_default),
+                  # Feature "c" must be provided, since it has no default_value.
+                  "c":
+                      parsing_ops.FixedLenFeature((2,), dtypes.float32),
+              }
+          },
+          expected_output)
+
+  def testSerializedContainingSparseAndSparseFeatureWithReuse(self):
+    original = [
+        example(features=features({
+            "val": bytes_feature([b"a", b"b"]),
+            "idx": int64_feature([0, 3])
+        })), example(features=features({
+            "val": bytes_feature([b"c", b"d"]),
+            "idx": int64_feature([7, 1])
+        }))
+    ]
+
+    expected_outputs = [{
+        "idx": (np.array([[0], [1]], dtype=np.int64),
+                np.array([0, 3], dtype=np.int64), np.array([2],
+                                                           dtype=np.int64)),
+        "sp": (np.array([[0], [3]], dtype=np.int64),
+               np.array(["a", "b"], dtype=bytes), np.array(
+                   [13], dtype=np.int64))
+    },
+                        {
+                            "idx": (np.array([[0], [1]], dtype=np.int64),
+                                    np.array([7, 1], dtype=np.int64),
+                                    np.array([2], dtype=np.int64)),
+                            "sp": (np.array([[1], [7]], dtype=np.int64),
+                                   np.array(["d", "c"], dtype=bytes),
+                                   np.array([13], dtype=np.int64))
+                        }]
+
+    for proto, expected_output in zip(original, expected_outputs):
+      self._test({
+          "serialized": ops.convert_to_tensor(proto.SerializeToString()),
+          "features": {
+              "idx":
+                  parsing_ops.VarLenFeature(dtypes.int64),
+              "sp":
+                  parsing_ops.SparseFeature(["idx"], "val", dtypes.string, [13]
+                                           ),
+          }
+      }, expected_output)
+
+  def testSerializedContainingVarLenDense(self):
+    aname = "a"
+    bname = "b"
+    cname = "c"
+    dname = "d"
+    original = [
+        example(features=features({
+            cname: int64_feature([2]),
+        })),
+        example(features=features({
+            aname: float_feature([1, 1]),
+            bname: bytes_feature([b"b0_str", b"b1_str"]),
+        })),
+        example(features=features({
+            aname: float_feature([-1, -1, 2, 2]),
+            bname: bytes_feature([b"b1"]),
+        })),
+        example(features=features({
+            aname: float_feature([]),
+            cname: int64_feature([3]),
+        })),
+    ]
+
+    expected_outputs = [
+        {
+            aname: np.empty(shape=(0, 2, 1), dtype=np.int64),
+            bname: np.empty(shape=(0, 1, 1, 1), dtype=bytes),
+            cname: np.array([2], dtype=np.int64),
+            dname: np.empty(shape=(0,), dtype=bytes)
+        },
+        {
+            aname:
+                np.array([[[1], [1]]], dtype=np.float32),
+            bname:
+                np.array(["b0_str", "b1_str"], dtype=bytes).reshape(2, 1, 1, 1),
+            cname:
+                np.empty(shape=(0,), dtype=np.int64),
+            dname:
+                np.empty(shape=(0,), dtype=bytes)
+        },
+        {
+            aname: np.array([[[-1], [-1]], [[2], [2]]], dtype=np.float32),
+            bname: np.array(["b1"], dtype=bytes).reshape(1, 1, 1, 1),
+            cname: np.empty(shape=(0,), dtype=np.int64),
+            dname: np.empty(shape=(0,), dtype=bytes)
+        },
+        {
+            aname: np.empty(shape=(0, 2, 1), dtype=np.int64),
+            bname: np.empty(shape=(0, 1, 1, 1), dtype=bytes),
+            cname: np.array([3], dtype=np.int64),
+            dname: np.empty(shape=(0,), dtype=bytes)
+        },
+    ]
+
+    for proto, expected_output in zip(original, expected_outputs):
+      self._test({
+          "serialized": ops.convert_to_tensor(proto.SerializeToString()),
+          "features": {
+              aname:
+                  parsing_ops.FixedLenSequenceFeature(
+                      (2, 1), dtype=dtypes.float32, allow_missing=True),
+              bname:
+                  parsing_ops.FixedLenSequenceFeature(
+                      (1, 1, 1), dtype=dtypes.string, allow_missing=True),
+              cname:
+                  parsing_ops.FixedLenSequenceFeature(
+                      shape=[], dtype=dtypes.int64, allow_missing=True),
+              dname:
+                  parsing_ops.FixedLenSequenceFeature(
+                      shape=[], dtype=dtypes.string, allow_missing=True),
+          }
+      }, expected_output)
+
+    # Test with padding values.
+    # NOTE(mrry): Since we parse a single example at a time, the fixed-length
+    # sequences will not be padded, and the padding value will be ignored.
+    for proto, expected_output in zip(original, expected_outputs):
+      self._test({
+          "serialized": ops.convert_to_tensor(proto.SerializeToString()),
+          "features": {
+              aname:
+                  parsing_ops.FixedLenSequenceFeature(
+                      (2, 1), dtype=dtypes.float32, allow_missing=True),
+              bname:
+                  parsing_ops.FixedLenSequenceFeature(
+                      (1, 1, 1), dtype=dtypes.string, allow_missing=True),
+              cname:
+                  parsing_ops.FixedLenSequenceFeature(
+                      shape=[], dtype=dtypes.int64, allow_missing=True),
+              dname:
+                  parsing_ops.FixedLenSequenceFeature(
+                      shape=[], dtype=dtypes.string, allow_missing=True),
+          }
+      }, expected_output)
+
+    # Change number of required values so the inputs are not a
+    # multiple of this size.
+    self._test(
+        {
+            "serialized":
+                ops.convert_to_tensor(original[2].SerializeToString()),
+            "features": {
+                aname:
+                    parsing_ops.FixedLenSequenceFeature(
+                        (2, 1), dtype=dtypes.float32, allow_missing=True),
+                bname:
+                    parsing_ops.FixedLenSequenceFeature(
+                        (2, 1, 1), dtype=dtypes.string, allow_missing=True),
+            }
+        },
+        # TODO(mrry): Consider matching the `tf.parse_example()` error message.
+        expected_err=(errors_impl.OpError, "Key: b."))
+
+    self._test(
+        {
+            "serialized": ops.convert_to_tensor(""),
+            "features": {
+                aname:
+                    parsing_ops.FixedLenSequenceFeature(
+                        (2, 1),
+                        dtype=dtypes.float32,
+                        allow_missing=True,
+                        default_value=[]),
+                bname:
+                    parsing_ops.FixedLenSequenceFeature(
+                        (2, 1, 1), dtype=dtypes.string, allow_missing=True),
+            }
+        },
+        expected_err=(ValueError,
+                      "Cannot reshape a tensor with 0 elements to shape"))
+
+    self._test(
+        {
+            "serialized": ops.convert_to_tensor(""),
+            "features": {
+                aname:
+                    parsing_ops.FixedLenFeature(
+                        (None, 2, 1), dtype=dtypes.float32),
+                bname:
+                    parsing_ops.FixedLenSequenceFeature(
+                        (2, 1, 1), dtype=dtypes.string, allow_missing=True),
+            }
+        },
+        expected_err=(ValueError,
+                      "First dimension of shape for feature a unknown. "
+                      "Consider using FixedLenSequenceFeature."))
+
+    self._test(
+        {
+            "serialized": ops.convert_to_tensor(""),
+            "features": {
+                cname:
+                    parsing_ops.FixedLenFeature(
+                        (1, None), dtype=dtypes.int64, default_value=[[1]]),
+            }
+        },
+        expected_err=(ValueError,
+                      "All dimensions of shape for feature c need to be known "
+                      r"but received \(1, None\)."))
+
+    self._test(
+        {
+            "serialized": ops.convert_to_tensor(""),
+            "features": {
+                aname:
+                    parsing_ops.FixedLenSequenceFeature(
+                        (2, 1), dtype=dtypes.float32, allow_missing=True),
+                bname:
+                    parsing_ops.FixedLenSequenceFeature(
+                        (1, 1, 1), dtype=dtypes.string, allow_missing=True),
+                cname:
+                    parsing_ops.FixedLenSequenceFeature(
+                        shape=[], dtype=dtypes.int64, allow_missing=False),
+                dname:
+                    parsing_ops.FixedLenSequenceFeature(
+                        shape=[], dtype=dtypes.string, allow_missing=True),
+            }
+        },
+        expected_err=(ValueError,
+                      "Unsupported: FixedLenSequenceFeature requires "
+                      "allow_missing to be True."))
+
+
+class ParseSingleExampleTest(test.TestCase):
+
+  def _test(self, kwargs, expected_values=None, expected_err=None):
+    with self.test_session() as sess:
+      if expected_err:
+        with self.assertRaisesWithPredicateMatch(expected_err[0],
+                                                 expected_err[1]):
+          out = parsing_ops.parse_single_example(**kwargs)
+          sess.run(flatten_values_tensors_or_sparse(out.values()))
+      else:
+        # Returns dict w/ Tensors and SparseTensors.
+        out = parsing_ops.parse_single_example(**kwargs)
+        # Check values.
+        tf_result = sess.run(flatten_values_tensors_or_sparse(out.values()))
+        _compare_output_to_expected(self, out, expected_values, tf_result)
+
+      # Check shapes.
+      for k, f in kwargs["features"].items():
+        if isinstance(f, parsing_ops.FixedLenFeature) and f.shape is not None:
+          self.assertEqual(tuple(out[k].get_shape()),
+                           tensor_shape.as_shape(f.shape))
+        elif isinstance(f, parsing_ops.VarLenFeature):
+          self.assertEqual(
+              tuple(out[k].indices.get_shape().as_list()), (None, 1))
+          self.assertEqual(tuple(out[k].values.get_shape().as_list()), (None,))
+          self.assertEqual(
+              tuple(out[k].dense_shape.get_shape().as_list()), (1,))
+
+  def testSingleExampleWithSparseAndSparseFeatureAndDense(self):
+    original = example(features=features({
+        "c": float_feature([3, 4]),
+        "d": float_feature([0.0, 1.0]),
+        "val": bytes_feature([b"a", b"b"]),
+        "idx": int64_feature([0, 3]),
+        "st_a": float_feature([3.0, 4.0])
+    }))
+
+    serialized = original.SerializeToString()
+
+    expected_st_a = (
+        np.array(
+            [[0], [1]], dtype=np.int64),  # indices
+        np.array(
+            [3.0, 4.0], dtype=np.float32),  # values
+        np.array(
+            [2], dtype=np.int64))  # shape: max_values = 2
+
+    expected_sp = (  # indices, values, shape
+        np.array(
+            [[0], [3]], dtype=np.int64), np.array(
+                ["a", "b"], dtype="|S"), np.array(
+                    [13], dtype=np.int64))  # max_values = 13
+
+    a_default = [1, 2, 3]
+    b_default = np.random.rand(3, 3).astype(bytes)
+    expected_output = {
+        "st_a": expected_st_a,
+        "sp": expected_sp,
+        "a": [a_default],
+        "b": b_default,
+        "c": np.array([3, 4], dtype=np.float32),
+        "d": np.array([0.0, 1.0], dtype=np.float32),
+    }
+
+    self._test(
+        {
+            "serialized":
+                ops.convert_to_tensor(serialized),
+            "features": {
+                "st_a":
+                    parsing_ops.VarLenFeature(dtypes.float32),
+                "sp":
+                    parsing_ops.SparseFeature(
+                        ["idx"], "val", dtypes.string, [13]),
+                "a":
+                    parsing_ops.FixedLenFeature(
+                        (1, 3), dtypes.int64, default_value=a_default),
+                "b":
+                    parsing_ops.FixedLenFeature(
+                        (3, 3), dtypes.string, default_value=b_default),
+                # Feature "c" must be provided, since it has no default_value.
+                "c":
+                    parsing_ops.FixedLenFeature(2, dtypes.float32),
+                "d":
+                    parsing_ops.FixedLenSequenceFeature([],
+                                                        dtypes.float32,
+                                                        allow_missing=True)
+            }
+        },
+        expected_output)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/parsing_ops_test.py b/tensorflow/python/kernel_tests/parsing_ops_test.py
index ff75b94322c799bcecbc4fb424403325fe493911..59b3ee2013a9922c420726d9accb2cf9355b2d42 100644
--- a/tensorflow/python/kernel_tests/parsing_ops_test.py
+++ b/tensorflow/python/kernel_tests/parsing_ops_test.py
@@ -426,7 +426,7 @@ class ParseExampleTest(test.TestCase):
             bname: bytes_feature([b"b0_str"]),
         })), example(features=features({
             aname: float_feature([-1, -1]),
-            bname: bytes_feature([b"b1"]),
+            bname: bytes_feature([b""]),
         }))
     ]
 
@@ -438,7 +438,7 @@ class ParseExampleTest(test.TestCase):
                 [[1, 1], [-1, -1]], dtype=np.float32).reshape(2, 1, 2, 1),
         bname:
             np.array(
-                ["b0_str", "b1"], dtype=bytes).reshape(2, 1, 1, 1, 1),
+                ["b0_str", ""], dtype=bytes).reshape(2, 1, 1, 1, 1),
     }
 
     # No defaults, values required
diff --git a/tensorflow/python/kernel_tests/partitioned_variables_test.py b/tensorflow/python/kernel_tests/partitioned_variables_test.py
index d40517510046959e353cad4df0c6ddbed0db90aa..f5c6255c346961fec7245889229ea1c4b89fa388 100644
--- a/tensorflow/python/kernel_tests/partitioned_variables_test.py
+++ b/tensorflow/python/kernel_tests/partitioned_variables_test.py
@@ -46,6 +46,14 @@ class PartitionerCreatorsTest(test.TestCase):
         self.assertEqual(len(v0_list), 5)
         self.assertAllEqual(v0_part, (5, 1))
 
+  def testFixedSizePartitionerInt64(self):
+    with self.test_session():
+      partitioner = partitioned_variables.fixed_size_partitioner(4, axis=0)
+      with variable_scope.variable_scope("root", partitioner=partitioner):
+        v0 = variable_scope.get_variable("v0", dtype=dtypes.int64, shape=[20])
+        v0_list = v0._get_variable_list()
+        self.assertEqual(len(v0_list), 4)
+
   def testResourceFixedSizePartitioner(self):
     with self.test_session():
       partitioner = partitioned_variables.fixed_size_partitioner(5, axis=0)
@@ -160,8 +168,10 @@ class PartitionerCreatorsTest(test.TestCase):
           max_shards=2)
 
       # Use the partitioner with strings
-      partitioner_axis3_str = partitioned_variables.variable_axis_size_partitioner(
-          axis=3, max_shard_bytes=32768, bytes_per_string_element=8)
+      partitioner_axis3_str = partitioned_variables.variable_axis_size_partitioner(  # pylint: disable=line-too-long
+          axis=3,
+          max_shard_bytes=32768,
+          bytes_per_string_element=8)
 
       with variable_scope.variable_scope(
           "root", partitioner=partitioner_axis3_str):
@@ -414,8 +424,7 @@ class PartitionedVariablesTestCase(test.TestCase):
   def testRandomInitUnevenPartitions(self):
     with self.test_session():
       rnd = variables.Variable(
-          random_ops.random_uniform(
-              [20, 43], dtype=dtypes.float64))
+          random_ops.random_uniform([20, 43], dtype=dtypes.float64))
       var_lists = [
           partitioned_variables.create_partitioned_variables(
               rnd.get_shape(), [1, i], rnd.initialized_value())
diff --git a/tensorflow/python/kernel_tests/pool_test.py b/tensorflow/python/kernel_tests/pool_test.py
index 63848976336f5487cf2a44f7cf62ea316c40d7c8..6ede654aadc7d0d78bc18f13c2d4b3d47fef0402 100644
--- a/tensorflow/python/kernel_tests/pool_test.py
+++ b/tensorflow/python/kernel_tests/pool_test.py
@@ -96,7 +96,7 @@ def pool_direct_single_axis(
 
 
 def pool_direct(
-    input,
+    input,  # pylint: disable=redefined-builtin
     window_shape,
     pooling_type,
     padding,  # pylint: disable=redefined-builtin
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index 6be8997cabdb4cba87f90378c405a63aa6f78ea3..4466beeec96509b3761e34d885276e1510c62d10 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -159,8 +159,10 @@ class PoolingTest(test.TestCase):
       elif data_format == "NCHW":
         t = test_util.NCHWToNHWC(t)
       if v2:
-        actual = t.eval(feed_dict={ksize_placeholder: ksize,
-                                   strides_placeholder: strides})
+        actual = t.eval(feed_dict={
+            ksize_placeholder: ksize,
+            strides_placeholder: strides
+        })
       else:
         actual = t.eval()
         self.assertShapeEqual(actual, t)
@@ -195,8 +197,15 @@ class PoolingTest(test.TestCase):
       self._VerifyOneType(pool_func, input_sizes, ksize, strides, padding,
                           data_format, dtypes.float16, expected, use_gpu, v2)
 
-  def _VerifyValues(self, pool_func, input_sizes, ksize, strides, padding,
-                    expected, use_gpu, v2=False):
+  def _VerifyValues(self,
+                    pool_func,
+                    input_sizes,
+                    ksize,
+                    strides,
+                    padding,
+                    expected,
+                    use_gpu,
+                    v2=False):
     """Verifies the output values of the pooling function.
 
     Args:
@@ -361,6 +370,16 @@ class PoolingTest(test.TestCase):
         expected=expected_output,
         use_gpu=use_gpu)
 
+  def _testAvgPoolEmptyInput(self, use_gpu):
+    self._VerifyValues(
+        nn_ops.avg_pool,
+        input_sizes=[0, 8, 8, 8],
+        ksize=[1, 3, 3, 1],
+        strides=[1, 2, 2, 1],
+        padding="SAME",
+        expected=[],
+        use_gpu=use_gpu)
+
   def testAvgPooling(self):
     for use_gpu in True, False:
       self._testAvgPoolValidPadding(use_gpu)
@@ -371,6 +390,7 @@ class PoolingTest(test.TestCase):
       self._testAvgPoolSamePadding4(use_gpu)
       self._testAvgPoolSamePaddingPacket4(use_gpu)
       self._testAvgPoolSamePaddingPacket8(use_gpu)
+      self._testAvgPoolEmptyInput(use_gpu)
 
   def _testMaxPoolValidPadding(self, use_gpu):
     expected_output = [13.0, 14.0, 15.0]
@@ -543,6 +563,16 @@ class PoolingTest(test.TestCase):
           use_gpu=use_gpu,
           v2=v2)
 
+  def _testMaxPoolEmptyInput(self, use_gpu):
+    self._VerifyValues(
+        gen_nn_ops._max_pool_v2,
+        input_sizes=[0, 8, 8, 8],
+        ksize=[1, 3, 3, 1],
+        strides=[1, 2, 2, 1],
+        padding="SAME",
+        expected=[],
+        use_gpu=use_gpu)
+
   def testMaxPooling(self):
     for use_gpu in True, False:
       self._testMaxPoolValidPadding(use_gpu)
@@ -551,6 +581,7 @@ class PoolingTest(test.TestCase):
       self._testMaxPoolValidPaddingUnevenStride(use_gpu)
       self._testMaxPoolSamePaddingPacket4(use_gpu)
       self._testMaxPoolSamePaddingPacket8(use_gpu)
+      self._testMaxPoolEmptyInput(use_gpu)
 
   # Tests for DepthwiseMaxPooling on CPU only.
   def testDepthwiseMaxPool1x1DepthWindow1(self):
@@ -1126,16 +1157,16 @@ class PoolingTest(test.TestCase):
   def _testMaxPoolGradSamePadding3_1(self, data_format, use_gpu):
     for pool_func in [gen_nn_ops._max_pool_v2, nn_ops.max_pool]:
       self._ConstructAndTestGradient(
-        pool_func,
-        input_sizes=[1, 7, 7, 1],
-        output_sizes=[1, 7, 7, 1],
-        window_rows=3,
-        window_cols=3,
-        row_stride=1,
-        col_stride=1,
-        padding="SAME",
-        data_format=data_format,
-        use_gpu=use_gpu)
+          pool_func,
+          input_sizes=[1, 7, 7, 1],
+          output_sizes=[1, 7, 7, 1],
+          window_rows=3,
+          window_cols=3,
+          row_stride=1,
+          col_stride=1,
+          padding="SAME",
+          data_format=data_format,
+          use_gpu=use_gpu)
 
   def testMaxPoolGrad(self):
     for (data_format, use_gpu) in GetTestConfigs():
@@ -1180,17 +1211,14 @@ class PoolingTest(test.TestCase):
     pool_func = gen_nn_ops._max_pool_v2 if v2 else nn_ops.max_pool
     with self.test_session(use_gpu=use_gpu):
       input_tensor = constant_op.constant(input_data, shape=input_sizes)
-      output_tensor = pool_func(input_tensor,
-                                [1, window_rows, window_cols, 1],
+      output_tensor = pool_func(input_tensor, [1, window_rows, window_cols, 1],
                                 [1, row_stride, col_stride, 1], padding)
       output_backprop_tensor = constant_op.constant(
           output_backprop, shape=output_sizes)
 
-      input_backprop_tensor = self._MaxPoolGrad(input_tensor, output_tensor,
-                                                output_backprop_tensor,
-                                                window_rows, window_cols,
-                                                row_stride, col_stride,
-                                                padding, v2)
+      input_backprop_tensor = self._MaxPoolGrad(
+          input_tensor, output_tensor, output_backprop_tensor, window_rows,
+          window_cols, row_stride, col_stride, padding, v2)
 
       actual_input_backprop = input_backprop_tensor.eval()
       self.assertShapeEqual(actual_input_backprop, input_backprop_tensor)
@@ -1392,13 +1420,15 @@ class PoolingTest(test.TestCase):
   def _testMaxPoolGradDirectWithNans2_2(self):
     input_data = [float("nan")] * 16
     output_backprop = [
-        float("nan"), 12.0, 13.0, 15.0, float("nan"), 17.0, 19.0, 20.0,
+        float("nan"), 12.0, 13.0, 15.0,
+        float("nan"), 17.0, 19.0, 20.0,
         float("nan")
     ]
     # Test the CPU implementation, which propagates diffs in case of NaN
     expected_input_backprop_tf_cpu = [
-        float("nan"), 12.0, 13.0, 0.0, 15.0, float("nan"), 17.0, 0.0, 19.0,
-        20.0, float("nan"), 0.0, 0.0, 0.0, 0.0, 0.0
+        float("nan"), 12.0, 13.0, 0.0, 15.0,
+        float("nan"), 17.0, 0.0, 19.0, 20.0,
+        float("nan"), 0.0, 0.0, 0.0, 0.0, 0.0
     ]
     for v2 in [True, False]:
       self._testMaxPoolGradDirect(
@@ -1614,10 +1644,9 @@ class PoolingTest(test.TestCase):
     Returns:
       A Tensor.
     """
-    return gen_nn_ops._max_pool_grad_grad(orig_input, orig_output, grad,
-                                          [1, window_rows, window_cols,
-                                           1], [1, row_stride, col_stride,
-                                                1], padding)
+    return gen_nn_ops._max_pool_grad_grad(
+        orig_input, orig_output, grad, [1, window_rows, window_cols, 1],
+        [1, row_stride, col_stride, 1], padding)
 
   def testAvgPoolGrad(self):
     for (data_format, use_gpu) in GetTestConfigs():
@@ -1771,8 +1800,7 @@ class PoolingTest(test.TestCase):
     ]:
       with self.assertRaises(ValueError):
         pool_func(
-            array_ops.placeholder(
-                dtypes.float32, shape=[1, 3]),
+            array_ops.placeholder(dtypes.float32, shape=[1, 3]),
             ksize=[1, 1, 1, 1],
             strides=[1, 1, 1, 1],
             padding="SAME")
@@ -1783,30 +1811,29 @@ class PoolingTest(test.TestCase):
       if test.is_gpu_available():
         pool_funcs.append(nn_ops.max_pool_with_argmax)
       for pool_func in pool_funcs:
-        # Illegal strides.
-        with self.assertRaisesRegexp(
-            errors_impl.UnimplementedError,
-            "Pooling is not yet supported on the batch"):
-          sess.run(
-              pool_func(
-                  array_ops.placeholder(dtypes.float32),
-                  ksize=[1, 1, 1, 1],
-                  strides=[2, 1, 1, 1],
-                  padding="SAME"))
+        if pool_func != nn_ops.max_pool:
+          # Illegal strides.
+          with self.assertRaisesRegexp(
+              errors_impl.UnimplementedError,
+              "Pooling is not yet supported on the batch"):
+            sess.run(
+                pool_func(
+                    array_ops.placeholder(dtypes.float32),
+                    ksize=[1, 1, 1, 1],
+                    strides=[2, 1, 1, 1],
+                    padding="SAME"))
 
         # Filter too large.
         with self.assertRaisesRegexp(ValueError, "Negative dimension size"):
           sess.run(
               pool_func(
-                  array_ops.placeholder(
-                      dtypes.float32, shape=[32, 20, 20, 3]),
+                  array_ops.placeholder(dtypes.float32, shape=[32, 20, 20, 3]),
                   ksize=[1, 20, 21, 1],
                   strides=[1, 1, 1, 1],
                   padding="VALID"))
         with self.assertRaisesRegexp(ValueError, "Negative dimension size"):
           pool_func(
-              array_ops.placeholder(
-                  dtypes.float32, shape=[32, 20, 20, 3]),
+              array_ops.placeholder(dtypes.float32, shape=[32, 20, 20, 3]),
               ksize=[1, 21, 20, 1],
               strides=[1, 1, 1, 1],
               padding="VALID")
diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index 7ed99c1be9b62a145b9584fd6412f1074f501ae8..61fb3f12e45ea5ae3bc4f0a26c2116b54c003624 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -23,82 +24,93 @@ from six.moves import queue
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
 
-class PyOpTest(test.TestCase):
+def np_func(x, y):
+  return np.sinh(x) + np.cosh(y)
 
-  def testBasic(self):
 
-    def my_func(x, y):
-      return np.sinh(x) + np.cosh(y)
+def matmul(x, y):
+  return math_ops.matmul(x, y)
 
-    # single type
+
+class PyFuncTest(test.TestCase):
+  """Encapsulates tests for py_func and eager_py_func."""
+
+  # ----- Tests for py_func -----
+  def testSingleType(self):
     with self.test_session():
       x = constant_op.constant(1.0, dtypes.float32)
       y = constant_op.constant(2.0, dtypes.float32)
-      z = script_ops.py_func(my_func, [x, y], dtypes.float32)
-      self.assertEqual(z.eval(), my_func(1.0, 2.0).astype(np.float32))
+      z = self.evaluate(script_ops.py_func(np_func, [x, y], dtypes.float32))
+      self.assertEqual(z, np_func(1.0, 2.0).astype(np.float32))
 
-    # scalar
+  def testScalar(self):
     with self.test_session():
       x = constant_op.constant(1.0, dtypes.float32)
       y = constant_op.constant(2.0, dtypes.float32)
-      z = script_ops.py_func(my_func, [x, y], [dtypes.float32])
-      self.assertEqual(z[0].eval(), my_func(1.0, 2.0).astype(np.float32))
+      z = self.evaluate(
+          script_ops.eager_py_func(np_func, [x, y], [dtypes.float32]))
+      self.assertEqual(z[0], np_func(1.0, 2.0).astype(np.float32))
 
-    # array
+  def testArray(self):
     with self.test_session():
       x = constant_op.constant([1.0, 2.0], dtypes.float64)
       y = constant_op.constant([2.0, 3.0], dtypes.float64)
-      z = script_ops.py_func(my_func, [x, y], [dtypes.float64])
-      self.assertAllEqual(z[0].eval(),
-                          my_func([1.0, 2.0], [2.0, 3.0]).astype(np.float64))
+      z = self.evaluate(script_ops.py_func(np_func, [x, y], [dtypes.float64]))
+      self.assertAllEqual(z[0],
+                          np_func([1.0, 2.0], [2.0, 3.0]).astype(np.float64))
 
-    # a bit exotic type (complex64)
+  def testComplexType(self):
     with self.test_session():
       x = constant_op.constant(1 + 2j, dtypes.complex64)
       y = constant_op.constant(3 + 4j, dtypes.complex64)
-      z, = script_ops.py_func(my_func, [x, y], [dtypes.complex64])
-      self.assertAllClose(z.eval(), my_func(1 + 2j, 3 + 4j))
+      z = self.evaluate(script_ops.py_func(np_func, [x, y], dtypes.complex64))
+      self.assertAllClose(z, np_func(1 + 2j, 3 + 4j))
 
-    # a bit excotic function (rfft)
+  def testRFFT(self):
     with self.test_session():
       x = constant_op.constant([1., 2., 3., 4.], dtypes.float32)
 
       def rfft(x):
         return np.fft.rfft(x).astype(np.complex64)
 
-      y, = script_ops.py_func(rfft, [x], [dtypes.complex64])
-      self.assertAllClose(y.eval(), np.fft.rfft([1., 2., 3., 4.]))
+      y = self.evaluate(script_ops.py_func(rfft, [x], dtypes.complex64))
+      self.assertAllClose(y, np.fft.rfft([1., 2., 3., 4.]))
 
-    # returns a python literal.
+  def testPythonLiteral(self):
     with self.test_session():
 
       def literal(x):
-        return 1.0 if x == 0.0 else 0.0
+        return 1.0 if float(x) == 0.0 else 0.0
 
       x = constant_op.constant(0.0, dtypes.float64)
-      y, = script_ops.py_func(literal, [x], [dtypes.float64])
-      self.assertAllClose(y.eval(), 1.0)
+      y = self.evaluate(script_ops.py_func(literal, [x], dtypes.float64))
+      self.assertAllClose(y, 1.0)
 
-    # returns a list
+  def testList(self):
     with self.test_session():
 
       def list_func(x):
         return [x, x + 1]
 
       x = constant_op.constant(0.0, dtypes.float64)
-      y, z = script_ops.py_func(list_func, [x], [dtypes.float64] * 2)
-      self.assertAllClose(y.eval(), 0.0)
-      self.assertAllClose(z.eval(), 1.0)
+      y = self.evaluate(
+          script_ops.py_func(list_func, [x], [dtypes.float64] * 2))
+      self.assertAllClose(y, [0.0, 1.0])
 
+  def testTuple(self):
     # returns a tuple
     with self.test_session():
 
@@ -106,17 +118,17 @@ class PyOpTest(test.TestCase):
         return x, x + 1
 
       x = constant_op.constant(0.0, dtypes.float64)
-      y, z = script_ops.py_func(tuple_func, [x], [dtypes.float64] * 2)
-      self.assertAllClose(y.eval(), 0.0)
-      self.assertAllClose(z.eval(), 1.0)
+      y = self.evaluate(
+          script_ops.py_func(tuple_func, [x], [dtypes.float64] * 2))
+      self.assertAllClose(y, [0.0, 1.0])
 
     # returns a tuple, Tout and inp a tuple
     with self.test_session():
       x = constant_op.constant(0.0, dtypes.float64)
-      y, z = script_ops.py_func(tuple_func, (x,), (dtypes.float64,
-                                                   dtypes.float64))
-      self.assertAllClose(y.eval(), 0.0)
-      self.assertAllClose(z.eval(), 1.0)
+      y = self.evaluate(
+          script_ops.py_func(tuple_func, (x,),
+                             (dtypes.float64, dtypes.float64)))
+      self.assertAllClose(y, [0.0, 1.0])
 
   def testStrings(self):
 
@@ -128,10 +140,12 @@ class PyOpTest(test.TestCase):
 
     with self.test_session():
       x = constant_op.constant([b"hello", b"hi"], dtypes.string)
-      y, = script_ops.py_func(read_fixed_length_numpy_strings, [],
-                              [dtypes.string])
-      z, = script_ops.py_func(read_and_return_strings, [x, y], [dtypes.string])
-      self.assertListEqual(list(z.eval()), [b"hello there", b"hi there"])
+      y = self.evaluate(
+          script_ops.py_func(read_fixed_length_numpy_strings, [],
+                             dtypes.string))
+      z = self.evaluate(
+          script_ops.py_func(read_and_return_strings, [x, y], dtypes.string))
+      self.assertAllEqual(z, [b"hello there", b"hi there"])
 
   def testStringsAreConvertedToBytes(self):
 
@@ -143,10 +157,12 @@ class PyOpTest(test.TestCase):
 
     with self.test_session():
       x = constant_op.constant(["hello", "hi"], dtypes.string)
-      y, = script_ops.py_func(read_fixed_length_numpy_strings, [],
-                              [dtypes.string])
-      z, = script_ops.py_func(read_and_return_strings, [x, y], [dtypes.string])
-      self.assertListEqual(list(z.eval()), [b"hello there", b"hi there"])
+      y = self.evaluate(
+          script_ops.py_func(read_fixed_length_numpy_strings, [],
+                             dtypes.string))
+      z = self.evaluate(
+          script_ops.py_func(read_and_return_strings, [x, y], dtypes.string))
+      self.assertAllEqual(z, [b"hello there", b"hi there"])
 
   def testObjectArraysAreConvertedToBytes(self):
 
@@ -186,16 +202,8 @@ class PyOpTest(test.TestCase):
 
   def testNoInput(self):
     with self.test_session():
-      x, = script_ops.py_func(lambda: 42.0, [], [dtypes.float64])
-      self.assertAllClose(x.eval(), 42.0)
-
-  def testCleanup(self):
-    for _ in xrange(1000):
-      g = ops.Graph()
-      with g.as_default():
-        c = constant_op.constant([1.], dtypes.float32)
-        _ = script_ops.py_func(lambda x: x + 1, [c], [dtypes.float32])
-    self.assertTrue(script_ops._py_funcs.size() < 100)
+      x = self.evaluate(script_ops.py_func(lambda: 42.0, [], dtypes.float64))
+      self.assertAllClose(x, 42.0)
 
   def testAlias(self):
     with self.test_session():
@@ -205,6 +213,16 @@ class PyOpTest(test.TestCase):
       value.op.run()
       self.assertAllEqual(np_array, [1.0, 2.0])
 
+  def testReturnUnicodeString(self):
+    with self.test_session():
+      correct = u"你好 世界"
+
+      def unicode_string():
+        return correct
+
+      z, = script_ops.py_func(unicode_string, [], [dtypes.string])
+      self.assertEqual(z.eval(), correct.encode("utf8"))
+
   def testBadNumpyReturnType(self):
     with self.test_session():
 
@@ -242,8 +260,8 @@ class PyOpTest(test.TestCase):
       # Create a numpy array aliasing a tensor and a tensor aliasing this array
       z, = script_ops.py_func(ident, [p], [dtypes.float32])
       z += 0.0  # Makes sure we release the tensor aliasing the numpy array x[0]
-                # above instead of using its memory as the return value of
-                # session.run
+      # above instead of using its memory as the return value of
+      # session.run
       self.assertEqual(0.0, z.eval(feed_dict={p: [0.0]}))
 
   def testStateful(self):
@@ -319,10 +337,10 @@ class PyOpTest(test.TestCase):
       def value(self):
         return self._value
 
-    with self.test_session() as sess:
+    with self.test_session():
       s = State()
       op = s.increment(constant_op.constant(2, dtypes.int64))
-      ret = sess.run(op)
+      ret = self.evaluate(op)
       self.assertIsNone(ret)
       self.assertAllEqual([3], s.value)
 
@@ -336,15 +354,24 @@ class PyOpTest(test.TestCase):
     with self.test_session() as sess:
       self.assertEqual(sess.run(f), [])
 
-  def _testExceptionHandling(self, py_exp, tf_exp):
+  def _testExceptionHandling(self, py_exp, tf_exp, eager=False):
 
     def raise_exception():
       raise py_exp("blah")  # pylint: disable=not-callable
 
-    f = script_ops.py_func(raise_exception, [], [])
-    with self.test_session() as sess:
+    if eager:
+      if context.in_eager_mode():
+        with self.assertRaisesRegexp(tf_exp, "blah"):
+          f = script_ops.eager_py_func(raise_exception, [], [])
+        return
+      else:
+        f = script_ops.eager_py_func(raise_exception, [], [])
+    else:
+      f = script_ops.py_func(raise_exception, [], [])
+
+    with self.test_session():
       with self.assertRaisesRegexp(tf_exp, "blah"):
-        sess.run(f)
+        self.evaluate(f)
 
   def testExceptionHandling(self):
     self._testExceptionHandling(ValueError, errors.InvalidArgumentError)
@@ -358,6 +385,89 @@ class PyOpTest(test.TestCase):
 
     self._testExceptionHandling(WeirdError, errors.UnknownError)
 
+  # ----- Tests shared by py_func and eager_py_func -----
+  def testCleanup(self):
+    for _ in xrange(1000):
+      g = ops.Graph()
+      with g.as_default():
+        c = constant_op.constant([1.], dtypes.float32)
+        _ = script_ops.py_func(lambda x: x + 1, [c], [dtypes.float32])
+        _ = script_ops.eager_py_func(lambda x: x + 1, [c], [dtypes.float32])
+    self.assertTrue(script_ops._py_funcs.size() < 100)
+
+  # ----- Tests for eager_py_func -----
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerSingleOutputInt32(self):
+    a = array_ops.ones((3, 3), dtype=dtypes.int32)
+    x = array_ops.ones((3, 1), dtype=dtypes.int32)
+    output = script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.int32)
+    with self.test_session():
+      ret = self.evaluate(output)
+      self.assertAllEqual(ret, [[3], [3], [3]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerSingleOutputFloat32(self):
+    with test_util.device(use_gpu=True):
+      a = array_ops.ones((3, 3), dtype=dtypes.float32)
+      x = array_ops.ones((3, 1), dtype=dtypes.float32)
+      output = script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.float32)
+      ret = self.evaluate(output)
+      self.assertAllClose(ret, [[3.0], [3.0], [3.0]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerArrayOutput(self):
+    with test_util.device(use_gpu=True):
+      a = array_ops.ones((3, 3), dtype=dtypes.float32)
+      x = array_ops.ones((3, 1), dtype=dtypes.float32)
+      output = script_ops.eager_py_func(
+          lambda a, x: [matmul(a, x)], inp=[a, x], Tout=[dtypes.float32])
+      ret = self.evaluate(output)
+      self.assertAllEqual(ret, [[[3.0], [3.0], [3.0]]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerReturnNone(self):
+    with test_util.device(use_gpu=True):
+      def no_return_value():
+        return
+
+      output = script_ops.eager_py_func(no_return_value, inp=[], Tout=[])
+      ret = self.evaluate(output)
+      if context.in_eager_mode():
+        self.assertEquals(len(ret), 0)
+      else:
+        self.assertIsNone(ret)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerPyFuncInDefun(self):
+    with test_util.device(use_gpu=True):
+      def wrapper():
+        a = array_ops.ones((3, 3), dtype=dtypes.float32)
+        x = array_ops.ones((3, 1), dtype=dtypes.float32)
+        return script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.float32)
+
+      wrapped = function.defun(wrapper)
+      ret = self.evaluate(wrapped())
+      self.assertAllEqual(ret, [[3.0], [3.0], [3.0]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerExceptionHandling(self):
+    with test_util.device(use_gpu=True):
+      self._testExceptionHandling(
+          ValueError, errors.InvalidArgumentError, eager=True)
+      self._testExceptionHandling(
+          TypeError, errors.InvalidArgumentError, eager=True)
+      self._testExceptionHandling(
+          StopIteration, errors.OutOfRangeError, eager=True)
+      self._testExceptionHandling(
+          MemoryError, errors.ResourceExhaustedError, eager=True)
+      self._testExceptionHandling(
+          NotImplementedError, errors.UnimplementedError, eager=True)
+
+      class WeirdError(Exception):
+        pass
+
+      self._testExceptionHandling(WeirdError, errors.UnknownError, eager=True)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/random/multinomial_op_test.py b/tensorflow/python/kernel_tests/random/multinomial_op_test.py
index ca48ba6cadee431c3af41b72646d4f1b3e60ec66..a9dc7b7de000024f23b88406bf0c1c2f32ac4fac 100644
--- a/tensorflow/python/kernel_tests/random/multinomial_op_test.py
+++ b/tensorflow/python/kernel_tests/random/multinomial_op_test.py
@@ -57,12 +57,14 @@ class MultinomialTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes()
   def testSmallEntropy(self):
     random_seed.set_random_seed(1618)
-    with test_util.device(use_gpu=True):
-      # A logit value of -10 corresponds to a probability of ~5e-5.
-      logits = constant_op.constant([[-10., 10., -10.], [-10., -10., 10.]])
-      num_samples = 1000
-      samples = self.evaluate(random_ops.multinomial(logits, num_samples))
-      self.assertAllEqual([[1] * num_samples, [2] * num_samples], samples)
+    for output_dtype in [np.int32, np.int64]:
+      with test_util.device(use_gpu=True):
+        # A logit value of -10 corresponds to a probability of ~5e-5.
+        logits = constant_op.constant([[-10., 10., -10.], [-10., -10., 10.]])
+        num_samples = 1000
+        samples = self.evaluate(random_ops.multinomial(
+            logits, num_samples, output_dtype=output_dtype))
+        self.assertAllEqual([[1] * num_samples, [2] * num_samples], samples)
 
   def testOneOpMultipleStepsIndependent(self):
     with self.test_session(use_gpu=True) as sess:
diff --git a/tensorflow/python/kernel_tests/random/random_ops_test.py b/tensorflow/python/kernel_tests/random/random_ops_test.py
index 56aaa53b981497d91ca01f390df26691f142556f..df37dd98ece57ae7c3835ab63b720b29fc19c975 100644
--- a/tensorflow/python/kernel_tests/random/random_ops_test.py
+++ b/tensorflow/python/kernel_tests/random/random_ops_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -174,6 +175,17 @@ class TruncatedNormalTest(test.TestCase):
       diff = rnd2 - rnd1
       self.assertTrue(np.linalg.norm(diff.eval()) > 0.1)
 
+  def testEagerSeed(self):
+    with context.eager_mode():
+      # Ensure a context has been created
+      random_ops.random_normal([])
+      # Set the same seed twice and check that the values match
+      context.set_global_seed(42)
+      rnd1 = random_ops.random_normal([])
+      context.set_global_seed(42)
+      rnd2 = random_ops.random_normal([])
+      self.assertAllEqual(rnd1, rnd2)
+
 
 class RandomUniformTest(test.TestCase):
 
@@ -191,7 +203,8 @@ class RandomUniformTest(test.TestCase):
     return func
 
   def testRange(self):
-    for dt in dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64:
+    for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
+               dtypes.int64):
       sampler = self._Sampler(1000, minv=-2, maxv=8, dtype=dt, use_gpu=True)
       x = sampler()
       self.assertTrue(-2 <= np.min(x))
@@ -201,7 +214,8 @@ class RandomUniformTest(test.TestCase):
   # to see the same sequence of values. Will catch buggy
   # implementations which uses the same random number seed.
   def testDistinct(self):
-    for dt in dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64:
+    for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
+               dtypes.int64):
       maxv = 1.0 if dt.is_floating else 1 << 30
       sampler = self._Sampler(1000, minv=0, maxv=maxv, dtype=dt, use_gpu=True)
       x = sampler()
@@ -239,7 +253,8 @@ class RandomUniformTest(test.TestCase):
   # Checks that the CPU and GPU implementation returns the same results,
   # given the same random seed
   def testCPUGPUMatch(self):
-    for dt in dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64:
+    for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
+               dtypes.int64):
       maxv = 1.0 if dt.is_floating else 17
       results = {}
       for use_gpu in False, True:
@@ -249,7 +264,8 @@ class RandomUniformTest(test.TestCase):
       self.assertAllEqual(results[False], results[True])
 
   def testSeed(self):
-    for dt in dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64:
+    for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
+               dtypes.int64):
       for seed in [345, 2**100, -2**100]:
         sx = self._Sampler(1000, 0, 17, dtype=dt, use_gpu=True, seed=seed)
         sy = self._Sampler(1000, 0, 17, dtype=dt, use_gpu=True, seed=seed)
@@ -273,8 +289,7 @@ class RandomShapeTest(test.TestCase):
     self.assertEqual([1, 2, 3], rnd1.get_shape())
     # Partially known shape.
     rnd2 = random_ops.truncated_normal(
-        array_ops.placeholder(
-            dtypes.int32, shape=(3,)))
+        array_ops.placeholder(dtypes.int32, shape=(3,)))
     self.assertEqual([None, None, None], rnd2.get_shape().as_list())
     # Unknown shape.
     rnd3 = random_ops.truncated_normal(array_ops.placeholder(dtypes.int32))
@@ -286,8 +301,7 @@ class RandomShapeTest(test.TestCase):
     self.assertEqual([1, 2, 3], rnd1.get_shape())
     # Partially known shape.
     rnd2 = random_ops.random_normal(
-        array_ops.placeholder(
-            dtypes.int32, shape=(3,)))
+        array_ops.placeholder(dtypes.int32, shape=(3,)))
     self.assertEqual([None, None, None], rnd2.get_shape().as_list())
     # Unknown shape.
     rnd3 = random_ops.random_normal(array_ops.placeholder(dtypes.int32))
@@ -299,8 +313,7 @@ class RandomShapeTest(test.TestCase):
     self.assertEqual([1, 2, 3], rnd1.get_shape())
     # Partially known shape.
     rnd2 = random_ops.random_uniform(
-        array_ops.placeholder(
-            dtypes.int32, shape=(3,)))
+        array_ops.placeholder(dtypes.int32, shape=(3,)))
     self.assertEqual([None, None, None], rnd2.get_shape().as_list())
     # Unknown shape.
     rnd3 = random_ops.random_uniform(array_ops.placeholder(dtypes.int32))
diff --git a/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py b/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
index c4e16ff6280cc7ce121955474fe8ec45acd57f95..b7a79f239cee04b191b78affd002f687b7de851a 100644
--- a/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
+++ b/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import random
-import re
 import time
 
 import numpy as np
diff --git a/tensorflow/python/kernel_tests/reader_ops_test.py b/tensorflow/python/kernel_tests/reader_ops_test.py
index 223a4b2c8726d957f014e65ea9f87c0fb61e65bb..82a27eebeef16c9dacaf1b900f0398a56533cd2d 100644
--- a/tensorflow/python/kernel_tests/reader_ops_test.py
+++ b/tensorflow/python/kernel_tests/reader_ops_test.py
@@ -428,7 +428,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
     for i in range(self._num_files):
       fn = os.path.join(self.get_temp_dir(), "fixed_length_record.%d.txt" % i)
       filenames.append(fn)
-      with open(fn+".tmp", "wb") as f:
+      with open(fn + ".tmp", "wb") as f:
         f.write(b"H" * self._header_bytes)
         if num_records > 0:
           f.write(self._Record(i, 0))
@@ -437,7 +437,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
             f.write(b"G" * gap_bytes)
           f.write(self._Record(i, j))
         f.write(b"F" * self._footer_bytes)
-      with open(fn+".tmp", "rb") as f:
+      with open(fn + ".tmp", "rb") as f:
         cdata = zlib.compress(f.read())
         with open(fn, "wb") as zf:
           zf.write(cdata)
@@ -455,7 +455,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
           all_records_str = "".join([
               str(i)[0]
               for i in range(self._record_bytes + self._hop_bytes *
-                           (num_overlapped_records - 1))
+                             (num_overlapped_records - 1))
           ])
           f.write(compat.as_bytes(all_records_str))
         f.write(b"F" * self._footer_bytes)
@@ -467,7 +467,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
       fn = os.path.join(self.get_temp_dir(),
                         "fixed_length_overlapped_record.%d.txt" % i)
       filenames.append(fn)
-      with open(fn+".tmp", "wb") as f:
+      with open(fn + ".tmp", "wb") as f:
         f.write(b"H" * self._header_bytes)
         if num_overlapped_records > 0:
           all_records_str = "".join([
@@ -477,7 +477,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
           ])
           f.write(compat.as_bytes(all_records_str))
         f.write(b"F" * self._footer_bytes)
-      with open(fn+".tmp", "rb") as f:
+      with open(fn + ".tmp", "rb") as f:
         cdata = zlib.compress(f.read())
         with open(fn, "wb") as zf:
           zf.write(cdata)
@@ -509,7 +509,10 @@ class FixedLengthRecordReaderTest(test.TestCase):
                                     "\\(requested 1, current size 0\\)"):
         k, v = sess.run([key, value])
 
-  def _TestOneEpochWithHopBytes(self, files, num_overlapped_records, encoding=None):
+  def _TestOneEpochWithHopBytes(self,
+                                files,
+                                num_overlapped_records,
+                                encoding=None):
     with self.test_session() as sess:
       reader = io_ops.FixedLengthRecordReader(
           header_bytes=self._header_bytes,
@@ -565,13 +568,15 @@ class FixedLengthRecordReaderTest(test.TestCase):
 
   def testGzipOneEpochWithHopBytes(self):
     for num_overlapped_records in [0, 2]:
-      files = self._CreateGzipOverlappedRecordFiles(num_overlapped_records, )
-      self._TestOneEpochWithHopBytes(files, num_overlapped_records, encoding="GZIP")
+      files = self._CreateGzipOverlappedRecordFiles(num_overlapped_records,)
+      self._TestOneEpochWithHopBytes(
+          files, num_overlapped_records, encoding="GZIP")
 
   def testZlibOneEpochWithHopBytes(self):
     for num_overlapped_records in [0, 2]:
       files = self._CreateZlibOverlappedRecordFiles(num_overlapped_records)
-      self._TestOneEpochWithHopBytes(files, num_overlapped_records, encoding="ZLIB")
+      self._TestOneEpochWithHopBytes(
+          files, num_overlapped_records, encoding="ZLIB")
 
 
 class TFRecordReaderTest(test.TestCase):
diff --git a/tensorflow/python/kernel_tests/record_input_test.py b/tensorflow/python/kernel_tests/record_input_test.py
index 1ec48ac361b81e66fd77e8a4506bebf910ea0e8a..068860d5d46d1e3dfac87aa6b1f986d78d9c8316 100644
--- a/tensorflow/python/kernel_tests/record_input_test.py
+++ b/tensorflow/python/kernel_tests/record_input_test.py
@@ -26,13 +26,17 @@ from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
-
 class RecordInputOpTest(test.TestCase):
 
-  def generateTestData(self, prefix, n, m):
+  def generateTestData(self,
+                       prefix,
+                       n,
+                       m,
+                       compression_type=tf_record.TFRecordCompressionType.NONE):
+    options = tf_record.TFRecordOptions(compression_type)
     for i in range(n):
       f = os.path.join(self.get_temp_dir(), prefix + "." + str(i))
-      w = tf_record.TFRecordWriter(f)
+      w = tf_record.TFRecordWriter(f, options=options)
 
       for j in range(m):
         w.write("{0:0{width}}".format(i * m + j, width=10).encode("utf-8"))
@@ -52,6 +56,44 @@ class RecordInputOpTest(test.TestCase):
 
       self.assertEqual(sess.run(yield_op), b"0000000000")
 
+  def testRecordInputSimpleGzip(self):
+    with self.test_session() as sess:
+      self.generateTestData(
+          "basic",
+          1,
+          1,
+          compression_type=tf_record.TFRecordCompressionType.GZIP)
+
+      yield_op = data_flow_ops.RecordInput(
+          file_pattern=os.path.join(self.get_temp_dir(), "basic.*"),
+          parallelism=1,
+          buffer_size=1,
+          batch_size=1,
+          name="record_input",
+          compression_type=tf_record.TFRecordCompressionType.GZIP).get_yield_op(
+          )
+
+      self.assertEqual(sess.run(yield_op), b"0000000000")
+
+  def testRecordInputSimpleZlib(self):
+    with self.test_session() as sess:
+      self.generateTestData(
+          "basic",
+          1,
+          1,
+          compression_type=tf_record.TFRecordCompressionType.ZLIB)
+
+      yield_op = data_flow_ops.RecordInput(
+          file_pattern=os.path.join(self.get_temp_dir(), "basic.*"),
+          parallelism=1,
+          buffer_size=1,
+          batch_size=1,
+          name="record_input",
+          compression_type=tf_record.TFRecordCompressionType.ZLIB).get_yield_op(
+          )
+
+      self.assertEqual(sess.run(yield_op), b"0000000000")
+
   def testRecordInputEpochs(self):
     files = 100
     records_per_file = 100
diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py
index 4231a79b2dcef951048ca54e8c8df2f42b44b1a1..531478162971575739bbe37abfc57ca427ae22ae 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test.py
@@ -110,10 +110,10 @@ class ReductionUnknownShape(test.TestCase):
 
 class BaseReductionTest(test.TestCase):
 
-  def _tf_reduce(self, x, reduction_axes, keep_dims):
+  def _tf_reduce(self, x, reduction_axes, keepdims):
     raise NotImplementedError()
 
-  def _np_reduce(self, x, reduction_axes, keep_dims):
+  def _np_reduce(self, x, reduction_axes, keepdims):
     raise NotImplementedError()
 
   def _makeIncremental(self, shape, dtype):
@@ -128,10 +128,10 @@ class BaseReductionTest(test.TestCase):
       data -= 2j * data
     return data
 
-  def _compare(self, x, reduction_axes, keep_dims, feed_dict=None):
-    np_ans = self._np_reduce(x, reduction_axes, keep_dims)
+  def _compare(self, x, reduction_axes, keepdims, feed_dict=None):
+    np_ans = self._np_reduce(x, reduction_axes, keepdims)
     with self.test_session(use_gpu=True) as sess:
-      tf_ans = self._tf_reduce(x, reduction_axes, keep_dims)
+      tf_ans = self._tf_reduce(x, reduction_axes, keepdims)
       out = sess.run(tf_ans, feed_dict)
     self.assertAllClose(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
@@ -140,8 +140,8 @@ class BaseReductionTest(test.TestCase):
     if reduction_axes is not None and np.shape(reduction_axes) == (1,):
       # Test scalar reduction_axes argument
       self._compareAll(x, reduction_axes[0])
-    self._compare(x, reduction_axes, keep_dims=False, feed_dict=feed_dict)
-    self._compare(x, reduction_axes, keep_dims=True, feed_dict=feed_dict)
+    self._compare(x, reduction_axes, keepdims=False, feed_dict=feed_dict)
+    self._compare(x, reduction_axes, keepdims=True, feed_dict=feed_dict)
 
   def _compareAllAxes(self, x, feed_dict=None):
     self._compareAll(x, None)
@@ -171,14 +171,14 @@ class BaseReductionTest(test.TestCase):
 
 class SumReductionTest(BaseReductionTest):
 
-  def _tf_reduce(self, x, reduction_axes, keep_dims):
-    return math_ops.reduce_sum(x, reduction_axes, keep_dims)
+  def _tf_reduce(self, x, reduction_axes, keepdims):
+    return math_ops.reduce_sum(x, reduction_axes, keepdims)
 
-  def _np_reduce(self, x, reduction_axes, keep_dims):
+  def _np_reduce(self, x, reduction_axes, keepdims):
     if isinstance(reduction_axes, list) or isinstance(reduction_axes,
                                                       np.ndarray):
       reduction_axes = tuple(reduction_axes)
-    return np.sum(x, axis=reduction_axes, keepdims=keep_dims)
+    return np.sum(x, axis=reduction_axes, keepdims=keepdims)
 
   def testAxesType(self):
     for dtype in [dtypes.int64, dtypes.int32]:
@@ -298,7 +298,7 @@ class SumReductionTest(BaseReductionTest):
     c_known_rank = array_ops.placeholder(dtypes.float32)
     c_known_rank.set_shape(tensor_shape.unknown_shape(ndims=3))
     s_known_rank = math_ops.reduce_sum(
-        c_known_rank, reduction_axes, keep_dims=True)
+        c_known_rank, reduction_axes, keepdims=True)
     self.assertEqual(3, s_known_rank.get_shape().ndims)
 
     np_input = np.random.randn(3, 3, 3)
@@ -308,11 +308,11 @@ class SumReductionTest(BaseReductionTest):
     unknown_indices = array_ops.placeholder(dtypes.int32)
     c_unknown_indices = constant_op.constant([[10.0], [20.0]])
     s_unknown_indices = math_ops.reduce_sum(
-        c_unknown_indices, unknown_indices, keep_dims=False)
+        c_unknown_indices, unknown_indices, keepdims=False)
     self.assertEqual(tensor_shape.unknown_shape(),
                      s_unknown_indices.get_shape())
     s_unknown_indices_keep = math_ops.reduce_sum(
-        c_unknown_indices, unknown_indices, keep_dims=True)
+        c_unknown_indices, unknown_indices, keepdims=True)
     self.assertEqual(2, s_unknown_indices_keep.get_shape().ndims)
 
   def testWrongShapeForReductionIndices(self):
@@ -372,10 +372,10 @@ class SumReductionTest(BaseReductionTest):
 
 class MeanReductionTest(BaseReductionTest):
 
-  def _tf_reduce(self, x, reduction_axes, keep_dims):
-    return math_ops.reduce_mean(x, reduction_axes, keep_dims)
+  def _tf_reduce(self, x, reduction_axes, keepdims):
+    return math_ops.reduce_mean(x, reduction_axes, keepdims)
 
-  def _np_reduce(self, x, reduction_axes, keep_dims):
+  def _np_reduce(self, x, reduction_axes, keepdims):
     if isinstance(reduction_axes, list) or isinstance(reduction_axes,
                                                       np.ndarray):
       reduction_axes = tuple(reduction_axes)
@@ -389,7 +389,7 @@ class MeanReductionTest(BaseReductionTest):
     # np.mean automatically converts integer inputs to float, while TensorFlow's
     # reduce_mean does not. For integer inputs, we emulate TensorFlow's behavior
     # using np.sum and truncating division.
-    np_sum = np.sum(x, axis=reduction_axes, keepdims=keep_dims)
+    np_sum = np.sum(x, axis=reduction_axes, keepdims=keepdims)
     if np.issubdtype(x.dtype, np.integer):
       return np_sum // count
     return np_sum / count
@@ -458,14 +458,14 @@ class MeanReductionTest(BaseReductionTest):
 
 class ProdReductionTest(BaseReductionTest):
 
-  def _tf_reduce(self, x, reduction_axes, keep_dims):
-    return math_ops.reduce_prod(x, reduction_axes, keep_dims)
+  def _tf_reduce(self, x, reduction_axes, keepdims):
+    return math_ops.reduce_prod(x, reduction_axes, keepdims)
 
-  def _np_reduce(self, x, reduction_axes, keep_dims):
+  def _np_reduce(self, x, reduction_axes, keepdims):
     if isinstance(reduction_axes, list) or isinstance(reduction_axes,
                                                       np.ndarray):
       reduction_axes = tuple(reduction_axes)
-    return np.prod(x, axis=reduction_axes, keepdims=keep_dims)
+    return np.prod(x, axis=reduction_axes, keepdims=keepdims)
 
   def testAxesType(self):
     for dtype in [dtypes.int64, dtypes.int32]:
@@ -549,17 +549,17 @@ class ProdReductionTest(BaseReductionTest):
 
 class MinReductionTest(test.TestCase):
 
-  def _compare(self, x, reduction_axes, keep_dims, use_gpu=False):
+  def _compare(self, x, reduction_axes, keepdims, use_gpu=False):
     np_ans = x
     if reduction_axes is None:
-      np_ans = np.amin(np_ans, keepdims=keep_dims)
+      np_ans = np.amin(np_ans, keepdims=keepdims)
     else:
       for ra in reduction_axes[::-1]:
-        np_ans = np.amin(np_ans, axis=ra, keepdims=keep_dims)
+        np_ans = np.amin(np_ans, axis=ra, keepdims=keepdims)
     with self.test_session(use_gpu=use_gpu):
       if reduction_axes is not None:
         reduction_axes = np.array(reduction_axes).astype(np.int32)
-      tf_ans = math_ops.reduce_min(x, reduction_axes, keep_dims)
+      tf_ans = math_ops.reduce_min(x, reduction_axes, keepdims)
       out = tf_ans.eval()
     self.assertAllClose(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
@@ -662,17 +662,17 @@ class MinReductionTest(test.TestCase):
 
 class MaxReductionTest(test.TestCase):
 
-  def _compare(self, x, reduction_axes, keep_dims, use_gpu=False):
+  def _compare(self, x, reduction_axes, keepdims, use_gpu=False):
     np_ans = x
     if reduction_axes is None:
-      np_ans = np.amax(np_ans, keepdims=keep_dims)
+      np_ans = np.amax(np_ans, keepdims=keepdims)
     else:
       for ra in reduction_axes[::-1]:
-        np_ans = np.amax(np_ans, axis=ra, keepdims=keep_dims)
+        np_ans = np.amax(np_ans, axis=ra, keepdims=keepdims)
     with self.test_session(use_gpu=use_gpu):
       if reduction_axes is not None:
         reduction_axes = np.array(reduction_axes).astype(np.int32)
-      tf_ans = math_ops.reduce_max(x, reduction_axes, keep_dims)
+      tf_ans = math_ops.reduce_max(x, reduction_axes, keepdims)
       out = tf_ans.eval()
     self.assertAllClose(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
@@ -789,17 +789,17 @@ class MaxReductionTest(test.TestCase):
 
 class AllReductionTest(test.TestCase):
 
-  def _compare(self, x, reduction_axes, keep_dims, use_gpu=False):
+  def _compare(self, x, reduction_axes, keepdims, use_gpu=False):
     np_ans = x
     if reduction_axes is None:
-      np_ans = np.all(np_ans, keepdims=keep_dims)
+      np_ans = np.all(np_ans, keepdims=keepdims)
     else:
       for ra in reduction_axes[::-1]:
-        np_ans = np.all(np_ans, axis=ra, keepdims=keep_dims)
+        np_ans = np.all(np_ans, axis=ra, keepdims=keepdims)
     with self.test_session(use_gpu=use_gpu):
       if reduction_axes is not None:
         reduction_axes = np.array(reduction_axes).astype(np.int32)
-      tf_ans = math_ops.reduce_all(x, reduction_axes, keep_dims)
+      tf_ans = math_ops.reduce_all(x, reduction_axes, keepdims)
       out = tf_ans.eval()
     self.assertAllEqual(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
@@ -838,17 +838,17 @@ class AllReductionTest(test.TestCase):
 
 class AnyReductionTest(test.TestCase):
 
-  def _compare(self, x, reduction_axes, keep_dims, use_gpu=False):
+  def _compare(self, x, reduction_axes, keepdims, use_gpu=False):
     np_ans = x
     if reduction_axes is None:
-      np_ans = np.any(np_ans, keepdims=keep_dims)
+      np_ans = np.any(np_ans, keepdims=keepdims)
     else:
       for ra in reduction_axes[::-1]:
-        np_ans = np.any(np_ans, axis=ra, keepdims=keep_dims)
+        np_ans = np.any(np_ans, axis=ra, keepdims=keepdims)
     with self.test_session(use_gpu=use_gpu):
       if reduction_axes is not None:
         reduction_axes = np.array(reduction_axes).astype(np.int32)
-      tf_ans = math_ops.reduce_any(x, reduction_axes, keep_dims)
+      tf_ans = math_ops.reduce_any(x, reduction_axes, keepdims)
       out = tf_ans.eval()
     self.assertAllEqual(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
@@ -890,18 +890,18 @@ class CountNonzeroReductionTest(test.TestCase):
   def _compare(self,
                x,
                reduction_axes,
-               keep_dims,
+               keepdims,
                use_gpu=False,
                feed_dict=None):
     np_ans = (x != 0).astype(np.int32)
     if reduction_axes is None:
-      np_ans = np.sum(np_ans, keepdims=keep_dims)
+      np_ans = np.sum(np_ans, keepdims=keepdims)
     else:
       reduction_axes = np.array(reduction_axes).astype(np.int32)
       for ra in reduction_axes.ravel()[::-1]:
-        np_ans = np.sum(np_ans, axis=ra, keepdims=keep_dims)
+        np_ans = np.sum(np_ans, axis=ra, keepdims=keepdims)
     with self.test_session(use_gpu=use_gpu) as sess:
-      tf_ans = math_ops.count_nonzero(x, reduction_axes, keep_dims)
+      tf_ans = math_ops.count_nonzero(x, reduction_axes, keepdims)
       out = sess.run(tf_ans, feed_dict)
     self.assertAllClose(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
diff --git a/tensorflow/python/kernel_tests/reduction_ops_test_big.py b/tensorflow/python/kernel_tests/reduction_ops_test_big.py
index 0959adb026e3934713442e6f3487b30a0b252943..d70360775a03caa32eab995371d54786c3c0a0d9 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test_big.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test_big.py
@@ -27,24 +27,24 @@ from tensorflow.python.platform import test
 
 class BaseReductionTest(test.TestCase):
 
-  def _tf_reduce(self, x, reduction_axes, keep_dims):
+  def _tf_reduce(self, x, reduction_axes, keepdims):
     raise NotImplementedError()
 
 
 class BigReductionTest(BaseReductionTest):
   """Test reductions for sum and boolean all over a wide range of shapes."""
 
-  def _tf_reduce_max(self, x, reduction_axes, keep_dims):
-    return math_ops.reduce_max(x, reduction_axes, keep_dims)
+  def _tf_reduce_max(self, x, reduction_axes, keepdims):
+    return math_ops.reduce_max(x, reduction_axes, keepdims)
 
-  def _tf_reduce_all(self, x, reduction_axes, keep_dims):
-    return math_ops.reduce_all(x, reduction_axes, keep_dims)
+  def _tf_reduce_all(self, x, reduction_axes, keepdims):
+    return math_ops.reduce_all(x, reduction_axes, keepdims)
 
-  def _tf_reduce_mean(self, x, reduction_axes, keep_dims):
-    return math_ops.reduce_mean(x, reduction_axes, keep_dims)
+  def _tf_reduce_mean(self, x, reduction_axes, keepdims):
+    return math_ops.reduce_mean(x, reduction_axes, keepdims)
 
-  def _tf_reduce_sum(self, x, reduction_axes, keep_dims):
-    return math_ops.reduce_sum(x, reduction_axes, keep_dims)
+  def _tf_reduce_sum(self, x, reduction_axes, keepdims):
+    return math_ops.reduce_sum(x, reduction_axes, keepdims)
 
   def testFloat32Sum(self):
     # make sure we test all possible kernel invocations
diff --git a/tensorflow/python/kernel_tests/relu_op_test.py b/tensorflow/python/kernel_tests/relu_op_test.py
index 8cd1f52d80039deccfe4623b8bae9bb1482b8392..6b4091ae5d3c6e469a9cd5237b978eae4c75485f 100644
--- a/tensorflow/python/kernel_tests/relu_op_test.py
+++ b/tensorflow/python/kernel_tests/relu_op_test.py
@@ -48,8 +48,8 @@ class ReluTest(test.TestCase):
     self.assertAllClose(
         np.array([[0.0, 0.7, 0.0, 0.3, 0.0], [0.1, 0.0, 0.5, 0.0, 0.9]]),
         self._npRelu(
-            np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7, 0.9]
-                     ])))
+            np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7,
+                                                     0.9]])))
 
   def _testRelu(self, np_features, use_gpu=False):
     np_relu = self._npRelu(np_features)
@@ -163,8 +163,8 @@ class Relu6Test(test.TestCase):
     self.assertAllClose(
         np.array([[0.0, 0.7, 0.0, 0.3, 6.0], [0.1, 0.0, 6.0, 0.0, 0.9]]),
         self._npRelu6(
-            np.array([[-0.9, 0.7, -0.5, 0.3, 6.0], [0.1, -0.3, 6.5, -0.7, 0.9]
-                     ])))
+            np.array([[-0.9, 0.7, -0.5, 0.3, 6.0], [0.1, -0.3, 6.5, -0.7,
+                                                    0.9]])))
 
   def _testRelu6(self, np_features, use_gpu=False):
     np_relu6 = self._npRelu6(np_features)
@@ -231,8 +231,8 @@ class EluTest(test.TestCase):
         np.array([[-0.59343034025, 0.7, -0.39346934028, 0.3, -0.09516258196],
                   [0.1, -0.25918177931, 0.5, -0.5034146962, 0.9]]),
         self._npElu(
-            np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7, 0.9]
-                     ])))
+            np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7,
+                                                     0.9]])))
 
   def _testElu(self, np_features, use_gpu=False):
     np_elu = self._npElu(np_features)
@@ -330,11 +330,11 @@ class SeluTest(test.TestCase):
 
   def testNpSelu(self):
     self.assertAllClose(
-        np.array([[-1.0433095, 0.73549069, -0.6917582, 0.3152103 , -0.16730527],
-                 [0.1050701 , -0.45566732, 0.5253505, -0.88505305, 0.9456309]]),
+        np.array([[-1.0433095, 0.73549069, -0.6917582, 0.3152103, -0.16730527],
+                  [0.1050701, -0.45566732, 0.5253505, -0.88505305, 0.9456309]]),
         self._npSelu(
-            np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7, 0.9]
-                     ])))
+            np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7,
+                                                     0.9]])))
 
   def _testSelu(self, np_features, use_gpu=False):
     np_selu = self._npSelu(np_features)
@@ -441,6 +441,24 @@ class CreluTest(test.TestCase):
             np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
             use_gpu=True)
 
+  def testNumbersWithAxis0(self):
+    with self.test_session():
+      crelu = nn_ops.crelu(
+          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]), axis=0)
+      tf_relu = crelu.eval()
+      np_crelu = np.array([[0, 7, 0, 3, 0], [1, 0, 5, 0, 9], [9, 0, 5, 0, 1],
+                           [0, 3, 0, 7, 0]])
+      self.assertAllEqual(np_crelu, tf_relu)
+
+  def testNumbersWithAxis1(self):
+    with self.test_session():
+      crelu = nn_ops.crelu(
+          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]), axis=1)
+      tf_relu = crelu.eval()
+      np_crelu = np.array([[0, 7, 0, 3, 0, 9, 0, 5, 0, 1],
+                           [1, 0, 5, 0, 9, 0, 3, 0, 7, 0]])
+      self.assertAllEqual(np_crelu, tf_relu)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index 4c7a9cb0f9542afe8fc1608a05864b739d741c97..dc6e73bd5b7930d9292a4654734f55c6b29d4389 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -36,8 +36,10 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.util import compat
 
 
+@test_util.with_c_api
 class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
 
   def tearDown(self):
@@ -169,6 +171,17 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
     self.assertEqual(self.evaluate(read), [[3]])
 
+  def testScatterUpdateString(self):
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.string, shape=[1, 1])
+    self.evaluate(resource_variable_ops.assign_variable_op(
+        handle, constant_op.constant([["a"]], dtype=dtypes.string)))
+    self.evaluate(resource_variable_ops.resource_scatter_update(
+        handle, [0], constant_op.constant([["b"]], dtype=dtypes.string)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.string)
+    self.assertEqual(compat.as_bytes(self.evaluate(read)[0][0]),
+                     compat.as_bytes("b"))
+
   # TODO(alive): get this to work in Eager mode.
   def testGPU(self):
     with self.test_session(use_gpu=True):
@@ -263,6 +276,32 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     v.load(2.0)
     self.assertEqual(2.0, self.evaluate(v.value()))
 
+  def testVariableDefInitializedInstances(self):
+    with ops.Graph().as_default(), self.test_session() as sess:
+      v_def = resource_variable_ops.ResourceVariable(
+          initial_value=constant_op.constant(3.0)).to_proto()
+
+    with ops.Graph().as_default(), self.test_session() as sess:
+      # v describes a VariableDef-based variable without an initial value.
+      v = resource_variable_ops.ResourceVariable(variable_def=v_def)
+      self.assertEqual(3.0, sess.run(v.initialized_value()))
+
+      # initialized_value should not rerun the initializer_op if the variable
+      # has already been initialized elsewhere.
+      sess.run(v.assign(1.0))
+      self.assertEqual(1.0, v.initialized_value().eval())
+
+    v_def.ClearField("initial_value_name")
+    with ops.Graph().as_default(), self.test_session() as sess:
+      # Restoring a legacy VariableDef proto that does not have
+      # initial_value_name set should still work.
+      v = resource_variable_ops.ResourceVariable(variable_def=v_def)
+      # We should also be able to re-export the variable to a new meta graph.
+      self.assertProtoEquals(v_def, v.to_proto())
+      # But attempts to use initialized_value will result in errors.
+      with self.assertRaises(ValueError):
+        sess.run(v.initialized_value())
+
   @test_util.run_in_graph_and_eager_modes()
   def testSparseRead(self):
     with self.test_session():
@@ -302,7 +341,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     self.evaluate(variables.global_variables_initializer())
     self.assertEqual(3.0, self.evaluate(v.value()))
     self.evaluate(resource_variable_ops.destroy_resource_op(v.handle))
-    with self.assertRaises(errors.NotFoundError):
+    with self.assertRaises(errors.FailedPreconditionError):
       self.evaluate(v.value())
     # Handle to a resource not actually created.
     handle = resource_variable_ops.var_handle_op(dtype=dtypes.int32, shape=[])
@@ -342,14 +381,14 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       v = resource_variable_ops.ResourceVariable(
           2.0, caching_device="/job:localhost")
       self.assertEqual("/job:localhost", v.value().device)
-      with self.assertRaisesRegexp(ValueError, "No attr named '_class'"):
+      with self.assertRaises(ValueError):
         _ = v.value().op.get_attr("_class")
 
     with ops.colocate_with(v.op):
       w = resource_variable_ops.ResourceVariable(
           2.0, caching_device="/job:localhost")
       self.assertEqual("/job:localhost", w.value().device)
-      with self.assertRaisesRegexp(ValueError, "No attr named '_class'"):
+      with self.assertRaises(ValueError):
         _ = w.value().op.get_attr("_class")
 
   def testSharedName(self):
diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py
index 0c77d1db921566000c2a52e6ddb9d3dddd9b193c..daa42938e6af205425d7e423ce162294b9002be4 100644
--- a/tensorflow/python/kernel_tests/rnn_test.py
+++ b/tensorflow/python/kernel_tests/rnn_test.py
@@ -23,6 +23,7 @@ import timeit
 
 import numpy as np
 
+from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.contrib import rnn as contrib_rnn
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
diff --git a/tensorflow/python/kernel_tests/scalar_test.py b/tensorflow/python/kernel_tests/scalar_test.py
index b34426cc21590d585bf7ef7b24b778adbf0cd084..e65241981eac2d42207c1de7a261f7936e588f2a 100644
--- a/tensorflow/python/kernel_tests/scalar_test.py
+++ b/tensorflow/python/kernel_tests/scalar_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_io_ops
 from tensorflow.python.ops import math_ops
@@ -30,6 +31,7 @@ import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
 
+@test_util.with_c_api
 class ScalarTest(test.TestCase):
 
   def check(self, op, args, error, correct=None):
@@ -51,7 +53,7 @@ class ScalarTest(test.TestCase):
     # Test various GraphDef versions
     for version in strict + lenient:
       with ops.Graph().as_default() as g:
-        g.graph_def_versions.producer = version
+        test_util.set_producer_version(g, version)
         with self.test_session(graph=g) as sess:
           feed = {}
           xs = placeholders(args, feed)
diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index a79d66e9889b4dc55a66c505bac9b29a453356be..9f5794951524b2689daa5fc4eefb19703262b8f0 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -157,6 +158,20 @@ class StatefulScatterNdTest(test.TestCase):
       result = sess.run(scatter)
       self.assertAllClose(result, expected)
 
+  def testSimpleResource(self):
+    indices = constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32)
+    updates = constant_op.constant([9, 10, 11, 12], dtype=dtypes.float32)
+    ref = resource_variable_ops.ResourceVariable(
+        [0, 0, 0, 0, 0, 0, 0, 0], dtype=dtypes.float32)
+    expected = np.array([0, 11, 0, 10, 9, 0, 0, 12])
+    scatter = state_ops.scatter_nd_update(ref, indices, updates)
+    init = variables.global_variables_initializer()
+
+    with self.test_session(use_gpu=True) as sess:
+      sess.run(init)
+      sess.run(scatter)
+      self.assertAllClose(ref.eval(), expected)
+
   def testSimple2(self):
     indices = constant_op.constant([[1, 0], [1, 1]], dtype=dtypes.int32)
     updates = constant_op.constant([11., 12.], dtype=dtypes.float32)
@@ -335,7 +350,7 @@ class StatefulScatterNdTest(test.TestCase):
         indices = np.array([2, 0, 5])
         op(ref, indices, updates).eval()
 
-        # Indicies out of range should not fail.
+        # Indices out of range should not fail.
         indices = np.array([-1, 0, 5])
         op(ref, indices, updates).eval()
         indices = np.array([2, 0, 6])
@@ -487,6 +502,43 @@ class ScatterNdTest(test.TestCase):
       if self.non_aliasing_add_test:
         self.assertAllEqual(expected_input_grad, input_grad.eval())
 
+  def testGradientsRank7SliceUpdate(self):
+    indices = constant_op.constant(
+        [[[
+            [[[[0, 0, 0, 0, 0, 1], [0, 0, 1, 0, 0, 0]]]],
+            [[[[0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 1]]]]
+        ]]], dtype=dtypes.int32)
+    updates = constant_op.constant(
+        [[[
+            [[[[5, 6], [2, 4]]]],
+            [[[[1, 3], [6, 8]]]]
+        ]]], dtype=dtypes.float64)
+    shape = constant_op.constant([1, 1, 2, 1, 1, 2, 2], dtype=dtypes.int32)
+    input_ = array_ops.zeros(shape, dtype=dtypes.float64)
+    outputs = self.scatter_nd(indices, updates, shape, input_)
+
+    grad_vals = constant_op.constant(
+        [[[
+            [[[[1, 2], [3, 4]]]],
+            [[[[5, 6], [7, 8]]]]
+        ]]], dtype=dtypes.float64)
+    updates_grad, input_grad = gradients_impl.gradients(
+        [outputs], [updates, input_], [grad_vals])
+    expected_updates_grad = np.array(
+        [[[
+            [[[[3, 4], [5, 6]]]],
+            [[[[1, 2], [7, 8]]]]
+        ]]], dtype=np.float64)
+    expected_input_grad = np.array(
+        [[[
+            [[[[1, 2], [3, 4]]]],
+            [[[[5, 6], [7, 8]]]]
+        ]]], dtype=np.float64)
+    with self.test_session():
+      self.assertAllEqual(expected_updates_grad, updates_grad.eval())
+      if self.non_aliasing_add_test:
+        self.assertAllEqual(expected_input_grad, input_grad.eval())
+
   def testScatterNdRepatedIndicesAdd(self):
     indices = array_ops.zeros([100000, 1], dtypes.int32)
     values = np.random.randn(100000)
diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
index 99f9f09690f4a38f68a230efcd0dd2bf223376be..bbce6b7d47325b8209815230426672ec6894147f 100644
--- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
@@ -46,13 +46,14 @@ class SegmentReductionHelper(test.TestCase):
     return constant_op.constant(
         np_values, shape=input_shape, dtype=dtype), np_values
 
-  def _segmentReduce(self, indices, x, op1, op2=None, num_out_rows=None):
+  def _segmentReduce(self, indices, x, op1, op2=None, num_segments=None,
+                     initial_value=0):
     if not x.size:
       return np.array([])
     indices = np.asarray(indices)
-    if num_out_rows is None:
-      num_out_rows = indices[-1] + 1
-    output = [None] * num_out_rows
+    if num_segments is None:
+      num_segments = indices[-1] + 1
+    output = [None] * num_segments
     slice_shape = x.shape[indices.ndim:]
     x_flat = x.reshape((indices.size,) + slice_shape)
     for i, index in enumerate(indices.ravel()):
@@ -64,13 +65,8 @@ class SegmentReductionHelper(test.TestCase):
       else:
         output[index] = x_flat[i]
     # zero initialize values that are still uncalcuated.
-    # output = [o if o is not None else np.zeros(slice_shape) for o in output]
-    if not op1 == np.max:
-      output = [o if o is not None else np.zeros(slice_shape) for o in output]
-    else:
-      zeroslice = np.zeros(slice_shape)
-      zeroslice.fill(dtype.min)
-      output = [o if o is not None else zeroslice for o in output]
+    initial_value_slice = np.ones(slice_shape) * initial_value
+    output = [o if o is not None else initial_value_slice for o in output]
     if op2 is not None:
       output = [op2(o) for o in output]
     output = [o.reshape(slice_shape) for o in output]
@@ -82,6 +78,9 @@ class SegmentReductionHelper(test.TestCase):
   def _mean_reduce_op(self, x):
     return x[0] / x[1] if isinstance(x, tuple) else x
 
+  def _sqrt_n_reduce_op(self, x):
+    return x[0] / np.sqrt(x[1]) if isinstance(x, tuple) else x
+
 
 class SegmentReductionOpTest(SegmentReductionHelper):
 
@@ -244,47 +243,128 @@ class SegmentReductionOpTest(SegmentReductionHelper):
       self.assertAllClose(jacob_t, jacob_n)
 
 
-class UnsortedSegmentSumTest(SegmentReductionHelper):
+class UnsortedSegmentTest(SegmentReductionHelper):
+
+  def __init__(self, methodName='runTest'):
+    # Each item is np_op1, np_op2, tf_op, initial_value functor
+    self.ops_list = [(np.add, None,
+                      math_ops.unsorted_segment_sum, lambda t: 0),
+                     (self._mean_cum_op, self._mean_reduce_op,
+                      math_ops.unsorted_segment_mean, lambda t: 0),
+                     (self._mean_cum_op, self._sqrt_n_reduce_op,
+                      math_ops.unsorted_segment_sqrt_n, lambda t: 0),
+                     (np.ndarray.__mul__, None,
+                      math_ops.unsorted_segment_prod, lambda t: 1),
+                     (np.minimum, None,
+                      math_ops.unsorted_segment_min, lambda t: t.max),
+                     (np.maximum, None,
+                      math_ops.unsorted_segment_max, lambda t: t.min)]
+
+    # A subset of ops has been enabled for complex numbers
+    self.complex_ops_list = [(np.add, None,
+                              math_ops.unsorted_segment_sum, lambda t: 0)]
+    self.differentiable_dtypes = [dtypes_lib.float16, dtypes_lib.float32,
+                                  dtypes_lib.float64]
+    self.all_dtypes = (self.differentiable_dtypes +
+                       [dtypes_lib.bfloat16,
+                        dtypes_lib.int64, dtypes_lib.int32,
+                        dtypes_lib.complex64, dtypes_lib.complex128])
+    super(UnsortedSegmentTest, self).__init__(methodName=methodName)
 
   def testValues(self):
-    dtypes = [
-        dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int64,
-        dtypes_lib.int32, dtypes_lib.complex64, dtypes_lib.complex128
-    ]
+    indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
+    num_segments = 12
+    for indices in indices_flat, indices_flat.reshape(5, 2):
+      shape = indices.shape + (2,)
+      for dtype in self.all_dtypes:
+        ops_list = self.complex_ops_list if dtype.is_complex else self.ops_list
+        tf_x, np_x = self._input(shape, dtype=dtype)
+        for use_gpu in [True, False]:
+          with self.test_session(use_gpu=True):
+            for np_op1, np_op2, tf_op, init_op in ops_list:
+              # sqrt_n doesn't support integers
+              if (np_op2 == self._sqrt_n_reduce_op and dtype.is_integer):
+                continue
+              # todo(philjd): enable this test once real_div supports bfloat16
+              if (np_op2 in [self._sqrt_n_reduce_op, self._mean_reduce_op] and
+                  dtype == dtypes_lib.bfloat16):
+                continue
+              np_ans = self._segmentReduce(
+                  indices, np_x, np_op1, np_op2, num_segments=num_segments,
+                  initial_value=init_op(dtype))
+              s = tf_op(tf_x, segment_ids=indices, num_segments=num_segments)
+              tf_ans = s.eval()
+              if dtype is dtypes_lib.bfloat16:
+                tf_ans = tf_ans.astype(np.float32)
+              self.assertAllClose(np_ans, tf_ans)
+              self.assertShapeEqual(np_ans, s)
+
+  def testNumSegmentsTypes(self):
+    dtypes = [dtypes_lib.int32, dtypes_lib.int64]
     indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
     num_segments = 12
     for indices in indices_flat, indices_flat.reshape(5, 2):
       shape = indices.shape + (2,)
       for dtype in dtypes:
         with self.test_session(use_gpu=True):
-          tf_x, np_x = self._input(shape, dtype=dtype)
+          tf_x, np_x = self._input(shape)
+          num_segments_constant = constant_op.constant(
+              num_segments, dtype=dtype)
           np_ans = self._segmentReduce(
-              indices, np_x, np.add, op2=None, num_out_rows=num_segments)
+              indices, np_x, np.add, op2=None, num_segments=num_segments)
           s = math_ops.unsorted_segment_sum(
-              data=tf_x, segment_ids=indices, num_segments=num_segments)
+              data=tf_x,
+              segment_ids=indices,
+              num_segments=num_segments_constant)
           tf_ans = s.eval()
         self.assertAllClose(np_ans, tf_ans)
         self.assertShapeEqual(np_ans, s)
 
-  def testGradientSegmentSum(self):
+  def testGradients(self):
     num_cols = 2
-    indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
+    indices_flat = np.array([0, 4, 0, -1, 3, -1, 4, 7, 7, 3])
     num_segments = max(indices_flat) + 3
-    for dtype in [dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.complex64,
-                  dtypes_lib.complex128]:
+    for dtype in self.differentiable_dtypes:
+      ops_list = self.complex_ops_list if dtype.is_complex else self.ops_list
       for indices in indices_flat, indices_flat.reshape(5, 2):
         shape = indices.shape + (num_cols,)
-        with self.test_session(use_gpu=True):
-          tf_x, np_x = self._input(shape, dtype=dtype)
-          s = math_ops.unsorted_segment_sum(
-              data=tf_x, segment_ids=indices, num_segments=num_segments)
+        # test CPU and GPU as tf.gather behaves differently on each device
+        for use_gpu in [False, True]:
+          with self.test_session(use_gpu=use_gpu):
+            for _, _, tf_op, _ in ops_list:
+              tf_x, np_x = self._input(shape, dtype=dtype)
+              s = tf_op(tf_x, indices, num_segments)
+              jacob_t, jacob_n = gradient_checker.compute_gradient(
+                  tf_x,
+                  shape,
+                  s, [num_segments, num_cols],
+                  x_init_value=np_x,
+                  delta=1)
+            self.assertAllClose(jacob_t, jacob_n)
+
+  def testProdGrad(self):
+    # additional test for the prod gradient to ensure correct handling of zeros
+    values = np.array([0, 0, 1, 0, 2, 2, 3, 3, 3], dtype=np.float32)
+    indices = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2], dtype=np.int32)
+    indices_neg = np.array([-1, 0, 0, -1, 1, 1, -1, 2, 2], dtype=np.int32)
+    values_tf = constant_op.constant(values)
+    # ground truth partial derivatives
+    gradients_indices = np.zeros((9, 3), dtype=np.float32)
+    gradients_indices_neg = np.zeros((9, 3), dtype=np.float32)
+    # the derivative w.r.t. to the other segments is zero, so here we only
+    # explicitly set the grad values for the corresponding segment
+    gradients_indices[range(9), indices] = [0, 0, 0, 4, 0, 0, 9, 9, 9]
+    gradients_indices_neg[range(9), indices_neg] = [0, 1, 0, 0, 2, 2, 0, 3, 3]
+    for use_gpu in [False, True]:
+      with self.test_session(use_gpu=use_gpu):
+        for ind, grad_gt in [(indices, gradients_indices),
+                             (indices_neg, gradients_indices_neg)]:
+          s = math_ops.unsorted_segment_prod(values_tf,
+                                             constant_op.constant(ind), 3)
           jacob_t, jacob_n = gradient_checker.compute_gradient(
-              tf_x,
-              shape,
-              s, [num_segments, num_cols],
-              x_init_value=np_x,
-              delta=1)
-        self.assertAllClose(jacob_t, jacob_n)
+              values_tf, (9,), s, (3,), x_init_value=values, delta=1)
+          self.assertAllClose(jacob_t, jacob_n)
+          self.assertAllClose(jacob_t, grad_gt)
 
   def testGradientMatchesSegmentSum(self):
     # Strategy: compute the gradient for UnsortedSegmentSum and SegmentSum
@@ -297,8 +377,7 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
     num_cols = 2
     shape = [n, num_cols]
     num_segments = max(indices) + 1
-    for dtype in [dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.complex64,
-                  dtypes_lib.complex128]:
+    for dtype in self.differentiable_dtypes:
       with self.test_session(use_gpu=True):
         tf_x, np_x = self._input(shape, dtype=dtype)
         # Results from UnsortedSegmentSum
@@ -332,9 +411,8 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
           unsorted.eval()
 
   def testEmptySecondDimension(self):
-    dtypes = [
-        np.float32, np.float64, np.int64, np.int32, np.complex64, np.complex128
-    ]
+    dtypes = [np.float16, np.float32, np.float64, np.int64, np.int32,
+              np.complex64, np.complex128]
     with self.test_session(use_gpu=True):
       for dtype in dtypes:
         for itype in (np.int32, np.int64):
@@ -343,40 +421,18 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
           unsorted = math_ops.unsorted_segment_sum(data, segment_ids, 2)
           self.assertAllEqual(unsorted.eval(), np.zeros((2, 0), dtype=dtype))
 
-  def testGradientSegmentMax(self):
-    num_cols = 2
-    indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
-    num_segments = max(indices_flat) + 3
-    for indices in indices_flat, indices_flat.reshape(5, 2):
-      shape = indices.shape + (num_cols,)
-      with self.test_session(use_gpu=True):
-        tf_x, np_x = self._input(shape, dtype=dtypes_lib.float64)
-        s = math_ops.unsorted_segment_max(
-            data=tf_x, segment_ids=indices, num_segments=num_segments)
-        jacob_t, jacob_n = gradient_checker.compute_gradient(
-            tf_x,
-            shape,
-            s,
-            [num_segments, num_cols],
-            x_init_value=np_x.astype(np.double), delta=1)
-      self.assertAllClose(jacob_t, jacob_n)
-
   def testDropNegatives(self):
     # Note: the test is done by replacing segment_ids with 8 to -1
     # for index  and replace values generated by numpy with 0.
-    dtypes = [
-        dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int64,
-        dtypes_lib.int32, dtypes_lib.complex64, dtypes_lib.complex128
-    ]
     indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
     num_segments = 12
     for indices in indices_flat, indices_flat.reshape(5, 2):
       shape = indices.shape + (2,)
-      for dtype in dtypes:
+      for dtype in self.all_dtypes:
         with self.test_session(use_gpu=True):
           tf_x, np_x = self._input(shape, dtype=dtype)
           np_ans = self._segmentReduce(
-              indices, np_x, np.add, op2=None, num_out_rows=num_segments)
+              indices, np_x, np.add, op2=None, num_segments=num_segments)
           # Replace np_ans[8] with 0 for the value
           np_ans[8:] = 0
           # Replace 8 with -1 in indices
@@ -396,8 +452,15 @@ class SparseSegmentReductionHelper(SegmentReductionHelper):
     return (constant_op.constant(
         indices, dtype=dtypes_lib.int32), indices, a, b)
 
-  def _sparseSegmentReduce(self, x, indices, segment_indices, op1, op2=None):
-    return self._segmentReduce(segment_indices, x[indices], op1, op2)
+  def _sparseSegmentReduce(self,
+                           x,
+                           indices,
+                           segment_indices,
+                           op1,
+                           op2=None,
+                           num_segments=None):
+    return self._segmentReduce(
+        segment_indices, x[indices], op1, op2, num_segments=num_segments)
 
 
 class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
@@ -454,6 +517,31 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
         tf_ans = s.eval()
         self.assertAllClose(np_ans, tf_ans)
 
+  def testWithNumSegments(self):
+    tf_x, np_x = self._input([10, 4], dtype=dtypes_lib.float32)
+    ops_list = [(np.add, None, math_ops.sparse_segment_sum_with_num_segments),
+                (self._mean_cum_op, self._mean_reduce_op,
+                 math_ops.sparse_segment_mean_with_num_segments)]
+    segment_indices = [0, 2, 2, 2]
+    tf_indices = [8, 3, 0, 9]
+    num_segments = 5
+    with self.test_session(use_gpu=False):
+      for np_op1, np_op2, tf_op in ops_list:
+        np_ans = self._sparseSegmentReduce(
+            np_x,
+            tf_indices,
+            segment_indices,
+            np_op1,
+            np_op2,
+            num_segments=num_segments)
+        s = tf_op(
+            data=tf_x,
+            indices=tf_indices,
+            segment_ids=segment_indices,
+            num_segments=num_segments)
+        tf_ans = s.eval()
+        self.assertAllClose(np_ans, tf_ans)
+
   def testSegmentIdsGreaterThanZero(self):
     tf_x, np_x = self._input([10, 4], dtype=dtypes_lib.float32)
     ops_list = [(np.add, None, math_ops.sparse_segment_sum), (
@@ -562,6 +650,63 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
         with self.assertRaisesOpError("segment ids must be >= 0"):
           s.eval()
 
+  def testSegmentWithNumSegmentsValid(self):
+    # Baseline for the test*WithNumSegmentsInvalid* methods below.
+    tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
+    ops_list = [
+        math_ops.sparse_segment_sum_with_num_segments,
+        math_ops.sparse_segment_mean_with_num_segments,
+    ]
+    num_segments = 5
+    segment_indices = [0, 1, 3, 3]
+    tf_indices = [8, 3, 0, 9]
+    with self.test_session(use_gpu=False):
+      for tf_op in ops_list:
+        s = tf_op(
+            data=tf_x,
+            indices=tf_indices,
+            segment_ids=segment_indices,
+            num_segments=num_segments)
+        s.eval()
+
+  def testSegmentWithNumSegmentsInvalid1(self):
+    tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
+    ops_list = [
+        math_ops.sparse_segment_sum_with_num_segments,
+        math_ops.sparse_segment_mean_with_num_segments,
+    ]
+    num_segments = 5
+    segment_indices = [0, 1, 3, 5]
+    tf_indices = [8, 3, 0, 9]
+    with self.test_session(use_gpu=False):
+      for tf_op in ops_list:
+        s = tf_op(
+            data=tf_x,
+            indices=tf_indices,
+            segment_ids=segment_indices,
+            num_segments=num_segments)
+        with self.assertRaisesOpError("segment ids must be < num_segments"):
+          s.eval()
+
+  def testSegmentWithNumSegmentsInvalid2(self):
+    tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
+    ops_list = [
+        math_ops.sparse_segment_sum_with_num_segments,
+        math_ops.sparse_segment_mean_with_num_segments,
+    ]
+    num_segments = -2
+    segment_indices = [0, 1, 3, 3]
+    tf_indices = [8, 3, 0, 9]
+    with self.test_session(use_gpu=False):
+      for tf_op in ops_list:
+        with self.assertRaisesRegexp(
+            ValueError, "Cannot specify a negative value for num_segments"):
+          tf_op(
+              data=tf_x,
+              indices=tf_indices,
+              segment_ids=segment_indices,
+              num_segments=num_segments)
+
   def testGradient(self):
     shape = [10, 4]
 
@@ -580,6 +725,32 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
             delta=1)
       self.assertAllClose(jacob_t, jacob_n)
 
+  def testGradientWithEmptySegmentsAtEnd(self):
+    shape = [10, 4]
+
+    num_segments = 5
+    segment_indices = [0, 1, 2, 2]
+    num_indices = len(segment_indices)
+    for tf_op in [
+        math_ops.sparse_segment_sum_with_num_segments,
+        math_ops.sparse_segment_mean_with_num_segments,
+    ]:
+      with self.test_session():
+        tf_indices, _, tf_x, np_x = self._sparse_input(
+            shape, num_indices, dtype=dtypes_lib.float64)
+        s = tf_op(
+            data=tf_x,
+            indices=tf_indices,
+            segment_ids=segment_indices,
+            num_segments=num_segments)
+        jacob_t, jacob_n = gradient_checker.compute_gradient(
+            tf_x,
+            shape,
+            s, [5, 4],
+            x_init_value=np_x.astype(np.double),
+            delta=1)
+      self.assertAllClose(jacob_t, jacob_n)
+
   def testGradientValid(self):
     # Baseline for the testGradient*Invalid* methods below.
     tf_x, _ = self._input([3, 4], dtype=dtypes_lib.float32)
@@ -625,7 +796,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
     ops_list = [
         math_ops.sparse_segment_mean_grad, math_ops.sparse_segment_sqrt_n_grad
     ]
-    segment_indices = [0, 1, 1, 1]  # 2 segments
+    segment_indices = [0, 1, 1, 4]  # 5 segments
     tf_indices = [8, 3, 0, 9]
     with self.test_session(use_gpu=False):
       for tf_op in ops_list:
diff --git a/tensorflow/python/kernel_tests/slice_op_test.py b/tensorflow/python/kernel_tests/slice_op_test.py
index 6cdc7872f9176453c5ea9c318812f141214b723e..051a25080b826de05ee3e24a82fbcd1f47995544 100644
--- a/tensorflow/python/kernel_tests/slice_op_test.py
+++ b/tensorflow/python/kernel_tests/slice_op_test.py
@@ -217,30 +217,6 @@ class SliceTest(test.TestCase):
     self.assertEqual(expected_val.shape, slice_t.get_shape())
     self.assertEqual(expected_val.shape, slice2_t.get_shape())
 
-  def testRandomHighRank(self):
-    # Random dims of rank 8
-    input_shape = np.random.randint(0, 20, size=8)
-    inp = np.random.rand(*input_shape).astype("f")
-    with self.test_session(use_gpu=True) as sess:
-      a = constant_op.constant(
-          [float(x) for x in inp.ravel(order="C")],
-          shape=input_shape,
-          dtype=dtypes.float32)
-      indices = [0 if x == 0 else np.random.randint(x) for x in input_shape]
-      sizes = [
-          np.random.randint(0, input_shape[i] - indices[i] + 1)
-          for i in range(8)
-      ]
-      slice_t = array_ops.slice(a, indices, sizes)
-      slice_val = sess.run(slice_t)
-
-    expected_val = inp[indices[0]:indices[0] + sizes[0], indices[1]:indices[1] + sizes[
-      1], indices[2]:indices[2] + sizes[2], indices[3]:indices[3] + sizes[3], indices[
-        4]:indices[4] + sizes[4], indices[5]:indices[5] + sizes[5], indices[6]:indices[
-          6] + sizes[6], indices[7]:indices[7] + sizes[7]]
-    self.assertAllEqual(slice_val, expected_val)
-    self.assertEqual(expected_val.shape, slice_t.get_shape())
-
   def testPartialShapeInference(self):
     z = array_ops.zeros((1, 2, 3))
     self.assertAllEqual(z.get_shape().as_list(), [1, 2, 3])
@@ -251,6 +227,7 @@ class SliceTest(test.TestCase):
     m2 = array_ops.slice(z, [0, 0, 0], [constant_op.constant(1) + 0, 2, -1])
     self.assertAllEqual(m2.get_shape().as_list(), [None, 2, None])
 
+
   def _testGradientSlice(self, input_shape, slice_begin, slice_size):
     with self.test_session(use_gpu=True):
       num_inputs = np.prod(input_shape)
diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py
index be72c1940723ea9f1e22a3b81d2b34ad67a57f4f..4d89831aae9a5e95210a8defb180e09c9d38f4d6 100644
--- a/tensorflow/python/kernel_tests/softmax_op_test.py
+++ b/tensorflow/python/kernel_tests/softmax_op_test.py
@@ -18,18 +18,18 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import sys
-
 import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import test
 
 
+@test_util.with_c_api
 class SoftmaxTest(test.TestCase):
 
   def _npSoftmax(self, features, dim=-1, log=False):
@@ -99,10 +99,10 @@ class SoftmaxTest(test.TestCase):
 
   def _testOverflow(self, use_gpu=False):
     if use_gpu:
-      type = np.float32
+      type = np.float32  # pylint: disable=redefined-builtin
     else:
-      type = np.float64
-    max = np.finfo(type).max
+      type = np.float64  # pylint: disable=redefined-builtin
+    max = np.finfo(type).max  # pylint: disable=redefined-builtin
     features = np.array([[1., 1., 1., 1.], [max, 1., 2., 3.]]).astype(type)
     with self.test_session(use_gpu=use_gpu):
       tf_log_softmax = nn_ops.log_softmax(features)
@@ -174,8 +174,11 @@ class SoftmaxTest(test.TestCase):
 
   def testDimTooLarge(self):
     with self.test_session():
+      # Use placeholder to make sure we get runtime error instead of shape
+      # inference error.
+      dim = array_ops.placeholder_with_default(100, shape=[])
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        nn_ops.softmax([1., 2., 3., 4.], dim=100).eval()
+        nn_ops.softmax([1., 2., 3., 4.], dim=dim).eval()
 
   def testLargeDims(self):
     # Make sure that we properly handle large inputs. See
diff --git a/tensorflow/python/kernel_tests/sparse_matmul_op_test.py b/tensorflow/python/kernel_tests/sparse_matmul_op_test.py
index 6ca447967196e5cdd59df74ae637f374826a8c30..4935ed6ca557f723b14713fdcde4e11c411bea1a 100644
--- a/tensorflow/python/kernel_tests/sparse_matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_matmul_op_test.py
@@ -69,7 +69,7 @@ class SparseMatMulTest(test.TestCase):
 
     np_ans = np.matrix(np_x) * np.matrix(np_y)
     self.assertShapeEqual(np_ans, tf_ans)
-    self.assertAllClose(np_ans, out, rtol=1e-4, atol=1e-4)
+    self.assertAllCloseAccordingToType(np_ans, out, rtol=1e-4, atol=1e-4)
 
   def testBasic(self):
     x = np.arange(0., 4.).reshape([4, 1]).astype(np.float32)
@@ -128,7 +128,8 @@ class SparseMatMulTest(test.TestCase):
 
 class MatMulGradientTest(test.TestCase):
 
-  def _testGradients(self, tr_a, tr_b, sp_a, sp_b, a_dtype, b_dtype, name):
+  def _testGradients(self, tr_a, tr_b, sp_a, sp_b, a_dtype, b_dtype, delta,
+                     name):
     with self.test_session():
       a = constant_op.constant(
           RandMatrix(
@@ -151,12 +152,12 @@ class MatMulGradientTest(test.TestCase):
           a, [2, 3] if tr_a else [3, 2],
           m, [3, 4],
           x_init_value=a.eval(),
-          delta=1 / 64.) + gradient_checker.compute_gradient_error(
+          delta=delta) + gradient_checker.compute_gradient_error(
               b, [4, 2] if tr_b else [2, 4],
               m, [3, 4],
               x_init_value=b.eval(),
-              delta=1 / 64.))
-    self.assertLess(err, 1 / 128.)
+              delta=delta))
+    self.assertLess(err, delta / 2.)
 
   def testGradientInput(self):
     for tr_a in [True, False]:
@@ -165,9 +166,15 @@ class MatMulGradientTest(test.TestCase):
           for sp_b in [True, False]:
             for a_dtype in (dtypes.float32, dtypes.bfloat16):
               for b_dtype in (dtypes.float32, dtypes.bfloat16):
+                # Note: bfloat16 only has 7 mantissa bits, versus float32 with
+                # 10. Hence, we shift by 2 bits to pass the test.
+                if a_dtype == dtypes.bfloat16 and b_dtype == dtypes.bfloat16:
+                  delta = 1 / 16.
+                else:
+                  delta = 1 / 64.
                 name = "sparse_matmul_%s_%s_%s_%s" % (tr_a, tr_b, sp_a, sp_b)
                 self._testGradients(tr_a, tr_b, sp_a, sp_b, a_dtype, b_dtype,
-                                    name)
+                                    delta, name)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py
index 1ab78a07784b7d0a6b4852c5336d1d0519f3b00f..cb5a66312fdfbc930483d59248848cf39cb6f9ba 100644
--- a/tensorflow/python/kernel_tests/sparse_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_ops_test.py
@@ -938,6 +938,7 @@ class SparseTransposeTest(test.TestCase):
           sp_trans = sparse_ops.sparse_transpose(sp_input, perm=perm)
           dn_trans = sparse_ops.sparse_tensor_to_dense(sp_trans).eval()
           expected_trans = array_ops.transpose(dn_input, perm=perm).eval()
+          self.assertAllEqual(expected_trans.shape, sp_trans.get_shape())
           self.assertAllEqual(dn_trans, expected_trans)
 
 
diff --git a/tensorflow/python/kernel_tests/sparse_reshape_op_test.py b/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
index 0d2887f3cef88605e87bddb7830845f12e37220b..89a54c8ab6fb19c79404222365124b72dd3b6f3f 100644
--- a/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
@@ -57,6 +57,25 @@ class SparseReshapeTest(test.TestCase):
     sp_output = sparse_ops.sparse_reshape(sp_input, shape=(1, 5, 2, 3))
     self.assertAllEqual((1, 5, 2, 3), sp_output.get_shape())
 
+  def testStaticShapeInfoPreservedWithInferredDims(self):
+    sp_input = sparse_tensor.SparseTensor.from_value(
+        self._SparseTensorValue_2x3x4())
+    self.assertAllEqual((2, 3, 4), sp_input.get_shape())
+    sp_output = sparse_ops.sparse_reshape(sp_input, shape=(2, -1))
+    self.assertAllEqual((2, 3 * 4), sp_output.get_shape())
+
+  def testRaisesIfMoreThanOneInferredDim(self):
+    sp_input = sparse_tensor.SparseTensor.from_value(
+        self._SparseTensorValue_2x3x4())
+    with self.assertRaisesRegexp(ValueError, "At most one dimension can"):
+      sparse_ops.sparse_reshape(sp_input, shape=(-1, 2, -1))
+
+  def testRaisesIfInferredShapeNotPossible(self):
+    sp_input = sparse_tensor.SparseTensor.from_value(
+        self._SparseTensorValue_2x3x4())
+    with self.assertRaisesRegexp(ValueError, "Cannot reshape"):
+      sparse_ops.sparse_reshape(sp_input, shape=(-1, 7))
+
   def testSameShape(self):
     with self.test_session(use_gpu=False) as sess:
       input_val = self._SparseTensorValue_5x6()
diff --git a/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py b/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py
index 78c113f51442c00984c1a5ab32a4dcc1a555ca9a..27b39a626fcc6b2705bf9e797b5293ed3f1c7820 100644
--- a/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py
@@ -64,12 +64,14 @@ class SerializeSparseTest(test.TestCase):
     shape = np.array([3, 4, 5]).astype(np.int64)
     return sparse_tensor_lib.SparseTensorValue(ind, val, shape)
 
-  def testSerializeDeserialize(self):
+  def _testSerializeDeserializeHelper(self,
+                                      serialize_fn,
+                                      deserialize_fn,
+                                      out_type=dtypes.string):
     with self.test_session(use_gpu=False) as sess:
       sp_input = self._SparseTensorValue_5x6(np.arange(6))
-      serialized = sparse_ops.serialize_sparse(sp_input)
-      sp_deserialized = sparse_ops.deserialize_sparse(
-          serialized, dtype=dtypes.int32)
+      serialized = serialize_fn(sp_input, out_type=out_type)
+      sp_deserialized = deserialize_fn(serialized, dtype=dtypes.int32)
 
       indices, values, shape = sess.run(sp_deserialized)
 
@@ -77,14 +79,25 @@ class SerializeSparseTest(test.TestCase):
       self.assertAllEqual(values, sp_input[1])
       self.assertAllEqual(shape, sp_input[2])
 
-  def testSerializeDeserializeBatch(self):
+  def testSerializeDeserialize(self):
+    self._testSerializeDeserializeHelper(sparse_ops.serialize_sparse,
+                                         sparse_ops.deserialize_sparse)
+
+  def testVariantSerializeDeserialize(self):
+    self._testSerializeDeserializeHelper(sparse_ops.serialize_sparse,
+                                         sparse_ops.deserialize_sparse,
+                                         dtypes.variant)
+
+  def _testSerializeDeserializeBatchHelper(self,
+                                           serialize_fn,
+                                           deserialize_fn,
+                                           out_type=dtypes.string):
     with self.test_session(use_gpu=False) as sess:
       sp_input = self._SparseTensorValue_5x6(np.arange(6))
-      serialized = sparse_ops.serialize_sparse(sp_input)
+      serialized = serialize_fn(sp_input, out_type=out_type)
       serialized = array_ops.stack([serialized, serialized])
 
-      sp_deserialized = sparse_ops.deserialize_sparse(
-          serialized, dtype=dtypes.int32)
+      sp_deserialized = deserialize_fn(serialized, dtype=dtypes.int32)
 
       combined_indices, combined_values, combined_shape = sess.run(
           sp_deserialized)
@@ -97,16 +110,29 @@ class SerializeSparseTest(test.TestCase):
       self.assertAllEqual(combined_values[6:], sp_input[1])
       self.assertAllEqual(combined_shape, [2, 5, 6])
 
-  def testSerializeDeserializeBatchInconsistentShape(self):
+  def testSerializeDeserializeBatch(self):
+    self._testSerializeDeserializeBatchHelper(sparse_ops.serialize_sparse,
+                                              sparse_ops.deserialize_sparse)
+
+  def testSerializeDeserializeManyBatch(self):
+    self._testSerializeDeserializeBatchHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
+
+  def testVariantSerializeDeserializeBatch(self):
+    self._testSerializeDeserializeBatchHelper(sparse_ops.serialize_sparse,
+                                              sparse_ops.deserialize_sparse,
+                                              dtypes.variant)
+
+  def _testSerializeDeserializeBatchInconsistentShapeHelper(
+      self, serialize_fn, deserialize_fn, out_type=dtypes.string):
     with self.test_session(use_gpu=False) as sess:
       sp_input0 = self._SparseTensorValue_5x6(np.arange(6))
       sp_input1 = self._SparseTensorValue_3x4(np.arange(6))
-      serialized0 = sparse_ops.serialize_sparse(sp_input0)
-      serialized1 = sparse_ops.serialize_sparse(sp_input1)
+      serialized0 = serialize_fn(sp_input0, out_type=out_type)
+      serialized1 = serialize_fn(sp_input1, out_type=out_type)
       serialized = array_ops.stack([serialized0, serialized1])
 
-      sp_deserialized = sparse_ops.deserialize_sparse(
-          serialized, dtype=dtypes.int32)
+      sp_deserialized = deserialize_fn(serialized, dtype=dtypes.int32)
 
       combined_indices, combined_values, combined_shape = sess.run(
           sp_deserialized)
@@ -119,15 +145,26 @@ class SerializeSparseTest(test.TestCase):
       self.assertAllEqual(combined_values[6:], sp_input1[1])
       self.assertAllEqual(combined_shape, [2, 5, 6])
 
-  def testSerializeDeserializeNestedBatch(self):
+  def testSerializeDeserializeBatchInconsistentShape(self):
+    self._testSerializeDeserializeBatchInconsistentShapeHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse)
+
+  def testVariantSerializeDeserializeBatchInconsistentShape(self):
+    self._testSerializeDeserializeBatchInconsistentShapeHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse,
+        dtypes.variant)
+
+  def _testSerializeDeserializeNestedBatchHelper(self,
+                                                 serialize_fn,
+                                                 deserialize_fn,
+                                                 out_type=dtypes.string):
     with self.test_session(use_gpu=False) as sess:
       sp_input = self._SparseTensorValue_5x6(np.arange(6))
-      serialized = sparse_ops.serialize_sparse(sp_input)
+      serialized = serialize_fn(sp_input, out_type=out_type)
       serialized = array_ops.stack([serialized, serialized])
       serialized = array_ops.stack([serialized, serialized])
 
-      sp_deserialized = sparse_ops.deserialize_sparse(
-          serialized, dtype=dtypes.int32)
+      sp_deserialized = deserialize_fn(serialized, dtype=dtypes.int32)
 
       combined_indices, combined_values, combined_shape = sess.run(
           sp_deserialized)
@@ -151,40 +188,29 @@ class SerializeSparseTest(test.TestCase):
 
       self.assertAllEqual(combined_shape, [2, 2, 5, 6])
 
-  def testSerializeDeserializeMany(self):
-    with self.test_session(use_gpu=False) as sess:
-      sp_input0 = self._SparseTensorValue_5x6(np.arange(6))
-      sp_input1 = self._SparseTensorValue_3x4(np.arange(6))
-      serialized0 = sparse_ops.serialize_sparse(sp_input0)
-      serialized1 = sparse_ops.serialize_sparse(sp_input1)
-      serialized_concat = array_ops.stack([serialized0, serialized1])
-
-      sp_deserialized = sparse_ops.deserialize_many_sparse(
-          serialized_concat, dtype=dtypes.int32)
-
-      combined_indices, combined_values, combined_shape = sess.run(
-          sp_deserialized)
-
-      self.assertAllEqual(combined_indices[:6, 0], [0] * 6)  # minibatch 0
-      self.assertAllEqual(combined_indices[:6, 1:], sp_input0[0])
-      self.assertAllEqual(combined_indices[6:, 0], [1] * 6)  # minibatch 1
-      self.assertAllEqual(combined_indices[6:, 1:], sp_input1[0])
-      self.assertAllEqual(combined_values[:6], sp_input0[1])
-      self.assertAllEqual(combined_values[6:], sp_input1[1])
-      self.assertAllEqual(combined_shape, [2, 5, 6])
-
-  def testFeedSerializeDeserializeMany(self):
+  def testSerializeDeserializeNestedBatch(self):
+    self._testSerializeDeserializeNestedBatchHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse)
+
+  def testVariantSerializeDeserializeNestedBatch(self):
+    self._testSerializeDeserializeNestedBatchHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse,
+        dtypes.variant)
+
+  def _testFeedSerializeDeserializeBatchHelper(self,
+                                               serialize_fn,
+                                               deserialize_fn,
+                                               out_type=dtypes.string):
     with self.test_session(use_gpu=False) as sess:
       sp_input0 = self._SparseTensorPlaceholder()
       sp_input1 = self._SparseTensorPlaceholder()
       input0_val = self._SparseTensorValue_5x6(np.arange(6))
       input1_val = self._SparseTensorValue_3x4(np.arange(6))
-      serialized0 = sparse_ops.serialize_sparse(sp_input0)
-      serialized1 = sparse_ops.serialize_sparse(sp_input1)
+      serialized0 = serialize_fn(sp_input0, out_type=out_type)
+      serialized1 = serialize_fn(sp_input1, out_type=out_type)
       serialized_concat = array_ops.stack([serialized0, serialized1])
 
-      sp_deserialized = sparse_ops.deserialize_many_sparse(
-          serialized_concat, dtype=dtypes.int32)
+      sp_deserialized = deserialize_fn(serialized_concat, dtype=dtypes.int32)
 
       combined_indices, combined_values, combined_shape = sess.run(
           sp_deserialized, {sp_input0: input0_val,
@@ -198,40 +224,96 @@ class SerializeSparseTest(test.TestCase):
       self.assertAllEqual(combined_values[6:], input1_val[1])
       self.assertAllEqual(combined_shape, [2, 5, 6])
 
-  def testSerializeManyDeserializeManyRoundTrip(self):
+  def testFeedSerializeDeserializeBatch(self):
+    self._testFeedSerializeDeserializeBatchHelper(sparse_ops.serialize_sparse,
+                                                  sparse_ops.deserialize_sparse)
+
+  def testFeedSerializeDeserializeManyBatch(self):
+    self._testFeedSerializeDeserializeBatchHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
+
+  def testFeedVariantSerializeDeserializeBatch(self):
+    self._testFeedSerializeDeserializeBatchHelper(sparse_ops.serialize_sparse,
+                                                  sparse_ops.deserialize_sparse,
+                                                  dtypes.variant)
+
+  def _testSerializeManyShapeHelper(self,
+                                    serialize_many_fn,
+                                    out_type=dtypes.string):
     with self.test_session(use_gpu=False) as sess:
       # N == 4 because shape_value == [4, 5]
       indices_value = np.array([[0, 0], [0, 1], [2, 0]], dtype=np.int64)
       values_value = np.array([b"a", b"b", b"c"])
       shape_value = np.array([4, 5], dtype=np.int64)
       sparse_tensor = self._SparseTensorPlaceholder(dtype=dtypes.string)
-      serialized = sparse_ops.serialize_many_sparse(sparse_tensor)
-      deserialized = sparse_ops.deserialize_many_sparse(
-          serialized, dtype=dtypes.string)
-      serialized_value, deserialized_value = sess.run(
-          [serialized, deserialized],
+      serialized = serialize_many_fn(sparse_tensor, out_type=out_type)
+      serialized_value = sess.run(
+          serialized,
           feed_dict={
               sparse_tensor.indices: indices_value,
               sparse_tensor.values: values_value,
               sparse_tensor.dense_shape: shape_value
           })
       self.assertEqual(serialized_value.shape, (4, 3))
+
+  def testSerializeManyShape(self):
+    self._testSerializeManyShapeHelper(sparse_ops.serialize_many_sparse)
+
+  def testVariantSerializeManyShape(self):
+    # NOTE: The following test is a no-op as it is currently not possible to
+    # convert the serialized variant value to a numpy value.
+    pass
+
+  def _testSerializeManyDeserializeBatchHelper(self,
+                                               serialize_many_fn,
+                                               deserialize_fn,
+                                               out_type=dtypes.string):
+    with self.test_session(use_gpu=False) as sess:
+      # N == 4 because shape_value == [4, 5]
+      indices_value = np.array([[0, 0], [0, 1], [2, 0]], dtype=np.int64)
+      values_value = np.array([b"a", b"b", b"c"])
+      shape_value = np.array([4, 5], dtype=np.int64)
+      sparse_tensor = self._SparseTensorPlaceholder(dtype=dtypes.string)
+      serialized = serialize_many_fn(sparse_tensor, out_type=out_type)
+      deserialized = deserialize_fn(serialized, dtype=dtypes.string)
+      deserialized_value = sess.run(
+          deserialized,
+          feed_dict={
+              sparse_tensor.indices: indices_value,
+              sparse_tensor.values: values_value,
+              sparse_tensor.dense_shape: shape_value
+          })
       self.assertAllEqual(deserialized_value.indices, indices_value)
       self.assertAllEqual(deserialized_value.values, values_value)
       self.assertAllEqual(deserialized_value.dense_shape, shape_value)
 
-  def testDeserializeFailsWrongType(self):
+  def testSerializeManyDeserializeBatch(self):
+    self._testSerializeManyDeserializeBatchHelper(
+        sparse_ops.serialize_many_sparse, sparse_ops.deserialize_sparse)
+
+  def testSerializeManyDeserializeManyBatch(self):
+    self._testSerializeManyDeserializeBatchHelper(
+        sparse_ops.serialize_many_sparse, sparse_ops.deserialize_many_sparse)
+
+  def testVariantSerializeManyDeserializeBatch(self):
+    self._testSerializeManyDeserializeBatchHelper(
+        sparse_ops.serialize_many_sparse, sparse_ops.deserialize_sparse,
+        dtypes.variant)
+
+  def _testDeserializeFailsWrongTypeHelper(self,
+                                           serialize_fn,
+                                           deserialize_fn,
+                                           out_type=dtypes.string):
     with self.test_session(use_gpu=False) as sess:
       sp_input0 = self._SparseTensorPlaceholder()
       sp_input1 = self._SparseTensorPlaceholder()
       input0_val = self._SparseTensorValue_5x6(np.arange(6))
       input1_val = self._SparseTensorValue_3x4(np.arange(6))
-      serialized0 = sparse_ops.serialize_sparse(sp_input0)
-      serialized1 = sparse_ops.serialize_sparse(sp_input1)
+      serialized0 = serialize_fn(sp_input0, out_type=out_type)
+      serialized1 = serialize_fn(sp_input1, out_type=out_type)
       serialized_concat = array_ops.stack([serialized0, serialized1])
 
-      sp_deserialized = sparse_ops.deserialize_many_sparse(
-          serialized_concat, dtype=dtypes.int64)
+      sp_deserialized = deserialize_fn(serialized_concat, dtype=dtypes.int64)
 
       with self.assertRaisesOpError(
           r"Requested SparseTensor of type int64 but "
@@ -240,41 +322,78 @@ class SerializeSparseTest(test.TestCase):
                  {sp_input0: input0_val,
                   sp_input1: input1_val})
 
-  def testDeserializeFailsInconsistentRank(self):
+  def testDeserializeFailsWrongType(self):
+    self._testDeserializeFailsWrongTypeHelper(sparse_ops.serialize_sparse,
+                                              sparse_ops.deserialize_sparse)
+
+  def testDeserializeManyFailsWrongType(self):
+    self._testDeserializeFailsWrongTypeHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
+
+  def testVariantDeserializeFailsWrongType(self):
+    self._testDeserializeFailsWrongTypeHelper(sparse_ops.serialize_sparse,
+                                              sparse_ops.deserialize_sparse,
+                                              dtypes.variant)
+
+  def _testDeserializeFailsInconsistentRankHelper(self,
+                                                  serialize_fn,
+                                                  deserialize_fn,
+                                                  out_type=dtypes.string):
     with self.test_session(use_gpu=False) as sess:
       sp_input0 = self._SparseTensorPlaceholder()
       sp_input1 = self._SparseTensorPlaceholder()
       input0_val = self._SparseTensorValue_5x6(np.arange(6))
       input1_val = self._SparseTensorValue_1x1x1()
-      serialized0 = sparse_ops.serialize_sparse(sp_input0)
-      serialized1 = sparse_ops.serialize_sparse(sp_input1)
+      serialized0 = serialize_fn(sp_input0, out_type=out_type)
+      serialized1 = serialize_fn(sp_input1, out_type=out_type)
       serialized_concat = array_ops.stack([serialized0, serialized1])
 
-      sp_deserialized = sparse_ops.deserialize_many_sparse(
-          serialized_concat, dtype=dtypes.int32)
+      sp_deserialized = deserialize_fn(serialized_concat, dtype=dtypes.int32)
 
       with self.assertRaisesOpError(
-          r"Inconsistent rank across SparseTensors: rank prior to "
-          r"SparseTensor\[1\] was: 3 but rank of SparseTensor\[1\] is: 4"):
+          r"Inconsistent shape across SparseTensors: rank prior to "
+          r"SparseTensor\[1\] was: 2 but rank of SparseTensor\[1\] is: 3"):
         sess.run(sp_deserialized,
                  {sp_input0: input0_val,
                   sp_input1: input1_val})
 
-  def testDeserializeFailsInvalidProto(self):
+  def testDeserializeFailsInconsistentRank(self):
+    self._testDeserializeFailsInconsistentRankHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse)
+
+  def testDeserializeManyFailsInconsistentRank(self):
+    self._testDeserializeFailsInconsistentRankHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
+
+  def testVariantDeserializeFailsInconsistentRank(self):
+    self._testDeserializeFailsInconsistentRankHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse,
+        dtypes.variant)
+
+  def _testDeserializeFailsInvalidProtoHelper(self,
+                                              serialize_fn,
+                                              deserialize_fn,
+                                              out_type=dtypes.string):
     with self.test_session(use_gpu=False) as sess:
       sp_input0 = self._SparseTensorPlaceholder()
       input0_val = self._SparseTensorValue_5x6(np.arange(6))
-      serialized0 = sparse_ops.serialize_sparse(sp_input0)
+      serialized0 = serialize_fn(sp_input0, out_type=out_type)
       serialized1 = ["a", "b", "c"]
       serialized_concat = array_ops.stack([serialized0, serialized1])
 
-      sp_deserialized = sparse_ops.deserialize_many_sparse(
-          serialized_concat, dtype=dtypes.int32)
+      sp_deserialized = deserialize_fn(serialized_concat, dtype=dtypes.int32)
 
-      with self.assertRaisesOpError(
-          r"Could not parse serialized_sparse\[1, 0\]"):
+      with self.assertRaisesOpError(r"Could not parse serialized proto"):
         sess.run(sp_deserialized, {sp_input0: input0_val})
 
+  def testDeserializeFailsInvalidProto(self):
+    self._testDeserializeFailsInvalidProtoHelper(sparse_ops.serialize_sparse,
+                                                 sparse_ops.deserialize_sparse)
+
+  def testDeserializeManyFailsInvalidProto(self):
+    self._testDeserializeFailsInvalidProtoHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/sparse_slice_op_test.py b/tensorflow/python/kernel_tests/sparse_slice_op_test.py
index 762e400447c7e6e89ca4c0b480662aa91e287c26..da116601f833cc6b471e383e030c5fbe93b52ac5 100644
--- a/tensorflow/python/kernel_tests/sparse_slice_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_slice_op_test.py
@@ -32,11 +32,12 @@ class SparseSliceOpTest(test.TestCase):
     # [  |11|  |13|14|  ]
     # [20|  |  |23|  |25]
     # [30|  |32|33|  |35]
-    ind = np.array([[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3], [1, 4],
-                    [2, 0], [2, 3], [2, 5], [3, 0], [3, 2], [3, 3],
-                    [3, 5]]).astype(np.int64)
-    val = np.array(
-        [0, 2, 4, 5, 11, 13, 14, 20, 23, 25, 30, 32, 33, 35]).astype(np.int64)
+    ind = np.array([[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3], [1,
+                                                                     4], [2, 0],
+                    [2, 3], [2, 5], [3, 0], [3, 2], [3, 3], [3, 5]]).astype(
+                        np.int64)
+    val = np.array([0, 2, 4, 5, 11, 13, 14, 20, 23, 25, 30, 32, 33, 35]).astype(
+        np.int64)
     shape = np.array([4, 6]).astype(np.int64)
     return sparse_tensor.SparseTensor(ind, val, shape)
 
@@ -65,50 +66,49 @@ class SparseSliceOpTest(test.TestCase):
     #  [    |'c1'|    |'d1']
     #  [    |    |'e1'|    ]
     ind = np.array([[0, 0, 0], [0, 0, 1], [0, 2, 0], [0, 2, 1], [1, 1, 0],
-                    [1, 1, 1], [1, 3, 0], [1, 3, 1], [2, 2, 0],
-                    [2, 2, 1]]).astype(np.int64)
+                    [1, 1, 1], [1, 3, 0], [1, 3, 1], [2, 2, 0], [2, 2,
+                                                                 1]]).astype(
+                                                                     np.int64)
     val = np.array(['a0', 'a1', 'b0', 'b1', 'c0', 'c1', 'd0', 'd1', 'e0', 'e1'])
     shape = np.array([3, 4, 2]).astype(np.int64)
     return sparse_tensor.SparseTensorValue(ind, val, shape)
 
   def _SparseTensor_3x4x2(self):
-    return sparse_tensor.SparseTensor.from_value(self._SparseTensorValue_3x4x2(
-    ))
+    return sparse_tensor.SparseTensor.from_value(
+        self._SparseTensorValue_3x4x2())
 
   def testSliceMatrixRows(self):
     with self.test_session(use_gpu=False):
-      sp_input=self._SparseTensor_4x6()
+      sp_input = self._SparseTensor_4x6()
       sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [2, 6])
       sp_tensor1 = sparse_ops.sparse_slice(sp_input, [2, 0], [3, 7])
-      self.assertAllEqual(sp_tensor0.indices.eval(), [[0, 0], [0, 2], [0, 4],
-                                                      [0, 5], [1, 1], [1, 3],
-                                                      [1, 4]])
+      self.assertAllEqual(
+          sp_tensor0.indices.eval(),
+          [[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3], [1, 4]])
       self.assertAllEqual(sp_tensor0.values.eval(), [0, 2, 4, 5, 11, 13, 14])
       self.assertAllEqual(sp_tensor0.dense_shape.eval(), [2, 6])
-      self.assertAllEqual(sp_tensor1.indices.eval(), [[0, 0], [0, 3], [0, 5],
-                                                      [1, 0], [1, 2], [1, 3],
-                                                      [1, 5]])
+      self.assertAllEqual(
+          sp_tensor1.indices.eval(),
+          [[0, 0], [0, 3], [0, 5], [1, 0], [1, 2], [1, 3], [1, 5]])
       self.assertAllEqual(sp_tensor1.values.eval(),
                           [20, 23, 25, 30, 32, 33, 35])
       self.assertAllEqual(sp_tensor1.dense_shape.eval(), [2, 6])
 
   def testSliceMatrixUnevenCols(self):
     with self.test_session(use_gpu=False):
-      sp_input=self._SparseTensor_5x7()
+      sp_input = self._SparseTensor_5x7()
       sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [5, 3])
       sp_tensor1 = sparse_ops.sparse_slice(sp_input, [0, 3], [5, 2])
       sp_tensor2 = sparse_ops.sparse_slice(sp_input, [0, 5], [5, 2])
 
-      self.assertAllEqual(sp_tensor0.indices.eval(),
-                          [[0, 0], [0, 2], [1, 1], [2, 0], [3, 0], [3, 2],
-                           [4, 1]])
-      self.assertAllEqual(sp_tensor0.values.eval(),
-                          [0, 2, 11, 20, 30, 32, 41])
+      self.assertAllEqual(
+          sp_tensor0.indices.eval(),
+          [[0, 0], [0, 2], [1, 1], [2, 0], [3, 0], [3, 2], [4, 1]])
+      self.assertAllEqual(sp_tensor0.values.eval(), [0, 2, 11, 20, 30, 32, 41])
       self.assertAllEqual(sp_tensor0.dense_shape.eval(), [5, 3])
       self.assertAllEqual(sp_tensor1.indices.eval(),
                           [[0, 1], [1, 0], [1, 1], [2, 0], [3, 0], [4, 1]])
-      self.assertAllEqual(sp_tensor1.values.eval(),
-                          [4, 13, 14, 23, 33, 44])
+      self.assertAllEqual(sp_tensor1.values.eval(), [4, 13, 14, 23, 33, 44])
       self.assertAllEqual(sp_tensor1.dense_shape.eval(), [5, 2])
       self.assertAllEqual(sp_tensor2.indices.eval(),
                           [[0, 0], [1, 1], [2, 0], [3, 0], [4, 1]])
@@ -137,7 +137,7 @@ class SparseSliceOpTest(test.TestCase):
 
   def testSliceMatrixUnevenRows(self):
     with self.test_session(use_gpu=False):
-      sp_input=self._SparseTensor_5x7()
+      sp_input = self._SparseTensor_5x7()
       sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [3, 7])
       sp_tensor1 = sparse_ops.sparse_slice(sp_input, [3, 0], [3, 7])
       self.assertAllEqual(sp_tensor0.indices.eval(),
@@ -146,9 +146,9 @@ class SparseSliceOpTest(test.TestCase):
       self.assertAllEqual(sp_tensor0.values.eval(),
                           [0, 2, 4, 5, 11, 13, 14, 16, 20, 23, 25])
       self.assertAllEqual(sp_tensor0.dense_shape.eval(), [3, 7])
-      self.assertAllEqual(sp_tensor1.indices.eval(),
-                          [[0, 0], [0, 2], [0, 3], [0, 5], [1, 1], [1, 4],
-                           [1, 6]])
+      self.assertAllEqual(
+          sp_tensor1.indices.eval(),
+          [[0, 0], [0, 2], [0, 3], [0, 5], [1, 1], [1, 4], [1, 6]])
       self.assertAllEqual(sp_tensor1.values.eval(),
                           [30, 32, 33, 35, 41, 44, 46])
       self.assertAllEqual(sp_tensor1.dense_shape.eval(), [2, 7])
@@ -156,9 +156,9 @@ class SparseSliceOpTest(test.TestCase):
       sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [2, 7])
       sp_tensor1 = sparse_ops.sparse_slice(sp_input, [2, 0], [2, 7])
       sp_tensor2 = sparse_ops.sparse_slice(sp_input, [4, 0], [2, 7])
-      self.assertAllEqual(sp_tensor0.indices.eval(),
-                          [[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3],
-                           [1, 4], [1, 6]])
+      self.assertAllEqual(
+          sp_tensor0.indices.eval(),
+          [[0, 0], [0, 2], [0, 4], [0, 5], [1, 1], [1, 3], [1, 4], [1, 6]])
       self.assertAllEqual(sp_tensor0.values.eval(),
                           [0, 2, 4, 5, 11, 13, 14, 16])
       self.assertAllEqual(sp_tensor0.dense_shape.eval(), [2, 7])
@@ -166,45 +166,42 @@ class SparseSliceOpTest(test.TestCase):
       self.assertAllEqual(sp_tensor1.values.eval(),
                           [20, 23, 25, 30, 32, 33, 35])
       self.assertAllEqual(sp_tensor1.dense_shape.eval(), [2, 7])
-      self.assertAllEqual(sp_tensor2.indices.eval(), [[0, 1], [0, 4],
-                                                           [0, 6]])
+      self.assertAllEqual(sp_tensor2.indices.eval(), [[0, 1], [0, 4], [0, 6]])
       self.assertAllEqual(sp_tensor2.values.eval(), [41, 44, 46])
       self.assertAllEqual(sp_tensor2.dense_shape.eval(), [1, 7])
     return
 
   def testSliceAllRows(self):
     with self.test_session(use_gpu=False):
-      sp_input=self._SparseTensor_4x6()
+      sp_input = self._SparseTensor_4x6()
       sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [1, 6])
       sp_tensor1 = sparse_ops.sparse_slice(sp_input, [1, 0], [1, 6])
       sp_tensor2 = sparse_ops.sparse_slice(sp_input, [2, 0], [1, 7])
       sp_tensor3 = sparse_ops.sparse_slice(sp_input, [3, 0], [2, 7])
-      self.assertAllEqual(sp_tensor0.indices.eval(), [[0, 0], [0, 2], [0, 4],
-                                                         [0, 5]])
+      self.assertAllEqual(sp_tensor0.indices.eval(),
+                          [[0, 0], [0, 2], [0, 4], [0, 5]])
       self.assertAllEqual(sp_tensor0.values.eval(), [0, 2, 4, 5])
       self.assertAllEqual(sp_tensor0.dense_shape.eval(), [1, 6])
-      self.assertAllEqual(sp_tensor1.indices.eval(), [[0, 1], [0, 3], [0,
-                                                                          4]])
+      self.assertAllEqual(sp_tensor1.indices.eval(), [[0, 1], [0, 3], [0, 4]])
       self.assertAllEqual(sp_tensor1.values.eval(), [11, 13, 14])
       self.assertAllEqual(sp_tensor1.dense_shape.eval(), [1, 6])
-      self.assertAllEqual(sp_tensor2.indices.eval(), [[0, 0], [0, 3], [0,
-                                                                          5]])
+      self.assertAllEqual(sp_tensor2.indices.eval(), [[0, 0], [0, 3], [0, 5]])
       self.assertAllEqual(sp_tensor2.values.eval(), [20, 23, 25])
       self.assertAllEqual(sp_tensor2.dense_shape.eval(), [1, 6])
-      self.assertAllEqual(sp_tensor3.indices.eval(), [[0, 0], [0, 2], [0, 3],
-                                                         [0, 5]])
+      self.assertAllEqual(sp_tensor3.indices.eval(),
+                          [[0, 0], [0, 2], [0, 3], [0, 5]])
       self.assertAllEqual(sp_tensor3.values.eval(), [30, 32, 33, 35])
       self.assertAllEqual(sp_tensor3.dense_shape.eval(), [1, 6])
 
   def testSliceColumns(self):
     with self.test_session(use_gpu=False):
-      sp_input=self._SparseTensor_4x6()
+      sp_input = self._SparseTensor_4x6()
       sparse_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [4, 2])
       sparse_tensor1 = sparse_ops.sparse_slice(sp_input, [0, 2], [5, 2])
       sparse_tensor2 = sparse_ops.sparse_slice(sp_input, [0, 4], [5, 3])
 
-      self.assertAllEqual(sparse_tensor0.indices.eval(), [[0, 0], [1, 1],
-                                                             [2, 0], [3, 0]])
+      self.assertAllEqual(sparse_tensor0.indices.eval(),
+                          [[0, 0], [1, 1], [2, 0], [3, 0]])
       self.assertAllEqual(sparse_tensor0.values.eval(), [0, 11, 20, 30])
       self.assertAllEqual(sparse_tensor0.dense_shape.eval(), [4, 2])
       self.assertAllEqual(sparse_tensor1.indices.eval(),
@@ -218,15 +215,15 @@ class SparseSliceOpTest(test.TestCase):
 
   def testSliceAllColumns(self):
     with self.test_session(use_gpu=False):
-      sp_input=self._SparseTensor_4x6()
+      sp_input = self._SparseTensor_4x6()
       sparse_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [4, 1])
       sparse_tensor1 = sparse_ops.sparse_slice(sp_input, [0, 1], [4, 1])
       sparse_tensor2 = sparse_ops.sparse_slice(sp_input, [0, 2], [4, 1])
       sparse_tensor3 = sparse_ops.sparse_slice(sp_input, [0, 3], [4, 1])
       sparse_tensor4 = sparse_ops.sparse_slice(sp_input, [0, 4], [5, 1])
       sparse_tensor5 = sparse_ops.sparse_slice(sp_input, [0, 5], [6, 3])
-      self.assertAllEqual(sparse_tensor0.indices.eval(), [[0, 0], [2, 0],
-                                                             [3, 0]])
+      self.assertAllEqual(sparse_tensor0.indices.eval(),
+                          [[0, 0], [2, 0], [3, 0]])
       self.assertAllEqual(sparse_tensor0.values.eval(), [0, 20, 30])
       self.assertAllEqual(sparse_tensor0.dense_shape.eval(), [4, 1])
       self.assertAllEqual(sparse_tensor1.indices.eval(), [[1, 0]])
@@ -235,17 +232,18 @@ class SparseSliceOpTest(test.TestCase):
       self.assertAllEqual(sparse_tensor2.indices.eval(), [[0, 0], [3, 0]])
       self.assertAllEqual(sparse_tensor2.values.eval(), [2, 32])
       self.assertAllEqual(sparse_tensor2.dense_shape.eval(), [4, 1])
-      self.assertAllEqual(sparse_tensor3.indices.eval(), [[1, 0], [2, 0],
-                                                             [3, 0]])
+      self.assertAllEqual(sparse_tensor3.indices.eval(),
+                          [[1, 0], [2, 0], [3, 0]])
       self.assertAllEqual(sparse_tensor3.dense_shape.eval(), [4, 1])
       self.assertAllEqual(sparse_tensor3.values.eval(), [13, 23, 33])
       self.assertAllEqual(sparse_tensor4.indices.eval(), [[0, 0], [1, 0]])
       self.assertAllEqual(sparse_tensor4.values.eval(), [4, 14])
       self.assertAllEqual(sparse_tensor4.dense_shape.eval(), [4, 1])
-      self.assertAllEqual(sparse_tensor5.indices.eval(), [[0, 0], [2, 0],
-                                                             [3, 0]])
+      self.assertAllEqual(sparse_tensor5.indices.eval(),
+                          [[0, 0], [2, 0], [3, 0]])
       self.assertAllEqual(sparse_tensor5.values.eval(), [5, 25, 35])
       self.assertAllEqual(sparse_tensor5.dense_shape.eval(), [4, 1])
 
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/kernel_tests/stack_op_test.py b/tensorflow/python/kernel_tests/stack_op_test.py
index 347baf81148e9b747a9be4849912d154b220a084..2f27d1839b2218d0cc33d7278116186548ad3420 100644
--- a/tensorflow/python/kernel_tests/stack_op_test.py
+++ b/tensorflow/python/kernel_tests/stack_op_test.py
@@ -50,7 +50,7 @@ class StackOpTest(test.TestCase):
           # Convert [data[0], data[1], ...] separately to tensorflow
           # TODO(irving): Remove list() once we handle maps correctly
           xs = list(map(constant_op.constant, data))
-          # Pack back into a single tensorflow tensor
+          # Stack back into a single tensorflow tensor
           c = array_ops.stack(xs)
           self.assertAllEqual(c.eval(), data)
 
@@ -78,7 +78,7 @@ class StackOpTest(test.TestCase):
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
         for dtype in [np.bool, np.float32, np.int32, np.int64]:
           data = np.random.randn(*shape).astype(dtype)
-          # Pack back into a single tensorflow tensor directly using np array
+          # Stack back into a single tensorflow tensor directly using np array
           c = array_ops.stack(data)
           # This is implemented via a Const:
           self.assertEqual(c.op.type, "Const")
@@ -223,7 +223,7 @@ class StackOpTest(test.TestCase):
       array_ops.stack(t, axis=-3)
 
 
-class AutomaticPackingTest(test.TestCase):
+class AutomaticStackingTest(test.TestCase):
 
   def testSimple(self):
     with self.test_session(use_gpu=True):
diff --git a/tensorflow/python/kernel_tests/stage_op_test.py b/tensorflow/python/kernel_tests/stage_op_test.py
index 64b3388c5c0fd16436fa77ac5d8d0e8f9a859c32..dd06d303912813733886b9cf20590513760e67f1 100644
--- a/tensorflow/python/kernel_tests/stage_op_test.py
+++ b/tensorflow/python/kernel_tests/stage_op_test.py
@@ -25,8 +25,8 @@ from tensorflow.python.platform import test
 
 TIMEOUT = 1
 
-class StageTest(test.TestCase):
 
+class StageTest(test.TestCase):
 
   def testSimple(self):
     with ops.Graph().as_default() as G:
@@ -116,7 +116,10 @@ class StageTest(test.TestCase):
         x = array_ops.placeholder(dtypes.int32, name='x')
         p = array_ops.placeholder(dtypes.int32, name='p')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.StagingArea([dtypes.int32, ], shapes=[[]])
+        stager = data_flow_ops.StagingArea(
+            [
+                dtypes.int32,
+            ], shapes=[[]])
         stage = stager.put([x])
         peek = stager.peek(p)
         ret = stager.get()
@@ -162,8 +165,10 @@ class StageTest(test.TestCase):
       with ops.device('/cpu:0'):
         x = array_ops.placeholder(dtypes.int32, name='x')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.StagingArea([dtypes.int32, ],
-          capacity=capacity, shapes=[[]])
+        stager = data_flow_ops.StagingArea(
+            [
+                dtypes.int32,
+            ], capacity=capacity, shapes=[[]])
         stage = stager.put([x])
         ret = stager.get()
         size = stager.size()
@@ -201,9 +206,8 @@ class StageTest(test.TestCase):
         self.fail("Expected to timeout on iteration '{}' "
                   "but instead timed out on iteration '{}' "
                   "Staging Area size is '{}' and configured "
-                  "capacity is '{}'.".format(capacity, i,
-                                            sess.run(size),
-                                            capacity))
+                  "capacity is '{}'.".format(capacity, i, sess.run(size),
+                                             capacity))
 
       # Should have capacity elements in the staging area
       self.assertTrue(sess.run(size) == capacity)
@@ -216,16 +220,18 @@ class StageTest(test.TestCase):
       self.assertTrue(sess.run(size) == 0)
 
   def testMemoryLimit(self):
-    memory_limit = 512*1024  # 512K
-    chunk = 200*1024 # 256K
+    memory_limit = 512 * 1024  # 512K
+    chunk = 200 * 1024  # 256K
     capacity = memory_limit // chunk
 
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
         x = array_ops.placeholder(dtypes.uint8, name='x')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.StagingArea([dtypes.uint8, ],
-          memory_limit=memory_limit, shapes=[[]])
+        stager = data_flow_ops.StagingArea(
+            [
+                dtypes.uint8,
+            ], memory_limit=memory_limit, shapes=[[]])
         stage = stager.put([x])
         ret = stager.get()
         size = stager.size()
@@ -264,9 +270,8 @@ class StageTest(test.TestCase):
         self.fail("Expected to timeout on iteration '{}' "
                   "but instead timed out on iteration '{}' "
                   "Staging Area size is '{}' and configured "
-                  "capacity is '{}'.".format(capacity, i,
-                                            sess.run(size),
-                                            capacity))
+                  "capacity is '{}'.".format(capacity, i, sess.run(size),
+                                             capacity))
 
       # Should have capacity elements in the staging area
       self.assertTrue(sess.run(size) == capacity)
@@ -277,5 +282,6 @@ class StageTest(test.TestCase):
 
       self.assertTrue(sess.run(size) == 0)
 
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/kernel_tests/summary_image_op_test.py b/tensorflow/python/kernel_tests/summary_image_op_test.py
index d2152ab560ad27b8a761ff8029fa425fdc9ff20d..4718827e8885c328cb2e84c2f1e8880bdbdb6cae 100644
--- a/tensorflow/python/kernel_tests/summary_image_op_test.py
+++ b/tensorflow/python/kernel_tests/summary_image_op_test.py
@@ -50,7 +50,6 @@ class SummaryImageOpTest(test.TestCase):
     self.assertProtoEquals(expected, image_summ)
 
   def testImageSummary(self):
-    np.random.seed(7)
     for depth in (1, 3, 4):
       for positive in False, True:
         with self.test_session(graph=ops.Graph()) as sess:
diff --git a/tensorflow/python/kernel_tests/svd_op_test.py b/tensorflow/python/kernel_tests/svd_op_test.py
index 9871eacb0308ff72800b6a2d037eff974de020ed..d20567bf0ecf587e6bb12bfd4a2a57658d2f5914 100644
--- a/tensorflow/python/kernel_tests/svd_op_test.py
+++ b/tensorflow/python/kernel_tests/svd_op_test.py
@@ -190,10 +190,11 @@ class SvdGradOpTest(test.TestCase):
   pass  # Filled in below
 
 
-def _GetSvdGradOpTest(dtype_, shape_, compute_uv_):
+def _GetSvdGradOpTest(dtype_, shape_, compute_uv_, full_matrices_):
 
   def _NormalizingSvd(tf_a):
-    tf_s, tf_u, tf_v = linalg_ops.svd(tf_a, compute_uv=True, full_matrices=True)
+    tf_s, tf_u, tf_v = linalg_ops.svd(
+        tf_a, compute_uv=True, full_matrices=full_matrices_)
     # Singular vectors are only unique up to an arbitrary phase. We normalize
     # the vectors such that the first component of u (if m >=n) or v (if n > m)
     # have phase 0.
@@ -270,17 +271,20 @@ if __name__ == "__main__":
                          _GetSvdOpTest(dtype, shape, use_static_shape,
                                        compute_uv, full_matrices))
   for compute_uv in False, True:
-    dtypes = ([np.float32, np.float64] + [np.complex64, np.complex128] *
-              (not compute_uv))
-    for dtype in dtypes:
-      mat_shapes = ([(10, 11), (11, 10),
-                     (11, 11)] + [(5, 11), (11, 5)] * (not compute_uv))
-      for mat_shape in mat_shapes:
-        for batch_dims in [(), (3,)]:
-          shape = batch_dims + mat_shape
-          name = "%s_%s_compute_uv_%s" % (dtype.__name__,
-                                          "_".join(map(str, shape)), compute_uv)
-          _AddTest(SvdGradOpTest, "SvdGrad", name,
-                   _GetSvdGradOpTest(dtype, shape, compute_uv))
+    for full_matrices in False, True:
+      dtypes = ([np.float32, np.float64]
+                + [np.complex64, np.complex128] * (not compute_uv))
+      for dtype in dtypes:
+        mat_shapes = [(10, 11), (11, 10), (11, 11)]
+        if not full_matrices or not compute_uv:
+          mat_shapes += [(5, 11), (11, 5)]
+        for mat_shape in mat_shapes:
+          for batch_dims in [(), (3,)]:
+            shape = batch_dims + mat_shape
+            name = "%s_%s_compute_uv_%s_full_%s" % (
+                dtype.__name__, "_".join(map(str, shape)), compute_uv,
+                full_matrices)
+            _AddTest(SvdGradOpTest, "SvdGrad", name,
+                     _GetSvdGradOpTest(dtype, shape, compute_uv, full_matrices))
 
   test.main()
diff --git a/tensorflow/python/kernel_tests/template_test.py b/tensorflow/python/kernel_tests/template_test.py
index 40c0ade62a8df5a73b61c5679685ad9368c9dbbf..a519b69b22cf51ab4f4173b215c21a71d83e9f99 100644
--- a/tensorflow/python/kernel_tests/template_test.py
+++ b/tensorflow/python/kernel_tests/template_test.py
@@ -17,10 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import traceback
 
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -34,9 +36,10 @@ from tensorflow.python.platform import test
 from tensorflow.python.training import gradient_descent
 
 
-def variable_scoped_function():
+def variable_scoped_function(trainable=True):
   return variable_scope.get_variable(
-      "dummy", shape=[1], initializer=init_ops.zeros_initializer())
+      "dummy", shape=[1], trainable=trainable,
+      initializer=init_ops.zeros_initializer())
 
 
 def internally_variable_scoped_function(scope_name):
@@ -181,7 +184,8 @@ class TemplateTest(test.TestCase):
   def test_unique_name_raise_error_in_eager(self):
     with context.eager_mode():
       with self.assertRaisesRegexp(
-          ValueError, "unique_name cannot be used in eager mode."):
+          ValueError,
+          "unique_name_ cannot be used when eager exeuction is enabled."):
         template.make_template(
             "_", variable_scoped_function, unique_name_="s1")
 
@@ -306,6 +310,7 @@ class TemplateTest(test.TestCase):
     self.assertEqual("s1/nested/x:0", v1.name)
     self.assertEqual("s1_1/nested/x:0", v3.name)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_nested_templates(self):
 
     def nested_template():
@@ -313,35 +318,112 @@ class TemplateTest(test.TestCase):
       nested2 = template.make_template("nested", variable_scoped_function)
       v1 = nested1()
       v2 = nested2()
+
+      # nested1 and nested2 should not share variables
       self.assertNotEqual(v1, v2)
-      return v2
+
+      # Variables created by nested1 should be isolated from variables
+      # created by nested2.
+      self.assertEqual(nested1.variables, [v1])
+      self.assertEqual(nested2.variables, [v2])
+      self.assertEqual(nested1.trainable_variables, [v1])
+      self.assertEqual(nested2.trainable_variables, [v2])
+      self.assertEqual(len(nested1.non_trainable_variables), 0)
+      self.assertEqual(len(nested2.non_trainable_variables), 0)
+      return v1, v2
 
     tmpl1 = template.make_template("s1", nested_template)
     tmpl2 = template.make_template("s1", nested_template)
 
-    v1 = tmpl1()
-    v2 = tmpl1()
-    v3 = tmpl2()
-    self.assertTrue(v1, v2)
-    self.assertNotEqual(v1, v3)
-    self.assertEqual("s1/nested_1/dummy:0", v1.name)
-    self.assertEqual("s1_1/nested_1/dummy:0", v3.name)
+    v1, v2 = tmpl1()
+    v3, v4 = tmpl1()
+    v5, v6 = tmpl2()
+
+    # The second invocation of tmpl1 should reuse the variables
+    # created in the first invocation.
+    self.assertEqual([v1, v2], [v3, v4])
+    self.assertEqual(tmpl1.variables, [v1, v2])
+    self.assertEqual(tmpl1.trainable_variables, [v1, v2])
+    self.assertEqual(len(tmpl1.non_trainable_variables), 0)
+
+    # tmpl1 and tmpl2 should not share variables.
+    self.assertNotEqual([v1, v2], [v5, v6])
+    self.assertSequenceEqual(tmpl2.variables, [v5, v6])
+    self.assertSequenceEqual(tmpl2.trainable_variables, [v5, v6])
+    self.assertEqual(len(tmpl2.non_trainable_variables), 0)
+    self.assertEqual("s1/nested/dummy:0", v1.name)
+    self.assertEqual("s1/nested_1/dummy:0", v2.name)
+    self.assertEqual("s1_1/nested/dummy:0", v5.name)
+    self.assertEqual("s1_1/nested_1/dummy:0", v6.name)
 
-  def test_nested_eager_templates_raises_error(self):
+  @test_util.run_in_graph_and_eager_modes()
+  def test_nested_templates_with_defun(self):
+
+    def variable_scoped_function_no_return_value(trainable=True):
+      # defun cannot compile functions that return non-Tensor objects
+      _ = variable_scope.get_variable(
+          "dummy",
+          shape=[1],
+          trainable=trainable,
+          initializer=init_ops.zeros_initializer())
 
     def nested_template():
-      nested1 = template.make_template("nested", variable_scoped_function)
-      nested2 = template.make_template("nested", variable_scoped_function)
-      v1 = nested1()
-      v2 = nested2()
+      nested1 = template.make_template_internal(
+          "nested",
+          variable_scoped_function_no_return_value,
+          create_graph_function_=True)
+      nested2 = template.make_template_internal(
+          "nested",
+          variable_scoped_function_no_return_value,
+          create_graph_function_=True)
+      nested1()
+      nested2()
+      v1 = nested1.variables
+      v2 = nested2.variables
+
+      # nested1 and nested2 should not share variables
       self.assertNotEqual(v1, v2)
-      return v2
 
+      # Variables created by nested1 should be isolated from variables
+      # created by nested2.
+      self.assertEqual(nested1.variables, v1)
+      self.assertEqual(nested2.variables, v2)
+      self.assertEqual(nested1.trainable_variables, v1)
+      self.assertEqual(nested2.trainable_variables, v2)
+      self.assertEqual(len(nested1.non_trainable_variables), 0)
+      self.assertEqual(len(nested2.non_trainable_variables), 0)
+
+    tmpl1 = template.make_template("s1", nested_template)
+    tmpl2 = template.make_template("s1", nested_template)
+
+    tmpl1()
+    v1 = tmpl1.variables
+    tmpl1()
+    v2 = tmpl1.variables
+    tmpl2()
+    v3 = tmpl2.variables
+
+    # The second invocation of tmpl1 should reuse the variables
+    # created in the first invocation.
+    self.assertSequenceEqual(v1, v2)
+
+    # tmpl1 and tmpl2 should not share variables.
+    self.assertNotEqual(v1, v3)
+    self.assertEqual("s1/nested/dummy:0", v1[0].name)
+    self.assertEqual("s1/nested_1/dummy:0", v1[1].name)
+    self.assertEqual("s1_1/nested/dummy:0", v3[0].name)
+    self.assertEqual("s1_1/nested_1/dummy:0", v3[1].name)
+
+  def test_graph_function_no_name(self):
     with context.eager_mode():
-      tmpl1 = template.make_template("s1", nested_template)
-      with self.assertRaisesRegexp(
-          ValueError, "Nested EagerTemaplates are not currently supported."):
-        tmpl1()
+
+      def f(_, y):
+        return y + 1
+
+      partial = functools.partial(f, 1.0)
+      tmpl = template.make_template_internal(
+          "a", partial, create_graph_function_=True)
+      self.assertAllEqual(tmpl(ops.convert_to_tensor(1.0)), 2.0)
 
   @test_util.run_in_graph_and_eager_modes()
   def test_immediate_scope_creation(self):
@@ -413,7 +495,7 @@ class TemplateTest(test.TestCase):
     self.assertEqual(custom_getter_count[0], 2)
 
     # Test that custom getter is called when the variable scope is created
-  # during construction
+    # during construction
     custom_getter_count[0] = 0
     tmpl2 = template.make_template(
         "s2",
@@ -539,6 +621,36 @@ class TemplateTest(test.TestCase):
     # Ensure we can get the scopes before either template is actually called.
     self.assertEqual(1, len(ta.trainable_variables))
     self.assertEqual(1, len(tb.trainable_variables))
+    # None non-trainable variable was created.
+    self.assertEqual([], list(ta.non_trainable_variables))
+    self.assertEqual([], list(tb.non_trainable_variables))
+    # Ensure variables returns all the variables.
+    self.assertEqual(1, len(ta.variables))
+    self.assertEqual(1, len(tb.variables))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_non_trainable_variables(self):
+    # Make sure non_trainable_variables are created.
+    with variable_scope.variable_scope("foo2"):
+      ta = template.make_template("a", variable_scoped_function,
+                                  trainable=True)
+      tb = template.make_template("b", variable_scoped_function,
+                                  trainable=False)
+    # Initially there are not variables created.
+    self.assertEqual([], list(ta.variables))
+    self.assertEqual([], list(tb.variables))
+    # After calling there are variables created.
+    ta()
+    tb()
+    # Check the trainable and non_trainable variables.
+    self.assertEqual(1, len(ta.trainable_variables))
+    self.assertEqual([], list(ta.non_trainable_variables))
+
+    self.assertEqual([], list(tb.trainable_variables))
+    self.assertEqual(1, len(tb.non_trainable_variables))
+    # Ensure variables returns all the variables.
+    self.assertEqual(1, len(ta.variables))
+    self.assertEqual(1, len(tb.variables))
 
   # TODO(apassos) handle local variables in Eager
   def test_local_variables(self):
@@ -559,6 +671,31 @@ class TemplateTest(test.TestCase):
     self.assertEqual(0, len(ta.local_variables))
     self.assertEqual(1, len(tb.local_variables))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def test_make_template_with_defun(self):
+
+    def variable_scoped_function_no_return_value(scope_name):
+      # defun cannot compile functions that return non-Tensor objects
+      with variable_scope.variable_scope(scope_name):
+        _ = variable_scope.get_variable(
+            "dummy", shape=[1], initializer=init_ops.zeros_initializer())
+
+    tmpl = template.make_template_internal(
+        "s1",
+        variable_scoped_function_no_return_value,
+        create_graph_function_=True,
+        scope_name="test")
+
+    # The first invocation of tmpl1 creates variables, the second should
+    # be executed as a graph function.
+    tmpl()
+    v1 = tmpl.variables
+    tmpl()
+    v2 = tmpl.variables
+
+    self.assertSequenceEqual(v1, v2)
+    self.assertEqual("s1/test/dummy:0", v1[0].name)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
index 835fdbe2aa531ed28f59279e4e83d9f8297a3b98..aad2443eea7ad87faf481973e91ca3df32ccfb44 100644
--- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py
+++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
@@ -43,10 +43,6 @@ import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
 
-# TODO(ebrevdo): Delete this line after Dec. 4, 2017.
-tensor_array_ops._ENABLE_IDENTICAL_ELEMENT_SHAPES = True
-
-
 def _make_converter(tf_dtype):
   def _converter(x):
     if tf_dtype == dtypes.string:
diff --git a/tensorflow/python/kernel_tests/tensordot_op_test.py b/tensorflow/python/kernel_tests/tensordot_op_test.py
index f375157287460daff42670db4e30a06b6e75d177..8ad29afd0a0f2e7fbaaf2bde956326e578466b1d 100644
--- a/tensorflow/python/kernel_tests/tensordot_op_test.py
+++ b/tensorflow/python/kernel_tests/tensordot_op_test.py
@@ -56,15 +56,17 @@ class TensordotTest(test_lib.TestCase):
         axes_ph = array_ops.placeholder(dtypes.int32)
         output = math_ops.tensordot(a_ph, b_ph, axes_ph)
         _ = sess.run(
-            [output], feed_dict={a_ph: a,
-                                 b_ph: b,
-                                 axes_ph: (a_axes, b_axes)})
+            [output], feed_dict={
+                a_ph: a,
+                b_ph: b,
+                axes_ph: (a_axes, b_axes)
+            })
 
   def test_invalid_axes(self):
     a = [[1, 2], [3, 4]]
     b = [[1, 2], [3, 4]]
     # Invalid static axes.
-    for axes_value in -1, 0, [1], [[1]], [[1], [0, 1]]:
+    for axes_value in -1, 3, [1], [[1]], [[1], [0, 1]]:
       with self.assertRaises(ValueError):
         math_ops.tensordot(a, b, axes_value)
 
@@ -81,50 +83,51 @@ class TensordotTest(test_lib.TestCase):
       with self.test_session() as sess:
         with self.assertRaises(errors_impl.InvalidArgumentError):
           _ = sess.run(
-              [output], feed_dict={a_ph: a,
-                                   b_ph: b,
-                                   axes_ph: axes_value})
+              [output], feed_dict={
+                  a_ph: a,
+                  b_ph: b,
+                  axes_ph: axes_value
+              })
 
   # Test case for 11950
   def test_valid_axis(self):
-    for axes_value in [1, 2], [[1], [2]]:
+    for axes_value in [1, 2], [[1], [2]], [[], []], 0:
       with self.test_session() as sess:
-        np_a = np.ones((3,3))
+        np_a = np.ones((3, 3))
         np_b = np.array([2, 3, 1])[None, None]
         np_ans = np.tensordot(np_a, np_b, axes_value)
 
-        tf_a = array_ops.ones((3,3), dtype=dtypes.float32)
+        tf_a = array_ops.ones((3, 3), dtype=dtypes.float32)
         tf_b = constant_op.constant([2, 3, 1], dtype=dtypes.float32)[None, None]
         tf_ans = math_ops.tensordot(tf_a, tf_b, axes_value).eval()
 
         self.assertAllEqual(tf_ans.shape, np_ans.shape)
         self.assertAllEqual(tf_ans, np_ans)
 
-
   def test_partial_shape_inference(self):
-    a = array_ops.placeholder(dtypes.float32)
-    b = array_ops.placeholder(dtypes.float32)
-    axes = ([1], [0])
-    output = math_ops.tensordot(a, b, axes)
-    self.assertEqual(output.get_shape().ndims, None)
-    a.set_shape([None, 2])
-    b.set_shape([2, 3])
-    output = math_ops.tensordot(a, b, axes)
-    output_shape = output.get_shape()
-    self.assertEqual(output_shape.ndims, 2)
-    output_shape = output_shape.as_list()
-    self.assertEqual(output_shape[0], None)
-    self.assertEqual(output_shape[1], 3)
-    a = array_ops.placeholder(dtypes.float32)
-    b = array_ops.placeholder(dtypes.float32)
-    a.set_shape([2, 2])
-    b.set_shape([2, None])
-    output = math_ops.tensordot(a, b, axes)
-    output_shape = output.get_shape()
-    self.assertEqual(output_shape.ndims, 2)
-    output_shape = output_shape.as_list()
-    self.assertEqual(output_shape[0], 2)
-    self.assertEqual(output_shape[1], None)
+    for axes in ([1], [0]), 1:
+      a = array_ops.placeholder(dtypes.float32)
+      b = array_ops.placeholder(dtypes.float32)
+      output = math_ops.tensordot(a, b, axes)
+      self.assertEqual(output.get_shape().ndims, None)
+      a.set_shape([None, 2])
+      b.set_shape([2, 3])
+      output = math_ops.tensordot(a, b, axes)
+      output_shape = output.get_shape()
+      self.assertEqual(output_shape.ndims, 2)
+      output_shape = output_shape.as_list()
+      self.assertEqual(output_shape[0], None)
+      self.assertEqual(output_shape[1], 3)
+      a = array_ops.placeholder(dtypes.float32)
+      b = array_ops.placeholder(dtypes.float32)
+      a.set_shape([2, 2])
+      b.set_shape([2, None])
+      output = math_ops.tensordot(a, b, axes)
+      output_shape = output.get_shape()
+      self.assertEqual(output_shape.ndims, 2)
+      output_shape = output_shape.as_list()
+      self.assertEqual(output_shape[0], 2)
+      self.assertEqual(output_shape[1], None)
 
 
 def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_):
@@ -169,9 +172,11 @@ def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_):
           axes = array_ops.placeholder(dtypes.int32)
           c = math_ops.tensordot(a, b, axes)
           tf_ans = sess.run(
-              c, feed_dict={a: a_np,
-                            b: b_np,
-                            axes: (a_dims_np, b_dims_np)})
+              c, feed_dict={
+                  a: a_np,
+                  b: b_np,
+                  axes: (a_dims_np, b_dims_np)
+              })
         else:
           tf_ans = math_ops.tensordot(a_np, b_np, (a_dims_np, b_dims_np)).eval()
       self.assertAllClose(tf_ans, np_ans, rtol=tol, atol=tol)
@@ -191,8 +196,8 @@ def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_):
         low=-1.0, high=1.0, size=np.prod(shape)).reshape(shape).astype(dtype_)
     b_np = np.random.uniform(
         low=-1.0, high=1.0, size=np.prod(shape)).reshape(shape).astype(dtype_)
-    all_axes = [1]
-    if a_np.ndim > 1:
+    all_axes = [0, 1]
+    if a_np.ndim > 2:
       all_axes.append(a_np.ndim - 1)
     for axes in all_axes:
       np_ans = np.tensordot(a_np, b_np, axes=axes)
diff --git a/tensorflow/python/kernel_tests/topk_op_test.py b/tensorflow/python/kernel_tests/topk_op_test.py
index efb5b9f3641ceaebf1fd5285486b4a9bb93615cf..6ab931fdb97a8945ab610fda27a036693f0291e5 100644
--- a/tensorflow/python/kernel_tests/topk_op_test.py
+++ b/tensorflow/python/kernel_tests/topk_op_test.py
@@ -58,7 +58,7 @@ class TopKTest(test.TestCase):
         # Do some special casing of equality of indices: if indices
         # are not the same, but values are floating type, ensure that
         # the values are within epsilon of each other.
-        if not np.issubdtype(np_expected_values.dtype, np.float):
+        if not np.issubdtype(np_expected_values.dtype, np.floating):
           # Values are not floating point type; check indices exactly
           self.assertAllEqual(np_expected_indices, indices)
         else:
diff --git a/tensorflow/python/kernel_tests/transpose_op_test.py b/tensorflow/python/kernel_tests/transpose_op_test.py
index c551d9c3d056b50600d1331749ba865439748f7e..290200ce45488a9796f437d9f748e06483e83d96 100644
--- a/tensorflow/python/kernel_tests/transpose_op_test.py
+++ b/tensorflow/python/kernel_tests/transpose_op_test.py
@@ -53,11 +53,11 @@ class TransposeTest(test.TestCase):
       # Gradient check on CPU.
       xs = list(np.shape(x))
       ys = list(np.shape(tf_ans))
-      if x.dtype == np.float32:
+      if x.dtype in [np.float32, np.complex64]:
         jacob_t, jacob_n = gradient_checker.compute_gradient(inx, xs, y, ys, x,
                                                              1e-2)
         self.assertAllClose(jacob_t, jacob_n, 1e-3, 1e-3)
-      elif x.dtype == np.float64:
+      elif x.dtype in [np.float64, np.complex128]:
         jacob_t, jacob_n = gradient_checker.compute_gradient(inx, xs, y, ys, x,
                                                              1e-2)
         self.assertAllClose(jacob_t, jacob_n, 1e-6, 1e-6)
diff --git a/tensorflow/python/kernel_tests/unique_op_test.py b/tensorflow/python/kernel_tests/unique_op_test.py
index 6390b7c51808cf338f0651bbbdb30c7b71af7d8e..6366d2e181c8cfabba8a78b664c25c85debc67ef 100644
--- a/tensorflow/python/kernel_tests/unique_op_test.py
+++ b/tensorflow/python/kernel_tests/unique_op_test.py
@@ -63,23 +63,24 @@ class UniqueTest(test.TestCase):
       self.assertEqual(x[i], tf_y[tf_idx[i]].decode('ascii'))
 
   def testInt32Axis(self):
-    x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]])
-    with self.test_session() as sess:
-      y0, idx0 = gen_array_ops.unique_v2(x, axis=[0])
-      tf_y0, tf_idx0 = sess.run([y0, idx0])
-      y1, idx1 = gen_array_ops.unique_v2(x, axis=[1])
-      tf_y1, tf_idx1 = sess.run([y1, idx1])
-    self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]]))
-    self.assertAllEqual(tf_idx0, np.array([0, 0, 1]))
-    self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]]))
-    self.assertAllEqual(tf_idx1, np.array([0, 1, 1]))
+    for dtype in [np.int32, np.int64]:
+      x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]])
+      with self.test_session() as sess:
+        y0, idx0 = gen_array_ops._unique_v2(x, axis=np.array([0], dtype))
+        tf_y0, tf_idx0 = sess.run([y0, idx0])
+        y1, idx1 = gen_array_ops._unique_v2(x, axis=np.array([1], dtype))
+        tf_y1, tf_idx1 = sess.run([y1, idx1])
+      self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]]))
+      self.assertAllEqual(tf_idx0, np.array([0, 0, 1]))
+      self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]]))
+      self.assertAllEqual(tf_idx1, np.array([0, 1, 1]))
 
   def testInt32V2(self):
     # This test is only temporary, once V2 is used
     # by default, the axis will be wrapped to allow `axis=None`.
     x = np.random.randint(2, high=10, size=7000)
     with self.test_session() as sess:
-      y, idx = gen_array_ops.unique_v2(x, axis=[])
+      y, idx = gen_array_ops._unique_v2(x, axis=np.array([], np.int32))
       tf_y, tf_idx = sess.run([y, idx])
 
     self.assertEqual(len(x), len(tf_idx))
diff --git a/tensorflow/python/kernel_tests/unstack_op_test.py b/tensorflow/python/kernel_tests/unstack_op_test.py
index c2dcff978a4ac07b290352c98f2fc062583a3df1..1ee6e0866a6b1c7a9b641a95403d45213f5dc0b4 100644
--- a/tensorflow/python/kernel_tests/unstack_op_test.py
+++ b/tensorflow/python/kernel_tests/unstack_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Functional tests for Unpack Op."""
+"""Functional tests for Unstack Op."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -22,6 +22,7 @@ import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.platform import test
@@ -42,15 +43,35 @@ class UnstackOpTest(test.TestCase):
     np.random.seed(7)
     with self.test_session(use_gpu=True):
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
-        data = np.random.randn(*shape)
-        # Convert data to a single tensorflow tensor
-        x = constant_op.constant(data)
-        # Unpack into a list of tensors
-        cs = array_ops.unstack(x, num=shape[0])
-        self.assertEqual(type(cs), list)
-        self.assertEqual(len(cs), shape[0])
-        cs = [c.eval() for c in cs]
-        self.assertAllEqual(cs, data)
+        for dtype in [
+            np.bool, np.float16, np.float32, np.float64, np.int32, np.int64
+        ]:
+          data = np.random.randn(*shape).astype(dtype)
+          # Convert data to a single tensorflow tensor
+          x = constant_op.constant(data)
+          # Unstack into a list of tensors
+          cs = array_ops.unstack(x, num=shape[0])
+          self.assertEqual(type(cs), list)
+          self.assertEqual(len(cs), shape[0])
+          cs = [c.eval() for c in cs]
+          self.assertAllEqual(cs, data)
+
+  def testSimpleGpu(self):
+    if not test_util.is_gpu_available():
+      self.skipTest('No GPU available')
+    np.random.seed(7)
+    with self.test_session(use_gpu=True, force_gpu=True):
+      for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
+        for dtype in [np.float16, np.float32, np.float64, np.int32, np.int64]:
+          data = np.random.randn(*shape).astype(dtype)
+          # Convert data to a single tensorflow tensor
+          x = constant_op.constant(data)
+          # Unstack into a list of tensors
+          cs = array_ops.unstack(x, num=shape[0])
+          self.assertEqual(type(cs), list)
+          self.assertEqual(len(cs), shape[0])
+          cs = [c.eval() for c in cs]
+          self.assertAllEqual(cs, data)
 
   def testGradientsAxis0(self):
     for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 70fe0a47852dd3b6ce999d049fe817efe68bed05..8527f116f9541942e52ba2ab635ca1212ea38583 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -117,7 +117,7 @@ class VariableScopeTest(test.TestCase):
         w = variable_scope.get_variable("w", [])
         self.assertEqual(w.dtype.base_dtype, dtypes.float16)
 
-  def testEagerVaribleStore(self):
+  def testEagerVariableStore(self):
     with context.eager_mode():
       store = variable_scope.EagerVariableStore()
       with store.as_default():
@@ -131,6 +131,30 @@ class VariableScopeTest(test.TestCase):
       self.assertFalse(v in store.non_trainable_variables())
       self.assertTrue(w in store.non_trainable_variables())
 
+      # Test copying.
+      new_store = store.copy()
+      with new_store.as_default():
+        new_v = variable_scope.get_variable("v")
+        new_w = variable_scope.get_variable("w")
+      self.assertEqual(new_v.numpy(), v.numpy())
+      self.assertEqual(new_w.numpy(), w.numpy())
+      self.assertTrue(new_v in new_store.variables())
+      self.assertTrue(new_w in new_store.variables())
+      self.assertTrue(new_v in new_store.trainable_variables())
+      self.assertFalse(new_w in new_store.trainable_variables())
+      self.assertFalse(new_v in new_store.non_trainable_variables())
+      self.assertTrue(new_w in new_store.non_trainable_variables())
+
+      # Check that variables are separate instances.
+      for v in store.variables():
+        v.assign(-1)
+      for v in new_store.variables():
+        v.assign(1)
+      for v in store.variables():
+        self.assertEqual(v.numpy(), -1)
+      for v in new_store.variables():
+        self.assertEqual(v.numpy(), 1)
+
   @test_util.run_in_graph_and_eager_modes()
   def testInitFromNonTensorValue(self):
     v = variable_scope.get_variable("v4", initializer=4, dtype=dtypes.int32)
@@ -746,6 +770,130 @@ class VariableScopeTest(test.TestCase):
           with ops.name_scope("scope2") as sc2:
             self.assertEqual(sc2, "outer_1/default/scope2/")
 
+  def testBasicWhenAuxiliaryNameScopeIsFalse(self):
+    with self.test_session():
+      with variable_scope.variable_scope(
+          "scope", auxiliary_name_scope=False) as scope:
+        self.assertEqual(scope.original_name_scope, "")
+        self.assertEqual(variable_scope.get_variable("w", []).name, "scope/w:0")
+        self.assertEqual(constant_op.constant([], name="c").name, "c:0")
+      with variable_scope.variable_scope(scope, auxiliary_name_scope=False):
+        self.assertEqual(scope.original_name_scope, "")
+        self.assertEqual(
+            variable_scope.get_variable("w1", []).name, "scope/w1:0")
+        self.assertEqual(constant_op.constant([], name="c1").name, "c1:0")
+      # Recheck: new name scope is NOT created before
+      with ops.name_scope("scope"):
+        self.assertEqual(constant_op.constant([], name="c").name, "scope/c:0")
+
+      with variable_scope.variable_scope("outer"):
+        with variable_scope.variable_scope(
+            "inner", auxiliary_name_scope=False) as inner:
+          self.assertEqual(inner.original_name_scope, "outer/")
+          self.assertEqual(
+              variable_scope.get_variable("w", []).name, "outer/inner/w:0")
+          self.assertEqual(constant_op.constant([], name="c").name, "outer/c:0")
+        with variable_scope.variable_scope(
+            inner, auxiliary_name_scope=False) as inner1:
+          self.assertEqual(inner1.original_name_scope, "outer/")
+          self.assertEqual(
+              variable_scope.get_variable("w1", []).name, "outer/inner/w1:0")
+          self.assertEqual(
+              constant_op.constant([], name="c1").name, "outer/c1:0")
+        # Recheck: new name scope is NOT created before
+        with ops.name_scope("inner"):
+          self.assertEqual(
+              constant_op.constant([], name="c").name, "outer/inner/c:0")
+
+  def testCreatedByDefaultNameWhenAuxiliaryNameScopeIsFalse(self):
+    with self.test_session():
+      with variable_scope.variable_scope(
+          None, default_name="default", auxiliary_name_scope=False) as scope:
+        self.assertEqual(scope.original_name_scope, "")
+        self.assertEqual(
+            variable_scope.get_variable("w", []).name, "default/w:0")
+        self.assertEqual(constant_op.constant([], name="c").name, "c:0")
+      # Recheck: new name scope is NOT created before
+      with ops.name_scope("default"):
+        self.assertEqual(constant_op.constant([], name="c").name, "default/c:0")
+
+      with variable_scope.variable_scope("outer"):
+        with variable_scope.variable_scope(
+            None, default_name="default", auxiliary_name_scope=False) as inner:
+          self.assertEqual(inner.original_name_scope, "outer/")
+          self.assertEqual(
+              variable_scope.get_variable("w", []).name, "outer/default/w:0")
+          self.assertEqual(constant_op.constant([], name="c").name, "outer/c:0")
+        # Recheck: new name scope is NOT created before
+        with ops.name_scope("default"):
+          self.assertEqual(
+              constant_op.constant([], name="c").name, "outer/default/c:0")
+
+  def testReenterRootScopeWhenAuxiliaryNameScopeIsFalse(self):
+    with self.test_session():
+      root_scope = variable_scope.get_variable_scope()
+      with variable_scope.variable_scope(
+          root_scope, auxiliary_name_scope=False) as scope:
+        self.assertEqual(scope.original_name_scope, "")
+        self.assertEqual(variable_scope.get_variable("w", []).name, "w:0")
+        self.assertEqual(constant_op.constant([], name="c").name, "c:0")
+
+      with variable_scope.variable_scope("outer"):
+        with variable_scope.variable_scope(
+            root_scope, auxiliary_name_scope=False) as inner:
+          self.assertEqual(inner.original_name_scope, "")
+          self.assertEqual(variable_scope.get_variable("w1", []).name, "w1:0")
+          self.assertEqual(
+              constant_op.constant([], name="c1").name, "outer/c1:0")
+
+  def testAuxiliaryNameScopeIsInvalid(self):
+    with self.test_session():
+      with self.assertRaisesRegexp(TypeError, "auxiliary_name_scope"):
+        with variable_scope.variable_scope(
+            None, default_name="scope", auxiliary_name_scope="invalid"):
+          pass
+
+      with self.assertRaisesRegexp(TypeError, "auxiliary_name_scope"):
+        with variable_scope.variable_scope(
+            "scope", auxiliary_name_scope="invalid"):
+          pass
+
+      with variable_scope.variable_scope("scope") as scope:
+        pass
+      with self.assertRaisesRegexp(TypeError, "auxiliary_name_scope"):
+        with variable_scope.variable_scope(
+            scope, auxiliary_name_scope="invalid"):
+          pass
+
+  def testReuseScopeWithoutNameScopeCollision(self):
+    # Github issue: #13429
+    with self.test_session():
+      with variable_scope.variable_scope("outer"):
+        with variable_scope.variable_scope("inner") as inner:
+          pass
+
+      with variable_scope.variable_scope(
+          inner, auxiliary_name_scope=False) as scope:
+        with ops.name_scope(scope.original_name_scope):
+          self.assertEqual(
+              variable_scope.get_variable("w", []).name, "outer/inner/w:0")
+          self.assertEqual(
+              constant_op.constant([], name="c").name, "outer/inner/c:0")
+        with ops.name_scope("inner"):
+          self.assertEqual(constant_op.constant([], name="c").name, "inner/c:0")
+
+      with variable_scope.variable_scope("another"):
+        with variable_scope.variable_scope(
+            inner, auxiliary_name_scope=False) as scope1:
+          with ops.name_scope(scope1.original_name_scope):
+            self.assertEqual(
+                variable_scope.get_variable("w1", []).name, "outer/inner/w1:0")
+            self.assertEqual(
+                constant_op.constant([], name="c1").name, "outer/inner/c1:0")
+          with ops.name_scope("inner"):
+            self.assertEqual(
+                constant_op.constant([], name="c").name, "another/inner/c:0")
+
   @test_util.run_in_graph_and_eager_modes()
   def testGetLocalVar(self):
     # Check that local variable respects naming.
@@ -881,6 +1029,18 @@ class VariableScopeTest(test.TestCase):
     # Ensure it is possible to do get_variable with a _ref dtype passed in.
     _ = variable_scope.get_variable("w", shape=[5, 6], dtype=v.dtype)
 
+  def testTwoGraphs(self):
+
+    def f():
+      g1 = ops.Graph()
+      g2 = ops.Graph()
+      with g1.as_default():
+        with g2.as_default():
+          with variable_scope.variable_scope("_"):
+            pass
+
+    self.assertRaisesRegexp(ValueError, "'_' is not a valid scope name", f)
+
 
 def axis0_into1_partitioner(shape=None, **unused_kwargs):
   part = [1] * len(shape)
@@ -901,35 +1061,6 @@ def axis0_into3_partitioner(shape=None, **unused_kwargs):
 
 class VariableScopeWithPartitioningTest(test.TestCase):
 
-  def testInitFromNonInitializer(self):
-    with self.test_session() as sess:
-      # Test various dtypes with zeros initializer as following:
-      types = [
-          dtypes.int8, dtypes.uint8, dtypes.int16, dtypes.uint16, dtypes.int32,
-          dtypes.int64, dtypes.bool
-      ]
-
-      # Use different variable_name to distinguish various dtypes
-      for (i, dtype) in enumerate(types):
-        x = variable_scope.get_variable(
-            name="x%d" % i,
-            shape=(3, 4),
-            dtype=dtype,
-            partitioner=axis0_into2_partitioner)
-        y = variable_scope.get_variable(
-            name="y%d" % i,
-            shape=(6, 4),
-            dtype=dtype,
-            partitioner=axis0_into2_partitioner,
-            initializer=init_ops.zeros_initializer(dtype=dtype))
-
-        variables_lib.global_variables_initializer().run()
-        # x and y would become var list after partition
-        val_x = sess.run(list(x))
-        val_y = sess.run(list(y))
-
-        self.assertAllEqual(val_x, val_y)
-
   def testResultNameMatchesRequested(self):
     with variable_scope.variable_scope(
         "scope0", partitioner=axis0_into2_partitioner):
@@ -1146,6 +1277,24 @@ class VariableScopeWithCustomGetterTest(test.TestCase):
           (((np_vars[0] * np_vars[1]) + (np_vars[2] * np_vars[3]))
            + ((np_vars[4] * np_vars[5]) + (np_vars[6] * np_vars[7]))))
 
+  def testVariableCreator(self):
+
+    variable_names = []
+
+    def creator_a(next_creator, **kwargs):
+      variable_names.append(kwargs.get("name", ""))
+      return next_creator(**kwargs)
+
+    def creator_b(next_creator, **kwargs):
+      kwargs["name"] = "forced_name"
+      return next_creator(**kwargs)
+
+    with variable_scope.variable_creator_scope(creator_a):
+      with variable_scope.variable_creator_scope(creator_b):
+        variable_scope.variable(1.0, name="one_name")
+
+    self.assertAllEqual(variable_names, ["forced_name"])
+
 
 class PartitionInfoTest(test.TestCase):
 
diff --git a/tensorflow/python/kernel_tests/variables_test.py b/tensorflow/python/kernel_tests/variables_test.py
index f60ebf58f6fe81bf75fa4db166449843e5595c7d..b16c8c002c98a0351d1fc55fce061695327a18c9 100644
--- a/tensorflow/python/kernel_tests/variables_test.py
+++ b/tensorflow/python/kernel_tests/variables_test.py
@@ -22,6 +22,7 @@ import operator
 
 import numpy as np
 
+from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
@@ -509,6 +510,15 @@ class VariablesTestCase(test.TestCase):
         "<tf.Variable 'noop:0' shape=(5, 5) dtype=float32_ref>",
         repr(var))
 
+  def testVariableNamesPreserveNameScopesWithDefun(self):
+    @function.defun
+    def create_variable():
+      with ops.name_scope("foo"):
+        v = variables.Variable(0.0, name="bar")
+      self.assertEqual(v.name, "foo/bar:0")
+    with ops.get_default_graph().as_default():
+      create_variable()
+
 
 class IsInitializedTest(test.TestCase):
 
diff --git a/tensorflow/python/kernel_tests/xent_op_test.py b/tensorflow/python/kernel_tests/xent_op_test.py
index 43be08f8a1436eebdd712a4bbb69ce8ae8d12827..e152f02d8e983364603053dc5c8d14b5dfaf3605 100644
--- a/tensorflow/python/kernel_tests/xent_op_test.py
+++ b/tensorflow/python/kernel_tests/xent_op_test.py
@@ -38,9 +38,8 @@ class XentTest(test.TestCase):
       dim = len(features.shape) - 1
     one_only_on_dim = list(features.shape)
     one_only_on_dim[dim] = 1
-    e = np.exp(features - np.reshape(
-        np.amax(
-            features, axis=dim), one_only_on_dim))
+    e = np.exp(
+        features - np.reshape(np.amax(features, axis=dim), one_only_on_dim))
     probs = e / np.reshape(np.sum(e, axis=dim), one_only_on_dim)
     bp = (probs - labels)
     l = -np.sum(labels * np.log(probs + 1.0e-20), axis=dim)
@@ -85,10 +84,10 @@ class XentTest(test.TestCase):
 
   def testRankTooLarge(self):
     for dtype in np.float16, np.float32:
-      np_features = np.array(
-          [[[1., 1., 1., 1.]], [[1., 2., 3., 4.]]]).astype(dtype)
-      np_labels = np.array(
-          [[[0., 0., 0., 1.]], [[0., .5, .5, 0.]]]).astype(dtype)
+      np_features = np.array([[[1., 1., 1., 1.]], [[1., 2., 3.,
+                                                    4.]]]).astype(dtype)
+      np_labels = np.array([[[0., 0., 0., 1.]], [[0., .5, .5,
+                                                  0.]]]).astype(dtype)
       self.assertRaisesRegexp(ValueError, "must be rank 2",
                               gen_nn_ops._softmax_cross_entropy_with_logits,
                               np_features, np_labels)
@@ -121,8 +120,8 @@ class XentTest(test.TestCase):
     # = [1.3862, 1.9401]
     np_loss, np_backprop = self._npXent(np.array(features), np.array(labels))
     self.assertAllClose(
-        np.array([[0.25, 0.25, 0.25, -0.75],
-                  [0.0321, -0.4129, -0.2632, 0.6439]]),
+        np.array([[0.25, 0.25, 0.25, -0.75], [0.0321, -0.4129, -0.2632,
+                                              0.6439]]),
         np_backprop,
         rtol=1.e-3,
         atol=1.e-3)
@@ -168,15 +167,17 @@ class XentTest(test.TestCase):
           shape=[3, 4],
           dtype=dtypes.float64,
           name="f")
-      x = nn_ops.softmax_cross_entropy_with_logits(labels=l, logits=f,
-                                                   name="xent")
+      x = nn_ops.softmax_cross_entropy_with_logits(
+          labels=l, logits=f, name="xent")
       err = gradient_checker.compute_gradient_error(f, [3, 4], x, [3])
 
       # Check that no extra computation performed. When only first derivative is requested,
       # second derivative must not be computed. So when there is no second derivative,
       # there is no `BatchMatMul` op in the graph.
-      op_names = [op.op_def.name for op in sess.graph.get_operations() if op.op_def]
-      self.assertNotIn('BatchMatMul', op_names)
+      op_names = [
+          op.op_def.name for op in sess.graph.get_operations() if op.op_def
+      ]
+      self.assertNotIn("BatchMatMul", op_names)
 
     print("cross entropy gradient err = ", err)
     self.assertLess(err, 5e-8)
@@ -193,24 +194,29 @@ class XentTest(test.TestCase):
           shape=[3, 4],
           dtype=dtypes.float64,
           name="f")
-      x = nn_ops.softmax_cross_entropy_with_logits_v2(labels=l, logits=f,
-                                                      name="xent")
+      x = nn_ops.softmax_cross_entropy_with_logits_v2(
+          labels=l, logits=f, name="xent")
       err = gradient_checker.compute_gradient_error(l, [3, 4], x, [3])
 
     self.assertLess(err, 5e-8)
 
   def testSecondGradient(self):
     with self.test_session() as sess:
-      l = constant_op.constant([0.0, 0.0, 1.0/3, 0.0,
-                                1.0/3, 0.0, 0.0, 0.0,
-                                0.0, 0.5/3, 0.0, 0.5/3], shape=[12],
-                               dtype=dtypes.float64, name="l")
-      f = constant_op.constant([0.1, 0.2, 0.3, 0.4,
-                                0.1, 0.4, 0.9, 1.6,
-                                0.1, 0.8, 2.7, 6.4], shape=[12],
-                               dtype=dtypes.float64, name="f")
-      x = nn_ops.softmax_cross_entropy_with_logits(labels=l, logits=f,
-                                                   name="xent")
+      l = constant_op.constant(
+          [
+              0.0, 0.0, 1.0 / 3, 0.0, 1.0 / 3, 0.0, 0.0, 0.0, 0.0, 0.5 / 3, 0.0,
+              0.5 / 3
+          ],
+          shape=[12],
+          dtype=dtypes.float64,
+          name="l")
+      f = constant_op.constant(
+          [0.1, 0.2, 0.3, 0.4, 0.1, 0.4, 0.9, 1.6, 0.1, 0.8, 2.7, 6.4],
+          shape=[12],
+          dtype=dtypes.float64,
+          name="f")
+      x = nn_ops.softmax_cross_entropy_with_logits(
+          labels=l, logits=f, name="xent")
       loss = math_ops.reduce_sum(x)
 
       gradients = gradients_impl.gradients(loss, [f])[0]
@@ -219,20 +225,23 @@ class XentTest(test.TestCase):
 
       # Check that second derivative is calculated.
       # (it is equivalent to being `BatchMatMul` op in the graph because of implementation of xentropy grad)
-      op_names = [op.op_def.name for op in sess.graph.get_operations() if op.op_def]
-      self.assertIn('BatchMatMul', op_names)
+      op_names = [
+          op.op_def.name for op in sess.graph.get_operations() if op.op_def
+      ]
+      self.assertIn("BatchMatMul", op_names)
 
     print("cross entropy hessian err = ", err)
     self.assertLess(err, 5e-8)
 
   def testWrapper(self):
-    features = np.array(
-        [[[1., 1., 1., 1.], [1., 2., 3., 4.]],
-         [[2., 3., 4., 5.], [6., 7., 8., 9.]],
-         [[5., 4., 3., 2.], [1., 2., 3., 4.]]]).astype(np.float32)
+    features = np.array([[[1., 1., 1., 1.], [1., 2., 3., 4.]],
+                         [[2., 3., 4., 5.], [6., 7., 8., 9.]],
+                         [[5., 4., 3., 2.], [1., 2., 3., 4.]]]).astype(
+                             np.float32)
     labels = np.array([[[0., 0., 0., 1.], [0., 1., 0., 0.]],
                        [[0., 0.5, 0.5, 0.], [0.5, 0.5, 0., 0.]],
-                       [[0., 1., 0., 0.], [0., 0., 1., 0.]]]).astype(np.float32)
+                       [[0., 1., 0., 0.], [0., 0., 1., 0.]]]).astype(
+                           np.float32)
     self._testXentWrapper(features, labels, dim=0, use_gpu=False)
     self._testXentWrapper(features, labels, dim=0, use_gpu=True)
     self._testXentWrapper(features, labels, dim=1, use_gpu=False)
@@ -240,6 +249,16 @@ class XentTest(test.TestCase):
     self._testXentWrapper(features, labels, dim=-1, use_gpu=False)
     self._testXentWrapper(features, labels, dim=-1, use_gpu=True)
 
+  def testZeroDimension(self):
+    features = np.zeros([0, 2, 4]).astype(np.float32)
+    labels = np.zeros([0, 2, 4]).astype(np.float32)
+    np_loss, _ = self._npXent(features, labels)
+    with self.test_session(use_gpu=True) as sess:
+      loss = nn_ops.softmax_cross_entropy_with_logits(
+          labels=labels, logits=features)
+      tf_loss = sess.run(loss)
+    self.assertAllEqual(np_loss, tf_loss)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 6be2bc3e7692bdba569f011243f368f0ee7abc94..8314c4aa87a5b54effc44c371703267517ffa07d 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -31,13 +31,16 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.layers import utils as layers_util
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('layers.Layer')
 class Layer(object):
   """Base layer class.
 
@@ -99,19 +102,31 @@ class Layer(object):
         raise TypeError('Keyword argument not understood:', kwarg)
 
     # Mutable properties
+    # Indicates whether the layer's weights are updated during training
+    # and whether the layer's updates are run during training
     self.trainable = trainable
+    # A stateful layer is a layer whose updates are run during inference too,
+    # for instance stateful RNNs.
+    self.stateful = False
+    # Indicates whether `build` needs to be called upon layer call, to create
+    # the layer's weights.
     self.built = False
+    # Provides information about which inputs are compatible with the layer.
     self.input_spec = None
 
+    if activity_regularizer and context.in_eager_mode():
+      raise ValueError(
+          ('Activity regularization is not supported when executing eagerly. '
+           'Got activity_regularizer=%s') % (activity_regularizer,))
     self._activity_regularizer = activity_regularizer
     self._trainable_weights = []
     self._non_trainable_weights = []
     self._updates = []
+    # When executing eagerly, _losses is a list of zero-argument lambdas which
+    # return tensors. When using graph execution, _losses is a list of ops.
     self._losses = []
     self._reuse = kwargs.get('_reuse')
     self._graph = ops.get_default_graph()
-    self._per_input_losses = {}
-    self._per_input_updates = {}
     self._dtype = None if dtype is None else dtypes.as_dtype(dtype).name
     call_fn_args = estimator_util.fn_args(self.call)
     self._compute_previous_mask = ('mask' in call_fn_args or
@@ -214,6 +229,8 @@ class Layer(object):
   def updates(self):
     if context.in_eager_mode():
       raise RuntimeError('Layer.updates not supported in Eager mode.')
+    if not self.trainable and not self.stateful:
+      return []
     return self._updates
 
   def add_update(self, updates, inputs=None):
@@ -233,39 +250,34 @@ class Layer(object):
 
     Arguments:
       updates: Update op, or list/tuple of update ops.
-      inputs: Optional input tensor(s) that the update(s) depend on. Must
-        match the `inputs` argument passed to the `__call__` method at the time
-        the updates are created. If `None` is passed, the updates are assumed
-        to be unconditional, and will apply across all dataflows of the layer.
+      inputs: If anything other than None is passed, it signals the updates
+        are conditional on some of the layer's inputs,
+        and thus they should only be run where these inputs are available.
+        This is the case for BatchNormalization updates, for instance.
+        If None, the updates will be taken into account unconditionally,
+        and you are responsible for making sure that any dependency they might
+        have is available at runtime.
+        A step counter might fall into this category.
     """
     if context.in_eager_mode():
       return  # Updates already applied when in eager mode.
+
     updates = _to_list(updates)
-    if not updates:
-      return
+    updates = [x if isinstance(x, ops.Operation)
+               else ops.convert_to_tensor(x) for x in updates]
     self._updates += updates
-    if inputs is not None:
-      inputs = nest.flatten(inputs)
-    if not inputs:
-      inputs = None
-    if inputs is not None:
-      # We compute an ID that uniquely identifies the list of tensors.
-      # This ID is order-sensitive.
-      inputs_hash = layers_util.object_list_uid(inputs)
+    if inputs is None:
+      for u in updates:
+        u._unconditional_update = True  # pylint: disable=protected-access
     else:
-      inputs_hash = None
-    if inputs_hash not in self._per_input_updates:
-      self._per_input_updates[inputs_hash] = []
-    self._per_input_updates[inputs_hash] += updates
+      for u in updates:
+        u._unconditional_update = False  # pylint: disable=protected-access
 
   def get_updates_for(self, inputs):
     """Retrieves updates relevant to a specific set of inputs.
 
     Arguments:
       inputs: Input tensor or list/tuple of input tensors.
-        Must match the `inputs` argument passed to the `__call__` method
-        at the time the updates were created.
-        If you pass `inputs=None`, unconditional updates are returned.
 
     Returns:
       List of update ops of the layer that depend on `inputs`.
@@ -274,22 +286,43 @@ class Layer(object):
       RuntimeError: If called in Eager mode.
     """
     if context.in_eager_mode():
-      raise RuntimeError('Layer.get_updates_for not supported in Eager mode.')
-    if inputs is not None:
-      inputs = nest.flatten(inputs)
-    if not inputs:
-      inputs = None
-    if inputs is not None:
-      inputs_hash = layers_util.object_list_uid(inputs)
-    else:
-      inputs_hash = None
-    return self._per_input_updates.get(inputs_hash, [])
+      raise RuntimeError('`get_updates_for()` not supported in Eager mode.')
+
+    # Updates disabled if layer is not trainable and not explicitly stateful.
+    if not self.trainable and not self.stateful:
+      return []
+
+    if inputs is None:
+      # Requesting unconditional updates.
+      return [x for x in self.updates if x._unconditional_update]  # pylint: disable=protected-access
+
+    # Requesting input-conditional updates.
+    inputs = nest.flatten(inputs)
+    reachable = layers_util.get_reachable_from_inputs(inputs, self.updates)
+    updates = []
+    for update in self.updates:
+      if update in reachable:
+        updates.append(update)
+    return updates
 
   @property
   def losses(self):
+    """Losses which are associated with this `Layer`.
+
+    Note that when executing eagerly, getting this property evaluates
+    regularizers. When using graph execution, variable regularization ops have
+    already been created and are simply returned here.
+
+    Returns:
+      A list of tensors.
+    """
     if context.in_eager_mode():
-      raise RuntimeError('Layer.losses not supported in Eager mode.')
-    return self._losses
+      # _losses may only contain variable regularization losses when executing
+      # eagerly, and they have been saved as lambdas to be executed when
+      # requested.
+      return [regularizer() for regularizer in self._losses]
+    else:
+      return self._losses
 
   def add_loss(self, losses, inputs=None):
     """Add loss tensor(s), potentially dependent on layer inputs.
@@ -303,11 +336,18 @@ class Layer(object):
     The `get_losses_for` method allows to retrieve the losses relevant to a
     specific set of inputs.
 
+    Note that `add_loss` is not supported when executing eagerly. Instead,
+    variable regularizers may be added through `add_variable`. Activity
+    regularization is not supported directly (but such losses may be returned
+    from `Layer.call()`).
+
     Arguments:
       losses: Loss tensor, or list/tuple of tensors.
-      inputs: Optional input tensor(s) that the loss(es) depend on. Must
-        match the `inputs` argument passed to the `__call__` method at the time
-        the losses are created. If `None` is passed, the losses are assumed
+      inputs: If anything other than None is passed, it signals the losses
+        are conditional on some of the layer's inputs,
+        and thus they should only be run where these inputs are available.
+        This is the case for activity regularization losses, for instance.
+        If `None` is passed, the losses are assumed
         to be unconditional, and will apply across all dataflows of the layer
         (e.g. weight regularization losses).
 
@@ -315,24 +355,25 @@ class Layer(object):
       RuntimeError: If called in Eager mode.
     """
     if context.in_eager_mode():
+      # TODO(fchollet): it should be possible (and highly desirable) to support
+      # `add_loss` in eager mode. This allows great convenience and flexibility
+      # in defining custom losses on the fly (e.g. in VAEs).
+      # Simply appending the loss value to `self._losses`
+      # is the correct behavior.
+      # The only caveat is that we need to force the user to only call
+      # `add_loss` from inside a model or Layer's `call` method
+      # (otherwise the loss computation cannot be backproped through).
       raise RuntimeError('Layer.add_loss not supported in Eager mode.')
+
     losses = _to_list(losses)
-    if not losses:
-      return
     self._losses += losses
-    if inputs is not None:
-      inputs = nest.flatten(inputs)
-    if not inputs:
-      inputs = None
-    if inputs is not None:
-      # We compute an ID that uniquely identifies the list of tensors.
-      # This ID is order-sensitive.
-      inputs_hash = layers_util.object_list_uid(inputs)
+    if inputs is None:
+      for loss in losses:
+        loss._unconditional_loss = True  # pylint: disable=protected-access
     else:
-      inputs_hash = None
-    if inputs_hash not in self._per_input_losses:
-      self._per_input_losses[inputs_hash] = []
-    self._per_input_losses[inputs_hash] += losses
+      for loss in losses:
+        loss._unconditional_loss = False  # pylint: disable=protected-access
+    # TODO(fchollet): deprecate collection below.
     _add_elements_to_collection(losses, ops.GraphKeys.REGULARIZATION_LOSSES)
 
   def get_losses_for(self, inputs):
@@ -340,10 +381,6 @@ class Layer(object):
 
     Arguments:
       inputs: Input tensor or list/tuple of input tensors.
-        Must match the `inputs` argument passed to the `__call__`
-        method at the time the losses were created.
-        If you pass `inputs=None`, unconditional losses are returned,
-        such as weight regularization losses.
 
     Returns:
       List of loss tensors of the layer that depend on `inputs`.
@@ -353,15 +390,23 @@ class Layer(object):
     """
     if context.in_eager_mode():
       raise RuntimeError('Layer.get_losses_for not supported in Eager mode.')
-    if inputs is not None:
-      inputs = nest.flatten(inputs)
-    if not inputs:
-      inputs = None
-    if inputs is not None:
-      inputs_hash = layers_util.object_list_uid(inputs)
-    else:
-      inputs_hash = None
-    return self._per_input_losses.get(inputs_hash, [])
+
+    if inputs is None:
+      # Requesting unconditional losses.
+      return [x for x in self.losses if x._unconditional_loss]  # pylint: disable=protected-access
+
+    # Requesting input-conditional losses.
+    inputs = nest.flatten(inputs)
+    # Retrieve the set of tensors in the TF graph that depend on `inputs`.
+    # The losses we want to return will be part of this set.
+    # To avoid unnecessary work, we stop the search in case all of
+    # `self.losses` have been retrieved.
+    reachable = layers_util.get_reachable_from_inputs(inputs, self.losses)
+    losses = []
+    for loss in self.losses:
+      if loss in reachable:
+        losses.append(loss)
+    return losses
 
   def build(self, _):
     """Creates the variables of the layer."""
@@ -383,14 +428,9 @@ class Layer(object):
     """Determines op naming for the Layer."""
     return current_variable_scope.original_name_scope
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     """Computes the output shape of the layer given the input shape.
 
-    Assumes that the layer will be built to match that input shape.
-    If this method is not implemented by child classes, the default
-    assumption will be that the layer does not alter the shape of the tensors
-    passing through it.
-
     Args:
       input_shape: A (possibly nested tuple of) `TensorShape`.  It need not
         be fully defined (e.g. the batch size may be unknown).
@@ -404,7 +444,7 @@ class Layer(object):
       ValueError: if `input_shape` is incomplete or is incompatible with the
         the layer.
     """
-    return input_shape
+    raise NotImplementedError
 
   def _make_unique_name(self, name_uid_map=None, avoid_names=None,
                         namespace='', zero_based=False):
@@ -460,24 +500,37 @@ class Layer(object):
       instance is returned.
 
     Raises:
-      RuntimeError: If called in Eager mode with regularizers.
+      RuntimeError: If called with partioned variable regularization and
+        eager execution is enabled.
     """
-    # Note that we currently don't support variable regularization in Eager
-    # mode. An alternative is for users to directly compute these losses before
-    # performing a backward pass.
+
+    # `init_graph` should point to the graph in which variable initialization
+    # will occur; it should be None if and only if initialization will take
+    # place in the eager context.
+    init_graph = None
     if context.in_graph_mode():
-      existing_variables = set(tf_variables.global_variables())
-    else:
-      existing_variables = []
-      if regularizer is not None:
-        raise RuntimeError('Variable regularization not supported in Eager '
-                           'mode.')
+      default_graph = ops.get_default_graph()
+      if default_graph.building_function:
+        with ops.init_scope():
+          # Retrieve the variables from the graph into which variables
+          # will be lifted; if initialization ops will be lifted into
+          # the eager context, then there is nothing to retrieve, since variable
+          # collections are not supported when eager execution is enabled.
+          if context.in_graph_mode():
+            init_graph = ops.get_default_graph()
+            existing_variables = set(tf_variables.global_variables())
+      else:
+        # Initialization ops will not be lifted out of the default graph.
+        init_graph = default_graph
+        existing_variables = set(tf_variables.global_variables())
+
     if dtype is None:
       dtype = self.dtype or dtypes.float32
 
     self._set_scope(None)
+    reuse = self.built or self._reuse
     with vs.variable_scope(
-        self._scope, reuse=(self.built or self._reuse)) as scope:
+        self._scope, reuse=reuse, auxiliary_name_scope=False) as scope:
       with ops.name_scope(self._name_scope_name(scope)):
         variable = vs.get_variable(name,
                                    shape=shape,
@@ -486,28 +539,53 @@ class Layer(object):
                                    constraint=constraint,
                                    trainable=trainable and self.trainable,
                                    partitioner=partitioner)
-        if (context.in_graph_mode() and trainable and self.trainable
-            and variable not in tf_variables.trainable_variables()):
-          # A custom getter / variable scope overrode the trainable flag.
-          trainable = False
-        if variable in existing_variables:
-          return variable
-        if regularizer:
-          # To match the behavior of tf.get_variable(), we only
-          # apply regularization if the variable is newly created.
-          if isinstance(variable, tf_variables.PartitionedVariable):
-            for v in variable:
-              with ops.colocate_with(v.op):
+
+        if init_graph is not None:  # pylint: disable=protected-access
+          # The variable was created and initialized in a graph.
+
+          if variable in existing_variables:
+            # To match the behavior of tf.get_variable(), we only apply
+            # regularization if the variable is newly created.
+            return variable
+
+          with init_graph.as_default():
+            trainable_variables = tf_variables.trainable_variables()
+          if (trainable and self.trainable and
+              variable not in trainable_variables):
+            # A custom getter / variable scope overrode the trainable flag.
+            trainable = False
+
+          if regularizer:
+            if isinstance(variable, tf_variables.PartitionedVariable):
+              for v in variable:
+                with ops.colocate_with(v.op):
+                  with ops.name_scope(name + '/Regularizer'):
+                    regularization = regularizer(v)
+                if regularization is not None:
+                  self.add_loss(regularization)
+            else:
+              with ops.colocate_with(variable.op):
                 with ops.name_scope(name + '/Regularizer'):
-                  regularization = regularizer(v)
+                  regularization = regularizer(variable)
               if regularization is not None:
                 self.add_loss(regularization)
-          else:
-            with ops.colocate_with(variable.op):
-              with ops.name_scope(name + '/Regularizer'):
-                regularization = regularizer(variable)
-            if regularization is not None:
-              self.add_loss(regularization)
+        elif regularizer:  # and initialization took place in an eager context
+          if isinstance(variable, tf_variables.PartitionedVariable):
+            raise RuntimeError(
+                'Partitioned variable regularization is not yet '
+                'supported when executing eagerly. File a feature request'
+                'if this is important to you.')
+          # Save a zero-argument lambda which runs the regularizer on the
+          # variable, to be executed when `Layer.losses` is requested.
+          # This makes losses responsive to variable updates when executing
+          # eagerly.
+          #
+          # TODO(akshayka): Do the same for graphs as well, so that losses
+          # collected in a while_loop can be run outside its control flow
+          # context and so that losses won't be swallowed up by graph functions
+          # (i.e., `.losses()` should always create regularizers).
+          self._losses.append(lambda: regularizer(variable))
+
     if trainable:
       self._trainable_weights.append(variable)
     else:
@@ -575,11 +653,12 @@ class Layer(object):
         # variable scope with this setting. We avoid re-creating variable scopes
         # after this point as an optimization.
         self._always_reuse_variable_scope = vs.variable_scope(
-            self._scope, reuse=True)
+            self._scope, reuse=True, auxiliary_name_scope=False)
         scope_context_manager = self._always_reuse_variable_scope
     else:
       scope_context_manager = vs.variable_scope(
-          self._scope, reuse=self._reuse)
+          self._scope, reuse=self._reuse, auxiliary_name_scope=False)
+    input_shapes = None
     with scope_context_manager as scope:
       with ops.name_scope(self._name_scope_name(scope)):
         if not self.built:
@@ -602,7 +681,7 @@ class Layer(object):
           self._assert_input_compatibility(inputs)
           if input_list and self._dtype is None:
             try:
-              self._dtype = input_list[0].dtype.name
+              self._dtype = input_list[0].dtype.base_dtype.name
             except AttributeError:
               pass
           input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
@@ -627,9 +706,12 @@ class Layer(object):
             raise ValueError('A layer\'s `call` method should return a Tensor '
                              'or a list of Tensors, not None.')
         else:
-          # Deferred mode behavior: use `_compute_output_shape` to
+          # Deferred mode behavior: use `compute_output_shape` to
           # infer the number of outputs of the layer and their shapes.
-          output_shapes = self._compute_output_shape(input_shapes)
+          if input_shapes is None:
+            input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
+
+          output_shapes = self.compute_output_shape(input_shapes)
           output_shapes = nest.flatten(output_shapes)
           outputs = [
               # TODO(fchollet): name the deferred tensors?
@@ -650,12 +732,10 @@ class Layer(object):
                 activity_regularization = self._activity_regularizer(output)
               self.add_loss(activity_regularization, inputs=inputs)
 
-        if not in_deferred_mode:
-          # TODO(fchollet): consider how masking will work with deferred mode.
-          # Handle mask computation and propagation to the next layer.
+          # TODO(fchollet): consider enabling masking for Eager mode.
           if hasattr(self, 'compute_mask'):
             output_mask = self.compute_mask(inputs, previous_mask)
-            if isinstance(outputs, list):
+            if isinstance(outputs, (list, tuple)):
               if output_mask is None:
                 output_mask = [None for _ in range(len(outputs))]
               for x, m in zip(outputs, output_mask):
@@ -1154,6 +1234,7 @@ class Layer(object):
                                  ', found shape=' + str(shape))
 
 
+@tf_export('keras.layers.InputSpec', 'layers.InputSpec')
 class InputSpec(object):
   """Specifies the ndim, dtype and shape of every input to a layer.
 
@@ -1191,6 +1272,15 @@ class InputSpec(object):
     self.min_ndim = min_ndim
     self.axes = axes or {}
 
+  def __repr__(self):
+    spec = [('dtype=' + str(self.dtype)) if self.dtype else '',
+            ('shape=' + str(self.shape)) if self.shape else '',
+            ('ndim=' + str(self.ndim)) if self.ndim else '',
+            ('max_ndim=' + str(self.max_ndim)) if self.max_ndim else '',
+            ('min_ndim=' + str(self.min_ndim)) if self.min_ndim else '',
+            ('axes=' + str(self.axes)) if self.axes else '']
+    return 'InputSpec(%s)' % ', '.join(x for x in spec if x)
+
 
 class Node(object):
   """A `Node` describes the connectivity between two layers.
@@ -1315,7 +1405,10 @@ class _DeferredTensor(object):
 
   def __init__(self, shape, dtype, name=None):
     self.shape = tensor_shape.TensorShape(shape)
-    self.dtype = dtypes.as_dtype(dtype)
+    if dtype is None:
+      self.dtype = dtypes.as_dtype(np.float32)
+    else:
+      self.dtype = dtypes.as_dtype(dtype)
     self.name = name
 
   def get_shape(self):
diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py
index 1eea20deefe2f033ab9827f9d5b92f8661618d21..91b8988d31c1f04be8134733e5e919c738ccb74f 100644
--- a/tensorflow/python/layers/base_test.py
+++ b/tensorflow/python/layers/base_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 
@@ -88,6 +89,11 @@ class BaseLayerTest(test.TestCase):
           regularizer=regularizer)
       self.assertEqual(len(layer.losses), 1)
 
+  def testNoEagerActivityRegularizer(self):
+    with context.eager_mode():
+      with self.assertRaisesRegexp(ValueError, 'activity_regularizer'):
+        core_layers.Dense(1, activity_regularizer=lambda *args, **kwargs: 0.)
+
   def testGetVariable(self):
     with self.test_session():
 
@@ -469,6 +475,174 @@ class BaseLayerTest(test.TestCase):
     layer.apply(x)
     self.assertEqual(len(layer.get_losses_for(x)), 1)
 
+  def testNameScopeIsConsistentWithVariableScope(self):
+    # Github issue 13429.
+
+    class MyLayer(base_layers.Layer):
+
+      def build(self, input_shape):
+        self.my_var = self.add_variable('my_var', (), dtypes.float32)
+        self.built = True
+
+      def call(self, inputs):
+        return math_ops.multiply(inputs, self.my_var, name='my_op')
+
+    def _gen_layer(x, name=None):
+      layer = MyLayer(name=name)
+      out = layer.apply(x)
+      return layer, out
+
+    # unnamed layer
+    with ops.Graph().as_default():
+      x = array_ops.placeholder(dtypes.float32, (), 'x')
+      layer, op = _gen_layer(x)
+      layer1, op1 = _gen_layer(op)
+      layer2, op2 = _gen_layer(op1)
+
+      self.assertEqual(layer.my_var.name, 'my_layer/my_var:0')
+      self.assertEqual(op.name, 'my_layer/my_op:0')
+      self.assertEqual(layer1.my_var.name, 'my_layer_1/my_var:0')
+      self.assertEqual(op1.name, 'my_layer_1/my_op:0')
+      self.assertEqual(layer2.my_var.name, 'my_layer_2/my_var:0')
+      self.assertEqual(op2.name, 'my_layer_2/my_op:0')
+    # name starts from zero
+    with ops.Graph().as_default():
+      x = array_ops.placeholder(dtypes.float32, (), 'x')
+      layer, op = _gen_layer(x, name='name')
+      layer1, op1 = _gen_layer(op, name='name_1')
+      layer2, op2 = _gen_layer(op1, name='name_2')
+
+      self.assertEqual(layer.my_var.name, 'name/my_var:0')
+      self.assertEqual(op.name, 'name/my_op:0')
+      self.assertEqual(layer1.my_var.name, 'name_1/my_var:0')
+      self.assertEqual(op1.name, 'name_1/my_op:0')
+      self.assertEqual(layer2.my_var.name, 'name_2/my_var:0')
+      self.assertEqual(op2.name, 'name_2/my_op:0')
+    # name starts from one
+    with ops.Graph().as_default():
+      x = array_ops.placeholder(dtypes.float32, (), 'x')
+      layer, op = _gen_layer(x, name='name_1')
+      layer1, op1 = _gen_layer(op, name='name_2')
+      layer2, op2 = _gen_layer(op1, name='name_3')
+
+      self.assertEqual(layer.my_var.name, 'name_1/my_var:0')
+      self.assertEqual(op.name, 'name_1/my_op:0')
+      self.assertEqual(layer1.my_var.name, 'name_2/my_var:0')
+      self.assertEqual(op1.name, 'name_2/my_op:0')
+      self.assertEqual(layer2.my_var.name, 'name_3/my_var:0')
+      self.assertEqual(op2.name, 'name_3/my_op:0')
+
+  def testVariablesAreLiftedFromFunctionBuildingGraphs(self):
+    class MyLayer(base_layers.Layer):
+
+      def build(self, input_shape):
+        self.my_var = self.add_variable('my_var', (), dtypes.float32)
+        self.built = True
+
+      def call(self, inputs):
+        return inputs
+
+    outer_graph = ops.get_default_graph()
+    function_building_graph = ops.Graph()
+    function_building_graph._building_function = True
+    with outer_graph.as_default():
+      with function_building_graph.as_default():
+        layer = MyLayer()
+        # Create a variable by invoking build through __call__ and assert that
+        # it is both tracked and lifted into the outer graph.
+        inputs = array_ops.placeholder(dtypes.float32, (), 'inputs')
+        layer.apply(inputs)
+        self.assertEqual(len(layer.variables), 1)
+        self.assertEqual(len(layer.trainable_variables), 1)
+        self.assertEqual(layer.variables[0].graph, outer_graph)
+
+  def testGetUpdateFor(self):
+
+    class MyLayer(base_layers.Layer):
+
+      def build(self, input_shape):
+        self.a = self.add_variable('a',
+                                   (),
+                                   dtypes.float32,
+                                   trainable=False)
+        self.b = self.add_variable('b',
+                                   (),
+                                   dtypes.float32,
+                                   trainable=False)
+        self.add_update(state_ops.assign_add(self.a, 1., name='b_update'))
+        self.built = True
+
+      def call(self, inputs):
+        self.add_update(state_ops.assign_add(self.a, inputs, name='a_update'),
+                        inputs=True)
+        return inputs + 1
+
+    layer = MyLayer()
+    inputs = array_ops.placeholder(dtypes.float32, (), 'inputs')
+    intermediate_inputs = inputs + 1
+    outputs = layer.apply(intermediate_inputs)
+
+    self.assertEqual(len(layer.updates), 2)
+    self.assertEqual(len(layer.get_updates_for(None)), 1)
+    self.assertEqual(len(layer.get_updates_for([inputs])), 1)
+    self.assertEqual(len(layer.get_updates_for([intermediate_inputs])), 1)
+    self.assertEqual(len(layer.get_updates_for([outputs])), 0)
+
+    # Call same layer on new input, creating one more conditional update
+    inputs = array_ops.placeholder(dtypes.float32, (), 'inputs')
+    intermediate_inputs = inputs + 1
+    outputs = layer.apply(intermediate_inputs)
+
+    self.assertEqual(len(layer.updates), 3)
+    self.assertEqual(len(layer.get_updates_for(None)), 1)
+    # Check that we are successfully filtering out irrelevant updates
+    self.assertEqual(len(layer.get_updates_for([inputs])), 1)
+    self.assertEqual(len(layer.get_updates_for([intermediate_inputs])), 1)
+    self.assertEqual(len(layer.get_updates_for([outputs])), 0)
+
+  def testGetLossesFor(self):
+
+    class MyLayer(base_layers.Layer):
+
+      def build(self, input_shape):
+        self.a = self.add_variable('a',
+                                   (),
+                                   dtypes.float32,
+                                   trainable=False)
+        self.b = self.add_variable('b',
+                                   (),
+                                   dtypes.float32,
+                                   trainable=False)
+        self.add_loss(self.a)
+        self.built = True
+
+      def call(self, inputs):
+        self.add_loss(inputs, inputs=True)
+        return inputs + 1
+
+    layer = MyLayer()
+    inputs = array_ops.placeholder(dtypes.float32, (), 'inputs')
+    intermediate_inputs = inputs + 1
+    outputs = layer.apply(intermediate_inputs)
+
+    self.assertEqual(len(layer.losses), 2)
+    self.assertEqual(len(layer.get_losses_for(None)), 1)
+    self.assertEqual(len(layer.get_losses_for([inputs])), 1)
+    self.assertEqual(len(layer.get_losses_for([intermediate_inputs])), 1)
+    self.assertEqual(len(layer.get_losses_for([outputs])), 0)
+
+    # Call same layer on new input, creating one more conditional loss
+    inputs = array_ops.placeholder(dtypes.float32, (), 'inputs')
+    intermediate_inputs = inputs + 1
+    outputs = layer.apply(intermediate_inputs)
+
+    self.assertEqual(len(layer.losses), 3)
+    self.assertEqual(len(layer.get_losses_for(None)), 1)
+    # Check that we are successfully filtering out irrelevant losses
+    self.assertEqual(len(layer.get_losses_for([inputs])), 1)
+    self.assertEqual(len(layer.get_losses_for([intermediate_inputs])), 1)
+    self.assertEqual(len(layer.get_losses_for([outputs])), 0)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index 7213fa1db8ee2eb4a36366464703b30d3f1a84c3..689046fe78832ebeb2a44a59797dc57396e9ce16 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 class _Conv(base.Layer):
@@ -64,8 +65,8 @@ class _Conv(base.Layer):
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Optional regularizer function for the output.
@@ -192,7 +193,7 @@ class _Conv(base.Layer):
       return self.activation(outputs)
     return outputs
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_last':
       space = input_shape[1:-1]
@@ -222,6 +223,7 @@ class _Conv(base.Layer):
                                       new_space)
 
 
+@tf_export('layers.Conv1D')
 class Conv1D(_Conv):
   """1D convolution layer (e.g. temporal convolution).
 
@@ -254,8 +256,8 @@ class Conv1D(_Conv):
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Optional regularizer function for the output.
@@ -311,6 +313,7 @@ class Conv1D(_Conv):
         name=name, **kwargs)
 
 
+@tf_export('layers.conv1d')
 def conv1d(inputs,
            filters,
            kernel_size,
@@ -362,8 +365,8 @@ def conv1d(inputs,
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Optional regularizer function for the output.
@@ -411,6 +414,7 @@ def conv1d(inputs,
   return layer.apply(inputs)
 
 
+@tf_export('layers.Conv2D')
 class Conv2D(_Conv):
   """2D convolution layer (e.g. spatial convolution over images).
 
@@ -450,8 +454,8 @@ class Conv2D(_Conv):
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Optional regularizer function for the output.
@@ -507,6 +511,7 @@ class Conv2D(_Conv):
         name=name, **kwargs)
 
 
+@tf_export('layers.conv2d')
 def conv2d(inputs,
            filters,
            kernel_size,
@@ -565,8 +570,8 @@ def conv2d(inputs,
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Optional regularizer function for the output.
@@ -614,6 +619,7 @@ def conv2d(inputs,
   return layer.apply(inputs)
 
 
+@tf_export('layers.Conv3D')
 class Conv3D(_Conv):
   """3D convolution layer (e.g. spatial convolution over volumes).
 
@@ -654,8 +660,8 @@ class Conv3D(_Conv):
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Optional regularizer function for the output.
@@ -711,6 +717,7 @@ class Conv3D(_Conv):
         name=name, **kwargs)
 
 
+@tf_export('layers.conv3d')
 def conv3d(inputs,
            filters,
            kernel_size,
@@ -770,8 +777,8 @@ def conv3d(inputs,
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Optional regularizer function for the output.
@@ -819,8 +826,8 @@ def conv3d(inputs,
   return layer.apply(inputs)
 
 
-class SeparableConv2D(Conv2D):
-  """Depthwise separable 2D convolution.
+class _SeparableConv(_Conv):
+  """Abstract base layer for separable nD convolution.
 
   This layer performs a depthwise convolution that acts separately on
   channels, followed by a pointwise convolution that mixes channels.
@@ -829,12 +836,13 @@ class SeparableConv2D(Conv2D):
   It then optionally applies an activation function to produce the final output.
 
   Arguments:
+    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
     filters: Integer, the dimensionality of the output space (i.e. the number
       of filters in the convolution).
-    kernel_size: A tuple or list of 2 integers specifying the spatial
+    kernel_size: A tuple or list of integers specifying the spatial
       dimensions of the filters. Can be a single integer to specify the same
       value for all spatial dimensions.
-    strides: A tuple or list of 2 positive integers specifying the strides
+    strides: A tuple or list of integers specifying the strides
       of the convolution. Can be a single integer to specify the same value for
       all spatial dimensions.
       Specifying any `stride` value != 1 is incompatible with specifying
@@ -843,9 +851,8 @@ class SeparableConv2D(Conv2D):
     data_format: A string, one of `channels_last` (default) or `channels_first`.
       The ordering of the dimensions in the inputs.
       `channels_last` corresponds to inputs with shape
-      `(batch, height, width, channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, height, width)`.
-
+      `(batch, ..., channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, ...)`.
     dilation_rate: An integer or tuple/list of 2 integers, specifying
       the dilation rate to use for dilated convolution.
       Can be a single integer to specify the same value for
@@ -860,8 +867,8 @@ class SeparableConv2D(Conv2D):
     use_bias: Boolean, whether the layer uses a bias.
     depthwise_initializer: An initializer for the depthwise convolution kernel.
     pointwise_initializer: An initializer for the pointwise convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
     depthwise_regularizer: Optional regularizer for the depthwise
       convolution kernel.
     pointwise_regularizer: Optional regularizer for the pointwise
@@ -883,12 +890,14 @@ class SeparableConv2D(Conv2D):
     name: A string, the name of the layer.
   """
 
-  def __init__(self, filters,
+  def __init__(self,
+               rank,
+               filters,
                kernel_size,
-               strides=(1, 1),
+               strides=1,
                padding='valid',
                data_format='channels_last',
-               dilation_rate=(1, 1),
+               dilation_rate=1,
                depth_multiplier=1,
                activation=None,
                use_bias=True,
@@ -905,7 +914,8 @@ class SeparableConv2D(Conv2D):
                trainable=True,
                name=None,
                **kwargs):
-    super(SeparableConv2D, self).__init__(
+    super(_SeparableConv, self).__init__(
+        rank=rank,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -920,7 +930,6 @@ class SeparableConv2D(Conv2D):
         trainable=trainable,
         name=name,
         **kwargs)
-    self.data_format = data_format
     self.depth_multiplier = depth_multiplier
     self.depthwise_initializer = depthwise_initializer
     self.pointwise_initializer = pointwise_initializer
@@ -930,26 +939,21 @@ class SeparableConv2D(Conv2D):
     self.pointwise_constraint = pointwise_constraint
 
   def build(self, input_shape):
-    if len(input_shape) < 4:
-      raise ValueError('Inputs to `SeparableConv2D` should have rank 4. '
-                       'Received input shape:', str(input_shape))
+    input_shape = tensor_shape.TensorShape(input_shape)
     if self.data_format == 'channels_first':
       channel_axis = 1
     else:
-      channel_axis = 3
-    if input_shape[channel_axis] is None:
-      raise ValueError('The channel dimension of the inputs to '
-                       '`SeparableConv2D` '
+      channel_axis = -1
+    if input_shape[channel_axis].value is None:
+      raise ValueError('The channel dimension of the inputs '
                        'should be defined. Found `None`.')
-    input_dim = int(input_shape[channel_axis])
-    self.input_spec = base.InputSpec(ndim=4, axes={channel_axis: input_dim})
-    depthwise_kernel_shape = (self.kernel_size[0],
-                              self.kernel_size[1],
-                              input_dim,
-                              self.depth_multiplier)
-    pointwise_kernel_shape = (1, 1,
-                              self.depth_multiplier * input_dim,
-                              self.filters)
+    input_dim = input_shape[channel_axis].value
+    self.input_spec = base.InputSpec(ndim=self.rank + 2,
+                                     axes={channel_axis: input_dim})
+    depthwise_kernel_shape = self.kernel_size + (input_dim,
+                                                 self.depth_multiplier)
+    pointwise_kernel_shape = (
+        1,) * self.rank + (self.depth_multiplier * input_dim, self.filters)
 
     self.depthwise_kernel = self.add_variable(
         name='depthwise_kernel',
@@ -979,6 +983,266 @@ class SeparableConv2D(Conv2D):
       self.bias = None
     self.built = True
 
+  def call(self, inputs):
+    raise NotImplementedError
+
+
+@tf_export('layers.SeparableConv1D')
+class SeparableConv1D(_SeparableConv):
+  """Depthwise separable 1D convolution.
+
+  This layer performs a depthwise convolution that acts separately on
+  channels, followed by a pointwise convolution that mixes channels.
+  If `use_bias` is True and a bias initializer is provided,
+  it adds a bias vector to the output.
+  It then optionally applies an activation function to produce the final output.
+
+  Arguments:
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: A single integer specifying the spatial
+      dimensions of the filters.
+    strides: A single integer specifying the strides
+      of the convolution.
+      Specifying any `stride` value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, length, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, length)`.
+    dilation_rate: A single integer, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    depth_multiplier: The number of depthwise convolution output channels for
+      each input channel. The total number of depthwise convolution output
+      channels will be equal to `num_filters_in * depth_multiplier`.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    use_bias: Boolean, whether the layer uses a bias.
+    depthwise_initializer: An initializer for the depthwise convolution kernel.
+    pointwise_initializer: An initializer for the pointwise convolution kernel.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
+    depthwise_regularizer: Optional regularizer for the depthwise
+      convolution kernel.
+    pointwise_regularizer: Optional regularizer for the pointwise
+      convolution kernel.
+    bias_regularizer: Optional regularizer for the bias vector.
+    activity_regularizer: Optional regularizer function for the output.
+    depthwise_constraint: Optional projection function to be applied to the
+        depthwise kernel after being updated by an `Optimizer` (e.g. used for
+        norm constraints or value constraints for layer weights). The function
+        must take as input the unprojected variable and must return the
+        projected variable (which must have the same shape). Constraints are
+        not safe to use when doing asynchronous distributed training.
+    pointwise_constraint: Optional projection function to be applied to the
+        pointwise kernel after being updated by an `Optimizer`.
+    bias_constraint: Optional projection function to be applied to the
+        bias after being updated by an `Optimizer`.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self, filters,
+               kernel_size,
+               strides=1,
+               padding='valid',
+               data_format='channels_last',
+               dilation_rate=1,
+               depth_multiplier=1,
+               activation=None,
+               use_bias=True,
+               depthwise_initializer=None,
+               pointwise_initializer=None,
+               bias_initializer=init_ops.zeros_initializer(),
+               depthwise_regularizer=None,
+               pointwise_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               depthwise_constraint=None,
+               pointwise_constraint=None,
+               bias_constraint=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(SeparableConv1D, self).__init__(
+        rank=1,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        depth_multiplier=depth_multiplier,
+        activation=activation,
+        use_bias=use_bias,
+        depthwise_initializer=depthwise_initializer,
+        pointwise_initializer=pointwise_initializer,
+        bias_initializer=bias_initializer,
+        depthwise_regularizer=depthwise_regularizer,
+        pointwise_regularizer=pointwise_regularizer,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        depthwise_constraint=depthwise_constraint,
+        pointwise_constraint=pointwise_constraint,
+        bias_constraint=bias_constraint,
+        trainable=trainable,
+        name=name,
+        **kwargs)
+
+  def call(self, inputs):
+    if self.data_format == 'channels_last':
+      strides = (1, 1) + self.strides + (1,)
+      spatial_start_dim = 1
+    else:
+      strides = (1, 1, 1) + self.strides
+      spatial_start_dim = 2
+
+    # Explicitly broadcast inputs and kernels to 4D.
+    # TODO(fchollet): refactor when a native separable_conv1d op is available.
+    inputs = array_ops.expand_dims(inputs, spatial_start_dim)
+    depthwise_kernel = array_ops.expand_dims(self.depthwise_kernel, 0)
+    pointwise_kernel = array_ops.expand_dims(self.pointwise_kernel, 0)
+    dilation_rate = (1,) + self.dilation_rate
+
+    outputs = nn.separable_conv2d(
+        inputs,
+        depthwise_kernel,
+        pointwise_kernel,
+        strides=strides,
+        padding=self.padding.upper(),
+        rate=dilation_rate,
+        data_format=utils.convert_data_format(self.data_format, ndim=4))
+
+    if self.use_bias:
+      outputs = nn.bias_add(
+          outputs,
+          self.bias,
+          data_format=utils.convert_data_format(self.data_format, ndim=4))
+
+    outputs = array_ops.squeeze(outputs, [spatial_start_dim])
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
+
+
+@tf_export('layers.SeparableConv2D')
+class SeparableConv2D(_SeparableConv):
+  """Depthwise separable 2D convolution.
+
+  This layer performs a depthwise convolution that acts separately on
+  channels, followed by a pointwise convolution that mixes channels.
+  If `use_bias` is True and a bias initializer is provided,
+  it adds a bias vector to the output.
+  It then optionally applies an activation function to produce the final output.
+
+  Arguments:
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: A tuple or list of 2 integers specifying the spatial
+      dimensions of the filters. Can be a single integer to specify the same
+      value for all spatial dimensions.
+    strides: A tuple or list of 2 positive integers specifying the strides
+      of the convolution. Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any `stride` value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, height, width)`.
+
+    dilation_rate: An integer or tuple/list of 2 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    depth_multiplier: The number of depthwise convolution output channels for
+      each input channel. The total number of depthwise convolution output
+      channels will be equal to `num_filters_in * depth_multiplier`.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    use_bias: Boolean, whether the layer uses a bias.
+    depthwise_initializer: An initializer for the depthwise convolution kernel.
+    pointwise_initializer: An initializer for the pointwise convolution kernel.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
+    depthwise_regularizer: Optional regularizer for the depthwise
+      convolution kernel.
+    pointwise_regularizer: Optional regularizer for the pointwise
+      convolution kernel.
+    bias_regularizer: Optional regularizer for the bias vector.
+    activity_regularizer: Optional regularizer function for the output.
+    depthwise_constraint: Optional projection function to be applied to the
+        depthwise kernel after being updated by an `Optimizer` (e.g. used for
+        norm constraints or value constraints for layer weights). The function
+        must take as input the unprojected variable and must return the
+        projected variable (which must have the same shape). Constraints are
+        not safe to use when doing asynchronous distributed training.
+    pointwise_constraint: Optional projection function to be applied to the
+        pointwise kernel after being updated by an `Optimizer`.
+    bias_constraint: Optional projection function to be applied to the
+        bias after being updated by an `Optimizer`.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self, filters,
+               kernel_size,
+               strides=(1, 1),
+               padding='valid',
+               data_format='channels_last',
+               dilation_rate=(1, 1),
+               depth_multiplier=1,
+               activation=None,
+               use_bias=True,
+               depthwise_initializer=None,
+               pointwise_initializer=None,
+               bias_initializer=init_ops.zeros_initializer(),
+               depthwise_regularizer=None,
+               pointwise_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               depthwise_constraint=None,
+               pointwise_constraint=None,
+               bias_constraint=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(SeparableConv2D, self).__init__(
+        rank=2,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        depth_multiplier=depth_multiplier,
+        activation=activation,
+        use_bias=use_bias,
+        depthwise_initializer=depthwise_initializer,
+        pointwise_initializer=pointwise_initializer,
+        bias_initializer=bias_initializer,
+        depthwise_regularizer=depthwise_regularizer,
+        pointwise_regularizer=pointwise_regularizer,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        depthwise_constraint=depthwise_constraint,
+        pointwise_constraint=pointwise_constraint,
+        bias_constraint=bias_constraint,
+        trainable=trainable,
+        name=name,
+        **kwargs)
+
   def call(self, inputs):
     # Apply the actual ops.
     if self.data_format == 'channels_last':
@@ -1004,27 +1268,125 @@ class SeparableConv2D(Conv2D):
       return self.activation(outputs)
     return outputs
 
-  def _compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    if self.data_format == 'channels_first':
-      rows = input_shape[2]
-      cols = input_shape[3]
-    else:
-      rows = input_shape[1]
-      cols = input_shape[2]
 
-    rows = utils.conv_output_length(rows, self.kernel_size[0],
-                                    self.padding, self.strides[0])
-    cols = utils.conv_output_length(cols, self.kernel_size[1],
-                                    self.padding, self.strides[1])
-    if self.data_format == 'channels_first':
-      return tensor_shape.TensorShape(
-          [input_shape[0], self.filters, rows, cols])
-    else:
-      return tensor_shape.TensorShape(
-          [input_shape[0], rows, cols, self.filters])
+@tf_export('layers.separable_conv1d')
+def separable_conv1d(inputs,
+                     filters,
+                     kernel_size,
+                     strides=1,
+                     padding='valid',
+                     data_format='channels_last',
+                     dilation_rate=1,
+                     depth_multiplier=1,
+                     activation=None,
+                     use_bias=True,
+                     depthwise_initializer=None,
+                     pointwise_initializer=None,
+                     bias_initializer=init_ops.zeros_initializer(),
+                     depthwise_regularizer=None,
+                     pointwise_regularizer=None,
+                     bias_regularizer=None,
+                     activity_regularizer=None,
+                     depthwise_constraint=None,
+                     pointwise_constraint=None,
+                     bias_constraint=None,
+                     trainable=True,
+                     name=None,
+                     reuse=None):
+  """Functional interface for the depthwise separable 1D convolution layer.
+
+  This layer performs a depthwise convolution that acts separately on
+  channels, followed by a pointwise convolution that mixes channels.
+  If `use_bias` is True and a bias initializer is provided,
+  it adds a bias vector to the output.
+  It then optionally applies an activation function to produce the final output.
+
+  Arguments:
+    inputs: Input tensor.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: A single integer specifying the spatial
+      dimensions of the filters.
+    strides: A single integer specifying the strides
+      of the convolution.
+      Specifying any `stride` value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, length, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, length)`.
+    dilation_rate: A single integer, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    depth_multiplier: The number of depthwise convolution output channels for
+      each input channel. The total number of depthwise convolution output
+      channels will be equal to `num_filters_in * depth_multiplier`.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    use_bias: Boolean, whether the layer uses a bias.
+    depthwise_initializer: An initializer for the depthwise convolution kernel.
+    pointwise_initializer: An initializer for the pointwise convolution kernel.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
+    depthwise_regularizer: Optional regularizer for the depthwise
+      convolution kernel.
+    pointwise_regularizer: Optional regularizer for the pointwise
+      convolution kernel.
+    bias_regularizer: Optional regularizer for the bias vector.
+    activity_regularizer: Optional regularizer function for the output.
+    depthwise_constraint: Optional projection function to be applied to the
+        depthwise kernel after being updated by an `Optimizer` (e.g. used for
+        norm constraints or value constraints for layer weights). The function
+        must take as input the unprojected variable and must return the
+        projected variable (which must have the same shape). Constraints are
+        not safe to use when doing asynchronous distributed training.
+    pointwise_constraint: Optional projection function to be applied to the
+        pointwise kernel after being updated by an `Optimizer`.
+    bias_constraint: Optional projection function to be applied to the
+        bias after being updated by an `Optimizer`.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: A string, the name of the layer.
+    reuse: Boolean, whether to reuse the weights of a previous layer
+      by the same name.
+
+  Returns:
+    Output tensor.
+
+  Raises:
+    ValueError: if eager execution is enabled.
+  """
+  layer = SeparableConv1D(
+      filters=filters,
+      kernel_size=kernel_size,
+      strides=strides,
+      padding=padding,
+      data_format=data_format,
+      dilation_rate=dilation_rate,
+      depth_multiplier=depth_multiplier,
+      activation=activation,
+      use_bias=use_bias,
+      depthwise_initializer=depthwise_initializer,
+      pointwise_initializer=pointwise_initializer,
+      bias_initializer=bias_initializer,
+      depthwise_regularizer=depthwise_regularizer,
+      pointwise_regularizer=pointwise_regularizer,
+      bias_regularizer=bias_regularizer,
+      activity_regularizer=activity_regularizer,
+      depthwise_constraint=depthwise_constraint,
+      pointwise_constraint=pointwise_constraint,
+      bias_constraint=bias_constraint,
+      trainable=trainable,
+      name=name,
+      _reuse=reuse,
+      _scope=name)
+  return layer.apply(inputs)
 
 
+@tf_export('layers.separable_conv2d')
 def separable_conv2d(inputs,
                      filters,
                      kernel_size,
@@ -1089,8 +1451,8 @@ def separable_conv2d(inputs,
     use_bias: Boolean, whether the layer uses a bias.
     depthwise_initializer: An initializer for the depthwise convolution kernel.
     pointwise_initializer: An initializer for the pointwise convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
     depthwise_regularizer: Optional regularizer for the depthwise
       convolution kernel.
     pointwise_regularizer: Optional regularizer for the pointwise
@@ -1146,6 +1508,7 @@ def separable_conv2d(inputs,
   return layer.apply(inputs)
 
 
+@tf_export('layers.Conv2DTranspose')
 class Conv2DTranspose(Conv2D):
   """Transposed 2D convolution layer (sometimes called 2D Deconvolution).
 
@@ -1175,8 +1538,8 @@ class Conv2DTranspose(Conv2D):
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Optional regularizer function for the output.
@@ -1232,7 +1595,8 @@ class Conv2DTranspose(Conv2D):
 
   def build(self, input_shape):
     if len(input_shape) != 4:
-      raise ValueError('Inputs should have rank 4. Received input shape: ' + str(input_shape))
+      raise ValueError('Inputs should have rank 4. Received input shape: ' +
+                       str(input_shape))
     if self.data_format == 'channels_first':
       channel_axis = 1
     else:
@@ -1324,7 +1688,7 @@ class Conv2DTranspose(Conv2D):
       return self.activation(outputs)
     return outputs
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     output_shape = list(input_shape)
     if self.data_format == 'channels_first':
@@ -1343,6 +1707,7 @@ class Conv2DTranspose(Conv2D):
     return tensor_shape.TensorShape(output_shape)
 
 
+@tf_export('layers.conv2d_transpose')
 def conv2d_transpose(inputs,
                      filters,
                      kernel_size,
@@ -1390,8 +1755,8 @@ def conv2d_transpose(inputs,
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If `None`, then no
-      bias will be applied.
+    bias_initializer: An initializer for the bias vector. If `None`, the default
+      initializer will be used.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Optional regularizer function for the output.
@@ -1438,6 +1803,7 @@ def conv2d_transpose(inputs,
   return layer.apply(inputs)
 
 
+@tf_export('layers.Conv3DTranspose')
 class Conv3DTranspose(Conv3D):
   """Transposed 3D convolution layer (sometimes called 3D Deconvolution).
 
@@ -1463,8 +1829,8 @@ class Conv3DTranspose(Conv3D):
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If `None`, then no
-      bias will be applied.
+    bias_initializer: An initializer for the bias vector. If `None`, the default
+      initializer will be used.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Optional regularizer function for the output.
@@ -1552,6 +1918,7 @@ class Conv3DTranspose(Conv3D):
           dtype=self.dtype)
     else:
       self.bias = None
+    self.built = True
 
   def call(self, inputs):
     inputs_shape = array_ops.shape(inputs)
@@ -1622,6 +1989,8 @@ class Conv3DTranspose(Conv3D):
 
     if self.use_bias:
       outputs_shape = outputs.shape.as_list()
+      if outputs_shape[0] is None:
+        outputs_shape[0] = -1
       if self.data_format == 'channels_first':
         outputs_4d = array_ops.reshape(outputs, [
             outputs_shape[0], outputs_shape[1],
@@ -1642,7 +2011,7 @@ class Conv3DTranspose(Conv3D):
       return self.activation(outputs)
     return outputs
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     output_shape = list(input_shape)
     if self.data_format == 'channels_first':
@@ -1655,14 +2024,15 @@ class Conv3DTranspose(Conv3D):
 
     output_shape[c_axis] = self.filters
     output_shape[d_axis] = utils.deconv_output_length(
-        output_shape[d_axis], stride_d, kernel_d, self.padding)
+        output_shape[d_axis], kernel_d, self.padding, stride_d)
     output_shape[h_axis] = utils.deconv_output_length(
-        output_shape[h_axis], stride_h, kernel_h, self.padding)
+        output_shape[h_axis], kernel_h, self.padding, stride_h)
     output_shape[w_axis] = utils.deconv_output_length(
-        output_shape[w_axis], stride_w, kernel_w, self.padding)
+        output_shape[w_axis], kernel_w, self.padding, stride_w)
     return tensor_shape.TensorShape(output_shape)
 
 
+@tf_export('layers.conv3d_transpose')
 def conv3d_transpose(inputs,
                      filters,
                      kernel_size,
@@ -1704,8 +2074,8 @@ def conv3d_transpose(inputs,
       linear activation.
     use_bias: Boolean, whether the layer uses a bias.
     kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, no bias will
-      be applied.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
     kernel_regularizer: Optional regularizer for the convolution kernel.
     bias_regularizer: Optional regularizer for the bias vector.
     activity_regularizer: Optional regularizer function for the output.
diff --git a/tensorflow/python/layers/convolutional_test.py b/tensorflow/python/layers/convolutional_test.py
index da10fe68a0c1366fccde67e01bee6155a26d481e..160e732b6798697d05815e13a7b1c399070f0783 100644
--- a/tensorflow/python/layers/convolutional_test.py
+++ b/tensorflow/python/layers/convolutional_test.py
@@ -20,9 +20,11 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.layers import convolutional as conv_layers
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
@@ -32,6 +34,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
+@test_util.with_c_api
 class ConvTest(test.TestCase):
 
   def testInvalidDataFormat(self):
@@ -97,16 +100,14 @@ class ConvTest(test.TestCase):
     self.assertListEqual(layer.bias.get_shape().as_list(), [32])
 
   def testUnknownInputChannels(self):
-    images = random_ops.random_uniform((5, 7, 9, 4))
-    images._shape = tensor_shape.as_shape((5, 7, 9, None))
+    images = array_ops.placeholder(dtypes.float32, (5, 7, 9, None))
     layer = conv_layers.Conv2D(32, [3, 3], activation=nn_ops.relu)
     with self.assertRaisesRegexp(ValueError,
                                  'The channel dimension of the inputs '
                                  'should be defined. Found `None`.'):
       _ = layer.apply(images)
 
-    images = random_ops.random_uniform((5, 4, 7, 9))
-    images._shape = tensor_shape.as_shape((5, None, 7, 9))
+    images = array_ops.placeholder(dtypes.float32, (5, None, 7, 9))
     layer = conv_layers.Conv2D(32, [3, 3], data_format='channels_first')
     with self.assertRaisesRegexp(ValueError,
                                  'The channel dimension of the inputs '
@@ -167,16 +168,14 @@ class ConvTest(test.TestCase):
     self.assertListEqual(layer.bias.get_shape().as_list(), [32])
 
   def testUnknownInputChannelsConv1D(self):
-    data = random_ops.random_uniform((5, 4, 7))
-    data._shape = tensor_shape.as_shape((5, 4, None))
+    data = array_ops.placeholder(dtypes.float32, (5, 4, None))
     layer = conv_layers.Conv1D(32, 3, activation=nn_ops.relu)
     with self.assertRaisesRegexp(ValueError,
                                  'The channel dimension of the inputs '
                                  'should be defined. Found `None`.'):
       _ = layer.apply(data)
 
-    data = random_ops.random_uniform((5, 7, 4))
-    data._shape = tensor_shape.as_shape((5, None, 4))
+    data = array_ops.placeholder(dtypes.float32, (5, None, 4))
     layer = conv_layers.Conv1D(32, 3, data_format='channels_first')
     with self.assertRaisesRegexp(ValueError,
                                  'The channel dimension of the inputs '
@@ -195,8 +194,7 @@ class ConvTest(test.TestCase):
     self.assertListEqual(layer.bias.get_shape().as_list(), [32])
 
   def testUnknownInputChannelsConv3D(self):
-    volumes = random_ops.random_uniform((5, 6, 7, 9, 9))
-    volumes._shape = tensor_shape.as_shape((5, 6, 7, 9, None))
+    volumes = array_ops.placeholder(dtypes.float32, (5, 6, 7, 9, None))
     layer = conv_layers.Conv3D(32, [3, 3, 3], activation=nn_ops.relu)
     with self.assertRaisesRegexp(ValueError,
                                  'The channel dimension of the inputs '
@@ -328,6 +326,169 @@ class ConvTest(test.TestCase):
     self.assertEqual(conv3d.bias_constraint, b_constraint)
 
 
+@test_util.with_c_api
+class SeparableConv1DTest(test.TestCase):
+
+  def testInvalidDataFormat(self):
+    length = 9
+    data = random_ops.random_uniform((5, length, 3), seed=1)
+    with self.assertRaisesRegexp(ValueError, 'data_format'):
+      conv_layers.separable_conv1d(data, 32, 3, data_format='invalid')
+
+  def testInvalidStrides(self):
+    length = 9
+    data = random_ops.random_uniform((5, length, 3), seed=1)
+    with self.assertRaisesRegexp(ValueError, 'strides'):
+      conv_layers.separable_conv1d(data, 32, 3, strides=(1, 2))
+
+    with self.assertRaisesRegexp(ValueError, 'strides'):
+      conv_layers.separable_conv1d(data, 32, 3, strides=None)
+
+  def testInvalidKernelSize(self):
+    length = 9
+    data = random_ops.random_uniform((5, length, 3), seed=1)
+    with self.assertRaisesRegexp(ValueError, 'kernel_size'):
+      conv_layers.separable_conv1d(data, 32, (1, 2))
+
+    with self.assertRaisesRegexp(ValueError, 'kernel_size'):
+      conv_layers.separable_conv1d(data, 32, None)
+
+  def testCreateSeparableConv1D(self):
+    length = 9
+    data = random_ops.random_uniform((5, length, 4))
+    layer = conv_layers.SeparableConv1D(32, 3, activation=nn_ops.relu)
+    output = layer.apply(data)
+    self.assertEqual(output.op.name, 'separable_conv1d/Relu')
+    self.assertEqual(output.get_shape().as_list(), [5, length - 2, 32])
+    self.assertEqual(layer.depthwise_kernel.get_shape().as_list(), [3, 4, 1])
+    self.assertEqual(layer.pointwise_kernel.get_shape().as_list(), [1, 4, 32])
+    self.assertEqual(layer.bias.get_shape().as_list(), [32])
+
+  def testCreateSeparableConv1DDepthMultiplier(self):
+    length = 9
+    data = random_ops.random_uniform((5, length, 4))
+    layer = conv_layers.SeparableConv1D(32, 3, depth_multiplier=2)
+    output = layer.apply(data)
+    self.assertEqual(output.get_shape().as_list(), [5, length - 2, 32])
+    self.assertEqual(layer.depthwise_kernel.get_shape().as_list(), [3, 4, 2])
+    self.assertEqual(layer.pointwise_kernel.get_shape().as_list(), [1, 8, 32])
+    self.assertEqual(layer.bias.get_shape().as_list(), [32])
+
+  def testCreateSeparableConv1DChannelsFirst(self):
+    length = 9
+    data = random_ops.random_uniform((5, 4, length))
+    layer = conv_layers.SeparableConv1D(32, 3, data_format='channels_first')
+    output = layer.apply(data)
+    self.assertEqual(output.get_shape().as_list(), [5, 32, length - 2])
+    self.assertEqual(layer.depthwise_kernel.get_shape().as_list(), [3, 4, 1])
+    self.assertEqual(layer.pointwise_kernel.get_shape().as_list(), [1, 4, 32])
+    self.assertEqual(layer.bias.get_shape().as_list(), [32])
+
+  def testSeparableConv1DPaddingSame(self):
+    length = 9
+    data = random_ops.random_uniform((5, length, 32), seed=1)
+    layer = conv_layers.SeparableConv1D(
+        64, length, padding='same')
+    output = layer.apply(data)
+    self.assertEqual(output.get_shape().as_list(), [5, length, 64])
+
+  def testCreateSeparableConv1DWithStrides(self):
+    length = 10
+    data = random_ops.random_uniform((5, length, 3), seed=1)
+    layer = conv_layers.SeparableConv1D(32, 3, strides=2, padding='same')
+    output = layer.apply(data)
+    self.assertEqual(output.get_shape().as_list(), [5, length // 2, 32])
+
+  def testCreateSeparableConv1DWithStridesChannelsFirst(self):
+    data_format = 'channels_first'
+    length = 10
+    data = random_ops.random_uniform((5, 3, length), seed=1)
+    layer = conv_layers.SeparableConv1D(
+        32, 3, strides=2, padding='same', data_format=data_format)
+    output = layer.apply(data)
+    self.assertEqual(output.get_shape().as_list(), [5, 32, length // 2])
+
+  def testFunctionalConv1DReuse(self):
+    length = 10
+    data = random_ops.random_uniform((5, length, 3), seed=1)
+    conv_layers.separable_conv1d(data, 32, 3, name='sepconv1')
+    self.assertEqual(len(variables.trainable_variables()), 3)
+    conv_layers.separable_conv1d(data, 32, 3, name='sepconv1', reuse=True)
+    self.assertEqual(len(variables.trainable_variables()), 3)
+
+  def testFunctionalConv1DReuseFromScope(self):
+    with variable_scope.variable_scope('scope'):
+      length = 10
+      data = random_ops.random_uniform((5, length, 3), seed=1)
+      conv_layers.separable_conv1d(data, 32, 3, name='sepconv1')
+      self.assertEqual(len(variables.trainable_variables()), 3)
+    with variable_scope.variable_scope('scope', reuse=True):
+      conv_layers.separable_conv1d(data, 32, 3, name='sepconv1')
+      self.assertEqual(len(variables.trainable_variables()), 3)
+
+  def testFunctionalConv1DNoReuse(self):
+    length = 10
+    data = random_ops.random_uniform((5, length, 3), seed=1)
+    conv_layers.separable_conv1d(data, 32, 3)
+    self.assertEqual(len(variables.trainable_variables()), 3)
+    conv_layers.separable_conv1d(data, 32, 3)
+    self.assertEqual(len(variables.trainable_variables()), 6)
+
+  def testSeparableConv1DDepthwiseRegularizer(self):
+    length = 9
+    data = random_ops.random_uniform((5, length, 4))
+    reg = lambda x: 0.1 * math_ops.reduce_sum(x)
+    layer = conv_layers.SeparableConv1D(32, 3, depthwise_regularizer=reg)
+    layer.apply(data)
+    loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+    self.assertEqual(len(loss_keys), 1)
+    self.assertEqual(layer.losses, loss_keys)
+
+  def testSeparableConv1DPointwiseRegularizer(self):
+    length = 9
+    data = random_ops.random_uniform((5, length, 4))
+    reg = lambda x: 0.1 * math_ops.reduce_sum(x)
+    layer = conv_layers.SeparableConv1D(32, 3, pointwise_regularizer=reg)
+    layer.apply(data)
+    loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+    self.assertEqual(len(loss_keys), 1)
+    self.assertEqual(layer.losses, loss_keys)
+
+  def testSeparableConv1DBiasRegularizer(self):
+    length = 9
+    data = random_ops.random_uniform((5, length, 4))
+    reg = lambda x: 0.1 * math_ops.reduce_sum(x)
+    layer = conv_layers.SeparableConv1D(32, 3, bias_regularizer=reg)
+    layer.apply(data)
+    loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+    self.assertEqual(len(loss_keys), 1)
+    self.assertEqual(layer.losses, loss_keys)
+
+  def testSeparableConv1DNoBias(self):
+    length = 9
+    data = random_ops.random_uniform((5, length, 4))
+    layer = conv_layers.SeparableConv1D(
+        32, 3, activation=nn_ops.relu, use_bias=False)
+    output = layer.apply(data)
+    self.assertEqual(output.op.name, 'separable_conv1d/Relu')
+    self.assertEqual(layer.bias, None)
+
+  def testConstraints(self):
+    d_constraint = lambda x: x / math_ops.reduce_sum(x)
+    p_constraint = lambda x: x / math_ops.reduce_sum(x)
+    b_constraint = lambda x: x / math_ops.reduce_max(x)
+    layer = conv_layers.SeparableConv1D(2, 3,
+                                        depthwise_constraint=d_constraint,
+                                        pointwise_constraint=p_constraint,
+                                        bias_constraint=b_constraint)
+    inputs = random_ops.random_uniform((5, 3, 5), seed=1)
+    layer(inputs)
+    self.assertEqual(layer.depthwise_constraint, d_constraint)
+    self.assertEqual(layer.pointwise_constraint, p_constraint)
+    self.assertEqual(layer.bias_constraint, b_constraint)
+
+
+@test_util.with_c_api
 class SeparableConv2DTest(test.TestCase):
 
   def testInvalidDataFormat(self):
@@ -571,6 +732,7 @@ class SeparableConv2DTest(test.TestCase):
     self.assertEqual(layer.bias_constraint, b_constraint)
 
 
+@test_util.with_c_api
 class Conv2DTransposeTest(test.TestCase):
 
   def testInvalidDataFormat(self):
@@ -756,6 +918,7 @@ class Conv2DTransposeTest(test.TestCase):
     self.assertEqual(layer.bias_constraint, b_constraint)
 
 
+@test_util.with_c_api
 class Conv3DTransposeTest(test.TestCase):
 
   def testInvalidDataFormat(self):
diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index 7be1fa5cfe95f13f67ee94bb20304fba00b33d1b..ec4fca78f046aff0ec6f6e65d5397d2649b329f1 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -37,21 +37,20 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import standard_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('layers.Dense')
 class Dense(base.Layer):
   """Densely-connected layer class.
 
   This layer implements the operation:
-  `outputs = activation(inputs.kernel + bias)`
+  `outputs = activation(inputs * kernel + bias)`
   Where `activation` is the activation function passed as the `activation`
   argument (if not `None`), `kernel` is a weights matrix created by the layer,
   and `bias` is a bias vector created by the layer
   (only if `use_bias` is `True`).
 
-  Note: if the input to the layer has a rank greater than 2, then it is
-  flattened prior to the initial matrix multiply by `kernel`.
-
   Arguments:
     units: Integer or Long, dimensionality of the output space.
     activation: Activation function (callable). Set it to None to maintain a
@@ -166,7 +165,7 @@ class Dense(base.Layer):
       return self.activation(outputs)  # pylint: disable=not-callable
     return outputs
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
     input_shape = input_shape.with_rank_at_least(2)
     if input_shape[-1].value is None:
@@ -176,6 +175,7 @@ class Dense(base.Layer):
     return input_shape[:-1].concatenate(self.units)
 
 
+@tf_export('layers.dense')
 def dense(
     inputs, units,
     activation=None,
@@ -199,9 +199,6 @@ def dense(
   and `bias` is a bias vector created by the layer
   (only if `use_bias` is `True`).
 
-  Note: if the `inputs` tensor has a rank greater than 2, then it is
-  flattened prior to the initial matrix multiply by `kernel`.
-
   Arguments:
     inputs: Tensor input.
     units: Integer or Long, dimensionality of the output space.
@@ -230,7 +227,8 @@ def dense(
       by the same name.
 
   Returns:
-    Output tensor.
+    Output tensor the same shape as `inputs` except the last dimension is of
+    size `units`.
 
   Raises:
     ValueError: if eager execution is enabled.
@@ -253,6 +251,7 @@ def dense(
   return layer.apply(inputs)
 
 
+@tf_export('layers.Dropout')
 class Dropout(base.Layer):
   """Applies Dropout to the input.
 
@@ -310,7 +309,11 @@ class Dropout(base.Layer):
                             dropped_inputs,
                             lambda: array_ops.identity(inputs))
 
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
 
+@tf_export('layers.dropout')
 def dropout(inputs,
             rate=0.5,
             noise_shape=None,
@@ -352,6 +355,7 @@ def dropout(inputs,
   return layer.apply(inputs, training=training)
 
 
+@tf_export('layers.Flatten')
 class Flatten(base.Layer):
   """Flattens an input tensor while preserving the batch axis (axis 0).
 
@@ -375,10 +379,10 @@ class Flatten(base.Layer):
   def call(self, inputs):
     outputs = array_ops.reshape(inputs, (array_ops.shape(inputs)[0], -1))
     if context.in_graph_mode():
-      outputs.set_shape(self._compute_output_shape(inputs.get_shape()))
+      outputs.set_shape(self.compute_output_shape(inputs.get_shape()))
     return outputs
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     output_shape = [input_shape[0]]
     if all(input_shape[1:]):
@@ -388,6 +392,7 @@ class Flatten(base.Layer):
     return tensor_shape.TensorShape(output_shape)
 
 
+@tf_export('layers.flatten')
 def flatten(inputs, name=None):
   """Flattens an input tensor while preserving the batch axis (axis 0).
 
diff --git a/tensorflow/python/layers/core_test.py b/tensorflow/python/layers/core_test.py
index 2d47cc69798d8c3e34e14e24301e8be9a00f49bc..15ce6cba21fcc78126f7db58ab18934db69c15fd 100644
--- a/tensorflow/python/layers/core_test.py
+++ b/tensorflow/python/layers/core_test.py
@@ -59,6 +59,14 @@ class DenseTest(test.TestCase):
     dense.apply(random_ops.random_uniform((5, 2)))
     self.assertEqual(dense.name, 'dense_2')
 
+  def testVariableInput(self):
+    with self.test_session():
+      v = variable_scope.get_variable(
+          'X', initializer=init_ops.zeros_initializer(), shape=(1, 1))
+      x = core_layers.Dense(1)(v)
+      variables.global_variables_initializer().run()
+      self.assertAllEqual(x.eval(), [[0.0]])
+
   @test_util.run_in_graph_and_eager_modes()
   def testCall(self):
     dense = core_layers.Dense(2, activation=nn_ops.relu, name='my_dense')
@@ -315,20 +323,20 @@ class DenseTest(test.TestCase):
     ts = tensor_shape.TensorShape
     # pylint: disable=protected-access
     with self.assertRaises(ValueError):
-      dense._compute_output_shape(ts(None))
+      dense.compute_output_shape(ts(None))
     with self.assertRaises(ValueError):
-      dense._compute_output_shape(ts([]))
+      dense.compute_output_shape(ts([]))
     with self.assertRaises(ValueError):
-      dense._compute_output_shape(ts([1]))
+      dense.compute_output_shape(ts([1]))
     self.assertEqual(
         [None, 2],
-        dense._compute_output_shape((None, 3)).as_list())
+        dense.compute_output_shape((None, 3)).as_list())
     self.assertEqual(
         [None, 2],
-        dense._compute_output_shape(ts([None, 3])).as_list())
+        dense.compute_output_shape(ts([None, 3])).as_list())
     self.assertEqual(
         [None, 4, 2],
-        dense._compute_output_shape(ts([None, 4, 3])).as_list())
+        dense.compute_output_shape(ts([None, 4, 3])).as_list())
     # pylint: enable=protected-access
 
   @test_util.run_in_graph_and_eager_modes()
@@ -448,13 +456,13 @@ class FlattenTest(test.TestCase):
       self.assertEqual(y.get_shape().as_list(), [1, 12])
 
   def testComputeShape(self):
-    shape = core_layers.Flatten()._compute_output_shape((1, 2, 3, 2))
+    shape = core_layers.Flatten().compute_output_shape((1, 2, 3, 2))
     self.assertEqual(shape.as_list(), [1, 12])
 
-    shape = core_layers.Flatten()._compute_output_shape((None, 3, 2))
+    shape = core_layers.Flatten().compute_output_shape((None, 3, 2))
     self.assertEqual(shape.as_list(), [None, 6])
 
-    shape = core_layers.Flatten()._compute_output_shape((None, 3, None))
+    shape = core_layers.Flatten().compute_output_shape((None, 3, None))
     self.assertEqual(shape.as_list(), [None, None])
 
   def testFunctionalFlatten(self):
diff --git a/tensorflow/python/layers/layers.py b/tensorflow/python/layers/layers.py
index 0a52b1e8d9216a2535f5ae99751a4f9e9757031d..1555846efde812b9e31f48315decaf1f86aa4f70 100644
--- a/tensorflow/python/layers/layers.py
+++ b/tensorflow/python/layers/layers.py
@@ -22,6 +22,7 @@
 @@Conv1D
 @@Conv2D
 @@Conv3D
+@@SeparableConv1D
 @@SeparableConv2D
 @@Conv2DTranspose
 @@Conv3DTranspose
@@ -43,6 +44,7 @@
 @@conv1d
 @@conv2d
 @@conv3d
+@@separable_conv1d
 @@separable_conv2d
 @@conv2d_transpose
 @@conv3d_transpose
@@ -78,6 +80,7 @@ from tensorflow.python.layers.core import dropout
 from tensorflow.python.layers.core import flatten
 
 # Convolutional layers.
+from tensorflow.python.layers.convolutional import SeparableConv1D
 from tensorflow.python.layers.convolutional import SeparableConv2D
 from tensorflow.python.layers.convolutional import SeparableConvolution2D
 from tensorflow.python.layers.convolutional import Conv2DTranspose
@@ -91,6 +94,7 @@ from tensorflow.python.layers.convolutional import Convolution2D
 from tensorflow.python.layers.convolutional import Conv3D
 from tensorflow.python.layers.convolutional import Convolution3D
 
+from tensorflow.python.layers.convolutional import separable_conv1d
 from tensorflow.python.layers.convolutional import separable_conv2d
 from tensorflow.python.layers.convolutional import conv2d_transpose
 from tensorflow.python.layers.convolutional import conv3d_transpose
diff --git a/tensorflow/python/layers/maxout.py b/tensorflow/python/layers/maxout.py
index ed048845a0b88344b357836a838231677cbf40ce..765a1c4fdafdfdc5d3ea6629d4d9290d8b658902 100644
--- a/tensorflow/python/layers/maxout.py
+++ b/tensorflow/python/layers/maxout.py
@@ -31,15 +31,18 @@ from tensorflow.python.layers import base
 def maxout(inputs, num_units, axis=-1, name=None):
   """Adds a maxout op from https://arxiv.org/abs/1302.4389
 
-  "Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron Courville,
+  "Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron
+  Courville,
    Yoshua Bengio
 
-   Usually the operation is performed in the filter/channel dimension. This can also be
+   Usually the operation is performed in the filter/channel dimension. This can
+   also be
    used after fully-connected layers to reduce number of features.
 
    Arguments:
    inputs: Tensor input
-   num_units: Specifies how many features will remain after maxout in the `axis` dimension
+   num_units: Specifies how many features will remain after maxout in the `axis`
+     dimension
          (usually channel). This must be multiple of number of `axis`.
    axis: The dimension where max pooling will be performed. Default is the
    last dimension.
@@ -57,15 +60,18 @@ def maxout(inputs, num_units, axis=-1, name=None):
 class MaxOut(base.Layer):
   """Adds a maxout op from https://arxiv.org/abs/1302.4389
 
-  "Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron Courville, Yoshua
+  "Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron
+  Courville, Yoshua
   Bengio
 
-  Usually the operation is performed in the filter/channel dimension. This can also be
+  Usually the operation is performed in the filter/channel dimension. This can
+  also be
   used after fully-connected layers to reduce number of features.
 
   Arguments:
     inputs: Tensor input
-    num_units: Specifies how many features will remain after maxout in the `axis` dimension
+    num_units: Specifies how many features will remain after maxout in the
+      `axis` dimension
          (usually channel).
     This must be multiple of number of `axis`.
     axis: The dimension where max pooling will be performed. Default is the
@@ -79,13 +85,8 @@ class MaxOut(base.Layer):
     ValueError: if num_units is not multiple of number of features.
   """
 
-  def __init__(self,
-         num_units,
-         axis=-1,
-         name=None,
-         **kwargs):
-    super(MaxOut, self).__init__(
-      name=name, trainable=False, **kwargs)
+  def __init__(self, num_units, axis=-1, name=None, **kwargs):
+    super(MaxOut, self).__init__(name=name, trainable=False, **kwargs)
     self.axis = axis
     self.num_units = num_units
 
@@ -95,8 +96,8 @@ class MaxOut(base.Layer):
     num_channels = shape[self.axis]
     if num_channels % self.num_units:
       raise ValueError('number of features({}) is not '
-               'a multiple of num_units({})'
-               .format(num_channels, self.num_units))
+                       'a multiple of num_units({})'.format(
+                           num_channels, self.num_units))
     shape[self.axis] = -1
     shape += [num_channels // self.num_units]
 
@@ -104,6 +105,7 @@ class MaxOut(base.Layer):
     for i in range(len(shape)):
       if shape[i] is None:
         shape[i] = gen_array_ops.shape(inputs)[i]
-    outputs = math_ops.reduce_max(gen_array_ops.reshape(inputs, shape), -1, keep_dims=False)
+    outputs = math_ops.reduce_max(
+        gen_array_ops.reshape(inputs, shape), -1, keepdims=False)
 
     return outputs
diff --git a/tensorflow/python/layers/network.py b/tensorflow/python/layers/network.py
index edc52545f92cb9b9c6f78f5c58fe44b3187d370b..eeb3276f0c4d4a030e0d37aa34e079ab2113ee0a 100644
--- a/tensorflow/python/layers/network.py
+++ b/tensorflow/python/layers/network.py
@@ -30,6 +30,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 class InputLayer(base.Layer):
@@ -117,6 +118,7 @@ class InputLayer(base.Layer):
         output_tensors=[input_tensor])
 
 
+@tf_export('layers.Input')
 def Input(  # pylint: disable=invalid-name
     shape=None,
     batch_size=None,
@@ -221,13 +223,36 @@ class GraphNetwork(base.Layer):
       - get_layer: retrieves a child layer by name or index in the graph.
 
   Raises:
-    RuntimeError: If created in Eager mode.
+    TypeError: If created when eager execution is enabled, with inputs that
+      don't come from a call to `Input` or outputs that don't come from layers.
   """
 
   def __init__(self, inputs, outputs, name=None):  # pylint: disable=super-init-not-called
+    if isinstance(inputs, (list, tuple)):
+      self.inputs = list(inputs)  # Tensor or list of tensors.
+    else:
+      self.inputs = [inputs]
+    if isinstance(outputs, (list, tuple)):
+      self.outputs = list(outputs)
+    else:
+      self.outputs = [outputs]
+
     if context.in_eager_mode():
-      # TODO(fchollet): check that all inputs and outputs are DeferredTensors.
-      pass
+      # Check that all inputs/outputs are DeferredTensors.
+      for tensor in self.inputs:
+        if not isinstance(tensor, base._DeferredTensor):  # pylint: disable=protected-access
+          raise TypeError('When eager execution is enabled, '
+                          'inputs must come from a call to '
+                          '`tf.keras.Input` (called after '
+                          'tfe.enable_eager_execution()). '
+                          'Received invalid input: ' + str(tensor))
+      for tensor in self.outputs:
+        if not isinstance(tensor, base._DeferredTensor):  # pylint: disable=protected-access
+          raise TypeError('When eager execution is enabled, '
+                          'outputs must come from a call to '
+                          'a layer (called after '
+                          'tfe.enable_eager_execution()). '
+                          'Received invalid output: ' + str(tensor))
 
     self._init_set_name(name)
     self._activity_regularizer = None
@@ -248,32 +273,22 @@ class GraphNetwork(base.Layer):
     self.built = True
     # A GraphNetwork does not create weights of its own, thus has no dtype.
     self._dtype = None
+    self._is_graph_network = True
     # The following are implemented as property functions:
     # self.trainable_weights
     # self.non_trainable_weights
     # self.input_spec
 
     # Private attributes to implement compatibility with Layer.
-    self._per_input_losses = {}
-    self._per_input_updates = {}
     self._updates = []
     self._losses = []
     self._scope = None
     self._reuse = None
     self._graph = ops.get_default_graph()
 
-    # GraphNetwork-specific properties.
-    if isinstance(inputs, (list, tuple)):
-      self.inputs = list(inputs)  # Tensor or list of tensors.
-    else:
-      self.inputs = [inputs]
-    if isinstance(outputs, (list, tuple)):
-      self.outputs = list(outputs)
-    else:
-      self.outputs = [outputs]
     # All layers in order of horizontal graph traversal.
     # Entries are unique. Includes input and output layers.
-    self.layers = []
+    self._layers = []
 
     # Check for redundancy in inputs.
     if len(set(self.inputs)) != len(self.inputs):
@@ -483,7 +498,7 @@ class GraphNetwork(base.Layer):
       # here we order them by traversal order.
       layers_for_depth.sort(key=lambda x: layer_indices[x])
       layers.extend(layers_for_depth)
-    self.layers = layers
+    self._layers = layers
     self._layers_by_depth = layers_by_depth
 
     # Get sorted list of node depths.
@@ -542,6 +557,10 @@ class GraphNetwork(base.Layer):
         input_tensors=self.inputs,
         output_tensors=self.outputs)
 
+  @property
+  def layers(self):
+    return self._layers
+
   def get_layer(self, name=None, index=None):
     """Retrieves a layer based on either its name (unique) or index.
 
@@ -574,32 +593,86 @@ class GraphNetwork(base.Layer):
         return layer
     raise ValueError('No such layer: ' + name)
 
+  @property
+  def stateful(self):
+    return any([(hasattr(layer, 'stateful') and layer.stateful)
+                for layer in self.layers])
+
   @property
   def updates(self):
     """Retrieve the network's updates.
 
     Will only include updates that are either
     unconditional, or conditional on inputs to this model
-    (e.g. will not include updates that depend on tensors
-    that aren't inputs to this model).
+    (e.g. will not include updates that were created by layers of this model
+    outside of the model).
+
+    Effectively, `network.updates` behaves like `layer.updates`.
+
+    Concrete example:
+
+    ```python
+      bn = keras.layers.BatchNormalization()
+      x1 = keras.layers.Input(shape=(10,))
+      _ = bn(x1)  # This creates 2 updates.
+
+      x2 = keras.layers.Input(shape=(10,))
+      y2 = bn(x2)  # This creates 2 more updates.
+
+      # The BN layer has now 4 updates.
+      self.assertEqual(len(bn.updates), 4)
+
+      # Let's create a model from x2 to y2.
+      model = keras.models.Model(x2, y2)
+
+      # The model does not list all updates from its underlying layers,
+      # but only the updates that are relevant to it. Updates created by layers
+      # outside of the model are discarded.
+      self.assertEqual(len(model.updates), 2)
+
+      # If you keep calling the model, you append to its updates, just like
+      # what happens for a layer.
+      x3 = keras.layers.Input(shape=(10,))
+      y3 = model(x3)
+      self.assertEqual(len(model.updates), 4)
+
+      # But if you call the inner BN layer independently, you don't affect
+      # the model's updates.
+      x4 = keras.layers.Input(shape=(10,))
+      _ = bn(x4)
+      self.assertEqual(len(model.updates), 4)
+    ```
 
     Returns:
         A list of update ops.
     """
+    if context.in_eager_mode():
+      return []
+
+    if not self.trainable and not self.stateful:
+      return []
+
     updates = []
     for layer in self.layers:
-      if hasattr(layer, 'updates'):
-        # Collect updates that are dependent on inputs
-        # that are part of the model.
-        for node_index, node in enumerate(layer._inbound_nodes):  # pylint: disable=protected-access
-          node_key = _make_node_key(layer.name, node_index)
-          if node_key in self._network_nodes:
-            # The model owns this layer node.
-            inputs = node.input_tensors
-            updates += layer.get_updates_for(inputs)
-        # Collect unconditional updates.
-        updates += layer.get_updates_for(None)
-    return updates
+      updates += layer.updates
+
+    # `updates` might contain irrelevant updates, so it needs to be filtered
+    # with respect to inputs the model has been called on.
+    relevant_inputs = self.inputs or []
+    for i in range(1, len(self._inbound_nodes)):
+      inputs = self.get_input_at(i)
+      if isinstance(inputs, list):
+        relevant_inputs += inputs
+      else:
+        relevant_inputs.append(inputs)
+    reachable = layers_util.get_reachable_from_inputs(relevant_inputs, updates)
+    relevant_conditional_updates = [x for x in updates if x in reachable]
+    unconditional_updates = [
+        x for x in updates if x._unconditional_update]  # pylint: disable=protected-access
+    # A layer could be used multiple times in a nested structure,
+    # so the updates list must be de-duped.
+    return list(set(
+        relevant_conditional_updates + unconditional_updates + self._updates))
 
   @property
   def losses(self):
@@ -614,22 +687,24 @@ class GraphNetwork(base.Layer):
         A list of loss tensors.
     """
     losses = []
-    # Retrieve losses for all internal layers.
     for layer in self.layers:
-      if hasattr(layer, 'losses'):
-        # Collect losses that are dependent on inputs
-        # that are part of the model.
-        for node_index, node in enumerate(layer._inbound_nodes):  # pylint: disable=protected-access
-          node_key = _make_node_key(layer.name, node_index)
-          if node_key in self._network_nodes:
-            # The model owns this layer node.
-            inputs = node.input_tensors
-            losses += layer.get_losses_for(inputs)
-        # Collect unconditional losses.
-        losses += layer.get_losses_for(None)
-    # Add any potential unconditional model-level loss.
-    losses += self.get_losses_for(None)
-    return losses
+      losses += layer.losses
+    if context.in_eager_mode():
+      return losses
+
+    relevant_inputs = self.inputs or []
+    for i in range(1, len(self._inbound_nodes)):
+      inputs = self.get_input_at(i)
+      if isinstance(inputs, list):
+        relevant_inputs += inputs
+      else:
+        relevant_inputs.append(inputs)
+    reachable = layers_util.get_reachable_from_inputs(relevant_inputs, losses)
+    relevant_conditional_losses = [x for x in losses if x in reachable]
+    unconditional_losses = [
+        x for x in losses if x._unconditional_loss]  # pylint: disable=protected-access
+    return list(set(
+        relevant_conditional_losses + unconditional_losses + self._losses))
 
   @property
   def trainable_weights(self):
@@ -660,6 +735,10 @@ class GraphNetwork(base.Layer):
         A list of `InputSpec` instances (one per input to the model)
             or a single instance if the model has only one input.
     """
+    # If not a graph network, can't assume anything.
+    if not self._is_graph_network:
+      return None
+
     specs = []
     for layer in self._input_layers:
       if layer.input_spec is None:
@@ -709,7 +788,10 @@ class GraphNetwork(base.Layer):
     outputs, _ = self._run_internal_graph(inputs, masks)
     return outputs
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
+    if not self._is_graph_network:
+      raise NotImplementedError
+
     if isinstance(input_shape, list):
       input_shapes = []
       for shape in input_shape:
@@ -731,12 +813,12 @@ class GraphNetwork(base.Layer):
     cache_key = layers_util.object_list_uid(input_shapes)
     if cache_key not in self._output_shape_cache:
       # Cache miss. We have to run the network graph manually (recursive calls
-      # to `_compute_output_shape`).
+      # to `compute_output_shape`).
       layers_to_output_shapes = {}
       for i in range(len(input_shapes)):
         layer = self._input_layers[i]
         input_shape = input_shapes[i]
-        # It's an input layer: then `_compute_output_shape` is identity,
+        # It's an input layer: then `compute_output_shape` is identity,
         # and there is only one node and one tensor output.
         shape_key = layer.name + '_0_0'
         layers_to_output_shapes[shape_key] = input_shape
@@ -767,9 +849,9 @@ class GraphNetwork(base.Layer):
               input_shapes.append(input_shape)
 
             if len(input_shapes) == 1:
-              output_shape = layer._compute_output_shape(input_shapes[0])  # pylint: disable=protected-access
+              output_shape = layer.compute_output_shape(input_shapes[0])
             else:
-              output_shape = layer._compute_output_shape(input_shapes)  # pylint: disable=protected-access
+              output_shape = layer.compute_output_shape(input_shapes)
             if isinstance(output_shape, list):
               output_shapes = [
                   tuple(tensor_shape.TensorShape(shape).as_list())
@@ -791,20 +873,19 @@ class GraphNetwork(base.Layer):
           layer, node_index, tensor_index = self._output_coordinates[i]
           shape_key = layer.name + '_%s_%s' % (node_index, tensor_index)
           output_shapes.append(layers_to_output_shapes[shape_key])
-
         # Store in cache.
         self._output_shape_cache[cache_key] = output_shapes
-      else:
-        # Cache hit.
-        output_shapes = self._output_shape_cache[cache_key]
+    else:
+      # Cache hit.
+      output_shapes = self._output_shape_cache[cache_key]
 
-      if isinstance(output_shapes, list):
-        if len(output_shapes) == 1:
-          return tensor_shape.TensorShape(output_shapes[0])
-        else:
-          return [tensor_shape.TensorShape(shape) for shape in output_shapes]
+    if isinstance(output_shapes, list):
+      if len(output_shapes) == 1:
+        return tensor_shape.TensorShape(output_shapes[0])
       else:
-        return tensor_shape.TensorShape(output_shapes)
+        return [tensor_shape.TensorShape(shape) for shape in output_shapes]
+    else:
+      return tensor_shape.TensorShape(output_shapes)
 
   def _run_internal_graph(self, inputs, masks=None):
     """Computes output tensors for new inputs.
@@ -846,7 +927,6 @@ class GraphNetwork(base.Layer):
       for node in nodes:
         # This is always a single layer, never a list.
         layer = node.outbound_layer
-
         reference_input_tensors = node.input_tensors
         reference_output_tensors = node.output_tensors
 
@@ -894,26 +974,13 @@ class GraphNetwork(base.Layer):
               else:
                 output_masks = [None for _ in range(len(output_tensors))]
 
-            # Apply activity regularizer if any:
-            if layer.activity_regularizer is not None:
-              regularization_losses = [
-                  layer.activity_regularizer(x) for x in computed_tensors
-              ]
-              layer.add_loss(regularization_losses, computed_tensors)
-
-          if context.in_graph_mode():
-            # Update model updates and losses:
-            # Keep track of updates that depend on the inputs
-            # (e.g. BN updates).
-            self.add_update(layer.get_updates_for(computed_tensors), inputs)
-            # Keep track of unconditional updates (e.g. a counter).
-            self.add_update(layer.get_updates_for(None), None)
-            # Keep track of losses that depend on the inputs
-            # (e.g. activity regularizers).
-            self.add_loss(layer.get_losses_for(computed_tensors), inputs)
-            # Keep track of unconditional losses
-            # (e.g. weight regularizers).
-            self.add_loss(layer.get_losses_for(None), None)
+            if context.in_graph_mode():
+              if layer.activity_regularizer is not None:
+                regularization_losses = [
+                    layer.activity_regularizer(x) for x in computed_tensors
+                ]
+                # Apply activity regularizer if any:
+                layer.add_loss(regularization_losses, computed_tensors)
 
           # Update tensor_map.
           for x, y, mask in zip(reference_output_tensors, output_tensors,
@@ -943,8 +1010,8 @@ class GraphNetwork(base.Layer):
       cache_key = (layers_util.object_list_uid(inputs)
                    + '_' + layers_util.object_list_uid(masks))
       self._output_tensor_cache[cache_key] = output_tensors
-      if output_masks is not None:
-        self._output_mask_cache[cache_key] = output_masks
+      self._output_mask_cache[cache_key] = output_masks
+
       if output_shapes is not None:
         input_shapes = [layers_util.static_shape(x) for x in inputs]
         cache_key = layers_util.object_list_uid(input_shapes)
diff --git a/tensorflow/python/layers/network_test.py b/tensorflow/python/layers/network_test.py
index af7813e26420eb6e85b204fd5b50e7ddafc2e5a1..cc6e8ca9f41cd1f6aa0a3f64d7ce11ac24c04967 100644
--- a/tensorflow/python/layers/network_test.py
+++ b/tensorflow/python/layers/network_test.py
@@ -27,29 +27,137 @@ from tensorflow.python.layers import base as base_layers
 from tensorflow.python.layers import core as core_layers
 from tensorflow.python.layers import network as network_layers
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import test
 
 
 class BaseLayerCompatibilityTest(test.TestCase):
 
-  def test_get_updates_for(self):
-    a = network_layers.Input(shape=(2,))
-    dense_layer = core_layers.Dense(1)
-    dense_layer.add_update(0, inputs=a)
-    dense_layer.add_update(1, inputs=None)
+  def test_get_updates(self):
 
-    self.assertEqual(dense_layer.get_updates_for(a), [0])
-    self.assertEqual(dense_layer.get_updates_for(None), [1])
+    class MyLayer(base_layers.Layer):
 
-  def test_get_losses_for(self):
-    a = network_layers.Input(shape=(2,))
-    dense_layer = core_layers.Dense(1)
-    dense_layer.add_loss(0, inputs=a)
-    dense_layer.add_loss(1, inputs=None)
+      def build(self, input_shape):
+        self.a = self.add_variable('a',
+                                   (1, 1),
+                                   'float32',
+                                   trainable=False)
+        self.b = self.add_variable('b',
+                                   (1, 1),
+                                   'float32',
+                                   trainable=False)
+        self.add_update(state_ops.assign_add(self.a, [[1.]]))
+        self.built = True
 
-    self.assertEqual(dense_layer.get_losses_for(a), [0])
-    self.assertEqual(dense_layer.get_losses_for(None), [1])
+      def call(self, inputs):
+        self.add_update(state_ops.assign_add(self.a, inputs),
+                        inputs=True)
+        return inputs + 1
+
+    x1 = network_layers.Input(shape=(1,))
+    layer = MyLayer()
+    _ = layer.apply(x1)
+
+    self.assertEqual(len(layer.updates), 2)
+    self.assertEqual(len(layer.get_updates_for(x1)), 1)
+    self.assertEqual(len(layer.get_updates_for(None)), 1)
+
+    x2 = network_layers.Input(shape=(1,))
+    y2 = layer.apply(x2)
+
+    self.assertEqual(len(layer.updates), 3)
+    self.assertEqual(len(layer.get_updates_for(x1)), 1)
+    self.assertEqual(len(layer.get_updates_for(x2)), 1)
+    self.assertEqual(len(layer.get_updates_for(None)), 1)
+
+    network = network_layers.GraphNetwork(x2, y2)
+    self.assertEqual(len(network.updates), 2)
+    self.assertEqual(len(network.get_updates_for(x1)), 0)
+    self.assertEqual(len(network.get_updates_for(x2)), 1)
+    self.assertEqual(len(network.get_updates_for(None)), 1)
+
+    x3 = network_layers.Input(shape=(1,))
+    _ = layer.apply(x3)
+    self.assertEqual(len(network.updates), 2)
+
+    x4 = network_layers.Input(shape=(1,))
+    _ = network(x4)
+    self.assertEqual(len(network.updates), 3)
+    self.assertEqual(len(network.get_updates_for(x2)), 1)
+    self.assertEqual(len(network.get_updates_for(x4)), 1)
+    self.assertEqual(len(network.get_updates_for(None)), 1)
+
+    network.add_update(state_ops.assign_add(layer.a, [[1]]))
+    self.assertEqual(len(network.updates), 4)
+    self.assertEqual(len(network.get_updates_for(None)), 2)
+
+    network.add_update(state_ops.assign_add(layer.a, x4), inputs=True)
+    self.assertEqual(len(network.updates), 5)
+    self.assertEqual(len(network.get_updates_for(x4)), 2)
+
+  def test_get_losses(self):
+
+    class MyLayer(base_layers.Layer):
+
+      def build(self, input_shape):
+        self.a = self.add_variable('a',
+                                   (1, 1),
+                                   'float32',
+                                   trainable=False)
+        self.b = self.add_variable('b',
+                                   (1, 1),
+                                   'float32',
+                                   trainable=False)
+        self.add_loss(math_ops.reduce_sum(self.a))
+        self.built = True
+
+      def call(self, inputs):
+        self.add_loss(math_ops.reduce_sum(inputs),
+                      inputs=True)
+        return inputs + 1
+
+    x1 = network_layers.Input(shape=(1,))
+    layer = MyLayer()
+    _ = layer.apply(x1)
+
+    self.assertEqual(len(layer.losses), 2)
+    self.assertEqual(len(layer.get_losses_for(x1)), 1)
+    self.assertEqual(len(layer.get_losses_for(None)), 1)
+
+    x2 = network_layers.Input(shape=(1,))
+    y2 = layer.apply(x2)
+
+    self.assertEqual(len(layer.losses), 3)
+    self.assertEqual(len(layer.get_losses_for(x1)), 1)
+    self.assertEqual(len(layer.get_losses_for(x2)), 1)
+    self.assertEqual(len(layer.get_losses_for(None)), 1)
+
+    network = network_layers.GraphNetwork(x2, y2)
+    self.assertEqual(len(network.losses), 2)
+    self.assertEqual(len(network.get_losses_for(x1)), 0)
+    self.assertEqual(len(network.get_losses_for(x2)), 1)
+    self.assertEqual(len(network.get_losses_for(None)), 1)
+
+    x3 = network_layers.Input(shape=(1,))
+    _ = layer.apply(x3)
+    self.assertEqual(len(network.losses), 2)
+
+    x4 = network_layers.Input(shape=(1,))
+    _ = network(x4)
+    self.assertEqual(len(network.losses), 3)
+    self.assertEqual(len(network.get_losses_for(x2)), 1)
+    self.assertEqual(len(network.get_losses_for(x4)), 1)
+    self.assertEqual(len(network.get_losses_for(None)), 1)
+
+    network.add_loss(math_ops.reduce_sum(layer.a))
+    self.assertEqual(len(network.losses), 4)
+    self.assertEqual(len(network.get_losses_for(None)), 2)
+
+    network.add_loss(math_ops.reduce_sum(x4), inputs=True)
+    self.assertEqual(len(network.losses), 5)
+    self.assertEqual(len(network.get_losses_for(x4)), 2)
 
   def testTopologicalAttributes(self):
     # test layer attributes / methods related to cross-layer connectivity.
@@ -299,9 +407,10 @@ class NetworkTest(test.TestCase):
 
   def testNetworkAttributes(self):
     x = network_layers.Input(shape=(32,))
-    z = core_layers.Dense(2, kernel_regularizer=lambda x: 0.01 * (x**2))(x)
+    layer = core_layers.Dense(2, kernel_regularizer=lambda x: 0.01 * (x**2))
+    z = layer(x)
     dense = core_layers.Dense(2, name='dense')
-    dense.add_update(1)
+    dense.add_update(state_ops.assign_add(layer.kernel, layer.kernel * 2.))
     y = dense(z)
     net = network_layers.GraphNetwork(x, y)
 
@@ -333,8 +442,8 @@ class NetworkTest(test.TestCase):
     self.assertEqual(net.get_input_at(0), x)
     self.assertEqual(net.get_output_at(0), y)
 
-    # _compute_output_shape
-    self.assertEqual(net._compute_output_shape((3, 32)).as_list(), [3, 2])
+    # compute_output_shape
+    self.assertEqual(net.compute_output_shape((3, 32)).as_list(), [3, 2])
 
   def testInvalidNetworks(self):
     # redundant inputs
@@ -421,7 +530,6 @@ class NetworkTest(test.TestCase):
     self.assertEqual(len(network.layers), 2)
     self.assertEqual(network.layers[0].sparse, True)
 
-  @test_util.run_in_graph_and_eager_modes()
   def testMaskingSingleInput(self):
 
     class MaskedLayer(base_layers.Layer):
@@ -504,7 +612,7 @@ class DeferredModeTest(test.TestCase):
       def call(self, inputs):
         return inputs[0] + inputs[1]
 
-      def _compute_output_shape(self, input_shape):
+      def compute_output_shape(self, input_shape):
         return input_shape[0]
 
     c = AddLayer()([a, input_b])  # pylint: disable=not-callable
diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py
index 83237b8733454255f0306b7ca267ab92ecfc66cc..d83292b80963d942023b5d086a089af53008efe0 100644
--- a/tensorflow/python/layers/normalization.py
+++ b/tensorflow/python/layers/normalization.py
@@ -39,8 +39,10 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.training import moving_averages
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('layers.BatchNormalization')
 class BatchNormalization(base.Layer):
   """Batch Normalization layer from http://arxiv.org/abs/1502.03167.
 
@@ -92,8 +94,8 @@ class BatchNormalization(base.Layer):
       and should be neither too small (which would add noise) nor too large
       (which would give stale estimates). Note that `momentum` is still applied
       to get the means and variances for inference.
-    fused: if `True`, use a faster, fused implementation if possible.
-      If `None`, use the system recommended implementation.
+    fused: if `None` or `True`, use a faster, fused implementation if possible.
+      If `False`, use the system recommended implementation.
     trainable: Boolean, if `True` also add variables to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
     virtual_batch_size: An `int`. By default, `virtual_batch_size` is `None`,
@@ -241,7 +243,7 @@ class BatchNormalization(base.Layer):
                          'axis == [1] or axis == [3]')
 
     # Raise parameters of fp16 batch norm to fp32
-    if self.dtype == dtypes.float16:
+    if self.dtype == dtypes.float16 or self.dtype == dtypes.bfloat16:
       param_dtype = dtypes.float32
     else:
       param_dtype = self.dtype or dtypes.float32
@@ -491,6 +493,7 @@ class BatchNormalization(base.Layer):
     return (r, d, new_mean, new_variance)
 
   def call(self, inputs, training=False):
+    in_eager_mode = context.in_eager_mode()
     if self.virtual_batch_size is not None:
       # Virtual batches (aka ghost batches) can be simulated by reshaping the
       # Tensor and reusing the existing batch norm implementation
@@ -593,6 +596,9 @@ class BatchNormalization(base.Layer):
                                             axis=1, keep_dims=True)
 
       def _do_update(var, value):
+        if in_eager_mode and not self.trainable:
+          return
+
         return moving_averages.assign_moving_average(
             var, value, self.momentum, zero_debias=False)
 
@@ -625,7 +631,11 @@ class BatchNormalization(base.Layer):
 
     return outputs
 
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
 
+@tf_export('layers.batch_normalization')
 def batch_normalization(inputs,
                         axis=-1,
                         momentum=0.99,
@@ -719,8 +729,8 @@ def batch_normalization(inputs,
       and should be neither too small (which would add noise) nor too large
       (which would give stale estimates). Note that `momentum` is still applied
       to get the means and variances for inference.
-    fused: if `True`, use a faster, fused implementation if possible.
-      If `None`, use the system recommended implementation.
+    fused: if `None` or `True`, use a faster, fused implementation if possible.
+      If `False`, use the system recommended implementation.
     virtual_batch_size: An `int`. By default, `virtual_batch_size` is `None`,
       which means batch normalization is performed across the whole batch. When
       `virtual_batch_size` is not `None`, instead perform "Ghost Batch
diff --git a/tensorflow/python/layers/normalization_test.py b/tensorflow/python/layers/normalization_test.py
index 7c91c3284e72247aab2e5fa3bad924ede891c996..e147f348b0a60dbefb38aa9f89318f261c03684e 100644
--- a/tensorflow/python/layers/normalization_test.py
+++ b/tensorflow/python/layers/normalization_test.py
@@ -105,9 +105,17 @@ class BNTest(test.TestCase):
                          infer_use_gpu):
     batch, height, width, input_channels = 2, 4, 5, 3
     shape = [batch, height, width, input_channels]
-    checkpoint = os.path.join(self.get_temp_dir(), 'cp_%s_%s_%s_%s' %
-                              (dtype, train1_use_gpu, train2_use_gpu,
-                               infer_use_gpu))
+
+    # Not all characters in a dtype string representation are allowed in
+    # filenames in all operating systems. This map will sanitize these.
+    dtype_to_valid_fn = {
+        dtypes.float16: 'float16',
+        dtypes.float32: 'float32',
+    }
+    checkpoint = os.path.join(
+        self.get_temp_dir(), 'cp_%s_%s_%s_%s' % (
+            dtype_to_valid_fn[dtype], train1_use_gpu, train2_use_gpu,
+            infer_use_gpu))
 
     self._train(
         checkpoint,
diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py
index 78dd617bec85cc29c93a86df3601f2accd5c240a..50503ce093fbc251b11c4d5cbccb2a2683d92e7a 100644
--- a/tensorflow/python/layers/pooling.py
+++ b/tensorflow/python/layers/pooling.py
@@ -26,6 +26,7 @@ from tensorflow.python.layers import base
 from tensorflow.python.layers import utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn
+from tensorflow.python.util.tf_export import tf_export
 
 
 class _Pooling1D(base.Layer):
@@ -63,14 +64,18 @@ class _Pooling1D(base.Layer):
   def call(self, inputs):
     # There is no TF op for 1D pooling, hence we make the inputs 4D.
     if self.data_format == 'channels_last':
-      inputs = array_ops.expand_dims(inputs, 2)
-      pool_shape = (1,) + self.pool_size + (1, 1)
-      strides = (1,) + self.strides + (1, 1)
-      data_format = 'NHWC'
-    else:
+      # input is NWC, make it NHWC
       inputs = array_ops.expand_dims(inputs, 1)
+      # pool on the W dim
       pool_shape = (1, 1) + self.pool_size + (1,)
       strides = (1, 1) + self.strides + (1,)
+      data_format = 'NHWC'
+    else:
+      # input is NCW, make it NCHW
+      inputs = array_ops.expand_dims(inputs, 2)
+      # pool on the W dim
+      pool_shape = (1, 1, 1) + self.pool_size
+      strides = (1, 1, 1) + self.strides
       data_format = 'NCHW'
 
     outputs = self.pool_function(
@@ -81,17 +86,18 @@ class _Pooling1D(base.Layer):
         data_format=data_format)
 
     if self.data_format == 'channels_last':
-      return array_ops.squeeze(outputs, 2)
-    else:
       return array_ops.squeeze(outputs, 1)
+    else:
+      return array_ops.squeeze(outputs, 2)
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     length = utils.conv_output_length(input_shape[1], self.pool_size[0],
                                       self.padding, self.strides[0])
     return tensor_shape.TensorShape([input_shape[0], length, input_shape[2]])
 
 
+@tf_export('layers.AveragePooling1D')
 class AveragePooling1D(_Pooling1D):
   """Average Pooling layer for 1D inputs.
 
@@ -123,6 +129,7 @@ class AveragePooling1D(_Pooling1D):
         **kwargs)
 
 
+@tf_export('layers.average_pooling1d')
 def average_pooling1d(inputs, pool_size, strides,
                       padding='valid', data_format='channels_last',
                       name=None):
@@ -157,6 +164,7 @@ def average_pooling1d(inputs, pool_size, strides,
   return layer.apply(inputs)
 
 
+@tf_export('layers.MaxPooling1D')
 class MaxPooling1D(_Pooling1D):
   """Max Pooling layer for 1D inputs.
 
@@ -188,6 +196,7 @@ class MaxPooling1D(_Pooling1D):
         **kwargs)
 
 
+@tf_export('layers.max_pooling1d')
 def max_pooling1d(inputs, pool_size, strides,
                   padding='valid', data_format='channels_last',
                   name=None):
@@ -273,7 +282,7 @@ class _Pooling2D(base.Layer):
         data_format=utils.convert_data_format(self.data_format, 4))
     return outputs
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_first':
       rows = input_shape[2]
@@ -293,6 +302,7 @@ class _Pooling2D(base.Layer):
           [input_shape[0], rows, cols, input_shape[3]])
 
 
+@tf_export('layers.AveragePooling2D')
 class AveragePooling2D(_Pooling2D):
   """Average pooling layer for 2D inputs (e.g. images).
 
@@ -324,6 +334,7 @@ class AveragePooling2D(_Pooling2D):
         padding=padding, data_format=data_format, name=name, **kwargs)
 
 
+@tf_export('layers.average_pooling2d')
 def average_pooling2d(inputs,
                       pool_size, strides,
                       padding='valid', data_format='channels_last',
@@ -361,6 +372,7 @@ def average_pooling2d(inputs,
   return layer.apply(inputs)
 
 
+@tf_export('layers.MaxPooling2D')
 class MaxPooling2D(_Pooling2D):
   """Max pooling layer for 2D inputs (e.g. images).
 
@@ -392,6 +404,7 @@ class MaxPooling2D(_Pooling2D):
         padding=padding, data_format=data_format, name=name, **kwargs)
 
 
+@tf_export('layers.max_pooling2d')
 def max_pooling2d(inputs,
                   pool_size, strides,
                   padding='valid', data_format='channels_last',
@@ -487,7 +500,7 @@ class _Pooling3D(base.Layer):
       outputs = array_ops.transpose(outputs, (0, 4, 1, 2, 3))
     return outputs
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_first':
       len_dim1 = input_shape[2]
@@ -511,6 +524,7 @@ class _Pooling3D(base.Layer):
           [input_shape[0], len_dim1, len_dim2, len_dim3, input_shape[4]])
 
 
+@tf_export('layers.AveragePooling3D')
 class AveragePooling3D(_Pooling3D):
   """Average pooling layer for 3D inputs (e.g. volumes).
 
@@ -544,6 +558,7 @@ class AveragePooling3D(_Pooling3D):
         padding=padding, data_format=data_format, name=name, **kwargs)
 
 
+@tf_export('layers.average_pooling3d')
 def average_pooling3d(inputs,
                       pool_size, strides,
                       padding='valid', data_format='channels_last',
@@ -583,6 +598,7 @@ def average_pooling3d(inputs,
   return layer.apply(inputs)
 
 
+@tf_export('layers.MaxPooling3D')
 class MaxPooling3D(_Pooling3D):
   """Max pooling layer for 3D inputs (e.g. volumes).
 
@@ -616,6 +632,7 @@ class MaxPooling3D(_Pooling3D):
         padding=padding, data_format=data_format, name=name, **kwargs)
 
 
+@tf_export('layers.max_pooling3d')
 def max_pooling3d(inputs,
                   pool_size, strides,
                   padding='valid', data_format='channels_last',
diff --git a/tensorflow/python/layers/pooling_test.py b/tensorflow/python/layers/pooling_test.py
index 589fee5f7196cc542b39506c5bda580a92647f0d..7533674e5a0cf60f91551cd6333c8d802612e03d 100644
--- a/tensorflow/python/layers/pooling_test.py
+++ b/tensorflow/python/layers/pooling_test.py
@@ -96,33 +96,41 @@ class PoolingTest(test.TestCase):
 
   def testCreateMaxPooling1D(self):
     width = 7
-    images = random_ops.random_uniform((5, width, 4))
+    channels = 3
+    images = random_ops.random_uniform((5, width, channels))
     layer = pooling_layers.MaxPooling1D(2, strides=2)
     output = layer.apply(images)
-    self.assertListEqual(output.get_shape().as_list(), [5, 3, 4])
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, width // 2, channels])
 
   def testCreateAveragePooling1D(self):
     width = 7
-    images = random_ops.random_uniform((5, width, 4))
+    channels = 3
+    images = random_ops.random_uniform((5, width, channels))
     layer = pooling_layers.AveragePooling1D(2, strides=2)
     output = layer.apply(images)
-    self.assertListEqual(output.get_shape().as_list(), [5, 3, 4])
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, width // 2, channels])
 
   def testCreateMaxPooling1DChannelsFirst(self):
     width = 7
-    images = random_ops.random_uniform((5, width, 4))
+    channels = 3
+    images = random_ops.random_uniform((5, channels, width))
     layer = pooling_layers.MaxPooling1D(
         2, strides=2, data_format='channels_first')
     output = layer.apply(images)
-    self.assertListEqual(output.get_shape().as_list(), [5, 3, 4])
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, channels, width // 2])
 
   def testCreateAveragePooling1DChannelsFirst(self):
     width = 7
-    images = random_ops.random_uniform((5, width, 4))
+    channels = 3
+    images = random_ops.random_uniform((5, channels, width))
     layer = pooling_layers.AveragePooling1D(
         2, strides=2, data_format='channels_first')
     output = layer.apply(images)
-    self.assertListEqual(output.get_shape().as_list(), [5, 3, 4])
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, channels, width // 2])
 
   def testCreateMaxPooling3D(self):
     depth, height, width = 6, 7, 9
diff --git a/tensorflow/python/layers/utils.py b/tensorflow/python/layers/utils.py
index 766a6800d443a79d9bd130833c27f26c844cadaf..1bbf4e6dffd3415ba246e26cd92923df8116edab 100644
--- a/tensorflow/python/layers/utils.py
+++ b/tensorflow/python/layers/utils.py
@@ -81,7 +81,7 @@ def normalize_tuple(value, n, name):
     for single_value in value_tuple:
       try:
         int(single_value)
-      except ValueError:
+      except (ValueError, TypeError):
         raise ValueError('The `' + name + '` argument must be a tuple of ' +
                          str(n) + ' integers. Received: ' + str(value) + ' '
                          'including element ' + str(single_value) + ' of type' +
@@ -208,7 +208,7 @@ def smart_cond(pred, fn1, fn2, name=None):
     else:
       return fn2()
   else:
-    return control_flow_ops.cond(pred, fn1, fn2, name)
+    return control_flow_ops.cond(pred, true_fn=fn1, false_fn=fn2, name=name)
 
 
 def constant_value(pred):
@@ -216,7 +216,7 @@ def constant_value(pred):
 
   Arguments:
     pred: A scalar, either a Python bool or a TensorFlow boolean variable
-      or tensor.
+      or tensor, or the Python integer 1 or 0.
 
   Returns:
     True or False if `pred` has a constant boolean value, None otherwise.
@@ -224,6 +224,12 @@ def constant_value(pred):
   Raises:
     TypeError: If `pred` is not a Variable, Tensor or bool.
   """
+  # Allow integer booleans.
+  if pred == 0:
+    pred = False
+  elif pred == 1:
+    pred = True
+
   if isinstance(pred, bool):
     pred_value = pred
   elif isinstance(pred, variables.Variable):
@@ -249,3 +255,45 @@ def static_shape(x):
     return tuple(x.get_shape().as_list())
   except ValueError:
     return None
+
+
+def get_reachable_from_inputs(inputs, targets=None):
+  """Returns the set of tensors reachable from `inputs`.
+
+  Stops if all targets have been found (target is optional).
+
+  Only valid in Symbolic mode, not Eager mode.
+
+  Args:
+    inputs: List of tensors.
+    targets: List of tensors.
+
+  Returns:
+    A set of tensors reachable from the inputs (includes the inputs themselves).
+  """
+  reachable = set(inputs)
+  if targets:
+    targets = set(targets)
+  queue = inputs[:]
+
+  while queue:
+    x = queue.pop()
+    outputs = []
+    try:
+      consumers = x.consumers()
+    except AttributeError:
+      # Case where x is a variable type
+      consumers = [x.op]
+    for z in consumers:
+      consumer_outputs = z.outputs
+      if consumer_outputs:  # May be None
+        outputs += consumer_outputs
+
+    for y in outputs:
+      if y not in reachable:
+        reachable.add(y)
+        queue.insert(0, y)
+
+    if targets and targets.issubset(reachable):
+      return reachable
+  return reachable
diff --git a/tensorflow/python/layers/utils_test.py b/tensorflow/python/layers/utils_test.py
index a560f6b6d21efc0c1070d5a9296a7a8e914e2eb9..c941aad7bc63dbb891fbe78cd2a47dd6805bf231 100644
--- a/tensorflow/python/layers/utils_test.py
+++ b/tensorflow/python/layers/utils_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.layers import utils
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
@@ -87,5 +88,34 @@ class ConvUtilsTest(test.TestCase):
     self.assertEqual(3, utils.deconv_output_length(4, 2, 'full', 1))
     self.assertEqual(6, utils.deconv_output_length(4, 2, 'full', 2))
 
+
+class GraphUtilsTest(test.TestCase):
+
+  def testGetReachableFromInputs(self):
+
+    with self.test_session():
+      pl_1 = array_ops.placeholder(shape=None, dtype='float32')
+      pl_2 = array_ops.placeholder(shape=None, dtype='float32')
+      pl_3 = array_ops.placeholder(shape=None, dtype='float32')
+      x_1 = pl_1 + pl_2
+      x_2 = pl_2 * 2
+      x_3 = pl_3 + 1
+      x_4 = x_1 + x_2
+      x_5 = x_3 * pl_1
+
+      self.assertEqual(
+          utils.get_reachable_from_inputs([pl_1]),
+          {pl_1, x_1, x_4, x_5})
+      self.assertEqual(
+          utils.get_reachable_from_inputs([pl_1, pl_2]),
+          {pl_1, pl_2, x_1, x_2, x_4, x_5})
+      self.assertEqual(
+          utils.get_reachable_from_inputs([pl_3]),
+          {pl_3, x_3, x_5})
+      self.assertEqual(
+          utils.get_reachable_from_inputs([x_3]),
+          {x_3, x_5})
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/lib/core/bfloat16.cc b/tensorflow/python/lib/core/bfloat16.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7f07deebef3d8e8f24f73a42f29f4ade4cae568d
--- /dev/null
+++ b/tensorflow/python/lib/core/bfloat16.cc
@@ -0,0 +1,674 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <array>
+
+#include "tensorflow/python/lib/core/bfloat16.h"
+
+#include "tensorflow/core/framework/numeric_types.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/python/lib/core/numpy.h"
+#include "tensorflow/python/lib/core/safe_ptr.h"
+
+namespace tensorflow {
+namespace {
+
+// Workarounds for Python 2 vs 3 API differences.
+#if PY_MAJOR_VERSION < 3
+
+PyObject* MakePyString(const string& s) {
+  return PyString_FromString(s.c_str());
+}
+
+typedef long HashType;  // NOLINT
+
+bool TfPyInt_Check(PyObject* object) { return PyInt_Check(object); }
+
+PyObject* TfPyInt_FromLong(long x) {  // NOLINT
+  return PyInt_FromLong(x);
+}
+
+long TfPyInt_AsLong(PyObject* x) {  // NOLINT
+  return PyInt_AsLong(x);
+}
+
+#else  // PY_MAJOR_VERSION < 3
+
+PyObject* MakePyString(const string& s) {
+  return PyUnicode_FromString(s.c_str());
+}
+
+bool TfPyInt_Check(PyObject* object) {
+  if (!PyLong_Check(object)) {
+    return 0;
+  }
+  int overflow = 0;
+  PyLong_AsLongAndOverflow(object, &overflow);
+  return (overflow == 0);
+}
+
+PyObject* TfPyInt_FromLong(long x) {  // NOLINT
+  return PyLong_FromLong(x);
+}
+
+long TfPyInt_AsLong(PyObject* x) {  // NOLINT
+  return PyLong_AsLong(x);
+}
+
+typedef Py_hash_t HashType;
+
+#endif  // PY_MAJOR_VERSION < 3
+
+// Forward declaration.
+extern PyTypeObject PyBfloat16_Type;
+
+// Representation of a Python bfloat16 object.
+struct PyBfloat16 {
+  PyObject_HEAD;  // Python object header
+  bfloat16 value;
+};
+
+// Returns true if 'object' is a PyBfloat16.
+bool PyBfloat16_Check(PyObject* object) {
+  return PyObject_IsInstance(object,
+                             reinterpret_cast<PyObject*>(&PyBfloat16_Type));
+}
+
+// Extracts the value of a PyBfloat16 object.
+bfloat16 PyBfloat16_Bfloat16(PyObject* object) {
+  return reinterpret_cast<PyBfloat16*>(object)->value;
+}
+
+// Constructs a PyBfloat16 object from a bfloat16.
+Safe_PyObjectPtr PyBfloat16_FromBfloat16(bfloat16 x) {
+  Safe_PyObjectPtr ref =
+      make_safe(PyBfloat16_Type.tp_alloc(&PyBfloat16_Type, 0));
+  PyBfloat16* p = reinterpret_cast<PyBfloat16*>(ref.get());
+  if (p) {
+    p->value = x;
+  }
+  return ref;
+}
+
+// Converts a Python object to a bfloat16 value. Returns true on success,
+// returns false and reports a Python error on failure.
+bool AsBfloat16(PyObject* arg, bfloat16* output) {
+  if (PyBfloat16_Check(arg)) {
+    *output = PyBfloat16_Bfloat16(arg);
+    return true;
+  }
+  if (PyFloat_Check(arg)) {
+    double d = PyFloat_AsDouble(arg);
+    if (PyErr_Occurred()) {
+      return false;
+    }
+    // TODO(phawkins): check for overflow
+    *output = bfloat16(d);
+    return true;
+  }
+  if (TfPyInt_Check(arg)) {
+    long l = TfPyInt_AsLong(arg);  // NOLINT
+    if (PyErr_Occurred()) {
+      return false;
+    }
+    // TODO(phawkins): check for overflow
+    *output = bfloat16(static_cast<float>(l));
+    return true;
+  }
+  if (PyArray_IsScalar(arg, Float)) {
+    float f;
+    PyArray_ScalarAsCtype(arg, &f);
+    *output = bfloat16(f);
+    return true;
+  }
+  PyErr_Format(PyExc_TypeError, "expected number, got %s",
+               arg->ob_type->tp_name);
+  return false;
+}
+
+// Converts a PyBfloat16 into a PyFloat.
+PyObject* PyBfloat16_Float(PyObject* self) {
+  bfloat16 x = PyBfloat16_Bfloat16(self);
+  return PyFloat_FromDouble(static_cast<double>(x));
+}
+
+// Converts a PyBfloat16 into a PyInt.
+PyObject* PyBfloat16_Int(PyObject* self) {
+  bfloat16 x = PyBfloat16_Bfloat16(self);
+  long y = static_cast<long>(x);  // NOLINT
+  return TfPyInt_FromLong(y);
+}
+
+// Negates a PyBfloat16.
+PyObject* PyBfloat16_Negative(PyObject* self) {
+  bfloat16 x = PyBfloat16_Bfloat16(self);
+  return PyBfloat16_FromBfloat16(-x).release();
+}
+
+// Binary arithmetic operators on PyBfloat16 values.
+#define BFLOAT16_BINOP(name, op)                                  \
+  PyObject* PyBfloat16_##name(PyObject* a, PyObject* b) {         \
+    bfloat16 x, y;                                                \
+    if (!AsBfloat16(a, &x) || !AsBfloat16(b, &y)) return nullptr; \
+    bfloat16 z = x op y;                                          \
+    return PyBfloat16_FromBfloat16(z).release();                  \
+  }
+BFLOAT16_BINOP(Add, +)
+BFLOAT16_BINOP(Subtract, -)
+BFLOAT16_BINOP(Multiply, *)
+BFLOAT16_BINOP(Divide, /)
+#undef BFLOAT16_BINOP
+
+// Python number methods for PyBfloat16 objects.
+PyNumberMethods PyBfloat16_AsNumber = {
+    PyBfloat16_Add,       // nb_add
+    PyBfloat16_Subtract,  // nb_subtract
+    PyBfloat16_Multiply,  // nb_multiply
+#if PY_MAJOR_VERSION < 3
+    PyBfloat16_Divide,  // nb_divide
+#endif
+    nullptr,              // nb_remainder
+    nullptr,              // nb_divmod
+    nullptr,              // nb_power
+    PyBfloat16_Negative,  // nb_negative
+    nullptr,              // nb_positive
+    nullptr,              // nb_absolute
+    nullptr,              // nb_nonzero
+    nullptr,              // nb_invert
+    nullptr,              // nb_lshift
+    nullptr,              // nb_rshift
+    nullptr,              // nb_and
+    nullptr,              // nb_xor
+    nullptr,              // nb_or
+#if PY_MAJOR_VERSION < 3
+    nullptr,  // nb_coerce
+#endif
+    PyBfloat16_Int,  // nb_int
+#if PY_MAJOR_VERSION < 3
+    PyBfloat16_Int,  // nb_long
+#else
+    nullptr,  // reserved
+#endif
+    PyBfloat16_Float,  // nb_float
+#if PY_MAJOR_VERSION < 3
+    nullptr,  // nb_oct
+    nullptr,  // nb_hex
+#endif
+
+    nullptr,  // nb_inplace_add
+    nullptr,  // nb_inplace_subtract
+    nullptr,  // nb_inplace_multiply
+#if PY_MAJOR_VERSION < 3
+    nullptr,  // nb_inplace_divide
+#endif
+    nullptr,  // nb_inplace_remainder
+    nullptr,  // nb_inplace_power
+    nullptr,  // nb_inplace_lshift
+    nullptr,  // nb_inplace_rshift
+    nullptr,  // nb_inplace_and
+    nullptr,  // nb_inplace_xor
+    nullptr,  // nb_inplace_or
+
+    nullptr,            // nb_floor_divide
+    PyBfloat16_Divide,  // nb_true_divide
+    nullptr,            // nb_inplace_floor_divide
+    nullptr,            // nb_inplace_true_divide
+    nullptr,            // nb_index
+};
+
+// Constructs a new PyBfloat16.
+PyObject* PyBfloat16_New(PyTypeObject* type, PyObject* args, PyObject* kwds) {
+  if (kwds && PyDict_Size(kwds)) {
+    PyErr_SetString(PyExc_TypeError, "constructor takes no keyword arguments");
+    return nullptr;
+  }
+  Py_ssize_t size = PyTuple_Size(args);
+  if (size != 1) {
+    PyErr_SetString(PyExc_TypeError,
+                    "expected number as argument to bfloat16 constructor");
+    return nullptr;
+  }
+  PyObject* arg = PyTuple_GetItem(args, 0);
+
+  if (PyBfloat16_Check(arg)) {
+    Py_INCREF(arg);
+    return arg;
+  } else {
+    bfloat16 value;
+    if (!AsBfloat16(arg, &value)) {
+      return nullptr;
+    }
+    return PyBfloat16_FromBfloat16(value).release();
+  }
+}
+
+// Comparisons on PyBfloat16s.
+PyObject* PyBfloat16_RichCompare(PyObject* a, PyObject* b, int op) {
+  bfloat16 x, y;
+  if (!AsBfloat16(a, &x) || !AsBfloat16(b, &y)) return nullptr;
+  bool result;
+  switch (op) {
+    case Py_LT:
+      result = x < y;
+      break;
+    case Py_LE:
+      result = x <= y;
+      break;
+    case Py_EQ:
+      result = x == y;
+      break;
+    case Py_NE:
+      result = x != y;
+      break;
+    case Py_GT:
+      result = x > y;
+      break;
+    case Py_GE:
+      result = x >= y;
+      break;
+    default:
+      LOG(FATAL) << "Invalid op type " << op;
+  }
+  return PyBool_FromLong(result);
+}
+
+// Implementation of repr() for PyBfloat16.
+PyObject* PyBfloat16_Repr(PyObject* self) {
+  bfloat16 x = reinterpret_cast<PyBfloat16*>(self)->value;
+  string v = strings::StrCat("bfloat16(", static_cast<float>(x), ")");
+  return MakePyString(v);
+}
+
+// Implementation of str() for PyBfloat16.
+PyObject* PyBfloat16_Str(PyObject* self) {
+  bfloat16 x = reinterpret_cast<PyBfloat16*>(self)->value;
+  string v = strings::StrCat(static_cast<float>(x));
+  return MakePyString(v);
+}
+
+// Hash function for PyBfloat16. We use the identity function, which is a weak
+// hash function.
+HashType PyBfloat16_Hash(PyObject* self) {
+  bfloat16 x = reinterpret_cast<PyBfloat16*>(self)->value;
+  return x.value;
+}
+
+// Python type for PyBfloat16 objects.
+PyTypeObject PyBfloat16_Type = {
+#if PY_MAJOR_VERSION < 3
+    PyObject_HEAD_INIT(nullptr) 0,  // ob_size
+#else
+    PyVarObject_HEAD_INIT(nullptr, 0)
+#endif
+    "bfloat16",                                // tp_name
+    sizeof(PyBfloat16),                        // tp_basicsize
+    0,                                         // tp_itemsize
+    nullptr,                                   // tp_dealloc
+    nullptr,                                   // tp_print
+    nullptr,                                   // tp_getattr
+    nullptr,                                   // tp_setattr
+    nullptr,                                   // tp_compare / tp_reserved
+    PyBfloat16_Repr,                           // tp_repr
+    &PyBfloat16_AsNumber,                      // tp_as_number
+    nullptr,                                   // tp_as_sequence
+    nullptr,                                   // tp_as_mapping
+    PyBfloat16_Hash,                           // tp_hash
+    nullptr,                                   // tp_call
+    PyBfloat16_Str,                            // tp_str
+    nullptr,                                   // tp_getattro
+    nullptr,                                   // tp_setattro
+    nullptr,                                   // tp_as_buffer
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,  // tp_flags
+    "bfloat16 floating-point values",          // tp_doc
+    nullptr,                                   // tp_traverse
+    nullptr,                                   // tp_clear
+    PyBfloat16_RichCompare,                    // tp_richcompare
+    0,                                         // tp_weaklistoffset
+    nullptr,                                   // tp_iter
+    nullptr,                                   // tp_iternext
+    nullptr,                                   // tp_methods
+    nullptr,                                   // tp_members
+    nullptr,                                   // tp_getset
+    nullptr,                                   // tp_base
+    nullptr,                                   // tp_dict
+    nullptr,                                   // tp_descr_get
+    nullptr,                                   // tp_descr_set
+    0,                                         // tp_dictoffset
+    nullptr,                                   // tp_init
+    nullptr,                                   // tp_alloc
+    PyBfloat16_New,                            // tp_new
+    nullptr,                                   // tp_free
+    nullptr,                                   // tp_is_gc
+    nullptr,                                   // tp_bases
+    nullptr,                                   // tp_mro
+    nullptr,                                   // tp_cache
+    nullptr,                                   // tp_subclasses
+    nullptr,                                   // tp_weaklist
+    nullptr,                                   // tp_del
+    0,                                         // tp_version_tag
+};
+
+// Numpy support
+
+PyArray_ArrFuncs NPyBfloat16_ArrFuncs;
+
+PyArray_Descr NPyBfloat16_Descr = {
+    PyObject_HEAD_INIT(nullptr) & PyBfloat16_Type,  // typeobj
+    // We must register bfloat16 with a kind other than "f", because numpy
+    // considers two types with the same kind and size to be equal, but
+    // float16 != bfloat16.
+    'V',  // kind
+    // TODO(phawkins): there doesn't seem to be a way of guaranteeing a type
+    // character is unique.
+    'E',                                                  // type
+    '=',                                                  // byteorder
+    NPY_NEEDS_PYAPI | NPY_USE_GETITEM | NPY_USE_SETITEM,  // hasobject
+    0,                                                    // type_num
+    sizeof(bfloat16),                                     // elsize
+    alignof(bfloat16),                                    // alignment
+    nullptr,                                              // subarray
+    nullptr,                                              // fields
+    nullptr,                                              // names
+    &NPyBfloat16_ArrFuncs,                                // f
+};
+
+// Registered numpy type ID. Global variable populated by the registration code.
+int npy_bfloat16_ = -1;
+
+// Implementations of NumPy array methods.
+
+PyObject* NPyBfloat16_GetItem(void* data, void* arr) {
+  bfloat16 x;
+  memcpy(&x, data, sizeof(bfloat16));
+  return PyBfloat16_FromBfloat16(x).release();
+}
+
+int NPyBfloat16_SetItem(PyObject* item, void* data, void* arr) {
+  bfloat16 x;
+  if (!AsBfloat16(item, &x)) return -1;
+  memcpy(data, &x, sizeof(bfloat16));
+  return 0;
+}
+
+void ByteSwap16(void* value) {
+  char* p = reinterpret_cast<char*>(value);
+  std::swap(p[0], p[1]);
+}
+
+void NPyBfloat16_CopySwapN(void* dstv, npy_intp dstride, void* srcv,
+                           npy_intp sstride, npy_intp n, int swap, void* arr) {
+  char* dst = reinterpret_cast<char*>(dstv);
+  char* src = reinterpret_cast<char*>(srcv);
+  if (!src) {
+    return;
+  }
+  if (swap) {
+    for (npy_intp i = 0; i < n; i++) {
+      char* r = dst + dstride * i;
+      memcpy(r, src + sstride * i, sizeof(uint16_t));
+      ByteSwap16(r);
+    }
+  } else if (dstride == sizeof(uint16_t) && sstride == sizeof(uint16_t)) {
+    memcpy(dst, src, n * sizeof(uint16_t));
+  } else {
+    for (npy_intp i = 0; i < n; i++) {
+      memcpy(dst + dstride * i, src + sstride * i, sizeof(uint16_t));
+    }
+  }
+}
+
+void NPyBfloat16_CopySwap(void* dst, void* src, int swap, void* arr) {
+  if (!src) {
+    return;
+  }
+  memcpy(dst, src, sizeof(uint16_t));
+  if (swap) {
+    ByteSwap16(dst);
+  }
+}
+
+npy_bool NPyBfloat16_NonZero(void* data, void* arr) {
+  bfloat16 x;
+  memcpy(&x, data, sizeof(x));
+  return x != static_cast<bfloat16>(0);
+}
+
+// NumPy casts
+
+// Performs a NumPy array cast from type 'From' to 'To'.
+template <typename From, typename To>
+void NPyCast(void* from_void, void* to_void, npy_intp n, void* fromarr,
+             void* toarr) {
+  const From* from = reinterpret_cast<From*>(from_void);
+  To* to = reinterpret_cast<To*>(to_void);
+  for (npy_intp i = 0; i < n; ++i) {
+    to[i] = static_cast<To>(from[i]);
+  }
+}
+
+// Registers a cast between bfloat16 and type 'T'. 'numpy_type' is the NumPy
+// type corresponding to 'T'. If 'cast_is_safe', registers that bfloat16 can be
+// safely coerced to T.
+template <typename T>
+bool RegisterBfloat16Cast(int numpy_type, bool cast_is_safe) {
+  if (PyArray_RegisterCastFunc(PyArray_DescrFromType(numpy_type), npy_bfloat16_,
+                               NPyCast<T, bfloat16>) < 0) {
+    return false;
+  }
+  if (PyArray_RegisterCastFunc(&NPyBfloat16_Descr, numpy_type,
+                               NPyCast<bfloat16, T>) < 0) {
+    return false;
+  }
+  if (cast_is_safe && PyArray_RegisterCanCast(&NPyBfloat16_Descr, numpy_type,
+                                              NPY_NOSCALAR) < 0) {
+    return false;
+  }
+  return true;
+}
+
+template <typename InType, typename OutType, typename Functor>
+void BinaryUFunc(char** args, npy_intp* dimensions, npy_intp* steps,
+                 void* data) {
+  const char* i0 = args[0];
+  const char* i1 = args[1];
+  char* o = args[2];
+  for (npy_intp k = 0; k < *dimensions; k++) {
+    InType x = *reinterpret_cast<const InType*>(i0);
+    InType y = *reinterpret_cast<const InType*>(i1);
+    *reinterpret_cast<OutType*>(o) = Functor()(x, y);
+    i0 += steps[0];
+    i1 += steps[1];
+    o += steps[2];
+  }
+}
+
+template <typename Functor>
+void CompareUFunc(char** args, npy_intp* dimensions, npy_intp* steps,
+                  void* data) {
+  BinaryUFunc<bfloat16, npy_bool, Functor>(args, dimensions, steps, data);
+}
+
+struct Bfloat16EqFunctor {
+  npy_bool operator()(bfloat16 a, bfloat16 b) { return a == b; }
+};
+struct Bfloat16NeFunctor {
+  npy_bool operator()(bfloat16 a, bfloat16 b) { return a != b; }
+};
+struct Bfloat16LtFunctor {
+  npy_bool operator()(bfloat16 a, bfloat16 b) { return a < b; }
+};
+struct Bfloat16GtFunctor {
+  npy_bool operator()(bfloat16 a, bfloat16 b) { return a > b; }
+};
+struct Bfloat16LeFunctor {
+  npy_bool operator()(bfloat16 a, bfloat16 b) { return a <= b; }
+};
+struct Bfloat16GeFunctor {
+  npy_bool operator()(bfloat16 a, bfloat16 b) { return a >= b; }
+};
+
+// Initializes the module.
+bool Initialize() {
+  // It's critical to import umath to avoid crash in open source build.
+  import_umath1(false);
+
+  Safe_PyObjectPtr numpy_str = make_safe(MakePyString("numpy"));
+  if (!numpy_str) {
+    return false;
+  }
+  Safe_PyObjectPtr numpy = make_safe(PyImport_Import(numpy_str.get()));
+  if (!numpy) {
+    return false;
+  }
+
+  // We hit a mysterious crash if we haven't initialized numpy before this:
+  PyBfloat16_Type.tp_base = &PyGenericArrType_Type;
+
+  if (PyType_Ready(&PyBfloat16_Type) < 0) {
+    return false;
+  }
+
+  // Initializes the NumPy descriptor.
+  PyArray_InitArrFuncs(&NPyBfloat16_ArrFuncs);
+  NPyBfloat16_ArrFuncs.getitem = NPyBfloat16_GetItem;
+  NPyBfloat16_ArrFuncs.setitem = NPyBfloat16_SetItem;
+  NPyBfloat16_ArrFuncs.copyswapn = NPyBfloat16_CopySwapN;
+  NPyBfloat16_ArrFuncs.copyswap = NPyBfloat16_CopySwap;
+  NPyBfloat16_ArrFuncs.nonzero = NPyBfloat16_NonZero;
+
+  Py_TYPE(&NPyBfloat16_Descr) = &PyArrayDescr_Type;
+  npy_bfloat16_ = PyArray_RegisterDataType(&NPyBfloat16_Descr);
+  if (npy_bfloat16_ < 0) return false;
+
+  // Support dtype(bfloat16)
+  if (PyDict_SetItemString(PyBfloat16_Type.tp_dict, "dtype",
+                           reinterpret_cast<PyObject*>(&NPyBfloat16_Descr)) <
+      0) {
+    return false;
+  }
+
+  // Register casts
+
+  // We lie shamelessly and say that a cast from half to bfloat16 is safe.
+  // Numpy frequently uses the smallest legal representation type for small
+  // float constants (e.g., 1.0), which is often float16. Things break if these
+  // cannot be converted transparently to bfloat16.
+  if (!RegisterBfloat16Cast<Eigen::half>(NPY_HALF, /*cast_is_safe=*/true)) {
+    return false;
+  }
+
+  if (!RegisterBfloat16Cast<float>(NPY_FLOAT, /*cast_is_safe=*/true)) {
+    return false;
+  }
+  if (!RegisterBfloat16Cast<double>(NPY_DOUBLE, /*cast_is_safe=*/true)) {
+    return false;
+  }
+  if (!RegisterBfloat16Cast<int32>(NPY_INT32, /*cast_is_safe=*/false)) {
+    return false;
+  }
+  if (!RegisterBfloat16Cast<int64>(NPY_INT64, /*cast_is_safe=*/false)) {
+    return false;
+  }
+  // Following the numpy convention. imag part is dropped when converting to
+  // float.
+  if (!RegisterBfloat16Cast<complex64>(NPY_COMPLEX64, /*cast_is_safe=*/true)) {
+    return false;
+  }
+  if (!RegisterBfloat16Cast<complex128>(NPY_COMPLEX128,
+                                        /*cast_is_safe=*/true)) {
+    return false;
+  }
+
+  // Register ufuncs
+  auto register_ufunc = [&](const char* name, PyUFuncGenericFunction fn,
+                            const std::array<int, 3>& types) {
+    Safe_PyObjectPtr ufunc_obj =
+        make_safe(PyObject_GetAttrString(numpy.get(), name));
+    if (!ufunc_obj) {
+      return false;
+    }
+    PyUFuncObject* ufunc = reinterpret_cast<PyUFuncObject*>(ufunc_obj.get());
+    if (types.size() != ufunc->nargs) {
+      PyErr_Format(PyExc_AssertionError,
+                   "ufunc %s takes %d arguments, loop takes %lu", name,
+                   ufunc->nargs, types.size());
+      return false;
+    }
+    if (PyUFunc_RegisterLoopForType(ufunc, npy_bfloat16_, fn,
+                                    const_cast<int*>(types.data()),
+                                    nullptr) < 0) {
+      return false;
+    }
+    return true;
+  };
+
+  // Comparisons
+  const std::array<int, 3> compare_types = {npy_bfloat16_, npy_bfloat16_,
+                                            NPY_BOOL};
+
+  if (!register_ufunc("equal", CompareUFunc<Bfloat16EqFunctor>,
+                      compare_types)) {
+    return false;
+  }
+  if (!register_ufunc("not_equal", CompareUFunc<Bfloat16NeFunctor>,
+                      compare_types)) {
+    return false;
+  }
+  if (!register_ufunc("less", CompareUFunc<Bfloat16LtFunctor>, compare_types)) {
+    return false;
+  }
+  if (!register_ufunc("greater", CompareUFunc<Bfloat16GtFunctor>,
+                      compare_types)) {
+    return false;
+  }
+  if (!register_ufunc("less_equal", CompareUFunc<Bfloat16LeFunctor>,
+                      compare_types)) {
+    return false;
+  }
+  if (!register_ufunc("greater_equal", CompareUFunc<Bfloat16GeFunctor>,
+                      compare_types)) {
+    return false;
+  }
+  return true;
+}
+
+}  // namespace
+
+void RegisterNumpyBfloat16() {
+  if (npy_bfloat16_ >= 0) {
+    // Already initialized.
+    return;
+  }
+  if (!Initialize()) {
+    if (!PyErr_Occurred()) {
+      PyErr_SetString(PyExc_RuntimeError, "cannot load bfloat16 module.");
+    }
+    PyErr_Print();
+  }
+}
+
+PyObject* Bfloat16PyType() {
+  CHECK(PyBfloat16_Type.tp_base != nullptr);
+  Py_INCREF(&PyBfloat16_Type);
+  return reinterpret_cast<PyObject*>(&PyBfloat16_Type);
+}
+
+int Bfloat16NumpyType() {
+  CHECK_GE(npy_bfloat16_, 0);
+  return npy_bfloat16_;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/python/lib/core/bfloat16.h b/tensorflow/python/lib/core/bfloat16.h
new file mode 100644
index 0000000000000000000000000000000000000000..a609928ba9029af00553a4664bef18d3749e64db
--- /dev/null
+++ b/tensorflow/python/lib/core/bfloat16.h
@@ -0,0 +1,34 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_PYTHON_LIB_CORE_BFLOAT16_H_
+#define TENSORFLOW_PYTHON_LIB_CORE_BFLOAT16_H_
+
+#include <Python.h>
+
+namespace tensorflow {
+
+// Register the bfloat16 numpy type.
+void RegisterNumpyBfloat16();
+
+// Returns the PyObject for the bfloat16 type.
+PyObject* Bfloat16PyType();
+
+// Returns the id number of the bfloat16 numpy type.
+int Bfloat16NumpyType();
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_PYTHON_LIB_CORE_BFLOAT16_H_
diff --git a/tensorflow/python/lib/core/bfloat16.i b/tensorflow/python/lib/core/bfloat16.i
new file mode 100644
index 0000000000000000000000000000000000000000..10444b676b2549e0d9f96391f96e7a523f768d85
--- /dev/null
+++ b/tensorflow/python/lib/core/bfloat16.i
@@ -0,0 +1,30 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+%{
+#include "tensorflow/python/lib/core/bfloat16.h"
+%}
+
+%init %{
+tensorflow::RegisterNumpyBfloat16();
+%}
+
+%{
+PyObject* TF_bfloat16_type() {
+  return tensorflow::Bfloat16PyType();
+}
+%}
+
+PyObject* TF_bfloat16_type();
diff --git a/tensorflow/python/lib/core/bfloat16_test.py b/tensorflow/python/lib/core/bfloat16_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..09d4b01fa43babdc09f8f255e79bbed539ddc04c
--- /dev/null
+++ b/tensorflow/python/lib/core/bfloat16_test.py
@@ -0,0 +1,250 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Test cases for the bfloat16 Python type."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+import numpy as np
+
+# pylint: disable=unused-import,g-bad-import-order
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.framework import dtypes
+from tensorflow.python.platform import test
+
+
+bfloat16 = pywrap_tensorflow.TF_bfloat16_type()
+
+
+class Bfloat16Test(test.TestCase):
+
+  def float_values(self):
+    """Returns values that should round trip exactly to float and back."""
+    epsilon = float.fromhex("1.0p-7")
+    return [
+        0.0, 1.0, -1, 0.5, -0.5, epsilon, 1.0 + epsilon, 1.0 - epsilon,
+        -1.0 - epsilon, -1.0 + epsilon, 3.5, 42.0, 255.0, 256.0,
+        float("inf"), float("-inf"), float("nan")]
+
+  def _assertFloatIdentical(self, v, w):
+    if math.isnan(v):
+      self.assertTrue(math.isnan(w))
+    else:
+      self.assertEqual(v, w)
+
+  def testRoundTripToFloat(self):
+    for v in self.float_values():
+      self._assertFloatIdentical(v, float(bfloat16(v)))
+
+  def testRoundTripToInt(self):
+    for v in [-256, -255, -34, -2, -1, 0, 1, 2, 10, 47, 128, 255, 256, 512]:
+      self.assertEqual(v, int(bfloat16(v)))
+
+  def testStr(self):
+    self.assertEqual("0", str(bfloat16(0.0)))
+    self.assertEqual("1", str(bfloat16(1.0)))
+    self.assertEqual("-3.5", str(bfloat16(-3.5)))
+    self.assertEqual("0.0078125", str(bfloat16(float.fromhex("1.0p-7"))))
+    self.assertEqual("inf", str(bfloat16(float("inf"))))
+    self.assertEqual("-inf", str(bfloat16(float("-inf"))))
+    self.assertEqual("nan", str(bfloat16(float("nan"))))
+
+  def testRepr(self):
+    self.assertEqual("bfloat16(0)", repr(bfloat16(0)))
+    self.assertEqual("bfloat16(1)", repr(bfloat16(1)))
+    self.assertEqual("bfloat16(-3.5)", repr(bfloat16(-3.5)))
+    self.assertEqual("bfloat16(0.0078125)",
+                     repr(bfloat16(float.fromhex("1.0p-7"))))
+    self.assertEqual("bfloat16(inf)", repr(bfloat16(float("inf"))))
+    self.assertEqual("bfloat16(-inf)", repr(bfloat16(float("-inf"))))
+    self.assertEqual("bfloat16(nan)", repr(bfloat16(float("nan"))))
+
+  def testHash(self):
+    self.assertEqual(0, hash(bfloat16(0.0)))
+    self.assertEqual(0x3f80, hash(bfloat16(1.0)))
+    self.assertEqual(0x7fc0, hash(bfloat16(float("nan"))))
+
+  # Tests for Python operations
+  def testNegate(self):
+    for v in self.float_values():
+      self._assertFloatIdentical(-v, float(-bfloat16(v)))
+
+  def testAdd(self):
+    self._assertFloatIdentical(0, float(bfloat16(0) + bfloat16(0)))
+    self._assertFloatIdentical(1, float(bfloat16(1) + bfloat16(0)))
+    self._assertFloatIdentical(0, float(bfloat16(1) + bfloat16(-1)))
+    self._assertFloatIdentical(5.5, float(bfloat16(2) + bfloat16(3.5)))
+    self._assertFloatIdentical(1.25, float(bfloat16(3.5) + bfloat16(-2.25)))
+    self._assertFloatIdentical(float("inf"),
+                               float(bfloat16(float("inf")) + bfloat16(-2.25)))
+    self._assertFloatIdentical(float("-inf"),
+                               float(bfloat16(float("-inf")) + bfloat16(-2.25)))
+    self.assertTrue(math.isnan(float(bfloat16(3.5) + bfloat16(float("nan")))))
+
+  def testSub(self):
+    self._assertFloatIdentical(0, float(bfloat16(0) - bfloat16(0)))
+    self._assertFloatIdentical(1, float(bfloat16(1) - bfloat16(0)))
+    self._assertFloatIdentical(2, float(bfloat16(1) - bfloat16(-1)))
+    self._assertFloatIdentical(-1.5, float(bfloat16(2) - bfloat16(3.5)))
+    self._assertFloatIdentical(5.75, float(bfloat16(3.5) - bfloat16(-2.25)))
+    self._assertFloatIdentical(float("-inf"),
+                               float(bfloat16(-2.25) - bfloat16(float("inf"))))
+    self._assertFloatIdentical(float("inf"),
+                               float(bfloat16(-2.25) - bfloat16(float("-inf"))))
+    self.assertTrue(math.isnan(float(bfloat16(3.5) - bfloat16(float("nan")))))
+
+  def testMul(self):
+    self._assertFloatIdentical(0, float(bfloat16(0) * bfloat16(0)))
+    self._assertFloatIdentical(0, float(bfloat16(1) * bfloat16(0)))
+    self._assertFloatIdentical(-1, float(bfloat16(1) * bfloat16(-1)))
+    self._assertFloatIdentical(-7.875, float(bfloat16(3.5) * bfloat16(-2.25)))
+    self._assertFloatIdentical(float("-inf"),
+                               float(bfloat16(float("inf")) * bfloat16(-2.25)))
+    self._assertFloatIdentical(float("inf"),
+                               float(bfloat16(float("-inf")) * bfloat16(-2.25)))
+    self.assertTrue(math.isnan(float(bfloat16(3.5) * bfloat16(float("nan")))))
+
+  def testDiv(self):
+    self.assertTrue(math.isnan(float(bfloat16(0) / bfloat16(0))))
+    self._assertFloatIdentical(float("inf"), float(bfloat16(1) / bfloat16(0)))
+    self._assertFloatIdentical(-1, float(bfloat16(1) / bfloat16(-1)))
+    self._assertFloatIdentical(-1.75, float(bfloat16(3.5) / bfloat16(-2)))
+    self._assertFloatIdentical(float("-inf"),
+                               float(bfloat16(float("inf")) / bfloat16(-2.25)))
+    self._assertFloatIdentical(float("inf"),
+                               float(bfloat16(float("-inf")) / bfloat16(-2.25)))
+    self.assertTrue(math.isnan(float(bfloat16(3.5) / bfloat16(float("nan")))))
+
+  def testLess(self):
+    for v in self.float_values():
+      for w in self.float_values():
+        self.assertEqual(v < w, bfloat16(v) < bfloat16(w))
+
+  def testLessEqual(self):
+    for v in self.float_values():
+      for w in self.float_values():
+        self.assertEqual(v <= w, bfloat16(v) <= bfloat16(w))
+
+  def testGreater(self):
+    for v in self.float_values():
+      for w in self.float_values():
+        self.assertEqual(v > w, bfloat16(v) > bfloat16(w))
+
+  def testGreaterEqual(self):
+    for v in self.float_values():
+      for w in self.float_values():
+        self.assertEqual(v >= w, bfloat16(v) >= bfloat16(w))
+
+  def testEqual(self):
+    for v in self.float_values():
+      for w in self.float_values():
+        self.assertEqual(v == w, bfloat16(v) == bfloat16(w))
+
+  def testNotEqual(self):
+    for v in self.float_values():
+      for w in self.float_values():
+        self.assertEqual(v != w, bfloat16(v) != bfloat16(w))
+
+  def testNan(self):
+    a = np.isnan(bfloat16(float("nan")))
+    self.assertTrue(a)
+    np.testing.assert_allclose(np.array([1.0, a]), np.array([1.0, a]))
+
+    a = np.array(
+        [bfloat16(1.34375),
+         bfloat16(1.4375),
+         bfloat16(float("nan"))],
+        dtype=dtypes.bfloat16.as_numpy_dtype)
+    b = np.array(
+        [bfloat16(1.3359375),
+         bfloat16(1.4375),
+         bfloat16(float("nan"))],
+        dtype=dtypes.bfloat16.as_numpy_dtype)
+    np.testing.assert_allclose(
+        a, b, rtol=0.1, atol=0.1, equal_nan=True, err_msg="", verbose=True)
+
+
+class Bfloat16NumPyTest(test.TestCase):
+
+  def testDtype(self):
+    self.assertEqual(bfloat16, np.dtype(bfloat16))
+
+  def testArray(self):
+    x = np.array([[1, 2, 3]], dtype=bfloat16)
+    self.assertEqual(bfloat16, x.dtype)
+    self.assertEqual("[[bfloat16(1) bfloat16(2) bfloat16(3)]]", str(x))
+    self.assertAllEqual(x, x)
+    self.assertAllClose(x, x)
+    self.assertTrue((x == x).all())
+
+  def testComparisons(self):
+    x = np.array([401408, 7, -32], dtype=np.float32)
+    bx = x.astype(bfloat16)
+    y = np.array([82432, 7, 0], dtype=np.float32)
+    by = y.astype(bfloat16)
+    self.assertAllEqual(x == y, bx == by)
+    self.assertAllEqual(x != y, bx != by)
+    self.assertAllEqual(x < y, bx < by)
+    self.assertAllEqual(x > y, bx > by)
+    self.assertAllEqual(x <= y, bx <= by)
+    self.assertAllEqual(x >= y, bx >= by)
+
+  def testEqual2(self):
+    a = np.array([401408], bfloat16)
+    b = np.array([82432], bfloat16)
+    self.assertFalse(a.__eq__(b))
+
+  def testCasts(self):
+    for dtype in [
+        np.float16, np.float32, np.float64, np.int32, np.int64,
+        np.complex64, np.complex128]:
+      x = np.array([[1, 2, 3]], dtype=dtype)
+      y = x.astype(bfloat16)
+      z = y.astype(dtype)
+      self.assertTrue(np.all(x == y))
+      self.assertEqual(bfloat16, y.dtype)
+      self.assertTrue(np.all(x == z))
+      self.assertEqual(dtype, z.dtype)
+
+  def testConformNumpyComplex(self):
+    for dtype in [np.complex64, np.complex128]:
+      x = np.array([1.1, 2.2 + 2.2j, 3.3], dtype=dtype)
+      y_np = x.astype(np.float32)
+      y_tf = x.astype(bfloat16)
+      self.assertAllClose(y_np, y_tf, atol=2e-2)
+
+      z_np = y_np.astype(dtype)
+      z_tf = y_tf.astype(dtype)
+      self.assertAllClose(z_np, z_tf, atol=2e-2)
+
+  def testAdd(self):
+    x = np.array([[1, 2, 3]], dtype=bfloat16)
+    y = np.array([[4, 5, 6]], dtype=bfloat16)
+    self.assertAllClose(np.array([[5, 7, 9]]), x + y)
+
+  def testLogSumExp(self):
+    x = np.array([[1, 2, 3]], dtype=np.float32)
+    y = np.array([[4, 5, 6]], dtype=np.float32)
+    self.assertAllClose(np.logaddexp(x, y),
+                        np.logaddexp(x.astype(bfloat16), y.astype(bfloat16)),
+                        atol=2e-2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/lib/core/ndarray_tensor.cc b/tensorflow/python/lib/core/ndarray_tensor.cc
index cf2c2e6eb00cccf82adf3c9eb65b685130a2f632..994af69386b278f6b88c051f898cd6a9dc607f3f 100644
--- a/tensorflow/python/lib/core/ndarray_tensor.cc
+++ b/tensorflow/python/lib/core/ndarray_tensor.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/python/lib/core/bfloat16.h"
 #include "tensorflow/python/lib/core/ndarray_tensor_bridge.h"
 
 namespace tensorflow {
@@ -125,6 +126,10 @@ Status PyArray_TYPE_to_TF_DataType(PyArrayObject* array,
       // custom struct type.
       return PyArrayDescr_to_TF_DataType(descr, out_tf_datatype);
     default:
+      if (pyarray_type == Bfloat16NumpyType()) {
+        *out_tf_datatype = TF_BFLOAT16;
+        break;
+      }
       // TODO(mrry): Support these.
       return errors::Internal("Unsupported feed type");
   }
diff --git a/tensorflow/python/lib/core/ndarray_tensor.h b/tensorflow/python/lib/core/ndarray_tensor.h
index 5172d504bd47d2f88afb088161d74a575a4213aa..b2cd4133ca65205ee432487e80430222064ef1a4 100644
--- a/tensorflow/python/lib/core/ndarray_tensor.h
+++ b/tensorflow/python/lib/core/ndarray_tensor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_PYTHON_LIB_CORE_NDARRAY_TENSOR_H_
-#define THIRD_PARTY_TENSORFLOW_PYTHON_LIB_CORE_NDARRAY_TENSOR_H_
+#ifndef TENSORFLOW_PYTHON_LIB_CORE_NDARRAY_TENSOR_H_
+#define TENSORFLOW_PYTHON_LIB_CORE_NDARRAY_TENSOR_H_
 
 // Must be included first.
 #include "tensorflow/python/lib/core/numpy.h"
@@ -45,4 +45,4 @@ Status TensorToNdarray(const Tensor& t, PyObject** ret);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_PYTHON_LIB_CORE_NDARRAY_TENSOR_H_
+#endif  // TENSORFLOW_PYTHON_LIB_CORE_NDARRAY_TENSOR_H_
diff --git a/tensorflow/python/lib/core/ndarray_tensor_bridge.cc b/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
index 82c45f5a315d485585b1514634201225f4123de1..65e2178cda498294ffc4a5066b5692132e86180f 100644
--- a/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
+++ b/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/python/lib/core/bfloat16.h"
 #include "tensorflow/python/lib/core/ndarray_tensor_bridge.h"
 
 namespace tensorflow {
@@ -175,7 +176,7 @@ Status TF_DataType_to_PyArray_TYPE(TF_DataType tf_datatype,
       *out_pyarray_type = NPY_INT32;
       break;
     case TF_BFLOAT16:
-      *out_pyarray_type = NPY_UINT16;
+      *out_pyarray_type = Bfloat16NumpyType();
       break;
     default:
       return errors::Internal("Tensorflow type ", tf_datatype,
diff --git a/tensorflow/python/lib/core/numpy.h b/tensorflow/python/lib/core/numpy.h
index 0eafe890dbafd065ece72482f4b0c0080ce458ef..25322b458b8475882830599dd4ae02f10d97094b 100644
--- a/tensorflow/python/lib/core/numpy.h
+++ b/tensorflow/python/lib/core/numpy.h
@@ -32,6 +32,7 @@ limitations under the License.
 #include <Python.h>
 
 #include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc
index 8bf831f8badf4f17dc74f0ec29f12e0acdac171e..e0422ef80add42307268be2743e668eb8c8acb68 100644
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@@ -18,15 +18,21 @@ limitations under the License.
 #include <array>
 
 #include "numpy/arrayobject.h"
+#include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/c/eager/c_api_internal.h"
+#include "tensorflow/c/tf_status_helper.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/python/eager/pywrap_tfe.h"
 #include "tensorflow/python/lib/core/ndarray_tensor_bridge.h"
+#include "tensorflow/python/lib/core/py_util.h"
+#include "tensorflow/python/lib/core/safe_ptr.h"
+
 #include <Python.h>
 
 namespace tensorflow {
@@ -48,6 +54,15 @@ struct PyCall {
   // with this "token".
   string token;
 
+  // The device on which Tensors are stored; only used for EagerPyFunc.
+  Device* device;
+
+  // True if and only if the op has been placed on a GPU.
+  bool gpu;
+
+  // True if the call is associated with an EagerPyFunc.
+  bool eager;
+
   // Inputs and outputs of this function invocation.
   std::vector<Tensor> ins;
   std::vector<Tensor> out;
@@ -55,21 +70,34 @@ struct PyCall {
 
 // Givens the 'call', prepares the token and inputs as a python tuple
 // that is appropriate for calling the trampoline.
-Status MakeArgTuple(PyCall* call, PyObject** tuple) {
+Status MakeArgTuple(const PyCall* call, PyObject** tuple) {
   int64 n = call->ins.size();
   PyObject* lst = PyList_New(n);
   CHECK(lst);
   for (int64 i = 0; i < n; ++i) {
+    PyObject* arg = nullptr;
     const Tensor& t = call->ins[i];
-    PyObject* a = nullptr;
-    Status s = ConvertTensorToNdarray(t, &a);
-    if (!s.ok()) {
-      Py_DECREF(lst);
-      return s;
+    if (call->eager) {
+      if (call->gpu) {
+        arg = EagerTensorFromHandle(new TFE_TensorHandle(t, call->device));
+      } else {
+        // TFE_TensorHandle assumes that CPU is identified by `nullptr`.
+        arg = EagerTensorFromHandle(new TFE_TensorHandle(t, nullptr));
+      }
+      if (arg == nullptr) {
+        return errors::Internal("Unable to procure EagerTensor from Tensor.");
+      }
+    } else {
+      Status s = ConvertTensorToNdarray(t, &arg);
+      if (!s.ok()) {
+        Py_DECREF(lst);
+        return s;
+      }
     }
-    PyList_SetItem(lst, i, a);
+    PyList_SetItem(lst, i, arg);
   }
-  *tuple = Py_BuildValue("(sN)", call->token.c_str(), lst);
+  *tuple = Py_BuildValue("(sON)", call->token.c_str(),
+                         call->gpu ? Py_True : Py_False, lst);
   CHECK(*tuple);
   return Status::OK();
 }
@@ -127,52 +155,17 @@ bool IsSingleNone(PyObject* obj) {
     return false;
   }
   std::array<npy_intp, 0> indices;
-  char* item_ptr = static_cast<char*>(PyArray_GetPtr(array_obj, indices.data()));
+  char* item_ptr =
+      static_cast<char*>(PyArray_GetPtr(array_obj, indices.data()));
   PyObject* item = PyArray_GETITEM(array_obj, item_ptr);
   CHECK(item);
   return item == Py_None;
 }
 
-// py.__class__.__name__
-const char* ClassName(PyObject* py) {
-/* PyPy doesn't have a separate C API for old-style classes. */
-#if PY_MAJOR_VERSION < 3 && !defined(PYPY_VERSION)
-  if (PyClass_Check(py))
-    return PyString_AS_STRING(
-        CHECK_NOTNULL(reinterpret_cast<PyClassObject*>(py)->cl_name));
-  if (PyInstance_Check(py))
-    return PyString_AS_STRING(CHECK_NOTNULL(
-        reinterpret_cast<PyInstanceObject*>(py)->in_class->cl_name));
-#endif
-  if (Py_TYPE(py) == &PyType_Type) {
-    return reinterpret_cast<PyTypeObject*>(py)->tp_name;
-  }
-  return Py_TYPE(py)->tp_name;
-}
-
-string PyExcFetch() {
-  CHECK(PyErr_Occurred()) << "Must only call PyExcFetch after an exception.";
-  PyObject* ptype;
-  PyObject* pvalue;
-  PyObject* ptraceback;
-  PyErr_Fetch(&ptype, &pvalue, &ptraceback);
-  PyErr_NormalizeException(&ptype, &pvalue, &ptraceback);
-  string err = ClassName(ptype);
-  if (pvalue) {
-    PyObject* str = PyObject_Str(pvalue);
-    if (str) {
-#if PY_MAJOR_VERSION < 3
-      strings::StrAppend(&err, ": ", PyString_AS_STRING(str));
-#else
-      strings::StrAppend(&err, ": ", PyUnicode_AsUTF8(str));
-#endif
-      Py_DECREF(str);
-    }
-    Py_DECREF(pvalue);
-  }
-  Py_DECREF(ptype);
-  Py_XDECREF(ptraceback);
-  return err;
+// Retrieves a Tensor from `eager_tensor` and stores it in `output_tensor`.
+void ExtractTensorFromEagerTensor(const PyObject* eager_tensor,
+                                  Tensor* output_tensor) {
+  *output_tensor = EagerTensor_Handle(eager_tensor)->t;
 }
 
 // Calls the registered py function through the trampoline.
@@ -195,18 +188,18 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) {
     if (PyErr_Occurred()) {
       if (PyErr_ExceptionMatches(PyExc_ValueError) ||
           PyErr_ExceptionMatches(PyExc_TypeError)) {
-        return errors::InvalidArgument(PyExcFetch());
+        return errors::InvalidArgument(PyExceptionFetch());
       } else if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
         *out_log_on_error = false;
-        return errors::OutOfRange(PyExcFetch());
+        return errors::OutOfRange(PyExceptionFetch());
       } else if (PyErr_ExceptionMatches(PyExc_MemoryError)) {
-        return errors::ResourceExhausted(PyExcFetch());
+        return errors::ResourceExhausted(PyExceptionFetch());
       } else if (PyErr_ExceptionMatches(PyExc_NotImplementedError)) {
-        return errors::Unimplemented(PyExcFetch());
+        return errors::Unimplemented(PyExceptionFetch());
       } else {
         // TODO(ebrevdo): Check if exception is an OpError and use the
         // OpError.error_code property to map it back in the Status.
-        return errors::Unknown(PyExcFetch());
+        return errors::Unknown(PyExceptionFetch());
       }
     } else {
       return errors::Internal("Failed to run py callback ", call->token,
@@ -214,21 +207,44 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) {
     }
   }
 
-  // Process the return values and converts them to tf Tensors.
-  Status s;
+  // Process the return values and convert them to TF Tensors.
+  Status s = Status::OK();
   if (PyList_Check(result)) {
-    // 'result' is a list.
+    // `result` is a Python list; if this operation is an `EagerPyFunc`, then
+    // every item in the list must be an `EagerTensor`; otherwise, every element
+    // must be a NumPy array.
     call->out.clear();
     for (int i = 0; i < PyList_Size(result); ++i) {
       Tensor t;
-      s = ConvertNdarrayToTensor(PyList_GetItem(result, i), &t);
+      if (call->eager) {
+        const PyObject* item = PyList_GetItem(result, i);
+        if (EagerTensor_CheckExact(item)) {
+          ExtractTensorFromEagerTensor(item, &t);
+        } else {
+          s = errors::FailedPrecondition(
+              "Expected EagerTensor, found PyObject of type: ",
+              Py_TYPE(item)->tp_name);
+        }
+      } else {
+        s = ConvertNdarrayToTensor(PyList_GetItem(result, i), &t);
+      }
+
       if (!s.ok()) {
         break;
       }
       call->out.push_back(t);
     }
+  } else if (EagerTensor_CheckExact(result) || result == Py_None) {
+    // result is an `EagerTensor` or `None`.
+    DCHECK(call->eager);
+    Tensor t;
+    if (result != Py_None) {
+      ExtractTensorFromEagerTensor(result, &t);
+      call->out.push_back(t);
+    }
   } else if (PyArray_Check(result)) {
-    // 'result' is a single ndarray.
+    // `result` is a NumPy array.
+    DCHECK(!call->eager);
     if (!IsSingleNone(result)) {
       Tensor t;
       s = ConvertNdarrayToTensor(result, &t);
@@ -237,7 +253,7 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) {
       }
     }
   } else {
-    s = errors::Internal("Unexpected pyobject is returned: ",
+    s = errors::Internal("Unexpected PyObject was returned: ",
                          Py_TYPE(result)->tp_name);
   }
   Py_DECREF(result);
@@ -301,13 +317,22 @@ Status ConvertNdarrayToTensor(PyObject* obj, Tensor* ret) {
         if (PyBytes_AsStringAndSize(input_data[i], &el, &el_size) == -1) {
 #if PY_MAJOR_VERSION >= 3
           el = PyUnicode_AsUTF8AndSize(input_data[i], &el_size);
-          if (!el) {
+#else
+          el = nullptr;
+          if (PyUnicode_Check(input_data[i])) {
+            PyObject* unicode = PyUnicode_AsUTF8String(input_data[i]);
+            if (unicode) {
+              if (PyString_AsStringAndSize(unicode, &el, &el_size) == -1) {
+                Py_DECREF(unicode);
+                el = nullptr;
+              }
+            }
+          }
 #endif
+          if (!el) {
             return errors::Unimplemented("Unsupported object type ",
                                          input_data[i]->ob_type->tp_name);
-#if PY_MAJOR_VERSION >= 3
           }
-#endif
         }
         tflat(i) = string(el, el_size);
       }
@@ -417,11 +442,25 @@ class PyFuncOp : public OpKernel {
  public:
   explicit PyFuncOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("token", &token_));
+    eager_ = type_string() == "EagerPyFunc";
+    gpu_ = ctx->device_type().type_string() == DEVICE_GPU;
   }
 
   void Compute(OpKernelContext* ctx) override {
     PyCall call;
     call.token = token_;
+    call.gpu = gpu_;
+    call.eager = eager_;
+    if (call.eager) {
+      // Eager's C API uses `Device`, whereas `OpKernelContext` stores a
+      // `DeviceBase`; attempt to downcast.
+      call.device = dynamic_cast<Device*>(ctx->device());
+      if (call.device == nullptr) {
+        ctx->CtxFailureWithWarning(
+            errors::Internal("Unrecognized device class"));
+      }
+    }
+
     for (int i = 0; i < ctx->num_inputs(); ++i) {
       call.ins.push_back(ctx->input(i));
     }
@@ -430,6 +469,9 @@ class PyFuncOp : public OpKernel {
     py_threadstate = PyGILState_Ensure();
     bool log_on_error;
     Status s = DoCallPyFunc(&call, &log_on_error);
+    // Sometimes py_funcs can be called without a session and leak memory. This
+    // ensures we clear the decref cache so this doesn't happen.
+    ClearDecrefCache();
     PyGILState_Release(py_threadstate);
 
     // Ensures that GIL is released even when !s.ok().
@@ -460,9 +502,19 @@ class PyFuncOp : public OpKernel {
  private:
   string token_;
 
+  // True if and only if this op has been placed on a GPU.
+  bool gpu_;
+
+  // True if and only if this op should execute the python function eagerly,
+  // i.e., if and only if the eager attribute is set.
+  bool eager_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(PyFuncOp);
 };
+
 REGISTER_KERNEL_BUILDER(Name("PyFunc").Device(DEVICE_CPU), PyFuncOp);
 REGISTER_KERNEL_BUILDER(Name("PyFuncStateless").Device(DEVICE_CPU), PyFuncOp);
+REGISTER_KERNEL_BUILDER(Name("EagerPyFunc").Device(DEVICE_CPU), PyFuncOp);
+REGISTER_KERNEL_BUILDER(Name("EagerPyFunc").Device(DEVICE_GPU), PyFuncOp);
 
 }  // end namespace tensorflow
diff --git a/tensorflow/python/lib/core/py_func.h b/tensorflow/python/lib/core/py_func.h
index 5a451d5f43285d19dff6c158ebc28045b3ff13d4..3197a7ddfa0ce3db9f8244215690e5ede5096ac2 100644
--- a/tensorflow/python/lib/core/py_func.h
+++ b/tensorflow/python/lib/core/py_func.h
@@ -24,21 +24,27 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Called by py code on initialization.
+// Called by python code on initialization.
 //
 // "trampoline" must represent a python function which has the
 // following signature:
-//   (string, list(ndarray)) -> ndarray | list(ndarray) | python scalar
+//   (string, list(ndarray)) | (string, list(EagerTensor)) ->
+//     ndarray | list(ndarray) | python scalar |
+//     EagerTensor | list(EagerTensor) | None
 //
 // The trampoline takes two arguments, the first is a string token
 // used by the python frontend's dispatching logic; the second is a
-// list of numpy ndarrays.
+// list of numpy ndarrays or EagerTensor objects. It can return a
+// single numpy ndarray, a list of numpy ndarrays, a python scalar, an
+// EagerTensor, a list of EagerTensors, or None.
 //
-// The trampoline can return a single numpy ndarray, a list of numpy
-// ndarrays, or a simply python scalar. The C++ runtime converts them,
-// if supported, back to Tensor objects.
+// PyFunc requires inputs and outputs to be ndarrays. EagerPyFunc requires
+// inputs to be a list of EagerTensors and outputs to be an EagerTensor, a list
+// of EagerTensors, or None.
 //
-// This is called by script_ops.py during its module initialization.
+// The C++ runtime converts outputs back to Tensor objects.
+//
+// This function is called by script_ops.py during its module initialization.
 //
 // TODO(zhifengc): Support distributed runtime.
 void InitializePyTrampoline(PyObject* trampoline);
diff --git a/tensorflow/python/lib/core/py_seq_tensor.cc b/tensorflow/python/lib/core/py_seq_tensor.cc
index 71cb38f8fd24beeb9efe149a6bd39e0ef2031051..317bdc2e14747583f372808f48a5928273f5570a 100644
--- a/tensorflow/python/lib/core/py_seq_tensor.cc
+++ b/tensorflow/python/lib/core/py_seq_tensor.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/python/lib/core/numpy.h"
+#include "tensorflow/python/lib/core/py_util.h"
 #include "tensorflow/python/lib/core/safe_ptr.h"
 
 namespace tensorflow {
@@ -89,12 +90,25 @@ Status InferShapeAndType(PyObject* obj, TensorShape* shape, DataType* dtype) {
       *dtype = DT_STRING;
     } else if (PySequence_Check(obj)) {
       auto length = PySequence_Length(obj);
-      shape->AddDim(length);
       if (length > 0) {
+        shape->AddDim(length);
         obj = PySequence_GetItem(obj, 0);
         continue;
-      } else {
+      } else if (length == 0) {
+        shape->AddDim(length);
         *dtype = DT_INVALID;  // Invalid dtype for empty tensors.
+      } else {
+        // The sequence does not have a valid length (PySequence_Length < 0).
+        if (PyErr_Occurred()) {
+          // PySequence_Length failed and set an exception. Fetch the message
+          // and convert it to a failed status.
+          return errors::InvalidArgument(PyExceptionFetch());
+        } else {
+          // This is almost certainly dead code: PySequence_Length failed but
+          // did not set an exception.
+          return errors::InvalidArgument(
+              "Attempted to convert an invalid sequence to a Tensor.");
+        }
       }
     } else if (IsPyFloat(obj)) {
       *dtype = DT_DOUBLE;
diff --git a/tensorflow/python/lib/core/py_seq_tensor.h b/tensorflow/python/lib/core/py_seq_tensor.h
index 6dc4d9c77755bd416fe709ad7a4bf350799f3eb1..c6e5080c62e96e79ca1ccf7e09e1b744ed293e07 100644
--- a/tensorflow/python/lib/core/py_seq_tensor.h
+++ b/tensorflow/python/lib/core/py_seq_tensor.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_PYTHON_LIB_CORE_PY_SEQ_TENSOR_H_
-#define THIRD_PARTY_TENSORFLOW_PYTHON_LIB_CORE_PY_SEQ_TENSOR_H_
+#ifndef TENSORFLOW_PYTHON_LIB_CORE_PY_SEQ_TENSOR_H_
+#define TENSORFLOW_PYTHON_LIB_CORE_PY_SEQ_TENSOR_H_
 
 #include <Python.h>
 
@@ -34,4 +34,4 @@ Status PySeqToTensor(PyObject* obj, PyObject* dtype, Tensor* ret);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_PYTHON_LIB_CORE_PY_SEQ_TENSOR_H_
+#endif  // TENSORFLOW_PYTHON_LIB_CORE_PY_SEQ_TENSOR_H_
diff --git a/tensorflow/python/lib/core/py_util.cc b/tensorflow/python/lib/core/py_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2635694e23c07dd8e75d4bb0cfb9e83a2042d921
--- /dev/null
+++ b/tensorflow/python/lib/core/py_util.cc
@@ -0,0 +1,70 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/python/lib/core/py_util.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include <Python.h>
+
+namespace tensorflow {
+namespace {
+
+// py.__class__.__name__
+const char* ClassName(PyObject* py) {
+/* PyPy doesn't have a separate C API for old-style classes. */
+#if PY_MAJOR_VERSION < 3 && !defined(PYPY_VERSION)
+  if (PyClass_Check(py))
+    return PyString_AS_STRING(
+        CHECK_NOTNULL(reinterpret_cast<PyClassObject*>(py)->cl_name));
+  if (PyInstance_Check(py))
+    return PyString_AS_STRING(CHECK_NOTNULL(
+        reinterpret_cast<PyInstanceObject*>(py)->in_class->cl_name));
+#endif
+  if (Py_TYPE(py) == &PyType_Type) {
+    return reinterpret_cast<PyTypeObject*>(py)->tp_name;
+  }
+  return Py_TYPE(py)->tp_name;
+}
+
+}  // end namespace
+
+string PyExceptionFetch() {
+  CHECK(PyErr_Occurred())
+      << "Must only call PyExceptionFetch after an exception.";
+  PyObject* ptype;
+  PyObject* pvalue;
+  PyObject* ptraceback;
+  PyErr_Fetch(&ptype, &pvalue, &ptraceback);
+  PyErr_NormalizeException(&ptype, &pvalue, &ptraceback);
+  string err = ClassName(ptype);
+  if (pvalue) {
+    PyObject* str = PyObject_Str(pvalue);
+    if (str) {
+#if PY_MAJOR_VERSION < 3
+      strings::StrAppend(&err, ": ", PyString_AS_STRING(str));
+#else
+      strings::StrAppend(&err, ": ", PyUnicode_AsUTF8(str));
+#endif
+      Py_DECREF(str);
+    }
+    Py_DECREF(pvalue);
+  }
+  Py_DECREF(ptype);
+  Py_XDECREF(ptraceback);
+  return err;
+}
+
+}  // end namespace tensorflow
diff --git a/tensorflow/python/lib/core/py_util.h b/tensorflow/python/lib/core/py_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..44dfe7ba21285d06667a8d0f6ab8ac0ec8f2aa00
--- /dev/null
+++ b/tensorflow/python/lib/core/py_util.h
@@ -0,0 +1,27 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_PYTHON_LIB_CORE_UTIL_H_
+#define TENSORFLOW_PYTHON_LIB_CORE_UTIL_H_
+
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+// Fetch the exception message as a string. An exception must be set
+// (PyErr_Occurred() must be true).
+string PyExceptionFetch();
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_PYTHON_LIB_CORE_UTIL_H_
diff --git a/tensorflow/python/lib/core/safe_ptr.cc b/tensorflow/python/lib/core/safe_ptr.cc
index 456ea3348baa634075082fedde9dac175e237997..ce34b6d0041878c4122d36ab8bf9db6c17253680 100644
--- a/tensorflow/python/lib/core/safe_ptr.cc
+++ b/tensorflow/python/lib/core/safe_ptr.cc
@@ -16,25 +16,21 @@ limitations under the License.
 #include "tensorflow/python/lib/core/safe_ptr.h"
 
 namespace tensorflow {
-namespace {
 
-inline void Py_DECREF_wrapper(PyObject* o) { Py_DECREF(o); }
-
-}  // namespace
-
-Safe_PyObjectPtr make_safe(PyObject* o) {
-  return Safe_PyObjectPtr(o, Py_DECREF_wrapper);
+Safe_PyObjectPtr make_safe(PyObject* object) {
+  return Safe_PyObjectPtr(object);
 }
 
 Safe_TF_TensorPtr make_safe(TF_Tensor* tensor) {
-  return Safe_TF_TensorPtr(tensor, TF_DeleteTensor);
+  return Safe_TF_TensorPtr(tensor);
 }
 
 Safe_TFE_TensorHandlePtr make_safe(TFE_TensorHandle* handle) {
-  return Safe_TFE_TensorHandlePtr(handle, TFE_DeleteTensorHandle);
+  return Safe_TFE_TensorHandlePtr(handle);
 }
 
 Safe_TF_StatusPtr make_safe(TF_Status* status) {
-  return Safe_TF_StatusPtr(status, TF_DeleteStatus);
+  return Safe_TF_StatusPtr(status);
 }
+
 }  // namespace tensorflow
diff --git a/tensorflow/python/lib/core/safe_ptr.h b/tensorflow/python/lib/core/safe_ptr.h
index 70cd2fdf6ccf4d722892f06e1e3aa40919b63ac7..32d286888666bde8742403bb8e231b3d6d4bf695 100644
--- a/tensorflow/python/lib/core/safe_ptr.h
+++ b/tensorflow/python/lib/core/safe_ptr.h
@@ -13,45 +13,57 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_PYTHON_LIB_CORE_SAFE_PTR_H_
-#define THIRD_PARTY_TENSORFLOW_PYTHON_LIB_CORE_SAFE_PTR_H_
+#ifndef TENSORFLOW_PYTHON_LIB_CORE_SAFE_PTR_H_
+#define TENSORFLOW_PYTHON_LIB_CORE_SAFE_PTR_H_
 
 #include <memory>
-#include <Python.h>
 
+#include <Python.h>
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/c/eager/c_api.h"
 
 namespace tensorflow {
+namespace detail {
+
+struct PyDecrefDeleter {
+  void operator()(PyObject* p) const { Py_DECREF(p); }
+};
+
+struct TFTensorDeleter {
+  void operator()(TF_Tensor* p) const { TF_DeleteTensor(p); }
+};
+
+struct TFETensorHandleDeleter {
+  void operator()(TFE_TensorHandle* p) const { TFE_DeleteTensorHandle(p); }
+};
+
+struct TFStatusDeleter {
+  void operator()(TF_Status* p) const { TF_DeleteStatus(p); }
+};
+
+}  // namespace detail
 
 // Safe container for an owned PyObject. On destruction, the reference count of
 // the contained object will be decremented.
-typedef void (*Py_DECREF_wrapper_type)(PyObject*);
-typedef std::unique_ptr<PyObject, Py_DECREF_wrapper_type> Safe_PyObjectPtr;
+using Safe_PyObjectPtr = std::unique_ptr<PyObject, detail::PyDecrefDeleter>;
 Safe_PyObjectPtr make_safe(PyObject* o);
 
 // Safe containers for an owned TF_Tensor. On destruction, the tensor will be
 // deleted by TF_DeleteTensor.
-// Note: can't use decltype(&TF_DeleteTensor) due to SWIG
-typedef void (*TF_DeleteTensor_type)(TF_Tensor*);
-typedef std::unique_ptr<TF_Tensor, TF_DeleteTensor_type> Safe_TF_TensorPtr;
+using Safe_TF_TensorPtr = std::unique_ptr<TF_Tensor, detail::TFTensorDeleter>;
 Safe_TF_TensorPtr make_safe(TF_Tensor* tensor);
 
 // Safe containers for an owned TFE_TensorHandle. On destruction, the handle
-// will be deleted by TFE_DeleteTensorHandle. Note: can't use
-// decltype(&TFE_DeleteTensorHandle) due to SWIG
-typedef void (*TFE_DeleteTensorHandle_type)(TFE_TensorHandle*);
-typedef std::unique_ptr<TFE_TensorHandle, TFE_DeleteTensorHandle_type>
-    Safe_TFE_TensorHandlePtr;
+// will be deleted by TFE_DeleteTensorHandle.
+using Safe_TFE_TensorHandlePtr =
+    std::unique_ptr<TFE_TensorHandle, detail::TFETensorHandleDeleter>;
 Safe_TFE_TensorHandlePtr make_safe(TFE_TensorHandle* handle);
 
 // Safe containers for an owned TF_Status. On destruction, the handle
-// will be deleted by TF_DeleteStatus. Note: can't use
-// decltype(&TF_DeleteStatus) due to SWIG
-typedef void (*TF_DeleteStatus_type)(TF_Status*);
-typedef std::unique_ptr<TF_Status, TF_DeleteStatus_type> Safe_TF_StatusPtr;
+// will be deleted by TF_DeleteStatus.
+using Safe_TF_StatusPtr = std::unique_ptr<TF_Status, detail::TFStatusDeleter>;
 Safe_TF_StatusPtr make_safe(TF_Status* status);
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_PYTHON_LIB_CORE_SAFE_PTR_H_
+#endif  // TENSORFLOW_PYTHON_LIB_CORE_SAFE_PTR_H_
diff --git a/tensorflow/python/lib/io/file_io.i b/tensorflow/python/lib/io/file_io.i
index c0c4e035fc3d6a50334acb9228c13c702ef426c0..891a7b0fd0dc177f5ee439707c9e2c99148e177c 100644
--- a/tensorflow/python/lib/io/file_io.i
+++ b/tensorflow/python/lib/io/file_io.i
@@ -110,21 +110,15 @@ void RecursivelyCreateDir(const string& dirname, TF_Status* out_status) {
   }
 }
 
-void CopyFile(const string& oldpath, const string& newpath, bool overwrite,
+void CopyFile(const string& src, const string& target, bool overwrite,
               TF_Status* out_status) {
-  // If overwrite is false and the newpath file exists then it's an error.
-  if (!overwrite && tensorflow::Env::Default()->FileExists(newpath).ok()) {
+  // If overwrite is false and the target file exists then its an error.
+  if (!overwrite && tensorflow::Env::Default()->FileExists(target).ok()) {
     TF_SetStatus(out_status, TF_ALREADY_EXISTS, "file already exists");
     return;
   }
-  string file_content;
-  tensorflow::Status status = ReadFileToString(tensorflow::Env::Default(),
-      oldpath, &file_content);
-  if (!status.ok()) {
-    Set_TF_Status_from_Status(out_status, status);
-    return;
-  }
-  status = WriteStringToFile(tensorflow::Env::Default(), newpath, file_content);
+  tensorflow::Status status =
+      tensorflow::Env::Default()->CopyFile(src, target);
   if (!status.ok()) {
     Set_TF_Status_from_Status(out_status, status);
   }
diff --git a/tensorflow/python/lib/io/file_io.py b/tensorflow/python/lib/io/file_io.py
index 4e3071d8513a28b02b70b290c4987bec92b3c32e..59f5075f177ef5335115cb4f24182d28a9b547c8 100644
--- a/tensorflow/python/lib/io/file_io.py
+++ b/tensorflow/python/lib/io/file_io.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
+from tensorflow.python.util.tf_export import tf_export
 
 
 class FileIO(object):
@@ -235,6 +236,7 @@ class FileIO(object):
     self._writable_file = None
 
 
+@tf_export("gfile.Exists")
 def file_exists(filename):
   """Determines whether a path exists or not.
 
@@ -256,6 +258,7 @@ def file_exists(filename):
   return True
 
 
+@tf_export("gfile.Remove")
 def delete_file(filename):
   """Deletes the file located at 'filename'.
 
@@ -306,6 +309,7 @@ def write_string_to_file(filename, file_content):
     f.write(file_content)
 
 
+@tf_export("gfile.Glob")
 def get_matching_files(filename):
   """Returns a list of files that match the given pattern(s).
 
@@ -336,6 +340,7 @@ def get_matching_files(filename):
       ]
 
 
+@tf_export("gfile.MkDir")
 def create_dir(dirname):
   """Creates a directory with the name 'dirname'.
 
@@ -353,6 +358,7 @@ def create_dir(dirname):
     pywrap_tensorflow.CreateDir(compat.as_bytes(dirname), status)
 
 
+@tf_export("gfile.MakeDirs")
 def recursive_create_dir(dirname):
   """Creates a directory and all parent/intermediate directories.
 
@@ -368,6 +374,7 @@ def recursive_create_dir(dirname):
     pywrap_tensorflow.RecursivelyCreateDir(compat.as_bytes(dirname), status)
 
 
+@tf_export("gfile.Copy")
 def copy(oldpath, newpath, overwrite=False):
   """Copies data from oldpath to newpath.
 
@@ -385,6 +392,7 @@ def copy(oldpath, newpath, overwrite=False):
         compat.as_bytes(oldpath), compat.as_bytes(newpath), overwrite, status)
 
 
+@tf_export("gfile.Rename")
 def rename(oldname, newname, overwrite=False):
   """Rename or move a file / directory.
 
@@ -426,6 +434,7 @@ def atomic_write_string_to_file(filename, contents, overwrite=True):
     raise
 
 
+@tf_export("gfile.DeleteRecursively")
 def delete_recursively(dirname):
   """Deletes everything under dirname recursively.
 
@@ -439,6 +448,7 @@ def delete_recursively(dirname):
     pywrap_tensorflow.DeleteRecursively(compat.as_bytes(dirname), status)
 
 
+@tf_export("gfile.IsDirectory")
 def is_directory(dirname):
   """Returns whether the path is a directory or not.
 
@@ -452,6 +462,7 @@ def is_directory(dirname):
   return pywrap_tensorflow.IsDirectory(compat.as_bytes(dirname), status)
 
 
+@tf_export("gfile.ListDirectory")
 def list_directory(dirname):
   """Returns a list of entries contained within a directory.
 
@@ -479,6 +490,7 @@ def list_directory(dirname):
     ]
 
 
+@tf_export("gfile.Walk")
 def walk(top, in_order=True):
   """Recursive directory tree generator for directories.
 
@@ -522,6 +534,7 @@ def walk(top, in_order=True):
     yield here
 
 
+@tf_export("gfile.Stat")
 def stat(filename):
   """Returns file statistics for a given path.
 
diff --git a/tensorflow/python/lib/io/tf_record.py b/tensorflow/python/lib/io/tf_record.py
index df190100689bd864de78f5a2cf52b1ade081a789..48ea107a146c2714f7b59f53abbcd8b60dbf2fd4 100644
--- a/tensorflow/python/lib/io/tf_record.py
+++ b/tensorflow/python/lib/io/tf_record.py
@@ -22,8 +22,10 @@ from __future__ import print_function
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("python_io.TFRecordCompressionType")
 class TFRecordCompressionType(object):
   """The type of compression for the record."""
   NONE = 0
@@ -33,6 +35,7 @@ class TFRecordCompressionType(object):
 
 # NOTE(vrv): This will eventually be converted into a proto.  to match
 # the interface used by the C++ RecordWriter.
+@tf_export("python_io.TFRecordOptions")
 class TFRecordOptions(object):
   """Options used for manipulating TFRecord files."""
   compression_type_map = {
@@ -51,6 +54,7 @@ class TFRecordOptions(object):
     return cls.compression_type_map[options.compression_type]
 
 
+@tf_export("python_io.tf_record_iterator")
 def tf_record_iterator(path, options=None):
   """An iterator that read the records from a TFRecords file.
 
@@ -81,6 +85,7 @@ def tf_record_iterator(path, options=None):
   reader.Close()
 
 
+@tf_export("python_io.TFRecordWriter")
 class TFRecordWriter(object):
   """A class to write records to a TFRecords file.
 
diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index 87f8d1486011683c89095aeb04e2d01461f83749..9745d38dc23dba806a2d0dd2ef588a5a950aa05c 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Gradients for operators defined in array_ops.py."""
 
 from __future__ import absolute_import
@@ -28,6 +27,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
@@ -116,6 +116,19 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
                                                         non_neg_concat_dim)
       out_grads = array_ops.split(grad, sizes, non_neg_concat_dim)
     else:
+      if constant_op.is_constant(concat_dim):
+        # If concat_dim is a constant defined in a different context,
+        # then we duplicate it in the current context to avoid passing it
+        # through an Enter node.
+        # This is a small optimization in general, but it is required when
+        # compiling with XLA, as XLA needs the concat input to be folded into a
+        # constant.
+        grad_context = control_flow_util.GetOutputContext(grad.op)
+        dim_context = control_flow_util.GetOutputContext(concat_dim.op)
+        if dim_context != grad_context:
+          value = tensor_util.constant_value(concat_dim)
+          concat_dim = constant_op.constant(value=value, dtype=concat_dim.dtype)
+
       # Using mod here for convenience since concat_dim is already verified
       # in concat implementation to be within the allowed [-rank, rank) range.
       non_neg_concat_dim = concat_dim % array_ops.rank(input_values[0])
@@ -131,8 +144,8 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
         # extract the size of each input along the concat dimension
         sizes = array_ops.squeeze(
             array_ops.slice(
-                array_ops.stack(
-                    sizes, axis=1), [non_neg_concat_dim, 0], [1, -1]))
+                array_ops.stack(sizes, axis=1), [non_neg_concat_dim, 0],
+                [1, -1]))
         out_grads = array_ops.split(grad, sizes, non_neg_concat_dim)
       else:
         offset = gen_array_ops._concat_offset(non_neg_concat_dim, sizes)
@@ -167,8 +180,7 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
         new_values = array_ops.slice(
             grad.values, begin,
             array_ops.concat([[-1], array_ops.slice(size, [1], [-1])], 0))
-        out_grads.append(
-            ops.IndexedSlices(new_values, grad.indices, size))
+        out_grads.append(ops.IndexedSlices(new_values, grad.indices, size))
         # Lint complains begin = begin + ...
         begin = math_ops.add(begin, size * mask)
     else:
@@ -178,30 +190,33 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
       for size in sizes:
         size_concat_dim = array_ops.gather(size, non_neg_concat_dim)
         if size_concat_dim.dtype != grad.indices.dtype:
-          size_concat_dim = math_ops.cast(size_concat_dim,
-                                          dtype=grad.indices.dtype)
+          size_concat_dim = math_ops.cast(
+              size_concat_dim, dtype=grad.indices.dtype)
         end = start + size_concat_dim
         # Compute the 1-D Tensor of indices relevant for this input.
         indices_to_select = array_ops.squeeze(
-            array_ops.where(math_ops.logical_and(grad.indices >= start,
-                                                 grad.indices < end)),
+            array_ops.where(
+                math_ops.logical_and(grad.indices >= start,
+                                     grad.indices < end)),
             squeeze_dims=[1])
         new_indices = array_ops.gather(grad.indices, indices_to_select) - start
         new_values = array_ops.gather(grad.values, indices_to_select)
-        out_grads.append(
-            ops.IndexedSlices(new_values, new_indices, size))
+        out_grads.append(ops.IndexedSlices(new_values, new_indices, size))
         start = end
   else:
     raise TypeError("Expected Tensor or IndexedSlices, got %s" % type(grad))
 
-  return (out_grads + [None] if end_value_index <= dim_index
-          else [None] + out_grads)
+  return (out_grads + [None]
+          if end_value_index <= dim_index else [None] + out_grads)
 
 
 @ops.RegisterGradient("Concat")
 def _ConcatGrad(op, grad):
   return _ConcatGradHelper(
-      op, grad, start_value_index=1, end_value_index=len(op.inputs),
+      op,
+      grad,
+      start_value_index=1,
+      end_value_index=len(op.inputs),
       dim_index=0)
 
 
@@ -287,9 +302,13 @@ def _SplitGrad(op, *grads):
 @ops.RegisterGradient("SplitV")
 def _SplitVGrad(op, *grads):
   returnval = array_ops.concat(list(grads), op.inputs[2])
-  returnval = [returnval] + [None,] * (len(op.inputs) - 1)
+  returnval = [returnval] + [
+      None,
+  ] * (
+      len(op.inputs) - 1)
   return returnval
 
+
 ops.NotDifferentiable("Const")
 
 
@@ -334,9 +353,9 @@ def _MatrixSetDiagGrad(op, grad):
       matrix_shape = array_ops.slice(grad_shape, [grad_rank - 2], [2])
       min_dim = math_ops.reduce_min(matrix_shape)
       diag_shape = array_ops.concat([batch_shape, [min_dim]], 0)
-  grad_input = array_ops.matrix_set_diag(
-      grad, array_ops.zeros(
-          diag_shape, dtype=grad.dtype))
+  grad_input = array_ops.matrix_set_diag(grad,
+                                         array_ops.zeros(
+                                             diag_shape, dtype=grad.dtype))
   grad_diag = array_ops.matrix_diag_part(grad)
   return (grad_input, grad_diag)
 
@@ -444,8 +463,8 @@ def _GatherV2Grad(op, grad):
   values_transpose = array_ops.transpose(values, transpose_dims)
   num_segments = params_shape[axis]
 
-  params_grad = math_ops.unsorted_segment_sum(
-      values_transpose, indices, num_segments)
+  params_grad = math_ops.unsorted_segment_sum(values_transpose, indices,
+                                              num_segments)
 
   # Inverts the above transpose by moving dimension 0 back to its original
   # position.
@@ -524,15 +543,22 @@ def _TransposeGrad(op, grad):
   return [array_ops.transpose(grad, array_ops.invert_permutation(p)), None]
 
 
-ops.NotDifferentiable("Shape")
+@ops.RegisterGradient("ConjugateTranspose")
+def _ConjugateTransposeGrad(op, grad):
+  """Returns conj(unshuffle(grad))."""
+  p = op.inputs[1]
+  return [
+      array_ops.transpose(
+          grad, array_ops.invert_permutation(p), conjugate=True), None
+  ]
 
 
-ops.NotDifferentiable("ShapeN")
+ops.NotDifferentiable("Shape")
 
+ops.NotDifferentiable("ShapeN")
 
 ops.NotDifferentiable("Rank")
 
-
 ops.NotDifferentiable("Size")
 
 
@@ -580,6 +606,7 @@ def _PadGrad(op, grad):
   else:
     return x_grad, None
 
+
 ops.RegisterGradient("Pad")(_PadGrad)
 ops.RegisterGradient("PadV2")(_PadGrad)
 
@@ -615,30 +642,34 @@ def _ReverseV2Grad(op, grad):
 def _SpaceToBatchGrad(op, grad):
   # Its gradient is the opposite op: BatchToSpace.
   block_size = op.get_attr("block_size")
-  return [array_ops.batch_to_space(grad, op.inputs[1], block_size=block_size),
-          None]
+  return [
+      array_ops.batch_to_space(grad, op.inputs[1], block_size=block_size), None
+  ]
 
 
 @ops.RegisterGradient("SpaceToBatchND")
 def _SpaceToBatchNDGrad(op, grad):
   # Its gradient is the opposite op: BatchToSpaceND.
-  return [array_ops.batch_to_space_nd(grad, op.inputs[1], op.inputs[2]),
-          None, None]
+  return [
+      array_ops.batch_to_space_nd(grad, op.inputs[1], op.inputs[2]), None, None
+  ]
 
 
 @ops.RegisterGradient("BatchToSpace")
 def _BatchToSpaceGrad(op, grad):
   # Its gradient is the opposite op: SpaceToBatch.
   block_size = op.get_attr("block_size")
-  return [array_ops.space_to_batch(grad, op.inputs[1], block_size=block_size),
-          None]
+  return [
+      array_ops.space_to_batch(grad, op.inputs[1], block_size=block_size), None
+  ]
 
 
 @ops.RegisterGradient("BatchToSpaceND")
 def _BatchToSpaceNDGrad(op, grad):
   # Its gradient is the opposite op: SpaceToBatchND.
-  return [array_ops.space_to_batch_nd(grad, op.inputs[1], op.inputs[2]),
-          None, None]
+  return [
+      array_ops.space_to_batch_nd(grad, op.inputs[1], op.inputs[2]), None, None
+  ]
 
 
 @ops.RegisterGradient("SpaceToDepth")
@@ -702,30 +733,28 @@ def _QuantizeAndDequantizeV3Grad(_, grad):
 def _ExtractImagePatchesGrad(op, grad):
 
   batch_size, rows_in, cols_in, channels = [
-    dim.value for dim in op.inputs[0].get_shape()
+      dim.value for dim in op.inputs[0].get_shape()
   ]
   input_bhwc = array_ops.shape(op.inputs[0])
   batch_size = input_bhwc[0]
   channels = input_bhwc[3]
 
-  _, rows_out, cols_out, _ = [
-    dim.value for dim in op.outputs[0].get_shape()
-  ]
-  _, ksize_r, ksize_c, _ = op.get_attr('ksizes')
-  _, stride_r, stride_h, _ = op.get_attr('strides')
-  _, rate_r, rate_c, _ = op.get_attr('rates')
-  padding = op.get_attr('padding')
+  _, rows_out, cols_out, _ = [dim.value for dim in op.outputs[0].get_shape()]
+  _, ksize_r, ksize_c, _ = op.get_attr("ksizes")
+  _, stride_r, stride_h, _ = op.get_attr("strides")
+  _, rate_r, rate_c, _ = op.get_attr("rates")
+  padding = op.get_attr("padding")
 
   ksize_r_eff = ksize_r + (ksize_r - 1) * (rate_r - 1)
   ksize_c_eff = ksize_c + (ksize_c - 1) * (rate_c - 1)
 
-  if padding == b'SAME':
+  if padding == b"SAME":
     rows_out = int(ceil(rows_in / stride_r))
     cols_out = int(ceil(cols_in / stride_h))
     pad_rows = ((rows_out - 1) * stride_r + ksize_r_eff - rows_in) // 2
     pad_cols = ((cols_out - 1) * stride_h + ksize_c_eff - cols_in) // 2
 
-  elif padding == b'VALID':
+  elif padding == b"VALID":
     rows_out = int(ceil((rows_in - ksize_r_eff + 1) / stride_r))
     cols_out = int(ceil((cols_in - ksize_c_eff + 1) / stride_h))
     pad_rows = (rows_out - 1) * stride_r + ksize_r_eff - rows_in
@@ -734,10 +763,9 @@ def _ExtractImagePatchesGrad(op, grad):
   pad_rows, pad_cols = max(0, pad_rows), max(0, pad_cols)
 
   grad_expanded = array_ops.transpose(
-    array_ops.reshape(grad, (batch_size, rows_out,
-                             cols_out, ksize_r, ksize_c, channels)),
-    (1, 2, 3, 4, 0, 5)
-  )
+      array_ops.reshape(
+          grad, (batch_size, rows_out, cols_out, ksize_r, ksize_c, channels)),
+      (1, 2, 3, 4, 0, 5))
   grad_flat = array_ops.reshape(grad_expanded, (-1, batch_size * channels))
 
   row_steps = range(0, rows_out * stride_r, stride_r)
@@ -749,29 +777,21 @@ def _ExtractImagePatchesGrad(op, grad):
       r_low, c_low = row_steps[i] - pad_rows, col_steps[j] - pad_cols
       r_high, c_high = r_low + ksize_r_eff, c_low + ksize_c_eff
 
-      idx.extend([(r * (cols_in) + c,
-                   i * (cols_out * ksize_r * ksize_c) +
-                   j * (ksize_r * ksize_c) +
-                   ri * (ksize_c) + ci)
+      idx.extend([(r * (cols_in) + c, i * (cols_out * ksize_r * ksize_c) + j *
+                   (ksize_r * ksize_c) + ri * (ksize_c) + ci)
                   for (ri, r) in enumerate(range(r_low, r_high, rate_r))
                   for (ci, c) in enumerate(range(c_low, c_high, rate_c))
-                  if 0 <= r and r < rows_in and 0 <= c and c < cols_in
-      ])
+                  if 0 <= r and r < rows_in and 0 <= c and c < cols_in])
 
-  sp_shape = (rows_in * cols_in,
-              rows_out * cols_out * ksize_r * ksize_c)
+  sp_shape = (rows_in * cols_in, rows_out * cols_out * ksize_r * ksize_c)
 
   sp_mat = sparse_tensor.SparseTensor(
-    array_ops.constant(idx, dtype=ops.dtypes.int64),
-    array_ops.ones((len(idx),), dtype=ops.dtypes.float32),
-    sp_shape
-  )
+      array_ops.constant(idx, dtype=ops.dtypes.int64),
+      array_ops.ones((len(idx),), dtype=ops.dtypes.float32), sp_shape)
 
   jac = sparse_ops.sparse_tensor_dense_matmul(sp_mat, grad_flat)
 
-  grad_out = array_ops.reshape(
-    jac, (rows_in, cols_in, batch_size, channels)
-  )
+  grad_out = array_ops.reshape(jac, (rows_in, cols_in, batch_size, channels))
   grad_out = array_ops.transpose(grad_out, (2, 0, 1, 3))
 
   return [grad_out]
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 38eff54c692acd4cb9e2a75caa2c9e9cc23045be..ad409ad7e5a152bbc4312e1d16f324bb8be71c33 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# Tests for this file live in python/kernel_tests/array_ops_test.py
 """Support for manipulating tensors.
 
 See the @{$python/array_ops} guide.
@@ -34,6 +35,7 @@ See the @{$python/array_ops} guide.
 @@reshape
 @@squeeze
 @@expand_dims
+@@unravel_index
 @@meshgrid
 @@slice
 @@strided_slice
@@ -70,6 +72,7 @@ See the @{$python/array_ops} guide.
 @@quantize_v2
 @@quantized_concat
 @@setdiff1d
+@@guarantee_const
 @@fake_quant_with_min_max_args
 @@fake_quant_with_min_max_args_gradient
 @@fake_quant_with_min_max_vars
@@ -102,16 +105,19 @@ from tensorflow.python.ops import gen_math_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_array_ops import *
 from tensorflow.python.util import deprecation
+from tensorflow.python.util.tf_export import tf_export
 # pylint: enable=wildcard-import
 
 # Used for slicing to specify a new 1 size dimension
 newaxis = None
+tf_export("newaxis").export_constant(__name__, "newaxis")
 
 # We override the 'slice' for the "slice" op, so we keep python's
 # existing 'slice' for later use in this module.
 _BaseSlice = slice
 
 
+@tf_export("identity")
 def identity(input, name=None):  # pylint: disable=redefined-builtin
   r"""Return a tensor with the same shape and contents as input.
 
@@ -125,11 +131,8 @@ def identity(input, name=None):  # pylint: disable=redefined-builtin
   if context.in_graph_mode():
     return gen_array_ops.identity(input, name=name)
   else:
-    try:
-      in_device = input.device
-    except AttributeError:
-      input = ops.convert_to_tensor(input)
-      in_device = input.device
+    input = ops.convert_to_tensor(input)
+    in_device = input.device
     # TODO(ashankar): Does 'identity' need to invoke execution callbacks?
     if context.context().device_name != in_device:
       return input._copy()  # pylint: disable=protected-access
@@ -137,6 +140,7 @@ def identity(input, name=None):  # pylint: disable=redefined-builtin
 
 
 # pylint: disable=redefined-builtin,protected-access
+@tf_export("expand_dims")
 def expand_dims(input, axis=None, name=None, dim=None):
   """Inserts a dimension of 1 into a tensor's shape.
 
@@ -213,6 +217,7 @@ listdiff.__doc__ = gen_array_ops._list_diff.__doc__ + "\n" + listdiff.__doc__
 
 
 # pylint: disable=undefined-variable,protected-access
+@tf_export("setdiff1d")
 def setdiff1d(x, y, index_dtype=dtypes.int32, name=None):
   return gen_array_ops._list_diff(x, y, index_dtype, name)
 
@@ -222,6 +227,7 @@ setdiff1d.__doc__ = gen_array_ops._list_diff.__doc__
 # pylint: enable=protected-access
 
 
+@tf_export("broadcast_dynamic_shape")
 def broadcast_dynamic_shape(shape_x, shape_y):
   # pylint: disable=protected-access
   """Returns the broadcasted dynamic shape between `shape_x` and `shape_y`.
@@ -237,6 +243,7 @@ def broadcast_dynamic_shape(shape_x, shape_y):
   # pylint: enable=protected-access
 
 
+@tf_export("broadcast_static_shape")
 def broadcast_static_shape(shape_x, shape_y):
   """Returns the broadcasted static shape between `shape_x` and `shape_y`.
 
@@ -253,6 +260,7 @@ def broadcast_static_shape(shape_x, shape_y):
   return common_shapes.broadcast_shape(shape_x, shape_y)
 
 
+@tf_export("shape")
 def shape(input, name=None, out_type=dtypes.int32):
   # pylint: disable=redefined-builtin
   """Returns the shape of a tensor.
@@ -306,6 +314,7 @@ def shape_internal(input, name=None, optimize=True, out_type=dtypes.int32):
       return gen_array_ops.shape(input, name=name, out_type=out_type)
 
 
+@tf_export("shape_n")
 def shape_n(input, out_type=dtypes.int32, name=None):
   # pylint: disable=redefined-builtin
   """Returns shape of tensors.
@@ -332,6 +341,7 @@ def shape_n(input, out_type=dtypes.int32, name=None):
   return output
 
 
+@tf_export("size")
 def size(input, name=None, out_type=dtypes.int32):
   # pylint: disable=redefined-builtin
   """Returns the size of a tensor.
@@ -389,6 +399,7 @@ def size_internal(input, name=None, optimize=True, out_type=dtypes.int32):
       return gen_array_ops.size(input, name=name, out_type=out_type)
 
 
+@tf_export("rank")
 def rank(input, name=None):
   # pylint: disable=redefined-builtin
   """Returns the rank of a tensor.
@@ -451,18 +462,21 @@ def _slice_helper(tensor, slice_spec, var=None):
   This operation extracts the specified region from the tensor.
   The notation is similar to NumPy with the restriction that
   currently only support basic indexing. That means that
-  using a tensor as input is not currently allowed
+  using a non-scalar tensor as input is not currently allowed.
 
   Some useful examples:
 
   ```python
   # strip leading and trailing 2 elements
   foo = tf.constant([1,2,3,4,5,6])
-  print(foo[2:-2].eval())  # [3,4]
+  print(foo[2:-2].eval())  # => [3,4]
 
   # skip every row and reverse every column
   foo = tf.constant([[1,2,3], [4,5,6], [7,8,9]])
-  print(foo[::2,::-1].eval())  # [[3,2,1], [9,8,7]]
+  print(foo[::2,::-1].eval())  # => [[3,2,1], [9,8,7]]
+
+  # Use scalar tensors as indices on both dimensions
+  print(foo[tf.constant(0), tf.constant(2)].eval())  # => 3
 
   # Insert another dimension
   foo = tf.constant([[1,2,3], [4,5,6], [7,8,9]])
@@ -473,9 +487,9 @@ def _slice_helper(tensor, slice_spec, var=None):
 
   # Ellipses (3 equivalent operations)
   foo = tf.constant([[1,2,3], [4,5,6], [7,8,9]])
-  print(foo[tf.newaxis, :, :].eval())  # [[[1,2,3], [4,5,6], [7,8,9]]]
-  print(foo[tf.newaxis, ...].eval())  # [[[1,2,3], [4,5,6], [7,8,9]]]
-  print(foo[tf.newaxis].eval())  # [[[1,2,3], [4,5,6], [7,8,9]]]
+  print(foo[tf.newaxis, :, :].eval())  # => [[[1,2,3], [4,5,6], [7,8,9]]]
+  print(foo[tf.newaxis, ...].eval())  # => [[[1,2,3], [4,5,6], [7,8,9]]]
+  print(foo[tf.newaxis].eval())  # => [[[1,2,3], [4,5,6], [7,8,9]]]
   ```
 
   Notes:
@@ -576,6 +590,7 @@ def _slice_helper(tensor, slice_spec, var=None):
 
 
 # pylint: disable=undefined-variable,protected-access,redefined-outer-name
+@tf_export("slice")
 def slice(input_, begin, size, name=None):
   # pylint: disable=redefined-builtin
   """Extracts a slice from a tensor.
@@ -628,6 +643,7 @@ def slice(input_, begin, size, name=None):
 
 
 # pylint: disable=invalid-name
+@tf_export("strided_slice")
 def strided_slice(input_,
                   begin,
                   end,
@@ -816,6 +832,7 @@ def _SliceHelperVar(var, slice_spec):
 ops.Tensor._override_operator("__getitem__", _slice_helper)
 
 
+@tf_export("parallel_stack")
 def parallel_stack(values, name="parallel_stack"):
   """Stacks a list of rank-`R` tensors into one rank-`(R+1)` tensor in parallel.
 
@@ -866,6 +883,7 @@ def parallel_stack(values, name="parallel_stack"):
         [expand_dims(value, 0) for value in values], shape=output_shape)
 
 
+@tf_export("stack")
 def stack(values, axis=0, name="stack"):
   """Stacks a list of rank-`R` tensors into one rank-`(R+1)` tensor.
 
@@ -1011,6 +1029,7 @@ ops.register_tensor_conversion_function((list, tuple),
                                         _autopacking_conversion_function, 99)
 
 
+@tf_export("unstack")
 def unstack(value, num=None, axis=0, name="unstack"):
   """Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors.
 
@@ -1060,6 +1079,7 @@ def unstack(value, num=None, axis=0, name="unstack"):
   return gen_array_ops._unpack(value, num=num, axis=axis, name=name)
 
 
+@tf_export("concat")
 def concat(values, axis, name="concat"):
   """Concatenates tensors along one dimension.
 
@@ -1092,6 +1112,27 @@ def concat(values, axis, name="concat"):
   tf.shape(tf.concat([t3, t4], 0))  # [4, 3]
   tf.shape(tf.concat([t3, t4], 1))  # [2, 6]
   ```
+  As in Python, the `axis` could also be negative numbers. Negative `axis`
+  are interpreted as counting from the end of the rank, i.e.,
+   `axis + rank(values)`-th dimension.
+
+  For example:
+
+  ```python
+  t1 = [[[1, 2], [2, 3]], [[4, 4], [5, 3]]]
+  t2 = [[[7, 4], [8, 4]], [[2, 10], [15, 11]]]
+  tf.concat([t1, t2], -1)
+  ```
+
+  would produce:
+
+  ```python
+  [[[ 1,  2,  7,  4],
+    [ 2,  3,  8,  4]],
+
+   [[ 4,  4,  2, 10],
+    [ 5,  3, 15, 11]]]
+  ```
 
   Note: If you are concatenating along a new axis consider using stack.
   E.g.
@@ -1109,7 +1150,10 @@ def concat(values, axis, name="concat"):
   Args:
     values: A list of `Tensor` objects or a single `Tensor`.
     axis: 0-D `int32` `Tensor`.  Dimension along which to concatenate. Must be
-      in the range `[-rank(values), rank(values))`.
+      in the range `[-rank(values), rank(values))`. As in Python, indexing
+      for axis is 0-based. Positive axis in the rage of
+      `[0, rank(values))` refers to `axis`-th dimension. And negative axis
+      refers to `axis + rank(values)`-th dimension.
     name: A name for the operation (optional).
 
   Returns:
@@ -1132,6 +1176,7 @@ def concat(values, axis, name="concat"):
   return gen_array_ops._concat_v2(values=values, axis=axis, name=name)
 
 
+@tf_export("boolean_mask")
 def boolean_mask(tensor, mask, name="boolean_mask", axis=None):
   """Apply boolean mask to tensor.  Numpy equivalent is `tensor[mask]`.
 
@@ -1212,6 +1257,7 @@ def boolean_mask(tensor, mask, name="boolean_mask", axis=None):
     return _apply_mask_1d(tensor, mask, axis)
 
 
+@tf_export("sparse_mask")
 def sparse_mask(a, mask_indices, name=None):
   """Masks elements of `IndexedSlices`.
 
@@ -1254,6 +1300,19 @@ def sparse_mask(a, mask_indices, name=None):
     return ops.IndexedSlices(out_values, out_indices, a.dense_shape)
 
 
+@tf_export("unique")
+def unique(x, out_idx=dtypes.int32, name=None):
+  # TODO(yongtang): switch to v2 once API deprecation
+  # period (3 weeks) pass.
+  # TODO(yongtang): The documentation should also
+  # be updated when switch  to v2.
+  return gen_array_ops._unique(x, out_idx, name)
+
+
+unique.__doc__ = gen_array_ops._unique.__doc__
+
+
+@tf_export("split")
 def split(value, num_or_size_splits, axis=0, num=None, name="split"):
   """Splits a tensor into sub tensors.
 
@@ -1305,7 +1364,7 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
   size_splits = ops.convert_to_tensor(num_or_size_splits)
   if size_splits._rank() == 0 and size_splits.dtype.is_integer:
     return gen_array_ops._split(
-        split_dim=axis, num_split=num_or_size_splits, value=value, name=name)
+        axis=axis, num_split=num_or_size_splits, value=value, name=name)
 
   if num is None:
     num = size_splits._shape_tuple()[0]
@@ -1315,11 +1374,12 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
   return gen_array_ops._split_v(
       value=value,
       size_splits=size_splits,
-      split_dim=axis,
+      axis=axis,
       num_split=num,
       name=name)
 
 
+@tf_export("transpose")
 def transpose(a, perm=None, name="transpose", conjugate=False):
   """Transposes `a`. Permutes the dimensions according to `perm`.
 
@@ -1396,6 +1456,7 @@ def transpose(a, perm=None, name="transpose", conjugate=False):
 
 
 # pylint: disable=invalid-name
+@tf_export("matrix_transpose", "linalg.transpose")
 def matrix_transpose(a, name="matrix_transpose", conjugate=False):
   """Transposes last two dimensions of tensor `a`.
 
@@ -1467,6 +1528,7 @@ def matrix_transpose(a, name="matrix_transpose", conjugate=False):
 # pylint: enable=invalid-name
 
 
+@tf_export("zeros")
 def zeros(shape, dtype=dtypes.float32, name=None):
   """Creates a tensor with all elements set to zero.
 
@@ -1496,24 +1558,22 @@ def zeros(shape, dtype=dtypes.float32, name=None):
       zero = ""
     else:
       zero = 0
-    # Checking for boolean dtype to prevent attempting to run fill on the GPU
-    # which does not have a boolean kernel registered.
-    if context.in_eager_mode() and dtype != dtypes.bool:
-      return fill(shape, constant(zero, dtype=dtype), name=name)
-    try:
-      if isinstance(shape, ops.Tensor):
-        # TODO(apassos) this is required to reproduce the behavior from before
-        # Tensors were iterable. It's a crutch.
-        raise TypeError
-      shape = tensor_shape.as_shape(shape)
-      output = constant(zero, shape=shape, dtype=dtype, name=name)
-    except (TypeError, ValueError):
-      shape = ops.convert_to_tensor(shape, dtype=dtypes.int32, name="shape")
-      output = fill(shape, constant(zero, dtype=dtype), name=name)
+    if not isinstance(shape, ops.Tensor):
+      try:
+        # Go through tensor shapes to get int64-if-needed semantics
+        shape = constant_op._tensor_shape_tensor_conversion_function(
+            tensor_shape.TensorShape(shape))
+      except (TypeError, ValueError):
+        # Happens when shape is a list with tensor elements
+        shape = ops.convert_to_tensor(shape, dtype=dtypes.int32)
+    if not shape._shape_tuple():
+      shape = reshape(shape, [-1])  # Ensure it's a vector
+    output = fill(shape, constant(zero, dtype=dtype), name=name)
   assert output.dtype.base_dtype == dtype
   return output
 
 
+@tf_export("zeros_like")
 def zeros_like(tensor, dtype=None, name=None, optimize=True):
   """Creates a tensor with all elements set to zero.
 
@@ -1530,9 +1590,9 @@ def zeros_like(tensor, dtype=None, name=None, optimize=True):
 
   Args:
     tensor: A `Tensor`.
-    dtype: A type for the returned `Tensor`. Must be `float32`, `float64`,
-      `int8`, `uint8`, `int16`, `uint16`, int32`, `int64`,
-      `complex64`, `complex128` or `bool`.
+    dtype: A type for the returned `Tensor`. Must be `float16`, `float32`,
+      `float64`, `int8`, `uint8`, `int16`, `uint16`, `int32`, `int64`,
+      `complex64`, `complex128`, `bool` or `string`.
     name: A name for the operation (optional).
     optimize: if true, attempt to statically determine the shape of 'tensor'
     and encode it as a constant.
@@ -1566,6 +1626,7 @@ def zeros_like(tensor, dtype=None, name=None, optimize=True):
       return gen_array_ops._zeros_like(tensor, name=name)
 
 
+@tf_export("ones_like")
 def ones_like(tensor, dtype=None, name=None, optimize=True):
   """Creates a tensor with all elements set to 1.
 
@@ -1603,6 +1664,7 @@ def ones_like(tensor, dtype=None, name=None, optimize=True):
     return ret
 
 
+@tf_export("ones")
 def ones(shape, dtype=dtypes.float32, name=None):
   """Creates a tensor with all elements set to 1.
 
@@ -1627,19 +1689,22 @@ def ones(shape, dtype=dtypes.float32, name=None):
   dtype = dtypes.as_dtype(dtype).base_dtype
   with ops.name_scope(name, "ones", [shape]) as name:
     one = True if dtype == dtypes.bool else 1
-    try:
-      if isinstance(shape, ops.Tensor):
-        raise TypeError(
-            "preserving semantics from before tensors were iterable")
-      shape = tensor_shape.as_shape(shape)
-      output = constant(one, shape=shape, dtype=dtype, name=name)
-    except (TypeError, ValueError):
-      shape = ops.convert_to_tensor(shape, dtype=dtypes.int32, name="shape")
-      output = fill(shape, constant(one, dtype=dtype), name=name)
+    if not isinstance(shape, ops.Tensor):
+      try:
+        # Go through tensor shapes to get int64-if-needed semantics
+        shape = constant_op._tensor_shape_tensor_conversion_function(
+            tensor_shape.TensorShape(shape))
+      except (TypeError, ValueError):
+        # Happens when shape is a list with tensor elements
+        shape = ops.convert_to_tensor(shape, dtype=dtypes.int32)
+    if not shape._shape_tuple():
+      shape = reshape(shape, [-1])  # Ensure it's a vector
+    output = fill(shape, constant(one, dtype=dtype), name=name)
   assert output.dtype.base_dtype == dtype
   return output
 
 
+@tf_export("placeholder")
 def placeholder(dtype, shape=None, name=None):
   """Inserts a placeholder for a tensor that will be always fed.
 
@@ -1693,6 +1758,7 @@ def _normalize_sparse_shape(shape, name):
   return (ops.convert_to_tensor(shape, dtype=dtypes.int64, name=name), rank)
 
 
+@tf_export("sparse_placeholder")
 def sparse_placeholder(dtype, shape=None, name=None):
   """Inserts a placeholder for a sparse tensor that will be always fed.
 
@@ -1759,6 +1825,7 @@ def sparse_placeholder(dtype, shape=None, name=None):
 # pylint: enable=redefined-outer-name
 
 
+@tf_export("pad")
 def pad(tensor, paddings, mode="CONSTANT", name=None, constant_values=0):  # pylint: disable=invalid-name
   """Pads a tensor.
 
@@ -1852,6 +1919,7 @@ def pad(tensor, paddings, mode="CONSTANT", name=None, constant_values=0):  # pyl
   return result
 
 
+@tf_export("meshgrid")
 def meshgrid(*args, **kwargs):
   """Broadcasts parameters for evaluation on an N-D grid.
 
@@ -1991,6 +2059,7 @@ def _TileGradShape(op):
     return [tensor_shape.TensorShape(output_dims)]
 
 
+@tf_export("edit_distance")
 def edit_distance(hypothesis, truth, normalize=True, name="edit_distance"):
   """Computes the Levenshtein distance between sequences.
 
@@ -2008,7 +2077,7 @@ def edit_distance(hypothesis, truth, normalize=True, name="edit_distance"):
   hypothesis = tf.SparseTensor(
       [[0, 0, 0],
        [1, 0, 0]],
-      ["a", "b"]
+      ["a", "b"],
       (2, 1, 1))
 
   # 'truth' is a tensor of shape `[2, 2]` with variable-length values:
@@ -2020,7 +2089,7 @@ def edit_distance(hypothesis, truth, normalize=True, name="edit_distance"):
       [[0, 1, 0],
        [1, 0, 0],
        [1, 0, 1],
-       [1, 1, 0]]
+       [1, 1, 0]],
       ["a", "b", "c", "a"],
       (2, 2, 2))
 
@@ -2104,6 +2173,7 @@ def _FakeQuantWithMinMaxVarsPerChannelGradient(op, grad):
       narrow_range=op.get_attr("narrow_range"))
 
 
+@tf_export("required_space_to_batch_paddings")
 def required_space_to_batch_paddings(input_shape,
                                      block_shape,
                                      base_paddings=None,
@@ -2182,6 +2252,7 @@ def required_space_to_batch_paddings(input_shape,
     return result_paddings, result_crops
 
 
+@tf_export("space_to_batch")
 def space_to_batch(input, paddings, block_size, name=None):  # pylint: disable=redefined-builtin
   result = space_to_batch_nd(
       input,
@@ -2195,6 +2266,7 @@ def space_to_batch(input, paddings, block_size, name=None):  # pylint: disable=r
 space_to_batch.__doc__ = gen_array_ops._space_to_batch.__doc__
 
 
+@tf_export("space_to_depth")
 def space_to_depth(input, block_size, name=None, data_format="NHWC"):  # pylint: disable=redefined-builtin
   return gen_array_ops.space_to_depth(input, block_size, data_format, name=name)
 
@@ -2202,6 +2274,7 @@ def space_to_depth(input, block_size, name=None, data_format="NHWC"):  # pylint:
 space_to_depth.__doc__ = gen_array_ops.space_to_depth.__doc__
 
 
+@tf_export("depth_to_space")
 def depth_to_space(input, block_size, name=None, data_format="NHWC"):  # pylint: disable=redefined-builtin
   return gen_array_ops.depth_to_space(input, block_size, data_format, name=name)
 
@@ -2209,6 +2282,7 @@ def depth_to_space(input, block_size, name=None, data_format="NHWC"):  # pylint:
 depth_to_space.__doc__ = gen_array_ops.depth_to_space.__doc__
 
 
+@tf_export("batch_to_space")
 def batch_to_space(input, crops, block_size, name=None):  # pylint: disable=redefined-builtin
   result = batch_to_space_nd(
       input,
@@ -2222,6 +2296,7 @@ def batch_to_space(input, crops, block_size, name=None):  # pylint: disable=rede
 batch_to_space.__doc__ = gen_array_ops._batch_to_space.__doc__
 
 
+@tf_export("one_hot")
 def one_hot(indices,
             depth,
             on_value=None,
@@ -2377,10 +2452,11 @@ def _all_dimensions(x):
     r = x.dense_shape.get_shape()[0].value  # sparse.dense_shape is 1-D.
     return constant_op.constant(np.arange(r), dtype=dtypes.int32)
 
-  # Otherwise, we rely on Range and Rank to do the right thing at run-time.
-  return range(0, rank(x))
+  # Otherwise, we rely on `range` and `rank` to do the right thing at runtime.
+  return gen_math_ops._range(0, rank(x), 1)
 
 
+@tf_export("sequence_mask")
 def sequence_mask(lengths, maxlen=None, dtype=dtypes.bool, name=None):
   """Returns a mask tensor representing the first N positions of each cell.
 
@@ -2422,7 +2498,7 @@ def sequence_mask(lengths, maxlen=None, dtype=dtypes.bool, name=None):
       maxlen = gen_math_ops._max(lengths, _all_dimensions(lengths))
     else:
       maxlen = ops.convert_to_tensor(maxlen)
-    if maxlen.get_shape().ndims != 0:
+    if maxlen.get_shape().ndims is not None and maxlen.get_shape().ndims != 0:
       raise ValueError("maxlen must be scalar for sequence_mask")
 
     # The basic idea is to compare a range row vector of size maxlen:
@@ -2443,6 +2519,7 @@ def sequence_mask(lengths, maxlen=None, dtype=dtypes.bool, name=None):
       return gen_math_ops.cast(result, dtype)
 
 
+@tf_export("squeeze")
 def squeeze(input, axis=None, name=None, squeeze_dims=None):
   # pylint: disable=redefined-builtin
   """Removes dimensions of size 1 from the shape of a tensor.
@@ -2492,6 +2569,7 @@ def squeeze(input, axis=None, name=None, squeeze_dims=None):
   return gen_array_ops._squeeze(input, axis, name)
 
 
+@tf_export("where")
 def where(condition, x=None, y=None, name=None):
   """Return the elements, either from `x` or `y`, depending on the `condition`.
 
@@ -2537,13 +2615,14 @@ def where(condition, x=None, y=None, name=None):
     with ops.name_scope(name, "Where", [condition]) as name:
       condition = ops.convert_to_tensor(
           condition, preferred_dtype=dtypes.bool, name="condition")
-      return gen_array_ops.where(input=condition, name=name)
+      return gen_array_ops.where(condition=condition, name=name)
   elif x is not None and y is not None:
-    return gen_math_ops._select(condition=condition, t=x, e=y, name=name)
+    return gen_math_ops._select(condition=condition, x=x, y=y, name=name)
   else:
     raise ValueError("x and y must both be non-None or both be None.")
 
 
+@tf_export("reverse")
 def reverse(tensor, axis, name=None):
   return gen_array_ops.reverse_v2(tensor, axis, name)
 
@@ -2552,6 +2631,7 @@ reverse.__doc__ = gen_array_ops.reverse_v2.__doc__
 
 
 # pylint: disable=redefined-builtin
+@tf_export("reverse_sequence")
 def reverse_sequence(input,
                      seq_lengths,
                      seq_axis=None,
@@ -2579,6 +2659,7 @@ reverse_sequence.__doc__ = deprecation.rewrite_argument_docstring(
     "seq_dim", "seq_axis")
 
 
+@tf_export("gather")
 def gather(params, indices, validate_indices=None, name=None, axis=0):
   # TODO(rjryan): Remove "Gather" creation in favor of GatherV2 once the forward
   # compatibility 3 week period has passed.
@@ -2594,6 +2675,7 @@ gather.__doc__ = gen_array_ops.gather_v2.__doc__
 
 # Define quantize_v2 here in order to make name the second-to-last attribute,
 # because round_mode was added later.
+@tf_export("quantize_v2")
 @deprecation.deprecated(
     "2017-10-25",
     "`tf.quantize_v2` is deprecated, please use `tf.quantize` instead.")
@@ -2618,6 +2700,7 @@ quantize_v2.__doc__ = """Please use `tf.quantize` instead."""
 
 # We want to expose tf.quantize instead of tf.quantize_v2; we can deprecate
 # tf.quantize_v2 in next version of TensorFlow.
+@tf_export("quantize")
 def quantize(input,  # pylint: disable=redefined-builtin
              min_range,
              max_range,
diff --git a/tensorflow/python/ops/bitwise_ops_test.py b/tensorflow/python/ops/bitwise_ops_test.py
index fa1b219b1771dbd8f99939d8f6571d2a8791433e..c4cfc0da197edcfd143cfee79fd3c3f9b7a2858b 100644
--- a/tensorflow/python/ops/bitwise_ops_test.py
+++ b/tensorflow/python/ops/bitwise_ops_test.py
@@ -36,7 +36,7 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
 
   def testBinaryOps(self):
     dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
-                  dtypes.uint8, dtypes.uint16]
+                  dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64]
 
     with self.test_session(use_gpu=True) as sess:
       for dtype in dtype_list:
@@ -71,7 +71,7 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
 
   def testInvertOp(self):
     dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
-                  dtypes.uint8, dtypes.uint16]
+                  dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64]
     inputs = [0, 5, 3, 14]
     with self.test_session(use_gpu=True) as sess:
       for dtype in dtype_list:
@@ -135,5 +135,36 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
                   bitwise_ops.right_shift(lhs, rhs)])
 
 
+  def testShapeInference(self):
+    dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
+                  dtypes.uint8, dtypes.uint16]
+
+    with self.test_session(use_gpu=True) as sess:
+      for dtype in dtype_list:
+        lhs = constant_op.constant([[0], [3], [5]], dtype=dtype)
+        rhs = constant_op.constant([[1, 2, 4]], dtype=dtype)
+
+        and_tensor = bitwise_ops.bitwise_and(lhs, rhs)
+        or_tensor = bitwise_ops.bitwise_or(lhs, rhs)
+        xor_tensor = bitwise_ops.bitwise_xor(lhs, rhs)
+        ls_tensor = bitwise_ops.left_shift(lhs, rhs)
+        rs_tensor = bitwise_ops.right_shift(lhs, rhs)
+
+        and_result, or_result, xor_result, ls_result, rs_result = sess.run(
+            [and_tensor, or_tensor, xor_tensor, ls_tensor, rs_tensor])
+
+        # Compare shape inference with result
+        self.assertAllEqual(and_tensor.get_shape().as_list(), and_result.shape)
+        self.assertAllEqual(and_tensor.get_shape().as_list(), [3, 3])
+        self.assertAllEqual(or_tensor.get_shape().as_list(), or_result.shape)
+        self.assertAllEqual(or_tensor.get_shape().as_list(), [3, 3])
+        self.assertAllEqual(xor_tensor.get_shape().as_list(), xor_result.shape)
+        self.assertAllEqual(xor_tensor.get_shape().as_list(), [3, 3])
+        self.assertAllEqual(ls_tensor.get_shape().as_list(), ls_result.shape)
+        self.assertAllEqual(ls_tensor.get_shape().as_list(), [3, 3])
+        self.assertAllEqual(rs_tensor.get_shape().as_list(), rs_result.shape)
+        self.assertAllEqual(rs_tensor.get_shape().as_list(), [3, 3])
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/candidate_sampling_ops.py b/tensorflow/python/ops/candidate_sampling_ops.py
index d6294c24f5cf9427209c9f5e84d05e32686908bf..220ef1754d2e1a2d54a8962148b47806df48e98f 100644
--- a/tensorflow/python/ops/candidate_sampling_ops.py
+++ b/tensorflow/python/ops/candidate_sampling_ops.py
@@ -20,11 +20,13 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import random_seed
-from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import array_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import gen_candidate_sampling_ops
-from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import math_ops  # pylint: disable=unused-import
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('nn.uniform_candidate_sampler')
 def uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
                               range_max, seed=None, name=None):
   """Samples a set of classes using a uniform base distribution.
@@ -80,6 +82,7 @@ def uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
       seed2=seed2, name=name)
 
 
+@tf_export('nn.log_uniform_candidate_sampler')
 def log_uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
                                   range_max, seed=None, name=None):
   """Samples a set of classes using a log-uniform (Zipfian) base distribution.
@@ -138,6 +141,7 @@ def log_uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
       seed2=seed2, name=name)
 
 
+@tf_export('nn.learned_unigram_candidate_sampler')
 def learned_unigram_candidate_sampler(true_classes, num_true, num_sampled,
                                       unique, range_max, seed=None, name=None):
   """Samples a set of classes from a distribution learned during training.
@@ -194,6 +198,7 @@ def learned_unigram_candidate_sampler(true_classes, num_true, num_sampled,
       seed2=seed2, name=name)
 
 
+@tf_export('nn.fixed_unigram_candidate_sampler')
 def fixed_unigram_candidate_sampler(true_classes,
                                     num_true,
                                     num_sampled,
@@ -285,6 +290,7 @@ def fixed_unigram_candidate_sampler(true_classes,
       unigrams=unigrams, seed=seed1, seed2=seed2, name=name)
 
 
+@tf_export('nn.all_candidate_sampler')
 def all_candidate_sampler(true_classes, num_true, num_sampled, unique,
                           seed=None, name=None):
   """Generate the set of all classes.
@@ -320,6 +326,7 @@ def all_candidate_sampler(true_classes, num_true, num_sampled, unique,
       name=name)
 
 
+@tf_export('nn.compute_accidental_hits')
 def compute_accidental_hits(true_classes, sampled_candidates, num_true,
                             seed=None, name=None):
   """Compute the position ids in `sampled_candidates` matching `true_classes`.
diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py
index 1377af3eac43a5846353257304ef7e022d3506d4..0fd6e29a49c8e4e31e244bfbbfca525d72e4d811 100644
--- a/tensorflow/python/ops/check_ops.py
+++ b/tensorflow/python/ops/check_ops.py
@@ -23,6 +23,7 @@ See the @{$python/check_ops} guide.
 @@assert_non_positive
 @@assert_equal
 @@assert_none_equal
+@@assert_near
 @@assert_less
 @@assert_less_equal
 @@assert_greater
@@ -56,6 +57,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 NUMERIC_TYPES = frozenset(
     [dtypes.float32, dtypes.float64, dtypes.int8, dtypes.int16, dtypes.int32,
@@ -70,6 +72,7 @@ __all__ = [
     'assert_non_positive',
     'assert_equal',
     'assert_none_equal',
+    'assert_near',
     'assert_integer',
     'assert_less',
     'assert_less_equal',
@@ -109,6 +112,7 @@ def _shape_and_dtype_str(tensor):
   return 'shape=%s dtype=%s' % (tensor.shape, tensor.dtype.name)
 
 
+@tf_export('assert_proper_iterable')
 def assert_proper_iterable(values):
   """Static assert that values is a "proper" iterable.
 
@@ -136,6 +140,7 @@ def assert_proper_iterable(values):
         'Expected argument "values" to be iterable.  Found: %s' % type(values))
 
 
+@tf_export('assert_negative')
 def assert_negative(x, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x < 0` holds element-wise.
 
@@ -176,6 +181,7 @@ def assert_negative(x, data=None, summarize=None, message=None, name=None):
     return assert_less(x, zero, data=data, summarize=summarize)
 
 
+@tf_export('assert_positive')
 def assert_positive(x, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x > 0` holds element-wise.
 
@@ -215,6 +221,7 @@ def assert_positive(x, data=None, summarize=None, message=None, name=None):
     return assert_less(zero, x, data=data, summarize=summarize)
 
 
+@tf_export('assert_non_negative')
 def assert_non_negative(x, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x >= 0` holds element-wise.
 
@@ -256,6 +263,7 @@ def assert_non_negative(x, data=None, summarize=None, message=None, name=None):
     return assert_less_equal(zero, x, data=data, summarize=summarize)
 
 
+@tf_export('assert_non_positive')
 def assert_non_positive(x, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x <= 0` holds element-wise.
 
@@ -297,6 +305,7 @@ def assert_non_positive(x, data=None, summarize=None, message=None, name=None):
     return assert_less_equal(x, zero, data=data, summarize=summarize)
 
 
+@tf_export('assert_equal')
 def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x == y` holds element-wise.
 
@@ -338,8 +347,11 @@ def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
       eq = math_ops.equal(x, y)
       condition = math_ops.reduce_all(eq)
       if not condition:
-        # Prepare a message with first elements of x and y
+        # Prepare a message with first elements of x and y.
         summary_msg = ''
+        # Default to printing 3 elements like control_flow_ops.Assert (used
+        # by graph mode) does.
+        summarize = 3 if summarize is None else summarize
         if summarize:
           # reshape((-1,)) is the fastest way to get a flat array view.
           x_np = x.numpy().reshape((-1,))
@@ -351,15 +363,13 @@ def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
                          (x_sum, x_np[:x_sum],
                           y_sum, y_np[:y_sum]))
 
-        # Get the values that actually differed and their indices
+        # Get the values that actually differed and their indices.
         mask = math_ops.logical_not(eq)
         indices = array_ops.where(mask)
         indices_np = indices.numpy()
         x_vals = array_ops.boolean_mask(x, mask)
         y_vals = array_ops.boolean_mask(y, mask)
-        diff_to_print = 0
-        if summarize:
-          diff_to_print = min(summarize, indices_np.size)
+        summarize = min(summarize, indices_np.shape[0])
 
         raise errors.InvalidArgumentError(
             node_def=None, op=None,
@@ -370,9 +380,9 @@ def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
                      '%s'
                      %
                      (message or '',
-                      diff_to_print, indices_np[:diff_to_print],
-                      x_vals.numpy().reshape((-1,))[:diff_to_print],
-                      y_vals.numpy().reshape((-1,))[:diff_to_print],
+                      summarize, indices_np[:summarize],
+                      x_vals.numpy().reshape((-1,))[:summarize],
+                      y_vals.numpy().reshape((-1,))[:summarize],
                       summary_msg)))
       return
 
@@ -392,6 +402,7 @@ def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
+@tf_export('assert_none_equal')
 def assert_none_equal(
     x, y, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x != y` holds for all elements.
@@ -442,6 +453,85 @@ def assert_none_equal(
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
+@tf_export('assert_near')
+def assert_near(
+    x, y, rtol=None, atol=None, data=None, summarize=None, message=None,
+    name=None):
+  """Assert the condition `x` and `y` are close element-wise.
+
+  Example of adding a dependency to an operation:
+
+  ```python
+  with tf.control_dependencies([tf.assert_near(x, y)]):
+    output = tf.reduce_sum(x)
+  ```
+
+  This condition holds if for every pair of (possibly broadcast) elements
+  `x[i]`, `y[i]`, we have
+
+  ```tf.abs(x[i] - y[i]) <= atol + rtol * tf.abs(y[i])```.
+
+  If both `x` and `y` are empty, this is trivially satisfied.
+
+  The default `atol` and `rtol` is `10 * eps`, where `eps` is the smallest
+  representable positive number such that `1 + eps != eps`.  This is about
+  `1.2e-6` in `32bit`, `2.22e-15` in `64bit`, and `0.00977` in `16bit`.
+  See `numpy.finfo`.
+
+  Args:
+    x:  Float or complex `Tensor`.
+    y:  Float or complex `Tensor`, same `dtype` as, and broadcastable to, `x`.
+    rtol:  `Tensor`.  Same `dtype` as, and broadcastable to, `x`.
+      The relative tolerance.  Default is `10 * eps`.
+    atol:  `Tensor`.  Same `dtype` as, and broadcastable to, `x`.
+      The absolute tolerance.  Default is `10 * eps`.
+    data:  The tensors to print out if the condition is False.  Defaults to
+      error message and first few entries of `x`, `y`.
+    summarize: Print this many entries of each tensor.
+    message: A string to prefix to the default message.
+    name: A name for this operation (optional).  Defaults to "assert_near".
+
+  Returns:
+    Op that raises `InvalidArgumentError` if `x` and `y` are not close enough.
+
+  @compatibility(numpy)
+  Similar to `numpy.assert_allclose`, except tolerance depends on data type.
+  This is due to the fact that `TensorFlow` is often used with `32bit`, `64bit`,
+  and even `16bit` data.
+  @end_compatibility
+  """
+  message = message or ''
+  with ops.name_scope(name, 'assert_near', [x, y, rtol, atol, data]):
+    x = ops.convert_to_tensor(x, name='x')
+    y = ops.convert_to_tensor(y, name='y', dtype=x.dtype)
+
+    eps = np.finfo(x.dtype.as_numpy_dtype).eps
+    rtol = 10 * eps if rtol is None else rtol
+    atol = 10 * eps if atol is None else atol
+
+    rtol = ops.convert_to_tensor(rtol, name='rtol', dtype=x.dtype)
+    atol = ops.convert_to_tensor(atol, name='atol', dtype=x.dtype)
+
+    if context.in_eager_mode():
+      x_name = _shape_and_dtype_str(x)
+      y_name = _shape_and_dtype_str(y)
+    else:
+      x_name = x.name
+      y_name = y.name
+
+    if data is None:
+      data = [
+          message,
+          'x and y not equal to tolerance rtol = %s, atol = %s' % (rtol, atol),
+          'x (%s) = ' % x_name, x, 'y (%s) = ' % y_name, y
+      ]
+    tol = atol + rtol * math_ops.abs(y)
+    diff = math_ops.abs(x - y)
+    condition = math_ops.reduce_all(math_ops.less(diff, tol))
+    return control_flow_ops.Assert(condition, data, summarize=summarize)
+
+
+@tf_export('assert_less')
 def assert_less(x, y, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x < y` holds element-wise.
 
@@ -489,6 +579,7 @@ def assert_less(x, y, data=None, summarize=None, message=None, name=None):
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
+@tf_export('assert_less_equal')
 def assert_less_equal(x, y, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x <= y` holds element-wise.
 
@@ -536,6 +627,7 @@ def assert_less_equal(x, y, data=None, summarize=None, message=None, name=None):
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
+@tf_export('assert_greater')
 def assert_greater(x, y, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x > y` holds element-wise.
 
@@ -583,6 +675,7 @@ def assert_greater(x, y, data=None, summarize=None, message=None, name=None):
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
+@tf_export('assert_greater_equal')
 def assert_greater_equal(x, y, data=None, summarize=None, message=None,
                          name=None):
   """Assert the condition `x >= y` holds element-wise.
@@ -680,6 +773,7 @@ def _assert_rank_condition(
   return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
+@tf_export('assert_rank')
 def assert_rank(x, rank, data=None, summarize=None, message=None, name=None):
   """Assert `x` has rank equal to `rank`.
 
@@ -741,6 +835,7 @@ def assert_rank(x, rank, data=None, summarize=None, message=None, name=None):
   return assert_op
 
 
+@tf_export('assert_rank_at_least')
 def assert_rank_at_least(
     x, rank, data=None, summarize=None, message=None, name=None):
   """Assert `x` has rank equal to `rank` or higher.
@@ -871,6 +966,7 @@ def _assert_ranks_condition(
   return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
+@tf_export('assert_rank_in')
 def assert_rank_in(
     x, ranks, data=None, summarize=None, message=None, name=None):
   """Assert `x` has rank in `ranks`.
@@ -932,6 +1028,7 @@ def assert_rank_in(
   return assert_op
 
 
+@tf_export('assert_integer')
 def assert_integer(x, message=None, name=None):
   """Assert that `x` is of integer dtype.
 
@@ -969,6 +1066,7 @@ def assert_integer(x, message=None, name=None):
     return control_flow_ops.no_op('statically_determined_was_integer')
 
 
+@tf_export('assert_type')
 def assert_type(tensor, tf_type, message=None, name=None):
   """Statically asserts that the given `Tensor` is of the specified type.
 
@@ -1016,10 +1114,12 @@ def _get_diff_for_monotonic_comparison(x):
   return control_flow_ops.cond(is_shorter_than_two, short_result, diff)
 
 
+@tf_export('is_numeric_tensor')
 def is_numeric_tensor(tensor):
   return isinstance(tensor, ops.Tensor) and tensor.dtype in NUMERIC_TYPES
 
 
+@tf_export('is_non_decreasing')
 def is_non_decreasing(x, name=None):
   """Returns `True` if `x` is non-decreasing.
 
@@ -1046,6 +1146,7 @@ def is_non_decreasing(x, name=None):
     return math_ops.reduce_all(math_ops.less_equal(zero, diff))
 
 
+@tf_export('is_strictly_increasing')
 def is_strictly_increasing(x, name=None):
   """Returns `True` if `x` is strictly increasing.
 
@@ -1104,6 +1205,7 @@ def _assert_same_base_type(items, expected_type=None):
   return expected_type
 
 
+@tf_export('assert_same_float_dtype')
 def assert_same_float_dtype(tensors=None, dtype=None):
   """Validate and return float type based on `tensors` and `dtype`.
 
@@ -1132,6 +1234,7 @@ def assert_same_float_dtype(tensors=None, dtype=None):
   return dtype
 
 
+@tf_export('assert_scalar')
 def assert_scalar(tensor, name=None):
   with ops.name_scope(name, 'assert_scalar', [tensor]) as name_scope:
     tensor = ops.convert_to_tensor(tensor, name=name_scope)
diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py
index 80803530c1ede4537e729ef77958a5d905005dd3..49f8c665313562cb20dbe4494103ded16646c741 100644
--- a/tensorflow/python/ops/clip_ops.py
+++ b/tensorflow/python/ops/clip_ops.py
@@ -28,8 +28,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("clip_by_value")
 def clip_by_value(t, clip_value_min, clip_value_max,
                   name=None):
   """Clips tensor values to a specified min and max.
@@ -70,6 +72,7 @@ def clip_by_value(t, clip_value_min, clip_value_max,
   return t_max
 
 
+@tf_export("clip_by_norm")
 def clip_by_norm(t, clip_norm, axes=None, name=None):
   """Clips tensor values to a maximum L2-norm.
 
@@ -107,7 +110,7 @@ def clip_by_norm(t, clip_norm, axes=None, name=None):
     t = ops.convert_to_tensor(t, name="t")
 
     # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm
-    l2norm = math_ops.sqrt(math_ops.reduce_sum(t * t, axes, keep_dims=True))
+    l2norm = math_ops.sqrt(math_ops.reduce_sum(t * t, axes, keepdims=True))
     intermediate = t * clip_norm
     # Assert that the shape is compatible with the initial shape,
     # to prevent unintentional broadcasting.
@@ -117,6 +120,8 @@ def clip_by_norm(t, clip_norm, axes=None, name=None):
 
   return tclip
 
+
+@tf_export("global_norm")
 def global_norm(t_list, name=None):
   """Computes the global norm of multiple tensors.
 
@@ -164,6 +169,8 @@ def global_norm(t_list, name=None):
 
   return norm
 
+
+@tf_export("clip_by_global_norm")
 def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None):
   """Clips values of multiple tensors by the ratio of the sum of their norms.
 
@@ -246,6 +253,7 @@ def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None):
   return list_clipped, use_norm
 
 
+@tf_export("clip_by_average_norm")
 def clip_by_average_norm(t, clip_norm, name=None):
   """Clips tensor values to a maximum average L2-norm.
 
diff --git a/tensorflow/python/ops/confusion_matrix.py b/tensorflow/python/ops/confusion_matrix.py
index 32e071db1749ceed56e2f31446e58213d0603705..e4ce2ab28a15f82e80194ab17ef939411982076a 100644
--- a/tensorflow/python/ops/confusion_matrix.py
+++ b/tensorflow/python/ops/confusion_matrix.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 def remove_squeezable_dimensions(
@@ -93,6 +94,7 @@ def remove_squeezable_dimensions(
     return labels, predictions
 
 
+@tf_export('confusion_matrix')
 def confusion_matrix(labels, predictions, num_classes=None, dtype=dtypes.int32,
                      name=None, weights=None):
   """Computes the confusion matrix from predictions and labels.
@@ -117,7 +119,7 @@ def confusion_matrix(labels, predictions, num_classes=None, dtype=dtypes.int32,
   For example:
 
   ```python
-    tf.contrib.metrics.confusion_matrix([1, 2, 4], [2, 2, 4]) ==>
+    tf.confusion_matrix([1, 2, 4], [2, 2, 4]) ==>
         [[0 0 0 0 0]
          [0 0 1 0 0]
          [0 0 1 0 0]
diff --git a/tensorflow/python/ops/control_flow_grad.py b/tensorflow/python/ops/control_flow_grad.py
index 22dc6771ec0690fe807b34b3dea6295edf7dbbf0..97b57177b29986a006df992f4c0c2b79e11467aa 100644
--- a/tensorflow/python/ops/control_flow_grad.py
+++ b/tensorflow/python/ops/control_flow_grad.py
@@ -23,6 +23,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import math_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import,undefined-variable
@@ -52,7 +53,8 @@ def _SwitchGrad(op, *grad):
       # TODO(yuanbyu): Perform shape inference with this new input.
       if grad[1] is not None:
         # pylint: disable=protected-access
-        control_flow_ops._AddNextAndBackEdge(merge_grad, grad[1])
+        control_flow_ops._AddNextAndBackEdge(merge_grad, grad[1],
+                                             enforce_shape_invariant=False)
         # pylint: enable=protected-access
       return None, None
     elif grad[0] is not None:
@@ -91,7 +93,7 @@ def _MergeGrad(op, grad, _):
   input_op = op.inputs[0].op
   graph = ops.get_default_graph()
   # pylint: disable=protected-access
-  op_ctxt = control_flow_ops._GetOutputContext(input_op)
+  op_ctxt = control_flow_util.GetOutputContext(input_op)
   grad_ctxt = graph._get_control_flow_context()
   # pylint: enable=protected-access
   if isinstance(op_ctxt, WhileContext):
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 38c959df8ded422a9313a8b44fc646e1e98b3108..c33f3512893a413dd4c5b9a1fd87c9bb498627f9 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Control Flow Operations.
 
 See the @{$python/control_flow_ops} guide.
@@ -51,11 +50,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import abc
 import collections
+import functools
 
 import six
-from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.protobuf import control_flow_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
@@ -66,6 +67,7 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_util as util
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_control_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
@@ -77,10 +79,11 @@ from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops.gen_control_flow_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_should_use
-
+from tensorflow.python.util.tf_export import tf_export
 
 # We override the 'tuple' for a control flow op, so we keep python's
 # existing 'tuple' for later use in this module.
@@ -115,6 +118,7 @@ def _summarize_eager(tensor, summarize=None):
 
 # Assert and Print are special symbols in python, so we must
 # use an upper-case version of them.
+@tf_export("Assert")
 @tf_should_use.should_use_result
 def Assert(condition, data, summarize=None, name=None):
   """Asserts that the given condition is true.
@@ -152,9 +156,10 @@ def Assert(condition, data, summarize=None, name=None):
       xs = ops.convert_n_to_tensor(data)
       data_str = [_summarize_eager(x, summarize) for x in xs]
       raise errors.InvalidArgumentError(
-          node_def=None, op=None,
-          message="Expected '%s' to be true. Summarized data: %s" % (
-              condition, "\n".join(data_str)))
+          node_def=None,
+          op=None,
+          message="Expected '%s' to be true. Summarized data: %s" %
+          (condition, "\n".join(data_str)))
     return
 
   with ops.name_scope(name, "Assert", [condition, data]) as name:
@@ -163,15 +168,15 @@ def Assert(condition, data, summarize=None, name=None):
       # As a simple heuristic, we assume that string and int32 are
       # on host to avoid the need to use cond. If it is not case,
       # we will pay the price copying the tensor to host memory.
-      return gen_logging_ops._assert(
-          condition, data, summarize, name="Assert")
+      return gen_logging_ops._assert(condition, data, summarize, name="Assert")
     else:
       condition = ops.convert_to_tensor(condition, name="Condition")
+
       def true_assert():
         return gen_logging_ops._assert(
             condition, data, summarize, name="Assert")
-      guarded_assert = cond(
-          condition, no_op, true_assert, name="AssertGuard")
+
+      guarded_assert = cond(condition, no_op, true_assert, name="AssertGuard")
       if context.in_eager_mode():
         return
       return guarded_assert.op
@@ -211,7 +216,7 @@ def _Identity(data, name=None):
 def _NextIteration(data, name=None):
   data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
   if isinstance(data, ops.Tensor):
-    if data.dtype._is_ref_dtype:   # pylint: disable=protected-access
+    if data.dtype._is_ref_dtype:  # pylint: disable=protected-access
       return ref_next_iteration(data, name=name)
     else:
       return next_iteration(data, name=name)
@@ -230,8 +235,13 @@ def _NextIteration(data, name=None):
       return sparse_tensor.SparseTensor(indices, values, dense_shape)
 
 
-def _Enter(data, frame_name, is_constant=False, parallel_iterations=10,
-           use_ref=True, use_input_shape=True, name=None):
+def _Enter(data,
+           frame_name,
+           is_constant=False,
+           parallel_iterations=10,
+           use_ref=True,
+           use_input_shape=True,
+           name=None):
   """Creates or finds a child frame, and makes `data` available to it.
 
   The unique `frame_name` is used by the `Executor` to identify frames. If
@@ -253,41 +263,57 @@ def _Enter(data, frame_name, is_constant=False, parallel_iterations=10,
   data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
   if isinstance(data, ops.Tensor):
     if data.dtype._is_ref_dtype and use_ref:  # pylint: disable=protected-access
-      result = ref_enter(data, frame_name, is_constant, parallel_iterations,
-                         name=name)
+      result = gen_control_flow_ops._ref_enter(
+          data, frame_name, is_constant, parallel_iterations, name=name)
     else:
-      result = enter(data, frame_name, is_constant, parallel_iterations,
-                     name=name)
+      result = gen_control_flow_ops._enter(
+          data, frame_name, is_constant, parallel_iterations, name=name)
     if use_input_shape:
       result.set_shape(data.get_shape())
     return result
   else:
     if not isinstance(data, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
       raise TypeError("Type %s not supported" % type(data))
-    values = _Enter(data.values, frame_name, is_constant,
-                    parallel_iterations=parallel_iterations,
-                    use_input_shape=use_input_shape, name=name)
-    indices = enter(data.indices, frame_name, is_constant,
-                    parallel_iterations, name="indices")
+    values = _Enter(
+        data.values,
+        frame_name,
+        is_constant,
+        parallel_iterations=parallel_iterations,
+        use_input_shape=use_input_shape,
+        name=name)
+    indices = gen_control_flow_ops._enter(
+        data.indices,
+        frame_name,
+        is_constant,
+        parallel_iterations,
+        name="indices")
     if use_input_shape:
       indices.set_shape(data.indices.get_shape())
     if isinstance(data, ops.IndexedSlices):
       dense_shape = data.dense_shape
       if dense_shape is not None:
-        dense_shape = enter(dense_shape, frame_name, is_constant,
-                            parallel_iterations, name="dense_shape")
+        dense_shape = gen_control_flow_ops._enter(
+            dense_shape,
+            frame_name,
+            is_constant,
+            parallel_iterations,
+            name="dense_shape")
         if use_input_shape:
           dense_shape.set_shape(data.dense_shape.get_shape())
       return ops.IndexedSlices(values, indices, dense_shape)
     else:
-      dense_shape = enter(data.dense_shape, frame_name, is_constant,
-                          parallel_iterations, name="dense_shape")
+      dense_shape = gen_control_flow_ops._enter(
+          data.dense_shape,
+          frame_name,
+          is_constant,
+          parallel_iterations,
+          name="dense_shape")
       if use_input_shape:
         dense_shape.set_shape(data.dense_shape.get_shape())
       return sparse_tensor.SparseTensor(indices, values, dense_shape)
 
 
-def exit(data, name=None):
+def exit(data, name=None):  # pylint: disable=redefined-builtin
   """Exits the current frame to its parent frame.
 
   Exit makes its input `data` available to the parent frame.
@@ -440,8 +466,10 @@ def merge(inputs, name=None):
   if any([inp is None for inp in inputs]):
     raise ValueError("At least one of the merge inputs is None: %s" % inputs)
   with ops.name_scope(name, "Merge", inputs) as name:
-    inputs = [ops.internal_convert_to_tensor_or_indexed_slices(inp, as_ref=True)
-              for inp in inputs]
+    inputs = [
+        ops.internal_convert_to_tensor_or_indexed_slices(inp, as_ref=True)
+        for inp in inputs
+    ]
     if all([isinstance(v, ops.Tensor) for v in inputs]):
       if all([v.dtype._is_ref_dtype for v in inputs]):  # pylint: disable=protected-access
         return gen_control_flow_ops._ref_merge(inputs, name)
@@ -471,6 +499,8 @@ def merge(inputs, name=None):
       else:
         dense_shape = None
       return ops.IndexedSlices(values, indices, dense_shape), chosen_index
+
+
 # pylint: enable=protected-access
 
 
@@ -484,7 +514,9 @@ def _convert_tensorarray_to_flow(tensor_or_tensor_array):
 def _make_tensor_array(ta, t_or_flow):
   # pylint: disable=protected-access
   new_ta = tensor_array_ops.TensorArray(
-      dtype=ta.dtype, handle=ta.handle, flow=t_or_flow,
+      dtype=ta.dtype,
+      handle=ta.handle,
+      flow=t_or_flow,
       infer_shape=ta._infer_shape,
       colocate_with_first_write_call=ta._colocate_with_first_write_call)
   new_ta._colocate_with = ta._colocate_with
@@ -496,36 +528,13 @@ def _make_tensor_array(ta, t_or_flow):
 def _convert_flows_to_tensorarrays(tensors_or_tensorarrays, tensors_or_flows):
   if len(tensors_or_tensorarrays) != len(tensors_or_flows):
     raise ValueError(
-        "Lengths of original Tensor list and new list do not match: %d vs. %d"
-        % (len(tensors_or_tensorarrays), len(tensors_or_flows)))
+        "Lengths of original Tensor list and new list do not match: %d vs. %d" %
+        (len(tensors_or_tensorarrays), len(tensors_or_flows)))
   return [
       _make_tensor_array(ta, t_or_flow)
-      if isinstance(ta, tensor_array_ops.TensorArray)
-      else t_or_flow
-      for (ta, t_or_flow) in zip(tensors_or_tensorarrays, tensors_or_flows)]
-
-
-def _IsLoopConstantEnter(op):
-  """Return true iff op is a loop invariant."""
-  is_enter = (op.type == "Enter" or op.type == "RefEnter")
-  return is_enter and op.get_attr("is_constant")
-
-
-def _GetLoopConstantEnter(value):
-  """Return the enter op if we can infer `value` to be a loop invariant."""
-  id_ops = {"Switch", "RefSwitch", "Identity", "RefIdentity"}
-  op = value.op
-  while op.type in id_ops:
-    op = op.inputs[0].op
-  return op if _IsLoopConstantEnter(op) else None
-
-
-def _GetOutputContext(op):
-  """Return the control flow context for the output of an op."""
-  ctxt = op._get_control_flow_context()
-  if IsLoopExit(op):
-    ctxt = ctxt.outer_context
-  return ctxt
+      if isinstance(ta, tensor_array_ops.TensorArray) else t_or_flow
+      for (ta, t_or_flow) in zip(tensors_or_tensorarrays, tensors_or_flows)
+  ]
 
 
 def _ShapeLessThanOrEqual(shape1, shape2):
@@ -564,8 +573,8 @@ def _SetShapeInvariants(input_vars, enter_vars, shapes):
         raise ValueError(
             "The shape invariant specified for %s is not compatible with "
             "the initial shape of the loop variable. It enters the loop "
-            "with shape %s, but the specified shape invariant is %s."
-            % (inp.name, inp.get_shape(), shape))
+            "with shape %s, but the specified shape invariant is %s." %
+            (inp.name, inp.get_shape(), shape))
       var.set_shape(shape)
     else:
       if not isinstance(var, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
@@ -576,8 +585,8 @@ def _SetShapeInvariants(input_vars, enter_vars, shapes):
               "The shape invariant specified for %s is not compatible with "
               "the initial shape of the values tensor of this IndexedSlices. "
               "It enters the loop with shape %s, but the specified shape "
-              "invariant is %s."
-              % (inp.values.name, inp.values.get_shape(), shape))
+              "invariant is %s." % (inp.values.name, inp.values.get_shape(),
+                                    shape))
         var.values.set_shape(shape)
         var.indices.set_shape(tensor_shape.TensorShape([shape[0]]))
         if var.dense_shape is not None:
@@ -588,8 +597,8 @@ def _SetShapeInvariants(input_vars, enter_vars, shapes):
               "The shape invariant specified for %s is not compatible with "
               "the initial shape of the shape tensor of this SparseTensor. "
               "It enters the loop with shape %s, but the specified shape "
-              "invariant is %s."
-              % (inp.dense_shape.name, inp.dense_shape.get_shape(), shape))
+              "invariant is %s." % (inp.dense_shape.name,
+                                    inp.dense_shape.get_shape(), shape))
         var.values.set_shape(tensor_shape.TensorShape([None]))
         var.indices.set_shape(tensor_shape.TensorShape([None, shape.ndims]))
         var.dense_shape.set_shape(shape)
@@ -612,12 +621,14 @@ def _EnforceShapeInvariant(merge_var, next_var):
     m_shape = merge_var.get_shape()
     n_shape = next_var.get_shape()
     if not _ShapeLessThanOrEqual(n_shape, m_shape):
+      # TODO(skyewm): get original loop input that caused the shape error and
+      # report its name instead of the merge node's.
       raise ValueError(
           "The shape for %s is not an invariant for the loop. It enters "
           "the loop with shape %s, but has shape %s after one iteration. "
           "Provide shape invariants using either the `shape_invariants` "
-          "argument of tf.while_loop or set_shape() on the loop variables."
-          % (merge_var.name, m_shape, n_shape))
+          "argument of tf.while_loop or set_shape() on the loop variables." %
+          (merge_var.name, m_shape, n_shape))
   else:
     if not isinstance(var, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
       raise TypeError("Type %s not supported" % type(var))
@@ -640,9 +651,9 @@ def _EnforceShapeInvariant(merge_var, next_var):
               "the loop with shape (%s, %s, %s), but has shape (%s, %s, %s) "
               "after one iteration. Provide shape invariants using either the "
               "`shape_invariants` argument of tf.while_loop or set_shape() "
-              "on the loop variables."
-              % (merge_var.name, m_values_shape, m_indices_shape, m_shape_shape,
-                 n_values_shape, n_indices_shape, n_shape_shape))
+              "on the loop variables." %
+              (merge_var.name, m_values_shape, m_indices_shape, m_shape_shape,
+               n_values_shape, n_indices_shape, n_shape_shape))
     else:
       m_values_shape = merge_var.values.get_shape()
       m_indices_shape = merge_var.indices.get_shape()
@@ -654,21 +665,27 @@ def _EnforceShapeInvariant(merge_var, next_var):
           not _ShapeLessThanOrEqual(n_indices_shape, m_indices_shape) or
           not _ShapeLessThanOrEqual(n_shape_shape, m_shape_shape)):
         raise ValueError(
-          "The shape for %s is not an invariant for the loop. It enters "
-          "the loop with shape (%s, %s, %s), but has shape (%s, %s, %s) "
-          "after one iteration. Provide shape invariants using either "
-          "the `shape_invariants` argument of tf.while_loop or set_shape() "
-          "on the loop variables."
-          % (merge_var.name, m_values_shape, m_indices_shape, m_shape_shape,
+            "The shape for %s is not an invariant for the loop. It enters "
+            "the loop with shape (%s, %s, %s), but has shape (%s, %s, %s) "
+            "after one iteration. Provide shape invariants using either "
+            "the `shape_invariants` argument of tf.while_loop or set_shape() "
+            "on the loop variables." %
+            (merge_var.name, m_values_shape, m_indices_shape, m_shape_shape,
              n_values_shape, n_indices_shape, n_shape_shape))
 
 
-def _AddNextAndBackEdge(m, v):
+def _AddNextAndBackEdge(m, v, enforce_shape_invariant=True):
   """Add NextIteration and back edge from v to m."""
   if isinstance(m, ops.Tensor):
     v = ops.convert_to_tensor(v)
     v = _NextIteration(v)
-    m.op._update_input(1, v)   # pylint: disable=protected-access
+    if enforce_shape_invariant:
+      # Make sure the shapes of loop outputs are correct. We do this before
+      # calling _update_input, which will raise a less-helpful error message if
+      # the types don't match.
+      # TODO(skyewm): call this for other cases below (needs testing)
+      _EnforceShapeInvariant(m, v)
+    m.op._update_input(1, v)  # pylint: disable=protected-access
   elif isinstance(m, ops.IndexedSlices):
     # pylint: disable=protected-access
     v = math_ops._as_indexed_slices(v, optimize=False)
@@ -694,6 +711,77 @@ def _AddNextAndBackEdge(m, v):
   return v
 
 
+def GetMaxSizeFromNestedMaximumIterations(value, while_ctxt):
+  """Calculate a max_size for use by stack ops inside an XLA while_loop.
+
+  Args:
+    value: The value inside the while_loop forward context.  Used for printing
+      error messages.
+    while_ctxt: The forward context inside which value resides.  This does
+      not always match the value's immediate context, as `value` may be
+      inside e.g. a cond context inside the while_loop.
+
+  Returns:
+    A tensor containing the `max_size` to feed to a Stack initializer.
+
+  Raises:
+    ValueError: If `value` is nested inside a `while_loop` that either
+      lacks a `maximum_iterations` parameter, or the `maximum_iterations`
+      parameter:
+
+        - is inside a `while_loop` that is a parent of the calling context, and
+        - cannot be evaluated at graph build time to a constant.
+  """
+  value_name = value.name
+  # curr_ctxt is the context that tf.gradients was called in.
+  curr_ctxt = ops.get_default_graph()._get_control_flow_context()  # pylint: disable=protected-access
+
+  curr_ctxt_name = curr_ctxt.name if curr_ctxt is not None else ""
+  max_size = constant_op.constant(1)
+
+  # Loop through all containing while contexts between value and the
+  # current context, multiplying together each context's
+  # max_iterations to get the maximum stack size.
+  while while_ctxt not in (None, curr_ctxt):
+    max_iter = while_ctxt.maximum_iterations
+    if max_iter is None:
+      raise ValueError(
+          "Cannot create a gradient accumulator for tensor '%s' inside "
+          "XLA while_loop because maximum_iterations was not passed to "
+          "the tf.while_loop call ('%s')." % (value_name, while_ctxt.name))
+
+    # pylint: disable=protected-access
+    max_iter_ctxt = max_iter.op._get_control_flow_context()
+    # pylint: enable=protected-access
+
+    # If max_iter_ctxt (non-strictly) contains curr_ctxt, then it's OK to use.
+    if util.IsContainingContext(curr_ctxt, max_iter_ctxt):
+      max_size *= max_iter
+    else:
+      # We cannot use max_iter because it's defined in a nested while
+      # or cond context, so will fail if we try to use it as input to
+      # any ops in curr_ctxt (e.g. max_size or the final accumulator
+      # stack). Attempt to get a constant value out to use instead.
+      const_max_iter = tensor_util.constant_value(max_iter)
+      if const_max_iter is None:
+        raise ValueError(
+            "Cannot create a gradient accumulator for tensor '%s' inside XLA "
+            "while_loop. maximum_iterations tensor '%s' for while_loop context "
+            "'%s' must be statically known (e.g. a constant value or known "
+            "shape dimension), or be defined at or outside the while loop "
+            "context '%s' (currently defined in '%s')." %
+            (value_name, max_iter.name, while_ctxt.name, curr_ctxt_name,
+             max_iter_ctxt.name))
+      max_size *= const_max_iter
+
+    # Find the next outer WhileContext (or stop if we reach the
+    # tf.gradient's context).
+    while_ctxt = util.GetContainingWhileContext(
+        while_ctxt.outer_context, stop_ctxt=curr_ctxt)
+
+  return max_size
+
+
 class GradLoopState(object):
   """The state used for constructing the gradient graph for a while loop.
 
@@ -747,9 +835,11 @@ class GradLoopState(object):
       outer_forward_ctxt = forward_ctxt.outer_context
 
     # Add the forward loop counter.
-    if outer_forward_ctxt: outer_forward_ctxt.Enter()
+    if outer_forward_ctxt:
+      outer_forward_ctxt.Enter()
     cnt, forward_index = forward_ctxt.AddForwardLoopCounter(outer_grad_state)
-    if outer_forward_ctxt: outer_forward_ctxt.Exit()
+    if outer_forward_ctxt:
+      outer_forward_ctxt.Exit()
     self._forward_context = forward_ctxt
     self._forward_index = forward_index
 
@@ -762,25 +852,31 @@ class GradLoopState(object):
 
       outer_grad_ctxt = outer_grad_state.grad_context
       outer_grad_ctxt.Enter()
-      self._grad_context = WhileContext(forward_ctxt.parallel_iterations,
-                                        forward_ctxt.back_prop,
-                                        forward_ctxt.swap_memory,
-                                        forward_ctxt.name,
-                                        self)
+      self._grad_context = WhileContext(
+          maximum_iterations=forward_ctxt.maximum_iterations,
+          parallel_iterations=forward_ctxt.parallel_iterations,
+          back_prop=forward_ctxt.back_prop,
+          swap_memory=forward_ctxt.swap_memory,
+          name=forward_ctxt.name,
+          grad_state=self)
       real_cnt = outer_grad_state.AddBackpropAccumulatedValue(history_cnt, cnt)
       self._grad_index = self._grad_context.AddBackpropLoopCounter(
           real_cnt, outer_grad_state)
       outer_grad_ctxt.Exit()
     else:
-      if outer_forward_ctxt: outer_forward_ctxt.Enter()
-      self._grad_context = WhileContext(forward_ctxt.parallel_iterations,
-                                        forward_ctxt.back_prop,
-                                        forward_ctxt.swap_memory,
-                                        forward_ctxt.name,
-                                        self)
+      if outer_forward_ctxt:
+        outer_forward_ctxt.Enter()
+      self._grad_context = WhileContext(
+          maximum_iterations=forward_ctxt.maximum_iterations,
+          parallel_iterations=forward_ctxt.parallel_iterations,
+          back_prop=forward_ctxt.back_prop,
+          swap_memory=forward_ctxt.swap_memory,
+          name=forward_ctxt.name,
+          grad_state=self)
       self._grad_index = self._grad_context.AddBackpropLoopCounter(
           cnt, outer_grad_state)
-      if outer_forward_ctxt: outer_forward_ctxt.Exit()
+      if outer_forward_ctxt:
+        outer_forward_ctxt.Exit()
 
   @property
   def outer_grad_state(self):
@@ -902,23 +998,35 @@ class GradLoopState(object):
 
     Raises:
       TypeError: For internal errors involving the value condition context.
+      ValueError: If `value` is inside a XLA scope and a valid max size
+        for the stack can't be found.
     """
-    curr_ctxt = ops.get_default_graph()._get_control_flow_context()
+    # curr_ctxt is the context that tf.gradients was called in.
+    curr_ctxt = ops.get_default_graph()._get_control_flow_context()  # pylint: disable=protected-access
     with ops.control_dependencies(None):
-      if curr_ctxt: curr_ctxt.Enter()
+      if curr_ctxt:
+        curr_ctxt.Enter()
       with ops.colocate_with(value):
+        # We only need to pass maximum_iterations to the stack if
+        # we're inside an XLA context.
+        if not util.IsInXLAContext(value.op):
+          max_size = constant_op.constant(-1, dtypes.int32)
+        else:
+          max_size = GetMaxSizeFromNestedMaximumIterations(
+              value, self.forward_context)
         # pylint: disable=protected-access
-        acc = gen_data_flow_ops._stack_v2(-1, value.dtype.base_dtype,
-                                          name="f_acc")
+        acc = gen_data_flow_ops._stack_v2(
+            max_size=max_size, elem_type=value.dtype.base_dtype, name="f_acc")
         # pylint: enable=protected-access
-      if curr_ctxt: curr_ctxt.Exit()
+      if curr_ctxt:
+        curr_ctxt.Exit()
 
       # Make acc available in the forward context.
       enter_acc = self.forward_context.AddValue(acc)
 
       # Add the stack_push op in the context of value.op.
       swap_enabled = self.forward_context.swap_memory
-      value_ctxt = _GetOutputContext(value.op)
+      value_ctxt = util.GetOutputContext(value.op)
       if value_ctxt == self.forward_context:
         # value is not nested in the forward context.
         self.forward_context.Enter()
@@ -932,8 +1040,7 @@ class GradLoopState(object):
       else:
         # value is in a cond context within the forward context.
         if not isinstance(value_ctxt, CondContext):
-          raise TypeError(
-              "value_ctxt is not a CondContext: %s" % value_ctxt)
+          raise TypeError("value_ctxt is not a CondContext: %s" % value_ctxt)
         if dead_branch:
           # The special case for creating a zero tensor for a dead
           # branch of a switch. See ControlFlowState.ZerosLike().
@@ -1028,7 +1135,7 @@ class GradLoopState(object):
       cur_value = value
       cur_grad_state = self
       while True:
-        enter_op = _GetLoopConstantEnter(cur_value)
+        enter_op = util.GetLoopConstantEnter(cur_value)
         if enter_op:
           # Special case: cur_value comes from a constant Enter node.
           cur_value = enter_op.inputs[0]
@@ -1057,8 +1164,8 @@ class GradLoopState(object):
 
       if real_value is None:
         # Add the stack pop op in the grad context.
-        real_value = cur_grad_state.AddBackpropAccumulatedValue(history_value,
-                                                                cur_value)
+        real_value = cur_grad_state.AddBackpropAccumulatedValue(
+            history_value, cur_value)
         if cur_grad_state != self:
           real_value = self._grad_context.AddValue(real_value)
       self._history_map[value.name] = real_value
@@ -1077,11 +1184,11 @@ class ControlFlowState(object):
   """Maintain the mapping from the loops to their grad states."""
 
   def __init__(self):
-    self._map = {}   # maps forward loop context to GradLoopState
+    self._map = {}  # maps forward loop context to GradLoopState
 
   def GetGradState(self, op, before):
     """Return the grad state for this op if it's in a forward loop context."""
-    if before and IsLoopExit(op):
+    if before and util.IsLoopExit(op):
       forward_ctxt = op._get_control_flow_context()
       forward_ctxt = forward_ctxt.outer_context
       if forward_ctxt:
@@ -1241,8 +1348,9 @@ class ControlFlowState(object):
     Returns:
       A zero tensor of the same shape of op.outputs[index].
     """
-    if IsLoopSwitch(op): return None
-    dead_branch = IsSwitch(op)
+    if util.IsLoopSwitch(op):
+      return None
+    dead_branch = util.IsSwitch(op)
     forward_ctxt = _GetWhileContext(op)
     grad_state = self._map.get(forward_ctxt)
     if grad_state is None:
@@ -1284,8 +1392,8 @@ class ControlFlowState(object):
       grad_state.grad_context.Enter()
 
       # Create a zero tensor with the right shape.
-      shape = grad_state.AddBackpropAccumulatedValue(
-          history_zeros_shape, zeros_shape, dead_branch)
+      shape = grad_state.AddBackpropAccumulatedValue(history_zeros_shape,
+                                                     zeros_shape, dead_branch)
       result = array_ops.zeros(shape, val.dtype)
     return result
 
@@ -1316,12 +1424,14 @@ class ControlFlowState(object):
           else:
             # Create a zeros in the outer grad context.
             outer_grad_ctxt = grad_state.grad_context.outer_context
-            if outer_grad_ctxt: outer_grad_ctxt.Enter()
+            if outer_grad_ctxt:
+              outer_grad_ctxt.Enter()
             enter_grad_op = b_merge.op.inputs[0].op
             enter_grad = enter_grad_op.inputs[0]
             grad_shape = array_ops.shape_internal(enter_grad, optimize=False)
             grad_val = array_ops.zeros(grad_shape)
-            if outer_grad_ctxt: outer_grad_ctxt.Exit()
+            if outer_grad_ctxt:
+              outer_grad_ctxt.Exit()
             # Use the zeros for iterations > 0.
             grad_state.grad_context.Enter()
             next_grad_val = _NextIteration(grad_val)
@@ -1342,7 +1452,7 @@ def MaybeCreateControlFlowState(between_op_list, between_ops,
   """
   loop_state = None
   for op in between_op_list:
-    if IsLoopExit(op):
+    if util.IsLoopExit(op):
       if loop_state is None:
         loop_state = ControlFlowState()
       if colocate_gradients_with_ops:
@@ -1353,28 +1463,10 @@ def MaybeCreateControlFlowState(between_op_list, between_ops,
   return loop_state
 
 
-def IsSwitch(op):
-  """Return true if `op` is a Switch."""
-  return op.type == "Switch" or op.type == "RefSwitch"
-
-
-def IsLoopExit(op):
-  """Return true if `op` is an Exit."""
-  return op.type == "Exit" or op.type == "RefExit"
-
-
-def IsLoopSwitch(op):
-  """Return true if `op` is the Switch for a while loop."""
-  if IsSwitch(op):
-    ctxt = op._get_control_flow_context()
-    return ctxt and isinstance(ctxt, WhileContext)
-  return False
-
-
 def ZerosLikeOutsideLoop(op, index):
   """Create zeros_like for the specified output of an op."""
   val = op.outputs[index]
-  if not IsSwitch(op):
+  if not util.IsSwitch(op):
     return array_ops.zeros_like(val, optimize=False)
   else:
     op_ctxt = op._get_control_flow_context()
@@ -1408,11 +1500,13 @@ class ControlFlowContext(object):
   """
 
   def __init__(self, values_def=None, import_scope=None):
+    self._nested_contexts = []
     self._outer_context = ops.get_default_graph()._get_control_flow_context()
+    if self._outer_context:
+      self._outer_context._nested_contexts.append(self)  # pylint: disable=protected-access
     self._context_stack = []
     if values_def:
-      self._init_values_from_proto(values_def,
-                                   import_scope=import_scope)
+      self._init_values_from_proto(values_def, import_scope=import_scope)
     else:
       # Values that have been already seen in this context.
       self._values = set()
@@ -1445,6 +1539,10 @@ class ControlFlowContext(object):
       g.as_graph_element(op)._set_control_flow_context(self)
       # pylint: enable=protected-access
 
+  @property
+  def name(self):
+    return self._name
+
   @property
   def outer_context(self):
     """Return the context containing this context."""
@@ -1458,7 +1556,17 @@ class ControlFlowContext(object):
   def back_prop(self):
     raise NotImplementedError("Abstract method")
 
-  def _to_proto(self, export_scope=None):
+  @abc.abstractmethod
+  def to_control_flow_context_def(self, context_def, export_scope=None):
+    """Serializes this into `context_def`.
+
+    Args:
+      context_def: a `ControlFlowContextDef` protocol buffer.
+      export_scope: Optional `string`. Name scope to remove.
+    """
+    raise NotImplementedError("Abstract method")
+
+  def _to_values_def(self, export_scope=None):
     """Converts the values to a `ValuesDef` protocol buffer.
 
     Args:
@@ -1469,20 +1577,12 @@ class ControlFlowContext(object):
     """
     values_def = control_flow_pb2.ValuesDef()
     values_def.values.extend(
-        [ops.strip_name_scope(v, export_scope)
-         for v in sorted(self._values)])
+        [ops.strip_name_scope(v, export_scope) for v in sorted(self._values)])
     for k, v in self._external_values.items():
       k = ops.strip_name_scope(k, export_scope)
-      values_def.external_values[k] = ops.strip_name_scope(
-          v.name, export_scope)
+      values_def.external_values[k] = ops.strip_name_scope(v.name, export_scope)
     return values_def
 
-  @staticmethod
-  def _from_proto(values_def, import_scope=None):
-    """Returns a `ControlFlowContext` created from `values_def`."""
-    return ControlFlowContext(values_def=values_def,
-                              import_scope=import_scope)
-
   def AddName(self, name):
     self._values.add(name)
 
@@ -1511,7 +1611,7 @@ class ControlFlowContext(object):
     return None
 
   def _IsInOuterContext(self, op):
-    op_ctxt = _GetOutputContext(op)
+    op_ctxt = util.GetOutputContext(op)
     outer_ctxt = self.outer_context
     while outer_ctxt != op_ctxt:
       if outer_ctxt is None:
@@ -1529,13 +1629,17 @@ class ControlFlowContext(object):
     else:
       internal_control_inputs = []
       for x in op.control_inputs:
-        ctxt = _GetOutputContext(x)
+        ctxt = util.GetOutputContext(x)
         if ctxt is not None and ctxt.GetWhileContext() == while_ctxt:
           internal_control_inputs.append(x)
+    external_control_inputs = []
     if len(internal_control_inputs) != len(op.control_inputs):
-      del op.control_inputs[:]
+      external_control_inputs = list(set(op.control_inputs)
+                                     - set(internal_control_inputs))
+      op._remove_all_control_inputs()
       op._add_control_inputs(internal_control_inputs)
-    return internal_control_inputs
+    return internal_control_inputs, external_control_inputs
+
   # pylint: enable=protected-access
 
   def AddInnerOp(self, op):
@@ -1547,12 +1651,29 @@ class ControlFlowContext(object):
     """Returns the pivot node for this context, or None."""
     return None
 
+  def IsWhileContext(self):
+    return False
+
+  def IsCondContext(self):
+    return False
+
+  def IsXLAContext(self):
+    return False
+
+  def __str__(self):
+    return self.name
+
 
 class CondContext(ControlFlowContext):
   """The context for the conditional construct."""
 
-  def __init__(self, pred=None, pivot=None, branch=None,
-               name="cond_text", context_def=None, import_scope=None):
+  def __init__(self,
+               pred=None,
+               pivot=None,
+               branch=None,
+               name="cond_text",
+               context_def=None,
+               import_scope=None):
     """Creates a `CondContext`.
 
     Args:
@@ -1572,9 +1693,9 @@ class CondContext(ControlFlowContext):
     else:
       # Initializes the default fields.
       ControlFlowContext.__init__(self)
-      self._pred = pred         # The boolean tensor for the cond predicate
-      self._pivot = pivot       # The predicate tensor in this branch
-      self._branch = branch     # 0 or 1 representing this branch
+      self._pred = pred  # The boolean tensor for the cond predicate
+      self._pivot = pivot  # The predicate tensor in this branch
+      self._branch = branch  # 0 or 1 representing this branch
 
       # Values considered to have been already seen in this context.
       self._values.add(pred.name)
@@ -1590,19 +1711,14 @@ class CondContext(ControlFlowContext):
     assert isinstance(context_def, control_flow_pb2.CondContextDef)
     # Create from context_def.
     g = ops.get_default_graph()
-    self._name = ops.prepend_name_scope(
-        context_def.context_name, import_scope)
-    self._pred = g.as_graph_element(ops.prepend_name_scope(
-        context_def.pred_name, import_scope))
-    self._pivot = g.as_graph_element(ops.prepend_name_scope(
-        context_def.pivot_name, import_scope))
+    self._name = ops.prepend_name_scope(context_def.context_name, import_scope)
+    self._pred = g.as_graph_element(
+        ops.prepend_name_scope(context_def.pred_name, import_scope))
+    self._pivot = g.as_graph_element(
+        ops.prepend_name_scope(context_def.pivot_name, import_scope))
     self._branch = context_def.branch
-    super(CondContext, self).__init__(values_def=context_def.values_def,
-                                      import_scope=import_scope)
-
-  @property
-  def name(self):
-    return self._name
+    super(CondContext, self).__init__(
+        values_def=context_def.values_def, import_scope=import_scope)
 
   @property
   def pred(self):
@@ -1640,18 +1756,23 @@ class CondContext(ControlFlowContext):
     Returns:
       A `CondContextDef` protocol buffer.
     """
-    if (export_scope is None or
-        self.name.startswith(export_scope)):
+    if (export_scope is None or self.name.startswith(export_scope)):
       context_def = control_flow_pb2.CondContextDef()
-      context_def.context_name = ops.strip_name_scope(
-          self.name, export_scope)
-      context_def.pred_name = ops.strip_name_scope(
-          self._pred.name, export_scope)
-      context_def.pivot_name = ops.strip_name_scope(
-          self._pivot.name, export_scope)
+      context_def.context_name = ops.strip_name_scope(self.name, export_scope)
+      context_def.pred_name = ops.strip_name_scope(self._pred.name,
+                                                   export_scope)
+      context_def.pivot_name = ops.strip_name_scope(self._pivot.name,
+                                                    export_scope)
       context_def.branch = self._branch
-      context_def.values_def.MergeFrom(super(CondContext, self)._to_proto(
+      context_def.values_def.MergeFrom(super(CondContext, self)._to_values_def(
           export_scope))
+      # TODO(b/72868227): enable this once the corresponding control_flow.proto
+      # changes have been checked in (they aren't checked in and this is
+      # disabled for now to ensure forwards compatibility).
+      if False:  # pylint: disable=using-constant-test
+        for nested in self._nested_contexts:
+          nested_def = context_def.nested_contexts.add()
+          nested.to_control_flow_context_def(nested_def)
 
       return context_def
     else:
@@ -1660,8 +1781,21 @@ class CondContext(ControlFlowContext):
   @staticmethod
   def from_proto(context_def, import_scope=None):
     """Returns a `CondContext` object created from `context_def`."""
-    return CondContext(context_def=context_def,
-                       import_scope=import_scope)
+    ret = CondContext(context_def=context_def,
+                      import_scope=import_scope)
+
+    # TODO(b/72868227): remove "if hasattr(...)" once the corresponding
+    # control_flow.proto changes have been checked in (they aren't checked in
+    # and this is here for now to ensure forwards compatibility).
+    if hasattr(context_def, "nested_contexts"):
+      ret.Enter()
+      for nested_def in context_def.nested_contexts:
+        from_control_flow_context_def(nested_def)
+      ret.Exit()
+    return ret
+
+  def to_control_flow_context_def(self, context_def, export_scope=None):
+    context_def.cond_ctxt.CopyFrom(self.to_proto(export_scope=export_scope))
 
   def AddValue(self, val):
     """Add `val` to the current context and its outer context recursively."""
@@ -1720,7 +1854,7 @@ class CondContext(ControlFlowContext):
         op._add_control_input(self._pivot.op)
       # pylint: enable=protected-access
 
-    if self._outer_context or not IsLoopExit(op):
+    if self._outer_context or not util.IsLoopExit(op):
       op.graph.prevent_fetching(op)
 
     if self._outer_context:
@@ -1775,8 +1909,8 @@ class CondContext(ControlFlowContext):
         if original_result is None:
           return no_op(), None
         else:
-          original_result = nest.map_structure(
-              array_ops.identity, original_result)
+          original_result = nest.map_structure(array_ops.identity,
+                                               original_result)
     if original_result is None:
       return None, None
 
@@ -1785,6 +1919,9 @@ class CondContext(ControlFlowContext):
       result = [result]
     return original_result, result
 
+  def IsCondContext(self):
+    return True
+
 
 def _UnpackIfSingleton(res):
   if isinstance(res, (list, _basetuple)) and len(res) == 1:
@@ -1793,13 +1930,19 @@ def _UnpackIfSingleton(res):
     return res
 
 
+# pylint: disable=redefined-outer-name
 # pylint: disable=g-doc-args
+@tf_export("cond")
 @deprecation.deprecated_args(
-    None,
-    "fn1/fn2 are deprecated in favor of the true_fn/false_fn arguments.",
+    None, "fn1/fn2 are deprecated in favor of the true_fn/false_fn arguments.",
     "fn1", "fn2")
-def cond(pred, true_fn=None, false_fn=None, strict=False, name=None,
-         fn1=None, fn2=None):
+def cond(pred,
+         true_fn=None,
+         false_fn=None,
+         strict=False,
+         name=None,
+         fn1=None,
+         fn2=None):
   """Return `true_fn()` if the predicate `pred` is true else `false_fn()`.
 
   `true_fn` and `false_fn` both return lists of output tensors. `true_fn` and
@@ -1958,9 +2101,15 @@ def cond(pred, true_fn=None, false_fn=None, strict=False, name=None,
     merges = [merge(pair)[0] for pair in zip(res_f_flat, res_t_flat)]
     merges = _convert_flows_to_tensorarrays(nest.flatten(orig_res_t), merges)
 
-    # Add to collections
-    ops.add_to_collection(ops.GraphKeys.COND_CONTEXT, context_t)
-    ops.add_to_collection(ops.GraphKeys.COND_CONTEXT, context_f)
+    # Only add non-nested conds to the collection. Any nested control flow will
+    # be encapsulated in the root context.
+    assert context_t.outer_context == context_f.outer_context
+    # TODO(b/72868227): remove "if True..." once the corresponding
+    # control_flow.proto changes have been checked in (they aren't checked in
+    # and this is disabled for now to ensure forwards compatibility).
+    if True or context_t.outer_context is None:
+      ops.add_to_collection(ops.GraphKeys.COND_CONTEXT, context_t)
+      ops.add_to_collection(ops.GraphKeys.COND_CONTEXT, context_f)
 
     merges = nest.pack_sequence_as(structure=orig_res_t, flat_sequence=merges)
 
@@ -1968,7 +2117,10 @@ def cond(pred, true_fn=None, false_fn=None, strict=False, name=None,
     if not strict:
       merges = _UnpackIfSingleton(merges)
     return merges
+
+
 # pylint: enable=g-doc-args
+# pylint: enable=redefined-outer-name
 
 
 def _resource_safe_shape(t):
@@ -1986,12 +2138,19 @@ def _resource_safe_shape(t):
 class WhileContext(ControlFlowContext):
   """The context for the loop construct."""
 
-  def __init__(self, parallel_iterations=10, back_prop=True, swap_memory=False,
-               name="while_context", grad_state=None, context_def=None,
+  def __init__(self,
+               maximum_iterations=None,
+               parallel_iterations=10,
+               back_prop=True,
+               swap_memory=False,
+               name="while_context",
+               grad_state=None,
+               context_def=None,
                import_scope=None):
     """"Creates a `WhileContext`.
 
     Args:
+      maximum_iterations: Optional upper bound on number of loop iterations.
       parallel_iterations: The number of iterations allowed to run in parallel.
       back_prop: Whether backprop is enabled for this while loop.
       swap_memory: Whether GPU-CPU memory swap is enabled for this loop.
@@ -2006,16 +2165,17 @@ class WhileContext(ControlFlowContext):
       self._init_from_proto(context_def, import_scope=import_scope)
     else:
       ControlFlowContext.__init__(self)
-      self._init_from_args(parallel_iterations, back_prop, swap_memory,
-                           name)
+      self._init_from_args(maximum_iterations, parallel_iterations, back_prop,
+                           swap_memory, name)
     # The gradient loop state.
     self._grad_state = grad_state
 
-  def _init_from_args(self, parallel_iterations, back_prop, swap_memory,
-                      name):
+  def _init_from_args(self, maximum_iterations, parallel_iterations, back_prop,
+                      swap_memory, name):
     """Creates a new `WhileContext` from arguments.
 
     Args:
+      maximum_iterations: Optional upper bound on number of loop iterations.
       parallel_iterations: The number of iterations allowed to run in parallel.
       back_prop: Whether backprop is enabled for this while loop.
       swap_memory: Whether GPU-CPU memory swap is enabled for this loop.
@@ -2028,6 +2188,7 @@ class WhileContext(ControlFlowContext):
       raise ValueError("`parallel_iterations` must be a positive integer: "
                        "%s" % parallel_iterations)
     self._name = ops.get_default_graph().unique_name(name)
+    self._maximum_iterations = maximum_iterations
     self._parallel_iterations = parallel_iterations
     self._back_prop = back_prop
     self._swap_memory = swap_memory
@@ -2053,34 +2214,53 @@ class WhileContext(ControlFlowContext):
     assert isinstance(context_def, control_flow_pb2.WhileContextDef)
     # Create from context_def.
     g = ops.get_default_graph()
-    self._name = ops.prepend_name_scope(
-        context_def.context_name, import_scope)
+    self._name = ops.prepend_name_scope(context_def.context_name, import_scope)
+    if context_def.maximum_iterations_name:
+      self._maximum_iterations = g.as_graph_element(
+          ops.prepend_name_scope(context_def.maximum_iterations_name,
+                                 import_scope))
+    else:
+      self._maximum_iterations = None
     self._parallel_iterations = context_def.parallel_iterations
     self._back_prop = context_def.back_prop
     self._swap_memory = context_def.swap_memory
-    self._pivot_for_pred = g.as_graph_element(ops.prepend_name_scope(
-        context_def.pivot_for_pred_name, import_scope))
+    self._pivot_for_pred = g.as_graph_element(
+        ops.prepend_name_scope(context_def.pivot_for_pred_name, import_scope))
     # We use this node to control constants created by the body lambda.
-    self._pivot_for_body = g.as_graph_element(ops.prepend_name_scope(
-        context_def.pivot_for_body_name, import_scope))
+    self._pivot_for_body = g.as_graph_element(
+        ops.prepend_name_scope(context_def.pivot_for_body_name, import_scope))
     # The boolean tensor for loop termination condition. Used in code
     # generation for gradient computation.
     self._pivot = g.as_graph_element(
         ops.prepend_name_scope(context_def.pivot_name, import_scope))
     # The list of exit tensors for loop variables.
-    self._loop_exits = [g.as_graph_element(
-        ops.prepend_name_scope(exit_name, import_scope))
-                        for exit_name in context_def.loop_exit_names]
+    self._loop_exits = [
+        g.as_graph_element(ops.prepend_name_scope(exit_name, import_scope))
+        for exit_name in context_def.loop_exit_names
+    ]
     # The list of enter tensors for loop variables.
-    self._loop_enters = [g.as_graph_element(
-        ops.prepend_name_scope(enter_name, import_scope))
-                         for enter_name in context_def.loop_enter_names]
-    super(WhileContext, self).__init__(values_def=context_def.values_def,
-                                       import_scope=import_scope)
+    self._loop_enters = [
+        g.as_graph_element(ops.prepend_name_scope(enter_name, import_scope))
+        for enter_name in context_def.loop_enter_names
+    ]
+    super(WhileContext, self).__init__(
+        values_def=context_def.values_def, import_scope=import_scope)
+
+    # import_scope causes self.name to be different from the original serialized
+    # context's name. Rewrite "frame_name" attrs with the new name.
+    if import_scope:
+      for tensor_name in self._values:
+        op = g.as_graph_element(tensor_name).op
+        if util.IsLoopEnter(op):
+          # pylint: disable=protected-access
+          op._set_attr("frame_name",
+                       attr_value_pb2.AttrValue(s=compat.as_bytes(self.name)))
+          # pylint: enable=protected-access
 
   @property
-  def name(self):
-    return self._name
+  def maximum_iterations(self):
+    """The maximum number of iterations that will be executed."""
+    return self._maximum_iterations
 
   @property
   def parallel_iterations(self):
@@ -2126,34 +2306,45 @@ class WhileContext(ControlFlowContext):
     Returns:
       A `WhileContextDef` protocol buffer.
     """
-    if (export_scope is None or
-        self.name.startswith(export_scope)):
+    if (export_scope is None or self.name.startswith(export_scope)):
       context_def = control_flow_pb2.WhileContextDef()
-      context_def.context_name = ops.strip_name_scope(
-          self.name, export_scope)
+      context_def.context_name = ops.strip_name_scope(self.name, export_scope)
       context_def.parallel_iterations = self._parallel_iterations
+      if self._maximum_iterations is not None:
+        context_def.maximum_iterations_name = ops.strip_name_scope(
+            self._maximum_iterations.name, export_scope)
       context_def.back_prop = self._back_prop
       context_def.swap_memory = self._swap_memory
       context_def.pivot_for_pred_name = ops.strip_name_scope(
           self._pivot_for_pred.name, export_scope)
       context_def.pivot_for_body_name = ops.strip_name_scope(
           self._pivot_for_body.name, export_scope)
-      context_def.pivot_name = ops.strip_name_scope(
-          self._pivot.name, export_scope)
-      context_def.loop_exit_names.extend(
-          [ops.strip_name_scope(l.name, export_scope)
-           for l in self._loop_exits])
-      context_def.loop_enter_names.extend(
-          [ops.strip_name_scope(l.name, export_scope)
-           for l in self._loop_enters])
+      context_def.pivot_name = ops.strip_name_scope(self._pivot.name,
+                                                    export_scope)
+      context_def.loop_exit_names.extend([
+          ops.strip_name_scope(l.name, export_scope) for l in self._loop_exits
+      ])
+      context_def.loop_enter_names.extend([
+          ops.strip_name_scope(l.name, export_scope) for l in self._loop_enters
+      ])
       context_def.values_def.MergeFrom(
-          super(WhileContext, self)._to_proto(
+          super(WhileContext, self)._to_values_def(
               export_scope=export_scope))
+      # TODO(b/72868227): remove "if True..." once the corresponding
+      # control_flow.proto changes have been checked in (they aren't checked in
+      # and this is disabled for now to ensure forwards compatibility).
+      if False:  # pylint: disable=using-constant-test
+        for nested in self._nested_contexts:
+          nested_def = context_def.nested_contexts.add()
+          nested.to_control_flow_context_def(nested_def)
 
       return context_def
     else:
       return None
 
+  def to_control_flow_context_def(self, context_def, export_scope=None):
+    context_def.while_ctxt.CopyFrom(self.to_proto(export_scope=export_scope))
+
   @staticmethod
   def from_proto(context_def, import_scope=None):
     """Returns a `WhileContext` object created from `context_def`.
@@ -2165,8 +2356,17 @@ class WhileContext(ControlFlowContext):
     Returns:
       A `WhileContext` Python object.
     """
-    return WhileContext(context_def=context_def,
-                        import_scope=import_scope)
+    ret = WhileContext(context_def=context_def,
+                       import_scope=import_scope)
+    # TODO(b/72868227): remove "if hasattr(...)" once the corresponding
+    # control_flow.proto changes have been checked in (they aren't checked in
+    # and this is disabled for now to ensure forwards compatibility).
+    if hasattr(context_def, "nested_contexts"):
+      ret.Enter()
+      for nested_def in context_def.nested_contexts:
+        from_control_flow_context_def(nested_def, import_scope=import_scope)
+      ret.Exit()
+    return ret
 
   def GetWhileContext(self):
     return self
@@ -2190,7 +2390,7 @@ class WhileContext(ControlFlowContext):
         grad_ctxt = grad_ctxt.GetWhileContext()
         if grad_ctxt.grad_state:
           forward_ctxt = _GetWhileContext(val.op)
-          if IsLoopExit(val.op):
+          if util.IsLoopExit(val.op):
             forward_ctxt = forward_ctxt.outer_context
             if forward_ctxt:
               forward_ctxt = forward_ctxt.GetWhileContext()
@@ -2203,8 +2403,11 @@ class WhileContext(ControlFlowContext):
         result = self._outer_context.AddValue(val)
       # Create an Enter to make `result` known to this loop context.
       with ops.control_dependencies(None):
-        enter = _Enter(result, self._name, is_constant=True,
-                       parallel_iterations=self._parallel_iterations)
+        enter = _Enter(
+            result,
+            self._name,
+            is_constant=True,
+            parallel_iterations=self._parallel_iterations)
         enter.graph.prevent_feeding(enter)
         if self._outer_context:
           self._outer_context.AddInnerOp(enter.op)
@@ -2244,14 +2447,12 @@ class WhileContext(ControlFlowContext):
   def _AddOpInternal(self, op):
     """Add `op` to the current context.
 
-    In the case that op has only external data inputs, we remove all of its
-    external control inputs so all its inputs are in the same while loop
-    context. This is valid because op now has an Enter input that has all
-    the right control dependency.
+    We move any external control dependencies of the op to the loop pivot, to
+    ensure they get executed.
     """
     if not op.inputs:
       # Remove any external control dependency on this op
-      control_inputs = self._RemoveExternalControlEdges(op)
+      control_inputs, external_inputs = self._RemoveExternalControlEdges(op)
       # Add a control edge from the control pivot to this op.
       if not control_inputs:
         # pylint: disable=protected-access
@@ -2264,15 +2465,24 @@ class WhileContext(ControlFlowContext):
         x = op.inputs[index]
         real_x = self.AddValue(x)
         if real_x != x:
-          op._update_input(index, real_x)
+          op._update_input(index, real_x)  # pylint: disable=protected-access
       # Remove any external control dependency on this op.
-      self._RemoveExternalControlEdges(op)
+      _, external_inputs = self._RemoveExternalControlEdges(op)
       # Add a control dependency to prevent loop invariants from
       # enabling ops that should not be executed.
       self._MaybeAddControlDependency(op)
       for x in op.outputs:
         self._values.add(x.name)
-    if self._outer_context or not IsLoopExit(op):
+    if external_inputs:
+      # Use an identity to pull control inputs as data inputs. Note that we
+      # ignore ops which don't have outputs. TODO(apassos): fix that
+      with ops.control_dependencies(None):
+        self.Enter()
+        external_inputs = [array_ops.identity(x.outputs[0]).op
+                           for x in external_inputs if x.outputs]
+        self.Exit()
+      op._add_control_inputs(external_inputs)  # pylint: disable=protected-access
+    if self._outer_context or not util.IsLoopExit(op):
       op.graph.prevent_fetching(op)
       for x in op.outputs:
         op.graph.prevent_feeding(x)
@@ -2282,6 +2492,7 @@ class WhileContext(ControlFlowContext):
 
   def _MaybeAddControlDependency(self, op):
     """Add a control input to the op if it only depends on loop invariants."""
+
     def _IsOpFree(op):
       """Determines if `op` needs a control dependency."""
       if op.control_inputs:
@@ -2291,9 +2502,10 @@ class WhileContext(ControlFlowContext):
         return True
       # pylint: enable=protected-access
       for x in op.inputs:
-        if not _IsLoopConstantEnter(x.op):
+        if not util.IsLoopConstantEnter(x.op):
           return False
       return True
+
     if _IsOpFree(op):
       # pylint: disable=protected-access
       op._add_control_input(self.GetControlPivot().op)
@@ -2327,9 +2539,12 @@ class WhileContext(ControlFlowContext):
 
     self.Enter()
     self.AddName(n.name)
-    enter_n = _Enter(n, self._name, is_constant=False,
-                     parallel_iterations=self._parallel_iterations,
-                     name="f_count")
+    enter_n = _Enter(
+        n,
+        self._name,
+        is_constant=False,
+        parallel_iterations=self._parallel_iterations,
+        name="f_count")
     self.loop_enters.append(enter_n)
 
     merge_n = merge([enter_n, enter_n])[0]
@@ -2369,9 +2584,12 @@ class WhileContext(ControlFlowContext):
 
     self.Enter()
     self.AddName(count.name)
-    enter_count = _Enter(count, self._name, is_constant=False,
-                         parallel_iterations=self._parallel_iterations,
-                         name="b_count")
+    enter_count = _Enter(
+        count,
+        self._name,
+        is_constant=False,
+        parallel_iterations=self._parallel_iterations,
+        name="b_count")
     self.loop_enters.append(enter_count)
 
     merge_count = merge([enter_count, enter_count])[0]
@@ -2429,9 +2647,11 @@ class WhileContext(ControlFlowContext):
     # without running any iterations.
     shape = grad.get_shape()
     if shape.is_fully_defined():
-      if self.outer_context: self.outer_context.Enter()
+      if self.outer_context:
+        self.outer_context.Enter()
       acc = constant_op.constant(0, grad.dtype, shape=shape, name="b_acc")
-      if self.outer_context: self.outer_context.Exit()
+      if self.outer_context:
+        self.outer_context.Exit()
     else:
       value = op.inputs[0]
       if (isinstance(self.outer_context, WhileContext) and
@@ -2450,17 +2670,21 @@ class WhileContext(ControlFlowContext):
         acc = array_ops.zeros(real_shape, grad.dtype)
         self.outer_context.Exit()
       else:
-        if self.outer_context: self.outer_context.Enter()
+        if self.outer_context:
+          self.outer_context.Enter()
         zeros_shape = array_ops.shape_internal(value, optimize=False)
         acc = array_ops.zeros(zeros_shape, grad.dtype)
-        if self.outer_context: self.outer_context.Exit()
-      acc._shape = grad.get_shape()  # pylint: disable=protected-access
+        if self.outer_context:
+          self.outer_context.Exit()
 
     self.Enter()
     self.AddName(acc.name)
-    enter_acc = _Enter(acc, self._name, is_constant=False,
-                       parallel_iterations=self._parallel_iterations,
-                       name="b_acc")
+    enter_acc = _Enter(
+        acc,
+        self._name,
+        is_constant=False,
+        parallel_iterations=self._parallel_iterations,
+        name="b_acc")
     self.loop_enters.append(enter_acc)
 
     merge_acc = merge([enter_acc, enter_acc], name="b_acc")[0]
@@ -2493,14 +2717,17 @@ class WhileContext(ControlFlowContext):
     dense_shape = grad.dense_shape
 
     self.Exit()
-    if self.outer_context: self.outer_context.Enter()
+    if self.outer_context:
+      self.outer_context.Enter()
     if values.get_shape().is_fully_defined():
       values_shape = tensor_shape.TensorShape(
           [tensor_shape.Dimension(1)] + values.get_shape().dims[1:])
-      if self.outer_context: self.outer_context.Enter()
-      values_acc = constant_op.constant(0, values.dtype, shape=values_shape,
-                                        name="b_acc")
-      if self.outer_context: self.outer_context.Exit()
+      if self.outer_context:
+        self.outer_context.Enter()
+      values_acc = constant_op.constant(
+          0, values.dtype, shape=values_shape, name="b_acc")
+      if self.outer_context:
+        self.outer_context.Exit()
     else:
       values_shape = _resource_safe_shape(op.inputs[0])[1:]
       values_shape = array_ops.concat([[1], values_shape], 0)
@@ -2509,16 +2736,19 @@ class WhileContext(ControlFlowContext):
     shape_acc = None
     if dense_shape is not None:
       if dense_shape.get_shape().is_fully_defined():
-        if self.outer_context: self.outer_context.Enter()
-        shape_acc = constant_op.constant(0, dense_shape.dtype,
-                                         shape=dense_shape.get_shape())
-        if self.outer_context: self.outer_context.Exit()
+        if self.outer_context:
+          self.outer_context.Enter()
+        shape_acc = constant_op.constant(
+            0, dense_shape.dtype, shape=dense_shape.get_shape())
+        if self.outer_context:
+          self.outer_context.Exit()
       else:
         shape_acc = array_ops.zeros_like(
             array_ops.shape_internal(op.inputs[0], optimize=False),
             optimize=False)
 
-    if self.outer_context: self.outer_context.Exit()
+    if self.outer_context:
+      self.outer_context.Exit()
 
     self.Enter()
     self.AddName(values_acc.name)
@@ -2527,9 +2757,23 @@ class WhileContext(ControlFlowContext):
     if shape_acc is not None:
       self.AddName(shape_acc.name)
       init_acc.append(shape_acc)
-    enter_acc = [_Enter(x, self._name, is_constant=False,
-                        parallel_iterations=self._parallel_iterations,
-                        name="b_acc") for x in init_acc]
+
+    # Set use_input_shape=False since the accumulator tensors will grow in
+    # size. If use_input_shape=True, the _update_input call below will result in
+    # incompatible shapes.
+    enter_acc = [
+        _Enter(
+            x,
+            self._name,
+            is_constant=False,
+            parallel_iterations=self._parallel_iterations,
+            use_input_shape=False,
+            name="b_acc") for x in init_acc
+    ]
+    # Manually set appropriate partial shapes.
+    enter_acc[0].set_shape([None])
+    if values_acc.shape.dims is not None:
+      enter_acc[1].set_shape([None] + values_acc.shape.as_list()[1:])
     self.loop_enters.extend(enter_acc)
 
     merge_acc = [merge([x, x], name="b_acc")[0] for x in enter_acc]
@@ -2542,8 +2786,7 @@ class WhileContext(ControlFlowContext):
     ]
     if shape_acc is not None:
       # For the shape we just keep the maximum
-      acc_indexed_slices.append(
-          math_ops.maximum(dense_shape, switch_acc[2][1]))
+      acc_indexed_slices.append(math_ops.maximum(dense_shape, switch_acc[2][1]))
 
     next_acc = [_NextIteration(x) for x in acc_indexed_slices]
     for xm, xn in zip(merge_acc, next_acc):
@@ -2554,7 +2797,8 @@ class WhileContext(ControlFlowContext):
 
     self.ExitResult(exit_acc)
     return ops.IndexedSlices(
-        indices=exit_acc[0], values=exit_acc[1],
+        indices=exit_acc[0],
+        values=exit_acc[1],
         dense_shape=exit_acc[2] if shape_acc is not None else None)
 
   def _InitializeValues(self, values):
@@ -2587,10 +2831,14 @@ class WhileContext(ControlFlowContext):
     if self._outer_context:
       real_vars = [self._outer_context.AddValue(x) for x in loop_vars]
     with ops.control_dependencies(None):
-      enter_vars = [_Enter(x, self._name, is_constant=False,
-                           parallel_iterations=self._parallel_iterations,
-                           use_input_shape=(shape_invariants is None))
-                    for x in real_vars]
+      enter_vars = [
+          _Enter(
+              x,
+              self._name,
+              is_constant=False,
+              parallel_iterations=self._parallel_iterations,
+              use_input_shape=(shape_invariants is None)) for x in real_vars
+      ]
       for x in enter_vars:
         x.graph.prevent_feeding(x)
         if self._outer_context:
@@ -2607,7 +2855,7 @@ class WhileContext(ControlFlowContext):
 
     if control_pivot is not None:
       for var in enter_vars:
-        if _IsLoopConstantEnter(var.op.inputs[0].op):
+        if util.IsLoopConstantEnter(var.op.inputs[0].op):
           # pylint: disable=protected-access
           var.op._add_control_input(control_pivot.op)
           # pylint: enable=protected-access
@@ -2651,11 +2899,13 @@ class WhileContext(ControlFlowContext):
       summary_ref = ops.get_collection_ref(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
       summary_ref[:] = pre_summaries
       with ops.control_dependencies(new_summaries):
+
         def map_fn(x):
           # TODO(apassos) figure out how to trigger with tensor arrays as well
           if isinstance(x, tensor_array_ops.TensorArray):
             return x
           return array_ops.identity(x)
+
         body_result = nest.map_structure(map_fn, body_result)
 
     # Compare the structure types of input and output of body.
@@ -2683,11 +2933,6 @@ class WhileContext(ControlFlowContext):
     exit_vars = [exit(x[0]) for x in switch_vars]
     self._loop_exits = exit_vars
 
-    # Make sure the shapes of loop outputs are correct.
-    for m_var, n_var in zip(merge_vars, next_vars):
-      if isinstance(m_var, ops.Tensor):
-        _EnforceShapeInvariant(m_var, n_var)
-
     # Exit the loop.
     self.ExitResult(exit_vars)
 
@@ -2717,8 +2962,7 @@ class WhileContext(ControlFlowContext):
     packed_exit_vars = nest.pack_sequence_as(
         structure=original_body_result,
         flat_sequence=exit_vars_with_tensor_arrays)
-    return (packed_exit_vars[0] if len(exit_vars) == 1
-            else packed_exit_vars)
+    return (packed_exit_vars[0] if len(exit_vars) == 1 else packed_exit_vars)
 
   def _FixControlInputsAndContext(self, enters):
     graph = ops.get_default_graph()
@@ -2736,17 +2980,29 @@ class WhileContext(ControlFlowContext):
       for x in xs:
         inp_op = x.op.inputs[0].op
         control_inputs = graph._control_dependencies_for_inputs([inp_op])
-        outer_control_inputs = [op for op in control_inputs
-                                if self._IsInOuterContext(op)]
+        outer_control_inputs = [
+            op for op in control_inputs if self._IsInOuterContext(op)
+        ]
         x.op._set_control_flow_context(self)
         x.op._add_control_inputs(outer_control_inputs)
         graph._record_op_seen_by_control_dependencies(x.op)
     # pylint: enable=protected-access
 
+  def IsWhileContext(self):
+    return True
+
 
-def while_loop(cond, body, loop_vars, shape_invariants=None,
-               parallel_iterations=10, back_prop=True, swap_memory=False,
-               name=None):
+# pylint: disable=redefined-outer-name
+@tf_export("while_loop")
+def while_loop(cond,
+               body,
+               loop_vars,
+               shape_invariants=None,
+               parallel_iterations=10,
+               back_prop=True,
+               swap_memory=False,
+               name=None,
+               maximum_iterations=None):
   """Repeat `body` while the condition `cond` is true.
 
   `cond` is a callable returning a boolean scalar tensor. `body` is a callable
@@ -2818,6 +3074,10 @@ def while_loop(cond, body, loop_vars, shape_invariants=None,
     back_prop: Whether backprop is enabled for this while loop.
     swap_memory: Whether GPU-CPU memory swap is enabled for this loop.
     name: Optional name prefix for the returned tensors.
+    maximum_iterations: Optional maximum number of iterations of the while loop
+      to run.  If provided, the `cond` output is AND-ed with an additional
+      condition ensuring the number of iterations executed is no greater than
+      `maximum_iterations`.
 
   Returns:
     The output tensors for the loop variables after the loop. When the length
@@ -2871,18 +3131,61 @@ def while_loop(cond, body, loop_vars, shape_invariants=None,
     if parallel_iterations < 1:
       raise TypeError("parallel_iterations must be a positive integer.")
 
+    if maximum_iterations is not None:
+      maximum_iterations = ops.convert_to_tensor(
+          maximum_iterations, name="maximum_iterations")
+      if maximum_iterations.shape.ndims != 0:
+        raise ValueError("maximum_iterations must be a scalar, saw shape: %s" %
+                         maximum_iterations.shape)
+
+      counter = constant_op.constant(
+          0, dtype=maximum_iterations.dtype, name="iteration_counter")
+      orig_cond = cond
+      orig_body = body
+      if len(loop_vars) == 1:
+        loop_vars = (counter, loop_vars[0])
+        cond = lambda i, lv: (  # pylint: disable=g-long-lambda
+            math_ops.logical_and(i < maximum_iterations, orig_cond(lv)))
+        body = lambda i, lv: (i + 1, orig_body(lv))
+      else:
+        loop_vars = (counter, loop_vars)
+        cond = lambda i, lv: (  # pylint: disable=g-long-lambda
+            math_ops.logical_and(i < maximum_iterations, orig_cond(*lv)))
+        body = lambda i, lv: (i + 1, orig_body(*lv))
+
     if context.in_eager_mode():
       while cond(*loop_vars):
         loop_vars = body(*loop_vars)
-      return loop_vars
+      if maximum_iterations is not None:
+        return loop_vars[1]
+      else:
+        return loop_vars
 
     if shape_invariants is not None:
+      if maximum_iterations is not None:
+        shape_invariants = (tensor_shape.TensorShape([]), shape_invariants)
       nest.assert_same_structure(loop_vars, shape_invariants)
 
-    loop_context = WhileContext(parallel_iterations, back_prop, swap_memory)  # pylint: disable=redefined-outer-name
-    ops.add_to_collection(ops.GraphKeys.WHILE_CONTEXT, loop_context)
+    loop_context = WhileContext(
+        maximum_iterations=maximum_iterations,
+        parallel_iterations=parallel_iterations,
+        back_prop=back_prop,
+        swap_memory=swap_memory)
+    # Only add non-nested loops to the collection. Any nested control flow will
+    # be encapsulated in the root context.
+    # TODO(b/72868227): enable condition once the corresponding
+    # control_flow.proto changes have been checked in (they aren't checked in
+    # and this is disabled for now to ensure forwards compatibility).
+    if True or loop_context.outer_context is None:
+      ops.add_to_collection(ops.GraphKeys.WHILE_CONTEXT, loop_context)
     result = loop_context.BuildLoop(cond, body, loop_vars, shape_invariants)
-    return result
+    if maximum_iterations is not None:
+      return result[1]
+    else:
+      return result
+
+
+# pylint: enable=redefined-outer-name
 
 
 def _AsTensorList(x, p):
@@ -2909,8 +3212,9 @@ def _AsTensorList(x, p):
     if isinstance(v, ops.Tensor):
       l.append(array_ops.identity(v))
     else:
-      l.append(ops.IndexedSlices(array_ops.identity(v.values),
-                                 array_ops.identity(v.indices)))
+      l.append(
+          ops.IndexedSlices(
+              array_ops.identity(v.values), array_ops.identity(v.indices)))
   return l
 
 
@@ -2920,8 +3224,7 @@ def _CheckResults(a, b):
   for x, y in zip(a, b):
     assert x.dtype == y.dtype, (
         "Values returned by a() [%s] and b() [%s] must have "
-        "the same type: %s, %s." %
-        (x.name, y.name, x.dtype.name, y.dtype.name))
+        "the same type: %s, %s." % (x.name, y.name, x.dtype.name, y.dtype.name))
 
 
 def with_dependencies(dependencies, output_tensor, name=None):
@@ -2957,9 +3260,9 @@ def with_dependencies(dependencies, output_tensor, name=None):
         if isinstance(output_tensor, ops.Tensor):
           return _Identity(output_tensor, name=name)
         else:
-          return ops.IndexedSlices(_Identity(output_tensor.values, name=name),
-                                   output_tensor.indices,
-                                   output_tensor.dense_shape)
+          return ops.IndexedSlices(
+              _Identity(output_tensor.values, name=name), output_tensor.indices,
+              output_tensor.dense_shape)
 
 
 def _GroupControlDeps(dev, deps, name=None):
@@ -2972,6 +3275,7 @@ def _GroupControlDeps(dev, deps, name=None):
 
 
 # TODO(touts): Accept "inputs" as a list.
+@tf_export("group")
 def group(*inputs, **kwargs):
   """Create an op that groups multiple operations.
 
@@ -3030,6 +3334,7 @@ def group(*inputs, **kwargs):
     def device_key(dev):
       """A sort key that allows None to be compared to strings."""
       return "" if dev is None else dev
+
     for dev in sorted(six.iterkeys(ops_on_device), key=device_key):
       deps.append(_GroupControlDeps(dev, ops_on_device[dev]))
 
@@ -3037,7 +3342,8 @@ def group(*inputs, **kwargs):
       return no_op(name=name)
 
 
-def tuple(tensors, name=None, control_inputs=None):
+@tf_export("tuple")
+def tuple(tensors, name=None, control_inputs=None):  # pylint: disable=redefined-builtin
   """Group tensors together.
 
   This creates a tuple of tensors with the same values as the `tensors`
@@ -3095,23 +3401,106 @@ def tuple(tensors, name=None, control_inputs=None):
     return tpl
 
 
-def _assert_exclusive(preds):
-  """Returns an Assert op that checks that the predicates are exclusive."""
-  preds_c = array_ops.stack(preds, name="preds_c")
+def _assert_at_most_n_true(predicates, n, msg):
+  """Returns an Assert op that checks that at most n predicates are True.
+
+  Args:
+    predicates: list of bool scalar tensors.
+    n: maximum number of true predicates allowed.
+    msg: Error message.
+  """
+  preds_c = array_ops.stack(predicates, name="preds_c")
   num_true_conditions = math_ops.reduce_sum(
       math_ops.cast(preds_c, dtypes.int32), name="num_true_conds")
-  at_most_one_true_condition = math_ops.less(
-      num_true_conditions, constant_op.constant(2, name="two_true_conds"))
+  condition = math_ops.less_equal(num_true_conditions,
+                                  constant_op.constant(n, name="n_true_conds"))
+  preds_names = ", ".join(getattr(p, "name", "?") for p in predicates)
+  error_msg = [
+      "%s: more than %d conditions (%s) evaluated as True:" %
+      (msg, n, preds_names), preds_c
+  ]
+  return Assert(condition, data=error_msg, summarize=len(predicates))
+
+
+def _case_create_default_action(predicates, actions):
+  """Creates default action for a list of actions and their predicates.
+
+  It uses the input actions to select an arbitrary as default and makes sure
+  that corresponding predicates have valid values.
+
+  Args:
+    predicates: a list of bool scalar tensors
+    actions: a list of callable objects which return tensors.
+
+  Returns:
+    a callable
+  """
+  k = len(predicates) - 1  # could pick any
+  predicate, action = predicates[k], actions[k]
+  other_predicates, other_actions = predicates[:k], actions[:k]
+
+  def default_action():
+    others_msg = ("Implementation error: "
+                  "selected default action #%d was called, but some of other "
+                  "predicates are True: " % k)
+    default_msg = ("Input error: "
+                   "None of conditions evaluated as True:",
+                   array_ops.stack(predicates, name="preds_c"))
+    with ops.control_dependencies([
+        _assert_at_most_n_true(other_predicates, n=0, msg=others_msg),
+        Assert(predicate, data=default_msg)
+    ]):
+      return action()
+
+  return default_action, other_predicates, other_actions
+
+
+def _case_verify_and_canonicalize_args(pred_fn_pairs, exclusive, name):
+  """Verifies input arguments for the case function.
+
+  Args:
+    pred_fn_pairs: Dict or list of pairs of a boolean scalar tensor and a
+                   callable which returns a list of tensors.
+    exclusive: True iff at most one predicate is allowed to evaluate to `True`.
+    name: A name for the case operation.
 
-  error_msg = [("More than one condition evaluated as True but "
-                "exclusive=True.  Conditions: (%s), Values:"
-                % ", ".join([p.name for p in preds])),
-               preds_c]
-  return Assert(condition=at_most_one_true_condition, data=error_msg,
-                summarize=len(preds))
+  Raises:
+    TypeError: If `pred_fn_pairs` is not a list/dictionary.
+    TypeError: If `pred_fn_pairs` is a list but does not contain 2-tuples.
+    TypeError: If `fns[i]` is not callable for any i, or `default` is not
+               callable.
 
+  Returns:
+    a tuple <list of scalar bool tensors, list of callables>.
+  """
+  if not isinstance(pred_fn_pairs, (list, _basetuple, dict)):
+    raise TypeError("fns must be a list, tuple, or dict")
 
-def case(pred_fn_pairs, default=None, exclusive=False, strict=False,
+  if isinstance(pred_fn_pairs, collections.OrderedDict):
+    pred_fn_pairs = pred_fn_pairs.items()
+  elif isinstance(pred_fn_pairs, dict):
+    pred_fn_pairs = sorted(pred_fn_pairs.items(), key=lambda item: item[0].name)
+    if not exclusive:
+      logging.warn("%s: An unordered dictionary of predicate/fn pairs was "
+                   "provided, but exclusive=False. The order of conditional "
+                   "tests is deterministic but not guaranteed.", name)
+  for pred_fn_pair in pred_fn_pairs:
+    if not isinstance(pred_fn_pair, _basetuple) or len(pred_fn_pair) != 2:
+      raise TypeError("Each entry in pred_fn_pairs must be a 2-tuple")
+    pred, fn = pred_fn_pair
+    if pred.dtype != dtypes.bool:
+      raise TypeError("pred must be of type bool: %s", pred.name)
+    if not callable(fn):
+      raise TypeError("fn for pred %s must be callable." % pred.name)
+  predicates, actions = zip(*pred_fn_pairs)
+  return predicates, actions
+
+
+@tf_export("case")
+def case(pred_fn_pairs,
+         default=None,
+         exclusive=False,
+         strict=False,
          name="case"):
   """Create a case operation.
 
@@ -3196,160 +3585,74 @@ def case(pred_fn_pairs, default=None, exclusive=False, strict=False,
     TypeError: If `pred_fn_pairs` is a list but does not contain 2-tuples.
     TypeError: If `fns[i]` is not callable for any i, or `default` is not
                callable.
-    ValueError: If in eager mode and all predicates are false and no
-               default is provided.
-    ValueError: If in eager mode and is passed a dictionary.
   """
-  pfp = pred_fn_pairs  # For readability
-  if not (isinstance(pfp, list) or isinstance(pfp, _basetuple)
-          or isinstance(pfp, dict)):
-    raise TypeError("fns must be a list, tuple, or dict")
-  if isinstance(pfp, dict):
-    if context.in_eager_mode():
-      raise ValueError(
-          "In eager mode the predicates must be a list, not a dictionary.")
-    if isinstance(pfp, collections.OrderedDict):
-      pfp = pfp.items()
+  predicates, actions = _case_verify_and_canonicalize_args(
+      pred_fn_pairs, exclusive, name)
+  with ops.name_scope(name, "case", [predicates]):
+    if default is None:
+      default, predicates, actions = _case_create_default_action(
+          predicates, actions)
+    fn = default
+    # To eval conditions in direct order we create nested conditions in reverse:
+    #   cond(c[0], true_fn=.., false_fn=cond(c[1], ...))
+    for predicate, action in reversed(list(zip(predicates, actions))):
+      fn = functools.partial(
+          cond, predicate, true_fn=action, false_fn=fn, strict=strict)
+    if exclusive:
+      with ops.control_dependencies([
+          _assert_at_most_n_true(
+              predicates, n=1, msg="Input error: exclusive=True")
+      ]):
+        return fn()
     else:
-      pfp = sorted(pfp.items(), key=lambda item: item[0].name)
-      if not exclusive:
-        logging.warn("%s: An unordered dictionary of predicate/fn pairs was "
-                     "provided, but exclusive=False. The order of conditional "
-                     "tests is deterministic but not guaranteed.", name)
-  for tup in pfp:
-    if not isinstance(tup, _basetuple) or len(tup) != 2:
-      raise TypeError("Each entry in pred_fn_pairs must be a 2-tuple")
-    pred, fn = tup
-    if pred.dtype != dtypes.bool:
-      raise TypeError("pred must be of type bool: %s", pred.name)
-    if not callable(fn):
-      raise TypeError("fn for pred %s must be callable." % pred.name)
+      return fn()
 
-  if default is not None and not callable(default):
-    raise TypeError("default must be callable.")
 
-  if context.in_eager_mode():
-    for pred, fn in pfp:
-      if pred:
-        return fn()
-    if default is None:
-      raise ValueError("tf.case received all false predicates and no default.")
-    return default()
+class XLAControlFlowContext(ControlFlowContext):
+  """Base class for XLA and TPU control flow contexts."""
 
-  preds, fns = map(list, zip(*pfp))
-  del pfp  # From now on, preds and fns form the source of truth.
+  def __init__(self):
+    super(XLAControlFlowContext, self).__init__()
+    self._name = "XLAControlFlowContext"
 
-  with ops.name_scope(name, "case", [preds]):
-    exclusivity_assert = _assert_exclusive(preds) if exclusive else None
-    # If no default is provided, then we remove one of the (predicate, function)
-    # pairs and define the default to be the removed function with an additional
-    # control dependency that asserts that the removed predicate holds.
-    if default is None:
-      all_preds = _basetuple(preds)  # For the error message.
-      last_pred, last_fn = preds.pop(), fns.pop()
-      def new_default():
-        preds_c = array_ops.stack(all_preds, name="preds_c")
-        error_msg = [
-            ("None of the conditions evaluated as True. Conditions: (%s), "
-             "Values:" % ", ".join([p.name for p in all_preds])),
-            preds_c]
-        assertion = Assert(condition=last_pred,
-                           data=error_msg, summarize=len(all_preds))
-        with ops.control_dependencies([assertion]):
-          return last_fn()
-      default = new_default
-
-    if not preds:
-      return default()
-    not_preds = []
-    for i, p in enumerate(preds):
-      with ops.name_scope("not_%d" % i):
-        not_preds.append(math_ops.logical_not(p))
-    and_not_preds = [constant_op.constant(True, name="always_true")]
-    for i, notp in enumerate(not_preds):
-      with ops.name_scope("and_not_%d" % i):
-        and_not_preds.append(math_ops.logical_and(and_not_preds[-1], notp))
-
-    # preds = [p1, p2, p3]
-    # fns = [f1, f2, f3]
-    # not_preds = [~p1, ~p2, ~p3]
-    # and_not_preds = [True, ~p1, ~p1 & ~p2, ~p1 & ~p2 & ~p3]
-    # case_preds = [p1,
-    #               p2 & ~p1,
-    #               p3 & ~p2 & ~p1,
-    #              ~p3 & ~p2 & ~p1]
-
-    case_preds = []
-    for i, (p, and_not_p_prev) in enumerate(zip(preds, and_not_preds[:-1])):
-      with ops.name_scope("case_%d" % i):
-        case_preds.append(math_ops.logical_and(p, and_not_p_prev))
-    with ops.name_scope("case_none_are_true"):
-      case_preds.append(and_not_preds[-1])
-
-    # Create an empty tensor, or list, with the right type and shape
-    with ops.name_scope("case_create_empty"):
-      def _create_empty_constant(dtype, shape):
-        value = ("" if dtype == dtypes.string else dtype.as_numpy_dtype())
-        if shape.ndims is None:
-          return array_ops.constant(value, dtype=dtype)
-        else:
-          temp_shape = [1 if x.value is None else x.value for x in shape]
-          result = array_ops.constant(value, shape=temp_shape, dtype=dtype)
-          result._shape = shape  # pylint: disable=protected-access
-          return result
-
-      def _correct_empty(v):
-        if isinstance(v, ops.Operation):
-          return no_op()
-        elif isinstance(v, tensor_array_ops.TensorArray):
-          return v
-        elif not hasattr(v, "dtype"):
-          return ops.convert_to_tensor(v)
-        elif isinstance(v, sparse_tensor.SparseTensor):
-          return sparse_tensor.SparseTensor(indices=[[0] * len(v.get_shape())],
-                                            values=[v.dtype.as_numpy_dtype()],
-                                            dense_shape=v.get_shape())
-        else:
-          return _create_empty_constant(v.dtype, v.get_shape())
-
-      empty = lambda: nest.map_structure(_correct_empty, default())
-
-    # case_sequence = [
-    #   cond(~p3 & ~p2 & ~p1, default, empty),
-    #   cond(p3 & ~p2 & ~p1, f3, lambda: case_sequence[0]),
-    #   cond(p2 & ~p1, f2, lambda: case_sequence[1]),
-    #   cond(p1, f1, lambda: case_sequence[2])
-    # ]
-    #
-    # And the return value will be case_sequence[-1]
-    def _build_case():
-      all_fns = [fn for fn in fns]
-      all_fns.append(default)
-      prev_case = None
-      for i, (cp, fn) in enumerate(list(zip(case_preds, all_fns))[::-1]):
-        prev_case = cond(
-            cp, fn,
-            empty if i == 0 else lambda: prev_case,
-            strict=strict, name="If_%d" % i)
-      return prev_case
-
-    if exclusivity_assert is not None:
-      with ops.control_dependencies([exclusivity_assert]):
-        case_seq = _build_case()
-    else:
-      case_seq = _build_case()
+  def IsXLAContext(self):
+    return True
 
-    if not strict:
-      case_seq = _UnpackIfSingleton(case_seq)
-    return case_seq
+  def AddOp(self, _):
+    pass
+
+  def AddValue(self, x):
+    return x
+
+
+def from_control_flow_context_def(context_def, import_scope=None):
+  """Deserializes `context_def` into the appropriate ControlFlowContext.
+
+  Args:
+    context_def: ControlFlowContextDef proto
+    import_scope: Optional `string`. Name scope to add.
+
+  Returns:
+    A ControlFlowContext subclass
+  """
+  if context_def.HasField("cond_ctxt"):
+    return CondContext.from_proto(context_def.cond_ctxt,
+                                  import_scope=import_scope)
+  if context_def.HasField("while_ctxt"):
+    return WhileContext.from_proto(context_def.while_ctxt,
+                                   import_scope=import_scope)
+  raise NotImplementedError("Unknown ControlFlowContextDef field: %s"
+                            % context_def.WhichOneof("ctxt"))
 
 
-ops.register_proto_function(ops.GraphKeys.COND_CONTEXT,
-                            proto_type=control_flow_pb2.CondContextDef,
-                            to_proto=CondContext.to_proto,
-                            from_proto=CondContext.from_proto)
+ops.register_proto_function(
+    ops.GraphKeys.COND_CONTEXT,
+    proto_type=control_flow_pb2.CondContextDef,
+    to_proto=CondContext.to_proto,
+    from_proto=CondContext.from_proto)
 
-ops.register_proto_function(ops.GraphKeys.WHILE_CONTEXT,
-                            proto_type=control_flow_pb2.WhileContextDef,
-                            to_proto=WhileContext.to_proto,
-                            from_proto=WhileContext.from_proto)
+ops.register_proto_function(
+    ops.GraphKeys.WHILE_CONTEXT,
+    proto_type=control_flow_pb2.WhileContextDef,
+    to_proto=WhileContext.to_proto,
+    from_proto=WhileContext.from_proto)
diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py
index 3e8f39dd240af3a5030d259603ab648d50c27cd3..f22f3059d139d1bb7c7db57a2939184f1089f397 100644
--- a/tensorflow/python/ops/control_flow_ops_test.py
+++ b/tensorflow/python/ops/control_flow_ops_test.py
@@ -51,6 +51,7 @@ TestTuple = collections.namedtuple("TestTuple", "a b")
 SingletonTestTuple = collections.namedtuple("SingletonTestTuple", "a")
 
 
+@test_util.with_c_api
 class GroupTestCase(test_util.TensorFlowTestCase):
 
   def _StripNode(self, nd):
@@ -132,6 +133,7 @@ class GroupTestCase(test_util.TensorFlowTestCase):
         control_flow_ops.group(1, 2)
 
 
+@test_util.with_c_api
 class ShapeTestCase(test_util.TensorFlowTestCase):
 
   def testShape(self):
@@ -143,6 +145,7 @@ class ShapeTestCase(test_util.TensorFlowTestCase):
                             [constant_op.constant(1.0)], tensor).get_shape())
 
 
+@test_util.with_c_api
 class WithDependenciesTestCase(test_util.TensorFlowTestCase):
 
   def testTupleDependencies(self):
@@ -174,6 +177,7 @@ class WithDependenciesTestCase(test_util.TensorFlowTestCase):
         self.assertEquals(1, counter.eval())
 
 
+@test_util.with_c_api
 class SwitchTestCase(test_util.TensorFlowTestCase):
 
   def testIndexedSlicesWithDenseShape(self):
@@ -185,7 +189,7 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
       zero = constant_op.constant(0)
       one = constant_op.constant(1)
       less_op = math_ops.less(zero, one)
-      switch_false, switch_true = control_flow_ops.switch(data, less_op)
+      _, switch_true = control_flow_ops.switch(data, less_op)
       self.assertAllEqual([1, 2, 3], switch_true.values.eval())
       self.assertAllEqual([0, 1], switch_true.indices.eval())
 
@@ -195,16 +199,17 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
           "embedding_matrix", [5, 5],
           initializer=init_ops.random_normal_initializer())
 
-      def Cond(it, _):
+      def cond(it, _):
         return it < 5
 
-      def Body(it, cost):
+      def body(it, cost):
         embedding = embedding_ops.embedding_lookup(embedding_matrix + 0.0, [0])
         cost += math_ops.reduce_sum(embedding)
         return it + 1, cost
 
       _, cost = control_flow_ops.while_loop(
-          Cond, Body, [constant_op.constant(0), constant_op.constant(0.0)])
+          cond, body, [constant_op.constant(0),
+                       constant_op.constant(0.0)])
       optimizer = momentum.MomentumOptimizer(0.1, 0.9)
       train_op = optimizer.minimize(cost)
       with self.test_session() as sess:
@@ -219,16 +224,17 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
           initializer=[[2.0], [3.0]],
           use_resource=True)
 
-      def Cond(it, _):
+      def cond(it, _):
         return it < 5
 
-      def Body(it, cost):
+      def body(it, cost):
         embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
         cost += math_ops.reduce_sum(embedding)
         return it + 1, cost
 
       _, cost = control_flow_ops.while_loop(
-          Cond, Body, [constant_op.constant(0), constant_op.constant(0.0)])
+          cond, body, [constant_op.constant(0),
+                       constant_op.constant(0.0)])
       with self.test_session() as sess:
         sess.run(variables.global_variables_initializer())
         self.assertAllEqual(10.0, cost.eval())
@@ -240,10 +246,10 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
           initializer=init_ops.random_normal_initializer(),
           use_resource=use_resource)
 
-      def Cond(it, _):
+      def cond(it, _):
         return it < 5
 
-      def Body(it, cost):
+      def body(it, cost):
         embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
         cost = control_flow_ops.cond(
             math_ops.equal(it, 3), lambda: math_ops.square(cost),
@@ -251,7 +257,8 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
         return it + 1, cost
 
       _, cost = control_flow_ops.while_loop(
-          Cond, Body, [constant_op.constant(0), constant_op.constant(0.0)])
+          cond, body, [constant_op.constant(0),
+                       constant_op.constant(0.0)])
 
       dynamic_grads = gradients_impl.gradients(cost, [embedding_matrix])[0]
       dynamic_grads = math_ops.segment_sum(dynamic_grads.values,
@@ -285,15 +292,15 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
             dtype=dtype, size=num_steps)
         initial_i = constant_op.constant(0, dtype=dtypes.int32)
 
-        def Cond(i, _):
+        def cond(i, _):
           return i < num_steps  # pylint: disable=cell-var-from-loop
 
-        def Body(i, outputs):
+        def body(i, outputs):
           x = array_ops.gather(inputs, i)  # pylint: disable=cell-var-from-loop
           outputs = outputs.write(i, x)
           return i + 1, outputs
 
-        _, outputs = control_flow_ops.while_loop(Cond, Body,
+        _, outputs = control_flow_ops.while_loop(cond, body,
                                                  [initial_i, initial_outputs])
 
         outputs = math_ops.reduce_sum(outputs.stack())
@@ -312,15 +319,15 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
             dtype=dtype, dynamic_size=True, size=1)
         initial_i = constant_op.constant(0, dtype=dtypes.int32)
 
-        def Cond(i, _):
+        def cond(i, _):
           return i < array_ops.size(inputs)  # pylint: disable=cell-var-from-loop
 
-        def Body(i, outputs):
+        def body(i, outputs):
           x = array_ops.gather(inputs, i)  # pylint: disable=cell-var-from-loop
           outputs = outputs.write(i, x)
           return i + 1, outputs
 
-        _, outputs = control_flow_ops.while_loop(Cond, Body,
+        _, outputs = control_flow_ops.while_loop(cond, body,
                                                  [initial_i, initial_outputs])
 
         outputs = math_ops.reduce_sum(outputs.stack())
@@ -431,6 +438,7 @@ class CondTest(test_util.TensorFlowTestCase):
           control_flow_ops.cond(True, lambda: x, lambda: x, fn2=lambda: x)
 
 
+@test_util.with_c_api
 class ContextTest(test_util.TensorFlowTestCase):
 
   def testCondContext(self):
@@ -447,18 +455,26 @@ class ContextTest(test_util.TensorFlowTestCase):
               c.to_proto(),
               control_flow_ops.CondContext.from_proto(c.to_proto()).to_proto())
 
-  def testWhileContext(self):
+  def _testWhileContextHelper(self, maximum_iterations=None):
     with self.test_session() as sess:
       i = constant_op.constant(0)
       c = lambda i: math_ops.less(i, 10)
       b = lambda i: math_ops.add(i, 1)
-      control_flow_ops.while_loop(c, b, [i])
+      control_flow_ops.while_loop(
+          c, b, [i], maximum_iterations=maximum_iterations)
       for op in sess.graph.get_operations():
-        c = op._get_control_flow_context()
-        if c:
+        control_flow_context = op._get_control_flow_context()
+        if control_flow_context:
           self.assertProtoEquals(
-              c.to_proto(),
-              control_flow_ops.WhileContext.from_proto(c.to_proto()).to_proto())
+              control_flow_context.to_proto(),
+              control_flow_ops.WhileContext.from_proto(
+                  control_flow_context.to_proto()).to_proto())
+
+  def testWhileContext(self):
+    self._testWhileContextHelper()
+
+  def testWhileContextWithMaximumIterations(self):
+    self._testWhileContextHelper(maximum_iterations=10)
 
   def testControlContextImportScope(self):
     with self.test_session():
@@ -471,8 +487,8 @@ class ContextTest(test_util.TensorFlowTestCase):
       c._values = ["a", "b"]
       c._external_values = {"a": b1}
 
-      c_with_scope = control_flow_ops.ControlFlowContext._from_proto(
-          c._to_proto(), import_scope="test_scope")
+      c_with_scope = control_flow_ops.ControlFlowContext(
+          values_def=c._to_values_def(), import_scope="test_scope")
 
       # _values and _external_values should be have scope prepended.
       self.assertEquals(
@@ -482,12 +498,13 @@ class ContextTest(test_util.TensorFlowTestCase):
 
       # Calling _to_proto() with export_scope should remove "test_scope".
       self.assertProtoEquals(
-          c._to_proto(),
-          c_with_scope._to_proto(export_scope="test_scope"))
+          c._to_values_def(),
+          c_with_scope._to_values_def(export_scope="test_scope"))
+
 
+def _get_nested_shape(nested):
 
-def _GetNestedShape(nested):
-  def _GetShape(tensor):
+  def _get_shape(tensor):
     if isinstance(tensor, tensor_array_ops.TensorArray):
       return tensor_array_ops.TensorArray
     elif isinstance(tensor, ops.IndexedSlices):
@@ -495,10 +512,10 @@ def _GetNestedShape(nested):
     else:
       return tensor.get_shape()
 
-  return nest.map_structure(_GetShape, nested)
+  return nest.map_structure(_get_shape, nested)
 
 
-def _CreateTensorArray(size, shape):
+def _create_tensor_array(size, shape):
   ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=size,
                                     clear_after_read=False)
   for i in range(size):
@@ -506,16 +523,19 @@ def _CreateTensorArray(size, shape):
   return ta
 
 
-def _RawNestedShape(nested_shape):
-  def _RawShape(shape):
+def _raw_nested_shape(nested_shape):
+
+  def _raw_shape(shape):
     if isinstance(shape, tensor_shape.TensorShape) and shape.ndims is not None:
       return [x.value for x in shape]
     else:
       return None
-  return nest.map_structure(_RawShape, nested_shape)
+
+  return nest.map_structure(_raw_shape, nested_shape)
 
 
 # TODO(yori): Add tests for indexed slices.
+@test_util.with_c_api
 class DataTypesTest(test_util.TensorFlowTestCase):
 
   def assertAllEqualNested(self, a, b):
@@ -530,17 +550,21 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     condition = array_ops.placeholder(dtypes.bool)
     output_cond = control_flow_ops.cond(condition, fn_true, fn_false,
                                         strict=strict)
-    self.assertEqual(_RawNestedShape(_GetNestedShape(output_cond)),
-                     _RawNestedShape(expected_shape))
+    self.assertEqual(
+        _raw_nested_shape(_get_nested_shape(output_cond)),
+        _raw_nested_shape(expected_shape))
 
     output_case = control_flow_ops.case([(condition, fn_true)], fn_false,
                                         strict=strict)
-    self.assertEqual(_RawNestedShape(_GetNestedShape(output_case)),
-                     _RawNestedShape(expected_shape))
+    self.assertEqual(
+        _raw_nested_shape(_get_nested_shape(output_case)),
+        _raw_nested_shape(expected_shape))
 
   def _testReturnValues(self, fn_true, fn_false, expected_value_true,
                         expected_value_false, strict=False,
-                        check_cond=True):
+                        check_cond=True, feed_dict=None):
+    if feed_dict is None: feed_dict = {}
+
     condition = array_ops.placeholder(dtypes.bool)
     output_cond = control_flow_ops.cond(condition, fn_true, fn_false,
                                         strict=strict)
@@ -549,13 +573,17 @@ class DataTypesTest(test_util.TensorFlowTestCase):
 
     with self.test_session() as sess:
       variables.global_variables_initializer().run()
+      true_feed_dict = {condition: True}
+      true_feed_dict.update(feed_dict)
       result_cond, result_case = sess.run([output_cond, output_case],
-                                          feed_dict={condition: True})
+                                          feed_dict=true_feed_dict)
       self.assertAllEqualNested(result_cond, expected_value_true)
       if check_cond:
         self.assertAllEqualNested(result_case, expected_value_true)
+      false_feed_dict = {condition: False}
+      false_feed_dict.update(feed_dict)
       result_cond, result_case = sess.run([output_cond, output_case],
-                                          feed_dict={condition: False})
+                                          feed_dict=false_feed_dict)
       self.assertAllEqualNested(result_cond, expected_value_false)
       if check_cond:
         self.assertAllEqualNested(result_case, expected_value_false)
@@ -607,59 +635,69 @@ class DataTypesTest(test_util.TensorFlowTestCase):
       control_flow_ops.cond(constant_op.constant(True), fn_tensor, fn_none)
 
   def test_tensors(self):
-    def _BuildTrueBranch(dtype):
-      def _Build():
+
+    def _build_true_branch(dtype):
+
+      def _build():
         return (array_ops.zeros([2, 2], dtype=dtype),
                 array_ops.ones([3, 3], dtype=dtype))
-      return _Build
 
-    def _BuildFalseBranch(dtype):
-      def _Build():
+      return _build
+
+    def _build_false_branch(dtype):
+
+      def _build():
         return (array_ops.ones([2, 2], dtype=dtype),
                 array_ops.zeros([3, 3], dtype=dtype))
-      return _Build
+
+      return _build
 
     for dtype in (dtypes.float16, dtypes.int8, dtypes.int32, dtypes.uint8):
       shape = (tensor_shape.TensorShape([2, 2]),
                tensor_shape.TensorShape([3, 3]))
-      fn_true = _BuildTrueBranch(dtype)
-      fn_false = _BuildFalseBranch(dtype)
+      fn_true = _build_true_branch(dtype)
+      fn_false = _build_false_branch(dtype)
       self._testShape(fn_true, fn_false, shape)
       self._testReturnValues(fn_true, fn_false,
                              (np.zeros([2, 2]), np.ones([3, 3])),
                              (np.ones([2, 2]), np.zeros([3, 3])))
 
   def test_tensors_unknown_shape(self):
-    def _BuildTrueBranch(dtype):
-      def _Build():
-        tensor = array_ops.zeros([2, 2], dtype=dtype)
-        tensor._shape = tensor_shape.TensorShape(None)
+
+    def _build_true_branch(dtype):
+      tensor = array_ops.placeholder(dtype=dtype, shape=None)
+
+      def _build():
         return tensor
-      return _Build
 
-    def _BuildFalseBranch(dtype):
-      def _Build():
-        tensor = array_ops.ones([2, 2], dtype=dtype)
-        tensor._shape = tensor_shape.TensorShape(None)
+      return _build, tensor
+
+    def _build_false_branch(dtype):
+      tensor = array_ops.placeholder(dtype=dtype, shape=None)
+
+      def _build():
         return tensor
-      return _Build
+
+      return _build, tensor
 
     for dtype in (dtypes.float16, dtypes.int8, dtypes.int32, dtypes.uint8):
       shape = tensor_shape.TensorShape(None)
-      fn_true = _BuildTrueBranch(dtype)
-      fn_false = _BuildFalseBranch(dtype)
+      fn_true, true_tensor = _build_true_branch(dtype)
+      fn_false, false_tensor = _build_false_branch(dtype)
       self._testShape(fn_true, fn_false, shape)
       self._testReturnValues(fn_true, fn_false,
-                             np.zeros([2, 2]), np.ones([2, 2]))
+                             np.zeros([2, 2]), np.ones([2, 2]),
+                             feed_dict={true_tensor: np.zeros([2, 2]),
+                                        false_tensor: np.ones([2, 2])})
 
   def test_sparse_tensors(self):
     shape = tensor_shape.TensorShape([None, None])
 
-    def FnTrue():
+    def true_fn():
       return [sparse_tensor.SparseTensor(indices=[[0, 0], [1, 2]],
                                          values=[1, 2], dense_shape=[3, 4])]
 
-    def FnFalse():
+    def false_fn():
       return [sparse_tensor.SparseTensor(indices=[[0, 0], [2, 1]],
                                          values=[3, 4], dense_shape=[3, 4])]
 
@@ -667,38 +705,44 @@ class DataTypesTest(test_util.TensorFlowTestCase):
                                              values=[1, 2], dense_shape=[3, 4])
     value2 = sparse_tensor.SparseTensorValue(indices=[[0, 0], [2, 1]],
                                              values=[3, 4], dense_shape=[3, 4])
-    self._testShape(FnTrue, FnFalse, shape)
-    self._testReturnValues(FnTrue, FnFalse, value1, value2)
-    self._testShape(FnTrue, FnFalse, [shape], strict=True)
-    self._testReturnValues(FnTrue, FnFalse, [value1], [value2], strict=True)
+    self._testShape(true_fn, false_fn, shape)
+    self._testReturnValues(true_fn, false_fn, value1, value2)
+    self._testShape(true_fn, false_fn, [shape], strict=True)
+    self._testReturnValues(true_fn, false_fn, [value1], [value2], strict=True)
 
   def test_tensors_with_partially_specified_shapes(self):
-    def _BuildBranch(dtype, shape):
-      def _Build():
-        a = array_ops.zeros([2, 2], dtype=dtype)
-        b = array_ops.zeros([5], dtype=dtype)
-        c = array_ops.ones([3, 3], dtype=dtype)
-        a._shape = tensor_shape.TensorShape(shape[0])
-        b._shape = tensor_shape.TensorShape(shape[1])
-        c._shape = tensor_shape.TensorShape(shape[2])
+
+    def _build_branch(dtype, shape):
+      a = array_ops.placeholder(dtype=dtype, shape=shape[0])
+      b = array_ops.placeholder(dtype=dtype, shape=shape[1])
+      c = array_ops.placeholder(dtype=dtype, shape=shape[2])
+
+      def _build():
         return a, b, c
-      return _Build
+
+      return _build, (a, b, c)
 
     for dtype in (dtypes.float16, dtypes.int8, dtypes.int32, dtypes.uint8):
       shape = (tensor_shape.TensorShape([None, 2]),
                tensor_shape.TensorShape([None]),
                tensor_shape.TensorShape([3, None]))
-      fn_true = _BuildBranch(dtype, shape)
-      fn_false = _BuildBranch(dtype, shape)
+      fn_true, true_tensors = _build_branch(dtype, shape)
+      fn_false, false_tensors = _build_branch(dtype, shape)
       self._testShape(fn_true, fn_false, shape)
       self._testReturnValues(fn_true, fn_false,
                              (np.zeros([2, 2]), np.zeros(5), np.ones([3, 3])),
-                             (np.zeros([2, 2]), np.zeros(5), np.ones([3, 3])))
+                             (np.zeros([2, 2]), np.zeros(5), np.ones([3, 3])),
+                             feed_dict={true_tensors[0]: np.zeros([2, 2]),
+                                        false_tensors[0]: np.zeros([2, 2]),
+                                        true_tensors[1]: np.zeros([5]),
+                                        false_tensors[1]: np.zeros([5]),
+                                        true_tensors[2]: np.ones([3, 3]),
+                                        false_tensors[2]: np.ones([3, 3])})
 
   def test_tensor_arrays(self):
     element_shape = tensor_shape.TensorShape([2])
-    ta1 = _CreateTensorArray(4, element_shape)
-    ta2 = _CreateTensorArray(4, element_shape)
+    ta1 = _create_tensor_array(4, element_shape)
+    ta2 = _create_tensor_array(4, element_shape)
     shape = tensor_array_ops.TensorArray
     fn_true = lambda: ta1
     fn_false = lambda: ta2
@@ -706,7 +750,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
 
   def test_tensor_array_reads(self):
     shape = tensor_shape.TensorShape([2])
-    ta = _CreateTensorArray(4, shape)
+    ta = _create_tensor_array(4, shape)
     fn_true = lambda: ta.read(0)
     fn_false = lambda: ta.read(1)
     self._testShape(fn_true, fn_false, shape)
@@ -805,23 +849,26 @@ class DataTypesTest(test_util.TensorFlowTestCase):
              tensor_shape.TensorShape([5, 5]),
              tensor_shape.TensorShape([])]
 
-    def FnTrue():
+    def true_fn():
       return [constant_op.constant(1),
               TestTuple(constant_op.constant(2), [3, 4]),
               array_ops.zeros([5, 5]), 6]
 
-    def FnFalse():
+    def false_fn():
       return [constant_op.constant(11),
               TestTuple(constant_op.constant(12), [13, 14]),
               array_ops.ones([5, 5]), 16]
 
-    self._testShape(FnTrue, FnFalse, shape)
-    self._testReturnValues(FnTrue, FnFalse,
-                           [1, TestTuple(2, [3, 4]), np.zeros([5, 5]), 6],
-                           [11, TestTuple(12, [13, 14]), np.ones([5, 5]), 16])
+    self._testShape(true_fn, false_fn, shape)
+    self._testReturnValues(
+        true_fn, false_fn,
+        [1, TestTuple(2, [3, 4]), np.zeros([5, 5]), 6],
+        [11, TestTuple(12, [13, 14]),
+         np.ones([5, 5]), 16])
 
   def test_cond_inside_while_loop(self):
-    def Body(i, matrix):
+
+    def body(i, matrix):
       result_tuple, unused_matrix = control_flow_ops.cond(
           constant_op.constant(True),
           lambda: (TestTuple(matrix * 2, matrix * 4), matrix),
@@ -830,13 +877,15 @@ class DataTypesTest(test_util.TensorFlowTestCase):
 
     iteration, matrix = control_flow_ops.while_loop(
         lambda i, matrix: i < 10,
-        Body,
-        loop_vars=[constant_op.constant(0), array_ops.ones([2, 2])])
+        body,
+        loop_vars=[constant_op.constant(0),
+                   array_ops.ones([2, 2])])
 
     self.assertEqual(iteration.get_shape(), tensor_shape.TensorShape([]))
     self.assertEqual(matrix.get_shape(), tensor_shape.TensorShape([2, 2]))
 
 
+@test_util.with_c_api
 class CaseTest(test_util.TensorFlowTestCase):
 
   def testCase_withDefault(self):
@@ -860,8 +909,7 @@ class CaseTest(test_util.TensorFlowTestCase):
     with self.test_session() as sess:
       self.assertEqual(sess.run(output, feed_dict={x: 1}), 2)
       self.assertEqual(sess.run(output, feed_dict={x: 3}), 8)
-      with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   "More than one condition evaluated as True"):
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, "Input error:"):
         sess.run(output, feed_dict={x: 2})
 
   def testCase_multiple_matches_non_exclusive(self):
@@ -886,11 +934,7 @@ class CaseTest(test_util.TensorFlowTestCase):
       self.assertEqual(sess.run(output, feed_dict={x: 1}), 2)
       self.assertEqual(sess.run(output, feed_dict={x: 2}), 4)
       self.assertEqual(sess.run(output, feed_dict={x: 3}), 6)
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r"\[None of the conditions evaluated as True. "
-          r"Conditions: \(Equal:0, Equal_1:0, Equal_2:0\), Values:\] "
-          r"\[0 0 0\]"):
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, "Input error:"):
         sess.run(output, feed_dict={x: 4})
 
   def testCase_withoutDefault_oneCondition(self):
@@ -899,10 +943,7 @@ class CaseTest(test_util.TensorFlowTestCase):
     output = control_flow_ops.case(conditions, exclusive=True)
     with self.test_session() as sess:
       self.assertEqual(sess.run(output, feed_dict={x: 1}), 2)
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r"\[None of the conditions evaluated as True. "
-          r"Conditions: \(Equal:0\), Values:\] \[0\]"):
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, "Input error:"):
         sess.run(output, feed_dict={x: 4})
 
 
diff --git a/tensorflow/python/ops/control_flow_util.py b/tensorflow/python/ops/control_flow_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..eee31102db57b44ee29cb04ea79aabf003603f2f
--- /dev/null
+++ b/tensorflow/python/ops/control_flow_util.py
@@ -0,0 +1,263 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Utilty functions for control flow.
+
+This file is necessary to avoid cyclic dependencies between ops.py and
+control_flow_ops.py.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import traceback
+
+from tensorflow.python.platform import tf_logging as logging
+
+
+def IsInXLAContext(op):
+  try:
+    xla_compile = op.get_attr("_XlaCompile")
+    if xla_compile: return True
+  except ValueError:
+    pass
+  ctxt = op._get_control_flow_context()  # pylint: disable=protected-access
+  return GetContainingXLAContext(ctxt) is not None
+
+
+def IsInWhileLoop(op):
+  ctxt = op._get_control_flow_context()  # pylint: disable=protected-access
+  return GetContainingWhileContext(ctxt) is not None
+
+
+def IsInCond(op):
+  ctxt = op._get_control_flow_context()  # pylint: disable=protected-access
+  return GetContainingCondContext(ctxt) is not None
+
+
+def IsSwitch(op):
+  """Return true if `op` is a Switch."""
+  return op.type == "Switch" or op.type == "RefSwitch"
+
+
+def IsLoopEnter(op):
+  """Returns true if `op` is an Enter."""
+  return op.type == "Enter" or op.type == "RefEnter"
+
+
+def IsLoopExit(op):
+  """Return true if `op` is an Exit."""
+  return op.type == "Exit" or op.type == "RefExit"
+
+
+def IsLoopSwitch(op):
+  """Return true if `op` is the Switch for a while loop."""
+  if IsSwitch(op):
+    ctxt = op._get_control_flow_context()  # pylint: disable=protected-access
+    return ctxt and ctxt.IsWhileContext()
+  return False
+
+
+def IsLoopConstantEnter(op):
+  """Return true iff op is a loop invariant."""
+  return IsLoopEnter(op) and op.get_attr("is_constant")
+
+
+def GetLoopConstantEnter(value):
+  """Return the enter op if we can infer `value` to be a loop invariant."""
+  id_ops = {"Switch", "RefSwitch", "Identity", "RefIdentity"}
+  op = value.op
+  while op.type in id_ops:
+    op = op.inputs[0].op
+  return op if IsLoopConstantEnter(op) else None
+
+
+def GetOutputContext(op):
+  """Return the control flow context for the output of an op."""
+  ctxt = op._get_control_flow_context()  # pylint: disable=protected-access
+  # Exit nodes usually have a control flow context, except in the case where the
+  # exit node was imported via import_graph_def (in which case no nodes have
+  # control flow contexts).
+  if ctxt is not None and IsLoopExit(op):
+    ctxt = ctxt.outer_context
+  return ctxt
+
+
+def GetContainingWhileContext(ctxt, stop_ctxt=None):
+  """Returns the first ancestor WhileContext of `ctxt`.
+
+  Returns `ctxt` if `ctxt` is a WhileContext, or None if `ctxt` is not in a
+  while loop.
+
+  Args:
+    ctxt: ControlFlowContext
+    stop_ctxt: ControlFlowContext, optional. If provided, the search will end
+      if it sees stop_ctxt.
+
+  Returns:
+    `ctxt` if `ctxt` is a WhileContext, the most nested WhileContext containing
+    `ctxt`, or None if `ctxt` is not in a while loop.  If `stop_ctxt` is not
+    `None`, this returns `ctxt` if it matches `stop_ctxt` in its traversal.
+  """
+  while ctxt:
+    if ctxt.IsWhileContext() or ctxt == stop_ctxt: return ctxt
+    ctxt = ctxt.outer_context
+  return None
+
+
+def GetContainingXLAContext(ctxt):
+  """Returns the first ancestor XLAContext of `ctxt`.
+
+  Returns `ctxt` if `ctxt` is a XLAContext, or None if `ctxt` is not in a
+  while loop.
+
+  Args:
+    ctxt: ControlFlowContext
+
+  Returns:
+    `ctxt` if `ctxt` is a XLAContext, the most nested XLAContext containing
+    `ctxt`, or None if `ctxt` is not in a while loop.
+  """
+  while ctxt:
+    if ctxt.IsXLAContext(): return ctxt
+    ctxt = ctxt.outer_context
+  return None
+
+
+def GetContainingCondContext(ctxt):
+  """Returns the first ancestor CondContext of `ctxt`.
+
+  Returns `ctxt` if `ctxt` is a CondContext, or None if `ctxt` is not in a cond.
+
+  Args:
+    ctxt: ControlFlowContext
+
+  Returns:
+    `ctxt` if `ctxt` is a CondContext, the most nested CondContext containing
+    `ctxt`, or None if `ctxt` is not in a cond.
+  """
+  while ctxt:
+    if ctxt.IsCondContext(): return ctxt
+    ctxt = ctxt.outer_context
+  return None
+
+
+def IsContainingContext(ctxt, maybe_containing_ctxt):
+  """Returns true if `maybe_containing_ctxt` is or contains `ctxt`."""
+  while ctxt is not maybe_containing_ctxt:
+    if ctxt is None: return False
+    ctxt = ctxt.outer_context
+  return True
+
+
+def CheckInputFromValidContext(op, input_op):
+  """Returns whether `input_op` can be used from `op`s context.
+
+  Conceptually, only inputs from op's while context or any ancestor while
+  context (including outside of any context) are valid. In practice, there are
+  many other edge cases as well.
+
+  Args:
+    op: Operation
+    input_op: Operation
+
+  Raises:
+    ValueError: if input_op is from an invalid context.
+  """
+  op_ctxt = op._get_control_flow_context()  # pylint: disable=protected-access
+  input_ctxt = GetOutputContext(input_op)
+  valid = False
+
+  if not input_ctxt:
+    # input_op isn't in a control flow context.
+    valid = True
+  elif op_ctxt is input_ctxt:
+    # input_op is in the same context as op.
+    valid = True
+  else:
+    while_ctxt = GetContainingWhileContext(op_ctxt)
+    input_while_ctxt = GetContainingWhileContext(input_ctxt)
+
+    if while_ctxt is None:
+      if input_while_ctxt is None:
+        # Neither op nor input_op is in a while loop, but one or both are in
+        # conds. We allow this, although execution will fail if the branch
+        # corresponding to input_op's cond context isn't taken.
+        valid = True
+      # Invalid if op isn't in a while loop and input_op is. Unless...
+      if IsLoopEnter(op):
+        # WhileContext._BuildLoop clears context for Enter nodes.
+        valid = True
+      if IsSwitch(op):
+        # CondContext.AddValue clears context for Switch nodes.
+        valid = True
+    elif IsContainingContext(while_ctxt, input_while_ctxt):
+      # input_op is in a while loop which contains op's while loop (or not in a
+      # while loop at all).
+      valid = True
+    elif (while_ctxt.grad_state and
+          IsContainingContext(while_ctxt.grad_state.forward_context,
+                              input_while_ctxt)):
+      # op is in a gradient context and input_op is in the associated forward
+      # pass context or an ancestor thereof. This case is need to build while
+      # loop gradients.
+      # NOTE(skyewm): we theoretically also need this case for custom gradient
+      # functions that close over tensors from ancestor contexts, but I haven't
+      # verified this.
+      valid = True
+    elif (while_ctxt.grad_state and
+          while_ctxt.grad_state.forward_context is
+          input_while_ctxt._outer_context):  # pylint: disable=protected-access
+      # op is in a gradient context and input_op is in a child of the associated
+      # forward pass context. This case is needed for the gradients of while
+      # loops with conds.
+      valid = True
+    elif (input_while_ctxt.grad_state and
+          input_while_ctxt.grad_state.forward_context is while_ctxt):
+      # input_op is in the gradient context of op's context. This case is needed
+      # when the gradient of a while loop gradient is requested (this will
+      # eventually fail unless there is a stop_gradient() or similar).
+      valid = True
+    elif (input_while_ctxt.grad_state and
+          input_ctxt.grad_state.forward_context.grad_state and
+          input_ctxt.grad_state.forward_context.grad_state.forward_context is
+          while_ctxt):
+      # input_op is in the grad grad context of op's context. This case is
+      # needed when the gradient of a while loop gradient is requested (this
+      # will eventually fail unless there is a stop_gradient() or similar).
+      valid = True
+
+  if not valid:
+    if while_ctxt:
+      error_msg = (
+          "Cannot use '%s' as input to '%s' because they are in different while"
+          " loops." % (op.name, input_op.name))
+    else:
+      error_msg = (
+          "Cannot use '%s' as input to '%s' because '%s' is in a while loop."
+          % (input_op.name, op.name, input_op.name))
+
+    # Log the error message plus the relevant stack traces. The stacks may be
+    # useful for debugging this error, but we don't want to raise an
+    # unreadable exception.
+    log_msg = error_msg
+    log_msg += "\n\n%s while context: %s" % (op.name, while_ctxt)
+    log_msg += "\n%s while context: %s" % (input_op.name, input_while_ctxt)
+    log_msg += "\n\nTraceback for %s:\n%s\nTraceback for %s:\n%s\n" % (
+        op.name, "".join(traceback.format_list(op.traceback)),
+        input_op.name, "".join(traceback.format_list(input_op.traceback)))
+    logging.info(log_msg)
+    raise ValueError(error_msg + " See info log for more details.")
diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py
index f037767cf4051d058a2da0cca9c4515fd9705d28..83da6739db673644f59fda3044769b18b2138fbc 100644
--- a/tensorflow/python/ops/ctc_ops.py
+++ b/tensorflow/python/ops/ctc_ops.py
@@ -25,9 +25,11 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_ctc_ops
 from tensorflow.python.ops.nn_grad import _BroadcastMul
+from tensorflow.python.util.tf_export import tf_export
 
 
 # pylint: disable=protected-access, invalid-name
+@tf_export("nn.ctc_loss")
 def ctc_loss(labels, inputs, sequence_length,
              preprocess_collapse_repeated=False,
              ctc_merge_repeated=True,
@@ -185,6 +187,7 @@ def _CTCLossGrad(op, grad_loss, _):
   return [_BroadcastMul(grad_loss, grad_without_gradient), None, None, None]
 
 
+@tf_export("nn.ctc_greedy_decoder")
 def ctc_greedy_decoder(inputs, sequence_length, merge_repeated=True):
   """Performs greedy decoding on the logits given in input (best path).
 
@@ -228,6 +231,7 @@ def ctc_greedy_decoder(inputs, sequence_length, merge_repeated=True):
           log_probabilities)
 
 
+@tf_export("nn.ctc_beam_search_decoder")
 def ctc_beam_search_decoder(inputs, sequence_length, beam_width=100,
                             top_paths=1, merge_repeated=True):
   """Performs beam search decoding on the logits given in input.
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index c186eb5b7ecaa5c74841aca15f0f11e994eba2ea..95e45bff066d4b2653e5de67684a6277006345f2 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #==============================================================================
-
 """Data Flow Operations."""
 # pylint: disable=g-bad-name
 from __future__ import absolute_import
@@ -31,6 +30,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.lib.io import python_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
@@ -38,6 +38,8 @@ from tensorflow.python.ops import math_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_data_flow_ops import *
+from tensorflow.python.util.tf_export import tf_export
+
 # pylint: enable=wildcard-import
 
 
@@ -52,17 +54,19 @@ def _as_type_list(dtypes):
     return list(dtypes)
 
 
-def _as_shape_list(shapes, dtypes, unknown_dim_allowed=False,
+def _as_shape_list(shapes,
+                   dtypes,
+                   unknown_dim_allowed=False,
                    unknown_rank_allowed=False):
   """Convert shapes to a list of tuples of int (or None)."""
   del dtypes
   if unknown_dim_allowed:
-    if (not isinstance(shapes, collections.Sequence)
-        or not shapes
-        or any(shape is None or isinstance(shape, int) for shape in shapes)):
+    if (not isinstance(shapes, collections.Sequence) or not shapes or
+        any(shape is None or isinstance(shape, int) for shape in shapes)):
       raise ValueError(
           "When providing partial shapes, a list of shapes must be provided.")
-  if shapes is None: return None
+  if shapes is None:
+    return None
   if isinstance(shapes, tensor_shape.TensorShape):
     shapes = [shapes]
   if not isinstance(shapes, (tuple, list)):
@@ -101,11 +105,13 @@ def _shape_common(s1, s2):
     return tensor_shape.unknown_shape()
   d = [
       d1 if d1 is not None and d1 == d2 else None
-      for (d1, d2) in zip(s1.as_list(), s2.as_list())]
+      for (d1, d2) in zip(s1.as_list(), s2.as_list())
+  ]
   return tensor_shape.TensorShape(d)
 
 
 # pylint: disable=protected-access
+@tf_export("QueueBase")
 class QueueBase(object):
   """Base class for queue implementations.
 
@@ -192,8 +198,7 @@ class QueueBase(object):
       TypeError: When `queues` is not a list of `QueueBase` objects,
         or when the data types of `queues` are not all the same.
     """
-    if ((not queues) or
-        (not isinstance(queues, list)) or
+    if ((not queues) or (not isinstance(queues, list)) or
         (not all(isinstance(x, QueueBase) for x in queues))):
       raise TypeError("A list of queues expected")
 
@@ -207,12 +212,16 @@ class QueueBase(object):
 
     queue_shapes = [q.shapes for q in queues]
     reduced_shapes = [
-        six.moves.reduce(_shape_common, s) for s in zip(*queue_shapes)]
+        six.moves.reduce(_shape_common, s) for s in zip(*queue_shapes)
+    ]
 
     queue_refs = array_ops.stack([x.queue_ref for x in queues])
     selected_queue = array_ops.gather(queue_refs, index)
-    return QueueBase(dtypes=dtypes, shapes=reduced_shapes, names=names,
-                     queue_ref=selected_queue)
+    return QueueBase(
+        dtypes=dtypes,
+        shapes=reduced_shapes,
+        names=names,
+        queue_ref=selected_queue)
 
   @property
   def queue_ref(self):
@@ -279,8 +288,8 @@ class QueueBase(object):
 
     tensors = []
     for i, (val, dtype) in enumerate(zip(vals, self._dtypes)):
-      tensors.append(ops.convert_to_tensor(val, dtype=dtype,
-                                           name="component_%d" % i))
+      tensors.append(
+          ops.convert_to_tensor(val, dtype=dtype, name="component_%d" % i))
 
     return tensors
 
@@ -552,11 +561,13 @@ class QueueBase(object):
       name = "%s_Close" % self._name
     if self._queue_ref.dtype == _dtypes.resource:
       return gen_data_flow_ops._queue_close_v2(
-          self._queue_ref, cancel_pending_enqueues=cancel_pending_enqueues,
+          self._queue_ref,
+          cancel_pending_enqueues=cancel_pending_enqueues,
           name=name)
     else:
       return gen_data_flow_ops._queue_close(
-          self._queue_ref, cancel_pending_enqueues=cancel_pending_enqueues,
+          self._queue_ref,
+          cancel_pending_enqueues=cancel_pending_enqueues,
           name=name)
 
   def is_closed(self, name=None):
@@ -574,9 +585,9 @@ class QueueBase(object):
     if name is None:
       name = "%s_Is_Closed" % self._name
     if self._queue_ref.dtype == _dtypes.resource:
-      return gen_data_flow_ops.queue_is_closed_v2(self._queue_ref,name=name)
+      return gen_data_flow_ops.queue_is_closed_v2(self._queue_ref, name=name)
     else:
-      return gen_data_flow_ops.queue_is_closed_(self._queue_ref,name=name)
+      return gen_data_flow_ops.queue_is_closed_(self._queue_ref, name=name)
 
   def size(self, name=None):
     """Compute the number of elements in this queue.
@@ -595,6 +606,7 @@ class QueueBase(object):
       return gen_data_flow_ops._queue_size(self._queue_ref, name=name)
 
 
+@tf_export("RandomShuffleQueue")
 class RandomShuffleQueue(QueueBase):
   """A queue implementation that dequeues elements in a random order.
 
@@ -607,8 +619,14 @@ class RandomShuffleQueue(QueueBase):
   @end_compatibility
   """
 
-  def __init__(self, capacity, min_after_dequeue, dtypes, shapes=None,
-               names=None, seed=None, shared_name=None,
+  def __init__(self,
+               capacity,
+               min_after_dequeue,
+               dtypes,
+               shapes=None,
+               names=None,
+               seed=None,
+               shared_name=None,
                name="random_shuffle_queue"):
     """Create a queue that dequeues elements in a random order.
 
@@ -666,13 +684,19 @@ class RandomShuffleQueue(QueueBase):
       string = (str(seed1) + shared_name).encode("utf-8")
       seed2 = int(hashlib.md5(string).hexdigest()[:8], 16) & 0x7FFFFFFF
     queue_ref = gen_data_flow_ops._random_shuffle_queue_v2(
-        component_types=dtypes, shapes=shapes, capacity=capacity,
-        min_after_dequeue=min_after_dequeue, seed=seed1, seed2=seed2,
-        shared_name=shared_name, name=name)
+        component_types=dtypes,
+        shapes=shapes,
+        capacity=capacity,
+        min_after_dequeue=min_after_dequeue,
+        seed=seed1,
+        seed2=seed2,
+        shared_name=shared_name,
+        name=name)
 
     super(RandomShuffleQueue, self).__init__(dtypes, shapes, names, queue_ref)
 
 
+@tf_export("FIFOQueue")
 class FIFOQueue(QueueBase):
   """A queue implementation that dequeues elements in first-in first-out order.
 
@@ -685,8 +709,13 @@ class FIFOQueue(QueueBase):
   @end_compatibility
   """
 
-  def __init__(self, capacity, dtypes, shapes=None, names=None,
-               shared_name=None, name="fifo_queue"):
+  def __init__(self,
+               capacity,
+               dtypes,
+               shapes=None,
+               names=None,
+               shared_name=None,
+               name="fifo_queue"):
     """Creates a queue that dequeues elements in a first-in first-out order.
 
     A `FIFOQueue` has bounded capacity; supports multiple concurrent
@@ -720,12 +749,16 @@ class FIFOQueue(QueueBase):
     shapes = _as_shape_list(shapes, dtypes)
     names = _as_name_list(names, dtypes)
     queue_ref = gen_data_flow_ops._fifo_queue_v2(
-        component_types=dtypes, shapes=shapes, capacity=capacity,
-        shared_name=shared_name, name=name)
+        component_types=dtypes,
+        shapes=shapes,
+        capacity=capacity,
+        shared_name=shared_name,
+        name=name)
 
     super(FIFOQueue, self).__init__(dtypes, shapes, names, queue_ref)
 
 
+@tf_export("PaddingFIFOQueue")
 class PaddingFIFOQueue(QueueBase):
   """A FIFOQueue that supports batching variable-sized tensors by padding.
 
@@ -741,7 +774,12 @@ class PaddingFIFOQueue(QueueBase):
   @end_compatibility
   """
 
-  def __init__(self, capacity, dtypes, shapes, names=None, shared_name=None,
+  def __init__(self,
+               capacity,
+               dtypes,
+               shapes,
+               names=None,
+               shared_name=None,
                name="padding_fifo_queue"):
     """Creates a queue that dequeues elements in a first-in first-out order.
 
@@ -786,16 +824,20 @@ class PaddingFIFOQueue(QueueBase):
     names = _as_name_list(names, dtypes)
     if len(dtypes) != len(shapes):
       raise ValueError("Shapes must be provided for all components, "
-                       "but received %d dtypes and %d shapes."
-                       % (len(dtypes), len(shapes)))
+                       "but received %d dtypes and %d shapes." % (len(dtypes),
+                                                                  len(shapes)))
 
     queue_ref = gen_data_flow_ops._padding_fifo_queue_v2(
-        component_types=dtypes, shapes=shapes, capacity=capacity,
-        shared_name=shared_name, name=name)
+        component_types=dtypes,
+        shapes=shapes,
+        capacity=capacity,
+        shared_name=shared_name,
+        name=name)
 
     super(PaddingFIFOQueue, self).__init__(dtypes, shapes, names, queue_ref)
 
 
+@tf_export("PriorityQueue")
 class PriorityQueue(QueueBase):
   """A queue implementation that dequeues elements in prioritized order.
 
@@ -808,7 +850,12 @@ class PriorityQueue(QueueBase):
   @end_compatibility
   """
 
-  def __init__(self, capacity, types, shapes=None, names=None, shared_name=None,
+  def __init__(self,
+               capacity,
+               types,
+               shapes=None,
+               names=None,
+               shared_name=None,
                name="priority_queue"):
     """Creates a queue that dequeues elements in a first-in first-out order.
 
@@ -849,14 +896,17 @@ class PriorityQueue(QueueBase):
     shapes = _as_shape_list(shapes, types)
 
     queue_ref = gen_data_flow_ops._priority_queue_v2(
-        component_types=types, shapes=shapes, capacity=capacity,
-        shared_name=shared_name, name=name)
+        component_types=types,
+        shapes=shapes,
+        capacity=capacity,
+        shared_name=shared_name,
+        name=name)
 
     priority_dtypes = [_dtypes.int64] + types
     priority_shapes = [()] + shapes if shapes else shapes
 
-    super(PriorityQueue, self).__init__(
-        priority_dtypes, priority_shapes, names, queue_ref)
+    super(PriorityQueue, self).__init__(priority_dtypes, priority_shapes, names,
+                                        queue_ref)
 
 
 # TODO(josh11b): class BatchQueue(QueueBase):
@@ -936,8 +986,10 @@ class Barrier(object):
       self._shapes = [tensor_shape.unknown_shape() for _ in self._types]
 
     self._barrier_ref = gen_data_flow_ops._barrier(
-        component_types=self._types, shapes=self._shapes,
-        shared_name=shared_name, name=name)
+        component_types=self._types,
+        shapes=self._shapes,
+        shared_name=shared_name,
+        name=name)
     if context.in_graph_mode():
       self._name = self._barrier_ref.op.name.split("/")[-1]
     else:
@@ -1021,12 +1073,13 @@ class Barrier(object):
     """
     if name is None:
       name = "%s_BarrierTakeMany" % self._name
-    ret = gen_data_flow_ops._barrier_take_many(self._barrier_ref,
-                                               num_elements,
-                                               self._types,
-                                               allow_small_batch,
-                                               timeout,
-                                               name=name)
+    ret = gen_data_flow_ops._barrier_take_many(
+        self._barrier_ref,
+        num_elements,
+        self._types,
+        allow_small_batch,
+        timeout,
+        name=name)
 
     # NOTE(mrry): Not using a shape function because we need access to
     # the Barrier object.
@@ -1041,8 +1094,7 @@ class Barrier(object):
       op.outputs[1].set_shape(tensor_shape.vector(batch_dim))  # keys
       for output, shape in zip(op.outputs[2:], self._shapes):  # value_list
         output.set_shape(
-            tensor_shape.TensorShape([batch_dim]).concatenate(
-                shape))
+            tensor_shape.TensorShape([batch_dim]).concatenate(shape))
 
     return ret
 
@@ -1105,6 +1157,7 @@ class Barrier(object):
         self._barrier_ref, name=name)
 
 
+@tf_export("ConditionalAccumulatorBase")
 class ConditionalAccumulatorBase(object):
   """A conditional accumulator for aggregating gradients.
 
@@ -1183,6 +1236,7 @@ class ConditionalAccumulatorBase(object):
         name=name)
 
 
+@tf_export("ConditionalAccumulator")
 class ConditionalAccumulator(ConditionalAccumulatorBase):
   """A conditional accumulator for aggregating gradients.
 
@@ -1262,6 +1316,7 @@ class ConditionalAccumulator(ConditionalAccumulatorBase):
     return out
 
 
+@tf_export("SparseConditionalAccumulator")
 class SparseConditionalAccumulator(ConditionalAccumulatorBase):
   """A conditional accumulator for aggregating sparse gradients.
 
@@ -1288,8 +1343,8 @@ class SparseConditionalAccumulator(ConditionalAccumulatorBase):
                name="sparse_conditional_accumulator"):
     accumulator_ref = gen_data_flow_ops.sparse_conditional_accumulator(
         dtype=dtype, shape=shape, shared_name=shared_name, name=name)
-    super(SparseConditionalAccumulator,
-          self).__init__(dtype, shape, accumulator_ref)
+    super(SparseConditionalAccumulator, self).__init__(dtype, shape,
+                                                       accumulator_ref)
 
   def apply_indexed_slices_grad(self, grad, local_step=0, name=None):
     """Attempts to apply a gradient to the accumulator.
@@ -1358,8 +1413,8 @@ class SparseConditionalAccumulator(ConditionalAccumulatorBase):
         local_step=local_step,
         gradient_indices=math_ops.to_int64(grad_indices),
         gradient_values=grad_values,
-        gradient_shape=math_ops.to_int64([] if grad_shape is None else
-                                         grad_shape),
+        gradient_shape=math_ops.to_int64([]
+                                         if grad_shape is None else grad_shape),
         has_known_shape=(grad_shape is not None),
         name=name)
 
@@ -1421,11 +1476,16 @@ class BaseStagingArea(object):
   _identifier = 0
   _lock = threading.Lock()
 
-  def __init__(self, dtypes, shapes=None, names=None, shared_name=None,
-                  capacity=0, memory_limit=0):
+  def __init__(self,
+               dtypes,
+               shapes=None,
+               names=None,
+               shared_name=None,
+               capacity=0,
+               memory_limit=0):
     if shared_name is None:
-      self._name = (ops.get_default_graph()
-                       .unique_name(self.__class__.__name__))
+      self._name = (
+          ops.get_default_graph().unique_name(self.__class__.__name__))
     elif isinstance(shared_name, six.string_types):
       self._name = shared_name
     else:
@@ -1522,8 +1582,9 @@ class BaseStagingArea(object):
                          (sorted(vals.keys()), sorted(self._names)))
       # The order of values in `self._names` indicates the order in which the
       # tensors in the dictionary `vals` must be listed.
-      vals, indices, n = zip(*[(vals[k], i, k) for i, k in enumerate(self._names)
-                                                  if k in vals])
+      vals, indices, n = zip(*[(vals[k], i, k)
+                               for i, k in enumerate(self._names)
+                               if k in vals])
     else:
       if self._names:
         raise ValueError("You must enqueue a dictionary in a staging area "
@@ -1531,7 +1592,7 @@ class BaseStagingArea(object):
 
       if indices is None:
         raise ValueError("Indices must be supplied when inserting a list "
-                        "of tensors")
+                         "of tensors")
 
       if len(indices) != len(vals):
         raise ValueError("Number of indices '%s' doesn't match "
@@ -1543,8 +1604,8 @@ class BaseStagingArea(object):
 
     # Sanity check number of values
     if not len(vals) <= len(self._dtypes):
-      raise ValueError("Unexpected number of inputs '%s' vs '%s'" % (
-                          len(vals), len(self._dtypes)))
+      raise ValueError("Unexpected number of inputs '%s' vs '%s'" %
+                       (len(vals), len(self._dtypes)))
 
     tensors = []
 
@@ -1552,14 +1613,14 @@ class BaseStagingArea(object):
       dtype, shape = self._dtypes[i], self._shapes[i]
       # Check dtype
       if not val.dtype == dtype:
-        raise ValueError("Datatypes do not match. '%s' != '%s'" %(
-                        str(val.dtype), str(dtype)))
+        raise ValueError("Datatypes do not match. '%s' != '%s'" %
+                         (str(val.dtype), str(dtype)))
 
       # Check shape
       val.get_shape().assert_is_compatible_with(shape)
 
-      tensors.append(ops.convert_to_tensor(val, dtype=dtype,
-                                          name="component_%d" % i))
+      tensors.append(
+          ops.convert_to_tensor(val, dtype=dtype, name="component_%d" % i))
 
     return tensors, indices
 
@@ -1622,6 +1683,7 @@ class BaseStagingArea(object):
     else:
       return [vals]
 
+
 class StagingArea(BaseStagingArea):
   """Class for staging inputs. No ordering guarantees.
 
@@ -1656,8 +1718,13 @@ class StagingArea(BaseStagingArea):
 
   """
 
-  def __init__(self, dtypes, shapes=None, names=None, shared_name=None,
-                  capacity=0, memory_limit=0):
+  def __init__(self,
+               dtypes,
+               shapes=None,
+               names=None,
+               shared_name=None,
+               capacity=0,
+               memory_limit=0):
     """Constructs a staging area object.
 
     The two optional lists, `shapes` and `names`, must be of the same length
@@ -1692,9 +1759,8 @@ class StagingArea(BaseStagingArea):
       ValueError: If one of the arguments is invalid.
     """
 
-    super(StagingArea, self).__init__(dtypes, shapes,
-                                          names, shared_name,
-                                          capacity, memory_limit)
+    super(StagingArea, self).__init__(dtypes, shapes, names, shared_name,
+                                      capacity, memory_limit)
 
   def put(self, values, name=None):
     """Create an op that places a value into the staging area.
@@ -1716,14 +1782,18 @@ class StagingArea(BaseStagingArea):
                         self._scope_vals(values)) as scope:
 
       # Hard-code indices for this staging area
-      indices = (list(six.moves.range(len(values)))
-                  if isinstance(values, (list, tuple)) else None)
+      indices = (
+          list(six.moves.range(len(values)))
+          if isinstance(values, (list, tuple)) else None)
       vals, _ = self._check_put_dtypes(values, indices)
 
       with ops.colocate_with(self._coloc_op):
-        op = gen_data_flow_ops.stage(values=vals, shared_name=self._name,
-                                     name=scope, capacity=self._capacity,
-                                     memory_limit=self._memory_limit)
+        op = gen_data_flow_ops.stage(
+            values=vals,
+            shared_name=self._name,
+            name=scope,
+            capacity=self._capacity,
+            memory_limit=self._memory_limit)
 
       return op
 
@@ -1731,7 +1801,7 @@ class StagingArea(BaseStagingArea):
     with ops.colocate_with(self._coloc_op):
       ret = get_fn()
 
-    indices = list(six.moves.range(len(self._dtypes))) # Hard coded
+    indices = list(six.moves.range(len(self._dtypes)))  # Hard coded
     return self._get_return_value(ret, indices)
 
   def get(self, name=None):
@@ -1759,10 +1829,12 @@ class StagingArea(BaseStagingArea):
     if name is None:
       name = "%s_get" % self._name
 
+    # pylint: disable=bad-continuation
     fn = lambda: gen_data_flow_ops.unstage(dtypes=self._dtypes,
                     shared_name=self._name, name=name,
                     capacity=self._capacity,
                     memory_limit=self._memory_limit)
+    # pylint: enable=bad-continuation
 
     return self.__internal_get(fn, name)
 
@@ -1787,10 +1859,12 @@ class StagingArea(BaseStagingArea):
     if name is None:
       name = "%s_peek" % self._name
 
+    # pylint: disable=bad-continuation
     fn = lambda: gen_data_flow_ops.stage_peek(index,
                     dtypes=self._dtypes, shared_name=self._name,
                     name=name, capacity=self._capacity,
                     memory_limit=self._memory_limit)
+    # pylint: enable=bad-continuation
 
     return self.__internal_get(fn, name)
 
@@ -1806,9 +1880,12 @@ class StagingArea(BaseStagingArea):
     if name is None:
       name = "%s_size" % self._name
 
-    return gen_data_flow_ops.stage_size(name=name, shared_name=self._name,
-                        dtypes=self._dtypes, capacity=self._capacity,
-                        memory_limit=self._memory_limit)
+    return gen_data_flow_ops.stage_size(
+        name=name,
+        shared_name=self._name,
+        dtypes=self._dtypes,
+        capacity=self._capacity,
+        memory_limit=self._memory_limit)
 
   def clear(self, name=None):
     """Clears the staging area.
@@ -1822,14 +1899,16 @@ class StagingArea(BaseStagingArea):
     if name is None:
       name = "%s_clear" % self._name
 
-    return gen_data_flow_ops.stage_clear(name=name, shared_name=self._name,
-                        dtypes=self._dtypes, capacity=self._capacity,
-                        memory_limit=self._memory_limit)
+    return gen_data_flow_ops.stage_clear(
+        name=name,
+        shared_name=self._name,
+        dtypes=self._dtypes,
+        capacity=self._capacity,
+        memory_limit=self._memory_limit)
+
 
 class MapStagingArea(BaseStagingArea):
-  """
-  A `MapStagingArea` is a TensorFlow data structure that stores tensors across
-  multiple steps, and exposes operations that can put and get tensors.
+  """A `MapStagingArea` is a TensorFlow data structure that stores tensors across multiple steps, and exposes operations that can put and get tensors.
 
   Each `MapStagingArea` element is a (key, value) pair.
   Only int64 keys are supported, other types should be
@@ -1842,7 +1921,8 @@ class MapStagingArea(BaseStagingArea):
   It supports multiple concurrent producers and consumers; and
   provides exactly-once delivery.
 
-  Each value tuple of a `MapStagingArea` is a fixed-length tuple of tensors whose
+  Each value tuple of a `MapStagingArea` is a fixed-length tuple of tensors
+  whose
   dtypes are described by `dtypes`, and whose shapes are optionally described
   by the `shapes` argument.
 
@@ -1886,10 +1966,16 @@ class MapStagingArea(BaseStagingArea):
   associated with it are removed.
   """
 
-  def __init__(self, dtypes, shapes=None, names=None, shared_name=None,
-                      ordered=False, capacity=0, memory_limit=0):
-    """
-    Args:
+  def __init__(self,
+               dtypes,
+               shapes=None,
+               names=None,
+               shared_name=None,
+               ordered=False,
+               capacity=0,
+               memory_limit=0):
+    """Args:
+
       dtypes:  A list of types.  The length of dtypes must equal the number
         of tensors in each element.
       capacity: (Optional.) Maximum number of elements.
@@ -1915,9 +2001,8 @@ class MapStagingArea(BaseStagingArea):
 
     """
 
-    super(MapStagingArea, self).__init__(dtypes, shapes,
-                                      names, shared_name,
-                                      capacity, memory_limit)
+    super(MapStagingArea, self).__init__(dtypes, shapes, names, shared_name,
+                                         capacity, memory_limit)
 
     # Defer to different methods depending if the map is ordered
     self._ordered = ordered
@@ -1940,8 +2025,7 @@ class MapStagingArea(BaseStagingArea):
       self._clear_fn = gen_data_flow_ops.map_clear
 
   def put(self, key, vals, indices=None, name=None):
-    """
-    Create an op that stores the (key, vals) pair in the staging area.
+    """Create an op that stores the (key, vals) pair in the staging area.
 
     Incomplete puts are possible, preferably using a dictionary for vals
     as the appropriate dtypes and shapes can be inferred from the value names
@@ -1963,7 +2047,8 @@ class MapStagingArea(BaseStagingArea):
         The created op
 
     Raises:
-        ValueError: If the number or type of inputs don't match the staging area.
+        ValueError: If the number or type of inputs don't match the staging
+        area.
     """
 
     with ops.name_scope(name, "%s_put" % self._name,
@@ -1972,10 +2057,15 @@ class MapStagingArea(BaseStagingArea):
       vals, indices = self._check_put_dtypes(vals, indices)
 
       with ops.colocate_with(self._coloc_op):
-        op = self._put_fn(key, indices, vals, dtypes=self._dtypes,
-                             shared_name=self._name, name=scope,
-                             capacity=self._capacity,
-                             memory_limit=self._memory_limit)
+        op = self._put_fn(
+            key,
+            indices,
+            vals,
+            dtypes=self._dtypes,
+            shared_name=self._name,
+            name=scope,
+            capacity=self._capacity,
+            memory_limit=self._memory_limit)
     return op
 
   def _get_indices_and_dtypes(self, indices=None):
@@ -1991,13 +2081,13 @@ class MapStagingArea(BaseStagingArea):
     if all(isinstance(i, str) for i in indices):
       if self._names is None:
         raise ValueError("String indices provided '%s', but this Staging Area "
-                        "was not created with names." % indices)
+                         "was not created with names." % indices)
 
       try:
         indices = [self._names.index(n) for n in indices]
       except ValueError:
         raise ValueError("Named index '%s' not in "
-                        "Staging Area names '%s'" % (n, self._names))
+                         "Staging Area names '%s'" % (n, self._names))
     elif all(isinstance(i, int) for i in indices):
       pass
     else:
@@ -2008,10 +2098,8 @@ class MapStagingArea(BaseStagingArea):
 
     return indices, dtypes
 
-
   def peek(self, key, indices=None, name=None):
-    """
-    Peeks at staging area data associated with the key.
+    """Peeks at staging area data associated with the key.
 
     If the key is not in the staging area, it will block
     until the associated (key, value) is inserted.
@@ -2034,22 +2122,22 @@ class MapStagingArea(BaseStagingArea):
     indices, dtypes = self._get_indices_and_dtypes(indices)
 
     with ops.colocate_with(self._coloc_op):
-      result = self._peek_fn(key, shared_name=self._name,
-                      indices=indices,
-                      dtypes=dtypes,
-                      name=name,
-                      capacity=self._capacity,
-                      memory_limit=self._memory_limit)
+      result = self._peek_fn(
+          key,
+          shared_name=self._name,
+          indices=indices,
+          dtypes=dtypes,
+          name=name,
+          capacity=self._capacity,
+          memory_limit=self._memory_limit)
 
     return self._get_return_value(result, indices)
 
   def get(self, key=None, indices=None, name=None):
-    """
-    If the key is provided, the associated (key, value)
-    is returned from the staging area. If the key is not
-    in the staging area, this method will block until
-    the associated (key, value) is inserted.
+    """If the key is provided, the associated (key, value) is returned from the staging area.
 
+    If the key is not in the staging area, this method will block until
+    the associated (key, value) is inserted.
     If no key is provided and the staging area is ordered,
     the (key, value) with the smallest key will be returned.
     Otherwise, a random (key, value) will be returned.
@@ -2074,12 +2162,10 @@ class MapStagingArea(BaseStagingArea):
       return self._pop(key, indices=indices, name=name)
 
   def _pop(self, key, indices=None, name=None):
-    """
-    Remove and return the associated (key, value)
-    is returned from the staging area. If the key is not
-    in the staging area, this method will block until
-    the associated (key, value) is inserted.
+    """Remove and return the associated (key, value) is returned from the staging area.
 
+    If the key is not in the staging area, this method will block until
+    the associated (key, value) is inserted.
     Args:
         key: Key associated with the required data
         indices: Partial list of tensors to retrieve (optional).
@@ -2097,21 +2183,21 @@ class MapStagingArea(BaseStagingArea):
     indices, dtypes = self._get_indices_and_dtypes(indices)
 
     with ops.colocate_with(self._coloc_op):
-      result = self._pop_fn(key, shared_name=self._name,
-                      indices=indices,
-                      dtypes=dtypes,
-                      name=name,
-                      capacity=self._capacity,
-                      memory_limit=self._memory_limit)
+      result = self._pop_fn(
+          key,
+          shared_name=self._name,
+          indices=indices,
+          dtypes=dtypes,
+          name=name,
+          capacity=self._capacity,
+          memory_limit=self._memory_limit)
 
     return key, self._get_return_value(result, indices)
 
   def _popitem(self, indices=None, name=None):
-    """
-    If the staging area is ordered,
-    the (key, value) with the smallest key will be returned.
-    Otherwise, a random (key, value) will be returned.
+    """If the staging area is ordered, the (key, value) with the smallest key will be returned.
 
+    Otherwise, a random (key, value) will be returned.
     If the staging area is empty when this operation executes,
     it will block until there is an element to dequeue.
 
@@ -2132,12 +2218,13 @@ class MapStagingArea(BaseStagingArea):
     indices, dtypes = self._get_indices_and_dtypes(indices)
 
     with ops.colocate_with(self._coloc_op):
-      key, result = self._popitem_fn(shared_name=self._name,
-                              indices=indices,
-                              dtypes=dtypes,
-                              name=name,
-                              capacity=self._capacity,
-                              memory_limit=self._memory_limit)
+      key, result = self._popitem_fn(
+          shared_name=self._name,
+          indices=indices,
+          dtypes=dtypes,
+          name=name,
+          capacity=self._capacity,
+          memory_limit=self._memory_limit)
 
     # Separate keys and results out from
     # underlying namedtuple
@@ -2147,8 +2234,7 @@ class MapStagingArea(BaseStagingArea):
     return key, result
 
   def size(self, name=None):
-    """
-    Returns the number of elements in the staging area.
+    """Returns the number of elements in the staging area.
 
     Args:
         name: A name for the operation (optional)
@@ -2159,14 +2245,15 @@ class MapStagingArea(BaseStagingArea):
     if name is None:
       name = "%s_size" % self._name
 
-    return self._size_fn(shared_name=self._name,
-                        name=name, dtypes=self._dtypes,
-                        capacity=self._capacity,
-                        memory_limit=self._memory_limit)
+    return self._size_fn(
+        shared_name=self._name,
+        name=name,
+        dtypes=self._dtypes,
+        capacity=self._capacity,
+        memory_limit=self._memory_limit)
 
   def incomplete_size(self, name=None):
-    """
-    Returns the number of incomplete elements in the staging area.
+    """Returns the number of incomplete elements in the staging area.
 
     Args:
         name: A name for the operation (optional)
@@ -2177,16 +2264,15 @@ class MapStagingArea(BaseStagingArea):
     if name is None:
       name = "%s_incomplete_size" % self._name
 
-    return self._incomplete_size_fn(shared_name=self._name,
-                        name=name, dtypes=self._dtypes,
-                        capacity=self._capacity,
-                        memory_limit=self._memory_limit)
-
-
+    return self._incomplete_size_fn(
+        shared_name=self._name,
+        name=name,
+        dtypes=self._dtypes,
+        capacity=self._capacity,
+        memory_limit=self._memory_limit)
 
   def clear(self, name=None):
-    """
-    Clears the staging area.
+    """Clears the staging area.
 
     Args:
         name: A name for the operation (optional)
@@ -2197,10 +2283,12 @@ class MapStagingArea(BaseStagingArea):
     if name is None:
       name = "%s_clear" % self._name
 
-    return self._clear_fn(shared_name=self._name,
-                        name=name, dtypes=self._dtypes,
-                        capacity=self._capacity,
-                        memory_limit=self._memory_limit)
+    return self._clear_fn(
+        shared_name=self._name,
+        name=name,
+        dtypes=self._dtypes,
+        capacity=self._capacity,
+        memory_limit=self._memory_limit)
 
 
 class RecordInput(object):
@@ -2225,7 +2313,8 @@ class RecordInput(object):
                shift_ratio=0,
                seed=0,
                name=None,
-               batches=None):
+               batches=None,
+               compression_type=None):
     """Constructs a RecordInput Op.
 
     Args:
@@ -2243,6 +2332,8 @@ class RecordInput(object):
         how many batches to create, which are returned as a list when
         `get_yield_op()` is called. An example use case is to split processing
         between devices on one computer.
+      compression_type: The type of compression for the file. Currently ZLIB and
+        GZIP are supported. Defaults to none.
 
     Raises:
       ValueError: If one of the arguments is invalid.
@@ -2257,12 +2348,17 @@ class RecordInput(object):
     self._shift_ratio = shift_ratio
     self._seed = seed
     self._name = name
+    self._compression_type = python_io.TFRecordCompressionType.NONE
+    if compression_type is not None:
+      self._compression_type = compression_type
 
   def get_yield_op(self):
     """Adds a node that yields a group of records every time it is executed.
     If RecordInput `batches` parameter is not None, it yields a list of
     record batches with the specified `batch_size`.
     """
+    compression_type = python_io.TFRecordOptions.get_compression_type_string(
+        python_io.TFRecordOptions(self._compression_type))
     records = gen_data_flow_ops.record_input(
         file_pattern=self._file_pattern,
         file_buffer_size=self._buffer_size,
@@ -2270,6 +2366,7 @@ class RecordInput(object):
         file_shuffle_shift_ratio=self._shift_ratio,
         batch_size=self._batch_size,
         file_random_seed=self._seed,
+        compression_type=compression_type,
         name=self._name)
     if self._batches is None:
       return records
diff --git a/tensorflow/python/ops/distributions/bernoulli.py b/tensorflow/python/ops/distributions/bernoulli.py
index b6b20d1b4a893a4c109560be717339d75fc7ccfc..553e5db8d81f7b687b826368f2663f874441bdb9 100644
--- a/tensorflow/python/ops/distributions/bernoulli.py
+++ b/tensorflow/python/ops/distributions/bernoulli.py
@@ -29,8 +29,10 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("distributions.Bernoulli")
 class Bernoulli(distribution.Distribution):
   """Bernoulli distribution.
 
@@ -165,26 +167,6 @@ class Bernoulli(distribution.Distribution):
     return math_ops.cast(self.probs > 0.5, self.dtype)
 
 
-class BernoulliWithSigmoidProbs(Bernoulli):
-  """Bernoulli with `probs = nn.sigmoid(logits)`."""
-
-  def __init__(self,
-               logits=None,
-               dtype=dtypes.int32,
-               validate_args=False,
-               allow_nan_stats=True,
-               name="BernoulliWithSigmoidProbs"):
-    parameters = locals()
-    with ops.name_scope(name):
-      super(BernoulliWithSigmoidProbs, self).__init__(
-          probs=nn.sigmoid(logits, name="sigmoid_probs"),
-          dtype=dtype,
-          validate_args=validate_args,
-          allow_nan_stats=allow_nan_stats,
-          name=name)
-    self._parameters = parameters
-
-
 @kullback_leibler.RegisterKL(Bernoulli, Bernoulli)
 def _kl_bernoulli_bernoulli(a, b, name=None):
   """Calculate the batched KL divergence KL(a || b) with a and b Bernoulli.
diff --git a/tensorflow/python/ops/distributions/beta.py b/tensorflow/python/ops/distributions/beta.py
index 2b93478cdf9f9e80f4c2c19ad25cb270a8e7aa98..be4ef550dddc4f393f3d81730be59fc0def47500 100644
--- a/tensorflow/python/ops/distributions/beta.py
+++ b/tensorflow/python/ops/distributions/beta.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.util.tf_export import tf_export
 
 
 __all__ = [
@@ -45,6 +46,7 @@ _beta_sample_note = """Note: `x` must have dtype `self.dtype` and be in
 `[0, 1].` It must have a shape compatible with `self.batch_shape()`."""
 
 
+@tf_export("distributions.Beta")
 class Beta(distribution.Distribution):
   """Beta distribution.
 
@@ -307,7 +309,7 @@ class Beta(distribution.Distribution):
             message="sample must be positive"),
         check_ops.assert_less(
             x, array_ops.ones([], self.dtype),
-            message="sample must be no larger than `1`."),
+            message="sample must be less than `1`."),
     ], x)
 
 
diff --git a/tensorflow/python/ops/distributions/bijector_impl.py b/tensorflow/python/ops/distributions/bijector_impl.py
index 8f6d18d91ae19ada5ff3715b523635ec8c88adc3..ed435557fde7a2e8a0a4f7eef4e240daef0565e7 100644
--- a/tensorflow/python/ops/distributions/bijector_impl.py
+++ b/tensorflow/python/ops/distributions/bijector_impl.py
@@ -32,6 +32,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 __all__ = [
@@ -111,8 +112,9 @@ class _Mapping(collections.namedtuple(
 
 
 @six.add_metaclass(abc.ABCMeta)
+@tf_export("distributions.bijectors.Bijector")
 class Bijector(object):
-  """Interface for transformations of a `Distribution` sample.
+  r"""Interface for transformations of a `Distribution` sample.
 
   Bijectors can be used to represent any differentiable and injective
   (one to one) function defined on an open subset of `R^n`.  Some non-injective
@@ -120,27 +122,24 @@ class Bijector(object):
 
   #### Mathematical Details
 
-  A `Bijector` implements a
-  [diffeomorphism](https://en.wikipedia.org/wiki/Diffeomorphism), i.e., a
-  bijective, differentiable function. A `Bijector` is used by
-  `TransformedDistribution` but can be generally used for transforming a
-  `Distribution` generated `Tensor`. A `Bijector` is characterized by three
-  operations:
-
-  1. Forward Evaluation
+  A `Bijector` implements a [smooth covering map](
+  https://en.wikipedia.org/wiki/Local_diffeomorphism), i.e., a local
+  diffeomorphism such that every point in the target has a neighborhood evenly
+  covered by a map ([see also](
+  https://en.wikipedia.org/wiki/Covering_space#Covering_of_a_manifold)).
+  A `Bijector` is used by `TransformedDistribution` but can be generally used
+  for transforming a `Distribution` generated `Tensor`. A `Bijector` is
+  characterized by three operations:
 
+  1. Forward\
      Useful for turning one random outcome into another random outcome from a
      different distribution.
-
-  2. Inverse Evaluation
-
+  2. Inverse\
      Useful for "reversing" a transformation to compute one probability in
      terms of another.
-
-  3. (log o det o Jacobian o inverse)(x)
-
+  3. `(log o det o Jacobian o inverse)(x)`\
      "The log of the determinant of the matrix of all first-order partial
-     derivatives of the inverse function."
+     derivatives of the inverse function."\
      Useful for inverting a transformation to compute one probability in terms
      of another. Geometrically, the det(Jacobian) is the volume of the
      transformation and is used to scale the probability.
diff --git a/tensorflow/python/ops/distributions/categorical.py b/tensorflow/python/ops/distributions/categorical.py
index 84ca6db4c4b4deea86fb0a0f626eda42f0283d1f..9161e3fa9f5f7f844e7f4926992c954acae246d6 100644
--- a/tensorflow/python/ops/distributions/categorical.py
+++ b/tensorflow/python/ops/distributions/categorical.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _broadcast_cat_event_and_params(event, params, base_dtype=dtypes.int32):
@@ -58,6 +59,7 @@ def _broadcast_cat_event_and_params(event, params, base_dtype=dtypes.int32):
   return event, params
 
 
+@tf_export("distributions.Categorical")
 class Categorical(distribution.Distribution):
   """Categorical distribution.
 
@@ -263,7 +265,9 @@ class Categorical(distribution.Distribution):
       logits_2d = self.logits
     else:
       logits_2d = array_ops.reshape(self.logits, [-1, self.event_size])
-    draws = random_ops.multinomial(logits_2d, n, seed=seed)
+    sample_dtype = dtypes.int64 if self.dtype.size > 4 else dtypes.int32
+    draws = random_ops.multinomial(
+        logits_2d, n, seed=seed, output_dtype=sample_dtype)
     draws = array_ops.reshape(
         array_ops.transpose(draws),
         array_ops.concat([[n], self.batch_shape_tensor()], 0))
diff --git a/tensorflow/python/ops/distributions/dirichlet.py b/tensorflow/python/ops/distributions/dirichlet.py
index 2accedf1b963f01034f0b4059f44e46eb9bfc5ab..25afeec936069b9cbf926cdc3bbb79226a79aa30 100644
--- a/tensorflow/python/ops/distributions/dirichlet.py
+++ b/tensorflow/python/ops/distributions/dirichlet.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import special_math_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.util.tf_export import tf_export
 
 
 __all__ = [
@@ -42,6 +43,7 @@ dtype `self.dtype` and be in the `(self.event_shape() - 1)`-simplex, i.e.,
 `self.batch_shape() + self.event_shape()`."""
 
 
+@tf_export("distributions.Dirichlet")
 class Dirichlet(distribution.Distribution):
   """Dirichlet distribution.
 
diff --git a/tensorflow/python/ops/distributions/dirichlet_multinomial.py b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
index d792e9fe52dee4325d0956dbb74c8b408d5a1e8c..03a98c56ba509ea1f70f12a74ba67b903013cf70 100644
--- a/tensorflow/python/ops/distributions/dirichlet_multinomial.py
+++ b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
@@ -28,6 +28,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import special_math_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.util.tf_export import tf_export
 
 
 __all__ = [
@@ -49,6 +50,7 @@ fractional components, and such that
 with `self.concentration` and `self.total_count`."""
 
 
+@tf_export("distributions.DirichletMultinomial")
 class DirichletMultinomial(distribution.Distribution):
   """Dirichlet-Multinomial compound distribution.
 
@@ -122,21 +124,22 @@ class DirichletMultinomial(distribution.Distribution):
   #### Examples
 
   ```python
-  alpha = [1, 2, 3]
-  n = 2
+  alpha = [1., 2., 3.]
+  n = 2.
   dist = DirichletMultinomial(n, alpha)
   ```
 
-  Creates a 3-class distribution, with the 3rd class is most likely to be drawn.
+  Creates a 3-class distribution, with the 3rd class is most likely to be
+  drawn.
   The distribution functions can be evaluated on counts.
 
   ```python
   # counts same shape as alpha.
-  counts = [0, 0, 2]
+  counts = [0., 0., 2.]
   dist.prob(counts)  # Shape []
 
-  # alpha will be broadcast to [[1, 2, 3], [1, 2, 3]] to match counts.
-  counts = [[1, 1, 0], [1, 0, 1]]
+  # alpha will be broadcast to [[1., 2., 3.], [1., 2., 3.]] to match counts.
+  counts = [[1., 1., 0.], [1., 0., 1.]]
   dist.prob(counts)  # Shape [2]
 
   # alpha will be broadcast to shape [5, 7, 3] to match counts.
@@ -147,12 +150,12 @@ class DirichletMultinomial(distribution.Distribution):
   Creates a 2-batch of 3-class distributions.
 
   ```python
-  alpha = [[1, 2, 3], [4, 5, 6]]  # Shape [2, 3]
-  n = [3, 3]
+  alpha = [[1., 2., 3.], [4., 5., 6.]]  # Shape [2, 3]
+  n = [3., 3.]
   dist = DirichletMultinomial(n, alpha)
 
-  # counts will be broadcast to [[2, 1, 0], [2, 1, 0]] to match alpha.
-  counts = [2, 1, 0]
+  # counts will be broadcast to [[2., 1., 0.], [2., 1., 0.]] to match alpha.
+  counts = [2., 1., 0.]
   dist.prob(counts)  # Shape [2]
   ```
 
diff --git a/tensorflow/python/ops/distributions/distribution.py b/tensorflow/python/ops/distributions/distribution.py
index 22687a093ae72edff1d53131cab49fa004aa3be0..4071e50e815b01d30f3e24ba4677cc37b325f24d 100644
--- a/tensorflow/python/ops/distributions/distribution.py
+++ b/tensorflow/python/ops/distributions/distribution.py
@@ -31,8 +31,10 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.ops.distributions import util
 from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.tf_export import tf_export
 
 
 __all__ = [
@@ -43,10 +45,26 @@ __all__ = [
 ]
 
 _DISTRIBUTION_PUBLIC_METHOD_WRAPPERS = [
-    "batch_shape_tensor", "batch_shape", "event_shape_tensor", "event_shape",
-    "sample", "log_prob", "prob", "log_cdf", "cdf", "log_survival_function",
-    "survival_function", "entropy", "mean", "variance", "stddev", "mode",
-    "covariance"]
+    "batch_shape",
+    "batch_shape_tensor",
+    "cdf",
+    "covariance",
+    "cross_entropy",
+    "entropy",
+    "event_shape",
+    "event_shape_tensor",
+    "kl_divergence",
+    "log_cdf",
+    "log_prob",
+    "log_survival_function",
+    "mean",
+    "mode",
+    "prob",
+    "sample",
+    "stddev",
+    "survival_function",
+    "variance",
+]
 
 
 @six.add_metaclass(abc.ABCMeta)
@@ -180,6 +198,7 @@ class _DistributionMeta(abc.ABCMeta):
     return abc.ABCMeta.__new__(mcs, classname, baseclasses, attrs)
 
 
+@tf_export("distributions.ReparameterizationType")
 class ReparameterizationType(object):
   """Instances of this class represent how sampling is reparameterized.
 
@@ -222,15 +241,20 @@ class ReparameterizationType(object):
 # reparameterized distribution support straight-through gradients with
 # respect to all parameters.
 FULLY_REPARAMETERIZED = ReparameterizationType("FULLY_REPARAMETERIZED")
+tf_export("distributions.FULLY_REPARAMETERIZED").export_constant(
+    __name__, "FULLY_REPARAMETERIZED")
 
 
 # Not reparameterized distribution: samples from a non-
 # reparameterized distribution do not support straight-through gradients for
 # at least some of the parameters.
 NOT_REPARAMETERIZED = ReparameterizationType("NOT_REPARAMETERIZED")
+tf_export("distributions.NOT_REPARAMETERIZED").export_constant(
+    __name__, "NOT_REPARAMETERIZED")
 
 
 @six.add_metaclass(_DistributionMeta)
+@tf_export("distributions.Distribution")
 class Distribution(_BaseDistribution):
   """A generic probability distribution base class.
 
@@ -608,7 +632,7 @@ class Distribution(_BaseDistribution):
     """Indicates that `event_shape == []`.
 
     Args:
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       is_scalar_event: `bool` scalar `Tensor`.
@@ -622,7 +646,7 @@ class Distribution(_BaseDistribution):
     """Indicates that `batch_shape == []`.
 
     Args:
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       is_scalar_batch: `bool` scalar `Tensor`.
@@ -683,7 +707,7 @@ class Distribution(_BaseDistribution):
 
     Args:
       value: `float` or `double` `Tensor`.
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       log_prob: a `Tensor` of shape `sample_shape(x) + self.batch_shape` with
@@ -710,7 +734,7 @@ class Distribution(_BaseDistribution):
 
     Args:
       value: `float` or `double` `Tensor`.
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       prob: a `Tensor` of shape `sample_shape(x) + self.batch_shape` with
@@ -747,7 +771,7 @@ class Distribution(_BaseDistribution):
 
     Args:
       value: `float` or `double` `Tensor`.
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       logcdf: a `Tensor` of shape `sample_shape(x) + self.batch_shape` with
@@ -780,7 +804,7 @@ class Distribution(_BaseDistribution):
 
     Args:
       value: `float` or `double` `Tensor`.
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       cdf: a `Tensor` of shape `sample_shape(x) + self.batch_shape` with
@@ -818,7 +842,7 @@ class Distribution(_BaseDistribution):
 
     Args:
       value: `float` or `double` `Tensor`.
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       `Tensor` of shape `sample_shape(x) + self.batch_shape` with values of type
@@ -853,7 +877,7 @@ class Distribution(_BaseDistribution):
 
     Args:
       value: `float` or `double` `Tensor`.
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       `Tensor` of shape `sample_shape(x) + self.batch_shape` with values of type
@@ -899,7 +923,7 @@ class Distribution(_BaseDistribution):
 
     Args:
       value: `float` or `double` `Tensor`.
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       quantile: a `Tensor` of shape `sample_shape(x) + self.batch_shape` with
@@ -923,7 +947,7 @@ class Distribution(_BaseDistribution):
     denotes expectation, and `Var.shape = batch_shape + event_shape`.
 
     Args:
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       variance: Floating-point `Tensor` with shape identical to
@@ -954,7 +978,7 @@ class Distribution(_BaseDistribution):
     denotes expectation, and `stddev.shape = batch_shape + event_shape`.
 
     Args:
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       stddev: Floating-point `Tensor` with shape identical to
@@ -1002,7 +1026,7 @@ class Distribution(_BaseDistribution):
     length-`k'` vector.
 
     Args:
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       covariance: Floating-point `Tensor` with shape `[B1, ..., Bn, k', k']`
@@ -1020,6 +1044,67 @@ class Distribution(_BaseDistribution):
     with self._name_scope(name):
       return self._mode()
 
+  def _cross_entropy(self, other):
+    return kullback_leibler.cross_entropy(
+        self, other, allow_nan_stats=self.allow_nan_stats)
+
+  def cross_entropy(self, other, name="cross_entropy"):
+    """Computes the (Shannon) cross entropy.
+
+    Denote this distribution (`self`) by `P` and the `other` distribution by
+    `Q`. Assuming `P, Q` are absolutely continuous with respect to
+    one another and permit densities `p(x) dr(x)` and `q(x) dr(x)`, (Shanon)
+    cross entropy is defined as:
+
+    ```none
+    H[P, Q] = E_p[-log q(X)] = -int_F p(x) log q(x) dr(x)
+    ```
+
+    where `F` denotes the support of the random variable `X ~ P`.
+
+    Args:
+      other: `tf.distributions.Distribution` instance.
+      name: Python `str` prepended to names of ops created by this function.
+
+    Returns:
+      cross_entropy: `self.dtype` `Tensor` with shape `[B1, ..., Bn]`
+        representing `n` different calculations of (Shanon) cross entropy.
+    """
+    with self._name_scope(name):
+      return self._cross_entropy(other)
+
+  def _kl_divergence(self, other):
+    return kullback_leibler.kl_divergence(
+        self, other, allow_nan_stats=self.allow_nan_stats)
+
+  def kl_divergence(self, other, name="kl_divergence"):
+    """Computes the Kullback--Leibler divergence.
+
+    Denote this distribution (`self`) by `p` and the `other` distribution by
+    `q`. Assuming `p, q` are absolutely continuous with respect to reference
+    measure `r`, the KL divergence is defined as:
+
+    ```none
+    KL[p, q] = E_p[log(p(X)/q(X))]
+             = -int_F p(x) log q(x) dr(x) + int_F p(x) log p(x) dr(x)
+             = H[p, q] - H[p]
+    ```
+
+    where `F` denotes the support of the random variable `X ~ p`, `H[., .]`
+    denotes (Shanon) cross entropy, and `H[.]` denotes (Shanon) entropy.
+
+    Args:
+      other: `tf.distributions.Distribution` instance.
+      name: Python `str` prepended to names of ops created by this function.
+
+    Returns:
+      kl_divergence: `self.dtype` `Tensor` with shape `[B1, ..., Bn]`
+        representing `n` different calculations of the Kullback-Leibler
+        divergence.
+    """
+    with self._name_scope(name):
+      return self._kl_divergence(other)
+
   @contextlib.contextmanager
   def _name_scope(self, name=None, values=None):
     """Helper function to standardize op scope."""
diff --git a/tensorflow/python/ops/distributions/exponential.py b/tensorflow/python/ops/distributions/exponential.py
index 281641b9156b9631199efc78ea1c2d30119dadb8..6345a76d485c64659aa01fa1611cd27426d8c8a5 100644
--- a/tensorflow/python/ops/distributions/exponential.py
+++ b/tensorflow/python/ops/distributions/exponential.py
@@ -27,6 +27,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import gamma
+from tensorflow.python.util.tf_export import tf_export
 
 
 __all__ = [
@@ -35,6 +36,7 @@ __all__ = [
 ]
 
 
+@tf_export("distributions.Exponential")
 class Exponential(gamma.Gamma):
   """Exponential distribution.
 
diff --git a/tensorflow/python/ops/distributions/gamma.py b/tensorflow/python/ops/distributions/gamma.py
index 4ac2b9b4ef894fd9a603ff67bf9c8754f1e23b8e..8fb218be3ac7e17e18d85b8e1c100ccd58aa1034 100644
--- a/tensorflow/python/ops/distributions/gamma.py
+++ b/tensorflow/python/ops/distributions/gamma.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.util.tf_export import tf_export
 
 
 __all__ = [
@@ -41,6 +42,7 @@ __all__ = [
 ]
 
 
+@tf_export("distributions.Gamma")
 class Gamma(distribution.Distribution):
   """Gamma distribution.
 
diff --git a/tensorflow/python/ops/distributions/identity_bijector.py b/tensorflow/python/ops/distributions/identity_bijector.py
index f277eda8bbfb88f2344dfd620c573e0acd8d8078..2972c3554b3639a1ae30a4167f73613b1ff8add2 100644
--- a/tensorflow/python/ops/distributions/identity_bijector.py
+++ b/tensorflow/python/ops/distributions/identity_bijector.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops.distributions import bijector
+from tensorflow.python.util.tf_export import tf_export
 
 
 __all__ = [
@@ -27,6 +28,7 @@ __all__ = [
 ]
 
 
+@tf_export("distributions.bijectors.Identity")
 class Identity(bijector.Bijector):
   """Compute Y = g(X) = X.
 
diff --git a/tensorflow/python/ops/distributions/kullback_leibler.py b/tensorflow/python/ops/distributions/kullback_leibler.py
index a6ab581cc22ce8e9a278bb8e0c7e6afc2dcc30eb..e3c6f3e789eaf57d1fc5a1fcf244c3a0ef2fe0b8 100644
--- a/tensorflow/python/ops/distributions/kullback_leibler.py
+++ b/tensorflow/python/ops/distributions/kullback_leibler.py
@@ -23,6 +23,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.tf_export import tf_export
 
 
 _DIVERGENCES = {}
@@ -50,6 +51,7 @@ def _registered_kl(type_a, type_b):
   return kl_fn
 
 
+@tf_export("distributions.kl_divergence")
 def kl_divergence(distribution_a, distribution_b,
                   allow_nan_stats=True, name=None):
   """Get the KL-divergence KL(distribution_a || distribution_b).
@@ -110,6 +112,39 @@ def kl_divergence(distribution_a, distribution_b,
       return array_ops.identity(kl_t, name="checked_kl")
 
 
+def cross_entropy(ref, other,
+                  allow_nan_stats=True, name=None):
+  """Computes the (Shannon) cross entropy.
+
+  Denote two distributions by `P` (`ref`) and `Q` (`other`). Assuming `P, Q`
+  are absolutely continuous with respect to one another and permit densities
+  `p(x) dr(x)` and `q(x) dr(x)`, (Shanon) cross entropy is defined as:
+
+  ```none
+  H[P, Q] = E_p[-log q(X)] = -int_F p(x) log q(x) dr(x)
+  ```
+
+  where `F` denotes the support of the random variable `X ~ P`.
+
+  Args:
+    ref: `tf.distributions.Distribution` instance.
+    other: `tf.distributions.Distribution` instance.
+    allow_nan_stats: Python `bool`, default `True`. When `True`,
+      statistics (e.g., mean, mode, variance) use the value "`NaN`" to
+      indicate the result is undefined. When `False`, an exception is raised
+      if one or more of the statistic's batch members are undefined.
+    name: Python `str` prepended to names of ops created by this function.
+
+  Returns:
+    cross_entropy: `ref.dtype` `Tensor` with shape `[B1, ..., Bn]`
+      representing `n` different calculations of (Shanon) cross entropy.
+  """
+  with ops.name_scope(name, "cross_entropy"):
+    return ref.entropy() + kl_divergence(
+        ref, other, allow_nan_stats=allow_nan_stats)
+
+
+@tf_export("distributions.RegisterKL")
 class RegisterKL(object):
   """Decorator to register a KL divergence implementation function.
 
diff --git a/tensorflow/python/ops/distributions/laplace.py b/tensorflow/python/ops/distributions/laplace.py
index 5c964ff78a53b6d2dec588b85abff2c5b1173c06..e98ac855c58efa1ef3ccef2de24f329d839bac26 100644
--- a/tensorflow/python/ops/distributions/laplace.py
+++ b/tensorflow/python/ops/distributions/laplace.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import special_math
+from tensorflow.python.util.tf_export import tf_export
 
 
 __all__ = [
@@ -41,6 +42,7 @@ __all__ = [
 ]
 
 
+@tf_export("distributions.Laplace")
 class Laplace(distribution.Distribution):
   """The Laplace distribution with location `loc` and `scale` parameters.
 
diff --git a/tensorflow/python/ops/distributions/multinomial.py b/tensorflow/python/ops/distributions/multinomial.py
index 04762565c2a982f4df47a1a85547db7a104a5ec3..26b5c5aef98fc11b07a8c8357e7ec37819587da9 100644
--- a/tensorflow/python/ops/distributions/multinomial.py
+++ b/tensorflow/python/ops/distributions/multinomial.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.util.tf_export import tf_export
 
 
 __all__ = [
@@ -50,6 +51,7 @@ fractional components, and such that
 with `self.probs` and `self.total_count`."""
 
 
+@tf_export("distributions.Multinomial")
 class Multinomial(distribution.Distribution):
   """Multinomial distribution.
 
diff --git a/tensorflow/python/ops/distributions/normal.py b/tensorflow/python/ops/distributions/normal.py
index 0ef1c91df8c83146fdae086d6056b1d947bae128..e7f120ea2da525e20a1ae42e6418cf2ac83686af 100644
--- a/tensorflow/python/ops/distributions/normal.py
+++ b/tensorflow/python/ops/distributions/normal.py
@@ -32,6 +32,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.ops.distributions import special_math
+from tensorflow.python.util.tf_export import tf_export
 
 
 __all__ = [
@@ -40,6 +41,7 @@ __all__ = [
 ]
 
 
+@tf_export("distributions.Normal")
 class Normal(distribution.Distribution):
   """The Normal distribution with location `loc` and `scale` parameters.
 
diff --git a/tensorflow/python/ops/distributions/student_t.py b/tensorflow/python/ops/distributions/student_t.py
index 073ac4286be170dcfd564f61f1026a85d95c772c..778fefb8c2991153b7e7a1f20df61680153dab2a 100644
--- a/tensorflow/python/ops/distributions/student_t.py
+++ b/tensorflow/python/ops/distributions/student_t.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import special_math_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import util as distribution_util
+from tensorflow.python.util.tf_export import tf_export
 
 
 __all__ = [
@@ -41,6 +42,7 @@ __all__ = [
 ]
 
 
+@tf_export("distributions.StudentT")
 class StudentT(distribution.Distribution):
   """Student's t-distribution.
 
diff --git a/tensorflow/python/ops/distributions/transformed_distribution.py b/tensorflow/python/ops/distributions/transformed_distribution.py
index ba25b2c3485706cc769b8f37118a994e065c1f93..1efcf9d32e9ea9924bb080459efb7015e33ccd54 100644
--- a/tensorflow/python/ops/distributions/transformed_distribution.py
+++ b/tensorflow/python/ops/distributions/transformed_distribution.py
@@ -434,7 +434,7 @@ class TransformedDistribution(distribution_lib.Distribution):
     log_prob = self.distribution.log_prob(x)
     if self._is_maybe_event_override:
       log_prob = math_ops.reduce_sum(log_prob, self._reduce_event_indices)
-    log_prob = ildj + log_prob
+    log_prob += math_ops.cast(ildj, log_prob.dtype)
     if self._is_maybe_event_override:
       log_prob.set_shape(array_ops.broadcast_static_shape(
           y.get_shape().with_rank_at_least(1)[:-1], self.batch_shape))
@@ -457,7 +457,7 @@ class TransformedDistribution(distribution_lib.Distribution):
     prob = self.distribution.prob(x)
     if self._is_maybe_event_override:
       prob = math_ops.reduce_prod(prob, self._reduce_event_indices)
-    prob *= math_ops.exp(ildj)
+    prob *= math_ops.exp(math_ops.cast(ildj, prob.dtype))
     if self._is_maybe_event_override:
       prob.set_shape(array_ops.broadcast_static_shape(
           y.get_shape().with_rank_at_least(1)[:-1], self.batch_shape))
@@ -546,7 +546,9 @@ class TransformedDistribution(distribution_lib.Distribution):
       ], 0)
       entropy = array_ops.tile(entropy, multiples)
     dummy = array_ops.zeros([], self.dtype)
-    entropy -= self.bijector.inverse_log_det_jacobian(dummy)
+    entropy -= math_ops.cast(
+        self.bijector.inverse_log_det_jacobian(dummy),
+        entropy.dtype)
     entropy.set_shape(self.batch_shape)
     return entropy
 
diff --git a/tensorflow/python/ops/distributions/uniform.py b/tensorflow/python/ops/distributions/uniform.py
index 9b555f87eae14fe30ff020f996778a4ad8f98ab9..3580af18f241d777c81340f1c565074914838029 100644
--- a/tensorflow/python/ops/distributions/uniform.py
+++ b/tensorflow/python/ops/distributions/uniform.py
@@ -29,8 +29,10 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("distributions.Uniform")
 class Uniform(distribution.Distribution):
   """Uniform distribution with `low` and `high` parameters.
 
diff --git a/tensorflow/python/ops/distributions/util.py b/tensorflow/python/ops/distributions/util.py
index 41b86f79409aef76dbd710606d09b21f34cab7ba..0a3000ef5ca0decf8aba641e704406b0cf8780af 100644
--- a/tensorflow/python/ops/distributions/util.py
+++ b/tensorflow/python/ops/distributions/util.py
@@ -25,6 +25,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
@@ -699,6 +700,88 @@ def pick_vector(cond,
         [array_ops.where(cond, 0, n)], [array_ops.where(cond, n, -1)])
 
 
+def prefer_static_broadcast_shape(
+    shape1, shape2, name="prefer_static_broadcast_shape"):
+  """Convenience function which statically broadcasts shape when possible.
+
+  Args:
+    shape1:  `1-D` integer `Tensor`.  Already converted to tensor!
+    shape2:  `1-D` integer `Tensor`.  Already converted to tensor!
+    name:  A string name to prepend to created ops.
+
+  Returns:
+    The broadcast shape, either as `TensorShape` (if broadcast can be done
+      statically), or as a `Tensor`.
+  """
+  with ops.name_scope(name, values=[shape1, shape2]):
+    def make_shape_tensor(x):
+      return ops.convert_to_tensor(x, name="shape", dtype=dtypes.int32)
+
+    def get_tensor_shape(s):
+      if isinstance(s, tensor_shape.TensorShape):
+        return s
+      s_ = tensor_util.constant_value(make_shape_tensor(s))
+      if s_ is not None:
+        return tensor_shape.TensorShape(s_)
+      return None
+
+    def get_shape_tensor(s):
+      if not isinstance(s, tensor_shape.TensorShape):
+        return make_shape_tensor(s)
+      if s.is_fully_defined():
+        return make_shape_tensor(s.as_list())
+      raise ValueError("Cannot broadcast from partially "
+                       "defined `TensorShape`.")
+
+    shape1_ = get_tensor_shape(shape1)
+    shape2_ = get_tensor_shape(shape2)
+    if shape1_ is not None and shape2_ is not None:
+      return array_ops.broadcast_static_shape(shape1_, shape2_)
+
+    shape1_ = get_shape_tensor(shape1)
+    shape2_ = get_shape_tensor(shape2)
+    return array_ops.broadcast_dynamic_shape(shape1_, shape2_)
+
+
+def prefer_static_rank(x):
+  """Return static rank of tensor `x` if available, else `tf.rank(x)`.
+
+  Args:
+    x: `Tensor` (already converted).
+
+  Returns:
+    Numpy array (if static rank is obtainable), else `Tensor`.
+  """
+  return prefer_static_value(array_ops.rank(x))
+
+
+def prefer_static_shape(x):
+  """Return static shape of tensor `x` if available, else `tf.shape(x)`.
+
+  Args:
+    x: `Tensor` (already converted).
+
+  Returns:
+    Numpy array (if static shape is obtainable), else `Tensor`.
+  """
+  return prefer_static_value(array_ops.shape(x))
+
+
+def prefer_static_value(x):
+  """Return static value of tensor `x` if available, else `x`.
+
+  Args:
+    x: `Tensor` (already converted).
+
+  Returns:
+    Numpy array (if static value is obtainable), else `Tensor`.
+  """
+  static_x = tensor_util.constant_value(x)
+  if static_x is not None:
+    return static_x
+  return x
+
+
 def gen_new_seed(seed, salt):
   """Generate a new seed, from the given seed and salt."""
   if seed is None:
@@ -751,6 +834,7 @@ def fill_triangular(x, upper=False, name=None):
   """
 
   with ops.name_scope(name, "fill_triangular", values=[x]):
+    x = ops.convert_to_tensor(x, name="x")
     if x.shape.with_rank_at_least(1)[-1].value is not None:
       # Formula derived by solving for n: m = n(n+1)/2.
       m = np.int32(x.shape[-1].value)
@@ -957,14 +1041,14 @@ def reduce_weighted_logsumexp(
   with ops.name_scope(name, "reduce_weighted_logsumexp", [logx, w]):
     logx = ops.convert_to_tensor(logx, name="logx")
     if w is None:
-      lswe = math_ops.reduce_logsumexp(logx, axis=axis, keep_dims=keep_dims)
+      lswe = math_ops.reduce_logsumexp(logx, axis=axis, keepdims=keep_dims)
       if return_sign:
         sgn = array_ops.ones_like(lswe)
         return lswe, sgn
       return lswe
     w = ops.convert_to_tensor(w, dtype=logx.dtype, name="w")
     log_absw_x = logx + math_ops.log(math_ops.abs(w))
-    max_log_absw_x = math_ops.reduce_max(log_absw_x, axis=axis, keep_dims=True)
+    max_log_absw_x = math_ops.reduce_max(log_absw_x, axis=axis, keepdims=True)
     # If the largest element is `-inf` or `inf` then we don't bother subtracting
     # off the max. We do this because otherwise we'd get `inf - inf = NaN`. That
     # this is ok follows from the fact that we're actually free to subtract any
@@ -978,7 +1062,7 @@ def reduce_weighted_logsumexp(
     sum_wx_over_max_absw_x = math_ops.reduce_sum(
         wx_over_max_absw_x,
         axis=axis,
-        keep_dims=keep_dims)
+        keepdims=keep_dims)
     if not keep_dims:
       max_log_absw_x = array_ops.squeeze(max_log_absw_x, axis)
     sgn = math_ops.sign(sum_wx_over_max_absw_x)
@@ -1050,8 +1134,8 @@ def dimension_size(x, axis):
   """Returns the size of a specific dimension."""
   # Since tf.gather isn't "constant-in, constant-out", we must first check the
   # static shape or fallback to dynamic shape.
-  s = x.shape.with_rank_at_least(axis + 1)[axis].value
-  if axis > -1 and s is not None:
+  s = x.shape.with_rank_at_least(np.abs(axis))[axis].value
+  if s is not None:
     return s
   return array_ops.shape(x)[axis]
 
@@ -1096,31 +1180,103 @@ def process_quadrature_grid_and_probs(
     grid = ops.convert_to_tensor(grid, name="grid", dtype=dtype)
     probs = ops.convert_to_tensor(probs, name="unnormalized_probs",
                                   dtype=dtype)
-    probs /= linalg_ops.norm(probs, ord=1, axis=-1, keep_dims=True,
+    probs /= linalg_ops.norm(probs, ord=1, axis=-1, keepdims=True,
                              name="probs")
 
-    def _static_dim_size(x, axis):
+    def _static_event_size(x):
       """Returns the static size of a specific dimension or `None`."""
-      return x.shape.with_rank_at_least(axis + 1)[axis].value
+      return x.shape.with_rank_at_least(1)[-1].value
 
-    m, n = _static_dim_size(probs, axis=0), _static_dim_size(grid, axis=0)
+    m, n = _static_event_size(probs), _static_event_size(grid)
     if m is not None and n is not None:
       if m != n:
         raise ValueError("`quadrature_grid_and_probs` must be a `tuple` of "
                          "same-length zero-th-dimension `Tensor`s "
                          "(saw lengths {}, {})".format(m, n))
     elif validate_args:
-      grid = control_flow_ops.with_dependencies([
+      assertions = [
           check_ops.assert_equal(
-              dimension_size(probs, axis=0),
-              dimension_size(grid, axis=0),
+              dimension_size(probs, axis=-1),
+              dimension_size(grid, axis=-1),
               message=("`quadrature_grid_and_probs` must be a `tuple` of "
                        "same-length zero-th-dimension `Tensor`s")),
-      ], grid)
-
+      ]
+      with ops.control_dependencies(assertions):
+        grid = array_ops.identity(grid)
+        probs = array_ops.identity(probs)
     return grid, probs
 
 
+def pad(x, axis, front=False, back=False, value=0, count=1, name=None):
+  """Pads `value` to the front and/or back of a `Tensor` dim, `count` times.
+
+  Args:
+    x: `Tensor` input.
+    axis: Scalar `int`-like `Tensor` representing the single dimension to pad.
+      (Negative indexing is supported.)
+    front: Python `bool`; if `True` the beginning of the `axis` dimension is
+      padded with `value`, `count` times. If `False` no front padding is made.
+    back: Python `bool`; if `True` the end of the `axis` dimension is
+      padded with `value`, `count` times. If `False` no end padding is made.
+    value: Scalar `int`-like `Tensor` representing the actual value added to the
+      front and/or back of the `axis` dimension of `x`.
+    count: Scalar `int`-like `Tensor` representing number of elements added to
+      the front and/or back of the `axis` dimension of `x`. E.g., if
+      `front = back = True` then `2 * count` elements are added.
+    name: Python `str` name prefixed to Ops created by this function.
+
+  Returns:
+    pad: The padded version of input `x`.
+
+  Raises:
+    ValueError: if both `front` and `back` are `False`.
+    TypeError: if `count` is not `int`-like.
+  """
+  with ops.name_scope(name, "pad", [x, value, count]):
+    x = ops.convert_to_tensor(x, name="x")
+    value = ops.convert_to_tensor(value, dtype=x.dtype, name="value")
+    count = ops.convert_to_tensor(count, name="count")
+    if not count.dtype.is_integer:
+      raise TypeError("`count.dtype` (`{}`) must be `int`-like.".format(
+          count.dtype.name))
+    if not front and not back:
+      raise ValueError("At least one of `front`, `back` must be `True`.")
+    ndims = (x.shape.ndims if x.shape.ndims is not None
+             else array_ops.rank(x, name="ndims"))
+    axis = ops.convert_to_tensor(axis, name="axis")
+    axis_ = tensor_util.constant_value(axis)
+    if axis_ is not None:
+      axis = axis_
+      if axis < 0:
+        axis = ndims + axis
+      count_ = tensor_util.constant_value(count)
+      if axis_ >= 0 or x.shape.ndims is not None:
+        head = x.shape[:axis]
+        middle = tensor_shape.TensorShape(
+            None if count_ is None
+            else (x.shape[axis] + count_ * (front + back)))
+        tail = x.shape[axis+1:]
+        final_shape = head.concatenate(middle.concatenate(tail))
+      else:
+        final_shape = None
+    else:
+      axis = array_ops.where(axis < 0, ndims + axis, axis)
+      final_shape = None
+    x = array_ops.pad(
+        x,
+        paddings=array_ops.one_hot(
+            indices=array_ops.stack([axis if front else -1,
+                                     axis if back else -1]),
+            depth=ndims,
+            axis=0,
+            on_value=count,
+            dtype=dtypes.int32),
+        constant_values=value)
+    if final_shape is not None:
+      x.set_shape(final_shape)
+    return x
+
+
 class AppendDocstring(object):
   """Helper class to promote private subclass docstring to public counterpart.
 
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index f4561d1a830141a069c12ddb33b83744363844f2..3826585f59c31133b12c365816729e090c9ab561 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -32,6 +32,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _gather(params, ids, name=None):
@@ -257,6 +258,7 @@ def _embedding_lookup_and_transform(params,
       return ret
 
 
+@tf_export("nn.embedding_lookup")
 def embedding_lookup(
     params,
     ids,
@@ -325,6 +327,7 @@ def embedding_lookup(
       transform_fn=None)
 
 
+@tf_export("nn.embedding_lookup_sparse")
 def embedding_lookup_sparse(params,
                             sp_ids,
                             sp_weights,
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index 688512bea6b274eed2823794f017e14eb4f128f5..ac03d30fcd2e65f032937d9259bc8fff18626619 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -44,9 +44,11 @@ from tensorflow.python.ops.gen_functional_ops import *
 from tensorflow.python.ops.gen_functional_ops import _symbolic_gradient
 # pylint: enable=unused-import
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 # TODO(yuanbyu, mrry): Handle stride to support sliding windows.
+@tf_export("foldl")
 def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
           swap_memory=False, name=None):
   """foldl on the list of tensors unpacked from `elems` on dimension 0.
@@ -134,6 +136,7 @@ def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
     return r_a
 
 
+@tf_export("foldr")
 def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
           swap_memory=False, name=None):
   """foldr on the list of tensors unpacked from `elems` on dimension 0.
@@ -221,6 +224,7 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
     return r_a
 
 
+@tf_export("map_fn")
 def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True,
            swap_memory=False, infer_shape=True, name=None):
   """map on the list of tensors unpacked from `elems` on dimension 0.
@@ -424,6 +428,7 @@ def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True,
     return output_pack(results_flat)
 
 
+@tf_export("scan")
 def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
          swap_memory=False, infer_shape=True, name=None):
   """scan on the list of tensors unpacked from `elems` on dimension 0.
@@ -453,7 +458,7 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
 
   For example, if `elems` is `(t1, [t2, t3])` and `initializer` is
   `[i1, i2]` then an appropriate signature for `fn` in `python2` is:
-  `fn = lambda (acc_p1, acc_p2), (t1 [t2, t3]):` and `fn` must return a list,
+  `fn = lambda (acc_p1, acc_p2), (t1, [t2, t3]):` and `fn` must return a list,
   `[acc_n1, acc_n2]`.  An alternative correct signature for `fn`, and the
    one that works in `python3`, is:
   `fn = lambda a, t:`, where `a` and `t` correspond to the input tuples.
diff --git a/tensorflow/python/ops/gradient_checker.py b/tensorflow/python/ops/gradient_checker.py
index 1ff196805507f0ca7a1123df0d2a37925fc3e503..12afcd0b517d5e85112c067ccaca5693e5a4e231 100644
--- a/tensorflow/python/ops/gradient_checker.py
+++ b/tensorflow/python/ops/gradient_checker.py
@@ -29,7 +29,9 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _product(t):
@@ -151,6 +153,15 @@ def _compute_numeric_jacobian(x, x_shape, x_data, y, y_shape, delta,
     and "y_size" columns where "x_size" is the number of elements in x and
     "y_size" is the number of elements in y.
   """
+  # bfloat16 doesn't have enough bits to represent high precision numbers such
+  # as delta. Convert to float32 here. Since numeric_jacobian is expected to
+  # be the groundtruth to compare against, it shouldn't lose any information.
+  if x.dtype == dtypes.bfloat16:
+    x = math_ops.cast(x, dtypes.float32)
+  if y.dtype == dtypes.bfloat16:
+    y = math_ops.cast(y, dtypes.float32)
+  if x_data.dtype == dtypes.bfloat16.as_numpy_dtype:
+    x_data = x_data.astype(np.float32)
 
   # To compute the jacobian, we treat x and y as one-dimensional vectors
   x_size = _product(x_shape) * (2 if x.dtype.is_complex else 1)
@@ -181,7 +192,7 @@ def _compute_numeric_jacobian(x, x_shape, x_data, y, y_shape, delta,
 
 
 def _compute_dx_and_dy(x, y, y_shape):
-  """Returns a node to compute gradient of x wrt y."""
+  """Returns a node to compute gradient of y wrt x."""
   # We make up a dy so that we can compute the gradients. We don't really use
   # the value of dy -- we will always feed it. We need to add an identity node
   # so that we can always feed it properly. Otherwise, for the Add operation,
@@ -189,7 +200,7 @@ def _compute_dx_and_dy(x, y, y_shape):
   with x.graph.as_default():
     dy_orig = constant_op.constant(1.0, shape=y_shape, dtype=y.dtype)
     dy = array_ops.identity(dy_orig)
-  # We compute the gradients for x wrt. y
+  # We compute the gradients for y wrt. x
   grads = gradients.gradients(y, x, dy)
   assert len(grads) == 1
   return grads[0], dy_orig
@@ -206,8 +217,8 @@ def _compute_gradient(x,
                       extra_feed_dict=None):
   """Computes the theoretical and numerical jacobian."""
   t = dtypes.as_dtype(x.dtype)
-  allowed_types = [dtypes.float16, dtypes.float32, dtypes.float64,
-                   dtypes.complex64, dtypes.complex128]
+  allowed_types = [dtypes.float16, dtypes.bfloat16, dtypes.float32,
+                   dtypes.float64, dtypes.complex64, dtypes.complex128]
   assert t.base_dtype in allowed_types, "Don't support type %s for x" % t.name
   t2 = dtypes.as_dtype(y.dtype)
   assert t2.base_dtype in allowed_types, "Don't support type %s for y" % t2.name
@@ -254,6 +265,7 @@ def _compute_gradient_list(x,
   return ret
 
 
+@tf_export("test.compute_gradient")
 def compute_gradient(x,
                      x_shape,
                      y,
@@ -315,6 +327,7 @@ def compute_gradient(x,
     return ret
 
 
+@tf_export("test.compute_gradient_error")
 def compute_gradient_error(x,
                            x_shape,
                            y,
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 8d00a3c6ab2fdfff53b7e9659710659265cedc65..1418c0b10fb60601e7c3024891b89aadb53e6873 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -35,21 +35,23 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import check_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import control_flow_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import image_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import linalg_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import linalg_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import logging_ops  # pylint: disable=unused-import
+from tensorflow.python.ops import manip_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import spectral_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.platform import tf_logging as logging
-
+from tensorflow.python.util.tf_export import tf_export
 
 # Warn the user if we convert a sparse representation to dense with at
 # least this number of elements.
@@ -233,9 +235,10 @@ def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
           raise TypeError(
               "Gradients of complex tensors must set grad_ys (y.dtype = %r)" %
               y.dtype)
-        new_grad_ys.append(array_ops.fill(
-            array_ops.shape(y), constant_op.constant(
-                1, dtype=y.dtype, name="grad_ys_%d" % i)))
+        new_grad_ys.append(
+            array_ops.fill(
+                array_ops.shape(y),
+                constant_op.constant(1, dtype=y.dtype, name="grad_ys_%d" % i)))
         continue
       if y.dtype.is_floating or y.dtype.is_integer:
         if not grad_y.dtype.is_floating and not grad_y.dtype.is_integer:
@@ -393,6 +396,7 @@ def _MaybeCompile(scope, op, func, grad_fn):
     return grad_fn()
 
 
+@tf_export("gradients")
 def gradients(ys,
               xs,
               grad_ys=None,
@@ -489,11 +493,12 @@ def gradients(ys,
       name, "gradients",
       list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope:
     ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
-    xs = [x.handle if isinstance(x, resource_variable_ops.ResourceVariable)
-          else x
-          for x in xs]
-    xs = ops.internal_convert_n_to_tensor_or_indexed_slices(xs, name="x",
-                                                            as_ref=True)
+    xs = [
+        x.handle if resource_variable_ops.is_resource_variable(x) else x
+        for x in xs
+    ]
+    xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
+        xs, name="x", as_ref=True)
     grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)
 
     # The approach we take here is as follows: Create a list of all ops in the
@@ -510,9 +515,8 @@ def gradients(ys,
     to_ops = [t.op for t in ys]
     from_ops = [t.op for t in xs]
     stop_gradient_ops = [t.op for t in stop_gradients]
-    pending_count, loop_state = _PendingCount(ops.get_default_graph(), to_ops,
-                                              from_ops,
-                                              colocate_gradients_with_ops)
+    pending_count, loop_state = _PendingCount(
+        ops.get_default_graph(), to_ops, from_ops, colocate_gradients_with_ops)
 
     # Iterate over the collected ops.
     #
@@ -585,9 +589,8 @@ def gradients(ys,
           # output, it means that the cost does not depend on output[i],
           # therefore dC/doutput[i] is 0.
           for i, out_grad in enumerate(out_grads):
-            if (not isinstance(out_grad, ops.Tensor) and
-                not out_grad) and ((not grad_fn and is_func_call) or
-                                   _IsTrainable(op.outputs[i])):
+            if (not isinstance(out_grad, ops.Tensor) and not out_grad) and (
+                (not grad_fn and is_func_call) or _IsTrainable(op.outputs[i])):
               # Only trainable outputs or outputs for a function call that
               # will use SymbolicGradient get a zero gradient. Gradient
               # functions should ignore the gradient for other outputs.
@@ -604,17 +607,17 @@ def gradients(ys,
               if grad_fn:
                 # If grad_fn was found, do not use SymbolicGradient even for
                 # functions.
-                in_grads = _MaybeCompile(
-                    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
+                in_grads = _MaybeCompile(grad_scope, op, func_call,
+                                         lambda: grad_fn(op, *out_grads))
               else:
                 # For function call ops, we add a 'SymbolicGradient'
                 # node to the graph to compute gradients.
-                in_grads = _MaybeCompile(
-                    grad_scope, op, func_call, lambda: _SymGrad(op, out_grads))
+                in_grads = _MaybeCompile(grad_scope, op, func_call,
+                                         lambda: _SymGrad(op, out_grads))
               in_grads = _AsList(in_grads)
               _VerifyGeneratedGradients(in_grads, op)
-              if gate_gradients and len(
-                  [x for x in in_grads if x is not None]) > 1:
+              if gate_gradients and len([x for x in in_grads
+                                         if x is not None]) > 1:
                 with ops.device(None):
                   with ops.colocate_with(None, ignore_existing=True):
                     in_grads = control_flow_ops.tuple(in_grads)
@@ -634,8 +637,8 @@ def gradients(ys,
                     "Incompatible shapes between op input and calculated "
                     "input gradient.  Forward operation: %s.  Input index: %d. "
                     "Original input shape: %s.  "
-                    "Calculated input gradient shape: %s"
-                    % (op.name, i, t_in.shape, in_grad.shape))
+                    "Calculated input gradient shape: %s" %
+                    (op.name, i, t_in.shape, in_grad.shape))
             _SetGrad(grads, t_in, in_grad)
         if loop_state:
           loop_state.ExitGradWhileContext(op, before=False)
@@ -667,11 +670,11 @@ def _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state):
     pending_count[x.op._id] -= 1
     ready = (pending_count[x.op._id] == 0)
     if loop_state and not ready:
-      ready = (pending_count[x.op._id] > 0 and
-               control_flow_ops.IsLoopSwitch(x.op))
+      ready = (
+          pending_count[x.op._id] > 0 and control_flow_util.IsLoopSwitch(x.op))
     # pylint: enable=protected-access
     if ready:
-      if control_flow_ops.IsLoopExit(x.op):
+      if control_flow_util.IsLoopExit(x.op):
         # if x is an exit without real gradient, defer processing them.
         grad_state = loop_state.GetGradState(x.op, before=False)
         grad_state.deferred_exits.append(x)
@@ -711,7 +714,7 @@ def _SetGrad(grads, t, grad):
   if isinstance(t_grads, list):
     t_grads.append(grad)
   else:
-    assert control_flow_ops.IsLoopSwitch(op)
+    assert control_flow_util.IsLoopSwitch(op)
     op_grads[t.value_index] = grad
 
 
@@ -722,8 +725,8 @@ def _GetGrad(grads, t):
   if not op_grads:
     return None
   t_grad = op_grads[t.value_index]
-  assert not isinstance(t_grad, list), (
-      "gradients list should have been aggregated by now.")
+  assert not isinstance(
+      t_grad, list), ("gradients list should have been aggregated by now.")
   return t_grad
 
 
@@ -742,9 +745,8 @@ def _HandleNestedIndexedSlices(grad):
   else:
     assert isinstance(grad.values, ops.IndexedSlices)
     g = _HandleNestedIndexedSlices(grad.values)
-    return ops.IndexedSlices(g.values,
-                             array_ops.gather(grad.indices, g.indices),
-                             g.dense_shape)
+    return ops.IndexedSlices(g.values, array_ops.gather(
+        grad.indices, g.indices), g.dense_shape)
 
 
 def _AccumulatorShape(inputs):
@@ -798,6 +800,7 @@ def _MultiDeviceAddN(tensor_list):
   return math_ops.add_n(summands)
 
 
+@tf_export("AggregationMethod")
 class AggregationMethod(object):
   """A class listing aggregation methods used to combine gradients.
 
@@ -845,17 +848,18 @@ def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
       AggregationMethod.ADD_N, AggregationMethod.EXPERIMENTAL_TREE,
       AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
   ]:
-    raise ValueError("Invalid aggregation_method specified %s." %
-                     aggregation_method)
+    raise ValueError(
+        "Invalid aggregation_method specified %s." % aggregation_method)
   out_grads = _GetGrads(grads, op)
   for i, out_grad in enumerate(out_grads):
     if loop_state:
       if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)):
-        assert control_flow_ops.IsLoopSwitch(op)
+        assert control_flow_util.IsLoopSwitch(op)
         continue
     # Grads have to be Tensors or IndexedSlices
     if (isinstance(out_grad, collections.Sequence) and not all([
-        isinstance(g, (ops.Tensor, ops.IndexedSlices)) for g in out_grad
+        isinstance(g, (ops.Tensor, ops.IndexedSlices))
+        for g in out_grad
         if g is not None
     ])):
       raise TypeError("gradients have to be either all Tensors "
@@ -899,8 +903,8 @@ def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
         else:
           used = "add_n"
           out_grads[i] = _MultiDeviceAddN(out_grad)
-        logging.vlog(2, "  _AggregatedGrads %d x %s using %s",
-                     len(out_grad), tensor_shape, used)
+        logging.vlog(2, "  _AggregatedGrads %d x %s using %s", len(out_grad),
+                     tensor_shape, used)
       else:
         out_grad = math_ops._as_indexed_slices_list(
             [g for g in out_grad if g is not None])
@@ -963,22 +967,26 @@ def _hessian_vector_product(ys, xs, v):
   assert len(grads) == length
   elemwise_products = [
       math_ops.multiply(grad_elem, array_ops.stop_gradient(v_elem))
-      for grad_elem, v_elem in zip(grads, v) if grad_elem is not None
+      for grad_elem, v_elem in zip(grads, v)
+      if grad_elem is not None
   ]
 
   # Second backprop
   return gradients(elemwise_products, xs)
 
 
-def hessians(ys, xs, name="hessians", colocate_gradients_with_ops=False,
-            gate_gradients=False, aggregation_method=None):
+@tf_export("hessians")
+def hessians(ys,
+             xs,
+             name="hessians",
+             colocate_gradients_with_ops=False,
+             gate_gradients=False,
+             aggregation_method=None):
   """Constructs the Hessian of sum of `ys` with respect to `x` in `xs`.
 
   `hessians()` adds ops to the graph to output the Hessian matrix of `ys`
   with respect to `xs`.  It returns a list of `Tensor` of length `len(xs)`
-  where each tensor is the Hessian of `sum(ys)`. This function currently
-  only supports evaluating the Hessian with respect to (a list of) one-
-  dimensional tensors.
+  where each tensor is the Hessian of `sum(ys)`.
 
   The Hessian is a matrix of second-order partial derivatives of a scalar
   tensor (see https://en.wikipedia.org/wiki/Hessian_matrix for more details).
@@ -1001,34 +1009,34 @@ def hessians(ys, xs, name="hessians", colocate_gradients_with_ops=False,
   """
   xs = _AsList(xs)
   kwargs = {
-      'colocate_gradients_with_ops': colocate_gradients_with_ops,
-      'gate_gradients': gate_gradients,
-      'aggregation_method': aggregation_method
-    }
+      "colocate_gradients_with_ops": colocate_gradients_with_ops,
+      "gate_gradients": gate_gradients,
+      "aggregation_method": aggregation_method
+  }
   # Compute first-order derivatives and iterate for each x in xs.
   hessians = []
   _gradients = gradients(ys, xs, **kwargs)
-  for i, _gradient, x in zip(range(len(xs)), _gradients, xs):
-    # Ensure that x is a vector.
-    check_rank = check_ops.assert_rank(
-      x, 1, message='Cannot compute Hessian because element %d of `xs` does '
-      'not have rank one.' % i
-    )
-    with ops.control_dependencies([check_rank]):
-      # Declare an iterator and tensor array loop variables for the gradients.
-      n = array_ops.size(x)
-      loop_vars = [
+  for gradient, x in zip(_gradients, xs):
+    # change shape to one-dimension without graph branching
+    gradient = array_ops.reshape(gradient, [-1])
+
+    # Declare an iterator and tensor array loop variables for the gradients.
+    n = array_ops.size(x)
+    loop_vars = [
         array_ops.constant(0, dtypes.int32),
         tensor_array_ops.TensorArray(x.dtype, n)
-      ]
-      # Iterate over all elements of the gradient and compute second order
-      # derivatives.
-      _, hessian = control_flow_ops.while_loop(
-          lambda j, _: j < n,
-          lambda j, result: (j + 1,
-                             result.write(j, gradients(_gradient[j], x)[0])),
-          loop_vars
-      )
-
-      hessians.append(hessian.stack())
+    ]
+    # Iterate over all elements of the gradient and compute second order
+    # derivatives.
+    _, hessian = control_flow_ops.while_loop(
+        lambda j, _: j < n,
+        lambda j, result: (j + 1,
+                           result.write(j, gradients(gradient[j], x)[0])),
+        loop_vars
+    )
+
+    _shape = array_ops.shape(x)
+    _reshaped_hessian = array_ops.reshape(hessian.stack(),
+                                          array_ops.concat((_shape, _shape), 0))
+    hessians.append(_reshaped_hessian)
   return hessians
diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py
index dacc2947fe31b0cbe81f6acacd52fb4a74719090..d39b934819177e3c15af95a0777ba96869c5e9cf 100644
--- a/tensorflow/python/ops/gradients_test.py
+++ b/tensorflow/python/ops/gradients_test.py
@@ -573,9 +573,7 @@ class HessianVectorProductTest(test_util.TensorFlowTestCase):
       self.assertAllClose(hess_v_value, hess_v_actual)
 
 
-# TODO(skyewm): reenable C API once
-# ControlFlowContext._RemoveExternalControlEdges works with C API enabled
-# @test_util.with_c_api
+@test_util.with_c_api
 class HessianTest(test_util.TensorFlowTestCase):
 
   def testHessian1D(self):
@@ -623,6 +621,45 @@ class HessianTest(test_util.TensorFlowTestCase):
         with self.assertRaises(ValueError):
           gradients.hessians(x, x)
 
+  def testHessian2D_square_matrix(self):
+    # Manually compute the Hessian explicitly for a low-dimensional problem
+    # and check that `hessian` matches. Specifically, the Hessian of
+    # f(x) = 1/2 * x^T * x is H = constant (block identity matrix)
+    m = 3
+    rng = np.random.RandomState([1, 2, 3])
+    x_value = rng.randn(m, m).astype("float32")
+    with self.test_session(use_gpu=True):
+      x = constant_op.constant(x_value)
+      x_square = math_ops.reduce_sum(
+          math_ops.matmul(array_ops.transpose(x), x) * 0.5
+      )
+      hess = gradients.hessians(x_square, x)[0]
+      hess_actual = hess.eval()
+    hess_value = np.bmat([
+        [elem*np.ones((m, m)) for elem in vec]
+        for vec in np.eye(m)
+    ]).astype("float32")
+    self.assertAllEqual((m, m, m, m), hess_actual.shape)
+    self.assertAllClose(hess_value, hess_actual.reshape((m * m, m * m)))
+
+  def testHessian2D_non_square_matrix(self):
+    m = 3
+    n = 4
+    rng = np.random.RandomState([1, 2, 3])
+    x_value = rng.randn(m, n).astype("float32")
+    with self.test_session(use_gpu=True):
+      x = constant_op.constant(x_value)
+      x_square = math_ops.reduce_sum(
+          math_ops.matmul(array_ops.transpose(x), x) * 0.5
+      )
+      hess = gradients.hessians(x_square, x)[0]
+      hess_actual = hess.eval()
+    hess_value = np.bmat([
+        [elem*np.ones((n, n)) for elem in vec]
+        for vec in np.eye(m)
+    ]).astype("float32")
+    self.assertAllEqual((m, n, m, n), hess_actual.shape)
+    self.assertAllClose(hess_value, hess_actual.reshape((m * n, m * n)))
 
 @test_util.with_c_api
 class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
@@ -667,8 +704,8 @@ class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
   def testWarnings(self):
     # TODO(gunan) Reenable after this issue is fixed:
     # https://github.com/google/protobuf/issues/2812
-    if sys.version_info >= (3, 6):
-      self.skipTest("Skipped test for Python 3.6+")
+    if sys.version_info >= (3, 5):
+      self.skipTest("Skipped test for Python 3.5+")
 
     # Smaller than the threshold: no warning.
     c_sparse = ops.IndexedSlices(
diff --git a/tensorflow/python/ops/hidden_ops.txt b/tensorflow/python/ops/hidden_ops.txt
index f834d9002c3e14451bdf2de31cf3c1505e39be4b..f6ef6f3f3da4389a16a84fa0b3570d3cd1262472 100644
--- a/tensorflow/python/ops/hidden_ops.txt
+++ b/tensorflow/python/ops/hidden_ops.txt
@@ -8,6 +8,7 @@ ConcatV2
 ConjugateTranspose
 Const
 DebugGradientIdentity
+DebugGradientRefIdentity
 EditDistance
 ExpandDims
 ListDiff
@@ -21,6 +22,7 @@ ParallelConcat
 Placeholder
 RefIdentity
 Reverse
+Snapshot
 SpaceToBatch
 Split
 SplitV
@@ -28,6 +30,8 @@ Squeeze
 Slice
 TileGrad  # Exported through array_grad instead of array_ops.
 ZerosLike  # TODO(josh11b): Use this instead of the Python version.
+Unique
+UniqueV2
 Unpack
 
 # candidate_sampling_ops
@@ -224,6 +228,7 @@ BatchSelfAdjointEigV2
 BatchSvd
 LogMatrixDeterminant
 MatrixExponential
+MatrixLogarithm
 MatrixSolveLs
 SelfAdjointEig
 SelfAdjointEigV2
@@ -341,6 +346,7 @@ TruncatedNormal
 # script_ops
 PyFunc
 PyFuncStateless
+EagerPyFunc
 
 # sdca_ops
 
@@ -354,8 +360,8 @@ DestroyTemporaryVariable
 AddSparseToTensorsMap
 AddManySparseToTensorsMap
 TakeManySparseFromTensorsMap
-DeserializeSparse
 DeserializeManySparse
+DeserializeSparse
 SerializeManySparse
 SerializeSparse
 SparseAdd
diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py
index 51e4be9343abc6ad68786e05e9cdf87ea48e3d00..6a975160b0698270dfc9ce9140e8b3ff633cdb9e 100644
--- a/tensorflow/python/ops/histogram_ops.py
+++ b/tensorflow/python/ops/histogram_ops.py
@@ -17,6 +17,7 @@
 
 Please see @{$python/histogram_ops} guide.
 
+@@histogram_fixed_width_bins
 @@histogram_fixed_width
 """
 
@@ -30,8 +31,77 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('histogram_fixed_width_bins')
+def histogram_fixed_width_bins(values,
+                               value_range,
+                               nbins=100,
+                               dtype=dtypes.int32,
+                               name=None):
+  """Bins the given values for use in a histogram.
+
+  Given the tensor `values`, this operation returns a rank 1 `Tensor`
+  representing the indices of a histogram into which each element
+  of `values` would be binned. The bins are equal width and
+  determined by the arguments `value_range` and `nbins`.
+
+  Args:
+    values:  Numeric `Tensor`.
+    value_range:  Shape [2] `Tensor` of same `dtype` as `values`.
+      values <= value_range[0] will be mapped to hist[0],
+      values >= value_range[1] will be mapped to hist[-1].
+    nbins:  Scalar `int32 Tensor`.  Number of histogram bins.
+    dtype:  dtype for returned histogram.
+    name:  A name for this operation (defaults to 'histogram_fixed_width').
+
+  Returns:
+    A `Tensor` holding the indices of the binned values whose shape matches
+    `values`.
+
+  Examples:
+
+  ```python
+  # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
+  nbins = 5
+  value_range = [0.0, 5.0]
+  new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
+
+  with tf.get_default_session() as sess:
+    indices = tf.histogram_fixed_width_bins(new_values, value_range, nbins=5)
+    variables.global_variables_initializer().run()
+    sess.run(indices) => [0, 0, 1, 2, 4]
+  ```
+  """
+  with ops.name_scope(name, 'histogram_fixed_width_bins',
+                      [values, value_range, nbins]):
+    values = ops.convert_to_tensor(values, name='values')
+    shape = array_ops.shape(values)
+
+    values = array_ops.reshape(values, [-1])
+    value_range = ops.convert_to_tensor(value_range, name='value_range')
+    nbins = ops.convert_to_tensor(nbins, dtype=dtypes.int32, name='nbins')
+    nbins_float = math_ops.cast(nbins, values.dtype)
+
+    # Map tensor values that fall within value_range to [0, 1].
+    scaled_values = math_ops.truediv(
+        values - value_range[0],
+        value_range[1] - value_range[0],
+        name='scaled_values')
+
+    # map tensor values within the open interval value_range to {0,.., nbins-1},
+    # values outside the open interval will be zero or less, or nbins or more.
+    indices = math_ops.floor(nbins_float * scaled_values, name='indices')
+
+    # Clip edge cases (e.g. value = value_range[1]) or "outliers."
+    indices = math_ops.cast(
+        clip_ops.clip_by_value(indices, 0, nbins_float - 1), dtypes.int32)
+    return array_ops.reshape(indices, shape)
+
+
+@tf_export('histogram_fixed_width')
 def histogram_fixed_width(values,
                           value_range,
                           nbins=100,
@@ -71,5 +141,5 @@ def histogram_fixed_width(values,
   """
   with ops.name_scope(name, 'histogram_fixed_width',
                       [values, value_range, nbins]) as name:
-    return gen_math_ops._histogram_fixed_width(values, value_range, nbins,
-                                               dtype=dtype, name=name)
+    return gen_math_ops._histogram_fixed_width(  # pylint: disable=protected-access
+        values, value_range, nbins, dtype=dtype, name=name)
diff --git a/tensorflow/python/ops/histogram_ops_test.py b/tensorflow/python/ops/histogram_ops_test.py
index 19ad6cd2ba2b8278656a33a331995336037db356..a226ac81bb536934cd191872ffc1aca84925abc0 100644
--- a/tensorflow/python/ops/histogram_ops_test.py
+++ b/tensorflow/python/ops/histogram_ops_test.py
@@ -21,11 +21,64 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import histogram_ops
 from tensorflow.python.platform import test
 
 
+class BinValuesFixedWidth(test.TestCase):
+
+  def test_empty_input_gives_all_zero_counts(self):
+    # Bins will be:
+    #   (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
+    value_range = [0.0, 5.0]
+    values = []
+    expected_bins = []
+    with self.test_session():
+      bins = histogram_ops.histogram_fixed_width_bins(
+          values, value_range, nbins=5)
+      self.assertEqual(dtypes.int32, bins.dtype)
+      self.assertAllClose(expected_bins, bins.eval())
+
+  def test_1d_values_int32_output(self):
+    # Bins will be:
+    #   (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
+    value_range = [0.0, 5.0]
+    values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
+    expected_bins = [0, 0, 1, 2, 4, 4]
+    with self.test_session():
+      bins = histogram_ops.histogram_fixed_width_bins(
+          values, value_range, nbins=5, dtype=dtypes.int64)
+      self.assertEqual(dtypes.int32, bins.dtype)
+      self.assertAllClose(expected_bins, bins.eval())
+
+  def test_1d_float64_values_int32_output(self):
+    # Bins will be:
+    #   (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
+    value_range = np.float64([0.0, 5.0])
+    values = np.float64([-1.0, 0.0, 1.5, 2.0, 5.0, 15])
+    expected_bins = [0, 0, 1, 2, 4, 4]
+    with self.test_session():
+      bins = histogram_ops.histogram_fixed_width_bins(
+          values, value_range, nbins=5)
+      self.assertEqual(dtypes.int32, bins.dtype)
+      self.assertAllClose(expected_bins, bins.eval())
+
+  def test_2d_values(self):
+    # Bins will be:
+    #   (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
+    value_range = [0.0, 5.0]
+    values = constant_op.constant(
+        [[-1.0, 0.0, 1.5], [2.0, 5.0, 15]], shape=(2, 3))
+    expected_bins = [[0, 0, 1], [2, 4, 4]]
+    with self.test_session():
+      bins = histogram_ops.histogram_fixed_width_bins(
+          values, value_range, nbins=5)
+      self.assertEqual(dtypes.int32, bins.dtype)
+      self.assertAllClose(expected_bins, bins.eval())
+
+
 class HistogramFixedWidthTest(test.TestCase):
 
   def setUp(self):
@@ -87,8 +140,8 @@ class HistogramFixedWidthTest(test.TestCase):
       self.assertEqual(dtypes.int32, hist.dtype)
       self.assertAllClose(expected_bin_counts, hist.eval())
 
-      hist = histogram_ops.histogram_fixed_width(values, value_range,
-                                                 nbins=placeholder)
+      hist = histogram_ops.histogram_fixed_width(
+          values, value_range, nbins=placeholder)
       self.assertEquals(hist.shape.ndims, 1)
       self.assertIs(hist.shape[0].value, None)
       self.assertEqual(dtypes.int32, hist.dtype)
diff --git a/tensorflow/python/ops/image_ops.py b/tensorflow/python/ops/image_ops.py
index 3b0b5a978c9f79dca9b87d3a7b6478b63e1fcb8d..ae52d32fea1c872e588c4122f5e73198e4dfe9ad 100644
--- a/tensorflow/python/ops/image_ops.py
+++ b/tensorflow/python/ops/image_ops.py
@@ -26,6 +26,7 @@ See the @{$python/image} guide.
 @@extract_jpeg_shape
 @@decode_png
 @@encode_png
+@@is_jpeg
 @@decode_image
 @@resize_images
 @@resize_area
@@ -49,6 +50,10 @@ See the @{$python/image} guide.
 @@grayscale_to_rgb
 @@hsv_to_rgb
 @@rgb_to_hsv
+@@rgb_to_yiq
+@@yiq_to_rgb
+@@rgb_to_yuv
+@@yuv_to_rgb
 @@convert_image_dtype
 @@adjust_brightness
 @@random_brightness
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index b9c89d62d556633100770d75cce89c63eeeb19ad..0c0e92d5b00b36f2fbd800afc046faa1fc77b95c 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -12,15 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Implementation of image ops."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -28,7 +25,6 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_image_ops
 from tensorflow.python.ops import gen_nn_ops
@@ -36,7 +32,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variables
-
+from tensorflow.python.util.tf_export import tf_export
 
 ops.NotDifferentiable('RandomCrop')
 # TODO(b/31222613): This op may be differentiable, and there may be
@@ -109,8 +105,9 @@ def _ImageDimensions(image, rank):
   else:
     static_shape = image.get_shape().with_rank(rank).as_list()
     dynamic_shape = array_ops.unstack(array_ops.shape(image), rank)
-    return [s if s is not None else d
-            for s, d in zip(static_shape, dynamic_shape)]
+    return [
+        s if s is not None else d for s, d in zip(static_shape, dynamic_shape)
+    ]
 
 
 def _Check3DImage(image, require_static=True):
@@ -131,22 +128,65 @@ def _Check3DImage(image, require_static=True):
   try:
     image_shape = image.get_shape().with_rank(3)
   except ValueError:
-    raise ValueError("'image' (shape %s) must be three-dimensional." %
-                     image.shape)
+    raise ValueError(
+        "'image' (shape %s) must be three-dimensional." % image.shape)
   if require_static and not image_shape.is_fully_defined():
-    raise ValueError("'image' (shape %s) must be fully defined." %
-                     image_shape)
+    raise ValueError("'image' (shape %s) must be fully defined." % image_shape)
   if any(x == 0 for x in image_shape):
-    raise ValueError("all dims of 'image.shape' must be > 0: %s" %
-                     image_shape)
+    raise ValueError("all dims of 'image.shape' must be > 0: %s" % image_shape)
   if not image_shape.is_fully_defined():
-    return [check_ops.assert_positive(array_ops.shape(image),
-                                      ["all dims of 'image.shape' "
-                                       "must be > 0."])]
+    return [
+        check_ops.assert_positive(
+            array_ops.shape(image),
+            ["all dims of 'image.shape' "
+             'must be > 0.'])
+    ]
   else:
     return []
 
 
+def _Assert3DImage(image):
+  """Assert that we are working with a properly shaped image.
+
+    Performs the check statically if possible (i.e. if the shape
+    is statically known). Otherwise adds a control dependency
+    to an assert op that checks the dynamic shape.
+
+    Args:
+      image: 3-D Tensor of shape [height, width, channels]
+
+    Raises:
+      ValueError: if `image.shape` is not a 3-vector.
+
+    Returns:
+      If the shape of `image` could be verified statically, `image` is
+      returned unchanged, otherwise there will be a control dependency
+      added that asserts the correct dynamic shape.
+    """
+  return control_flow_ops.with_dependencies(
+      _Check3DImage(image, require_static=False), image)
+
+def _AssertAtLeast3DImage(image):
+  """Assert that we are working with a properly shaped image.
+
+    Performs the check statically if possible (i.e. if the shape
+    is statically known). Otherwise adds a control dependency
+    to an assert op that checks the dynamic shape.
+
+    Args:
+      image: >= 3-D Tensor of size [*, height, width, depth]
+
+    Raises:
+      ValueError: if image.shape is not a [>= 3] vector.
+
+    Returns:
+      If the shape of `image` could be verified statically, `image` is
+      returned unchanged, otherwise there will be a control dependency
+      added that asserts the correct dynamic shape.
+    """
+  return control_flow_ops.with_dependencies(
+      _CheckAtLeast3DImage(image, require_static=False), image)
+
 def _CheckAtLeast3DImage(image, require_static=True):
   """Assert that we are working with properly shaped image.
 
@@ -172,12 +212,15 @@ def _CheckAtLeast3DImage(image, require_static=True):
   if require_static and not image_shape.is_fully_defined():
     raise ValueError('\'image\' must be fully defined.')
   if any(x == 0 for x in image_shape):
-    raise ValueError('all dims of \'image.shape\' must be > 0: %s' %
-                     image_shape)
+    raise ValueError(
+        'all dims of \'image.shape\' must be > 0: %s' % image_shape)
   if not image_shape.is_fully_defined():
-    return [check_ops.assert_positive(array_ops.shape(image),
-                                      ["all dims of 'image.shape' "
-                                       "must be > 0."])]
+    return [
+        check_ops.assert_positive(
+            array_ops.shape(image),
+            ["all dims of 'image.shape' "
+             'must be > 0.'])
+    ]
   else:
     return []
 
@@ -201,6 +244,7 @@ def fix_image_flip_shape(image, result):
   return result
 
 
+@tf_export('image.random_flip_up_down')
 def random_flip_up_down(image, seed=None):
   """Randomly flips an image vertically (upside down).
 
@@ -219,17 +263,20 @@ def random_flip_up_down(image, seed=None):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  image = ops.convert_to_tensor(image, name='image')
-  image = control_flow_ops.with_dependencies(
-      _Check3DImage(image, require_static=False), image)
-  uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
-  mirror_cond = math_ops.less(uniform_random, .5)
-  result = control_flow_ops.cond(mirror_cond,
-                                 lambda: array_ops.reverse(image, [0]),
-                                 lambda: image)
-  return fix_image_flip_shape(image, result)
-
-
+  with ops.name_scope(None, 'random_flip_up_down', [image]) as scope:
+    image = ops.convert_to_tensor(image, name='image')
+    image = _Assert3DImage(image)
+    uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
+    mirror_cond = math_ops.less(uniform_random, .5)
+    result = control_flow_ops.cond(
+        mirror_cond,
+        lambda: array_ops.reverse(image, [0]),
+        lambda: image,
+        name=scope)
+    return fix_image_flip_shape(image, result)
+
+
+@tf_export('image.random_flip_left_right')
 def random_flip_left_right(image, seed=None):
   """Randomly flip an image horizontally (left to right).
 
@@ -248,120 +295,204 @@ def random_flip_left_right(image, seed=None):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  image = ops.convert_to_tensor(image, name='image')
-  image = control_flow_ops.with_dependencies(
-      _Check3DImage(image, require_static=False), image)
-  uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
-  mirror_cond = math_ops.less(uniform_random, .5)
-  result = control_flow_ops.cond(mirror_cond,
-                                 lambda: array_ops.reverse(image, [1]),
-                                 lambda: image)
-  return fix_image_flip_shape(image, result)
-
-
+  with ops.name_scope(None, 'random_flip_left_right', [image]) as scope:
+    image = ops.convert_to_tensor(image, name='image')
+    image = _Assert3DImage(image)
+    uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
+    mirror_cond = math_ops.less(uniform_random, .5)
+    result = control_flow_ops.cond(
+        mirror_cond,
+        lambda: array_ops.reverse(image, [1]),
+        lambda: image,
+        name=scope)
+    return fix_image_flip_shape(image, result)
+
+
+@tf_export('image.flip_left_right')
 def flip_left_right(image):
   """Flip an image horizontally (left to right).
 
-  Outputs the contents of `image` flipped along the second dimension, which is
-  `width`.
+  Outputs the contents of `image` flipped along the width dimension.
 
   See also `reverse()`.
 
   Args:
-    image: A 3-D tensor of shape `[height, width, channels].`
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or
+           3-D Tensor of shape `[height, width, channels]`.
 
   Returns:
-    A 3-D tensor of the same type and shape as `image`.
+    A tensor of the same type and shape as `image`.
 
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  image = ops.convert_to_tensor(image, name='image')
-  image = control_flow_ops.with_dependencies(
-      _Check3DImage(image, require_static=False), image)
-  return fix_image_flip_shape(image, array_ops.reverse(image, [1]))
+  with ops.name_scope(None, 'flip_left_right', [image]) as scope:
+    image = ops.convert_to_tensor(image, name='image')
+    image = _AssertAtLeast3DImage(image)
+    shape = image.get_shape()
+    if shape.ndims == 3 or shape.ndims is None:
+      return fix_image_flip_shape(image, array_ops.reverse(image, [1]))
+    elif shape.ndims == 4:
+      return array_ops.reverse(image, [2])
+    else:
+      raise ValueError('\'image\' must have either 3 or 4 dimensions.')
 
 
+@tf_export('image.flip_up_down')
 def flip_up_down(image):
   """Flip an image vertically (upside down).
 
-  Outputs the contents of `image` flipped along the first dimension, which is
-  `height`.
+  Outputs the contents of `image` flipped along the height dimension.
 
   See also `reverse()`.
 
   Args:
-    image: A 3-D tensor of shape `[height, width, channels].`
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or
+           3-D Tensor of shape `[height, width, channels]`.
 
   Returns:
-    A 3-D tensor of the same type and shape as `image`.
+    A tensor of the same type and shape as `image`.
 
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  image = ops.convert_to_tensor(image, name='image')
-  image = control_flow_ops.with_dependencies(
-      _Check3DImage(image, require_static=False), image)
-  return fix_image_flip_shape(image, array_ops.reverse(image, [0]))
+  with ops.name_scope(None, 'flip_up_down', [image]) as scope:
+    image = ops.convert_to_tensor(image, name='image')
+    image = _AssertAtLeast3DImage(image)
+    shape = image.get_shape()
+    if shape.ndims == 3 or shape.ndims is None:
+      return fix_image_flip_shape(image, array_ops.reverse(image, [0]))
+    elif shape.ndims == 4:
+      return array_ops.reverse(image, [1])
+    else:
+      raise ValueError('\'image\' must have either 3 or 4 dimensions.')
 
 
+@tf_export('image.rot90')
 def rot90(image, k=1, name=None):
-  """Rotate an image counter-clockwise by 90 degrees.
+  """Rotate image(s) counter-clockwise by 90 degrees.
 
   Args:
-    image: A 3-D tensor of shape `[height, width, channels]`.
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or
+           3-D Tensor of shape `[height, width, channels]`.
     k: A scalar integer. The number of times the image is rotated by 90 degrees.
     name: A name for this operation (optional).
 
   Returns:
-    A rotated 3-D tensor of the same type and shape as `image`.
+    A rotated tensor of the same type and shape as `image`.
+
+  Raises:
+    ValueError: if the shape of `image` not supported.
   """
   with ops.name_scope(name, 'rot90', [image, k]) as scope:
     image = ops.convert_to_tensor(image, name='image')
-    image = control_flow_ops.with_dependencies(
-        _Check3DImage(image, require_static=False), image)
+    image = _AssertAtLeast3DImage(image)
     k = ops.convert_to_tensor(k, dtype=dtypes.int32, name='k')
     k.get_shape().assert_has_rank(0)
     k = math_ops.mod(k, 4)
 
-    def _rot90():
-      return array_ops.transpose(array_ops.reverse_v2(image, [1]),
-                                 [1, 0, 2])
-    def _rot180():
-      return array_ops.reverse_v2(image, [0, 1])
-    def _rot270():
-      return array_ops.reverse_v2(array_ops.transpose(image, [1, 0, 2]),
-                                  [1])
-    cases = [(math_ops.equal(k, 1), _rot90),
-             (math_ops.equal(k, 2), _rot180),
-             (math_ops.equal(k, 3), _rot270)]
+    shape = image.get_shape()
+    if shape.ndims == 3 or shape.ndims is None:
+      return _rot90_3D(image, k, scope)
+    elif shape.ndims == 4:
+      return _rot90_4D(image, k, scope)
+    else:
+      raise ValueError('\'image\' must have either 3 or 4 dimensions.')
+
 
-    ret = control_flow_ops.case(cases, default=lambda: image, exclusive=True,
-                                name=scope)
-    ret.set_shape([None, None, image.get_shape()[2]])
-    return ret
+def _rot90_3D(image, k, name_scope):
+  """Rotate image counter-clockwise by 90 degrees `k` times.
 
+  Args:
+    image: 3-D Tensor of shape `[height, width, channels]`.
+    k: A scalar integer. The number of times the image is rotated by 90 degrees.
+    name_scope: A valid TensorFlow name scope.
 
+  Returns:
+    A 3-D tensor of the same type and shape as `image`.
+
+  """
+  def _rot90():
+    return array_ops.transpose(array_ops.reverse_v2(image, [1]),
+                               [1, 0, 2])
+  def _rot180():
+    return array_ops.reverse_v2(image, [0, 1])
+  def _rot270():
+    return array_ops.reverse_v2(array_ops.transpose(image, [1, 0, 2]),
+                                [1])
+  cases = [(math_ops.equal(k, 1), _rot90),
+           (math_ops.equal(k, 2), _rot180),
+           (math_ops.equal(k, 3), _rot270)]
+
+  result = control_flow_ops.case(cases, default=lambda: image, exclusive=True,
+                                 name=name_scope)
+  result.set_shape([None, None, image.get_shape()[2]])
+  return result
+
+def _rot90_4D(images, k, name_scope):
+  """Rotate batch of images counter-clockwise by 90 degrees `k` times.
+
+  Args:
+    images: 4-D Tensor of shape `[height, width, channels]`.
+    k: A scalar integer. The number of times the images are rotated by 90
+      degrees.
+    name_scope: A valid TensorFlow name scope.
+
+  Returns:
+    A 4-D tensor of the same type and shape as `images`.
+
+  """
+  def _rot90():
+    return array_ops.transpose(array_ops.reverse_v2(images, [2]),
+                               [0, 2, 1, 3])
+  def _rot180():
+    return array_ops.reverse_v2(images, [1, 2])
+  def _rot270():
+    return array_ops.reverse_v2(array_ops.transpose(images, [0, 2, 1, 3]),
+                                [2])
+
+  cases = [(math_ops.equal(k, 1), _rot90),
+           (math_ops.equal(k, 2), _rot180),
+           (math_ops.equal(k, 3), _rot270)]
+
+  result = control_flow_ops.case(cases, default=lambda: images, exclusive=True,
+                                 name=name_scope)
+  shape = result.get_shape()
+  result.set_shape([shape[0], None, None, shape[3]])
+  return result
+
+@tf_export('image.transpose_image')
 def transpose_image(image):
-  """Transpose an image by swapping the first and second dimension.
+  """Transpose image(s) by swapping the height and width dimension.
 
   See also `transpose()`.
 
   Args:
-    image: 3-D tensor of shape `[height, width, channels]`
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or
+           3-D Tensor of shape `[height, width, channels]`.
 
   Returns:
-    A 3-D tensor of shape `[width, height, channels]`
+    If `image` was 4-D, a 4-D float Tensor of shape
+   `[batch, width, height, channels]`
+    If `image` was 3-D, a 3-D float Tensor of shape
+   `[width, height, channels]`
 
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  image = ops.convert_to_tensor(image, name='image')
-  image = control_flow_ops.with_dependencies(
-      _Check3DImage(image, require_static=False), image)
-  return array_ops.transpose(image, [1, 0, 2], name='transpose_image')
+  with ops.name_scope(None, 'transpose_image', [image]) as scope:
+    image = ops.convert_to_tensor(image, name='image')
+    image = _AssertAtLeast3DImage(image)
+    shape = image.get_shape()
+    if shape.ndims == 3 or shape.ndims is None:
+      return array_ops.transpose(image, [1, 0, 2], name='transpose_image')
+    elif shape.ndims == 4:
+      return array_ops.transpose(image, [0, 2, 1, 3], name='transpose_image')
+    else:
+      raise ValueError('\'image\' must have either 3 or 4 dimensions.')
 
 
+@tf_export('image.central_crop')
 def central_crop(image, central_fraction):
   """Crop the central region of the image.
 
@@ -386,34 +517,35 @@ def central_crop(image, central_fraction):
   Returns:
     3-D float Tensor
   """
-  image = ops.convert_to_tensor(image, name='image')
-  if central_fraction <= 0.0 or central_fraction > 1.0:
-    raise ValueError('central_fraction must be within (0, 1]')
-  if central_fraction == 1.0:
-    return image
+  with ops.name_scope(None, 'central_crop', [image]):
+    image = ops.convert_to_tensor(image, name='image')
+    if central_fraction <= 0.0 or central_fraction > 1.0:
+      raise ValueError('central_fraction must be within (0, 1]')
+    if central_fraction == 1.0:
+      return image
 
-  image = control_flow_ops.with_dependencies(
-      _Check3DImage(image, require_static=False), image)
+    image = _Assert3DImage(image)
 
-  img_shape = array_ops.shape(image)
-  depth = image.get_shape()[2]
-  img_h = math_ops.to_double(img_shape[0])
-  img_w = math_ops.to_double(img_shape[1])
-  bbox_h_start = math_ops.to_int32((img_h - img_h * central_fraction) / 2)
-  bbox_w_start = math_ops.to_int32((img_w - img_w * central_fraction) / 2)
+    img_shape = array_ops.shape(image)
+    depth = image.get_shape()[2]
+    img_h = math_ops.to_double(img_shape[0])
+    img_w = math_ops.to_double(img_shape[1])
+    bbox_h_start = math_ops.to_int32((img_h - img_h * central_fraction) / 2)
+    bbox_w_start = math_ops.to_int32((img_w - img_w * central_fraction) / 2)
 
-  bbox_h_size = img_shape[0] - bbox_h_start * 2
-  bbox_w_size = img_shape[1] - bbox_w_start * 2
+    bbox_h_size = img_shape[0] - bbox_h_start * 2
+    bbox_w_size = img_shape[1] - bbox_w_start * 2
 
-  bbox_begin = array_ops.stack([bbox_h_start, bbox_w_start, 0])
-  bbox_size = array_ops.stack([bbox_h_size, bbox_w_size, -1])
-  image = array_ops.slice(image, bbox_begin, bbox_size)
+    bbox_begin = array_ops.stack([bbox_h_start, bbox_w_start, 0])
+    bbox_size = array_ops.stack([bbox_h_size, bbox_w_size, -1])
+    image = array_ops.slice(image, bbox_begin, bbox_size)
 
-  # The first two dimensions are dynamic and unknown.
-  image.set_shape([None, None, depth])
-  return image
+    # The first two dimensions are dynamic and unknown.
+    image.set_shape([None, None, depth])
+    return image
 
 
+@tf_export('image.pad_to_bounding_box')
 def pad_to_bounding_box(image, offset_height, offset_width, target_height,
                         target_width):
   """Pad `image` with zeros to the specified `height` and `width`.
@@ -444,55 +576,59 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height,
       `target_*` arguments, or either `offset_height` or `offset_width` is
       negative.
   """
-  image = ops.convert_to_tensor(image, name='image')
+  with ops.name_scope(None, 'pad_to_bounding_box', [image]):
+    image = ops.convert_to_tensor(image, name='image')
 
-  is_batch = True
-  image_shape = image.get_shape()
-  if image_shape.ndims == 3:
-    is_batch = False
-    image = array_ops.expand_dims(image, 0)
-  elif image_shape.ndims is None:
-    is_batch = False
-    image = array_ops.expand_dims(image, 0)
-    image.set_shape([None] * 4)
-  elif image_shape.ndims != 4:
-    raise ValueError('\'image\' must have either 3 or 4 dimensions.')
-
-  assert_ops = _CheckAtLeast3DImage(image, require_static=False)
-
-  batch, height, width, depth = _ImageDimensions(image, rank=4)
-
-  after_padding_width = target_width - offset_width - width
-  after_padding_height = target_height - offset_height - height
-
-  assert_ops += _assert(offset_height >= 0, ValueError,
-                        'offset_height must be >= 0')
-  assert_ops += _assert(offset_width >= 0, ValueError,
-                        'offset_width must be >= 0')
-  assert_ops += _assert(after_padding_width >= 0, ValueError,
-                        'width must be <= target - offset')
-  assert_ops += _assert(after_padding_height >= 0, ValueError,
-                        'height must be <= target - offset')
-  image = control_flow_ops.with_dependencies(assert_ops, image)
-
-  # Do not pad on the depth dimensions.
-  paddings = array_ops.reshape(
-      array_ops.stack([
-          0, 0, offset_height, after_padding_height, offset_width,
-          after_padding_width, 0, 0
-      ]), [4, 2])
-  padded = array_ops.pad(image, paddings)
-
-  padded_shape = [None if _is_tensor(i) else i
-                  for i in [batch, target_height, target_width, depth]]
-  padded.set_shape(padded_shape)
-
-  if not is_batch:
-    padded = array_ops.squeeze(padded, squeeze_dims=[0])
-
-  return padded
+    is_batch = True
+    image_shape = image.get_shape()
+    if image_shape.ndims == 3:
+      is_batch = False
+      image = array_ops.expand_dims(image, 0)
+    elif image_shape.ndims is None:
+      is_batch = False
+      image = array_ops.expand_dims(image, 0)
+      image.set_shape([None] * 4)
+    elif image_shape.ndims != 4:
+      raise ValueError('\'image\' must have either 3 or 4 dimensions.')
+
+    assert_ops = _CheckAtLeast3DImage(image, require_static=False)
+    batch, height, width, depth = _ImageDimensions(image, rank=4)
+
+    after_padding_width = target_width - offset_width - width
+
+    after_padding_height = target_height - offset_height - height
+
+    assert_ops += _assert(offset_height >= 0, ValueError,
+                          'offset_height must be >= 0')
+    assert_ops += _assert(offset_width >= 0, ValueError,
+                          'offset_width must be >= 0')
+    assert_ops += _assert(after_padding_width >= 0, ValueError,
+                          'width must be <= target - offset')
+    assert_ops += _assert(after_padding_height >= 0, ValueError,
+                          'height must be <= target - offset')
+    image = control_flow_ops.with_dependencies(assert_ops, image)
+
+    # Do not pad on the depth dimensions.
+    paddings = array_ops.reshape(
+        array_ops.stack([
+            0, 0, offset_height, after_padding_height, offset_width,
+            after_padding_width, 0, 0
+        ]), [4, 2])
+    padded = array_ops.pad(image, paddings)
+
+    padded_shape = [
+        None if _is_tensor(i) else i
+        for i in [batch, target_height, target_width, depth]
+    ]
+    padded.set_shape(padded_shape)
+
+    if not is_batch:
+      padded = array_ops.squeeze(padded, squeeze_dims=[0])
+
+    return padded
 
 
+@tf_export('image.crop_to_bounding_box')
 def crop_to_bounding_box(image, offset_height, offset_width, target_height,
                          target_width):
   """Crops an image to a specified bounding box.
@@ -523,53 +659,56 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height,
       `target_*` arguments, or either `offset_height` or `offset_width` is
       negative, or either `target_height` or `target_width` is not positive.
   """
-  image = ops.convert_to_tensor(image, name='image')
+  with ops.name_scope(None, 'crop_to_bounding_box', [image]):
+    image = ops.convert_to_tensor(image, name='image')
 
-  is_batch = True
-  image_shape = image.get_shape()
-  if image_shape.ndims == 3:
-    is_batch = False
-    image = array_ops.expand_dims(image, 0)
-  elif image_shape.ndims is None:
-    is_batch = False
-    image = array_ops.expand_dims(image, 0)
-    image.set_shape([None] * 4)
-  elif image_shape.ndims != 4:
-    raise ValueError('\'image\' must have either 3 or 4 dimensions.')
-
-  assert_ops = _CheckAtLeast3DImage(image, require_static=False)
-
-  batch, height, width, depth = _ImageDimensions(image, rank=4)
-
-  assert_ops += _assert(offset_width >= 0, ValueError,
-                        'offset_width must be >= 0.')
-  assert_ops += _assert(offset_height >= 0, ValueError,
-                        'offset_height must be >= 0.')
-  assert_ops += _assert(target_width > 0, ValueError,
-                        'target_width must be > 0.')
-  assert_ops += _assert(target_height > 0, ValueError,
-                        'target_height must be > 0.')
-  assert_ops += _assert(width >= (target_width + offset_width), ValueError,
-                        'width must be >= target + offset.')
-  assert_ops += _assert(height >= (target_height + offset_height), ValueError,
-                        'height must be >= target + offset.')
-  image = control_flow_ops.with_dependencies(assert_ops, image)
-
-  cropped = array_ops.slice(
-      image,
-      array_ops.stack([0, offset_height, offset_width, 0]),
-      array_ops.stack([-1, target_height, target_width, -1]))
-
-  cropped_shape = [None if _is_tensor(i) else i
-                   for i in [batch, target_height, target_width, depth]]
-  cropped.set_shape(cropped_shape)
-
-  if not is_batch:
-    cropped = array_ops.squeeze(cropped, squeeze_dims=[0])
-
-  return cropped
+    is_batch = True
+    image_shape = image.get_shape()
+    if image_shape.ndims == 3:
+      is_batch = False
+      image = array_ops.expand_dims(image, 0)
+    elif image_shape.ndims is None:
+      is_batch = False
+      image = array_ops.expand_dims(image, 0)
+      image.set_shape([None] * 4)
+    elif image_shape.ndims != 4:
+      raise ValueError('\'image\' must have either 3 or 4 dimensions.')
+
+    assert_ops = _CheckAtLeast3DImage(image, require_static=False)
+
+    batch, height, width, depth = _ImageDimensions(image, rank=4)
+
+    assert_ops += _assert(offset_width >= 0, ValueError,
+                          'offset_width must be >= 0.')
+    assert_ops += _assert(offset_height >= 0, ValueError,
+                          'offset_height must be >= 0.')
+    assert_ops += _assert(target_width > 0, ValueError,
+                          'target_width must be > 0.')
+    assert_ops += _assert(target_height > 0, ValueError,
+                          'target_height must be > 0.')
+    assert_ops += _assert(width >= (target_width + offset_width), ValueError,
+                          'width must be >= target + offset.')
+    assert_ops += _assert(height >= (target_height + offset_height), ValueError,
+                          'height must be >= target + offset.')
+    image = control_flow_ops.with_dependencies(assert_ops, image)
+
+    cropped = array_ops.slice(
+        image, array_ops.stack([0, offset_height, offset_width, 0]),
+        array_ops.stack([-1, target_height, target_width, -1]))
+
+    cropped_shape = [
+        None if _is_tensor(i) else i
+        for i in [batch, target_height, target_width, depth]
+    ]
+    cropped.set_shape(cropped_shape)
+
+    if not is_batch:
+      cropped = array_ops.squeeze(cropped, squeeze_dims=[0])
+
+    return cropped
 
 
+@tf_export('image.resize_image_with_crop_or_pad')
 def resize_image_with_crop_or_pad(image, target_height, target_width):
   """Crops and/or pads an image to a target width and height.
 
@@ -598,90 +737,95 @@ def resize_image_with_crop_or_pad(image, target_height, target_width):
     If `images` was 3-D, a 3-D float Tensor of shape
     `[new_height, new_width, channels]`.
   """
-  image = ops.convert_to_tensor(image, name='image')
-  image_shape = image.get_shape()
-  is_batch = True
-  if image_shape.ndims == 3:
-    is_batch = False
-    image = array_ops.expand_dims(image, 0)
-  elif image_shape.ndims is None:
-    is_batch = False
-    image = array_ops.expand_dims(image, 0)
-    image.set_shape([None] * 4)
-  elif image_shape.ndims != 4:
-    raise ValueError('\'image\' must have either 3 or 4 dimensions.')
-
-  assert_ops = _CheckAtLeast3DImage(image, require_static=False)
-  assert_ops += _assert(target_width > 0, ValueError,
-                        'target_width must be > 0.')
-  assert_ops += _assert(target_height > 0, ValueError,
-                        'target_height must be > 0.')
-
-  image = control_flow_ops.with_dependencies(assert_ops, image)
-  # `crop_to_bounding_box` and `pad_to_bounding_box` have their own checks.
-  # Make sure our checks come first, so that error messages are clearer.
-  if _is_tensor(target_height):
-    target_height = control_flow_ops.with_dependencies(
-        assert_ops, target_height)
-  if _is_tensor(target_width):
-    target_width = control_flow_ops.with_dependencies(assert_ops, target_width)
-
-  def max_(x, y):
-    if _is_tensor(x) or _is_tensor(y):
-      return math_ops.maximum(x, y)
-    else:
-      return max(x, y)
+  with ops.name_scope(None, 'resize_image_with_crop_or_pad', [image]):
+    image = ops.convert_to_tensor(image, name='image')
+    image_shape = image.get_shape()
+    is_batch = True
+    if image_shape.ndims == 3:
+      is_batch = False
+      image = array_ops.expand_dims(image, 0)
+    elif image_shape.ndims is None:
+      is_batch = False
+      image = array_ops.expand_dims(image, 0)
+      image.set_shape([None] * 4)
+    elif image_shape.ndims != 4:
+      raise ValueError('\'image\' must have either 3 or 4 dimensions.')
+
+    assert_ops = _CheckAtLeast3DImage(image, require_static=False)
+    assert_ops += _assert(target_width > 0, ValueError,
+                          'target_width must be > 0.')
+    assert_ops += _assert(target_height > 0, ValueError,
+                          'target_height must be > 0.')
+
+    image = control_flow_ops.with_dependencies(assert_ops, image)
+    # `crop_to_bounding_box` and `pad_to_bounding_box` have their own checks.
+    # Make sure our checks come first, so that error messages are clearer.
+    if _is_tensor(target_height):
+      target_height = control_flow_ops.with_dependencies(
+          assert_ops, target_height)
+    if _is_tensor(target_width):
+      target_width = control_flow_ops.with_dependencies(assert_ops,
+                                                        target_width)
+
+    def max_(x, y):
+      if _is_tensor(x) or _is_tensor(y):
+        return math_ops.maximum(x, y)
+      else:
+        return max(x, y)
 
-  def min_(x, y):
-    if _is_tensor(x) or _is_tensor(y):
-      return math_ops.minimum(x, y)
-    else:
-      return min(x, y)
+    def min_(x, y):
+      if _is_tensor(x) or _is_tensor(y):
+        return math_ops.minimum(x, y)
+      else:
+        return min(x, y)
 
-  def equal_(x, y):
-    if _is_tensor(x) or _is_tensor(y):
-      return math_ops.equal(x, y)
-    else:
-      return x == y
+    def equal_(x, y):
+      if _is_tensor(x) or _is_tensor(y):
+        return math_ops.equal(x, y)
+      else:
+        return x == y
 
-  _, height, width, _ = _ImageDimensions(image, rank=4)
-  width_diff = target_width - width
-  offset_crop_width = max_(-width_diff // 2, 0)
-  offset_pad_width = max_(width_diff // 2, 0)
+    _, height, width, _ = _ImageDimensions(image, rank=4)
+    width_diff = target_width - width
+    offset_crop_width = max_(-width_diff // 2, 0)
+    offset_pad_width = max_(width_diff // 2, 0)
 
-  height_diff = target_height - height
-  offset_crop_height = max_(-height_diff // 2, 0)
-  offset_pad_height = max_(height_diff // 2, 0)
+    height_diff = target_height - height
+    offset_crop_height = max_(-height_diff // 2, 0)
+    offset_pad_height = max_(height_diff // 2, 0)
 
-  # Maybe crop if needed.
-  cropped = crop_to_bounding_box(image, offset_crop_height, offset_crop_width,
-                                 min_(target_height, height),
-                                 min_(target_width, width))
+    # Maybe crop if needed.
+    cropped = crop_to_bounding_box(image, offset_crop_height, offset_crop_width,
+                                   min_(target_height, height),
+                                   min_(target_width, width))
 
-  # Maybe pad if needed.
-  resized = pad_to_bounding_box(cropped, offset_pad_height, offset_pad_width,
-                                target_height, target_width)
+    # Maybe pad if needed.
+    resized = pad_to_bounding_box(cropped, offset_pad_height, offset_pad_width,
+                                  target_height, target_width)
 
-  # In theory all the checks below are redundant.
-  if resized.get_shape().ndims is None:
-    raise ValueError('resized contains no shape.')
+    # In theory all the checks below are redundant.
+    if resized.get_shape().ndims is None:
+      raise ValueError('resized contains no shape.')
 
-  _, resized_height, resized_width, _ = _ImageDimensions(resized, rank=4)
+    _, resized_height, resized_width, _ = _ImageDimensions(resized, rank=4)
 
-  assert_ops = []
-  assert_ops += _assert(equal_(resized_height, target_height), ValueError,
-                        'resized height is not correct.')
-  assert_ops += _assert(equal_(resized_width, target_width), ValueError,
-                        'resized width is not correct.')
+    assert_ops = []
+    assert_ops += _assert(
+        equal_(resized_height, target_height), ValueError,
+        'resized height is not correct.')
+    assert_ops += _assert(
+        equal_(resized_width, target_width), ValueError,
+        'resized width is not correct.')
 
-  resized = control_flow_ops.with_dependencies(assert_ops, resized)
+    resized = control_flow_ops.with_dependencies(assert_ops, resized)
 
-  if not is_batch:
-    resized = array_ops.squeeze(resized, squeeze_dims=[0])
+    if not is_batch:
+      resized = array_ops.squeeze(resized, squeeze_dims=[0])
 
-  return resized
+    return resized
 
 
+@tf_export('image.ResizeMethod')
 class ResizeMethod(object):
   BILINEAR = 0
   NEAREST_NEIGHBOR = 1
@@ -689,6 +833,7 @@ class ResizeMethod(object):
   AREA = 3
 
 
+@tf_export('image.resize_images')
 def resize_images(images,
                   size,
                   method=ResizeMethod.BILINEAR,
@@ -721,8 +866,9 @@ def resize_images(images,
     size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
           new size for the images.
     method: ResizeMethod.  Defaults to `ResizeMethod.BILINEAR`.
-    align_corners: bool. If true, exactly align all 4 corners of the input and
-                   output. Defaults to `false`.
+    align_corners: bool.  If True, the centers of the 4 corner pixels of the
+        input and output tensors are aligned, preserving the values at the
+        corner pixels. Defaults to `False`.
 
   Raises:
     ValueError: if the shape of `images` is incompatible with the
@@ -736,67 +882,65 @@ def resize_images(images,
     If `images` was 3-D, a 3-D float Tensor of shape
     `[new_height, new_width, channels]`.
   """
-  images = ops.convert_to_tensor(images, name='images')
-  if images.get_shape().ndims is None:
-    raise ValueError('\'images\' contains no shape.')
-  # TODO(shlens): Migrate this functionality to the underlying Op's.
-  is_batch = True
-  if images.get_shape().ndims == 3:
-    is_batch = False
-    images = array_ops.expand_dims(images, 0)
-  elif images.get_shape().ndims != 4:
-    raise ValueError('\'images\' must have either 3 or 4 dimensions.')
-
-  _, height, width, _ = images.get_shape().as_list()
+  with ops.name_scope(None, 'resize_images', [images, size]):
+    images = ops.convert_to_tensor(images, name='images')
+    if images.get_shape().ndims is None:
+      raise ValueError('\'images\' contains no shape.')
+    # TODO(shlens): Migrate this functionality to the underlying Op's.
+    is_batch = True
+    if images.get_shape().ndims == 3:
+      is_batch = False
+      images = array_ops.expand_dims(images, 0)
+    elif images.get_shape().ndims != 4:
+      raise ValueError('\'images\' must have either 3 or 4 dimensions.')
+
+    _, height, width, _ = images.get_shape().as_list()
+
+    try:
+      size = ops.convert_to_tensor(size, dtypes.int32, name='size')
+    except (TypeError, ValueError):
+      raise ValueError('\'size\' must be a 1-D int32 Tensor')
+    if not size.get_shape().is_compatible_with([2]):
+      raise ValueError('\'size\' must be a 1-D Tensor of 2 elements: '
+                       'new_height, new_width')
+    size_const_as_shape = tensor_util.constant_value_as_shape(size)
+    new_height_const = size_const_as_shape[0].value
+    new_width_const = size_const_as_shape[1].value
+
+    # If we can determine that the height and width will be unmodified by this
+    # transformation, we avoid performing the resize.
+    if all(x is not None
+           for x in [new_width_const, width, new_height_const, height]) and (
+               width == new_width_const and height == new_height_const):
+      if not is_batch:
+        images = array_ops.squeeze(images, squeeze_dims=[0])
+      return images
+
+    if method == ResizeMethod.BILINEAR:
+      images = gen_image_ops.resize_bilinear(
+          images, size, align_corners=align_corners)
+    elif method == ResizeMethod.NEAREST_NEIGHBOR:
+      images = gen_image_ops.resize_nearest_neighbor(
+          images, size, align_corners=align_corners)
+    elif method == ResizeMethod.BICUBIC:
+      images = gen_image_ops.resize_bicubic(
+          images, size, align_corners=align_corners)
+    elif method == ResizeMethod.AREA:
+      images = gen_image_ops.resize_area(
+          images, size, align_corners=align_corners)
+    else:
+      raise ValueError('Resize method is not implemented.')
+
+    # NOTE(mrry): The shape functions for the resize ops cannot unpack
+    # the packed values in `new_size`, so set the shape here.
+    images.set_shape([None, new_height_const, new_width_const, None])
 
-  try:
-    size = ops.convert_to_tensor(size, dtypes.int32, name='size')
-  except (TypeError, ValueError):
-    raise ValueError('\'size\' must be a 1-D int32 Tensor')
-  if not size.get_shape().is_compatible_with([2]):
-    raise ValueError('\'size\' must be a 1-D Tensor of 2 elements: '
-                     'new_height, new_width')
-  size_const_as_shape = tensor_util.constant_value_as_shape(size)
-  new_height_const = size_const_as_shape[0].value
-  new_width_const = size_const_as_shape[1].value
-
-  # If we can determine that the height and width will be unmodified by this
-  # transformation, we avoid performing the resize.
-  if all(x is not None
-         for x in [new_width_const, width, new_height_const, height]) and (
-             width == new_width_const and height == new_height_const):
     if not is_batch:
       images = array_ops.squeeze(images, squeeze_dims=[0])
     return images
 
-  if method == ResizeMethod.BILINEAR:
-    images = gen_image_ops.resize_bilinear(images,
-                                           size,
-                                           align_corners=align_corners)
-  elif method == ResizeMethod.NEAREST_NEIGHBOR:
-    images = gen_image_ops.resize_nearest_neighbor(images,
-                                                   size,
-                                                   align_corners=align_corners)
-  elif method == ResizeMethod.BICUBIC:
-    images = gen_image_ops.resize_bicubic(images,
-                                          size,
-                                          align_corners=align_corners)
-  elif method == ResizeMethod.AREA:
-    images = gen_image_ops.resize_area(images,
-                                       size,
-                                       align_corners=align_corners)
-  else:
-    raise ValueError('Resize method is not implemented.')
-
-  # NOTE(mrry): The shape functions for the resize ops cannot unpack
-  # the packed values in `new_size`, so set the shape here.
-  images.set_shape([None, new_height_const, new_width_const, None])
-
-  if not is_batch:
-    images = array_ops.squeeze(images, squeeze_dims=[0])
-  return images
-
 
+@tf_export('image.per_image_standardization')
 def per_image_standardization(image):
   """Linearly scales `image` to have zero mean and unit norm.
 
@@ -816,29 +960,31 @@ def per_image_standardization(image):
   Raises:
     ValueError: if the shape of 'image' is incompatible with this function.
   """
-  image = ops.convert_to_tensor(image, name='image')
-  image = control_flow_ops.with_dependencies(
-      _Check3DImage(image, require_static=False), image)
-  num_pixels = math_ops.reduce_prod(array_ops.shape(image))
+  with ops.name_scope(None, 'per_image_standardization', [image]) as scope:
+    image = ops.convert_to_tensor(image, name='image')
+    image = _Assert3DImage(image)
+    num_pixels = math_ops.reduce_prod(array_ops.shape(image))
 
-  image = math_ops.cast(image, dtype=dtypes.float32)
-  image_mean = math_ops.reduce_mean(image)
+    image = math_ops.cast(image, dtype=dtypes.float32)
+    image_mean = math_ops.reduce_mean(image)
 
-  variance = (math_ops.reduce_mean(math_ops.square(image)) -
-              math_ops.square(image_mean))
-  variance = gen_nn_ops.relu(variance)
-  stddev = math_ops.sqrt(variance)
+    variance = (
+        math_ops.reduce_mean(math_ops.square(image)) -
+        math_ops.square(image_mean))
+    variance = gen_nn_ops.relu(variance)
+    stddev = math_ops.sqrt(variance)
 
-  # Apply a minimum normalization that protects us against uniform images.
-  min_stddev = math_ops.rsqrt(math_ops.cast(num_pixels, dtypes.float32))
-  pixel_value_scale = math_ops.maximum(stddev, min_stddev)
-  pixel_value_offset = image_mean
+    # Apply a minimum normalization that protects us against uniform images.
+    min_stddev = math_ops.rsqrt(math_ops.cast(num_pixels, dtypes.float32))
+    pixel_value_scale = math_ops.maximum(stddev, min_stddev)
+    pixel_value_offset = image_mean
 
-  image = math_ops.subtract(image, pixel_value_offset)
-  image = math_ops.div(image, pixel_value_scale)
-  return image
+    image = math_ops.subtract(image, pixel_value_offset)
+    image = math_ops.div(image, pixel_value_scale, name=scope)
+    return image
 
 
+@tf_export('image.random_brightness')
 def random_brightness(image, max_delta, seed=None):
   """Adjust the brightness of images by a random factor.
 
@@ -865,6 +1011,7 @@ def random_brightness(image, max_delta, seed=None):
   return adjust_brightness(image, delta)
 
 
+@tf_export('image.random_contrast')
 def random_contrast(image, lower, upper, seed=None):
   """Adjust the contrast of an image by a random factor.
 
@@ -896,6 +1043,7 @@ def random_contrast(image, lower, upper, seed=None):
   return adjust_contrast(image, contrast_factor)
 
 
+@tf_export('image.adjust_brightness')
 def adjust_brightness(image, delta):
   """Adjust the brightness of RGB or Grayscale images.
 
@@ -923,13 +1071,13 @@ def adjust_brightness(image, delta):
     orig_dtype = image.dtype
     flt_image = convert_image_dtype(image, dtypes.float32)
 
-    adjusted = math_ops.add(flt_image,
-                            math_ops.cast(delta, dtypes.float32),
-                            name=name)
+    adjusted = math_ops.add(
+        flt_image, math_ops.cast(delta, dtypes.float32), name=name)
 
     return convert_image_dtype(adjusted, orig_dtype, saturate=True)
 
 
+@tf_export('image.adjust_contrast')
 def adjust_contrast(images, contrast_factor):
   """Adjust contrast of RGB or grayscale images.
 
@@ -963,25 +1111,25 @@ def adjust_contrast(images, contrast_factor):
     flt_images = convert_image_dtype(images, dtypes.float32)
 
     # pylint: disable=protected-access
-    adjusted = gen_image_ops._adjust_contrastv2(flt_images,
-                                                contrast_factor=contrast_factor,
-                                                name=name)
+    adjusted = gen_image_ops._adjust_contrastv2(
+        flt_images, contrast_factor=contrast_factor, name=name)
     # pylint: enable=protected-access
 
     return convert_image_dtype(adjusted, orig_dtype, saturate=True)
 
 
+@tf_export('image.adjust_gamma')
 def adjust_gamma(image, gamma=1, gain=1):
   """Performs Gamma Correction on the input image.
 
-    Also known as Power Law Transform. This function transforms the
-    input image pixelwise according to the equation Out = In**gamma
-    after scaling each pixel to the range 0 to 1.
+  Also known as Power Law Transform. This function transforms the
+  input image pixelwise according to the equation `Out = In**gamma`
+  after scaling each pixel to the range 0 to 1.
 
   Args:
     image : A Tensor.
-    gamma : A scalar. Non negative real number.
-    gain  : A scalar. The constant multiplier.
+    gamma : A scalar or tensor. Non negative real number.
+    gain  : A scalar or tensor. The constant multiplier.
 
   Returns:
     A Tensor. Gamma corrected output image.
@@ -1000,22 +1148,26 @@ def adjust_gamma(image, gamma=1, gain=1):
   """
 
   with ops.op_scope([image, gamma, gain], None, 'adjust_gamma'):
-    # Convert pixel value to DT_FLOAT for computing adjusted image
+    # Convert pixel value to DT_FLOAT for computing adjusted image.
     img = ops.convert_to_tensor(image, name='img', dtype=dtypes.float32)
-    # Keep image dtype for computing the scale of corresponding dtype
+    # Keep image dtype for computing the scale of corresponding dtype.
     image = ops.convert_to_tensor(image, name='image')
 
-    if gamma < 0:
-      raise ValueError('Gamma should be a non-negative real number')
-    # scale = max(dtype) - min(dtype)
-    scale = constant_op.constant(image.dtype.limits[1] - image.dtype.limits[0],
-                                 dtype=dtypes.float32)
-    # According to the definition of gamma correction
-    adjusted_img = (img / scale) ** gamma * scale * gain
+    assert_op = _assert(gamma >= 0, ValueError,
+                        'Gamma should be a non-negative real number.')
+    if assert_op:
+      gamma = control_flow_ops.with_dependencies(assert_op, gamma)
+
+    # scale = max(dtype) - min(dtype).
+    scale = constant_op.constant(
+        image.dtype.limits[1] - image.dtype.limits[0], dtype=dtypes.float32)
+    # According to the definition of gamma correction.
+    adjusted_img = (img / scale)**gamma * scale * gain
 
     return adjusted_img
 
 
+@tf_export('image.convert_image_dtype')
 def convert_image_dtype(image, dtype, saturate=False, name=None):
   """Convert `image` to `dtype`, scaling its values if needed.
 
@@ -1094,6 +1246,7 @@ def convert_image_dtype(image, dtype, saturate=False, name=None):
           return math_ops.cast(scaled, dtype, name=name)
 
 
+@tf_export('image.rgb_to_grayscale')
 def rgb_to_grayscale(images, name=None):
   """Converts one or more images from RGB to Grayscale.
 
@@ -1118,13 +1271,12 @@ def rgb_to_grayscale(images, name=None):
     # Reference for converting between RGB and grayscale.
     # https://en.wikipedia.org/wiki/Luma_%28video%29
     rgb_weights = [0.2989, 0.5870, 0.1140]
-    rank_1 = array_ops.expand_dims(array_ops.rank(images) - 1, 0)
-    gray_float = math_ops.reduce_sum(
-        flt_image * rgb_weights, rank_1, keepdims=True)
-    gray_float.set_shape(images.get_shape()[:-1].concatenate([1]))
+    gray_float = math_ops.tensordot(flt_image, rgb_weights, [-1, -1])
+    gray_float = array_ops.expand_dims(gray_float, -1)
     return convert_image_dtype(gray_float, orig_dtype, name=name)
 
 
+@tf_export('image.grayscale_to_rgb')
 def grayscale_to_rgb(images, name=None):
   """Converts one or more images from Grayscale to RGB.
 
@@ -1141,9 +1293,8 @@ def grayscale_to_rgb(images, name=None):
   with ops.name_scope(name, 'grayscale_to_rgb', [images]) as name:
     images = ops.convert_to_tensor(images, name='images')
     rank_1 = array_ops.expand_dims(array_ops.rank(images) - 1, 0)
-    shape_list = (
-        [array_ops.ones(rank_1,
-                        dtype=dtypes.int32)] + [array_ops.expand_dims(3, 0)])
+    shape_list = ([array_ops.ones(rank_1, dtype=dtypes.int32)] +
+                  [array_ops.expand_dims(3, 0)])
     multiples = array_ops.concat(shape_list, 0)
     rgb = array_ops.tile(images, multiples, name=name)
     rgb.set_shape(images.get_shape()[:-1].concatenate([3]))
@@ -1151,6 +1302,7 @@ def grayscale_to_rgb(images, name=None):
 
 
 # pylint: disable=invalid-name
+@tf_export('image.random_hue')
 def random_hue(image, max_delta, seed=None):
   """Adjust the hue of an RGB image by a random factor.
 
@@ -1168,7 +1320,7 @@ def random_hue(image, max_delta, seed=None):
       set_random_seed for its interaction with the graph-level random seed.
 
   Returns:
-    3-D float tensor of shape `[height, width, channels]`.
+    Adjusted image(s), same shape and DType as `image`.
 
   Raises:
     ValueError: if `max_delta` is invalid.
@@ -1183,6 +1335,7 @@ def random_hue(image, max_delta, seed=None):
   return adjust_hue(image, delta)
 
 
+@tf_export('image.adjust_hue')
 def adjust_hue(image, delta, name=None):
   """Adjust hue of an RGB image.
 
@@ -1216,6 +1369,7 @@ def adjust_hue(image, delta, name=None):
     return convert_image_dtype(rgb_altered, orig_dtype)
 
 
+@tf_export('image.random_saturation')
 def random_saturation(image, lower, upper, seed=None):
   """Adjust the saturation of an RGB image by a random factor.
 
@@ -1248,6 +1402,7 @@ def random_saturation(image, lower, upper, seed=None):
   return adjust_saturation(image, saturation_factor)
 
 
+@tf_export('image.adjust_saturation')
 def adjust_saturation(image, saturation_factor, name=None):
   """Adjust saturation of an RGB image.
 
@@ -1275,31 +1430,32 @@ def adjust_saturation(image, saturation_factor, name=None):
     orig_dtype = image.dtype
     flt_image = convert_image_dtype(image, dtypes.float32)
 
-    # TODO(zhengxq): we will switch to the fused version after we add a GPU
-    # kernel for that.
-    fused = os.environ.get('TF_ADJUST_SATURATION_FUSED', '')
-    fused = fused.lower() in ('true', 't', '1')
-
-    if fused:
-      return convert_image_dtype(
-          gen_image_ops.adjust_saturation(flt_image, saturation_factor),
-          orig_dtype)
+    return convert_image_dtype(
+        gen_image_ops.adjust_saturation(flt_image, saturation_factor),
+        orig_dtype)
 
-    hsv = gen_image_ops.rgb_to_hsv(flt_image)
 
-    hue = array_ops.slice(hsv, [0, 0, 0], [-1, -1, 1])
-    saturation = array_ops.slice(hsv, [0, 0, 1], [-1, -1, 1])
-    value = array_ops.slice(hsv, [0, 0, 2], [-1, -1, 1])
+@tf_export('image.is_jpeg')
+def is_jpeg(contents, name=None):
+  r"""Convenience function to check if the 'contents' encodes a JPEG image.
 
-    saturation *= saturation_factor
-    saturation = clip_ops.clip_by_value(saturation, 0.0, 1.0)
-
-    hsv_altered = array_ops.concat([hue, saturation, value], 2)
-    rgb_altered = gen_image_ops.hsv_to_rgb(hsv_altered)
+  Args:
+    contents: 0-D `string`. The encoded image bytes.
+    name: A name for the operation (optional)
 
-    return convert_image_dtype(rgb_altered, orig_dtype)
+  Returns:
+     A scalar boolean tensor indicating if 'contents' may be a JPEG image.
+     is_jpeg is susceptible to false positives.
+  """
+  # Normal JPEGs start with \xff\xd8\xff\xe0
+  # JPEG with EXIF stats with \xff\xd8\xff\xe1
+  # Use \xff\xd8\xff to cover both.
+  with ops.name_scope(name, 'is_jpeg'):
+    substr = string_ops.substr(contents, 0, 3)
+    return math_ops.equal(substr, b'\xff\xd8\xff', name=name)
 
 
+@tf_export('image.decode_image')
 def decode_image(contents, channels=None, name=None):
   """Convenience function for `decode_bmp`, `decode_gif`, `decode_jpeg`,
   and `decode_png`.
@@ -1354,8 +1510,7 @@ def decode_image(contents, channels=None, name=None):
       gif_channels = 0 if channels is None else channels
       good_channels = math_ops.logical_and(
           math_ops.not_equal(gif_channels, 1, name='check_gif_channels'),
-          math_ops.not_equal(gif_channels, 4, name='check_gif_channels')
-      )
+          math_ops.not_equal(gif_channels, 4, name='check_gif_channels'))
       channels_msg = 'Channels must be in (None, 0, 3) when decoding GIF images'
       assert_channels = control_flow_ops.Assert(good_channels, [channels_msg])
       with ops.control_dependencies([assert_channels]):
@@ -1378,8 +1533,8 @@ def decode_image(contents, channels=None, name=None):
     def _jpeg():
       """Decodes a jpeg image."""
       jpeg_channels = 0 if channels is None else channels
-      good_channels = math_ops.not_equal(jpeg_channels, 4,
-                                         name='check_jpeg_channels')
+      good_channels = math_ops.not_equal(
+          jpeg_channels, 4, name='check_jpeg_channels')
       channels_msg = ('Channels must be in (None, 0, 1, 3) when decoding JPEG '
                       'images')
       assert_channels = control_flow_ops.Assert(good_channels, [channels_msg])
@@ -1388,10 +1543,11 @@ def decode_image(contents, channels=None, name=None):
 
     # Decode normal JPEG images (start with \xff\xd8\xff\xe0)
     # as well as JPEG images with EXIF data (start with \xff\xd8\xff\xe1).
-    is_jpeg = math_ops.equal(substr, b'\xff\xd8\xff', name='is_jpeg')
-    return control_flow_ops.cond(is_jpeg, _jpeg, check_png, name='cond_jpeg')
+    return control_flow_ops.cond(
+        is_jpeg(contents), _jpeg, check_png, name='cond_jpeg')
 
 
+@tf_export('image.total_variation')
 def total_variation(images, name=None):
   """Calculate and return the total variation for one or more images.
 
@@ -1456,15 +1612,21 @@ def total_variation(images, name=None):
 
     # Calculate the total variation by taking the absolute value of the
     # pixel-differences and summing over the appropriate axis.
-    tot_var = (math_ops.reduce_sum(math_ops.abs(pixel_dif1), axis=sum_axis) +
-               math_ops.reduce_sum(math_ops.abs(pixel_dif2), axis=sum_axis))
+    tot_var = (
+        math_ops.reduce_sum(math_ops.abs(pixel_dif1), axis=sum_axis) +
+        math_ops.reduce_sum(math_ops.abs(pixel_dif2), axis=sum_axis))
 
   return tot_var
 
 
-def sample_distorted_bounding_box(image_size, bounding_boxes, seed=None,
-                                  seed2=None, min_object_covered=None,
-                                  aspect_ratio_range=None, area_range=None,
+@tf_export('image.sample_distorted_bounding_box')
+def sample_distorted_bounding_box(image_size,
+                                  bounding_boxes,
+                                  seed=None,
+                                  seed2=None,
+                                  min_object_covered=0.1,
+                                  aspect_ratio_range=None,
+                                  area_range=None,
                                   max_attempts=None,
                                   use_image_if_no_bounding_boxes=None,
                                   name=None):
@@ -1480,10 +1642,12 @@ def sample_distorted_bounding_box(image_size, bounding_boxes, seed=None,
   The output of this Op is a single bounding box that may be used to crop the
   original image. The output is returned as 3 tensors: `begin`, `size` and
   `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-  image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+  image. The latter may be supplied to `tf.image.draw_bounding_boxes` to
+  visualize
   what the bounding box looks like.
 
-  Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+  Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`.
+  The
   bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
   height of the underlying image.
 
@@ -1499,7 +1663,7 @@ def sample_distorted_bounding_box(image_size, bounding_boxes, seed=None,
       # Draw the bounding box in an image summary.
       image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
                                                     bbox_for_draw)
-      tf.image_summary('images_with_box', image_with_box)
+      tf.summary.image('images_with_box', image_with_box)
 
       # Employ the bounding box to distort the image.
       distorted_image = tf.slice(image, begin, size)
@@ -1511,23 +1675,27 @@ def sample_distorted_bounding_box(image_size, bounding_boxes, seed=None,
   false and no bounding boxes are supplied, an error is raised.
 
   Args:
-    image_size: A `Tensor`. Must be one of the following types: `uint8`, `int8`, `int16`, `int32`, `int64`.
+    image_size: A `Tensor`. Must be one of the following types: `uint8`, `int8`,
+      `int16`, `int32`, `int64`.
       1-D, containing `[height, width, channels]`.
     bounding_boxes: A `Tensor` of type `float32`.
       3-D with shape `[batch, N, 4]` describing the N bounding boxes
       associated with the image.
     seed: An optional `int`. Defaults to `0`.
       If either `seed` or `seed2` are set to non-zero, the random number
-      generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+      generator is seeded by the given `seed`.  Otherwise, it is seeded by a
+        random
       seed.
     seed2: An optional `int`. Defaults to `0`.
       A second seed to avoid seed collision.
     min_object_covered: A Tensor of type `float32`. Defaults to `0.1`.
       The cropped area of the image must contain at least this
-      fraction of any bounding box supplied. The value of this parameter should be
+      fraction of any bounding box supplied. The value of this parameter should
+        be
       non-negative. In the case of 0, the cropped area does not need to overlap
       any of the bounding boxes supplied.
-    aspect_ratio_range: An optional list of `floats`. Defaults to `[0.75, 1.33]`.
+    aspect_ratio_range: An optional list of `floats`. Defaults to `[0.75,
+      1.33]`.
       The cropped area of the image must have an aspect ratio =
       width / height within this range.
     area_range: An optional list of `floats`. Defaults to `[0.05, 1]`.
@@ -1535,34 +1703,44 @@ def sample_distorted_bounding_box(image_size, bounding_boxes, seed=None,
       supplied image within in this range.
     max_attempts: An optional `int`. Defaults to `100`.
       Number of attempts at generating a cropped region of the image
-      of the specified constraints. After `max_attempts` failures, return the entire
+      of the specified constraints. After `max_attempts` failures, return the
+        entire
       image.
     use_image_if_no_bounding_boxes: An optional `bool`. Defaults to `False`.
       Controls behavior if no bounding boxes supplied.
-      If true, assume an implicit bounding box covering the whole input. If false,
+      If true, assume an implicit bounding box covering the whole input. If
+        false,
       raise an error.
     name: A name for the operation (optional).
 
   Returns:
     A tuple of `Tensor` objects (begin, size, bboxes).
 
-    begin: A `Tensor`. Has the same type as `image_size`. 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+    begin: A `Tensor`. Has the same type as `image_size`. 1-D, containing
+    `[offset_height, offset_width, 0]`. Provide as input to
       `tf.slice`.
-    size: A `Tensor`. Has the same type as `image_size`. 1-D, containing `[target_height, target_width, -1]`. Provide as input to
+    size: A `Tensor`. Has the same type as `image_size`. 1-D, containing
+    `[target_height, target_width, -1]`. Provide as input to
       `tf.slice`.
-    bboxes: A `Tensor` of type `float32`. 3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+    bboxes: A `Tensor` of type `float32`. 3-D with shape `[1, 1, 4]` containing
+    the distorted bounding box.
       Provide as input to `tf.image.draw_bounding_boxes`.
   """
   with ops.name_scope(name, 'sample_distorted_bounding_box'):
-    return gen_image_ops._sample_distorted_bounding_box_v2(image_size,
-                bounding_boxes, seed=seed,
-                seed2=seed2, min_object_covered=min_object_covered,
-                aspect_ratio_range=aspect_ratio_range, area_range=area_range,
-                max_attempts=max_attempts,
-                use_image_if_no_bounding_boxes=use_image_if_no_bounding_boxes,
-                name=name)
-
-
+    return gen_image_ops._sample_distorted_bounding_box_v2(  # pylint: disable=protected-access
+        image_size,
+        bounding_boxes,
+        seed=seed,
+        seed2=seed2,
+        min_object_covered=min_object_covered,
+        aspect_ratio_range=aspect_ratio_range,
+        area_range=area_range,
+        max_attempts=max_attempts,
+        use_image_if_no_bounding_boxes=use_image_if_no_bounding_boxes,
+        name=name)
+
+
+@tf_export('image.non_max_suppression')
 def non_max_suppression(boxes,
                         scores,
                         max_output_size,
@@ -1607,3 +1785,107 @@ def non_max_suppression(boxes,
     return gen_image_ops._non_max_suppression_v2(boxes, scores, max_output_size,
                                                  iou_threshold)
     # pylint: enable=protected-access
+
+
+_rgb_to_yiq_kernel = [[0.299, 0.59590059,
+                       0.2115], [0.587, -0.27455667, -0.52273617],
+                      [0.114, -0.32134392, 0.31119955]]
+
+
+def rgb_to_yiq(images):
+  """Converts one or more images from RGB to YIQ.
+
+  Outputs a tensor of the same shape as the `images` tensor, containing the YIQ
+  value of the pixels.
+  The output is only well defined if the value in images are in [0,1].
+
+  Args:
+    images: 2-D or higher rank. Image data to convert. Last dimension must be
+    size 3.
+
+  Returns:
+    images: tensor with the same shape as `images`.
+  """
+  images = ops.convert_to_tensor(images, name='images')
+  kernel = ops.convert_to_tensor(
+      _rgb_to_yiq_kernel, dtype=images.dtype, name='kernel')
+  ndims = images.get_shape().ndims
+  return math_ops.tensordot(images, kernel, axes=[[ndims - 1], [0]])
+
+
+_yiq_to_rgb_kernel = [[1, 1, 1], [0.95598634, -0.27201283, -1.10674021],
+                      [0.6208248, -0.64720424, 1.70423049]]
+
+
+def yiq_to_rgb(images):
+  """Converts one or more images from YIQ to RGB.
+
+  Outputs a tensor of the same shape as the `images` tensor, containing the RGB
+  value of the pixels.
+  The output is only well defined if the Y value in images are in [0,1],
+  I value are in [-0.5957,0.5957] and Q value are in [-0.5226,0.5226].
+
+  Args:
+    images: 2-D or higher rank. Image data to convert. Last dimension must be
+    size 3.
+
+  Returns:
+    images: tensor with the same shape as `images`.
+  """
+  images = ops.convert_to_tensor(images, name='images')
+  kernel = ops.convert_to_tensor(
+      _yiq_to_rgb_kernel, dtype=images.dtype, name='kernel')
+  ndims = images.get_shape().ndims
+  return math_ops.tensordot(images, kernel, axes=[[ndims - 1], [0]])
+
+
+_rgb_to_yuv_kernel = [[0.299, -0.14714119,
+                       0.61497538], [0.587, -0.28886916, -0.51496512],
+                      [0.114, 0.43601035, -0.10001026]]
+
+
+def rgb_to_yuv(images):
+  """Converts one or more images from RGB to YUV.
+
+  Outputs a tensor of the same shape as the `images` tensor, containing the YUV
+  value of the pixels.
+  The output is only well defined if the value in images are in [0,1].
+
+  Args:
+    images: 2-D or higher rank. Image data to convert. Last dimension must be
+    size 3.
+
+  Returns:
+    images: tensor with the same shape as `images`.
+  """
+  images = ops.convert_to_tensor(images, name='images')
+  kernel = ops.convert_to_tensor(
+      _rgb_to_yuv_kernel, dtype=images.dtype, name='kernel')
+  ndims = images.get_shape().ndims
+  return math_ops.tensordot(images, kernel, axes=[[ndims - 1], [0]])
+
+
+_yuv_to_rgb_kernel = [[1, 1, 1], [0, -0.394642334, 2.03206185],
+                      [1.13988303, -0.58062185, 0]]
+
+
+def yuv_to_rgb(images):
+  """Converts one or more images from YUV to RGB.
+
+  Outputs a tensor of the same shape as the `images` tensor, containing the RGB
+  value of the pixels.
+  The output is only well defined if the Y value in images are in [0,1],
+  U and V value are in [-0.5,0.5].
+
+  Args:
+    images: 2-D or higher rank. Image data to convert. Last dimension must be
+    size 3.
+
+  Returns:
+    images: tensor with the same shape as `images`.
+  """
+  images = ops.convert_to_tensor(images, name='images')
+  kernel = ops.convert_to_tensor(
+      _yuv_to_rgb_kernel, dtype=images.dtype, name='kernel')
+  ndims = images.get_shape().ndims
+  return math_ops.tensordot(images, kernel, axes=[[ndims - 1], [0]])
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index d1554b399f3776933bf970f7b2ceb8db5865d844..d8d37b282fffb72cfcfc307308138a65e331ae64 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -85,6 +85,64 @@ class RGBToHSVTest(test_util.TensorFlowTestCase):
       self.assertAllClose(rgb_tf, rgb_np)
 
 
+class RGBToYIQTest(test_util.TensorFlowTestCase):
+
+  def testBatch(self):
+    # Build an arbitrary RGB image
+    np.random.seed(7)
+    batch_size = 5
+    shape = (batch_size, 2, 7, 3)
+
+    for nptype in [np.float32, np.float64]:
+      inp = np.random.rand(*shape).astype(nptype)
+
+      # Convert to YIQ and back, as a batch and individually
+      with self.test_session(use_gpu=True) as sess:
+        batch0 = constant_op.constant(inp)
+        batch1 = image_ops.rgb_to_yiq(batch0)
+        batch2 = image_ops.yiq_to_rgb(batch1)
+        split0 = array_ops.unstack(batch0)
+        split1 = list(map(image_ops.rgb_to_yiq, split0))
+        split2 = list(map(image_ops.yiq_to_rgb, split1))
+        join1 = array_ops.stack(split1)
+        join2 = array_ops.stack(split2)
+        batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2])
+
+      # Verify that processing batch elements together is the same as separate
+      self.assertAllClose(batch1, join1, rtol=1e-4, atol=1e-4)
+      self.assertAllClose(batch2, join2, rtol=1e-4, atol=1e-4)
+      self.assertAllClose(batch2, inp, rtol=1e-4, atol=1e-4)
+
+
+class RGBToYUVTest(test_util.TensorFlowTestCase):
+
+  def testBatch(self):
+    # Build an arbitrary RGB image
+    np.random.seed(7)
+    batch_size = 5
+    shape = (batch_size, 2, 7, 3)
+
+    for nptype in [np.float32, np.float64]:
+      inp = np.random.rand(*shape).astype(nptype)
+
+      # Convert to YUV and back, as a batch and individually
+      with self.test_session(use_gpu=True) as sess:
+        batch0 = constant_op.constant(inp)
+        batch1 = image_ops.rgb_to_yuv(batch0)
+        batch2 = image_ops.yuv_to_rgb(batch1)
+        split0 = array_ops.unstack(batch0)
+        split1 = list(map(image_ops.rgb_to_yuv, split0))
+        split2 = list(map(image_ops.yuv_to_rgb, split1))
+        join1 = array_ops.stack(split1)
+        join2 = array_ops.stack(split2)
+        batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2])
+
+      # Verify that processing batch elements together is the same as separate
+      self.assertAllClose(batch1, join1, rtol=1e-4, atol=1e-4)
+      self.assertAllClose(batch2, join2, rtol=1e-4, atol=1e-4)
+      self.assertAllClose(batch2, inp, rtol=1e-4, atol=1e-4)
+
+
 class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
 
   def _RGBToGrayscale(self, images):
@@ -189,6 +247,44 @@ class AdjustGamma(test_util.TensorFlowTestCase):
 
       self.assertAllClose(y_tf, y_np, 1e-6)
 
+  def test_adjust_gamma_less_zero(self):
+    """White image should be returned for gamma equal to zero"""
+    with self.test_session():
+      x_data = np.random.uniform(0, 255, (8, 8))
+      x_np = np.array(x_data, dtype=np.float32)
+
+      x = constant_op.constant(x_np, shape=x_np.shape)
+
+      err_msg = "Gamma should be a non-negative real number."
+
+      try:
+        image_ops.adjust_gamma(x, gamma=-1)
+      except Exception as e:
+        if err_msg not in str(e):
+          raise
+      else:
+        raise AssertionError("Exception not raised: %s" % err_msg)
+
+  def test_adjust_gamma_less_zero_tensor(self):
+    """White image should be returned for gamma equal to zero"""
+    with self.test_session():
+      x_data = np.random.uniform(0, 255, (8, 8))
+      x_np = np.array(x_data, dtype=np.float32)
+
+      x = constant_op.constant(x_np, shape=x_np.shape)
+      y = constant_op.constant(-1.0, dtype=dtypes.float32)
+
+      image = image_ops.adjust_gamma(x, gamma=y)
+
+      err_msg = "Gamma should be a non-negative real number."
+      try:
+        image.eval()
+      except Exception as e:
+        if err_msg not in str(e):
+          raise
+      else:
+        raise AssertionError("Exception not raised: %s" % err_msg)
+
   def test_adjust_gamma_zero(self):
     """White image should be returned for gamma equal to zero"""
     with self.test_session():
@@ -215,13 +311,13 @@ class AdjustGamma(test_util.TensorFlowTestCase):
       y_tf = np.trunc(y.eval())
 
       y_np = np.array(
-          [[0, 31, 45, 55, 63, 71, 78, 84],
-           [90, 95, 100, 105, 110, 115, 119, 123],
-           [127, 131, 135, 139, 142, 146, 149, 153],
-           [156, 159, 162, 165, 168, 171, 174, 177],
-           [180, 183, 186, 188, 191, 194, 196, 199],
-           [201, 204, 206, 209, 211, 214, 216, 218],
-           [221, 223, 225, 228, 230, 232, 234, 236],
+          [[0, 31, 45, 55, 63, 71, 78, 84], [
+              90, 95, 100, 105, 110, 115, 119, 123
+          ], [127, 131, 135, 139, 142, 146, 149, 153], [
+              156, 159, 162, 165, 168, 171, 174, 177
+          ], [180, 183, 186, 188, 191, 194, 196, 199], [
+              201, 204, 206, 209, 211, 214, 216, 218
+          ], [221, 223, 225, 228, 230, 232, 234, 236],
            [238, 241, 243, 245, 247, 249, 251, 253]],
           dtype=np.float32)
 
@@ -236,14 +332,12 @@ class AdjustGamma(test_util.TensorFlowTestCase):
       y_tf = np.trunc(y.eval())
 
       y_np = np.array(
-          [[0, 0, 0, 0, 1, 1, 2, 3],
-           [4, 5, 6, 7, 9, 10, 12, 14],
-           [16, 18, 20, 22, 25, 27, 30, 33],
-           [36, 39, 42, 45, 49, 52, 56, 60],
-           [64, 68, 72, 76, 81, 85, 90, 95],
-           [100, 105, 110, 116, 121, 127, 132, 138],
-           [144, 150, 156, 163, 169, 176, 182, 189],
-           [196, 203, 211, 218, 225, 233, 241, 249]],
+          [[0, 0, 0, 0, 1, 1, 2, 3], [4, 5, 6, 7, 9, 10, 12, 14], [
+              16, 18, 20, 22, 25, 27, 30, 33
+          ], [36, 39, 42, 45, 49, 52, 56, 60], [64, 68, 72, 76, 81, 85, 90, 95],
+           [100, 105, 110, 116, 121, 127, 132, 138], [
+               144, 150, 156, 163, 169, 176, 182, 189
+           ], [196, 203, 211, 218, 225, 233, 241, 249]],
           dtype=np.float32)
 
       self.assertAllClose(y_tf, y_np, 1e-6)
@@ -281,6 +375,21 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
       y_tf = y.eval()
       self.assertAllEqual(y_tf, y_np)
 
+  def testBatchAdjustHue(self):
+    x_shape = [2, 1, 2, 3]
+    x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+    x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+
+    delta = 0.25
+    y_data = [13, 0, 11, 226, 54, 221, 234, 8, 92, 1, 217, 255]
+    y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
+
+    with self.test_session(use_gpu=True):
+      x = constant_op.constant(x_np, shape=x_shape)
+      y = image_ops.adjust_hue(x, delta)
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, y_np)
+
   def _adjustHueNp(self, x_np, delta_h):
     self.assertEqual(x_np.shape[-1], 3)
     x_v = x_np.reshape([-1, 3])
@@ -359,6 +468,87 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
       self._adjustHueTf(x_np, delta_h)
 
 
+class FlipImageBenchmark(test.Benchmark):
+
+  def _benchmarkFlipLeftRight(self, device, cpu_count):
+    image_shape = [299, 299, 3]
+    warmup_rounds = 100
+    benchmark_rounds = 1000
+    config = config_pb2.ConfigProto()
+    if cpu_count is not None:
+      config.inter_op_parallelism_threads = 1
+      config.intra_op_parallelism_threads = cpu_count
+    with session.Session("", graph=ops.Graph(), config=config) as sess:
+      with ops.device(device):
+        inputs = variables.Variable(
+            random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255,
+            trainable=False,
+            dtype=dtypes.float32)
+        run_op = image_ops.flip_left_right(inputs)
+        sess.run(variables.global_variables_initializer())
+        for i in xrange(warmup_rounds + benchmark_rounds):
+          if i == warmup_rounds:
+            start = time.time()
+          sess.run(run_op)
+    end = time.time()
+    step_time = (end - start) / benchmark_rounds
+    tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
+    print("benchmarkFlipLeftRight_299_299_3_%s step_time: %.2f us" %
+          (tag, step_time * 1e6))
+    self.report_benchmark(
+        name="benchmarkFlipLeftRight_299_299_3_%s" % (tag),
+        iters=benchmark_rounds,
+        wall_time=step_time)
+
+  def _benchmarkRandomFlipLeftRight(self, device, cpu_count):
+    image_shape = [299, 299, 3]
+    warmup_rounds = 100
+    benchmark_rounds = 1000
+    config = config_pb2.ConfigProto()
+    if cpu_count is not None:
+      config.inter_op_parallelism_threads = 1
+      config.intra_op_parallelism_threads = cpu_count
+    with session.Session("", graph=ops.Graph(), config=config) as sess:
+      with ops.device(device):
+        inputs = variables.Variable(
+            random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255,
+            trainable=False,
+            dtype=dtypes.float32)
+        run_op = image_ops.random_flip_left_right(inputs)
+        sess.run(variables.global_variables_initializer())
+        for i in xrange(warmup_rounds + benchmark_rounds):
+          if i == warmup_rounds:
+            start = time.time()
+          sess.run(run_op)
+    end = time.time()
+    step_time = (end - start) / benchmark_rounds
+    tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
+    print("benchmarkRandomFlipLeftRight_299_299_3_%s step_time: %.2f us" %
+          (tag, step_time * 1e6))
+    self.report_benchmark(
+        name="benchmarkRandomFlipLeftRight_299_299_3_%s" % (tag),
+        iters=benchmark_rounds,
+        wall_time=step_time)
+
+  def benchmarkFlipLeftRightCpu1(self):
+    self._benchmarkFlipLeftRight("/cpu:0", 1)
+
+  def benchmarkFlipLeftRightCpuAll(self):
+    self._benchmarkFlipLeftRight("/cpu:0", None)
+
+  def benchmarkFlipLeftRightGpu(self):
+    self._benchmarkFlipLeftRight(test.gpu_device_name(), None)
+
+  def benchmarkRandomFlipLeftRightCpu1(self):
+    self._benchmarkRandomFlipLeftRight("/cpu:0", 1)
+
+  def benchmarkRandomFlipLeftRightCpuAll(self):
+    self._benchmarkRandomFlipLeftRight("/cpu:0", None)
+
+  def benchmarkRandomFlipLeftRightGpu(self):
+    self._benchmarkRandomFlipLeftRight(test.gpu_device_name(), None)
+
+
 class AdjustHueBenchmark(test.Benchmark):
 
   def _benchmarkAdjustHue(self, device, cpu_count):
@@ -372,8 +562,7 @@ class AdjustHueBenchmark(test.Benchmark):
     with session.Session("", graph=ops.Graph(), config=config) as sess:
       with ops.device(device):
         inputs = variables.Variable(
-            random_ops.random_uniform(
-                image_shape, dtype=dtypes.float32) * 255,
+            random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255,
             trainable=False,
             dtype=dtypes.float32)
         delta = constant_op.constant(0.1, dtype=dtypes.float32)
@@ -417,8 +606,7 @@ class AdjustSaturationBenchmark(test.Benchmark):
     with session.Session("", graph=ops.Graph(), config=config) as sess:
       with ops.device(device):
         inputs = variables.Variable(
-            random_ops.random_uniform(
-                image_shape, dtype=dtypes.float32) * 255,
+            random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255,
             trainable=False,
             dtype=dtypes.float32)
         delta = constant_op.constant(0.1, dtype=dtypes.float32)
@@ -473,10 +661,11 @@ class ResizeBilinearBenchmark(test.Benchmark):
       results = self.run_op_benchmark(
           sess,
           benchmark_op,
-          name=("resize_bilinear_%s_%s_%s" %
-                (image_size[0], image_size[1], num_channels)))
-      print("%s   : %.2f ms/img" % (results["name"], 1000 * results["wall_time"]
-                                    / (batch_size * num_ops)))
+          name=("resize_bilinear_%s_%s_%s" % (image_size[0], image_size[1],
+                                              num_channels)))
+      print("%s   : %.2f ms/img" %
+            (results["name"],
+             1000 * results["wall_time"] / (batch_size * num_ops)))
 
   def benchmarkSimilar3Channel(self):
     self._benchmarkResize((183, 229), 3)
@@ -523,8 +712,9 @@ class ResizeBicubicBenchmark(test.Benchmark):
           min_iters=20,
           name=("resize_bicubic_%s_%s_%s" % (image_size[0], image_size[1],
                                              num_channels)))
-      print("%s   : %.2f ms/img" % (results["name"], 1000 * results["wall_time"]
-                                    / (batch_size * num_ops)))
+      print("%s   : %.2f ms/img" %
+            (results["name"],
+             1000 * results["wall_time"] / (batch_size * num_ops)))
 
   def benchmarkSimilar3Channel(self):
     self._benchmarkResize((183, 229), 3)
@@ -560,8 +750,8 @@ class ResizeAreaBenchmark(test.Benchmark):
     batch_size = 1
     num_ops = 1000
     img = variables.Variable(
-        random_ops.random_normal([batch_size, image_size[0],
-                                  image_size[1], num_channels]),
+        random_ops.random_normal(
+            [batch_size, image_size[0], image_size[1], num_channels]),
         name="img")
 
     deps = []
@@ -574,12 +764,13 @@ class ResizeAreaBenchmark(test.Benchmark):
     with session.Session() as sess:
       sess.run(variables.global_variables_initializer())
       results = self.run_op_benchmark(
-          sess, benchmark_op,
-          name=("resize_area_%s_%s_%s" %
-                (image_size[0], image_size[1], num_channels)))
-      print("%s   : %.2f ms/img" % (
-          results["name"],
-          1000*results["wall_time"] / (batch_size * num_ops)))
+          sess,
+          benchmark_op,
+          name=("resize_area_%s_%s_%s" % (image_size[0], image_size[1],
+                                          num_channels)))
+      print("%s   : %.2f ms/img" %
+            (results["name"],
+             1000 * results["wall_time"] / (batch_size * num_ops)))
 
   def benchmarkSimilar3Channel(self):
     self._benchmarkResize((183, 229), 3)
@@ -632,14 +823,28 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
       y_tf = y.eval()
       self.assertAllEqual(y_tf, y_np)
 
+  def testBatchSaturation(self):
+    x_shape = [2, 1, 2, 3]
+    x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+    x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+
+    saturation_factor = 0.5
+    y_data = [6, 9, 13, 140, 180, 226, 135, 121, 234, 172, 255, 128]
+    y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
+
+    with self.test_session(use_gpu=True):
+      x = constant_op.constant(x_np, shape=x_shape)
+      y = image_ops.adjust_saturation(x, saturation_factor)
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, y_np)
+
   def _adjust_saturation(self, image, saturation_factor):
     image = ops.convert_to_tensor(image, name="image")
     orig_dtype = image.dtype
     flt_image = image_ops.convert_image_dtype(image, dtypes.float32)
     saturation_adjusted_image = gen_image_ops.adjust_saturation(
         flt_image, saturation_factor)
-    return image_ops.convert_image_dtype(saturation_adjusted_image,
-                                         orig_dtype)
+    return image_ops.convert_image_dtype(saturation_adjusted_image, orig_dtype)
 
   def testHalfSaturationFused(self):
     x_shape = [2, 2, 3]
@@ -729,7 +934,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
 
 class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
 
-  def testIdempotentLeftRight(self):
+  def testInvolutionLeftRight(self):
     x_np = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
@@ -737,27 +942,52 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       y_tf = y.eval()
       self.assertAllEqual(y_tf, x_np)
 
+  def testInvolutionLeftRightWithBatch(self):
+    x_np = np.array([[[1, 2, 3], [1, 2, 3]], [[1, 2, 3], [1, 2, 3]]],
+                    dtype=np.uint8).reshape([2, 2, 3, 1])
+    with self.test_session(use_gpu=True):
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.flip_left_right(image_ops.flip_left_right(x_tf))
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, x_np)
+
   def testLeftRight(self):
     x_np = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[3, 2, 1], [3, 2, 1]], dtype=np.uint8).reshape([2, 3, 1])
 
+    with self.test_session(use_gpu=True):
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.flip_left_right(x_tf)
+      self.assertTrue(y.op.name.startswith("flip_left_right"))
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, y_np)
+
+  def testLeftRightWithBatch(self):
+    x_np = np.array([[[1, 2, 3], [1, 2, 3]], [[1, 2, 3], [1, 2, 3]]],
+                    dtype=np.uint8).reshape([2, 2, 3, 1])
+    y_np = np.array([[[3, 2, 1], [3, 2, 1]], [[3, 2, 1], [3, 2, 1]]],
+                    dtype=np.uint8).reshape([2, 2, 3, 1])
+
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(x_tf)
       y_tf = y.eval()
       self.assertAllEqual(y_tf, y_np)
 
+
   def testRandomFlipLeftRight(self):
     x_np = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[3, 2, 1], [3, 2, 1]], dtype=np.uint8).reshape([2, 3, 1])
+    seed = 42
 
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.random_flip_left_right(x_tf)
+      self.assertTrue(y.op.name.startswith("random_flip_left_right"))
 
       count_flipped = 0
       count_unflipped = 0
-      for _ in range(50):
+      for _ in range(100):
         y_tf = y.eval()
         if y_tf[0][0] == 1:
           self.assertAllEqual(y_tf, x_np)
@@ -765,10 +995,15 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         else:
           self.assertAllEqual(y_tf, y_np)
           count_flipped += 1
-      self.assertGreaterEqual(count_flipped, 1)
-      self.assertGreaterEqual(count_unflipped, 1)
 
-  def testIdempotentUpDown(self):
+      # 100 trials
+      # Mean: 50
+      # Std Dev: ~5
+      # Six Sigma: 50 - (5 * 6) = 20
+      self.assertGreaterEqual(count_flipped, 20)
+      self.assertGreaterEqual(count_unflipped, 20)
+
+  def testInvolutionUpDown(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
 
     with self.test_session(use_gpu=True):
@@ -777,10 +1012,33 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       y_tf = y.eval()
       self.assertAllEqual(y_tf, x_np)
 
+  def testInvolutionUpDownWithBatch(self):
+    x_np = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
+                    dtype=np.uint8).reshape([2, 2, 3, 1])
+
+    with self.test_session(use_gpu=True):
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.flip_up_down(image_ops.flip_up_down(x_tf))
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, x_np)
+
   def testUpDown(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[4, 5, 6], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
 
+    with self.test_session(use_gpu=True):
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.flip_up_down(x_tf)
+      self.assertTrue(y.op.name.startswith("flip_up_down"))
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, y_np)
+
+  def testUpDownWithBatch(self):
+    x_np = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
+                    dtype=np.uint8).reshape([2, 2, 3, 1])
+    y_np = np.array([[[4, 5, 6], [1, 2, 3]], [[10, 11, 12], [7, 8, 9]]],
+                    dtype=np.uint8).reshape([2, 2, 3, 1])
+
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(x_tf)
@@ -793,10 +1051,11 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
 
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
-      y = image_ops.random_flip_up_down(x_tf)
+      y = image_ops.random_flip_up_down(x_tf, seed=42)
+      self.assertTrue(y.op.name.startswith("random_flip_up_down"))
       count_flipped = 0
       count_unflipped = 0
-      for _ in range(50):
+      for _ in range(100):
         y_tf = y.eval()
         if y_tf[0][0] == 1:
           self.assertAllEqual(y_tf, x_np)
@@ -804,10 +1063,15 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         else:
           self.assertAllEqual(y_tf, y_np)
           count_flipped += 1
-      self.assertGreaterEqual(count_flipped, 1)
-      self.assertGreaterEqual(count_unflipped, 1)
 
-  def testIdempotentTranspose(self):
+      # 100 trials
+      # Mean: 50
+      # Std Dev: ~5
+      # Six Sigma: 50 - (5 * 6) = 20
+      self.assertGreaterEqual(count_flipped, 20)
+      self.assertGreaterEqual(count_unflipped, 20)
+
+  def testInvolutionTranspose(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
 
     with self.test_session(use_gpu=True):
@@ -816,10 +1080,34 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       y_tf = y.eval()
       self.assertAllEqual(y_tf, x_np)
 
+  def testInvolutionTransposeWithBatch(self):
+    x_np = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
+                    dtype=np.uint8).reshape([2, 2, 3, 1])
+
+    with self.test_session(use_gpu=True):
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.transpose_image(image_ops.transpose_image(x_tf))
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, x_np)
+
   def testTranspose(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[1, 4], [2, 5], [3, 6]], dtype=np.uint8).reshape([3, 2, 1])
 
+    with self.test_session(use_gpu=True):
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.transpose_image(x_tf)
+      self.assertTrue(y.op.name.startswith("transpose_image"))
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, y_np)
+
+  def testTransposeWithBatch(self):
+    x_np = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
+                    dtype=np.uint8).reshape([2, 2, 3, 1])
+
+    y_np = np.array([[[1, 4], [2, 5], [3, 6]], [[7, 10], [8, 11], [9, 12]]],
+                    dtype=np.uint8).reshape([2, 3, 2, 1])
+
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(x_tf)
@@ -828,13 +1116,18 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
 
   def testPartialShapes(self):
     p_unknown_rank = array_ops.placeholder(dtypes.uint8)
-    p_unknown_dims = array_ops.placeholder(
+    p_unknown_dims_3 = array_ops.placeholder(
         dtypes.uint8, shape=[None, None, None])
+    p_unknown_dims_4 = array_ops.placeholder(
+        dtypes.uint8, shape=[None, None, None, None])
     p_unknown_width = array_ops.placeholder(dtypes.uint8, shape=[64, None, 3])
+    p_unknown_batch = array_ops.placeholder(dtypes.uint8,
+																						shape=[None, 64, 64, 3])
 
     p_wrong_rank = array_ops.placeholder(dtypes.uint8, shape=[None, None])
     p_zero_dim = array_ops.placeholder(dtypes.uint8, shape=[64, 0, 3])
 
+    #Ops that support 3D input
     for op in [
         image_ops.flip_left_right, image_ops.flip_up_down,
         image_ops.random_flip_left_right, image_ops.random_flip_up_down,
@@ -842,16 +1135,34 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     ]:
       transformed_unknown_rank = op(p_unknown_rank)
       self.assertEqual(3, transformed_unknown_rank.get_shape().ndims)
-      transformed_unknown_dims = op(p_unknown_dims)
-      self.assertEqual(3, transformed_unknown_dims.get_shape().ndims)
+      transformed_unknown_dims_3 = op(p_unknown_dims_3)
+      self.assertEqual(3, transformed_unknown_dims_3.get_shape().ndims)
       transformed_unknown_width = op(p_unknown_width)
       self.assertEqual(3, transformed_unknown_width.get_shape().ndims)
 
-      with self.assertRaisesRegexp(ValueError, "must be three-dimensional"):
-        op(p_wrong_rank)
       with self.assertRaisesRegexp(ValueError, "must be > 0"):
         op(p_zero_dim)
 
+    #Ops that support 4D input
+    for op in [
+        image_ops.flip_left_right, image_ops.flip_up_down,
+        image_ops.transpose_image, image_ops.rot90
+    ]:
+      transformed_unknown_dims_4 = op(p_unknown_dims_4)
+      self.assertEqual(4, transformed_unknown_dims_4.get_shape().ndims)
+      transformed_unknown_batch = op(p_unknown_batch)
+      self.assertEqual(4, transformed_unknown_batch.get_shape().ndims)
+      with self.assertRaisesRegexp(ValueError,
+                                   "must be at least three-dimensional"):
+        op(p_wrong_rank)
+
+    for op in [
+        image_ops.random_flip_left_right, image_ops.random_flip_up_down,
+    ]:
+      with self.assertRaisesRegexp(ValueError, "must be three-dimensional"):
+        op(p_wrong_rank)
+
+
   def testRot90GroupOrder(self):
     image = np.arange(24, dtype=np.uint8).reshape([2, 4, 3])
     with self.test_session(use_gpu=True):
@@ -860,6 +1171,14 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         rotated = image_ops.rot90(rotated)
       self.assertAllEqual(image, rotated.eval())
 
+  def testRot90GroupOrderWithBatch(self):
+    image = np.arange(48, dtype=np.uint8).reshape([2, 2, 4, 3])
+    with self.test_session(use_gpu=True):
+      rotated = image
+      for _ in xrange(4):
+        rotated = image_ops.rot90(rotated)
+      self.assertAllEqual(image, rotated.eval())
+
   def testRot90NumpyEquivalence(self):
     image = np.arange(24, dtype=np.uint8).reshape([2, 4, 3])
     with self.test_session(use_gpu=True):
@@ -869,6 +1188,14 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         y_np = np.rot90(image, k=k)
         self.assertAllEqual(y_np, y_tf.eval({k_placeholder: k}))
 
+  def testRot90NumpyEquivalenceWithBatch(self):
+    image = np.arange(48, dtype=np.uint8).reshape([2, 2, 4, 3])
+    with self.test_session(use_gpu=True):
+      k_placeholder = array_ops.placeholder(dtypes.int32, shape=[])
+      y_tf = image_ops.rot90(image, k_placeholder)
+      for k in xrange(4):
+        y_np = np.rot90(image, k=k, axes=(1, 2))
+        self.assertAllEqual(y_np, y_tf.eval({k_placeholder: k}))
 
 class RandomFlipTest(test_util.TensorFlowTestCase):
 
@@ -1047,6 +1374,7 @@ class PerImageWhiteningTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.per_image_standardization(x)
+      self.assertTrue(y.op.name.startswith("per_image_standardization"))
       y_tf = y.eval()
       self.assertAllClose(y_tf, y_np, atol=1e-4)
 
@@ -1218,9 +1546,10 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
 
     # Each line is a test configuration:
     #   (offset_height, offset_width, target_height, target_width), err_msg
-    test_config = (([-1, 0, 3, 3], "offset_height must be >= 0"),
-                   ([0, -1, 3, 3], "offset_width must be >= 0"),
-                   ([0, 0, 0, 3], "target_height must be > 0"),
+    test_config = (([-1, 0, 3, 3], "offset_height must be >= 0"), ([
+        0, -1, 3, 3
+    ], "offset_width must be >= 0"), ([0, 0, 0, 3],
+                                      "target_height must be > 0"),
                    ([0, 0, 3, 0], "target_width must be > 0"),
                    ([2, 0, 3, 3], "height must be >= target + offset"),
                    ([0, 2, 3, 3], "width must be >= target + offset"))
@@ -1228,6 +1557,11 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
     for params, err_msg in test_config:
       self._assertRaises(x, x_shape, *params, err_msg=err_msg)
 
+  def testNameScope(self):
+    image = array_ops.placeholder(dtypes.float32, shape=[55, 66, 3])
+    y = image_ops.crop_to_bounding_box(image, 0, 0, 55, 66)
+    self.assertTrue(y.name.startswith("crop_to_bounding_box"))
+
 
 class CentralCropTest(test_util.TensorFlowTestCase):
 
@@ -1251,9 +1585,10 @@ class CentralCropTest(test_util.TensorFlowTestCase):
 
   def testCropping(self):
     x_shape = [4, 8, 1]
-    x_np = np.array([[1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8],
-                     [1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8]],
-                    dtype=np.int32).reshape(x_shape)
+    x_np = np.array(
+        [[1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8],
+         [1, 2, 3, 4, 5, 6, 7, 8], [1, 2, 3, 4, 5, 6, 7, 8]],
+        dtype=np.int32).reshape(x_shape)
     y_np = np.array([[3, 4, 5, 6], [3, 4, 5, 6]]).reshape([2, 4, 1])
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
@@ -1270,7 +1605,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = array_ops.placeholder(shape=x_shape, dtype=dtypes.int32)
       y = image_ops.central_crop(x, 0.33)
-      y_tf = y.eval(feed_dict={x:x_np})
+      y_tf = y.eval(feed_dict={x: x_np})
       self.assertAllEqual(y_tf, y_np)
       self.assertAllEqual(y_tf.shape, y_np.shape)
 
@@ -1304,6 +1639,13 @@ class CentralCropTest(test_util.TensorFlowTestCase):
       with self.assertRaises(ValueError):
         _ = image_ops.central_crop(x, 1.01)
 
+  def testNameScope(self):
+    x_shape = [13, 9, 3]
+    x_np = np.ones(x_shape, dtype=np.float32)
+    with self.test_session(use_gpu=True):
+      y = image_ops.central_crop(x_np, 1.0)
+      self.assertTrue(y.op.name.startswith("central_crop"))
+
 
 class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
 
@@ -1375,15 +1717,10 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     self.assertEqual(y.get_shape().as_list(), post_shape)
 
   def testInt64(self):
-    x = [1, 2, 3,
-         4, 5, 6,
-         7, 8, 9]
+    x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
     x_shape = [3, 3, 1]
 
-    y = [0, 0, 0,
-         1, 2, 3,
-         4, 5, 6,
-         7, 8, 9]
+    y = [0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
     y_shape = [4, 3, 1]
     x = np.array(x).reshape(x_shape)
     y = np.array(y).reshape(y_shape)
@@ -1400,38 +1737,26 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     self._assertReturns(x, x_shape, offset_height, offset_width, x, x_shape)
 
   def testPadding(self):
-    x = [1, 2, 3,
-         4, 5, 6,
-         7, 8, 9]
+    x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
     x_shape = [3, 3, 1]
 
     offset_height, offset_width = [1, 0]
-    y = [0, 0, 0,
-         1, 2, 3,
-         4, 5, 6,
-         7, 8, 9]
+    y = [0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
     y_shape = [4, 3, 1]
     self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape)
 
     offset_height, offset_width = [0, 1]
-    y = [0, 1, 2, 3,
-         0, 4, 5, 6,
-         0, 7, 8, 9]
+    y = [0, 1, 2, 3, 0, 4, 5, 6, 0, 7, 8, 9]
     y_shape = [3, 4, 1]
     self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape)
 
     offset_height, offset_width = [0, 0]
-    y = [1, 2, 3,
-         4, 5, 6,
-         7, 8, 9,
-         0, 0, 0]
+    y = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0]
     y_shape = [4, 3, 1]
     self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape)
 
     offset_height, offset_width = [0, 0]
-    y = [1, 2, 3, 0,
-         4, 5, 6, 0,
-         7, 8, 9, 0]
+    y = [1, 2, 3, 0, 4, 5, 6, 0, 7, 8, 9, 0]
     y_shape = [3, 4, 1]
     self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape)
 
@@ -1463,9 +1788,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     # Input image has 0-length dimension(s).
     # Each line is a test configuration:
     #   x_shape, target_height, target_width
-    test_config = (([0, 2, 2], 2, 2),
-                   ([2, 0, 2], 2, 2),
-                   ([2, 2, 0], 2, 2))
+    test_config = (([0, 2, 2], 2, 2), ([2, 0, 2], 2, 2), ([2, 2, 0], 2, 2))
     offset_height, offset_width = [0, 0]
     x = []
 
@@ -1507,6 +1830,11 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     for config_item in test_config:
       self._assertRaises(x, x_shape, *config_item)
 
+  def testNameScope(self):
+    image = array_ops.placeholder(dtypes.float32, shape=[55, 66, 3])
+    y = image_ops.pad_to_bounding_box(image, 0, 0, 55, 66)
+    self.assertTrue(y.op.name.startswith("pad_to_bounding_box"))
+
 
 class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
 
@@ -1518,8 +1846,8 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
                               (bounding_box[2] - bounding_box[0]))
 
     image_size_np = np.array(image.shape, dtype=np.int32)
-    bounding_box_np = (np.array(
-        bounding_box, dtype=np.float32).reshape([1, 1, 4]))
+    bounding_box_np = (
+        np.array(bounding_box, dtype=np.float32).reshape([1, 1, 4]))
 
     aspect_ratios = []
     area_ratios = []
@@ -1564,7 +1892,9 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       y = array_ops.strided_slice(image_tf, begin, begin + size)
 
       for _ in xrange(num_iter):
-        y_tf = y.eval(feed_dict={min_object_covered_placeholder: min_object_covered})
+        y_tf = y.eval(feed_dict={
+            min_object_covered_placeholder: min_object_covered
+        })
         crop_height = y_tf.shape[0]
         crop_width = y_tf.shape[1]
         aspect_ratio = float(crop_width) / float(crop_height)
@@ -1656,9 +1986,10 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       image_size = constant_op.constant(
           [40, 50, 1], shape=[3], dtype=dtypes.int32)
       bounding_box = constant_op.constant(
-          [0.0, 0.0, 1.0, 1.0],
-          shape=[4],
-          dtype=dtypes.float32,)
+          [[[0.0, 0.0, 1.0, 1.0]]],
+          shape=[1, 1, 4],
+          dtype=dtypes.float32,
+      )
       begin, end, bbox_for_drawing = image_ops.sample_distorted_bounding_box(
           image_size=image_size,
           bounding_boxes=bounding_box,
@@ -1670,6 +2001,10 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([3], begin.get_shape().as_list())
       self.assertAllEqual([3], end.get_shape().as_list())
       self.assertAllEqual([1, 1, 4], bbox_for_drawing.get_shape().as_list())
+      # Actual run to make sure shape is correct inside Compute().
+      begin = begin.eval()
+      end = end.eval()
+      bbox_for_drawing = bbox_for_drawing.eval()
 
       begin, end, bbox_for_drawing = image_ops.sample_distorted_bounding_box(
           image_size=image_size,
@@ -1683,16 +2018,40 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([3], end.get_shape().as_list())
       self.assertAllEqual([1, 1, 4], bbox_for_drawing.get_shape().as_list())
 
+  def testDefaultMinObjectCovered(self):
+    # By default min_object_covered=0.1 if not provided
+    with self.test_session(use_gpu=True):
+      image_size = constant_op.constant(
+          [40, 50, 1], shape=[3], dtype=dtypes.int32)
+      bounding_box = constant_op.constant(
+          [[[0.0, 0.0, 1.0, 1.0]]],
+          shape=[1, 1, 4],
+          dtype=dtypes.float32,)
+      begin, end, bbox_for_drawing = image_ops.sample_distorted_bounding_box(
+          image_size=image_size,
+          bounding_boxes=bounding_box,
+          aspect_ratio_range=(0.75, 1.33),
+          area_range=(0.05, 1.0))
+
+      self.assertAllEqual([3], begin.get_shape().as_list())
+      self.assertAllEqual([3], end.get_shape().as_list())
+      self.assertAllEqual([1, 1, 4], bbox_for_drawing.get_shape().as_list())
+      # Actual run to make sure shape is correct inside Compute().
+      begin = begin.eval()
+      end = end.eval()
+      bbox_for_drawing = bbox_for_drawing.eval()
 
 class ResizeImagesTest(test_util.TensorFlowTestCase):
 
-  OPTIONS = [image_ops.ResizeMethod.BILINEAR,
-             image_ops.ResizeMethod.NEAREST_NEIGHBOR,
-             image_ops.ResizeMethod.BICUBIC,
-             image_ops.ResizeMethod.AREA]
+  OPTIONS = [
+      image_ops.ResizeMethod.BILINEAR, image_ops.ResizeMethod.NEAREST_NEIGHBOR,
+      image_ops.ResizeMethod.BICUBIC, image_ops.ResizeMethod.AREA
+  ]
 
-  TYPES = [np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64,
-           np.float16, np.float32, np.float64]
+  TYPES = [
+      np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64, np.float16,
+      np.float32, np.float64
+  ]
 
   def _assertShapeInference(self, pre_shape, size, post_shape):
     # Try single image resize
@@ -1720,12 +2079,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     single_shape = [6, 4, 1]
     # This test is also conducted with int8, so 127 is the maximum
     # value that can be used.
-    data = [127, 127, 64, 64,
-            127, 127, 64, 64,
-            64, 64, 127, 127,
-            64, 64, 127, 127,
-            50, 50, 100, 100,
-            50, 50, 100, 100]
+    data = [
+        127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127,
+        50, 50, 100, 100, 50, 50, 100, 100
+    ]
     target_height = 6
     target_width = 4
 
@@ -1756,12 +2113,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     single_shape = [6, 4, 1]
     # This test is also conducted with int8, so 127 is the maximum
     # value that can be used.
-    data = [127, 127, 64, 64,
-            127, 127, 64, 64,
-            64, 64, 127, 127,
-            64, 64, 127, 127,
-            50, 50, 100, 100,
-            50, 50, 100, 100]
+    data = [
+        127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127,
+        50, 50, 100, 100, 50, 50, 100, 100
+    ]
     new_size = array_ops.placeholder(dtypes.int32, shape=(2))
 
     img_np = np.array(data, dtype=np.uint8).reshape(img_shape)
@@ -1815,8 +2170,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
                                   image_ops.ResizeMethod.BILINEAR)
 
   def testReturnDtype(self):
-    target_shapes = [[6, 4], [3, 2], [array_ops.placeholder(dtypes.int32),
-                                      array_ops.placeholder(dtypes.int32)]]
+    target_shapes = [[6, 4], [3, 2], [
+        array_ops.placeholder(dtypes.int32),
+        array_ops.placeholder(dtypes.int32)
+    ]]
     for nptype in self.TYPES:
       image = array_ops.placeholder(nptype, shape=[1, 6, 4, 1])
       for opt in self.OPTIONS:
@@ -1833,12 +2190,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     img_shape = [1, 6, 4, 1]
     # This test is also conducted with int8, so 127 is the maximum
     # value that can be used.
-    data = [127, 127, 64, 64,
-            127, 127, 64, 64,
-            64, 64, 127, 127,
-            64, 64, 127, 127,
-            50, 50, 100, 100,
-            50, 50, 100, 100]
+    data = [
+        127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127,
+        50, 50, 100, 100, 50, 50, 100, 100
+    ]
     # Test size where width is specified as a tensor which is a sum
     # of two tensors.
     width_1 = constant_op.constant(1)
@@ -1860,15 +2215,11 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
   def testResizeDown(self):
     # This test is also conducted with int8, so 127 is the maximum
     # value that can be used.
-    data = [127, 127, 64, 64,
-            127, 127, 64, 64,
-            64, 64, 127, 127,
-            64, 64, 127, 127,
-            50, 50, 100, 100,
-            50, 50, 100, 100]
-    expected_data = [127, 64,
-                     64, 127,
-                     50, 100]
+    data = [
+        127, 127, 64, 64, 127, 127, 64, 64, 64, 64, 127, 127, 64, 64, 127, 127,
+        50, 50, 100, 100, 50, 50, 100, 100
+    ]
+    expected_data = [127, 64, 64, 127, 50, 100]
     target_height = 3
     target_width = 2
 
@@ -1894,39 +2245,31 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
 
   def testResizeUpAlignCornersFalse(self):
     img_shape = [1, 3, 2, 1]
-    data = [64, 32,
-            32, 64,
-            50, 100]
+    data = [64, 32, 32, 64, 50, 100]
     target_height = 6
     target_width = 4
     expected_data = {}
     expected_data[image_ops.ResizeMethod.BILINEAR] = [
-        64.0, 48.0, 32.0, 32.0,
-        48.0, 48.0, 48.0, 48.0,
-        32.0, 48.0, 64.0, 64.0,
-        41.0, 61.5, 82.0, 82.0,
-        50.0, 75.0, 100.0, 100.0,
-        50.0, 75.0, 100.0, 100.0]
+        64.0, 48.0, 32.0, 32.0, 48.0, 48.0, 48.0, 48.0, 32.0, 48.0, 64.0, 64.0,
+        41.0, 61.5, 82.0, 82.0, 50.0, 75.0, 100.0, 100.0, 50.0, 75.0, 100.0,
+        100.0
+    ]
     expected_data[image_ops.ResizeMethod.NEAREST_NEIGHBOR] = [
-        64.0, 64.0, 32.0, 32.0,
-        64.0, 64.0, 32.0, 32.0,
-        32.0, 32.0, 64.0, 64.0,
-        32.0, 32.0, 64.0, 64.0,
-        50.0, 50.0, 100.0, 100.0,
-        50.0, 50.0, 100.0, 100.0]
+        64.0, 64.0, 32.0, 32.0, 64.0, 64.0, 32.0, 32.0, 32.0, 32.0, 64.0, 64.0,
+        32.0, 32.0, 64.0, 64.0, 50.0, 50.0, 100.0, 100.0, 50.0, 50.0, 100.0,
+        100.0
+    ]
     expected_data[image_ops.ResizeMethod.AREA] = [
-        64.0, 64.0, 32.0, 32.0,
-        64.0, 64.0, 32.0, 32.0,
-        32.0, 32.0, 64.0, 64.0,
-        32.0, 32.0, 64.0, 64.0,
-        50.0, 50.0, 100.0, 100.0,
-        50.0, 50.0, 100.0, 100.0]
+        64.0, 64.0, 32.0, 32.0, 64.0, 64.0, 32.0, 32.0, 32.0, 32.0, 64.0, 64.0,
+        32.0, 32.0, 64.0, 64.0, 50.0, 50.0, 100.0, 100.0, 50.0, 50.0, 100.0,
+        100.0
+    ]
 
     for nptype in self.TYPES:
       for opt in [
           image_ops.ResizeMethod.BILINEAR,
-          image_ops.ResizeMethod.NEAREST_NEIGHBOR,
-          image_ops.ResizeMethod.AREA]:
+          image_ops.ResizeMethod.NEAREST_NEIGHBOR, image_ops.ResizeMethod.AREA
+      ]:
         with self.test_session(use_gpu=True):
           img_np = np.array(data, dtype=nptype).reshape(img_shape)
           image = constant_op.constant(img_np, shape=img_shape)
@@ -1939,41 +2282,29 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
 
   def testResizeUpAlignCornersTrue(self):
     img_shape = [1, 3, 2, 1]
-    data = [6, 3,
-            3, 6,
-            6, 9]
+    data = [6, 3, 3, 6, 6, 9]
     target_height = 5
     target_width = 4
     expected_data = {}
     expected_data[image_ops.ResizeMethod.BILINEAR] = [
-        6.0, 5.0, 4.0, 3.0,
-        4.5, 4.5, 4.5, 4.5,
-        3.0, 4.0, 5.0, 6.0,
-        4.5, 5.5, 6.5, 7.5,
-        6.0, 7.0, 8.0, 9.0
+        6.0, 5.0, 4.0, 3.0, 4.5, 4.5, 4.5, 4.5, 3.0, 4.0, 5.0, 6.0, 4.5, 5.5,
+        6.5, 7.5, 6.0, 7.0, 8.0, 9.0
     ]
     expected_data[image_ops.ResizeMethod.NEAREST_NEIGHBOR] = [
-        6.0, 6.0, 3.0, 3.0,
-        3.0, 3.0, 6.0, 6.0,
-        3.0, 3.0, 6.0, 6.0,
-        6.0, 6.0, 9.0, 9.0,
-        6.0, 6.0, 9.0, 9.0
+        6.0, 6.0, 3.0, 3.0, 3.0, 3.0, 6.0, 6.0, 3.0, 3.0, 6.0, 6.0, 6.0, 6.0,
+        9.0, 9.0, 6.0, 6.0, 9.0, 9.0
     ]
     # TODO(b/37749740): Improve alignment of ResizeMethod.AREA when
     # align_corners=True.
     expected_data[image_ops.ResizeMethod.AREA] = [
-        6.0, 6.0, 6.0, 3.0,
-        6.0, 6.0, 6.0, 3.0,
-        3.0, 3.0, 3.0, 6.0,
-        3.0, 3.0, 3.0, 6.0,
-        6.0, 6.0, 6.0, 9.0
+        6.0, 6.0, 6.0, 3.0, 6.0, 6.0, 6.0, 3.0, 3.0, 3.0, 3.0, 6.0, 3.0, 3.0,
+        3.0, 6.0, 6.0, 6.0, 6.0, 9.0
     ]
 
     for nptype in self.TYPES:
       for opt in [
           image_ops.ResizeMethod.BILINEAR,
-          image_ops.ResizeMethod.NEAREST_NEIGHBOR,
-          image_ops.ResizeMethod.AREA
+          image_ops.ResizeMethod.NEAREST_NEIGHBOR, image_ops.ResizeMethod.AREA
       ]:
         with self.test_session(use_gpu=True):
           img_np = np.array(data, dtype=nptype).reshape(img_shape)
@@ -1987,23 +2318,21 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
 
   def testResizeUpBicubic(self):
     img_shape = [1, 6, 6, 1]
-    data = [128, 128, 64, 64, 128, 128, 64, 64,
-            64, 64, 128, 128, 64, 64, 128, 128,
-            50, 50, 100, 100, 50, 50, 100, 100,
-            50, 50, 100, 100, 50, 50, 100, 100,
-            50, 50, 100, 100]
+    data = [
+        128, 128, 64, 64, 128, 128, 64, 64, 64, 64, 128, 128, 64, 64, 128, 128,
+        50, 50, 100, 100, 50, 50, 100, 100, 50, 50, 100, 100, 50, 50, 100, 100,
+        50, 50, 100, 100
+    ]
     img_np = np.array(data, dtype=np.uint8).reshape(img_shape)
 
     target_height = 8
     target_width = 8
-    expected_data = [128, 135, 96, 55, 64, 114, 134, 128,
-                     78, 81, 68, 52, 57, 118, 144, 136,
-                     55, 49, 79, 109, 103, 89, 83, 84,
-                     74, 70, 95, 122, 115, 69, 49, 55,
-                     100, 105, 75, 43, 50, 89, 105, 100,
-                     57, 54, 74, 96, 91, 65, 55, 58,
-                     70, 69, 75, 81, 80, 72, 69, 70,
-                     105, 112, 75, 36, 45, 92, 111, 105]
+    expected_data = [
+        128, 135, 96, 55, 64, 114, 134, 128, 78, 81, 68, 52, 57, 118, 144, 136,
+        55, 49, 79, 109, 103, 89, 83, 84, 74, 70, 95, 122, 115, 69, 49, 55, 100,
+        105, 75, 43, 50, 89, 105, 100, 57, 54, 74, 96, 91, 65, 55, 58, 70, 69,
+        75, 81, 80, 72, 69, 70, 105, 112, 75, 36, 45, 92, 111, 105
+    ]
 
     with self.test_session(use_gpu=True):
       image = constant_op.constant(img_np, shape=img_shape)
@@ -2016,20 +2345,17 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
 
   def testResizeDownArea(self):
     img_shape = [1, 6, 6, 1]
-    data = [128, 64, 32, 16, 8, 4,
-            4, 8, 16, 32, 64, 128,
-            128, 64, 32, 16, 8, 4,
-            5, 10, 15, 20, 25, 30,
-            30, 25, 20, 15, 10, 5,
-            5, 10, 15, 20, 25, 30]
+    data = [
+        128, 64, 32, 16, 8, 4, 4, 8, 16, 32, 64, 128, 128, 64, 32, 16, 8, 4, 5,
+        10, 15, 20, 25, 30, 30, 25, 20, 15, 10, 5, 5, 10, 15, 20, 25, 30
+    ]
     img_np = np.array(data, dtype=np.uint8).reshape(img_shape)
 
     target_height = 4
     target_width = 4
-    expected_data = [73, 33, 23, 39,
-                     73, 33, 23, 39,
-                     14, 16, 19, 21,
-                     14, 16, 19, 21]
+    expected_data = [
+        73, 33, 23, 39, 73, 33, 23, 39, 14, 16, 19, 21, 14, 16, 19, 21
+    ]
 
     with self.test_session(use_gpu=True):
       image = constant_op.constant(img_np, shape=img_shape)
@@ -2111,6 +2437,13 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     self._assertShapeInference([59, 60, None], [55, 66], [55, 66, None])
     self._assertShapeInference([None, None, None], [55, 66], [55, 66, None])
 
+  def testNameScope(self):
+    img_shape = [1, 3, 2, 1]
+    with self.test_session(use_gpu=True):
+      single_image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3])
+      y = image_ops.resize_images(single_image, [55, 66])
+      self.assertTrue(y.op.name.startswith("resize_images"))
+
 
 class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
 
@@ -2182,133 +2515,93 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
 
   def testPad(self):
     # Pad even along col.
-    x = [1, 2, 3, 4,
-         5, 6, 7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [2, 4, 1]
 
-    y = [0, 1, 2, 3, 4, 0,
-         0, 5, 6, 7, 8, 0]
+    y = [0, 1, 2, 3, 4, 0, 0, 5, 6, 7, 8, 0]
     y_shape = [2, 6, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
     # Pad odd along col.
-    x = [1, 2, 3, 4,
-         5, 6, 7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [2, 4, 1]
 
-    y = [0, 1, 2, 3, 4, 0, 0,
-         0, 5, 6, 7, 8, 0, 0]
+    y = [0, 1, 2, 3, 4, 0, 0, 0, 5, 6, 7, 8, 0, 0]
     y_shape = [2, 7, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
     # Pad even along row.
-    x = [1, 2, 3, 4,
-         5, 6, 7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [2, 4, 1]
 
-    y = [0, 0, 0, 0,
-         1, 2, 3, 4,
-         5, 6, 7, 8,
-         0, 0, 0, 0]
+    y = [0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0]
     y_shape = [4, 4, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
     # Pad odd along row.
-    x = [1, 2, 3, 4,
-         5, 6, 7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [2, 4, 1]
 
-    y = [0, 0, 0, 0,
-         1, 2, 3, 4,
-         5, 6, 7, 8,
-         0, 0, 0, 0,
-         0, 0, 0, 0]
+    y = [0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0]
     y_shape = [5, 4, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
   def testCrop(self):
     # Crop even along col.
-    x = [1, 2, 3, 4,
-         5, 6, 7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [2, 4, 1]
 
-    y = [2, 3,
-         6, 7]
+    y = [2, 3, 6, 7]
     y_shape = [2, 2, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
     # Crop odd along col.
-    x = [1, 2, 3, 4,  5,  6,
-         7, 8, 9, 10, 11, 12]
+    x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
     x_shape = [2, 6, 1]
 
-    y = [2, 3, 4,
-         8, 9, 10]
+    y = [2, 3, 4, 8, 9, 10]
     y_shape = [2, 3, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
     # Crop even along row.
-    x = [1, 2,
-         3, 4,
-         5, 6,
-         7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [4, 2, 1]
 
-    y = [3, 4,
-         5, 6]
+    y = [3, 4, 5, 6]
     y_shape = [2, 2, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
     # Crop odd along row.
-    x = [1,  2,
-         3,  4,
-         5,  6,
-         7,  8,
-         9,  10,
-         11, 12,
-         13, 14,
-         15, 16]
+    x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
     x_shape = [8, 2, 1]
 
-    y = [3,  4,
-         5,  6,
-         7,  8,
-         9,  10,
-         11, 12]
+    y = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
     y_shape = [5, 2, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
   def testCropAndPad(self):
     # Pad along row but crop along col.
-    x = [1, 2, 3, 4,
-         5, 6, 7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [2, 4, 1]
 
-    y = [0, 0,
-         2, 3,
-         6, 7,
-         0, 0]
+    y = [0, 0, 2, 3, 6, 7, 0, 0]
     y_shape = [4, 2, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
 
     # Crop along row but pad along col.
-    x = [1, 2,
-         3, 4,
-         5, 6,
-         7, 8]
+    x = [1, 2, 3, 4, 5, 6, 7, 8]
     x_shape = [4, 2, 1]
 
-    y = [0, 3, 4, 0,
-         0, 5, 6, 0]
+    y = [0, 3, 4, 0, 0, 5, 6, 0]
     y_shape = [2, 4, 1]
 
     self._assertReturns(x, x_shape, y, y_shape)
@@ -2386,6 +2679,11 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
     self._assertRaises(x, x_shape, target_height, target_width,
                        "target_width must be > 0")
 
+  def testNameScope(self):
+    image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3])
+    y = image_ops.resize_image_with_crop_or_pad(image, 55, 66)
+    self.assertTrue(y.op.name.startswith("resize_image_with_crop_or_pad"))
+
 
 def _SimpleColorRamp():
   """Build a simple color ramp RGB image."""
@@ -2647,10 +2945,9 @@ class PngTest(test_util.TensorFlowTestCase):
 
 class GifTest(test_util.TensorFlowTestCase):
 
-  def testValid(self):
+  def _testValid(self, filename):
     # Read some real GIFs
     prefix = "tensorflow/core/lib/gif/testdata/"
-    filename = "scan.gif"
     WIDTH = 20
     HEIGHT = 40
     STRIDE = 5
@@ -2677,16 +2974,9 @@ class GifTest(test_util.TensorFlowTestCase):
 
         self.assertAllClose(frame, gt)
 
-  def testInValid(self):
-    # Read some real GIFs
-    prefix = "tensorflow/core/lib/gif/testdata/"
-    filename = "optimized.gif"
-
-    with self.test_session(use_gpu=True) as sess:
-      gif0 = io_ops.read_file(prefix + filename)
-      image0 = image_ops.decode_gif(gif0)
-      with self.assertRaises(errors.InvalidArgumentError):
-        gif0, image0 = sess.run([gif0, image0])
+  def testValid(self):
+    self._testValid("scan.gif")
+    self._testValid("optimized.gif")
 
   def testShape(self):
     with self.test_session(use_gpu=True) as sess:
@@ -2706,8 +2996,9 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
       y = image_ops.convert_image_dtype(image, output_dtype)
       self.assertTrue(y.dtype == output_dtype)
       self.assertAllClose(y.eval(), y_np, atol=1e-5)
-      if output_dtype in [dtypes.float32, dtypes.float64,
-                          dtypes.int32, dtypes.int64]:
+      if output_dtype in [
+          dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64
+      ]:
         y_saturate = image_ops.convert_image_dtype(
             image, output_dtype, saturate=True)
         self.assertTrue(y_saturate.dtype == output_dtype)
@@ -2727,8 +3018,8 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       self._convert([0, 255], dtypes.uint8, dtypes.int16, [0, 255 * 128])
       self._convert([0, 32767], dtypes.int16, dtypes.uint8, [0, 255])
-      self._convert([0, 2 ** 32], dtypes.int64, dtypes.int32, [0, 1])
-      self._convert([0, 1], dtypes.int32, dtypes.int64, [0, 2 ** 32])
+      self._convert([0, 2**32], dtypes.int64, dtypes.int32, [0, 1])
+      self._convert([0, 1], dtypes.int32, dtypes.int64, [0, 2**32])
 
   def testConvertBetweenFloat(self):
     # Make sure converting to between float types does nothing interesting
@@ -2749,20 +3040,14 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
   def testConvertBetweenInt16AndInt8(self):
     with self.test_session(use_gpu=True):
       # uint8, uint16
-      self._convert([0, 255 * 256], dtypes.uint16, dtypes.uint8,
-                    [0, 255])
-      self._convert([0, 255], dtypes.uint8, dtypes.uint16,
-                    [0, 255 * 256])
+      self._convert([0, 255 * 256], dtypes.uint16, dtypes.uint8, [0, 255])
+      self._convert([0, 255], dtypes.uint8, dtypes.uint16, [0, 255 * 256])
       # int8, uint16
-      self._convert([0, 127 * 2 * 256], dtypes.uint16, dtypes.int8,
-                    [0, 127])
-      self._convert([0, 127], dtypes.int8, dtypes.uint16,
-                    [0, 127 * 2 * 256])
+      self._convert([0, 127 * 2 * 256], dtypes.uint16, dtypes.int8, [0, 127])
+      self._convert([0, 127], dtypes.int8, dtypes.uint16, [0, 127 * 2 * 256])
       # int16, uint16
-      self._convert([0, 255 * 256], dtypes.uint16, dtypes.int16,
-                    [0, 255 * 128])
-      self._convert([0, 255 * 128], dtypes.int16, dtypes.uint16,
-                    [0, 255 * 256])
+      self._convert([0, 255 * 256], dtypes.uint16, dtypes.int16, [0, 255 * 128])
+      self._convert([0, 255 * 128], dtypes.int16, dtypes.uint16, [0, 255 * 256])
 
 
 class TotalVariationTest(test_util.TensorFlowTestCase):
@@ -2895,20 +3180,17 @@ class TotalVariationTest(test_util.TensorFlowTestCase):
     # The following are the sum of absolute differences between the pixels.
     # sum row dif = (4-1) + (7-2) = 3 + 5 = 8
     # sum col dif = (2-1) + (7-4) = 1 + 3 = 4
-    r = [[1, 2],
-         [4, 7]]
+    r = [[1, 2], [4, 7]]
 
     # Blue color channel.
     # sum row dif = 18 + 29 = 47
     # sum col dif = 7 + 18 = 25
-    g = [[11, 18],
-         [29, 47]]
+    g = [[11, 18], [29, 47]]
 
     # Green color channel.
     # sum row dif = 120 + 193 = 313
     # sum col dif = 47 + 120 = 167
-    b = [[73, 120],
-         [193, 313]]
+    b = [[73, 120], [193, 313]]
 
     # Combine the 3 color channels into a single 3-dim array.
     # The shape is (2, 2, 3) corresponding to (height, width and color).
@@ -2937,9 +3219,7 @@ class TotalVariationTest(test_util.TensorFlowTestCase):
 
     # Combine these 3 images into a single array of shape (3, 2, 2, 3)
     # where the first dimension is for the image-number.
-    multi = np.vstack((a[np.newaxis, :],
-                       b[np.newaxis, :],
-                       c[np.newaxis, :]))
+    multi = np.vstack((a[np.newaxis, :], b[np.newaxis, :], c[np.newaxis, :]))
 
     # Check that TensorFlow correctly calculates the total variation
     # for each image individually and returns the correct array.
@@ -2995,6 +3275,49 @@ class NonMaxSuppressionTest(test_util.TensorFlowTestCase):
           boxes, scores, max_output_size, iou_threshold).eval()
       self.assertAllClose(selected_indices, [3, 0, 5])
 
+  def testInvalidShape(self):
+    # The boxes should be 2D of shape [num_boxes, 4].
+    with self.assertRaisesRegexp(ValueError,
+                                 "Shape must be rank 2 but is rank 1"):
+      boxes = constant_op.constant([0.0, 0.0, 1.0, 1.0])
+      scores = constant_op.constant([0.9])
+      image_ops.non_max_suppression(boxes, scores, 3, 0.5)
+
+    with self.assertRaisesRegexp(ValueError, "Dimension must be 4 but is 3"):
+      boxes = constant_op.constant([[0.0, 0.0, 1.0]])
+      scores = constant_op.constant([0.9])
+      image_ops.non_max_suppression(boxes, scores, 3, 0.5)
+
+    # The boxes is of shape [num_boxes, 4], and the scores is
+    # of shape [num_boxes]. So an error will thrown.
+    with self.assertRaisesRegexp(
+        ValueError, 'Dimensions must be equal, but are 1 and 2'):
+      boxes = constant_op.constant([[0.0, 0.0, 1.0, 1.0]])
+      scores = constant_op.constant([0.9, 0.75])
+      selected_indices = image_ops.non_max_suppression(
+          boxes, scores, 3, 0.5)
+
+    # The scores should be 1D of shape [num_boxes].
+    with self.assertRaisesRegexp(ValueError,
+                                 "Shape must be rank 1 but is rank 2"):
+      boxes = constant_op.constant([[0.0, 0.0, 1.0, 1.0]])
+      scores = constant_op.constant([[0.9]])
+      image_ops.non_max_suppression(boxes, scores, 3, 0.5)
+
+    # The max_output_size should be a scaler (0-D).
+    with self.assertRaisesRegexp(ValueError,
+                                 "Shape must be rank 0 but is rank 1"):
+      boxes = constant_op.constant([[0.0, 0.0, 1.0, 1.0]])
+      scores = constant_op.constant([0.9])
+      image_ops.non_max_suppression(boxes, scores, [3], 0.5)
+
+    # The iou_threshold should be a scaler (0-D).
+    with self.assertRaisesRegexp(ValueError,
+                                 "Shape must be rank 0 but is rank 2"):
+      boxes = constant_op.constant([[0.0, 0.0, 1.0, 1.0]])
+      scores = constant_op.constant([0.9])
+      image_ops.non_max_suppression(boxes, scores, 3, [[0.5]])
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 9eea3c21f89791ab4629e30ca895ef75af32f920..c7502d0fda5c38079362d30877a917e3965e6ca0 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -34,6 +34,8 @@ from __future__ import print_function
 
 import math
 
+import numpy as np
+
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
@@ -42,8 +44,10 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.util.deprecation import deprecated
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("keras.initializers.Initializer")
 class Initializer(object):
   """Initializer base class: all initializers inherit from this class.
   """
@@ -81,6 +85,8 @@ class Initializer(object):
     return cls(**config)
 
 
+@tf_export("keras.initializers.Zeros", "initializers.zeros",
+           "zeros_initializer")
 class Zeros(Initializer):
   """Initializer that generates tensors initialized to 0."""
 
@@ -96,6 +102,7 @@ class Zeros(Initializer):
     return {"dtype": self.dtype.name}
 
 
+@tf_export("keras.initializers.Ones", "initializers.ones", "ones_initializer")
 class Ones(Initializer):
   """Initializer that generates tensors initialized to 1."""
 
@@ -111,6 +118,8 @@ class Ones(Initializer):
     return {"dtype": self.dtype.name}
 
 
+@tf_export("keras.initializers.Constant", "initializers.constant",
+           "constant_initializer")
 class Constant(Initializer):
   """Initializer that generates tensors with constant values.
 
@@ -128,14 +137,17 @@ class Constant(Initializer):
   tensor shape, the initializer will raise a `ValueError`.
 
   Args:
-    value: A Python scalar, list of values, or a N-dimensional numpy array. All
-      elements of the initialized variable will be set to the corresponding
-      value in the `value` argument.
+    value: A Python scalar, list or tuple of values, or a N-dimensional numpy
+      array. All elements of the initialized variable will be set to the
+      corresponding value in the `value` argument.
     dtype: The data type.
     verify_shape: Boolean that enables verification of the shape of `value`. If
       `True`, the initializer will throw an error if the shape of `value` is not
       compatible with the shape of the initialized tensor.
 
+  Raises:
+    TypeError: If the input `value` is not one of the expected types.
+
   Examples:
     The following example can be rewritten using a numpy.ndarray instead
     of the `value` list, even reshaped, as shown in the two commented lines
@@ -187,6 +199,11 @@ class Constant(Initializer):
   """
 
   def __init__(self, value=0, dtype=dtypes.float32, verify_shape=False):
+    if not (np.isscalar(value) or isinstance(value, (list, tuple, np.ndarray))):
+      raise TypeError(
+          "Invalid type for initial value: %s (expected Python scalar, list or "
+          "tuple of values, or numpy.ndarray)." % type(value))
+
     self.value = value
     self.dtype = dtypes.as_dtype(dtype)
     self._verify_shape = verify_shape
@@ -207,6 +224,8 @@ class Constant(Initializer):
     return {"value": self.value, "dtype": self.dtype.name}
 
 
+@tf_export("keras.initializers.RandomUniform", "initializers.random_uniform",
+           "random_uniform_initializer")
 class RandomUniform(Initializer):
   """Initializer that generates tensors with a uniform distribution.
 
@@ -242,6 +261,8 @@ class RandomUniform(Initializer):
     }
 
 
+@tf_export("keras.initializers.RandomNormal", "initializers.random_normal",
+           "random_normal_initializer")
 class RandomNormal(Initializer):
   """Initializer that generates tensors with a normal distribution.
 
@@ -277,6 +298,8 @@ class RandomNormal(Initializer):
     }
 
 
+@tf_export("keras.initializers.TruncatedNormal",
+           "initializers.truncated_normal", "truncated_normal_initializer")
 class TruncatedNormal(Initializer):
   """Initializer that generates a truncated normal distribution.
 
@@ -317,6 +340,8 @@ class TruncatedNormal(Initializer):
     }
 
 
+@tf_export("initializers.uniform_unit_scaling",
+           "uniform_unit_scaling_initializer")
 class UniformUnitScaling(Initializer):
   """Initializer that generates tensors without scaling variance.
 
@@ -375,6 +400,8 @@ class UniformUnitScaling(Initializer):
     return {"factor": self.factor, "seed": self.seed, "dtype": self.dtype.name}
 
 
+@tf_export("keras.initializers.VarianceScaling",
+           "initializers.variance_scaling", "variance_scaling_initializer")
 class VarianceScaling(Initializer):
   """Initializer capable of adapting its scale to the shape of weights tensors.
 
@@ -454,6 +481,8 @@ class VarianceScaling(Initializer):
     }
 
 
+@tf_export("keras.initializers.Orthogonal", "initializers.orthogonal",
+           "orthogonal_initializer")
 class Orthogonal(Initializer):
   """Initializer that generates an orthogonal matrix.
 
@@ -513,6 +542,7 @@ class Orthogonal(Initializer):
     return {"gain": self.gain, "seed": self.seed, "dtype": self.dtype.name}
 
 
+@tf_export("keras.initializers.Identity", "initializers.identity")
 class Identity(Initializer):
   """Initializer that generates the identity matrix.
 
@@ -560,6 +590,7 @@ identity_initializer = Identity
 # pylint: enable=invalid-name
 
 
+@tf_export("glorot_uniform_initializer")
 def glorot_uniform_initializer(seed=None, dtype=dtypes.float32):
   """The Glorot uniform initializer, also called Xavier uniform initializer.
 
@@ -583,6 +614,7 @@ def glorot_uniform_initializer(seed=None, dtype=dtypes.float32):
       scale=1.0, mode="fan_avg", distribution="uniform", seed=seed, dtype=dtype)
 
 
+@tf_export("glorot_normal_initializer")
 def glorot_normal_initializer(seed=None, dtype=dtypes.float32):
   """The Glorot normal initializer, also called Xavier normal initializer.
 
diff --git a/tensorflow/python/ops/io_ops.py b/tensorflow/python/ops/io_ops.py
index 670bb9a9c29e8450b101b04ce781dc97ceb78398..5e70b3186f382a0c795b1795b2db27bb2058ee41 100644
--- a/tensorflow/python/ops/io_ops.py
+++ b/tensorflow/python/ops/io_ops.py
@@ -79,6 +79,7 @@ from tensorflow.python.ops import gen_io_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_io_ops import *
+from tensorflow.python.util.tf_export import tf_export
 # pylint: enable=wildcard-import
 
 
@@ -140,6 +141,7 @@ def _restore_slice(file_pattern, tensor_name, shape_and_slice, tensor_type,
       preferred_shard, name=name)
 
 
+@tf_export("ReaderBase")
 class ReaderBase(object):
   """Base class for different Reader types, that produce a record every step.
 
@@ -354,6 +356,7 @@ ops.NotDifferentiable("ReaderRestoreState")
 ops.NotDifferentiable("ReaderReset")
 
 
+@tf_export("WholeFileReader")
 class WholeFileReader(ReaderBase):
   """A Reader that outputs the entire contents of a file as a value.
 
@@ -381,6 +384,7 @@ class WholeFileReader(ReaderBase):
 ops.NotDifferentiable("WholeFileReader")
 
 
+@tf_export("TextLineReader")
 class TextLineReader(ReaderBase):
   """A Reader that outputs the lines of a file delimited by newlines.
 
@@ -410,6 +414,7 @@ class TextLineReader(ReaderBase):
 ops.NotDifferentiable("TextLineReader")
 
 
+@tf_export("FixedLengthRecordReader")
 class FixedLengthRecordReader(ReaderBase):
   """A Reader that outputs fixed-length records from a file.
 
@@ -452,6 +457,7 @@ class FixedLengthRecordReader(ReaderBase):
 ops.NotDifferentiable("FixedLengthRecordReader")
 
 
+@tf_export("TFRecordReader")
 class TFRecordReader(ReaderBase):
   """A Reader that outputs the records from a TFRecords file.
 
@@ -482,6 +488,7 @@ class TFRecordReader(ReaderBase):
 ops.NotDifferentiable("TFRecordReader")
 
 
+@tf_export("LMDBReader")
 class LMDBReader(ReaderBase):
   """A Reader that outputs the records from a LMDB file.
 
@@ -506,6 +513,7 @@ class LMDBReader(ReaderBase):
 ops.NotDifferentiable("LMDBReader")
 
 
+@tf_export("IdentityReader")
 class IdentityReader(ReaderBase):
   """A Reader that outputs the queued work as both the key and value.
 
diff --git a/tensorflow/python/ops/linalg/linalg.py b/tensorflow/python/ops/linalg/linalg.py
index 5369007a56c89ef8601f8144c2fe18717e2e78fe..14319025ff275944cf34e30128df96254d06072b 100644
--- a/tensorflow/python/ops/linalg/linalg.py
+++ b/tensorflow/python/ops/linalg/linalg.py
@@ -41,4 +41,5 @@ del gen_linalg_ops
 del linalg_ops
 del math_ops
 del special_math_ops
+del tf_export
 # pylint: enable=undefined-variable
diff --git a/tensorflow/python/ops/linalg/linalg_impl.py b/tensorflow/python/ops/linalg/linalg_impl.py
index bf15f0e2e55385032b194c7718e175114e77dd7b..d5bd916f80d8a03e5423c43d1ca039bc4dceff5e 100644
--- a/tensorflow/python/ops/linalg/linalg_impl.py
+++ b/tensorflow/python/ops/linalg/linalg_impl.py
@@ -24,6 +24,7 @@ from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import special_math_ops
+from tensorflow.python.util.tf_export import tf_export
 
 # Linear algebra ops.
 band_part = array_ops.matrix_band_part
@@ -41,6 +42,7 @@ einsum = special_math_ops.einsum
 expm = gen_linalg_ops._matrix_exponential
 eye = linalg_ops.eye
 inv = linalg_ops.matrix_inverse
+logm = gen_linalg_ops._matrix_logarithm
 lstsq = linalg_ops.matrix_solve_ls
 norm = linalg_ops.norm
 qr = linalg_ops.qr
@@ -53,6 +55,7 @@ transpose = array_ops.matrix_transpose
 triangular_solve = linalg_ops.matrix_triangular_solve
 
 
+@tf_export('linalg.logdet')
 def logdet(matrix, name=None):
   """Computes log of the determinant of a hermitian positive definite matrix.
 
@@ -64,8 +67,8 @@ def logdet(matrix, name=None):
   ```
 
   Args:
-    matrix:  A `Tensor`. Must be `float32`, `float64`, `complex64`, or
-      `complex128` with shape `[..., M, M]`.
+    matrix:  A `Tensor`. Must be `float16`, `float32`, `float64`, `complex64`,
+      or `complex128` with shape `[..., M, M]`.
     name:  A name to give this `Op`.  Defaults to `logdet`.
 
   Returns:
@@ -85,6 +88,7 @@ def logdet(matrix, name=None):
         reduction_indices=[-1])
 
 
+@tf_export('linalg.adjoint')
 def adjoint(matrix, name=None):
   """Transposes the last two dimensions of and conjugates tensor `matrix`.
 
@@ -98,8 +102,8 @@ def adjoint(matrix, name=None):
                         #  [3 - 3j, 6 - 6j]]
 
   Args:
-    matrix:  A `Tensor`. Must be `float32`, `float64`, `complex64`, or
-      `complex128` with shape `[..., M, M]`.
+    matrix:  A `Tensor`. Must be `float16`, `float32`, `float64`, `complex64`,
+      or `complex128` with shape `[..., M, M]`.
     name:  A name to give this `Op` (optional).
 
   Returns:
diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py
index 27e0f17020afa0fd44ec11c49b7a77d4426933dd..957a7959181efe3bbc319e62582053329b763dc3 100644
--- a/tensorflow/python/ops/linalg/linear_operator.py
+++ b/tensorflow/python/ops/linalg/linear_operator.py
@@ -32,11 +32,13 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg_impl as linalg
 from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 __all__ = ["LinearOperator"]
 
 
 # TODO(langmore) Use matrix_solve_ls for singular or non-square matrices.
+@tf_export("linalg.LinearOperator")
 class LinearOperator(object):
   """Base class defining a [batch of] linear operator[s].
 
@@ -478,7 +480,6 @@ class LinearOperator(object):
           cond,
           self._max_condition_number_to_be_non_singular(),
           message="Singular matrix up to precision epsilon.")
-    raise NotImplementedError("assert_non_singular is not implemented.")
 
   def _max_condition_number_to_be_non_singular(self):
     """Return the maximum condition number that we consider nonsingular."""
diff --git a/tensorflow/python/ops/linalg/linear_operator_composition.py b/tensorflow/python/ops/linalg/linear_operator_composition.py
index 14411291d4fddeb2242e243d9a611e9c2fcd171a..ecd30e4d7e4dd7cfd4b109ad6e60aacb172700f6 100644
--- a/tensorflow/python/ops/linalg/linear_operator_composition.py
+++ b/tensorflow/python/ops/linalg/linear_operator_composition.py
@@ -25,10 +25,12 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.util.tf_export import tf_export
 
 __all__ = ["LinearOperatorComposition"]
 
 
+@tf_export("linalg.LinearOperatorComposition")
 class LinearOperatorComposition(linear_operator.LinearOperator):
   """Composes one or more `LinearOperators`.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_diag.py b/tensorflow/python/ops/linalg/linear_operator_diag.py
index a4724d030f388230cf85cc68bf60b6553b409c17..b3ec3d5b7cf45ac0b2672eea9a4586b2c3295897 100644
--- a/tensorflow/python/ops/linalg/linear_operator_diag.py
+++ b/tensorflow/python/ops/linalg/linear_operator_diag.py
@@ -26,10 +26,12 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg_impl as linalg
 from tensorflow.python.ops.linalg import linear_operator
 from tensorflow.python.ops.linalg import linear_operator_util
+from tensorflow.python.util.tf_export import tf_export
 
 __all__ = ["LinearOperatorDiag",]
 
 
+@tf_export("linalg.LinearOperatorDiag")
 class LinearOperatorDiag(linear_operator.LinearOperator):
   """`LinearOperator` acting like a [batch] square diagonal matrix.
 
@@ -121,8 +123,8 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
 
     Args:
       diag:  Shape `[B1,...,Bb, N]` `Tensor` with `b >= 0` `N >= 0`.
-        The diagonal of the operator.  Allowed dtypes: `float32`, `float64`,
-          `complex64`, `complex128`.
+        The diagonal of the operator.  Allowed dtypes: `float16`, `float32`,
+          `float64`, `complex64`, `complex128`.
       is_non_singular:  Expect that this operator is non-singular.
       is_self_adjoint:  Expect that this operator is equal to its hermitian
         transpose.  If `diag.dtype` is real, this is auto-set to `True`.
@@ -167,7 +169,12 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
   def _check_diag(self, diag):
     """Static check of diag."""
     allowed_dtypes = [
-        dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128]
+        dtypes.float16,
+        dtypes.float32,
+        dtypes.float64,
+        dtypes.complex64,
+        dtypes.complex128,
+    ]
 
     dtype = diag.dtype
     if dtype not in allowed_dtypes:
diff --git a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
index dd4c7cb0413013f3f54f6085a7adcb523755a603..f979fb37d6c69a2683af08a1f6722b98da0b6650 100644
--- a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
+++ b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
@@ -23,10 +23,12 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.util.tf_export import tf_export
 
 __all__ = ["LinearOperatorFullMatrix"]
 
 
+@tf_export("linalg.LinearOperatorFullMatrix")
 class LinearOperatorFullMatrix(linear_operator.LinearOperator):
   """`LinearOperator` that wraps a [batch] matrix.
 
@@ -114,7 +116,8 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
 
     Args:
       matrix:  Shape `[B1,...,Bb, M, N]` with `b >= 0`, `M, N >= 0`.
-        Allowed dtypes: `float32`, `float64`, `complex64`, `complex128`.
+        Allowed dtypes: `float16`, `float32`, `float64`, `complex64`,
+        `complex128`.
       is_non_singular:  Expect that this operator is non-singular.
       is_self_adjoint:  Expect that this operator is equal to its hermitian
         transpose.
@@ -147,7 +150,12 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
   def _check_matrix(self, matrix):
     """Static check of the `matrix` argument."""
     allowed_dtypes = [
-        dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128]
+        dtypes.float16,
+        dtypes.float32,
+        dtypes.float64,
+        dtypes.complex64,
+        dtypes.complex128,
+    ]
 
     matrix = ops.convert_to_tensor(matrix, name="matrix")
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_identity.py b/tensorflow/python/ops/linalg/linear_operator_identity.py
index 740c6c811f2d98f62c200cda7242c6ad00de499d..50f3d407e85e4cca22ad6326931b5a2a736819a8 100644
--- a/tensorflow/python/ops/linalg/linear_operator_identity.py
+++ b/tensorflow/python/ops/linalg/linear_operator_identity.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg_impl as linalg
 from tensorflow.python.ops.linalg import linear_operator
 from tensorflow.python.ops.linalg import linear_operator_util
+from tensorflow.python.util.tf_export import tf_export
 
 __all__ = [
     "LinearOperatorIdentity",
@@ -97,6 +98,7 @@ class BaseLinearOperatorIdentity(linear_operator.LinearOperator):
     return array_ops.ones(shape=d_shape, dtype=self.dtype)
 
 
+@tf_export("linalg.LinearOperatorIdentity")
 class LinearOperatorIdentity(BaseLinearOperatorIdentity):
   """`LinearOperator` acting like a [batch] square identity matrix.
 
@@ -460,6 +462,7 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
                        "%s" % self._batch_shape_static)
 
 
+@tf_export("linalg.LinearOperatorScaledIdentity")
 class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
   """`LinearOperator` acting like a scaled [batch] identity matrix `A = c I`.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py b/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
index ad3bb2efa94bfa9751c31ff0c704aad8faa58ba7..be911029095920d424ac90b406e7b85b73884b3b 100644
--- a/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
+++ b/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
@@ -27,12 +27,14 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linear_operator
 from tensorflow.python.ops.linalg import linear_operator_diag
 from tensorflow.python.ops.linalg import linear_operator_identity
+from tensorflow.python.util.tf_export import tf_export
 
 __all__ = [
     "LinearOperatorLowRankUpdate",
 ]
 
 
+@tf_export("linalg.LinearOperatorLowRankUpdate")
 class LinearOperatorLowRankUpdate(linear_operator.LinearOperator):
   """Perturb a `LinearOperator` with a rank `K` update.
 
@@ -150,8 +152,8 @@ class LinearOperatorLowRankUpdate(linear_operator.LinearOperator):
     `is_X` matrix property hints, which will trigger the appropriate code path.
 
     Args:
-      base_operator:  Shape `[B1,...,Bb, M, N]` real `float32` or `float64`
-        `LinearOperator`.  This is `L` above.
+      base_operator:  Shape `[B1,...,Bb, M, N]` real `float16`, `float32` or
+        `float64` `LinearOperator`.  This is `L` above.
       u:  Shape `[B1,...,Bb, M, K]` `Tensor` of same `dtype` as `base_operator`.
         This is `U` above.
       diag_update:  Optional shape `[B1,...,Bb, K]` `Tensor` with same `dtype`
@@ -188,7 +190,11 @@ class LinearOperatorLowRankUpdate(linear_operator.LinearOperator):
     #    because if diag has non-zero imaginary part, it will not be
     #    self-adjoint positive definite.
     dtype = base_operator.dtype
-    allowed_dtypes = [dtypes.float32, dtypes.float64]
+    allowed_dtypes = [
+        dtypes.float16,
+        dtypes.float32,
+        dtypes.float64,
+    ]
     if dtype not in allowed_dtypes:
       raise TypeError(
           "Argument matrix must have dtype in %s.  Found: %s"
diff --git a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
index 6ea55f0367bd55379b280f81f22df2c3a0dcfb1e..a5130188b681813e1ccd4818dabdffeeb663e20a 100644
--- a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
+++ b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
@@ -26,12 +26,14 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg_impl as linalg
 from tensorflow.python.ops.linalg import linear_operator
 from tensorflow.python.ops.linalg import linear_operator_util
+from tensorflow.python.util.tf_export import tf_export
 
 __all__ = [
     "LinearOperatorLowerTriangular",
 ]
 
 
+@tf_export("linalg.LinearOperatorLowerTriangular")
 class LinearOperatorLowerTriangular(linear_operator.LinearOperator):
   """`LinearOperator` acting like a [batch] square lower triangular matrix.
 
@@ -118,7 +120,8 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator):
     Args:
       tril:  Shape `[B1,...,Bb, N, N]` with `b >= 0`, `N >= 0`.
         The lower triangular part of `tril` defines this operator.  The strictly
-        upper triangle is ignored.  Allowed dtypes: `float32`, `float64`.
+        upper triangle is ignored.  Allowed dtypes: `float16`, `float32`,
+        `float64`.
       is_non_singular:  Expect that this operator is non-singular.
         This operator is non-singular if and only if its diagonal elements are
         all non-zero.
@@ -164,7 +167,11 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator):
     """Static check of the `tril` argument."""
     # TODO(langmore) Add complex types once matrix_triangular_solve works for
     # them.
-    allowed_dtypes = [dtypes.float32, dtypes.float64]
+    allowed_dtypes = [
+        dtypes.float16,
+        dtypes.float32,
+        dtypes.float64,
+    ]
     dtype = tril.dtype
     if dtype not in allowed_dtypes:
       raise TypeError(
diff --git a/tensorflow/python/ops/linalg_grad.py b/tensorflow/python/ops/linalg_grad.py
index 8a76fe3ce55bbdea1677f83fe075ed3bdc8d875d..3cbbf3412a2a1bd974354a5819d410b4074ab47d 100644
--- a/tensorflow/python/ops/linalg_grad.py
+++ b/tensorflow/python/ops/linalg_grad.py
@@ -268,29 +268,37 @@ def _SelfAdjointEigV2Grad(op, grad_e, grad_v):
 
 @ops.RegisterGradient("Svd")
 def _SvdGrad(op, grad_s, grad_u, grad_v):
-  """Gradient for Svd based on Giles' algorithm. Reference at top of file."""
-
-  if op.get_attr("compute_uv") and not op.get_attr("full_matrices"):
-    raise NotImplementedError(
-        "SVD gradient is not implemented for compute_uv=True and "
-        "full_matrices=False.")
+  """Gradient for the singular value decomposition."""
 
+  # The derivation for the compute_uv=False case, and most of
+  # the derivation for the full_matrices=True case, are in
+  # Giles' paper (see reference at top of file).  A derivation for
+  # the full_matrices=False case is available at
+  # https://j-towns.github.io/papers/svd-derivative.pdf
   a = op.inputs[0]
   a_shape = a.get_shape().with_rank_at_least(2)
+  grad_s_mat = array_ops.matrix_diag(grad_s)
 
-  if op.get_attr("compute_uv"):
-    # TODO(rmlarsen): Make this work with complex types.
-    if a.dtype.is_complex:
-      raise NotImplementedError(
-          "SVD gradient is not implemented for complex types and "
-          "compute_uv=True.")
-    grad_u_shape = grad_u.get_shape().with_rank_at_least(2)
-    grad_v_shape = grad_v.get_shape().with_rank_at_least(2)
-    m = a_shape[-2].merge_with(grad_u_shape[-2])
-    n = a_shape[-1].merge_with(grad_v_shape[-2])
-    batch_shape = a_shape[:-2].merge_with(grad_u_shape[:-2]).merge_with(
-        grad_v_shape[:-2])
-    a_shape = batch_shape.concatenate([m, n])
+  if not op.get_attr("compute_uv"):
+    s, u, v = linalg_ops.svd(a, compute_uv=True)
+    grad_a = math_ops.matmul(u, math_ops.matmul(grad_s_mat, v, adjoint_b=True))
+    grad_a.set_shape(a_shape)
+    return grad_a
+
+  full_matrices = op.get_attr("full_matrices")
+
+  # TODO(rmlarsen): Make this work with complex types.
+  if a.dtype.is_complex:
+    raise NotImplementedError(
+        "SVD gradient is not implemented for complex types and "
+        "compute_uv=True.")
+  grad_u_shape = grad_u.get_shape().with_rank_at_least(2)
+  grad_v_shape = grad_v.get_shape().with_rank_at_least(2)
+  m = a_shape[-2].merge_with(grad_u_shape[-2])
+  n = a_shape[-1].merge_with(grad_v_shape[-2])
+  batch_shape = a_shape[:-2].merge_with(grad_u_shape[:-2]).merge_with(
+      grad_v_shape[:-2])
+  a_shape = batch_shape.concatenate([m, n])
 
   m = a_shape[-2].value
   n = a_shape[-1].value
@@ -300,12 +308,9 @@ def _SvdGrad(op, grad_s, grad_u, grad_v):
         "SVD gradient has not been implemented for input with unknown "
         "inner matrix shape.")
 
-  if not op.get_attr("full_matrices") or not op.get_attr("compute_uv"):
-    s, u, v = linalg_ops.svd(a, compute_uv=True, full_matrices=True)
-  else:
-    s = op.outputs[0]
-    u = op.outputs[1]
-    v = op.outputs[2]
+  s = op.outputs[0]
+  u = op.outputs[1]
+  v = op.outputs[2]
 
   use_adjoint = False
   if m > n:
@@ -317,26 +322,10 @@ def _SvdGrad(op, grad_s, grad_u, grad_v):
     grad_u, grad_v = grad_v, grad_u
 
   with ops.control_dependencies([grad_s, grad_u, grad_v]):
-    grad_s_mat = array_ops.matrix_diag(grad_s)
-    if not op.get_attr("compute_uv"):
-      if use_adjoint:
-        grad_a = math_ops.matmul(
-            v[..., :, :m], math_ops.matmul(u, grad_s_mat), adjoint_b=True)
-      else:
-        grad_a = math_ops.matmul(u,
-                                 math_ops.matmul(
-                                     grad_s_mat, v[..., :, :m], adjoint_b=True))
-      grad_a.set_shape(a_shape)
-      return grad_a
-
-    # TODO(rmlarsen): Define a gradient that is numerically stable for
-    # abs(m-n) > 1. Currently this does not work because there are effectively
-    # multiple singular values with value zero. I am not sure if this is a true
-    # instability or if it simply throws off the finite difference gradient
-    # checker.
-    if abs(m - n) > 1:
+    if full_matrices and abs(m - n) > 1:
       raise NotImplementedError(
-          "svd gradient is not implemented for abs(m - n) > 1")
+          "svd gradient is not implemented for abs(m - n) > 1 "
+          "when full_matrices is True")
     s_mat = array_ops.matrix_diag(s)
     s2 = math_ops.square(s)
 
@@ -352,32 +341,45 @@ def _SvdGrad(op, grad_s, grad_u, grad_v):
             array_ops.expand_dims(s2, -2) - array_ops.expand_dims(s2, -1)),
         array_ops.zeros_like(s))
     s_inv_mat = array_ops.matrix_diag(math_ops.reciprocal(s))
+
+    v1 = v[..., :, :m]
+    grad_v1 = grad_v[..., :, :m]
+
     u_gu = math_ops.matmul(u, grad_u, adjoint_a=True)
-    v_gv = math_ops.matmul(v, grad_v, adjoint_a=True)
+    v_gv = math_ops.matmul(v1, grad_v1, adjoint_a=True)
 
-    if m == n:
-      f_u = f * u_gu
-      f_v = f * v_gv
-    else:
-      dv2 = array_ops.matrix_transpose(v_gv[..., m:n, :m]) - v_gv[..., :m, m:n]
-      f_u = f * u_gu
-      f_v = f * v_gv[..., :m, :m]
+    f_u = f * u_gu
+    f_v = f * v_gv
 
-    grad_a_nouv = (
+    term1_nouv = (
         grad_s_mat + math_ops.matmul(f_u + _linalg.adjoint(f_u), s_mat) +
         math_ops.matmul(s_mat, f_v + _linalg.adjoint(f_v)))
 
-    if m != n:
-      grad_a_nouv = array_ops.concat(
-          [grad_a_nouv, math_ops.matmul(s_inv_mat, dv2)], -1)
+    term1 = math_ops.matmul(u, math_ops.matmul(term1_nouv, v1, adjoint_b=True))
+
+    if m == n:
+      grad_a_before_transpose = term1
+    else:
+      gv1t = array_ops.matrix_transpose(grad_v1)
+      gv1t_v1 = math_ops.matmul(gv1t, v1)
+      term2_nous = gv1t - math_ops.matmul(gv1t_v1, v1, adjoint_b=True)
+
+      if full_matrices:
+        v2 = v[..., :, m:n]
+        grad_v2 = grad_v[..., :, m:n]
+
+        v1t_gv2 = math_ops.matmul(v1, grad_v2, adjoint_a=True)
+        term2_nous -= math_ops.matmul(v1t_gv2, v2, adjoint_b=True)
+
+      u_s_inv = math_ops.matmul(u, s_inv_mat)
+      term2 = math_ops.matmul(u_s_inv, term2_nous)
+
+      grad_a_before_transpose = term1 + term2
 
     if use_adjoint:
-      # Use (U X V^H)^H = V (U X)^H.
-      grad_a = math_ops.matmul(
-          v, math_ops.matmul(u, grad_a_nouv), adjoint_b=True)
+      grad_a = array_ops.matrix_transpose(grad_a_before_transpose)
     else:
-      grad_a = math_ops.matmul(u,
-                               math_ops.matmul(grad_a_nouv, v, adjoint_b=True))
+      grad_a = grad_a_before_transpose
 
     grad_a.set_shape(a_shape)
     return grad_a
diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index be9beee633bb7c900b1618c2922b6eff5bf65df0..9803eed6aefe072cbe0841dff2de3f640a440dd5 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops.gen_linalg_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
+from tensorflow.python.util.tf_export import tf_export
 
 # Names below are lower_case.
 # pylint: disable=invalid-name
@@ -77,6 +78,7 @@ def _RegularizedGramianCholesky(matrix, l2_regularizer, first_kind):
   return gen_linalg_ops.cholesky(gramian)
 
 
+@tf_export('cholesky_solve', 'linalg.cholesky_solve')
 def cholesky_solve(chol, rhs, name=None):
   """Solves systems of linear eqns `A X = RHS`, given Cholesky factorizations.
 
@@ -119,6 +121,7 @@ def cholesky_solve(chol, rhs, name=None):
     return x
 
 
+@tf_export('eye', 'linalg.eye')
 def eye(num_rows,
         num_columns=None,
         batch_shape=None,
@@ -188,6 +191,7 @@ def eye(num_rows,
       return array_ops.matrix_set_diag(zero_matrix, diag_ones)
 
 
+@tf_export('matrix_solve_ls', 'linalg.lstsq')
 def matrix_solve_ls(matrix, rhs, l2_regularizer=0.0, fast=True, name=None):
   r"""Solves one or more linear least-squares problems.
 
@@ -324,6 +328,7 @@ def matrix_solve_ls(matrix, rhs, l2_regularizer=0.0, fast=True, name=None):
   # pylint: enable=protected-access
 
 
+@tf_export('self_adjoint_eig', 'linalg.eigh')
 def self_adjoint_eig(tensor, name=None):
   """Computes the eigen decomposition of a batch of self-adjoint matrices.
 
@@ -346,6 +351,7 @@ def self_adjoint_eig(tensor, name=None):
   return e, v
 
 
+@tf_export('self_adjoint_eigvals', 'linalg.eigvalsh')
 def self_adjoint_eigvals(tensor, name=None):
   """Computes the eigenvalues of one or more self-adjoint matrices.
 
@@ -368,6 +374,7 @@ def self_adjoint_eigvals(tensor, name=None):
   return e
 
 
+@tf_export('svd', 'linalg.svd')
 def svd(tensor, full_matrices=False, compute_uv=True, name=None):
   r"""Computes the singular value decompositions of one or more matrices.
 
@@ -439,6 +446,7 @@ def svd(tensor, full_matrices=False, compute_uv=True, name=None):
 
 
 # pylint: disable=redefined-builtin
+@tf_export('norm', 'linalg.norm')
 @deprecation.deprecated_args(
     None, 'keep_dims is deprecated, use keepdims instead', 'keep_dims')
 def norm(tensor,
diff --git a/tensorflow/python/ops/list_ops.py b/tensorflow/python/ops/list_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..bba59ebcef9c7caf1a53d724767999ae7ac079e5
--- /dev/null
+++ b/tensorflow/python/ops/list_ops.py
@@ -0,0 +1,94 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops to manipulate lists of tensors."""
+
+# pylint: disable=g-bad-name
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_list_ops
+# go/tf-wildcard-import
+# pylint: disable=wildcard-import
+from tensorflow.python.ops.gen_list_ops import *
+# pylint: enable=wildcard-import
+
+
+@ops.RegisterGradient("TensorListPushBack")
+def _PushBackGrad(op, dresult):
+  return gen_list_ops.tensor_list_pop_back(
+      dresult, element_dtype=op.get_attr("element_dtype"))
+
+
+@ops.RegisterGradient("TensorListPopBack")
+def _PopBackGrad(op, dlist, delement):
+  if dlist is None:
+    dlist = gen_list_ops.empty_tensor_list(
+        element_dtype=delement.dtype,
+        element_shape=gen_list_ops.tensor_list_element_shape(
+            op.outputs[0], shape_type=dtypes.int32))
+  return gen_list_ops.tensor_list_push_back(dlist, delement)
+
+
+@ops.RegisterGradient("TensorListStack")
+def _TensorListStackGrad(unused_op, dtensor):
+  return gen_list_ops.tensor_list_from_tensor(dtensor,
+                                              element_shape=dtensor.shape[1:])
+
+
+@ops.RegisterGradient("TensorListFromTensor")
+def _TensorListFromTensorGrad(op, dlist):
+  """Gradient for TensorListFromTensor."""
+  if op.inputs[0].shape[0] is not None:
+    num_elements = op.inputs[0].shape[0]
+  else:
+    num_elements = None
+  if dlist is None:
+    dlist = gen_list_ops.empty_tensor_list(
+        element_dtype=op.inputs[0].dtype,
+        element_shape=gen_list_ops.tensor_list_element_shape(
+            op.outputs[0], shape_type=dtypes.int32))
+  return gen_list_ops.tensor_list_stack(
+      dlist, element_dtype=op.inputs[0].dtype,
+      num_elements=num_elements)
+
+
+@ops.RegisterGradient("TensorListGetItem")
+def _TensorListGetItemGrad(op, ditem):
+  """Gradient for TensorListGetItem."""
+  list_size = gen_list_ops.tensor_list_length(op.inputs[0])
+  list_grad = gen_list_ops.tensor_list_set_item(
+      gen_list_ops.tensor_list_reserve(
+          gen_list_ops.tensor_list_element_shape(op.inputs[0],
+                                                 shape_type=dtypes.int32),
+          list_size, element_dtype=ditem.dtype),
+      index=op.inputs[1],
+      item=ditem)
+  index_grad = None
+  return list_grad, index_grad
+
+
+@ops.RegisterGradient("TensorListSetItem")
+def _TensorListSetItemGrad(op, dlist):
+  _, index, item = op.inputs
+  list_grad = gen_list_ops.tensor_list_set_item(
+      dlist, index=index, item=array_ops.zeros_like(item))
+  index_grad = None
+  element_grad = gen_list_ops.tensor_list_get_item(
+      dlist, index, element_dtype=item.dtype)
+  return list_grad, index_grad, element_grad
diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py
index 08e3f83a0b21a8444ad3500c62fe624440edc255..eadbc1b7c3b6e66aa76c9afd860b2274ac1976ae 100644
--- a/tensorflow/python/ops/logging_ops.py
+++ b/tensorflow/python/ops/logging_ops.py
@@ -27,6 +27,7 @@ from tensorflow.python.ops import gen_logging_ops
 from tensorflow.python.ops.gen_logging_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util.deprecation import deprecated
+from tensorflow.python.util.tf_export import tf_export
 
 # The python wrapper for Assert is in control_flow_ops, as the Assert
 # call relies on certain conditionals for its dependencies.  Use
@@ -35,12 +36,13 @@ from tensorflow.python.util.deprecation import deprecated
 
 # Assert and Print are special symbols in python, so we must
 # use an upper-case version of them.
+@tf_export("Print")
 def Print(input_, data, message=None, first_n=None, summarize=None,
           name=None):
   """Prints a list of tensors.
 
-  This is an identity op with the side effect of printing `data` when
-  evaluating.
+  This is an identity op (behaves like `tf.identity`) with the side effect
+  of printing `data` when evaluating.
 
   Note: This op prints to the standard error. It is not currently compatible
     with jupyter notebook (printing to the notebook *server's* output, not into
@@ -57,7 +59,7 @@ def Print(input_, data, message=None, first_n=None, summarize=None,
     name: A name for the operation (optional).
 
   Returns:
-    Same tensor as `input_`.
+    A `Tensor`. Has the same type and contents as `input_`.
   """
   return gen_logging_ops._print(input_, data, message, first_n, summarize, name)
 
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index 8bc0bc7d06bb79056a0e691c1a39c97fa51f750f..f539a7bb68da57e31746bc80fb25339a03a4fafe 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import collections
 import functools
+import six
 
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
@@ -39,8 +40,10 @@ from tensorflow.python.ops.gen_lookup_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util import compat
 from tensorflow.python.util.deprecation import deprecated
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("initialize_all_tables")
 @deprecated(None, "Use `tf.tables_initializer` instead.")
 def initialize_all_tables(name="init_all_tables"):
   """Returns an Op that initializes all tables of the default graph.
@@ -55,6 +58,7 @@ def initialize_all_tables(name="init_all_tables"):
   return tables_initializer(name)
 
 
+@tf_export("tables_initializer")
 def tables_initializer(name="init_all_tables"):
   """Returns an Op that initializes all tables of the default graph.
 
@@ -83,10 +87,10 @@ def _check_table_dtypes(table, key_dtype, value_dtype):
     TypeError: when 'key_dtype' or 'value_dtype' doesn't match the table data
       types.
   """
-  if key_dtype != table.key_dtype:
+  if key_dtype.base_dtype != table.key_dtype:
     raise TypeError("Invalid key dtype, expected %s but got %s." %
                     (table.key_dtype, key_dtype))
-  if value_dtype != table.value_dtype:
+  if value_dtype.base_dtype != table.value_dtype:
     raise TypeError("Invalid value dtype, expected %s but got %s." %
                     (table.value_dtype, value_dtype))
 
@@ -216,7 +220,7 @@ class InitializableLookupTableBase(LookupInterface):
     if isinstance(keys, sparse_tensor.SparseTensor):
       key_tensor = keys.values
 
-    if keys.dtype != self._key_dtype:
+    if keys.dtype.base_dtype != self._key_dtype:
       raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
                       (self._key_dtype, keys.dtype))
 
@@ -527,7 +531,7 @@ class TextFileInitializer(TableInitializerBase):
     ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op)
     # If the filename tensor is anything other than a string constant (e.g., if
     # it is a placeholder) then it does not make sense to track it as an asset.
-    if constant_op.is_constant(filename):
+    if context.in_graph_mode() and constant_op.is_constant(filename):
       ops.add_to_collection(ops.GraphKeys.ASSET_FILEPATHS, filename)
     return init_op
 
@@ -688,19 +692,22 @@ class IdTableWithHashBuckets(LookupInterface):
 
   For example, if an instance of `IdTableWithHashBuckets` is initialized with a
   string-to-id table that maps:
-  - emerson -> 0
-  - lake -> 1
-  - palmer -> 2
+
+  * `emerson -> 0`
+  * `lake -> 1`
+  * `palmer -> 2`
 
   The `IdTableWithHashBuckets` object will performs the following mapping:
-  - emerson -> 0
-  - lake -> 1
-  - palmer -> 2
-  - <other term> -> bucket id between 3 and 3 + num_oov_buckets - 1, calculated
-    by: hash(<term>) % num_oov_buckets + vocab_size
 
-  If input_tensor is ["emerson", "lake", "palmer", "king", "crimson"],
-  the lookup result is [0, 1, 2, 4, 7]
+  * `emerson -> 0`
+  * `lake -> 1`
+  * `palmer -> 2`
+  * `<other term> -> bucket_id`, where bucket_id will be between `3` and
+  `3 + num_oov_buckets - 1`, calculated by:
+  `hash(<term>) % num_oov_buckets + vocab_size`
+
+  If input_tensor is `["emerson", "lake", "palmer", "king", "crimson"]`,
+  the lookup result is `[0, 1, 2, 4, 7]`.
 
   If `table` is None, only out-of-vocabulary buckets are used.
 
@@ -845,7 +852,7 @@ class IdTableWithHashBuckets(LookupInterface):
     Raises:
       TypeError: when `keys` doesn't match the table key data type.
     """
-    if keys.dtype != self._key_dtype:
+    if keys.dtype.base_dtype != self._key_dtype:
       raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
                       (self._key_dtype, keys.dtype))
     values = keys
@@ -959,7 +966,7 @@ def index_table_from_file(vocabulary_file=None,
       than zero.
   """
   if vocabulary_file is None or (
-      isinstance(vocabulary_file, str) and not vocabulary_file):
+      isinstance(vocabulary_file, six.string_types) and not vocabulary_file):
     raise ValueError("vocabulary_file must be specified and must not be empty.")
   if num_oov_buckets < 0:
     raise ValueError("num_oov_buckets must be greater or equal than 0, got %d."
@@ -1163,7 +1170,7 @@ def index_to_string_table_from_file(vocabulary_file,
   ```
 
   Args:
-    vocabulary_file: The vocabulary filename.
+    vocabulary_file: The vocabulary filename, may be a constant scalar `Tensor`.
     vocab_size: Number of the elements in the vocabulary, if known.
     default_value: The value to use for out-of-vocabulary indices.
     name: A name for this op (optional).
@@ -1181,8 +1188,10 @@ def index_to_string_table_from_file(vocabulary_file,
     ValueError: when `vocabulary_file` is empty.
     ValueError: when `vocab_size` is invalid.
   """
-  if not vocabulary_file:
-    raise ValueError("vocabulary_file must be specified.")
+  if vocabulary_file is None or (
+      isinstance(vocabulary_file, six.string_types) and not vocabulary_file):
+    raise ValueError("vocabulary_file must be specified and must not be empty.")
+
   if vocab_size is not None and vocab_size < 1:
     raise ValueError("vocab_size must be greater than 0, got %d." % vocab_size)
 
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index b74971f654294e25e131a6ba21d982da16cf4264..8e003fb7ac6462fb611a020e86b06b5987af9546 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -28,22 +28,33 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.ops.losses import util
 from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("losses.Reduction")
 class Reduction(object):
-  """Types of loss reduction."""
+  """Types of loss reduction.
+
+  Contains the following values:
+  `NONE`: Un-reduced weighted losses with the same shape as input.
+  `SUM`: Scalar sum of weighted losses.
+  `MEAN`: Scalar `SUM` divided by sum of weights.
+  `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
+  `SUM_OVER_NONZERO_WEIGHTS`: Scalar `SUM` divided by number of non-zero
+     weights.
+  `SUM_BY_NONZERO_WEIGHTS`: Same as `SUM_OVER_NONZERO_WEIGHTS`.
+  """
 
-  # Un-reduced weighted losses with the same shape as input.
   NONE = "none"
 
-  # Scalar sum of `NONE`.
   SUM = "weighted_sum"
 
-  # Scalar `SUM` divided by sum of weights.
   MEAN = "weighted_mean"
 
-  # Scalar `SUM` divided by number of non-zero weights.
+  SUM_OVER_BATCH_SIZE = "weighted_sum_over_batch_size"
+
   SUM_BY_NONZERO_WEIGHTS = "weighted_sum_by_nonzero_weights"
+  SUM_OVER_NONZERO_WEIGHTS = SUM_BY_NONZERO_WEIGHTS
 
   @classmethod
   def all(cls):
@@ -51,6 +62,8 @@ class Reduction(object):
         cls.NONE,
         cls.SUM,
         cls.MEAN,
+        cls.SUM_OVER_BATCH_SIZE,
+        cls.SUM_OVER_NONZERO_WEIGHTS,
         cls.SUM_BY_NONZERO_WEIGHTS)
 
   @classmethod
@@ -131,10 +144,17 @@ def _num_present(losses, weights, per_batch=False):
     if per_batch:
       return math_ops.reduce_sum(
           present, axis=math_ops.range(1, array_ops.rank(present)),
-          keep_dims=True, name=scope)
+          keepdims=True, name=scope)
     return math_ops.reduce_sum(present, name=scope)
 
 
+def _num_elements(losses):
+  """Computes the number of elements in `losses` tensor."""
+  with ops.name_scope(None, "num_elements", values=[losses]) as scope:
+    return math_ops.cast(array_ops.size(losses, name=scope), dtype=losses.dtype)
+
+
+@tf_export("losses.compute_weighted_loss")
 def compute_weighted_loss(
     losses, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES,
     reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
@@ -157,6 +177,13 @@ def compute_weighted_loss(
     ValueError: If `weights` is `None` or the shape is not compatible with
       `losses`, or if the number of dimensions (rank) of either `losses` or
       `weights` is missing.
+
+  Note:
+    When calculating the gradient of a weighted loss contributions from
+    both `losses` and `weights` are considered. If your `weights` depend
+    on some model parameters but you do not want this to affect the loss
+    gradient, you need to apply @{tf.stop_gradient} to `weights` before
+    passing them to `compute_weighted_loss`.
   """
   Reduction.validate(reduction)
   with ops.name_scope(scope, "weighted_loss", (losses, weights)):
@@ -175,8 +202,11 @@ def compute_weighted_loss(
           loss = _safe_mean(
               loss,
               math_ops.reduce_sum(array_ops.ones_like(losses) * weights))
-        elif reduction == Reduction.SUM_BY_NONZERO_WEIGHTS:
+        elif (reduction == Reduction.SUM_BY_NONZERO_WEIGHTS or
+              reduction == Reduction.SUM_OVER_NONZERO_WEIGHTS):
           loss = _safe_mean(loss, _num_present(losses, weights))
+        elif reduction == Reduction.SUM_OVER_BATCH_SIZE:
+          loss = _safe_mean(loss, _num_elements(losses))
 
       # Convert the result back to the input type.
       loss = math_ops.cast(loss, input_dtype)
@@ -184,6 +214,7 @@ def compute_weighted_loss(
       return loss
 
 
+@tf_export("losses.absolute_difference")
 def absolute_difference(
     labels, predictions, weights=1.0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -231,6 +262,7 @@ def absolute_difference(
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
+@tf_export("losses.cosine_distance")
 @deprecated_args(None, "dim is deprecated, use axis instead", "dim")
 def cosine_distance(
     labels, predictions, axis=None, weights=1.0, scope=None,
@@ -279,11 +311,12 @@ def cosine_distance(
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
 
     radial_diffs = math_ops.multiply(predictions, labels)
-    losses = 1 - math_ops.reduce_sum(radial_diffs, axis=(axis,), keep_dims=True)
+    losses = 1 - math_ops.reduce_sum(radial_diffs, axis=(axis,), keepdims=True)
     return compute_weighted_loss(
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
+@tf_export("losses.hinge_loss")
 def hinge_loss(labels, logits, weights=1.0, scope=None,
                loss_collection=ops.GraphKeys.LOSSES,
                reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
@@ -325,6 +358,7 @@ def hinge_loss(labels, logits, weights=1.0, scope=None,
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
+@tf_export("losses.huber_loss")
 def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None,
                loss_collection=ops.GraphKeys.LOSSES,
                reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
@@ -343,7 +377,7 @@ def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None,
 
   `weights` acts as a coefficient for the loss. If a scalar is provided, then
   the loss is simply scaled by the given value. If `weights` is a tensor of size
-  [batch_size], then the total loss for each sample of the batch is rescaled
+  `[batch_size]`, then the total loss for each sample of the batch is rescaled
   by the corresponding element in the `weights` vector. If the shape of
   `weights` matches the shape of `predictions`, then the loss of each
   measurable element of `predictions` is scaled by the corresponding value of
@@ -388,11 +422,12 @@ def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None,
     # This is necessary to avoid doubling the gradient, since there is already a
     # nonzero contribution to the gradient from the quadratic term.
     linear = (abs_error - quadratic)
-    losses = 0.5 * quadratic**2 + delta * linear
+    losses = 0.5 * quadratic * quadratic + delta * linear
     return compute_weighted_loss(
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
+@tf_export("losses.log_loss")
 def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
              loss_collection=ops.GraphKeys.LOSSES,
              reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
@@ -400,7 +435,7 @@ def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
 
   `weights` acts as a coefficient for the loss. If a scalar is provided, then
   the loss is simply scaled by the given value. If `weights` is a tensor of size
-  [batch_size], then the total loss for each sample of the batch is rescaled
+  `[batch_size]`, then the total loss for each sample of the batch is rescaled
   by the corresponding element in the `weights` vector. If the shape of
   `weights` matches the shape of `predictions`, then the loss of each
   measurable element of `predictions` is scaled by the corresponding value of
@@ -444,6 +479,7 @@ def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
 
 
 # TODO(b/37208492): Add reduction arg.
+@tf_export("losses.mean_pairwise_squared_error")
 def mean_pairwise_squared_error(
     labels, predictions, weights=1.0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES):
@@ -466,7 +502,7 @@ def mean_pairwise_squared_error(
 
   `weights` acts as a coefficient for the loss. If a scalar is provided, then
   the loss is simply scaled by the given value. If `weights` is a tensor of size
-  [batch_size], then the total loss for each sample of the batch is rescaled
+  `[batch_size]`, then the total loss for each sample of the batch is rescaled
   by the corresponding element in the `weights` vector.
 
   Args:
@@ -485,7 +521,7 @@ def mean_pairwise_squared_error(
 
   Raises:
     ValueError: If the shape of `predictions` doesn't match that of `labels` or
-      if the shape of `weights` is invalid.  Also if `labels` or `predictions
+      if the shape of `weights` is invalid.  Also if `labels` or `predictions`
       is None.
   """
   if labels is None:
@@ -507,16 +543,17 @@ def mean_pairwise_squared_error(
 
       sum_squares_diff_per_batch = math_ops.reduce_sum(
           math_ops.square(diffs), reduction_indices=reduction_indices,
-          keep_dims=True)
+          keepdims=True)
       num_present_per_batch = _num_present(diffs, weights, per_batch=True)
 
       term1 = 2.0 * _safe_div(sum_squares_diff_per_batch,
-                              num_present_per_batch)
+                              num_present_per_batch - 1)
 
       sum_diff = math_ops.reduce_sum(
-          diffs, reduction_indices=reduction_indices, keep_dims=True)
-      term2 = 2.0 * _safe_div(math_ops.square(sum_diff),
-                              math_ops.square(num_present_per_batch))
+          diffs, reduction_indices=reduction_indices, keepdims=True)
+      term2 = 2.0 * _safe_div(
+          math_ops.square(sum_diff),
+          math_ops.multiply(num_present_per_batch, num_present_per_batch - 1))
 
       weighted_losses = math_ops.multiply(term1 - term2, weights)
       loss = math_ops.reduce_sum(weighted_losses)
@@ -530,6 +567,7 @@ def mean_pairwise_squared_error(
       return mean_loss
 
 
+@tf_export("losses.mean_squared_error")
 def mean_squared_error(
     labels, predictions, weights=1.0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -538,7 +576,7 @@ def mean_squared_error(
 
   `weights` acts as a coefficient for the loss. If a scalar is provided, then
   the loss is simply scaled by the given value. If `weights` is a tensor of size
-  [batch_size], then the total loss for each sample of the batch is rescaled
+  `[batch_size]`, then the total loss for each sample of the batch is rescaled
   by the corresponding element in the `weights` vector. If the shape of
   `weights` matches the shape of `predictions`, then the loss of each
   measurable element of `predictions` is scaled by the corresponding value of
@@ -577,6 +615,7 @@ def mean_squared_error(
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
+@tf_export("losses.sigmoid_cross_entropy")
 def sigmoid_cross_entropy(
     multi_class_labels, logits, weights=1.0, label_smoothing=0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -635,6 +674,7 @@ def sigmoid_cross_entropy(
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
+@tf_export("losses.softmax_cross_entropy")
 def softmax_cross_entropy(
     onehot_labels, logits, weights=1.0, label_smoothing=0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -686,9 +726,12 @@ def softmax_cross_entropy(
       smooth_negatives = label_smoothing / num_classes
       onehot_labels = onehot_labels * smooth_positives + smooth_negatives
 
-    losses = nn.softmax_cross_entropy_with_logits(labels=onehot_labels,
-                                                  logits=logits,
-                                                  name="xentropy")
+    onehot_labels = array_ops.stop_gradient(
+        onehot_labels, name="labels_stop_gradient")
+    losses = nn.softmax_cross_entropy_with_logits_v2(
+        labels=onehot_labels, logits=logits, name="xentropy")
+
+
     return compute_weighted_loss(
         losses, weights, scope, loss_collection, reduction=reduction)
 
@@ -744,6 +787,7 @@ def _remove_squeezable_dimensions(
   return labels, predictions, weights
 
 
+@tf_export("losses.sparse_softmax_cross_entropy")
 def sparse_softmax_cross_entropy(
     labels, logits, weights=1.0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -752,7 +796,7 @@ def sparse_softmax_cross_entropy(
 
   `weights` acts as a coefficient for the loss. If a scalar is provided,
   then the loss is simply scaled by the given value. If `weights` is a
-  tensor of shape [`batch_size`], then the loss weights apply to each
+  tensor of shape `[batch_size]`, then the loss weights apply to each
   corresponding sample.
 
   Args:
diff --git a/tensorflow/python/ops/losses/util.py b/tensorflow/python/ops/losses/util.py
index 3718c481c26afdd9f007ffc22a9e6ec44a1eb10e..b835d963869704f053de6c2f8a75ae1fa72e6a5d 100644
--- a/tensorflow/python/ops/losses/util.py
+++ b/tensorflow/python/ops/losses/util.py
@@ -30,8 +30,10 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("losses.add_loss")
 def add_loss(loss, loss_collection=ops.GraphKeys.LOSSES):
   """Adds a externally defined loss to the collection of losses.
 
@@ -43,6 +45,7 @@ def add_loss(loss, loss_collection=ops.GraphKeys.LOSSES):
     ops.add_to_collection(loss_collection, loss)
 
 
+@tf_export("losses.get_losses")
 def get_losses(scope=None, loss_collection=ops.GraphKeys.LOSSES):
   """Gets the list of losses from the loss_collection.
 
@@ -56,6 +59,7 @@ def get_losses(scope=None, loss_collection=ops.GraphKeys.LOSSES):
   return ops.get_collection(loss_collection, scope)
 
 
+@tf_export("losses.get_regularization_losses")
 def get_regularization_losses(scope=None):
   """Gets the list of regularization losses.
 
@@ -68,6 +72,7 @@ def get_regularization_losses(scope=None):
   return ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES, scope)
 
 
+@tf_export("losses.get_regularization_loss")
 def get_regularization_loss(scope=None, name="total_regularization_loss"):
   """Gets the total regularization loss.
 
@@ -85,6 +90,7 @@ def get_regularization_loss(scope=None, name="total_regularization_loss"):
     return constant_op.constant(0.0)
 
 
+@tf_export("losses.get_total_loss")
 def get_total_loss(add_regularization_losses=True, name="total_loss"):
   """Returns a tensor whose value represents the total loss.
 
diff --git a/tensorflow/python/ops/manip_grad.py b/tensorflow/python/ops/manip_grad.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb2069359dd6fbe4874e228e6f2f58ea8444744d
--- /dev/null
+++ b/tensorflow/python/ops/manip_grad.py
@@ -0,0 +1,31 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Gradients for operators defined in manip_ops.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import manip_ops
+
+
+@ops.RegisterGradient("Roll")
+def _RollGrad(op, grad):
+  # The gradient is just the roll reversed
+  shift = op.inputs[1]
+  axis = op.inputs[2]
+  roll_grad = manip_ops.roll(grad, -shift, axis)
+  return roll_grad, None, None
diff --git a/tensorflow/python/ops/manip_ops.py b/tensorflow/python/ops/manip_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..91e15b47b9400f29425af2f186c7c44ee6a5a622
--- /dev/null
+++ b/tensorflow/python/ops/manip_ops.py
@@ -0,0 +1,38 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operators for manipulating tensors.
+
+@@roll
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import gen_manip_ops as _gen_manip_ops
+from tensorflow.python.util.all_util import remove_undocumented
+
+
+# pylint: disable=protected-access
+def roll(input, shift, axis):  # pylint: disable=redefined-builtin
+  return _gen_manip_ops.roll(input, shift, axis)
+
+
+roll.__doc__ = _gen_manip_ops.roll.__doc__
+# pylint: enable=protected-access
+
+_allowed_symbols = ['roll']
+
+remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index 38fe093ba7236ff7fe7b580a893501c84c71f6b1..9e7f37d80fdd71e84516ab450d145d79519ae47a 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -40,15 +40,16 @@ def _SumGrad(op, grad):
   """Gradient for Sum."""
   # Fast path for when reducing to a scalar and ndims is known: adds only
   # Reshape and Tile ops (and possibly a Shape).
-  if op.inputs[0].get_shape().ndims is not None:
+  input_0_shape = op.inputs[0]._shape_tuple()  # pylint: disable=protected-access
+  if input_0_shape is not None:
     axes = tensor_util.constant_value(op.inputs[1])
     if axes is not None:
-      rank = op.inputs[0].get_shape().ndims
+      rank = len(input_0_shape)
       if np.array_equal(axes, np.arange(rank)):  # Reduce all dims.
         grad = array_ops.reshape(grad, [1] * rank)
         # If shape is not fully defined (but rank is), we use Shape.
-        if op.inputs[0].get_shape().is_fully_defined():
-          input_shape = op.inputs[0].get_shape().as_list()
+        if None not in input_0_shape:
+          input_shape = input_0_shape
         else:
           input_shape = array_ops.shape(op.inputs[0])
         return [array_ops.tile(grad, input_shape), None]
@@ -96,9 +97,12 @@ def _MinGrad(op, grad):
 def _MeanGrad(op, grad):
   """Gradient for Mean."""
   sum_grad = _SumGrad(op, grad)[0]
-  input_size = op.inputs[0].get_shape().num_elements()
-  output_size = op.outputs[0].get_shape().num_elements()
-  if input_size is not None and output_size is not None:
+  input_shape = op.inputs[0]._shape_tuple()  # pylint: disable=protected-access
+  output_shape = op.outputs[0]._shape_tuple()  # pylint: disable=protected-access
+  if (input_shape is not None and output_shape is not None and
+      None not in input_shape and None not in output_shape):
+    input_size = np.prod(input_shape)
+    output_size = np.prod(output_shape)
     factor = input_size // max(output_size, 1)
     factor = constant_op.constant(factor, dtype=sum_grad.dtype)
   else:
@@ -106,7 +110,7 @@ def _MeanGrad(op, grad):
     output_shape = array_ops.shape(op.outputs[0])
     factor = _safe_shape_div(
         math_ops.reduce_prod(input_shape), math_ops.reduce_prod(output_shape))
-  return sum_grad / math_ops.cast(factor, sum_grad.dtype), None
+  return math_ops.truediv(sum_grad, math_ops.cast(factor, sum_grad.dtype)), None
 
 
 @ops.RegisterGradient("Prod")
@@ -169,8 +173,7 @@ def _SegmentMeanGrad(op, grad):
       array_ops.shape(op.inputs[1]),
       array_ops.fill(array_ops.expand_dims(input_rank - 1, 0), 1)
   ], 0)
-  ones = array_ops.fill(ones_shape,
-                        constant_op.constant(1, dtype=grad.dtype))
+  ones = array_ops.fill(ones_shape, constant_op.constant(1, dtype=grad.dtype))
   scaled_grad = math_ops.div(grad, math_ops.segment_sum(ones, op.inputs[1]))
   return array_ops.gather(scaled_grad, op.inputs[1]), None
 
@@ -184,6 +187,15 @@ def _SparseSegmentSumGrad(op, grad):
           None)
 
 
+@ops.RegisterGradient("SparseSegmentSumWithNumSegments")
+def _SparseSegmentSumWithNumSegmentsGrad(op, grad):
+  """Gradient for SparseSegmentSumWithNumSegments."""
+  input_rows = array_ops.shape(op.inputs[0])[0]
+  return (math_ops.unsorted_segment_sum(
+      array_ops.gather(grad, op.inputs[2]), op.inputs[1], input_rows), None,
+          None, None)
+
+
 @ops.RegisterGradient("SparseSegmentMean")
 def _SparseSegmentMeanGrad(op, grad):
   """Gradient for SparseSegmentMean."""
@@ -192,6 +204,14 @@ def _SparseSegmentMeanGrad(op, grad):
                                             dim0), None, None)
 
 
+@ops.RegisterGradient("SparseSegmentMeanWithNumSegments")
+def _SparseSegmentMeanWithNumSegmentsGrad(op, grad):
+  """Gradient for SparseSegmentMeanWithNumSegments."""
+  dim0 = array_ops.shape(op.inputs[0])[0]
+  return (math_ops.sparse_segment_mean_grad(grad, op.inputs[1], op.inputs[2],
+                                            dim0), None, None, None)
+
+
 @ops.RegisterGradient("SparseSegmentSqrtN")
 def _SparseSegmentSqrtNGrad(op, grad):
   """Gradient for SparseSegmentSqrtN."""
@@ -200,53 +220,150 @@ def _SparseSegmentSqrtNGrad(op, grad):
                                               dim0), None, None)
 
 
-def _SegmentMinOrMaxGrad(op, grad, is_sorted):
-  """Gradient for SegmentMin and (unsorted) SegmentMax. They share similar code."""
-  zeros = array_ops.zeros(array_ops.shape(op.inputs[0]),
-                          dtype=op.inputs[0].dtype)
+@ops.RegisterGradient("SparseSegmentSqrtNWithNumSegments")
+def _SparseSegmentSqrtNWithNumSegmentsGrad(op, grad):
+  """Gradient for SparseSegmentSqrtNWithNumSegments."""
+  dim0 = array_ops.shape(op.inputs[0])[0]
+  return (math_ops.sparse_segment_sqrt_n_grad(grad, op.inputs[1], op.inputs[2],
+                                              dim0), None, None, None)
 
+
+def _SegmentMinOrMaxGrad(op, grad):
+  """ Gradient for SegmentMin and SegmentMax. """
+  zeros = array_ops.zeros_like(op.inputs[0], dtype=op.inputs[0].dtype)
   # Get the number of selected (minimum or maximum) elements in each segment.
   gathered_outputs = array_ops.gather(op.outputs[0], op.inputs[1])
   is_selected = math_ops.equal(op.inputs[0], gathered_outputs)
-  if is_sorted:
-    num_selected = math_ops.segment_sum(math_ops.cast(is_selected, grad.dtype),
-                                        op.inputs[1])
-  else:
-    num_selected = math_ops.unsorted_segment_sum(
-        math_ops.cast(is_selected, grad.dtype), op.inputs[1], op.inputs[2])
-
+  num_selected = math_ops.segment_sum(math_ops.cast(is_selected, grad.dtype),
+                                      op.inputs[1])
   # Compute the gradient for each segment. The gradient for the ith segment is
   # divided evenly among the selected elements in that segment.
   weighted_grads = math_ops.div(grad, num_selected)
   gathered_grads = array_ops.gather(weighted_grads, op.inputs[1])
-
-  if is_sorted:
-    return array_ops.where(is_selected, gathered_grads, zeros), None
-  else:
-    return array_ops.where(is_selected, gathered_grads, zeros), None, None
+  return array_ops.where(is_selected, gathered_grads, zeros), None
 
 
 @ops.RegisterGradient("SegmentMin")
 def _SegmentMinGrad(op, grad):
   """Gradient for SegmentMin."""
-  return _SegmentMinOrMaxGrad(op, grad, True)
+  return _SegmentMinOrMaxGrad(op, grad)
 
 
 @ops.RegisterGradient("SegmentMax")
 def _SegmentMaxGrad(op, grad):
   """Gradient for SegmentMax."""
-  return _SegmentMinOrMaxGrad(op, grad, True)
+  return _SegmentMinOrMaxGrad(op, grad)
+
+
+def _GatherDropNegatives(params, ids, zero_clipped_indices=None,
+                         is_positive=None):
+  """ Helper function for unsorted segment ops. Gathers params for
+      positive segment ids and gathers 0 for inputs with negative segment id.
+      Also returns the clipped indices and a boolean mask with the same shape
+      as ids where a positive id is masked as true. With this, the latter two
+      can be passed as arguments to this function to reuse them.
+  """
+  if zero_clipped_indices is None:
+    zero_clipped_indices = math_ops.maximum(ids, array_ops.zeros_like(ids))
+  gathered = array_ops.gather(params, zero_clipped_indices)
+  if is_positive is None:
+    is_positive = math_ops.greater_equal(ids, 0)
+    # tf.where(condition, x, y) requires condition to have the same shape as x
+    # and y.
+    # todo(philjd): remove this if tf.where supports broadcasting (#9284)
+    for _ in range(gathered.shape.ndims - is_positive.shape.ndims):
+      is_positive = array_ops.expand_dims(is_positive, -1)
+    is_positive = (is_positive &
+                   array_ops.ones_like(gathered, dtype=dtypes.bool))
+  # replace gathered params of negative indices with 0
+  zero_slice = array_ops.zeros_like(gathered)
+  return (array_ops.where(is_positive, gathered, zero_slice),
+          zero_clipped_indices, is_positive)
+
+
+def _UnsortedSegmentMinOrMaxGrad(op, grad):
+  """ Gradient for UnsortedSegmentMin and UnsortedSegmentMax. """
+  # Get the number of selected (minimum or maximum) elements in each segment.
+  gathered_outputs, zero_clipped_indices, is_positive = \
+      _GatherDropNegatives(op.outputs[0], op.inputs[1])
+  is_selected = math_ops.equal(op.inputs[0], gathered_outputs)
+  is_selected = math_ops.logical_and(is_selected, is_positive)
+  num_selected = math_ops.unsorted_segment_sum(
+      math_ops.cast(is_selected, grad.dtype), op.inputs[1], op.inputs[2])
+  # Compute the gradient for each segment. The gradient for the ith segment is
+  # divided evenly among the selected elements in that segment.
+  weighted_grads = math_ops.div(grad, num_selected)
+  gathered_grads, _, _ = _GatherDropNegatives(weighted_grads, None,
+                                              zero_clipped_indices,
+                                              is_positive)
+  zeros = array_ops.zeros_like(gathered_grads)
+  return array_ops.where(is_selected, gathered_grads, zeros), None, None
 
 
 @ops.RegisterGradient("UnsortedSegmentSum")
 def _UnsortedSegmentSumGrad(op, grad):
-  """Gradient for SegmentSum."""
-  return array_ops.gather(grad, op.inputs[1]), None, None
+  """Gradient for UnsortedSegmentSum."""
+  return _GatherDropNegatives(grad, op.inputs[1])[0], None, None
 
 
 @ops.RegisterGradient("UnsortedSegmentMax")
 def _UnsortedSegmentMaxGrad(op, grad):
-  return _SegmentMinOrMaxGrad(op, grad, False)
+  """ Gradient for UnsortedSegmentMax. """
+  return _UnsortedSegmentMinOrMaxGrad(op, grad)
+
+
+@ops.RegisterGradient("UnsortedSegmentMin")
+def _UnsortedSegmentMinGrad(op, grad):
+  """ Gradient for UnsortedSegmentMin. """
+  return _UnsortedSegmentMinOrMaxGrad(op, grad)
+
+
+@ops.RegisterGradient("UnsortedSegmentProd")
+def _UnsortedSegmentProdGrad(op, grad):
+  """ Gradient for UnsortedSegmentProd.
+  The gradient can be expressed for each segment by dividing the segment's
+  product by each element of the segment input tensor, but this approach can't
+  deal with zeros in the input.
+  Unlike reduce_prod we can't use cumsum here as individual segments may have
+  a different number of elements. Therefore we consider three cases:
+  1) A segment input contains no zeros and we can safely divide by the input
+     tensor.
+  2) A segment contains exactly one zero. Then the gradient of each input of
+     the segment is zero except for the 0-input, there the gradient is
+     the product of the remaining segment entries.
+  3) A segment contains at least two zeros. The gradient is zero for all
+     segment inputs.
+  """
+  # Note that unsorted_segment_sum will filter out the negative indices,
+  # so we don't need to do a logical_and with is_positive here
+  is_zero = math_ops.equal(op.inputs[0], 0)
+  num_zeros = gen_math_ops.unsorted_segment_sum(
+      math_ops.cast(is_zero, dtype=dtypes.int32), op.inputs[1], op.inputs[2])
+  # handle case 3 and set the gradient to 0 for segments with more than one
+  # 0 as input
+  grad = array_ops.where(math_ops.greater(num_zeros, 1),
+                         array_ops.zeros_like(grad), grad)
+  # replace all zeros with ones and compute the unsorted_segment_prod
+  non_zero_data = array_ops.where(is_zero, array_ops.ones_like(op.inputs[0]),
+                                  op.inputs[0])
+  non_zero_prod = gen_math_ops.unsorted_segment_prod(
+      non_zero_data, op.inputs[1], op.inputs[2])
+  # clip the indices for gather to be positive
+  zero_clipped_indices = math_ops.maximum(op.inputs[1],
+                                          array_ops.zeros_like(op.inputs[1]))
+  gathered_prod = array_ops.gather(op.outputs[0], zero_clipped_indices)
+  gathered_non_zero_prod = array_ops.gather(non_zero_prod,
+                                            zero_clipped_indices)
+  prod_divided_by_el = gathered_prod / op.inputs[0]  # May contain nan/inf.
+  # Now fetch the individual results for segments containing 0 and those that
+  # don't. is_zero will also fetch results for entries with negative index
+  # but the following gather_drop_negatives sets the corresponding entry in
+  # grad to 0 for these
+  partial_derivative = array_ops.where(is_zero, gathered_non_zero_prod,
+                                       prod_divided_by_el)
+  gathered_grad = _GatherDropNegatives(grad, op.inputs[1],
+                                       zero_clipped_indices)[0]
+  return gathered_grad * partial_derivative, None, None
 
 
 @ops.RegisterGradient("Abs")
@@ -305,7 +422,7 @@ def _SquareGrad(op, grad):
   # Added control dependencies to prevent 2*x from being computed too early.
   with ops.control_dependencies([grad]):
     x = math_ops.conj(x)
-    return grad * (2.0 * x)
+    return math_ops.multiply(grad, math_ops.multiply(x, 2.0))
 
 
 @ops.RegisterGradient("Sqrt")
@@ -507,8 +624,8 @@ def _IgammaGrad(op, grad):
   # and Gamma'(a) can grow large.
   partial_x = math_ops.exp(-x + (a - 1) * math_ops.log(x) - math_ops.lgamma(a))
   # TODO(b/36815900): Mark None return values as NotImplemented
-  return (None,
-          array_ops.reshape(math_ops.reduce_sum(partial_x * grad, rx), sx))
+  return (None, array_ops.reshape(
+      math_ops.reduce_sum(partial_x * grad, rx), sx))
 
 
 @ops.RegisterGradient("Igammac")
@@ -534,15 +651,17 @@ def _BetaincGrad(op, grad):
 
   # Perform operations in log space before summing, because terms
   # can grow large.
-  log_beta = (gen_math_ops.lgamma(a) + gen_math_ops.lgamma(b)
-              - gen_math_ops.lgamma(a + b))
-  partial_x = math_ops.exp(
-      (b - 1) * math_ops.log(1 - x) + (a - 1) * math_ops.log(x) - log_beta)
+  log_beta = (
+      gen_math_ops.lgamma(a) + gen_math_ops.lgamma(b) -
+      gen_math_ops.lgamma(a + b))
+  partial_x = math_ops.exp((b - 1) * math_ops.log(1 - x) +
+                           (a - 1) * math_ops.log(x) - log_beta)
 
   # TODO(b/36815900): Mark None return values as NotImplemented
-  return (None,  # da
-          None,  # db
-          array_ops.reshape(math_ops.reduce_sum(partial_x * grad, rx), sx))
+  return (
+      None,  # da
+      None,  # db
+      array_ops.reshape(math_ops.reduce_sum(partial_x * grad, rx), sx))
 
 
 @ops.RegisterGradient("Zeta")
@@ -706,10 +825,8 @@ def _ShapesFullySpecifiedAndEqual(x, y, grad):
   y_shape = y._shape_tuple()
   grad_shape = grad._shape_tuple()
   # pylint: enable=protected-access
-  return (x_shape == y_shape and
-          x_shape == grad_shape and
-          x_shape is not None and
-          None not in x_shape)
+  return (x_shape == y_shape and x_shape == grad_shape and
+          x_shape is not None and None not in x_shape)
 
 
 @ops.RegisterGradient("Add")
@@ -731,8 +848,12 @@ def _AddGrad(op, grad):
 
 @ops.RegisterGradient("Sub")
 def _SubGrad(op, grad):
+  """Gradient for Sub."""
   x = op.inputs[0]
   y = op.inputs[1]
+  if (isinstance(grad, ops.Tensor) and
+      _ShapesFullySpecifiedAndEqual(x, y, grad)):
+    return grad, -grad
   sx = array_ops.shape(x)
   sy = array_ops.shape(y)
   # pylint: disable=protected-access
@@ -756,11 +877,13 @@ def _MulGrad(op, grad):
   sx = array_ops.shape(x)
   sy = array_ops.shape(y)
   rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
-  # pylint: enable=protected-access
   x = math_ops.conj(x)
   y = math_ops.conj(y)
-  return (array_ops.reshape(math_ops.reduce_sum(grad * y, rx), sx),
-          array_ops.reshape(math_ops.reduce_sum(x * grad, ry), sy))
+  return (array_ops.reshape(
+      math_ops.reduce_sum(gen_math_ops._mul(grad, y), rx), sx),
+          array_ops.reshape(
+              math_ops.reduce_sum(gen_math_ops._mul(x, grad), ry), sy))
+  # pylint: enable=protected-access
 
 
 @ops.RegisterGradient("Div")
@@ -823,10 +946,10 @@ def _RealDivGrad(op, grad):
   x = math_ops.conj(x)
   y = math_ops.conj(y)
   return (array_ops.reshape(
-      math_ops.reduce_sum(math_ops.realdiv(grad, y), rx),
-      sx), array_ops.reshape(
-          math_ops.reduce_sum(grad * math_ops.realdiv(math_ops.realdiv(-x, y), y),
-                              ry), sy))
+      math_ops.reduce_sum(math_ops.realdiv(grad, y), rx), sx),
+          array_ops.reshape(
+              math_ops.reduce_sum(
+                  grad * math_ops.realdiv(math_ops.realdiv(-x, y), y), ry), sy))
 
 
 @ops.RegisterGradient("Pow")
@@ -921,8 +1044,8 @@ def _SelectGrad(op, grad):
   c = op.inputs[0]
   x = op.inputs[1]
   zeros = array_ops.zeros_like(x)
-  return (None, array_ops.where(c, grad, zeros),
-          array_ops.where(c, zeros, grad))
+  return (None, array_ops.where(c, grad, zeros), array_ops.where(
+      c, zeros, grad))
 
 
 @ops.RegisterGradient("MatMul")
@@ -933,18 +1056,20 @@ def _MatMulGrad(op, grad):
   t_b = op.get_attr("transpose_b")
   a = math_ops.conj(op.inputs[0])
   b = math_ops.conj(op.inputs[1])
+  # pylint: disable=protected-access
   if not t_a and not t_b:
-    grad_a = math_ops.matmul(grad, b, transpose_b=True)
-    grad_b = math_ops.matmul(a, grad, transpose_a=True)
+    grad_a = gen_math_ops._mat_mul(grad, b, transpose_b=True)
+    grad_b = gen_math_ops._mat_mul(a, grad, transpose_a=True)
   elif not t_a and t_b:
-    grad_a = math_ops.matmul(grad, b)
-    grad_b = math_ops.matmul(grad, a, transpose_a=True)
+    grad_a = gen_math_ops._mat_mul(grad, b)
+    grad_b = gen_math_ops._mat_mul(grad, a, transpose_a=True)
   elif t_a and not t_b:
-    grad_a = math_ops.matmul(b, grad, transpose_b=True)
-    grad_b = math_ops.matmul(a, grad)
+    grad_a = gen_math_ops._mat_mul(b, grad, transpose_b=True)
+    grad_b = gen_math_ops._mat_mul(a, grad)
   elif t_a and t_b:
-    grad_a = math_ops.matmul(b, grad, transpose_a=True, transpose_b=True)
-    grad_b = math_ops.matmul(grad, a, transpose_a=True, transpose_b=True)
+    grad_a = gen_math_ops._mat_mul(b, grad, transpose_a=True, transpose_b=True)
+    grad_b = gen_math_ops._mat_mul(grad, a, transpose_a=True, transpose_b=True)
+  # pylint: enable=protected-access
   return grad_a, grad_b
 
 
@@ -984,21 +1109,20 @@ def _SparseMatMulGrad(op, grad):
   dtype_a = op.inputs[0].dtype
   dtype_b = op.inputs[1].dtype
   if not t_a and not t_b:
-    return (_SparseMatMul(
-        grad, op.inputs[1], dtype_a, transpose_b=True), _SparseMatMul(
-            op.inputs[0], grad, dtype_b, transpose_a=True))
+    return (_SparseMatMul(grad, op.inputs[1], dtype_a, transpose_b=True),
+            _SparseMatMul(op.inputs[0], grad, dtype_b, transpose_a=True))
   elif not t_a and t_b:
-    return (_SparseMatMul(grad, op.inputs[1], dtype_a), _SparseMatMul(
-        grad, op.inputs[0], dtype_b, transpose_a=True))
+    return (_SparseMatMul(grad, op.inputs[1], dtype_a),
+            _SparseMatMul(grad, op.inputs[0], dtype_b, transpose_a=True))
   elif t_a and not t_b:
-    return (_SparseMatMul(
-        op.inputs[1], grad, dtype_a, transpose_b=True),
+    return (_SparseMatMul(op.inputs[1], grad, dtype_a, transpose_b=True),
             _SparseMatMul(op.inputs[0], grad, dtype_b))
   elif t_a and t_b:
     return (_SparseMatMul(
-        op.inputs[1], grad, dtype_a, transpose_a=True,
-        transpose_b=True), _SparseMatMul(
-            grad, op.inputs[0], dtype_b, transpose_a=True, transpose_b=True))
+        op.inputs[1], grad, dtype_a, transpose_a=True, transpose_b=True),
+            _SparseMatMul(
+                grad, op.inputs[0], dtype_b, transpose_a=True,
+                transpose_b=True))
 
 
 @ops.RegisterGradient("Floor")
@@ -1102,8 +1226,8 @@ def _ComplexAbsGrad(op, grad):
   """Returns the gradient of ComplexAbs."""
   # TODO(b/27786104): The cast to complex could be removed once arithmetic
   # supports mixtures of complex64 and real values.
-  return (math_ops.complex(grad, array_ops.zeros_like(grad)) *
-          math_ops.sign(op.inputs[0]))
+  return (math_ops.complex(grad, array_ops.zeros_like(grad)) * math_ops.sign(
+      op.inputs[0]))
 
 
 @ops.RegisterGradient("Cast")
@@ -1133,8 +1257,8 @@ def _CumsumGrad(op, grad):
   exclusive = op.get_attr("exclusive")
   reverse = op.get_attr("reverse")
   return [
-      math_ops.cumsum(
-          grad, axis, exclusive=exclusive, reverse=not reverse), None
+      math_ops.cumsum(grad, axis, exclusive=exclusive, reverse=not reverse),
+      None
   ]
 
 
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index e2e23dccefabd500e184d7af95222052b609fa96..da9957aa2a5463a37bba155597600a340ee4f1e6 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -90,6 +90,7 @@ See the @{$python/math_ops} guide.
 @@cholesky
 @@cholesky_solve
 @@matrix_exponential
+@@matrix_logarithm
 @@matrix_solve
 @@matrix_triangular_solve
 @@matrix_solve_ls
@@ -130,6 +131,9 @@ See the @{$python/math_ops} guide.
 @@segment_mean
 @@unsorted_segment_sum
 @@unsorted_segment_max
+@@unsorted_segment_min
+@@unsorted_segment_prod
+@@unsorted_segment_sqrt_n
 @@sparse_segment_sum
 @@sparse_segment_mean
 @@sparse_segment_sqrt_n
@@ -171,6 +175,7 @@ from tensorflow.python.ops.gen_math_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
+from tensorflow.python.util.tf_export import tf_export
 
 # Aliases for some automatically-generated names.
 linspace = gen_math_ops.lin_space
@@ -189,6 +194,7 @@ def _set_doc(doc):
 
 
 # pylint: disable=redefined-builtin
+@tf_export("argmax")
 @deprecation.deprecated_args(None, "Use the `axis` argument instead",
                              "dimension")
 @_set_doc(
@@ -208,6 +214,7 @@ def argmax(input,
   return gen_math_ops.arg_max(input, axis, name=name, output_type=output_type)
 
 
+@tf_export("argmin")
 @deprecation.deprecated_args(None, "Use the `axis` argument instead",
                              "dimension")
 @_set_doc(
@@ -232,7 +239,8 @@ def argmin(input,
 
 # pylint: disable=anomalous-backslash-in-string,protected-access
 # pylint: disable=g-docstring-has-escape
-def abs(x, name=None):
+@tf_export("abs")
+def abs(x, name=None):  # pylint: disable=redefined-builtin
   r"""Computes the absolute value of a tensor.
 
   Given a tensor `x` of complex numbers, this operation returns a tensor of type
@@ -252,7 +260,7 @@ def abs(x, name=None):
   Returns:
     A `Tensor` or `SparseTensor` the same size and type as `x` with absolute
       values.
-    Note, for `complex64` or `complex128' input, the returned `Tensor` will be
+    Note, for `complex64` or `complex128` input, the returned `Tensor` will be
       of type `float32` or `float64`, respectively.
   """
   with ops.name_scope(name, "Abs", [x]) as name:
@@ -306,6 +314,7 @@ class DivideDelegateWithName(object):
     return _div_python2(self.x, y, self.name)
 
 
+@tf_export("divide")
 def divide(x, y, name=None):
   """Computes Python style division of `x` by `y`."""
 
@@ -317,6 +326,7 @@ def divide(x, y, name=None):
     return x / y
 
 
+@tf_export("multiply")
 def multiply(x, y, name=None):
   return gen_math_ops._mul(x, y, name)
 
@@ -336,6 +346,7 @@ _mul.__doc__ = (
     gen_math_ops._mul.__doc__ + ("" if _mul.__doc__ is None else _mul.__doc__))
 
 
+@tf_export("subtract")
 def subtract(x, y, name=None):
   return gen_math_ops._sub(x, y, name)
 
@@ -356,6 +367,7 @@ _sub.__doc__ = (
 
 
 # pylint: disable=g-docstring-has-escape
+@tf_export("negative")
 def negative(x, name=None):
   """Computes numerical negative value element-wise.
 
@@ -404,6 +416,7 @@ def _neg(x, name=None):
 # pylint: enable=g-docstring-has-escape
 
 
+@tf_export("sign")
 def sign(x, name=None):
   """Returns an element-wise indication of the sign of a number.
 
@@ -434,6 +447,7 @@ def sign(x, name=None):
       return gen_math_ops.sign(x, name=name)
 
 
+@tf_export("square")
 def square(x, name=None):
   r"""Computes square of x element-wise.
 
@@ -456,6 +470,7 @@ def square(x, name=None):
       return gen_math_ops.square(x, name=name)
 
 
+@tf_export("sqrt")
 def sqrt(x, name=None):
   r"""Computes square root of x element-wise.
 
@@ -478,6 +493,7 @@ def sqrt(x, name=None):
       return gen_math_ops.sqrt(x, name=name)
 
 
+@tf_export("erf")
 def erf(x, name=None):
   """Computes the Gauss error function of `x` element-wise.
 
@@ -498,6 +514,7 @@ def erf(x, name=None):
       return gen_math_ops.erf(x, name=name)
 
 
+@tf_export("scalar_mul")
 def scalar_mul(scalar, x):
   """Multiplies a scalar times a `Tensor` or `IndexedSlices` object.
 
@@ -527,7 +544,8 @@ def scalar_mul(scalar, x):
     raise ValueError("Only scalar multiply works, got shape %s" % shape)
 
 
-def pow(x, y, name=None):
+@tf_export("pow")
+def pow(x, y, name=None):  # pylint: disable=redefined-builtin
   r"""Computes the power of one value to another.
 
   Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
@@ -554,6 +572,7 @@ def pow(x, y, name=None):
 
 
 # pylint: disable=redefined-builtin,redefined-outer-name
+@tf_export("complex")
 def complex(real, imag, name=None):
   r"""Converts two real numbers to a complex number.
 
@@ -595,6 +614,7 @@ def complex(real, imag, name=None):
     return gen_math_ops._complex(real, imag, Tout=Tout, name=name)
 
 
+@tf_export("real")
 def real(input, name=None):
   r"""Returns the real part of a complex (or real) tensor.
 
@@ -625,6 +645,7 @@ def real(input, name=None):
       return input
 
 
+@tf_export("imag")
 def imag(input, name=None):
   r"""Returns the imaginary part of a complex (or real) tensor.
 
@@ -654,6 +675,7 @@ def imag(input, name=None):
       return array_ops.zeros_like(input)
 
 
+@tf_export("angle")
 def angle(input, name=None):
   r"""Returns the element-wise argument of a complex (or real) tensor.
 
@@ -692,7 +714,8 @@ def angle(input, name=None):
 # pylint: enable=redefined-outer-name,redefined-builtin
 
 
-def round(x, name=None):
+@tf_export("round")
+def round(x, name=None):  # pylint: disable=redefined-builtin
   """Rounds the values of a tensor to the nearest integer, element-wise.
 
   Rounds half to even.  Also known as bankers rounding. If you want to round
@@ -718,6 +741,7 @@ def round(x, name=None):
     return gen_math_ops.round(x, name=name)
 
 
+@tf_export("cast")
 def cast(x, dtype, name=None):
   """Casts a tensor to a new type.
 
@@ -758,6 +782,7 @@ def cast(x, dtype, name=None):
       return gen_math_ops.cast(x, base_type, name=name)
 
 
+@tf_export("saturate_cast")
 def saturate_cast(value, dtype, name=None):
   """Performs a safe saturating cast of `value` to `dtype`.
 
@@ -791,6 +816,7 @@ def saturate_cast(value, dtype, name=None):
     return cast(value, dtype, name=name)
 
 
+@tf_export("to_float")
 def to_float(x, name="ToFloat"):
   """Casts a tensor to type `float32`.
 
@@ -807,6 +833,7 @@ def to_float(x, name="ToFloat"):
   return cast(x, dtypes.float32, name=name)
 
 
+@tf_export("to_double")
 def to_double(x, name="ToDouble"):
   """Casts a tensor to type `float64`.
 
@@ -823,6 +850,7 @@ def to_double(x, name="ToDouble"):
   return cast(x, dtypes.float64, name=name)
 
 
+@tf_export("to_int32")
 def to_int32(x, name="ToInt32"):
   """Casts a tensor to type `int32`.
 
@@ -839,6 +867,7 @@ def to_int32(x, name="ToInt32"):
   return cast(x, dtypes.int32, name=name)
 
 
+@tf_export("to_int64")
 def to_int64(x, name="ToInt64"):
   """Casts a tensor to type `int64`.
 
@@ -855,6 +884,7 @@ def to_int64(x, name="ToInt64"):
   return cast(x, dtypes.int64, name=name)
 
 
+@tf_export("to_bfloat16")
 def to_bfloat16(x, name="ToBFloat16"):
   """Casts a tensor to type `bfloat16`.
 
@@ -950,6 +980,7 @@ _TRUEDIV_TABLE = {
     dtypes.int16: dtypes.float32,
     dtypes.int32: dtypes.float64,
     dtypes.int64: dtypes.float64,
+    dtypes.bfloat16: None,
     dtypes.float16: None,
     dtypes.float32: None,
     dtypes.float64: None,
@@ -1027,6 +1058,7 @@ def _div_python2(x, y, name=None):
       return gen_math_ops._floor_div(x, y, name=name)
 
 
+@tf_export("truediv")
 def truediv(x, y, name=None):
   """Divides x / y elementwise (using Python 3 division operator semantics).
 
@@ -1058,6 +1090,7 @@ def truediv(x, y, name=None):
   return _truediv_python3(x, y, name)
 
 
+@tf_export("div")
 def div(x, y, name=None):
   """Divides x / y elementwise (using Python 2 division operator semantics).
 
@@ -1085,6 +1118,7 @@ mod = gen_math_ops._floor_mod
 
 # TODO(aselle): Deprecate this once all internal functionality uses
 # tf.truncatediv
+@tf_export("floordiv")
 def floordiv(x, y, name=None):
   """Divides `x / y` elementwise, rounding toward the most negative integer.
 
@@ -1155,6 +1189,7 @@ _OverrideBinaryOperatorHelper(gen_math_ops._floor_mod, "mod")
 _OverrideBinaryOperatorHelper(pow, "pow")
 
 
+@tf_export("logical_xor")
 def logical_xor(x, y, name="LogicalXor"):
   """x ^ y = (x | y) & ~(x & y)."""
   # TODO(alemi) Make this a cwise op if people end up relying on it.
@@ -1174,7 +1209,8 @@ ops.Tensor._override_operator("__gt__", gen_math_ops.greater)
 ops.Tensor._override_operator("__ge__", gen_math_ops.greater_equal)
 
 
-def range(start, limit=None, delta=1, dtype=None, name="range"):
+@tf_export("range")
+def range(start, limit=None, delta=1, dtype=None, name="range"):  # pylint: disable=redefined-builtin
   """Creates a sequence of numbers.
 
   Creates a sequence of numbers that begins at `start` and extends by
@@ -1279,6 +1315,7 @@ def _may_reduce_to_scalar(keepdims, axis, reduction_indices, output):
   return output
 
 
+@tf_export("reduce_sum")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_sum(input_tensor,
@@ -1339,6 +1376,7 @@ def reduce_sum(input_tensor,
                                    name=name))
 
 
+@tf_export("count_nonzero")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def count_nonzero(input_tensor,
@@ -1405,6 +1443,7 @@ def count_nonzero(input_tensor,
         dtype=dtype)
 
 
+@tf_export("reduce_mean")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_mean(input_tensor,
@@ -1436,7 +1475,7 @@ def reduce_mean(input_tensor,
     input_tensor: The tensor to reduce. Should have numeric type.
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
-      `[-rank(input_tensor), rank(input_tensor))`.
+      `[-rank(input_tensor), rank(input_tensor)]`.
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
@@ -1476,6 +1515,7 @@ def reduce_mean(input_tensor,
                                    name=name))
 
 
+@tf_export("reduce_prod")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_prod(input_tensor,
@@ -1525,6 +1565,7 @@ def reduce_prod(input_tensor,
                                    name=name))
 
 
+@tf_export("reduce_min")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_min(input_tensor,
@@ -1573,6 +1614,7 @@ def reduce_min(input_tensor,
                                    name=name))
 
 
+@tf_export("reduce_max")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_max(input_tensor,
@@ -1621,6 +1663,7 @@ def reduce_max(input_tensor,
                                    name=name))
 
 
+@tf_export("reduce_all")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_all(input_tensor,
@@ -1678,6 +1721,7 @@ def reduce_all(input_tensor,
                                    name=name))
 
 
+@tf_export("reduce_any")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_any(input_tensor,
@@ -1735,6 +1779,7 @@ def reduce_any(input_tensor,
                                    name=name))
 
 
+@tf_export("reduce_logsumexp")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_logsumexp(input_tensor,
@@ -1799,15 +1844,15 @@ def reduce_logsumexp(input_tensor,
         reduce_sum(
             gen_math_ops.exp(input_tensor - my_max),
             axis,
-            keepdims=True,
-            reduction_indices=reduction_indices)) + my_max
+            keepdims=keepdims,
+            reduction_indices=reduction_indices))
     if not keepdims:
-      if isinstance(axis, int):
-        axis = [axis]
-      result = array_ops.squeeze(result, axis)
+      my_max = array_ops.reshape(my_max, array_ops.shape(result))
+    result += my_max
     return _may_reduce_to_scalar(keepdims, axis, reduction_indices, result)
 
 
+@tf_export("trace", "linalg.trace")
 def trace(x, name=None):
   """Compute the trace of a tensor `x`.
 
@@ -1849,6 +1894,7 @@ def trace(x, name=None):
     return reduce_sum(array_ops.matrix_diag_part(x), [-1], name=name)
 
 
+@tf_export("matmul")
 def matmul(a,
            b,
            transpose_a=False,
@@ -2002,7 +2048,7 @@ def matmul(a,
       # matmul currently doesn't handle bfloat16 inputs.
       use_sparse_matmul = True
     if use_sparse_matmul:
-      return sparse_matmul(
+      ret = sparse_matmul(
           a,
           b,
           transpose_a=transpose_a,
@@ -2010,6 +2056,12 @@ def matmul(a,
           a_is_sparse=a_is_sparse,
           b_is_sparse=b_is_sparse,
           name=name)
+      # sparse_matmul always returns float32, even with
+      # bfloat16 inputs. This prevents us from configuring bfloat16 training.
+      # casting to bfloat16 also matches non-sparse matmul behavior better.
+      if a.dtype == dtypes.bfloat16 and b.dtype == dtypes.bfloat16:
+        ret = cast(ret, dtypes.bfloat16)
+      return ret
     else:
       return gen_math_ops._mat_mul(
           a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
@@ -2095,6 +2147,7 @@ def _as_indexed_slices_list(inputs, optimize=True):
   return casted_outputs
 
 
+@tf_export("add_n")
 def add_n(inputs, name=None):
   """Adds all input tensors element-wise.
 
@@ -2124,6 +2177,7 @@ def add_n(inputs, name=None):
   return gen_math_ops._add_n(inputs, name=name)
 
 
+@tf_export("accumulate_n")
 def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None):
   """Returns the element-wise sum of a list of tensors.
 
@@ -2208,6 +2262,7 @@ def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None):
             ref, var_name=var.op.name, name=name)
 
 
+@tf_export("nn.sigmoid", "sigmoid")
 def sigmoid(x, name=None):
   """Computes sigmoid of `x` element-wise.
 
@@ -2230,6 +2285,7 @@ def sigmoid(x, name=None):
     return gen_math_ops._sigmoid(x, name=name)
 
 
+@tf_export("log_sigmoid")
 def log_sigmoid(x, name=None):
   """Computes log sigmoid of `x` element-wise.
 
@@ -2248,6 +2304,7 @@ def log_sigmoid(x, name=None):
     return gen_math_ops._neg(gen_nn_ops.softplus(-x), name=name)
 
 
+@tf_export("nn.tanh", "tanh")
 def tanh(x, name=None):
   """Computes hyperbolic tangent of `x` element-wise.
 
@@ -2268,6 +2325,7 @@ def tanh(x, name=None):
       return gen_math_ops._tanh(x, name=name)
 
 
+@tf_export("bincount")
 def bincount(arr,
              weights=None,
              minlength=None,
@@ -2314,6 +2372,7 @@ def bincount(arr,
   return gen_math_ops.bincount(arr, output_size, weights)
 
 
+@tf_export("cumsum")
 def cumsum(x, axis=0, exclusive=False, reverse=False, name=None):
   """Compute the cumulative sum of the tensor `x` along `axis`.
 
@@ -2365,6 +2424,7 @@ def cumsum(x, axis=0, exclusive=False, reverse=False, name=None):
         x, axis, exclusive=exclusive, reverse=reverse, name=name)
 
 
+@tf_export("cumprod")
 def cumprod(x, axis=0, exclusive=False, reverse=False, name=None):
   """Compute the cumulative product of the tensor `x` along `axis`.
 
@@ -2416,6 +2476,7 @@ def cumprod(x, axis=0, exclusive=False, reverse=False, name=None):
         x, axis, exclusive=exclusive, reverse=reverse, name=name)
 
 
+@tf_export("conj")
 def conj(x, name=None):
   r"""Returns the complex conjugate of a complex number.
 
@@ -2494,6 +2555,244 @@ def reduced_shape(input_shape, axes):
       ])  # [1, 1]
 
 
+def _unsorted_segment_N(data, segment_ids, num_segments):
+  """ Helper function for unsorted_segment_mean/_sqrtN. Computes the number
+      of segment entries with 0-entries set to 1 to allow division by N.
+  """
+  # bincount doesn't support negative indices so we use unsorted_segment_sum
+  ones_tensor = array_ops.ones(segment_ids.shape, dtype=data.dtype)
+  N = gen_math_ops.unsorted_segment_sum(ones_tensor, segment_ids, num_segments)
+  # add dimensions for all non-reduced axes
+  ndims_output = data.shape.ndims - segment_ids.shape.ndims
+  broadcast_shape = [num_segments] + [1] * ndims_output
+  N = array_ops.reshape(N, broadcast_shape)
+  return gen_math_ops.maximum(N, 1)
+
+
+@tf_export("unsorted_segment_mean")
+def unsorted_segment_mean(data, segment_ids, num_segments, name=None):
+  r""" Computes the mean along segments of a tensor.
+
+  Read @{$math_ops#segmentation$the section on segmentation} for an explanation
+  of segments.
+
+  This operator is similar to the unsorted segment sum operator found
+  [here](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+  Instead of computing the sum over segments, it computes the mean of all
+  entries belonging to a segment such that:
+
+  \\(output_i = 1/N_i \sum data_j\\) where the sum is over `j` such
+  that `segment_ids[j] == i` with \\N_i\\ being the number of occurrences
+  of id \\i\\.
+
+  If there is no entry for a given segment ID `i`, it outputs 0.
+
+  segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+  first dimension.
+
+  output: Has same shape as data, except for dimension 0 which
+  has size `num_segments`.
+  """
+  with ops.name_scope(name, "UnsortedSegmentMean"):
+    data = ops.convert_to_tensor(data)
+    segment_ids = ops.convert_to_tensor(segment_ids)
+    N = _unsorted_segment_N(data, segment_ids, num_segments)
+    summed = gen_math_ops.unsorted_segment_sum(data, segment_ids, num_segments)
+    return summed / N
+
+
+@tf_export("unsorted_segment_sqrt_n")
+def unsorted_segment_sqrt_n(data, segment_ids, num_segments, name=None):
+  r"""Computes the sum along segments of a tensor divided by the sqrt(N).
+
+  Read @{$math_ops#segmentation$the section on segmentation} for an explanation
+  of segments.
+
+  This operator is similar to the unsorted segment sum operator found
+  [here](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+  Additionally to computing the sum over segments, it divides the results by
+  sqrt(N).
+
+  \\(output_i = 1/sqrt(N_i) \sum data_j\\) where the sum is over `j` such
+  that `segment_ids[j] == i` with \\N_i\\ being the number of occurrences
+  of id \\i\\.
+
+  If there is no entry for a given segment ID `i`, it outputs 0.
+
+  Note that this op only supports floating point and complex dtypes,
+  due to tf.sqrt only supporting these types.
+
+  segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+  first dimension.
+
+  output: Has same shape as data, except for dimension 0 which
+  has size `num_segments`.
+  """
+  with ops.name_scope(name, "UnsortedSegmentSqrtN"):
+    data = ops.convert_to_tensor(data)
+    segment_ids = ops.convert_to_tensor(segment_ids)
+    N = _unsorted_segment_N(data, segment_ids, num_segments)
+    summed = gen_math_ops.unsorted_segment_sum(data, segment_ids, num_segments)
+    return summed / gen_math_ops.sqrt(N)
+
+
+@tf_export("sparse_segment_sum")
+def sparse_segment_sum(data, indices, segment_ids, name=None,
+                       num_segments=None):
+  r"""Computes the sum along sparse segments of a tensor.
+
+  Read @{$math_ops#Segmentation$the section on segmentation} for an explanation
+  of segments.
+
+  Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
+  dimension, selecting a subset of dimension 0, specified by `indices`.
+  `segment_ids` is allowed to have missing ids, in which case the output will
+  be zeros at those indices. In those cases `num_segments` is used to determine
+  the size of the output.
+
+  For example:
+
+  ```python
+  c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+
+  # Select two rows, one segment.
+  tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
+  # => [[0 0 0 0]]
+
+  # Select two rows, two segment.
+  tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
+  # => [[ 1  2  3  4]
+  #     [-1 -2 -3 -4]]
+
+  # With missing segment ids.
+  tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 2]),
+                        num_segments=4)
+  # => [[ 1  2  3  4]
+  #     [ 0  0  0  0]
+  #     [-1 -2 -3 -4]
+  #     [ 0  0  0  0]]
+
+  # Select all rows, two segments.
+  tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
+  # => [[0 0 0 0]
+  #     [5 6 7 8]]
+
+  # Which is equivalent to:
+  tf.segment_sum(c, tf.constant([0, 0, 1]))
+  ```
+
+  Args:
+    data: A `Tensor` with data that will be assembled in the output.
+    indices: A 1-D `Tensor` with indices into `data`. Has same rank as
+      `segment_ids`.
+    segment_ids: A 1-D `Tensor` with indices into the output `Tensor`.
+      Values should be sorted and can be repeated.
+    name: A name for the operation (optional).
+    num_segments: An optional int32 scalar. Indicates the size of the output
+      `Tensor`.
+
+  Returns:
+    A `tensor` of the shape as data, except for dimension 0 which
+    has size `k`, the number of segments specified via `num_segments` or
+    inferred for the last element in `segments_ids`.
+  """
+  if num_segments is not None:
+    return gen_math_ops.sparse_segment_sum_with_num_segments(
+        data=data,
+        indices=indices,
+        segment_ids=segment_ids,
+        num_segments=num_segments,
+        name=name)
+  else:
+    return gen_math_ops.sparse_segment_sum(
+        data=data,
+        indices=indices,
+        segment_ids=segment_ids,
+        name=name)
+
+
+@tf_export("sparse_segment_mean")
+def sparse_segment_mean(data, indices, segment_ids, name=None,
+                        num_segments=None):
+  r"""Computes the mean along sparse segments of a tensor.
+
+  Read @{$math_ops#Segmentation$the section on segmentation} for an explanation
+  of segments.
+
+  Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
+  dimension, selecting a subset of dimension 0, specified by `indices`.
+  `segment_ids` is allowed to have missing ids, in which case the output will
+  be zeros at those indices. In those cases `num_segments` is used to determine
+  the size of the output.
+
+  Args:
+    data: A `Tensor` with data that will be assembled in the output.
+    indices: A 1-D `Tensor` with indices into `data`. Has same rank as
+      `segment_ids`.
+    segment_ids: A 1-D `Tensor` with indices into the output `Tensor`.
+      Values should be sorted and can be repeated.
+    name: A name for the operation (optional).
+    num_segments: An optional int32 scalar. Indicates the size of the output
+      `Tensor`.
+
+  Returns:
+    A `tensor` of the shape as data, except for dimension 0 which
+    has size `k`, the number of segments specified via `num_segments` or
+    inferred for the last element in `segments_ids`.
+  """
+  if num_segments is not None:
+    return gen_math_ops.sparse_segment_mean_with_num_segments(
+        data=data,
+        indices=indices,
+        segment_ids=segment_ids,
+        num_segments=num_segments,
+        name=name)
+  else:
+    return gen_math_ops.sparse_segment_mean(
+        data=data,
+        indices=indices,
+        segment_ids=segment_ids,
+        name=name)
+
+
+@tf_export("sparse_segment_sqrt_n")
+def sparse_segment_sqrt_n(data, indices, segment_ids, name=None,
+                          num_segments=None):
+  r"""Computes the sum along sparse segments of a tensor divided by the sqrt(N).
+
+  `N` is the size of the segment being reduced.
+
+  Args:
+    data: A `Tensor` with data that will be assembled in the output.
+    indices: A 1-D `Tensor` with indices into `data`. Has same rank as
+      `segment_ids`.
+    segment_ids: A 1-D `Tensor` with indices into the output `Tensor`.
+      Values should be sorted and can be repeated.
+    name: A name for the operation (optional).
+    num_segments: An optional int32 scalar. Indicates the size of the output
+      `Tensor`.
+
+  Returns:
+    A `tensor` of the shape as data, except for dimension 0 which
+    has size `k`, the number of segments specified via `num_segments` or
+    inferred for the last element in `segments_ids`.
+  """
+  if num_segments is not None:
+    return gen_math_ops.sparse_segment_sqrt_n_with_num_segments(
+        data=data,
+        indices=indices,
+        segment_ids=segment_ids,
+        num_segments=num_segments,
+        name=name)
+  else:
+    return gen_math_ops.sparse_segment_sqrt_n(
+        data=data,
+        indices=indices,
+        segment_ids=segment_ids,
+        name=name)
+
+
+@tf_export("tensordot", "linalg.tensordot")
 def tensordot(a, b, axes, name=None):
   r"""Tensor contraction of a and b along specified axes.
 
@@ -2611,10 +2910,14 @@ def tensordot(a, b, axes, name=None):
     """Generates two sets of contraction axes for the two tensor arguments."""
     a_shape = a.get_shape()
     if isinstance(axes, compat.integral_types):
-      if axes < 1:
-        raise ValueError("'axes' must be at least 1.")
+      if axes < 0:
+        raise ValueError("'axes' must be at least 0.")
       if a_shape.ndims is not None:
-        return range(a_shape.ndims - axes, a_shape.ndims), range(axes)
+        if axes > a_shape.ndims:
+          raise ValueError("'axes' must not be larger than the number of "
+                           "dimensions of tensor %s." % a)
+        return (list(xrange(a_shape.ndims - axes, a_shape.ndims)),
+                list(xrange(axes)))
       else:
         rank = array_ops.rank(a)
         return (range(rank - axes, rank, dtype=dtypes.int32),
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index bd26ff66961c858865c8a61469abac0b783ed645..d314124ccd9bc8b7676e6926830a8eb1e0315f5f 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -105,7 +105,7 @@ class LogSumExpTest(test_util.TensorFlowTestCase):
     for dtype in [np.float16, np.float32, np.double]:
       x_np = np.random.rand(5, 5).astype(dtype)
       with self.test_session(use_gpu=True):
-        y_tf_np = math_ops.reduce_logsumexp(x_np, keep_dims=True).eval()
+        y_tf_np = math_ops.reduce_logsumexp(x_np, keepdims=True).eval()
         self.assertEqual(y_tf_np.ndim, x_np.ndim)
         y_np = log(np.sum(exp(x_np), keepdims=True))
         self.assertAllClose(y_tf_np, y_np)
diff --git a/tensorflow/python/ops/matmul_benchmark.py b/tensorflow/python/ops/matmul_benchmark.py
index f95cf08de1aaa47550fa344dc9f964c4f812cd68..6e5fe74290a219d07945998be2677176ca693cd9 100644
--- a/tensorflow/python/ops/matmul_benchmark.py
+++ b/tensorflow/python/ops/matmul_benchmark.py
@@ -95,8 +95,8 @@ class MatmulBenchmark(test.Benchmark):
         num_items = n * m * k * 2
         throughput = num_items * num_iters / duration / 1e9
         print('%s %s input_info:%s %d %.4fsec, %.4fGitems/s.' %
-              (device, str(dtype), str(n) + 'x' + str(m) + 'x' + str(k) + ',ta:'
-               + str(transpose_a) + '.tb:' + str(transpose_b), num_iters,
+              (device, str(dtype), str(n) + 'x' + str(m) + 'x' + str(k) +
+               ',ta:' + str(transpose_a) + '.tb:' + str(transpose_b), num_iters,
                duration, throughput))
 
     name_template = ('matmul_{device}_{dtype}_input_info_{inputinfo}')
@@ -112,7 +112,8 @@ class MatmulBenchmark(test.Benchmark):
     return duration
 
   def run_test_gpu(self, n, m, k, transpose_a, transpose_b, dtype, num_iters):
-    self.run_graph(test.gpu_device_name(), n, m, k, transpose_a, transpose_b, num_iters, dtype)
+    self.run_graph(test.gpu_device_name(), n, m, k, transpose_a, transpose_b,
+                   num_iters, dtype)
 
   def test_round(self, num_iters):
     dtypes = [np.float32, np.float64]
@@ -124,8 +125,8 @@ class MatmulBenchmark(test.Benchmark):
         self.run_test_gpu(n, m, k, transpose_a, transpose_b, dtype, num_iters)
 
       for n, m, k, (transpose_a, transpose_b) in itertools.product(
-          [200], [1, 8, 20], [10000], [(False, False), (True, False), (False,
-                                                                       True)]):
+          [200], [1, 8, 20], [10000], [(False, False), (True, False),
+                                       (False, True)]):
         self.run_test_gpu(n, m, k, transpose_a, transpose_b, dtype, num_iters)
 
       for (n, m, k), (transpose_a, transpose_b) in itertools.product(
diff --git a/tensorflow/python/ops/matmul_benchmark_test.py b/tensorflow/python/ops/matmul_benchmark_test.py
index 5a9c0a7a4951bbbc1d201f6fbc557e9a996a3655..3df0c66ef9c50909dd8c03b75654d6cf0fd7d709 100644
--- a/tensorflow/python/ops/matmul_benchmark_test.py
+++ b/tensorflow/python/ops/matmul_benchmark_test.py
@@ -33,11 +33,11 @@ def BuildGraphTest(n, m, k, transpose_a, transpose_b, dtype):
 
   def Test(self):
     if not googletest.is_gpu_available():
-      tf_logging.info("Skipping BuildGraphTest %s", (n, m, k, transpose_a,
-                                                     transpose_b))
+      tf_logging.info("Skipping BuildGraphTest %s",
+                      (n, m, k, transpose_a, transpose_b))
       return
-    tf_logging.info("Testing BuildGraphTest %s", (n, m, k, transpose_a,
-                                                  transpose_b))
+    tf_logging.info("Testing BuildGraphTest %s",
+                    (n, m, k, transpose_a, transpose_b))
     self._VerifyBuildGraph(n, m, k, transpose_a, transpose_b, dtype)
 
   return Test
@@ -47,11 +47,11 @@ def RunGraphTest(n, m, k, transpose_a, transpose_b, dtype):
 
   def Test(self):
     if not googletest.is_gpu_available():
-      tf_logging.info("Skipping RunGraphTest %s", (n, m, k, transpose_a,
-                                                   transpose_b))
+      tf_logging.info("Skipping RunGraphTest %s",
+                      (n, m, k, transpose_a, transpose_b))
       return
-    tf_logging.info("Testing RunGraphTest %s", (n, m, k, transpose_a,
-                                                transpose_b))
+    tf_logging.info("Testing RunGraphTest %s",
+                    (n, m, k, transpose_a, transpose_b))
     self._VerifyRunGraph(n, m, k, transpose_a, transpose_b, dtype)
 
   return Test
@@ -71,40 +71,41 @@ class MatmulBenchmarkTest(googletest.TestCase):
   def _VerifyBuildGraph(self, n, m, k, transpose_a, transpose_b, dtype):
     graph = ops.Graph()
     with graph.as_default():
-      matmul_benchmark.build_graph(googletest.gpu_device_name(), n, m, k, transpose_a, transpose_b,
-                                   dtype)
+      matmul_benchmark.build_graph(googletest.gpu_device_name(), n, m, k,
+                                   transpose_a, transpose_b, dtype)
       gd = graph.as_graph_def()
-      dev=googletest.gpu_device_name()
+      dev = googletest.gpu_device_name()
       proto_expected = """
-      node { name: "random_uniform/shape" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform/min" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform/max" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform/RandomUniform" op: "RandomUniform" input: "random_uniform/shape" device: \""""+ dev +"""\" }
-      node { name: "random_uniform/sub" op: "Sub" input: "random_uniform/max" input: "random_uniform/min" device: \""""+ dev +"""\" }
-      node { name: "random_uniform/mul" op: "Mul" input: "random_uniform/RandomUniform" input: "random_uniform/sub" device: \""""+ dev +"""\" }
-      node { name: "random_uniform" op: "Add" input: "random_uniform/mul" input: "random_uniform/min" device: \""""+ dev +"""\" }
-      node { name: "Variable" op: "VariableV2" device: \""""+ dev +"""\" }
-      node { name: "Variable/Assign" op: "Assign" input: "Variable" input: "random_uniform" device: \""""+ dev +"""\" }
-      node { name: "Variable/read" op: "Identity" input: "Variable" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/shape" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/min" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/max" op: "Const" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/RandomUniform" op: "RandomUniform" input: "random_uniform_1/shape" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/sub" op: "Sub" input: "random_uniform_1/max" input: "random_uniform_1/min" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1/mul" op: "Mul" input: "random_uniform_1/RandomUniform" input: "random_uniform_1/sub" device: \""""+ dev +"""\" }
-      node { name: "random_uniform_1" op: "Add" input: "random_uniform_1/mul" input: "random_uniform_1/min" device: \""""+ dev +"""\" }
-      node { name: "Variable_1" op: "VariableV2" device: \""""+ dev +"""\" }
-      node { name: "Variable_1/Assign" op: "Assign" input: "Variable_1" input: "random_uniform_1" device: \""""+ dev +"""\" }
-      node { name: "Variable_1/read" op: "Identity" input: "Variable_1" device: \""""+ dev +"""\" }
-      node { name: "MatMul" op: "MatMul" input: "Variable/read" input: "Variable_1/read" device: \""""+ dev +"""\" }
-      node { name: "group_deps" op: "NoOp" input: "^MatMul" device: \""""+ dev +"""\" }
+      node { name: "random_uniform/shape" op: "Const" device: \"""" + dev + """\" }
+      node { name: "random_uniform/min" op: "Const" device: \"""" + dev + """\" }
+      node { name: "random_uniform/max" op: "Const" device: \"""" + dev + """\" }
+      node { name: "random_uniform/RandomUniform" op: "RandomUniform" input: "random_uniform/shape" device: \"""" + dev + """\" }
+      node { name: "random_uniform/sub" op: "Sub" input: "random_uniform/max" input: "random_uniform/min" device: \"""" + dev + """\" }
+      node { name: "random_uniform/mul" op: "Mul" input: "random_uniform/RandomUniform" input: "random_uniform/sub" device: \"""" + dev + """\" }
+      node { name: "random_uniform" op: "Add" input: "random_uniform/mul" input: "random_uniform/min" device: \"""" + dev + """\" }
+      node { name: "Variable" op: "VariableV2" device: \"""" + dev + """\" }
+      node { name: "Variable/Assign" op: "Assign" input: "Variable" input: "random_uniform" device: \"""" + dev + """\" }
+      node { name: "Variable/read" op: "Identity" input: "Variable" device: \"""" + dev + """\" }
+      node { name: "random_uniform_1/shape" op: "Const" device: \"""" + dev + """\" }
+      node { name: "random_uniform_1/min" op: "Const" device: \"""" + dev + """\" }
+      node { name: "random_uniform_1/max" op: "Const" device: \"""" + dev + """\" }
+      node { name: "random_uniform_1/RandomUniform" op: "RandomUniform" input: "random_uniform_1/shape" device: \"""" + dev + """\" }
+      node { name: "random_uniform_1/sub" op: "Sub" input: "random_uniform_1/max" input: "random_uniform_1/min" device: \"""" + dev + """\" }
+      node { name: "random_uniform_1/mul" op: "Mul" input: "random_uniform_1/RandomUniform" input: "random_uniform_1/sub" device: \"""" + dev + """\" }
+      node { name: "random_uniform_1" op: "Add" input: "random_uniform_1/mul" input: "random_uniform_1/min" device: \"""" + dev + """\" }
+      node { name: "Variable_1" op: "VariableV2" device: \"""" + dev + """\" }
+      node { name: "Variable_1/Assign" op: "Assign" input: "Variable_1" input: "random_uniform_1" device: \"""" + dev + """\" }
+      node { name: "Variable_1/read" op: "Identity" input: "Variable_1" device: \"""" + dev + """\" }
+      node { name: "MatMul" op: "MatMul" input: "Variable/read" input: "Variable_1/read" device: \"""" + dev + """\" }
+      node { name: "group_deps" op: "NoOp" input: "^MatMul" device: \"""" + dev + """\" }
                        """
       self.assertProtoEquals(str(proto_expected), self._StripGraph(gd))
 
   def _VerifyRunGraph(self, n, m, k, transpose_a, transpose_b, dtype):
     benchmark_instance = matmul_benchmark.MatmulBenchmark()
-    duration = benchmark_instance.run_graph(googletest.gpu_device_name(), n, m, k, transpose_a,
-                                            transpose_b, 1, dtype)
+    duration = benchmark_instance.run_graph(googletest.gpu_device_name(), n, m,
+                                            k, transpose_a, transpose_b, 1,
+                                            dtype)
     self.assertTrue(duration > 1e-6)
 
 
@@ -113,8 +114,8 @@ if __name__ == "__main__":
   index = 0
   for _dtype in dtypes:
     for _n, _m, (_transpose_a, _transpose_b) in itertools.product(
-        [512, 1024], [1, 8, 16, 128], [(False, False), (True, False), (False,
-                                                                       True)]):
+        [512, 1024], [1, 8, 16, 128], [(False, False), (True, False),
+                                       (False, True)]):
       _k = _n
       setattr(MatmulBenchmarkTest, "testBuildGraph_" + str(index),
               BuildGraphTest(_n, _m, _k, _transpose_a, _transpose_b, _dtype))
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index e04121ee31d1b6c82151bf7415b3e73614b24781..7776ff08c4f55c43947010f313d8167596b15db7 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -34,6 +34,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.util.deprecation import deprecated
+from tensorflow.python.util.tf_export import tf_export
 
 
 def metric_variable(shape, dtype, validate_shape=True, name=None):
@@ -99,27 +100,29 @@ def _remove_squeezable_dimensions(predictions, labels, weights):
     # Use dynamic rank.
     weights_rank_tensor = array_ops.rank(weights)
     rank_diff = weights_rank_tensor - array_ops.rank(predictions)
+
     def _maybe_expand_weights():
       return control_flow_ops.cond(
           math_ops.equal(rank_diff, -1),
-          lambda: array_ops.expand_dims(weights, [-1]),
-          lambda: weights)
+          lambda: array_ops.expand_dims(weights, [-1]), lambda: weights)
+
     # Don't attempt squeeze if it will fail based on static check.
     if ((weights_rank is not None) and
         (not weights_shape.dims[-1].is_compatible_with(1))):
       maybe_squeeze_weights = lambda: weights
     else:
       maybe_squeeze_weights = lambda: array_ops.squeeze(weights, [-1])
+
     def _maybe_adjust_weights():
       return control_flow_ops.cond(
-          math_ops.equal(rank_diff, 1),
-          maybe_squeeze_weights,
+          math_ops.equal(rank_diff, 1), maybe_squeeze_weights,
           _maybe_expand_weights)
+
     # If weights are scalar, do nothing. Otherwise, try to add or remove a
     # dimension to match predictions.
     weights = control_flow_ops.cond(
-        math_ops.equal(weights_rank_tensor, 0),
-        lambda: weights, _maybe_adjust_weights)
+        math_ops.equal(weights_rank_tensor, 0), lambda: weights,
+        _maybe_adjust_weights)
   return predictions, labels, weights
 
 
@@ -164,18 +167,18 @@ def _maybe_expand_labels(labels, predictions):
         if predictions_rank == labels_rank + 1:
           return array_ops.expand_dims(labels, -1, name=scope)
         raise ValueError(
-            'Unexpected labels shape %s for predictions shape %s.' % (
-                labels.get_shape(), predictions.get_shape()))
+            'Unexpected labels shape %s for predictions shape %s.' %
+            (labels.get_shape(), predictions.get_shape()))
 
     # Otherwise, use dynamic shape.
     return control_flow_ops.cond(
-        math_ops.equal(array_ops.rank(predictions), array_ops.rank(labels) + 1),
-        lambda: array_ops.expand_dims(labels, -1, name=scope),
-        lambda: labels)
+        math_ops.equal(array_ops.rank(predictions),
+                       array_ops.rank(labels) + 1),
+        lambda: array_ops.expand_dims(labels, -1, name=scope), lambda: labels)
 
 
 def _safe_div(numerator, denominator, name):
-  """Divides two values, returning 0 if the denominator is <= 0.
+  """Divides two tensors element-wise, returning 0 if the denominator is <= 0.
 
   Args:
     numerator: A real `Tensor`.
@@ -185,11 +188,11 @@ def _safe_div(numerator, denominator, name):
   Returns:
     0 if `denominator` <= 0, else `numerator` / `denominator`
   """
-  return array_ops.where(
-      math_ops.greater(denominator, 0),
-      math_ops.truediv(numerator, denominator),
-      0,
-      name=name)
+  t = math_ops.truediv(numerator, denominator)
+  zero = array_ops.zeros_like(t, dtype=denominator.dtype)
+  condition = math_ops.greater(denominator, zero)
+  zero = math_ops.cast(zero, t.dtype)
+  return array_ops.where(condition, t, zero, name=name)
 
 
 def _safe_scalar_div(numerator, denominator, name):
@@ -262,8 +265,12 @@ def _streaming_confusion_matrix(labels, predictions, num_classes, weights=None):
   return total_cm, update_op
 
 
-def mean(values, weights=None, metrics_collections=None,
-         updates_collections=None, name=None):
+@tf_export('metrics.mean')
+def mean(values,
+         weights=None,
+         metrics_collections=None,
+         updates_collections=None,
+         name=None):
   """Computes the (weighted) mean of the given values.
 
   The `mean` function creates two local variables, `total` and `count`
@@ -337,8 +344,13 @@ def mean(values, weights=None, metrics_collections=None,
     return mean_t, update_op
 
 
-def accuracy(labels, predictions, weights=None, metrics_collections=None,
-             updates_collections=None, name=None):
+@tf_export('metrics.accuracy')
+def accuracy(labels,
+             predictions,
+             weights=None,
+             metrics_collections=None,
+             updates_collections=None,
+             name=None):
   """Calculates how often `predictions` matches `labels`.
 
   The `accuracy` function creates two local variables, `total` and
@@ -392,12 +404,15 @@ def accuracy(labels, predictions, weights=None, metrics_collections=None,
   if labels.dtype != predictions.dtype:
     predictions = math_ops.cast(predictions, labels.dtype)
   is_correct = math_ops.to_float(math_ops.equal(predictions, labels))
-  return mean(is_correct, weights, metrics_collections,
-              updates_collections, name or 'accuracy')
+  return mean(is_correct, weights, metrics_collections, updates_collections,
+              name or 'accuracy')
 
 
-def _confusion_matrix_at_thresholds(
-    labels, predictions, thresholds, weights=None, includes=None):
+def _confusion_matrix_at_thresholds(labels,
+                                    predictions,
+                                    thresholds,
+                                    weights=None,
+                                    includes=None):
   """Computes true_positives, false_negatives, true_negatives, false_positives.
 
   This function creates up to four local variables, `true_positives`,
@@ -495,8 +510,8 @@ def _confusion_matrix_at_thresholds(
   if weights is not None:
     weights = weights_broadcast_ops.broadcast_weights(
         math_ops.to_float(weights), predictions)
-    weights_tiled = array_ops.tile(array_ops.reshape(
-        weights, [1, -1]), [num_thresholds, 1])
+    weights_tiled = array_ops.tile(
+        array_ops.reshape(weights, [1, -1]), [num_thresholds, 1])
     thresh_tiled.get_shape().assert_is_compatible_with(
         weights_tiled.get_shape())
   else:
@@ -512,8 +527,9 @@ def _confusion_matrix_at_thresholds(
         math_ops.logical_and(label_is_pos, pred_is_pos))
     if weights_tiled is not None:
       is_true_positive *= weights_tiled
-    update_ops['tp'] = state_ops.assign_add(
-        true_p, math_ops.reduce_sum(is_true_positive, 1))
+    update_ops['tp'] = state_ops.assign_add(true_p,
+                                            math_ops.reduce_sum(
+                                                is_true_positive, 1))
     values['tp'] = true_p
 
   if 'fn' in includes:
@@ -523,8 +539,9 @@ def _confusion_matrix_at_thresholds(
         math_ops.logical_and(label_is_pos, pred_is_neg))
     if weights_tiled is not None:
       is_false_negative *= weights_tiled
-    update_ops['fn'] = state_ops.assign_add(
-        false_n, math_ops.reduce_sum(is_false_negative, 1))
+    update_ops['fn'] = state_ops.assign_add(false_n,
+                                            math_ops.reduce_sum(
+                                                is_false_negative, 1))
     values['fn'] = false_n
 
   if 'tn' in includes:
@@ -534,8 +551,9 @@ def _confusion_matrix_at_thresholds(
         math_ops.logical_and(label_is_neg, pred_is_neg))
     if weights_tiled is not None:
       is_true_negative *= weights_tiled
-    update_ops['tn'] = state_ops.assign_add(
-        true_n, math_ops.reduce_sum(is_true_negative, 1))
+    update_ops['tn'] = state_ops.assign_add(true_n,
+                                            math_ops.reduce_sum(
+                                                is_true_negative, 1))
     values['tn'] = true_n
 
   if 'fp' in includes:
@@ -545,16 +563,24 @@ def _confusion_matrix_at_thresholds(
         math_ops.logical_and(label_is_neg, pred_is_pos))
     if weights_tiled is not None:
       is_false_positive *= weights_tiled
-    update_ops['fp'] = state_ops.assign_add(
-        false_p, math_ops.reduce_sum(is_false_positive, 1))
+    update_ops['fp'] = state_ops.assign_add(false_p,
+                                            math_ops.reduce_sum(
+                                                is_false_positive, 1))
     values['fp'] = false_p
 
   return values, update_ops
 
 
-def auc(labels, predictions, weights=None, num_thresholds=200,
-        metrics_collections=None, updates_collections=None,
-        curve='ROC', name=None, summation_method='trapezoidal'):
+@tf_export('metrics.auc')
+def auc(labels,
+        predictions,
+        weights=None,
+        num_thresholds=200,
+        metrics_collections=None,
+        updates_collections=None,
+        curve='ROC',
+        name=None,
+        summation_method='trapezoidal'):
   """Computes the approximate AUC via a Riemann sum.
 
   The `auc` function creates four local variables, `true_positives`,
@@ -622,14 +648,14 @@ def auc(labels, predictions, weights=None, num_thresholds=200,
     raise RuntimeError('tf.metrics.auc is not supported when eager execution '
                        'is enabled.')
 
-  with variable_scope.variable_scope(
-      name, 'auc', (labels, predictions, weights)):
+  with variable_scope.variable_scope(name, 'auc',
+                                     (labels, predictions, weights)):
     if curve != 'ROC' and curve != 'PR':
-      raise ValueError('curve must be either ROC or PR, %s unknown' %
-                       (curve))
+      raise ValueError('curve must be either ROC or PR, %s unknown' % (curve))
     kepsilon = 1e-7  # to account for floating point imprecisions
-    thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
-                  for i in range(num_thresholds-2)]
+    thresholds = [
+        (i + 1) * 1.0 / (num_thresholds - 1) for i in range(num_thresholds - 2)
+    ]
     thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
 
     values, update_ops = _confusion_matrix_at_thresholds(
@@ -637,6 +663,7 @@ def auc(labels, predictions, weights=None, num_thresholds=200,
 
     # Add epsilons to avoid dividing by 0.
     epsilon = 1.0e-6
+
     def compute_auc(tp, fn, tn, fp, name):
       """Computes the roc-auc or pr-auc based on confusion counts."""
       rec = math_ops.div(tp + epsilon, tp + fn + epsilon)
@@ -667,11 +694,10 @@ def auc(labels, predictions, weights=None, num_thresholds=200,
         raise ValueError('Invalid summation_method: %s' % summation_method)
 
     # sum up the areas of all the trapeziums
-    auc_value = compute_auc(
-        values['tp'], values['fn'], values['tn'], values['fp'], 'value')
-    update_op = compute_auc(
-        update_ops['tp'], update_ops['fn'], update_ops['tn'], update_ops['fp'],
-        'update_op')
+    auc_value = compute_auc(values['tp'], values['fn'], values['tn'],
+                            values['fp'], 'value')
+    update_op = compute_auc(update_ops['tp'], update_ops['fn'],
+                            update_ops['tn'], update_ops['fp'], 'update_op')
 
     if metrics_collections:
       ops.add_to_collections(metrics_collections, auc_value)
@@ -682,7 +708,10 @@ def auc(labels, predictions, weights=None, num_thresholds=200,
     return auc_value, update_op
 
 
-def mean_absolute_error(labels, predictions, weights=None,
+@tf_export('metrics.mean_absolute_error')
+def mean_absolute_error(labels,
+                        predictions,
+                        weights=None,
                         metrics_collections=None,
                         updates_collections=None,
                         name=None):
@@ -740,7 +769,11 @@ def mean_absolute_error(labels, predictions, weights=None,
               updates_collections, name or 'mean_absolute_error')
 
 
-def mean_cosine_distance(labels, predictions, dim, weights=None,
+@tf_export('metrics.mean_cosine_distance')
+def mean_cosine_distance(labels,
+                         predictions,
+                         dim,
+                         weights=None,
                          metrics_collections=None,
                          updates_collections=None,
                          name=None):
@@ -796,10 +829,8 @@ def mean_cosine_distance(labels, predictions, dim, weights=None,
       radial_diffs, reduction_indices=[
           dim,
       ], keepdims=True)
-  mean_distance, update_op = mean(radial_diffs, weights,
-                                  None,
-                                  None,
-                                  name or 'mean_cosine_distance')
+  mean_distance, update_op = mean(radial_diffs, weights, None, None, name or
+                                  'mean_cosine_distance')
   mean_distance = math_ops.subtract(1.0, mean_distance)
   update_op = math_ops.subtract(1.0, update_op)
 
@@ -812,6 +843,7 @@ def mean_cosine_distance(labels, predictions, dim, weights=None,
   return mean_distance, update_op
 
 
+@tf_export('metrics.mean_per_class_accuracy')
 def mean_per_class_accuracy(labels,
                             predictions,
                             num_classes,
@@ -824,8 +856,8 @@ def mean_per_class_accuracy(labels,
   Calculates the accuracy for each class, then takes the mean of that.
 
   For estimation of the metric over a stream of data, the function creates an
-  `update_op` operation that updates these variables and returns the
-  `mean_accuracy`.
+  `update_op` operation that updates the accuracy of each class and returns
+  them.
 
   If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
 
@@ -836,8 +868,8 @@ def mean_per_class_accuracy(labels,
       shape is [batch size] and type `int32` or `int64`. The tensor will be
       flattened if its rank > 1.
     num_classes: The possible number of labels the prediction task can
-      have. This value must be provided, since a confusion matrix of
-      dimension = [num_classes, num_classes] will be allocated.
+      have. This value must be provided, since two variables with shape =
+      [num_classes] will be allocated.
     weights: Optional `Tensor` whose rank is either 0, or the same rank as
       `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
       be either `1`, or the same as the corresponding `labels` dimension).
@@ -850,7 +882,7 @@ def mean_per_class_accuracy(labels,
 
   Returns:
     mean_accuracy: A `Tensor` representing the mean per class accuracy.
-    update_op: An operation that increments the confusion matrix.
+    update_op: An operation that updates the accuracy tensor.
 
   Raises:
     ValueError: If `predictions` and `labels` have mismatched shapes, or if
@@ -865,27 +897,43 @@ def mean_per_class_accuracy(labels,
 
   with variable_scope.variable_scope(name, 'mean_accuracy',
                                      (predictions, labels, weights)):
+    labels = math_ops.to_int64(labels)
+
+    # Flatten the input if its rank > 1.
+    if labels.get_shape().ndims > 1:
+      labels = array_ops.reshape(labels, [-1])
+
+    if predictions.get_shape().ndims > 1:
+      predictions = array_ops.reshape(predictions, [-1])
+
     # Check if shape is compatible.
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
 
-    total_cm, update_op = _streaming_confusion_matrix(
-        labels, predictions, num_classes, weights=weights)
+    total = metric_variable([num_classes], dtypes.float32, name='total')
+    count = metric_variable([num_classes], dtypes.float32, name='count')
 
-    def compute_mean_accuracy(name):
-      """Compute the mean per class accuracy via the confusion matrix."""
-      per_row_sum = math_ops.to_float(math_ops.reduce_sum(total_cm, 1))
-      cm_diag = math_ops.to_float(array_ops.diag_part(total_cm))
-      denominator = per_row_sum
+    ones = array_ops.ones([array_ops.size(labels)], dtypes.float32)
 
-      # If the value of the denominator is 0, set it to 1 to avoid
-      # zero division.
-      denominator = array_ops.where(
-          math_ops.greater(denominator, 0), denominator,
-          array_ops.ones_like(denominator))
-      accuracies = math_ops.div(cm_diag, denominator)
-      return math_ops.reduce_mean(accuracies, name=name)
+    if labels.dtype != predictions.dtype:
+      predictions = math_ops.cast(predictions, labels.dtype)
+    is_correct = math_ops.to_float(math_ops.equal(predictions, labels))
+
+    if weights is not None:
+      if weights.get_shape().ndims > 1:
+        weights = array_ops.reshape(weights, [-1])
+      weights = math_ops.to_float(weights)
+
+      is_correct = is_correct * weights
+      ones = ones * weights
+
+    update_total_op = state_ops.scatter_add(total, labels, ones)
+    update_count_op = state_ops.scatter_add(count, labels, is_correct)
+
+    per_class_accuracy = _safe_div(count, total, None)
 
-    mean_accuracy_v = compute_mean_accuracy('mean_accuracy')
+    mean_accuracy_v = math_ops.reduce_mean(
+        per_class_accuracy, name='mean_accuracy')
+    update_op = _safe_div(update_count_op, update_total_op, name='update_op')
 
     if metrics_collections:
       ops.add_to_collections(metrics_collections, mean_accuracy_v)
@@ -896,6 +944,7 @@ def mean_per_class_accuracy(labels,
     return mean_accuracy_v, update_op
 
 
+@tf_export('metrics.mean_iou')
 def mean_iou(labels,
              predictions,
              num_classes,
@@ -951,13 +1000,14 @@ def mean_iou(labels,
     raise RuntimeError('tf.metrics.mean_iou is not supported when '
                        'eager execution is enabled.')
 
-  with variable_scope.variable_scope(
-      name, 'mean_iou', (predictions, labels, weights)):
+  with variable_scope.variable_scope(name, 'mean_iou',
+                                     (predictions, labels, weights)):
     # Check if shape is compatible.
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
 
     total_cm, update_op = _streaming_confusion_matrix(labels, predictions,
                                                       num_classes, weights)
+
     def compute_mean_iou(name):
       """Compute the mean intersection-over-union via the confusion matrix."""
       sum_over_row = math_ops.to_float(math_ops.reduce_sum(total_cm, 0))
@@ -968,22 +1018,21 @@ def mean_iou(labels,
       # The mean is only computed over classes that appear in the
       # label or prediction tensor. If the denominator is 0, we need to
       # ignore the class.
-      num_valid_entries = math_ops.reduce_sum(math_ops.cast(
-          math_ops.not_equal(denominator, 0), dtype=dtypes.float32))
+      num_valid_entries = math_ops.reduce_sum(
+          math_ops.cast(
+              math_ops.not_equal(denominator, 0), dtype=dtypes.float32))
 
       # If the value of the denominator is 0, set it to 1 to avoid
       # zero division.
       denominator = array_ops.where(
-          math_ops.greater(denominator, 0),
-          denominator,
+          math_ops.greater(denominator, 0), denominator,
           array_ops.ones_like(denominator))
       iou = math_ops.div(cm_diag, denominator)
 
       # If the number of valid entries is 0 (no classes) we return 0.
       result = array_ops.where(
           math_ops.greater(num_valid_entries, 0),
-          math_ops.reduce_sum(iou, name=name) / num_valid_entries,
-          0)
+          math_ops.reduce_sum(iou, name=name) / num_valid_entries, 0)
       return result
 
     mean_iou_v = compute_mean_iou('mean_iou')
@@ -997,7 +1046,11 @@ def mean_iou(labels,
     return mean_iou_v, update_op
 
 
-def mean_relative_error(labels, predictions, normalizer, weights=None,
+@tf_export('metrics.mean_relative_error')
+def mean_relative_error(labels,
+                        predictions,
+                        normalizer,
+                        weights=None,
                         metrics_collections=None,
                         updates_collections=None,
                         name=None):
@@ -1056,14 +1109,16 @@ def mean_relative_error(labels, predictions, normalizer, weights=None,
       predictions, normalizer)
   predictions.get_shape().assert_is_compatible_with(normalizer.get_shape())
   relative_errors = array_ops.where(
-      math_ops.equal(normalizer, 0.0),
-      array_ops.zeros_like(labels),
+      math_ops.equal(normalizer, 0.0), array_ops.zeros_like(labels),
       math_ops.div(math_ops.abs(labels - predictions), normalizer))
   return mean(relative_errors, weights, metrics_collections,
               updates_collections, name or 'mean_relative_error')
 
 
-def mean_squared_error(labels, predictions, weights=None,
+@tf_export('metrics.mean_squared_error')
+def mean_squared_error(labels,
+                       predictions,
+                       weights=None,
                        metrics_collections=None,
                        updates_collections=None,
                        name=None):
@@ -1117,12 +1172,16 @@ def mean_squared_error(labels, predictions, weights=None,
   predictions, labels, weights = _remove_squeezable_dimensions(
       predictions=predictions, labels=labels, weights=weights)
   squared_error = math_ops.square(labels - predictions)
-  return mean(squared_error, weights, metrics_collections,
-              updates_collections, name or 'mean_squared_error')
+  return mean(squared_error, weights, metrics_collections, updates_collections,
+              name or 'mean_squared_error')
 
 
-def mean_tensor(values, weights=None, metrics_collections=None,
-                updates_collections=None, name=None):
+@tf_export('metrics.mean_tensor')
+def mean_tensor(values,
+                weights=None,
+                metrics_collections=None,
+                updates_collections=None,
+                name=None):
   """Computes the element-wise (weighted) mean of the given tensors.
 
   In contrast to the `mean` function which returns a scalar with the
@@ -1189,9 +1248,8 @@ def mean_tensor(values, weights=None, metrics_collections=None,
       update_count_op = state_ops.assign_add(count, num_values)
 
     def compute_mean(total, count, name):
-      non_zero_count = math_ops.maximum(count,
-                                        array_ops.ones_like(count),
-                                        name=name)
+      non_zero_count = math_ops.maximum(
+          count, array_ops.ones_like(count), name=name)
       return math_ops.truediv(total, non_zero_count, name=name)
 
     mean_t = compute_mean(total, count, 'value')
@@ -1206,7 +1264,10 @@ def mean_tensor(values, weights=None, metrics_collections=None,
     return mean_t, update_op
 
 
-def percentage_below(values, threshold, weights=None,
+@tf_export('metrics.percentage_below')
+def percentage_below(values,
+                     threshold,
+                     weights=None,
                      metrics_collections=None,
                      updates_collections=None,
                      name=None):
@@ -1253,14 +1314,13 @@ def percentage_below(values, threshold, weights=None,
                        'eager execution is enabled.')
 
   is_below_threshold = math_ops.to_float(math_ops.less(values, threshold))
-  return mean(is_below_threshold,
-              weights,
-              metrics_collections,
-              updates_collections,
-              name or 'percentage_below_threshold')
+  return mean(is_below_threshold, weights, metrics_collections,
+              updates_collections, name or 'percentage_below_threshold')
 
 
-def _count_condition(values, weights=None, metrics_collections=None,
+def _count_condition(values,
+                     weights=None,
+                     metrics_collections=None,
                      updates_collections=None):
   """Sums the weights of cases where the given values are True.
 
@@ -1290,8 +1350,8 @@ def _count_condition(values, weights=None, metrics_collections=None,
 
   values = math_ops.to_float(values)
   if weights is not None:
-    with ops.control_dependencies((
-        check_ops.assert_rank_in(weights, (0, array_ops.rank(values))),)):
+    with ops.control_dependencies((check_ops.assert_rank_in(
+        weights, (0, array_ops.rank(values))),)):
       weights = math_ops.to_float(weights)
       values = math_ops.multiply(values, weights)
 
@@ -1307,7 +1367,10 @@ def _count_condition(values, weights=None, metrics_collections=None,
   return value_tensor, update_op
 
 
-def false_negatives(labels, predictions, weights=None,
+@tf_export('metrics.false_negatives')
+def false_negatives(labels,
+                    predictions,
+                    weights=None,
                     metrics_collections=None,
                     updates_collections=None,
                     name=None):
@@ -1343,20 +1406,24 @@ def false_negatives(labels, predictions, weights=None,
     raise RuntimeError('tf.metrics.false_negatives is not supported when '
                        'eager execution is enabled.')
 
-  with variable_scope.variable_scope(
-      name, 'false_negatives', (predictions, labels, weights)):
+  with variable_scope.variable_scope(name, 'false_negatives',
+                                     (predictions, labels, weights)):
 
     predictions, labels, weights = _remove_squeezable_dimensions(
         predictions=math_ops.cast(predictions, dtype=dtypes.bool),
         labels=math_ops.cast(labels, dtype=dtypes.bool),
         weights=weights)
-    is_false_negative = math_ops.logical_and(math_ops.equal(labels, True),
-                                             math_ops.equal(predictions, False))
+    is_false_negative = math_ops.logical_and(
+        math_ops.equal(labels, True), math_ops.equal(predictions, False))
     return _count_condition(is_false_negative, weights, metrics_collections,
                             updates_collections)
 
 
-def false_negatives_at_thresholds(labels, predictions, thresholds, weights=None,
+@tf_export('metrics.false_negatives_at_thresholds')
+def false_negatives_at_thresholds(labels,
+                                  predictions,
+                                  thresholds,
+                                  weights=None,
                                   metrics_collections=None,
                                   updates_collections=None,
                                   name=None):
@@ -1409,7 +1476,10 @@ def false_negatives_at_thresholds(labels, predictions, thresholds, weights=None,
     return values['fn'], update_ops['fn']
 
 
-def false_positives(labels, predictions, weights=None,
+@tf_export('metrics.false_positives')
+def false_positives(labels,
+                    predictions,
+                    weights=None,
                     metrics_collections=None,
                     updates_collections=None,
                     name=None):
@@ -1446,20 +1516,24 @@ def false_positives(labels, predictions, weights=None,
     raise RuntimeError('tf.metrics.false_positives is not supported when '
                        'eager execution is enabled.')
 
-  with variable_scope.variable_scope(
-      name, 'false_positives', (predictions, labels, weights)):
+  with variable_scope.variable_scope(name, 'false_positives',
+                                     (predictions, labels, weights)):
 
     predictions, labels, weights = _remove_squeezable_dimensions(
         predictions=math_ops.cast(predictions, dtype=dtypes.bool),
         labels=math_ops.cast(labels, dtype=dtypes.bool),
         weights=weights)
-    is_false_positive = math_ops.logical_and(math_ops.equal(labels, False),
-                                             math_ops.equal(predictions, True))
+    is_false_positive = math_ops.logical_and(
+        math_ops.equal(labels, False), math_ops.equal(predictions, True))
     return _count_condition(is_false_positive, weights, metrics_collections,
                             updates_collections)
 
 
-def false_positives_at_thresholds(labels, predictions, thresholds, weights=None,
+@tf_export('metrics.false_positives_at_thresholds')
+def false_positives_at_thresholds(labels,
+                                  predictions,
+                                  thresholds,
+                                  weights=None,
                                   metrics_collections=None,
                                   updates_collections=None,
                                   name=None):
@@ -1512,7 +1586,10 @@ def false_positives_at_thresholds(labels, predictions, thresholds, weights=None,
     return values['fp'], update_ops['fp']
 
 
-def true_negatives(labels, predictions, weights=None,
+@tf_export('metrics.true_negatives')
+def true_negatives(labels,
+                   predictions,
+                   weights=None,
                    metrics_collections=None,
                    updates_collections=None,
                    name=None):
@@ -1549,20 +1626,24 @@ def true_negatives(labels, predictions, weights=None,
     raise RuntimeError('tf.metrics.true_negatives is not '
                        'supported when eager execution is enabled.')
 
-  with variable_scope.variable_scope(
-      name, 'true_negatives', (predictions, labels, weights)):
+  with variable_scope.variable_scope(name, 'true_negatives',
+                                     (predictions, labels, weights)):
 
     predictions, labels, weights = _remove_squeezable_dimensions(
         predictions=math_ops.cast(predictions, dtype=dtypes.bool),
         labels=math_ops.cast(labels, dtype=dtypes.bool),
         weights=weights)
-    is_true_negative = math_ops.logical_and(math_ops.equal(labels, False),
-                                            math_ops.equal(predictions, False))
+    is_true_negative = math_ops.logical_and(
+        math_ops.equal(labels, False), math_ops.equal(predictions, False))
     return _count_condition(is_true_negative, weights, metrics_collections,
                             updates_collections)
 
 
-def true_negatives_at_thresholds(labels, predictions, thresholds, weights=None,
+@tf_export('metrics.true_negatives_at_thresholds')
+def true_negatives_at_thresholds(labels,
+                                 predictions,
+                                 thresholds,
+                                 weights=None,
                                  metrics_collections=None,
                                  updates_collections=None,
                                  name=None):
@@ -1615,7 +1696,10 @@ def true_negatives_at_thresholds(labels, predictions, thresholds, weights=None,
     return values['tn'], update_ops['tn']
 
 
-def true_positives(labels, predictions, weights=None,
+@tf_export('metrics.true_positives')
+def true_positives(labels,
+                   predictions,
+                   weights=None,
                    metrics_collections=None,
                    updates_collections=None,
                    name=None):
@@ -1652,20 +1736,24 @@ def true_positives(labels, predictions, weights=None,
     raise RuntimeError('tf.metrics.true_positives is not '
                        'supported when eager execution is enabled.')
 
-  with variable_scope.variable_scope(
-      name, 'true_positives', (predictions, labels, weights)):
+  with variable_scope.variable_scope(name, 'true_positives',
+                                     (predictions, labels, weights)):
 
     predictions, labels, weights = _remove_squeezable_dimensions(
         predictions=math_ops.cast(predictions, dtype=dtypes.bool),
         labels=math_ops.cast(labels, dtype=dtypes.bool),
         weights=weights)
-    is_true_positive = math_ops.logical_and(math_ops.equal(labels, True),
-                                            math_ops.equal(predictions, True))
+    is_true_positive = math_ops.logical_and(
+        math_ops.equal(labels, True), math_ops.equal(predictions, True))
     return _count_condition(is_true_positive, weights, metrics_collections,
                             updates_collections)
 
 
-def true_positives_at_thresholds(labels, predictions, thresholds, weights=None,
+@tf_export('metrics.true_positives_at_thresholds')
+def true_positives_at_thresholds(labels,
+                                 predictions,
+                                 thresholds,
+                                 weights=None,
                                  metrics_collections=None,
                                  updates_collections=None,
                                  name=None):
@@ -1718,8 +1806,12 @@ def true_positives_at_thresholds(labels, predictions, thresholds, weights=None,
     return values['tp'], update_ops['tp']
 
 
-def precision(labels, predictions, weights=None,
-              metrics_collections=None, updates_collections=None,
+@tf_export('metrics.precision')
+def precision(labels,
+              predictions,
+              weights=None,
+              metrics_collections=None,
+              updates_collections=None,
               name=None):
   """Computes the precision of the predictions with respect to the labels.
 
@@ -1768,8 +1860,8 @@ def precision(labels, predictions, weights=None,
     raise RuntimeError('tf.metrics.precision is not '
                        'supported when eager execution is enabled.')
 
-  with variable_scope.variable_scope(
-      name, 'precision', (predictions, labels, weights)):
+  with variable_scope.variable_scope(name, 'precision',
+                                     (predictions, labels, weights)):
 
     predictions, labels, weights = _remove_squeezable_dimensions(
         predictions=math_ops.cast(predictions, dtype=dtypes.bool),
@@ -1777,22 +1869,27 @@ def precision(labels, predictions, weights=None,
         weights=weights)
 
     true_p, true_positives_update_op = true_positives(
-        labels, predictions, weights, metrics_collections=None,
-        updates_collections=None, name=None)
+        labels,
+        predictions,
+        weights,
+        metrics_collections=None,
+        updates_collections=None,
+        name=None)
     false_p, false_positives_update_op = false_positives(
-        labels, predictions, weights, metrics_collections=None,
-        updates_collections=None, name=None)
+        labels,
+        predictions,
+        weights,
+        metrics_collections=None,
+        updates_collections=None,
+        name=None)
 
     def compute_precision(tp, fp, name):
       return array_ops.where(
-          math_ops.greater(tp + fp, 0),
-          math_ops.div(tp, tp + fp),
-          0,
-          name)
+          math_ops.greater(tp + fp, 0), math_ops.div(tp, tp + fp), 0, name)
 
     p = compute_precision(true_p, false_p, 'value')
-    update_op = compute_precision(
-        true_positives_update_op, false_positives_update_op, 'update_op')
+    update_op = compute_precision(true_positives_update_op,
+                                  false_positives_update_op, 'update_op')
 
     if metrics_collections:
       ops.add_to_collections(metrics_collections, p)
@@ -1803,10 +1900,14 @@ def precision(labels, predictions, weights=None,
     return p, update_op
 
 
-def precision_at_thresholds(labels, predictions, thresholds,
+@tf_export('metrics.precision_at_thresholds')
+def precision_at_thresholds(labels,
+                            predictions,
+                            thresholds,
                             weights=None,
                             metrics_collections=None,
-                            updates_collections=None, name=None):
+                            updates_collections=None,
+                            name=None):
   """Computes precision values for different `thresholds` on `predictions`.
 
   The `precision_at_thresholds` function creates four local variables,
@@ -1862,12 +1963,13 @@ def precision_at_thresholds(labels, predictions, thresholds,
 
     # Avoid division by zero.
     epsilon = 1e-7
+
     def compute_precision(tp, fp, name):
       return math_ops.div(tp, epsilon + tp + fp, name='precision_' + name)
 
     prec = compute_precision(values['tp'], values['fp'], 'value')
-    update_op = compute_precision(
-        update_ops['tp'], update_ops['fp'], 'update_op')
+    update_op = compute_precision(update_ops['tp'], update_ops['fp'],
+                                  'update_op')
 
     if metrics_collections:
       ops.add_to_collections(metrics_collections, prec)
@@ -1878,8 +1980,12 @@ def precision_at_thresholds(labels, predictions, thresholds,
     return prec, update_op
 
 
-def recall(labels, predictions, weights=None,
-           metrics_collections=None, updates_collections=None,
+@tf_export('metrics.recall')
+def recall(labels,
+           predictions,
+           weights=None,
+           metrics_collections=None,
+           updates_collections=None,
            name=None):
   """Computes the recall of the predictions with respect to the labels.
 
@@ -1926,30 +2032,36 @@ def recall(labels, predictions, weights=None,
     raise RuntimeError('tf.metrics.recall is not supported is not '
                        'supported when eager execution is enabled.')
 
-  with variable_scope.variable_scope(
-      name, 'recall', (predictions, labels, weights)):
+  with variable_scope.variable_scope(name, 'recall',
+                                     (predictions, labels, weights)):
     predictions, labels, weights = _remove_squeezable_dimensions(
         predictions=math_ops.cast(predictions, dtype=dtypes.bool),
         labels=math_ops.cast(labels, dtype=dtypes.bool),
         weights=weights)
 
     true_p, true_positives_update_op = true_positives(
-        labels, predictions, weights, metrics_collections=None,
-        updates_collections=None, name=None)
+        labels,
+        predictions,
+        weights,
+        metrics_collections=None,
+        updates_collections=None,
+        name=None)
     false_n, false_negatives_update_op = false_negatives(
-        labels, predictions, weights, metrics_collections=None,
-        updates_collections=None, name=None)
+        labels,
+        predictions,
+        weights,
+        metrics_collections=None,
+        updates_collections=None,
+        name=None)
 
     def compute_recall(true_p, false_n, name):
       return array_ops.where(
           math_ops.greater(true_p + false_n, 0),
-          math_ops.div(true_p, true_p + false_n),
-          0,
-          name)
+          math_ops.div(true_p, true_p + false_n), 0, name)
 
     rec = compute_recall(true_p, false_n, 'value')
-    update_op = compute_recall(
-        true_positives_update_op, false_negatives_update_op, 'update_op')
+    update_op = compute_recall(true_positives_update_op,
+                               false_negatives_update_op, 'update_op')
 
     if metrics_collections:
       ops.add_to_collections(metrics_collections, rec)
@@ -1983,8 +2095,8 @@ def _select_class_id(ids, selected_id):
   """
   ids = sparse_tensor.convert_to_tensor_or_sparse_tensor(ids)
   if isinstance(ids, sparse_tensor.SparseTensor):
-    return sparse_ops.sparse_retain(
-        ids, math_ops.equal(ids.values, selected_id))
+    return sparse_ops.sparse_retain(ids, math_ops.equal(ids.values,
+                                                        selected_id))
 
   # TODO(ptucker): Make this more efficient, maybe add a sparse version of
   # tf.equal and tf.reduce_any?
@@ -1992,12 +2104,13 @@ def _select_class_id(ids, selected_id):
   # Shape of filled IDs is the same as `ids` with the last dim collapsed to 1.
   ids_shape = array_ops.shape(ids, out_type=dtypes.int64)
   ids_last_dim = array_ops.size(ids_shape) - 1
-  filled_selected_id_shape = math_ops.reduced_shape(
-      ids_shape, array_ops.reshape(ids_last_dim, [1]))
+  filled_selected_id_shape = math_ops.reduced_shape(ids_shape,
+                                                    array_ops.reshape(
+                                                        ids_last_dim, [1]))
 
   # Intersect `ids` with the selected ID.
-  filled_selected_id = array_ops.fill(
-      filled_selected_id_shape, math_ops.to_int64(selected_id))
+  filled_selected_id = array_ops.fill(filled_selected_id_shape,
+                                      math_ops.to_int64(selected_id))
   result = sets.set_intersection(filled_selected_id, ids)
   return sparse_tensor.SparseTensor(
       indices=result.indices, values=result.values, dense_shape=ids_shape)
@@ -2057,15 +2170,15 @@ def _sparse_true_positive_at_k(labels,
   Returns:
     A [D1, ... DN] `Tensor` of true positive counts.
   """
-  with ops.name_scope(
-      name, 'true_positives', (predictions_idx, labels, weights)):
-    labels, predictions_idx = _maybe_select_class_id(
-        labels, predictions_idx, class_id)
+  with ops.name_scope(name, 'true_positives',
+                      (predictions_idx, labels, weights)):
+    labels, predictions_idx = _maybe_select_class_id(labels, predictions_idx,
+                                                     class_id)
     tp = sets.set_size(sets.set_intersection(predictions_idx, labels))
     tp = math_ops.to_double(tp)
     if weights is not None:
-      with ops.control_dependencies((
-          weights_broadcast_ops.assert_broadcastable(weights, tp),)):
+      with ops.control_dependencies((weights_broadcast_ops.assert_broadcastable(
+          weights, tp),)):
         weights = math_ops.to_double(weights)
         tp = math_ops.multiply(tp, weights)
     return tp
@@ -2109,11 +2222,12 @@ def _streaming_sparse_true_positive_at_k(labels,
   Raises:
     ValueError: If `weights` is not `None` and has an incompatible shape.
   """
-  with ops.name_scope(
-      name, _at_k_name('true_positive', k, class_id=class_id),
-      (predictions_idx, labels, weights)) as scope:
+  with ops.name_scope(name, _at_k_name('true_positive', k, class_id=class_id),
+                      (predictions_idx, labels, weights)) as scope:
     tp = _sparse_true_positive_at_k(
-        predictions_idx=predictions_idx, labels=labels, class_id=class_id,
+        predictions_idx=predictions_idx,
+        labels=labels,
+        class_id=class_id,
         weights=weights)
     batch_total_tp = math_ops.to_double(math_ops.reduce_sum(tp))
 
@@ -2150,18 +2264,16 @@ def _sparse_false_negative_at_k(labels,
   Returns:
     A [D1, ... DN] `Tensor` of false negative counts.
   """
-  with ops.name_scope(
-      None, 'false_negatives', (predictions_idx, labels, weights)):
-    labels, predictions_idx = _maybe_select_class_id(labels,
-                                                     predictions_idx,
+  with ops.name_scope(None, 'false_negatives',
+                      (predictions_idx, labels, weights)):
+    labels, predictions_idx = _maybe_select_class_id(labels, predictions_idx,
                                                      class_id)
-    fn = sets.set_size(sets.set_difference(predictions_idx,
-                                           labels,
-                                           aminusb=False))
+    fn = sets.set_size(
+        sets.set_difference(predictions_idx, labels, aminusb=False))
     fn = math_ops.to_double(fn)
     if weights is not None:
-      with ops.control_dependencies((
-          weights_broadcast_ops.assert_broadcastable(weights, fn),)):
+      with ops.control_dependencies((weights_broadcast_ops.assert_broadcastable(
+          weights, fn),)):
         weights = math_ops.to_double(weights)
         fn = math_ops.multiply(fn, weights)
     return fn
@@ -2205,11 +2317,12 @@ def _streaming_sparse_false_negative_at_k(labels,
   Raises:
     ValueError: If `weights` is not `None` and has an incompatible shape.
   """
-  with ops.name_scope(
-      name, _at_k_name('false_negative', k, class_id=class_id),
-      (predictions_idx, labels, weights)) as scope:
+  with ops.name_scope(name, _at_k_name('false_negative', k, class_id=class_id),
+                      (predictions_idx, labels, weights)) as scope:
     fn = _sparse_false_negative_at_k(
-        predictions_idx=predictions_idx, labels=labels, class_id=class_id,
+        predictions_idx=predictions_idx,
+        labels=labels,
+        class_id=class_id,
         weights=weights)
     batch_total_fn = math_ops.to_double(math_ops.reduce_sum(fn))
 
@@ -2217,6 +2330,7 @@ def _streaming_sparse_false_negative_at_k(labels,
     return var, state_ops.assign_add(var, batch_total_fn, name='update')
 
 
+@tf_export('metrics.recall_at_k')
 def recall_at_k(labels,
                 predictions,
                 k,
@@ -2295,9 +2409,8 @@ def recall_at_k(labels,
     raise RuntimeError('tf.metrics.recall_at_k is not '
                        'supported when eager execution is enabled.')
 
-  with ops.name_scope(
-      name, _at_k_name('recall', k, class_id=class_id),
-      (predictions, labels, weights)) as scope:
+  with ops.name_scope(name, _at_k_name('recall', k, class_id=class_id),
+                      (predictions, labels, weights)) as scope:
     _, top_k_idx = nn.top_k(predictions, k)
     return recall_at_top_k(
         labels=labels,
@@ -2310,6 +2423,7 @@ def recall_at_k(labels,
         name=scope)
 
 
+@tf_export('metrics.recall_at_top_k')
 def recall_at_top_k(labels,
                     predictions_idx,
                     k=None,
@@ -2363,16 +2477,21 @@ def recall_at_top_k(labels,
     `predictions`, or if either `metrics_collections` or `updates_collections`
     are not a list or tuple.
   """
-  with ops.name_scope(name,
-                      _at_k_name('recall', k, class_id=class_id),
+  with ops.name_scope(name, _at_k_name('recall', k, class_id=class_id),
                       (predictions_idx, labels, weights)) as scope:
     labels = _maybe_expand_labels(labels, predictions_idx)
     top_k_idx = math_ops.to_int64(predictions_idx)
     tp, tp_update = _streaming_sparse_true_positive_at_k(
-        predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id,
+        predictions_idx=top_k_idx,
+        labels=labels,
+        k=k,
+        class_id=class_id,
         weights=weights)
     fn, fn_update = _streaming_sparse_false_negative_at_k(
-        predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id,
+        predictions_idx=top_k_idx,
+        labels=labels,
+        k=k,
+        class_id=class_id,
         weights=weights)
 
     metric = math_ops.div(tp, math_ops.add(tp, fn), name=scope)
@@ -2385,9 +2504,14 @@ def recall_at_top_k(labels,
     return metric, update
 
 
-def recall_at_thresholds(labels, predictions, thresholds,
-                         weights=None, metrics_collections=None,
-                         updates_collections=None, name=None):
+@tf_export('metrics.recall_at_thresholds')
+def recall_at_thresholds(labels,
+                         predictions,
+                         thresholds,
+                         weights=None,
+                         metrics_collections=None,
+                         updates_collections=None,
+                         name=None):
   """Computes various recall values for different `thresholds` on `predictions`.
 
   The `recall_at_thresholds` function creates four local variables,
@@ -2441,6 +2565,7 @@ def recall_at_thresholds(labels, predictions, thresholds,
 
     # Avoid division by zero.
     epsilon = 1e-7
+
     def compute_recall(tp, fn, name):
       return math_ops.div(tp, epsilon + tp + fn, name='recall_' + name)
 
@@ -2456,7 +2581,10 @@ def recall_at_thresholds(labels, predictions, thresholds,
     return rec, update_op
 
 
-def root_mean_squared_error(labels, predictions, weights=None,
+@tf_export('metrics.root_mean_squared_error')
+def root_mean_squared_error(labels,
+                            predictions,
+                            weights=None,
                             metrics_collections=None,
                             updates_collections=None,
                             name=None):
@@ -2509,9 +2637,9 @@ def root_mean_squared_error(labels, predictions, weights=None,
 
   predictions, labels, weights = _remove_squeezable_dimensions(
       predictions=predictions, labels=labels, weights=weights)
-  mse, update_mse_op = mean_squared_error(
-      labels, predictions, weights, None, None,
-      name or 'root_mean_squared_error')
+  mse, update_mse_op = mean_squared_error(labels, predictions, weights, None,
+                                          None, name or
+                                          'root_mean_squared_error')
 
   rmse = math_ops.sqrt(mse)
   update_rmse_op = math_ops.sqrt(update_mse_op)
@@ -2525,9 +2653,15 @@ def root_mean_squared_error(labels, predictions, weights=None,
   return rmse, update_rmse_op
 
 
-def sensitivity_at_specificity(
-    labels, predictions, specificity, weights=None, num_thresholds=200,
-    metrics_collections=None, updates_collections=None, name=None):
+@tf_export('metrics.sensitivity_at_specificity')
+def sensitivity_at_specificity(labels,
+                               predictions,
+                               specificity,
+                               weights=None,
+                               num_thresholds=200,
+                               metrics_collections=None,
+                               updates_collections=None,
+                               name=None):
   """Computes the specificity at a given sensitivity.
 
   The `sensitivity_at_specificity` function creates four local
@@ -2588,8 +2722,9 @@ def sensitivity_at_specificity(
   with variable_scope.variable_scope(name, 'sensitivity_at_specificity',
                                      (predictions, labels, weights)):
     kepsilon = 1e-7  # to account for floating point imprecisions
-    thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
-                  for i in range(num_thresholds-2)]
+    thresholds = [
+        (i + 1) * 1.0 / (num_thresholds - 1) for i in range(num_thresholds - 2)
+    ]
     thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
 
     values, update_ops = _confusion_matrix_at_thresholds(
@@ -2601,8 +2736,7 @@ def sensitivity_at_specificity(
       tf_index = math_ops.cast(tf_index, dtypes.int32)
 
       # Now, we have the implicit threshold, so compute the sensitivity:
-      return math_ops.div(tp[tf_index],
-                          tp[tf_index] + fn[tf_index] + kepsilon,
+      return math_ops.div(tp[tf_index], tp[tf_index] + fn[tf_index] + kepsilon,
                           name)
 
     sensitivity = compute_sensitivity_at_specificity(
@@ -2641,8 +2775,8 @@ def _expand_and_tile(tensor, multiple, dim=0, name=None):
   """
   if multiple < 1:
     raise ValueError('Invalid multiple %s, must be > 0.' % multiple)
-  with ops.name_scope(
-      name, 'expand_and_tile', (tensor, multiple, dim)) as scope:
+  with ops.name_scope(name, 'expand_and_tile',
+                      (tensor, multiple, dim)) as scope:
     # Sparse.
     tensor = sparse_tensor.convert_to_tensor_or_sparse_tensor(tensor)
     if isinstance(tensor, sparse_tensor.SparseTensor):
@@ -2742,8 +2876,8 @@ def _sparse_average_precision_at_top_k(labels, predictions_idx):
   Raises:
     ValueError: if the last dimension of predictions_idx is not set.
   """
-  with ops.name_scope(
-      None, 'average_precision', (predictions_idx, labels)) as scope:
+  with ops.name_scope(None, 'average_precision',
+                      (predictions_idx, labels)) as scope:
     predictions_idx = math_ops.to_int64(predictions_idx, name='predictions_idx')
     if predictions_idx.get_shape().ndims == 0:
       raise ValueError('The rank of predictions_idx must be at least 1.')
@@ -2780,10 +2914,12 @@ def _sparse_average_precision_at_top_k(labels, predictions_idx):
     retrieved_per_k = math_ops.cumsum(
         array_ops.ones_like(relevant_per_k), axis=-1, name='retrieved_per_k')
     precision_per_k = math_ops.div(
-        math_ops.to_double(tp_per_k), math_ops.to_double(retrieved_per_k),
+        math_ops.to_double(tp_per_k),
+        math_ops.to_double(retrieved_per_k),
         name='precision_per_k')
     relevant_precision_per_k = math_ops.multiply(
-        precision_per_k, math_ops.to_double(relevant_per_k),
+        precision_per_k,
+        math_ops.to_double(relevant_per_k),
         name='relevant_precision_per_k')
 
     # Reduce along k dimension to get the sum, yielding a [D1, ... DN] tensor.
@@ -2887,6 +3023,7 @@ def _streaming_sparse_average_precision_at_top_k(labels,
     return mean_average_precision, update
 
 
+@tf_export('metrics.sparse_average_precision_at_k')
 @deprecated(None, 'Use average_precision_at_k instead')
 def sparse_average_precision_at_k(labels,
                                   predictions,
@@ -2906,6 +3043,7 @@ def sparse_average_precision_at_k(labels,
       name=name)
 
 
+@tf_export('metrics.average_precision_at_k')
 def average_precision_at_k(labels,
                            predictions,
                            k,
@@ -2971,9 +3109,8 @@ def average_precision_at_k(labels,
 
   if k < 1:
     raise ValueError('Invalid k=%s.' % k)
-  with ops.name_scope(
-      name, _at_k_name('average_precision', k),
-      (predictions, labels, weights)) as scope:
+  with ops.name_scope(name, _at_k_name('average_precision', k),
+                      (predictions, labels, weights)) as scope:
     # Calculate top k indices to produce [D1, ... DN, k] tensor.
     _, predictions_idx = nn.top_k(predictions, k)
     return _streaming_sparse_average_precision_at_top_k(
@@ -3014,17 +3151,16 @@ def _sparse_false_positive_at_k(labels,
   Returns:
     A [D1, ... DN] `Tensor` of false positive counts.
   """
-  with ops.name_scope(
-      None, 'false_positives', (predictions_idx, labels, weights)):
-    labels, predictions_idx = _maybe_select_class_id(labels,
-                                                     predictions_idx,
+  with ops.name_scope(None, 'false_positives',
+                      (predictions_idx, labels, weights)):
+    labels, predictions_idx = _maybe_select_class_id(labels, predictions_idx,
                                                      class_id)
-    fp = sets.set_size(sets.set_difference(
-        predictions_idx, labels, aminusb=True))
+    fp = sets.set_size(
+        sets.set_difference(predictions_idx, labels, aminusb=True))
     fp = math_ops.to_double(fp)
     if weights is not None:
-      with ops.control_dependencies((
-          weights_broadcast_ops.assert_broadcastable(weights, fp),)):
+      with ops.control_dependencies((weights_broadcast_ops.assert_broadcastable(
+          weights, fp),)):
         weights = math_ops.to_double(weights)
         fp = math_ops.multiply(fp, weights)
     return fp
@@ -3068,11 +3204,12 @@ def _streaming_sparse_false_positive_at_k(labels,
   Raises:
     ValueError: If `weights` is not `None` and has an incompatible shape.
   """
-  with ops.name_scope(
-      name, _at_k_name('false_positive', k, class_id=class_id),
-      (predictions_idx, labels, weights)) as scope:
+  with ops.name_scope(name, _at_k_name('false_positive', k, class_id=class_id),
+                      (predictions_idx, labels, weights)) as scope:
     fp = _sparse_false_positive_at_k(
-        predictions_idx=predictions_idx, labels=labels, class_id=class_id,
+        predictions_idx=predictions_idx,
+        labels=labels,
+        class_id=class_id,
         weights=weights)
     batch_total_fp = math_ops.to_double(math_ops.reduce_sum(fp))
 
@@ -3080,6 +3217,7 @@ def _streaming_sparse_false_positive_at_k(labels,
     return var, state_ops.assign_add(var, batch_total_fp, name='update')
 
 
+@tf_export('metrics.precision_at_top_k')
 def precision_at_top_k(labels,
                        predictions_idx,
                        k=None,
@@ -3143,10 +3281,16 @@ def precision_at_top_k(labels,
     labels = _maybe_expand_labels(labels, predictions_idx)
     top_k_idx = math_ops.to_int64(predictions_idx)
     tp, tp_update = _streaming_sparse_true_positive_at_k(
-        predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id,
+        predictions_idx=top_k_idx,
+        labels=labels,
+        k=k,
+        class_id=class_id,
         weights=weights)
     fp, fp_update = _streaming_sparse_false_positive_at_k(
-        predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id,
+        predictions_idx=top_k_idx,
+        labels=labels,
+        k=k,
+        class_id=class_id,
         weights=weights)
 
     metric = math_ops.div(tp, math_ops.add(tp, fp), name=scope)
@@ -3159,6 +3303,7 @@ def precision_at_top_k(labels,
     return metric, update
 
 
+@tf_export('metrics.sparse_precision_at_k')
 @deprecated(None, 'Use precision_at_k instead')
 def sparse_precision_at_k(labels,
                           predictions,
@@ -3180,6 +3325,7 @@ def sparse_precision_at_k(labels,
       name=name)
 
 
+@tf_export('metrics.precision_at_k')
 def precision_at_k(labels,
                    predictions,
                    k,
@@ -3273,9 +3419,15 @@ def precision_at_k(labels,
         name=scope)
 
 
-def specificity_at_sensitivity(
-    labels, predictions, sensitivity, weights=None, num_thresholds=200,
-    metrics_collections=None, updates_collections=None, name=None):
+@tf_export('metrics.specificity_at_sensitivity')
+def specificity_at_sensitivity(labels,
+                               predictions,
+                               sensitivity,
+                               weights=None,
+                               num_thresholds=200,
+                               metrics_collections=None,
+                               updates_collections=None,
+                               name=None):
   """Computes the specificity at a given sensitivity.
 
   The `specificity_at_sensitivity` function creates four local
@@ -3336,8 +3488,9 @@ def specificity_at_sensitivity(
   with variable_scope.variable_scope(name, 'specificity_at_sensitivity',
                                      (predictions, labels, weights)):
     kepsilon = 1e-7  # to account for floating point imprecisions
-    thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
-                  for i in range(num_thresholds-2)]
+    thresholds = [
+        (i + 1) * 1.0 / (num_thresholds - 1) for i in range(num_thresholds - 2)
+    ]
     thresholds = [0.0 - kepsilon] + thresholds + [1.0 - kepsilon]
 
     values, update_ops = _confusion_matrix_at_thresholds(
@@ -3369,8 +3522,7 @@ def specificity_at_sensitivity(
       tf_index = math_ops.cast(tf_index, dtypes.int32)
 
       # Now, we have the implicit threshold, so compute the specificity:
-      return math_ops.div(tn[tf_index],
-                          tn[tf_index] + fp[tf_index] + kepsilon,
+      return math_ops.div(tn[tf_index], tn[tf_index] + fp[tf_index] + kepsilon,
                           name)
 
     specificity = compute_specificity_at_sensitivity(
diff --git a/tensorflow/python/ops/nn_batchnorm_test.py b/tensorflow/python/ops/nn_batchnorm_test.py
index 8aed2e293fa2dd6559d342f109a996d810db13bf..eebfb17085a568f48769f6df7dddd3ae2f799efc 100644
--- a/tensorflow/python/ops/nn_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_batchnorm_test.py
@@ -24,6 +24,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gradient_checker
@@ -34,6 +35,7 @@ import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
 
+@test_util.with_c_api
 class BatchNormalizationTest(test.TestCase):
 
   def _npBatchNorm(self, x, m, v, beta, gamma, epsilon,
@@ -52,9 +54,7 @@ class BatchNormalizationTest(test.TestCase):
   def _tfBatchNormV1(self, x, m, v, beta, gamma, epsilon,
                      scale_after_normalization):
     """Original implementation."""
-    # _batch_norm_with_global_normalization is deprecated in v9
-    ops.get_default_graph().graph_def_versions.producer = 8
-    # pylint: disable=protected-access
+    test_util.set_producer_version(ops.get_default_graph(), 8)
     return gen_nn_ops._batch_norm_with_global_normalization(
         x, m, v, beta, gamma, epsilon, scale_after_normalization)
     # pylint: enable=protected-access
@@ -222,7 +222,7 @@ class BatchNormalizationTest(test.TestCase):
         epsilon = 0.001
         for scale_after_normalization in [True, False]:
           # _batch_norm_with_global_normalization_grad is deprecated in v9
-          ops.get_default_graph().graph_def_versions.producer = 8
+          test_util.set_producer_version(ops.get_default_graph(), 8)
           grad = gen_nn_ops._batch_norm_with_global_normalization_grad(
               x, m, v, gamma, backprop, epsilon, scale_after_normalization)
           dx, dm, dv, db, dg = grad
@@ -334,6 +334,7 @@ class BatchNormalizationTest(test.TestCase):
         (2, 3, 2, 4, 5), (1, 1, 1, 4, 5), atol=0.005)
 
 
+@test_util.with_c_api
 class SufficientStatisticsTest(test.TestCase):
 
   def _npSuffStats(self, x, axes, shift, keep_dims):
@@ -393,6 +394,7 @@ class SufficientStatisticsTest(test.TestCase):
           self._testSuffStats([1, 2, 3], [0, 2], shift, keep_dims, has_shape)
 
 
+@test_util.with_c_api
 class NormalizeMomentsTest(test.TestCase):
 
   def _npNormalizeMoments(self, counts, mean_ss, variance_ss, shift):
@@ -436,6 +438,7 @@ class NormalizeMomentsTest(test.TestCase):
       self._testNormalizeMoments([2, 3], shift)
 
 
+@test_util.with_c_api
 class MomentsTest(test.TestCase):
 
   def _unweighted_moments(self, x, axes, keep_dims=False, extra_out_grads=None):
@@ -573,6 +576,7 @@ class MomentsTest(test.TestCase):
     self._testGlobalGradient(from_y="var")
 
 
+@test_util.with_c_api
 class WeightedMomentsTest(MomentsTest):
   """Tests for nn.weighted_moments.
 
diff --git a/tensorflow/python/ops/nn_fused_batchnorm_test.py b/tensorflow/python/ops/nn_fused_batchnorm_test.py
index e72d34d1f728344709cd7429ab560379a2836cab..a08b836025d12178ab7acfbd70fcc7a47bc99532 100644
--- a/tensorflow/python/ops/nn_fused_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_fused_batchnorm_test.py
@@ -171,6 +171,10 @@ class BatchNormalizationTest(test.TestCase):
         x, x_shape, y, y_shape, delta=1e-3, x_init_value=x_init_val)
     _, numerical_grad = gradient_checker.compute_gradient(
         x32, x_shape, y32, y_shape, delta=1e-3, x_init_value=x32_init_val)
+
+    # If grad is empty, no error.
+    if theoretical_grad.size == 0 and numerical_grad.size == 0:
+      return 0
     return np.fabs(theoretical_grad - numerical_grad).max()
 
   def _test_gradient(self,
@@ -274,7 +278,8 @@ class BatchNormalizationTest(test.TestCase):
         epsilon = y.op.get_attr('epsilon')
         data_format = y.op.get_attr('data_format')
         grad_vals = sess.run([grad_x, grad_scale, grad_offset])
-        grad_internal = nn_grad._BatchNormGrad(grad_y, x, scale, pop_mean, pop_var, epsilon, data_format)
+        grad_internal = nn_grad._BatchNormGrad(grad_y, x, scale, pop_mean,
+                                               pop_var, epsilon, data_format)
         grad_internal_vals = sess.run(list(grad_internal))
         for grad_val, grad_internal_val in zip(grad_vals, grad_internal_vals):
           self.assertAllClose(grad_val, grad_internal_val, atol=err_tolerance)
@@ -333,7 +338,7 @@ class BatchNormalizationTest(test.TestCase):
     self.assertLess(err_grad_x_2, err_tolerance)
     self.assertLess(err_grad_scale, err_tolerance)
 
-  def testInference(self):
+  def testInferenceShape1(self):
     x_shape = [1, 1, 6, 1]
     for dtype in [np.float16, np.float32]:
       if test.is_gpu_available(cuda_only=True):
@@ -344,6 +349,7 @@ class BatchNormalizationTest(test.TestCase):
       self._test_inference(
           x_shape, dtype, [1], np.float32, use_gpu=False, data_format='NHWC')
 
+  def testInferenceShape2(self):
     x_shape = [1, 1, 6, 2]
     if test.is_gpu_available(cuda_only=True):
       for dtype in [np.float16, np.float32]:
@@ -352,12 +358,14 @@ class BatchNormalizationTest(test.TestCase):
         self._test_inference(
             x_shape, dtype, [2], np.float32, use_gpu=False, data_format='NHWC')
 
+  def testInferenceShape3(self):
     x_shape = [1, 2, 1, 6]
     if test.is_gpu_available(cuda_only=True):
       for dtype in [np.float16, np.float32]:
         self._test_inference(
             x_shape, dtype, [2], np.float32, use_gpu=True, data_format='NCHW')
 
+  def testInferenceShape4(self):
     x_shape = [27, 131, 127, 6]
     for dtype in [np.float16, np.float32]:
       if test.is_gpu_available(cuda_only=True):
@@ -368,7 +376,18 @@ class BatchNormalizationTest(test.TestCase):
       self._test_inference(
           x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC')
 
-  def testTraining(self):
+  def testInferenceShape5(self):
+    x_shape = [0, 131, 127, 6]
+    for dtype in [np.float16, np.float32]:
+      if test.is_gpu_available(cuda_only=True):
+        self._test_inference(
+            x_shape, dtype, [131], np.float32, use_gpu=True, data_format='NCHW')
+        self._test_inference(
+            x_shape, dtype, [6], np.float32, use_gpu=True, data_format='NHWC')
+      self._test_inference(
+          x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC')
+
+  def testTrainingShape1(self):
     x_shape = [1, 1, 6, 1]
     for dtype in [np.float16, np.float32]:
       if test.is_gpu_available(cuda_only=True):
@@ -379,6 +398,7 @@ class BatchNormalizationTest(test.TestCase):
       self._test_training(
           x_shape, dtype, [1], np.float32, use_gpu=False, data_format='NHWC')
 
+  def testTrainingShape2(self):
     x_shape = [1, 1, 6, 2]
     for dtype in [np.float16, np.float32]:
       if test.is_gpu_available(cuda_only=True):
@@ -387,12 +407,14 @@ class BatchNormalizationTest(test.TestCase):
       self._test_training(
           x_shape, dtype, [2], np.float32, use_gpu=False, data_format='NHWC')
 
+  def testTrainingShape3(self):
     x_shape = [1, 2, 1, 6]
     if test.is_gpu_available(cuda_only=True):
       for dtype in [np.float16, np.float32]:
         self._test_training(
             x_shape, dtype, [2], np.float32, use_gpu=True, data_format='NCHW')
 
+  def testTrainingShape4(self):
     x_shape = [27, 131, 127, 6]
     for dtype in [np.float16, np.float32]:
       if test.is_gpu_available(cuda_only=True):
@@ -403,7 +425,18 @@ class BatchNormalizationTest(test.TestCase):
       self._test_training(
           x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC')
 
-  def testBatchNormGrad(self):
+  def testTrainingShape5(self):
+    x_shape = [0, 131, 127, 6]
+    for dtype in [np.float16, np.float32]:
+      if test.is_gpu_available(cuda_only=True):
+        self._test_training(
+            x_shape, dtype, [131], np.float32, use_gpu=True, data_format='NCHW')
+        self._test_training(
+            x_shape, dtype, [6], np.float32, use_gpu=True, data_format='NHWC')
+      self._test_training(
+          x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC')
+
+  def testBatchNormGradShape1(self):
     for is_training in [True, False]:
       x_shape = [1, 1, 6, 1]
       for dtype in [np.float16, np.float32]:
@@ -430,6 +463,8 @@ class BatchNormalizationTest(test.TestCase):
             data_format='NHWC',
             is_training=is_training)
 
+  def testBatchNormGradShape2(self):
+    for is_training in [True, False]:
       x_shape = [1, 1, 6, 2]
       for dtype in [np.float16, np.float32]:
         if test.is_gpu_available(cuda_only=True):
@@ -448,6 +483,8 @@ class BatchNormalizationTest(test.TestCase):
             data_format='NHWC',
             is_training=is_training)
 
+  def testBatchNormGradShape3(self):
+    for is_training in [True, False]:
       x_shape = [1, 2, 1, 6]
       if test.is_gpu_available(cuda_only=True):
         for dtype in [np.float16, np.float32]:
@@ -459,6 +496,8 @@ class BatchNormalizationTest(test.TestCase):
               data_format='NCHW',
               is_training=is_training)
 
+  def testBatchNormGradShape4(self):
+    for is_training in [True, False]:
       x_shape = [5, 7, 11, 4]
       for dtype in [np.float16, np.float32]:
         if test.is_gpu_available(cuda_only=True):
@@ -484,6 +523,33 @@ class BatchNormalizationTest(test.TestCase):
             data_format='NHWC',
             is_training=is_training)
 
+  def testBatchNormGradShape5(self):
+    for is_training in [True, False]:
+      x_shape = [0, 7, 11, 4]
+      for dtype in [np.float16, np.float32]:
+        if test.is_gpu_available(cuda_only=True):
+          self._test_gradient(
+              x_shape,
+              dtype, [7],
+              np.float32,
+              use_gpu=True,
+              data_format='NCHW',
+              is_training=is_training)
+          self._test_gradient(
+              x_shape,
+              dtype, [4],
+              np.float32,
+              use_gpu=True,
+              data_format='NHWC',
+              is_training=is_training)
+        self._test_gradient(
+            x_shape,
+            dtype, [4],
+            np.float32,
+            use_gpu=False,
+            data_format='NHWC',
+            is_training=is_training)
+
   def _testBatchNormGradGrad(self, config):
     shape = config['shape']
     err_tolerance = config['err_tolerance']
@@ -515,26 +581,37 @@ class BatchNormalizationTest(test.TestCase):
           is_training=is_training,
           err_tolerance=err_tolerance)
 
-  def testBatchNormGradGrad(self):
-    configs = [{
+  def testBatchNormGradGradConfig1(self):
+    config = {
         'shape': [2, 3, 4, 5],
         'err_tolerance': 1e-2,
         'dtype': np.float32,
-    }, {
+    }
+    self._testBatchNormGradGrad(config)
+
+  def testBatchNormGradGradConfig2(self):
+    config = {
         'shape': [2, 3, 2, 2],
         'err_tolerance': 1e-3,
         'dtype': np.float32,
-    }, {
+    }
+    self._testBatchNormGradGrad(config)
+
+  def testBatchNormGradGradConfig3(self):
+    config = {
         'shape': [2, 3, 4, 5],
         'err_tolerance': 1e-2,
         'dtype': np.float16,
-    }, {
+    }
+    self._testBatchNormGradGrad(config)
+
+  def testBatchNormGradGradConfig4(self):
+    config = {
         'shape': [2, 3, 2, 2],
         'err_tolerance': 2e-3,
         'dtype': np.float16,
-    }]
-    for config in configs:
-      self._testBatchNormGradGrad(config)
+    }
+    self._testBatchNormGradGrad(config)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 4b406ba8404d60fbed43afa30f44b1e1a9b26d84..dc24b821a5580e3581f153f3cbf63ad2868b8a18 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -41,85 +41,111 @@ def _Conv2DBackpropInputGrad(op, grad):
   Returns:
     the gradients w.r.t. the input and the filter
   """
-  return [None,
-          nn_ops.conv2d_backprop_filter(grad, array_ops.shape(op.inputs[1]),
-                                        op.inputs[2], op.get_attr("strides"),
-                                        op.get_attr("padding"),
-                                        op.get_attr("use_cudnn_on_gpu"),
-                                        op.get_attr("data_format")),
-          nn_ops.conv2d(grad, op.inputs[1], op.get_attr("strides"),
-                        op.get_attr("padding"), op.get_attr("use_cudnn_on_gpu"),
-                        op.get_attr("data_format"))]
+  return [
+      None,
+      nn_ops.conv2d_backprop_filter(
+          grad,
+          array_ops.shape(op.inputs[1]),
+          op.inputs[2],
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
+          data_format=op.get_attr("data_format")),
+      nn_ops.conv2d(
+          grad,
+          op.inputs[1],
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
+          data_format=op.get_attr("data_format"))
+  ]
 
 
 @ops.RegisterGradient("Conv2DBackpropFilter")
 def _Conv2DBackpropFilterGrad(op, grad):
   return [
       nn_ops.conv2d_backprop_input(
-          array_ops.shape(op.inputs[0]), grad, op.inputs[2],
-          op.get_attr("strides"),
-          op.get_attr("padding"),
-          op.get_attr("use_cudnn_on_gpu"),
-          op.get_attr("data_format")),
-      None,
+          array_ops.shape(op.inputs[0]),
+          grad,
+          op.inputs[2],
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
+          data_format=op.get_attr("data_format")), None,
       nn_ops.conv2d(
-          op.inputs[0], grad,
-          op.get_attr("strides"),
-          op.get_attr("padding"),
-          op.get_attr("use_cudnn_on_gpu"),
-          op.get_attr("data_format"))
+          op.inputs[0],
+          grad,
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
+          data_format=op.get_attr("data_format"))
   ]
 
 
 @ops.RegisterGradient("Conv3D")
 def _Conv3DGrad(op, grad):
   data_format = op.get_attr("data_format")
-  return [nn_ops.conv3d_backprop_input_v2(array_ops.shape(op.inputs[0]),
-                                          op.inputs[1],
-                                          grad,
-                                          strides=op.get_attr("strides"),
-                                          padding=op.get_attr("padding"),
-                                          data_format=data_format),
-          nn_ops.conv3d_backprop_filter_v2(op.inputs[0],
-                                           array_ops.shape(op.inputs[1]),
-                                           grad,
-                                           strides=op.get_attr("strides"),
-                                           padding=op.get_attr("padding"),
-                                           data_format=data_format)]
+  return [
+      nn_ops.conv3d_backprop_input_v2(
+          array_ops.shape(op.inputs[0]),
+          op.inputs[1],
+          grad,
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=data_format),
+      nn_ops.conv3d_backprop_filter_v2(
+          op.inputs[0],
+          array_ops.shape(op.inputs[1]),
+          grad,
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=data_format)
+  ]
 
 
 @ops.RegisterGradient("Conv3DBackpropInputV2")
 def _Conv3DBackpropInputGrad(op, grad):
   data_format = op.get_attr("data_format")
-  return [None,
-          nn_ops.conv3d_backprop_filter_v2(grad,
-                                           array_ops.shape(op.inputs[1]),
-                                           op.inputs[2],
-                                           strides=op.get_attr("strides"),
-                                           padding=op.get_attr("padding"),
-                                           data_format=data_format),
-          nn_ops.conv3d(grad,
-                        op.inputs[1],
-                        strides=op.get_attr("strides"),
-                        padding=op.get_attr("padding"),
-                        data_format=data_format)]
+  return [
+      None,
+      nn_ops.conv3d_backprop_filter_v2(
+          grad,
+          array_ops.shape(op.inputs[1]),
+          op.inputs[2],
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=data_format),
+      nn_ops.conv3d(
+          grad,
+          op.inputs[1],
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=data_format)
+  ]
 
 
 @ops.RegisterGradient("Conv3DBackpropFilterV2")
 def _Conv3DBackpropFilterGrad(op, grad):
   data_format = op.get_attr("data_format")
-  return [nn_ops.conv3d_backprop_input_v2(array_ops.shape(op.inputs[0]),
-                                          grad,
-                                          op.inputs[2],
-                                          strides=op.get_attr("strides"),
-                                          padding=op.get_attr("padding"),
-                                          data_format=data_format),
-          None,
-          nn_ops.conv3d(op.inputs[0],
-                        grad,
-                        strides=op.get_attr("strides"),
-                        padding=op.get_attr("padding"),
-                        data_format=data_format)]
+  return [
+      nn_ops.conv3d_backprop_input_v2(
+          array_ops.shape(op.inputs[0]),
+          grad,
+          op.inputs[2],
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=data_format), None,
+      nn_ops.conv3d(
+          op.inputs[0],
+          grad,
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=data_format)
+  ]
 
 
 @ops.RegisterGradient("AvgPool3D")
@@ -135,12 +161,13 @@ def _AvgPool3DGrad(op, grad):
 
 @ops.RegisterGradient("AvgPool3DGrad")
 def _AvgPool3DGradGrad(op, grad):
-  return (array_ops.stop_gradient(op.inputs[0]), gen_nn_ops.avg_pool3d(
-      grad,
-      op.get_attr("ksize"),
-      op.get_attr("strides"),
-      op.get_attr("padding"),
-      data_format=op.get_attr("data_format")))
+  return (array_ops.stop_gradient(op.inputs[0]),
+          gen_nn_ops.avg_pool3d(
+              grad,
+              op.get_attr("ksize"),
+              op.get_attr("strides"),
+              op.get_attr("padding"),
+              data_format=op.get_attr("data_format")))
 
 
 @ops.RegisterGradient("MaxPool3D")
@@ -158,9 +185,9 @@ def _MaxPool3DGrad(op, grad):
 @ops.RegisterGradient("MaxPool3DGrad")
 def _MaxPool3DGradGrad(op, grad):
   return (array_ops.zeros(
-      shape=array_ops.shape(op.inputs[0]),
-      dtype=op.inputs[0].dtype), array_ops.zeros(
-          shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+      shape=array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype),
+          array_ops.zeros(
+              shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
           gen_nn_ops._max_pool3d_grad_grad(
               op.inputs[0],
               op.inputs[1],
@@ -174,9 +201,9 @@ def _MaxPool3DGradGrad(op, grad):
 @ops.RegisterGradient("MaxPool3DGradGrad")
 def _MaxPool3DGradGradGrad(op, grad):
   return (array_ops.zeros(
-      shape=array_ops.shape(op.inputs[0]),
-      dtype=op.inputs[0].dtype), array_ops.zeros(
-          shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+      shape=array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype),
+          array_ops.zeros(
+              shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
           gen_nn_ops._max_pool3d_grad(
               op.inputs[0],
               op.inputs[1],
@@ -231,7 +258,7 @@ def _LogSoftmaxGrad(op, grad):
     The gradients w.r.t. the input.
   """
   softmax = math_ops.exp(op.outputs[0])
-  return grad - math_ops.reduce_sum(grad, 1, keep_dims=True) * softmax
+  return grad - math_ops.reduce_sum(grad, 1, keepdims=True) * softmax
 
 
 @ops.RegisterGradient("BiasAdd")
@@ -257,8 +284,9 @@ def _BiasAddGrad(op, received_grad):
     data_format = op.get_attr("data_format")
   except ValueError:
     data_format = None
-  return (received_grad, gen_nn_ops.bias_add_grad(out_backprop=received_grad,
-                                                  data_format=data_format))
+  return (received_grad,
+          gen_nn_ops.bias_add_grad(
+              out_backprop=received_grad, data_format=data_format))
 
 
 @ops.RegisterGradient("BiasAddGrad")
@@ -331,10 +359,9 @@ def _ReluGrad(op, grad):
 def _EluGradGrad(op, grad):
   elu_x = op.inputs[1]
   return (gen_nn_ops._elu_grad(grad, op.outputs[0]),
-          array_ops.where(elu_x < 0,
-                          grad * op.inputs[0],
-                          array_ops.zeros(shape=array_ops.shape(elu_x),
-                                          dtype=elu_x.dtype)))
+          array_ops.where(elu_x < 0, grad * op.inputs[0],
+                          array_ops.zeros(
+                              shape=array_ops.shape(elu_x), dtype=elu_x.dtype)))
 
 
 @ops.RegisterGradient("SeluGrad")
@@ -342,9 +369,11 @@ def _SeluGradGrad(op, grad):
   x = op.inputs[1]
   scale_alpha = 1.7580993408473768599402175208123
   return (gen_nn_ops._elu_grad(grad, op.outputs[0]),
-          array_ops.where(
-              x < 0., gen_nn_ops._elu_grad(grad, op.outputs[0] + scale_alpha),
-              array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype)))
+          array_ops.where(x < 0.,
+                          gen_nn_ops._elu_grad(grad,
+                                               op.outputs[0] + scale_alpha),
+                          array_ops.zeros(
+                              shape=array_ops.shape(x), dtype=x.dtype)))
 
 
 @ops.RegisterGradient("Relu6")
@@ -355,8 +384,8 @@ def _Relu6Grad(op, grad):
 @ops.RegisterGradient("Relu6Grad")
 def _Relu6GradGrad(op, grad):
   x = op.inputs[1]
-  return (gen_nn_ops._relu6_grad(grad, x), array_ops.zeros(
-      shape=array_ops.shape(x), dtype=x.dtype))
+  return (gen_nn_ops._relu6_grad(grad, x),
+          array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype))
 
 
 @ops.RegisterGradient("Elu")
@@ -395,8 +424,8 @@ def _SoftsignGrad(op, grad):
 @ops.RegisterGradient("ReluGrad")
 def _ReluGradGrad(op, grad):
   x = op.inputs[1]
-  return (gen_nn_ops._relu_grad(grad, x), array_ops.zeros(
-      shape=array_ops.shape(x), dtype=x.dtype))
+  return (gen_nn_ops._relu_grad(grad, x),
+          array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype))
 
 
 def _BroadcastMul(vec, mat):
@@ -440,8 +469,8 @@ def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
     softmax = nn_ops.softmax(logits)
 
     grad += ((grad_grad - array_ops.squeeze(
-        math_ops.matmul(grad_grad[:, None, :],
-                        softmax[:, :, None]), axis=1)) * softmax)
+        math_ops.matmul(grad_grad[:, None, :], softmax[:, :, None]), axis=1)) *
+             softmax)
 
   return grad, _BroadcastMul(grad_loss, -nn_ops.log_softmax(logits))
 
@@ -458,7 +487,8 @@ def _SparseSoftmaxCrossEntropyWithLogitsGrad(op, grad_0, _):
   # so we make sure we prevent silently incorrect results by raising
   # an error if the second derivative is requested via prevent_gradient.
   sparse_softmax_grad_without_gradient = array_ops.prevent_gradient(
-      op.outputs[1], message="Currently there is no way to take the second "
+      op.outputs[1],
+      message="Currently there is no way to take the second "
       "derivative of sparse_softmax_cross_entropy_with_logits due to the fused "
       "implementation's interaction with tf.gradients()")
   return _BroadcastMul(grad_0, sparse_softmax_grad_without_gradient), None
@@ -466,25 +496,32 @@ def _SparseSoftmaxCrossEntropyWithLogitsGrad(op, grad_0, _):
 
 @ops.RegisterGradient("Conv2D")
 def _Conv2DGrad(op, grad):
+  dilations = op.get_attr("dilations")
   strides = op.get_attr("strides")
   padding = op.get_attr("padding")
   use_cudnn_on_gpu = op.get_attr("use_cudnn_on_gpu")
   data_format = op.get_attr("data_format")
   shape_0, shape_1 = array_ops.shape_n([op.inputs[0], op.inputs[1]])
-  return [nn_ops.conv2d_backprop_input(shape_0,
-                                       op.inputs[1],
-                                       grad,
-                                       strides,
-                                       padding,
-                                       use_cudnn_on_gpu,
-                                       data_format),
-          nn_ops.conv2d_backprop_filter(op.inputs[0],
-                                        shape_1,
-                                        grad,
-                                        strides,
-                                        padding,
-                                        use_cudnn_on_gpu,
-                                        data_format)]
+  return [
+      nn_ops.conv2d_backprop_input(
+          shape_0,
+          op.inputs[1],
+          grad,
+          dilations=dilations,
+          strides=strides,
+          padding=padding,
+          use_cudnn_on_gpu=use_cudnn_on_gpu,
+          data_format=data_format),
+      nn_ops.conv2d_backprop_filter(
+          op.inputs[0],
+          shape_1,
+          grad,
+          dilations=dilations,
+          strides=strides,
+          padding=padding,
+          use_cudnn_on_gpu=use_cudnn_on_gpu,
+          data_format=data_format)
+  ]
 
 
 @ops.RegisterGradient("DepthwiseConv2dNative")
@@ -509,14 +546,16 @@ def _DepthwiseConv2dNativeGrad(op, grad):
 
 @ops.RegisterGradient("Dilation2D")
 def _Dilation2DGrad(op, grad):
-  return [nn_ops.dilation2d_backprop_input(op.inputs[0], op.inputs[1], grad,
-                                           op.get_attr("strides"),
-                                           op.get_attr("rates"),
-                                           op.get_attr("padding")),
-          nn_ops.dilation2d_backprop_filter(op.inputs[0], op.inputs[1], grad,
-                                            op.get_attr("strides"),
-                                            op.get_attr("rates"),
-                                            op.get_attr("padding"))]
+  return [
+      nn_ops.dilation2d_backprop_input(op.inputs[0], op.inputs[1], grad,
+                                       op.get_attr("strides"),
+                                       op.get_attr("rates"),
+                                       op.get_attr("padding")),
+      nn_ops.dilation2d_backprop_filter(op.inputs[0], op.inputs[1], grad,
+                                        op.get_attr("strides"),
+                                        op.get_attr("rates"),
+                                        op.get_attr("padding"))
+  ]
 
 
 @ops.RegisterGradient("LRN")
@@ -525,8 +564,10 @@ def _LRNGrad(op, grad):
   bias = op.get_attr("bias")
   alpha = op.get_attr("alpha")
   beta = op.get_attr("beta")
-  return [gen_nn_ops._lrn_grad(grad, op.inputs[0], op.outputs[0], depth_radius,
-                               bias, alpha, beta)]
+  return [
+      gen_nn_ops._lrn_grad(grad, op.inputs[0], op.outputs[0], depth_radius,
+                           bias, alpha, beta)
+  ]
 
 
 @ops.RegisterGradient("AvgPool")
@@ -542,54 +583,58 @@ def _AvgPoolGrad(op, grad):
 
 @ops.RegisterGradient("AvgPoolGrad")
 def _AvgPoolGradGrad(op, grad):
-  return (array_ops.stop_gradient(op.inputs[0]), gen_nn_ops._avg_pool(
-      grad,
-      op.get_attr("ksize"),
-      op.get_attr("strides"),
-      op.get_attr("padding"),
-      data_format=op.get_attr("data_format")))
+  return (array_ops.stop_gradient(op.inputs[0]),
+          gen_nn_ops._avg_pool(
+              grad,
+              op.get_attr("ksize"),
+              op.get_attr("strides"),
+              op.get_attr("padding"),
+              data_format=op.get_attr("data_format")))
 
 
 @ops.RegisterGradient("MaxPool")
 def _MaxPoolGrad(op, grad):
-  return gen_nn_ops._max_pool_grad(op.inputs[0],
-                                   op.outputs[0],
-                                   grad,
-                                   op.get_attr("ksize"),
-                                   op.get_attr("strides"),
-                                   padding=op.get_attr("padding"),
-                                   data_format=op.get_attr("data_format"))
+  return gen_nn_ops._max_pool_grad(
+      op.inputs[0],
+      op.outputs[0],
+      grad,
+      op.get_attr("ksize"),
+      op.get_attr("strides"),
+      padding=op.get_attr("padding"),
+      data_format=op.get_attr("data_format"))
 
 
 @ops.RegisterGradient("MaxPoolV2")
 def _MaxPoolGradV2(op, grad):
   ksize = op.inputs[1]
   strides = op.inputs[2]
-  return gen_nn_ops.max_pool_grad_v2(op.inputs[0],
-                                     op.outputs[0],
-                                     grad,
-                                     ksize,
-                                     strides,
-                                     padding=op.get_attr("padding"),
-                                     data_format=op.get_attr("data_format")), None, None
+  return gen_nn_ops.max_pool_grad_v2(
+      op.inputs[0],
+      op.outputs[0],
+      grad,
+      ksize,
+      strides,
+      padding=op.get_attr("padding"),
+      data_format=op.get_attr("data_format")), None, None
 
 
 @ops.RegisterGradient("MaxPoolWithArgmax")
 def _MaxPoolGradWithArgmax(op, grad, unused_argmax_grad):
-  return gen_nn_ops._max_pool_grad_with_argmax(op.inputs[0],
-                                               grad,
-                                               op.outputs[1],
-                                               op.get_attr("ksize"),
-                                               op.get_attr("strides"),
-                                               padding=op.get_attr("padding"))
+  return gen_nn_ops._max_pool_grad_with_argmax(
+      op.inputs[0],
+      grad,
+      op.outputs[1],
+      op.get_attr("ksize"),
+      op.get_attr("strides"),
+      padding=op.get_attr("padding"))
 
 
 @ops.RegisterGradient("MaxPoolGrad")
 def _MaxPoolGradGrad(op, grad):
   return (array_ops.zeros(
-      shape=array_ops.shape(op.inputs[0]),
-      dtype=op.inputs[0].dtype), array_ops.zeros(
-          shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+      shape=array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype),
+          array_ops.zeros(
+              shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
           gen_nn_ops._max_pool_grad_grad(
               op.inputs[0],
               op.inputs[1],
@@ -605,9 +650,9 @@ def _MaxPoolGradGradV2(op, grad):
   ksize = op.inputs[3]
   strides = op.inputs[4]
   return (array_ops.zeros(
-      shape=array_ops.shape(op.inputs[0]),
-      dtype=op.inputs[0].dtype), array_ops.zeros(
-          shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+      shape=array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype),
+          array_ops.zeros(
+              shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
           gen_nn_ops.max_pool_grad_grad_v2(
               op.inputs[0],
               op.inputs[1],
@@ -621,9 +666,9 @@ def _MaxPoolGradGradV2(op, grad):
 @ops.RegisterGradient("MaxPoolGradGrad")
 def _MaxPoolGradGradGrad(op, grad):
   return (array_ops.zeros(
-      shape=array_ops.shape(op.inputs[0]),
-      dtype=op.inputs[0].dtype), array_ops.zeros(
-          shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+      shape=array_ops.shape(op.inputs[0]), dtype=op.inputs[0].dtype),
+          array_ops.zeros(
+              shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
           gen_nn_ops._max_pool_grad(
               op.inputs[0],
               op.inputs[1],
@@ -652,10 +697,9 @@ def _FractionalMaxPoolGrad(op, grad_0, unused_grad_1, unused_grad_2):
     Input backprop for FractionalMaxPool op.
   """
   # pylint: disable=protected-access
-  return gen_nn_ops._fractional_max_pool_grad(op.inputs[0], op.outputs[0],
-                                              grad_0, op.outputs[1],
-                                              op.outputs[2],
-                                              op.get_attr("overlapping"))
+  return gen_nn_ops._fractional_max_pool_grad(
+      op.inputs[0], op.outputs[0], grad_0, op.outputs[1], op.outputs[2],
+      op.get_attr("overlapping"))
 
 
 @ops.RegisterGradient("FractionalAvgPool")
@@ -739,8 +783,9 @@ def _BaseFusedBatchNormGrad(op, use_v2, *grad):
   epsilon = op.get_attr("epsilon")
   data_format = op.get_attr("data_format")
   is_training = op.get_attr("is_training")
-  grad_fun = (gen_nn_ops.fused_batch_norm_grad_v2 if use_v2
-              else gen_nn_ops.fused_batch_norm_grad)
+  grad_fun = (
+      gen_nn_ops.fused_batch_norm_grad_v2
+      if use_v2 else gen_nn_ops.fused_batch_norm_grad)
   if is_training:
     return grad_fun(
         grad_y,
@@ -764,7 +809,7 @@ def _BaseFusedBatchNormGrad(op, use_v2, *grad):
         pop_mean,
         pop_var,
         epsilon=epsilon,
-        data_format='NHWC',
+        data_format="NHWC",
         is_training=is_training)
     if data_format == b"NCHW":
       dx = array_ops.transpose(dx, [0, 3, 1, 2])
@@ -781,18 +826,28 @@ def _FusedBatchNormV2Grad(op, *grad):
   return _BaseFusedBatchNormGrad(op, True, *grad)
 
 
-def _BatchNormGrad(grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training=True):
+def _BatchNormGrad(grad_y,
+                   x,
+                   scale,
+                   pop_mean,
+                   pop_var,
+                   epsilon,
+                   data_format,
+                   is_training=True):
   """Returns the gradients for the 3 inputs of BatchNorm.
 
   Args:
     grad_y: A `Tensor` of 4 dimensions for gradient for y.
     x: A `Tensor` of 4 dimensions for x.
     scale: A `Tensor` of 1 dimension for scaling.
-    pop_mean: A `Tensor` of 1 dimension for the population mean. Only used when is_training=False.
-    pop_var: A `Tensor` of 1 dimension for the population variance. Only used when is_training=False.
+    pop_mean: A `Tensor` of 1 dimension for the population mean. Only used when
+      is_training=False.
+    pop_var: A `Tensor` of 1 dimension for the population variance. Only used
+      when is_training=False.
     epsilon: A small float number added to the variance of x.
     data_format: The data format for input. Either b"NHWC" or b"NCHW".
-    is_training: A bool value to indicate the operation is for training (default)
+    is_training: A bool value to indicate the operation is for training
+      (default)
         or inference.
 
   Returns:
@@ -808,27 +863,27 @@ def _BatchNormGrad(grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is
     grad_y = math_ops.cast(grad_y, dtypes.float32)
   if is_training:
     if data_format == b"NHWC":
-      keep_dims = False
+      keepdims = False
       reduce_axis = [0, 1, 2]
     else:
-      keep_dims = True
+      keepdims = True
       reduce_axis = [0, 2, 3]
       shape = [1, array_ops.size(scale), 1, 1]
       scale = array_ops.reshape(scale, shape)
-    mean_grad_y = math_ops.reduce_mean(grad_y, reduce_axis, keep_dims=keep_dims)
-    mean_x = math_ops.reduce_mean(x, reduce_axis, keep_dims=keep_dims)
+    mean_grad_y = math_ops.reduce_mean(grad_y, reduce_axis, keepdims=keepdims)
+    mean_x = math_ops.reduce_mean(x, reduce_axis, keepdims=keepdims)
     var_x = math_ops.reduce_mean(
         math_ops.squared_difference(x, array_ops.stop_gradient(mean_x)),
         reduce_axis,
-        keep_dims=keep_dims)
+        keepdims=keepdims)
     grad_y_offset = grad_y - mean_grad_y
     x_offset = x - mean_x
     mean = math_ops.reduce_mean(
-        grad_y * x_offset, axis=reduce_axis, keep_dims=keep_dims)
+        grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
     grad_x = scale * math_ops.rsqrt(var_x + epsilon) * (
         grad_y_offset - math_ops.reciprocal(var_x + epsilon) * mean * x_offset)
     grad_scale = math_ops.rsqrt(var_x + epsilon) * math_ops.reduce_sum(
-        grad_y * x_offset, axis=reduce_axis, keep_dims=keep_dims)
+        grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
     if data_format == b"NCHW":
       grad_scale = array_ops.squeeze(grad_scale)
     grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
@@ -878,7 +933,7 @@ def _FusedBatchNormGradGrad(op, *grad):
   grad_grad_scale = grad[1]
   grad_grad_offset = grad[2]
   grad_x, grad_scale, grad_offset = _BatchNormGrad(
-        grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training)
+      grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training)
   grad_initial = [grad_grad_x, grad_grad_scale, grad_grad_offset]
   grad_grad_y, grad_x, grad_scale = gradients_impl.gradients(
       [grad_x, grad_scale, grad_offset], [grad_y, x, scale], grad_initial)
@@ -932,14 +987,15 @@ def _TopKGrad(op, grad, _):
 
   # Substitute grad to appropriate locations and fill the rest with zeros,
   # finally reshaping it to the original input shape.
-  return [array_ops.reshape(
-      sparse_ops.sparse_to_dense(ind,
-                                 array_ops.reshape(
-                                     math_ops.reduce_prod(in_shape), [1]),
-                                 array_ops.reshape(grad, [-1]),
-                                 validate_indices=False),
-      in_shape), array_ops.zeros(
-          [], dtype=dtypes.int32)]
+  return [
+      array_ops.reshape(
+          sparse_ops.sparse_to_dense(
+              ind,
+              array_ops.reshape(math_ops.reduce_prod(in_shape), [1]),
+              array_ops.reshape(grad, [-1]),
+              validate_indices=False), in_shape),
+      array_ops.zeros([], dtype=dtypes.int32)
+  ]
 
 
 @ops.RegisterGradient("NthElement")
@@ -954,18 +1010,16 @@ def _NthElementGrad(op, grad):
     A list of two tensors, the first being the gradient w.r.t. the input,
     the second being the gradient w.r.t. the N (None).
   """
-  input = op.inputs[0]
+  input = op.inputs[0]  # pylint: disable=redefined-builtin
   output = op.outputs[0]
 
   # Compute the number of elements which equal to output in each reduction
   # dimension. If there are multiple elements then the gradient will be
   # divided between them.
   indicators = math_ops.cast(
-      math_ops.equal(array_ops.expand_dims(output, -1), input),
-      grad.dtype)
+      math_ops.equal(array_ops.expand_dims(output, -1), input), grad.dtype)
 
   grad = array_ops.expand_dims(grad, -1)
-  num_selected = array_ops.expand_dims(
-      math_ops.reduce_sum(indicators, -1), -1)
+  num_selected = array_ops.expand_dims(math_ops.reduce_sum(indicators, -1), -1)
 
   return [math_ops.div(indicators, num_selected) * grad, None]
diff --git a/tensorflow/python/ops/nn_grad_test.py b/tensorflow/python/ops/nn_grad_test.py
index f7541c0e892819beaf27ad97d7d41b8f963a4ab9..49d54beb20073162279576e1e1011e10392378e0 100644
--- a/tensorflow/python/ops/nn_grad_test.py
+++ b/tensorflow/python/ops/nn_grad_test.py
@@ -24,23 +24,26 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import nn_grad
+from tensorflow.python.ops import nn_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import test
 
 
 class Relu6OpTest(test.TestCase):
+
   def testRelu6GradGrad(self):
-    inputs = constant_op.constant([[-2, -1, 1, 3], [5, 7, 8, 9]],
-                                  dtype=dtypes.float32)
+    inputs = constant_op.constant(
+        [[-2, -1, 1, 3], [5, 7, 8, 9]], dtype=dtypes.float32)
     x_init_value = np.array([[-3.5, -1.5, 2, 4], [4.5, 7.5, 8.5, 11]])
     r = nn_ops.relu6(inputs)
     r_g = gradients_impl.gradients(r, inputs)[0]
     with self.test_session():
       error = gradient_checker.compute_gradient_error(
-        inputs, inputs.get_shape().as_list(),
-        r_g, r_g.get_shape().as_list(),
-        x_init_value=x_init_value)
+          inputs,
+          inputs.get_shape().as_list(),
+          r_g,
+          r_g.get_shape().as_list(),
+          x_init_value=x_init_value)
       self.assertLess(error, 1e-4)
 
 
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 00e3c7dc0f30a9b37f742917fde2f3a58b60ba64..5fa5708114fd5cda6afbca78fa0debf68f0252cc 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import candidate_sampling_ops
 from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import gen_array_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
@@ -34,8 +35,10 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.util.deprecation import deprecated_args
 from tensorflow.python.util.deprecation import deprecated_argument_lookup
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("nn.log_poisson_loss")
 def log_poisson_loss(targets, log_input, compute_full_loss=False, name=None):
   """Computes log Poisson loss given `log_input`.
 
@@ -100,6 +103,7 @@ def log_poisson_loss(targets, log_input, compute_full_loss=False, name=None):
     return result
 
 
+@tf_export("nn.sigmoid_cross_entropy_with_logits")
 def sigmoid_cross_entropy_with_logits(  # pylint: disable=invalid-name
     _sentinel=None,
     labels=None,
@@ -179,6 +183,7 @@ def sigmoid_cross_entropy_with_logits(  # pylint: disable=invalid-name
         name=name)
 
 
+@tf_export("nn.weighted_cross_entropy_with_logits")
 def weighted_cross_entropy_with_logits(targets, logits, pos_weight, name=None):
   """Computes a weighted cross entropy.
 
@@ -191,7 +196,13 @@ def weighted_cross_entropy_with_logits(targets, logits, pos_weight, name=None):
       targets * -log(sigmoid(logits)) +
           (1 - targets) * -log(1 - sigmoid(logits))
 
-  The argument `pos_weight` is used as a multiplier for the positive targets:
+  A value `pos_weights > 1` decreases the false negative count, hence increasing
+  the recall.
+  Conversely setting `pos_weights < 1` decreases the false positive count and
+  increases the precision.
+  This can be seen from the fact that `pos_weight` is introduced as a
+  multiplicative coefficient for the positive targets term
+  in the loss expression:
 
       targets * -log(sigmoid(logits)) * pos_weight +
           (1 - targets) * -log(1 - sigmoid(logits))
@@ -250,6 +261,7 @@ def weighted_cross_entropy_with_logits(targets, logits, pos_weight, name=None):
         name=name)
 
 
+@tf_export("nn.relu_layer")
 def relu_layer(x, weights, biases, name=None):
   """Computes Relu(x * weight + biases).
 
@@ -296,6 +308,7 @@ def _swish_grad(features, grad):
     shape_func=_swish_shape,
     func_name="swish",
     noinline=True)
+@tf_export("nn.swish")
 def swish(features):
   # pylint: disable=g-doc-args
   """Computes the Swish activation function: `x * sigmoid(x)`.
@@ -315,6 +328,7 @@ def swish(features):
   return features * math_ops.sigmoid(features)
 
 
+@tf_export("nn.l2_normalize")
 @deprecated_args(None, "dim is deprecated, use axis instead", "dim")
 def l2_normalize(x, axis=None, epsilon=1e-12, name=None, dim=None):
   """Normalizes along dimension `axis` using an L2 norm.
@@ -341,11 +355,12 @@ def l2_normalize(x, axis=None, epsilon=1e-12, name=None, dim=None):
   with ops.name_scope(name, "l2_normalize", [x]) as name:
     axis = deprecated_argument_lookup("axis", axis, "dim", dim)
     x = ops.convert_to_tensor(x, name="x")
-    square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keep_dims=True)
+    square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keepdims=True)
     x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, epsilon))
     return math_ops.multiply(x, x_inv_norm, name=name)
 
 
+@tf_export("nn.zero_fraction")
 def zero_fraction(value, name=None):
   """Returns the fraction of zeros in `value`.
 
@@ -373,6 +388,7 @@ def zero_fraction(value, name=None):
 
 
 # pylint: disable=redefined-builtin
+@tf_export("nn.depthwise_conv2d")
 def depthwise_conv2d(input,
                      filter,
                      strides,
@@ -449,6 +465,7 @@ def depthwise_conv2d(input,
 
 
 # pylint: disable=redefined-builtin,line-too-long
+@tf_export("nn.separable_conv2d")
 def separable_conv2d(input,
                      depthwise_filter,
                      pointwise_filter,
@@ -549,6 +566,7 @@ def separable_conv2d(input,
 # pylint: enable=redefined-builtin,line-too-long
 
 
+@tf_export("nn.sufficient_statistics")
 def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
   """Calculate the sufficient statistics for the mean and variance of `x`.
 
@@ -593,11 +611,12 @@ def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
     else:  # no shift.
       m_ss = x
       v_ss = math_ops.square(x)
-    m_ss = math_ops.reduce_sum(m_ss, axes, keep_dims=keep_dims, name="mean_ss")
-    v_ss = math_ops.reduce_sum(v_ss, axes, keep_dims=keep_dims, name="var_ss")
+    m_ss = math_ops.reduce_sum(m_ss, axes, keepdims=keep_dims, name="mean_ss")
+    v_ss = math_ops.reduce_sum(v_ss, axes, keepdims=keep_dims, name="var_ss")
   return counts, m_ss, v_ss, shift
 
 
+@tf_export("nn.normalize_moments")
 def normalize_moments(counts, mean_ss, variance_ss, shift, name=None):
   """Calculate the mean and variance of based on the sufficient statistics.
 
@@ -629,9 +648,13 @@ def normalize_moments(counts, mean_ss, variance_ss, shift, name=None):
   return (mean, variance)
 
 
-def moments(x, axes,
-            shift=None,  # pylint: disable=unused-argument
-            name=None, keep_dims=False):
+@tf_export("nn.moments")
+def moments(
+    x,
+    axes,
+    shift=None,  # pylint: disable=unused-argument
+    name=None,
+    keep_dims=False):
   """Calculate the mean and variance of `x`.
 
   The mean and variance are calculated by aggregating the contents of `x`
@@ -664,23 +687,24 @@ def moments(x, axes,
     # on 32-bit floats before converting the mean and variance back to fp16
     y = math_ops.cast(x, dtypes.float32) if x.dtype == dtypes.float16 else x
     # Compute true mean while keeping the dims for proper broadcasting.
-    mean = math_ops.reduce_mean(y, axes, keep_dims=True, name="mean")
+    mean = math_ops.reduce_mean(y, axes, keepdims=True, name="mean")
     # sample variance, not unbiased variance
     variance = math_ops.reduce_mean(
         math_ops.squared_difference(y, array_ops.stop_gradient(mean)),
         axes,
-        keep_dims=True,
+        keepdims=True,
         name="variance")
     if not keep_dims:
       mean = array_ops.squeeze(mean, axes)
       variance = array_ops.squeeze(variance, axes)
     if x.dtype == dtypes.float16:
-      return (math_ops.cast(mean, dtypes.float16), math_ops.cast(
-          variance, dtypes.float16))
+      return (math_ops.cast(mean, dtypes.float16),
+              math_ops.cast(variance, dtypes.float16))
     else:
       return (mean, variance)
 
 
+@tf_export("nn.weighted_moments")
 def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
   """Returns the frequency-weighted mean and variance of `x`.
 
@@ -714,7 +738,7 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
     # Note that we use keep_dims=True for our reductions regardless of the arg;
     # this is so that the results remain broadcast-compatible with the inputs.
     weighted_input_sum = math_ops.reduce_sum(
-        frequency_weights * x, axes, name="weighted_input_sum", keep_dims=True)
+        frequency_weights * x, axes, name="weighted_input_sum", keepdims=True)
 
     # The shape of the weights isn't necessarily the same as x's
     # shape, just broadcast-compatible with it -- so this expression
@@ -725,7 +749,7 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
     broadcasted_weights = frequency_weights + array_ops.zeros_like(x)
 
     sum_of_weights = math_ops.reduce_sum(
-        broadcasted_weights, axes, name="sum_of_weights", keep_dims=True)
+        broadcasted_weights, axes, name="sum_of_weights", keepdims=True)
 
     divisor = math_ops.reciprocal(sum_of_weights, name="inv_weight_sum")
 
@@ -736,7 +760,7 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
         frequency_weights * math_ops.squared_difference(x, weighted_mean),
         axes,
         name="weighted_distsq",
-        keep_dims=True)
+        keepdims=True)
 
     weighted_variance = math_ops.multiply(weighted_distsq, divisor)
 
@@ -752,6 +776,7 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
     return weighted_mean, weighted_variance
 
 
+@tf_export("nn.batch_normalization")
 def batch_normalization(x,
                         mean,
                         variance,
@@ -805,10 +830,11 @@ def batch_normalization(x,
     inv = math_ops.rsqrt(variance + variance_epsilon)
     if scale is not None:
       inv *= scale
-    return x * inv + (offset - mean * inv
-                      if offset is not None else -mean * inv)
+    return x * inv + (
+        offset - mean * inv if offset is not None else -mean * inv)
 
 
+@tf_export("nn.fused_batch_norm")
 def fused_batch_norm(
     x,
     scale,
@@ -863,7 +889,7 @@ def fused_batch_norm(
   # currently only use the V2 version for float16 inputs, which is not supported
   # by the V1 version.
   # pylint: disable=protected-access
-  if x.dtype == dtypes.float16:
+  if x.dtype == dtypes.float16 or x.dtype == dtypes.bfloat16:
     fused_batch_norm_func = gen_nn_ops._fused_batch_norm_v2
   else:
     fused_batch_norm_func = gen_nn_ops._fused_batch_norm
@@ -881,6 +907,7 @@ def fused_batch_norm(
   return y, batch_mean, batch_var
 
 
+@tf_export("nn.batch_norm_with_global_normalization")
 def batch_norm_with_global_normalization(t,
                                          m,
                                          v,
@@ -942,7 +969,8 @@ def _compute_sampled_logits(weights,
                             subtract_log_q=True,
                             remove_accidental_hits=False,
                             partition_strategy="mod",
-                            name=None):
+                            name=None,
+                            seed=None):
   """Helper function for nce_loss and sampled_softmax_loss functions.
 
   Computes sampled output training logits and labels suitable for implementing
@@ -980,11 +1008,14 @@ def _compute_sampled_logits(weights,
         if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
         Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
     name: A name for the operation (optional).
+    seed: random seed for candidate sampling. Default to None, which doesn't set
+        the op-level random seed for candidate sampling.
   Returns:
-    out_logits, out_labels: `Tensor` objects each with shape
+    out_logits: `Tensor` object with shape
         `[batch_size, num_true + num_sampled]`, for passing to either
         `nn.sigmoid_cross_entropy_with_logits` (NCE) or
         `nn.softmax_cross_entropy_with_logits` (sampled softmax).
+    out_labels: A Tensor object with the same shape as `out_logits`.
   """
 
   if isinstance(weights, variables.PartitionedVariable):
@@ -1008,7 +1039,8 @@ def _compute_sampled_logits(weights,
           num_true=num_true,
           num_sampled=num_sampled,
           unique=True,
-          range_max=num_classes)
+          range_max=num_classes,
+          seed=seed)
     # NOTE: pylint cannot tell that 'sampled_values' is a sequence
     # pylint: disable=unpacking-non-sequence
     sampled, true_expected_count, sampled_expected_count = (
@@ -1095,17 +1127,19 @@ def _compute_sampled_logits(weights,
 
     # Construct output logits and labels. The true labels/logits start at col 0.
     out_logits = array_ops.concat([true_logits, sampled_logits], 1)
-    # true_logits is a float tensor, ones_like(true_logits) is a float tensor
-    # of ones. We then divide by num_true to ensure the per-example labels sum
-    # to 1.0, i.e. form a proper probability distribution.
+
+    # true_logits is a float tensor, ones_like(true_logits) is a float
+    # tensor of ones. We then divide by num_true to ensure the per-example
+    # labels sum to 1.0, i.e. form a proper probability distribution.
     out_labels = array_ops.concat([
         array_ops.ones_like(true_logits) / num_true,
         array_ops.zeros_like(sampled_logits)
     ], 1)
 
-  return out_logits, out_labels
+    return out_logits, out_labels
 
 
+@tf_export("nn.nce_loss")
 def nce_loss(weights,
              biases,
              labels,
@@ -1214,6 +1248,7 @@ def nce_loss(weights,
   return _sum_rows(sampled_losses)
 
 
+@tf_export("nn.sampled_softmax_loss")
 def sampled_softmax_loss(weights,
                          biases,
                          labels,
@@ -1224,7 +1259,8 @@ def sampled_softmax_loss(weights,
                          sampled_values=None,
                          remove_accidental_hits=True,
                          partition_strategy="mod",
-                         name="sampled_softmax_loss"):
+                         name="sampled_softmax_loss",
+                         seed=None):
   """Computes and returns the sampled softmax training loss.
 
   This is a faster way to train a softmax classifier over a huge number of
@@ -1285,6 +1321,8 @@ def sampled_softmax_loss(weights,
         if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
         Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
     name: A name for the operation (optional).
+    seed: random seed for candidate sampling. Default to None, which doesn't set
+        the op-level random seed for candidate sampling.
 
   Returns:
     A `batch_size` 1-D tensor of per-example sampled softmax losses.
@@ -1302,7 +1340,8 @@ def sampled_softmax_loss(weights,
       subtract_log_q=True,
       remove_accidental_hits=remove_accidental_hits,
       partition_strategy=partition_strategy,
-      name=name)
+      name=name,
+      seed=seed)
   sampled_losses = nn_ops.softmax_cross_entropy_with_logits(
       labels=labels, logits=logits)
   # sampled_losses is a [batch_size] tensor.
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index ec7b9372cad9e51e83e93947ce46d66f15c339fd..47f48a7e168acd6788954e8e7117993d57c63304 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -39,7 +39,7 @@ from tensorflow.python.ops.gen_nn_ops import *
 # pylint: enable=wildcard-import
 
 from tensorflow.python.util import deprecation
-
+from tensorflow.python.util.tf_export import tf_export
 
 # Aliases for some automatically-generated names.
 local_response_normalization = gen_nn_ops.lrn
@@ -47,8 +47,13 @@ local_response_normalization = gen_nn_ops.lrn
 # pylint: disable=protected-access
 
 
-def _non_atrous_convolution(input, filter, padding, data_format=None,  # pylint: disable=redefined-builtin
-                            strides=None, name=None):
+def _non_atrous_convolution(
+    input,  # pylint: disable=redefined-builtin
+    filter,  # pylint: disable=redefined-builtin
+    padding,
+    data_format=None,  # pylint: disable=redefined-builtin
+    strides=None,
+    name=None):
   """Computes sums of N-D convolutions (actually cross correlation).
 
   It is required that 1 <= N <= 3.
@@ -89,16 +94,17 @@ def _non_atrous_convolution(input, filter, padding, data_format=None,  # pylint:
 
   """
   with ops.name_scope(name, "non_atrous_convolution", [input, filter]) as scope:
-    input = ops.convert_to_tensor(input, name="input")
+    input = ops.convert_to_tensor(input, name="input")  # pylint: disable=redefined-builtin
     input_shape = input.get_shape()
-    filter = ops.convert_to_tensor(filter, name="filter")
+    filter = ops.convert_to_tensor(filter, name="filter")  # pylint: disable=redefined-builtin
     filter_shape = filter.get_shape()
-    op = _NonAtrousConvolution(input_shape,
-                               filter_shape=filter_shape,
-                               padding=padding,
-                               data_format=data_format,
-                               strides=strides,
-                               name=scope)
+    op = _NonAtrousConvolution(
+        input_shape,
+        filter_shape=filter_shape,
+        padding=padding,
+        data_format=data_format,
+        strides=strides,
+        name=scope)
     return op(input, filter)
 
 
@@ -118,11 +124,14 @@ class _NonAtrousConvolution(object):
     name: see _non_atrous_convolution.
   """
 
-  def __init__(self,
-               input_shape,
-               filter_shape,  # pylint: disable=redefined-builtin
-               padding, data_format=None,
-               strides=None, name=None):
+  def __init__(
+      self,
+      input_shape,
+      filter_shape,  # pylint: disable=redefined-builtin
+      padding,
+      data_format=None,
+      strides=None,
+      name=None):
     filter_shape = filter_shape.with_rank(input_shape.ndims)
     self.padding = padding
     self.name = name
@@ -136,8 +145,8 @@ class _NonAtrousConvolution(object):
     if strides is None:
       strides = [1] * conv_dims
     elif len(strides) != conv_dims:
-      raise ValueError("len(strides)=%d, but should be %d" %
-                       (len(strides), conv_dims))
+      raise ValueError("len(strides)=%d, but should be %d" % (len(strides),
+                                                              conv_dims))
     if conv_dims == 1:
       # conv1d uses the 2-d data format names
       if data_format is None or data_format == "NWC":
@@ -176,8 +185,14 @@ class _NonAtrousConvolution(object):
   # those for gen_nn_ops.conv2d and gen_nn_ops.conv3d.
   # pylint: disable=redefined-builtin
   def _conv1d(self, input, filter, strides, padding, data_format, name):
-    return conv1d(value=input, filters=filter, stride=strides, padding=padding,
-                  data_format=data_format, name=name)
+    return conv1d(
+        value=input,
+        filters=filter,
+        stride=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
+
   # pylint: enable=redefined-builtin
 
   def __call__(self, inp, filter):  # pylint: disable=redefined-builtin
@@ -190,6 +205,7 @@ class _NonAtrousConvolution(object):
         name=self.name)
 
 
+@tf_export("nn.with_space_to_batch")
 def with_space_to_batch(
     input,  # pylint: disable=redefined-builtin
     dilation_rate,
@@ -332,19 +348,20 @@ def with_space_to_batch(
     ValueError: if `spatial_dims` are invalid.
 
   """
-  input = ops.convert_to_tensor(input, name="input")
+  input = ops.convert_to_tensor(input, name="input")  # pylint: disable=redefined-builtin
   input_shape = input.get_shape()
 
   def build_op(num_spatial_dims, padding):
     return lambda inp, _: op(inp, num_spatial_dims, padding)
 
-  new_op = _WithSpaceToBatch(input_shape,
-                             dilation_rate,
-                             padding,
-                             build_op,
-                             filter_shape=filter_shape,
-                             spatial_dims=spatial_dims,
-                             data_format=data_format)
+  new_op = _WithSpaceToBatch(
+      input_shape,
+      dilation_rate,
+      padding,
+      build_op,
+      filter_shape=filter_shape,
+      spatial_dims=spatial_dims,
+      data_format=data_format)
   return new_op(input, None)
 
 
@@ -375,9 +392,8 @@ class _WithSpaceToBatch(object):
                spatial_dims=None,
                data_format=None):
     """Helper class for _with_space_to_batch."""
-    dilation_rate = ops.convert_to_tensor(dilation_rate,
-                                          dtypes.int32,
-                                          name="dilation_rate")
+    dilation_rate = ops.convert_to_tensor(
+        dilation_rate, dtypes.int32, name="dilation_rate")
     try:
       rate_shape = dilation_rate.get_shape().with_rank(1)
     except ValueError:
@@ -437,9 +453,7 @@ class _WithSpaceToBatch(object):
       if const_filter_shape is not None:
         filter_shape = const_filter_shape
         self.base_paddings = _with_space_to_batch_base_paddings(
-            const_filter_shape,
-            num_spatial_dims,
-            rate_or_const_rate)
+            const_filter_shape, num_spatial_dims, rate_or_const_rate)
       else:
         self.num_spatial_dims = num_spatial_dims
         self.rate_or_const_rate = rate_or_const_rate
@@ -452,6 +466,7 @@ class _WithSpaceToBatch(object):
     self.input_shape = input_shape
     self.spatial_dims = spatial_dims
     self.dilation_rate = dilation_rate
+    self.data_format = data_format
     self.op = build_op(num_spatial_dims, "VALID")
     self.call = self._with_space_to_batch_call
 
@@ -475,9 +490,7 @@ class _WithSpaceToBatch(object):
       # shape was not fully defined.
       filter_shape = array_ops.shape(filter)
       base_paddings = _with_space_to_batch_base_paddings(
-          filter_shape,
-          self.num_spatial_dims,
-          self.rate_or_const_rate)
+          filter_shape, self.num_spatial_dims, self.rate_or_const_rate)
     paddings, crops = array_ops.required_space_to_batch_paddings(
         input_shape=input_spatial_shape,
         base_paddings=base_paddings,
@@ -488,14 +501,20 @@ class _WithSpaceToBatch(object):
     paddings = _with_space_to_batch_adjust(paddings, 0, spatial_dims)
     crops = _with_space_to_batch_adjust(crops, 0, spatial_dims)
     input_converted = array_ops.space_to_batch_nd(
-        input=inp,
-        block_shape=dilation_rate,
-        paddings=paddings)
+        input=inp, block_shape=dilation_rate, paddings=paddings)
 
     result = self.op(input_converted, filter)
 
     result_converted = array_ops.batch_to_space_nd(
         input=result, block_shape=dilation_rate, crops=crops)
+
+    # Recover channel information for output shape if channels are not last.
+    if self.data_format is not None and self.data_format.startswith("NC"):
+      if not result_converted.shape[1].value:
+        output_shape = result_converted.shape.as_list()
+        output_shape[1] = filter.shape[-1]
+        result_converted.set_shape(output_shape)
+
     return result_converted
 
   def __call__(self, inp, filter):  # pylint: disable=redefined-builtin
@@ -508,17 +527,17 @@ def _with_space_to_batch_base_paddings(filter_shape, num_spatial_dims,
   # Spatial dimensions of the filters and the upsampled filters in which we
   # introduce (rate - 1) zeros between consecutive filter values.
   filter_spatial_shape = filter_shape[:num_spatial_dims]
-  dilated_filter_spatial_shape = (filter_spatial_shape +
-                                  (filter_spatial_shape - 1) *
-                                  (rate_or_const_rate - 1))
+  dilated_filter_spatial_shape = (
+      filter_spatial_shape + (filter_spatial_shape - 1) *
+      (rate_or_const_rate - 1))
   pad_extra_shape = dilated_filter_spatial_shape - 1
 
   # When full_padding_shape is odd, we pad more at end, following the same
   # convention as conv2d.
   pad_extra_start = pad_extra_shape // 2
   pad_extra_end = pad_extra_shape - pad_extra_start
-  base_paddings = array_ops.stack([[pad_extra_start[i], pad_extra_end[i]]
-                                   for i in range(num_spatial_dims)])
+  base_paddings = array_ops.stack(
+      [[pad_extra_start[i], pad_extra_end[i]] for i in range(num_spatial_dims)])
   return base_paddings
 
 
@@ -612,8 +631,8 @@ def _get_strides_and_dilation_rate(num_spatial_dims, strides, dilation_rate):
   if strides is None:
     strides = [1] * num_spatial_dims
   elif len(strides) != num_spatial_dims:
-    raise ValueError("len(strides)=%d but should be %d" %
-                     (len(strides), num_spatial_dims))
+    raise ValueError("len(strides)=%d but should be %d" % (len(strides),
+                                                           num_spatial_dims))
   strides = np.array(strides, dtype=np.int32)
   if np.any(strides < 1):
     raise ValueError("all values of strides must be positive")
@@ -624,9 +643,15 @@ def _get_strides_and_dilation_rate(num_spatial_dims, strides, dilation_rate):
   return strides, dilation_rate
 
 
-def convolution(input, filter,  # pylint: disable=redefined-builtin
-                padding, strides=None, dilation_rate=None,
-                name=None, data_format=None):
+@tf_export("nn.convolution")
+def convolution(
+    input,  # pylint: disable=redefined-builtin
+    filter,  # pylint: disable=redefined-builtin
+    padding,
+    strides=None,
+    dilation_rate=None,
+    name=None,
+    data_format=None):
   # pylint: disable=line-too-long
   """Computes sums of N-D convolutions (actually cross-correlation).
 
@@ -741,16 +766,18 @@ def convolution(input, filter,  # pylint: disable=redefined-builtin
   """
   # pylint: enable=line-too-long
   with ops.name_scope(name, "convolution", [input, filter]) as name:
-    input = ops.convert_to_tensor(input, name="input")
+    input = ops.convert_to_tensor(input, name="input")  # pylint: disable=redefined-builtin
     input_shape = input.get_shape()
-    filter = ops.convert_to_tensor(filter, name="filter")
+    filter = ops.convert_to_tensor(filter, name="filter")  # pylint: disable=redefined-builtin
     filter_shape = filter.get_shape()
-    op = Convolution(input_shape,
-                     filter_shape,
-                     padding,
-                     strides=strides,
-                     dilation_rate=dilation_rate,
-                     name=name, data_format=data_format)
+    op = Convolution(
+        input_shape,
+        filter_shape,
+        padding,
+        strides=strides,
+        dilation_rate=dilation_rate,
+        name=name,
+        data_format=data_format)
     return op(input, filter)
 
 
@@ -774,8 +801,11 @@ class Convolution(object):
   def __init__(self,
                input_shape,
                filter_shape,
-               padding, strides=None, dilation_rate=None,
-               name=None, data_format=None):
+               padding,
+               strides=None,
+               dilation_rate=None,
+               name=None,
+               data_format=None):
     """Helper function for convolution."""
     num_total_dims = filter_shape.ndims
     if num_total_dims is None:
@@ -797,17 +827,17 @@ class Convolution(object):
 
     if data_format is None or not data_format.startswith("NC"):
       input_channels_dim = input_shape[num_spatial_dims + 1]
-      spatial_dims = range(1, num_spatial_dims+1)
+      spatial_dims = range(1, num_spatial_dims + 1)
     else:
       input_channels_dim = input_shape[1]
-      spatial_dims = range(2, num_spatial_dims+2)
+      spatial_dims = range(2, num_spatial_dims + 2)
 
-    if not input_channels_dim.is_compatible_with(filter_shape[
-        num_spatial_dims]):
+    if not input_channels_dim.is_compatible_with(
+        filter_shape[num_spatial_dims]):
       raise ValueError(
           "number of input channels does not match corresponding dimension of "
-          "filter, {} != {}".format(input_channels_dim, filter_shape[
-              num_spatial_dims]))
+          "filter, {} != {}".format(input_channels_dim,
+                                    filter_shape[num_spatial_dims]))
 
     strides, dilation_rate = _get_strides_and_dilation_rate(
         num_spatial_dims, strides, dilation_rate)
@@ -823,7 +853,8 @@ class Convolution(object):
         padding=padding,
         build_op=self._build_op,
         filter_shape=filter_shape,
-        spatial_dims=spatial_dims)
+        spatial_dims=spatial_dims,
+        data_format=data_format)
 
   def _build_op(self, _, padding):
     return _NonAtrousConvolution(
@@ -838,14 +869,16 @@ class Convolution(object):
     return self.conv_op(inp, filter)
 
 
-def pool(input,  # pylint: disable=redefined-builtin
-         window_shape,
-         pooling_type,
-         padding,
-         dilation_rate=None,
-         strides=None,
-         name=None,
-         data_format=None):
+@tf_export("nn.pool")
+def pool(
+    input,  # pylint: disable=redefined-builtin
+    window_shape,
+    pooling_type,
+    padding,
+    dilation_rate=None,
+    strides=None,
+    name=None,
+    data_format=None):
   # pylint: disable=line-too-long
   """Performs an N-D pooling operation.
 
@@ -927,9 +960,9 @@ def pool(input,  # pylint: disable=redefined-builtin
 
   """
   # pylint: enable=line-too-long
-  with ops.name_scope(name, "%s_pool" %
-                      (pooling_type.lower()), [input]) as scope:
-    input = ops.convert_to_tensor(input, name="input")
+  with ops.name_scope(name, "%s_pool" % (pooling_type.lower()),
+                      [input]) as scope:
+    input = ops.convert_to_tensor(input, name="input")  # pylint: disable=redefined-builtin
 
     num_spatial_dims = len(window_shape)
     if num_spatial_dims < 1 or num_spatial_dims > 3:
@@ -949,17 +982,18 @@ def pool(input,  # pylint: disable=redefined-builtin
           "strides > window_shape not supported due to inconsistency between "
           "CPU and GPU implementations")
 
-    pooling_ops = {("MAX", 1): max_pool,
-                   ("MAX", 2): max_pool,
-                   ("MAX", 3): max_pool3d,  # pylint: disable=undefined-variable
-                   ("AVG", 1): avg_pool,
-                   ("AVG", 2): avg_pool,
-                   ("AVG", 3): avg_pool3d,  # pylint: disable=undefined-variable
-                  }
+    pooling_ops = {
+        ("MAX", 1): max_pool,
+        ("MAX", 2): max_pool,
+        ("MAX", 3): max_pool3d,  # pylint: disable=undefined-variable
+        ("AVG", 1): avg_pool,
+        ("AVG", 2): avg_pool,
+        ("AVG", 3): avg_pool3d,  # pylint: disable=undefined-variable
+    }
     op_key = (pooling_type, num_spatial_dims)
     if op_key not in pooling_ops:
-      raise ValueError("%d-D %s pooling is not supported." %
-                       (op_key[1], op_key[0]))
+      raise ValueError("%d-D %s pooling is not supported." % (op_key[1],
+                                                              op_key[0]))
 
     if data_format is None or not data_format.startswith("NC"):
       adjusted_window_shape = [1] + list(window_shape) + [1]
@@ -986,12 +1020,13 @@ def pool(input,  # pylint: disable=redefined-builtin
       if num_spatial_dims == 1:
         converted_input = array_ops.expand_dims(converted_input,
                                                 spatial_dims[0])
-      result = pooling_ops[op_key](converted_input,
-                                   adjusted_window_shape,
-                                   adjusted_strides,
-                                   converted_padding,
-                                   name=scope,
-                                   **data_format_kwargs)
+      result = pooling_ops[op_key](
+          converted_input,
+          adjusted_window_shape,
+          adjusted_strides,
+          converted_padding,
+          name=scope,
+          **data_format_kwargs)
       if num_spatial_dims == 1:
         result = array_ops.squeeze(result, [spatial_dims[0]])
       return result
@@ -1005,8 +1040,11 @@ def pool(input,  # pylint: disable=redefined-builtin
         filter_shape=window_shape)
 
 
+@tf_export("nn.atrous_conv2d")
 def atrous_conv2d(value, filters, rate, padding, name=None):
-  """Atrous convolution (a.k.a. convolution with holes or dilated convolution).
+  """Atrous convolution (a.k.a.
+
+  convolution with holes or dilated convolution).
 
   This function is a simpler wrapper around the more general
   @{tf.nn.convolution}, and exists only for backwards compatibility. You can
@@ -1050,7 +1088,8 @@ def atrous_conv2d(value, filters, rate, padding, name=None):
   that effectively use atrous convolution in different ways are, among others,
   [OverFeat: Integrated Recognition, Localization and Detection using
   Convolutional Networks](http://arxiv.org/abs/1312.6229) and [Fast Image
-  Scanning with Deep Max-Pooling Convolutional Neural Networks](http://arxiv.org/abs/1302.1700).
+  Scanning with Deep Max-Pooling Convolutional Neural
+  Networks](http://arxiv.org/abs/1302.1700).
   Atrous convolution is also closely related to the so-called noble identities
   in multi-rate signal processing.
 
@@ -1140,13 +1179,15 @@ def atrous_conv2d(value, filters, rate, padding, name=None):
       name=name)
 
 
-def conv2d_transpose(value,
-                     filter,  # pylint: disable=redefined-builtin
-                     output_shape,
-                     strides,
-                     padding="SAME",
-                     data_format="NHWC",
-                     name=None):
+@tf_export("nn.conv2d_transpose")
+def conv2d_transpose(
+    value,
+    filter,  # pylint: disable=redefined-builtin
+    output_shape,
+    strides,
+    padding="SAME",
+    data_format="NHWC",
+    name=None):
   """The transpose of `conv2d`.
 
   This operation is sometimes called "deconvolution" after [Deconvolutional
@@ -1182,7 +1223,7 @@ def conv2d_transpose(value,
     if data_format not in ("NCHW", "NHWC"):
       raise ValueError("data_format has to be either NCHW or NHWC.")
     value = ops.convert_to_tensor(value, name="value")
-    filter = ops.convert_to_tensor(filter, name="filter")
+    filter = ops.convert_to_tensor(filter, name="filter")  # pylint: disable=redefined-builtin
     axis = 3 if data_format == "NHWC" else 1
     if not value.get_shape()[axis].is_compatible_with(filter.get_shape()[3]):
       raise ValueError("input channels does not match filter's input channels, "
@@ -1191,29 +1232,32 @@ def conv2d_transpose(value,
 
     output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
     if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(4)):
-      raise ValueError("output_shape must have shape (4,), got {}"
-                       .format(output_shape_.get_shape()))
+      raise ValueError("output_shape must have shape (4,), got {}".format(
+          output_shape_.get_shape()))
 
     if isinstance(output_shape, (list, np.ndarray)):
       # output_shape's shape should be == [4] if reached this point.
       if not filter.get_shape()[2].is_compatible_with(output_shape[axis]):
         raise ValueError(
             "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[axis], filter.get_shape()[2]))
+            "{} != {}".format(output_shape[axis],
+                              filter.get_shape()[2]))
 
     if padding != "VALID" and padding != "SAME":
       raise ValueError("padding must be either VALID or SAME:"
                        " {}".format(padding))
 
-    return gen_nn_ops.conv2d_backprop_input(input_sizes=output_shape_,
-                                            filter=filter,
-                                            out_backprop=value,
-                                            strides=strides,
-                                            padding=padding,
-                                            data_format=data_format,
-                                            name=name)
+    return gen_nn_ops.conv2d_backprop_input(
+        input_sizes=output_shape_,
+        filter=filter,
+        out_backprop=value,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
 
 
+@tf_export("nn.atrous_conv2d_transpose")
 def atrous_conv2d_transpose(value,
                             filters,
                             output_shape,
@@ -1263,29 +1307,32 @@ def atrous_conv2d_transpose(value,
     if not value.get_shape()[3].is_compatible_with(filters.get_shape()[3]):
       raise ValueError(
           "value's input channels does not match filters' input channels, "
-          "{} != {}".format(value.get_shape()[3], filters.get_shape()[3]))
+          "{} != {}".format(value.get_shape()[3],
+                            filters.get_shape()[3]))
     if rate < 1:
       raise ValueError("rate {} cannot be less than one".format(rate))
 
     if rate == 1:
-      return conv2d_transpose(value,
-                              filters,
-                              output_shape,
-                              strides=[1, 1, 1, 1],
-                              padding=padding,
-                              data_format="NHWC")
+      return conv2d_transpose(
+          value,
+          filters,
+          output_shape,
+          strides=[1, 1, 1, 1],
+          padding=padding,
+          data_format="NHWC")
 
     output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
     if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(4)):
-      raise ValueError("output_shape must have shape (4,), got {}"
-                       .format(output_shape_.get_shape()))
+      raise ValueError("output_shape must have shape (4,), got {}".format(
+          output_shape_.get_shape()))
 
     if isinstance(output_shape, (list, np.ndarray)):
       # output_shape's shape should be == [4] if reached this point.
       if not filters.get_shape()[2].is_compatible_with(output_shape[3]):
         raise ValueError(
             "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[3], filters.get_shape()[2]))
+            "{} != {}".format(output_shape[3],
+                              filters.get_shape()[2]))
 
     # We have two padding contributions. The first is used for converting "SAME"
     # to "VALID". The second is required so that the height and width of the
@@ -1334,38 +1381,39 @@ def atrous_conv2d_transpose(value,
     # component.
     space_to_batch_pad = [[0, pad_bottom_extra], [0, pad_right_extra]]
 
-    value = array_ops.space_to_batch(input=value,
-                                     paddings=space_to_batch_pad,
-                                     block_size=rate)
+    value = array_ops.space_to_batch(
+        input=value, paddings=space_to_batch_pad, block_size=rate)
 
-    input_sizes = [rate * rate * output_shape[0],
-                   (in_height + pad_bottom_extra) // rate,
-                   (in_width + pad_right_extra) // rate,
-                   output_shape[3]]
+    input_sizes = [
+        rate * rate * output_shape[0], (in_height + pad_bottom_extra) // rate,
+        (in_width + pad_right_extra) // rate, output_shape[3]
+    ]
 
-    value = gen_nn_ops.conv2d_backprop_input(input_sizes=input_sizes,
-                                             filter=filters,
-                                             out_backprop=value,
-                                             strides=[1, 1, 1, 1],
-                                             padding="VALID",
-                                             data_format="NHWC")
+    value = gen_nn_ops.conv2d_backprop_input(
+        input_sizes=input_sizes,
+        filter=filters,
+        out_backprop=value,
+        strides=[1, 1, 1, 1],
+        padding="VALID",
+        data_format="NHWC")
 
     # The crops argument to batch_to_space includes both padding components.
     batch_to_space_crop = [[pad_top, pad_bottom + pad_bottom_extra],
                            [pad_left, pad_right + pad_right_extra]]
 
-    return array_ops.batch_to_space(input=value,
-                                    crops=batch_to_space_crop,
-                                    block_size=rate)
+    return array_ops.batch_to_space(
+        input=value, crops=batch_to_space_crop, block_size=rate)
 
 
-def conv3d_transpose(value,
-                     filter,  # pylint: disable=redefined-builtin
-                     output_shape,
-                     strides,
-                     padding="SAME",
-                     data_format="NDHWC",
-                     name=None):
+@tf_export("nn.conv3d_transpose")
+def conv3d_transpose(
+    value,
+    filter,  # pylint: disable=redefined-builtin
+    output_shape,
+    strides,
+    padding="SAME",
+    data_format="NDHWC",
+    name=None):
   """The transpose of `conv3d`.
 
   This operation is sometimes called "deconvolution" after [Deconvolutional
@@ -1399,7 +1447,7 @@ def conv3d_transpose(value,
   with ops.name_scope(name, "conv3d_transpose",
                       [value, filter, output_shape]) as name:
     value = ops.convert_to_tensor(value, name="value")
-    filter = ops.convert_to_tensor(filter, name="filter")
+    filter = ops.convert_to_tensor(filter, name="filter")  # pylint: disable=redefined-builtin
     axis = 1 if data_format == "NCDHW" else 4
     if not value.get_shape()[axis].is_compatible_with(filter.get_shape()[4]):
       raise ValueError("input channels does not match filter's input channels, "
@@ -1408,30 +1456,33 @@ def conv3d_transpose(value,
 
     output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
     if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(5)):
-      raise ValueError("output_shape must have shape (5,), got {}"
-                       .format(output_shape_.get_shape()))
+      raise ValueError("output_shape must have shape (5,), got {}".format(
+          output_shape_.get_shape()))
 
     if isinstance(output_shape, (list, np.ndarray)):
       # output_shape's shape should be == [5] if reached this point.
       if not filter.get_shape()[3].is_compatible_with(output_shape[4]):
         raise ValueError(
             "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[4], filter.get_shape()[3]))
+            "{} != {}".format(output_shape[4],
+                              filter.get_shape()[3]))
 
     if padding != "VALID" and padding != "SAME":
       raise ValueError("padding must be either VALID or SAME:"
                        " {}".format(padding))
 
-    return gen_nn_ops.conv3d_backprop_input_v2(input_sizes=output_shape_,
-                                               filter=filter,
-                                               out_backprop=value,
-                                               strides=strides,
-                                               padding=padding,
-                                               data_format=data_format,
-                                               name=name)
+    return gen_nn_ops.conv3d_backprop_input_v2(
+        input_sizes=output_shape_,
+        filter=filter,
+        out_backprop=value,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
 
 
 # pylint: disable=protected-access
+@tf_export("nn.bias_add")
 def bias_add(value, bias, data_format=None, name=None):
   """Adds `bias` to `value`.
 
@@ -1486,31 +1537,38 @@ def bias_add_v1(value, bias, name=None):
     return gen_nn_ops._bias_add_v1(value, bias, name=name)
 
 
-def crelu(features, name=None):
+@tf_export("nn.crelu")
+def crelu(features, name=None, axis=-1):
   """Computes Concatenated ReLU.
 
   Concatenates a ReLU which selects only the positive part of the activation
   with a ReLU which selects only the *negative* part of the activation.
   Note that as a result this non-linearity doubles the depth of the activations.
-  Source: [Understanding and Improving Convolutional Neural Networks via Concatenated Rectified Linear Units. W. Shang, et al.](https://arxiv.org/abs/1603.05201)
+  Source: [Understanding and Improving Convolutional Neural Networks via
+  Concatenated Rectified Linear Units. W. Shang, et
+  al.](https://arxiv.org/abs/1603.05201)
 
   Args:
     features: A `Tensor` with type `float`, `double`, `int32`, `int64`, `uint8`,
       `int16`, or `int8`.
     name: A name for the operation (optional).
+    axis: The axis that the output values are concatenated along. Default is -1.
 
   Returns:
     A `Tensor` with the same type as `features`.
   """
   with ops.name_scope(name, "CRelu", [features]) as name:
     features = ops.convert_to_tensor(features, name="features")
-    c = array_ops.concat([features, -features], -1, name=name)
+    c = array_ops.concat([features, -features], axis, name=name)
     return gen_nn_ops.relu(c)
 
 
+@tf_export("nn.relu6")
 def relu6(features, name=None):
   """Computes Rectified Linear 6: `min(max(features, 0), 6)`.
-  Source: [Convolutional Deep Belief Networks on CIFAR-10. A. Krizhevsky](http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf)
+
+  Source: [Convolutional Deep Belief Networks on CIFAR-10. A.
+  Krizhevsky](http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf)
 
   Args:
     features: A `Tensor` with type `float`, `double`, `int32`, `int64`, `uint8`,
@@ -1525,6 +1583,7 @@ def relu6(features, name=None):
     return gen_nn_ops._relu6(features, name=name)
 
 
+@tf_export("nn.leaky_relu")
 def leaky_relu(features, alpha=0.2, name=None):
   """Compute the Leaky ReLU activation function.
 
@@ -1533,7 +1592,8 @@ def leaky_relu(features, alpha=0.2, name=None):
   http://web.stanford.edu/~awni/papers/relu_hybrid_icml2013_final.pdf
 
   Args:
-    features: A `Tensor` representing preactivation values.
+    features: A `Tensor` representing preactivation values. Must be one of
+      the following types: `float16`, `float32`, `float64`, `int32`, `int64`.
     alpha: Slope of the activation function at x < 0.
     name: A name for the operation (optional).
 
@@ -1542,7 +1602,9 @@ def leaky_relu(features, alpha=0.2, name=None):
   """
   with ops.name_scope(name, "LeakyRelu", [features, alpha]):
     features = ops.convert_to_tensor(features, name="features")
-    alpha = ops.convert_to_tensor(alpha, name="alpha")
+    if features.dtype.is_integer:
+      features = math_ops.to_float(features)
+    alpha = ops.convert_to_tensor(alpha, dtype=features.dtype, name="alpha")
     return math_ops.maximum(alpha * features, features)
 
 
@@ -1594,14 +1656,16 @@ def _softmax(logits, compute_op, dim=-1, name=None):
     InvalidArgumentError: if `logits` is empty or `dim` is beyond the last
       dimension of `logits`.
   """
+
   def _swap_axis(logits, dim_index, last_index, name=None):
     """Swaps logits's dim_index and last_index."""
-    return array_ops.transpose(logits,
-                               array_ops.concat([
-                                   math_ops.range(dim_index), [last_index],
-                                   math_ops.range(dim_index + 1, last_index),
-                                   [dim_index]
-                               ], 0), name=name)
+    return array_ops.transpose(
+        logits,
+        array_ops.concat([
+            math_ops.range(dim_index), [last_index],
+            math_ops.range(dim_index + 1, last_index), [dim_index]
+        ], 0),
+        name=name)
 
   logits = ops.convert_to_tensor(logits)
 
@@ -1626,7 +1690,8 @@ def _softmax(logits, compute_op, dim=-1, name=None):
 
   # Swap logits' dimension of dim and its last dimension.
   input_rank = array_ops.rank(logits)
-  logits = _swap_axis(logits, dim, math_ops.subtract(input_rank, 1))
+  dim_axis = dim % shape.ndims
+  logits = _swap_axis(logits, dim_axis, math_ops.subtract(input_rank, 1))
   shape_after_swap = array_ops.shape(logits)
 
   # Reshape logits into a matrix.
@@ -1637,7 +1702,8 @@ def _softmax(logits, compute_op, dim=-1, name=None):
 
   # Transform back the output tensor.
   output = array_ops.reshape(output, shape_after_swap)
-  output = _swap_axis(output, dim, math_ops.subtract(input_rank, 1), name=name)
+  output = _swap_axis(
+      output, dim_axis, math_ops.subtract(input_rank, 1), name=name)
 
   # Make shape inference work since reshape and transpose may erase its static
   # shape.
@@ -1646,6 +1712,7 @@ def _softmax(logits, compute_op, dim=-1, name=None):
   return output
 
 
+@tf_export("nn.softmax")
 @deprecation.deprecated_args(None, "dim is deprecated, use axis instead", "dim")
 def softmax(logits, axis=None, name=None, dim=None):
   """Computes softmax activations.
@@ -1675,6 +1742,7 @@ def softmax(logits, axis=None, name=None, dim=None):
   return _softmax(logits, gen_nn_ops._softmax, axis, name)
 
 
+@tf_export("nn.log_softmax")
 @deprecation.deprecated_args(None, "dim is deprecated, use axis instead", "dim")
 def log_softmax(logits, axis=None, name=None, dim=None):
   """Computes log softmax activations.
@@ -1713,9 +1781,13 @@ def _ensure_xent_args(name, sentinel, labels, logits):
     raise ValueError("Both labels and logits must be provided.")
 
 
-def softmax_cross_entropy_with_logits_v2(_sentinel=None,  # pylint: disable=invalid-name
-                                         labels=None, logits=None,
-                                         dim=-1, name=None):
+@tf_export("nn.softmax_cross_entropy_with_logits_v2")
+def softmax_cross_entropy_with_logits_v2(
+    _sentinel=None,  # pylint: disable=invalid-name
+    labels=None,
+    logits=None,
+    dim=-1,
+    name=None):
   """Computes softmax cross entropy between `logits` and `labels`.
 
   Measures the probability error in discrete classification tasks in which the
@@ -1757,19 +1829,19 @@ def softmax_cross_entropy_with_logits_v2(_sentinel=None,  # pylint: disable=inva
     A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
     softmax cross entropy loss.
   """
-  _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel,
-                    labels, logits)
+  _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels,
+                    logits)
 
   # TODO(pcmurray) Raise an error when the labels do not sum to 1. Note: This
   # could break users who call this with bad labels, but disregard the bad
   # results.
 
-  with ops.name_scope(
-      name, "softmax_cross_entropy_with_logits", [logits, labels]) as name:
+  with ops.name_scope(name, "softmax_cross_entropy_with_logits",
+                      [logits, labels]) as name:
     logits = ops.convert_to_tensor(logits, name="logits")
     labels = ops.convert_to_tensor(labels, name="labels")
-    precise_logits = math_ops.cast(logits, dtypes.float32) if (
-        logits.dtype == dtypes.float16) else logits
+    precise_logits = math_ops.cast(
+        logits, dtypes.float32) if (logits.dtype == dtypes.float16) else logits
     # labels and logits must be of the same type
     labels = math_ops.cast(labels, precise_logits.dtype)
     input_rank = array_ops.rank(precise_logits)
@@ -1778,13 +1850,14 @@ def softmax_cross_entropy_with_logits_v2(_sentinel=None,  # pylint: disable=inva
 
     # Move the dim to the end if dim is not the last dimension.
     if dim is not -1:
+
       def _move_dim_to_end(tensor, dim_index, rank):
-        return array_ops.transpose(tensor,
-                                   array_ops.concat([
-                                       math_ops.range(dim_index),
-                                       math_ops.range(dim_index + 1, rank),
-                                       [dim_index]
-                                   ], 0))
+        return array_ops.transpose(
+            tensor,
+            array_ops.concat([
+                math_ops.range(dim_index),
+                math_ops.range(dim_index + 1, rank), [dim_index]
+            ], 0))
 
       precise_logits = _move_dim_to_end(precise_logits, dim, input_rank)
       labels = _move_dim_to_end(labels, dim, input_rank)
@@ -1827,10 +1900,14 @@ See tf.nn.softmax_cross_entropy_with_logits_v2.
 """
 
 
+@tf_export("nn.softmax_cross_entropy_with_logits")
 @deprecation.deprecated(date=None, instructions=_XENT_DEPRECATION)
-def softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid-name
-                                      labels=None, logits=None,
-                                      dim=-1, name=None):
+def softmax_cross_entropy_with_logits(
+    _sentinel=None,  # pylint: disable=invalid-name
+    labels=None,
+    logits=None,
+    dim=-1,
+    name=None):
   """Computes softmax cross entropy between `logits` and `labels`.
 
   Measures the probability error in discrete classification tasks in which the
@@ -1872,20 +1949,23 @@ def softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid
     A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
     softmax cross entropy loss.
   """
-  _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel,
-                    labels, logits)
+  _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels,
+                    logits)
 
-  with ops.name_scope(
-      name, "softmax_cross_entropy_with_logits_sg", [logits, labels]) as name:
+  with ops.name_scope(name, "softmax_cross_entropy_with_logits_sg",
+                      [logits, labels]) as name:
     labels = array_ops.stop_gradient(labels, name="labels_stop_gradient")
 
   return softmax_cross_entropy_with_logits_v2(
       labels=labels, logits=logits, dim=dim, name=name)
 
 
-def sparse_softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid-name
-                                             labels=None, logits=None,
-                                             name=None):
+@tf_export("nn.sparse_softmax_cross_entropy_with_logits")
+def sparse_softmax_cross_entropy_with_logits(
+    _sentinel=None,  # pylint: disable=invalid-name
+    labels=None,
+    logits=None,
+    name=None):
   """Computes sparse softmax cross entropy between `logits` and `labels`.
 
   Measures the probability error in discrete classification tasks in which the
@@ -1941,15 +2021,15 @@ def sparse_softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=
                       [labels, logits]):
     labels = ops.convert_to_tensor(labels)
     logits = ops.convert_to_tensor(logits)
-    precise_logits = math_ops.cast(logits, dtypes.float32) if (
-        dtypes.as_dtype(logits.dtype) == dtypes.float16) else logits
+    precise_logits = math_ops.cast(logits, dtypes.float32) if (dtypes.as_dtype(
+        logits.dtype) == dtypes.float16) else logits
 
     # Store label shape for result later.
     labels_static_shape = labels.get_shape()
     labels_shape = array_ops.shape(labels)
     if logits.get_shape().ndims is not None and logits.get_shape().ndims == 0:
-      raise ValueError("Logits cannot be scalars - received shape %s." %
-                       logits.get_shape())
+      raise ValueError(
+          "Logits cannot be scalars - received shape %s." % logits.get_shape())
     if logits.get_shape().ndims is not None and (
         labels_static_shape.ndims is not None and
         labels_static_shape.ndims != logits.get_shape().ndims - 1):
@@ -1981,6 +2061,7 @@ def sparse_softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=
       return cost
 
 
+@tf_export("nn.avg_pool")
 def avg_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
   """Performs the average pooling on the input.
 
@@ -2005,14 +2086,16 @@ def avg_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
   """
   with ops.name_scope(name, "AvgPool", [value]) as name:
     value = ops.convert_to_tensor(value, name="input")
-    return gen_nn_ops._avg_pool(value,
-                                ksize=ksize,
-                                strides=strides,
-                                padding=padding,
-                                data_format=data_format,
-                                name=name)
+    return gen_nn_ops._avg_pool(
+        value,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
 
 
+@tf_export("nn.max_pool")
 def max_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
   """Performs the max pooling on the input.
 
@@ -2046,8 +2129,8 @@ def _calc_conv_flops(graph, node):
   """Calculates the compute resources needed for Conv2D."""
   input_shape = graph_util.tensor_shape_from_node_def_name(graph, node.input[0])
   input_shape.assert_is_fully_defined()
-  filter_shape = graph_util.tensor_shape_from_node_def_name(graph,
-                                                            node.input[1])
+  filter_shape = graph_util.tensor_shape_from_node_def_name(
+      graph, node.input[1])
   filter_shape.assert_is_fully_defined()
   output_shape = graph_util.tensor_shape_from_node_def_name(graph, node.name)
   output_shape.assert_is_fully_defined()
@@ -2055,8 +2138,9 @@ def _calc_conv_flops(graph, node):
   filter_width = int(filter_shape[1])
   filter_in_depth = int(filter_shape[2])
   output_count = np.prod(output_shape.as_list())
-  return ops.OpStats("flops", (output_count * filter_in_depth * filter_height *
-                               filter_width * 2))
+  return ops.OpStats(
+      "flops",
+      (output_count * filter_in_depth * filter_height * filter_width * 2))
 
 
 @ops.RegisterStatistics("DepthwiseConv2dNative", "flops")
@@ -2064,8 +2148,8 @@ def _calc_depthwise_conv_flops(graph, node):
   """Calculates the compute resources needed for DepthwiseConv2dNative."""
   input_shape = graph_util.tensor_shape_from_node_def_name(graph, node.input[0])
   input_shape.assert_is_fully_defined()
-  filter_shape = graph_util.tensor_shape_from_node_def_name(graph,
-                                                            node.input[1])
+  filter_shape = graph_util.tensor_shape_from_node_def_name(
+      graph, node.input[1])
   filter_shape.assert_is_fully_defined()
   output_shape = graph_util.tensor_shape_from_node_def_name(graph, node.name)
   output_shape.assert_is_fully_defined()
@@ -2084,6 +2168,7 @@ def _calc_bias_add_flops(graph, node):
   return ops.OpStats("flops", input_count)
 
 
+@tf_export("nn.xw_plus_b")
 def xw_plus_b(x, weights, biases, name=None):  # pylint: disable=invalid-name
   """Computes matmul(x, weights) + biases.
 
@@ -2130,6 +2215,7 @@ def xw_plus_b_v1(x, weights, biases, name=None):  # pylint: disable=invalid-name
     return bias_add_v1(mm, biases, name=name)
 
 
+@tf_export("nn.dropout")
 def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):  # pylint: disable=invalid-name
   """Computes dropout.
 
@@ -2171,9 +2257,8 @@ def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):  # pylint: di
     if isinstance(keep_prob, numbers.Real) and not 0 < keep_prob <= 1:
       raise ValueError("keep_prob must be a scalar tensor or a float in the "
                        "range (0, 1], got %g" % keep_prob)
-    keep_prob = ops.convert_to_tensor(keep_prob,
-                                      dtype=x.dtype,
-                                      name="keep_prob")
+    keep_prob = ops.convert_to_tensor(
+        keep_prob, dtype=x.dtype, name="keep_prob")
     keep_prob.get_shape().assert_is_compatible_with(tensor_shape.scalar())
 
     # Do nothing if we know keep_prob == 1
@@ -2183,9 +2268,8 @@ def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):  # pylint: di
     noise_shape = noise_shape if noise_shape is not None else array_ops.shape(x)
     # uniform [keep_prob, 1.0 + keep_prob)
     random_tensor = keep_prob
-    random_tensor += random_ops.random_uniform(noise_shape,
-                                               seed=seed,
-                                               dtype=x.dtype)
+    random_tensor += random_ops.random_uniform(
+        noise_shape, seed=seed, dtype=x.dtype)
     # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob)
     binary_tensor = math_ops.floor(random_tensor)
     ret = math_ops.div(x, keep_prob) * binary_tensor
@@ -2194,7 +2278,8 @@ def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):  # pylint: di
     return ret
 
 
-def top_k(input, k=1, sorted=True, name=None):
+@tf_export("nn.top_k")
+def top_k(input, k=1, sorted=True, name=None):  # pylint: disable=redefined-builtin
   """Finds values and indices of the `k` largest entries for the last dimension.
 
   If the input is a vector (rank=1), finds the `k` largest entries in the vector
@@ -2223,7 +2308,7 @@ def top_k(input, k=1, sorted=True, name=None):
   return gen_nn_ops._top_kv2(input, k=k, sorted=sorted, name=name)
 
 
-def nth_element(input, n, reverse=False, name=None):
+def nth_element(input, n, reverse=False, name=None):  # pylint: disable=redefined-builtin
   r"""Finds values of the `n`-th order statistic for the last dmension.
 
   If the input is a vector (rank-1), finds the entries which is the nth-smallest
@@ -2251,16 +2336,31 @@ def nth_element(input, n, reverse=False, name=None):
   return gen_nn_ops.nth_element(input, n, reverse=reverse, name=name)
 
 
-def conv1d(value, filters, stride, padding,
-           use_cudnn_on_gpu=None, data_format=None,
+@tf_export("nn.conv1d")
+@deprecation.deprecated_arg_values(
+    None,
+    "`NCHW` for data_format is deprecated, use `NCW` instead",
+    warn_once=True,
+    data_format="NCHW")
+@deprecation.deprecated_arg_values(
+    None,
+    "`NHWC` for data_format is deprecated, use `NWC` instead",
+    warn_once=True,
+    data_format="NHWC")
+def conv1d(value,
+           filters,
+           stride,
+           padding,
+           use_cudnn_on_gpu=None,
+           data_format=None,
            name=None):
   r"""Computes a 1-D convolution given 3-D input and filter tensors.
 
   Given an input tensor of shape
     [batch, in_width, in_channels]
-  if data_format is "NHWC", or
+  if data_format is "NWC", or
     [batch, in_channels, in_width]
-  if data_format is "NCHW",
+  if data_format is "NCW",
   and a filter / kernel tensor of shape
   [filter_width, in_channels, out_channels], this op reshapes
   the arguments to pass them to conv2d to perform the equivalent
@@ -2279,15 +2379,15 @@ def conv1d(value, filters, stride, padding,
   returned to the caller.
 
   Args:
-    value: A 3D `Tensor`.  Must be of type `float32` or `float64`.
+    value: A 3D `Tensor`.  Must be of type `float16` or `float32`.
     filters: A 3D `Tensor`.  Must have the same type as `input`.
     stride: An `integer`.  The number of entries by which
       the filter is moved right at each step.
     padding: 'SAME' or 'VALID'
     use_cudnn_on_gpu: An optional `bool`.  Defaults to `True`.
-    data_format: An optional `string` from `"NHWC", "NCHW"`.  Defaults
-      to `"NHWC"`, the data is stored in the order of
-      [batch, in_width, in_channels].  The `"NCHW"` format stores
+    data_format: An optional `string` from `"NWC", "NCW"`.  Defaults
+      to `"NWC"`, the data is stored in the order of
+      [batch, in_width, in_channels].  The `"NCW"` format stores
       data as [batch, in_channels, in_width].
     name: A name for the operation (optional).
 
@@ -2299,20 +2399,25 @@ def conv1d(value, filters, stride, padding,
   """
   with ops.name_scope(name, "conv1d", [value, filters]) as name:
     # Reshape the input tensor to [batch, 1, in_width, in_channels]
-    if data_format is None or data_format == "NHWC":
+    if data_format is None or data_format == "NHWC" or data_format == "NWC":
       data_format = "NHWC"
       spatial_start_dim = 1
       strides = [1, 1, stride, 1]
-    elif data_format == "NCHW":
+    elif data_format == "NCHW" or data_format == "NCW":
+      data_format = "NCHW"
       spatial_start_dim = 2
       strides = [1, 1, 1, stride]
     else:
-      raise ValueError("data_format must be \"NHWC\" or \"NCHW\".")
+      raise ValueError("data_format must be \"NWC\" or \"NCW\".")
     value = array_ops.expand_dims(value, spatial_start_dim)
     filters = array_ops.expand_dims(filters, 0)
-    result = gen_nn_ops.conv2d(value, filters, strides, padding,
-                               use_cudnn_on_gpu=use_cudnn_on_gpu,
-                               data_format=data_format)
+    result = gen_nn_ops.conv2d(
+        value,
+        filters,
+        strides,
+        padding,
+        use_cudnn_on_gpu=use_cudnn_on_gpu,
+        data_format=data_format)
     return array_ops.squeeze(result, [spatial_start_dim])
 
 
@@ -2400,7 +2505,7 @@ def conv1d_transpose(
       spatial_start_dim = 2
       strides = [1, 1, 1, stride]
     value = array_ops.expand_dims(value, spatial_start_dim)
-    filter = array_ops.expand_dims(filter, 0)
+    filter = array_ops.expand_dims(filter, 0)  # pylint: disable=redefined-builtin
 
     result = gen_nn_ops.conv2d_backprop_input(
         input_sizes=output_shape_,
@@ -2418,8 +2523,8 @@ def _calc_dilation2d_flops(graph, node):
   """Calculates the compute resources needed for Dilation2D."""
   input_shape = graph_util.tensor_shape_from_node_def_name(graph, node.input[0])
   input_shape.assert_is_fully_defined()
-  filter_shape = graph_util.tensor_shape_from_node_def_name(graph,
-                                                            node.input[1])
+  filter_shape = graph_util.tensor_shape_from_node_def_name(
+      graph, node.input[1])
   filter_shape.assert_is_fully_defined()
   output_shape = graph_util.tensor_shape_from_node_def_name(graph, node.name)
   output_shape.assert_is_fully_defined()
@@ -2429,6 +2534,7 @@ def _calc_dilation2d_flops(graph, node):
   return ops.OpStats("flops", (output_count * filter_height * filter_width * 2))
 
 
+@tf_export("nn.erosion2d")
 def erosion2d(value, kernel, strides, rates, padding, name=None):
   """Computes the grayscale erosion of 4-D `value` and 3-D `kernel` tensors.
 
@@ -2478,14 +2584,16 @@ def erosion2d(value, kernel, strides, rates, padding, name=None):
   with ops.name_scope(name, "erosion2d", [value, kernel]) as name:
     # Reduce erosion to dilation by duality.
     return math_ops.negative(
-        gen_nn_ops.dilation2d(input=math_ops.negative(value),
-                              filter=array_ops.reverse_v2(kernel, [0, 1]),
-                              strides=strides,
-                              rates=rates,
-                              padding=padding,
-                              name=name))
+        gen_nn_ops.dilation2d(
+            input=math_ops.negative(value),
+            filter=array_ops.reverse_v2(kernel, [0, 1]),
+            strides=strides,
+            rates=rates,
+            padding=padding,
+            name=name))
 
 
+@tf_export("nn.in_top_k")
 def in_top_k(predictions, targets, k, name=None):
   r"""Says whether the targets are in the top `K` predictions.
 
@@ -2515,5 +2623,5 @@ def in_top_k(predictions, targets, k, name=None):
   Returns:
     A `Tensor` of type `bool`. Computed Precision at `k` as a `bool Tensor`.
   """
-  with ops.name_scope(name, 'in_top_k'):
+  with ops.name_scope(name, "in_top_k"):
     return gen_nn_ops._in_top_kv2(predictions, targets, k, name=name)
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index 3b918e4f74c64868ef74f7e26295941c6f2801ff..5a45bdc1e5e1d38a34176ed9443fcd1713f38e1e 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -90,6 +90,18 @@ class SoftmaxTest(test_lib.TestCase):
     self.assertAllClose(y_tf_np, y_np, eps)
     self.assertAllClose(y_tf_last_dim_np, y_np, eps)
 
+  def testSoftmaxAxes(self):
+    arr = np.linspace(0., 1, 12).reshape(3, 4)
+    x_neg_axis = nn_ops.softmax(arr, axis=-2)
+    y_pos_axis = nn_ops.softmax(arr, axis=0)
+    z_gt_axis = nn_ops.softmax(arr, axis=4)
+    x_neg_axis_tf = self.evaluate(x_neg_axis)
+    y_pos_axis_tf = self.evaluate(y_pos_axis)
+    z_gt_axis_tf = self.evaluate(z_gt_axis)
+    eps = 1e-3
+    self.assertAllClose(x_neg_axis_tf, y_pos_axis_tf, eps)
+    self.assertAllClose(y_pos_axis_tf, z_gt_axis_tf, eps)
+
   def testGradient(self):
     x_shape = [5, 10]
     x_np = np.random.randn(*x_shape).astype(np.float64)
@@ -119,8 +131,7 @@ class LogPoissonLossTest(test_lib.TestCase):
     y_np = self._log_poisson_loss(x_np, z_np, compute_full_loss=False)
     y_np_stirling = self._log_poisson_loss(x_np, z_np, compute_full_loss=True)
     y_tf = nn_impl.log_poisson_loss(z_np, x_np, compute_full_loss=False)
-    y_tf_stirling = nn_impl.log_poisson_loss(
-        z_np, x_np, compute_full_loss=True)
+    y_tf_stirling = nn_impl.log_poisson_loss(z_np, x_np, compute_full_loss=True)
     y_tf_np = self.evaluate(y_tf)
     y_tf_np_stirling = self.evaluate(y_tf_stirling)
     eps = 1e-3
@@ -164,6 +175,18 @@ class LogSoftmaxTest(test_lib.TestCase):
     eps = 1e-3
     self.assertAllClose(y_tf_np, y_np, eps)
 
+  def testLogSoftmaxAxes(self):
+    arr = np.linspace(0., 1, 12).reshape(3, 4)
+    x_neg_axis = nn_ops.log_softmax(arr, axis=-2)
+    y_pos_axis = nn_ops.log_softmax(arr, axis=0)
+    z_gt_axis = nn_ops.log_softmax(arr, axis=4)
+    x_neg_axis_tf = self.evaluate(x_neg_axis)
+    y_pos_axis_tf = self.evaluate(y_pos_axis)
+    z_gt_axis_tf = self.evaluate(z_gt_axis)
+    eps = 1e-3
+    self.assertAllClose(x_neg_axis_tf, y_pos_axis_tf, eps)
+    self.assertAllClose(y_pos_axis_tf, z_gt_axis_tf, eps)
+
   def testGradient(self):
     x_shape = [5, 10]
     x_np = np.random.randn(*x_shape).astype(np.float64)
@@ -749,8 +772,8 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     def _SoftmaxCrossEntropyWithLogits(logits, targets):
       # logits, targets: float arrays of the same shape.
       assert logits.shape == targets.shape
-      stable_exp_logits = np.exp(logits - np.amax(
-          logits, axis=1, keepdims=True))
+      stable_exp_logits = np.exp(
+          logits - np.amax(logits, axis=1, keepdims=True))
       pred = stable_exp_logits / np.sum(stable_exp_logits, 1, keepdims=True)
       return -np.sum(targets * np.log(pred + 1.0e-20), axis=1)
 
@@ -841,8 +864,8 @@ class LeakyReluTest(test_lib.TestCase):
     batch_size = 3
     height, width = 4, 4
     np.random.seed(1)  # Make it reproducible.
-    inputs = np.random.uniform(
-        size=(batch_size, height, width, 3)).astype(np.float32)
+    inputs = np.random.uniform(size=(batch_size, height, width, 3)).astype(
+        np.float32)
     inputs = constant_op.constant(inputs)
 
     outputs = nn_ops.leaky_relu(inputs)
@@ -854,11 +877,14 @@ class LeakyReluTest(test_lib.TestCase):
     self.assertAllClose(inputs, outputs)
 
   def testValues(self):
-    np_values = np.array([-1.0, 0.0, 0.5, 1.0, 2.0], dtype=np.float32)
-    outputs = nn_ops.leaky_relu(constant_op.constant(np_values))
-    with self.test_session() as sess:
-      outputs = sess.run(outputs)
-    self.assertAllClose(outputs, [-0.2, 0.0, 0.5, 1.0, 2.0])
+    for dtype in [np.int32, np.int64, np.float16, np.float32, np.float64]:
+      np_values = np.array([-2, -1, 0, 1, 2], dtype=dtype)
+      outputs = nn_ops.leaky_relu(constant_op.constant(np_values))
+      with self.test_session() as sess:
+        outputs = sess.run(outputs)
+      tol = 2e-3 if dtype == np.float16 else 1e-6
+      self.assertAllClose(
+          outputs, [-0.4, -0.2, 0.0, 1.0, 2.0], rtol=tol, atol=tol)
 
 
 class SwishTest(test_lib.TestCase):
@@ -889,7 +915,10 @@ class SwishTest(test_lib.TestCase):
 
 class MomentsTest(test_lib.TestCase):
 
-  def doOutputTest(self, input_shape, moments_axes, tol=1e-4,
+  def doOutputTest(self,
+                   input_shape,
+                   moments_axes,
+                   tol=1e-4,
                    check_gradients=False):
     for mu in [0.0, 1.0, 1e3]:
       for sigma in [1.0, 0.1]:
@@ -953,5 +982,64 @@ class MomentsTest(test_lib.TestCase):
     self.doOutputTest((10, 10, 10, 30), (1, 2, 3))
 
 
+class DataFormatDimMapTest(test_lib.TestCase):
+
+  def _test(self, x_val, y_val_expected):
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_dim_map(x)
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, y_val_expected)
+
+  def test(self):
+    self._test(0, 0)
+    self._test(1, 2)
+    self._test(2, 3)
+    self._test(3, 1)
+    self._test(-1, 1)
+    self._test(-2, 3)
+    self._test(-3, 2)
+    self._test(-4, 0)
+    self._test([1, 3], [2, 1])
+    self._test([1, 3, -2], [2, 1, 3])
+    self._test([1, -3, -2], [2, 2, 3])
+    self._test([[1, -3], [1, -1]], [[2, 2], [2, 1]])
+
+
+class DataFormatVectorPermuteTest(test_lib.TestCase):
+
+  def testNHWCToNCHW(self):
+    x_val = [7, 4, 9, 3]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_vec_permute(x)
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, [7, 3, 4, 9])
+
+  def testNCHWToNHWC(self):
+    x_val = [7, 4, 9, 3]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_vec_permute(x, src_format="NCHW", dst_format="NHWC")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, [7, 9, 3, 4])
+
+  def testNHWCToNCHW2D(self):
+    x_val = [[7, 4], [9, 3], [4, 5], [5, 1]]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_vec_permute(x)
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, [[7, 4], [5, 1], [9, 3], [4, 5]])
+
+  def testNCHWToNHWC2D(self):
+    x_val = [[7, 4], [9, 3], [4, 5], [5, 1]]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_vec_permute(x, src_format="NCHW", dst_format="NHWC")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, [[7, 4], [4, 5], [5, 1], [9, 3]])
+
+
 if __name__ == "__main__":
   test_lib.main()
diff --git a/tensorflow/python/ops/numerics.py b/tensorflow/python/ops/numerics.py
index f3558fda9ca940f2567a451bb6ad14feb10aaba7..b4ce1cbf25346412e2781a520b7e2cdcf720bcd5 100644
--- a/tensorflow/python/ops/numerics.py
+++ b/tensorflow/python/ops/numerics.py
@@ -24,8 +24,10 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("verify_tensor_all_finite")
 def verify_tensor_all_finite(t, msg, name=None):
   """Assert that the tensor does not contain any NaN's or Inf's.
 
@@ -45,6 +47,7 @@ def verify_tensor_all_finite(t, msg, name=None):
   return out
 
 
+@tf_export("add_check_numerics_ops")
 def add_check_numerics_ops():
   """Connect a `check_numerics` to every floating point tensor.
 
diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py
index 14aef01dec337d7f59c799695871c8a169c3d63a..b0315ceee268be8ac1813dae5a262a7d9496e154 100644
--- a/tensorflow/python/ops/parsing_ops.py
+++ b/tensorflow/python/ops/parsing_ops.py
@@ -36,6 +36,7 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops.gen_parsing_ops import *
 # pylint: enable=wildcard-import,undefined-variable
 from tensorflow.python.platform import tf_logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 ops.NotDifferentiable("DecodeRaw")
@@ -44,6 +45,7 @@ ops.NotDifferentiable("SerializeTensor")
 ops.NotDifferentiable("StringToNumber")
 
 
+@tf_export("VarLenFeature")
 class VarLenFeature(collections.namedtuple("VarLenFeature", ["dtype"])):
   """Configuration for parsing a variable-length input feature.
 
@@ -53,6 +55,7 @@ class VarLenFeature(collections.namedtuple("VarLenFeature", ["dtype"])):
   pass
 
 
+@tf_export("SparseFeature")
 class SparseFeature(
     collections.namedtuple(
         "SparseFeature",
@@ -127,6 +130,7 @@ class SparseFeature(
         cls, index_key, value_key, dtype, size, already_sorted)
 
 
+@tf_export("FixedLenFeature")
 class FixedLenFeature(collections.namedtuple(
     "FixedLenFeature", ["shape", "dtype", "default_value"])):
   """Configuration for parsing a fixed-length input feature.
@@ -146,6 +150,7 @@ class FixedLenFeature(collections.namedtuple(
         cls, shape, dtype, default_value)
 
 
+@tf_export("FixedLenSequenceFeature")
 class FixedLenSequenceFeature(collections.namedtuple(
     "FixedLenSequenceFeature",
     ["shape", "dtype", "allow_missing", "default_value"])):
@@ -355,6 +360,7 @@ def _prepend_none_dimension(features):
     return features
 
 
+@tf_export("parse_example")
 def parse_example(serialized, features, name=None, example_names=None):
   # pylint: disable=line-too-long
   """Parses `Example` protos into a `dict` of tensors.
@@ -385,7 +391,7 @@ def parse_example(serialized, features, name=None, example_names=None):
   A `values[i]` comes from a position `k` in the feature of an example at batch
   entry `batch`. This positional information is recorded in `indices[i]` as
   `[batch, index_0, index_1, ...]` where `index_j` is the `k-th` value of
-  the feature in the example at with key `SparseFeature.index_key[j].
+  the feature in the example at with key `SparseFeature.index_key[j]`.
   In other words, we split the indices (except the first index indicating the
   batch entry) of a `SparseTensor` by dimension into different features of the
   `Example`. Due to its complexity a `VarLenFeature` should be preferred over a
@@ -715,6 +721,7 @@ def _parse_example_raw(serialized,
     return dict(zip(sparse_keys + dense_keys, sparse_tensors + dense_values))
 
 
+@tf_export("parse_single_example")
 def parse_single_example(serialized, features, name=None, example_names=None):
   """Parses a single `Example` proto.
 
@@ -749,6 +756,8 @@ def parse_single_example(serialized, features, name=None, example_names=None):
   """
   if not features:
     raise ValueError("Missing features.")
+  if example_names is None:
+    return parse_single_example_v2(serialized, features, name)
   features = _prepend_none_dimension(features)
   (sparse_keys, sparse_types, dense_keys, dense_types, dense_defaults,
    dense_shapes) = _features_to_raw_params(
@@ -848,6 +857,7 @@ def _parse_single_example_raw(serialized,
     return outputs
 
 
+@tf_export("parse_single_sequence_example")
 def parse_single_sequence_example(
     serialized, context_features=None, sequence_features=None,
     example_name=None, name=None):
@@ -1169,6 +1179,7 @@ def _parse_single_sequence_example_raw(serialized,
 
 
 # Swap `name` and `na_value` for backward compatibility.
+@tf_export("decode_csv")
 def decode_csv(records, record_defaults, field_delim=",",
                use_quote_delim=True, name=None, na_value=""):
   # pylint: disable=protected-access
@@ -1205,3 +1216,199 @@ def decode_csv(records, record_defaults, field_delim=",",
       field_delim=field_delim, use_quote_delim=use_quote_delim,
       na_value=na_value, name=name)
   # pylint: enable=protected-access
+
+
+# TODO(b/70890287): Combine the implementation of this op and
+# `parse_single_example()` after 1/10/2018.
+def parse_single_example_v2(serialized, features, name=None):
+  # pylint: disable=line-too-long
+  """Parses an `Example` proto into a `dict` of tensors.
+
+  Parses a serialized
+  [`Example`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
+  proto given in `serialized`.
+
+  This op parses serialized examples into a dictionary mapping keys to `Tensor`
+  and `SparseTensor` objects. `features` is a dict from keys to `VarLenFeature`,
+  `SparseFeature`, and `FixedLenFeature` objects. Each `VarLenFeature`
+  and `SparseFeature` is mapped to a `SparseTensor`, and each
+  `FixedLenFeature` is mapped to a `Tensor`.
+
+  Each `VarLenFeature` maps to a `SparseTensor` of the specified type
+  representing a ragged matrix. Its indices are `[index]` where
+  `index` is the value's index in the list of values associated with
+  that feature and example.
+
+  Each `SparseFeature` maps to a `SparseTensor` of the specified type
+  representing a Tensor of `dense_shape` `SparseFeature.size`.
+  Its `values` come from the feature in the examples with key `value_key`.
+  A `values[i]` comes from a position `k` in the feature of an example at batch
+  entry `batch`. This positional information is recorded in `indices[i]` as
+  `[batch, index_0, index_1, ...]` where `index_j` is the `k-th` value of
+  the feature in the example at with key `SparseFeature.index_key[j]`.
+  In other words, we split the indices (except the first index indicating the
+  batch entry) of a `SparseTensor` by dimension into different features of the
+  `Example`. Due to its complexity a `VarLenFeature` should be preferred over a
+  `SparseFeature` whenever possible.
+
+  Each `FixedLenFeature` `df` maps to a `Tensor` of the specified type (or
+  `tf.float32` if not specified) and shape `df.shape`.
+
+  `FixedLenFeature` entries with a `default_value` are optional. With no default
+  value, we will fail if that `Feature` is missing from any example in
+  `serialized`.
+
+  Each `FixedLenSequenceFeature` `df` maps to a `Tensor` of the specified type
+  (or `tf.float32` if not specified) and shape `(None,) + df.shape`.
+
+  Args:
+    serialized: A scalar (0-D Tensor) string, a serialized `Example` proto.
+    features: A `dict` mapping feature keys to `FixedLenFeature`,
+      `VarLenFeature`, and `SparseFeature` values.
+    name: A name for this operation (optional).
+
+  Returns:
+    A `dict` mapping feature keys to `Tensor` and `SparseTensor` values.
+
+  Raises:
+    ValueError: if any feature is invalid.
+  """
+  if not features:
+    raise ValueError("Missing: features was %s." % features)
+  features = _prepend_none_dimension(features)
+  (sparse_keys, sparse_types, dense_keys, dense_types,
+   dense_defaults, dense_shapes) = _features_to_raw_params(
+       features,
+       [VarLenFeature, SparseFeature, FixedLenFeature, FixedLenSequenceFeature])
+  outputs = _parse_single_example_v2_raw(serialized, sparse_keys, sparse_types,
+                                         dense_keys, dense_types,
+                                         dense_defaults, dense_shapes, name)
+  return _construct_sparse_tensors_for_sparse_features(features, outputs)
+
+
+def _parse_single_example_v2_raw(serialized, sparse_keys, sparse_types,
+                                 dense_keys, dense_types, dense_defaults,
+                                 dense_shapes, name):
+  """Parses `Example` protos.
+
+  Args:
+    serialized: A scalar (0-D Tensor) string, containing a binary
+      serialized `Example` proto.
+    sparse_keys: A list of string keys in the examples' features.
+      The results for these keys will be returned as `SparseTensor` objects.
+    sparse_types: A list of `DTypes` of the same length as `sparse_keys`.
+      Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
+      and `tf.string` (`BytesList`) are supported.
+    dense_keys: A list of string keys in the examples' features.
+      The results for these keys will be returned as `Tensor`s
+    dense_types: A list of DTypes of the same length as `dense_keys`.
+      Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
+      and `tf.string` (`BytesList`) are supported.
+    dense_defaults: A dict mapping string keys to `Tensor`s.
+      The keys of the dict must match the dense_keys of the feature.
+    dense_shapes: A list of tuples with the same length as `dense_keys`.
+      The shape of the data for each dense feature referenced by `dense_keys`.
+      Required for any input tensors identified by `dense_keys`.  Must be
+      either fully defined, or may contain an unknown first dimension.
+      An unknown first dimension means the feature is treated as having
+      a variable number of blocks, and the output shape along this dimension
+      is considered unknown at graph build time.  Padding is applied for
+      minibatch elements smaller than the maximum number of blocks for the
+      given feature along this dimension.
+    name: A name for this operation (optional).
+
+  Returns:
+    A `dict` mapping keys to `Tensor`s and `SparseTensor`s.
+
+  Raises:
+    ValueError: If sparse and dense key sets intersect, or input lengths do not
+      match up.
+  """
+  with ops.name_scope(name, "ParseSingleExample", [serialized]):
+    serialized = ops.convert_to_tensor(serialized, name="serialized")
+    dense_defaults = collections.OrderedDict(
+    ) if dense_defaults is None else dense_defaults
+    sparse_keys = [] if sparse_keys is None else sparse_keys
+    sparse_types = [] if sparse_types is None else sparse_types
+    dense_keys = [] if dense_keys is None else dense_keys
+    dense_types = [] if dense_types is None else dense_types
+    dense_shapes = ([[]] * len(dense_keys)
+                    if dense_shapes is None else dense_shapes)
+
+    num_dense = len(dense_keys)
+    num_sparse = len(sparse_keys)
+
+    if len(dense_shapes) != num_dense:
+      raise ValueError("len(dense_shapes) != len(dense_keys): %d vs. %d" %
+                       (len(dense_shapes), num_dense))
+    if len(dense_types) != num_dense:
+      raise ValueError("len(dense_types) != len(num_dense): %d vs. %d" %
+                       (len(dense_types), num_dense))
+    if len(sparse_types) != num_sparse:
+      raise ValueError("len(sparse_types) != len(sparse_keys): %d vs. %d" %
+                       (len(sparse_types), num_sparse))
+    if num_dense + num_sparse == 0:
+      raise ValueError("Must provide at least one sparse key or dense key")
+    if not set(dense_keys).isdisjoint(set(sparse_keys)):
+      raise ValueError(
+          "Dense and sparse keys must not intersect; intersection: %s" %
+          set(dense_keys).intersection(set(sparse_keys)))
+
+    # Convert dense_shapes to TensorShape object.
+    dense_shapes = [tensor_shape.as_shape(shape) for shape in dense_shapes]
+
+    dense_defaults_vec = []
+    for i, key in enumerate(dense_keys):
+      default_value = dense_defaults.get(key)
+      dense_shape = dense_shapes[i]
+      if (dense_shape.ndims is not None and dense_shape.ndims > 0 and
+          dense_shape[0].value is None):
+        # Variable stride dense shape, the default value should be a
+        # scalar padding value
+        if default_value is None:
+          default_value = ops.convert_to_tensor(
+              "" if dense_types[i] == dtypes.string else 0,
+              dtype=dense_types[i])
+        else:
+          # Reshape to a scalar to ensure user gets an error if they
+          # provide a tensor that's not intended to be a padding value
+          # (0 or 2+ elements).
+          key_name = "padding_" + re.sub("[^A-Za-z0-9_.\\-/]", "_", key)
+          default_value = ops.convert_to_tensor(
+              default_value, dtype=dense_types[i], name=key_name)
+          default_value = array_ops.reshape(default_value, [])
+      else:
+        if default_value is None:
+          default_value = constant_op.constant([], dtype=dense_types[i])
+        elif not isinstance(default_value, ops.Tensor):
+          key_name = "key_" + re.sub("[^A-Za-z0-9_.\\-/]", "_", key)
+          default_value = ops.convert_to_tensor(
+              default_value, dtype=dense_types[i], name=key_name)
+          default_value = array_ops.reshape(default_value, dense_shape)
+
+      dense_defaults_vec.append(default_value)
+
+    # Finally, convert dense_shapes to TensorShapeProto
+    dense_shapes = [shape.as_proto() for shape in dense_shapes]
+
+    # pylint: disable=protected-access
+    outputs = gen_parsing_ops.parse_single_example(
+        serialized=serialized,
+        dense_defaults=dense_defaults_vec,
+        num_sparse=len(sparse_keys),
+        sparse_keys=sparse_keys,
+        sparse_types=sparse_types,
+        dense_keys=dense_keys,
+        dense_shapes=dense_shapes,
+        name=name)
+    # pylint: enable=protected-access
+
+    (sparse_indices, sparse_values, sparse_shapes, dense_values) = outputs
+
+    sparse_tensors = [
+        sparse_tensor.SparseTensor(ix, val, shape)
+        for (ix, val,
+             shape) in zip(sparse_indices, sparse_values, sparse_shapes)
+    ]
+
+    return dict(zip(sparse_keys + dense_keys, sparse_tensors + dense_values))
diff --git a/tensorflow/python/ops/partitioned_variables.py b/tensorflow/python/ops/partitioned_variables.py
index edcc0e1d7c11f86ace8e42221308270ccc188b5d..174cabdf8027e75c780441d06a98a24c19be0cfc 100644
--- a/tensorflow/python/ops/partitioned_variables.py
+++ b/tensorflow/python/ops/partitioned_variables.py
@@ -58,6 +58,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 __all__ = [
     "create_partitioned_variables",
@@ -67,6 +68,7 @@ __all__ = [
 ]
 
 
+@tf_export("variable_axis_size_partitioner")
 def variable_axis_size_partitioner(
     max_shard_bytes, axis=0, bytes_per_string_element=16, max_shards=None):
   """Get a partitioner for VariableScope to keep shards below `max_shard_bytes`.
@@ -151,6 +153,7 @@ def variable_axis_size_partitioner(
   return _partitioner
 
 
+@tf_export("min_max_variable_partitioner")
 def min_max_variable_partitioner(max_partitions=1, axis=0,
                                  min_slice_size=256 << 10,
                                  bytes_per_string_element=16):
@@ -214,6 +217,7 @@ def min_max_variable_partitioner(max_partitions=1, axis=0,
   return _partitioner
 
 
+@tf_export("fixed_size_partitioner")
 def fixed_size_partitioner(num_shards, axis=0):
   """Partitioner to specify a fixed number of shards along given axis.
 
@@ -232,6 +236,7 @@ def fixed_size_partitioner(num_shards, axis=0):
   return _partitioner
 
 
+@tf_export("create_partitioned_variables")
 def create_partitioned_variables(
     shape, slicing, initializer, dtype=dtypes.float32,
     trainable=True, collections=None, name=None, reuse=None):
diff --git a/tensorflow/python/ops/quantized_conv_ops_test.py b/tensorflow/python/ops/quantized_conv_ops_test.py
index 5ea47ea40e5f283736523d5d09a63176b5e8fbbf..4ac2a8f634bb201c9aaecb74432f2e6e78ee840f 100644
--- a/tensorflow/python/ops/quantized_conv_ops_test.py
+++ b/tensorflow/python/ops/quantized_conv_ops_test.py
@@ -93,7 +93,8 @@ class Conv2DTest(test.TestCase):
     quantized_range = ((quantized_max - quantized_min) * range_adjust)
     range_scale = (quantized_range / number_of_steps)
     lowest_quantized = -(1 << (number_of_bits - 1))
-    result = np.array([(quantized_min + ((x - lowest_quantized) * range_scale))
+    result = np.array([(quantized_min +
+                        ((float(x) - lowest_quantized) * range_scale))
                        for x in quantized.flatten()])
     return result
 
diff --git a/tensorflow/python/ops/quantized_ops_test.py b/tensorflow/python/ops/quantized_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d590bc4be6d520cbaa000d9802b84cbfbf8e90b9
--- /dev/null
+++ b/tensorflow/python/ops/quantized_ops_test.py
@@ -0,0 +1,60 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for quantized operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class QuantizedOpsTest(test.TestCase):
+
+  def __init__(self, method_name="runTest"):
+    super(QuantizedOpsTest, self).__init__(method_name)
+
+  def testQuantizeOp(self):
+    expected_output = [1, 1, 2, 127, 255, 255]
+    with self.test_session(use_gpu=False) as sess:
+      x = constant_op.constant(
+          [1.0, 1.25, 1.75, 127.0, 255.0, 500.0],
+          shape=[6],
+          dtype=dtypes.float32)
+      x_min = 0.0
+      x_max = 255.0
+      op = array_ops.quantize(x, x_min, x_max, dtypes.quint8, mode="MIN_FIRST")
+      value = sess.run(op)
+      self.assertArrayNear(expected_output, value.output, 0.1)
+
+  def testDequantizeOp(self):
+    expected_output = [1.0, 2.0, 4.0, 8.0, 16.0, 255.0]
+    inp = np.array([1, 2, 4, 8, 16, 255]).astype(np.uint8)
+    with self.test_session(use_gpu=False) as sess:
+      x = constant_op.constant(inp, shape=[6], dtype=dtypes.quint8)
+      x_min = 0.0
+      x_max = 255.0
+      op = array_ops.dequantize(x, x_min, x_max, mode="MIN_FIRST")
+      value = sess.run(op)
+      self.assertArrayNear(expected_output, value, 0.1)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py
index 52fb5131cfa6d7152ef49d7c10d5f57292d81f24..2c86358d21b1c280b8d7ade625fd4b7a44c5de26 100644
--- a/tensorflow/python/ops/random_ops.py
+++ b/tensorflow/python/ops/random_ops.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import math_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_random_ops import *
+from tensorflow.python.util.tf_export import tf_export
 
 # pylint: enable=wildcard-import
 
@@ -43,6 +44,7 @@ def _ShapeTensor(shape):
 
 
 # pylint: disable=protected-access
+@tf_export("random_normal")
 def random_normal(shape,
                   mean=0.0,
                   stddev=1.0,
@@ -135,6 +137,7 @@ def parameterized_truncated_normal(shape,
     return rnd
 
 
+@tf_export("truncated_normal")
 def truncated_normal(shape,
                      mean=0.0,
                      stddev=1.0,
@@ -152,7 +155,7 @@ def truncated_normal(shape,
     mean: A 0-D Tensor or Python value of type `dtype`. The mean of the
       truncated normal distribution.
     stddev: A 0-D Tensor or Python value of type `dtype`. The standard deviation
-      of the truncated normal distribution.
+      of the normal distribution, before truncation.
     dtype: The type of the output.
     seed: A Python integer. Used to create a random seed for the distribution.
       See
@@ -179,6 +182,7 @@ ops.NotDifferentiable("ParameterizedTruncatedNormal")
 ops.NotDifferentiable("TruncatedNormal")
 
 
+@tf_export("random_uniform")
 def random_uniform(shape,
                    minval=0,
                    maxval=None,
@@ -220,8 +224,8 @@ def random_uniform(shape,
     ValueError: If `dtype` is integral and `maxval` is not specified.
   """
   dtype = dtypes.as_dtype(dtype)
-  if dtype not in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
-                   dtypes.int64):
+  if dtype not in (dtypes.float16, dtypes.bfloat16, dtypes.float32,
+                   dtypes.float64, dtypes.int32, dtypes.int64):
     raise ValueError("Invalid dtype %r" % dtype)
   if maxval is None:
     if dtype.is_integer:
@@ -244,6 +248,7 @@ def random_uniform(shape,
 ops.NotDifferentiable("RandomUniform")
 
 
+@tf_export("random_shuffle")
 def random_shuffle(value, seed=None, name=None):
   """Randomly shuffles a tensor along its first dimension.
 
@@ -274,6 +279,7 @@ def random_shuffle(value, seed=None, name=None):
       value, seed=seed1, seed2=seed2, name=name)
 
 
+@tf_export("random_crop")
 def random_crop(value, size, seed=None, name=None):
   """Randomly crops a tensor to a given size.
 
@@ -316,7 +322,8 @@ def random_crop(value, size, seed=None, name=None):
     return array_ops.slice(value, offset, size, name=name)
 
 
-def multinomial(logits, num_samples, seed=None, name=None):
+@tf_export("multinomial")
+def multinomial(logits, num_samples, seed=None, name=None, output_dtype=None):
   """Draws samples from a multinomial distribution.
 
   Example:
@@ -336,6 +343,7 @@ def multinomial(logits, num_samples, seed=None, name=None):
       @{tf.set_random_seed}
       for behavior.
     name: Optional name for the operation.
+    output_dtype: integer type to use for the output. Defaults to int64.
 
   Returns:
     The drawn samples of shape `[batch_size, num_samples]`.
@@ -344,12 +352,13 @@ def multinomial(logits, num_samples, seed=None, name=None):
     logits = ops.convert_to_tensor(logits, name="logits")
     seed1, seed2 = random_seed.get_seed(seed)
     return gen_random_ops.multinomial(
-        logits, num_samples, seed=seed1, seed2=seed2)
+        logits, num_samples, seed=seed1, seed2=seed2, output_dtype=output_dtype)
 
 
 ops.NotDifferentiable("Multinomial")
 
 
+@tf_export("random_gamma")
 def random_gamma(shape,
                  alpha,
                  beta=None,
@@ -417,6 +426,7 @@ def random_gamma(shape,
 ops.NotDifferentiable("RandomGamma")
 
 
+@tf_export("random_poisson")
 def random_poisson(lam, shape, dtype=dtypes.float32, seed=None, name=None):
   """Draws `shape` samples from each of the given Poisson distribution(s).
 
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 343e38f960e00933293dcb1d2df8371355b16d08..8d6a4df6bf9f16e5a8ff74da5fd35b152d342d00 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -184,11 +184,12 @@ class ResourceVariable(variables.Variable):
     assign = a.assign(2.0)
     with tf.control_dependencies([assign]):
       b = a.read_value()
-
-    other_assign = a.assign(3.0)
+    with tf.control_dependencies([b]):
+      other_assign = a.assign(3.0)
     with tf.control_dependencies([other_assign]):
-      tf.Print(b, [b]).run()  # Will print 2.0 because the value was read before
-                              # other_assign ran.
+      # Will print 2.0 because the value was read before other_assign ran. If
+      # `a` was a tf.Variable instead, 2.0 or 3.0 could be printed.
+      tf.Print(b, [b]).eval()
   ```
 
   To enforce these consistency properties tf.ResourceVariable might make more
@@ -275,10 +276,6 @@ class ResourceVariable(variables.Variable):
           dtype=dtype,
           constraint=constraint)
 
-  # LINT.IfChange
-  # _VariableFromResource inherits from ResourceVariable but
-  # doesn't call the constructor, so changes here might need to be reflected
-  # there.
   # pylint: disable=unused-argument
   def _init_from_args(self,
                       initial_value=None,
@@ -351,11 +348,11 @@ class ResourceVariable(variables.Variable):
     if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections:
       collections = list(collections) + [ops.GraphKeys.TRAINABLE_VARIABLES]
     self._save_slice_info = None
-    self._in_graph_mode = context.in_graph_mode()
-    # Save the graph's container prefix for error checking. Reading the value of
-    # the ResourceVariable from another Graph in Eager mode is an error.
-    self._container_prefix = ops.get_default_graph()._container_prefix  # pylint: disable=protected-access
-    with ops.control_dependencies(None):
+    # Store the graph key so optimizers know how to only retrieve variables from
+    # this graph.
+    self._graph_key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
+    with ops.init_scope():
+      self._in_graph_mode = context.in_graph_mode()
       with ops.name_scope(name, "Variable", []
                           if init_from_fn else [initial_value]) as name:
         # pylint: disable=protected-access
@@ -437,7 +434,8 @@ class ResourceVariable(variables.Variable):
               self._initializer_op = (
                   gen_resource_variable_ops.assign_variable_op(
                       self._handle,
-                      self._build_initializer_expr(initial_value),
+                      self._try_guard_against_uninitialized_dependencies(
+                          initial_value),
                       name=n))
           with ops.name_scope("Read"), ops.colocate_with(self._handle):
             # Manually assign reads to the handle's device to avoid log
@@ -505,6 +503,14 @@ class ResourceVariable(variables.Variable):
     self._initializer_op = g.as_graph_element(
         ops.prepend_name_scope(
             variable_def.initializer_name, import_scope=import_scope))
+    # Check whether initial_value_name exists for backwards compatibility.
+    if (hasattr(variable_def, "initial_value_name") and
+        variable_def.initial_value_name):
+      self._initial_value = g.as_graph_element(
+          ops.prepend_name_scope(variable_def.initial_value_name,
+                                 import_scope=import_scope))
+    else:
+      self._initial_value = None
     if variable_def.snapshot_name:
       self._cached_value = g.as_graph_element(
           ops.prepend_name_scope(
@@ -521,7 +527,6 @@ class ResourceVariable(variables.Variable):
     self._dtype = dtypes.as_dtype(self._handle.op.get_attr("dtype"))
     self._graph_element = self.value()
     self._constraint = None
-  # LINT.ThenChange(//tensorflow/python/eager/graph_callable.py)
 
   def __nonzero__(self):
     return self.__bool__()
@@ -665,15 +670,7 @@ class ResourceVariable(variables.Variable):
 
     Returns:
      the read operation.
-    Raises:
-      ValueError: if the ResourceVariable was created in another isolation
-        environment or graph.
     """
-    if (not self._in_graph_mode and
-        self._container_prefix != ops.get_default_graph()._container_prefix):  # pylint: disable=protected-access
-      raise ValueError(
-          "Attempted to read a variable from another isolation environment"
-          " or Graph")
     with ops.name_scope("Read"):
       # Ensure we read the variable in the same device as the handle.
       with ops.device(self._handle_device):
@@ -710,6 +707,12 @@ class ResourceVariable(variables.Variable):
       var_def = variable_pb2.VariableDef()
       var_def.variable_name = ops.strip_name_scope(self.handle.name,
                                                    export_scope)
+      if self._initial_value is not None:
+        # This is inside an if-statement for backwards compatibility, since
+        # self._initial_value might be None for variables constructed from old
+        # protos.
+        var_def.initial_value_name = ops.strip_name_scope(
+            self._initial_value.name, export_scope)
       var_def.initializer_name = ops.strip_name_scope(self.initializer.name,
                                                       export_scope)
       if self._cached_value is not None:
@@ -780,38 +783,38 @@ class ResourceVariable(variables.Variable):
     # TODO(apassos): this here and below is not atomic. Consider making it
     # atomic if there's a way to do so without a performance cost for those who
     # don't need it.
-    with ops.control_dependencies([
-        gen_resource_variable_ops.assign_sub_variable_op(
-            self.handle,
-            ops.convert_to_tensor(delta, dtype=self.dtype),
-            name=name)
-    ]):
-      return self.read_value()
+    return self._lazy_read(gen_resource_variable_ops.assign_sub_variable_op(
+        self.handle,
+        ops.convert_to_tensor(delta, dtype=self.dtype),
+        name=name))
 
   def assign_add(self, delta, use_locking=None, name=None):
-    with ops.control_dependencies([
-        gen_resource_variable_ops.assign_add_variable_op(
-            self.handle,
-            ops.convert_to_tensor(delta, dtype=self.dtype),
-            name=name)
-    ]):
-      return self.read_value()
+    return self._lazy_read(gen_resource_variable_ops.assign_add_variable_op(
+        self.handle,
+        ops.convert_to_tensor(delta, dtype=self.dtype),
+        name=name))
+
+  def _lazy_read(self, op):
+    if hasattr(self, "_trainable") and self._trainable:
+      tape.watch_variable(self)
+    return _UnreadVariable(
+        self._handle, self.dtype, self._handle_device, self._shape,
+        self._in_graph_mode,
+        self._handle_deleter if not self._in_graph_mode else None, op)
 
   def assign(self, value, use_locking=None, name=None):
     value_tensor = ops.convert_to_tensor(value, dtype=self.dtype)
     self._shape.assert_is_compatible_with(value_tensor.shape)
-    with ops.control_dependencies([
+    return self._lazy_read(
         gen_resource_variable_ops.assign_variable_op(
             self.handle,
             value_tensor,
-            name=name)
-    ]):
-      return self.read_value()
+            name=name))
 
   def _strided_slice_assign(self, begin, end, strides, value, name, begin_mask,
                             end_mask, ellipsis_mask, new_axis_mask,
                             shrink_axis_mask):
-    with ops.control_dependencies([
+    return self._lazy_read(
         gen_array_ops.resource_strided_slice_assign(
             ref=self.handle,
             begin=begin,
@@ -823,9 +826,12 @@ class ResourceVariable(variables.Variable):
             end_mask=end_mask,
             ellipsis_mask=ellipsis_mask,
             new_axis_mask=new_axis_mask,
-            shrink_axis_mask=shrink_axis_mask)
-    ]):
-      return self.value()
+            shrink_axis_mask=shrink_axis_mask))
+
+  def __int__(self):
+    if self.dtype != dtypes.int32 and self.dtype != dtypes.int64:
+      raise TypeError("Non-integer variable can't be converted to integer.")
+    return int(self.value().numpy())
 
   def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False):
     del name
@@ -838,31 +844,106 @@ class ResourceVariable(variables.Variable):
       return self.value()
 
   def __iadd__(self, unused_other):
-    raise RuntimeError("Variable += value not supported.")
+    raise RuntimeError("Variable += value not supported. Use "
+                       "variable.assign_add(value) to modify the variable "
+                       "value and variable = variable + value to get a new "
+                       "Tensor object.")
 
   def __isub__(self, unused_other):
-    raise RuntimeError("Variable -= value not supported.")
+    raise RuntimeError("Variable -= value not supported. Use "
+                       "variable.assign_sub(value) to modify the variable "
+                       "value and variable = variable - value to get a new "
+                       "Tensor object.")
 
   def __imul__(self, unused_other):
-    raise RuntimeError("Variable *= value not supported.")
+    raise RuntimeError("Variable *= value not supported. Use "
+                       "variable.assign_mul(value) to modify the variable "
+                       "value and variable = variable * value to get a new "
+                       "Tensor object.")
 
   def __idiv__(self, unused_other):
-    raise RuntimeError("Variable /= value not supported.")
+    raise RuntimeError("Variable /= value not supported. Use "
+                       "variable.assign_div(value) to modify the variable "
+                       "value and variable = variable / value to get a new "
+                       "Tensor object.")
 
   def __itruediv__(self, unused_other):
-    raise RuntimeError("Variable /= value not supported.")
+    raise RuntimeError("Variable /= value not supported. Use "
+                       "variable.assign_div(value) to modify the variable "
+                       "value and variable = variable / value to get a new "
+                       "Tensor object.")
 
   def __irealdiv__(self, unused_other):
-    raise RuntimeError("Variable /= value not supported.")
+    raise RuntimeError("Variable /= value not supported. Use "
+                       "variable.assign_div(value) to modify the variable "
+                       "value and variable = variable / value to get a new "
+                       "Tensor object.")
 
   def __ipow__(self, unused_other):
-    raise RuntimeError("Variable **= value not supported.")
+    raise RuntimeError("Variable **= value not supported. Use "
+                       "value and variable = variable ** value to get a new "
+                       "Tensor object.")
 
 
 def _dense_var_to_tensor(var, dtype=None, name=None, as_ref=False):
   return var._dense_var_to_tensor(dtype=dtype, name=name, as_ref=as_ref)  # pylint: disable=protected-access
 
 
+class _UnreadVariable(ResourceVariable):
+  """Represents a future for a read of a variable.
+
+  Pretends to be the tensor if anyone looks.
+  """
+
+  def __init__(self, handle, dtype, handle_device,  # pylint: disable=super-init-not-called
+               shape, in_graph_mode, deleter, parent_op):
+    # We do not call super init on purpose.
+    self._trainable = False
+    self._save_slice_info = None
+    self._graph_key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
+    self._in_graph_mode = in_graph_mode
+    self._handle = handle
+    self._handle_device = handle_device
+    self._shape = shape
+    self._initial_value = None
+    if isinstance(self._handle, ops.EagerTensor):
+      self._handle_name = ""
+    else:
+      self._handle_name = self._handle.name
+    self._dtype = dtype
+    self._constraint = None
+    self._cached_value = None
+    self._is_initialized_op = None
+    self._initializer_op = None
+    self._parent_op = parent_op
+    if context.in_graph_mode():
+      self._graph_element = self.read_value()
+    else:
+      self._graph_element = None
+    self._handle_deleter = deleter
+
+  def value(self):
+    return self._read_variable_op()
+
+  def read_value(self):
+    return self._read_variable_op()
+
+  def _read_variable_op(self):
+    with ops.control_dependencies([self._parent_op]):
+      return gen_resource_variable_ops.read_variable_op(self._handle,
+                                                        self._dtype)
+
+  def set_shape(self, shape):
+    self._shape = shape
+
+  @property
+  def op(self):
+    """The op for this variable."""
+    return self._parent_op
+
+ops.register_tensor_conversion_function(_UnreadVariable, _dense_var_to_tensor)
+ops.register_dense_tensor_like_type(_UnreadVariable)
+
 # Register a conversion function which reads the value of the variable,
 # allowing instances of the class to be used as tensors.
 
@@ -887,26 +968,14 @@ def _ReadGrad(_, grad):
 def _GatherGrad(op, grad):
   """Gradient for gather op."""
   # Build appropriately shaped IndexedSlices
-  # Walk graph back until the original handle is found.
-  # TODO(apassos): more robust way of getting the shape.
-  # TODO(apassos): implement this for EAGER mode.
-  if context.in_eager_mode():
-    dense_shape = gen_resource_variable_ops.variable_shape(op.inputs[0])
-    return (ops.IndexedSlices(grad,
-                              op.inputs[1],
-                              dense_shape=dense_shape),
-            None)
   handle = op.inputs[0]
-  while handle.op.type != "VarHandleOp":
-    handle = handle.op.inputs[0]
-  params_shape = ops.convert_to_tensor(
-      tensor_shape.TensorShape(handle.op.get_attr("shape")))
   indices = op.inputs[1]
+  params_shape = gen_resource_variable_ops.variable_shape(handle)
   size = array_ops.expand_dims(array_ops.size(indices), 0)
   values_shape = array_ops.concat([size, params_shape[1:]], 0)
   values = array_ops.reshape(grad, values_shape)
   indices = array_ops.reshape(indices, size)
-  return [ops.IndexedSlices(values, indices, params_shape), None]
+  return (ops.IndexedSlices(values, indices, params_shape), None)
 
 
 def _to_proto_fn(v, export_scope=None):
@@ -946,3 +1015,9 @@ ops.register_proto_function(
     proto_type=variable_pb2.VariableDef,
     to_proto=_to_proto_fn,
     from_proto=_from_proto_fn)
+
+
+def is_resource_variable(var):
+  """"Returns True if `var` is to be considered a ResourceVariable."""
+  return isinstance(var, ResourceVariable) or hasattr(
+      var, "_should_act_as_resource_variable")
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index e30b19842f08d335ce7967b77dcb49578fb3fe85..aa8d4327d2f0e93768728744d5cce3fed385393f 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -40,6 +40,7 @@ from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 # pylint: disable=protected-access
@@ -82,8 +83,9 @@ def _best_effort_input_batch_size(flat_input):
   """Get static input batch size if available, with fallback to the dynamic one.
 
   Args:
-    flat_input: An iterable of time major input Tensors of shape [max_time,
-      batch_size, ...]. All inputs should have compatible batch sizes.
+    flat_input: An iterable of time major input Tensors of shape
+      `[max_time, batch_size, ...]`.
+    All inputs should have compatible batch sizes.
 
   Returns:
     The batch size in Python integer if available, or a scalar Tensor otherwise.
@@ -170,11 +172,11 @@ def _rnn_step(
   return (final_output, final_state)
 
   Args:
-    time: Python int, the current time step
-    sequence_length: int32 `Tensor` vector of size [batch_size]
-    min_sequence_length: int32 `Tensor` scalar, min of sequence_length
-    max_sequence_length: int32 `Tensor` scalar, max of sequence_length
-    zero_output: `Tensor` vector of shape [output_size]
+    time: int32 `Tensor` scalar.
+    sequence_length: int32 `Tensor` vector of size [batch_size].
+    min_sequence_length: int32 `Tensor` scalar, min of sequence_length.
+    max_sequence_length: int32 `Tensor` scalar, max of sequence_length.
+    zero_output: `Tensor` vector of shape [output_size].
     state: Either a single `Tensor` matrix of shape `[batch_size, state_size]`,
       or a list/tuple of such tensors.
     call_cell: lambda returning tuple of (new_output, new_state) where
@@ -201,6 +203,9 @@ def _rnn_step(
   flat_state = nest.flatten(state)
   flat_zero_output = nest.flatten(zero_output)
 
+  # Vector describing which batch entries are finished.
+  copy_cond = time >= sequence_length
+
   def _copy_one_through(output, new_output):
     # TensorArray and scalar get passed through.
     if isinstance(output, tensor_array_ops.TensorArray):
@@ -208,7 +213,6 @@ def _rnn_step(
     if output.shape.ndims == 0:
       return new_output
     # Otherwise propagate the old or the new value.
-    copy_cond = (time >= sequence_length)
     with ops.colocate_with(new_output):
       return array_ops.where(copy_cond, output, new_output)
 
@@ -320,6 +324,7 @@ def _reverse_seq(input_seq, lengths):
   return results
 
 
+@tf_export("nn.bidirectional_dynamic_rnn")
 def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
                               initial_state_fw=None, initial_state_bw=None,
                               dtype=None, parallel_iterations=None,
@@ -449,6 +454,7 @@ def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
   return (outputs, output_states)
 
 
+@tf_export("nn.dynamic_rnn")
 def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
                 dtype=None, parallel_iterations=None, swap_memory=False,
                 time_major=False, scope=None):
@@ -665,7 +671,7 @@ def _dynamic_rnn_loop(cell,
     final_outputs:
       A `Tensor` of shape `[time, batch_size, cell.output_size]`.  If
       `cell.output_size` is a (possibly nested) tuple of ints or `TensorShape`
-      objects, then this returns a (possibly nsted) tuple of Tensors matching
+      objects, then this returns a (possibly nested) tuple of Tensors matching
       the corresponding shapes.
     final_state:
       A `Tensor`, or possibly nested tuple of Tensors, matching in length
@@ -722,6 +728,8 @@ def _dynamic_rnn_loop(cell,
   if sequence_length is not None:
     min_sequence_length = math_ops.reduce_min(sequence_length)
     max_sequence_length = math_ops.reduce_max(sequence_length)
+  else:
+    max_sequence_length = time_steps
 
   time = array_ops.constant(0, dtype=dtypes.int32, name="time")
 
@@ -806,11 +814,21 @@ def _dynamic_rnn_loop(cell,
 
     return (time + 1, output_ta_t, new_state)
 
+  if in_graph_mode:
+    # Make sure that we run at least 1 step, if necessary, to ensure
+    # the TensorArrays pick up the dynamic shape.
+    loop_bound = math_ops.minimum(
+        time_steps, math_ops.maximum(1, max_sequence_length))
+  else:
+    # Using max_sequence_length isn't currently supported in the Eager branch.
+    loop_bound = time_steps
+
   _, output_final_ta, final_state = control_flow_ops.while_loop(
-      cond=lambda time, *_: time < time_steps,
+      cond=lambda time, *_: time < loop_bound,
       body=_time_step,
       loop_vars=(time, output_ta, state),
       parallel_iterations=parallel_iterations,
+      maximum_iterations=time_steps,
       swap_memory=swap_memory)
 
   # Unpack final output if not using output tuples.
@@ -832,6 +850,7 @@ def _dynamic_rnn_loop(cell,
   return (final_outputs, final_state)
 
 
+@tf_export("nn.raw_rnn")
 def raw_rnn(cell, loop_fn,
             parallel_iterations=None, swap_memory=False, scope=None):
   """Creates an `RNN` specified by RNNCell `cell` and loop function `loop_fn`.
@@ -1109,6 +1128,12 @@ def raw_rnn(cell, loop_fn,
       def _copy_some_through(current, candidate):
         """Copy some tensors through via array_ops.where."""
         def copy_fn(cur_i, cand_i):
+          # TensorArray and scalar get passed through.
+          if isinstance(cur_i, tensor_array_ops.TensorArray):
+            return cand_i
+          if cur_i.shape.ndims == 0:
+            return cand_i
+          # Otherwise propagate the old or the new value.
           with ops.colocate_with(cand_i):
             return array_ops.where(elements_finished, cur_i, cand_i)
         return nest.map_structure(copy_fn, current, candidate)
@@ -1139,6 +1164,7 @@ def raw_rnn(cell, loop_fn,
     return (emit_ta, final_state, final_loop_state)
 
 
+@tf_export("nn.static_rnn")
 def static_rnn(cell,
                inputs,
                initial_state=None,
@@ -1308,6 +1334,7 @@ def static_rnn(cell,
     return (outputs, state)
 
 
+@tf_export("nn.static_state_saving_rnn")
 def static_state_saving_rnn(cell,
                             inputs,
                             state_saver,
@@ -1392,6 +1419,7 @@ def static_state_saving_rnn(cell,
   return (outputs, state)
 
 
+@tf_export("nn.static_bidirectional_rnn")
 def static_bidirectional_rnn(cell_fw,
                              cell_bw,
                              inputs,
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index 8aaf77f1733fc0569ebcbc71373a204cfb3f2913..923348ea44e18a87e09fe1c0424f0323eb967e3d 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -47,6 +47,7 @@ from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 _BIAS_VARIABLE_NAME = "bias"
@@ -133,6 +134,7 @@ def _zero_state_tensors(state_size, batch_size, dtype):
   return nest.map_structure(get_state_shape, state_size)
 
 
+@tf_export("nn.rnn_cell.RNNCell")
 class RNNCell(base_layer.Layer):
   """Abstract object representing an RNN cell.
 
@@ -238,7 +240,8 @@ class RNNCell(base_layer.Layer):
     # Try to use the last cached zero_state. This is done to avoid recreating
     # zeros, especially when eager execution is enabled.
     state_size = self.state_size
-    if hasattr(self, "_last_zero_state"):
+    is_eager = context.in_eager_mode()
+    if is_eager and hasattr(self, "_last_zero_state"):
       (last_state_size, last_batch_size, last_dtype,
        last_output) = getattr(self, "_last_zero_state")
       if (last_batch_size == batch_size and
@@ -247,11 +250,12 @@ class RNNCell(base_layer.Layer):
         return last_output
     with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
       output = _zero_state_tensors(state_size, batch_size, dtype)
-    self._last_zero_state = (state_size, batch_size, dtype, output)
+    if is_eager:
+      self._last_zero_state = (state_size, batch_size, dtype, output)
     return output
 
 
-class _LayerRNNCell(RNNCell):
+class LayerRNNCell(RNNCell):
   """Subclass of RNNCells that act like proper `tf.Layer` objects.
 
   For backwards compatibility purposes, most `RNNCell` instances allow their
@@ -265,7 +269,7 @@ class _LayerRNNCell(RNNCell):
   `call` methods do not access Variables `tf.get_variable`.
   """
 
-  def __call__(self, inputs, state, scope=None):
+  def __call__(self, inputs, state, scope=None, *args, **kwargs):
     """Run this RNN cell on inputs, starting from the given state.
 
     Args:
@@ -274,8 +278,9 @@ class _LayerRNNCell(RNNCell):
         with shape `[batch_size, self.state_size]`.  Otherwise, if
         `self.state_size` is a tuple of integers, this should be a tuple
         with shapes `[batch_size, s] for s in self.state_size`.
-      scope: `VariableScope` for the created subgraph; if not provided,
-        defaults to standard `tf.layers.Layer` behavior.
+      scope: optional cell scope.
+      *args: Additional positional arguments.
+      **kwargs: Additional keyword arguments.
 
     Returns:
       A pair containing:
@@ -287,10 +292,12 @@ class _LayerRNNCell(RNNCell):
     # Bypass RNNCell's variable capturing semantics for LayerRNNCell.
     # Instead, it is up to subclasses to provide a proper build
     # method.  See the class docstring for more details.
-    return base_layer.Layer.__call__(self, inputs, state, scope=scope)
+    return base_layer.Layer.__call__(self, inputs, state, scope=scope,
+                                     *args, **kwargs)
 
 
-class BasicRNNCell(_LayerRNNCell):
+@tf_export("nn.rnn_cell.BasicRNNCell")
+class BasicRNNCell(LayerRNNCell):
   """The most basic RNN cell.
 
   Args:
@@ -347,7 +354,8 @@ class BasicRNNCell(_LayerRNNCell):
     return output, output
 
 
-class GRUCell(_LayerRNNCell):
+@tf_export("nn.rnn_cell.GRUCell")
+class GRUCell(LayerRNNCell):
   """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078).
 
   Args:
@@ -444,6 +452,7 @@ class GRUCell(_LayerRNNCell):
 _LSTMStateTuple = collections.namedtuple("LSTMStateTuple", ("c", "h"))
 
 
+@tf_export("nn.rnn_cell.LSTMStateTuple")
 class LSTMStateTuple(_LSTMStateTuple):
   """Tuple used by LSTM Cells for `state_size`, `zero_state`, and output state.
 
@@ -463,7 +472,8 @@ class LSTMStateTuple(_LSTMStateTuple):
     return c.dtype
 
 
-class BasicLSTMCell(_LayerRNNCell):
+@tf_export("nn.rnn_cell.BasicLSTMCell")
+class BasicLSTMCell(LayerRNNCell):
   """Basic LSTM recurrent network cell.
 
   The implementation is based on: http://arxiv.org/abs/1409.2329.
@@ -587,7 +597,8 @@ class BasicLSTMCell(_LayerRNNCell):
     return new_h, new_state
 
 
-class LSTMCell(_LayerRNNCell):
+@tf_export("nn.rnn_cell.LSTMCell")
+class LSTMCell(LayerRNNCell):
   """Long short-term memory unit (LSTM) recurrent network cell.
 
   The default non-peephole implementation is based on:
@@ -830,6 +841,7 @@ def _default_dropout_state_filter_visitor(substate):
   return True
 
 
+@tf_export("nn.rnn_cell.DropoutWrapper")
 class DropoutWrapper(RNNCell):
   """Operator adding dropout to inputs and outputs of the given cell."""
 
@@ -975,6 +987,10 @@ class DropoutWrapper(RNNCell):
     string = (str(self._seed) + salt).encode("utf-8")
     return int(hashlib.md5(string).hexdigest()[:8], 16) & 0x7FFFFFFF
 
+  @property
+  def wrapped_cell(self):
+    return self._cell
+
   @property
   def state_size(self):
     return self._cell.state_size
@@ -1037,7 +1053,7 @@ class DropoutWrapper(RNNCell):
       inputs = self._dropout(inputs, "input",
                              self._recurrent_input_noise,
                              self._input_keep_prob)
-    output, new_state = self._cell(inputs, state, scope)
+    output, new_state = self._cell(inputs, state, scope=scope)
     if _should_dropout(self._state_keep_prob):
       # Identify which subsets of the state to perform dropout on and
       # which ones to keep.
@@ -1054,6 +1070,7 @@ class DropoutWrapper(RNNCell):
     return output, new_state
 
 
+@tf_export("nn.rnn_cell.ResidualWrapper")
 class ResidualWrapper(RNNCell):
   """RNNCell wrapper that ensures cell inputs are added to the outputs."""
 
@@ -1109,6 +1126,7 @@ class ResidualWrapper(RNNCell):
     return (res_outputs, new_state)
 
 
+@tf_export("nn.rnn_cell.DeviceWrapper")
 class DeviceWrapper(RNNCell):
   """Operator that ensures an RNNCell runs on a particular device."""
 
@@ -1143,6 +1161,7 @@ class DeviceWrapper(RNNCell):
       return self._cell(inputs, state, scope=scope)
 
 
+@tf_export("nn.rnn_cell.MultiRNNCell")
 class MultiRNNCell(RNNCell):
   """RNN cell composed sequentially of multiple simple cells."""
 
diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py
index 2c3667dffedf111f37a9f6eadcc7f1de83c2347e..a7339e4da33164dd38d3bd5d9fea20b0613aadb7 100644
--- a/tensorflow/python/ops/script_ops.py
+++ b/tensorflow/python/ops/script_ops.py
@@ -29,9 +29,42 @@ import numpy as np
 import six
 
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.eager import context
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_script_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+class EagerFunc(object):
+  """A wrapper for a function owned by an EagerPyFunc."""
+
+  def __init__(self, func, Tout):
+    """Constructs an EagerFunc.
+
+    Args:
+      func: The function to wrap.
+      Tout: A list of datatypes for the output; an empty list if the output is
+            None.
+    """
+    self._func = func
+    self._out_dtypes = Tout
+
+  def __call__(self, on_gpu, args):
+    """Passes `args` to `self._func`, which is executed eagerly."""
+    with context.eager_mode():
+      ret = self._func(*args)
+      maybe_copy_to_gpu = lambda x: x if not on_gpu else x.gpu()
+      if isinstance(ret, (tuple, list)):
+        return [
+            maybe_copy_to_gpu(ops.convert_to_tensor(x, dtype=dtype))
+            for (x, dtype) in zip(ret, self._out_dtypes)
+        ]
+      elif ret is None:
+        return ret
+      else:
+        return maybe_copy_to_gpu(
+            ops.convert_to_tensor(ret, dtype=self._out_dtypes[0]))
 
 
 class FuncRegistry(object):
@@ -64,7 +97,7 @@ class FuncRegistry(object):
     components of a tensor have different lengths.  This is bad: ignoring the
     padding is wrong for text data, and removing the padding is wrong for binary
     data.  To avoid this bug, we redo the conversion using an object dtype.
-    Additionally, we convert unicode strings to (byte-)strings for Python3
+    Additionally, we convert unicode strings to (byte-)strings for
     compatibility.
 
     Args:
@@ -78,29 +111,46 @@ class FuncRegistry(object):
     if result.dtype.char == "S" and result is not value:
       return np.asarray(value, order="C", dtype=object)
     elif result.dtype.char == "U" and result is not value:
-      value = np.vectorize(lambda x: x.encode())(value)
+      value = np.vectorize(lambda x: x.encode("utf8"))(value)
       return np.asarray(value, order="C", dtype=object)
     elif result.dtype.char == "U":
       return result.astype(np.bytes_)
     else:
       return result
 
-  def __call__(self, token, args):
-    """Calls the registered function for `token` with args."""
+  def __call__(self, token, on_gpu, args):
+    """Calls the registered function for `token` with args.
+
+    Args:
+      token: A key into this `FuncRegistry` identifying which function to call.
+      on_gpu: A boolean indicating whether or not `token`'s corresponding
+        operation was placed on GPU; only used if the function registered for
+        `token` is an `EagerPyFunc`.
+      args: The arguments to pass to the function registered for `token`.
+
+    Returns:
+      The output of the function registered for `token`.
+
+    Raises:
+      ValueError: if no function is registered for `token`.
+    """
     func = self._funcs[token]
     if func is None:
       raise ValueError("callback %s is not found" % token)
-    ret = func(*args)
-    # Strings seem to lead to a memory leak here if they're not wrapped in a
-    # list.
-    if isinstance(ret, six.binary_type):
-      ret = [ret]
-    # Ensures that we return either a single numpy array or a list of numpy
-    # arrays.
-    if isinstance(ret, (tuple, list)):
-      return [self._convert(x) for x in ret]
+    if isinstance(func, EagerFunc):
+      return func(on_gpu, args)
     else:
-      return self._convert(ret)
+      ret = func(*args)
+      # Strings seem to lead to a memory leak here if they're not wrapped in a
+      # list.
+      if isinstance(ret, six.binary_type):
+        ret = [ret]
+      # Ensures that we return either a single numpy array or a list of numpy
+      # arrays.
+      if isinstance(ret, (tuple, list)):
+        return [self._convert(x) for x in ret]
+      else:
+        return self._convert(ret)
 
   def size(self):
     """Returns how many functions are currently registered."""
@@ -126,9 +176,93 @@ class CleanupFunc(object):
     self._token = token
 
   def __del__(self):
-    _py_funcs.remove(self._token)
+    if _py_funcs is not None:
+      # If _py_funcs is None, the program is most likely in shutdown, and the
+      # _py_funcs object has been destroyed already.
+      _py_funcs.remove(self._token)
+
+
+def _internal_py_func(func, inp, Tout, stateful=None, eager=False, name=None):
+  """See documentation for py_func and eager_py_func."""
+
+  is_list_or_tuple = False
+  if isinstance(Tout, (list, tuple)):
+    is_list_or_tuple = True
+  else:
+    Tout = [Tout]
+
+  if eager:
+    func = EagerFunc(func, Tout)
+
+  token = _py_funcs.insert(func)
+  # We tie the registered function's lifetime with the current default graph,
+  # i.e., when the current graph is destroyed, we remove its py funcs.
+  graph = ops.get_default_graph()
+
+  # pylint: disable=protected-access
+  while isinstance(graph, function._FuncGraph):
+    # If the py_func was declared inside a _FuncGraph, its lifetime should be
+    # bound to that of the outer graph instead.
+    graph = graph._outer_graph
+
+  cleanup = CleanupFunc(token)
+
+  # TODO(zhifengc): Consider adding a Graph method to collect
+  # `cleanup` objects in one of its member.
+  if not hasattr(graph, "_cleanup_py_funcs_used_in_graph"):
+    graph._cleanup_py_funcs_used_in_graph = []
+
+  # When `graph` is destroyed, elements in _cleanup_py_funcs_used_in_graph
+  # will be destroyed and their __del__ will remove the 'token' from
+  # the funcs registry.
+  graph._cleanup_py_funcs_used_in_graph.append(cleanup)
+  # pylint: enable=protected-access
+
+  # pylint: disable=protected-access
+  if eager:
+    result = gen_script_ops._eager_py_func(
+        input=inp, token=token, Tout=Tout, name=name)
+  else:
+    if stateful:
+      result = gen_script_ops._py_func(
+          input=inp, token=token, Tout=Tout, name=name)
+    else:
+      result = gen_script_ops._py_func_stateless(
+          input=inp, token=token, Tout=Tout, name=name)
+  # pylint: enable=protected-access
+  return result if is_list_or_tuple else result[0]
+
+
+def eager_py_func(func, inp, Tout, name=None):
+  """Wraps a python function into a TensorFlow op.
+
+  When the returned op is executed, `func` is invoked with eager execution
+  enabled. Inputs are Tensor objects and func must return None or objects
+  that may be converted to Tensor objects.
+
+  This function has the same limitations as `py_func` with respect to
+  serialization and distribution.
+
+  Args:
+    func: A Python function which accepts a list of `Tensor` objects
+      having element types that match the corresponding `tf.Tensor` objects
+      in `inp` and returns a list of `Tensor` objects (or a single
+      `Tensor`, or `None`) having element types that match the
+      corresponding values in `Tout`.
+    inp: A list of `Tensor` objects.
+    Tout: A list or tuple of tensorflow data types or a single tensorflow data
+      type if there is only one, indicating what `func` returns; an empty list
+      if no value is returned (i.e., if the return value is `None`).
+    name: A name for the operation (optional).
+
+  Returns:
+    A list of `Tensor` or a single `Tensor` which `func` computes; an empty list
+    if `func` returns None.
+  """
+  return _internal_py_func(func=func, inp=inp, Tout=Tout, eager=True, name=name)
 
 
+@tf_export("py_func")
 def py_func(func, inp, Tout, stateful=True, name=None):
   """Wraps a python function and uses it as a TensorFlow op.
 
@@ -182,45 +316,8 @@ def py_func(func, inp, Tout, stateful=True, name=None):
   Returns:
     A list of `Tensor` or a single `Tensor` which `func` computes.
   """
-  token = _py_funcs.insert(func)
-  # We tie the registered function's life-time with the current
-  # default graph. I.e., when the current graph is destroyed, we
-  # should remove its py funcs.
-  g = ops.get_default_graph()
-
-  # pylint: disable=protected-access
-  while isinstance(g, function._FuncGraph):
-    # If the py_func was declared inside a _FuncGraph, its lifetime should be
-    # bound to that of the outer graph instead.
-    g = g._outer_graph
-
-  cleanup = CleanupFunc(token)
-
-  # TODO(zhifengc): Consider adding a Graph method to collect
-  # `cleanup` objects in one of its member.
-  if not hasattr(g, "_cleanup_py_funcs_used_in_graph"):
-    g._cleanup_py_funcs_used_in_graph = []
-
-  # When g is destroyed, elements in _cleanup_py_funcs_used_in_graph
-  # will be destroyed and their __del__ will remove the 'token' from
-  # the funcs registry.
-  g._cleanup_py_funcs_used_in_graph.append(cleanup)
-  # pylint: enable=protected-access
-
-  if isinstance(Tout, (list, tuple)):
-    is_list_or_tuple = True
-  else:
-    Tout = [Tout]
-    is_list_or_tuple = False
-  # pylint: disable=protected-access
-  if stateful:
-    result = gen_script_ops._py_func(
-        input=inp, token=token, Tout=Tout, name=name)
-  else:
-    result = gen_script_ops._py_func_stateless(
-        input=inp, token=token, Tout=Tout, name=name)
-  # pylint: enable=protected-access
-  return result if is_list_or_tuple else result[0]
+  return _internal_py_func(
+      func=func, inp=inp, Tout=Tout, stateful=stateful, eager=False, name=name)
 
 
 ops.NotDifferentiable("PyFunc")
diff --git a/tensorflow/python/ops/session_ops.py b/tensorflow/python/ops/session_ops.py
index dc4d913c938a89f23297c02c2d18b286fd3bb9e8..cedd36c1deed541adcf601ff9447345e2279e8f9 100644
--- a/tensorflow/python/ops/session_ops.py
+++ b/tensorflow/python/ops/session_ops.py
@@ -36,6 +36,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
 def encode_resource_handle(resource_handle):
@@ -141,6 +142,7 @@ class TensorHandle(object):
     return feeder.op.name + ";" + TensorHandle._get_reader_key(handle)
 
 
+@tf_export("get_session_handle")
 def get_session_handle(data, name=None):
   """Return the handle of `data`.
 
@@ -183,6 +185,7 @@ def get_session_handle(data, name=None):
     return gen_data_flow_ops._get_session_handle(data, name=name)  # pylint: disable=protected-access
 
 
+@tf_export("get_session_tensor")
 def get_session_tensor(handle, dtype, name=None):
   """Get the tensor of type `dtype` by feeding a tensor handle.
 
@@ -223,6 +226,7 @@ def get_session_tensor(handle, dtype, name=None):
   return (holder, tensor)
 
 
+@tf_export("delete_session_tensor")
 def delete_session_tensor(handle, name=None):
   """Delete the tensor for the given tensor handle.
 
diff --git a/tensorflow/python/ops/sets_impl.py b/tensorflow/python/ops/sets_impl.py
index 6aa9e3419ea497594b455bc5481dec5a77404bcf..b0eecd8a1e812857de8f47e1370e4fc5f1004bc0 100644
--- a/tensorflow/python/ops/sets_impl.py
+++ b/tensorflow/python/ops/sets_impl.py
@@ -23,6 +23,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import gen_set_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 _VALID_DTYPES = set([
@@ -30,6 +31,7 @@ _VALID_DTYPES = set([
     dtypes.uint8, dtypes.uint16, dtypes.string])
 
 
+@tf_export("sets.set_size")
 def set_size(a, validate_indices=True):
   """Compute number of unique elements along last dimension of `a`.
 
@@ -131,6 +133,7 @@ def _set_operation(a, b, set_operation, validate_indices=True):
   return sparse_tensor.SparseTensor(indices, values, shape)
 
 
+@tf_export("sets.set_intersection")
 def set_intersection(a, b, validate_indices=True):
   """Compute set intersection of elements in last dimension of `a` and `b`.
 
@@ -197,6 +200,7 @@ def set_intersection(a, b, validate_indices=True):
   return _set_operation(a, b, "intersection", validate_indices)
 
 
+@tf_export("sets.set_difference")
 def set_difference(a, b, aminusb=True, validate_indices=True):
   """Compute set difference of elements in last dimension of `a` and `b`.
 
@@ -267,6 +271,7 @@ def set_difference(a, b, aminusb=True, validate_indices=True):
   return _set_operation(a, b, "a-b" if aminusb else "b-a", validate_indices)
 
 
+@tf_export("sets.set_union")
 def set_union(a, b, validate_indices=True):
   """Compute set union of elements in last dimension of `a` and `b`.
 
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index cdfe9e1c1ec7f39912f5e767ffdd291f29f5be88..0fbbf5a805f1439d85ad53f02bdb665c04248606 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -65,6 +65,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.gen_sparse_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util import deprecation
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _convert_to_sparse_tensor(sp_input):
@@ -108,6 +109,7 @@ def _convert_to_sparse_tensors(sp_inputs):
 
 
 # pylint: disable=protected-access
+@tf_export("sparse_concat")
 def sparse_concat(axis,
                   sp_inputs,
                   name=None,
@@ -225,17 +227,19 @@ def sparse_concat(axis,
             [array_ops.reshape(shape, [1, -1]) for shape in shapes], 0), 0)
     shapes = [
         array_ops.concat([
-            max_shape[:axis], shape[-1:] if axis == -1 else
-            shape[axis:axis + 1], [] if axis == -1 else max_shape[axis + 1:]
+            max_shape[:axis], shape[-1:]
+            if axis == -1 else shape[axis:axis + 1], []
+            if axis == -1 else max_shape[axis + 1:]
         ], 0) for shape in shapes
     ]
 
-  output_ind, output_val, output_shape = (gen_sparse_ops._sparse_concat(
-      inds, vals, shapes, axis, name=name))
+  output_ind, output_val, output_shape = (
+      gen_sparse_ops._sparse_concat(inds, vals, shapes, axis, name=name))
 
   return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
 
 
+@tf_export("sparse_add")
 def sparse_add(a, b, thresh=0):
   """Adds two tensors, at least one of each is a `SparseTensor`.
 
@@ -297,15 +301,14 @@ def sparse_add(a, b, thresh=0):
     b = _convert_to_sparse_tensor(b)
     thresh = ops.convert_to_tensor(
         thresh, dtype=a.values.dtype.real_dtype.base_dtype, name="thresh")
-    output_ind, output_val, output_shape = (gen_sparse_ops._sparse_add(
-        a.indices, a.values, a.dense_shape,
-        b.indices, b.values, b.dense_shape,
-        thresh))
+    output_ind, output_val, output_shape = (
+        gen_sparse_ops._sparse_add(a.indices, a.values, a.dense_shape,
+                                   b.indices, b.values, b.dense_shape, thresh))
 
     # Attempt to get output_shape statically.
     a.get_shape().assert_is_compatible_with(b.get_shape())
-    static_shape = array_ops.broadcast_static_shape(
-        a.get_shape(), b.get_shape())
+    static_shape = array_ops.broadcast_static_shape(a.get_shape(),
+                                                    b.get_shape())
     if static_shape.is_fully_defined():
       output_shape = static_shape.as_list()
 
@@ -314,8 +317,8 @@ def sparse_add(a, b, thresh=0):
     # swap to make `a` the SparseTensor.
     if isinstance(b, sparse_classes):
       a, b = b, a
-    return gen_sparse_ops._sparse_tensor_dense_add(
-        a.indices, a.values, a.dense_shape, b)
+    return gen_sparse_ops._sparse_tensor_dense_add(a.indices, a.values,
+                                                   a.dense_shape, b)
 
 
 def _sparse_cross(inputs, name=None):
@@ -394,19 +397,25 @@ def _sparse_cross_hashed(inputs, num_buckets=0, hash_key=None, name=None):
 _DEFAULT_HASH_KEY = 0xDECAFCAFFE
 
 
-def _sparse_cross_internal(
-    inputs, hashed_output=False, num_buckets=0, hash_key=None, name=None):
+def _sparse_cross_internal(inputs,
+                           hashed_output=False,
+                           num_buckets=0,
+                           hash_key=None,
+                           name=None):
   """See gen_sparse_ops._sparse_cross."""
   if not isinstance(inputs, list):
     raise TypeError("Inputs must be a list")
-  if not all(isinstance(i, sparse_tensor.SparseTensor) or
-             isinstance(i, ops.Tensor) for i in inputs):
+  if not all(
+      isinstance(i, sparse_tensor.SparseTensor) or isinstance(i, ops.Tensor)
+      for i in inputs):
     raise TypeError("All inputs must be SparseTensors")
 
-  sparse_inputs = [i for i in inputs
-                   if isinstance(i, sparse_tensor.SparseTensor)]
-  dense_inputs = [i for i in inputs
-                  if not isinstance(i, sparse_tensor.SparseTensor)]
+  sparse_inputs = [
+      i for i in inputs if isinstance(i, sparse_tensor.SparseTensor)
+  ]
+  dense_inputs = [
+      i for i in inputs if not isinstance(i, sparse_tensor.SparseTensor)
+  ]
 
   indices = [sp_input.indices for sp_input in sparse_inputs]
   values = [sp_input.values for sp_input in sparse_inputs]
@@ -463,6 +472,7 @@ def sparse_dense_cwise_add(sp_t, dense_t):
   return sparse_tensor.SparseTensor(sp_t.indices, result, sp_t.dense_shape)
 
 
+@tf_export("sparse_reorder")
 def sparse_reorder(sp_input, name=None):
   """Reorders a `SparseTensor` into the canonical, row-major ordering.
 
@@ -500,8 +510,9 @@ def sparse_reorder(sp_input, name=None):
   """
   sp_input = _convert_to_sparse_tensor(sp_input)
 
-  reordered_ind, reordered_val = (gen_sparse_ops._sparse_reorder(
-      sp_input.indices, sp_input.values, sp_input.dense_shape, name=name))
+  reordered_ind, reordered_val = (
+      gen_sparse_ops._sparse_reorder(
+          sp_input.indices, sp_input.values, sp_input.dense_shape, name=name))
 
   if sp_input.get_shape().is_fully_defined():
     dense_shape = sp_input.get_shape().as_list()
@@ -511,6 +522,7 @@ def sparse_reorder(sp_input, name=None):
   return sparse_tensor.SparseTensor(reordered_ind, reordered_val, dense_shape)
 
 
+@tf_export("sparse_reshape")
 def sparse_reshape(sp_input, shape, name=None):
   """Reshapes a `SparseTensor` to represent values in a new dense shape.
 
@@ -557,6 +569,7 @@ def sparse_reshape(sp_input, shape, name=None):
     TypeError: If `sp_input` is not a `SparseTensor`.
     ValueError:  If argument `shape` requests a `SparseTensor` with a different
       number of elements than `sp_input`.
+    ValueError:  If `shape` has more than one inferred (== -1) dimension.
   """
   sp_input = _convert_to_sparse_tensor(sp_input)
   shape = math_ops.cast(shape, dtype=dtypes.int64)
@@ -566,22 +579,32 @@ def sparse_reshape(sp_input, shape, name=None):
         sp_input.indices, sp_input.dense_shape, shape, name=name)
 
     reshaped_shape_const = tensor_util.constant_value(shape)
-    if (reshaped_shape_const is not None
-        and sp_input.get_shape().is_fully_defined()):
-      # Don't deal with inferred dimensions. That would add significant code.
-      if all(n >= 0 for n in reshaped_shape_const):
-        reshaped_size = np.prod(reshaped_shape_const)
-        in_shape_size = np.prod(sp_input.get_shape().as_list())
-        if reshaped_size != in_shape_size:
-          raise ValueError(
-              "Cannot reshape a tensor with %d elements to shape %s "
-              "(%d elements)."
-              % (in_shape_size, reshaped_shape_const, reshaped_size))
-        reshaped_shape = reshaped_shape_const
-
-    return sparse_tensor.SparseTensor(
-        reshaped_ind, array_ops.identity(sp_input.values),
-        reshaped_shape)
+    if (reshaped_shape_const is not None and
+        sp_input.get_shape().is_fully_defined()):
+      num_implied = sum((dim == -1) for dim in reshaped_shape_const)
+      if num_implied > 1:
+        raise ValueError("At most one dimension can be inferred (-1). Found: %s"
+                         % reshaped_shape_const)
+      original_reshaped_shape = list(reshaped_shape_const)  # Copy.
+      in_shape_size = np.prod(sp_input.get_shape().as_list())
+      if num_implied:
+        implied_idx = original_reshaped_shape.index(-1)
+        non_implied_idx = (
+            original_reshaped_shape[:implied_idx] +
+            original_reshaped_shape[implied_idx + 1:])
+        reshaped_shape_const[implied_idx] = (
+            in_shape_size // np.prod(non_implied_idx))
+      reshaped_size = np.prod(reshaped_shape_const)
+      if reshaped_size != in_shape_size:
+        raise ValueError("Cannot reshape a tensor with %d elements to shape %s "
+                         "(%d elements)." %
+                         (in_shape_size, original_reshaped_shape,
+                          reshaped_size))
+      reshaped_shape = reshaped_shape_const
+
+    return sparse_tensor.SparseTensor(reshaped_ind,
+                                      array_ops.identity(sp_input.values),
+                                      reshaped_shape)
 
 
 # TODO(aselle): Remove keyword required once for 1.0 final
@@ -592,9 +615,13 @@ class KeywordRequired(object):
     return "KeywordRequired()"
 
 
+@tf_export("sparse_split")
 def sparse_split(keyword_required=KeywordRequired(),
-                 sp_input=None, num_split=None, axis=None,
-                 name=None, split_dim=None):
+                 sp_input=None,
+                 num_split=None,
+                 axis=None,
+                 name=None,
+                 split_dim=None):
   """Split a `SparseTensor` into `num_split` tensors along `axis`.
 
   If the `sp_input.dense_shape[axis]` is not an integer multiple of `num_split`
@@ -643,21 +670,23 @@ def sparse_split(keyword_required=KeywordRequired(),
                                                 split_dim)
   sp_input = _convert_to_sparse_tensor(sp_input)
 
-  output_inds, output_vals, output_shapes = (gen_sparse_ops._sparse_split(
-      axis,
-      sp_input.indices,
-      sp_input.values,
-      sp_input.dense_shape,
-      num_split,
-      name=name))
+  output_inds, output_vals, output_shapes = (
+      gen_sparse_ops._sparse_split(
+          axis,
+          sp_input.indices,
+          sp_input.values,
+          sp_input.dense_shape,
+          num_split,
+          name=name))
   sparse_tensors = []
   for i in range(0, num_split):
     sparse_tensors.append(
-        sparse_tensor.SparseTensor(
-            output_inds[i], output_vals[i], output_shapes[i]))
+        sparse_tensor.SparseTensor(output_inds[i], output_vals[i],
+                                   output_shapes[i]))
   return sparse_tensors
 
 
+@tf_export("sparse_slice")
 def sparse_slice(sp_input, start, size, name=None):
   """Slice a `SparseTensor` based on the `start` and `size.
 
@@ -695,13 +724,18 @@ def sparse_slice(sp_input, start, size, name=None):
 
   with ops.name_scope(name, "SparseSlice", [sp_input]) as name:
     output_indices, output_values, output_shape = gen_sparse_ops.sparse_slice(
-        sp_input.indices, sp_input.values, sp_input.dense_shape, start, size, name=name)
+        sp_input.indices,
+        sp_input.values,
+        sp_input.dense_shape,
+        start,
+        size,
+        name=name)
 
-    return sparse_tensor.SparseTensor(
-        output_indices,
-        output_values,
-        output_shape)
+    return sparse_tensor.SparseTensor(output_indices, output_values,
+                                      output_shape)
 
+
+@tf_export("sparse_to_dense")
 def sparse_to_dense(sparse_indices,
                     output_shape,
                     sparse_values,
@@ -757,6 +791,7 @@ def sparse_to_dense(sparse_indices,
       name=name)
 
 
+@tf_export("sparse_reduce_max")
 def sparse_reduce_max(sp_input, axis=None, keep_dims=False,
                       reduction_axes=None):
   """Computes the max of elements across dimensions of a SparseTensor.
@@ -798,13 +833,14 @@ def sparse_reduce_max(sp_input, axis=None, keep_dims=False,
     The reduced Tensor.
   """
   return gen_sparse_ops.sparse_reduce_max(
-      sp_input.indices, sp_input.values,
-      sp_input.dense_shape,
-      math_ops._ReductionDims(sp_input, axis, reduction_axes),
-      keep_dims)
+      sp_input.indices, sp_input.values, sp_input.dense_shape,
+      math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims)
 
 
-def sparse_reduce_max_sparse(sp_input, axis=None, keep_dims=False,
+@tf_export("sparse_reduce_max_sparse")
+def sparse_reduce_max_sparse(sp_input,
+                             axis=None,
+                             keep_dims=False,
                              reduction_axes=None):
   """Computes the max of elements across dimensions of a SparseTensor.
 
@@ -833,14 +869,13 @@ def sparse_reduce_max_sparse(sp_input, axis=None, keep_dims=False,
   """
   output_ind, output_val, output_shape = (
       gen_sparse_ops.sparse_reduce_max_sparse(
-          sp_input.indices, sp_input.values,
-          sp_input.dense_shape, math_ops._ReductionDims(sp_input, axis,
-                                                        reduction_axes),
-          keep_dims))
+          sp_input.indices, sp_input.values, sp_input.dense_shape,
+          math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims))
 
   return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
 
 
+@tf_export("sparse_reduce_sum")
 def sparse_reduce_sum(sp_input, axis=None, keep_dims=False,
                       reduction_axes=None):
   """Computes the sum of elements across dimensions of a SparseTensor.
@@ -882,13 +917,14 @@ def sparse_reduce_sum(sp_input, axis=None, keep_dims=False,
     The reduced Tensor.
   """
   return gen_sparse_ops.sparse_reduce_sum(
-      sp_input.indices, sp_input.values,
-      sp_input.dense_shape,
-      math_ops._ReductionDims(sp_input, axis, reduction_axes),
-      keep_dims)
+      sp_input.indices, sp_input.values, sp_input.dense_shape,
+      math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims)
 
 
-def sparse_reduce_sum_sparse(sp_input, axis=None, keep_dims=False,
+@tf_export("sparse_reduce_sum_sparse")
+def sparse_reduce_sum_sparse(sp_input,
+                             axis=None,
+                             keep_dims=False,
                              reduction_axes=None):
   """Computes the sum of elements across dimensions of a SparseTensor.
 
@@ -917,14 +953,13 @@ def sparse_reduce_sum_sparse(sp_input, axis=None, keep_dims=False,
   """
   output_ind, output_val, output_shape = (
       gen_sparse_ops.sparse_reduce_sum_sparse(
-          sp_input.indices, sp_input.values,
-          sp_input.dense_shape, math_ops._ReductionDims(sp_input, axis,
-                                                        reduction_axes),
-          keep_dims))
+          sp_input.indices, sp_input.values, sp_input.dense_shape,
+          math_ops._ReductionDims(sp_input, axis, reduction_axes), keep_dims))
 
   return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
 
 
+@tf_export("sparse_tensor_to_dense")
 def sparse_tensor_to_dense(sp_input,
                            default_value=0,
                            validate_indices=True,
@@ -976,6 +1011,7 @@ def sparse_tensor_to_dense(sp_input,
       name=name)
 
 
+@tf_export("sparse_to_indicator")
 def sparse_to_indicator(sp_input, vocab_size, name=None):
   """Converts a `SparseTensor` of ids into a dense bool indicator tensor.
 
@@ -1027,8 +1063,8 @@ def sparse_to_indicator(sp_input, vocab_size, name=None):
   with ops.name_scope(name, "SparseToIndicator", [sp_input]) as name:
     num_entries = array_ops.shape(sp_input.indices)[0]
     new_values = array_ops.fill(array_ops.expand_dims(num_entries, 0), True)
-    sp_values = sparse_tensor.SparseTensor(
-        sp_input.indices, new_values, sp_input.dense_shape)
+    sp_values = sparse_tensor.SparseTensor(sp_input.indices, new_values,
+                                           sp_input.dense_shape)
 
     sp_new = sparse_merge(sp_input, sp_values, vocab_size, name)
 
@@ -1038,6 +1074,7 @@ def sparse_to_indicator(sp_input, vocab_size, name=None):
         sp_new, default_value=False, validate_indices=False, name=name)
 
 
+@tf_export("sparse_merge")
 def sparse_merge(sp_ids, sp_values, vocab_size, name=None,
                  already_sorted=False):
   """Combines a batch of feature ids and values into a single `SparseTensor`.
@@ -1147,8 +1184,7 @@ def sparse_merge(sp_ids, sp_values, vocab_size, name=None,
       raise TypeError("vocab_size has to be a list of Tensors or Python ints. "
                       "Found %s" % type(vocab_size))
     for dim in vocab_size:
-      if not (isinstance(dim, ops.Tensor) or
-              isinstance(dim, numbers.Integral)):
+      if not (isinstance(dim, ops.Tensor) or isinstance(dim, numbers.Integral)):
         raise TypeError(
             "vocab_size has to be a list of Tensors or Python ints. Found %s" %
             type(dim))
@@ -1178,6 +1214,7 @@ def sparse_merge(sp_ids, sp_values, vocab_size, name=None,
     return result if already_sorted else sparse_reorder(result)
 
 
+@tf_export("sparse_retain")
 def sparse_retain(sp_input, to_retain):
   """Retains specified non-empty values within a `SparseTensor`.
 
@@ -1221,6 +1258,7 @@ def sparse_retain(sp_input, to_retain):
                                     array_ops.identity(sp_input.dense_shape))
 
 
+@tf_export("sparse_reset_shape")
 def sparse_reset_shape(sp_input, new_shape=None):
   """Resets the shape of a `SparseTensor` with indices and values unchanged.
 
@@ -1297,24 +1335,23 @@ def sparse_reset_shape(sp_input, new_shape=None):
     # error before the sparse_tensor.SparseTensor catches it.
     output_shape_tensor.get_shape()[0].merge_with(in_shape.get_shape()[0])
 
-    output_shape_tensor_const = tensor_util.constant_value(
-        output_shape_tensor)
+    output_shape_tensor_const = tensor_util.constant_value(output_shape_tensor)
     # For cases where all shapes are known during graph construction
-    if (output_shape_tensor_const is not None
-        and sp_input.get_shape().is_fully_defined()):
+    if (output_shape_tensor_const is not None and
+        sp_input.get_shape().is_fully_defined()):
       in_shape_const = np.array(sp_input.get_shape().as_list())
       if not np.all(in_shape_const <= output_shape_tensor_const):
         raise ValueError(
             "Requested new_shape should have dimension sizes >= sp_input.shape."
-            "  Found new_shape (%s), sp_input.shape (%s)."
-            % (in_shape_const, output_shape_tensor_const))
+            "  Found new_shape (%s), sp_input.shape (%s)." %
+            (in_shape_const, output_shape_tensor_const))
       output_shape_tensor = output_shape_tensor_const
     else:
       # For cases where shape is not known during graph construction.
-      output_shape_tensor = control_flow_ops.with_dependencies(
-          [check_ops.assert_equal(
-              array_ops.shape(in_shape), array_ops.shape(output_shape_tensor))],
-          output_shape_tensor)
+      output_shape_tensor = control_flow_ops.with_dependencies([
+          check_ops.assert_equal(
+              array_ops.shape(in_shape), array_ops.shape(output_shape_tensor))
+      ], output_shape_tensor)
       output_shape_tensor = control_flow_ops.with_dependencies(
           [check_ops.assert_less_equal(in_shape, output_shape_tensor)],
           output_shape_tensor)
@@ -1322,6 +1359,7 @@ def sparse_reset_shape(sp_input, new_shape=None):
   return sparse_tensor.SparseTensor(in_indices, in_values, output_shape_tensor)
 
 
+@tf_export("sparse_fill_empty_rows")
 def sparse_fill_empty_rows(sp_input, default_value, name=None):
   """Fills empty rows in the input 2-D `SparseTensor` with a default value.
 
@@ -1379,22 +1417,24 @@ def sparse_fill_empty_rows(sp_input, default_value, name=None):
          values=sp_input.values,
          dense_shape=sp_input.dense_shape,
          default_value=default_value)
-    return (sparse_tensor.SparseTensor(indices=output_indices,
-                                       values=output_values,
-                                       dense_shape=sp_input.dense_shape),
-            empty_row_indicator)
+    return (sparse_tensor.SparseTensor(
+        indices=output_indices,
+        values=output_values,
+        dense_shape=sp_input.dense_shape), empty_row_indicator)
 
 
-def serialize_sparse(sp_input, name=None):
-  """Serialize a `SparseTensor` into a string 3-vector (1-D `Tensor`) object.
+@tf_export("serialize_sparse")
+def serialize_sparse(sp_input, name=None, out_type=dtypes.string):
+  """Serialize a `SparseTensor` into a 3-vector (1-D `Tensor`) object.
 
   Args:
     sp_input: The input `SparseTensor`.
     name: A name prefix for the returned tensors (optional).
+    out_type: The `dtype` to use for serialization.
 
   Returns:
-    A string 3-vector (1D `Tensor`), with each column representing the
-    serialized `SparseTensor`'s indices, values, and shape (respectively).
+    A 3-vector (1-D `Tensor`), with each column representing the serialized
+    `SparseTensor`'s indices, values, and shape (respectively).
 
   Raises:
     TypeError: If `sp_input` is not a `SparseTensor`.
@@ -1402,11 +1442,16 @@ def serialize_sparse(sp_input, name=None):
   sp_input = _convert_to_sparse_tensor(sp_input)
 
   return gen_sparse_ops._serialize_sparse(
-      sp_input.indices, sp_input.values, sp_input.dense_shape, name=name)
+      sp_input.indices,
+      sp_input.values,
+      sp_input.dense_shape,
+      name=name,
+      out_type=out_type)
 
 
-def serialize_many_sparse(sp_input, name=None):
-  """Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` string `Tensor`.
+@tf_export("serialize_many_sparse")
+def serialize_many_sparse(sp_input, name=None, out_type=dtypes.string):
+  """Serialize `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor`.
 
   The `SparseTensor` must have rank `R` greater than 1, and the first dimension
   is treated as the minibatch dimension.  Elements of the `SparseTensor`
@@ -1419,11 +1464,12 @@ def serialize_many_sparse(sp_input, name=None):
   Args:
     sp_input: The input rank `R` `SparseTensor`.
     name: A name prefix for the returned tensors (optional).
+    out_type: The `dtype` to use for serialization.
 
   Returns:
-    A string matrix (2-D `Tensor`) with `N` rows and `3` columns.
-    Each column represents serialized `SparseTensor`'s indices, values, and
-    shape (respectively).
+    A matrix (2-D `Tensor`) with `N` rows and `3` columns. Each column
+    represents serialized `SparseTensor`'s indices, values, and shape
+    (respectively).
 
   Raises:
     TypeError: If `sp_input` is not a `SparseTensor`.
@@ -1431,16 +1477,57 @@ def serialize_many_sparse(sp_input, name=None):
   sp_input = _convert_to_sparse_tensor(sp_input)
 
   return gen_sparse_ops._serialize_many_sparse(
-      sp_input.indices, sp_input.values, sp_input.dense_shape, name=name)
+      sp_input.indices,
+      sp_input.values,
+      sp_input.dense_shape,
+      name=name,
+      out_type=out_type)
 
 
 def deserialize_sparse(serialized_sparse, dtype, rank=None, name=None):
   """Deserialize `SparseTensor` objects.
 
-  The input is expected to have shape [d_1, ..., d_m, 3], where the last
-  dimension stores a serialized `SparseTensor`. The method deserializes
-  all input `SparseTensor`s, concatenates them into a single tensor, and
-  reshapes the sparse tensor to preserve the structure of the input.
+  The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
+  the last dimension stores serialized `SparseTensor` objects and the other N
+  dimensions (N >= 0) correspond to a batch. The ranks of the original
+  `SparseTensor` objects must all match. When the final `SparseTensor` is
+  created, its rank is the rank of the incoming `SparseTensor` objects plus N;
+  the sparse tensors have been concatenated along new dimensions, one for each
+  batch.
+
+  The output `SparseTensor` object's shape values for the original dimensions
+  are the max across the input `SparseTensor` objects' shape values for the
+  corresponding dimensions. The new dimensions match the size of the batch.
+
+  The input `SparseTensor` objects' indices are assumed ordered in
+  standard lexicographic order.  If this is not the case, after this
+  step run `SparseReorder` to restore index ordering.
+
+  For example, if the serialized input is a `[2 x 3]` matrix representing two
+  original `SparseTensor` objects:
+
+      index = [ 0]
+              [10]
+              [20]
+      values = [1, 2, 3]
+      shape = [50]
+
+  and
+
+      index = [ 2]
+              [10]
+      values = [4, 5]
+      shape = [30]
+
+  then the final deserialized `SparseTensor` will be:
+
+      index = [0  0]
+              [0 10]
+              [0 20]
+              [1  2]
+              [1 10]
+      values = [1, 2, 3, 4, 5]
+      shape = [2 50]
 
   Args:
     serialized_sparse: The serialized `SparseTensor` objects.
@@ -1463,6 +1550,7 @@ def deserialize_sparse(serialized_sparse, dtype, rank=None, name=None):
   return sparse_tensor.SparseTensor(output_indices, output_values, output_shape)
 
 
+@tf_export("deserialize_many_sparse")
 def deserialize_many_sparse(serialized_sparse, dtype, rank=None, name=None):
   """Deserialize and concatenate `SparseTensors` from a serialized minibatch.
 
@@ -1532,6 +1620,7 @@ def deserialize_many_sparse(serialized_sparse, dtype, rank=None, name=None):
   return sparse_tensor.SparseTensor(output_indices, output_values, output_shape)
 
 
+@tf_export("sparse_tensor_dense_matmul")
 def sparse_tensor_dense_matmul(sp_a,
                                b,
                                adjoint_a=False,
@@ -1748,6 +1837,7 @@ def sparse_tensor_dense_matmul(sp_a,
         adjoint_b=adjoint_b)
 
 
+@tf_export("sparse_softmax")
 def sparse_softmax(sp_input, name=None):
   """Applies softmax to a batched N-D `SparseTensor`.
 
@@ -1798,10 +1888,11 @@ def sparse_softmax(sp_input, name=None):
                       [sp_input.indices, sp_input.values]) as name:
     out_vals = gen_sparse_ops.sparse_softmax(sp_input.indices, sp_input.values,
                                              sp_input.dense_shape)
-    return sparse_tensor.SparseTensor(
-        sp_input.indices, out_vals, sp_input.dense_shape)
+    return sparse_tensor.SparseTensor(sp_input.indices, out_vals,
+                                      sp_input.dense_shape)
 
 
+@tf_export("sparse_maximum")
 def sparse_maximum(sp_a, sp_b, name=None):
   """Returns the element-wise max of two SparseTensors.
 
@@ -1824,9 +1915,9 @@ def sparse_maximum(sp_a, sp_b, name=None):
   Returns:
     output: the output SparseTensor.
   """
-  with ops.name_scope(name, "SparseSparseMaximum", [sp_a.indices, sp_a.values,
-                                                    sp_b.indices,
-                                                    sp_b.values]) as name:
+  with ops.name_scope(
+      name, "SparseSparseMaximum",
+      [sp_a.indices, sp_a.values, sp_b.indices, sp_b.values]) as name:
     out_indices, out_values = gen_sparse_ops.sparse_sparse_maximum(
         sp_a.indices,
         sp_a.values,
@@ -1838,6 +1929,7 @@ def sparse_maximum(sp_a, sp_b, name=None):
   return sparse_tensor.SparseTensor(out_indices, out_values, sp_a.dense_shape)
 
 
+@tf_export("sparse_minimum")
 def sparse_minimum(sp_a, sp_b, name=None):
   """Returns the element-wise min of two SparseTensors.
 
@@ -1860,9 +1952,9 @@ def sparse_minimum(sp_a, sp_b, name=None):
   Returns:
     output: the output SparseTensor.
   """
-  with ops.name_scope(name, "SparseSparseMinimum", [sp_a.indices, sp_a.values,
-                                                    sp_b.indices,
-                                                    sp_b.values]) as name:
+  with ops.name_scope(
+      name, "SparseSparseMinimum",
+      [sp_a.indices, sp_a.values, sp_b.indices, sp_b.values]) as name:
     out_indices, out_values = gen_sparse_ops.sparse_sparse_minimum(
         sp_a.indices,
         sp_a.values,
@@ -1874,6 +1966,7 @@ def sparse_minimum(sp_a, sp_b, name=None):
   return sparse_tensor.SparseTensor(out_indices, out_values, sp_a.dense_shape)
 
 
+@tf_export("sparse_transpose")
 def sparse_transpose(sp_input, perm=None, name=None):
   """Transposes a `SparseTensor`
 
@@ -1914,17 +2007,26 @@ def sparse_transpose(sp_input, perm=None, name=None):
     indices = sp_input.indices
     transposed_indices = array_ops.transpose(
         array_ops.gather(array_ops.transpose(indices), perm))
-    dense_shape = sp_input.dense_shape
-    transposed_dense_shape = array_ops.gather(dense_shape, perm)
+
+    perm_ = tensor_util.constant_value(ops.convert_to_tensor(perm))
+    if perm_ is not None and sp_input.get_shape().is_fully_defined():
+      old_shape_ = sp_input.get_shape().as_list()
+      transposed_dense_shape = list(old_shape_)  # Copy.
+      for i, p in enumerate(perm_):
+        transposed_dense_shape[i] = old_shape_[p]
+    else:
+      dense_shape = sp_input.dense_shape
+      transposed_dense_shape = array_ops.gather(dense_shape, perm)
     transposed_st = sparse_tensor.SparseTensor(
-        transposed_indices, sp_input.values,
-        transposed_dense_shape)
+        transposed_indices, sp_input.values, transposed_dense_shape)
     transposed_st = sparse_reorder(transposed_st)
     return transposed_st
 
 
-def _add_sparse_to_tensors_map(sp_input, container=None,
-                               shared_name=None, name=None):
+def _add_sparse_to_tensors_map(sp_input,
+                               container=None,
+                               shared_name=None,
+                               name=None):
   """Add a `SparseTensor` to a `SparseTensorsMap` and return its handle.
 
   Args:
@@ -1945,12 +2047,18 @@ def _add_sparse_to_tensors_map(sp_input, container=None,
   sp_input = _convert_to_sparse_tensor(sp_input)
 
   return gen_sparse_ops._add_sparse_to_tensors_map(
-      sp_input.indices, sp_input.values, sp_input.dense_shape,
-      container=container, shared_name=shared_name, name=name)
+      sp_input.indices,
+      sp_input.values,
+      sp_input.dense_shape,
+      container=container,
+      shared_name=shared_name,
+      name=name)
 
 
-def _add_many_sparse_to_tensors_map(sp_input, container=None,
-                                    shared_name=None, name=None):
+def _add_many_sparse_to_tensors_map(sp_input,
+                                    container=None,
+                                    shared_name=None,
+                                    name=None):
   """Add a minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
 
   The `SparseTensor` must have rank `R` greater than 1, and the first dimension
@@ -1979,12 +2087,18 @@ def _add_many_sparse_to_tensors_map(sp_input, container=None,
   sp_input = _convert_to_sparse_tensor(sp_input)
 
   return gen_sparse_ops._add_many_sparse_to_tensors_map(
-      sp_input.indices, sp_input.values, sp_input.dense_shape,
-      container=container, shared_name=shared_name, name=name)
+      sp_input.indices,
+      sp_input.values,
+      sp_input.dense_shape,
+      container=container,
+      shared_name=shared_name,
+      name=name)
 
 
-def _take_many_sparse_from_tensors_map(
-    sparse_map_op, sparse_handles, rank=None, name=None):
+def _take_many_sparse_from_tensors_map(sparse_map_op,
+                                       sparse_handles,
+                                       rank=None,
+                                       name=None):
   """Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
 
   The input `sparse_handles` must be a string matrix of shape `[N, 1]` where
@@ -2047,16 +2161,18 @@ def _take_many_sparse_from_tensors_map(
     raise TypeError("sparse_map_op be an Operation")
   if sparse_map_op.type not in ("AddSparseToTensorsMap",
                                 "AddManySparseToTensorsMap"):
-    raise TypeError("sparse_map_op must be one of AddSparseToTensorsMap or "
-                    "AddSparseToTensorsMap. Instead, found `%s`." %
-                    sparse_map_op.type)
+    raise TypeError(
+        "sparse_map_op must be one of AddSparseToTensorsMap or "
+        "AddSparseToTensorsMap. Instead, found `%s`." % sparse_map_op.type)
   with ops.colocate_with(sparse_map_op):
     shared_name = sparse_map_op.get_attr("shared_name") or sparse_map_op.name
     output_indices, output_values, output_shape = (
         gen_sparse_ops._take_many_sparse_from_tensors_map(
-            sparse_handles, dtype=sparse_map_op.get_attr("T"),
+            sparse_handles,
+            dtype=sparse_map_op.get_attr("T"),
             container=sparse_map_op.get_attr("container"),
-            shared_name=shared_name, name=name))
+            shared_name=shared_name,
+            name=name))
 
   # Feed rank data back in, if available
   output_indices.set_shape([None, rank])
diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py
index fe3f7343222f7b10bc6af272146e8960d6f39c3d..6d7eaababcd94d687ff20dddc35c68a98320a19b 100644
--- a/tensorflow/python/ops/special_math_ops.py
+++ b/tensorflow/python/ops/special_math_ops.py
@@ -31,9 +31,11 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 # TODO(b/27419586) Change docstring for required dtype of x once int allowed
+@tf_export('lbeta')
 def lbeta(x, name='lbeta'):
   r"""Computes \\(ln(|Beta(x)|)\\), reducing along the last dimension.
 
@@ -82,6 +84,7 @@ def lbeta(x, name='lbeta'):
     return result
 
 
+@tf_export('einsum', 'linalg.einsum')
 def einsum(equation, *inputs, **kwargs):
   """A generalized contraction between tensors of arbitrary dimension.
 
@@ -152,27 +155,24 @@ def einsum(equation, *inputs, **kwargs):
         indices in its subscript, or
       - the input shapes are inconsistent along a particular axis.
   """
-  name = kwargs.pop("name", None)
+  name = kwargs.pop('name', None)
   if kwargs:
-    raise TypeError("invalid keyword arguments for this function: " +
-                    ", ".join([format(key)
-                               for key in sorted(list(kwargs.keys()))]))
-  with ops.name_scope(name, "einsum", [equation, inputs]) as name:
+    raise TypeError('invalid keyword arguments for this function: ' + ', '.join(
+        [format(key) for key in sorted(list(kwargs.keys()))]))
+  with ops.name_scope(name, 'einsum', [equation, inputs]) as name:
     if '...' in equation:
       raise ValueError('Subscripts with ellipses are not yet supported.')
 
     match = re.match('([a-z,]+)(->[a-z]*)?', equation)
     if not match:
-      raise ValueError(
-          'Indices have incorrect format: %s' % equation
-      )
+      raise ValueError('Indices have incorrect format: %s' % equation)
 
     inputs = list(inputs)
     input_axis_labels = match.group(1).split(',')
 
     if len(inputs) != len(input_axis_labels):
-      raise ValueError('Got %d arguments for equation "%s", expecting %d' % (
-          len(inputs), equation, len(input_axis_labels)))
+      raise ValueError('Got %d arguments for equation "%s", expecting %d' %
+                       (len(inputs), equation, len(input_axis_labels)))
 
     axis_labels = set(''.join(input_axis_labels))
     if match.group(2):
@@ -185,37 +185,36 @@ def einsum(equation, *inputs, **kwargs):
         for ax in axes_:
           counts[ax] += 1
 
-      output_axis_labels = ''.join(sorted(
-          ax for ax in indices
-          if counts[ax] == 1
-      ))
+      output_axis_labels = ''.join(
+          sorted(ax for ax in indices if counts[ax] == 1))
 
     for a in axis_labels:
       input_count = sum(1 for s in input_axis_labels if a in s)
       if input_count > 2 and a not in output_axis_labels:
         logging.warn(
-            'Falling back to exponential-space implementation of einsum() because'
-            ' index "%s" is summed over more than two inputs.', a)
+            'Falling back to exponential-space implementation of einsum()'
+            ' because index "%s" is summed over more than two inputs.', a)
         return _exponential_space_einsum(equation, *inputs)
 
     temp = inputs[0]
     temp_axis_labels = input_axis_labels[0]
-    for i in xrange(len(inputs)-1):
-      axes_to_sum = (set(temp_axis_labels) & set(input_axis_labels[i+1])
-                     - set(output_axis_labels))
-      temp, temp_axis_labels = _einsum_reduction(temp,
-                                                 temp_axis_labels,
-                                                 inputs[i+1],
-                                                 input_axis_labels[i+1],
-                                                 axes_to_sum)
+    for i in xrange(len(inputs) - 1):
+      axes_to_sum = (
+          set(temp_axis_labels) &
+          set(input_axis_labels[i + 1]) - set(output_axis_labels))
+      temp, temp_axis_labels = _einsum_reduction(
+          temp, temp_axis_labels, inputs[i + 1], input_axis_labels[i + 1],
+          axes_to_sum)
 
     missing_indices = set(temp_axis_labels) - set(output_axis_labels)
     if missing_indices:
-      reduction_indices = [i for i, a in enumerate(temp_axis_labels)
-                           if a not in output_axis_labels]
+      reduction_indices = [
+          i for i, a in enumerate(temp_axis_labels)
+          if a not in output_axis_labels
+      ]
       temp = math_ops.reduce_sum(temp, reduction_indices=reduction_indices)
-      temp_axis_labels = ''.join(a for a in temp_axis_labels
-                                 if a in output_axis_labels)
+      temp_axis_labels = ''.join(
+          a for a in temp_axis_labels if a in output_axis_labels)
 
     if sorted(temp_axis_labels) != sorted(output_axis_labels):
       raise ValueError('Invalid equation: %s' % equation)
@@ -293,8 +292,10 @@ def _einsum_reduction(t0, t0_axis_labels, t1, t1_axis_labels, axes_to_sum):
       return (1, a)
 
   axis_labels = [t0_axis_labels, t1_axis_labels]
-  sorted_axes = [sorted(sym_list, key=lambda a: sort_key(i, a))
-                 for i, sym_list in enumerate(axis_labels)]
+  sorted_axes = [
+      sorted(sym_list, key=lambda a: sort_key(i, a))
+      for i, sym_list in enumerate(axis_labels)
+  ]
   inputs = [t0, t1]
   for i, axes_str in enumerate(axis_labels):
     perm = [axes_str.find(a) for a in sorted_axes[i]]
@@ -322,30 +323,30 @@ def _einsum_reduction(t0, t0_axis_labels, t1, t1_axis_labels, axes_to_sum):
     num_broadcast_elements_t0 = _total_size(
         t0_shape[len(preserved_axes):-len(axes_to_sum)])
     num_summed_elements = _total_size(t0_shape[-len(axes_to_sum):])
-    new_shape = (t0_shape[:len(preserved_axes)]
-                 + [num_broadcast_elements_t0, num_summed_elements])
+    new_shape = (
+        t0_shape[:len(preserved_axes)] +
+        [num_broadcast_elements_t0, num_summed_elements])
     t0 = _reshape_if_necessary(t0, new_shape)
 
     t1_shape = _get_shape(t1)
     num_broadcast_elements_t1 = _total_size(
-        t1_shape[len(preserved_axes)+len(axes_to_sum):])
-    new_shape = (t1_shape[:len(preserved_axes)]
-                 + [num_summed_elements, num_broadcast_elements_t1])
+        t1_shape[len(preserved_axes) + len(axes_to_sum):])
+    new_shape = (
+        t1_shape[:len(preserved_axes)] +
+        [num_summed_elements, num_broadcast_elements_t1])
     t1 = _reshape_if_necessary(t1, new_shape)
 
     product = math_ops.matmul(t0, t1)
 
     # Undo compaction of broadcast axes
     uncompacted_shape = (
-        t0_shape[:len(preserved_axes)+len(broadcast_axes[0])]
-        + t1_shape[len(t1_shape)-len(broadcast_axes[1]):]
-    )
+        t0_shape[:len(preserved_axes) + len(broadcast_axes[0])] +
+        t1_shape[len(t1_shape) - len(broadcast_axes[1]):])
     product = _reshape_if_necessary(product, uncompacted_shape)
 
     product_axes = (
-        sorted_axes[0][:len(preserved_axes)+len(broadcast_axes[0])] +
-        sorted_axes[1][len(sorted_axes[1])-len(broadcast_axes[1]):]
-    )
+        sorted_axes[0][:len(preserved_axes) + len(broadcast_axes[0])] +
+        sorted_axes[1][len(sorted_axes[1]) - len(broadcast_axes[1]):])
 
     return product, ''.join(product_axes)
 
@@ -399,13 +400,11 @@ def _total_size(shape_values):
 def _exponential_space_einsum(equation, *inputs):
   """Fallback implementation that supports summing an index over > 2 inputs."""
   if '...' in equation:
-    raise ValueError("Subscripts with ellipses are not yet supported.")
+    raise ValueError('Subscripts with ellipses are not yet supported.')
 
   match = re.match('([a-z,]+)(->[a-z]*)?', equation)
   if not match:
-    raise ValueError(
-        'Indices have incorrect format: %s' % equation
-    )
+    raise ValueError('Indices have incorrect format: %s' % equation)
 
   inputs = list(inputs)
   idx_in = match.group(1).split(',')
@@ -422,21 +421,15 @@ def _exponential_space_einsum(equation, *inputs):
       for ax in axes_:
         counts[ax] += 1
 
-    idx_out = ''.join(sorted(
-        ax for ax in indices
-        if counts[ax] == 1
-    ))
+    idx_out = ''.join(sorted(ax for ax in indices if counts[ax] == 1))
 
   if len(idx_in) != len(inputs):
-    raise ValueError(
-        'Expected %d inputs but got %d' % (len(idx_in), len(inputs))
-    )
+    raise ValueError('Expected %d inputs but got %d' % (len(idx_in),
+                                                        len(inputs)))
 
   missing_idx = set(idx_out).difference(idx_all)
   if missing_idx:
-    raise ValueError(
-        'Unknown output axes: %s' % missing_idx
-    )
+    raise ValueError('Unknown output axes: %s' % missing_idx)
 
   axis_order = {}
   for ax in indices:
@@ -449,18 +442,17 @@ def _exponential_space_einsum(equation, *inputs):
   for i, (input_, axes_) in enumerate(zip(inputs, idx_in)):
     if input_.get_shape().ndims != len(axes_):
       raise ValueError(
-        'Input %d with axes %s has incorrect' \
-        ' number of dimensions (expected %d, got %d)' % (
-          i, axes_, len(axes_), input_.get_shape().ndims
-        )
+          'Input %d with axes %s has incorrect' \
+          ' number of dimensions (expected %d, got %d)' % (
+              i, axes_, len(axes_), input_.get_shape().ndims
+          )
       )
 
     sorted_idx = sorted(axes_, key=axis_order.get)
 
     if len(set(axes_)) != len(axes_):
       raise ValueError(
-          'Subscript not supported: an axis appears more than once: %s' % axes_
-      )
+          'Subscript not supported: an axis appears more than once: %s' % axes_)
 
     if list(axes_) != sorted_idx:
       permuted = [axes_.find(ax) for ax in sorted_idx]
@@ -484,16 +476,15 @@ def _exponential_space_einsum(equation, *inputs):
           dims.append(dim)
 
     if len(set(dims)) > 1:
-      raise ValueError(
-          'Dimension mismatch on axis: %s' % ax
-      )
+      raise ValueError('Dimension mismatch on axis: %s' % ax)
 
     if ax not in idx_out:
       reduction_idx.append(j)
 
   # reshape, multiply
-  expanded_inputs = [array_ops.reshape(input_, shape)
-                     for input_, shape in zip(inputs, shapes)]
+  expanded_inputs = [
+      array_ops.reshape(input_, shape) for input_, shape in zip(inputs, shapes)
+  ]
   expanded_output = 1
   for input_ in expanded_inputs:
     expanded_output *= input_
diff --git a/tensorflow/python/ops/special_math_ops_test.py b/tensorflow/python/ops/special_math_ops_test.py
index 6581e9f922518e5ebae0bc43aa8595d5b686e188..2c212f45483eacfd3fd27eecb8d7b2c846b5fe96 100644
--- a/tensorflow/python/ops/special_math_ops_test.py
+++ b/tensorflow/python/ops/special_math_ops_test.py
@@ -39,8 +39,9 @@ class LBetaTest(test.TestCase):
     x_one_half = [2, 1.]
     with self.test_session(use_gpu=True):
       self.assertAllClose(1, math_ops.exp(special_math_ops.lbeta(x_one)).eval())
-      self.assertAllClose(
-          0.5, math_ops.exp(special_math_ops.lbeta(x_one_half)).eval())
+      self.assertAllClose(0.5,
+                          math_ops.exp(
+                              special_math_ops.lbeta(x_one_half)).eval())
       self.assertEqual([], special_math_ops.lbeta(x_one).get_shape())
 
   def test_one_dimensional_arg_dynamic(self):
@@ -70,8 +71,9 @@ class LBetaTest(test.TestCase):
     # Should evaluate to 1/2.
     x_one_half = [[2, 1.], [2, 1.]]
     with self.test_session(use_gpu=True):
-      self.assertAllClose(
-          [0.5, 0.5], math_ops.exp(special_math_ops.lbeta(x_one_half)).eval())
+      self.assertAllClose([0.5, 0.5],
+                          math_ops.exp(
+                              special_math_ops.lbeta(x_one_half)).eval())
       self.assertEqual((2,), special_math_ops.lbeta(x_one_half).get_shape())
 
   def test_two_dimensional_arg_dynamic(self):
@@ -86,10 +88,12 @@ class LBetaTest(test.TestCase):
     # Should evaluate to 1/2.
     x_one_half = [[2, 1.], [2, 1.]]
     with self.test_session(use_gpu=True):
-      self.assertAllClose(
-          [0.5, 0.5], math_ops.exp(special_math_ops.lbeta(x_one_half)).eval())
+      self.assertAllClose([0.5, 0.5],
+                          math_ops.exp(
+                              special_math_ops.lbeta(x_one_half)).eval())
       self.assertEqual(
-          (2,), array_ops.shape(special_math_ops.lbeta(x_one_half)).eval())
+          (2,),
+          array_ops.shape(special_math_ops.lbeta(x_one_half)).eval())
       self.assertEqual(
           tensor_shape.TensorShape([2]),
           special_math_ops.lbeta(x_one_half).get_shape())
@@ -97,8 +101,8 @@ class LBetaTest(test.TestCase):
   def test_complicated_shape(self):
     with self.test_session(use_gpu=True):
       x = ops.convert_to_tensor(np.random.rand(3, 2, 2))
-      self.assertAllEqual(
-          (3, 2), array_ops.shape(special_math_ops.lbeta(x)).eval())
+      self.assertAllEqual((3, 2),
+                          array_ops.shape(special_math_ops.lbeta(x)).eval())
       self.assertEqual(
           tensor_shape.TensorShape([3, 2]),
           special_math_ops.lbeta(x).get_shape())
@@ -155,7 +159,6 @@ class EinsumTest(test.TestCase):
       'ijk->i',
       'ijk->kji',
       'ji,kj->ik',
-
       'ikl,kji->kl',
       'klj,lki->ij',
       'ijk,ilj->kli',
@@ -164,7 +167,6 @@ class EinsumTest(test.TestCase):
       'i,ijk,j->k',
       'ij,ij,jk,kl->il',
       'ij,kj,il,jm->ml',
-
       'a,ab,abc->abc',
       'a,b,ab->ab',
       'ab,ab,c->',
@@ -173,25 +175,21 @@ class EinsumTest(test.TestCase):
       'ab,ab,cd,cd->ac',
       'ab,ab,cd,cd->cd',
       'ab,ab,cd,cd,ef,ef->',
-
       'ab,cd,ef->abcdef',
       'ab,cd,ef->acdf',
       'ab,cd,de->abcde',
       'ab,cd,de->be',
       'ab,bcd,cd->abcd',
       'ab,bcd,cd->abd',
-
       'eb,cb,fb->cef',
       'abcd,ad',
       'bd,db,eac->ace',
       'ba,ac,da->bcd',
-
       'ab,ab',
       'ab,ba',
       'abc,abc',
       'abc,bac',
       'abc,cba',
-
       'dba,ead,cad->bce',
       'aef,fbc,dca->bde',
   ]
@@ -223,7 +221,7 @@ class EinsumTest(test.TestCase):
 
   dim_mismatch_cases = [('ijk,jkl->il', [(2, 3, 4), (3, 5, 6)])]
 
-  def test_simple(self):
+  def disabled_test_simple(self):
     for case in self.simple_cases:
       self.run_test(case)
 
@@ -234,10 +232,8 @@ class EinsumTest(test.TestCase):
   def test_invalid(self):
     for axes in self.invalid_cases:
       inputs = [
-          array_ops.placeholder(
-              dtypes.float32, shape=(3, 4)),
-          array_ops.placeholder(
-              dtypes.float32, shape=(3, 4)),
+          array_ops.placeholder(dtypes.float32, shape=(3, 4)),
+          array_ops.placeholder(dtypes.float32, shape=(3, 4)),
       ]
       with self.assertRaises(ValueError):
         _ = special_math_ops.einsum(axes, *inputs)
@@ -245,16 +241,22 @@ class EinsumTest(test.TestCase):
   def test_invalid_keyword_arguments(self):
     m0 = array_ops.placeholder(dtypes.int32, shape=(1, None))
     m1 = array_ops.placeholder(dtypes.int32, shape=(None, 1))
-    with self.assertRaisesRegexp(TypeError,
+    with self.assertRaisesRegexp(
+        TypeError,
         'invalid keyword arguments for this function: invalid1, invalid2'):
-      _ = special_math_ops.einsum('ij,jk->ik', m0, m1, name="name",
-                                  invalid1="value1", invalid2="value2")
+      _ = special_math_ops.einsum(
+          'ij,jk->ik',
+          m0,
+          m1,
+          name='name',
+          invalid1='value1',
+          invalid2='value2')
 
   def test_dim_mismatch(self):
     for axes, input_shapes in self.dim_mismatch_cases:
       inputs = [
-          array_ops.placeholder(
-              dtypes.float32, shape=shape) for shape in input_shapes
+          array_ops.placeholder(dtypes.float32, shape=shape)
+          for shape in input_shapes
       ]
       with self.assertRaises(ValueError):
         _ = special_math_ops.einsum(axes, *inputs)
@@ -291,8 +293,8 @@ class EinsumTest(test.TestCase):
             m0: [[1, 2, 3]],
             m1: [[2], [1], [1]],
         }
-        np.testing.assert_almost_equal(
-            [[7]], sess.run(out, feed_dict=feed_dict))
+        np.testing.assert_almost_equal([[7]], sess.run(
+            out, feed_dict=feed_dict))
 
     with ops.Graph().as_default():
       m0 = array_ops.placeholder(dtypes.int32, shape=(None, 3))
@@ -312,11 +314,11 @@ class EinsumTest(test.TestCase):
       out = special_math_ops.einsum('ijk,kl->ijl', m0, m1)
       with session.Session() as sess:
         feed_dict = {
-            m0: [[[1,2]]],
+            m0: [[[1, 2]]],
             m1: [[3], [2]],
         }
-        np.testing.assert_almost_equal(
-            [[[7]]], sess.run(out, feed_dict=feed_dict))
+        np.testing.assert_almost_equal([[[7]]],
+                                       sess.run(out, feed_dict=feed_dict))
 
     with ops.Graph().as_default():
       m0 = array_ops.placeholder(dtypes.int32, shape=(2, 1))
@@ -325,10 +327,10 @@ class EinsumTest(test.TestCase):
       with session.Session() as sess:
         feed_dict = {
             m0: [[3], [2]],
-            m1: [[[1,2]]],
+            m1: [[[1, 2]]],
         }
-        np.testing.assert_almost_equal(
-            [[[7]]], sess.run(out, feed_dict=feed_dict))
+        np.testing.assert_almost_equal([[[7]]],
+                                       sess.run(out, feed_dict=feed_dict))
 
     with ops.Graph().as_default():
       m0 = array_ops.placeholder(dtypes.int32, shape=(None, None, 2))
@@ -339,8 +341,8 @@ class EinsumTest(test.TestCase):
             m0: [[[1, 2]]],
             m1: [3, 2],
         }
-        np.testing.assert_almost_equal(
-           [[7]], sess.run(out, feed_dict=feed_dict))
+        np.testing.assert_almost_equal([[7]], sess.run(
+            out, feed_dict=feed_dict))
 
     with ops.Graph().as_default():
       m0 = array_ops.placeholder(dtypes.int32, shape=(None, 2, None, 2))
@@ -351,8 +353,8 @@ class EinsumTest(test.TestCase):
             m0: [[[[1, 2]], [[2, 1]]]],
             m1: [[3, 2]],
         }
-        np.testing.assert_almost_equal(
-            [[[7, 8]]], sess.run(out, feed_dict=feed_dict))
+        np.testing.assert_almost_equal([[[7, 8]]],
+                                       sess.run(out, feed_dict=feed_dict))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/spectral_ops.py b/tensorflow/python/ops/spectral_ops.py
index 69f868c67ada748ef76029155e470d79a643cbf4..a5796882768a87c76e0acdec9b3d99caf41e02eb 100644
--- a/tensorflow/python/ops/spectral_ops.py
+++ b/tensorflow/python/ops/spectral_ops.py
@@ -41,6 +41,7 @@ from tensorflow.python.ops import array_ops as _array_ops
 from tensorflow.python.ops import gen_spectral_ops
 from tensorflow.python.ops import math_ops as _math_ops
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _infer_fft_length_for_rfft(input_tensor, fft_rank):
@@ -164,11 +165,17 @@ ifft2d = gen_spectral_ops.ifft2d
 fft3d = gen_spectral_ops.fft3d
 ifft3d = gen_spectral_ops.ifft3d
 rfft = _rfft_wrapper(gen_spectral_ops.rfft, 1, "rfft")
+tf_export("spectral.rfft")(rfft)
 irfft = _irfft_wrapper(gen_spectral_ops.irfft, 1, "irfft")
+tf_export("spectral.irfft")(irfft)
 rfft2d = _rfft_wrapper(gen_spectral_ops.rfft2d, 2, "rfft2d")
+tf_export("spectral.rfft2d")(rfft2d)
 irfft2d = _irfft_wrapper(gen_spectral_ops.irfft2d, 2, "irfft2d")
+tf_export("spectral.irfft2d")(irfft2d)
 rfft3d = _rfft_wrapper(gen_spectral_ops.rfft3d, 3, "rfft3d")
+tf_export("spectral.rfft3d")(rfft3d)
 irfft3d = _irfft_wrapper(gen_spectral_ops.irfft3d, 3, "irfft3d")
+tf_export("spectral.irfft3d")(irfft3d)
 
 
 def _validate_dct_arguments(dct_type, n, axis, norm):
@@ -184,6 +191,7 @@ def _validate_dct_arguments(dct_type, n, axis, norm):
 
 
 # TODO(rjryan): Implement `type`, `n` and `axis` parameters.
+@tf_export("spectral.dct")
 def dct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disable=redefined-builtin
   """Computes the 1D [Discrete Cosine Transform (DCT)][dct] of `input`.
 
diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py
index 30bf4e4ef1b96ea68e9020621f37551ac619a3c2..f6d9111009dc4f6a58ac81e7071ed7fe406600fa 100644
--- a/tensorflow/python/ops/standard_ops.py
+++ b/tensorflow/python/ops/standard_ops.py
@@ -25,7 +25,9 @@ import sys as _sys
 # Imports the following modules so that @RegisterGradient get executed.
 from tensorflow.python.ops import array_grad
 from tensorflow.python.ops import data_flow_grad
+from tensorflow.python.ops import manip_grad
 from tensorflow.python.ops import math_grad
+from tensorflow.python.ops import manip_grad
 from tensorflow.python.ops import sparse_grad
 from tensorflow.python.ops import spectral_grad
 from tensorflow.python.ops import state_grad
@@ -42,11 +44,12 @@ from tensorflow.python.ops.special_math_ops import *
 # TODO(vrv): Switch to import * once we're okay with exposing the module.
 from tensorflow.python.ops.confusion_matrix import confusion_matrix
 from tensorflow.python.ops.control_flow_ops import Assert
+from tensorflow.python.ops.control_flow_ops import case
+from tensorflow.python.ops.control_flow_ops import cond
 from tensorflow.python.ops.control_flow_ops import group
 from tensorflow.python.ops.control_flow_ops import no_op
-from tensorflow.python.ops.control_flow_ops import tuple
-from tensorflow.python.ops.control_flow_ops import cond
-from tensorflow.python.ops.control_flow_ops import case
+from tensorflow.python.ops.control_flow_ops import tuple  # pylint: disable=redefined-builtin
+# pylint: enable=redefined-builtin
 from tensorflow.python.ops.control_flow_ops import while_loop
 from tensorflow.python.ops.data_flow_ops import *
 from tensorflow.python.ops.functional_ops import *
@@ -59,6 +62,7 @@ from tensorflow.python.ops.logging_ops import Print
 from tensorflow.python.ops.logging_ops import get_summary_op
 from tensorflow.python.ops.lookup_ops import initialize_all_tables
 from tensorflow.python.ops.lookup_ops import tables_initializer
+from tensorflow.python.ops.manip_ops import *
 from tensorflow.python.ops.math_ops import *
 from tensorflow.python.ops.numerics import *
 from tensorflow.python.ops.parsing_ops import *
@@ -105,6 +109,7 @@ from tensorflow.python.ops import init_ops as _init_ops
 from tensorflow.python.ops import io_ops as _io_ops
 from tensorflow.python.ops import linalg_ops as _linalg_ops
 from tensorflow.python.ops import logging_ops as _logging_ops
+from tensorflow.python.ops import manip_ops as _manip_ops
 from tensorflow.python.ops import math_ops as _math_ops
 from tensorflow.python.ops import numerics as _numerics
 from tensorflow.python.ops import parsing_ops as _parsing_ops
@@ -264,34 +269,36 @@ _allowed_symbols = (_allowed_symbols_array_ops +
                     _allowed_symbols_misc +
                     _allowed_symbols_partitioned_variables)
 
-remove_undocumented(__name__, _allowed_symbols,
-                    [_sys.modules[__name__],
-                     _array_ops,
-                     _check_ops,
-                     _clip_ops,
-                     _confusion_matrix,
-                     _control_flow_ops,
-                     _constant_op,
-                     _data_flow_ops,
-                     _functional_ops,
-                     _gradients,
-                     _histogram_ops,
-                     _init_ops,
-                     _io_ops,
-                     _linalg_ops,
-                     _logging_ops,
-                     _math_ops,
-                     _numerics,
-                     _parsing_ops,
-                     _partitioned_variables,
-                     _random_ops,
-                     _script_ops,
-                     _session_ops,
-                     _sparse_ops,
-                     _special_math_ops,
-                     _state_ops,
-                     _string_ops,
-                     _template,
-                     _tensor_array_ops,
-                     _variable_scope,
-                     _variables,])
+remove_undocumented(__name__, _allowed_symbols, [
+    _sys.modules[__name__],
+    _array_ops,
+    _check_ops,
+    _clip_ops,
+    _confusion_matrix,
+    _control_flow_ops,
+    _constant_op,
+    _data_flow_ops,
+    _functional_ops,
+    _gradients,
+    _histogram_ops,
+    _init_ops,
+    _io_ops,
+    _linalg_ops,
+    _logging_ops,
+    _manip_ops,
+    _math_ops,
+    _numerics,
+    _parsing_ops,
+    _partitioned_variables,
+    _random_ops,
+    _script_ops,
+    _session_ops,
+    _sparse_ops,
+    _special_math_ops,
+    _state_ops,
+    _string_ops,
+    _template,
+    _tensor_array_ops,
+    _variable_scope,
+    _variables,
+])
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index dfc657893cd6e7622833e4234d2f490dc1f2b690..6c0a090d16bb328de40f02edf9865a0e0a62d385 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -89,6 +89,7 @@ from tensorflow.python.ops import gen_state_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_state_ops import *
+from tensorflow.python.util.tf_export import tf_export
 # pylint: enable=wildcard-import
 
 
@@ -189,6 +190,7 @@ def is_variable_initialized(ref, name=None):
                                                            name=name)
 
 
+@tf_export("assign_sub")
 def assign_sub(ref, value, use_locking=None, name=None):
   """Update 'ref' by subtracting 'value' from it.
 
@@ -217,6 +219,7 @@ def assign_sub(ref, value, use_locking=None, name=None):
   return ref.assign_sub(value)
 
 
+@tf_export("assign_add")
 def assign_add(ref, value, use_locking=None, name=None):
   """Update 'ref' by adding 'value' to it.
 
@@ -245,6 +248,7 @@ def assign_add(ref, value, use_locking=None, name=None):
   return ref.assign_add(value)
 
 
+@tf_export("assign")
 def assign(ref, value, validate_shape=None, use_locking=None, name=None):
   """Update 'ref' by assigning 'value' to it.
 
@@ -274,9 +278,10 @@ def assign(ref, value, validate_shape=None, use_locking=None, name=None):
     return gen_state_ops.assign(
         ref, value, use_locking=use_locking, name=name,
         validate_shape=validate_shape)
-  return ref.assign(value)
+  return ref.assign(value, name=name)
 
 
+@tf_export("count_up_to")
 def count_up_to(ref, limit, name=None):
   r"""Increments 'ref' until it reaches 'limit'.
 
@@ -299,6 +304,7 @@ def count_up_to(ref, limit, name=None):
       ref.handle, limit, T=ref.dtype, name=name)
 
 
+@tf_export("scatter_update")
 def scatter_update(ref, indices, updates, use_locking=True, name=None):
   # pylint: disable=line-too-long
   r"""Applies sparse updates to a variable reference.
@@ -347,5 +353,70 @@ def scatter_update(ref, indices, updates, use_locking=True, name=None):
   if ref.dtype._is_ref_dtype:
     return gen_state_ops.scatter_update(ref, indices, updates,
                                         use_locking=use_locking, name=name)
-  return gen_resource_variable_ops.resource_scatter_update(
-      ref.handle, indices, ops.convert_to_tensor(updates, ref.dtype), name=name)
+  return ref._lazy_read(gen_resource_variable_ops.resource_scatter_update(  # pylint: disable=protected-access
+      ref.handle, indices, ops.convert_to_tensor(updates, ref.dtype),
+      name=name))
+
+
+@tf_export("scatter_nd_update")
+def scatter_nd_update(ref, indices, updates, use_locking=True, name=None):
+  r"""Applies sparse `updates` to individual values or slices in a Variable.
+
+  `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+  `indices` must be integer tensor, containing indices into `ref`.
+  It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+  The innermost dimension of `indices` (with length `K`) corresponds to
+  indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+  dimension of `ref`.
+
+  `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+  ```
+  [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+  ```
+
+  For example, say we want to update 4 scattered elements to a rank-1 tensor to
+  8 elements. In Python, that update would look like this:
+
+  ```python
+      ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+      indices = tf.constant([[4], [3], [1] ,[7]])
+      updates = tf.constant([9, 10, 11, 12])
+      update = tf.scatter_nd_update(ref, indices, updates)
+      with tf.Session() as sess:
+        print sess.run(update)
+  ```
+
+  The resulting update to ref would look like this:
+
+      [1, 11, 3, 10, 9, 6, 7, 12]
+
+  See @{tf.scatter_nd} for more details about how to make updates to
+  slices.
+
+  Args:
+    ref: A Variable.
+    indices: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+      A Tensor. Must be one of the following types: int32, int64.
+      A tensor of indices into ref.
+    updates: A `Tensor`. Must have the same type as `ref`.
+      A Tensor. Must have the same type as ref. A tensor of updated
+      values to add to ref.
+    use_locking: An optional `bool`. Defaults to `True`.
+      An optional bool. Defaults to True. If True, the assignment will
+      be protected by a lock; otherwise the behavior is undefined,
+      but may exhibit less contention.
+    name: A name for the operation (optional).
+
+  Returns:
+    The value of the variable after the update.
+  """
+  if ref.dtype._is_ref_dtype:
+    return gen_state_ops.scatter_nd_update(
+        ref, indices, updates, use_locking, name)
+  with ops.control_dependencies([gen_state_ops.resource_scatter_nd_update(
+      ref.handle, indices, ops.convert_to_tensor(updates, dtype=ref.dtype),
+      use_locking, name)]):
+    return ref.read_value()
diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py
index f30e79a108f159bb03237f8c232d1ee467ff458d..b8c39d91b41790c6441594b175e8eaa03620e1ec 100644
--- a/tensorflow/python/ops/string_ops.py
+++ b/tensorflow/python/ops/string_ops.py
@@ -47,9 +47,11 @@ from tensorflow.python.ops import math_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_string_ops import *
 from tensorflow.python.util import deprecation
+from tensorflow.python.util.tf_export import tf_export
 # pylint: enable=wildcard-import
 
 
+@tf_export("string_split")
 def string_split(source, delimiter=" ", skip_empty=True):  # pylint: disable=invalid-name
   """Split elements of `source` based on `delimiter` into a `SparseTensor`.
 
@@ -120,6 +122,7 @@ def _reduce_join_reduction_dims(x, axis, reduction_indices):
     return math_ops.range(array_ops.rank(x) - 1, -1, -1)
 
 
+@tf_export("reduce_join")
 def reduce_join(inputs, axis=None,
                 keep_dims=False,
                 separator="",
diff --git a/tensorflow/python/ops/summary_ops.py b/tensorflow/python/ops/summary_ops.py
index 2cf2eda16e69bcfab766c7adaa4b5d8b40d99723..7f4f4ce5ab4ee2bd309932cb81f05775996371d6 100644
--- a/tensorflow/python/ops/summary_ops.py
+++ b/tensorflow/python/ops/summary_ops.py
@@ -25,9 +25,11 @@ from tensorflow.python.ops import summary_op_util
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_logging_ops import *
+from tensorflow.python.util.tf_export import tf_export
 # pylint: enable=wildcard-import
 
 
+@tf_export("summary.tensor_summary")
 def tensor_summary(name,
                    tensor,
                    summary_description=None,
diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py
index 98578b799a814962b560e8ed40868b2e94010f4e..806fdd3da7aa6de01b7cd4d9d36dbf43f6139db6 100644
--- a/tensorflow/python/ops/template.py
+++ b/tensorflow/python/ops/template.py
@@ -22,15 +22,20 @@ import functools
 import traceback
 
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import tf_contextlib
+from tensorflow.python.util import tf_decorator
 from tensorflow.python.util.deprecation import deprecated
+from tensorflow.python.util.tf_export import tf_export
 
 
 __all__ = ["make_template"]
 
 
+@tf_export("make_template")
 def make_template(name_, func_, create_scope_now_=False, unique_name_=None,
                   custom_getter_=None, **kwargs):
   """Given an arbitrary function, wrap it so that it does variable sharing.
@@ -127,25 +132,94 @@ def make_template(name_, func_, create_scope_now_=False, unique_name_=None,
 
   Returns:
     A function to encapsulate a set of variables which should be created once
-    and reused. An enclosing scope will created, either where `make_template`
-    is called, or wherever the result is called, depending on the value of
+    and reused. An enclosing scope will be created either when `make_template`
+    is called or when the result is called, depending on the value of
     `create_scope_now_`. Regardless of the value, the first time the template
     is called it will enter the scope with no reuse, and call `func_` to create
     variables, which are guaranteed to be unique. All subsequent calls will
     re-enter the scope and reuse those variables.
 
   Raises:
-    ValueError: if the name is None.
+    ValueError: if `name_` is None.
   """
+  return make_template_internal(
+      name_,
+      func_,
+      create_scope_now_,
+      unique_name_,
+      custom_getter_,
+      create_graph_function_=False,
+      **kwargs)
+
+
+def make_template_internal(name_,
+                           func_,
+                           create_scope_now_=False,
+                           unique_name_=None,
+                           custom_getter_=None,
+                           create_graph_function_=False,
+                           **kwargs):
+  """Make a template, optionally compiling func_ into a graph function.
+
+  See `make_template` for full documentation.
+
+  Args:
+    name_: A name for the scope created by this template. If necessary, the name
+      will be made unique by appending `_N` to the name.
+    func_: The function to wrap.
+    create_scope_now_: Boolean controlling whether the scope should be created
+      when the template is constructed or when the template is called. Default
+      is False, meaning the scope is created when the template is called.
+    unique_name_: When used, it overrides name_ and is not made unique. If a
+      template of the same scope/unique_name already exists and reuse is false,
+      an error is raised. Defaults to None. If executing eagerly, must be None.
+    custom_getter_: Optional custom getter for variables used in `func_`. See
+      the @{tf.get_variable} `custom_getter` documentation for
+      more information.
+    create_graph_function_: When True, `func_` will be executed as a graph
+      function. This implies that `func_` must satisfy the properties that
+      `function.defun` requires of functions: See the documentation of
+      `function.defun` for details. When executing eagerly, setting this flag to
+      True can improve performance. Regardless of whether eager execution is
+      enabled, enabling this flag gives the caller access to graph-function
+      semantics, i.e., accesses to variables are totally ordered and
+      side-effecting ops are not pruned.
+    **kwargs: Keyword arguments to apply to `func_`.
+
+  Returns:
+    A function to encapsulate a set of variables which should be created once
+    and reused. An enclosing scope will be created either when `make_template`
+    is called or when the result is called, depending on the value of
+    `create_scope_now_`. Regardless of the value, the first time the template
+    is called it will enter the scope with no reuse, and call `func_` to create
+    variables, which are guaranteed to be unique. All subsequent calls will
+    re-enter the scope and reuse those variables.
+
+  Raises:
+    ValueError: if `name_` is None.
+    ValueError: if `unique_name_` is not None and eager execution is enabled.
+  """
+
   if kwargs:
-    func_ = functools.partial(func_, **kwargs)
+    func_ = tf_decorator.make_decorator(func_, functools.partial(
+        func_, **kwargs))
   if context.in_eager_mode():
+    if unique_name_ is not None:
+      raise ValueError(
+          "unique_name_ cannot be used when eager exeuction is enabled.")
     return EagerTemplate(
-        name_, func_, create_scope_now=create_scope_now_,
-        unique_name=unique_name_, custom_getter=custom_getter_)
+        name_,
+        func_,
+        create_scope_now=create_scope_now_,
+        custom_getter=custom_getter_,
+        create_graph_function=create_graph_function_)
   return Template(
-      name_, func_, create_scope_now=create_scope_now_,
-      unique_name=unique_name_, custom_getter=custom_getter_)
+      name_,
+      func_,
+      create_scope_now=create_scope_now_,
+      unique_name=unique_name_,
+      custom_getter=custom_getter_,
+      create_graph_function=create_graph_function_)
 
 
 def _skip_common_stack_elements(stacktrace, base_case):
@@ -169,7 +243,7 @@ class Template(object):
   """
 
   def __init__(self, name, func, create_scope_now=False, unique_name=None,
-               custom_getter=None):
+               custom_getter=None, create_graph_function=False):
     """Creates a template for the given function.
 
     Args:
@@ -183,18 +257,25 @@ class Template(object):
         through much lower level code, and you want to be sure of the scope
         name without knowing exactly where it will be first called. If set to
         True, the scope will be created in the constructor, and all subsequent
-        times in __call__, leading to a trailing numeral being added to the
+        times in `__call__`, leading to a trailing numeral being added to the
         names of all created Tensors. If set to False, the scope will be created
         at the first call location.
-      unique_name: When used, it overrides name_ and is not made unique. If a
+      unique_name: When used, it overrides `name` and is not made unique. If a
         template of the same scope/unique_name already exists and reuse is
         false, an error is raised. Defaults to None.
-      custom_getter: optional custom getter to pass to variable_scope()
+      custom_getter: optional custom getter to pass to `variable_scope()`
+      create_graph_function: When True, `func` will be executed as a graph
+        function. Enabling this flag gives the caller access to graph-function
+        semantics, i.e., accesses to variables are totally ordered and
+        side-effecting ops are not pruned.
 
     Raises:
-      ValueError: if the name is None.
+      ValueError: if `name` is None.
     """
-    self._func = func
+    if create_graph_function:
+      self._func = function.defun(func)
+    else:
+      self._func = func
     self._stacktrace = traceback.format_stack()[:-2]
     self._name = name
     self._unique_name = unique_name
@@ -213,14 +294,17 @@ class Template(object):
     # which is not the same as whether the scope has been created.
     self._variables_created = False
 
-  def _call_func(self, args, kwargs, check_for_new_variables):
+  def _call_func(self, args, kwargs):
     try:
       vars_at_start = len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
       trainable_at_start = len(
           ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
-
       result = self._func(*args, **kwargs)
-      if check_for_new_variables:
+
+      if self._variables_created:
+        # Variables were previously created, implying this is not the first
+        # time the template has been called. Check to make sure that no new
+        # trainable variables were created this time around.
         trainable_variables = ops.get_collection(
             ops.GraphKeys.TRAINABLE_VARIABLES)
         # If a variable that we intend to train is created as a side effect
@@ -240,6 +324,8 @@ class Template(object):
                        "the first time, perhaps you used tf.Variable when you "
                        "meant tf.get_variable: %s",
                        variables[vars_at_start:])
+      else:
+        self._variables_created = True
       return result
     except Exception as exc:
       # Reraise the exception, but append the original definition to the
@@ -259,20 +345,11 @@ class Template(object):
 
   def __call__(self, *args, **kwargs):
     if self._variable_scope:
-      if self._variables_created:
-        # This is not the first visit to __call__, so variables have already
-        # been created, and we want to reuse them.
-        with variable_scope.variable_scope(self._variable_scope, reuse=True):
-          return self._call_func(args, kwargs, check_for_new_variables=True)
-      else:
-        # This is the first visit to __call__, but the scope has already been
-        # created in the constructor. Set _variables_created after the inner
-        # function is successfully called so that subsequent calls take the if
-        # branch above.
-        with variable_scope.variable_scope(self._variable_scope):
-          result = self._call_func(args, kwargs, check_for_new_variables=False)
-          self._variables_created = True
-          return result
+      # Only reuse variables if they were already created.
+      with variable_scope.variable_scope(
+          self._variable_scope, reuse=self._variables_created):
+        result = self._call_func(args, kwargs)
+      return result
     else:
       # The scope was not created at construction time, so create it here.
       # Subsequent calls should reuse variables.
@@ -280,8 +357,7 @@ class Template(object):
           self._unique_name, self._name,
           custom_getter=self._custom_getter) as vs:
         self._variable_scope = vs
-        result = self._call_func(args, kwargs, check_for_new_variables=False)
-        self._variables_created = True
+        result = self._call_func(args, kwargs)
         return result
 
   @property
@@ -307,6 +383,12 @@ class Template(object):
       # To prevent partial matches on the scope_name, we add '/' at the end.
       return name if name[-1] == "/" else name + "/"
 
+  @property
+  def variables(self):
+    """Returns the list of global and local variables created by the Template.
+    """
+    return self.global_variables + self.local_variables
+
   @property
   def trainable_variables(self):
     """Returns the list of trainable variables created by the Template."""
@@ -316,6 +398,14 @@ class Template(object):
     else:
       return []
 
+  @property
+  def non_trainable_variables(self):
+    """Returns the list of non-trainable variables created by the Template."""
+    # TODO(apassos) Make sure it matches Eager when using local variables.
+    global_variables = self.global_variables
+    trainable_variables = set(self.trainable_variables)
+    return [x for x in global_variables if x not in trainable_variables]
+
   @property
   def global_variables(self):
     """Returns the list of global variables created by the Template."""
@@ -334,6 +424,21 @@ class Template(object):
     else:
       return []
 
+  @property
+  def weights(self):
+    """List of weights/variables created by the Template."""
+    return self.variables
+
+  @property
+  def trainable_weights(self):
+    """List of trainable weights/variables created by the Template."""
+    return self.trainable_variables
+
+  @property
+  def non_trainable_weights(self):
+    """List of non-trainable weights/variables created by the Template."""
+    return self.non_trainable_variables
+
   @property
   @deprecated(
       "2017-02-21", "The .var_scope property is deprecated. Please change your "
@@ -343,6 +448,61 @@ class Template(object):
     return self._variable_scope
 
 
+class _EagerTemplateVariableStore(object):
+  """Wrapper around EagerVariableStore to support nesting EagerTemplates.
+  """
+
+  def __init__(self, variable_scope_name):
+    self._variable_scope_name = variable_scope_name
+    default = variable_scope._get_default_variable_store()  # pylint: disable=protected-access
+    if default._store_eager_variables:  # pylint: disable=protected-access
+      self._eager_variable_store = variable_scope.EagerVariableStore(default)
+    else:
+      self._eager_variable_store = variable_scope.EagerVariableStore()
+
+  def set_variable_scope_name(self, variable_scope_name):
+    self._variable_scope_name = variable_scope_name
+
+  @tf_contextlib.contextmanager
+  def as_default(self):
+    try:
+      with self._eager_variable_store.as_default():
+        yield
+    finally:
+      # Each _EagerTemplateVariableStore object lives underneath a variable
+      # scope (see EagerTemplate.__call__). This variable scope's subscopes are
+      # closed when the EagerTemplate object returns from __call__. For
+      # top-level _EagerTemplateVariableStore objects, the variable store to
+      # which the variable scope is attached is different from the
+      # EagerVariableStore; as such it is necessary to close its subscopes
+      # here as well.
+      if self._variable_scope_name is None:
+        raise RuntimeError("A variable scope must be set before an "
+                           "_EagerTemplateVariableStore object exits.")
+      self._eager_variable_store._store.close_variable_subscopes(  # pylint: disable=protected-access
+          self._variable_scope_name)
+
+  def _variables_in_scope(self, variable_list):
+    if self._variable_scope_name is None:
+      raise RuntimeError(
+          "A variable scope must be set before variables can be accessed.")
+    return [
+        v for v in variable_list
+        if v.name.startswith(self._variable_scope_name + "/")
+    ]
+
+  def variables(self):
+    return self._variables_in_scope(self._eager_variable_store.variables())
+
+  def trainable_variables(self):
+    return self._variables_in_scope(
+        self._eager_variable_store.trainable_variables())
+
+  def non_trainable_variables(self):
+    return self._variables_in_scope(
+        self._eager_variable_store.non_trainable_variables())
+
+
 class EagerTemplate(Template):
   """Wrap a function to aid in variable sharing in Eager mode.
 
@@ -355,8 +515,8 @@ class EagerTemplate(Template):
   call.
   """
 
-  def __init__(self, name, func, create_scope_now=False, unique_name=None,
-               custom_getter=None):
+  def __init__(self, name, func, create_scope_now=False, custom_getter=None,
+               create_graph_function=False):
     """Creates a template for the given function.
 
     Args:
@@ -370,43 +530,45 @@ class EagerTemplate(Template):
         through much lower level code, and you want to be sure of the scope
         name without knowing exactly where it will be first called. If set to
         True, the scope will be created in the constructor, and all subsequent
-        times in __call__, leading to a trailing numeral being added to the
+        times in `__call__`, leading to a trailing numeral being added to the
         names of all created Tensors. If set to False, the scope will be created
         at the first call location.
-      unique_name: When used, it overrides name_ and is not made unique. If a
-        template of the same scope/unique_name already exists and reuse is
-        false, an error is raised. Defaults to None.
-      custom_getter: optional custom getter to pass to variable_scope()
+      custom_getter: optional custom getter to pass to `variable_scope()`
+      create_graph_function: When True, `func` will be executed as a graph
+        function. Enabling this flag allows the caller to reap the performance
+        benefits associated with executing graphs, at the cost of sacrificing
+        debuggability; however, not all Python functions can be compiled into
+        graph functions. See the documentation for `function.defun` for details.
 
     Raises:
-      RuntimeError: if eager mode is not enabled.
-      ValueError: if the name is None or unique_name is provided.
+      RuntimeError: if eager execution is not enabled.
     """
     if not context.in_eager_mode():
       raise RuntimeError(
           "{} objects can only be used when eager execution is enabled, use "
           "tf.Template for graph construction".
           format(type(self)))
-    if unique_name:
-      raise ValueError("unique_name cannot be used in eager mode.")
-    super(EagerTemplate, self).__init__(name, func, create_scope_now,
-                                        unique_name, custom_getter)
-    # Create an eager variable store only if the current variable store cannot
-    # store eager variables. This should allow for correct nesting.
-    default_vstore = variable_scope._get_default_variable_store()  # pylint: disable=protected-access
-    if default_vstore._store_eager_variables:  # pylint: disable=protected-access
-      raise ValueError("Nested EagerTemaplates are not currently supported.")
+    super(EagerTemplate, self).__init__(name, func, create_scope_now, None,
+                                        custom_getter, create_graph_function)
+    if self._variable_scope is not None:
+      variable_scope_name = self._variable_scope.name
     else:
-      self._eager_variable_store = variable_scope.EagerVariableStore()
+      # Defer setting the variable scope name until the variable scope
+      # is created in __call__.
+      variable_scope_name = None
+    self._template_store = _EagerTemplateVariableStore(variable_scope_name)
 
-  def _call_func(self, args, kwargs, check_for_new_variables):
+  def _call_func(self, args, kwargs):
     try:
-      vars_at_start = self._eager_variable_store.variables()
-      trainable_at_start = self._eager_variable_store.trainable_variables()
-
+      vars_at_start = self._template_store.variables()
+      trainable_at_start = self._template_store.trainable_variables()
       result = self._func(*args, **kwargs)
-      if check_for_new_variables:
-        trainable_variables = self._eager_variable_store.trainable_variables()
+
+      if self._variables_created:
+        # Variables were previously created, implying this is not the first
+        # time the template has been called. Check to make sure that no new
+        # trainable variables were created this time around.
+        trainable_variables = self._template_store.trainable_variables()
         # If a variable that we intend to train is created as a side effect
         # of creating a template, then that is almost certainly an error.
         if len(trainable_at_start) != len(trainable_variables):
@@ -419,12 +581,14 @@ class EagerTemplate(Template):
         # Non-trainable tracking variables are a legitimate reason why a new
         # variable would be created, but it is a relatively advanced use-case,
         # so log it.
-        variables = self._eager_variable_store.variables()
+        variables = self._template_store.variables()
         if len(vars_at_start) != len(variables):
           logging.info("New variables created when calling a template after "
                        "the first time, perhaps you used tf.Variable when you "
                        "meant tf.get_variable: %s",
                        list(set(variables) - set(vars_at_start)))
+      else:
+        self._variables_created = True
       return result
     except Exception as exc:
       # Reraise the exception, but append the original definition to the
@@ -443,26 +607,15 @@ class EagerTemplate(Template):
       raise
 
   def __call__(self, *args, **kwargs):
+    # In both branches below, the template store is installed as default after
+    # the variable scope is opened in order to ensure that templates nested at
+    # the same level correctly uniquify lower variable scope names.
     if self._variable_scope:
-      if self._variables_created:
-        # This is not the first visit to __call__, so variables have already
-        # been created, and we want to reuse them.
-        with variable_scope.variable_scope(self._variable_scope,
-                                           reuse=variable_scope.AUTO_REUSE):
-          with self._eager_variable_store.as_default():
-            return self._call_func(args, kwargs, check_for_new_variables=True)
-      else:
-        # This is the first visit to __call__, but the scope has already been
-        # created in the constructor. Set _variables_created after the inner
-        # function is successfully called so that subsequent calls take the if
-        # branch above.
-        with variable_scope.variable_scope(self._variable_scope,
-                                           reuse=variable_scope.AUTO_REUSE):
-          with self._eager_variable_store.as_default():
-            result = self._call_func(args, kwargs,
-                                     check_for_new_variables=False)
-        self._variables_created = True
-        return result
+      with variable_scope.variable_scope(
+          self._variable_scope, reuse=variable_scope.AUTO_REUSE):
+        with self._template_store.as_default():
+          result = self._call_func(args, kwargs)
+      return result
     else:
       # The scope was not created at construction time, so create it here.
       # Subsequent calls should reuse variables.
@@ -470,10 +623,11 @@ class EagerTemplate(Template):
           self._unique_name, self._name,
           custom_getter=self._custom_getter) as vs:
         self._variable_scope = vs
-        with self._eager_variable_store.as_default():
-          result = self._call_func(args, kwargs,
-                                   check_for_new_variables=False)
-        self._variables_created = True
+        # Because the scope was not created at construction time, the template
+        # store's variable scope name is unset; set it here.
+        self._template_store.set_variable_scope_name(vs.name)
+        with self._template_store.as_default():
+          result = self._call_func(args, kwargs)
         return result
 
   @property
@@ -501,20 +655,34 @@ class EagerTemplate(Template):
 
   @property
   def variables(self):
-    """Returns the list of trainable variables created by the Template."""
+    """Returns the list of variables created by the Template."""
     # Currently there is no local variable in Eager mode.
-    return self._eager_variable_store.variables()
+    if not self._variables_created:
+      return []
+    return self._template_store.variables()
 
   @property
   def trainable_variables(self):
     """Returns the list of trainable variables created by the Template."""
     # Currently there is no local variable in Eager mode.
-    return self._eager_variable_store.trainable_variables()
+    if not self._variables_created:
+      return []
+    return self._template_store.trainable_variables()
+
+  @property
+  def non_trainable_variables(self):
+    """Returns the list of non-trainable variables created by the Template."""
+    # Currently there is no local variable in Eager mode.
+    if not self._variables_created:
+      return []
+    return self._template_store.non_trainable_variables()
 
   @property
   def global_variables(self):
     """Returns the list of global variables created by the Template."""
     # Currently there is no local variable in Eager mode.
+    if not self._variables_created:
+      return []
     return self.variables
 
   @property
diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py
index 605654d9be7985f4b0d2677cf688c31796db31b5..3c08870146e447d84d4a5f620cbead633d94751f 100644
--- a/tensorflow/python/ops/tensor_array_ops.py
+++ b/tensorflow/python/ops/tensor_array_ops.py
@@ -35,9 +35,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util import tf_should_use
-
-# TODO(ebrevdo): Set to True in Dec. 4, 2017.
-_ENABLE_IDENTICAL_ELEMENT_SHAPES = False
+from tensorflow.python.util.tf_export import tf_export
 
 
 # _GraphTensorArray accesses many of the hidden generated ops, but is in
@@ -150,18 +148,15 @@ class _GraphTensorArray(object):
         # will retroactively set the device value of this op.
         def create():
           """Create the TensorArray op."""
-          ta_kwargs = {}
-          if _ENABLE_IDENTICAL_ELEMENT_SHAPES:
-            ta_kwargs["identical_element_shapes"] = infer_shape
           return gen_data_flow_ops._tensor_array_v3(
               dtype=dtype,
               size=size,
               element_shape=element_shape,
+              identical_element_shapes=infer_shape,
               dynamic_size=dynamic_size,
               clear_after_read=clear_after_read,
               tensor_array_name=tensor_array_name,
-              name=scope,
-              **ta_kwargs)
+              name=scope)
         if colocate_with_first_write_call:
           with ops.device(None), ops.colocate_with(None, ignore_existing=True):
             self._handle, self._flow = create()
@@ -658,7 +653,7 @@ class _EagerTensorArray(object):
     if len(tensors) > len(self._tensor_array) and not self._dynamic_size:
       raise ValueError(
           "Cannot unstack %d tensors into a TensorArray of static size %d" %
-          (len(tensors), len(self._tensors)))
+          (len(tensors), len(self._tensor_array)))
     ta = self._identity_without_array()
     ta._implementation._tensor_array = tensors  # pylint: disable=protected-access
     return ta
@@ -717,6 +712,7 @@ class _EagerTensorArray(object):
 # TensorArray is designed to hide an underlying implementation object
 # and as such accesses many of that object's hidden fields.
 # pylint: disable=protected-access
+@tf_export("TensorArray")
 class TensorArray(object):
   """Class wrapping dynamic-sized, per-time-step, write-once Tensor arrays.
 
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 3643861a16d3c6f6467a0e4427e2a08cc3e167c6..81565a63774da49628d100ef071b02f6311f6af2 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -23,9 +23,11 @@ import collections as collections_lib
 import copy
 import enum  # pylint: disable=g-bad-import-order
 import functools
+import sys
 import traceback
 
 import six
+from six import iteritems
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.eager import context
@@ -39,6 +41,7 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import tf_contextlib
+from tensorflow.python.util.tf_export import tf_export
 
 __all__ = ["AUTO_REUSE", "VariableScope", "get_variable_scope",
            "get_variable", "get_local_variable", "variable_scope",
@@ -185,6 +188,7 @@ class _ReuseMode(enum.Enum):
   # REUSE_TRUE = 3
 
 AUTO_REUSE = _ReuseMode.AUTO_REUSE
+tf_export("AUTO_REUSE").export_constant(__name__, "AUTO_REUSE")
 AUTO_REUSE.__doc__ = """
 When passed in as the value for the `reuse` flag, AUTO_REUSE indicates that
 get_variable() should create the requested variable if it doesn't exist or, if
@@ -767,8 +771,8 @@ class _VariableStore(object):
     if initializer is None:
       initializer, initializing_from_value = self._get_default_initializer(
           name=name, shape=shape, dtype=dtype)
-    # Clear control dependencies while creating the initializer.
-    with ops.control_dependencies(None):
+    # Enter an init scope when creating the initializer.
+    with ops.init_scope():
       if initializing_from_value:
         init_val = initializer
         variable_dtype = None
@@ -784,26 +788,16 @@ class _VariableStore(object):
     if use_resource is None:
       # Set the default value if unspecified.
       use_resource = False
-    if use_resource:
-      v = resource_variable_ops.ResourceVariable(
-          initial_value=init_val,
-          name=name,
-          trainable=trainable,
-          collections=collections,
-          caching_device=caching_device,
-          dtype=variable_dtype,
-          validate_shape=validate_shape,
-          constraint=constraint)
-    else:
-      v = variables.Variable(
-          initial_value=init_val,
-          name=name,
-          trainable=trainable,
-          collections=collections,
-          caching_device=caching_device,
-          dtype=variable_dtype,
-          validate_shape=validate_shape,
-          constraint=constraint)
+    v = variable(
+        initial_value=init_val,
+        name=name,
+        trainable=trainable,
+        collections=collections,
+        caching_device=caching_device,
+        dtype=variable_dtype,
+        validate_shape=validate_shape,
+        constraint=constraint,
+        use_resource=use_resource)
     if context.in_graph_mode() or self._store_eager_variables:
       # In eager mode we do not want to keep default references to Variable
       # objects as this will prevent their memory from being released.
@@ -843,6 +837,7 @@ class _VariableStore(object):
     Raises:
       ValueError: When giving unsupported dtype.
     """
+    del shape
     # If dtype is DT_FLOAT, provide a uniform unit scaling initializer
     if dtype.is_floating:
       initializer = init_ops.glorot_uniform_initializer()
@@ -850,9 +845,8 @@ class _VariableStore(object):
     # If dtype is DT_INT/DT_UINT, provide a default value `zero`
     # If dtype is DT_BOOL, provide a default value `FALSE`
     elif dtype.is_integer or dtype.is_unsigned or dtype.is_bool:
-      initializer = init_ops.zeros_initializer()(
-          shape=shape, dtype=dtype.base_dtype)
-      initializing_from_value = True
+      initializer = init_ops.zeros_initializer()
+      initializing_from_value = False
     # NOTES:Do we need to support for handling DT_STRING and DT_COMPLEX here?
     else:
       raise ValueError("An initializer for variable %s of %s is required"
@@ -862,12 +856,14 @@ class _VariableStore(object):
 
 
 # To stop regularization, use this regularizer
+@tf_export("no_regularizer")
 def no_regularizer(_):
   """Use this function to prevent regularization of variables."""
   return None
 
 
 # TODO(alive): support caching devices and partitioned variables in Eager mode.
+@tf_export("VariableScope")
 class VariableScope(object):
   """Variable scope object to carry defaults to provide to `get_variable`.
 
@@ -1167,6 +1163,7 @@ _VARSTORE_KEY = ("__variable_store",)
 _VARSCOPE_KEY = ("__varscope",)
 
 
+@tf_export("get_variable_scope")
 def get_variable_scope():
   """Returns the current variable scope."""
   scope = ops.get_collection(_VARSCOPE_KEY)
@@ -1217,8 +1214,15 @@ class EagerVariableStore(object):
   ```
   """
 
-  def __init__(self):
-    self._store = _VariableStore()
+  def __init__(self, store=None):
+    if store is not None:
+      if not store._store_eager_variables:  # pylint: disable=protected-access
+        raise ValueError("Cannot construct EagerVariableStore from a "
+                         "VariableStore object that does not hold eager "
+                         "variables.")
+      self._store = store
+    else:
+      self._store = _VariableStore()
     self._store._store_eager_variables = True  # pylint: disable=protected-access
 
   def as_default(self):
@@ -1239,7 +1243,38 @@ class EagerVariableStore(object):
                   key=lambda x: x.name)
     # pylint: enable=protected-access
 
+  def copy(self):
+    """Copy this variable store and all of its contents.
 
+    Variables contained in this store will be copied over to the new variable
+    store, meaning that they can be modified without affecting the variables in
+    this store.
+
+    Returns:
+      A new EagerVariableStore instance containing copied variables.
+    """
+    # pylint: disable=protected-access
+    new_store = EagerVariableStore()
+    for key, var in iteritems(self._store._vars):
+      # Strip device out of variable name.
+      try:
+        index = var.name.index(":")
+      except ValueError:
+        stripped_var_name = var.name
+      else:
+        stripped_var_name = var.name[:index]
+
+      # Create new variable with same value, name, and "trainable" flag.
+      new_var = resource_variable_ops.ResourceVariable(
+          var.read_value(),
+          name=stripped_var_name,
+          trainable=var._trainable)
+      new_store._store._vars[key] = new_var
+    return new_store
+    # pylint: enable=protected-access
+
+
+@tf_export("get_variable")
 def get_variable(name,
                  shape=None,
                  dtype=None,
@@ -1351,6 +1386,7 @@ get_variable.__doc__ = get_variable_or_local_docstring % (
 
 
 @functools.wraps(get_variable)
+@tf_export("get_local_variable")
 def get_local_variable(*args, **kwargs):
   kwargs["trainable"] = False
   if "collections" in kwargs:
@@ -1584,6 +1620,10 @@ class _pure_variable_scope(object):  # pylint: disable=invalid-name
           else self._name_or_scope)
       self._reuse = (self._reuse
                      or self._old.reuse)  # Re-using is inherited by sub-scopes.
+      if self._old_name_scope is None:
+        name_scope = self._name_or_scope
+      else:
+        name_scope = self._old_name_scope
       variable_scope_object = VariableScope(
           self._reuse,
           name=self._new_name,
@@ -1594,7 +1634,7 @@ class _pure_variable_scope(object):  # pylint: disable=invalid-name
           dtype=self._old.dtype,
           use_resource=self._old.use_resource,
           custom_getter=self._old.custom_getter,
-          name_scope=self._old_name_scope or self._name_or_scope,
+          name_scope=name_scope,
           constraint=self._constraint)
       if self._initializer is not None:
         variable_scope_object.set_initializer(self._initializer)
@@ -1661,7 +1701,8 @@ def _get_unique_variable_scope(prefix):
 # Named like a function for backwards compatibility with the
 # @tf_contextlib.contextmanager version, which was switched to a class to avoid
 # some object creation overhead.
-class variable_scope(object):  # pylint: disable=invalid-name
+@tf_export("variable_scope")  # pylint: disable=invalid-name
+class variable_scope(object):
   """A context manager for defining ops that creates variables (layers).
 
   This context manager validates that the (optional) `values` are from the same
@@ -1763,7 +1804,8 @@ class variable_scope(object):  # pylint: disable=invalid-name
                reuse=None,
                dtype=None,
                use_resource=None,
-               constraint=None):
+               constraint=None,
+               auxiliary_name_scope=True):
     """Initialize the context manager.
 
     Args:
@@ -1795,6 +1837,8 @@ class variable_scope(object):  # pylint: disable=invalid-name
         variable and return the Tensor for the projected value
         (which must have the same shape). Constraints are not safe to
         use when doing asynchronous distributed training.
+      auxiliary_name_scope: If `True`, we create an auxiliary name scope with
+        the scope. If `False`, we don't touch name scope.
 
     Returns:
       A scope that can be captured and reused.
@@ -1832,6 +1876,10 @@ class variable_scope(object):  # pylint: disable=invalid-name
       self._graph = ops._get_graph_from_inputs(self._values)  # pylint: disable=protected-access
     self._cached_pure_variable_scope = None
     self._current_name_scope = None
+    if not isinstance(auxiliary_name_scope, bool):
+      raise TypeError("The auxiliary_name_scope must be `True` or `False`, "
+                      "while get {}".format(auxiliary_name_scope))
+    self._auxiliary_name_scope = auxiliary_name_scope
 
   def __enter__(self):
     # If the default graph is building a function, then we should not replace it
@@ -1845,11 +1893,45 @@ class variable_scope(object):  # pylint: disable=invalid-name
       self._graph_context_manager.__enter__()
     if self._cached_pure_variable_scope is not None:
       # Fast path for re-entering variable_scopes. We've held on to the pure
-      # variable scope from a previous __enter__, so we avoid some overhead by
-      # re-using that object.
+      # variable scope from a previous successful __enter__, so we avoid some
+      # overhead by re-using that object.
       if self._current_name_scope is not None:
         self._current_name_scope.__enter__()
       return self._cached_pure_variable_scope.__enter__()
+
+    try:
+      return self._enter_scope_uncached()
+    except:
+      if self._graph_context_manager is not None:
+        self._graph_context_manager.__exit__(*sys.exc_info())
+      raise
+
+  def _enter_scope_uncached(self):
+    """Enters the context manager when there is no cached scope yet.
+
+    Returns:
+      The entered variable scope.
+
+    Raises:
+      TypeError: A wrong type is passed as `scope` at __init__().
+      ValueError: `reuse` is incorrectly set at __init__().
+    """
+    if self._auxiliary_name_scope:
+      # Create a new name scope later
+      current_name_scope = None
+    else:
+      # Reenter the current name scope
+      name_scope = ops.get_name_scope()
+      if name_scope:
+        # Hack to reenter
+        name_scope += "/"
+        current_name_scope = ops.name_scope(name_scope)
+      else:
+        # Root scope
+        current_name_scope = ops.name_scope(name_scope)
+
+    # IMPORTANT: Only assign to self._cached_pure_variable_scope and
+    # self._current_name_scope after successful __enter__() calls.
     if self._name_or_scope is not None:
       if not isinstance(self._name_or_scope,
                         (VariableScope,) + six.string_types):
@@ -1859,14 +1941,19 @@ class variable_scope(object):  # pylint: disable=invalid-name
         name_scope = self._name_or_scope
       else:
         name_scope = self._name_or_scope.name.split("/")[-1]
-      if name_scope:
-        self._current_name_scope = ops.name_scope(name_scope)
-        current_name_scope_name = self._current_name_scope.__enter__()
+      if name_scope or current_name_scope:
+        current_name_scope = current_name_scope or ops.name_scope(name_scope)
+        try:
+          current_name_scope_name = current_name_scope.__enter__()
+        except:
+          current_name_scope.__exit__(*sys.exc_info())
+          raise
+        self._current_name_scope = current_name_scope
         if isinstance(self._name_or_scope, six.string_types):
           old_name_scope = current_name_scope_name
         else:
           old_name_scope = self._name_or_scope.original_name_scope
-        self._cached_pure_variable_scope = _pure_variable_scope(
+        pure_variable_scope = _pure_variable_scope(
             self._name_or_scope,
             reuse=self._reuse,
             initializer=self._initializer,
@@ -1878,11 +1965,17 @@ class variable_scope(object):  # pylint: disable=invalid-name
             dtype=self._dtype,
             use_resource=self._use_resource,
             constraint=self._constraint)
-        return self._cached_pure_variable_scope.__enter__()
+        try:
+          entered_pure_variable_scope = pure_variable_scope.__enter__()
+        except:
+          pure_variable_scope.__exit__(*sys.exc_info())
+          raise
+        self._cached_pure_variable_scope = pure_variable_scope
+        return entered_pure_variable_scope
       else:
         self._current_name_scope = None
         # This can only happen if someone is entering the root variable scope.
-        self._cached_pure_variable_scope = _pure_variable_scope(
+        pure_variable_scope = _pure_variable_scope(
             self._name_or_scope,
             reuse=self._reuse,
             initializer=self._initializer,
@@ -1893,15 +1986,27 @@ class variable_scope(object):  # pylint: disable=invalid-name
             dtype=self._dtype,
             use_resource=self._use_resource,
             constraint=self._constraint)
-        return self._cached_pure_variable_scope.__enter__()
+        try:
+          entered_pure_variable_scope = pure_variable_scope.__enter__()
+        except:
+          pure_variable_scope.__exit__(*sys.exc_info())
+          raise
+        self._cached_pure_variable_scope = pure_variable_scope
+        return entered_pure_variable_scope
 
     else:  # Here name_or_scope is None. Using default name, but made unique.
       if self._reuse:
         raise ValueError("reuse=True cannot be used without a name_or_scope")
-      self._current_name_scope = ops.name_scope(self._default_name)
-      current_name_scope_name = self._current_name_scope.__enter__()
+      current_name_scope = current_name_scope or ops.name_scope(
+          self._default_name)
+      try:
+        current_name_scope_name = current_name_scope.__enter__()
+      except:
+        current_name_scope.__exit__(*sys.exc_info())
+        raise
+      self._current_name_scope = current_name_scope
       unique_default_name = _get_unique_variable_scope(self._default_name)
-      self._cached_pure_variable_scope = _pure_variable_scope(
+      pure_variable_scope = _pure_variable_scope(
           unique_default_name,
           initializer=self._initializer,
           regularizer=self._regularizer,
@@ -1912,7 +2017,13 @@ class variable_scope(object):  # pylint: disable=invalid-name
           dtype=self._dtype,
           use_resource=self._use_resource,
           constraint=self._constraint)
-      return self._cached_pure_variable_scope.__enter__()
+      try:
+        entered_pure_variable_scope = pure_variable_scope.__enter__()
+      except:
+        pure_variable_scope.__exit__(*sys.exc_info())
+        raise
+      self._cached_pure_variable_scope = pure_variable_scope
+      return entered_pure_variable_scope
 
   def __exit__(self, type_arg, value_arg, traceback_arg):
     self._cached_pure_variable_scope.__exit__(
@@ -1924,6 +2035,7 @@ class variable_scope(object):  # pylint: disable=invalid-name
 
 
 # pylint: disable=g-doc-return-or-yield
+@tf_export("variable_op_scope")
 @tf_contextlib.contextmanager
 def variable_op_scope(values,
                       name_or_scope,
@@ -1985,21 +2097,26 @@ def _compute_slice_dim_and_shape(full_shape, slicing):
   return slice_dim, slice_shape
 
 
-def variable(initial_value=None,
-             trainable=True,
-             collections=None,
-             validate_shape=True,
-             caching_device=None,
-             name=None,
-             dtype=None,
-             use_resource=None):
+def default_variable_creator(next_creator=None, **kwargs):
+  """Default variable creator."""
+  assert next_creator is None
+  initial_value = kwargs.get("initial_value", None)
+  trainable = kwargs.get("trainable", True)
+  collections = kwargs.get("collections", None)
+  validate_shape = kwargs.get("validate_shape", True)
+  caching_device = kwargs.get("caching_device", None)
+  name = kwargs.get("name", None)
+  dtype = kwargs.get("dtype", None)
+  constraint = kwargs.get("constraint", None)
+  use_resource = kwargs.get("use_resource", None)
   if use_resource is None:
     use_resource = get_variable_scope().use_resource
   if use_resource or (use_resource is None and context.in_eager_mode()):
     return resource_variable_ops.ResourceVariable(
         initial_value=initial_value, trainable=trainable,
         collections=collections, validate_shape=validate_shape,
-        caching_device=caching_device, name=name, dtype=dtype)
+        caching_device=caching_device, name=name, dtype=dtype,
+        constraint=constraint)
   elif not use_resource and context.in_eager_mode():
     raise RuntimeError(
         "VariableScope should use resource variable when eager execution is"
@@ -2009,4 +2126,95 @@ def variable(initial_value=None,
     return variables.Variable(
         initial_value=initial_value, trainable=trainable,
         collections=collections, validate_shape=validate_shape,
-        caching_device=caching_device, name=name, dtype=dtype)
+        caching_device=caching_device, name=name, dtype=dtype,
+        constraint=constraint)
+
+
+def _make_getter(captured_getter, captured_previous):
+  """Gets around capturing loop variables in python being broken."""
+  return lambda **kwargs: captured_getter(captured_previous, **kwargs)
+
+
+def variable(initial_value=None,
+             trainable=True,
+             collections=None,
+             validate_shape=True,
+             caching_device=None,
+             name=None,
+             dtype=None,
+             constraint=None,
+             use_resource=None):
+  previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
+  for getter in ops.get_default_graph()._get_variable_creator_stack():  # pylint: disable=protected-access
+    previous_getter = _make_getter(getter, previous_getter)
+  return previous_getter(initial_value=initial_value,
+                         trainable=trainable,
+                         collections=collections,
+                         validate_shape=validate_shape,
+                         caching_device=caching_device,
+                         name=name, dtype=dtype,
+                         constraint=constraint,
+                         use_resource=use_resource)
+
+
+@tf_contextlib.contextmanager
+def variable_creator_scope(variable_creator):
+  """Scope which defines a variable creation function to be used by variable().
+
+  variable_creator is expected to be a function with the following signature:
+
+  ```
+    def variable_creator(next_creator, **kwargs)
+  ```
+
+  The creator is supposed to eventually call the next_creator to create a
+  variable if it does want to create a variable and not call Variable or
+  ResourceVariable directly. This helps make creators composable. A creator may
+  choose to create multiple variables, return already existing variables, or
+  simply register that a variable was created and defer to the next creators in
+  line. Creators can also modify the keyword arguments seen by the next
+  creators.
+
+  Custom getters in the variable scope will eventually resolve down to these
+  custom creators when they do create variables.
+
+  The valid keyword arguments in kwds are:
+      initial_value: A `Tensor`, or Python object convertible to a `Tensor`,
+        which is the initial value for the Variable. The initial value must have
+        a shape specified unless `validate_shape` is set to False. Can also be a
+        callable with no argument that returns the initial value when called. In
+        that case, `dtype` must be specified. (Note that initializer functions
+        from init_ops.py must first be bound to a shape before being used here.)
+      trainable: If `True`, the default, also adds the variable to the graph
+        collection `GraphKeys.TRAINABLE_VARIABLES`. This collection is used as
+        the default list of variables to use by the `Optimizer` classes.
+      collections: List of graph collections keys. The new variable is added to
+        these collections. Defaults to `[GraphKeys.GLOBAL_VARIABLES]`.
+      validate_shape: If `False`, allows the variable to be initialized with a
+        value of unknown shape. If `True`, the default, the shape of
+        `initial_value` must be known.
+      caching_device: Optional device string describing where the Variable
+        should be cached for reading.  Defaults to the Variable's device.
+        If not `None`, caches on another device.  Typical use is to cache
+        on the device where the Ops using the Variable reside, to deduplicate
+        copying through `Switch` and other conditional statements.
+      name: Optional name for the variable. Defaults to `'Variable'` and gets
+        uniquified automatically.
+      dtype: If set, initial_value will be converted to the given type.
+        If `None`, either the datatype will be kept (if `initial_value` is
+        a Tensor), or `convert_to_tensor` will decide.
+      constraint: A constraint function to be applied to the variable after
+        updates by some algorithms.
+      use_resource: if True, a ResourceVariable is always created.
+
+  This set may grow over time, so it's important the signature of creators is as
+  mentioned above.
+
+  Args:
+    variable_creator: the passed creator
+
+  Yields:
+    A scope in which the creator is active
+  """
+  with ops.get_default_graph()._variable_creator_scope(variable_creator):  # pylint: disable=protected-access
+    yield
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index e0748d87e2d6ef2c2f8565669357f881334fa737..19e3298e4019f94132db25ab0dae5ed458bfbeb3 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -28,11 +28,14 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import tf_should_use
 from tensorflow.python.util.deprecation import deprecated
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("Variable")
 class Variable(object):
   """See the @{$variables$Variables How To} for a high level overview.
 
@@ -209,6 +212,7 @@ class Variable(object):
     if not context.in_graph_mode():
       raise RuntimeError("tf.Variable not supported in Eager mode. "
                          "Please use tfe.Variable instead")
+    self._in_graph_mode = context.in_graph_mode()
     if variable_def:
       # If variable_def is provided, recreates the variable from its fields.
       if initial_value:
@@ -304,7 +308,7 @@ class Variable(object):
 
     if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections:
       collections = list(collections) + [ops.GraphKeys.TRAINABLE_VARIABLES]
-    with ops.control_dependencies(None):
+    with ops.init_scope():
       with ops.name_scope(name, "Variable", [] if init_from_fn else
                           [initial_value]) as name:
 
@@ -362,7 +366,8 @@ class Variable(object):
         # using their initialized_value() method.
         self._initializer_op = state_ops.assign(
             self._variable,
-            self._build_initializer_expr(self._initial_value),
+            self._try_guard_against_uninitialized_dependencies(
+                self._initial_value),
             validate_shape=validate_shape).op
 
         # TODO(vrv): Change this class to not take caching_device, but
@@ -374,8 +379,8 @@ class Variable(object):
         else:
           with ops.colocate_with(self._variable.op):
             self._snapshot = array_ops.identity(self._variable, name="read")
+      ops.add_to_collections(collections, self)
 
-    ops.add_to_collections(collections, self)
     self._caching_device = caching_device
     self._save_slice_info = None
     self._constraint = constraint
@@ -549,7 +554,7 @@ class Variable(object):
       A `Tensor` holding the value of this variable after its initializer
       has run.
     """
-    with ops.control_dependencies(None):
+    with ops.init_scope():
       return control_flow_ops.cond(is_variable_initialized(self),
                                    self.read_value,
                                    lambda: self.initial_value)
@@ -781,88 +786,142 @@ class Variable(object):
 
     setattr(Variable, operator, _run_op)
 
-  def _build_initializer_expr(self, initial_value):
-    """Build an expression suitable to initialize a variable.
+  def _try_guard_against_uninitialized_dependencies(self, initial_value):
+    """Attempt to guard against dependencies on uninitialized variables.
+
+    Replace references to variables in `initial_value` with references to the
+    variable's initialized values. The initialized values are essentially
+    conditional TensorFlow graphs that return a variable's value if it is
+    initialized or its `initial_value` if it hasn't been initialized. This
+    replacement is done on a best effort basis:
 
-    Replace references to variables in initial_value with references to the
-    variable initial values instead.
+    - If the `initial_value` graph contains cycles, we don't do any
+      replacements for that graph.
+    - If the variables that `initial_value` depends on are not present in the
+      `GLOBAL_VARIABLES` or `LOCAL_VARIABLES` we don't replace them.
+
+    In these cases, it is up to the caller to ensure that the `initial_value`
+    graph uses initialized variables or that they guard access to variables
+    using their `initialized_value` method.
 
     Args:
-      initial_value: original expression
+      initial_value: `Tensor`. The initial value.
     Returns:
-      A tensorflow expression suitable to initialize a variable.
+      A `Tensor` suitable to initialize a variable.
+    Raises:
+      TypeError: If `initial_value` is not a `Tensor`.
     """
-    if isinstance(initial_value, Variable):
-      return initial_value.initialized_value()
-    elif isinstance(initial_value, ops.Tensor):
-      new_op = self._build_initializer_expr(initial_value.op)
-      if new_op != initial_value.op:
-        if isinstance(new_op, ops.Tensor):
-          return new_op
-        else:
-          return ops.Tensor(new_op, initial_value.value_index,
-                            initial_value.dtype)
-      else:
-        return initial_value
-    elif isinstance(initial_value, ops.Operation):
-      if initial_value.node_def.op in [
-          "IsVariableInitialized", "VarIsInitializedOp", "ReadVariableOp"
-      ]:
-        return initial_value
-      if initial_value.node_def.op in ["Variable", "VariableV2", "VarHandleOp"]:
-        return self._find_initialized_value_for_variable(initial_value)
-      modified = False
-      new_inputs = []
-      for tensor in initial_value.inputs:
-        new_tensor = self._build_initializer_expr(tensor)
-        new_inputs.append(new_tensor)
-        if new_tensor != tensor:
-          modified = True
-
-      if modified:
-        new_name = initial_value.node_def.name + "_" + self.name
-        new_name = new_name.replace(":", "_")
-        new_op = initial_value.node_def.op
-        new_op = new_op.replace("RefSwitch", "Switch")
-        new_value = self.graph.create_op(
-            new_op,
-            new_inputs,
-            # pylint: disable=protected-access
-            initial_value._output_types,
-            # pylint: enable=protected-access
-            name=new_name,
-            attrs=initial_value.node_def.attr)
-        return new_value
-      else:
-        return initial_value
-    else:
+    if not isinstance(initial_value, ops.Tensor):
+      raise TypeError("initial_value needs to be a Tensor: %s" % initial_value)
+
+    # Don't modify initial_value if it contains any cyclic dependencies.
+    def has_cycle(op, path):
+      """Detect cycles in the dependencies of `initial_value`."""
+      if op.name in path:
+        return True
+      path.add(op.name)
+      for op_input in op.inputs:
+        if has_cycle(op_input.op, path):
+          return True
+      for op_control_input in op.control_inputs:
+        if has_cycle(op_control_input, path):
+          return True
+      path.remove(op.name)
+      return False
+    if has_cycle(initial_value.op, path=set()):
       return initial_value
 
+    return self._safe_initial_value_from_tensor(initial_value, op_cache={})
+
+  def _safe_initial_value_from_tensor(self, tensor, op_cache):
+    """Replace dependencies on variables with their initialized values.
+
+    Args:
+      tensor: A `Tensor`. The tensor to replace.
+      op_cache: A dict mapping operation names to `Operation`s. Used to memoize
+        the results so as to avoid creating redundant operations.
+    Returns:
+      A `Tensor` compatible with `tensor`. Any inputs that lead to variable
+      values will be replaced with a corresponding graph that uses the
+      variable's initialized values. This is done on a best-effort basis. If no
+      modifications need to be made then `tensor` will be returned unchanged.
+    """
+    op = tensor.op
+    new_op = op_cache.get(op.name)
+    if new_op is None:
+      new_op = self._safe_initial_value_from_op(op, op_cache)
+      op_cache[op.name] = new_op
+    return new_op.outputs[tensor.value_index]
+
+  def _safe_initial_value_from_op(self, op, op_cache):
+    """Replace dependencies on variables with their initialized values.
+
+    Args:
+      op: An `Operation`. The operation to replace.
+      op_cache: A dict mapping operation names to `Operation`s. Used to memoize
+        the results so as to avoid creating redundant operations.
+    Returns:
+      An `Operation` compatible with `op`. Any inputs that lead to variable
+      values will be replaced with a corresponding graph that uses the
+      variable's initialized values. This is done on a best-effort basis. If no
+      modifications need to be made then `op` will be returned unchanged.
+    """
+    op_type = op.node_def.op
+    if op_type in ("IsVariableInitialized", "VarIsInitializedOp",
+                   "ReadVariableOp"):
+      return op
+
+    # Attempt to find the initialized_value of any variable reference / handles.
+    # TODO(b/70206927): Fix handling of ResourceVariables.
+    if op_type in ("Variable", "VariableV2", "VarHandleOp"):
+      initialized_value = self._find_initialized_value_for_variable(op)
+      return op if initialized_value is None else initialized_value.op
+
+    # Recursively build initializer expressions for inputs.
+    modified = False
+    new_op_inputs = []
+    for op_input in op.inputs:
+      new_op_input = self._safe_initial_value_from_tensor(op_input, op_cache)
+      new_op_inputs.append(new_op_input)
+      modified = modified or (new_op_input != op_input)
+
+    # If at least one input was modified, replace the op.
+    if modified:
+      new_op_type = op_type
+      if new_op_type == "RefSwitch":
+        new_op_type = "Switch"
+      new_op_name = op.node_def.name + "_" + self.name
+      new_op_name = new_op_name.replace(":", "_")
+      return self.graph.create_op(
+          new_op_type, new_op_inputs,
+          op._output_types,  # pylint: disable=protected-access
+          name=new_op_name, attrs=op.node_def.attr)
+
+    return op
+
   def _find_initialized_value_for_variable(self, variable_op):
-    """Find the initial value for a variable op.
+    """Find the initialized value for a variable op.
 
     To do so, lookup the variable op in the variables collection.
 
     Args:
-      variable_op: a TensorFlow variable Operation
+      variable_op: A variable `Operation`.
     Returns:
-      The initial value for the variable.
+      A `Tensor` representing the initialized value for the variable or `None`
+      if the initialized value could not be found.
     """
     try:
       var_names = [variable_op.node_def.name, variable_op.node_def.name + ":0"]
-      global_vars = self.graph.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-      for var in global_vars:
-        if var.name in var_names:
-          return var.initialized_value()
-      local_vars = self.graph.get_collection(ops.GraphKeys.LOCAL_VARIABLES)
-      for var in local_vars:
-        if var.name == var_names:
-          return var.initialized_value()
+      for collection_name in (ops.GraphKeys.GLOBAL_VARIABLES,
+                              ops.GraphKeys.LOCAL_VARIABLES):
+        for var in self.graph.get_collection(collection_name):
+          if var.name in var_names:
+            return var.initialized_value()
     except AttributeError:
-      # Return the variable itself when an incomplete user defined variable type
-      # was put in the collection.
-      return variable_op
-    return variable_op
+      # Return None when an incomplete user-defined variable type was put in
+      # the collection.
+      return None
+    return None
 
   # NOTE(mrry): This enables the Variable's overloaded "right" binary
   # operators to run when the left operand is an ndarray, because it
@@ -964,6 +1023,61 @@ class Variable(object):
     return Variable(variable_def=variable_def,
                     import_scope=import_scope)
 
+  def __iadd__(self, other):
+    logging.log_first_n(
+        logging.WARN,
+        "Variable += will be deprecated. Use variable.assign_add"
+        " if you want assignment to the variable value or 'x = x + y'"
+        " if you want a new python Tensor object.", 1)
+    return self + other
+
+  def __isub__(self, other):
+    logging.log_first_n(
+        logging.WARN,
+        "Variable -= will be deprecated. Use variable.assign_sub"
+        " if you want assignment to the variable value or 'x = x - y'"
+        " if you want a new python Tensor object.", 1)
+    return self - other
+
+  def __imul__(self, other):
+    logging.log_first_n(
+        logging.WARN,
+        "Variable *= will be deprecated. Use variable.assign_mul"
+        " if you want assignment to the variable value or 'x = x * y'"
+        " if you want a new python Tensor object.", 1)
+    return self * other
+
+  def __idiv__(self, other):
+    logging.log_first_n(
+        logging.WARN,
+        "Variable /= will be deprecated. Use variable.assign_div"
+        " if you want assignment to the variable value or 'x = x / y'"
+        " if you want a new python Tensor object.", 1)
+    return self / other
+
+  def __itruediv__(self, other):
+    logging.log_first_n(
+        logging.WARN,
+        "Variable /= will be deprecated. Use variable.assign_div"
+        " if you want assignment to the variable value or 'x = x / y'"
+        " if you want a new python Tensor object.", 1)
+    return self / other
+
+  def __irealdiv__(self, other):
+    logging.log_first_n(
+        logging.WARN,
+        "Variable /= will be deprecated. Use variable.assign_div"
+        " if you want assignment to the variable value or 'x = x / y'"
+        " if you want a new python Tensor object.", 1)
+    return self / other
+
+  def __ipow__(self, other):
+    logging.log_first_n(
+        logging.WARN,
+        "Variable **= will be deprecated. Use 'x = x ** y'"
+        " if you want a new python Tensor object.", 1)
+    return self ** other
+
   class SaveSliceInfo(object):
     """Information on how to save this Variable as a slice.
 
@@ -1253,6 +1367,7 @@ class PartitionedVariable(object):
         "assign() has not been implemented for PartitionedVariable.")
 
 
+@tf_export("global_variables")
 def global_variables(scope=None):
   """Returns global variables.
 
@@ -1278,6 +1393,7 @@ def global_variables(scope=None):
   return ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES, scope)
 
 
+@tf_export("all_variables")
 @deprecated("2017-03-02", "Please use tf.global_variables instead.")
 def all_variables():
   """See `tf.global_variables`."""
@@ -1302,6 +1418,7 @@ def _all_saveable_objects(scope=None):
           ops.get_collection(ops.GraphKeys.SAVEABLE_OBJECTS, scope))
 
 
+@tf_export("local_variables")
 def local_variables(scope=None):
   """Returns local variables.
 
@@ -1329,6 +1446,7 @@ def local_variables(scope=None):
   return ops.get_collection(ops.GraphKeys.LOCAL_VARIABLES, scope)
 
 
+@tf_export("model_variables")
 def model_variables(scope=None):
   """Returns all variables in the MODEL_VARIABLES collection.
 
@@ -1345,6 +1463,7 @@ def model_variables(scope=None):
   return ops.get_collection(ops.GraphKeys.MODEL_VARIABLES, scope)
 
 
+@tf_export("trainable_variables")
 def trainable_variables(scope=None):
   """Returns all variables created with `trainable=True`.
 
@@ -1366,6 +1485,7 @@ def trainable_variables(scope=None):
   return ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES, scope)
 
 
+@tf_export("moving_average_variables")
 def moving_average_variables(scope=None):
   """Returns all variables that maintain their moving averages.
 
@@ -1387,6 +1507,7 @@ def moving_average_variables(scope=None):
   return ops.get_collection(ops.GraphKeys.MOVING_AVERAGE_VARIABLES, scope)
 
 
+@tf_export("initializers.variables", "variables_initializer")
 def variables_initializer(var_list, name="init"):
   """Returns an Op that initializes a list of variables.
 
@@ -1412,6 +1533,7 @@ def variables_initializer(var_list, name="init"):
   return control_flow_ops.no_op(name=name)
 
 
+@tf_export("initialize_variables")
 @tf_should_use.should_use_result
 @deprecated("2017-03-02", "Use `tf.variables_initializer` instead.")
 def initialize_variables(var_list, name="init"):
@@ -1419,6 +1541,7 @@ def initialize_variables(var_list, name="init"):
   return variables_initializer(var_list, name=name)
 
 
+@tf_export("initializers.global_variables", "global_variables_initializer")
 def global_variables_initializer():
   """Returns an Op that initializes global variables.
 
@@ -1432,6 +1555,7 @@ def global_variables_initializer():
   return variables_initializer(global_variables())
 
 
+@tf_export("initialize_all_variables")
 @tf_should_use.should_use_result
 @deprecated("2017-03-02", "Use `tf.global_variables_initializer` instead.")
 def initialize_all_variables():
@@ -1439,6 +1563,7 @@ def initialize_all_variables():
   return global_variables_initializer()
 
 
+@tf_export("initializers.local_variables", "local_variables_initializer")
 def local_variables_initializer():
   """Returns an Op that initializes all local variables.
 
@@ -1452,6 +1577,7 @@ def local_variables_initializer():
   return variables_initializer(local_variables())
 
 
+@tf_export("initialize_local_variables")
 @tf_should_use.should_use_result
 @deprecated("2017-03-02", "Use `tf.local_variables_initializer` instead.")
 def initialize_local_variables():
@@ -1459,6 +1585,7 @@ def initialize_local_variables():
   return local_variables_initializer()
 
 
+@tf_export("is_variable_initialized")
 @tf_should_use.should_use_result
 def is_variable_initialized(variable):
   """Tests if a variable has been initialized.
@@ -1473,6 +1600,7 @@ def is_variable_initialized(variable):
   return state_ops.is_variable_initialized(variable)
 
 
+@tf_export("assert_variables_initialized")
 @tf_should_use.should_use_result
 def assert_variables_initialized(var_list=None):
   """Returns an Op to check if variables are initialized.
@@ -1515,6 +1643,7 @@ def assert_variables_initialized(var_list=None):
       return array_ops.stack(ranks)
 
 
+@tf_export("report_uninitialized_variables")
 @tf_should_use.should_use_result
 def report_uninitialized_variables(var_list=None,
                                    name="report_uninitialized_variables"):
diff --git a/tensorflow/python/platform/app.py b/tensorflow/python/platform/app.py
index 1d8acf3f006bd26ece974ef3f3674e7f13d9f827..cce64c0ccafc29a9d0d0b51b4c97c5673264657b 100644
--- a/tensorflow/python/platform/app.py
+++ b/tensorflow/python/platform/app.py
@@ -23,6 +23,7 @@ import sys as _sys
 
 from tensorflow.python.platform import flags
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _usage(shorthelp):
@@ -108,19 +109,15 @@ def _define_help_flags():
     _define_help_flags_called = True
 
 
+@tf_export('app.run')
 def run(main=None, argv=None):
   """Runs the program with an optional 'main' function and 'argv' list."""
 
   # Define help flags.
   _define_help_flags()
 
-  # Parse flags.
-  try:
-    argv = flags.FLAGS(_sys.argv if argv is None else argv)
-  except flags.Error as error:
-    _sys.stderr.write('FATAL Flags parsing error: %s\n' % error)
-    _sys.stderr.write('Pass --helpshort or --helpfull to see help on flags.\n')
-    _sys.exit(1)
+  # Parse known flags.
+  argv = flags.FLAGS(_sys.argv if argv is None else argv, known_only=True)
 
   main = main or _sys.modules['__main__'].main
 
diff --git a/tensorflow/python/platform/benchmark.py b/tensorflow/python/platform/benchmark.py
index 837bca1dbd06c9ee4adbf05bfc7cf3586d072d16..12dae94a6404e58d31cf88af83251e4bc9e50df3 100644
--- a/tensorflow/python/platform/benchmark.py
+++ b/tensorflow/python/platform/benchmark.py
@@ -33,6 +33,7 @@ from tensorflow.python.platform import app
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.tf_export import tf_export
 
 
 # When a subclass of the Benchmark class is created, it is added to
@@ -181,6 +182,7 @@ class Benchmark(six.with_metaclass(_BenchmarkRegistrar, object)):
         throughput=throughput, extras=extras)
 
 
+@tf_export("test.Benchmark")
 class TensorFlowBenchmark(Benchmark):
   """Abstract class that provides helpers for TensorFlow benchmarks."""
 
diff --git a/tensorflow/python/platform/flags.py b/tensorflow/python/platform/flags.py
index e9a36ae75d6ce4763ff83c97bec008a4da0897b0..6225db77440e9d63eade956c5c4749c9e2884f6c 100644
--- a/tensorflow/python/platform/flags.py
+++ b/tensorflow/python/platform/flags.py
@@ -18,5 +18,108 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import logging as _logging
+import sys as _sys
+
 # go/tf-wildcard-import
 from absl.flags import *  # pylint: disable=wildcard-import
+import six as _six
+
+from tensorflow.python.util import tf_decorator
+
+
+# Since we wrap absl.flags DEFINE functions, we need to declare this module
+# does not affect key flags.
+disclaim_key_flags()  # pylint: disable=undefined-variable
+
+
+_RENAMED_ARGUMENTS = {
+    'flag_name': 'name',
+    'default_value': 'default',
+    'docstring': 'help',
+}
+
+
+def _wrap_define_function(original_function):
+  """Wraps absl.flags's define functions so tf.flags accepts old names."""
+
+  def wrapper(*args, **kwargs):
+    """Wrapper function that turns old keyword names to new ones."""
+    has_old_names = False
+    for old_name, new_name in _six.iteritems(_RENAMED_ARGUMENTS):
+      if old_name in kwargs:
+        has_old_names = True
+        value = kwargs.pop(old_name)
+        kwargs[new_name] = value
+    if has_old_names:
+      _logging.warning(
+          'Use of the keyword argument names (flag_name, default_value, '
+          'docstring) is deprecated, please use (name, default, help) instead.')
+    return original_function(*args, **kwargs)
+
+  return tf_decorator.make_decorator(original_function, wrapper)
+
+
+class _FlagValuesWrapper(object):
+  """Wrapper class for absl.flags.FLAGS.
+
+  The difference is that tf.flags.FLAGS implicitly parses flags with sys.argv
+  when accessing the FLAGS values before it's explicitly parsed,
+  while absl.flags.FLAGS raises an exception.
+  """
+
+  def __init__(self, flags_object):
+    self.__dict__['__wrapped'] = flags_object
+
+  def __getattribute__(self, name):
+    if name == '__dict__':
+      return super(_FlagValuesWrapper, self).__getattribute__(name)
+    return self.__dict__['__wrapped'].__getattribute__(name)
+
+  def __getattr__(self, name):
+    wrapped = self.__dict__['__wrapped']
+    # To maintain backwards compatibility, implicitly parse flags when reading
+    # a flag.
+    if not wrapped.is_parsed():
+      wrapped(_sys.argv)
+    return wrapped.__getattr__(name)
+
+  def __setattr__(self, name, value):
+    return self.__dict__['__wrapped'].__setattr__(name, value)
+
+  def __delattr__(self, name):
+    return self.__dict__['__wrapped'].__delattr__(name)
+
+  def __dir__(self):
+    return self.__dict__['__wrapped'].__dir__()
+
+  def __getitem__(self, name):
+    return self.__dict__['__wrapped'].__getitem__(name)
+
+  def __setitem__(self, name, flag):
+    return self.__dict__['__wrapped'].__setitem__(name, flag)
+
+  def __len__(self):
+    return self.__dict__['__wrapped'].__len__()
+
+  def __iter__(self):
+    return self.__dict__['__wrapped'].__iter__()
+
+  def __str__(self):
+    return self.__dict__['__wrapped'].__str__()
+
+  def __call__(self, *args, **kwargs):
+    return self.__dict__['__wrapped'].__call__(*args, **kwargs)
+
+
+# pylint: disable=invalid-name,used-before-assignment
+# absl.flags APIs use `default` as the name of the default value argument.
+# Allow the following functions continue to accept `default_value`.
+DEFINE_string = _wrap_define_function(DEFINE_string)
+DEFINE_boolean = _wrap_define_function(DEFINE_boolean)
+DEFINE_bool = DEFINE_boolean
+DEFINE_float = _wrap_define_function(DEFINE_float)
+DEFINE_integer = _wrap_define_function(DEFINE_integer)
+# pylint: enable=invalid-name,used-before-assignment
+
+FLAGS = _FlagValuesWrapper(FLAGS)  # pylint: disable=used-before-assignment
diff --git a/tensorflow/python/platform/flags_test.py b/tensorflow/python/platform/flags_test.py
index 23060e17d279cfb282f20610e0a1639db3a43ecf..bd3c8e39959a41ada22f7ee4cef4d3d462e9e6cf 100644
--- a/tensorflow/python/platform/flags_test.py
+++ b/tensorflow/python/platform/flags_test.py
@@ -17,18 +17,110 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import sys
 import unittest
 
 from absl import flags as absl_flags
 
 from tensorflow.python.platform import flags
+from tensorflow.python.platform import test
+
+
+flags.DEFINE_string(
+    flag_name='old_string', default_value='default', docstring='docstring')
+flags.DEFINE_string(
+    name='new_string', default='default', help='docstring')
+flags.DEFINE_integer(
+    flag_name='old_integer', default_value=1, docstring='docstring')
+flags.DEFINE_integer(
+    name='new_integer', default=1, help='docstring')
+flags.DEFINE_float(
+    flag_name='old_float', default_value=1.5, docstring='docstring')
+flags.DEFINE_float(
+    name='new_float', default=1.5, help='docstring')
+flags.DEFINE_bool(
+    flag_name='old_bool', default_value=True, docstring='docstring')
+flags.DEFINE_bool(
+    name='new_bool', default=True, help='docstring')
+flags.DEFINE_boolean(
+    flag_name='old_boolean', default_value=False, docstring='docstring')
+flags.DEFINE_boolean(
+    name='new_boolean', default=False, help='docstring')
 
 
 class FlagsTest(unittest.TestCase):
 
-  def test_global_flags_object(self):
-    self.assertIs(flags.FLAGS, absl_flags.FLAGS)
+  def setUp(self):
+    self.original_flags = flags.FlagValues()
+    self.wrapped_flags = flags._FlagValuesWrapper(self.original_flags)
+    flags.DEFINE_string(
+        'test', 'default', 'test flag', flag_values=self.wrapped_flags)
+
+  def test_attribute_overrides(self):
+    # Test that methods defined in absl.flags.FlagValues are the same as the
+    # wrapped ones.
+    self.assertEqual(flags.FLAGS.is_parsed, absl_flags.FLAGS.is_parsed)
+
+  def test_getattr(self):
+    self.assertFalse(self.wrapped_flags.is_parsed())
+    with test.mock.patch.object(sys, 'argv', new=['program', '--test=new']):
+      self.assertEqual('new', self.wrapped_flags.test)
+    self.assertTrue(self.wrapped_flags.is_parsed())
+
+  def test_setattr(self):
+    self.assertEqual('default', self.wrapped_flags.test)
+    self.wrapped_flags.test = 'new'
+    self.assertEqual('new', self.wrapped_flags.test)
+
+  def test_delattr(self):
+    del self.wrapped_flags.test
+    self.assertNotIn('test', self.wrapped_flags)
+    with self.assertRaises(AttributeError):
+      _ = self.wrapped_flags.test
+
+  def test_dir(self):
+    self.assertEqual(['test'], dir(self.wrapped_flags))
+
+  def test_getitem(self):
+    self.assertIs(self.original_flags['test'], self.wrapped_flags['test'])
+
+  def test_setitem(self):
+    flag = flags.Flag(flags.ArgumentParser(), flags.ArgumentSerializer(),
+                      'fruit', 'apple', 'the fruit type')
+    self.wrapped_flags['fruit'] = flag
+    self.assertIs(self.original_flags['fruit'], self.wrapped_flags['fruit'])
+    self.assertEqual('apple', self.wrapped_flags.fruit)
+
+  def test_len(self):
+    self.assertEqual(1, len(self.wrapped_flags))
+
+  def test_iter(self):
+    self.assertEqual(['test'], list(self.wrapped_flags))
+
+  def test_str(self):
+    self.assertEqual(str(self.wrapped_flags), str(self.original_flags))
+
+  def test_call(self):
+    self.wrapped_flags(['program', '--test=new'])
+    self.assertEqual('new', self.wrapped_flags.test)
+
+  def test_keyword_arguments(self):
+    test_cases = (
+        ('old_string', 'default'),
+        ('new_string', 'default'),
+        ('old_integer', 1),
+        ('new_integer', 1),
+        ('old_float', 1.5),
+        ('new_float', 1.5),
+        ('old_bool', True),
+        ('new_bool', True),
+        ('old_boolean', False),
+        ('new_boolean', False),
+    )
+    for flag_name, default_value in test_cases:
+      self.assertEqual(default_value, absl_flags.FLAGS[flag_name].default)
+      self.assertEqual('docstring', absl_flags.FLAGS[flag_name].help)
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
   unittest.main()
diff --git a/tensorflow/python/platform/gfile.py b/tensorflow/python/platform/gfile.py
index 202475efdf29e746fb8e985677d1f826741939fb..315889e9aa8851138bf8b07b9803cc2d360f354a 100644
--- a/tensorflow/python/platform/gfile.py
+++ b/tensorflow/python/platform/gfile.py
@@ -34,8 +34,10 @@ from tensorflow.python.lib.io.file_io import stat as Stat
 from tensorflow.python.lib.io.file_io import walk as Walk
 # pylint: enable=unused-import
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('gfile.GFile', 'gfile.Open')
 class GFile(_FileIO):
   """File I/O wrappers without thread locking."""
 
@@ -43,6 +45,7 @@ class GFile(_FileIO):
     super(GFile, self).__init__(name=name, mode=mode)
 
 
+@tf_export('gfile.FastGFile')
 class FastGFile(_FileIO):
   """File I/O wrappers without thread locking."""
 
diff --git a/tensorflow/python/platform/resource_loader.py b/tensorflow/python/platform/resource_loader.py
index 2455acb4c0c469acbb928c4ec44571e50e06de1f..8f7b12e2b2b92d9b2bfe397d0e7cba59e11bc1f6 100644
--- a/tensorflow/python/platform/resource_loader.py
+++ b/tensorflow/python/platform/resource_loader.py
@@ -29,8 +29,10 @@ import sys as _sys
 
 from tensorflow.python.util import tf_inspect as _inspect
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('resource_loader.load_resource')
 def load_resource(path):
   """Load the resource at given path, where path is relative to tensorflow/.
 
@@ -52,6 +54,7 @@ def load_resource(path):
 
 
 # pylint: disable=protected-access
+@tf_export('resource_loader.get_data_files_path')
 def get_data_files_path():
   """Get a direct path to the data files colocated with the script.
 
@@ -62,6 +65,7 @@ def get_data_files_path():
   return _os.path.dirname(_inspect.getfile(_sys._getframe(1)))
 
 
+@tf_export('resource_loader.get_root_dir_with_all_resources')
 def get_root_dir_with_all_resources():
   """Get a root directory containing all the data attributes in the build rule.
 
@@ -101,6 +105,7 @@ def get_root_dir_with_all_resources():
   return data_files_dir or script_dir
 
 
+@tf_export('resource_loader.get_path_to_datafile')
 def get_path_to_datafile(path):
   """Get the path to the specified file in the data dependencies.
 
@@ -120,6 +125,7 @@ def get_path_to_datafile(path):
   return _os.path.join(data_files_path, path)
 
 
+@tf_export('resource_loader.readahead_file_path')
 def readahead_file_path(path, readahead='128M'):  # pylint: disable=unused-argument
   """Readahead files not implemented; simply returns given path."""
   return path
diff --git a/tensorflow/python/platform/stacktrace_handler.i b/tensorflow/python/platform/stacktrace_handler.i
new file mode 100644
index 0000000000000000000000000000000000000000..be4eea4c2f8eadb54f2773b8d97058cb6f51d63a
--- /dev/null
+++ b/tensorflow/python/platform/stacktrace_handler.i
@@ -0,0 +1,27 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+%include "tensorflow/python/platform/base.i"
+
+%{
+#include "tensorflow/core/platform/stacktrace_handler.h"
+%}
+
+%ignoreall
+%unignore tensorflow;
+%unignore tensorflow::testing;
+%unignore tensorflow::testing::InstallStacktraceHandler;
+%include "tensorflow/core/platform/stacktrace_handler.h"
+%unignoreall
diff --git a/tensorflow/python/platform/stacktrace_handler_test.py b/tensorflow/python/platform/stacktrace_handler_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2071f9d54ceb99831999ec08ab71d63862f1c36
--- /dev/null
+++ b/tensorflow/python/platform/stacktrace_handler_test.py
@@ -0,0 +1,81 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Test to make sure stack trace is generated in case of test failures."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import os
+import signal
+import subprocess
+import sys
+
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
+
+
+# FLAGS defined at the bottom:
+# child (bool) set to true if we are running in the child process.
+FLAGS = None
+
+_CHILD_FLAG_HELP = 'Boolean. Set to true if this is the child process.'
+
+
+class StacktraceHandlerTest(test.TestCase):
+
+  def testChildProcessKillsItself(self):
+    if FLAGS.child:
+      os.kill(os.getpid(), signal.SIGABRT)
+
+  def testGeneratesStacktrace(self):
+    if FLAGS.child:
+      return
+
+    # Subprocess sys.argv[0] with --child=True
+    if sys.executable:
+      child_process = subprocess.Popen(
+          [sys.executable, sys.argv[0], '--child=True'], cwd=os.getcwd(),
+          stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    else:
+      child_process = subprocess.Popen(
+          [sys.argv[0], '--child=True'], cwd=os.getcwd(),
+          stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    # Capture its output. capture both stdout and stderr and append them.
+    # We are not worried about timing or order of messages in this test.
+    child_stdout, child_stderr = child_process.communicate()
+    child_output = child_stdout + child_stderr
+
+    # Make sure the child process is dead before we proceed.
+    child_process.wait()
+
+    logging.info('Output from the child process:')
+    logging.info(child_output)
+
+    # Verify a stack trace is printed.
+    self.assertIn(b'PyEval_EvalFrame', child_output)
+
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      '--child', type=bool, default=False, help=_CHILD_FLAG_HELP)
+  FLAGS, unparsed = parser.parse_known_args()
+
+  # Now update argv, so that unittest library does not get confused.
+  sys.argv = [sys.argv[0]] + unparsed
+  test.main()
diff --git a/tensorflow/python/platform/sysconfig.py b/tensorflow/python/platform/sysconfig.py
index 57635fb4d9d6698f1a6f1a51918fe3f269d8909b..5c50fa023dc3b216838390d9356a39e70e2362d2 100644
--- a/tensorflow/python/platform/sysconfig.py
+++ b/tensorflow/python/platform/sysconfig.py
@@ -27,10 +27,13 @@ from __future__ import print_function
 import os.path as _os_path
 
 from tensorflow.python.framework.versions import CXX11_ABI_FLAG as _CXX11_ABI_FLAG
+from tensorflow.python.framework.versions import MONOLITHIC_BUILD as _MONOLITHIC_BUILD
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
 # pylint: disable=g-import-not-at-top
+@tf_export('sysconfig.get_include')
 def get_include():
   """Get the directory containing the TensorFlow C++ header files.
 
@@ -45,6 +48,7 @@ def get_include():
   return _os_path.join(_os_path.dirname(tf.__file__), 'include')
 
 
+@tf_export('sysconfig.get_lib')
 def get_lib():
   """Get the directory containing the TensorFlow framework library.
 
@@ -55,6 +59,7 @@ def get_lib():
   return _os_path.join(_os_path.dirname(tf.__file__))
 
 
+@tf_export('sysconfig.get_compile_flags')
 def get_compile_flags():
   """Get the compilation flags for custom operators.
 
@@ -68,6 +73,7 @@ def get_compile_flags():
   return flags
 
 
+@tf_export('sysconfig.get_link_flags')
 def get_link_flags():
   """Get the link flags for custom operators.
 
@@ -75,8 +81,9 @@ def get_link_flags():
     The link flags.
   """
   flags = []
-  flags.append('-L%s' % get_lib())
-  flags.append('-ltensorflow_framework')
+  if not _MONOLITHIC_BUILD:
+    flags.append('-L%s' % get_lib())
+    flags.append('-ltensorflow_framework')
   return flags
 
 _allowed_symbols = []
diff --git a/tensorflow/python/platform/test.py b/tensorflow/python/platform/test.py
index 72025f671721a49fe501f894355b92ced8b7a4de..9b7655722ac5a917f2753617f8e99bf2bd2f8d11 100644
--- a/tensorflow/python/platform/test.py
+++ b/tensorflow/python/platform/test.py
@@ -56,6 +56,7 @@ from tensorflow.python.ops.gradient_checker import compute_gradient
 # pylint: enable=unused-import,g-bad-import-order
 
 import sys
+from tensorflow.python.util.tf_export import tf_export
 if sys.version_info.major == 2:
   import mock                # pylint: disable=g-import-not-at-top,unused-import
 else:
@@ -68,11 +69,14 @@ Benchmark = _googletest.Benchmark  # pylint: disable=invalid-name
 StubOutForTesting = _googletest.StubOutForTesting  # pylint: disable=invalid-name
 
 
+@tf_export('test.main')
 def main(argv=None):
   """Runs all unit tests."""
+  _test_util.InstallStackTraceHandler()
   return _googletest.main(argv)
 
 
+@tf_export('test.get_temp_dir')
 def get_temp_dir():
   """Returns a temporary directory for use during tests.
 
@@ -84,6 +88,7 @@ def get_temp_dir():
   return _googletest.GetTempDir()
 
 
+@tf_export('test.test_src_dir_path')
 def test_src_dir_path(relative_path):
   """Creates an absolute test srcdir path given a relative path.
 
@@ -97,6 +102,7 @@ def test_src_dir_path(relative_path):
   return _googletest.test_src_dir_path(relative_path)
 
 
+@tf_export('test.is_built_with_cuda')
 def is_built_with_cuda():
   """Returns whether TensorFlow was built with CUDA (GPU) support."""
   return _test_util.IsGoogleCudaEnabled()
diff --git a/tensorflow/python/platform/tf_logging.py b/tensorflow/python/platform/tf_logging.py
index 85ed4f071c7022801f20db75d538e5917b8eea66..22aabfd7121ac9b2eebeae2693f174e044d504ef 100644
--- a/tensorflow/python/platform/tf_logging.py
+++ b/tensorflow/python/platform/tf_logging.py
@@ -35,6 +35,7 @@ import threading
 import six
 
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
 # Don't use this directly. Use _get_logger() instead.
@@ -90,30 +91,37 @@ def _get_logger():
     _logger_lock.release()
 
 
+@tf_export('logging.log')
 def log(level, msg, *args, **kwargs):
   _get_logger().log(level, msg, *args, **kwargs)
 
 
+@tf_export('logging.debug')
 def debug(msg, *args, **kwargs):
   _get_logger().debug(msg, *args, **kwargs)
 
 
+@tf_export('logging.error')
 def error(msg, *args, **kwargs):
   _get_logger().error(msg, *args, **kwargs)
 
 
+@tf_export('logging.fatal')
 def fatal(msg, *args, **kwargs):
   _get_logger().fatal(msg, *args, **kwargs)
 
 
+@tf_export('logging.info')
 def info(msg, *args, **kwargs):
   _get_logger().info(msg, *args, **kwargs)
 
 
+@tf_export('logging.warn')
 def warn(msg, *args, **kwargs):
   _get_logger().warn(msg, *args, **kwargs)
 
 
+@tf_export('logging.warning')
 def warning(msg, *args, **kwargs):
   _get_logger().warning(msg, *args, **kwargs)
 
@@ -136,15 +144,18 @@ _log_prefix = None  # later set to google2_log_prefix
 _log_counter_per_token = {}
 
 
+@tf_export('logging.TaskLevelStatusMessage')
 def TaskLevelStatusMessage(msg):
   error(msg)
 
 
+@tf_export('logging.flush')
 def flush():
   raise NotImplementedError()
 
 
 # Code below is taken from pyglib/logging
+@tf_export('logging.vlog')
 def vlog(level, msg, *args, **kwargs):
   _get_logger().log(level, msg, *args, **kwargs)
 
@@ -164,6 +175,7 @@ def _GetNextLogCountPerToken(token):
   return _log_counter_per_token[token]
 
 
+@tf_export('logging.log_every_n')
 def log_every_n(level, msg, n, *args):
   """Log 'msg % args' at level 'level' once per 'n' times.
 
@@ -180,6 +192,7 @@ def log_every_n(level, msg, n, *args):
   log_if(level, msg, not (count % n), *args)
 
 
+@tf_export('logging.log_first_n')
 def log_first_n(level, msg, n, *args):  # pylint: disable=g-bad-name
   """Log 'msg % args' at level 'level' only first 'n' times.
 
@@ -195,6 +208,7 @@ def log_first_n(level, msg, n, *args):  # pylint: disable=g-bad-name
   log_if(level, msg, count < n, *args)
 
 
+@tf_export('logging.log_if')
 def log_if(level, msg, condition, *args):
   """Log 'msg % args' at level 'level' only if condition is fulfilled."""
   if condition:
@@ -251,11 +265,13 @@ def google2_log_prefix(level, timestamp=None, file_and_line=None):
   return s
 
 
+@tf_export('logging.get_verbosity')
 def get_verbosity():
   """Return how much logging output will be produced."""
   return _get_logger().getEffectiveLevel()
 
 
+@tf_export('logging.set_verbosity')
 def set_verbosity(v):
   """Sets the threshold for what messages will be logged."""
   _get_logger().setLevel(v)
@@ -296,4 +312,10 @@ _allowed_symbols = [
     'warning',
 ]
 
+tf_export('logging.DEBUG').export_constant(__name__, 'DEBUG')
+tf_export('logging.ERROR').export_constant(__name__, 'ERROR')
+tf_export('logging.FATAL').export_constant(__name__, 'FATAL')
+tf_export('logging.INFO').export_constant(__name__, 'INFO')
+tf_export('logging.WARN').export_constant(__name__, 'WARN')
+
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/profiler/BUILD b/tensorflow/python/profiler/BUILD
index 519b05975f03c5f1899f527636a4c855feceaacc..c815aad0a065eaba4a0dc52487b5ee67e271a146 100644
--- a/tensorflow/python/profiler/BUILD
+++ b/tensorflow/python/profiler/BUILD
@@ -57,7 +57,10 @@ cuda_py_test(
         "//tensorflow/python:platform",
         "//tensorflow/python:variables",
     ],
-    tags = ["no_pip"],
+    tags = [
+        "no_pip",
+        "oss_serial",
+    ],
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/profiler/internal/BUILD b/tensorflow/python/profiler/internal/BUILD
index dcac070a3f198d33e5a94233865b775a2e1254bb..362a1c49e64118134a4039ae3a5d939ed0b6d730 100644
--- a/tensorflow/python/profiler/internal/BUILD
+++ b/tensorflow/python/profiler/internal/BUILD
@@ -21,6 +21,7 @@ py_library(
     name = "model_analyzer_testlib",
     srcs = ["model_analyzer_testlib.py"],
     srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
diff --git a/tensorflow/python/profiler/internal/model_analyzer_testlib.py b/tensorflow/python/profiler/internal/model_analyzer_testlib.py
index 350a62c0eacafcab6e19c5110fe50c15cb5139ff..895646997b116dc5beb23264e97d1ec4c5aaa810 100644
--- a/tensorflow/python/profiler/internal/model_analyzer_testlib.py
+++ b/tensorflow/python/profiler/internal/model_analyzer_testlib.py
@@ -109,3 +109,9 @@ def ProfilerFromFile(profile_file):
   profiler = model_analyzer.Profiler.__new__(model_analyzer.Profiler)
   yield profiler
   print_mdl.DeleteProfiler()
+
+
+def CheckAndRemoveDoc(profile):
+  assert 'Doc:' in profile
+  start_pos = profile.find('Profile:')
+  return profile[start_pos + 9:]
diff --git a/tensorflow/python/profiler/internal/print_model_analysis_test.py b/tensorflow/python/profiler/internal/print_model_analysis_test.py
index 797c430e99b21a73a2260d45d6c9f25e26122806..186c028d7ccf63d8a4b6c1c97e793611671ad08f 100644
--- a/tensorflow/python/profiler/internal/print_model_analysis_test.py
+++ b/tensorflow/python/profiler/internal/print_model_analysis_test.py
@@ -18,22 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from google.protobuf import text_format
-
-from tensorflow.core.profiler import tfprof_options_pb2
-from tensorflow.core.profiler import tfprof_output_pb2
-from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 
-# pylint: disable=g-bad-import-order
-# XXX: this depends on pywrap_tensorflow and must come later
-from tensorflow.python import pywrap_tensorflow as print_mdl
 
 # pylint: disable=bad-whitespace
 # pylint: disable=bad-continuation
@@ -69,407 +60,6 @@ class PrintModelAnalysisTest(test.TestCase):
     x = nn_ops.conv2d(image, kernel, [1, 2, 2, 1], padding='SAME')
     return x
 
-  def testPrintModelAnalysis(self):
-    opts = tfprof_options_pb2.OptionsProto()
-    opts.max_depth = TEST_OPTIONS['max_depth']
-    opts.min_bytes = TEST_OPTIONS['min_bytes']
-    opts.min_micros = TEST_OPTIONS['min_micros']
-    opts.min_params = TEST_OPTIONS['min_params']
-    opts.min_float_ops = TEST_OPTIONS['min_float_ops']
-    opts.order_by = TEST_OPTIONS['order_by']
-    opts.step = -1
-    for p in TEST_OPTIONS['account_type_regexes']:
-      opts.account_type_regexes.append(p)
-    for p in TEST_OPTIONS['start_name_regexes']:
-      opts.start_name_regexes.append(p)
-    for p in TEST_OPTIONS['trim_name_regexes']:
-      opts.trim_name_regexes.append(p)
-    for p in TEST_OPTIONS['show_name_regexes']:
-      opts.show_name_regexes.append(p)
-    for p in TEST_OPTIONS['hide_name_regexes']:
-      opts.hide_name_regexes.append(p)
-    opts.account_displayed_op_only = TEST_OPTIONS['account_displayed_op_only']
-    for p in TEST_OPTIONS['select']:
-      opts.select.append(p)
-    opts.output = TEST_OPTIONS['output']
-
-    with session.Session() as sess, ops.device('/cpu:0'):
-      _ = self._BuildSmallModel()
-      tfprof_pb = tfprof_output_pb2.GraphNodeProto()
-      tfprof_pb.ParseFromString(
-          print_mdl.PrintModelAnalysis(
-              sess.graph.as_graph_def(add_shapes=True).SerializeToString(),
-              b'',
-              b'',
-              b'scope',
-              opts.SerializeToString()))
-
-      expected_pb = tfprof_output_pb2.GraphNodeProto()
-      text_format.Merge(r"""name: "_TFProfRoot"
-          exec_micros: 0
-          requested_bytes: 0
-          total_exec_micros: 0
-          total_requested_bytes: 0
-          total_parameters: 648
-          children {
-            name: "Conv2D"
-            exec_micros: 0
-            requested_bytes: 0
-            total_exec_micros: 0
-            total_requested_bytes: 0
-            total_parameters: 0
-            float_ops: 0
-            total_float_ops: 0
-            input_shapes {
-              key: 0
-              value {
-                dim {
-                  size: 2
-                }
-                dim {
-                  size: 6
-                }
-                dim {
-                  size: 6
-                }
-                dim {
-                  size: 3
-                }
-              }
-            }
-            input_shapes {
-              key: 1
-              value {
-                dim {
-                  size: 6
-                }
-                dim {
-                  size: 6
-                }
-                dim {
-                  size: 3
-                }
-                dim {
-                  size: 6
-                }
-              }
-            }
-            accelerator_exec_micros: 0
-            cpu_exec_micros: 0
-            total_accelerator_exec_micros: 0
-            total_cpu_exec_micros: 0
-            run_count: 0
-            total_run_count: 0
-            total_definition_count: 1
-          }
-          children {
-            name: "DW"
-            exec_micros: 0
-            requested_bytes: 0
-            parameters: 648
-            total_exec_micros: 0
-            total_requested_bytes: 0
-            total_parameters: 648
-            children {
-              name: "DW/Assign"
-              exec_micros: 0
-              requested_bytes: 0
-              total_exec_micros: 0
-              total_requested_bytes: 0
-              total_parameters: 0
-              float_ops: 0
-              total_float_ops: 0
-              input_shapes {
-                key: 0
-                value {
-                  dim {
-                    size: 6
-                  }
-                  dim {
-                    size: 6
-                  }
-                  dim {
-                    size: 3
-                  }
-                  dim {
-                    size: 6
-                  }
-                }
-              }
-              input_shapes {
-                key: 1
-                value {
-                  dim {
-                    size: 6
-                  }
-                  dim {
-                    size: 6
-                  }
-                  dim {
-                    size: 3
-                  }
-                  dim {
-                    size: 6
-                  }
-                }
-              }
-              accelerator_exec_micros: 0
-              cpu_exec_micros: 0
-              total_accelerator_exec_micros: 0
-              total_cpu_exec_micros: 0
-              run_count: 0
-              total_run_count: 0
-              total_definition_count: 1
-            }
-            children {
-              name: "DW/Initializer"
-              exec_micros: 0
-              requested_bytes: 0
-              total_exec_micros: 0
-              total_requested_bytes: 0
-              total_parameters: 0
-              children {
-                name: "DW/Initializer/random_normal"
-                exec_micros: 0
-                requested_bytes: 0
-                total_exec_micros: 0
-                total_requested_bytes: 0
-                total_parameters: 0
-                children {
-                  name: "DW/Initializer/random_normal/RandomStandardNormal"
-                  exec_micros: 0
-                  requested_bytes: 0
-                  total_exec_micros: 0
-                  total_requested_bytes: 0
-                  total_parameters: 0
-                  float_ops: 0
-                  total_float_ops: 0
-                  input_shapes {
-                    key: 0
-                    value {
-                      dim {
-                        size: 4
-                      }
-                    }
-                  }
-                  accelerator_exec_micros: 0
-                  cpu_exec_micros: 0
-                  total_accelerator_exec_micros: 0
-                  total_cpu_exec_micros: 0
-                  run_count: 0
-                  total_run_count: 0
-                  total_definition_count: 1
-                }
-                children {
-                  name: "DW/Initializer/random_normal/mean"
-                  exec_micros: 0
-                  requested_bytes: 0
-                  total_exec_micros: 0
-                  total_requested_bytes: 0
-                  total_parameters: 0
-                  float_ops: 0
-                  total_float_ops: 0
-                  accelerator_exec_micros: 0
-                  cpu_exec_micros: 0
-                  total_accelerator_exec_micros: 0
-                  total_cpu_exec_micros: 0
-                  run_count: 0
-                  total_run_count: 0
-                  total_definition_count: 1
-                }
-                children {
-                  name: "DW/Initializer/random_normal/mul"
-                  exec_micros: 0
-                  requested_bytes: 0
-                  total_exec_micros: 0
-                  total_requested_bytes: 0
-                  total_parameters: 0
-                  float_ops: 0
-                  total_float_ops: 0
-                  input_shapes {
-                    key: 0
-                    value {
-                      dim {
-                        size: 6
-                      }
-                      dim {
-                        size: 6
-                      }
-                      dim {
-                        size: 3
-                      }
-                      dim {
-                        size: 6
-                      }
-                    }
-                  }
-                  input_shapes {
-                    key: 1
-                    value {
-                      dim {
-                        size: 1
-                      }
-                    }
-                  }
-                  accelerator_exec_micros: 0
-                  cpu_exec_micros: 0
-                  total_accelerator_exec_micros: 0
-                  total_cpu_exec_micros: 0
-                  run_count: 0
-                  total_run_count: 0
-                  total_definition_count: 1
-                }
-                children {
-                  name: "DW/Initializer/random_normal/shape"
-                  exec_micros: 0
-                  requested_bytes: 0
-                  total_exec_micros: 0
-                  total_requested_bytes: 0
-                  total_parameters: 0
-                  float_ops: 0
-                  total_float_ops: 0
-                  accelerator_exec_micros: 0
-                  cpu_exec_micros: 0
-                  total_accelerator_exec_micros: 0
-                  total_cpu_exec_micros: 0
-                  run_count: 0
-                  total_run_count: 0
-                  total_definition_count: 1
-                }
-                children {
-                  name: "DW/Initializer/random_normal/stddev"
-                  exec_micros: 0
-                  requested_bytes: 0
-                  total_exec_micros: 0
-                  total_requested_bytes: 0
-                  total_parameters: 0
-                  float_ops: 0
-                  total_float_ops: 0
-                  accelerator_exec_micros: 0
-                  cpu_exec_micros: 0
-                  total_accelerator_exec_micros: 0
-                  total_cpu_exec_micros: 0
-                  run_count: 0
-                  total_run_count: 0
-                  total_definition_count: 1
-                }
-                float_ops: 0
-                total_float_ops: 0
-                input_shapes {
-                  key: 0
-                  value {
-                    dim {
-                      size: 6
-                    }
-                    dim {
-                      size: 6
-                    }
-                    dim {
-                      size: 3
-                    }
-                    dim {
-                      size: 6
-                    }
-                  }
-                }
-                input_shapes {
-                  key: 1
-                  value {
-                    dim {
-                      size: 1
-                    }
-                  }
-                }
-                accelerator_exec_micros: 0
-                cpu_exec_micros: 0
-                total_accelerator_exec_micros: 0
-                total_cpu_exec_micros: 0
-                run_count: 0
-                total_run_count: 0
-                total_definition_count: 6
-              }
-              float_ops: 0
-              total_float_ops: 0
-              accelerator_exec_micros: 0
-              cpu_exec_micros: 0
-              total_accelerator_exec_micros: 0
-              total_cpu_exec_micros: 0
-              run_count: 0
-              total_run_count: 0
-              total_definition_count: 7
-            }
-            children {
-              name: "DW/read"
-              exec_micros: 0
-              requested_bytes: 0
-              total_exec_micros: 0
-              total_requested_bytes: 0
-              total_parameters: 0
-              float_ops: 0
-              total_float_ops: 0
-              input_shapes {
-                key: 0
-                value {
-                  dim {
-                    size: 6
-                  }
-                  dim {
-                    size: 6
-                  }
-                  dim {
-                    size: 3
-                  }
-                  dim {
-                    size: 6
-                  }
-                }
-              }
-              accelerator_exec_micros: 0
-              cpu_exec_micros: 0
-              total_accelerator_exec_micros: 0
-              total_cpu_exec_micros: 0
-              run_count: 0
-              total_run_count: 0
-              total_definition_count: 1
-            }
-            float_ops: 0
-            total_float_ops: 0
-            accelerator_exec_micros: 0
-            cpu_exec_micros: 0
-            total_accelerator_exec_micros: 0
-            total_cpu_exec_micros: 0
-            run_count: 0
-            total_run_count: 0
-            total_definition_count: 10
-          }
-          children {
-            name: "zeros"
-            exec_micros: 0
-            requested_bytes: 0
-            total_exec_micros: 0
-            total_requested_bytes: 0
-            total_parameters: 0
-            float_ops: 0
-            total_float_ops: 0
-            accelerator_exec_micros: 0
-            cpu_exec_micros: 0
-            total_accelerator_exec_micros: 0
-            total_cpu_exec_micros: 0
-            run_count: 0
-            total_run_count: 0
-            total_definition_count: 1
-          }
-          float_ops: 0
-          total_float_ops: 0
-          accelerator_exec_micros: 0
-          cpu_exec_micros: 0
-          total_accelerator_exec_micros: 0
-          total_cpu_exec_micros: 0
-          run_count: 0
-          total_run_count: 0
-          total_definition_count: 13""", expected_pb)
-      self.assertEqual(expected_pb, tfprof_pb)
-
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/profiler/internal/run_metadata_test.py b/tensorflow/python/profiler/internal/run_metadata_test.py
index 4c915ac79a4534231846295f51c56f088948b594..fd893d6cde66e576976352bd8e0da639d22ce067 100644
--- a/tensorflow/python/profiler/internal/run_metadata_test.py
+++ b/tensorflow/python/profiler/internal/run_metadata_test.py
@@ -205,17 +205,13 @@ class RunMetadataTest(test.TestCase):
     for _, f in six.iteritems(back_to_forward):
       self.assertTrue(f in forward_op)
 
-  # pylint: disable=pointless-string-statement
-  """
-  # TODO(xpan): This test is flaky because RunMetadata returned from TensorFlow
-  # is random. Still being investigated.
   def testLoopGPU(self):
     if not test.is_gpu_available():
       return
 
     ops.reset_default_graph()
     with ops.device('/device:GPU:0'):
-      tfprof_node, run_meta = _run_loop_model()
+      _, run_meta = _run_loop_model()
       # The while-loop caused a node to appear 4 times in scheduling.
       ret = _extract_node(run_meta,
                           'rnn/while/basic_rnn_cell/MatMul')
@@ -227,11 +223,6 @@ class RunMetadataTest(test.TestCase):
 
       self.assertGreaterEqual(len(ret['gpu:0/stream:all']), 4, '%s' % run_meta)
 
-      total_accelerator_execs = 0
-      for node in ret['gpu:0/stream:all']:
-        total_accelerator_execs += node.op_end_rel_micros
-  """
-
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/profiler/model_analyzer.py b/tensorflow/python/profiler/model_analyzer.py
index 46a921c0a13ecca0febf6aa4085539abbd1a6fbf..0e20ca35bba606079ed5b0f225dd3029772b5af3 100644
--- a/tensorflow/python/profiler/model_analyzer.py
+++ b/tensorflow/python/profiler/model_analyzer.py
@@ -28,9 +28,12 @@ from google.protobuf import message
 from tensorflow.core.profiler import tfprof_options_pb2
 from tensorflow.core.profiler import tfprof_output_pb2
 from tensorflow.python import pywrap_tensorflow as print_mdl
+from tensorflow.python.eager import context
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
 from tensorflow.python.profiler import option_builder
 from tensorflow.python.profiler import tfprof_logger
+from tensorflow.python.util.tf_export import tf_export
 
 _DEFAULT_PROFILE_OPTIONS = 0
 _DEFAULT_ADVISE_OPTIONS = 0
@@ -45,6 +48,14 @@ ALL_ADVICE = {
 }
 
 
+def _graph_string(graph):
+  """Helper to serialize a graph to string."""
+  if graph:
+    return graph.as_graph_def(add_shapes=True).SerializeToString()
+  else:
+    return b''
+
+
 def _build_options(options):
   """Build tfprof.OptionsProto.
 
@@ -111,6 +122,7 @@ def _build_advisor_options(options):
   return opts
 
 
+@tf_export('profiler.Profiler')
 class Profiler(object):
   """TensorFlow multi-step profiler.
 
@@ -151,24 +163,25 @@ class Profiler(object):
   ```
   """
 
-  def __init__(self, graph, op_log=None):
+  def __init__(self, graph=None, op_log=None):
     """Constructor.
 
     Args:
-      graph: tf.Graph.
+      graph: tf.Graph. If None and eager execution is not enabled, use
+          default graph.
       op_log: optional. tensorflow::tfprof::OpLogProto proto. Used to define
           extra op types.
     """
+    if not graph and context.in_graph_mode():
+      graph = ops.get_default_graph()
     self._coverage = 0.0
     self._graph = graph
     # pylint: disable=protected-access
-    op_log = tfprof_logger._merge_default_with_oplog(
+    op_log = tfprof_logger.merge_default_with_oplog(
         self._graph, op_log=op_log)
     # pylint: enable=protected-access
-
     print_mdl.NewProfiler(
-        self._graph.as_graph_def(add_shapes=True).SerializeToString(),
-        op_log.SerializeToString())
+        _graph_string(self._graph), op_log.SerializeToString())
 
   def __del__(self):
     print_mdl.DeleteProfiler()
@@ -177,19 +190,19 @@ class Profiler(object):
     """Add statistics of a step.
 
     Args:
-      step: int, A step used to identify the RunMetadata. Must be different
-         across different AddStep() calls.
+      step: int, An id used to group one or more different `run_meta` together.
+          When profiling with the profile_xxx APIs, user can use the `step`
+          id in the `options` to profile these `run_meta` together.
       run_meta: RunMetadata proto that contains statistics of a session run.
     """
     # pylint: disable=protected-access
-    op_log = tfprof_logger._merge_default_with_oplog(
+    op_log = tfprof_logger.merge_default_with_oplog(
         self._graph, run_meta=run_meta)
     # pylint: enable=protected-access
     # TODO(xpan): P1: Better to find the current graph.
-    self._coverage = print_mdl.AddStep(
-        step,
-        self._graph.as_graph_def(add_shapes=True).SerializeToString(),
-        run_meta.SerializeToString(), op_log.SerializeToString())
+    self._coverage = print_mdl.AddStep(step, _graph_string(self._graph),
+                                       run_meta.SerializeToString(),
+                                       op_log.SerializeToString())
 
   def profile_python(self, options):
     """Profile the statistics of the Python codes.
@@ -277,12 +290,24 @@ class Profiler(object):
         print_mdl.Profile('advise'.encode('utf-8'), opts.SerializeToString()))
     return advise_pb
 
+  def serialize_to_string(self):
+    """Serialize the ProfileProto to a binary string.
+
+      Users can write it to file for offline analysis by tfprof commandline
+      or graphical interface.
+
+    Returns:
+      ProfileProto binary string.
+    """
+    return print_mdl.SerializeToString()
+
   def _write_profile(self, filename):
     """Writes the profile to a file."""
     print_mdl.WriteProfile(filename)
 
 
-def profile(graph,
+@tf_export('profiler.profile')
+def profile(graph=None,
             run_meta=None,
             op_log=None,
             cmd='scope',
@@ -293,7 +318,8 @@ def profile(graph,
     https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler/README.md
 
   Args:
-    graph: required tf.Graph.
+    graph: tf.Graph. If None and eager execution is not enabled, use
+        default graph.
     run_meta: optional tensorflow.RunMetadata proto. It is necessary to
         to support run time information profiling, such as time and memory.
     op_log: tensorflow.tfprof.OpLogProto proto. User can assign "types" to
@@ -310,12 +336,14 @@ def profile(graph,
     If cmd is 'op' or 'code', returns MultiGraphNodeProto proto.
     Side effect: stdout/file/timeline.json depending on options['output']
   """
+  if not graph and context.in_graph_mode():
+    graph = ops.get_default_graph()
+
   if options == _DEFAULT_PROFILE_OPTIONS:
     options = (option_builder.ProfileOptionBuilder
                .trainable_variables_parameter())
-
   # pylint: disable=protected-access
-  op_log = tfprof_logger._merge_default_with_oplog(
+  op_log = tfprof_logger.merge_default_with_oplog(
       graph, op_log, run_meta, add_trace=cmd == 'code')
   # pylint: enable=protected-access
 
@@ -323,14 +351,14 @@ def profile(graph,
 
   run_meta_str = run_meta.SerializeToString() if run_meta else b''
 
+  graph_str = _graph_string(graph)
+
   if cmd == 'code' or cmd == 'op':
     tfprof_node = tfprof_output_pb2.MultiGraphNodeProto()
-    ret = print_mdl.PrintModelAnalysis(
-        graph.as_graph_def(add_shapes=True).SerializeToString(),
-        run_meta_str,
-        op_log.SerializeToString(),
-        cmd.encode('utf-8'),
-        opts.SerializeToString())
+    ret = print_mdl.PrintModelAnalysis(graph_str, run_meta_str,
+                                       op_log.SerializeToString(),
+                                       cmd.encode('utf-8'),
+                                       opts.SerializeToString())
     try:
       tfprof_node.ParseFromString(ret)
     except message.DecodeError as e:
@@ -338,12 +366,10 @@ def profile(graph,
 
   elif cmd == 'graph' or cmd == 'scope':
     tfprof_node = tfprof_output_pb2.GraphNodeProto()
-    ret = print_mdl.PrintModelAnalysis(
-        graph.as_graph_def(add_shapes=True).SerializeToString(),
-        run_meta_str,
-        op_log.SerializeToString(),
-        cmd.encode('utf-8'),
-        opts.SerializeToString())
+    ret = print_mdl.PrintModelAnalysis(graph_str, run_meta_str,
+                                       op_log.SerializeToString(),
+                                       cmd.encode('utf-8'),
+                                       opts.SerializeToString())
     try:
       tfprof_node.ParseFromString(ret)
     except message.DecodeError as e:
@@ -355,7 +381,8 @@ def profile(graph,
   return tfprof_node
 
 
-def advise(graph, run_meta=None, options=_DEFAULT_ADVISE_OPTIONS):
+@tf_export('profiler.advise')
+def advise(graph=None, run_meta=None, options=_DEFAULT_ADVISE_OPTIONS):
   """Auto profile and advise.
 
     Builds profiles and automatically check anomalies of various
@@ -363,18 +390,22 @@ def advise(graph, run_meta=None, options=_DEFAULT_ADVISE_OPTIONS):
     https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler/README.md
 
   Args:
-    graph: required tf.Graph.
+    graph: tf.Graph. If None and eager execution is not enabled, use
+        default graph.
     run_meta: optional tensorflow.RunMetadata proto. It is necessary to
         to support run time information profiling, such as time and memory.
     options: see ALL_ADVICE example above. Default checks everything.
   Returns:
     Returns AdviceProto proto
   """
+  if not graph and context.in_eager_execution():
+    graph = ops.get_default_graph()
+
   if options == _DEFAULT_ADVISE_OPTIONS:
     options = ALL_ADVICE.copy()
 
   # pylint: disable=protected-access
-  op_log = tfprof_logger._merge_default_with_oplog(
+  op_log = tfprof_logger.merge_default_with_oplog(
       graph, None, run_meta, add_trace=True)
   # pylint: enable=protected-access
 
@@ -384,9 +415,6 @@ def advise(graph, run_meta=None, options=_DEFAULT_ADVISE_OPTIONS):
   ret = tfprof_output_pb2.AdviceProto()
   ret.ParseFromString(
       print_mdl.PrintModelAnalysis(
-          graph.as_graph_def(add_shapes=True).SerializeToString(),
-          run_meta_str,
-          op_log.SerializeToString(),
-          'advise'.encode('utf-8'),
-          opts.SerializeToString()))
+          _graph_string(graph), run_meta_str, op_log.SerializeToString(),
+          'advise'.encode('utf-8'), opts.SerializeToString()))
   return ret
diff --git a/tensorflow/python/profiler/model_analyzer_test.py b/tensorflow/python/profiler/model_analyzer_test.py
index 26fb99efe6753f36fbe50a5a310927276ddcbf2e..04ba28c219e276e1ca79bd4e20e7d1b6ee700db5 100644
--- a/tensorflow/python/profiler/model_analyzer_test.py
+++ b/tensorflow/python/profiler/model_analyzer_test.py
@@ -23,12 +23,20 @@ import os
 import random
 import re
 
+import numpy as np
+
 from tensorflow.core.profiler import profile_pb2
+from tensorflow.core.profiler import tfprof_log_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
@@ -60,7 +68,7 @@ class PrintModelAnalysisTest(test.TestCase):
                          '  DW (3x3x3x6, 162/162 params)\n'
                          '  DW2 (2x2x6x12, 288/288 params)\n'
                          '  ScalarW (1, 1/1 params)\n',
-                         f.read())
+                         lib.CheckAndRemoveDoc(f.read()))
 
   def testSelectEverythingDetail(self):
     ops.reset_default_graph()
@@ -87,7 +95,7 @@ class PrintModelAnalysisTest(test.TestCase):
 
         with gfile.Open(outfile, 'r') as f:
           # pylint: disable=line-too-long
-          dump_str = f.read()
+          dump_str = lib.CheckAndRemoveDoc(f.read())
           outputs = dump_str.split('\n')
 
           self.assertEqual(outputs[0],
@@ -130,7 +138,7 @@ class PrintModelAnalysisTest(test.TestCase):
     with lib.ProfilerFromFile(profile_file) as profiler:
       profiler.profile_name_scope(options=opts)
       with gfile.Open(outfile, 'r') as f:
-        self.assertEqual(dump_str, f.read())
+        self.assertEqual(dump_str, lib.CheckAndRemoveDoc(f.read()))
 
   def testSelectEverything(self):
     ops.reset_default_graph()
@@ -158,13 +166,6 @@ class PrintModelAnalysisTest(test.TestCase):
       model_analyzer.profile(
           sess.graph, run_meta, options=opts)
 
-      with gfile.Open(outfile, 'r') as f:
-        # pylint: disable=line-too-long
-        self.assertEqual(
-            'node name | # parameters | # float_ops | assigned devices | op types | op count (run|defined) | input shapes\n_TFProfRoot (--/451 params, --/11.34k flops, _kTFScopeParent, --/8|--/36, )\n  Conv2D (0/0 params, 5.83k/5.83k flops, /job:localhost/replica:0/task:0/device:cpu:0, /job:localhost/replica:0/task:0/device:cpu:0|Conv2D, 1/1|1/1, 0:2x6x6x3|1:3x3x3x6)\n  Conv2D_1 (0/0 params, 4.61k/4.61k flops, /job:localhost/replica:0/task:0/device:cpu:0, /job:localhost/replica:0/task:0/device:cpu:0|Conv2D, 1/1|1/1, 0:2x3x3x6|1:2x2x6x12)\n  DW (3x3x3x6, 162/162 params, 0/324 flops, /job:localhost/replica:0/task:0/device:cpu:0, /job:localhost/replica:0/task:0/device:cpu:0|VariableV2|_trainable_variables, 1/2|1/10, )\n    DW/Assign (0/0 params, 0/0 flops, Assign, 0/0|1/1, 0:3x3x3x6|1:3x3x3x6)\n    DW/Initializer (0/0 params, 0/324 flops, _kTFScopeParent, 0/0|1/7, )\n      DW/Initializer/random_normal (0/0 params, 162/324 flops, Add, 0/0|1/6, 0:3x3x3x6|1:1)\n        DW/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, RandomStandardNormal, 0/0|1/1, 0:4)\n        DW/Initializer/random_normal/mean (0/0 params, 0/0 flops, Const, 0/0|1/1, )\n        DW/Initializer/random_normal/mul (0/0 params, 162/162 flops, Mul, 0/0|1/1, 0:3x3x3x6|1:1)\n        DW/Initializer/random_normal/shape (0/0 params, 0/0 flops, Const, 0/0|1/1, )\n        DW/Initializer/random_normal/stddev (0/0 params, 0/0 flops, Const, 0/0|1/1, )\n    DW/read (0/0 params, 0/0 flops, /job:localhost/replica:0/task:0/device:cpu:0, /job:localhost/replica:0/task:0/device:cpu:0|Identity, 1/1|1/1, 0:3x3x3x6)\n  DW2 (2x2x6x12, 288/288 params, 0/576 flops, /job:localhost/replica:0/task:0/device:cpu:0, /job:localhost/replica:0/task:0/device:cpu:0|VariableV2|_trainable_variables, 1/2|1/10, )\n    DW2/Assign (0/0 params, 0/0 flops, Assign, 0/0|1/1, 0:2x2x6x12|1:2x2x6x12)\n    DW2/Initializer (0/0 params, 0/576 flops, _kTFScopeParent, 0/0|1/7, )\n      DW2/Initializer/random_normal (0/0 params, 288/576 flops, Add, 0/0|1/6, 0:2x2x6x12|1:1)\n        DW2/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, RandomStandardNormal, 0/0|1/1, 0:4)\n        DW2/Initializer/random_normal/mean (0/0 params, 0/0 flops, Const, 0/0|1/1, )\n        DW2/Initializer/random_normal/mul (0/0 params, 288/288 flops, Mul, 0/0|1/1, 0:2x2x6x12|1:1)\n        DW2/Initializer/random_normal/shape (0/0 params, 0/0 flops, Const, 0/0|1/1, )\n        DW2/Initializer/random_normal/stddev (0/0 params, 0/0 flops, Const, 0/0|1/1, )\n    DW2/read (0/0 params, 0/0 flops, /job:localhost/replica:0/task:0/device:cpu:0, /job:localhost/replica:0/task:0/device:cpu:0|Identity, 1/1|1/1, 0:2x2x6x12)\n  ScalarW (1, 1/1 params, 0/2 flops, VariableV2|_trainable_variables, 0/0|1/10, )\n    ScalarW/Assign (0/0 params, 0/0 flops, Assign, 0/0|1/1, 0:1|1:1)\n    ScalarW/Initializer (0/0 params, 0/2 flops, _kTFScopeParent, 0/0|1/7, )\n      ScalarW/Initializer/random_normal (0/0 params, 1/2 flops, Add, 0/0|1/6, 0:1|1:1)\n        ScalarW/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, RandomStandardNormal, 0/0|1/1, 0:0)\n        ScalarW/Initializer/random_normal/mean (0/0 params, 0/0 flops, Const, 0/0|1/1, )\n        ScalarW/Initializer/random_normal/mul (0/0 params, 1/1 flops, Mul, 0/0|1/1, 0:1|1:1)\n        ScalarW/Initializer/random_normal/shape (0/0 params, 0/0 flops, Const, 0/0|1/1, )\n        ScalarW/Initializer/random_normal/stddev (0/0 params, 0/0 flops, Const, 0/0|1/1, )\n    ScalarW/read (0/0 params, 0/0 flops, Identity, 0/0|1/1, 0:1)\n  _retval_Conv2D_1_0_0 (0/0 params, 0/0 flops, /job:localhost/replica:0/task:0/device:cpu:0, /job:localhost/replica:0/task:0/device:cpu:0|_retval_Conv2D_1_0_0, 1/1|1/1, )\n  init (0/0 params, 0/0 flops, NoOp, 0/0|1/1, 0:1|1:3x3x3x6|2:2x2x6x12)\n  zeros (0/0 params, 0/0 flops, /job:localhost/replica:0/task:0/device:cpu:0, /job:localhost/replica:0/task:0/device:cpu:0|Const, 1/1|1/1, )\n',
-            f.read())
-        # pylint: enable=line-too-long
-
   def testSimpleCodeView(self):
     ops.reset_default_graph()
     outfile = os.path.join(test.get_temp_dir(), 'dump')
@@ -195,7 +196,7 @@ class PrintModelAnalysisTest(test.TestCase):
         # pylint: disable=line-too-long
         self.assertEqual(
             'node name | requested bytes | # parameters | # float_ops | assigned devices | in',
-            f.read()[0:80])
+            lib.CheckAndRemoveDoc(f.read())[0:80])
         # pylint: enable=line-too-long
 
   def testComplexCodeView(self):
@@ -223,13 +224,15 @@ class PrintModelAnalysisTest(test.TestCase):
         # pylint: disable=line-too-long
         with gfile.Open(outfile, 'r') as f:
           lines = f.read().split('\n')
+          self.assertGreater(len(lines), 5)
           result = '\n'.join([l[:min(len(l), 80)] for l in lines])
-          self.assertEqual(compat.as_bytes('node name | # parameters | # float_ops\n_TFProfRoot (--/2.84k params, --/168.85k flops)\n  model_analyzer_testlib.py:63:BuildFullModel (0/1.80k params, 0/45.37k flops)\n    model_analyzer_testlib.py:40:BuildSmallModel (0/0 params, 0/0 flops)\n    model_analyzer_testlib.py:44:BuildSmallModel (0/4 params, 0/8 flops)\n    model_analyzer_testlib.py:48:BuildSmallModel (0/648 params, 0/1.30k flops)\n    model_analyzer_testlib.py:49:BuildSmallModel (0/0 params, 0/23.33k flops)\n    model_analyzer_testlib.py:53:BuildSmallModel (0/1.15k params, 0/2.30k flops)\n    model_analyzer_testlib.py:54:BuildSmallModel (0/0 params, 0/18.43k flops)\n  model_analyzer_testlib.py:63:BuildFullModel (gradient) (0/0 params, 0/67.39k f\n    model_analyzer_testlib.py:49:BuildSmallModel (gradient) (0/0 params, 0/46.66\n    model_analyzer_testlib.py:54:BuildSmallModel (gradient) (0/0 params, 0/20.74\n  model_analyzer_testlib.py:67:BuildFullModel (0/1.04k params, 0/18.57k flops)\n  model_analyzer_testlib.py:67:BuildFullModel (gradient) (0/0 params, 0/37.00k f\n  model_analyzer_testlib.py:69:BuildFullModel (0/0 params, 0/0 flops)\n  model_analyzer_testlib.py:70:BuildFullModel (0/0 params, 0/258 flops)\n  model_analyzer_testlib.py:70:BuildFullModel (gradient) (0/0 params, 0/129 flop\n  model_analyzer_testlib.py:72:BuildFullModel (0/0 params, 0/141 flops)\n'),
-                           compat.as_bytes(result))
+          self.assertTrue(
+              compat.as_text(lib.CheckAndRemoveDoc(result))
+              .startswith('node name | # parameters | # float_ops'))
 
         self.assertLess(0, tfprof_node.total_exec_micros)
         self.assertEqual(2844, tfprof_node.total_parameters)
-        self.assertEqual(168854, tfprof_node.total_float_ops)
+        self.assertLess(168800, tfprof_node.total_float_ops)
         self.assertEqual(8, len(tfprof_node.children))
         self.assertEqual('_TFProfRoot', tfprof_node.name)
         self.assertEqual(
@@ -346,8 +349,9 @@ class PrintModelAnalysisTest(test.TestCase):
       with gfile.Open(outfile, 'r') as f:
         # pylint: disable=line-too-long
         self.assertEqual(
-            'nodename|requestedbytes|peakbytes|residualbytes|outputbytes|totalexecutiontime|acceleratorexecutiontime|cpuexecutiontime|#parameters|opoccurrence(run|defined)|inputshapes\nConst0B(0',
-            f.read().replace('\t', '').replace(' ', '')[0:180])
+            'nodename|requestedbytes|peakbytes|residualbytes|outputbytes|totalexecutiontime|acceleratorexecutiontime|cpuexecutiontime|#parameters|opoccurrence(run|defined)|inputshapes',
+            lib.CheckAndRemoveDoc(f.read()).replace('\t',
+                                                    '').replace(' ', '')[0:170])
         # pylint: enable=line-too-long
 
       total_children = 0
@@ -370,7 +374,6 @@ class PrintModelAnalysisTest(test.TestCase):
         self.assertLessEqual(len(tfprof_node.graph_nodes), last_occurrence)
         last_occurrence = len(tfprof_node.graph_nodes)
 
-      self.assertEqual(total_children, 15)
       self.assertGreater(input_shapes, 0)
 
   def testAdvisor(self):
@@ -694,6 +697,101 @@ class PrintModelAnalysisTest(test.TestCase):
                       exception_str)
       self.assertTrue(mat is None)
 
+  def testTrackPersistentBytes(self):
+    ops.reset_default_graph()
+    a = array_ops.constant(np.ones((100, 100)))
+    b = array_ops.constant(np.ones((100, 100)))
+    c = a * b
+
+    with session.Session() as sess:
+      run_options = config_pb2.RunOptions(
+          trace_level=config_pb2.RunOptions.FULL_TRACE)
+      run_metadata = config_pb2.RunMetadata()
+      sess.run(c, options=run_options, run_metadata=run_metadata)
+
+      options = option_builder.ProfileOptionBuilder.time_and_memory()
+      options['min_bytes'] = 0
+      options['select'] = ('bytes', 'peak_bytes', 'output_bytes',
+                           'residual_bytes')
+      ret = model_analyzer.profile(
+          sess.graph, run_meta=run_metadata, cmd='scope', options=options)
+
+      run_metadata = config_pb2.RunMetadata()
+      sess.run(c, options=run_options, run_metadata=run_metadata)
+      ret2 = model_analyzer.profile(
+          sess.graph, run_meta=run_metadata, cmd='scope', options=options)
+
+      n = lib.SearchTFProfNode(ret, 'mul')
+      n2 = lib.SearchTFProfNode(ret2, 'mul')
+      self.assertGreater(n.peak_bytes, 0)
+      self.assertGreater(n.output_bytes, 0)
+      self.assertGreater(n.residual_bytes, 0)
+      self.assertEqual(n.peak_bytes, n2.peak_bytes)
+      self.assertEqual(n.output_bytes, n2.output_bytes)
+      self.assertEqual(n.residual_bytes, n2.residual_bytes)
+
+  def testTraceLoopBytes(self):
+    if not test.is_gpu_available(): return
+    ops.reset_default_graph()
+    steps = 100
+
+    with ops.device('/gpu:0'):
+      x = array_ops.ones((100, 100), dtype=dtypes.float32)
+      n = array_ops.constant(steps, dtype=dtypes.int32)
+      x1 = array_ops.ones((100, 100))
+
+      x *= x1
+      def loop_body(i, x):
+        x *= x
+        return i + 1, x
+
+      _, y = control_flow_ops.while_loop(
+          lambda i, x: i < n, loop_body,
+          [array_ops.constant(0), x])
+
+    grad = gradients.gradients(y, [x1])
+
+    with session.Session() as sess:
+      run_options = config_pb2.RunOptions(
+          trace_level=config_pb2.RunOptions.FULL_TRACE)
+      run_metadata = config_pb2.RunMetadata()
+      sess.run(grad, options=run_options, run_metadata=run_metadata)
+
+      options = option_builder.ProfileOptionBuilder.time_and_memory()
+      options['min_bytes'] = 0
+      options['min_micros'] = 0
+      options['select'] = ('bytes', 'peak_bytes', 'output_bytes',
+                           'residual_bytes')
+      options['output'] = 'none'
+      ret_pb = model_analyzer.profile(
+          sess.graph, run_meta=run_metadata, cmd='scope', options=options)
+      self.assertGreater(ret_pb.total_requested_bytes, 1000000)
+
+  def testEager(self):
+    ops.reset_default_graph()
+    with context.eager_mode():
+      outfile = os.path.join(test.get_temp_dir(), 'dump')
+      opts = builder(
+          builder.time_and_memory()).with_file_output(outfile).build()
+      context.enable_run_metadata()
+      lib.BuildSmallModel()
+
+      profiler = model_analyzer.Profiler()
+      profiler.add_step(0, context.export_run_metadata())
+      context.disable_run_metadata()
+      profiler.profile_operations(opts)
+      with gfile.Open(outfile, 'r') as f:
+        out_str = f.read()
+        self.assertTrue('Conv2D' in out_str)
+        self.assertTrue('VarHandleOp' in out_str)
+
+      with gfile.Open('/tmp/eager_profile', 'wb') as f:
+        profile_pb = tfprof_log_pb2.ProfileProto()
+        profile_pb.ParseFromString(profiler.serialize_to_string())
+        profile_pb_str = '%s' % profile_pb
+        self.assertTrue('Conv2D' in profile_pb_str)
+        self.assertTrue('VarHandleOp' in profile_pb_str)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/profiler/option_builder.py b/tensorflow/python/profiler/option_builder.py
index 13942ad6a2adc1f1d1cad778ebd280d358f64a59..2ad7adf76933df65ca795dca361397f436adb995 100644
--- a/tensorflow/python/profiler/option_builder.py
+++ b/tensorflow/python/profiler/option_builder.py
@@ -20,8 +20,10 @@ from __future__ import print_function
 import copy
 
 from tensorflow.python.profiler import tfprof_logger
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('profiler.ProfileOptionBuilder')
 class ProfileOptionBuilder(object):
   # pylint: disable=line-too-long
   """Option Builder for Profiling API.
@@ -298,7 +300,7 @@ class ProfileOptionBuilder(object):
     # pylint: disable=line-too-long
     """Only show profiler nodes consuming no less than 'min_float_ops'.
 
-    Please see https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profilerg3doc/profile_model_architecture.md
+    Please see https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler/g3doc/profile_model_architecture.md
     on the caveats of calculating float operations.
 
     Args:
diff --git a/tensorflow/python/profiler/profile_context.py b/tensorflow/python/profiler/profile_context.py
index c7c7ad63012a153d41aa9d616dbd39acb46096f6..18eb66ef988c9f49eb04264545d417d8a986e16e 100644
--- a/tensorflow/python/profiler/profile_context.py
+++ b/tensorflow/python/profiler/profile_context.py
@@ -50,11 +50,12 @@ def _profiled_run(self,
   """Overwrites the session.run()."""
   # pylint: disable=protected-access
   # Count the session steps.
-  with self.profile_context._new_step() as step:
+  with self.profile_context._new_step() as state:
+    step, locked = state
     # Fast path if no need for profiling.
-    if not self.profile_context._is_fast_path():
+    if locked and not self.profile_context._is_fast_path(step):
       # Maybe trace this step.
-      if self.profile_context._should_trace(self.graph, fetches):
+      if self.profile_context._should_trace(step, self.graph, fetches):
         if self.profile_context._debug:
           sys.stderr.write('debug: tracing step: %d\n' % step)
         # Enable tracing, perform auto profiling or auto dump.
@@ -81,7 +82,7 @@ def _profiled_run(self,
         ret = self._profiler_run_internal(fetches, feed_dict, options)
 
       # Maybe dump profile.
-      self.profile_context._maybe_dump()
+      self.profile_context._maybe_dump(step)
 
       # Maybe profile:
       to_profiles = self.profile_context._profile_candidates()
@@ -225,26 +226,26 @@ class ProfileContext(object):
     self._dump_next_step = True
     self._slow_path_steps.add(self._step)
 
-  def _is_fast_path(self):
-    if self._step in self._slow_path_steps:
+  def _is_fast_path(self, step):
+    if step in self._slow_path_steps:
       return False
     # When user doesn't set the tracing steps explicitly, auto decide it.
-    if (self._auto_tracing and self._step > WARMUP_STEPS and
+    if (self._auto_tracing and step > WARMUP_STEPS and
         self._traced_steps <= MAX_TRACED_STEPS):
       return False
     return True
 
-  def _should_trace(self, graph, fetches):
+  def _should_trace(self, step, graph, fetches):
     """Whether should do tracing at current step."""
     if self._traced_steps > MAX_TRACED_STEPS:
       return False
     # Check user-set tracing steps.
-    if self._step in self._trace_steps or self._trace_next_step:
+    if step in self._trace_steps or self._trace_next_step:
       self._traced_steps += 1
       return True
 
     # If no user-set tracing steps set and passes warm up steps, auto trace.
-    if self._auto_tracing and self._step > WARMUP_STEPS:
+    if self._auto_tracing and step > WARMUP_STEPS:
       # If the fetches have not been seen before, trace it.
       with graph.as_default():
         fetch_names = [f.name for f in
@@ -257,23 +258,23 @@ class ProfileContext(object):
         self._traced_steps += 1
         return True
       # If the trace coverage is low, does some random tracing.
-      if (self.profiler._coverage < 0.5 and self._step < MAX_TRACED_STEPS and  # pylint: disable=protected-access
+      if (self.profiler._coverage < 0.5 and step < MAX_TRACED_STEPS and  # pylint: disable=protected-access
           self._rng.randint(0, 10) < 2):
         self._traced_steps += 1
         return True
     return False
 
-  def _maybe_dump(self):
+  def _maybe_dump(self, step):
     """Maybe dump the profile file."""
-    if not (self._step in self._dump_steps or self._dump_next_step):
+    if not (step in self._dump_steps or self._dump_next_step):
       return
     if self._debug:
-      sys.stderr.write('debug: dumping file at step: %d\n' % self._step)
+      sys.stderr.write('debug: dumping file at step: %d\n' % step)
     if not gfile.Exists(self._profiler_dir):
       gfile.MakeDirs(self._profiler_dir)
 
     filename = os.path.join(compat.as_bytes(self._profiler_dir),
-                            compat.as_bytes('profile_%d' % self._step))
+                            compat.as_bytes('profile_%d' % step))
     self.profiler._write_profile(filename)  # pylint: disable=protected-access
 
   def _dump_file(self, pb, basename):
@@ -284,11 +285,13 @@ class ProfileContext(object):
 
   @contextlib.contextmanager
   def _new_step(self):
-    with self._lock:
-      yield self._step
-      self._step += 1
-      self._trace_next_step = False
-      self._dump_next_step = False
+    acquired = self._lock.acquire(False)
+    yield (self._step, acquired)
+    self._step += 1
+    self._trace_next_step = False
+    self._dump_next_step = False
+    if acquired:
+      self._lock.release()
 
   def _profile_candidates(self):
     to_profile = []
diff --git a/tensorflow/python/profiler/profiler.py b/tensorflow/python/profiler/profiler.py
index 130dcb5134d6f7e6eb43aebea803b366a5ce27d8..fa7f30b236997cecd6d5df98c334aa7f5cc571e4 100644
--- a/tensorflow/python/profiler/profiler.py
+++ b/tensorflow/python/profiler/profiler.py
@@ -31,6 +31,7 @@ from tensorflow.python.profiler.option_builder import ProfileOptionBuilder
 from tensorflow.python.profiler.tfprof_logger import write_op_log
 
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
 _allowed_symbols = [
@@ -48,6 +49,12 @@ _allowed_symbols.extend([
     'OpLogProto',
 ])
 
+# Export protos
+tf_export('profiler.GraphNodeProto')(GraphNodeProto)
+tf_export('profiler.MultiGraphNodeProto')(MultiGraphNodeProto)
+tf_export('profiler.AdviceProto')(AdviceProto)
+tf_export('profiler.OpLogProto')(OpLogProto)
+
 remove_undocumented(__name__, _allowed_symbols, [
     Profiler,
     profile,
diff --git a/tensorflow/python/profiler/tfprof_logger.py b/tensorflow/python/profiler/tfprof_logger.py
index 838064a1f0836a2041c2823f54fea4e6b5606d7f..8d121064967f2f87cd0aefaa361bfd6f387a3e6e 100644
--- a/tensorflow/python/profiler/tfprof_logger.py
+++ b/tensorflow/python/profiler/tfprof_logger.py
@@ -25,10 +25,12 @@ import sys
 
 import six
 from tensorflow.core.profiler import tfprof_log_pb2
+from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.platform import gfile
 from tensorflow.python.profiler.internal import flops_registry  # pylint: disable=unused-import
+from tensorflow.python.util.tf_export import tf_export
 
 TRAINABLE_VARIABLES = '_trainable_variables'
 REGISTERED_FLOP_STATS = 'flops'
@@ -139,12 +141,13 @@ def _get_logged_ops(graph, run_meta=None, add_trace=True,
   return logged_ops, string_to_id
 
 
-def _merge_default_with_oplog(graph, op_log=None, run_meta=None,
-                              add_trace=True, add_trainable_var=True):
+def merge_default_with_oplog(graph, op_log=None, run_meta=None,
+                             add_trace=True, add_trainable_var=True):
   """Merge the tfprof default extra info with caller's op_log.
 
   Args:
-    graph: tf.Graph.
+    graph: tf.Graph. If None and eager execution is not enabled, use
+        default graph.
     op_log: OpLogProto proto.
     run_meta: RunMetadata proto used to complete shape information.
     add_trace: Whether to add op trace information.
@@ -153,7 +156,13 @@ def _merge_default_with_oplog(graph, op_log=None, run_meta=None,
   Returns:
     tmp_op_log: Merged OpLogProto proto.
   """
+  if not graph and context.in_graph_mode():
+    graph = ops.get_default_graph()
+
   tmp_op_log = tfprof_log_pb2.OpLogProto()
+  if not graph:
+    return tmp_op_log
+
   logged_ops, string_to_id = _get_logged_ops(
       graph, run_meta, add_trace=add_trace, add_trainable_var=add_trainable_var)
 
@@ -179,6 +188,7 @@ def _merge_default_with_oplog(graph, op_log=None, run_meta=None,
   return tmp_op_log
 
 
+@tf_export('profiler.write_op_log')
 def write_op_log(graph, log_dir, op_log=None, run_meta=None, add_trace=True):
   """Log provided 'op_log', and add additional model information below.
 
@@ -190,7 +200,8 @@ def write_op_log(graph, log_dir, op_log=None, run_meta=None, add_trace=True):
     information with best effort.
 
   Args:
-    graph: tf.Graph.
+    graph: tf.Graph. If None and eager execution is not enabled, use
+        default graph.
     log_dir: directory to write the log file.
     op_log: (Optional) OpLogProto proto to be written. If not provided, an new
         one is created.
@@ -199,7 +210,9 @@ def write_op_log(graph, log_dir, op_log=None, run_meta=None, add_trace=True):
     add_trace: Whether to add python code trace information.
         Used to support "code" view.
   """
-  op_log = _merge_default_with_oplog(graph, op_log, run_meta, add_trace)
+  if not graph and context.in_graph_mode():
+    graph = ops.get_default_graph()
+  op_log = merge_default_with_oplog(graph, op_log, run_meta, add_trace)
 
   with gfile.Open(os.path.join(log_dir, 'tfprof_log'), 'w') as log:
     log.write(op_log.SerializeToString())
diff --git a/tensorflow/python/pywrap_tensorflow.py b/tensorflow/python/pywrap_tensorflow.py
index 91373fa544b62e1b4760a92bf6630edf0c7f1ee4..5c0c5783dce19ec8fa1b090827d06d203e83de68 100644
--- a/tensorflow/python/pywrap_tensorflow.py
+++ b/tensorflow/python/pywrap_tensorflow.py
@@ -60,6 +60,7 @@ try:
   from tensorflow.python.pywrap_tensorflow_internal import __git_version__
   from tensorflow.python.pywrap_tensorflow_internal import __compiler_version__
   from tensorflow.python.pywrap_tensorflow_internal import __cxx11_abi_flag__
+  from tensorflow.python.pywrap_tensorflow_internal import __monolithic_build__
 
   if _use_dlopen_global_flags:
     pywrap_dlopen_global_flags.reset_dlopen_flags()
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index 82b154164e85a1044860ef501c3d32cd00eb6fde..50f481d29e9d39bd12741b5f9e02b7201336134d 100644
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -18,21 +18,31 @@ limitations under the License.
 %rename("%s") TFE_NewContext;
 %rename("%s") TFE_DeleteContext;
 %rename("%s") TFE_ContextListDevices;
+%rename("%s") TFE_ContextAddFunction;
 %rename("%s") TFE_ContextAddFunctionDef;
+%rename("%s") TFE_ContextEnableRunMetadata;
+%rename("%s") TFE_ContextDisableRunMetadata;
+%rename("%s") TFE_ContextExportRunMetadata;
+%rename("%s") TFE_ContextClearCaches;
+%rename("%s") TFE_ContextGetDevicePlacementPolicy;
+%rename("%s") TFE_ContextSetThreadLocalDevicePlacementPolicy;
 %rename("%s") TFE_OpNameGetAttrType;
 %rename("%s") TFE_Py_InitEagerTensor;
 %rename("%s") TFE_Py_RegisterExceptionClass;
+%rename("%s") TFE_Py_RegisterFallbackExceptionClass;
 %rename("%s") TFE_Py_Execute;
+%rename("%s") TFE_Py_FastPathExecute;
 %rename("%s") TFE_Py_UID;
-%rename("%s") TFE_Py_TapeStackPushNew;
-%rename("%s") TFE_Py_TapeStackPush;
-%rename("%s") TFE_Py_TapeStackPop;
-%rename("%s") TFE_Py_TapeStackIsEmpty;
-%rename("%s") TFE_Py_TapeStackShouldRecord;
-%rename("%s") TFE_Py_TapeStackWatch;
-%rename("%s") TFE_Py_TapeStackDeleteTrace;
-%rename("%s") TFE_Py_TapeStackRecordOperation;
-%rename("%s") TFE_Py_TapeStackWatchVariable;
+%rename("%s") TFE_Py_TapeSetNew;
+%rename("%s") TFE_Py_TapeSetRemove;
+%rename("%s") TFE_Py_TapeSetStopOnThread;
+%rename("%s") TFE_Py_TapeSetRestartOnThread;
+%rename("%s") TFE_Py_TapeSetIsEmpty;
+%rename("%s") TFE_Py_TapeSetShouldRecord;
+%rename("%s") TFE_Py_TapeSetWatch;
+%rename("%s") TFE_Py_TapeSetDeleteTrace;
+%rename("%s") TFE_Py_TapeSetRecordOperation;
+%rename("%s") TFE_Py_TapeSetWatchVariable;
 %rename("%s") TFE_Py_TapeGradient;
 %rename("%s") TFE_Py_TapeWatchedVariables;
 %rename("%s") TFE_NewContextOptions;
@@ -112,6 +122,7 @@ limitations under the License.
 %rename("%s") TFE_DEVICE_PLACEMENT_EXPLICIT;
 %rename("%s") TFE_DEVICE_PLACEMENT_WARN;
 %rename("%s") TFE_DEVICE_PLACEMENT_SILENT;
+%rename("%s") TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32;
 
 %include "tensorflow/c/eager/c_api.h"
 
@@ -174,10 +185,14 @@ limitations under the License.
   }
 }
 
+// SWIG usually unwraps the tuple that the native Python/C interface generates.
+// Since we wanted to have a function with a variable length of arguments, we
+// used the native Python/C interface directly (which by default supports
+// passing all arguments as a tuple).
+%native(TFE_Py_FastPathExecute) TFE_Py_FastPathExecute_C;
 
 %include "tensorflow/python/eager/pywrap_tfe.h"
 
-
 // Clear all typemaps.
 %typemap(out) TF_DataType;
 %typemap(out) int64_t;
diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD
index 39c6439811604c5b175c75c24e682b346fde09fc..30e0a099d8b2e30cff36b69164ba9f1789dd8916 100644
--- a/tensorflow/python/saved_model/BUILD
+++ b/tensorflow/python/saved_model/BUILD
@@ -25,6 +25,7 @@ py_library(
         ":main_op",
         ":signature_constants",
         ":signature_def_utils",
+        ":simple_save",
         ":tag_constants",
         ":utils",
         "//tensorflow/python:util",
@@ -89,6 +90,23 @@ py_library(
     ],
 )
 
+py_library(
+    name = "simple_save",
+    srcs = [
+        "simple_save.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":builder",
+        ":signature_constants",
+        ":signature_def_utils",
+        ":tag_constants",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:util",
+    ],
+)
+
 py_library(
     name = "main_op",
     srcs = [
@@ -130,6 +148,7 @@ py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:saver_test_utils",
         "//tensorflow/python:state_ops",
+        "//tensorflow/python:test_ops",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
     ],
@@ -198,6 +217,22 @@ py_test(
     ],
 )
 
+py_test(
+    name = "simple_save_test",
+    size = "small",
+    srcs = ["simple_save_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":loader",
+        ":signature_constants",
+        ":simple_save",
+        ":tag_constants",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:variables",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 # Google-internal targets.  These must be at the end for syncrepo.
 
diff --git a/tensorflow/python/saved_model/README.md b/tensorflow/python/saved_model/README.md
index 8213e52ce9c004c9b9c53b76e08a028508703d06..5eeaf73a4370b0558a2c11d17a3546171b886a69 100644
--- a/tensorflow/python/saved_model/README.md
+++ b/tensorflow/python/saved_model/README.md
@@ -93,7 +93,7 @@ with an asset of the same name, only the first version is retained.
 Each meta graph added to the SavedModel must be annotated with user specified
 tags. The tags provide a means to identify the specific meta graph to load and
 restore, along with the shared set of variables and assets. These tags
-typically annotate a MetaGraph with it's functionality (e.g. serving or
+typically annotate a MetaGraph with its functionality (e.g. serving or
 training), and possibly hardware specific aspects such as GPU.
 
 #### Usage
@@ -117,6 +117,35 @@ with tf.Session(graph=tf.Graph()) as sess:
 builder.save()
 ~~~
 
+#### Stripping Default valued attributes
+The SavedModelBuilder class allows users to control whether default-valued
+attributes must be stripped from the NodeDefs while adding a meta graph to the
+SavedModel bundle. Both `SavedModelBuilder.add_meta_graph_and_variables` and
+`SavedModelBuilder.add_meta_graph` methods accept a Boolean flag
+`strip_default_attrs` that controls this behavior.
+
+If `strip_default_attrs` is `False`, the exported MetaGraphDef will have the
+default valued attributes in all it's NodeDef instances. This can break forward
+compatibility with a sequence of events such as the following:
+
+* An existing Op (`Foo`) is updated to include a new attribute (`T`) with a
+  default (`bool`) at version 101.
+* A model producer (such as a Trainer) binary picks up this change
+  (version 101) to the OpDef and re-exports an existing model that uses Op `Foo`.
+* A model consumer (such as Tensorflow Serving) running an older binary
+  (version 100) doesn't have attribute `T` for Op `Foo`, but tries to import
+  this model. The model consumer doesn't recognize attribute `T` in a NodeDef
+  that uses Op `Foo` and therefore fails to load the model.
+
+By setting `strip_default_attrs` to `True`, the model producers can strip away
+any default valued attributes in the NodeDefs. This helps ensure that newly
+added attributes with defaults don't cause older model consumers to fail loading
+models regenerated with newer training binaries.
+
+TIP: If you care about forward compatibility, then set `strip_default_attrs`
+to `True` while using `SavedModelBuilder.add_meta_graph_and_variables` and
+`SavedModelBuilder.add_meta_graph`.
+
 ### Loader
 The SavedModel loader is implemented in C++ and Python.
 
diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py
index 16651ffebc5f5911d7c270425f599036a8e80e0c..7347da75364818b95d3f2ad7dfa74a8c3614b161 100644
--- a/tensorflow/python/saved_model/builder_impl.py
+++ b/tensorflow/python/saved_model/builder_impl.py
@@ -34,8 +34,10 @@ from tensorflow.python.platform import tf_logging
 from tensorflow.python.saved_model import constants
 from tensorflow.python.training import saver as tf_saver
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("saved_model.builder.SavedModelBuilder")
 class SavedModelBuilder(object):
   """Builds the `SavedModel` protocol buffer and saves variables and assets.
 
@@ -239,7 +241,9 @@ class SavedModelBuilder(object):
                      assets_collection=None,
                      legacy_init_op=None,
                      clear_devices=False,
-                     main_op=None):
+                     main_op=None,
+                     strip_default_attrs=False):
+    # pylint: disable=line-too-long
     """Adds the current meta graph to the SavedModel.
 
     Creates a Saver in the current scope and uses the Saver to export the meta
@@ -260,11 +264,15 @@ class SavedModelBuilder(object):
       main_op: Op or group of ops to execute when the graph is loaded. Note
           that when the main_op is specified it is run after the restore op at
           load-time.
+      strip_default_attrs: Boolean. If `True`, default-valued attributes will be
+        removed from the NodeDefs. For a detailed guide, see
+        [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
 
     Raises:
       AssertionError: If the variables for the SavedModel have not been saved
           yet, or if the graph already contains one or more legacy init ops.
     """
+    # pylint: enable=line-too-long
     if not self._has_saved_variables:
       raise AssertionError(
           "Graph state including variables and assets has not been saved yet. "
@@ -299,7 +307,8 @@ class SavedModelBuilder(object):
     # there are edge cases where that option breaks the graph.  Until that is
     # resolved, we just leave the option set to False for now.
     # TODO(soergel): Reinstate clear_extraneous_savers=True when possible.
-    meta_graph_def = saver.export_meta_graph(clear_devices=clear_devices)
+    meta_graph_def = saver.export_meta_graph(
+        clear_devices=clear_devices, strip_default_attrs=strip_default_attrs)
 
     # Tag the meta graph def and add it to the SavedModel.
     self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)
@@ -311,7 +320,9 @@ class SavedModelBuilder(object):
                                    assets_collection=None,
                                    legacy_init_op=None,
                                    clear_devices=False,
-                                   main_op=None):
+                                   main_op=None,
+                                   strip_default_attrs=False):
+    # pylint: disable=line-too-long
     """Adds the current meta graph to the SavedModel and saves variables.
 
     Creates a Saver to save the variables from the provided session. Exports the
@@ -334,7 +345,11 @@ class SavedModelBuilder(object):
       main_op: Op or group of ops to execute when the graph is loaded. Note
           that when the main_op is specified it is run after the restore op at
           load-time.
+      strip_default_attrs: Boolean. If `True`, default-valued attributes will be
+        removed from the NodeDefs. For a detailed guide, see
+        [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
     """
+    # pylint: enable=line-too-long
     if self._has_saved_variables:
       raise AssertionError("Graph state including variables and assets has "
                            "already been saved. Please invoke "
@@ -388,7 +403,8 @@ class SavedModelBuilder(object):
     # there are edge cases where that option breaks the graph.  Until that is
     # resolved, we just leave the option set to False for now.
     # TODO(soergel): Reinstate clear_extraneous_savers=True when possible.
-    meta_graph_def = saver.export_meta_graph(clear_devices=clear_devices)
+    meta_graph_def = saver.export_meta_graph(
+        clear_devices=clear_devices, strip_default_attrs=strip_default_attrs)
 
     # Tag the meta graph def and add it to the SavedModel.
     self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)
diff --git a/tensorflow/python/saved_model/constants.py b/tensorflow/python/saved_model/constants.py
index 7e3e8df47fb0e024eae8add6a788d632709740af..ec49a0539ff52f6cc69bb24483ede657b698ab8d 100644
--- a/tensorflow/python/saved_model/constants.py
+++ b/tensorflow/python/saved_model/constants.py
@@ -20,33 +20,52 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 # Subdirectory name containing the asset files.
 ASSETS_DIRECTORY = "assets"
+tf_export("saved_model.constants.ASSETS_DIRECTORY").export_constant(
+    __name__, "ASSETS_DIRECTORY")
 
 # CollectionDef key containing SavedModel assets.
 ASSETS_KEY = "saved_model_assets"
+tf_export("saved_model.constants.ASSETS_KEY").export_constant(
+    __name__, "ASSETS_KEY")
 
 # CollectionDef key for the legacy init op.
 LEGACY_INIT_OP_KEY = "legacy_init_op"
+tf_export("saved_model.constants.LEGACY_INIT_OP_KEY").export_constant(
+    __name__, "LEGACY_INIT_OP_KEY")
 
 # CollectionDef key for the SavedModel main op.
 MAIN_OP_KEY = "saved_model_main_op"
+tf_export("saved_model.constants.MAIN_OP_KEY").export_constant(
+    __name__, "MAIN_OP_KEY")
 
 # Schema version for SavedModel.
 SAVED_MODEL_SCHEMA_VERSION = 1
+tf_export("saved_model.constants.SAVED_MODEL_SCHEMA_VERSION").export_constant(
+    __name__, "SAVED_MODEL_SCHEMA_VERSION")
 
 # File name for SavedModel protocol buffer.
 SAVED_MODEL_FILENAME_PB = "saved_model.pb"
+tf_export("saved_model.constants.SAVED_MODEL_FILENAME_PB").export_constant(
+    __name__, "SAVED_MODEL_FILENAME_PB")
 
 # File name for text version of SavedModel protocol buffer.
 SAVED_MODEL_FILENAME_PBTXT = "saved_model.pbtxt"
+tf_export("saved_model.constants.SAVED_MODEL_FILENAME_PBTXT").export_constant(
+    __name__, "SAVED_MODEL_FILENAME_PBTXT")
 
 # Subdirectory name containing the variables/checkpoint files.
 VARIABLES_DIRECTORY = "variables"
+tf_export("saved_model.constants.VARIABLES_DIRECTORY").export_constant(
+    __name__, "VARIABLES_DIRECTORY")
 
 # File name used for variables.
 VARIABLES_FILENAME = "variables"
+tf_export("saved_model.constants.VARIABLES_FILENAME").export_constant(
+    __name__, "VARIABLES_FILENAME")
 
 
 _allowed_symbols = [
diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py
index 5ff954fd9f83989565e007cad3f0f66913e0a4dd..bebf1d5e0d3cc6ac0e431230577704365d37a437 100644
--- a/tensorflow/python/saved_model/loader_impl.py
+++ b/tensorflow/python/saved_model/loader_impl.py
@@ -32,6 +32,7 @@ from tensorflow.python.platform import tf_logging
 from tensorflow.python.saved_model import constants
 from tensorflow.python.training import saver as tf_saver
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _parse_saved_model(export_dir):
@@ -156,6 +157,7 @@ def _get_legacy_init_op_tensor(meta_graph_def_to_load):
   return legacy_init_op_tensor
 
 
+@tf_export("saved_model.loader.maybe_saved_model_directory")
 def maybe_saved_model_directory(export_dir):
   """Checks whether the provided export directory could contain a SavedModel.
 
@@ -176,6 +178,7 @@ def maybe_saved_model_directory(export_dir):
   return file_io.file_exists(txt_path) or file_io.file_exists(pb_path)
 
 
+@tf_export("saved_model.loader.load")
 def load(sess, tags, export_dir, **saver_kwargs):
   """Loads the model from a SavedModel as specified by tags.
 
@@ -232,13 +235,10 @@ def load(sess, tags, export_dir, **saver_kwargs):
     asset_tensors_dictionary = _get_asset_tensors(export_dir,
                                                   meta_graph_def_to_load)
 
-    main_op_tensor = _get_main_op_tensor(meta_graph_def_to_load)
+    main_op_tensor = (
+        _get_main_op_tensor(meta_graph_def_to_load) or
+        (_get_legacy_init_op_tensor(meta_graph_def_to_load)))
     if main_op_tensor is not None:
       sess.run(fetches=[main_op_tensor], feed_dict=asset_tensors_dictionary)
-    else:
-      legacy_init_op_tensor = _get_legacy_init_op_tensor(meta_graph_def_to_load)
-      if legacy_init_op_tensor is not None:
-        sess.run(
-            fetches=[legacy_init_op_tensor], feed_dict=asset_tensors_dictionary)
 
     return meta_graph_def_to_load
diff --git a/tensorflow/python/saved_model/main_op_impl.py b/tensorflow/python/saved_model/main_op_impl.py
index 355fd57bf1d2166f58a5fdc95d04695ea05b56b3..631ee63729513d24c2ddae71b771f7cf1695358f 100644
--- a/tensorflow/python/saved_model/main_op_impl.py
+++ b/tensorflow/python/saved_model/main_op_impl.py
@@ -22,8 +22,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('saved_model.main_op.main_op')
 def main_op():
   """Returns a main op to init variables and tables.
 
@@ -40,6 +42,7 @@ def main_op():
 
 
 # TODO(sukritiramesh): Integrate with Saver for complete restore functionality.
+@tf_export('saved_model.main_op.main_op_with_restore')
 def main_op_with_restore(restore_op_name):
   """Returns a main op to init variables, tables and restore the graph.
 
diff --git a/tensorflow/python/saved_model/saved_model.py b/tensorflow/python/saved_model/saved_model.py
index 8c59f7afe778006605da31dc82fb6bbfe883f087..caabd7bc30455b55e89711a1ccab6238971f595e 100644
--- a/tensorflow/python/saved_model/saved_model.py
+++ b/tensorflow/python/saved_model/saved_model.py
@@ -30,6 +30,9 @@ from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.saved_model import utils
 # pylint: enable=unused-import
+# pylint: disable=wildcard-import
+from tensorflow.python.saved_model.simple_save import *
+# pylint: enable=wildcard-import
 
 from tensorflow.python.util.all_util import remove_undocumented
 
@@ -41,6 +44,7 @@ _allowed_symbols = [
     "main_op",
     "signature_constants",
     "signature_def_utils",
+    "simple_save",
     "tag_constants",
     "utils",
 ]
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index 92ca7dec6f63b50b33dde9909b4738676fb8c783..d9d316882584470769c14cf0c5f265b58e37ab43 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -25,8 +25,11 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
@@ -36,6 +39,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.saved_model import builder as saved_model_builder
 from tensorflow.python.saved_model import constants
 from tensorflow.python.saved_model import loader
+from tensorflow.python.saved_model import loader_impl
 from tensorflow.python.saved_model import main_op
 from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.saved_model import tag_constants
@@ -49,8 +53,14 @@ def tearDownModule():
   file_io.delete_recursively(test.get_temp_dir())
 
 
+@test_util.with_c_api
 class SavedModelTest(test.TestCase):
 
+  def _get_export_dir(self, label):
+    if ops._USE_C_API:
+      label += "_c_api"
+    return os.path.join(test.get_temp_dir(), label)
+
   def _init_and_validate_variable(self, sess, variable_name, variable_value):
     v = variables.Variable(variable_value, name=variable_name)
     sess.run(variables.global_variables_initializer())
@@ -118,8 +128,7 @@ class SavedModelTest(test.TestCase):
     self.assertFalse(loader.maybe_saved_model_directory(base_path))
 
   def testBadSavedModelFileFormat(self):
-    export_dir = os.path.join(test.get_temp_dir(),
-                              "test_bad_saved_model_file_format")
+    export_dir = self._get_export_dir("test_bad_saved_model_file_format")
     # Attempt to load a SavedModel from an export directory that does not exist.
     with self.test_session(graph=ops.Graph()) as sess:
       with self.assertRaisesRegexp(IOError,
@@ -152,8 +161,7 @@ class SavedModelTest(test.TestCase):
         loader.load(sess, ["foo"], export_dir)
 
   def testVerifySessionGraphUsage(self):
-    export_dir = os.path.join(test.get_temp_dir(),
-                              "test_verify_session_graph_usage")
+    export_dir = self._get_export_dir("test_verify_session_graph_usage")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.test_session(graph=ops.Graph()) as sess:
@@ -173,7 +181,7 @@ class SavedModelTest(test.TestCase):
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
   def testSequence(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_sequence")
+    export_dir = self._get_export_dir("test_sequence")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Expect an assertion error since add_meta_graph_and_variables() should be
@@ -190,7 +198,7 @@ class SavedModelTest(test.TestCase):
                         sess, ["baz"])
 
   def testTags(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_tags")
+    export_dir = self._get_export_dir("test_tags")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Graph with a single variable. SavedModel invoked to:
@@ -279,7 +287,7 @@ class SavedModelTest(test.TestCase):
                         export_dir)
 
   def testVariables(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_variables")
+    export_dir = self._get_export_dir("test_variables")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Graph with two variables. SavedModel invoked to:
@@ -331,7 +339,7 @@ class SavedModelTest(test.TestCase):
                         export_dir)
 
   def testGraphWithoutVariables(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_graph_has_variables")
+    export_dir = self._get_export_dir("test_graph_has_variables")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Graph with no variables.
@@ -366,7 +374,7 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(30.0, sess.run(c))
 
   def testNoOverwrite(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_no_overwrite")
+    export_dir = self._get_export_dir("test_no_overwrite")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Graph with a single variable. SavedModel invoked to:
@@ -390,7 +398,7 @@ class SavedModelTest(test.TestCase):
                       export_dir)
 
   def testSaveAsText(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_astext")
+    export_dir = self._get_export_dir("test_astext")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Graph with a single variable. SavedModel invoked to:
@@ -421,7 +429,7 @@ class SavedModelTest(test.TestCase):
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
   def testCollections(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_collections")
+    export_dir = self._get_export_dir("test_collections")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Graph with a single variable added to a collection. SavedModel invoked to:
@@ -471,7 +479,7 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(len(ops.get_collection("foo_vars")), 0)
 
   def testSignatureDefs(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_signature_defs")
+    export_dir = self._get_export_dir("test_signature_defs")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Graph with a single variable and a single entry in the signature def map.
@@ -531,8 +539,7 @@ class SavedModelTest(test.TestCase):
       self.assertEqual("foo_new", bar_signature["foo_key"].method_name)
 
   def testSignatureDefValidation(self):
-    export_dir = os.path.join(test.get_temp_dir(),
-                              "test_signature_def_validation")
+    export_dir = self._get_export_dir("test_signature_def_validation")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     tensor_without_name = meta_graph_pb2.TensorInfo()
@@ -550,7 +557,7 @@ class SavedModelTest(test.TestCase):
     self._validate_outputs_tensor_info(builder, tensor_empty)
 
   def testAssets(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_assets")
+    export_dir = self._get_export_dir("test_assets")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.test_session(graph=ops.Graph()) as sess:
@@ -583,7 +590,7 @@ class SavedModelTest(test.TestCase):
       self.assertFalse(file_io.file_exists(ignored_asset_path))
 
   def testCustomMainOp(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_main_op")
+    export_dir = self._get_export_dir("test_main_op")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.test_session(graph=ops.Graph()) as sess:
@@ -618,7 +625,7 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(3, ops.get_collection("v")[2].eval())
 
   def testLegacyInitOp(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_legacy_init_op")
+    export_dir = self._get_export_dir("test_legacy_init_op")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.test_session(graph=ops.Graph()) as sess:
@@ -652,8 +659,8 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(3, ops.get_collection("v")[2].eval())
 
   def testLegacyInitOpWithNonEmptyCollection(self):
-    export_dir = os.path.join(test.get_temp_dir(),
-                              "test_legacy_init_op_with_non_empty_collection")
+    export_dir = self._get_export_dir(
+        "test_legacy_init_op_with_non_empty_collection")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.test_session(graph=ops.Graph()) as sess:
@@ -680,7 +687,7 @@ class SavedModelTest(test.TestCase):
             sess, ["foo"], legacy_init_op=legacy_init_op)
 
   def testMultipleAssets(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_multiple_assets")
+    export_dir = self._get_export_dir("test_multiple_assets")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.test_session(graph=ops.Graph()) as sess:
@@ -722,7 +729,7 @@ class SavedModelTest(test.TestCase):
                                       "asset_file_tensor:0")
 
   def testDuplicateAssets(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_duplicate_assets")
+    export_dir = self._get_export_dir("test_duplicate_assets")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with self.test_session(graph=ops.Graph()) as sess:
@@ -770,7 +777,7 @@ class SavedModelTest(test.TestCase):
                                       "asset_file_tensor:0")
 
   def testOp(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_op")
+    export_dir = self._get_export_dir("test_op")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with session.Session(
@@ -813,7 +820,7 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(3, ops.get_collection("v")[2].eval())
 
   def testCustomSaveable(self):
-    export_dir = os.path.join(test.get_temp_dir(), "custom_saveable")
+    export_dir = self._get_export_dir("custom_saveable")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     with session.Session(
@@ -842,7 +849,7 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(3.0, v1.values().eval())
 
   def testClearDevices(self):
-    export_dir = os.path.join(test.get_temp_dir(), "test_clear_devices")
+    export_dir = self._get_export_dir("test_clear_devices")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
     # Specify a device and save a variable.
@@ -865,6 +872,148 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
+  def testStripDefaultAttrs(self):
+    export_dir = self._get_export_dir("test_strip_default_attrs")
+    builder = saved_model_builder.SavedModelBuilder(export_dir)
+
+    # Add a graph with two float32 variables and a Complex Op composing them
+    # with strip_default_attrs enabled.
+    with session.Session(graph=ops.Graph()) as sess:
+      real_num = variables.Variable(1.0, dtype=dtypes.float32, name="real")
+      imag_num = variables.Variable(2.0, dtype=dtypes.float32, name="imag")
+      math_ops.complex(real_num, imag_num, name="complex")
+      sess.run(variables.global_variables_initializer())
+      builder.add_meta_graph_and_variables(
+          sess, ["foo"], strip_default_attrs=True)
+
+    # Add a graph with the same float32 variables and a Complex Op composing
+    # them with strip_default_attrs disabled.
+    with session.Session(graph=ops.Graph()) as sess:
+      real_num = variables.Variable(1.0, dtype=dtypes.float32, name="real")
+      imag_num = variables.Variable(2.0, dtype=dtypes.float32, name="imag")
+      math_ops.complex(real_num, imag_num, name="complex")
+      sess.run(variables.global_variables_initializer())
+      builder.add_meta_graph(["bar"], strip_default_attrs=False)
+
+    # Save the SavedModel to disk in text format.
+    builder.save(as_text=True)
+
+    # Loading graph "foo" via the loader must restore the defaults for the
+    # "Complex" node based on the "Complex" OpDef in the Op registry.
+    sess = session.Session(graph=ops.Graph())
+    meta_graph_def = loader.load(sess, ["foo"], export_dir)
+    complex_node = test_util.get_node_def_from_graph("complex",
+                                                     meta_graph_def.graph_def)
+    self.assertIn("T", complex_node.attr)
+    self.assertIn("Tout", complex_node.attr)
+
+    # Load graph "foo" from disk as-is to verify default attrs are stripped.
+    # pylint: disable=protected-access
+    saved_model_pb = loader_impl._parse_saved_model(export_dir)
+    self.assertIsNotNone(saved_model_pb)
+    # pylint: enable=protected-access
+
+    meta_graph_foo_def = None
+    meta_graph_bar_def = None
+    for meta_graph_def in saved_model_pb.meta_graphs:
+      if set(meta_graph_def.meta_info_def.tags) == set(["foo"]):
+        meta_graph_foo_def = meta_graph_def
+      elif set(meta_graph_def.meta_info_def.tags) == set(["bar"]):
+        meta_graph_bar_def = meta_graph_def
+
+    self.assertIsNotNone(meta_graph_foo_def)
+    self.assertIsNotNone(meta_graph_bar_def)
+
+    # "Complex" Op has 2 attributes with defaults:
+    #   o "T"    : float32.   (input type)
+    #   o "Tout" : complex64. (output type)
+
+    # "Complex" Op in graph "foo" shouldn't have attributes "T" and "Tout".
+    # Graph "foo" was saved with strip_default_attrs set to True.
+    node_def = test_util.get_node_def_from_graph("complex",
+                                                 meta_graph_foo_def.graph_def)
+    self.assertNotIn("T", node_def.attr)
+    self.assertNotIn("Tout", node_def.attr)
+
+    # "Complex" Op in graph "bar" must have attributes "T" and "Tout".
+    # Graph "bar" was saved with strip_default_attrs set to False.
+    node_def = test_util.get_node_def_from_graph("complex",
+                                                 meta_graph_bar_def.graph_def)
+    self.assertIn("T", node_def.attr)
+    self.assertIn("Tout", node_def.attr)
+
+  # Tests the behavior of loading SavedModels that having missing attrs or attrs
+  # with incorrect types.
+  def testInconsistentConsumerDefaultAttrs(self):
+    export_dir = self._get_export_dir(
+        "test_strip_default_attrs_no_consumer_defaults")
+    builder = saved_model_builder.SavedModelBuilder(export_dir)
+
+    # Add a graph with a single variable and a test op with a defaultless
+    # float32 attr, "test_attr".
+    with session.Session(graph=ops.Graph()) as sess:
+      variables.Variable(1.0, dtype=dtypes.float64, name="var")
+      test_ops.test_attr(T=dtypes.float32, name="test_attr")
+      sess.run(variables.global_variables_initializer())
+      builder.add_meta_graph_and_variables(sess, ["foo"])
+
+    # Save the SavedModel to disk in text format.
+    builder.save(as_text=True)
+
+    # Rewrite the SavedModel to remove the T attr from "test_attr".
+    saved_model_file = os.path.join(
+        export_dir, constants.SAVED_MODEL_FILENAME_PBTXT)
+    with open(saved_model_file) as f:
+      original_saved_model = f.read()
+
+    no_attr_saved_model = original_saved_model.replace("""
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }""", "")
+    with open(saved_model_file, "w") as f:
+      f.write(no_attr_saved_model)
+
+    # Loading the SavedModel via the loader must fail because the SavedModel
+    # does not have any attr values for the "TestAttr" node, and there is no
+    # default specified in the TestAttr OpDef.
+    sess = session.Session(graph=ops.Graph())
+    if ops._USE_C_API:
+      error_message = "NodeDef missing attr 'T' from Op<name=TestAttr"
+    else:
+      error_message = ("Expected one attr with name .*T(out)?.* in name: "
+                       "\"test_attr\".*")
+    with self.assertRaisesRegexp(ValueError, error_message):
+      loader.load(sess, ["foo"], export_dir)
+
+    # Rewrite the SavedModel to change the type of the T attr in "test_attr"
+    bad_type_saved_model = original_saved_model.replace("""
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }""", """
+      attr {
+        key: "T"
+        value {
+          type: DT_DOUBLE
+        }
+      }""")
+    with open(saved_model_file, "w") as f:
+      f.write(bad_type_saved_model)
+
+    # Loading the SavedModel via the loader must fail because there is no
+    # OpKernel registered to handle T = double.
+    sess = session.Session(graph=ops.Graph())
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        ".*No OpKernel was registered to support Op \'TestAttr\' with these "
+        "attrs..*"):
+      loader.load(sess, ["foo"], export_dir)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/saved_model/signature_constants.py b/tensorflow/python/saved_model/signature_constants.py
index 935a124645bde509a1b5a7751a285a85acbe8cab..6461fe8a7e7bef1a2fc787879da9e3324e2655c8 100644
--- a/tensorflow/python/saved_model/signature_constants.py
+++ b/tensorflow/python/saved_model/signature_constants.py
@@ -20,51 +20,79 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
 # Key in the signature def map for `default` serving signatures. The default
 # signature is used in inference requests where a specific signature was not
 # specified.
 DEFAULT_SERVING_SIGNATURE_DEF_KEY = "serving_default"
+tf_export("saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY"
+         ).export_constant(__name__, "DEFAULT_SERVING_SIGNATURE_DEF_KEY")
 
 ################################################################################
 # Classification API constants.
 
 # Classification inputs.
 CLASSIFY_INPUTS = "inputs"
+tf_export("saved_model.signature_constants.CLASSIFY_INPUTS").export_constant(
+    __name__, "CLASSIFY_INPUTS")
 
 # Classification method name used in a SignatureDef.
 CLASSIFY_METHOD_NAME = "tensorflow/serving/classify"
+tf_export(
+    "saved_model.signature_constants.CLASSIFY_METHOD_NAME").export_constant(
+        __name__, "CLASSIFY_METHOD_NAME")
 
 # Classification classes output.
 CLASSIFY_OUTPUT_CLASSES = "classes"
+tf_export(
+    "saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES").export_constant(
+        __name__, "CLASSIFY_OUTPUT_CLASSES")
 
 # Classification scores output.
 CLASSIFY_OUTPUT_SCORES = "scores"
+tf_export(
+    "saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES").export_constant(
+        __name__, "CLASSIFY_OUTPUT_SCORES")
 
 ################################################################################
 # Prediction API constants.
 
 # Predict inputs.
 PREDICT_INPUTS = "inputs"
+tf_export("saved_model.signature_constants.PREDICT_INPUTS").export_constant(
+    __name__, "PREDICT_INPUTS")
 
 # Prediction method name used in a SignatureDef.
 PREDICT_METHOD_NAME = "tensorflow/serving/predict"
+tf_export(
+    "saved_model.signature_constants.PREDICT_METHOD_NAME").export_constant(
+        __name__, "PREDICT_METHOD_NAME")
 
 # Predict outputs.
 PREDICT_OUTPUTS = "outputs"
+tf_export("saved_model.signature_constants.PREDICT_OUTPUTS").export_constant(
+    __name__, "PREDICT_OUTPUTS")
 
 ################################################################################
 # Regression API constants.
 
 # Regression inputs.
 REGRESS_INPUTS = "inputs"
+tf_export("saved_model.signature_constants.REGRESS_INPUTS").export_constant(
+    __name__, "REGRESS_INPUTS")
 
 # Regression method name used in a SignatureDef.
 REGRESS_METHOD_NAME = "tensorflow/serving/regress"
+tf_export(
+    "saved_model.signature_constants.REGRESS_METHOD_NAME").export_constant(
+        __name__, "REGRESS_METHOD_NAME")
 
 # Regression outputs.
 REGRESS_OUTPUTS = "outputs"
+tf_export("saved_model.signature_constants.REGRESS_OUTPUTS").export_constant(
+    __name__, "REGRESS_OUTPUTS")
 
 ################################################################################
 
diff --git a/tensorflow/python/saved_model/signature_def_utils_impl.py b/tensorflow/python/saved_model/signature_def_utils_impl.py
index 240ea61aa5f8553852044f84b61d010bfbca69d1..d0331591889110df86bdb2ac69c037bc3b968f91 100644
--- a/tensorflow/python/saved_model/signature_def_utils_impl.py
+++ b/tensorflow/python/saved_model/signature_def_utils_impl.py
@@ -26,8 +26,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import utils
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('saved_model.signature_def_utils.build_signature_def')
 def build_signature_def(inputs=None, outputs=None, method_name=None):
   """Utility function to build a SignatureDef protocol buffer.
 
@@ -53,6 +55,7 @@ def build_signature_def(inputs=None, outputs=None, method_name=None):
   return signature_def
 
 
+@tf_export('saved_model.signature_def_utils.regression_signature_def')
 def regression_signature_def(examples, predictions):
   """Creates regression signature from given examples and predictions.
 
@@ -94,6 +97,7 @@ def regression_signature_def(examples, predictions):
   return signature_def
 
 
+@tf_export('saved_model.signature_def_utils.classification_signature_def')
 def classification_signature_def(examples, classes, scores):
   """Creates classification signature from given examples and predictions.
 
@@ -146,6 +150,7 @@ def classification_signature_def(examples, classes, scores):
   return signature_def
 
 
+@tf_export('saved_model.signature_def_utils.predict_signature_def')
 def predict_signature_def(inputs, outputs):
   """Creates prediction signature from given inputs and outputs.
 
@@ -180,6 +185,7 @@ def predict_signature_def(inputs, outputs):
   return signature_def
 
 
+@tf_export('saved_model.signature_def_utils.is_valid_signature')
 def is_valid_signature(signature_def):
   """Determine whether a SignatureDef can be served by TensorFlow Serving."""
   if signature_def is None:
diff --git a/tensorflow/contrib/saved_model/python/saved_model/utils.py b/tensorflow/python/saved_model/simple_save.py
similarity index 85%
rename from tensorflow/contrib/saved_model/python/saved_model/utils.py
rename to tensorflow/python/saved_model/simple_save.py
index 9f34af64a6253eecf45351d4e844265b922d9313..042b8fa8e22703d8ffb5e12de3f844d22fb1b1ce 100644
--- a/tensorflow/contrib/saved_model/python/saved_model/utils.py
+++ b/tensorflow/python/saved_model/simple_save.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""SavedModel utility functions."""
+"""SavedModel simple save functionality."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -23,8 +23,10 @@ from tensorflow.python.saved_model import builder
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('saved_model.simple_save')
 def simple_save(session, export_dir, inputs, outputs, legacy_init_op=None):
   """Convenience function to build a SavedModel suitable for serving.
 
@@ -39,18 +41,21 @@ def simple_save(session, export_dir, inputs, outputs, legacy_init_op=None):
       to configure a SavedModel, this method has a few practical implications:
     - It will be treated as a graph for inference / serving (i.e. uses the tag
       `tag_constants.SERVING`)
-    - The saved model will load in TensorFlow Serving and supports the
-      [Predict API](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/predict.proto).
+    - The SavedModel will load in TensorFlow Serving and supports the
+      [Predict
+      API](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/apis/predict.proto).
       To use the Classify, Regress, or MultiInference APIs, please
       use either
       [tf.Estimator](https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator)
       or the lower level
-      [SavedModel APIs](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md).
+      [SavedModel
+      APIs](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md).
     - Some TensorFlow ops depend on information on disk or other information
       called "assets". These are generally handled automatically by adding the
       assets to the `GraphKeys.ASSET_FILEPATHS` collection. Only assets in that
       collection are exported; if you need more custom behavior, you'll need to
-      use the [SavedModelBuilder](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/builder.py).
+      use the
+      [SavedModelBuilder](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/builder.py).
 
   More information about SavedModel and signatures can be found here:
   https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md.
diff --git a/tensorflow/contrib/saved_model/python/saved_model/utils_test.py b/tensorflow/python/saved_model/simple_save_test.py
similarity index 95%
rename from tensorflow/contrib/saved_model/python/saved_model/utils_test.py
rename to tensorflow/python/saved_model/simple_save_test.py
index 36dfb88871f39218ea19c2e6f40675914510e4c4..b2fa40d4f13ff99568cd5a5c8bf39db726e23132 100644
--- a/tensorflow/contrib/saved_model/python/saved_model/utils_test.py
+++ b/tensorflow/python/saved_model/simple_save_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for saved_model utils."""
+"""Tests for SavedModel simple save functionality."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -20,16 +20,16 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.contrib.saved_model.python.saved_model import utils
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import loader
 from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import simple_save
 from tensorflow.python.saved_model import tag_constants
 
 
-class UtilsTest(test.TestCase):
+class SimpleSaveTest(test.TestCase):
 
   def _init_and_validate_variable(self, sess, variable_name, variable_value):
     v = variables.Variable(variable_value, name=variable_name)
@@ -65,7 +65,7 @@ class UtilsTest(test.TestCase):
       var_y = self._init_and_validate_variable(sess, "var_y", 2)
       inputs = {"x": var_x}
       outputs = {"y": var_y}
-      utils.simple_save(sess, export_dir, inputs, outputs)
+      simple_save.simple_save(sess, export_dir, inputs, outputs)
 
     # Restore the graph with a valid tag and check the global variables and
     # signature def map.
diff --git a/tensorflow/python/saved_model/tag_constants.py b/tensorflow/python/saved_model/tag_constants.py
index e2facafda51919d3f1e0ccbe646db522ed0bc49b..d164e2c23f24469d7536f87cb431afe618ddcc06 100644
--- a/tensorflow/python/saved_model/tag_constants.py
+++ b/tensorflow/python/saved_model/tag_constants.py
@@ -20,19 +20,26 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
 # Tag for the `serving` graph.
 SERVING = "serve"
+tf_export("saved_model.tag_constants.SERVING").export_constant(
+    __name__, "SERVING")
 
 # Tag for the `training` graph.
 TRAINING = "train"
+tf_export("saved_model.tag_constants.TRAINING").export_constant(
+    __name__, "TRAINING")
 
 # Tag for the `gpu` graph.
 GPU = "gpu"
+tf_export("saved_model.tag_constants.GPU").export_constant(__name__, "GPU")
 
 # Tag for the `tpu` graph.
 TPU = "tpu"
+tf_export("saved_model.tag_constants.TPU").export_constant(__name__, "TPU")
 
 _allowed_symbols = [
     "SERVING",
diff --git a/tensorflow/python/saved_model/utils_impl.py b/tensorflow/python/saved_model/utils_impl.py
index 73ca8c9c1c6d8fddc8a9c7dbee56682999281c28..cddce29a08a6c4c79a4c7c5dbfb48a86131530b2 100644
--- a/tensorflow/python/saved_model/utils_impl.py
+++ b/tensorflow/python/saved_model/utils_impl.py
@@ -22,11 +22,13 @@ from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.util.tf_export import tf_export
 
 
 # TensorInfo helpers.
 
 
+@tf_export("saved_model.utils.build_tensor_info")
 def build_tensor_info(tensor):
   """Utility function to build TensorInfo proto.
 
@@ -50,6 +52,7 @@ def build_tensor_info(tensor):
   return tensor_info
 
 
+@tf_export("saved_model.utils.get_tensor_from_tensor_info")
 def get_tensor_from_tensor_info(tensor_info, graph=None, import_scope=None):
   """Returns the Tensor or SparseTensor described by a TensorInfo proto.
 
diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py
index 355593eca5dd2f84419035958bfe8eea83e485b8..b80ad79074e85bdeae70148b2822c319c29468bc 100644
--- a/tensorflow/python/summary/summary.py
+++ b/tensorflow/python/summary/summary.py
@@ -72,8 +72,10 @@ from tensorflow.python.summary.writer.writer_cache import FileWriterCache
 
 from tensorflow.python.util import compat as _compat
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('summary.scalar')
 def scalar(name, tensor, collections=None, family=None):
   """Outputs a `Summary` protocol buffer containing a single scalar value.
 
@@ -102,6 +104,7 @@ def scalar(name, tensor, collections=None, family=None):
   return val
 
 
+@tf_export('summary.image')
 def image(name, tensor, max_outputs=3, collections=None, family=None):
   """Outputs a `Summary` protocol buffer with images.
 
@@ -156,6 +159,7 @@ def image(name, tensor, max_outputs=3, collections=None, family=None):
   return val
 
 
+@tf_export('summary.histogram')
 def histogram(name, values, collections=None, family=None):
   # pylint: disable=line-too-long
   """Outputs a `Summary` protocol buffer with a histogram.
@@ -195,6 +199,7 @@ def histogram(name, values, collections=None, family=None):
   return val
 
 
+@tf_export('summary.audio')
 def audio(name, tensor, sample_rate, max_outputs=3, collections=None,
           family=None):
   # pylint: disable=line-too-long
@@ -242,6 +247,7 @@ def audio(name, tensor, sample_rate, max_outputs=3, collections=None,
   return val
 
 
+@tf_export('summary.merge')
 def merge(inputs, collections=None, name=None):
   # pylint: disable=line-too-long
   """Merges summaries.
@@ -286,12 +292,14 @@ def merge(inputs, collections=None, name=None):
   return val
 
 
-def merge_all(key=_ops.GraphKeys.SUMMARIES):
+@tf_export('summary.merge_all')
+def merge_all(key=_ops.GraphKeys.SUMMARIES, scope=None):
   """Merges all summaries collected in the default graph.
 
   Args:
     key: `GraphKey` used to collect the summaries.  Defaults to
       `GraphKeys.SUMMARIES`.
+    scope: Optional scope used to filter the summary ops, using `re.match`
 
   Returns:
     If no summaries were collected, returns None.  Otherwise returns a scalar
@@ -310,13 +318,14 @@ def merge_all(key=_ops.GraphKeys.SUMMARIES):
     raise RuntimeError(
         'Merging tf.summary.* ops is not compatible with eager execution. '
         'Use tf.contrib.summary instead.')
-  summary_ops = _ops.get_collection(key)
+  summary_ops = _ops.get_collection(key, scope=scope)
   if not summary_ops:
     return None
   else:
     return merge(summary_ops)
 
 
+@tf_export('summary.get_summary_description')
 def get_summary_description(node_def):
   """Given a TensorSummary node_def, retrieve its SummaryDescription.
 
diff --git a/tensorflow/python/summary/summary_iterator.py b/tensorflow/python/summary/summary_iterator.py
index 301f560d41378b0ec29537cd82e3e3b333f59674..321b11ffb73487405428340df94010ed8ddbfcd4 100644
--- a/tensorflow/python/summary/summary_iterator.py
+++ b/tensorflow/python/summary/summary_iterator.py
@@ -13,303 +13,18 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Reads Summaries from and writes Summaries to event files."""
+"""Provides a method for reading events from an event file via an iterator."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os.path
-import threading
-import time
-
-import six
-
-from tensorflow.core.framework import graph_pb2
-from tensorflow.core.framework import summary_pb2
 from tensorflow.core.util import event_pb2
-from tensorflow.python import pywrap_tensorflow
-from tensorflow.python.framework import ops
 from tensorflow.python.lib.io import tf_record
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util import compat
-
-
-class SummaryWriter(object):
-  """Writes `Summary` protocol buffers to event files.
-
-  The `SummaryWriter` class provides a mechanism to create an event file in a
-  given directory and add summaries and events to it. The class updates the
-  file contents asynchronously. This allows a training program to call methods
-  to add data to the file directly from the training loop, without slowing down
-  training.
-  """
-
-  def __init__(self, logdir, graph=None, max_queue=10, flush_secs=120,
-               graph_def=None):
-    """Creates a `SummaryWriter` and an event file.
-
-    On construction the summary writer creates a new event file in `logdir`.
-    This event file will contain `Event` protocol buffers constructed when you
-    call one of the following functions: `add_summary()`, `add_session_log()`,
-    `add_event()`, or `add_graph()`.
-
-    If you pass a `Graph` to the constructor it is added to
-    the event file. (This is equivalent to calling `add_graph()` later).
-
-    TensorBoard will pick the graph from the file and display it graphically so
-    you can interactively explore the graph you built. You will usually pass
-    the graph from the session in which you launched it:
-
-    ```python
-    ...create a graph...
-    # Launch the graph in a session.
-    sess = tf.Session()
-    # Create a summary writer, add the 'graph' to the event file.
-    writer = tf.summary.FileWriter(<some-directory>, sess.graph)
-    ```
-
-    The other arguments to the constructor control the asynchronous writes to
-    the event file:
-
-    *  `flush_secs`: How often, in seconds, to flush the added summaries
-       and events to disk.
-    *  `max_queue`: Maximum number of summaries or events pending to be
-       written to disk before one of the 'add' calls block.
-
-    Args:
-      logdir: A string. Directory where event file will be written.
-      graph: A `Graph` object, such as `sess.graph`.
-      max_queue: Integer. Size of the queue for pending events and summaries.
-      flush_secs: Number. How often, in seconds, to flush the
-        pending events and summaries to disk.
-      graph_def: DEPRECATED: Use the `graph` argument instead.
-    """
-    self._logdir = logdir
-    if not gfile.IsDirectory(self._logdir):
-      gfile.MakeDirs(self._logdir)
-    self._event_queue = six.moves.queue.Queue(max_queue)
-    self._ev_writer = pywrap_tensorflow.EventsWriter(
-        compat.as_bytes(os.path.join(self._logdir, "events")))
-    self._closed = False
-    self._worker = _EventLoggerThread(self._event_queue, self._ev_writer,
-                                      flush_secs)
-    # For storing used tags for session.run() outputs.
-    self._session_run_tags = {}
-    self._worker.start()
-    if graph is not None or graph_def is not None:
-      # Calling it with both graph and graph_def for backward compatibility.
-      self.add_graph(graph=graph, graph_def=graph_def)
-
-  def get_logdir(self):
-    """Returns the directory where event file will be written."""
-    return self._logdir
-
-  def reopen(self):
-    """Reopens the summary writer.
-
-    Can be called after `close()` to add more events in the same directory.
-    The events will go into a new events file.
-
-    Does nothing if the summary writer was not closed.
-    """
-    if self._closed:
-      self._closed = False
-
-  def add_summary(self, summary, global_step=None):
-    """Adds a `Summary` protocol buffer to the event file.
-
-    This method wraps the provided summary in an `Event` protocol buffer
-    and adds it to the event file.
-
-    You can pass the result of evaluating any summary op, using
-    @{tf.Session.run} or
-    @{tf.Tensor.eval}, to this
-    function. Alternatively, you can pass a `tf.Summary` protocol
-    buffer that you populate with your own data. The latter is
-    commonly done to report evaluation results in event files.
-
-    Args:
-      summary: A `Summary` protocol buffer, optionally serialized as a string.
-      global_step: Number. Optional global step value to record with the
-        summary.
-    """
-    if isinstance(summary, bytes):
-      summ = summary_pb2.Summary()
-      summ.ParseFromString(summary)
-      summary = summ
-    event = event_pb2.Event(wall_time=time.time(), summary=summary)
-    if global_step is not None:
-      event.step = int(global_step)
-    self.add_event(event)
-
-  def add_session_log(self, session_log, global_step=None):
-    """Adds a `SessionLog` protocol buffer to the event file.
-
-    This method wraps the provided session in an `Event` protocol buffer
-    and adds it to the event file.
-
-    Args:
-      session_log: A `SessionLog` protocol buffer.
-      global_step: Number. Optional global step value to record with the
-        summary.
-    """
-    event = event_pb2.Event(wall_time=time.time(), session_log=session_log)
-    if global_step is not None:
-      event.step = int(global_step)
-    self.add_event(event)
-
-  def add_event(self, event):
-    """Adds an event to the event file.
-
-    Args:
-      event: An `Event` protocol buffer.
-    """
-    if not self._closed:
-      self._event_queue.put(event)
-
-  def _add_graph_def(self, graph_def, global_step=None):
-    graph_bytes = graph_def.SerializeToString()
-    event = event_pb2.Event(wall_time=time.time(), graph_def=graph_bytes)
-    if global_step is not None:
-      event.step = int(global_step)
-    self._event_queue.put(event)
-
-  def add_graph(self, graph, global_step=None, graph_def=None):
-    """Adds a `Graph` to the event file.
-
-    The graph described by the protocol buffer will be displayed by
-    TensorBoard. Most users pass a graph in the constructor instead.
-
-    Args:
-      graph: A `Graph` object, such as `sess.graph`.
-      global_step: Number. Optional global step counter to record with the
-        graph.
-      graph_def: DEPRECATED. Use the `graph` parameter instead.
-
-    Raises:
-      ValueError: If both graph and graph_def are passed to the method.
-    """
-
-    if graph is not None and graph_def is not None:
-      raise ValueError("Please pass only graph, or graph_def (deprecated), "
-                       "but not both.")
-
-    if isinstance(graph, ops.Graph) or isinstance(graph_def, ops.Graph):
-      # The user passed a `Graph`.
-
-      # Check if the user passed it via the graph or the graph_def argument and
-      # correct for that.
-      if not isinstance(graph, ops.Graph):
-        logging.warning("When passing a `Graph` object, please use the `graph`"
-                        " named argument instead of `graph_def`.")
-        graph = graph_def
-
-      # Serialize the graph with additional info.
-      true_graph_def = graph.as_graph_def(add_shapes=True)
-    elif (isinstance(graph, graph_pb2.GraphDef)
-          or isinstance(graph_def, graph_pb2.GraphDef)):
-      # The user passed a `GraphDef`.
-      logging.warning("Passing a `GraphDef` to the SummaryWriter is deprecated."
-                      " Pass a `Graph` object instead, such as `sess.graph`.")
-
-      # Check if the user passed it via the graph or the graph_def argument and
-      # correct for that.
-      if isinstance(graph, graph_pb2.GraphDef):
-        true_graph_def = graph
-      else:
-        true_graph_def = graph_def
-
-    else:
-      # The user passed neither `Graph`, nor `GraphDef`.
-      raise TypeError("The passed graph must be an instance of `Graph` "
-                      "or the deprecated `GraphDef`")
-    # Finally, add the graph_def to the summary writer.
-    self._add_graph_def(true_graph_def, global_step)
-
-  def add_run_metadata(self, run_metadata, tag, global_step=None):
-    """Adds a metadata information for a single session.run() call.
-
-    Args:
-      run_metadata: A `RunMetadata` protobuf object.
-      tag: The tag name for this metadata.
-      global_step: Number. Optional global step counter to record with the
-        StepStats.
-
-    Raises:
-      ValueError: If the provided tag was already used for this type of event.
-    """
-    if tag in self._session_run_tags:
-      raise ValueError("The provided tag was already used for this event type")
-    self._session_run_tags[tag] = True
-
-    tagged_metadata = event_pb2.TaggedRunMetadata()
-    tagged_metadata.tag = tag
-    # Store the `RunMetadata` object as bytes in order to have postponed
-    # (lazy) deserialization when used later.
-    tagged_metadata.run_metadata = run_metadata.SerializeToString()
-    event = event_pb2.Event(wall_time=time.time(),
-                            tagged_run_metadata=tagged_metadata)
-    if global_step is not None:
-      event.step = int(global_step)
-    self._event_queue.put(event)
-
-  def flush(self):
-    """Flushes the event file to disk.
-
-    Call this method to make sure that all pending events have been written to
-    disk.
-    """
-    self._event_queue.join()
-    self._ev_writer.Flush()
-
-  def close(self):
-    """Flushes the event file to disk and close the file.
-
-    Call this method when you do not need the summary writer anymore.
-    """
-    self.flush()
-    self._ev_writer.Close()
-    self._closed = True
-
-
-class _EventLoggerThread(threading.Thread):
-  """Thread that logs events."""
-
-  def __init__(self, queue, ev_writer, flush_secs):
-    """Creates an _EventLoggerThread.
-
-    Args:
-      queue: A Queue from which to dequeue events.
-      ev_writer: An event writer. Used to log brain events for
-       the visualizer.
-      flush_secs: How often, in seconds, to flush the
-        pending file to disk.
-    """
-    threading.Thread.__init__(self)
-    self.daemon = True
-    self._queue = queue
-    self._ev_writer = ev_writer
-    self._flush_secs = flush_secs
-    # The first event will be flushed immediately.
-    self._next_event_flush_time = 0
-
-  def run(self):
-    while True:
-      event = self._queue.get()
-      try:
-        self._ev_writer.WriteEvent(event)
-        # Flush the event writer every so often.
-        now = time.time()
-        if now > self._next_event_flush_time:
-          self._ev_writer.Flush()
-          # Do it again in two minutes.
-          self._next_event_flush_time = now + self._flush_secs
-      finally:
-        self._queue.task_done()
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('train.summary_iterator')
 def summary_iterator(path):
   # pylint: disable=line-too-long
   """An iterator for reading `Event` protocol buffers from an event file.
@@ -352,37 +67,3 @@ def summary_iterator(path):
   # pylint: enable=line-too-long
   for r in tf_record.tf_record_iterator(path):
     yield event_pb2.Event.FromString(r)
-
-
-class SummaryWriterCache(object):
-  """Cache for summary writers.
-
-  This class caches summary writers, one per directory.
-  """
-  # Cache, keyed by directory.
-  _cache = {}
-
-  # Lock protecting _SUMMARY_WRITERS.
-  _lock = threading.RLock()
-
-  @staticmethod
-  def clear():
-    """Clear cached summary writers. Currently only used for unit tests."""
-    with SummaryWriterCache._lock:
-      SummaryWriterCache._cache = {}
-
-  @staticmethod
-  def get(logdir):
-    """Returns the SummaryWriter for the specified directory.
-
-    Args:
-      logdir: str, name of the directory.
-
-    Returns:
-      A `SummaryWriter`.
-    """
-    with SummaryWriterCache._lock:
-      if logdir not in SummaryWriterCache._cache:
-        SummaryWriterCache._cache[logdir] = SummaryWriter(
-            logdir, graph=ops.get_default_graph())
-      return SummaryWriterCache._cache[logdir]
diff --git a/tensorflow/python/summary/text_summary.py b/tensorflow/python/summary/text_summary.py
index 94a85d73e2f77388f9a29b1c135fc6046a8362d0..6418c847f3c819cf2491bb449921d15c39eae288 100644
--- a/tensorflow/python/summary/text_summary.py
+++ b/tensorflow/python/summary/text_summary.py
@@ -26,10 +26,12 @@ from __future__ import print_function
 from tensorflow.core.framework import summary_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops.summary_ops import tensor_summary
+from tensorflow.python.util.tf_export import tf_export
 
 PLUGIN_NAME = "text"
 
 
+@tf_export("summary.text")
 def text_summary(name, tensor, collections=None):
   """Summarizes textual data.
 
diff --git a/tensorflow/python/summary/writer/writer.py b/tensorflow/python/summary/writer/writer.py
index 12f120116f4439059f42c7212469ee835cc13ef4..1f3f2287043c021d636113b5a8807c9f4adf77aa 100644
--- a/tensorflow/python/summary/writer/writer.py
+++ b/tensorflow/python/summary/writer/writer.py
@@ -32,6 +32,7 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import plugin_asset
 from tensorflow.python.summary.writer.event_file_writer import EventFileWriter
+from tensorflow.python.util.tf_export import tf_export
 
 _PLUGINS_DIR = "plugins"
 
@@ -276,6 +277,7 @@ class SummaryToEventTransformer(object):
     self.event_writer.add_event(event)
 
 
+@tf_export("summary.FileWriter")
 class FileWriter(SummaryToEventTransformer):
   """Writes `Summary` protocol buffers to event files.
 
diff --git a/tensorflow/python/summary/writer/writer_cache.py b/tensorflow/python/summary/writer/writer_cache.py
index bad289303c0fd0de7836b03a6762d04505521a89..645fa28a37fb125b6b1224961251bc8879d5fe6d 100644
--- a/tensorflow/python/summary/writer/writer_cache.py
+++ b/tensorflow/python/summary/writer/writer_cache.py
@@ -22,8 +22,10 @@ import threading
 
 from tensorflow.python.framework import ops
 from tensorflow.python.summary.writer.writer import FileWriter
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('summary.FileWriterCache')
 class FileWriterCache(object):
   """Cache for file writers.
 
diff --git a/tensorflow/python/tensorflow.i b/tensorflow/python/tensorflow.i
index d221dd523b2835d51e61487c22caee961ec28e5f..82b908ac0e95643d1daf5ed062be44a58cfea97f 100644
--- a/tensorflow/python/tensorflow.i
+++ b/tensorflow/python/tensorflow.i
@@ -33,6 +33,8 @@ limitations under the License.
 %include "tensorflow/python/client/tf_session.i"
 %include "tensorflow/python/client/device_lib.i"
 
+%include "tensorflow/python/lib/core/bfloat16.i"
+
 %include "tensorflow/python/lib/io/file_io.i"
 %include "tensorflow/python/training/quantize_training.i"
 %include "tensorflow/python/training/server_lib.i"
@@ -40,6 +42,7 @@ limitations under the License.
 %include "tensorflow/python/framework/python_op_gen.i"
 
 %include "tensorflow/python/framework/cpp_shape_inference.i"
+%include "tensorflow/python/platform/stacktrace_handler.i"
 %include "tensorflow/python/util/kernel_registry.i"
 
 %include "tensorflow/python/util/transform_graph.i"
diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD
index 69586c6a47762701344aafe449e96868875f8926..63f16c53a29fd65c32077dd29e3b1823c11d457b 100644
--- a/tensorflow/python/tools/BUILD
+++ b/tensorflow/python/tools/BUILD
@@ -251,6 +251,7 @@ py_test(
     tags = ["manual"],
     deps = [
         ":saved_model_cli",
+        "//tensorflow/core:protos_all_py",
     ],
 )
 
diff --git a/tensorflow/python/tools/freeze_graph.py b/tensorflow/python/tools/freeze_graph.py
index 62c6d8e35b24e2f93fa21509e0dbaabe95e838b1..5d82aa6f478d00e37d8f89e10f9fb36266b8cde2 100644
--- a/tensorflow/python/tools/freeze_graph.py
+++ b/tensorflow/python/tools/freeze_graph.py
@@ -72,7 +72,8 @@ def freeze_graph_with_def_protos(input_graph_def,
                                  variable_names_blacklist="",
                                  input_meta_graph_def=None,
                                  input_saved_model_dir=None,
-                                 saved_model_tags=None):
+                                 saved_model_tags=None,
+                                 checkpoint_version=saver_pb2.SaverDef.V2):
   """Converts all variables in a graph and checkpoint into constants."""
   del restore_op_name, filename_tensor_name  # Unused by updated loading code.
 
@@ -100,7 +101,8 @@ def freeze_graph_with_def_protos(input_graph_def,
     _ = importer.import_graph_def(input_graph_def, name="")
   with session.Session() as sess:
     if input_saver_def:
-      saver = saver_lib.Saver(saver_def=input_saver_def)
+      saver = saver_lib.Saver(
+          saver_def=input_saver_def, write_version=checkpoint_version)
       saver.restore(sess, input_checkpoint)
     elif input_meta_graph_def:
       restorer = saver_lib.import_meta_graph(
@@ -124,7 +126,8 @@ def freeze_graph_with_def_protos(input_graph_def,
           # 'global_step' or a similar housekeeping element) so skip it.
           continue
         var_list[key] = tensor
-      saver = saver_lib.Saver(var_list=var_list)
+      saver = saver_lib.Saver(
+          var_list=var_list, write_version=checkpoint_version)
       saver.restore(sess, input_checkpoint)
       if initializer_nodes:
         sess.run(initializer_nodes.replace(' ','').split(","))
@@ -217,7 +220,8 @@ def freeze_graph(input_graph,
                  variable_names_blacklist="",
                  input_meta_graph=None,
                  input_saved_model_dir=None,
-                 saved_model_tags=tag_constants.SERVING):
+                 saved_model_tags=tag_constants.SERVING,
+                 checkpoint_version=saver_pb2.SaverDef.V2):
   """Converts all variables in a graph and checkpoint into constants."""
   input_graph_def = None
   if input_saved_model_dir:
@@ -233,20 +237,39 @@ def freeze_graph(input_graph,
   if input_saver:
     input_saver_def = _parse_input_saver_proto(input_saver, input_binary)
   freeze_graph_with_def_protos(
-      input_graph_def, input_saver_def, input_checkpoint, output_node_names,
-      restore_op_name, filename_tensor_name, output_graph, clear_devices,
-      initializer_nodes, variable_names_whitelist, variable_names_blacklist,
-      input_meta_graph_def, input_saved_model_dir, saved_model_tags.replace(' ','').split(","))
+      input_graph_def,
+      input_saver_def,
+      input_checkpoint,
+      output_node_names,
+      restore_op_name,
+      filename_tensor_name,
+      output_graph,
+      clear_devices,
+      initializer_nodes,
+      variable_names_whitelist,
+      variable_names_blacklist,
+      input_meta_graph_def,
+      input_saved_model_dir,
+      saved_model_tags.replace(' ','').split(","),
+      checkpoint_version=checkpoint_version)
 
 
 def main(unused_args):
+  if FLAGS.checkpoint_version == 1:
+    checkpoint_version = saver_pb2.SaverDef.V1
+  elif FLAGS.checkpoint_version == 2:
+    checkpoint_version = saver_pb2.SaverDef.V2
+  else:
+    print("Invalid checkpoint version (must be '1' or '2'): %d" %
+          FLAGS.checkpoint_version)
+    return -1
   freeze_graph(FLAGS.input_graph, FLAGS.input_saver, FLAGS.input_binary,
                FLAGS.input_checkpoint, FLAGS.output_node_names,
                FLAGS.restore_op_name, FLAGS.filename_tensor_name,
                FLAGS.output_graph, FLAGS.clear_devices, FLAGS.initializer_nodes,
                FLAGS.variable_names_whitelist, FLAGS.variable_names_blacklist,
                FLAGS.input_meta_graph, FLAGS.input_saved_model_dir,
-               FLAGS.saved_model_tags)
+               FLAGS.saved_model_tags, checkpoint_version)
 
 
 if __name__ == "__main__":
@@ -267,6 +290,11 @@ if __name__ == "__main__":
       type=str,
       default="",
       help="TensorFlow variables file to load.")
+  parser.add_argument(
+      "--checkpoint_version",
+      type=int,
+      default=2,
+      help="Tensorflow variable file format")
   parser.add_argument(
       "--output_graph",
       type=str,
diff --git a/tensorflow/python/tools/freeze_graph_test.py b/tensorflow/python/tools/freeze_graph_test.py
index feeed7102cd49a79d0280cc04431de00ad3286d5..91f0061ebccaebbdbb09f283d9d52d813459f493 100644
--- a/tensorflow/python/tools/freeze_graph_test.py
+++ b/tensorflow/python/tools/freeze_graph_test.py
@@ -84,9 +84,19 @@ class FreezeGraphTest(test_util.TensorFlowTestCase):
     input_meta_graph = checkpoint_meta_graph_file
 
     freeze_graph.freeze_graph(
-        input_graph_path, input_saver_def_path, input_binary, checkpoint_path,
-        output_node_names, restore_op_name, filename_tensor_name,
-        output_graph_path, clear_devices, "", "", input_meta_graph)
+        input_graph_path,
+        input_saver_def_path,
+        input_binary,
+        checkpoint_path,
+        output_node_names,
+        restore_op_name,
+        filename_tensor_name,
+        output_graph_path,
+        clear_devices,
+        "",
+        "",
+        input_meta_graph,
+        checkpoint_version=saver_write_version)
 
     # Now we make sure the variable is now a constant, and that the graph still
     # produces the expected result.
diff --git a/tensorflow/python/tools/inspect_checkpoint.py b/tensorflow/python/tools/inspect_checkpoint.py
index 8716058e619d8e970834ec4d57e4d8ff21559d5c..dd876cbe7fcd64a8de70eb28f67996df9de1dd7d 100644
--- a/tensorflow/python/tools/inspect_checkpoint.py
+++ b/tensorflow/python/tools/inspect_checkpoint.py
@@ -97,8 +97,9 @@ def parse_numpy_printoption(kv_str):
     raise argparse.ArgumentTypeError(
         "Setting '%s' from the command line is not supported." % k)
   try:
-    v = (v_type(v_str) if v_type is not bool
-         else flags.BooleanParser().parse(v_str))
+    v = (
+        v_type(v_str)
+        if v_type is not bool else flags.BooleanParser().parse(v_str))
   except ValueError as e:
     raise argparse.ArgumentTypeError(e.message)
   np.set_printoptions(**{k: v})
@@ -121,9 +122,12 @@ if __name__ == "__main__":
   parser = argparse.ArgumentParser()
   parser.register("type", "bool", lambda v: v.lower() == "true")
   parser.add_argument(
-      "--file_name", type=str, default="", help="Checkpoint filename. "
-                    "Note, if using Checkpoint V2 format, file_name is the "
-                    "shared prefix between all files in the checkpoint.")
+      "--file_name",
+      type=str,
+      default="",
+      help="Checkpoint filename. "
+      "Note, if using Checkpoint V2 format, file_name is the "
+      "shared prefix between all files in the checkpoint.")
   parser.add_argument(
       "--tensor_name",
       type=str,
diff --git a/tensorflow/python/tools/optimize_for_inference_lib.py b/tensorflow/python/tools/optimize_for_inference_lib.py
index c2687bf557b03ff588fd369771077c92ba012a15..9c1927122252f45ddfa8092045c7589fa0f45532 100644
--- a/tensorflow/python/tools/optimize_for_inference_lib.py
+++ b/tensorflow/python/tools/optimize_for_inference_lib.py
@@ -349,6 +349,7 @@ def fold_batch_norms(input_graph_def):
     bias_add_op.op = "BiasAdd"
     bias_add_op.name = node.name
     bias_add_op.attr["T"].CopyFrom(conv_op.attr["T"])
+    bias_add_op.attr["data_format"].CopyFrom(conv_op.attr["data_format"])
     bias_add_op.input.extend([new_conv_op.name, offset_op.name])
     new_ops.extend([scaled_weights_op, new_conv_op, offset_op, bias_add_op])
 
diff --git a/tensorflow/python/tools/optimize_for_inference_test.py b/tensorflow/python/tools/optimize_for_inference_test.py
index 6dd24c0dca1d326592e4f33eba4e6233248dac5f..084a4500f8e1eb7f75e1e01668fae655b5e06763 100644
--- a/tensorflow/python/tools/optimize_for_inference_test.py
+++ b/tensorflow/python/tools/optimize_for_inference_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import image_ops
@@ -38,6 +39,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.tools import optimize_for_inference_lib
 
 
+@test_util.with_c_api
 class OptimizeForInferenceTest(test.TestCase):
 
   def create_node_def(self, op, name, inputs):
@@ -145,7 +147,7 @@ class OptimizeForInferenceTest(test.TestCase):
           np.array([0.1, 0.6]), shape=[2], dtype=dtypes.float32)
       gamma_op = constant_op.constant(
           np.array([1.0, 2.0]), shape=[2], dtype=dtypes.float32)
-      ops.get_default_graph().graph_def_versions.producer = 8
+      test_util.set_producer_version(ops.get_default_graph(), 8)
       gen_nn_ops._batch_norm_with_global_normalization(
           conv_op,
           mean_op,
@@ -171,48 +173,56 @@ class OptimizeForInferenceTest(test.TestCase):
       self.assertNotEqual("BatchNormWithGlobalNormalization", node.op)
 
   def testFoldFusedBatchNorms(self):
-    with self.test_session() as sess:
-      inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
-      input_op = constant_op.constant(
-          np.array(inputs), shape=[1, 1, 6, 2], dtype=dtypes.float32)
-      weights = [1, 2, 3, 4, 0.1, 0.2, 0.3, 0.4]
-      weights_op = constant_op.constant(
-          np.array(weights), shape=[1, 2, 2, 2], dtype=dtypes.float32)
-      conv_op = nn_ops.conv2d(
-          input_op, weights_op, [1, 1, 1, 1], padding="SAME", name="conv_op")
-      mean_op = constant_op.constant(
-          np.array([10, 20]), shape=[2], dtype=dtypes.float32)
-      variance_op = constant_op.constant(
-          np.array([0.25, 0.5]), shape=[2], dtype=dtypes.float32)
-      beta_op = constant_op.constant(
-          np.array([0.1, 0.6]), shape=[2], dtype=dtypes.float32)
-      gamma_op = constant_op.constant(
-          np.array([1.0, 2.0]), shape=[2], dtype=dtypes.float32)
-      ops.get_default_graph().graph_def_versions.producer = 9
-      gen_nn_ops._fused_batch_norm(
-          conv_op,
-          gamma_op,
-          beta_op,
-          mean_op,
-          variance_op,
-          0.00001,
-          is_training=False,
-          name="output")
-      original_graph_def = sess.graph_def
-      original_result = sess.run(["output:0"])
-    optimized_graph_def = optimize_for_inference_lib.fold_batch_norms(
-        original_graph_def)
-
-    with self.test_session() as sess:
-      _ = importer.import_graph_def(
-          optimized_graph_def, input_map={}, name="optimized")
-      optimized_result = sess.run(["optimized/output:0"])
-
-    self.assertAllClose(
-        original_result, optimized_result, rtol=1e-04, atol=1e-06)
-
-    for node in optimized_graph_def.node:
-      self.assertNotEqual("FusedBatchNorm", node.op)
+    for data_format, use_gpu in [("NHWC", False), ("NCHW", True)]:
+      with self.test_session(use_gpu=use_gpu) as sess:
+        inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
+        input_op = constant_op.constant(
+            np.array(inputs),
+            shape=[1, 1, 6, 2] if data_format == "NHWC" else [1, 2, 1, 6],
+            dtype=dtypes.float32)
+        weights = [1, 2, 3, 4, 0.1, 0.2, 0.3, 0.4]
+        weights_op = constant_op.constant(
+            np.array(weights), shape=[1, 2, 2, 2], dtype=dtypes.float32)
+        conv_op = nn_ops.conv2d(
+            input_op,
+            weights_op, [1, 1, 1, 1],
+            padding="SAME",
+            data_format=data_format,
+            name="conv_op")
+        mean_op = constant_op.constant(
+            np.array([10, 20]), shape=[2], dtype=dtypes.float32)
+        variance_op = constant_op.constant(
+            np.array([0.25, 0.5]), shape=[2], dtype=dtypes.float32)
+        beta_op = constant_op.constant(
+            np.array([0.1, 0.6]), shape=[2], dtype=dtypes.float32)
+        gamma_op = constant_op.constant(
+            np.array([1.0, 2.0]), shape=[2], dtype=dtypes.float32)
+        ops.get_default_graph().graph_def_versions.producer = 9
+        gen_nn_ops._fused_batch_norm(
+            conv_op,
+            gamma_op,
+            beta_op,
+            mean_op,
+            variance_op,
+            0.00001,
+            is_training=False,
+            data_format=data_format,
+            name="output")
+        original_graph_def = sess.graph_def
+        original_result = sess.run(["output:0"])
+      optimized_graph_def = optimize_for_inference_lib.fold_batch_norms(
+          original_graph_def)
+
+      with self.test_session(use_gpu=use_gpu) as sess:
+        _ = importer.import_graph_def(
+            optimized_graph_def, input_map={}, name="optimized")
+        optimized_result = sess.run(["optimized/output:0"])
+
+      self.assertAllClose(
+          original_result, optimized_result, rtol=1e-04, atol=1e-06)
+
+      for node in optimized_graph_def.node:
+        self.assertNotEqual("FusedBatchNorm", node.op)
 
   def testFuseResizePadAndConv(self):
     with self.test_session() as sess:
diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py
index cff2c186e382b5195b3ed35fa5966a44d42eb64f..33f6debbcbecb652774c776be54323bbaa824822 100644
--- a/tensorflow/python/tools/saved_model_cli.py
+++ b/tensorflow/python/tools/saved_model_cli.py
@@ -31,13 +31,15 @@ import warnings
 
 import numpy as np
 
+from six import integer_types
 from tensorflow.contrib.saved_model.python.saved_model import reader
 from tensorflow.contrib.saved_model.python.saved_model import signature_def_utils
+from tensorflow.core.example import example_pb2
 from tensorflow.core.framework import types_pb2
 from tensorflow.python.client import session
 from tensorflow.python.debug.wrappers import local_cli_wrapper
 from tensorflow.python.framework import ops as ops_lib
-from tensorflow.python.platform import app
+from tensorflow.python.platform import app  # pylint: disable=unused-import
 from tensorflow.python.saved_model import loader
 from tensorflow.python.tools import saved_model_utils
 
@@ -152,7 +154,9 @@ def _print_tensor_info(tensor_info):
   Args:
     tensor_info: TensorInfo object to be printed.
   """
-  print('    dtype: ' + types_pb2.DataType.keys()[tensor_info.dtype])
+  print('    dtype: ' +
+        {value: key
+         for (key, value) in types_pb2.DataType.items()}[tensor_info.dtype])
   # Display shape as tuple.
   if tensor_info.tensor_shape.unknown_rank:
     shape = 'unknown_rank'
@@ -375,7 +379,7 @@ def preprocess_input_exprs_arg_string(input_exprs_str):
         'input_key=<python expression>'
 
   Returns:
-    A dictionary that maps input keys to python expressions.
+    A dictionary that maps input keys to their values.
 
   Raises:
     RuntimeError: An error when the given input string is in a bad format.
@@ -386,17 +390,75 @@ def preprocess_input_exprs_arg_string(input_exprs_str):
     if '=' not in input_exprs_str:
       raise RuntimeError('--input_exprs "%s" format is incorrect. Please follow'
                          '"<input_key>=<python expression>"' % input_exprs_str)
-    input_key, expr = input_raw.split('=')
-    input_dict[input_key] = expr
+    input_key, expr = input_raw.split('=', 1)
+    # ast.literal_eval does not work with numpy expressions
+    input_dict[input_key] = eval(expr)  # pylint: disable=eval-used
+  return input_dict
+
 
+def preprocess_input_examples_arg_string(input_examples_str):
+  """Parses input into dict that maps input keys to lists of tf.Example.
+
+  Parses input string in the format of 'input_key1=[{feature_name:
+  feature_list}];input_key2=[{feature_name:feature_list}];' into a dictionary
+  that maps each input_key to its list of serialized tf.Example.
+
+  Args:
+    input_examples_str: A string that specifies a list of dictionaries of
+    feature_names and their feature_lists for each input.
+    Each input is separated by semicolon. For each input key:
+      'input=[{feature_name1: feature_list1, feature_name2:feature_list2}]'
+      items in feature_list can be the type of float, int, long or str.
+
+  Returns:
+    A dictionary that maps input keys to lists of serialized tf.Example.
+
+  Raises:
+    ValueError: An error when the given tf.Example is not a list.
+  """
+  input_dict = preprocess_input_exprs_arg_string(input_examples_str)
+  for input_key, example_list in input_dict.items():
+    if not isinstance(example_list, list):
+      raise ValueError(
+          'tf.Example input must be a list of dictionaries, but "%s" is %s' %
+          (example_list, type(example_list)))
+    input_dict[input_key] = [
+        _create_example_string(example) for example in example_list
+    ]
   return input_dict
 
 
-def load_inputs_from_input_arg_string(inputs_str, input_exprs_str):
+def _create_example_string(example_dict):
+  """Create a serialized tf.example from feature dictionary."""
+  example = example_pb2.Example()
+  for feature_name, feature_list in example_dict.items():
+    if not isinstance(feature_list, list):
+      raise ValueError('feature value must be a list, but %s: "%s" is %s' %
+                       (feature_name, feature_list, type(feature_list)))
+    if isinstance(feature_list[0], float):
+      example.features.feature[feature_name].float_list.value.extend(
+          feature_list)
+    elif isinstance(feature_list[0], str):
+      example.features.feature[feature_name].bytes_list.value.extend(
+          feature_list)
+    elif isinstance(feature_list[0], integer_types):
+      example.features.feature[feature_name].int64_list.value.extend(
+          feature_list)
+    else:
+      raise ValueError(
+          'Type %s for value %s is not supported for tf.train.Feature.' %
+          (type(feature_list[0]), feature_list[0]))
+  return example.SerializeToString()
+
+
+def load_inputs_from_input_arg_string(inputs_str, input_exprs_str,
+                                      input_examples_str):
   """Parses input arg strings and create inputs feed_dict.
 
   Parses '--inputs' string for inputs to be loaded from file, and parses
   '--input_exprs' string for inputs to be evaluated from python expression.
+  '--input_examples' string for inputs to be created from tf.example feature
+  dictionary list.
 
   Args:
     inputs_str: A string that specified where to load inputs. Each input is
@@ -422,9 +484,11 @@ def load_inputs_from_input_arg_string(inputs_str, input_exprs_str):
         to the specified input tensor, else SavedModel CLI will assume a
         dictionary is stored in the pickle file and the value corresponding to
         the variable_name will be used.
-    input_exprs_str: A string that specified python expressions for inputs.
+    input_exprs_str: A string that specifies python expressions for inputs.
         * In the format of: '<input_key>=<python expression>'.
         * numpy module is available as np.
+    input_examples_str: A string that specifies tf.Example with dictionary.
+        * In the format of: '<input_key>=<[{feature:value list}]>'
 
   Returns:
     A dictionary that maps input tensor keys to numpy ndarrays.
@@ -439,6 +503,7 @@ def load_inputs_from_input_arg_string(inputs_str, input_exprs_str):
 
   inputs = preprocess_inputs_arg_string(inputs_str)
   input_exprs = preprocess_input_exprs_arg_string(input_exprs_str)
+  input_examples = preprocess_input_examples_arg_string(input_examples_str)
 
   for input_tensor_key, (filename, variable_name) in inputs.items():
     data = np.load(filename)
@@ -472,15 +537,20 @@ def load_inputs_from_input_arg_string(inputs_str, input_exprs_str):
         tensor_key_feed_dict[input_tensor_key] = data
 
   # When input is a python expression:
-  for input_tensor_key, py_expr in input_exprs.items():
+  for input_tensor_key, py_expr_evaluated in input_exprs.items():
     if input_tensor_key in tensor_key_feed_dict:
       warnings.warn(
           'input_key %s has been specified with both --inputs and --input_exprs'
           ' options. Value in --input_exprs will be used.' % input_tensor_key)
+    tensor_key_feed_dict[input_tensor_key] = py_expr_evaluated
 
-    # ast.literal_eval does not work with numpy expressions
-    tensor_key_feed_dict[input_tensor_key] = eval(py_expr)  # pylint: disable=eval-used
-
+  # When input is a tf.Example:
+  for input_tensor_key, example in input_examples.items():
+    if input_tensor_key in tensor_key_feed_dict:
+      warnings.warn(
+          'input_key %s has been specified in multiple options. Value in '
+          '--input_examples will be used.' % input_tensor_key)
+    tensor_key_feed_dict[input_tensor_key] = example
   return tensor_key_feed_dict
 
 
@@ -516,11 +586,12 @@ def run(args):
     AttributeError: An error when neither --inputs nor --input_exprs is passed
     to run command.
   """
-  if not args.inputs and not args.input_exprs:
+  if not args.inputs and not args.input_exprs and not args.input_examples:
     raise AttributeError(
-        'At least one of --inputs and --input_exprs must be required')
+        'At least one of --inputs, --input_exprs or --input_examples must be '
+        'required')
   tensor_key_feed_dict = load_inputs_from_input_arg_string(
-      args.inputs, args.input_exprs)
+      args.inputs, args.input_exprs, args.input_examples)
   run_saved_model_with_feed_dict(args.dir, args.tag_set, args.signature_def,
                                  tensor_key_feed_dict, args.outdir,
                                  args.overwrite, tf_debug=args.tf_debug)
@@ -553,7 +624,7 @@ def create_parser():
       'To show all inputs and outputs TensorInfo for a specific'
       ' SignatureDef specified by the SignatureDef key in a'
       ' MetaGraph.\n'
-      '$saved_model_cli show --dir /tmp/saved_model --tag_set serve'
+      '$saved_model_cli show --dir /tmp/saved_model --tag_set serve '
       '--signature_def serving_default\n\n'
       'To show all available information in the SavedModel\n:'
       '$saved_model_cli show --dir /tmp/saved_model --all')
@@ -587,10 +658,12 @@ def create_parser():
   run_msg = ('Usage example:\n'
              'To run input tensors from files through a MetaGraphDef and save'
              ' the output tensors to files:\n'
-             '$saved_model_cli show --dir /tmp/saved_model --tag_set serve'
+             '$saved_model_cli show --dir /tmp/saved_model --tag_set serve '
              '--signature_def serving_default '
-             '--inputs input1_key=/tmp/124.npz[x],input2_key=/tmp/123.npy'
-             '--input_exprs \'input3_key=np.ones(2)\' --outdir=/out\n\n'
+             '--inputs input1_key=/tmp/124.npz[x],input2_key=/tmp/123.npy '
+             '--input_exprs \'input3_key=np.ones(2)\' --input_examples '
+             '\'input4_key=[{"id":[26],"weights":[0.5, 0.5]}]\' '
+             '--outdir=/out\n\n'
              'For more information about input file format, please see:\n'
              'https://www.tensorflow.org/programmers_guide/saved_model_cli\n')
   parser_run = subparsers.add_parser(
@@ -618,8 +691,14 @@ def create_parser():
   msg = ('Specifying inputs by python expressions, in the format of'
          ' "<input_key>=\'<python expression>\'", separated by \';\'. '
          'numpy module is available as \'np\'. '
-         'Will override duplicate input_keys from --inputs option.')
+         'Will override duplicate input keys from --inputs option.')
   parser_run.add_argument('--input_exprs', type=str, default='', help=msg)
+  msg = (
+      'Specifying tf.Example inputs as list of dictionaries. For example: '
+      '<input_key>=[{feature0:value_list,feature1:value_list}]. Use ";" to '
+      'separate input keys. Will override duplicate input keys from --inputs '
+      'and --input_exprs option.')
+  parser_run.add_argument('--input_examples', type=str, default='', help=msg)
   parser_run.add_argument(
       '--outdir',
       type=str,
diff --git a/tensorflow/python/tools/saved_model_cli_test.py b/tensorflow/python/tools/saved_model_cli_test.py
index a55cf168b23e8fc4efeb5175e3c01cad1a68fa57..d6cbc49ba1e08a6b808b228fb8d69fc14f36e3d2 100644
--- a/tensorflow/python/tools/saved_model_cli_test.py
+++ b/tensorflow/python/tools/saved_model_cli_test.py
@@ -28,6 +28,8 @@ import sys
 import numpy as np
 from six import StringIO
 
+from tensorflow.core.framework import types_pb2
+from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.python.debug.wrappers import local_cli_wrapper
 from tensorflow.python.platform import test
 from tensorflow.python.tools import saved_model_cli
@@ -200,6 +202,14 @@ Method name is: tensorflow/serving/predict"""
     self.assertEqual(output, expected_output)
     self.assertEqual(err.getvalue().strip(), '')
 
+  def testPrintREFTypeTensor(self):
+    ref_tensor_info = meta_graph_pb2.TensorInfo()
+    ref_tensor_info.dtype = types_pb2.DT_FLOAT_REF
+    with captured_output() as (out, err):
+      saved_model_cli._print_tensor_info(ref_tensor_info)
+    self.assertTrue('DT_FLOAT_REF' in out.getvalue().strip())
+    self.assertEqual(err.getvalue().strip(), '')
+
   def testInputPreProcessFormats(self):
     input_str = 'input1=/path/file.txt[ab3];input2=file2'
     input_expr_str = 'input3=np.zeros([2,2]);input4=[4,5]'
@@ -208,8 +218,9 @@ Method name is: tensorflow/serving/predict"""
         input_expr_str)
     self.assertTrue(input_dict['input1'] == ('/path/file.txt', 'ab3'))
     self.assertTrue(input_dict['input2'] == ('file2', None))
-    self.assertTrue(input_expr_dict['input3'] == 'np.zeros([2,2])')
-    self.assertTrue(input_expr_dict['input4'] == '[4,5]')
+    print(input_expr_dict['input3'])
+    self.assertAllClose(input_expr_dict['input3'], np.zeros([2, 2]))
+    self.assertAllClose(input_expr_dict['input4'], [4, 5])
     self.assertTrue(len(input_dict) == 2)
     self.assertTrue(len(input_expr_dict) == 2)
 
@@ -217,7 +228,6 @@ Method name is: tensorflow/serving/predict"""
     input_str = (r'inputx=C:\Program Files\data.npz[v:0];'
                  r'input:0=c:\PROGRA~1\data.npy')
     input_dict = saved_model_cli.preprocess_inputs_arg_string(input_str)
-    print(input_dict)
     self.assertTrue(input_dict['inputx'] == (r'C:\Program Files\data.npz',
                                              'v:0'))
     self.assertTrue(input_dict['input:0'] == (r'c:\PROGRA~1\data.npy', None))
@@ -241,7 +251,8 @@ Method name is: tensorflow/serving/predict"""
     np.save(input0_path, x0)
     np.save(input1_path, x1)
     input_str = 'x0=' + input0_path + '[x0];x1=' + input1_path
-    feed_dict = saved_model_cli.load_inputs_from_input_arg_string(input_str, '')
+    feed_dict = saved_model_cli.load_inputs_from_input_arg_string(
+        input_str, '', '')
     self.assertTrue(np.all(feed_dict['x0'] == x0))
     self.assertTrue(np.all(feed_dict['x1'] == x1))
 
@@ -250,7 +261,8 @@ Method name is: tensorflow/serving/predict"""
     input_path = os.path.join(test.get_temp_dir(), 'input.npz')
     np.savez(input_path, a=x0)
     input_str = 'x=' + input_path + '[a];y=' + input_path
-    feed_dict = saved_model_cli.load_inputs_from_input_arg_string(input_str, '')
+    feed_dict = saved_model_cli.load_inputs_from_input_arg_string(
+        input_str, '', '')
     self.assertTrue(np.all(feed_dict['x'] == x0))
     self.assertTrue(np.all(feed_dict['y'] == x0))
 
@@ -269,7 +281,8 @@ Method name is: tensorflow/serving/predict"""
       pickle.dump(pkl2, f)
     input_str = 'x=' + input_path0 + '[b];y=' + input_path1 + '[c];'
     input_str += 'z=' + input_path2
-    feed_dict = saved_model_cli.load_inputs_from_input_arg_string(input_str, '')
+    feed_dict = saved_model_cli.load_inputs_from_input_arg_string(
+        input_str, '', '')
     self.assertTrue(np.all(feed_dict['x'] == pkl0['b']))
     self.assertTrue(np.all(feed_dict['y'] == pkl1))
     self.assertTrue(np.all(feed_dict['z'] == pkl2))
@@ -282,7 +295,7 @@ Method name is: tensorflow/serving/predict"""
     input_expr_str = ('x1=np.ones([2,10]);x2=np.array([[1],[2],[3]]);'
                       'x3=np.mgrid[0:5,0:5];x4=[[3],[4]]')
     feed_dict = saved_model_cli.load_inputs_from_input_arg_string(
-        '', input_expr_str)
+        '', input_expr_str, '')
     self.assertTrue(np.all(feed_dict['x1'] == x1))
     self.assertTrue(np.all(feed_dict['x2'] == x2))
     self.assertTrue(np.all(feed_dict['x3'] == x3))
@@ -296,7 +309,7 @@ Method name is: tensorflow/serving/predict"""
     input_str = 'x0=' + input_path + '[a]'
     input_expr_str = 'x1=np.ones([2,10])'
     feed_dict = saved_model_cli.load_inputs_from_input_arg_string(
-        input_str, input_expr_str)
+        input_str, input_expr_str, '')
     self.assertTrue(np.all(feed_dict['x0'] == x0))
     self.assertTrue(np.all(feed_dict['x1'] == x1))
 
@@ -308,7 +321,7 @@ Method name is: tensorflow/serving/predict"""
     input_str = 'x0=' + input_path + '[a]'
     input_expr_str = 'x0=np.ones([2,10])'
     feed_dict = saved_model_cli.load_inputs_from_input_arg_string(
-        input_str, input_expr_str)
+        input_str, input_expr_str, '')
     self.assertTrue(np.all(feed_dict['x0'] == x1))
 
   def testInputParserErrorNoName(self):
@@ -318,7 +331,7 @@ Method name is: tensorflow/serving/predict"""
     np.savez(input_path, a=x0, b=x1)
     input_str = 'x=' + input_path
     with self.assertRaises(RuntimeError):
-      saved_model_cli.load_inputs_from_input_arg_string(input_str, '')
+      saved_model_cli.load_inputs_from_input_arg_string(input_str, '', '')
 
   def testInputParserErrorWrongName(self):
     x0 = np.array([[1], [2]])
@@ -327,7 +340,22 @@ Method name is: tensorflow/serving/predict"""
     np.savez(input_path, a=x0, b=x1)
     input_str = 'x=' + input_path + '[c]'
     with self.assertRaises(RuntimeError):
-      saved_model_cli.load_inputs_from_input_arg_string(input_str, '')
+      saved_model_cli.load_inputs_from_input_arg_string(input_str, '', '')
+
+  def testRunCommandInputExamples(self):
+    self.parser = saved_model_cli.create_parser()
+    base_path = test.test_src_dir_path(SAVED_MODEL_PATH)
+    output_dir = os.path.join(test.get_temp_dir(), 'new_dir')
+    args = self.parser.parse_args([
+        'run', '--dir', base_path, '--tag_set', 'serve', '--signature_def',
+        'regress_x_to_y', '--input_examples',
+        'inputs=[{"x":[8.0],"x2":[5.0]}, {"x":[4.0],"x2":[3.0]}]', '--outdir',
+        output_dir
+    ])
+    saved_model_cli.run(args)
+    y_actual = np.load(os.path.join(output_dir, 'outputs.npy'))
+    y_expected = np.array([[6.0], [4.0]])
+    self.assertAllEqual(y_expected, y_actual)
 
   def testRunCommandExistingOutdir(self):
     self.parser = saved_model_cli.create_parser()
@@ -401,6 +429,42 @@ Method name is: tensorflow/serving/predict"""
     with self.assertRaises(ValueError):
       saved_model_cli.run(args)
 
+  def testRunCommandInputExamplesNotListError(self):
+    self.parser = saved_model_cli.create_parser()
+    base_path = test.test_src_dir_path(SAVED_MODEL_PATH)
+    output_dir = os.path.join(test.get_temp_dir(), 'new_dir')
+    args = self.parser.parse_args([
+        'run', '--dir', base_path, '--tag_set', 'serve', '--signature_def',
+        'regress_x_to_y', '--input_examples', 'inputs={"x":8.0,"x2":5.0}',
+        '--outdir', output_dir
+    ])
+    with self.assertRaisesRegexp(ValueError, 'must be a list'):
+      saved_model_cli.run(args)
+
+  def testRunCommandInputExamplesFeatureValueNotListError(self):
+    self.parser = saved_model_cli.create_parser()
+    base_path = test.test_src_dir_path(SAVED_MODEL_PATH)
+    output_dir = os.path.join(test.get_temp_dir(), 'new_dir')
+    args = self.parser.parse_args([
+        'run', '--dir', base_path, '--tag_set', 'serve', '--signature_def',
+        'regress_x_to_y', '--input_examples', 'inputs=[{"x":8.0,"x2":5.0}]',
+        '--outdir', output_dir
+    ])
+    with self.assertRaisesRegexp(ValueError, 'feature value must be a list'):
+      saved_model_cli.run(args)
+
+  def testRunCommandInputExamplesFeatureBadType(self):
+    self.parser = saved_model_cli.create_parser()
+    base_path = test.test_src_dir_path(SAVED_MODEL_PATH)
+    output_dir = os.path.join(test.get_temp_dir(), 'new_dir')
+    args = self.parser.parse_args([
+        'run', '--dir', base_path, '--tag_set', 'serve', '--signature_def',
+        'regress_x_to_y', '--input_examples', 'inputs=[{"x":[[1],[2]]}]',
+        '--outdir', output_dir
+    ])
+    with self.assertRaisesRegexp(ValueError, 'is not supported'):
+      saved_model_cli.run(args)
+
   def testRunCommandOutputFileExistError(self):
     self.parser = saved_model_cli.create_parser()
     base_path = test.test_src_dir_path(SAVED_MODEL_PATH)
diff --git a/tensorflow/python/tools/selective_registration_header_lib.py b/tensorflow/python/tools/selective_registration_header_lib.py
index 7f7470994dd75e22be6cbb55c5cfe17ece2e95ad..dc0612bb3f3eca29fd75ed568eded5f582572e19 100644
--- a/tensorflow/python/tools/selective_registration_header_lib.py
+++ b/tensorflow/python/tools/selective_registration_header_lib.py
@@ -54,7 +54,7 @@ def get_ops_and_kernels(proto_fileformat, proto_files, default_ops_str):
       kernel_class = pywrap_tensorflow.TryFindKernelClass(
           node_def.SerializeToString())
       if kernel_class:
-        op_and_kernel = (str(node_def.op), kernel_class.decode('utf-8'))
+        op_and_kernel = (str(node_def.op), str(kernel_class.decode('utf-8')))
         if op_and_kernel not in ops:
           ops.add(op_and_kernel)
       else:
diff --git a/tensorflow/python/training/adadelta.py b/tensorflow/python/training/adadelta.py
index 13c07cfd7bf4333fee3edc3c3ad9d2fb7bcbaad2..c08e3cca007dc17f1112d53bf729c1accf61b5df 100644
--- a/tensorflow/python/training/adadelta.py
+++ b/tensorflow/python/training/adadelta.py
@@ -22,8 +22,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.AdadeltaOptimizer")
 class AdadeltaOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Adadelta algorithm.
 
diff --git a/tensorflow/python/training/adagrad.py b/tensorflow/python/training/adagrad.py
index afa192f7cc6e0ecd629fd94252d26961f1407183..deb4e6f546379eff330235dbc302a30c44193830 100644
--- a/tensorflow/python/training/adagrad.py
+++ b/tensorflow/python/training/adagrad.py
@@ -25,8 +25,10 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.AdagradOptimizer")
 class AdagradOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Adagrad algorithm.
 
diff --git a/tensorflow/python/training/adagrad_da.py b/tensorflow/python/training/adagrad_da.py
index b3f9ea323c2bb4fd9ecee93863fbc7955b47a947..5ba403554f570d9df33a5d525a40de2eb0d11138 100644
--- a/tensorflow/python/training/adagrad_da.py
+++ b/tensorflow/python/training/adagrad_da.py
@@ -23,8 +23,10 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.AdagradDAOptimizer")
 class AdagradDAOptimizer(optimizer.Optimizer):
   """Adagrad Dual Averaging algorithm for sparse linear models.
 
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 266f5563e0c738fe73e3a771a46e9b28c266cd73..c92f6fc3015960a2b821651231bb94713e0d53dd 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -24,11 +24,12 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.AdamOptimizer")
 class AdamOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Adam algorithm.
 
@@ -101,19 +102,16 @@ class AdamOptimizer(optimizer.Optimizer):
     self._beta2_t = None
     self._epsilon_t = None
 
-    # Variables to accumulate the powers of the beta parameters.
-    # Created in _create_slots when we know the variables to optimize.
-    self._beta1_power = None
-    self._beta2_power = None
-
     # Created in SparseApply if needed.
     self._updated_lr = None
 
   def _get_beta_accumulators(self):
-    return self._beta1_power, self._beta2_power
-
-  def _non_slot_variables(self):
-    return self._get_beta_accumulators()
+    if context.in_graph_mode():
+      graph = ops.get_default_graph()
+    else:
+      graph = None
+    return (self._get_non_slot_variable("beta1_power", graph=graph),
+            self._get_non_slot_variable("beta2_power", graph=graph))
 
   def _create_slots(self, var_list):
     # Create the beta1 and beta2 accumulators on the same device as the first
@@ -121,19 +119,13 @@ class AdamOptimizer(optimizer.Optimizer):
     # workers (these need to go on the same PS, otherwise some updates are
     # silently ignored).
     first_var = min(var_list, key=lambda x: x.name)
+    self._create_non_slot_variable(initial_value=self._beta1,
+                                   name="beta1_power",
+                                   colocate_with=first_var)
+    self._create_non_slot_variable(initial_value=self._beta2,
+                                   name="beta2_power",
+                                   colocate_with=first_var)
 
-    create_new = self._beta1_power is None
-    if not create_new and context.in_graph_mode():
-      create_new = (self._beta1_power.graph is not first_var.graph)
-
-    if create_new:
-      with ops.colocate_with(first_var):
-        self._beta1_power = variable_scope.variable(self._beta1,
-                                                    name="beta1_power",
-                                                    trainable=False)
-        self._beta2_power = variable_scope.variable(self._beta2,
-                                                    name="beta2_power",
-                                                    trainable=False)
     # Create slots for the first and second moments.
     for v in var_list:
       self._zeros_slot(v, "m", self._name)
@@ -148,10 +140,11 @@ class AdamOptimizer(optimizer.Optimizer):
   def _apply_dense(self, grad, var):
     m = self.get_slot(var, "m")
     v = self.get_slot(var, "v")
+    beta1_power, beta2_power = self._get_beta_accumulators()
     return training_ops.apply_adam(
         var, m, v,
-        math_ops.cast(self._beta1_power, var.dtype.base_dtype),
-        math_ops.cast(self._beta2_power, var.dtype.base_dtype),
+        math_ops.cast(beta1_power, var.dtype.base_dtype),
+        math_ops.cast(beta2_power, var.dtype.base_dtype),
         math_ops.cast(self._lr_t, var.dtype.base_dtype),
         math_ops.cast(self._beta1_t, var.dtype.base_dtype),
         math_ops.cast(self._beta2_t, var.dtype.base_dtype),
@@ -161,10 +154,11 @@ class AdamOptimizer(optimizer.Optimizer):
   def _resource_apply_dense(self, grad, var):
     m = self.get_slot(var, "m")
     v = self.get_slot(var, "v")
+    beta1_power, beta2_power = self._get_beta_accumulators()
     return training_ops.resource_apply_adam(
         var.handle, m.handle, v.handle,
-        math_ops.cast(self._beta1_power, grad.dtype.base_dtype),
-        math_ops.cast(self._beta2_power, grad.dtype.base_dtype),
+        math_ops.cast(beta1_power, grad.dtype.base_dtype),
+        math_ops.cast(beta2_power, grad.dtype.base_dtype),
         math_ops.cast(self._lr_t, grad.dtype.base_dtype),
         math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
         math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
@@ -172,8 +166,9 @@ class AdamOptimizer(optimizer.Optimizer):
         grad, use_locking=self._use_locking)
 
   def _apply_sparse_shared(self, grad, var, indices, scatter_add):
-    beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
-    beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
+    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
     lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
     beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
     beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
@@ -217,12 +212,11 @@ class AdamOptimizer(optimizer.Optimizer):
   def _finish(self, update_ops, name_scope):
     # Update the power accumulators.
     with ops.control_dependencies(update_ops):
-      with ops.colocate_with(self._beta1_power):
-        update_beta1 = self._beta1_power.assign(
-            self._beta1_power * self._beta1_t,
-            use_locking=self._use_locking)
-        update_beta2 = self._beta2_power.assign(
-            self._beta2_power * self._beta2_t,
-            use_locking=self._use_locking)
+      beta1_power, beta2_power = self._get_beta_accumulators()
+      with ops.colocate_with(beta1_power):
+        update_beta1 = beta1_power.assign(
+            beta1_power * self._beta1_t, use_locking=self._use_locking)
+        update_beta2 = beta2_power.assign(
+            beta2_power * self._beta2_t, use_locking=self._use_locking)
     return control_flow_ops.group(*update_ops + [update_beta1, update_beta2],
                                   name=name_scope)
diff --git a/tensorflow/python/training/adam_test.py b/tensorflow/python/training/adam_test.py
index 0d534db60dc92443d2795e751a574018bc03f612..a521f1299e035424d1c3897a469655db732b0dcd 100644
--- a/tensorflow/python/training/adam_test.py
+++ b/tensorflow/python/training/adam_test.py
@@ -174,8 +174,11 @@ class AdamOptimizerTest(test.TestCase):
         opt = adam.AdamOptimizer()
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         opt_variables = opt.variables()
-        self.assertIn(opt._beta1_power, opt_variables)
-        self.assertIn(opt._beta2_power, opt_variables)
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+        self.assertTrue(beta1_power is not None)
+        self.assertTrue(beta2_power is not None)
+        self.assertIn(beta1_power, opt_variables)
+        self.assertIn(beta2_power, opt_variables)
 
         with ops.Graph().as_default():
           # Shouldn't return non-slot variables from other graphs.
@@ -207,6 +210,9 @@ class AdamOptimizerTest(test.TestCase):
           # Validate updated params
           self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
           self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+          if use_resource:
+            self.assertEqual("var0_%d/Adam:0" % (i,),
+                             opt.get_slot(var=var0, name="m").name)
 
   def testBasic(self):
     with self.test_session():
diff --git a/tensorflow/python/training/basic_loops.py b/tensorflow/python/training/basic_loops.py
index 52b0f4210612bad4a2e838153ac9cbdb1023bf66..7af821c81928e67e0f258bc064d582a4186995c1 100644
--- a/tensorflow/python/training/basic_loops.py
+++ b/tensorflow/python/training/basic_loops.py
@@ -18,8 +18,10 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import errors
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.basic_train_loop")
 def basic_train_loop(supervisor, train_step_fn, args=None,
                      kwargs=None, master=""):
   """Basic loop to train a model.
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index 1fb00343ef23d6b6dc9ca41f4868f0a7d80feb7c..aae757b99aa9abb2fca112dcc781fc31e367649d 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -47,6 +47,7 @@ from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training_util
 from tensorflow.python.training.session_run_hook import SessionRunArgs
 from tensorflow.python.training.summary_io import SummaryWriterCache
+from tensorflow.python.util.tf_export import tf_export
 
 
 class _HookTimer(object):
@@ -85,6 +86,7 @@ class _HookTimer(object):
     raise NotImplementedError
 
 
+@tf_export("train.SecondOrStepTimer")
 class SecondOrStepTimer(_HookTimer):
   """Timer that triggers at most once every N seconds or once every N steps.
   """
@@ -164,6 +166,7 @@ class NeverTriggerTimer(_HookTimer):
     return None
 
 
+@tf_export("train.LoggingTensorHook")
 class LoggingTensorHook(session_run_hook.SessionRunHook):
   """Prints the given tensors every N local steps, every N seconds, or at end.
 
@@ -262,6 +265,7 @@ class LoggingTensorHook(session_run_hook.SessionRunHook):
       self._log_tensors(values)
 
 
+@tf_export("train.StopAtStepHook")
 class StopAtStepHook(session_run_hook.SessionRunHook):
   """Hook that requests stop at a specified step."""
 
@@ -317,6 +321,7 @@ class StopAtStepHook(session_run_hook.SessionRunHook):
         run_context.request_stop()
 
 
+@tf_export("train.CheckpointSaverListener")
 class CheckpointSaverListener(object):
   """Interface for listeners that take action before or after checkpoint save.
 
@@ -331,7 +336,7 @@ class CheckpointSaverListener(object):
   `CheckpointSaverHook`, as in this example:
 
   ```python
-  class ExampleCheckpointSaverListerner(CheckpointSaverListener):
+  class ExampleCheckpointSaverListener(CheckpointSaverListener):
     def begin(self):
       # You can add ops to the graph here.
       print('Starting the session.')
@@ -347,7 +352,7 @@ class CheckpointSaverListener(object):
       print('Done with the session.')
 
   ...
-  listener = ExampleCheckpointSaverListerner()
+  listener = ExampleCheckpointSaverListener()
   saver_hook = tf.train.CheckpointSaverHook(
       checkpoint_dir, listeners=[listener])
   with tf.train.MonitoredTrainingSession(chief_only_hooks=[saver_hook]):
@@ -375,6 +380,7 @@ class CheckpointSaverListener(object):
     pass
 
 
+@tf_export("train.CheckpointSaverHook")
 class CheckpointSaverHook(session_run_hook.SessionRunHook):
   """Saves checkpoints every N steps or seconds."""
 
@@ -497,6 +503,7 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
     return savers[0]
 
 
+@tf_export("train.StepCounterHook")
 class StepCounterHook(session_run_hook.SessionRunHook):
   """Hook that counts steps per second."""
 
@@ -514,6 +521,8 @@ class StepCounterHook(session_run_hook.SessionRunHook):
 
     self._summary_writer = summary_writer
     self._output_dir = output_dir
+    self._last_global_step = None
+    self._global_step_check_count = 0
 
   def begin(self):
     if self._summary_writer is None and self._output_dir:
@@ -527,6 +536,14 @@ class StepCounterHook(session_run_hook.SessionRunHook):
   def before_run(self, run_context):  # pylint: disable=unused-argument
     return SessionRunArgs(self._global_step_tensor)
 
+  def _log_and_record(self, elapsed_steps, elapsed_time, global_step):
+    steps_per_sec = elapsed_steps / elapsed_time
+    if self._summary_writer is not None:
+      summary = Summary(value=[Summary.Value(
+          tag=self._summary_tag, simple_value=steps_per_sec)])
+      self._summary_writer.add_summary(summary, global_step)
+    logging.info("%s: %g", self._summary_tag, steps_per_sec)
+
   def after_run(self, run_context, run_values):
     _ = run_context
 
@@ -538,20 +555,41 @@ class StepCounterHook(session_run_hook.SessionRunHook):
         elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(
             global_step)
         if elapsed_time is not None:
-          steps_per_sec = elapsed_steps / elapsed_time
-          if self._summary_writer is not None:
-            summary = Summary(value=[Summary.Value(
-                tag=self._summary_tag, simple_value=steps_per_sec)])
-            self._summary_writer.add_summary(summary, global_step)
-          logging.info("%s: %g", self._summary_tag, steps_per_sec)
+          self._log_and_record(elapsed_steps, elapsed_time, global_step)
+
+    # Check whether the global step has been increased. Here, we do not use the
+    # timer.last_triggered_step as the timer might record a different global
+    # step value such that the comparison could be unreliable. For simplicity,
+    # we just compare the stale_global_step with previously recorded version.
+    if stale_global_step == self._last_global_step:
+      # Here, we use a counter to count how many times we have observed that the
+      # global step has not been increased. For some Optimizers, the global step
+      # is not increased each time by design. For example, SyncReplicaOptimizer
+      # doesn't increase the global step in worker's main train step.
+      self._global_step_check_count += 1
+      if self._global_step_check_count % 20 == 0:
+        self._global_step_check_count = 0
+        logging.warning(
+            "It seems that global step (tf.train.get_global_step) has not "
+            "been increased. Current value (could be stable): %s vs previous "
+            "value: %s. You could increase the global step by passing "
+            "tf.train.get_global_step() to Optimizer.apply_gradients or "
+            "Optimizer.minimize.", stale_global_step, self._last_global_step)
+    else:
+      # Whenever we observe the increment, reset the counter.
+      self._global_step_check_count = 0
+
+    self._last_global_step = stale_global_step
 
 
+@tf_export("train.NanLossDuringTrainingError")
 class NanLossDuringTrainingError(RuntimeError):
 
   def __str__(self):
     return "NaN loss during training."
 
 
+@tf_export("train.NanTensorHook")
 class NanTensorHook(session_run_hook.SessionRunHook):
   """Monitors the loss tensor and stops training if loss is NaN.
 
@@ -583,6 +621,7 @@ class NanTensorHook(session_run_hook.SessionRunHook):
         run_context.request_stop()
 
 
+@tf_export("train.SummarySaverHook")
 class SummarySaverHook(session_run_hook.SessionRunHook):
   """Saves summaries every N steps."""
 
@@ -691,6 +730,7 @@ class SummarySaverHook(session_run_hook.SessionRunHook):
     return summary_op
 
 
+@tf_export("train.GlobalStepWaiterHook")
 class GlobalStepWaiterHook(session_run_hook.SessionRunHook):
   """Delays execution until global step reaches `wait_until_step`.
 
@@ -738,6 +778,7 @@ class GlobalStepWaiterHook(session_run_hook.SessionRunHook):
       time.sleep(0.5)
 
 
+@tf_export("train.FinalOpsHook")
 class FinalOpsHook(session_run_hook.SessionRunHook):
   """A hook which evaluates `Tensors` at the end of a session."""
 
@@ -764,6 +805,7 @@ class FinalOpsHook(session_run_hook.SessionRunHook):
                                            feed_dict=self._final_ops_feed_dict)
 
 
+@tf_export("train.FeedFnHook")
 class FeedFnHook(session_run_hook.SessionRunHook):
   """Runs `feed_fn` and sets the `feed_dict` accordingly."""
 
@@ -781,6 +823,7 @@ class FeedFnHook(session_run_hook.SessionRunHook):
         fetches=None, feed_dict=self.feed_fn())
 
 
+@tf_export("train.ProfilerHook")
 class ProfilerHook(session_run_hook.SessionRunHook):
   """Captures CPU/GPU profiling information every N steps or seconds.
 
diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py
index e7ff7e12211ae57a8589c799efbf9eab3b3fe5da..2547661e5250e94136a100aa8c30c9dbb7455018 100644
--- a/tensorflow/python/training/basic_session_run_hooks_test.py
+++ b/tensorflow/python/training/basic_session_run_hooks_test.py
@@ -780,9 +780,12 @@ class StepCounterHookTest(test.TestCase):
       hook.begin()
       sess.run(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
-      for _ in range(30):
-        time.sleep(0.01)
-        mon_sess.run(train_op)
+      with test.mock.patch.object(tf_logging, 'warning') as mock_log:
+        for _ in range(30):
+          time.sleep(0.01)
+          mon_sess.run(train_op)
+        # logging.warning should not be called.
+        self.assertIsNone(mock_log.call_args)
       hook.end(sess)
       summary_writer.assert_summaries(
           test_case=self,
@@ -857,6 +860,24 @@ class StepCounterHookTest(test.TestCase):
       summary_value = summary_writer.summaries[2][0].value[0]
       self.assertEqual('bar/foo/sec', summary_value.tag)
 
+  def test_log_warning_if_global_step_not_increased(self):
+    with ops.Graph().as_default(), session_lib.Session() as sess:
+      variables.get_or_create_global_step()
+      train_op = training_util._increment_global_step(0)  # keep same.
+      sess.run(variables_lib.global_variables_initializer())
+      hook = basic_session_run_hooks.StepCounterHook(
+          every_n_steps=1, every_n_secs=None)
+      hook.begin()
+      mon_sess = monitored_session._HookedSession(sess, [hook])
+      mon_sess.run(train_op)  # Run one step to record global step.
+      with test.mock.patch.object(tf_logging, 'warning') as mock_log:
+        for _ in range(30):
+          mon_sess.run(train_op)
+        self.assertRegexpMatches(
+            str(mock_log.call_args),
+            'global step.*has not been increased')
+      hook.end(sess)
+
 
 class SummarySaverHookTest(test.TestCase):
 
diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index 5054873bc1c7751e6164a868b91b8ef7be0a5c79..fa3de6fad27b6cc773f9f2e86e9f95395eb7c285 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import saver
+from tensorflow.python.util.tf_export import tf_export
 
 
 __all__ = [
@@ -36,6 +37,7 @@ __all__ = [
 ]
 
 
+@tf_export("train.load_checkpoint")
 def load_checkpoint(ckpt_dir_or_file):
   """Returns `CheckpointReader` for checkpoint found in `ckpt_dir_or_file`.
 
@@ -60,6 +62,7 @@ def load_checkpoint(ckpt_dir_or_file):
   return pywrap_tensorflow.NewCheckpointReader(filename)
 
 
+@tf_export("train.load_variable")
 def load_variable(ckpt_dir_or_file, name):
   """Returns the tensor value of the given variable in the checkpoint.
 
@@ -77,6 +80,7 @@ def load_variable(ckpt_dir_or_file, name):
   return reader.get_tensor(name)
 
 
+@tf_export("train.list_variables")
 def list_variables(ckpt_dir_or_file):
   """Returns list of all variables in the checkpoint.
 
@@ -95,6 +99,7 @@ def list_variables(ckpt_dir_or_file):
   return result
 
 
+@tf_export("train.init_from_checkpoint")
 def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
   """Initializes current variables with tensors loaded from given checkpoint.
 
@@ -176,7 +181,8 @@ def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
   ckpt_file = _get_checkpoint_filename(ckpt_dir_or_file)
   reader = load_checkpoint(ckpt_dir_or_file)
   variable_map = reader.get_variable_to_shape_map()
-  for tensor_name_in_ckpt, current_var_or_name in six.iteritems(assignment_map):
+  for tensor_name_in_ckpt, current_var_or_name in sorted(
+      six.iteritems(assignment_map)):
     var = None
     # Check if this is Variable object or list of Variable objects (in case of
     # partitioned variables).
@@ -233,7 +239,7 @@ def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
           if "/part_" in var_name:
             var_name = var_name[:var_name.index("/part_")]
           scope_variables.add(var_name)
-      for var_name in scope_variables:
+      for var_name in sorted(scope_variables):
         # Lookup name with specified prefix and suffix from current variable.
         # If tensor_name given is '/' (root), don't use it for full name.
         full_tensor_name = var_name[len(scopes):]
@@ -241,6 +247,9 @@ def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
           full_tensor_name = full_tensor_name[1:]
         if tensor_name_in_ckpt != "/":
           full_tensor_name = tensor_name_in_ckpt + full_tensor_name
+        # Remove trailing '/', if any, in the full_tensor_name
+        if full_tensor_name.endswith("/"):
+          full_tensor_name = full_tensor_name[:-1]
         if full_tensor_name not in variable_map:
           raise ValueError(
               "Tensor %s (%s in %s) is not found in %s checkpoint" % (
diff --git a/tensorflow/python/training/checkpoint_utils_test.py b/tensorflow/python/training/checkpoint_utils_test.py
index 8dbc980b6bf1d594a84613fca3368e00acb9e958..cd17faa040d5b85263b54bc53100b18f736a12e0 100644
--- a/tensorflow/python/training/checkpoint_utils_test.py
+++ b/tensorflow/python/training/checkpoint_utils_test.py
@@ -143,7 +143,7 @@ class CheckpointsTest(test.TestCase):
         self.assertAllEqual(my4.eval(session), v4)
 
         # Check that tensors are not explicitly in the graph.
-        self.assertLess(len(str(session.graph.as_graph_def())), 28000)
+        self.assertLess(len(str(session.graph.as_graph_def())), 29000)
 
   def testInitWithScopeDoesNotCaptureSuffixes(self):
     checkpoint_dir = self.get_temp_dir()
diff --git a/tensorflow/python/training/coordinator.py b/tensorflow/python/training/coordinator.py
index 0e31255b74f64657cffc4a2f58798835513f0444..0ff97d85e37e6167f1200ba56940f4a663c259a2 100644
--- a/tensorflow/python/training/coordinator.py
+++ b/tensorflow/python/training/coordinator.py
@@ -27,8 +27,10 @@ import six
 from tensorflow.python.framework import errors
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.Coordinator")
 class Coordinator(object):
   """A coordinator for threads.
 
@@ -406,6 +408,7 @@ class Coordinator(object):
 
 
 # Threads for the standard services.
+@tf_export("train.LooperThread")
 class LooperThread(threading.Thread):
   """A thread that runs code repeatedly, optionally on a timer.
 
diff --git a/tensorflow/python/training/coordinator_test.py b/tensorflow/python/training/coordinator_test.py
index 149d3eed414d53f46dcab403b7b4822ffa66e644..3e4ac1dfff9708fd1a5cd8bdf23f99d8f963bd16 100644
--- a/tensorflow/python/training/coordinator_test.py
+++ b/tensorflow/python/training/coordinator_test.py
@@ -85,8 +85,8 @@ class CoordinatorTest(test.TestCase):
     self.assertFalse(coord.wait_for_stop(0.1))
     wait_for_stop_ev = threading.Event()
     has_stopped_ev = threading.Event()
-    t = threading.Thread(target=StopOnEvent,
-                         args=(coord, wait_for_stop_ev, has_stopped_ev))
+    t = threading.Thread(
+        target=StopOnEvent, args=(coord, wait_for_stop_ev, has_stopped_ev))
     t.start()
     self.assertFalse(coord.should_stop())
     self.assertFalse(coord.wait_for_stop(0.01))
@@ -100,7 +100,8 @@ class CoordinatorTest(test.TestCase):
     threads = [
         threading.Thread(target=SleepABit, args=(0.01,)),
         threading.Thread(target=SleepABit, args=(0.02,)),
-        threading.Thread(target=SleepABit, args=(0.01,))]
+        threading.Thread(target=SleepABit, args=(0.01,))
+    ]
     for t in threads:
       t.start()
     coord.join(threads)
@@ -112,7 +113,8 @@ class CoordinatorTest(test.TestCase):
     threads = [
         threading.Thread(target=SleepABit, args=(0.01, coord)),
         threading.Thread(target=SleepABit, args=(0.02, coord)),
-        threading.Thread(target=SleepABit, args=(0.01, coord))]
+        threading.Thread(target=SleepABit, args=(0.01, coord))
+    ]
     for t in threads:
       t.start()
     WaitForThreadsToRegister(coord, 3)
@@ -125,7 +127,8 @@ class CoordinatorTest(test.TestCase):
     threads = [
         threading.Thread(target=SleepABit, args=(0.01, coord)),
         threading.Thread(target=SleepABit, args=(0.02,)),
-        threading.Thread(target=SleepABit, args=(0.01, coord))]
+        threading.Thread(target=SleepABit, args=(0.01, coord))
+    ]
     for t in threads:
       t.start()
     WaitForThreadsToRegister(coord, 2)
@@ -135,14 +138,17 @@ class CoordinatorTest(test.TestCase):
       self.assertFalse(t.is_alive())
 
   def testJoinGraceExpires(self):
+
     def TestWithGracePeriod(stop_grace_period):
       coord = coordinator.Coordinator()
       wait_for_stop_ev = threading.Event()
       has_stopped_ev = threading.Event()
       threads = [
-          threading.Thread(target=StopOnEvent,
-                           args=(coord, wait_for_stop_ev, has_stopped_ev)),
-          threading.Thread(target=SleepABit, args=(10.0,))]
+          threading.Thread(
+              target=StopOnEvent,
+              args=(coord, wait_for_stop_ev, has_stopped_ev)),
+          threading.Thread(target=SleepABit, args=(10.0,))
+      ]
       for t in threads:
         t.daemon = True
         t.start()
@@ -150,6 +156,7 @@ class CoordinatorTest(test.TestCase):
       has_stopped_ev.wait()
       with self.assertRaisesRegexp(RuntimeError, "threads still running"):
         coord.join(threads, stop_grace_period_secs=stop_grace_period)
+
     TestWithGracePeriod(1e-10)
     TestWithGracePeriod(0.002)
     TestWithGracePeriod(1.0)
@@ -159,16 +166,16 @@ class CoordinatorTest(test.TestCase):
     wait_for_stop_ev = threading.Event()
     has_stopped_ev = threading.Event()
     threads = [
-        threading.Thread(target=StopOnEvent,
-                         args=(coord, wait_for_stop_ev, has_stopped_ev)),
-        threading.Thread(target=SleepABit, args=(10.0,))]
+        threading.Thread(
+            target=StopOnEvent, args=(coord, wait_for_stop_ev, has_stopped_ev)),
+        threading.Thread(target=SleepABit, args=(10.0,))
+    ]
     for t in threads:
       t.daemon = True
       t.start()
     wait_for_stop_ev.set()
     has_stopped_ev.wait()
-    coord.join(
-        threads, stop_grace_period_secs=1., ignore_live_threads=True)
+    coord.join(threads, stop_grace_period_secs=1., ignore_live_threads=True)
 
   def testJoinRaiseReportExcInfo(self):
     coord = coordinator.Coordinator()
@@ -180,7 +187,8 @@ class CoordinatorTest(test.TestCase):
             args=(coord, ev_1, ev_2, RuntimeError("First"), False)),
         threading.Thread(
             target=RaiseOnEvent,
-            args=(coord, ev_2, None, RuntimeError("Too late"), False))]
+            args=(coord, ev_2, None, RuntimeError("Too late"), False))
+    ]
     for t in threads:
       t.start()
 
@@ -199,7 +207,8 @@ class CoordinatorTest(test.TestCase):
             args=(coord, ev_1, ev_2, RuntimeError("First"), True)),
         threading.Thread(
             target=RaiseOnEvent,
-            args=(coord, ev_2, None, RuntimeError("Too late"), True))]
+            args=(coord, ev_2, None, RuntimeError("Too late"), True))
+    ]
     for t in threads:
       t.start()
 
@@ -214,9 +223,8 @@ class CoordinatorTest(test.TestCase):
         threading.Thread(
             target=RaiseOnEvent,
             args=(coord, ev_1, None,
-                  errors_impl.OutOfRangeError(None, None, "First"),
-                  True))
-        ]
+                  errors_impl.OutOfRangeError(None, None, "First"), True))
+    ]
     for t in threads:
       t.start()
 
@@ -230,7 +238,7 @@ class CoordinatorTest(test.TestCase):
         threading.Thread(
             target=RaiseOnEvent,
             args=(coord, ev_1, None, ValueError("Clean stop"), True))
-        ]
+    ]
     for t in threads:
       t.start()
 
@@ -247,7 +255,8 @@ class CoordinatorTest(test.TestCase):
             args=(coord, ev_1, ev_2, RuntimeError("First"))),
         threading.Thread(
             target=RaiseOnEventUsingContextHandler,
-            args=(coord, ev_2, None, RuntimeError("Too late")))]
+            args=(coord, ev_2, None, RuntimeError("Too late")))
+    ]
     for t in threads:
       t.start()
 
@@ -262,7 +271,7 @@ class CoordinatorTest(test.TestCase):
         threading.Thread(
             target=RaiseOnEvent,
             args=(coord, ev_1, None, RuntimeError("First"), True)),
-        ]
+    ]
     for t in threads:
       t.start()
 
@@ -274,7 +283,7 @@ class CoordinatorTest(test.TestCase):
         threading.Thread(
             target=RaiseOnEvent,
             args=(coord, ev_1, None, RuntimeError("Second"), True)),
-        ]
+    ]
     for t in threads:
       t.start()
     with self.assertRaisesRegexp(RuntimeError, "Second"):
@@ -337,24 +346,29 @@ class LooperTest(test.TestCase):
   def testTargetArgs(self):
     n = [3]
     coord = coordinator.Coordinator()
-    thread = coordinator.LooperThread.loop(coord, 0, target=_StopAt0,
-                                        args=(coord, n))
+    thread = coordinator.LooperThread.loop(
+        coord, 0, target=_StopAt0, args=(coord, n))
     coord.join([thread])
     self.assertEqual(0, n[0])
 
   def testTargetKwargs(self):
     n = [3]
     coord = coordinator.Coordinator()
-    thread = coordinator.LooperThread.loop(coord, 0, target=_StopAt0,
-                                        kwargs={"coord": coord, "n": n})
+    thread = coordinator.LooperThread.loop(
+        coord, 0, target=_StopAt0, kwargs={
+            "coord": coord,
+            "n": n
+        })
     coord.join([thread])
     self.assertEqual(0, n[0])
 
   def testTargetMixedArgs(self):
     n = [3]
     coord = coordinator.Coordinator()
-    thread = coordinator.LooperThread.loop(coord, 0, target=_StopAt0,
-                                        args=(coord,), kwargs={"n": n})
+    thread = coordinator.LooperThread.loop(
+        coord, 0, target=_StopAt0, args=(coord,), kwargs={
+            "n": n
+        })
     coord.join([thread])
     self.assertEqual(0, n[0])
 
diff --git a/tensorflow/python/training/device_setter.py b/tensorflow/python/training/device_setter.py
index 37ab625779f788b1b8e270a15db3244ea6f1bef3..689088bb41edfd94a1d483ed2b5f7447e9e060e7 100644
--- a/tensorflow/python/training/device_setter.py
+++ b/tensorflow/python/training/device_setter.py
@@ -23,6 +23,7 @@ from tensorflow.core.framework import node_def_pb2
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
+from tensorflow.python.util.tf_export import tf_export
 
 
 class _RoundRobinStrategy(object):
@@ -121,6 +122,7 @@ class _ReplicaDeviceChooser(object):
     return worker_device.to_string()
 
 
+@tf_export("train.replica_device_setter")
 def replica_device_setter(ps_tasks=0, ps_device="/job:ps",
                           worker_device="/job:worker", merge_devices=True,
                           cluster=None, ps_ops=None, ps_strategy=None):
diff --git a/tensorflow/python/training/ftrl.py b/tensorflow/python/training/ftrl.py
index c64a1b3f799e776c7bbbbcfb691bdd97e4a34466..9d02e694db15637126f37ee5575638908b351def 100644
--- a/tensorflow/python/training/ftrl.py
+++ b/tensorflow/python/training/ftrl.py
@@ -22,8 +22,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.FtrlOptimizer")
 class FtrlOptimizer(optimizer.Optimizer):
   """Optimizer that implements the FTRL algorithm.
 
@@ -265,4 +267,3 @@ class FtrlOptimizer(optimizer.Optimizer):
                         grad.dtype),
           math_ops.cast(self._learning_rate_power_tensor, grad.dtype),
           use_locking=self._use_locking)
-
diff --git a/tensorflow/python/training/gradient_descent.py b/tensorflow/python/training/gradient_descent.py
index 5a536e27297f054671e7e44a9e5d20a8b36580b7..380e14e02497fbe3681d6bae03fe9c636c5d13aa 100644
--- a/tensorflow/python/training/gradient_descent.py
+++ b/tensorflow/python/training/gradient_descent.py
@@ -23,8 +23,10 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.GradientDescentOptimizer")
 class GradientDescentOptimizer(optimizer.Optimizer):
   """Optimizer that implements the gradient descent algorithm.
   """
diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py
index 331a51e8bc848917967fed06632fe0d1c5bcad9c..bd9985a7c5c181c0431e0c0a91186bc36b11c787 100644
--- a/tensorflow/python/training/input.py
+++ b/tensorflow/python/training/input.py
@@ -44,6 +44,7 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.summary import summary
 from tensorflow.python.training import queue_runner
+from tensorflow.python.util.tf_export import tf_export
 
 
 # pylint: disable=protected-access
@@ -53,9 +54,12 @@ _restore_sparse = sparse_ops._take_many_sparse_from_tensors_map
 # pylint: enable=protected-access
 
 
+@tf_export("train.match_filenames_once")
 def match_filenames_once(pattern, name=None):
   """Save the list of files matching pattern, so it is only computed once.
 
+  NOTE: The order of the files returned can be non-deterministic.
+
   Args:
     pattern: A file pattern (glob), or 1D tensor of file patterns.
     name: A name for the operations (optional).
@@ -70,6 +74,7 @@ def match_filenames_once(pattern, name=None):
         collections=[ops.GraphKeys.LOCAL_VARIABLES])
 
 
+@tf_export("train.limit_epochs")
 def limit_epochs(tensor, num_epochs=None, name=None):
   """Returns tensor `num_epochs` times and then raises an `OutOfRange` error.
 
@@ -102,6 +107,7 @@ def limit_epochs(tensor, num_epochs=None, name=None):
       return array_ops.identity(tensor, name=name)
 
 
+@tf_export("train.input_producer")
 def input_producer(input_tensor,
                    element_shape=None,
                    num_epochs=None,
@@ -184,6 +190,7 @@ def input_producer(input_tensor,
     return q
 
 
+@tf_export("train.string_input_producer")
 def string_input_producer(string_tensor,
                           num_epochs=None,
                           shuffle=True,
@@ -253,6 +260,7 @@ def string_input_producer(string_tensor,
         cancel_op=cancel_op)
 
 
+@tf_export("train.range_input_producer")
 def range_input_producer(limit, num_epochs=None, shuffle=True, seed=None,
                          capacity=32, shared_name=None, name=None):
   """Produces the integers from 0 to limit-1 in a queue.
@@ -290,6 +298,7 @@ def range_input_producer(limit, num_epochs=None, shuffle=True, seed=None,
         shared_name, "fraction_of_%d_full" % capacity, name)
 
 
+@tf_export("train.slice_input_producer")
 def slice_input_producer(tensor_list, num_epochs=None, shuffle=True, seed=None,
                          capacity=32, shared_name=None, name=None):
   """Produces a slice of each `Tensor` in `tensor_list`.
@@ -885,6 +894,7 @@ def _shuffle_batch_join(tensors_list, batch_size, capacity,
 # Batching functions ----------------------------------------------------------
 
 
+@tf_export("train.batch")
 def batch(tensors, batch_size, num_threads=1, capacity=32,
           enqueue_many=False, shapes=None, dynamic_pad=False,
           allow_smaller_final_batch=False, shared_name=None, name=None):
@@ -979,6 +989,7 @@ def batch(tensors, batch_size, num_threads=1, capacity=32,
       name=name)
 
 
+@tf_export("train.maybe_batch")
 def maybe_batch(tensors, keep_input, batch_size, num_threads=1, capacity=32,
                 enqueue_many=False, shapes=None, dynamic_pad=False,
                 allow_smaller_final_batch=False, shared_name=None, name=None):
@@ -1031,6 +1042,7 @@ def maybe_batch(tensors, keep_input, batch_size, num_threads=1, capacity=32,
       name=name)
 
 
+@tf_export("train.batch_join")
 def batch_join(tensors_list, batch_size, capacity=32, enqueue_many=False,
                shapes=None, dynamic_pad=False, allow_smaller_final_batch=False,
                shared_name=None, name=None):
@@ -1136,6 +1148,7 @@ def batch_join(tensors_list, batch_size, capacity=32, enqueue_many=False,
       name=name)
 
 
+@tf_export("train.maybe_batch_join")
 def maybe_batch_join(tensors_list, keep_input, batch_size, capacity=32,
                      enqueue_many=False, shapes=None, dynamic_pad=False,
                      allow_smaller_final_batch=False, shared_name=None,
@@ -1188,6 +1201,7 @@ def maybe_batch_join(tensors_list, keep_input, batch_size, capacity=32,
       name=name)
 
 
+@tf_export("train.shuffle_batch")
 def shuffle_batch(tensors, batch_size, capacity, min_after_dequeue,
                   num_threads=1, seed=None, enqueue_many=False, shapes=None,
                   allow_smaller_final_batch=False, shared_name=None, name=None):
@@ -1287,6 +1301,7 @@ def shuffle_batch(tensors, batch_size, capacity, min_after_dequeue,
       name=name)
 
 
+@tf_export("train.maybe_shuffle_batch")
 def maybe_shuffle_batch(tensors, batch_size, capacity, min_after_dequeue,
                         keep_input, num_threads=1, seed=None,
                         enqueue_many=False, shapes=None,
@@ -1346,6 +1361,7 @@ def maybe_shuffle_batch(tensors, batch_size, capacity, min_after_dequeue,
       name=name)
 
 
+@tf_export("train.shuffle_batch_join")
 def shuffle_batch_join(tensors_list, batch_size, capacity,
                        min_after_dequeue, seed=None, enqueue_many=False,
                        shapes=None, allow_smaller_final_batch=False,
@@ -1439,6 +1455,7 @@ def shuffle_batch_join(tensors_list, batch_size, capacity,
       name=name)
 
 
+@tf_export("train.maybe_shuffle_batch_join")
 def maybe_shuffle_batch_join(tensors_list, batch_size, capacity,
                              min_after_dequeue, keep_input, seed=None,
                              enqueue_many=False, shapes=None,
diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py
index 802b930b0e391685b07802cbf6973b763e52d147..10ab4c1137ff226d88902143d4f2281ad77de531 100644
--- a/tensorflow/python/training/learning_rate_decay.py
+++ b/tensorflow/python/training/learning_rate_decay.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Various learning rate decay functions."""
 from __future__ import absolute_import
 from __future__ import division
@@ -26,10 +25,16 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
-def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
-                      staircase=False, name=None):
+@tf_export("train.exponential_decay")
+def exponential_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False,
+                      name=None):
   """Applies exponential decay to the learning rate.
 
   When training a model, it is often recommended to lower the learning rate as
@@ -85,9 +90,9 @@ def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
   """
   if global_step is None:
     raise ValueError("global_step is required for exponential_decay.")
-  with ops.name_scope(name, "ExponentialDecay",
-                      [learning_rate, global_step,
-                       decay_steps, decay_rate]) as name:
+  with ops.name_scope(
+      name, "ExponentialDecay",
+      [learning_rate, global_step, decay_steps, decay_rate]) as name:
     learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
     dtype = learning_rate.dtype
     global_step = math_ops.cast(global_step, dtype)
@@ -96,15 +101,16 @@ def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
     p = global_step / decay_steps
     if staircase:
       p = math_ops.floor(p)
-    return math_ops.multiply(learning_rate, math_ops.pow(decay_rate, p),
-                             name=name)
+    return math_ops.multiply(
+        learning_rate, math_ops.pow(decay_rate, p), name=name)
 
 
+@tf_export("train.piecewise_constant")
 def piecewise_constant(x, boundaries, values, name=None):
   """Piecewise constant from boundaries and interval values.
 
-  Example: use a learning rate that's 1.0 for the first 100000 steps, 0.5
-    for steps 100001 to 110000, and 0.1 for any additional steps.
+  Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5
+    for the next 10000 steps, and 0.1 for any additional steps.
 
   ```python
   global_step = tf.Variable(0, trainable=False)
@@ -120,7 +126,7 @@ def piecewise_constant(x, boundaries, values, name=None):
       `float64`, `uint8`, `int8`, `int16`, `int32`, `int64`.
     boundaries: A list of `Tensor`s or `int`s or `float`s with strictly
       increasing entries, and with all elements having the same type as `x`.
-    values: A list of `Tensor`s or float`s or `int`s that specifies the values
+    values: A list of `Tensor`s or `float`s or `int`s that specifies the values
       for the intervals defined by `boundaries`. It should have one more element
       than `boundaries`, and all elements should have the same type.
     name: A string. Optional name of the operation. Defaults to
@@ -156,15 +162,15 @@ def piecewise_constant(x, boundaries, values, name=None):
           boundaries[i] = b
         else:
           raise ValueError(
-              "Boundaries (%s) must have the same dtype as x (%s)." % (
-                  b.dtype.base_dtype, x.dtype.base_dtype))
+              "Boundaries (%s) must have the same dtype as x (%s)." %
+              (b.dtype.base_dtype, x.dtype.base_dtype))
     # TODO(rdipietro): Ensure that boundaries' elements are strictly increasing.
     values = ops.convert_n_to_tensor(values)
     for v in values[1:]:
       if v.dtype.base_dtype != values[0].dtype.base_dtype:
         raise ValueError(
-            "Values must have elements all with the same dtype (%s vs %s)." % (
-                values[0].dtype.base_dtype, v.dtype.base_dtype))
+            "Values must have elements all with the same dtype (%s vs %s)." %
+            (values[0].dtype.base_dtype, v.dtype.base_dtype))
     pred_fn_pairs = []
     pred_fn_pairs.append((x <= boundaries[0], lambda: values[0]))
     pred_fn_pairs.append((x > boundaries[-1], lambda: values[-1]))
@@ -179,9 +185,14 @@ def piecewise_constant(x, boundaries, values, name=None):
     return control_flow_ops.case(pred_fn_pairs, default, exclusive=True)
 
 
-def polynomial_decay(learning_rate, global_step, decay_steps,
-                     end_learning_rate=0.0001, power=1.0,
-                     cycle=False, name=None):
+@tf_export("train.polynomial_decay")
+def polynomial_decay(learning_rate,
+                     global_step,
+                     decay_steps,
+                     end_learning_rate=0.0001,
+                     power=1.0,
+                     cycle=False,
+                     name=None):
   """Applies a polynomial decay to the learning rate.
 
   It is commonly observed that a monotonically decreasing learning rate, whose
@@ -255,9 +266,10 @@ def polynomial_decay(learning_rate, global_step, decay_steps,
   """
   if global_step is None:
     raise ValueError("global_step is required for polynomial_decay.")
-  with ops.name_scope(name, "PolynomialDecay",
-                      [learning_rate, global_step,
-                       decay_steps, end_learning_rate, power]) as name:
+  with ops.name_scope(
+      name, "PolynomialDecay",
+      [learning_rate, global_step, decay_steps, end_learning_rate, power
+      ]) as name:
     learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
     dtype = learning_rate.dtype
     global_step = math_ops.cast(global_step, dtype)
@@ -267,23 +279,29 @@ def polynomial_decay(learning_rate, global_step, decay_steps,
     if cycle:
       # Find the first multiple of decay_steps that is bigger than global_step.
       # If global_step is zero set the multiplier to 1
-      multiplier = control_flow_ops.cond(math_ops.equal(global_step, 0),
-                                         lambda: 1.0,
-                                         lambda: math_ops.ceil(
-                                             global_step / decay_steps))
+      multiplier = control_flow_ops.cond(
+          math_ops.equal(global_step, 0), lambda: 1.0,
+          lambda: math_ops.ceil(global_step / decay_steps))
       decay_steps = math_ops.multiply(decay_steps, multiplier)
     else:
       # Make sure that the global_step used is not bigger than decay_steps.
       global_step = math_ops.minimum(global_step, decay_steps)
 
     p = math_ops.div(global_step, decay_steps)
-    return math_ops.add(math_ops.multiply(learning_rate - end_learning_rate,
-                                          math_ops.pow(1 - p, power)),
-                        end_learning_rate, name=name)
-
-
-def natural_exp_decay(learning_rate, global_step, decay_steps, decay_rate,
-                      staircase=False, name=None):
+    return math_ops.add(
+        math_ops.multiply(learning_rate - end_learning_rate,
+                          math_ops.pow(1 - p, power)),
+        end_learning_rate,
+        name=name)
+
+
+@tf_export("train.natural_exp_decay")
+def natural_exp_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False,
+                      name=None):
   """Applies natural exponential decay to the initial learning rate.
 
   When training a model, it is often recommended to lower the learning rate as
@@ -349,8 +367,13 @@ def natural_exp_decay(learning_rate, global_step, decay_steps, decay_rate,
     return math_ops.multiply(learning_rate, exponent, name=name)
 
 
-def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
-                       staircase=False, name=None):
+@tf_export("train.inverse_time_decay")
+def inverse_time_decay(learning_rate,
+                       global_step,
+                       decay_steps,
+                       decay_rate,
+                       staircase=False,
+                       name=None):
   """Applies inverse time decay to the initial learning rate.
 
   When training a model, it is often recommended to lower the learning rate as
@@ -362,7 +385,15 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
   The function returns the decayed learning rate.  It is computed as:
 
   ```python
-  decayed_learning_rate = learning_rate / (1 + decay_rate * t)
+  decayed_learning_rate = learning_rate / (1 + decay_rate * global_step /
+  decay_step)
+  ```
+
+  or, if `staircase` is `True`, as:
+
+  ```python
+  decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step /
+  decay_step))
   ```
 
   Example: decay 1/t with a rate of 0.5:
@@ -371,8 +402,10 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
   ...
   global_step = tf.Variable(0, trainable=False)
   learning_rate = 0.1
-  k = 0.5
-  learning_rate = tf.train.inverse_time_decay(learning_rate, global_step, k)
+  decay_steps = 1.0
+  decay_rate = 0.5
+  learning_rate = tf.train.inverse_time_decay(learning_rate, global_step,
+  decay_steps, decay_rate)
 
   # Passing global_step to minimize() will increment it at each step.
   learning_step = (
@@ -417,11 +450,12 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
     return math_ops.div(learning_rate, denom, name=name)
 
 
-def cosine_decay(learning_rate, global_step, decay_steps, name=None):
+@tf_export("train.cosine_decay")
+def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None):
   """Applies cosine decay to the learning rate.
 
   See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
-  with Warm Restarts.
+  with Warm Restarts. https://arxiv.org/abs/1608.03983
 
   When training a model, it is often recommended to lower the learning rate as
   the training progresses.  This function applies a cosine decay function
@@ -432,7 +466,8 @@ def cosine_decay(learning_rate, global_step, decay_steps, name=None):
   The function returns the decayed learning rate.  It is computed as:
   ```python
   global_step = min(global_step, decay_steps)
-  decayed = 0.5 * (1 + cos(pi * global_step / decay_steps))
+  cosine_decay = 0.5 * (1 + cos(pi * global_step / decay_steps))
+  decayed = (1 - alpha) * cosine_decay + alpha
   decayed_learning_rate = learning_rate * decayed
   ```
 
@@ -449,6 +484,8 @@ def cosine_decay(learning_rate, global_step, decay_steps, name=None):
       Global step to use for the decay computation.
     decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
       Number of steps to decay over.
+    alpha: A scalar `float32` or `float64` Tensor or a Python number.
+      Minimum learning rate value as a fraction of learning_rate.
     name: String. Optional name of the operation.  Defaults to 'CosineDecay'.
   Returns:
     A scalar `Tensor` of the same type as `learning_rate`.  The decayed
@@ -469,17 +506,121 @@ def cosine_decay(learning_rate, global_step, decay_steps, name=None):
     cosine_decayed = 0.5 * (
         1.0 + math_ops.cos(constant_op.constant(math.pi) * completed_fraction))
 
-    return math_ops.multiply(learning_rate, cosine_decayed)
+    decayed = (1 - alpha) * cosine_decayed + alpha
+    return math_ops.multiply(learning_rate, decayed)
+
+
+@tf_export("train.cosine_decay_restarts")
+def cosine_decay_restarts(learning_rate,
+                          global_step,
+                          first_decay_steps,
+                          t_mul=2.0,
+                          m_mul=1.0,
+                          alpha=0.0,
+                          name=None):
+  """Applies cosine decay with restarts to the learning rate.
+
+  See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
+  with Warm Restarts. https://arxiv.org/abs/1608.03983
+
+  When training a model, it is often recommended to lower the learning rate as
+  the training progresses.  This function applies a cosine decay function with
+  restarts to a provided initial learning rate.  It requires a `global_step`
+  value to compute the decayed learning rate.  You can just pass a TensorFlow
+  variable that you increment at each training step.
+
+  The function returns the decayed learning rate while taking into account
+  possible warm restarts. The learning rate multiplier first decays
+  from 1 to `alpha` for `first_decay_steps` steps. Then, a warm
+  restart is performed. Each new warm restart runs for `t_mul` times more steps
+  and with `m_mul` times smaller initial learning rate.
+
+  Example usage:
+  ```python
+  first_decay_steps = 1000
+  lr_decayed = cosine_decay_restarts(learning_rate, global_step,
+                                     first_decay_steps)
+  ```
+
+  Args:
+    learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
+      The initial learning rate.
+    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
+      Global step to use for the decay computation.
+    first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
+      Number of steps to decay over.
+    t_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
+      Used to derive the number of iterations in the i-th period
+    m_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
+      Used to derive the initial learning rate of the i-th period:
+    alpha: A scalar `float32` or `float64` Tensor or a Python number.
+      Minimum learning rate value as a fraction of the learning_rate.
+    name: String. Optional name of the operation.  Defaults to 'SGDRDecay'.
+  Returns:
+    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
+    learning rate.
+  Raises:
+    ValueError: if `global_step` is not supplied.
+  """
+  if global_step is None:
+    raise ValueError("cosine decay restarts requires global_step")
+  with ops.name_scope(name, "SGDRDecay", [learning_rate, global_step]) as name:
+    learning_rate = ops.convert_to_tensor(
+        learning_rate, name="initial_learning_rate")
+    dtype = learning_rate.dtype
+    global_step = math_ops.cast(global_step, dtype)
+    first_decay_steps = math_ops.cast(first_decay_steps, dtype)
+    alpha = math_ops.cast(alpha, dtype)
+    t_mul = math_ops.cast(t_mul, dtype)
+    m_mul = math_ops.cast(m_mul, dtype)
+
+    completed_fraction = global_step / first_decay_steps
+
+    def compute_step(completed_fraction, geometric=False):
+      if geometric:
+        i_restart = math_ops.floor(
+            math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) /
+            math_ops.log(t_mul))
+
+        sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul)
+        completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart
+
+      else:
+        i_restart = math_ops.floor(completed_fraction)
+        completed_fraction = completed_fraction - i_restart
+
+      return i_restart, completed_fraction
+
+    i_restart, completed_fraction = control_flow_ops.cond(
+        math_ops.equal(t_mul, 1.0),
+        lambda: compute_step(completed_fraction, geometric=False),
+        lambda: compute_step(completed_fraction, geometric=True))
+
+    m_fac = m_mul**i_restart
+    cosine_decayed = 0.5 * m_fac * (
+        1.0 + math_ops.cos(constant_op.constant(math.pi) * completed_fraction))
+    decayed = (1 - alpha) * cosine_decayed + alpha
+
+  return math_ops.multiply(learning_rate, decayed, name=name)
 
 
-def linear_cosine_decay(learning_rate, global_step, decay_steps,
-                        num_periods=0.5, alpha=0.0, beta=0.001,
+@tf_export("train.linear_cosine_decay")
+def linear_cosine_decay(learning_rate,
+                        global_step,
+                        decay_steps,
+                        num_periods=0.5,
+                        alpha=0.0,
+                        beta=0.001,
                         name=None):
   """Applies linear cosine decay to the learning rate.
 
   See [Bello et al., ICML2017] Neural Optimizer Search with RL.
   https://arxiv.org/abs/1709.07417
 
+  For the idea of warm starts here controlled by `num_periods`,
+  see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
+  with Warm Restarts. https://arxiv.org/abs/1608.03983
+
   Note that linear cosine decay is more aggressive than cosine decay and
   larger initial learning rates can typically be used.
 
@@ -547,15 +688,25 @@ def linear_cosine_decay(learning_rate, global_step, decay_steps,
     return math_ops.multiply(learning_rate, linear_cosine_decayed, name=name)
 
 
-def noisy_linear_cosine_decay(learning_rate, global_step, decay_steps,
-                              initial_variance=1.0, variance_decay=0.55,
-                              num_periods=0.5, alpha=0.0, beta=0.001,
+@tf_export("train.noisy_linear_cosine_decay")
+def noisy_linear_cosine_decay(learning_rate,
+                              global_step,
+                              decay_steps,
+                              initial_variance=1.0,
+                              variance_decay=0.55,
+                              num_periods=0.5,
+                              alpha=0.0,
+                              beta=0.001,
                               name=None):
   """Applies noisy linear cosine decay to the learning rate.
 
   See [Bello et al., ICML2017] Neural Optimizer Search with RL.
   https://arxiv.org/abs/1709.07417
 
+  For the idea of warm starts here controlled by `num_periods`,
+  see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
+  with Warm Restarts. https://arxiv.org/abs/1608.03983
+
   Note that linear cosine decay is more aggressive than cosine decay and
   larger initial learning rates can typically be used.
 
@@ -626,8 +777,8 @@ def noisy_linear_cosine_decay(learning_rate, global_step, decay_steps,
         math_ops.pow(1.0 + global_step, variance_decay))
     std = math_ops.sqrt(variance)
     noisy_linear_decayed = (
-        linear_decayed + random_ops.random_normal(
-            linear_decayed.shape, stddev=std))
+        linear_decayed +
+        random_ops.random_normal(linear_decayed.shape, stddev=std))
 
     completed_fraction = global_step / decay_steps
     fraction = 2.0 * num_periods * completed_fraction
diff --git a/tensorflow/python/training/learning_rate_decay_test.py b/tensorflow/python/training/learning_rate_decay_test.py
index ff41d80940a4b2f5d4c27f8691094422cd0cb18f..1ce8c156a0b126f680bad62267f90e31a23febed 100644
--- a/tensorflow/python/training/learning_rate_decay_test.py
+++ b/tensorflow/python/training/learning_rate_decay_test.py
@@ -342,10 +342,11 @@ class InverseDecayTest(test_util.TensorFlowTestCase):
 
 class CosineDecayTest(test_util.TensorFlowTestCase):
 
-  def np_cosine_decay(self, step, decay_steps):
+  def np_cosine_decay(self, step, decay_steps, alpha=0.0):
     step = min(step, decay_steps)
     completed_fraction = step / decay_steps
-    return 0.5 * (1.0 + math.cos(math.pi * completed_fraction))
+    decay = 0.5 * (1.0 + math.cos(math.pi * completed_fraction))
+    return (1.0 - alpha) * decay + alpha
 
   def testDecay(self):
     num_training_steps = 1000
@@ -357,6 +358,77 @@ class CosineDecayTest(test_util.TensorFlowTestCase):
         expected = self.np_cosine_decay(step, num_training_steps)
         self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
 
+  def testAlpha(self):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    alpha = 0.1
+    for step in range(0, 1500, 250):
+      with self.test_session():
+        decayed_lr = learning_rate_decay.cosine_decay(
+            initial_lr, step, num_training_steps, alpha)
+        expected = self.np_cosine_decay(step, num_training_steps, alpha)
+        self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
+
+
+class CosineDecayRestartsTest(test_util.TensorFlowTestCase):
+  def np_cosine_decay_restarts(self, step, decay_steps, t_mul=2.0, m_mul=1.0,
+                               alpha=0.0):
+    fac = 1.0
+    while step >= decay_steps:
+      step = step - decay_steps
+      decay_steps *= t_mul
+      fac *= m_mul
+
+    completed_fraction = step / decay_steps
+    decay = fac * 0.5 * (1.0 + math.cos(math.pi * completed_fraction))
+    return (1.0 - alpha) * decay + alpha
+
+  def testDecay(self):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    for step in range(0, 1500, 250):
+      with self.test_session():
+        decayed_lr = learning_rate_decay.cosine_decay_restarts(
+            initial_lr, step, num_training_steps)
+        expected = self.np_cosine_decay_restarts(step, num_training_steps)
+        self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
+
+  def testAlpha(self):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    alpha = 0.1
+    for step in range(0, 1500, 250):
+      with self.test_session():
+        decayed_lr = learning_rate_decay.cosine_decay_restarts(
+            initial_lr, step, num_training_steps, alpha=alpha)
+        expected = self.np_cosine_decay_restarts(step, num_training_steps,
+                                                 alpha=alpha)
+        self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
+
+  def testMMul(self):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    m_mul = 0.9
+    for step in range(0, 1500, 250):
+      with self.test_session():
+        decayed_lr = learning_rate_decay.cosine_decay_restarts(
+            initial_lr, step, num_training_steps, m_mul=m_mul)
+        expected = self.np_cosine_decay_restarts(step, num_training_steps,
+                                                 m_mul=m_mul)
+        self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
+
+  def testTMul(self):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    t_mul = 1.0
+    for step in range(0, 1500, 250):
+      with self.test_session():
+        decayed_lr = learning_rate_decay.cosine_decay_restarts(
+            initial_lr, step, num_training_steps, t_mul=t_mul)
+        expected = self.np_cosine_decay_restarts(step, num_training_steps,
+                                                 t_mul=t_mul)
+        self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
+
 
 class LinearCosineDecayTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/training/momentum.py b/tensorflow/python/training/momentum.py
index cf9530d87c46783b517884610b644b076bef6807..bd9fa79d8feac68c149f787ee8501bdddb173d33 100644
--- a/tensorflow/python/training/momentum.py
+++ b/tensorflow/python/training/momentum.py
@@ -22,8 +22,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.MomentumOptimizer")
 class MomentumOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Momentum algorithm.
 
diff --git a/tensorflow/python/training/momentum_test.py b/tensorflow/python/training/momentum_test.py
index 7268b3abc93f911a29b11cb95b1f005db6f49167..cda421cef837fa6ab25898208a8dc94d70561048 100644
--- a/tensorflow/python/training/momentum_test.py
+++ b/tensorflow/python/training/momentum_test.py
@@ -234,23 +234,38 @@ class MomentumOptimizerTest(test.TestCase):
           self.assertAllClose(var0_np, var0.eval())
           self.assertAllClose(var1_np, var1.eval())
 
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
-        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+      var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+
+      # pylint: disable=cell-var-from-loop
+      def loss():
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
         pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
-        loss = pred * pred
-        sgd_op = momentum_lib.MomentumOptimizer(
-            learning_rate=1.0, momentum=0.0).minimize(loss)
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
-        # Run 1 step of sgd
-        sgd_op.run()
-        # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[-111, -138]], var0.eval())
+        return pred * pred
+      # pylint: enable=cell-var-from-loop
+
+      opt = momentum_lib.MomentumOptimizer(learning_rate=1.0, momentum=0.0)
+      sgd_op = opt.minimize(loss)
+      self.evaluate(variables.global_variables_initializer())
+      # Run 1 step of sgd
+      self.evaluate(sgd_op)
+      # Validate updated params
+      self.assertAllCloseAccordingToType([[-111, -138]], self.evaluate(var0))
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testMinimizeWith2DIndiciesForEmbeddingLookup(self):
+    var0 = resource_variable_ops.ResourceVariable(array_ops.ones([2, 2]))
+
+    def loss():
+      return math_ops.reduce_sum(embedding_ops.embedding_lookup(var0, [[1]]))
+
+    opt = momentum_lib.MomentumOptimizer(learning_rate=1.0, momentum=0.0)
+    sgd_op = opt.minimize(loss)
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(sgd_op)
+    self.assertAllCloseAccordingToType([[1, 1], [0, 0]], self.evaluate(var0))
 
   def testTensorLearningRateAndMomentum(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index e931555470354d1f5c76ad7d46cff1308b015116..6c5c9e01a76d539b550420134b09090b89beed46 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -41,6 +41,7 @@ from tensorflow.python.training import queue_runner
 from tensorflow.python.training import saver as training_saver
 from tensorflow.python.training import session_manager as sm
 from tensorflow.python.training import session_run_hook
+from tensorflow.python.util.tf_export import tf_export
 
 
 # The list of exceptions that we should recover from. Exceptions not in this
@@ -52,7 +53,7 @@ _PREEMPTION_ERRORS = (errors.AbortedError, errors.UnavailableError)
 USE_DEFAULT = object()
 
 
-# TODO(touts): Share that with the Supervisor.
+@tf_export('train.Scaffold')
 class Scaffold(object):
   """Structure to create or gather pieces commonly needed to train a model.
 
@@ -213,6 +214,7 @@ class Scaffold(object):
     self._saver.build()
 
     ops.get_default_graph().finalize()
+    logging.info('Graph was finalized.')
     return self
 
   @property
@@ -266,10 +268,13 @@ class Scaffold(object):
 
   @staticmethod
   def _default_local_init_op():
-    return control_flow_ops.group(variables.local_variables_initializer(),
-                                  lookup_ops.tables_initializer())
+    return control_flow_ops.group(
+        variables.local_variables_initializer(),
+        lookup_ops.tables_initializer(),
+        resources.initialize_resources(resources.local_resources()))
 
 
+@tf_export('train.MonitoredTrainingSession')
 def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
                              is_chief=True,
                              checkpoint_dir=None,
@@ -379,6 +384,7 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
                           stop_grace_period_secs=stop_grace_period_secs)
 
 
+@tf_export('train.SessionCreator')
 class SessionCreator(object):
   """A factory for tf.Session."""
 
@@ -388,6 +394,7 @@ class SessionCreator(object):
         'create_session is not implemented for {}.'.format(self))
 
 
+@tf_export('train.ChiefSessionCreator')
 class ChiefSessionCreator(SessionCreator):
   """Creates a tf.Session for a chief."""
 
@@ -439,6 +446,7 @@ class ChiefSessionCreator(SessionCreator):
         init_fn=self._scaffold.init_fn)
 
 
+@tf_export('train.WorkerSessionCreator')
 class WorkerSessionCreator(SessionCreator):
   """Creates a tf.Session for a worker."""
 
@@ -704,6 +712,7 @@ class _MonitoredSession(object):
     return self._coordinated_creator.tf_sess
 
 
+@tf_export('train.MonitoredSession')
 class MonitoredSession(_MonitoredSession):
   """Session-like object that handles initialization, recovery and hooks.
 
@@ -786,6 +795,7 @@ class MonitoredSession(_MonitoredSession):
         stop_grace_period_secs=stop_grace_period_secs)
 
 
+@tf_export('train.SingularMonitoredSession')
 class SingularMonitoredSession(_MonitoredSession):
   """Session-like object that handles initialization, restoring, and hooks.
 
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index e34c759e894c86a103f0228163f7bae2ffc7fb61..b9ecb27df19d051c28ec1c3fe3cd9fd86717a5ed 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -26,6 +26,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import slot_creator
+from tensorflow.python.util.tf_export import tf_export
 
 
 # TODO(touts): switch to variables.Variable.
@@ -187,7 +188,7 @@ def _zero_debias(unbiased_var, value, decay):
   with variable_scope.variable_scope(
       unbiased_var.op.name, values=[unbiased_var, value, decay]) as scope:
     with ops.colocate_with(unbiased_var):
-      with ops.control_dependencies(None):
+      with ops.init_scope():
         biased_initializer = init_ops.zeros_initializer(
             dtype=unbiased_var.dtype)(unbiased_var.get_shape())
         local_step_initializer = init_ops.zeros_initializer()
@@ -230,6 +231,7 @@ def _zero_debias(unbiased_var, value, decay):
       return unbiased_ema_delta
 
 
+@tf_export("train.ExponentialMovingAverage")
 class ExponentialMovingAverage(object):
   """Maintains moving averages of variables by employing an exponential decay.
 
@@ -385,7 +387,7 @@ class ExponentialMovingAverage(object):
       # For variables: to lower communication bandwidth across devices we keep
       # the moving averages on the same device as the variables. For other
       # tensors, we rely on the existing device allocation mechanism.
-      with ops.control_dependencies(None):
+      with ops.init_scope():
         if isinstance(var, variables.Variable):
           avg = slot_creator.create_slot(var,
                                          var.initialized_value(),
@@ -398,7 +400,9 @@ class ExponentialMovingAverage(object):
           avg = slot_creator.create_zeros_slot(
               var,
               self._name,
-              colocate_with_primary=(var.op.type in ["Variable", "VariableV2"]))
+              colocate_with_primary=(var.op.type in ["Variable",
+                                                     "VariableV2",
+                                                     "VarHandleOp"]))
           if self._zero_debias:
             zero_debias_true.add(avg)
       self._averages[var] = avg
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index b31d02eb8d7afe2dd675192fc99fb7c24b515c00..f05c40b32dcbedddb350ee8a61ad4616c666b86a 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -32,9 +32,11 @@ from tensorflow.python.ops import gradients
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import slot_creator
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _get_variable_for(v):
@@ -173,19 +175,43 @@ class _StreamingModelPortProcessor(_OptimizableVariable):
     return g
 
 
+class _TensorProcessor(_OptimizableVariable):
+  """Processor for ordinary Tensors.
+
+  Even though a Tensor can't really be updated, sometimes it is useful to
+  compute the gradients with respect to a Tensor using the optimizer. Updating
+  the Tensor is, of course, unsupported.
+  """
+
+  def __init__(self, v):
+    self._v = v
+
+  def target(self):
+    return self._v
+
+  def update_op(self, optimizer, g):
+    raise NotImplementedError("Trying to update a Tensor ", self._v)
+
+
 def _get_processor(v):
   """The processor of v."""
   if context.in_eager_mode():
-    return _DenseResourceVariableProcessor(v)
+    if isinstance(v, ops.Tensor):
+      return _TensorProcessor(v)
+    else:
+      return _DenseResourceVariableProcessor(v)
   if v.op.type == "VarHandleOp":
     return _DenseResourceVariableProcessor(v)
   if isinstance(v, variables.Variable):
     return _RefVariableProcessor(v)
   if v.op.type == "SubmodelPort":
     return _StreamingModelPortProcessor(v)
+  if isinstance(v, ops.Tensor):
+    return _TensorProcessor(v)
   raise NotImplementedError("Trying to optimize unsupported type ", v)
 
 
+@tf_export("train.Optimizer")
 class Optimizer(object):
   """Base class for optimizers.
 
@@ -299,6 +325,7 @@ class Optimizer(object):
     # Dictionary of slots.
     #  {slot_name : { variable_to_train: slot_for_the_variable, ...}, ... }
     self._slots = {}
+    self._non_slot_dict = {}
 
   def get_name(self):
     return self._name
@@ -378,7 +405,9 @@ class Optimizer(object):
     given variable.
 
     Args:
-      loss: A Tensor containing the value to minimize.
+      loss: A Tensor containing the value to minimize or a callable taking
+        no arguments which returns the value to minimize. When eager execution
+        is enabled it must be a callable.
       var_list: Optional list or tuple of `tf.Variable` to update to minimize
         `loss`.  Defaults to the list of variables collected in the graph
         under the key `GraphKeys.TRAINABLE_VARIABLES`.
@@ -397,37 +426,27 @@ class Optimizer(object):
     Raises:
       TypeError: If `var_list` contains anything else than `Variable` objects.
       ValueError: If some arguments are invalid.
-      RuntimeError: If called with eager execution enabled and if `grad_loss`
-        is not `None` or `loss` is not callable.
+      RuntimeError: If called with eager execution enabled and `loss` is
+        not callable.
 
     @compatibility(eager)
-    When eager execution is enabled, `loss` should be a Python function that
-    takes elements of `var_list` as arguments and computes the value to be
-    minimized. If `var_list` is None, `loss` should take no arguments.
-    Gradient computation is done with respect to the elements of `var_list` if
-    not None, else with respect to any trainable variables created during the
-    execution of the `loss` function.
-    `gate_gradients`, `aggregation_method`, `colocate_gradients_with_ops` and
-    `grad_loss` are ignored when eager execution is enabled.
+    When eager execution is enabled, `gate_gradients`, `aggregation_method`,
+    and `colocate_gradients_with_ops` are ignored.
     @end_compatibility
     """
-    if context.in_eager_mode():
-      if grad_loss is not None:
-        raise RuntimeError(
-            "`grad_loss` argument to Optimizer.compute_gradients "
-            "not supported when eager execution is enabled.")
-      if not callable(loss):
-        raise RuntimeError(
-            "`loss` passed to Optimizer.compute_gradients should "
-            "be a function when eager execution is enabled.")
-      # TODO(agarwal): consider passing parameters to the `loss` function.
+    if callable(loss):
+      with backprop.GradientTape() as tape:
+        if var_list is not None:
+          tape.watch(var_list)
+        loss_value = loss()
       if var_list is None:
-        return backprop.implicit_grad(loss)()
-      else:
-        var_list = nest.flatten(var_list)
-        grads = backprop.gradients_function(loss)(*var_list)
-        grads_and_vars = list(zip(grads, var_list))
-        return grads_and_vars
+        var_list = tape.watched_variables()
+      grads = tape.gradient(loss_value, var_list, grad_loss)
+      return list(zip(grads, var_list))
+    if context.in_eager_mode():
+      raise RuntimeError(
+          "`loss` passed to Optimizer.compute_gradients should "
+          "be a function when eager execution is enabled.")
     if gate_gradients not in [Optimizer.GATE_NONE, Optimizer.GATE_OP,
                               Optimizer.GATE_GRAPH]:
       raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, "
@@ -512,7 +531,7 @@ class Optimizer(object):
     if not var_list:
       raise ValueError("No gradients provided for any variable: %s." %
                        ([str(v) for _, _, v in converted_grads_and_vars],))
-    with ops.control_dependencies(None):
+    with ops.init_scope():
       self._create_slots([_get_variable_for(v) for v in var_list])
     update_ops = []
     with ops.name_scope(name, self._name) as name:
@@ -531,7 +550,15 @@ class Optimizer(object):
       else:
         with ops.control_dependencies([self._finish(update_ops, "update")]):
           with ops.colocate_with(global_step):
-            apply_updates = state_ops.assign_add(global_step, 1, name=name)
+            if isinstance(global_step, resource_variable_ops.ResourceVariable):
+              # TODO(apassos): the implicit read in assign_add is slow; consider
+              # making it less so.
+              apply_updates = resource_variable_ops.assign_add_variable_op(
+                  global_step.handle,
+                  ops.convert_to_tensor(1, dtype=global_step.dtype),
+                  name=name)
+            else:
+              apply_updates = state_ops.assign_add(global_step, 1, name=name)
 
       if context.in_graph_mode():
         if isinstance(apply_updates, ops.Tensor):
@@ -590,7 +617,7 @@ class Optimizer(object):
       if executing_eagerly:
         # No variable.op in eager mode. We don't expect lots of eager graphs,
         # but behavior should be consistent with graph mode.
-        return variable._container_prefix == current_graph._container_prefix  # pylint: disable=protected-access
+        return variable._graph_key == current_graph._graph_key  # pylint: disable=protected-access
       else:
         return variable.op.graph is current_graph
 
@@ -603,17 +630,32 @@ class Optimizer(object):
     # Sort variables by name so that the return is deterministic.
     return sorted(optimizer_variables, key=lambda v: v.name)
 
+  def _create_non_slot_variable(self, initial_value, name, colocate_with):
+    """Add an extra variable, not associated with a slot."""
+    if context.in_graph_mode():
+      graph = colocate_with.graph
+    else:
+      graph = None
+
+    key = (name, graph)
+    v = self._non_slot_dict.get(key, None)
+    if v is None:
+      with ops.colocate_with(colocate_with):
+        v = variable_scope.variable(initial_value, name=name, trainable=False)
+      self._non_slot_dict[key] = v
+
+    return v
+
+  def _get_non_slot_variable(self, name, graph=None):
+    return self._non_slot_dict.get((name, graph), None)
+
   def _non_slot_variables(self):
     """Additional variables created by the `Optimizer`.
 
-    This method should be overridden by child classes which create extra
-    variables, so that `variables()` includes the `Optimizer`'s non-slot
-    variables.
-
     Returns:
       A list or tuple of variables.
     """
-    return []
+    return self._non_slot_dict.values()
 
   def _assert_valid_dtypes(self, tensors):
     """Asserts tensors are all valid types (see `_valid_dtypes`).
@@ -644,7 +686,8 @@ class Optimizer(object):
     Returns:
       Valid types for loss, variables and gradients.
     """
-    return set([dtypes.float16, dtypes.float32, dtypes.float64])
+    return set(
+        [dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64])
 
   def _create_slots(self, var_list):
     """Create all slots needed by the variables.
diff --git a/tensorflow/python/training/optimizer_test.py b/tensorflow/python/training/optimizer_test.py
index 6bdae39073d48e0bd8b757a2d5145480e92d185f..0cab6410e83ca1880a0a4a80d2cfa5c17517af95 100644
--- a/tensorflow/python/training/optimizer_test.py
+++ b/tensorflow/python/training/optimizer_test.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -44,11 +43,10 @@ class OptimizerTest(test.TestCase):
                                                     name='a_%d' % i)
       var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
                                                     name='b_%d' % i)
-      def loss(v0, v1):
-        return 5 * v0 + 3 * v1
+      def loss():
+        return 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
       # Note that for eager execution, minimize expects a function instead of a
       # Tensor.
-      cost = loss if context.in_eager_mode() else loss(var0, var1)
       global_step = resource_variable_ops.ResourceVariable(
           array_ops.zeros([], dtypes.int64), name='global_step_%d' % i)
       sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
@@ -58,7 +56,7 @@ class OptimizerTest(test.TestCase):
       self.assertAllClose([1.0, 2.0], self.evaluate(var0))
       self.assertAllClose([3.0, 4.0], self.evaluate(var1))
       # Run 1 step of sgd through optimizer
-      opt_op = sgd_op.minimize(cost, global_step, [var0, var1])
+      opt_op = sgd_op.minimize(loss, global_step, [var0, var1])
       self.evaluate(opt_op)
       # Validate updated params
       self.assertAllClose([-14., -13.], self.evaluate(var0))
@@ -125,10 +123,9 @@ class OptimizerTest(test.TestCase):
             [3.0, 4.0], dtype=dtype, trainable=False, name='b')
         return 5 * var0 + var1
       # pylint: enable=cell-var-from-loop
-      cost = loss if context.in_eager_mode() else loss()
       sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
       with self.assertRaisesRegexp(ValueError, 'No.*variables'):
-        sgd_op.minimize(cost)
+        sgd_op.minimize(loss)
 
   @test_util.run_in_graph_and_eager_modes()
   def testNoGradients(self):
@@ -140,14 +137,13 @@ class OptimizerTest(test.TestCase):
       var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
                                                     name='b%d' % i)
       # pylint: disable=cell-var-from-loop
-      def loss(_):
+      def loss():
         return 5 * var0
       # pylint: enable=cell-var-from-loop
-      cost = loss if context.in_eager_mode() else loss(var1)
       sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
       with self.assertRaisesRegexp(ValueError, 'No gradients'):
         # var1 has no gradient
-        sgd_op.minimize(cost, var_list=[var1])
+        sgd_op.minimize(loss, var_list=[var1])
 
   @test_util.run_in_graph_and_eager_modes()
   def testNoGradientsForAnyVariables_Minimize(self):
@@ -158,13 +154,12 @@ class OptimizerTest(test.TestCase):
                                                     name='a_%d' % i)
       var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
                                                     name='b_%d' % i)
-      def loss(unused_v1, unused_v2):
+      def loss():
         return constant_op.constant(5.0)
-      cost = loss if context.in_eager_mode() else loss(var0, var1)
       sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
       with self.assertRaisesRegexp(ValueError,
                                    'No gradients provided for any variable'):
-        sgd_op.minimize(cost, var_list=[var0, var1])
+        sgd_op.minimize(loss, var_list=[var0, var1])
 
   @test_util.run_in_graph_and_eager_modes()
   def testNoGradientsForAnyVariables_ApplyGradients(self):
@@ -189,11 +184,10 @@ class OptimizerTest(test.TestCase):
                                                     name='a%d' % i)
       var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
                                                     name='b%d' % i)
-      def loss(v0, v1):
-        return 5 * v0 + 3 * v1
-      cost = loss if context.in_eager_mode() else loss(var0, var1)
+      def loss():
+        return 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
       sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
-      grads_and_vars = sgd_op.compute_gradients(cost, [var0, var1])
+      grads_and_vars = sgd_op.compute_gradients(loss, [var0, var1])
       # Convert gradients to tf.Variables
       converted_grads = [
           resource_variable_ops.ResourceVariable(array_ops.zeros([2], dtype),
@@ -221,6 +215,21 @@ class OptimizerTest(test.TestCase):
       self.assertAllClose([-14., -13.], self.evaluate(var0))
       self.assertAllClose([-6., -5.], self.evaluate(var1))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testComputeGradientsWithTensors(self):
+    x = ops.convert_to_tensor(1.0)
+    def f():
+      return x * x
+    sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+    grads_and_vars = sgd_op.compute_gradients(f, [x])
+    self.assertEqual(1, len(grads_and_vars))
+    grad, x_as_var = grads_and_vars[0]
+    self.assertIs(x, x_as_var)
+    self.assertEqual(2.0, self.evaluate(grad))
+
+    with self.assertRaises(NotImplementedError):
+      sgd_op.apply_gradients(grads_and_vars)
+
   def testTrainOp(self):
     with self.test_session():
       var0 = variables.Variable([1.0, 2.0])
diff --git a/tensorflow/python/training/proximal_adagrad.py b/tensorflow/python/training/proximal_adagrad.py
index da31ab325d5e45e1943f554c45717cceb4dc638f..9bd677b8efcd447f74ec2a3cbe94d63eeb9a4dd1 100644
--- a/tensorflow/python/training/proximal_adagrad.py
+++ b/tensorflow/python/training/proximal_adagrad.py
@@ -23,8 +23,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.ProximalAdagradOptimizer")
 class ProximalAdagradOptimizer(optimizer.Optimizer):
   # pylint: disable=line-too-long
   """Optimizer that implements the Proximal Adagrad algorithm.
diff --git a/tensorflow/python/training/proximal_gradient_descent.py b/tensorflow/python/training/proximal_gradient_descent.py
index 53e9dc2ef2c86a20070fdbdc690b39d2c0e9df06..369b6cbb50e5c621737c095a24eeb473f3870534 100644
--- a/tensorflow/python/training/proximal_gradient_descent.py
+++ b/tensorflow/python/training/proximal_gradient_descent.py
@@ -24,8 +24,10 @@ from tensorflow.python.ops import math_ops
 # pylint: enable=unused-import
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.ProximalGradientDescentOptimizer")
 class ProximalGradientDescentOptimizer(optimizer.Optimizer):
   # pylint: disable=line-too-long
   """Optimizer that implements the proximal gradient descent algorithm.
diff --git a/tensorflow/python/training/quantize_training.i b/tensorflow/python/training/quantize_training.i
index 40c60769731d3f7255647a07141d86b1c2594b01..17ffcd6e0758c9c1bc8bab864b6b7a2a18bc9cbf 100644
--- a/tensorflow/python/training/quantize_training.i
+++ b/tensorflow/python/training/quantize_training.i
@@ -65,6 +65,9 @@ def do_quantize_training_on_graphdef(input_graph, num_bits):
 
   graph.ParseFromString(result_graph_string)
   return graph
+
+do_quantize_training_on_graphdef._tf_api_names = [
+    'train.do_quantize_training_on_graphdef']
 %}
 
 %unignoreall
diff --git a/tensorflow/python/training/queue_runner_impl.py b/tensorflow/python/training/queue_runner_impl.py
index 4e7c81d7b2913d71a23dcaa3751db2aaffdc67cf..07afba79abf4d636c9ec2d53bcf2641594a35733 100644
--- a/tensorflow/python/training/queue_runner_impl.py
+++ b/tensorflow/python/training/queue_runner_impl.py
@@ -27,8 +27,10 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.queue_runner.QueueRunner", "train.QueueRunner")
 class QueueRunner(object):
   """Holds a list of enqueue operations for a queue, each to be run in a thread.
 
@@ -384,6 +386,7 @@ class QueueRunner(object):
                        import_scope=import_scope)
 
 
+@tf_export("train.queue_runner.add_queue_runner", "train.add_queue_runner")
 def add_queue_runner(qr, collection=ops.GraphKeys.QUEUE_RUNNERS):
   """Adds a `QueueRunner` to a collection in the graph.
 
@@ -402,6 +405,8 @@ def add_queue_runner(qr, collection=ops.GraphKeys.QUEUE_RUNNERS):
   ops.add_to_collection(collection, qr)
 
 
+@tf_export("train.queue_runner.start_queue_runners",
+           "train.start_queue_runners")
 def start_queue_runners(sess=None, coord=None, daemon=True, start=True,
                         collection=ops.GraphKeys.QUEUE_RUNNERS):
   """Starts all queue runners collected in the graph.
diff --git a/tensorflow/python/training/rmsprop.py b/tensorflow/python/training/rmsprop.py
index ebec725b7b98e9a078f5558af85355988e8aca67..341b970c92e42b4fe392d91f57219d713d2513e5 100644
--- a/tensorflow/python/training/rmsprop.py
+++ b/tensorflow/python/training/rmsprop.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """One-line documentation for rmsprop module.
 
 rmsprop algorithm [tieleman2012rmsprop]
@@ -43,16 +42,20 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.RMSPropOptimizer")
 class RMSPropOptimizer(optimizer.Optimizer):
   """Optimizer that implements the RMSProp algorithm.
 
-  See the [paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
+  See the
+  [paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
   """
 
   def __init__(self,
@@ -105,21 +108,24 @@ class RMSPropOptimizer(optimizer.Optimizer):
 
   def _create_slots(self, var_list):
     for v in var_list:
-      init_rms = init_ops.ones_initializer(dtype=v.dtype)
+      if v.get_shape().is_fully_defined():
+        init_rms = init_ops.ones_initializer(dtype=v.dtype.base_dtype)
+      else:
+        init_rms = array_ops.ones_like(v)
       self._get_or_make_slot_with_initializer(v, init_rms, v.get_shape(),
-                                              v.dtype, "rms", self._name)
+                                              v.dtype.base_dtype, "rms",
+                                              self._name)
       if self._centered:
         self._zeros_slot(v, "mg", self._name)
       self._zeros_slot(v, "momentum", self._name)
 
   def _prepare(self):
-    self._learning_rate_tensor = ops.convert_to_tensor(self._learning_rate,
-                                                       name="learning_rate")
+    self._learning_rate_tensor = ops.convert_to_tensor(
+        self._learning_rate, name="learning_rate")
     self._decay_tensor = ops.convert_to_tensor(self._decay, name="decay")
-    self._momentum_tensor = ops.convert_to_tensor(self._momentum,
-                                                  name="momentum")
-    self._epsilon_tensor = ops.convert_to_tensor(self._epsilon,
-                                                 name="epsilon")
+    self._momentum_tensor = ops.convert_to_tensor(
+        self._momentum, name="momentum")
+    self._epsilon_tensor = ops.convert_to_tensor(self._epsilon, name="epsilon")
 
   def _apply_dense(self, grad, var):
     rms = self.get_slot(var, "rms")
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index bd47736d4bc1b9b05d26bc45512f693a1bc0f937..0c1c8e664b682f78c69a5244db0773df80b35be7 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -53,6 +53,7 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import training_util
 from tensorflow.python.training.checkpoint_state_pb2 import CheckpointState
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
 # Op names which identify variable reads which should be saved.
@@ -241,6 +242,34 @@ class BaseSaverBuilder(object):
     else:
       raise RuntimeError("Unexpected write_version: " + self._write_version)
 
+  def bulk_restore(self, filename_tensor, saveables, preferred_shard,
+                   restore_sequentially):
+    """Restore all tensors contained in saveables.
+
+    By default, this issues separate calls to `restore_op` for each saveable.
+    Subclasses may override to load multiple saveables in a single call.
+
+    Args:
+      filename_tensor: String Tensor.
+      saveables: List of BaseSaverBuilder.SaveableObject objects.
+      preferred_shard: Int.  Shard to open first when loading a sharded file.
+      restore_sequentially: Bool.  If true, each restore is sequential.
+
+    Returns:
+      A list of Tensors resulting from reading 'saveable' from
+        'filename'.
+
+    """
+    all_tensors = []
+    assign_ops = []
+    for saveable in saveables:
+      restore_control_inputs = assign_ops[-1:] if restore_sequentially else []
+      with ops.device(_set_cpu0(saveable.device) if saveable.device else None):
+        with ops.control_dependencies(restore_control_inputs):
+          all_tensors.extend(
+              self.restore_op(filename_tensor, saveable, preferred_shard))
+    return all_tensors
+
   # pylint: disable=unused-argument
   def restore_op(self, filename_tensor, saveable, preferred_shard):
     """Create ops to restore 'saveable'.
@@ -349,7 +378,7 @@ class BaseSaverBuilder(object):
     last_device = None
     for shard, (device, saveables) in enumerate(per_device):
       last_device = device
-      with ops.device(device):
+      with ops.device(_set_cpu0(device)):
         sharded_filename = self.sharded_filename(tmp_checkpoint_prefix, shard,
                                                  num_shards_tensor)
         sharded_prefixes.append(sharded_filename)
@@ -357,7 +386,7 @@ class BaseSaverBuilder(object):
 
     with ops.control_dependencies([x.op for x in sharded_saves]):
       # Co-locates the merge step with the last device.
-      with ops.device(last_device):
+      with ops.device(_set_cpu0(last_device)):
         # V2 format write path consists of a metadata merge step.  Once merged,
         # attempts to delete the temporary directory, "<user-fed prefix>_temp".
         merge_step = gen_io_ops.merge_v2_checkpoints(
@@ -416,30 +445,32 @@ class BaseSaverBuilder(object):
     Returns:
       An Operation that restores the variables.
     """
+    all_tensors = self.bulk_restore(filename_tensor, saveables, preferred_shard,
+                                    restore_sequentially)
+
     assign_ops = []
+    idx = 0
+    # Load and optionally reshape on the CPU, as string tensors are not
+    # available on the GPU.
+    # TODO(touts): Re-enable restore on GPU when we can support annotating
+    # string tensors as "HostMemory" inputs.
     for saveable in saveables:
-      restore_control_inputs = assign_ops[-1:] if restore_sequentially else []
-      # Load and optionally reshape on the CPU, as string tensors are not
-      # available on the GPU.
-      # TODO(touts): Re-enable restore on GPU when we can support annotating
-      # string tensors as "HostMemory" inputs.
-      with ops.device(_set_cpu0(saveable.device) if saveable.device else None):
-        with ops.control_dependencies(restore_control_inputs):
-          tensors = self.restore_op(filename_tensor, saveable, preferred_shard)
-          shapes = None
-          if reshape:
-            # Compute the shapes, let the restore op decide if and how to do
-            # the reshape.
-            shapes = []
-            for spec in saveable.specs:
-              v = spec.tensor
-              shape = v.get_shape()
-              if not shape.is_fully_defined():
-                shape = array_ops.shape(v)
-              shapes.append(shape)
-          assign_ops.append(saveable.restore(tensors, shapes))
-
-      # Create a Noop that has control dependencies from all the updates.
+      shapes = None
+      if reshape:
+        # Compute the shapes, let the restore op decide if and how to do
+        # the reshape.
+        shapes = []
+        for spec in saveable.specs:
+          v = spec.tensor
+          shape = v.get_shape()
+          if not shape.is_fully_defined():
+            shape = array_ops.shape(v)
+          shapes.append(shape)
+      saveable_tensors = all_tensors[idx:idx + len(saveable.specs)]
+      idx += len(saveable.specs)
+      assign_ops.append(saveable.restore(saveable_tensors, shapes))
+
+    # Create a Noop that has control dependencies from all the updates.
     return control_flow_ops.group(*assign_ops, name=name)
 
   def _AddShardedRestoreOps(self, filename_tensor, per_device,
@@ -797,6 +828,25 @@ class BaseSaverBuilder(object):
           version=self._write_version)
 
 
+class BulkSaverBuilder(BaseSaverBuilder):
+  """SaverBuilder with support for bulk restoring multiple saveables."""
+
+  def bulk_restore(self, filename_tensor, saveables, preferred_shard,
+                   restore_sequentially):
+
+    # Ignored: bulk restore is internally sequential.
+    del restore_sequentially
+    restore_specs = []
+    for saveable in saveables:
+      for spec in saveable.specs:
+        restore_specs.append((spec.name, spec.slice_spec, spec.tensor.dtype))
+
+    names, slices, dtypes = zip(*restore_specs)
+    # Load all tensors onto CPU 0 for compatibility with existing code.
+    with ops.device("cpu:0"):
+      return io_ops.restore_v2(filename_tensor, names, slices, dtypes)
+
+
 def _get_saver_or_default():
   """Returns the saver from SAVERS collection, or creates a default one.
 
@@ -840,6 +890,7 @@ def _GetCheckpointFilename(save_dir, latest_filename):
   return os.path.join(save_dir, latest_filename)
 
 
+@tf_export("train.generate_checkpoint_state_proto")
 def generate_checkpoint_state_proto(save_dir,
                                     model_checkpoint_path,
                                     all_model_checkpoint_paths=None):
@@ -884,6 +935,7 @@ def generate_checkpoint_state_proto(save_dir,
   return coord_checkpoint_proto
 
 
+@tf_export("train.update_checkpoint_state")
 def update_checkpoint_state(save_dir,
                             model_checkpoint_path,
                             all_model_checkpoint_paths=None,
@@ -976,6 +1028,7 @@ def _update_checkpoint_state(save_dir,
                                       text_format.MessageToString(ckpt))
 
 
+@tf_export("train.get_checkpoint_state")
 def get_checkpoint_state(checkpoint_dir, latest_filename=None):
   """Returns CheckpointState proto from the "checkpoint" file.
 
@@ -1033,6 +1086,7 @@ def get_checkpoint_state(checkpoint_dir, latest_filename=None):
   return ckpt
 
 
+@tf_export("train.Saver")
 class Saver(object):
   """Saves and restores variables.
 
@@ -1180,7 +1234,7 @@ class Saver(object):
         The `saver_def` proto should be the one returned by the
         `as_saver_def()` call of the `Saver` that was created for that `Graph`.
       builder: Optional `SaverBuilder` to use if a `saver_def` was not provided.
-        Defaults to `BaseSaverBuilder()`.
+        Defaults to `BulkSaverBuilder()`.
       defer_build: If `True`, defer adding the save and restore ops to the
         `build()` call. In that case `build()` should be called before
         finalizing the graph or using the saver.
@@ -1260,7 +1314,8 @@ class Saver(object):
 
     if not self.saver_def or context.in_eager_mode():
       if self._builder is None:
-        self._builder = BaseSaverBuilder(self._write_version)
+        self._builder = BulkSaverBuilder(self._write_version)
+
       if self._var_list is None:
         # pylint: disable=protected-access
         self._var_list = variables._all_saveable_objects()
@@ -1509,7 +1564,9 @@ class Saver(object):
            latest_filename=None,
            meta_graph_suffix="meta",
            write_meta_graph=True,
-           write_state=True):
+           write_state=True,
+           strip_default_attrs=False):
+    # pylint: disable=line-too-long
     """Saves variables.
 
     This method runs the ops added by the constructor for saving variables.
@@ -1535,11 +1592,14 @@ class Saver(object):
         graph file.
       write_state: `Boolean` indicating whether or not to write the
         `CheckpointStateProto`.
+      strip_default_attrs: Boolean. If `True`, default-valued attributes will be
+        removed from the NodeDefs. For a detailed guide, see
+        [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
 
     Returns:
-      A string: path prefix used for the checkpoint files.  If the saver is
-        sharded, this string ends with: '-?????-of-nnnnn' where 'nnnnn'
-        is the number of shards created.
+      A string: path prefix used for the checkpoint files. If checkpoint
+        format is V1 and the saver is sharded, this string ends with:
+         '-?????-of-nnnnn' where 'nnnnn' is the number of shards created.
       If the saver is empty, returns None.
 
     Raises:
@@ -1548,6 +1608,7 @@ class Saver(object):
         collides with `save_path`.
       RuntimeError: If save and restore ops weren't built.
     """
+    # pylint: enable=line-too-long
     if not self._is_built and context.in_graph_mode():
       raise RuntimeError(
           "`build()` should be called before save if defer_build==True")
@@ -1618,7 +1679,8 @@ class Saver(object):
           checkpoint_file, meta_graph_suffix=meta_graph_suffix)
       if context.in_graph_mode():
         with sess.graph.as_default():
-          self.export_meta_graph(meta_graph_filename)
+          self.export_meta_graph(
+              meta_graph_filename, strip_default_attrs=strip_default_attrs)
 
     if self._is_empty:
       return None
@@ -1631,7 +1693,9 @@ class Saver(object):
                         as_text=False,
                         export_scope=None,
                         clear_devices=False,
-                        clear_extraneous_savers=False):
+                        clear_extraneous_savers=False,
+                        strip_default_attrs=False):
+    # pylint: disable=line-too-long
     """Writes `MetaGraphDef` to save_path/filename.
 
     Args:
@@ -1644,10 +1708,14 @@ class Saver(object):
       clear_extraneous_savers: Remove any Saver-related information from the
         graph (both Save/Restore ops and SaverDefs) that are not associated
         with this Saver.
+      strip_default_attrs: Boolean. If `True`, default-valued attributes will be
+        removed from the NodeDefs. For a detailed guide, see
+        [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
 
     Returns:
       A `MetaGraphDef` proto.
     """
+    # pylint: enable=line-too-long
     return export_meta_graph(
         filename=filename,
         graph_def=ops.get_default_graph().as_graph_def(add_shapes=True),
@@ -1656,7 +1724,8 @@ class Saver(object):
         as_text=as_text,
         export_scope=export_scope,
         clear_devices=clear_devices,
-        clear_extraneous_savers=clear_extraneous_savers)
+        clear_extraneous_savers=clear_extraneous_savers,
+        strip_default_attrs=strip_default_attrs)
 
   def restore(self, sess, save_path):
     """Restores previously saved variables.
@@ -1680,6 +1749,12 @@ class Saver(object):
       return
     if save_path is None:
       raise ValueError("Can't load save_path when it is None.")
+    if (os.path.isfile(save_path) and
+        self._write_version not in (
+            saver_pb2.SaverDef.V1, saver_pb2.SaverDef.LEGACY)):
+      raise ValueError("The specified path: %s is a file."
+                       " Please specify only the path prefix"
+                       " to the checkpoint files." % save_path)
     logging.info("Restoring parameters from %s", save_path)
     if context.in_graph_mode():
       sess.run(self.saver_def.restore_op_name,
@@ -1719,6 +1794,7 @@ def _prefix_to_checkpoint_path(prefix, format_version):
   return prefix  # Just the data file.
 
 
+@tf_export("train.latest_checkpoint")
 def latest_checkpoint(checkpoint_dir, latest_filename=None):
   """Finds the filename of latest saved checkpoint file.
 
@@ -1748,6 +1824,7 @@ def latest_checkpoint(checkpoint_dir, latest_filename=None):
   return None
 
 
+@tf_export("train.import_meta_graph")
 def import_meta_graph(meta_graph_or_file, clear_devices=False,
                       import_scope=None, **kwargs):
   """Recreates a Graph saved in a `MetaGraphDef` proto.
@@ -1849,6 +1926,7 @@ def import_meta_graph(meta_graph_or_file, clear_devices=False,
       return None
 
 
+@tf_export("train.export_meta_graph")
 def export_meta_graph(filename=None,
                       meta_info_def=None,
                       graph_def=None,
@@ -1859,7 +1937,9 @@ def export_meta_graph(filename=None,
                       export_scope=None,
                       clear_devices=False,
                       clear_extraneous_savers=False,
+                      strip_default_attrs=False,
                       **kwargs):
+  # pylint: disable=line-too-long
   """Returns `MetaGraphDef` proto. Optionally writes it to filename.
 
   This function exports the graph, saver, and collection objects into
@@ -1885,6 +1965,9 @@ def export_meta_graph(filename=None,
     clear_extraneous_savers: Remove any Saver-related information from the
         graph (both Save/Restore ops and SaverDefs) that are not associated
         with the provided SaverDef.
+    strip_default_attrs: Boolean. If `True`, default-valued attributes will be
+      removed from the NodeDefs. For a detailed guide, see
+      [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
     **kwargs: Optional keyed arguments.
 
   Returns:
@@ -1899,6 +1982,7 @@ def export_meta_graph(filename=None,
   execution is enabled.
   @end_compatibility
   """
+  # pylint: enable=line-too-long
   if context.in_eager_mode():
     raise RuntimeError("Exporting/importing meta graphs is not supported when "
                        "eager execution is enabled. No graph exists when eager "
@@ -1914,10 +1998,12 @@ def export_meta_graph(filename=None,
       export_scope=export_scope,
       clear_devices=clear_devices,
       clear_extraneous_savers=clear_extraneous_savers,
+      strip_default_attrs=strip_default_attrs,
       **kwargs)
   return meta_graph_def
 
 
+@tf_export("train.checkpoint_exists")
 def checkpoint_exists(checkpoint_prefix):
   """Checks whether a V1 or V2 checkpoint exists with the specified prefix.
 
@@ -1942,6 +2028,7 @@ def checkpoint_exists(checkpoint_prefix):
     return False
 
 
+@tf_export("train.get_checkpoint_mtimes")
 def get_checkpoint_mtimes(checkpoint_prefixes):
   """Returns the mtimes (modification timestamps) of the checkpoints.
 
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index b7f1297b8f9400ea128656a051189b0e82601be0..c5a6f49df599434ab3bc1a9fe3d85db6f824071e 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -38,6 +38,7 @@ from tensorflow.core.protobuf import queue_runner_pb2
 from tensorflow.core.protobuf import saver_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -73,6 +74,7 @@ from tensorflow.python.training.checkpoint_state_pb2 import CheckpointState
 from tensorflow.python.util import compat
 
 
+@test_util.with_c_api
 class SaverTest(test.TestCase):
 
   def basicSaveRestore(self, variable_op):
@@ -541,6 +543,23 @@ class SaverTest(test.TestCase):
       save = saver_module.Saver({"v0": v0_2})
       variables.global_variables_initializer().run()
 
+  def testSharedServerOnGPU(self):
+    if not test.is_gpu_available():
+      return
+    save_path = os.path.join(self.get_temp_dir(), "gpu")
+    with session.Session("", graph=ops_lib.Graph()) as sess:
+      with sess.graph.device(test.gpu_device_name()):
+        v0_1 = variables.Variable(123.45)
+      save = saver_module.Saver({"v0": v0_1}, sharded=True, allow_empty=True)
+      variables.global_variables_initializer().run()
+      save.save(sess, save_path)
+
+    with session.Session("", graph=ops_lib.Graph()) as sess:
+      with sess.graph.device(test.gpu_device_name()):
+        v0_2 = variables.Variable(543.21)
+      save = saver_module.Saver({"v0": v0_2}, sharded=True, allow_empty=True)
+      variables.global_variables_initializer().run()
+
   def testVariables(self):
     save_path = os.path.join(self.get_temp_dir(), "variables")
     with session.Session("", graph=ops_lib.Graph()) as sess:
@@ -724,6 +743,7 @@ class SaverTest(test.TestCase):
       save.save(sess, save_path)
 
 
+@test_util.with_c_api
 class SaveRestoreShardedTest(test.TestCase):
 
   _WRITE_VERSION = saver_pb2.SaverDef.V1
@@ -774,9 +794,13 @@ class SaveRestoreShardedTest(test.TestCase):
         with sess.graph.device("/cpu:0"):
           v0 = variables.Variable(111, name="v0")
           t0 = saver_test_utils.CheckpointedOp(name="t0")
-        save = saver_module.Saver({"v0": v0, "t0": t0.saveable},
-                                  write_version=self._WRITE_VERSION,
-                                  sharded=True)
+        save = saver_module.Saver(
+            {
+                "v0": v0,
+                "t0": t0.saveable
+            },
+            write_version=self._WRITE_VERSION,
+            sharded=True)
         variables.global_variables_initializer().run()
         t0.insert("k11", 33.0).run()
         self.assertEqual(111, v0.eval())
@@ -794,9 +818,13 @@ class SaveRestoreShardedTest(test.TestCase):
         with sess.graph.device("/cpu:0"):
           v1 = variables.Variable(222)
           t1 = saver_test_utils.CheckpointedOp(name="t1")
-        save = saver_module.Saver({"v1": v1, "t1": t1.saveable},
-                                  write_version=self._WRITE_VERSION,
-                                  sharded=True)
+        save = saver_module.Saver(
+            {
+                "v1": v1,
+                "t1": t1.saveable
+            },
+            write_version=self._WRITE_VERSION,
+            sharded=True)
         variables.global_variables_initializer().run()
         t1.insert("k22", 44.0).run()
         self.assertEqual(222, v1.eval())
@@ -990,10 +1018,12 @@ class SaveRestoreShardedTest(test.TestCase):
     self._testPartitionedVariables(use_resource=True)
 
 
+@test_util.with_c_api
 class SaveRestoreShardedTestV2(SaveRestoreShardedTest):
   _WRITE_VERSION = saver_pb2.SaverDef.V2
 
 
+@test_util.with_c_api
 class MaxToKeepTest(test.TestCase):
 
   def _get_test_dir(self, dirname):
@@ -1263,6 +1293,7 @@ class MaxToKeepTest(test.TestCase):
       self.assertFalse(gfile.Exists(save._MetaGraphFilename(s1)))
 
 
+@test_util.with_c_api
 class KeepCheckpointEveryNHoursTest(test.TestCase):
 
   def _get_test_dir(self, dirname):
@@ -1321,6 +1352,7 @@ class KeepCheckpointEveryNHoursTest(test.TestCase):
       self.assertTrue(saver_module.checkpoint_exists(s4))
 
 
+@test_util.with_c_api
 class SaveRestoreWithVariableNameMap(test.TestCase):
 
   def _testNonReshape(self, variable_op):
@@ -1397,6 +1429,7 @@ class SaveRestoreWithVariableNameMap(test.TestCase):
     self._testNonReshape(variables.Variable)
 
 
+@test_util.with_c_api
 class LatestCheckpointWithRelativePaths(test.TestCase):
 
   @staticmethod
@@ -1498,6 +1531,7 @@ class LatestCheckpointWithRelativePaths(test.TestCase):
           self.assertEqual(v0.eval(), 2.0)
 
 
+@test_util.with_c_api
 class CheckpointStateTest(test.TestCase):
 
   def _get_test_dir(self, dirname):
@@ -1612,6 +1646,7 @@ class CheckpointStateTest(test.TestCase):
                      os.path.join(save_dir, "./model.ckpt-687529"))
 
 
+@test_util.with_c_api
 class MetaGraphTest(test.TestCase):
 
   def _get_test_dir(self, dirname):
@@ -1881,7 +1916,8 @@ class MetaGraphTest(test.TestCase):
       # Generates a new MetaGraphDef.
       new_meta_graph_def = new_saver.export_meta_graph()
       # It should be the same as the original.
-      self.assertProtoEquals(meta_graph_def, new_meta_graph_def)
+      test_util.assert_meta_graph_protos_equal(self, meta_graph_def,
+                                               new_meta_graph_def)
 
   def _testGraphExtensionSave(self, test_dir):
     filename = os.path.join(test_dir, "metafile")
@@ -2039,6 +2075,42 @@ class MetaGraphTest(test.TestCase):
         self.assertEqual(o.summary, "")
         self.assertEqual(o.description, "")
 
+  def testStripDefaultValuedAttrs(self):
+    """Verifies that default valued attrs are stripped, unless disabled."""
+
+    # With strip_default_attrs enabled, attributes "T" (float32) and "Tout"
+    # (complex64) in the "Complex" op must be removed.
+    with self.test_session():
+      real_num = variables.Variable(1.0, dtype=dtypes.float32, name="real")
+      imag_num = variables.Variable(2.0, dtype=dtypes.float32, name="imag")
+      math_ops.complex(real_num, imag_num, name="complex")
+
+      save = saver_module.Saver({"real_num": real_num, "imag_num": imag_num})
+      variables.global_variables_initializer()
+
+      meta_graph_def = save.export_meta_graph(strip_default_attrs=True)
+      node_def = test_util.get_node_def_from_graph("complex",
+                                                   meta_graph_def.graph_def)
+      self.assertNotIn("T", node_def.attr)
+      self.assertNotIn("Tout", node_def.attr)
+
+    # With strip_default_attrs disabled, attributes "T" (float32) and "Tout"
+    # (complex64) in the "Complex" op must *not* be removed, even if they map
+    # to their defaults.
+    with self.test_session(graph=ops_lib.Graph()):
+      real_num = variables.Variable(1.0, dtype=dtypes.float32, name="real")
+      imag_num = variables.Variable(2.0, dtype=dtypes.float32, name="imag")
+      math_ops.complex(real_num, imag_num, name="complex")
+
+      save = saver_module.Saver({"real_num": real_num, "imag_num": imag_num})
+      variables.global_variables_initializer()
+
+      meta_graph_def = save.export_meta_graph(strip_default_attrs=False)
+      node_def = test_util.get_node_def_from_graph("complex",
+                                                   meta_graph_def.graph_def)
+      self.assertIn("T", node_def.attr)
+      self.assertIn("Tout", node_def.attr)
+
   def testImportIntoNamescope(self):
     # Test that we can import a meta graph into a namescope.
     test_dir = self._get_test_dir("import_into_namescope")
@@ -2129,7 +2201,33 @@ class MetaGraphTest(test.TestCase):
               10, size=[1, 10])
       })
 
+  def testPreserveDatasetAndFunctions(self):
+    with ops_lib.Graph().as_default() as g:
+      dataset = dataset_ops.Dataset.range(10).map(lambda x: x * x)
+      iterator = dataset.make_one_shot_iterator()
+      next_element = iterator.get_next()
+      _ = array_ops.identity(next_element, name="output")
+
+      # Generate three MetaGraphDef protos using different code paths.
+      meta_graph_def_simple = saver_module.export_meta_graph()
+      meta_graph_def_devices_cleared = saver_module.export_meta_graph(
+          clear_devices=True)
+      meta_graph_def_from_graph_def = saver_module.export_meta_graph(
+          clear_devices=True, graph_def=g.as_graph_def())
+
+    for meta_graph_def in [meta_graph_def_simple,
+                           meta_graph_def_devices_cleared,
+                           meta_graph_def_from_graph_def]:
+      with session.Session(graph=ops_lib.Graph()) as sess:
+        saver_module.import_meta_graph(meta_graph_def, import_scope="new_model")
+        sess.run(variables.global_variables_initializer())
+        for i in range(10):
+          self.assertEqual(i * i, sess.run("new_model/output:0"))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run("new_model/output:0")
+
 
+@test_util.with_c_api
 class CheckpointReaderTest(test.TestCase):
 
   _WRITE_VERSION = saver_pb2.SaverDef.V1
@@ -2182,10 +2280,12 @@ class CheckpointReaderTest(test.TestCase):
       pywrap_tensorflow.NewCheckpointReader("non-existent")
 
 
+@test_util.with_c_api
 class CheckpointReaderForV2Test(CheckpointReaderTest):
   _WRITE_VERSION = saver_pb2.SaverDef.V2
 
 
+@test_util.with_c_api
 class WriteGraphTest(test.TestCase):
 
   def _get_test_dir(self, dirname):
@@ -2213,6 +2313,7 @@ class WriteGraphTest(test.TestCase):
     self.assertTrue(os.path.exists(path))
 
 
+@test_util.with_c_api
 class SaverUtilsTest(test.TestCase):
 
   def setUp(self):
@@ -2255,6 +2356,7 @@ class SaverUtilsTest(test.TestCase):
     self.assertTrue(mtimes[1] >= mtimes[0])
 
 
+@test_util.with_c_api
 class ScopedGraphTest(test.TestCase):
 
   def _get_test_dir(self, dirname):
@@ -2558,51 +2660,5 @@ class ScopedGraphTest(test.TestCase):
       self.assertEqual(2.0, var_dict2["variable2:0"].eval())
 
 
-# TODO(b/64763924): Remove after Jan 1st 2018.
-class LenientNamesTest(test.TestCase):
-
-  def setUp(self):
-    super(LenientNamesTest, self).setUp()
-    os.putenv("TF_SAVER_LENIENT_NAMES", "True")
-
-  def tearDown(self):
-    os.putenv("TF_SAVER_LENIENT_NAMES", "")
-    super(LenientNamesTest, self).tearDown()
-
-  def testSaveRestore(self):
-    save_path = os.path.join(self.get_temp_dir(), "basic_save_restore")
-
-    # Build a graph with 2 parameter nodes, and Save and
-    # Restore nodes for them.
-    v0 = variables.Variable(10.0, name="v0")
-    v1 = variables.Variable(20.0, name="v1")
-    v2 = saver_test_utils.CheckpointedOp(name="v2")
-    v2_init = v2.insert("k1", 30.0)
-    save = saver_module.Saver(
-        {
-            "v0:0": v0,
-            "v1": v1,
-            "v2": v2.saveable
-        }, restore_sequentially=True)
-    init_all_op = [variables.global_variables_initializer(), v2_init]
-
-    with self.test_session() as sess:
-      sess.run(init_all_op)
-      save.save(sess, save_path)
-
-    with self.test_session() as sess:
-      v0 = variables.Variable(-1.0, name="v0")
-      v1 = variables.Variable(-1.0, name="v1")
-      v2 = saver_test_utils.CheckpointedOp(name="v2")
-      save = saver_module.Saver({"v0": v0, "v1": v1, "v2": v2.saveable})
-
-      save.restore(sess, save_path)
-      # Check that the parameter nodes have been restored.
-      self.assertEqual(10.0, v0.eval())
-      self.assertEqual(20.0, v1.eval())
-      self.assertEqual(b"k1", v2.keys().eval())
-      self.assertEqual(30.0, v2.values().eval())
-
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/server_lib.py b/tensorflow/python/training/server_lib.py
index 2091eca0b9c6f0af4a043a4639b6fb72b90cef56..2f421d1cc0a0190670082fabf4e25470c6a1723b 100644
--- a/tensorflow/python/training/server_lib.py
+++ b/tensorflow/python/training/server_lib.py
@@ -23,6 +23,7 @@ from tensorflow.core.protobuf import tensorflow_server_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _make_server_def(server_or_cluster_def, job_name, task_index, protocol,
@@ -92,6 +93,7 @@ def _make_server_def(server_or_cluster_def, job_name, task_index, protocol,
   return server_def
 
 
+@tf_export("train.Server")
 class Server(object):
   """An in-process TensorFlow server, for use in distributed training.
 
@@ -221,6 +223,7 @@ class Server(object):
                   start=start)
 
 
+@tf_export("train.ClusterSpec")
 class ClusterSpec(object):
   """Represents a cluster as a set of "tasks", organized into "jobs".
 
@@ -307,6 +310,12 @@ class ClusterSpec(object):
   def __ne__(self, other):
     return self._cluster_spec != other
 
+  def __str__(self):
+    key_values = self.as_dict()
+    string_items = [
+        repr(k) + ": " + repr(key_values[k]) for k in sorted(key_values)]
+    return "ClusterSpec({" + ", ".join(string_items) + "})"
+
   def as_dict(self):
     """Returns a dictionary from job names to their tasks.
 
diff --git a/tensorflow/python/training/server_lib_test.py b/tensorflow/python/training/server_lib_test.py
index 26aac787ed40b644af1f611270b7aaee77623877..063044f0d05d4237830e415ac2ad800c98ae8beb 100644
--- a/tensorflow/python/training/server_lib_test.py
+++ b/tensorflow/python/training/server_lib_test.py
@@ -421,6 +421,17 @@ class ServerDefTest(test.TestCase):
 
 class ClusterSpecTest(test.TestCase):
 
+  def testStringConversion(self):
+    cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:1111"],
+        "worker": ["worker0:3333", "worker1:4444"]
+    })
+
+    expected_str = (
+        "ClusterSpec({'ps': ['ps0:1111'], 'worker': ['worker0:3333', "
+        "'worker1:4444']})")
+    self.assertEqual(expected_str, str(cluster_spec))
+
   def testProtoDictDefEquivalences(self):
     cluster_spec = server_lib.ClusterSpec({
         "ps": ["ps0:2222", "ps1:2222"],
diff --git a/tensorflow/python/training/session_manager.py b/tensorflow/python/training/session_manager.py
index a13b6dd976a835d14c03ed90f40b172e0bcbfd07..360e02fb44c1062f71bb50449b9ef381510a9c69 100644
--- a/tensorflow/python/training/session_manager.py
+++ b/tensorflow/python/training/session_manager.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import saver as saver_mod
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _maybe_name(obj):
@@ -44,6 +45,7 @@ def _maybe_name(obj):
     return "<no name for %s>" % type(obj)
 
 
+@tf_export("train.SessionManager")
 class SessionManager(object):
   """Training helper that restores from checkpoint and creates session.
 
@@ -480,7 +482,9 @@ class SessionManager(object):
     if self._local_init_op is not None:
       is_ready_for_local_init, msg = self._model_ready_for_local_init(sess)
       if is_ready_for_local_init:
+        logging.info("Running local_init_op.")
         sess.run(self._local_init_op)
+        logging.info("Done running local_init_op.")
         return True, None
       else:
         return False, msg
diff --git a/tensorflow/python/training/session_manager_test.py b/tensorflow/python/training/session_manager_test.py
index 5879fd330adec58dde45f3da8ae16c9a297f3b24..6670d9365f2994a70b7228170179f97d314041c9 100644
--- a/tensorflow/python/training/session_manager_test.py
+++ b/tensorflow/python/training/session_manager_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
@@ -504,6 +505,7 @@ class SessionManagerTest(test.TestCase):
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES],
           name="x")
+      # TODO(b/70206927): Use ResourceVariables once they are handled properly.
       v_res = variables.Variable(1, name="v_res")
       w_res = variables.Variable(
           v_res,
@@ -556,6 +558,24 @@ class SessionManagerTest(test.TestCase):
       self.assertEquals(1, sess.run(w_res))
       self.assertEquals(3, sess.run(x_res))
 
+  def testPrepareSessionWithCyclicInitializer(self):
+    # Regression test. Previously Variable._build_initializer_expr would enter
+    # into an infinite recursion when the variable's initial_value involved
+    # cyclic dependencies.
+    with ops.Graph().as_default():
+      i = control_flow_ops.while_loop(lambda i: i < 1, lambda i: i + 1, [0])
+      v = variables.Variable(array_ops.identity(i), name="v")
+      with self.test_session():
+        self.assertEqual(False, variables.is_variable_initialized(v).eval())
+      sm = session_manager.SessionManager(
+          ready_op=variables.report_uninitialized_variables())
+      sess = sm.prepare_session("", init_op=v.initializer)
+      self.assertEqual(1, sess.run(v))
+      self.assertEqual(
+          True,
+          variables.is_variable_initialized(
+              sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
+
   def testPrepareSessionDidNotInitLocalVariable(self):
     with ops.Graph().as_default():
       v = variables.Variable(1, name="v")
diff --git a/tensorflow/python/training/session_run_hook.py b/tensorflow/python/training/session_run_hook.py
index 5b023d8a2672af5d1fab1c2566b19fca738fd1f7..89f40300650f3b6cd1ae15d946640c9df91771e2 100644
--- a/tensorflow/python/training/session_run_hook.py
+++ b/tensorflow/python/training/session_run_hook.py
@@ -96,8 +96,10 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.SessionRunHook")
 class SessionRunHook(object):
   """Hook to extend calls to MonitoredSession.run()."""
 
@@ -189,6 +191,7 @@ class SessionRunHook(object):
     pass
 
 
+@tf_export("train.SessionRunArgs")
 class SessionRunArgs(
     collections.namedtuple("SessionRunArgs",
                            ["fetches", "feed_dict", "options"])):
@@ -213,6 +216,7 @@ class SessionRunArgs(
     return super(SessionRunArgs, cls).__new__(cls, fetches, feed_dict, options)
 
 
+@tf_export("train.SessionRunContext")
 class SessionRunContext(object):
   """Provides information about the `session.run()` call being made.
 
@@ -264,6 +268,7 @@ class SessionRunContext(object):
     self._stop_requested = True
 
 
+@tf_export("train.SessionRunValues")
 class SessionRunValues(
     collections.namedtuple("SessionRunValues",
                            ["results", "options", "run_metadata"])):
diff --git a/tensorflow/python/training/slot_creator.py b/tensorflow/python/training/slot_creator.py
index ea28b5ddfc2dbbf65ec60e86d29ff2a9988d2b97..75ef3d5976aba9f0cbe849d9f6984646d71a29ef 100644
--- a/tensorflow/python/training/slot_creator.py
+++ b/tensorflow/python/training/slot_creator.py
@@ -48,11 +48,6 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 
 
-def _is_resource(v):
-  """Returns true if v is something you get from a resource variable."""
-  return isinstance(v, resource_variable_ops.ResourceVariable)
-
-
 def _create_slot_var(primary, val, scope, validate_shape, shape, dtype):
   """Helper function for creating a slot variable."""
 
@@ -60,9 +55,12 @@ def _create_slot_var(primary, val, scope, validate_shape, shape, dtype):
   # scope.
   current_partitioner = variable_scope.get_variable_scope().partitioner
   variable_scope.get_variable_scope().set_partitioner(None)
+  # When init from val instead of callable initializer, the shape is expected to
+  # be None, not <unknown> or any fully defined shape.
+  shape = shape if callable(val) else None
   slot = variable_scope.get_variable(
       scope, initializer=val, trainable=False,
-      use_resource=_is_resource(primary),
+      use_resource=resource_variable_ops.is_resource_variable(primary),
       shape=shape, dtype=dtype,
       validate_shape=validate_shape)
   variable_scope.get_variable_scope().set_partitioner(current_partitioner)
@@ -108,7 +106,8 @@ def create_slot(primary, val, name, colocate_with_primary=True):
   # and the same name has been previously used, the scope name will add '_N'
   # as suffix for unique identifications.
   validate_shape = val.get_shape().is_fully_defined()
-  with variable_scope.variable_scope(None, primary.op.name + "/" + name):
+  prefix = primary.op.name if context.in_graph_mode() else primary._shared_name  # pylint: disable=protected-access
+  with variable_scope.variable_scope(None, prefix + "/" + name):
     if colocate_with_primary:
       with ops.colocate_with(primary):
         return _create_slot_var(primary, val, "", validate_shape, None, None)
diff --git a/tensorflow/python/training/slot_creator_test.py b/tensorflow/python/training/slot_creator_test.py
index 08a3c8dc53a5e88559ddeaf1f95d441fa5adfd29..b0f48e4ecd4d41946a8a5ed5a0c507a2344a943a 100644
--- a/tensorflow/python/training/slot_creator_test.py
+++ b/tensorflow/python/training/slot_creator_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
@@ -29,6 +30,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.training import slot_creator
 
 
+@test_util.with_c_api
 class SlotCreatorTest(test.TestCase):
 
   def testCreateSlotFromVariable(self):
diff --git a/tensorflow/python/training/supervisor.py b/tensorflow/python/training/supervisor.py
index a634a842b67033d5fde6bf8cf819f681e892a247..d2ad34773e0615256c340826dcc312cc8a00dc23 100644
--- a/tensorflow/python/training/supervisor.py
+++ b/tensorflow/python/training/supervisor.py
@@ -36,11 +36,17 @@ from tensorflow.python.training import coordinator
 from tensorflow.python.training import saver as saver_mod
 from tensorflow.python.training import session_manager as session_manager_mod
 from tensorflow.python.training import training_util
+from tensorflow.python.util import deprecation
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("train.Supervisor")
 class Supervisor(object):
   """A training helper that checkpoints models and computes summaries.
 
+  This class is deprecated. Please use
+  ${tf.train.MonitoredTrainingSession} instead.
+
   The Supervisor is a small wrapper around a `Coordinator`, a `Saver`,
   and a `SessionManager` that takes care of common needs of TensorFlow
   training programs.
@@ -198,6 +204,8 @@ class Supervisor(object):
   # the default behavior should be used.
   USE_DEFAULT = 0
 
+  @deprecation.deprecated(None,
+                          "Please switch to tf.train.MonitoredTrainingSession")
   def __init__(self,
                graph=None,
                ready_op=USE_DEFAULT,
diff --git a/tensorflow/python/training/sync_replicas_optimizer.py b/tensorflow/python/training/sync_replicas_optimizer.py
index b52d101a2154f02273a7a24f4ed3ef55209da20c..0c6cf910d1a01dc20b15fb1cd5dbb249fbb60ef5 100644
--- a/tensorflow/python/training/sync_replicas_optimizer.py
+++ b/tensorflow/python/training/sync_replicas_optimizer.py
@@ -31,6 +31,7 @@ from tensorflow.python.training import optimizer
 from tensorflow.python.training import queue_runner
 from tensorflow.python.training import session_manager
 from tensorflow.python.training import session_run_hook
+from tensorflow.python.util.tf_export import tf_export
 
 
 # Please note that the gradients from replicas are averaged instead of summed
@@ -38,6 +39,7 @@ from tensorflow.python.training import session_run_hook
 # rate according to the number of replicas. This change is introduced to be
 # consistent with how gradients are aggregated (averaged) within a batch in a
 # replica.
+@tf_export("train.SyncReplicasOptimizer")
 class SyncReplicasOptimizer(optimizer.Optimizer):
   """Class to synchronize, aggregate gradients and pass them to the optimizer.
 
@@ -449,7 +451,7 @@ class _SyncReplicasOptimizerHook(session_run_hook.SessionRunHook):
   """A SessionRunHook handles ops related to SyncReplicasOptimizer."""
 
   def __init__(self, sync_optimizer, is_chief, num_tokens):
-    """Creates hook to handle SyncReplicaOptimizer initialization ops.
+    """Creates hook to handle SyncReplicasOptimizer initialization ops.
 
     Args:
       sync_optimizer: `SyncReplicasOptimizer` which this hook will initialize.
diff --git a/tensorflow/python/training/sync_replicas_optimizer_test.py b/tensorflow/python/training/sync_replicas_optimizer_test.py
index 297284f80c2997e21304138c5a090da76425917b..fff17402e23cb7b054d3e433650666b0554ed8ba 100644
--- a/tensorflow/python/training/sync_replicas_optimizer_test.py
+++ b/tensorflow/python/training/sync_replicas_optimizer_test.py
@@ -286,8 +286,9 @@ class SyncReplicasOptimizerHookTest(test.TestCase):
     global_step = variables.Variable(0, name="global_step", trainable=False)
     opt.minimize(v, global_step=global_step)
     opt_variables = opt.variables()
-    self.assertIn(opt._opt._beta1_power, opt_variables)
-    self.assertIn(opt._opt._beta2_power, opt_variables)
+    beta1_power, beta2_power = opt._opt._get_beta_accumulators()
+    self.assertIn(beta1_power, opt_variables)
+    self.assertIn(beta2_power, opt_variables)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py
index fa02ad84cce3ccaa391571df3a2de4b65b255c84..78c8ce9208efc2f2fa8b5c671d3379e7ca8c70f5 100644
--- a/tensorflow/python/training/training.py
+++ b/tensorflow/python/training/training.py
@@ -38,6 +38,7 @@ See the @{$python/train} guide.
 @@clip_by_global_norm
 @@global_norm
 @@cosine_decay
+@@cosine_decay_restarts
 @@linear_cosine_decay
 @@noisy_linear_cosine_decay
 @@exponential_decay
@@ -134,7 +135,7 @@ from tensorflow.python.training.queue_runner import *
 
 # For the module level doc.
 from tensorflow.python.training import input as _input
-from tensorflow.python.training.input import *
+from tensorflow.python.training.input import *  # pylint: disable=redefined-builtin
 # pylint: enable=wildcard-import
 
 from tensorflow.python.training.basic_session_run_hooks import SecondOrStepTimer
@@ -188,6 +189,7 @@ from tensorflow.python.training.training_util import create_global_step
 from tensorflow.python.training.training_util import get_or_create_global_step
 from tensorflow.python.pywrap_tensorflow import do_quantize_training_on_graphdef
 from tensorflow.python.pywrap_tensorflow import NewCheckpointReader
+from tensorflow.python.util.tf_export import tf_export
 
 # pylint: disable=wildcard-import
 # Training data protos.
@@ -238,6 +240,23 @@ _allowed_symbols = [
     "SequenceExample",  # from example_pb2.
     "ServerDef",
 ]
+
+# pylint: disable=undefined-variable
+tf_export("train.BytesList")(BytesList)
+tf_export("train.ClusterDef")(ClusterDef)
+tf_export("train.Example")(Example)
+tf_export("train.Feature")(Feature)
+tf_export("train.Features")(Features)
+tf_export("train.FeatureList")(FeatureList)
+tf_export("train.FeatureLists")(FeatureLists)
+tf_export("train.FloatList")(FloatList)
+tf_export("train.Int64List")(Int64List)
+tf_export("train.JobDef")(JobDef)
+tf_export("train.SaverDef")(SaverDef)
+tf_export("train.SequenceExample")(SequenceExample)
+tf_export("train.ServerDef")(ServerDef)
+# pylint: enable=undefined-variable
+
 # Include extra modules for docstrings because:
 # * Input methods in tf.train are documented in io_ops.
 # * Saver methods in tf.train are documented in state_ops.
diff --git a/tensorflow/python/training/training_ops.py b/tensorflow/python/training/training_ops.py
index e98c32b614418224b1bc14081bc35f175d769965..d7133cfb500ef11e5b94c7c36905e039f9c0bf46 100644
--- a/tensorflow/python/training/training_ops.py
+++ b/tensorflow/python/training/training_ops.py
@@ -19,7 +19,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.training import gen_training_ops
+from tensorflow.python.training import gen_training_ops  # pylint: disable=unused-import
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.training.gen_training_ops import *
diff --git a/tensorflow/python/training/training_util.py b/tensorflow/python/training/training_util.py
index 89a9e129328fe38da2ce497a7f26dc11446ea032..499f1feb2dbf8aee26314a43b0a000fb91a1c686 100644
--- a/tensorflow/python/training/training_util.py
+++ b/tensorflow/python/training/training_util.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
 # Picked a long key value to minimize the chance of collision with user defined
@@ -40,6 +41,7 @@ GLOBAL_STEP_READ_KEY = 'global_step_read_op_cache'
 write_graph = graph_io.write_graph
 
 
+@tf_export('train.global_step')
 def global_step(sess, global_step_tensor):
   """Small helper to get the global step.
 
@@ -67,6 +69,7 @@ def global_step(sess, global_step_tensor):
   return int(sess.run(global_step_tensor))
 
 
+@tf_export('train.get_global_step')
 def get_global_step(graph=None):
   """Get the global step tensor.
 
@@ -101,6 +104,7 @@ def get_global_step(graph=None):
   return global_step_tensor
 
 
+@tf_export('train.create_global_step')
 def create_global_step(graph=None):
   """Create global step tensor in graph.
 
@@ -139,6 +143,7 @@ def create_global_step(graph=None):
                      ops.GraphKeys.GLOBAL_STEP])
 
 
+@tf_export('train.get_or_create_global_step')
 def get_or_create_global_step(graph=None):
   """Returns and create (if necessary) the global step tensor.
 
@@ -156,6 +161,7 @@ def get_or_create_global_step(graph=None):
   return global_step_tensor
 
 
+@tf_export('train.assert_global_step')
 def assert_global_step(global_step_tensor):
   """Asserts `global_step_tensor` is a scalar int `Variable` or `Tensor`.
 
diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py
index 07382d93dfe5ebe3f063b86bc5afa288970330f6..4163fcac79e3d237c4c4c4303e1db2c39e5fe7c6 100644
--- a/tensorflow/python/util/compat.py
+++ b/tensorflow/python/util/compat.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Functions for Python 2 vs. 3 compatibility.
 
 ## Conversion routines
@@ -21,6 +20,7 @@ In addition to the functions below, `as_str` converts an object to a `str`.
 @@as_bytes
 @@as_text
 @@as_str_any
+@@path_to_str
 
 ## Types
 The compatibility module also provides the following types:
@@ -41,8 +41,11 @@ import numpy as _np
 import six as _six
 
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('compat.as_bytes', 'compat.as_str')
 def as_bytes(bytes_or_text, encoding='utf-8'):
   """Converts either bytes or unicode to `bytes`, using utf-8 encoding for text.
 
@@ -65,6 +68,7 @@ def as_bytes(bytes_or_text, encoding='utf-8'):
                     (bytes_or_text,))
 
 
+@tf_export('compat.as_text')
 def as_text(bytes_or_text, encoding='utf-8'):
   """Returns the given argument as a unicode string.
 
@@ -93,6 +97,7 @@ else:
   as_str = as_text
 
 
+@tf_export('compat.as_str_any')
 def as_str_any(value):
   """Converts to `str` as `str(value)`, but use `as_str` for `bytes`.
 
@@ -108,16 +113,34 @@ def as_str_any(value):
     return str(value)
 
 
+@tf_export('compat.path_to_str')
+def path_to_str(path):
+  """Returns the file system path representation of a `PathLike` object, else as it is.
+
+  Args:
+    path: An object that can be converted to path representation.
+
+  Returns:
+    A `str` object.
+  """
+  if hasattr(path, '__fspath__'):
+    path = as_str_any(path.__fspath__())
+  return path
+
+
 # Numpy 1.8 scalars don't inherit from numbers.Integral in Python 3, so we
 # need to check them specifically.  The same goes from Real and Complex.
 integral_types = (_numbers.Integral, _np.integer)
+tf_export('compat.integral_types').export_constant(__name__, 'integral_types')
 real_types = (_numbers.Real, _np.integer, _np.floating)
+tf_export('compat.real_types').export_constant(__name__, 'real_types')
 complex_types = (_numbers.Complex, _np.number)
-
+tf_export('compat.complex_types').export_constant(__name__, 'complex_types')
 
 # Either bytes or text.
 bytes_or_text_types = (bytes, _six.text_type)
-
+tf_export('compat.bytes_or_text_types').export_constant(__name__,
+                                                        'bytes_or_text_types')
 
 _allowed_symbols = [
     'as_str',
diff --git a/tensorflow/python/util/compat_internal.py b/tensorflow/python/util/compat_internal.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8b9319f668b85e227b9a0578b63fd46af0f2c13
--- /dev/null
+++ b/tensorflow/python/util/compat_internal.py
@@ -0,0 +1,35 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functions for Python 2 vs. 3 compatibility that are private to TensorFlow."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.util.compat import as_str_any
+
+def path_to_str(path):
+  """Returns the file system path representation of a `PathLike` object,
+  else as it is.
+
+  Args:
+    path: An object that can be converted to path representation.
+
+  Returns:
+    A `str` object.
+  """
+  if hasattr(path, "__fspath__"):
+    path = as_str_any(path.__fspath__())
+  return path
diff --git a/tensorflow/python/util/deprecation.py b/tensorflow/python/util/deprecation.py
index 8a66f0435a8cb3d689a6613e2fca5bab1c0a37e3..376be39978fb11463ae8a870492a359c89a9f2ce 100644
--- a/tensorflow/python/util/deprecation.py
+++ b/tensorflow/python/util/deprecation.py
@@ -24,6 +24,7 @@ import re
 
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import decorator_utils
+from tensorflow.python.util import is_in_graph_mode
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
@@ -38,13 +39,14 @@ _PRINTED_WARNING = {}
 
 def _add_deprecated_function_notice_to_docstring(doc, date, instructions):
   """Adds a deprecation notice to a docstring for deprecated functions."""
+  main_text = ['THIS FUNCTION IS DEPRECATED. It will be removed %s.' %
+               ('in a future version' if date is None else ('after %s' % date))]
+  if instructions:
+    main_text.append('Instructions for updating:')
   return decorator_utils.add_notice_to_docstring(
       doc, instructions,
       'DEPRECATED FUNCTION',
-      '(deprecated)', [
-          'THIS FUNCTION IS DEPRECATED. It will be removed %s.' % (
-              'in a future version' if date is None else ('after %s' % date)),
-          'Instructions for updating:'])
+      '(deprecated)', main_text)
 
 
 def _add_deprecated_arg_notice_to_docstring(doc, date, instructions):
@@ -66,23 +68,135 @@ def _validate_deprecation_args(date, instructions):
     raise ValueError('Don\'t deprecate things without conversion instructions!')
 
 
-def _call_location():
+def _call_location(outer=False):
   """Returns call location given level up from current call."""
   frame = tf_inspect.currentframe()
   if frame:
     # CPython internals are available, use them for performance.
     # walk back two frames to get to deprecated function caller.
-    first_frame = frame.f_back
-    second_frame = first_frame.f_back
-    frame = second_frame if second_frame else first_frame
+    frame = frame.f_back
+    if frame.f_back:
+      frame = frame.f_back
+    if outer and frame.f_back:
+      frame = frame.f_back
     return '%s:%d' % (frame.f_code.co_filename, frame.f_lineno)
   else:
     # Slow fallback path
     stack = tf_inspect.stack(0)  # 0 avoids generating unused context
-    entry = stack[2]
+    entry = stack[3 if outer else 2]
     return '%s:%d' % (entry[1], entry[2])
 
 
+def deprecated_alias(deprecated_name, name, func_or_class, warn_once=True):
+  """Deprecate a symbol in favor of a new name with identical semantics.
+
+  This function is meant to be used when defining a backwards-compatibility
+  alias for a symbol which has been moved. For example:
+
+  module1.py:
+  ```python
+  class NewNameForClass: pass
+  ```
+
+  module2.py:
+  ```python
+  import module1
+
+  DeprecatedNameForClass = deprecated_alias(
+    deprecated_name='module2.DeprecatedNameForClass',
+    name='module1.NewNameForClass',
+    module1.NewNameForClass)
+  ```
+
+  This function works for classes and functions.
+
+  For classes, it creates a new class which is functionally identical (it
+  inherits from the original, and overrides its constructor), but which prints
+  a deprecation warning when an instance is created. It also adds a deprecation
+  notice to the class' docstring.
+
+  For functions, it returns a function wrapped by `tf_decorator.make_decorator`.
+  That function prints a warning when used, and has a deprecation notice in its
+  docstring. This is more or less equivalent (the deprecation warning has
+  slightly different text) to writing:
+
+  ```python
+  @deprecated
+  def deprecated_alias(original_args):
+    real_function(original_args)
+  ```
+
+  Args:
+    deprecated_name: The name of the symbol that is being deprecated, to be used
+      in the warning message. This should be its fully qualified name to avoid
+      confusion.
+    name: The name of the symbol that is to be used instead of the deprecated
+      name. This should be a fully qualified name to avoid confusion.
+    func_or_class: The (non-deprecated) class or function for which a deprecated
+      alias should be created.
+    warn_once: If True (the default), only print a deprecation warning the first
+      time this function is used, or the class is instantiated.
+
+  Returns:
+    A wrapped version of `func_or_class` which prints a deprecation warning on
+    use and has a modified docstring.
+  """
+  if tf_inspect.isclass(func_or_class):
+
+    # Make a new class with __init__ wrapped in a warning.
+    class NewClass(func_or_class):  # pylint: disable=missing-docstring
+      __doc__ = decorator_utils.add_notice_to_docstring(
+          func_or_class.__doc__, 'Please use %s instead.' % name,
+          'DEPRECATED CLASS',
+          '(deprecated)', ['THIS CLASS IS DEPRECATED. '
+                           'It will be removed in a future version. '])
+      __name__ = func_or_class.__name__
+      __module__ = _call_location(outer=True)
+
+      def __init__(self, *args, **kwargs):
+        if hasattr(NewClass.__init__, '__func__'):
+          # Python 2
+          NewClass.__init__.__func__.__doc__ = func_or_class.__init__.__doc__
+        else:
+          # Python 3
+          NewClass.__init__.__doc__ = func_or_class.__init__.__doc__
+
+        if _PRINT_DEPRECATION_WARNINGS:
+          # We're making the alias as we speak. The original may have other
+          # aliases, so we cannot use it to check for whether it's already been
+          # warned about.
+          if NewClass.__init__ not in _PRINTED_WARNING:
+            if warn_once:
+              _PRINTED_WARNING[NewClass.__init__] = True
+            logging.warning(
+                'From %s: The name %s is deprecated. Please use %s instead.\n',
+                _call_location(), deprecated_name, name)
+        super(NewClass, self).__init__(*args, **kwargs)
+
+    return NewClass
+  else:
+    decorator_utils.validate_callable(func_or_class, 'deprecated')
+
+    # Make a wrapper for the original
+    @functools.wraps(func_or_class)
+    def new_func(*args, **kwargs):  # pylint: disable=missing-docstring
+      if _PRINT_DEPRECATION_WARNINGS:
+        # We're making the alias as we speak. The original may have other
+        # aliases, so we cannot use it to check for whether it's already been
+        # warned about.
+        if new_func not in _PRINTED_WARNING:
+          if warn_once:
+            _PRINTED_WARNING[new_func] = True
+          logging.warning(
+              'From %s: The name %s is deprecated. Please use %s instead.\n',
+              _call_location(), deprecated_name, name)
+      return func_or_class(*args, **kwargs)
+    return tf_decorator.make_decorator(
+        func_or_class, new_func, 'deprecated',
+        _add_deprecated_function_notice_to_docstring(
+            func_or_class.__doc__, None, 'Please use %s instead.' % name))
+
+
 def deprecated(date, instructions, warn_once=True):
   """Decorator for marking functions or methods deprecated.
 
@@ -284,7 +398,9 @@ def deprecated_args(date, instructions, *deprecated_arg_names_or_tuples,
     @functools.wraps(func)
     def new_func(*args, **kwargs):
       """Deprecation wrapper."""
-      if _PRINT_DEPRECATION_WARNINGS:
+      # TODO(apassos) figure out a way to have reasonable performance with
+      # deprecation warnings and eager mode.
+      if is_in_graph_mode.IS_IN_GRAPH_MODE() and _PRINT_DEPRECATION_WARNINGS:
         invalid_args = []
         named_args = tf_inspect.getcallargs(func, *args, **kwargs)
         for arg_name, spec in iter(deprecated_positions.items()):
diff --git a/tensorflow/python/util/deprecation_test.py b/tensorflow/python/util/deprecation_test.py
index e61edb5cfa3f8f7676b8a77d787781abdd80f310..bdd0bc48d29319914e184ea4331a5e9d4a1c3328 100644
--- a/tensorflow/python/util/deprecation_test.py
+++ b/tensorflow/python/util/deprecation_test.py
@@ -24,6 +24,56 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import deprecation
 
 
+class DeprecatedAliasTest(test.TestCase):
+
+  @test.mock.patch.object(logging, "warning", autospec=True)
+  def test_function_alias(self, mock_warning):
+    deprecated_func = deprecation.deprecated_alias("deprecated.func",
+                                                   "real.func",
+                                                   logging.error)
+
+    logging.error("fake error logged")
+    self.assertEqual(0, mock_warning.call_count)
+    deprecated_func("FAKE ERROR!")
+    self.assertEqual(1, mock_warning.call_count)
+    # Make sure the error points to the right file.
+    self.assertRegexpMatches(mock_warning.call_args[0][1],
+                             r"deprecation_test\.py:")
+    deprecated_func("ANOTHER FAKE ERROR!")
+    self.assertEqual(1, mock_warning.call_count)
+
+  @test.mock.patch.object(logging, "warning", autospec=True)
+  def test_class_alias(self, mock_warning):
+    class MyClass(object):
+      """My docstring."""
+
+      init_args = []
+
+      def __init__(self, arg):
+        MyClass.init_args.append(arg)
+
+    deprecated_cls = deprecation.deprecated_alias("deprecated.cls",
+                                                  "real.cls",
+                                                  MyClass)
+
+    print(deprecated_cls.__name__)
+    print(deprecated_cls.__module__)
+    print(deprecated_cls.__doc__)
+
+    MyClass("test")
+    self.assertEqual(0, mock_warning.call_count)
+    deprecated_cls("deprecated")
+    self.assertEqual(1, mock_warning.call_count)
+    # Make sure the error points to the right file.
+    self.assertRegexpMatches(mock_warning.call_args[0][1],
+                             r"deprecation_test\.py:")
+    deprecated_cls("deprecated again")
+    self.assertEqual(1, mock_warning.call_count)
+
+    self.assertEqual(["test", "deprecated", "deprecated again"],
+                     MyClass.init_args)
+
+
 class DeprecationTest(test.TestCase):
 
   @test.mock.patch.object(logging, "warning", autospec=True)
diff --git a/tensorflow/python/util/is_in_graph_mode.py b/tensorflow/python/util/is_in_graph_mode.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ae89ecb714c25787732f0d6c671d78144bec395
--- /dev/null
+++ b/tensorflow/python/util/is_in_graph_mode.py
@@ -0,0 +1,22 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A function that tells you if the program is running in graph mode."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Call IS_IN_GRAPH_MODE() when you want to know whether the thread is in
+# graph mode.  By default, we always are.
+IS_IN_GRAPH_MODE = lambda: True
diff --git a/tensorflow/python/util/kernel_registry.h b/tensorflow/python/util/kernel_registry.h
index c00b60d91b3737966536d02281ed7a31a238b82f..1ba76f020bf3916704fb3a2d76895650fe093cfa 100644
--- a/tensorflow/python/util/kernel_registry.h
+++ b/tensorflow/python/util/kernel_registry.h
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 // Functions for getting information about kernels registered in the binary.
-#ifndef THIRD_PARTY_TENSORFLOW_PYTHON_UTIL_KERNEL_REGISTRY_H_
-#define THIRD_PARTY_TENSORFLOW_PYTHON_UTIL_KERNEL_REGISTRY_H_
+#ifndef TENSORFLOW_PYTHON_UTIL_KERNEL_REGISTRY_H_
+#define TENSORFLOW_PYTHON_UTIL_KERNEL_REGISTRY_H_
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/platform/types.h"
@@ -31,4 +31,4 @@ string TryFindKernelClass(const string& serialized_node_def);
 }  // namespace swig
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_PYTHON_UTIL_KERNEL_REGISTRY_H_
+#endif  // TENSORFLOW_PYTHON_UTIL_KERNEL_REGISTRY_H_
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index f5802d9359128b5ecc9b9506ee9a9a21cfc19ef7..23c2c48f4b5a165bd6e356a6243b234619af1c4c 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -47,6 +47,30 @@ def _sorted(dict_):
     raise TypeError("nest only supports dicts with sortable keys.")
 
 
+def _is_namedtuple(instance, strict=False):
+  """Returns True iff `instance` is a `namedtuple`.
+
+  Args:
+    instance: An instance of a Python object.
+    strict: If True, `instance` is considered to be a `namedtuple` only if
+        it is a "plain" namedtuple. For instance, a class inheriting
+        from a `namedtuple` will be considered to be a `namedtuple`
+        iff `strict=False`.
+
+  Returns:
+    True if `instance` is a `namedtuple`.
+  """
+  # Attemp to limit the test to plain namedtuple (not stuff inheriting from it).
+  if not isinstance(instance, tuple):
+    return False
+  if strict and instance.__class__.__base__ != tuple:
+    return False
+  return (
+      hasattr(instance, "_fields") and
+      isinstance(instance._fields, _collections.Sequence) and
+      all(isinstance(f, _six.string_types) for f in instance._fields))
+
+
 def _sequence_like(instance, args):
   """Converts the sequence `args` to the same type as `instance`.
 
@@ -66,11 +90,7 @@ def _sequence_like(instance, args):
     # corresponding `OrderedDict` to pack it back).
     result = dict(zip(_sorted(instance), args))
     return type(instance)((key, result[key]) for key in _six.iterkeys(instance))
-  elif (isinstance(instance, tuple) and
-        hasattr(instance, "_fields") and
-        isinstance(instance._fields, _collections.Sequence) and
-        all(isinstance(f, _six.string_types) for f in instance._fields)):
-    # This is a namedtuple
+  elif _is_namedtuple(instance):
     return type(instance)(*args)
   else:
     # Not a namedtuple
@@ -135,8 +155,37 @@ def flatten(nest):
   return _pywrap_tensorflow.Flatten(nest)
 
 
+def _same_namedtuples(nest1, nest2):
+  """Returns True if the two namedtuples have the same name and fields."""
+  if nest1._fields != nest2._fields:
+    return False
+  if nest1.__class__.__name__ != nest2.__class__.__name__:
+    return False
+  return True
+
+
 def _recursive_assert_same_structure(nest1, nest2, check_types):
-  """Helper function for `assert_same_structure`."""
+  """Helper function for `assert_same_structure`.
+
+  See `assert_same_structure` for further information about namedtuples.
+
+  Args:
+    nest1: An arbitrarily nested structure.
+    nest2: An arbitrarily nested structure.
+    check_types: If `True` (default) types of sequences are checked as
+        well, including the keys of dictionaries. If set to `False`, for example
+        a list and a tuple of objects will look the same if they have the same
+        size. Note that namedtuples with identical name and fields are always
+        considered to have the same shallow structure.
+
+  Returns:
+    True if `nest1` and `nest2` have the same structure.
+
+  Raises:
+    ValueError: If the two structure don't have the same nested structre.
+    TypeError: If the two structure don't have the same sequence type.
+    ValueError: If the two dictionaries don't have the same set of keys.
+  """
   is_sequence_nest1 = is_sequence(nest1)
   if is_sequence_nest1 != is_sequence(nest2):
     raise ValueError(
@@ -149,11 +198,21 @@ def _recursive_assert_same_structure(nest1, nest2, check_types):
   if check_types:
     type_nest1 = type(nest1)
     type_nest2 = type(nest2)
-    if type_nest1 != type_nest2:
-      raise TypeError(
-          "The two structures don't have the same sequence type. First "
-          "structure has type %s, while second structure has type %s."
-          % (type_nest1, type_nest2))
+
+    # Duck-typing means that nest should be fine with two different namedtuples
+    # with identical name and fields.
+    if _is_namedtuple(nest1, True) and _is_namedtuple(nest2, True):
+      if not _same_namedtuples(nest1, nest2):
+        raise TypeError(
+            "The two namedtuples don't have the same sequence type. First "
+            "structure has type %s, while second structure has type %s."
+            % (type_nest1, type_nest2))
+    else:
+      if type_nest1 != type_nest2:
+        raise TypeError(
+            "The two structures don't have the same sequence type. First "
+            "structure has type %s, while second structure has type %s."
+            % (type_nest1, type_nest2))
 
     if isinstance(nest1, dict):
       keys1 = set(_six.iterkeys(nest1))
@@ -173,13 +232,24 @@ def _recursive_assert_same_structure(nest1, nest2, check_types):
 def assert_same_structure(nest1, nest2, check_types=True):
   """Asserts that two structures are nested in the same way.
 
+  Note that namedtuples with identical name and fields are always considered
+  to have the same shallow structure (even with `check_types=True`).
+  For intance, this code will print `True`:
+
+  ```python
+  def nt(a, b):
+    return collections.namedtuple('foo', 'a b')(a, b)
+  print(assert_same_structure(nt(0, 1), nt(2, 3)))
+  ```
+
   Args:
     nest1: an arbitrarily nested structure.
     nest2: an arbitrarily nested structure.
     check_types: if `True` (default) types of sequences are checked as
         well, including the keys of dictionaries. If set to `False`, for example
         a list and a tuple of objects will look the same if they have the same
-        size.
+        size. Note that namedtuples with identical name and fields are always
+        considered to have the same shallow structure.
 
   Raises:
     ValueError: If the two structures do not have the same number of elements or
@@ -349,6 +419,8 @@ def map_structure(func, *structure, **check_types_dict):
       `True` (default) the types of iterables within the structures have to be
       same (e.g. `map_structure(func, [1], (1,))` raises a `TypeError`
       exception). To allow this set this argument to `False`.
+      Note that namedtuples with identical name and fields are always
+      considered to have the same shallow structure.
 
   Returns:
     A new structure with the same arity as `structure`, whose values correspond
@@ -425,7 +497,9 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
     shallow_tree: an arbitrarily nested structure.
     input_tree: an arbitrarily nested structure.
     check_types: if `True` (default) the sequence types of `shallow_tree` and
-      `input_tree` have to be the same.
+      `input_tree` have to be the same. Note that even with check_types==True,
+      this function will consider two different namedtuple classes with the same
+      name and _fields attribute to be the same class.
 
   Raises:
     TypeError: If `shallow_tree` is a sequence but `input_tree` is not.
@@ -441,10 +515,21 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
           "Input has type: %s." % type(input_tree))
 
     if check_types and not isinstance(input_tree, type(shallow_tree)):
-      raise TypeError(
-          "The two structures don't have the same sequence type. Input "
-          "structure has type %s, while shallow structure has type %s."
-          % (type(input_tree), type(shallow_tree)))
+      # Duck-typing means that nest should be fine with two different
+      # namedtuples with identical name and fields.
+      shallow_is_namedtuple = _is_namedtuple(shallow_tree, False)
+      input_is_namedtuple = _is_namedtuple(input_tree, False)
+      if shallow_is_namedtuple and input_is_namedtuple:
+        if not _same_namedtuples(shallow_tree, input_tree):
+          raise TypeError(
+              "The two namedtuples don't have the same sequence type. Input "
+              "structure has type %s, while shallow structure has type %s."
+              % (type(input_tree), type(shallow_tree)))
+      else:
+        raise TypeError(
+            "The two structures don't have the same sequence type. Input "
+            "structure has type %s, while shallow structure has type %s."
+            % (type(input_tree), type(shallow_tree)))
 
     if len(input_tree) != len(shallow_tree):
       raise ValueError(
@@ -456,12 +541,12 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
       if set(input_tree) != set(shallow_tree):
         raise ValueError(
             "The two structures don't have the same keys. Input "
-            "structure has keys %s, while shallow structure has keys %s."
-            % (list(_six.iterkeys(input_tree)),
-               list(_six.iterkeys(shallow_tree))))
+            "structure has keys %s, while shallow structure has keys %s." %
+            (list(_six.iterkeys(input_tree)),
+             list(_six.iterkeys(shallow_tree))))
 
-      input_tree = list(_six.iteritems(input_tree))
-      shallow_tree = list(_six.iteritems(shallow_tree))
+      input_tree = list(sorted(_six.iteritems(input_tree)))
+      shallow_tree = list(sorted(_six.iteritems(shallow_tree)))
 
     for shallow_branch, input_branch in zip(shallow_tree, input_tree):
       assert_shallow_structure(shallow_branch, input_branch,
@@ -677,6 +762,85 @@ def get_traverse_shallow_structure(traverse_fn, structure):
   return _sequence_like(structure, level_traverse)
 
 
+def yield_flat_paths(nest):
+  """Yields paths for some nested structure.
+
+  Paths are lists of objects which can be str-converted, which may include
+  integers or other types which are used as indices in a dict.
+
+  The flat list will be in the corresponding order as if you called
+  `snt.nest.flatten` on the structure. This is handy for naming Tensors such
+  the TF scope structure matches the tuple structure.
+
+  E.g. if we have a tuple `value = Foo(a=3, b=Bar(c=23, d=42))`
+
+  ```shell
+  >>> nest.flatten(value)
+  [3, 23, 42]
+  >>> list(nest.yield_flat_paths(value))
+  [('a',), ('b', 'c'), ('b', 'd')]
+  ```
+
+  ```shell
+  >>> list(nest.yield_flat_paths({'a': [3]}))
+  [('a', 0)]
+  >>> list(nest.yield_flat_paths({'a': 3}))
+  [('a',)]
+  ```
+
+  Args:
+    nest: the value to produce a flattened paths list for.
+
+  Yields:
+    Tuples containing index or key values which form the path to a specific
+      leaf value in the nested structure.
+  """
+
+  # The _maybe_add_final_path_element function is used below in order to avoid
+  # adding trailing slashes when the sub-element recursed into is a leaf.
+  if isinstance(nest, dict):
+    for key in _sorted(nest):
+      value = nest[key]
+      for sub_path in yield_flat_paths(value):
+        yield (key,) + sub_path
+  elif _is_namedtuple(nest):
+    for key in nest._fields:
+      value = getattr(nest, key)
+      for sub_path in yield_flat_paths(value):
+        yield (key,) + sub_path
+  elif isinstance(nest, _six.string_types):
+    yield ()
+  elif isinstance(nest, _collections.Sequence):
+    for idx, value in enumerate(nest):
+      for sub_path in yield_flat_paths(value):
+        yield (idx,) + sub_path
+  else:
+    yield ()
+
+
+def flatten_with_joined_string_paths(structure, separator="/"):
+  """Returns a list of (string path, data element) tuples.
+
+  The order of tuples produced matches that of `nest.flatten`. This allows you
+  to flatten a nested structure while keeping information about where in the
+  structure each data element was located. See `nest.yield_flat_paths`
+  for more information.
+
+  Args:
+    structure: the nested structure to flatten.
+    separator: string to separate levels of hierarchy in the results, defaults
+      to '/'.
+
+  Returns:
+    A list of (string, data element) tuples.
+  """
+  flat_paths = yield_flat_paths(structure)
+  def stringify_and_join(path_elements):
+    return separator.join(str(path_element) for path_element in path_elements)
+  flat_string_paths = [stringify_and_join(path) for path in flat_paths]
+  return list(zip(flat_string_paths, flatten(structure)))
+
+
 _pywrap_tensorflow.RegisterSequenceClass(_collections.Sequence)
 
 
@@ -691,6 +855,8 @@ _allowed_symbols = [
     "flatten_up_to",
     "map_structure_up_to",
     "get_traverse_shallow_structure",
+    "yield_flat_paths",
+    "flatten_with_joined_string_paths",
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py
index 26aeaeec19b334b466f185fe765974fca61ae3b8..4439d6241ea9607b194cbb17304dbb77dc9f57a8 100644
--- a/tensorflow/python/util/nest_test.py
+++ b/tensorflow/python/util/nest_test.py
@@ -258,6 +258,36 @@ class NestTest(test.TestCase):
                                  "don't have the same set of keys"):
       nest.assert_same_structure({"a": 1}, {"b": 1})
 
+    same_name_type_0 = collections.namedtuple("same_name", ("a", "b"))
+    same_name_type_1 = collections.namedtuple("same_name", ("a", "b"))
+    nest.assert_same_structure(same_name_type_0(0, 1), same_name_type_1(2, 3))
+
+    # This assertion is expected to pass: two namedtuples with the same
+    # name and field names are considered to be identical.
+    same_name_type_2 = collections.namedtuple("same_name_1", ("x", "y"))
+    same_name_type_3 = collections.namedtuple("same_name_1", ("x", "y"))
+    nest.assert_same_structure(
+        same_name_type_0(same_name_type_2(0, 1), 2),
+        same_name_type_1(same_name_type_3(2, 3), 4))
+
+    expected_message = "The two structures don't have the same.*"
+    with self.assertRaisesRegexp(ValueError, expected_message):
+      nest.assert_same_structure(same_name_type_0(0, same_name_type_1(1, 2)),
+                                 same_name_type_1(same_name_type_0(0, 1), 2))
+
+    same_name_type_1 = collections.namedtuple("not_same_name", ("a", "b"))
+    self.assertRaises(TypeError, nest.assert_same_structure,
+                      same_name_type_0(0, 1), same_name_type_1(2, 3))
+
+    same_name_type_1 = collections.namedtuple("same_name", ("x", "y"))
+    self.assertRaises(TypeError, nest.assert_same_structure,
+                      same_name_type_0(0, 1), same_name_type_1(2, 3))
+
+    class SameNamedType1(collections.namedtuple("same_name", ("a", "b"))):
+      pass
+    self.assertRaises(TypeError, nest.assert_same_structure,
+                      same_name_type_0(0, 1), SameNamedType1(2, 3))
+
   def testMapStructure(self):
     structure1 = (((1, 2), 3), 4, (5, 6))
     structure2 = (((7, 8), 9), 10, (11, 12))
@@ -388,12 +418,26 @@ class NestTest(test.TestCase):
     inp_ab1 = {"a": (1, 1), "b": {"c": (2, 2)}}
     inp_ab2 = {"a": (1, 1), "b": {"d": (2, 2)}}
     expected_message = (
-        "The two structures don't have the same keys. Input "
-        "structure has keys \['c'\], while shallow structure has keys \['d'\].")
+        r"The two structures don't have the same keys. Input "
+        r"structure has keys \['c'\], while shallow structure has "
+        r"keys \['d'\].")
 
     with self.assertRaisesRegexp(ValueError, expected_message):
       nest.assert_shallow_structure(inp_ab2, inp_ab1)
 
+    inp_ab = collections.OrderedDict([("a", 1), ("b", (2, 3))])
+    inp_ba = collections.OrderedDict([("b", (2, 3)), ("a", 1)])
+    nest.assert_shallow_structure(inp_ab, inp_ba)
+
+    # This assertion is expected to pass: two namedtuples with the same
+    # name and field names are considered to be identical.
+    same_name_type_0 = collections.namedtuple("same_name", ("a", "b"))
+    same_name_type_1 = collections.namedtuple("same_name", ("a", "b"))
+    inp_shallow = same_name_type_0(1, 2)
+    inp_deep = same_name_type_1(1, [1, 2, 3])
+    nest.assert_shallow_structure(inp_shallow, inp_deep, check_types=False)
+    nest.assert_shallow_structure(inp_shallow, inp_deep, check_types=True)
+
   def testFlattenUpTo(self):
     # Shallow tree ends at scalar.
     input_tree = [[[2, 2], [3, 3]], [[4, 9], [5, 5]]]
@@ -438,8 +482,7 @@ class NestTest(test.TestCase):
     input_tree_flattened_as_shallow_tree = nest.flatten_up_to(shallow_tree,
                                                               input_tree)
     self.assertEqual(input_tree_flattened_as_shallow_tree, [0, 1, 2, 3, 4])
-    shallow_tree = collections.OrderedDict([("a", 0),
-                                            ("c", {"d": 3, "e": 1})])
+    shallow_tree = collections.OrderedDict([("a", 0), ("c", {"d": 3, "e": 1})])
     input_tree_flattened_as_shallow_tree = nest.flatten_up_to(shallow_tree,
                                                               input_tree)
     self.assertEqual(input_tree_flattened_as_shallow_tree,
@@ -584,6 +627,59 @@ class NestTest(test.TestCase):
         TypeError, "didn't return a depth=1 structure of bools"):
       nest.get_traverse_shallow_structure(lambda _: [1], [1])
 
+  def testYieldFlatStringPaths(self):
+    for inputs_expected in ({"inputs": [], "expected": []},
+                            {"inputs": 3, "expected": [()]},
+                            {"inputs": [3], "expected": [(0,)]},
+                            {"inputs": {"a": 3}, "expected": [("a",)]},
+                            {"inputs": {"a": {"b": 4}},
+                             "expected": [("a", "b")]},
+                            {"inputs": [{"a": 2}], "expected": [(0, "a")]},
+                            {"inputs": [{"a": [2]}], "expected": [(0, "a", 0)]},
+                            {"inputs": [{"a": [(23, 42)]}],
+                             "expected": [(0, "a", 0, 0), (0, "a", 0, 1)]},
+                            {"inputs": [{"a": ([23], 42)}],
+                             "expected": [(0, "a", 0, 0), (0, "a", 1)]},
+                            {"inputs": {"a": {"a": 2}, "c": [[[4]]]},
+                             "expected": [("a", "a"), ("c", 0, 0, 0)]},
+                            {"inputs": {"0": [{"1": 23}]},
+                             "expected": [("0", 0, "1")]}):
+      inputs = inputs_expected["inputs"]
+      expected = inputs_expected["expected"]
+      self.assertEqual(list(nest.yield_flat_paths(inputs)), expected)
+
+  def testFlattenWithStringPaths(self):
+    for inputs_expected in (
+        {"inputs": [], "expected": []},
+        {"inputs": [23, "42"], "expected": [("0", 23), ("1", "42")]},
+        {"inputs": [[[[108]]]], "expected": [("0/0/0/0", 108)]}):
+      inputs = inputs_expected["inputs"]
+      expected = inputs_expected["expected"]
+      self.assertEqual(
+          nest.flatten_with_joined_string_paths(inputs, separator="/"),
+          expected)
+
+  # Need a separate test for namedtuple as we can't declare tuple definitions
+  # in the @parameterized arguments.
+  def testFlattenNamedTuple(self):
+    # pylint: disable=invalid-name
+    Foo = collections.namedtuple("Foo", ["a", "b"])
+    Bar = collections.namedtuple("Bar", ["c", "d"])
+    # pylint: enable=invalid-name
+    test_cases = [
+        (Foo(a=3, b=Bar(c=23, d=42)),
+         [("a", 3), ("b/c", 23), ("b/d", 42)]),
+        (Foo(a=Bar(c=23, d=42), b=Bar(c=0, d="something")),
+         [("a/c", 23), ("a/d", 42), ("b/c", 0), ("b/d", "something")]),
+        (Bar(c=42, d=43),
+         [("c", 42), ("d", 43)]),
+        (Bar(c=[42], d=43),
+         [("c/0", 42), ("d", 43)]),
+    ]
+    for inputs, expected in test_cases:
+      self.assertEqual(
+          list(nest.flatten_with_joined_string_paths(inputs)), expected)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/util/py_checkpoint_reader.i b/tensorflow/python/util/py_checkpoint_reader.i
index 0cd095d9d947f5cf76adaf83dc16272c4374573e..8004898cbcbce7ce593ce35efdc6493e052468bd 100644
--- a/tensorflow/python/util/py_checkpoint_reader.i
+++ b/tensorflow/python/util/py_checkpoint_reader.i
@@ -164,6 +164,8 @@ def NewCheckpointReader(filepattern):
   with errors.raise_exception_on_not_ok_status() as status:
     from tensorflow.python.util import compat
     return CheckpointReader(compat.as_bytes(filepattern), status)
+
+NewCheckpointReader._tf_api_names = ['train.NewCheckpointReader']
 %}
 
 %include "tensorflow/c/checkpoint_reader.h"
diff --git a/tensorflow/python/util/stat_summarizer.i b/tensorflow/python/util/stat_summarizer.i
index 80739195872a056e7a5443dfb81ab1440300dbff..6aeaa0e31b9b48f7e6705ab7146828cc0e0e5e08 100644
--- a/tensorflow/python/util/stat_summarizer.i
+++ b/tensorflow/python/util/stat_summarizer.i
@@ -27,8 +27,8 @@ limitations under the License.
 
 %ignoreall
 
-%unignore NewStatSummarizer;
-%unignore DeleteStatSummarizer;
+%unignore _NewStatSummarizer;
+%unignore _DeleteStatSummarizer;
 %unignore tensorflow;
 %unignore tensorflow::StatSummarizer;
 %unignore tensorflow::StatSummarizer::StatSummarizer;
@@ -43,21 +43,20 @@ limitations under the License.
 
 // TODO(ashankar): Remove the unused argument from the API.
 %{
-tensorflow::StatSummarizer* NewStatSummarizer(
+tensorflow::StatSummarizer* _NewStatSummarizer(
       const string& unused) {
   return new tensorflow::StatSummarizer(tensorflow::StatSummarizerOptions());
 }
 %}
 
-
 %{
-void DeleteStatSummarizer(tensorflow::StatSummarizer* ss) {
+void _DeleteStatSummarizer(tensorflow::StatSummarizer* ss) {
   delete ss;
 }
 %}
 
-tensorflow::StatSummarizer* NewStatSummarizer(const string& unused);
-void DeleteStatSummarizer(tensorflow::StatSummarizer* ss);
+tensorflow::StatSummarizer* _NewStatSummarizer(const string& unused);
+void _DeleteStatSummarizer(tensorflow::StatSummarizer* ss);
 
 %extend tensorflow::StatSummarizer {
   void ProcessStepStatsStr(const string& step_stats_str) {
@@ -77,3 +76,21 @@ void DeleteStatSummarizer(tensorflow::StatSummarizer* ss);
 
 %include "tensorflow/core/util/stat_summarizer.h"
 %unignoreall
+
+%insert("python") %{
+
+# Wrapping NewStatSummarizer and DeletStatSummarizer because
+# SWIG-generated functions are built-in functions and do not support
+# setting _tf_api_names attribute.
+
+def NewStatSummarizer(unused):
+  return _NewStatSummarizer(unused)
+
+def DeleteStatSummarizer(stat_summarizer):
+  _DeleteStatSummarizer(stat_summarizer)
+
+NewStatSummarizer._tf_api_names = ["contrib.stat_summarizer.NewStatSummarizer"]
+DeleteStatSummarizer._tf_api_names = [
+    "contrib.stat_summarizer.DeleteStatSummarizer"]
+StatSummarizer._tf_api_names = ["contrib.stat_summarizer.StatSummarizer"]
+%}
diff --git a/tensorflow/python/util/tf_decorator.py b/tensorflow/python/util/tf_decorator.py
index 780fcba64f934e25ffc9cd24f57369de758d5e45..3d837a40449ece056c154e1b09636a8885047035 100644
--- a/tensorflow/python/util/tf_decorator.py
+++ b/tensorflow/python/util/tf_decorator.py
@@ -89,9 +89,14 @@ def make_decorator(target,
   decorator = TFDecorator(decorator_name, target, decorator_doc,
                           decorator_argspec)
   setattr(decorator_func, '_tf_decorator', decorator)
-  decorator_func.__name__ = target.__name__
-  decorator_func.__module__ = target.__module__
-  decorator_func.__doc__ = decorator.__doc__
+  # Objects that are callables (e.g., a functools.partial object) may not have
+  # the following attributes.
+  if hasattr(target, '__name__'):
+    decorator_func.__name__ = target.__name__
+  if hasattr(target, '__module__'):
+    decorator_func.__module__ = target.__module__
+  if hasattr(target, '__doc__'):
+    decorator_func.__doc__ = decorator.__doc__
   decorator_func.__wrapped__ = target
   return decorator_func
 
@@ -139,10 +144,11 @@ class TFDecorator(object):
     self._decorator_name = decorator_name
     self._decorator_doc = decorator_doc
     self._decorator_argspec = decorator_argspec
-    self.__name__ = target.__name__
+    if hasattr(target, '__name__'):
+      self.__name__ = target.__name__
     if self._decorator_doc:
       self.__doc__ = self._decorator_doc
-    elif target.__doc__:
+    elif hasattr(target, '__doc__') and target.__doc__:
       self.__doc__ = target.__doc__
     else:
       self.__doc__ = ''
diff --git a/tensorflow/python/util/tf_decorator_test.py b/tensorflow/python/util/tf_decorator_test.py
index 3f6a10b44081db2f5ce0d8ffb0333cd3c76fc269..0f9712c987d442358ecb4f81f46ef0898e380b01 100644
--- a/tensorflow/python/util/tf_decorator_test.py
+++ b/tensorflow/python/util/tf_decorator_test.py
@@ -19,6 +19,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import tf_decorator
@@ -195,6 +197,23 @@ class TfMakeDecoratorTest(test.TestCase):
     decorator = getattr(decorated, '_tf_decorator')
     self.assertEqual('test_decorator_name', decorator.decorator_name)
 
+  def testCompatibleWithNamelessCallables(self):
+
+    class Callable(object):
+
+      def __call__(self):
+        pass
+
+    callable_object = Callable()
+    # Smoke test: This should not raise an exception, even though
+    # `callable_object` does not have a `__name__` attribute.
+    _ = tf_decorator.make_decorator(callable_object, test_wrapper)
+
+    partial = functools.partial(test_function, x=1)
+    # Smoke test: This should not raise an exception, even though `partial` does
+    # not have `__name__`, `__module__`, and `__doc__` attributes.
+    _ = tf_decorator.make_decorator(partial, test_wrapper)
+
 
 class TfDecoratorUnwrapTest(test.TestCase):
 
diff --git a/tensorflow/python/util/tf_inspect.py b/tensorflow/python/util/tf_inspect.py
index 9ed125704b1cf2ced585db0b169a184d27e1ad72..c4168f7b1ac80976a957e96c79c72fe3b288d622 100644
--- a/tensorflow/python/util/tf_inspect.py
+++ b/tensorflow/python/util/tf_inspect.py
@@ -45,6 +45,26 @@ def getargspec(object):  # pylint: disable=redefined-builtin
                if d.decorator_argspec is not None), _inspect.getargspec(target))
 
 
+def getfullargspec(obj):  # pylint: disable=redefined-builtin
+  """TFDecorator-aware replacement for inspect.getfullargspec and fallback to
+  inspect.getargspec in Python 2.
+
+  Args:
+    obj: A callable, possibly decorated.
+
+  Returns:
+    The `FullArgSpec` (`ArgSpec` in Python 2) that describes the signature of
+    the outermost decorator that changes the callable's signature. If the
+    callable is not decorated, `inspect.getfullargspec()`
+    (`inspect.getargspec()` in Python 2) will be called directly on the
+    callable.
+  """
+  spec_fn = getattr(_inspect, 'getfullargspec', getattr(_inspect, 'getargspec'))
+  decorators, target = tf_decorator.unwrap(obj)
+  return next((d.decorator_argspec for d in decorators
+               if d.decorator_argspec is not None), spec_fn(target))
+
+
 def getcallargs(func, *positional, **named):
   """TFDecorator-aware replacement for inspect.getcallargs.
 
@@ -97,7 +117,16 @@ def getdoc(object):  # pylint: disable=redefined-builtin
 
 def getfile(object):  # pylint: disable=redefined-builtin
   """TFDecorator-aware replacement for inspect.getfile."""
-  return _inspect.getfile(tf_decorator.unwrap(object)[1])
+  unwrapped_object = tf_decorator.unwrap(object)[1]
+
+  # Work around for the case when object is a stack frame
+  # and only .pyc files are used. In this case, getfile
+  # might return incorrect path. So, we get the path from f_globals
+  # instead.
+  if (hasattr(unwrapped_object, 'f_globals') and
+      '__file__' in unwrapped_object.f_globals):
+    return unwrapped_object.f_globals['__file__']
+  return _inspect.getfile(unwrapped_object)
 
 
 def getmembers(object, predicate=None):  # pylint: disable=redefined-builtin
diff --git a/tensorflow/python/util/tfprof.i b/tensorflow/python/util/tfprof.i
index 8d11cdfd58b47a5cdd6ec5c65b30df5621e59768..06f12631fa7ef04b24d469be00ba181ed9ac4e13 100644
--- a/tensorflow/python/util/tfprof.i
+++ b/tensorflow/python/util/tfprof.i
@@ -47,6 +47,7 @@ using tensorflow::int64;
 %unignore tensorflow::tfprof::ProfilerFromFile;
 %unignore tensorflow::tfprof::DeleteProfiler;
 %unignore tensorflow::tfprof::AddStep;
+%unignore tensorflow::tfprof::SerializeToString;
 %unignore tensorflow::tfprof::WriteProfile;
 %unignore tensorflow::tfprof::Profile;
 
diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc
index c3d7611ad43b05f510481925fbfe1f930cf95ff8..a41fa7df253bcf4bce280574b89ed0dda8330521 100644
--- a/tensorflow/python/util/util.cc
+++ b/tensorflow/python/util/util.cc
@@ -29,7 +29,7 @@ bool WarnedThatSetIsNotSequence = false;
 
 // Returns 1 if `o` is considered a sequence for the purposes of Flatten().
 // Returns 0 otherwise.
-// Returns -1 if an error occured.
+// Returns -1 if an error occurred.
 int IsSequenceHelper(PyObject* o) {
   if (PyDict_Check(o)) return true;
   if (PySet_Check(o) && !WarnedThatSetIsNotSequence) {
diff --git a/tensorflow/python/util/util.h b/tensorflow/python/util/util.h
index 493d26b497d714b318a345c96462d2d01de789c9..2af71dc753760e7efaf28cc500d5296a31957a04 100644
--- a/tensorflow/python/util/util.h
+++ b/tensorflow/python/util/util.h
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 // Functions for getting information about kernels registered in the binary.
-#ifndef THIRD_PARTY_TENSORFLOW_PYTHON_UTIL_UTIL_H_
-#define THIRD_PARTY_TENSORFLOW_PYTHON_UTIL_UTIL_H_
+#ifndef TENSORFLOW_PYTHON_UTIL_UTIL_H_
+#define TENSORFLOW_PYTHON_UTIL_UTIL_H_
 
 #include <Python.h>
 
@@ -71,4 +71,4 @@ void RegisterSequenceClass(PyObject* sequence_class);
 }  // namespace swig
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_PYTHON_UTIL_UTIL_H_
+#endif  // TENSORFLOW_PYTHON_UTIL_UTIL_H_
diff --git a/tensorflow/stream_executor/blas.h b/tensorflow/stream_executor/blas.h
index eb1b19c5d963d56c6175251a54e2ab5072a01760..072f08554688276a05d9be85718de8750bd874c2 100644
--- a/tensorflow/stream_executor/blas.h
+++ b/tensorflow/stream_executor/blas.h
@@ -30,8 +30,8 @@ limitations under the License.
 //  Stream stream{stream_exec};
 //  stream
 //    .Init()
-//    .ThenBlasAxpy(1024, 5.5, x, 1, &y, 1)
-//    .BlockHostUntilDone();
+//    .ThenBlasAxpy(1024, 5.5, x, 1, &y, 1);
+//  SE_CHECK_OK(stream.BlockHostUntilDone());
 //
 // By using stream operations in this manner the user can easily intermix custom
 // kernel launches (via StreamExecutor::ThenLaunch()) with these pre-canned BLAS
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index cb2b06d47cd8ccf82e9df81d63049915b9b47582..44a3a745ad86dc24f632e4a36691fba06171c9fb 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -36,6 +36,7 @@ limitations under the License.
 #include <assert.h>
 #include <complex>
 
+#include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
 #include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
 #include "tensorflow/stream_executor/cuda/cuda_helpers.h"
@@ -268,6 +269,11 @@ PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasSgemmEx)
 PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasGemmEx)
 #endif
 
+#if CUDA_VERSION >= 9000
+PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasGetMathMode)
+PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasSetMathMode)
+#endif
+
 }  // namespace wrap
 
 static string ToString(cublasStatus_t status) {
@@ -299,6 +305,18 @@ static string ToString(cublasStatus_t status) {
   }
 }
 
+// Decide whether to enable TENSOR_OP_MATH
+static bool TensorOpMathEnabled() {
+  static bool is_enabled = [] {
+    bool is_disabled;
+    TF_CHECK_OK(
+        tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUBLAS_TENSOR_OP_MATH",
+                                       /*default_val=*/false, &is_disabled));
+    return !is_disabled;
+  }();
+  return is_enabled;
+}
+
 // cuBLAS has interfaces that permit pointers to be passed from either the host
 // memory space or the device memory space; however, you must instruct it as to
 // which address space those pointers are in with cublasSetPointerMode.
@@ -360,6 +378,65 @@ class ScopedCublasPointerMode {
   bool ok_;                       // Whether the change was successful.
 };
 
+#if CUDA_VERSION >= 9000
+// cuBLAS has interfaces that permit computations to use the Volta hardware.
+// This must be enabled via the cublasGet/SetMathMode APIs.
+//
+// This helper sets the cuBLAS math mode to a desired value for a cuBLAS call
+// you are about to perform in a given scope.
+//
+// The prior cuBLAS math mode is retained and restored when this object goes
+// out of scope.
+class ScopedCublasMathMode {
+ public:
+  // Note that, because the setting of the cublas math mode is fallible,
+  // construction of this scoped datatype must be paired with a call to
+  // Init().
+  //
+  // Parameters:
+  //  handle: The cublas library handle to act upon in setting the math mode.
+  explicit ScopedCublasMathMode(CUDAExecutor *parent, cublasHandle_t handle)
+      : parent_(parent), handle_(handle), ok_(false) {}
+
+  // Attempts the switch to the requested scoped math mode, new_mode.
+  //
+  // Note that when false is returned, an appropriate error has already been
+  // logged.
+  bool Init(cublasMath_t new_mode) {
+    cublasStatus_t ret = wrap::cublasGetMathMode(parent_, handle_, &old_mode_);
+    if (ret != CUBLAS_STATUS_SUCCESS) {
+      LOG(ERROR) << "failed to get old cublas math mode: " << ToString(ret);
+      return ok_ = false;
+    }
+
+    ret = wrap::cublasSetMathMode(parent_, handle_, new_mode);
+    if (ret != CUBLAS_STATUS_SUCCESS) {
+      LOG(ERROR) << "failed to set new cublas math mode: " << ToString(ret);
+      return ok_ = false;
+    }
+    return ok_ = true;
+  }
+
+  // Switches back to the prior math mode, if the switch operation was
+  // successful in the first place.
+  ~ScopedCublasMathMode() {
+    if (ok_) {
+      cublasStatus_t ret = wrap::cublasSetMathMode(parent_, handle_, old_mode_);
+      if (ret != CUBLAS_STATUS_SUCCESS) {
+        LOG(ERROR) << "failed to set former cublas math mode: "
+                   << ToString(ret);
+      }
+    }
+  }
+
+ private:
+  CUDAExecutor *parent_;   // Executor establishing this math mode for.
+  cublasHandle_t handle_;  // Handle to the cuBLAS instance of interest.
+  cublasMath_t old_mode_;  // Prior cuBLAS math mode, to be restored.
+  bool ok_;                // Whether the change was successful.
+};
+#endif  // CUDA_VERSION >= 9000
+
 bool CUDABlas::Init() {
   cublasStatus_t ret = wrap::cublasCreate(parent_, &blas_);
   if (ret != CUBLAS_STATUS_SUCCESS) {
@@ -532,7 +609,7 @@ cudaDataType_t CUDAComputationType(blas::ComputationType ty) {
 template <typename FuncT, typename... Args>
 bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
                                   bool pointer_mode_host, bool err_on_failure,
-                                  Args... args) {
+                                  bool use_tensor_op_math, Args... args) {
   mutex_lock lock{mu_};
 
   CHECK(blas_ != nullptr);
@@ -545,7 +622,14 @@ bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
                                            : CUBLAS_POINTER_MODE_DEVICE)) {
     return false;
   }
-
+#if CUDA_VERSION >= 9000
+  ScopedCublasMathMode math_mode{parent_, blas_};
+  if (use_tensor_op_math) {
+    if (!math_mode.Init(CUBLAS_TENSOR_OP_MATH)) {
+      return false;
+    }
+  }
+#endif
   cublasStatus_t ret = cublas_func(parent_, blas_, args...);
   if (err_on_failure && ret != CUBLAS_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to run cuBLAS routine " << cublas_func.kName << ": "
@@ -1762,14 +1846,26 @@ bool CUDABlas::DoBlasGemm(
                       "precondition violation";
     }
   }
-  // TODO(sesse): Consider supporting the Hgemm interface, which uses half
-  // calculations internally (faster on newer devices, such as Pascal and TX1,
-  // but less precise).
-  return DoBlasInternal(
+
+  bool use_tensor_ops = false;
+#if CUDA_VERSION >= 9000
+  int cc_major, cc_minor;
+  stream->parent()->GetDeviceDescription().cuda_compute_capability(&cc_major,
+                                                                   &cc_minor);
+
+  // GPUs < sm_70 don't support Volta hardware.
+  if (cc_major >= 7 && TensorOpMathEnabled()) {
+    use_tensor_ops = true;
+  }
+#endif
+
+  return DoBlasInternalImpl(
       wrap::cublasSgemmEx, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
-      CUDAMemory(a), SE_CUDA_DATA_HALF, lda, CUDAMemory(b), SE_CUDA_DATA_HALF,
-      ldb, &beta, CUDAMemoryMutable(c), SE_CUDA_DATA_HALF, ldc);
+      true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa),
+      CUDABlasTranspose(transb), m, n, k, &alpha, CUDAMemory(a),
+      SE_CUDA_DATA_HALF, lda, CUDAMemory(b), SE_CUDA_DATA_HALF, ldb, &beta,
+      CUDAMemoryMutable(c), SE_CUDA_DATA_HALF, ldc);
+
 #else
   LOG(ERROR) << "fp16 sgemm is not implemented in this cuBLAS version "
              << "(need at least CUDA 7.5)";
@@ -2031,6 +2127,26 @@ bool CUDABlas::DoBlasGemmWithProfilingImpl(
   return result;
 }
 
+static bool UsesTensorOps(blas::AlgorithmType algo) {
+#if CUDA_VERSION >= 9000
+  cublasGemmAlgo_t cublas_algo = static_cast<cublasGemmAlgo_t>(algo);
+  return cublas_algo >= CUBLAS_GEMM_DEFAULT_TENSOR_OP;
+#else
+  return false;
+#endif
+}
+
+template <typename InType>
+static bool TensorOpsAvailable(int cc_major) {
+#if CUDA_VERSION >= 9000
+  if (cc_major >= 7 && TensorOpMathEnabled() &&
+      std::is_same<InType, Eigen::half>::value) {
+    return true;
+  }
+#endif
+  return false;
+}
+
 template <typename InT, typename OutT, typename CompT>
 bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
     Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
@@ -2049,6 +2165,10 @@ bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
     return false;
   }
 
+  if (UsesTensorOps(algorithm) && !TensorOpsAvailable<InT>(cc_major)) {
+    return false;
+  }
+
   struct TimerDeleter {
     void operator()(CUDATimer *t) {
       t->Destroy();
@@ -2098,10 +2218,19 @@ bool CUDABlas::GetBlasGemmAlgorithms(
 // still return the out_algorithms. Caller needs to make sure that in this case,
 // the returned vector is empty.
 #if CUDA_VERSION >= 8000
-  for (cublasGemmAlgo_t algo :
-       {CUBLAS_GEMM_DFALT, CUBLAS_GEMM_ALGO0, CUBLAS_GEMM_ALGO1,
-        CUBLAS_GEMM_ALGO2, CUBLAS_GEMM_ALGO3, CUBLAS_GEMM_ALGO4,
-        CUBLAS_GEMM_ALGO5, CUBLAS_GEMM_ALGO6, CUBLAS_GEMM_ALGO7}) {
+  for (cublasGemmAlgo_t algo : {
+         CUBLAS_GEMM_DFALT, CUBLAS_GEMM_ALGO0, CUBLAS_GEMM_ALGO1,
+             CUBLAS_GEMM_ALGO2, CUBLAS_GEMM_ALGO3, CUBLAS_GEMM_ALGO4,
+             CUBLAS_GEMM_ALGO5, CUBLAS_GEMM_ALGO6, CUBLAS_GEMM_ALGO7,
+#if CUDA_VERSION >= 9000
+             CUBLAS_GEMM_ALGO8, CUBLAS_GEMM_ALGO9, CUBLAS_GEMM_ALGO10,
+             CUBLAS_GEMM_ALGO11, CUBLAS_GEMM_ALGO12, CUBLAS_GEMM_ALGO13,
+             CUBLAS_GEMM_ALGO14, CUBLAS_GEMM_ALGO15, CUBLAS_GEMM_ALGO16,
+             CUBLAS_GEMM_ALGO17, CUBLAS_GEMM_DFALT_TENSOR_OP,
+             CUBLAS_GEMM_ALGO0_TENSOR_OP, CUBLAS_GEMM_ALGO1_TENSOR_OP,
+             CUBLAS_GEMM_ALGO2_TENSOR_OP
+#endif
+       }) {
     out_algorithms->push_back(algo);
   }
 #endif
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h
index 80cda971173fe34658f3403f1354babbd02e6ff9..deb211c04bcaa9e98ee04c5e9066a2a13092cb06 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.h
+++ b/tensorflow/stream_executor/cuda/cuda_blas.h
@@ -84,7 +84,7 @@ class CUDABlas : public blas::BlasSupport {
   template <typename FuncT, typename... Args>
   bool DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
                           bool pointer_mode_host, bool err_on_failure,
-                          Args... args);
+                          bool use_tensor_op_math, Args... args);
 
   // Convenience functions that call DoBlasInternalImpl with different values
   // for err_on_failure.
@@ -92,13 +92,17 @@ class CUDABlas : public blas::BlasSupport {
   bool DoBlasInternal(FuncT cublas_func, Stream *stream, bool pointer_mode_host,
                       Args... args) {
     return DoBlasInternalImpl(cublas_func, stream, pointer_mode_host,
-                              /*err_on_failure=*/true, args...);
+                              /*err_on_failure=*/true, /*use_tensor_ops=*/false,
+                              args...);
   }
   template <typename FuncT, typename... Args>
   bool DoBlasInternalFailureOK(FuncT cublas_func, Stream *stream,
                                bool pointer_mode_host, Args... args) {
+    // Tensor ops are hard-coded off in this path, but can still be enabled with
+    // a specific algorithm choice as in DoBlasGemmWithAlgorithmImpl().
     return DoBlasInternalImpl(cublas_func, stream, pointer_mode_host,
-                              /*err_on_failure=*/false, args...);
+                              /*err_on_failure=*/false,
+                              /*use_tensor_ops=*/false, args...);
   }
 
   // A helper function to implement DoBlasGemmBatched interfaces for generic
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
index 00506fa54be93b54966a5b374b02286b7e784776..933c103f524ef37f840c9e13b9e4024289e274c1 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
@@ -232,7 +232,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
       result = StringToDriverVersion(version);
     }
 #else
-#if !defined(PLATFORM_WINDOWS)
+#if !defined(PLATFORM_WINDOWS) && !defined(ANDROID_TEGRA)
   // Callback used when iterating through DSOs. Looks for the driver-interfacing
   // DSO and yields its version number into the callback data, when found.
   auto iterate_phdr =
@@ -366,8 +366,8 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
   contents[kContentsSize - 1] = '\0';
 
   if (retcode != 0) {
-    LOG(INFO) << "driver version file contents: \"\"\"" << contents.begin()
-              << "\"\"\"";
+    VLOG(1) << "driver version file contents: \"\"\"" << contents.begin()
+            << "\"\"\"";
     fclose(driver_version_file);
     return FindKernelModuleVersion(contents.begin());
   }
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index d78362d4fbac3a6058743383d832bfc3df133a2f..b6abd42767f7b7048dce30d2b7a5b524513ff79c 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -109,6 +109,24 @@ string ToString(cudnnStatus_t status) {
   }
 }
 
+template <typename T>
+cudnnDataType_t GetCudnnDataType();
+
+template <>
+cudnnDataType_t GetCudnnDataType<double>() {
+  return CUDNN_DATA_DOUBLE;
+}
+
+template <>
+cudnnDataType_t GetCudnnDataType<float>() {
+  return CUDNN_DATA_FLOAT;
+}
+
+template <>
+cudnnDataType_t GetCudnnDataType<Eigen::half>() {
+  return CUDNN_DATA_HALF;
+}
+
 namespace wrap {
 
 static port::ThreadPool* InitCudnnThreadpool() {
@@ -559,10 +577,11 @@ class ScopedFilterDescriptor {
 // A helper function to decide whether to enable the TENSOR_OP_MATH math type
 static bool TensorOpMathEnabled() {
   static bool is_enabled = [] {
-    bool ret;
-    TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DISABLE_TENSOR_OP_MATH",
-                                               /*default=*/false, &ret));
-    return !ret;
+    bool is_disabled;
+    TF_CHECK_OK(
+        tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUDNN_TENSOR_OP_MATH",
+                                       /*default_val=*/false, &is_disabled));
+    return !is_disabled;
   }();
   return is_enabled;
 }
@@ -2105,7 +2124,6 @@ inline cudnnConvolutionFwdAlgo_t GetCudnnConvolutionForwardAlgo(
 
 dnn::AlgorithmDesc GetCudnnConvolutionForwardAlgorithm(
     Stream* stream, CUDAExecutor* parent, void* dnn_handle,
-    int cudnn_type,  // Actually cudnnDataType_t.
     const dnn::AlgorithmConfig& algorithm_config, bool is_profiling,
     const ScopedTensorDescriptor& input_nd,
     const ScopedFilterDescriptor& filter,
@@ -2263,8 +2281,8 @@ cudnnDataType_t GetConvComputeType<Eigen::half>() {
 
 template <class T>
 bool CudnnSupport::DoConvolveImpl(
-    Stream* stream, int cudnn_type,  // Actually cudnnDataType_t.
-    const BatchDescriptor& batch_descriptor, const DeviceMemory<T>& input_data,
+    Stream* stream, const BatchDescriptor& batch_descriptor,
+    const DeviceMemory<T>& input_data,
     const FilterDescriptor& filter_descriptor,
     const DeviceMemory<T>& filter_data,
     const ConvolutionDescriptor& convolution_descriptor,
@@ -2272,12 +2290,11 @@ bool CudnnSupport::DoConvolveImpl(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  ScopedTensorDescriptor input_nd{parent_, batch_descriptor,
-      static_cast<cudnnDataType_t>(cudnn_type)};
-  ScopedTensorDescriptor output_nd{parent_, output_descriptor,
-      static_cast<cudnnDataType_t>(cudnn_type)};
+  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
+  ScopedTensorDescriptor input_nd{parent_, batch_descriptor, cudnn_type};
+  ScopedTensorDescriptor output_nd{parent_, output_descriptor, cudnn_type};
   ScopedFilterDescriptor filter{parent_, filter_descriptor, batch_descriptor,
-      static_cast<cudnnDataType_t>(cudnn_type)};
+                                cudnn_type};
   ScopedConvolutionDescriptor conv{parent_, convolution_descriptor,
                                    GetConvComputeType<T>()};
 
@@ -2504,9 +2521,8 @@ bool CudnnSupport::DoFusedConvolveImpl(
   const bool is_profiling = output_profile_result != nullptr;
   DeviceMemory<uint8> scratch;
   dnn::AlgorithmDesc algotype = GetCudnnConvolutionForwardAlgorithm(
-      stream, parent_, dnn_handle_, cudnn_data_type, algorithm_config,
-      is_profiling, conv_input_nd, filter, conv, output_nd, scratch_allocator,
-      &scratch);
+      stream, parent_, dnn_handle_, algorithm_config, is_profiling,
+      conv_input_nd, filter, conv, output_nd, scratch_allocator, &scratch);
   if (algotype.is_default()) {
     if (!is_profiling) {
       LOG(ERROR) << "No suitable algorithm found";
@@ -2677,7 +2693,7 @@ bool CudnnSupport::GetConvolveBackwardFilterAlgorithms(
       // CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD,
       // clang-format on
   };
-#if CUDNN_VERSION >= 5110
+#if CUDNN_VERSION >= 5100
   if (CudnnEnvVar<WinogradNonfused>::IsEnabled() && with_winograd_nonfused) {
     algo_types.push_back(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED);
   }
@@ -2761,14 +2777,27 @@ bool CudnnSupport::DoBatchNormalizationForwardImpl(
   float zero = 0.0;
 
   if (is_training) {
-    stream->ThenMemZero(batch_mean, batch_mean->size());
-    stream->ThenMemZero(batch_var, batch_var->size());
+    CHECK_EQ(batch_mean->is_null(), batch_var->is_null())
+        << "batch_mean and batch_var must both be null or both be non-null";
+
+    void* batch_mean_opaque;
+    void* batch_var_opaque;
+    if (!batch_mean->is_null() && !batch_var->is_null()) {
+      stream->ThenMemZero(batch_mean, batch_mean->size());
+      stream->ThenMemZero(batch_var, batch_var->size());
+      batch_mean_opaque = batch_mean->opaque();
+      batch_var_opaque = batch_var->opaque();
+    } else {
+      batch_mean_opaque = nullptr;
+      batch_var_opaque = nullptr;
+    }
+
     status = wrap::cudnnBatchNormalizationForwardTraining(
         parent_, ToHandle(dnn_handle_), mode, &one, &zero,
         x_descriptor.handle(), x.opaque(), x_descriptor.handle(), y->opaque(),
         scale_offset_descriptor.handle(), scale.opaque(), offset.opaque(), 1.0,
-        batch_mean->opaque(), batch_var->opaque(), epsilon,
-        saved_mean->opaque(), saved_inv_var->opaque());
+        batch_mean_opaque, batch_var_opaque, epsilon, saved_mean->opaque(),
+        saved_inv_var->opaque());
 #if CUDNN_VERSION < 5000
     CHECK(inv_var_to_var);
     inv_var_to_var();
@@ -2797,28 +2826,28 @@ bool CudnnSupport::DoBatchNormalizationForwardImpl(
 bool CudnnSupport::DoBatchNormalizationBackward(
     Stream* stream, const DeviceMemory<float>& y_backprop,
     const DeviceMemory<float>& x, const DeviceMemory<float>& scale,
-    const DeviceMemory<float>& mean, const DeviceMemory<float>& variance,
+    const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
     const dnn::BatchDescriptor& x_desc,
     const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
     DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop,
     DeviceMemory<float>* offset_backprop) {
   return DoBatchNormalizationBackwardImpl(
       stream, CUDNN_DATA_FLOAT, CUDNN_DATA_FLOAT, y_backprop, x, scale, mean,
-      variance, x_desc, scale_offset_desc, epsilon, x_backprop, scale_backprop,
+      inv_var, x_desc, scale_offset_desc, epsilon, x_backprop, scale_backprop,
       offset_backprop);
 }
 
 bool CudnnSupport::DoBatchNormalizationBackward(
     Stream* stream, const DeviceMemory<Eigen::half>& y_backprop,
     const DeviceMemory<Eigen::half>& x, const DeviceMemory<float>& scale,
-    const DeviceMemory<float>& mean, const DeviceMemory<float>& variance,
+    const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
     const dnn::BatchDescriptor& x_desc,
     const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
     DeviceMemory<Eigen::half>* x_backprop, DeviceMemory<float>* scale_backprop,
     DeviceMemory<float>* offset_backprop) {
   return DoBatchNormalizationBackwardImpl(
       stream, CUDNN_DATA_HALF, CUDNN_DATA_FLOAT, y_backprop, x, scale, mean,
-      variance, x_desc, scale_offset_desc, epsilon, x_backprop, scale_backprop,
+      inv_var, x_desc, scale_offset_desc, epsilon, x_backprop, scale_backprop,
       offset_backprop);
 }
 
@@ -2827,7 +2856,7 @@ bool CudnnSupport::DoBatchNormalizationBackwardImpl(
     Stream* stream, int cudnn_input_type, int cudnn_scale_type,
     const DeviceMemory<T>& y_backprop, const DeviceMemory<T>& x,
     const DeviceMemory<U>& scale, const DeviceMemory<U>& mean,
-    const DeviceMemory<U>& variance, const dnn::BatchDescriptor& x_desc,
+    const DeviceMemory<U>& inv_var, const dnn::BatchDescriptor& x_desc,
     const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
     DeviceMemory<T>* x_backprop, DeviceMemory<U>* scale_backprop,
     DeviceMemory<U>* offset_backprop) {
@@ -2854,7 +2883,7 @@ bool CudnnSupport::DoBatchNormalizationBackwardImpl(
       y_backprop.opaque(), x_descriptor.handle(), x_backprop->opaque(),
       scale_offset_descriptor.handle(), scale.opaque(),
       scale_backprop->opaque(), offset_backprop->opaque(), epsilon,
-      mean.opaque(), variance.opaque());
+      mean.opaque(), inv_var.opaque());
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to enqueue backward batch normalization on stream: "
                << ToString(status);
@@ -2874,9 +2903,9 @@ bool CudnnSupport::DoConvolve(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return DoConvolveImpl<float>(
-      stream, CUDNN_DATA_FLOAT, batch_descriptor, input_data, filter_descriptor,
-      filter_data, convolution_descriptor, output_descriptor, output_data,
-      scratch_allocator, algorithm_config, output_profile_result);
+      stream, batch_descriptor, input_data, filter_descriptor, filter_data,
+      convolution_descriptor, output_descriptor, output_data, scratch_allocator,
+      algorithm_config, output_profile_result);
 }
 
 bool CudnnSupport::DoConvolve(
@@ -2902,9 +2931,9 @@ bool CudnnSupport::DoConvolve(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return DoConvolveImpl<Eigen::half>(
-      stream, CUDNN_DATA_HALF, batch_descriptor, input_data, filter_descriptor,
-      filter_data, convolution_descriptor, output_descriptor, output_data,
-      scratch_allocator, algorithm_config, output_profile_result);
+      stream, batch_descriptor, input_data, filter_descriptor, filter_data,
+      convolution_descriptor, output_descriptor, output_data, scratch_allocator,
+      algorithm_config, output_profile_result);
 }
 
 bool CudnnSupport::DoFusedConvolve(
@@ -2927,7 +2956,6 @@ bool CudnnSupport::DoFusedConvolve(
       side_input_scale, bias_descriptor, biases, activation_mode,
       output_descriptor, output_data, scratch_allocator, algorithm_config,
       output_profile_result);
-  return true;
 }
 
 bool CudnnSupport::DoFusedConvolve(
@@ -2950,7 +2978,6 @@ bool CudnnSupport::DoFusedConvolve(
       side_input_scale, bias_descriptor, biases, activation_mode,
       output_descriptor, output_data, scratch_allocator, algorithm_config,
       output_profile_result);
-  return true;
 }
 
 bool CudnnSupport::DoFusedConvolve(
@@ -2974,7 +3001,6 @@ bool CudnnSupport::DoFusedConvolve(
       side_input_scale, bias_descriptor, biases, activation_mode,
       output_descriptor, output_data, scratch_allocator, algorithm_config,
       output_profile_result);
-  return true;
 }
 
 bool CudnnSupport::DoFusedConvolve(
@@ -3016,7 +3042,6 @@ bool CudnnSupport::DoFusedConvolve(
 template<class T>
 DeviceMemory<T> CudnnSupport::MaybeTransformLayout(
     Stream* stream,
-    int cudnn_type,  // Actually cudnnDataType_t.
     BatchDescriptor* output_descriptor,
     DeviceMemory<T> backward_output_data,
     std::unique_ptr<TemporaryDeviceMemory<T>>* transform_scratch) {
@@ -3030,11 +3055,11 @@ DeviceMemory<T> CudnnSupport::MaybeTransformLayout(
   BatchDescriptor transformed_output_descriptor;
   transformed_output_descriptor.CloneFrom(*output_descriptor);
   transformed_output_descriptor.set_layout(dnn::DataLayout::kBatchDepthYX);
-  ScopedTensorDescriptor orig_out_back_nd{
-      parent_, *output_descriptor, static_cast<cudnnDataType_t>(cudnn_type)};
+  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
+  ScopedTensorDescriptor orig_out_back_nd{parent_, *output_descriptor,
+                                          cudnn_type};
   ScopedTensorDescriptor transformed_out_back_nd{
-      parent_, transformed_output_descriptor,
-      static_cast<cudnnDataType_t>(cudnn_type)};
+      parent_, transformed_output_descriptor, cudnn_type};
 
   float alpha = 1.0f;
   float beta = 0.0f;
@@ -3081,7 +3106,6 @@ bool CudnnSupport::DoTransformTensor(Stream* stream,
 template <class T>
 bool CudnnSupport::DoConvolveBackwardDataImpl(
     Stream* stream,
-    int cudnn_type,  // Actually cudnnDataType_t.
     const FilterDescriptor& filter_descriptor,
     const DeviceMemory<T>& filter_data,
     const BatchDescriptor& output_descriptor_in,
@@ -3108,15 +3132,13 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
   output_descriptor.CloneFrom(output_descriptor_in);
   std::unique_ptr<TemporaryDeviceMemory<T>> transform_scratch;
   backward_output_data = MaybeTransformLayout(
-      stream, cudnn_type, &output_descriptor, backward_output_data,
-      &transform_scratch);
+      stream, &output_descriptor, backward_output_data, &transform_scratch);
 
-  ScopedTensorDescriptor out_back_nd{parent_, output_descriptor,
-                                     static_cast<cudnnDataType_t>(cudnn_type)};
-  ScopedTensorDescriptor in_back_nd{parent_, input_descriptor,
-                                    static_cast<cudnnDataType_t>(cudnn_type)};
+  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
+  ScopedTensorDescriptor out_back_nd{parent_, output_descriptor, cudnn_type};
+  ScopedTensorDescriptor in_back_nd{parent_, input_descriptor, cudnn_type};
   ScopedFilterDescriptor filter{parent_, filter_descriptor, input_descriptor,
-                                static_cast<cudnnDataType_t>(cudnn_type)};
+                                cudnn_type};
   ScopedConvolutionDescriptor conv{parent_, convolution_descriptor,
                                    GetConvComputeType<T>()};
 
@@ -3304,11 +3326,11 @@ bool CudnnSupport::DoConvolveBackwardData(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  return DoConvolveBackwardDataImpl(
-      stream, CUDNN_DATA_FLOAT, filter_descriptor, filter_data,
-      output_descriptor_in, backward_output_data, convolution_descriptor,
-      input_descriptor, backward_input_data, scratch_allocator,
-      algorithm_config, output_profile_result);
+  return DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
+                                    output_descriptor_in, backward_output_data,
+                                    convolution_descriptor, input_descriptor,
+                                    backward_input_data, scratch_allocator,
+                                    algorithm_config, output_profile_result);
 }
 
 bool CudnnSupport::DoConvolveBackwardData(
@@ -3322,17 +3344,16 @@ bool CudnnSupport::DoConvolveBackwardData(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  return DoConvolveBackwardDataImpl(
-      stream, CUDNN_DATA_HALF, filter_descriptor, filter_data,
-      output_descriptor_in, backward_output_data, convolution_descriptor,
-      input_descriptor, backward_input_data, scratch_allocator,
-      algorithm_config, output_profile_result);
+  return DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
+                                    output_descriptor_in, backward_output_data,
+                                    convolution_descriptor, input_descriptor,
+                                    backward_input_data, scratch_allocator,
+                                    algorithm_config, output_profile_result);
 }
 
 template <class T>
 bool CudnnSupport::DoConvolveBackwardFilterImpl(
-    Stream* stream, int cudnn_type,  // Actually cudnnDataType_t.
-    const dnn::BatchDescriptor& input_descriptor,
+    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
     const DeviceMemory<T>& input_data,
     const dnn::BatchDescriptor& output_descriptor_in,
     DeviceMemory<T> backward_output_data,
@@ -3358,16 +3379,13 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
   output_descriptor.CloneFrom(output_descriptor_in);
   std::unique_ptr<TemporaryDeviceMemory<T>> transform_scratch;
   backward_output_data = MaybeTransformLayout(
-      stream, static_cast<cudnnDataType_t>(cudnn_type),
-      &output_descriptor, backward_output_data,
-      &transform_scratch);
-
-  ScopedTensorDescriptor out_back_nd{parent_, output_descriptor,
-        static_cast<cudnnDataType_t>(cudnn_type)};
-  ScopedTensorDescriptor input_nd{parent_, input_descriptor,
-          static_cast<cudnnDataType_t>(cudnn_type)};
+      stream, &output_descriptor, backward_output_data, &transform_scratch);
+
+  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
+  ScopedTensorDescriptor out_back_nd{parent_, output_descriptor, cudnn_type};
+  ScopedTensorDescriptor input_nd{parent_, input_descriptor, cudnn_type};
   ScopedFilterDescriptor filter{parent_, filter_descriptor, input_descriptor,
-        static_cast<cudnnDataType_t>(cudnn_type)};
+                                cudnn_type};
   ScopedConvolutionDescriptor conv{parent_, convolution_descriptor,
                                    GetConvComputeType<T>()};
 
@@ -3557,10 +3575,10 @@ bool CudnnSupport::DoConvolveBackwardFilter(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return DoConvolveBackwardFilterImpl(
-      stream, CUDNN_DATA_FLOAT, input_descriptor, input_data,
-      output_descriptor_in, backward_output_data, convolution_descriptor,
-      filter_descriptor, backward_filter_data, scratch_allocator,
-      algorithm_config, output_profile_result);
+      stream, input_descriptor, input_data, output_descriptor_in,
+      backward_output_data, convolution_descriptor, filter_descriptor,
+      backward_filter_data, scratch_allocator, algorithm_config,
+      output_profile_result);
 }
 
 bool CudnnSupport::DoConvolveBackwardFilter(
@@ -3575,16 +3593,15 @@ bool CudnnSupport::DoConvolveBackwardFilter(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return DoConvolveBackwardFilterImpl(
-      stream, CUDNN_DATA_HALF, input_descriptor, input_data,
-      output_descriptor_in, backward_output_data, convolution_descriptor,
-      filter_descriptor, backward_filter_data, scratch_allocator,
-      algorithm_config, output_profile_result);
+      stream, input_descriptor, input_data, output_descriptor_in,
+      backward_output_data, convolution_descriptor, filter_descriptor,
+      backward_filter_data, scratch_allocator, algorithm_config,
+      output_profile_result);
 }
 
 template <class T>
 bool CudnnSupport::DoConvolveBackwardBiasImpl(
-    Stream* stream, int cudnn_type,  // Actually cudnnDataType_t.
-    const dnn::BatchDescriptor& input_descriptor,
+    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
     const DeviceMemory<T>& input_data,
     const dnn::BatchDescriptor& bias_descriptor,
     DeviceMemory<T>* backward_bias_data) {
@@ -3595,10 +3612,9 @@ bool CudnnSupport::DoConvolveBackwardBiasImpl(
     LOG(FATAL) << "failed to set stream for cudnn handle: " << ToString(status);
   }
 
-  ScopedTensorDescriptor input_nd{parent_, input_descriptor,
-                                  static_cast<cudnnDataType_t>(cudnn_type)};
-  ScopedTensorDescriptor bias_nd{parent_, bias_descriptor,
-                                 static_cast<cudnnDataType_t>(cudnn_type)};
+  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
+  ScopedTensorDescriptor input_nd{parent_, input_descriptor, cudnn_type};
+  ScopedTensorDescriptor bias_nd{parent_, bias_descriptor, cudnn_type};
 
   // Alpha is the scaling factor for input.
   float alpha = 1.0;
@@ -3622,9 +3638,8 @@ bool CudnnSupport::DoConvolveBackwardBias(
     const DeviceMemory<double>& input_data,
     const BatchDescriptor& bias_descriptor,
     DeviceMemory<double>* backward_bias_data) {
-  return DoConvolveBackwardBiasImpl(stream, CUDNN_DATA_DOUBLE, input_descriptor,
-                                    input_data, bias_descriptor,
-                                    backward_bias_data);
+  return DoConvolveBackwardBiasImpl(stream, input_descriptor, input_data,
+                                    bias_descriptor, backward_bias_data);
 }
 
 bool CudnnSupport::DoConvolveBackwardBias(
@@ -3632,9 +3647,8 @@ bool CudnnSupport::DoConvolveBackwardBias(
     const DeviceMemory<float>& input_data,
     const BatchDescriptor& bias_descriptor,
     DeviceMemory<float>* backward_bias_data) {
-  return DoConvolveBackwardBiasImpl(stream, CUDNN_DATA_FLOAT, input_descriptor,
-                                    input_data, bias_descriptor,
-                                    backward_bias_data);
+  return DoConvolveBackwardBiasImpl(stream, input_descriptor, input_data,
+                                    bias_descriptor, backward_bias_data);
 }
 
 bool CudnnSupport::DoConvolveBackwardBias(
@@ -3642,9 +3656,8 @@ bool CudnnSupport::DoConvolveBackwardBias(
     const DeviceMemory<Eigen::half>& input_data,
     const BatchDescriptor& bias_descriptor,
     DeviceMemory<Eigen::half>* backward_bias_data) {
-  return DoConvolveBackwardBiasImpl(stream, CUDNN_DATA_HALF, input_descriptor,
-                                    input_data, bias_descriptor,
-                                    backward_bias_data);
+  return DoConvolveBackwardBiasImpl(stream, input_descriptor, input_data,
+                                    bias_descriptor, backward_bias_data);
 }
 
 bool CudnnSupport::DoMatMul(Stream* stream,
@@ -4251,7 +4264,12 @@ bool CudnnSupport::DoDepthConcatenate(
   for (size_t i = 0; i < input_data.size(); ++i) {
     const auto& dimensions = input_dimensions[i];
     tmp.resize(dimensions.ElementCount());
-    stream->ThenMemcpyD2H<float>(*input_data[i], &tmp).BlockHostUntilDone();
+    stream->ThenMemcpyD2H<float>(*input_data[i], &tmp);
+    port::Status block_status = stream->BlockHostUntilDone();
+    if (!block_status.ok()) {
+      LOG(ERROR) << "BlockHostUntilDone failed: " << block_status;
+      return false;
+    }
 
     for (int64 batch = 0; batch < output_dimensions.count(); ++batch) {
       for (int64 yx = 0; yx < area; ++yx) {
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 14986286f1dd4c4ced1ebaf6adbada8e52096b92..40aa974dd967df50075da6f2bb34439cd238a113 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -226,7 +226,7 @@ class CudnnSupport : public dnn::DnnSupport {
   bool DoBatchNormalizationBackward(
       Stream* stream, const DeviceMemory<float>& y_backprop,
       const DeviceMemory<float>& x, const DeviceMemory<float>& scale,
-      const DeviceMemory<float>& mean, const DeviceMemory<float>& variance,
+      const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
       const dnn::BatchDescriptor& x_desc,
       const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
       DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop,
@@ -235,7 +235,7 @@ class CudnnSupport : public dnn::DnnSupport {
   bool DoBatchNormalizationBackward(
       Stream* stream, const DeviceMemory<Eigen::half>& y_backprop,
       const DeviceMemory<Eigen::half>& x, const DeviceMemory<float>& scale,
-      const DeviceMemory<float>& mean, const DeviceMemory<float>& variance,
+      const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
       const dnn::BatchDescriptor& x_desc,
       const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
       DeviceMemory<Eigen::half>* x_backprop,
@@ -611,7 +611,6 @@ class CudnnSupport : public dnn::DnnSupport {
   template<class T>
   DeviceMemory<T> MaybeTransformLayout(
       Stream* stream,
-      int cudnn_type,  // Actually cudnnDataType_t.
       dnn::BatchDescriptor* output_descriptor,
       DeviceMemory<T> backward_output_data,
       std::unique_ptr<TemporaryDeviceMemory<T>>* transform_scratch)
@@ -637,14 +636,13 @@ class CudnnSupport : public dnn::DnnSupport {
       Stream* stream, int cudnn_input_type, int cudnn_scale_type,
       const DeviceMemory<T>& y_backprop, const DeviceMemory<T>& x,
       const DeviceMemory<U>& scale, const DeviceMemory<U>& mean,
-      const DeviceMemory<U>& variance, const dnn::BatchDescriptor& x_desc,
+      const DeviceMemory<U>& inv_var, const dnn::BatchDescriptor& x_desc,
       const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
       DeviceMemory<T>* x_backprop, DeviceMemory<U>* scale_backprop,
       DeviceMemory<U>* offset_backprop);
 
   template <class T>
   bool DoConvolveImpl(Stream* stream,
-                      int cudnn_type,  // Actually cudnnDataType_t.
                       const dnn::BatchDescriptor& batch_descriptor,
                       const DeviceMemory<T>& input_data,
                       const dnn::FilterDescriptor& filter_descriptor,
@@ -675,7 +673,6 @@ class CudnnSupport : public dnn::DnnSupport {
   template <class T>
   bool DoConvolveBackwardDataImpl(
       Stream* stream,
-      int cudnn_type,  // Actually cudnnDataType_t.
       const dnn::FilterDescriptor& filter_descriptor,
       const DeviceMemory<T>& filter_data,
       const dnn::BatchDescriptor& output_descriptor,
@@ -688,8 +685,7 @@ class CudnnSupport : public dnn::DnnSupport {
 
   template <class T>
   bool DoConvolveBackwardFilterImpl(
-      Stream* stream, int cudnn_type,  // Actually cudnnDataType_t.
-      const dnn::BatchDescriptor& input_descriptor,
+      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
       const DeviceMemory<T>& input_data,
       const dnn::BatchDescriptor& output_descriptor_in,
       DeviceMemory<T> backward_output_data,
@@ -702,7 +698,6 @@ class CudnnSupport : public dnn::DnnSupport {
 
   template <class T>
   bool DoConvolveBackwardBiasImpl(Stream* stream,
-                                  int cudnn_type,  // Actually cudnnDataType_t.
                                   const dnn::BatchDescriptor& input_descriptor,
                                   const DeviceMemory<T>& input_data,
                                   const dnn::BatchDescriptor& bias_descriptor,
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index b6a96ed3e5cbda044c00bb9b940d68f80373587a..a017ff64d4c69b6952b442464877dc26a800ad37 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -1115,19 +1115,20 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::SynchronizeStream(CudaContext* context,
-                                                CUstream stream) {
+/* static */ port::Status CUDADriver::SynchronizeStream(CudaContext *context,
+                                                        CUstream stream) {
   ScopedActivateContext activated{context};
   CHECK(stream != nullptr);
   CUresult res = cuStreamSynchronize(stream);
   if (res != CUDA_SUCCESS) {
-    LOG(ERROR) << "could not synchronize on CUDA stream: " << ToString(res)
-               << " :: " << port::CurrentStackTrace();
-    return false;
+    port::Status status = port::InternalError(
+        port::StrCat("could not synchronize on CUDA stream: ", ToString(res)));
+    LOG(ERROR) << status << " :: " << port::CurrentStackTrace();
+    return status;
   }
   VLOG(2) << "successfully synchronized stream " << stream << " on context "
           << context;
-  return true;
+  return port::Status::OK();
 }
 
 /* static */ bool CUDADriver::IsStreamIdle(CudaContext *context,
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.h b/tensorflow/stream_executor/cuda/cuda_driver.h
index 68494aba6597c2cd1ee52a7b4cb411cd50fad77b..4002ba2021d1a2e2c36bd1786a3084ee8c08bb78 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.h
+++ b/tensorflow/stream_executor/cuda/cuda_driver.h
@@ -304,7 +304,7 @@ class CUDADriver {
   // amount of time?
   //
   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad
-  static bool SynchronizeStream(CudaContext* context, CUstream stream);
+  static port::Status SynchronizeStream(CudaContext* context, CUstream stream);
 
   // Blocks the calling thread until the operations associated with the context
   // have been completed, via cuCtxSynchronize.
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.cc b/tensorflow/stream_executor/cuda/cuda_fft.cc
index 7be2bccebc00d461d673b819fdc95841f452db08..a922f14fb4af695877b449d2f960fae1a356a82f 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.cc
+++ b/tensorflow/stream_executor/cuda/cuda_fft.cc
@@ -184,12 +184,11 @@ port::Status CUDAFftPlan::Initialize(
         return port::Status{port::error::INTERNAL,
                             "Failed to set auto allocation for cuFFT plan."};
       }
-      size_t size_in_bytes;
       switch (rank) {
         case 1:
           ret = wrap::cufftMakePlan1d(parent, plan_, elem_count_[0],
                                       CUDAFftType(type), /*batch=*/1,
-                                      &size_in_bytes);
+                                      &scratch_size_bytes_);
           if (ret != CUFFT_SUCCESS) {
             LOG(ERROR) << "failed to make cuFFT 1d plan:" << ret;
             return port::Status{port::error::INTERNAL,
@@ -199,7 +198,7 @@ port::Status CUDAFftPlan::Initialize(
         case 2:
           ret = wrap::cufftMakePlan2d(parent, plan_, elem_count_[0],
                                       elem_count_[1], CUDAFftType(type),
-                                      &size_in_bytes);
+                                      &scratch_size_bytes_);
           if (ret != CUFFT_SUCCESS) {
             LOG(ERROR) << "failed to make cuFFT 2d plan:" << ret;
             return port::Status{port::error::INTERNAL,
@@ -209,7 +208,7 @@ port::Status CUDAFftPlan::Initialize(
         case 3:
           ret = wrap::cufftMakePlan3d(parent, plan_, elem_count_[0],
                                       elem_count_[1], elem_count_[2],
-                                      CUDAFftType(type), &size_in_bytes);
+                                      CUDAFftType(type), &scratch_size_bytes_);
           if (ret != CUFFT_SUCCESS) {
             LOG(ERROR) << "failed to make cuFFT 3d plan:" << ret;
             return port::Status{port::error::INTERNAL,
@@ -223,24 +222,7 @@ port::Status CUDAFftPlan::Initialize(
           return port::Status{port::error::INVALID_ARGUMENT,
                               "cufftPlan only takes rank 1, 2, or 3."};
       }
-      // TODO(yangzihao): refactor this code and the one with the same function
-      // in the batch mode.
-      if (size_in_bytes != 0) {
-        auto allocated =
-            scratch_allocator->AllocateBytes(stream, size_in_bytes);
-        if (!allocated.ok() || (scratch_ = allocated.ValueOrDie()) == nullptr) {
-          LOG(ERROR) << "failed to allocate work area.";
-          return allocated.status();
-        }
-      }
-      // Connect work area with allocated space.
-      ret = wrap::cufftSetWorkArea(parent, plan_, scratch_.opaque());
-      if (ret != CUFFT_SUCCESS) {
-        LOG(ERROR) << "failed to set work area for cuFFT plan:" << ret;
-        return port::Status{port::error::INTERNAL,
-                            "Failed to set work area for cuFFT plan."};
-      }
-      return port::Status::OK();
+      return UpdateScratchAllocator(stream, scratch_allocator);
     }
   } else {
     // For either multiple batches or rank higher than 3, use cufftPlanMany().
@@ -270,32 +252,18 @@ port::Status CUDAFftPlan::Initialize(
             port::error::INTERNAL,
             "Failed to set auto allocation for cuFFT batched plan."};
       }
-      size_t size_in_bytes;
       ret = wrap::cufftMakePlanMany(
           parent, plan_, rank, elem_count_,
           input_embed ? input_embed_ : nullptr, input_stride, input_distance,
           output_embed ? output_embed_ : nullptr, output_stride,
-          output_distance, CUDAFftType(type), batch_count, &size_in_bytes);
+          output_distance, CUDAFftType(type), batch_count,
+          &scratch_size_bytes_);
       if (ret != CUFFT_SUCCESS) {
         LOG(ERROR) << "failed to make cuFFT batched plan:" << ret;
         return port::Status{port::error::INTERNAL,
                             "Failed to make cuFFT batched plan."};
       }
-      if (size_in_bytes != 0) {
-        auto allocated =
-            scratch_allocator->AllocateBytes(stream, size_in_bytes);
-        if (!allocated.ok() || (scratch_ = allocated.ValueOrDie()) == nullptr) {
-          LOG(ERROR) << "failed to allocate work area.";
-          return allocated.status();
-        }
-      }
-      // Connect work area with allocated space.
-      ret = wrap::cufftSetWorkArea(parent, plan_, scratch_.opaque());
-      if (ret != CUFFT_SUCCESS) {
-        LOG(ERROR) << "failed to set work area for cuFFT batched plan:" << ret;
-        return port::Status{port::error::INTERNAL,
-                            "Failed to set work area for cuFFT batched plan."};
-      }
+      return UpdateScratchAllocator(stream, scratch_allocator);
     }
   }
   return port::Status::OK();
@@ -312,6 +280,26 @@ port::Status CUDAFftPlan::Initialize(CUDAExecutor *parent, Stream *stream,
                     /*output_distance=*/0, type, 1, scratch_allocator);
 }
 
+port::Status CUDAFftPlan::UpdateScratchAllocator(
+    Stream *stream, ScratchAllocator *scratch_allocator) {
+  if (scratch_size_bytes_ != 0) {
+    auto allocated =
+        scratch_allocator->AllocateBytes(stream, scratch_size_bytes_);
+    if (!allocated.ok() || (scratch_ = allocated.ValueOrDie()) == nullptr) {
+      LOG(ERROR) << "failed to allocate work area.";
+      return allocated.status();
+    }
+  }
+  // Connect work area with allocated space.
+  cufftResult_t ret = wrap::cufftSetWorkArea(parent_, plan_, scratch_.opaque());
+  if (ret != CUFFT_SUCCESS) {
+    LOG(ERROR) << "failed to set work area for cuFFT plan:" << ret;
+    return port::Status{port::error::INTERNAL,
+                        "Failed to set work area for cuFFT plan."};
+  }
+  return port::Status::OK();
+}
+
 CUDAFftPlan::~CUDAFftPlan() { wrap::cufftDestroy(parent_, plan_); }
 
 int CUDAFftPlan::GetFftDirection() const {
@@ -461,6 +449,17 @@ std::unique_ptr<fft::Plan> CUDAFft::CreateBatchedPlanWithScratchAllocator(
   return std::move(fft_plan_ptr);
 }
 
+void CUDAFft::UpdatePlanWithScratchAllocator(
+    Stream *stream, fft::Plan *plan, ScratchAllocator *scratch_allocator) {
+  CUDAFftPlan *cuda_fft_plan = dynamic_cast<CUDAFftPlan *>(plan);
+  port::Status status =
+      cuda_fft_plan->UpdateScratchAllocator(stream, scratch_allocator);
+  if (!status.ok()) {
+    LOG(FATAL) << "failed to update custom allocator for cufft plan: "
+               << status.error_message();
+  }
+}
+
 template <typename FuncT, typename InputT, typename OutputT>
 bool CUDAFft::DoFftInternal(Stream *stream, fft::Plan *plan, FuncT cufftExec,
                             const DeviceMemory<InputT> &input,
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.h b/tensorflow/stream_executor/cuda/cuda_fft.h
index 16102eb945a11d7083ebcfe29796b3fb5aa15a9c..04c7dfe501c451e4848bef68bed9685c079dd523 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.h
+++ b/tensorflow/stream_executor/cuda/cuda_fft.h
@@ -50,6 +50,7 @@ class CUDAFftPlan : public fft::Plan {
         plan_(-1),
         fft_type_(fft::Type::kInvalid),
         scratch_(nullptr),
+        scratch_size_bytes_(0),
         is_initialized_(false) {}
   ~CUDAFftPlan() override;
 
@@ -76,6 +77,9 @@ class CUDAFftPlan : public fft::Plan {
                           uint64 *elem_count, fft::Type type,
                           ScratchAllocator *scratch_allocator);
 
+  port::Status UpdateScratchAllocator(Stream *stream,
+                                      ScratchAllocator *scratch_allocator);
+
  protected:
   bool IsInitialized() const { return is_initialized_; }
 
@@ -84,6 +88,7 @@ class CUDAFftPlan : public fft::Plan {
   cufftHandle plan_;
   fft::Type fft_type_;
   DeviceMemory<uint8> scratch_;
+  size_t scratch_size_bytes_;
   bool is_initialized_;
 };
 
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 64d14f29dfee2a78a0fee1d8b336f4aa191ba086..4bbd531e14f18fc24d87b4fa655fe72e9f56b129 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -380,9 +380,9 @@ bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
                                 thread_dims.z, args.number_of_shared_bytes(),
                                 custream, kernel_params,
                                 nullptr /* = extra */)) {
-    LOG(ERROR) << "failed to launch CUDA kernel with args: "
+    LOG(ERROR) << "failed to launch CUDA kernel " << kernel.name() << " with "
                << args.number_of_arguments()
-               << "; thread dim: " << thread_dims.ToString()
+               << " args; thread dim: " << thread_dims.ToString()
                << "; block dim: " << block_dims.ToString();
     return false;
   }
@@ -664,7 +664,7 @@ bool CUDAExecutor::StopTimer(Stream *stream, Timer *timer) {
   return AsCUDATimer(timer)->Stop(AsCUDAStream(stream));
 }
 
-bool CUDAExecutor::BlockHostUntilDone(Stream *stream) {
+port::Status CUDAExecutor::BlockHostUntilDone(Stream *stream) {
   return CUDADriver::SynchronizeStream(context_, AsCUDAStreamValue(stream));
 }
 
@@ -861,6 +861,9 @@ static int TryToReadNumaNode(const string &pci_bus_id, int device_ordinal) {
 #elif defined(PLATFORM_WINDOWS)
   // Windows support for NUMA is not currently implemented. Return node 0.
   return 0;
+#elif defined(__aarch64__)
+  LOG(INFO) << "ARM64 does not support NUMA - returning NUMA node zero";
+  return 0;
 #else
   VLOG(2) << "trying to read NUMA node for device ordinal: " << device_ordinal;
   static const int kUnknownNumaNode = -1;
@@ -925,16 +928,129 @@ struct UnqueryableDeviceParams {
   uint64 shared_memory_alloc_granularity;
 };
 
+// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
+// https://developer.download.nvidia.com/compute/cuda/CUDA_Occupancy_calculator.xls
 static const UnqueryableDeviceParams kAllUnqueryableDeviceParams[] = {
-  {
-    3, 5,       // compute capability (3.5)
-    16,         // blocks_per_core_limit
-    64 * 1024,  // registers_per_core_limit
-    255,        // registers_per_thread_limit
-    4,          // warp_alloc_granularity
-    256,        // register_alloc_granularity
-    256         // shared_memory_alloc_granularity
-  }
+    {
+        2, 0,       // compute capability (2.0)
+        8,          // blocks_per_core_limit
+        32 * 1024,  // registers_per_core_limit
+        63,         // registers_per_thread_limit
+        2,          // warp_alloc_granularity
+        64,         // register_alloc_granularity
+        128,        // shared_memory_alloc_granularity
+    },
+    {
+        2, 1,       // compute capability (2.1)
+        8,          // blocks_per_core_limit
+        32 * 1024,  // registers_per_core_limit
+        63,         // registers_per_thread_limit
+        2,          // warp_alloc_granularity
+        64,         // register_alloc_granularity
+        128,        // shared_memory_alloc_granularity
+    },
+    {
+        3, 0,       // compute capability (3.0)
+        16,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        63,         // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        3, 2,       // compute capability (3.2)
+        16,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        3, 5,       // compute capability (3.5)
+        16,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        3, 7,        // compute capability (3.7)
+        16,          // blocks_per_core_limit
+        128 * 1024,  // registers_per_core_limit
+        255,         // registers_per_thread_limit
+        4,           // warp_alloc_granularity
+        256,         // register_alloc_granularity
+        256,         // shared_memory_alloc_granularity
+    },
+    {
+        5, 0,       // compute capability (5.0)
+        32,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        5, 2,       // compute capability (5.2)
+        32,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        5, 3,       // compute capability (5.3)
+        32,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        6, 0,       // compute capability (6.0)
+        32,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        2,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        6, 1,       // compute capability (6.1)
+        32,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        6, 2,       // compute capability (6.2)
+        32,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    // TODO(jlebar): Confirm the alloc granularity values for sm_70.  These are
+    // not published in the spreadsheet linked above.  Currently we guess that
+    // they're the same as sm_60.
+    {
+        7, 0,       // compute capability (7.0)
+        32,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        2,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
 };
 
 DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index 8ff4a30d6251dfe4cbbbf1a9c632b6383e964436..dbbbcd476f096ff912d391604ba349f6cb979478 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -152,7 +152,7 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
 
   Event::Status PollForEventStatus(Event *event) override;
 
-  bool BlockHostUntilDone(Stream *stream) override;
+  port::Status BlockHostUntilDone(Stream *stream) override;
 
   int PlatformDeviceCount() override { return CUDADriver::GetDeviceCount(); }
 
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 0d2cd4a9f2fb3068f9a803e616ff5fa1712f4945..aa88fe770f3596e5da5e12705c3b706365382134 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -896,7 +896,7 @@ class DnnSupport {
   //  offset: offset parameters.
   //  estimated_mean: population mean estimated during training.
   //    Used for inference only; empty for training.
-  //  estimated_variance: population variance estimated during traning,
+  //  estimated_variance: population variance estimated during training,
   //    used for inference only; empty for training.
   //  x_desc: dimensions of the input data, which is the same as the dimensions
   //    of the output.
@@ -908,8 +908,8 @@ class DnnSupport {
   //    the running variance.
   //  reserve_space_1: saved mean, to be reused in the backward gradient
   //    computation.
-  //  reserve_space_2: saved variance, to be reused in the backward gradient
-  //    computation.
+  //  reserve_space_2: saved inv_var (1/sqrt(epsilon + variance), to be reused
+  //    in the backward gradient computation.
   //  is_training: Set to true for training, false for inference.
   //  var_to_inv_var: a function to convert the variance to inverted variance
   //    for cuDNN v4 forward inference.
@@ -957,6 +957,7 @@ class DnnSupport {
   //  y_backprop: gradient with regard to output y.
   //  x: input data.
   //  scale: scaling parameters.
+  //  inv_var: 1/sqrt(epsilon + variance) of x.
   //  x_desc: dimensions of the input data, which is the same as the dimensions
   //    of the output.
   //  scale_offset_desc: dimensions of scale and offset.
@@ -967,7 +968,7 @@ class DnnSupport {
   virtual bool DoBatchNormalizationBackward(
       Stream* stream, const DeviceMemory<float>& y_backprop,
       const DeviceMemory<float>& x, const DeviceMemory<float>& scale,
-      const DeviceMemory<float>& mean, const DeviceMemory<float>& variance,
+      const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
       const dnn::BatchDescriptor& x_desc,
       const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
       DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop,
@@ -981,7 +982,7 @@ class DnnSupport {
   virtual bool DoBatchNormalizationBackward(
       Stream* stream, const DeviceMemory<Eigen::half>& y_backprop,
       const DeviceMemory<Eigen::half>& x, const DeviceMemory<float>& scale,
-      const DeviceMemory<float>& mean, const DeviceMemory<float>& variance,
+      const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
       const dnn::BatchDescriptor& x_desc,
       const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
       DeviceMemory<Eigen::half>* x_backprop,
@@ -1132,7 +1133,7 @@ class DnnSupport {
   //    space in order to speed up the convolution operation.
   //  algorithm: an integer to specify which algorithm should be used for the
   //    operation. kDefaultAlgorithm means the system will pick an algorithm
-  //    by default. The coding of the algorithm is be interpretted by the
+  //    by default. The coding of the algorithm is be interpreted by the
   //    underlying implementation.
   //  output_profile_result: the output profile result for this call. The
   //    profiling is only enabled when this is not nullptr.
@@ -2023,7 +2024,7 @@ class DnnSupport {
   //  output_h_desc: descriptor for the output "h" state.
   //  output_h_data: the memory region that stores the output "h" data.
   //  output_c_desc: descriptor for the output "c" state.
-  //  output_c_data: the memory region that stores the outptu "c" data. This
+  //  output_c_data: the memory region that stores the output "c" data. This
   //    must be specified for LSTM models.
   //  is_training: whether this is used in training or inference. That decides
   //    whether respace_space data need to be produced.
@@ -2032,7 +2033,7 @@ class DnnSupport {
   //  retains the data and feed it to the backward pass.
   //  workspace_allocator: an allocator to create temporary workspace used in
   //    this kernel. The caller is responsible for retaining the memory long
-  //    enough for the lifespan of this operation, and recycles aftewards.
+  //    enough for the lifespan of this operation, and recycles afterwards.
   virtual bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                             const dnn::RnnSequenceTensorDescriptor& input_desc,
                             const DeviceMemory<Eigen::half>& input_data,
@@ -2111,7 +2112,7 @@ class DnnSupport {
   //  output_h_desc: descriptor for the output "h" state.
   //  output_h_data: the memory region that stores the output "h" data.
   //  output_c_desc: descriptor for the output "c" state.
-  //  output_c_data: the memory region that stores the outptu "c" data. This
+  //  output_c_data: the memory region that stores the output "c" data. This
   //    must be specified for LSTM models.
   //  output_backprop_data: the device memory region that contains the backprop
   //    to the output sequence.
diff --git a/tensorflow/stream_executor/dso_loader.cc b/tensorflow/stream_executor/dso_loader.cc
index 5210a81092b3023563baa7edbb657b630dfc819a..95168836278add5d6592ff0c3d0f7245e6f6bc5b 100644
--- a/tensorflow/stream_executor/dso_loader.cc
+++ b/tensorflow/stream_executor/dso_loader.cc
@@ -33,6 +33,10 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
+#if !defined(PLATFORM_GOOGLE)
+#include "cuda/cuda_config.h"
+#endif
+
 namespace perftools {
 namespace gputools {
 namespace internal {
@@ -96,10 +100,19 @@ string GetCudnnVersion() { return TF_CUDNN_VERSION; }
 }
 
 /* static */ port::Status DsoLoader::GetLibcuptiDsoHandle(void** dso_handle) {
+#if defined(ANDROID_TEGRA)
+  // On Android devices the CUDA version number is not added to the library
+  // name.
+  return GetDsoHandle(
+      FindDsoPath(port::Env::Default()->FormatLibraryFileName("cupti", ""),
+                  GetCudaCuptiLibraryPath()),
+      dso_handle);
+#else
   return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
                                       "cupti", GetCudaVersion()),
                                   GetCudaCuptiLibraryPath()),
                       dso_handle);
+#endif
 }
 
 static mutex& GetRpathMutex() {
diff --git a/tensorflow/stream_executor/dso_loader.h b/tensorflow/stream_executor/dso_loader.h
index 9495f7253a1d475f0b5321b71419febd086832af..354c7b50b8209755991827b3c36afac790cb952b 100644
--- a/tensorflow/stream_executor/dso_loader.h
+++ b/tensorflow/stream_executor/dso_loader.h
@@ -28,10 +28,6 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform.h"
 #include "tensorflow/stream_executor/platform/mutex.h"
 
-#if !defined(PLATFORM_GOOGLE)
-#include "cuda/cuda_config.h"
-#endif
-
 namespace perftools {
 namespace gputools {
 namespace internal {
diff --git a/tensorflow/stream_executor/executor_cache.cc b/tensorflow/stream_executor/executor_cache.cc
index a23d6a70ba237efb2a83f8f56975173015ba9a39..d1a8aae167455a7dc728999fbbaf1a119cf6a101 100644
--- a/tensorflow/stream_executor/executor_cache.cc
+++ b/tensorflow/stream_executor/executor_cache.cc
@@ -23,6 +23,14 @@ namespace gputools {
 port::StatusOr<StreamExecutor*> ExecutorCache::GetOrCreate(
     const StreamExecutorConfig& config,
     const std::function<ExecutorFactory>& factory) {
+  // In the fast path case, the cache already has an entry and we can just
+  // return after Get() which only takes a shared lock and not a unique lock.
+  // If we need to create, we take a unique lock on cache_.
+  auto fast_result = Get(config);
+  if (fast_result.ok()) {
+    return fast_result;
+  }
+
   Entry* entry = nullptr;
   {
     mutex_lock lock{mutex_};
@@ -59,12 +67,17 @@ port::StatusOr<StreamExecutor*> ExecutorCache::Get(
     const StreamExecutorConfig& config) {
   Entry* entry = nullptr;
   {
-    mutex_lock lock{mutex_};
-    entry = &cache_[config.ordinal];
-    // Release the map lock; the address of 'entry' is stable because
-    // std::map guarantees reference stability.
+    tf_shared_lock lock{mutex_};
+    auto it = cache_.find(config.ordinal);
+    if (it != cache_.end()) {
+      entry = &it->second;
+    } else {
+      return port::Status(port::error::NOT_FOUND,
+                          port::Printf("No executors registered for ordinal %d",
+                                       config.ordinal));
+    }
   }
-  mutex_lock lock{entry->configurations_mutex};
+  tf_shared_lock lock{entry->configurations_mutex};
   if (entry->configurations.empty()) {
     return port::Status(
         port::error::NOT_FOUND,
diff --git a/tensorflow/stream_executor/fft.h b/tensorflow/stream_executor/fft.h
index 98cd77e2062bef45dd46e73ac29782eb12591e64..6b1728829abdeb5c4e20534675801a437341d732 100644
--- a/tensorflow/stream_executor/fft.h
+++ b/tensorflow/stream_executor/fft.h
@@ -34,8 +34,8 @@ limitations under the License.
 //     stream_exec.AsFft()->Create1dPlan(&stream, 1024, Type::kC2CForward);
 //  stream
 //    .Init()
-//    .ThenFft(plan.get(), x, &y)
-//    .BlockHostUntilDone();
+//    .ThenFft(plan.get(), x, &y);
+//  SE_CHECK_OK(stream.BlockHostUntilDone());
 //
 // By using stream operations in this manner the user can easily intermix custom
 // kernel launches (via StreamExecutor::ThenLaunch()) with these pre-canned FFT
@@ -167,6 +167,15 @@ class FftSupport {
       bool in_place_fft, int batch_count,
       ScratchAllocator *scratch_allocator) = 0;
 
+  // Updates the plan's work area with space allocated by a new scratch
+  // allocator. This facilitates plan reuse with scratch allocators.
+  //
+  // This requires that the plan was originally created using a scratch
+  // allocator, as otherwise scratch space will have been allocated internally
+  // by cuFFT.
+  virtual void UpdatePlanWithScratchAllocator(
+      Stream *stream, Plan *plan, ScratchAllocator *scratch_allocator) = 0;
+
   // Computes complex-to-complex FFT in the transform direction as specified
   // by direction parameter.
   virtual bool DoFft(Stream *stream, Plan *plan,
@@ -233,6 +242,9 @@ class FftSupport {
       uint64 output_stride, uint64 output_distance, fft::Type type,            \
       bool in_place_fft, int batch_count, ScratchAllocator *scratch_allocator) \
       override;                                                                \
+  void UpdatePlanWithScratchAllocator(Stream *stream, fft::Plan *plan,         \
+                                      ScratchAllocator *scratch_allocator)     \
+      override;                                                                \
   bool DoFft(Stream *stream, fft::Plan *plan,                                  \
              const DeviceMemory<std::complex<float>> &input,                   \
              DeviceMemory<std::complex<float>> *output) override;              \
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.cc b/tensorflow/stream_executor/host/host_gpu_executor.cc
index 0af2c8cc3d751aa35958a21c81a71496f994e1fb..542f521ef778c3a69ec9adba74405131e07bcf1a 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.cc
+++ b/tensorflow/stream_executor/host/host_gpu_executor.cc
@@ -162,7 +162,7 @@ void HostExecutor::DeallocateStream(Stream *stream) {}
 
 bool HostExecutor::CreateStreamDependency(Stream *dependent, Stream *other) {
   AsHostStream(dependent)->EnqueueTask(
-      [other]() { other->BlockHostUntilDone(); });
+      [other]() { SE_CHECK_OK(other->BlockHostUntilDone()); });
   AsHostStream(dependent)->BlockUntilDone();
   return true;
 }
@@ -177,9 +177,9 @@ bool HostExecutor::StopTimer(Stream *stream, Timer *timer) {
   return true;
 }
 
-bool HostExecutor::BlockHostUntilDone(Stream *stream) {
+port::Status HostExecutor::BlockHostUntilDone(Stream *stream) {
   AsHostStream(stream)->BlockUntilDone();
-  return true;
+  return port::Status::OK();
 }
 
 DeviceDescription *HostExecutor::PopulateDeviceDescription() const {
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.h b/tensorflow/stream_executor/host/host_gpu_executor.h
index 77b07e4a577fe321901a19369107701ec1904a80..e2c0e6d6b77130bd190b026f1eaff68d21dbf632 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.h
+++ b/tensorflow/stream_executor/host/host_gpu_executor.h
@@ -139,7 +139,7 @@ class HostExecutor : public internal::StreamExecutorInterface {
 
   bool StopTimer(Stream *stream, Timer *timer) override;
 
-  bool BlockHostUntilDone(Stream *stream) override;
+  port::Status BlockHostUntilDone(Stream *stream) override;
 
   int PlatformDeviceCount() override { return 1; }
 
diff --git a/tensorflow/stream_executor/kernel.cc b/tensorflow/stream_executor/kernel.cc
index e1b3635d52eac8c7181395fa76592ae3161a035a..81e531efb31ea7d8d6ac03b56aea6aa5f01d64d1 100644
--- a/tensorflow/stream_executor/kernel.cc
+++ b/tensorflow/stream_executor/kernel.cc
@@ -57,6 +57,15 @@ void KernelMetadata::set_shared_memory_bytes(int shared_memory_bytes) {
   has_shared_memory_bytes_ = true;
 }
 
+KernelBase::KernelBase(KernelBase &&from)
+    : parent_(from.parent_),
+      implementation_(std::move(from.implementation_)),
+      name_(std::move(from.name_)),
+      demangled_name_(std::move(from.demangled_name_)),
+      metadata_(from.metadata_) {
+  from.parent_ = nullptr;
+}
+
 KernelBase::KernelBase(StreamExecutor *parent)
     : parent_(parent),
       implementation_(parent->implementation()->CreateKernelImplementation()) {}
diff --git a/tensorflow/stream_executor/kernel.h b/tensorflow/stream_executor/kernel.h
index 8ef091f929c0ae5a068059732b57c0729fd5be07..5358eac1ae070efb2bead75c73208e9d283b498c 100644
--- a/tensorflow/stream_executor/kernel.h
+++ b/tensorflow/stream_executor/kernel.h
@@ -136,7 +136,7 @@ class KernelMetadata {
 // Thread-compatible.
 class KernelBase {
  public:
-  KernelBase(KernelBase &&) = default;
+  KernelBase(KernelBase &&from);
 
   // Constructs an "empty" (not-yet-loaded) kernel instance.
   //
@@ -340,8 +340,8 @@ class KernelArgIterator {
 //
 // This class exists as a way to pass kernel arguments to
 // StreamExecutorInterface::Launch. That Launch method is virtual, so it can't
-// be templated to accept any KernelArgsArray type, therfore a reference to this
-// base type is passed instead.
+// be templated to accept any KernelArgsArray type, therefore a reference to
+// this base type is passed instead.
 //
 // Performance is not a concern here because each of these methods will be
 // called at most once per kernel launch. Past performance concerns with
diff --git a/tensorflow/stream_executor/multi_platform_manager.cc b/tensorflow/stream_executor/multi_platform_manager.cc
index cc32a6beaa5f83d6883b02682c14327b735a1caa..f23224ae772b9c5915426feaef1155fc9711f075 100644
--- a/tensorflow/stream_executor/multi_platform_manager.cc
+++ b/tensorflow/stream_executor/multi_platform_manager.cc
@@ -45,7 +45,7 @@ namespace gputools {
 
 /* static */ port::StatusOr<Platform*> MultiPlatformManager::PlatformWithName(
     const string& target) {
-  mutex_lock lock(GetPlatformsMutex());
+  tf_shared_lock lock(GetPlatformsMutex());
   auto it = GetPlatformMap()->find(port::Lowercase(target));
 
   if (it == GetPlatformMap()->end()) {
@@ -59,7 +59,7 @@ namespace gputools {
 
 /* static */ port::StatusOr<Platform*> MultiPlatformManager::PlatformWithId(
     const Platform::Id& id) {
-  mutex_lock lock(GetPlatformsMutex());
+  tf_shared_lock lock(GetPlatformsMutex());
   auto it = GetPlatformByIdMap()->find(id);
   if (it == GetPlatformByIdMap()->end()) {
     return port::Status(
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index 22fd6bce78ff0e907444be7f161b27c159a75214..ba5001e273632c893b05eea64542f1b156e28c47 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -342,7 +342,7 @@ Stream &Stream::ThenBatchNormalizationForward(
 Stream &Stream::ThenBatchNormalizationBackward(
     const DeviceMemory<float> &y_backprop, const DeviceMemory<float> &x,
     const DeviceMemory<float> &scale, const DeviceMemory<float> &mean,
-    const DeviceMemory<float> &variance, const dnn::BatchDescriptor &x_desc,
+    const DeviceMemory<float> &inv_var, const dnn::BatchDescriptor &x_desc,
     const dnn::BatchDescriptor &scale_offset_desc, const double epsilon,
     DeviceMemory<float> *x_backprop, DeviceMemory<float> *scale_backprop,
     DeviceMemory<float> *offset_backprop) {
@@ -352,7 +352,7 @@ Stream &Stream::ThenBatchNormalizationBackward(
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
       CheckError(dnn->DoBatchNormalizationBackward(
-          this, y_backprop, x, scale, mean, variance, x_desc, scale_offset_desc,
+          this, y_backprop, x, scale, mean, inv_var, x_desc, scale_offset_desc,
           epsilon, x_backprop, scale_backprop, offset_backprop));
     } else {
       SetErrorAndLogNoDnnSupport();
@@ -392,7 +392,7 @@ Stream &Stream::ThenBatchNormalizationForward(
 Stream &Stream::ThenBatchNormalizationBackward(
     const DeviceMemory<Eigen::half> &y_backprop,
     const DeviceMemory<Eigen::half> &x, const DeviceMemory<float> &scale,
-    const DeviceMemory<float> &mean, const DeviceMemory<float> &variance,
+    const DeviceMemory<float> &mean, const DeviceMemory<float> &inv_var,
     const dnn::BatchDescriptor &x_desc,
     const dnn::BatchDescriptor &scale_offset_desc, const double epsilon,
     DeviceMemory<Eigen::half> *x_backprop, DeviceMemory<float> *scale_backprop,
@@ -403,7 +403,7 @@ Stream &Stream::ThenBatchNormalizationBackward(
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
       CheckError(dnn->DoBatchNormalizationBackward(
-          this, y_backprop, x, scale, mean, variance, x_desc, scale_offset_desc,
+          this, y_backprop, x, scale, mean, inv_var, x_desc, scale_offset_desc,
           epsilon, x_backprop, scale_backprop, offset_backprop));
     } else {
       SetErrorAndLogNoDnnSupport();
@@ -5055,22 +5055,24 @@ Stream &Stream::ThenEnqueueOnBackgroundThread(
   });
 }
 
-bool Stream::BlockHostUntilDone() {
+port::Status Stream::BlockHostUntilDone() {
   VLOG_CALL();
 
   if (!ok()) {
-    LOG(INFO)
-        << "stream " << this
-        << " did not block host until done; was already in an error state";
-    return false;
+    port::Status status = port::Status(
+        port::error::INTERNAL,
+        "stream did not block host until done; was already in an error state");
+    LOG(INFO) << status << " " << this;
+    return status;
   }
 
+  port::Status first_error;
   {
     // Wait until all active sub-streams have done their tasks.
     mutex_lock lock{mu_};
     for (auto &stream : sub_streams_) {
       if (!stream.second) {
-        CheckError(stream.first->BlockHostUntilDone());
+        first_error.Update(stream.first->BlockHostUntilDone());
         // Set this sub-stream as available.
         stream.second = true;
       }
@@ -5079,8 +5081,9 @@ bool Stream::BlockHostUntilDone() {
 
   temporary_memory_manager_.DeallocateFinalizedTemporaries();
 
-  CheckError(parent_->BlockHostUntilDone(this));
-  return ok();
+  first_error.Update(parent_->BlockHostUntilDone(this));
+  CheckError(first_error.ok());
+  return first_error;
 }
 
 }  // namespace gputools
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index 023cffb96510fea0cf2fc54bd609fa38cf124b0a..a2fb2ea2375d0f245ae3bf3ccb04803d01663def 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -113,7 +113,7 @@ class Stream {
 
   // Initialize the stream. This must be performed before entraining any other
   // operations.
-  Stream &Init();
+  Stream &Init() LOCKS_EXCLUDED(mu_);
 
   // Initializes timer t via the StreamExecutor.
   Stream &InitTimer(Timer *t);
@@ -124,11 +124,11 @@ class Stream {
   // Get or create a sub-stream from this stream. If there is any sub-stream in
   // the pool that can be reused then just return this sub-stream.  Otherwise
   // create a new sub-stream.
-  Stream *GetOrCreateSubStream();
+  Stream *GetOrCreateSubStream() LOCKS_EXCLUDED(mu_);
 
   // Return the sub-stream back to the host stream so that it can be reused
   // later.
-  void ReturnSubStream(Stream *sub_stream);
+  void ReturnSubStream(Stream *sub_stream) LOCKS_EXCLUDED(mu_);
 
   // Allocate temporary memories. The stream will deallocate them when blocked
   // or destroyed.
@@ -234,7 +234,7 @@ class Stream {
   Stream &ThenBatchNormalizationBackward(
       const DeviceMemory<float> &y_backprop, const DeviceMemory<float> &x,
       const DeviceMemory<float> &scale, const DeviceMemory<float> &mean,
-      const DeviceMemory<float> &variance, const dnn::BatchDescriptor &x_desc,
+      const DeviceMemory<float> &inv_var, const dnn::BatchDescriptor &x_desc,
       const dnn::BatchDescriptor &scale_offset_desc, const double epsilon,
       DeviceMemory<float> *x_backprop, DeviceMemory<float> *scale_backprop,
       DeviceMemory<float> *offset_backprop);
@@ -255,7 +255,7 @@ class Stream {
   Stream &ThenBatchNormalizationBackward(
       const DeviceMemory<Eigen::half> &y_backprop,
       const DeviceMemory<Eigen::half> &x, const DeviceMemory<float> &scale,
-      const DeviceMemory<float> &mean, const DeviceMemory<float> &variance,
+      const DeviceMemory<float> &mean, const DeviceMemory<float> &inv_var,
       const dnn::BatchDescriptor &x_desc,
       const dnn::BatchDescriptor &scale_offset_desc, const double epsilon,
       DeviceMemory<Eigen::half> *x_backprop,
@@ -1903,8 +1903,9 @@ class Stream {
   // entrained on the stream (enqueued to this point in program
   // execution) to complete.
   //
-  // Returns true if the stream is ok().
-  bool BlockHostUntilDone();
+  // Returns an OK status if the blocking was successful and the stream is ok().
+  // Otherwise returns an error describing why the blocking failed.
+  port::Status BlockHostUntilDone() LOCKS_EXCLUDED(mu_);
 
   // Warning! This method interacts with internal threads in
   // sometimes-unpredictable ways and is intended for GPU-Executor-internal
@@ -1960,14 +1961,14 @@ class Stream {
   friend struct ThenBlasImpl;  // for implementing ThenBlasXXX.
   friend class ocl::CLBlas;    // for parent_.
 
-  bool InErrorState() const {
+  bool InErrorState() const LOCKS_EXCLUDED(mu_) {
     tf_shared_lock lock{mu_};
     return !ok_;
   }
 
   // Sets the error state if operation_retcode is false.
   // This is a useful shorthand for many stream routines.
-  void CheckError(bool operation_retcode) {
+  void CheckError(bool operation_retcode) LOCKS_EXCLUDED(mu_) {
     if (operation_retcode) {
       return;
     }
diff --git a/tensorflow/stream_executor/stream_executor_internal.cc b/tensorflow/stream_executor/stream_executor_internal.cc
index 95b285b992df91eb1adc01423bb07e2298dba9c4..273d970b6fa4a581381689191b183a30f4f2bcd3 100644
--- a/tensorflow/stream_executor/stream_executor_internal.cc
+++ b/tensorflow/stream_executor/stream_executor_internal.cc
@@ -15,9 +15,6 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-#include "tensorflow/stream_executor/lib/statusor.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
-
 namespace perftools {
 namespace gputools {
 namespace internal {
@@ -40,7 +37,6 @@ StreamExecutorFactory* MakeOpenCLExecutorImplementation() {
 
 StreamExecutorFactory MakeHostExecutorImplementation;
 
-
 }  // namespace internal
 }  // namespace gputools
 }  // namespace perftools
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index 14445a7657be10a6d3d93ef0aabebcfa17d38b72..37ef182e1445a85dd0a97eac02ba064a26dc0f1d 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -219,7 +219,7 @@ class StreamExecutorInterface {
   virtual void DeallocateTimer(Timer *timer) = 0;
   virtual bool StartTimer(Stream *stream, Timer *timer) = 0;
   virtual bool StopTimer(Stream *stream, Timer *timer) = 0;
-  virtual bool BlockHostUntilDone(Stream *stream) = 0;
+  virtual port::Status BlockHostUntilDone(Stream *stream) = 0;
   virtual int PlatformDeviceCount() = 0;
   virtual port::Status EnablePeerAccessTo(StreamExecutorInterface *other) = 0;
   virtual bool CanEnablePeerAccessTo(StreamExecutorInterface *other) = 0;
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index 76afb85068bafb805678a9bc03b55b2efa1523c6..afca1c2e597b55b1b8d0b76d4e79995d6f6af822 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -432,8 +432,8 @@ bool StreamExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
   return implementation_->Launch(stream, thread_dims, block_dims, kernel, args);
 }
 
-bool StreamExecutor::BlockHostUntilDone(Stream *stream) {
-  bool result;
+port::Status StreamExecutor::BlockHostUntilDone(Stream *stream) {
+  port::Status result;
   SCOPED_TRACE(TraceListener::BlockHostUntilDone, &result, stream);
 
   result = implementation_->BlockHostUntilDone(stream);
@@ -566,19 +566,18 @@ port::Status StreamExecutor::SynchronousMemcpyD2H(
           << device_src.opaque() << ", size=" << size
           << ", host_dst=" << host_dst << ")" << StackTraceIfVLOG10();
 
-  port::Status result{port::Status::OK()};
+  port::Status result;
   SCOPED_TRACE(TraceListener::SynchronousMemcpyD2H, &result, device_src, size,
                host_dst);
 
-  port::Status status =
-      implementation_->SynchronousMemcpy(host_dst, device_src, size);
-  if (!status.ok()) {
-    return port::Status{port::error::INTERNAL,
-                        port::Printf("failed to synchronously memcpy "
-                                     "device-to-host: device %p to host %p "
-                                     "size %lld: %s",
-                                     device_src.opaque(), host_dst, size,
-                                     status.ToString().c_str())};
+  result = implementation_->SynchronousMemcpy(host_dst, device_src, size);
+  if (!result.ok()) {
+    result = port::Status{port::error::INTERNAL,
+                          port::Printf("failed to synchronously memcpy "
+                                       "device-to-host: device %p to host %p "
+                                       "size %lld: %s",
+                                       device_src.opaque(), host_dst, size,
+                                       result.ToString().c_str())};
   }
 
   return result;
@@ -590,19 +589,18 @@ port::Status StreamExecutor::SynchronousMemcpyH2D(
           << ", size=" << size << ", device_dst" << device_dst->opaque() << ")"
           << StackTraceIfVLOG10();
 
-  port::Status result{port::Status::OK()};
+  port::Status result;
   SCOPED_TRACE(TraceListener::SynchronousMemcpyH2D, &result, host_src, size,
                device_dst);
 
-  port::Status status =
-      implementation_->SynchronousMemcpy(device_dst, host_src, size);
-  if (!status.ok()) {
+  result = implementation_->SynchronousMemcpy(device_dst, host_src, size);
+  if (!result.ok()) {
     result = port::Status{
         port::error::INTERNAL,
         port::Printf("failed to synchronously memcpy host-to-device: host "
                      "%p to device %p size %lld: %s",
                      host_src, device_dst->opaque(), size,
-                     status.ToString().c_str())};
+                     result.ToString().c_str())};
   }
 
   return result;
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index 66c50d47e95fe4e9bf6df24cd61139630000cefb..a2a77218cbbafeeb9d4d8ca04b2e0a8a5024ebf9 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -481,7 +481,7 @@ class StreamExecutor {
   // Causes the host code to synchronously wait for operations entrained onto
   // stream to complete. Effectively a join on the asynchronous device
   // operations enqueued on the stream before this program point.
-  bool BlockHostUntilDone(Stream *stream);
+  port::Status BlockHostUntilDone(Stream *stream);
 
   // Synchronously allocates size bytes on the underlying platform and returns
   // an opaque void* representing that allocation. In the case of failure,
diff --git a/tensorflow/stream_executor/trace_listener.h b/tensorflow/stream_executor/trace_listener.h
index 88c54f982b3cfde925dbe0ca4f7bc3a738e5f3ac..d1e87c348b1f867009fdb6b741d984b2f58cef21 100644
--- a/tensorflow/stream_executor/trace_listener.h
+++ b/tensorflow/stream_executor/trace_listener.h
@@ -65,7 +65,8 @@ class TraceListener {
                                             const port::Status* result) {}
 
   virtual void BlockHostUntilDoneBegin(int64 correlation_id, Stream* stream) {}
-  virtual void BlockHostUntilDoneComplete(int64 correlation_id, bool result) {}
+  virtual void BlockHostUntilDoneComplete(int64 correlation_id,
+                                          const port::Status* result) {}
 };
 
 }  // namespace gputools
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 8d392fb36dccacd48f77615dbd827fbf8564c69c..2ead85d26d6409182842e2b875cd7008e0105bd1 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1,45 +1,47 @@
 # -*- Python -*-
 
-
 # Return the options to use for a C++ library or binary build.
 # Uses the ":optmode" config_setting to pick the options.
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
     "tf_cuda_tests_tags",
     "tf_sycl_tests_tags",
+    "tf_additional_grpc_deps_py",
     "tf_additional_xla_deps_py",
-    "if_static",)
+    "if_static",
+)
+load(
+    "@local_config_tensorrt//:build_defs.bzl",
+    "if_tensorrt",
+)
 load(
     "@local_config_cuda//cuda:build_defs.bzl",
     "if_cuda",
-    "cuda_default_copts",)
-
+    "cuda_default_copts",
+)
 load(
     "//third_party/mkl:build_defs.bzl",
-    "if_mkl",)
+    "if_mkl",
+)
 
 def register_extension_info(**kwargs):
     pass
 
-
 # Given a source file, generate a test name.
 # i.e. "common_runtime/direct_session_test.cc" becomes
 #      "common_runtime_direct_session_test"
 def src_to_test_name(src):
   return src.replace("/", "_").split(".")[0]
 
-
 def full_path(relative_paths):
   return [PACKAGE_NAME + "/" + relative for relative in relative_paths]
 
-
 # List of proto files for android builds
 def tf_android_core_proto_sources(core_proto_sources_relative):
   return [
       "//tensorflow/core:" + p for p in core_proto_sources_relative
   ]
 
-
 # Returns the list of pb.h and proto.h headers that are generated for
 # tf_android_core_proto_sources().
 def tf_android_core_proto_headers(core_proto_sources_relative):
@@ -51,13 +53,11 @@ def tf_android_core_proto_headers(core_proto_sources_relative):
       for p in core_proto_sources_relative
   ])
 
-
 # Sanitize a dependency so that it works correctly from code that includes
 # TensorFlow as a submodule.
 def clean_dep(dep):
   return str(Label(dep))
 
-
 def if_android_x86(a):
   return select({
       clean_dep("//tensorflow:android_x86"): a,
@@ -65,35 +65,30 @@ def if_android_x86(a):
       "//conditions:default": [],
   })
 
-
 def if_android_arm(a):
   return select({
       clean_dep("//tensorflow:android_arm"): a,
       "//conditions:default": [],
   })
 
-
 def if_android_arm64(a):
   return select({
       clean_dep("//tensorflow:android_arm64"): a,
       "//conditions:default": [],
   })
 
-
 def if_android_mips(a):
   return select({
       clean_dep("//tensorflow:android_mips"): a,
       "//conditions:default": [],
   })
 
-
 def if_not_android(a):
   return select({
       clean_dep("//tensorflow:android"): [],
       "//conditions:default": a,
   })
 
-
 def if_not_android_mips_and_mips64(a):
   return select({
       clean_dep("//tensorflow:android_mips"): [],
@@ -101,20 +96,23 @@ def if_not_android_mips_and_mips64(a):
       "//conditions:default": a,
   })
 
-
 def if_android(a):
   return select({
       clean_dep("//tensorflow:android"): a,
       "//conditions:default": [],
   })
 
-
 def if_ios(a):
   return select({
       clean_dep("//tensorflow:ios"): a,
       "//conditions:default": [],
   })
 
+def if_ios_x86_64(a):
+  return select({
+      clean_dep("//tensorflow:ios_x86_64"): a,
+      "//conditions:default": [],
+  })
 
 def if_mobile(a):
   return select({
@@ -123,7 +121,6 @@ def if_mobile(a):
       "//conditions:default": [],
   })
 
-
 def if_not_mobile(a):
   return select({
       clean_dep("//tensorflow:android"): [],
@@ -131,7 +128,6 @@ def if_not_mobile(a):
       "//conditions:default": a,
   })
 
-
 def if_not_windows(a):
   return select({
       clean_dep("//tensorflow:windows"): [],
@@ -139,6 +135,12 @@ def if_not_windows(a):
       "//conditions:default": a,
   })
 
+def if_windows(a):
+  return select({
+      clean_dep("//tensorflow:windows"): a,
+      clean_dep("//tensorflow:windows_msvc"): a,
+      "//conditions:default": [],
+  })
 
 def if_linux_x86_64(a):
   return select({
@@ -152,22 +154,45 @@ def if_darwin(a):
       "//conditions:default": [],
   })
 
-WIN_COPTS = [
-    "/DLANG_CXX11",
-    "/D__VERSION__=\\\"MSVC\\\"",
-    "/DPLATFORM_WINDOWS",
-    "/DTF_COMPILE_LIBRARY",
-    "/DEIGEN_HAS_C99_MATH",
-    "/DTENSORFLOW_USE_EIGEN_THREADPOOL",
-    "/DEIGEN_AVOID_STL_ARRAY",
-    "/Iexternal/gemmlowp",
-    "/wd4018", # -Wno-sign-compare
-    "/U_HAS_EXCEPTIONS", "/D_HAS_EXCEPTIONS=1", "/EHsc", # -fno-exceptions
-    "/DNOGDI",
-]
+def if_override_eigen_strong_inline(a):
+  return select({
+      clean_dep("//tensorflow:override_eigen_strong_inline"): a,
+      "//conditions:default": [],
+  })
+
+def get_win_copts(is_external=False):
+    WINDOWS_COPTS = [
+        "/D__VERSION__=\\\"MSVC\\\"",
+        "/DPLATFORM_WINDOWS",
+        "/DEIGEN_HAS_C99_MATH",
+        "/DTENSORFLOW_USE_EIGEN_THREADPOOL",
+        "/DEIGEN_AVOID_STL_ARRAY",
+        "/Iexternal/gemmlowp",
+        "/wd4018",  # -Wno-sign-compare
+        "/U_HAS_EXCEPTIONS",
+        "/D_HAS_EXCEPTIONS=1",
+        "/EHsc",  # -fno-exceptions
+        "/DNOGDI",
+    ]
+    if is_external:
+      return WINDOWS_COPTS + ["/UTF_COMPILE_LIBRARY"]
+    else:
+      return WINDOWS_COPTS + ["/DTF_COMPILE_LIBRARY"]
 
 # LINT.IfChange
-def tf_copts():
+def tf_copts(android_optimization_level_override="-O2", is_external=False):
+  # For compatibility reasons, android_optimization_level_override
+  # is currently only being set for Android.
+  # To clear this value, and allow the CROSSTOOL default
+  # to be used, pass android_optimization_level_override=None
+  android_copts = [
+      "-std=c++11",
+      "-DTF_LEAN_BINARY",
+      "-Wno-narrowing",
+      "-fomit-frame-pointer",
+  ]
+  if android_optimization_level_override:
+    android_copts.append(android_optimization_level_override)
   return (
       if_not_windows([
           "-DEIGEN_AVOID_STL_ARRAY",
@@ -176,25 +201,31 @@ def tf_copts():
           "-fno-exceptions",
           "-ftemplate-depth=900"])
       + if_cuda(["-DGOOGLE_CUDA=1"])
+      + if_tensorrt(["-DGOOGLE_TENSORRT=1"])
       + if_mkl(["-DINTEL_MKL=1", "-DEIGEN_USE_VML", "-fopenmp",])
       + if_android_arm(["-mfpu=neon"])
       + if_linux_x86_64(["-msse3"])
+      + if_ios_x86_64(["-msse4.1"])
+      + select({
+            clean_dep("//tensorflow:framework_shared_object"): [],
+            "//conditions:default": ["-DTENSORFLOW_MONOLITHIC_BUILD"],
+      })
       + select({
-            clean_dep("//tensorflow:android"): [
-                "-std=c++11",
-                "-DTF_LEAN_BINARY",
-                "-O2",
-                "-Wno-narrowing",
-                "-fomit-frame-pointer",
-            ],
+            clean_dep("//tensorflow:android"): android_copts,
             clean_dep("//tensorflow:darwin"): [],
-            clean_dep("//tensorflow:windows"): WIN_COPTS,
-            clean_dep("//tensorflow:windows_msvc"): WIN_COPTS,
+            clean_dep("//tensorflow:windows"): get_win_copts(is_external),
+            clean_dep("//tensorflow:windows_msvc"): get_win_copts(is_external),
             clean_dep("//tensorflow:ios"): ["-std=c++11"],
             "//conditions:default": ["-pthread"]
       }))
 
 
+def tfe_xla_copts():
+  return select({
+      "//tensorflow:with_xla_support": ["-DTENSORFLOW_EAGER_USE_XLA"],
+      "//conditions:default": [],
+  })
+
 def tf_opts_nortti_if_android():
   return if_android([
       "-fno-rtti",
@@ -202,13 +233,11 @@ def tf_opts_nortti_if_android():
       "-DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER",
   ])
 
-
 # LINT.ThenChange(//tensorflow/contrib/android/cmake/CMakeLists.txt)
 
-
 # Given a list of "op_lib_names" (a list of files in the ops directory
 # without their .cc extensions), generate a library for that file.
-def tf_gen_op_libs(op_lib_names, deps=None):
+def tf_gen_op_libs(op_lib_names, deps=None, is_external=True):
   # Make library out of each op so it can also be used to generate wrappers
   # for various languages.
   if not deps:
@@ -216,20 +245,18 @@ def tf_gen_op_libs(op_lib_names, deps=None):
   for n in op_lib_names:
     native.cc_library(
         name=n + "_op_lib",
-        copts=tf_copts(),
+        copts=tf_copts(is_external=is_external),
         srcs=["ops/" + n + ".cc"],
         deps=deps + [clean_dep("//tensorflow/core:framework")],
         visibility=["//visibility:public"],
         alwayslink=1,
         linkstatic=1,)
 
-
 def _make_search_paths(prefix, levels_to_root):
   return ",".join(
       ["-rpath,%s/%s" % (prefix, "/".join([".."] * search_level))
        for search_level in range(levels_to_root + 1)])
 
-
 def _rpath_linkopts(name):
   # Search parent directories up to the TensorFlow root directory for shared
   # object dependencies, even if this op shared object is deeply nested
@@ -243,12 +270,13 @@ def _rpath_linkopts(name):
       clean_dep("//tensorflow:darwin"): [
           "-Wl,%s" % (_make_search_paths("@loader_path", levels_to_root),),
       ],
+      clean_dep("//tensorflow:windows"): [],
+      clean_dep("//tensorflow:windows_msvc"): [],
       "//conditions:default": [
           "-Wl,%s" % (_make_search_paths("$$ORIGIN", levels_to_root),),
       ],
   })
 
-
 # Bazel-generated shared objects which must be linked into TensorFlow binaries
 # to define symbols from //tensorflow/core:framework and //tensorflow/core:lib.
 def tf_binary_additional_srcs():
@@ -258,7 +286,6 @@ def tf_binary_additional_srcs():
           clean_dep("//tensorflow:libtensorflow_framework.so"),
       ])
 
-
 def tf_cc_shared_object(
     name,
     srcs=[],
@@ -276,14 +303,15 @@ def tf_cc_shared_object(
               "-Wl,-install_name,@rpath/" + name.split("/")[-1],
           ],
           "//conditions:default": [
+              "-Wl,-soname," + name.split("/")[-1],
           ],
       }),
       **kwargs)
 
 register_extension_info(
-    extension_name="tf_cc_shared_object",
-    label_regex_for_dep="{extension_name}")
-
+    extension_name = "tf_cc_shared_object",
+    label_regex_for_dep = "{extension_name}",
+)
 
 # Links in the framework shared object
 # (//third_party/tensorflow:libtensorflow_framework.so) when not building
@@ -293,9 +321,11 @@ def tf_cc_binary(name,
                  srcs=[],
                  deps=[],
                  linkopts=[],
+                 copts=tf_copts(),
                  **kwargs):
   native.cc_binary(
       name=name,
+      copts=copts,
       srcs=srcs + tf_binary_additional_srcs(),
       deps=deps + if_mkl(
           [
@@ -306,16 +336,15 @@ def tf_cc_binary(name,
       **kwargs)
 
 register_extension_info(
-    extension_name="tf_cc_binary",
-    label_regex_for_dep="{extension_name}.*")
-
+    extension_name = "tf_cc_binary",
+    label_regex_for_dep = "{extension_name}.*",
+)
 
 def tf_gen_op_wrapper_cc(name,
                          out_ops_file,
                          pkg="",
                          op_gen=clean_dep("//tensorflow/cc:cc_op_gen_main"),
                          deps=None,
-                         override_file=None,
                          include_internal_ops=0,
                          # ApiDefs will be loaded in the order specified in this list.
                          api_def_srcs=[]):
@@ -326,18 +355,12 @@ def tf_gen_op_wrapper_cc(name,
   tf_cc_binary(
       name=tool,
       copts=tf_copts(),
-      linkopts=["-lm"],
+      linkopts=if_not_windows(["-lm"]),
       linkstatic=1,  # Faster to link this one-time-use binary dynamically
       deps=[op_gen] + deps)
 
   srcs = api_def_srcs[:]
 
-  if override_file == None:
-    override_arg = ","
-  else:
-    srcs += [override_file]
-    override_arg = "$(location " + override_file + ")"
-
   if not api_def_srcs:
     api_def_args_str = ","
   else:
@@ -350,6 +373,7 @@ def tf_gen_op_wrapper_cc(name,
           " $$(dirname $$(echo $(locations " + api_def_src +
           ") | cut -d\" \" -f1))")
     api_def_args_str = ",".join(api_def_args)
+
   native.genrule(
       name=name + "_genrule",
       outs=[
@@ -359,10 +383,9 @@ def tf_gen_op_wrapper_cc(name,
       srcs=srcs,
       tools=[":" + tool] + tf_binary_additional_srcs(),
       cmd=("$(location :" + tool + ") $(location :" + out_ops_file + ".h) " +
-           "$(location :" + out_ops_file + ".cc) " + override_arg + " " +
+           "$(location :" + out_ops_file + ".cc) " +
            str(include_internal_ops) + " " + api_def_args_str))
 
-
 # Given a list of "op_lib_names" (a list of files in the ops directory
 # without their .cc extensions), generate individual C++ .cc and .h
 # files for each of the ops files mentioned, and then generate a
@@ -401,7 +424,6 @@ def tf_gen_op_wrappers_cc(name,
                               clean_dep("//tensorflow/cc:const_op"),
                           ],
                           op_gen=clean_dep("//tensorflow/cc:cc_op_gen_main"),
-                          override_file=None,
                           include_internal_ops=0,
                           visibility=None,
                           # ApiDefs will be loaded in the order apecified in this list.
@@ -416,7 +438,6 @@ def tf_gen_op_wrappers_cc(name,
         "ops/" + n,
         pkg=pkg,
         op_gen=op_gen,
-        override_file=override_file,
         include_internal_ops=include_internal_ops,
         api_def_srcs=api_def_srcs)
     subsrcs += ["ops/" + n + ".cc"]
@@ -455,7 +476,6 @@ def tf_gen_op_wrappers_cc(name,
       alwayslink=1,
       visibility=[clean_dep("//tensorflow:internal")])
 
-
 # Generates a Python library target wrapping the ops registered in "deps".
 #
 # Args:
@@ -476,6 +496,11 @@ def tf_gen_op_wrappers_cc(name,
 #     "name" arg)
 #   op_whitelist: if not empty, only op names in this list will be wrapped. It
 #     is invalid to specify both "hidden" and "op_whitelist".
+#   cc_linkopts: Optional linkopts to be added to tf_cc_binary that contains the
+#     specified ops.
+#   gen_locally: if True, the genrule to generate the Python library will be run
+#     without sandboxing. This would help when the genrule depends on symlinks
+#     which may not be supported in the sandbox.
 def tf_gen_op_wrapper_py(name,
                          out=None,
                          hidden=None,
@@ -484,7 +509,10 @@ def tf_gen_op_wrapper_py(name,
                          require_shape_functions=False,
                          hidden_file=None,
                          generated_target_name=None,
-                         op_whitelist=[]):
+                         op_whitelist=[],
+                         cc_linkopts=[],
+                         api_def_srcs=[],
+                         gen_locally=False):
   if (hidden or hidden_file) and op_whitelist:
     fail('Cannot pass specify both hidden and op_whitelist.')
 
@@ -494,7 +522,7 @@ def tf_gen_op_wrapper_py(name,
     deps = [str(Label("//tensorflow/core:" + name + "_op_lib"))]
   tf_cc_binary(
       name=tool_name,
-      linkopts=["-lm"],
+      linkopts=if_not_windows(["-lm"]) + cc_linkopts,
       copts=tf_copts(),
       linkstatic=1,  # Faster to link this one-time-use binary dynamically
       deps=([
@@ -517,22 +545,41 @@ def tf_gen_op_wrapper_py(name,
     op_list_arg = "''"
     op_list_is_whitelist = False
 
+  # Prepare ApiDef directories to pass to the genrule.
+  if not api_def_srcs:
+    api_def_args_str = ","
+  else:
+    api_def_args = []
+    for api_def_src in api_def_srcs:
+      # Add directory of the first ApiDef source to args.
+      # We are assuming all ApiDefs in a single api_def_src are in the
+      # same directory.
+      api_def_args.append(
+          "$$(dirname $$(echo $(locations " + api_def_src +
+          ") | cut -d\" \" -f1))")
+    api_def_args_str = ",".join(api_def_args)
+
   if hidden_file:
     # `hidden_file` is file containing a list of op names to be hidden in the
     # generated module.
     native.genrule(
         name=name + "_pygenrule",
         outs=[out],
-        srcs=[hidden_file],
+        srcs=api_def_srcs + [hidden_file],
         tools=[tool_name] + tf_binary_additional_srcs(),
-        cmd=("$(location " + tool_name + ") @$(location " + hidden_file + ") " +
+        local = (1 if gen_locally else 0),
+        cmd=("$(location " + tool_name + ") " + api_def_args_str +
+             " @$(location " + hidden_file + ") " +
              ("1" if require_shape_functions else "0") + " > $@"))
   else:
     native.genrule(
         name=name + "_pygenrule",
         outs=[out],
+        srcs=api_def_srcs,
         tools=[tool_name] + tf_binary_additional_srcs(),
-        cmd=("$(location " + tool_name + ") " + op_list_arg + " " +
+        local = (1 if gen_locally else 0),
+        cmd=("$(location " + tool_name + ") " + api_def_args_str + " " +
+             op_list_arg + " " +
              ("1" if require_shape_functions else "0") + " " +
              ("1" if op_list_is_whitelist else "0") + " > $@"))
 
@@ -548,7 +595,6 @@ def tf_gen_op_wrapper_py(name,
           clean_dep("//tensorflow/python:framework_for_generated_wrappers_v2"),
       ],)
 
-
 # Define a bazel macro that creates cc_test for tensorflow.
 #
 # Links in the framework shared object
@@ -571,7 +617,17 @@ def tf_cc_test(name,
       name="%s%s" % (name, suffix),
       srcs=srcs + tf_binary_additional_srcs(),
       copts=tf_copts() + extra_copts,
-      linkopts=["-lpthread", "-lm"] + linkopts + _rpath_linkopts(name),
+      linkopts=select({
+        "//tensorflow:android": [
+            "-pie",
+          ],
+        clean_dep("//tensorflow:windows"): [],
+        clean_dep("//tensorflow:windows_msvc"): [],
+        "//conditions:default": [
+            "-lpthread",
+            "-lm"
+        ],
+      }) + linkopts + _rpath_linkopts(name),
       deps=deps + if_mkl(
           [
               "//third_party/mkl:intel_binary_blob",
@@ -591,9 +647,9 @@ def tf_cc_test(name,
       **kwargs)
 
 register_extension_info(
-    extension_name="tf_cc_test",
-    label_regex_for_dep="{extension_name}.*")
-
+    extension_name = "tf_cc_test",
+    label_regex_for_dep = "{extension_name}.*",
+)
 
 # Part of the testing workflow requires a distinguishable name for the build
 # rules that involve a GPU, even if otherwise identical to the base rule.
@@ -618,9 +674,9 @@ def tf_cc_test_gpu(name,
       args=args)
 
 register_extension_info(
-    extension_name="tf_cc_test_gpu",
-    label_regex_for_dep="{extension_name}")
-
+    extension_name = "tf_cc_test_gpu",
+    label_regex_for_dep = "{extension_name}",
+)
 
 def tf_cuda_cc_test(name,
                     srcs=[],
@@ -628,6 +684,7 @@ def tf_cuda_cc_test(name,
                     tags=[],
                     data=[],
                     size="medium",
+                    extra_copts=[],
                     linkstatic=0,
                     args=[],
                     linkopts=[]):
@@ -638,6 +695,7 @@ def tf_cuda_cc_test(name,
       tags=tags + ["manual"],
       data=data,
       size=size,
+      extra_copts=extra_copts,
       linkstatic=linkstatic,
       linkopts=linkopts,
       args=args)
@@ -658,13 +716,14 @@ def tf_cuda_cc_test(name,
       tags=tags + tf_cuda_tests_tags(),
       data=data,
       size=size,
+      extra_copts=extra_copts,
       linkopts=linkopts,
       args=args)
 
 register_extension_info(
-    extension_name="tf_cuda_cc_test",
-    label_regex_for_dep="{extension_name}")
-
+    extension_name = "tf_cuda_cc_test",
+    label_regex_for_dep = "{extension_name}",
+)
 
 def tf_cuda_only_cc_test(name,
                     srcs=[],
@@ -685,7 +744,7 @@ def tf_cuda_only_cc_test(name,
       deps=deps + if_cuda([
           clean_dep("//tensorflow/core:cuda"),
           clean_dep("//tensorflow/core:gpu_lib")]),
-      linkopts=["-lpthread", "-lm"] + linkopts + _rpath_linkopts(name),
+      linkopts=if_not_windows(["-lpthread", "-lm"]) + linkopts + _rpath_linkopts(name),
       linkstatic=linkstatic or select({
           # cc_tests with ".so"s in srcs incorrectly link on Darwin
           # unless linkstatic=1.
@@ -696,9 +755,9 @@ def tf_cuda_only_cc_test(name,
       tags=tags + tf_cuda_tests_tags())
 
 register_extension_info(
-    extension_name="tf_cuda_only_cc_test",
-    label_regex_for_dep="{extension_name}_gpu")
-
+    extension_name = "tf_cuda_only_cc_test",
+    label_regex_for_dep = "{extension_name}_gpu",
+)
 
 # Create a cc_test for each of the tensorflow tests listed in "tests"
 def tf_cc_tests(srcs,
@@ -722,7 +781,6 @@ def tf_cc_tests(srcs,
         linkopts=linkopts,
         nocopts=nocopts)
 
-
 def tf_cc_test_mkl(srcs,
                    deps,
                    name="",
@@ -732,7 +790,6 @@ def tf_cc_test_mkl(srcs,
                    args=None):
   if_mkl(tf_cc_tests(srcs, deps, name, linkstatic=linkstatic, tags=tags, size=size, args=args, nocopts="-fno-exceptions"))
 
-
 def tf_cc_tests_gpu(srcs,
                     deps,
                     name="",
@@ -742,7 +799,6 @@ def tf_cc_tests_gpu(srcs,
                     args=None):
   tf_cc_tests(srcs, deps, linkstatic, tags=tags, size=size, args=args)
 
-
 def tf_cuda_cc_tests(srcs,
                      deps,
                      name="",
@@ -775,9 +831,9 @@ def tf_java_test(name,
       **kwargs)
 
 register_extension_info(
-    extension_name="tf_java_test",
-    label_regex_for_dep="{extension_name}")
-
+    extension_name = "tf_java_test",
+    label_regex_for_dep = "{extension_name}",
+)
 
 def _cuda_copts():
   """Gets the appropriate set of copts for (maybe) CUDA compilation.
@@ -797,10 +853,8 @@ def _cuda_copts():
       ]),
   })
 
-
 # Build defs for TensorFlow kernels
 
-
 # When this target is built using --config=cuda, a cc_library is built
 # that passes -DGOOGLE_CUDA=1 and '-x cuda', linking in additional
 # libraries needed by GPU kernels.
@@ -824,18 +878,20 @@ def tf_gpu_kernel_library(srcs,
       **kwargs)
 
 register_extension_info(
-    extension_name="tf_gpu_kernel_library",
-    label_regex_for_dep="{extension_name}")
-
+    extension_name = "tf_gpu_kernel_library",
+    label_regex_for_dep = "{extension_name}",
+)
 
-def tf_cuda_library(deps=None, cuda_deps=None, copts=None, **kwargs):
+def tf_cuda_library(deps=None, cuda_deps=None, copts=tf_copts(), **kwargs):
   """Generate a cc_library with a conditional set of CUDA dependencies.
 
   When the library is built with --config=cuda:
 
-  - both deps and cuda_deps are used as dependencies
-  - the cuda runtime is added as a dependency (if necessary)
-  - The library additionally passes -DGOOGLE_CUDA=1 to the list of copts
+  - Both deps and cuda_deps are used as dependencies.
+  - The cuda runtime is added as a dependency (if necessary).
+  - The library additionally passes -DGOOGLE_CUDA=1 to the list of copts.
+  - In addition, when the library is also built with TensorRT enabled, it
+      additionally passes -DGOOGLE_TENSORRT=1 to the list of copts.
 
   Args:
   - cuda_deps: BUILD dependencies which will be linked if and only if:
@@ -848,22 +904,20 @@ def tf_cuda_library(deps=None, cuda_deps=None, copts=None, **kwargs):
     deps = []
   if not cuda_deps:
     cuda_deps = []
-  if not copts:
-    copts = []
 
   native.cc_library(
       deps=deps + if_cuda(cuda_deps + [
           clean_dep("//tensorflow/core:cuda"),
           "@local_config_cuda//cuda:cuda_headers"
       ]),
-      copts=copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1"]),
+      copts=(copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1"]) +
+             if_tensorrt(["-DGOOGLE_TENSORRT=1"])),
       **kwargs)
 
 register_extension_info(
-    extension_name="tf_cuda_library",
-    label_regex_for_dep="{extension_name}")
-
-
+    extension_name = "tf_cuda_library",
+    label_regex_for_dep = "{extension_name}",
+)
 
 def tf_kernel_library(name,
                       prefix=None,
@@ -872,7 +926,8 @@ def tf_kernel_library(name,
                       hdrs=None,
                       deps=None,
                       alwayslink=1,
-                      copts=tf_copts(),
+                      copts=None,
+                      is_external=False,
                       **kwargs):
   """A rule to build a TensorFlow OpKernel.
 
@@ -901,7 +956,9 @@ def tf_kernel_library(name,
     hdrs = []
   if not deps:
     deps = []
-
+  if not copts:
+    copts = []
+  copts = copts + tf_copts(is_external=is_external)
   if prefix:
     if native.glob([prefix + "*.cu.cc"], exclude=["*test*"]):
       if not gpu_srcs:
@@ -934,9 +991,9 @@ def tf_kernel_library(name,
       **kwargs)
 
 register_extension_info(
-    extension_name="tf_kernel_library",
-    label_regex_for_dep="{extension_name}(_gpu)?")
-
+    extension_name = "tf_kernel_library",
+    label_regex_for_dep = "{extension_name}(_gpu)?",
+)
 
 def tf_mkl_kernel_library(name,
                           prefix=None,
@@ -975,9 +1032,9 @@ def tf_mkl_kernel_library(name,
       ))
 
 register_extension_info(
-    extension_name="tf_mkl_kernel_library",
-    label_regex_for_dep="{extension_name}")
-
+    extension_name = "tf_mkl_kernel_library",
+    label_regex_for_dep = "{extension_name}",
+)
 
 # Bazel rules for building swig files.
 def _py_wrap_cc_impl(ctx):
@@ -1011,44 +1068,41 @@ def _py_wrap_cc_impl(ctx):
       progress_message="SWIGing " + src.path)
   return struct(files=depset(outputs))
 
-
 _py_wrap_cc = rule(
-    attrs={
-        "srcs":
-            attr.label_list(
-                mandatory=True,
-                allow_files=True,),
-        "swig_includes":
-            attr.label_list(
-                cfg="data",
-                allow_files=True,),
-        "deps":
-            attr.label_list(
-                allow_files=True,
-                providers=["cc"],),
-        "toolchain_deps":
-            attr.label_list(
-                allow_files=True,),
-        "module_name":
-            attr.string(mandatory=True),
-        "py_module_name":
-            attr.string(mandatory=True),
-        "_swig":
-            attr.label(
-                default=Label("@swig//:swig"),
-                executable=True,
-                cfg="host",),
-        "_swiglib":
-            attr.label(
-                default=Label("@swig//:templates"),
-                allow_files=True,),
+    attrs = {
+        "srcs": attr.label_list(
+            mandatory = True,
+            allow_files = True,
+        ),
+        "swig_includes": attr.label_list(
+            cfg = "data",
+            allow_files = True,
+        ),
+        "deps": attr.label_list(
+            allow_files = True,
+            providers = ["cc"],
+        ),
+        "toolchain_deps": attr.label_list(
+            allow_files = True,
+        ),
+        "module_name": attr.string(mandatory = True),
+        "py_module_name": attr.string(mandatory = True),
+        "_swig": attr.label(
+            default = Label("@swig//:swig"),
+            executable = True,
+            cfg = "host",
+        ),
+        "_swiglib": attr.label(
+            default = Label("@swig//:templates"),
+            allow_files = True,
+        ),
     },
-    outputs={
+    outputs = {
         "cc_out": "%{module_name}.cc",
         "py_out": "%{py_module_name}.py",
     },
-    implementation=_py_wrap_cc_impl,)
-
+    implementation = _py_wrap_cc_impl,
+)
 
 def _get_repository_roots(ctx, files):
   """Returns abnormal root directories under which files reside.
@@ -1079,7 +1133,6 @@ def _get_repository_roots(ctx, files):
       result[root] -= 1
   return [k for v, k in sorted([(v, k) for k, v in result.items()])]
 
-
 # Bazel rule for collecting the header files that a target depends on.
 def _transitive_hdrs_impl(ctx):
   outputs = depset()
@@ -1087,21 +1140,20 @@ def _transitive_hdrs_impl(ctx):
     outputs += dep.cc.transitive_headers
   return struct(files=outputs)
 
-
 _transitive_hdrs = rule(
-    attrs={
+    attrs = {
         "deps": attr.label_list(
-            allow_files=True,
-            providers=["cc"],),
+            allow_files = True,
+            providers = ["cc"],
+        ),
     },
-    implementation=_transitive_hdrs_impl,)
-
+    implementation = _transitive_hdrs_impl,
+)
 
 def transitive_hdrs(name, deps=[], **kwargs):
   _transitive_hdrs(name=name + "_gather", deps=deps)
   native.filegroup(name=name, srcs=[":" + name + "_gather"])
 
-
 # Create a header only library that includes all the headers exported by
 # the libraries in deps.
 def cc_header_only_library(name, deps=[], includes=[], **kwargs):
@@ -1127,7 +1179,6 @@ def cc_header_only_library(name, deps=[], includes=[], **kwargs):
                     includes=includes,
                     **kwargs)
 
-
 def tf_custom_op_library_additional_deps():
   return [
       "@protobuf_archive//:protobuf_headers",
@@ -1136,7 +1187,6 @@ def tf_custom_op_library_additional_deps():
       clean_dep("//tensorflow/core:framework_headers_lib"),
   ]
 
-
 # Traverse the dependency graph along the "deps" attribute of the
 # target and return a struct with one field called 'tf_collected_deps'.
 # tf_collected_deps will be the union of the deps of the current target
@@ -1150,16 +1200,15 @@ def _collect_deps_aspect_impl(target, ctx):
         alldeps = alldeps | dep.tf_collected_deps
   return struct(tf_collected_deps=alldeps)
 
-
 collect_deps_aspect = aspect(
-    implementation=_collect_deps_aspect_impl, attr_aspects=["deps"])
-
+    attr_aspects = ["deps"],
+    implementation = _collect_deps_aspect_impl,
+)
 
 def _dep_label(dep):
   label = dep.label
   return label.package + ":" + label.name
 
-
 # This rule checks that the transitive dependencies of targets listed
 # in the 'deps' attribute don't depend on the targets listed in
 # the 'disallowed_deps' attribute.
@@ -1176,22 +1225,24 @@ def _check_deps_impl(ctx):
                   disallowed_dep))
   return struct()
 
-
 check_deps = rule(
     _check_deps_impl,
-    attrs={
-        "deps":
-            attr.label_list(
-                aspects=[collect_deps_aspect], mandatory=True,
-                allow_files=True),
-        "disallowed_deps":
-            attr.label_list(mandatory=True, allow_files=True)
-    },)
-
+    attrs = {
+        "deps": attr.label_list(
+            aspects = [collect_deps_aspect],
+            mandatory = True,
+            allow_files = True,
+        ),
+        "disallowed_deps": attr.label_list(
+            mandatory = True,
+            allow_files = True,
+        ),
+    },
+)
 
 # Helper to build a dynamic library (.so) from the sources containing
 # implementations of custom ops and kernels.
-def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[]):
+def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[], linkopts=[]):
   cuda_deps = [
       clean_dep("//tensorflow/core:stream_executor_headers_lib"),
       "@local_config_cuda//cuda:cuda_headers",
@@ -1219,18 +1270,20 @@ def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[]):
       srcs=srcs,
       deps=deps + if_cuda(cuda_deps),
       data=[name + "_check_deps"],
-      copts=tf_copts(),
-      linkopts=select({
+      copts=tf_copts(is_external=True),
+      linkopts=linkopts + select({
           "//conditions:default": [
               "-lm",
           ],
+          clean_dep("//tensorflow:windows"): [],
+          clean_dep("//tensorflow:windows_msvc"): [],
           clean_dep("//tensorflow:darwin"): [],
       }),)
 
 register_extension_info(
-    extension_name="tf_custom_op_library",
-    label_regex_for_dep="{extension_name}")
-
+    extension_name = "tf_custom_op_library",
+    label_regex_for_dep = "{extension_name}",
+)
 
 def tf_custom_op_py_library(name,
                             srcs=[],
@@ -1249,17 +1302,55 @@ def tf_custom_op_py_library(name,
       deps=deps,)
 
 register_extension_info(
-    extension_name="tf_custom_op_py_library",
-    label_regex_for_dep="{extension_name}")
-
+    extension_name = "tf_custom_op_py_library",
+    label_regex_for_dep = "{extension_name}",
+)
 
 def tf_extension_linkopts():
   return []  # No extension link opts
 
-
 def tf_extension_copts():
   return []  # No extension c opts
 
+# In tf_py_wrap_cc generated libraries
+# module init functions are not exported unless
+# they contain one of the keywords in the version file
+# this prevents custom python modules.
+# This function attempts to append init_module_name to list of
+# exported functions in version script
+def _append_init_to_versionscript_impl(ctx):
+  mod_name = ctx.attr.module_name
+  if ctx.attr.is_version_script:
+    ctx.actions.expand_template(
+      template=ctx.file.template_file,
+      output=ctx.outputs.versionscript,
+      substitutions={
+        "global:":"global:\n     init_%s;\n     PyInit_*;"%(mod_name),
+      },
+      is_executable=False,
+    )
+  else:
+    ctx.actions.expand_template(
+      template=ctx.file.template_file,
+      output=ctx.outputs.versionscript,
+      substitutions={
+        "*tensorflow*":"*tensorflow*\ninit_%s\nPyInit_*\n"%(mod_name),
+      },
+      is_executable=False,
+    )
+
+
+_append_init_to_versionscript= rule(
+  implementation=_append_init_to_versionscript_impl,
+  attrs={
+    "module_name":attr.string(mandatory=True),
+    "template_file":attr.label(allow_files=True,single_file=True,mandatory=True),
+    "is_version_script":attr.bool(default=True,
+      doc='whether target is a ld version script or exported symbol list',
+      mandatory=False),
+  },
+  outputs={"versionscript":"%{name}.lds"},
+)
 
 def tf_py_wrap_cc(name,
                              srcs,
@@ -1282,26 +1373,39 @@ def tf_py_wrap_cc(name,
       toolchain_deps=["//tools/defaults:crosstool"],
       module_name=module_name,
       py_module_name=name)
+  vscriptname=name+"_versionscript"
+  _append_init_to_versionscript(
+      name=vscriptname,
+      module_name=module_name,
+      is_version_script=select({
+          "@local_config_cuda//cuda:darwin":False,
+          "//conditions:default":True,
+          }),
+      template_file=select({
+          "@local_config_cuda//cuda:darwin":clean_dep("//tensorflow:tf_exported_symbols.lds"),
+          "//conditions:default":clean_dep("//tensorflow:tf_version_script.lds")
+      })
+  )
   extra_linkopts = select({
       "@local_config_cuda//cuda:darwin": [
           "-Wl,-exported_symbols_list",
-          clean_dep("//tensorflow:tf_exported_symbols.lds")
+          "%s.lds"%vscriptname,
       ],
       clean_dep("//tensorflow:windows"): [],
       clean_dep("//tensorflow:windows_msvc"): [],
       "//conditions:default": [
           "-Wl,--version-script",
-          clean_dep("//tensorflow:tf_version_script.lds")
+          "%s.lds"%vscriptname,
       ]
   })
   extra_deps += select({
       "@local_config_cuda//cuda:darwin": [
-          clean_dep("//tensorflow:tf_exported_symbols.lds")
+          "%s.lds"%vscriptname,
       ],
       clean_dep("//tensorflow:windows"): [],
       clean_dep("//tensorflow:windows_msvc"): [],
       "//conditions:default": [
-          clean_dep("//tensorflow:tf_version_script.lds")
+          "%s.lds"%vscriptname,
       ]
   })
 
@@ -1328,19 +1432,39 @@ def tf_py_wrap_cc(name,
           "//conditions:default": [":" + cc_library_name],
       }))
 
-
-def py_test(deps=[], **kwargs):
+# This macro is for running python tests against system installed pip package
+# on Windows.
+#
+# py_test is built as an exectuable python zip file on Windows, which contains all
+# dependencies of the target. Because of the C++ extensions, it would be very
+# inefficient if the py_test zips all runfiles, plus we don't need them when running
+# tests against system installed pip package. So we'd like to get rid of the deps
+# of py_test in this case.
+#
+# In order to trigger the tests without bazel clean after getting rid of deps,
+# we introduce the following :
+# 1. When --define=no_tensorflow_py_deps=true, the py_test depends on a marker
+#    file of the pip package, the test gets to rerun when the pip package change.
+#    Note that this only works on Windows. See the definition of
+#    //third_party/tensorflow/tools/pip_package:win_pip_package_marker for specific reasons.
+# 2. When --define=no_tensorflow_py_deps=false (by default), it's a normal py_test.
+def py_test(deps=[], data=[], **kwargs):
   native.py_test(
       deps=select({
           "//conditions:default": deps,
-          clean_dep("//tensorflow:no_tensorflow_py_deps"): []
+          clean_dep("//tensorflow:no_tensorflow_py_deps"): [],
+      }),
+      data = data + select({
+          "//conditions:default": [],
+          clean_dep("//tensorflow:no_tensorflow_py_deps"):
+          ["//tensorflow/tools/pip_package:win_pip_package_marker"],
       }),
       **kwargs)
 
 register_extension_info(
-    extension_name="py_test",
-    label_regex_for_dep="{extension_name}")
-
+    extension_name = "py_test",
+    label_regex_for_dep = "{extension_name}",
+)
 
 def tf_py_test(name,
                srcs,
@@ -1352,10 +1476,13 @@ def tf_py_test(name,
                shard_count=1,
                additional_deps=[],
                flaky=0,
-               xla_enabled=False):
+               xla_enabled=False,
+               grpc_enabled=False):
   if xla_enabled:
     additional_deps = additional_deps + tf_additional_xla_deps_py()
-  native.py_test(
+  if grpc_enabled:
+    additional_deps = additional_deps + tf_additional_grpc_deps_py()
+  py_test(
       name=name,
       size=size,
       srcs=srcs,
@@ -1365,20 +1492,17 @@ def tf_py_test(name,
       visibility=[clean_dep("//tensorflow:internal")],
       shard_count=shard_count,
       data=data,
-      deps=select({
-          "//conditions:default": [
-              clean_dep("//tensorflow/python:extra_py_tests_deps"),
-              clean_dep("//tensorflow/python:gradient_checker"),
+      deps=[
+            clean_dep("//tensorflow/python:extra_py_tests_deps"),
+            clean_dep("//tensorflow/python:gradient_checker"),
           ] + additional_deps,
-          clean_dep("//tensorflow:no_tensorflow_py_deps"): []
-      }),
       flaky=flaky,
       srcs_version="PY2AND3")
 
 register_extension_info(
-    extension_name="tf_py_test",
-    label_regex_map={"additional_deps": "deps:{extension_name}"})
-
+    extension_name = "tf_py_test",
+    label_regex_map = {"additional_deps": "deps:{extension_name}"},
+)
 
 def cuda_py_test(name,
                  srcs,
@@ -1390,7 +1514,8 @@ def cuda_py_test(name,
                  additional_deps=[],
                  tags=[],
                  flaky=0,
-                 xla_enabled=False):
+                 xla_enabled=False,
+                 grpc_enabled=False):
   test_tags = tags + tf_cuda_tests_tags()
   tf_py_test(
       name=name,
@@ -1403,12 +1528,13 @@ def cuda_py_test(name,
       shard_count=shard_count,
       additional_deps=additional_deps,
       flaky=flaky,
-      xla_enabled=xla_enabled)
+      xla_enabled=xla_enabled,
+      grpc_enabled=grpc_enabled)
 
 register_extension_info(
-    extension_name="cuda_py_test",
-    label_regex_map={"additional_deps": "additional_deps:{extension_name}"})
-
+    extension_name = "cuda_py_test",
+    label_regex_map = {"additional_deps": "additional_deps:{extension_name}"},
+)
 
 def sycl_py_test(name,
                  srcs,
@@ -1420,7 +1546,8 @@ def sycl_py_test(name,
                  additional_deps=[],
                  tags=[],
                  flaky=0,
-                 xla_enabled=False):
+                 xla_enabled=False,
+                 grpc_enabled=False):
   test_tags = tags + tf_sycl_tests_tags()
   tf_py_test(
       name=name,
@@ -1433,12 +1560,13 @@ def sycl_py_test(name,
       shard_count=shard_count,
       additional_deps=additional_deps,
       flaky=flaky,
-      xla_enabled=xla_enabled)
+      xla_enabled=xla_enabled,
+      grpc_enabled=grpc_enabled)
 
 register_extension_info(
-    extension_name="sycl_py_test",
-    label_regex_map={"additional_deps": "additional_deps:{extension_name}"})
-
+    extension_name = "sycl_py_test",
+    label_regex_map = {"additional_deps": "additional_deps:{extension_name}"},
+)
 
 def py_tests(name,
              srcs,
@@ -1448,7 +1576,8 @@ def py_tests(name,
              tags=[],
              shard_count=1,
              prefix="",
-             xla_enabled=False):
+             xla_enabled=False,
+             grpc_enabled=False):
   for src in srcs:
     test_name = src.split("/")[-1].split(".")[0]
     if prefix:
@@ -1462,8 +1591,8 @@ def py_tests(name,
         shard_count=shard_count,
         data=data,
         additional_deps=additional_deps,
-        xla_enabled=xla_enabled)
-
+        xla_enabled=xla_enabled,
+        grpc_enabled=grpc_enabled)
 
 def cuda_py_tests(name,
                   srcs,
@@ -1473,7 +1602,8 @@ def cuda_py_tests(name,
                   shard_count=1,
                   tags=[],
                   prefix="",
-                  xla_enabled=False):
+                  xla_enabled=False,
+                  grpc_enabled=False):
   test_tags = tags + tf_cuda_tests_tags()
   py_tests(
       name=name,
@@ -1484,8 +1614,8 @@ def cuda_py_tests(name,
       tags=test_tags,
       shard_count=shard_count,
       prefix=prefix,
-      xla_enabled=xla_enabled)
-
+      xla_enabled=xla_enabled,
+      grpc_enabled=grpc_enabled)
 
 # Creates a genrule named <name> for running tools/proto_text's generator to
 # make the proto_text functions, for the protos passed in <srcs>.
@@ -1509,19 +1639,17 @@ def tf_generate_proto_text_sources(name, srcs_relative_dir, srcs):
       ],)
   return struct(hdrs=out_hdrs, srcs=out_srcs)
 
-
 def tf_genrule_cmd_append_to_srcs(to_append):
   return ("cat $(SRCS) > $(@) && " + "echo >> $(@) && " + "echo " + to_append +
           " >> $(@)")
 
-
 def tf_version_info_genrule():
   native.genrule(
       name="version_info_gen",
       srcs=[
-          clean_dep("//tensorflow/tools/git:gen/spec.json"),
-          clean_dep("//tensorflow/tools/git:gen/head"),
-          clean_dep("//tensorflow/tools/git:gen/branch_ref"),
+          clean_dep("@local_config_git//:gen/spec.json"),
+          clean_dep("@local_config_git//:gen/head"),
+          clean_dep("@local_config_git//:gen/branch_ref"),
       ],
       outs=["util/version_info.cc"],
       cmd=
@@ -1529,7 +1657,6 @@ def tf_version_info_genrule():
       local=1,
       tools=[clean_dep("//tensorflow/tools/git:gen_git_source.py")],)
 
-
 def tf_py_build_info_genrule():
   native.genrule(
       name="py_build_info_gen",
@@ -1539,14 +1666,15 @@ def tf_py_build_info_genrule():
       local=1,
       tools=[clean_dep("//tensorflow/tools/build_info:gen_build_info.py")],)
 
-
 def cc_library_with_android_deps(deps,
                                  android_deps=[],
                                  common_deps=[],
+                                 copts=tf_copts(),
                                  **kwargs):
   deps = if_not_android(deps) + if_android(android_deps) + common_deps
-  native.cc_library(deps=deps, **kwargs)
+  native.cc_library(deps=deps, copts=copts, **kwargs)
 
 register_extension_info(
-    extension_name="cc_library_with_android_deps",
-    label_regex_for_dep="{extension_name}")
+    extension_name = "cc_library_with_android_deps",
+    label_regex_for_dep = "{extension_name}",
+)
diff --git a/tensorflow/tf_exported_symbols.lds b/tensorflow/tf_exported_symbols.lds
index bddb87f00cb5fd1ede2cb9d5cc4079d6e66f7896..3ff824e5e1707c65b5ad3cc22dd32267953964c6 100644
--- a/tensorflow/tf_exported_symbols.lds
+++ b/tensorflow/tf_exported_symbols.lds
@@ -4,3 +4,4 @@
 *TF_*
 *TFE_*
 *nsync_*
+*pywrap_xla*
diff --git a/tensorflow/tf_version_script.lds b/tensorflow/tf_version_script.lds
index 11f66c5c8b27f412b2023d6f3036c56d3d1e530c..6b28943f01cfdb174fd135c670a6bb409ee0e102 100644
--- a/tensorflow/tf_version_script.lds
+++ b/tensorflow/tf_version_script.lds
@@ -5,6 +5,7 @@ tensorflow {
     *TF_*;
     *TFE_*;
     *nsync_*;
+    *pywrap_xla*;
   local:
     *;
 };
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index 3896a21b99f4756239a7ae9f3db9593504845aea..e731127a63d792825e15a4b95379517117edebb0 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -41,7 +41,90 @@ genrule(
     # every module exported using tf_export. For e.g. if an op is decorated with
     # @tf_export('module1.module2', 'module3'). Then, outs should include
     # api/module1/module2/__init__.py and api/module3/__init__.py.
-    outs = ["api/__init__.py"],
+    # keep sorted
+    outs = [
+        "api/__init__.py",
+        "api/app/__init__.py",
+        "api/bitwise/__init__.py",
+        "api/compat/__init__.py",
+        "api/contrib/__init__.py",
+        "api/contrib/stat_summarizer/__init__.py",
+        "api/data/__init__.py",
+        "api/distributions/__init__.py",
+        "api/distributions/bijectors/__init__.py",
+        "api/errors/__init__.py",
+        "api/estimator/__init__.py",
+        "api/estimator/export/__init__.py",
+        "api/estimator/inputs/__init__.py",
+        "api/feature_column/__init__.py",
+        "api/gfile/__init__.py",
+        "api/graph_util/__init__.py",
+        "api/image/__init__.py",
+        "api/initializers/__init__.py",
+        "api/keras/__init__.py",
+        "api/keras/activations/__init__.py",
+        "api/keras/applications/__init__.py",
+        "api/keras/applications/densenet/__init__.py",
+        "api/keras/applications/inception_resnet_v2/__init__.py",
+        "api/keras/applications/inception_v3/__init__.py",
+        "api/keras/applications/mobilenet/__init__.py",
+        "api/keras/applications/nasnet/__init__.py",
+        "api/keras/applications/resnet50/__init__.py",
+        "api/keras/applications/vgg16/__init__.py",
+        "api/keras/applications/vgg19/__init__.py",
+        "api/keras/applications/xception/__init__.py",
+        "api/keras/backend/__init__.py",
+        "api/keras/callbacks/__init__.py",
+        "api/keras/constraints/__init__.py",
+        "api/keras/datasets/__init__.py",
+        "api/keras/datasets/boston_housing/__init__.py",
+        "api/keras/datasets/cifar10/__init__.py",
+        "api/keras/datasets/cifar100/__init__.py",
+        "api/keras/datasets/imdb/__init__.py",
+        "api/keras/datasets/mnist/__init__.py",
+        "api/keras/datasets/reuters/__init__.py",
+        "api/keras/estimator/__init__.py",
+        "api/keras/initializers/__init__.py",
+        "api/keras/layers/__init__.py",
+        "api/keras/losses/__init__.py",
+        "api/keras/metrics/__init__.py",
+        "api/keras/models/__init__.py",
+        "api/keras/optimizers/__init__.py",
+        "api/keras/preprocessing/__init__.py",
+        "api/keras/preprocessing/image/__init__.py",
+        "api/keras/preprocessing/sequence/__init__.py",
+        "api/keras/preprocessing/text/__init__.py",
+        "api/keras/regularizers/__init__.py",
+        "api/keras/utils/__init__.py",
+        "api/keras/wrappers/__init__.py",
+        "api/keras/wrappers/scikit_learn/__init__.py",
+        "api/layers/__init__.py",
+        "api/linalg/__init__.py",
+        "api/logging/__init__.py",
+        "api/losses/__init__.py",
+        "api/metrics/__init__.py",
+        "api/nn/__init__.py",
+        "api/nn/rnn_cell/__init__.py",
+        "api/profiler/__init__.py",
+        "api/python_io/__init__.py",
+        "api/resource_loader/__init__.py",
+        "api/saved_model/__init__.py",
+        "api/saved_model/builder/__init__.py",
+        "api/saved_model/constants/__init__.py",
+        "api/saved_model/loader/__init__.py",
+        "api/saved_model/main_op/__init__.py",
+        "api/saved_model/signature_constants/__init__.py",
+        "api/saved_model/signature_def_utils/__init__.py",
+        "api/saved_model/tag_constants/__init__.py",
+        "api/saved_model/utils/__init__.py",
+        "api/sets/__init__.py",
+        "api/spectral/__init__.py",
+        "api/summary/__init__.py",
+        "api/sysconfig/__init__.py",
+        "api/test/__init__.py",
+        "api/train/__init__.py",
+        "api/train/queue_runner/__init__.py",
+    ],
     cmd = "$(location create_python_api) $(OUTS)",
     tools = ["create_python_api"],
 )
@@ -50,4 +133,7 @@ py_library(
     name = "python_api",
     srcs = [":python_api_gen"],
     srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib:contrib_py",  # keep
+    ],
 )
diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py
index 5f1286aaf6c913cd299ebbfb65949ace0f593417..1557314939bd85c0467426216f90aa3891ca0ac0 100644
--- a/tensorflow/tools/api/generator/create_python_api.py
+++ b/tensorflow/tools/api/generator/create_python_api.py
@@ -31,6 +31,7 @@ from tensorflow.python.util import tf_decorator
 _API_CONSTANTS_ATTR = '_tf_api_constants'
 _API_NAMES_ATTR = '_tf_api_names'
 _API_DIR = '/api/'
+_CONTRIB_IMPORT = 'from tensorflow import contrib'
 _GENERATED_FILE_HEADER = """\"\"\"Imports for Python API.
 
 This file is MACHINE GENERATED! Do not edit.
@@ -50,11 +51,17 @@ def format_import(source_module_name, source_name, dest_name):
   Returns:
     An import statement string.
   """
-  if source_name == dest_name:
-    return 'from %s import %s' % (source_module_name, source_name)
+  if source_module_name:
+    if source_name == dest_name:
+      return 'from %s import %s' % (source_module_name, source_name)
+    else:
+      return 'from %s import %s as %s' % (
+          source_module_name, source_name, dest_name)
   else:
-    return 'from %s import %s as %s' % (
-        source_module_name, source_name, dest_name)
+    if source_name == dest_name:
+      return 'import %s' % source_name
+    else:
+      return 'import %s as %s' % (source_name, dest_name)
 
 
 def get_api_imports():
@@ -74,6 +81,9 @@ def get_api_imports():
     # Only look at tensorflow modules.
     if not module or 'tensorflow.' not in module.__name__:
       continue
+    # Do not generate __init__.py files for contrib modules for now.
+    if '.contrib.' in module.__name__ or module.__name__.endswith('.contrib'):
+      continue
 
     for module_contents_name in dir(module):
       attr = getattr(module, module_contents_name)
@@ -107,7 +117,8 @@ def get_api_imports():
   # Import all required modules in their parent modules.
   # For e.g. if we import 'tf.foo.bar.Value'. Then, we also
   # import 'bar' in 'tf.foo'.
-  for dest_module in module_imports.keys():
+  dest_modules = set(module_imports.keys())
+  for dest_module in dest_modules:
     dest_module_split = dest_module.split('.')
     for dest_submodule_index in range(1, len(dest_module_split)):
       dest_submodule = '.'.join(dest_module_split[:dest_submodule_index])
@@ -150,21 +161,28 @@ def create_api_files(output_files):
       os.makedirs(os.path.dirname(file_path))
     open(file_path, 'a').close()
 
-  # Add imports to output files.
   module_imports = get_api_imports()
+  module_imports['tf'].append(_CONTRIB_IMPORT)  # Include all of contrib.
+
+  # Add imports to output files.
   missing_output_files = []
   for module, exports in module_imports.items():
     # Make sure genrule output file list is in sync with API exports.
     if module not in module_name_to_file_path:
-      missing_output_files.append(module)
+      module_without_tf = module[len('tf.'):]
+      module_file_path = '"api/%s/__init__.py"' %  (
+          module_without_tf.replace('.', '/'))
+      missing_output_files.append(module_file_path)
       continue
     with open(module_name_to_file_path[module], 'w') as fp:
       fp.write(_GENERATED_FILE_HEADER + '\n'.join(exports))
 
   if missing_output_files:
     raise ValueError(
-        'Missing outputs for python_api_gen genrule:\n%s' %
-        ',\n'.join(missing_output_files))
+        'Missing outputs for python_api_gen genrule:\n%s.'
+        'Make sure all required outputs are in the '
+        'tensorflow/tools/api/generator/BUILD file.' %
+        ',\n'.join(sorted(missing_output_files)))
 
 
 def main(output_files):
diff --git a/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt b/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt
index 30f7e4e11655797fbd8f0ea65c2eb84768ca486b..875d802a9c458e299f73c130bb2b37c5d8828aad 100644
--- a/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt
@@ -18,6 +18,14 @@ tf_class {
     name: "DESCRIPTOR"
     mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
   }
+  member {
+    name: "EXPERIMENTAL_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "Experimental"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
   member {
     name: "Extensions"
     mtype: "<type \'getset_descriptor\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-meta-info-def.pbtxt b/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-meta-info-def.pbtxt
index ebf49f434ae468311a07374cdca1140336983a81..b0e983115499c5b5b79459affc931600ad16256b 100644
--- a/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-meta-info-def.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-meta-graph-def.-meta-info-def.pbtxt
@@ -18,6 +18,10 @@ tf_class {
     name: "META_GRAPH_VERSION_FIELD_NUMBER"
     mtype: "<type \'int\'>"
   }
+  member {
+    name: "STRIPPED_DEFAULT_ATTRS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
   member {
     name: "STRIPPED_OP_LIST_FIELD_NUMBER"
     mtype: "<type \'int\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.compat.pbtxt b/tensorflow/tools/api/golden/tensorflow.compat.pbtxt
index ccc60314001f261a2b4a5560bea83ffa017fd914..bab480ff9b105546790aadb72f3eb88a795ebbff 100644
--- a/tensorflow/tools/api/golden/tensorflow.compat.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.compat.pbtxt
@@ -32,4 +32,8 @@ tf_module {
     name: "as_text"
     argspec: "args=[\'bytes_or_text\', \'encoding\'], varargs=None, keywords=None, defaults=[\'utf-8\'], "
   }
+  member_method {
+    name: "path_to_str"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-bernoulli.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-bernoulli.pbtxt
index cfe09345acccc410ad3041a965901134440e3c77..ca96f4eaece0020235d24901f51306a65676c1c9 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-bernoulli.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-bernoulli.pbtxt
@@ -64,6 +64,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -80,6 +84,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-beta.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-beta.pbtxt
index 2e6578bae1604f69e4697bb4668dd69d94bd68b5..d0508acd9f4f6c190b205301223599cf5b027955 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-beta.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-beta.pbtxt
@@ -68,6 +68,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -84,6 +88,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-categorical.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-categorical.pbtxt
index d42b0e82e4fab3e30d3ebf1b8bea8b44bb61ea0f..ff0fbb56cd4b9e4c288a168a7c3d9e83c552b0e2 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-categorical.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-categorical.pbtxt
@@ -68,6 +68,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -84,6 +88,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet-multinomial.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet-multinomial.pbtxt
index 710164743e851f0bb5c31ebe78b260b623e87378..d75e4a2f88b29ff7f638d72f98876a230b191dce 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet-multinomial.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet-multinomial.pbtxt
@@ -68,6 +68,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -84,6 +88,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet.pbtxt
index 6cc361672ed8da313e1bebc41fbf093e019d38ad..b838b9ae21decba0323211f08d09fe373ababf23 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet.pbtxt
@@ -64,6 +64,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -80,6 +84,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-distribution.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-distribution.pbtxt
index 40ad07d1be4bdea9585eb276debb1fdf3dfff583..6f06b7d50dd9f5f405673d572503ff549f148f33 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-distribution.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-distribution.pbtxt
@@ -55,6 +55,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -71,6 +75,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-exponential.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-exponential.pbtxt
index 8f34d25fea873827997ecd9df10cf1b3bfd0e56b..d34f9cde5d4d4161883f6d1b4646f22f054d16ad 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-exponential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-exponential.pbtxt
@@ -65,6 +65,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -81,6 +85,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-gamma.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-gamma.pbtxt
index 0ae88fba3b4fd176641cc17c916181cc9a6a12c6..df268b8d99eb6bf22264ddb63231074413686efa 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-gamma.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-gamma.pbtxt
@@ -64,6 +64,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -80,6 +84,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-laplace.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-laplace.pbtxt
index e7cd595e946cb91f162a2a1af8753e44cdfbc0e1..303dcb4ed3bf8416b822bb010c2e87e8ef03b7c9 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-laplace.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-laplace.pbtxt
@@ -64,6 +64,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -80,6 +84,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-multinomial.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-multinomial.pbtxt
index 7a4a16ff836a485e65cb6e061e27b92907cb4a63..ecda8acb15c49c390eaae203a0082e78e53499bd 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-multinomial.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-multinomial.pbtxt
@@ -68,6 +68,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -84,6 +88,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-normal.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-normal.pbtxt
index 14c8c34cc2d8efacec706bdb894d9f069d5e7033..92b9eeea223b488cda1ebcabd31ec808e78fcf70 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-normal.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-normal.pbtxt
@@ -64,6 +64,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -80,6 +84,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-student-t.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-student-t.pbtxt
index 30db6d3f35c1c8ea7bbc376a20093302dd373bd9..9aa7f9a63465c78f79ae4a8a11bc63d92d027dab 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-student-t.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-student-t.pbtxt
@@ -68,6 +68,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -84,6 +88,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-uniform.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-uniform.pbtxt
index 46cbdf225f68e879fd18ef4a07048746a9a71b08..d1b9d3069629c552d6c6048642934f422a13dce7 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-uniform.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-uniform.pbtxt
@@ -64,6 +64,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -80,6 +84,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt
index f5ed263f0e20d6fdf7f23a3a2ab06029084d20e4..be9ba4ce85bd5b9905a39e3f45873c534594e15f 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'weighted_sum\'], "
   }
   member_method {
     name: "evaluate"
@@ -29,7 +29,7 @@ tf_class {
   }
   member_method {
     name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
   }
   member_method {
     name: "get_variable_names"
@@ -45,7 +45,7 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "train"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt
index 61a29942c577a056e94dfe661fa5fec952b4f634..91fca67b6b5b1187b61f398a152793362c0c6e30 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'weighted_sum\'], "
   }
   member_method {
     name: "evaluate"
@@ -29,7 +29,7 @@ tf_class {
   }
   member_method {
     name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
   }
   member_method {
     name: "get_variable_names"
@@ -45,7 +45,7 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "train"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-classifier.pbtxt
index 16e3b246156792418109981cc85ce0b07854a62c..cd4f72fcf839fa89f25c7ed115ee6c61294283c3 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-classifier.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\'], "
   }
   member_method {
     name: "evaluate"
@@ -29,7 +29,7 @@ tf_class {
   }
   member_method {
     name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
   }
   member_method {
     name: "get_variable_names"
@@ -45,7 +45,7 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "train"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
index c6765ae277983eee54d0d998d6ad85c065460653..303fd74a64d0c7f5a0292a4eaabec63455c29381 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'input_layer_partitioner\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'2\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'2\', \'None\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\'], "
   }
   member_method {
     name: "evaluate"
@@ -29,7 +29,7 @@ tf_class {
   }
   member_method {
     name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
   }
   member_method {
     name: "get_variable_names"
@@ -45,7 +45,7 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "train"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
index e3a820db46e085d0aa61f76e2ffd6e32abbfd855..c97ea7969eff3e6952a604e72ce140b49d304461 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'label_dimension\', \'weight_column\', \'input_layer_partitioner\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'1\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'label_dimension\', \'weight_column\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'1\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\'], "
   }
   member_method {
     name: "evaluate"
@@ -29,7 +29,7 @@ tf_class {
   }
   member_method {
     name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
   }
   member_method {
     name: "get_variable_names"
@@ -45,7 +45,7 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "train"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-regressor.pbtxt
index a4c8cf667179ba9863251469195cb75f1a60560e..4b5b5bf0e3599a81e2e853ae8ba34ef12cc63097 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-d-n-n-regressor.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\'], "
   }
   member_method {
     name: "evaluate"
@@ -29,7 +29,7 @@ tf_class {
   }
   member_method {
     name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
   }
   member_method {
     name: "get_variable_names"
@@ -45,7 +45,7 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "train"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt
index dbcc187f94509e3c9265d59cb76d0cdd01bd2333..aa6ac46613fbead7457b19e1aae5f2532afddef1 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt
@@ -23,6 +23,10 @@ tf_class {
     name: "mode"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "prediction_hooks"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "predictions"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt
index 787952eced27532cbd8596e9aacb3ce5abd7fade..42a0d595216ad28363727b9d7c066fc37fddd02c 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt
@@ -20,7 +20,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_fn\', \'model_dir\', \'config\', \'params\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'model_fn\', \'model_dir\', \'config\', \'params\', \'warm_start_from\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "evaluate"
@@ -28,7 +28,7 @@ tf_class {
   }
   member_method {
     name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
   }
   member_method {
     name: "get_variable_names"
@@ -44,7 +44,7 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "train"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-linear-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-linear-classifier.pbtxt
index 99c03aa6297f4726970b83ad1f88924d320c5e33..2de52d6c57cc70b562c3c10b7f23cd15b63e25f8 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-linear-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-linear-classifier.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\', \'weighted_sum\'], "
   }
   member_method {
     name: "evaluate"
@@ -29,7 +29,7 @@ tf_class {
   }
   member_method {
     name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
   }
   member_method {
     name: "get_variable_names"
@@ -45,7 +45,7 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "train"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-linear-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-linear-regressor.pbtxt
index e2ab96d5b46d9cdebc558e756ca26158fddb3f26..e552f33720bb939b8a98d34ef3de78bda7db976c 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-linear-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-linear-regressor.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\', \'weighted_sum\'], "
   }
   member_method {
     name: "evaluate"
@@ -29,7 +29,7 @@ tf_class {
   }
   member_method {
     name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
   }
   member_method {
     name: "get_variable_names"
@@ -45,7 +45,7 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "train"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
index d006ecb254724405bfec4000f063a93c41e77055..091b1be0c83480757445542acb97e139bd74ef03 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
@@ -10,6 +10,10 @@ tf_class {
     name: "evaluation_master"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "global_id_in_cluster"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "is_chief"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-vocab-info.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-vocab-info.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a16e3aedae96e7289e73c49ac7890550dd5ddb08
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-vocab-info.pbtxt
@@ -0,0 +1,39 @@
+path: "tensorflow.estimator.VocabInfo"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.warm_starting_util.VocabInfo\'>"
+  is_instance: "<class \'tensorflow.python.estimator.warm_starting_util.VocabInfo\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "backup_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "new_vocab"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "new_vocab_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_oov_buckets"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "old_vocab"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "old_vocab_size"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-warm-start-settings.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-warm-start-settings.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..afdd6bb058353594415cd1abe726070f84ae46b6
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-warm-start-settings.pbtxt
@@ -0,0 +1,31 @@
+path: "tensorflow.estimator.WarmStartSettings"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.warm_starting_util.WarmStartSettings\'>"
+  is_instance: "<class \'tensorflow.python.estimator.warm_starting_util.WarmStartSettings\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "ckpt_to_initialize_from"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "var_name_to_prev_var_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "var_name_to_vocab_info"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "vars_to_warm_start"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
index cdc367b99e80104da988172bc25e76c679976b2d..a7a6cc1e49ddfe07569dff035e38931a0510addd 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
@@ -68,6 +68,14 @@ tf_module {
     name: "TrainSpec"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "VocabInfo"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "WarmStartSettings"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "export"
     mtype: "<type \'module\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt b/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt
index 018e8c909a23a9e7093c1bb411643d7db629b21c..24a58fb118bf52e650e1df71e9374099745ade52 100644
--- a/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt
@@ -48,6 +48,10 @@ tf_module {
     name: "numeric_column"
     argspec: "args=[\'key\', \'shape\', \'default_value\', \'dtype\', \'normalizer_fn\'], varargs=None, keywords=None, defaults=[\'(1,)\', \'None\', \"<dtype: \'float32\'>\", \'None\'], "
   }
+  member_method {
+    name: "shared_embedding_columns"
+    argspec: "args=[\'categorical_columns\', \'dimension\', \'combiner\', \'initializer\', \'shared_embedding_collection_name\', \'ckpt_to_load_from\', \'tensor_name_in_ckpt\', \'max_norm\', \'trainable\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\'], "
+  }
   member_method {
     name: "weighted_categorical_column"
     argspec: "args=[\'categorical_column\', \'weight_feature_key\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
index f32353c9570cb6c0f6536f5e9093a690c2522db5..bda1c2bf85977e69b0969bc8b6056710d88ca910 100644
--- a/tensorflow/tools/api/golden/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
@@ -100,6 +100,10 @@ tf_module {
     name: "hsv_to_rgb"
     argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "is_jpeg"
+    argspec: "args=[\'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "non_max_suppression"
     argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'None\'], "
@@ -168,13 +172,21 @@ tf_module {
     name: "rgb_to_hsv"
     argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "rgb_to_yiq"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "rgb_to_yuv"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "rot90"
     argspec: "args=[\'image\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
   }
   member_method {
     name: "sample_distorted_bounding_box"
-    argspec: "args=[\'image_size\', \'bounding_boxes\', \'seed\', \'seed2\', \'min_object_covered\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'image_size\', \'bounding_boxes\', \'seed\', \'seed2\', \'min_object_covered\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0.1\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "total_variation"
@@ -184,4 +196,12 @@ tf_module {
     name: "transpose_image"
     argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "yiq_to_rgb"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "yuv_to_rgb"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
index 07b8d900da5dbd9f2c9396ecaf06b9d22ef50a0b..76cf84084f1208e50bed66f39b604c5db576981a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
@@ -38,6 +38,10 @@ tf_class {
     name: "input_spec"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "layers"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "losses"
     mtype: "<type \'property\'>"
@@ -108,11 +112,11 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'inputs\', \'outputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_update"
@@ -120,7 +124,7 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
   }
   member_method {
     name: "add_weight"
@@ -146,6 +150,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -156,15 +164,15 @@ tf_class {
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -224,7 +232,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
index 546bac44e4c9905d13c4f3b0e3d9c1b5cc6c5e59..fb6c8d70dd43eae60ea2fb86f0fc63c36d2b13ad 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
@@ -39,6 +39,10 @@ tf_class {
     name: "input_spec"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "layers"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "losses"
     mtype: "<type \'property\'>"
@@ -125,7 +129,7 @@ tf_class {
   }
   member_method {
     name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_update"
@@ -133,7 +137,7 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
   }
   member_method {
     name: "add_weight"
@@ -159,6 +163,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -169,7 +177,7 @@ tf_class {
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit"
@@ -177,7 +185,7 @@ tf_class {
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -245,7 +253,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.densenet.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.densenet.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..42cb91445059873d9a4ed32d609129de203a764f
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.applications.densenet.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.applications.densenet"
+tf_module {
+  member_method {
+    name: "DenseNet121"
+    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "DenseNet169"
+    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "DenseNet201"
+    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "decode_predictions"
+    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
+  }
+  member_method {
+    name: "preprocess_input"
+    argspec: "args=[\'x\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.nasnet.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.nasnet.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cd75b87540533680d096853ae8645da132dd119a
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.applications.nasnet.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.keras.applications.nasnet"
+tf_module {
+  member_method {
+    name: "NASNetLarge"
+    argspec: "args=[\'input_shape\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "NASNetMobile"
+    argspec: "args=[\'input_shape\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "decode_predictions"
+    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
+  }
+  member_method {
+    name: "preprocess_input"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.pbtxt
index daeb5aad419156a19f929fdd455f6c208cd7390f..9fc086eb8e17ef368b38e8d51f0ac8bf0562ca4f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.applications.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.applications.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.keras.applications"
 tf_module {
+  member {
+    name: "densenet"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "inception_resnet_v2"
     mtype: "<type \'module\'>"
@@ -12,6 +16,10 @@ tf_module {
     name: "mobilenet"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "nasnet"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "resnet50"
     mtype: "<type \'module\'>"
@@ -28,6 +36,18 @@ tf_module {
     name: "xception"
     mtype: "<type \'module\'>"
   }
+  member_method {
+    name: "DenseNet121"
+    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "DenseNet169"
+    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "DenseNet201"
+    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
+  }
   member_method {
     name: "InceptionResNetV2"
     argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
@@ -40,6 +60,14 @@ tf_module {
     name: "MobileNet"
     argspec: "args=[\'input_shape\', \'alpha\', \'depth_multiplier\', \'dropout\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'1.0\', \'1\', \'0.001\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
   }
+  member_method {
+    name: "NASNetLarge"
+    argspec: "args=[\'input_shape\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "NASNetMobile"
+    argspec: "args=[\'input_shape\', \'include_top\', \'weights\', \'input_tensor\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'imagenet\', \'None\', \'None\', \'1000\'], "
+  }
   member_method {
     name: "ResNet50"
     argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt
index 44fbe0f7a04e8573a5348d626854e3b5834381dd..ba2d083a755384d4ec2076ac0dea580a1a878f1d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt
@@ -398,7 +398,7 @@ tf_module {
   }
   member_method {
     name: "rnn"
-    argspec: "args=[\'step_function\', \'inputs\', \'initial_states\', \'go_backwards\', \'mask\', \'constants\', \'unroll\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'False\'], "
+    argspec: "args=[\'step_function\', \'inputs\', \'initial_states\', \'go_backwards\', \'mask\', \'constants\', \'unroll\', \'input_length\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "round"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
index 8719c07ca385d2794e5c7e77f75d6d2bc734b7cb..d4c85a4519eb922629f107ef7b61c3f11cb27163 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'schedule\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'schedule\', \'verbose\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
   member_method {
     name: "on_batch_begin"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.boston_housing.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.datasets.boston_housing.pbtxt
index ef08f9b20f4c95f3692a03be7f4220f20aae9a58..bda31751d429ca0d0544402e5c496a0597e1849e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.datasets.boston_housing.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.datasets.boston_housing.pbtxt
@@ -2,6 +2,6 @@ path: "tensorflow.keras.datasets.boston_housing"
 tf_module {
   member_method {
     name: "load_data"
-    argspec: "args=[\'path\', \'seed\', \'test_split\'], varargs=None, keywords=None, defaults=[\'boston_housing.npz\', \'113\', \'0.2\'], "
+    argspec: "args=[\'path\', \'test_split\', \'seed\'], varargs=None, keywords=None, defaults=[\'boston_housing.npz\', \'0.2\', \'113\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.imdb.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.datasets.imdb.pbtxt
index 8b1c17e9da13a76dcc2c09f3c01a0375bf0cb9fe..ff962876b66cae013de5d711dc7eac5d5c80d8c3 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.datasets.imdb.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.datasets.imdb.pbtxt
@@ -6,6 +6,6 @@ tf_module {
   }
   member_method {
     name: "load_data"
-    argspec: "args=[\'path\', \'num_words\', \'skip_top\', \'maxlen\', \'seed\', \'start_char\', \'oov_char\', \'index_from\'], varargs=None, keywords=None, defaults=[\'imdb.npz\', \'None\', \'0\', \'None\', \'113\', \'1\', \'2\', \'3\'], "
+    argspec: "args=[\'path\', \'num_words\', \'skip_top\', \'maxlen\', \'seed\', \'start_char\', \'oov_char\', \'index_from\'], varargs=None, keywords=kwargs, defaults=[\'imdb.npz\', \'None\', \'0\', \'None\', \'113\', \'1\', \'2\', \'3\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.reuters.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.datasets.reuters.pbtxt
index 6b3ed1e9af0ea7ab4fa83c07c520adf6727a93ac..2da4a13067f2b39eb06304864ea626002300a862 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.datasets.reuters.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.datasets.reuters.pbtxt
@@ -6,6 +6,6 @@ tf_module {
   }
   member_method {
     name: "load_data"
-    argspec: "args=[\'path\', \'num_words\', \'skip_top\', \'maxlen\', \'test_split\', \'seed\', \'start_char\', \'oov_char\', \'index_from\'], varargs=None, keywords=None, defaults=[\'reuters.npz\', \'None\', \'0\', \'None\', \'0.2\', \'113\', \'1\', \'2\', \'3\'], "
+    argspec: "args=[\'path\', \'num_words\', \'skip_top\', \'maxlen\', \'test_split\', \'seed\', \'start_char\', \'oov_char\', \'index_from\'], varargs=None, keywords=kwargs, defaults=[\'reuters.npz\', \'None\', \'0\', \'None\', \'0.2\', \'113\', \'1\', \'2\', \'3\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt
index 38e6128644529f012bdf1c9a7aa6656c1cef1ecd..f4ab075959906cdf350ec5d49dc86f928b7eb7ae 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt
index 0fa60646612ab383a5022990c06b76571e269f05..eb558cddafc3972127786353072767f0d53bf174 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
index 75d56bf445847abfdc2b3e78d0ce5543aef152d9..770a107b664d7ab0a8aedf292a34d4258a201859 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
@@ -115,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
index 6e52b6238d5b255f75d1105f2e895267117a2029..0ce42b706ec20a8ea1cc83ec95cb64d9be2e5710 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index 0e16774e8614e9b7ec7d4e90e176ba25f1512257..d6c98fa225ce924bc8e20f8531516eaed4d32ffb 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -126,6 +126,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index 98112762cf842519956af94ac8593c418e26c0d1..754fd310c6d8ddb994db0590342b29f8cb7abd71 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -126,6 +126,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index 2e093c0359664e8553c1be2c3b2d930df2c3aebe..9b62880c7931d151fb98cc1dc3149dcbd4dd103d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -126,6 +126,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
index bada65e2f93cfa223b51d9ed3d44ab88cbad5a77..b371ad148cee16dd243869d929e0c1c002794682 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
@@ -115,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index 120807c4b530c3fec508373bfc15131ffb532f72..3e2aba55fd63326bb0e232fdce06f32884db7a0a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -126,6 +126,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index 834365f0f70e8447a8b6ba62cffe95a3c2a17e51..fb37308cce0124538648c3837e1e802794d7f1ae 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -126,6 +126,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index 462a52ec1ebd4ce7f4b5289b76242ae1f992c032..813470ffc7c87727eb0b958e54806f530399806a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -126,6 +126,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt
index b802b363d013f819824c849ded762ff08a32cede..e251ac18e511b58a49816126d9941b98e4f91088 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
index 5279b2ab17d1fd3e8ca8cc75e9f7866ddaf25fb5..db26c3e568da09d1523003ab538d565c6c2e1464 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -123,12 +123,16 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'inputs\', \'training\', \'mask\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -155,7 +159,7 @@ tf_class {
   }
   member_method {
     name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_output_at"
@@ -171,7 +175,7 @@ tf_class {
   }
   member_method {
     name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_weights"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
index b800eb9796b04f0ffdb24768130669cec8e5babe..ff08def0a08e5201bc01d61be3f2d66d712c384b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
@@ -115,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index 8c2b110c6d3d0a12bf8bfde9ac939f66d6f93419..6db22ca0320519fd9c101456c9c9c0e26a9a11e0 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -116,7 +116,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -126,6 +126,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt
index 47c63c11573e9fe20106f0a6a84a8940ae5f01e5..577f206e3510a9995d5d383ac440b4f68ea39fe5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -126,6 +126,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index e90b90e8016d4a955c57010cbf387d359963dafa..72924c32b43e5edb39938cc0cd909cffefa61be1 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -127,6 +127,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt
index aa571b722dea6511925c9bf7f10714f252b897e7..16be08d9b2bae8fe1faecf34c4d87ac9b9baf142 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -126,6 +126,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index 911c73f8462df78c1353c5803660bffca4e33694..11e05f884d781166616a9c9a61dacbc8fdae6ae3 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.keras.layers.Conv3DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3DTranspose\'>"
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
@@ -126,6 +127,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt
index bb111b327c22d6fdb502c8454fe114ee427d2a77..72b72d6b3b1e410dda0b0a529449f0135203fc1b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -126,6 +126,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt
index 5a5ec635cce36c4e4561f73700e73a3ae215c596..ee93247f63ed700dc6058041bd0ea4ff5c879078 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -126,6 +126,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index 190b670fa2a4b04e124c3d1f63e691dfbe8cdbbb..e5023287e5f38553f3553a37b5a908790072b5c7 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -127,6 +127,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt
index a26ec82f2b96e69c445f05aa852a7b37ab67dbd7..ba38cb7121c9d312e7ba9d7147bdc67673d1ad2e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -126,6 +126,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 19b5bdf36befbcf8877fc28b54d9c712d83a74b2..58724a1e1661609ef3c000c7ca1dfe9b3235acff 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.keras.layers.Convolution3DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3DTranspose\'>"
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
@@ -126,6 +127,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt
index 773ef01feb8fe179ec34d3e392395afb79200b8f..98d52c430c659d0fc3e9299f7bede9190dad2fcf 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -126,6 +126,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt
index 3a67ac00ab193b3e7e72a105b6df2757c0164b74..33b6ebe1af731f66f88a9493502f69049ab34b42 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt
index de5a695b69f5b6977546fbf6211b26973c47fda2..4b241ebb0f68c270a9448b02138d44f82211f418 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt
index bf251b4df5dae7b9541679062ef5fa163e22bda5..1856a9ee21347ed6ca3dd592517eb644e205a5b7 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt
index 92a74cec68090271a59dd44dd93c1fc4821afbdd..a8c37af31f649d28ca2ab7614178f2dee58c13fc 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
index cdd62eee0d3a26303caaa0f643b9fedae81f91d7..07d3f023e54105c606b198c05750ffa78ee5d0c8 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
@@ -115,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt
index 7935143b2cf4f84dfd7d81286dea96ca9f57ec6c..e2e21b5f123f63fa38cb0e344be9a12fc091f20b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
index 497eb004992e4256fec158e1eb50dbb0b915aeea..92b9760d53e35d3e5066a730bb5cbda45492cc64 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
index 35616cbebb388f1198aebdf0eeb5eaed76ea52e1..83c528b40117222ac2b3e85ad338459948d0aa8c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt
index 427c6fde90334a39cd1e3bef96952c792a1d3955..73609752886c8c57a78f6bc02cc46d2c7ff6e996 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index 763184899ca05c39b56e002f1e50ce07210c7409..b329f1c46bb07ab7684dec6aaf45a20b98c27ed9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
index 889f2cbc2345e605035b71d69261e92c56aa645f..c741d4d6e6cf8da9712e68f86abe64e2828823da 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -183,7 +183,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -193,6 +193,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -223,7 +227,7 @@ tf_class {
   }
   member_method {
     name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_output_at"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index 1428691afe2a525cc46ddb4f1b73239cdb613b31..57596badf1881950270fa6d3c074afb65daaa8eb 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 655734cc432f7f18dfc5dc1f5f255650cb574a1a..3829353cc3c195a750ad862707c5c8563e203fba 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index d97f06ea137f1128ffb6e1ebfcc10e0160904387..e53e78a977b32eaf2e31867044aedd39ab2dd34f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index 52886b2106aa3f99ded8a66a20f5cf6bec48b233..48fcd1044e06b2fe61aadb6c3675ce82197ff003 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index ccb6459357f8248cc760995de94f5ef305d8c64b..66c06ed47289eb2d83d97778a7b13dab821722d2 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index 1f25eb1cc64fcfe3489fef1c32f1b806ca74b478..4f2420f74ab3069952e4a44bf61e5e12b3e80ea3 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index a37d6dda28a653836bf4c495165f2aff05744298..7912a6d933b851521358e0246d04688da410b909 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index 9f276fd54714756d6db17921c7f4f139a8b05a8b..d5b2d2c274ad97071497045271c0a595f8e0e062 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index eaa9b477d853c12e7b7dd183e09073a8116b24e6..d88ff17eb6df7bbba7d3af4344fc8ddc367ae44c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index f4d37a5f63432dec1131bc7bce0cabd1af6e8db3..c8cc5a0ddfdd54cbb47de922591a9842abf63396 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index afddd2d4cbf7d1ba089fbc35a684664aded4e2a6..7956c5a340d963cfd5976e8af56da222848a164a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index 12cd49c9551c4520fd05592d7a3f456b3d328859..0a7e16413dfbd80d448eb1bad5771915475d96b2 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index 146241c172319211827d77482c634ae6218137b0..6c8a58a996f5313ea48e395e7e443a7c21f198ee 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index 00475301aa009385e7f23241864475f38bf00da8..7678ce8aab63fcfa76c0ac61346a723c1dfe1ee7 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
index 49841237cef52d3b16b498510f7c24744d57b4e9..d46fd41a3f33002a9bbe755851278c9729ccd1d1 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index 4ce7c34f6c75c179442b6d7473281086115f4b64..3b171b137af699c9608494a17c5651b439fe4545 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
index e1a1d0d58ecbc9a5aa6e1bbde49d92aec9714f42..29d9cf78ab5ed3bdd1a488359b59cf7171e7e051 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -187,7 +187,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -197,6 +197,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -227,7 +231,7 @@ tf_class {
   }
   member_method {
     name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_output_at"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
index 59508c2f11073caca1f30544efaea435730ce228..ca0144929942f7024a4e8bac5552bf0547ceb56d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
@@ -90,7 +90,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'function\', \'mask\', \'arguments\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'function\', \'output_shape\', \'mask\', \'arguments\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt
index ca904a2b8c77e55430e4f76ff4fa2be641c199a6..c52ad727545c0bf4f199714d71180eac3f1bf62a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt
@@ -123,6 +123,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index f52fd02515f30f4011a154cda4274d7e7dd34a88..8134fb738683b79764662d9ea7f721fe04751162 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index b5c32d1cdf3dc7e35d9c78dd81431bb67aab1b27..c5d452300947d7f74e7458e2a04bfdfabb1c1da2 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index 0ac2b83a999b3c8245ce616ccf5d79833747aee3..bcbed9241b525a953c8b499197facaefebe8cc44 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt
index de2a28d985d3f05c639a103b316f66f15d326f95..244e79b4ffe60ddd6aa56d2780d80dfd66c494a9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt
index 130d932fd6de0fde1052843cd9b10bf2a748441c..56cbf5df785ef0e2614ea7e9e6cfe1335e148eec 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -126,6 +126,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt
index 82a6f6d539080436ef2e49b5a4b342a2dfff3ae2..33c2d30e86f9cdc3fb9f4f498bfc2c94497fe2dd 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -126,6 +126,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt
index ca2fd4e502b6bb87c44363f91f9dcd26b386eb3a..94f91059b7a1e291c38fe0045accc6c03f226603 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -126,6 +126,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index 885e30f8799fd7e156c9f048b59483ad00d41fba..247230a6d68b8ea93a30a2f5846d8baaa78cb13e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -126,6 +126,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index 102879d2f536cc6bfdb31558c36412b3d1e93885..8d61b67e7ce9564d31b0bd904a58540d19c89172 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -126,6 +126,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index 424061614659249605569a571ff09adf52db3997..ad2e30802006e934730e5c75247e958329f7121c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -126,6 +126,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
index 4b32c2e99f9a0e3abdb3f1bfe27137f6f5052491..ff0db15f190675d533c50c277eb1cb60e0b95e55 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
@@ -115,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
index 0c964235ae7d2c352c053b97f902bf2516263628..1d3f33f04516345ee32f16befe0d7200d2cdad00 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
@@ -115,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
index 797a073b8a74bc482a555bb12806afe36d0df79e..c86bc49b22a8cc3e004a77f4a21594aacb2c665a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt
index 7dc1fa6964eb0fcba47ae2db270152364e244eaa..2043e1a1263f0f0745b7c6446cc670fd6b0f0000 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
index c7c9b10f22dfc9799217727e5020d6f45bb488f3..ad539a7c4c5362500baef0a9c89d054762bbb47d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -94,7 +94,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'cell\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\', \'activity_regularizer\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'cell\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "add_loss"
@@ -118,7 +118,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -128,6 +128,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -158,7 +162,7 @@ tf_class {
   }
   member_method {
     name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_output_at"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt
index dedb48151a84f00f96db8942e08f5508cecfcbba..4b0e98520a0dd86c085fa7345af445e1ae253d3b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt
index bb30c0a945da7b6f869fa385eebbb8301851e8ae..34bc71af8a26ff6e4d7c81a3877751df5209906f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dd67b76523cc50409516e29f963f59d039455bfd
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -0,0 +1,186 @@
+path: "tensorflow.keras.layers.SeparableConv1D"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv1D\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv1D\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'pointwise_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'None\', \'1\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index f289664ba27063bcceb3b419e99e57066625cdbf..5d898fb2bd86b39cb8fab755382bb96cce231fa6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.SeparableConv2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv2D\'>"
   is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
@@ -127,6 +127,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bf62c095e7cc3fbeac95919a0f9fdc545efd3d25
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -0,0 +1,186 @@
+path: "tensorflow.keras.layers.SeparableConvolution1D"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv1D\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv1D\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'pointwise_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'None\', \'1\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index d78872861253f2f782a79e50e0f0a174464f388a..c758d87993b3acba88a13c7bc9eaeee929a22652 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.SeparableConvolution2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv2D\'>"
   is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
@@ -127,6 +127,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 10c7f8867cbb979e4e7a724fae41babd81d0a1ea..6e3cde3e3eaba4f9985411d66a220f7cdd4ee7ad 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -114,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 588df21088fffb1ce207132a0cf043f103f71afc..6fafc77b947d0df11755e3136ed2e7a14c148081 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -175,7 +175,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -185,6 +185,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -215,7 +219,7 @@ tf_class {
   }
   member_method {
     name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_output_at"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ee4b2fa39ed34a544ee800e9370e4f34c4a17041
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
@@ -0,0 +1,183 @@
+path: "tensorflow.keras.layers.Softmax"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.Softmax\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'axis\'], varargs=None, keywords=kwargs, defaults=[\'-1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index 9773c4acc750c59a810cc467a9239a397c62ec25..e4727072e375b9fc4dc99a1536eaaf3df5415369 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -126,6 +126,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index d4de587a4801c56ca5903bdd1169b816d008765d..c5ff7043115ccdd3bc4a1147790b20feda410f65 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -126,6 +126,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index af210fab8dc444bfb3b3f8fda0edb5121f6ad0ba..476a7f362cf88e234e964f6f6645ee4ed0cbaff8 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -126,6 +126,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index 5779e41342214cc5ec60589d6c3879a79c4a639d..90c37bd98650db42abceb9508c7dc7e564cbee68 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -118,7 +118,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -128,6 +128,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -154,7 +158,7 @@ tf_class {
   }
   member_method {
     name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_output_at"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index 8cfb33a14896b767deae34d4b76485729ef0122f..ef31c5443efa0c0e5a7a2e0a422d2a9c9c49baaf 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
index dedef65ff931618082a4a4d1fdc01e38043ce837..40aa782a02b6f2ce71860f0df5c3e61ead68e337 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -125,6 +125,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -151,7 +155,7 @@ tf_class {
   }
   member_method {
     name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_output_at"
@@ -167,7 +171,7 @@ tf_class {
   }
   member_method {
     name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_weights"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index bb42cdcb65643190f1d634e2ad23447fb40c90ee..a81b83be49e0073f242efc6890e419b4fe172ab2 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index 6d3c2ebfef8af42c288d7de6124e1ae326994c1d..5403279d45ec7b93bae7907b891c659a043e96d0 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index d790cf2e08030d3b3f362a19474fd6d1d7833c65..96c337caf28d43fabd0b90df016f4e8ab0c408db 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
index 313b3a9e155c11e46fd70f2fea0d8dec003d6667..27a54382a47dffd17810ebdcb45cb838c1442635 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -150,7 +154,7 @@ tf_class {
   }
   member_method {
     name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_output_at"
@@ -166,7 +170,7 @@ tf_class {
   }
   member_method {
     name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_weights"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index ba6c23ae75afae9177fa4f1fda34dc3f6d12939e..b81a4b1c50b22f13eacb521cfc8bc288bd40c81f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index cb587d67b0d99cf38823c2d74b833474ec4b5b10..1a26f2f3c9bbaa2aa567e76e1aafe14805ecff38 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 415720cbe11134f6b2426a2eab395566e65cbf8f..310277fe67433fd870ae3d907984f402576925b2 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -124,6 +124,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
index fe336c4be5a84a3764b550ca5ad2fcd1d3b85b94..088c8e88e26f59f2753733252882f5e0e8287fb6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
@@ -292,10 +292,18 @@ tf_module {
     name: "Reshape"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SeparableConv1D"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SeparableConv2D"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SeparableConvolution1D"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SeparableConvolution2D"
     mtype: "<type \'type\'>"
@@ -308,6 +316,10 @@ tf_module {
     name: "SimpleRNNCell"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Softmax"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SpatialDropout1D"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
index 4e522813a5a3956b4888f95b2f14ecd52d897256..d8d4eb5ca710937b7c5394a41741243babcbe0c4 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
@@ -38,6 +38,10 @@ tf_class {
     name: "input_spec"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "layers"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "losses"
     mtype: "<type \'property\'>"
@@ -108,11 +112,11 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'inputs\', \'outputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_update"
@@ -120,7 +124,7 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
   }
   member_method {
     name: "add_weight"
@@ -146,6 +150,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -156,15 +164,15 @@ tf_class {
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -224,7 +232,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
index ddbb358c84ca50fceb4fb71eddf0083f034f65e1..2e044d78bb2cd6c0ac817218480565c785d11ddc 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
@@ -39,6 +39,10 @@ tf_class {
     name: "input_spec"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "layers"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "losses"
     mtype: "<type \'property\'>"
@@ -125,7 +129,7 @@ tf_class {
   }
   member_method {
     name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_update"
@@ -133,7 +137,7 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
   }
   member_method {
     name: "add_weight"
@@ -159,6 +163,10 @@ tf_class {
     name: "compute_mask"
     argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -169,7 +177,7 @@ tf_class {
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit"
@@ -177,7 +185,7 @@ tf_class {
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -245,7 +253,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adadelta.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adadelta.pbtxt
index ed040c15864b4f4c386d2d9e1f664d35d651fa14..32667cf31e4aaacf3374ca4a434f32eec5b3e07e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adadelta.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adadelta.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'rho\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'1.0\', \'0.95\', \'1e-08\', \'0.0\'], "
+    argspec: "args=[\'self\', \'lr\', \'rho\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'1.0\', \'0.95\', \'None\', \'0.0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adagrad.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adagrad.pbtxt
index a24651429a3db49a96b217259c5c6ef09efed2f2..efca59e8e427d28de36446a49ea4e1ca0bb385eb 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adagrad.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adagrad.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.01\', \'1e-08\', \'0.0\'], "
+    argspec: "args=[\'self\', \'lr\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.01\', \'None\', \'0.0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adam.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adam.pbtxt
index a0d978fded3825bafcd8d60e34677029495b1245..5546e2067ab65abce928d609b41b65bbc40246f6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adam.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adam.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-08\', \'0.0\'], "
+    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'decay\', \'amsgrad\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'None\', \'0.0\', \'False\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adamax.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adamax.pbtxt
index 1b70c93ad5f0a8fd52d65fb4b8132a87878c26dd..aaa54a106066266d0a7c19f4609e4cc7ed766d95 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adamax.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-adamax.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.002\', \'0.9\', \'0.999\', \'1e-08\', \'0.0\'], "
+    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.002\', \'0.9\', \'0.999\', \'None\', \'0.0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-nadam.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-nadam.pbtxt
index b49dbe5cf82ea838076134a0feecc120bfb88f84..1fada7fd9c6eefbb16f1b5a042e6fea607a461a9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-nadam.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-nadam.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'schedule_decay\'], varargs=None, keywords=kwargs, defaults=[\'0.002\', \'0.9\', \'0.999\', \'1e-08\', \'0.004\'], "
+    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'schedule_decay\'], varargs=None, keywords=kwargs, defaults=[\'0.002\', \'0.9\', \'0.999\', \'None\', \'0.004\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-r-m-sprop.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
index c8860d80d40353211df65f08fda5deb26af91d66..fd3f97f35dcb18c82188c51345c2e3276a88f23f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'rho\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'1e-08\', \'0.0\'], "
+    argspec: "args=[\'self\', \'lr\', \'rho\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'None\', \'0.0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt
index 66cd37bb3a378ccd1bbdffd79f87338c9b4cf265..04174bff5f04fead68af68afeec80316867009a4 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'directory\', \'image_data_generator\', \'target_size\', \'color_mode\', \'classes\', \'class_mode\', \'batch_size\', \'shuffle\', \'seed\', \'data_format\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'follow_links\'], varargs=None, keywords=None, defaults=[\'(256, 256)\', \'rgb\', \'None\', \'categorical\', \'32\', \'True\', \'None\', \'None\', \'None\', \'\', \'png\', \'False\'], "
+    argspec: "args=[\'self\', \'directory\', \'image_data_generator\', \'target_size\', \'color_mode\', \'classes\', \'class_mode\', \'batch_size\', \'shuffle\', \'seed\', \'data_format\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'follow_links\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'(256, 256)\', \'rgb\', \'None\', \'categorical\', \'32\', \'True\', \'None\', \'None\', \'None\', \'\', \'png\', \'False\', \'nearest\'], "
   }
   member_method {
     name: "next"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt
index 7e33285e7abbc10df7f697e10071e429c5183d9e..41f27d1f740457f4b7c4f74cb089a448a0fed845 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-image-data-generator.pbtxt
@@ -16,7 +16,7 @@ tf_class {
   }
   member_method {
     name: "flow_from_directory"
-    argspec: "args=[\'self\', \'directory\', \'target_size\', \'color_mode\', \'classes\', \'class_mode\', \'batch_size\', \'shuffle\', \'seed\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'follow_links\'], varargs=None, keywords=None, defaults=[\'(256, 256)\', \'rgb\', \'None\', \'categorical\', \'32\', \'True\', \'None\', \'None\', \'\', \'png\', \'False\'], "
+    argspec: "args=[\'self\', \'directory\', \'target_size\', \'color_mode\', \'classes\', \'class_mode\', \'batch_size\', \'shuffle\', \'seed\', \'save_to_dir\', \'save_prefix\', \'save_format\', \'follow_links\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'(256, 256)\', \'rgb\', \'None\', \'categorical\', \'32\', \'True\', \'None\', \'None\', \'\', \'png\', \'False\', \'nearest\'], "
   }
   member_method {
     name: "random_transform"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.-tokenizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.-tokenizer.pbtxt
index 5bc8c4012049b0414936fb56a853fc32430df3d9..ce91caa1afe081ccf05ecdd4884a3e29ea93d496 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.-tokenizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.text.-tokenizer.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_words\', \'filters\', \'lower\', \'split\', \'char_level\'], varargs=None, keywords=None, defaults=[\'None\', \'!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n\', \'True\', \' \', \'False\'], "
+    argspec: "args=[\'self\', \'num_words\', \'filters\', \'lower\', \'split\', \'char_level\', \'oov_token\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n\', \'True\', \' \', \'False\', \'None\'], "
   }
   member_method {
     name: "fit_on_sequences"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.-progbar.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.utils.-progbar.pbtxt
index 3adc6b6faa6f62330f9ac3d621f29adfc380a09d..16e1cbe650e1662f8694fd7137ad20a48a90675b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.utils.-progbar.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.utils.-progbar.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'target\', \'width\', \'verbose\', \'interval\'], varargs=None, keywords=None, defaults=[\'30\', \'1\', \'0.05\'], "
+    argspec: "args=[\'self\', \'target\', \'width\', \'verbose\', \'interval\', \'stateful_metrics\'], varargs=None, keywords=None, defaults=[\'30\', \'1\', \'0.05\', \'None\'], "
   }
   member_method {
     name: "add"
@@ -12,6 +12,6 @@ tf_class {
   }
   member_method {
     name: "update"
-    argspec: "args=[\'self\', \'current\', \'values\', \'force\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
+    argspec: "args=[\'self\', \'current\', \'values\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt
index 6e595ca34385d14f3ea7eb0da9a633f6f308f72f..de81206bc8b25046cd48c79ff8f154041c0e0cb0 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt
@@ -108,6 +108,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt
index 7b6c30773b95984ee8438820a45bf2c607a912ff..72d5496464210efd9e423996dfb274dd9564f761 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt
@@ -108,6 +108,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt
index 7a7664e80013557c922a1d399de16e32a78f60ff..595e77ff9f8b64b6606fb075f3edf2281b4c3c1f 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt
@@ -108,6 +108,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt
index c9f5c18f25628d8b1d575113232da8a75d0e428c..0c4aa2ff2612269727026141574726ad6df5cdbd 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt
@@ -107,6 +107,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt
index 1fa00d7b2f9d3d34861e2030a98487d660e81305..5f576d0189309442dc4cea3d3617ab3144420165 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt
@@ -108,6 +108,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt
index a92a1094ac0c042e6ab9a2d153e8a06ab183d0ec..675a7c76e569d3163ecd2c547841b4c36078b21d 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt
@@ -109,6 +109,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt
index 7fa78ab20b1260c1eb87293e200015c7b2895b19..eaabbf6aab172aea5c51f8071076890bb6b5bcf7 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt
@@ -108,6 +108,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt
index e92e4859ae5b179f8b2a2328219aa6f16d740903..838e070d79d2d7cfbd631f1a5e9960412cfdae5a 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt
@@ -109,6 +109,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt
index 87e5c2949e681e224efc94265559c31256082f1e..4bd8cfc1a48cd839e2ffa54d0d0ca863060406d8 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt
@@ -108,6 +108,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt
index cc4ee4c8a5ec6bc8f2395fedb8aed8d334342013..57eccb03ffeb90652b019b5ce8a519797e4a3a3d 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt
@@ -107,6 +107,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt
index 99ab2ef97c73bbd305a3755b78e8174b643fe0a0..a1ec00eeeaa98a6199e29b187b0760ddc92db09d 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt
@@ -107,6 +107,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt
index f4074c5a4f6b45896f49e295830c91f58b46c84a..a06943d51a52f1951056136445b0d5786d801b5b 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt
@@ -107,6 +107,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt
index ec51609dee9bedb75566163e35225a1797d4cd5c..24fda0c87ed0aeabd0fd4a16bb2efab444f8cd8a 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt
@@ -106,6 +106,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt
index 745c532e94beda1280e366bf592d347a5275ad11..4c3d00e0e1ddfe95c56f9ebc7c5d609c79dd44d4 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt
@@ -108,6 +108,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt
index f8244c01b64105d0c4467c3f90ccec4e2d06adb4..f7e2017b0c9438130f1cfb2431eb73ca4d3103c5 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt
@@ -108,6 +108,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt
index df5378f279bbeb254f4a9fee2724b07baee87203..84780926a38ff811a5ab35fadfac690a6dbbbbe2 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt
@@ -108,6 +108,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..05799ecfc9fdb9ff44620a67dcdbdc4426fddced
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt
@@ -0,0 +1,144 @@
+path: "tensorflow.layers.SeparableConv1D"
+tf_class {
+  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv1D\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'pointwise_constraint\', \'bias_constraint\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'1\', \'None\', \'True\', \'None\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt
index c55d2bccc9b10d142d82073013639121dc45ebf1..c2aeb35c4648bcce22ca73c838a85803a6b9cedf 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.layers.SeparableConv2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<type \'object\'>"
@@ -109,6 +109,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.pbtxt
index c45d6e6c05054f1c0c61caeaf5e9a3fd7d00983f..59134f84891ad5518dcb5331ce04475482c8b59e 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.pbtxt
@@ -68,6 +68,10 @@ tf_module {
     name: "MaxPooling3D"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SeparableConv1D"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SeparableConv2D"
     mtype: "<type \'type\'>"
@@ -136,6 +140,10 @@ tf_module {
     name: "max_pooling3d"
     argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], "
   }
+  member_method {
+    name: "separable_conv1d"
+    argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'pointwise_constraint\', \'bias_constraint\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'1\', \'None\', \'True\', \'None\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
   member_method {
     name: "separable_conv2d"
     argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'pointwise_constraint\', \'bias_constraint\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'(1, 1)\', \'valid\', \'channels_last\', \'(1, 1)\', \'1\', \'None\', \'True\', \'None\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
index 62e634afb87b9dcc02ab0ceaaa7bdff62f9bfefa..1d9c0c0f6d28dfb1a218586075bcb6820b1c62b1 100644
--- a/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
@@ -88,6 +88,10 @@ tf_module {
     name: "logdet"
     argspec: "args=[\'matrix\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "logm"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "lstsq"
     argspec: "args=[\'matrix\', \'rhs\', \'l2_regularizer\', \'fast\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'True\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.losses.-reduction.pbtxt b/tensorflow/tools/api/golden/tensorflow.losses.-reduction.pbtxt
index 4bdc73370bffb3c44945fc5c9e4fbafcdd72255e..258ad5047eb6e82eeb9c0941b0acf0573e5ca61d 100644
--- a/tensorflow/tools/api/golden/tensorflow.losses.-reduction.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.losses.-reduction.pbtxt
@@ -18,6 +18,14 @@ tf_class {
     name: "SUM_BY_NONZERO_WEIGHTS"
     mtype: "<type \'str\'>"
   }
+  member {
+    name: "SUM_OVER_BATCH_SIZE"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "SUM_OVER_NONZERO_WEIGHTS"
+    mtype: "<type \'str\'>"
+  }
   member_method {
     name: "__init__"
   }
diff --git a/tensorflow/tools/api/golden/tensorflow.manip.pbtxt b/tensorflow/tools/api/golden/tensorflow.manip.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0b84165285102daf0a8e3dd6542bfc391e50f77b
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.manip.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.manip"
+tf_module {
+  member_method {
+    name: "roll"
+    argspec: "args=[\'input\', \'shift\', \'axis\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
index ebd9c079b543e79eb0d6cfa369394362e9a8825f..455590d866a4c1ebea65ccff51e34f2e0b0479d7 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
@@ -54,15 +54,15 @@ tf_module {
   }
   member_method {
     name: "conv2d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "conv2d_backprop_filter"
-    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "conv2d_backprop_input"
-    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'None\'], "
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "conv2d_transpose"
@@ -70,11 +70,11 @@ tf_module {
   }
   member_method {
     name: "conv3d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "conv3d_backprop_filter_v2"
-    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "conv3d_transpose"
@@ -86,7 +86,7 @@ tf_module {
   }
   member_method {
     name: "crelu"
-    argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'features\', \'name\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\'], "
   }
   member_method {
     name: "ctc_beam_search_decoder"
@@ -106,15 +106,15 @@ tf_module {
   }
   member_method {
     name: "depthwise_conv2d_native"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "depthwise_conv2d_native_backprop_filter"
-    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "depthwise_conv2d_native_backprop_input"
-    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "dilation2d"
@@ -234,7 +234,7 @@ tf_module {
   }
   member_method {
     name: "quantized_conv2d"
-    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'strides\', \'padding\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "quantized_max_pool"
@@ -262,7 +262,7 @@ tf_module {
   }
   member_method {
     name: "sampled_softmax_loss"
-    argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'partition_strategy\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'True\', \'mod\', \'sampled_softmax_loss\'], "
+    argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'partition_strategy\', \'name\', \'seed\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'True\', \'mod\', \'sampled_softmax_loss\', \'None\'], "
   }
   member_method {
     name: "selu"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
index 49066eecaa0fda4a7a60c62b7a087d054bd73079..44536787f09fc98bba8a4eb0bc562427cfe48b8b 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.nn.rnn_cell.BasicLSTMCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl._LayerRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<type \'object\'>"
@@ -117,6 +117,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
index bf38f678b69269e0b0a99b7812a9a304d7aaec1d..768565d3cacbd1313ee5a64c9b15f9ab70683772 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.nn.rnn_cell.BasicRNNCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.BasicRNNCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl._LayerRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<type \'object\'>"
@@ -117,6 +117,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 81dcd90e81e9185f087892a5ebda0bb8460b0d8d..0d253e5dd233d6d2b6ad0070a463c283a8769dab 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -116,6 +116,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
index 8ff225897ae26adb3723aaf729030771e26833a4..97edf245f6fbed393a6fb8dbf1e83649e9ac4b4e 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "weights"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "wrapped_cell"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'cell\', \'input_keep_prob\', \'output_keep_prob\', \'state_keep_prob\', \'variational_recurrent\', \'input_size\', \'dtype\', \'seed\', \'dropout_state_filter_visitor\'], varargs=None, keywords=None, defaults=[\'1.0\', \'1.0\', \'1.0\', \'False\', \'None\', \'None\', \'None\', \'None\'], "
@@ -116,6 +120,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
index ba15ffb792d81177040b078865134b0de7ca7a99..6ecc134d4df866ab5d59e238a8157064421579bd 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.nn.rnn_cell.GRUCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.GRUCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl._LayerRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<type \'object\'>"
@@ -117,6 +117,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
index 8d17153972cfd99072eee1db56728e67b98db0da..4b3ca1578ba52f30e3405ff198fb716496a462c6 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.nn.rnn_cell.LSTMCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LSTMCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl._LayerRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<type \'object\'>"
@@ -117,6 +117,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
index 68c3064dd4f2f1453102cffd078e6a2e5356e0d5..9a6c73a079884b8ab92be1c9e89b2a9f34aad851 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
@@ -116,6 +116,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index 86ff0fee2b369fb77bdcba6b19dc89f39a48642b..27488f8e73f20456fae911511ecd2e41a60da351 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -115,6 +115,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index 1a6f8a3b7dc1990b83f518ee1970ab36b2594fda..3310836ed26387718115c2454300b9edfe930451 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -116,6 +116,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index 0edd4153d772459d941cb260c26fd9e09f017f12..066c4513ff5185b50bdf193f579e71e505dbd3b6 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -124,6 +124,10 @@ tf_module {
     name: "LogMessage"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
+  member {
+    name: "MONOLITHIC_BUILD"
+    mtype: "<type \'int\'>"
+  }
   member {
     name: "MetaGraphDef"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -392,6 +396,10 @@ tf_module {
     name: "losses"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "manip"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "metrics"
     mtype: "<type \'module\'>"
@@ -652,6 +660,10 @@ tf_module {
     name: "assert_less_equal"
     argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "assert_near"
+    argspec: "args=[\'x\', \'y\', \'rtol\', \'atol\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "assert_negative"
     argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
@@ -1140,6 +1152,10 @@ tf_module {
     name: "group"
     argspec: "args=[], varargs=inputs, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "guarantee_const"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "hessians"
     argspec: "args=[\'ys\', \'xs\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\'], varargs=None, keywords=None, defaults=[\'hessians\', \'False\', \'False\', \'None\'], "
@@ -1148,6 +1164,10 @@ tf_module {
     name: "histogram_fixed_width"
     argspec: "args=[\'values\', \'value_range\', \'nbins\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'100\', \"<dtype: \'int32\'>\", \'None\'], "
   }
+  member_method {
+    name: "histogram_fixed_width_bins"
+    argspec: "args=[\'values\', \'value_range\', \'nbins\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'100\', \"<dtype: \'int32\'>\", \'None\'], "
+  }
   member_method {
     name: "identity"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1394,7 +1414,7 @@ tf_module {
   }
   member_method {
     name: "multinomial"
-    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'name\', \'output_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "multiply"
@@ -1706,11 +1726,11 @@ tf_module {
   }
   member_method {
     name: "serialize_many_sparse"
-    argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'sp_input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'string\'>\"], "
   }
   member_method {
     name: "serialize_sparse"
-    argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'sp_input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'string\'>\"], "
   }
   member_method {
     name: "serialize_tensor"
@@ -1838,15 +1858,15 @@ tf_module {
   }
   member_method {
     name: "sparse_segment_mean"
-    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "sparse_segment_sqrt_n"
-    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "sparse_segment_sum"
-    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "sparse_slice"
@@ -2028,10 +2048,26 @@ tf_module {
     name: "unique_with_counts"
     argspec: "args=[\'x\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
   }
+  member_method {
+    name: "unravel_index"
+    argspec: "args=[\'indices\', \'dims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "unsorted_segment_max"
     argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "unsorted_segment_min"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "unsorted_segment_prod"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "unsorted_segment_sqrt_n"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "unsorted_segment_sum"
     argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -2062,7 +2098,7 @@ tf_module {
   }
   member_method {
     name: "while_loop"
-    argspec: "args=[\'cond\', \'body\', \'loop_vars\', \'shape_invariants\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\'], "
+    argspec: "args=[\'cond\', \'body\', \'loop_vars\', \'shape_invariants\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\', \'maximum_iterations\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\', \'None\'], "
   }
   member_method {
     name: "write_file"
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.-profiler.pbtxt b/tensorflow/tools/api/golden/tensorflow.profiler.-profiler.pbtxt
index 0fb363aca48031e13487d716a0375973f93b3dc8..acb61dae9f0d184ba998aa820ec40de5bc38c3eb 100644
--- a/tensorflow/tools/api/golden/tensorflow.profiler.-profiler.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.profiler.-profiler.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'graph\', \'op_log\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'graph\', \'op_log\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "add_step"
@@ -30,4 +30,8 @@ tf_class {
     name: "profile_python"
     argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "serialize_to_string"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.profiler.pbtxt b/tensorflow/tools/api/golden/tensorflow.profiler.pbtxt
index 26b25ee3d47241dbf351018f2aacbda12ff33492..7b4d3ac522abc4229c5623da25c4ec818d86f829 100644
--- a/tensorflow/tools/api/golden/tensorflow.profiler.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.profiler.pbtxt
@@ -26,11 +26,11 @@ tf_module {
   }
   member_method {
     name: "advise"
-    argspec: "args=[\'graph\', \'run_meta\', \'options\'], varargs=None, keywords=None, defaults=[\'None\', \'0\'], "
+    argspec: "args=[\'graph\', \'run_meta\', \'options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0\'], "
   }
   member_method {
     name: "profile"
-    argspec: "args=[\'graph\', \'run_meta\', \'op_log\', \'cmd\', \'options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'scope\', \'0\'], "
+    argspec: "args=[\'graph\', \'run_meta\', \'op_log\', \'cmd\', \'options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'scope\', \'0\'], "
   }
   member_method {
     name: "write_op_log"
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.builder.-saved-model-builder.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
index 56d76902fd0fe72ced6c0267295d9a9dc822a745..ca8e5884b18110d4293225e595c030e9629b5663 100644
--- a/tensorflow/tools/api/golden/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
@@ -8,11 +8,11 @@ tf_class {
   }
   member_method {
     name: "add_meta_graph"
-    argspec: "args=[\'self\', \'tags\', \'signature_def_map\', \'assets_collection\', \'legacy_init_op\', \'clear_devices\', \'main_op\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'tags\', \'signature_def_map\', \'assets_collection\', \'legacy_init_op\', \'clear_devices\', \'main_op\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'None\', \'False\'], "
   }
   member_method {
     name: "add_meta_graph_and_variables"
-    argspec: "args=[\'self\', \'sess\', \'tags\', \'signature_def_map\', \'assets_collection\', \'legacy_init_op\', \'clear_devices\', \'main_op\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'sess\', \'tags\', \'signature_def_map\', \'assets_collection\', \'legacy_init_op\', \'clear_devices\', \'main_op\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'None\', \'False\'], "
   }
   member_method {
     name: "save"
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.pbtxt
index 5683766b28975a3a17da3cdbfbaa4e8baab5f3ba..e1a0385092c1384bcb5958fce2e24693ee731ae5 100644
--- a/tensorflow/tools/api/golden/tensorflow.saved_model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.saved_model.pbtxt
@@ -32,4 +32,8 @@ tf_module {
     name: "utils"
     mtype: "<type \'module\'>"
   }
+  member_method {
+    name: "simple_save"
+    argspec: "args=[\'session\', \'export_dir\', \'inputs\', \'outputs\', \'legacy_init_op\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.pbtxt
index 326e077d396bc5e3463bba3818f4757127ee0370..871ebb5247f62e9300566da063e4dadeb5087091 100644
--- a/tensorflow/tools/api/golden/tensorflow.summary.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.summary.pbtxt
@@ -50,7 +50,7 @@ tf_module {
   }
   member_method {
     name: "merge_all"
-    argspec: "args=[\'key\'], varargs=None, keywords=None, defaults=[\'summaries\'], "
+    argspec: "args=[\'key\', \'scope\'], varargs=None, keywords=None, defaults=[\'summaries\', \'None\'], "
   }
   member_method {
     name: "scalar"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-saver.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-saver.pbtxt
index 04c11712cd4c200bb2c04342e66924abf59c5f73..2cda458f468b2d748b43954b14b670df7145243f 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-saver.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-saver.pbtxt
@@ -20,7 +20,7 @@ tf_class {
   }
   member_method {
     name: "export_meta_graph"
-    argspec: "args=[\'self\', \'filename\', \'collection_list\', \'as_text\', \'export_scope\', \'clear_devices\', \'clear_extraneous_savers\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\', \'False\', \'False\'], "
+    argspec: "args=[\'self\', \'filename\', \'collection_list\', \'as_text\', \'export_scope\', \'clear_devices\', \'clear_extraneous_savers\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "from_proto"
@@ -36,7 +36,7 @@ tf_class {
   }
   member_method {
     name: "save"
-    argspec: "args=[\'self\', \'sess\', \'save_path\', \'global_step\', \'latest_filename\', \'meta_graph_suffix\', \'write_meta_graph\', \'write_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'meta\', \'True\', \'True\'], "
+    argspec: "args=[\'self\', \'sess\', \'save_path\', \'global_step\', \'latest_filename\', \'meta_graph_suffix\', \'write_meta_graph\', \'write_state\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'meta\', \'True\', \'True\', \'False\'], "
   }
   member_method {
     name: "set_last_checkpoints"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.pbtxt
index 3ffc6407306b4e44ec23052187b6f9376bba833c..e49c719a334455d1f8f39fa67332be8bb81f2bc2 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.pbtxt
@@ -266,7 +266,11 @@ tf_module {
   }
   member_method {
     name: "cosine_decay"
-    argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], "
+  }
+  member_method {
+    name: "cosine_decay_restarts"
+    argspec: "args=[\'learning_rate\', \'global_step\', \'first_decay_steps\', \'t_mul\', \'m_mul\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'2.0\', \'1.0\', \'0.0\', \'None\'], "
   }
   member_method {
     name: "create_global_step"
@@ -282,7 +286,7 @@ tf_module {
   }
   member_method {
     name: "export_meta_graph"
-    argspec: "args=[\'filename\', \'meta_info_def\', \'graph_def\', \'saver_def\', \'collection_list\', \'as_text\', \'graph\', \'export_scope\', \'clear_devices\', \'clear_extraneous_savers\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'None\', \'None\', \'False\', \'False\'], "
+    argspec: "args=[\'filename\', \'meta_info_def\', \'graph_def\', \'saver_def\', \'collection_list\', \'as_text\', \'graph\', \'export_scope\', \'clear_devices\', \'clear_extraneous_savers\', \'strip_default_attrs\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'None\', \'None\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "generate_checkpoint_state_proto"
diff --git a/tensorflow/tools/api/golden/tensorflow.variable_scope.pbtxt b/tensorflow/tools/api/golden/tensorflow.variable_scope.pbtxt
index de1ad7e860a616f6737cd451b9c7d90d1ab079c9..e62dec93e6f06a10f48d72b0cda74426887806fb 100644
--- a/tensorflow/tools/api/golden/tensorflow.variable_scope.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.variable_scope.pbtxt
@@ -4,6 +4,6 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'name_or_scope\', \'default_name\', \'values\', \'initializer\', \'regularizer\', \'caching_device\', \'partitioner\', \'custom_getter\', \'reuse\', \'dtype\', \'use_resource\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'name_or_scope\', \'default_name\', \'values\', \'initializer\', \'regularizer\', \'caching_device\', \'partitioner\', \'custom_getter\', \'reuse\', \'dtype\', \'use_resource\', \'constraint\', \'auxiliary_name_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/tests/BUILD b/tensorflow/tools/api/tests/BUILD
index f80dd6fe5b6a70b4198fff8da7b457645452b3e1..608a34ab7b32bdc26cebbe43b383155406fb51b2 100644
--- a/tensorflow/tools/api/tests/BUILD
+++ b/tensorflow/tools/api/tests/BUILD
@@ -17,10 +17,6 @@ py_test(
     name = "api_compatibility_test",
     srcs = ["api_compatibility_test.py"],
     data = [
-        ":convert_from_multiline",
-        "//tensorflow/core:base_api_def",
-        "//tensorflow/core:python_api_def",
-        "//tensorflow/python:hidden_ops",
         "//tensorflow/tools/api/golden:api_golden",
         "//tensorflow/tools/api/tests:API_UPDATE_WARNING.txt",
         "//tensorflow/tools/api/tests:README.txt",
@@ -29,7 +25,6 @@ py_test(
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:lib",
         "//tensorflow/python:platform",
         "//tensorflow/tools/api/lib:python_object_to_proto_visitor",
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index a8fdf4c9a07a21269920c61d7f560562dab7b5f4..c1e09cc531ed8e8995e3e73b87e96b72fba6c038 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -28,10 +28,8 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
-from collections import defaultdict
 import os
 import re
-import subprocess
 import sys
 import unittest
 
@@ -39,7 +37,6 @@ import tensorflow as tf
 
 from google.protobuf import text_format
 
-from tensorflow.core.framework import api_def_pb2
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.platform import test
@@ -67,11 +64,6 @@ _API_GOLDEN_FOLDER = 'tensorflow/tools/api/golden'
 _TEST_README_FILE = 'tensorflow/tools/api/tests/README.txt'
 _UPDATE_WARNING_FILE = 'tensorflow/tools/api/tests/API_UPDATE_WARNING.txt'
 
-_CONVERT_FROM_MULTILINE_SCRIPT = 'tensorflow/tools/api/tests/convert_from_multiline'
-_BASE_API_DIR = 'tensorflow/core/api_def/base_api'
-_PYTHON_API_DIR = 'tensorflow/core/api_def/python_api'
-_HIDDEN_OPS_FILE = 'tensorflow/python/ops/hidden_ops.txt'
-
 
 def _KeyToFilePath(key):
   """From a given key, construct a filepath."""
@@ -96,55 +88,6 @@ def _FileNameToKey(filename):
   return api_object_key
 
 
-def _GetSymbol(symbol_id):
-  """Get TensorFlow symbol based on the given identifier.
-
-  Args:
-    symbol_id: Symbol identifier in the form module1.module2. ... .sym.
-
-  Returns:
-    Symbol corresponding to the given id.
-  """
-  # Ignore first module which should be tensorflow
-  symbol_id_split = symbol_id.split('.')[1:]
-  symbol = tf
-  for sym in symbol_id_split:
-    symbol = getattr(symbol, sym)
-  return symbol
-
-
-def _IsGenModule(module_name):
-  if not module_name:
-    return False
-  module_name_split = module_name.split('.')
-  return module_name_split[-1].startswith('gen_')
-
-
-def _GetHiddenOps():
-  hidden_ops_file = file_io.FileIO(_HIDDEN_OPS_FILE, 'r')
-  hidden_ops = set()
-  for line in hidden_ops_file:
-    line = line.strip()
-    if not line:
-      continue
-    if line[0] == '#':  # comment line
-      continue
-    # If line is of the form "op_name # comment", only keep the op_name.
-    line_split = line.split('#')
-    hidden_ops.add(line_split[0].strip())
-  return hidden_ops
-
-
-def _GetGoldenApiDefs():
-  old_api_def_files = file_io.get_matching_files(_GetApiDefFilePath('*'))
-  return {file_path: file_io.read_file_to_string(file_path)
-          for file_path in old_api_def_files}
-
-
-def _GetApiDefFilePath(graph_op_name):
-  return os.path.join(_PYTHON_API_DIR, 'api_def_%s.pbtxt' % graph_op_name)
-
-
 class ApiCompatibilityTest(test.TestCase):
 
   def __init__(self, *args, **kwargs):
@@ -248,14 +191,15 @@ class ApiCompatibilityTest(test.TestCase):
       logging.info('No differences found between API and golden.')
 
   @unittest.skipUnless(
-      sys.version_info.major == 2 and os.uname()[0] == 'Linux',
-      'API compabitility test goldens are generated using python2 on Linux.')
+      sys.version_info.major == 2,
+      'API compabitility test goldens are generated using python2.')
   def testAPIBackwardsCompatibility(self):
     # Extract all API stuff.
     visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()
 
     public_api_visitor = public_api.PublicAPIVisitor(visitor)
     public_api_visitor.do_not_descend_map['tf'].append('contrib')
+    public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
     traverse.traverse(tf, public_api_visitor)
 
     proto_dict = visitor.GetProtos()
@@ -286,188 +230,6 @@ class ApiCompatibilityTest(test.TestCase):
         update_goldens=FLAGS.update_goldens)
 
 
-class ApiDefTest(test.TestCase):
-
-  def __init__(self, *args, **kwargs):
-    super(ApiDefTest, self).__init__(*args, **kwargs)
-    self._first_cap_pattern = re.compile('(.)([A-Z][a-z]+)')
-    self._all_cap_pattern = re.compile('([a-z0-9])([A-Z])')
-
-  def _GenerateLowerCaseOpName(self, op_name):
-    lower_case_name = self._first_cap_pattern.sub(r'\1_\2', op_name)
-    return self._all_cap_pattern.sub(r'\1_\2', lower_case_name).lower()
-
-  def _CreatePythonApiDef(self, base_api_def, endpoint_names):
-    """Creates Python ApiDef that overrides base_api_def if needed.
-
-    Args:
-      base_api_def: (api_def_pb2.ApiDef) base ApiDef instance.
-      endpoint_names: List of Python endpoint names.
-
-    Returns:
-      api_def_pb2.ApiDef instance with overrides for base_api_def
-      if module.name endpoint is different from any existing
-      endpoints in base_api_def. Otherwise, returns None.
-    """
-    endpoint_names_set = set(endpoint_names)
-
-    # If the only endpoint is equal to graph_op_name then
-    # it is equivalent to having no endpoints.
-    if (not base_api_def.endpoint and len(endpoint_names) == 1
-        and endpoint_names[0] ==
-        self._GenerateLowerCaseOpName(base_api_def.graph_op_name)):
-      return None
-
-    base_endpoint_names_set = {
-        self._GenerateLowerCaseOpName(endpoint.name)
-        for endpoint in base_api_def.endpoint}
-
-    if endpoint_names_set == base_endpoint_names_set:
-      return None  # All endpoints are the same
-
-    api_def = api_def_pb2.ApiDef()
-    api_def.graph_op_name = base_api_def.graph_op_name
-
-    for endpoint_name in sorted(endpoint_names):
-      new_endpoint = api_def.endpoint.add()
-      new_endpoint.name = endpoint_name
-
-    return api_def
-
-  def _GetBaseApiMap(self):
-    """Get a map from graph op name to its base ApiDef.
-
-    Returns:
-      Dictionary mapping graph op name to corresponding ApiDef.
-    """
-    # Convert base ApiDef in Multiline format to Proto format.
-    converted_base_api_dir = os.path.join(
-        test.get_temp_dir(), 'temp_base_api_defs')
-    subprocess.check_call(
-        [os.path.join(resource_loader.get_root_dir_with_all_resources(),
-                      _CONVERT_FROM_MULTILINE_SCRIPT),
-         _BASE_API_DIR, converted_base_api_dir])
-
-    name_to_base_api_def = {}
-    base_api_files = file_io.get_matching_files(
-        os.path.join(converted_base_api_dir, 'api_def_*.pbtxt'))
-    for base_api_file in base_api_files:
-      if file_io.file_exists(base_api_file):
-        api_defs = api_def_pb2.ApiDefs()
-        text_format.Merge(
-            file_io.read_file_to_string(base_api_file), api_defs)
-        for api_def in api_defs.op:
-          name_to_base_api_def[api_def.graph_op_name] = api_def
-    return name_to_base_api_def
-
-  def _AddHiddenOpOverrides(self, name_to_base_api_def, api_def_map):
-    """Adds ApiDef overrides to api_def_map for hidden Python ops.
-
-    Args:
-      name_to_base_api_def: Map from op name to base api_def_pb2.ApiDef.
-      api_def_map: Map from file path to api_def_pb2.ApiDefs for Python API
-        overrides.
-    """
-    hidden_ops = _GetHiddenOps()
-    for hidden_op in hidden_ops:
-      if hidden_op not in name_to_base_api_def:
-        logging.warning('Unexpected hidden op name: %s' % hidden_op)
-        continue
-
-      base_api_def = name_to_base_api_def[hidden_op]
-      if base_api_def.visibility != api_def_pb2.ApiDef.HIDDEN:
-        api_def = api_def_pb2.ApiDef()
-        api_def.graph_op_name = base_api_def.graph_op_name
-        api_def.visibility = api_def_pb2.ApiDef.HIDDEN
-
-        file_path = _GetApiDefFilePath(base_api_def.graph_op_name)
-        api_def_map[file_path].op.extend([api_def])
-
-  @unittest.skipUnless(
-      sys.version_info.major == 2 and os.uname()[0] == 'Linux',
-      'API compabitility test goldens are generated using python2 on Linux.')
-  def testAPIDefCompatibility(self):
-    # Get base ApiDef
-    name_to_base_api_def = self._GetBaseApiMap()
-    snake_to_camel_graph_op_names = {
-        self._GenerateLowerCaseOpName(name): name
-        for name in name_to_base_api_def.keys()}
-    # Extract Python API
-    visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()
-    public_api_visitor = public_api.PublicAPIVisitor(visitor)
-    public_api_visitor.do_not_descend_map['tf'].append('contrib')
-    traverse.traverse(tf, public_api_visitor)
-    proto_dict = visitor.GetProtos()
-
-    # Map from file path to Python ApiDefs.
-    new_api_defs_map = defaultdict(api_def_pb2.ApiDefs)
-    # We need to override all endpoints even if 1 endpoint differs from base
-    # ApiDef. So, we first create a map from an op to all its endpoints.
-    op_to_endpoint_name = defaultdict(list)
-
-    # Generate map from generated python op to endpoint names.
-    for public_module, value in proto_dict.items():
-      module_obj = _GetSymbol(public_module)
-      for sym in value.tf_module.member_method:
-        obj = getattr(module_obj, sym.name)
-
-        # Check if object is defined in gen_* module. That is,
-        # the object has been generated from OpDef.
-        if hasattr(obj, '__module__') and _IsGenModule(obj.__module__):
-          if obj.__name__ not in snake_to_camel_graph_op_names:
-            # Symbol might be defined only in Python and not generated from
-            # C++ api.
-            continue
-          relative_public_module = public_module[len('tensorflow.'):]
-          full_name = (relative_public_module + '.' + sym.name
-                       if relative_public_module else sym.name)
-          op_to_endpoint_name[obj].append(full_name)
-
-    # Generate Python ApiDef overrides.
-    for op, endpoint_names in op_to_endpoint_name.items():
-      graph_op_name = snake_to_camel_graph_op_names[op.__name__]
-      api_def = self._CreatePythonApiDef(
-          name_to_base_api_def[graph_op_name], endpoint_names)
-
-      if api_def:
-        file_path = _GetApiDefFilePath(graph_op_name)
-        api_defs = new_api_defs_map[file_path]
-        api_defs.op.extend([api_def])
-
-    self._AddHiddenOpOverrides(name_to_base_api_def, new_api_defs_map)
-
-    old_api_defs_map = _GetGoldenApiDefs()
-    for file_path, new_api_defs in new_api_defs_map.items():
-      # Get new ApiDef string.
-      new_api_defs_str = str(new_api_defs)
-
-      # Get current ApiDef for the given file.
-      old_api_defs_str = (
-          old_api_defs_map[file_path] if file_path in old_api_defs_map else '')
-
-      if old_api_defs_str == new_api_defs_str:
-        continue
-
-      if FLAGS.update_goldens:
-        logging.info('Updating %s...' % file_path)
-        file_io.write_string_to_file(file_path, new_api_defs_str)
-      else:
-        self.assertMultiLineEqual(
-            old_api_defs_str, new_api_defs_str,
-            'To update golden API files, run api_compatibility_test locally '
-            'with --update_goldens=True flag.')
-
-    for file_path in set(old_api_defs_map) - set(new_api_defs_map):
-      if FLAGS.update_goldens:
-        logging.info('Deleting %s...' % file_path)
-        file_io.delete_file(file_path)
-      else:
-        self.fail(
-            '%s file is no longer needed and should be removed.'
-            'To update golden API files, run api_compatibility_test locally '
-            'with --update_goldens=True flag.' % file_path)
-
-
 if __name__ == '__main__':
   parser = argparse.ArgumentParser()
   parser.add_argument(
diff --git a/tensorflow/tools/benchmark/BUILD b/tensorflow/tools/benchmark/BUILD
index caa6629c491477ffcd108c52d7ce20f1ab95a0a9..6ed2594e6abe169577066678e1bf4b9e2df4c4d3 100644
--- a/tensorflow/tools/benchmark/BUILD
+++ b/tensorflow/tools/benchmark/BUILD
@@ -61,10 +61,11 @@ tf_cc_test(
 
 # This binary may be built for either desktop or Android.
 # A typical Android build command will look like the following:
-# bazel build -c opt tensorflow/core:android_tensorflow_lib \
+# bazel build tensorflow/core:android_tensorflow_lib \
 # --crosstool_top=//external:android/crosstool \
 # --cpu=armeabi-v7a \
 # --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
+# --config monolithic
 tf_cc_binary(
     name = "benchmark_model",
     testonly = 1,
diff --git a/tensorflow/tools/benchmark/README.md b/tensorflow/tools/benchmark/README.md
index ca0da2d41b91a385cd57dbe1ebaf4fe83ed380c9..e64af2bfe1a77e6883f0c2c7dd9303e6d4eb4ee6 100644
--- a/tensorflow/tools/benchmark/README.md
+++ b/tensorflow/tools/benchmark/README.md
@@ -17,6 +17,7 @@ bazel build -c opt \
   --crosstool_top=//external:android/crosstool \
   --cpu=armeabi-v7a \
   --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
+  --config monolithic \
   tensorflow/tools/benchmark:benchmark_model
 ```
 
diff --git a/tensorflow/tools/benchmark/benchmark_model.cc b/tensorflow/tools/benchmark/benchmark_model.cc
index 9809ad52de1319951aa82007ef9b933c6e707bf7..ecab6f8769ae2d0126f63580030ed6ff756015d0 100644
--- a/tensorflow/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/tools/benchmark/benchmark_model.cc
@@ -530,7 +530,7 @@ int Main(int argc, char** argv) {
   }
 
   // Capture overall inference time without stat logging overhead. This is the
-  // timing data that can be compared to other libaries.
+  // timing data that can be compared to other libraries.
   SleepSeconds(inter_benchmark_sleep_seconds);
   int64 no_stat_time_us = 0;
   int64 no_stat_num_runs = 0;
diff --git a/tensorflow/tools/ci_build/Dockerfile.android b/tensorflow/tools/ci_build/Dockerfile.android
index 99a69d7b43bbc19f0b1e9ee7c741426c6651dfd6..dcf077791a9752f2e22999b082a9805bb3775c8d 100644
--- a/tensorflow/tools/ci_build/Dockerfile.android
+++ b/tensorflow/tools/ci_build/Dockerfile.android
@@ -1,4 +1,4 @@
-FROM ubuntu:14.04
+FROM ubuntu:16.04
 
 LABEL maintainer="Jan Prach <jendap@google.com>"
 
diff --git a/tensorflow/tools/ci_build/Dockerfile.cmake b/tensorflow/tools/ci_build/Dockerfile.cmake
index 37ba24d65a2e95833511fa9b3e4044db634a08fd..ec90c83aacd068e8f9c16e5be8eb6e1cef098ea6 100644
--- a/tensorflow/tools/ci_build/Dockerfile.cmake
+++ b/tensorflow/tools/ci_build/Dockerfile.cmake
@@ -23,7 +23,10 @@ RUN /install/install_deb_packages.sh
 
 RUN apt-get update
 RUN apt-get install -y --no-install-recommends python-pip
+RUN pip install --upgrade astor
+RUN pip install --upgrade gast
 RUN pip install --upgrade numpy
+RUN pip install --upgrade termcolor
 
 # Install golang
 RUN add-apt-repository -y ppa:ubuntu-lxc/lxd-stable
diff --git a/tensorflow/tools/ci_build/Dockerfile.cpu b/tensorflow/tools/ci_build/Dockerfile.cpu
index 57a854a9df738dea5d8560b54765099f32d0ff86..c61fda09af040140c000d6aa4a58f525fb98d80d 100644
--- a/tensorflow/tools/ci_build/Dockerfile.cpu
+++ b/tensorflow/tools/ci_build/Dockerfile.cpu
@@ -1,4 +1,4 @@
-FROM ubuntu:14.04
+FROM ubuntu:16.04
 
 LABEL maintainer="Jan Prach <jendap@google.com>"
 
diff --git a/tensorflow/tools/ci_build/Dockerfile.cpu.mpi b/tensorflow/tools/ci_build/Dockerfile.cpu.mpi
index 2bf7fd1d23406da8381fc5071ddf4ae56d1cb0ee..d9f5b7c0364e6fbea18a9a32adca6613a5c37011 100644
--- a/tensorflow/tools/ci_build/Dockerfile.cpu.mpi
+++ b/tensorflow/tools/ci_build/Dockerfile.cpu.mpi
@@ -1,4 +1,4 @@
-FROM ubuntu:14.04
+FROM ubuntu:16.04
 
 LABEL authors="Andrew Gibiansky <andrew.gibiansky@gmail.com>, Joel Hestness <jthestness@gmail.com>"
 
diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu b/tensorflow/tools/ci_build/Dockerfile.gpu
index 2d46ccb6b17ac3ab3af49c1649074eda8a840331..7591ecc04efa887ec1d35ba92881386f5a25241d 100644
--- a/tensorflow/tools/ci_build/Dockerfile.gpu
+++ b/tensorflow/tools/ci_build/Dockerfile.gpu
@@ -1,8 +1,8 @@
-FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu14.04
+FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
 
 LABEL maintainer="Jan Prach <jendap@google.com>"
 
-# In the Ubuntu 14.04 images, cudnn is placed in system paths. Move them to
+# In the Ubuntu 16.04 images, cudnn is placed in system paths. Move them to
 # /usr/local/cuda
 RUN cp -P /usr/include/cudnn.h /usr/local/cuda/include
 RUN cp -P /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64
diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu_clang b/tensorflow/tools/ci_build/Dockerfile.gpu_clang
deleted file mode 100644
index 0ecd8c75e036fc18d37882834ed467d0edb096b1..0000000000000000000000000000000000000000
--- a/tensorflow/tools/ci_build/Dockerfile.gpu_clang
+++ /dev/null
@@ -1,36 +0,0 @@
-FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu14.04
-
-LABEL maintainer="Ilya Biryukov <ibiryukov@google.com>"
-
-# In the Ubuntu 14.04 images, cudnn is placed in system paths. Move them to
-# /usr/local/cuda
-RUN cp /usr/include/cudnn.h /usr/local/cuda/include
-RUN cp /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64
-
-# Copy and run the install scripts.
-COPY install/*.sh /install/
-RUN /install/install_bootstrap_deb_packages.sh
-RUN add-apt-repository -y ppa:openjdk-r/ppa
-
-# LLVM requires cmake version 3.4.3, but ppa:george-edison55/cmake-3.x only
-# provides version 3.2.2.
-# So we skip it in `install_deb_packages.sh`, and later install it from
-# https://cmake.org in `install_cmake_for_clang.sh`.
-RUN /install/install_deb_packages.sh --without_cmake
-RUN /install/install_pip_packages.sh
-RUN /install/install_bazel.sh
-RUN /install/install_golang.sh
-
-# Install cmake and build clang
-RUN /install/install_cmake_for_clang.sh
-RUN /install/build_and_install_clang.sh
-
-# Set up the master bazelrc configuration file.
-COPY install/.bazelrc /etc/bazel.bazelrc
-ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
-
-# Configure the build for our CUDA configuration.
-ENV TF_NEED_CUDA 1
-ENV TF_CUDA_CLANG 1
-ENV CLANG_CUDA_COMPILER_PATH /usr/local/bin/clang
-ENV TF_CUDA_COMPUTE_CAPABILITIES 3.0
diff --git a/tensorflow/tools/ci_build/Dockerfile.hadoop b/tensorflow/tools/ci_build/Dockerfile.hadoop
index 6010aedb339abadd8ee09d50d4eb279c5d3236f8..d05dedafbe28d93e768712a3bdbc1730ab8bb092 100644
--- a/tensorflow/tools/ci_build/Dockerfile.hadoop
+++ b/tensorflow/tools/ci_build/Dockerfile.hadoop
@@ -1,4 +1,4 @@
-FROM ubuntu:14.04
+FROM ubuntu:16.04
 
 LABEL maintainer="Jonathan Hseu <jhseu@google.com>"
 
diff --git a/tensorflow/tools/ci_build/builds/configured b/tensorflow/tools/ci_build/builds/configured
index de1e354170eec2477b6c895c421bc2a0fd9f7318..868a3beac5f1b2e993f16b3b1f1995ff58afde34 100755
--- a/tensorflow/tools/ci_build/builds/configured
+++ b/tensorflow/tools/ci_build/builds/configured
@@ -32,15 +32,6 @@ COMMAND=("$@")
 
 export CI_BUILD_PYTHON="${CI_BUILD_PYTHON:-python}"
 export PYTHON_BIN_PATH="${PYTHON_BIN_PATH:-$(which ${CI_BUILD_PYTHON})}"
-if [ "${CONTAINER_TYPE}" == "gpu" ]; then
-  export TF_NEED_CUDA=1
-elif [ "${CONTAINER_TYPE}" == "gpu_clang" ]; then
-  export TF_NEED_CUDA=1
-  export TF_CUDA_CLANG=1
-  export CLANG_CUDA_COMPILER_PATH="/usr/local/bin/clang"
-else
-  export TF_NEED_CUDA=0
-fi
 
 pushd "${CI_TENSORFLOW_SUBMODULE_PATH:-.}"
 yes "" | $PYTHON_BIN_PATH configure.py
diff --git a/tensorflow/tools/ci_build/builds/libtensorflow.sh b/tensorflow/tools/ci_build/builds/libtensorflow.sh
index 26713dded88ce7b93df0200ac84e11f8efb9baf3..9b3ff0cba7dcacc0f68a417299c31f7a0f413430 100755
--- a/tensorflow/tools/ci_build/builds/libtensorflow.sh
+++ b/tensorflow/tools/ci_build/builds/libtensorflow.sh
@@ -51,8 +51,8 @@ function build_libtensorflow_tarball() {
   rm -rf ${DIR}
 
   TARBALL_SUFFIX="${1}"
-  BAZEL="bazel --bazelrc ./tensorflow/tools/ci_build/install/.bazelrc"
-  BAZEL_OPTS="-c opt"
+  BAZEL_OPTS="-c opt --cxxopt=-D_GLIBCXX_USE_CXX11_ABI=0"
+  export CC_OPT_FLAGS='-mavx'
   if [ "${TF_NEED_CUDA}" == "1" ]; then
     BAZEL_OPTS="${BAZEL_OPTS} --config=cuda"
   fi
diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index 552df1434eab8c4414b8b9a8f7be9c61998d8462..82042b93c02275b51530b306d8cf4519482e5410 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -296,13 +296,11 @@ create_activate_virtualenv_and_install_tensorflow() {
     die "FAILED to create virtualenv directory: ${VIRTUALENV_DIR}"
   fi
 
-  # Verify that virtualenv exists
-  if [[ -z $(which virtualenv) ]]; then
-    die "FAILED: virtualenv not available on path"
-  fi
-
-  virtualenv ${VIRTUALENV_FLAGS} \
-    -p "${PYTHON_BIN_PATH}" "${VIRTUALENV_DIR}" || \
+  # Use the virtualenv from the default python version (i.e., python-virtualenv)
+  # to create the virtualenv directory for testing. Use the -p flag to specify
+  # the python version inside the to-be-created virtualenv directory.
+  ${PYTHON_BIN_PATH} -m virtualenv -p "${PYTHON_BIN_PATH}" ${VIRTUALENV_FLAGS} \
+    "${VIRTUALENV_DIR}" || \
     die "FAILED: Unable to create virtualenv"
 
   source "${VIRTUALENV_DIR}/bin/activate" || \
@@ -345,7 +343,7 @@ do_clean_virtualenv_smoke_test() {
   then
     echo "Smoke test of tensorflow install in clean virtualenv PASSED."
   else
-    echo "Smoke test of tensroflow install in clean virtualenv FAILED."
+    echo "Smoke test of tensorflow install in clean virtualenv FAILED."
     return 1
   fi
 
diff --git a/tensorflow/tools/ci_build/builds/print_build_info.sh b/tensorflow/tools/ci_build/builds/print_build_info.sh
index 7c43419a76ff26be7370326a9113f4e3db2a2b1c..e366abf8bb831688d90a0e3eabed101e42bdaf96 100755
--- a/tensorflow/tools/ci_build/builds/print_build_info.sh
+++ b/tensorflow/tools/ci_build/builds/print_build_info.sh
@@ -88,7 +88,7 @@ fi
 # Print info
 echo "TF_BUILD_INFO = {"\
 "container_type: \"${CONTAINER_TYPE}\", "\
-"command: \"${COMMAND[@]}\", "\
+"command: \"${COMMAND[*]}\", "\
 "source_HEAD: \"${TF_HEAD}\", "\
 "source_remote_origin: \"${TF_FETCH_URL}\", "\
 "OS: \"${OS}\", "\
diff --git a/tensorflow/tools/ci_build/builds/test_user_ops.sh b/tensorflow/tools/ci_build/builds/test_user_ops.sh
index 358f82ac5da0dde655b4f2d1f145f2070b64238b..caa3a40817c80b27271f76de0a95a743cb2916f6 100755
--- a/tensorflow/tools/ci_build/builds/test_user_ops.sh
+++ b/tensorflow/tools/ci_build/builds/test_user_ops.sh
@@ -82,11 +82,11 @@ TF_CFLAGS=( $("${PYTHON_BIN_PATH}" \
 TF_LFLAGS=( $("${PYTHON_BIN_PATH}" \
 	      -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') )
 
-if [[ -z "${TF_CFLAGS}" || -z "${TF_LFLAGS}" ]]; then
+if [[ -z "${TF_CFLAGS[*]}" || -z "${TF_LFLAGS[*]}" ]]; then
   die "FAILED to determine TensorFlow compilation or linking flags"
 else
-  echo "TensorFlow compile flags: ${TF_CFLAGS[@]}"
-  echo "TensorFlow link flags: ${TF_LFLAGS[@]}"
+  echo "TensorFlow compile flags: ${TF_CFLAGS[*]}"
+  echo "TensorFlow link flags: ${TF_LFLAGS[*]}"
 fi
 
 # Check g++ availability
diff --git a/tensorflow/tools/ci_build/ci_build.sh b/tensorflow/tools/ci_build/ci_build.sh
index 5164a2501269a1613a10c0a5a129221a7cd3e47b..072dd6ab995bb41c3197d6c898405be487534593 100755
--- a/tensorflow/tools/ci_build/ci_build.sh
+++ b/tensorflow/tools/ci_build/ci_build.sh
@@ -18,7 +18,7 @@
 #                    <COMMAND>
 #
 # CONTAINER_TYPE: Type of the docker container used the run the build:
-#                 e.g., (cpu | gpu | gpu_clang | android | tensorboard)
+#                 e.g., (cpu | gpu | android | tensorboard)
 #
 # DOCKERFILE_PATH: (Optional) Path to the Dockerfile used for docker build.
 #                  If this optional value is not supplied (via the
@@ -79,7 +79,7 @@ if [[ "${CONTAINER_TYPE}" == "cmake" ]]; then
 fi
 
 # Use nvidia-docker if the container is GPU.
-if [[ "${CONTAINER_TYPE}" == "gpu" ]] || [[ "${CONTAINER_TYPE}" == "gpu_clang" ]]; then
+if [[ "${CONTAINER_TYPE}" == "gpu" ]]; then
   DOCKER_BINARY="nvidia-docker"
 else
   DOCKER_BINARY="docker"
@@ -99,7 +99,7 @@ BUILD_TAG="${BUILD_TAG:-tf_ci}"
 
 # Add extra params for cuda devices and libraries for GPU container.
 # And clear them if we are not building for GPU.
-if [[ "${CONTAINER_TYPE}" != "gpu" ]] && [[ "${CONTAINER_TYPE}" != "gpu_clang" ]]; then
+if [[ "${CONTAINER_TYPE}" != "gpu" ]]; then
   GPU_EXTRA_PARAMS=""
 fi
 
diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index 2217b110e3f4e5dd2a212fe0cb65ac9f46ce943a..9d23b508aa1c1d20d0f4b5979aa7be2c295fe325 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -18,7 +18,7 @@
 #   ci_parameterized_build.sh
 #
 # The script obeys the following required environment variables:
-#   TF_BUILD_CONTAINER_TYPE:   (CPU | GPU | GPU_CLANG | ANDROID | ANDROID_FULL)
+#   TF_BUILD_CONTAINER_TYPE:   (CPU | GPU | ANDROID | ANDROID_FULL)
 #   TF_BUILD_PYTHON_VERSION:   (PYTHON2 | PYTHON3 | PYTHON3.5)
 #   TF_BUILD_IS_PIP:           (NO_PIP | PIP | BOTH)
 #
@@ -88,6 +88,9 @@
 #   TF_NIGHTLY:
 #                     If this run is being used to build the tf_nightly pip
 #                     packages.
+#   TF_CUDA_CLANG:
+#                     If set to 1, builds and runs cuda_clang configuration.
+#                     Only available inside GPU containers.
 #
 # This script can be used by Jenkins parameterized / matrix builds.
 
@@ -246,16 +249,34 @@ if [[ "$(uname -s)" == "Darwin" ]]; then
   OPT_FLAG="${OPT_FLAG} ${NO_DOCKER_OPT_FLAG}"
 fi
 
+# In DO_DOCKER mode, appends environment variable to docker's run invocation.
+# Otherwise, exports the corresponding variable.
+function set_script_variable() {
+  local VAR="$1"
+  local VALUE="$2"
+  if [[ $DO_DOCKER == "1" ]]; then
+    TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS="${TF_BUILD_APPEND_CI_DOCKER_EXTRA_PARAMS} -e $VAR=$VALUE"
+  else
+    export $VAR="$VALUE"
+  fi
+}
+
+
 # Process container type
 if [[ ${CTYPE} == "cpu" ]] || [[ ${CTYPE} == "debian.jessie.cpu" ]]; then
   :
-elif [[ ${CTYPE} == "gpu" ]] || [[ ${CTYPE} == "gpu_clang" ]]; then
-  if [[ ${CTYPE} == "gpu" ]]; then
-    OPT_FLAG="${OPT_FLAG} --config=cuda"
-  else # ${CTYPE} == "gpu_clang"
+elif [[ ${CTYPE} == "gpu" ]]; then
+  set_script_variable TF_NEED_CUDA 1
+
+  if [[ $TF_CUDA_CLANG == "1" ]]; then
     OPT_FLAG="${OPT_FLAG} --config=cuda_clang"
-  fi
 
+    set_script_variable TF_CUDA_CLANG 1
+    # For cuda_clang we download `clang` while building.
+    set_script_variable TF_DOWNLOAD_CLANG 1
+  else
+    OPT_FLAG="${OPT_FLAG} --config=cuda"
+  fi
 
   # Attempt to determine CUDA capability version automatically and use it if
   # CUDA capability version is not specified by the environment variables.
@@ -407,7 +428,7 @@ if [[ ${TF_BUILD_IS_PIP} == "no_pip" ]] ||
     # CPU only command, fully parallel.
     NO_PIP_MAIN_CMD="${MAIN_CMD} ${BAZEL_CMD} ${OPT_FLAG} ${EXTRA_ARGS} -- "\
 "${BAZEL_TARGET}"
-  elif [[ ${CTYPE} == "gpu" ]] || [[ ${CTYPE} == "gpu_clang" ]]; then
+  elif [[ ${CTYPE} == "gpu" ]]; then
     # GPU only command, run as many jobs as the GPU count only.
     NO_PIP_MAIN_CMD="${BAZEL_CMD} ${OPT_FLAG} "\
 "--local_test_jobs=${TF_GPU_COUNT} "\
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index 404a9a6b6296652c009d5725919a21c9cd6e8178..aeac085d30aef746366192361f249eb01f95e8da 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -26,6 +26,8 @@
 SCRIPT_DIR=$( cd ${0%/*} && pwd -P )
 source "${SCRIPT_DIR}/builds/builds_common.sh"
 
+ROOT_DIR=$( cd "$SCRIPT_DIR/../../.." && pwd -P )
+
 # Helper functions
 die() {
   echo $@
@@ -99,7 +101,8 @@ do_pylint() {
 "^tensorflow/contrib/eager/python/metrics_impl\.py.*\[E0202.*method-hidden "\
 "^tensorflow/python/platform/gfile\.py.*\[E0301.*non-iterator "\
 "^tensorflow/python/keras/_impl/keras/callbacks\.py.*\[E1133.*not-an-iterable "\
-"^tensorflow/python/keras/_impl/keras/layers/recurrent\.py.*\[E0203.*access-member-before-definition"
+"^tensorflow/python/keras/_impl/keras/layers/recurrent\.py.*\[E0203.*access-member-before-definition "\
+"^tensorflow/python/kernel_tests/constant_op_eager_test.py.*\[E0303.*invalid-length-returned"
 
   echo "ERROR_WHITELIST=\"${ERROR_WHITELIST}\""
 
@@ -110,9 +113,9 @@ do_pylint() {
   fi
 
   if [[ $1 == "PYTHON2" ]]; then
-    PYLINT_BIN="python /usr/local/lib/python2.7/dist-packages/pylint/lint.py"
+    PYLINT_BIN="python -m pylint"
   elif [[ $1 == "PYTHON3" ]]; then
-    PYLINT_BIN="python3 /usr/local/lib/python3.4/dist-packages/pylint/lint.py"
+    PYLINT_BIN="python3 -m pylint"
   else
     echo "Unrecognized python version (PYTHON2 | PYTHON3): $1"
     return 1
@@ -174,7 +177,17 @@ do_pylint() {
   echo "pylint took $((PYLINT_END_TIME - PYLINT_START_TIME)) s"
   echo ""
 
-  grep -E '(\[E|\[W0311|\[W0312)' ${OUTPUT_FILE} > ${ERRORS_FILE}
+  # Report only what we care about
+  # Ref https://pylint.readthedocs.io/en/latest/technical_reference/features.html
+  # E: all errors
+  # W0311 bad-indentation
+  # W0312 mixed-indentation
+  # C0330 bad-continuation
+  # C0301 line-too-long
+  # C0326 bad-whitespace
+  # W0611 unused-import
+  # W0622 redefined-builtin
+  grep -E '(\[E|\[W0311|\[W0312|\[C0330|\[C0301|\[C0326|\[W0611|\[W0622)' ${OUTPUT_FILE} > ${ERRORS_FILE}
 
   N_ERRORS=0
   while read -r LINE; do
@@ -310,7 +323,7 @@ do_external_licenses_check(){
   EXTRA_LICENSES_FILE="$(mktemp)_extra_licenses.log"
 
   echo "Getting external dependencies for ${BUILD_TARGET}"
- bazel query "attr('licenses', 'notice', deps(${BUILD_TARGET}))" --no_implicit_deps --no_host_deps --keep_going \
+ bazel query "attr('licenses', 'notice', deps(${BUILD_TARGET}))" --keep_going \
   | grep -E -v "^//tensorflow" \
   | sed -e 's|:.*||' \
   | sort \
@@ -319,7 +332,7 @@ do_external_licenses_check(){
 
   echo
   echo "Getting list of external licenses mentioned in ${LICENSES_TARGET}."
-  bazel query "deps(${LICENSES_TARGET})" --no_implicit_deps --no_host_deps --keep_going \
+  bazel query "deps(${LICENSES_TARGET})" --keep_going \
   | grep -E -v "^//tensorflow" \
   | sed -e 's|:.*||' \
   | sort \
@@ -333,6 +346,18 @@ do_external_licenses_check(){
 
   EXTERNAL_LICENSES_CHECK_END_TIME=$(date +'%s')
 
+  # Blacklist
+  echo ${MISSING_LICENSES_FILE}
+  grep -e "@bazel_tools//third_party/" -e "@com_google_absl//absl" -e "@org_tensorflow//" -v ${MISSING_LICENSES_FILE} > temp.txt
+  mv temp.txt ${MISSING_LICENSES_FILE}
+
+  # Whitelist
+  echo ${EXTRA_LICENSE_FILE}
+  grep -e "@bazel_tools//src" -e "@bazel_tools//tools/" -e "@com_google_absl//" -e "//external" -e "@local" -v ${EXTRA_LICENSES_FILE} > temp.txt
+  mv temp.txt ${EXTRA_LICENSES_FILE}
+
+
+
   echo
   echo "do_external_licenses_check took $((EXTERNAL_LICENSES_CHECK_END_TIME - EXTERNAL_LICENSES_CHECK_START_TIME)) s"
   echo
@@ -417,15 +442,8 @@ do_bazel_nobuild() {
 }
 
 do_pip_smoke_test() {
-  BUILD_CMD="bazel build ${BAZEL_FLAGS} //tensorflow/tools/pip_package:pip_smoke_test"
-  ${BUILD_CMD}
-  cmd_status \
-    "Pip smoke test has failed. Please make sure any new TensorFlow are added to the tensorflow/tools/pip_package:build_pip_package dependencies."
-
-  RUN_CMD="bazel-bin/tensorflow/tools/pip_package/pip_smoke_test"
-  ${RUN_CMD}
-  cmd_status \
-    "The pip smoke test failed."
+  cd "$ROOT_DIR/tensorflow/tools/pip_package"
+  python pip_smoke_test.py
 }
 
 do_code_link_check() {
@@ -499,20 +517,28 @@ do_clang_format_check() {
 }
 
 do_check_load_py_test() {
-  BUILD_CMD="bazel build ${BAZEL_FLAGS} //tensorflow/tools/pip_package:check_load_py_test"
-  ${BUILD_CMD}
-  cmd_status \
-    "check_load_py_test failed to build."
+  cd "$ROOT_DIR/tensorflow/tools/pip_package"
+  python check_load_py_test.py
+}
 
-  BUILD_CMD="bazel-bin/tensorflow/tools/pip_package/check_load_py_test"
-  ${BUILD_CMD}
-  cmd_status \
-    "check_load_py_test failed."
+do_cmake_python_sanity() {
+  cd "$ROOT_DIR/tensorflow/contrib/cmake"
+  python -m unittest -v python_sanity_test
+}
+
+do_check_futures_test() {
+  cd "$ROOT_DIR/tensorflow/tools/test"
+  python check_futures_test.py
+}
+
+do_check_file_name_test() {
+  cd "$ROOT_DIR/tensorflow/tools/test"
+  python file_name_test.py
 }
 
 # Supply all sanity step commands and descriptions
-SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check")
-SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links")
+SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_cmake_python_sanity" "do_check_file_name_test")
+SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Test entries in /tensorflow/contrib/cmake/python_{modules|protos|protos_cc}.txt for validity and consistency" "Check file names for cases")
 
 INCREMENTAL_FLAG=""
 DEFAULT_BAZEL_CONFIGS="--config=hdfs --config=gcp"
@@ -547,7 +573,10 @@ while [[ ${COUNTER} -lt "${#SANITY_STEPS[@]}" ]]; do
 "${SANITY_STEPS[COUNTER]} (${SANITY_STEPS_DESC[COUNTER]}) ==="
   echo ""
 
+  # subshell: don't leak variables or changes of working directory
+  (
   ${SANITY_STEPS[COUNTER]} ${INCREMENTAL_FLAG}
+  )
   RESULT=$?
 
   if [[ ${RESULT} != "0" ]]; then
diff --git a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
index 6e7b752c06f43fe7f8fa26bd52a28ed33f38edd8..cfeaebdbf57c01fef7cd81dae76217429336d0ff 100755
--- a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
+++ b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
@@ -45,7 +45,7 @@ for i in `seq 0 $((TF_GPU_COUNT-1))`; do
       # This export only works within the brackets, so it is isolated to one
       # single command.
       export CUDA_VISIBLE_DEVICES=$i
-      echo "Running test $@ on GPU $CUDA_VISIBLE_DEVICES"
+      echo "Running test $* on GPU $CUDA_VISIBLE_DEVICES"
       $@
     )
     return_code=$?
diff --git a/tensorflow/tools/ci_build/install/build_and_install_clang.sh b/tensorflow/tools/ci_build/install/build_and_install_clang.sh
deleted file mode 100755
index 99664344777256b9eb8c3764bb1900f26b43cc6e..0000000000000000000000000000000000000000
--- a/tensorflow/tools/ci_build/install/build_and_install_clang.sh
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-set -ex
-
-LLVM_SVN_REVISION="314281"
-CLANG_TMP_DIR=/tmp/clang-build
-
-mkdir "$CLANG_TMP_DIR"
-
-pushd "$CLANG_TMP_DIR"
-
-# Checkout llvm+clang
-svn co -q -r$LLVM_SVN_REVISION http://llvm.org/svn/llvm-project/llvm/trunk "$CLANG_TMP_DIR/llvm"
-svn co -q -r$LLVM_SVN_REVISION http://llvm.org/svn/llvm-project/cfe/trunk "$CLANG_TMP_DIR/llvm/tools/clang"
-
-# Build 1st stage. Compile clang with system compiler
-mkdir "$CLANG_TMP_DIR/build-1"
-cd "$CLANG_TMP_DIR/build-1"
-cmake -G"Unix Makefiles" -DCMAKE_BUILD_TYPE=Release "$CLANG_TMP_DIR/llvm"
-make -j `nproc` clang clang-headers
-
-# Build 2nd stage. Compile clang with clang built in stage 1
-mkdir "$CLANG_TMP_DIR/build-2"
-cd "$CLANG_TMP_DIR/build-2"
-
-CC="$CLANG_TMP_DIR/build-1/bin/clang" \
-CXX="$CLANG_TMP_DIR/build-1/bin/clang++" \
-cmake -G"Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local "$CLANG_TMP_DIR/llvm"
-
-make -j `nproc` install-clang install-clang-headers
-
-popd
-
-# Cleanup
-rm -rf "$CLANG_TMP_DIR"
diff --git a/tensorflow/tools/ci_build/install/install_bazel.sh b/tensorflow/tools/ci_build/install/install_bazel.sh
index 1454264a8007104c6ad20d3e393076d1cc20513c..1df6a84d7c6f86abfb965063625ac43a3f1a57fb 100755
--- a/tensorflow/tools/ci_build/install/install_bazel.sh
+++ b/tensorflow/tools/ci_build/install/install_bazel.sh
@@ -15,7 +15,7 @@
 # ==============================================================================
 
 # Select bazel version.
-BAZEL_VERSION="0.5.4"
+BAZEL_VERSION="0.10.0"
 
 set +e
 local_bazel_ver=$(bazel version 2>&1 | grep -i label | awk '{print $3}')
diff --git a/tensorflow/tools/ci_build/install/install_deb_packages.sh b/tensorflow/tools/ci_build/install/install_deb_packages.sh
index 4ab307c9253a8019f2c794b696db030722751770..96408105339d9a3e21aecb3bae9894551f8b6811 100755
--- a/tensorflow/tools/ci_build/install/install_deb_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_deb_packages.sh
@@ -48,6 +48,7 @@ apt-get install -y --no-install-recommends \
     git \
     libcurl4-openssl-dev \
     libtool \
+    libssl-dev \
     mlocate \
     openjdk-8-jdk \
     openjdk-8-jre-headless \
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index b8ed1ab7676ff4efaef01dd5009effbf5ab05a92..d406b83a6246d18c335fb52cea1256d7809fa61a 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -27,6 +27,9 @@ easy_install3 -U pip
 pip2 install wheel
 pip3 install wheel
 
+pip2 install virtualenv
+pip3 install virtualenv
+
 # Install six.
 pip2 install --upgrade six==1.10.0
 pip3 install --upgrade six==1.10.0
@@ -40,8 +43,8 @@ pip2 install --upgrade werkzeug==0.11.10
 pip3 install --upgrade werkzeug==0.11.10
 
 # Install bleach. html5lib will be picked up as a dependency.
-pip2 install --upgrade bleach==1.5.0
-pip3 install --upgrade bleach==1.5.0
+pip2 install --upgrade bleach==2.0.0
+pip3 install --upgrade bleach==2.0.0
 
 # Install markdown.
 pip2 install --upgrade markdown==2.6.8
@@ -94,3 +97,10 @@ pip3 install portpicker
 pip2 install grpcio
 pip3 install grpcio
 
+# Eager-to-graph execution needs astor, gast and termcolor:
+pip2 install --upgrade astor
+pip3 install --upgrade astor
+pip2 install --upgrade gast
+pip3 install --upgrade gast
+pip2 install --upgrade termcolor
+pip3 install --upgrade termcolor
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index 479242aa4376883f851486ca38a859a75d4f4f51..aefc49f60482148e565a5262eebd5b3ac85987cf 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -39,6 +39,8 @@ if [[ -z $pip35_version ]]; then
 fi
 
 set -e
+pip3.5 install --upgrade virtualenv
+
 # Install six.
 pip3.5 install --upgrade absl-py
 pip3.5 install --upgrade six==1.10.0
@@ -58,7 +60,7 @@ pip3.5 install --no-binary=:all: --upgrade numpy==1.12.0
 
 pip3.5 install scipy==0.18.1
 
-pip3.5 install scikit-learn==0.18.1
+pip3.5 install scikit-learn==0.19.1
 
 # pandas required by `inflow`
 pip3 install pandas==0.19.2
@@ -72,4 +74,9 @@ pip3.5 install werkzeug
 
 pip3.5 install grpcio
 
+# Eager-to-graph execution needs astor, gast and termcolor:
+pip3.5 install --upgrade astor
+pip3.5 install --upgrade gast
+pip3.5 install --upgrade termcolor
+
 # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh)
diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
index c354aaa154e8d01ba69f157dd195ef439270c2ec..bfaa044c82887bd1dc99d13952e09c9cc49cf11b 100755
--- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
@@ -22,29 +22,42 @@
 
 # fkrull/deadsnakes is for Python3.6
 add-apt-repository -y ppa:fkrull/deadsnakes
+
 apt-get update
+apt-get upgrade
+
+# Install python dep
+apt-get install python-dev
+# Install bz2 dep
+apt-get install libbz2-dev
+# Install curses dep
+apt-get install libncurses5 libncurses5-dev
+apt-get install libncursesw5 libncursesw5-dev
+# Install readline dep
+apt-get install libreadline6 libreadline6-dev
+# Install sqlite3 dependencies
+apt-get install libsqlite3-dev
 
 set -e
+
 # Install Python 3.6 and dev library
-apt-get install -y --no-install-recommends python3.6 libpython3.6-dev
-
-# Install pip3.6
-set +e
-pip35_version=$(pip3.6 --version | grep "python 3.6")
-if [[ -z $pip35_version ]]; then
-  set -e
-  wget -q https://bootstrap.pypa.io/get-pip.py
-  python3.6 get-pip.py
-  rm -f get-pip.py
-fi
+wget https://www.python.org/ftp/python/3.6.1/Python-3.6.1.tar.xz
+tar xvf Python-3.6.1.tar.xz
+cd Python-3.6.1
+
+./configure
+make altinstall
+ln -s /usr/local/bin/pip3.6 /usr/local/bin/pip3
+
+pip3 install --upgrade virtualenv
 
 set -e
 # Install six.
-pip3.6 install --upgrade absl-py
-pip3.6 install --upgrade six==1.10.0
+pip3 install --upgrade absl-py
+pip3 install --upgrade six==1.10.0
 
 # Install protobuf.
-pip3.6 install --upgrade protobuf==3.3.0
+pip3 install --upgrade protobuf==3.3.0
 
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
@@ -54,22 +67,31 @@ rm -rf /usr/lib/python3/dist-packages/six*
 # numpy needs to be installed from source to fix segfaults. See:
 # https://github.com/tensorflow/tensorflow/issues/6968
 # This workaround isn't needed for Ubuntu 16.04 or later.
-pip3.6 install --no-binary=:all: --upgrade numpy==1.12.0
+pip3 install --no-binary=:all: --upgrade numpy==1.12.0
 
-pip3.6 install scipy==0.18.1
+pip3 install scipy==0.18.1
 
-pip3.6 install scikit-learn==0.18.1
+pip3 install scikit-learn==0.19.1
 
 # pandas required by `inflow`
 pip3 install pandas==0.19.2
 
+pip3 install gnureadline
+
+pip3 install bz2file
+
 # Install recent-enough version of wheel for Python 3.6 wheel builds
-pip3.6 install wheel==0.29.0
+pip3 install wheel==0.29.0
+
+pip3 install portpicker
 
-pip3.6 install portpicker
+pip3 install werkzeug
 
-pip3.6 install werkzeug
+pip3 install grpcio
 
-pip3.6 install grpcio
+# Eager-to-graph execution needs astor, gast and termcolor:
+pip3 install --upgrade astor
+pip3 install --upgrade gast
+pip3 install --upgrade termcolor
 
 # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh)
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh b/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh
index e3e6b2f3166e18dc29ae24671889230d3a4a71c7..51e10f81f82da7920e9d219eaec3e1eb2973b998 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_cc_core.sh
@@ -26,12 +26,13 @@ echo ""
 
 # Run configure.
 export TF_NEED_CUDA=0
+export CC_OPT_FLAGS='-mavx'
 # Only running cc tests, python version does not matter.
 export PYTHON_BIN_PATH=`which python`
 yes "" | $PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
 bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test --test_lang_filters=cc,java -k \
-    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
+    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --config=opt \
     --test_output=errors -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh b/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh
index 5110d52f31c257a043177ede686817e6206fa2eb..ea14848b1ae74ef0c42d14678fde225d465512bf 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_py2_core.sh
@@ -26,11 +26,12 @@ echo ""
 
 # Run configure.
 export TF_NEED_CUDA=0
+export CC_OPT_FLAGS='-mavx'
 export PYTHON_BIN_PATH=`which python2`
 yes "" | $PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
 bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test --test_lang_filters=py -k \
-    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only \
+    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only --config=opt \
     --test_output=errors -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh b/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
index df6016504cec19e02af988e87733fc409cef6826..6d017c8a1f0232deab82278b26797a73b3a8ea9c 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
@@ -26,12 +26,13 @@ echo ""
 
 # Run configure.
 export TF_NEED_CUDA=0
+export CC_OPT_FLAGS='-mavx'
 export PYTHON_BIN_PATH=`which python3`
 yes "" | $PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
 bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test -k \
-    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
+    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --config=opt \
     --test_output=errors -- \
     //tensorflow/contrib/... \
     -//tensorflow/contrib/lite/... \
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh b/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh
index ea9e102936bc56288acea051af3d3414766d38fb..a9accb9dd5b2d23e028a34ac3d99976d5f2f59db 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_py3_core.sh
@@ -26,11 +26,12 @@ echo ""
 
 # Run configure.
 export TF_NEED_CUDA=0
+export CC_OPT_FLAGS='-mavx'
 export PYTHON_BIN_PATH=`which python3`
 yes "" | $PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
 bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test --test_lang_filters=py -k \
-    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only \
+    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only --config=opt \
     --test_output=errors -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh b/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh
index df196f829cd920b538fd0032950a9282c3043617..02224d8e9d9efd92b5c1658118bd0c45bdf4f1db 100755
--- a/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh
+++ b/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh
@@ -26,6 +26,7 @@ echo ""
 
 # Run configure.
 export PYTHON_BIN_PATH=`which python3`
+export CC_OPT_FLAGS='-mavx'
 
 export TF_NEED_CUDA=1
 export TF_CUDA_COMPUTE_CAPABILITIES=3.7
@@ -35,6 +36,6 @@ yes "" | $PYTHON_BIN_PATH configure.py
 # Run bazel test command. Double test timeouts to avoid flakes.
 bazel test --config=cuda --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-benchmark-test -k \
     --test_lang_filters=cc --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
-    --build_tests_only --test_output=errors --local_test_jobs=8 \
+    --build_tests_only --test_output=errors --local_test_jobs=8 --config=opt \
     --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh b/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh
index abd256a895ea751f84ec946a85a4331fe5b23440..0367a53d1459e7207a76c83e0c1e5c83580722a7 100755
--- a/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh
+++ b/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh
@@ -26,6 +26,7 @@ echo ""
 
 # Run configure.
 export PYTHON_BIN_PATH=`which python3`
+export CC_OPT_FLAGS='-mavx'
 
 export TF_NEED_CUDA=1
 export TF_CUDA_COMPUTE_CAPABILITIES=3.7
@@ -35,6 +36,6 @@ yes "" | $PYTHON_BIN_PATH configure.py
 # Run bazel test command. Double test timeouts to avoid flakes.
 bazel test --config=cuda --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-benchmark-test -k \
     --test_lang_filters=py --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
-    --build_tests_only --test_output=errors --local_test_jobs=8 \
+    --build_tests_only --test_output=errors --local_test_jobs=8 --config=opt \
     --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh b/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
index ddaaddc9179ab640ce5b09b4d8732944b8177f8a..509ee38ec4fd584037f8e43726c01391430c1817 100755
--- a/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
+++ b/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
@@ -27,11 +27,12 @@ echo ""
 
 # Run configure.
 export TF_NEED_CUDA=0
+export CC_OPT_FLAGS='-mavx'
 export PYTHON_BIN_PATH=$(which python2)
 yes "" | $PYTHON_BIN_PATH configure.py
 which bazel
 bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac \
     --test_timeout 300,450,1200,3600 \
-    --test_size_filters=small,medium \
+    --test_size_filters=small,medium --config=opt \
     --jobs=${N_JOBS} --build_tests_only --test_output=errors -k -- \
     //tensorflow/contrib/... -//tensorflow/contrib/lite/...
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
index e026dcd08f1e4ba88cd231fa33cb29ce3e916652..05547136704394ed9262f566a2bfb4160b73c7fd 100755
--- a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
+++ b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
@@ -27,11 +27,12 @@ echo ""
 
 # Run configure.
 export TF_NEED_CUDA=0
+export CC_OPT_FLAGS='-mavx'
 export PYTHON_BIN_PATH=$(which python2)
 yes "" | $PYTHON_BIN_PATH configure.py
 which bazel
 bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac \
-    --test_timeout 300,450,1200,3600 \
+    --test_timeout 300,450,1200,3600 --config=opt \
     --test_size_filters=small,medium \
     --jobs=${N_JOBS} --build_tests_only --test_output=errors -k -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
index 88116d9f246cabdf19c8b24bf8c95fdf52076fe0..1bd1852ffc570166ecc6efca1420bc54d702ed89 100755
--- a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
+++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
@@ -82,6 +82,7 @@ if [[ $1 == "PI_ONE" ]]; then
 else
   PI_COPTS='--copt=-march=armv7-a --copt=-mfpu=neon-vfpv4
   --copt=-std=gnu11 --copt=-DS_IREAD=S_IRUSR --copt=-DS_IWRITE=S_IWUSR
+  --copt=-O3
   --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
   --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
   --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8'
diff --git a/tensorflow/tools/ci_build/pylintrc b/tensorflow/tools/ci_build/pylintrc
index e71017e621ccc8b42cdf8d4e4bd27a81791bbe4c..68fdb617166f70d2bddf0c472d23102960777de0 100644
--- a/tensorflow/tools/ci_build/pylintrc
+++ b/tensorflow/tools/ci_build/pylintrc
@@ -180,7 +180,17 @@ docstring-min-length=10
 max-line-length=80
 
 # Regexp for a line that is allowed to be longer than the limit.
-ignore-long-lines=^\s*(# )?<?https?://\S+>?$
+ignore-long-lines=(?x)
+  (^\s*(import|from)\s
+   |\$Id:\s\/\/depot\/.+#\d+\s\$
+   |^[a-zA-Z_][a-zA-Z0-9_]*\s*=\s*("[^"]\S+"|'[^']\S+')
+   |^\s*\#\ LINT\.ThenChange
+   |^[^#]*\#\ type:\ [a-zA-Z_][a-zA-Z0-9_.,[\] ]*$
+   |pylint
+   |"""
+   |\#
+   |lambda
+   |(https?|ftp):)
 
 # Allow the body of an if to be on the same line as the test if there is no
 # else.
diff --git a/tensorflow/tools/ci_build/remote/Dockerfile.gpu b/tensorflow/tools/ci_build/remote/Dockerfile.gpu
index e13d2c1c20838a35ec90aa24f85c969cb1f4d52a..47ffd44163dd3e4b99f06689e1aa6f19f84cc2ca 100644
--- a/tensorflow/tools/ci_build/remote/Dockerfile.gpu
+++ b/tensorflow/tools/ci_build/remote/Dockerfile.gpu
@@ -18,7 +18,9 @@ RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
     rm get-pip.py
 
 # Set up grpc
-RUN pip install --upgrade enum34 futures mock numpy six backports.weakref && \
+RUN pip install --upgrade \
+        enum34 futures astor gast mock numpy six \
+        backports.weakref termcolor && \
     pip install --pre 'protobuf>=3.0.0a3' && \
     pip install 'grpcio>=1.1.3'
 
diff --git a/tensorflow/tools/ci_build/remote/remote_docker_build.sh b/tensorflow/tools/ci_build/remote/remote_docker_build.sh
index 3ac6840f4e7a881da4ab973a7fadd921ed288828..e00a66aabaf1068c772aabce2391616518be44d4 100755
--- a/tensorflow/tools/ci_build/remote/remote_docker_build.sh
+++ b/tensorflow/tools/ci_build/remote/remote_docker_build.sh
@@ -124,7 +124,7 @@ function build_tf_image {
 
 
 function publish_tf_image {
-  $gcr_tf_image="gcr.io/tensorflow/${tf_image}"
+  gcr_tf_image="gcr.io/tensorflow/${tf_image}"
   docker tag $tf_image $gcr_tf_image
   gcloud docker -- push $gcr_tf_image
 }
diff --git a/tensorflow/tools/ci_build/update_version.py b/tensorflow/tools/ci_build/update_version.py
index d2a63e5d66a34f61d17e8327d4b25320371c4fa3..52a0da9a14847e863d92fee9ef7e63e4af0cf068 100755
--- a/tensorflow/tools/ci_build/update_version.py
+++ b/tensorflow/tools/ci_build/update_version.py
@@ -25,19 +25,19 @@
 # pylint: disable=superfluous-parens
 
 import argparse
-import fileinput
 import os
 import re
 import subprocess
 import time
 
-# File parameters
+# File parameters.
 TF_SRC_DIR = "tensorflow"
 VERSION_H = "%s/core/public/version.h" % TF_SRC_DIR
 SETUP_PY = "%s/tools/pip_package/setup.py" % TF_SRC_DIR
 README_MD = "./README.md"
 DEVEL_DOCKERFILE = "%s/tools/docker/Dockerfile.devel" % TF_SRC_DIR
 GPU_DEVEL_DOCKERFILE = "%s/tools/docker/Dockerfile.devel-gpu" % TF_SRC_DIR
+CPU_MKL_DEVEL_DOCKERFILE = "%s/tools/docker/Dockerfile.devel-cpu-mkl" % TF_SRC_DIR
 RELEVANT_FILES = [TF_SRC_DIR,
                   VERSION_H,
                   SETUP_PY,
@@ -45,17 +45,11 @@ RELEVANT_FILES = [TF_SRC_DIR,
                   DEVEL_DOCKERFILE,
                   GPU_DEVEL_DOCKERFILE]
 
-# Version type parameters
+# Version type parameters.
 NIGHTLY_VERSION = 1
 REGULAR_VERSION = 0
 
 
-def replace_line(old_line, new_line, filename):
-  """Replace a line in a file."""
-  for line in fileinput.input(filename, inplace=True):
-    print(line.rstrip().replace(old_line, new_line))
-
-
 def check_existence(filename):
   """Check the existence of file or dir."""
   if not os.path.exists(filename):
@@ -69,9 +63,12 @@ def check_all_files():
     check_existence(file_name)
 
 
-def replace_with_sed(query, filename):
+def replace_string_in_line(search, replace, filename):
   """Replace with sed when regex is required."""
-  subprocess.check_call(['sed', '-i', '-r', '-e', query, filename])
+  with open(filename, "r") as source:
+    content = source.read()
+  with open(filename, "w") as source:
+    source.write(re.sub(search, replace, content))
 
 
 class Version(object):
@@ -125,13 +122,13 @@ class Version(object):
     Raises:
       RuntimeError: If the version string is not valid.
     """
-    # Check validity of new version string
+    # Check validity of new version string.
     if not re.search(r"[0-9]+\.[0-9]+\.[a-zA-Z0-9]+", string):
       raise RuntimeError("Invalid version string: %s" % string)
 
     major, minor, extension = string.split(".", 2)
 
-    # Isolate patch and identifier string if identifier string exists
+    # Isolate patch and identifier string if identifier string exists.
     extension_split = extension.split("-", 1)
     patch = extension_split[0]
     if len(extension_split) == 2:
@@ -154,7 +151,7 @@ def get_current_semver_version():
     core/public/version.h
   """
 
-  # Get current version information
+  # Get current version information.
   version_file = open(VERSION_H, "r")
   for line in version_file:
     major_match = re.search("^#define TF_MAJOR_VERSION ([0-9]+)", line)
@@ -185,32 +182,33 @@ def get_current_semver_version():
 
 def update_version_h(old_version, new_version):
   """Update tensorflow/core/public/version.h."""
-  replace_line("#define TF_MAJOR_VERSION %s" % old_version.major,
-               "#define TF_MAJOR_VERSION %s" % new_version.major, VERSION_H)
-  replace_line("#define TF_MINOR_VERSION %s" % old_version.minor,
-               "#define TF_MINOR_VERSION %s" % new_version.minor, VERSION_H)
-  replace_line("#define TF_PATCH_VERSION %s" % old_version.patch,
-               "#define TF_PATCH_VERSION %s" % new_version.patch, VERSION_H)
-  replace_line("#define TF_VERSION_SUFFIX \"%s\"" %
-               old_version.identifier_string,
-               "#define TF_VERSION_SUFFIX \"%s\""
-               % new_version.identifier_string,
-               VERSION_H)
+  replace_string_in_line("#define TF_MAJOR_VERSION %s" % old_version.major,
+                         "#define TF_MAJOR_VERSION %s" % new_version.major,
+                         VERSION_H)
+  replace_string_in_line("#define TF_MINOR_VERSION %s" % old_version.minor,
+                         "#define TF_MINOR_VERSION %s" % new_version.minor,
+                         VERSION_H)
+  replace_string_in_line("#define TF_PATCH_VERSION %s" % old_version.patch,
+                         "#define TF_PATCH_VERSION %s" % new_version.patch,
+                         VERSION_H)
+  replace_string_in_line(
+      "#define TF_VERSION_SUFFIX \"%s\"" % old_version.identifier_string,
+      "#define TF_VERSION_SUFFIX \"%s\"" % new_version.identifier_string,
+      VERSION_H)
 
 
 def update_setup_dot_py(old_version, new_version):
   """Update setup.py."""
-  replace_line("_VERSION = '%s'" % old_version.string,
-               "_VERSION = '%s'" % new_version.string, SETUP_PY)
+  replace_string_in_line("_VERSION = '%s'" % old_version.string,
+                         "_VERSION = '%s'" % new_version.string, SETUP_PY)
 
 
 def update_readme(old_version, new_version):
   """Update README."""
   pep_440_str = new_version.pep_440_str
-  replace_with_sed(r"s/%s\.%s\.([[:alnum:]]+)-/%s-/g" % (old_version.major,
-                                                         old_version.minor,
-                                                         pep_440_str),
-                   README_MD)
+  replace_string_in_line(r"%s\.%s\.([[:alnum:]]+)-" % (old_version.major,
+                                                       old_version.minor),
+                         "%s-" % pep_440_str, README_MD)
 
 
 def update_md_files(old_version, new_version):
@@ -226,22 +224,29 @@ def update_md_files(old_version, new_version):
   for filename in ["linux", "mac", "windows", "sources"]:
     filepath = "%s/docs_src/install/install_%s.md" % (TF_SRC_DIR,
                                                       filename)
-    replace_with_sed("s/tensorflow-%s/tensorflow-%s/g"
-                     % (old_pep_version, new_pep_version), filepath)
-    replace_with_sed("s/tensorflow_gpu-%s/tensorflow_gpu-%s/g"
-                     % (old_pep_version, new_pep_version), filepath)
-    replace_with_sed("s/TensorFlow %s/TensorFlow %s/g"
-                     % (old_pep_version, new_pep_version), filepath)
+
+    if filename == "sources" and "rc0" in new_pep_version:
+      replace_string_in_line("(?<!<td>)tensorflow-%s" % old_pep_version,
+                             "tensorflow-%s" % new_pep_version, filepath)
+      replace_string_in_line("(?<!<td>)tensorflow_gpu-%s" % old_pep_version,
+                             "tensorflow_gpu-%s" % new_pep_version, filepath)
+    else:
+      replace_string_in_line("tensorflow-%s" % old_pep_version,
+                             "tensorflow-%s" % new_pep_version, filepath)
+      replace_string_in_line("tensorflow_gpu-%s" % old_pep_version,
+                             "tensorflow_gpu-%s" % new_pep_version, filepath)
+    replace_string_in_line("TensorFlow %s" % old_pep_version,
+                           "TensorFlow %s" % new_pep_version, filepath)
 
   for filename in ["java", "go", "c"]:
     filepath = "%s/docs_src/install/install_%s.md" % (TF_SRC_DIR,
                                                       filename)
-    replace_with_sed(r"s/x86_64-%s/x86_64-%s/g"
-                     % (old_version, new_version), filepath)
-    replace_with_sed(r"s/libtensorflow-%s.jar/libtensorflow-%s.jar/g"
-                     % (old_version, new_version), filepath)
-    replace_with_sed(r"s/<version>%s<\/version>/<version>%s<\/version>/g"
-                     % (old_version, new_version), filepath)
+    replace_string_in_line(r"x86_64-%s" % old_version,
+                           "x86_64-%s" % new_version, filepath)
+    replace_string_in_line(r"libtensorflow-%s.jar" % old_version,
+                           "libtensorflow-%s.jar" % new_version, filepath)
+    replace_string_in_line(r"<version>%s<\/version>" % old_version,
+                           "<version>%s</version>" % new_version, filepath)
 
 
 def major_minor_change(old_version, new_version):
@@ -256,20 +261,19 @@ def major_minor_change(old_version, new_version):
 def update_dockerfiles(old_version, new_version):
   """Update dockerfiles if there was a major change."""
   if major_minor_change(old_version, new_version):
-    old_r_major_minor = r"r%s\.%s" % (old_version.major, old_version.minor)
-    old_r_major_minor_string = old_r_major_minor.replace("\\", "")
-    r_major_minor = r"r%s\.%s" % (new_version.major, new_version.minor)
-    r_major_minor_string = r_major_minor.replace("\\", "")
+    old_r_major_minor = "r%s.%s" % (old_version.major, old_version.minor)
+    r_major_minor = "r%s.%s" % (new_version.major, new_version.minor)
 
     print("Detected Major.Minor change.")
     print("Updating pattern %s to %s in additional files"
-          % (old_r_major_minor_string, r_major_minor_string))
+          % (old_r_major_minor, r_major_minor))
 
     # Update dockerfiles
-    replace_with_sed("s/%s/%s/g"
-                     % (old_r_major_minor, r_major_minor), DEVEL_DOCKERFILE)
-    replace_with_sed("s/%s/%s/g"
-                     % (old_r_major_minor, r_major_minor), GPU_DEVEL_DOCKERFILE)
+    replace_string_in_line(old_r_major_minor, r_major_minor, DEVEL_DOCKERFILE)
+    replace_string_in_line(old_r_major_minor, r_major_minor,
+                           GPU_DEVEL_DOCKERFILE)
+    replace_string_in_line(old_r_major_minor, r_major_minor,
+                           CPU_MKL_DEVEL_DOCKERFILE)
 
 
 def check_for_lingering_string(lingering_string):
@@ -333,7 +337,7 @@ def main():
   old_version = get_current_semver_version()
 
   if args.nightly:
-    # dev minor version is one ahead of official
+    # Dev minor version is one ahead of official.
     nightly_minor_ver = int(old_version.minor) + 1
     new_version = Version(old_version.major,
                           str(nightly_minor_ver),
@@ -349,12 +353,18 @@ def main():
   update_md_files(old_version, new_version)
   update_dockerfiles(old_version, new_version)
 
-  # Print transition details
+  # Print transition details.
   print("Major: %s -> %s" % (old_version.major, new_version.major))
   print("Minor: %s -> %s" % (old_version.minor, new_version.minor))
   print("Patch: %s -> %s\n" % (old_version.patch, new_version.patch))
 
   check_for_old_version(old_version, new_version)
+  if "rc0" in str(new_version):
+    print("\n\n\033[93mNOTE: Please update the tensorflow/docs_src/install/"
+          "install_sources.md and add a line for tensorflow-%s and "
+          "tensorflow_gpu-%s in the tested source configurations "
+          "table.\033[0m\n" % (new_version.pep_440_str,
+                               new_version.pep_440_str))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index 44b6d52952838d013f09275a3387198249837df8..7b2d7e1a568b0235a5bdd55bb23e542772902576 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -21,7 +21,6 @@ failing_cpu_cc_tests="\
     //tensorflow/core:lib_core_status_test + \
     //tensorflow/core:lib_monitoring_collection_registry_test + \
     //tensorflow/core:lib_strings_numbers_test + \
-    //tensorflow/core:lib_strings_str_util_test + \
     //tensorflow/core/platform/hadoop:hadoop_file_system_test + \
     //tensorflow/core:platform_file_system_test + \
     //tensorflow/core:platform_logging_test + \
@@ -43,7 +42,6 @@ broken_cpu_cc_tests="\
     //tensorflow/core/platform/cloud:gcs_file_system_test + \
     //tensorflow/core/kernels/cloud:bigquery_table_accessor_test + \
     //tensorflow/core/kernels/hexagon:graph_transferer_test + \
-    //tensorflow/core/kernels/hexagon:quantized_matmul_op_for_hexagon_test + \
     //tensorflow/core/kernels:remote_fused_graph_execute_utils_test + \
     //tensorflow/core/kernels:requantize_op_test + \
     //tensorflow/core/kernels:requantization_range_op_test + \
@@ -96,10 +94,6 @@ exclude_cpu_cc_tests="${failing_cpu_cc_tests} + ${broken_cpu_cc_tests}"
 
 exclude_gpu_cc_tests="${extra_failing_gpu_cc_tests} + ${exclude_cpu_cc_tests}"
 
-function clean_output_base() {
-  bazel clean --expunge
-}
-
 function run_configure_for_cpu_build {
   # Due to a bug in Bazel: https://github.com/bazelbuild/bazel/issues/2182
   # yes "" | ./configure doesn't work on Windows, so we set all the
@@ -108,14 +102,11 @@ function run_configure_for_cpu_build {
   if [ -z "$TF_ENABLE_XLA" ]; then
     export TF_ENABLE_XLA=0
   fi
-  if [ -z "$CC_OPT_FLAGS" ]; then
-    export CC_OPT_FLAGS="-march=native"
-  fi
   if [ -z "$TF_NEED_MKL" ]; then
     export TF_NEED_MKL=0
   fi
   export TF_NEED_VERBS=0
-  export TF_NEED_GCP=0
+  export TF_NEED_GCP=1
   export TF_NEED_HDFS=0
   export TF_NEED_OPENCL_SYCL=0
   echo "" | ./configure
@@ -126,17 +117,14 @@ function run_configure_for_gpu_build {
   # yes "" | ./configure doesn't work on Windows, so we set all the
   # environment variables in advance to avoid interact with the script.
   export TF_NEED_CUDA=1
-  export TF_CUDA_VERSION=8.0
-  export CUDA_TOOLKIT_PATH="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0"
-  export TF_CUDNN_VERSION=6.0
+  export TF_CUDA_VERSION=9.0
+  export CUDA_TOOLKIT_PATH="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0"
+  export TF_CUDNN_VERSION=7.0
   export CUDNN_INSTALL_PATH="C:/tools/cuda"
   export TF_CUDA_COMPUTE_CAPABILITIES="3.7"
   if [ -z "$TF_ENABLE_XLA" ]; then
     export TF_ENABLE_XLA=0
   fi
-  if [ -z "$CC_OPT_FLAGS" ]; then
-    export CC_OPT_FLAGS="-march=native"
-  fi
   export TF_NEED_VERBS=0
   export TF_NEED_MKL=0
   export TF_NEED_GCP=0
diff --git a/tensorflow/tools/ci_build/windows/bazel/common_env.sh b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
index 4a653698a2d7c12ce59a53bf96e1551a633f7cab..1c35d74af72ad0a72b0016356888c8cf77e20e56 100644
--- a/tensorflow/tools/ci_build/windows/bazel/common_env.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
@@ -32,34 +32,20 @@ mkdir -p "$TMPDIR"
 # Set bash path
 export BAZEL_SH=${BAZEL_SH:-"C:/tools/msys64/usr/bin/bash"}
 
-# Set Python path for ./configure
-export PYTHON_BIN_PATH="C:/Program Files/Anaconda3/python.exe"
-export PYTHON_LIB_PATH="C:/Program Files/Anaconda3/lib/site-packages"
-
-# Set Python path for cc_configure.bzl
-export BAZEL_PYTHON="C:/Program Files/Anaconda3/python.exe"
+export PYTHON_BASE_PATH="${PYTHON_DIRECTORY:-Program Files/Anaconda3}"
 
-# Set Visual Studio path
-export BAZEL_VS="C:/Program Files (x86)/Microsoft Visual Studio 14.0"
+# Set Python path for ./configure
+export PYTHON_BIN_PATH="C:/${PYTHON_BASE_PATH}/python.exe"
+export PYTHON_LIB_PATH="C:/${PYTHON_BASE_PATH}/lib/site-packages"
 
 # Add python into PATH, it's needed because gen_git_source.py uses
 # '/usr/bin/env python' as a shebang
-export PATH="/c/Program Files/Anaconda3:$PATH"
+export PATH="/c/${PYTHON_BASE_PATH}:$PATH"
 
 # Make sure we have pip in PATH
-export PATH="/c/Program Files/Anaconda3/Scripts:$PATH"
+export PATH="/c/${PYTHON_BASE_PATH}/Scripts:$PATH"
 
 # Add Cuda and Cudnn dll directories into PATH
-export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/bin:$PATH"
-export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/extras/CUPTI/libx64:$PATH"
+export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0/bin:$PATH"
+export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0/extras/CUPTI/libx64:$PATH"
 export PATH="/c/tools/cuda/bin:$PATH"
-
-# Set the common build options on Windows
-export BUILD_OPTS='--config=monolithic --copt=-w --host_copt=-w --verbose_failures --experimental_ui'
-
-# Build TF with wrapper-less CROSSTOOL
-# TODO(pcloudy): Remove this after wrapper-less CROSSTOOL becomes default
-export NO_MSVC_WRAPPER=1
-
-export USE_DYNAMIC_CRT=1
-
diff --git a/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh b/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh
index 8c419347d6f4b3af2e47bb96f246dc7281a92364..748a961e44c5429664e37a1456adcf02a56fa3d4 100644
--- a/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh
@@ -42,8 +42,6 @@ source "tensorflow/tools/ci_build/windows/bazel/common_env.sh" \
 source "tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh" \
   || { echo "Failed to source bazel_test_lib.sh" >&2; exit 1; }
 
-clean_output_base
-
 run_configure_for_cpu_build
 
 # Compliling the following test is extremely slow with -c opt
@@ -54,5 +52,5 @@ passing_tests=$(bazel query "kind(cc_test, //tensorflow/cc/... + //tensorflow/co
   # We need to strip \r so that the result could be store into a variable under MSYS
   tr '\r' ' ')
 
-bazel test $BUILD_OPTS -k $slow_compiling_test --test_output=errors
-bazel test -c opt $BUILD_OPTS -k $passing_tests --test_output=errors
+bazel test -k $slow_compiling_test --test_output=errors
+bazel test -c opt -k $passing_tests --test_output=errors
diff --git a/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat
index 56bff077746b8195a93b6ab8d7ce707b06549daa..c1bc71850754c5b4b42a6eb50be465ba8f98c218 100644
--- a/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat
+++ b/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat
@@ -30,11 +30,13 @@ IF DEFINED SWIG_EXE (ECHO SWIG_EXE is set to %SWIG_EXE%) ELSE (SET SWIG_EXE="C:\
 IF DEFINED PY_EXE (ECHO PY_EXE is set to %PY_EXE%) ELSE (SET PY_EXE="C:\Program Files\Anaconda3\python.exe")
 IF DEFINED PY_LIB (ECHO PY_LIB is set to %PY_LIB%) ELSE (SET PY_LIB="C:\Program Files\Anaconda3\libs\python35.lib")
 
+IF DEFINED DISABLE_FORCEINLINE (ECHO DISABLE_FORCEINLINE is set to %DISABLE_FORCEINLINE%) ELSE (SET DISABLE_FORCEINLINE="OFF")
+
 SET CMAKE_DIR=%REPO_ROOT%\tensorflow\contrib\cmake
 SET MSBUILD_EXE="C:\Program Files (x86)\MSBuild\14.0\Bin\msbuild.exe"
 
 :: Run cmake to create Visual Studio Project files.
-%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY%
+%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY% -Dtensorflow_DISABLE_EIGEN_FORCEINLINE=%DISABLE_FORCEINLINE% -Dtensorflow_WIN_CPU_SIMD_OPTIONS=/arch:AVX
 
 :: Run msbuild in the resulting VS project files to build a pip package.
 %MSBUILD_EXE% /p:Configuration=Release /maxcpucount:32 tf_python_build_pip_package.vcxproj
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 8520ca898f84a4990aaf4348d1cfb09dce2ff7ab..8b8ba31a0dda88ad3c43330e0208a9fa6a7d0276 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -44,8 +44,9 @@ source "tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh" \
 
 run_configure_for_cpu_build
 
-clean_output_base
-
+# --define=override_eigen_strong_inline=true speeds up the compiling of conv_grad_ops_3d.cc and conv_ops_3d.cc
+# by 20 minutes. See https://github.com/tensorflow/tensorflow/issues/10521
+BUILD_OPTS="--define=override_eigen_strong_inline=true"
 bazel build -c opt $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $?
 
 # Create a python test directory to avoid package name conflict
@@ -60,11 +61,8 @@ reinstall_tensorflow_pip ${PIP_NAME}
 
 # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
 # which will result testing system installed tensorflow
-# TODO(pcloudy): Remove TF_SAVER_LENIENT_NAMES after
-# https://github.com/tensorflow/tensorflow/issues/12844 is fixed.
 bazel test -c opt $BUILD_OPTS -k --test_output=errors \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
   --test_tag_filters=-no_pip,-no_windows,-no_oss \
   --build_tag_filters=-no_pip,-no_windows,-no_oss --build_tests_only \
-  --test_env=TF_SAVER_LENIENT_NAMES=True \
   //${PY_TEST_DIR}/tensorflow/python/...
diff --git a/tensorflow/tools/ci_build/windows/gpu/bazel/run_cc_test_windows.sh b/tensorflow/tools/ci_build/windows/gpu/bazel/run_cc_test_windows.sh
index 3fd960deabbb0ace8c9598589f9f9a72fd09b3a9..f26f8727e51bf0247578c1cdfaa67e1b0f7f299d 100644
--- a/tensorflow/tools/ci_build/windows/gpu/bazel/run_cc_test_windows.sh
+++ b/tensorflow/tools/ci_build/windows/gpu/bazel/run_cc_test_windows.sh
@@ -56,5 +56,5 @@ passing_tests=$(bazel query "kind(cc_test, //tensorflow/cc/... + //tensorflow/co
 
 # TODO(pcloudy): There is a bug in Bazel preventing build with GPU support without -c opt
 # Re-enable this test after it is fixed.
-# bazel test --config=win-cuda $BUILD_OPTS -k $slow_compiling_test --test_output=errors
-bazel test -c opt --config=win-cuda $BUILD_OPTS -k $passing_tests --test_output=errors
+# bazel test --config=win-cuda -k $slow_compiling_test --test_output=errors
+bazel test -c opt --config=win-cuda -k $passing_tests --test_output=errors
diff --git a/tensorflow/tools/ci_build/windows/gpu/bazel/run_libtensorflow.bat b/tensorflow/tools/ci_build/windows/gpu/bazel/run_libtensorflow.bat
new file mode 100644
index 0000000000000000000000000000000000000000..773d9c8865cddeea56e1489876a465c4cc5c018e
--- /dev/null
+++ b/tensorflow/tools/ci_build/windows/gpu/bazel/run_libtensorflow.bat
@@ -0,0 +1 @@
+c:\tools\msys64\usr\bin\bash -l %cd%/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh %*
diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
index 832943ad6c82855a76be0782c5332fb8e0f202b6..b87e4a9bec41264827d415a11dfa6f23aeda725d 100644
--- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
+++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
@@ -31,11 +31,13 @@ IF DEFINED PY_EXE (ECHO PY_EXE is set to %PY_EXE%) ELSE (SET PY_EXE="C:\Program
 IF DEFINED PY_LIB (ECHO PY_LIB is set to %PY_LIB%) ELSE (SET PY_LIB="C:\Program Files\Anaconda3\libs\python35.lib")
 IF DEFINED CUDNN_HOME (ECHO CUDNN_HOME is set to %CUDNN_HOME%) ELSE (SET CUDNN_HOME="c:\tools\cuda")
 verbosity:quiet
+IF DEFINED DISABLE_FORCEINLINE (ECHO DISABLE_FORCEINLINE is set to %DISABLE_FORCEINLINE%) ELSE (SET DISABLE_FORCEINLINE="OFF")
+
 SET CMAKE_DIR=%REPO_ROOT%\tensorflow\contrib\cmake
 SET MSBUILD_EXE="C:\Program Files (x86)\MSBuild\14.0\Bin\msbuild.exe"
 
 :: Run cmake to create Visual Studio Project files.
-%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_ENABLE_GPU=ON -DCUDNN_HOME=%CUDNN_HOME% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY%
+%CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_ENABLE_GPU=ON -DCUDNN_HOME=%CUDNN_HOME% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY% -Dtensorflow_DISABLE_EIGEN_FORCEINLINE=%DISABLE_FORCEINLINE% -Dtensorflow_WIN_CPU_SIMD_OPTIONS=/arch:AVX
 
 :: Run msbuild in the resulting VS project files to build a pip package.
 %MSBUILD_EXE% /p:Configuration=Release /maxcpucount:32 tf_python_build_pip_package.vcxproj
diff --git a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
index 47ca42d6422f27fe1086fda75d33687cfe2db9b0..922bb67bbf6ce34f55acad6d3399bd810032abd0 100644
--- a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
@@ -44,9 +44,7 @@ source "tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh" \
 
 run_configure_for_gpu_build
 
-clean_output_base
-
-bazel build -c opt $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $?
+bazel build -c opt tensorflow/tools/pip_package:build_pip_package || exit $?
 
 # Create a python test directory to avoid package name conflict
 PY_TEST_DIR="py_test_dir"
@@ -61,11 +59,8 @@ reinstall_tensorflow_pip ${PIP_NAME}
 # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
 # which will result testing system installed tensorflow
 # GPU tests are very flaky when running concurrently, so set local_test_jobs=1
-# TODO(pcloudy): Remove TF_SAVER_LENIENT_NAMES after
-# https://github.com/tensorflow/tensorflow/issues/12844 is fixed.
-bazel test -c opt $BUILD_OPTS -k --test_output=errors \
+bazel test -c opt -k --test_output=errors \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
   --test_tag_filters=-no_pip,-no_windows,-no_windows_gpu,-no_gpu,-no_pip_gpu,no_oss \
   --build_tag_filters=-no_pip,-no_windows,-no_windows_gpu,-no_gpu,-no_pip_gpu,no_oss \
-  --test_env=TF_SAVER_LENIENT_NAMES=True \
   --local_test_jobs=1 --build_tests_only //${PY_TEST_DIR}/tensorflow/python/...
diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
index 9ac3613f27e1bc96501490b7610f047785b9ada2..583d1d5f09527861015458c636af2259b34d45f8 100755
--- a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
+++ b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
@@ -31,36 +31,22 @@ if [ ! -e "WORKSPACE" ]; then
   exit 1
 fi
 
-# Enable JNI support for Windows in Bazel.
-# This can be removed once
-# https://github.com/bazelbuild/bazel/pull/2599
-# has been merged and we switch to a bazel release containing it.
-cp "${JAVA_HOME}/include/win32/jni_md.h" "./tensorflow/java/src/main/native/windows_jni_md.h"
-sed -i -e "s|@bazel_tools//tools/jdk:jni_md_header-linux|windows_jni_md.h|" ./tensorflow/java/src/main/native/BUILD
-#### END HACKS TO BE RESOLVED WITH NEW BAZEL VERSIONS ####
-
 export TF_BAZEL_TARGETS="//tensorflow:libtensorflow.so"
 export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/tools/lib_package:clicenses_generate"
 export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/java:libtensorflow_jni.so"
 export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/tools/lib_package:jnilicenses_generate"
 
-clean_output_base
 run_configure_for_cpu_build
 
 # build_libtensorflow_tarball in ../builds/libtensorflow.sh
 # cannot be used on Windows since it relies on pkg_tar rules.
 # So we do something special here
-bazel build -c opt ${BUILD_OPTS} \
+bazel build -c opt --copt=/arch:AVX \
   tensorflow:libtensorflow.so \
   tensorflow/tools/lib_package:clicenses_generate \
   tensorflow/java:libtensorflow_jni.so \
   tensorflow/tools/lib_package:jnilicenses_generate
 
-# Revert the hacks above
-git checkout ./tensorflow/tools/pip_package/BUILD
-git checkout ./tensorflow/java/src/main/native/BUILD
-rm -f ./tensorflow/java/src/main/native/windows_jni_md.h
-
 DIR=lib_package
 rm -rf ${DIR}
 mkdir -p ${DIR}
@@ -74,13 +60,16 @@ rm -f ${DIR}/tensorflow_jni.dll
 
 # Zip up the .dll, LICENSE and include files for the C library.
 mkdir -p ${DIR}/include/tensorflow/c
+mkdir -p ${DIR}/include/tensorflow/c/eager
 mkdir -p ${DIR}/lib
 cp bazel-bin/tensorflow/libtensorflow.so ${DIR}/lib/tensorflow.dll
 cp tensorflow/c/c_api.h ${DIR}/include/tensorflow/c
+cp tensorflow/c/eager/c_api.h ${DIR}/include/tensorflow/c/eager
 cp bazel-genfiles/tensorflow/tools/lib_package/include/tensorflow/c/LICENSE ${DIR}/include/tensorflow/c
 cd ${DIR}
-zip -j libtensorflow-cpu-windows-$(uname -m).zip \
+zip libtensorflow-cpu-windows-$(uname -m).zip \
   lib/tensorflow.dll \
+  include/tensorflow/c/eager/c_api.h \
   include/tensorflow/c/c_api.h \
   include/tensorflow/c/LICENSE
 rm -rf lib include
diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..94276c6c5c9ce897ca24f03efe3d93e1ea1e00c9
--- /dev/null
+++ b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
@@ -0,0 +1,72 @@
+#!/usr/bin/env bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Script to produce binary release of libtensorflow (C API, Java jars etc.).
+
+set -ex
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Setup environment for bazel builds
+source "${SCRIPT_DIR}/bazel/common_env.sh"
+source "${SCRIPT_DIR}/bazel/bazel_test_lib.sh"
+
+# Sanity check that this is being run from the root of the git repository.
+cd ${SCRIPT_DIR}/../../../..
+if [ ! -e "WORKSPACE" ]; then
+  echo "Must run this from the root of the bazel workspace"
+  echo "Currently at ${PWD}, script is at ${SCRIPT_DIR}"
+  exit 1
+fi
+
+export TF_BAZEL_TARGETS="//tensorflow:libtensorflow.so"
+export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/tools/lib_package:clicenses_generate"
+export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/java:libtensorflow_jni.so"
+export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/tools/lib_package:jnilicenses_generate"
+
+run_configure_for_gpu_build
+
+# build_libtensorflow_tarball in ../builds/libtensorflow.sh
+# cannot be used on Windows since it relies on pkg_tar rules.
+# So we do something special here
+bazel build -c opt --copt=/arch:AVX \
+  tensorflow:libtensorflow.so \
+  tensorflow/tools/lib_package:clicenses_generate \
+  tensorflow/java:libtensorflow_jni.so \
+  tensorflow/tools/lib_package:jnilicenses_generate
+
+DIR=lib_package
+rm -rf ${DIR}
+mkdir -p ${DIR}
+
+# Zip up the .dll and the LICENSE for the JNI library.
+cp bazel-bin/tensorflow/java/libtensorflow_jni.so ${DIR}/tensorflow_jni.dll
+zip -j ${DIR}/libtensorflow_jni-gpu-windows-$(uname -m).zip \
+  ${DIR}/tensorflow_jni.dll \
+  bazel-genfiles/tensorflow/tools/lib_package/include/tensorflow/jni/LICENSE
+rm -f ${DIR}/tensorflow_jni.dll
+
+# Zip up the .dll, LICENSE and include files for the C library.
+mkdir -p ${DIR}/include/tensorflow/c
+mkdir -p ${DIR}/lib
+cp bazel-bin/tensorflow/libtensorflow.so ${DIR}/lib/tensorflow.dll
+cp tensorflow/c/c_api.h ${DIR}/include/tensorflow/c
+cp bazel-genfiles/tensorflow/tools/lib_package/include/tensorflow/c/LICENSE ${DIR}/include/tensorflow/c
+cd ${DIR}
+zip -j libtensorflow-gpu-windows-$(uname -m).zip \
+  lib/tensorflow.dll \
+  include/tensorflow/c/c_api.h \
+  include/tensorflow/c/LICENSE
+rm -rf lib include
diff --git a/tensorflow/tools/compatibility/BUILD b/tensorflow/tools/compatibility/BUILD
index 51e4c6cef38a4c1606cd16d1c2ac75edc1f1249a..4f90c4d940670c43f65cc3f95971469627ab35c9 100644
--- a/tensorflow/tools/compatibility/BUILD
+++ b/tensorflow/tools/compatibility/BUILD
@@ -10,10 +10,7 @@ load(
 
 py_binary(
     name = "tf_upgrade",
-    srcs = [
-        "ast_edits.py",
-        "tf_upgrade.py",
-    ],
+    srcs = ["tf_upgrade.py"],
     srcs_version = "PY2AND3",
 )
 
@@ -22,7 +19,7 @@ py_test(
     srcs = ["tf_upgrade_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "tf_upgrade",
+        ":tf_upgrade",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_test_lib",
         "@six_archive//:six",
@@ -48,11 +45,11 @@ genrule(
         "test_file_v1_0.py",
         "report.txt",
     ],
-    cmd = ("$(location tf_upgrade)" +
+    cmd = ("$(location :tf_upgrade)" +
            " --infile $(location testdata/test_file_v0_11.py)" +
            " --outfile $(location test_file_v1_0.py)" +
            " --reportfile $(location report.txt)"),
-    tools = ["tf_upgrade"],
+    tools = [":tf_upgrade"],
 )
 
 py_test(
diff --git a/tensorflow/tools/compatibility/ast_edits.py b/tensorflow/tools/compatibility/ast_edits.py
deleted file mode 100644
index e7e4c91692132946f303f1b7ea48c5089a14de2e..0000000000000000000000000000000000000000
--- a/tensorflow/tools/compatibility/ast_edits.py
+++ /dev/null
@@ -1,497 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Upgrader for Python scripts according to an API change specification."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import ast
-import collections
-import os
-import shutil
-import sys
-import tempfile
-import traceback
-
-
-class APIChangeSpec(object):
-  """This class defines the transformations that need to happen.
-
-  This class must provide the following fields:
-
-  * `function_keyword_renames`: maps function names to a map of old -> new
-    argument names
-  * `function_renames`: maps function names to new function names
-  * `change_to_function`: a set of function names that have changed (for
-    notifications)
-  * `function_reorders`: maps functions whose argument order has changed to the
-    list of arguments in the new order
-  * `function_handle`: maps function names to custom handlers for the function
-
-  For an example, see `TFAPIChangeSpec`.
-  """
-
-
-class _FileEditTuple(collections.namedtuple(
-    "_FileEditTuple", ["comment", "line", "start", "old", "new"])):
-  """Each edit that is recorded by a _FileEditRecorder.
-
-  Fields:
-    comment: A description of the edit and why it was made.
-    line: The line number in the file where the edit occurs (1-indexed).
-    start: The line number in the file where the edit occurs (0-indexed).
-    old: text string to remove (this must match what was in file).
-    new: text string to add in place of `old`.
-  """
-
-  __slots__ = ()
-
-
-class _FileEditRecorder(object):
-  """Record changes that need to be done to the file."""
-
-  def __init__(self, filename):
-    # all edits are lists of chars
-    self._filename = filename
-
-    self._line_to_edit = collections.defaultdict(list)
-    self._errors = []
-
-  def process(self, text):
-    """Process a list of strings, each corresponding to the recorded changes.
-
-    Args:
-      text: A list of lines of text (assumed to contain newlines)
-    Returns:
-      A tuple of the modified text and a textual description of what is done.
-    Raises:
-      ValueError: if substitution source location does not have expected text.
-    """
-
-    change_report = ""
-
-    # Iterate of each line
-    for line, edits in self._line_to_edit.items():
-      offset = 0
-      # sort by column so that edits are processed in order in order to make
-      # indexing adjustments cumulative for changes that change the string
-      # length
-      edits.sort(key=lambda x: x.start)
-
-      # Extract each line to a list of characters, because mutable lists
-      # are editable, unlike immutable strings.
-      char_array = list(text[line - 1])
-
-      # Record a description of the change
-      change_report += "%r Line %d\n" % (self._filename, line)
-      change_report += "-" * 80 + "\n\n"
-      for e in edits:
-        change_report += "%s\n" % e.comment
-      change_report += "\n    Old: %s" % (text[line - 1])
-
-      # Make underscore buffers for underlining where in the line the edit was
-      change_list = [" "] * len(text[line - 1])
-      change_list_new = [" "] * len(text[line - 1])
-
-      # Iterate for each edit
-      for e in edits:
-        # Create effective start, end by accounting for change in length due
-        # to previous edits
-        start_eff = e.start + offset
-        end_eff = start_eff + len(e.old)
-
-        # Make sure the edit is changing what it should be changing
-        old_actual = "".join(char_array[start_eff:end_eff])
-        if old_actual != e.old:
-          raise ValueError("Expected text %r but got %r" %
-                           ("".join(e.old), "".join(old_actual)))
-        # Make the edit
-        char_array[start_eff:end_eff] = list(e.new)
-
-        # Create the underline highlighting of the before and after
-        change_list[e.start:e.start + len(e.old)] = "~" * len(e.old)
-        change_list_new[start_eff:end_eff] = "~" * len(e.new)
-
-        # Keep track of how to generate effective ranges
-        offset += len(e.new) - len(e.old)
-
-      # Finish the report comment
-      change_report += "         %s\n" % "".join(change_list)
-      text[line - 1] = "".join(char_array)
-      change_report += "    New: %s" % (text[line - 1])
-      change_report += "         %s\n\n" % "".join(change_list_new)
-    return "".join(text), change_report, self._errors
-
-  def add(self, comment, line, start, old, new, error=None):
-    """Add a new change that is needed.
-
-    Args:
-      comment: A description of what was changed
-      line: Line number (1 indexed)
-      start: Column offset (0 indexed)
-      old: old text
-      new: new text
-      error: this "edit" is something that cannot be fixed automatically
-    Returns:
-      None
-    """
-
-    self._line_to_edit[line].append(
-        _FileEditTuple(comment, line, start, old, new))
-    if error:
-      self._errors.append("%s:%d: %s" % (self._filename, line, error))
-
-
-class _ASTCallVisitor(ast.NodeVisitor):
-  """AST Visitor that processes function calls.
-
-  Updates function calls from old API version to new API version using a given
-  change spec.
-  """
-
-  def __init__(self, filename, lines, api_change_spec):
-    self._filename = filename
-    self._file_edit = _FileEditRecorder(filename)
-    self._lines = lines
-    self._api_change_spec = api_change_spec
-
-  def process(self, lines):
-    return self._file_edit.process(lines)
-
-  def generic_visit(self, node):
-    ast.NodeVisitor.generic_visit(self, node)
-
-  def _rename_functions(self, node, full_name):
-    function_renames = self._api_change_spec.function_renames
-    try:
-      new_name = function_renames[full_name]
-      self._file_edit.add("Renamed function %r to %r" % (full_name,
-                                                         new_name),
-                          node.lineno, node.col_offset, full_name, new_name)
-    except KeyError:
-      pass
-
-  def _get_attribute_full_path(self, node):
-    """Traverse an attribute to generate a full name e.g. tf.foo.bar.
-
-    Args:
-      node: A Node of type Attribute.
-
-    Returns:
-      a '.'-delimited full-name or None if the tree was not a simple form.
-      i.e. `foo()+b).bar` returns None, while `a.b.c` would return "a.b.c".
-    """
-    curr = node
-    items = []
-    while not isinstance(curr, ast.Name):
-      if not isinstance(curr, ast.Attribute):
-        return None
-      items.append(curr.attr)
-      curr = curr.value
-    items.append(curr.id)
-    return ".".join(reversed(items))
-
-  def _find_true_position(self, node):
-    """Return correct line number and column offset for a given node.
-
-    This is necessary mainly because ListComp's location reporting reports
-    the next token after the list comprehension list opening.
-
-    Args:
-      node: Node for which we wish to know the lineno and col_offset
-    """
-    import re
-    find_open = re.compile("^\s*(\\[).*$")
-    find_string_chars = re.compile("['\"]")
-
-    if isinstance(node, ast.ListComp):
-      # Strangely, ast.ListComp returns the col_offset of the first token
-      # after the '[' token which appears to be a bug. Workaround by
-      # explicitly finding the real start of the list comprehension.
-      line = node.lineno
-      col = node.col_offset
-      # loop over lines
-      while 1:
-        # Reverse the text to and regular expression search for whitespace
-        text = self._lines[line-1]
-        reversed_preceding_text = text[:col][::-1]
-        # First find if a [ can be found with only whitespace between it and
-        # col.
-        m = find_open.match(reversed_preceding_text)
-        if m:
-          new_col_offset = col - m.start(1) - 1
-          return line, new_col_offset
-        else:
-          if (reversed_preceding_text=="" or
-             reversed_preceding_text.isspace()):
-            line = line - 1
-            prev_line = self._lines[line - 1]
-            # TODO(aselle):
-            # this is poor comment detection, but it is good enough for
-            # cases where the comment does not contain string literal starting/
-            # ending characters. If ast gave us start and end locations of the
-            # ast nodes rather than just start, we could use string literal
-            # node ranges to filter out spurious #'s that appear in string
-            # literals.
-            comment_start = prev_line.find("#")
-            if comment_start ==  -1:
-              col = len(prev_line) -1
-            elif find_string_chars.search(prev_line[comment_start:]) is None:
-              col = comment_start
-            else:
-              return None, None
-          else:
-            return None, None
-    # Most other nodes return proper locations (with notably does not), but
-    # it is not possible to use that in an argument.
-    return node.lineno, node.col_offset
-
-
-  def visit_Call(self, node):  # pylint: disable=invalid-name
-    """Handle visiting a call node in the AST.
-
-    Args:
-      node: Current Node
-    """
-
-
-    # Find a simple attribute name path e.g. "tf.foo.bar"
-    full_name = self._get_attribute_full_path(node.func)
-
-    # Make sure the func is marked as being part of a call
-    node.func.is_function_for_call = True
-
-    if full_name:
-      # Call special handlers
-      function_handles = self._api_change_spec.function_handle
-      if full_name in function_handles:
-        function_handles[full_name](self._file_edit, node)
-
-      # Examine any non-keyword argument and make it into a keyword argument
-      # if reordering required.
-      function_reorders = self._api_change_spec.function_reorders
-      function_keyword_renames = (
-          self._api_change_spec.function_keyword_renames)
-
-      if full_name in function_reorders:
-        reordered = function_reorders[full_name]
-        for idx, arg in enumerate(node.args):
-          lineno, col_offset = self._find_true_position(arg)
-          if lineno is None or col_offset is None:
-            self._file_edit.add(
-                "Failed to add keyword %r to reordered function %r"
-                % (reordered[idx], full_name), arg.lineno, arg.col_offset,
-                "", "",
-                error="A necessary keyword argument failed to be inserted.")
-          else:
-            keyword_arg = reordered[idx]
-            if (full_name in function_keyword_renames and
-                keyword_arg in function_keyword_renames[full_name]):
-              keyword_arg = function_keyword_renames[full_name][keyword_arg]
-            self._file_edit.add("Added keyword %r to reordered function %r"
-                                % (reordered[idx], full_name), lineno,
-                                col_offset, "", keyword_arg + "=")
-
-      # Examine each keyword argument and convert it to the final renamed form
-      renamed_keywords = ({} if full_name not in function_keyword_renames else
-                          function_keyword_renames[full_name])
-      for keyword in node.keywords:
-        argkey = keyword.arg
-        argval = keyword.value
-
-        if argkey in renamed_keywords:
-          argval_lineno, argval_col_offset = self._find_true_position(argval)
-          if argval_lineno is not None and argval_col_offset is not None:
-            # TODO(aselle): We should scan backward to find the start of the
-            # keyword key. Unfortunately ast does not give you the location of
-            # keyword keys, so we are forced to infer it from the keyword arg
-            # value.
-            key_start = argval_col_offset - len(argkey) - 1
-            key_end = key_start + len(argkey) + 1
-            if (self._lines[argval_lineno - 1][key_start:key_end] ==
-                argkey + "="):
-              self._file_edit.add("Renamed keyword argument from %r to %r" %
-                                  (argkey, renamed_keywords[argkey]),
-                                  argval_lineno,
-                                  argval_col_offset - len(argkey) - 1,
-                                  argkey + "=", renamed_keywords[argkey] + "=")
-              continue
-          self._file_edit.add(
-              "Failed to rename keyword argument from %r to %r" %
-              (argkey, renamed_keywords[argkey]),
-              argval.lineno,
-              argval.col_offset - len(argkey) - 1,
-              "", "",
-              error="Failed to find keyword lexographically. Fix manually.")
-
-    ast.NodeVisitor.generic_visit(self, node)
-
-  def visit_Attribute(self, node):  # pylint: disable=invalid-name
-    """Handle bare Attributes i.e. [tf.foo, tf.bar].
-
-    Args:
-      node: Node that is of type ast.Attribute
-    """
-    full_name = self._get_attribute_full_path(node)
-    if full_name:
-      self._rename_functions(node, full_name)
-    if full_name in self._api_change_spec.change_to_function:
-      if not hasattr(node, "is_function_for_call"):
-        new_text = full_name + "()"
-        self._file_edit.add("Changed %r to %r"%(full_name, new_text),
-                            node.lineno, node.col_offset, full_name, new_text)
-
-    ast.NodeVisitor.generic_visit(self, node)
-
-
-class ASTCodeUpgrader(object):
-  """Handles upgrading a set of Python files using a given API change spec."""
-
-  def __init__(self, api_change_spec):
-    if not isinstance(api_change_spec, APIChangeSpec):
-      raise TypeError("Must pass APIChangeSpec to ASTCodeUpgrader, got %s" %
-                      type(api_change_spec))
-    self._api_change_spec = api_change_spec
-
-  def process_file(self, in_filename, out_filename):
-    """Process the given python file for incompatible changes.
-
-    Args:
-      in_filename: filename to parse
-      out_filename: output file to write to
-    Returns:
-      A tuple representing number of files processed, log of actions, errors
-    """
-
-    # Write to a temporary file, just in case we are doing an implace modify.
-    with open(in_filename, "r") as in_file, \
-        tempfile.NamedTemporaryFile("w", delete=False) as temp_file:
-      ret = self.process_opened_file(
-          in_filename, in_file, out_filename, temp_file)
-
-    shutil.move(temp_file.name, out_filename)
-    return ret
-
-  # Broad exceptions are required here because ast throws whatever it wants.
-  # pylint: disable=broad-except
-  def process_opened_file(self, in_filename, in_file, out_filename, out_file):
-    """Process the given python file for incompatible changes.
-
-    This function is split out to facilitate StringIO testing from
-    tf_upgrade_test.py.
-
-    Args:
-      in_filename: filename to parse
-      in_file: opened file (or StringIO)
-      out_filename: output file to write to
-      out_file: opened file (or StringIO)
-    Returns:
-      A tuple representing number of files processed, log of actions, errors
-    """
-    process_errors = []
-    text = "-" * 80 + "\n"
-    text += "Processing file %r\n outputting to %r\n" % (in_filename,
-                                                         out_filename)
-    text += "-" * 80 + "\n\n"
-
-    parsed_ast = None
-    lines = in_file.readlines()
-    try:
-      parsed_ast = ast.parse("".join(lines))
-    except Exception:
-      text += "Failed to parse %r\n\n" % in_filename
-      text += traceback.format_exc()
-    if parsed_ast:
-      visitor = _ASTCallVisitor(in_filename, lines, self._api_change_spec)
-      visitor.visit(parsed_ast)
-      out_text, new_text, process_errors = visitor.process(lines)
-      text += new_text
-      if out_file:
-        out_file.write(out_text)
-    text += "\n"
-    return 1, text, process_errors
-  # pylint: enable=broad-except
-
-  def process_tree(self, root_directory, output_root_directory,
-                   copy_other_files):
-    """Processes upgrades on an entire tree of python files in place.
-
-    Note that only Python files. If you have custom code in other languages,
-    you will need to manually upgrade those.
-
-    Args:
-      root_directory: Directory to walk and process.
-      output_root_directory: Directory to use as base.
-      copy_other_files: Copy files that are not touched by this converter.
-
-    Returns:
-      A tuple of files processed, the report string ofr all files, and errors
-    """
-
-    # make sure output directory doesn't exist
-    if output_root_directory and os.path.exists(output_root_directory):
-      print("Output directory %r must not already exist." % (
-          output_root_directory))
-      sys.exit(1)
-
-    # make sure output directory does not overlap with root_directory
-    norm_root = os.path.split(os.path.normpath(root_directory))
-    norm_output = os.path.split(os.path.normpath(output_root_directory))
-    if norm_root == norm_output:
-      print("Output directory %r same as input directory %r" % (
-          root_directory, output_root_directory))
-      sys.exit(1)
-
-    # Collect list of files to process (we do this to correctly handle if the
-    # user puts the output directory in some sub directory of the input dir)
-    files_to_process = []
-    files_to_copy = []
-    for dir_name, _, file_list in os.walk(root_directory):
-      py_files = [f for f in file_list if f.endswith(".py")]
-      copy_files = [f for f in file_list if not f.endswith(".py")]
-      for filename in py_files:
-        fullpath = os.path.join(dir_name, filename)
-        fullpath_output = os.path.join(
-            output_root_directory, os.path.relpath(fullpath, root_directory))
-        files_to_process.append((fullpath, fullpath_output))
-      if copy_other_files:
-        for filename in copy_files:
-          fullpath = os.path.join(dir_name, filename)
-          fullpath_output = os.path.join(
-              output_root_directory, os.path.relpath(fullpath, root_directory))
-          files_to_copy.append((fullpath, fullpath_output))
-
-    file_count = 0
-    tree_errors = []
-    report = ""
-    report += ("=" * 80) + "\n"
-    report += "Input tree: %r\n" % root_directory
-    report += ("=" * 80) + "\n"
-
-    for input_path, output_path in files_to_process:
-      output_directory = os.path.dirname(output_path)
-      if not os.path.isdir(output_directory):
-        os.makedirs(output_directory)
-      file_count += 1
-      _, l_report, l_errors = self.process_file(input_path, output_path)
-      tree_errors += l_errors
-      report += l_report
-    for input_path, output_path in files_to_copy:
-      output_directory = os.path.dirname(output_path)
-      if not os.path.isdir(output_directory):
-        os.makedirs(output_directory)
-      shutil.copy(input_path, output_path)
-    return file_count, report, tree_errors
diff --git a/tensorflow/tools/compatibility/tf_upgrade.py b/tensorflow/tools/compatibility/tf_upgrade.py
index 72fe4a48cdd1c374d7dc39b8bb820a365a730e13..6e90b286c99f894ddd25268afc69043759571c36 100644
--- a/tensorflow/tools/compatibility/tf_upgrade.py
+++ b/tensorflow/tools/compatibility/tf_upgrade.py
@@ -19,11 +19,491 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
+import ast
+import collections
+import os
+import shutil
+import sys
+import tempfile
+import traceback
 
-from tensorflow.tools.compatibility import ast_edits
 
+class APIChangeSpec(object):
+  """This class defines the transformations that need to happen.
 
-class TFAPIChangeSpec(ast_edits.APIChangeSpec):
+  This class must provide the following fields:
+
+  * `function_keyword_renames`: maps function names to a map of old -> new
+    argument names
+  * `function_renames`: maps function names to new function names
+  * `change_to_function`: a set of function names that have changed (for
+    notifications)
+  * `function_reorders`: maps functions whose argument order has changed to the
+    list of arguments in the new order
+  * `function_handle`: maps function names to custom handlers for the function
+
+  For an example, see `TFAPIChangeSpec`.
+  """
+
+
+class _FileEditTuple(
+    collections.namedtuple("_FileEditTuple",
+                           ["comment", "line", "start", "old", "new"])):
+  """Each edit that is recorded by a _FileEditRecorder.
+
+  Fields:
+    comment: A description of the edit and why it was made.
+    line: The line number in the file where the edit occurs (1-indexed).
+    start: The line number in the file where the edit occurs (0-indexed).
+    old: text string to remove (this must match what was in file).
+    new: text string to add in place of `old`.
+  """
+
+  __slots__ = ()
+
+
+class _FileEditRecorder(object):
+  """Record changes that need to be done to the file."""
+
+  def __init__(self, filename):
+    # all edits are lists of chars
+    self._filename = filename
+
+    self._line_to_edit = collections.defaultdict(list)
+    self._errors = []
+
+  def process(self, text):
+    """Process a list of strings, each corresponding to the recorded changes.
+
+    Args:
+      text: A list of lines of text (assumed to contain newlines)
+    Returns:
+      A tuple of the modified text and a textual description of what is done.
+    Raises:
+      ValueError: if substitution source location does not have expected text.
+    """
+
+    change_report = ""
+
+    # Iterate of each line
+    for line, edits in self._line_to_edit.items():
+      offset = 0
+      # sort by column so that edits are processed in order in order to make
+      # indexing adjustments cumulative for changes that change the string
+      # length
+      edits.sort(key=lambda x: x.start)
+
+      # Extract each line to a list of characters, because mutable lists
+      # are editable, unlike immutable strings.
+      char_array = list(text[line - 1])
+
+      # Record a description of the change
+      change_report += "%r Line %d\n" % (self._filename, line)
+      change_report += "-" * 80 + "\n\n"
+      for e in edits:
+        change_report += "%s\n" % e.comment
+      change_report += "\n    Old: %s" % (text[line - 1])
+
+      # Make underscore buffers for underlining where in the line the edit was
+      change_list = [" "] * len(text[line - 1])
+      change_list_new = [" "] * len(text[line - 1])
+
+      # Iterate for each edit
+      for e in edits:
+        # Create effective start, end by accounting for change in length due
+        # to previous edits
+        start_eff = e.start + offset
+        end_eff = start_eff + len(e.old)
+
+        # Make sure the edit is changing what it should be changing
+        old_actual = "".join(char_array[start_eff:end_eff])
+        if old_actual != e.old:
+          raise ValueError("Expected text %r but got %r" %
+                           ("".join(e.old), "".join(old_actual)))
+        # Make the edit
+        char_array[start_eff:end_eff] = list(e.new)
+
+        # Create the underline highlighting of the before and after
+        change_list[e.start:e.start + len(e.old)] = "~" * len(e.old)
+        change_list_new[start_eff:end_eff] = "~" * len(e.new)
+
+        # Keep track of how to generate effective ranges
+        offset += len(e.new) - len(e.old)
+
+      # Finish the report comment
+      change_report += "         %s\n" % "".join(change_list)
+      text[line - 1] = "".join(char_array)
+      change_report += "    New: %s" % (text[line - 1])
+      change_report += "         %s\n\n" % "".join(change_list_new)
+    return "".join(text), change_report, self._errors
+
+  def add(self, comment, line, start, old, new, error=None):
+    """Add a new change that is needed.
+
+    Args:
+      comment: A description of what was changed
+      line: Line number (1 indexed)
+      start: Column offset (0 indexed)
+      old: old text
+      new: new text
+      error: this "edit" is something that cannot be fixed automatically
+    Returns:
+      None
+    """
+
+    self._line_to_edit[line].append(
+        _FileEditTuple(comment, line, start, old, new))
+    if error:
+      self._errors.append("%s:%d: %s" % (self._filename, line, error))
+
+
+class _ASTCallVisitor(ast.NodeVisitor):
+  """AST Visitor that processes function calls.
+
+  Updates function calls from old API version to new API version using a given
+  change spec.
+  """
+
+  def __init__(self, filename, lines, api_change_spec):
+    self._filename = filename
+    self._file_edit = _FileEditRecorder(filename)
+    self._lines = lines
+    self._api_change_spec = api_change_spec
+
+  def process(self, lines):
+    return self._file_edit.process(lines)
+
+  def generic_visit(self, node):
+    ast.NodeVisitor.generic_visit(self, node)
+
+  def _rename_functions(self, node, full_name):
+    function_renames = self._api_change_spec.function_renames
+    try:
+      new_name = function_renames[full_name]
+      self._file_edit.add("Renamed function %r to %r" % (full_name, new_name),
+                          node.lineno, node.col_offset, full_name, new_name)
+    except KeyError:
+      pass
+
+  def _get_attribute_full_path(self, node):
+    """Traverse an attribute to generate a full name e.g. tf.foo.bar.
+
+    Args:
+      node: A Node of type Attribute.
+
+    Returns:
+      a '.'-delimited full-name or None if the tree was not a simple form.
+      i.e. `foo()+b).bar` returns None, while `a.b.c` would return "a.b.c".
+    """
+    curr = node
+    items = []
+    while not isinstance(curr, ast.Name):
+      if not isinstance(curr, ast.Attribute):
+        return None
+      items.append(curr.attr)
+      curr = curr.value
+    items.append(curr.id)
+    return ".".join(reversed(items))
+
+  def _find_true_position(self, node):
+    """Return correct line number and column offset for a given node.
+
+    This is necessary mainly because ListComp's location reporting reports
+    the next token after the list comprehension list opening.
+
+    Args:
+      node: Node for which we wish to know the lineno and col_offset
+    """
+    import re
+    find_open = re.compile("^\s*(\\[).*$")
+    find_string_chars = re.compile("['\"]")
+
+    if isinstance(node, ast.ListComp):
+      # Strangely, ast.ListComp returns the col_offset of the first token
+      # after the '[' token which appears to be a bug. Workaround by
+      # explicitly finding the real start of the list comprehension.
+      line = node.lineno
+      col = node.col_offset
+      # loop over lines
+      while 1:
+        # Reverse the text to and regular expression search for whitespace
+        text = self._lines[line - 1]
+        reversed_preceding_text = text[:col][::-1]
+        # First find if a [ can be found with only whitespace between it and
+        # col.
+        m = find_open.match(reversed_preceding_text)
+        if m:
+          new_col_offset = col - m.start(1) - 1
+          return line, new_col_offset
+        else:
+          if (reversed_preceding_text == "" or
+              reversed_preceding_text.isspace()):
+            line = line - 1
+            prev_line = self._lines[line - 1]
+            # TODO(aselle):
+            # this is poor comment detection, but it is good enough for
+            # cases where the comment does not contain string literal starting/
+            # ending characters. If ast gave us start and end locations of the
+            # ast nodes rather than just start, we could use string literal
+            # node ranges to filter out spurious #'s that appear in string
+            # literals.
+            comment_start = prev_line.find("#")
+            if comment_start == -1:
+              col = len(prev_line) - 1
+            elif find_string_chars.search(prev_line[comment_start:]) is None:
+              col = comment_start
+            else:
+              return None, None
+          else:
+            return None, None
+    # Most other nodes return proper locations (with notably does not), but
+    # it is not possible to use that in an argument.
+    return node.lineno, node.col_offset
+
+  def visit_Call(self, node):  # pylint: disable=invalid-name
+    """Handle visiting a call node in the AST.
+
+    Args:
+      node: Current Node
+    """
+
+    # Find a simple attribute name path e.g. "tf.foo.bar"
+    full_name = self._get_attribute_full_path(node.func)
+
+    # Make sure the func is marked as being part of a call
+    node.func.is_function_for_call = True
+
+    if full_name:
+      # Call special handlers
+      function_handles = self._api_change_spec.function_handle
+      if full_name in function_handles:
+        function_handles[full_name](self._file_edit, node)
+
+      # Examine any non-keyword argument and make it into a keyword argument
+      # if reordering required.
+      function_reorders = self._api_change_spec.function_reorders
+      function_keyword_renames = (
+          self._api_change_spec.function_keyword_renames)
+
+      if full_name in function_reorders:
+        reordered = function_reorders[full_name]
+        for idx, arg in enumerate(node.args):
+          lineno, col_offset = self._find_true_position(arg)
+          if lineno is None or col_offset is None:
+            self._file_edit.add(
+                "Failed to add keyword %r to reordered function %r" %
+                (reordered[idx], full_name),
+                arg.lineno,
+                arg.col_offset,
+                "",
+                "",
+                error="A necessary keyword argument failed to be inserted.")
+          else:
+            keyword_arg = reordered[idx]
+            if (full_name in function_keyword_renames and
+                keyword_arg in function_keyword_renames[full_name]):
+              keyword_arg = function_keyword_renames[full_name][keyword_arg]
+            self._file_edit.add("Added keyword %r to reordered function %r" %
+                                (reordered[idx], full_name), lineno, col_offset,
+                                "", keyword_arg + "=")
+
+      # Examine each keyword argument and convert it to the final renamed form
+      renamed_keywords = ({} if full_name not in function_keyword_renames else
+                          function_keyword_renames[full_name])
+      for keyword in node.keywords:
+        argkey = keyword.arg
+        argval = keyword.value
+
+        if argkey in renamed_keywords:
+          argval_lineno, argval_col_offset = self._find_true_position(argval)
+          if argval_lineno is not None and argval_col_offset is not None:
+            # TODO(aselle): We should scan backward to find the start of the
+            # keyword key. Unfortunately ast does not give you the location of
+            # keyword keys, so we are forced to infer it from the keyword arg
+            # value.
+            key_start = argval_col_offset - len(argkey) - 1
+            key_end = key_start + len(argkey) + 1
+            if (self._lines[argval_lineno - 1][key_start:key_end] == argkey +
+                "="):
+              self._file_edit.add("Renamed keyword argument from %r to %r" %
+                                  (argkey,
+                                   renamed_keywords[argkey]), argval_lineno,
+                                  argval_col_offset - len(argkey) - 1,
+                                  argkey + "=", renamed_keywords[argkey] + "=")
+              continue
+          self._file_edit.add(
+              "Failed to rename keyword argument from %r to %r" %
+              (argkey, renamed_keywords[argkey]),
+              argval.lineno,
+              argval.col_offset - len(argkey) - 1,
+              "",
+              "",
+              error="Failed to find keyword lexographically. Fix manually.")
+
+    ast.NodeVisitor.generic_visit(self, node)
+
+  def visit_Attribute(self, node):  # pylint: disable=invalid-name
+    """Handle bare Attributes i.e. [tf.foo, tf.bar].
+
+    Args:
+      node: Node that is of type ast.Attribute
+    """
+    full_name = self._get_attribute_full_path(node)
+    if full_name:
+      self._rename_functions(node, full_name)
+    if full_name in self._api_change_spec.change_to_function:
+      if not hasattr(node, "is_function_for_call"):
+        new_text = full_name + "()"
+        self._file_edit.add("Changed %r to %r" % (full_name, new_text),
+                            node.lineno, node.col_offset, full_name, new_text)
+
+    ast.NodeVisitor.generic_visit(self, node)
+
+
+class ASTCodeUpgrader(object):
+  """Handles upgrading a set of Python files using a given API change spec."""
+
+  def __init__(self, api_change_spec):
+    if not isinstance(api_change_spec, APIChangeSpec):
+      raise TypeError("Must pass APIChangeSpec to ASTCodeUpgrader, got %s" %
+                      type(api_change_spec))
+    self._api_change_spec = api_change_spec
+
+  def process_file(self, in_filename, out_filename):
+    """Process the given python file for incompatible changes.
+
+    Args:
+      in_filename: filename to parse
+      out_filename: output file to write to
+    Returns:
+      A tuple representing number of files processed, log of actions, errors
+    """
+
+    # Write to a temporary file, just in case we are doing an implace modify.
+    with open(in_filename, "r") as in_file, \
+        tempfile.NamedTemporaryFile("w", delete=False) as temp_file:
+      ret = self.process_opened_file(in_filename, in_file, out_filename,
+                                     temp_file)
+
+    shutil.move(temp_file.name, out_filename)
+    return ret
+
+  # Broad exceptions are required here because ast throws whatever it wants.
+  # pylint: disable=broad-except
+  def process_opened_file(self, in_filename, in_file, out_filename, out_file):
+    """Process the given python file for incompatible changes.
+
+    This function is split out to facilitate StringIO testing from
+    tf_upgrade_test.py.
+
+    Args:
+      in_filename: filename to parse
+      in_file: opened file (or StringIO)
+      out_filename: output file to write to
+      out_file: opened file (or StringIO)
+    Returns:
+      A tuple representing number of files processed, log of actions, errors
+    """
+    process_errors = []
+    text = "-" * 80 + "\n"
+    text += "Processing file %r\n outputting to %r\n" % (in_filename,
+                                                         out_filename)
+    text += "-" * 80 + "\n\n"
+
+    parsed_ast = None
+    lines = in_file.readlines()
+    try:
+      parsed_ast = ast.parse("".join(lines))
+    except Exception:
+      text += "Failed to parse %r\n\n" % in_filename
+      text += traceback.format_exc()
+    if parsed_ast:
+      visitor = _ASTCallVisitor(in_filename, lines, self._api_change_spec)
+      visitor.visit(parsed_ast)
+      out_text, new_text, process_errors = visitor.process(lines)
+      text += new_text
+      if out_file:
+        out_file.write(out_text)
+    text += "\n"
+    return 1, text, process_errors
+
+  # pylint: enable=broad-except
+
+  def process_tree(self, root_directory, output_root_directory,
+                   copy_other_files):
+    """Processes upgrades on an entire tree of python files in place.
+
+    Note that only Python files. If you have custom code in other languages,
+    you will need to manually upgrade those.
+
+    Args:
+      root_directory: Directory to walk and process.
+      output_root_directory: Directory to use as base.
+      copy_other_files: Copy files that are not touched by this converter.
+
+    Returns:
+      A tuple of files processed, the report string ofr all files, and errors
+    """
+
+    # make sure output directory doesn't exist
+    if output_root_directory and os.path.exists(output_root_directory):
+      print("Output directory %r must not already exist." %
+            (output_root_directory))
+      sys.exit(1)
+
+    # make sure output directory does not overlap with root_directory
+    norm_root = os.path.split(os.path.normpath(root_directory))
+    norm_output = os.path.split(os.path.normpath(output_root_directory))
+    if norm_root == norm_output:
+      print("Output directory %r same as input directory %r" %
+            (root_directory, output_root_directory))
+      sys.exit(1)
+
+    # Collect list of files to process (we do this to correctly handle if the
+    # user puts the output directory in some sub directory of the input dir)
+    files_to_process = []
+    files_to_copy = []
+    for dir_name, _, file_list in os.walk(root_directory):
+      py_files = [f for f in file_list if f.endswith(".py")]
+      copy_files = [f for f in file_list if not f.endswith(".py")]
+      for filename in py_files:
+        fullpath = os.path.join(dir_name, filename)
+        fullpath_output = os.path.join(output_root_directory,
+                                       os.path.relpath(fullpath,
+                                                       root_directory))
+        files_to_process.append((fullpath, fullpath_output))
+      if copy_other_files:
+        for filename in copy_files:
+          fullpath = os.path.join(dir_name, filename)
+          fullpath_output = os.path.join(output_root_directory,
+                                         os.path.relpath(
+                                             fullpath, root_directory))
+          files_to_copy.append((fullpath, fullpath_output))
+
+    file_count = 0
+    tree_errors = []
+    report = ""
+    report += ("=" * 80) + "\n"
+    report += "Input tree: %r\n" % root_directory
+    report += ("=" * 80) + "\n"
+
+    for input_path, output_path in files_to_process:
+      output_directory = os.path.dirname(output_path)
+      if not os.path.isdir(output_directory):
+        os.makedirs(output_directory)
+      file_count += 1
+      _, l_report, l_errors = self.process_file(input_path, output_path)
+      tree_errors += l_errors
+      report += l_report
+    for input_path, output_path in files_to_copy:
+      output_directory = os.path.dirname(output_path)
+      if not os.path.isdir(output_directory):
+        os.makedirs(output_directory)
+      shutil.copy(input_path, output_path)
+    return file_count, report, tree_errors
+
+
+class TFAPIChangeSpec(APIChangeSpec):
   """List of maps that describe what changed in the API."""
 
   def __init__(self):
@@ -166,18 +646,17 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.concat": ["concat_dim", "values", "name"],
         "tf.svd": ["tensor", "compute_uv", "full_matrices", "name"],
         "tf.nn.softmax_cross_entropy_with_logits": [
-            "logits", "labels", "dim", "name"],
+            "logits", "labels", "dim", "name"
+        ],
         "tf.nn.sparse_softmax_cross_entropy_with_logits": [
-            "logits", "labels", "name"],
-        "tf.nn.sigmoid_cross_entropy_with_logits": [
-            "logits", "labels", "name"],
+            "logits", "labels", "name"
+        ],
+        "tf.nn.sigmoid_cross_entropy_with_logits": ["logits", "labels", "name"],
         "tf.op_scope": ["values", "name", "default_name"],
     }
 
     # Specially handled functions.
-    self.function_handle = {
-        "tf.reverse": self._reverse_handler
-    }
+    self.function_handle = {"tf.reverse": self._reverse_handler}
 
   @staticmethod
   def _reverse_handler(file_edit_recorder, node):
@@ -186,12 +665,13 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
     comment = ("ERROR: tf.reverse has had its argument semantics changed\n"
                "significantly the converter cannot detect this reliably, so you"
                "need to inspect this usage manually.\n")
-    file_edit_recorder.add(comment,
-                           node.lineno,
-                           node.col_offset,
-                           "tf.reverse",
-                           "tf.reverse",
-                           error="tf.reverse requires manual check.")
+    file_edit_recorder.add(
+        comment,
+        node.lineno,
+        node.col_offset,
+        "tf.reverse",
+        "tf.reverse",
+        error="tf.reverse requires manual check.")
 
 
 if __name__ == "__main__":
@@ -238,7 +718,7 @@ Simple usage:
       default="report.txt")
   args = parser.parse_args()
 
-  upgrade = ast_edits.ASTCodeUpgrader(TFAPIChangeSpec())
+  upgrade = ASTCodeUpgrader(TFAPIChangeSpec())
   report_text = None
   report_filename = args.report_filename
   files_processed = 0
diff --git a/tensorflow/tools/compatibility/tf_upgrade_test.py b/tensorflow/tools/compatibility/tf_upgrade_test.py
index ac838a2791fd9ce3244f344c495e9a97dcd513ca..3d02eacba6e7a91e6d3c88e8297306de9782f4bf 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_test.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_test.py
@@ -22,7 +22,6 @@ import tempfile
 import six
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test as test_lib
-from tensorflow.tools.compatibility import ast_edits
 from tensorflow.tools.compatibility import tf_upgrade
 
 
@@ -37,7 +36,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
   def _upgrade(self, old_file_text):
     in_file = six.StringIO(old_file_text)
     out_file = six.StringIO()
-    upgrader = ast_edits.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec())
+    upgrader = tf_upgrade.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec())
     count, report, errors = (
         upgrader.process_opened_file("test.py", in_file,
                                      "test_out.py", out_file))
@@ -115,7 +114,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     self.assertEqual(errors, ["test.py:1: tf.reverse requires manual check."])
 
   def testListComprehension(self):
-    def _test(input, output):
+    def _test(input, output):  # pylint: disable=redefined-builtin
       _, unused_report, errors, new_text = self._upgrade(input)
       self.assertEqual(new_text, output)
     _test("tf.concat(0,  \t[x for x in y])\n",
@@ -140,7 +139,7 @@ class TestUpgradeFiles(test_util.TensorFlowTestCase):
     upgraded = "tf.multiply(a, b)\n"
     temp_file.write(original)
     temp_file.close()
-    upgrader = ast_edits.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec())
+    upgrader = tf_upgrade.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec())
     upgrader.process_file(temp_file.name, temp_file.name)
     self.assertAllEqual(open(temp_file.name).read(), upgraded)
     os.unlink(temp_file.name)
diff --git a/tensorflow/tools/dist_test/README.md b/tensorflow/tools/dist_test/README.md
index 39c040e051ec48ae6d1a1f6eb343a143930ba4f3..c1b1f79bbd4b657768b9bbcab93efa3354774915 100644
--- a/tensorflow/tools/dist_test/README.md
+++ b/tensorflow/tools/dist_test/README.md
@@ -17,7 +17,7 @@ cesnsu model:
 
     ./local_test.sh --model_name CENSUS_WIDENDEEP
 
-**2) Launch a remote k8s cluster on Google Container Engine (GKE) and run the
+**2) Launch a remote k8s cluster on Google Kubernetes Engine (GKE) and run the
 test suite on it**
 
 For example:
diff --git a/tensorflow/tools/dist_test/build_server.sh b/tensorflow/tools/dist_test/build_server.sh
index 878fabd248f3c1dd5cb79983df5220ebf5893026..225c0347416ec8c8fef855946d18e838bd767690 100755
--- a/tensorflow/tools/dist_test/build_server.sh
+++ b/tensorflow/tools/dist_test/build_server.sh
@@ -16,14 +16,15 @@
 #
 # Builds the test server for distributed (GRPC) TensorFlow
 #
-# Usage: build_server.sh <docker_image_name> <whl_url> [--test]
+# Usage: build_server.sh <docker_image_name> <whl_file_location> [--test]
 #
 # Arguments:
 #   docker_image_name: Name of the docker image to build.
 #     E.g.: tensorflow/tf_grpc_test_server:0.11.0rc1
 #
-#   whl_url: URL from which the TensorFlow whl file will be downloaded.
+#   whl_file_location: URL from which the TensorFlow whl file will be downloaded.
 #     E.g.: https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
+#     E.g.: /path/to/folder/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
 #
 # The optional flag --test lets the script to use the Dockerfile for the
 # testing GRPC server. Without the flag, the script will build the non-test
@@ -41,11 +42,11 @@ die() {
 
 # Check arguments
 if [[ $# -lt 2 ]]; then
-  die "Usage: $0 <docker_image_name> <whl_url> [--test]"
+  die "Usage: $0 <docker_image_name> <whl_location> [--test]"
 fi
 
 DOCKER_IMG_NAME=$1
-WHL_URL=$2
+WHL_FILE_LOCATION=$2
 shift 2
 
 # Current script directory
@@ -53,7 +54,7 @@ DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
 BUILD_DIR=$(mktemp -d)
 echo ""
-echo "Using whl file URL: ${WHL_URL}"
+echo "Using whl file URL: ${WHL_FILE_LOCATION}"
 echo "Building in temporary directory: ${BUILD_DIR}"
 
 cp -r ${DIR}/* "${BUILD_DIR}"/ || \
@@ -65,9 +66,15 @@ if [[ $1 == "--test" ]]; then
 fi
 echo "Using Docker file: ${DOCKER_FILE}"
 
+if [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then
+    # Download whl file into the build context directory.
+    wget -P "${BUILD_DIR}" "${WHL_FILE_LOCATION}" || \
+        die "Failed to download tensorflow whl file from URL: ${WHL_FILE_LOCATION}"
+else
+    cp "${WHL_FILE_LOCATION}" "${BUILD_DIR}"
+fi
+
 # Download whl file into the build context directory.
-wget -P "${BUILD_DIR}" ${WHL_URL} || \
-    die "Failed to download tensorflow whl file from URL: ${WHL_URL}"
 
 if [[ ! -f "${DOCKER_FILE}" ]]; then
   die "ERROR: Unable to find dockerfile: ${DOCKER_FILE}"
diff --git a/tensorflow/tools/dist_test/local_test.sh b/tensorflow/tools/dist_test/local_test.sh
index 7d7f92d246e1ca0b519ac3bf30fde673621ff755..435f9d0dc9c55a3dcfc45e7e46f279b4679a9086 100755
--- a/tensorflow/tools/dist_test/local_test.sh
+++ b/tensorflow/tools/dist_test/local_test.sh
@@ -24,19 +24,20 @@
 # 3) Call a script to launch a k8s TensorFlow GRPC cluster inside the container
 #    and run the distributed test suite.
 #
-# Usage: local_test.sh <whl_url>
+# Usage: local_test.sh <whl_file_location>
 #                      [--leave_container_running]
 #                      [--model_name <MODEL_NAME>]
 #                      [--num_workers <NUM_WORKERS>]
 #                      [--num_parameter_servers <NUM_PARAMETER_SERVERS>]
 #                      [--sync_replicas]
 #
-# E.g., local_test.sh <whl_url> --model_name CENSUS_WIDENDEEP
-#       local_test.sh <whl_url> --num_workers 3 --num_parameter_servers 3
+# E.g., local_test.sh <whl_file_location> --model_name CENSUS_WIDENDEEP
+#       local_test.sh <whl_file_location> --num_workers 3 --num_parameter_servers 3
 #
 # Arguments:
-# <whl_url>
-#   Specify custom TensorFlow whl file URL to install in the test Docker image.
+# whl_file_location: URL from which the TensorFlow whl file will be acquired.
+#   E.g.: https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
+#   E.g.: /path/to/folder/tensorflow-0.11.0rc1-cp27-none-linux_x86_64.whl
 #
 # --leave_container_running:  Do not stop the docker-in-docker container after
 #                             the termination of the tests, e.g., for debugging
@@ -81,9 +82,9 @@ NUM_WORKERS=2
 NUM_PARAMETER_SERVERS=2
 SYNC_REPLICAS_FLAG=""
 
-WHL_URL=${1}
-if [[ -z "${WHL_URL}" ]]; then
-  die "whl file URL is not specified"
+WHL_FILE_LOCATION=${1}
+if [[ -z "${WHL_FILE_LOCATION}" ]]; then
+  die "whl file location is not specified"
 fi
 
 while true; do
@@ -98,8 +99,8 @@ while true; do
     NUM_PARAMETER_SERVERS=$2
   elif [[ $1 == "--sync_replicas" ]]; then
     SYNC_REPLICAS_FLAG="--sync_replicas"
-  elif [[ $1 == "--whl_url" ]]; then
-    WHL_URL=$2
+  elif [[ $1 == "--WHL_FILE_LOCATION" ]]; then
+    WHL_FILE_LOCATION=$2
   fi
 
   shift
@@ -130,15 +131,19 @@ fi
 # Create docker build context directory.
 BUILD_DIR=$(mktemp -d)
 echo ""
-echo "Using whl file URL: ${WHL_URL}"
+echo "Using whl file location: ${WHL_FILE_LOCATION}"
 echo "Building in temporary directory: ${BUILD_DIR}"
 
 cp -r ${DIR}/* "${BUILD_DIR}"/ || \
   die "Failed to copy files to ${BUILD_DIR}"
 
-# Download whl file into the build context directory.
-wget -P "${BUILD_DIR}" ${WHL_URL} || \
-  die "Failed to download tensorflow whl file from URL: ${WHL_URL}"
+if [[ $WHL_FILE_LOCATION =~ 'http://' || $WHL_FILE_LOCATION =~ 'https://' ]]; then
+    # Download whl file into the build context directory.
+    wget -P "${BUILD_DIR}" "${WHL_FILE_LOCATION}" || \
+        die "Failed to download tensorflow whl file from URL: ${WHL_FILE_LOCATION}"
+else
+    cp "${WHL_FILE_LOCATION}" "${BUILD_DIR}"
+fi
 
 # Build docker image for test.
 docker build ${NO_CACHE_FLAG} -t ${DOCKER_IMG_NAME} \
diff --git a/tensorflow/tools/dist_test/python/census_widendeep.py b/tensorflow/tools/dist_test/python/census_widendeep.py
index 6f578d6f673ccfe013a5f39472922e221d2bf2bb..8feb5386e9881596c20fba9e537a0439c8187ac4 100644
--- a/tensorflow/tools/dist_test/python/census_widendeep.py
+++ b/tensorflow/tools/dist_test/python/census_widendeep.py
@@ -263,8 +263,7 @@ if __name__ == "__main__":
       "--data_dir",
       type=str,
       default="/tmp/census-data",
-      help="Directory for storing the census data"
-  )
+      help="Directory for storing the census data")
   parser.add_argument(
       "--model_dir",
       type=str,
diff --git a/tensorflow/tools/dist_test/python/mnist_replica.py b/tensorflow/tools/dist_test/python/mnist_replica.py
index e40ecb43f9a00bee7309895969ff65e48b95b4e9..a2d12442c44553a287637029843021b7541fa3fa 100644
--- a/tensorflow/tools/dist_test/python/mnist_replica.py
+++ b/tensorflow/tools/dist_test/python/mnist_replica.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Distributed MNIST training and validation, with model replicas.
 
 A simple softmax model with one hidden layer is defined. The parameters
@@ -32,7 +31,6 @@ perform forward computation and gradient calculation in parallel, which
 should lead to increased training speed for the simple model.
 """
 
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -45,7 +43,6 @@ import time
 import tensorflow as tf
 from tensorflow.examples.tutorials.mnist import input_data
 
-
 flags = tf.app.flags
 flags.DEFINE_string("data_dir", "/tmp/mnist-data",
                     "Directory for storing mnist data")
@@ -56,8 +53,7 @@ flags.DEFINE_integer("task_index", None,
                      "Worker task index, should be >= 0. task_index=0 is "
                      "the master worker task the performs the variable "
                      "initialization ")
-flags.DEFINE_integer("num_gpus", 1,
-                     "Total number of gpus for each machine."
+flags.DEFINE_integer("num_gpus", 1, "Total number of gpus for each machine."
                      "If you don't use GPU, please set it to '0'")
 flags.DEFINE_integer("replicas_to_aggregate", None,
                      "Number of replicas to aggregate before parameter update"
@@ -69,24 +65,24 @@ flags.DEFINE_integer("train_steps", 200,
                      "Number of (global) training steps to perform")
 flags.DEFINE_integer("batch_size", 100, "Training batch size")
 flags.DEFINE_float("learning_rate", 0.01, "Learning rate")
-flags.DEFINE_boolean("sync_replicas", False,
-                     "Use the sync_replicas (synchronized replicas) mode, "
-                     "wherein the parameter updates from workers are aggregated "
-                     "before applied to avoid stale gradients")
+flags.DEFINE_boolean(
+    "sync_replicas", False,
+    "Use the sync_replicas (synchronized replicas) mode, "
+    "wherein the parameter updates from workers are aggregated "
+    "before applied to avoid stale gradients")
 flags.DEFINE_boolean(
     "existing_servers", False, "Whether servers already exists. If True, "
     "will use the worker hosts via their GRPC URLs (one client process "
     "per worker host). Otherwise, will create an in-process TensorFlow "
     "server.")
-flags.DEFINE_string("ps_hosts","localhost:2222",
+flags.DEFINE_string("ps_hosts", "localhost:2222",
                     "Comma-separated list of hostname:port pairs")
 flags.DEFINE_string("worker_hosts", "localhost:2223,localhost:2224",
                     "Comma-separated list of hostname:port pairs")
-flags.DEFINE_string("job_name", None,"job name: worker or ps")
+flags.DEFINE_string("job_name", None, "job name: worker or ps")
 
 FLAGS = flags.FLAGS
 
-
 IMAGE_PIXELS = 28
 
 
@@ -97,7 +93,7 @@ def main(unused_argv):
 
   if FLAGS.job_name is None or FLAGS.job_name == "":
     raise ValueError("Must specify an explicit `job_name`")
-  if FLAGS.task_index is None or FLAGS.task_index =="":
+  if FLAGS.task_index is None or FLAGS.task_index == "":
     raise ValueError("Must specify an explicit `task_index`")
 
   print("job name = %s" % FLAGS.job_name)
@@ -110,9 +106,7 @@ def main(unused_argv):
   # Get the number of workers.
   num_workers = len(worker_spec)
 
-  cluster = tf.train.ClusterSpec({
-      "ps": ps_spec,
-      "worker": worker_spec})
+  cluster = tf.train.ClusterSpec({"ps": ps_spec, "worker": worker_spec})
 
   if not FLAGS.existing_servers:
     # Not using existing servers. Create an in-process server.
@@ -217,7 +211,8 @@ def main(unused_argv):
     sess_config = tf.ConfigProto(
         allow_soft_placement=True,
         log_device_placement=False,
-        device_filters=["/job:ps", "/job:worker/task:%d" % FLAGS.task_index])
+        device_filters=["/job:ps",
+                        "/job:worker/task:%d" % FLAGS.task_index])
 
     # The chief worker (task_index==0) session will prepare the session,
     # while the remaining workers will wait for the preparation to complete.
@@ -231,8 +226,7 @@ def main(unused_argv):
       server_grpc_url = "grpc://" + worker_spec[FLAGS.task_index]
       print("Using existing server at: %s" % server_grpc_url)
 
-      sess = sv.prepare_or_wait_for_session(server_grpc_url,
-                                            config=sess_config)
+      sess = sv.prepare_or_wait_for_session(server_grpc_url, config=sess_config)
     else:
       sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
 
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 3525c7524f3bd844be5284d2a076eb78d1bb1a02..d16761c3675942838fd2be0ea6e0b7463a3bf249 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -57,7 +57,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc
 RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
     >>/etc/bazel.bazelrc
 # Install the most recent bazel release.
-ENV BAZEL_VERSION 0.5.4
+ENV BAZEL_VERSION 0.8.0
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
@@ -69,11 +69,8 @@ RUN mkdir /bazel && \
     rm -f /bazel/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh
 
 # Download and build TensorFlow.
-
-RUN git clone https://github.com/tensorflow/tensorflow.git && \
-    cd tensorflow && \
-    git checkout r1.4
 WORKDIR /tensorflow
+RUN git clone --branch=r1.6 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # TODO(craigcitro): Don't install the pip package, since it makes it
 # more difficult to experiment with local changes. Instead, just add
diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
index 8180e5e7fb65e1eff693265ed388496b356563dd..3690e7dfe57a4682276a90b10cb84c9a329b3f5e 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
@@ -3,7 +3,7 @@ FROM tensorflow/tensorflow:latest-devel
 LABEL maintainer="Clayne Robison<clayne.b.robison@intel.com>"
 
 # These arguments are parameterized. Use --build-args to override.
-ARG TF_BRANCH=r1.4
+ARG TF_BRANCH=r1.6
 ARG WHL_DIR=/whl
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -54,7 +54,7 @@ RUN ./configure
 RUN LD_LIBRARY_PATH=${LD_LIBRARY_PATH} \
     bazel build --config=mkl \
                 --config="opt" \
-                --copt="-march=native" \
+                --copt="-march=broadwell" \
                 --copt="-O3" \
                 //tensorflow/tools/pip_package:build_pip_package && \
     mkdir ${WHL_DIR} && \
@@ -81,5 +81,3 @@ RUN echo '[ ! -z "$TERM" -a -r /etc/motd ] && cat /etc/issue && cat /etc/motd' \
 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||\n\
 \n "\
 	> /etc/motd
-
-CMD ["/bin/bash"]
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 041f45971bca256efc0668b03f5b5effde06e2c2..4ef37881bc91aaa58bab031c69b4a96c2a9d8ec1 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -1,11 +1,20 @@
-FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04
+FROM nvidia/cuda:9.0-base-ubuntu16.04
 
 LABEL maintainer="Craig Citro <craigcitro@google.com>"
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
+        cuda-command-line-tools-9-0 \
+        cuda-cublas-dev-9-0 \
+        cuda-cudart-dev-9-0 \
+        cuda-cufft-dev-9-0 \
+        cuda-curand-dev-9-0 \
+        cuda-cusolver-dev-9-0 \
+        cuda-cusparse-dev-9-0 \
         curl \
         git \
+        libcudnn7=7.0.5.15-1+cuda9.0 \
+        libcudnn7-dev=7.0.5.15-1+cuda9.0 \
         libcurl3-dev \
         libfreetype6-dev \
         libpng12-dev \
@@ -17,12 +26,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         unzip \
         zip \
         zlib1g-dev \
-        openjdk-8-jdk \
-        openjdk-8-jre-headless \
         wget \
         && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
+    rm -rf /var/lib/apt/lists/* && \
+    find /usr/local/cuda-9.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
 
 RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
     python get-pip.py && \
@@ -58,7 +66,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc
 RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
     >>/etc/bazel.bazelrc
 # Install the most recent bazel release.
-ENV BAZEL_VERSION 0.5.4
+ENV BAZEL_VERSION 0.8.0
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
@@ -70,18 +78,16 @@ RUN mkdir /bazel && \
     rm -f /bazel/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh
 
 # Download and build TensorFlow.
-
-RUN git clone https://github.com/tensorflow/tensorflow.git && \
-    cd tensorflow && \
-    git checkout r1.4
 WORKDIR /tensorflow
+RUN git clone --branch=r1.6 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
 ENV TF_NEED_CUDA 1
 ENV TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2,6.0,6.1
-
+ENV TF_CUDA_VERSION=9.0
+ENV TF_CUDNN_VERSION=7
 
 RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
     LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7 b/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
deleted file mode 100644
index 3bedc8cf3462aabf25f55706b3483907c5d5b467..0000000000000000000000000000000000000000
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
+++ /dev/null
@@ -1,115 +0,0 @@
-FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
-
-LABEL maintainer="Gunhan Gulsoy <gunan@google.com>"
-
-# It is possible to override these for releases.
-ARG TF_BRANCH=master
-ARG BAZEL_VERSION=0.5.4
-ARG TF_AVAILABLE_CPUS=32
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        curl \
-        git \
-        golang \
-        libcurl3-dev \
-        libfreetype6-dev \
-        libpng12-dev \
-        libzmq3-dev \
-        pkg-config \
-        python-dev \
-        python-pip \
-        rsync \
-        software-properties-common \
-        unzip \
-        zip \
-        zlib1g-dev \
-        openjdk-8-jdk \
-        openjdk-8-jre-headless \
-        wget \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN pip --no-cache-dir install --upgrade \
-        pip setuptools
-
-RUN pip --no-cache-dir install \
-        ipykernel \
-        jupyter \
-        matplotlib \
-        numpy \
-        scipy \
-        sklearn \
-        pandas \
-        wheel \
-        && \
-    python -m ipykernel.kernelspec
-
-# Set up our notebook config.
-COPY jupyter_notebook_config.py /root/.jupyter/
-
-# Jupyter has issues with being run directly:
-#   https://github.com/ipython/ipython/issues/7062
-# We just add a little wrapper script.
-COPY run_jupyter.sh /
-
-# Set up Bazel.
-
-# Running bazel inside a `docker build` command causes trouble, cf:
-#   https://github.com/bazelbuild/bazel/issues/134
-# The easiest solution is to set up a bazelrc file forcing --batch.
-RUN echo "startup --batch" >>/etc/bazel.bazelrc
-# Similarly, we need to workaround sandboxing issues:
-#   https://github.com/bazelbuild/bazel/issues/418
-RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
-    >>/etc/bazel.bazelrc
-WORKDIR /
-RUN mkdir /bazel && \
-    cd /bazel && \
-    wget --quiet https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
-    wget --quiet https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE && \
-    chmod +x bazel-*.sh && \
-    ./bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
-    rm -f /bazel/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh
-
-# Download and build TensorFlow.
-WORKDIR /
-RUN git clone https://github.com/tensorflow/tensorflow.git && \
-    cd tensorflow && \
-    git checkout ${TF_BRANCH}
-WORKDIR /tensorflow
-
-# Configure the build for our CUDA configuration.
-ENV CI_BUILD_PYTHON=python \
-    LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:${LD_LIBRARY_PATH} \
-    CUDNN_INSTALL_PATH=/usr/lib/x86_64-linux-gnu \
-    PYTHON_BIN_PATH=/usr/bin/python \
-    PYTHON_LIB_PATH=/usr/local/lib/python2.7/dist-packages \
-    TF_NEED_CUDA=1 \
-    TF_CUDA_VERSION=9.0 \
-    TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2,6.0,6.1,7.0 \
-    TF_CUDNN_VERSION=7
-RUN ./configure
-
-# Build and Install TensorFlow.
-RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
-    LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} \
-    bazel build -c opt \
-                --config=cuda \
-                --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
-                --jobs=${TF_AVAILABLE_CPUS} \
-                tensorflow/tools/pip_package:build_pip_package && \
-    mkdir /pip_pkg && \
-    bazel-bin/tensorflow/tools/pip_package/build_pip_package /pip_pkg && \
-    pip --no-cache-dir install --upgrade /pip_pkg/tensorflow-*.whl && \
-    rm -rf /pip_pkg && \
-    rm -rf /root/.cache
-# Clean up pip wheel and Bazel cache when done.
-
-WORKDIR /root
-
-# TensorBoard
-EXPOSE 6006
-# IPython
-EXPOSE 8888
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index e212d10290a93261e88cf9464076e5714e16ac43..b6682cd68163ec870ed815b45ac4fdd9233f88c6 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:8.0-cudnn6-runtime-ubuntu16.04
+FROM nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04
 
 LABEL maintainer="Craig Citro <craigcitro@google.com>"
 
diff --git a/tensorflow/tools/docker/jupyter_notebook_config.py b/tensorflow/tools/docker/jupyter_notebook_config.py
index 0acbf6fcee58b3eb14794c0f3bb8d2f6ae6e5910..05dcefb099a92683e2cd4700fff54c89c018baa6 100644
--- a/tensorflow/tools/docker/jupyter_notebook_config.py
+++ b/tensorflow/tools/docker/jupyter_notebook_config.py
@@ -15,6 +15,7 @@
 import os
 from IPython.lib import passwd
 
+c = c  # pylint:disable=undefined-variable
 c.NotebookApp.ip = '*'
 c.NotebookApp.port = int(os.getenv('PORT', 8888))
 c.NotebookApp.open_browser = False
diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index 80a07b9b3ba7fb278b01862880893aa0a2693a28..b4fba5b8f5e19c2fbb8c7261d8cf293757df503c 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -34,6 +34,11 @@
 #     If set to a non-empty string, will use it as the URL from which the
 #     pip wheel file will be downloaded (instead of building the pip locally).
 #
+#   TF_DOCKER_BUILD_CENTRAL_PIP_IS_LOCAL
+#     (Optional)
+#     If set to a non-empty string, we will treat TF_DOCKER_BUILD_CENTRAL_PIP
+#     as a path rather than a url.
+#
 #   TF_DOCKER_BUILD_IMAGE_NAME:
 #     (Optional)
 #     If set to any non-empty value, will use it as the image of the
@@ -234,6 +239,32 @@ if [[ "${TF_DOCKER_BUILD_IS_DEVEL}" == "no" ]]; then
 "COPY ${PIP_WHL} /\n"\
 "RUN pip --no-cache-dir install /${PIP_WHL}" "${ORIG_DOCKERFILE}" \
     > "${DOCKERFILE}"
+
+  # Build from a local whl file path rather than an URL
+  elif [[ ! -z "${TF_DOCKER_BUILD_CENTRAL_PIP_IS_LOCAL}" ]]; then
+    PIP_WHL="${TF_DOCKER_BUILD_CENTRAL_PIP}"
+    if [[ -z "${PIP_WHL}" ]]; then
+      die "ERROR: Cannot locate the specified pip whl file"
+    fi
+    echo "Specified PIP whl file is at: ${PIP_WHL}"
+
+    # Copy the pip file to tmp directory
+    cp "${PIP_WHL}" "${TMP_DIR}/" || \
+        die "ERROR: Failed to copy wheel file: ${PIP_WHL}"
+
+    # Use string replacement to put the correct file name into the Dockerfile
+    PIP_WHL=$(basename "${PIP_WHL}")
+
+    # Modify the non-devel Dockerfile to point to the correct pip whl file
+    # location
+    sed -e "/# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #/,"\
+"/# --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #/c"\
+"COPY ${PIP_WHL} /\n"\
+"RUN pip --no-cache-dir install /${PIP_WHL}" "${ORIG_DOCKERFILE}" \
+    > "${DOCKERFILE}"
+    echo "Using local pip wheel from: ${TF_DOCKER_BUILD_CENTRAL_PIP}"
+    echo
+
   else
     echo "Downloading pip wheel from: ${TF_DOCKER_BUILD_CENTRAL_PIP}"
     echo
@@ -265,7 +296,7 @@ else
   DOCKERFILE="${TMP_DIR}/Dockerfile"
 
   # Modify the devel Dockerfile to specify the git branch
-  sed -r "s/([\s]*git checkout )(.*)/\1${TF_DOCKER_BUILD_DEVEL_BRANCH}/g" \
+  sed "s/^RUN git clone --branch=.* --depth=1/RUN git clone --branch=${TF_DOCKER_BUILD_DEVEL_BRANCH} --depth=1/" \
       "${ORIG_DOCKERFILE}" > "${DOCKERFILE}"
 
   # Modify python/pip version if necessary.
@@ -408,14 +439,13 @@ fi
 # Optional: set TF_DOCKER_BUILD_PUSH_WITH_CREDENTIALS to push image
 if [[ ! -z "${TF_DOCKER_BUILD_PUSH_WITH_CREDENTIALS}" ]]; then
 
-  docker login --username "${TF_DOCKER_USERNAME}" \
-  --email "${TF_DOCKER_EMAIL}" \
-  --password "${TF_DOCKER_PASSWORD}"
+  docker login -u "${TF_DOCKER_USERNAME}" \
+  -p "${TF_DOCKER_PASSWORD}"
 
   if [[ $? != "0" ]]; then
     die "FAIL: Unable to login. Invalid credentials."
   fi
-  docker push $1
+  docker push "${FINAL_IMG}"
   if [[ $? == "0" ]]; then
     docker logout
     echo "Successfully pushed Docker image ${FINAL_IMG}"
diff --git a/tensorflow/tools/docs/generate_1_0.py b/tensorflow/tools/docs/generate_1_0.py
index cdc03fdcacf44f7be49e739962b63ba84cf94896..f4384e0ced77718c80d4d146a2d72072588a0541 100644
--- a/tensorflow/tools/docs/generate_1_0.py
+++ b/tensorflow/tools/docs/generate_1_0.py
@@ -53,7 +53,6 @@ if __name__ == '__main__':
           'factorization',
           'grid_rnn',
           'labeled_tensor',
-          'ndlstm',
           'quantization',
           'session_bundle',
           'slim',
diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py
index c0cde1d3bdd9023479a19112df36d3d88411da67..34dd419f15676babfa9a36c2c0960b01248b6f69 100644
--- a/tensorflow/tools/docs/generate_lib.py
+++ b/tensorflow/tools/docs/generate_lib.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
+import fnmatch
 import os
 import sys
 
@@ -198,12 +199,12 @@ def add_dict_to_dict(add_from, add_to):
       add_to[key] = add_from[key]
 
 
-# Exclude some libaries in contrib from the documentation altogether.
+# Exclude some libraries in contrib from the documentation altogether.
 def _get_default_private_map():
   return {'tf.test': ['mock']}
 
 
-# Exclude members of some libaries.
+# Exclude members of some libraries.
 def _get_default_do_not_descend_map():
   # TODO(wicke): Shrink this list once the modules get sealed.
   return {
@@ -214,7 +215,6 @@ def _get_default_do_not_descend_map():
           # Block contrib.keras to de-clutter the docs
           'keras',
           'labeled_tensor',
-          'ndlstm',
           'quantization',
           'session_bundle',
           'slim',
@@ -384,10 +384,26 @@ class _UpdateTags(py_guide_parser.PyGuideParser):
 EXCLUDED = set(['__init__.py', 'OWNERS', 'README.txt'])
 
 
-def _other_docs(src_dir, output_dir, reference_resolver):
-  """Convert all the files in `src_dir` and write results to `output_dir`."""
-  header = '<!-- DO NOT EDIT! Automatically generated file. -->\n'
+def _other_docs(src_dir, output_dir, reference_resolver, file_pattern='*.md'):
+  """Fix @{} references in all files under `src_dir` matching `file_pattern`.
 
+  A matching directory structure, with the modified files is
+  written to `output_dir`.
+
+  `{"__init__.py","OWNERS","README.txt"}` are skipped.
+
+  Files not matching `file_pattern` (using `fnmatch`) are copied with no change.
+
+  Also, files in the `api_guides/python` directory get explicit ids set on all
+  heading-2s to ensure back-links work.
+
+  Args:
+    src_dir: The directory to convert files from.
+    output_dir: The root directory to write the resulting files to.
+    reference_resolver: A `parser.ReferenceResolver` to make the replacements.
+    file_pattern: Only replace references in files matching file_patters,
+      using fnmatch. Non-matching files are copied unchanged.
+  """
   # Iterate through all the source files and process them.
   tag_updater = _UpdateTags()
   for dirpath, _, filenames in os.walk(src_dir):
@@ -415,21 +431,21 @@ def _other_docs(src_dir, output_dir, reference_resolver):
 
       suffix = os.path.relpath(path=full_in_path, start=src_dir)
       full_out_path = os.path.join(output_dir, suffix)
-      if not base_name.endswith('.md'):
-        print('Copying non-md file %s...' % suffix)
+      if not fnmatch.fnmatch(base_name, file_pattern):
+        print('Copying un-matched file %s...' % suffix)
         open(full_out_path, 'w').write(open(full_in_path).read())
         continue
       if dirpath.endswith('/api_guides/python'):
         print('Processing Python guide %s...' % base_name)
-        md_string = tag_updater.process(full_in_path)
+        content = tag_updater.process(full_in_path)
       else:
         print('Processing doc %s...' % suffix)
-        md_string = open(full_in_path).read()
+        content = open(full_in_path).read()
 
-      output = reference_resolver.replace_references(md_string,
-                                                     relative_path_to_root)
+      content = reference_resolver.replace_references(content,
+                                                      relative_path_to_root)
       with open(full_out_path, 'w') as f:
-        f.write(header + output)
+        f.write(content)
 
   print('Done.')
 
diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py
index 3db164c2b5b78dbcb3c408ce89c067d33c2a2af4..e758229535e7b10994a39cbafb37e116fd2a465c 100644
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@@ -111,8 +111,8 @@ SYMBOL_REFERENCE_RE = re.compile(
     r"""
     # Start with a literal "@{".
     @\{
-      # Group at least 1 symbol: not "}" or "\n".
-      ([^}\n]+)
+      # Group at least 1 symbol, not "}".
+      ([^}]+)
     # Followed by a closing "}"
     \}
     """,
diff --git a/tensorflow/tools/docs/parser_test.py b/tensorflow/tools/docs/parser_test.py
index 8a0e9af5216c881326449b3e85b94c0be331fa37..fca5436ca5fadd1fb5da07d7523bb51c871164b5 100644
--- a/tensorflow/tools/docs/parser_test.py
+++ b/tensorflow/tools/docs/parser_test.py
@@ -76,8 +76,9 @@ class ParserTest(googletest.TestCase):
         pass
 
     string = (
-        'A @{tf.reference}, another @{tf.reference}, a member '
-        '@{tf.reference.foo}, and a @{tf.third$link `text` with `code` in it}.')
+        'A @{tf.reference}, another @{tf.reference$with\nnewline}, a member '
+        '@{tf.reference.foo}, and a @{tf.third$link `text` with `code` in '
+        'it}.')
     duplicate_of = {'tf.third': 'tf.fourth'}
     index = {'tf.reference': HasOneMember,
              'tf.reference.foo': HasOneMember.foo,
@@ -93,7 +94,7 @@ class ParserTest(googletest.TestCase):
     self.assertEqual('A <a href="../../tf/reference.md">'
                      '<code>tf.reference</code></a>, '
                      'another <a href="../../tf/reference.md">'
-                     '<code>tf.reference</code></a>, '
+                     'with\nnewline</a>, '
                      'a member <a href="../../tf/reference.md#foo">'
                      '<code>tf.reference.foo</code></a>, '
                      'and a <a href="../../tf/fourth.md">link '
diff --git a/tensorflow/tools/docs/pretty_docs.py b/tensorflow/tools/docs/pretty_docs.py
index c033c16ae98c4bcaa4c0338e539324b3a2ae5552..543b5fa6fefcd8e8dca99ad7eac7cca76781ccd3 100644
--- a/tensorflow/tools/docs/pretty_docs.py
+++ b/tensorflow/tools/docs/pretty_docs.py
@@ -162,7 +162,7 @@ def _build_class_page(page_info):
       parts.append(h3.format(**method_info.__dict__))
 
       if method_info.signature is not None:
-        parts.append(_build_signature(method_info))
+        parts.append(_build_signature(method_info, use_full_name=False))
 
       parts.append(method_info.doc.docstring)
       parts.append(_build_function_details(method_info.doc.function_details))
@@ -259,14 +259,14 @@ def _build_module_page(page_info):
   return ''.join(parts)
 
 
-def _build_signature(obj_info):
+def _build_signature(obj_info, use_full_name=True):
   """Returns a md code block showing the function signature."""
   # Special case tf.range, since it has an optional first argument
   if obj_info.full_name == 'tf.range':
     return (
         '``` python\n'
-        "range(limit, delta=1, dtype=None, name='range')\n"
-        "range(start, limit, delta=1, dtype=None, name='range')\n"
+        "tf.range(limit, delta=1, dtype=None, name='range')\n"
+        "tf.range(start, limit, delta=1, dtype=None, name='range')\n"
         '```\n\n')
 
   parts = ['``` python']
@@ -281,7 +281,11 @@ def _build_signature(obj_info):
     sig = ',\n'.join('    %s' % sig_item for sig_item in obj_info.signature)
     sig = '\n'+sig+'\n'
 
-  parts.append(signature_template.format(name=obj_info.short_name, sig=sig))
+  if use_full_name:
+    obj_name = obj_info.full_name
+  else:
+    obj_name = obj_info.short_name
+  parts.append(signature_template.format(name=obj_name, sig=sig))
   parts.append('```\n\n')
 
   return '\n'.join(parts)
@@ -323,7 +327,7 @@ class _Metadata(object):
   """
 
   def __init__(self, name):
-    """Creata a Metadata builder.
+    """Create a Metadata builder.
 
     Args:
       name: The name of the page being described by the Metadata block.
diff --git a/tensorflow/tools/git/BUILD b/tensorflow/tools/git/BUILD
index f502c8dde07de6d9f480a1b9d8690fd8f03de264..942ceab85fc8d40d9d4b67537d95204503af8bbe 100644
--- a/tensorflow/tools/git/BUILD
+++ b/tensorflow/tools/git/BUILD
@@ -7,9 +7,7 @@ package(default_visibility = ["//tensorflow:internal"])
 licenses(["notice"])  # Apache 2.0
 
 exports_files(
-    glob(["gen/*"]) + [
-        "gen_git_source.py",
-    ],
+    ["gen_git_source.py"],
 )
 
 # -----------------------------------------------------------------------------
diff --git a/tensorflow/tools/git/gen/branch_ref b/tensorflow/tools/git/gen/branch_ref
deleted file mode 100644
index 8b137891791fe96927ad78e64b0aad7bded08bdc..0000000000000000000000000000000000000000
--- a/tensorflow/tools/git/gen/branch_ref
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/tensorflow/tools/git/gen/head b/tensorflow/tools/git/gen/head
deleted file mode 100644
index 8b137891791fe96927ad78e64b0aad7bded08bdc..0000000000000000000000000000000000000000
--- a/tensorflow/tools/git/gen/head
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/tensorflow/tools/git/gen/spec.json b/tensorflow/tools/git/gen/spec.json
deleted file mode 100644
index 176bbc21ccb9112d5c29f0351ec937c302a1383e..0000000000000000000000000000000000000000
--- a/tensorflow/tools/git/gen/spec.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-  "git": false
-}
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index 0307d2a0ebee820fee0867c35c5761f2f8607aea..3630dbd740e981971bdc9ff45b756b45095d437d 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -62,7 +62,7 @@ def parse_branch_ref(filename):
     raise RuntimeError("Git directory has unparseable HEAD")
 
 
-def configure(src_base_path, debug=False):
+def configure(src_base_path, gen_path, debug=False):
   """Configure `src_base_path` to embed git hashes if available."""
 
   # TODO(aselle): No files generated or symlinked here are deleted by
@@ -71,7 +71,6 @@ def configure(src_base_path, debug=False):
   # without running ./configure again.
 
   git_path = os.path.join(src_base_path, ".git")
-  gen_path = os.path.join(src_base_path, "tensorflow", "tools", "git", "gen")
 
   # Remove and recreate the path
   if os.path.exists(gen_path):
@@ -180,6 +179,13 @@ const int tf_cxx11_abi_flag() {
   return 0;
 #endif
 }
+const int tf_monolithic_build() {
+#ifdef TENSORFLOW_MONOLITHIC_BUILD
+  return 1;
+#else
+  return 0;
+#endif
+}
 """ % git_version
   open(filename, "w").write(contents)
 
@@ -253,6 +259,10 @@ parser.add_argument(
     "--configure", type=str,
     help="Path to configure as a git repo dependency tracking sentinel")
 
+parser.add_argument(
+    "--gen_root_path", type=str,
+    help="Root path to place generated git files (created by --configure).")
+
 parser.add_argument(
     "--generate",
     type=str,
@@ -267,7 +277,9 @@ parser.add_argument(
 args = parser.parse_args()
 
 if args.configure is not None:
-  configure(args.configure, debug=args.debug)
+  if args.gen_root_path is None:
+    raise RuntimeError("Must pass --gen_root_path arg when running --configure")
+  configure(args.configure, args.gen_root_path, debug=args.debug)
 elif args.generate is not None:
   generate(args.generate)
 elif args.raw_generate is not None:
diff --git a/tensorflow/tools/git/gen_git_source.sh b/tensorflow/tools/git/gen_git_source.sh
index 788f9e6e5730f9e4699011298d689bc26226fb65..db20bb00e84b47bd15244e70b925f59e62731deb 100755
--- a/tensorflow/tools/git/gen_git_source.sh
+++ b/tensorflow/tools/git/gen_git_source.sh
@@ -36,5 +36,12 @@ const int tf_cxx11_abi_flag() {
   return 0;
 #endif
 }
+const int tf_monolithic_build() {
+#ifdef TENSORFLOW_MONOLITHIC_BUILD
+  return 1;
+#else
+  return 0;
+#endif
+}
 EOF
 
diff --git a/tensorflow/tools/graph_transforms/BUILD b/tensorflow/tools/graph_transforms/BUILD
index 9216008600b0969ae95a985f54511a24f4fac3e7..8601b3d0f19e49fe1308f2d022ee13572351581e 100644
--- a/tensorflow/tools/graph_transforms/BUILD
+++ b/tensorflow/tools/graph_transforms/BUILD
@@ -99,22 +99,21 @@ cc_library(
         "freeze_requantization_ranges.cc",
         "fuse_convolutions.cc",
         "insert_logging.cc",
-        "remove_ema.cc",
         "obfuscate_names.cc",
+        "quantize_nodes.cc",
+        "quantize_weights.cc",
         "remove_attribute.cc",
         "remove_device.cc",
+        "remove_ema.cc",
         "remove_nodes.cc",
         "rename_attribute.cc",
         "rename_op.cc",
+        "round_weights.cc",
         "set_device.cc",
         "sort_by_execution_order.cc",
         "sparsify_gather.cc",
         "strip_unused_nodes.cc",
-    ] + if_not_windows([
-        "quantize_nodes.cc",
-        "quantize_weights.cc",
-        "round_weights.cc",
-    ]),
+    ],
     hdrs = [
         "fold_constants_lib.h",
     ],
@@ -128,6 +127,7 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensorflow",
@@ -315,3 +315,14 @@ tf_py_test(
     ],
     main = "python/transform_graph_test.py",
 )
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+)
diff --git a/tensorflow/tools/graph_transforms/file_utils.h b/tensorflow/tools/graph_transforms/file_utils.h
index 4737e95abcec3694d426e0c3c3a7112c2c5b6bd1..a3723f5cd383341ec206221e7591eca40aabd885 100644
--- a/tensorflow/tools/graph_transforms/file_utils.h
+++ b/tensorflow/tools/graph_transforms/file_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_GRAPH_TRANSFORMS_FILE_UTILS_H_
-#define THIRD_PARTY_TENSORFLOW_TOOLS_GRAPH_TRANSFORMS_FILE_UTILS_H_
+#ifndef TENSORFLOW_TOOLS_GRAPH_TRANSFORMS_FILE_UTILS_H_
+#define TENSORFLOW_TOOLS_GRAPH_TRANSFORMS_FILE_UTILS_H_
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -29,4 +29,4 @@ Status LoadTextOrBinaryGraphFile(const string& file_name, GraphDef* graph_def);
 }  // namespace graph_transforms
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_GRAPH_TRANSFORMS_FILE_UTILS_H_
+#endif  // TENSORFLOW_TOOLS_GRAPH_TRANSFORMS_FILE_UTILS_H_
diff --git a/tensorflow/tools/graph_transforms/quantize_nodes.cc b/tensorflow/tools/graph_transforms/quantize_nodes.cc
index 97e8f77616b85955229619107b443315bca17925..a022f5792676c62c52fd1197b0d8c436f7161a47 100644
--- a/tensorflow/tools/graph_transforms/quantize_nodes.cc
+++ b/tensorflow/tools/graph_transforms/quantize_nodes.cc
@@ -183,22 +183,6 @@ Status ExtractRangeFromParams(const TransformFuncContext& context,
   return Status::OK();
 }
 
-bool AreAttrsEqual(const NodeDef* current_node, const NodeDef* other_node) {
-  if (current_node->attr_size() != other_node->attr_size()) {
-    return false;
-  }
-  string current_serialized;
-  string other_serialized;
-  for (const auto& attr : other_node->attr()) {
-    auto iter = current_node->attr().find(attr.first);
-    if (iter == current_node->attr().end()) return false;
-    iter->second.SerializeToString(&current_serialized);
-    attr.second.SerializeToString(&other_serialized);
-    if (current_serialized != other_serialized) return false;
-  }
-  return true;
-}
-
 }  // namespace
 
 // Analyzes all the nodes in the graph to figure out which ones are duplicates
@@ -759,7 +743,7 @@ Status QuantizeNodes(const GraphDef& input_graph_def,
           NodeDef reshape_dims;
           reshape_dims.set_op("Const");
           reshape_dims.set_name(unique_input_name + "/reshape_dims");
-          AddNodeInput("^" + input_name, &reshape_dims);
+          AddNodeInput("^" + NodeNameFromInput(input_name), &reshape_dims);
           SetNodeAttr("dtype", DT_INT32, &reshape_dims);
           Tensor reshape_dims_tensor(DT_INT32, {1});
           reshape_dims_tensor.flat<int32>()(0) = -1;
@@ -769,7 +753,7 @@ Status QuantizeNodes(const GraphDef& input_graph_def,
           NodeDef reduction_dims;
           reduction_dims.set_op("Const");
           reduction_dims.set_name(unique_input_name + "/reduction_dims");
-          AddNodeInput("^" + input_name, &reduction_dims);
+          AddNodeInput("^" + NodeNameFromInput(input_name), &reduction_dims);
           SetNodeAttr("dtype", DT_INT32, &reduction_dims);
           Tensor reduction_dims_tensor(DT_INT32, {1});
           reduction_dims_tensor.flat<int32>()(0) = 0;
diff --git a/tensorflow/tools/graph_transforms/sparsify_gather.cc b/tensorflow/tools/graph_transforms/sparsify_gather.cc
index 20d443c7e9070d0c82191c70ec1a855deeeb8f0b..701e350fc39d083665f5420e6b73510c182e12ce 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <cmath>
 #include <memory>
+#include <unordered_map>
 
 #include "tensorflow/c/checkpoint_reader.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -28,9 +29,10 @@ limitations under the License.
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
-using strings::StrCat;
 using str_util::Join;
 using str_util::Split;
+using str_util::StringReplace;
+using strings::StrCat;
 
 namespace graph_transforms {
 
@@ -84,45 +86,71 @@ void CreateConstNode(const Tensor& tensor, const string& name,
   SetNodeTensorAttr<float>("value", tensor, node_def);
 }
 
+string GetMonolithicTensorKey(const string& tensor_slice_name) {
+  std::vector<string> names = Split(tensor_slice_name, "/");
+  if (StringPiece(names[names.size() - 1]).starts_with("part_")) {
+    CHECK_GE(names.size(), 2);
+    names.pop_back();
+  }
+  return Join(names, "/");
+}
+
 Status ObtainTensorSlice(const GraphDef& input_graph_def,
-                         const string& tensor_name,
+                         const string& target_name,
                          string* shape_slice_string) {
   string restore_node_name;
   for (const auto& node : input_graph_def.node()) {
-    if (StringPiece(node.name()).starts_with("save/Assign") &&
-        node.input(0) == tensor_name) {
+    std::vector<string> node_name_parts = Split(node.name(), "/");
+    if (node_name_parts.size() == 2 &&
+        StringPiece(node_name_parts[0]).starts_with("save") &&
+        StringPiece(node_name_parts[1]).starts_with("Assign") &&
+        node.input(0) == target_name) {
       restore_node_name = node.input(1);
       break;
     }
   }
+
+  std::vector<string> restore_node_parts = Split(restore_node_name, ":");
+  CHECK_LE(restore_node_parts.size(), 2);
+  string tensor_names_node;
   string shape_and_slices_node;
   for (const auto& node : input_graph_def.node()) {
-    if ((node.name() == restore_node_name) && (node.op() == "RestoreV2")) {
+    if ((node.name() == restore_node_parts[0]) && (node.op() == "RestoreV2")) {
+      tensor_names_node = node.input(1);
       shape_and_slices_node = node.input(2);
       break;
     }
   }
+
+  int offset = -1;
+  for (const auto& node : input_graph_def.node()) {
+    if (node.name() == tensor_names_node) {
+      Tensor tensor_names_tensor;
+      TF_RETURN_IF_ERROR(GetNodeAttr(node, "value", &tensor_names_tensor));
+      const auto& tensor_names_value = tensor_names_tensor.flat<string>();
+      for (int i = 0; i < tensor_names_value.size(); i++) {
+        if (tensor_names_value(i) == GetMonolithicTensorKey(target_name)) {
+          offset = i;
+          break;
+        }
+      }
+    }
+  }
+  if (offset == -1) {
+    return errors::Internal("Unable to find RestoreV2 entry for variable: ",
+                            target_name);
+  }
   for (const auto& node : input_graph_def.node()) {
     if (node.name() == shape_and_slices_node) {
       Tensor shape_and_slices_tensor;
       TF_RETURN_IF_ERROR(GetNodeAttr(node, "value", &shape_and_slices_tensor));
       const auto& shape_and_slices_value =
           shape_and_slices_tensor.flat<string>();
-      *shape_slice_string = shape_and_slices_value(0);
+      *shape_slice_string = shape_and_slices_value(offset);
       return Status::OK();
     }
   }
-  return errors::Internal("Unable to find slice for variable: ", tensor_name);
-}
-
-string GetMonolithicTensorKey(const string& tensor_slice_name) {
-  std::vector<string> names = str_util::Split(tensor_slice_name, "/");
-  CHECK_GE(names.size(), 2);
-  CHECK(StringPiece(names[names.size() - 1]).starts_with("part_"));
-
-  // Remove the "part_x" suffix
-  names.pop_back();
-  return str_util::Join(names, "/");
+  return errors::Internal("Unable to find slice for variable: ", target_name);
 }
 
 Status ReadTensorFromCheckpoint(
@@ -176,6 +204,22 @@ Status ObtainVariableInfo(
   return Status::OK();
 }
 
+Status RemoveInputAtIndex(NodeDef* n, int index) {
+  for (int i = index; i < n->input_size() - 1; i++) {
+    n->mutable_input()->SwapElements(i, i + 1);
+  }
+  n->mutable_input()->RemoveLast();
+  return Status::OK();
+}
+
+Status RemoveNodeAtIndex(GraphDef* g, int index) {
+  for (int i = index; i < g->node_size() - 1; i++) {
+    g->mutable_node()->SwapElements(i, i + 1);
+  }
+  g->mutable_node()->RemoveLast();
+  return Status::OK();
+}
+
 Status SparsifyGatherInternal(
     const GraphDef& input_graph_def,
     const std::unique_ptr<std::unordered_map<string, string> >&
@@ -190,6 +234,15 @@ Status SparsifyGatherInternal(
   GraphDef current_graph_def = input_graph_def;
   bool any_match_found = false;
 
+  // Populate references.
+  std::unordered_map<string, int> refs;
+  for (const auto& node : current_graph_def.node()) {
+    for (const auto& input : node.input()) {
+      auto parsed_input = StringReplace(input, "^", "", true);
+      refs[parsed_input] += 1;
+    }
+  }
+
   // The subgraphs may have overlapping components, therefore GraphMatcher
   // doesn't return all subgraphs in one round -- this has to be multi-round
   // update.
@@ -197,15 +250,15 @@ Status SparsifyGatherInternal(
     any_match_found = false;
     GraphDef replaced_graph_def = current_graph_def;
     std::vector<string> init_table_node_names;
-    std::vector<string> removed_variable_names;
+    std::vector<string> removed_node_names;
 
     TF_RETURN_IF_ERROR(ReplaceMatchingOpTypes(
         current_graph_def, pattern,
         [&ckpt_reader, &any_match_found, &init_table_node_names,
-         &shapes_and_slices, &removed_variable_names](
-            const NodeMatch& match, const std::set<string>& input_nodes,
-            const std::set<string>& output_nodes,
-            std::vector<NodeDef>* new_nodes) {
+         &shapes_and_slices, &removed_node_names,
+         &refs](const NodeMatch& match, const std::set<string>& input_nodes,
+                const std::set<string>& output_nodes,
+                std::vector<NodeDef>* new_nodes) {
           any_match_found = true;
 
           // The captured subgraph should be of the following pattern:
@@ -287,9 +340,13 @@ Status SparsifyGatherInternal(
             TF_RETURN_IF_ERROR(ReadTensorFromCheckpoint(
                 weights_node.name(), ckpt_reader,
                 (*shapes_and_slices)[weights_node.name()], &weight));
-            // Add both both weight and identity node names.
-            removed_variable_names.push_back(weights_node.name());
-            removed_variable_names.push_back(match.inputs[0].node.name());
+          }
+          // Add both both weight and identity node names.
+          removed_node_names.push_back(weights_node.name());
+          removed_node_names.push_back(match.inputs[0].node.name());
+          for (auto input_node : match.inputs[0].node.input()) {
+            auto parsed_input = StringReplace(input_node, "^", "", true);
+            refs[parsed_input]--;
           }
           Tensor indices_tensor;
           Tensor values_tensor;
@@ -359,15 +416,23 @@ Status SparsifyGatherInternal(
 
           // Connect nodes
           AddNodeInput(hashtable_node.name(), &init_table_node);
+          refs[hashtable_node.name()]++;
           AddNodeInput(indices_node.name(), &init_table_node);
+          refs[indices_node.name()]++;
           AddNodeInput(values_node.name(), &init_table_node);
+          refs[values_node.name()]++;
 
           AddNodeInput(hashtable_node.name(), &lookup_node);
+          refs[hashtable_node.name()]++;
           AddNodeInput(gather_node.input(1), &lookup_node);
+          refs[gather_node.input(1)]++;
           AddNodeInput(default_value_node.name(), &lookup_node);
+          refs[default_value_node.name()]++;
 
           AddNodeInput(lookup_node.name(), &expand_dims_node);
+          refs[lookup_node.name()]++;
           AddNodeInput(dim_idx_node.name(), &expand_dims_node);
+          refs[dim_idx_node.name()]++;
 
           // Copy 'ids' input of original 'Gather'
           new_nodes->push_back(match.inputs[1].node);
@@ -401,47 +466,88 @@ Status SparsifyGatherInternal(
     for (const string& name : init_table_node_names) {
       // Add control dependence from init_table_node to group_deps_node
       AddNodeInput(StrCat("^", name), init_op);
+      refs[name]++;
+    }
+
+    // Erase inputs and outputs as they are not considered for deletion.
+    for (const auto& output : context.output_names) {
+      refs.erase(output);
     }
 
-    // Remove all dependencies associated with removed variables.
-    while (!removed_variable_names.empty()) {
-      auto name = removed_variable_names.back();
-      removed_variable_names.pop_back();
+    for (const auto& input : context.input_names) {
+      refs.erase(input);
+    }
+
+    // Add nodes with a reference count of 0 for deletion.
+    for (auto entry : refs) {
+      if (entry.second == 0) {
+        removed_node_names.push_back(entry.first);
+      }
+    }
+
+    while (!removed_node_names.empty()) {
+      auto name = removed_node_names.back();
+      removed_node_names.pop_back();
+
       int i = 0;
       while (i < replaced_graph_def.node_size()) {
-        if (!replaced_graph_def.node(i).input_size()) {
-          if (replaced_graph_def.node(i).name() == name) {
-            replaced_graph_def.mutable_node()->SwapElements(
-                i, replaced_graph_def.node_size() - 1);
-            replaced_graph_def.mutable_node()->RemoveLast();
-            continue;
+        // Revisit this to see if we can safely remove RestoreV2 nodes.
+        if ((replaced_graph_def.node(i).name() == name) &&
+            (replaced_graph_def.node(i).op() != "RestoreV2")) {
+          for (const auto& input : replaced_graph_def.node(i).input()) {
+            auto parsed_input = StringReplace(input, "^", "", true);
+            refs[parsed_input] -= 1;
+            if (refs[parsed_input] == 0) {
+              removed_node_names.push_back(parsed_input);
+            }
           }
-          i++;
+          TF_RETURN_IF_ERROR(RemoveNodeAtIndex(&replaced_graph_def, i));
           continue;
         }
         int j = 0;
+        bool deleted_inputs = false;
         while (j < replaced_graph_def.node(i).input_size()) {
           if (replaced_graph_def.node(i).input(j) == name ||
               replaced_graph_def.node(i).input(j) == ("^" + name)) {
-            replaced_graph_def.mutable_node(i)->mutable_input()->SwapElements(
-                j, replaced_graph_def.node(i).input_size() - 1);
-            replaced_graph_def.mutable_node(i)->mutable_input()->RemoveLast();
+            TF_RETURN_IF_ERROR(
+                RemoveInputAtIndex(replaced_graph_def.mutable_node(i), j));
+            deleted_inputs = true;
             continue;
           }
           j++;
         }
-        if ((replaced_graph_def.node(i).input_size() == 0) ||
-            (replaced_graph_def.node(i).op() == "Assign" &&
-             replaced_graph_def.node(i).input_size() == 1)) {
-          removed_variable_names.push_back(replaced_graph_def.node(i).name());
-          if (replaced_graph_def.node(i).input_size() == 1) {
-            removed_variable_names.push_back(
-                replaced_graph_def.node(i).input(0));
+        if (deleted_inputs) {
+          if (replaced_graph_def.node(i).op() == "ConcatV2") {
+            if (replaced_graph_def.node(i).input_size() > 2) {
+              SetNodeAttr("N", replaced_graph_def.node(i).input_size() - 1,
+                          replaced_graph_def.mutable_node(i));
+            } else if (replaced_graph_def.node(i).input_size() == 2) {
+              if (refs[replaced_graph_def.node(i).input(1)] != 1) {
+                return errors::Internal(
+                    "Expect axis tensor of ConcatV2 node to only be referenced "
+                    "once.");
+              }
+              refs[replaced_graph_def.node(i).input(1)] -= 1;
+              removed_node_names.push_back(replaced_graph_def.node(i).input(1));
+              replaced_graph_def.mutable_node(i)->mutable_input()->RemoveLast();
+              replaced_graph_def.mutable_node(i)->mutable_attr()->erase("N");
+              replaced_graph_def.mutable_node(i)->set_op("Identity");
+            } else {
+              return errors::Internal(
+                  "ConcatV2 should have at least two elements");
+            }
+          }
+          if ((replaced_graph_def.node(i).op() == "Assign" ||
+               replaced_graph_def.node(i).op() == "Reshape" ||
+               replaced_graph_def.node(i).op() == "Equal" ||
+               replaced_graph_def.node(i).op() == "Mean" ||
+               replaced_graph_def.node(i).op() == "ScalarSummary") &&
+              replaced_graph_def.node(i).input_size() == 1) {
+            removed_node_names.push_back(replaced_graph_def.node(i).name());
+          }
+          if (!replaced_graph_def.node(i).input_size()) {
+            removed_node_names.push_back(replaced_graph_def.node(i).name());
           }
-          replaced_graph_def.mutable_node()->SwapElements(
-              i, replaced_graph_def.node_size() - 1);
-          replaced_graph_def.mutable_node()->RemoveLast();
-          continue;
         }
         i++;
       }
@@ -482,17 +588,22 @@ Status SparsifyGather(const GraphDef& input_graph_def,
     };
   // clang-format on
 
+  GraphDef cleaned_input_graph_def;
+  RemoveAttributes(input_graph_def, {"_output_shapes"},
+                   &cleaned_input_graph_def);
+
   GraphDef temp_output;
 
   std::unique_ptr<BundleReader> ckpt_reader;
   TF_RETURN_IF_ERROR(InitializeCheckpointReader(context, &ckpt_reader));
 
   std::unique_ptr<std::unordered_map<string, string> > shapes_and_slices;
-  TF_RETURN_IF_ERROR(ObtainVariableInfo(input_graph_def, &shapes_and_slices));
+  TF_RETURN_IF_ERROR(
+      ObtainVariableInfo(cleaned_input_graph_def, &shapes_and_slices));
 
-  TF_RETURN_IF_ERROR(SparsifyGatherInternal(input_graph_def, shapes_and_slices,
-                                            context, gather_pattern,
-                                            ckpt_reader, &temp_output));
+  TF_RETURN_IF_ERROR(SparsifyGatherInternal(
+      cleaned_input_graph_def, shapes_and_slices, context, gather_pattern,
+      ckpt_reader, &temp_output));
 
   TF_RETURN_IF_ERROR(SparsifyGatherInternal(temp_output, shapes_and_slices,
                                             context, gather_v2_pattern,
diff --git a/tensorflow/tools/graph_transforms/sparsify_gather_test.cc b/tensorflow/tools/graph_transforms/sparsify_gather_test.cc
index 000568a0cc9aceffa927abb1dc56e6586030fea0..d41321c9a6df755eed099ec453f162e2132cfb57 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather_test.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather_test.cc
@@ -71,7 +71,7 @@ class SparsifyGatherTest : public ::testing::Test {
   }
 
   void TestSinglePartition(bool gather_v2, bool include_shared_init,
-                           bool test_variable,
+                           bool test_variable, bool test_kept_concat,
                            const string& shared_init_name = "group_deps") {
     GraphDef graph_def;
 
@@ -80,6 +80,8 @@ class SparsifyGatherTest : public ::testing::Test {
     // Build the graph.
     NodeDef* input_node = CreateNode("ids", "Const", {}, &graph_def);
     NodeDef* w_node;
+    NodeDef* zeros_const;
+    NodeDef* zeros_shape;
     NodeDef* zeros_node;
     NodeDef* assign_node;
 
@@ -92,19 +94,27 @@ class SparsifyGatherTest : public ::testing::Test {
     } else {
       w_node = CreateNode("w/part_1", "VariableV2", {}, &graph_def);
 
-      zeros_node =
-          CreateNode("w/part_1/Initializer/zeros", "Const", {}, &graph_def);
+      zeros_shape = CreateNode("w/part_1/Initializer/zeros/shape_as_tensor",
+                               "Const", {}, &graph_def);
+      zeros_const = CreateNode("w/part_1/Initializer/zeros/Const", "Const", {},
+                               &graph_def);
+      zeros_node = CreateNode("w/part_1/Initializer/zeros", "Fill",
+                              {zeros_shape, zeros_const}, &graph_def);
       assign_node = CreateNode("w/part_1/Assign", "Assign",
                                {w_node, zeros_node}, &graph_def);
 
       NodeDef* save_const_node =
           CreateNode("save/Const", "Const", {}, &graph_def);
 
+      Tensor tensor_names_values(DT_STRING, TensorShape({1}));
+      test::FillValues<string>(&tensor_names_values, {"w"});
       NodeDef* tensor_names_node =
           CreateNode("save/RestoreV2/tensor_names", "Const", {}, &graph_def);
+      SetNodeTensorAttr<string>("value", tensor_names_values,
+                                tensor_names_node);
+
       NodeDef* tensor_shapes_slices_node = CreateNode(
           "save/RestoreV2/shape_and_slices", "Const", {}, &graph_def);
-
       Tensor shapes_slices_val(DT_STRING, TensorShape({1}));
       shapes_slices_val.flat<string>()(0) = "4 1 0,4:0,1";
       SetNodeTensorAttr<string>("value", shapes_slices_val,
@@ -133,6 +143,26 @@ class SparsifyGatherTest : public ::testing::Test {
       }
     }
 
+    NodeDef* concat_axis_node =
+        CreateNode("linear/concat/axis", "Const", {}, &graph_def);
+    NodeDef* concat_input_node =
+        CreateNode("concat/input/node", "Const", {}, &graph_def);
+    NodeDef* concat_node = nullptr;
+    if (!test_kept_concat) {
+      concat_node = CreateNode(
+          "concat/node", "ConcatV2",
+          {identity_node, concat_input_node, concat_axis_node}, &graph_def);
+      SetNodeAttr("N", 2, concat_node);
+    } else {
+      NodeDef* concat_input_node_2 =
+          CreateNode("concat/input/node_2", "Const", {}, &graph_def);
+      concat_node = CreateNode("concat/node", "ConcatV2",
+                               {identity_node, concat_input_node,
+                                concat_input_node_2, concat_axis_node},
+                               &graph_def);
+      SetNodeAttr("N", 3, concat_node);
+    }
+
     // Run the op.
     GraphDef result;
     TransformFuncContext context;
@@ -151,12 +181,32 @@ class SparsifyGatherTest : public ::testing::Test {
     MapNamesToNodes(result, &node_lookup);
 
     // Check nodes.
+    EXPECT_EQ(0,
+              node_lookup.count("w/part_1/Initializer/zeros/shape_as_tensor"));
+    EXPECT_EQ(0, node_lookup.count("w/part_1/Initializer/zeros/Const"));
     EXPECT_EQ(0, node_lookup.count("w/part_1/Initializer/zeros"));
     EXPECT_EQ(0, node_lookup.count("w/part_1/Assign"));
 
     EXPECT_EQ(1, node_lookup.count("ids"));
     EXPECT_EQ("Const", node_lookup.at("ids")->op());
 
+    EXPECT_EQ(1, node_lookup.count("concat/node"));
+
+    if (!test_kept_concat) {
+      EXPECT_EQ(0, node_lookup.count("linear/concat/axis"));
+      EXPECT_EQ("Identity", node_lookup.at("concat/node")->op());
+      EXPECT_EQ(1, node_lookup.at("concat/node")->input_size());
+      EXPECT_EQ("concat/input/node", node_lookup.at("concat/node")->input(0));
+    } else {
+      EXPECT_EQ(1, node_lookup.count("linear/concat/axis"));
+      EXPECT_EQ("ConcatV2", node_lookup.at("concat/node")->op());
+      EXPECT_EQ(3, node_lookup.at("concat/node")->input_size());
+      EXPECT_EQ("concat/input/node", node_lookup.at("concat/node")->input(0));
+      EXPECT_EQ("concat/input/node_2", node_lookup.at("concat/node")->input(1));
+      EXPECT_EQ("linear/concat/axis", node_lookup.at("concat/node")->input(2));
+      EXPECT_EQ(2, node_lookup.at("concat/node")->attr().at("N").i());
+    }
+
     EXPECT_EQ(1, node_lookup.count("w/part_1/indices"));
     EXPECT_EQ("Const", node_lookup.at("w/part_1/indices")->op());
     Tensor expected_indices_tensor(DT_INT64, TensorShape({3}));
@@ -247,7 +297,11 @@ class SparsifyGatherTest : public ::testing::Test {
     // Two partitions
     NodeDef* w_node1;
     NodeDef* w_node2;
+    NodeDef* zeros_const1;
+    NodeDef* zeros_shape1;
     NodeDef* zeros_node1;
+    NodeDef* zeros_const2;
+    NodeDef* zeros_shape2;
     NodeDef* zeros_node2;
     NodeDef* assign_node1;
     NodeDef* assign_node2;
@@ -260,51 +314,53 @@ class SparsifyGatherTest : public ::testing::Test {
       SetNodeTensorAttr<float>("value", weights, w_node1);
       SetNodeTensorAttr<float>("value", weights, w_node2);
     } else {
-      w_node1 = CreateNode("w1/part_1", "VariableV2", {}, &graph_def);
-      zeros_node1 =
-          CreateNode("w1/part_1/Initializer/zeros", "Const", {}, &graph_def);
-      assign_node1 = CreateNode("w1/part_1/Assign", "Assign",
-                                {w_node1, zeros_node1}, &graph_def);
-
       NodeDef* save_const_node =
           CreateNode("save/Const", "Const", {}, &graph_def);
-      NodeDef* tensor_names_node1 =
+
+      NodeDef* tensor_names_node =
           CreateNode("save/RestoreV2/tensor_names", "Const", {}, &graph_def);
-      NodeDef* tensor_shapes_slices_node1 = CreateNode(
-          "save/RestoreV2/shape_and_slices", "Const", {}, &graph_def);
+      Tensor tensor_names_values(DT_STRING, TensorShape({2}));
+      test::FillValues<string>(&tensor_names_values, {"w1", "w2"});
+      SetNodeTensorAttr<string>("value", tensor_names_values,
+                                tensor_names_node);
 
-      Tensor shapes_slices_val1(DT_STRING, TensorShape({1}));
-      shapes_slices_val1.flat<string>()(0) = "4 1 0,4:0,1";
-      SetNodeTensorAttr<string>("value", shapes_slices_val1,
-                                tensor_shapes_slices_node1);
+      NodeDef* tensor_shapes_slices_node = CreateNode(
+          "save/RestoreV2/shape_and_slices", "Const", {}, &graph_def);
+      Tensor shapes_slices_val(DT_STRING, TensorShape({2}));
+      shapes_slices_val.flat<string>()(0) = "4 1 0,4:0,1";
+      shapes_slices_val.flat<string>()(1) = "4 1 0,4:0,1";
+      SetNodeTensorAttr<string>("value", shapes_slices_val,
+                                tensor_shapes_slices_node);
 
-      NodeDef* restore_node1 = CreateNode(
+      NodeDef* restore_node = CreateNode(
           "save/RestoreV2", "RestoreV2",
-          {save_const_node, tensor_names_node1, tensor_shapes_slices_node1},
+          {save_const_node, tensor_names_node, tensor_shapes_slices_node},
           &graph_def);
-      CreateNode("save/Assign", "Assign", {w_node1, restore_node1}, &graph_def);
+
+      w_node1 = CreateNode("w1/part_1", "VariableV2", {}, &graph_def);
+
+      zeros_shape1 = CreateNode("w1/part_1/Initializer/zeros/shape_as_tensor",
+                                "Const", {}, &graph_def);
+      zeros_const1 = CreateNode("w1/part_1/Initializer/zeros/Const", "Const",
+                                {}, &graph_def);
+      zeros_node1 = CreateNode("w1/part_1/Initializer/zeros", "Fill",
+                               {zeros_shape1, zeros_const1}, &graph_def);
+      assign_node1 = CreateNode("w1/part_1/Assign", "Assign",
+                                {w_node1, zeros_node1}, &graph_def);
+
+      CreateNode("save/Assign", "Assign", {w_node1, restore_node}, &graph_def);
 
       w_node2 = CreateNode("w2/part_1", "VariableV2", {}, &graph_def);
-      zeros_node2 =
-          CreateNode("w2/part_1/Initializer/zeros", "Const", {}, &graph_def);
+      zeros_shape2 = CreateNode("w2/part_1/Initializer/zeros/shape_as_tensor",
+                                "Const", {}, &graph_def);
+      zeros_const2 = CreateNode("w2/part_1/Initializer/zeros/Const", "Const",
+                                {}, &graph_def);
+      zeros_node2 = CreateNode("w2/part_1/Initializer/zeros", "Fill",
+                               {zeros_shape2, zeros_const2}, &graph_def);
       assign_node2 = CreateNode("w2/part_1/Assign", "Assign",
                                 {w_node2, zeros_node2}, &graph_def);
 
-      NodeDef* tensor_names_node2 =
-          CreateNode("save/RestoreV2_1/tensor_names", "Const", {}, &graph_def);
-      NodeDef* tensor_shapes_slices_node2 = CreateNode(
-          "save/RestoreV2_1/shape_and_slices", "Const", {}, &graph_def);
-
-      Tensor shapes_slices_val2(DT_STRING, TensorShape({1}));
-      shapes_slices_val2.flat<string>()(0) = "4 1 0,4:0,1";
-      SetNodeTensorAttr<string>("value", shapes_slices_val2,
-                                tensor_shapes_slices_node2);
-
-      NodeDef* restore_node2 = CreateNode(
-          "save/RestoreV2_1", "RestoreV2",
-          {save_const_node, tensor_names_node2, tensor_shapes_slices_node2},
-          &graph_def);
-      CreateNode("save/Assign_1", "Assign", {w_node2, restore_node2},
+      CreateNode("save/Assign_1", "Assign", {w_node2, restore_node},
                  &graph_def);
 
       BundleWriter writer(Env::Default(), checkpoint_path);
@@ -322,6 +378,13 @@ class SparsifyGatherTest : public ::testing::Test {
     MakeGather("gather1", gather_v2, identity_node1, input_node, &graph_def);
     MakeGather("gather2", gather_v2, identity_node2, input_node, &graph_def);
 
+    NodeDef* concat_axis_node =
+        CreateNode("linear/concat/axis", "Const", {}, &graph_def);
+    NodeDef* concat_node = CreateNode(
+        "concat/node", "ConcatV2",
+        {identity_node1, identity_node2, concat_axis_node}, &graph_def);
+    SetNodeAttr("N", 2, concat_node);
+
     // Shared init node
     if (include_shared_init) {
       if (!test_variable) {
@@ -350,8 +413,14 @@ class SparsifyGatherTest : public ::testing::Test {
     MapNamesToNodes(result, &node_lookup);
 
     // Check nodes.
+    EXPECT_EQ(0,
+              node_lookup.count("w1/part_1/Initializer/zeros/shape_as_tensor"));
+    EXPECT_EQ(0, node_lookup.count("w1/part_1/Initializer/zeros/Const"));
     EXPECT_EQ(0, node_lookup.count("w1/part_1/Initializer/zeros"));
     EXPECT_EQ(0, node_lookup.count("w1/part_1/Assign"));
+    EXPECT_EQ(0,
+              node_lookup.count("w2/part_1/Initializer/zeros/shape_as_tensor"));
+    EXPECT_EQ(0, node_lookup.count("w2/part_1/Initializer/zeros/Const"));
     EXPECT_EQ(0, node_lookup.count("w2/part_1/Initializer/zeros"));
     EXPECT_EQ(0, node_lookup.count("w2/part_1/Assign"));
     EXPECT_EQ(1, node_lookup.count("ids"));
@@ -487,6 +556,9 @@ class SparsifyGatherTest : public ::testing::Test {
               node_lookup.at("gather2/LookupTableFind")->input(2));
     EXPECT_EQ("gather2/LookupTableFind", node_lookup.at("gather2")->input(0));
 
+    EXPECT_EQ(0, node_lookup.count("linear/concat/axis"));
+    EXPECT_EQ(0, node_lookup.count("concat/node"));
+
     // Check control deps.
     EXPECT_EQ(2, node_lookup.at(shared_init_name)->input_size());
     EXPECT_NE(std::find(node_lookup.at(shared_init_name)->input().begin(),
@@ -522,18 +594,31 @@ class SparsifyGatherTest : public ::testing::Test {
 };
 
 TEST_F(SparsifyGatherTest, TestSinglePartition) {
-  TestSinglePartition(false, false, false);
-  TestSinglePartition(false, true, false);
-  TestSinglePartition(true, false, false);
-  TestSinglePartition(true, true, false);
-  TestSinglePartition(false, false, true);
-  TestSinglePartition(false, true, true);
-  TestSinglePartition(true, false, true);
-  TestSinglePartition(true, true, true);
-  TestSinglePartition(false, true, false, "shared_inits");
-  TestSinglePartition(true, true, false, "shared_inits");
-  TestSinglePartition(false, true, true, "shared_inits");
-  TestSinglePartition(true, true, true, "shared_inits");
+  TestSinglePartition(false, false, false, false);
+  TestSinglePartition(false, true, false, false);
+  TestSinglePartition(true, false, false, false);
+  TestSinglePartition(true, true, false, false);
+  TestSinglePartition(false, false, true, false);
+  TestSinglePartition(false, true, true, false);
+  TestSinglePartition(true, false, true, false);
+  TestSinglePartition(true, true, true, false);
+  TestSinglePartition(false, true, false, false, "shared_inits");
+  TestSinglePartition(true, true, false, false, "shared_inits");
+  TestSinglePartition(false, true, true, false, "shared_inits");
+  TestSinglePartition(true, true, true, false, "shared_inits");
+
+  TestSinglePartition(false, false, false, true);
+  TestSinglePartition(false, true, false, true);
+  TestSinglePartition(true, false, false, true);
+  TestSinglePartition(true, true, false, true);
+  TestSinglePartition(false, false, true, true);
+  TestSinglePartition(false, true, true, true);
+  TestSinglePartition(true, false, true, true);
+  TestSinglePartition(true, true, true, true);
+  TestSinglePartition(false, true, false, true, "shared_inits");
+  TestSinglePartition(true, true, false, true, "shared_inits");
+  TestSinglePartition(false, true, true, true, "shared_inits");
+  TestSinglePartition(true, true, true, true, "shared_inits");
 }
 
 TEST_F(SparsifyGatherTest, TestMultiPartition) {
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 845bad5e4990255cf47981935fc5479053334491..614457e8996491a60d4a7df213180117bce321ad 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -55,7 +55,10 @@ pkg_tar(
 
 pkg_tar(
     name = "cheaders",
-    files = ["//tensorflow/c:headers"],
+    files = [
+        "//tensorflow/c:headers",
+        "//tensorflow/c/eager:headers",
+    ],
     package_dir = "include/tensorflow/c",
     # Mark as "manual" till
     # https://github.com/bazelbuild/bazel/issues/2352
@@ -96,6 +99,7 @@ genrule(
         "//third_party/hadoop:LICENSE.txt",
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
+        "@aws//:LICENSE",
         "@boringssl//:LICENSE",
         "@com_googlesource_code_re2//:LICENSE",
         "@cub_archive//:LICENSE.TXT",
@@ -109,8 +113,10 @@ genrule(
         "@jemalloc//:COPYING",
         "@jpeg//:LICENSE.md",
         "@libxsmm_archive//:LICENSE",
+        "@llvm//:LICENSE.TXT",
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
+        "@nasm//:LICENSE",
         "@nsync//:LICENSE",
         "@png_archive//:LICENSE",
         "@protobuf_archive//:LICENSE",
@@ -131,6 +137,7 @@ genrule(
         "//third_party/hadoop:LICENSE.txt",
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
+        "@aws//:LICENSE",
         "@boringssl//:LICENSE",
         "@com_googlesource_code_re2//:LICENSE",
         "@cub_archive//:LICENSE.TXT",
@@ -144,8 +151,10 @@ genrule(
         "@jemalloc//:COPYING",
         "@jpeg//:LICENSE.md",
         "@libxsmm_archive//:LICENSE",
+        "@llvm//:LICENSE.TXT",
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
+        "@nasm//:LICENSE",
         "@nsync//:LICENSE",
         "@png_archive//:LICENSE",
         "@protobuf_archive//:LICENSE",
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index e3cbd67721aa04f170878f1d369ed65b7fde630e..02a34518c04a6ef738e46002ae4d07c801cc58f8 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -6,9 +6,12 @@ package(default_visibility = ["//visibility:private"])
 load(
     "//tensorflow:tensorflow.bzl",
     "if_not_windows",
+    "if_windows",
     "transitive_hdrs",
 )
 load("//third_party/mkl:build_defs.bzl", "if_mkl")
+load("//tensorflow:tensorflow.bzl", "if_cuda")
+load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "tf_additional_license_deps")
 
 # This returns a list of headers of all public header libraries (e.g.,
@@ -33,7 +36,9 @@ transitive_hdrs(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:stream_executor",
         "//third_party/eigen3",
-    ],
+    ] + if_cuda([
+        "@local_config_cuda//cuda:cuda_headers",
+    ]),
 )
 
 py_binary(
@@ -43,27 +48,6 @@ py_binary(
     deps = ["//tensorflow:tensorflow_py"],
 )
 
-py_test(
-    name = "pip_smoke_test",
-    srcs = ["pip_smoke_test.py"],
-    data = [
-        "//tensorflow:all_opensource_files",
-    ],
-    tags = [
-        "manual",
-        "notap",
-    ],
-)
-
-py_binary(
-    name = "check_load_py_test",
-    srcs = ["check_load_py_test.py"],
-    data = [
-        "//tensorflow:all_opensource_files",
-    ],
-    srcs_version = "PY2AND3",
-)
-
 # On Windows, python binary is a zip file of runfiles tree.
 # Add everything to its data dependency for generating a runfiles tree
 # for building the pip package on Windows.
@@ -87,7 +71,6 @@ py_binary(
         "//tensorflow/python/eager:eager_pip",
         "//tensorflow/contrib/summary:summary_test_util",
         # These targets don't build on Windows yet. Exclude them for now.
-        # "//tensorflow/contrib/ndlstm",
         # "//tensorflow/contrib/slim",
         # "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
         # "//tensorflow/contrib/specs",
@@ -105,13 +88,20 @@ filegroup(
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
         "//third_party/hadoop:LICENSE.txt",
+        "@absl_py//absl/flags:LICENSE",
+        "@arm_neon_2_x86_sse//:LICENSE",
+        "@astor_archive//:LICENSE",
+        "@aws//:LICENSE",
         "@boringssl//:LICENSE",
+        "@com_google_absl//:LICENSE",
         "@com_googlesource_code_re2//:LICENSE",
         "@cub_archive//:LICENSE.TXT",
         "@curl//:COPYING",
         "@eigen_archive//:COPYING.MPL2",
         "@farmhash_archive//:COPYING",
         "@fft2d//:fft/readme.txt",
+        "@flatbuffers//:LICENSE.txt",
+        "@gast_archive//:PKG-INFO",
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
         "@grpc//:LICENSE",
@@ -122,11 +112,15 @@ filegroup(
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
         "@grpc//third_party/nanopb:LICENSE.txt",
+        "@nasm//:LICENSE",
         "@nsync//:LICENSE",
+        "@pcre//:LICENCE",
         "@png_archive//:LICENSE",
         "@protobuf_archive//:LICENSE",
         "@six_archive//:LICENSE",
         "@snappy//:COPYING",
+        "@swig//:LICENSE",
+        "@termcolor_archive//:COPYING.txt",
         "@zlib_archive//:zlib.h",
         "@org_python_pypi_backports_weakref//:LICENSE",
     ] + if_mkl([
@@ -153,10 +147,11 @@ sh_binary(
             "//tensorflow:tensorflow_py",
             "//tensorflow/contrib/boosted_trees:boosted_trees_pip",
             "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
-            "//tensorflow/contrib/data/python/ops:prefetching_py",
+            "//tensorflow/contrib/data/python/kernel_tests:dataset_serialization_test",
+            "//tensorflow/contrib/data/python/ops:contrib_op_loader",
             "//tensorflow/contrib/eager/python/examples:examples_pip",
+            "//tensorflow/contrib/eager/python:checkpointable",
             "//tensorflow/contrib/eager/python:evaluator",
-            "//tensorflow/contrib/eager/python:summary_writer",
             "//tensorflow/contrib/gan:gan",
             "//tensorflow/contrib/graph_editor:graph_editor_pip",
             "//tensorflow/contrib/keras:keras",
@@ -164,9 +159,14 @@ sh_binary(
             "//tensorflow/contrib/lite/toco:toco",
             "//tensorflow/contrib/lite/toco/python:toco_wrapper",
             "//tensorflow/contrib/lite/toco/python:toco_from_protos",
-            "//tensorflow/contrib/ndlstm:ndlstm",
             "//tensorflow/contrib/nn:nn_py",
             "//tensorflow/contrib/predictor:predictor_pip",
+            "//tensorflow/contrib/py2tf:py2tf",
+            "//tensorflow/contrib/py2tf/converters:converters",
+            "//tensorflow/contrib/py2tf/converters:test_lib",
+            "//tensorflow/contrib/py2tf/impl:impl",
+            "//tensorflow/contrib/py2tf/pyct:pyct",
+            "//tensorflow/contrib/py2tf/pyct/static_analysis:static_analysis",
             "//tensorflow/contrib/receptive_field:receptive_field_pip",
             "//tensorflow/contrib/session_bundle:session_bundle_pip",
             "//tensorflow/contrib/signal:signal_py",
@@ -192,5 +192,27 @@ sh_binary(
             "//tensorflow/python:test_ops",
             "//tensorflow/tools/dist_test/server:grpc_tensorflow_server",
         ],
-    }) + if_mkl(["//third_party/mkl:intel_binary_blob"]),
+    }) + if_mkl(["//third_party/mkl:intel_binary_blob"]) + if_tensorrt([
+        "//tensorflow/contrib/tensorrt:init_py",
+    ]),
+)
+
+# A genrule for generating a marker file for the pip package on Windows
+#
+# This only works on Windows, because :simple_console_for_windows is a
+# python zip file containing everything we need for building the pip package.
+# However, on other platforms, due to https://github.com/bazelbuild/bazel/issues/4223,
+# when C++ extensions change, this generule doesn't rebuild.
+genrule(
+    name = "win_pip_package_marker",
+    srcs = if_windows([
+        ":build_pip_package",
+        ":simple_console_for_windows",
+    ]),
+    outs = ["win_pip_package_marker_file"],
+    cmd = select({
+        "//conditions:default": "touch $@",
+        "//tensorflow:windows": "md5sum $(locations :build_pip_package) $(locations :simple_console_for_windows) > $@",
+    }),
+    visibility = ["//visibility:public"],
 )
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index 8249703ba717f25dbfb324557727b636c6640cc5..dc31e4c5f703b29f464519d5f1fd54f9b5e11690 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -24,9 +24,11 @@ function real_path() {
 function cp_external() {
   local src_dir=$1
   local dest_dir=$2
-  for f in `find "$src_dir" -maxdepth 1 -mindepth 1 ! -name '*local_config_cuda*'`; do
+  for f in `find "$src_dir" -maxdepth 1 -mindepth 1 ! -name '*local_config_cuda*' ! -name '*org_tensorflow*'`; do
     cp -R "$f" "$dest_dir"
   done
+  mkdir -p "${dest_dir}/local_config_cuda/cuda/cuda/"
+  cp "${src_dir}/local_config_cuda/cuda/cuda/cuda_config.h" "${dest_dir}/local_config_cuda/cuda/cuda/"
 }
 
 PLATFORM="$(uname -s | tr 'A-Z' 'a-z')"
@@ -92,7 +94,6 @@ function main() {
       bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles/org_tensorflow/tensorflow \
       "${TMPDIR}"
     mkdir "${TMPDIR}/external"
-    # Note: this makes an extra copy of org_tensorflow.
     cp_external \
       bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles \
       "${TMPDIR}/external"
@@ -123,7 +124,6 @@ function main() {
         bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/tensorflow \
         "${TMPDIR}"
       mkdir "${TMPDIR}/external"
-      # Note: this makes an extra copy of org_tensorflow.
       cp_external \
         bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles \
         "${TMPDIR}/external"
@@ -137,8 +137,8 @@ function main() {
         fi
       fi
     fi
-    # Install toco as a binary in aux-bin.
     mkdir "${TMPDIR}/tensorflow/aux-bin"
+    # Install toco as a binary in aux-bin.
     cp bazel-bin/tensorflow/contrib/lite/toco/toco ${TMPDIR}/tensorflow/aux-bin/
   fi
 
diff --git a/tensorflow/tools/pip_package/check_load_py_test.py b/tensorflow/tools/pip_package/check_load_py_test.py
index 79d11b08ce33d4509492927111309e647abe683b..e2fe1121d7fa3178ec60886c6dcb56fe374d38a5 100644
--- a/tensorflow/tools/pip_package/check_load_py_test.py
+++ b/tensorflow/tools/pip_package/check_load_py_test.py
@@ -22,6 +22,9 @@ import os
 import subprocess
 
 
+os.chdir(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))
+
+
 def check_output_despite_error(args):
   """Get output of args from command line, even if there are errors.
 
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index cc46dd5162b396e2dc9eac6dafbc2365cafe17d8..73d759eb130633094b402c821cc32eb76c076a44 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """This pip smoke test verifies dependency files exist in the pip package.
 
 This script runs bazel queries to see what python files are required by the
@@ -23,11 +22,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import subprocess
 
-PIP_PACKAGE_QUERY_EXPRESSION = \
-  'deps(//tensorflow/tools/pip_package:build_pip_package)'
+os.chdir(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")))
+
+PIP_PACKAGE_QUERY_EXPRESSION = (
+    "deps(//tensorflow/tools/pip_package:build_pip_package)")
 
+# pylint: disable=g-backslash-continuation
 PY_TEST_QUERY_EXPRESSION = 'deps(\
   filter("^((?!benchmark).)*$",\
   kind(py_test,\
@@ -35,6 +38,7 @@ PY_TEST_QUERY_EXPRESSION = 'deps(\
   + //tensorflow/contrib/... \
   - //tensorflow/contrib/tensorboard/... \
   - attr(tags, "manual|no_pip", //tensorflow/...))), 1)'
+# pylint: enable=g-backslash-continuation
 
 # Hard-coded blacklist of files if not included in pip package
 # TODO(amitpatankar): Clean up blacklist.
@@ -42,6 +46,7 @@ BLACKLIST = [
     "//tensorflow/python:extra_py_tests_deps",
     "//tensorflow/cc/saved_model:saved_model_half_plus_two",
     "//tensorflow:no_tensorflow_py_deps",
+    "//tensorflow/tools/pip_package:win_pip_package_marker",
     "//tensorflow/python:test_ops_2",
     "//tensorflow/python:tf_optimizer",
     "//tensorflow/python:compare_test_proto_py",
@@ -83,15 +88,15 @@ def main():
   """
 
   # pip_package_dependencies_list is the list of included files in pip packages
-  pip_package_dependencies = subprocess.check_output([
-      'bazel', 'query', PIP_PACKAGE_QUERY_EXPRESSION])
+  pip_package_dependencies = subprocess.check_output(
+      ["bazel", "query", PIP_PACKAGE_QUERY_EXPRESSION])
   pip_package_dependencies_list = pip_package_dependencies.strip().split("\n")
   print("Pip package superset size: %d" % len(pip_package_dependencies_list))
 
   # tf_py_test_dependencies is the list of dependencies for all python
   # tests in tensorflow
-  tf_py_test_dependencies = subprocess.check_output([
-      'bazel', 'query', PY_TEST_QUERY_EXPRESSION])
+  tf_py_test_dependencies = subprocess.check_output(
+      ["bazel", "query", PY_TEST_QUERY_EXPRESSION])
   tf_py_test_dependencies_list = tf_py_test_dependencies.strip().split("\n")
   print("Pytest dependency subset size: %d" % len(tf_py_test_dependencies_list))
 
@@ -112,8 +117,7 @@ def main():
 
       # Check if the dependency is in the pip package, the blacklist, or
       # should be ignored because of its file extension
-      if not (ignore or
-              dependency in pip_package_dependencies_list or
+      if not (ignore or dependency in pip_package_dependencies_list or
               dependency in BLACKLIST):
         missing_dependencies.append(dependency)
 
@@ -124,19 +128,20 @@ def main():
     for missing_dependency in missing_dependencies:
       print("\nMissing dependency: %s " % missing_dependency)
       print("Affected Tests:")
-      rdep_query = 'rdeps(kind(py_test, \
-      //tensorflow/python/...), %s)' % missing_dependency
-      affected_tests = subprocess.check_output(['bazel', 'query', rdep_query])
+      rdep_query = ("rdeps(kind(py_test, //tensorflow/python/...), %s)" %
+                    missing_dependency)
+      affected_tests = subprocess.check_output(["bazel", "query", rdep_query])
       affected_tests_list = affected_tests.split("\n")[:-2]
       print("\n".join(affected_tests_list))
 
     raise RuntimeError("""One or more dependencies are not in the pip package.
 Please either blacklist the dependencies in
-tensorflow/tensorflow/tensorflow/tools/pip_package/pip_smoke_test.py
-or add them to tensorflow/tensorflow/tensorflow/tools/pip_package/BUILD.""")
+//tensorflow/tools/pip_package/pip_smoke_test.py
+or add them to //tensorflow/tools/pip_package/BUILD.""")
 
   else:
     print("TEST PASSED")
 
+
 if __name__ == "__main__":
   main()
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index c18f20910a3e3922e39840dbe29b22991f2b91e8..e4ca974e1b7d1e86d4b64d0035df389b5fffe3c2 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,15 +29,18 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.4.0'
+_VERSION = '1.6.0-rc0'
 
 REQUIRED_PACKAGES = [
-    'absl-py',
-    'enum34 >= 1.1.6',
+    'absl-py >= 0.1.6',
+    'astor >= 0.6.0',
+    'gast >= 0.2.0',
+    'grpcio >= 1.8.6',
     'numpy >= 1.12.1',
     'six >= 1.10.0',
     'protobuf >= 3.4.0',
-    'tensorflow-tensorboard',
+    'tensorflow-tensorboard >= 1.5.0, < 1.6.0',
+    'termcolor >= 1.1.0',
 ]
 
 project_name = 'tensorflow'
@@ -59,12 +62,13 @@ else:
 if 'tf_nightly' in project_name:
   for i, pkg in enumerate(REQUIRED_PACKAGES):
     if 'tensorboard' in pkg:
-      REQUIRED_PACKAGES[i] = 'tb-nightly >= 1.5.0a0, < 1.6.0a0'
+      REQUIRED_PACKAGES[i] = 'tb-nightly >= 1.7.0a0, < 1.8.0a0'
       break
 
-# weakref.finalize was introduced in Python 3.4
+# weakref.finalize and enum were introduced in Python 3.4
 if sys.version_info < (3, 4):
   REQUIRED_PACKAGES.append('backports.weakref >= 1.0rc1')
+  REQUIRED_PACKAGES.append('enum34 >= 1.1.6')
 
 # pylint: disable=line-too-long
 CONSOLE_SCRIPTS = [
@@ -176,7 +180,16 @@ def find_files(pattern, root):
 
 
 matches = ['../' + x for x in find_files('*', 'external') if '.py' not in x]
-matches += ['../' + x for x in find_files('*', '_solib_k8') if '.py' not in x]
+
+so_lib_paths = [
+    i for i in os.listdir('.')
+    if os.path.isdir(i) and fnmatch.fnmatch(i, '_solib_*')
+]
+
+for path in so_lib_paths:
+  matches.extend(
+      ['../' + x for x in find_files('*', path) if '.py' not in x]
+  )
 
 if os.name == 'nt':
   EXTENSION_NAME = 'python/_pywrap_tensorflow_internal.pyd'
diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions.cc b/tensorflow/tools/proto_text/gen_proto_text_functions.cc
index ecb29a65a08b098cd167e5cbb2bdb5821e01a543..f0bb59acf801ba586fa8258b5b1ad9f202f014bf 100644
--- a/tensorflow/tools/proto_text/gen_proto_text_functions.cc
+++ b/tensorflow/tools/proto_text/gen_proto_text_functions.cc
@@ -132,6 +132,7 @@ int MainImpl(int argc, char** argv) {
       FILE* f = fopen(path.c_str(), "w");
       if (f == nullptr) return -1;
       if (fwrite(data.c_str(), 1, data.size(), f) != data.size()) {
+        fclose(f);
         return -1;
       }
       if (fclose(f) != 0) {
diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.h b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.h
index 44387bbd4d8cbedf3178ca799d75c758c054a10e..e18d749cff8864d5f900f07028b4bf7f5cb07b7a 100644
--- a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.h
+++ b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_UTIL_CREATE_PROTO_DEBUG_STRING_LIB_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_UTIL_CREATE_PROTO_DEBUG_STRING_LIB_H_
+#ifndef TENSORFLOW_CORE_UTIL_CREATE_PROTO_DEBUG_STRING_LIB_H_
+#define TENSORFLOW_CORE_UTIL_CREATE_PROTO_DEBUG_STRING_LIB_H_
 
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
@@ -50,4 +50,4 @@ ProtoTextFunctionCode GetProtoTextFunctionCode(
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_UTIL_CREATE_PROTO_DEBUG_STRING_LIB_H_
+#endif  // TENSORFLOW_CORE_UTIL_CREATE_PROTO_DEBUG_STRING_LIB_H_
diff --git a/tensorflow/tools/quantization/quantize_graph.py b/tensorflow/tools/quantization/quantize_graph.py
index a0cfc352d4f65a32dde13893dc937a72d7434e28..3acb532263d6896c3e64fe38da649bb23c0ad1e2 100644
--- a/tensorflow/tools/quantization/quantize_graph.py
+++ b/tensorflow/tools/quantization/quantize_graph.py
@@ -408,7 +408,8 @@ class GraphRewriter(object):
       for output_node in output_nodes:
         self.quantize_nodes_recursively(output_node)
     elif self.mode == "eightbit":
-      self.set_input_graph(graph_util.remove_training_nodes(self.input_graph))
+      self.set_input_graph(graph_util.remove_training_nodes(
+          self.input_graph, protected_nodes=output_node_names))
       output_nodes = [
           self.nodes_map[output_node_name]
           for output_node_name in output_node_names
diff --git a/tensorflow/tools/test/BUILD b/tensorflow/tools/test/BUILD
index 28d651e9106b29058824c06b160df2b9b5781757..159a8c1cfbdb793d05eda850afb54e860bf2614e 100644
--- a/tensorflow/tools/test/BUILD
+++ b/tensorflow/tools/test/BUILD
@@ -104,12 +104,3 @@ filegroup(
     ),
     visibility = ["//tensorflow:__subpackages__"],
 )
-
-py_test(
-    name = "check_futures_test",
-    size = "small",
-    srcs = ["check_futures_test.py"],
-    data = ["//tensorflow:all_opensource_files"],
-    srcs_version = "PY2AND3",
-    deps = ["@six_archive//:six"],
-)
diff --git a/tensorflow/tools/test/check_futures_test.py b/tensorflow/tools/test/check_futures_test.py
index 1c07511888d6f641fd2d59a9e9161174e1ef1b5c..9181c9bd4a4497dbf22a1f0935795c65533f08d8 100644
--- a/tensorflow/tools/test/check_futures_test.py
+++ b/tensorflow/tools/test/check_futures_test.py
@@ -33,7 +33,7 @@ import re
 
 import six
 
-BASE_DIR = os.path.normpath(os.path.join(__file__, '../../..'))
+BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
 FUTURES_PATTERN = re.compile(r'^from __future__ import (\w+)\s*$')
 FUTURES_PATTERN_2 = re.compile(
     r'^from __future__ import (\w+), (\w+), (\w+)\s*$')
diff --git a/tensorflow/tools/test/file_name_test.py b/tensorflow/tools/test/file_name_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..16fb8a822d09ed136cf79dd2473fc202ca632d83
--- /dev/null
+++ b/tensorflow/tools/test/file_name_test.py
@@ -0,0 +1,48 @@
+#!/usr/bin/python
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Test that checks if we have any issues with case insensitive filesystems.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
+ERROR_MESSAGE = """
+Files with same name but different case detected in directory: {}
+"""
+
+
+def main():
+  # Make sure BASE_DIR ends with tensorflow.  If it doesn't, we probably
+  # computed the wrong directory.
+  if os.path.split(BASE_DIR)[-1] != 'tensorflow':
+    raise AssertionError(
+        "BASE_DIR = '%s' doesn't end with tensorflow" % BASE_DIR)
+
+  for dirpath, dirnames, filenames in os.walk(BASE_DIR, followlinks=True):
+    lowercase_directories = [x.lower() for x in dirnames]
+    lowercase_files = [x.lower() for x in filenames]
+
+    lowercase_dir_contents = lowercase_directories + lowercase_files
+    if len(lowercase_dir_contents) != len(set(lowercase_dir_contents)):
+      raise AssertionError(ERROR_MESSAGE.format(dirpath))
+
+
+if __name__ == '__main__':
+  main()
diff --git a/tensorflow/tools/test/performance.bzl b/tensorflow/tools/test/performance.bzl
index b5c4bbf5a700aedfea7abf7f1c07a62df0155cfc..cee53dd5b61e50126948e3652865a32f45eab092 100644
--- a/tensorflow/tools/test/performance.bzl
+++ b/tensorflow/tools/test/performance.bzl
@@ -21,8 +21,9 @@ def tf_cc_logged_benchmark(
     fail(" ".join(("Target must be a single well-defined test, e.g.,",
                    "//path/to:test. Received: %s" % target)))
 
-  all_tags = list(depset(tags) + \
-                  depset(["benchmark-test", "local", "manual", "regression-test"]))
+  all_tags = (
+    depset(tags) + depset(
+      ["benchmark-test", "local", "manual", "regression-test"])).to_list()
 
   tf_py_test(
       name = name,
diff --git a/tensorflow/tools/test/run_and_gather_logs_lib.py b/tensorflow/tools/test/run_and_gather_logs_lib.py
index a953ed1b53d13504f92d2ffeb4c1ac6bcb0b8477..3b4921bb983a72223b092d99eb3fb59332fc6345 100644
--- a/tensorflow/tools/test/run_and_gather_logs_lib.py
+++ b/tensorflow/tools/test/run_and_gather_logs_lib.py
@@ -136,7 +136,7 @@ def run_and_gather_logs(name, test_name, test_args,
   gpu_config = gpu_info_lib.gather_gpu_devices()
   if gpu_config:
     gpu_name = gpu_config[0].model
-    gpu_short_name_match = re.search(r"Tesla (K40|K80|P100)", gpu_name)
+    gpu_short_name_match = re.search(r"Tesla (K40|K80|P100|V100)", gpu_name)
     if gpu_short_name_match:
       gpu_short_name = gpu_short_name_match.group(0)
       test_adjusted_name = name + "|" + gpu_short_name.replace(" ", "_")
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 6b13271002ca9bae2d08b9ed84f6e30af968bf0c..579780208cd5e4d4cefe487b491429ab6ed513ec 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -1,134 +1,63 @@
 # TensorFlow external dependencies that can be loaded in WORKSPACE files.
 
 load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
-
-load("//third_party/sycl:sycl_configure.bzl", "sycl_configure")
+load("//third_party/tensorrt:tensorrt_configure.bzl", "tensorrt_configure")
 load("//third_party/mkl:build_defs.bzl", "mkl_repository")
-load(
-    "@io_bazel_rules_closure//closure/private:java_import_external.bzl",
-    "java_import_external",
-)
-load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
+load("//third_party/git:git_configure.bzl", "git_configure")
 load("//third_party/py:python_configure.bzl", "python_configure")
-load(
-    "//third_party/toolchains/cpus/arm:arm_compiler_configure.bzl",
-    "arm_compiler_configure",
-)
-
-def _is_windows(repository_ctx):
-  """Returns true if the host operating system is windows."""
-  return repository_ctx.os.name.lower().find("windows") != -1
-
-def _get_env_var(repository_ctx, name):
-  """Find an environment variable."""
-  if name in repository_ctx.os.environ:
-    return repository_ctx.os.environ[name]
-  else:
-    return None
+load("//third_party/sycl:sycl_configure.bzl", "sycl_configure")
+load("//third_party/toolchains/clang6:repo.bzl", "clang6_configure")
+load("//third_party/toolchains/cpus/arm:arm_compiler_configure.bzl", "arm_compiler_configure")
+load("//third_party:repo.bzl", "tf_http_archive")
+load("@io_bazel_rules_closure//closure/private:java_import_external.bzl", "java_import_external")
+load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
+
+def _extract_version_number(bazel_version):
+  """Extracts the semantic version number from a version string
+
+  Args:
+    bazel_version: the version string that begins with the semantic version
+      e.g. "1.2.3rc1 abc1234" where "abc1234" is a commit hash.
+
+  Returns:
+    The semantic version string, like "1.2.3".
+  """
+  for i in range(len(bazel_version)):
+    c = bazel_version[i]
+    if not (c.isdigit() or c == "."):
+      return bazel_version[:i]
+  return bazel_version
 
 # Parse the bazel version string from `native.bazel_version`.
+# e.g.
+# "0.10.0rc1 abc123d" => (0, 10, 0)
+# "0.3.0" => (0, 3, 0)
 def _parse_bazel_version(bazel_version):
-  # Remove commit from version.
-  version = bazel_version.split(" ", 1)[0]
+  """Parses a version string into a 3-tuple of ints
+
+  int tuples can be compared directly using binary operators (<, >).
 
-  # Split into (release, date) parts and only return the release
-  # as a tuple of integers.
-  parts = version.split("-", 1)
+  Args:
+    bazel_version: the Bazel version string
 
-  # Turn "release" into a tuple of strings
-  version_tuple = ()
-  for number in parts[0].split("."):
-    version_tuple += (str(number),)
-  return version_tuple
+  Returns:
+    An int 3-tuple of a (major, minor, patch) version.
+  """
 
-# Check that a specific bazel version is being used.
-def check_version(bazel_version):
+  version = _extract_version_number(bazel_version)
+  return tuple([int(n) for n in version.split(".")])
+
+def check_bazel_version_at_least(minimum_bazel_version):
   if "bazel_version" not in dir(native):
-    fail("\nCurrent Bazel version is lower than 0.2.1, expected at least %s\n" %
-         bazel_version)
+    fail("\nCurrent Bazel version is lower than 0.2.1, expected at least %s\n" % minimum_bazel_version)
   elif not native.bazel_version:
-    print("\nCurrent Bazel is not a release version, cannot check for " +
-          "compatibility.")
-    print("Make sure that you are running at least Bazel %s.\n" % bazel_version)
-  else:
-    current_bazel_version = _parse_bazel_version(native.bazel_version)
-    minimum_bazel_version = _parse_bazel_version(bazel_version)
-    if minimum_bazel_version > current_bazel_version:
-      fail("\nCurrent Bazel version is {}, expected at least {}\n".format(
-          native.bazel_version, bazel_version))
-
-def _repos_are_siblings():
-  return Label("@foo//bar").workspace_root.startswith("../")
-
-# Temporary workaround to support including TensorFlow as a submodule until this
-# use-case is supported in the next Bazel release.
-def _temp_workaround_http_archive_impl(repo_ctx):
-  repo_ctx.template("BUILD", repo_ctx.attr.build_file, {
-      "%prefix%": ".." if _repos_are_siblings() else "external",
-      "%ws%": repo_ctx.attr.repository
-  }, False)
-  repo_ctx.download_and_extract(repo_ctx.attr.urls, "", repo_ctx.attr.sha256,
-                                "", repo_ctx.attr.strip_prefix)
-  if repo_ctx.attr.patch_file != None:
-    _apply_patch(repo_ctx, repo_ctx.attr.patch_file)
-
-temp_workaround_http_archive = repository_rule(
-    attrs = {
-        "build_file": attr.label(),
-        "repository": attr.string(),
-        "patch_file": attr.label(default = None),
-        "urls": attr.string_list(default = []),
-        "sha256": attr.string(default = ""),
-        "strip_prefix": attr.string(default = ""),
-    },
-    implementation = _temp_workaround_http_archive_impl,
-)
-
-# Executes specified command with arguments and calls 'fail' if it exited with
-# non-zero code
-def _execute_and_check_ret_code(repo_ctx, cmd_and_args):
-  result = repo_ctx.execute(cmd_and_args, timeout=10)
-  if result.return_code != 0:
-    fail(("Non-zero return code({1}) when executing '{0}':\n" + "Stdout: {2}\n"
-          + "Stderr: {3}").format(" ".join(cmd_and_args), result.return_code,
-                                  result.stdout, result.stderr))
-
-# Apply a patch_file to the repository root directory
-# Runs 'patch -p1'
-def _apply_patch(repo_ctx, patch_file):
-  # Don't check patch on Windows, because patch is only available under bash.
-  if not _is_windows(repo_ctx) and not repo_ctx.which("patch"):
-    fail("patch command is not found, please install it")
-
-  cmd = [
-      "patch", "-p1", "-d", repo_ctx.path("."), "-i", repo_ctx.path(patch_file)
-  ]
-  if _is_windows(repo_ctx):
-    bazel_sh = _get_env_var(repo_ctx, "BAZEL_SH")
-    if not bazel_sh:
-      fail("BAZEL_SH environment variable is not set")
-    cmd = [bazel_sh, "-l", "-c", " ".join(cmd)]
-  _execute_and_check_ret_code(repo_ctx, cmd)
-
-# Download the repository and apply a patch to its root
-def _patched_http_archive_impl(repo_ctx):
-  repo_ctx.download_and_extract(
-      repo_ctx.attr.urls,
-      sha256=repo_ctx.attr.sha256,
-      stripPrefix=repo_ctx.attr.strip_prefix)
-  _apply_patch(repo_ctx, repo_ctx.attr.patch_file)
-
-patched_http_archive = repository_rule(
-    attrs = {
-        "patch_file": attr.label(),
-        "build_file": attr.label(),
-        "repository": attr.string(),
-        "urls": attr.string_list(default = []),
-        "sha256": attr.string(default = ""),
-        "strip_prefix": attr.string(default = ""),
-    },
-    implementation = _patched_http_archive_impl,
-)
+    print("\nCurrent Bazel is not a release version, cannot check for compatibility.")
+    print("Make sure that you are running at least Bazel %s.\n" % minimum_bazel_version)
+    return
+
+  if _parse_bazel_version(native.bazel_version) < _parse_bazel_version(minimum_bazel_version):
+    fail("\nCurrent Bazel version is {}, expected at least {}\n".format(
+        native.bazel_version, minimum_bazel_version))
 
 # If TensorFlow is linked as a submodule.
 # path_prefix is no longer used.
@@ -137,8 +66,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   # We must check the bazel version before trying to parse any other BUILD
   # files, in case the parsing of those build files depends on the bazel
   # version we require here.
-  check_version("0.5.4")
+  check_bazel_version_at_least("0.5.4")
+  clang6_configure(name="local_config_clang6")
   cuda_configure(name="local_config_cuda")
+  tensorrt_configure(name="local_config_tensorrt")
+  git_configure(name="local_config_git")
   sycl_configure(name="local_config_sycl")
   python_configure(name="local_config_python")
 
@@ -151,63 +83,65 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   mkl_repository(
       name = "mkl",
       urls = [
-          "https://mirror.bazel.build/github.com/01org/mkl-dnn/releases/download/v0.9/mklml_lnx_2018.0.20170720.tgz",
-          "https://github.com/01org/mkl-dnn/releases/download/v0.9/mklml_lnx_2018.0.20170720.tgz",
+          "https://mirror.bazel.build/github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz",
+          "https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz",
       ],
-      sha256 = "57ba56c4c243f403ff78f417ff854ef50b9eddf4a610a917b7c95e7fa8553a4b",
-      strip_prefix = "mklml_lnx_2018.0.20170720",
+      sha256 = "6b07cb7e5451db67c2e31e785ae458b18f7f363c60a61685488f69e9ae7199d4",
+      strip_prefix = "mklml_lnx_2018.0.1.20171007",
       build_file = str(Label("//third_party/mkl:mkl.BUILD")),
-      repository = tf_repo_name,
   )
 
   if path_prefix:
     print("path_prefix was specified to tf_workspace but is no longer used " +
           "and will be removed in the future.")
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "mkl_dnn",
       urls = [
-          "https://mirror.bazel.build/github.com/01org/mkl-dnn/archive/b01e3a55a07be62172e713bcd2644c5176360212.tar.gz",
-          "https://github.com/01org/mkl-dnn/archive/b01e3a55a07be62172e713bcd2644c5176360212.tar.gz",
+          "https://mirror.bazel.build/github.com/01org/mkl-dnn/archive/e0bfcaa7fcb2b1e1558f5f0676933c1db807a729.tar.gz",
+          "https://github.com/01org/mkl-dnn/archive/e0bfcaa7fcb2b1e1558f5f0676933c1db807a729.tar.gz",
       ],
-      sha256 = "0d529ad4c49dc799e6df07c2b88b115d0668735da15fb3b3862d28d33fa68165",
-      strip_prefix = "mkl-dnn-b01e3a55a07be62172e713bcd2644c5176360212",
+      sha256 = "02e244f63dd95402691a361392504c143eede9a89043426f174836638a9cbf09",
+      strip_prefix = "mkl-dnn-e0bfcaa7fcb2b1e1558f5f0676933c1db807a729",
       build_file = str(Label("//third_party/mkl_dnn:mkldnn.BUILD")),
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "com_google_absl",
       urls = [
-          "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/cc4bed2d74f7c8717e31f9579214ab52a9c9c610.tar.gz",
-          "https://github.com/abseil/abseil-cpp/archive/cc4bed2d74f7c8717e31f9579214ab52a9c9c610.tar.gz",
+          "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/720c017e30339fd1786ce4aac68bc8559736e53f.tar.gz",
+          "https://github.com/abseil/abseil-cpp/archive/720c017e30339fd1786ce4aac68bc8559736e53f.tar.gz",
       ],
-     sha256 = "f1a7349f88d2846210c42e2f7271dabeee404c2a3b4198e34a797993e3569b03",
-     strip_prefix = "abseil-cpp-cc4bed2d74f7c8717e31f9579214ab52a9c9c610",
+     sha256 = "5996380e3e8b981f55d1c8d58e709c00dbb4806ba367be75d0925a68cc2f6478",
+     strip_prefix = "abseil-cpp-720c017e30339fd1786ce4aac68bc8559736e53f",
+     build_file = str(Label("//third_party:com_google_absl.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "eigen_archive",
       urls = [
-          "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/429aa5254200.tar.gz",
-          "https://bitbucket.org/eigen/eigen/get/429aa5254200.tar.gz",
+          "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/2355b229ea4c.tar.gz",
+          "https://bitbucket.org/eigen/eigen/get/2355b229ea4c.tar.gz",
       ],
-      sha256 = "61d8b6fc4279dd1dda986fb1677d15e3d641c07a3ea5abe255790b1f0c0c14e9",
-      strip_prefix = "eigen-eigen-429aa5254200",
+      sha256 = "0cadb31a35b514bf2dfd6b5d38205da94ef326ec6908fc3fd7c269948467214f",
+      strip_prefix = "eigen-eigen-2355b229ea4c",
       build_file = str(Label("//third_party:eigen.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "arm_compiler",
-      build_file = str(Label("//:arm_compiler.BUILD")),
       sha256 = "970285762565c7890c6c087d262b0a18286e7d0384f13a37786d8521773bc969",
       strip_prefix = "tools-0e906ebc527eab1cdbf7adabff5b474da9562e9f/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf",
       urls = [
           "https://mirror.bazel.build/github.com/raspberrypi/tools/archive/0e906ebc527eab1cdbf7adabff5b474da9562e9f.tar.gz",
+          # Please uncomment me, when the next upgrade happens. Then
+          # remove the whitelist entry in third_party/repo.bzl.
           # "https://github.com/raspberrypi/tools/archive/0e906ebc527eab1cdbf7adabff5b474da9562e9f.tar.gz",
       ],
+      build_file = str(Label("//:arm_compiler.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "libxsmm_archive",
       urls = [
           "https://mirror.bazel.build/github.com/hfp/libxsmm/archive/1.8.1.tar.gz",
@@ -218,15 +152,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:libxsmm.BUILD")),
   )
 
-  native.bind(
-      name = "xsmm_avx",
-      actual = "@libxsmm_archive//third_party:xsmm_avx",
-  )
-
-  native.new_http_archive(
+  tf_http_archive(
       name = "ortools_archive",
       urls = [
           "https://mirror.bazel.build/github.com/google/or-tools/archive/253f7955c6a1fd805408fba2e42ac6d45b312d15.tar.gz",
+          # Please uncomment me, when the next upgrade happens. Then
+          # remove the whitelist entry in third_party/repo.bzl.
           # "https://github.com/google/or-tools/archive/253f7955c6a1fd805408fba2e42ac6d45b312d15.tar.gz",
       ],
       sha256 = "932075525642b04ac6f1b50589f1df5cd72ec2f448b721fd32234cf183f0e755",
@@ -234,27 +165,28 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:ortools.BUILD")),
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "com_googlesource_code_re2",
       urls = [
-          "https://mirror.bazel.build/github.com/google/re2/archive/b94b7cd42e9f02673cd748c1ac1d16db4052514c.tar.gz",
-          "https://github.com/google/re2/archive/b94b7cd42e9f02673cd748c1ac1d16db4052514c.tar.gz",
+          "https://mirror.bazel.build/github.com/google/re2/archive/26cd968b735e227361c9703683266f01e5df7857.tar.gz",
+          "https://github.com/google/re2/archive/26cd968b735e227361c9703683266f01e5df7857.tar.gz",
+
       ],
-      sha256 = "bd63550101e056427c9e7ff12a408c1c8b74e9803f393ca916b2926fc2c4906f",
-      strip_prefix = "re2-b94b7cd42e9f02673cd748c1ac1d16db4052514c",
+      sha256 = "e57eeb837ac40b5be37b2c6197438766e73343ffb32368efea793dfd8b28653b",
+      strip_prefix = "re2-26cd968b735e227361c9703683266f01e5df7857",
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "gemmlowp",
       urls = [
-          "https://mirror.bazel.build/github.com/google/gemmlowp/archive/010bb3e71a26ca1d0884a167081d092b43563996.zip",
-          "https://github.com/google/gemmlowp/archive/010bb3e71a26ca1d0884a167081d092b43563996.zip",
+          "https://mirror.bazel.build/github.com/google/gemmlowp/archive/d4d1e29a62192d8defdc057b913ef36ca582ac98.zip",
+          "https://github.com/google/gemmlowp/archive/d4d1e29a62192d8defdc057b913ef36ca582ac98.zip",
       ],
-      sha256 = "dd2557072bde12141419cb8320a9c25e6ec41a8ae53c2ac78c076a347bb46d9d",
-      strip_prefix = "gemmlowp-010bb3e71a26ca1d0884a167081d092b43563996",
+      sha256 = "e2bee7afd3c43028f23dd0d7f85ddd8b21aaf79c572b658e56164ef502b2b9c7",
+      strip_prefix = "gemmlowp-d4d1e29a62192d8defdc057b913ef36ca582ac98",
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "farmhash_archive",
       urls = [
           "https://mirror.bazel.build/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz",
@@ -265,12 +197,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:farmhash.BUILD")),
   )
 
-  native.bind(
-      name = "farmhash",
-      actual = "@farmhash//:farmhash",
-  )
-
-  native.new_http_archive(
+  tf_http_archive(
       name = "highwayhash",
       urls = [
           "https://mirror.bazel.build/github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz",
@@ -281,7 +208,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:highwayhash.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "nasm",
       urls = [
           "https://mirror.bazel.build/www.nasm.us/pub/nasm/releasebuilds/2.12.02/nasm-2.12.02.tar.bz2",
@@ -292,7 +219,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:nasm.BUILD")),
   )
 
-  temp_workaround_http_archive(
+  tf_http_archive(
       name = "jpeg",
       urls = [
           "https://mirror.bazel.build/github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
@@ -301,10 +228,9 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       sha256 = "c15a9607892113946379ccea3ca8b85018301b200754f209453ab21674268e77",
       strip_prefix = "libjpeg-turbo-1.5.1",
       build_file = str(Label("//third_party/jpeg:jpeg.BUILD")),
-      repository = tf_repo_name,
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "png_archive",
       urls = [
           "https://mirror.bazel.build/github.com/glennrp/libpng/archive/v1.2.53.tar.gz",
@@ -315,18 +241,18 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:png.BUILD")),
   )
 
-  native.new_http_archive(
-      name = "sqlite_archive",
+  tf_http_archive(
+      name = "org_sqlite",
       urls = [
           "https://mirror.bazel.build/www.sqlite.org/2017/sqlite-amalgamation-3200000.zip",
           "http://www.sqlite.org/2017/sqlite-amalgamation-3200000.zip",
       ],
       sha256 = "208780b3616f9de0aeb50822b7a8f5482f6515193859e91ed61637be6ad74fd4",
       strip_prefix = "sqlite-amalgamation-3200000",
-      build_file = str(Label("//third_party:sqlite.BUILD"))
+      build_file = str(Label("//third_party:sqlite.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "gif_archive",
       urls = [
           "https://mirror.bazel.build/ufpr.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz",
@@ -337,7 +263,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:gif.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "six_archive",
       urls = [
           "https://mirror.bazel.build/pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz",
@@ -348,17 +274,50 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:six.BUILD")),
   )
 
-  native.http_archive(
+  tf_http_archive(
+      name = "astor_archive",
+      urls = [
+          "https://mirror.bazel.build/pypi.python.org/packages/d8/be/c4276b3199ec3feee2a88bc64810fbea8f26d961e0a4cd9c68387a9f35de/astor-0.6.2.tar.gz",
+          "https://pypi.python.org/packages/d8/be/c4276b3199ec3feee2a88bc64810fbea8f26d961e0a4cd9c68387a9f35de/astor-0.6.2.tar.gz",
+      ],
+      sha256 = "ff6d2e2962d834acb125cc4dcc80c54a8c17c253f4cc9d9c43b5102a560bb75d",
+      strip_prefix = "astor-0.6.2",
+      build_file = str(Label("//third_party:astor.BUILD")),
+  )
+
+  tf_http_archive(
+      name = "gast_archive",
+      urls = [
+          "https://mirror.bazel.build/pypi.python.org/packages/5c/78/ff794fcae2ce8aa6323e789d1f8b3b7765f601e7702726f430e814822b96/gast-0.2.0.tar.gz",
+          "https://pypi.python.org/packages/5c/78/ff794fcae2ce8aa6323e789d1f8b3b7765f601e7702726f430e814822b96/gast-0.2.0.tar.gz",
+      ],
+      sha256 = "7068908321ecd2774f145193c4b34a11305bd104b4551b09273dfd1d6a374930",
+      strip_prefix = "gast-0.2.0",
+      build_file = str(Label("//third_party:gast.BUILD")),
+  )
+
+  tf_http_archive(
+      name = "termcolor_archive",
+      urls = [
+          "https://mirror.bazel.build/pypi.python.org/packages/8a/48/a76be51647d0eb9f10e2a4511bf3ffb8cc1e6b14e9e4fab46173aa79f981/termcolor-1.1.0.tar.gz",
+          "https://pypi.python.org/packages/8a/48/a76be51647d0eb9f10e2a4511bf3ffb8cc1e6b14e9e4fab46173aa79f981/termcolor-1.1.0.tar.gz",
+      ],
+      sha256 = "1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b",
+      strip_prefix = "termcolor-1.1.0",
+      build_file = str(Label("//third_party:termcolor.BUILD")),
+  )
+
+  tf_http_archive(
       name = "absl_py",
       urls = [
-          "https://mirror.bazel.build/github.com/abseil/abseil-py/archive/231e3870b976c1dc61dce1749138661d21556028.tar.gz",
-          "https://github.com/abseil/abseil-py/archive/231e3870b976c1dc61dce1749138661d21556028.tar.gz",
+          "https://mirror.bazel.build/github.com/abseil/abseil-py/archive/acec853355ef987eae48a8d87a79351c15dff593.tar.gz",
+          "https://github.com/abseil/abseil-py/archive/acec853355ef987eae48a8d87a79351c15dff593.tar.gz",
       ],
-      sha256 = "8ea2b23bfdb9ae7622f3e5d95236bc600c8d8509a2f38c84732b3145585d4f73",
-      strip_prefix = "abseil-py-231e3870b976c1dc61dce1749138661d21556028",
+      sha256 = "29e4584e778bee13aa4093824133d131d927cc160561892880118d9ff7b95a6a",
+      strip_prefix = "abseil-py-acec853355ef987eae48a8d87a79351c15dff593",
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "org_python_pypi_backports_weakref",
       urls = [
           "https://mirror.bazel.build/pypi.python.org/packages/bc/cc/3cdb0a02e7e96f6c70bd971bc8a90b8463fda83e264fa9c5c1c98ceabd81/backports.weakref-1.0rc1.tar.gz",
@@ -369,7 +328,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:backports_weakref.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "com_github_andreif_codegen",
       urls = [
           "https://mirror.bazel.build/github.com/andreif/codegen/archive/1.0.tar.gz",
@@ -391,70 +350,50 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       },
   )
 
-  native.bind(
-      name = "six",
-      actual = "@six_archive//:six",
-  )
-
-  patched_http_archive(
+  tf_http_archive(
       name = "protobuf_archive",
       urls = [
-          "https://mirror.bazel.build/github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
-          "https://github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
+          "https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz",
+          "https://github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz",
       ],
-      sha256 = "e178a25c52efcb6b05988bdbeace4c0d3f2d2fe5b46696d1d9898875c3803d6a",
-      strip_prefix = "protobuf-b04e5cba356212e4e8c66c61bbe0c3a20537c5b9",
-      # TODO: remove patching when tensorflow stops linking same protos into
-      #       multiple shared libraries loaded in runtime by python.
-      #       This patch fixes a runtime crash when tensorflow is compiled
-      #       with clang -O2 on Linux (see https://github.com/tensorflow/tensorflow/issues/8394)
-      patch_file = str(Label("//third_party/protobuf:add_noinlines.patch")),
-  )
-
-  native.bind(
-      name = "protobuf",
-      actual = "@protobuf_archive//:protobuf",
-  )
-
-  native.bind(
-      name = "protobuf_headers",
-      actual = "@protobuf_archive//:protobuf_headers",
+      sha256 = "846d907acf472ae233ec0882ef3a2d24edbbe834b80c305e867ac65a1f2c59e3",
+      strip_prefix = "protobuf-396336eb961b75f03b25824fe86cf6490fb75e3a",
   )
 
   # We need to import the protobuf library under the names com_google_protobuf
   # and com_google_protobuf_cc to enable proto_library support in bazel.
   # Unfortunately there is no way to alias http_archives at the moment.
-  native.http_archive(
+  tf_http_archive(
       name = "com_google_protobuf",
       urls = [
-          "https://mirror.bazel.build/github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
-          "https://github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
+          "https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz",
+          "https://github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz",
       ],
-      sha256 = "e178a25c52efcb6b05988bdbeace4c0d3f2d2fe5b46696d1d9898875c3803d6a",
-      strip_prefix = "protobuf-b04e5cba356212e4e8c66c61bbe0c3a20537c5b9",
+      sha256 = "846d907acf472ae233ec0882ef3a2d24edbbe834b80c305e867ac65a1f2c59e3",
+      strip_prefix = "protobuf-396336eb961b75f03b25824fe86cf6490fb75e3a",
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "com_google_protobuf_cc",
       urls = [
-          "https://mirror.bazel.build/github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
-          "https://github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
+          "https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz",
+          "https://github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz",
       ],
-      sha256 = "e178a25c52efcb6b05988bdbeace4c0d3f2d2fe5b46696d1d9898875c3803d6a",
-      strip_prefix = "protobuf-b04e5cba356212e4e8c66c61bbe0c3a20537c5b9",
+      sha256 = "846d907acf472ae233ec0882ef3a2d24edbbe834b80c305e867ac65a1f2c59e3",
+      strip_prefix = "protobuf-396336eb961b75f03b25824fe86cf6490fb75e3a",
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "nsync",
       urls = [
-          "https://mirror.bazel.build/github.com/google/nsync/archive/93815892dddafe9146a5f7e7042281d59d0f4323.tar.gz",
-          "https://github.com/google/nsync/archive/93815892dddafe9146a5f7e7042281d59d0f4323.tar.gz",
+          "https://mirror.bazel.build/github.com/google/nsync/archive/8502189abfa44c249c01c2cad64e6ed660a9a668.tar.gz",
+          "https://github.com/google/nsync/archive/8502189abfa44c249c01c2cad64e6ed660a9a668.tar.gz",
       ],
-      sha256 = "e3bd4555415ace511338fc27e595351738eea4e9006f1612b76c82914770716b",
-      strip_prefix = "nsync-93815892dddafe9146a5f7e7042281d59d0f4323",
+      sha256 = "51f81ff4202bbb820cdbedc061bd2eb6765f2b5c06489e7a8694bedac329e8f8",
+      strip_prefix = "nsync-8502189abfa44c249c01c2cad64e6ed660a9a668",
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "com_google_googletest",
       urls = [
           "https://mirror.bazel.build/github.com/google/googletest/archive/9816b96a6ddc0430671693df90192bbee57108b6.zip",
@@ -464,7 +403,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       strip_prefix = "googletest-9816b96a6ddc0430671693df90192bbee57108b6",
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "com_github_gflags_gflags",
       urls = [
           "https://mirror.bazel.build/github.com/gflags/gflags/archive/f8a0efe03aa69b3336d8e228b37d4ccb17324b88.tar.gz",
@@ -474,12 +413,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       strip_prefix = "gflags-f8a0efe03aa69b3336d8e228b37d4ccb17324b88",
   )
 
-  native.bind(
-      name = "python_headers",
-      actual = str(Label("//util/python:python_headers")),
-  )
-
-  native.new_http_archive(
+  tf_http_archive(
       name = "pcre",
       sha256 = "ccdf7e788769838f8285b3ee672ed573358202305ee361cfec7a4a4fb005bbc7",
       urls = [
@@ -490,7 +424,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:pcre.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "swig",
       sha256 = "58a475dbbd4a4d7075e5fe86d4e54c9edde39847cdb96a3053d87cb64a23a453",
       urls = [
@@ -502,7 +436,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:swig.BUILD")),
   )
 
-  temp_workaround_http_archive(
+  tf_http_archive(
       name = "curl",
       sha256 = "ff3e80c1ca6a068428726cd7dd19037a47cc538ce58ef61c59587191039b2ca6",
       urls = [
@@ -511,58 +445,19 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       strip_prefix = "curl-7.49.1",
       build_file = str(Label("//third_party:curl.BUILD")),
-      repository = tf_repo_name
-  )
-
-  # grpc expects //external:protobuf_clib and //external:protobuf_compiler
-  # to point to the protobuf's compiler library.
-  native.bind(
-      name = "protobuf_clib",
-      actual = "@protobuf_archive//:protoc_lib",
-  )
-
-  native.bind(
-      name = "libssl",
-      actual = "@boringssl//:ssl",
   )
 
-  # gRPC has includes directly from their third_party path for nanopb, so we
-  # must depend on their version of it.
-  native.bind(
-      name = "nanopb",
-      actual = "@grpc//third_party/nanopb:nanopb",
-  )
-
-  native.http_archive(
+  tf_http_archive(
       name = "grpc",
       urls = [
-          "https://mirror.bazel.build/github.com/grpc/grpc/archive/54e8f37e537794c2d814c1604c1282125f64f093.tar.gz",
-          "https://github.com/grpc/grpc/archive/54e8f37e537794c2d814c1604c1282125f64f093.tar.gz",
+          "https://mirror.bazel.build/github.com/grpc/grpc/archive/730b778632e79cc3c96ad237f282d687ee325ce7.tar.gz",
+          "https://github.com/grpc/grpc/archive/730b778632e79cc3c96ad237f282d687ee325ce7.tar.gz",
       ],
-      sha256 = "c2166b6d96daddf72fe45b2c594210c65ca17ec3c1b2e12089159a9529edb5e4",
-      strip_prefix = "grpc-54e8f37e537794c2d814c1604c1282125f64f093",
-  )
-
-  # gRPC wants the existence of a cares dependence but its contents are not
-  # actually important since we have set GRPC_ARES=0 in tools/bazel.rc
-  native.bind(
-      name = "cares",
-      actual = "@grpc//third_party/nanopb:nanopb",
-  )
-
-  # protobuf expects //external:grpc_cpp_plugin to point to grpc's
-  # C++ plugin code generator.
-  native.bind(
-      name = "grpc_cpp_plugin",
-      actual = "@grpc//:grpc_cpp_plugin",
-  )
-
-  native.bind(
-      name = "grpc_lib",
-      actual = "@grpc//:grpc++_unsecure",
+      sha256 = "8c91a8d12e1e868cf51f7340b75507a8aa017a7e1b56f46ed6816aeb803dc9bd",
+      strip_prefix = "grpc-730b778632e79cc3c96ad237f282d687ee325ce7",
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "linenoise",
       sha256 = "7f51f45887a3d31b4ce4fa5965210a5e64637ceac12720cfce7954d6a2e812f7",
       urls = [
@@ -575,19 +470,18 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
 
   # TODO(phawkins): currently, this rule uses an unofficial LLVM mirror.
   # Switch to an official source of snapshots if/when possible.
-  temp_workaround_http_archive(
+  tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/8d26b8bee4d8e7230870a600bc968c7ee8cf6f67.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/8d26b8bee4d8e7230870a600bc968c7ee8cf6f67.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/ba2e473a530286f386d18a95c9de4d673d4a21dc.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/ba2e473a530286f386d18a95c9de4d673d4a21dc.tar.gz",
       ],
-      sha256 = "ff5ddbe5af5e264426c8d489e7fddfc5ad7e0975f19cefe9db8c0a5d0faeb23e",
-      strip_prefix = "llvm-8d26b8bee4d8e7230870a600bc968c7ee8cf6f67",
+      sha256 = "0885a7c01220d2a96aeef4ff9aee016837150af839956d18af1845ea1acd0105",
+      strip_prefix = "llvm-ba2e473a530286f386d18a95c9de4d673d4a21dc",
       build_file = str(Label("//third_party/llvm:llvm.BUILD")),
-      repository = tf_repo_name,
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "lmdb",
       urls = [
           "https://mirror.bazel.build/github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz",
@@ -598,7 +492,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:lmdb.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "jsoncpp_git",
       urls = [
           "https://mirror.bazel.build/github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz",
@@ -609,12 +503,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:jsoncpp.BUILD")),
   )
 
-  native.bind(
-      name = "jsoncpp",
-      actual = "@jsoncpp_git//:jsoncpp",
-  )
-
-  native.http_archive(
+  tf_http_archive(
       name = "boringssl",
       urls = [
           "https://mirror.bazel.build/github.com/google/boringssl/archive/a0fb951d2a26a8ee746b52f3ba81ab011a0af778.tar.gz",
@@ -624,7 +513,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       strip_prefix = "boringssl-a0fb951d2a26a8ee746b52f3ba81ab011a0af778",
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "zlib_archive",
       urls = [
           "https://mirror.bazel.build/zlib.net/zlib-1.2.8.tar.gz",
@@ -635,12 +524,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:zlib.BUILD")),
   )
 
-  native.bind(
-      name = "zlib",
-      actual = "@zlib_archive//:zlib",
-  )
-
-  native.new_http_archive(
+  tf_http_archive(
       name = "fft2d",
       urls = [
           "https://mirror.bazel.build/www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz",
@@ -650,7 +534,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party/fft2d:fft2d.BUILD")),
   )
 
-  temp_workaround_http_archive(
+  tf_http_archive(
       name = "snappy",
       urls = [
           "https://mirror.bazel.build/github.com/google/snappy/archive/1.1.4.tar.gz",
@@ -659,10 +543,9 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       sha256 = "2f7504c73d85bac842e893340333be8cb8561710642fc9562fccdd9d2c3fcc94",
       strip_prefix = "snappy-1.1.4",
       build_file = str(Label("//third_party:snappy.BUILD")),
-      repository = tf_repo_name,
   )
 
-  temp_workaround_http_archive(
+  tf_http_archive(
       name = "nccl_archive",
       urls = [
           "https://mirror.bazel.build/github.com/nvidia/nccl/archive/03d856977ecbaac87e598c0c4bafca96761b9ac7.tar.gz",
@@ -671,19 +554,29 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       sha256 = "2ca86fb6179ecbff789cc67c836139c1bbc0324ed8c04643405a30bf26325176",
       strip_prefix = "nccl-03d856977ecbaac87e598c0c4bafca96761b9ac7",
       build_file = str(Label("//third_party:nccl.BUILD")),
-      repository = tf_repo_name,
   )
 
-  temp_workaround_http_archive(
+  tf_http_archive(
+      name = "kafka",
+      urls = [
+          "https://mirror.bazel.build/github.com/edenhill/librdkafka/archive/v0.11.1.tar.gz",
+          "https://github.com/edenhill/librdkafka/archive/v0.11.1.tar.gz",
+      ],
+      sha256 = "dd035d57c8f19b0b612dd6eefe6e5eebad76f506e302cccb7c2066f25a83585e",
+      strip_prefix = "librdkafka-0.11.1",
+      build_file = str(Label("//third_party:kafka/BUILD")),
+      patch_file = str(Label("//third_party/kafka:config.patch")),
+  )
+
+  tf_http_archive(
       name = "aws",
       urls = [
-          "https://mirror.bazel.build/github.com/aws/aws-sdk-cpp/archive/1.0.90.tar.gz",
-          "https://github.com/aws/aws-sdk-cpp/archive/1.0.90.tar.gz",
+          "https://mirror.bazel.build/github.com/aws/aws-sdk-cpp/archive/1.3.15.tar.gz",
+          "https://github.com/aws/aws-sdk-cpp/archive/1.3.15.tar.gz",
       ],
-      sha256 = "f599b57aec4f03ad696044dd430b2d201864113937353adc346f53ad47991319",
-      strip_prefix = "aws-sdk-cpp-1.0.90",
+      sha256 = "b888d8ce5fc10254c3dd6c9020c7764dd53cf39cf011249d0b4deda895de1b7c",
+      strip_prefix = "aws-sdk-cpp-1.3.15",
       build_file = str(Label("//third_party:aws.BUILD")),
-      repository = tf_repo_name
   )
 
   java_import_external(
@@ -711,7 +604,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       testonly_ = True,
   )
 
-  temp_workaround_http_archive(
+  tf_http_archive(
       name = "jemalloc",
       urls = [
           "https://mirror.bazel.build/github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
@@ -720,7 +613,6 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       sha256 = "3c8f25c02e806c3ce0ab5fb7da1817f89fc9732709024e2a81b6b82f7cc792a8",
       strip_prefix = "jemalloc-4.4.0",
       build_file = str(Label("//third_party:jemalloc.BUILD")),
-      repository = tf_repo_name,
   )
 
   java_import_external(
@@ -758,7 +650,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       licenses = ["notice"],  # Apache 2.0
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "com_google_pprof",
       urls = [
           "https://mirror.bazel.build/github.com/google/pprof/archive/c0fb62ec88c411cc91194465e54db2632845b650.tar.gz",
@@ -769,7 +661,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:pprof.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "cub_archive",
       urls = [
           "https://mirror.bazel.build/github.com/NVlabs/cub/archive/1.7.4.zip",
@@ -778,14 +670,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       sha256 = "20a1a39fd97e5da7f40f5f2e7fd73fd2ea59f9dc4bb8a6c5f228aa543e727e31",
       strip_prefix = "cub-1.7.4",
       build_file = str(Label("//third_party:cub.BUILD")),
+      # TODO: remove the patch when upstream fix is accepted and released.
+      #       PR with a fix: https://github.com/NVlabs/cub/pull/125
+      patch_file = str(Label("//third_party/cub:fix_compilation_in_clang.patch")),
   )
 
-  native.bind(
-      name = "cub",
-      actual = "@cub_archive//:cub",
-  )
-
-  native.new_http_archive(
+  tf_http_archive(
       name = "cython",
       sha256 = "6dcd30b5ceb887b2b965ee7ceb82ea3acb5f0642fe2206c7636b45acea4798e5",
       urls = [
@@ -794,19 +684,20 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       strip_prefix = "cython-3732784c45cfb040a5b0936951d196f83a12ea17",
       build_file = str(Label("//third_party:cython.BUILD")),
+      delete = ["BUILD.bazel"],
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "bazel_toolchains",
       urls = [
-          "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/af4681c3d19f063f090222ec3d04108c4e0ca255.tar.gz",
-          "https://github.com/bazelbuild/bazel-toolchains/archive/af4681c3d19f063f090222ec3d04108c4e0ca255.tar.gz",
+          "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/f3b09700fae5d7b6e659d7cefe0dcc6e8498504c.tar.gz",
+          "https://github.com/bazelbuild/bazel-toolchains/archive/f3b09700fae5d7b6e659d7cefe0dcc6e8498504c.tar.gz",
       ],
-      sha256 = "d58bb2d6c8603f600d522b6104d6192a65339aa26cbba9f11ff5c4b36dedb928",
-      strip_prefix = "bazel-toolchains-af4681c3d19f063f090222ec3d04108c4e0ca255",
+      sha256 = "ed829b5eea8af1f405f4cc3d6ecfc3b1365bb7843171036030a31b5127002311",
+      strip_prefix = "bazel-toolchains-f3b09700fae5d7b6e659d7cefe0dcc6e8498504c",
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "arm_neon_2_x86_sse",
       sha256 = "c8d90aa4357f8079d427e87a6f4c493da1fa4140aee926c05902d7ec1533d9a5",
       strip_prefix = "ARM_NEON_2_x86_SSE-0f77d9d182265259b135dad949230ecbf1a2633d",
@@ -817,32 +708,109 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:arm_neon_2_x86_sse.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "flatbuffers",
-      build_file = str(Label("//third_party/flatbuffers:flatbuffers.BUILD")),
       strip_prefix = "flatbuffers-971a68110e4fc1bace10fcb6deeb189e7e1a34ce",
       sha256 = "874088d2ee0d9f8524191f77209556415f03dd44e156276edf19e5b90ceb5f55",
       urls = [
           "https://mirror.bazel.build/github.com/google/flatbuffers/archive/971a68110e4fc1bace10fcb6deeb189e7e1a34ce.tar.gz",
           "https://github.com/google/flatbuffers/archive/971a68110e4fc1bace10fcb6deeb189e7e1a34ce.tar.gz",
       ],
+      build_file = str(Label("//third_party/flatbuffers:flatbuffers.BUILD")),
   )
 
-  native.http_archive(
-      name = "double_conversion",
-      urls = [
-          "https://github.com/google/double-conversion/archive/5664746c5e64dc265e7fbc1a890a6698e6ad0ebb.zip",
-      ],
-      sha256 = "a0c49fb3cc8d34b2230d278a115f1bb266bcfcaae10400b84dc2a3b7dc2c8bc6",
-      strip_prefix = "double-conversion-5664746c5e64dc265e7fbc1a890a6698e6ad0ebb",
-  )
-  
-  native.new_http_archive(
+  tf_http_archive(
       name = "tflite_mobilenet",
-      build_file = str(Label("//third_party:tflite_mobilenet.BUILD")),
       sha256 = "23f814d1c076bdf03715dfb6cab3713aa4fbdf040fd5448c43196bd2e97a4c1b",
       urls = [
           "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip",
           "https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip",
       ],
+      build_file = str(Label("//third_party:tflite_mobilenet.BUILD")),
+  )
+
+  tf_http_archive(
+      name = "tflite_smartreply",
+      sha256 = "8980151b85a87a9c1a3bb1ed4748119e4a85abd3cb5744d83da4d4bd0fbeef7c",
+      urls = [
+          "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/smartreply_1.0_2017_11_01.zip",
+          "https://storage.googleapis.com/download.tensorflow.org/models/tflite/smartreply_1.0_2017_11_01.zip"
+      ],
+      build_file = str(Label("//third_party:tflite_smartreply.BUILD")),
+  )
+
+  ##############################################################################
+  # BIND DEFINITIONS
+  #
+  # Please do not add bind() definitions unless we have no other choice.
+  # If that ends up being the case, please leave a comment explaining
+  # why we can't depend on the canonical build target.
+
+  # gRPC wants a cares dependency but its contents is not actually
+  # important since we have set GRPC_ARES=0 in tools/bazel.rc
+  native.bind(
+      name = "cares",
+      actual = "@grpc//third_party/nanopb:nanopb",
+  )
+
+  # Needed by Protobuf
+  native.bind(
+      name = "grpc_cpp_plugin",
+      actual = "@grpc//:grpc_cpp_plugin",
+  )
+
+  # gRPC has three empty C++ functions which it wants the user to define
+  # at build time. https://github.com/grpc/grpc/issues/13590
+  native.bind(
+      name = "grpc_lib",
+      actual = "@grpc//:grpc++_unsecure",
+  )
+
+  # Needed by gRPC
+  native.bind(
+      name = "libssl",
+      actual = "@boringssl//:ssl",
+  )
+
+  # Needed by gRPC
+  native.bind(
+      name = "nanopb",
+      actual = "@grpc//third_party/nanopb:nanopb",
+  )
+
+  # Needed by gRPC
+  native.bind(
+      name = "protobuf",
+      actual = "@protobuf_archive//:protobuf",
+  )
+
+  # gRPC expects //external:protobuf_clib and //external:protobuf_compiler
+  # to point to Protobuf's compiler library.
+  native.bind(
+      name = "protobuf_clib",
+      actual = "@protobuf_archive//:protoc_lib",
+  )
+
+  # Needed by gRPC
+  native.bind(
+      name = "protobuf_headers",
+      actual = "@protobuf_archive//:protobuf_headers",
+  )
+
+  # Needed by Protobuf
+  native.bind(
+      name = "python_headers",
+      actual = str(Label("//util/python:python_headers")),
+  )
+
+  # Needed by Protobuf
+  native.bind(
+      name = "six",
+      actual = "@six_archive//:six",
+  )
+
+  # Needed by gRPC
+  native.bind(
+      name = "zlib",
+      actual = "@zlib_archive//:zlib",
   )
diff --git a/third_party/astor.BUILD b/third_party/astor.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..58fe9acf3326fbb81469a324d7ecd6c8569b2642
--- /dev/null
+++ b/third_party/astor.BUILD
@@ -0,0 +1,24 @@
+# Description:
+#   AST round-trip manipulation for Python.
+
+licenses(["notice"])  # New BSD
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "astor",
+    srcs = [
+        "astor/__init__.py",
+        "astor/code_gen.py",
+        "astor/codegen.py",
+        "astor/file_util.py",
+        "astor/node_util.py",
+        "astor/op_util.py",
+        "astor/rtrip.py",
+        "astor/source_repr.py",
+        "astor/string_repr.py",
+        "astor/tree_walk.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/aws.BUILD b/third_party/aws.BUILD
index bc9e37ffb3873118960f65a6d566ca5b34e0d613..2dc921933c310aa9ce2bf21798f1b5143386a12d 100644
--- a/third_party/aws.BUILD
+++ b/third_party/aws.BUILD
@@ -7,21 +7,21 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("@%ws%//third_party:common.bzl", "template_rule")
+load("@org_tensorflow//third_party:common.bzl", "template_rule")
 
 cc_library(
     name = "aws",
     srcs = select({
-        "@%ws%//tensorflow:linux_x86_64": glob([
+        "@org_tensorflow//tensorflow:linux_x86_64": glob([
             "aws-cpp-sdk-core/source/platform/linux-shared/*.cpp",
         ]),
-        "@%ws%//tensorflow:darwin": glob([
+        "@org_tensorflow//tensorflow:darwin": glob([
             "aws-cpp-sdk-core/source/platform/linux-shared/*.cpp",
         ]),
-        "@%ws%//tensorflow:linux_ppc64le": glob([
+        "@org_tensorflow//tensorflow:linux_ppc64le": glob([
             "aws-cpp-sdk-core/source/platform/linux-shared/*.cpp",
         ]),
-        "@%ws%//tensorflow:raspberry_pi_armeabi": glob([
+        "@org_tensorflow//tensorflow:raspberry_pi_armeabi": glob([
             "aws-cpp-sdk-core/source/platform/linux-shared/*.cpp",
         ]),
         "//conditions:default": [],
@@ -53,17 +53,17 @@ cc_library(
         "aws-cpp-sdk-core/include/aws/core/SDKConfig.h",
     ],
     defines = select({
-        "@%ws%//tensorflow:linux_x86_64": [
+        "@org_tensorflow//tensorflow:linux_x86_64": [
             "PLATFORM_LINUX",
             "ENABLE_CURL_CLIENT",
             "ENABLE_NO_ENCRYPTION",
         ],
-        "@%ws%//tensorflow:darwin": [
+        "@org_tensorflow//tensorflow:darwin": [
             "PLATFORM_APPLE",
             "ENABLE_CURL_CLIENT",
             "ENABLE_NO_ENCRYPTION",
         ],
-        "@%ws%//tensorflow:linux_ppc64le": [
+        "@org_tensorflow//tensorflow:linux_ppc64le": [
             "PLATFORM_LINUX",
             "ENABLE_CURL_CLIENT",
             "ENABLE_NO_ENCRYPTION",
@@ -75,7 +75,7 @@ cc_library(
         "aws-cpp-sdk-s3/include/",
     ],
     deps = [
-        "@curl//:curl",
+        "@curl",
     ],
 )
 
diff --git a/third_party/com_google_absl.BUILD b/third_party/com_google_absl.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..8fca145f751eacfa3e5a0af046dcc8c19e6a85d4
--- /dev/null
+++ b/third_party/com_google_absl.BUILD
@@ -0,0 +1,5 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache
+
+exports_files(["LICENSE"])
diff --git a/third_party/cub/BUILD b/third_party/cub/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/third_party/cub/fix_compilation_in_clang.patch b/third_party/cub/fix_compilation_in_clang.patch
new file mode 100644
index 0000000000000000000000000000000000000000..384e674f2012b2b3ea59c5c1bd205873baa8cf18
--- /dev/null
+++ b/third_party/cub/fix_compilation_in_clang.patch
@@ -0,0 +1,23 @@
+From 565b77f7c82048871a4d5e3e506dc663d53cd469 Mon Sep 17 00:00:00 2001
+From: Ilya Biryukov <ibiryukov@google.com>
+Date: Fri, 26 Jan 2018 18:46:06 +0100
+Subject: [PATCH] Added missing 'template' keyword.
+
+To unbreak compilation with clang.
+---
+ cub/device/dispatch/dispatch_radix_sort.cuh | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh
+index 7fbc621f..f622e212 100644
+--- a/cub/device/dispatch/dispatch_radix_sort.cuh
++++ b/cub/device/dispatch/dispatch_radix_sort.cuh
+@@ -104,7 +104,7 @@ __global__ void DeviceRadixSortUpsweepKernel(
+     CTA_SYNC();
+ 
+     // Write out digit counts (striped)
+-    upsweep.ExtractCounts<IS_DESCENDING>(d_spine, gridDim.x, blockIdx.x);
++    upsweep.template ExtractCounts<IS_DESCENDING>(d_spine, gridDim.x, blockIdx.x);
+ }
+ 
+ 
diff --git a/third_party/curl.BUILD b/third_party/curl.BUILD
index 805a30d2620078becaf1fe08a6856fce70decc50..4def6f94892329e0d8b594b824babd60ea259351 100644
--- a/third_party/curl.BUILD
+++ b/third_party/curl.BUILD
@@ -6,10 +6,11 @@ licenses(["notice"])  # MIT/X derivative license
 exports_files(["COPYING"])
 
 CURL_WIN_COPTS = [
-    "/I%prefix%/curl/lib",
+    "/Iexternal/curl/lib",
     "/DHAVE_CONFIG_H",
     "/DCURL_DISABLE_FTP",
     "/DCURL_DISABLE_NTLM",
+    "/DCURL_DISABLE_PROXY",
     "/DHAVE_LIBZ",
     "/DHAVE_ZLIB_H",
     # Defining _USING_V110_SDK71_ is hackery to defeat curl's incorrect
@@ -23,6 +24,8 @@ CURL_WIN_SRCS = [
     "lib/asyn-thread.c",
     "lib/inet_ntop.c",
     "lib/system_win32.c",
+    "lib/vtls/schannel.c",
+    "lib/idn_win32.c",
 ]
 
 cc_library(
@@ -224,14 +227,14 @@ cc_library(
         "lib/wildcard.h",
         "lib/x509asn1.h",
     ] + select({
-        "@%ws%//tensorflow:darwin": [
+        "@org_tensorflow//tensorflow:darwin": [
             "lib/vtls/darwinssl.c",
         ],
-        "@%ws%//tensorflow:ios": [
+        "@org_tensorflow//tensorflow:ios": [
             "lib/vtls/darwinssl.c",
         ],
-        "@%ws%//tensorflow:windows": CURL_WIN_SRCS,
-        "@%ws%//tensorflow:windows_msvc": CURL_WIN_SRCS,
+        "@org_tensorflow//tensorflow:windows": CURL_WIN_SRCS,
+        "@org_tensorflow//tensorflow:windows_msvc": CURL_WIN_SRCS,
         "//conditions:default": [
             "lib/vtls/openssl.c",
         ],
@@ -248,10 +251,10 @@ cc_library(
         "include/curl/typecheck-gcc.h",
     ],
     copts = select({
-        "@%ws%//tensorflow:windows": CURL_WIN_COPTS,
-        "@%ws%//tensorflow:windows_msvc": CURL_WIN_COPTS,
+        "@org_tensorflow//tensorflow:windows": CURL_WIN_COPTS,
+        "@org_tensorflow//tensorflow:windows_msvc": CURL_WIN_COPTS,
         "//conditions:default": [
-            "-I%prefix%/curl/lib",
+            "-Iexternal/curl/lib",
             "-D_GNU_SOURCE",
             "-DHAVE_CONFIG_H",
             "-DCURL_DISABLE_FTP",
@@ -261,14 +264,14 @@ cc_library(
             "-Wno-string-plus-int",
         ],
     }) + select({
-        "@%ws%//tensorflow:darwin": [
+        "@org_tensorflow//tensorflow:darwin": [
             "-fno-constant-cfstrings",
         ],
-        "@%ws%//tensorflow:windows": [
+        "@org_tensorflow//tensorflow:windows": [
             # See curl.h for discussion of write size and Windows
             "/DCURL_MAX_WRITE_SIZE=16384",
         ],
-        "@%ws%//tensorflow:windows_msvc": [
+        "@org_tensorflow//tensorflow:windows_msvc": [
             # See curl.h for discussion of write size and Windows
             "/DCURL_MAX_WRITE_SIZE=16384",
         ],
@@ -276,23 +279,30 @@ cc_library(
             "-DCURL_MAX_WRITE_SIZE=65536",
         ],
     }),
+    defines = ["CURL_STATICLIB"],
     includes = ["include"],
     linkopts = select({
-        "@%ws%//tensorflow:android": [
+        "@org_tensorflow//tensorflow:android": [
             "-pie",
         ],
-        "@%ws%//tensorflow:darwin": [
+        "@org_tensorflow//tensorflow:darwin": [
             "-Wl,-framework",
             "-Wl,CoreFoundation",
             "-Wl,-framework",
             "-Wl,Security",
         ],
-        "@%ws%//tensorflow:ios": [],
-        "@%ws%//tensorflow:windows": [
-            "-Wl,ws2_32.lib",
+        "@org_tensorflow//tensorflow:ios": [],
+        "@org_tensorflow//tensorflow:windows": [
+            "-DEFAULTLIB:ws2_32.lib",
+            "-DEFAULTLIB:advapi32.lib",
+            "-DEFAULTLIB:crypt32.lib",
+            "-DEFAULTLIB:Normaliz.lib",
         ],
-        "@%ws%//tensorflow:windows_msvc": [
-            "-Wl,ws2_32.lib",
+        "@org_tensorflow//tensorflow:windows_msvc": [
+            "-DEFAULTLIB:ws2_32.lib",
+            "-DEFAULTLIB:advapi32.lib",
+            "-DEFAULTLIB:crypt32.lib",
+            "-DEFAULTLIB:Normaliz.lib",
         ],
         "//conditions:default": [
             "-lrt",
@@ -302,9 +312,9 @@ cc_library(
     deps = [
         "@zlib_archive//:zlib",
     ] + select({
-        "@%ws%//tensorflow:ios": [],
-        "@%ws%//tensorflow:windows": [],
-        "@%ws%//tensorflow:windows_msvc": [],
+        "@org_tensorflow//tensorflow:ios": [],
+        "@org_tensorflow//tensorflow:windows": [],
+        "@org_tensorflow//tensorflow:windows_msvc": [],
         "//conditions:default": [
             "@boringssl//:ssl",
         ],
@@ -312,7 +322,7 @@ cc_library(
 )
 
 CURL_BIN_WIN_COPTS = [
-    "/I%prefix%/curl/lib",
+    "/Iexternal/curl/lib",
     "/DHAVE_CONFIG_H",
     "/DCURL_DISABLE_LIBCURL_OPTION",
 ]
@@ -406,10 +416,10 @@ cc_binary(
         "src/tool_xattr.h",
     ],
     copts = select({
-        "@%ws%//tensorflow:windows": CURL_BIN_WIN_COPTS,
-        "@%ws%//tensorflow:windows_msvc": CURL_BIN_WIN_COPTS,
+        "@org_tensorflow//tensorflow:windows": CURL_BIN_WIN_COPTS,
+        "@org_tensorflow//tensorflow:windows_msvc": CURL_BIN_WIN_COPTS,
         "//conditions:default": [
-            "-I%prefix%/curl/lib",
+            "-Iexternal/curl/lib",
             "-D_GNU_SOURCE",
             "-DHAVE_CONFIG_H",
             "-DCURL_DISABLE_LIBCURL_OPTION",
@@ -438,12 +448,22 @@ genrule(
         "#  include \"lib/config-win32.h\"",
         "#  define BUILDING_LIBCURL 1",
         "#  define CURL_DISABLE_CRYPTO_AUTH 1",
+        "#  define CURL_DISABLE_DICT 1",
+        "#  define CURL_DISABLE_FILE 1",
+        "#  define CURL_DISABLE_GOPHER 1",
         "#  define CURL_DISABLE_IMAP 1",
         "#  define CURL_DISABLE_LDAP 1",
         "#  define CURL_DISABLE_LDAPS 1",
         "#  define CURL_DISABLE_POP3 1",
         "#  define CURL_PULL_WS2TCPIP_H 1",
-        "#  define HTTP_ONLY 1",
+        "#  define CURL_DISABLE_SMTP 1",
+        "#  define CURL_DISABLE_TELNET 1",
+        "#  define CURL_DISABLE_TFTP 1",
+        "#  define CURL_PULL_WS2TCPIP_H 1",
+        "#  define USE_WINDOWS_SSPI 1",
+        "#  define USE_WIN32_IDN 1",
+        "#  define USE_SCHANNEL 1",
+        "#  define WANT_IDN_PROTOTYPES 1",
         "#elif defined(__APPLE__)",
         "#  define HAVE_FSETXATTR_6 1",
         "#  define HAVE_SETMODE 1",
diff --git a/third_party/eigen3/BUILD b/third_party/eigen3/BUILD
index f5f3418527f2ae0a948ac15645ebd905b59bcabf..f661093bc9f68b845f3000b0a931c66773fb3339 100644
--- a/third_party/eigen3/BUILD
+++ b/third_party/eigen3/BUILD
@@ -36,7 +36,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "@eigen_archive//:eigen",
-        "@local_config_sycl//sycl:sycl",
+        "@local_config_sycl//sycl",
     ],
 )
 
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
index c210b1712c05b3c77870dd0223eb0aa1cfafd63a..cb1636256d7d5e0a9a11824a6c25b18fe79f4f56 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
@@ -1,5 +1,5 @@
-#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
-#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
+#ifndef EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
+#define EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
 
 #ifdef _MSC_VER
 
@@ -502,4 +502,4 @@ struct functor_traits<scalar_product_op<QInt32, double>> {
 }  // end namespace internal
 }  // end namespace Eigen
 
-#endif  // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
+#endif  // EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
index 7a222fddc1c24cfc523f022af70515ddf3f8c263..8f9906dbf9c0c9dd8e61964c65b36e8549a3241a 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
@@ -1,5 +1,5 @@
-#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
-#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
+#ifndef EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
+#define EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
 
 #include "PacketMathAVX2.h"
 
@@ -542,4 +542,4 @@ EIGEN_STRONG_INLINE QInt8 predux_max<Packet64q8i>(const Packet64q8i& a) {
 }  // end namespace internal
 }  // end namespace Eigen
 
-#endif  // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
+#endif  // EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h
index 045384d7fc13281663713d266df8dd44973ce9d1..7b4ecc752ff2e6b4544a0071fc0a971c6e9879a4 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h
@@ -1,5 +1,5 @@
-#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
-#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
+#ifndef EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
+#define EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
 
 namespace Eigen {
 namespace internal {
@@ -63,4 +63,4 @@ pcast<Packet8q32i, Packet32q8u>(const Packet8q32i& a, const Packet8q32i& b,
 }  // end namespace internal
 }  // end namespace Eigen
 
-#endif  // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
+#endif  // EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h
index cd7120ec00a7cdcfa8085da0a382bfb643e6f841..26735743d487cbc4b50a744ede463f4eac6070a8 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h
@@ -1,5 +1,5 @@
-#ifndef THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
-#define THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
+#ifndef EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
+#define EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
 
 namespace Eigen {
 namespace internal {
@@ -177,4 +177,4 @@ pcast<Packet16q32i, Packet32q16u>(const Packet16q32i& a,
 }  // end namespace internal
 }  // end namespace Eigen
 
-#endif  // THIRD_PARTY_EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
+#endif  // EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h
index 89190eb1affbc710bb031c3b1eab09e9a07222f5..2864f8329990325c73aadb32018ae975809cb09d 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h
@@ -151,7 +151,7 @@ Extract3DPatches(
 
   // TODO(mjanusz): Consider getting rid of pad(), and stride() and extend
   // extract_patches to take additional parameters for padding/striding,
-  // similarly to etract_image_patches.
+  // similarly to extract_image_patches.
   return input.pad(paddings, padding_value).extract_patches(patch_dims).reshape(pre_stride_dims).stride(strides);
 }
 
diff --git a/third_party/examples/eager/spinn/BUILD b/third_party/examples/eager/spinn/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..0e39d4696fb5b4efafc94b4b96965d232ae4e473
--- /dev/null
+++ b/third_party/examples/eager/spinn/BUILD
@@ -0,0 +1,14 @@
+licenses(["notice"])  # 3-clause BSD.
+
+py_binary(
+    name = "spinn",
+    srcs = ["spinn.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/contrib/eager/python:tfe",
+        "//tensorflow/contrib/eager/python/examples/spinn:data",
+        "@six_archive//:six",
+    ],
+)
diff --git a/third_party/examples/eager/spinn/LICENSE b/third_party/examples/eager/spinn/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..09d493bf1fc257505c1336f3f87425568ab9da3c
--- /dev/null
+++ b/third_party/examples/eager/spinn/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2017, 
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/examples/eager/spinn/README.md b/third_party/examples/eager/spinn/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..335c0fa3b549f6bc9221c81c5779cd499bd780d7
--- /dev/null
+++ b/third_party/examples/eager/spinn/README.md
@@ -0,0 +1,109 @@
+# SPINN with TensorFlow eager execution
+
+SPINN, or Stack-Augmented Parser-Interpreter Neural Network, is a recursive
+neural network that utilizes syntactic parse information for natural language
+understanding.
+
+SPINN was originally described by:
+Bowman, S.R., Gauthier, J., Rastogi A., Gupta, R., Manning, C.D., & Potts, C.
+  (2016). A Fast Unified Model for Parsing and Sentence Understanding.
+  https://arxiv.org/abs/1603.06021
+
+Our implementation is based on @jekbradbury's PyTorch implementation at:
+https://github.com/jekbradbury/examples/blob/spinn/snli/spinn.py,
+
+which was released under the BSD 3-Clause License at:
+https://github.com/jekbradbury/examples/blob/spinn/LICENSE
+
+Other eager execution examples can be found under [tensorflow/contrib/eager/python/examples](../../../../tensorflow/contrib/eager/python/examples).
+
+##  Content
+
+- [`data.py`](../../../../tensorflow/contrib/eager/python/examples/spinn/data.py): Pipeline for loading and preprocessing the
+   [SNLI](https://nlp.stanford.edu/projects/snli/) data and
+   [GloVe](https://nlp.stanford.edu/projects/glove/) word embedding, written
+   using the [`tf.data`](https://www.tensorflow.org/programmers_guide/datasets)
+   API.
+- [`spinn.py`](./spinn.py): Model definition and training routines.
+  This example illustrates how one might perform the following actions with
+  eager execution enabled:
+  * defining a model consisting of a dynamic computation graph,
+  * assigning operations to the CPU or GPU dependending on device availability,
+  * training the model using the data from the `tf.data`-based pipeline,
+  * obtaining metrics such as mean accuracy during training,
+  * saving and loading checkpoints,
+  * writing summaries for monitoring and visualization in TensorBoard.
+
+## To run
+
+- Make sure you have installed TensorFlow release 1.5 or higher. Alternatively,
+  you can use the latest `tf-nightly` or `tf-nightly-gpu` pip
+  package to access the eager execution feature.
+
+- Download and extract the raw SNLI data and GloVe embedding vectors.
+  For example:
+
+  ```bash
+  curl -fSsL https://nlp.stanford.edu/projects/snli/snli_1.0.zip --create-dirs -o /tmp/spinn-data/snli/snli_1.0.zip
+  unzip -d /tmp/spinn-data/snli /tmp/spinn-data/snli/snli_1.0.zip
+  curl -fSsL http://nlp.stanford.edu/data/glove.42B.300d.zip --create-dirs -o /tmp/spinn-data/glove/glove.42B.300d.zip
+  unzip -d /tmp/spinn-data/glove /tmp/spinn-data/glove/glove.42B.300d.zip
+  ```
+
+- Train model. E.g.,
+
+  ```bash
+  python spinn.py --data_root /tmp/spinn-data --logdir /tmp/spinn-logs
+  ```
+
+  During training, model checkpoints and TensorBoard summaries will be written
+  periodically to the directory specified with the `--logdir` flag.
+  The training script will reload a saved checkpoint from the directory if it
+  can find one there.
+
+  To view the summaries with TensorBoard:
+
+  ```bash
+  tensorboard --logdir /tmp/spinn-logs
+  ```
+
+- After training, you may use the model to perform inference on input data in
+  the SNLI data format. The premise and hypotheses sentences are specified with
+  the command-line flags `--inference_premise` and `--inference_hypothesis`,
+  respecitvely. Each sentence should include the words, as well as parentheses
+  representing a binary parsing of the sentence. The words and parentheses
+  should all be separated by spaces. For instance,
+
+  ```bash
+  pythons spinn.py --data_root /tmp/spinn-data --logdir /tmp/spinn-logs \
+      --inference_premise '( ( The dog ) ( ( is running ) . ) )' \
+      --inference_hypothesis '( ( The dog ) ( moves . ) )'
+  ```
+
+  which will generate an output like the following, due to the semantic
+  consistency of the two sentences.
+
+  ```none
+  Inference logits:
+    entailment:     1.101249 (winner)
+    contradiction:  -2.374171
+    neutral:        -0.296733
+  ```
+
+  By contrast, the following sentence pair:
+
+  ```bash
+  pythons spinn.py --data_root /tmp/spinn-data --logdir /tmp/spinn-logs \
+      --inference_premise '( ( The dog ) ( ( is running ) . ) )' \
+      --inference_hypothesis '( ( The dog ) ( rests . ) )'
+  ```
+
+  will give you an output like the following, due to the semantic
+  contradiction of the two sentences.
+
+  ```none
+  Inference logits:
+    entailment:     -1.070098
+    contradiction:  2.798695 (winner)
+    neutral:        -1.402287
+  ```
diff --git a/third_party/examples/eager/spinn/spinn.py b/third_party/examples/eager/spinn/spinn.py
new file mode 100644
index 0000000000000000000000000000000000000000..38ba48d5013c7515e7fc78de6125f0bd93fdc90a
--- /dev/null
+++ b/third_party/examples/eager/spinn/spinn.py
@@ -0,0 +1,807 @@
+r"""Implementation of SPINN in TensorFlow eager execution.
+
+SPINN: Stack-Augmented Parser-Interpreter Neural Network.
+
+Ths file contains model definition and code for training the model.
+
+The model definition is based on PyTorch implementation at:
+  https://github.com/jekbradbury/examples/tree/spinn/snli
+
+which was released under a BSD 3-Clause License at:
+https://github.com/jekbradbury/examples/blob/spinn/LICENSE:
+
+Copyright (c) 2017,
+All rights reserved.
+
+See ./LICENSE for more details.
+
+Instructions for use:
+* See `README.md` for details on how to prepare the SNLI and GloVe data.
+* Suppose you have prepared the data at "/tmp/spinn-data", use the folloing
+  command to train the model:
+
+  ```bash
+  python spinn.py --data_root /tmp/spinn-data --logdir /tmp/spinn-logs
+  ```
+
+  Checkpoints and TensorBoard summaries will be written to "/tmp/spinn-logs".
+
+References:
+* Bowman, S.R., Gauthier, J., Rastogi A., Gupta, R., Manning, C.D., & Potts, C.
+  (2016). A Fast Unified Model for Parsing and Sentence Understanding.
+  https://arxiv.org/abs/1603.06021
+* Bradbury, J. (2017). Recursive Neural Networks with PyTorch.
+  https://devblogs.nvidia.com/parallelforall/recursive-neural-networks-pytorch/
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import itertools
+import os
+import sys
+import time
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+
+import tensorflow.contrib.eager as tfe
+from tensorflow.contrib.eager.python.examples.spinn import data
+
+
+def _bundle(lstm_iter):
+  """Concatenate a list of Tensors along 1st axis and split result into two.
+
+  Args:
+    lstm_iter: A `list` of `N` dense `Tensor`s, each of which has the shape
+      (R, 2 * M).
+
+  Returns:
+    A `list` of two dense `Tensor`s, each of which has the shape (N * R, M).
+  """
+  return tf.split(tf.concat(lstm_iter, 0), 2, axis=1)
+
+
+def _unbundle(state):
+  """Concatenate a list of Tensors along 2nd axis and split result.
+
+  This is the inverse of `_bundle`.
+
+  Args:
+    state: A `list` of two dense `Tensor`s, each of which has the shape (R, M).
+
+  Returns:
+    A `list` of `R` dense `Tensors`, each of which has the shape (1, 2 * M).
+  """
+  return tf.split(tf.concat(state, 1), state[0].shape[0], axis=0)
+
+
+class Reducer(tfe.Network):
+  """A module that applies reduce operation on left and right vectors."""
+
+  def __init__(self, size, tracker_size=None):
+    super(Reducer, self).__init__()
+    self.left = self.track_layer(tf.layers.Dense(5 * size, activation=None))
+    self.right = self.track_layer(
+        tf.layers.Dense(5 * size, activation=None, use_bias=False))
+    if tracker_size is not None:
+      self.track = self.track_layer(
+          tf.layers.Dense(5 * size, activation=None, use_bias=False))
+    else:
+      self.track = None
+
+  def call(self, left_in, right_in, tracking=None):
+    """Invoke forward pass of the Reduce module.
+
+    This method feeds a linear combination of `left_in`, `right_in` and
+    `tracking` into a Tree LSTM and returns the output of the Tree LSTM.
+
+    Args:
+      left_in: A list of length L. Each item is a dense `Tensor` with
+        the shape (1, n_dims). n_dims is the size of the embedding vector.
+      right_in: A list of the same length as `left_in`. Each item should have
+        the same shape as the items of `left_in`.
+      tracking: Optional list of the same length as `left_in`. Each item is a
+        dense `Tensor` with shape (1, tracker_size * 2). tracker_size is the
+        size of the Tracker's state vector.
+
+    Returns:
+      Output: A list of length batch_size. Each item has the shape (1, n_dims).
+    """
+    left, right = _bundle(left_in), _bundle(right_in)
+    lstm_in = self.left(left[0]) + self.right(right[0])
+    if self.track and tracking:
+      lstm_in += self.track(_bundle(tracking)[0])
+    return _unbundle(self._tree_lstm(left[1], right[1], lstm_in))
+
+  def _tree_lstm(self, c1, c2, lstm_in):
+    a, i, f1, f2, o = tf.split(lstm_in, 5, axis=1)
+    c = tf.tanh(a) * tf.sigmoid(i) + tf.sigmoid(f1) * c1 + tf.sigmoid(f2) * c2
+    h = tf.sigmoid(o) * tf.tanh(c)
+    return h, c
+
+
+class Tracker(tfe.Network):
+  """A module that tracks the history of the sentence with an LSTM."""
+
+  def __init__(self, tracker_size, predict):
+    """Constructor of Tracker.
+
+    Args:
+      tracker_size: Number of dimensions of the underlying `LSTMCell`.
+      predict: (`bool`) Whether prediction mode is enabled.
+    """
+    super(Tracker, self).__init__()
+    self._rnn = self.track_layer(tf.nn.rnn_cell.LSTMCell(tracker_size))
+    self._state_size = tracker_size
+    if predict:
+      self._transition = self.track_layer(tf.layers.Dense(4))
+    else:
+      self._transition = None
+
+  def reset_state(self):
+    self.state = None
+
+  def call(self, bufs, stacks):
+    """Invoke the forward pass of the Tracker module.
+
+    This method feeds the concatenation of the top two elements of the stacks
+    into an LSTM cell and returns the resultant state of the LSTM cell.
+
+    Args:
+      bufs: A `list` of length batch_size. Each item is a `list` of
+        max_sequence_len (maximum sequence length of the batch). Each item
+        of the nested list is a dense `Tensor` of shape (1, d_proj), where
+        d_proj is the size of the word embedding vector or the size of the
+        vector space that the word embedding vector is projected to.
+      stacks: A `list` of size batch_size. Each item is a `list` of
+        variable length corresponding to the current height of the stack.
+        Each item of the nested list is a dense `Tensor` of shape (1, d_proj).
+
+    Returns:
+      1. A list of length batch_size. Each item is a dense `Tensor` of shape
+        (1, d_tracker * 2).
+      2.  If under predict mode, result of applying a Dense layer on the
+        first state vector of the RNN. Else, `None`.
+    """
+    buf = _bundle([buf[-1] for buf in bufs])[0]
+    stack1 = _bundle([stack[-1] for stack in stacks])[0]
+    stack2 = _bundle([stack[-2] for stack in stacks])[0]
+    x = tf.concat([buf, stack1, stack2], 1)
+    if self.state is None:
+      batch_size = int(x.shape[0])
+      zeros = tf.zeros((batch_size, self._state_size), dtype=tf.float32)
+      self.state = [zeros, zeros]
+    _, self.state = self._rnn(x, self.state)
+    unbundled = _unbundle(self.state)
+    if self._transition:
+      return unbundled, self._transition(self.state[0])
+    else:
+      return unbundled, None
+
+
+class SPINN(tfe.Network):
+  """Stack-augmented Parser-Interpreter Neural Network.
+
+  See https://arxiv.org/abs/1603.06021 for more details.
+  """
+
+  def __init__(self, config):
+    """Constructor of SPINN.
+
+    Args:
+      config: A `namedtupled` with the following attributes.
+        d_proj - (`int`) number of dimensions of the vector space to project the
+          word embeddings to.
+        d_tracker - (`int`) number of dimensions of the Tracker's state vector.
+        d_hidden - (`int`) number of the dimensions of the hidden state, for the
+          Reducer module.
+        n_mlp_layers - (`int`) number of multi-layer perceptron layers to use to
+          convert the output of the `Feature` module to logits.
+        predict - (`bool`) Whether the Tracker will enabled predictions.
+    """
+    super(SPINN, self).__init__()
+    self.config = config
+    self.reducer = self.track_layer(Reducer(config.d_hidden, config.d_tracker))
+    if config.d_tracker is not None:
+      self.tracker = self.track_layer(Tracker(config.d_tracker, config.predict))
+    else:
+      self.tracker = None
+
+  def call(self, buffers, transitions, training=False):
+    """Invoke the forward pass of the SPINN model.
+
+    Args:
+      buffers: Dense `Tensor` of shape
+        (max_sequence_len, batch_size, config.d_proj).
+      transitions: Dense `Tensor` with integer values that represent the parse
+        trees of the sentences. A value of 2 indicates "reduce"; a value of 3
+        indicates "shift". Shape: (max_sequence_len * 2 - 3, batch_size).
+      training: Whether the invocation is under training mode.
+
+    Returns:
+      Output `Tensor` of shape (batch_size, config.d_embed).
+    """
+    max_sequence_len, batch_size, d_proj = (int(x) for x in buffers.shape)
+
+    # Split the buffers into left and right word items and put the initial
+    # items in a stack.
+    splitted = tf.split(
+        tf.reshape(tf.transpose(buffers, [1, 0, 2]), [-1, d_proj]),
+        max_sequence_len * batch_size, axis=0)
+    buffers = [splitted[k:k + max_sequence_len]
+               for k in xrange(0, len(splitted), max_sequence_len)]
+    stacks = [[buf[0], buf[0]] for buf in buffers]
+
+    if self.tracker:
+      # Reset tracker state for new batch.
+      self.tracker.reset_state()
+
+    num_transitions = transitions.shape[0]
+
+    # Iterate through transitions and perform the appropriate stack-pop, reduce
+    # and stack-push operations.
+    transitions = transitions.numpy()
+    for i in xrange(num_transitions):
+      trans = transitions[i]
+      if self.tracker:
+        # Invoke tracker to obtain the current tracker states for the sentences.
+        tracker_states, trans_hypothesis = self.tracker(buffers, stacks)
+        if trans_hypothesis:
+          trans = tf.argmax(trans_hypothesis, axis=-1)
+      else:
+        tracker_states = itertools.repeat(None)
+      lefts, rights, trackings = [], [], []
+      for transition, buf, stack, tracking in zip(
+          trans, buffers, stacks, tracker_states):
+        if int(transition) == 3:  # Shift.
+          stack.append(buf.pop())
+        elif int(transition) == 2:  # Reduce.
+          rights.append(stack.pop())
+          lefts.append(stack.pop())
+          trackings.append(tracking)
+
+      if rights:
+        reducer_output = self.reducer(lefts, rights, trackings)
+        reduced = iter(reducer_output)
+
+        for transition, stack in zip(trans, stacks):
+          if int(transition) == 2:  # Reduce.
+            stack.append(next(reduced))
+    return _bundle([stack.pop() for stack in stacks])[0]
+
+
+class SNLIClassifier(tfe.Network):
+  """SNLI Classifier Model.
+
+  A model aimed at solving the SNLI (Standford Natural Language Inference)
+  task, using the SPINN model from above. For details of the task, see:
+    https://nlp.stanford.edu/projects/snli/
+  """
+
+  def __init__(self, config, embed):
+    """Constructor of SNLICLassifier.
+
+    Args:
+      config: A namedtuple containing required configurations for the model. It
+        needs to have the following attributes.
+        projection - (`bool`) whether the word vectors are to be projected onto
+          another vector space (of `d_proj` dimensions).
+        d_proj - (`int`) number of dimensions of the vector space to project the
+          word embeddings to.
+        embed_dropout - (`float`) dropout rate for the word embedding vectors.
+        n_mlp_layers - (`int`) number of multi-layer perceptron (MLP) layers to
+          use to convert the output of the `Feature` module to logits.
+        mlp_dropout - (`float`) dropout rate of the MLP layers.
+        d_out - (`int`) number of dimensions of the final output of the MLP
+          layers.
+        lr - (`float`) learning rate.
+      embed: A embedding matrix of shape (vocab_size, d_embed).
+    """
+    super(SNLIClassifier, self).__init__()
+    self.config = config
+    self.embed = tf.constant(embed)
+
+    self.projection = self.track_layer(tf.layers.Dense(config.d_proj))
+    self.embed_bn = self.track_layer(tf.layers.BatchNormalization())
+    self.embed_dropout = self.track_layer(
+        tf.layers.Dropout(rate=config.embed_dropout))
+    self.encoder = self.track_layer(SPINN(config))
+
+    self.feature_bn = self.track_layer(tf.layers.BatchNormalization())
+    self.feature_dropout = self.track_layer(
+        tf.layers.Dropout(rate=config.mlp_dropout))
+
+    self.mlp_dense = []
+    self.mlp_bn = []
+    self.mlp_dropout = []
+    for _ in xrange(config.n_mlp_layers):
+      self.mlp_dense.append(self.track_layer(tf.layers.Dense(config.d_mlp)))
+      self.mlp_bn.append(
+          self.track_layer(tf.layers.BatchNormalization()))
+      self.mlp_dropout.append(
+          self.track_layer(tf.layers.Dropout(rate=config.mlp_dropout)))
+    self.mlp_output = self.track_layer(tf.layers.Dense(
+        config.d_out,
+        kernel_initializer=tf.random_uniform_initializer(minval=-5e-3,
+                                                         maxval=5e-3)))
+
+  def call(self,
+           premise,
+           premise_transition,
+           hypothesis,
+           hypothesis_transition,
+           training=False):
+    """Invoke the forward pass the SNLIClassifier model.
+
+    Args:
+      premise: The word indices of the premise sentences, with shape
+        (max_prem_seq_len, batch_size).
+      premise_transition: The transitions for the premise sentences, with shape
+        (max_prem_seq_len * 2 - 3, batch_size).
+      hypothesis: The word indices of the hypothesis sentences, with shape
+        (max_hypo_seq_len, batch_size).
+      hypothesis_transition: The transitions for the hypothesis sentences, with
+        shape (max_hypo_seq_len * 2 - 3, batch_size).
+      training: Whether the invocation is under training mode.
+
+    Returns:
+      The logits, as a dense `Tensor` of shape (batch_size, d_out), where d_out
+      is the size of the output vector.
+    """
+    # Perform embedding lookup on the premise and hypothesis inputs, which have
+    # the word-index format.
+    premise_embed = tf.nn.embedding_lookup(self.embed, premise)
+    hypothesis_embed = tf.nn.embedding_lookup(self.embed, hypothesis)
+
+    if self.config.projection:
+      # Project the embedding vectors to another vector space.
+      premise_embed = self.projection(premise_embed)
+      hypothesis_embed = self.projection(hypothesis_embed)
+
+    # Perform batch normalization and dropout on the possibly projected word
+    # vectors.
+    premise_embed = self.embed_bn(premise_embed, training=training)
+    hypothesis_embed = self.embed_bn(hypothesis_embed, training=training)
+    premise_embed = self.embed_dropout(premise_embed, training=training)
+    hypothesis_embed = self.embed_dropout(hypothesis_embed, training=training)
+
+    # Run the batch-normalized and dropout-processed word vectors through the
+    # SPINN encoder.
+    premise = self.encoder(premise_embed, premise_transition,
+                           training=training)
+    hypothesis = self.encoder(hypothesis_embed, hypothesis_transition,
+                              training=training)
+
+    # Combine encoder outputs for premises and hypotheses into logits.
+    # Then apply batch normalization and dropuout on the logits.
+    logits = tf.concat(
+        [premise, hypothesis, premise - hypothesis, premise * hypothesis], 1)
+    logits = self.feature_dropout(
+        self.feature_bn(logits, training=training), training=training)
+
+    # Apply the multi-layer perceptron on the logits.
+    for dense, bn, dropout in zip(
+        self.mlp_dense, self.mlp_bn, self.mlp_dropout):
+      logits = tf.nn.elu(dense(logits))
+      logits = dropout(bn(logits, training=training), training=training)
+    logits = self.mlp_output(logits)
+    return logits
+
+
+class SNLIClassifierTrainer(object):
+  """A class that coordinates the training of an SNLIClassifier."""
+
+  def __init__(self, snli_classifier, lr):
+    """Constructor of SNLIClassifierTrainer.
+
+    Args:
+      snli_classifier: An instance of `SNLIClassifier`.
+      lr: Learning rate.
+    """
+    self._model = snli_classifier
+    # Create a custom learning rate Variable for the RMSProp optimizer, because
+    # the learning rate needs to be manually decayed later (see
+    # decay_learning_rate()).
+    self._learning_rate = tfe.Variable(lr, name="learning_rate")
+    self._optimizer = tf.train.RMSPropOptimizer(self._learning_rate,
+                                                epsilon=1e-6)
+
+  def loss(self, labels, logits):
+    """Calculate the loss given a batch of data.
+
+    Args:
+      labels: The truth labels, with shape (batch_size,).
+      logits: The logits output from the forward pass of the SNLIClassifier
+        model, with shape (batch_size, d_out), where d_out is the output
+        dimension size of the SNLIClassifier.
+
+    Returns:
+      The loss value, as a scalar `Tensor`.
+    """
+    return tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
+        labels=labels, logits=logits))
+
+  def train_batch(self,
+                  labels,
+                  premise,
+                  premise_transition,
+                  hypothesis,
+                  hypothesis_transition):
+    """Train model on batch of data.
+
+    Args:
+      labels: The truth labels, with shape (batch_size,).
+      premise: The word indices of the premise sentences, with shape
+        (max_prem_seq_len, batch_size).
+      premise_transition: The transitions for the premise sentences, with shape
+        (max_prem_seq_len * 2 - 3, batch_size).
+      hypothesis: The word indices of the hypothesis sentences, with shape
+        (max_hypo_seq_len, batch_size).
+      hypothesis_transition: The transitions for the hypothesis sentences, with
+        shape (max_hypo_seq_len * 2 - 3, batch_size).
+
+    Returns:
+      1. loss value as a scalar `Tensor`.
+      2. logits as a dense `Tensor` of shape (batch_size, d_out), where d_out is
+        the output dimension size of the SNLIClassifier.
+    """
+    with tfe.GradientTape() as tape:
+      tape.watch(self._model.variables)
+      logits = self._model(premise,
+                           premise_transition,
+                           hypothesis,
+                           hypothesis_transition,
+                           training=True)
+      loss = self.loss(labels, logits)
+    gradients = tape.gradient(loss, self._model.variables)
+    self._optimizer.apply_gradients(zip(gradients, self._model.variables),
+                                    global_step=tf.train.get_global_step())
+    return loss, logits
+
+  def decay_learning_rate(self, decay_by):
+    """Decay learning rate of the optimizer by factor decay_by."""
+    self._learning_rate.assign(self._learning_rate * decay_by)
+    print("Decayed learning rate of optimizer to: %s" %
+          self._learning_rate.numpy())
+
+  @property
+  def learning_rate(self):
+    return self._learning_rate
+
+  @property
+  def model(self):
+    return self._model
+
+  @property
+  def variables(self):
+    return (self._model.variables + [self.learning_rate] +
+            self._optimizer.variables())
+
+
+def _batch_n_correct(logits, label):
+  """Calculate number of correct predictions in a batch.
+
+  Args:
+    logits: A logits Tensor of shape `(batch_size, num_categories)` and dtype
+      `float32`.
+    label: A labels Tensor of shape `(batch_size,)` and dtype `int64`
+
+  Returns:
+    Number of correct predictions.
+  """
+  return tf.reduce_sum(
+      tf.cast((tf.equal(
+          tf.argmax(logits, axis=1), label)), tf.float32)).numpy()
+
+
+def _evaluate_on_dataset(snli_data, batch_size, trainer, use_gpu):
+  """Run evaluation on a dataset.
+
+  Args:
+    snli_data: The `data.SnliData` to use in this evaluation.
+    batch_size: The batch size to use during this evaluation.
+    trainer: An instance of `SNLIClassifierTrainer to use for this
+      evaluation.
+    use_gpu: Whether GPU is being used.
+
+  Returns:
+    1. Average loss across all examples of the dataset.
+    2. Average accuracy rate across all examples of the dataset.
+  """
+  mean_loss = tfe.metrics.Mean()
+  accuracy = tfe.metrics.Accuracy()
+  for label, prem, prem_trans, hypo, hypo_trans in _get_dataset_iterator(
+      snli_data, batch_size):
+    if use_gpu:
+      label, prem, hypo = label.gpu(), prem.gpu(), hypo.gpu()
+    logits = trainer.model(prem, prem_trans, hypo, hypo_trans, training=False)
+    loss_val = trainer.loss(label, logits)
+    batch_size = tf.shape(label)[0]
+    mean_loss(loss_val, weights=batch_size.gpu() if use_gpu else batch_size)
+    accuracy(tf.argmax(logits, axis=1), label)
+  return mean_loss.result().numpy(), accuracy.result().numpy()
+
+
+def _get_dataset_iterator(snli_data, batch_size):
+  """Get a data iterator for a split of SNLI data.
+
+  Args:
+    snli_data: A `data.SnliData` object.
+    batch_size: The desired batch size.
+
+  Returns:
+    A dataset iterator.
+  """
+  with tf.device("/device:CPU:0"):
+    # Some tf.data ops, such as ShuffleDataset, are available only on CPU.
+    dataset = tf.data.Dataset.from_generator(
+        snli_data.get_generator(batch_size),
+        (tf.int64, tf.int64, tf.int64, tf.int64, tf.int64))
+    dataset = dataset.shuffle(snli_data.num_batches(batch_size))
+    return tfe.Iterator(dataset)
+
+
+def train_or_infer_spinn(embed,
+                         word2index,
+                         train_data,
+                         dev_data,
+                         test_data,
+                         config):
+  """Perform Training or Inference on a SPINN model.
+
+  Args:
+    embed: The embedding matrix as a float32 numpy array with shape
+      [vocabulary_size, word_vector_len]. word_vector_len is the length of a
+      word embedding vector.
+    word2index: A `dict` mapping word to word index.
+    train_data: An instance of `data.SnliData`, for the train split.
+    dev_data: Same as above, for the dev split.
+    test_data: Same as above, for the test split.
+    config: A configuration object. See the argument to this Python binary for
+      details.
+
+  Returns:
+    If `config.inference_premise ` and `config.inference_hypothesis` are not
+      `None`, i.e., inference mode: the logits for the possible labels of the
+      SNLI data set, as numpy array of three floats.
+    else:
+      The trainer object.
+  Raises:
+    ValueError: if only one of config.inference_premise and
+      config.inference_hypothesis is specified.
+  """
+  # TODO(cais): Refactor this function into separate one for training and
+  #   inference.
+  use_gpu = tfe.num_gpus() > 0 and not config.force_cpu
+  device = "gpu:0" if use_gpu else "cpu:0"
+  print("Using device: %s" % device)
+
+  if ((config.inference_premise and not config.inference_hypothesis) or
+      (not config.inference_premise and config.inference_hypothesis)):
+    raise ValueError(
+        "--inference_premise and --inference_hypothesis must be both "
+        "specified or both unspecified, but only one is specified.")
+
+  if config.inference_premise:
+    # Inference mode.
+    inference_sentence_pair = [
+        data.encode_sentence(config.inference_premise, word2index),
+        data.encode_sentence(config.inference_hypothesis, word2index)]
+  else:
+    inference_sentence_pair = None
+
+  log_header = (
+      "  Time Epoch Iteration Progress    (%Epoch)   Loss   Dev/Loss"
+      "     Accuracy  Dev/Accuracy")
+  log_template = (
+      "{:>6.0f} {:>5.0f} {:>9.0f} {:>5.0f}/{:<5.0f} {:>7.0f}% {:>8.6f} {} "
+      "{:12.4f} {}")
+  dev_log_template = (
+      "{:>6.0f} {:>5.0f} {:>9.0f} {:>5.0f}/{:<5.0f} {:>7.0f}% {:>8.6f} "
+      "{:8.6f} {:12.4f} {:12.4f}")
+
+  summary_writer = tf.contrib.summary.create_file_writer(
+      config.logdir, flush_millis=10000)
+
+  with tf.device(device), \
+       summary_writer.as_default(), \
+       tf.contrib.summary.always_record_summaries():
+    with tfe.restore_variables_on_create(
+        tf.train.latest_checkpoint(config.logdir)):
+      model = SNLIClassifier(config, embed)
+      global_step = tf.train.get_or_create_global_step()
+      trainer = SNLIClassifierTrainer(model, config.lr)
+
+    if inference_sentence_pair:
+      # Inference mode.
+      with tfe.restore_variables_on_create(
+          tf.train.latest_checkpoint(config.logdir)):
+        prem, prem_trans = inference_sentence_pair[0]
+        hypo, hypo_trans = inference_sentence_pair[1]
+        hypo_trans = inference_sentence_pair[1][1]
+        inference_logits = model(  # pylint: disable=not-callable
+            tf.constant(prem), tf.constant(prem_trans),
+            tf.constant(hypo), tf.constant(hypo_trans), training=False)
+        inference_logits = np.array(inference_logits[0][1:])
+        max_index = np.argmax(inference_logits)
+        print("\nInference logits:")
+        for i, (label, logit) in enumerate(
+            zip(data.POSSIBLE_LABELS, inference_logits)):
+          winner_tag = " (winner)" if max_index == i else ""
+          print("  {0:<16}{1:.6f}{2}".format(label + ":", logit, winner_tag))
+      return inference_logits
+
+    train_len = train_data.num_batches(config.batch_size)
+    start = time.time()
+    iterations = 0
+    mean_loss = tfe.metrics.Mean()
+    accuracy = tfe.metrics.Accuracy()
+    print(log_header)
+    for epoch in xrange(config.epochs):
+      batch_idx = 0
+      for label, prem, prem_trans, hypo, hypo_trans in _get_dataset_iterator(
+          train_data, config.batch_size):
+        if use_gpu:
+          label, prem, hypo = label.gpu(), prem.gpu(), hypo.gpu()
+          # prem_trans and hypo_trans are used for dynamic control flow and can
+          # remain on CPU. Same in _evaluate_on_dataset().
+
+        iterations += 1
+        with tfe.restore_variables_on_create(
+            tf.train.latest_checkpoint(config.logdir)):
+          batch_train_loss, batch_train_logits = trainer.train_batch(
+              label, prem, prem_trans, hypo, hypo_trans)
+        batch_size = tf.shape(label)[0]
+        mean_loss(batch_train_loss.numpy(),
+                  weights=batch_size.gpu() if use_gpu else batch_size)
+        accuracy(tf.argmax(batch_train_logits, axis=1), label)
+
+        if iterations % config.save_every == 0:
+          all_variables = trainer.variables + [global_step]
+          saver = tfe.Saver(all_variables)
+          saver.save(os.path.join(config.logdir, "ckpt"),
+                     global_step=global_step)
+
+        if iterations % config.dev_every == 0:
+          dev_loss, dev_frac_correct = _evaluate_on_dataset(
+              dev_data, config.batch_size, trainer, use_gpu)
+          print(dev_log_template.format(
+              time.time() - start,
+              epoch, iterations, 1 + batch_idx, train_len,
+              100.0 * (1 + batch_idx) / train_len,
+              mean_loss.result(), dev_loss,
+              accuracy.result() * 100.0, dev_frac_correct * 100.0))
+          tf.contrib.summary.scalar("dev/loss", dev_loss)
+          tf.contrib.summary.scalar("dev/accuracy", dev_frac_correct)
+        elif iterations % config.log_every == 0:
+          mean_loss_val = mean_loss.result()
+          accuracy_val = accuracy.result()
+          print(log_template.format(
+              time.time() - start,
+              epoch, iterations, 1 + batch_idx, train_len,
+              100.0 * (1 + batch_idx) / train_len,
+              mean_loss_val, " " * 8, accuracy_val * 100.0, " " * 12))
+          tf.contrib.summary.scalar("train/loss", mean_loss_val)
+          tf.contrib.summary.scalar("train/accuracy", accuracy_val)
+          # Reset metrics.
+          mean_loss = tfe.metrics.Mean()
+          accuracy = tfe.metrics.Accuracy()
+
+        batch_idx += 1
+      if (epoch + 1) % config.lr_decay_every == 0:
+        trainer.decay_learning_rate(config.lr_decay_by)
+
+    test_loss, test_frac_correct = _evaluate_on_dataset(
+        test_data, config.batch_size, trainer, use_gpu)
+    print("Final test loss: %g; accuracy: %g%%" %
+          (test_loss, test_frac_correct * 100.0))
+
+  return trainer
+
+
+def main(_):
+  config = FLAGS
+
+  # Load embedding vectors.
+  vocab = data.load_vocabulary(FLAGS.data_root)
+  word2index, embed = data.load_word_vectors(FLAGS.data_root, vocab)
+
+  if not (config.inference_premise or config.inference_hypothesis):
+    print("Loading train, dev and test data...")
+    train_data = data.SnliData(
+        os.path.join(FLAGS.data_root, "snli/snli_1.0/snli_1.0_train.txt"),
+        word2index, sentence_len_limit=FLAGS.sentence_len_limit)
+    dev_data = data.SnliData(
+        os.path.join(FLAGS.data_root, "snli/snli_1.0/snli_1.0_dev.txt"),
+        word2index, sentence_len_limit=FLAGS.sentence_len_limit)
+    test_data = data.SnliData(
+        os.path.join(FLAGS.data_root, "snli/snli_1.0/snli_1.0_test.txt"),
+        word2index, sentence_len_limit=FLAGS.sentence_len_limit)
+  else:
+    train_data = None
+    dev_data = None
+    test_data = None
+
+  train_or_infer_spinn(
+      embed, word2index, train_data, dev_data, test_data, config)
+
+
+if __name__ == "__main__":
+  parser = argparse.ArgumentParser(
+      description=
+      "TensorFlow eager implementation of the SPINN SNLI classifier.")
+  parser.add_argument("--data_root", type=str, default="/tmp/spinn-data",
+                      help="Root directory in which the training data and "
+                      "embedding matrix are found. See README.md for how to "
+                      "generate such a directory.")
+  parser.add_argument("--sentence_len_limit", type=int, default=-1,
+                      help="Maximum allowed sentence length (# of words). "
+                      "The default of -1 means unlimited.")
+  parser.add_argument("--logdir", type=str, default="/tmp/spinn-logs",
+                      help="Directory in which summaries will be written for "
+                      "TensorBoard.")
+  parser.add_argument("--inference_premise", type=str, default=None,
+                      help="Premise sentence for inference. Must be "
+                      "accompanied by --inference_hypothesis. If specified, "
+                      "will override all training parameters and perform "
+                      "inference.")
+  parser.add_argument("--inference_hypothesis", type=str, default=None,
+                      help="Hypothesis sentence for inference. Must be "
+                      "accompanied by --inference_premise. If specified, will "
+                      "override all training parameters and perform inference.")
+  parser.add_argument("--epochs", type=int, default=50,
+                      help="Number of epochs to train.")
+  parser.add_argument("--batch_size", type=int, default=128,
+                      help="Batch size to use during training.")
+  parser.add_argument("--d_proj", type=int, default=600,
+                      help="Dimensions to project the word embedding vectors "
+                      "to.")
+  parser.add_argument("--d_hidden", type=int, default=300,
+                      help="Size of the hidden layer of the Tracker.")
+  parser.add_argument("--d_out", type=int, default=4,
+                      help="Output dimensions of the SNLIClassifier.")
+  parser.add_argument("--d_mlp", type=int, default=1024,
+                      help="Size of each layer of the multi-layer perceptron "
+                      "of the SNLICLassifier.")
+  parser.add_argument("--n_mlp_layers", type=int, default=2,
+                      help="Number of layers in the multi-layer perceptron "
+                      "of the SNLICLassifier.")
+  parser.add_argument("--d_tracker", type=int, default=64,
+                      help="Size of the tracker LSTM.")
+  parser.add_argument("--log_every", type=int, default=50,
+                      help="Print log and write TensorBoard summary every _ "
+                      "training batches.")
+  parser.add_argument("--lr", type=float, default=2e-3,
+                      help="Initial learning rate.")
+  parser.add_argument("--lr_decay_by", type=float, default=0.75,
+                      help="The ratio to multiply the learning rate by every "
+                      "time the learning rate is decayed.")
+  parser.add_argument("--lr_decay_every", type=float, default=1,
+                      help="Decay the learning rate every _ epoch(s).")
+  parser.add_argument("--dev_every", type=int, default=1000,
+                      help="Run evaluation on the dev split every _ training "
+                      "batches.")
+  parser.add_argument("--save_every", type=int, default=1000,
+                      help="Save checkpoint every _ training batches.")
+  parser.add_argument("--embed_dropout", type=float, default=0.08,
+                      help="Word embedding dropout rate.")
+  parser.add_argument("--mlp_dropout", type=float, default=0.07,
+                      help="SNLIClassifier multi-layer perceptron dropout "
+                      "rate.")
+  parser.add_argument("--no-projection", action="store_false",
+                      dest="projection",
+                      help="Whether word embedding vectors are projected to "
+                      "another set of vectors (see d_proj).")
+  parser.add_argument("--predict_transitions", action="store_true",
+                      dest="predict",
+                      help="Whether the Tracker will perform prediction.")
+  parser.add_argument("--force_cpu", action="store_true", dest="force_cpu",
+                      help="Force use CPU-only regardless of whether a GPU is "
+                      "available.")
+  FLAGS, unparsed = parser.parse_known_args()
+
+  tfe.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/third_party/fft2d/fft.h b/third_party/fft2d/fft.h
index 252cc01fec30bcfa0b6b396b92fb6a1805023baf..31b4935089dfadcbe420cdd1a7433ce6b1469357 100644
--- a/third_party/fft2d/fft.h
+++ b/third_party/fft2d/fft.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Declarations for 1D FFT routines in third_party/fft2d/fft.
 
-#ifndef THIRD_PARTY_FFT2D_FFT_H__
-#define THIRD_PARTY_FFT2D_FFT_H__
+#ifndef FFT2D_FFT_H__
+#define FFT2D_FFT_H__
 
 #ifdef __cplusplus
 extern "C" {
@@ -33,4 +33,4 @@ extern void dfst(int, double *, double *, int *, double *);
 }
 #endif
 
-#endif  // THIRD_PARTY_FFT2D_FFT_H__
+#endif  // FFT2D_FFT_H__
diff --git a/third_party/flatbuffers/flatbuffers.BUILD b/third_party/flatbuffers/flatbuffers.BUILD
index 0a76adcf9189b7c874ee76aad737cd0b0a1dc609..824c97be60e7ef148a363b964ed330ba3c5fcb0c 100644
--- a/third_party/flatbuffers/flatbuffers.BUILD
+++ b/third_party/flatbuffers/flatbuffers.BUILD
@@ -4,6 +4,14 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
+exports_files(["LICENSE.txt"])
+
+config_setting(
+    name = "freebsd",
+    values = {"cpu": "freebsd"},
+    visibility = ["//visibility:public"],
+)
+
 FLATBUFFERS_COPTS = [
     "-fexceptions",
 ] + select({
@@ -107,10 +115,15 @@ cc_binary(
         "grpc/",
         "include/",
     ],
-    linkopts = [
-        "-lm",
-        "-ldl",
-    ],
+    linkopts = select({
+        ":freebsd": [
+            "-lm",
+        ],
+        "//conditions:default": [
+            "-lm",
+            "-ldl",
+        ],
+    }),
     deps = [
         ":flatc_library",
     ],
diff --git a/third_party/gast.BUILD b/third_party/gast.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..4866982e1fda6d6f19e575c8b0c0273cb9de154b
--- /dev/null
+++ b/third_party/gast.BUILD
@@ -0,0 +1,19 @@
+# Description:
+#   Python AST that abstracts the underlying Python version.
+
+licenses(["notice"])  # BSD 3-clause
+
+exports_files(["PKG-INFO"])
+
+py_library(
+    name = "gast",
+    srcs = [
+        "gast/__init__.py",
+        "gast/ast2.py",
+        "gast/ast3.py",
+        "gast/astn.py",
+        "gast/gast.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/gif.BUILD b/third_party/gif.BUILD
index 27808a9d645e93644a8c2fac40974306dad444a7..78fbd6c0e098512d01478eba70fe614f0266c317 100644
--- a/third_party/gif.BUILD
+++ b/third_party/gif.BUILD
@@ -21,7 +21,7 @@ cc_library(
     ],
     hdrs = ["lib/gif_lib.h"],
     defines = select({
-        #"@%ws%//tensorflow:android": [
+        #"@org_tensorflow//tensorflow:android": [
         ":android": [
             "S_IREAD=S_IRUSR",
             "S_IWRITE=S_IWUSR",
diff --git a/third_party/git/BUILD b/third_party/git/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/third_party/git/BUILD.tpl b/third_party/git/BUILD.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..7b031e74d58207d1d0476cb8a252e2d19929e577
--- /dev/null
+++ b/third_party/git/BUILD.tpl
@@ -0,0 +1,10 @@
+# Description:
+# Exports generated files used to generate tensorflow/core/util/version_info.cc
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])
+
+exports_files(
+    glob(["gen/*"]),
+)
diff --git a/third_party/git/git_configure.bzl b/third_party/git/git_configure.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..8e2839bdc254acb42cf551cf561c40f8402c311a
--- /dev/null
+++ b/third_party/git/git_configure.bzl
@@ -0,0 +1,60 @@
+"""Repository rule for Git autoconfiguration.
+
+`git_configure` depends on the following environment variables:
+
+  * `PYTHON_BIN_PATH`: location of python binary.
+"""
+
+_PYTHON_BIN_PATH = "PYTHON_BIN_PATH"
+
+def _fail(msg):
+  """Output failure message when auto configuration fails."""
+  red = "\033[0;31m"
+  no_color = "\033[0m"
+  fail("%sGit Configuration Error:%s %s\n" % (red, no_color, msg))
+
+def _get_python_bin(repository_ctx):
+  """Gets the python bin path."""
+  python_bin = repository_ctx.os.environ.get(_PYTHON_BIN_PATH)
+  if python_bin != None:
+    return python_bin
+  python_bin_path = repository_ctx.which("python")
+  if python_bin_path != None:
+    return str(python_bin_path)
+  _fail("Cannot find python in PATH, please make sure " +
+        "python is installed and add its directory in PATH, or --define " +
+        "%s='/something/else'.\nPATH=%s" % (
+            _PYTHON_BIN_PATH, repository_ctx.os.environ.get("PATH", "")))
+
+
+def _git_conf_impl(repository_ctx):
+  repository_ctx.template(
+      "BUILD",
+      Label("//third_party/git:BUILD.tpl"))
+
+  tensorflow_root_path = str(repository_ctx.path(
+      Label("@org_tensorflow//:BUILD")))[:-len("BUILD")]
+  python_script_path = repository_ctx.path(
+      Label("@org_tensorflow//tensorflow/tools/git:gen_git_source.py"))
+  generated_files_path = repository_ctx.path("gen")
+
+  r = repository_ctx.execute(
+      ["test", "-f", "%s/.git/logs/HEAD" % tensorflow_root_path])
+  if r.return_code == 0:
+    unused_var = repository_ctx.path(Label("//:.git/HEAD")) # pylint: disable=unused-variable
+
+  result = repository_ctx.execute([
+      _get_python_bin(repository_ctx),
+      python_script_path, "--configure", tensorflow_root_path,
+      "--gen_root_path", generated_files_path], quiet=False)
+
+  if not result.return_code == 0:
+    _fail(result.stderr)
+
+
+git_configure = repository_rule(
+    implementation = _git_conf_impl,
+    environ = [
+        _PYTHON_BIN_PATH,
+    ],
+)
diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl
index b752734a08a1ac7a60582ebd7e60ec3c1564f353..2a37c65bc74a0ec5d0f5b2c9a6dd4339e0e46b68 100644
--- a/third_party/gpus/cuda/BUILD.tpl
+++ b/third_party/gpus/cuda/BUILD.tpl
@@ -46,6 +46,7 @@ cc_library(
     includes = [
         ".",
         "cuda/include",
+        "cuda/include/crt",
     ],
     visibility = ["//visibility:public"],
 )
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 31a4bfabf6e05e6a8bcb60bbc1ed18f1754826c3..b7c47a19ddcfc69dbee54bf6ca4080489b292c01 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -8,6 +8,9 @@
   * `TF_CUDA_CLANG`: Whether to use clang as a cuda compiler.
   * `CLANG_CUDA_COMPILER_PATH`: The clang compiler path that will be used for
     both host and device code compilation if TF_CUDA_CLANG is 1.
+  * `TF_DOWNLOAD_CLANG`: Whether to download a recent release of clang
+    compiler and use it to build tensorflow. When this option is set
+    CLANG_CUDA_COMPILER_PATH is ignored.
   * `CUDA_TOOLKIT_PATH`: The path to the CUDA toolkit. Default is
     `/usr/local/cuda`.
   * `TF_CUDA_VERSION`: The version of the CUDA toolkit. If this is blank, then
@@ -27,6 +30,7 @@ _TF_CUDNN_VERSION = "TF_CUDNN_VERSION"
 _CUDNN_INSTALL_PATH = "CUDNN_INSTALL_PATH"
 _TF_CUDA_COMPUTE_CAPABILITIES = "TF_CUDA_COMPUTE_CAPABILITIES"
 _TF_CUDA_CONFIG_REPO = "TF_CUDA_CONFIG_REPO"
+_TF_DOWNLOAD_CLANG = "TF_DOWNLOAD_CLANG"
 
 _DEFAULT_CUDA_VERSION = ""
 _DEFAULT_CUDNN_VERSION = ""
@@ -34,6 +38,7 @@ _DEFAULT_CUDA_TOOLKIT_PATH = "/usr/local/cuda"
 _DEFAULT_CUDNN_INSTALL_PATH = "/usr/local/cuda"
 _DEFAULT_CUDA_COMPUTE_CAPABILITIES = ["3.5", "5.2"]
 
+load(":download_clang.bzl", "download_clang")
 
 # TODO(dzc): Once these functions have been factored out of Bazel's
 # cc_configure.bzl, load them from @bazel_tools instead.
@@ -48,6 +53,8 @@ def find_cc(repository_ctx):
   if _use_cuda_clang(repository_ctx):
     target_cc_name = "clang"
     cc_path_envvar = _CLANG_CUDA_COMPILER_PATH
+    if _flag_enabled(repository_ctx, _TF_DOWNLOAD_CLANG):
+      return "extra_tools/bin/clang"
   else:
     target_cc_name = "gcc"
     cc_path_envvar = _GCC_HOST_COMPILER_PATH
@@ -80,17 +87,30 @@ def _cxx_inc_convert(path):
     path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip()
   return path
 
+
+def _normalize_include_path(repository_ctx, path):
+  """Normalizes include paths before writing them to the crosstool.
+
+  If path points inside the 'crosstool' folder of the repository, a relative
+  path is returned.
+  If path points outside the 'crosstool' folder, an absolute path is returned.
+  """
+  path = str(repository_ctx.path(path))
+  crosstool_folder = str(repository_ctx.path(".").get_child('crosstool'))
+
+  if path.startswith(crosstool_folder):
+    # We drop the path to "$REPO/crosstool" and a trailing path separator.
+    return path[len(crosstool_folder)+1:]
+  return path
+
+
 def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp):
   """Compute the list of default C or C++ include directories."""
   if lang_is_cpp:
     lang = "c++"
   else:
     lang = "c"
-  # TODO: We pass -no-canonical-prefixes here to match the compiler flags,
-  #       but in cuda_clang CROSSTOOL file that is a `feature` and we should
-  #       handle the case when it's disabled and no flag is passed
-  result = repository_ctx.execute([cc, "-no-canonical-prefixes",
-                                   "-E", "-x" + lang, "-", "-v"])
+  result = repository_ctx.execute([cc, "-E", "-x" + lang, "-", "-v"])
   index1 = result.stderr.find(_INC_DIR_MARKER_BEGIN)
   if index1 == -1:
     return []
@@ -106,8 +126,11 @@ def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp):
   else:
     inc_dirs = result.stderr[index1 + 1:index2].strip()
 
-  return [str(repository_ctx.path(_cxx_inc_convert(p)))
-          for p in inc_dirs.split("\n")]
+  return [
+      _normalize_include_path(repository_ctx, _cxx_inc_convert(p))
+      for p in inc_dirs.split("\n")
+  ]
+
 
 def get_cxx_inc_directories(repository_ctx, cc):
   """Compute the list of default C and C++ include directories."""
@@ -213,7 +236,7 @@ def _cudnn_install_basedir(repository_ctx):
   return cudnn_install_path
 
 
-def _matches_version(environ_version, detected_version):
+def matches_version(environ_version, detected_version):
   """Checks whether the user-specified version matches the detected version.
 
   This function performs a weak matching so that if the user specifies only the
@@ -294,7 +317,7 @@ def _cuda_version(repository_ctx, cuda_toolkit_path, cpu_value):
   environ_version = ""
   if _TF_CUDA_VERSION in repository_ctx.os.environ:
     environ_version = repository_ctx.os.environ[_TF_CUDA_VERSION].strip()
-  if environ_version and not _matches_version(environ_version, full_version):
+  if environ_version and not matches_version(environ_version, full_version):
     auto_configure_fail(
         ("CUDA version detected from nvcc (%s) does not match " +
          "TF_CUDA_VERSION (%s)") % (full_version, environ_version))
@@ -315,35 +338,58 @@ _DEFINE_CUDNN_MINOR = "#define CUDNN_MINOR"
 _DEFINE_CUDNN_PATCHLEVEL = "#define CUDNN_PATCHLEVEL"
 
 
-def _find_cuda_define(repository_ctx, cudnn_header_dir, define):
-  """Returns the value of a #define in cudnn.h
+def find_cuda_define(repository_ctx, header_dir, header_file, define):
+  """Returns the value of a #define in a header file.
 
-  Greps through cudnn.h and returns the value of the specified #define. If the
-  #define is not found, then raise an error.
+  Greps through a header file and returns the value of the specified #define.
+  If the #define is not found, then raise an error.
 
   Args:
     repository_ctx: The repository context.
-    cudnn_header_dir: The directory containing the cuDNN header.
+    header_dir: The directory containing the header file.
+    header_file: The header file name.
     define: The #define to search for.
 
   Returns:
-    The value of the #define found in cudnn.h.
+    The value of the #define found in the header.
   """
-  # Confirm location of cudnn.h and grep for the line defining CUDNN_MAJOR.
-  cudnn_h_path = repository_ctx.path("%s/cudnn.h" % cudnn_header_dir)
-  if not cudnn_h_path.exists:
-    auto_configure_fail("Cannot find cudnn.h at %s" % str(cudnn_h_path))
-  result = repository_ctx.execute(["grep", "--color=never", "-E", define, str(cudnn_h_path)])
+  # Confirm location of the header and grep for the line defining the macro.
+  h_path = repository_ctx.path("%s/%s" % (header_dir, header_file))
+  if not h_path.exists:
+    auto_configure_fail("Cannot find %s at %s" % (header_file, str(h_path)))
+  result = repository_ctx.execute(
+      # Grep one more lines as some #defines are splitted into two lines.
+      ["grep", "--color=never", "-A1", "-E", define, str(h_path)])
   if result.stderr:
-    auto_configure_fail("Error reading %s: %s" %
-                        (result.stderr, str(cudnn_h_path)))
+    auto_configure_fail("Error reading %s: %s" % (str(h_path), result.stderr))
 
-  # Parse the cuDNN major version from the line defining CUDNN_MAJOR
-  lines = result.stdout.splitlines()
-  if len(lines) == 0 or lines[0].find(define) == -1:
+  # Parse the version from the line defining the macro.
+  if result.stdout.find(define) == -1:
     auto_configure_fail("Cannot find line containing '%s' in %s" %
-                        (define, str(cudnn_h_path)))
-  return lines[0].replace(define, "").strip()
+                        (define, h_path))
+  # Split results to lines
+  lines = result.stdout.split('\n')
+  num_lines = len(lines)
+  for l in range(num_lines):
+    line = lines[l]
+    if define in line:  # Find the line with define
+      version = line
+      if l != num_lines-1 and line[-1] == '\\':  # Add next line, if multiline
+        version = version[:-1] + lines[l+1]
+      break
+  # Remove any comments
+  version = version.split("//")[0]
+  # Remove define name
+  version = version.replace(define, "").strip()
+  # Remove the code after the version number.
+  version_end = version.find(" ")
+  if version_end != -1:
+    if version_end == 0:
+      auto_configure_fail(
+          "Cannot extract the version from line containing '%s' in %s" %
+          (define, str(h_path)))
+    version = version[:version_end].strip()
+  return version
 
 
 def _cudnn_version(repository_ctx, cudnn_install_basedir, cpu_value):
@@ -359,12 +405,12 @@ def _cudnn_version(repository_ctx, cudnn_install_basedir, cpu_value):
   """
   cudnn_header_dir = _find_cudnn_header_dir(repository_ctx,
                                             cudnn_install_basedir)
-  major_version = _find_cuda_define(repository_ctx, cudnn_header_dir,
-                                    _DEFINE_CUDNN_MAJOR)
-  minor_version = _find_cuda_define(repository_ctx, cudnn_header_dir,
-                                    _DEFINE_CUDNN_MINOR)
-  patch_version = _find_cuda_define(repository_ctx, cudnn_header_dir,
-                                    _DEFINE_CUDNN_PATCHLEVEL)
+  major_version = find_cuda_define(
+      repository_ctx, cudnn_header_dir, "cudnn.h", _DEFINE_CUDNN_MAJOR)
+  minor_version = find_cuda_define(
+      repository_ctx, cudnn_header_dir, "cudnn.h", _DEFINE_CUDNN_MINOR)
+  patch_version = find_cuda_define(
+      repository_ctx, cudnn_header_dir, "cudnn.h", _DEFINE_CUDNN_PATCHLEVEL)
   full_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
 
   # Check whether TF_CUDNN_VERSION was set by the user and fail if it does not
@@ -372,7 +418,7 @@ def _cudnn_version(repository_ctx, cudnn_install_basedir, cpu_value):
   environ_version = ""
   if _TF_CUDNN_VERSION in repository_ctx.os.environ:
     environ_version = repository_ctx.os.environ[_TF_CUDNN_VERSION].strip()
-  if environ_version and not _matches_version(environ_version, full_version):
+  if environ_version and not matches_version(environ_version, full_version):
     cudnn_h_path = repository_ctx.path("%s/include/cudnn.h" %
                                        cudnn_install_basedir)
     auto_configure_fail(
@@ -404,7 +450,7 @@ def _compute_capabilities(repository_ctx):
   return capabilities
 
 
-def _cpu_value(repository_ctx):
+def get_cpu_value(repository_ctx):
   """Returns the name of the host operating system.
 
   Args:
@@ -424,7 +470,7 @@ def _cpu_value(repository_ctx):
 
 def _is_windows(repository_ctx):
   """Returns true if the host operating system is windows."""
-  return _cpu_value(repository_ctx) == "Windows"
+  return get_cpu_value(repository_ctx) == "Windows"
 
 def _lib_name(lib, cpu_value, version="", static=False):
   """Constructs the platform-specific name of a library.
@@ -559,11 +605,8 @@ def _find_libs(repository_ctx, cuda_config):
     cuda_config: The CUDA config as returned by _get_cuda_config
 
   Returns:
-    Map of library names to structs of filename and path as returned by
-    _find_cuda_lib and _find_cupti_lib.
+    Map of library names to structs of filename and path.
   """
-  cudnn_version = cuda_config.cudnn_version
-  cudnn_ext = ".%s" % cudnn_version if cudnn_version else ""
   cpu_value = cuda_config.cpu_value
   return {
       "cuda": _find_cuda_lib("cuda", repository_ctx, cpu_value, cuda_config.cuda_toolkit_path),
@@ -588,7 +631,7 @@ def _find_libs(repository_ctx, cuda_config):
       "cudnn": _find_cuda_lib(
           "cudnn", repository_ctx, cpu_value, cuda_config.cudnn_install_basedir,
           cuda_config.cudnn_version),
-      "cupti": _find_cupti_lib(repository_ctx, cuda_config),
+      "cupti": _find_cupti_lib(repository_ctx, cuda_config)
   }
 
 
@@ -612,30 +655,6 @@ def _find_cudnn_header_dir(repository_ctx, cudnn_install_basedir):
   auto_configure_fail("Cannot find cudnn.h under %s" % cudnn_install_basedir)
 
 
-def _find_cudnn_lib_path(repository_ctx, cudnn_install_basedir, symlink_files):
-  """Returns the path to the directory containing libcudnn
-
-  Args:
-    repository_ctx: The repository context.
-    cudnn_install_basedir: The cudnn install dir as returned by
-      _cudnn_install_basedir.
-    symlink_files: The symlink files as returned by _cuda_symlink_files.
-
-  Returns:
-    The path of the directory containing the cudnn libraries.
-  """
-  lib_dir = cudnn_install_basedir + "/" + symlink_files.cuda_dnn_lib
-  if repository_ctx.path(lib_dir).exists:
-    return lib_dir
-  alt_lib_dir = cudnn_install_basedir + "/" + symlink_files.cuda_dnn_lib_alt
-  if repository_ctx.path(alt_lib_dir).exists:
-    return alt_lib_dir
-
-  auto_configure_fail("Cannot find %s or %s under %s" %
-       (symlink_files.cuda_dnn_lib, symlink_files.cuda_dnn_lib_alt,
-        cudnn_install_basedir))
-
-
 def _cudart_static_linkopt(cpu_value):
   """Returns additional platform-specific linkopts for cudart."""
   return "" if cpu_value == "Darwin" else "\"-lrt\","
@@ -655,7 +674,7 @@ def _get_cuda_config(repository_ctx):
       compute_capabilities: A list of the system's CUDA compute capabilities.
       cpu_value: The name of the host operating system.
   """
-  cpu_value = _cpu_value(repository_ctx)
+  cpu_value = get_cpu_value(repository_ctx)
   cuda_toolkit_path = _cuda_toolkit_path(repository_ctx)
   cuda_version = _cuda_version(repository_ctx, cuda_toolkit_path, cpu_value)
   cudnn_install_basedir = _cudnn_install_basedir(repository_ctx)
@@ -713,13 +732,13 @@ error_gpu_disabled()
 
 
 def _create_dummy_repository(repository_ctx):
-  cpu_value = _cpu_value(repository_ctx)
+  cpu_value = get_cpu_value(repository_ctx)
 
   # Set up BUILD file for cuda/.
   _tpl(repository_ctx, "cuda:build_defs.bzl",
        {
            "%{cuda_is_configured}": "False",
-           "%{cuda_extra_copts}": "[]"
+           "%{cuda_extra_copts}": "[]",
        })
   _tpl(repository_ctx, "cuda:BUILD",
        {
@@ -806,8 +825,8 @@ def _norm_path(path):
   return path
 
 
-def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
-    src_files = [], dest_files = []):
+def symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
+                            src_files = [], dest_files = []):
   """Returns a genrule to symlink(or copy if on Windows) a set of files.
 
   If src_dir is passed, files will be read from the given directory; otherwise
@@ -816,7 +835,7 @@ def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
   if src_dir != None:
     src_dir = _norm_path(src_dir)
     dest_dir = _norm_path(dest_dir)
-    files = _read_dir(repository_ctx, src_dir)
+    files = '\n'.join(sorted(_read_dir(repository_ctx, src_dir).splitlines()))
     # Create a list with the src_dir stripped to use for outputs.
     dest_files = files.replace(src_dir, '').splitlines()
     src_files = files.splitlines()
@@ -884,12 +903,14 @@ def _read_dir(repository_ctx, src_dir):
     result = find_result.stdout
   return result
 
+def _flag_enabled(repository_ctx, flag_name):
+  if flag_name in repository_ctx.os.environ:
+    value = repository_ctx.os.environ[flag_name].strip()
+    return value == "1"
+  return False
 
 def _use_cuda_clang(repository_ctx):
-  if "TF_CUDA_CLANG" in repository_ctx.os.environ:
-    enable_cuda = repository_ctx.os.environ["TF_CUDA_CLANG"].strip()
-    return enable_cuda == "1"
-  return False
+  return _flag_enabled(repository_ctx, "TF_CUDA_CLANG")
 
 def _compute_cuda_extra_copts(repository_ctx, compute_capabilities):
   if _use_cuda_clang(repository_ctx):
@@ -912,11 +933,11 @@ def _create_local_cuda_repository(repository_ctx):
   # cuda_toolkit_path
   cuda_toolkit_path = cuda_config.cuda_toolkit_path
   cuda_include_path = cuda_toolkit_path + "/include"
-  genrules = [_symlink_genrule_for_dir(repository_ctx,
+  genrules = [symlink_genrule_for_dir(repository_ctx,
       cuda_include_path, "cuda/include", "cuda-include")]
-  genrules.append(_symlink_genrule_for_dir(repository_ctx,
+  genrules.append(symlink_genrule_for_dir(repository_ctx,
       cuda_toolkit_path + "/nvvm", "cuda/nvvm", "cuda-nvvm"))
-  genrules.append(_symlink_genrule_for_dir(repository_ctx,
+  genrules.append(symlink_genrule_for_dir(repository_ctx,
       cuda_toolkit_path + "/extras/CUPTI/include",
       "cuda/extras/CUPTI/include", "cuda-extras"))
 
@@ -926,15 +947,15 @@ def _create_local_cuda_repository(repository_ctx):
   for lib in cuda_libs.values():
     cuda_lib_src.append(lib.path)
     cuda_lib_dest.append("cuda/lib/" + lib.file_name)
-  genrules.append(_symlink_genrule_for_dir(repository_ctx, None, "", "cuda-lib",
-                                       cuda_lib_src, cuda_lib_dest))
+  genrules.append(symlink_genrule_for_dir(repository_ctx, None, "", "cuda-lib",
+                                          cuda_lib_src, cuda_lib_dest))
 
-  # Set up the symbolic links for cudnn if cudnn was was not installed to
+  # Set up the symbolic links for cudnn if cndnn was not installed to
   # CUDA_TOOLKIT_PATH.
   included_files = _read_dir(repository_ctx, cuda_include_path).replace(
       cuda_include_path, '').splitlines()
   if '/cudnn.h' not in included_files:
-    genrules.append(_symlink_genrule_for_dir(repository_ctx, None,
+    genrules.append(symlink_genrule_for_dir(repository_ctx, None,
         "cuda/include/", "cudnn-include", [cudnn_header_dir + "/cudnn.h"],
         ["cudnn.h"]))
   else:
@@ -951,7 +972,6 @@ def _create_local_cuda_repository(repository_ctx):
            "%{cuda_is_configured}": "True",
            "%{cuda_extra_copts}": _compute_cuda_extra_copts(
                repository_ctx, cuda_config.compute_capabilities),
-
        })
   _tpl(repository_ctx, "cuda:BUILD",
        {
@@ -970,15 +990,25 @@ def _create_local_cuda_repository(repository_ctx):
            "%{cuda_headers}": ('":cuda-include",\n' +
                                '        ":cudnn-include",')
        })
+
+  is_cuda_clang = _use_cuda_clang(repository_ctx)
+
+  should_download_clang = is_cuda_clang and _flag_enabled(
+      repository_ctx, _TF_DOWNLOAD_CLANG)
+  if should_download_clang:
+    download_clang(repository_ctx, "crosstool/extra_tools")
+
   # Set up crosstool/
   cc = find_cc(repository_ctx)
-  host_compiler_includes = _host_compiler_includes(repository_ctx, cc)
+  cc_fullpath = cc if not should_download_clang else "crosstool/" + cc
+
+  host_compiler_includes = _host_compiler_includes(repository_ctx, cc_fullpath)
   cuda_defines = {
            "%{cuda_include_path}": _cuda_include_path(repository_ctx,
                                                       cuda_config),
            "%{host_compiler_includes}": host_compiler_includes,
        }
-  if _use_cuda_clang(repository_ctx):
+  if is_cuda_clang:
     cuda_defines["%{clang_path}"] = cc
     _tpl(repository_ctx, "crosstool:BUILD", {"%{linker_files}": ":empty"})
     _tpl(repository_ctx, "crosstool:CROSSTOOL_clang", cuda_defines, out="crosstool/CROSSTOOL")
@@ -1046,7 +1076,10 @@ cuda_configure = repository_rule(
     implementation = _cuda_autoconf_impl,
     environ = [
         _GCC_HOST_COMPILER_PATH,
+        _CLANG_CUDA_COMPILER_PATH,
         "TF_NEED_CUDA",
+        "TF_CUDA_CLANG",
+        _TF_DOWNLOAD_CLANG,
         _CUDA_TOOLKIT_PATH,
         _CUDNN_INSTALL_PATH,
         _TF_CUDA_VERSION,
diff --git a/third_party/gpus/download_clang.bzl b/third_party/gpus/download_clang.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..54d383d7d76513cdcf22d14c9b5cf61344e4c26f
--- /dev/null
+++ b/third_party/gpus/download_clang.bzl
@@ -0,0 +1,54 @@
+""" Helpers to download a recent clang release."""
+
+def _get_platform_folder(os_name):
+  os_name = os_name.lower()
+  if os_name.startswith('windows'):
+    return 'Win'
+  if os_name.startswith('mac os'):
+    return 'Mac'
+  if not os_name.startswith('linux'):
+    fail('Unknown platform')
+  return 'Linux_x64'
+
+def _download_chromium_clang(repo_ctx, platform_folder, package_version, sha256,
+                             out_folder):
+  cds_url = 'https://commondatastorage.googleapis.com/chromium-browser-clang'
+  cds_file = 'clang-%s.tgz' % package_version
+  cds_full_url = '{0}/{1}/{2}'.format(cds_url, platform_folder, cds_file)
+  repo_ctx.download_and_extract(cds_full_url, output=out_folder, sha256=sha256)
+
+def download_clang(repo_ctx, out_folder):
+  """ Download a fresh clang release and put it into out_folder.
+
+  Clang itself will be located in 'out_folder/bin/clang'.
+  We currently download one of the latest releases of clang by the
+  Chromium project (see
+  https://chromium.googlesource.com/chromium/src/+/master/docs/clang.md).
+
+  Args:
+    repo_ctx: An instance of repository_context object.
+    out_folder: A folder to extract the compiler into.
+  """
+  # TODO(ibiryukov): we currently download and extract some extra tools in the
+  # clang release (e.g., sanitizers). We should probably remove the ones
+  # we don't need and document the ones we want provide in addition to clang.
+
+  # Latest CLANG_REVISION and CLANG_SUB_REVISION of the Chromiums's release
+  # can be found in https://chromium.googlesource.com/chromium/src/tools/clang/+/master/scripts/update.py
+  CLANG_REVISION = '321529'
+  CLANG_SUB_REVISION = 2
+
+  package_version = '%s-%s' % (CLANG_REVISION, CLANG_SUB_REVISION)
+
+  checksums = {
+      'Linux_x64':
+          '76d4eb1ad011e3127c4a9de9b9f5d4ac624b5a9395c4d7395c9e0a487b13daf6',
+      'Mac':
+          '4b2a7a65ac1ee892b318c723eec8771f514bb306f346aa8216bb0006f19d87b7',
+      'Win':
+          'eba51bb8f84af41a85903113666bd21c22709010c39c4cb19dc20cf1ed14581b',
+  }
+
+  platform_folder = _get_platform_folder(repo_ctx.os.name)
+  _download_chromium_clang(repo_ctx, platform_folder, package_version,
+                           checksums[platform_folder], out_folder)
diff --git a/third_party/jemalloc.BUILD b/third_party/jemalloc.BUILD
index a2addf2c66bc3aa396455ab34208d6ef756b70f2..1b0829b8fea64c74fa9b462c0716cef6385dad96 100644
--- a/third_party/jemalloc.BUILD
+++ b/third_party/jemalloc.BUILD
@@ -5,7 +5,7 @@ licenses(["notice"])  # BSD
 
 exports_files(["COPYING"])
 
-load("@%ws%//third_party:common.bzl", "template_rule")
+load("@org_tensorflow//third_party:common.bzl", "template_rule")
 
 cc_library(
     name = "jemalloc_headers",
@@ -97,10 +97,10 @@ cc_library(
     includes = ["include"],
     # pthread_atfork() is called for PPC.
     linkopts = select({
-        "@%ws%//tensorflow:linux_ppc64le": [
+        "@org_tensorflow//tensorflow:linux_ppc64le": [
             "-lpthread",
         ],
-        "@%ws%//tensorflow:linux_x86_64": [
+        "@org_tensorflow//tensorflow:linux_x86_64": [
             "-lpthread",
         ],
         "//conditions:default": [
@@ -208,8 +208,8 @@ genrule(
     name = "size_classes_h",
     outs = ["include/jemalloc/internal/size_classes.h"],
     cmd = select({
-        "@%ws%//tensorflow:linux_ppc64le": "$(location :size_classes_sh) \"3 4\" 3 16 2 >$@",
-        "@%ws%//tensorflow:linux_x86_64": "$(location :size_classes_sh) \"3 4\" 3 12 2 >$@",
+        "@org_tensorflow//tensorflow:linux_ppc64le": "$(location :size_classes_sh) \"3 4\" 3 16 2 >$@",
+        "@org_tensorflow//tensorflow:linux_x86_64": "$(location :size_classes_sh) \"3 4\" 3 12 2 >$@",
         "//conditions:default": "$(location :size_classes_sh) \"3 4\" 3 12 2 >$@",
     }),
     tools = [":size_classes_sh"],
diff --git a/third_party/jpeg/jpeg.BUILD b/third_party/jpeg/jpeg.BUILD
index f6078052ecedd71b9af29eae628529c9045781f7..87a23925c4316c3ee107af77272300e34b1bb257 100644
--- a/third_party/jpeg/jpeg.BUILD
+++ b/third_party/jpeg/jpeg.BUILD
@@ -5,7 +5,7 @@ licenses(["notice"])  # custom notice-style license, see LICENSE.md
 
 exports_files(["LICENSE.md"])
 
-load("@%ws%//third_party:common.bzl", "template_rule")
+load("@org_tensorflow//third_party:common.bzl", "template_rule")
 
 libjpegturbo_nocopts = "-[W]error"
 
@@ -34,6 +34,10 @@ libjpegturbo_copts = select({
         "-mfloat-abi=softfp",
         "-fprefetch-loop-arrays",
     ],
+    ":linux_ppc64le": [
+        "-mcpu=power8",
+        "-mtune=power8",
+    ],
     "//conditions:default": [],
 })
 
@@ -123,10 +127,50 @@ cc_library(
         ":k8": [":simd_x86_64"],
         ":armeabi-v7a": [":simd_armv7a"],
         ":arm64-v8a": [":simd_armv8a"],
+        ":linux_ppc64le": [":simd_altivec"],
         "//conditions:default": [":simd_none"],
     }),
 )
 
+cc_library(
+    name = "simd_altivec",
+    srcs = [
+        "jchuff.h",
+        "jconfig.h",
+        "jdct.h",
+        "jerror.h",
+        "jinclude.h",
+        "jmorecfg.h",
+        "jpegint.h",
+        "jpeglib.h",
+        "jsimd.h",
+        "jsimddct.h",
+        "simd/jccolor-altivec.c",
+        "simd/jcgray-altivec.c",
+        "simd/jcsample.h",
+        "simd/jcsample-altivec.c",
+        "simd/jdcolor-altivec.c",
+        "simd/jdmerge-altivec.c",
+        "simd/jdsample-altivec.c",
+        "simd/jfdctfst-altivec.c",
+        "simd/jfdctint-altivec.c",
+        "simd/jidctfst-altivec.c",
+        "simd/jidctint-altivec.c",
+        "simd/jquanti-altivec.c",
+        "simd/jsimd.h",
+        "simd/jsimd_altivec.h",
+        "simd/jsimd_powerpc.c",
+    ],
+    hdrs = [
+        "simd/jccolext-altivec.c",  # should have been named .inc
+        "simd/jcgryext-altivec.c",  # should have been named .inc
+        "simd/jdcolext-altivec.c",  # should have been named .inc
+        "simd/jdmrgext-altivec.c",  # should have been named .inc
+    ],
+    copts = libjpegturbo_copts,
+    nocopts = libjpegturbo_nocopts,
+)
+
 cc_library(
     name = "simd_x86_64",
     srcs = [
@@ -219,7 +263,7 @@ genrule(
           "    -o $$out" +
           "    $$(dirname $(location simd/jdct.inc))/$$(basename $${out%.o}.asm)\n" +
           "done",
-    tools = ["@nasm//:nasm"],
+    tools = ["@nasm"],
 )
 
 cc_library(
@@ -323,14 +367,18 @@ JCONFIG_NOWIN_COMMON_SUBSTITUTIONS = {
     "#undef RIGHT_SHIFT_IS_UNSIGNED": "",
 }
 
-JCONFIG_NOWIN_SIMD_SUBSTITUTIONS = JCONFIG_NOWIN_COMMON_SUBSTITUTIONS + {
+JCONFIG_NOWIN_SIMD_SUBSTITUTIONS = {
     "#undef WITH_SIMD": "#define WITH_SIMD 1",
 }
 
-JCONFIG_NOWIN_NOSIMD_SUBSTITUTIONS = JCONFIG_NOWIN_COMMON_SUBSTITUTIONS + {
+JCONFIG_NOWIN_NOSIMD_SUBSTITUTIONS = {
     "#undef WITH_SIMD": "",
 }
 
+JCONFIG_NOWIN_SIMD_SUBSTITUTIONS.update(JCONFIG_NOWIN_COMMON_SUBSTITUTIONS)
+
+JCONFIG_NOWIN_NOSIMD_SUBSTITUTIONS.update(JCONFIG_NOWIN_COMMON_SUBSTITUTIONS)
+
 template_rule(
     name = "jconfig_nowin_nosimd",
     src = "jconfig.h.in",
@@ -377,6 +425,7 @@ genrule(
         ":k8": "cp $(location jconfig_nowin_simd.h) $@",
         ":armeabi-v7a": "cp $(location jconfig_nowin_simd.h) $@",
         ":arm64-v8a": "cp $(location jconfig_nowin_simd.h) $@",
+        ":linux_ppc64le": "cp $(location jconfig_nowin_simd.h) $@",
         "//conditions:default": "cp $(location jconfig_nowin_nosimd.h) $@",
     }),
 )
@@ -494,3 +543,8 @@ config_setting(
     name = "windows_msvc",
     values = {"cpu": "x64_windows_msvc"},
 )
+
+config_setting(
+    name = "linux_ppc64le",
+    values = {"cpu": "ppc"},
+)
diff --git a/third_party/jsoncpp.BUILD b/third_party/jsoncpp.BUILD
index ce672a72ec7c12fa5c7d99f9105f4d109041d35c..65f98410b289a7e324c9ed89e33de1c6010fa21a 100644
--- a/third_party/jsoncpp.BUILD
+++ b/third_party/jsoncpp.BUILD
@@ -22,6 +22,7 @@ cc_library(
         "include/json/value.h",
         "include/json/writer.h",
     ],
+    copts = ["-DJSON_USE_EXCEPTION=0"],
     includes = ["include"],
     visibility = ["//visibility:public"],
     deps = [":private"],
diff --git a/third_party/kafka/BUILD b/third_party/kafka/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a61a9e1f6c2b29ad3b992e810c0cab463dfd7feb
--- /dev/null
+++ b/third_party/kafka/BUILD
@@ -0,0 +1,147 @@
+# Description:
+#   Kafka C/C++ (librdkafka) client library
+
+licenses(["notice"])  # 2-clause BSD license
+
+exports_files(["LICENSE"])
+
+cc_library(
+    name = "kafka",
+    srcs = [
+        "config.h",
+        "src-cpp/ConfImpl.cpp",
+        "src-cpp/ConsumerImpl.cpp",
+        "src-cpp/HandleImpl.cpp",
+        "src-cpp/KafkaConsumerImpl.cpp",
+        "src-cpp/MessageImpl.cpp",
+        "src-cpp/MetadataImpl.cpp",
+        "src-cpp/QueueImpl.cpp",
+        "src-cpp/RdKafka.cpp",
+        "src-cpp/TopicImpl.cpp",
+        "src-cpp/TopicPartitionImpl.cpp",
+        "src/crc32c.c",
+        "src/crc32c.h",
+        "src/lz4.c",
+        "src/lz4.h",
+        "src/lz4frame.c",
+        "src/lz4frame.h",
+        "src/lz4frame_static.h",
+        "src/lz4hc.c",
+        "src/lz4hc.h",
+        "src/lz4opt.h",
+        "src/queue.h",
+        "src/rd.h",
+        "src/rdaddr.c",
+        "src/rdaddr.h",
+        "src/rdatomic.h",
+        "src/rdavg.h",
+        "src/rdavl.c",
+        "src/rdavl.h",
+        "src/rdbuf.c",
+        "src/rdbuf.h",
+        "src/rdcrc32.h",
+        "src/rddl.h",
+        "src/rdendian.h",
+        "src/rdgz.c",
+        "src/rdgz.h",
+        "src/rdinterval.h",
+        "src/rdkafka.c",
+        "src/rdkafka.h",
+        "src/rdkafka_assignor.c",
+        "src/rdkafka_assignor.h",
+        "src/rdkafka_broker.c",
+        "src/rdkafka_broker.h",
+        "src/rdkafka_buf.c",
+        "src/rdkafka_buf.h",
+        "src/rdkafka_cgrp.c",
+        "src/rdkafka_cgrp.h",
+        "src/rdkafka_conf.c",
+        "src/rdkafka_conf.h",
+        "src/rdkafka_event.h",
+        "src/rdkafka_feature.c",
+        "src/rdkafka_feature.h",
+        "src/rdkafka_int.h",
+        "src/rdkafka_interceptor.c",
+        "src/rdkafka_interceptor.h",
+        "src/rdkafka_lz4.c",
+        "src/rdkafka_lz4.h",
+        "src/rdkafka_metadata.c",
+        "src/rdkafka_metadata.h",
+        "src/rdkafka_metadata_cache.c",
+        "src/rdkafka_msg.c",
+        "src/rdkafka_msg.h",
+        "src/rdkafka_msgset.h",
+        "src/rdkafka_msgset_reader.c",
+        "src/rdkafka_msgset_writer.c",
+        "src/rdkafka_offset.c",
+        "src/rdkafka_offset.h",
+        "src/rdkafka_op.c",
+        "src/rdkafka_op.h",
+        "src/rdkafka_partition.c",
+        "src/rdkafka_partition.h",
+        "src/rdkafka_pattern.c",
+        "src/rdkafka_pattern.h",
+        "src/rdkafka_proto.h",
+        "src/rdkafka_queue.c",
+        "src/rdkafka_queue.h",
+        "src/rdkafka_range_assignor.c",
+        "src/rdkafka_request.c",
+        "src/rdkafka_request.h",
+        "src/rdkafka_roundrobin_assignor.c",
+        "src/rdkafka_sasl.c",
+        "src/rdkafka_sasl.h",
+        "src/rdkafka_sasl_int.h",
+        "src/rdkafka_sasl_plain.c",
+        "src/rdkafka_subscription.c",
+        "src/rdkafka_subscription.h",
+        "src/rdkafka_timer.c",
+        "src/rdkafka_timer.h",
+        "src/rdkafka_topic.c",
+        "src/rdkafka_topic.h",
+        "src/rdkafka_transport.c",
+        "src/rdkafka_transport.h",
+        "src/rdkafka_transport_int.h",
+        "src/rdlist.c",
+        "src/rdlist.h",
+        "src/rdlog.c",
+        "src/rdlog.h",
+        "src/rdports.c",
+        "src/rdports.h",
+        "src/rdposix.h",
+        "src/rdrand.c",
+        "src/rdrand.h",
+        "src/rdregex.c",
+        "src/rdregex.h",
+        "src/rdstring.c",
+        "src/rdstring.h",
+        "src/rdsysqueue.h",
+        "src/rdtime.h",
+        "src/rdtypes.h",
+        "src/rdunittest.c",
+        "src/rdunittest.h",
+        "src/rdvarint.c",
+        "src/rdvarint.h",
+        "src/snappy.c",
+        "src/snappy.h",
+        "src/tinycthread.c",
+        "src/tinycthread.h",
+        "src/xxhash.c",
+        "src/xxhash.h",
+    ],
+    hdrs = [
+        "config.h",
+    ],
+    defines = [
+    ],
+    includes = [
+        "src",
+        "src-cpp",
+    ],
+    linkopts = [
+        "-lpthread",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "@boringssl//:ssl",
+    ],
+)
diff --git a/third_party/kafka/config.patch b/third_party/kafka/config.patch
new file mode 100644
index 0000000000000000000000000000000000000000..fa5c2d35b408d4ef70835a41ade3ae6ec4823c0e
--- /dev/null
+++ b/third_party/kafka/config.patch
@@ -0,0 +1,44 @@
+diff -Naur a/config.h b/config.h
+--- a/config.h	1970-01-01 00:00:00.000000000 +0000
++++ b/config.h	2017-10-28 00:57:03.316957390 +0000
+@@ -0,0 +1,40 @@
++#pragma once
++#define WITHOUT_OPTIMIZATION 0
++#define ENABLE_DEVEL 0
++#define ENABLE_REFCNT_DEBUG 0
++#define ENABLE_SHAREDPTR_DEBUG 0
++
++#define HAVE_ATOMICS_32 1
++#define HAVE_ATOMICS_32_SYNC 1
++
++#if (HAVE_ATOMICS_32)
++# if (HAVE_ATOMICS_32_SYNC)
++#  define ATOMIC_OP32(OP1,OP2,PTR,VAL) __sync_ ## OP1 ## _and_ ## OP2(PTR, VAL)
++# else
++#  define ATOMIC_OP32(OP1,OP2,PTR,VAL) __atomic_ ## OP1 ## _ ## OP2(PTR, VAL, __ATOMIC_SEQ_CST)
++# endif
++#endif
++
++#define HAVE_ATOMICS_64 1
++#define HAVE_ATOMICS_64_SYNC 1
++
++#if (HAVE_ATOMICS_64)
++# if (HAVE_ATOMICS_64_SYNC)
++#  define ATOMIC_OP64(OP1,OP2,PTR,VAL) __sync_ ## OP1 ## _and_ ## OP2(PTR, VAL)
++# else
++#  define ATOMIC_OP64(OP1,OP2,PTR,VAL) __atomic_ ## OP1 ## _ ## OP2(PTR, VAL, __ATOMIC_SEQ_CST)
++# endif
++#endif
++
++
++#define WITH_ZLIB 1
++#define WITH_LIBDL 1
++#define WITH_PLUGINS 0
++#define WITH_SNAPPY 1
++#define WITH_SOCKEM 1
++#define WITH_SSL 1
++#define WITH_SASL 0
++#define WITH_SASL_SCRAM 0
++#define WITH_SASL_CYRUS 0
++#define HAVE_REGEX 1
++#define HAVE_STRNDUP 1
diff --git a/third_party/llvm/llvm.BUILD b/third_party/llvm/llvm.BUILD
index 5344525ba8b42e8a3dbcf42397458d190a77f9d3..28293a36593d8fa67a2d85631a0769e03d508354 100644
--- a/third_party/llvm/llvm.BUILD
+++ b/third_party/llvm/llvm.BUILD
@@ -670,6 +670,28 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "aggressive_inst_combine",
+    srcs = glob([
+        "lib/Transforms/AggressiveInstCombine/*.c",
+        "lib/Transforms/AggressiveInstCombine/*.cpp",
+        "lib/Transforms/AggressiveInstCombine/*.inc",
+        "lib/Transforms/AggressiveInstCombine/*.h",
+    ]),
+    hdrs = glob([
+        "include/llvm/Transforms/AggressiveInstCombine/*.h",
+        "include/llvm/Transforms/AggressiveInstCombine/*.def",
+        "include/llvm/Transforms/AggressiveInstCombine/*.inc",
+    ]),
+    deps = [
+        ":analysis",
+        ":config",
+        ":core",
+        ":support",
+        ":transform_utils",
+    ],
+)
+
 cc_library(
     name = "analysis",
     srcs = glob([
@@ -1002,6 +1024,7 @@ cc_library(
     deps = [
         ":arm_desc",
         ":arm_info",
+        ":arm_utils",
         ":config",
         ":mc_disassembler",
         ":support",
@@ -1405,6 +1428,7 @@ cc_library(
         "include/llvm/Transforms/IPO/*.inc",
     ]),
     deps = [
+        ":aggressive_inst_combine",
         ":analysis",
         ":bit_reader",
         ":bit_writer",
@@ -1931,6 +1955,7 @@ cc_library(
         "include/llvm/Transforms/IPO/SCCP.h",
     ]),
     deps = [
+        ":aggressive_inst_combine",
         ":analysis",
         ":config",
         ":core",
diff --git a/third_party/mkl/build_defs.bzl b/third_party/mkl/build_defs.bzl
index 6574f250927e9cc09412e1272886419002739e32..8b73ddabdd7ff5de7374ffbbb76e7bf954c27765 100644
--- a/third_party/mkl/build_defs.bzl
+++ b/third_party/mkl/build_defs.bzl
@@ -60,7 +60,6 @@ mkl_repository = repository_rule(
     ],
     attrs = {
         "build_file": attr.label(),
-        "repository": attr.string(),
         "urls": attr.string_list(default = []),
         "sha256": attr.string(default = ""),
         "strip_prefix": attr.string(default = ""),
diff --git a/third_party/nccl.BUILD b/third_party/nccl.BUILD
index 8c7b9bdbe97a84a62f3f9f963631dc9bfeb76770..b2b8e188248f90805bc2904dca9111550a7dfed8 100644
--- a/third_party/nccl.BUILD
+++ b/third_party/nccl.BUILD
@@ -44,17 +44,17 @@ cc_library(
         "-O3",
     ] + cuda_default_copts(),
     linkopts = select({
-        "@%ws%//tensorflow:android": [
+        "@org_tensorflow//tensorflow:android": [
             "-pie",
         ],
-        "@%ws%//tensorflow:darwin": [
+        "@org_tensorflow//tensorflow:darwin": [
             "-Wl,-framework",
             "-Wl,CoreFoundation",
             "-Wl,-framework",
             "-Wl,Security",
         ],
-        "@%ws%//tensorflow:ios": [],
-        "@%ws%//tensorflow:windows": [
+        "@org_tensorflow//tensorflow:ios": [],
+        "@org_tensorflow//tensorflow:windows": [
             "-DEFAULTLIB:ws2_32.lib",
         ],
         "//conditions:default": [
diff --git a/third_party/pcre.BUILD b/third_party/pcre.BUILD
index 68aadd1d408685291beaee3ebe0607f35e130ff1..3a8e7a10b43debb5eeca690a64d5795de998a3ac 100644
--- a/third_party/pcre.BUILD
+++ b/third_party/pcre.BUILD
@@ -1,6 +1,6 @@
 licenses(["notice"])  # BSD
 
-exports_files(["COPYING"])
+exports_files(["LICENCE"])
 
 cc_library(
     name = "pcre",
@@ -50,12 +50,12 @@ cc_library(
         "-DNEWLINE=10",
         "-DNO_RECURSE",
         "-DPARENS_NEST_LIMIT=50",
-        "-DPCRE_STATIC=1",
         "-DPOSIX_MALLOC_THRESHOLD=10",
         "-DSTDC_HEADERS=1",
         "-DSUPPORT_UCP",
         "-DSUPPORT_UTF",
     ],
+    defines = ["PCRE_STATIC=1"],
     includes = ["."],
     visibility = ["@swig//:__pkg__"],  # Please use RE2
     alwayslink = 1,
diff --git a/third_party/protobuf/add_noinlines.patch b/third_party/protobuf/add_noinlines.patch
deleted file mode 100644
index af74798f0678d84d26681e947cef416a79090aa0..0000000000000000000000000000000000000000
--- a/third_party/protobuf/add_noinlines.patch
+++ /dev/null
@@ -1,30 +0,0 @@
-diff -u -r a/src/google/protobuf/compiler/cpp/cpp_file.cc b/src/google/protobuf/compiler/cpp/cpp_file.cc
---- a/src/google/protobuf/compiler/cpp/cpp_file.cc	2017-02-10 23:55:34.000000000 +0100
-+++ b/src/google/protobuf/compiler/cpp/cpp_file.cc	2017-03-21 13:41:46.931979154 +0100
-@@ -557,7 +557,7 @@
-         "      $metadata$, $enum_descriptors$, $service_descriptors$);\n"
-         "}\n"
-         "\n"
--        "void protobuf_AssignDescriptorsOnce() {\n"
-+        "GOOGLE_ATTRIBUTE_NOINLINE void protobuf_AssignDescriptorsOnce() {\n"
-         "  static GOOGLE_PROTOBUF_DECLARE_ONCE(once);\n"
-         "  ::google::protobuf::GoogleOnceInit(&once, &protobuf_AssignDescriptors);\n"
-         "}\n"
-@@ -656,7 +656,7 @@
-   printer->Print(
-       "}\n"
-       "\n"
--      "void InitDefaults() {\n"
-+      "GOOGLE_ATTRIBUTE_NOINLINE void InitDefaults() {\n"
-       "  static GOOGLE_PROTOBUF_DECLARE_ONCE(once);\n"
-       "  ::google::protobuf::GoogleOnceInit(&once, &TableStruct::InitDefaultsImpl);\n"
-       "}\n");
-@@ -737,7 +737,7 @@
-   printer->Print(
-       "}\n"
-       "\n"
--      "void AddDescriptors() {\n"
-+      "GOOGLE_ATTRIBUTE_NOINLINE void AddDescriptors() {\n"
-       "  static GOOGLE_PROTOBUF_DECLARE_ONCE(once);\n"
-       "  ::google::protobuf::GoogleOnceInit(&once, &AddDescriptorsImpl);\n"
-       "}\n");
diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl
index c16eb3a12a86f3c2eb3813f5c8c7631fec8e97c6..954f21f5f8fe8029c869f8870464a750cfc8a3db 100644
--- a/third_party/py/python_configure.bzl
+++ b/third_party/py/python_configure.bzl
@@ -118,7 +118,7 @@ def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
   if src_dir != None:
     src_dir = _norm_path(src_dir)
     dest_dir = _norm_path(dest_dir)
-    files = _read_dir(repository_ctx, src_dir)
+    files = '\n'.join(sorted(_read_dir(repository_ctx, src_dir).splitlines()))
     # Create a list with the src_dir stripped to use for outputs.
     dest_files = files.replace(src_dir, '').splitlines()
     src_files = files.splitlines()
diff --git a/third_party/repo.bzl b/third_party/repo.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..aa178fa8cab92d9d299e5ed09927d8572816a0af
--- /dev/null
+++ b/third_party/repo.bzl
@@ -0,0 +1,110 @@
+# Copyright 2017 The TensorFlow Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utilities for defining TensorFlow Bazel dependencies."""
+
+_SINGLE_URL_WHITELIST = depset([
+    "arm_compiler",
+    "ortools_archive",
+])
+
+def _is_windows(ctx):
+  return ctx.os.name.lower().find("windows") != -1
+
+def _wrap_bash_cmd(ctx, cmd):
+  if _is_windows(ctx):
+    bazel_sh = _get_env_var(ctx, "BAZEL_SH")
+    if not bazel_sh:
+      fail("BAZEL_SH environment variable is not set")
+    cmd = [bazel_sh, "-l", "-c", " ".join(cmd)]
+  return cmd
+
+def _get_env_var(ctx, name):
+  if name in ctx.os.environ:
+    return ctx.os.environ[name]
+  else:
+    return None
+
+# Executes specified command with arguments and calls 'fail' if it exited with
+# non-zero code
+def _execute_and_check_ret_code(repo_ctx, cmd_and_args):
+  result = repo_ctx.execute(cmd_and_args, timeout=10)
+  if result.return_code != 0:
+    fail(("Non-zero return code({1}) when executing '{0}':\n" + "Stdout: {2}\n"
+          + "Stderr: {3}").format(" ".join(cmd_and_args), result.return_code,
+                                  result.stdout, result.stderr))
+
+def _repos_are_siblings():
+  return Label("@foo//bar").workspace_root.startswith("../")
+
+# Apply a patch_file to the repository root directory
+# Runs 'patch -p1'
+def _apply_patch(ctx, patch_file):
+  # Don't check patch on Windows, because patch is only available under bash.
+  if not _is_windows(ctx) and not ctx.which("patch"):
+    fail("patch command is not found, please install it")
+  cmd = _wrap_bash_cmd(
+    ctx, ["patch", "-p1", "-d", ctx.path("."), "-i", ctx.path(patch_file)])
+  _execute_and_check_ret_code(ctx, cmd)
+
+def _apply_delete(ctx, paths):
+  for path in paths:
+    if path.startswith("/"):
+      fail("refusing to rm -rf path starting with '/': " + path)
+    if ".." in path:
+      fail("refusing to rm -rf path containing '..': " + path)
+  cmd = _wrap_bash_cmd(ctx, ["rm", "-rf"] + [ctx.path(path) for path in paths])
+  _execute_and_check_ret_code(ctx, cmd)
+
+def _tf_http_archive(ctx):
+  if ("mirror.bazel.build" not in ctx.attr.urls[0] or
+      (len(ctx.attr.urls) < 2 and
+       ctx.attr.name not in _SINGLE_URL_WHITELIST)):
+    fail("tf_http_archive(urls) must have redundant URLs. The " +
+         "mirror.bazel.build URL must be present and it must come first. " +
+         "Even if you don't have permission to mirror the file, please " +
+         "put the correctly formatted mirror URL there anyway, because " +
+         "someone will come along shortly thereafter and mirror the file.")
+  ctx.download_and_extract(
+      ctx.attr.urls,
+      "",
+      ctx.attr.sha256,
+      ctx.attr.type,
+      ctx.attr.strip_prefix)
+  if ctx.attr.delete:
+    _apply_delete(ctx, ctx.attr.delete)
+  if ctx.attr.patch_file != None:
+    _apply_patch(ctx, ctx.attr.patch_file)
+  if ctx.attr.build_file != None:
+    ctx.template("BUILD", ctx.attr.build_file, {
+        "%prefix%": ".." if _repos_are_siblings() else "external",
+    }, False)
+
+tf_http_archive = repository_rule(
+    implementation=_tf_http_archive,
+    attrs={
+        "sha256": attr.string(mandatory=True),
+        "urls": attr.string_list(mandatory=True, allow_empty=False),
+        "strip_prefix": attr.string(),
+        "type": attr.string(),
+        "delete": attr.string_list(),
+        "patch_file": attr.label(),
+        "build_file": attr.label(),
+    })
+"""Downloads and creates Bazel repos for dependencies.
+
+This is a swappable replacement for both http_archive() and
+new_http_archive() that offers some additional features. It also helps
+ensure best practices are followed.
+"""
diff --git a/third_party/snappy.BUILD b/third_party/snappy.BUILD
index 9c00b7068a802a361effab207409138c79addde7..fd48ed8941e159a8d6176ef3f4e1982d6600e1c2 100644
--- a/third_party/snappy.BUILD
+++ b/third_party/snappy.BUILD
@@ -50,8 +50,8 @@ genrule(
            "-e 's/@ac_cv_have_stddef_h@/1/g' " +
            "-e 's/@ac_cv_have_stdint_h@/1/g' " +
            select({
-               "@%ws%//tensorflow:windows": "-e 's/@ac_cv_have_sys_uio_h@/0/g' ",
-               "@%ws%//tensorflow:windows_msvc": "-e 's/@ac_cv_have_sys_uio_h@/0/g' ",
+               "@org_tensorflow//tensorflow:windows": "-e 's/@ac_cv_have_sys_uio_h@/0/g' ",
+               "@org_tensorflow//tensorflow:windows_msvc": "-e 's/@ac_cv_have_sys_uio_h@/0/g' ",
                "//conditions:default": "-e 's/@ac_cv_have_sys_uio_h@/1/g' ",
            }) +
            "-e 's/@SNAPPY_MAJOR@/1/g' " +
diff --git a/third_party/sqlite.BUILD b/third_party/sqlite.BUILD
index 9840d7b15147b8d830c40b3156245613dc3ddc12..6da795358927f5cb8db7cb0d7ea653b80f8b5226 100644
--- a/third_party/sqlite.BUILD
+++ b/third_party/sqlite.BUILD
@@ -1,16 +1,64 @@
 # Description:
-#   Sqlite3 library. Provides utilities for interacting
-#   with sqlite3 databases.
+#   sqlite3 is a serverless SQL RDBMS.
 
 licenses(["unencumbered"])  # Public Domain
 
-# exports_files(["LICENSE"])
+SQLITE_COPTS = [
+    "-Os",
+    "-DHAVE_DECL_STRERROR_R=1",
+    "-DHAVE_STDINT_H=1",
+    "-DHAVE_INTTYPES_H=1",
+    "-D_FILE_OFFSET_BITS=64",
+    "-D_REENTRANT=1",
+] + select({
+    "@org_tensorflow//tensorflow:windows": [
+        "-DSQLITE_MAX_TRIGGER_DEPTH=100",
+    ],
+    "@org_tensorflow//tensorflow:windows_msvc": [
+        "-DSQLITE_MAX_TRIGGER_DEPTH=100",
+    ],
+    "@org_tensorflow//tensorflow:darwin": [
+        "-DHAVE_GMTIME_R=1",
+        "-DHAVE_LOCALTIME_R=1",
+        "-DHAVE_USLEEP=1",
+    ],
+    "//conditions:default": [
+        "-DHAVE_FDATASYNC=1",
+        "-DHAVE_GMTIME_R=1",
+        "-DHAVE_LOCALTIME_R=1",
+        "-DHAVE_POSIX_FALLOCATE=1",
+        "-DHAVE_USLEEP=1",
+    ],
+})
 
+# Production build of SQLite library that's baked into TensorFlow.
 cc_library(
-    name = "sqlite",
+    name = "org_sqlite",
     srcs = ["sqlite3.c"],
-    hdrs = ["sqlite3.h"],
-    includes = ["."],
-    linkopts = ["-lm"],
+    hdrs = [
+        "sqlite3.h",
+        "sqlite3ext.h",
+    ],
+    copts = SQLITE_COPTS,
+    defines = [
+        # This gets rid of the bloat of deprecated functionality. It
+        # needs to be listed here instead of copts because it's actually
+        # referenced in the sqlite3.h file.
+        "SQLITE_OMIT_DEPRECATED",
+    ],
+    linkopts = select({
+        "@org_tensorflow//tensorflow:windows_msvc": [],
+        "//conditions:default": [
+            "-ldl",
+            "-lpthread",
+        ],
+    }),
+    visibility = ["//visibility:public"],
+)
+
+# This is a Copybara sync helper for Google.
+py_library(
+    name = "python",
+    srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
 )
diff --git a/third_party/swig.BUILD b/third_party/swig.BUILD
index d698fa934ba8ba124db79cd9f2a64262e8f2f600..f2f647401b3bda397e5bd74ff942810a4e80517f 100644
--- a/third_party/swig.BUILD
+++ b/third_party/swig.BUILD
@@ -89,7 +89,7 @@ cc_binary(
     ],
     output_licenses = ["unencumbered"],
     visibility = ["//visibility:public"],
-    deps = ["@pcre//:pcre"],
+    deps = ["@pcre"],
 )
 
 filegroup(
diff --git a/third_party/sycl/sycl/BUILD.tpl b/third_party/sycl/sycl/BUILD.tpl
index b6ceaadda7c4d71aa2b8cb60a8ab65d05156765b..21b1a2bbf7d320327d8f6e35124e6ef47019130b 100755
--- a/third_party/sycl/sycl/BUILD.tpl
+++ b/third_party/sycl/sycl/BUILD.tpl
@@ -1,9 +1,9 @@
 licenses(["notice"])  # Apache 2.0
 
 load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
-load("platform", "sycl_library_path")
+load(":platform.bzl", "sycl_library_path")
 
-load("platform", "readlink_command")
+load(":platform.bzl", "readlink_command")
 
 package(default_visibility = ["//visibility:public"])
 
diff --git a/third_party/tensorrt/BUILD b/third_party/tensorrt/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/third_party/tensorrt/BUILD.tpl b/third_party/tensorrt/BUILD.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..57682e8735013544d76b14fc2c41dfff3d50f691
--- /dev/null
+++ b/third_party/tensorrt/BUILD.tpl
@@ -0,0 +1,39 @@
+# NVIDIA TensorRT
+# A high-performance deep learning inference optimizer and runtime.
+
+licenses(["notice"])
+
+exports_files(["LICENSE"])
+
+load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
+
+package(default_visibility = ["//visibility:public"])
+
+cc_library(
+    name = "tensorrt_headers",
+    hdrs = [%{tensorrt_headers}],
+    includes = [
+        "include",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "nv_infer",
+    srcs = [%{nv_infer}],
+    data = [%{nv_infer}],
+    includes = [
+        "include",
+    ],
+    copts= cuda_default_copts(),
+    deps = [
+        "@local_config_cuda//cuda:cuda",
+        ":tensorrt_headers",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+
+%{tensorrt_genrules}
+
diff --git a/third_party/tensorrt/LICENSE b/third_party/tensorrt/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..146d9b765c5db44c2f5bea8fa5010eef5ec0c68f
--- /dev/null
+++ b/third_party/tensorrt/LICENSE
@@ -0,0 +1,203 @@
+Copyright 2018 The TensorFlow Authors.  All rights reserved.
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2018, The TensorFlow Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/third_party/tensorrt/build_defs.bzl.tpl b/third_party/tensorrt/build_defs.bzl.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..0dc3a7ba2d239cbeca5181ba20d0c98edb26bb94
--- /dev/null
+++ b/third_party/tensorrt/build_defs.bzl.tpl
@@ -0,0 +1,7 @@
+# Build configurations for TensorRT.
+
+def if_tensorrt(if_true, if_false=[]):
+  """Tests whether TensorRT was enabled during the configure process."""
+  if %{tensorrt_is_configured}:
+    return if_true
+  return if_false
diff --git a/third_party/tensorrt/tensorrt_configure.bzl b/third_party/tensorrt/tensorrt_configure.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..8e76e5d02aeddab66dacaa495a6c493f18a95a69
--- /dev/null
+++ b/third_party/tensorrt/tensorrt_configure.bzl
@@ -0,0 +1,221 @@
+# -*- Python -*-
+"""Repository rule for TensorRT configuration.
+
+`tensorrt_configure` depends on the following environment variables:
+
+  * `TF_TENSORRT_VERSION`: The TensorRT libnvinfer version.
+  * `TENSORRT_INSTALL_PATH`: The installation path of the TensorRT library.
+"""
+
+load(
+    "//third_party/gpus:cuda_configure.bzl",
+    "auto_configure_fail",
+    "get_cpu_value",
+    "find_cuda_define",
+    "matches_version",
+    "symlink_genrule_for_dir",
+)
+
+_TENSORRT_INSTALL_PATH = "TENSORRT_INSTALL_PATH"
+_TF_TENSORRT_VERSION = "TF_TENSORRT_VERSION"
+
+_TF_TENSORRT_LIBS = ["nvinfer"]
+_TF_TENSORRT_HEADERS = ["NvInfer.h", "NvUtils.h"]
+
+_DEFINE_TENSORRT_SONAME_MAJOR = "#define NV_TENSORRT_SONAME_MAJOR"
+_DEFINE_TENSORRT_SONAME_MINOR = "#define NV_TENSORRT_SONAME_MINOR"
+_DEFINE_TENSORRT_SONAME_PATCH = "#define NV_TENSORRT_SONAME_PATCH"
+
+
+def _headers_exist(repository_ctx, path):
+  """Returns whether all TensorRT header files could be found in 'path'.
+
+  Args:
+    repository_ctx: The repository context.
+    path: The TensorRT include path to check.
+
+  Returns:
+    True if all TensorRT header files can be found in the path.
+  """
+  for h in _TF_TENSORRT_HEADERS:
+    if not repository_ctx.path("%s/%s" % (path, h)).exists:
+      return False
+  return True
+
+
+def _find_trt_header_dir(repository_ctx, trt_install_path):
+  """Returns the path to the directory containing headers of TensorRT.
+
+  Args:
+    repository_ctx: The repository context.
+    trt_install_path: The TensorRT library install directory.
+
+  Returns:
+    The path of the directory containing the TensorRT header.
+  """
+  if trt_install_path == "/usr/lib/x86_64-linux-gnu":
+    path = "/usr/include/x86_64-linux-gnu"
+    if _headers_exist(repository_ctx, path):
+      return path
+  path = str(repository_ctx.path("%s/../include" % trt_install_path).realpath)
+  if _headers_exist(repository_ctx, path):
+    return path
+  auto_configure_fail(
+      "Cannot find NvInfer.h with TensorRT install path %s" % trt_install_path)
+
+
+def _trt_lib_version(repository_ctx, trt_install_path):
+  """Detects the library (e.g. libnvinfer) version of TensorRT.
+
+  Args:
+    repository_ctx: The repository context.
+    trt_install_path: The TensorRT library install directory.
+
+  Returns:
+    A string containing the library version of TensorRT.
+  """
+  trt_header_dir = _find_trt_header_dir(repository_ctx, trt_install_path)
+  major_version = find_cuda_define(repository_ctx, trt_header_dir, "NvInfer.h",
+                                   _DEFINE_TENSORRT_SONAME_MAJOR)
+  minor_version = find_cuda_define(repository_ctx, trt_header_dir, "NvInfer.h",
+                                   _DEFINE_TENSORRT_SONAME_MINOR)
+  patch_version = find_cuda_define(repository_ctx, trt_header_dir, "NvInfer.h",
+                                   _DEFINE_TENSORRT_SONAME_PATCH)
+  full_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
+  environ_version = repository_ctx.os.environ[_TF_TENSORRT_VERSION].strip()
+  if not matches_version(environ_version, full_version):
+    auto_configure_fail(
+        ("TensorRT library version detected from %s/%s (%s) does not match " +
+         "TF_TENSORRT_VERSION (%s). To fix this rerun configure again.") %
+        (trt_header_dir, "NvInfer.h", full_version, environ_version))
+  return environ_version
+
+
+def _find_trt_libs(repository_ctx, trt_install_path, trt_lib_version):
+  """Finds the given TensorRT library on the system.
+
+  Adapted from code contributed by Sami Kama (https://github.com/samikama).
+
+  Args:
+    repository_ctx: The repository context.
+    trt_install_path: The TensorRT library installation directory.
+    trt_lib_version: The version of TensorRT library files as returned
+      by _trt_lib_version.
+
+  Returns:
+    Map of library names to structs with the following fields:
+      src_file_path: The full path to the library found on the system.
+      dst_file_name: The basename of the target library.
+  """
+  objdump = repository_ctx.which("objdump")
+  result = {}
+  for lib in _TF_TENSORRT_LIBS:
+    dst_file_name = "lib%s.so.%s" % (lib, trt_lib_version)
+    src_file_path = repository_ctx.path("%s/%s" % (trt_install_path,
+                                                   dst_file_name))
+    if not src_file_path.exists:
+      auto_configure_fail(
+          "Cannot find TensorRT library %s" % str(src_file_path))
+    if objdump != None:
+      objdump_out = repository_ctx.execute([objdump, "-p", str(src_file_path)])
+      for line in objdump_out.stdout.splitlines():
+        if "SONAME" in line:
+          dst_file_name = line.strip().split(" ")[-1]
+    result.update({
+        lib:
+            struct(
+                dst_file_name=dst_file_name,
+                src_file_path=str(src_file_path.realpath))
+    })
+  return result
+
+
+def _tpl(repository_ctx, tpl, substitutions):
+  repository_ctx.template(tpl, Label("//third_party/tensorrt:%s.tpl" % tpl),
+                          substitutions)
+
+
+def _create_dummy_repository(repository_ctx):
+  """Create a dummy TensorRT repository."""
+  _tpl(repository_ctx, "build_defs.bzl", {"%{tensorrt_is_configured}": "False"})
+  substitutions = {
+      "%{tensorrt_genrules}": "",
+      "%{tensorrt_headers}": "",
+  }
+  for lib in _TF_TENSORRT_LIBS:
+    k = "%%{%s}" % lib.replace("nv", "nv_")
+    substitutions.update({k: ""})
+  _tpl(repository_ctx, "BUILD", substitutions)
+
+
+def _tensorrt_configure_impl(repository_ctx):
+  """Implementation of the tensorrt_configure repository rule."""
+  if _TENSORRT_INSTALL_PATH not in repository_ctx.os.environ:
+    _create_dummy_repository(repository_ctx)
+    return
+
+  if (get_cpu_value(repository_ctx) != "Linux"):
+    auto_configure_fail("TensorRT is supported only on Linux.")
+  if _TF_TENSORRT_VERSION not in repository_ctx.os.environ:
+    auto_configure_fail("TensorRT library (libnvinfer) version is not set.")
+  trt_install_path = repository_ctx.os.environ[_TENSORRT_INSTALL_PATH].strip()
+  if not repository_ctx.path(trt_install_path).exists:
+    auto_configure_fail(
+        "Cannot find TensorRT install path %s." % trt_install_path)
+
+  # Set up the symbolic links for the library files.
+  trt_lib_version = _trt_lib_version(repository_ctx, trt_install_path)
+  trt_libs = _find_trt_libs(repository_ctx, trt_install_path, trt_lib_version)
+  trt_lib_src = []
+  trt_lib_dest = []
+  for lib in trt_libs.values():
+    trt_lib_src.append(lib.src_file_path)
+    trt_lib_dest.append(lib.dst_file_name)
+  genrules = [
+      symlink_genrule_for_dir(repository_ctx, None, "tensorrt/lib/",
+                              "tensorrt_lib", trt_lib_src, trt_lib_dest)
+  ]
+
+  # Set up the symbolic links for the header files.
+  trt_header_dir = _find_trt_header_dir(repository_ctx, trt_install_path)
+  src_files = [
+      "%s/%s" % (trt_header_dir, header) for header in _TF_TENSORRT_HEADERS
+  ]
+  dest_files = _TF_TENSORRT_HEADERS
+  genrules.append(
+      symlink_genrule_for_dir(repository_ctx, None, "tensorrt/include/",
+                              "tensorrt_include", src_files, dest_files))
+
+  # Set up config file.
+  _tpl(repository_ctx, "build_defs.bzl", {"%{tensorrt_is_configured}": "True"})
+
+  # Set up BUILD file.
+  substitutions = {
+      "%{tensorrt_genrules}": "\n".join(genrules),
+      "%{tensorrt_headers}": '":tensorrt_include"',
+  }
+  for lib in _TF_TENSORRT_LIBS:
+    k = "%%{%s}" % lib.replace("nv", "nv_")
+    v = '"tensorrt/lib/%s"' % trt_libs[lib].dst_file_name
+    substitutions.update({k: v})
+  _tpl(repository_ctx, "BUILD", substitutions)
+
+
+tensorrt_configure = repository_rule(
+    implementation=_tensorrt_configure_impl,
+    environ=[
+        _TENSORRT_INSTALL_PATH,
+        _TF_TENSORRT_VERSION,
+    ],
+)
+"""Detects and configures the local CUDA toolchain.
+
+Add the following to your WORKSPACE FILE:
+
+```python
+tensorrt_configure(name = "local_config_tensorrt")
+```
+
+Args:
+  name: A unique name for this workspace rule.
+"""
diff --git a/third_party/termcolor.BUILD b/third_party/termcolor.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..655d7cb85e584027d12014c53718a15e2522b4ae
--- /dev/null
+++ b/third_party/termcolor.BUILD
@@ -0,0 +1,15 @@
+# Description:
+#   This is a library for outputing color to the terminal.
+
+licenses(["notice"])  # MIT
+
+exports_files(["COPYING.txt"])
+
+py_library(
+    name = "termcolor",
+    srcs = [
+        "termcolor.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/tflite_smartreply.BUILD b/third_party/tflite_smartreply.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..75663eff48595b3a9aaa6c336d564cc3796e29cd
--- /dev/null
+++ b/third_party/tflite_smartreply.BUILD
@@ -0,0 +1,13 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+filegroup(
+    name = "model_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "BUILD",
+        ],
+    ),
+)
diff --git a/third_party/toolchains/clang6/BUILD b/third_party/toolchains/clang6/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..ffd0fb0cdc5bc3ecb62d3cbb5b502b353073615f
--- /dev/null
+++ b/third_party/toolchains/clang6/BUILD
@@ -0,0 +1 @@
+package(default_visibility = ["//visibility:public"])
diff --git a/third_party/toolchains/clang6/CROSSTOOL.tpl b/third_party/toolchains/clang6/CROSSTOOL.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..6b7e5a88086f8e5e67fa86a0e9377c3c2afd535d
--- /dev/null
+++ b/third_party/toolchains/clang6/CROSSTOOL.tpl
@@ -0,0 +1,587 @@
+major_version: "v1"
+minor_version: "llvm:6.0.0"
+default_target_cpu: "k8"
+
+default_toolchain {
+  cpu: "k8"
+  toolchain_identifier: "k8-clang-6.0-cxx-4.8-linux-gnu"
+}
+
+toolchain {
+  compiler: "clang6"         # bazel build --compiler=clang6
+  target_cpu: "k8"           # bazel build --cpu=k8
+  target_libc: "GLIBC_2.19"  # bazel build --glibc=GLIBC_2.19
+
+  abi_libc_version: "2.19"
+  abi_version: "gcc-4.8-cxx11"
+  builtin_sysroot: ""
+  cc_target_os: "linux-gnu"
+  default_python_version: "python2.7"
+  dynamic_runtimes_filegroup: "dynamic-runtime-libs-k8"
+  host_system_name: "x86_64-unknown-linux-gnu"
+  needsPic: true
+  static_runtimes_filegroup: "static-runtime-libs-k8"
+  supports_embedded_runtimes: true
+  supports_fission: true
+  supports_gold_linker: true
+  supports_incremental_linker: true
+  supports_interface_shared_objects: true
+  supports_normalizing_ar: true
+  supports_start_end_lib: true
+  supports_thin_archives: true
+  target_system_name: "x86_64-unknown-linux-gnu"
+  toolchain_identifier: "k8-clang-6.0-cxx-4.8-linux-gnu"
+
+  tool_path { name: "ar" path: "%package(@local_config_clang6//clang6)%/llvm/bin/llvm-ar" }
+  tool_path { name: "as" path: "%package(@local_config_clang6//clang6)%/llvm/bin/llvm-as" }
+  tool_path { name: "compat-ld" path: "%package(@local_config_clang6//clang6)%/llvm/bin/ld.lld" }
+  tool_path { name: "cpp" path: "%package(@local_config_clang6//clang6)%/llvm/bin/llvm-cpp" }
+  tool_path { name: "dwp" path: "%package(@local_config_clang6//clang6)%/llvm/bin/llvm-dwp" }
+  tool_path { name: "gcc" path: "%package(@local_config_clang6//clang6)%/llvm/bin/clang" }
+  tool_path { name: "gcov" path: "%package(@local_config_clang6//clang6)%/llvm/bin/llvm-cov" }
+  tool_path { name: "ld" path: "%package(@local_config_clang6//clang6)%/llvm/bin/ld.lld" }
+  tool_path { name: "llvm-profdata" path: "%package(@local_config_clang6//clang6)%/llvm/bin/llvm-profdata" }
+  tool_path { name: "nm" path: "%package(@local_config_clang6//clang6)%/llvm/bin/llvm-nm" }
+  tool_path { name: "objcopy" path: "%package(@local_config_clang6//clang6)%/llvm/bin/llvm-objcopy" }
+  tool_path { name: "objdump" path: "%package(@local_config_clang6//clang6)%/sbin/objdump" }
+  tool_path { name: "strip" path: "%package(@local_config_clang6//clang6)%/sbin/strip" }
+
+  unfiltered_cxx_flag: "-no-canonical-prefixes"
+
+  # Make C++ compilation deterministic. Use linkstamping instead of these
+  # compiler symbols.
+  unfiltered_cxx_flag: "-Wno-builtin-macro-redefined"
+  unfiltered_cxx_flag: "-D__DATE__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIME__=\"redacted\""
+
+  objcopy_embed_flag: "-I"
+  objcopy_embed_flag: "binary"
+
+  # This action_config makes features flags propagate
+  # to CC_FLAGS for genrules, and eventually skylark.
+  action_config {
+    action_name: "cc-flags-make-variable"
+    config_name: "cc-flags-make-variable"
+  }
+
+  # Security hardening on by default.
+  # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+  # We need to undef it before redefining it as some distributions now have
+  # it enabled by default.
+  compiler_flag: "-U_FORTIFY_SOURCE"
+  compiler_flag: "-D_FORTIFY_SOURCE=1"
+  compiler_flag: "-fstack-protector"
+  linker_flag: "-Wl,-z,relro,-z,now"
+
+  # This adds a little bit more durability to our Clang build.
+  #
+  # At the moment, this only only be needed for:
+  # - add_boringssl_s390x.patch: --Wa,--noexecstack
+  #
+  # Folks who do maintenance work on TF Bazel Clang should consider
+  # commenting out these lines, while doing that work, to gain a better
+  # understanding of what the intersection of support looks like between GCC
+  # and Clang. Please note that,  Bazel does not support
+  # -Xclang-only / -Xgcc-only.
+  compiler_flag: "-Wno-unknown-warning-option"
+  compiler_flag: "-Wno-unused-command-line-argument"
+  compiler_flag: "-Wno-ignored-optimization-argument"
+
+  #### Common compiler options. ####
+  compiler_flag: "-D_REENTRANT"
+  compiler_flag: "-D__STDC_FORMAT_MACROS"
+  compiler_flag: "-DSUPPRESS_USE_FILE_OFFSET64"
+  compiler_flag: "-Wall"
+  compiler_flag: "-Wformat-security"
+  compiler_flag: "-Wframe-larger-than=16384"
+  compiler_flag: "-Wno-char-subscripts"
+  compiler_flag: "-Wno-error=deprecated-declarations"
+  compiler_flag: "-Wno-uninitialized"
+  compiler_flag: "-Wno-sign-compare"
+  compiler_flag: "-Wno-strict-overflow"
+  compiler_flag: "-Wno-unused-function"
+  compiler_flag: "-fdiagnostics-show-option"
+  compiler_flag: "-fmessage-length=0"
+  compiler_flag: "-fno-exceptions"
+  compiler_flag: "-fno-omit-frame-pointer"
+  compiler_flag: "-fno-strict-aliasing"
+  compiler_flag: "-fno-use-init-array"
+  compiler_flag: "-funsigned-char"
+  compiler_flag: "-gmlt"
+  cxx_flag: "-Wno-deprecated"
+  cxx_flag: "-Wno-invalid-offsetof"  # Needed for protobuf code (2017-11-07)
+  cxx_flag: "-fshow-overloads=best"
+  compiler_flag: "-Wthread-safety-analysis"
+
+  # Python extensions unfortunately make this go wild.
+  compiler_flag: "-Wno-writable-strings"
+
+  # GCC's warning produces too many false positives:
+  cxx_flag: "-Woverloaded-virtual"
+  cxx_flag: "-Wnon-virtual-dtor"
+
+  # Enable coloring even if there's no attached terminal. Bazel removes the
+  # escape sequences if --nocolor is specified. This isn't supported by gcc
+  # on Ubuntu 14.04.
+  compiler_flag: "-fcolor-diagnostics"
+
+  # Disable some broken warnings from Clang.
+  compiler_flag: "-Wno-ambiguous-member-template"
+  compiler_flag: "-Wno-pointer-sign"
+
+  # These warnings have a low signal to noise ratio.
+  compiler_flag: "-Wno-reserved-user-defined-literal"
+  compiler_flag: "-Wno-return-type-c-linkage"
+  compiler_flag: "-Wno-invalid-source-encoding"
+
+  # Per default we switch off any layering related warnings.
+  compiler_flag: "-Wno-private-header"
+
+  # Clang-specific warnings that we explicitly enable for TensorFlow. Some of
+  # these aren't on by default, or under -Wall, or are subsets of warnings
+  # turned off above.
+  compiler_flag: "-Wfloat-overflow-conversion"
+  compiler_flag: "-Wfloat-zero-conversion"
+  compiler_flag: "-Wfor-loop-analysis"
+  compiler_flag: "-Wgnu-redeclared-enum"
+  compiler_flag: "-Winfinite-recursion"
+  compiler_flag: "-Wliteral-conversion"
+  compiler_flag: "-Wself-assign"
+  compiler_flag: "-Wstring-conversion"
+  compiler_flag: "-Wtautological-overlap-compare"
+  compiler_flag: "-Wunused-comparison"
+  compiler_flag: "-Wvla"
+  cxx_flag: "-Wdeprecated-increment-bool"
+
+  # Clang code-generation flags for performance optimization.
+  compiler_flag: "-faligned-allocation"
+  compiler_flag: "-fnew-alignment=8"
+
+  # Clang defaults to C99 while GCC defaults to C89. GCC plugins are written in
+  # C89 and don't have a BUILD rule we could add a copts flag to.
+  gcc_plugin_compiler_flag: "-std=gnu89"
+
+  compilation_mode_flags {
+    mode: FASTBUILD
+  }
+
+  compilation_mode_flags {
+    mode: DBG
+    compiler_flag: "-g"
+  }
+
+  compilation_mode_flags {
+    mode: OPT
+    compiler_flag: "-g0"
+    compiler_flag: "-fdebug-types-section"
+    compiler_flag: "-DNDEBUG"
+    compiler_flag: "-fno-split-dwarf-inlining"
+    compiler_flag: "-Os"
+    compiler_flag: "-fexperimental-new-pass-manager"
+    compiler_flag: "-fdebug-info-for-profiling"
+    compiler_flag: "-ffunction-sections"
+    compiler_flag: "-fdata-sections"
+    linker_flag: "-Wl,--gc-sections"
+    linker_flag: "-Wl,-z,relro,-z,now"
+  }
+
+  # Features indicating whether this is a host compile or not. Exactly one of
+  # these will be implicitly provided by bazel.
+  feature { name: "host" }
+  feature { name: "nonhost" }
+
+  # Features indicating which compiler will be used for code generation.
+  feature {
+    name: "llvm_codegen"
+    provides: "codegen"
+    enabled: true
+  }
+
+  # Features for compilation modes. Exactly one of these will be implicitly
+  # provided by bazel.
+  feature { name: "fastbuild" }
+  feature { name: "dbg" }
+  feature { name: "opt" }
+
+  # Features controlling the C++ language mode.
+  feature {
+    name: "c++11"
+    provides: "c++std"
+    flag_set {
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-header-preprocessing"
+      action: "c++-module-compile"
+      action: "linkstamp-compile"
+      flag_group {
+        flag: "-nostdinc++"
+        flag: "-std=c++11"
+        flag: "-Wc++14-extensions"
+        flag: "-Wc++2a-extensions"
+        flag: "-Wno-binary-literal"
+      }
+    }
+  }
+  feature {
+    name: "c++14"
+    provides: "c++std"
+    flag_set {
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-header-preprocessing"
+      action: "c++-module-compile"
+      action: "linkstamp-compile"
+      flag_group {
+        flag: "-nostdinc++"
+        flag: "-std=c++14"
+        flag: "-Wc++11-compat"
+        flag: "-Wno-c++11-compat-binary-literal"
+        flag: "-Wc++2a-extensions"
+      }
+    }
+  }
+  feature {
+    name: "c++17"
+    provides: "c++std"
+    flag_set {
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-header-preprocessing"
+      action: "c++-module-compile"
+      action: "linkstamp-compile"
+      flag_group {
+        flag: "-nostdinc++"
+        flag: "-std=c++17"
+        flag: "-Wc++11-compat"
+        flag: "-Wno-c++11-compat-binary-literal"
+        flag: "-Wc++2a-extensions"
+      }
+    }
+  }
+  feature {
+    name: "c++2a"
+    provides: "c++std"
+    flag_set {
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-header-preprocessing"
+      action: "c++-module-compile"
+      action: "linkstamp-compile"
+      flag_group {
+        flag: "-nostdinc++"
+        flag: "-std=c++2a"
+        flag: "-Wc++11-compat"
+        flag: "-Wno-c++11-compat-binary-literal"
+      }
+    }
+  }
+  feature {
+    name: "c++default"
+    enabled: true
+    flag_set {
+      # Provide the c++11 flags if no standard is selected
+      with_feature {
+        not_feature: "c++11"
+        not_feature: "c++14"
+        not_feature: "c++17"
+        not_feature: "c++2a"
+      }
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-header-preprocessing"
+      action: "c++-module-compile"
+      action: "linkstamp-compile"
+      flag_group {
+        flag: "-nostdinc++"
+        flag: "-std=c++11"
+        flag: "-Wc++14-extensions"
+        flag: "-Wc++2a-extensions"
+        flag: "-Wno-binary-literal"
+      }
+    }
+  }
+
+  feature {
+    name: "use_compiler_rt"
+    requires { feature: "llvm_codegen" }
+    # TODO(saugustine): At the moment, "use_compiler_rt" also
+    # requires "linking_mode_flags { mode: FULLY_STATIC" ... },
+    # but that isn't a feature. We should probably convert it.
+    flag_set {
+      action: "c++-link"
+      action: "c++-link-interface-dynamic-library"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-executable"
+      # "link" is a misnomer for these actions. They are really just
+      # invocations of ar.
+      #action: "c++-link-pic-static-library"
+      #action: "c++-link-static-library"
+      #action: "c++-link-alwayslink-static-library"
+      #action: "c++-link-pic-static-library"
+      #action: "c++-link-alwayslink-pic-static-library"
+      flag_group {
+        flag: "-rtlib=compiler-rt"
+        flag: "-lunwind"
+      }
+    }
+  }
+
+  feature {
+    name: "pie"
+    flag_set {
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-header-preprocessing"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "cc-flags-make-variable"
+      action: "lto-backend"
+      action: "linkstamp-compile"
+      flag_group {
+        flag: "-mpie-copy-relocations"
+        flag: "-fPIE"
+      }
+    }
+    flag_set {
+      action: "cc-flags-make-variable"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-pie"
+      }
+    }
+  }
+
+  # Pic must appear after pie, because pic may need to override pie, and bazel
+  # turns it on selectively. These don't interact with other options.
+  #
+  # TODO: In practice, normal vs pic vs pie is a ternary mode. We should
+  # implement it that way. This will require changes to bazel, which only
+  # calculates whether or not pic is needed, not pie.
+  #
+  # NOTE: Bazel might make this all a moot point.
+  feature {
+    name: "pic"
+    flag_set {
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-codegen"
+      action: "c++-module-compile"
+      action: "linkstamp-compile"
+      expand_if_all_available: "pic"
+      flag_group {
+        flag: "-fPIC"
+      }
+    }
+  }
+
+  feature {
+    name: "gold"
+    enabled: true
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-interface-dynamic-library"
+      flag_group {
+        expand_if_none_available: "lto"
+        flag: "-fuse-ld=gold"
+      }
+    }
+  }
+
+  # This is great if you want linking TensorFlow to take ten minutes.
+  feature {
+    name: "lto"
+    requires { feature: "nonhost" }
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-flto=thin"
+      }
+    }
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-interface-dynamic-library"
+      flag_group {
+        flag: "-flto=thin"
+      }
+    }
+  }
+
+  feature {
+    name: "parse_headers"
+    flag_set {
+      action: "c++-header-parsing"
+      flag_group {
+        flag: "-xc++-header"
+        flag: "-fsyntax-only"
+      }
+    }
+  }
+
+  feature {
+    name: "preprocess_headers"
+    flag_set {
+      action: "c++-header-preprocessing"
+      flag_group {
+        flag: "-xc++"
+        flag: "-E"
+      }
+    }
+  }
+
+  feature {
+    name: "per_object_debug_info"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-codegen"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "lto-backend"
+      flag_group {
+        flag: "-gsplit-dwarf"
+        flag: "-ggnu-pubnames"
+      }
+    }
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-interface-dynamic-library"
+      flag_group {
+        expand_if_all_available: "is_using_fission"
+        flag: "-Wl,--gdb-index"
+      }
+    }
+  }
+
+  feature {
+    name: "xray"
+    requires {
+      feature: "llvm_codegen"
+      feature: "nonhost"
+    }
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-header-preprocessing"
+      action: "c++-module-compile"
+      action: "c++-link-interface-dynamic-library"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-fxray-instrument"
+      }
+    }
+  }
+
+  feature {
+    name: "minimal_ubsan"
+    requires { feature: "llvm_codegen" }
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-header-preprocessing"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      flag_group {
+        flag: "-fsanitize=return,returns-nonnull-attribute,vla-bound,unreachable,float-cast-overflow"
+        flag: "-fsanitize-trap=all"
+        flag: "-DUNDEFINED_BEHAVIOR_SANITIZER"
+      }
+    }
+  }
+
+  feature {
+    name: "minimal_ubsan_enabled_by_default"
+    requires {
+      feature: "llvm_codegen"
+      feature: "fastbuild"
+    }
+    enabled: true
+    implies: "minimal_ubsan"
+  }
+
+  cxx_builtin_include_directory: "%package(@local_config_clang6//clang6)%/llvm/lib/clang/6.0.0/include"
+  cxx_builtin_include_directory: "/usr/include"
+
+  unfiltered_cxx_flag: "-cxx-isystem"
+  unfiltered_cxx_flag: "/usr/include/c++/4.8"
+  unfiltered_cxx_flag: "-cxx-isystem"
+  unfiltered_cxx_flag: "/usr/include/x86_64-linux-gnu/c++/4.8"
+  unfiltered_cxx_flag: "-isystem"
+  unfiltered_cxx_flag: "%package(@local_config_clang6//clang6)%/llvm/lib/clang/6.0.0/include"
+  unfiltered_cxx_flag: "-isystem"
+  unfiltered_cxx_flag: "/usr/include/x86_64-linux-gnu"
+  unfiltered_cxx_flag: "-isystem"
+  unfiltered_cxx_flag: "/usr/include"
+
+  linker_flag: "-Wl,--build-id=md5"
+  linker_flag: "-Wl,--fatal-warnings"
+  linker_flag: "-Wl,--hash-style=gnu"
+  linker_flag: "-no-canonical-prefixes"
+  linker_flag: "--target=x86_64-unknown-linux-gnu"
+
+  linker_flag: "-L/usr/lib/gcc/x86_64-linux-gnu/4.8"
+
+  # This is the minimum x86 architecture TensorFlow supports.
+  compiler_flag: "-DARCH_K8"
+  compiler_flag: "-m64"
+
+  # These are for Linux.
+  ld_embed_flag: "-melf_x86_64"
+  linker_flag: "-Wl,--eh-frame-hdr"
+  linker_flag: "-Wl,-z,max-page-size=0x1000"
+
+  # Google never uses the stack like a heap, e.g. alloca(), because tcmalloc
+  # and jemalloc are so fast. However copts=["$(STACK_FRAME_UNLIMITED)"] can be
+  # specified when that can't be the case.
+  make_variable {
+    name: "STACK_FRAME_UNLIMITED"
+    value: "-Wframe-larger-than=100000000 -Wno-vla"
+  }
+
+  # These flags are for folks who build C/C++ code inside genrules.
+  make_variable {
+    name: "CC_FLAGS"
+    value: "-no-canonical-prefixes --target=x86_64-unknown-linux-gnu -fno-omit-frame-pointer -fno-tree-vrp -msse3"
+  }
+
+  feature {
+    name: "copts"
+    flag_set {
+      expand_if_all_available: "copts"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-header-preprocessing"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "lto-backend"
+      flag_group {
+        iterate_over: "copts"
+        flag: "%{copts}"
+      }
+    }
+  }
+
+  # Please do not statically link libstdc++. This would probably lead to a lot
+  # of bloat since OpKernels need to use linkstatic=1 because  b/27630669 and
+  # it could cause memory leaks since Python uses dlopen() on our libraries:
+  # https://stackoverflow.com/a/35015415
+  linker_flag: "-lstdc++"
+  linker_flag: "-lm"
+  linker_flag: "-lpthread"
+  linker_flag: "-l:/lib/x86_64-linux-gnu/libc-2.19.so"
+}
diff --git a/third_party/toolchains/clang6/README.md b/third_party/toolchains/clang6/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0c6be25a0edbf9263061c9e66255d4c06bf9283a
--- /dev/null
+++ b/third_party/toolchains/clang6/README.md
@@ -0,0 +1,101 @@
+# TensorFlow Bazel Clang
+
+This is a specialized toolchain that uses an old Debian with a new Clang that
+can cross compile to any x86_64 microarchitecture. It's intended to build Linux
+binaries that only require the following ABIs:
+
+- GLIBC_2.18
+- CXXABI_1.3.7 (GCC 4.8.3)
+- GCC_4.2.0
+
+Which are available on at least the following Linux platforms:
+
+- Ubuntu 14+
+- CentOS 7+
+- Debian 8+
+- SuSE 13.2+
+- Mint 17.3+
+- Manjaro 0.8.11
+
+# System Install
+
+On Debian 8 (Jessie) Clang 6.0 can be installed as follows:
+
+```sh
+cat >>/etc/apt/sources.list <<'EOF'
+deb http://apt.llvm.org/jessie/ llvm-toolchain-jessie main
+deb-src http://apt.llvm.org/jessie/ llvm-toolchain-jessie main
+EOF
+wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add -
+apt-key fingerprint |& grep '6084 F3CF 814B 57C1 CF12  EFD5 15CF 4D18 AF4F 7421'
+apt-get update
+apt-get install clang lld
+```
+
+# Bazel Configuration
+
+This toolchain can compile TensorFlow in 2m30s on a 96-core Skylake GCE VM if
+the following `.bazelrc` settings are added:
+
+```
+startup --host_jvm_args=-Xmx30G
+startup --host_jvm_args=-Xms30G
+startup --host_jvm_args=-XX:MaxNewSize=3g
+startup --host_jvm_args=-XX:-UseAdaptiveSizePolicy
+startup --host_jvm_args=-XX:+UseConcMarkSweepGC
+startup --host_jvm_args=-XX:TargetSurvivorRatio=70
+startup --host_jvm_args=-XX:SurvivorRatio=6
+startup --host_jvm_args=-XX:+UseCMSInitiatingOccupancyOnly
+startup --host_jvm_args=-XX:CMSFullGCsBeforeCompaction=1
+startup --host_jvm_args=-XX:CMSInitiatingOccupancyFraction=75
+
+build --jobs=100
+build --local_resources=200000,100,100
+build --crosstool_top=@local_config_clang6//clang6
+build --noexperimental_check_output_files
+build --nostamp
+build --config=opt
+build --noexperimental_check_output_files
+build --copt=-march=native
+build --host_copt=-march=native
+```
+
+# x86_64 Microarchitectures
+
+## Intel CPU Line
+
+- 2003 P6 M           SSE SSE2
+- 2004 prescott       SSE3 SSSE3 (-march=prescott)
+- 2006 core           X64 SSE4.1 (only on 45nm variety) (-march=core2)
+- 2008 nehalem        SSE4.2 VT-x VT-d (-march=nehalem)
+- 2010 westmere       CLMUL AES (-march=westmere)
+- 2012 sandybridge    AVX TXT (-march=sandybridge)
+- 2012 ivybridge      F16C MOVBE (-march=ivybridge)
+- 2013 haswell        AVX2 TSX BMI2 FMA (-march=haswell)
+- 2014 broadwell      RDSEED ADCX PREFETCHW (-march=broadwell - works on trusty gcc4.9)
+- 2015 skylake        SGX ADX MPX AVX-512[xeon-only] (-march=skylake / -march=skylake-avx512 - needs gcc7)
+- 2018 cannonlake     AVX-512 SHA (-march=cannonlake - needs clang5)
+
+## Intel Low Power CPU Line
+
+- 2013 silvermont     SSE4.1 SSE4.2 VT-x (-march=silvermont)
+- 2016 goldmont       SHA (-march=goldmont - needs clang5)
+
+## AMD CPU Line
+
+- 2003 k8             SSE SSE2 (-march=k8)
+- 2005 k8 (Venus)     SSE3 (-march=k8-sse3)
+- 2008 barcelona      SSE4a?! (-march=barcelona)
+- 2011 bulldozer      SSE4.1 SSE4.2 CLMUL AVX AES FMA4?! (-march=bdver1)
+- 2011 piledriver     FMA (-march=bdver2)
+- 2015 excavator      AVX2 BMI2 MOVBE (-march=bdver4)
+
+## Google Compute Engine Supported CPUs
+
+- 2012 sandybridge 2.6gHz -march=sandybridge
+- 2012 ivybridge   2.5gHz -march=ivybridge
+- 2013 haswell     2.3gHz -march=haswell
+- 2014 broadwell   2.2gHz -march=broadwell
+- 2015 skylake     2.0gHz -march=skylake-avx512
+
+See: <https://cloud.google.com/compute/docs/cpu-platforms>
diff --git a/third_party/toolchains/clang6/clang.BUILD b/third_party/toolchains/clang6/clang.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..802d62c17ccf3a3f8f31aa01e049ebe980e1e0ea
--- /dev/null
+++ b/third_party/toolchains/clang6/clang.BUILD
@@ -0,0 +1,162 @@
+package(default_visibility = ["//visibility:public"])
+
+# Please note that the output of these tools is unencumbered.
+licenses(["restricted"])  # NCSA, GPLv3 (e.g. gold)
+
+filegroup(
+    name = "ar",
+    srcs = ["llvm/bin/llvm-ar"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "as",
+    srcs = ["llvm/bin/llvm-as"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "cpp",
+    srcs = ["llvm/bin/llvm-cpp"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "dwp",
+    srcs = ["llvm/bin/llvm-dwp"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "gcc",
+    srcs = ["llvm/bin/clang"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "gcov",
+    srcs = ["llvm/bin/llvm-cov"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "ld",
+    srcs = ["llvm/bin/ld.lld"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "nm",
+    srcs = ["llvm/bin/llvm-nm"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "objcopy",
+    srcs = ["llvm/bin/llvm-objcopy"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "objdump",
+    srcs = ["llvm/bin/llvm-objdump"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "profdata",
+    srcs = ["llvm/bin/llvm-profdata"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "strip",
+    srcs = ["sbin/strip"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "xray",
+    srcs = ["llvm/bin/llvm-xray"],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "includes",
+    srcs = glob(["llvm/lib/clang/6.0.0/include/**"]),
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "libraries",
+    srcs = glob([
+        "lib/*.*",
+        "lib/clang/6.0.0/lib/linux/*.*",
+    ]),
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "compiler_files",
+    srcs = [
+        ":as",
+        ":gcc",
+        ":includes",
+    ],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "linker_files",
+    srcs = [
+        ":ar",
+        ":ld",
+        ":libraries",
+    ],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = [
+        ":compiler_files",
+        ":dwp",
+        ":gcov",
+        ":linker_files",
+        ":nm",
+        ":objcopy",
+        ":objdump",
+        ":profdata",
+        ":strip",
+        ":xray",
+    ],
+    output_licenses = ["unencumbered"],
+)
+
+filegroup(
+    name = "empty",
+    srcs = [],  # bazel crashes without this
+    output_licenses = ["unencumbered"],
+)
+
+cc_toolchain_suite(
+    name = "clang6",
+    toolchains = {
+        "k8|clang6": ":clang6-k8",
+    },
+)
+
+cc_toolchain(
+    name = "clang6-k8",
+    all_files = ":all_files",
+    compiler_files = ":compiler_files",
+    cpu = "k8",
+    dwp_files = ":dwp",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":linker_files",
+    objcopy_files = ":objcopy",
+    output_licenses = ["unencumbered"],
+    static_runtime_libs = [":empty"],
+    strip_files = ":strip",
+    supports_param_files = 1,
+)
diff --git a/third_party/toolchains/clang6/repo.bzl b/third_party/toolchains/clang6/repo.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..b81f44506f382a6f2e4bfc005249c1a56cf0ae2f
--- /dev/null
+++ b/third_party/toolchains/clang6/repo.bzl
@@ -0,0 +1,30 @@
+"""Repository rule for Debian 8 Jessie Clang-6.0 portable Linux builds."""
+
+def _clang6_configure(ctx):
+  # TODO(jart): It'd probably be better to use Bazel's struct.to_proto()
+  #             method to generate a gigantic CROSSTOOL file that allows
+  #             Clang to support everything.
+  ctx.symlink(
+      ctx.os.environ.get('TF_LLVM_PATH',
+                         '/usr/lib/llvm-6.0'),
+      'clang6/llvm')
+  ctx.symlink(
+      ctx.os.environ.get('STRIP', '/usr/bin/strip'),
+      'clang6/sbin/strip')
+  ctx.symlink(
+      ctx.os.environ.get('OBJDUMP', '/usr/bin/objdump'),
+      'clang6/sbin/objdump')
+  ctx.symlink(ctx.attr._build, 'clang6/BUILD')
+  ctx.template('clang6/CROSSTOOL', ctx.attr._crosstool, {
+      '%package(@local_config_clang6//clang6)%': str(ctx.path('clang6')),
+  })
+
+clang6_configure = repository_rule(
+    implementation = _clang6_configure,
+    attrs = {
+        '_build': attr.label(
+            default=str(Label('//third_party/toolchains/clang6:clang.BUILD'))),
+        '_crosstool': attr.label(
+            default=str(Label('//third_party/toolchains/clang6:CROSSTOOL.tpl'))),
+    },
+)
diff --git a/tools/bazel.rc b/tools/bazel.rc
index 04c24d7511469bdc8b7fa724ca1984daa8c7e84a..8b8c71756171387b7a4b834ea94015a00313492e 100644
--- a/tools/bazel.rc
+++ b/tools/bazel.rc
@@ -1,3 +1,32 @@
+# Android configs. Bazel needs to have --cpu and --fat_apk_cpu both set to the
+# target CPU to build transient dependencies correctly. See
+# https://docs.bazel.build/versions/master/user-manual.html#flag--fat_apk_cpu
+build:android --crosstool_top=//external:android/crosstool
+build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
+build:android_arm --config=android
+build:android_arm --cpu=armeabi-v7a
+build:android_arm --fat_apk_cpu=armeabi-v7a
+build:android_arm64 --config=android
+build:android_arm64 --cpu=arm64-v8a
+build:android_arm64 --fat_apk_cpu=arm64-v8a
+
+# Config to use a mostly-static build and disable modular op registration
+# support (this will revert to loading TensorFlow with RTLD_GLOBAL in Python).
+# By default, TensorFlow will build with a dependence on
+# //tensorflow:libtensorflow_framework.so.
+build:monolithic --define framework_shared_object=false
+
+# For projects which use TensorFlow as part of a Bazel build process, putting
+# nothing in a bazelrc will default to a monolithic build. The following line
+# opts in to modular op registration support by default.
+build --define framework_shared_object=true
+
+# Please note that MKL on MacOS or windows is still not supported.
+# If you would like to use a local MKL instead of downloading, please set the
+# environment variable "TF_MKL_ROOT" every time before build.
+build:mkl --define=using_mkl=true
+build:mkl -c opt
+
 build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
 build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true